unaligned / trainer_state.json

Training in progress, step 10

2a57b00 verified almost 2 years ago

159 kB

	{
	"best_metric": 1.7247449159622192,
	"best_model_checkpoint": "lora_lr/google/gemma-1.1-7b-it/unaligned/checkpoint-1000",
	"epoch": 1.310300703774792,
	"eval_steps": 50,
	"global_step": 1024,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"epoch": 0.0,
	"grad_norm": 1.0390625,
	"learning_rate": 2.0000000000000003e-06,
	"loss": 4.7488,
	"step": 1
	},
	{
	"epoch": 0.0,
	"grad_norm": 0.97265625,
	"learning_rate": 4.000000000000001e-06,
	"loss": 4.5149,
	"step": 2
	},
	{
	"epoch": 0.0,
	"grad_norm": 0.9296875,
	"learning_rate": 6e-06,
	"loss": 4.3836,
	"step": 3
	},
	{
	"epoch": 0.01,
	"grad_norm": 1.0234375,
	"learning_rate": 8.000000000000001e-06,
	"loss": 4.6777,
	"step": 4
	},
	{
	"epoch": 0.01,
	"grad_norm": 1.03125,
	"learning_rate": 1e-05,
	"loss": 4.6688,
	"step": 5
	},
	{
	"epoch": 0.01,
	"grad_norm": 0.94921875,
	"learning_rate": 1.2e-05,
	"loss": 4.5565,
	"step": 6
	},
	{
	"epoch": 0.01,
	"grad_norm": 1.0078125,
	"learning_rate": 1.4000000000000001e-05,
	"loss": 4.5658,
	"step": 7
	},
	{
	"epoch": 0.01,
	"grad_norm": 1.015625,
	"learning_rate": 1.6000000000000003e-05,
	"loss": 4.6618,
	"step": 8
	},
	{
	"epoch": 0.01,
	"grad_norm": 1.0546875,
	"learning_rate": 1.8e-05,
	"loss": 4.6095,
	"step": 9
	},
	{
	"epoch": 0.01,
	"grad_norm": 1.0703125,
	"learning_rate": 2e-05,
	"loss": 4.7608,
	"step": 10
	},
	{
	"epoch": 0.01,
	"grad_norm": 1.0546875,
	"learning_rate": 2.2000000000000003e-05,
	"loss": 4.5421,
	"step": 11
	},
	{
	"epoch": 0.02,
	"grad_norm": 0.9609375,
	"learning_rate": 2.4e-05,
	"loss": 4.2895,
	"step": 12
	},
	{
	"epoch": 0.02,
	"grad_norm": 1.125,
	"learning_rate": 2.6000000000000002e-05,
	"loss": 4.8112,
	"step": 13
	},
	{
	"epoch": 0.02,
	"grad_norm": 1.1328125,
	"learning_rate": 2.8000000000000003e-05,
	"loss": 4.6709,
	"step": 14
	},
	{
	"epoch": 0.02,
	"grad_norm": 1.1484375,
	"learning_rate": 3e-05,
	"loss": 4.5234,
	"step": 15
	},
	{
	"epoch": 0.02,
	"grad_norm": 1.1328125,
	"learning_rate": 3.2000000000000005e-05,
	"loss": 4.3751,
	"step": 16
	},
	{
	"epoch": 0.02,
	"grad_norm": 1.3046875,
	"learning_rate": 3.4000000000000007e-05,
	"loss": 4.5247,
	"step": 17
	},
	{
	"epoch": 0.02,
	"grad_norm": 1.265625,
	"learning_rate": 3.6e-05,
	"loss": 4.3906,
	"step": 18
	},
	{
	"epoch": 0.02,
	"grad_norm": 1.40625,
	"learning_rate": 3.8e-05,
	"loss": 4.3193,
	"step": 19
	},
	{
	"epoch": 0.03,
	"grad_norm": 1.4296875,
	"learning_rate": 4e-05,
	"loss": 4.2987,
	"step": 20
	},
	{
	"epoch": 0.03,
	"grad_norm": 1.4375,
	"learning_rate": 4.2e-05,
	"loss": 4.2167,
	"step": 21
	},
	{
	"epoch": 0.03,
	"grad_norm": 1.6171875,
	"learning_rate": 4.4000000000000006e-05,
	"loss": 4.3088,
	"step": 22
	},
	{
	"epoch": 0.03,
	"grad_norm": 1.6640625,
	"learning_rate": 4.600000000000001e-05,
	"loss": 4.2658,
	"step": 23
	},
	{
	"epoch": 0.03,
	"grad_norm": 1.734375,
	"learning_rate": 4.8e-05,
	"loss": 4.1625,
	"step": 24
	},
	{
	"epoch": 0.03,
	"grad_norm": 1.8203125,
	"learning_rate": 5e-05,
	"loss": 4.0392,
	"step": 25
	},
	{
	"epoch": 0.03,
	"grad_norm": 1.8828125,
	"learning_rate": 5.2000000000000004e-05,
	"loss": 3.9772,
	"step": 26
	},
	{
	"epoch": 0.03,
	"grad_norm": 1.8828125,
	"learning_rate": 5.4000000000000005e-05,
	"loss": 3.9029,
	"step": 27
	},
	{
	"epoch": 0.04,
	"grad_norm": 1.875,
	"learning_rate": 5.6000000000000006e-05,
	"loss": 3.8107,
	"step": 28
	},
	{
	"epoch": 0.04,
	"grad_norm": 1.96875,
	"learning_rate": 5.8e-05,
	"loss": 3.8199,
	"step": 29
	},
	{
	"epoch": 0.04,
	"grad_norm": 1.953125,
	"learning_rate": 6e-05,
	"loss": 3.5035,
	"step": 30
	},
	{
	"epoch": 0.04,
	"grad_norm": 2.03125,
	"learning_rate": 6.2e-05,
	"loss": 3.6236,
	"step": 31
	},
	{
	"epoch": 0.04,
	"grad_norm": 1.796875,
	"learning_rate": 6.400000000000001e-05,
	"loss": 3.3359,
	"step": 32
	},
	{
	"epoch": 0.04,
	"grad_norm": 1.7578125,
	"learning_rate": 6.6e-05,
	"loss": 3.3019,
	"step": 33
	},
	{
	"epoch": 0.04,
	"grad_norm": 1.7109375,
	"learning_rate": 6.800000000000001e-05,
	"loss": 3.2003,
	"step": 34
	},
	{
	"epoch": 0.04,
	"grad_norm": 1.6640625,
	"learning_rate": 7e-05,
	"loss": 3.1585,
	"step": 35
	},
	{
	"epoch": 0.05,
	"grad_norm": 1.53125,
	"learning_rate": 7.2e-05,
	"loss": 2.9377,
	"step": 36
	},
	{
	"epoch": 0.05,
	"grad_norm": 1.5625,
	"learning_rate": 7.4e-05,
	"loss": 3.13,
	"step": 37
	},
	{
	"epoch": 0.05,
	"grad_norm": 1.453125,
	"learning_rate": 7.6e-05,
	"loss": 2.9445,
	"step": 38
	},
	{
	"epoch": 0.05,
	"grad_norm": 1.21875,
	"learning_rate": 7.800000000000001e-05,
	"loss": 2.7348,
	"step": 39
	},
	{
	"epoch": 0.05,
	"grad_norm": 1.1171875,
	"learning_rate": 8e-05,
	"loss": 2.6842,
	"step": 40
	},
	{
	"epoch": 0.05,
	"grad_norm": 0.90625,
	"learning_rate": 8.2e-05,
	"loss": 2.6921,
	"step": 41
	},
	{
	"epoch": 0.05,
	"grad_norm": 0.9453125,
	"learning_rate": 8.4e-05,
	"loss": 2.6504,
	"step": 42
	},
	{
	"epoch": 0.06,
	"grad_norm": 1.125,
	"learning_rate": 8.6e-05,
	"loss": 2.4995,
	"step": 43
	},
	{
	"epoch": 0.06,
	"grad_norm": 1.34375,
	"learning_rate": 8.800000000000001e-05,
	"loss": 2.4545,
	"step": 44
	},
	{
	"epoch": 0.06,
	"grad_norm": 1.28125,
	"learning_rate": 9e-05,
	"loss": 2.4311,
	"step": 45
	},
	{
	"epoch": 0.06,
	"grad_norm": 0.640625,
	"learning_rate": 9.200000000000001e-05,
	"loss": 2.4045,
	"step": 46
	},
	{
	"epoch": 0.06,
	"grad_norm": 0.54296875,
	"learning_rate": 9.4e-05,
	"loss": 2.413,
	"step": 47
	},
	{
	"epoch": 0.06,
	"grad_norm": 0.578125,
	"learning_rate": 9.6e-05,
	"loss": 2.3691,
	"step": 48
	},
	{
	"epoch": 0.06,
	"grad_norm": 0.65234375,
	"learning_rate": 9.8e-05,
	"loss": 2.2936,
	"step": 49
	},
	{
	"epoch": 0.06,
	"grad_norm": 0.62109375,
	"learning_rate": 0.0001,
	"loss": 2.3459,
	"step": 50
	},
	{
	"epoch": 0.06,
	"eval_loss": 2.298555374145508,
	"eval_runtime": 125.5458,
	"eval_samples_per_second": 39.826,
	"eval_steps_per_second": 1.251,
	"step": 50
	},
	{
	"epoch": 0.07,
	"grad_norm": 0.5859375,
	"learning_rate": 0.00010200000000000001,
	"loss": 2.458,
	"step": 51
	},
	{
	"epoch": 0.07,
	"grad_norm": 0.61328125,
	"learning_rate": 0.00010400000000000001,
	"loss": 2.2395,
	"step": 52
	},
	{
	"epoch": 0.07,
	"grad_norm": 0.55859375,
	"learning_rate": 0.00010600000000000002,
	"loss": 2.2084,
	"step": 53
	},
	{
	"epoch": 0.07,
	"grad_norm": 0.83203125,
	"learning_rate": 0.00010800000000000001,
	"loss": 2.2806,
	"step": 54
	},
	{
	"epoch": 0.07,
	"grad_norm": 1.15625,
	"learning_rate": 0.00011000000000000002,
	"loss": 2.1546,
	"step": 55
	},
	{
	"epoch": 0.07,
	"grad_norm": 0.91015625,
	"learning_rate": 0.00011200000000000001,
	"loss": 2.2027,
	"step": 56
	},
	{
	"epoch": 0.07,
	"grad_norm": 0.6640625,
	"learning_rate": 0.00011399999999999999,
	"loss": 2.1795,
	"step": 57
	},
	{
	"epoch": 0.07,
	"grad_norm": 0.5,
	"learning_rate": 0.000116,
	"loss": 2.1918,
	"step": 58
	},
	{
	"epoch": 0.08,
	"grad_norm": 0.39453125,
	"learning_rate": 0.000118,
	"loss": 2.143,
	"step": 59
	},
	{
	"epoch": 0.08,
	"grad_norm": 0.4375,
	"learning_rate": 0.00012,
	"loss": 2.1451,
	"step": 60
	},
	{
	"epoch": 0.08,
	"grad_norm": 0.388671875,
	"learning_rate": 0.000122,
	"loss": 2.1542,
	"step": 61
	},
	{
	"epoch": 0.08,
	"grad_norm": 0.3984375,
	"learning_rate": 0.000124,
	"loss": 2.1125,
	"step": 62
	},
	{
	"epoch": 0.08,
	"grad_norm": 0.6953125,
	"learning_rate": 0.000126,
	"loss": 2.1123,
	"step": 63
	},
	{
	"epoch": 0.08,
	"grad_norm": 0.451171875,
	"learning_rate": 0.00012800000000000002,
	"loss": 2.0354,
	"step": 64
	},
	{
	"epoch": 0.08,
	"grad_norm": 0.451171875,
	"learning_rate": 0.00013000000000000002,
	"loss": 2.1228,
	"step": 65
	},
	{
	"epoch": 0.08,
	"grad_norm": 0.77734375,
	"learning_rate": 0.000132,
	"loss": 2.1666,
	"step": 66
	},
	{
	"epoch": 0.09,
	"grad_norm": 0.455078125,
	"learning_rate": 0.000134,
	"loss": 2.0429,
	"step": 67
	},
	{
	"epoch": 0.09,
	"grad_norm": 0.384765625,
	"learning_rate": 0.00013600000000000003,
	"loss": 2.0367,
	"step": 68
	},
	{
	"epoch": 0.09,
	"grad_norm": 0.365234375,
	"learning_rate": 0.000138,
	"loss": 2.0139,
	"step": 69
	},
	{
	"epoch": 0.09,
	"grad_norm": 0.3671875,
	"learning_rate": 0.00014,
	"loss": 2.0284,
	"step": 70
	},
	{
	"epoch": 0.09,
	"grad_norm": 0.404296875,
	"learning_rate": 0.000142,
	"loss": 2.0303,
	"step": 71
	},
	{
	"epoch": 0.09,
	"grad_norm": 0.3515625,
	"learning_rate": 0.000144,
	"loss": 2.0645,
	"step": 72
	},
	{
	"epoch": 0.09,
	"grad_norm": 0.365234375,
	"learning_rate": 0.000146,
	"loss": 2.001,
	"step": 73
	},
	{
	"epoch": 0.09,
	"grad_norm": 0.3046875,
	"learning_rate": 0.000148,
	"loss": 2.0946,
	"step": 74
	},
	{
	"epoch": 0.1,
	"grad_norm": 0.314453125,
	"learning_rate": 0.00015000000000000001,
	"loss": 1.9969,
	"step": 75
	},
	{
	"epoch": 0.1,
	"grad_norm": 0.419921875,
	"learning_rate": 0.000152,
	"loss": 1.9911,
	"step": 76
	},
	{
	"epoch": 0.1,
	"grad_norm": 0.328125,
	"learning_rate": 0.000154,
	"loss": 2.0011,
	"step": 77
	},
	{
	"epoch": 0.1,
	"grad_norm": 0.234375,
	"learning_rate": 0.00015600000000000002,
	"loss": 1.9893,
	"step": 78
	},
	{
	"epoch": 0.1,
	"grad_norm": 0.380859375,
	"learning_rate": 0.00015800000000000002,
	"loss": 1.9336,
	"step": 79
	},
	{
	"epoch": 0.1,
	"grad_norm": 0.2890625,
	"learning_rate": 0.00016,
	"loss": 1.9876,
	"step": 80
	},
	{
	"epoch": 0.1,
	"grad_norm": 0.255859375,
	"learning_rate": 0.000162,
	"loss": 1.9679,
	"step": 81
	},
	{
	"epoch": 0.1,
	"grad_norm": 0.23828125,
	"learning_rate": 0.000164,
	"loss": 1.9157,
	"step": 82
	},
	{
	"epoch": 0.11,
	"grad_norm": 0.373046875,
	"learning_rate": 0.000166,
	"loss": 1.9939,
	"step": 83
	},
	{
	"epoch": 0.11,
	"grad_norm": 0.255859375,
	"learning_rate": 0.000168,
	"loss": 1.9457,
	"step": 84
	},
	{
	"epoch": 0.11,
	"grad_norm": 0.26171875,
	"learning_rate": 0.00017,
	"loss": 1.9924,
	"step": 85
	},
	{
	"epoch": 0.11,
	"grad_norm": 0.2275390625,
	"learning_rate": 0.000172,
	"loss": 1.8708,
	"step": 86
	},
	{
	"epoch": 0.11,
	"grad_norm": 0.26953125,
	"learning_rate": 0.000174,
	"loss": 1.946,
	"step": 87
	},
	{
	"epoch": 0.11,
	"grad_norm": 0.255859375,
	"learning_rate": 0.00017600000000000002,
	"loss": 1.9743,
	"step": 88
	},
	{
	"epoch": 0.11,
	"grad_norm": 0.2490234375,
	"learning_rate": 0.00017800000000000002,
	"loss": 1.9257,
	"step": 89
	},
	{
	"epoch": 0.12,
	"grad_norm": 0.26953125,
	"learning_rate": 0.00018,
	"loss": 1.9041,
	"step": 90
	},
	{
	"epoch": 0.12,
	"grad_norm": 0.265625,
	"learning_rate": 0.000182,
	"loss": 1.9502,
	"step": 91
	},
	{
	"epoch": 0.12,
	"grad_norm": 0.240234375,
	"learning_rate": 0.00018400000000000003,
	"loss": 1.8514,
	"step": 92
	},
	{
	"epoch": 0.12,
	"grad_norm": 0.255859375,
	"learning_rate": 0.00018600000000000002,
	"loss": 2.0176,
	"step": 93
	},
	{
	"epoch": 0.12,
	"grad_norm": 0.298828125,
	"learning_rate": 0.000188,
	"loss": 1.9056,
	"step": 94
	},
	{
	"epoch": 0.12,
	"grad_norm": 0.26171875,
	"learning_rate": 0.00019,
	"loss": 1.9149,
	"step": 95
	},
	{
	"epoch": 0.12,
	"grad_norm": 0.212890625,
	"learning_rate": 0.000192,
	"loss": 1.9587,
	"step": 96
	},
	{
	"epoch": 0.12,
	"grad_norm": 0.26953125,
	"learning_rate": 0.000194,
	"loss": 1.9103,
	"step": 97
	},
	{
	"epoch": 0.13,
	"grad_norm": 0.28515625,
	"learning_rate": 0.000196,
	"loss": 1.8814,
	"step": 98
	},
	{
	"epoch": 0.13,
	"grad_norm": 0.25,
	"learning_rate": 0.00019800000000000002,
	"loss": 1.842,
	"step": 99
	},
	{
	"epoch": 0.13,
	"grad_norm": 0.205078125,
	"learning_rate": 0.0002,
	"loss": 1.8251,
	"step": 100
	},
	{
	"epoch": 0.13,
	"eval_loss": 1.8968226909637451,
	"eval_runtime": 125.222,
	"eval_samples_per_second": 39.929,
	"eval_steps_per_second": 1.254,
	"step": 100
	},
	{
	"epoch": 0.13,
	"grad_norm": 0.275390625,
	"learning_rate": 0.00019978354978354978,
	"loss": 1.84,
	"step": 101
	},
	{
	"epoch": 0.13,
	"grad_norm": 0.2236328125,
	"learning_rate": 0.00019956709956709957,
	"loss": 1.9066,
	"step": 102
	},
	{
	"epoch": 0.13,
	"grad_norm": 0.244140625,
	"learning_rate": 0.00019935064935064936,
	"loss": 1.9553,
	"step": 103
	},
	{
	"epoch": 0.13,
	"grad_norm": 0.2451171875,
	"learning_rate": 0.00019913419913419916,
	"loss": 1.922,
	"step": 104
	},
	{
	"epoch": 0.13,
	"grad_norm": 0.255859375,
	"learning_rate": 0.00019891774891774892,
	"loss": 1.8927,
	"step": 105
	},
	{
	"epoch": 0.14,
	"grad_norm": 0.21484375,
	"learning_rate": 0.00019870129870129872,
	"loss": 1.9244,
	"step": 106
	},
	{
	"epoch": 0.14,
	"grad_norm": 0.265625,
	"learning_rate": 0.0001984848484848485,
	"loss": 1.8875,
	"step": 107
	},
	{
	"epoch": 0.14,
	"grad_norm": 0.388671875,
	"learning_rate": 0.00019826839826839827,
	"loss": 1.837,
	"step": 108
	},
	{
	"epoch": 0.14,
	"grad_norm": 0.228515625,
	"learning_rate": 0.00019805194805194807,
	"loss": 1.8613,
	"step": 109
	},
	{
	"epoch": 0.14,
	"grad_norm": 0.2216796875,
	"learning_rate": 0.00019783549783549783,
	"loss": 1.9369,
	"step": 110
	},
	{
	"epoch": 0.14,
	"grad_norm": 0.234375,
	"learning_rate": 0.00019761904761904763,
	"loss": 1.8546,
	"step": 111
	},
	{
	"epoch": 0.14,
	"grad_norm": 0.3125,
	"learning_rate": 0.00019740259740259742,
	"loss": 1.8774,
	"step": 112
	},
	{
	"epoch": 0.14,
	"grad_norm": 0.28125,
	"learning_rate": 0.0001971861471861472,
	"loss": 1.8799,
	"step": 113
	},
	{
	"epoch": 0.15,
	"grad_norm": 0.224609375,
	"learning_rate": 0.00019696969696969698,
	"loss": 1.8776,
	"step": 114
	},
	{
	"epoch": 0.15,
	"grad_norm": 0.2412109375,
	"learning_rate": 0.00019675324675324675,
	"loss": 1.8823,
	"step": 115
	},
	{
	"epoch": 0.15,
	"grad_norm": 0.234375,
	"learning_rate": 0.00019653679653679654,
	"loss": 1.8877,
	"step": 116
	},
	{
	"epoch": 0.15,
	"grad_norm": 0.259765625,
	"learning_rate": 0.00019632034632034633,
	"loss": 1.9692,
	"step": 117
	},
	{
	"epoch": 0.15,
	"grad_norm": 0.208984375,
	"learning_rate": 0.00019610389610389613,
	"loss": 1.8985,
	"step": 118
	},
	{
	"epoch": 0.15,
	"grad_norm": 0.248046875,
	"learning_rate": 0.0001958874458874459,
	"loss": 1.8564,
	"step": 119
	},
	{
	"epoch": 0.15,
	"grad_norm": 0.2109375,
	"learning_rate": 0.00019567099567099566,
	"loss": 1.9001,
	"step": 120
	},
	{
	"epoch": 0.15,
	"grad_norm": 0.2080078125,
	"learning_rate": 0.00019545454545454548,
	"loss": 1.879,
	"step": 121
	},
	{
	"epoch": 0.16,
	"grad_norm": 0.326171875,
	"learning_rate": 0.00019523809523809525,
	"loss": 1.8385,
	"step": 122
	},
	{
	"epoch": 0.16,
	"grad_norm": 0.24609375,
	"learning_rate": 0.00019502164502164504,
	"loss": 1.8581,
	"step": 123
	},
	{
	"epoch": 0.16,
	"grad_norm": 0.2294921875,
	"learning_rate": 0.0001948051948051948,
	"loss": 1.8517,
	"step": 124
	},
	{
	"epoch": 0.16,
	"grad_norm": 0.21875,
	"learning_rate": 0.0001945887445887446,
	"loss": 1.8695,
	"step": 125
	},
	{
	"epoch": 0.16,
	"grad_norm": 0.3125,
	"learning_rate": 0.0001943722943722944,
	"loss": 1.8675,
	"step": 126
	},
	{
	"epoch": 0.16,
	"grad_norm": 0.220703125,
	"learning_rate": 0.00019415584415584416,
	"loss": 1.9089,
	"step": 127
	},
	{
	"epoch": 0.16,
	"grad_norm": 0.2314453125,
	"learning_rate": 0.00019393939393939395,
	"loss": 1.831,
	"step": 128
	},
	{
	"epoch": 0.17,
	"grad_norm": 0.2734375,
	"learning_rate": 0.00019372294372294372,
	"loss": 1.932,
	"step": 129
	},
	{
	"epoch": 0.17,
	"grad_norm": 0.2216796875,
	"learning_rate": 0.00019350649350649354,
	"loss": 1.917,
	"step": 130
	},
	{
	"epoch": 0.17,
	"grad_norm": 0.28125,
	"learning_rate": 0.0001932900432900433,
	"loss": 1.8179,
	"step": 131
	},
	{
	"epoch": 0.17,
	"grad_norm": 0.294921875,
	"learning_rate": 0.00019307359307359307,
	"loss": 1.8764,
	"step": 132
	},
	{
	"epoch": 0.17,
	"grad_norm": 0.26171875,
	"learning_rate": 0.00019285714285714286,
	"loss": 1.8926,
	"step": 133
	},
	{
	"epoch": 0.17,
	"grad_norm": 0.24609375,
	"learning_rate": 0.00019264069264069266,
	"loss": 1.9443,
	"step": 134
	},
	{
	"epoch": 0.17,
	"grad_norm": 0.251953125,
	"learning_rate": 0.00019242424242424245,
	"loss": 1.8702,
	"step": 135
	},
	{
	"epoch": 0.17,
	"grad_norm": 0.32421875,
	"learning_rate": 0.00019220779220779222,
	"loss": 1.8728,
	"step": 136
	},
	{
	"epoch": 0.18,
	"grad_norm": 0.287109375,
	"learning_rate": 0.000191991341991342,
	"loss": 1.7953,
	"step": 137
	},
	{
	"epoch": 0.18,
	"grad_norm": 0.283203125,
	"learning_rate": 0.00019177489177489178,
	"loss": 1.7883,
	"step": 138
	},
	{
	"epoch": 0.18,
	"grad_norm": 0.302734375,
	"learning_rate": 0.00019155844155844157,
	"loss": 1.8741,
	"step": 139
	},
	{
	"epoch": 0.18,
	"grad_norm": 0.25390625,
	"learning_rate": 0.00019134199134199136,
	"loss": 1.8094,
	"step": 140
	},
	{
	"epoch": 0.18,
	"grad_norm": 0.302734375,
	"learning_rate": 0.00019112554112554113,
	"loss": 1.8629,
	"step": 141
	},
	{
	"epoch": 0.18,
	"grad_norm": 0.2333984375,
	"learning_rate": 0.00019090909090909092,
	"loss": 1.855,
	"step": 142
	},
	{
	"epoch": 0.18,
	"grad_norm": 0.236328125,
	"learning_rate": 0.0001906926406926407,
	"loss": 1.848,
	"step": 143
	},
	{
	"epoch": 0.18,
	"grad_norm": 0.265625,
	"learning_rate": 0.00019047619047619048,
	"loss": 1.8853,
	"step": 144
	},
	{
	"epoch": 0.19,
	"grad_norm": 0.47265625,
	"learning_rate": 0.00019025974025974027,
	"loss": 1.8435,
	"step": 145
	},
	{
	"epoch": 0.19,
	"grad_norm": 0.263671875,
	"learning_rate": 0.00019004329004329004,
	"loss": 1.8619,
	"step": 146
	},
	{
	"epoch": 0.19,
	"grad_norm": 0.37890625,
	"learning_rate": 0.00018982683982683983,
	"loss": 1.8298,
	"step": 147
	},
	{
	"epoch": 0.19,
	"grad_norm": 0.236328125,
	"learning_rate": 0.00018961038961038963,
	"loss": 1.7478,
	"step": 148
	},
	{
	"epoch": 0.19,
	"grad_norm": 0.275390625,
	"learning_rate": 0.00018939393939393942,
	"loss": 1.906,
	"step": 149
	},
	{
	"epoch": 0.19,
	"grad_norm": 0.3046875,
	"learning_rate": 0.0001891774891774892,
	"loss": 1.9013,
	"step": 150
	},
	{
	"epoch": 0.19,
	"eval_loss": 1.8299968242645264,
	"eval_runtime": 125.2613,
	"eval_samples_per_second": 39.917,
	"eval_steps_per_second": 1.253,
	"step": 150
	},
	{
	"epoch": 0.19,
	"grad_norm": 0.29296875,
	"learning_rate": 0.00018896103896103895,
	"loss": 1.7772,
	"step": 151
	},
	{
	"epoch": 0.19,
	"grad_norm": 0.2490234375,
	"learning_rate": 0.00018874458874458875,
	"loss": 1.9428,
	"step": 152
	},
	{
	"epoch": 0.2,
	"grad_norm": 0.2314453125,
	"learning_rate": 0.00018852813852813854,
	"loss": 1.8121,
	"step": 153
	},
	{
	"epoch": 0.2,
	"grad_norm": 0.29296875,
	"learning_rate": 0.00018831168831168833,
	"loss": 1.7921,
	"step": 154
	},
	{
	"epoch": 0.2,
	"grad_norm": 0.357421875,
	"learning_rate": 0.0001880952380952381,
	"loss": 1.846,
	"step": 155
	},
	{
	"epoch": 0.2,
	"grad_norm": 0.2236328125,
	"learning_rate": 0.0001878787878787879,
	"loss": 1.7703,
	"step": 156
	},
	{
	"epoch": 0.2,
	"grad_norm": 0.2119140625,
	"learning_rate": 0.00018766233766233769,
	"loss": 1.8048,
	"step": 157
	},
	{
	"epoch": 0.2,
	"grad_norm": 0.33203125,
	"learning_rate": 0.00018744588744588745,
	"loss": 1.8353,
	"step": 158
	},
	{
	"epoch": 0.2,
	"grad_norm": 0.26953125,
	"learning_rate": 0.00018722943722943725,
	"loss": 1.8538,
	"step": 159
	},
	{
	"epoch": 0.2,
	"grad_norm": 0.259765625,
	"learning_rate": 0.000187012987012987,
	"loss": 1.828,
	"step": 160
	},
	{
	"epoch": 0.21,
	"grad_norm": 0.2412109375,
	"learning_rate": 0.0001867965367965368,
	"loss": 1.8555,
	"step": 161
	},
	{
	"epoch": 0.21,
	"grad_norm": 0.3125,
	"learning_rate": 0.0001865800865800866,
	"loss": 1.8,
	"step": 162
	},
	{
	"epoch": 0.21,
	"grad_norm": 0.236328125,
	"learning_rate": 0.00018636363636363636,
	"loss": 1.8241,
	"step": 163
	},
	{
	"epoch": 0.21,
	"grad_norm": 0.25,
	"learning_rate": 0.00018614718614718616,
	"loss": 1.7903,
	"step": 164
	},
	{
	"epoch": 0.21,
	"grad_norm": 0.2431640625,
	"learning_rate": 0.00018593073593073592,
	"loss": 1.7279,
	"step": 165
	},
	{
	"epoch": 0.21,
	"grad_norm": 0.33984375,
	"learning_rate": 0.00018571428571428572,
	"loss": 1.8231,
	"step": 166
	},
	{
	"epoch": 0.21,
	"grad_norm": 0.2451171875,
	"learning_rate": 0.0001854978354978355,
	"loss": 1.8534,
	"step": 167
	},
	{
	"epoch": 0.21,
	"grad_norm": 0.2470703125,
	"learning_rate": 0.0001852813852813853,
	"loss": 1.9175,
	"step": 168
	},
	{
	"epoch": 0.22,
	"grad_norm": 0.341796875,
	"learning_rate": 0.00018506493506493507,
	"loss": 1.7986,
	"step": 169
	},
	{
	"epoch": 0.22,
	"grad_norm": 0.294921875,
	"learning_rate": 0.00018484848484848484,
	"loss": 1.9097,
	"step": 170
	},
	{
	"epoch": 0.22,
	"grad_norm": 0.3046875,
	"learning_rate": 0.00018463203463203466,
	"loss": 1.8232,
	"step": 171
	},
	{
	"epoch": 0.22,
	"grad_norm": 0.38671875,
	"learning_rate": 0.00018441558441558442,
	"loss": 1.7505,
	"step": 172
	},
	{
	"epoch": 0.22,
	"grad_norm": 0.283203125,
	"learning_rate": 0.00018419913419913422,
	"loss": 1.8454,
	"step": 173
	},
	{
	"epoch": 0.22,
	"grad_norm": 0.306640625,
	"learning_rate": 0.00018398268398268398,
	"loss": 1.8028,
	"step": 174
	},
	{
	"epoch": 0.22,
	"grad_norm": 0.28125,
	"learning_rate": 0.00018376623376623378,
	"loss": 1.9089,
	"step": 175
	},
	{
	"epoch": 0.23,
	"grad_norm": 0.26953125,
	"learning_rate": 0.00018354978354978357,
	"loss": 1.8681,
	"step": 176
	},
	{
	"epoch": 0.23,
	"grad_norm": 0.45703125,
	"learning_rate": 0.00018333333333333334,
	"loss": 1.8012,
	"step": 177
	},
	{
	"epoch": 0.23,
	"grad_norm": 0.376953125,
	"learning_rate": 0.00018311688311688313,
	"loss": 1.7399,
	"step": 178
	},
	{
	"epoch": 0.23,
	"grad_norm": 0.3125,
	"learning_rate": 0.0001829004329004329,
	"loss": 1.8339,
	"step": 179
	},
	{
	"epoch": 0.23,
	"grad_norm": 0.470703125,
	"learning_rate": 0.00018268398268398272,
	"loss": 1.8236,
	"step": 180
	},
	{
	"epoch": 0.23,
	"grad_norm": 0.466796875,
	"learning_rate": 0.00018246753246753248,
	"loss": 1.8088,
	"step": 181
	},
	{
	"epoch": 0.23,
	"grad_norm": 0.2890625,
	"learning_rate": 0.00018225108225108225,
	"loss": 1.77,
	"step": 182
	},
	{
	"epoch": 0.23,
	"grad_norm": 0.453125,
	"learning_rate": 0.00018203463203463204,
	"loss": 1.7954,
	"step": 183
	},
	{
	"epoch": 0.24,
	"grad_norm": 0.53125,
	"learning_rate": 0.00018181818181818183,
	"loss": 1.7682,
	"step": 184
	},
	{
	"epoch": 0.24,
	"grad_norm": 0.248046875,
	"learning_rate": 0.00018160173160173163,
	"loss": 1.9032,
	"step": 185
	},
	{
	"epoch": 0.24,
	"grad_norm": 0.2431640625,
	"learning_rate": 0.0001813852813852814,
	"loss": 1.8145,
	"step": 186
	},
	{
	"epoch": 0.24,
	"grad_norm": 0.482421875,
	"learning_rate": 0.0001811688311688312,
	"loss": 1.8433,
	"step": 187
	},
	{
	"epoch": 0.24,
	"grad_norm": 0.3515625,
	"learning_rate": 0.00018095238095238095,
	"loss": 1.7845,
	"step": 188
	},
	{
	"epoch": 0.24,
	"grad_norm": 0.306640625,
	"learning_rate": 0.00018073593073593075,
	"loss": 1.8512,
	"step": 189
	},
	{
	"epoch": 0.24,
	"grad_norm": 0.328125,
	"learning_rate": 0.00018051948051948054,
	"loss": 1.8208,
	"step": 190
	},
	{
	"epoch": 0.24,
	"grad_norm": 0.28515625,
	"learning_rate": 0.0001803030303030303,
	"loss": 1.8353,
	"step": 191
	},
	{
	"epoch": 0.25,
	"grad_norm": 0.251953125,
	"learning_rate": 0.0001800865800865801,
	"loss": 1.8627,
	"step": 192
	},
	{
	"epoch": 0.25,
	"grad_norm": 0.275390625,
	"learning_rate": 0.00017987012987012987,
	"loss": 1.7228,
	"step": 193
	},
	{
	"epoch": 0.25,
	"grad_norm": 0.279296875,
	"learning_rate": 0.00017965367965367966,
	"loss": 1.878,
	"step": 194
	},
	{
	"epoch": 0.25,
	"grad_norm": 0.255859375,
	"learning_rate": 0.00017943722943722945,
	"loss": 1.7724,
	"step": 195
	},
	{
	"epoch": 0.25,
	"grad_norm": 0.251953125,
	"learning_rate": 0.00017922077922077922,
	"loss": 1.7845,
	"step": 196
	},
	{
	"epoch": 0.25,
	"grad_norm": 0.326171875,
	"learning_rate": 0.000179004329004329,
	"loss": 1.7848,
	"step": 197
	},
	{
	"epoch": 0.25,
	"grad_norm": 0.265625,
	"learning_rate": 0.0001787878787878788,
	"loss": 1.808,
	"step": 198
	},
	{
	"epoch": 0.25,
	"grad_norm": 0.310546875,
	"learning_rate": 0.0001785714285714286,
	"loss": 1.834,
	"step": 199
	},
	{
	"epoch": 0.26,
	"grad_norm": 0.275390625,
	"learning_rate": 0.00017835497835497836,
	"loss": 1.7679,
	"step": 200
	},
	{
	"epoch": 0.26,
	"eval_loss": 1.8024407625198364,
	"eval_runtime": 125.3989,
	"eval_samples_per_second": 39.873,
	"eval_steps_per_second": 1.252,
	"step": 200
	},
	{
	"epoch": 0.26,
	"grad_norm": 0.283203125,
	"learning_rate": 0.00017813852813852813,
	"loss": 1.8003,
	"step": 201
	},
	{
	"epoch": 0.26,
	"grad_norm": 0.267578125,
	"learning_rate": 0.00017792207792207792,
	"loss": 1.8544,
	"step": 202
	},
	{
	"epoch": 0.26,
	"grad_norm": 0.259765625,
	"learning_rate": 0.00017770562770562772,
	"loss": 1.8894,
	"step": 203
	},
	{
	"epoch": 0.26,
	"grad_norm": 0.28125,
	"learning_rate": 0.0001774891774891775,
	"loss": 1.8496,
	"step": 204
	},
	{
	"epoch": 0.26,
	"grad_norm": 0.265625,
	"learning_rate": 0.00017727272727272728,
	"loss": 1.7319,
	"step": 205
	},
	{
	"epoch": 0.26,
	"grad_norm": 0.29296875,
	"learning_rate": 0.00017705627705627707,
	"loss": 1.8156,
	"step": 206
	},
	{
	"epoch": 0.26,
	"grad_norm": 0.294921875,
	"learning_rate": 0.00017683982683982684,
	"loss": 1.6916,
	"step": 207
	},
	{
	"epoch": 0.27,
	"grad_norm": 0.267578125,
	"learning_rate": 0.00017662337662337663,
	"loss": 1.826,
	"step": 208
	},
	{
	"epoch": 0.27,
	"grad_norm": 0.3046875,
	"learning_rate": 0.00017640692640692642,
	"loss": 1.8066,
	"step": 209
	},
	{
	"epoch": 0.27,
	"grad_norm": 0.32421875,
	"learning_rate": 0.0001761904761904762,
	"loss": 1.8503,
	"step": 210
	},
	{
	"epoch": 0.27,
	"grad_norm": 0.279296875,
	"learning_rate": 0.00017597402597402598,
	"loss": 1.8168,
	"step": 211
	},
	{
	"epoch": 0.27,
	"grad_norm": 0.26953125,
	"learning_rate": 0.00017575757575757578,
	"loss": 1.7903,
	"step": 212
	},
	{
	"epoch": 0.27,
	"grad_norm": 0.287109375,
	"learning_rate": 0.00017554112554112554,
	"loss": 1.8139,
	"step": 213
	},
	{
	"epoch": 0.27,
	"grad_norm": 0.26171875,
	"learning_rate": 0.00017532467532467534,
	"loss": 1.7982,
	"step": 214
	},
	{
	"epoch": 0.28,
	"grad_norm": 0.291015625,
	"learning_rate": 0.0001751082251082251,
	"loss": 1.7828,
	"step": 215
	},
	{
	"epoch": 0.28,
	"grad_norm": 0.26953125,
	"learning_rate": 0.0001748917748917749,
	"loss": 1.7416,
	"step": 216
	},
	{
	"epoch": 0.28,
	"grad_norm": 0.34375,
	"learning_rate": 0.0001746753246753247,
	"loss": 1.806,
	"step": 217
	},
	{
	"epoch": 0.28,
	"grad_norm": 0.265625,
	"learning_rate": 0.00017445887445887448,
	"loss": 1.8822,
	"step": 218
	},
	{
	"epoch": 0.28,
	"grad_norm": 0.345703125,
	"learning_rate": 0.00017424242424242425,
	"loss": 1.7818,
	"step": 219
	},
	{
	"epoch": 0.28,
	"grad_norm": 0.275390625,
	"learning_rate": 0.00017402597402597401,
	"loss": 1.843,
	"step": 220
	},
	{
	"epoch": 0.28,
	"grad_norm": 0.33984375,
	"learning_rate": 0.00017380952380952383,
	"loss": 1.8087,
	"step": 221
	},
	{
	"epoch": 0.28,
	"grad_norm": 0.27734375,
	"learning_rate": 0.0001735930735930736,
	"loss": 1.8346,
	"step": 222
	},
	{
	"epoch": 0.29,
	"grad_norm": 0.296875,
	"learning_rate": 0.0001733766233766234,
	"loss": 1.7843,
	"step": 223
	},
	{
	"epoch": 0.29,
	"grad_norm": 0.330078125,
	"learning_rate": 0.00017316017316017316,
	"loss": 1.7974,
	"step": 224
	},
	{
	"epoch": 0.29,
	"grad_norm": 0.26953125,
	"learning_rate": 0.00017294372294372295,
	"loss": 1.7798,
	"step": 225
	},
	{
	"epoch": 0.29,
	"grad_norm": 0.287109375,
	"learning_rate": 0.00017272727272727275,
	"loss": 1.7318,
	"step": 226
	},
	{
	"epoch": 0.29,
	"grad_norm": 0.27734375,
	"learning_rate": 0.0001725108225108225,
	"loss": 1.8244,
	"step": 227
	},
	{
	"epoch": 0.29,
	"grad_norm": 0.345703125,
	"learning_rate": 0.0001722943722943723,
	"loss": 1.8291,
	"step": 228
	},
	{
	"epoch": 0.29,
	"grad_norm": 0.275390625,
	"learning_rate": 0.00017207792207792207,
	"loss": 1.7127,
	"step": 229
	},
	{
	"epoch": 0.29,
	"grad_norm": 0.27734375,
	"learning_rate": 0.00017186147186147187,
	"loss": 1.8416,
	"step": 230
	},
	{
	"epoch": 0.3,
	"grad_norm": 0.283203125,
	"learning_rate": 0.00017164502164502166,
	"loss": 1.8327,
	"step": 231
	},
	{
	"epoch": 0.3,
	"grad_norm": 0.29296875,
	"learning_rate": 0.00017142857142857143,
	"loss": 1.7058,
	"step": 232
	},
	{
	"epoch": 0.3,
	"grad_norm": 0.3046875,
	"learning_rate": 0.00017121212121212122,
	"loss": 1.7438,
	"step": 233
	},
	{
	"epoch": 0.3,
	"grad_norm": 0.298828125,
	"learning_rate": 0.00017099567099567098,
	"loss": 1.822,
	"step": 234
	},
	{
	"epoch": 0.3,
	"grad_norm": 0.275390625,
	"learning_rate": 0.0001707792207792208,
	"loss": 1.8308,
	"step": 235
	},
	{
	"epoch": 0.3,
	"grad_norm": 0.283203125,
	"learning_rate": 0.00017056277056277057,
	"loss": 1.8354,
	"step": 236
	},
	{
	"epoch": 0.3,
	"grad_norm": 0.265625,
	"learning_rate": 0.00017034632034632036,
	"loss": 1.7753,
	"step": 237
	},
	{
	"epoch": 0.3,
	"grad_norm": 0.337890625,
	"learning_rate": 0.00017012987012987013,
	"loss": 1.8091,
	"step": 238
	},
	{
	"epoch": 0.31,
	"grad_norm": 0.3046875,
	"learning_rate": 0.00016991341991341992,
	"loss": 1.8526,
	"step": 239
	},
	{
	"epoch": 0.31,
	"grad_norm": 0.33984375,
	"learning_rate": 0.00016969696969696972,
	"loss": 1.8296,
	"step": 240
	},
	{
	"epoch": 0.31,
	"grad_norm": 0.322265625,
	"learning_rate": 0.00016948051948051948,
	"loss": 1.8309,
	"step": 241
	},
	{
	"epoch": 0.31,
	"grad_norm": 0.310546875,
	"learning_rate": 0.00016926406926406928,
	"loss": 1.7423,
	"step": 242
	},
	{
	"epoch": 0.31,
	"grad_norm": 0.310546875,
	"learning_rate": 0.00016904761904761904,
	"loss": 1.7644,
	"step": 243
	},
	{
	"epoch": 0.31,
	"grad_norm": 0.30859375,
	"learning_rate": 0.00016883116883116884,
	"loss": 1.7392,
	"step": 244
	},
	{
	"epoch": 0.31,
	"grad_norm": 0.349609375,
	"learning_rate": 0.00016861471861471863,
	"loss": 1.8515,
	"step": 245
	},
	{
	"epoch": 0.31,
	"grad_norm": 0.30859375,
	"learning_rate": 0.0001683982683982684,
	"loss": 1.751,
	"step": 246
	},
	{
	"epoch": 0.32,
	"grad_norm": 0.3046875,
	"learning_rate": 0.0001681818181818182,
	"loss": 1.8196,
	"step": 247
	},
	{
	"epoch": 0.32,
	"grad_norm": 0.28515625,
	"learning_rate": 0.00016796536796536798,
	"loss": 1.8556,
	"step": 248
	},
	{
	"epoch": 0.32,
	"grad_norm": 0.390625,
	"learning_rate": 0.00016774891774891778,
	"loss": 1.7934,
	"step": 249
	},
	{
	"epoch": 0.32,
	"grad_norm": 0.29296875,
	"learning_rate": 0.00016753246753246754,
	"loss": 1.753,
	"step": 250
	},
	{
	"epoch": 0.32,
	"eval_loss": 1.7896223068237305,
	"eval_runtime": 125.3191,
	"eval_samples_per_second": 39.898,
	"eval_steps_per_second": 1.253,
	"step": 250
	},
	{
	"epoch": 0.32,
	"grad_norm": 0.2890625,
	"learning_rate": 0.0001673160173160173,
	"loss": 1.8485,
	"step": 251
	},
	{
	"epoch": 0.32,
	"grad_norm": 0.33984375,
	"learning_rate": 0.0001670995670995671,
	"loss": 1.7847,
	"step": 252
	},
	{
	"epoch": 0.32,
	"grad_norm": 0.294921875,
	"learning_rate": 0.0001668831168831169,
	"loss": 1.688,
	"step": 253
	},
	{
	"epoch": 0.33,
	"grad_norm": 0.271484375,
	"learning_rate": 0.0001666666666666667,
	"loss": 1.7079,
	"step": 254
	},
	{
	"epoch": 0.33,
	"grad_norm": 0.33203125,
	"learning_rate": 0.00016645021645021645,
	"loss": 1.7858,
	"step": 255
	},
	{
	"epoch": 0.33,
	"grad_norm": 0.310546875,
	"learning_rate": 0.00016623376623376625,
	"loss": 1.8125,
	"step": 256
	},
	{
	"epoch": 0.33,
	"grad_norm": 0.32421875,
	"learning_rate": 0.00016601731601731601,
	"loss": 1.7826,
	"step": 257
	},
	{
	"epoch": 0.33,
	"grad_norm": 0.3515625,
	"learning_rate": 0.0001658008658008658,
	"loss": 1.7352,
	"step": 258
	},
	{
	"epoch": 0.33,
	"grad_norm": 0.357421875,
	"learning_rate": 0.0001655844155844156,
	"loss": 1.7863,
	"step": 259
	},
	{
	"epoch": 0.33,
	"grad_norm": 0.283203125,
	"learning_rate": 0.00016536796536796537,
	"loss": 1.7868,
	"step": 260
	},
	{
	"epoch": 0.33,
	"grad_norm": 0.279296875,
	"learning_rate": 0.00016515151515151516,
	"loss": 1.7958,
	"step": 261
	},
	{
	"epoch": 0.34,
	"grad_norm": 0.29296875,
	"learning_rate": 0.00016493506493506495,
	"loss": 1.7685,
	"step": 262
	},
	{
	"epoch": 0.34,
	"grad_norm": 0.279296875,
	"learning_rate": 0.00016471861471861472,
	"loss": 1.7561,
	"step": 263
	},
	{
	"epoch": 0.34,
	"grad_norm": 0.2890625,
	"learning_rate": 0.0001645021645021645,
	"loss": 1.7634,
	"step": 264
	},
	{
	"epoch": 0.34,
	"grad_norm": 0.296875,
	"learning_rate": 0.00016428571428571428,
	"loss": 1.8092,
	"step": 265
	},
	{
	"epoch": 0.34,
	"grad_norm": 0.31640625,
	"learning_rate": 0.00016406926406926407,
	"loss": 1.8324,
	"step": 266
	},
	{
	"epoch": 0.34,
	"grad_norm": 0.28125,
	"learning_rate": 0.00016385281385281387,
	"loss": 1.7221,
	"step": 267
	},
	{
	"epoch": 0.34,
	"grad_norm": 0.298828125,
	"learning_rate": 0.00016363636363636366,
	"loss": 1.7796,
	"step": 268
	},
	{
	"epoch": 0.34,
	"grad_norm": 0.302734375,
	"learning_rate": 0.00016341991341991343,
	"loss": 1.7713,
	"step": 269
	},
	{
	"epoch": 0.35,
	"grad_norm": 0.265625,
	"learning_rate": 0.0001632034632034632,
	"loss": 1.8178,
	"step": 270
	},
	{
	"epoch": 0.35,
	"grad_norm": 0.404296875,
	"learning_rate": 0.000162987012987013,
	"loss": 1.8104,
	"step": 271
	},
	{
	"epoch": 0.35,
	"grad_norm": 0.3125,
	"learning_rate": 0.00016277056277056278,
	"loss": 1.8387,
	"step": 272
	},
	{
	"epoch": 0.35,
	"grad_norm": 0.310546875,
	"learning_rate": 0.00016255411255411257,
	"loss": 1.8897,
	"step": 273
	},
	{
	"epoch": 0.35,
	"grad_norm": 0.28125,
	"learning_rate": 0.00016233766233766234,
	"loss": 1.8162,
	"step": 274
	},
	{
	"epoch": 0.35,
	"grad_norm": 0.40625,
	"learning_rate": 0.00016212121212121213,
	"loss": 1.8086,
	"step": 275
	},
	{
	"epoch": 0.35,
	"grad_norm": 0.283203125,
	"learning_rate": 0.00016190476190476192,
	"loss": 1.9186,
	"step": 276
	},
	{
	"epoch": 0.35,
	"grad_norm": 0.345703125,
	"learning_rate": 0.0001616883116883117,
	"loss": 1.8123,
	"step": 277
	},
	{
	"epoch": 0.36,
	"grad_norm": 0.27734375,
	"learning_rate": 0.00016147186147186148,
	"loss": 1.8115,
	"step": 278
	},
	{
	"epoch": 0.36,
	"grad_norm": 0.33203125,
	"learning_rate": 0.00016125541125541125,
	"loss": 1.8785,
	"step": 279
	},
	{
	"epoch": 0.36,
	"grad_norm": 0.3125,
	"learning_rate": 0.00016103896103896104,
	"loss": 1.805,
	"step": 280
	},
	{
	"epoch": 0.36,
	"grad_norm": 0.3046875,
	"learning_rate": 0.00016082251082251084,
	"loss": 1.8656,
	"step": 281
	},
	{
	"epoch": 0.36,
	"grad_norm": 0.3515625,
	"learning_rate": 0.0001606060606060606,
	"loss": 1.7848,
	"step": 282
	},
	{
	"epoch": 0.36,
	"grad_norm": 0.3125,
	"learning_rate": 0.0001603896103896104,
	"loss": 1.7817,
	"step": 283
	},
	{
	"epoch": 0.36,
	"grad_norm": 0.30078125,
	"learning_rate": 0.00016017316017316016,
	"loss": 1.8217,
	"step": 284
	},
	{
	"epoch": 0.36,
	"grad_norm": 0.318359375,
	"learning_rate": 0.00015995670995670998,
	"loss": 1.8771,
	"step": 285
	},
	{
	"epoch": 0.37,
	"grad_norm": 0.31640625,
	"learning_rate": 0.00015974025974025975,
	"loss": 1.8028,
	"step": 286
	},
	{
	"epoch": 0.37,
	"grad_norm": 0.291015625,
	"learning_rate": 0.00015952380952380954,
	"loss": 1.8331,
	"step": 287
	},
	{
	"epoch": 0.37,
	"grad_norm": 0.357421875,
	"learning_rate": 0.0001593073593073593,
	"loss": 1.7528,
	"step": 288
	},
	{
	"epoch": 0.37,
	"grad_norm": 0.267578125,
	"learning_rate": 0.0001590909090909091,
	"loss": 1.8012,
	"step": 289
	},
	{
	"epoch": 0.37,
	"grad_norm": 0.2734375,
	"learning_rate": 0.0001588744588744589,
	"loss": 1.7787,
	"step": 290
	},
	{
	"epoch": 0.37,
	"grad_norm": 0.3125,
	"learning_rate": 0.00015865800865800866,
	"loss": 1.803,
	"step": 291
	},
	{
	"epoch": 0.37,
	"grad_norm": 0.4375,
	"learning_rate": 0.00015844155844155845,
	"loss": 1.7466,
	"step": 292
	},
	{
	"epoch": 0.37,
	"grad_norm": 0.30078125,
	"learning_rate": 0.00015822510822510822,
	"loss": 1.8551,
	"step": 293
	},
	{
	"epoch": 0.38,
	"grad_norm": 0.279296875,
	"learning_rate": 0.00015800865800865801,
	"loss": 1.7932,
	"step": 294
	},
	{
	"epoch": 0.38,
	"grad_norm": 0.359375,
	"learning_rate": 0.0001577922077922078,
	"loss": 1.7295,
	"step": 295
	},
	{
	"epoch": 0.38,
	"grad_norm": 0.318359375,
	"learning_rate": 0.00015757575757575757,
	"loss": 1.7984,
	"step": 296
	},
	{
	"epoch": 0.38,
	"grad_norm": 0.40234375,
	"learning_rate": 0.00015735930735930737,
	"loss": 1.8666,
	"step": 297
	},
	{
	"epoch": 0.38,
	"grad_norm": 0.287109375,
	"learning_rate": 0.00015714285714285716,
	"loss": 1.7149,
	"step": 298
	},
	{
	"epoch": 0.38,
	"grad_norm": 0.462890625,
	"learning_rate": 0.00015692640692640695,
	"loss": 1.7447,
	"step": 299
	},
	{
	"epoch": 0.38,
	"grad_norm": 0.4296875,
	"learning_rate": 0.00015670995670995672,
	"loss": 1.8199,
	"step": 300
	},
	{
	"epoch": 0.38,
	"eval_loss": 1.7805144786834717,
	"eval_runtime": 125.5041,
	"eval_samples_per_second": 39.839,
	"eval_steps_per_second": 1.251,
	"step": 300
	},
	{
	"epoch": 0.39,
	"grad_norm": 0.29296875,
	"learning_rate": 0.00015649350649350649,
	"loss": 1.7937,
	"step": 301
	},
	{
	"epoch": 0.39,
	"grad_norm": 0.400390625,
	"learning_rate": 0.00015627705627705628,
	"loss": 1.7617,
	"step": 302
	},
	{
	"epoch": 0.39,
	"grad_norm": 0.435546875,
	"learning_rate": 0.00015606060606060607,
	"loss": 1.7755,
	"step": 303
	},
	{
	"epoch": 0.39,
	"grad_norm": 0.326171875,
	"learning_rate": 0.00015584415584415587,
	"loss": 1.862,
	"step": 304
	},
	{
	"epoch": 0.39,
	"grad_norm": 0.365234375,
	"learning_rate": 0.00015562770562770563,
	"loss": 1.8572,
	"step": 305
	},
	{
	"epoch": 0.39,
	"grad_norm": 0.330078125,
	"learning_rate": 0.00015541125541125543,
	"loss": 1.7678,
	"step": 306
	},
	{
	"epoch": 0.39,
	"grad_norm": 0.359375,
	"learning_rate": 0.0001551948051948052,
	"loss": 1.7435,
	"step": 307
	},
	{
	"epoch": 0.39,
	"grad_norm": 0.326171875,
	"learning_rate": 0.00015497835497835498,
	"loss": 1.7535,
	"step": 308
	},
	{
	"epoch": 0.4,
	"grad_norm": 0.39453125,
	"learning_rate": 0.00015476190476190478,
	"loss": 1.7495,
	"step": 309
	},
	{
	"epoch": 0.4,
	"grad_norm": 0.28515625,
	"learning_rate": 0.00015454545454545454,
	"loss": 1.7107,
	"step": 310
	},
	{
	"epoch": 0.4,
	"grad_norm": 0.298828125,
	"learning_rate": 0.00015432900432900434,
	"loss": 1.7419,
	"step": 311
	},
	{
	"epoch": 0.4,
	"grad_norm": 0.296875,
	"learning_rate": 0.00015411255411255413,
	"loss": 1.7466,
	"step": 312
	},
	{
	"epoch": 0.4,
	"grad_norm": 0.3046875,
	"learning_rate": 0.0001538961038961039,
	"loss": 1.8045,
	"step": 313
	},
	{
	"epoch": 0.4,
	"grad_norm": 0.369140625,
	"learning_rate": 0.0001536796536796537,
	"loss": 1.7756,
	"step": 314
	},
	{
	"epoch": 0.4,
	"grad_norm": 0.35546875,
	"learning_rate": 0.00015346320346320346,
	"loss": 1.8018,
	"step": 315
	},
	{
	"epoch": 0.4,
	"grad_norm": 0.287109375,
	"learning_rate": 0.00015324675324675325,
	"loss": 1.8246,
	"step": 316
	},
	{
	"epoch": 0.41,
	"grad_norm": 0.28515625,
	"learning_rate": 0.00015303030303030304,
	"loss": 1.761,
	"step": 317
	},
	{
	"epoch": 0.41,
	"grad_norm": 0.29296875,
	"learning_rate": 0.00015281385281385284,
	"loss": 1.773,
	"step": 318
	},
	{
	"epoch": 0.41,
	"grad_norm": 0.345703125,
	"learning_rate": 0.0001525974025974026,
	"loss": 1.7889,
	"step": 319
	},
	{
	"epoch": 0.41,
	"grad_norm": 0.38671875,
	"learning_rate": 0.00015238095238095237,
	"loss": 1.827,
	"step": 320
	},
	{
	"epoch": 0.41,
	"grad_norm": 0.333984375,
	"learning_rate": 0.0001521645021645022,
	"loss": 1.7928,
	"step": 321
	},
	{
	"epoch": 0.41,
	"grad_norm": 0.33203125,
	"learning_rate": 0.00015194805194805196,
	"loss": 1.7618,
	"step": 322
	},
	{
	"epoch": 0.41,
	"grad_norm": 0.36328125,
	"learning_rate": 0.00015173160173160175,
	"loss": 1.7576,
	"step": 323
	},
	{
	"epoch": 0.41,
	"grad_norm": 0.2890625,
	"learning_rate": 0.00015151515151515152,
	"loss": 1.6683,
	"step": 324
	},
	{
	"epoch": 0.42,
	"grad_norm": 0.27734375,
	"learning_rate": 0.0001512987012987013,
	"loss": 1.7967,
	"step": 325
	},
	{
	"epoch": 0.42,
	"grad_norm": 0.28125,
	"learning_rate": 0.0001510822510822511,
	"loss": 1.8209,
	"step": 326
	},
	{
	"epoch": 0.42,
	"grad_norm": 0.39453125,
	"learning_rate": 0.00015086580086580087,
	"loss": 1.8743,
	"step": 327
	},
	{
	"epoch": 0.42,
	"grad_norm": 0.27734375,
	"learning_rate": 0.00015064935064935066,
	"loss": 1.8559,
	"step": 328
	},
	{
	"epoch": 0.42,
	"grad_norm": 0.337890625,
	"learning_rate": 0.00015043290043290043,
	"loss": 1.7791,
	"step": 329
	},
	{
	"epoch": 0.42,
	"grad_norm": 0.353515625,
	"learning_rate": 0.00015021645021645022,
	"loss": 1.8626,
	"step": 330
	},
	{
	"epoch": 0.42,
	"grad_norm": 0.4140625,
	"learning_rate": 0.00015000000000000001,
	"loss": 1.8169,
	"step": 331
	},
	{
	"epoch": 0.42,
	"grad_norm": 0.373046875,
	"learning_rate": 0.00014978354978354978,
	"loss": 1.8006,
	"step": 332
	},
	{
	"epoch": 0.43,
	"grad_norm": 0.302734375,
	"learning_rate": 0.00014956709956709957,
	"loss": 1.8818,
	"step": 333
	},
	{
	"epoch": 0.43,
	"grad_norm": 0.423828125,
	"learning_rate": 0.00014935064935064934,
	"loss": 1.7898,
	"step": 334
	},
	{
	"epoch": 0.43,
	"grad_norm": 0.314453125,
	"learning_rate": 0.00014913419913419916,
	"loss": 1.7639,
	"step": 335
	},
	{
	"epoch": 0.43,
	"grad_norm": 0.375,
	"learning_rate": 0.00014891774891774893,
	"loss": 1.8276,
	"step": 336
	},
	{
	"epoch": 0.43,
	"grad_norm": 0.34375,
	"learning_rate": 0.00014870129870129872,
	"loss": 1.8718,
	"step": 337
	},
	{
	"epoch": 0.43,
	"grad_norm": 0.310546875,
	"learning_rate": 0.00014848484848484849,
	"loss": 1.7142,
	"step": 338
	},
	{
	"epoch": 0.43,
	"grad_norm": 0.330078125,
	"learning_rate": 0.00014826839826839828,
	"loss": 1.8261,
	"step": 339
	},
	{
	"epoch": 0.44,
	"grad_norm": 0.328125,
	"learning_rate": 0.00014805194805194807,
	"loss": 1.8177,
	"step": 340
	},
	{
	"epoch": 0.44,
	"grad_norm": 0.357421875,
	"learning_rate": 0.00014783549783549784,
	"loss": 1.739,
	"step": 341
	},
	{
	"epoch": 0.44,
	"grad_norm": 0.291015625,
	"learning_rate": 0.00014761904761904763,
	"loss": 1.7867,
	"step": 342
	},
	{
	"epoch": 0.44,
	"grad_norm": 0.380859375,
	"learning_rate": 0.0001474025974025974,
	"loss": 1.7825,
	"step": 343
	},
	{
	"epoch": 0.44,
	"grad_norm": 0.29296875,
	"learning_rate": 0.0001471861471861472,
	"loss": 1.7784,
	"step": 344
	},
	{
	"epoch": 0.44,
	"grad_norm": 0.2890625,
	"learning_rate": 0.00014696969696969698,
	"loss": 1.8077,
	"step": 345
	},
	{
	"epoch": 0.44,
	"grad_norm": 0.296875,
	"learning_rate": 0.00014675324675324675,
	"loss": 1.7616,
	"step": 346
	},
	{
	"epoch": 0.44,
	"grad_norm": 0.2890625,
	"learning_rate": 0.00014653679653679654,
	"loss": 1.7738,
	"step": 347
	},
	{
	"epoch": 0.45,
	"grad_norm": 0.345703125,
	"learning_rate": 0.00014632034632034634,
	"loss": 1.7019,
	"step": 348
	},
	{
	"epoch": 0.45,
	"grad_norm": 0.33984375,
	"learning_rate": 0.00014610389610389613,
	"loss": 1.6911,
	"step": 349
	},
	{
	"epoch": 0.45,
	"grad_norm": 0.3359375,
	"learning_rate": 0.0001458874458874459,
	"loss": 1.7708,
	"step": 350
	},
	{
	"epoch": 0.45,
	"eval_loss": 1.7709890604019165,
	"eval_runtime": 125.3163,
	"eval_samples_per_second": 39.899,
	"eval_steps_per_second": 1.253,
	"step": 350
	},
	{
	"epoch": 0.45,
	"grad_norm": 0.349609375,
	"learning_rate": 0.00014567099567099566,
	"loss": 1.76,
	"step": 351
	},
	{
	"epoch": 0.45,
	"grad_norm": 0.38671875,
	"learning_rate": 0.00014545454545454546,
	"loss": 1.7913,
	"step": 352
	},
	{
	"epoch": 0.45,
	"grad_norm": 0.421875,
	"learning_rate": 0.00014523809523809525,
	"loss": 1.8083,
	"step": 353
	},
	{
	"epoch": 0.45,
	"grad_norm": 0.388671875,
	"learning_rate": 0.00014502164502164504,
	"loss": 1.7886,
	"step": 354
	},
	{
	"epoch": 0.45,
	"grad_norm": 0.265625,
	"learning_rate": 0.0001448051948051948,
	"loss": 1.8125,
	"step": 355
	},
	{
	"epoch": 0.46,
	"grad_norm": 0.353515625,
	"learning_rate": 0.00014458874458874458,
	"loss": 1.6922,
	"step": 356
	},
	{
	"epoch": 0.46,
	"grad_norm": 0.41796875,
	"learning_rate": 0.00014437229437229437,
	"loss": 1.6483,
	"step": 357
	},
	{
	"epoch": 0.46,
	"grad_norm": 0.38671875,
	"learning_rate": 0.00014415584415584416,
	"loss": 1.805,
	"step": 358
	},
	{
	"epoch": 0.46,
	"grad_norm": 0.29296875,
	"learning_rate": 0.00014393939393939396,
	"loss": 1.7458,
	"step": 359
	},
	{
	"epoch": 0.46,
	"grad_norm": 0.416015625,
	"learning_rate": 0.00014372294372294372,
	"loss": 1.7989,
	"step": 360
	},
	{
	"epoch": 0.46,
	"grad_norm": 0.404296875,
	"learning_rate": 0.00014350649350649352,
	"loss": 1.7271,
	"step": 361
	},
	{
	"epoch": 0.46,
	"grad_norm": 0.296875,
	"learning_rate": 0.0001432900432900433,
	"loss": 1.7804,
	"step": 362
	},
	{
	"epoch": 0.46,
	"grad_norm": 0.296875,
	"learning_rate": 0.00014307359307359307,
	"loss": 1.8581,
	"step": 363
	},
	{
	"epoch": 0.47,
	"grad_norm": 0.353515625,
	"learning_rate": 0.00014285714285714287,
	"loss": 1.757,
	"step": 364
	},
	{
	"epoch": 0.47,
	"grad_norm": 0.3203125,
	"learning_rate": 0.00014264069264069263,
	"loss": 1.7536,
	"step": 365
	},
	{
	"epoch": 0.47,
	"grad_norm": 0.3203125,
	"learning_rate": 0.00014242424242424243,
	"loss": 1.7191,
	"step": 366
	},
	{
	"epoch": 0.47,
	"grad_norm": 0.384765625,
	"learning_rate": 0.00014220779220779222,
	"loss": 1.7741,
	"step": 367
	},
	{
	"epoch": 0.47,
	"grad_norm": 0.3359375,
	"learning_rate": 0.00014199134199134201,
	"loss": 1.7626,
	"step": 368
	},
	{
	"epoch": 0.47,
	"grad_norm": 0.4140625,
	"learning_rate": 0.00014177489177489178,
	"loss": 1.8351,
	"step": 369
	},
	{
	"epoch": 0.47,
	"grad_norm": 0.4921875,
	"learning_rate": 0.00014155844155844155,
	"loss": 1.7943,
	"step": 370
	},
	{
	"epoch": 0.47,
	"grad_norm": 0.28515625,
	"learning_rate": 0.00014134199134199137,
	"loss": 1.7729,
	"step": 371
	},
	{
	"epoch": 0.48,
	"grad_norm": 0.36328125,
	"learning_rate": 0.00014112554112554113,
	"loss": 1.853,
	"step": 372
	},
	{
	"epoch": 0.48,
	"grad_norm": 0.373046875,
	"learning_rate": 0.00014090909090909093,
	"loss": 1.7805,
	"step": 373
	},
	{
	"epoch": 0.48,
	"grad_norm": 0.375,
	"learning_rate": 0.0001406926406926407,
	"loss": 1.805,
	"step": 374
	},
	{
	"epoch": 0.48,
	"grad_norm": 0.365234375,
	"learning_rate": 0.00014047619047619049,
	"loss": 1.7717,
	"step": 375
	},
	{
	"epoch": 0.48,
	"grad_norm": 0.375,
	"learning_rate": 0.00014025974025974028,
	"loss": 1.793,
	"step": 376
	},
	{
	"epoch": 0.48,
	"grad_norm": 0.353515625,
	"learning_rate": 0.00014004329004329005,
	"loss": 1.8017,
	"step": 377
	},
	{
	"epoch": 0.48,
	"grad_norm": 0.359375,
	"learning_rate": 0.00013982683982683984,
	"loss": 1.758,
	"step": 378
	},
	{
	"epoch": 0.48,
	"grad_norm": 0.35546875,
	"learning_rate": 0.0001396103896103896,
	"loss": 1.7761,
	"step": 379
	},
	{
	"epoch": 0.49,
	"grad_norm": 0.33203125,
	"learning_rate": 0.0001393939393939394,
	"loss": 1.7626,
	"step": 380
	},
	{
	"epoch": 0.49,
	"grad_norm": 0.361328125,
	"learning_rate": 0.0001391774891774892,
	"loss": 1.8463,
	"step": 381
	},
	{
	"epoch": 0.49,
	"grad_norm": 0.4140625,
	"learning_rate": 0.00013896103896103896,
	"loss": 1.7563,
	"step": 382
	},
	{
	"epoch": 0.49,
	"grad_norm": 0.318359375,
	"learning_rate": 0.00013874458874458875,
	"loss": 1.7085,
	"step": 383
	},
	{
	"epoch": 0.49,
	"grad_norm": 0.337890625,
	"learning_rate": 0.00013852813852813852,
	"loss": 1.8202,
	"step": 384
	},
	{
	"epoch": 0.49,
	"grad_norm": 0.30078125,
	"learning_rate": 0.00013831168831168834,
	"loss": 1.7618,
	"step": 385
	},
	{
	"epoch": 0.49,
	"grad_norm": 0.3046875,
	"learning_rate": 0.0001380952380952381,
	"loss": 1.8142,
	"step": 386
	},
	{
	"epoch": 0.5,
	"grad_norm": 0.318359375,
	"learning_rate": 0.0001378787878787879,
	"loss": 1.6778,
	"step": 387
	},
	{
	"epoch": 0.5,
	"grad_norm": 0.287109375,
	"learning_rate": 0.00013766233766233766,
	"loss": 1.749,
	"step": 388
	},
	{
	"epoch": 0.5,
	"grad_norm": 0.310546875,
	"learning_rate": 0.00013744588744588746,
	"loss": 1.7236,
	"step": 389
	},
	{
	"epoch": 0.5,
	"grad_norm": 0.322265625,
	"learning_rate": 0.00013722943722943725,
	"loss": 1.7329,
	"step": 390
	},
	{
	"epoch": 0.5,
	"grad_norm": 0.384765625,
	"learning_rate": 0.00013701298701298702,
	"loss": 1.8287,
	"step": 391
	},
	{
	"epoch": 0.5,
	"grad_norm": 0.318359375,
	"learning_rate": 0.0001367965367965368,
	"loss": 1.8173,
	"step": 392
	},
	{
	"epoch": 0.5,
	"grad_norm": 0.5234375,
	"learning_rate": 0.00013658008658008658,
	"loss": 1.8881,
	"step": 393
	},
	{
	"epoch": 0.5,
	"grad_norm": 0.4453125,
	"learning_rate": 0.00013636363636363637,
	"loss": 1.7895,
	"step": 394
	},
	{
	"epoch": 0.51,
	"grad_norm": 0.310546875,
	"learning_rate": 0.00013614718614718616,
	"loss": 1.7531,
	"step": 395
	},
	{
	"epoch": 0.51,
	"grad_norm": 0.337890625,
	"learning_rate": 0.00013593073593073593,
	"loss": 1.7601,
	"step": 396
	},
	{
	"epoch": 0.51,
	"grad_norm": 0.353515625,
	"learning_rate": 0.00013571428571428572,
	"loss": 1.6574,
	"step": 397
	},
	{
	"epoch": 0.51,
	"grad_norm": 0.345703125,
	"learning_rate": 0.0001354978354978355,
	"loss": 1.7577,
	"step": 398
	},
	{
	"epoch": 0.51,
	"grad_norm": 0.455078125,
	"learning_rate": 0.0001352813852813853,
	"loss": 1.6886,
	"step": 399
	},
	{
	"epoch": 0.51,
	"grad_norm": 0.337890625,
	"learning_rate": 0.00013506493506493507,
	"loss": 1.7786,
	"step": 400
	},
	{
	"epoch": 0.51,
	"eval_loss": 1.7625454664230347,
	"eval_runtime": 125.3037,
	"eval_samples_per_second": 39.903,
	"eval_steps_per_second": 1.253,
	"step": 400
	},
	{
	"epoch": 0.51,
	"grad_norm": 0.392578125,
	"learning_rate": 0.00013484848484848484,
	"loss": 1.7421,
	"step": 401
	},
	{
	"epoch": 0.51,
	"grad_norm": 0.53125,
	"learning_rate": 0.00013463203463203463,
	"loss": 1.7626,
	"step": 402
	},
	{
	"epoch": 0.52,
	"grad_norm": 0.34765625,
	"learning_rate": 0.00013441558441558443,
	"loss": 1.7943,
	"step": 403
	},
	{
	"epoch": 0.52,
	"grad_norm": 0.435546875,
	"learning_rate": 0.00013419913419913422,
	"loss": 1.8116,
	"step": 404
	},
	{
	"epoch": 0.52,
	"grad_norm": 0.419921875,
	"learning_rate": 0.000133982683982684,
	"loss": 1.7532,
	"step": 405
	},
	{
	"epoch": 0.52,
	"grad_norm": 0.345703125,
	"learning_rate": 0.00013376623376623375,
	"loss": 1.7478,
	"step": 406
	},
	{
	"epoch": 0.52,
	"grad_norm": 0.40234375,
	"learning_rate": 0.00013354978354978355,
	"loss": 1.8208,
	"step": 407
	},
	{
	"epoch": 0.52,
	"grad_norm": 0.314453125,
	"learning_rate": 0.00013333333333333334,
	"loss": 1.7518,
	"step": 408
	},
	{
	"epoch": 0.52,
	"grad_norm": 0.470703125,
	"learning_rate": 0.00013311688311688313,
	"loss": 1.8063,
	"step": 409
	},
	{
	"epoch": 0.52,
	"grad_norm": 0.419921875,
	"learning_rate": 0.0001329004329004329,
	"loss": 1.7366,
	"step": 410
	},
	{
	"epoch": 0.53,
	"grad_norm": 0.326171875,
	"learning_rate": 0.0001326839826839827,
	"loss": 1.7605,
	"step": 411
	},
	{
	"epoch": 0.53,
	"grad_norm": 0.287109375,
	"learning_rate": 0.00013246753246753249,
	"loss": 1.8454,
	"step": 412
	},
	{
	"epoch": 0.53,
	"grad_norm": 0.376953125,
	"learning_rate": 0.00013225108225108225,
	"loss": 1.7342,
	"step": 413
	},
	{
	"epoch": 0.53,
	"grad_norm": 0.416015625,
	"learning_rate": 0.00013203463203463205,
	"loss": 1.7031,
	"step": 414
	},
	{
	"epoch": 0.53,
	"grad_norm": 0.3984375,
	"learning_rate": 0.0001318181818181818,
	"loss": 1.8397,
	"step": 415
	},
	{
	"epoch": 0.53,
	"grad_norm": 0.369140625,
	"learning_rate": 0.0001316017316017316,
	"loss": 1.8038,
	"step": 416
	},
	{
	"epoch": 0.53,
	"grad_norm": 0.369140625,
	"learning_rate": 0.0001313852813852814,
	"loss": 1.7529,
	"step": 417
	},
	{
	"epoch": 0.53,
	"grad_norm": 0.396484375,
	"learning_rate": 0.0001311688311688312,
	"loss": 1.7746,
	"step": 418
	},
	{
	"epoch": 0.54,
	"grad_norm": 0.31640625,
	"learning_rate": 0.00013095238095238096,
	"loss": 1.8441,
	"step": 419
	},
	{
	"epoch": 0.54,
	"grad_norm": 0.287109375,
	"learning_rate": 0.00013073593073593072,
	"loss": 1.7302,
	"step": 420
	},
	{
	"epoch": 0.54,
	"grad_norm": 0.470703125,
	"learning_rate": 0.00013051948051948052,
	"loss": 1.7149,
	"step": 421
	},
	{
	"epoch": 0.54,
	"grad_norm": 0.330078125,
	"learning_rate": 0.0001303030303030303,
	"loss": 1.6881,
	"step": 422
	},
	{
	"epoch": 0.54,
	"grad_norm": 0.28515625,
	"learning_rate": 0.0001300865800865801,
	"loss": 1.7627,
	"step": 423
	},
	{
	"epoch": 0.54,
	"grad_norm": 0.310546875,
	"learning_rate": 0.00012987012987012987,
	"loss": 1.7386,
	"step": 424
	},
	{
	"epoch": 0.54,
	"grad_norm": 0.310546875,
	"learning_rate": 0.00012965367965367964,
	"loss": 1.7331,
	"step": 425
	},
	{
	"epoch": 0.55,
	"grad_norm": 0.384765625,
	"learning_rate": 0.00012943722943722946,
	"loss": 1.766,
	"step": 426
	},
	{
	"epoch": 0.55,
	"grad_norm": 0.3828125,
	"learning_rate": 0.00012922077922077922,
	"loss": 1.7111,
	"step": 427
	},
	{
	"epoch": 0.55,
	"grad_norm": 0.37890625,
	"learning_rate": 0.00012900432900432902,
	"loss": 1.7874,
	"step": 428
	},
	{
	"epoch": 0.55,
	"grad_norm": 0.380859375,
	"learning_rate": 0.00012878787878787878,
	"loss": 1.7565,
	"step": 429
	},
	{
	"epoch": 0.55,
	"grad_norm": 0.37109375,
	"learning_rate": 0.00012857142857142858,
	"loss": 1.8451,
	"step": 430
	},
	{
	"epoch": 0.55,
	"grad_norm": 0.419921875,
	"learning_rate": 0.00012835497835497837,
	"loss": 1.7024,
	"step": 431
	},
	{
	"epoch": 0.55,
	"grad_norm": 0.294921875,
	"learning_rate": 0.00012813852813852814,
	"loss": 1.7037,
	"step": 432
	},
	{
	"epoch": 0.55,
	"grad_norm": 0.310546875,
	"learning_rate": 0.00012792207792207793,
	"loss": 1.7166,
	"step": 433
	},
	{
	"epoch": 0.56,
	"grad_norm": 0.404296875,
	"learning_rate": 0.0001277056277056277,
	"loss": 1.7534,
	"step": 434
	},
	{
	"epoch": 0.56,
	"grad_norm": 0.3359375,
	"learning_rate": 0.00012748917748917752,
	"loss": 1.7315,
	"step": 435
	},
	{
	"epoch": 0.56,
	"grad_norm": 0.349609375,
	"learning_rate": 0.00012727272727272728,
	"loss": 1.7707,
	"step": 436
	},
	{
	"epoch": 0.56,
	"grad_norm": 0.328125,
	"learning_rate": 0.00012705627705627707,
	"loss": 1.811,
	"step": 437
	},
	{
	"epoch": 0.56,
	"grad_norm": 0.2890625,
	"learning_rate": 0.00012683982683982684,
	"loss": 1.8211,
	"step": 438
	},
	{
	"epoch": 0.56,
	"grad_norm": 0.330078125,
	"learning_rate": 0.00012662337662337663,
	"loss": 1.6992,
	"step": 439
	},
	{
	"epoch": 0.56,
	"grad_norm": 0.33203125,
	"learning_rate": 0.00012640692640692643,
	"loss": 1.8294,
	"step": 440
	},
	{
	"epoch": 0.56,
	"grad_norm": 0.326171875,
	"learning_rate": 0.0001261904761904762,
	"loss": 1.7351,
	"step": 441
	},
	{
	"epoch": 0.57,
	"grad_norm": 0.384765625,
	"learning_rate": 0.000125974025974026,
	"loss": 1.732,
	"step": 442
	},
	{
	"epoch": 0.57,
	"grad_norm": 0.41796875,
	"learning_rate": 0.00012575757575757575,
	"loss": 1.7839,
	"step": 443
	},
	{
	"epoch": 0.57,
	"grad_norm": 0.34375,
	"learning_rate": 0.00012554112554112555,
	"loss": 1.7892,
	"step": 444
	},
	{
	"epoch": 0.57,
	"grad_norm": 0.37890625,
	"learning_rate": 0.00012532467532467534,
	"loss": 1.7428,
	"step": 445
	},
	{
	"epoch": 0.57,
	"grad_norm": 0.3359375,
	"learning_rate": 0.0001251082251082251,
	"loss": 1.8055,
	"step": 446
	},
	{
	"epoch": 0.57,
	"grad_norm": 0.384765625,
	"learning_rate": 0.0001248917748917749,
	"loss": 1.8227,
	"step": 447
	},
	{
	"epoch": 0.57,
	"grad_norm": 0.349609375,
	"learning_rate": 0.00012467532467532467,
	"loss": 1.7273,
	"step": 448
	},
	{
	"epoch": 0.57,
	"grad_norm": 0.328125,
	"learning_rate": 0.00012445887445887449,
	"loss": 1.8338,
	"step": 449
	},
	{
	"epoch": 0.58,
	"grad_norm": 0.4140625,
	"learning_rate": 0.00012424242424242425,
	"loss": 1.71,
	"step": 450
	},
	{
	"epoch": 0.58,
	"eval_loss": 1.7571938037872314,
	"eval_runtime": 125.7305,
	"eval_samples_per_second": 39.768,
	"eval_steps_per_second": 1.249,
	"step": 450
	},
	{
	"epoch": 0.58,
	"grad_norm": 0.408203125,
	"learning_rate": 0.00012402597402597402,
	"loss": 1.6852,
	"step": 451
	},
	{
	"epoch": 0.58,
	"grad_norm": 0.3125,
	"learning_rate": 0.0001238095238095238,
	"loss": 1.7876,
	"step": 452
	},
	{
	"epoch": 0.58,
	"grad_norm": 0.314453125,
	"learning_rate": 0.0001235930735930736,
	"loss": 1.7676,
	"step": 453
	},
	{
	"epoch": 0.58,
	"grad_norm": 0.439453125,
	"learning_rate": 0.0001233766233766234,
	"loss": 1.7567,
	"step": 454
	},
	{
	"epoch": 0.58,
	"grad_norm": 0.294921875,
	"learning_rate": 0.00012316017316017316,
	"loss": 1.7835,
	"step": 455
	},
	{
	"epoch": 0.58,
	"grad_norm": 0.3046875,
	"learning_rate": 0.00012294372294372293,
	"loss": 1.8254,
	"step": 456
	},
	{
	"epoch": 0.58,
	"grad_norm": 0.30078125,
	"learning_rate": 0.00012272727272727272,
	"loss": 1.7415,
	"step": 457
	},
	{
	"epoch": 0.59,
	"grad_norm": 0.3359375,
	"learning_rate": 0.00012251082251082252,
	"loss": 1.7718,
	"step": 458
	},
	{
	"epoch": 0.59,
	"grad_norm": 0.361328125,
	"learning_rate": 0.0001222943722943723,
	"loss": 1.7734,
	"step": 459
	},
	{
	"epoch": 0.59,
	"grad_norm": 0.3125,
	"learning_rate": 0.00012207792207792208,
	"loss": 1.7413,
	"step": 460
	},
	{
	"epoch": 0.59,
	"grad_norm": 0.283203125,
	"learning_rate": 0.00012186147186147187,
	"loss": 1.8832,
	"step": 461
	},
	{
	"epoch": 0.59,
	"grad_norm": 0.328125,
	"learning_rate": 0.00012164502164502165,
	"loss": 1.71,
	"step": 462
	},
	{
	"epoch": 0.59,
	"grad_norm": 0.3203125,
	"learning_rate": 0.00012142857142857143,
	"loss": 1.7611,
	"step": 463
	},
	{
	"epoch": 0.59,
	"grad_norm": 0.373046875,
	"learning_rate": 0.00012121212121212122,
	"loss": 1.809,
	"step": 464
	},
	{
	"epoch": 0.6,
	"grad_norm": 0.29296875,
	"learning_rate": 0.00012099567099567099,
	"loss": 1.6685,
	"step": 465
	},
	{
	"epoch": 0.6,
	"grad_norm": 0.310546875,
	"learning_rate": 0.0001207792207792208,
	"loss": 1.7246,
	"step": 466
	},
	{
	"epoch": 0.6,
	"grad_norm": 0.30859375,
	"learning_rate": 0.00012056277056277056,
	"loss": 1.7187,
	"step": 467
	},
	{
	"epoch": 0.6,
	"grad_norm": 0.296875,
	"learning_rate": 0.00012034632034632037,
	"loss": 1.7165,
	"step": 468
	},
	{
	"epoch": 0.6,
	"grad_norm": 0.353515625,
	"learning_rate": 0.00012012987012987014,
	"loss": 1.7451,
	"step": 469
	},
	{
	"epoch": 0.6,
	"grad_norm": 0.33203125,
	"learning_rate": 0.00011991341991341991,
	"loss": 1.7889,
	"step": 470
	},
	{
	"epoch": 0.6,
	"grad_norm": 0.34375,
	"learning_rate": 0.00011969696969696971,
	"loss": 1.8299,
	"step": 471
	},
	{
	"epoch": 0.6,
	"grad_norm": 0.314453125,
	"learning_rate": 0.00011948051948051949,
	"loss": 1.7567,
	"step": 472
	},
	{
	"epoch": 0.61,
	"grad_norm": 0.3359375,
	"learning_rate": 0.00011926406926406928,
	"loss": 1.7335,
	"step": 473
	},
	{
	"epoch": 0.61,
	"grad_norm": 0.30859375,
	"learning_rate": 0.00011904761904761905,
	"loss": 1.7394,
	"step": 474
	},
	{
	"epoch": 0.61,
	"grad_norm": 0.34765625,
	"learning_rate": 0.00011883116883116883,
	"loss": 1.7834,
	"step": 475
	},
	{
	"epoch": 0.61,
	"grad_norm": 0.322265625,
	"learning_rate": 0.00011861471861471862,
	"loss": 1.7819,
	"step": 476
	},
	{
	"epoch": 0.61,
	"grad_norm": 0.271484375,
	"learning_rate": 0.0001183982683982684,
	"loss": 1.7574,
	"step": 477
	},
	{
	"epoch": 0.61,
	"grad_norm": 0.330078125,
	"learning_rate": 0.0001181818181818182,
	"loss": 1.7031,
	"step": 478
	},
	{
	"epoch": 0.61,
	"grad_norm": 0.30859375,
	"learning_rate": 0.00011796536796536797,
	"loss": 1.7361,
	"step": 479
	},
	{
	"epoch": 0.61,
	"grad_norm": 0.333984375,
	"learning_rate": 0.00011774891774891777,
	"loss": 1.7638,
	"step": 480
	},
	{
	"epoch": 0.62,
	"grad_norm": 0.34375,
	"learning_rate": 0.00011753246753246753,
	"loss": 1.7476,
	"step": 481
	},
	{
	"epoch": 0.62,
	"grad_norm": 0.333984375,
	"learning_rate": 0.00011731601731601731,
	"loss": 1.6884,
	"step": 482
	},
	{
	"epoch": 0.62,
	"grad_norm": 0.296875,
	"learning_rate": 0.0001170995670995671,
	"loss": 1.6966,
	"step": 483
	},
	{
	"epoch": 0.62,
	"grad_norm": 0.306640625,
	"learning_rate": 0.00011688311688311689,
	"loss": 1.7041,
	"step": 484
	},
	{
	"epoch": 0.62,
	"grad_norm": 0.33203125,
	"learning_rate": 0.00011666666666666668,
	"loss": 1.7788,
	"step": 485
	},
	{
	"epoch": 0.62,
	"grad_norm": 0.328125,
	"learning_rate": 0.00011645021645021646,
	"loss": 1.7744,
	"step": 486
	},
	{
	"epoch": 0.62,
	"grad_norm": 0.423828125,
	"learning_rate": 0.00011623376623376625,
	"loss": 1.7492,
	"step": 487
	},
	{
	"epoch": 0.62,
	"grad_norm": 0.318359375,
	"learning_rate": 0.00011601731601731602,
	"loss": 1.7537,
	"step": 488
	},
	{
	"epoch": 0.63,
	"grad_norm": 0.337890625,
	"learning_rate": 0.0001158008658008658,
	"loss": 1.7363,
	"step": 489
	},
	{
	"epoch": 0.63,
	"grad_norm": 0.349609375,
	"learning_rate": 0.00011558441558441559,
	"loss": 1.7846,
	"step": 490
	},
	{
	"epoch": 0.63,
	"grad_norm": 0.310546875,
	"learning_rate": 0.00011536796536796537,
	"loss": 1.7081,
	"step": 491
	},
	{
	"epoch": 0.63,
	"grad_norm": 0.33203125,
	"learning_rate": 0.00011515151515151516,
	"loss": 1.7275,
	"step": 492
	},
	{
	"epoch": 0.63,
	"grad_norm": 0.33203125,
	"learning_rate": 0.00011493506493506494,
	"loss": 1.7196,
	"step": 493
	},
	{
	"epoch": 0.63,
	"grad_norm": 0.30078125,
	"learning_rate": 0.00011471861471861471,
	"loss": 1.6845,
	"step": 494
	},
	{
	"epoch": 0.63,
	"grad_norm": 0.337890625,
	"learning_rate": 0.00011450216450216452,
	"loss": 1.7443,
	"step": 495
	},
	{
	"epoch": 0.63,
	"grad_norm": 0.3359375,
	"learning_rate": 0.00011428571428571428,
	"loss": 1.7104,
	"step": 496
	},
	{
	"epoch": 0.64,
	"grad_norm": 0.380859375,
	"learning_rate": 0.00011406926406926408,
	"loss": 1.7307,
	"step": 497
	},
	{
	"epoch": 0.64,
	"grad_norm": 0.361328125,
	"learning_rate": 0.00011385281385281386,
	"loss": 1.8338,
	"step": 498
	},
	{
	"epoch": 0.64,
	"grad_norm": 0.296875,
	"learning_rate": 0.00011363636363636365,
	"loss": 1.7235,
	"step": 499
	},
	{
	"epoch": 0.64,
	"grad_norm": 0.291015625,
	"learning_rate": 0.00011341991341991343,
	"loss": 1.8493,
	"step": 500
	},
	{
	"epoch": 0.64,
	"eval_loss": 1.7474199533462524,
	"eval_runtime": 125.2746,
	"eval_samples_per_second": 39.912,
	"eval_steps_per_second": 1.253,
	"step": 500
	},
	{
	"epoch": 0.64,
	"grad_norm": 0.33203125,
	"learning_rate": 0.0001132034632034632,
	"loss": 1.7369,
	"step": 501
	},
	{
	"epoch": 0.64,
	"grad_norm": 0.322265625,
	"learning_rate": 0.000112987012987013,
	"loss": 1.7215,
	"step": 502
	},
	{
	"epoch": 0.64,
	"grad_norm": 0.36328125,
	"learning_rate": 0.00011277056277056277,
	"loss": 1.7981,
	"step": 503
	},
	{
	"epoch": 0.64,
	"grad_norm": 0.318359375,
	"learning_rate": 0.00011255411255411256,
	"loss": 1.6949,
	"step": 504
	},
	{
	"epoch": 0.65,
	"grad_norm": 0.38671875,
	"learning_rate": 0.00011233766233766234,
	"loss": 1.7577,
	"step": 505
	},
	{
	"epoch": 0.65,
	"grad_norm": 0.52734375,
	"learning_rate": 0.00011212121212121212,
	"loss": 1.7396,
	"step": 506
	},
	{
	"epoch": 0.65,
	"grad_norm": 0.3359375,
	"learning_rate": 0.00011190476190476191,
	"loss": 1.7094,
	"step": 507
	},
	{
	"epoch": 0.65,
	"grad_norm": 0.314453125,
	"learning_rate": 0.00011168831168831168,
	"loss": 1.7354,
	"step": 508
	},
	{
	"epoch": 0.65,
	"grad_norm": 0.37109375,
	"learning_rate": 0.00011147186147186149,
	"loss": 1.7109,
	"step": 509
	},
	{
	"epoch": 0.65,
	"grad_norm": 0.4140625,
	"learning_rate": 0.00011125541125541125,
	"loss": 1.7809,
	"step": 510
	},
	{
	"epoch": 0.65,
	"grad_norm": 0.455078125,
	"learning_rate": 0.00011103896103896105,
	"loss": 1.7939,
	"step": 511
	},
	{
	"epoch": 0.66,
	"grad_norm": 0.291015625,
	"learning_rate": 0.00011082251082251083,
	"loss": 1.7942,
	"step": 512
	},
	{
	"epoch": 0.66,
	"grad_norm": 0.609375,
	"learning_rate": 0.00011060606060606061,
	"loss": 1.7189,
	"step": 513
	},
	{
	"epoch": 0.66,
	"grad_norm": 0.67578125,
	"learning_rate": 0.0001103896103896104,
	"loss": 1.7662,
	"step": 514
	},
	{
	"epoch": 0.66,
	"grad_norm": 0.353515625,
	"learning_rate": 0.00011017316017316017,
	"loss": 1.7288,
	"step": 515
	},
	{
	"epoch": 0.66,
	"grad_norm": 0.322265625,
	"learning_rate": 0.00010995670995670997,
	"loss": 1.715,
	"step": 516
	},
	{
	"epoch": 0.66,
	"grad_norm": 0.40625,
	"learning_rate": 0.00010974025974025974,
	"loss": 1.6725,
	"step": 517
	},
	{
	"epoch": 0.66,
	"grad_norm": 0.52734375,
	"learning_rate": 0.00010952380952380953,
	"loss": 1.7752,
	"step": 518
	},
	{
	"epoch": 0.66,
	"grad_norm": 0.48828125,
	"learning_rate": 0.00010930735930735931,
	"loss": 1.7345,
	"step": 519
	},
	{
	"epoch": 0.67,
	"grad_norm": 0.3203125,
	"learning_rate": 0.00010909090909090909,
	"loss": 1.7054,
	"step": 520
	},
	{
	"epoch": 0.67,
	"grad_norm": 0.45703125,
	"learning_rate": 0.00010887445887445889,
	"loss": 1.8508,
	"step": 521
	},
	{
	"epoch": 0.67,
	"grad_norm": 0.50390625,
	"learning_rate": 0.00010865800865800865,
	"loss": 1.723,
	"step": 522
	},
	{
	"epoch": 0.67,
	"grad_norm": 0.361328125,
	"learning_rate": 0.00010844155844155846,
	"loss": 1.7008,
	"step": 523
	},
	{
	"epoch": 0.67,
	"grad_norm": 0.330078125,
	"learning_rate": 0.00010822510822510823,
	"loss": 1.6947,
	"step": 524
	},
	{
	"epoch": 0.67,
	"grad_norm": 0.439453125,
	"learning_rate": 0.000108008658008658,
	"loss": 1.8002,
	"step": 525
	},
	{
	"epoch": 0.67,
	"grad_norm": 0.419921875,
	"learning_rate": 0.0001077922077922078,
	"loss": 1.7645,
	"step": 526
	},
	{
	"epoch": 0.67,
	"grad_norm": 0.392578125,
	"learning_rate": 0.00010757575757575758,
	"loss": 1.7568,
	"step": 527
	},
	{
	"epoch": 0.68,
	"grad_norm": 0.34375,
	"learning_rate": 0.00010735930735930737,
	"loss": 1.7684,
	"step": 528
	},
	{
	"epoch": 0.68,
	"grad_norm": 0.337890625,
	"learning_rate": 0.00010714285714285715,
	"loss": 1.7795,
	"step": 529
	},
	{
	"epoch": 0.68,
	"grad_norm": 0.375,
	"learning_rate": 0.00010692640692640694,
	"loss": 1.7471,
	"step": 530
	},
	{
	"epoch": 0.68,
	"grad_norm": 0.369140625,
	"learning_rate": 0.00010670995670995671,
	"loss": 1.8058,
	"step": 531
	},
	{
	"epoch": 0.68,
	"grad_norm": 0.326171875,
	"learning_rate": 0.00010649350649350649,
	"loss": 1.673,
	"step": 532
	},
	{
	"epoch": 0.68,
	"grad_norm": 0.365234375,
	"learning_rate": 0.00010627705627705628,
	"loss": 1.728,
	"step": 533
	},
	{
	"epoch": 0.68,
	"grad_norm": 0.423828125,
	"learning_rate": 0.00010606060606060606,
	"loss": 1.6786,
	"step": 534
	},
	{
	"epoch": 0.68,
	"grad_norm": 0.37890625,
	"learning_rate": 0.00010584415584415586,
	"loss": 1.733,
	"step": 535
	},
	{
	"epoch": 0.69,
	"grad_norm": 0.34375,
	"learning_rate": 0.00010562770562770564,
	"loss": 1.7435,
	"step": 536
	},
	{
	"epoch": 0.69,
	"grad_norm": 0.3125,
	"learning_rate": 0.00010541125541125543,
	"loss": 1.7421,
	"step": 537
	},
	{
	"epoch": 0.69,
	"grad_norm": 0.376953125,
	"learning_rate": 0.0001051948051948052,
	"loss": 1.8177,
	"step": 538
	},
	{
	"epoch": 0.69,
	"grad_norm": 0.390625,
	"learning_rate": 0.00010497835497835498,
	"loss": 1.853,
	"step": 539
	},
	{
	"epoch": 0.69,
	"grad_norm": 0.384765625,
	"learning_rate": 0.00010476190476190477,
	"loss": 1.7235,
	"step": 540
	},
	{
	"epoch": 0.69,
	"grad_norm": 0.35546875,
	"learning_rate": 0.00010454545454545455,
	"loss": 1.6917,
	"step": 541
	},
	{
	"epoch": 0.69,
	"grad_norm": 0.314453125,
	"learning_rate": 0.00010432900432900434,
	"loss": 1.7533,
	"step": 542
	},
	{
	"epoch": 0.69,
	"grad_norm": 0.390625,
	"learning_rate": 0.00010411255411255412,
	"loss": 1.699,
	"step": 543
	},
	{
	"epoch": 0.7,
	"grad_norm": 0.32421875,
	"learning_rate": 0.00010389610389610389,
	"loss": 1.7324,
	"step": 544
	},
	{
	"epoch": 0.7,
	"grad_norm": 0.3515625,
	"learning_rate": 0.00010367965367965368,
	"loss": 1.7698,
	"step": 545
	},
	{
	"epoch": 0.7,
	"grad_norm": 0.294921875,
	"learning_rate": 0.00010346320346320346,
	"loss": 1.7495,
	"step": 546
	},
	{
	"epoch": 0.7,
	"grad_norm": 0.318359375,
	"learning_rate": 0.00010324675324675325,
	"loss": 1.7885,
	"step": 547
	},
	{
	"epoch": 0.7,
	"grad_norm": 0.310546875,
	"learning_rate": 0.00010303030303030303,
	"loss": 1.7624,
	"step": 548
	},
	{
	"epoch": 0.7,
	"grad_norm": 0.306640625,
	"learning_rate": 0.00010281385281385283,
	"loss": 1.7373,
	"step": 549
	},
	{
	"epoch": 0.7,
	"grad_norm": 0.353515625,
	"learning_rate": 0.00010259740259740261,
	"loss": 1.7663,
	"step": 550
	},
	{
	"epoch": 0.7,
	"eval_loss": 1.7420332431793213,
	"eval_runtime": 125.4197,
	"eval_samples_per_second": 39.866,
	"eval_steps_per_second": 1.252,
	"step": 550
	},
	{
	"epoch": 0.71,
	"grad_norm": 0.326171875,
	"learning_rate": 0.00010238095238095237,
	"loss": 1.8104,
	"step": 551
	},
	{
	"epoch": 0.71,
	"grad_norm": 0.34375,
	"learning_rate": 0.00010216450216450218,
	"loss": 1.7708,
	"step": 552
	},
	{
	"epoch": 0.71,
	"grad_norm": 0.3984375,
	"learning_rate": 0.00010194805194805195,
	"loss": 1.7057,
	"step": 553
	},
	{
	"epoch": 0.71,
	"grad_norm": 0.328125,
	"learning_rate": 0.00010173160173160174,
	"loss": 1.718,
	"step": 554
	},
	{
	"epoch": 0.71,
	"grad_norm": 0.3671875,
	"learning_rate": 0.00010151515151515152,
	"loss": 1.6913,
	"step": 555
	},
	{
	"epoch": 0.71,
	"grad_norm": 0.5078125,
	"learning_rate": 0.0001012987012987013,
	"loss": 1.7444,
	"step": 556
	},
	{
	"epoch": 0.71,
	"grad_norm": 0.357421875,
	"learning_rate": 0.00010108225108225109,
	"loss": 1.7699,
	"step": 557
	},
	{
	"epoch": 0.71,
	"grad_norm": 0.423828125,
	"learning_rate": 0.00010086580086580086,
	"loss": 1.8066,
	"step": 558
	},
	{
	"epoch": 0.72,
	"grad_norm": 0.333984375,
	"learning_rate": 0.00010064935064935067,
	"loss": 1.7054,
	"step": 559
	},
	{
	"epoch": 0.72,
	"grad_norm": 0.41796875,
	"learning_rate": 0.00010043290043290043,
	"loss": 1.7562,
	"step": 560
	},
	{
	"epoch": 0.72,
	"grad_norm": 0.41015625,
	"learning_rate": 0.00010021645021645023,
	"loss": 1.6979,
	"step": 561
	},
	{
	"epoch": 0.72,
	"grad_norm": 0.34765625,
	"learning_rate": 0.0001,
	"loss": 1.8067,
	"step": 562
	},
	{
	"epoch": 0.72,
	"grad_norm": 0.31640625,
	"learning_rate": 9.978354978354978e-05,
	"loss": 1.7718,
	"step": 563
	},
	{
	"epoch": 0.72,
	"grad_norm": 0.36328125,
	"learning_rate": 9.956709956709958e-05,
	"loss": 1.8256,
	"step": 564
	},
	{
	"epoch": 0.72,
	"grad_norm": 0.33984375,
	"learning_rate": 9.935064935064936e-05,
	"loss": 1.7328,
	"step": 565
	},
	{
	"epoch": 0.72,
	"grad_norm": 0.361328125,
	"learning_rate": 9.913419913419914e-05,
	"loss": 1.6943,
	"step": 566
	},
	{
	"epoch": 0.73,
	"grad_norm": 0.380859375,
	"learning_rate": 9.891774891774892e-05,
	"loss": 1.7398,
	"step": 567
	},
	{
	"epoch": 0.73,
	"grad_norm": 0.39453125,
	"learning_rate": 9.870129870129871e-05,
	"loss": 1.6486,
	"step": 568
	},
	{
	"epoch": 0.73,
	"grad_norm": 0.388671875,
	"learning_rate": 9.848484848484849e-05,
	"loss": 1.7058,
	"step": 569
	},
	{
	"epoch": 0.73,
	"grad_norm": 0.30859375,
	"learning_rate": 9.826839826839827e-05,
	"loss": 1.7299,
	"step": 570
	},
	{
	"epoch": 0.73,
	"grad_norm": 0.30078125,
	"learning_rate": 9.805194805194806e-05,
	"loss": 1.7634,
	"step": 571
	},
	{
	"epoch": 0.73,
	"grad_norm": 0.34375,
	"learning_rate": 9.783549783549783e-05,
	"loss": 1.7341,
	"step": 572
	},
	{
	"epoch": 0.73,
	"grad_norm": 0.337890625,
	"learning_rate": 9.761904761904762e-05,
	"loss": 1.7888,
	"step": 573
	},
	{
	"epoch": 0.73,
	"grad_norm": 0.34375,
	"learning_rate": 9.74025974025974e-05,
	"loss": 1.7765,
	"step": 574
	},
	{
	"epoch": 0.74,
	"grad_norm": 0.55078125,
	"learning_rate": 9.71861471861472e-05,
	"loss": 1.7764,
	"step": 575
	},
	{
	"epoch": 0.74,
	"grad_norm": 0.314453125,
	"learning_rate": 9.696969696969698e-05,
	"loss": 1.745,
	"step": 576
	},
	{
	"epoch": 0.74,
	"grad_norm": 0.357421875,
	"learning_rate": 9.675324675324677e-05,
	"loss": 1.7382,
	"step": 577
	},
	{
	"epoch": 0.74,
	"grad_norm": 0.384765625,
	"learning_rate": 9.653679653679654e-05,
	"loss": 1.7118,
	"step": 578
	},
	{
	"epoch": 0.74,
	"grad_norm": 0.380859375,
	"learning_rate": 9.632034632034633e-05,
	"loss": 1.6819,
	"step": 579
	},
	{
	"epoch": 0.74,
	"grad_norm": 0.326171875,
	"learning_rate": 9.610389610389611e-05,
	"loss": 1.8091,
	"step": 580
	},
	{
	"epoch": 0.74,
	"grad_norm": 0.380859375,
	"learning_rate": 9.588744588744589e-05,
	"loss": 1.796,
	"step": 581
	},
	{
	"epoch": 0.74,
	"grad_norm": 0.380859375,
	"learning_rate": 9.567099567099568e-05,
	"loss": 1.7027,
	"step": 582
	},
	{
	"epoch": 0.75,
	"grad_norm": 0.43359375,
	"learning_rate": 9.545454545454546e-05,
	"loss": 1.736,
	"step": 583
	},
	{
	"epoch": 0.75,
	"grad_norm": 0.337890625,
	"learning_rate": 9.523809523809524e-05,
	"loss": 1.738,
	"step": 584
	},
	{
	"epoch": 0.75,
	"grad_norm": 0.369140625,
	"learning_rate": 9.502164502164502e-05,
	"loss": 1.7425,
	"step": 585
	},
	{
	"epoch": 0.75,
	"grad_norm": 0.333984375,
	"learning_rate": 9.480519480519481e-05,
	"loss": 1.6909,
	"step": 586
	},
	{
	"epoch": 0.75,
	"grad_norm": 0.50390625,
	"learning_rate": 9.45887445887446e-05,
	"loss": 1.7845,
	"step": 587
	},
	{
	"epoch": 0.75,
	"grad_norm": 0.44140625,
	"learning_rate": 9.437229437229437e-05,
	"loss": 1.7345,
	"step": 588
	},
	{
	"epoch": 0.75,
	"grad_norm": 0.283203125,
	"learning_rate": 9.415584415584417e-05,
	"loss": 1.7717,
	"step": 589
	},
	{
	"epoch": 0.75,
	"grad_norm": 0.44921875,
	"learning_rate": 9.393939393939395e-05,
	"loss": 1.7243,
	"step": 590
	},
	{
	"epoch": 0.76,
	"grad_norm": 0.431640625,
	"learning_rate": 9.372294372294373e-05,
	"loss": 1.7662,
	"step": 591
	},
	{
	"epoch": 0.76,
	"grad_norm": 0.40625,
	"learning_rate": 9.35064935064935e-05,
	"loss": 1.7209,
	"step": 592
	},
	{
	"epoch": 0.76,
	"grad_norm": 0.3828125,
	"learning_rate": 9.32900432900433e-05,
	"loss": 1.6923,
	"step": 593
	},
	{
	"epoch": 0.76,
	"grad_norm": 0.34375,
	"learning_rate": 9.307359307359308e-05,
	"loss": 1.8138,
	"step": 594
	},
	{
	"epoch": 0.76,
	"grad_norm": 0.3515625,
	"learning_rate": 9.285714285714286e-05,
	"loss": 1.7474,
	"step": 595
	},
	{
	"epoch": 0.76,
	"grad_norm": 0.421875,
	"learning_rate": 9.264069264069265e-05,
	"loss": 1.776,
	"step": 596
	},
	{
	"epoch": 0.76,
	"grad_norm": 0.361328125,
	"learning_rate": 9.242424242424242e-05,
	"loss": 1.7624,
	"step": 597
	},
	{
	"epoch": 0.77,
	"grad_norm": 0.341796875,
	"learning_rate": 9.220779220779221e-05,
	"loss": 1.733,
	"step": 598
	},
	{
	"epoch": 0.77,
	"grad_norm": 0.326171875,
	"learning_rate": 9.199134199134199e-05,
	"loss": 1.7617,
	"step": 599
	},
	{
	"epoch": 0.77,
	"grad_norm": 0.326171875,
	"learning_rate": 9.177489177489178e-05,
	"loss": 1.6983,
	"step": 600
	},
	{
	"epoch": 0.77,
	"eval_loss": 1.7379374504089355,
	"eval_runtime": 124.9706,
	"eval_samples_per_second": 40.009,
	"eval_steps_per_second": 1.256,
	"step": 600
	},
	{
	"epoch": 0.77,
	"grad_norm": 0.34375,
	"learning_rate": 9.155844155844156e-05,
	"loss": 1.7099,
	"step": 601
	},
	{
	"epoch": 0.77,
	"grad_norm": 0.32421875,
	"learning_rate": 9.134199134199136e-05,
	"loss": 1.671,
	"step": 602
	},
	{
	"epoch": 0.77,
	"grad_norm": 0.302734375,
	"learning_rate": 9.112554112554112e-05,
	"loss": 1.7249,
	"step": 603
	},
	{
	"epoch": 0.77,
	"grad_norm": 0.318359375,
	"learning_rate": 9.090909090909092e-05,
	"loss": 1.7958,
	"step": 604
	},
	{
	"epoch": 0.77,
	"grad_norm": 0.37109375,
	"learning_rate": 9.06926406926407e-05,
	"loss": 1.7202,
	"step": 605
	},
	{
	"epoch": 0.78,
	"grad_norm": 0.33203125,
	"learning_rate": 9.047619047619048e-05,
	"loss": 1.7261,
	"step": 606
	},
	{
	"epoch": 0.78,
	"grad_norm": 0.32421875,
	"learning_rate": 9.025974025974027e-05,
	"loss": 1.8016,
	"step": 607
	},
	{
	"epoch": 0.78,
	"grad_norm": 0.310546875,
	"learning_rate": 9.004329004329005e-05,
	"loss": 1.7957,
	"step": 608
	},
	{
	"epoch": 0.78,
	"grad_norm": 0.3125,
	"learning_rate": 8.982683982683983e-05,
	"loss": 1.7856,
	"step": 609
	},
	{
	"epoch": 0.78,
	"grad_norm": 0.341796875,
	"learning_rate": 8.961038961038961e-05,
	"loss": 1.7807,
	"step": 610
	},
	{
	"epoch": 0.78,
	"grad_norm": 0.341796875,
	"learning_rate": 8.93939393939394e-05,
	"loss": 1.7536,
	"step": 611
	},
	{
	"epoch": 0.78,
	"grad_norm": 0.3125,
	"learning_rate": 8.917748917748918e-05,
	"loss": 1.7499,
	"step": 612
	},
	{
	"epoch": 0.78,
	"grad_norm": 0.365234375,
	"learning_rate": 8.896103896103896e-05,
	"loss": 1.7726,
	"step": 613
	},
	{
	"epoch": 0.79,
	"grad_norm": 0.37109375,
	"learning_rate": 8.874458874458876e-05,
	"loss": 1.7558,
	"step": 614
	},
	{
	"epoch": 0.79,
	"grad_norm": 0.298828125,
	"learning_rate": 8.852813852813854e-05,
	"loss": 1.726,
	"step": 615
	},
	{
	"epoch": 0.79,
	"grad_norm": 0.330078125,
	"learning_rate": 8.831168831168831e-05,
	"loss": 1.7632,
	"step": 616
	},
	{
	"epoch": 0.79,
	"grad_norm": 0.361328125,
	"learning_rate": 8.80952380952381e-05,
	"loss": 1.7293,
	"step": 617
	},
	{
	"epoch": 0.79,
	"grad_norm": 0.337890625,
	"learning_rate": 8.787878787878789e-05,
	"loss": 1.7263,
	"step": 618
	},
	{
	"epoch": 0.79,
	"grad_norm": 0.35546875,
	"learning_rate": 8.766233766233767e-05,
	"loss": 1.8403,
	"step": 619
	},
	{
	"epoch": 0.79,
	"grad_norm": 0.33984375,
	"learning_rate": 8.744588744588745e-05,
	"loss": 1.6896,
	"step": 620
	},
	{
	"epoch": 0.79,
	"grad_norm": 0.328125,
	"learning_rate": 8.722943722943724e-05,
	"loss": 1.797,
	"step": 621
	},
	{
	"epoch": 0.8,
	"grad_norm": 0.439453125,
	"learning_rate": 8.701298701298701e-05,
	"loss": 1.7971,
	"step": 622
	},
	{
	"epoch": 0.8,
	"grad_norm": 0.39453125,
	"learning_rate": 8.67965367965368e-05,
	"loss": 1.7697,
	"step": 623
	},
	{
	"epoch": 0.8,
	"grad_norm": 0.326171875,
	"learning_rate": 8.658008658008658e-05,
	"loss": 1.7567,
	"step": 624
	},
	{
	"epoch": 0.8,
	"grad_norm": 0.3203125,
	"learning_rate": 8.636363636363637e-05,
	"loss": 1.659,
	"step": 625
	},
	{
	"epoch": 0.8,
	"grad_norm": 0.3671875,
	"learning_rate": 8.614718614718615e-05,
	"loss": 1.7546,
	"step": 626
	},
	{
	"epoch": 0.8,
	"grad_norm": 0.32421875,
	"learning_rate": 8.593073593073593e-05,
	"loss": 1.6824,
	"step": 627
	},
	{
	"epoch": 0.8,
	"grad_norm": 0.42578125,
	"learning_rate": 8.571428571428571e-05,
	"loss": 1.7049,
	"step": 628
	},
	{
	"epoch": 0.8,
	"grad_norm": 0.3125,
	"learning_rate": 8.549783549783549e-05,
	"loss": 1.6639,
	"step": 629
	},
	{
	"epoch": 0.81,
	"grad_norm": 0.337890625,
	"learning_rate": 8.528138528138529e-05,
	"loss": 1.7667,
	"step": 630
	},
	{
	"epoch": 0.81,
	"grad_norm": 0.326171875,
	"learning_rate": 8.506493506493507e-05,
	"loss": 1.7703,
	"step": 631
	},
	{
	"epoch": 0.81,
	"grad_norm": 0.388671875,
	"learning_rate": 8.484848484848486e-05,
	"loss": 1.7526,
	"step": 632
	},
	{
	"epoch": 0.81,
	"grad_norm": 0.376953125,
	"learning_rate": 8.463203463203464e-05,
	"loss": 1.7658,
	"step": 633
	},
	{
	"epoch": 0.81,
	"grad_norm": 0.318359375,
	"learning_rate": 8.441558441558442e-05,
	"loss": 1.7888,
	"step": 634
	},
	{
	"epoch": 0.81,
	"grad_norm": 0.3125,
	"learning_rate": 8.41991341991342e-05,
	"loss": 1.7197,
	"step": 635
	},
	{
	"epoch": 0.81,
	"grad_norm": 0.328125,
	"learning_rate": 8.398268398268399e-05,
	"loss": 1.6745,
	"step": 636
	},
	{
	"epoch": 0.82,
	"grad_norm": 0.369140625,
	"learning_rate": 8.376623376623377e-05,
	"loss": 1.7168,
	"step": 637
	},
	{
	"epoch": 0.82,
	"grad_norm": 0.33984375,
	"learning_rate": 8.354978354978355e-05,
	"loss": 1.6904,
	"step": 638
	},
	{
	"epoch": 0.82,
	"grad_norm": 0.396484375,
	"learning_rate": 8.333333333333334e-05,
	"loss": 1.6792,
	"step": 639
	},
	{
	"epoch": 0.82,
	"grad_norm": 0.34375,
	"learning_rate": 8.311688311688312e-05,
	"loss": 1.7299,
	"step": 640
	},
	{
	"epoch": 0.82,
	"grad_norm": 0.365234375,
	"learning_rate": 8.29004329004329e-05,
	"loss": 1.7334,
	"step": 641
	},
	{
	"epoch": 0.82,
	"grad_norm": 0.32421875,
	"learning_rate": 8.268398268398268e-05,
	"loss": 1.7236,
	"step": 642
	},
	{
	"epoch": 0.82,
	"grad_norm": 0.408203125,
	"learning_rate": 8.246753246753248e-05,
	"loss": 1.7398,
	"step": 643
	},
	{
	"epoch": 0.82,
	"grad_norm": 0.337890625,
	"learning_rate": 8.225108225108226e-05,
	"loss": 1.7407,
	"step": 644
	},
	{
	"epoch": 0.83,
	"grad_norm": 0.34375,
	"learning_rate": 8.203463203463204e-05,
	"loss": 1.8405,
	"step": 645
	},
	{
	"epoch": 0.83,
	"grad_norm": 0.306640625,
	"learning_rate": 8.181818181818183e-05,
	"loss": 1.7225,
	"step": 646
	},
	{
	"epoch": 0.83,
	"grad_norm": 0.318359375,
	"learning_rate": 8.16017316017316e-05,
	"loss": 1.8285,
	"step": 647
	},
	{
	"epoch": 0.83,
	"grad_norm": 0.4453125,
	"learning_rate": 8.138528138528139e-05,
	"loss": 1.7646,
	"step": 648
	},
	{
	"epoch": 0.83,
	"grad_norm": 0.3203125,
	"learning_rate": 8.116883116883117e-05,
	"loss": 1.7172,
	"step": 649
	},
	{
	"epoch": 0.83,
	"grad_norm": 0.39453125,
	"learning_rate": 8.095238095238096e-05,
	"loss": 1.8033,
	"step": 650
	},
	{
	"epoch": 0.83,
	"eval_loss": 1.7356816530227661,
	"eval_runtime": 125.4294,
	"eval_samples_per_second": 39.863,
	"eval_steps_per_second": 1.252,
	"step": 650
	},
	{
	"epoch": 0.83,
	"grad_norm": 0.30859375,
	"learning_rate": 8.073593073593074e-05,
	"loss": 1.7494,
	"step": 651
	},
	{
	"epoch": 0.83,
	"grad_norm": 0.40234375,
	"learning_rate": 8.051948051948052e-05,
	"loss": 1.7573,
	"step": 652
	},
	{
	"epoch": 0.84,
	"grad_norm": 0.46484375,
	"learning_rate": 8.03030303030303e-05,
	"loss": 1.7712,
	"step": 653
	},
	{
	"epoch": 0.84,
	"grad_norm": 0.40234375,
	"learning_rate": 8.008658008658008e-05,
	"loss": 1.6894,
	"step": 654
	},
	{
	"epoch": 0.84,
	"grad_norm": 0.361328125,
	"learning_rate": 7.987012987012987e-05,
	"loss": 1.6714,
	"step": 655
	},
	{
	"epoch": 0.84,
	"grad_norm": 0.33984375,
	"learning_rate": 7.965367965367965e-05,
	"loss": 1.801,
	"step": 656
	},
	{
	"epoch": 0.84,
	"grad_norm": 0.41015625,
	"learning_rate": 7.943722943722945e-05,
	"loss": 1.75,
	"step": 657
	},
	{
	"epoch": 0.84,
	"grad_norm": 0.421875,
	"learning_rate": 7.922077922077923e-05,
	"loss": 1.6875,
	"step": 658
	},
	{
	"epoch": 0.84,
	"grad_norm": 0.421875,
	"learning_rate": 7.900432900432901e-05,
	"loss": 1.6635,
	"step": 659
	},
	{
	"epoch": 0.84,
	"grad_norm": 0.302734375,
	"learning_rate": 7.878787878787879e-05,
	"loss": 1.782,
	"step": 660
	},
	{
	"epoch": 0.85,
	"grad_norm": 0.3671875,
	"learning_rate": 7.857142857142858e-05,
	"loss": 1.6838,
	"step": 661
	},
	{
	"epoch": 0.85,
	"grad_norm": 0.341796875,
	"learning_rate": 7.835497835497836e-05,
	"loss": 1.6957,
	"step": 662
	},
	{
	"epoch": 0.85,
	"grad_norm": 0.3359375,
	"learning_rate": 7.813852813852814e-05,
	"loss": 1.7366,
	"step": 663
	},
	{
	"epoch": 0.85,
	"grad_norm": 0.404296875,
	"learning_rate": 7.792207792207793e-05,
	"loss": 1.6709,
	"step": 664
	},
	{
	"epoch": 0.85,
	"grad_norm": 0.373046875,
	"learning_rate": 7.770562770562771e-05,
	"loss": 1.7653,
	"step": 665
	},
	{
	"epoch": 0.85,
	"grad_norm": 0.375,
	"learning_rate": 7.748917748917749e-05,
	"loss": 1.7467,
	"step": 666
	},
	{
	"epoch": 0.85,
	"grad_norm": 0.3046875,
	"learning_rate": 7.727272727272727e-05,
	"loss": 1.7499,
	"step": 667
	},
	{
	"epoch": 0.85,
	"grad_norm": 0.337890625,
	"learning_rate": 7.705627705627707e-05,
	"loss": 1.7462,
	"step": 668
	},
	{
	"epoch": 0.86,
	"grad_norm": 0.3828125,
	"learning_rate": 7.683982683982685e-05,
	"loss": 1.6883,
	"step": 669
	},
	{
	"epoch": 0.86,
	"grad_norm": 0.337890625,
	"learning_rate": 7.662337662337662e-05,
	"loss": 1.7674,
	"step": 670
	},
	{
	"epoch": 0.86,
	"grad_norm": 0.37890625,
	"learning_rate": 7.640692640692642e-05,
	"loss": 1.7388,
	"step": 671
	},
	{
	"epoch": 0.86,
	"grad_norm": 0.33203125,
	"learning_rate": 7.619047619047618e-05,
	"loss": 1.7206,
	"step": 672
	},
	{
	"epoch": 0.86,
	"grad_norm": 0.341796875,
	"learning_rate": 7.597402597402598e-05,
	"loss": 1.749,
	"step": 673
	},
	{
	"epoch": 0.86,
	"grad_norm": 0.361328125,
	"learning_rate": 7.575757575757576e-05,
	"loss": 1.7318,
	"step": 674
	},
	{
	"epoch": 0.86,
	"grad_norm": 0.333984375,
	"learning_rate": 7.554112554112555e-05,
	"loss": 1.7234,
	"step": 675
	},
	{
	"epoch": 0.87,
	"grad_norm": 0.3359375,
	"learning_rate": 7.532467532467533e-05,
	"loss": 1.6407,
	"step": 676
	},
	{
	"epoch": 0.87,
	"grad_norm": 0.431640625,
	"learning_rate": 7.510822510822511e-05,
	"loss": 1.7951,
	"step": 677
	},
	{
	"epoch": 0.87,
	"grad_norm": 0.49609375,
	"learning_rate": 7.489177489177489e-05,
	"loss": 1.7958,
	"step": 678
	},
	{
	"epoch": 0.87,
	"grad_norm": 0.345703125,
	"learning_rate": 7.467532467532467e-05,
	"loss": 1.745,
	"step": 679
	},
	{
	"epoch": 0.87,
	"grad_norm": 0.396484375,
	"learning_rate": 7.445887445887446e-05,
	"loss": 1.6285,
	"step": 680
	},
	{
	"epoch": 0.87,
	"grad_norm": 0.390625,
	"learning_rate": 7.424242424242424e-05,
	"loss": 1.792,
	"step": 681
	},
	{
	"epoch": 0.87,
	"grad_norm": 0.466796875,
	"learning_rate": 7.402597402597404e-05,
	"loss": 1.8207,
	"step": 682
	},
	{
	"epoch": 0.87,
	"grad_norm": 0.4609375,
	"learning_rate": 7.380952380952382e-05,
	"loss": 1.7013,
	"step": 683
	},
	{
	"epoch": 0.88,
	"grad_norm": 0.349609375,
	"learning_rate": 7.35930735930736e-05,
	"loss": 1.759,
	"step": 684
	},
	{
	"epoch": 0.88,
	"grad_norm": 0.3671875,
	"learning_rate": 7.337662337662338e-05,
	"loss": 1.7398,
	"step": 685
	},
	{
	"epoch": 0.88,
	"grad_norm": 0.318359375,
	"learning_rate": 7.316017316017317e-05,
	"loss": 1.6995,
	"step": 686
	},
	{
	"epoch": 0.88,
	"grad_norm": 0.4296875,
	"learning_rate": 7.294372294372295e-05,
	"loss": 1.7617,
	"step": 687
	},
	{
	"epoch": 0.88,
	"grad_norm": 0.431640625,
	"learning_rate": 7.272727272727273e-05,
	"loss": 1.6959,
	"step": 688
	},
	{
	"epoch": 0.88,
	"grad_norm": 0.322265625,
	"learning_rate": 7.251082251082252e-05,
	"loss": 1.7879,
	"step": 689
	},
	{
	"epoch": 0.88,
	"grad_norm": 0.35546875,
	"learning_rate": 7.229437229437229e-05,
	"loss": 1.6613,
	"step": 690
	},
	{
	"epoch": 0.88,
	"grad_norm": 0.3515625,
	"learning_rate": 7.207792207792208e-05,
	"loss": 1.6485,
	"step": 691
	},
	{
	"epoch": 0.89,
	"grad_norm": 0.400390625,
	"learning_rate": 7.186147186147186e-05,
	"loss": 1.719,
	"step": 692
	},
	{
	"epoch": 0.89,
	"grad_norm": 0.32421875,
	"learning_rate": 7.164502164502165e-05,
	"loss": 1.7316,
	"step": 693
	},
	{
	"epoch": 0.89,
	"grad_norm": 0.33984375,
	"learning_rate": 7.142857142857143e-05,
	"loss": 1.7509,
	"step": 694
	},
	{
	"epoch": 0.89,
	"grad_norm": 0.375,
	"learning_rate": 7.121212121212121e-05,
	"loss": 1.785,
	"step": 695
	},
	{
	"epoch": 0.89,
	"grad_norm": 0.328125,
	"learning_rate": 7.099567099567101e-05,
	"loss": 1.7643,
	"step": 696
	},
	{
	"epoch": 0.89,
	"grad_norm": 0.341796875,
	"learning_rate": 7.077922077922077e-05,
	"loss": 1.7058,
	"step": 697
	},
	{
	"epoch": 0.89,
	"grad_norm": 0.34375,
	"learning_rate": 7.056277056277057e-05,
	"loss": 1.807,
	"step": 698
	},
	{
	"epoch": 0.89,
	"grad_norm": 0.349609375,
	"learning_rate": 7.034632034632035e-05,
	"loss": 1.6489,
	"step": 699
	},
	{
	"epoch": 0.9,
	"grad_norm": 0.359375,
	"learning_rate": 7.012987012987014e-05,
	"loss": 1.8146,
	"step": 700
	},
	{
	"epoch": 0.9,
	"eval_loss": 1.7302906513214111,
	"eval_runtime": 125.8875,
	"eval_samples_per_second": 39.718,
	"eval_steps_per_second": 1.247,
	"step": 700
	},
	{
	"epoch": 0.9,
	"grad_norm": 0.298828125,
	"learning_rate": 6.991341991341992e-05,
	"loss": 1.6505,
	"step": 701
	},
	{
	"epoch": 0.9,
	"grad_norm": 0.39453125,
	"learning_rate": 6.96969696969697e-05,
	"loss": 1.7259,
	"step": 702
	},
	{
	"epoch": 0.9,
	"grad_norm": 0.314453125,
	"learning_rate": 6.948051948051948e-05,
	"loss": 1.81,
	"step": 703
	},
	{
	"epoch": 0.9,
	"grad_norm": 0.3515625,
	"learning_rate": 6.926406926406926e-05,
	"loss": 1.7801,
	"step": 704
	},
	{
	"epoch": 0.9,
	"grad_norm": 0.3359375,
	"learning_rate": 6.904761904761905e-05,
	"loss": 1.7213,
	"step": 705
	},
	{
	"epoch": 0.9,
	"grad_norm": 0.3125,
	"learning_rate": 6.883116883116883e-05,
	"loss": 1.8054,
	"step": 706
	},
	{
	"epoch": 0.9,
	"grad_norm": 0.31640625,
	"learning_rate": 6.861471861471862e-05,
	"loss": 1.7339,
	"step": 707
	},
	{
	"epoch": 0.91,
	"grad_norm": 0.328125,
	"learning_rate": 6.83982683982684e-05,
	"loss": 1.79,
	"step": 708
	},
	{
	"epoch": 0.91,
	"grad_norm": 0.36328125,
	"learning_rate": 6.818181818181818e-05,
	"loss": 1.6615,
	"step": 709
	},
	{
	"epoch": 0.91,
	"grad_norm": 0.306640625,
	"learning_rate": 6.796536796536796e-05,
	"loss": 1.8298,
	"step": 710
	},
	{
	"epoch": 0.91,
	"grad_norm": 0.298828125,
	"learning_rate": 6.774891774891774e-05,
	"loss": 1.6835,
	"step": 711
	},
	{
	"epoch": 0.91,
	"grad_norm": 0.32421875,
	"learning_rate": 6.753246753246754e-05,
	"loss": 1.7739,
	"step": 712
	},
	{
	"epoch": 0.91,
	"grad_norm": 0.337890625,
	"learning_rate": 6.731601731601732e-05,
	"loss": 1.7431,
	"step": 713
	},
	{
	"epoch": 0.91,
	"grad_norm": 0.337890625,
	"learning_rate": 6.709956709956711e-05,
	"loss": 1.7816,
	"step": 714
	},
	{
	"epoch": 0.91,
	"grad_norm": 0.32421875,
	"learning_rate": 6.688311688311688e-05,
	"loss": 1.7516,
	"step": 715
	},
	{
	"epoch": 0.92,
	"grad_norm": 0.341796875,
	"learning_rate": 6.666666666666667e-05,
	"loss": 1.7276,
	"step": 716
	},
	{
	"epoch": 0.92,
	"grad_norm": 0.330078125,
	"learning_rate": 6.645021645021645e-05,
	"loss": 1.7521,
	"step": 717
	},
	{
	"epoch": 0.92,
	"grad_norm": 0.36328125,
	"learning_rate": 6.623376623376624e-05,
	"loss": 1.7368,
	"step": 718
	},
	{
	"epoch": 0.92,
	"grad_norm": 0.3828125,
	"learning_rate": 6.601731601731602e-05,
	"loss": 1.7896,
	"step": 719
	},
	{
	"epoch": 0.92,
	"grad_norm": 0.365234375,
	"learning_rate": 6.58008658008658e-05,
	"loss": 1.8057,
	"step": 720
	},
	{
	"epoch": 0.92,
	"grad_norm": 0.3828125,
	"learning_rate": 6.55844155844156e-05,
	"loss": 1.7596,
	"step": 721
	},
	{
	"epoch": 0.92,
	"grad_norm": 0.40625,
	"learning_rate": 6.536796536796536e-05,
	"loss": 1.7793,
	"step": 722
	},
	{
	"epoch": 0.93,
	"grad_norm": 0.365234375,
	"learning_rate": 6.515151515151516e-05,
	"loss": 1.6966,
	"step": 723
	},
	{
	"epoch": 0.93,
	"grad_norm": 0.32421875,
	"learning_rate": 6.493506493506494e-05,
	"loss": 1.6619,
	"step": 724
	},
	{
	"epoch": 0.93,
	"grad_norm": 0.3359375,
	"learning_rate": 6.471861471861473e-05,
	"loss": 1.7305,
	"step": 725
	},
	{
	"epoch": 0.93,
	"grad_norm": 0.365234375,
	"learning_rate": 6.450216450216451e-05,
	"loss": 1.7337,
	"step": 726
	},
	{
	"epoch": 0.93,
	"grad_norm": 0.37890625,
	"learning_rate": 6.428571428571429e-05,
	"loss": 1.7255,
	"step": 727
	},
	{
	"epoch": 0.93,
	"grad_norm": 0.345703125,
	"learning_rate": 6.406926406926407e-05,
	"loss": 1.75,
	"step": 728
	},
	{
	"epoch": 0.93,
	"grad_norm": 0.337890625,
	"learning_rate": 6.385281385281385e-05,
	"loss": 1.7166,
	"step": 729
	},
	{
	"epoch": 0.93,
	"grad_norm": 0.3203125,
	"learning_rate": 6.363636363636364e-05,
	"loss": 1.7407,
	"step": 730
	},
	{
	"epoch": 0.94,
	"grad_norm": 0.349609375,
	"learning_rate": 6.341991341991342e-05,
	"loss": 1.6759,
	"step": 731
	},
	{
	"epoch": 0.94,
	"grad_norm": 0.33203125,
	"learning_rate": 6.320346320346321e-05,
	"loss": 1.7485,
	"step": 732
	},
	{
	"epoch": 0.94,
	"grad_norm": 0.37890625,
	"learning_rate": 6.2987012987013e-05,
	"loss": 1.8,
	"step": 733
	},
	{
	"epoch": 0.94,
	"grad_norm": 0.404296875,
	"learning_rate": 6.277056277056277e-05,
	"loss": 1.6966,
	"step": 734
	},
	{
	"epoch": 0.94,
	"grad_norm": 0.37890625,
	"learning_rate": 6.255411255411255e-05,
	"loss": 1.7524,
	"step": 735
	},
	{
	"epoch": 0.94,
	"grad_norm": 0.3515625,
	"learning_rate": 6.233766233766233e-05,
	"loss": 1.7087,
	"step": 736
	},
	{
	"epoch": 0.94,
	"grad_norm": 0.37109375,
	"learning_rate": 6.212121212121213e-05,
	"loss": 1.7963,
	"step": 737
	},
	{
	"epoch": 0.94,
	"grad_norm": 0.361328125,
	"learning_rate": 6.19047619047619e-05,
	"loss": 1.7043,
	"step": 738
	},
	{
	"epoch": 0.95,
	"grad_norm": 0.3046875,
	"learning_rate": 6.16883116883117e-05,
	"loss": 1.6797,
	"step": 739
	},
	{
	"epoch": 0.95,
	"grad_norm": 0.396484375,
	"learning_rate": 6.147186147186147e-05,
	"loss": 1.6791,
	"step": 740
	},
	{
	"epoch": 0.95,
	"grad_norm": 0.330078125,
	"learning_rate": 6.125541125541126e-05,
	"loss": 1.7069,
	"step": 741
	},
	{
	"epoch": 0.95,
	"grad_norm": 0.33984375,
	"learning_rate": 6.103896103896104e-05,
	"loss": 1.6997,
	"step": 742
	},
	{
	"epoch": 0.95,
	"grad_norm": 0.376953125,
	"learning_rate": 6.0822510822510825e-05,
	"loss": 1.7395,
	"step": 743
	},
	{
	"epoch": 0.95,
	"grad_norm": 0.388671875,
	"learning_rate": 6.060606060606061e-05,
	"loss": 1.7289,
	"step": 744
	},
	{
	"epoch": 0.95,
	"grad_norm": 0.375,
	"learning_rate": 6.03896103896104e-05,
	"loss": 1.7963,
	"step": 745
	},
	{
	"epoch": 0.95,
	"grad_norm": 0.318359375,
	"learning_rate": 6.0173160173160184e-05,
	"loss": 1.8322,
	"step": 746
	},
	{
	"epoch": 0.96,
	"grad_norm": 0.3984375,
	"learning_rate": 5.995670995670996e-05,
	"loss": 1.6923,
	"step": 747
	},
	{
	"epoch": 0.96,
	"grad_norm": 0.306640625,
	"learning_rate": 5.9740259740259744e-05,
	"loss": 1.7534,
	"step": 748
	},
	{
	"epoch": 0.96,
	"grad_norm": 0.326171875,
	"learning_rate": 5.9523809523809524e-05,
	"loss": 1.7843,
	"step": 749
	},
	{
	"epoch": 0.96,
	"grad_norm": 0.330078125,
	"learning_rate": 5.930735930735931e-05,
	"loss": 1.7837,
	"step": 750
	},
	{
	"epoch": 0.96,
	"eval_loss": 1.728518009185791,
	"eval_runtime": 125.8224,
	"eval_samples_per_second": 39.739,
	"eval_steps_per_second": 1.248,
	"step": 750
	},
	{
	"epoch": 0.96,
	"grad_norm": 0.34375,
	"learning_rate": 5.90909090909091e-05,
	"loss": 1.8208,
	"step": 751
	},
	{
	"epoch": 0.96,
	"grad_norm": 0.39453125,
	"learning_rate": 5.887445887445888e-05,
	"loss": 1.7173,
	"step": 752
	},
	{
	"epoch": 0.96,
	"grad_norm": 0.37109375,
	"learning_rate": 5.8658008658008656e-05,
	"loss": 1.6923,
	"step": 753
	},
	{
	"epoch": 0.96,
	"grad_norm": 0.41015625,
	"learning_rate": 5.844155844155844e-05,
	"loss": 1.7685,
	"step": 754
	},
	{
	"epoch": 0.97,
	"grad_norm": 0.3125,
	"learning_rate": 5.822510822510823e-05,
	"loss": 1.677,
	"step": 755
	},
	{
	"epoch": 0.97,
	"grad_norm": 0.33203125,
	"learning_rate": 5.800865800865801e-05,
	"loss": 1.6898,
	"step": 756
	},
	{
	"epoch": 0.97,
	"grad_norm": 0.3671875,
	"learning_rate": 5.7792207792207796e-05,
	"loss": 1.7492,
	"step": 757
	},
	{
	"epoch": 0.97,
	"grad_norm": 0.478515625,
	"learning_rate": 5.757575757575758e-05,
	"loss": 1.7804,
	"step": 758
	},
	{
	"epoch": 0.97,
	"grad_norm": 0.388671875,
	"learning_rate": 5.7359307359307355e-05,
	"loss": 1.7285,
	"step": 759
	},
	{
	"epoch": 0.97,
	"grad_norm": 0.34765625,
	"learning_rate": 5.714285714285714e-05,
	"loss": 1.7054,
	"step": 760
	},
	{
	"epoch": 0.97,
	"grad_norm": 0.361328125,
	"learning_rate": 5.692640692640693e-05,
	"loss": 1.7949,
	"step": 761
	},
	{
	"epoch": 0.98,
	"grad_norm": 0.3828125,
	"learning_rate": 5.6709956709956715e-05,
	"loss": 1.6984,
	"step": 762
	},
	{
	"epoch": 0.98,
	"grad_norm": 0.3359375,
	"learning_rate": 5.64935064935065e-05,
	"loss": 1.5885,
	"step": 763
	},
	{
	"epoch": 0.98,
	"grad_norm": 0.373046875,
	"learning_rate": 5.627705627705628e-05,
	"loss": 1.8189,
	"step": 764
	},
	{
	"epoch": 0.98,
	"grad_norm": 0.3359375,
	"learning_rate": 5.606060606060606e-05,
	"loss": 1.7489,
	"step": 765
	},
	{
	"epoch": 0.98,
	"grad_norm": 0.318359375,
	"learning_rate": 5.584415584415584e-05,
	"loss": 1.692,
	"step": 766
	},
	{
	"epoch": 0.98,
	"grad_norm": 0.337890625,
	"learning_rate": 5.562770562770563e-05,
	"loss": 1.7128,
	"step": 767
	},
	{
	"epoch": 0.98,
	"grad_norm": 0.369140625,
	"learning_rate": 5.5411255411255414e-05,
	"loss": 1.7056,
	"step": 768
	},
	{
	"epoch": 0.98,
	"grad_norm": 0.345703125,
	"learning_rate": 5.51948051948052e-05,
	"loss": 1.8224,
	"step": 769
	},
	{
	"epoch": 0.99,
	"grad_norm": 0.310546875,
	"learning_rate": 5.497835497835499e-05,
	"loss": 1.7297,
	"step": 770
	},
	{
	"epoch": 0.99,
	"grad_norm": 0.34375,
	"learning_rate": 5.4761904761904766e-05,
	"loss": 1.7245,
	"step": 771
	},
	{
	"epoch": 0.99,
	"grad_norm": 0.330078125,
	"learning_rate": 5.4545454545454546e-05,
	"loss": 1.7286,
	"step": 772
	},
	{
	"epoch": 0.99,
	"grad_norm": 0.357421875,
	"learning_rate": 5.4329004329004326e-05,
	"loss": 1.616,
	"step": 773
	},
	{
	"epoch": 0.99,
	"grad_norm": 0.361328125,
	"learning_rate": 5.411255411255411e-05,
	"loss": 1.7709,
	"step": 774
	},
	{
	"epoch": 0.99,
	"grad_norm": 0.423828125,
	"learning_rate": 5.38961038961039e-05,
	"loss": 1.7746,
	"step": 775
	},
	{
	"epoch": 0.99,
	"grad_norm": 0.34765625,
	"learning_rate": 5.3679653679653686e-05,
	"loss": 1.7077,
	"step": 776
	},
	{
	"epoch": 0.99,
	"grad_norm": 0.3515625,
	"learning_rate": 5.346320346320347e-05,
	"loss": 1.6281,
	"step": 777
	},
	{
	"epoch": 1.0,
	"grad_norm": 0.361328125,
	"learning_rate": 5.3246753246753245e-05,
	"loss": 1.7438,
	"step": 778
	},
	{
	"epoch": 1.0,
	"grad_norm": 0.353515625,
	"learning_rate": 5.303030303030303e-05,
	"loss": 1.6973,
	"step": 779
	},
	{
	"epoch": 1.0,
	"grad_norm": 0.380859375,
	"learning_rate": 5.281385281385282e-05,
	"loss": 1.7698,
	"step": 780
	},
	{
	"epoch": 1.0,
	"grad_norm": 0.40234375,
	"learning_rate": 5.25974025974026e-05,
	"loss": 1.7707,
	"step": 781
	},
	{
	"epoch": 1.0,
	"grad_norm": 0.375,
	"learning_rate": 5.2380952380952384e-05,
	"loss": 1.7133,
	"step": 782
	},
	{
	"epoch": 1.0,
	"grad_norm": 0.34765625,
	"learning_rate": 5.216450216450217e-05,
	"loss": 1.7593,
	"step": 783
	},
	{
	"epoch": 1.0,
	"grad_norm": 0.349609375,
	"learning_rate": 5.1948051948051944e-05,
	"loss": 1.7286,
	"step": 784
	},
	{
	"epoch": 1.0,
	"grad_norm": 0.33984375,
	"learning_rate": 5.173160173160173e-05,
	"loss": 1.724,
	"step": 785
	},
	{
	"epoch": 1.01,
	"grad_norm": 0.3359375,
	"learning_rate": 5.151515151515152e-05,
	"loss": 1.6978,
	"step": 786
	},
	{
	"epoch": 1.01,
	"grad_norm": 0.32421875,
	"learning_rate": 5.1298701298701304e-05,
	"loss": 1.8041,
	"step": 787
	},
	{
	"epoch": 1.01,
	"grad_norm": 0.287109375,
	"learning_rate": 5.108225108225109e-05,
	"loss": 1.7141,
	"step": 788
	},
	{
	"epoch": 1.01,
	"grad_norm": 0.326171875,
	"learning_rate": 5.086580086580087e-05,
	"loss": 1.6585,
	"step": 789
	},
	{
	"epoch": 1.01,
	"grad_norm": 0.33203125,
	"learning_rate": 5.064935064935065e-05,
	"loss": 1.733,
	"step": 790
	},
	{
	"epoch": 1.01,
	"grad_norm": 0.388671875,
	"learning_rate": 5.043290043290043e-05,
	"loss": 1.7602,
	"step": 791
	},
	{
	"epoch": 1.01,
	"grad_norm": 0.310546875,
	"learning_rate": 5.0216450216450216e-05,
	"loss": 1.804,
	"step": 792
	},
	{
	"epoch": 1.01,
	"grad_norm": 0.302734375,
	"learning_rate": 5e-05,
	"loss": 1.6816,
	"step": 793
	},
	{
	"epoch": 1.02,
	"grad_norm": 0.34375,
	"learning_rate": 4.978354978354979e-05,
	"loss": 1.7515,
	"step": 794
	},
	{
	"epoch": 1.02,
	"grad_norm": 0.33203125,
	"learning_rate": 4.956709956709957e-05,
	"loss": 1.6996,
	"step": 795
	},
	{
	"epoch": 1.02,
	"grad_norm": 0.333984375,
	"learning_rate": 4.9350649350649355e-05,
	"loss": 1.7045,
	"step": 796
	},
	{
	"epoch": 1.02,
	"grad_norm": 0.357421875,
	"learning_rate": 4.9134199134199135e-05,
	"loss": 1.7026,
	"step": 797
	},
	{
	"epoch": 1.02,
	"grad_norm": 0.30078125,
	"learning_rate": 4.8917748917748915e-05,
	"loss": 1.6444,
	"step": 798
	},
	{
	"epoch": 1.02,
	"grad_norm": 0.345703125,
	"learning_rate": 4.87012987012987e-05,
	"loss": 1.7705,
	"step": 799
	},
	{
	"epoch": 1.02,
	"grad_norm": 0.35546875,
	"learning_rate": 4.848484848484849e-05,
	"loss": 1.7178,
	"step": 800
	},
	{
	"epoch": 1.02,
	"eval_loss": 1.7301437854766846,
	"eval_runtime": 124.4902,
	"eval_samples_per_second": 40.164,
	"eval_steps_per_second": 1.261,
	"step": 800
	},
	{
	"epoch": 1.02,
	"grad_norm": 0.33203125,
	"learning_rate": 4.826839826839827e-05,
	"loss": 1.7434,
	"step": 801
	},
	{
	"epoch": 1.03,
	"grad_norm": 0.365234375,
	"learning_rate": 4.8051948051948054e-05,
	"loss": 1.7148,
	"step": 802
	},
	{
	"epoch": 1.03,
	"grad_norm": 0.330078125,
	"learning_rate": 4.783549783549784e-05,
	"loss": 1.7685,
	"step": 803
	},
	{
	"epoch": 1.03,
	"grad_norm": 0.3515625,
	"learning_rate": 4.761904761904762e-05,
	"loss": 1.8048,
	"step": 804
	},
	{
	"epoch": 1.03,
	"grad_norm": 0.35546875,
	"learning_rate": 4.740259740259741e-05,
	"loss": 1.7467,
	"step": 805
	},
	{
	"epoch": 1.03,
	"grad_norm": 0.33203125,
	"learning_rate": 4.718614718614719e-05,
	"loss": 1.6937,
	"step": 806
	},
	{
	"epoch": 1.03,
	"grad_norm": 0.306640625,
	"learning_rate": 4.696969696969697e-05,
	"loss": 1.6432,
	"step": 807
	},
	{
	"epoch": 1.03,
	"grad_norm": 0.3515625,
	"learning_rate": 4.675324675324675e-05,
	"loss": 1.6467,
	"step": 808
	},
	{
	"epoch": 1.04,
	"grad_norm": 0.373046875,
	"learning_rate": 4.653679653679654e-05,
	"loss": 1.7572,
	"step": 809
	},
	{
	"epoch": 1.04,
	"grad_norm": 0.365234375,
	"learning_rate": 4.6320346320346326e-05,
	"loss": 1.7623,
	"step": 810
	},
	{
	"epoch": 1.04,
	"grad_norm": 0.333984375,
	"learning_rate": 4.6103896103896106e-05,
	"loss": 1.6898,
	"step": 811
	},
	{
	"epoch": 1.04,
	"grad_norm": 0.412109375,
	"learning_rate": 4.588744588744589e-05,
	"loss": 1.6788,
	"step": 812
	},
	{
	"epoch": 1.04,
	"grad_norm": 0.373046875,
	"learning_rate": 4.567099567099568e-05,
	"loss": 1.6631,
	"step": 813
	},
	{
	"epoch": 1.04,
	"grad_norm": 0.33203125,
	"learning_rate": 4.545454545454546e-05,
	"loss": 1.6311,
	"step": 814
	},
	{
	"epoch": 1.04,
	"grad_norm": 0.408203125,
	"learning_rate": 4.523809523809524e-05,
	"loss": 1.6636,
	"step": 815
	},
	{
	"epoch": 1.04,
	"grad_norm": 0.357421875,
	"learning_rate": 4.5021645021645025e-05,
	"loss": 1.8099,
	"step": 816
	},
	{
	"epoch": 1.05,
	"grad_norm": 0.359375,
	"learning_rate": 4.4805194805194805e-05,
	"loss": 1.6849,
	"step": 817
	},
	{
	"epoch": 1.05,
	"grad_norm": 0.36328125,
	"learning_rate": 4.458874458874459e-05,
	"loss": 1.6601,
	"step": 818
	},
	{
	"epoch": 1.05,
	"grad_norm": 0.365234375,
	"learning_rate": 4.437229437229438e-05,
	"loss": 1.7999,
	"step": 819
	},
	{
	"epoch": 1.05,
	"grad_norm": 0.396484375,
	"learning_rate": 4.415584415584416e-05,
	"loss": 1.6323,
	"step": 820
	},
	{
	"epoch": 1.05,
	"grad_norm": 0.3203125,
	"learning_rate": 4.3939393939393944e-05,
	"loss": 1.7141,
	"step": 821
	},
	{
	"epoch": 1.05,
	"grad_norm": 0.3359375,
	"learning_rate": 4.3722943722943724e-05,
	"loss": 1.7166,
	"step": 822
	},
	{
	"epoch": 1.05,
	"grad_norm": 0.421875,
	"learning_rate": 4.3506493506493503e-05,
	"loss": 1.6969,
	"step": 823
	},
	{
	"epoch": 1.05,
	"grad_norm": 0.333984375,
	"learning_rate": 4.329004329004329e-05,
	"loss": 1.674,
	"step": 824
	},
	{
	"epoch": 1.06,
	"grad_norm": 0.32421875,
	"learning_rate": 4.3073593073593077e-05,
	"loss": 1.6893,
	"step": 825
	},
	{
	"epoch": 1.06,
	"grad_norm": 0.36328125,
	"learning_rate": 4.2857142857142856e-05,
	"loss": 1.6985,
	"step": 826
	},
	{
	"epoch": 1.06,
	"grad_norm": 0.328125,
	"learning_rate": 4.264069264069264e-05,
	"loss": 1.7245,
	"step": 827
	},
	{
	"epoch": 1.06,
	"grad_norm": 0.35546875,
	"learning_rate": 4.242424242424243e-05,
	"loss": 1.7277,
	"step": 828
	},
	{
	"epoch": 1.06,
	"grad_norm": 0.3671875,
	"learning_rate": 4.220779220779221e-05,
	"loss": 1.6765,
	"step": 829
	},
	{
	"epoch": 1.06,
	"grad_norm": 0.36328125,
	"learning_rate": 4.1991341991341996e-05,
	"loss": 1.7863,
	"step": 830
	},
	{
	"epoch": 1.06,
	"grad_norm": 0.33984375,
	"learning_rate": 4.1774891774891775e-05,
	"loss": 1.6689,
	"step": 831
	},
	{
	"epoch": 1.06,
	"grad_norm": 0.365234375,
	"learning_rate": 4.155844155844156e-05,
	"loss": 1.7957,
	"step": 832
	},
	{
	"epoch": 1.07,
	"grad_norm": 0.298828125,
	"learning_rate": 4.134199134199134e-05,
	"loss": 1.6454,
	"step": 833
	},
	{
	"epoch": 1.07,
	"grad_norm": 0.341796875,
	"learning_rate": 4.112554112554113e-05,
	"loss": 1.6748,
	"step": 834
	},
	{
	"epoch": 1.07,
	"grad_norm": 0.380859375,
	"learning_rate": 4.0909090909090915e-05,
	"loss": 1.8057,
	"step": 835
	},
	{
	"epoch": 1.07,
	"grad_norm": 0.38671875,
	"learning_rate": 4.0692640692640695e-05,
	"loss": 1.7343,
	"step": 836
	},
	{
	"epoch": 1.07,
	"grad_norm": 0.353515625,
	"learning_rate": 4.047619047619048e-05,
	"loss": 1.6868,
	"step": 837
	},
	{
	"epoch": 1.07,
	"grad_norm": 0.365234375,
	"learning_rate": 4.025974025974026e-05,
	"loss": 1.6663,
	"step": 838
	},
	{
	"epoch": 1.07,
	"grad_norm": 0.3359375,
	"learning_rate": 4.004329004329004e-05,
	"loss": 1.6779,
	"step": 839
	},
	{
	"epoch": 1.07,
	"grad_norm": 0.353515625,
	"learning_rate": 3.982683982683983e-05,
	"loss": 1.7324,
	"step": 840
	},
	{
	"epoch": 1.08,
	"grad_norm": 0.458984375,
	"learning_rate": 3.9610389610389614e-05,
	"loss": 1.7377,
	"step": 841
	},
	{
	"epoch": 1.08,
	"grad_norm": 0.3515625,
	"learning_rate": 3.939393939393939e-05,
	"loss": 1.7468,
	"step": 842
	},
	{
	"epoch": 1.08,
	"grad_norm": 0.3359375,
	"learning_rate": 3.917748917748918e-05,
	"loss": 1.703,
	"step": 843
	},
	{
	"epoch": 1.08,
	"grad_norm": 0.38671875,
	"learning_rate": 3.8961038961038966e-05,
	"loss": 1.7124,
	"step": 844
	},
	{
	"epoch": 1.08,
	"grad_norm": 0.341796875,
	"learning_rate": 3.8744588744588746e-05,
	"loss": 1.6444,
	"step": 845
	},
	{
	"epoch": 1.08,
	"grad_norm": 0.341796875,
	"learning_rate": 3.852813852813853e-05,
	"loss": 1.7465,
	"step": 846
	},
	{
	"epoch": 1.08,
	"grad_norm": 0.41015625,
	"learning_rate": 3.831168831168831e-05,
	"loss": 1.7153,
	"step": 847
	},
	{
	"epoch": 1.09,
	"grad_norm": 0.359375,
	"learning_rate": 3.809523809523809e-05,
	"loss": 1.756,
	"step": 848
	},
	{
	"epoch": 1.09,
	"grad_norm": 0.37890625,
	"learning_rate": 3.787878787878788e-05,
	"loss": 1.6992,
	"step": 849
	},
	{
	"epoch": 1.09,
	"grad_norm": 0.33203125,
	"learning_rate": 3.7662337662337665e-05,
	"loss": 1.7487,
	"step": 850
	},
	{
	"epoch": 1.09,
	"eval_loss": 1.725529670715332,
	"eval_runtime": 124.4606,
	"eval_samples_per_second": 40.173,
	"eval_steps_per_second": 1.261,
	"step": 850
	},
	{
	"epoch": 1.09,
	"grad_norm": 0.3828125,
	"learning_rate": 3.7445887445887445e-05,
	"loss": 1.7665,
	"step": 851
	},
	{
	"epoch": 1.09,
	"grad_norm": 0.349609375,
	"learning_rate": 3.722943722943723e-05,
	"loss": 1.7517,
	"step": 852
	},
	{
	"epoch": 1.09,
	"grad_norm": 0.353515625,
	"learning_rate": 3.701298701298702e-05,
	"loss": 1.7896,
	"step": 853
	},
	{
	"epoch": 1.09,
	"grad_norm": 0.37109375,
	"learning_rate": 3.67965367965368e-05,
	"loss": 1.7554,
	"step": 854
	},
	{
	"epoch": 1.09,
	"grad_norm": 0.3515625,
	"learning_rate": 3.6580086580086584e-05,
	"loss": 1.7757,
	"step": 855
	},
	{
	"epoch": 1.1,
	"grad_norm": 0.390625,
	"learning_rate": 3.6363636363636364e-05,
	"loss": 1.7977,
	"step": 856
	},
	{
	"epoch": 1.1,
	"grad_norm": 0.376953125,
	"learning_rate": 3.6147186147186144e-05,
	"loss": 1.7066,
	"step": 857
	},
	{
	"epoch": 1.1,
	"grad_norm": 0.3671875,
	"learning_rate": 3.593073593073593e-05,
	"loss": 1.7159,
	"step": 858
	},
	{
	"epoch": 1.1,
	"grad_norm": 0.3515625,
	"learning_rate": 3.571428571428572e-05,
	"loss": 1.5972,
	"step": 859
	},
	{
	"epoch": 1.1,
	"grad_norm": 0.365234375,
	"learning_rate": 3.5497835497835503e-05,
	"loss": 1.7102,
	"step": 860
	},
	{
	"epoch": 1.1,
	"grad_norm": 0.369140625,
	"learning_rate": 3.528138528138528e-05,
	"loss": 1.661,
	"step": 861
	},
	{
	"epoch": 1.1,
	"grad_norm": 0.47265625,
	"learning_rate": 3.506493506493507e-05,
	"loss": 1.691,
	"step": 862
	},
	{
	"epoch": 1.1,
	"grad_norm": 0.3125,
	"learning_rate": 3.484848484848485e-05,
	"loss": 1.7247,
	"step": 863
	},
	{
	"epoch": 1.11,
	"grad_norm": 0.390625,
	"learning_rate": 3.463203463203463e-05,
	"loss": 1.6295,
	"step": 864
	},
	{
	"epoch": 1.11,
	"grad_norm": 0.361328125,
	"learning_rate": 3.4415584415584416e-05,
	"loss": 1.7508,
	"step": 865
	},
	{
	"epoch": 1.11,
	"grad_norm": 0.330078125,
	"learning_rate": 3.41991341991342e-05,
	"loss": 1.6085,
	"step": 866
	},
	{
	"epoch": 1.11,
	"grad_norm": 0.35546875,
	"learning_rate": 3.398268398268398e-05,
	"loss": 1.7507,
	"step": 867
	},
	{
	"epoch": 1.11,
	"grad_norm": 0.296875,
	"learning_rate": 3.376623376623377e-05,
	"loss": 1.7143,
	"step": 868
	},
	{
	"epoch": 1.11,
	"grad_norm": 0.34765625,
	"learning_rate": 3.3549783549783555e-05,
	"loss": 1.7195,
	"step": 869
	},
	{
	"epoch": 1.11,
	"grad_norm": 0.3984375,
	"learning_rate": 3.3333333333333335e-05,
	"loss": 1.7606,
	"step": 870
	},
	{
	"epoch": 1.11,
	"grad_norm": 0.3515625,
	"learning_rate": 3.311688311688312e-05,
	"loss": 1.7038,
	"step": 871
	},
	{
	"epoch": 1.12,
	"grad_norm": 0.33984375,
	"learning_rate": 3.29004329004329e-05,
	"loss": 1.7545,
	"step": 872
	},
	{
	"epoch": 1.12,
	"grad_norm": 0.306640625,
	"learning_rate": 3.268398268398268e-05,
	"loss": 1.6866,
	"step": 873
	},
	{
	"epoch": 1.12,
	"grad_norm": 0.404296875,
	"learning_rate": 3.246753246753247e-05,
	"loss": 1.683,
	"step": 874
	},
	{
	"epoch": 1.12,
	"grad_norm": 0.345703125,
	"learning_rate": 3.2251082251082254e-05,
	"loss": 1.7329,
	"step": 875
	},
	{
	"epoch": 1.12,
	"grad_norm": 0.341796875,
	"learning_rate": 3.2034632034632034e-05,
	"loss": 1.7675,
	"step": 876
	},
	{
	"epoch": 1.12,
	"grad_norm": 0.333984375,
	"learning_rate": 3.181818181818182e-05,
	"loss": 1.6585,
	"step": 877
	},
	{
	"epoch": 1.12,
	"grad_norm": 0.34375,
	"learning_rate": 3.160173160173161e-05,
	"loss": 1.7628,
	"step": 878
	},
	{
	"epoch": 1.12,
	"grad_norm": 0.384765625,
	"learning_rate": 3.1385281385281387e-05,
	"loss": 1.6784,
	"step": 879
	},
	{
	"epoch": 1.13,
	"grad_norm": 0.345703125,
	"learning_rate": 3.1168831168831166e-05,
	"loss": 1.7177,
	"step": 880
	},
	{
	"epoch": 1.13,
	"grad_norm": 0.34375,
	"learning_rate": 3.095238095238095e-05,
	"loss": 1.6945,
	"step": 881
	},
	{
	"epoch": 1.13,
	"grad_norm": 0.380859375,
	"learning_rate": 3.073593073593073e-05,
	"loss": 1.7096,
	"step": 882
	},
	{
	"epoch": 1.13,
	"grad_norm": 0.361328125,
	"learning_rate": 3.051948051948052e-05,
	"loss": 1.7545,
	"step": 883
	},
	{
	"epoch": 1.13,
	"grad_norm": 0.369140625,
	"learning_rate": 3.0303030303030306e-05,
	"loss": 1.7122,
	"step": 884
	},
	{
	"epoch": 1.13,
	"grad_norm": 0.36328125,
	"learning_rate": 3.0086580086580092e-05,
	"loss": 1.6433,
	"step": 885
	},
	{
	"epoch": 1.13,
	"grad_norm": 0.37109375,
	"learning_rate": 2.9870129870129872e-05,
	"loss": 1.6908,
	"step": 886
	},
	{
	"epoch": 1.13,
	"grad_norm": 0.326171875,
	"learning_rate": 2.9653679653679655e-05,
	"loss": 1.8096,
	"step": 887
	},
	{
	"epoch": 1.14,
	"grad_norm": 0.375,
	"learning_rate": 2.943722943722944e-05,
	"loss": 1.5972,
	"step": 888
	},
	{
	"epoch": 1.14,
	"grad_norm": 0.345703125,
	"learning_rate": 2.922077922077922e-05,
	"loss": 1.7858,
	"step": 889
	},
	{
	"epoch": 1.14,
	"grad_norm": 0.326171875,
	"learning_rate": 2.9004329004329005e-05,
	"loss": 1.7353,
	"step": 890
	},
	{
	"epoch": 1.14,
	"grad_norm": 0.35546875,
	"learning_rate": 2.878787878787879e-05,
	"loss": 1.7572,
	"step": 891
	},
	{
	"epoch": 1.14,
	"grad_norm": 0.361328125,
	"learning_rate": 2.857142857142857e-05,
	"loss": 1.7268,
	"step": 892
	},
	{
	"epoch": 1.14,
	"grad_norm": 0.37109375,
	"learning_rate": 2.8354978354978357e-05,
	"loss": 1.7919,
	"step": 893
	},
	{
	"epoch": 1.14,
	"grad_norm": 0.37109375,
	"learning_rate": 2.813852813852814e-05,
	"loss": 1.735,
	"step": 894
	},
	{
	"epoch": 1.15,
	"grad_norm": 0.32421875,
	"learning_rate": 2.792207792207792e-05,
	"loss": 1.7174,
	"step": 895
	},
	{
	"epoch": 1.15,
	"grad_norm": 0.37890625,
	"learning_rate": 2.7705627705627707e-05,
	"loss": 1.6896,
	"step": 896
	},
	{
	"epoch": 1.15,
	"grad_norm": 0.318359375,
	"learning_rate": 2.7489177489177493e-05,
	"loss": 1.7908,
	"step": 897
	},
	{
	"epoch": 1.15,
	"grad_norm": 0.33984375,
	"learning_rate": 2.7272727272727273e-05,
	"loss": 1.7684,
	"step": 898
	},
	{
	"epoch": 1.15,
	"grad_norm": 0.33203125,
	"learning_rate": 2.7056277056277056e-05,
	"loss": 1.7138,
	"step": 899
	},
	{
	"epoch": 1.15,
	"grad_norm": 0.34375,
	"learning_rate": 2.6839826839826843e-05,
	"loss": 1.7012,
	"step": 900
	},
	{
	"epoch": 1.15,
	"eval_loss": 1.725927710533142,
	"eval_runtime": 125.737,
	"eval_samples_per_second": 39.766,
	"eval_steps_per_second": 1.249,
	"step": 900
	},
	{
	"epoch": 1.15,
	"grad_norm": 0.35546875,
	"learning_rate": 2.6623376623376623e-05,
	"loss": 1.7415,
	"step": 901
	},
	{
	"epoch": 1.15,
	"grad_norm": 0.365234375,
	"learning_rate": 2.640692640692641e-05,
	"loss": 1.7195,
	"step": 902
	},
	{
	"epoch": 1.16,
	"grad_norm": 0.337890625,
	"learning_rate": 2.6190476190476192e-05,
	"loss": 1.7539,
	"step": 903
	},
	{
	"epoch": 1.16,
	"grad_norm": 0.369140625,
	"learning_rate": 2.5974025974025972e-05,
	"loss": 1.6218,
	"step": 904
	},
	{
	"epoch": 1.16,
	"grad_norm": 0.3125,
	"learning_rate": 2.575757575757576e-05,
	"loss": 1.6949,
	"step": 905
	},
	{
	"epoch": 1.16,
	"grad_norm": 0.361328125,
	"learning_rate": 2.5541125541125545e-05,
	"loss": 1.7539,
	"step": 906
	},
	{
	"epoch": 1.16,
	"grad_norm": 0.345703125,
	"learning_rate": 2.5324675324675325e-05,
	"loss": 1.7398,
	"step": 907
	},
	{
	"epoch": 1.16,
	"grad_norm": 0.3515625,
	"learning_rate": 2.5108225108225108e-05,
	"loss": 1.7924,
	"step": 908
	},
	{
	"epoch": 1.16,
	"grad_norm": 0.34765625,
	"learning_rate": 2.4891774891774894e-05,
	"loss": 1.7182,
	"step": 909
	},
	{
	"epoch": 1.16,
	"grad_norm": 0.412109375,
	"learning_rate": 2.4675324675324678e-05,
	"loss": 1.7197,
	"step": 910
	},
	{
	"epoch": 1.17,
	"grad_norm": 0.310546875,
	"learning_rate": 2.4458874458874457e-05,
	"loss": 1.6976,
	"step": 911
	},
	{
	"epoch": 1.17,
	"grad_norm": 0.369140625,
	"learning_rate": 2.4242424242424244e-05,
	"loss": 1.7478,
	"step": 912
	},
	{
	"epoch": 1.17,
	"grad_norm": 0.333984375,
	"learning_rate": 2.4025974025974027e-05,
	"loss": 1.6187,
	"step": 913
	},
	{
	"epoch": 1.17,
	"grad_norm": 0.345703125,
	"learning_rate": 2.380952380952381e-05,
	"loss": 1.7191,
	"step": 914
	},
	{
	"epoch": 1.17,
	"grad_norm": 0.439453125,
	"learning_rate": 2.3593073593073593e-05,
	"loss": 1.6616,
	"step": 915
	},
	{
	"epoch": 1.17,
	"grad_norm": 0.328125,
	"learning_rate": 2.3376623376623376e-05,
	"loss": 1.6345,
	"step": 916
	},
	{
	"epoch": 1.17,
	"grad_norm": 0.447265625,
	"learning_rate": 2.3160173160173163e-05,
	"loss": 1.7367,
	"step": 917
	},
	{
	"epoch": 1.17,
	"grad_norm": 0.40625,
	"learning_rate": 2.2943722943722946e-05,
	"loss": 1.7257,
	"step": 918
	},
	{
	"epoch": 1.18,
	"grad_norm": 0.35546875,
	"learning_rate": 2.272727272727273e-05,
	"loss": 1.6999,
	"step": 919
	},
	{
	"epoch": 1.18,
	"grad_norm": 0.37109375,
	"learning_rate": 2.2510822510822512e-05,
	"loss": 1.6337,
	"step": 920
	},
	{
	"epoch": 1.18,
	"grad_norm": 0.349609375,
	"learning_rate": 2.2294372294372296e-05,
	"loss": 1.7632,
	"step": 921
	},
	{
	"epoch": 1.18,
	"grad_norm": 0.341796875,
	"learning_rate": 2.207792207792208e-05,
	"loss": 1.7686,
	"step": 922
	},
	{
	"epoch": 1.18,
	"grad_norm": 0.3671875,
	"learning_rate": 2.1861471861471862e-05,
	"loss": 1.6897,
	"step": 923
	},
	{
	"epoch": 1.18,
	"grad_norm": 0.365234375,
	"learning_rate": 2.1645021645021645e-05,
	"loss": 1.8555,
	"step": 924
	},
	{
	"epoch": 1.18,
	"grad_norm": 0.408203125,
	"learning_rate": 2.1428571428571428e-05,
	"loss": 1.7255,
	"step": 925
	},
	{
	"epoch": 1.18,
	"grad_norm": 0.3359375,
	"learning_rate": 2.1212121212121215e-05,
	"loss": 1.6774,
	"step": 926
	},
	{
	"epoch": 1.19,
	"grad_norm": 0.39453125,
	"learning_rate": 2.0995670995670998e-05,
	"loss": 1.7524,
	"step": 927
	},
	{
	"epoch": 1.19,
	"grad_norm": 0.328125,
	"learning_rate": 2.077922077922078e-05,
	"loss": 1.6569,
	"step": 928
	},
	{
	"epoch": 1.19,
	"grad_norm": 0.5078125,
	"learning_rate": 2.0562770562770564e-05,
	"loss": 1.7028,
	"step": 929
	},
	{
	"epoch": 1.19,
	"grad_norm": 0.349609375,
	"learning_rate": 2.0346320346320347e-05,
	"loss": 1.7377,
	"step": 930
	},
	{
	"epoch": 1.19,
	"grad_norm": 0.341796875,
	"learning_rate": 2.012987012987013e-05,
	"loss": 1.6855,
	"step": 931
	},
	{
	"epoch": 1.19,
	"grad_norm": 0.326171875,
	"learning_rate": 1.9913419913419914e-05,
	"loss": 1.6917,
	"step": 932
	},
	{
	"epoch": 1.19,
	"grad_norm": 0.341796875,
	"learning_rate": 1.9696969696969697e-05,
	"loss": 1.714,
	"step": 933
	},
	{
	"epoch": 1.2,
	"grad_norm": 0.333984375,
	"learning_rate": 1.9480519480519483e-05,
	"loss": 1.7653,
	"step": 934
	},
	{
	"epoch": 1.2,
	"grad_norm": 0.337890625,
	"learning_rate": 1.9264069264069266e-05,
	"loss": 1.6973,
	"step": 935
	},
	{
	"epoch": 1.2,
	"grad_norm": 0.34765625,
	"learning_rate": 1.9047619047619046e-05,
	"loss": 1.7066,
	"step": 936
	},
	{
	"epoch": 1.2,
	"grad_norm": 0.34375,
	"learning_rate": 1.8831168831168833e-05,
	"loss": 1.6812,
	"step": 937
	},
	{
	"epoch": 1.2,
	"grad_norm": 0.341796875,
	"learning_rate": 1.8614718614718616e-05,
	"loss": 1.7114,
	"step": 938
	},
	{
	"epoch": 1.2,
	"grad_norm": 0.3984375,
	"learning_rate": 1.83982683982684e-05,
	"loss": 1.7109,
	"step": 939
	},
	{
	"epoch": 1.2,
	"grad_norm": 0.375,
	"learning_rate": 1.8181818181818182e-05,
	"loss": 1.6037,
	"step": 940
	},
	{
	"epoch": 1.2,
	"grad_norm": 0.34765625,
	"learning_rate": 1.7965367965367965e-05,
	"loss": 1.7588,
	"step": 941
	},
	{
	"epoch": 1.21,
	"grad_norm": 0.412109375,
	"learning_rate": 1.7748917748917752e-05,
	"loss": 1.7093,
	"step": 942
	},
	{
	"epoch": 1.21,
	"grad_norm": 0.353515625,
	"learning_rate": 1.7532467532467535e-05,
	"loss": 1.7642,
	"step": 943
	},
	{
	"epoch": 1.21,
	"grad_norm": 0.466796875,
	"learning_rate": 1.7316017316017315e-05,
	"loss": 1.724,
	"step": 944
	},
	{
	"epoch": 1.21,
	"grad_norm": 0.34375,
	"learning_rate": 1.70995670995671e-05,
	"loss": 1.6136,
	"step": 945
	},
	{
	"epoch": 1.21,
	"grad_norm": 0.431640625,
	"learning_rate": 1.6883116883116884e-05,
	"loss": 1.6256,
	"step": 946
	},
	{
	"epoch": 1.21,
	"grad_norm": 0.400390625,
	"learning_rate": 1.6666666666666667e-05,
	"loss": 1.6878,
	"step": 947
	},
	{
	"epoch": 1.21,
	"grad_norm": 0.388671875,
	"learning_rate": 1.645021645021645e-05,
	"loss": 1.7281,
	"step": 948
	},
	{
	"epoch": 1.21,
	"grad_norm": 0.3359375,
	"learning_rate": 1.6233766233766234e-05,
	"loss": 1.7021,
	"step": 949
	},
	{
	"epoch": 1.22,
	"grad_norm": 0.375,
	"learning_rate": 1.6017316017316017e-05,
	"loss": 1.7982,
	"step": 950
	},
	{
	"epoch": 1.22,
	"eval_loss": 1.7258570194244385,
	"eval_runtime": 124.9465,
	"eval_samples_per_second": 40.017,
	"eval_steps_per_second": 1.257,
	"step": 950
	},
	{
	"epoch": 1.22,
	"grad_norm": 0.400390625,
	"learning_rate": 1.5800865800865803e-05,
	"loss": 1.641,
	"step": 951
	},
	{
	"epoch": 1.22,
	"grad_norm": 0.357421875,
	"learning_rate": 1.5584415584415583e-05,
	"loss": 1.6773,
	"step": 952
	},
	{
	"epoch": 1.22,
	"grad_norm": 0.341796875,
	"learning_rate": 1.5367965367965366e-05,
	"loss": 1.654,
	"step": 953
	},
	{
	"epoch": 1.22,
	"grad_norm": 0.380859375,
	"learning_rate": 1.5151515151515153e-05,
	"loss": 1.7277,
	"step": 954
	},
	{
	"epoch": 1.22,
	"grad_norm": 0.359375,
	"learning_rate": 1.4935064935064936e-05,
	"loss": 1.6995,
	"step": 955
	},
	{
	"epoch": 1.22,
	"grad_norm": 0.359375,
	"learning_rate": 1.471861471861472e-05,
	"loss": 1.7112,
	"step": 956
	},
	{
	"epoch": 1.22,
	"grad_norm": 0.35546875,
	"learning_rate": 1.4502164502164502e-05,
	"loss": 1.7486,
	"step": 957
	},
	{
	"epoch": 1.23,
	"grad_norm": 0.40234375,
	"learning_rate": 1.4285714285714285e-05,
	"loss": 1.7627,
	"step": 958
	},
	{
	"epoch": 1.23,
	"grad_norm": 0.34375,
	"learning_rate": 1.406926406926407e-05,
	"loss": 1.6709,
	"step": 959
	},
	{
	"epoch": 1.23,
	"grad_norm": 0.345703125,
	"learning_rate": 1.3852813852813853e-05,
	"loss": 1.7112,
	"step": 960
	},
	{
	"epoch": 1.23,
	"grad_norm": 0.375,
	"learning_rate": 1.3636363636363637e-05,
	"loss": 1.7299,
	"step": 961
	},
	{
	"epoch": 1.23,
	"grad_norm": 0.390625,
	"learning_rate": 1.3419913419913421e-05,
	"loss": 1.6656,
	"step": 962
	},
	{
	"epoch": 1.23,
	"grad_norm": 0.37890625,
	"learning_rate": 1.3203463203463205e-05,
	"loss": 1.6621,
	"step": 963
	},
	{
	"epoch": 1.23,
	"grad_norm": 0.33203125,
	"learning_rate": 1.2987012987012986e-05,
	"loss": 1.6751,
	"step": 964
	},
	{
	"epoch": 1.23,
	"grad_norm": 0.48828125,
	"learning_rate": 1.2770562770562773e-05,
	"loss": 1.7,
	"step": 965
	},
	{
	"epoch": 1.24,
	"grad_norm": 0.345703125,
	"learning_rate": 1.2554112554112554e-05,
	"loss": 1.7547,
	"step": 966
	},
	{
	"epoch": 1.24,
	"grad_norm": 0.357421875,
	"learning_rate": 1.2337662337662339e-05,
	"loss": 1.6858,
	"step": 967
	},
	{
	"epoch": 1.24,
	"grad_norm": 0.33984375,
	"learning_rate": 1.2121212121212122e-05,
	"loss": 1.5868,
	"step": 968
	},
	{
	"epoch": 1.24,
	"grad_norm": 0.3359375,
	"learning_rate": 1.1904761904761905e-05,
	"loss": 1.7628,
	"step": 969
	},
	{
	"epoch": 1.24,
	"grad_norm": 0.314453125,
	"learning_rate": 1.1688311688311688e-05,
	"loss": 1.6664,
	"step": 970
	},
	{
	"epoch": 1.24,
	"grad_norm": 0.33984375,
	"learning_rate": 1.1471861471861473e-05,
	"loss": 1.7387,
	"step": 971
	},
	{
	"epoch": 1.24,
	"grad_norm": 0.392578125,
	"learning_rate": 1.1255411255411256e-05,
	"loss": 1.7765,
	"step": 972
	},
	{
	"epoch": 1.25,
	"grad_norm": 0.341796875,
	"learning_rate": 1.103896103896104e-05,
	"loss": 1.7887,
	"step": 973
	},
	{
	"epoch": 1.25,
	"grad_norm": 0.341796875,
	"learning_rate": 1.0822510822510823e-05,
	"loss": 1.7143,
	"step": 974
	},
	{
	"epoch": 1.25,
	"grad_norm": 0.33984375,
	"learning_rate": 1.0606060606060607e-05,
	"loss": 1.6573,
	"step": 975
	},
	{
	"epoch": 1.25,
	"grad_norm": 0.33203125,
	"learning_rate": 1.038961038961039e-05,
	"loss": 1.7191,
	"step": 976
	},
	{
	"epoch": 1.25,
	"grad_norm": 0.337890625,
	"learning_rate": 1.0173160173160174e-05,
	"loss": 1.7437,
	"step": 977
	},
	{
	"epoch": 1.25,
	"grad_norm": 0.36328125,
	"learning_rate": 9.956709956709957e-06,
	"loss": 1.8012,
	"step": 978
	},
	{
	"epoch": 1.25,
	"grad_norm": 0.3515625,
	"learning_rate": 9.740259740259742e-06,
	"loss": 1.6769,
	"step": 979
	},
	{
	"epoch": 1.25,
	"grad_norm": 0.326171875,
	"learning_rate": 9.523809523809523e-06,
	"loss": 1.7836,
	"step": 980
	},
	{
	"epoch": 1.26,
	"grad_norm": 0.375,
	"learning_rate": 9.307359307359308e-06,
	"loss": 1.6654,
	"step": 981
	},
	{
	"epoch": 1.26,
	"grad_norm": 0.37109375,
	"learning_rate": 9.090909090909091e-06,
	"loss": 1.7367,
	"step": 982
	},
	{
	"epoch": 1.26,
	"grad_norm": 0.353515625,
	"learning_rate": 8.874458874458876e-06,
	"loss": 1.6735,
	"step": 983
	},
	{
	"epoch": 1.26,
	"grad_norm": 0.32421875,
	"learning_rate": 8.658008658008657e-06,
	"loss": 1.5795,
	"step": 984
	},
	{
	"epoch": 1.26,
	"grad_norm": 0.333984375,
	"learning_rate": 8.441558441558442e-06,
	"loss": 1.6784,
	"step": 985
	},
	{
	"epoch": 1.26,
	"grad_norm": 0.33203125,
	"learning_rate": 8.225108225108225e-06,
	"loss": 1.6743,
	"step": 986
	},
	{
	"epoch": 1.26,
	"grad_norm": 0.3671875,
	"learning_rate": 8.008658008658008e-06,
	"loss": 1.6877,
	"step": 987
	},
	{
	"epoch": 1.26,
	"grad_norm": 0.3125,
	"learning_rate": 7.792207792207792e-06,
	"loss": 1.7913,
	"step": 988
	},
	{
	"epoch": 1.27,
	"grad_norm": 0.38671875,
	"learning_rate": 7.5757575757575764e-06,
	"loss": 1.6147,
	"step": 989
	},
	{
	"epoch": 1.27,
	"grad_norm": 0.33203125,
	"learning_rate": 7.35930735930736e-06,
	"loss": 1.7615,
	"step": 990
	},
	{
	"epoch": 1.27,
	"grad_norm": 0.3515625,
	"learning_rate": 7.142857142857143e-06,
	"loss": 1.7406,
	"step": 991
	},
	{
	"epoch": 1.27,
	"grad_norm": 0.427734375,
	"learning_rate": 6.926406926406927e-06,
	"loss": 1.6287,
	"step": 992
	},
	{
	"epoch": 1.27,
	"grad_norm": 0.345703125,
	"learning_rate": 6.709956709956711e-06,
	"loss": 1.7655,
	"step": 993
	},
	{
	"epoch": 1.27,
	"grad_norm": 0.330078125,
	"learning_rate": 6.493506493506493e-06,
	"loss": 1.7003,
	"step": 994
	},
	{
	"epoch": 1.27,
	"grad_norm": 0.345703125,
	"learning_rate": 6.277056277056277e-06,
	"loss": 1.7531,
	"step": 995
	},
	{
	"epoch": 1.27,
	"grad_norm": 0.365234375,
	"learning_rate": 6.060606060606061e-06,
	"loss": 1.7475,
	"step": 996
	},
	{
	"epoch": 1.28,
	"grad_norm": 0.361328125,
	"learning_rate": 5.844155844155844e-06,
	"loss": 1.6811,
	"step": 997
	},
	{
	"epoch": 1.28,
	"grad_norm": 0.3203125,
	"learning_rate": 5.627705627705628e-06,
	"loss": 1.7256,
	"step": 998
	},
	{
	"epoch": 1.28,
	"grad_norm": 0.349609375,
	"learning_rate": 5.411255411255411e-06,
	"loss": 1.7238,
	"step": 999
	},
	{
	"epoch": 1.28,
	"grad_norm": 0.34375,
	"learning_rate": 5.194805194805195e-06,
	"loss": 1.6972,
	"step": 1000
	},
	{
	"epoch": 1.28,
	"eval_loss": 1.7247449159622192,
	"eval_runtime": 125.5655,
	"eval_samples_per_second": 39.82,
	"eval_steps_per_second": 1.25,
	"step": 1000
	},
	{
	"epoch": 1.28,
	"grad_norm": 0.359375,
	"learning_rate": 4.978354978354978e-06,
	"loss": 1.6889,
	"step": 1001
	},
	{
	"epoch": 1.28,
	"grad_norm": 0.330078125,
	"learning_rate": 4.7619047619047615e-06,
	"loss": 1.7495,
	"step": 1002
	},
	{
	"epoch": 1.28,
	"grad_norm": 0.37890625,
	"learning_rate": 4.5454545454545455e-06,
	"loss": 1.6499,
	"step": 1003
	},
	{
	"epoch": 1.28,
	"grad_norm": 0.353515625,
	"learning_rate": 4.329004329004329e-06,
	"loss": 1.6646,
	"step": 1004
	},
	{
	"epoch": 1.29,
	"grad_norm": 0.3671875,
	"learning_rate": 4.112554112554113e-06,
	"loss": 1.7523,
	"step": 1005
	},
	{
	"epoch": 1.29,
	"grad_norm": 0.333984375,
	"learning_rate": 3.896103896103896e-06,
	"loss": 1.7232,
	"step": 1006
	},
	{
	"epoch": 1.29,
	"grad_norm": 0.400390625,
	"learning_rate": 3.67965367965368e-06,
	"loss": 1.7209,
	"step": 1007
	},
	{
	"epoch": 1.29,
	"grad_norm": 0.333984375,
	"learning_rate": 3.4632034632034634e-06,
	"loss": 1.6627,
	"step": 1008
	},
	{
	"epoch": 1.29,
	"grad_norm": 0.3203125,
	"learning_rate": 3.2467532467532465e-06,
	"loss": 1.6678,
	"step": 1009
	},
	{
	"epoch": 1.29,
	"grad_norm": 0.333984375,
	"learning_rate": 3.0303030303030305e-06,
	"loss": 1.6757,
	"step": 1010
	},
	{
	"epoch": 1.29,
	"grad_norm": 0.33203125,
	"learning_rate": 2.813852813852814e-06,
	"loss": 1.7232,
	"step": 1011
	},
	{
	"epoch": 1.29,
	"grad_norm": 0.33203125,
	"learning_rate": 2.5974025974025976e-06,
	"loss": 1.6797,
	"step": 1012
	},
	{
	"epoch": 1.3,
	"grad_norm": 0.337890625,
	"learning_rate": 2.3809523809523808e-06,
	"loss": 1.825,
	"step": 1013
	},
	{
	"epoch": 1.3,
	"grad_norm": 0.337890625,
	"learning_rate": 2.1645021645021643e-06,
	"loss": 1.6841,
	"step": 1014
	},
	{
	"epoch": 1.3,
	"grad_norm": 0.328125,
	"learning_rate": 1.948051948051948e-06,
	"loss": 1.726,
	"step": 1015
	},
	{
	"epoch": 1.3,
	"grad_norm": 0.392578125,
	"learning_rate": 1.7316017316017317e-06,
	"loss": 1.7702,
	"step": 1016
	},
	{
	"epoch": 1.3,
	"grad_norm": 0.3359375,
	"learning_rate": 1.5151515151515152e-06,
	"loss": 1.7689,
	"step": 1017
	},
	{
	"epoch": 1.3,
	"grad_norm": 0.37109375,
	"learning_rate": 1.2987012987012988e-06,
	"loss": 1.7003,
	"step": 1018
	},
	{
	"epoch": 1.3,
	"grad_norm": 0.40234375,
	"learning_rate": 1.0822510822510822e-06,
	"loss": 1.6841,
	"step": 1019
	},
	{
	"epoch": 1.31,
	"grad_norm": 0.380859375,
	"learning_rate": 8.658008658008658e-07,
	"loss": 1.7318,
	"step": 1020
	},
	{
	"epoch": 1.31,
	"grad_norm": 0.345703125,
	"learning_rate": 6.493506493506494e-07,
	"loss": 1.7868,
	"step": 1021
	},
	{
	"epoch": 1.31,
	"grad_norm": 0.34765625,
	"learning_rate": 4.329004329004329e-07,
	"loss": 1.7199,
	"step": 1022
	},
	{
	"epoch": 1.31,
	"grad_norm": 0.359375,
	"learning_rate": 2.1645021645021646e-07,
	"loss": 1.7633,
	"step": 1023
	},
	{
	"epoch": 1.31,
	"grad_norm": 0.328125,
	"learning_rate": 0.0,
	"loss": 1.7878,
	"step": 1024
	},
	{
	"epoch": 1.31,
	"step": 1024,
	"total_flos": 7.46638513614422e+17,
	"train_loss": 1.8673716291086748,
	"train_runtime": 7714.204,
	"train_samples_per_second": 8.495,
	"train_steps_per_second": 0.133
	},
	{
	"epoch": 1.31,
	"eval_loss": 1.7247449159622192,
	"eval_runtime": 125.8509,
	"eval_samples_per_second": 39.73,
	"eval_steps_per_second": 1.248,
	"step": 1024
	}
	],
	"logging_steps": 1,
	"max_steps": 1024,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 2,
	"save_steps": 50,
	"total_flos": 7.46638513614422e+17,
	"train_batch_size": 32,
	"trial_name": null,
	"trial_params": null
	}