{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.9609375,
"eval_steps": 500,
"global_step": 684,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01171875,
"grad_norm": 0.9319692850112915,
"learning_rate": 1.1764705882352942e-07,
"loss": 0.7028,
"step": 1
},
{
"epoch": 0.0234375,
"grad_norm": 0.9757155179977417,
"learning_rate": 2.3529411764705883e-07,
"loss": 0.7416,
"step": 2
},
{
"epoch": 0.03515625,
"grad_norm": 1.0889487266540527,
"learning_rate": 3.529411764705883e-07,
"loss": 0.8392,
"step": 3
},
{
"epoch": 0.046875,
"grad_norm": 1.0020272731781006,
"learning_rate": 4.7058823529411767e-07,
"loss": 0.7549,
"step": 4
},
{
"epoch": 0.05859375,
"grad_norm": 1.0064201354980469,
"learning_rate": 5.882352941176471e-07,
"loss": 0.802,
"step": 5
},
{
"epoch": 0.0703125,
"grad_norm": 0.9806166291236877,
"learning_rate": 7.058823529411766e-07,
"loss": 0.7754,
"step": 6
},
{
"epoch": 0.08203125,
"grad_norm": 0.9506519436836243,
"learning_rate": 8.235294117647059e-07,
"loss": 0.7591,
"step": 7
},
{
"epoch": 0.09375,
"grad_norm": 0.9138185977935791,
"learning_rate": 9.411764705882353e-07,
"loss": 0.7681,
"step": 8
},
{
"epoch": 0.10546875,
"grad_norm": 0.872790515422821,
"learning_rate": 1.0588235294117648e-06,
"loss": 0.7332,
"step": 9
},
{
"epoch": 0.1171875,
"grad_norm": 0.8308555483818054,
"learning_rate": 1.1764705882352942e-06,
"loss": 0.7991,
"step": 10
},
{
"epoch": 0.12890625,
"grad_norm": 0.7250374555587769,
"learning_rate": 1.2941176470588237e-06,
"loss": 0.6812,
"step": 11
},
{
"epoch": 0.140625,
"grad_norm": 0.6780915856361389,
"learning_rate": 1.4117647058823531e-06,
"loss": 0.7178,
"step": 12
},
{
"epoch": 0.15234375,
"grad_norm": 0.6684752702713013,
"learning_rate": 1.5294117647058826e-06,
"loss": 0.7335,
"step": 13
},
{
"epoch": 0.1640625,
"grad_norm": 0.48017001152038574,
"learning_rate": 1.6470588235294118e-06,
"loss": 0.7293,
"step": 14
},
{
"epoch": 0.17578125,
"grad_norm": 0.4176006317138672,
"learning_rate": 1.7647058823529414e-06,
"loss": 0.6453,
"step": 15
},
{
"epoch": 0.1875,
"grad_norm": 0.4581579566001892,
"learning_rate": 1.8823529411764707e-06,
"loss": 0.709,
"step": 16
},
{
"epoch": 0.19921875,
"grad_norm": 0.41040804982185364,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.6965,
"step": 17
},
{
"epoch": 0.2109375,
"grad_norm": 0.38801896572113037,
"learning_rate": 2.1176470588235296e-06,
"loss": 0.7142,
"step": 18
},
{
"epoch": 0.22265625,
"grad_norm": 0.5587085485458374,
"learning_rate": 2.2352941176470592e-06,
"loss": 0.6624,
"step": 19
},
{
"epoch": 0.234375,
"grad_norm": 0.6821652054786682,
"learning_rate": 2.3529411764705885e-06,
"loss": 0.6708,
"step": 20
},
{
"epoch": 0.24609375,
"grad_norm": 0.7130780816078186,
"learning_rate": 2.470588235294118e-06,
"loss": 0.68,
"step": 21
},
{
"epoch": 0.2578125,
"grad_norm": 0.6799156069755554,
"learning_rate": 2.5882352941176473e-06,
"loss": 0.6069,
"step": 22
},
{
"epoch": 0.26953125,
"grad_norm": 0.6155521869659424,
"learning_rate": 2.7058823529411766e-06,
"loss": 0.6541,
"step": 23
},
{
"epoch": 0.28125,
"grad_norm": 0.5290796160697937,
"learning_rate": 2.8235294117647062e-06,
"loss": 0.6276,
"step": 24
},
{
"epoch": 0.29296875,
"grad_norm": 0.460702121257782,
"learning_rate": 2.9411764705882355e-06,
"loss": 0.6172,
"step": 25
},
{
"epoch": 0.3046875,
"grad_norm": 0.4080013334751129,
"learning_rate": 3.058823529411765e-06,
"loss": 0.609,
"step": 26
},
{
"epoch": 0.31640625,
"grad_norm": 0.32690924406051636,
"learning_rate": 3.1764705882352943e-06,
"loss": 0.585,
"step": 27
},
{
"epoch": 0.328125,
"grad_norm": 0.3010313808917999,
"learning_rate": 3.2941176470588236e-06,
"loss": 0.5684,
"step": 28
},
{
"epoch": 0.33984375,
"grad_norm": 0.30681392550468445,
"learning_rate": 3.4117647058823532e-06,
"loss": 0.6241,
"step": 29
},
{
"epoch": 0.3515625,
"grad_norm": 0.2821861505508423,
"learning_rate": 3.529411764705883e-06,
"loss": 0.5612,
"step": 30
},
{
"epoch": 0.36328125,
"grad_norm": 0.3114885985851288,
"learning_rate": 3.6470588235294117e-06,
"loss": 0.5777,
"step": 31
},
{
"epoch": 0.375,
"grad_norm": 0.2961554527282715,
"learning_rate": 3.7647058823529414e-06,
"loss": 0.6118,
"step": 32
},
{
"epoch": 0.38671875,
"grad_norm": 0.2794322371482849,
"learning_rate": 3.882352941176471e-06,
"loss": 0.5221,
"step": 33
},
{
"epoch": 0.3984375,
"grad_norm": 0.33694833517074585,
"learning_rate": 4.000000000000001e-06,
"loss": 0.5857,
"step": 34
},
{
"epoch": 0.41015625,
"grad_norm": 0.2691885530948639,
"learning_rate": 4.11764705882353e-06,
"loss": 0.6122,
"step": 35
},
{
"epoch": 0.421875,
"grad_norm": 0.2636789381504059,
"learning_rate": 4.235294117647059e-06,
"loss": 0.5355,
"step": 36
},
{
"epoch": 0.43359375,
"grad_norm": 0.25277385115623474,
"learning_rate": 4.352941176470588e-06,
"loss": 0.5373,
"step": 37
},
{
"epoch": 0.4453125,
"grad_norm": 0.2117689847946167,
"learning_rate": 4.4705882352941184e-06,
"loss": 0.5308,
"step": 38
},
{
"epoch": 0.45703125,
"grad_norm": 0.19506287574768066,
"learning_rate": 4.588235294117647e-06,
"loss": 0.5372,
"step": 39
},
{
"epoch": 0.46875,
"grad_norm": 0.21475745737552643,
"learning_rate": 4.705882352941177e-06,
"loss": 0.565,
"step": 40
},
{
"epoch": 0.48046875,
"grad_norm": 0.20359668135643005,
"learning_rate": 4.823529411764706e-06,
"loss": 0.5349,
"step": 41
},
{
"epoch": 0.4921875,
"grad_norm": 0.2028248906135559,
"learning_rate": 4.941176470588236e-06,
"loss": 0.5587,
"step": 42
},
{
"epoch": 0.50390625,
"grad_norm": 0.21315933763980865,
"learning_rate": 5.058823529411765e-06,
"loss": 0.542,
"step": 43
},
{
"epoch": 0.515625,
"grad_norm": 0.21315297484397888,
"learning_rate": 5.176470588235295e-06,
"loss": 0.5878,
"step": 44
},
{
"epoch": 0.52734375,
"grad_norm": 0.19848909974098206,
"learning_rate": 5.294117647058824e-06,
"loss": 0.5238,
"step": 45
},
{
"epoch": 0.5390625,
"grad_norm": 0.20311777293682098,
"learning_rate": 5.411764705882353e-06,
"loss": 0.548,
"step": 46
},
{
"epoch": 0.55078125,
"grad_norm": 0.18243664503097534,
"learning_rate": 5.529411764705883e-06,
"loss": 0.5282,
"step": 47
},
{
"epoch": 0.5625,
"grad_norm": 0.16974785923957825,
"learning_rate": 5.6470588235294125e-06,
"loss": 0.4836,
"step": 48
},
{
"epoch": 0.57421875,
"grad_norm": 0.1762179285287857,
"learning_rate": 5.764705882352941e-06,
"loss": 0.5304,
"step": 49
},
{
"epoch": 0.5859375,
"grad_norm": 0.20087581872940063,
"learning_rate": 5.882352941176471e-06,
"loss": 0.5317,
"step": 50
},
{
"epoch": 0.59765625,
"grad_norm": 0.1862439662218094,
"learning_rate": 6e-06,
"loss": 0.5536,
"step": 51
},
{
"epoch": 0.609375,
"grad_norm": 0.1928662210702896,
"learning_rate": 6.11764705882353e-06,
"loss": 0.4844,
"step": 52
},
{
"epoch": 0.62109375,
"grad_norm": 0.20256248116493225,
"learning_rate": 6.2352941176470595e-06,
"loss": 0.5428,
"step": 53
},
{
"epoch": 0.6328125,
"grad_norm": 0.2007371336221695,
"learning_rate": 6.352941176470589e-06,
"loss": 0.5225,
"step": 54
},
{
"epoch": 0.64453125,
"grad_norm": 0.16175827383995056,
"learning_rate": 6.470588235294119e-06,
"loss": 0.5137,
"step": 55
},
{
"epoch": 0.65625,
"grad_norm": 0.17586955428123474,
"learning_rate": 6.588235294117647e-06,
"loss": 0.5034,
"step": 56
},
{
"epoch": 0.66796875,
"grad_norm": 0.17159290611743927,
"learning_rate": 6.705882352941176e-06,
"loss": 0.5267,
"step": 57
},
{
"epoch": 0.6796875,
"grad_norm": 0.17129066586494446,
"learning_rate": 6.8235294117647065e-06,
"loss": 0.4634,
"step": 58
},
{
"epoch": 0.69140625,
"grad_norm": 0.14943340420722961,
"learning_rate": 6.941176470588236e-06,
"loss": 0.4749,
"step": 59
},
{
"epoch": 0.703125,
"grad_norm": 0.17984403669834137,
"learning_rate": 7.058823529411766e-06,
"loss": 0.5254,
"step": 60
},
{
"epoch": 0.71484375,
"grad_norm": 0.15619614720344543,
"learning_rate": 7.176470588235295e-06,
"loss": 0.4941,
"step": 61
},
{
"epoch": 0.7265625,
"grad_norm": 0.14417926967144012,
"learning_rate": 7.294117647058823e-06,
"loss": 0.4234,
"step": 62
},
{
"epoch": 0.73828125,
"grad_norm": 0.16936203837394714,
"learning_rate": 7.4117647058823535e-06,
"loss": 0.5016,
"step": 63
},
{
"epoch": 0.75,
"grad_norm": 0.17044682800769806,
"learning_rate": 7.529411764705883e-06,
"loss": 0.4817,
"step": 64
},
{
"epoch": 0.76171875,
"grad_norm": 0.1539342850446701,
"learning_rate": 7.647058823529411e-06,
"loss": 0.4804,
"step": 65
},
{
"epoch": 0.7734375,
"grad_norm": 0.15344035625457764,
"learning_rate": 7.764705882352941e-06,
"loss": 0.4875,
"step": 66
},
{
"epoch": 0.78515625,
"grad_norm": 0.15734320878982544,
"learning_rate": 7.882352941176471e-06,
"loss": 0.436,
"step": 67
},
{
"epoch": 0.796875,
"grad_norm": 0.17874149978160858,
"learning_rate": 8.000000000000001e-06,
"loss": 0.5315,
"step": 68
},
{
"epoch": 0.80859375,
"grad_norm": 0.18438568711280823,
"learning_rate": 8.11764705882353e-06,
"loss": 0.5206,
"step": 69
},
{
"epoch": 0.8203125,
"grad_norm": 0.16773667931556702,
"learning_rate": 8.23529411764706e-06,
"loss": 0.4604,
"step": 70
},
{
"epoch": 0.83203125,
"grad_norm": 0.146653413772583,
"learning_rate": 8.35294117647059e-06,
"loss": 0.4445,
"step": 71
},
{
"epoch": 0.84375,
"grad_norm": 0.1633101850748062,
"learning_rate": 8.470588235294118e-06,
"loss": 0.4741,
"step": 72
},
{
"epoch": 0.85546875,
"grad_norm": 0.1426013708114624,
"learning_rate": 8.588235294117647e-06,
"loss": 0.4447,
"step": 73
},
{
"epoch": 0.8671875,
"grad_norm": 0.19708958268165588,
"learning_rate": 8.705882352941177e-06,
"loss": 0.4915,
"step": 74
},
{
"epoch": 0.87890625,
"grad_norm": 0.13479125499725342,
"learning_rate": 8.823529411764707e-06,
"loss": 0.4302,
"step": 75
},
{
"epoch": 0.890625,
"grad_norm": 0.16112269461154938,
"learning_rate": 8.941176470588237e-06,
"loss": 0.4457,
"step": 76
},
{
"epoch": 0.90234375,
"grad_norm": 0.1683078110218048,
"learning_rate": 9.058823529411765e-06,
"loss": 0.4617,
"step": 77
},
{
"epoch": 0.9140625,
"grad_norm": 0.17289473116397858,
"learning_rate": 9.176470588235294e-06,
"loss": 0.4885,
"step": 78
},
{
"epoch": 0.92578125,
"grad_norm": 0.14186030626296997,
"learning_rate": 9.294117647058824e-06,
"loss": 0.4063,
"step": 79
},
{
"epoch": 0.9375,
"grad_norm": 0.17006702721118927,
"learning_rate": 9.411764705882354e-06,
"loss": 0.4684,
"step": 80
},
{
"epoch": 0.94921875,
"grad_norm": 0.16714246571063995,
"learning_rate": 9.529411764705882e-06,
"loss": 0.4416,
"step": 81
},
{
"epoch": 0.9609375,
"grad_norm": 0.1373886913061142,
"learning_rate": 9.647058823529412e-06,
"loss": 0.4677,
"step": 82
},
{
"epoch": 0.97265625,
"grad_norm": 0.15918415784835815,
"learning_rate": 9.764705882352942e-06,
"loss": 0.4479,
"step": 83
},
{
"epoch": 0.984375,
"grad_norm": 0.12358763813972473,
"learning_rate": 9.882352941176472e-06,
"loss": 0.4033,
"step": 84
},
{
"epoch": 0.99609375,
"grad_norm": 0.19624371826648712,
"learning_rate": 1e-05,
"loss": 0.4579,
"step": 85
},
{
"epoch": 1.0,
"grad_norm": 0.19624371826648712,
"learning_rate": 9.99995783847866e-06,
"loss": 0.4138,
"step": 86
},
{
"epoch": 1.01171875,
"grad_norm": 0.2939195930957794,
"learning_rate": 9.999831354625678e-06,
"loss": 0.3971,
"step": 87
},
{
"epoch": 1.0234375,
"grad_norm": 0.14151132106781006,
"learning_rate": 9.999620550574155e-06,
"loss": 0.4168,
"step": 88
},
{
"epoch": 1.03515625,
"grad_norm": 0.174483984708786,
"learning_rate": 9.999325429879215e-06,
"loss": 0.3952,
"step": 89
},
{
"epoch": 1.046875,
"grad_norm": 0.16392698884010315,
"learning_rate": 9.998945997517957e-06,
"loss": 0.4031,
"step": 90
},
{
"epoch": 1.05859375,
"grad_norm": 0.13462132215499878,
"learning_rate": 9.99848225988936e-06,
"loss": 0.3974,
"step": 91
},
{
"epoch": 1.0703125,
"grad_norm": 0.20017389953136444,
"learning_rate": 9.997934224814173e-06,
"loss": 0.4054,
"step": 92
},
{
"epoch": 1.08203125,
"grad_norm": 0.15260270237922668,
"learning_rate": 9.997301901534797e-06,
"loss": 0.43,
"step": 93
},
{
"epoch": 1.09375,
"grad_norm": 0.15193504095077515,
"learning_rate": 9.996585300715117e-06,
"loss": 0.3885,
"step": 94
},
{
"epoch": 1.10546875,
"grad_norm": 0.16993655264377594,
"learning_rate": 9.99578443444032e-06,
"loss": 0.4191,
"step": 95
},
{
"epoch": 1.1171875,
"grad_norm": 0.15632706880569458,
"learning_rate": 9.994899316216709e-06,
"loss": 0.3439,
"step": 96
},
{
"epoch": 1.12890625,
"grad_norm": 0.1436368227005005,
"learning_rate": 9.99392996097145e-06,
"loss": 0.3609,
"step": 97
},
{
"epoch": 1.140625,
"grad_norm": 0.14202651381492615,
"learning_rate": 9.992876385052346e-06,
"loss": 0.4133,
"step": 98
},
{
"epoch": 1.15234375,
"grad_norm": 0.17068006098270416,
"learning_rate": 9.991738606227537e-06,
"loss": 0.3775,
"step": 99
},
{
"epoch": 1.1640625,
"grad_norm": 0.15222977101802826,
"learning_rate": 9.990516643685222e-06,
"loss": 0.4191,
"step": 100
},
{
"epoch": 1.17578125,
"grad_norm": 0.1581757664680481,
"learning_rate": 9.989210518033316e-06,
"loss": 0.4098,
"step": 101
},
{
"epoch": 1.1875,
"grad_norm": 0.14198894798755646,
"learning_rate": 9.987820251299121e-06,
"loss": 0.4031,
"step": 102
},
{
"epoch": 1.19921875,
"grad_norm": 0.1587005853652954,
"learning_rate": 9.98634586692894e-06,
"loss": 0.3632,
"step": 103
},
{
"epoch": 1.2109375,
"grad_norm": 0.1380324512720108,
"learning_rate": 9.984787389787689e-06,
"loss": 0.4229,
"step": 104
},
{
"epoch": 1.22265625,
"grad_norm": 0.1664944589138031,
"learning_rate": 9.983144846158472e-06,
"loss": 0.3952,
"step": 105
},
{
"epoch": 1.234375,
"grad_norm": 0.15593458712100983,
"learning_rate": 9.981418263742148e-06,
"loss": 0.3657,
"step": 106
},
{
"epoch": 1.24609375,
"grad_norm": 0.14692318439483643,
"learning_rate": 9.979607671656852e-06,
"loss": 0.3777,
"step": 107
},
{
"epoch": 1.2578125,
"grad_norm": 0.19152411818504333,
"learning_rate": 9.97771310043751e-06,
"loss": 0.378,
"step": 108
},
{
"epoch": 1.26953125,
"grad_norm": 0.1495964378118515,
"learning_rate": 9.975734582035323e-06,
"loss": 0.3662,
"step": 109
},
{
"epoch": 1.28125,
"grad_norm": 0.14938651025295258,
"learning_rate": 9.973672149817232e-06,
"loss": 0.4197,
"step": 110
},
{
"epoch": 1.29296875,
"grad_norm": 0.15581446886062622,
"learning_rate": 9.971525838565348e-06,
"loss": 0.3804,
"step": 111
},
{
"epoch": 1.3046875,
"grad_norm": 0.14621272683143616,
"learning_rate": 9.96929568447637e-06,
"loss": 0.3372,
"step": 112
},
{
"epoch": 1.31640625,
"grad_norm": 0.14326448738574982,
"learning_rate": 9.966981725160972e-06,
"loss": 0.4092,
"step": 113
},
{
"epoch": 1.328125,
"grad_norm": 0.1629864126443863,
"learning_rate": 9.964583999643174e-06,
"loss": 0.3829,
"step": 114
},
{
"epoch": 1.33984375,
"grad_norm": 0.16448885202407837,
"learning_rate": 9.96210254835968e-06,
"loss": 0.3952,
"step": 115
},
{
"epoch": 1.3515625,
"grad_norm": 0.13906875252723694,
"learning_rate": 9.95953741315919e-06,
"loss": 0.3501,
"step": 116
},
{
"epoch": 1.36328125,
"grad_norm": 0.1344955563545227,
"learning_rate": 9.95688863730171e-06,
"loss": 0.4212,
"step": 117
},
{
"epoch": 1.375,
"grad_norm": 0.2054166942834854,
"learning_rate": 9.954156265457801e-06,
"loss": 0.4155,
"step": 118
},
{
"epoch": 1.38671875,
"grad_norm": 0.14791074395179749,
"learning_rate": 9.951340343707852e-06,
"loss": 0.4104,
"step": 119
},
{
"epoch": 1.3984375,
"grad_norm": 0.17642416059970856,
"learning_rate": 9.948440919541277e-06,
"loss": 0.3502,
"step": 120
},
{
"epoch": 1.41015625,
"grad_norm": 0.14525847136974335,
"learning_rate": 9.945458041855732e-06,
"loss": 0.383,
"step": 121
},
{
"epoch": 1.421875,
"grad_norm": 0.15956953167915344,
"learning_rate": 9.942391760956277e-06,
"loss": 0.3864,
"step": 122
},
{
"epoch": 1.43359375,
"grad_norm": 0.16518352925777435,
"learning_rate": 9.939242128554542e-06,
"loss": 0.4374,
"step": 123
},
{
"epoch": 1.4453125,
"grad_norm": 0.15952655673027039,
"learning_rate": 9.936009197767847e-06,
"loss": 0.3719,
"step": 124
},
{
"epoch": 1.45703125,
"grad_norm": 0.13431760668754578,
"learning_rate": 9.932693023118299e-06,
"loss": 0.3693,
"step": 125
},
{
"epoch": 1.46875,
"grad_norm": 0.15348534286022186,
"learning_rate": 9.929293660531889e-06,
"loss": 0.3708,
"step": 126
},
{
"epoch": 1.48046875,
"grad_norm": 0.1592601090669632,
"learning_rate": 9.925811167337533e-06,
"loss": 0.3598,
"step": 127
},
{
"epoch": 1.4921875,
"grad_norm": 0.16625793278217316,
"learning_rate": 9.922245602266119e-06,
"loss": 0.3548,
"step": 128
},
{
"epoch": 1.50390625,
"grad_norm": 0.16050715744495392,
"learning_rate": 9.918597025449505e-06,
"loss": 0.3732,
"step": 129
},
{
"epoch": 1.515625,
"grad_norm": 0.1567108929157257,
"learning_rate": 9.91486549841951e-06,
"loss": 0.3875,
"step": 130
},
{
"epoch": 1.52734375,
"grad_norm": 0.13900204002857208,
"learning_rate": 9.911051084106877e-06,
"loss": 0.3134,
"step": 131
},
{
"epoch": 1.5390625,
"grad_norm": 0.17397968471050262,
"learning_rate": 9.90715384684021e-06,
"loss": 0.3283,
"step": 132
},
{
"epoch": 1.55078125,
"grad_norm": 0.13944688439369202,
"learning_rate": 9.903173852344889e-06,
"loss": 0.3801,
"step": 133
},
{
"epoch": 1.5625,
"grad_norm": 0.1763840615749359,
"learning_rate": 9.899111167741966e-06,
"loss": 0.3426,
"step": 134
},
{
"epoch": 1.57421875,
"grad_norm": 0.15428921580314636,
"learning_rate": 9.894965861547023e-06,
"loss": 0.4046,
"step": 135
},
{
"epoch": 1.5859375,
"grad_norm": 0.18792587518692017,
"learning_rate": 9.890738003669029e-06,
"loss": 0.3948,
"step": 136
},
{
"epoch": 1.59765625,
"grad_norm": 0.14246441423892975,
"learning_rate": 9.88642766540915e-06,
"loss": 0.343,
"step": 137
},
{
"epoch": 1.609375,
"grad_norm": 0.14254647493362427,
"learning_rate": 9.882034919459556e-06,
"loss": 0.3908,
"step": 138
},
{
"epoch": 1.62109375,
"grad_norm": 0.15610483288764954,
"learning_rate": 9.877559839902185e-06,
"loss": 0.4127,
"step": 139
},
{
"epoch": 1.6328125,
"grad_norm": 0.14824527502059937,
"learning_rate": 9.873002502207502e-06,
"loss": 0.3469,
"step": 140
},
{
"epoch": 1.64453125,
"grad_norm": 0.13025063276290894,
"learning_rate": 9.868362983233226e-06,
"loss": 0.3467,
"step": 141
},
{
"epoch": 1.65625,
"grad_norm": 0.12863893806934357,
"learning_rate": 9.863641361223025e-06,
"loss": 0.3441,
"step": 142
},
{
"epoch": 1.66796875,
"grad_norm": 0.1754492223262787,
"learning_rate": 9.858837715805207e-06,
"loss": 0.3893,
"step": 143
},
{
"epoch": 1.6796875,
"grad_norm": 0.14538809657096863,
"learning_rate": 9.853952127991374e-06,
"loss": 0.361,
"step": 144
},
{
"epoch": 1.69140625,
"grad_norm": 0.13839296996593475,
"learning_rate": 9.848984680175049e-06,
"loss": 0.4422,
"step": 145
},
{
"epoch": 1.703125,
"grad_norm": 0.16836000978946686,
"learning_rate": 9.843935456130295e-06,
"loss": 0.3741,
"step": 146
},
{
"epoch": 1.71484375,
"grad_norm": 0.1621960699558258,
"learning_rate": 9.8388045410103e-06,
"loss": 0.4299,
"step": 147
},
{
"epoch": 1.7265625,
"grad_norm": 0.15016399323940277,
"learning_rate": 9.833592021345938e-06,
"loss": 0.3613,
"step": 148
},
{
"epoch": 1.73828125,
"grad_norm": 0.1778838336467743,
"learning_rate": 9.828297985044314e-06,
"loss": 0.4127,
"step": 149
},
{
"epoch": 1.75,
"grad_norm": 0.20492997765541077,
"learning_rate": 9.822922521387277e-06,
"loss": 0.4566,
"step": 150
},
{
"epoch": 1.76171875,
"grad_norm": 0.142822265625,
"learning_rate": 9.817465721029916e-06,
"loss": 0.3561,
"step": 151
},
{
"epoch": 1.7734375,
"grad_norm": 0.15397801995277405,
"learning_rate": 9.811927675999035e-06,
"loss": 0.3854,
"step": 152
},
{
"epoch": 1.78515625,
"grad_norm": 0.13618412613868713,
"learning_rate": 9.806308479691595e-06,
"loss": 0.3933,
"step": 153
},
{
"epoch": 1.796875,
"grad_norm": 0.19063451886177063,
"learning_rate": 9.800608226873143e-06,
"loss": 0.382,
"step": 154
},
{
"epoch": 1.80859375,
"grad_norm": 0.16610917448997498,
"learning_rate": 9.794827013676206e-06,
"loss": 0.402,
"step": 155
},
{
"epoch": 1.8203125,
"grad_norm": 0.14735649526119232,
"learning_rate": 9.788964937598688e-06,
"loss": 0.3646,
"step": 156
},
{
"epoch": 1.83203125,
"grad_norm": 0.1583123356103897,
"learning_rate": 9.783022097502204e-06,
"loss": 0.3519,
"step": 157
},
{
"epoch": 1.84375,
"grad_norm": 0.14677587151527405,
"learning_rate": 9.776998593610428e-06,
"loss": 0.3739,
"step": 158
},
{
"epoch": 1.85546875,
"grad_norm": 0.15498070418834686,
"learning_rate": 9.770894527507393e-06,
"loss": 0.344,
"step": 159
},
{
"epoch": 1.8671875,
"grad_norm": 0.164178267121315,
"learning_rate": 9.764710002135784e-06,
"loss": 0.3701,
"step": 160
},
{
"epoch": 1.87890625,
"grad_norm": 0.12638989090919495,
"learning_rate": 9.7584451217952e-06,
"loss": 0.3985,
"step": 161
},
{
"epoch": 1.890625,
"grad_norm": 0.17551939189434052,
"learning_rate": 9.752099992140401e-06,
"loss": 0.367,
"step": 162
},
{
"epoch": 1.90234375,
"grad_norm": 0.13494940102100372,
"learning_rate": 9.745674720179507e-06,
"loss": 0.345,
"step": 163
},
{
"epoch": 1.9140625,
"grad_norm": 0.15139806270599365,
"learning_rate": 9.739169414272219e-06,
"loss": 0.3767,
"step": 164
},
{
"epoch": 1.92578125,
"grad_norm": 0.16584157943725586,
"learning_rate": 9.732584184127973e-06,
"loss": 0.4002,
"step": 165
},
{
"epoch": 1.9375,
"grad_norm": 0.14180031418800354,
"learning_rate": 9.7259191408041e-06,
"loss": 0.3494,
"step": 166
},
{
"epoch": 1.94921875,
"grad_norm": 0.15869130194187164,
"learning_rate": 9.719174396703941e-06,
"loss": 0.3527,
"step": 167
},
{
"epoch": 1.9609375,
"grad_norm": 0.15733729302883148,
"learning_rate": 9.71235006557497e-06,
"loss": 0.4277,
"step": 168
},
{
"epoch": 1.97265625,
"grad_norm": 0.16619561612606049,
"learning_rate": 9.705446262506858e-06,
"loss": 0.398,
"step": 169
},
{
"epoch": 1.984375,
"grad_norm": 0.156788632273674,
"learning_rate": 9.698463103929542e-06,
"loss": 0.4272,
"step": 170
},
{
"epoch": 1.99609375,
"grad_norm": 0.17090734839439392,
"learning_rate": 9.691400707611258e-06,
"loss": 0.3683,
"step": 171
},
{
"epoch": 2.0,
"grad_norm": 0.17090734839439392,
"learning_rate": 9.684259192656554e-06,
"loss": 0.3783,
"step": 172
},
{
"epoch": 2.01171875,
"grad_norm": 0.269039511680603,
"learning_rate": 9.677038679504285e-06,
"loss": 0.3033,
"step": 173
},
{
"epoch": 2.0234375,
"grad_norm": 0.18037453293800354,
"learning_rate": 9.669739289925578e-06,
"loss": 0.2647,
"step": 174
},
{
"epoch": 2.03515625,
"grad_norm": 0.16502933204174042,
"learning_rate": 9.66236114702178e-06,
"loss": 0.2689,
"step": 175
},
{
"epoch": 2.046875,
"grad_norm": 0.13461141288280487,
"learning_rate": 9.654904375222384e-06,
"loss": 0.299,
"step": 176
},
{
"epoch": 2.05859375,
"grad_norm": 0.15441769361495972,
"learning_rate": 9.647369100282928e-06,
"loss": 0.2987,
"step": 177
},
{
"epoch": 2.0703125,
"grad_norm": 0.22800779342651367,
"learning_rate": 9.639755449282874e-06,
"loss": 0.2801,
"step": 178
},
{
"epoch": 2.08203125,
"grad_norm": 0.1886795461177826,
"learning_rate": 9.632063550623465e-06,
"loss": 0.264,
"step": 179
},
{
"epoch": 2.09375,
"grad_norm": 0.16359379887580872,
"learning_rate": 9.62429353402556e-06,
"loss": 0.278,
"step": 180
},
{
"epoch": 2.10546875,
"grad_norm": 0.1788463145494461,
"learning_rate": 9.616445530527448e-06,
"loss": 0.2813,
"step": 181
},
{
"epoch": 2.1171875,
"grad_norm": 0.12653109431266785,
"learning_rate": 9.608519672482635e-06,
"loss": 0.2997,
"step": 182
},
{
"epoch": 2.12890625,
"grad_norm": 0.20340153574943542,
"learning_rate": 9.600516093557618e-06,
"loss": 0.3049,
"step": 183
},
{
"epoch": 2.140625,
"grad_norm": 0.1629520207643509,
"learning_rate": 9.592434928729617e-06,
"loss": 0.2765,
"step": 184
},
{
"epoch": 2.15234375,
"grad_norm": 0.17957361042499542,
"learning_rate": 9.584276314284316e-06,
"loss": 0.281,
"step": 185
},
{
"epoch": 2.1640625,
"grad_norm": 0.13229969143867493,
"learning_rate": 9.576040387813553e-06,
"loss": 0.2668,
"step": 186
},
{
"epoch": 2.17578125,
"grad_norm": 0.19403038918972015,
"learning_rate": 9.567727288213005e-06,
"loss": 0.2529,
"step": 187
},
{
"epoch": 2.1875,
"grad_norm": 0.1554100513458252,
"learning_rate": 9.559337155679843e-06,
"loss": 0.2756,
"step": 188
},
{
"epoch": 2.19921875,
"grad_norm": 0.1644996702671051,
"learning_rate": 9.550870131710366e-06,
"loss": 0.2765,
"step": 189
},
{
"epoch": 2.2109375,
"grad_norm": 0.16561852395534515,
"learning_rate": 9.542326359097619e-06,
"loss": 0.2834,
"step": 190
},
{
"epoch": 2.22265625,
"grad_norm": 0.15680573880672455,
"learning_rate": 9.533705981928984e-06,
"loss": 0.2611,
"step": 191
},
{
"epoch": 2.234375,
"grad_norm": 0.14902271330356598,
"learning_rate": 9.525009145583746e-06,
"loss": 0.2759,
"step": 192
},
{
"epoch": 2.24609375,
"grad_norm": 0.1453092247247696,
"learning_rate": 9.516235996730645e-06,
"loss": 0.2453,
"step": 193
},
{
"epoch": 2.2578125,
"grad_norm": 0.15511390566825867,
"learning_rate": 9.507386683325404e-06,
"loss": 0.2878,
"step": 194
},
{
"epoch": 2.26953125,
"grad_norm": 0.146243616938591,
"learning_rate": 9.498461354608228e-06,
"loss": 0.2813,
"step": 195
},
{
"epoch": 2.28125,
"grad_norm": 0.13351713120937347,
"learning_rate": 9.489460161101291e-06,
"loss": 0.2464,
"step": 196
},
{
"epoch": 2.29296875,
"grad_norm": 0.12981724739074707,
"learning_rate": 9.4803832546062e-06,
"loss": 0.2364,
"step": 197
},
{
"epoch": 2.3046875,
"grad_norm": 0.1395965814590454,
"learning_rate": 9.471230788201429e-06,
"loss": 0.2699,
"step": 198
},
{
"epoch": 2.31640625,
"grad_norm": 0.13987243175506592,
"learning_rate": 9.46200291623974e-06,
"loss": 0.2634,
"step": 199
},
{
"epoch": 2.328125,
"grad_norm": 0.17181117832660675,
"learning_rate": 9.452699794345583e-06,
"loss": 0.2501,
"step": 200
},
{
"epoch": 2.33984375,
"grad_norm": 0.12983955442905426,
"learning_rate": 9.443321579412465e-06,
"loss": 0.2383,
"step": 201
},
{
"epoch": 2.3515625,
"grad_norm": 0.17716065049171448,
"learning_rate": 9.43386842960031e-06,
"loss": 0.3096,
"step": 202
},
{
"epoch": 2.36328125,
"grad_norm": 0.16192656755447388,
"learning_rate": 9.42434050433279e-06,
"loss": 0.2825,
"step": 203
},
{
"epoch": 2.375,
"grad_norm": 0.1350512057542801,
"learning_rate": 9.414737964294636e-06,
"loss": 0.2301,
"step": 204
},
{
"epoch": 2.38671875,
"grad_norm": 0.1251179575920105,
"learning_rate": 9.405060971428924e-06,
"loss": 0.2682,
"step": 205
},
{
"epoch": 2.3984375,
"grad_norm": 0.13068172335624695,
"learning_rate": 9.39530968893435e-06,
"loss": 0.3303,
"step": 206
},
{
"epoch": 2.41015625,
"grad_norm": 0.16019192337989807,
"learning_rate": 9.38548428126248e-06,
"loss": 0.2651,
"step": 207
},
{
"epoch": 2.421875,
"grad_norm": 0.14367878437042236,
"learning_rate": 9.375584914114963e-06,
"loss": 0.2712,
"step": 208
},
{
"epoch": 2.43359375,
"grad_norm": 0.14105841517448425,
"learning_rate": 9.365611754440756e-06,
"loss": 0.2541,
"step": 209
},
{
"epoch": 2.4453125,
"grad_norm": 0.14219817519187927,
"learning_rate": 9.355564970433288e-06,
"loss": 0.2874,
"step": 210
},
{
"epoch": 2.45703125,
"grad_norm": 0.14144977927207947,
"learning_rate": 9.345444731527642e-06,
"loss": 0.255,
"step": 211
},
{
"epoch": 2.46875,
"grad_norm": 0.16394628584384918,
"learning_rate": 9.335251208397684e-06,
"loss": 0.2946,
"step": 212
},
{
"epoch": 2.48046875,
"grad_norm": 0.1422003209590912,
"learning_rate": 9.32498457295319e-06,
"loss": 0.275,
"step": 213
},
{
"epoch": 2.4921875,
"grad_norm": 0.153361514210701,
"learning_rate": 9.31464499833695e-06,
"loss": 0.2695,
"step": 214
},
{
"epoch": 2.50390625,
"grad_norm": 0.13666994869709015,
"learning_rate": 9.30423265892184e-06,
"loss": 0.2708,
"step": 215
},
{
"epoch": 2.515625,
"grad_norm": 0.13980181515216827,
"learning_rate": 9.29374773030789e-06,
"loss": 0.3132,
"step": 216
},
{
"epoch": 2.52734375,
"grad_norm": 0.1572592854499817,
"learning_rate": 9.283190389319315e-06,
"loss": 0.2802,
"step": 217
},
{
"epoch": 2.5390625,
"grad_norm": 0.12849898636341095,
"learning_rate": 9.27256081400154e-06,
"loss": 0.2813,
"step": 218
},
{
"epoch": 2.55078125,
"grad_norm": 0.14952799677848816,
"learning_rate": 9.26185918361819e-06,
"loss": 0.2662,
"step": 219
},
{
"epoch": 2.5625,
"grad_norm": 0.1414070427417755,
"learning_rate": 9.251085678648072e-06,
"loss": 0.2833,
"step": 220
},
{
"epoch": 2.57421875,
"grad_norm": 0.14631874859333038,
"learning_rate": 9.24024048078213e-06,
"loss": 0.2645,
"step": 221
},
{
"epoch": 2.5859375,
"grad_norm": 0.16351047158241272,
"learning_rate": 9.229323772920383e-06,
"loss": 0.2563,
"step": 222
},
{
"epoch": 2.59765625,
"grad_norm": 0.1420244574546814,
"learning_rate": 9.218335739168833e-06,
"loss": 0.3132,
"step": 223
},
{
"epoch": 2.609375,
"grad_norm": 0.16125503182411194,
"learning_rate": 9.207276564836367e-06,
"loss": 0.2533,
"step": 224
},
{
"epoch": 2.62109375,
"grad_norm": 0.16347342729568481,
"learning_rate": 9.196146436431635e-06,
"loss": 0.2884,
"step": 225
},
{
"epoch": 2.6328125,
"grad_norm": 0.15693005919456482,
"learning_rate": 9.18494554165989e-06,
"loss": 0.2644,
"step": 226
},
{
"epoch": 2.64453125,
"grad_norm": 0.15239854156970978,
"learning_rate": 9.173674069419843e-06,
"loss": 0.259,
"step": 227
},
{
"epoch": 2.65625,
"grad_norm": 0.14987824857234955,
"learning_rate": 9.162332209800455e-06,
"loss": 0.2792,
"step": 228
},
{
"epoch": 2.66796875,
"grad_norm": 0.14547227323055267,
"learning_rate": 9.150920154077753e-06,
"loss": 0.2663,
"step": 229
},
{
"epoch": 2.6796875,
"grad_norm": 0.15145206451416016,
"learning_rate": 9.13943809471159e-06,
"loss": 0.2885,
"step": 230
},
{
"epoch": 2.69140625,
"grad_norm": 0.17260988056659698,
"learning_rate": 9.1278862253424e-06,
"loss": 0.3377,
"step": 231
},
{
"epoch": 2.703125,
"grad_norm": 0.1483244150876999,
"learning_rate": 9.116264740787937e-06,
"loss": 0.2742,
"step": 232
},
{
"epoch": 2.71484375,
"grad_norm": 0.13184918463230133,
"learning_rate": 9.104573837039992e-06,
"loss": 0.2746,
"step": 233
},
{
"epoch": 2.7265625,
"grad_norm": 0.14674793183803558,
"learning_rate": 9.092813711261075e-06,
"loss": 0.2372,
"step": 234
},
{
"epoch": 2.73828125,
"grad_norm": 0.13162650167942047,
"learning_rate": 9.08098456178111e-06,
"loss": 0.287,
"step": 235
},
{
"epoch": 2.75,
"grad_norm": 0.14429397881031036,
"learning_rate": 9.069086588094067e-06,
"loss": 0.2561,
"step": 236
},
{
"epoch": 2.76171875,
"grad_norm": 0.1422855108976364,
"learning_rate": 9.057119990854617e-06,
"loss": 0.3037,
"step": 237
},
{
"epoch": 2.7734375,
"grad_norm": 0.15853242576122284,
"learning_rate": 9.045084971874738e-06,
"loss": 0.2888,
"step": 238
},
{
"epoch": 2.78515625,
"grad_norm": 0.12172834575176239,
"learning_rate": 9.032981734120312e-06,
"loss": 0.2477,
"step": 239
},
{
"epoch": 2.796875,
"grad_norm": 0.1423560231924057,
"learning_rate": 9.020810481707709e-06,
"loss": 0.2867,
"step": 240
},
{
"epoch": 2.80859375,
"grad_norm": 0.1702727973461151,
"learning_rate": 9.008571419900334e-06,
"loss": 0.3015,
"step": 241
},
{
"epoch": 2.8203125,
"grad_norm": 0.14154493808746338,
"learning_rate": 8.996264755105173e-06,
"loss": 0.2771,
"step": 242
},
{
"epoch": 2.83203125,
"grad_norm": 0.14596149325370789,
"learning_rate": 8.983890694869312e-06,
"loss": 0.3017,
"step": 243
},
{
"epoch": 2.84375,
"grad_norm": 0.13621939718723297,
"learning_rate": 8.97144944787643e-06,
"loss": 0.2496,
"step": 244
},
{
"epoch": 2.85546875,
"grad_norm": 0.18428286910057068,
"learning_rate": 8.958941223943292e-06,
"loss": 0.2751,
"step": 245
},
{
"epoch": 2.8671875,
"grad_norm": 0.1273455023765564,
"learning_rate": 8.946366234016192e-06,
"loss": 0.2678,
"step": 246
},
{
"epoch": 2.87890625,
"grad_norm": 0.1442483514547348,
"learning_rate": 8.933724690167417e-06,
"loss": 0.2574,
"step": 247
},
{
"epoch": 2.890625,
"grad_norm": 0.12419985979795456,
"learning_rate": 8.921016805591654e-06,
"loss": 0.2512,
"step": 248
},
{
"epoch": 2.90234375,
"grad_norm": 0.13012221455574036,
"learning_rate": 8.908242794602401e-06,
"loss": 0.2674,
"step": 249
},
{
"epoch": 2.9140625,
"grad_norm": 0.13028892874717712,
"learning_rate": 8.895402872628352e-06,
"loss": 0.2566,
"step": 250
},
{
"epoch": 2.92578125,
"grad_norm": 0.1266089677810669,
"learning_rate": 8.882497256209767e-06,
"loss": 0.2671,
"step": 251
},
{
"epoch": 2.9375,
"grad_norm": 0.15992388129234314,
"learning_rate": 8.869526162994814e-06,
"loss": 0.276,
"step": 252
},
{
"epoch": 2.94921875,
"grad_norm": 0.13113290071487427,
"learning_rate": 8.856489811735904e-06,
"loss": 0.3073,
"step": 253
},
{
"epoch": 2.9609375,
"grad_norm": 0.15523600578308105,
"learning_rate": 8.843388422285995e-06,
"loss": 0.2704,
"step": 254
},
{
"epoch": 2.97265625,
"grad_norm": 0.13755646347999573,
"learning_rate": 8.83022221559489e-06,
"loss": 0.3174,
"step": 255
},
{
"epoch": 2.984375,
"grad_norm": 0.15444748103618622,
"learning_rate": 8.816991413705515e-06,
"loss": 0.2955,
"step": 256
},
{
"epoch": 2.99609375,
"grad_norm": 0.13223521411418915,
"learning_rate": 8.803696239750163e-06,
"loss": 0.2563,
"step": 257
},
{
"epoch": 3.0,
"grad_norm": 0.28410595655441284,
"learning_rate": 8.790336917946737e-06,
"loss": 0.242,
"step": 258
},
{
"epoch": 3.01171875,
"grad_norm": 0.16228830814361572,
"learning_rate": 8.776913673594968e-06,
"loss": 0.1626,
"step": 259
},
{
"epoch": 3.0234375,
"grad_norm": 0.14956367015838623,
"learning_rate": 8.763426733072624e-06,
"loss": 0.1751,
"step": 260
},
{
"epoch": 3.03515625,
"grad_norm": 0.15641574561595917,
"learning_rate": 8.74987632383167e-06,
"loss": 0.1932,
"step": 261
},
{
"epoch": 3.046875,
"grad_norm": 0.18344224989414215,
"learning_rate": 8.736262674394455e-06,
"loss": 0.1646,
"step": 262
},
{
"epoch": 3.05859375,
"grad_norm": 0.23181785643100739,
"learning_rate": 8.722586014349851e-06,
"loss": 0.1855,
"step": 263
},
{
"epoch": 3.0703125,
"grad_norm": 0.1877327412366867,
"learning_rate": 8.708846574349372e-06,
"loss": 0.1779,
"step": 264
},
{
"epoch": 3.08203125,
"grad_norm": 0.1676245629787445,
"learning_rate": 8.695044586103297e-06,
"loss": 0.1621,
"step": 265
},
{
"epoch": 3.09375,
"grad_norm": 0.16094529628753662,
"learning_rate": 8.681180282376754e-06,
"loss": 0.1732,
"step": 266
},
{
"epoch": 3.10546875,
"grad_norm": 0.14831165969371796,
"learning_rate": 8.667253896985796e-06,
"loss": 0.1678,
"step": 267
},
{
"epoch": 3.1171875,
"grad_norm": 0.1500885933637619,
"learning_rate": 8.653265664793466e-06,
"loss": 0.1597,
"step": 268
},
{
"epoch": 3.12890625,
"grad_norm": 0.13528233766555786,
"learning_rate": 8.639215821705821e-06,
"loss": 0.2005,
"step": 269
},
{
"epoch": 3.140625,
"grad_norm": 0.1544928252696991,
"learning_rate": 8.625104604667965e-06,
"loss": 0.1658,
"step": 270
},
{
"epoch": 3.15234375,
"grad_norm": 0.1660798341035843,
"learning_rate": 8.610932251660046e-06,
"loss": 0.2022,
"step": 271
},
{
"epoch": 3.1640625,
"grad_norm": 0.17334631085395813,
"learning_rate": 8.596699001693257e-06,
"loss": 0.1821,
"step": 272
},
{
"epoch": 3.17578125,
"grad_norm": 0.23266500234603882,
"learning_rate": 8.58240509480578e-06,
"loss": 0.1905,
"step": 273
},
{
"epoch": 3.1875,
"grad_norm": 0.14414437115192413,
"learning_rate": 8.568050772058763e-06,
"loss": 0.1792,
"step": 274
},
{
"epoch": 3.19921875,
"grad_norm": 0.1535835564136505,
"learning_rate": 8.553636275532236e-06,
"loss": 0.1699,
"step": 275
},
{
"epoch": 3.2109375,
"grad_norm": 0.17544692754745483,
"learning_rate": 8.539161848321047e-06,
"loss": 0.216,
"step": 276
},
{
"epoch": 3.22265625,
"grad_norm": 0.14662209153175354,
"learning_rate": 8.524627734530738e-06,
"loss": 0.1655,
"step": 277
},
{
"epoch": 3.234375,
"grad_norm": 0.14544612169265747,
"learning_rate": 8.51003417927345e-06,
"loss": 0.1764,
"step": 278
},
{
"epoch": 3.24609375,
"grad_norm": 0.16317211091518402,
"learning_rate": 8.49538142866378e-06,
"loss": 0.1791,
"step": 279
},
{
"epoch": 3.2578125,
"grad_norm": 0.1408390998840332,
"learning_rate": 8.480669729814635e-06,
"loss": 0.1571,
"step": 280
},
{
"epoch": 3.26953125,
"grad_norm": 0.14905256032943726,
"learning_rate": 8.465899330833051e-06,
"loss": 0.1513,
"step": 281
},
{
"epoch": 3.28125,
"grad_norm": 0.13584789633750916,
"learning_rate": 8.451070480816027e-06,
"loss": 0.1677,
"step": 282
},
{
"epoch": 3.29296875,
"grad_norm": 0.14696229994297028,
"learning_rate": 8.436183429846314e-06,
"loss": 0.1812,
"step": 283
},
{
"epoch": 3.3046875,
"grad_norm": 0.21364903450012207,
"learning_rate": 8.421238428988199e-06,
"loss": 0.1875,
"step": 284
},
{
"epoch": 3.31640625,
"grad_norm": 0.15450192987918854,
"learning_rate": 8.40623573028327e-06,
"loss": 0.1763,
"step": 285
},
{
"epoch": 3.328125,
"grad_norm": 0.14626288414001465,
"learning_rate": 8.39117558674617e-06,
"loss": 0.1625,
"step": 286
},
{
"epoch": 3.33984375,
"grad_norm": 0.15964478254318237,
"learning_rate": 8.376058252360322e-06,
"loss": 0.1896,
"step": 287
},
{
"epoch": 3.3515625,
"grad_norm": 0.14050042629241943,
"learning_rate": 8.360883982073653e-06,
"loss": 0.1695,
"step": 288
},
{
"epoch": 3.36328125,
"grad_norm": 0.16250889003276825,
"learning_rate": 8.345653031794292e-06,
"loss": 0.1638,
"step": 289
},
{
"epoch": 3.375,
"grad_norm": 0.14831937849521637,
"learning_rate": 8.330365658386252e-06,
"loss": 0.1714,
"step": 290
},
{
"epoch": 3.38671875,
"grad_norm": 0.15942546725273132,
"learning_rate": 8.3150221196651e-06,
"loss": 0.1744,
"step": 291
},
{
"epoch": 3.3984375,
"grad_norm": 0.14313259720802307,
"learning_rate": 8.299622674393615e-06,
"loss": 0.1578,
"step": 292
},
{
"epoch": 3.41015625,
"grad_norm": 0.1886913925409317,
"learning_rate": 8.284167582277406e-06,
"loss": 0.2234,
"step": 293
},
{
"epoch": 3.421875,
"grad_norm": 0.16603605449199677,
"learning_rate": 8.268657103960558e-06,
"loss": 0.1891,
"step": 294
},
{
"epoch": 3.43359375,
"grad_norm": 0.18278054893016815,
"learning_rate": 8.25309150102121e-06,
"loss": 0.1847,
"step": 295
},
{
"epoch": 3.4453125,
"grad_norm": 0.15222598612308502,
"learning_rate": 8.237471035967168e-06,
"loss": 0.181,
"step": 296
},
{
"epoch": 3.45703125,
"grad_norm": 0.16404354572296143,
"learning_rate": 8.221795972231459e-06,
"loss": 0.1682,
"step": 297
},
{
"epoch": 3.46875,
"grad_norm": 0.15001647174358368,
"learning_rate": 8.206066574167893e-06,
"loss": 0.1589,
"step": 298
},
{
"epoch": 3.48046875,
"grad_norm": 0.1620359718799591,
"learning_rate": 8.190283107046613e-06,
"loss": 0.1914,
"step": 299
},
{
"epoch": 3.4921875,
"grad_norm": 0.17575767636299133,
"learning_rate": 8.174445837049614e-06,
"loss": 0.1713,
"step": 300
},
{
"epoch": 3.50390625,
"grad_norm": 0.16479137539863586,
"learning_rate": 8.158555031266255e-06,
"loss": 0.1958,
"step": 301
},
{
"epoch": 3.515625,
"grad_norm": 0.14474551379680634,
"learning_rate": 8.142610957688755e-06,
"loss": 0.1741,
"step": 302
},
{
"epoch": 3.52734375,
"grad_norm": 0.15629877150058746,
"learning_rate": 8.12661388520767e-06,
"loss": 0.1925,
"step": 303
},
{
"epoch": 3.5390625,
"grad_norm": 0.21056914329528809,
"learning_rate": 8.110564083607371e-06,
"loss": 0.1776,
"step": 304
},
{
"epoch": 3.55078125,
"grad_norm": 0.15697474777698517,
"learning_rate": 8.094461823561473e-06,
"loss": 0.16,
"step": 305
},
{
"epoch": 3.5625,
"grad_norm": 0.15944577753543854,
"learning_rate": 8.078307376628292e-06,
"loss": 0.1784,
"step": 306
},
{
"epoch": 3.57421875,
"grad_norm": 0.1611972153186798,
"learning_rate": 8.06210101524625e-06,
"loss": 0.1914,
"step": 307
},
{
"epoch": 3.5859375,
"grad_norm": 0.18928834795951843,
"learning_rate": 8.045843012729288e-06,
"loss": 0.2047,
"step": 308
},
{
"epoch": 3.59765625,
"grad_norm": 0.17320488393306732,
"learning_rate": 8.029533643262257e-06,
"loss": 0.1882,
"step": 309
},
{
"epoch": 3.609375,
"grad_norm": 0.1613771617412567,
"learning_rate": 8.013173181896283e-06,
"loss": 0.1814,
"step": 310
},
{
"epoch": 3.62109375,
"grad_norm": 0.15916645526885986,
"learning_rate": 7.996761904544146e-06,
"loss": 0.1905,
"step": 311
},
{
"epoch": 3.6328125,
"grad_norm": 0.1529555767774582,
"learning_rate": 7.980300087975612e-06,
"loss": 0.1835,
"step": 312
},
{
"epoch": 3.64453125,
"grad_norm": 0.16544155776500702,
"learning_rate": 7.963788009812775e-06,
"loss": 0.1791,
"step": 313
},
{
"epoch": 3.65625,
"grad_norm": 0.1579754799604416,
"learning_rate": 7.94722594852537e-06,
"loss": 0.1943,
"step": 314
},
{
"epoch": 3.66796875,
"grad_norm": 0.16517029702663422,
"learning_rate": 7.930614183426074e-06,
"loss": 0.1949,
"step": 315
},
{
"epoch": 3.6796875,
"grad_norm": 0.19699627161026,
"learning_rate": 7.913952994665805e-06,
"loss": 0.1816,
"step": 316
},
{
"epoch": 3.69140625,
"grad_norm": 0.15589186549186707,
"learning_rate": 7.89724266322899e-06,
"loss": 0.1857,
"step": 317
},
{
"epoch": 3.703125,
"grad_norm": 0.15042664110660553,
"learning_rate": 7.880483470928823e-06,
"loss": 0.1659,
"step": 318
},
{
"epoch": 3.71484375,
"grad_norm": 0.1443316787481308,
"learning_rate": 7.863675700402527e-06,
"loss": 0.1741,
"step": 319
},
{
"epoch": 3.7265625,
"grad_norm": 0.14981609582901,
"learning_rate": 7.846819635106569e-06,
"loss": 0.1998,
"step": 320
},
{
"epoch": 3.73828125,
"grad_norm": 0.13699378073215485,
"learning_rate": 7.829915559311892e-06,
"loss": 0.1636,
"step": 321
},
{
"epoch": 3.75,
"grad_norm": 0.14549602568149567,
"learning_rate": 7.812963758099118e-06,
"loss": 0.1732,
"step": 322
},
{
"epoch": 3.76171875,
"grad_norm": 0.15110832452774048,
"learning_rate": 7.795964517353734e-06,
"loss": 0.1816,
"step": 323
},
{
"epoch": 3.7734375,
"grad_norm": 0.1504000872373581,
"learning_rate": 7.778918123761287e-06,
"loss": 0.1656,
"step": 324
},
{
"epoch": 3.78515625,
"grad_norm": 0.15452761948108673,
"learning_rate": 7.76182486480253e-06,
"loss": 0.1764,
"step": 325
},
{
"epoch": 3.796875,
"grad_norm": 0.14827081561088562,
"learning_rate": 7.744685028748582e-06,
"loss": 0.159,
"step": 326
},
{
"epoch": 3.80859375,
"grad_norm": 0.140132874250412,
"learning_rate": 7.72749890465607e-06,
"loss": 0.1923,
"step": 327
},
{
"epoch": 3.8203125,
"grad_norm": 0.1527477502822876,
"learning_rate": 7.710266782362248e-06,
"loss": 0.1586,
"step": 328
},
{
"epoch": 3.83203125,
"grad_norm": 0.14466483891010284,
"learning_rate": 7.692988952480114e-06,
"loss": 0.1669,
"step": 329
},
{
"epoch": 3.84375,
"grad_norm": 0.15228520333766937,
"learning_rate": 7.675665706393502e-06,
"loss": 0.167,
"step": 330
},
{
"epoch": 3.85546875,
"grad_norm": 0.18122923374176025,
"learning_rate": 7.658297336252181e-06,
"loss": 0.1981,
"step": 331
},
{
"epoch": 3.8671875,
"grad_norm": 0.1492803692817688,
"learning_rate": 7.64088413496691e-06,
"loss": 0.1802,
"step": 332
},
{
"epoch": 3.87890625,
"grad_norm": 0.15203335881233215,
"learning_rate": 7.623426396204516e-06,
"loss": 0.1935,
"step": 333
},
{
"epoch": 3.890625,
"grad_norm": 0.15901945531368256,
"learning_rate": 7.605924414382926e-06,
"loss": 0.1711,
"step": 334
},
{
"epoch": 3.90234375,
"grad_norm": 0.14871086180210114,
"learning_rate": 7.588378484666214e-06,
"loss": 0.1658,
"step": 335
},
{
"epoch": 3.9140625,
"grad_norm": 0.16054388880729675,
"learning_rate": 7.570788902959612e-06,
"loss": 0.1915,
"step": 336
},
{
"epoch": 3.92578125,
"grad_norm": 0.14552587270736694,
"learning_rate": 7.553155965904535e-06,
"loss": 0.1901,
"step": 337
},
{
"epoch": 3.9375,
"grad_norm": 0.15138490498065948,
"learning_rate": 7.535479970873563e-06,
"loss": 0.2054,
"step": 338
},
{
"epoch": 3.94921875,
"grad_norm": 0.15584439039230347,
"learning_rate": 7.517761215965429e-06,
"loss": 0.1863,
"step": 339
},
{
"epoch": 3.9609375,
"grad_norm": 0.15878607332706451,
"learning_rate": 7.500000000000001e-06,
"loss": 0.2298,
"step": 340
},
{
"epoch": 3.97265625,
"grad_norm": 0.14984527230262756,
"learning_rate": 7.482196622513233e-06,
"loss": 0.1862,
"step": 341
},
{
"epoch": 3.984375,
"grad_norm": 0.1370866894721985,
"learning_rate": 7.464351383752117e-06,
"loss": 0.1711,
"step": 342
},
{
"epoch": 3.99609375,
"grad_norm": 0.13851135969161987,
"learning_rate": 7.4464645846696186e-06,
"loss": 0.1564,
"step": 343
},
{
"epoch": 4.0,
"grad_norm": 0.13851135969161987,
"learning_rate": 7.428536526919603e-06,
"loss": 0.1124,
"step": 344
},
{
"epoch": 4.01171875,
"grad_norm": 0.21845901012420654,
"learning_rate": 7.4105675128517456e-06,
"loss": 0.1096,
"step": 345
},
{
"epoch": 4.0234375,
"grad_norm": 0.18661820888519287,
"learning_rate": 7.392557845506433e-06,
"loss": 0.1378,
"step": 346
},
{
"epoch": 4.03515625,
"grad_norm": 0.16629564762115479,
"learning_rate": 7.374507828609657e-06,
"loss": 0.1173,
"step": 347
},
{
"epoch": 4.046875,
"grad_norm": 0.15008185803890228,
"learning_rate": 7.356417766567887e-06,
"loss": 0.1028,
"step": 348
},
{
"epoch": 4.05859375,
"grad_norm": 0.1788836270570755,
"learning_rate": 7.3382879644629345e-06,
"loss": 0.1065,
"step": 349
},
{
"epoch": 4.0703125,
"grad_norm": 0.19552384316921234,
"learning_rate": 7.320118728046818e-06,
"loss": 0.0964,
"step": 350
},
{
"epoch": 4.08203125,
"grad_norm": 0.17942851781845093,
"learning_rate": 7.301910363736596e-06,
"loss": 0.093,
"step": 351
},
{
"epoch": 4.09375,
"grad_norm": 0.1425376981496811,
"learning_rate": 7.283663178609204e-06,
"loss": 0.0923,
"step": 352
},
{
"epoch": 4.10546875,
"grad_norm": 0.14930015802383423,
"learning_rate": 7.265377480396277e-06,
"loss": 0.1254,
"step": 353
},
{
"epoch": 4.1171875,
"grad_norm": 0.1838270127773285,
"learning_rate": 7.247053577478955e-06,
"loss": 0.0997,
"step": 354
},
{
"epoch": 4.12890625,
"grad_norm": 0.16155952215194702,
"learning_rate": 7.2286917788826926e-06,
"loss": 0.0889,
"step": 355
},
{
"epoch": 4.140625,
"grad_norm": 0.15314266085624695,
"learning_rate": 7.210292394272029e-06,
"loss": 0.1011,
"step": 356
},
{
"epoch": 4.15234375,
"grad_norm": 0.14763030409812927,
"learning_rate": 7.191855733945388e-06,
"loss": 0.1017,
"step": 357
},
{
"epoch": 4.1640625,
"grad_norm": 0.1658443957567215,
"learning_rate": 7.173382108829826e-06,
"loss": 0.0825,
"step": 358
},
{
"epoch": 4.17578125,
"grad_norm": 0.14803840219974518,
"learning_rate": 7.154871830475798e-06,
"loss": 0.0986,
"step": 359
},
{
"epoch": 4.1875,
"grad_norm": 0.13587775826454163,
"learning_rate": 7.136325211051905e-06,
"loss": 0.1074,
"step": 360
},
{
"epoch": 4.19921875,
"grad_norm": 0.1521381437778473,
"learning_rate": 7.117742563339622e-06,
"loss": 0.095,
"step": 361
},
{
"epoch": 4.2109375,
"grad_norm": 0.1659468561410904,
"learning_rate": 7.099124200728028e-06,
"loss": 0.1028,
"step": 362
},
{
"epoch": 4.22265625,
"grad_norm": 0.17924466729164124,
"learning_rate": 7.0804704372085205e-06,
"loss": 0.0952,
"step": 363
},
{
"epoch": 4.234375,
"grad_norm": 0.17670685052871704,
"learning_rate": 7.061781587369518e-06,
"loss": 0.1299,
"step": 364
},
{
"epoch": 4.24609375,
"grad_norm": 0.15658561885356903,
"learning_rate": 7.043057966391158e-06,
"loss": 0.1098,
"step": 365
},
{
"epoch": 4.2578125,
"grad_norm": 0.1510300636291504,
"learning_rate": 7.024299890039978e-06,
"loss": 0.0922,
"step": 366
},
{
"epoch": 4.26953125,
"grad_norm": 0.15231317281723022,
"learning_rate": 7.005507674663594e-06,
"loss": 0.1098,
"step": 367
},
{
"epoch": 4.28125,
"grad_norm": 0.16875913739204407,
"learning_rate": 6.986681637185361e-06,
"loss": 0.0994,
"step": 368
},
{
"epoch": 4.29296875,
"grad_norm": 0.19646456837654114,
"learning_rate": 6.967822095099031e-06,
"loss": 0.1197,
"step": 369
},
{
"epoch": 4.3046875,
"grad_norm": 0.14720042049884796,
"learning_rate": 6.948929366463397e-06,
"loss": 0.1208,
"step": 370
},
{
"epoch": 4.31640625,
"grad_norm": 0.1687857210636139,
"learning_rate": 6.930003769896936e-06,
"loss": 0.1019,
"step": 371
},
{
"epoch": 4.328125,
"grad_norm": 0.17267028987407684,
"learning_rate": 6.91104562457242e-06,
"loss": 0.1051,
"step": 372
},
{
"epoch": 4.33984375,
"grad_norm": 0.15575772523880005,
"learning_rate": 6.892055250211552e-06,
"loss": 0.1319,
"step": 373
},
{
"epoch": 4.3515625,
"grad_norm": 0.16643179953098297,
"learning_rate": 6.873032967079562e-06,
"loss": 0.1241,
"step": 374
},
{
"epoch": 4.36328125,
"grad_norm": 0.18457186222076416,
"learning_rate": 6.8539790959798045e-06,
"loss": 0.1018,
"step": 375
},
{
"epoch": 4.375,
"grad_norm": 0.15639866888523102,
"learning_rate": 6.834893958248361e-06,
"loss": 0.1073,
"step": 376
},
{
"epoch": 4.38671875,
"grad_norm": 0.14814910292625427,
"learning_rate": 6.815777875748607e-06,
"loss": 0.1018,
"step": 377
},
{
"epoch": 4.3984375,
"grad_norm": 0.14821702241897583,
"learning_rate": 6.7966311708657884e-06,
"loss": 0.096,
"step": 378
},
{
"epoch": 4.41015625,
"grad_norm": 0.16026665270328522,
"learning_rate": 6.77745416650159e-06,
"loss": 0.0961,
"step": 379
},
{
"epoch": 4.421875,
"grad_norm": 0.135820209980011,
"learning_rate": 6.758247186068684e-06,
"loss": 0.0898,
"step": 380
},
{
"epoch": 4.43359375,
"grad_norm": 0.1573071926832199,
"learning_rate": 6.739010553485276e-06,
"loss": 0.1238,
"step": 381
},
{
"epoch": 4.4453125,
"grad_norm": 0.15640759468078613,
"learning_rate": 6.719744593169642e-06,
"loss": 0.1109,
"step": 382
},
{
"epoch": 4.45703125,
"grad_norm": 0.1494980901479721,
"learning_rate": 6.700449630034662e-06,
"loss": 0.0996,
"step": 383
},
{
"epoch": 4.46875,
"grad_norm": 0.13157877326011658,
"learning_rate": 6.681125989482337e-06,
"loss": 0.1078,
"step": 384
},
{
"epoch": 4.48046875,
"grad_norm": 0.17184874415397644,
"learning_rate": 6.6617739973982985e-06,
"loss": 0.1042,
"step": 385
},
{
"epoch": 4.4921875,
"grad_norm": 0.14946678280830383,
"learning_rate": 6.64239398014632e-06,
"loss": 0.1083,
"step": 386
},
{
"epoch": 4.50390625,
"grad_norm": 0.19056496024131775,
"learning_rate": 6.622986264562804e-06,
"loss": 0.127,
"step": 387
},
{
"epoch": 4.515625,
"grad_norm": 0.14448995888233185,
"learning_rate": 6.6035511779512764e-06,
"loss": 0.1038,
"step": 388
},
{
"epoch": 4.52734375,
"grad_norm": 0.1500634104013443,
"learning_rate": 6.584089048076866e-06,
"loss": 0.1069,
"step": 389
},
{
"epoch": 4.5390625,
"grad_norm": 0.14609850943088531,
"learning_rate": 6.5646002031607726e-06,
"loss": 0.103,
"step": 390
},
{
"epoch": 4.55078125,
"grad_norm": 0.15358710289001465,
"learning_rate": 6.545084971874738e-06,
"loss": 0.1128,
"step": 391
},
{
"epoch": 4.5625,
"grad_norm": 0.158953458070755,
"learning_rate": 6.525543683335497e-06,
"loss": 0.1028,
"step": 392
},
{
"epoch": 4.57421875,
"grad_norm": 0.1573840230703354,
"learning_rate": 6.505976667099233e-06,
"loss": 0.1167,
"step": 393
},
{
"epoch": 4.5859375,
"grad_norm": 0.16527414321899414,
"learning_rate": 6.486384253156014e-06,
"loss": 0.1178,
"step": 394
},
{
"epoch": 4.59765625,
"grad_norm": 0.14780762791633606,
"learning_rate": 6.466766771924231e-06,
"loss": 0.0999,
"step": 395
},
{
"epoch": 4.609375,
"grad_norm": 0.175028994679451,
"learning_rate": 6.447124554245026e-06,
"loss": 0.1273,
"step": 396
},
{
"epoch": 4.62109375,
"grad_norm": 0.16705100238323212,
"learning_rate": 6.427457931376712e-06,
"loss": 0.0969,
"step": 397
},
{
"epoch": 4.6328125,
"grad_norm": 0.1503540724515915,
"learning_rate": 6.407767234989181e-06,
"loss": 0.0962,
"step": 398
},
{
"epoch": 4.64453125,
"grad_norm": 0.17881572246551514,
"learning_rate": 6.388052797158324e-06,
"loss": 0.1143,
"step": 399
},
{
"epoch": 4.65625,
"grad_norm": 0.13578246533870697,
"learning_rate": 6.368314950360416e-06,
"loss": 0.122,
"step": 400
},
{
"epoch": 4.66796875,
"grad_norm": 0.19132906198501587,
"learning_rate": 6.3485540274665134e-06,
"loss": 0.0979,
"step": 401
},
{
"epoch": 4.6796875,
"grad_norm": 0.12635491788387299,
"learning_rate": 6.32877036173685e-06,
"loss": 0.1021,
"step": 402
},
{
"epoch": 4.69140625,
"grad_norm": 0.2278641015291214,
"learning_rate": 6.308964286815203e-06,
"loss": 0.1107,
"step": 403
},
{
"epoch": 4.703125,
"grad_norm": 0.2541256844997406,
"learning_rate": 6.289136136723268e-06,
"loss": 0.1026,
"step": 404
},
{
"epoch": 4.71484375,
"grad_norm": 0.16736242175102234,
"learning_rate": 6.269286245855039e-06,
"loss": 0.1211,
"step": 405
},
{
"epoch": 4.7265625,
"grad_norm": 0.22079572081565857,
"learning_rate": 6.249414948971154e-06,
"loss": 0.1321,
"step": 406
},
{
"epoch": 4.73828125,
"grad_norm": 0.16761334240436554,
"learning_rate": 6.229522581193257e-06,
"loss": 0.1219,
"step": 407
},
{
"epoch": 4.75,
"grad_norm": 0.19656169414520264,
"learning_rate": 6.209609477998339e-06,
"loss": 0.1115,
"step": 408
},
{
"epoch": 4.76171875,
"grad_norm": 0.14829809963703156,
"learning_rate": 6.189675975213094e-06,
"loss": 0.0969,
"step": 409
},
{
"epoch": 4.7734375,
"grad_norm": 0.14715968072414398,
"learning_rate": 6.169722409008244e-06,
"loss": 0.1179,
"step": 410
},
{
"epoch": 4.78515625,
"grad_norm": 0.17647211253643036,
"learning_rate": 6.1497491158928694e-06,
"loss": 0.0951,
"step": 411
},
{
"epoch": 4.796875,
"grad_norm": 0.1545528769493103,
"learning_rate": 6.129756432708739e-06,
"loss": 0.1028,
"step": 412
},
{
"epoch": 4.80859375,
"grad_norm": 0.15555883944034576,
"learning_rate": 6.109744696624631e-06,
"loss": 0.0929,
"step": 413
},
{
"epoch": 4.8203125,
"grad_norm": 0.1452905237674713,
"learning_rate": 6.089714245130639e-06,
"loss": 0.1001,
"step": 414
},
{
"epoch": 4.83203125,
"grad_norm": 0.1887577474117279,
"learning_rate": 6.0696654160324875e-06,
"loss": 0.1297,
"step": 415
},
{
"epoch": 4.84375,
"grad_norm": 0.15246422588825226,
"learning_rate": 6.049598547445829e-06,
"loss": 0.1085,
"step": 416
},
{
"epoch": 4.85546875,
"grad_norm": 0.1306416392326355,
"learning_rate": 6.02951397779055e-06,
"loss": 0.0879,
"step": 417
},
{
"epoch": 4.8671875,
"grad_norm": 0.16870683431625366,
"learning_rate": 6.009412045785051e-06,
"loss": 0.114,
"step": 418
},
{
"epoch": 4.87890625,
"grad_norm": 0.16079163551330566,
"learning_rate": 5.98929309044055e-06,
"loss": 0.1084,
"step": 419
},
{
"epoch": 4.890625,
"grad_norm": 0.1713971495628357,
"learning_rate": 5.9691574510553505e-06,
"loss": 0.1185,
"step": 420
},
{
"epoch": 4.90234375,
"grad_norm": 0.14745403826236725,
"learning_rate": 5.9490054672091305e-06,
"loss": 0.1064,
"step": 421
},
{
"epoch": 4.9140625,
"grad_norm": 0.1969185173511505,
"learning_rate": 5.928837478757206e-06,
"loss": 0.0908,
"step": 422
},
{
"epoch": 4.92578125,
"grad_norm": 0.2246403843164444,
"learning_rate": 5.908653825824808e-06,
"loss": 0.1175,
"step": 423
},
{
"epoch": 4.9375,
"grad_norm": 0.149653822183609,
"learning_rate": 5.888454848801345e-06,
"loss": 0.0957,
"step": 424
},
{
"epoch": 4.94921875,
"grad_norm": 0.13323569297790527,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.0923,
"step": 425
},
{
"epoch": 4.9609375,
"grad_norm": 0.16845755279064178,
"learning_rate": 5.848012285325264e-06,
"loss": 0.1047,
"step": 426
},
{
"epoch": 4.97265625,
"grad_norm": 0.12530267238616943,
"learning_rate": 5.82776938092065e-06,
"loss": 0.1052,
"step": 427
},
{
"epoch": 4.984375,
"grad_norm": 0.16033221781253815,
"learning_rate": 5.807512516509468e-06,
"loss": 0.1021,
"step": 428
},
{
"epoch": 4.99609375,
"grad_norm": 0.15662384033203125,
"learning_rate": 5.787242033715809e-06,
"loss": 0.1029,
"step": 429
},
{
"epoch": 5.0,
"grad_norm": 0.15662384033203125,
"learning_rate": 5.766958274393428e-06,
"loss": 0.0769,
"step": 430
},
{
"epoch": 5.01171875,
"grad_norm": 0.23790201544761658,
"learning_rate": 5.746661580619985e-06,
"loss": 0.0638,
"step": 431
},
{
"epoch": 5.0234375,
"grad_norm": 0.13754722476005554,
"learning_rate": 5.726352294691275e-06,
"loss": 0.0464,
"step": 432
},
{
"epoch": 5.03515625,
"grad_norm": 0.10853724926710129,
"learning_rate": 5.706030759115458e-06,
"loss": 0.0506,
"step": 433
},
{
"epoch": 5.046875,
"grad_norm": 0.13977329432964325,
"learning_rate": 5.685697316607274e-06,
"loss": 0.0551,
"step": 434
},
{
"epoch": 5.05859375,
"grad_norm": 0.15899504721164703,
"learning_rate": 5.66535231008227e-06,
"loss": 0.0577,
"step": 435
},
{
"epoch": 5.0703125,
"grad_norm": 0.15688322484493256,
"learning_rate": 5.644996082651018e-06,
"loss": 0.0613,
"step": 436
},
{
"epoch": 5.08203125,
"grad_norm": 0.1396789848804474,
"learning_rate": 5.6246289776133246e-06,
"loss": 0.0594,
"step": 437
},
{
"epoch": 5.09375,
"grad_norm": 0.18655186891555786,
"learning_rate": 5.604251338452444e-06,
"loss": 0.0511,
"step": 438
},
{
"epoch": 5.10546875,
"grad_norm": 0.1351338028907776,
"learning_rate": 5.583863508829281e-06,
"loss": 0.0652,
"step": 439
},
{
"epoch": 5.1171875,
"grad_norm": 0.14356586337089539,
"learning_rate": 5.5634658325766066e-06,
"loss": 0.0672,
"step": 440
},
{
"epoch": 5.12890625,
"grad_norm": 0.17796984314918518,
"learning_rate": 5.543058653693241e-06,
"loss": 0.0604,
"step": 441
},
{
"epoch": 5.140625,
"grad_norm": 0.14155617356300354,
"learning_rate": 5.522642316338268e-06,
"loss": 0.0638,
"step": 442
},
{
"epoch": 5.15234375,
"grad_norm": 0.15060634911060333,
"learning_rate": 5.5022171648252265e-06,
"loss": 0.0489,
"step": 443
},
{
"epoch": 5.1640625,
"grad_norm": 0.1309528797864914,
"learning_rate": 5.4817835436163e-06,
"loss": 0.0561,
"step": 444
},
{
"epoch": 5.17578125,
"grad_norm": 0.14138580858707428,
"learning_rate": 5.46134179731651e-06,
"loss": 0.0556,
"step": 445
},
{
"epoch": 5.1875,
"grad_norm": 0.13814423978328705,
"learning_rate": 5.440892270667909e-06,
"loss": 0.0614,
"step": 446
},
{
"epoch": 5.19921875,
"grad_norm": 0.15363521873950958,
"learning_rate": 5.420435308543757e-06,
"loss": 0.0622,
"step": 447
},
{
"epoch": 5.2109375,
"grad_norm": 0.12922365963459015,
"learning_rate": 5.399971255942708e-06,
"loss": 0.0611,
"step": 448
},
{
"epoch": 5.22265625,
"grad_norm": 0.15916524827480316,
"learning_rate": 5.379500457983006e-06,
"loss": 0.0611,
"step": 449
},
{
"epoch": 5.234375,
"grad_norm": 0.14161787927150726,
"learning_rate": 5.359023259896638e-06,
"loss": 0.0463,
"step": 450
},
{
"epoch": 5.24609375,
"grad_norm": 0.14080356061458588,
"learning_rate": 5.338540007023538e-06,
"loss": 0.0574,
"step": 451
},
{
"epoch": 5.2578125,
"grad_norm": 0.14684629440307617,
"learning_rate": 5.318051044805745e-06,
"loss": 0.0508,
"step": 452
},
{
"epoch": 5.26953125,
"grad_norm": 0.14448480308055878,
"learning_rate": 5.297556718781588e-06,
"loss": 0.0649,
"step": 453
},
{
"epoch": 5.28125,
"grad_norm": 0.15748435258865356,
"learning_rate": 5.27705737457985e-06,
"loss": 0.0672,
"step": 454
},
{
"epoch": 5.29296875,
"grad_norm": 0.13277721405029297,
"learning_rate": 5.2565533579139484e-06,
"loss": 0.0474,
"step": 455
},
{
"epoch": 5.3046875,
"grad_norm": 0.14888741075992584,
"learning_rate": 5.236045014576098e-06,
"loss": 0.0699,
"step": 456
},
{
"epoch": 5.31640625,
"grad_norm": 0.1279619187116623,
"learning_rate": 5.2155326904314795e-06,
"loss": 0.0746,
"step": 457
},
{
"epoch": 5.328125,
"grad_norm": 0.17560450732707977,
"learning_rate": 5.1950167314124085e-06,
"loss": 0.058,
"step": 458
},
{
"epoch": 5.33984375,
"grad_norm": 0.15874771773815155,
"learning_rate": 5.174497483512506e-06,
"loss": 0.0575,
"step": 459
},
{
"epoch": 5.3515625,
"grad_norm": 0.13492193818092346,
"learning_rate": 5.153975292780852e-06,
"loss": 0.0469,
"step": 460
},
{
"epoch": 5.36328125,
"grad_norm": 0.14720965921878815,
"learning_rate": 5.133450505316162e-06,
"loss": 0.0605,
"step": 461
},
{
"epoch": 5.375,
"grad_norm": 0.1355433613061905,
"learning_rate": 5.112923467260941e-06,
"loss": 0.0509,
"step": 462
},
{
"epoch": 5.38671875,
"grad_norm": 0.15568669140338898,
"learning_rate": 5.09239452479565e-06,
"loss": 0.0687,
"step": 463
},
{
"epoch": 5.3984375,
"grad_norm": 0.15161432325839996,
"learning_rate": 5.071864024132868e-06,
"loss": 0.064,
"step": 464
},
{
"epoch": 5.41015625,
"grad_norm": 0.17019574344158173,
"learning_rate": 5.05133231151145e-06,
"loss": 0.0451,
"step": 465
},
{
"epoch": 5.421875,
"grad_norm": 0.11086393147706985,
"learning_rate": 5.030799733190694e-06,
"loss": 0.0559,
"step": 466
},
{
"epoch": 5.43359375,
"grad_norm": 0.15558047592639923,
"learning_rate": 5.010266635444495e-06,
"loss": 0.0585,
"step": 467
},
{
"epoch": 5.4453125,
"grad_norm": 0.148808091878891,
"learning_rate": 4.989733364555507e-06,
"loss": 0.0593,
"step": 468
},
{
"epoch": 5.45703125,
"grad_norm": 0.158802792429924,
"learning_rate": 4.9692002668093075e-06,
"loss": 0.0525,
"step": 469
},
{
"epoch": 5.46875,
"grad_norm": 0.12689101696014404,
"learning_rate": 4.948667688488552e-06,
"loss": 0.0543,
"step": 470
},
{
"epoch": 5.48046875,
"grad_norm": 0.13123546540737152,
"learning_rate": 4.928135975867134e-06,
"loss": 0.0617,
"step": 471
},
{
"epoch": 5.4921875,
"grad_norm": 0.16200099885463715,
"learning_rate": 4.907605475204352e-06,
"loss": 0.0588,
"step": 472
},
{
"epoch": 5.50390625,
"grad_norm": 0.16697384417057037,
"learning_rate": 4.887076532739061e-06,
"loss": 0.0578,
"step": 473
},
{
"epoch": 5.515625,
"grad_norm": 0.14646796882152557,
"learning_rate": 4.866549494683839e-06,
"loss": 0.0781,
"step": 474
},
{
"epoch": 5.52734375,
"grad_norm": 0.14703866839408875,
"learning_rate": 4.846024707219149e-06,
"loss": 0.0597,
"step": 475
},
{
"epoch": 5.5390625,
"grad_norm": 0.1428564488887787,
"learning_rate": 4.825502516487497e-06,
"loss": 0.0497,
"step": 476
},
{
"epoch": 5.55078125,
"grad_norm": 0.13470599055290222,
"learning_rate": 4.804983268587593e-06,
"loss": 0.0574,
"step": 477
},
{
"epoch": 5.5625,
"grad_norm": 0.13724403083324432,
"learning_rate": 4.784467309568524e-06,
"loss": 0.0626,
"step": 478
},
{
"epoch": 5.57421875,
"grad_norm": 0.13949063420295715,
"learning_rate": 4.7639549854239045e-06,
"loss": 0.0577,
"step": 479
},
{
"epoch": 5.5859375,
"grad_norm": 0.14538227021694183,
"learning_rate": 4.7434466420860515e-06,
"loss": 0.054,
"step": 480
},
{
"epoch": 5.59765625,
"grad_norm": 0.13742738962173462,
"learning_rate": 4.7229426254201504e-06,
"loss": 0.0541,
"step": 481
},
{
"epoch": 5.609375,
"grad_norm": 0.1454109251499176,
"learning_rate": 4.702443281218413e-06,
"loss": 0.0492,
"step": 482
},
{
"epoch": 5.62109375,
"grad_norm": 0.1340155303478241,
"learning_rate": 4.681948955194256e-06,
"loss": 0.0519,
"step": 483
},
{
"epoch": 5.6328125,
"grad_norm": 0.11645597219467163,
"learning_rate": 4.661459992976463e-06,
"loss": 0.0772,
"step": 484
},
{
"epoch": 5.64453125,
"grad_norm": 0.1487962156534195,
"learning_rate": 4.640976740103363e-06,
"loss": 0.0461,
"step": 485
},
{
"epoch": 5.65625,
"grad_norm": 0.1314113289117813,
"learning_rate": 4.620499542016996e-06,
"loss": 0.0498,
"step": 486
},
{
"epoch": 5.66796875,
"grad_norm": 0.11778878420591354,
"learning_rate": 4.6000287440572925e-06,
"loss": 0.0553,
"step": 487
},
{
"epoch": 5.6796875,
"grad_norm": 0.14744260907173157,
"learning_rate": 4.579564691456245e-06,
"loss": 0.0582,
"step": 488
},
{
"epoch": 5.69140625,
"grad_norm": 0.1958065778017044,
"learning_rate": 4.5591077293320925e-06,
"loss": 0.0481,
"step": 489
},
{
"epoch": 5.703125,
"grad_norm": 0.11204442381858826,
"learning_rate": 4.53865820268349e-06,
"loss": 0.0417,
"step": 490
},
{
"epoch": 5.71484375,
"grad_norm": 0.14066173136234283,
"learning_rate": 4.5182164563837015e-06,
"loss": 0.042,
"step": 491
},
{
"epoch": 5.7265625,
"grad_norm": 0.11793823540210724,
"learning_rate": 4.497782835174775e-06,
"loss": 0.0536,
"step": 492
},
{
"epoch": 5.73828125,
"grad_norm": 0.12355687469244003,
"learning_rate": 4.477357683661734e-06,
"loss": 0.0621,
"step": 493
},
{
"epoch": 5.75,
"grad_norm": 0.16047550737857819,
"learning_rate": 4.456941346306761e-06,
"loss": 0.0603,
"step": 494
},
{
"epoch": 5.76171875,
"grad_norm": 0.1558404564857483,
"learning_rate": 4.436534167423395e-06,
"loss": 0.0578,
"step": 495
},
{
"epoch": 5.7734375,
"grad_norm": 0.1236644983291626,
"learning_rate": 4.41613649117072e-06,
"loss": 0.0455,
"step": 496
},
{
"epoch": 5.78515625,
"grad_norm": 0.10916972905397415,
"learning_rate": 4.395748661547558e-06,
"loss": 0.0505,
"step": 497
},
{
"epoch": 5.796875,
"grad_norm": 0.13890743255615234,
"learning_rate": 4.375371022386677e-06,
"loss": 0.0476,
"step": 498
},
{
"epoch": 5.80859375,
"grad_norm": 0.14354218542575836,
"learning_rate": 4.355003917348985e-06,
"loss": 0.055,
"step": 499
},
{
"epoch": 5.8203125,
"grad_norm": 0.11266707628965378,
"learning_rate": 4.334647689917734e-06,
"loss": 0.0429,
"step": 500
},
{
"epoch": 5.83203125,
"grad_norm": 0.12821117043495178,
"learning_rate": 4.314302683392729e-06,
"loss": 0.0497,
"step": 501
},
{
"epoch": 5.84375,
"grad_norm": 0.1462571769952774,
"learning_rate": 4.293969240884545e-06,
"loss": 0.0563,
"step": 502
},
{
"epoch": 5.85546875,
"grad_norm": 0.12532669305801392,
"learning_rate": 4.273647705308726e-06,
"loss": 0.0745,
"step": 503
},
{
"epoch": 5.8671875,
"grad_norm": 0.4210283160209656,
"learning_rate": 4.253338419380016e-06,
"loss": 0.0481,
"step": 504
},
{
"epoch": 5.87890625,
"grad_norm": 0.13547341525554657,
"learning_rate": 4.233041725606573e-06,
"loss": 0.0609,
"step": 505
},
{
"epoch": 5.890625,
"grad_norm": 0.11836741119623184,
"learning_rate": 4.212757966284191e-06,
"loss": 0.051,
"step": 506
},
{
"epoch": 5.90234375,
"grad_norm": 0.13804543018341064,
"learning_rate": 4.192487483490532e-06,
"loss": 0.0619,
"step": 507
},
{
"epoch": 5.9140625,
"grad_norm": 0.1685003638267517,
"learning_rate": 4.17223061907935e-06,
"loss": 0.0515,
"step": 508
},
{
"epoch": 5.92578125,
"grad_norm": 0.14643150568008423,
"learning_rate": 4.151987714674737e-06,
"loss": 0.0538,
"step": 509
},
{
"epoch": 5.9375,
"grad_norm": 0.11812538653612137,
"learning_rate": 4.131759111665349e-06,
"loss": 0.0417,
"step": 510
},
{
"epoch": 5.94921875,
"grad_norm": 0.12235013395547867,
"learning_rate": 4.111545151198657e-06,
"loss": 0.0488,
"step": 511
},
{
"epoch": 5.9609375,
"grad_norm": 0.14277981221675873,
"learning_rate": 4.091346174175193e-06,
"loss": 0.052,
"step": 512
},
{
"epoch": 5.97265625,
"grad_norm": 0.13800203800201416,
"learning_rate": 4.071162521242796e-06,
"loss": 0.0462,
"step": 513
},
{
"epoch": 5.984375,
"grad_norm": 0.12089977413415909,
"learning_rate": 4.050994532790871e-06,
"loss": 0.0594,
"step": 514
},
{
"epoch": 5.99609375,
"grad_norm": 0.13905519247055054,
"learning_rate": 4.03084254894465e-06,
"loss": 0.0571,
"step": 515
},
{
"epoch": 6.0,
"grad_norm": 0.2540355622768402,
"learning_rate": 4.010706909559452e-06,
"loss": 0.0377,
"step": 516
},
{
"epoch": 6.01171875,
"grad_norm": 0.11269644647836685,
"learning_rate": 3.99058795421495e-06,
"loss": 0.0322,
"step": 517
},
{
"epoch": 6.0234375,
"grad_norm": 0.10453534126281738,
"learning_rate": 3.970486022209451e-06,
"loss": 0.0262,
"step": 518
},
{
"epoch": 6.03515625,
"grad_norm": 0.09565100073814392,
"learning_rate": 3.950401452554171e-06,
"loss": 0.0251,
"step": 519
},
{
"epoch": 6.046875,
"grad_norm": 0.09003366529941559,
"learning_rate": 3.930334583967514e-06,
"loss": 0.0206,
"step": 520
},
{
"epoch": 6.05859375,
"grad_norm": 0.08756420016288757,
"learning_rate": 3.910285754869362e-06,
"loss": 0.0223,
"step": 521
},
{
"epoch": 6.0703125,
"grad_norm": 0.09546218812465668,
"learning_rate": 3.890255303375371e-06,
"loss": 0.0246,
"step": 522
},
{
"epoch": 6.08203125,
"grad_norm": 0.08503632992506027,
"learning_rate": 3.870243567291263e-06,
"loss": 0.0232,
"step": 523
},
{
"epoch": 6.09375,
"grad_norm": 0.10584919154644012,
"learning_rate": 3.850250884107133e-06,
"loss": 0.0202,
"step": 524
},
{
"epoch": 6.10546875,
"grad_norm": 0.11919157952070236,
"learning_rate": 3.8302775909917585e-06,
"loss": 0.0292,
"step": 525
},
{
"epoch": 6.1171875,
"grad_norm": 0.09001903235912323,
"learning_rate": 3.8103240247869077e-06,
"loss": 0.0176,
"step": 526
},
{
"epoch": 6.12890625,
"grad_norm": 0.10065510869026184,
"learning_rate": 3.790390522001662e-06,
"loss": 0.0218,
"step": 527
},
{
"epoch": 6.140625,
"grad_norm": 0.10913336277008057,
"learning_rate": 3.770477418806744e-06,
"loss": 0.0251,
"step": 528
},
{
"epoch": 6.15234375,
"grad_norm": 0.11968547105789185,
"learning_rate": 3.7505850510288455e-06,
"loss": 0.0255,
"step": 529
},
{
"epoch": 6.1640625,
"grad_norm": 0.10445302724838257,
"learning_rate": 3.730713754144961e-06,
"loss": 0.0242,
"step": 530
},
{
"epoch": 6.17578125,
"grad_norm": 0.09664919227361679,
"learning_rate": 3.7108638632767314e-06,
"loss": 0.0249,
"step": 531
},
{
"epoch": 6.1875,
"grad_norm": 0.10129770636558533,
"learning_rate": 3.6910357131847986e-06,
"loss": 0.0263,
"step": 532
},
{
"epoch": 6.19921875,
"grad_norm": 0.10122045874595642,
"learning_rate": 3.6712296382631505e-06,
"loss": 0.0223,
"step": 533
},
{
"epoch": 6.2109375,
"grad_norm": 0.09673990309238434,
"learning_rate": 3.6514459725334874e-06,
"loss": 0.0231,
"step": 534
},
{
"epoch": 6.22265625,
"grad_norm": 0.0976557508111,
"learning_rate": 3.6316850496395863e-06,
"loss": 0.0246,
"step": 535
},
{
"epoch": 6.234375,
"grad_norm": 0.09999102354049683,
"learning_rate": 3.6119472028416776e-06,
"loss": 0.0253,
"step": 536
},
{
"epoch": 6.24609375,
"grad_norm": 0.102074034512043,
"learning_rate": 3.5922327650108203e-06,
"loss": 0.0295,
"step": 537
},
{
"epoch": 6.2578125,
"grad_norm": 0.09054411947727203,
"learning_rate": 3.5725420686232903e-06,
"loss": 0.0203,
"step": 538
},
{
"epoch": 6.26953125,
"grad_norm": 0.09414811432361603,
"learning_rate": 3.5528754457549754e-06,
"loss": 0.0219,
"step": 539
},
{
"epoch": 6.28125,
"grad_norm": 0.1114683449268341,
"learning_rate": 3.5332332280757706e-06,
"loss": 0.031,
"step": 540
},
{
"epoch": 6.29296875,
"grad_norm": 0.09884975850582123,
"learning_rate": 3.513615746843987e-06,
"loss": 0.0243,
"step": 541
},
{
"epoch": 6.3046875,
"grad_norm": 0.11346939206123352,
"learning_rate": 3.494023332900768e-06,
"loss": 0.0277,
"step": 542
},
{
"epoch": 6.31640625,
"grad_norm": 0.09808623790740967,
"learning_rate": 3.474456316664504e-06,
"loss": 0.0205,
"step": 543
},
{
"epoch": 6.328125,
"grad_norm": 0.12470114231109619,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.0251,
"step": 544
},
{
"epoch": 6.33984375,
"grad_norm": 0.10183770954608917,
"learning_rate": 3.4353997968392295e-06,
"loss": 0.0234,
"step": 545
},
{
"epoch": 6.3515625,
"grad_norm": 0.09088829159736633,
"learning_rate": 3.415910951923137e-06,
"loss": 0.02,
"step": 546
},
{
"epoch": 6.36328125,
"grad_norm": 0.1014619767665863,
"learning_rate": 3.3964488220487252e-06,
"loss": 0.0197,
"step": 547
},
{
"epoch": 6.375,
"grad_norm": 0.10920997709035873,
"learning_rate": 3.3770137354371977e-06,
"loss": 0.0262,
"step": 548
},
{
"epoch": 6.38671875,
"grad_norm": 0.0838565081357956,
"learning_rate": 3.357606019853682e-06,
"loss": 0.0152,
"step": 549
},
{
"epoch": 6.3984375,
"grad_norm": 0.09784085303544998,
"learning_rate": 3.3382260026017027e-06,
"loss": 0.02,
"step": 550
},
{
"epoch": 6.41015625,
"grad_norm": 0.10152312368154526,
"learning_rate": 3.3188740105176655e-06,
"loss": 0.0239,
"step": 551
},
{
"epoch": 6.421875,
"grad_norm": 0.09764476120471954,
"learning_rate": 3.2995503699653385e-06,
"loss": 0.0225,
"step": 552
},
{
"epoch": 6.43359375,
"grad_norm": 0.10174062848091125,
"learning_rate": 3.2802554068303595e-06,
"loss": 0.0188,
"step": 553
},
{
"epoch": 6.4453125,
"grad_norm": 0.10995883494615555,
"learning_rate": 3.260989446514726e-06,
"loss": 0.0195,
"step": 554
},
{
"epoch": 6.45703125,
"grad_norm": 0.08863966166973114,
"learning_rate": 3.241752813931316e-06,
"loss": 0.0218,
"step": 555
},
{
"epoch": 6.46875,
"grad_norm": 0.09596288204193115,
"learning_rate": 3.22254583349841e-06,
"loss": 0.0228,
"step": 556
},
{
"epoch": 6.48046875,
"grad_norm": 0.09713637083768845,
"learning_rate": 3.2033688291342124e-06,
"loss": 0.0228,
"step": 557
},
{
"epoch": 6.4921875,
"grad_norm": 0.1030144989490509,
"learning_rate": 3.1842221242513944e-06,
"loss": 0.0235,
"step": 558
},
{
"epoch": 6.50390625,
"grad_norm": 0.10174969583749771,
"learning_rate": 3.16510604175164e-06,
"loss": 0.0287,
"step": 559
},
{
"epoch": 6.515625,
"grad_norm": 0.10522494465112686,
"learning_rate": 3.1460209040201967e-06,
"loss": 0.0255,
"step": 560
},
{
"epoch": 6.52734375,
"grad_norm": 0.08618924766778946,
"learning_rate": 3.12696703292044e-06,
"loss": 0.0182,
"step": 561
},
{
"epoch": 6.5390625,
"grad_norm": 0.09024260938167572,
"learning_rate": 3.107944749788449e-06,
"loss": 0.0188,
"step": 562
},
{
"epoch": 6.55078125,
"grad_norm": 0.09242939203977585,
"learning_rate": 3.088954375427582e-06,
"loss": 0.025,
"step": 563
},
{
"epoch": 6.5625,
"grad_norm": 0.11064324527978897,
"learning_rate": 3.069996230103066e-06,
"loss": 0.0264,
"step": 564
},
{
"epoch": 6.57421875,
"grad_norm": 0.08767686039209366,
"learning_rate": 3.0510706335366034e-06,
"loss": 0.0218,
"step": 565
},
{
"epoch": 6.5859375,
"grad_norm": 0.10618072748184204,
"learning_rate": 3.0321779049009714e-06,
"loss": 0.0277,
"step": 566
},
{
"epoch": 6.59765625,
"grad_norm": 0.08497366309165955,
"learning_rate": 3.01331836281464e-06,
"loss": 0.0182,
"step": 567
},
{
"epoch": 6.609375,
"grad_norm": 0.11079303175210953,
"learning_rate": 2.9944923253364066e-06,
"loss": 0.0262,
"step": 568
},
{
"epoch": 6.62109375,
"grad_norm": 0.11850834637880325,
"learning_rate": 2.975700109960023e-06,
"loss": 0.0296,
"step": 569
},
{
"epoch": 6.6328125,
"grad_norm": 0.10579863935709,
"learning_rate": 2.956942033608843e-06,
"loss": 0.0253,
"step": 570
},
{
"epoch": 6.64453125,
"grad_norm": 0.09152594953775406,
"learning_rate": 2.9382184126304834e-06,
"loss": 0.0194,
"step": 571
},
{
"epoch": 6.65625,
"grad_norm": 0.09527917951345444,
"learning_rate": 2.919529562791482e-06,
"loss": 0.0243,
"step": 572
},
{
"epoch": 6.66796875,
"grad_norm": 0.09928877651691437,
"learning_rate": 2.9008757992719734e-06,
"loss": 0.019,
"step": 573
},
{
"epoch": 6.6796875,
"grad_norm": 0.0893445611000061,
"learning_rate": 2.8822574366603804e-06,
"loss": 0.0212,
"step": 574
},
{
"epoch": 6.69140625,
"grad_norm": 0.10207302868366241,
"learning_rate": 2.863674788948097e-06,
"loss": 0.0275,
"step": 575
},
{
"epoch": 6.703125,
"grad_norm": 0.09759923815727234,
"learning_rate": 2.8451281695242013e-06,
"loss": 0.0217,
"step": 576
},
{
"epoch": 6.71484375,
"grad_norm": 0.08684232085943222,
"learning_rate": 2.8266178911701757e-06,
"loss": 0.0196,
"step": 577
},
{
"epoch": 6.7265625,
"grad_norm": 0.08394382894039154,
"learning_rate": 2.8081442660546126e-06,
"loss": 0.0201,
"step": 578
},
{
"epoch": 6.73828125,
"grad_norm": 0.09071553498506546,
"learning_rate": 2.7897076057279703e-06,
"loss": 0.017,
"step": 579
},
{
"epoch": 6.75,
"grad_norm": 0.09102096408605576,
"learning_rate": 2.771308221117309e-06,
"loss": 0.0228,
"step": 580
},
{
"epoch": 6.76171875,
"grad_norm": 0.08504775166511536,
"learning_rate": 2.7529464225210447e-06,
"loss": 0.0208,
"step": 581
},
{
"epoch": 6.7734375,
"grad_norm": 0.08983936905860901,
"learning_rate": 2.734622519603726e-06,
"loss": 0.0185,
"step": 582
},
{
"epoch": 6.78515625,
"grad_norm": 0.11145118623971939,
"learning_rate": 2.7163368213907975e-06,
"loss": 0.0279,
"step": 583
},
{
"epoch": 6.796875,
"grad_norm": 0.08211053907871246,
"learning_rate": 2.698089636263405e-06,
"loss": 0.0192,
"step": 584
},
{
"epoch": 6.80859375,
"grad_norm": 0.08780679106712341,
"learning_rate": 2.6798812719531843e-06,
"loss": 0.0172,
"step": 585
},
{
"epoch": 6.8203125,
"grad_norm": 0.11070062220096588,
"learning_rate": 2.6617120355370667e-06,
"loss": 0.0243,
"step": 586
},
{
"epoch": 6.83203125,
"grad_norm": 0.09188354015350342,
"learning_rate": 2.643582233432115e-06,
"loss": 0.0197,
"step": 587
},
{
"epoch": 6.84375,
"grad_norm": 0.09905336797237396,
"learning_rate": 2.6254921713903447e-06,
"loss": 0.0238,
"step": 588
},
{
"epoch": 6.85546875,
"grad_norm": 0.09748519957065582,
"learning_rate": 2.607442154493568e-06,
"loss": 0.0242,
"step": 589
},
{
"epoch": 6.8671875,
"grad_norm": 0.09982873499393463,
"learning_rate": 2.5894324871482557e-06,
"loss": 0.0273,
"step": 590
},
{
"epoch": 6.87890625,
"grad_norm": 0.09111794084310532,
"learning_rate": 2.5714634730803993e-06,
"loss": 0.0224,
"step": 591
},
{
"epoch": 6.890625,
"grad_norm": 0.09995567053556442,
"learning_rate": 2.5535354153303827e-06,
"loss": 0.0213,
"step": 592
},
{
"epoch": 6.90234375,
"grad_norm": 0.09687034785747528,
"learning_rate": 2.5356486162478843e-06,
"loss": 0.0242,
"step": 593
},
{
"epoch": 6.9140625,
"grad_norm": 0.0928439050912857,
"learning_rate": 2.5178033774867692e-06,
"loss": 0.018,
"step": 594
},
{
"epoch": 6.92578125,
"grad_norm": 0.1063237190246582,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.0238,
"step": 595
},
{
"epoch": 6.9375,
"grad_norm": 0.11429858952760696,
"learning_rate": 2.482238784034573e-06,
"loss": 0.0285,
"step": 596
},
{
"epoch": 6.94921875,
"grad_norm": 0.09505951404571533,
"learning_rate": 2.464520029126441e-06,
"loss": 0.0233,
"step": 597
},
{
"epoch": 6.9609375,
"grad_norm": 0.08929964900016785,
"learning_rate": 2.4468440340954664e-06,
"loss": 0.0229,
"step": 598
},
{
"epoch": 6.97265625,
"grad_norm": 0.10107742249965668,
"learning_rate": 2.4292110970403885e-06,
"loss": 0.0188,
"step": 599
},
{
"epoch": 6.984375,
"grad_norm": 0.08496974408626556,
"learning_rate": 2.411621515333788e-06,
"loss": 0.0186,
"step": 600
},
{
"epoch": 6.99609375,
"grad_norm": 0.078733891248703,
"learning_rate": 2.3940755856170744e-06,
"loss": 0.0143,
"step": 601
},
{
"epoch": 7.0,
"grad_norm": 0.078733891248703,
"learning_rate": 2.376573603795486e-06,
"loss": 0.015,
"step": 602
},
{
"epoch": 7.01171875,
"grad_norm": 0.13934215903282166,
"learning_rate": 2.3591158650330896e-06,
"loss": 0.0097,
"step": 603
},
{
"epoch": 7.0234375,
"grad_norm": 0.04730260372161865,
"learning_rate": 2.341702663747819e-06,
"loss": 0.007,
"step": 604
},
{
"epoch": 7.03515625,
"grad_norm": 0.05305750295519829,
"learning_rate": 2.324334293606499e-06,
"loss": 0.0125,
"step": 605
},
{
"epoch": 7.046875,
"grad_norm": 0.057967476546764374,
"learning_rate": 2.307011047519888e-06,
"loss": 0.0106,
"step": 606
},
{
"epoch": 7.05859375,
"grad_norm": 0.04883718490600586,
"learning_rate": 2.289733217637753e-06,
"loss": 0.0114,
"step": 607
},
{
"epoch": 7.0703125,
"grad_norm": 0.05335000902414322,
"learning_rate": 2.2725010953439323e-06,
"loss": 0.009,
"step": 608
},
{
"epoch": 7.08203125,
"grad_norm": 0.07616371661424637,
"learning_rate": 2.2553149712514193e-06,
"loss": 0.012,
"step": 609
},
{
"epoch": 7.09375,
"grad_norm": 0.05109984427690506,
"learning_rate": 2.238175135197471e-06,
"loss": 0.0096,
"step": 610
},
{
"epoch": 7.10546875,
"grad_norm": 0.05964595451951027,
"learning_rate": 2.2210818762387143e-06,
"loss": 0.0077,
"step": 611
},
{
"epoch": 7.1171875,
"grad_norm": 0.05734318867325783,
"learning_rate": 2.204035482646267e-06,
"loss": 0.0095,
"step": 612
},
{
"epoch": 7.12890625,
"grad_norm": 0.07098759710788727,
"learning_rate": 2.1870362419008844e-06,
"loss": 0.0083,
"step": 613
},
{
"epoch": 7.140625,
"grad_norm": 0.07144724577665329,
"learning_rate": 2.170084440688111e-06,
"loss": 0.0102,
"step": 614
},
{
"epoch": 7.15234375,
"grad_norm": 0.06327182799577713,
"learning_rate": 2.1531803648934333e-06,
"loss": 0.0076,
"step": 615
},
{
"epoch": 7.1640625,
"grad_norm": 0.06361842155456543,
"learning_rate": 2.136324299597474e-06,
"loss": 0.0103,
"step": 616
},
{
"epoch": 7.17578125,
"grad_norm": 0.053663451224565506,
"learning_rate": 2.1195165290711782e-06,
"loss": 0.0069,
"step": 617
},
{
"epoch": 7.1875,
"grad_norm": 0.055607955902814865,
"learning_rate": 2.1027573367710124e-06,
"loss": 0.0097,
"step": 618
},
{
"epoch": 7.19921875,
"grad_norm": 0.07250375300645828,
"learning_rate": 2.0860470053341957e-06,
"loss": 0.0088,
"step": 619
},
{
"epoch": 7.2109375,
"grad_norm": 0.04863898083567619,
"learning_rate": 2.069385816573928e-06,
"loss": 0.0096,
"step": 620
},
{
"epoch": 7.22265625,
"grad_norm": 0.059452399611473083,
"learning_rate": 2.0527740514746324e-06,
"loss": 0.0068,
"step": 621
},
{
"epoch": 7.234375,
"grad_norm": 0.0650862529873848,
"learning_rate": 2.0362119901872262e-06,
"loss": 0.0123,
"step": 622
},
{
"epoch": 7.24609375,
"grad_norm": 0.05099206417798996,
"learning_rate": 2.0196999120243886e-06,
"loss": 0.0056,
"step": 623
},
{
"epoch": 7.2578125,
"grad_norm": 0.062553271651268,
"learning_rate": 2.003238095455855e-06,
"loss": 0.0099,
"step": 624
},
{
"epoch": 7.26953125,
"grad_norm": 0.0628080889582634,
"learning_rate": 1.9868268181037186e-06,
"loss": 0.0092,
"step": 625
},
{
"epoch": 7.28125,
"grad_norm": 0.059045687317848206,
"learning_rate": 1.9704663567377445e-06,
"loss": 0.0093,
"step": 626
},
{
"epoch": 7.29296875,
"grad_norm": 0.05613759160041809,
"learning_rate": 1.954156987270711e-06,
"loss": 0.0097,
"step": 627
},
{
"epoch": 7.3046875,
"grad_norm": 0.05174103379249573,
"learning_rate": 1.937898984753751e-06,
"loss": 0.01,
"step": 628
},
{
"epoch": 7.31640625,
"grad_norm": 0.05605224519968033,
"learning_rate": 1.9216926233717087e-06,
"loss": 0.006,
"step": 629
},
{
"epoch": 7.328125,
"grad_norm": 0.06480531394481659,
"learning_rate": 1.9055381764385272e-06,
"loss": 0.0116,
"step": 630
},
{
"epoch": 7.33984375,
"grad_norm": 0.05986357480287552,
"learning_rate": 1.8894359163926312e-06,
"loss": 0.0084,
"step": 631
},
{
"epoch": 7.3515625,
"grad_norm": 0.040649283677339554,
"learning_rate": 1.8733861147923298e-06,
"loss": 0.0116,
"step": 632
},
{
"epoch": 7.36328125,
"grad_norm": 0.07037989050149918,
"learning_rate": 1.8573890423112461e-06,
"loss": 0.0086,
"step": 633
},
{
"epoch": 7.375,
"grad_norm": 0.053933579474687576,
"learning_rate": 1.8414449687337467e-06,
"loss": 0.0077,
"step": 634
},
{
"epoch": 7.38671875,
"grad_norm": 0.048108648508787155,
"learning_rate": 1.8255541629503865e-06,
"loss": 0.0097,
"step": 635
},
{
"epoch": 7.3984375,
"grad_norm": 0.058660659939050674,
"learning_rate": 1.8097168929533876e-06,
"loss": 0.0085,
"step": 636
},
{
"epoch": 7.41015625,
"grad_norm": 0.05836372449994087,
"learning_rate": 1.7939334258321094e-06,
"loss": 0.0095,
"step": 637
},
{
"epoch": 7.421875,
"grad_norm": 0.04579600691795349,
"learning_rate": 1.7782040277685436e-06,
"loss": 0.0062,
"step": 638
},
{
"epoch": 7.43359375,
"grad_norm": 0.045197177678346634,
"learning_rate": 1.762528964032832e-06,
"loss": 0.0064,
"step": 639
},
{
"epoch": 7.4453125,
"grad_norm": 0.04665205627679825,
"learning_rate": 1.746908498978791e-06,
"loss": 0.0077,
"step": 640
},
{
"epoch": 7.45703125,
"grad_norm": 0.04843874275684357,
"learning_rate": 1.731342896039444e-06,
"loss": 0.0101,
"step": 641
},
{
"epoch": 7.46875,
"grad_norm": 0.06198574975132942,
"learning_rate": 1.7158324177225948e-06,
"loss": 0.0095,
"step": 642
},
{
"epoch": 7.48046875,
"grad_norm": 0.059951577335596085,
"learning_rate": 1.7003773256063882e-06,
"loss": 0.0069,
"step": 643
},
{
"epoch": 7.4921875,
"grad_norm": 0.062151823192834854,
"learning_rate": 1.684977880334901e-06,
"loss": 0.008,
"step": 644
},
{
"epoch": 7.50390625,
"grad_norm": 0.05667927488684654,
"learning_rate": 1.6696343416137495e-06,
"loss": 0.0088,
"step": 645
},
{
"epoch": 7.515625,
"grad_norm": 0.047324296087026596,
"learning_rate": 1.6543469682057105e-06,
"loss": 0.0078,
"step": 646
},
{
"epoch": 7.52734375,
"grad_norm": 0.05814821645617485,
"learning_rate": 1.6391160179263467e-06,
"loss": 0.0068,
"step": 647
},
{
"epoch": 7.5390625,
"grad_norm": 0.05795707181096077,
"learning_rate": 1.623941747639679e-06,
"loss": 0.0083,
"step": 648
},
{
"epoch": 7.55078125,
"grad_norm": 0.052849121391773224,
"learning_rate": 1.60882441325383e-06,
"loss": 0.0076,
"step": 649
},
{
"epoch": 7.5625,
"grad_norm": 0.07074020057916641,
"learning_rate": 1.5937642697167288e-06,
"loss": 0.0101,
"step": 650
},
{
"epoch": 7.57421875,
"grad_norm": 0.04596783220767975,
"learning_rate": 1.578761571011802e-06,
"loss": 0.0081,
"step": 651
},
{
"epoch": 7.5859375,
"grad_norm": 0.06518401205539703,
"learning_rate": 1.5638165701536866e-06,
"loss": 0.0116,
"step": 652
},
{
"epoch": 7.59765625,
"grad_norm": 0.05404844135046005,
"learning_rate": 1.5489295191839738e-06,
"loss": 0.006,
"step": 653
},
{
"epoch": 7.609375,
"grad_norm": 0.048237938433885574,
"learning_rate": 1.5341006691669513e-06,
"loss": 0.006,
"step": 654
},
{
"epoch": 7.62109375,
"grad_norm": 0.04134788364171982,
"learning_rate": 1.5193302701853674e-06,
"loss": 0.0068,
"step": 655
},
{
"epoch": 7.6328125,
"grad_norm": 0.05492478609085083,
"learning_rate": 1.5046185713362199e-06,
"loss": 0.0076,
"step": 656
},
{
"epoch": 7.64453125,
"grad_norm": 0.05365385860204697,
"learning_rate": 1.489965820726552e-06,
"loss": 0.0074,
"step": 657
},
{
"epoch": 7.65625,
"grad_norm": 0.055742137134075165,
"learning_rate": 1.475372265469265e-06,
"loss": 0.0085,
"step": 658
},
{
"epoch": 7.66796875,
"grad_norm": 0.04848693311214447,
"learning_rate": 1.460838151678955e-06,
"loss": 0.0072,
"step": 659
},
{
"epoch": 7.6796875,
"grad_norm": 0.061075836420059204,
"learning_rate": 1.4463637244677648e-06,
"loss": 0.0082,
"step": 660
},
{
"epoch": 7.69140625,
"grad_norm": 0.05354008078575134,
"learning_rate": 1.4319492279412388e-06,
"loss": 0.0096,
"step": 661
},
{
"epoch": 7.703125,
"grad_norm": 0.05801296979188919,
"learning_rate": 1.4175949051942207e-06,
"loss": 0.0054,
"step": 662
},
{
"epoch": 7.71484375,
"grad_norm": 0.053386397659778595,
"learning_rate": 1.4033009983067454e-06,
"loss": 0.0091,
"step": 663
},
{
"epoch": 7.7265625,
"grad_norm": 0.05271941423416138,
"learning_rate": 1.389067748339954e-06,
"loss": 0.0083,
"step": 664
},
{
"epoch": 7.73828125,
"grad_norm": 0.04694118723273277,
"learning_rate": 1.374895395332037e-06,
"loss": 0.0077,
"step": 665
},
{
"epoch": 7.75,
"grad_norm": 0.05342872068285942,
"learning_rate": 1.360784178294181e-06,
"loss": 0.0065,
"step": 666
},
{
"epoch": 7.76171875,
"grad_norm": 0.04856446385383606,
"learning_rate": 1.3467343352065349e-06,
"loss": 0.0096,
"step": 667
},
{
"epoch": 7.7734375,
"grad_norm": 0.06789608299732208,
"learning_rate": 1.3327461030142037e-06,
"loss": 0.0097,
"step": 668
},
{
"epoch": 7.78515625,
"grad_norm": 0.06705653667449951,
"learning_rate": 1.3188197176232486e-06,
"loss": 0.009,
"step": 669
},
{
"epoch": 7.796875,
"grad_norm": 0.06834590435028076,
"learning_rate": 1.3049554138967052e-06,
"loss": 0.0074,
"step": 670
},
{
"epoch": 7.80859375,
"grad_norm": 0.0560673326253891,
"learning_rate": 1.2911534256506297e-06,
"loss": 0.0073,
"step": 671
},
{
"epoch": 7.8203125,
"grad_norm": 0.041104648262262344,
"learning_rate": 1.27741398565015e-06,
"loss": 0.0095,
"step": 672
},
{
"epoch": 7.83203125,
"grad_norm": 0.06327271461486816,
"learning_rate": 1.2637373256055445e-06,
"loss": 0.0066,
"step": 673
},
{
"epoch": 7.84375,
"grad_norm": 0.07115282863378525,
"learning_rate": 1.2501236761683321e-06,
"loss": 0.0103,
"step": 674
},
{
"epoch": 7.85546875,
"grad_norm": 0.04808028042316437,
"learning_rate": 1.2365732669273778e-06,
"loss": 0.0087,
"step": 675
},
{
"epoch": 7.8671875,
"grad_norm": 0.05055437982082367,
"learning_rate": 1.2230863264050308e-06,
"loss": 0.0069,
"step": 676
},
{
"epoch": 7.87890625,
"grad_norm": 0.054747920483350754,
"learning_rate": 1.2096630820532652e-06,
"loss": 0.0065,
"step": 677
},
{
"epoch": 7.890625,
"grad_norm": 0.05327949672937393,
"learning_rate": 1.1963037602498385e-06,
"loss": 0.0076,
"step": 678
},
{
"epoch": 7.90234375,
"grad_norm": 0.05374129116535187,
"learning_rate": 1.1830085862944851e-06,
"loss": 0.0079,
"step": 679
},
{
"epoch": 7.9140625,
"grad_norm": 0.056705743074417114,
"learning_rate": 1.1697777844051105e-06,
"loss": 0.007,
"step": 680
},
{
"epoch": 7.92578125,
"grad_norm": 0.051028985530138016,
"learning_rate": 1.1566115777140069e-06,
"loss": 0.0088,
"step": 681
},
{
"epoch": 7.9375,
"grad_norm": 0.05366922542452812,
"learning_rate": 1.1435101882640964e-06,
"loss": 0.007,
"step": 682
},
{
"epoch": 7.94921875,
"grad_norm": 0.056716907769441605,
"learning_rate": 1.130473837005186e-06,
"loss": 0.0054,
"step": 683
},
{
"epoch": 7.9609375,
"grad_norm": 0.04551590234041214,
"learning_rate": 1.117502743790233e-06,
"loss": 0.0064,
"step": 684
}
],
"logging_steps": 1,
"max_steps": 850,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 171,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 146333418242048.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}