nemo_nano_math_300k / trainer_state.json
sedrickkeh's picture
End of training
16fe58b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.998573466476462,
"eval_steps": 500,
"global_step": 1095,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00456490727532097,
"grad_norm": 6.303826851202761,
"learning_rate": 7.272727272727273e-07,
"loss": 0.8692,
"step": 1
},
{
"epoch": 0.00912981455064194,
"grad_norm": 6.417436981270692,
"learning_rate": 1.4545454545454546e-06,
"loss": 0.8764,
"step": 2
},
{
"epoch": 0.013694721825962911,
"grad_norm": 6.266046371260386,
"learning_rate": 2.181818181818182e-06,
"loss": 0.8604,
"step": 3
},
{
"epoch": 0.01825962910128388,
"grad_norm": 5.871418428231522,
"learning_rate": 2.9090909090909093e-06,
"loss": 0.8563,
"step": 4
},
{
"epoch": 0.02282453637660485,
"grad_norm": 4.515904039751017,
"learning_rate": 3.6363636363636366e-06,
"loss": 0.8116,
"step": 5
},
{
"epoch": 0.027389443651925822,
"grad_norm": 4.154460736926,
"learning_rate": 4.363636363636364e-06,
"loss": 0.8014,
"step": 6
},
{
"epoch": 0.03195435092724679,
"grad_norm": 2.436881516437234,
"learning_rate": 5.090909090909091e-06,
"loss": 0.7707,
"step": 7
},
{
"epoch": 0.03651925820256776,
"grad_norm": 2.091681221098197,
"learning_rate": 5.8181818181818185e-06,
"loss": 0.7609,
"step": 8
},
{
"epoch": 0.04108416547788873,
"grad_norm": 4.371182375936429,
"learning_rate": 6.545454545454546e-06,
"loss": 0.7714,
"step": 9
},
{
"epoch": 0.0456490727532097,
"grad_norm": 4.419533826001375,
"learning_rate": 7.272727272727273e-06,
"loss": 0.7639,
"step": 10
},
{
"epoch": 0.05021398002853067,
"grad_norm": 4.0789627340324515,
"learning_rate": 8.000000000000001e-06,
"loss": 0.738,
"step": 11
},
{
"epoch": 0.054778887303851644,
"grad_norm": 4.249564440992856,
"learning_rate": 8.727272727272728e-06,
"loss": 0.7147,
"step": 12
},
{
"epoch": 0.05934379457917261,
"grad_norm": 3.376070643486793,
"learning_rate": 9.454545454545456e-06,
"loss": 0.6984,
"step": 13
},
{
"epoch": 0.06390870185449359,
"grad_norm": 2.084910644051683,
"learning_rate": 1.0181818181818182e-05,
"loss": 0.6752,
"step": 14
},
{
"epoch": 0.06847360912981455,
"grad_norm": 1.7829575333835848,
"learning_rate": 1.0909090909090909e-05,
"loss": 0.6564,
"step": 15
},
{
"epoch": 0.07303851640513552,
"grad_norm": 2.3763989053887546,
"learning_rate": 1.1636363636363637e-05,
"loss": 0.6595,
"step": 16
},
{
"epoch": 0.07760342368045649,
"grad_norm": 1.8994179663807598,
"learning_rate": 1.2363636363636364e-05,
"loss": 0.6332,
"step": 17
},
{
"epoch": 0.08216833095577745,
"grad_norm": 1.0918180057614881,
"learning_rate": 1.3090909090909092e-05,
"loss": 0.6275,
"step": 18
},
{
"epoch": 0.08673323823109844,
"grad_norm": 1.162384665458545,
"learning_rate": 1.381818181818182e-05,
"loss": 0.617,
"step": 19
},
{
"epoch": 0.0912981455064194,
"grad_norm": 1.1197113251451245,
"learning_rate": 1.4545454545454546e-05,
"loss": 0.6102,
"step": 20
},
{
"epoch": 0.09586305278174037,
"grad_norm": 0.7298206551544844,
"learning_rate": 1.5272727272727276e-05,
"loss": 0.6001,
"step": 21
},
{
"epoch": 0.10042796005706134,
"grad_norm": 0.8218899465689595,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.5908,
"step": 22
},
{
"epoch": 0.1049928673323823,
"grad_norm": 0.7081964845957092,
"learning_rate": 1.672727272727273e-05,
"loss": 0.5857,
"step": 23
},
{
"epoch": 0.10955777460770329,
"grad_norm": 0.7152113888700518,
"learning_rate": 1.7454545454545456e-05,
"loss": 0.5813,
"step": 24
},
{
"epoch": 0.11412268188302425,
"grad_norm": 0.5908192981103763,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.5654,
"step": 25
},
{
"epoch": 0.11868758915834522,
"grad_norm": 0.5657878890257612,
"learning_rate": 1.8909090909090912e-05,
"loss": 0.5681,
"step": 26
},
{
"epoch": 0.12325249643366619,
"grad_norm": 0.5407715119275086,
"learning_rate": 1.963636363636364e-05,
"loss": 0.5726,
"step": 27
},
{
"epoch": 0.12781740370898717,
"grad_norm": 0.6238305162701853,
"learning_rate": 2.0363636363636365e-05,
"loss": 0.5707,
"step": 28
},
{
"epoch": 0.13238231098430814,
"grad_norm": 0.7048129462669643,
"learning_rate": 2.109090909090909e-05,
"loss": 0.562,
"step": 29
},
{
"epoch": 0.1369472182596291,
"grad_norm": 0.4792558557034262,
"learning_rate": 2.1818181818181818e-05,
"loss": 0.5622,
"step": 30
},
{
"epoch": 0.14151212553495007,
"grad_norm": 0.648556738189594,
"learning_rate": 2.2545454545454544e-05,
"loss": 0.5556,
"step": 31
},
{
"epoch": 0.14607703281027104,
"grad_norm": 0.5375842492193321,
"learning_rate": 2.3272727272727274e-05,
"loss": 0.5521,
"step": 32
},
{
"epoch": 0.150641940085592,
"grad_norm": 0.6204936152045187,
"learning_rate": 2.4e-05,
"loss": 0.5521,
"step": 33
},
{
"epoch": 0.15520684736091298,
"grad_norm": 0.7793633373804746,
"learning_rate": 2.4727272727272727e-05,
"loss": 0.5556,
"step": 34
},
{
"epoch": 0.15977175463623394,
"grad_norm": 1.077779426955439,
"learning_rate": 2.5454545454545457e-05,
"loss": 0.5439,
"step": 35
},
{
"epoch": 0.1643366619115549,
"grad_norm": 0.8548536747868706,
"learning_rate": 2.6181818181818183e-05,
"loss": 0.5399,
"step": 36
},
{
"epoch": 0.1689015691868759,
"grad_norm": 0.6156323737152537,
"learning_rate": 2.690909090909091e-05,
"loss": 0.534,
"step": 37
},
{
"epoch": 0.17346647646219687,
"grad_norm": 0.7996160287459234,
"learning_rate": 2.763636363636364e-05,
"loss": 0.5432,
"step": 38
},
{
"epoch": 0.17803138373751784,
"grad_norm": 0.8822172922338606,
"learning_rate": 2.8363636363636366e-05,
"loss": 0.538,
"step": 39
},
{
"epoch": 0.1825962910128388,
"grad_norm": 0.7341791467955449,
"learning_rate": 2.9090909090909093e-05,
"loss": 0.5311,
"step": 40
},
{
"epoch": 0.18716119828815977,
"grad_norm": 0.7448434985433166,
"learning_rate": 2.9818181818181823e-05,
"loss": 0.5302,
"step": 41
},
{
"epoch": 0.19172610556348074,
"grad_norm": 1.3134209857981531,
"learning_rate": 3.054545454545455e-05,
"loss": 0.5287,
"step": 42
},
{
"epoch": 0.1962910128388017,
"grad_norm": 1.4079955673671256,
"learning_rate": 3.127272727272728e-05,
"loss": 0.529,
"step": 43
},
{
"epoch": 0.20085592011412268,
"grad_norm": 0.7806435793966361,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.5211,
"step": 44
},
{
"epoch": 0.20542082738944364,
"grad_norm": 1.696793688392228,
"learning_rate": 3.272727272727273e-05,
"loss": 0.5286,
"step": 45
},
{
"epoch": 0.2099857346647646,
"grad_norm": 0.9163370159272217,
"learning_rate": 3.345454545454546e-05,
"loss": 0.5251,
"step": 46
},
{
"epoch": 0.21455064194008558,
"grad_norm": 1.6162977222772477,
"learning_rate": 3.4181818181818185e-05,
"loss": 0.5307,
"step": 47
},
{
"epoch": 0.21911554921540657,
"grad_norm": 0.8813157838119612,
"learning_rate": 3.490909090909091e-05,
"loss": 0.5119,
"step": 48
},
{
"epoch": 0.22368045649072754,
"grad_norm": 1.6994696349279637,
"learning_rate": 3.563636363636364e-05,
"loss": 0.522,
"step": 49
},
{
"epoch": 0.2282453637660485,
"grad_norm": 1.1550073074270106,
"learning_rate": 3.6363636363636364e-05,
"loss": 0.5185,
"step": 50
},
{
"epoch": 0.23281027104136948,
"grad_norm": 2.273035117030142,
"learning_rate": 3.709090909090909e-05,
"loss": 0.5178,
"step": 51
},
{
"epoch": 0.23737517831669044,
"grad_norm": 2.142670073640285,
"learning_rate": 3.7818181818181824e-05,
"loss": 0.5164,
"step": 52
},
{
"epoch": 0.2419400855920114,
"grad_norm": 1.178743753159302,
"learning_rate": 3.854545454545455e-05,
"loss": 0.5161,
"step": 53
},
{
"epoch": 0.24650499286733238,
"grad_norm": 2.0029521626283864,
"learning_rate": 3.927272727272728e-05,
"loss": 0.5142,
"step": 54
},
{
"epoch": 0.25106990014265335,
"grad_norm": 1.2261785948783424,
"learning_rate": 4e-05,
"loss": 0.5097,
"step": 55
},
{
"epoch": 0.25563480741797434,
"grad_norm": 1.9929717547816588,
"learning_rate": 4.072727272727273e-05,
"loss": 0.52,
"step": 56
},
{
"epoch": 0.2601997146932953,
"grad_norm": 1.6292265517873907,
"learning_rate": 4.1454545454545456e-05,
"loss": 0.5214,
"step": 57
},
{
"epoch": 0.2647646219686163,
"grad_norm": 1.3334399259774528,
"learning_rate": 4.218181818181818e-05,
"loss": 0.5131,
"step": 58
},
{
"epoch": 0.2693295292439372,
"grad_norm": 1.7895060448687017,
"learning_rate": 4.2909090909090916e-05,
"loss": 0.5156,
"step": 59
},
{
"epoch": 0.2738944365192582,
"grad_norm": 1.2434711772815448,
"learning_rate": 4.3636363636363636e-05,
"loss": 0.5063,
"step": 60
},
{
"epoch": 0.27845934379457915,
"grad_norm": 1.674712591320644,
"learning_rate": 4.436363636363637e-05,
"loss": 0.5058,
"step": 61
},
{
"epoch": 0.28302425106990015,
"grad_norm": 1.7391744678652716,
"learning_rate": 4.509090909090909e-05,
"loss": 0.5169,
"step": 62
},
{
"epoch": 0.2875891583452211,
"grad_norm": 1.2704765036397798,
"learning_rate": 4.581818181818182e-05,
"loss": 0.5077,
"step": 63
},
{
"epoch": 0.2921540656205421,
"grad_norm": 1.7222012804003408,
"learning_rate": 4.654545454545455e-05,
"loss": 0.5137,
"step": 64
},
{
"epoch": 0.2967189728958631,
"grad_norm": 1.293937134094515,
"learning_rate": 4.727272727272728e-05,
"loss": 0.5066,
"step": 65
},
{
"epoch": 0.301283880171184,
"grad_norm": 1.1101483554895153,
"learning_rate": 4.8e-05,
"loss": 0.5012,
"step": 66
},
{
"epoch": 0.305848787446505,
"grad_norm": 1.2651179859008774,
"learning_rate": 4.8727272727272734e-05,
"loss": 0.4996,
"step": 67
},
{
"epoch": 0.31041369472182595,
"grad_norm": 1.4372242319377802,
"learning_rate": 4.9454545454545454e-05,
"loss": 0.5048,
"step": 68
},
{
"epoch": 0.31497860199714695,
"grad_norm": 1.1173991577334563,
"learning_rate": 5.018181818181819e-05,
"loss": 0.5005,
"step": 69
},
{
"epoch": 0.3195435092724679,
"grad_norm": 1.0219934142555631,
"learning_rate": 5.0909090909090914e-05,
"loss": 0.5011,
"step": 70
},
{
"epoch": 0.3241084165477889,
"grad_norm": 1.7032035010169793,
"learning_rate": 5.163636363636365e-05,
"loss": 0.5094,
"step": 71
},
{
"epoch": 0.3286733238231098,
"grad_norm": 0.8364496496396441,
"learning_rate": 5.236363636363637e-05,
"loss": 0.4963,
"step": 72
},
{
"epoch": 0.3332382310984308,
"grad_norm": 1.3858540890091062,
"learning_rate": 5.30909090909091e-05,
"loss": 0.5079,
"step": 73
},
{
"epoch": 0.3378031383737518,
"grad_norm": 1.3469249650056214,
"learning_rate": 5.381818181818182e-05,
"loss": 0.5094,
"step": 74
},
{
"epoch": 0.34236804564907275,
"grad_norm": 2.3532033300244266,
"learning_rate": 5.4545454545454546e-05,
"loss": 0.5101,
"step": 75
},
{
"epoch": 0.34693295292439374,
"grad_norm": 1.2379724037013697,
"learning_rate": 5.527272727272728e-05,
"loss": 0.5009,
"step": 76
},
{
"epoch": 0.3514978601997147,
"grad_norm": 2.1483188283998365,
"learning_rate": 5.6e-05,
"loss": 0.5185,
"step": 77
},
{
"epoch": 0.3560627674750357,
"grad_norm": 1.6081458329954548,
"learning_rate": 5.672727272727273e-05,
"loss": 0.5033,
"step": 78
},
{
"epoch": 0.3606276747503566,
"grad_norm": 1.379511883135697,
"learning_rate": 5.745454545454546e-05,
"loss": 0.5038,
"step": 79
},
{
"epoch": 0.3651925820256776,
"grad_norm": 2.314662935076633,
"learning_rate": 5.8181818181818185e-05,
"loss": 0.4969,
"step": 80
},
{
"epoch": 0.36975748930099855,
"grad_norm": 1.3583834941356125,
"learning_rate": 5.890909090909091e-05,
"loss": 0.5045,
"step": 81
},
{
"epoch": 0.37432239657631955,
"grad_norm": 2.6348157549287157,
"learning_rate": 5.9636363636363645e-05,
"loss": 0.5058,
"step": 82
},
{
"epoch": 0.3788873038516405,
"grad_norm": 1.944213494123695,
"learning_rate": 6.0363636363636365e-05,
"loss": 0.5064,
"step": 83
},
{
"epoch": 0.3834522111269615,
"grad_norm": 2.0575793444325194,
"learning_rate": 6.10909090909091e-05,
"loss": 0.5018,
"step": 84
},
{
"epoch": 0.3880171184022825,
"grad_norm": 1.978062761509244,
"learning_rate": 6.181818181818182e-05,
"loss": 0.5017,
"step": 85
},
{
"epoch": 0.3925820256776034,
"grad_norm": 1.4801623619535438,
"learning_rate": 6.254545454545456e-05,
"loss": 0.498,
"step": 86
},
{
"epoch": 0.3971469329529244,
"grad_norm": 1.8537084794208918,
"learning_rate": 6.327272727272727e-05,
"loss": 0.5019,
"step": 87
},
{
"epoch": 0.40171184022824535,
"grad_norm": 1.3702877983773376,
"learning_rate": 6.400000000000001e-05,
"loss": 0.4968,
"step": 88
},
{
"epoch": 0.40627674750356635,
"grad_norm": 1.885572104808451,
"learning_rate": 6.472727272727274e-05,
"loss": 0.4957,
"step": 89
},
{
"epoch": 0.4108416547788873,
"grad_norm": 1.3542328558823338,
"learning_rate": 6.545454545454546e-05,
"loss": 0.4966,
"step": 90
},
{
"epoch": 0.4154065620542083,
"grad_norm": 1.7718230138924214,
"learning_rate": 6.618181818181819e-05,
"loss": 0.4966,
"step": 91
},
{
"epoch": 0.4199714693295292,
"grad_norm": 1.3980578794491678,
"learning_rate": 6.690909090909092e-05,
"loss": 0.4925,
"step": 92
},
{
"epoch": 0.4245363766048502,
"grad_norm": 1.5334202672866126,
"learning_rate": 6.763636363636364e-05,
"loss": 0.4876,
"step": 93
},
{
"epoch": 0.42910128388017116,
"grad_norm": 1.4720834156887759,
"learning_rate": 6.836363636363637e-05,
"loss": 0.4913,
"step": 94
},
{
"epoch": 0.43366619115549215,
"grad_norm": 1.338014419694779,
"learning_rate": 6.90909090909091e-05,
"loss": 0.4903,
"step": 95
},
{
"epoch": 0.43823109843081315,
"grad_norm": 1.0761852253986315,
"learning_rate": 6.981818181818182e-05,
"loss": 0.4912,
"step": 96
},
{
"epoch": 0.4427960057061341,
"grad_norm": 1.6024764467654846,
"learning_rate": 7.054545454545455e-05,
"loss": 0.4901,
"step": 97
},
{
"epoch": 0.4473609129814551,
"grad_norm": 1.492316053791499,
"learning_rate": 7.127272727272728e-05,
"loss": 0.4873,
"step": 98
},
{
"epoch": 0.451925820256776,
"grad_norm": 1.525604026590939,
"learning_rate": 7.2e-05,
"loss": 0.4891,
"step": 99
},
{
"epoch": 0.456490727532097,
"grad_norm": 1.3034542842679084,
"learning_rate": 7.272727272727273e-05,
"loss": 0.49,
"step": 100
},
{
"epoch": 0.46105563480741796,
"grad_norm": 2.1642815195092666,
"learning_rate": 7.345454545454547e-05,
"loss": 0.5055,
"step": 101
},
{
"epoch": 0.46562054208273895,
"grad_norm": 1.1678257677428678,
"learning_rate": 7.418181818181818e-05,
"loss": 0.4852,
"step": 102
},
{
"epoch": 0.4701854493580599,
"grad_norm": 1.4078725478189906,
"learning_rate": 7.490909090909092e-05,
"loss": 0.4919,
"step": 103
},
{
"epoch": 0.4747503566333809,
"grad_norm": 2.357567405283945,
"learning_rate": 7.563636363636365e-05,
"loss": 0.4968,
"step": 104
},
{
"epoch": 0.4793152639087018,
"grad_norm": 1.4758593059891392,
"learning_rate": 7.636363636363637e-05,
"loss": 0.4874,
"step": 105
},
{
"epoch": 0.4838801711840228,
"grad_norm": 1.980018157376651,
"learning_rate": 7.70909090909091e-05,
"loss": 0.4953,
"step": 106
},
{
"epoch": 0.4884450784593438,
"grad_norm": 2.0401864619014467,
"learning_rate": 7.781818181818183e-05,
"loss": 0.5006,
"step": 107
},
{
"epoch": 0.49300998573466476,
"grad_norm": 1.0897659104208783,
"learning_rate": 7.854545454545455e-05,
"loss": 0.4871,
"step": 108
},
{
"epoch": 0.49757489300998575,
"grad_norm": 1.8417886476775482,
"learning_rate": 7.927272727272728e-05,
"loss": 0.4975,
"step": 109
},
{
"epoch": 0.5021398002853067,
"grad_norm": 1.2728786506557457,
"learning_rate": 8e-05,
"loss": 0.492,
"step": 110
},
{
"epoch": 0.5067047075606277,
"grad_norm": 2.4453848065966817,
"learning_rate": 7.999979655036647e-05,
"loss": 0.5094,
"step": 111
},
{
"epoch": 0.5112696148359487,
"grad_norm": 1.6374824473909455,
"learning_rate": 7.999918620353548e-05,
"loss": 0.4931,
"step": 112
},
{
"epoch": 0.5158345221112696,
"grad_norm": 1.8705264691341723,
"learning_rate": 7.999816896571574e-05,
"loss": 0.5051,
"step": 113
},
{
"epoch": 0.5203994293865906,
"grad_norm": 1.6343167214375214,
"learning_rate": 7.999674484725512e-05,
"loss": 0.4984,
"step": 114
},
{
"epoch": 0.5249643366619116,
"grad_norm": 1.9112883595327201,
"learning_rate": 7.999491386264042e-05,
"loss": 0.492,
"step": 115
},
{
"epoch": 0.5295292439372326,
"grad_norm": 1.35541163706321,
"learning_rate": 7.999267603049729e-05,
"loss": 0.4902,
"step": 116
},
{
"epoch": 0.5340941512125535,
"grad_norm": 1.6103620556897125,
"learning_rate": 7.999003137359006e-05,
"loss": 0.4927,
"step": 117
},
{
"epoch": 0.5386590584878744,
"grad_norm": 1.4810569251620247,
"learning_rate": 7.998697991882144e-05,
"loss": 0.4876,
"step": 118
},
{
"epoch": 0.5432239657631954,
"grad_norm": 1.3665267966836736,
"learning_rate": 7.998352169723229e-05,
"loss": 0.4865,
"step": 119
},
{
"epoch": 0.5477888730385164,
"grad_norm": 1.1684492343192325,
"learning_rate": 7.997965674400132e-05,
"loss": 0.4898,
"step": 120
},
{
"epoch": 0.5523537803138374,
"grad_norm": 1.6135417427578114,
"learning_rate": 7.997538509844469e-05,
"loss": 0.4884,
"step": 121
},
{
"epoch": 0.5569186875891583,
"grad_norm": 1.081634967694094,
"learning_rate": 7.997070680401562e-05,
"loss": 0.4814,
"step": 122
},
{
"epoch": 0.5614835948644793,
"grad_norm": 1.323277108291229,
"learning_rate": 7.9965621908304e-05,
"loss": 0.4862,
"step": 123
},
{
"epoch": 0.5660485021398003,
"grad_norm": 1.5122505161166728,
"learning_rate": 7.996013046303583e-05,
"loss": 0.4907,
"step": 124
},
{
"epoch": 0.5706134094151213,
"grad_norm": 1.3517400921660416,
"learning_rate": 7.995423252407275e-05,
"loss": 0.4849,
"step": 125
},
{
"epoch": 0.5751783166904422,
"grad_norm": 1.2964472361111674,
"learning_rate": 7.99479281514114e-05,
"loss": 0.4829,
"step": 126
},
{
"epoch": 0.5797432239657632,
"grad_norm": 1.441528044357245,
"learning_rate": 7.994121740918293e-05,
"loss": 0.4888,
"step": 127
},
{
"epoch": 0.5843081312410842,
"grad_norm": 1.2409245733342162,
"learning_rate": 7.993410036565223e-05,
"loss": 0.4776,
"step": 128
},
{
"epoch": 0.5888730385164052,
"grad_norm": 1.255055709641733,
"learning_rate": 7.992657709321728e-05,
"loss": 0.4856,
"step": 129
},
{
"epoch": 0.5934379457917262,
"grad_norm": 1.2617494433472416,
"learning_rate": 7.991864766840846e-05,
"loss": 0.4832,
"step": 130
},
{
"epoch": 0.598002853067047,
"grad_norm": 1.3678248560826736,
"learning_rate": 7.991031217188769e-05,
"loss": 0.483,
"step": 131
},
{
"epoch": 0.602567760342368,
"grad_norm": 1.5165238137397685,
"learning_rate": 7.990157068844764e-05,
"loss": 0.4762,
"step": 132
},
{
"epoch": 0.607132667617689,
"grad_norm": 1.0122655955764246,
"learning_rate": 7.989242330701089e-05,
"loss": 0.4794,
"step": 133
},
{
"epoch": 0.61169757489301,
"grad_norm": 1.4882369944794684,
"learning_rate": 7.988287012062902e-05,
"loss": 0.4772,
"step": 134
},
{
"epoch": 0.6162624821683309,
"grad_norm": 0.9815809304445058,
"learning_rate": 7.987291122648165e-05,
"loss": 0.4844,
"step": 135
},
{
"epoch": 0.6208273894436519,
"grad_norm": 1.8345734791985764,
"learning_rate": 7.986254672587544e-05,
"loss": 0.4872,
"step": 136
},
{
"epoch": 0.6253922967189729,
"grad_norm": 1.2322860510435374,
"learning_rate": 7.985177672424309e-05,
"loss": 0.4742,
"step": 137
},
{
"epoch": 0.6299572039942939,
"grad_norm": 1.0934283610887234,
"learning_rate": 7.984060133114222e-05,
"loss": 0.4828,
"step": 138
},
{
"epoch": 0.6345221112696149,
"grad_norm": 1.0617428680491003,
"learning_rate": 7.982902066025433e-05,
"loss": 0.4841,
"step": 139
},
{
"epoch": 0.6390870185449358,
"grad_norm": 1.4504986628664012,
"learning_rate": 7.981703482938361e-05,
"loss": 0.4765,
"step": 140
},
{
"epoch": 0.6436519258202568,
"grad_norm": 1.2196737951896366,
"learning_rate": 7.980464396045565e-05,
"loss": 0.48,
"step": 141
},
{
"epoch": 0.6482168330955778,
"grad_norm": 1.1746117544945598,
"learning_rate": 7.979184817951638e-05,
"loss": 0.472,
"step": 142
},
{
"epoch": 0.6527817403708988,
"grad_norm": 1.462647851373356,
"learning_rate": 7.977864761673062e-05,
"loss": 0.4819,
"step": 143
},
{
"epoch": 0.6573466476462196,
"grad_norm": 1.0920910565051827,
"learning_rate": 7.976504240638088e-05,
"loss": 0.4759,
"step": 144
},
{
"epoch": 0.6619115549215406,
"grad_norm": 0.8915983200577166,
"learning_rate": 7.975103268686587e-05,
"loss": 0.4708,
"step": 145
},
{
"epoch": 0.6664764621968616,
"grad_norm": 1.1195632191410694,
"learning_rate": 7.973661860069925e-05,
"loss": 0.481,
"step": 146
},
{
"epoch": 0.6710413694721826,
"grad_norm": 0.9280898217043119,
"learning_rate": 7.972180029450804e-05,
"loss": 0.4771,
"step": 147
},
{
"epoch": 0.6756062767475036,
"grad_norm": 1.2010937440977014,
"learning_rate": 7.970657791903115e-05,
"loss": 0.4766,
"step": 148
},
{
"epoch": 0.6801711840228245,
"grad_norm": 1.1648502616511074,
"learning_rate": 7.969095162911796e-05,
"loss": 0.4765,
"step": 149
},
{
"epoch": 0.6847360912981455,
"grad_norm": 1.629472140008341,
"learning_rate": 7.967492158372659e-05,
"loss": 0.4708,
"step": 150
},
{
"epoch": 0.6893009985734665,
"grad_norm": 0.7685771258524148,
"learning_rate": 7.965848794592241e-05,
"loss": 0.4702,
"step": 151
},
{
"epoch": 0.6938659058487875,
"grad_norm": 1.2979818888222598,
"learning_rate": 7.964165088287627e-05,
"loss": 0.4756,
"step": 152
},
{
"epoch": 0.6984308131241084,
"grad_norm": 1.479596160746902,
"learning_rate": 7.96244105658629e-05,
"loss": 0.4684,
"step": 153
},
{
"epoch": 0.7029957203994294,
"grad_norm": 0.9533216688157553,
"learning_rate": 7.960676717025912e-05,
"loss": 0.4674,
"step": 154
},
{
"epoch": 0.7075606276747504,
"grad_norm": 1.1144585740648676,
"learning_rate": 7.958872087554204e-05,
"loss": 0.4759,
"step": 155
},
{
"epoch": 0.7121255349500714,
"grad_norm": 1.1392667564805274,
"learning_rate": 7.957027186528724e-05,
"loss": 0.4792,
"step": 156
},
{
"epoch": 0.7166904422253922,
"grad_norm": 1.576213566462258,
"learning_rate": 7.955142032716696e-05,
"loss": 0.4711,
"step": 157
},
{
"epoch": 0.7212553495007132,
"grad_norm": 1.0153629340656216,
"learning_rate": 7.953216645294813e-05,
"loss": 0.4785,
"step": 158
},
{
"epoch": 0.7258202567760342,
"grad_norm": 1.178946056258982,
"learning_rate": 7.951251043849043e-05,
"loss": 0.4703,
"step": 159
},
{
"epoch": 0.7303851640513552,
"grad_norm": 1.2039627482563626,
"learning_rate": 7.94924524837443e-05,
"loss": 0.4691,
"step": 160
},
{
"epoch": 0.7349500713266762,
"grad_norm": 1.2758536553065434,
"learning_rate": 7.947199279274892e-05,
"loss": 0.4719,
"step": 161
},
{
"epoch": 0.7395149786019971,
"grad_norm": 0.7199695023812126,
"learning_rate": 7.945113157363012e-05,
"loss": 0.4705,
"step": 162
},
{
"epoch": 0.7440798858773181,
"grad_norm": 1.187228986720961,
"learning_rate": 7.942986903859826e-05,
"loss": 0.476,
"step": 163
},
{
"epoch": 0.7486447931526391,
"grad_norm": 1.2191468243222228,
"learning_rate": 7.940820540394611e-05,
"loss": 0.4685,
"step": 164
},
{
"epoch": 0.7532097004279601,
"grad_norm": 1.763163381472643,
"learning_rate": 7.938614089004659e-05,
"loss": 0.4634,
"step": 165
},
{
"epoch": 0.757774607703281,
"grad_norm": 0.7821277377021038,
"learning_rate": 7.936367572135056e-05,
"loss": 0.4741,
"step": 166
},
{
"epoch": 0.762339514978602,
"grad_norm": 2.297761933831456,
"learning_rate": 7.934081012638452e-05,
"loss": 0.4801,
"step": 167
},
{
"epoch": 0.766904422253923,
"grad_norm": 1.3332466148839683,
"learning_rate": 7.931754433774835e-05,
"loss": 0.4753,
"step": 168
},
{
"epoch": 0.771469329529244,
"grad_norm": 2.149549037927675,
"learning_rate": 7.929387859211283e-05,
"loss": 0.4864,
"step": 169
},
{
"epoch": 0.776034236804565,
"grad_norm": 1.5521858392131513,
"learning_rate": 7.926981313021734e-05,
"loss": 0.4845,
"step": 170
},
{
"epoch": 0.7805991440798858,
"grad_norm": 1.5726199718313127,
"learning_rate": 7.924534819686735e-05,
"loss": 0.4807,
"step": 171
},
{
"epoch": 0.7851640513552068,
"grad_norm": 1.25504183274313,
"learning_rate": 7.922048404093193e-05,
"loss": 0.4875,
"step": 172
},
{
"epoch": 0.7897289586305278,
"grad_norm": 0.9918446302267643,
"learning_rate": 7.919522091534125e-05,
"loss": 0.4751,
"step": 173
},
{
"epoch": 0.7942938659058488,
"grad_norm": 1.2133961064200827,
"learning_rate": 7.916955907708403e-05,
"loss": 0.4751,
"step": 174
},
{
"epoch": 0.7988587731811697,
"grad_norm": 1.0816089763802976,
"learning_rate": 7.91434987872048e-05,
"loss": 0.467,
"step": 175
},
{
"epoch": 0.8034236804564907,
"grad_norm": 0.8971343840889208,
"learning_rate": 7.911704031080142e-05,
"loss": 0.4734,
"step": 176
},
{
"epoch": 0.8079885877318117,
"grad_norm": 0.9356730941151444,
"learning_rate": 7.909018391702224e-05,
"loss": 0.47,
"step": 177
},
{
"epoch": 0.8125534950071327,
"grad_norm": 0.7910454206167575,
"learning_rate": 7.906292987906343e-05,
"loss": 0.4683,
"step": 178
},
{
"epoch": 0.8171184022824537,
"grad_norm": 1.192520435930123,
"learning_rate": 7.90352784741662e-05,
"loss": 0.4683,
"step": 179
},
{
"epoch": 0.8216833095577746,
"grad_norm": 1.0963304160109035,
"learning_rate": 7.900722998361394e-05,
"loss": 0.4667,
"step": 180
},
{
"epoch": 0.8262482168330956,
"grad_norm": 1.1053490528947663,
"learning_rate": 7.897878469272943e-05,
"loss": 0.472,
"step": 181
},
{
"epoch": 0.8308131241084166,
"grad_norm": 0.8848774693139051,
"learning_rate": 7.894994289087187e-05,
"loss": 0.4628,
"step": 182
},
{
"epoch": 0.8353780313837376,
"grad_norm": 1.115946428081902,
"learning_rate": 7.892070487143395e-05,
"loss": 0.4621,
"step": 183
},
{
"epoch": 0.8399429386590584,
"grad_norm": 1.0565554494854816,
"learning_rate": 7.88910709318389e-05,
"loss": 0.4665,
"step": 184
},
{
"epoch": 0.8445078459343794,
"grad_norm": 1.2557316528836924,
"learning_rate": 7.88610413735374e-05,
"loss": 0.4629,
"step": 185
},
{
"epoch": 0.8490727532097004,
"grad_norm": 0.9500438718947202,
"learning_rate": 7.883061650200459e-05,
"loss": 0.4671,
"step": 186
},
{
"epoch": 0.8536376604850214,
"grad_norm": 1.0129984995159482,
"learning_rate": 7.879979662673695e-05,
"loss": 0.4613,
"step": 187
},
{
"epoch": 0.8582025677603423,
"grad_norm": 1.2473784057361619,
"learning_rate": 7.876858206124907e-05,
"loss": 0.4697,
"step": 188
},
{
"epoch": 0.8627674750356633,
"grad_norm": 0.8979338734910455,
"learning_rate": 7.873697312307054e-05,
"loss": 0.4696,
"step": 189
},
{
"epoch": 0.8673323823109843,
"grad_norm": 0.7812764034273535,
"learning_rate": 7.870497013374272e-05,
"loss": 0.4639,
"step": 190
},
{
"epoch": 0.8718972895863053,
"grad_norm": 1.012860566654511,
"learning_rate": 7.867257341881542e-05,
"loss": 0.4653,
"step": 191
},
{
"epoch": 0.8764621968616263,
"grad_norm": 1.4227576277179481,
"learning_rate": 7.863978330784364e-05,
"loss": 0.4675,
"step": 192
},
{
"epoch": 0.8810271041369472,
"grad_norm": 0.7511911038781995,
"learning_rate": 7.860660013438418e-05,
"loss": 0.4602,
"step": 193
},
{
"epoch": 0.8855920114122682,
"grad_norm": 1.0008483459158608,
"learning_rate": 7.857302423599225e-05,
"loss": 0.4642,
"step": 194
},
{
"epoch": 0.8901569186875892,
"grad_norm": 1.3635357598280822,
"learning_rate": 7.853905595421808e-05,
"loss": 0.4718,
"step": 195
},
{
"epoch": 0.8947218259629102,
"grad_norm": 1.1150832530124606,
"learning_rate": 7.850469563460339e-05,
"loss": 0.4697,
"step": 196
},
{
"epoch": 0.899286733238231,
"grad_norm": 1.306691006655152,
"learning_rate": 7.84699436266779e-05,
"loss": 0.4645,
"step": 197
},
{
"epoch": 0.903851640513552,
"grad_norm": 0.5646785863195107,
"learning_rate": 7.843480028395578e-05,
"loss": 0.4598,
"step": 198
},
{
"epoch": 0.908416547788873,
"grad_norm": 1.416929427327476,
"learning_rate": 7.839926596393202e-05,
"loss": 0.4613,
"step": 199
},
{
"epoch": 0.912981455064194,
"grad_norm": 0.6815794459118021,
"learning_rate": 7.836334102807886e-05,
"loss": 0.4552,
"step": 200
},
{
"epoch": 0.917546362339515,
"grad_norm": 1.3377535332836705,
"learning_rate": 7.832702584184204e-05,
"loss": 0.4609,
"step": 201
},
{
"epoch": 0.9221112696148359,
"grad_norm": 0.7289345910500463,
"learning_rate": 7.829032077463713e-05,
"loss": 0.4614,
"step": 202
},
{
"epoch": 0.9266761768901569,
"grad_norm": 0.7256574283679788,
"learning_rate": 7.825322619984576e-05,
"loss": 0.4583,
"step": 203
},
{
"epoch": 0.9312410841654779,
"grad_norm": 0.7702106659855981,
"learning_rate": 7.821574249481179e-05,
"loss": 0.4568,
"step": 204
},
{
"epoch": 0.9358059914407989,
"grad_norm": 0.9102238446313804,
"learning_rate": 7.817787004083756e-05,
"loss": 0.4586,
"step": 205
},
{
"epoch": 0.9403708987161198,
"grad_norm": 1.7972329933237008,
"learning_rate": 7.813960922317988e-05,
"loss": 0.4604,
"step": 206
},
{
"epoch": 0.9449358059914408,
"grad_norm": 0.6227351510323046,
"learning_rate": 7.810096043104623e-05,
"loss": 0.4622,
"step": 207
},
{
"epoch": 0.9495007132667618,
"grad_norm": 1.7735653279890655,
"learning_rate": 7.806192405759074e-05,
"loss": 0.4649,
"step": 208
},
{
"epoch": 0.9540656205420828,
"grad_norm": 1.0308472520464786,
"learning_rate": 7.80225004999102e-05,
"loss": 0.4664,
"step": 209
},
{
"epoch": 0.9586305278174037,
"grad_norm": 0.8116494461929854,
"learning_rate": 7.798269015904004e-05,
"loss": 0.4617,
"step": 210
},
{
"epoch": 0.9631954350927246,
"grad_norm": 1.047955603167114,
"learning_rate": 7.79424934399502e-05,
"loss": 0.4614,
"step": 211
},
{
"epoch": 0.9677603423680456,
"grad_norm": 1.5898132874597775,
"learning_rate": 7.790191075154109e-05,
"loss": 0.4607,
"step": 212
},
{
"epoch": 0.9723252496433666,
"grad_norm": 0.9545090947993602,
"learning_rate": 7.786094250663936e-05,
"loss": 0.4652,
"step": 213
},
{
"epoch": 0.9768901569186876,
"grad_norm": 1.6571396240037084,
"learning_rate": 7.781958912199372e-05,
"loss": 0.4618,
"step": 214
},
{
"epoch": 0.9814550641940085,
"grad_norm": 1.2024242198019108,
"learning_rate": 7.777785101827073e-05,
"loss": 0.4662,
"step": 215
},
{
"epoch": 0.9860199714693295,
"grad_norm": 1.1021742864631379,
"learning_rate": 7.773572862005048e-05,
"loss": 0.4573,
"step": 216
},
{
"epoch": 0.9905848787446505,
"grad_norm": 1.199436776468982,
"learning_rate": 7.76932223558223e-05,
"loss": 0.461,
"step": 217
},
{
"epoch": 0.9951497860199715,
"grad_norm": 1.063128318792733,
"learning_rate": 7.765033265798038e-05,
"loss": 0.455,
"step": 218
},
{
"epoch": 0.9997146932952924,
"grad_norm": 0.6606867422821489,
"learning_rate": 7.760705996281937e-05,
"loss": 0.4591,
"step": 219
},
{
"epoch": 1.0042796005706134,
"grad_norm": 1.8960244617895399,
"learning_rate": 7.756340471052998e-05,
"loss": 0.8785,
"step": 220
},
{
"epoch": 1.0088445078459345,
"grad_norm": 1.4080967185223767,
"learning_rate": 7.751936734519448e-05,
"loss": 0.4574,
"step": 221
},
{
"epoch": 1.0134094151212554,
"grad_norm": 0.6771897541703611,
"learning_rate": 7.747494831478214e-05,
"loss": 0.4431,
"step": 222
},
{
"epoch": 1.0179743223965763,
"grad_norm": 1.2677400355158506,
"learning_rate": 7.743014807114475e-05,
"loss": 0.4477,
"step": 223
},
{
"epoch": 1.0225392296718974,
"grad_norm": 0.8527685554112466,
"learning_rate": 7.738496707001195e-05,
"loss": 0.4383,
"step": 224
},
{
"epoch": 1.0271041369472182,
"grad_norm": 0.8122477750639683,
"learning_rate": 7.733940577098666e-05,
"loss": 0.4418,
"step": 225
},
{
"epoch": 1.0316690442225391,
"grad_norm": 0.8426217905101726,
"learning_rate": 7.729346463754035e-05,
"loss": 0.4421,
"step": 226
},
{
"epoch": 1.0362339514978602,
"grad_norm": 0.9171730058782802,
"learning_rate": 7.724714413700836e-05,
"loss": 0.4418,
"step": 227
},
{
"epoch": 1.0407988587731811,
"grad_norm": 1.0216657454100806,
"learning_rate": 7.720044474058515e-05,
"loss": 0.4463,
"step": 228
},
{
"epoch": 1.0453637660485022,
"grad_norm": 1.1305237470904832,
"learning_rate": 7.715336692331944e-05,
"loss": 0.4382,
"step": 229
},
{
"epoch": 1.0499286733238231,
"grad_norm": 1.0426775975151317,
"learning_rate": 7.71059111641095e-05,
"loss": 0.4466,
"step": 230
},
{
"epoch": 1.054493580599144,
"grad_norm": 1.0597968056899956,
"learning_rate": 7.705807794569815e-05,
"loss": 0.4486,
"step": 231
},
{
"epoch": 1.059058487874465,
"grad_norm": 1.1716663560960234,
"learning_rate": 7.700986775466792e-05,
"loss": 0.4439,
"step": 232
},
{
"epoch": 1.063623395149786,
"grad_norm": 0.9803996514179776,
"learning_rate": 7.696128108143612e-05,
"loss": 0.4461,
"step": 233
},
{
"epoch": 1.0681883024251069,
"grad_norm": 1.2259077645154464,
"learning_rate": 7.691231842024977e-05,
"loss": 0.4489,
"step": 234
},
{
"epoch": 1.072753209700428,
"grad_norm": 0.853759148071036,
"learning_rate": 7.686298026918067e-05,
"loss": 0.4421,
"step": 235
},
{
"epoch": 1.0773181169757489,
"grad_norm": 0.7461561534105968,
"learning_rate": 7.681326713012024e-05,
"loss": 0.4389,
"step": 236
},
{
"epoch": 1.08188302425107,
"grad_norm": 1.1421762335895749,
"learning_rate": 7.676317950877446e-05,
"loss": 0.4396,
"step": 237
},
{
"epoch": 1.0864479315263909,
"grad_norm": 0.8992066222764188,
"learning_rate": 7.671271791465877e-05,
"loss": 0.4404,
"step": 238
},
{
"epoch": 1.0910128388017117,
"grad_norm": 0.8169936643046695,
"learning_rate": 7.666188286109279e-05,
"loss": 0.4375,
"step": 239
},
{
"epoch": 1.0955777460770328,
"grad_norm": 1.0409973585291696,
"learning_rate": 7.66106748651952e-05,
"loss": 0.4395,
"step": 240
},
{
"epoch": 1.1001426533523537,
"grad_norm": 1.0016647356785184,
"learning_rate": 7.655909444787837e-05,
"loss": 0.4436,
"step": 241
},
{
"epoch": 1.1047075606276748,
"grad_norm": 0.9158140898097673,
"learning_rate": 7.650714213384317e-05,
"loss": 0.4362,
"step": 242
},
{
"epoch": 1.1092724679029957,
"grad_norm": 0.8583658856818915,
"learning_rate": 7.645481845157353e-05,
"loss": 0.4359,
"step": 243
},
{
"epoch": 1.1138373751783166,
"grad_norm": 0.6939369526031025,
"learning_rate": 7.640212393333117e-05,
"loss": 0.4306,
"step": 244
},
{
"epoch": 1.1184022824536377,
"grad_norm": 0.592410776540685,
"learning_rate": 7.634905911515014e-05,
"loss": 0.4354,
"step": 245
},
{
"epoch": 1.1229671897289586,
"grad_norm": 0.8601769845121724,
"learning_rate": 7.62956245368313e-05,
"loss": 0.4355,
"step": 246
},
{
"epoch": 1.1275320970042797,
"grad_norm": 0.7237332967962335,
"learning_rate": 7.624182074193691e-05,
"loss": 0.4399,
"step": 247
},
{
"epoch": 1.1320970042796006,
"grad_norm": 0.6811877132981848,
"learning_rate": 7.61876482777851e-05,
"loss": 0.4411,
"step": 248
},
{
"epoch": 1.1366619115549215,
"grad_norm": 0.8543886977059173,
"learning_rate": 7.613310769544428e-05,
"loss": 0.4355,
"step": 249
},
{
"epoch": 1.1412268188302426,
"grad_norm": 0.8286191112581163,
"learning_rate": 7.607819954972752e-05,
"loss": 0.4383,
"step": 250
},
{
"epoch": 1.1457917261055635,
"grad_norm": 1.022170225407258,
"learning_rate": 7.60229243991869e-05,
"loss": 0.4416,
"step": 251
},
{
"epoch": 1.1503566333808846,
"grad_norm": 1.1340537050864723,
"learning_rate": 7.59672828061079e-05,
"loss": 0.4382,
"step": 252
},
{
"epoch": 1.1549215406562054,
"grad_norm": 0.5980326106048516,
"learning_rate": 7.591127533650362e-05,
"loss": 0.4369,
"step": 253
},
{
"epoch": 1.1594864479315263,
"grad_norm": 0.5810971453495677,
"learning_rate": 7.585490256010899e-05,
"loss": 0.4319,
"step": 254
},
{
"epoch": 1.1640513552068474,
"grad_norm": 0.8641814822644016,
"learning_rate": 7.579816505037505e-05,
"loss": 0.4386,
"step": 255
},
{
"epoch": 1.1686162624821683,
"grad_norm": 0.9787426567126707,
"learning_rate": 7.574106338446309e-05,
"loss": 0.4327,
"step": 256
},
{
"epoch": 1.1731811697574892,
"grad_norm": 1.1084677725616903,
"learning_rate": 7.568359814323876e-05,
"loss": 0.4364,
"step": 257
},
{
"epoch": 1.1777460770328103,
"grad_norm": 0.8794156667429015,
"learning_rate": 7.562576991126616e-05,
"loss": 0.4387,
"step": 258
},
{
"epoch": 1.1823109843081312,
"grad_norm": 0.7016232671145584,
"learning_rate": 7.556757927680192e-05,
"loss": 0.4334,
"step": 259
},
{
"epoch": 1.1868758915834523,
"grad_norm": 0.7028177191087993,
"learning_rate": 7.550902683178923e-05,
"loss": 0.4346,
"step": 260
},
{
"epoch": 1.1914407988587732,
"grad_norm": 0.7601681237280019,
"learning_rate": 7.545011317185172e-05,
"loss": 0.4374,
"step": 261
},
{
"epoch": 1.196005706134094,
"grad_norm": 0.9527083913769249,
"learning_rate": 7.539083889628755e-05,
"loss": 0.4394,
"step": 262
},
{
"epoch": 1.2005706134094152,
"grad_norm": 1.144372718254701,
"learning_rate": 7.53312046080632e-05,
"loss": 0.4445,
"step": 263
},
{
"epoch": 1.205135520684736,
"grad_norm": 0.9321132440752812,
"learning_rate": 7.527121091380737e-05,
"loss": 0.436,
"step": 264
},
{
"epoch": 1.209700427960057,
"grad_norm": 1.0349154486726628,
"learning_rate": 7.52108584238048e-05,
"loss": 0.436,
"step": 265
},
{
"epoch": 1.214265335235378,
"grad_norm": 1.0640705130270223,
"learning_rate": 7.515014775199011e-05,
"loss": 0.4394,
"step": 266
},
{
"epoch": 1.218830242510699,
"grad_norm": 1.0477802116229236,
"learning_rate": 7.508907951594149e-05,
"loss": 0.4326,
"step": 267
},
{
"epoch": 1.22339514978602,
"grad_norm": 1.0464613549121728,
"learning_rate": 7.502765433687444e-05,
"loss": 0.4377,
"step": 268
},
{
"epoch": 1.227960057061341,
"grad_norm": 0.8450812112061008,
"learning_rate": 7.496587283963549e-05,
"loss": 0.4369,
"step": 269
},
{
"epoch": 1.2325249643366618,
"grad_norm": 0.6557155968758558,
"learning_rate": 7.490373565269575e-05,
"loss": 0.4339,
"step": 270
},
{
"epoch": 1.237089871611983,
"grad_norm": 0.5572121626093545,
"learning_rate": 7.484124340814467e-05,
"loss": 0.4344,
"step": 271
},
{
"epoch": 1.2416547788873038,
"grad_norm": 0.5678369025757513,
"learning_rate": 7.477839674168342e-05,
"loss": 0.4256,
"step": 272
},
{
"epoch": 1.246219686162625,
"grad_norm": 0.5209219264497257,
"learning_rate": 7.471519629261859e-05,
"loss": 0.4327,
"step": 273
},
{
"epoch": 1.2507845934379458,
"grad_norm": 0.4721960308852361,
"learning_rate": 7.465164270385558e-05,
"loss": 0.4304,
"step": 274
},
{
"epoch": 1.2553495007132667,
"grad_norm": 0.5657942084693068,
"learning_rate": 7.45877366218921e-05,
"loss": 0.4382,
"step": 275
},
{
"epoch": 1.2599144079885878,
"grad_norm": 0.7196787321817991,
"learning_rate": 7.452347869681159e-05,
"loss": 0.4356,
"step": 276
},
{
"epoch": 1.2644793152639087,
"grad_norm": 0.6908825865399278,
"learning_rate": 7.445886958227665e-05,
"loss": 0.4291,
"step": 277
},
{
"epoch": 1.2690442225392298,
"grad_norm": 0.5440820157677637,
"learning_rate": 7.439390993552227e-05,
"loss": 0.4362,
"step": 278
},
{
"epoch": 1.2736091298145507,
"grad_norm": 0.6147849930389729,
"learning_rate": 7.43286004173493e-05,
"loss": 0.4321,
"step": 279
},
{
"epoch": 1.2781740370898715,
"grad_norm": 0.5681098435881913,
"learning_rate": 7.426294169211762e-05,
"loss": 0.4348,
"step": 280
},
{
"epoch": 1.2827389443651926,
"grad_norm": 0.44708040023745554,
"learning_rate": 7.419693442773937e-05,
"loss": 0.4375,
"step": 281
},
{
"epoch": 1.2873038516405135,
"grad_norm": 0.5713599240412557,
"learning_rate": 7.413057929567227e-05,
"loss": 0.4298,
"step": 282
},
{
"epoch": 1.2918687589158346,
"grad_norm": 0.6323338024890408,
"learning_rate": 7.406387697091269e-05,
"loss": 0.4374,
"step": 283
},
{
"epoch": 1.2964336661911555,
"grad_norm": 0.9056516523433347,
"learning_rate": 7.399682813198879e-05,
"loss": 0.4362,
"step": 284
},
{
"epoch": 1.3009985734664764,
"grad_norm": 1.182899493470329,
"learning_rate": 7.392943346095366e-05,
"loss": 0.4361,
"step": 285
},
{
"epoch": 1.3055634807417975,
"grad_norm": 0.7775593404006284,
"learning_rate": 7.386169364337833e-05,
"loss": 0.438,
"step": 286
},
{
"epoch": 1.3101283880171184,
"grad_norm": 0.6198494591654299,
"learning_rate": 7.379360936834492e-05,
"loss": 0.429,
"step": 287
},
{
"epoch": 1.3146932952924395,
"grad_norm": 0.7793764973737607,
"learning_rate": 7.372518132843941e-05,
"loss": 0.4385,
"step": 288
},
{
"epoch": 1.3192582025677604,
"grad_norm": 0.9303862541132153,
"learning_rate": 7.365641021974478e-05,
"loss": 0.4335,
"step": 289
},
{
"epoch": 1.3238231098430813,
"grad_norm": 0.8432690646689237,
"learning_rate": 7.358729674183392e-05,
"loss": 0.4317,
"step": 290
},
{
"epoch": 1.3283880171184022,
"grad_norm": 0.8874816517796578,
"learning_rate": 7.351784159776238e-05,
"loss": 0.4304,
"step": 291
},
{
"epoch": 1.3329529243937233,
"grad_norm": 0.868796496930731,
"learning_rate": 7.344804549406135e-05,
"loss": 0.4371,
"step": 292
},
{
"epoch": 1.3375178316690441,
"grad_norm": 0.7852711094517002,
"learning_rate": 7.337790914073043e-05,
"loss": 0.4294,
"step": 293
},
{
"epoch": 1.3420827389443652,
"grad_norm": 0.936953145650728,
"learning_rate": 7.330743325123036e-05,
"loss": 0.4391,
"step": 294
},
{
"epoch": 1.3466476462196861,
"grad_norm": 0.9704040812645053,
"learning_rate": 7.323661854247587e-05,
"loss": 0.4349,
"step": 295
},
{
"epoch": 1.351212553495007,
"grad_norm": 0.8823873892473993,
"learning_rate": 7.316546573482828e-05,
"loss": 0.4315,
"step": 296
},
{
"epoch": 1.3557774607703281,
"grad_norm": 0.9155700667464731,
"learning_rate": 7.309397555208817e-05,
"loss": 0.4352,
"step": 297
},
{
"epoch": 1.360342368045649,
"grad_norm": 0.9500779897069331,
"learning_rate": 7.302214872148817e-05,
"loss": 0.4341,
"step": 298
},
{
"epoch": 1.3649072753209701,
"grad_norm": 0.715286965255515,
"learning_rate": 7.29499859736854e-05,
"loss": 0.43,
"step": 299
},
{
"epoch": 1.369472182596291,
"grad_norm": 0.42900565746793906,
"learning_rate": 7.287748804275406e-05,
"loss": 0.4257,
"step": 300
},
{
"epoch": 1.3740370898716119,
"grad_norm": 0.44848107810486365,
"learning_rate": 7.280465566617804e-05,
"loss": 0.4282,
"step": 301
},
{
"epoch": 1.378601997146933,
"grad_norm": 0.5499399617735641,
"learning_rate": 7.273148958484335e-05,
"loss": 0.4342,
"step": 302
},
{
"epoch": 1.3831669044222539,
"grad_norm": 0.7779728624601656,
"learning_rate": 7.265799054303062e-05,
"loss": 0.4338,
"step": 303
},
{
"epoch": 1.387731811697575,
"grad_norm": 0.7555091815426274,
"learning_rate": 7.258415928840749e-05,
"loss": 0.4311,
"step": 304
},
{
"epoch": 1.3922967189728959,
"grad_norm": 0.6206089776911131,
"learning_rate": 7.250999657202107e-05,
"loss": 0.4296,
"step": 305
},
{
"epoch": 1.3968616262482167,
"grad_norm": 0.7912293160493962,
"learning_rate": 7.24355031482902e-05,
"loss": 0.436,
"step": 306
},
{
"epoch": 1.4014265335235379,
"grad_norm": 1.125338548493621,
"learning_rate": 7.236067977499791e-05,
"loss": 0.4342,
"step": 307
},
{
"epoch": 1.4059914407988587,
"grad_norm": 1.04913732233305,
"learning_rate": 7.228552721328354e-05,
"loss": 0.4388,
"step": 308
},
{
"epoch": 1.4105563480741798,
"grad_norm": 0.8118850698239931,
"learning_rate": 7.22100462276352e-05,
"loss": 0.4257,
"step": 309
},
{
"epoch": 1.4151212553495007,
"grad_norm": 0.6971634722121615,
"learning_rate": 7.213423758588182e-05,
"loss": 0.4314,
"step": 310
},
{
"epoch": 1.4196861626248216,
"grad_norm": 0.9112349240874238,
"learning_rate": 7.20581020591854e-05,
"loss": 0.4311,
"step": 311
},
{
"epoch": 1.4242510699001427,
"grad_norm": 0.8933771253980181,
"learning_rate": 7.19816404220332e-05,
"loss": 0.4337,
"step": 312
},
{
"epoch": 1.4288159771754636,
"grad_norm": 0.7078422348562297,
"learning_rate": 7.190485345222981e-05,
"loss": 0.4297,
"step": 313
},
{
"epoch": 1.4333808844507847,
"grad_norm": 0.659819970126346,
"learning_rate": 7.18277419308893e-05,
"loss": 0.4258,
"step": 314
},
{
"epoch": 1.4379457917261056,
"grad_norm": 0.5948618977549561,
"learning_rate": 7.17503066424272e-05,
"loss": 0.4307,
"step": 315
},
{
"epoch": 1.4425106990014265,
"grad_norm": 0.5522078226460925,
"learning_rate": 7.167254837455254e-05,
"loss": 0.423,
"step": 316
},
{
"epoch": 1.4470756062767476,
"grad_norm": 0.46370094074478,
"learning_rate": 7.15944679182599e-05,
"loss": 0.4226,
"step": 317
},
{
"epoch": 1.4516405135520685,
"grad_norm": 0.4538500924366929,
"learning_rate": 7.15160660678213e-05,
"loss": 0.4334,
"step": 318
},
{
"epoch": 1.4562054208273896,
"grad_norm": 0.5046426426810093,
"learning_rate": 7.143734362077809e-05,
"loss": 0.4333,
"step": 319
},
{
"epoch": 1.4607703281027105,
"grad_norm": 0.4578123337897367,
"learning_rate": 7.135830137793295e-05,
"loss": 0.4236,
"step": 320
},
{
"epoch": 1.4653352353780313,
"grad_norm": 0.4655315773535026,
"learning_rate": 7.127894014334163e-05,
"loss": 0.4279,
"step": 321
},
{
"epoch": 1.4699001426533522,
"grad_norm": 0.4651249472363875,
"learning_rate": 7.119926072430485e-05,
"loss": 0.4253,
"step": 322
},
{
"epoch": 1.4744650499286733,
"grad_norm": 0.4978505684470615,
"learning_rate": 7.111926393136002e-05,
"loss": 0.428,
"step": 323
},
{
"epoch": 1.4790299572039942,
"grad_norm": 0.536231847135624,
"learning_rate": 7.103895057827308e-05,
"loss": 0.43,
"step": 324
},
{
"epoch": 1.4835948644793153,
"grad_norm": 0.3912812640636196,
"learning_rate": 7.095832148203013e-05,
"loss": 0.4295,
"step": 325
},
{
"epoch": 1.4881597717546362,
"grad_norm": 0.40464250838428917,
"learning_rate": 7.087737746282916e-05,
"loss": 0.4277,
"step": 326
},
{
"epoch": 1.492724679029957,
"grad_norm": 0.5277590990428639,
"learning_rate": 7.079611934407173e-05,
"loss": 0.4335,
"step": 327
},
{
"epoch": 1.4972895863052782,
"grad_norm": 0.7191902847601642,
"learning_rate": 7.071454795235457e-05,
"loss": 0.428,
"step": 328
},
{
"epoch": 1.5018544935805993,
"grad_norm": 0.8201553068276188,
"learning_rate": 7.063266411746116e-05,
"loss": 0.4243,
"step": 329
},
{
"epoch": 1.5064194008559202,
"grad_norm": 0.8029687675080593,
"learning_rate": 7.055046867235331e-05,
"loss": 0.4297,
"step": 330
},
{
"epoch": 1.510984308131241,
"grad_norm": 0.7677427204023601,
"learning_rate": 7.046796245316267e-05,
"loss": 0.428,
"step": 331
},
{
"epoch": 1.515549215406562,
"grad_norm": 0.7499549959421509,
"learning_rate": 7.038514629918228e-05,
"loss": 0.4307,
"step": 332
},
{
"epoch": 1.520114122681883,
"grad_norm": 0.8211692796656908,
"learning_rate": 7.030202105285792e-05,
"loss": 0.433,
"step": 333
},
{
"epoch": 1.524679029957204,
"grad_norm": 1.0730200098155518,
"learning_rate": 7.021858755977964e-05,
"loss": 0.4241,
"step": 334
},
{
"epoch": 1.529243937232525,
"grad_norm": 1.1727832985354492,
"learning_rate": 7.013484666867312e-05,
"loss": 0.4333,
"step": 335
},
{
"epoch": 1.533808844507846,
"grad_norm": 0.7964684055522075,
"learning_rate": 7.005079923139104e-05,
"loss": 0.4317,
"step": 336
},
{
"epoch": 1.5383737517831668,
"grad_norm": 0.7927868140861069,
"learning_rate": 6.996644610290441e-05,
"loss": 0.4293,
"step": 337
},
{
"epoch": 1.542938659058488,
"grad_norm": 0.722547974680119,
"learning_rate": 6.988178814129388e-05,
"loss": 0.4297,
"step": 338
},
{
"epoch": 1.5475035663338088,
"grad_norm": 0.5891193027337546,
"learning_rate": 6.979682620774104e-05,
"loss": 0.4301,
"step": 339
},
{
"epoch": 1.55206847360913,
"grad_norm": 0.5088111344725053,
"learning_rate": 6.971156116651958e-05,
"loss": 0.4244,
"step": 340
},
{
"epoch": 1.5566333808844508,
"grad_norm": 0.5076201573377456,
"learning_rate": 6.962599388498657e-05,
"loss": 0.4257,
"step": 341
},
{
"epoch": 1.5611982881597717,
"grad_norm": 0.5666496245303375,
"learning_rate": 6.954012523357362e-05,
"loss": 0.4308,
"step": 342
},
{
"epoch": 1.5657631954350926,
"grad_norm": 0.5307337032247661,
"learning_rate": 6.945395608577801e-05,
"loss": 0.4356,
"step": 343
},
{
"epoch": 1.5703281027104137,
"grad_norm": 0.44314588391528825,
"learning_rate": 6.936748731815382e-05,
"loss": 0.4249,
"step": 344
},
{
"epoch": 1.5748930099857348,
"grad_norm": 0.4804047628466697,
"learning_rate": 6.9280719810303e-05,
"loss": 0.4332,
"step": 345
},
{
"epoch": 1.5794579172610557,
"grad_norm": 0.4642513033873333,
"learning_rate": 6.919365444486644e-05,
"loss": 0.4226,
"step": 346
},
{
"epoch": 1.5840228245363766,
"grad_norm": 0.5568335056382052,
"learning_rate": 6.910629210751497e-05,
"loss": 0.4238,
"step": 347
},
{
"epoch": 1.5885877318116974,
"grad_norm": 0.5686750155147203,
"learning_rate": 6.901863368694036e-05,
"loss": 0.4229,
"step": 348
},
{
"epoch": 1.5931526390870185,
"grad_norm": 0.5594409908646193,
"learning_rate": 6.893068007484628e-05,
"loss": 0.431,
"step": 349
},
{
"epoch": 1.5977175463623396,
"grad_norm": 0.7542738892463182,
"learning_rate": 6.884243216593928e-05,
"loss": 0.4278,
"step": 350
},
{
"epoch": 1.6022824536376605,
"grad_norm": 0.909785720326975,
"learning_rate": 6.875389085791956e-05,
"loss": 0.4237,
"step": 351
},
{
"epoch": 1.6068473609129814,
"grad_norm": 1.0518321051704833,
"learning_rate": 6.866505705147195e-05,
"loss": 0.4297,
"step": 352
},
{
"epoch": 1.6114122681883023,
"grad_norm": 1.1017254090246513,
"learning_rate": 6.857593165025674e-05,
"loss": 0.4245,
"step": 353
},
{
"epoch": 1.6159771754636234,
"grad_norm": 0.8187539090198782,
"learning_rate": 6.848651556090042e-05,
"loss": 0.4258,
"step": 354
},
{
"epoch": 1.6205420827389445,
"grad_norm": 0.5573740074216886,
"learning_rate": 6.839680969298653e-05,
"loss": 0.4285,
"step": 355
},
{
"epoch": 1.6251069900142654,
"grad_norm": 0.6164738882872269,
"learning_rate": 6.830681495904637e-05,
"loss": 0.4286,
"step": 356
},
{
"epoch": 1.6296718972895863,
"grad_norm": 0.8801449131438988,
"learning_rate": 6.821653227454973e-05,
"loss": 0.4221,
"step": 357
},
{
"epoch": 1.6342368045649072,
"grad_norm": 1.0792158764449444,
"learning_rate": 6.812596255789553e-05,
"loss": 0.4267,
"step": 358
},
{
"epoch": 1.6388017118402283,
"grad_norm": 0.8680834731545233,
"learning_rate": 6.80351067304026e-05,
"loss": 0.424,
"step": 359
},
{
"epoch": 1.6433666191155494,
"grad_norm": 0.6086251558676785,
"learning_rate": 6.794396571630015e-05,
"loss": 0.4293,
"step": 360
},
{
"epoch": 1.6479315263908703,
"grad_norm": 0.501395339998093,
"learning_rate": 6.785254044271848e-05,
"loss": 0.4285,
"step": 361
},
{
"epoch": 1.6524964336661911,
"grad_norm": 0.5979602956912436,
"learning_rate": 6.776083183967951e-05,
"loss": 0.4275,
"step": 362
},
{
"epoch": 1.657061340941512,
"grad_norm": 0.6937203619342673,
"learning_rate": 6.766884084008734e-05,
"loss": 0.425,
"step": 363
},
{
"epoch": 1.6616262482168331,
"grad_norm": 0.7101090354502062,
"learning_rate": 6.757656837971872e-05,
"loss": 0.4309,
"step": 364
},
{
"epoch": 1.666191155492154,
"grad_norm": 0.643930488180388,
"learning_rate": 6.748401539721353e-05,
"loss": 0.4287,
"step": 365
},
{
"epoch": 1.6707560627674751,
"grad_norm": 0.7810718246548566,
"learning_rate": 6.739118283406533e-05,
"loss": 0.4264,
"step": 366
},
{
"epoch": 1.675320970042796,
"grad_norm": 0.9283460979463781,
"learning_rate": 6.729807163461165e-05,
"loss": 0.4319,
"step": 367
},
{
"epoch": 1.679885877318117,
"grad_norm": 0.9210967683307637,
"learning_rate": 6.720468274602446e-05,
"loss": 0.4282,
"step": 368
},
{
"epoch": 1.684450784593438,
"grad_norm": 0.6497876560161834,
"learning_rate": 6.711101711830054e-05,
"loss": 0.4284,
"step": 369
},
{
"epoch": 1.6890156918687589,
"grad_norm": 0.4725196750675396,
"learning_rate": 6.701707570425174e-05,
"loss": 0.4262,
"step": 370
},
{
"epoch": 1.69358059914408,
"grad_norm": 0.5985153130104276,
"learning_rate": 6.69228594594954e-05,
"loss": 0.4309,
"step": 371
},
{
"epoch": 1.6981455064194009,
"grad_norm": 0.7045200474002776,
"learning_rate": 6.682836934244452e-05,
"loss": 0.4268,
"step": 372
},
{
"epoch": 1.7027104136947218,
"grad_norm": 0.7522673563908682,
"learning_rate": 6.67336063142981e-05,
"loss": 0.429,
"step": 373
},
{
"epoch": 1.7072753209700426,
"grad_norm": 0.5776326484944384,
"learning_rate": 6.663857133903128e-05,
"loss": 0.4243,
"step": 374
},
{
"epoch": 1.7118402282453637,
"grad_norm": 0.34017458565072894,
"learning_rate": 6.654326538338565e-05,
"loss": 0.4235,
"step": 375
},
{
"epoch": 1.7164051355206849,
"grad_norm": 0.5718314127403422,
"learning_rate": 6.644768941685928e-05,
"loss": 0.4223,
"step": 376
},
{
"epoch": 1.7209700427960057,
"grad_norm": 0.7259779441818939,
"learning_rate": 6.63518444116969e-05,
"loss": 0.4257,
"step": 377
},
{
"epoch": 1.7255349500713266,
"grad_norm": 0.5302976449413679,
"learning_rate": 6.625573134288012e-05,
"loss": 0.4156,
"step": 378
},
{
"epoch": 1.7300998573466475,
"grad_norm": 0.4044589367438007,
"learning_rate": 6.615935118811737e-05,
"loss": 0.4217,
"step": 379
},
{
"epoch": 1.7346647646219686,
"grad_norm": 0.6930212301593457,
"learning_rate": 6.606270492783395e-05,
"loss": 0.4228,
"step": 380
},
{
"epoch": 1.7392296718972897,
"grad_norm": 0.7344014385124219,
"learning_rate": 6.596579354516225e-05,
"loss": 0.4232,
"step": 381
},
{
"epoch": 1.7437945791726106,
"grad_norm": 0.5534474445480675,
"learning_rate": 6.586861802593147e-05,
"loss": 0.4233,
"step": 382
},
{
"epoch": 1.7483594864479315,
"grad_norm": 0.6126829361696522,
"learning_rate": 6.577117935865785e-05,
"loss": 0.4268,
"step": 383
},
{
"epoch": 1.7529243937232524,
"grad_norm": 0.6533034681110785,
"learning_rate": 6.567347853453439e-05,
"loss": 0.422,
"step": 384
},
{
"epoch": 1.7574893009985735,
"grad_norm": 0.5424516205530866,
"learning_rate": 6.557551654742099e-05,
"loss": 0.4198,
"step": 385
},
{
"epoch": 1.7620542082738946,
"grad_norm": 0.6137606774154565,
"learning_rate": 6.547729439383414e-05,
"loss": 0.4266,
"step": 386
},
{
"epoch": 1.7666191155492155,
"grad_norm": 0.6945685962943688,
"learning_rate": 6.537881307293691e-05,
"loss": 0.4231,
"step": 387
},
{
"epoch": 1.7711840228245364,
"grad_norm": 0.6051697889503573,
"learning_rate": 6.528007358652871e-05,
"loss": 0.424,
"step": 388
},
{
"epoch": 1.7757489300998572,
"grad_norm": 0.497689232889849,
"learning_rate": 6.518107693903519e-05,
"loss": 0.4221,
"step": 389
},
{
"epoch": 1.7803138373751783,
"grad_norm": 0.3881400873717001,
"learning_rate": 6.50818241374979e-05,
"loss": 0.4266,
"step": 390
},
{
"epoch": 1.7848787446504994,
"grad_norm": 0.4053350043040276,
"learning_rate": 6.498231619156416e-05,
"loss": 0.4212,
"step": 391
},
{
"epoch": 1.7894436519258203,
"grad_norm": 0.4228076243058947,
"learning_rate": 6.488255411347673e-05,
"loss": 0.421,
"step": 392
},
{
"epoch": 1.7940085592011412,
"grad_norm": 0.3651462975399496,
"learning_rate": 6.478253891806353e-05,
"loss": 0.4203,
"step": 393
},
{
"epoch": 1.798573466476462,
"grad_norm": 0.3813241127258593,
"learning_rate": 6.468227162272726e-05,
"loss": 0.4256,
"step": 394
},
{
"epoch": 1.8031383737517832,
"grad_norm": 0.4876471377127721,
"learning_rate": 6.45817532474352e-05,
"loss": 0.4249,
"step": 395
},
{
"epoch": 1.807703281027104,
"grad_norm": 0.5450947841751673,
"learning_rate": 6.448098481470863e-05,
"loss": 0.4203,
"step": 396
},
{
"epoch": 1.8122681883024252,
"grad_norm": 0.6516167352248161,
"learning_rate": 6.437996734961262e-05,
"loss": 0.4306,
"step": 397
},
{
"epoch": 1.816833095577746,
"grad_norm": 0.7968992579354538,
"learning_rate": 6.427870187974548e-05,
"loss": 0.4213,
"step": 398
},
{
"epoch": 1.821398002853067,
"grad_norm": 0.8730538354851684,
"learning_rate": 6.417718943522835e-05,
"loss": 0.4315,
"step": 399
},
{
"epoch": 1.825962910128388,
"grad_norm": 0.801131615179908,
"learning_rate": 6.407543104869469e-05,
"loss": 0.4209,
"step": 400
},
{
"epoch": 1.830527817403709,
"grad_norm": 0.645824395128567,
"learning_rate": 6.397342775527982e-05,
"loss": 0.4277,
"step": 401
},
{
"epoch": 1.83509272467903,
"grad_norm": 0.7410730591928698,
"learning_rate": 6.38711805926104e-05,
"loss": 0.4196,
"step": 402
},
{
"epoch": 1.839657631954351,
"grad_norm": 0.9347780145308188,
"learning_rate": 6.376869060079381e-05,
"loss": 0.4226,
"step": 403
},
{
"epoch": 1.8442225392296718,
"grad_norm": 0.8659116193168973,
"learning_rate": 6.366595882240761e-05,
"loss": 0.4255,
"step": 404
},
{
"epoch": 1.8487874465049927,
"grad_norm": 0.7573788312065334,
"learning_rate": 6.356298630248893e-05,
"loss": 0.4319,
"step": 405
},
{
"epoch": 1.8533523537803138,
"grad_norm": 0.698786388331663,
"learning_rate": 6.345977408852383e-05,
"loss": 0.423,
"step": 406
},
{
"epoch": 1.857917261055635,
"grad_norm": 0.6540039704024841,
"learning_rate": 6.335632323043671e-05,
"loss": 0.4239,
"step": 407
},
{
"epoch": 1.8624821683309558,
"grad_norm": 0.5449620669554056,
"learning_rate": 6.325263478057947e-05,
"loss": 0.4279,
"step": 408
},
{
"epoch": 1.8670470756062767,
"grad_norm": 0.4777538440000814,
"learning_rate": 6.314870979372102e-05,
"loss": 0.4247,
"step": 409
},
{
"epoch": 1.8716119828815976,
"grad_norm": 0.38933468433699886,
"learning_rate": 6.304454932703633e-05,
"loss": 0.4231,
"step": 410
},
{
"epoch": 1.8761768901569187,
"grad_norm": 0.4120383799689869,
"learning_rate": 6.29401544400959e-05,
"loss": 0.4293,
"step": 411
},
{
"epoch": 1.8807417974322398,
"grad_norm": 0.47381071482947645,
"learning_rate": 6.283552619485476e-05,
"loss": 0.4177,
"step": 412
},
{
"epoch": 1.8853067047075607,
"grad_norm": 0.3580390483686331,
"learning_rate": 6.27306656556418e-05,
"loss": 0.4196,
"step": 413
},
{
"epoch": 1.8898716119828816,
"grad_norm": 0.38078475072495094,
"learning_rate": 6.2625573889149e-05,
"loss": 0.4193,
"step": 414
},
{
"epoch": 1.8944365192582024,
"grad_norm": 0.4278816772061641,
"learning_rate": 6.25202519644204e-05,
"loss": 0.418,
"step": 415
},
{
"epoch": 1.8990014265335236,
"grad_norm": 0.4475822733130566,
"learning_rate": 6.241470095284133e-05,
"loss": 0.4262,
"step": 416
},
{
"epoch": 1.9035663338088447,
"grad_norm": 0.5284917554953473,
"learning_rate": 6.230892192812752e-05,
"loss": 0.4241,
"step": 417
},
{
"epoch": 1.9081312410841655,
"grad_norm": 0.5411660617767071,
"learning_rate": 6.220291596631417e-05,
"loss": 0.425,
"step": 418
},
{
"epoch": 1.9126961483594864,
"grad_norm": 0.4446094839845814,
"learning_rate": 6.209668414574502e-05,
"loss": 0.4217,
"step": 419
},
{
"epoch": 1.9172610556348073,
"grad_norm": 0.38671232428761154,
"learning_rate": 6.199022754706127e-05,
"loss": 0.4288,
"step": 420
},
{
"epoch": 1.9218259629101284,
"grad_norm": 0.3611456455331726,
"learning_rate": 6.188354725319074e-05,
"loss": 0.4217,
"step": 421
},
{
"epoch": 1.9263908701854495,
"grad_norm": 0.327267880459524,
"learning_rate": 6.177664434933676e-05,
"loss": 0.4239,
"step": 422
},
{
"epoch": 1.9309557774607704,
"grad_norm": 0.2734657156850823,
"learning_rate": 6.166951992296716e-05,
"loss": 0.424,
"step": 423
},
{
"epoch": 1.9355206847360913,
"grad_norm": 0.30352197416491244,
"learning_rate": 6.15621750638032e-05,
"loss": 0.422,
"step": 424
},
{
"epoch": 1.9400855920114122,
"grad_norm": 0.2974349992819165,
"learning_rate": 6.145461086380848e-05,
"loss": 0.4251,
"step": 425
},
{
"epoch": 1.9446504992867333,
"grad_norm": 0.3823668291519408,
"learning_rate": 6.134682841717792e-05,
"loss": 0.4165,
"step": 426
},
{
"epoch": 1.9492154065620542,
"grad_norm": 0.6312244786498952,
"learning_rate": 6.123882882032639e-05,
"loss": 0.4194,
"step": 427
},
{
"epoch": 1.9537803138373753,
"grad_norm": 0.8700056231822155,
"learning_rate": 6.113061317187789e-05,
"loss": 0.4231,
"step": 428
},
{
"epoch": 1.9583452211126962,
"grad_norm": 0.9887590587117958,
"learning_rate": 6.1022182572654063e-05,
"loss": 0.4228,
"step": 429
},
{
"epoch": 1.962910128388017,
"grad_norm": 0.9884807393198494,
"learning_rate": 6.0913538125663236e-05,
"loss": 0.4306,
"step": 430
},
{
"epoch": 1.967475035663338,
"grad_norm": 0.911102518039347,
"learning_rate": 6.0804680936089025e-05,
"loss": 0.4225,
"step": 431
},
{
"epoch": 1.972039942938659,
"grad_norm": 0.7402245060126661,
"learning_rate": 6.069561211127919e-05,
"loss": 0.4222,
"step": 432
},
{
"epoch": 1.9766048502139801,
"grad_norm": 0.5083536699969557,
"learning_rate": 6.05863327607344e-05,
"loss": 0.4219,
"step": 433
},
{
"epoch": 1.981169757489301,
"grad_norm": 0.3496862590580212,
"learning_rate": 6.0476843996096795e-05,
"loss": 0.4203,
"step": 434
},
{
"epoch": 1.985734664764622,
"grad_norm": 0.4037616186832233,
"learning_rate": 6.0367146931138866e-05,
"loss": 0.4184,
"step": 435
},
{
"epoch": 1.9902995720399428,
"grad_norm": 0.4436594227881451,
"learning_rate": 6.025724268175197e-05,
"loss": 0.4208,
"step": 436
},
{
"epoch": 1.994864479315264,
"grad_norm": 0.4512851557242651,
"learning_rate": 6.0147132365935065e-05,
"loss": 0.4165,
"step": 437
},
{
"epoch": 1.999429386590585,
"grad_norm": 0.4765645074474141,
"learning_rate": 6.003681710378335e-05,
"loss": 0.4225,
"step": 438
},
{
"epoch": 2.003994293865906,
"grad_norm": 0.9681866786412074,
"learning_rate": 5.9926298017476774e-05,
"loss": 0.7716,
"step": 439
},
{
"epoch": 2.0085592011412268,
"grad_norm": 1.5630521704235794,
"learning_rate": 5.981557623126876e-05,
"loss": 0.4014,
"step": 440
},
{
"epoch": 2.0131241084165477,
"grad_norm": 0.571055488243649,
"learning_rate": 5.970465287147461e-05,
"loss": 0.4024,
"step": 441
},
{
"epoch": 2.017689015691869,
"grad_norm": 1.7926013030849834,
"learning_rate": 5.959352906646018e-05,
"loss": 0.4049,
"step": 442
},
{
"epoch": 2.02225392296719,
"grad_norm": 0.8802074425647698,
"learning_rate": 5.948220594663035e-05,
"loss": 0.4054,
"step": 443
},
{
"epoch": 2.0268188302425107,
"grad_norm": 1.8078241844652807,
"learning_rate": 5.93706846444175e-05,
"loss": 0.4158,
"step": 444
},
{
"epoch": 2.0313837375178316,
"grad_norm": 1.3366588545623441,
"learning_rate": 5.925896629427006e-05,
"loss": 0.4088,
"step": 445
},
{
"epoch": 2.0359486447931525,
"grad_norm": 1.2979771341051034,
"learning_rate": 5.9147052032640886e-05,
"loss": 0.4112,
"step": 446
},
{
"epoch": 2.0405135520684734,
"grad_norm": 1.0266378699260492,
"learning_rate": 5.9034942997975744e-05,
"loss": 0.4105,
"step": 447
},
{
"epoch": 2.0450784593437947,
"grad_norm": 1.0531734053882162,
"learning_rate": 5.8922640330701734e-05,
"loss": 0.4069,
"step": 448
},
{
"epoch": 2.0496433666191156,
"grad_norm": 0.7366171898891467,
"learning_rate": 5.8810145173215694e-05,
"loss": 0.3995,
"step": 449
},
{
"epoch": 2.0542082738944365,
"grad_norm": 0.8679822170363295,
"learning_rate": 5.869745866987256e-05,
"loss": 0.4064,
"step": 450
},
{
"epoch": 2.0587731811697574,
"grad_norm": 0.6944746602819868,
"learning_rate": 5.8584581966973696e-05,
"loss": 0.403,
"step": 451
},
{
"epoch": 2.0633380884450783,
"grad_norm": 0.5466028481958466,
"learning_rate": 5.847151621275531e-05,
"loss": 0.3997,
"step": 452
},
{
"epoch": 2.0679029957203996,
"grad_norm": 0.6055399950709338,
"learning_rate": 5.8358262557376725e-05,
"loss": 0.3994,
"step": 453
},
{
"epoch": 2.0724679029957205,
"grad_norm": 0.4848917512592404,
"learning_rate": 5.824482215290865e-05,
"loss": 0.404,
"step": 454
},
{
"epoch": 2.0770328102710414,
"grad_norm": 0.498636327804515,
"learning_rate": 5.813119615332154e-05,
"loss": 0.3993,
"step": 455
},
{
"epoch": 2.0815977175463622,
"grad_norm": 0.46905229993366143,
"learning_rate": 5.801738571447378e-05,
"loss": 0.4053,
"step": 456
},
{
"epoch": 2.086162624821683,
"grad_norm": 0.4431759724923076,
"learning_rate": 5.79033919941e-05,
"loss": 0.3966,
"step": 457
},
{
"epoch": 2.0907275320970045,
"grad_norm": 0.5210110584044579,
"learning_rate": 5.7789216151799196e-05,
"loss": 0.3918,
"step": 458
},
{
"epoch": 2.0952924393723253,
"grad_norm": 0.43971729383879754,
"learning_rate": 5.7674859349023064e-05,
"loss": 0.4008,
"step": 459
},
{
"epoch": 2.0998573466476462,
"grad_norm": 0.3602233291051038,
"learning_rate": 5.756032274906405e-05,
"loss": 0.3985,
"step": 460
},
{
"epoch": 2.104422253922967,
"grad_norm": 0.3514674260539934,
"learning_rate": 5.7445607517043646e-05,
"loss": 0.3948,
"step": 461
},
{
"epoch": 2.108987161198288,
"grad_norm": 0.3740970628030136,
"learning_rate": 5.733071481990046e-05,
"loss": 0.3969,
"step": 462
},
{
"epoch": 2.1135520684736093,
"grad_norm": 0.3848411215975852,
"learning_rate": 5.721564582637829e-05,
"loss": 0.3997,
"step": 463
},
{
"epoch": 2.11811697574893,
"grad_norm": 0.36362834255516524,
"learning_rate": 5.710040170701443e-05,
"loss": 0.3941,
"step": 464
},
{
"epoch": 2.122681883024251,
"grad_norm": 0.387749019583618,
"learning_rate": 5.6984983634127534e-05,
"loss": 0.3964,
"step": 465
},
{
"epoch": 2.127246790299572,
"grad_norm": 0.28828513667702704,
"learning_rate": 5.686939278180585e-05,
"loss": 0.3947,
"step": 466
},
{
"epoch": 2.131811697574893,
"grad_norm": 0.30878428184767404,
"learning_rate": 5.675363032589521e-05,
"loss": 0.4029,
"step": 467
},
{
"epoch": 2.1363766048502137,
"grad_norm": 0.3090749309313414,
"learning_rate": 5.6637697443987044e-05,
"loss": 0.3957,
"step": 468
},
{
"epoch": 2.140941512125535,
"grad_norm": 0.3150439666682486,
"learning_rate": 5.6521595315406505e-05,
"loss": 0.3982,
"step": 469
},
{
"epoch": 2.145506419400856,
"grad_norm": 0.30763731181006676,
"learning_rate": 5.640532512120036e-05,
"loss": 0.3978,
"step": 470
},
{
"epoch": 2.150071326676177,
"grad_norm": 0.3156068121770724,
"learning_rate": 5.6288888044125005e-05,
"loss": 0.3989,
"step": 471
},
{
"epoch": 2.1546362339514977,
"grad_norm": 0.3220879180231534,
"learning_rate": 5.6172285268634503e-05,
"loss": 0.3966,
"step": 472
},
{
"epoch": 2.159201141226819,
"grad_norm": 0.2977772434873077,
"learning_rate": 5.6055517980868434e-05,
"loss": 0.3953,
"step": 473
},
{
"epoch": 2.16376604850214,
"grad_norm": 0.29810536611214883,
"learning_rate": 5.59385873686399e-05,
"loss": 0.3918,
"step": 474
},
{
"epoch": 2.168330955777461,
"grad_norm": 0.2852081639489458,
"learning_rate": 5.582149462142341e-05,
"loss": 0.3917,
"step": 475
},
{
"epoch": 2.1728958630527817,
"grad_norm": 0.3081209690427737,
"learning_rate": 5.570424093034279e-05,
"loss": 0.3968,
"step": 476
},
{
"epoch": 2.1774607703281026,
"grad_norm": 0.29194114681550415,
"learning_rate": 5.558682748815907e-05,
"loss": 0.3976,
"step": 477
},
{
"epoch": 2.1820256776034235,
"grad_norm": 0.2933275523587884,
"learning_rate": 5.546925548925831e-05,
"loss": 0.396,
"step": 478
},
{
"epoch": 2.186590584878745,
"grad_norm": 0.3010159866072722,
"learning_rate": 5.5351526129639556e-05,
"loss": 0.3947,
"step": 479
},
{
"epoch": 2.1911554921540657,
"grad_norm": 0.22593376332738507,
"learning_rate": 5.523364060690253e-05,
"loss": 0.3947,
"step": 480
},
{
"epoch": 2.1957203994293866,
"grad_norm": 0.263407864251361,
"learning_rate": 5.511560012023558e-05,
"loss": 0.3968,
"step": 481
},
{
"epoch": 2.2002853067047075,
"grad_norm": 0.2621235557428232,
"learning_rate": 5.499740587040337e-05,
"loss": 0.3957,
"step": 482
},
{
"epoch": 2.2048502139800283,
"grad_norm": 0.28672274408401716,
"learning_rate": 5.487905905973474e-05,
"loss": 0.3982,
"step": 483
},
{
"epoch": 2.2094151212553497,
"grad_norm": 0.2961044553045987,
"learning_rate": 5.476056089211047e-05,
"loss": 0.3953,
"step": 484
},
{
"epoch": 2.2139800285306706,
"grad_norm": 0.24362816536099371,
"learning_rate": 5.464191257295099e-05,
"loss": 0.3947,
"step": 485
},
{
"epoch": 2.2185449358059914,
"grad_norm": 0.20568524425771714,
"learning_rate": 5.4523115309204154e-05,
"loss": 0.3904,
"step": 486
},
{
"epoch": 2.2231098430813123,
"grad_norm": 0.23433795861015624,
"learning_rate": 5.440417030933296e-05,
"loss": 0.3887,
"step": 487
},
{
"epoch": 2.227674750356633,
"grad_norm": 0.2637906019753822,
"learning_rate": 5.4285078783303204e-05,
"loss": 0.398,
"step": 488
},
{
"epoch": 2.2322396576319545,
"grad_norm": 0.27336753680428566,
"learning_rate": 5.41658419425713e-05,
"loss": 0.4012,
"step": 489
},
{
"epoch": 2.2368045649072754,
"grad_norm": 0.36009442351022874,
"learning_rate": 5.404646100007179e-05,
"loss": 0.3946,
"step": 490
},
{
"epoch": 2.2413694721825963,
"grad_norm": 0.34745386802364503,
"learning_rate": 5.3926937170205147e-05,
"loss": 0.3988,
"step": 491
},
{
"epoch": 2.245934379457917,
"grad_norm": 0.2560762047673561,
"learning_rate": 5.3807271668825336e-05,
"loss": 0.3981,
"step": 492
},
{
"epoch": 2.250499286733238,
"grad_norm": 0.3039971848868009,
"learning_rate": 5.368746571322746e-05,
"loss": 0.3983,
"step": 493
},
{
"epoch": 2.2550641940085594,
"grad_norm": 0.3132362113371044,
"learning_rate": 5.356752052213543e-05,
"loss": 0.3949,
"step": 494
},
{
"epoch": 2.2596291012838803,
"grad_norm": 0.24114011641199656,
"learning_rate": 5.344743731568947e-05,
"loss": 0.398,
"step": 495
},
{
"epoch": 2.264194008559201,
"grad_norm": 0.3047112832257029,
"learning_rate": 5.3327217315433836e-05,
"loss": 0.3942,
"step": 496
},
{
"epoch": 2.268758915834522,
"grad_norm": 0.2902868163178195,
"learning_rate": 5.320686174430426e-05,
"loss": 0.3968,
"step": 497
},
{
"epoch": 2.273323823109843,
"grad_norm": 0.21554384652324848,
"learning_rate": 5.30863718266156e-05,
"loss": 0.3945,
"step": 498
},
{
"epoch": 2.277888730385164,
"grad_norm": 0.27255050697912414,
"learning_rate": 5.296574878804931e-05,
"loss": 0.3968,
"step": 499
},
{
"epoch": 2.282453637660485,
"grad_norm": 0.24417856593881826,
"learning_rate": 5.284499385564105e-05,
"loss": 0.3943,
"step": 500
},
{
"epoch": 2.287018544935806,
"grad_norm": 0.2765091011577604,
"learning_rate": 5.272410825776817e-05,
"loss": 0.3977,
"step": 501
},
{
"epoch": 2.291583452211127,
"grad_norm": 0.25251367644483425,
"learning_rate": 5.260309322413717e-05,
"loss": 0.3965,
"step": 502
},
{
"epoch": 2.296148359486448,
"grad_norm": 0.225515136380665,
"learning_rate": 5.2481949985771296e-05,
"loss": 0.397,
"step": 503
},
{
"epoch": 2.300713266761769,
"grad_norm": 0.2463270131913551,
"learning_rate": 5.23606797749979e-05,
"loss": 0.3911,
"step": 504
},
{
"epoch": 2.30527817403709,
"grad_norm": 0.2004757886081446,
"learning_rate": 5.223928382543599e-05,
"loss": 0.4003,
"step": 505
},
{
"epoch": 2.309843081312411,
"grad_norm": 0.2455120366926149,
"learning_rate": 5.211776337198362e-05,
"loss": 0.3966,
"step": 506
},
{
"epoch": 2.314407988587732,
"grad_norm": 0.22468493984434562,
"learning_rate": 5.199611965080539e-05,
"loss": 0.3936,
"step": 507
},
{
"epoch": 2.3189728958630527,
"grad_norm": 0.2272194472097314,
"learning_rate": 5.187435389931984e-05,
"loss": 0.3995,
"step": 508
},
{
"epoch": 2.3235378031383735,
"grad_norm": 0.24682570308387836,
"learning_rate": 5.175246735618681e-05,
"loss": 0.3936,
"step": 509
},
{
"epoch": 2.328102710413695,
"grad_norm": 0.27271797206857185,
"learning_rate": 5.163046126129496e-05,
"loss": 0.3979,
"step": 510
},
{
"epoch": 2.3326676176890158,
"grad_norm": 0.32336247263945955,
"learning_rate": 5.1508336855749046e-05,
"loss": 0.3984,
"step": 511
},
{
"epoch": 2.3372325249643366,
"grad_norm": 0.3117192297000078,
"learning_rate": 5.138609538185732e-05,
"loss": 0.3957,
"step": 512
},
{
"epoch": 2.3417974322396575,
"grad_norm": 0.24977202926389208,
"learning_rate": 5.126373808311897e-05,
"loss": 0.401,
"step": 513
},
{
"epoch": 2.3463623395149784,
"grad_norm": 0.2724907116929327,
"learning_rate": 5.114126620421135e-05,
"loss": 0.3992,
"step": 514
},
{
"epoch": 2.3509272467902997,
"grad_norm": 0.30820474754190696,
"learning_rate": 5.101868099097741e-05,
"loss": 0.3891,
"step": 515
},
{
"epoch": 2.3554921540656206,
"grad_norm": 0.3185274771440913,
"learning_rate": 5.0895983690413013e-05,
"loss": 0.3901,
"step": 516
},
{
"epoch": 2.3600570613409415,
"grad_norm": 0.2581547898350787,
"learning_rate": 5.077317555065417e-05,
"loss": 0.3996,
"step": 517
},
{
"epoch": 2.3646219686162624,
"grad_norm": 0.3277207154296582,
"learning_rate": 5.065025782096443e-05,
"loss": 0.4004,
"step": 518
},
{
"epoch": 2.3691868758915833,
"grad_norm": 0.3507469445057821,
"learning_rate": 5.052723175172216e-05,
"loss": 0.3961,
"step": 519
},
{
"epoch": 2.3737517831669046,
"grad_norm": 0.31296981719792,
"learning_rate": 5.04040985944078e-05,
"loss": 0.3916,
"step": 520
},
{
"epoch": 2.3783166904422255,
"grad_norm": 0.25011058332771036,
"learning_rate": 5.0280859601591134e-05,
"loss": 0.3946,
"step": 521
},
{
"epoch": 2.3828815977175464,
"grad_norm": 0.2843447982406519,
"learning_rate": 5.015751602691853e-05,
"loss": 0.3951,
"step": 522
},
{
"epoch": 2.3874465049928673,
"grad_norm": 0.2611424665922288,
"learning_rate": 5.003406912510028e-05,
"loss": 0.3905,
"step": 523
},
{
"epoch": 2.392011412268188,
"grad_norm": 0.24619751471031087,
"learning_rate": 4.99105201518977e-05,
"loss": 0.395,
"step": 524
},
{
"epoch": 2.3965763195435095,
"grad_norm": 0.2808716720105363,
"learning_rate": 4.9786870364110496e-05,
"loss": 0.3999,
"step": 525
},
{
"epoch": 2.4011412268188304,
"grad_norm": 0.2353990460137413,
"learning_rate": 4.9663121019563825e-05,
"loss": 0.3998,
"step": 526
},
{
"epoch": 2.4057061340941512,
"grad_norm": 0.2143855784356749,
"learning_rate": 4.953927337709564e-05,
"loss": 0.39,
"step": 527
},
{
"epoch": 2.410271041369472,
"grad_norm": 0.30814345667504994,
"learning_rate": 4.941532869654383e-05,
"loss": 0.3946,
"step": 528
},
{
"epoch": 2.414835948644793,
"grad_norm": 0.2524848431138124,
"learning_rate": 4.929128823873338e-05,
"loss": 0.3916,
"step": 529
},
{
"epoch": 2.419400855920114,
"grad_norm": 0.2949956098139477,
"learning_rate": 4.916715326546356e-05,
"loss": 0.3973,
"step": 530
},
{
"epoch": 2.423965763195435,
"grad_norm": 0.2866631443858236,
"learning_rate": 4.9042925039495126e-05,
"loss": 0.4008,
"step": 531
},
{
"epoch": 2.428530670470756,
"grad_norm": 0.24434538853731672,
"learning_rate": 4.8918604824537426e-05,
"loss": 0.3925,
"step": 532
},
{
"epoch": 2.433095577746077,
"grad_norm": 0.27376916436365306,
"learning_rate": 4.879419388523557e-05,
"loss": 0.396,
"step": 533
},
{
"epoch": 2.437660485021398,
"grad_norm": 0.29192166428895333,
"learning_rate": 4.866969348715755e-05,
"loss": 0.3939,
"step": 534
},
{
"epoch": 2.442225392296719,
"grad_norm": 0.22678026571928478,
"learning_rate": 4.8545104896781396e-05,
"loss": 0.3931,
"step": 535
},
{
"epoch": 2.44679029957204,
"grad_norm": 0.2714093005509255,
"learning_rate": 4.8420429381482254e-05,
"loss": 0.3918,
"step": 536
},
{
"epoch": 2.451355206847361,
"grad_norm": 0.3672921491066907,
"learning_rate": 4.829566820951953e-05,
"loss": 0.3989,
"step": 537
},
{
"epoch": 2.455920114122682,
"grad_norm": 0.37866544196679924,
"learning_rate": 4.817082265002395e-05,
"loss": 0.3945,
"step": 538
},
{
"epoch": 2.4604850213980027,
"grad_norm": 0.342928509500153,
"learning_rate": 4.804589397298467e-05,
"loss": 0.3926,
"step": 539
},
{
"epoch": 2.4650499286733236,
"grad_norm": 0.30273238621233484,
"learning_rate": 4.792088344923639e-05,
"loss": 0.3964,
"step": 540
},
{
"epoch": 2.469614835948645,
"grad_norm": 0.2137070264105446,
"learning_rate": 4.779579235044635e-05,
"loss": 0.4013,
"step": 541
},
{
"epoch": 2.474179743223966,
"grad_norm": 0.3118277046924234,
"learning_rate": 4.767062194910147e-05,
"loss": 0.3927,
"step": 542
},
{
"epoch": 2.4787446504992867,
"grad_norm": 0.34415527393891965,
"learning_rate": 4.7545373518495376e-05,
"loss": 0.3981,
"step": 543
},
{
"epoch": 2.4833095577746076,
"grad_norm": 0.25521983324842423,
"learning_rate": 4.7420048332715424e-05,
"loss": 0.3993,
"step": 544
},
{
"epoch": 2.4878744650499285,
"grad_norm": 0.2861375081450235,
"learning_rate": 4.7294647666629764e-05,
"loss": 0.3962,
"step": 545
},
{
"epoch": 2.49243937232525,
"grad_norm": 0.35327944054617416,
"learning_rate": 4.716917279587438e-05,
"loss": 0.3929,
"step": 546
},
{
"epoch": 2.4970042796005707,
"grad_norm": 0.3065127258311644,
"learning_rate": 4.704362499684009e-05,
"loss": 0.3917,
"step": 547
},
{
"epoch": 2.5015691868758916,
"grad_norm": 0.2971240514783471,
"learning_rate": 4.691800554665959e-05,
"loss": 0.3951,
"step": 548
},
{
"epoch": 2.5061340941512125,
"grad_norm": 0.2526840084770543,
"learning_rate": 4.679231572319442e-05,
"loss": 0.3965,
"step": 549
},
{
"epoch": 2.5106990014265333,
"grad_norm": 0.27506914680614103,
"learning_rate": 4.666655680502203e-05,
"loss": 0.3909,
"step": 550
},
{
"epoch": 2.5152639087018542,
"grad_norm": 0.30990634996692296,
"learning_rate": 4.654073007142268e-05,
"loss": 0.3991,
"step": 551
},
{
"epoch": 2.5198288159771756,
"grad_norm": 0.2418594039084614,
"learning_rate": 4.641483680236654e-05,
"loss": 0.3942,
"step": 552
},
{
"epoch": 2.5243937232524964,
"grad_norm": 0.2176242576204114,
"learning_rate": 4.628887827850056e-05,
"loss": 0.3925,
"step": 553
},
{
"epoch": 2.5289586305278173,
"grad_norm": 0.21882885808826003,
"learning_rate": 4.6162855781135534e-05,
"loss": 0.3967,
"step": 554
},
{
"epoch": 2.533523537803138,
"grad_norm": 0.2753010159681235,
"learning_rate": 4.6036770592233e-05,
"loss": 0.3988,
"step": 555
},
{
"epoch": 2.5380884450784595,
"grad_norm": 0.27865187586221307,
"learning_rate": 4.591062399439223e-05,
"loss": 0.3892,
"step": 556
},
{
"epoch": 2.5426533523537804,
"grad_norm": 0.25469855214873266,
"learning_rate": 4.578441727083718e-05,
"loss": 0.3975,
"step": 557
},
{
"epoch": 2.5472182596291013,
"grad_norm": 0.20957376603739114,
"learning_rate": 4.5658151705403416e-05,
"loss": 0.4014,
"step": 558
},
{
"epoch": 2.551783166904422,
"grad_norm": 0.2600853257379872,
"learning_rate": 4.553182858252514e-05,
"loss": 0.3971,
"step": 559
},
{
"epoch": 2.556348074179743,
"grad_norm": 0.2776565181123319,
"learning_rate": 4.540544918722195e-05,
"loss": 0.4039,
"step": 560
},
{
"epoch": 2.560912981455064,
"grad_norm": 0.23379879408515905,
"learning_rate": 4.527901480508595e-05,
"loss": 0.3945,
"step": 561
},
{
"epoch": 2.5654778887303853,
"grad_norm": 0.24124737895746404,
"learning_rate": 4.515252672226858e-05,
"loss": 0.3945,
"step": 562
},
{
"epoch": 2.570042796005706,
"grad_norm": 0.2917521663677071,
"learning_rate": 4.5025986225467533e-05,
"loss": 0.3934,
"step": 563
},
{
"epoch": 2.574607703281027,
"grad_norm": 0.2530412971141297,
"learning_rate": 4.4899394601913724e-05,
"loss": 0.3937,
"step": 564
},
{
"epoch": 2.579172610556348,
"grad_norm": 0.22637706164941146,
"learning_rate": 4.477275313935807e-05,
"loss": 0.3926,
"step": 565
},
{
"epoch": 2.5837375178316693,
"grad_norm": 0.23838840534635813,
"learning_rate": 4.464606312605858e-05,
"loss": 0.3977,
"step": 566
},
{
"epoch": 2.58830242510699,
"grad_norm": 0.24947138899802643,
"learning_rate": 4.451932585076707e-05,
"loss": 0.3959,
"step": 567
},
{
"epoch": 2.592867332382311,
"grad_norm": 0.27260964086966344,
"learning_rate": 4.439254260271615e-05,
"loss": 0.3914,
"step": 568
},
{
"epoch": 2.597432239657632,
"grad_norm": 0.23394296464858344,
"learning_rate": 4.426571467160609e-05,
"loss": 0.3987,
"step": 569
},
{
"epoch": 2.601997146932953,
"grad_norm": 0.24289503390540143,
"learning_rate": 4.413884334759169e-05,
"loss": 0.3939,
"step": 570
},
{
"epoch": 2.6065620542082737,
"grad_norm": 0.29117956540296497,
"learning_rate": 4.401192992126918e-05,
"loss": 0.3956,
"step": 571
},
{
"epoch": 2.611126961483595,
"grad_norm": 0.2237089907238584,
"learning_rate": 4.3884975683663076e-05,
"loss": 0.3955,
"step": 572
},
{
"epoch": 2.615691868758916,
"grad_norm": 0.24264576691075865,
"learning_rate": 4.375798192621298e-05,
"loss": 0.3928,
"step": 573
},
{
"epoch": 2.620256776034237,
"grad_norm": 0.2751685982168088,
"learning_rate": 4.363094994076063e-05,
"loss": 0.3966,
"step": 574
},
{
"epoch": 2.6248216833095577,
"grad_norm": 0.28747813965799607,
"learning_rate": 4.350388101953652e-05,
"loss": 0.3943,
"step": 575
},
{
"epoch": 2.629386590584879,
"grad_norm": 0.2805974268000736,
"learning_rate": 4.337677645514696e-05,
"loss": 0.3937,
"step": 576
},
{
"epoch": 2.6339514978602,
"grad_norm": 0.25668107207724133,
"learning_rate": 4.3249637540560775e-05,
"loss": 0.3992,
"step": 577
},
{
"epoch": 2.6385164051355208,
"grad_norm": 0.22123626260157714,
"learning_rate": 4.312246556909625e-05,
"loss": 0.3905,
"step": 578
},
{
"epoch": 2.6430813124108417,
"grad_norm": 0.2889764064202931,
"learning_rate": 4.299526183440795e-05,
"loss": 0.3939,
"step": 579
},
{
"epoch": 2.6476462196861625,
"grad_norm": 0.2813860421214819,
"learning_rate": 4.286802763047351e-05,
"loss": 0.3915,
"step": 580
},
{
"epoch": 2.6522111269614834,
"grad_norm": 0.19910247285499882,
"learning_rate": 4.274076425158056e-05,
"loss": 0.3948,
"step": 581
},
{
"epoch": 2.6567760342368043,
"grad_norm": 0.29602208357801046,
"learning_rate": 4.2613472992313475e-05,
"loss": 0.394,
"step": 582
},
{
"epoch": 2.6613409415121256,
"grad_norm": 0.3211859467751027,
"learning_rate": 4.2486155147540275e-05,
"loss": 0.3952,
"step": 583
},
{
"epoch": 2.6659058487874465,
"grad_norm": 0.2957713814210968,
"learning_rate": 4.235881201239938e-05,
"loss": 0.3944,
"step": 584
},
{
"epoch": 2.6704707560627674,
"grad_norm": 0.19210627965781088,
"learning_rate": 4.22314448822865e-05,
"loss": 0.3973,
"step": 585
},
{
"epoch": 2.6750356633380883,
"grad_norm": 0.22185230850956422,
"learning_rate": 4.210405505284146e-05,
"loss": 0.3913,
"step": 586
},
{
"epoch": 2.6796005706134096,
"grad_norm": 0.26397988476550216,
"learning_rate": 4.197664381993495e-05,
"loss": 0.3933,
"step": 587
},
{
"epoch": 2.6841654778887305,
"grad_norm": 0.2616617955056205,
"learning_rate": 4.1849212479655404e-05,
"loss": 0.3978,
"step": 588
},
{
"epoch": 2.6887303851640514,
"grad_norm": 0.2067735440913514,
"learning_rate": 4.172176232829579e-05,
"loss": 0.3918,
"step": 589
},
{
"epoch": 2.6932952924393723,
"grad_norm": 0.2129097941200269,
"learning_rate": 4.159429466234042e-05,
"loss": 0.3934,
"step": 590
},
{
"epoch": 2.697860199714693,
"grad_norm": 0.19320324303685169,
"learning_rate": 4.146681077845184e-05,
"loss": 0.4005,
"step": 591
},
{
"epoch": 2.702425106990014,
"grad_norm": 0.21243002834135477,
"learning_rate": 4.133931197345747e-05,
"loss": 0.3914,
"step": 592
},
{
"epoch": 2.7069900142653354,
"grad_norm": 0.21832438259968404,
"learning_rate": 4.1211799544336604e-05,
"loss": 0.3969,
"step": 593
},
{
"epoch": 2.7115549215406562,
"grad_norm": 0.22033236523137045,
"learning_rate": 4.108427478820707e-05,
"loss": 0.3914,
"step": 594
},
{
"epoch": 2.716119828815977,
"grad_norm": 0.22106513057663857,
"learning_rate": 4.095673900231212e-05,
"loss": 0.3951,
"step": 595
},
{
"epoch": 2.720684736091298,
"grad_norm": 0.22871838319618967,
"learning_rate": 4.0829193484007216e-05,
"loss": 0.3965,
"step": 596
},
{
"epoch": 2.7252496433666193,
"grad_norm": 0.22285026812146833,
"learning_rate": 4.070163953074676e-05,
"loss": 0.3896,
"step": 597
},
{
"epoch": 2.7298145506419402,
"grad_norm": 0.21511862495234116,
"learning_rate": 4.0574078440071056e-05,
"loss": 0.3908,
"step": 598
},
{
"epoch": 2.734379457917261,
"grad_norm": 0.21107167137669283,
"learning_rate": 4.044651150959294e-05,
"loss": 0.3917,
"step": 599
},
{
"epoch": 2.738944365192582,
"grad_norm": 0.23870814693322587,
"learning_rate": 4.031894003698467e-05,
"loss": 0.3955,
"step": 600
},
{
"epoch": 2.743509272467903,
"grad_norm": 0.22220033003757905,
"learning_rate": 4.0191365319964724e-05,
"loss": 0.3917,
"step": 601
},
{
"epoch": 2.7480741797432238,
"grad_norm": 0.19453278202618698,
"learning_rate": 4.006378865628455e-05,
"loss": 0.3893,
"step": 602
},
{
"epoch": 2.752639087018545,
"grad_norm": 0.23239542400687424,
"learning_rate": 3.993621134371545e-05,
"loss": 0.3933,
"step": 603
},
{
"epoch": 2.757203994293866,
"grad_norm": 0.24428871107116487,
"learning_rate": 3.980863468003529e-05,
"loss": 0.3935,
"step": 604
},
{
"epoch": 2.761768901569187,
"grad_norm": 0.19760601046120077,
"learning_rate": 3.968105996301535e-05,
"loss": 0.3918,
"step": 605
},
{
"epoch": 2.7663338088445077,
"grad_norm": 0.2501742444848214,
"learning_rate": 3.955348849040707e-05,
"loss": 0.3897,
"step": 606
},
{
"epoch": 2.770898716119829,
"grad_norm": 0.2319084390296337,
"learning_rate": 3.942592155992895e-05,
"loss": 0.3961,
"step": 607
},
{
"epoch": 2.77546362339515,
"grad_norm": 0.2244356749086854,
"learning_rate": 3.929836046925323e-05,
"loss": 0.3943,
"step": 608
},
{
"epoch": 2.780028530670471,
"grad_norm": 0.21979544227639491,
"learning_rate": 3.91708065159928e-05,
"loss": 0.3955,
"step": 609
},
{
"epoch": 2.7845934379457917,
"grad_norm": 0.19742195901212328,
"learning_rate": 3.904326099768789e-05,
"loss": 0.3949,
"step": 610
},
{
"epoch": 2.7891583452211126,
"grad_norm": 0.22234755877545048,
"learning_rate": 3.8915725211792944e-05,
"loss": 0.3953,
"step": 611
},
{
"epoch": 2.7937232524964335,
"grad_norm": 0.2231966829166695,
"learning_rate": 3.8788200455663416e-05,
"loss": 0.3881,
"step": 612
},
{
"epoch": 2.7982881597717544,
"grad_norm": 0.23610363359578615,
"learning_rate": 3.8660688026542544e-05,
"loss": 0.3953,
"step": 613
},
{
"epoch": 2.8028530670470757,
"grad_norm": 0.24368628185935592,
"learning_rate": 3.853318922154818e-05,
"loss": 0.3913,
"step": 614
},
{
"epoch": 2.8074179743223966,
"grad_norm": 0.22986532483466307,
"learning_rate": 3.840570533765959e-05,
"loss": 0.393,
"step": 615
},
{
"epoch": 2.8119828815977175,
"grad_norm": 0.2384209501930521,
"learning_rate": 3.827823767170423e-05,
"loss": 0.391,
"step": 616
},
{
"epoch": 2.8165477888730384,
"grad_norm": 0.22125793484405842,
"learning_rate": 3.815078752034461e-05,
"loss": 0.3946,
"step": 617
},
{
"epoch": 2.8211126961483597,
"grad_norm": 0.2535177819158247,
"learning_rate": 3.802335618006506e-05,
"loss": 0.3915,
"step": 618
},
{
"epoch": 2.8256776034236806,
"grad_norm": 0.2396475170274598,
"learning_rate": 3.7895944947158535e-05,
"loss": 0.3958,
"step": 619
},
{
"epoch": 2.8302425106990015,
"grad_norm": 0.20749976321329167,
"learning_rate": 3.77685551177135e-05,
"loss": 0.3952,
"step": 620
},
{
"epoch": 2.8348074179743223,
"grad_norm": 0.16508755912089354,
"learning_rate": 3.764118798760064e-05,
"loss": 0.3844,
"step": 621
},
{
"epoch": 2.8393723252496432,
"grad_norm": 0.20726854721909735,
"learning_rate": 3.7513844852459745e-05,
"loss": 0.3945,
"step": 622
},
{
"epoch": 2.843937232524964,
"grad_norm": 0.18560003219863108,
"learning_rate": 3.738652700768653e-05,
"loss": 0.3974,
"step": 623
},
{
"epoch": 2.8485021398002854,
"grad_norm": 0.17587194547537272,
"learning_rate": 3.725923574841945e-05,
"loss": 0.3903,
"step": 624
},
{
"epoch": 2.8530670470756063,
"grad_norm": 0.2133689614478487,
"learning_rate": 3.7131972369526505e-05,
"loss": 0.3939,
"step": 625
},
{
"epoch": 2.857631954350927,
"grad_norm": 0.17416385677862986,
"learning_rate": 3.700473816559207e-05,
"loss": 0.3913,
"step": 626
},
{
"epoch": 2.862196861626248,
"grad_norm": 0.17910642220727754,
"learning_rate": 3.687753443090375e-05,
"loss": 0.3991,
"step": 627
},
{
"epoch": 2.8667617689015694,
"grad_norm": 0.18645917268183274,
"learning_rate": 3.675036245943923e-05,
"loss": 0.3933,
"step": 628
},
{
"epoch": 2.8713266761768903,
"grad_norm": 0.22702514829934878,
"learning_rate": 3.662322354485306e-05,
"loss": 0.3927,
"step": 629
},
{
"epoch": 2.875891583452211,
"grad_norm": 0.19751990109383138,
"learning_rate": 3.6496118980463486e-05,
"loss": 0.3913,
"step": 630
},
{
"epoch": 2.880456490727532,
"grad_norm": 0.17351220103657103,
"learning_rate": 3.6369050059239384e-05,
"loss": 0.3918,
"step": 631
},
{
"epoch": 2.885021398002853,
"grad_norm": 0.20221410813950186,
"learning_rate": 3.624201807378703e-05,
"loss": 0.3934,
"step": 632
},
{
"epoch": 2.889586305278174,
"grad_norm": 0.15933130426619596,
"learning_rate": 3.6115024316336944e-05,
"loss": 0.3891,
"step": 633
},
{
"epoch": 2.894151212553495,
"grad_norm": 0.18767258317171362,
"learning_rate": 3.598807007873083e-05,
"loss": 0.3935,
"step": 634
},
{
"epoch": 2.898716119828816,
"grad_norm": 0.17072434668538097,
"learning_rate": 3.586115665240832e-05,
"loss": 0.3923,
"step": 635
},
{
"epoch": 2.903281027104137,
"grad_norm": 0.19788995991405794,
"learning_rate": 3.573428532839392e-05,
"loss": 0.3924,
"step": 636
},
{
"epoch": 2.907845934379458,
"grad_norm": 0.18823937945201608,
"learning_rate": 3.560745739728387e-05,
"loss": 0.389,
"step": 637
},
{
"epoch": 2.912410841654779,
"grad_norm": 0.16176660376720053,
"learning_rate": 3.548067414923294e-05,
"loss": 0.3974,
"step": 638
},
{
"epoch": 2.9169757489301,
"grad_norm": 0.15959896392390563,
"learning_rate": 3.5353936873941435e-05,
"loss": 0.3888,
"step": 639
},
{
"epoch": 2.921540656205421,
"grad_norm": 0.161034937587383,
"learning_rate": 3.522724686064194e-05,
"loss": 0.3953,
"step": 640
},
{
"epoch": 2.926105563480742,
"grad_norm": 0.18385006824388012,
"learning_rate": 3.5100605398086296e-05,
"loss": 0.3923,
"step": 641
},
{
"epoch": 2.9306704707560627,
"grad_norm": 0.16430295552203472,
"learning_rate": 3.497401377453247e-05,
"loss": 0.3916,
"step": 642
},
{
"epoch": 2.9352353780313836,
"grad_norm": 0.15722062849324564,
"learning_rate": 3.484747327773142e-05,
"loss": 0.3887,
"step": 643
},
{
"epoch": 2.9398002853067045,
"grad_norm": 0.14393903212852108,
"learning_rate": 3.472098519491406e-05,
"loss": 0.3914,
"step": 644
},
{
"epoch": 2.944365192582026,
"grad_norm": 0.17232972585909226,
"learning_rate": 3.459455081277806e-05,
"loss": 0.393,
"step": 645
},
{
"epoch": 2.9489300998573467,
"grad_norm": 0.16872545932202826,
"learning_rate": 3.446817141747487e-05,
"loss": 0.394,
"step": 646
},
{
"epoch": 2.9534950071326675,
"grad_norm": 0.20373345038244411,
"learning_rate": 3.434184829459659e-05,
"loss": 0.3955,
"step": 647
},
{
"epoch": 2.9580599144079884,
"grad_norm": 0.16582792220621265,
"learning_rate": 3.421558272916284e-05,
"loss": 0.3897,
"step": 648
},
{
"epoch": 2.9626248216833098,
"grad_norm": 0.17195177157795527,
"learning_rate": 3.408937600560778e-05,
"loss": 0.3931,
"step": 649
},
{
"epoch": 2.9671897289586306,
"grad_norm": 0.17099363542536267,
"learning_rate": 3.3963229407767014e-05,
"loss": 0.3932,
"step": 650
},
{
"epoch": 2.9717546362339515,
"grad_norm": 0.18406454501706795,
"learning_rate": 3.3837144218864466e-05,
"loss": 0.3926,
"step": 651
},
{
"epoch": 2.9763195435092724,
"grad_norm": 0.17696057601736395,
"learning_rate": 3.371112172149945e-05,
"loss": 0.3951,
"step": 652
},
{
"epoch": 2.9808844507845933,
"grad_norm": 0.20588927188905604,
"learning_rate": 3.358516319763348e-05,
"loss": 0.3908,
"step": 653
},
{
"epoch": 2.985449358059914,
"grad_norm": 0.15024406311611282,
"learning_rate": 3.3459269928577326e-05,
"loss": 0.3965,
"step": 654
},
{
"epoch": 2.9900142653352355,
"grad_norm": 0.18315742998535253,
"learning_rate": 3.3333443194977985e-05,
"loss": 0.3878,
"step": 655
},
{
"epoch": 2.9945791726105564,
"grad_norm": 0.15190454402880918,
"learning_rate": 3.32076842768056e-05,
"loss": 0.4001,
"step": 656
},
{
"epoch": 2.9991440798858773,
"grad_norm": 0.1802522348541421,
"learning_rate": 3.3081994453340425e-05,
"loss": 0.3912,
"step": 657
},
{
"epoch": 3.003708987161198,
"grad_norm": 0.4371593688422328,
"learning_rate": 3.295637500315992e-05,
"loss": 0.7621,
"step": 658
},
{
"epoch": 3.008273894436519,
"grad_norm": 0.44775433837572887,
"learning_rate": 3.283082720412563e-05,
"loss": 0.3662,
"step": 659
},
{
"epoch": 3.0128388017118404,
"grad_norm": 0.3890902219171831,
"learning_rate": 3.270535233337024e-05,
"loss": 0.3743,
"step": 660
},
{
"epoch": 3.0174037089871613,
"grad_norm": 0.3003030553223454,
"learning_rate": 3.2579951667284596e-05,
"loss": 0.3721,
"step": 661
},
{
"epoch": 3.021968616262482,
"grad_norm": 0.3515656966921728,
"learning_rate": 3.245462648150463e-05,
"loss": 0.3684,
"step": 662
},
{
"epoch": 3.026533523537803,
"grad_norm": 0.2934678186313399,
"learning_rate": 3.232937805089854e-05,
"loss": 0.3688,
"step": 663
},
{
"epoch": 3.031098430813124,
"grad_norm": 0.2665389758970002,
"learning_rate": 3.2204207649553665e-05,
"loss": 0.3687,
"step": 664
},
{
"epoch": 3.0356633380884452,
"grad_norm": 0.2886505257763211,
"learning_rate": 3.2079116550763624e-05,
"loss": 0.3715,
"step": 665
},
{
"epoch": 3.040228245363766,
"grad_norm": 0.26410869047435404,
"learning_rate": 3.195410602701535e-05,
"loss": 0.3755,
"step": 666
},
{
"epoch": 3.044793152639087,
"grad_norm": 0.33726378634419146,
"learning_rate": 3.182917734997607e-05,
"loss": 0.3715,
"step": 667
},
{
"epoch": 3.049358059914408,
"grad_norm": 0.23119146133187443,
"learning_rate": 3.170433179048048e-05,
"loss": 0.3752,
"step": 668
},
{
"epoch": 3.0539229671897288,
"grad_norm": 0.317803893708356,
"learning_rate": 3.157957061851775e-05,
"loss": 0.3668,
"step": 669
},
{
"epoch": 3.05848787446505,
"grad_norm": 0.32056814826697005,
"learning_rate": 3.1454895103218604e-05,
"loss": 0.3684,
"step": 670
},
{
"epoch": 3.063052781740371,
"grad_norm": 0.2227493640724446,
"learning_rate": 3.133030651284246e-05,
"loss": 0.3708,
"step": 671
},
{
"epoch": 3.067617689015692,
"grad_norm": 0.3474416944410334,
"learning_rate": 3.1205806114764455e-05,
"loss": 0.37,
"step": 672
},
{
"epoch": 3.0721825962910128,
"grad_norm": 0.23598864945070333,
"learning_rate": 3.108139517546259e-05,
"loss": 0.3679,
"step": 673
},
{
"epoch": 3.0767475035663336,
"grad_norm": 0.26225028839730374,
"learning_rate": 3.095707496050489e-05,
"loss": 0.3711,
"step": 674
},
{
"epoch": 3.081312410841655,
"grad_norm": 0.30541556508523,
"learning_rate": 3.083284673453645e-05,
"loss": 0.3705,
"step": 675
},
{
"epoch": 3.085877318116976,
"grad_norm": 0.2375554525248402,
"learning_rate": 3.070871176126664e-05,
"loss": 0.3674,
"step": 676
},
{
"epoch": 3.0904422253922967,
"grad_norm": 0.24938738146004832,
"learning_rate": 3.058467130345619e-05,
"loss": 0.3693,
"step": 677
},
{
"epoch": 3.0950071326676176,
"grad_norm": 0.194861014641011,
"learning_rate": 3.0460726622904362e-05,
"loss": 0.3709,
"step": 678
},
{
"epoch": 3.0995720399429385,
"grad_norm": 0.22427886537482727,
"learning_rate": 3.033687898043619e-05,
"loss": 0.3674,
"step": 679
},
{
"epoch": 3.10413694721826,
"grad_norm": 0.1931234684152515,
"learning_rate": 3.0213129635889527e-05,
"loss": 0.3707,
"step": 680
},
{
"epoch": 3.1087018544935807,
"grad_norm": 0.19750887437108353,
"learning_rate": 3.0089479848102302e-05,
"loss": 0.3726,
"step": 681
},
{
"epoch": 3.1132667617689016,
"grad_norm": 0.22984346500906291,
"learning_rate": 2.9965930874899734e-05,
"loss": 0.3691,
"step": 682
},
{
"epoch": 3.1178316690442225,
"grad_norm": 0.16695271136501227,
"learning_rate": 2.984248397308149e-05,
"loss": 0.3669,
"step": 683
},
{
"epoch": 3.1223965763195434,
"grad_norm": 0.17893989774867847,
"learning_rate": 2.971914039840888e-05,
"loss": 0.369,
"step": 684
},
{
"epoch": 3.1269614835948643,
"grad_norm": 0.17571871130738057,
"learning_rate": 2.9595901405592215e-05,
"loss": 0.3716,
"step": 685
},
{
"epoch": 3.1315263908701856,
"grad_norm": 0.1594921785853266,
"learning_rate": 2.947276824827784e-05,
"loss": 0.3712,
"step": 686
},
{
"epoch": 3.1360912981455065,
"grad_norm": 0.17980710605526548,
"learning_rate": 2.9349742179035575e-05,
"loss": 0.3656,
"step": 687
},
{
"epoch": 3.1406562054208274,
"grad_norm": 0.18019296867903034,
"learning_rate": 2.9226824449345854e-05,
"loss": 0.3711,
"step": 688
},
{
"epoch": 3.1452211126961482,
"grad_norm": 0.20009151820526971,
"learning_rate": 2.910401630958699e-05,
"loss": 0.3696,
"step": 689
},
{
"epoch": 3.1497860199714696,
"grad_norm": 0.19469556991313045,
"learning_rate": 2.898131900902259e-05,
"loss": 0.3664,
"step": 690
},
{
"epoch": 3.1543509272467904,
"grad_norm": 0.1658297168263364,
"learning_rate": 2.8858733795788666e-05,
"loss": 0.3698,
"step": 691
},
{
"epoch": 3.1589158345221113,
"grad_norm": 0.1770728229288905,
"learning_rate": 2.873626191688104e-05,
"loss": 0.3707,
"step": 692
},
{
"epoch": 3.163480741797432,
"grad_norm": 0.158629076656555,
"learning_rate": 2.8613904618142698e-05,
"loss": 0.369,
"step": 693
},
{
"epoch": 3.168045649072753,
"grad_norm": 0.15694838864187477,
"learning_rate": 2.8491663144250964e-05,
"loss": 0.3714,
"step": 694
},
{
"epoch": 3.172610556348074,
"grad_norm": 0.17453143005622082,
"learning_rate": 2.836953873870505e-05,
"loss": 0.3724,
"step": 695
},
{
"epoch": 3.1771754636233953,
"grad_norm": 0.1700537102783032,
"learning_rate": 2.824753264381319e-05,
"loss": 0.3692,
"step": 696
},
{
"epoch": 3.181740370898716,
"grad_norm": 0.16759539668413778,
"learning_rate": 2.812564610068017e-05,
"loss": 0.371,
"step": 697
},
{
"epoch": 3.186305278174037,
"grad_norm": 0.16867962773238068,
"learning_rate": 2.800388034919461e-05,
"loss": 0.3662,
"step": 698
},
{
"epoch": 3.190870185449358,
"grad_norm": 0.19731656299531658,
"learning_rate": 2.788223662801639e-05,
"loss": 0.3731,
"step": 699
},
{
"epoch": 3.195435092724679,
"grad_norm": 0.14390885288139355,
"learning_rate": 2.776071617456402e-05,
"loss": 0.3685,
"step": 700
},
{
"epoch": 3.2,
"grad_norm": 0.17429954735264463,
"learning_rate": 2.7639320225002108e-05,
"loss": 0.3692,
"step": 701
},
{
"epoch": 3.204564907275321,
"grad_norm": 0.16193213783821153,
"learning_rate": 2.7518050014228707e-05,
"loss": 0.3694,
"step": 702
},
{
"epoch": 3.209129814550642,
"grad_norm": 0.17943687683108553,
"learning_rate": 2.739690677586284e-05,
"loss": 0.3735,
"step": 703
},
{
"epoch": 3.213694721825963,
"grad_norm": 0.16115261882412688,
"learning_rate": 2.7275891742231847e-05,
"loss": 0.3688,
"step": 704
},
{
"epoch": 3.2182596291012837,
"grad_norm": 0.15989318235835606,
"learning_rate": 2.7155006144358958e-05,
"loss": 0.3669,
"step": 705
},
{
"epoch": 3.222824536376605,
"grad_norm": 0.1833740301603784,
"learning_rate": 2.70342512119507e-05,
"loss": 0.3681,
"step": 706
},
{
"epoch": 3.227389443651926,
"grad_norm": 0.1701603517000583,
"learning_rate": 2.691362817338442e-05,
"loss": 0.3732,
"step": 707
},
{
"epoch": 3.231954350927247,
"grad_norm": 0.17981663756006744,
"learning_rate": 2.6793138255695743e-05,
"loss": 0.3669,
"step": 708
},
{
"epoch": 3.2365192582025677,
"grad_norm": 0.1611749828625592,
"learning_rate": 2.6672782684566167e-05,
"loss": 0.368,
"step": 709
},
{
"epoch": 3.2410841654778886,
"grad_norm": 0.1704405344311701,
"learning_rate": 2.6552562684310532e-05,
"loss": 0.3746,
"step": 710
},
{
"epoch": 3.24564907275321,
"grad_norm": 0.15452048356052153,
"learning_rate": 2.6432479477864588e-05,
"loss": 0.3668,
"step": 711
},
{
"epoch": 3.250213980028531,
"grad_norm": 0.1636048167954058,
"learning_rate": 2.6312534286772558e-05,
"loss": 0.3665,
"step": 712
},
{
"epoch": 3.2547788873038517,
"grad_norm": 0.15805813062246202,
"learning_rate": 2.619272833117468e-05,
"loss": 0.3671,
"step": 713
},
{
"epoch": 3.2593437945791726,
"grad_norm": 0.15722251462834871,
"learning_rate": 2.6073062829794863e-05,
"loss": 0.3702,
"step": 714
},
{
"epoch": 3.2639087018544934,
"grad_norm": 0.1515655825894355,
"learning_rate": 2.5953538999928217e-05,
"loss": 0.3698,
"step": 715
},
{
"epoch": 3.2684736091298143,
"grad_norm": 0.15294949505967717,
"learning_rate": 2.5834158057428704e-05,
"loss": 0.3659,
"step": 716
},
{
"epoch": 3.2730385164051357,
"grad_norm": 0.17430429089098687,
"learning_rate": 2.5714921216696806e-05,
"loss": 0.3658,
"step": 717
},
{
"epoch": 3.2776034236804565,
"grad_norm": 0.15283446685063015,
"learning_rate": 2.559582969066706e-05,
"loss": 0.3655,
"step": 718
},
{
"epoch": 3.2821683309557774,
"grad_norm": 0.15900008730134152,
"learning_rate": 2.5476884690795853e-05,
"loss": 0.3686,
"step": 719
},
{
"epoch": 3.2867332382310983,
"grad_norm": 0.17376312137687447,
"learning_rate": 2.5358087427049016e-05,
"loss": 0.3679,
"step": 720
},
{
"epoch": 3.2912981455064196,
"grad_norm": 0.16367071666419647,
"learning_rate": 2.523943910788953e-05,
"loss": 0.3652,
"step": 721
},
{
"epoch": 3.2958630527817405,
"grad_norm": 0.1493453159768574,
"learning_rate": 2.5120940940265276e-05,
"loss": 0.3712,
"step": 722
},
{
"epoch": 3.3004279600570614,
"grad_norm": 0.1600651037670884,
"learning_rate": 2.500259412959665e-05,
"loss": 0.368,
"step": 723
},
{
"epoch": 3.3049928673323823,
"grad_norm": 0.15510229863064848,
"learning_rate": 2.4884399879764437e-05,
"loss": 0.3714,
"step": 724
},
{
"epoch": 3.309557774607703,
"grad_norm": 0.1371776463153879,
"learning_rate": 2.4766359393097476e-05,
"loss": 0.3723,
"step": 725
},
{
"epoch": 3.314122681883024,
"grad_norm": 0.17941656580285384,
"learning_rate": 2.464847387036045e-05,
"loss": 0.3702,
"step": 726
},
{
"epoch": 3.3186875891583454,
"grad_norm": 0.13405327935015152,
"learning_rate": 2.4530744510741703e-05,
"loss": 0.3715,
"step": 727
},
{
"epoch": 3.3232524964336663,
"grad_norm": 0.16167817588999067,
"learning_rate": 2.4413172511840958e-05,
"loss": 0.3693,
"step": 728
},
{
"epoch": 3.327817403708987,
"grad_norm": 0.1509260817331625,
"learning_rate": 2.429575906965722e-05,
"loss": 0.3599,
"step": 729
},
{
"epoch": 3.332382310984308,
"grad_norm": 0.144720817062596,
"learning_rate": 2.4178505378576605e-05,
"loss": 0.3708,
"step": 730
},
{
"epoch": 3.336947218259629,
"grad_norm": 0.1424755317935734,
"learning_rate": 2.4061412631360116e-05,
"loss": 0.3647,
"step": 731
},
{
"epoch": 3.3415121255349503,
"grad_norm": 0.1582677626986222,
"learning_rate": 2.394448201913158e-05,
"loss": 0.3656,
"step": 732
},
{
"epoch": 3.346077032810271,
"grad_norm": 0.13727256015572376,
"learning_rate": 2.3827714731365513e-05,
"loss": 0.3669,
"step": 733
},
{
"epoch": 3.350641940085592,
"grad_norm": 0.1482716204121623,
"learning_rate": 2.3711111955875018e-05,
"loss": 0.3661,
"step": 734
},
{
"epoch": 3.355206847360913,
"grad_norm": 0.1429391445826853,
"learning_rate": 2.3594674878799656e-05,
"loss": 0.3702,
"step": 735
},
{
"epoch": 3.359771754636234,
"grad_norm": 0.15144931311040605,
"learning_rate": 2.347840468459351e-05,
"loss": 0.3722,
"step": 736
},
{
"epoch": 3.364336661911555,
"grad_norm": 0.1584841215631846,
"learning_rate": 2.336230255601296e-05,
"loss": 0.3651,
"step": 737
},
{
"epoch": 3.368901569186876,
"grad_norm": 0.15860593207749524,
"learning_rate": 2.324636967410481e-05,
"loss": 0.3706,
"step": 738
},
{
"epoch": 3.373466476462197,
"grad_norm": 0.16524287919377015,
"learning_rate": 2.3130607218194153e-05,
"loss": 0.3667,
"step": 739
},
{
"epoch": 3.3780313837375178,
"grad_norm": 0.1553010070756002,
"learning_rate": 2.3015016365872462e-05,
"loss": 0.3694,
"step": 740
},
{
"epoch": 3.3825962910128387,
"grad_norm": 0.19177517583446688,
"learning_rate": 2.289959829298558e-05,
"loss": 0.3684,
"step": 741
},
{
"epoch": 3.38716119828816,
"grad_norm": 0.13228111850501834,
"learning_rate": 2.2784354173621726e-05,
"loss": 0.3695,
"step": 742
},
{
"epoch": 3.391726105563481,
"grad_norm": 0.18742867451573136,
"learning_rate": 2.266928518009957e-05,
"loss": 0.3663,
"step": 743
},
{
"epoch": 3.3962910128388017,
"grad_norm": 0.14121057196985098,
"learning_rate": 2.2554392482956357e-05,
"loss": 0.362,
"step": 744
},
{
"epoch": 3.4008559201141226,
"grad_norm": 0.16109339289265964,
"learning_rate": 2.243967725093595e-05,
"loss": 0.3644,
"step": 745
},
{
"epoch": 3.4054208273894435,
"grad_norm": 0.13332750913355695,
"learning_rate": 2.2325140650976957e-05,
"loss": 0.3696,
"step": 746
},
{
"epoch": 3.4099857346647644,
"grad_norm": 0.14371453147838442,
"learning_rate": 2.221078384820082e-05,
"loss": 0.3672,
"step": 747
},
{
"epoch": 3.4145506419400857,
"grad_norm": 0.12959264833893533,
"learning_rate": 2.209660800590002e-05,
"loss": 0.369,
"step": 748
},
{
"epoch": 3.4191155492154066,
"grad_norm": 0.19007915393126823,
"learning_rate": 2.1982614285526214e-05,
"loss": 0.3695,
"step": 749
},
{
"epoch": 3.4236804564907275,
"grad_norm": 0.1374069769687416,
"learning_rate": 2.1868803846678475e-05,
"loss": 0.364,
"step": 750
},
{
"epoch": 3.4282453637660484,
"grad_norm": 0.16184614734626082,
"learning_rate": 2.1755177847091357e-05,
"loss": 0.3701,
"step": 751
},
{
"epoch": 3.4328102710413697,
"grad_norm": 0.12874829449890968,
"learning_rate": 2.1641737442623295e-05,
"loss": 0.3702,
"step": 752
},
{
"epoch": 3.4373751783166906,
"grad_norm": 0.15265797312556922,
"learning_rate": 2.1528483787244695e-05,
"loss": 0.3631,
"step": 753
},
{
"epoch": 3.4419400855920115,
"grad_norm": 0.13901496827521181,
"learning_rate": 2.1415418033026303e-05,
"loss": 0.3698,
"step": 754
},
{
"epoch": 3.4465049928673324,
"grad_norm": 0.14408559598989043,
"learning_rate": 2.1302541330127456e-05,
"loss": 0.3722,
"step": 755
},
{
"epoch": 3.4510699001426532,
"grad_norm": 0.1553274863431962,
"learning_rate": 2.1189854826784306e-05,
"loss": 0.373,
"step": 756
},
{
"epoch": 3.455634807417974,
"grad_norm": 0.13896160154548598,
"learning_rate": 2.107735966929828e-05,
"loss": 0.3674,
"step": 757
},
{
"epoch": 3.4601997146932955,
"grad_norm": 0.14190658414204307,
"learning_rate": 2.096505700202427e-05,
"loss": 0.3713,
"step": 758
},
{
"epoch": 3.4647646219686163,
"grad_norm": 0.13297816575448404,
"learning_rate": 2.0852947967359124e-05,
"loss": 0.3659,
"step": 759
},
{
"epoch": 3.4693295292439372,
"grad_norm": 0.13293797385160067,
"learning_rate": 2.0741033705729946e-05,
"loss": 0.3688,
"step": 760
},
{
"epoch": 3.473894436519258,
"grad_norm": 0.13476039599236855,
"learning_rate": 2.0629315355582493e-05,
"loss": 0.3734,
"step": 761
},
{
"epoch": 3.478459343794579,
"grad_norm": 0.12322657009585795,
"learning_rate": 2.0517794053369668e-05,
"loss": 0.3674,
"step": 762
},
{
"epoch": 3.4830242510699003,
"grad_norm": 0.11867881868833804,
"learning_rate": 2.040647093353983e-05,
"loss": 0.3656,
"step": 763
},
{
"epoch": 3.487589158345221,
"grad_norm": 0.11899054622753805,
"learning_rate": 2.02953471285254e-05,
"loss": 0.3636,
"step": 764
},
{
"epoch": 3.492154065620542,
"grad_norm": 0.12418349656227172,
"learning_rate": 2.018442376873126e-05,
"loss": 0.3717,
"step": 765
},
{
"epoch": 3.496718972895863,
"grad_norm": 0.11310966967990665,
"learning_rate": 2.007370198252324e-05,
"loss": 0.3611,
"step": 766
},
{
"epoch": 3.501283880171184,
"grad_norm": 0.12055741426438335,
"learning_rate": 1.9963182896216667e-05,
"loss": 0.3674,
"step": 767
},
{
"epoch": 3.5058487874465047,
"grad_norm": 0.12618096595715805,
"learning_rate": 1.985286763406494e-05,
"loss": 0.3725,
"step": 768
},
{
"epoch": 3.510413694721826,
"grad_norm": 0.11538183128314908,
"learning_rate": 1.974275731824804e-05,
"loss": 0.3703,
"step": 769
},
{
"epoch": 3.514978601997147,
"grad_norm": 0.12599278064501612,
"learning_rate": 1.9632853068861147e-05,
"loss": 0.3628,
"step": 770
},
{
"epoch": 3.519543509272468,
"grad_norm": 0.1139458312287587,
"learning_rate": 1.9523156003903215e-05,
"loss": 0.3664,
"step": 771
},
{
"epoch": 3.5241084165477887,
"grad_norm": 0.14077407102475697,
"learning_rate": 1.9413667239265615e-05,
"loss": 0.3652,
"step": 772
},
{
"epoch": 3.52867332382311,
"grad_norm": 0.13032016684712433,
"learning_rate": 1.9304387888720804e-05,
"loss": 0.3685,
"step": 773
},
{
"epoch": 3.533238231098431,
"grad_norm": 0.14031858267414743,
"learning_rate": 1.919531906391099e-05,
"loss": 0.3733,
"step": 774
},
{
"epoch": 3.537803138373752,
"grad_norm": 0.1486352640869334,
"learning_rate": 1.9086461874336777e-05,
"loss": 0.3713,
"step": 775
},
{
"epoch": 3.5423680456490727,
"grad_norm": 0.1357215608169487,
"learning_rate": 1.8977817427345946e-05,
"loss": 0.3617,
"step": 776
},
{
"epoch": 3.5469329529243936,
"grad_norm": 0.13543444529267776,
"learning_rate": 1.8869386828122125e-05,
"loss": 0.3723,
"step": 777
},
{
"epoch": 3.5514978601997145,
"grad_norm": 0.14518880548921387,
"learning_rate": 1.8761171179673604e-05,
"loss": 0.3646,
"step": 778
},
{
"epoch": 3.556062767475036,
"grad_norm": 0.1530615652429703,
"learning_rate": 1.8653171582822104e-05,
"loss": 0.3676,
"step": 779
},
{
"epoch": 3.5606276747503567,
"grad_norm": 0.14320914259229844,
"learning_rate": 1.854538913619151e-05,
"loss": 0.3708,
"step": 780
},
{
"epoch": 3.5651925820256776,
"grad_norm": 0.1371728530944999,
"learning_rate": 1.8437824936196823e-05,
"loss": 0.3749,
"step": 781
},
{
"epoch": 3.5697574893009985,
"grad_norm": 0.15231861361219254,
"learning_rate": 1.8330480077032858e-05,
"loss": 0.3672,
"step": 782
},
{
"epoch": 3.57432239657632,
"grad_norm": 0.11059078549485153,
"learning_rate": 1.822335565066325e-05,
"loss": 0.3677,
"step": 783
},
{
"epoch": 3.5788873038516407,
"grad_norm": 0.1576919764066523,
"learning_rate": 1.8116452746809275e-05,
"loss": 0.367,
"step": 784
},
{
"epoch": 3.5834522111269616,
"grad_norm": 0.12879507206191718,
"learning_rate": 1.800977245293875e-05,
"loss": 0.368,
"step": 785
},
{
"epoch": 3.5880171184022824,
"grad_norm": 0.12629882573611204,
"learning_rate": 1.7903315854254994e-05,
"loss": 0.3654,
"step": 786
},
{
"epoch": 3.5925820256776033,
"grad_norm": 0.13427520014143732,
"learning_rate": 1.779708403368582e-05,
"loss": 0.3713,
"step": 787
},
{
"epoch": 3.597146932952924,
"grad_norm": 0.1176497941246944,
"learning_rate": 1.7691078071872477e-05,
"loss": 0.3711,
"step": 788
},
{
"epoch": 3.601711840228245,
"grad_norm": 0.13036521061876197,
"learning_rate": 1.7585299047158688e-05,
"loss": 0.3703,
"step": 789
},
{
"epoch": 3.6062767475035664,
"grad_norm": 0.12201908677886805,
"learning_rate": 1.7479748035579625e-05,
"loss": 0.3664,
"step": 790
},
{
"epoch": 3.6108416547788873,
"grad_norm": 0.11159322212904035,
"learning_rate": 1.7374426110851e-05,
"loss": 0.3726,
"step": 791
},
{
"epoch": 3.615406562054208,
"grad_norm": 0.12179562963863867,
"learning_rate": 1.726933434435819e-05,
"loss": 0.3598,
"step": 792
},
{
"epoch": 3.6199714693295295,
"grad_norm": 0.12279846137825234,
"learning_rate": 1.716447380514526e-05,
"loss": 0.3643,
"step": 793
},
{
"epoch": 3.6245363766048504,
"grad_norm": 0.12287836059833575,
"learning_rate": 1.7059845559904115e-05,
"loss": 0.3702,
"step": 794
},
{
"epoch": 3.6291012838801713,
"grad_norm": 0.11039673465914712,
"learning_rate": 1.695545067296368e-05,
"loss": 0.3675,
"step": 795
},
{
"epoch": 3.633666191155492,
"grad_norm": 0.12022240785762481,
"learning_rate": 1.6851290206279e-05,
"loss": 0.3695,
"step": 796
},
{
"epoch": 3.638231098430813,
"grad_norm": 0.11435555984531806,
"learning_rate": 1.674736521942053e-05,
"loss": 0.365,
"step": 797
},
{
"epoch": 3.642796005706134,
"grad_norm": 0.131712471568275,
"learning_rate": 1.664367676956331e-05,
"loss": 0.3713,
"step": 798
},
{
"epoch": 3.647360912981455,
"grad_norm": 0.11477057659282204,
"learning_rate": 1.6540225911476172e-05,
"loss": 0.3648,
"step": 799
},
{
"epoch": 3.651925820256776,
"grad_norm": 0.11030623092197417,
"learning_rate": 1.643701369751109e-05,
"loss": 0.3646,
"step": 800
},
{
"epoch": 3.656490727532097,
"grad_norm": 0.13051313133439071,
"learning_rate": 1.6334041177592403e-05,
"loss": 0.3711,
"step": 801
},
{
"epoch": 3.661055634807418,
"grad_norm": 0.12099586891245266,
"learning_rate": 1.623130939920619e-05,
"loss": 0.3665,
"step": 802
},
{
"epoch": 3.665620542082739,
"grad_norm": 0.12353194281368282,
"learning_rate": 1.6128819407389606e-05,
"loss": 0.3698,
"step": 803
},
{
"epoch": 3.67018544935806,
"grad_norm": 0.12746501918260686,
"learning_rate": 1.602657224472018e-05,
"loss": 0.3651,
"step": 804
},
{
"epoch": 3.674750356633381,
"grad_norm": 0.10793911581481992,
"learning_rate": 1.5924568951305328e-05,
"loss": 0.3651,
"step": 805
},
{
"epoch": 3.679315263908702,
"grad_norm": 0.13767950934095166,
"learning_rate": 1.5822810564771663e-05,
"loss": 0.3681,
"step": 806
},
{
"epoch": 3.683880171184023,
"grad_norm": 0.11149734780728035,
"learning_rate": 1.5721298120254514e-05,
"loss": 0.372,
"step": 807
},
{
"epoch": 3.6884450784593437,
"grad_norm": 0.1187146085257134,
"learning_rate": 1.562003265038738e-05,
"loss": 0.3698,
"step": 808
},
{
"epoch": 3.6930099857346645,
"grad_norm": 0.12127988390555491,
"learning_rate": 1.551901518529138e-05,
"loss": 0.3692,
"step": 809
},
{
"epoch": 3.697574893009986,
"grad_norm": 0.11702704275441794,
"learning_rate": 1.541824675256482e-05,
"loss": 0.3661,
"step": 810
},
{
"epoch": 3.7021398002853068,
"grad_norm": 0.11306685620844899,
"learning_rate": 1.531772837727274e-05,
"loss": 0.3695,
"step": 811
},
{
"epoch": 3.7067047075606276,
"grad_norm": 0.11312484841891161,
"learning_rate": 1.5217461081936478e-05,
"loss": 0.3671,
"step": 812
},
{
"epoch": 3.7112696148359485,
"grad_norm": 0.11428174495717341,
"learning_rate": 1.5117445886523272e-05,
"loss": 0.3674,
"step": 813
},
{
"epoch": 3.71583452211127,
"grad_norm": 0.10987603615873456,
"learning_rate": 1.501768380843585e-05,
"loss": 0.3673,
"step": 814
},
{
"epoch": 3.7203994293865907,
"grad_norm": 0.11178040039287562,
"learning_rate": 1.4918175862502104e-05,
"loss": 0.3719,
"step": 815
},
{
"epoch": 3.7249643366619116,
"grad_norm": 0.10859792201293215,
"learning_rate": 1.4818923060964814e-05,
"loss": 0.3615,
"step": 816
},
{
"epoch": 3.7295292439372325,
"grad_norm": 0.13172517052119273,
"learning_rate": 1.471992641347129e-05,
"loss": 0.3694,
"step": 817
},
{
"epoch": 3.7340941512125534,
"grad_norm": 0.12757152194459273,
"learning_rate": 1.4621186927063095e-05,
"loss": 0.3649,
"step": 818
},
{
"epoch": 3.7386590584878743,
"grad_norm": 0.1154895484553408,
"learning_rate": 1.4522705606165865e-05,
"loss": 0.3694,
"step": 819
},
{
"epoch": 3.743223965763195,
"grad_norm": 0.11273080460771058,
"learning_rate": 1.4424483452579012e-05,
"loss": 0.3625,
"step": 820
},
{
"epoch": 3.7477888730385165,
"grad_norm": 0.13138142426919347,
"learning_rate": 1.4326521465465604e-05,
"loss": 0.366,
"step": 821
},
{
"epoch": 3.7523537803138374,
"grad_norm": 0.11883743670221694,
"learning_rate": 1.4228820641342172e-05,
"loss": 0.3682,
"step": 822
},
{
"epoch": 3.7569186875891583,
"grad_norm": 0.10968178906111543,
"learning_rate": 1.4131381974068533e-05,
"loss": 0.3655,
"step": 823
},
{
"epoch": 3.7614835948644796,
"grad_norm": 0.1326313322060334,
"learning_rate": 1.4034206454837768e-05,
"loss": 0.3715,
"step": 824
},
{
"epoch": 3.7660485021398005,
"grad_norm": 0.12319894462182598,
"learning_rate": 1.3937295072166061e-05,
"loss": 0.3739,
"step": 825
},
{
"epoch": 3.7706134094151214,
"grad_norm": 0.11752644369102541,
"learning_rate": 1.3840648811882646e-05,
"loss": 0.3641,
"step": 826
},
{
"epoch": 3.7751783166904422,
"grad_norm": 0.10779720073463613,
"learning_rate": 1.3744268657119886e-05,
"loss": 0.3672,
"step": 827
},
{
"epoch": 3.779743223965763,
"grad_norm": 0.12396985602516226,
"learning_rate": 1.3648155588303097e-05,
"loss": 0.3682,
"step": 828
},
{
"epoch": 3.784308131241084,
"grad_norm": 0.13035541174831453,
"learning_rate": 1.3552310583140744e-05,
"loss": 0.3695,
"step": 829
},
{
"epoch": 3.788873038516405,
"grad_norm": 0.11977978636195129,
"learning_rate": 1.3456734616614369e-05,
"loss": 0.3693,
"step": 830
},
{
"epoch": 3.793437945791726,
"grad_norm": 0.10612949145458676,
"learning_rate": 1.3361428660968713e-05,
"loss": 0.3654,
"step": 831
},
{
"epoch": 3.798002853067047,
"grad_norm": 0.12185371327777074,
"learning_rate": 1.3266393685701919e-05,
"loss": 0.3644,
"step": 832
},
{
"epoch": 3.802567760342368,
"grad_norm": 0.10533358955019909,
"learning_rate": 1.3171630657555499e-05,
"loss": 0.3688,
"step": 833
},
{
"epoch": 3.807132667617689,
"grad_norm": 0.1124577538423527,
"learning_rate": 1.3077140540504614e-05,
"loss": 0.3638,
"step": 834
},
{
"epoch": 3.81169757489301,
"grad_norm": 0.11642170801308592,
"learning_rate": 1.2982924295748274e-05,
"loss": 0.3684,
"step": 835
},
{
"epoch": 3.816262482168331,
"grad_norm": 0.10695378952599915,
"learning_rate": 1.2888982881699472e-05,
"loss": 0.3691,
"step": 836
},
{
"epoch": 3.820827389443652,
"grad_norm": 0.13400187716659637,
"learning_rate": 1.2795317253975537e-05,
"loss": 0.3691,
"step": 837
},
{
"epoch": 3.825392296718973,
"grad_norm": 0.09951593298438159,
"learning_rate": 1.270192836538836e-05,
"loss": 0.3697,
"step": 838
},
{
"epoch": 3.8299572039942937,
"grad_norm": 0.10935426565797807,
"learning_rate": 1.2608817165934681e-05,
"loss": 0.3674,
"step": 839
},
{
"epoch": 3.8345221112696146,
"grad_norm": 0.11667620915560488,
"learning_rate": 1.2515984602786487e-05,
"loss": 0.3662,
"step": 840
},
{
"epoch": 3.839087018544936,
"grad_norm": 0.10968676357221958,
"learning_rate": 1.2423431620281306e-05,
"loss": 0.3618,
"step": 841
},
{
"epoch": 3.843651925820257,
"grad_norm": 0.12006579571295849,
"learning_rate": 1.2331159159912667e-05,
"loss": 0.3703,
"step": 842
},
{
"epoch": 3.8482168330955777,
"grad_norm": 0.11231326039006076,
"learning_rate": 1.22391681603205e-05,
"loss": 0.3674,
"step": 843
},
{
"epoch": 3.8527817403708986,
"grad_norm": 0.11239420460642152,
"learning_rate": 1.2147459557281543e-05,
"loss": 0.3731,
"step": 844
},
{
"epoch": 3.85734664764622,
"grad_norm": 0.10105192704809327,
"learning_rate": 1.2056034283699866e-05,
"loss": 0.3634,
"step": 845
},
{
"epoch": 3.861911554921541,
"grad_norm": 0.1324600771108394,
"learning_rate": 1.1964893269597408e-05,
"loss": 0.3698,
"step": 846
},
{
"epoch": 3.8664764621968617,
"grad_norm": 0.10492225515470834,
"learning_rate": 1.1874037442104464e-05,
"loss": 0.3656,
"step": 847
},
{
"epoch": 3.8710413694721826,
"grad_norm": 0.1049947478937808,
"learning_rate": 1.1783467725450288e-05,
"loss": 0.3673,
"step": 848
},
{
"epoch": 3.8756062767475035,
"grad_norm": 0.12343105623506188,
"learning_rate": 1.1693185040953647e-05,
"loss": 0.3692,
"step": 849
},
{
"epoch": 3.8801711840228243,
"grad_norm": 0.10907582190242261,
"learning_rate": 1.1603190307013485e-05,
"loss": 0.3689,
"step": 850
},
{
"epoch": 3.8847360912981452,
"grad_norm": 0.1048746000015905,
"learning_rate": 1.1513484439099592e-05,
"loss": 0.3688,
"step": 851
},
{
"epoch": 3.8893009985734666,
"grad_norm": 0.110532809391589,
"learning_rate": 1.1424068349743282e-05,
"loss": 0.3707,
"step": 852
},
{
"epoch": 3.8938659058487874,
"grad_norm": 0.1082453458710121,
"learning_rate": 1.133494294852806e-05,
"loss": 0.3684,
"step": 853
},
{
"epoch": 3.8984308131241083,
"grad_norm": 0.10345660407308066,
"learning_rate": 1.1246109142080463e-05,
"loss": 0.3678,
"step": 854
},
{
"epoch": 3.9029957203994297,
"grad_norm": 0.10898385130369857,
"learning_rate": 1.1157567834060732e-05,
"loss": 0.3703,
"step": 855
},
{
"epoch": 3.9075606276747505,
"grad_norm": 0.10868802232496799,
"learning_rate": 1.1069319925153716e-05,
"loss": 0.3672,
"step": 856
},
{
"epoch": 3.9121255349500714,
"grad_norm": 0.1001172075462793,
"learning_rate": 1.098136631305966e-05,
"loss": 0.3647,
"step": 857
},
{
"epoch": 3.9166904422253923,
"grad_norm": 0.10002086132257204,
"learning_rate": 1.0893707892485046e-05,
"loss": 0.3687,
"step": 858
},
{
"epoch": 3.921255349500713,
"grad_norm": 0.11817525489059527,
"learning_rate": 1.080634555513358e-05,
"loss": 0.3681,
"step": 859
},
{
"epoch": 3.925820256776034,
"grad_norm": 0.10005386509374838,
"learning_rate": 1.0719280189697012e-05,
"loss": 0.371,
"step": 860
},
{
"epoch": 3.930385164051355,
"grad_norm": 0.09735019950539485,
"learning_rate": 1.0632512681846188e-05,
"loss": 0.3647,
"step": 861
},
{
"epoch": 3.9349500713266763,
"grad_norm": 0.1096708423704249,
"learning_rate": 1.0546043914222004e-05,
"loss": 0.3689,
"step": 862
},
{
"epoch": 3.939514978601997,
"grad_norm": 0.10947940483538458,
"learning_rate": 1.045987476642639e-05,
"loss": 0.3653,
"step": 863
},
{
"epoch": 3.944079885877318,
"grad_norm": 0.10334503635714308,
"learning_rate": 1.0374006115013446e-05,
"loss": 0.3705,
"step": 864
},
{
"epoch": 3.948644793152639,
"grad_norm": 0.09613211541479365,
"learning_rate": 1.0288438833480434e-05,
"loss": 0.3653,
"step": 865
},
{
"epoch": 3.9532097004279603,
"grad_norm": 0.11391043223040556,
"learning_rate": 1.0203173792258964e-05,
"loss": 0.3709,
"step": 866
},
{
"epoch": 3.957774607703281,
"grad_norm": 0.1112000931274841,
"learning_rate": 1.0118211858706126e-05,
"loss": 0.3695,
"step": 867
},
{
"epoch": 3.962339514978602,
"grad_norm": 0.09623531613971865,
"learning_rate": 1.0033553897095611e-05,
"loss": 0.3673,
"step": 868
},
{
"epoch": 3.966904422253923,
"grad_norm": 0.12054451116095541,
"learning_rate": 9.949200768608978e-06,
"loss": 0.3662,
"step": 869
},
{
"epoch": 3.971469329529244,
"grad_norm": 0.10469643561676546,
"learning_rate": 9.865153331326888e-06,
"loss": 0.3722,
"step": 870
},
{
"epoch": 3.9760342368045647,
"grad_norm": 0.10369572994121254,
"learning_rate": 9.781412440220364e-06,
"loss": 0.3658,
"step": 871
},
{
"epoch": 3.980599144079886,
"grad_norm": 0.11265124183687726,
"learning_rate": 9.697978947142083e-06,
"loss": 0.366,
"step": 872
},
{
"epoch": 3.985164051355207,
"grad_norm": 0.09909112429172705,
"learning_rate": 9.61485370081773e-06,
"loss": 0.3655,
"step": 873
},
{
"epoch": 3.989728958630528,
"grad_norm": 0.10458100196949884,
"learning_rate": 9.532037546837328e-06,
"loss": 0.3669,
"step": 874
},
{
"epoch": 3.9942938659058487,
"grad_norm": 0.10060786891352515,
"learning_rate": 9.4495313276467e-06,
"loss": 0.3684,
"step": 875
},
{
"epoch": 3.99885877318117,
"grad_norm": 0.11243962749211547,
"learning_rate": 9.367335882538859e-06,
"loss": 0.3651,
"step": 876
},
{
"epoch": 4.003423680456491,
"grad_norm": 0.37014150627969,
"learning_rate": 9.285452047645447e-06,
"loss": 0.6984,
"step": 877
},
{
"epoch": 4.007988587731812,
"grad_norm": 0.15671543024887574,
"learning_rate": 9.20388065592829e-06,
"loss": 0.3516,
"step": 878
},
{
"epoch": 4.012553495007133,
"grad_norm": 0.16714819900057928,
"learning_rate": 9.122622537170858e-06,
"loss": 0.3535,
"step": 879
},
{
"epoch": 4.0171184022824535,
"grad_norm": 0.15835660325889345,
"learning_rate": 9.041678517969878e-06,
"loss": 0.3516,
"step": 880
},
{
"epoch": 4.021683309557774,
"grad_norm": 0.1420672767585784,
"learning_rate": 8.961049421726927e-06,
"loss": 0.3484,
"step": 881
},
{
"epoch": 4.026248216833095,
"grad_norm": 0.13783878715714176,
"learning_rate": 8.880736068639972e-06,
"loss": 0.3476,
"step": 882
},
{
"epoch": 4.030813124108416,
"grad_norm": 0.13731172171411218,
"learning_rate": 8.800739275695162e-06,
"loss": 0.3559,
"step": 883
},
{
"epoch": 4.035378031383738,
"grad_norm": 0.14178993491605563,
"learning_rate": 8.721059856658374e-06,
"loss": 0.3505,
"step": 884
},
{
"epoch": 4.039942938659059,
"grad_norm": 0.12951830366700642,
"learning_rate": 8.641698622067056e-06,
"loss": 0.3483,
"step": 885
},
{
"epoch": 4.04450784593438,
"grad_norm": 0.13767174076249047,
"learning_rate": 8.56265637922192e-06,
"loss": 0.3516,
"step": 886
},
{
"epoch": 4.049072753209701,
"grad_norm": 0.13591547948547417,
"learning_rate": 8.483933932178714e-06,
"loss": 0.3569,
"step": 887
},
{
"epoch": 4.0536376604850215,
"grad_norm": 0.12191863460297671,
"learning_rate": 8.405532081740104e-06,
"loss": 0.3479,
"step": 888
},
{
"epoch": 4.058202567760342,
"grad_norm": 0.1248887971732098,
"learning_rate": 8.327451625447462e-06,
"loss": 0.3494,
"step": 889
},
{
"epoch": 4.062767475035663,
"grad_norm": 0.13804862882003327,
"learning_rate": 8.24969335757281e-06,
"loss": 0.3508,
"step": 890
},
{
"epoch": 4.067332382310984,
"grad_norm": 0.11333955902964997,
"learning_rate": 8.17225806911071e-06,
"loss": 0.3515,
"step": 891
},
{
"epoch": 4.071897289586305,
"grad_norm": 0.11550369078052372,
"learning_rate": 8.095146547770202e-06,
"loss": 0.3523,
"step": 892
},
{
"epoch": 4.076462196861626,
"grad_norm": 0.12066342905233718,
"learning_rate": 8.018359577966822e-06,
"loss": 0.351,
"step": 893
},
{
"epoch": 4.081027104136947,
"grad_norm": 0.12020962194954402,
"learning_rate": 7.941897940814613e-06,
"loss": 0.3511,
"step": 894
},
{
"epoch": 4.085592011412269,
"grad_norm": 0.10735766251661075,
"learning_rate": 7.865762414118197e-06,
"loss": 0.3515,
"step": 895
},
{
"epoch": 4.0901569186875895,
"grad_norm": 0.10152101132548039,
"learning_rate": 7.7899537723648e-06,
"loss": 0.3494,
"step": 896
},
{
"epoch": 4.09472182596291,
"grad_norm": 0.11175247171368977,
"learning_rate": 7.71447278671646e-06,
"loss": 0.3434,
"step": 897
},
{
"epoch": 4.099286733238231,
"grad_norm": 0.11008323120104475,
"learning_rate": 7.639320225002106e-06,
"loss": 0.3562,
"step": 898
},
{
"epoch": 4.103851640513552,
"grad_norm": 0.10079393628102,
"learning_rate": 7.564496851709799e-06,
"loss": 0.3462,
"step": 899
},
{
"epoch": 4.108416547788873,
"grad_norm": 0.10224032651804278,
"learning_rate": 7.490003427978947e-06,
"loss": 0.352,
"step": 900
},
{
"epoch": 4.112981455064194,
"grad_norm": 0.10528872332896734,
"learning_rate": 7.415840711592515e-06,
"loss": 0.3583,
"step": 901
},
{
"epoch": 4.117546362339515,
"grad_norm": 0.09810153639928686,
"learning_rate": 7.342009456969394e-06,
"loss": 0.3507,
"step": 902
},
{
"epoch": 4.122111269614836,
"grad_norm": 0.09377946822299223,
"learning_rate": 7.26851041515666e-06,
"loss": 0.3478,
"step": 903
},
{
"epoch": 4.1266761768901565,
"grad_norm": 0.09941074294315165,
"learning_rate": 7.1953443338219635e-06,
"loss": 0.3474,
"step": 904
},
{
"epoch": 4.131241084165478,
"grad_norm": 0.09923689941405467,
"learning_rate": 7.12251195724595e-06,
"loss": 0.3497,
"step": 905
},
{
"epoch": 4.135805991440799,
"grad_norm": 0.09632085335191928,
"learning_rate": 7.0500140263146085e-06,
"loss": 0.3502,
"step": 906
},
{
"epoch": 4.14037089871612,
"grad_norm": 0.09318553780432773,
"learning_rate": 6.977851278511831e-06,
"loss": 0.3519,
"step": 907
},
{
"epoch": 4.144935805991441,
"grad_norm": 0.1055662018772662,
"learning_rate": 6.9060244479118325e-06,
"loss": 0.3447,
"step": 908
},
{
"epoch": 4.149500713266762,
"grad_norm": 0.09794765647352846,
"learning_rate": 6.8345342651717415e-06,
"loss": 0.3521,
"step": 909
},
{
"epoch": 4.154065620542083,
"grad_norm": 0.10262064175384165,
"learning_rate": 6.763381457524137e-06,
"loss": 0.3467,
"step": 910
},
{
"epoch": 4.158630527817404,
"grad_norm": 0.0924225272063823,
"learning_rate": 6.692566748769645e-06,
"loss": 0.348,
"step": 911
},
{
"epoch": 4.1631954350927245,
"grad_norm": 0.09396615551983964,
"learning_rate": 6.622090859269579e-06,
"loss": 0.3485,
"step": 912
},
{
"epoch": 4.167760342368045,
"grad_norm": 0.1085909553244327,
"learning_rate": 6.5519545059386495e-06,
"loss": 0.3503,
"step": 913
},
{
"epoch": 4.172325249643366,
"grad_norm": 0.10045646940751854,
"learning_rate": 6.482158402237622e-06,
"loss": 0.3515,
"step": 914
},
{
"epoch": 4.176890156918688,
"grad_norm": 0.10030605637198417,
"learning_rate": 6.412703258166089e-06,
"loss": 0.3513,
"step": 915
},
{
"epoch": 4.181455064194009,
"grad_norm": 0.08816024777952751,
"learning_rate": 6.343589780255226e-06,
"loss": 0.3463,
"step": 916
},
{
"epoch": 4.18601997146933,
"grad_norm": 0.10373572668229089,
"learning_rate": 6.274818671560612e-06,
"loss": 0.3521,
"step": 917
},
{
"epoch": 4.190584878744651,
"grad_norm": 0.09661236890820955,
"learning_rate": 6.2063906316550944e-06,
"loss": 0.3535,
"step": 918
},
{
"epoch": 4.195149786019972,
"grad_norm": 0.09307921133377361,
"learning_rate": 6.138306356621666e-06,
"loss": 0.352,
"step": 919
},
{
"epoch": 4.1997146932952925,
"grad_norm": 0.09643011644388448,
"learning_rate": 6.0705665390463545e-06,
"loss": 0.3495,
"step": 920
},
{
"epoch": 4.204279600570613,
"grad_norm": 0.09380570498729787,
"learning_rate": 6.003171868011226e-06,
"loss": 0.351,
"step": 921
},
{
"epoch": 4.208844507845934,
"grad_norm": 0.09110098326246356,
"learning_rate": 5.9361230290873175e-06,
"loss": 0.3501,
"step": 922
},
{
"epoch": 4.213409415121255,
"grad_norm": 0.10109563700718154,
"learning_rate": 5.869420704327722e-06,
"loss": 0.3523,
"step": 923
},
{
"epoch": 4.217974322396576,
"grad_norm": 0.08878313714056194,
"learning_rate": 5.803065572260633e-06,
"loss": 0.3482,
"step": 924
},
{
"epoch": 4.222539229671897,
"grad_norm": 0.08818694187121523,
"learning_rate": 5.737058307882391e-06,
"loss": 0.3528,
"step": 925
},
{
"epoch": 4.227104136947219,
"grad_norm": 0.09295440449783292,
"learning_rate": 5.671399582650705e-06,
"loss": 0.3461,
"step": 926
},
{
"epoch": 4.2316690442225395,
"grad_norm": 0.09685863269312232,
"learning_rate": 5.606090064477738e-06,
"loss": 0.3503,
"step": 927
},
{
"epoch": 4.23623395149786,
"grad_norm": 0.09009726502122058,
"learning_rate": 5.541130417723359e-06,
"loss": 0.3439,
"step": 928
},
{
"epoch": 4.240798858773181,
"grad_norm": 0.08660914537757003,
"learning_rate": 5.476521303188414e-06,
"loss": 0.353,
"step": 929
},
{
"epoch": 4.245363766048502,
"grad_norm": 0.09086935202462741,
"learning_rate": 5.4122633781079135e-06,
"loss": 0.3523,
"step": 930
},
{
"epoch": 4.249928673323823,
"grad_norm": 0.09963483067768689,
"learning_rate": 5.348357296144437e-06,
"loss": 0.3528,
"step": 931
},
{
"epoch": 4.254493580599144,
"grad_norm": 0.08898518315403384,
"learning_rate": 5.2848037073814255e-06,
"loss": 0.3492,
"step": 932
},
{
"epoch": 4.259058487874465,
"grad_norm": 0.09983473387703683,
"learning_rate": 5.221603258316577e-06,
"loss": 0.3537,
"step": 933
},
{
"epoch": 4.263623395149786,
"grad_norm": 0.09231469544481466,
"learning_rate": 5.158756591855336e-06,
"loss": 0.3505,
"step": 934
},
{
"epoch": 4.268188302425107,
"grad_norm": 0.08932201492732685,
"learning_rate": 5.0962643473042536e-06,
"loss": 0.3506,
"step": 935
},
{
"epoch": 4.2727532097004275,
"grad_norm": 0.09165674817192716,
"learning_rate": 5.034127160364528e-06,
"loss": 0.3542,
"step": 936
},
{
"epoch": 4.277318116975749,
"grad_norm": 0.09295144892311318,
"learning_rate": 4.972345663125575e-06,
"loss": 0.3458,
"step": 937
},
{
"epoch": 4.28188302425107,
"grad_norm": 0.09773667025776654,
"learning_rate": 4.910920484058519e-06,
"loss": 0.3489,
"step": 938
},
{
"epoch": 4.286447931526391,
"grad_norm": 0.09478260876178425,
"learning_rate": 4.849852248009899e-06,
"loss": 0.3518,
"step": 939
},
{
"epoch": 4.291012838801712,
"grad_norm": 0.08633818761277473,
"learning_rate": 4.789141576195207e-06,
"loss": 0.3481,
"step": 940
},
{
"epoch": 4.295577746077033,
"grad_norm": 0.09290651843039628,
"learning_rate": 4.72878908619264e-06,
"loss": 0.3459,
"step": 941
},
{
"epoch": 4.300142653352354,
"grad_norm": 0.0901405739554656,
"learning_rate": 4.668795391936805e-06,
"loss": 0.3438,
"step": 942
},
{
"epoch": 4.304707560627675,
"grad_norm": 0.09095933811292778,
"learning_rate": 4.609161103712447e-06,
"loss": 0.35,
"step": 943
},
{
"epoch": 4.3092724679029955,
"grad_norm": 0.08514226082856571,
"learning_rate": 4.54988682814828e-06,
"loss": 0.3514,
"step": 944
},
{
"epoch": 4.313837375178316,
"grad_norm": 0.09103184637713459,
"learning_rate": 4.490973168210788e-06,
"loss": 0.351,
"step": 945
},
{
"epoch": 4.318402282453638,
"grad_norm": 0.08771070966673426,
"learning_rate": 4.43242072319809e-06,
"loss": 0.3514,
"step": 946
},
{
"epoch": 4.322967189728959,
"grad_norm": 0.08785642021314082,
"learning_rate": 4.374230088733855e-06,
"loss": 0.349,
"step": 947
},
{
"epoch": 4.32753209700428,
"grad_norm": 0.09441411098621398,
"learning_rate": 4.3164018567612495e-06,
"loss": 0.3505,
"step": 948
},
{
"epoch": 4.332097004279601,
"grad_norm": 0.08194712702951042,
"learning_rate": 4.2589366155369125e-06,
"loss": 0.3487,
"step": 949
},
{
"epoch": 4.336661911554922,
"grad_norm": 0.08937141345166866,
"learning_rate": 4.201834949624957e-06,
"loss": 0.3523,
"step": 950
},
{
"epoch": 4.3412268188302425,
"grad_norm": 0.08875313880651403,
"learning_rate": 4.145097439891026e-06,
"loss": 0.3529,
"step": 951
},
{
"epoch": 4.345791726105563,
"grad_norm": 0.08216164995036938,
"learning_rate": 4.088724663496391e-06,
"loss": 0.3487,
"step": 952
},
{
"epoch": 4.350356633380884,
"grad_norm": 0.0844163738277272,
"learning_rate": 4.032717193892097e-06,
"loss": 0.3536,
"step": 953
},
{
"epoch": 4.354921540656205,
"grad_norm": 0.07958475981719833,
"learning_rate": 3.977075600813112e-06,
"loss": 0.3524,
"step": 954
},
{
"epoch": 4.359486447931526,
"grad_norm": 0.08312472608098892,
"learning_rate": 3.921800450272497e-06,
"loss": 0.3564,
"step": 955
},
{
"epoch": 4.364051355206847,
"grad_norm": 0.09090031843488945,
"learning_rate": 3.866892304555729e-06,
"loss": 0.3503,
"step": 956
},
{
"epoch": 4.368616262482169,
"grad_norm": 0.08646905004538202,
"learning_rate": 3.8123517222149064e-06,
"loss": 0.3539,
"step": 957
},
{
"epoch": 4.37318116975749,
"grad_norm": 0.08630929584464324,
"learning_rate": 3.7581792580630995e-06,
"loss": 0.3508,
"step": 958
},
{
"epoch": 4.3777460770328105,
"grad_norm": 0.08420773366472539,
"learning_rate": 3.7043754631687168e-06,
"loss": 0.3506,
"step": 959
},
{
"epoch": 4.382310984308131,
"grad_norm": 0.08626254558605037,
"learning_rate": 3.650940884849865e-06,
"loss": 0.3496,
"step": 960
},
{
"epoch": 4.386875891583452,
"grad_norm": 0.08202547191435747,
"learning_rate": 3.5978760666688283e-06,
"loss": 0.3463,
"step": 961
},
{
"epoch": 4.391440798858773,
"grad_norm": 0.08400830894322005,
"learning_rate": 3.545181548426482e-06,
"loss": 0.3534,
"step": 962
},
{
"epoch": 4.396005706134094,
"grad_norm": 0.08174812448321671,
"learning_rate": 3.4928578661568513e-06,
"loss": 0.3505,
"step": 963
},
{
"epoch": 4.400570613409415,
"grad_norm": 0.08855138380230874,
"learning_rate": 3.4409055521216472e-06,
"loss": 0.3484,
"step": 964
},
{
"epoch": 4.405135520684736,
"grad_norm": 0.08806266323171609,
"learning_rate": 3.3893251348048107e-06,
"loss": 0.3517,
"step": 965
},
{
"epoch": 4.409700427960057,
"grad_norm": 0.08598387207772194,
"learning_rate": 3.3381171389072155e-06,
"loss": 0.3464,
"step": 966
},
{
"epoch": 4.414265335235378,
"grad_norm": 0.08257564200380783,
"learning_rate": 3.287282085341237e-06,
"loss": 0.3434,
"step": 967
},
{
"epoch": 4.418830242510699,
"grad_norm": 0.08050286636135041,
"learning_rate": 3.236820491225543e-06,
"loss": 0.3507,
"step": 968
},
{
"epoch": 4.42339514978602,
"grad_norm": 0.0804766651869741,
"learning_rate": 3.1867328698797784e-06,
"loss": 0.3559,
"step": 969
},
{
"epoch": 4.427960057061341,
"grad_norm": 0.08321920927722377,
"learning_rate": 3.1370197308193464e-06,
"loss": 0.3495,
"step": 970
},
{
"epoch": 4.432524964336662,
"grad_norm": 0.08340309487334537,
"learning_rate": 3.08768157975023e-06,
"loss": 0.3485,
"step": 971
},
{
"epoch": 4.437089871611983,
"grad_norm": 0.08628533559205914,
"learning_rate": 3.0387189185638877e-06,
"loss": 0.3465,
"step": 972
},
{
"epoch": 4.441654778887304,
"grad_norm": 0.0839367124944577,
"learning_rate": 2.99013224533208e-06,
"loss": 0.3514,
"step": 973
},
{
"epoch": 4.446219686162625,
"grad_norm": 0.08431513019450357,
"learning_rate": 2.9419220543018647e-06,
"loss": 0.35,
"step": 974
},
{
"epoch": 4.4507845934379455,
"grad_norm": 0.08138210314265082,
"learning_rate": 2.894088835890512e-06,
"loss": 0.3503,
"step": 975
},
{
"epoch": 4.455349500713266,
"grad_norm": 0.08209216743333449,
"learning_rate": 2.846633076680565e-06,
"loss": 0.3501,
"step": 976
},
{
"epoch": 4.459914407988588,
"grad_norm": 0.07980157930120314,
"learning_rate": 2.7995552594148613e-06,
"loss": 0.3477,
"step": 977
},
{
"epoch": 4.464479315263909,
"grad_norm": 0.08454274847735264,
"learning_rate": 2.7528558629916457e-06,
"loss": 0.3508,
"step": 978
},
{
"epoch": 4.46904422253923,
"grad_norm": 0.07827269612143122,
"learning_rate": 2.706535362459657e-06,
"loss": 0.3541,
"step": 979
},
{
"epoch": 4.473609129814551,
"grad_norm": 0.08066212537936188,
"learning_rate": 2.6605942290133515e-06,
"loss": 0.3468,
"step": 980
},
{
"epoch": 4.478174037089872,
"grad_norm": 0.08038509522306664,
"learning_rate": 2.615032929988055e-06,
"loss": 0.3493,
"step": 981
},
{
"epoch": 4.482738944365193,
"grad_norm": 0.08121370144409773,
"learning_rate": 2.569851928855256e-06,
"loss": 0.3486,
"step": 982
},
{
"epoch": 4.4873038516405135,
"grad_norm": 0.07856168793082442,
"learning_rate": 2.525051685217865e-06,
"loss": 0.3507,
"step": 983
},
{
"epoch": 4.491868758915834,
"grad_norm": 0.07958873197319534,
"learning_rate": 2.4806326548055238e-06,
"loss": 0.3493,
"step": 984
},
{
"epoch": 4.496433666191155,
"grad_norm": 0.08171369971311804,
"learning_rate": 2.436595289470023e-06,
"loss": 0.3508,
"step": 985
},
{
"epoch": 4.500998573466476,
"grad_norm": 0.08120306757647351,
"learning_rate": 2.3929400371806377e-06,
"loss": 0.3521,
"step": 986
},
{
"epoch": 4.505563480741797,
"grad_norm": 0.07637699134693174,
"learning_rate": 2.3496673420196326e-06,
"loss": 0.3466,
"step": 987
},
{
"epoch": 4.510128388017119,
"grad_norm": 0.08223264882483422,
"learning_rate": 2.306777644177709e-06,
"loss": 0.3507,
"step": 988
},
{
"epoch": 4.51469329529244,
"grad_norm": 0.07915711031881265,
"learning_rate": 2.2642713799495207e-06,
"loss": 0.3554,
"step": 989
},
{
"epoch": 4.519258202567761,
"grad_norm": 0.08144647282074215,
"learning_rate": 2.222148981729273e-06,
"loss": 0.3529,
"step": 990
},
{
"epoch": 4.5238231098430814,
"grad_norm": 0.08053252018883734,
"learning_rate": 2.1804108780062805e-06,
"loss": 0.3464,
"step": 991
},
{
"epoch": 4.528388017118402,
"grad_norm": 0.08178510466321848,
"learning_rate": 2.139057493360643e-06,
"loss": 0.3477,
"step": 992
},
{
"epoch": 4.532952924393723,
"grad_norm": 0.08178556560050178,
"learning_rate": 2.098089248458912e-06,
"loss": 0.3485,
"step": 993
},
{
"epoch": 4.537517831669044,
"grad_norm": 0.08220979435959991,
"learning_rate": 2.0575065600498067e-06,
"loss": 0.3517,
"step": 994
},
{
"epoch": 4.542082738944365,
"grad_norm": 0.07762389992501054,
"learning_rate": 2.0173098409599757e-06,
"loss": 0.3504,
"step": 995
},
{
"epoch": 4.546647646219686,
"grad_norm": 0.07845697347936922,
"learning_rate": 1.977499500089808e-06,
"loss": 0.3473,
"step": 996
},
{
"epoch": 4.551212553495007,
"grad_norm": 0.08243028759784551,
"learning_rate": 1.9380759424092722e-06,
"loss": 0.3488,
"step": 997
},
{
"epoch": 4.555777460770328,
"grad_norm": 0.08482588178810972,
"learning_rate": 1.899039568953782e-06,
"loss": 0.3485,
"step": 998
},
{
"epoch": 4.560342368045649,
"grad_norm": 0.08930393433101652,
"learning_rate": 1.8603907768201335e-06,
"loss": 0.3477,
"step": 999
},
{
"epoch": 4.56490727532097,
"grad_norm": 0.07646082270023902,
"learning_rate": 1.8221299591624531e-06,
"loss": 0.3541,
"step": 1000
},
{
"epoch": 4.569472182596291,
"grad_norm": 0.08228826529784623,
"learning_rate": 1.7842575051882117e-06,
"loss": 0.3455,
"step": 1001
},
{
"epoch": 4.574037089871612,
"grad_norm": 0.0826320638970819,
"learning_rate": 1.7467738001542534e-06,
"loss": 0.3541,
"step": 1002
},
{
"epoch": 4.578601997146933,
"grad_norm": 0.07749734600577586,
"learning_rate": 1.7096792253628747e-06,
"loss": 0.3508,
"step": 1003
},
{
"epoch": 4.583166904422254,
"grad_norm": 0.07849844115759268,
"learning_rate": 1.6729741581579695e-06,
"loss": 0.3483,
"step": 1004
},
{
"epoch": 4.587731811697575,
"grad_norm": 0.08061411035349854,
"learning_rate": 1.6366589719211478e-06,
"loss": 0.3459,
"step": 1005
},
{
"epoch": 4.592296718972896,
"grad_norm": 0.07707354315110565,
"learning_rate": 1.6007340360679835e-06,
"loss": 0.3472,
"step": 1006
},
{
"epoch": 4.5968616262482165,
"grad_norm": 0.07564998473974219,
"learning_rate": 1.56519971604423e-06,
"loss": 0.3536,
"step": 1007
},
{
"epoch": 4.601426533523538,
"grad_norm": 0.07973487309962904,
"learning_rate": 1.5300563733220997e-06,
"loss": 0.3524,
"step": 1008
},
{
"epoch": 4.605991440798859,
"grad_norm": 0.0798405736175932,
"learning_rate": 1.4953043653966125e-06,
"loss": 0.3437,
"step": 1009
},
{
"epoch": 4.61055634807418,
"grad_norm": 0.07661822938491461,
"learning_rate": 1.4609440457819201e-06,
"loss": 0.3505,
"step": 1010
},
{
"epoch": 4.615121255349501,
"grad_norm": 0.0803917140814825,
"learning_rate": 1.4269757640077474e-06,
"loss": 0.3473,
"step": 1011
},
{
"epoch": 4.619686162624822,
"grad_norm": 0.08142790095124294,
"learning_rate": 1.393399865615832e-06,
"loss": 0.356,
"step": 1012
},
{
"epoch": 4.624251069900143,
"grad_norm": 0.07941212240240511,
"learning_rate": 1.3602166921563709e-06,
"loss": 0.3469,
"step": 1013
},
{
"epoch": 4.628815977175464,
"grad_norm": 0.07567140453271615,
"learning_rate": 1.3274265811845877e-06,
"loss": 0.3565,
"step": 1014
},
{
"epoch": 4.633380884450784,
"grad_norm": 0.07693089631655367,
"learning_rate": 1.2950298662572914e-06,
"loss": 0.3511,
"step": 1015
},
{
"epoch": 4.637945791726105,
"grad_norm": 0.07649326760766532,
"learning_rate": 1.2630268769294695e-06,
"loss": 0.3488,
"step": 1016
},
{
"epoch": 4.642510699001426,
"grad_norm": 0.07907653808032476,
"learning_rate": 1.2314179387509451e-06,
"loss": 0.3498,
"step": 1017
},
{
"epoch": 4.647075606276747,
"grad_norm": 0.08218341042555163,
"learning_rate": 1.2002033732630624e-06,
"loss": 0.3456,
"step": 1018
},
{
"epoch": 4.651640513552069,
"grad_norm": 0.078114066869552,
"learning_rate": 1.169383497995411e-06,
"loss": 0.3519,
"step": 1019
},
{
"epoch": 4.65620542082739,
"grad_norm": 0.08045459715620887,
"learning_rate": 1.1389586264626141e-06,
"loss": 0.3509,
"step": 1020
},
{
"epoch": 4.660770328102711,
"grad_norm": 0.07667337101962392,
"learning_rate": 1.108929068161122e-06,
"loss": 0.3497,
"step": 1021
},
{
"epoch": 4.6653352353780315,
"grad_norm": 0.08000069107101468,
"learning_rate": 1.0792951285660601e-06,
"loss": 0.3496,
"step": 1022
},
{
"epoch": 4.669900142653352,
"grad_norm": 0.07789612908736167,
"learning_rate": 1.0500571091281375e-06,
"loss": 0.3493,
"step": 1023
},
{
"epoch": 4.674465049928673,
"grad_norm": 0.08267558856337517,
"learning_rate": 1.0212153072705732e-06,
"loss": 0.3479,
"step": 1024
},
{
"epoch": 4.679029957203994,
"grad_norm": 0.07420363546139669,
"learning_rate": 9.927700163860642e-07,
"loss": 0.3533,
"step": 1025
},
{
"epoch": 4.683594864479315,
"grad_norm": 0.07447011872925986,
"learning_rate": 9.647215258338138e-07,
"loss": 0.3546,
"step": 1026
},
{
"epoch": 4.688159771754636,
"grad_norm": 0.07716250968421168,
"learning_rate": 9.370701209365784e-07,
"loss": 0.3482,
"step": 1027
},
{
"epoch": 4.692724679029957,
"grad_norm": 0.07514426209617693,
"learning_rate": 9.098160829777724e-07,
"loss": 0.3455,
"step": 1028
},
{
"epoch": 4.697289586305278,
"grad_norm": 0.079548149699182,
"learning_rate": 8.829596891985859e-07,
"loss": 0.3511,
"step": 1029
},
{
"epoch": 4.7018544935805995,
"grad_norm": 0.07934376344480484,
"learning_rate": 8.565012127951955e-07,
"loss": 0.3519,
"step": 1030
},
{
"epoch": 4.70641940085592,
"grad_norm": 0.07581124441915535,
"learning_rate": 8.304409229159804e-07,
"loss": 0.346,
"step": 1031
},
{
"epoch": 4.710984308131241,
"grad_norm": 0.07394023460690179,
"learning_rate": 8.047790846587467e-07,
"loss": 0.3533,
"step": 1032
},
{
"epoch": 4.715549215406562,
"grad_norm": 0.07412942941301379,
"learning_rate": 7.7951595906808e-07,
"loss": 0.3525,
"step": 1033
},
{
"epoch": 4.720114122681883,
"grad_norm": 0.07682082219491011,
"learning_rate": 7.546518031326644e-07,
"loss": 0.3515,
"step": 1034
},
{
"epoch": 4.724679029957204,
"grad_norm": 0.07823025700829145,
"learning_rate": 7.301868697826608e-07,
"loss": 0.3492,
"step": 1035
},
{
"epoch": 4.729243937232525,
"grad_norm": 0.0766178375837658,
"learning_rate": 7.061214078871725e-07,
"loss": 0.3519,
"step": 1036
},
{
"epoch": 4.733808844507846,
"grad_norm": 0.0760779964551882,
"learning_rate": 6.824556622516599e-07,
"loss": 0.3512,
"step": 1037
},
{
"epoch": 4.7383737517831666,
"grad_norm": 0.07486994881455176,
"learning_rate": 6.591898736154801e-07,
"loss": 0.3508,
"step": 1038
},
{
"epoch": 4.742938659058488,
"grad_norm": 0.0759371546012345,
"learning_rate": 6.363242786494539e-07,
"loss": 0.3489,
"step": 1039
},
{
"epoch": 4.747503566333809,
"grad_norm": 0.0749018727178018,
"learning_rate": 6.138591099534141e-07,
"loss": 0.3493,
"step": 1040
},
{
"epoch": 4.75206847360913,
"grad_norm": 0.0753265709026964,
"learning_rate": 5.917945960538918e-07,
"loss": 0.3466,
"step": 1041
},
{
"epoch": 4.756633380884451,
"grad_norm": 0.07840244976592427,
"learning_rate": 5.701309614017447e-07,
"loss": 0.3505,
"step": 1042
},
{
"epoch": 4.761198288159772,
"grad_norm": 0.07383075199179212,
"learning_rate": 5.488684263698929e-07,
"loss": 0.3536,
"step": 1043
},
{
"epoch": 4.765763195435093,
"grad_norm": 0.11122241090091968,
"learning_rate": 5.280072072510933e-07,
"loss": 0.3461,
"step": 1044
},
{
"epoch": 4.770328102710414,
"grad_norm": 0.0762041664722857,
"learning_rate": 5.075475162557109e-07,
"loss": 0.3506,
"step": 1045
},
{
"epoch": 4.7748930099857345,
"grad_norm": 0.07751822838032038,
"learning_rate": 4.874895615095776e-07,
"loss": 0.3492,
"step": 1046
},
{
"epoch": 4.779457917261055,
"grad_norm": 0.0719584207148954,
"learning_rate": 4.6783354705187466e-07,
"loss": 0.3466,
"step": 1047
},
{
"epoch": 4.784022824536376,
"grad_norm": 0.07737673543131036,
"learning_rate": 4.485796728330449e-07,
"loss": 0.3541,
"step": 1048
},
{
"epoch": 4.788587731811697,
"grad_norm": 0.07610517221913907,
"learning_rate": 4.29728134712768e-07,
"loss": 0.3515,
"step": 1049
},
{
"epoch": 4.793152639087019,
"grad_norm": 0.07503429819011145,
"learning_rate": 4.11279124457975e-07,
"loss": 0.3499,
"step": 1050
},
{
"epoch": 4.79771754636234,
"grad_norm": 0.07627319972442853,
"learning_rate": 3.9323282974088164e-07,
"loss": 0.3487,
"step": 1051
},
{
"epoch": 4.802282453637661,
"grad_norm": 0.07282031998275167,
"learning_rate": 3.7558943413709583e-07,
"loss": 0.3465,
"step": 1052
},
{
"epoch": 4.806847360912982,
"grad_norm": 0.07525573928294778,
"learning_rate": 3.5834911712373076e-07,
"loss": 0.3488,
"step": 1053
},
{
"epoch": 4.8114122681883025,
"grad_norm": 0.07245362406747542,
"learning_rate": 3.4151205407759736e-07,
"loss": 0.3515,
"step": 1054
},
{
"epoch": 4.815977175463623,
"grad_norm": 0.0732430683885161,
"learning_rate": 3.2507841627341e-07,
"loss": 0.3531,
"step": 1055
},
{
"epoch": 4.820542082738944,
"grad_norm": 0.0745846212106986,
"learning_rate": 3.090483708820502e-07,
"loss": 0.3497,
"step": 1056
},
{
"epoch": 4.825106990014265,
"grad_norm": 0.07347343844152147,
"learning_rate": 2.934220809688526e-07,
"loss": 0.3518,
"step": 1057
},
{
"epoch": 4.829671897289586,
"grad_norm": 0.07558186917350596,
"learning_rate": 2.7819970549197937e-07,
"loss": 0.3511,
"step": 1058
},
{
"epoch": 4.834236804564907,
"grad_norm": 0.07275618304565004,
"learning_rate": 2.63381399300755e-07,
"loss": 0.3472,
"step": 1059
},
{
"epoch": 4.838801711840228,
"grad_norm": 0.07287512649205556,
"learning_rate": 2.489673131341297e-07,
"loss": 0.3462,
"step": 1060
},
{
"epoch": 4.8433666191155496,
"grad_norm": 0.07194032638043274,
"learning_rate": 2.349575936191384e-07,
"loss": 0.3476,
"step": 1061
},
{
"epoch": 4.84793152639087,
"grad_norm": 0.07547182977690173,
"learning_rate": 2.2135238326938646e-07,
"loss": 0.3471,
"step": 1062
},
{
"epoch": 4.852496433666191,
"grad_norm": 0.07389752198521447,
"learning_rate": 2.0815182048362858e-07,
"loss": 0.3511,
"step": 1063
},
{
"epoch": 4.857061340941512,
"grad_norm": 0.07948970245950392,
"learning_rate": 1.953560395443521e-07,
"loss": 0.3515,
"step": 1064
},
{
"epoch": 4.861626248216833,
"grad_norm": 0.07428844813444034,
"learning_rate": 1.829651706164004e-07,
"loss": 0.3481,
"step": 1065
},
{
"epoch": 4.866191155492154,
"grad_norm": 0.07304235969500948,
"learning_rate": 1.7097933974566272e-07,
"loss": 0.3553,
"step": 1066
},
{
"epoch": 4.870756062767475,
"grad_norm": 0.07616979320061058,
"learning_rate": 1.5939866885778198e-07,
"loss": 0.3532,
"step": 1067
},
{
"epoch": 4.875320970042796,
"grad_norm": 0.07825400387440254,
"learning_rate": 1.4822327575692464e-07,
"loss": 0.3479,
"step": 1068
},
{
"epoch": 4.879885877318117,
"grad_norm": 0.08117745285619632,
"learning_rate": 1.374532741245682e-07,
"loss": 0.3512,
"step": 1069
},
{
"epoch": 4.884450784593438,
"grad_norm": 0.07977853089608672,
"learning_rate": 1.2708877351835569e-07,
"loss": 0.3485,
"step": 1070
},
{
"epoch": 4.889015691868759,
"grad_norm": 0.07367012787142511,
"learning_rate": 1.1712987937098519e-07,
"loss": 0.3518,
"step": 1071
},
{
"epoch": 4.89358059914408,
"grad_norm": 0.07383920187031665,
"learning_rate": 1.0757669298912199e-07,
"loss": 0.3478,
"step": 1072
},
{
"epoch": 4.898145506419401,
"grad_norm": 0.07294907888019937,
"learning_rate": 9.842931155238156e-08,
"loss": 0.3528,
"step": 1073
},
{
"epoch": 4.902710413694722,
"grad_norm": 0.07451910212338729,
"learning_rate": 8.96878281123259e-08,
"loss": 0.349,
"step": 1074
},
{
"epoch": 4.907275320970043,
"grad_norm": 0.07200140543046635,
"learning_rate": 8.135233159154431e-08,
"loss": 0.3485,
"step": 1075
},
{
"epoch": 4.911840228245364,
"grad_norm": 0.07562528202585733,
"learning_rate": 7.342290678272079e-08,
"loss": 0.3498,
"step": 1076
},
{
"epoch": 4.916405135520685,
"grad_norm": 0.07457317771382703,
"learning_rate": 6.58996343477769e-08,
"loss": 0.3537,
"step": 1077
},
{
"epoch": 4.9209700427960055,
"grad_norm": 0.07363984134436599,
"learning_rate": 5.878259081707249e-08,
"loss": 0.3543,
"step": 1078
},
{
"epoch": 4.925534950071326,
"grad_norm": 0.07252929837329791,
"learning_rate": 5.2071848588601815e-08,
"loss": 0.3473,
"step": 1079
},
{
"epoch": 4.930099857346647,
"grad_norm": 0.07518239735309995,
"learning_rate": 4.576747592726083e-08,
"loss": 0.3503,
"step": 1080
},
{
"epoch": 4.934664764621969,
"grad_norm": 0.07799050676264847,
"learning_rate": 3.9869536964167734e-08,
"loss": 0.3548,
"step": 1081
},
{
"epoch": 4.93922967189729,
"grad_norm": 0.075339876329051,
"learning_rate": 3.437809169600126e-08,
"loss": 0.3502,
"step": 1082
},
{
"epoch": 4.943794579172611,
"grad_norm": 0.07479700944498346,
"learning_rate": 2.9293195984383405e-08,
"loss": 0.3468,
"step": 1083
},
{
"epoch": 4.948359486447932,
"grad_norm": 0.07210974118530611,
"learning_rate": 2.461490155532875e-08,
"loss": 0.3507,
"step": 1084
},
{
"epoch": 4.9529243937232525,
"grad_norm": 0.07652036354657446,
"learning_rate": 2.03432559986938e-08,
"loss": 0.3476,
"step": 1085
},
{
"epoch": 4.957489300998573,
"grad_norm": 0.07127307671646077,
"learning_rate": 1.6478302767719555e-08,
"loss": 0.3504,
"step": 1086
},
{
"epoch": 4.962054208273894,
"grad_norm": 0.07366450733389644,
"learning_rate": 1.3020081178574117e-08,
"loss": 0.3488,
"step": 1087
},
{
"epoch": 4.966619115549215,
"grad_norm": 0.07323536126136222,
"learning_rate": 9.968626409948556e-09,
"loss": 0.353,
"step": 1088
},
{
"epoch": 4.971184022824536,
"grad_norm": 0.07340553491000143,
"learning_rate": 7.323969502710526e-09,
"loss": 0.3501,
"step": 1089
},
{
"epoch": 4.975748930099857,
"grad_norm": 0.07588823565965415,
"learning_rate": 5.0861373595889605e-09,
"loss": 0.349,
"step": 1090
},
{
"epoch": 4.980313837375178,
"grad_norm": 0.07676438153761625,
"learning_rate": 3.255152744885415e-09,
"loss": 0.3526,
"step": 1091
},
{
"epoch": 4.9848787446505,
"grad_norm": 0.07181335005694361,
"learning_rate": 1.831034284260902e-09,
"loss": 0.3477,
"step": 1092
},
{
"epoch": 4.9894436519258205,
"grad_norm": 0.07171739407306418,
"learning_rate": 8.137964645316132e-10,
"loss": 0.3434,
"step": 1093
},
{
"epoch": 4.994008559201141,
"grad_norm": 0.07293042506861117,
"learning_rate": 2.0344963353124969e-10,
"loss": 0.3491,
"step": 1094
},
{
"epoch": 4.998573466476462,
"grad_norm": 0.07498693212494786,
"learning_rate": 0.0,
"loss": 0.3471,
"step": 1095
},
{
"epoch": 4.998573466476462,
"step": 1095,
"total_flos": 2.8177610658514207e+19,
"train_loss": 0.41344334662777105,
"train_runtime": 239120.7765,
"train_samples_per_second": 2.345,
"train_steps_per_second": 0.005
}
],
"logging_steps": 1.0,
"max_steps": 1095,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.8177610658514207e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}