{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 2505,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011976047904191617,
"grad_norm": 54.79514950430285,
"learning_rate": 0.0,
"loss": 10.8943,
"step": 1
},
{
"epoch": 0.0023952095808383233,
"grad_norm": 56.757628879119885,
"learning_rate": 1.99203187250996e-07,
"loss": 10.89,
"step": 2
},
{
"epoch": 0.003592814371257485,
"grad_norm": 57.17581639773861,
"learning_rate": 3.98406374501992e-07,
"loss": 11.0068,
"step": 3
},
{
"epoch": 0.004790419161676647,
"grad_norm": 54.984872427364465,
"learning_rate": 5.976095617529881e-07,
"loss": 11.0276,
"step": 4
},
{
"epoch": 0.005988023952095809,
"grad_norm": 56.05619835646869,
"learning_rate": 7.96812749003984e-07,
"loss": 10.9392,
"step": 5
},
{
"epoch": 0.00718562874251497,
"grad_norm": 58.19338045642581,
"learning_rate": 9.9601593625498e-07,
"loss": 10.7936,
"step": 6
},
{
"epoch": 0.008383233532934131,
"grad_norm": 56.10308353018647,
"learning_rate": 1.1952191235059762e-06,
"loss": 10.8775,
"step": 7
},
{
"epoch": 0.009580838323353293,
"grad_norm": 58.76885285664543,
"learning_rate": 1.3944223107569721e-06,
"loss": 10.7064,
"step": 8
},
{
"epoch": 0.010778443113772455,
"grad_norm": 63.754383527769804,
"learning_rate": 1.593625498007968e-06,
"loss": 10.5962,
"step": 9
},
{
"epoch": 0.011976047904191617,
"grad_norm": 68.7490518177735,
"learning_rate": 1.7928286852589644e-06,
"loss": 10.2544,
"step": 10
},
{
"epoch": 0.013173652694610778,
"grad_norm": 79.2975478966192,
"learning_rate": 1.99203187250996e-06,
"loss": 9.3725,
"step": 11
},
{
"epoch": 0.01437125748502994,
"grad_norm": 86.03971672310577,
"learning_rate": 2.1912350597609563e-06,
"loss": 9.2187,
"step": 12
},
{
"epoch": 0.015568862275449102,
"grad_norm": 99.30779048441995,
"learning_rate": 2.3904382470119524e-06,
"loss": 8.759,
"step": 13
},
{
"epoch": 0.016766467065868262,
"grad_norm": 84.21150595610874,
"learning_rate": 2.5896414342629486e-06,
"loss": 4.794,
"step": 14
},
{
"epoch": 0.017964071856287425,
"grad_norm": 60.370641116029375,
"learning_rate": 2.7888446215139443e-06,
"loss": 3.6129,
"step": 15
},
{
"epoch": 0.019161676646706587,
"grad_norm": 53.78094632352366,
"learning_rate": 2.9880478087649404e-06,
"loss": 3.2516,
"step": 16
},
{
"epoch": 0.02035928143712575,
"grad_norm": 39.58037739760117,
"learning_rate": 3.187250996015936e-06,
"loss": 2.6104,
"step": 17
},
{
"epoch": 0.02155688622754491,
"grad_norm": 33.972168591482195,
"learning_rate": 3.3864541832669323e-06,
"loss": 2.342,
"step": 18
},
{
"epoch": 0.022754491017964073,
"grad_norm": 9.65497863737739,
"learning_rate": 3.585657370517929e-06,
"loss": 1.2909,
"step": 19
},
{
"epoch": 0.023952095808383235,
"grad_norm": 16.52898252384529,
"learning_rate": 3.7848605577689246e-06,
"loss": 1.1415,
"step": 20
},
{
"epoch": 0.025149700598802394,
"grad_norm": 5.7650467821097084,
"learning_rate": 3.98406374501992e-06,
"loss": 1.1247,
"step": 21
},
{
"epoch": 0.026347305389221556,
"grad_norm": 4.8096421357938395,
"learning_rate": 4.183266932270916e-06,
"loss": 1.0868,
"step": 22
},
{
"epoch": 0.027544910179640718,
"grad_norm": 3.4996438222698383,
"learning_rate": 4.382470119521913e-06,
"loss": 1.0,
"step": 23
},
{
"epoch": 0.02874251497005988,
"grad_norm": 3.1652486025019098,
"learning_rate": 4.581673306772908e-06,
"loss": 0.9626,
"step": 24
},
{
"epoch": 0.029940119760479042,
"grad_norm": 2.7401859792599335,
"learning_rate": 4.780876494023905e-06,
"loss": 0.8619,
"step": 25
},
{
"epoch": 0.031137724550898204,
"grad_norm": 1.97821765865516,
"learning_rate": 4.980079681274901e-06,
"loss": 0.8227,
"step": 26
},
{
"epoch": 0.032335329341317366,
"grad_norm": 1.5130707179834126,
"learning_rate": 5.179282868525897e-06,
"loss": 0.7924,
"step": 27
},
{
"epoch": 0.033532934131736525,
"grad_norm": 1.2442130553921003,
"learning_rate": 5.378486055776893e-06,
"loss": 0.7553,
"step": 28
},
{
"epoch": 0.03473053892215569,
"grad_norm": 0.9286793757204014,
"learning_rate": 5.577689243027889e-06,
"loss": 0.6935,
"step": 29
},
{
"epoch": 0.03592814371257485,
"grad_norm": 6.359963226534411,
"learning_rate": 5.776892430278884e-06,
"loss": 0.6688,
"step": 30
},
{
"epoch": 0.037125748502994015,
"grad_norm": 1.2268275277395675,
"learning_rate": 5.976095617529881e-06,
"loss": 0.6332,
"step": 31
},
{
"epoch": 0.03832335329341317,
"grad_norm": 0.81783019580707,
"learning_rate": 6.175298804780877e-06,
"loss": 0.6361,
"step": 32
},
{
"epoch": 0.03952095808383234,
"grad_norm": 0.7407153887830317,
"learning_rate": 6.374501992031872e-06,
"loss": 0.6539,
"step": 33
},
{
"epoch": 0.0407185628742515,
"grad_norm": 0.6745797482499979,
"learning_rate": 6.573705179282869e-06,
"loss": 0.6681,
"step": 34
},
{
"epoch": 0.041916167664670656,
"grad_norm": 0.5175561647539161,
"learning_rate": 6.772908366533865e-06,
"loss": 0.6006,
"step": 35
},
{
"epoch": 0.04311377245508982,
"grad_norm": 0.5378646356672169,
"learning_rate": 6.97211155378486e-06,
"loss": 0.6342,
"step": 36
},
{
"epoch": 0.04431137724550898,
"grad_norm": 0.49715188207347616,
"learning_rate": 7.171314741035858e-06,
"loss": 0.5571,
"step": 37
},
{
"epoch": 0.045508982035928146,
"grad_norm": 0.5752464046611664,
"learning_rate": 7.3705179282868534e-06,
"loss": 0.5922,
"step": 38
},
{
"epoch": 0.046706586826347304,
"grad_norm": 0.5435163951515639,
"learning_rate": 7.569721115537849e-06,
"loss": 0.629,
"step": 39
},
{
"epoch": 0.04790419161676647,
"grad_norm": 0.5134689457787763,
"learning_rate": 7.768924302788846e-06,
"loss": 0.6164,
"step": 40
},
{
"epoch": 0.04910179640718563,
"grad_norm": 0.47657749884191064,
"learning_rate": 7.96812749003984e-06,
"loss": 0.5668,
"step": 41
},
{
"epoch": 0.05029940119760479,
"grad_norm": 0.5871521418869853,
"learning_rate": 8.167330677290837e-06,
"loss": 0.562,
"step": 42
},
{
"epoch": 0.05149700598802395,
"grad_norm": 0.44041947130800496,
"learning_rate": 8.366533864541832e-06,
"loss": 0.5375,
"step": 43
},
{
"epoch": 0.05269461077844311,
"grad_norm": 0.37786020702142803,
"learning_rate": 8.565737051792829e-06,
"loss": 0.5031,
"step": 44
},
{
"epoch": 0.05389221556886228,
"grad_norm": 0.3546561169561431,
"learning_rate": 8.764940239043825e-06,
"loss": 0.4911,
"step": 45
},
{
"epoch": 0.055089820359281436,
"grad_norm": 0.3631813102100884,
"learning_rate": 8.964143426294822e-06,
"loss": 0.5171,
"step": 46
},
{
"epoch": 0.0562874251497006,
"grad_norm": 0.3753660482862505,
"learning_rate": 9.163346613545817e-06,
"loss": 0.4901,
"step": 47
},
{
"epoch": 0.05748502994011976,
"grad_norm": 0.39655673648434475,
"learning_rate": 9.362549800796813e-06,
"loss": 0.5222,
"step": 48
},
{
"epoch": 0.058682634730538925,
"grad_norm": 0.3611712803097438,
"learning_rate": 9.56175298804781e-06,
"loss": 0.4982,
"step": 49
},
{
"epoch": 0.059880239520958084,
"grad_norm": 0.33240242630794137,
"learning_rate": 9.760956175298805e-06,
"loss": 0.5162,
"step": 50
},
{
"epoch": 0.06107784431137724,
"grad_norm": 0.3439819583613628,
"learning_rate": 9.960159362549801e-06,
"loss": 0.5272,
"step": 51
},
{
"epoch": 0.06227544910179641,
"grad_norm": 0.3450488168440597,
"learning_rate": 1.0159362549800798e-05,
"loss": 0.4994,
"step": 52
},
{
"epoch": 0.06347305389221557,
"grad_norm": 0.28776800800147034,
"learning_rate": 1.0358565737051794e-05,
"loss": 0.4946,
"step": 53
},
{
"epoch": 0.06467065868263473,
"grad_norm": 0.3448764076361581,
"learning_rate": 1.055776892430279e-05,
"loss": 0.5387,
"step": 54
},
{
"epoch": 0.0658682634730539,
"grad_norm": 0.3075959613743854,
"learning_rate": 1.0756972111553786e-05,
"loss": 0.5354,
"step": 55
},
{
"epoch": 0.06706586826347305,
"grad_norm": 0.3201400478325335,
"learning_rate": 1.095617529880478e-05,
"loss": 0.4907,
"step": 56
},
{
"epoch": 0.06826347305389222,
"grad_norm": 0.30735738322741646,
"learning_rate": 1.1155378486055777e-05,
"loss": 0.5034,
"step": 57
},
{
"epoch": 0.06946107784431138,
"grad_norm": 0.2840857794073668,
"learning_rate": 1.1354581673306774e-05,
"loss": 0.4824,
"step": 58
},
{
"epoch": 0.07065868263473053,
"grad_norm": 0.2801029135458567,
"learning_rate": 1.1553784860557769e-05,
"loss": 0.4838,
"step": 59
},
{
"epoch": 0.0718562874251497,
"grad_norm": 0.266234993630981,
"learning_rate": 1.1752988047808767e-05,
"loss": 0.4617,
"step": 60
},
{
"epoch": 0.07305389221556886,
"grad_norm": 0.27251489808686596,
"learning_rate": 1.1952191235059762e-05,
"loss": 0.4697,
"step": 61
},
{
"epoch": 0.07425149700598803,
"grad_norm": 0.2563484261234609,
"learning_rate": 1.2151394422310758e-05,
"loss": 0.4522,
"step": 62
},
{
"epoch": 0.07544910179640718,
"grad_norm": 0.2506205826726324,
"learning_rate": 1.2350597609561753e-05,
"loss": 0.4635,
"step": 63
},
{
"epoch": 0.07664670658682635,
"grad_norm": 0.25702175253546583,
"learning_rate": 1.254980079681275e-05,
"loss": 0.4595,
"step": 64
},
{
"epoch": 0.07784431137724551,
"grad_norm": 0.28570191205299905,
"learning_rate": 1.2749003984063745e-05,
"loss": 0.4969,
"step": 65
},
{
"epoch": 0.07904191616766468,
"grad_norm": 0.26334450844218193,
"learning_rate": 1.2948207171314741e-05,
"loss": 0.4859,
"step": 66
},
{
"epoch": 0.08023952095808383,
"grad_norm": 0.29732192002968366,
"learning_rate": 1.3147410358565738e-05,
"loss": 0.4477,
"step": 67
},
{
"epoch": 0.081437125748503,
"grad_norm": 0.23068215041219625,
"learning_rate": 1.3346613545816733e-05,
"loss": 0.4211,
"step": 68
},
{
"epoch": 0.08263473053892216,
"grad_norm": 0.24492349941903468,
"learning_rate": 1.354581673306773e-05,
"loss": 0.453,
"step": 69
},
{
"epoch": 0.08383233532934131,
"grad_norm": 0.29283443324208586,
"learning_rate": 1.3745019920318724e-05,
"loss": 0.4474,
"step": 70
},
{
"epoch": 0.08502994011976048,
"grad_norm": 0.25341775346509604,
"learning_rate": 1.394422310756972e-05,
"loss": 0.4707,
"step": 71
},
{
"epoch": 0.08622754491017964,
"grad_norm": 0.2856091918134518,
"learning_rate": 1.4143426294820719e-05,
"loss": 0.4751,
"step": 72
},
{
"epoch": 0.08742514970059881,
"grad_norm": 0.23081477218376004,
"learning_rate": 1.4342629482071715e-05,
"loss": 0.4363,
"step": 73
},
{
"epoch": 0.08862275449101796,
"grad_norm": 0.22259343239433793,
"learning_rate": 1.454183266932271e-05,
"loss": 0.4425,
"step": 74
},
{
"epoch": 0.08982035928143713,
"grad_norm": 0.24348339684396708,
"learning_rate": 1.4741035856573707e-05,
"loss": 0.4269,
"step": 75
},
{
"epoch": 0.09101796407185629,
"grad_norm": 0.24324929818658816,
"learning_rate": 1.4940239043824702e-05,
"loss": 0.4588,
"step": 76
},
{
"epoch": 0.09221556886227544,
"grad_norm": 0.2470127106367357,
"learning_rate": 1.5139442231075698e-05,
"loss": 0.419,
"step": 77
},
{
"epoch": 0.09341317365269461,
"grad_norm": 0.24506592150454967,
"learning_rate": 1.5338645418326695e-05,
"loss": 0.4538,
"step": 78
},
{
"epoch": 0.09461077844311377,
"grad_norm": 0.2337266085209308,
"learning_rate": 1.553784860557769e-05,
"loss": 0.4509,
"step": 79
},
{
"epoch": 0.09580838323353294,
"grad_norm": 0.2505585711243096,
"learning_rate": 1.5737051792828685e-05,
"loss": 0.4327,
"step": 80
},
{
"epoch": 0.09700598802395209,
"grad_norm": 0.2258018505308189,
"learning_rate": 1.593625498007968e-05,
"loss": 0.4049,
"step": 81
},
{
"epoch": 0.09820359281437126,
"grad_norm": 0.2348517105492361,
"learning_rate": 1.6135458167330678e-05,
"loss": 0.4122,
"step": 82
},
{
"epoch": 0.09940119760479042,
"grad_norm": 0.2653494876353725,
"learning_rate": 1.6334661354581674e-05,
"loss": 0.4665,
"step": 83
},
{
"epoch": 0.10059880239520957,
"grad_norm": 0.22144726609700485,
"learning_rate": 1.653386454183267e-05,
"loss": 0.418,
"step": 84
},
{
"epoch": 0.10179640718562874,
"grad_norm": 0.2508792844760677,
"learning_rate": 1.6733067729083664e-05,
"loss": 0.4443,
"step": 85
},
{
"epoch": 0.1029940119760479,
"grad_norm": 0.2854393823455875,
"learning_rate": 1.693227091633466e-05,
"loss": 0.4195,
"step": 86
},
{
"epoch": 0.10419161676646707,
"grad_norm": 0.23606062711925102,
"learning_rate": 1.7131474103585657e-05,
"loss": 0.4243,
"step": 87
},
{
"epoch": 0.10538922155688622,
"grad_norm": 0.2521278291896226,
"learning_rate": 1.7330677290836657e-05,
"loss": 0.4297,
"step": 88
},
{
"epoch": 0.10658682634730539,
"grad_norm": 0.26420114667301314,
"learning_rate": 1.752988047808765e-05,
"loss": 0.4388,
"step": 89
},
{
"epoch": 0.10778443113772455,
"grad_norm": 0.21305136032103889,
"learning_rate": 1.7729083665338647e-05,
"loss": 0.3927,
"step": 90
},
{
"epoch": 0.1089820359281437,
"grad_norm": 0.23802281226405544,
"learning_rate": 1.7928286852589643e-05,
"loss": 0.4488,
"step": 91
},
{
"epoch": 0.11017964071856287,
"grad_norm": 0.23810508930336838,
"learning_rate": 1.812749003984064e-05,
"loss": 0.3997,
"step": 92
},
{
"epoch": 0.11137724550898204,
"grad_norm": 0.24259468568054085,
"learning_rate": 1.8326693227091633e-05,
"loss": 0.3758,
"step": 93
},
{
"epoch": 0.1125748502994012,
"grad_norm": 0.20374027655583166,
"learning_rate": 1.852589641434263e-05,
"loss": 0.407,
"step": 94
},
{
"epoch": 0.11377245508982035,
"grad_norm": 0.2601964844787908,
"learning_rate": 1.8725099601593626e-05,
"loss": 0.4595,
"step": 95
},
{
"epoch": 0.11497005988023952,
"grad_norm": 0.26250921025118357,
"learning_rate": 1.8924302788844623e-05,
"loss": 0.4231,
"step": 96
},
{
"epoch": 0.11616766467065869,
"grad_norm": 0.20571620251881487,
"learning_rate": 1.912350597609562e-05,
"loss": 0.4009,
"step": 97
},
{
"epoch": 0.11736526946107785,
"grad_norm": 0.24327139439088305,
"learning_rate": 1.9322709163346613e-05,
"loss": 0.4276,
"step": 98
},
{
"epoch": 0.118562874251497,
"grad_norm": 0.23102205091133293,
"learning_rate": 1.952191235059761e-05,
"loss": 0.4147,
"step": 99
},
{
"epoch": 0.11976047904191617,
"grad_norm": 0.24330722231406138,
"learning_rate": 1.9721115537848606e-05,
"loss": 0.3958,
"step": 100
},
{
"epoch": 0.12095808383233533,
"grad_norm": 0.21737639324281494,
"learning_rate": 1.9920318725099602e-05,
"loss": 0.3933,
"step": 101
},
{
"epoch": 0.12215568862275449,
"grad_norm": 0.2850399834495755,
"learning_rate": 2.01195219123506e-05,
"loss": 0.4511,
"step": 102
},
{
"epoch": 0.12335329341317365,
"grad_norm": 0.24656275278410664,
"learning_rate": 2.0318725099601595e-05,
"loss": 0.4115,
"step": 103
},
{
"epoch": 0.12455089820359282,
"grad_norm": 0.2595491643663737,
"learning_rate": 2.0517928286852592e-05,
"loss": 0.4418,
"step": 104
},
{
"epoch": 0.12574850299401197,
"grad_norm": 0.2656997923726184,
"learning_rate": 2.071713147410359e-05,
"loss": 0.4505,
"step": 105
},
{
"epoch": 0.12694610778443113,
"grad_norm": 0.24698935654165657,
"learning_rate": 2.0916334661354585e-05,
"loss": 0.3885,
"step": 106
},
{
"epoch": 0.1281437125748503,
"grad_norm": 0.2845153521329936,
"learning_rate": 2.111553784860558e-05,
"loss": 0.4086,
"step": 107
},
{
"epoch": 0.12934131736526946,
"grad_norm": 0.23683154513963428,
"learning_rate": 2.1314741035856575e-05,
"loss": 0.3966,
"step": 108
},
{
"epoch": 0.13053892215568863,
"grad_norm": 0.2452609886137008,
"learning_rate": 2.151394422310757e-05,
"loss": 0.4275,
"step": 109
},
{
"epoch": 0.1317365269461078,
"grad_norm": 0.24583645481257677,
"learning_rate": 2.1713147410358568e-05,
"loss": 0.4208,
"step": 110
},
{
"epoch": 0.13293413173652693,
"grad_norm": 0.2335052939985238,
"learning_rate": 2.191235059760956e-05,
"loss": 0.3814,
"step": 111
},
{
"epoch": 0.1341317365269461,
"grad_norm": 0.2809428037244834,
"learning_rate": 2.2111553784860558e-05,
"loss": 0.4117,
"step": 112
},
{
"epoch": 0.13532934131736526,
"grad_norm": 0.23545228730587056,
"learning_rate": 2.2310756972111554e-05,
"loss": 0.3839,
"step": 113
},
{
"epoch": 0.13652694610778443,
"grad_norm": 0.26897637202441166,
"learning_rate": 2.250996015936255e-05,
"loss": 0.4042,
"step": 114
},
{
"epoch": 0.1377245508982036,
"grad_norm": 0.2265361525295171,
"learning_rate": 2.2709163346613547e-05,
"loss": 0.3715,
"step": 115
},
{
"epoch": 0.13892215568862276,
"grad_norm": 0.26611127166446824,
"learning_rate": 2.290836653386454e-05,
"loss": 0.4034,
"step": 116
},
{
"epoch": 0.14011976047904193,
"grad_norm": 0.2420653470666383,
"learning_rate": 2.3107569721115537e-05,
"loss": 0.4103,
"step": 117
},
{
"epoch": 0.14131736526946106,
"grad_norm": 0.2660045426662254,
"learning_rate": 2.3306772908366534e-05,
"loss": 0.4554,
"step": 118
},
{
"epoch": 0.14251497005988023,
"grad_norm": 0.2539644168607277,
"learning_rate": 2.3505976095617534e-05,
"loss": 0.4164,
"step": 119
},
{
"epoch": 0.1437125748502994,
"grad_norm": 0.2455010355834903,
"learning_rate": 2.3705179282868527e-05,
"loss": 0.4008,
"step": 120
},
{
"epoch": 0.14491017964071856,
"grad_norm": 0.27969754850717093,
"learning_rate": 2.3904382470119523e-05,
"loss": 0.3977,
"step": 121
},
{
"epoch": 0.14610778443113773,
"grad_norm": 0.2564093731357206,
"learning_rate": 2.410358565737052e-05,
"loss": 0.4045,
"step": 122
},
{
"epoch": 0.1473053892215569,
"grad_norm": 0.2725047091003095,
"learning_rate": 2.4302788844621517e-05,
"loss": 0.45,
"step": 123
},
{
"epoch": 0.14850299401197606,
"grad_norm": 0.2683676822039239,
"learning_rate": 2.4501992031872513e-05,
"loss": 0.3734,
"step": 124
},
{
"epoch": 0.1497005988023952,
"grad_norm": 0.26483446734800303,
"learning_rate": 2.4701195219123506e-05,
"loss": 0.4299,
"step": 125
},
{
"epoch": 0.15089820359281436,
"grad_norm": 0.2906299908821503,
"learning_rate": 2.4900398406374503e-05,
"loss": 0.4099,
"step": 126
},
{
"epoch": 0.15209580838323353,
"grad_norm": 0.23050388188423987,
"learning_rate": 2.50996015936255e-05,
"loss": 0.3894,
"step": 127
},
{
"epoch": 0.1532934131736527,
"grad_norm": 0.23056102747518514,
"learning_rate": 2.5298804780876496e-05,
"loss": 0.3992,
"step": 128
},
{
"epoch": 0.15449101796407186,
"grad_norm": 0.24134668374557078,
"learning_rate": 2.549800796812749e-05,
"loss": 0.4163,
"step": 129
},
{
"epoch": 0.15568862275449102,
"grad_norm": 0.2307077465785839,
"learning_rate": 2.5697211155378486e-05,
"loss": 0.375,
"step": 130
},
{
"epoch": 0.1568862275449102,
"grad_norm": 0.24021193677640243,
"learning_rate": 2.5896414342629482e-05,
"loss": 0.3794,
"step": 131
},
{
"epoch": 0.15808383233532936,
"grad_norm": 0.2678650551381604,
"learning_rate": 2.609561752988048e-05,
"loss": 0.3858,
"step": 132
},
{
"epoch": 0.1592814371257485,
"grad_norm": 0.27236190558783313,
"learning_rate": 2.6294820717131475e-05,
"loss": 0.4135,
"step": 133
},
{
"epoch": 0.16047904191616766,
"grad_norm": 0.23485527911901088,
"learning_rate": 2.649402390438247e-05,
"loss": 0.3861,
"step": 134
},
{
"epoch": 0.16167664670658682,
"grad_norm": 0.23480089742347224,
"learning_rate": 2.6693227091633465e-05,
"loss": 0.371,
"step": 135
},
{
"epoch": 0.162874251497006,
"grad_norm": 0.2983694170319573,
"learning_rate": 2.6892430278884462e-05,
"loss": 0.4039,
"step": 136
},
{
"epoch": 0.16407185628742516,
"grad_norm": 0.29179565712508754,
"learning_rate": 2.709163346613546e-05,
"loss": 0.3959,
"step": 137
},
{
"epoch": 0.16526946107784432,
"grad_norm": 0.2513327875627995,
"learning_rate": 2.7290836653386455e-05,
"loss": 0.3777,
"step": 138
},
{
"epoch": 0.1664670658682635,
"grad_norm": 0.27047538596871695,
"learning_rate": 2.7490039840637448e-05,
"loss": 0.3734,
"step": 139
},
{
"epoch": 0.16766467065868262,
"grad_norm": 0.32761627677688937,
"learning_rate": 2.7689243027888445e-05,
"loss": 0.417,
"step": 140
},
{
"epoch": 0.1688622754491018,
"grad_norm": 0.22883525551154243,
"learning_rate": 2.788844621513944e-05,
"loss": 0.3434,
"step": 141
},
{
"epoch": 0.17005988023952096,
"grad_norm": 0.2562395606218259,
"learning_rate": 2.8087649402390438e-05,
"loss": 0.404,
"step": 142
},
{
"epoch": 0.17125748502994012,
"grad_norm": 0.29494754757019614,
"learning_rate": 2.8286852589641438e-05,
"loss": 0.4433,
"step": 143
},
{
"epoch": 0.1724550898203593,
"grad_norm": 0.25388548151598117,
"learning_rate": 2.8486055776892434e-05,
"loss": 0.4015,
"step": 144
},
{
"epoch": 0.17365269461077845,
"grad_norm": 0.26204950338557254,
"learning_rate": 2.868525896414343e-05,
"loss": 0.3722,
"step": 145
},
{
"epoch": 0.17485029940119762,
"grad_norm": 0.2513952618022329,
"learning_rate": 2.8884462151394427e-05,
"loss": 0.388,
"step": 146
},
{
"epoch": 0.17604790419161676,
"grad_norm": 0.25016047670377756,
"learning_rate": 2.908366533864542e-05,
"loss": 0.3713,
"step": 147
},
{
"epoch": 0.17724550898203592,
"grad_norm": 0.25453929166897055,
"learning_rate": 2.9282868525896417e-05,
"loss": 0.3895,
"step": 148
},
{
"epoch": 0.1784431137724551,
"grad_norm": 0.24307840406104905,
"learning_rate": 2.9482071713147414e-05,
"loss": 0.3725,
"step": 149
},
{
"epoch": 0.17964071856287425,
"grad_norm": 0.2191241248643133,
"learning_rate": 2.968127490039841e-05,
"loss": 0.3482,
"step": 150
},
{
"epoch": 0.18083832335329342,
"grad_norm": 0.25693345565909925,
"learning_rate": 2.9880478087649403e-05,
"loss": 0.4163,
"step": 151
},
{
"epoch": 0.18203592814371258,
"grad_norm": 0.23308707341873378,
"learning_rate": 3.00796812749004e-05,
"loss": 0.3819,
"step": 152
},
{
"epoch": 0.18323353293413175,
"grad_norm": 0.2386056331815225,
"learning_rate": 3.0278884462151397e-05,
"loss": 0.3998,
"step": 153
},
{
"epoch": 0.1844311377245509,
"grad_norm": 0.2867245204720249,
"learning_rate": 3.0478087649402393e-05,
"loss": 0.3839,
"step": 154
},
{
"epoch": 0.18562874251497005,
"grad_norm": 0.21748406693520597,
"learning_rate": 3.067729083665339e-05,
"loss": 0.366,
"step": 155
},
{
"epoch": 0.18682634730538922,
"grad_norm": 0.3219383050512995,
"learning_rate": 3.0876494023904386e-05,
"loss": 0.4023,
"step": 156
},
{
"epoch": 0.18802395209580838,
"grad_norm": 0.2500451387335218,
"learning_rate": 3.107569721115538e-05,
"loss": 0.3844,
"step": 157
},
{
"epoch": 0.18922155688622755,
"grad_norm": 0.27968656589063207,
"learning_rate": 3.127490039840637e-05,
"loss": 0.3935,
"step": 158
},
{
"epoch": 0.19041916167664671,
"grad_norm": 0.3491618657871612,
"learning_rate": 3.147410358565737e-05,
"loss": 0.4024,
"step": 159
},
{
"epoch": 0.19161676646706588,
"grad_norm": 0.30633369512175473,
"learning_rate": 3.1673306772908366e-05,
"loss": 0.3841,
"step": 160
},
{
"epoch": 0.19281437125748502,
"grad_norm": 0.26102067418156066,
"learning_rate": 3.187250996015936e-05,
"loss": 0.3625,
"step": 161
},
{
"epoch": 0.19401197604790418,
"grad_norm": 0.23661664847118677,
"learning_rate": 3.207171314741036e-05,
"loss": 0.383,
"step": 162
},
{
"epoch": 0.19520958083832335,
"grad_norm": 0.29091445392052667,
"learning_rate": 3.2270916334661356e-05,
"loss": 0.4026,
"step": 163
},
{
"epoch": 0.19640718562874251,
"grad_norm": 0.27550623923498735,
"learning_rate": 3.247011952191235e-05,
"loss": 0.3868,
"step": 164
},
{
"epoch": 0.19760479041916168,
"grad_norm": 0.2892918037074825,
"learning_rate": 3.266932270916335e-05,
"loss": 0.3966,
"step": 165
},
{
"epoch": 0.19880239520958085,
"grad_norm": 0.2483048642585193,
"learning_rate": 3.2868525896414345e-05,
"loss": 0.339,
"step": 166
},
{
"epoch": 0.2,
"grad_norm": 0.2342647267903194,
"learning_rate": 3.306772908366534e-05,
"loss": 0.3765,
"step": 167
},
{
"epoch": 0.20119760479041915,
"grad_norm": 0.32883189069564467,
"learning_rate": 3.326693227091633e-05,
"loss": 0.3954,
"step": 168
},
{
"epoch": 0.20239520958083831,
"grad_norm": 0.2697533731408389,
"learning_rate": 3.346613545816733e-05,
"loss": 0.385,
"step": 169
},
{
"epoch": 0.20359281437125748,
"grad_norm": 0.31437525994470433,
"learning_rate": 3.3665338645418325e-05,
"loss": 0.3814,
"step": 170
},
{
"epoch": 0.20479041916167665,
"grad_norm": 0.3278365243812927,
"learning_rate": 3.386454183266932e-05,
"loss": 0.3901,
"step": 171
},
{
"epoch": 0.2059880239520958,
"grad_norm": 0.2891349012548104,
"learning_rate": 3.406374501992032e-05,
"loss": 0.3664,
"step": 172
},
{
"epoch": 0.20718562874251498,
"grad_norm": 0.28167731846072225,
"learning_rate": 3.4262948207171314e-05,
"loss": 0.3758,
"step": 173
},
{
"epoch": 0.20838323353293414,
"grad_norm": 0.30558071336770487,
"learning_rate": 3.446215139442232e-05,
"loss": 0.3634,
"step": 174
},
{
"epoch": 0.20958083832335328,
"grad_norm": 0.2979910431008939,
"learning_rate": 3.4661354581673314e-05,
"loss": 0.3728,
"step": 175
},
{
"epoch": 0.21077844311377245,
"grad_norm": 0.26161991698984943,
"learning_rate": 3.4860557768924304e-05,
"loss": 0.3595,
"step": 176
},
{
"epoch": 0.2119760479041916,
"grad_norm": 0.2922755051205938,
"learning_rate": 3.50597609561753e-05,
"loss": 0.3437,
"step": 177
},
{
"epoch": 0.21317365269461078,
"grad_norm": 0.32478914544732235,
"learning_rate": 3.52589641434263e-05,
"loss": 0.402,
"step": 178
},
{
"epoch": 0.21437125748502994,
"grad_norm": 0.28170489175067986,
"learning_rate": 3.5458167330677294e-05,
"loss": 0.382,
"step": 179
},
{
"epoch": 0.2155688622754491,
"grad_norm": 0.26382087617189154,
"learning_rate": 3.565737051792829e-05,
"loss": 0.355,
"step": 180
},
{
"epoch": 0.21676646706586827,
"grad_norm": 0.25260444156764983,
"learning_rate": 3.585657370517929e-05,
"loss": 0.3833,
"step": 181
},
{
"epoch": 0.2179640718562874,
"grad_norm": 0.3083664195261954,
"learning_rate": 3.6055776892430283e-05,
"loss": 0.3763,
"step": 182
},
{
"epoch": 0.21916167664670658,
"grad_norm": 0.23722968308922246,
"learning_rate": 3.625498007968128e-05,
"loss": 0.3756,
"step": 183
},
{
"epoch": 0.22035928143712574,
"grad_norm": 0.3155849038267966,
"learning_rate": 3.6454183266932277e-05,
"loss": 0.3829,
"step": 184
},
{
"epoch": 0.2215568862275449,
"grad_norm": 0.25673147930496615,
"learning_rate": 3.6653386454183266e-05,
"loss": 0.3763,
"step": 185
},
{
"epoch": 0.22275449101796407,
"grad_norm": 0.3143133170451701,
"learning_rate": 3.685258964143426e-05,
"loss": 0.4085,
"step": 186
},
{
"epoch": 0.22395209580838324,
"grad_norm": 0.311884306930222,
"learning_rate": 3.705179282868526e-05,
"loss": 0.3849,
"step": 187
},
{
"epoch": 0.2251497005988024,
"grad_norm": 0.3735540149800909,
"learning_rate": 3.7250996015936256e-05,
"loss": 0.3886,
"step": 188
},
{
"epoch": 0.22634730538922157,
"grad_norm": 0.2688101816137173,
"learning_rate": 3.745019920318725e-05,
"loss": 0.3974,
"step": 189
},
{
"epoch": 0.2275449101796407,
"grad_norm": 0.25403189254810266,
"learning_rate": 3.764940239043825e-05,
"loss": 0.3665,
"step": 190
},
{
"epoch": 0.22874251497005987,
"grad_norm": 0.2887056785721433,
"learning_rate": 3.7848605577689246e-05,
"loss": 0.3683,
"step": 191
},
{
"epoch": 0.22994011976047904,
"grad_norm": 0.26208585486857516,
"learning_rate": 3.804780876494024e-05,
"loss": 0.3787,
"step": 192
},
{
"epoch": 0.2311377245508982,
"grad_norm": 0.3062431402255136,
"learning_rate": 3.824701195219124e-05,
"loss": 0.3764,
"step": 193
},
{
"epoch": 0.23233532934131737,
"grad_norm": 0.30126495686102084,
"learning_rate": 3.844621513944223e-05,
"loss": 0.3869,
"step": 194
},
{
"epoch": 0.23353293413173654,
"grad_norm": 0.3222757305414551,
"learning_rate": 3.8645418326693225e-05,
"loss": 0.3972,
"step": 195
},
{
"epoch": 0.2347305389221557,
"grad_norm": 0.2829300681247945,
"learning_rate": 3.884462151394422e-05,
"loss": 0.3823,
"step": 196
},
{
"epoch": 0.23592814371257484,
"grad_norm": 0.31063599147941956,
"learning_rate": 3.904382470119522e-05,
"loss": 0.4,
"step": 197
},
{
"epoch": 0.237125748502994,
"grad_norm": 0.3036834141942823,
"learning_rate": 3.9243027888446215e-05,
"loss": 0.3703,
"step": 198
},
{
"epoch": 0.23832335329341317,
"grad_norm": 0.2686776388010693,
"learning_rate": 3.944223107569721e-05,
"loss": 0.3779,
"step": 199
},
{
"epoch": 0.23952095808383234,
"grad_norm": 0.3447259363814691,
"learning_rate": 3.964143426294821e-05,
"loss": 0.3617,
"step": 200
},
{
"epoch": 0.2407185628742515,
"grad_norm": 0.3139206077088481,
"learning_rate": 3.9840637450199205e-05,
"loss": 0.3866,
"step": 201
},
{
"epoch": 0.24191616766467067,
"grad_norm": 0.3486607243770188,
"learning_rate": 4.00398406374502e-05,
"loss": 0.3781,
"step": 202
},
{
"epoch": 0.24311377245508983,
"grad_norm": 0.4387160212073611,
"learning_rate": 4.02390438247012e-05,
"loss": 0.4146,
"step": 203
},
{
"epoch": 0.24431137724550897,
"grad_norm": 0.37377648749560133,
"learning_rate": 4.043824701195219e-05,
"loss": 0.4041,
"step": 204
},
{
"epoch": 0.24550898203592814,
"grad_norm": 0.32125360018168553,
"learning_rate": 4.063745019920319e-05,
"loss": 0.3565,
"step": 205
},
{
"epoch": 0.2467065868263473,
"grad_norm": 0.49967816642495066,
"learning_rate": 4.083665338645419e-05,
"loss": 0.3614,
"step": 206
},
{
"epoch": 0.24790419161676647,
"grad_norm": 0.5402473223311781,
"learning_rate": 4.1035856573705184e-05,
"loss": 0.422,
"step": 207
},
{
"epoch": 0.24910179640718563,
"grad_norm": 0.32821153428425537,
"learning_rate": 4.123505976095618e-05,
"loss": 0.3696,
"step": 208
},
{
"epoch": 0.2502994011976048,
"grad_norm": 0.4889950657295354,
"learning_rate": 4.143426294820718e-05,
"loss": 0.3687,
"step": 209
},
{
"epoch": 0.25149700598802394,
"grad_norm": 0.2791491581156364,
"learning_rate": 4.1633466135458174e-05,
"loss": 0.3697,
"step": 210
},
{
"epoch": 0.25269461077844313,
"grad_norm": 0.38714745872721174,
"learning_rate": 4.183266932270917e-05,
"loss": 0.3714,
"step": 211
},
{
"epoch": 0.25389221556886227,
"grad_norm": 0.3197661019061884,
"learning_rate": 4.203187250996016e-05,
"loss": 0.4045,
"step": 212
},
{
"epoch": 0.25508982035928146,
"grad_norm": 0.27463464296136614,
"learning_rate": 4.223107569721116e-05,
"loss": 0.3472,
"step": 213
},
{
"epoch": 0.2562874251497006,
"grad_norm": 0.3067353010634746,
"learning_rate": 4.243027888446215e-05,
"loss": 0.3521,
"step": 214
},
{
"epoch": 0.25748502994011974,
"grad_norm": 0.2601882806440002,
"learning_rate": 4.262948207171315e-05,
"loss": 0.3902,
"step": 215
},
{
"epoch": 0.25868263473053893,
"grad_norm": 0.30191260321748786,
"learning_rate": 4.2828685258964146e-05,
"loss": 0.3422,
"step": 216
},
{
"epoch": 0.25988023952095807,
"grad_norm": 0.27646465860757347,
"learning_rate": 4.302788844621514e-05,
"loss": 0.3281,
"step": 217
},
{
"epoch": 0.26107784431137726,
"grad_norm": 0.24329089028179018,
"learning_rate": 4.322709163346614e-05,
"loss": 0.3415,
"step": 218
},
{
"epoch": 0.2622754491017964,
"grad_norm": 0.35475835958644286,
"learning_rate": 4.3426294820717136e-05,
"loss": 0.3791,
"step": 219
},
{
"epoch": 0.2634730538922156,
"grad_norm": 0.25212010587890915,
"learning_rate": 4.362549800796813e-05,
"loss": 0.4022,
"step": 220
},
{
"epoch": 0.26467065868263473,
"grad_norm": 0.2831847020053301,
"learning_rate": 4.382470119521912e-05,
"loss": 0.3681,
"step": 221
},
{
"epoch": 0.26586826347305387,
"grad_norm": 0.36699845680029125,
"learning_rate": 4.402390438247012e-05,
"loss": 0.3702,
"step": 222
},
{
"epoch": 0.26706586826347306,
"grad_norm": 0.27661067247010346,
"learning_rate": 4.4223107569721116e-05,
"loss": 0.361,
"step": 223
},
{
"epoch": 0.2682634730538922,
"grad_norm": 0.31768517551188696,
"learning_rate": 4.442231075697211e-05,
"loss": 0.3506,
"step": 224
},
{
"epoch": 0.2694610778443114,
"grad_norm": 0.29488428807562567,
"learning_rate": 4.462151394422311e-05,
"loss": 0.3723,
"step": 225
},
{
"epoch": 0.27065868263473053,
"grad_norm": 0.35105607890298585,
"learning_rate": 4.4820717131474105e-05,
"loss": 0.3496,
"step": 226
},
{
"epoch": 0.2718562874251497,
"grad_norm": 0.30116222927662106,
"learning_rate": 4.50199203187251e-05,
"loss": 0.3823,
"step": 227
},
{
"epoch": 0.27305389221556886,
"grad_norm": 0.28191884282125973,
"learning_rate": 4.52191235059761e-05,
"loss": 0.3507,
"step": 228
},
{
"epoch": 0.274251497005988,
"grad_norm": 0.318223781522659,
"learning_rate": 4.5418326693227095e-05,
"loss": 0.3566,
"step": 229
},
{
"epoch": 0.2754491017964072,
"grad_norm": 0.2837650647910211,
"learning_rate": 4.561752988047809e-05,
"loss": 0.3704,
"step": 230
},
{
"epoch": 0.27664670658682633,
"grad_norm": 0.2991189774788242,
"learning_rate": 4.581673306772908e-05,
"loss": 0.3899,
"step": 231
},
{
"epoch": 0.2778443113772455,
"grad_norm": 0.2691288082656912,
"learning_rate": 4.601593625498008e-05,
"loss": 0.361,
"step": 232
},
{
"epoch": 0.27904191616766466,
"grad_norm": 0.2821573134088317,
"learning_rate": 4.6215139442231074e-05,
"loss": 0.3555,
"step": 233
},
{
"epoch": 0.28023952095808385,
"grad_norm": 0.3108148573988189,
"learning_rate": 4.641434262948207e-05,
"loss": 0.367,
"step": 234
},
{
"epoch": 0.281437125748503,
"grad_norm": 0.2869639004458061,
"learning_rate": 4.661354581673307e-05,
"loss": 0.3812,
"step": 235
},
{
"epoch": 0.28263473053892213,
"grad_norm": 0.3318259221407622,
"learning_rate": 4.6812749003984064e-05,
"loss": 0.3839,
"step": 236
},
{
"epoch": 0.2838323353293413,
"grad_norm": 0.2544786596273584,
"learning_rate": 4.701195219123507e-05,
"loss": 0.334,
"step": 237
},
{
"epoch": 0.28502994011976046,
"grad_norm": 0.27802647034447453,
"learning_rate": 4.721115537848606e-05,
"loss": 0.3507,
"step": 238
},
{
"epoch": 0.28622754491017965,
"grad_norm": 0.29997412959933245,
"learning_rate": 4.7410358565737054e-05,
"loss": 0.3757,
"step": 239
},
{
"epoch": 0.2874251497005988,
"grad_norm": 0.2772639858832257,
"learning_rate": 4.760956175298805e-05,
"loss": 0.393,
"step": 240
},
{
"epoch": 0.288622754491018,
"grad_norm": 0.3002475623210769,
"learning_rate": 4.780876494023905e-05,
"loss": 0.3552,
"step": 241
},
{
"epoch": 0.2898203592814371,
"grad_norm": 0.29877950632045425,
"learning_rate": 4.8007968127490044e-05,
"loss": 0.3755,
"step": 242
},
{
"epoch": 0.29101796407185626,
"grad_norm": 0.2857917900267476,
"learning_rate": 4.820717131474104e-05,
"loss": 0.3578,
"step": 243
},
{
"epoch": 0.29221556886227545,
"grad_norm": 0.3595218483012504,
"learning_rate": 4.840637450199204e-05,
"loss": 0.3696,
"step": 244
},
{
"epoch": 0.2934131736526946,
"grad_norm": 0.3066662194389393,
"learning_rate": 4.860557768924303e-05,
"loss": 0.3923,
"step": 245
},
{
"epoch": 0.2946107784431138,
"grad_norm": 0.3096175430227455,
"learning_rate": 4.880478087649403e-05,
"loss": 0.4043,
"step": 246
},
{
"epoch": 0.2958083832335329,
"grad_norm": 0.2830917685627784,
"learning_rate": 4.9003984063745026e-05,
"loss": 0.3564,
"step": 247
},
{
"epoch": 0.2970059880239521,
"grad_norm": 0.29475276986208476,
"learning_rate": 4.9203187250996016e-05,
"loss": 0.3555,
"step": 248
},
{
"epoch": 0.29820359281437125,
"grad_norm": 0.25276960569078416,
"learning_rate": 4.940239043824701e-05,
"loss": 0.3419,
"step": 249
},
{
"epoch": 0.2994011976047904,
"grad_norm": 0.3191184261650036,
"learning_rate": 4.960159362549801e-05,
"loss": 0.4055,
"step": 250
},
{
"epoch": 0.3005988023952096,
"grad_norm": 0.44512386497717127,
"learning_rate": 4.9800796812749006e-05,
"loss": 0.3777,
"step": 251
},
{
"epoch": 0.3017964071856287,
"grad_norm": 0.25451628111626934,
"learning_rate": 5e-05,
"loss": 0.3897,
"step": 252
},
{
"epoch": 0.3029940119760479,
"grad_norm": 0.2911042893361434,
"learning_rate": 4.997781721384206e-05,
"loss": 0.3409,
"step": 253
},
{
"epoch": 0.30419161676646705,
"grad_norm": 0.3402621031736664,
"learning_rate": 4.995563442768412e-05,
"loss": 0.3556,
"step": 254
},
{
"epoch": 0.30538922155688625,
"grad_norm": 0.3227376600910865,
"learning_rate": 4.993345164152618e-05,
"loss": 0.3865,
"step": 255
},
{
"epoch": 0.3065868263473054,
"grad_norm": 0.33371382715432174,
"learning_rate": 4.9911268855368236e-05,
"loss": 0.3791,
"step": 256
},
{
"epoch": 0.3077844311377245,
"grad_norm": 0.3947159866157393,
"learning_rate": 4.9889086069210295e-05,
"loss": 0.3443,
"step": 257
},
{
"epoch": 0.3089820359281437,
"grad_norm": 0.2978274316602382,
"learning_rate": 4.986690328305235e-05,
"loss": 0.3536,
"step": 258
},
{
"epoch": 0.31017964071856285,
"grad_norm": 0.24413658144309386,
"learning_rate": 4.984472049689442e-05,
"loss": 0.3424,
"step": 259
},
{
"epoch": 0.31137724550898205,
"grad_norm": 0.3197615863472984,
"learning_rate": 4.982253771073647e-05,
"loss": 0.3631,
"step": 260
},
{
"epoch": 0.3125748502994012,
"grad_norm": 0.2703820080949831,
"learning_rate": 4.980035492457853e-05,
"loss": 0.3603,
"step": 261
},
{
"epoch": 0.3137724550898204,
"grad_norm": 0.3281548205148847,
"learning_rate": 4.977817213842059e-05,
"loss": 0.3838,
"step": 262
},
{
"epoch": 0.3149700598802395,
"grad_norm": 0.3053033751239081,
"learning_rate": 4.9755989352262645e-05,
"loss": 0.351,
"step": 263
},
{
"epoch": 0.3161676646706587,
"grad_norm": 0.291930345866699,
"learning_rate": 4.973380656610471e-05,
"loss": 0.3474,
"step": 264
},
{
"epoch": 0.31736526946107785,
"grad_norm": 0.30906042494609376,
"learning_rate": 4.971162377994676e-05,
"loss": 0.3829,
"step": 265
},
{
"epoch": 0.318562874251497,
"grad_norm": 0.31301345983850587,
"learning_rate": 4.968944099378882e-05,
"loss": 0.3938,
"step": 266
},
{
"epoch": 0.3197604790419162,
"grad_norm": 0.26755263285291625,
"learning_rate": 4.966725820763088e-05,
"loss": 0.3806,
"step": 267
},
{
"epoch": 0.3209580838323353,
"grad_norm": 0.3479320600644947,
"learning_rate": 4.9645075421472944e-05,
"loss": 0.3756,
"step": 268
},
{
"epoch": 0.3221556886227545,
"grad_norm": 0.27363040010425177,
"learning_rate": 4.9622892635315e-05,
"loss": 0.3605,
"step": 269
},
{
"epoch": 0.32335329341317365,
"grad_norm": 0.3232584182797657,
"learning_rate": 4.9600709849157054e-05,
"loss": 0.3576,
"step": 270
},
{
"epoch": 0.32455089820359284,
"grad_norm": 0.2831176616772001,
"learning_rate": 4.957852706299911e-05,
"loss": 0.3524,
"step": 271
},
{
"epoch": 0.325748502994012,
"grad_norm": 0.29730557305641886,
"learning_rate": 4.955634427684117e-05,
"loss": 0.3905,
"step": 272
},
{
"epoch": 0.3269461077844311,
"grad_norm": 0.3013638186445516,
"learning_rate": 4.9534161490683236e-05,
"loss": 0.3743,
"step": 273
},
{
"epoch": 0.3281437125748503,
"grad_norm": 0.25959795964177507,
"learning_rate": 4.9511978704525295e-05,
"loss": 0.3795,
"step": 274
},
{
"epoch": 0.32934131736526945,
"grad_norm": 0.318106717436789,
"learning_rate": 4.9489795918367346e-05,
"loss": 0.3618,
"step": 275
},
{
"epoch": 0.33053892215568864,
"grad_norm": 0.25079676175984195,
"learning_rate": 4.9467613132209405e-05,
"loss": 0.3395,
"step": 276
},
{
"epoch": 0.3317365269461078,
"grad_norm": 0.34120551826551904,
"learning_rate": 4.944543034605147e-05,
"loss": 0.3659,
"step": 277
},
{
"epoch": 0.332934131736527,
"grad_norm": 0.27347086900221873,
"learning_rate": 4.942324755989353e-05,
"loss": 0.3791,
"step": 278
},
{
"epoch": 0.3341317365269461,
"grad_norm": 0.40281111239008693,
"learning_rate": 4.940106477373559e-05,
"loss": 0.3694,
"step": 279
},
{
"epoch": 0.33532934131736525,
"grad_norm": 0.24248307378841152,
"learning_rate": 4.937888198757764e-05,
"loss": 0.3549,
"step": 280
},
{
"epoch": 0.33652694610778444,
"grad_norm": 0.36236276947990886,
"learning_rate": 4.93566992014197e-05,
"loss": 0.3843,
"step": 281
},
{
"epoch": 0.3377245508982036,
"grad_norm": 0.263998277233993,
"learning_rate": 4.933451641526176e-05,
"loss": 0.3486,
"step": 282
},
{
"epoch": 0.3389221556886228,
"grad_norm": 0.31788287788609065,
"learning_rate": 4.931233362910382e-05,
"loss": 0.3556,
"step": 283
},
{
"epoch": 0.3401197604790419,
"grad_norm": 0.28269561504838503,
"learning_rate": 4.929015084294588e-05,
"loss": 0.3639,
"step": 284
},
{
"epoch": 0.3413173652694611,
"grad_norm": 0.34214456160727974,
"learning_rate": 4.926796805678793e-05,
"loss": 0.3697,
"step": 285
},
{
"epoch": 0.34251497005988024,
"grad_norm": 0.24532922840074284,
"learning_rate": 4.9245785270629996e-05,
"loss": 0.327,
"step": 286
},
{
"epoch": 0.3437125748502994,
"grad_norm": 0.3128379521933796,
"learning_rate": 4.9223602484472054e-05,
"loss": 0.4017,
"step": 287
},
{
"epoch": 0.3449101796407186,
"grad_norm": 0.32039392160840563,
"learning_rate": 4.920141969831411e-05,
"loss": 0.373,
"step": 288
},
{
"epoch": 0.3461077844311377,
"grad_norm": 0.2534493690563105,
"learning_rate": 4.917923691215617e-05,
"loss": 0.3503,
"step": 289
},
{
"epoch": 0.3473053892215569,
"grad_norm": 0.278692143683875,
"learning_rate": 4.915705412599822e-05,
"loss": 0.3572,
"step": 290
},
{
"epoch": 0.34850299401197604,
"grad_norm": 0.28419531027646033,
"learning_rate": 4.913487133984029e-05,
"loss": 0.3313,
"step": 291
},
{
"epoch": 0.34970059880239523,
"grad_norm": 0.3014624788139745,
"learning_rate": 4.9112688553682346e-05,
"loss": 0.396,
"step": 292
},
{
"epoch": 0.3508982035928144,
"grad_norm": 0.2899830415222191,
"learning_rate": 4.9090505767524405e-05,
"loss": 0.3627,
"step": 293
},
{
"epoch": 0.3520958083832335,
"grad_norm": 0.2554583317244621,
"learning_rate": 4.906832298136646e-05,
"loss": 0.3548,
"step": 294
},
{
"epoch": 0.3532934131736527,
"grad_norm": 0.30460137118714187,
"learning_rate": 4.904614019520852e-05,
"loss": 0.3675,
"step": 295
},
{
"epoch": 0.35449101796407184,
"grad_norm": 0.2873662181262538,
"learning_rate": 4.902395740905058e-05,
"loss": 0.3646,
"step": 296
},
{
"epoch": 0.35568862275449104,
"grad_norm": 0.2758033637744926,
"learning_rate": 4.900177462289264e-05,
"loss": 0.3642,
"step": 297
},
{
"epoch": 0.3568862275449102,
"grad_norm": 0.3006562348331249,
"learning_rate": 4.89795918367347e-05,
"loss": 0.3786,
"step": 298
},
{
"epoch": 0.35808383233532937,
"grad_norm": 0.2410999145837626,
"learning_rate": 4.8957409050576755e-05,
"loss": 0.3349,
"step": 299
},
{
"epoch": 0.3592814371257485,
"grad_norm": 0.30874942651915754,
"learning_rate": 4.8935226264418814e-05,
"loss": 0.3863,
"step": 300
},
{
"epoch": 0.36047904191616764,
"grad_norm": 0.2647698789531625,
"learning_rate": 4.891304347826087e-05,
"loss": 0.3743,
"step": 301
},
{
"epoch": 0.36167664670658684,
"grad_norm": 0.32253868013701426,
"learning_rate": 4.889086069210293e-05,
"loss": 0.3428,
"step": 302
},
{
"epoch": 0.362874251497006,
"grad_norm": 0.25292512953191515,
"learning_rate": 4.886867790594499e-05,
"loss": 0.3723,
"step": 303
},
{
"epoch": 0.36407185628742517,
"grad_norm": 0.2934751971691142,
"learning_rate": 4.884649511978705e-05,
"loss": 0.3654,
"step": 304
},
{
"epoch": 0.3652694610778443,
"grad_norm": 0.2583452188786837,
"learning_rate": 4.8824312333629106e-05,
"loss": 0.3489,
"step": 305
},
{
"epoch": 0.3664670658682635,
"grad_norm": 0.230421751922922,
"learning_rate": 4.8802129547471164e-05,
"loss": 0.3584,
"step": 306
},
{
"epoch": 0.36766467065868264,
"grad_norm": 0.2621796723066993,
"learning_rate": 4.877994676131322e-05,
"loss": 0.3466,
"step": 307
},
{
"epoch": 0.3688622754491018,
"grad_norm": 0.2917892754254444,
"learning_rate": 4.875776397515528e-05,
"loss": 0.395,
"step": 308
},
{
"epoch": 0.37005988023952097,
"grad_norm": 0.2491979541695299,
"learning_rate": 4.873558118899734e-05,
"loss": 0.3735,
"step": 309
},
{
"epoch": 0.3712574850299401,
"grad_norm": 0.26553989562214075,
"learning_rate": 4.87133984028394e-05,
"loss": 0.3705,
"step": 310
},
{
"epoch": 0.3724550898203593,
"grad_norm": 0.23595244339333982,
"learning_rate": 4.8691215616681456e-05,
"loss": 0.3276,
"step": 311
},
{
"epoch": 0.37365269461077844,
"grad_norm": 0.245160648915256,
"learning_rate": 4.8669032830523515e-05,
"loss": 0.3402,
"step": 312
},
{
"epoch": 0.37485029940119763,
"grad_norm": 0.2542723683179529,
"learning_rate": 4.864685004436558e-05,
"loss": 0.3361,
"step": 313
},
{
"epoch": 0.37604790419161677,
"grad_norm": 0.2928084987526323,
"learning_rate": 4.862466725820763e-05,
"loss": 0.3463,
"step": 314
},
{
"epoch": 0.3772455089820359,
"grad_norm": 0.2655621467330252,
"learning_rate": 4.860248447204969e-05,
"loss": 0.37,
"step": 315
},
{
"epoch": 0.3784431137724551,
"grad_norm": 0.3805249306646299,
"learning_rate": 4.858030168589175e-05,
"loss": 0.3877,
"step": 316
},
{
"epoch": 0.37964071856287424,
"grad_norm": 0.2952552074096925,
"learning_rate": 4.855811889973381e-05,
"loss": 0.3708,
"step": 317
},
{
"epoch": 0.38083832335329343,
"grad_norm": 0.3089218124284288,
"learning_rate": 4.853593611357587e-05,
"loss": 0.3706,
"step": 318
},
{
"epoch": 0.38203592814371257,
"grad_norm": 0.2630203824048498,
"learning_rate": 4.8513753327417924e-05,
"loss": 0.3601,
"step": 319
},
{
"epoch": 0.38323353293413176,
"grad_norm": 2.5443012132068596,
"learning_rate": 4.849157054125998e-05,
"loss": 0.3495,
"step": 320
},
{
"epoch": 0.3844311377245509,
"grad_norm": 0.4876896292866147,
"learning_rate": 4.846938775510204e-05,
"loss": 0.3738,
"step": 321
},
{
"epoch": 0.38562874251497004,
"grad_norm": 0.29757955906395733,
"learning_rate": 4.8447204968944106e-05,
"loss": 0.3771,
"step": 322
},
{
"epoch": 0.38682634730538923,
"grad_norm": 0.3082540731555661,
"learning_rate": 4.8425022182786164e-05,
"loss": 0.3536,
"step": 323
},
{
"epoch": 0.38802395209580837,
"grad_norm": 0.23239296400955792,
"learning_rate": 4.8402839396628216e-05,
"loss": 0.3295,
"step": 324
},
{
"epoch": 0.38922155688622756,
"grad_norm": 0.31308875887910603,
"learning_rate": 4.8380656610470274e-05,
"loss": 0.3676,
"step": 325
},
{
"epoch": 0.3904191616766467,
"grad_norm": 0.34291163441117933,
"learning_rate": 4.835847382431233e-05,
"loss": 0.3641,
"step": 326
},
{
"epoch": 0.3916167664670659,
"grad_norm": 0.25581609549090234,
"learning_rate": 4.83362910381544e-05,
"loss": 0.3476,
"step": 327
},
{
"epoch": 0.39281437125748503,
"grad_norm": 0.2974335755415904,
"learning_rate": 4.8314108251996456e-05,
"loss": 0.39,
"step": 328
},
{
"epoch": 0.39401197604790417,
"grad_norm": 0.26760243543480977,
"learning_rate": 4.829192546583851e-05,
"loss": 0.3428,
"step": 329
},
{
"epoch": 0.39520958083832336,
"grad_norm": 0.2531330960311572,
"learning_rate": 4.8269742679680566e-05,
"loss": 0.3276,
"step": 330
},
{
"epoch": 0.3964071856287425,
"grad_norm": 0.281609587457732,
"learning_rate": 4.824755989352263e-05,
"loss": 0.365,
"step": 331
},
{
"epoch": 0.3976047904191617,
"grad_norm": 0.34939855588012303,
"learning_rate": 4.822537710736469e-05,
"loss": 0.3475,
"step": 332
},
{
"epoch": 0.39880239520958083,
"grad_norm": 0.3126189360906153,
"learning_rate": 4.820319432120675e-05,
"loss": 0.3988,
"step": 333
},
{
"epoch": 0.4,
"grad_norm": 0.25272373701860695,
"learning_rate": 4.81810115350488e-05,
"loss": 0.3394,
"step": 334
},
{
"epoch": 0.40119760479041916,
"grad_norm": 0.287927354539625,
"learning_rate": 4.815882874889086e-05,
"loss": 0.3395,
"step": 335
},
{
"epoch": 0.4023952095808383,
"grad_norm": 0.2394615268069723,
"learning_rate": 4.8136645962732924e-05,
"loss": 0.3174,
"step": 336
},
{
"epoch": 0.4035928143712575,
"grad_norm": 0.22396320796680286,
"learning_rate": 4.811446317657498e-05,
"loss": 0.3757,
"step": 337
},
{
"epoch": 0.40479041916167663,
"grad_norm": 0.2598068283456728,
"learning_rate": 4.809228039041704e-05,
"loss": 0.3708,
"step": 338
},
{
"epoch": 0.4059880239520958,
"grad_norm": 0.21484724146463047,
"learning_rate": 4.807009760425909e-05,
"loss": 0.3297,
"step": 339
},
{
"epoch": 0.40718562874251496,
"grad_norm": 0.21996826556370427,
"learning_rate": 4.804791481810116e-05,
"loss": 0.3439,
"step": 340
},
{
"epoch": 0.40838323353293415,
"grad_norm": 0.24918317220372097,
"learning_rate": 4.8025732031943216e-05,
"loss": 0.3582,
"step": 341
},
{
"epoch": 0.4095808383233533,
"grad_norm": 0.22894661934823818,
"learning_rate": 4.8003549245785274e-05,
"loss": 0.3425,
"step": 342
},
{
"epoch": 0.41077844311377243,
"grad_norm": 0.23561888808015422,
"learning_rate": 4.798136645962733e-05,
"loss": 0.3214,
"step": 343
},
{
"epoch": 0.4119760479041916,
"grad_norm": 0.22969090553479712,
"learning_rate": 4.795918367346939e-05,
"loss": 0.336,
"step": 344
},
{
"epoch": 0.41317365269461076,
"grad_norm": 0.23898759941696487,
"learning_rate": 4.793700088731145e-05,
"loss": 0.3376,
"step": 345
},
{
"epoch": 0.41437125748502995,
"grad_norm": 0.1984043886594236,
"learning_rate": 4.791481810115351e-05,
"loss": 0.3202,
"step": 346
},
{
"epoch": 0.4155688622754491,
"grad_norm": 0.2515590937600679,
"learning_rate": 4.7892635314995566e-05,
"loss": 0.3337,
"step": 347
},
{
"epoch": 0.4167664670658683,
"grad_norm": 0.24834934303277903,
"learning_rate": 4.7870452528837625e-05,
"loss": 0.3921,
"step": 348
},
{
"epoch": 0.4179640718562874,
"grad_norm": 0.264487439367407,
"learning_rate": 4.784826974267968e-05,
"loss": 0.3636,
"step": 349
},
{
"epoch": 0.41916167664670656,
"grad_norm": 0.24102334832349553,
"learning_rate": 4.782608695652174e-05,
"loss": 0.3329,
"step": 350
},
{
"epoch": 0.42035928143712575,
"grad_norm": 0.23032131868616668,
"learning_rate": 4.78039041703638e-05,
"loss": 0.3295,
"step": 351
},
{
"epoch": 0.4215568862275449,
"grad_norm": 0.30501448173372464,
"learning_rate": 4.778172138420586e-05,
"loss": 0.3661,
"step": 352
},
{
"epoch": 0.4227544910179641,
"grad_norm": 0.2624522573671576,
"learning_rate": 4.775953859804792e-05,
"loss": 0.3446,
"step": 353
},
{
"epoch": 0.4239520958083832,
"grad_norm": 0.2378367853677313,
"learning_rate": 4.7737355811889975e-05,
"loss": 0.3748,
"step": 354
},
{
"epoch": 0.4251497005988024,
"grad_norm": 0.2986594669236365,
"learning_rate": 4.7715173025732034e-05,
"loss": 0.3705,
"step": 355
},
{
"epoch": 0.42634730538922155,
"grad_norm": 0.22807986193120927,
"learning_rate": 4.769299023957409e-05,
"loss": 0.3559,
"step": 356
},
{
"epoch": 0.4275449101796407,
"grad_norm": 0.21232595946375987,
"learning_rate": 4.767080745341615e-05,
"loss": 0.3455,
"step": 357
},
{
"epoch": 0.4287425149700599,
"grad_norm": 0.3145641870732241,
"learning_rate": 4.7648624667258216e-05,
"loss": 0.3512,
"step": 358
},
{
"epoch": 0.429940119760479,
"grad_norm": 0.237373067771583,
"learning_rate": 4.762644188110027e-05,
"loss": 0.3782,
"step": 359
},
{
"epoch": 0.4311377245508982,
"grad_norm": 0.27030277545262377,
"learning_rate": 4.7604259094942326e-05,
"loss": 0.353,
"step": 360
},
{
"epoch": 0.43233532934131735,
"grad_norm": 0.23028577835434236,
"learning_rate": 4.7582076308784384e-05,
"loss": 0.3439,
"step": 361
},
{
"epoch": 0.43353293413173655,
"grad_norm": 0.2621081714043022,
"learning_rate": 4.755989352262644e-05,
"loss": 0.3639,
"step": 362
},
{
"epoch": 0.4347305389221557,
"grad_norm": 0.23535017031660524,
"learning_rate": 4.753771073646851e-05,
"loss": 0.3576,
"step": 363
},
{
"epoch": 0.4359281437125748,
"grad_norm": 0.2631478777585787,
"learning_rate": 4.751552795031056e-05,
"loss": 0.3256,
"step": 364
},
{
"epoch": 0.437125748502994,
"grad_norm": 0.2220055150623633,
"learning_rate": 4.749334516415262e-05,
"loss": 0.344,
"step": 365
},
{
"epoch": 0.43832335329341315,
"grad_norm": 0.26673487997597184,
"learning_rate": 4.7471162377994677e-05,
"loss": 0.3858,
"step": 366
},
{
"epoch": 0.43952095808383235,
"grad_norm": 0.21126566499487867,
"learning_rate": 4.744897959183674e-05,
"loss": 0.329,
"step": 367
},
{
"epoch": 0.4407185628742515,
"grad_norm": 0.22518497596925183,
"learning_rate": 4.74267968056788e-05,
"loss": 0.3185,
"step": 368
},
{
"epoch": 0.4419161676646707,
"grad_norm": 0.2593667510095556,
"learning_rate": 4.740461401952085e-05,
"loss": 0.3237,
"step": 369
},
{
"epoch": 0.4431137724550898,
"grad_norm": 0.26004786108297895,
"learning_rate": 4.738243123336291e-05,
"loss": 0.3369,
"step": 370
},
{
"epoch": 0.444311377245509,
"grad_norm": 0.2894153441545581,
"learning_rate": 4.736024844720497e-05,
"loss": 0.3724,
"step": 371
},
{
"epoch": 0.44550898203592815,
"grad_norm": 0.2554994897671519,
"learning_rate": 4.7338065661047034e-05,
"loss": 0.3743,
"step": 372
},
{
"epoch": 0.4467065868263473,
"grad_norm": 0.2212870103468295,
"learning_rate": 4.731588287488909e-05,
"loss": 0.371,
"step": 373
},
{
"epoch": 0.4479041916167665,
"grad_norm": 0.269271014067454,
"learning_rate": 4.7293700088731144e-05,
"loss": 0.3692,
"step": 374
},
{
"epoch": 0.4491017964071856,
"grad_norm": 0.26869483605957956,
"learning_rate": 4.72715173025732e-05,
"loss": 0.3726,
"step": 375
},
{
"epoch": 0.4502994011976048,
"grad_norm": 0.22905751141609806,
"learning_rate": 4.724933451641527e-05,
"loss": 0.3398,
"step": 376
},
{
"epoch": 0.45149700598802395,
"grad_norm": 0.2667242228539303,
"learning_rate": 4.7227151730257326e-05,
"loss": 0.3492,
"step": 377
},
{
"epoch": 0.45269461077844314,
"grad_norm": 0.2580710025424521,
"learning_rate": 4.7204968944099384e-05,
"loss": 0.3577,
"step": 378
},
{
"epoch": 0.4538922155688623,
"grad_norm": 0.2663343241588604,
"learning_rate": 4.7182786157941436e-05,
"loss": 0.3412,
"step": 379
},
{
"epoch": 0.4550898203592814,
"grad_norm": 0.25582181153991107,
"learning_rate": 4.7160603371783495e-05,
"loss": 0.3344,
"step": 380
},
{
"epoch": 0.4562874251497006,
"grad_norm": 0.2582485326218665,
"learning_rate": 4.713842058562556e-05,
"loss": 0.337,
"step": 381
},
{
"epoch": 0.45748502994011975,
"grad_norm": 0.22205959403581788,
"learning_rate": 4.711623779946762e-05,
"loss": 0.3589,
"step": 382
},
{
"epoch": 0.45868263473053894,
"grad_norm": 0.21756986604144946,
"learning_rate": 4.7094055013309677e-05,
"loss": 0.3392,
"step": 383
},
{
"epoch": 0.4598802395209581,
"grad_norm": 0.22225332039104104,
"learning_rate": 4.707187222715173e-05,
"loss": 0.328,
"step": 384
},
{
"epoch": 0.46107784431137727,
"grad_norm": 0.22196809932355974,
"learning_rate": 4.7049689440993793e-05,
"loss": 0.3278,
"step": 385
},
{
"epoch": 0.4622754491017964,
"grad_norm": 0.22067923473342027,
"learning_rate": 4.702750665483585e-05,
"loss": 0.3596,
"step": 386
},
{
"epoch": 0.46347305389221555,
"grad_norm": 0.25718065589723405,
"learning_rate": 4.700532386867791e-05,
"loss": 0.3615,
"step": 387
},
{
"epoch": 0.46467065868263474,
"grad_norm": 0.24839297281580522,
"learning_rate": 4.698314108251997e-05,
"loss": 0.3482,
"step": 388
},
{
"epoch": 0.4658682634730539,
"grad_norm": 0.2528813829309828,
"learning_rate": 4.696095829636202e-05,
"loss": 0.3694,
"step": 389
},
{
"epoch": 0.46706586826347307,
"grad_norm": 0.25431733395319017,
"learning_rate": 4.6938775510204086e-05,
"loss": 0.3549,
"step": 390
},
{
"epoch": 0.4682634730538922,
"grad_norm": 0.26732091551633536,
"learning_rate": 4.6916592724046144e-05,
"loss": 0.3489,
"step": 391
},
{
"epoch": 0.4694610778443114,
"grad_norm": 0.25714832401571425,
"learning_rate": 4.68944099378882e-05,
"loss": 0.3711,
"step": 392
},
{
"epoch": 0.47065868263473054,
"grad_norm": 0.24617821194101752,
"learning_rate": 4.687222715173026e-05,
"loss": 0.3425,
"step": 393
},
{
"epoch": 0.4718562874251497,
"grad_norm": 0.24281764913491666,
"learning_rate": 4.685004436557231e-05,
"loss": 0.3635,
"step": 394
},
{
"epoch": 0.47305389221556887,
"grad_norm": 0.21802379566788851,
"learning_rate": 4.682786157941438e-05,
"loss": 0.3089,
"step": 395
},
{
"epoch": 0.474251497005988,
"grad_norm": 0.23618443664767785,
"learning_rate": 4.6805678793256436e-05,
"loss": 0.3417,
"step": 396
},
{
"epoch": 0.4754491017964072,
"grad_norm": 0.22735469663791008,
"learning_rate": 4.6783496007098495e-05,
"loss": 0.354,
"step": 397
},
{
"epoch": 0.47664670658682634,
"grad_norm": 0.20785438423222535,
"learning_rate": 4.676131322094055e-05,
"loss": 0.3256,
"step": 398
},
{
"epoch": 0.47784431137724553,
"grad_norm": 0.21552494921556023,
"learning_rate": 4.673913043478261e-05,
"loss": 0.3373,
"step": 399
},
{
"epoch": 0.47904191616766467,
"grad_norm": 0.21193447038651214,
"learning_rate": 4.671694764862467e-05,
"loss": 0.3328,
"step": 400
},
{
"epoch": 0.4802395209580838,
"grad_norm": 0.30267181391011794,
"learning_rate": 4.669476486246673e-05,
"loss": 0.4016,
"step": 401
},
{
"epoch": 0.481437125748503,
"grad_norm": 0.24765497367732625,
"learning_rate": 4.667258207630879e-05,
"loss": 0.3537,
"step": 402
},
{
"epoch": 0.48263473053892214,
"grad_norm": 0.22321665565774365,
"learning_rate": 4.6650399290150845e-05,
"loss": 0.3456,
"step": 403
},
{
"epoch": 0.48383233532934133,
"grad_norm": 0.21823498021471543,
"learning_rate": 4.6628216503992904e-05,
"loss": 0.3132,
"step": 404
},
{
"epoch": 0.48502994011976047,
"grad_norm": 0.2481480609865313,
"learning_rate": 4.660603371783496e-05,
"loss": 0.3439,
"step": 405
},
{
"epoch": 0.48622754491017967,
"grad_norm": 0.2548493639459095,
"learning_rate": 4.658385093167702e-05,
"loss": 0.3834,
"step": 406
},
{
"epoch": 0.4874251497005988,
"grad_norm": 0.2279941216554901,
"learning_rate": 4.656166814551908e-05,
"loss": 0.3398,
"step": 407
},
{
"epoch": 0.48862275449101794,
"grad_norm": 0.25633693599212404,
"learning_rate": 4.653948535936114e-05,
"loss": 0.3428,
"step": 408
},
{
"epoch": 0.48982035928143713,
"grad_norm": 0.2151287678371385,
"learning_rate": 4.6517302573203196e-05,
"loss": 0.3364,
"step": 409
},
{
"epoch": 0.49101796407185627,
"grad_norm": 0.23964449521493947,
"learning_rate": 4.6495119787045254e-05,
"loss": 0.3214,
"step": 410
},
{
"epoch": 0.49221556886227547,
"grad_norm": 0.2760546149132272,
"learning_rate": 4.647293700088731e-05,
"loss": 0.3679,
"step": 411
},
{
"epoch": 0.4934131736526946,
"grad_norm": 0.2309412401561408,
"learning_rate": 4.645075421472937e-05,
"loss": 0.3129,
"step": 412
},
{
"epoch": 0.4946107784431138,
"grad_norm": 0.24870422958969973,
"learning_rate": 4.642857142857143e-05,
"loss": 0.3472,
"step": 413
},
{
"epoch": 0.49580838323353293,
"grad_norm": 0.21821795940460365,
"learning_rate": 4.640638864241349e-05,
"loss": 0.3175,
"step": 414
},
{
"epoch": 0.49700598802395207,
"grad_norm": 0.30498309396108575,
"learning_rate": 4.6384205856255546e-05,
"loss": 0.3655,
"step": 415
},
{
"epoch": 0.49820359281437127,
"grad_norm": 0.24233025702398628,
"learning_rate": 4.6362023070097605e-05,
"loss": 0.3388,
"step": 416
},
{
"epoch": 0.4994011976047904,
"grad_norm": 0.285779721321345,
"learning_rate": 4.633984028393967e-05,
"loss": 0.3568,
"step": 417
},
{
"epoch": 0.5005988023952096,
"grad_norm": 0.2533726999030836,
"learning_rate": 4.631765749778172e-05,
"loss": 0.3327,
"step": 418
},
{
"epoch": 0.5017964071856288,
"grad_norm": 0.2427897295113954,
"learning_rate": 4.629547471162378e-05,
"loss": 0.3502,
"step": 419
},
{
"epoch": 0.5029940119760479,
"grad_norm": 0.2671662624396468,
"learning_rate": 4.627329192546584e-05,
"loss": 0.3507,
"step": 420
},
{
"epoch": 0.5041916167664671,
"grad_norm": 0.22989537585625808,
"learning_rate": 4.62511091393079e-05,
"loss": 0.346,
"step": 421
},
{
"epoch": 0.5053892215568863,
"grad_norm": 0.2433103892160332,
"learning_rate": 4.622892635314996e-05,
"loss": 0.3328,
"step": 422
},
{
"epoch": 0.5065868263473053,
"grad_norm": 0.21499407549309948,
"learning_rate": 4.6206743566992014e-05,
"loss": 0.3428,
"step": 423
},
{
"epoch": 0.5077844311377245,
"grad_norm": 0.2213750585606025,
"learning_rate": 4.618456078083407e-05,
"loss": 0.3535,
"step": 424
},
{
"epoch": 0.5089820359281437,
"grad_norm": 0.28475141786375585,
"learning_rate": 4.616237799467613e-05,
"loss": 0.3663,
"step": 425
},
{
"epoch": 0.5101796407185629,
"grad_norm": 0.23643795876861698,
"learning_rate": 4.6140195208518196e-05,
"loss": 0.3478,
"step": 426
},
{
"epoch": 0.511377245508982,
"grad_norm": 0.23628493029158837,
"learning_rate": 4.6118012422360254e-05,
"loss": 0.3517,
"step": 427
},
{
"epoch": 0.5125748502994012,
"grad_norm": 0.23676112900890417,
"learning_rate": 4.6095829636202306e-05,
"loss": 0.3007,
"step": 428
},
{
"epoch": 0.5137724550898204,
"grad_norm": 0.25290379380349204,
"learning_rate": 4.6073646850044364e-05,
"loss": 0.3592,
"step": 429
},
{
"epoch": 0.5149700598802395,
"grad_norm": 0.23725788893275923,
"learning_rate": 4.605146406388642e-05,
"loss": 0.3251,
"step": 430
},
{
"epoch": 0.5161676646706587,
"grad_norm": 0.24633169817736417,
"learning_rate": 4.602928127772849e-05,
"loss": 0.3601,
"step": 431
},
{
"epoch": 0.5173652694610779,
"grad_norm": 0.2009206323272319,
"learning_rate": 4.6007098491570546e-05,
"loss": 0.3493,
"step": 432
},
{
"epoch": 0.518562874251497,
"grad_norm": 0.20947880165683178,
"learning_rate": 4.59849157054126e-05,
"loss": 0.2925,
"step": 433
},
{
"epoch": 0.5197604790419161,
"grad_norm": 0.22053100268704234,
"learning_rate": 4.5962732919254656e-05,
"loss": 0.3099,
"step": 434
},
{
"epoch": 0.5209580838323353,
"grad_norm": 0.22028518369188624,
"learning_rate": 4.594055013309672e-05,
"loss": 0.3438,
"step": 435
},
{
"epoch": 0.5221556886227545,
"grad_norm": 0.2182525992688785,
"learning_rate": 4.591836734693878e-05,
"loss": 0.3338,
"step": 436
},
{
"epoch": 0.5233532934131736,
"grad_norm": 0.18821857371495546,
"learning_rate": 4.589618456078084e-05,
"loss": 0.3109,
"step": 437
},
{
"epoch": 0.5245508982035928,
"grad_norm": 0.24727919169574522,
"learning_rate": 4.58740017746229e-05,
"loss": 0.3456,
"step": 438
},
{
"epoch": 0.525748502994012,
"grad_norm": 0.3673980340835294,
"learning_rate": 4.585181898846495e-05,
"loss": 0.3506,
"step": 439
},
{
"epoch": 0.5269461077844312,
"grad_norm": 0.2070313905350641,
"learning_rate": 4.5829636202307014e-05,
"loss": 0.3509,
"step": 440
},
{
"epoch": 0.5281437125748503,
"grad_norm": 0.26354315198942513,
"learning_rate": 4.580745341614907e-05,
"loss": 0.3662,
"step": 441
},
{
"epoch": 0.5293413173652695,
"grad_norm": 0.2675382803385073,
"learning_rate": 4.578527062999113e-05,
"loss": 0.3539,
"step": 442
},
{
"epoch": 0.5305389221556887,
"grad_norm": 0.2550337674607903,
"learning_rate": 4.576308784383319e-05,
"loss": 0.3549,
"step": 443
},
{
"epoch": 0.5317365269461077,
"grad_norm": 0.2653989874081486,
"learning_rate": 4.574090505767525e-05,
"loss": 0.3482,
"step": 444
},
{
"epoch": 0.5329341317365269,
"grad_norm": 0.2578176104660128,
"learning_rate": 4.5718722271517306e-05,
"loss": 0.3519,
"step": 445
},
{
"epoch": 0.5341317365269461,
"grad_norm": 1.0563204803127988,
"learning_rate": 4.5696539485359364e-05,
"loss": 0.3795,
"step": 446
},
{
"epoch": 0.5353293413173653,
"grad_norm": 0.25561359302125125,
"learning_rate": 4.567435669920142e-05,
"loss": 0.3288,
"step": 447
},
{
"epoch": 0.5365269461077844,
"grad_norm": 0.25409022963627814,
"learning_rate": 4.565217391304348e-05,
"loss": 0.3265,
"step": 448
},
{
"epoch": 0.5377245508982036,
"grad_norm": 0.24350966123391107,
"learning_rate": 4.562999112688554e-05,
"loss": 0.3444,
"step": 449
},
{
"epoch": 0.5389221556886228,
"grad_norm": 0.23693107235835195,
"learning_rate": 4.56078083407276e-05,
"loss": 0.3133,
"step": 450
},
{
"epoch": 0.5401197604790419,
"grad_norm": 0.21684020904981183,
"learning_rate": 4.5585625554569656e-05,
"loss": 0.344,
"step": 451
},
{
"epoch": 0.5413173652694611,
"grad_norm": 0.2433929708051567,
"learning_rate": 4.5563442768411715e-05,
"loss": 0.3395,
"step": 452
},
{
"epoch": 0.5425149700598803,
"grad_norm": 0.218087858237466,
"learning_rate": 4.554125998225377e-05,
"loss": 0.3256,
"step": 453
},
{
"epoch": 0.5437125748502994,
"grad_norm": 0.22753073895011652,
"learning_rate": 4.551907719609583e-05,
"loss": 0.323,
"step": 454
},
{
"epoch": 0.5449101796407185,
"grad_norm": 0.21388991972347163,
"learning_rate": 4.549689440993789e-05,
"loss": 0.3543,
"step": 455
},
{
"epoch": 0.5461077844311377,
"grad_norm": 0.21222394833799066,
"learning_rate": 4.547471162377995e-05,
"loss": 0.3452,
"step": 456
},
{
"epoch": 0.5473053892215569,
"grad_norm": 0.2730977797609656,
"learning_rate": 4.545252883762201e-05,
"loss": 0.3424,
"step": 457
},
{
"epoch": 0.548502994011976,
"grad_norm": 0.22809436404644354,
"learning_rate": 4.5430346051464065e-05,
"loss": 0.3363,
"step": 458
},
{
"epoch": 0.5497005988023952,
"grad_norm": 0.23622090495237327,
"learning_rate": 4.5408163265306124e-05,
"loss": 0.3311,
"step": 459
},
{
"epoch": 0.5508982035928144,
"grad_norm": 0.2331426370855948,
"learning_rate": 4.538598047914818e-05,
"loss": 0.3298,
"step": 460
},
{
"epoch": 0.5520958083832336,
"grad_norm": 0.25539268351374816,
"learning_rate": 4.536379769299024e-05,
"loss": 0.3501,
"step": 461
},
{
"epoch": 0.5532934131736527,
"grad_norm": 0.22263552152658977,
"learning_rate": 4.5341614906832306e-05,
"loss": 0.3363,
"step": 462
},
{
"epoch": 0.5544910179640719,
"grad_norm": 0.25021260034553683,
"learning_rate": 4.531943212067436e-05,
"loss": 0.3677,
"step": 463
},
{
"epoch": 0.555688622754491,
"grad_norm": 0.2616482870587574,
"learning_rate": 4.5297249334516416e-05,
"loss": 0.3411,
"step": 464
},
{
"epoch": 0.5568862275449101,
"grad_norm": 0.2678548742748626,
"learning_rate": 4.5275066548358474e-05,
"loss": 0.3448,
"step": 465
},
{
"epoch": 0.5580838323353293,
"grad_norm": 0.3253637826842061,
"learning_rate": 4.525288376220053e-05,
"loss": 0.3382,
"step": 466
},
{
"epoch": 0.5592814371257485,
"grad_norm": 0.23817461729681572,
"learning_rate": 4.52307009760426e-05,
"loss": 0.3152,
"step": 467
},
{
"epoch": 0.5604790419161677,
"grad_norm": 0.233063155812277,
"learning_rate": 4.520851818988465e-05,
"loss": 0.3175,
"step": 468
},
{
"epoch": 0.5616766467065868,
"grad_norm": 0.2952566189034759,
"learning_rate": 4.518633540372671e-05,
"loss": 0.3561,
"step": 469
},
{
"epoch": 0.562874251497006,
"grad_norm": 0.22859174820526293,
"learning_rate": 4.5164152617568766e-05,
"loss": 0.3364,
"step": 470
},
{
"epoch": 0.5640718562874252,
"grad_norm": 0.25260874711367515,
"learning_rate": 4.514196983141083e-05,
"loss": 0.3201,
"step": 471
},
{
"epoch": 0.5652694610778443,
"grad_norm": 0.22338628766369695,
"learning_rate": 4.511978704525289e-05,
"loss": 0.3424,
"step": 472
},
{
"epoch": 0.5664670658682635,
"grad_norm": 0.30409867234715177,
"learning_rate": 4.509760425909494e-05,
"loss": 0.3699,
"step": 473
},
{
"epoch": 0.5676646706586826,
"grad_norm": 0.26420581876501714,
"learning_rate": 4.5075421472937e-05,
"loss": 0.3344,
"step": 474
},
{
"epoch": 0.5688622754491018,
"grad_norm": 0.22031005618935637,
"learning_rate": 4.505323868677906e-05,
"loss": 0.3335,
"step": 475
},
{
"epoch": 0.5700598802395209,
"grad_norm": 0.28068402703359174,
"learning_rate": 4.5031055900621124e-05,
"loss": 0.3274,
"step": 476
},
{
"epoch": 0.5712574850299401,
"grad_norm": 0.26327299125849435,
"learning_rate": 4.500887311446318e-05,
"loss": 0.3185,
"step": 477
},
{
"epoch": 0.5724550898203593,
"grad_norm": 0.24727678681567525,
"learning_rate": 4.4986690328305234e-05,
"loss": 0.3484,
"step": 478
},
{
"epoch": 0.5736526946107784,
"grad_norm": 0.2727034380294155,
"learning_rate": 4.496450754214729e-05,
"loss": 0.3591,
"step": 479
},
{
"epoch": 0.5748502994011976,
"grad_norm": 0.21959793459711452,
"learning_rate": 4.494232475598936e-05,
"loss": 0.2901,
"step": 480
},
{
"epoch": 0.5760479041916168,
"grad_norm": 0.2458425182531576,
"learning_rate": 4.4920141969831416e-05,
"loss": 0.3314,
"step": 481
},
{
"epoch": 0.577245508982036,
"grad_norm": 0.23701377269881507,
"learning_rate": 4.4897959183673474e-05,
"loss": 0.3259,
"step": 482
},
{
"epoch": 0.578443113772455,
"grad_norm": 0.25303688031323224,
"learning_rate": 4.4875776397515526e-05,
"loss": 0.3378,
"step": 483
},
{
"epoch": 0.5796407185628742,
"grad_norm": 0.2378807404846371,
"learning_rate": 4.4853593611357584e-05,
"loss": 0.3623,
"step": 484
},
{
"epoch": 0.5808383233532934,
"grad_norm": 0.3230252153191925,
"learning_rate": 4.483141082519965e-05,
"loss": 0.3423,
"step": 485
},
{
"epoch": 0.5820359281437125,
"grad_norm": 0.25697597175501635,
"learning_rate": 4.480922803904171e-05,
"loss": 0.3509,
"step": 486
},
{
"epoch": 0.5832335329341317,
"grad_norm": 0.24403221065505054,
"learning_rate": 4.4787045252883766e-05,
"loss": 0.3403,
"step": 487
},
{
"epoch": 0.5844311377245509,
"grad_norm": 0.25496002856271793,
"learning_rate": 4.476486246672582e-05,
"loss": 0.3351,
"step": 488
},
{
"epoch": 0.5856287425149701,
"grad_norm": 0.23223905962292068,
"learning_rate": 4.474267968056788e-05,
"loss": 0.348,
"step": 489
},
{
"epoch": 0.5868263473053892,
"grad_norm": 0.2714611444612145,
"learning_rate": 4.472049689440994e-05,
"loss": 0.3258,
"step": 490
},
{
"epoch": 0.5880239520958084,
"grad_norm": 0.2256121745237642,
"learning_rate": 4.4698314108252e-05,
"loss": 0.3252,
"step": 491
},
{
"epoch": 0.5892215568862276,
"grad_norm": 0.24375129548898958,
"learning_rate": 4.467613132209406e-05,
"loss": 0.3296,
"step": 492
},
{
"epoch": 0.5904191616766467,
"grad_norm": 0.2139006598926973,
"learning_rate": 4.465394853593611e-05,
"loss": 0.3314,
"step": 493
},
{
"epoch": 0.5916167664670658,
"grad_norm": 0.23464452254626222,
"learning_rate": 4.4631765749778175e-05,
"loss": 0.3043,
"step": 494
},
{
"epoch": 0.592814371257485,
"grad_norm": 0.2618939430240119,
"learning_rate": 4.4609582963620234e-05,
"loss": 0.3463,
"step": 495
},
{
"epoch": 0.5940119760479042,
"grad_norm": 0.2580903669553573,
"learning_rate": 4.458740017746229e-05,
"loss": 0.3794,
"step": 496
},
{
"epoch": 0.5952095808383233,
"grad_norm": 0.2100295587347098,
"learning_rate": 4.456521739130435e-05,
"loss": 0.3393,
"step": 497
},
{
"epoch": 0.5964071856287425,
"grad_norm": 0.24850163206332257,
"learning_rate": 4.454303460514641e-05,
"loss": 0.3516,
"step": 498
},
{
"epoch": 0.5976047904191617,
"grad_norm": 0.24803772367207197,
"learning_rate": 4.452085181898847e-05,
"loss": 0.3645,
"step": 499
},
{
"epoch": 0.5988023952095808,
"grad_norm": 0.2225633896502514,
"learning_rate": 4.4498669032830526e-05,
"loss": 0.3343,
"step": 500
},
{
"epoch": 0.6,
"grad_norm": 0.24287331190678255,
"learning_rate": 4.4476486246672584e-05,
"loss": 0.3488,
"step": 501
},
{
"epoch": 0.6011976047904192,
"grad_norm": 0.22660312073044833,
"learning_rate": 4.445430346051464e-05,
"loss": 0.3112,
"step": 502
},
{
"epoch": 0.6023952095808384,
"grad_norm": 0.2357202600054252,
"learning_rate": 4.44321206743567e-05,
"loss": 0.3222,
"step": 503
},
{
"epoch": 0.6035928143712574,
"grad_norm": 0.266085116398461,
"learning_rate": 4.440993788819876e-05,
"loss": 0.3553,
"step": 504
},
{
"epoch": 0.6047904191616766,
"grad_norm": 0.22971034102872515,
"learning_rate": 4.438775510204082e-05,
"loss": 0.3247,
"step": 505
},
{
"epoch": 0.6059880239520958,
"grad_norm": 0.2290474936284046,
"learning_rate": 4.4365572315882877e-05,
"loss": 0.345,
"step": 506
},
{
"epoch": 0.6071856287425149,
"grad_norm": 0.23759376037274024,
"learning_rate": 4.4343389529724935e-05,
"loss": 0.3451,
"step": 507
},
{
"epoch": 0.6083832335329341,
"grad_norm": 0.2666423773432962,
"learning_rate": 4.432120674356699e-05,
"loss": 0.3353,
"step": 508
},
{
"epoch": 0.6095808383233533,
"grad_norm": 0.2179172576060944,
"learning_rate": 4.429902395740905e-05,
"loss": 0.356,
"step": 509
},
{
"epoch": 0.6107784431137725,
"grad_norm": 0.2523833682458426,
"learning_rate": 4.427684117125111e-05,
"loss": 0.32,
"step": 510
},
{
"epoch": 0.6119760479041916,
"grad_norm": 0.27996986174002797,
"learning_rate": 4.425465838509317e-05,
"loss": 0.3301,
"step": 511
},
{
"epoch": 0.6131736526946108,
"grad_norm": 0.22736244378548737,
"learning_rate": 4.423247559893523e-05,
"loss": 0.3349,
"step": 512
},
{
"epoch": 0.61437125748503,
"grad_norm": 0.24939267497628415,
"learning_rate": 4.4210292812777286e-05,
"loss": 0.3425,
"step": 513
},
{
"epoch": 0.615568862275449,
"grad_norm": 0.24637538041097198,
"learning_rate": 4.4188110026619344e-05,
"loss": 0.3278,
"step": 514
},
{
"epoch": 0.6167664670658682,
"grad_norm": 0.2372861541879641,
"learning_rate": 4.41659272404614e-05,
"loss": 0.3296,
"step": 515
},
{
"epoch": 0.6179640718562874,
"grad_norm": 0.21992565542636597,
"learning_rate": 4.414374445430347e-05,
"loss": 0.3338,
"step": 516
},
{
"epoch": 0.6191616766467066,
"grad_norm": 0.22340402624831074,
"learning_rate": 4.412156166814552e-05,
"loss": 0.3412,
"step": 517
},
{
"epoch": 0.6203592814371257,
"grad_norm": 0.2346930018760123,
"learning_rate": 4.409937888198758e-05,
"loss": 0.3366,
"step": 518
},
{
"epoch": 0.6215568862275449,
"grad_norm": 0.20748493163402046,
"learning_rate": 4.4077196095829636e-05,
"loss": 0.3261,
"step": 519
},
{
"epoch": 0.6227544910179641,
"grad_norm": 0.23409192554575836,
"learning_rate": 4.4055013309671695e-05,
"loss": 0.3307,
"step": 520
},
{
"epoch": 0.6239520958083832,
"grad_norm": 0.2159903930167686,
"learning_rate": 4.403283052351376e-05,
"loss": 0.3337,
"step": 521
},
{
"epoch": 0.6251497005988024,
"grad_norm": 0.22862523196263096,
"learning_rate": 4.401064773735581e-05,
"loss": 0.3316,
"step": 522
},
{
"epoch": 0.6263473053892216,
"grad_norm": 0.2553816538699405,
"learning_rate": 4.398846495119787e-05,
"loss": 0.3597,
"step": 523
},
{
"epoch": 0.6275449101796408,
"grad_norm": 0.27584469548672336,
"learning_rate": 4.396628216503993e-05,
"loss": 0.3432,
"step": 524
},
{
"epoch": 0.6287425149700598,
"grad_norm": 0.25498576182414046,
"learning_rate": 4.3944099378881993e-05,
"loss": 0.3255,
"step": 525
},
{
"epoch": 0.629940119760479,
"grad_norm": 0.20117252253654047,
"learning_rate": 4.392191659272405e-05,
"loss": 0.3201,
"step": 526
},
{
"epoch": 0.6311377245508982,
"grad_norm": 0.24776110213160324,
"learning_rate": 4.3899733806566103e-05,
"loss": 0.3188,
"step": 527
},
{
"epoch": 0.6323353293413174,
"grad_norm": 0.2125055181220414,
"learning_rate": 4.387755102040816e-05,
"loss": 0.3222,
"step": 528
},
{
"epoch": 0.6335329341317365,
"grad_norm": 0.24148967219734127,
"learning_rate": 4.385536823425022e-05,
"loss": 0.323,
"step": 529
},
{
"epoch": 0.6347305389221557,
"grad_norm": 0.19980992977878154,
"learning_rate": 4.3833185448092286e-05,
"loss": 0.3307,
"step": 530
},
{
"epoch": 0.6359281437125749,
"grad_norm": 0.28533113860955867,
"learning_rate": 4.3811002661934344e-05,
"loss": 0.3509,
"step": 531
},
{
"epoch": 0.637125748502994,
"grad_norm": 0.21085618656348862,
"learning_rate": 4.3788819875776396e-05,
"loss": 0.338,
"step": 532
},
{
"epoch": 0.6383233532934132,
"grad_norm": 0.204845867006273,
"learning_rate": 4.3766637089618454e-05,
"loss": 0.3386,
"step": 533
},
{
"epoch": 0.6395209580838324,
"grad_norm": 0.2640004586241013,
"learning_rate": 4.374445430346052e-05,
"loss": 0.365,
"step": 534
},
{
"epoch": 0.6407185628742516,
"grad_norm": 0.20374117439420414,
"learning_rate": 4.372227151730258e-05,
"loss": 0.3227,
"step": 535
},
{
"epoch": 0.6419161676646706,
"grad_norm": 0.24614240443242358,
"learning_rate": 4.3700088731144636e-05,
"loss": 0.365,
"step": 536
},
{
"epoch": 0.6431137724550898,
"grad_norm": 0.20541468168210358,
"learning_rate": 4.3677905944986695e-05,
"loss": 0.3374,
"step": 537
},
{
"epoch": 0.644311377245509,
"grad_norm": 0.23126435452085198,
"learning_rate": 4.3655723158828746e-05,
"loss": 0.3313,
"step": 538
},
{
"epoch": 0.6455089820359281,
"grad_norm": 0.20638908057499986,
"learning_rate": 4.363354037267081e-05,
"loss": 0.3642,
"step": 539
},
{
"epoch": 0.6467065868263473,
"grad_norm": 0.2473774522741683,
"learning_rate": 4.361135758651287e-05,
"loss": 0.3292,
"step": 540
},
{
"epoch": 0.6479041916167665,
"grad_norm": 0.2186010910093557,
"learning_rate": 4.358917480035493e-05,
"loss": 0.3208,
"step": 541
},
{
"epoch": 0.6491017964071857,
"grad_norm": 0.24917764910771278,
"learning_rate": 4.356699201419699e-05,
"loss": 0.3248,
"step": 542
},
{
"epoch": 0.6502994011976048,
"grad_norm": 0.21399946744872583,
"learning_rate": 4.3544809228039045e-05,
"loss": 0.3074,
"step": 543
},
{
"epoch": 0.651497005988024,
"grad_norm": 0.243101209797286,
"learning_rate": 4.3522626441881104e-05,
"loss": 0.3353,
"step": 544
},
{
"epoch": 0.6526946107784432,
"grad_norm": 0.25664208694714563,
"learning_rate": 4.350044365572316e-05,
"loss": 0.3393,
"step": 545
},
{
"epoch": 0.6538922155688622,
"grad_norm": 0.20699983715585787,
"learning_rate": 4.347826086956522e-05,
"loss": 0.3178,
"step": 546
},
{
"epoch": 0.6550898203592814,
"grad_norm": 0.25518957392711195,
"learning_rate": 4.345607808340728e-05,
"loss": 0.3194,
"step": 547
},
{
"epoch": 0.6562874251497006,
"grad_norm": 0.26610871146066,
"learning_rate": 4.343389529724934e-05,
"loss": 0.3293,
"step": 548
},
{
"epoch": 0.6574850299401198,
"grad_norm": 0.283951537223024,
"learning_rate": 4.3411712511091396e-05,
"loss": 0.3262,
"step": 549
},
{
"epoch": 0.6586826347305389,
"grad_norm": 0.2463882509988367,
"learning_rate": 4.3389529724933454e-05,
"loss": 0.3338,
"step": 550
},
{
"epoch": 0.6598802395209581,
"grad_norm": 0.3478398931533286,
"learning_rate": 4.336734693877551e-05,
"loss": 0.3365,
"step": 551
},
{
"epoch": 0.6610778443113773,
"grad_norm": 0.22958733525843036,
"learning_rate": 4.334516415261757e-05,
"loss": 0.3202,
"step": 552
},
{
"epoch": 0.6622754491017964,
"grad_norm": 0.25143872014169116,
"learning_rate": 4.332298136645963e-05,
"loss": 0.3374,
"step": 553
},
{
"epoch": 0.6634730538922156,
"grad_norm": 0.23196040408308916,
"learning_rate": 4.330079858030169e-05,
"loss": 0.3384,
"step": 554
},
{
"epoch": 0.6646706586826348,
"grad_norm": 0.26978893695651185,
"learning_rate": 4.3278615794143746e-05,
"loss": 0.3547,
"step": 555
},
{
"epoch": 0.665868263473054,
"grad_norm": 0.21102937910760552,
"learning_rate": 4.3256433007985805e-05,
"loss": 0.3375,
"step": 556
},
{
"epoch": 0.667065868263473,
"grad_norm": 0.2608485047847889,
"learning_rate": 4.323425022182786e-05,
"loss": 0.3362,
"step": 557
},
{
"epoch": 0.6682634730538922,
"grad_norm": 0.25937536632462077,
"learning_rate": 4.321206743566992e-05,
"loss": 0.3389,
"step": 558
},
{
"epoch": 0.6694610778443114,
"grad_norm": 0.22509923819283922,
"learning_rate": 4.318988464951198e-05,
"loss": 0.3251,
"step": 559
},
{
"epoch": 0.6706586826347305,
"grad_norm": 0.28609667706976655,
"learning_rate": 4.316770186335404e-05,
"loss": 0.3252,
"step": 560
},
{
"epoch": 0.6718562874251497,
"grad_norm": 0.2202082586170876,
"learning_rate": 4.3145519077196104e-05,
"loss": 0.3231,
"step": 561
},
{
"epoch": 0.6730538922155689,
"grad_norm": 0.26022248195841946,
"learning_rate": 4.3123336291038155e-05,
"loss": 0.3461,
"step": 562
},
{
"epoch": 0.6742514970059881,
"grad_norm": 0.2385280289227432,
"learning_rate": 4.3101153504880214e-05,
"loss": 0.3556,
"step": 563
},
{
"epoch": 0.6754491017964072,
"grad_norm": 0.22086890190795375,
"learning_rate": 4.307897071872227e-05,
"loss": 0.3104,
"step": 564
},
{
"epoch": 0.6766467065868264,
"grad_norm": 0.2443205807865148,
"learning_rate": 4.305678793256433e-05,
"loss": 0.3378,
"step": 565
},
{
"epoch": 0.6778443113772455,
"grad_norm": 0.21626014442843972,
"learning_rate": 4.3034605146406396e-05,
"loss": 0.3315,
"step": 566
},
{
"epoch": 0.6790419161676646,
"grad_norm": 0.24408799004709655,
"learning_rate": 4.301242236024845e-05,
"loss": 0.3802,
"step": 567
},
{
"epoch": 0.6802395209580838,
"grad_norm": 0.21076808059348676,
"learning_rate": 4.2990239574090506e-05,
"loss": 0.3436,
"step": 568
},
{
"epoch": 0.681437125748503,
"grad_norm": 0.23168133502815944,
"learning_rate": 4.2968056787932564e-05,
"loss": 0.3697,
"step": 569
},
{
"epoch": 0.6826347305389222,
"grad_norm": 0.19670256891506313,
"learning_rate": 4.294587400177463e-05,
"loss": 0.3386,
"step": 570
},
{
"epoch": 0.6838323353293413,
"grad_norm": 0.20742619429201456,
"learning_rate": 4.292369121561669e-05,
"loss": 0.326,
"step": 571
},
{
"epoch": 0.6850299401197605,
"grad_norm": 0.221208699240613,
"learning_rate": 4.290150842945874e-05,
"loss": 0.3162,
"step": 572
},
{
"epoch": 0.6862275449101797,
"grad_norm": 0.21961706273577986,
"learning_rate": 4.28793256433008e-05,
"loss": 0.3149,
"step": 573
},
{
"epoch": 0.6874251497005988,
"grad_norm": 0.35663464815592244,
"learning_rate": 4.2857142857142856e-05,
"loss": 0.3303,
"step": 574
},
{
"epoch": 0.688622754491018,
"grad_norm": 0.2533854246618938,
"learning_rate": 4.283496007098492e-05,
"loss": 0.3773,
"step": 575
},
{
"epoch": 0.6898203592814371,
"grad_norm": 0.20783724943599116,
"learning_rate": 4.281277728482698e-05,
"loss": 0.319,
"step": 576
},
{
"epoch": 0.6910179640718563,
"grad_norm": 0.2736131041776681,
"learning_rate": 4.279059449866903e-05,
"loss": 0.3707,
"step": 577
},
{
"epoch": 0.6922155688622754,
"grad_norm": 0.194324059401936,
"learning_rate": 4.276841171251109e-05,
"loss": 0.3114,
"step": 578
},
{
"epoch": 0.6934131736526946,
"grad_norm": 0.22951528794361756,
"learning_rate": 4.2746228926353155e-05,
"loss": 0.3258,
"step": 579
},
{
"epoch": 0.6946107784431138,
"grad_norm": 0.20891824110848514,
"learning_rate": 4.2724046140195214e-05,
"loss": 0.3437,
"step": 580
},
{
"epoch": 0.6958083832335329,
"grad_norm": 0.20424717777428297,
"learning_rate": 4.270186335403727e-05,
"loss": 0.3419,
"step": 581
},
{
"epoch": 0.6970059880239521,
"grad_norm": 0.21612387168259004,
"learning_rate": 4.2679680567879324e-05,
"loss": 0.3451,
"step": 582
},
{
"epoch": 0.6982035928143713,
"grad_norm": 0.2081553477443472,
"learning_rate": 4.265749778172138e-05,
"loss": 0.3227,
"step": 583
},
{
"epoch": 0.6994011976047905,
"grad_norm": 0.19685085875699895,
"learning_rate": 4.263531499556345e-05,
"loss": 0.3131,
"step": 584
},
{
"epoch": 0.7005988023952096,
"grad_norm": 0.2193770494232225,
"learning_rate": 4.2613132209405506e-05,
"loss": 0.3388,
"step": 585
},
{
"epoch": 0.7017964071856287,
"grad_norm": 0.1796008680607434,
"learning_rate": 4.2590949423247564e-05,
"loss": 0.2988,
"step": 586
},
{
"epoch": 0.7029940119760479,
"grad_norm": 0.19984893616602253,
"learning_rate": 4.2568766637089616e-05,
"loss": 0.3182,
"step": 587
},
{
"epoch": 0.704191616766467,
"grad_norm": 0.22684076824628693,
"learning_rate": 4.254658385093168e-05,
"loss": 0.3237,
"step": 588
},
{
"epoch": 0.7053892215568862,
"grad_norm": 0.21157844194898806,
"learning_rate": 4.252440106477374e-05,
"loss": 0.3327,
"step": 589
},
{
"epoch": 0.7065868263473054,
"grad_norm": 0.2554596364569243,
"learning_rate": 4.25022182786158e-05,
"loss": 0.3507,
"step": 590
},
{
"epoch": 0.7077844311377246,
"grad_norm": 0.2573041538872736,
"learning_rate": 4.2480035492457856e-05,
"loss": 0.3418,
"step": 591
},
{
"epoch": 0.7089820359281437,
"grad_norm": 0.1974080948906318,
"learning_rate": 4.245785270629991e-05,
"loss": 0.3209,
"step": 592
},
{
"epoch": 0.7101796407185629,
"grad_norm": 0.23537884604474346,
"learning_rate": 4.243566992014197e-05,
"loss": 0.3399,
"step": 593
},
{
"epoch": 0.7113772455089821,
"grad_norm": 0.22325701843862378,
"learning_rate": 4.241348713398403e-05,
"loss": 0.3156,
"step": 594
},
{
"epoch": 0.7125748502994012,
"grad_norm": 0.22011928969532712,
"learning_rate": 4.239130434782609e-05,
"loss": 0.3329,
"step": 595
},
{
"epoch": 0.7137724550898203,
"grad_norm": 0.2455814437955591,
"learning_rate": 4.236912156166815e-05,
"loss": 0.3399,
"step": 596
},
{
"epoch": 0.7149700598802395,
"grad_norm": 0.22800675050448937,
"learning_rate": 4.234693877551021e-05,
"loss": 0.3503,
"step": 597
},
{
"epoch": 0.7161676646706587,
"grad_norm": 0.20057245428457157,
"learning_rate": 4.2324755989352265e-05,
"loss": 0.319,
"step": 598
},
{
"epoch": 0.7173652694610778,
"grad_norm": 0.23267261080128251,
"learning_rate": 4.2302573203194324e-05,
"loss": 0.3469,
"step": 599
},
{
"epoch": 0.718562874251497,
"grad_norm": 0.20345538543379318,
"learning_rate": 4.228039041703638e-05,
"loss": 0.3468,
"step": 600
},
{
"epoch": 0.7197604790419162,
"grad_norm": 0.22578355855404286,
"learning_rate": 4.225820763087844e-05,
"loss": 0.3249,
"step": 601
},
{
"epoch": 0.7209580838323353,
"grad_norm": 0.19193558949230294,
"learning_rate": 4.22360248447205e-05,
"loss": 0.3138,
"step": 602
},
{
"epoch": 0.7221556886227545,
"grad_norm": 0.20566532900621798,
"learning_rate": 4.221384205856256e-05,
"loss": 0.3341,
"step": 603
},
{
"epoch": 0.7233532934131737,
"grad_norm": 0.21450623361890125,
"learning_rate": 4.2191659272404616e-05,
"loss": 0.3124,
"step": 604
},
{
"epoch": 0.7245508982035929,
"grad_norm": 0.20559552731333963,
"learning_rate": 4.2169476486246674e-05,
"loss": 0.3345,
"step": 605
},
{
"epoch": 0.725748502994012,
"grad_norm": 0.2072064565605423,
"learning_rate": 4.214729370008873e-05,
"loss": 0.3316,
"step": 606
},
{
"epoch": 0.7269461077844311,
"grad_norm": 0.19887162886967824,
"learning_rate": 4.212511091393079e-05,
"loss": 0.3137,
"step": 607
},
{
"epoch": 0.7281437125748503,
"grad_norm": 0.20110251344667018,
"learning_rate": 4.210292812777285e-05,
"loss": 0.3063,
"step": 608
},
{
"epoch": 0.7293413173652694,
"grad_norm": 0.22244552995953615,
"learning_rate": 4.208074534161491e-05,
"loss": 0.3264,
"step": 609
},
{
"epoch": 0.7305389221556886,
"grad_norm": 0.18896942665463481,
"learning_rate": 4.2058562555456966e-05,
"loss": 0.3351,
"step": 610
},
{
"epoch": 0.7317365269461078,
"grad_norm": 0.2322456259939173,
"learning_rate": 4.2036379769299025e-05,
"loss": 0.3278,
"step": 611
},
{
"epoch": 0.732934131736527,
"grad_norm": 0.23262436919856785,
"learning_rate": 4.201419698314108e-05,
"loss": 0.34,
"step": 612
},
{
"epoch": 0.7341317365269461,
"grad_norm": 0.18194382535305154,
"learning_rate": 4.199201419698314e-05,
"loss": 0.2937,
"step": 613
},
{
"epoch": 0.7353293413173653,
"grad_norm": 0.2737364933513732,
"learning_rate": 4.19698314108252e-05,
"loss": 0.3257,
"step": 614
},
{
"epoch": 0.7365269461077845,
"grad_norm": 0.20227665850011398,
"learning_rate": 4.1947648624667265e-05,
"loss": 0.3491,
"step": 615
},
{
"epoch": 0.7377245508982035,
"grad_norm": 0.21452510873375477,
"learning_rate": 4.192546583850932e-05,
"loss": 0.3276,
"step": 616
},
{
"epoch": 0.7389221556886227,
"grad_norm": 0.28063495208393857,
"learning_rate": 4.1903283052351375e-05,
"loss": 0.3217,
"step": 617
},
{
"epoch": 0.7401197604790419,
"grad_norm": 0.22439687513256168,
"learning_rate": 4.1881100266193434e-05,
"loss": 0.3153,
"step": 618
},
{
"epoch": 0.7413173652694611,
"grad_norm": 0.24341152004160369,
"learning_rate": 4.185891748003549e-05,
"loss": 0.3292,
"step": 619
},
{
"epoch": 0.7425149700598802,
"grad_norm": 0.22061446857154574,
"learning_rate": 4.183673469387756e-05,
"loss": 0.3448,
"step": 620
},
{
"epoch": 0.7437125748502994,
"grad_norm": 0.2112791724886851,
"learning_rate": 4.181455190771961e-05,
"loss": 0.327,
"step": 621
},
{
"epoch": 0.7449101796407186,
"grad_norm": 0.2324481483091018,
"learning_rate": 4.179236912156167e-05,
"loss": 0.3216,
"step": 622
},
{
"epoch": 0.7461077844311377,
"grad_norm": 0.37271559184155306,
"learning_rate": 4.1770186335403726e-05,
"loss": 0.3862,
"step": 623
},
{
"epoch": 0.7473053892215569,
"grad_norm": 0.23373202599668294,
"learning_rate": 4.174800354924579e-05,
"loss": 0.3061,
"step": 624
},
{
"epoch": 0.7485029940119761,
"grad_norm": 0.18616396444869868,
"learning_rate": 4.172582076308785e-05,
"loss": 0.3026,
"step": 625
},
{
"epoch": 0.7497005988023953,
"grad_norm": 0.21685818931820627,
"learning_rate": 4.17036379769299e-05,
"loss": 0.3222,
"step": 626
},
{
"epoch": 0.7508982035928143,
"grad_norm": 0.25646656067223034,
"learning_rate": 4.168145519077196e-05,
"loss": 0.3594,
"step": 627
},
{
"epoch": 0.7520958083832335,
"grad_norm": 0.198612127210094,
"learning_rate": 4.165927240461402e-05,
"loss": 0.2989,
"step": 628
},
{
"epoch": 0.7532934131736527,
"grad_norm": 0.2424527500822902,
"learning_rate": 4.163708961845608e-05,
"loss": 0.2957,
"step": 629
},
{
"epoch": 0.7544910179640718,
"grad_norm": 0.21250485945401584,
"learning_rate": 4.161490683229814e-05,
"loss": 0.332,
"step": 630
},
{
"epoch": 0.755688622754491,
"grad_norm": 0.2415334596470528,
"learning_rate": 4.15927240461402e-05,
"loss": 0.3531,
"step": 631
},
{
"epoch": 0.7568862275449102,
"grad_norm": 0.22299551469521908,
"learning_rate": 4.157054125998225e-05,
"loss": 0.3168,
"step": 632
},
{
"epoch": 0.7580838323353294,
"grad_norm": 0.2601066631878153,
"learning_rate": 4.154835847382432e-05,
"loss": 0.3228,
"step": 633
},
{
"epoch": 0.7592814371257485,
"grad_norm": 0.2443945500099475,
"learning_rate": 4.1526175687666375e-05,
"loss": 0.3708,
"step": 634
},
{
"epoch": 0.7604790419161677,
"grad_norm": 0.2127080502171392,
"learning_rate": 4.1503992901508434e-05,
"loss": 0.2992,
"step": 635
},
{
"epoch": 0.7616766467065869,
"grad_norm": 0.19902727069871762,
"learning_rate": 4.148181011535049e-05,
"loss": 0.3078,
"step": 636
},
{
"epoch": 0.7628742514970059,
"grad_norm": 0.2087870483260939,
"learning_rate": 4.1459627329192544e-05,
"loss": 0.3617,
"step": 637
},
{
"epoch": 0.7640718562874251,
"grad_norm": 0.22049022235887872,
"learning_rate": 4.143744454303461e-05,
"loss": 0.3254,
"step": 638
},
{
"epoch": 0.7652694610778443,
"grad_norm": 0.18978964984634428,
"learning_rate": 4.141526175687667e-05,
"loss": 0.2958,
"step": 639
},
{
"epoch": 0.7664670658682635,
"grad_norm": 0.19650629350234033,
"learning_rate": 4.1393078970718726e-05,
"loss": 0.3162,
"step": 640
},
{
"epoch": 0.7676646706586826,
"grad_norm": 0.1975155827915533,
"learning_rate": 4.1370896184560784e-05,
"loss": 0.3301,
"step": 641
},
{
"epoch": 0.7688622754491018,
"grad_norm": 0.2423523112718179,
"learning_rate": 4.134871339840284e-05,
"loss": 0.3343,
"step": 642
},
{
"epoch": 0.770059880239521,
"grad_norm": 0.18252317638152807,
"learning_rate": 4.13265306122449e-05,
"loss": 0.3095,
"step": 643
},
{
"epoch": 0.7712574850299401,
"grad_norm": 0.25841087738473745,
"learning_rate": 4.130434782608696e-05,
"loss": 0.3271,
"step": 644
},
{
"epoch": 0.7724550898203593,
"grad_norm": 0.20464325769534156,
"learning_rate": 4.128216503992902e-05,
"loss": 0.3567,
"step": 645
},
{
"epoch": 0.7736526946107785,
"grad_norm": 0.2166272662030261,
"learning_rate": 4.1259982253771077e-05,
"loss": 0.3459,
"step": 646
},
{
"epoch": 0.7748502994011977,
"grad_norm": 0.22062230311157072,
"learning_rate": 4.1237799467613135e-05,
"loss": 0.2989,
"step": 647
},
{
"epoch": 0.7760479041916167,
"grad_norm": 0.19030213598661094,
"learning_rate": 4.121561668145519e-05,
"loss": 0.331,
"step": 648
},
{
"epoch": 0.7772455089820359,
"grad_norm": 0.20562604306291613,
"learning_rate": 4.119343389529725e-05,
"loss": 0.3229,
"step": 649
},
{
"epoch": 0.7784431137724551,
"grad_norm": 0.21441292538027745,
"learning_rate": 4.117125110913931e-05,
"loss": 0.3233,
"step": 650
},
{
"epoch": 0.7796407185628742,
"grad_norm": 0.26501852528815206,
"learning_rate": 4.114906832298137e-05,
"loss": 0.3164,
"step": 651
},
{
"epoch": 0.7808383233532934,
"grad_norm": 0.2303184230932821,
"learning_rate": 4.112688553682343e-05,
"loss": 0.3197,
"step": 652
},
{
"epoch": 0.7820359281437126,
"grad_norm": 0.19565934380673397,
"learning_rate": 4.1104702750665486e-05,
"loss": 0.2929,
"step": 653
},
{
"epoch": 0.7832335329341318,
"grad_norm": 0.205498104384855,
"learning_rate": 4.1082519964507544e-05,
"loss": 0.3332,
"step": 654
},
{
"epoch": 0.7844311377245509,
"grad_norm": 0.24681145084744188,
"learning_rate": 4.10603371783496e-05,
"loss": 0.3,
"step": 655
},
{
"epoch": 0.7856287425149701,
"grad_norm": 0.20404457732134149,
"learning_rate": 4.103815439219166e-05,
"loss": 0.3258,
"step": 656
},
{
"epoch": 0.7868263473053893,
"grad_norm": 0.19516542675141083,
"learning_rate": 4.101597160603372e-05,
"loss": 0.3306,
"step": 657
},
{
"epoch": 0.7880239520958083,
"grad_norm": 0.23492752650042212,
"learning_rate": 4.099378881987578e-05,
"loss": 0.3238,
"step": 658
},
{
"epoch": 0.7892215568862275,
"grad_norm": 0.3139940190214274,
"learning_rate": 4.0971606033717836e-05,
"loss": 0.321,
"step": 659
},
{
"epoch": 0.7904191616766467,
"grad_norm": 0.19805725596625315,
"learning_rate": 4.09494232475599e-05,
"loss": 0.3027,
"step": 660
},
{
"epoch": 0.7916167664670659,
"grad_norm": 0.20200858519667628,
"learning_rate": 4.092724046140195e-05,
"loss": 0.3362,
"step": 661
},
{
"epoch": 0.792814371257485,
"grad_norm": 0.22970246489640006,
"learning_rate": 4.090505767524401e-05,
"loss": 0.3415,
"step": 662
},
{
"epoch": 0.7940119760479042,
"grad_norm": 0.18808333704229785,
"learning_rate": 4.088287488908607e-05,
"loss": 0.2969,
"step": 663
},
{
"epoch": 0.7952095808383234,
"grad_norm": 0.22549251702095033,
"learning_rate": 4.086069210292813e-05,
"loss": 0.3285,
"step": 664
},
{
"epoch": 0.7964071856287425,
"grad_norm": 0.23662619311526203,
"learning_rate": 4.0838509316770193e-05,
"loss": 0.3414,
"step": 665
},
{
"epoch": 0.7976047904191617,
"grad_norm": 0.22359304440225106,
"learning_rate": 4.0816326530612245e-05,
"loss": 0.3532,
"step": 666
},
{
"epoch": 0.7988023952095809,
"grad_norm": 0.22009599302517538,
"learning_rate": 4.0794143744454303e-05,
"loss": 0.3016,
"step": 667
},
{
"epoch": 0.8,
"grad_norm": 0.22312912968070886,
"learning_rate": 4.077196095829636e-05,
"loss": 0.3553,
"step": 668
},
{
"epoch": 0.8011976047904191,
"grad_norm": 0.2366726362777618,
"learning_rate": 4.074977817213843e-05,
"loss": 0.335,
"step": 669
},
{
"epoch": 0.8023952095808383,
"grad_norm": 0.2172982961869319,
"learning_rate": 4.0727595385980486e-05,
"loss": 0.3025,
"step": 670
},
{
"epoch": 0.8035928143712575,
"grad_norm": 0.22915658771346178,
"learning_rate": 4.070541259982254e-05,
"loss": 0.3326,
"step": 671
},
{
"epoch": 0.8047904191616766,
"grad_norm": 0.21389896290816862,
"learning_rate": 4.0683229813664596e-05,
"loss": 0.3261,
"step": 672
},
{
"epoch": 0.8059880239520958,
"grad_norm": 0.2132524607595645,
"learning_rate": 4.0661047027506654e-05,
"loss": 0.3149,
"step": 673
},
{
"epoch": 0.807185628742515,
"grad_norm": 0.1882296622959376,
"learning_rate": 4.063886424134872e-05,
"loss": 0.309,
"step": 674
},
{
"epoch": 0.8083832335329342,
"grad_norm": 0.20445763483789553,
"learning_rate": 4.061668145519078e-05,
"loss": 0.3355,
"step": 675
},
{
"epoch": 0.8095808383233533,
"grad_norm": 0.21889812819355134,
"learning_rate": 4.059449866903283e-05,
"loss": 0.3287,
"step": 676
},
{
"epoch": 0.8107784431137725,
"grad_norm": 0.21785558073500952,
"learning_rate": 4.057231588287489e-05,
"loss": 0.3559,
"step": 677
},
{
"epoch": 0.8119760479041916,
"grad_norm": 0.2064168434324772,
"learning_rate": 4.0550133096716946e-05,
"loss": 0.3223,
"step": 678
},
{
"epoch": 0.8131736526946107,
"grad_norm": 0.19272137371442055,
"learning_rate": 4.052795031055901e-05,
"loss": 0.3127,
"step": 679
},
{
"epoch": 0.8143712574850299,
"grad_norm": 0.21568042658358963,
"learning_rate": 4.050576752440107e-05,
"loss": 0.3131,
"step": 680
},
{
"epoch": 0.8155688622754491,
"grad_norm": 0.21455256274579937,
"learning_rate": 4.048358473824312e-05,
"loss": 0.3385,
"step": 681
},
{
"epoch": 0.8167664670658683,
"grad_norm": 0.23821395948031648,
"learning_rate": 4.046140195208518e-05,
"loss": 0.323,
"step": 682
},
{
"epoch": 0.8179640718562874,
"grad_norm": 0.24769503420531214,
"learning_rate": 4.0439219165927245e-05,
"loss": 0.333,
"step": 683
},
{
"epoch": 0.8191616766467066,
"grad_norm": 0.19316737326967143,
"learning_rate": 4.0417036379769304e-05,
"loss": 0.3022,
"step": 684
},
{
"epoch": 0.8203592814371258,
"grad_norm": 0.22765825209635954,
"learning_rate": 4.039485359361136e-05,
"loss": 0.3119,
"step": 685
},
{
"epoch": 0.8215568862275449,
"grad_norm": 0.23620548119830403,
"learning_rate": 4.0372670807453414e-05,
"loss": 0.3179,
"step": 686
},
{
"epoch": 0.822754491017964,
"grad_norm": 0.18330938842636582,
"learning_rate": 4.035048802129547e-05,
"loss": 0.2971,
"step": 687
},
{
"epoch": 0.8239520958083832,
"grad_norm": 0.21822789553498437,
"learning_rate": 4.032830523513754e-05,
"loss": 0.3355,
"step": 688
},
{
"epoch": 0.8251497005988024,
"grad_norm": 0.21116628697462667,
"learning_rate": 4.0306122448979596e-05,
"loss": 0.3248,
"step": 689
},
{
"epoch": 0.8263473053892215,
"grad_norm": 0.19431614838971056,
"learning_rate": 4.0283939662821654e-05,
"loss": 0.3103,
"step": 690
},
{
"epoch": 0.8275449101796407,
"grad_norm": 0.18015623473813816,
"learning_rate": 4.0261756876663706e-05,
"loss": 0.303,
"step": 691
},
{
"epoch": 0.8287425149700599,
"grad_norm": 0.21670963190042805,
"learning_rate": 4.023957409050577e-05,
"loss": 0.3207,
"step": 692
},
{
"epoch": 0.829940119760479,
"grad_norm": 0.2431566624426393,
"learning_rate": 4.021739130434783e-05,
"loss": 0.3427,
"step": 693
},
{
"epoch": 0.8311377245508982,
"grad_norm": 0.22693016630622811,
"learning_rate": 4.019520851818989e-05,
"loss": 0.3437,
"step": 694
},
{
"epoch": 0.8323353293413174,
"grad_norm": 0.21610280694385497,
"learning_rate": 4.0173025732031946e-05,
"loss": 0.3345,
"step": 695
},
{
"epoch": 0.8335329341317366,
"grad_norm": 0.2524253370820126,
"learning_rate": 4.0150842945874e-05,
"loss": 0.3397,
"step": 696
},
{
"epoch": 0.8347305389221557,
"grad_norm": 0.2458551734609897,
"learning_rate": 4.012866015971606e-05,
"loss": 0.3347,
"step": 697
},
{
"epoch": 0.8359281437125748,
"grad_norm": 0.21395721294865025,
"learning_rate": 4.010647737355812e-05,
"loss": 0.3207,
"step": 698
},
{
"epoch": 0.837125748502994,
"grad_norm": 0.25953656447952433,
"learning_rate": 4.008429458740018e-05,
"loss": 0.3204,
"step": 699
},
{
"epoch": 0.8383233532934131,
"grad_norm": 0.2156935980887668,
"learning_rate": 4.006211180124224e-05,
"loss": 0.3018,
"step": 700
},
{
"epoch": 0.8395209580838323,
"grad_norm": 0.18511901365380456,
"learning_rate": 4.00399290150843e-05,
"loss": 0.3165,
"step": 701
},
{
"epoch": 0.8407185628742515,
"grad_norm": 0.2017028529504323,
"learning_rate": 4.0017746228926355e-05,
"loss": 0.2828,
"step": 702
},
{
"epoch": 0.8419161676646707,
"grad_norm": 0.236226217792665,
"learning_rate": 3.9995563442768414e-05,
"loss": 0.3187,
"step": 703
},
{
"epoch": 0.8431137724550898,
"grad_norm": 0.2117862092429732,
"learning_rate": 3.997338065661047e-05,
"loss": 0.314,
"step": 704
},
{
"epoch": 0.844311377245509,
"grad_norm": 0.2113385156449893,
"learning_rate": 3.995119787045253e-05,
"loss": 0.3126,
"step": 705
},
{
"epoch": 0.8455089820359282,
"grad_norm": 0.19116262398854852,
"learning_rate": 3.992901508429459e-05,
"loss": 0.3351,
"step": 706
},
{
"epoch": 0.8467065868263473,
"grad_norm": 0.21257687464765374,
"learning_rate": 3.990683229813665e-05,
"loss": 0.3326,
"step": 707
},
{
"epoch": 0.8479041916167664,
"grad_norm": 0.2312232888781925,
"learning_rate": 3.9884649511978706e-05,
"loss": 0.3298,
"step": 708
},
{
"epoch": 0.8491017964071856,
"grad_norm": 0.19348336497553445,
"learning_rate": 3.9862466725820764e-05,
"loss": 0.3174,
"step": 709
},
{
"epoch": 0.8502994011976048,
"grad_norm": 0.22101467985624962,
"learning_rate": 3.984028393966282e-05,
"loss": 0.3067,
"step": 710
},
{
"epoch": 0.8514970059880239,
"grad_norm": 0.23328051081947626,
"learning_rate": 3.981810115350488e-05,
"loss": 0.3544,
"step": 711
},
{
"epoch": 0.8526946107784431,
"grad_norm": 0.21512110973547308,
"learning_rate": 3.979591836734694e-05,
"loss": 0.3346,
"step": 712
},
{
"epoch": 0.8538922155688623,
"grad_norm": 0.24327687642524676,
"learning_rate": 3.9773735581189e-05,
"loss": 0.3374,
"step": 713
},
{
"epoch": 0.8550898203592814,
"grad_norm": 0.2083897698714801,
"learning_rate": 3.9751552795031056e-05,
"loss": 0.3348,
"step": 714
},
{
"epoch": 0.8562874251497006,
"grad_norm": 0.21492021699520705,
"learning_rate": 3.9729370008873115e-05,
"loss": 0.3268,
"step": 715
},
{
"epoch": 0.8574850299401198,
"grad_norm": 0.16826051020613395,
"learning_rate": 3.970718722271517e-05,
"loss": 0.3031,
"step": 716
},
{
"epoch": 0.858682634730539,
"grad_norm": 0.21495624592628354,
"learning_rate": 3.968500443655723e-05,
"loss": 0.3214,
"step": 717
},
{
"epoch": 0.859880239520958,
"grad_norm": 0.1810770518488255,
"learning_rate": 3.966282165039929e-05,
"loss": 0.3205,
"step": 718
},
{
"epoch": 0.8610778443113772,
"grad_norm": 0.20819741430217228,
"learning_rate": 3.9640638864241355e-05,
"loss": 0.3312,
"step": 719
},
{
"epoch": 0.8622754491017964,
"grad_norm": 0.20048930481097385,
"learning_rate": 3.961845607808341e-05,
"loss": 0.3341,
"step": 720
},
{
"epoch": 0.8634730538922155,
"grad_norm": 0.19820582793430785,
"learning_rate": 3.9596273291925465e-05,
"loss": 0.3013,
"step": 721
},
{
"epoch": 0.8646706586826347,
"grad_norm": 0.22160613963381282,
"learning_rate": 3.9574090505767524e-05,
"loss": 0.3307,
"step": 722
},
{
"epoch": 0.8658682634730539,
"grad_norm": 0.21409628532897212,
"learning_rate": 3.955190771960958e-05,
"loss": 0.3175,
"step": 723
},
{
"epoch": 0.8670658682634731,
"grad_norm": 0.2299507823167447,
"learning_rate": 3.952972493345165e-05,
"loss": 0.3166,
"step": 724
},
{
"epoch": 0.8682634730538922,
"grad_norm": 0.1906898018103573,
"learning_rate": 3.9507542147293706e-05,
"loss": 0.3178,
"step": 725
},
{
"epoch": 0.8694610778443114,
"grad_norm": 0.2137091234317929,
"learning_rate": 3.948535936113576e-05,
"loss": 0.3232,
"step": 726
},
{
"epoch": 0.8706586826347306,
"grad_norm": 0.21569790647183099,
"learning_rate": 3.9463176574977816e-05,
"loss": 0.3278,
"step": 727
},
{
"epoch": 0.8718562874251496,
"grad_norm": 0.18698247558682404,
"learning_rate": 3.944099378881988e-05,
"loss": 0.3086,
"step": 728
},
{
"epoch": 0.8730538922155688,
"grad_norm": 0.221552074642151,
"learning_rate": 3.941881100266194e-05,
"loss": 0.3332,
"step": 729
},
{
"epoch": 0.874251497005988,
"grad_norm": 0.21184261488279812,
"learning_rate": 3.9396628216504e-05,
"loss": 0.3136,
"step": 730
},
{
"epoch": 0.8754491017964072,
"grad_norm": 0.20781449170779723,
"learning_rate": 3.937444543034605e-05,
"loss": 0.355,
"step": 731
},
{
"epoch": 0.8766467065868263,
"grad_norm": 0.19877510914464083,
"learning_rate": 3.935226264418811e-05,
"loss": 0.3028,
"step": 732
},
{
"epoch": 0.8778443113772455,
"grad_norm": 0.1887745730520327,
"learning_rate": 3.933007985803017e-05,
"loss": 0.3105,
"step": 733
},
{
"epoch": 0.8790419161676647,
"grad_norm": 0.18778770244268467,
"learning_rate": 3.930789707187223e-05,
"loss": 0.2869,
"step": 734
},
{
"epoch": 0.8802395209580839,
"grad_norm": 0.2055164609491325,
"learning_rate": 3.928571428571429e-05,
"loss": 0.3119,
"step": 735
},
{
"epoch": 0.881437125748503,
"grad_norm": 0.20131331685425083,
"learning_rate": 3.926353149955634e-05,
"loss": 0.3287,
"step": 736
},
{
"epoch": 0.8826347305389222,
"grad_norm": 0.20356853871425862,
"learning_rate": 3.924134871339841e-05,
"loss": 0.3288,
"step": 737
},
{
"epoch": 0.8838323353293414,
"grad_norm": 0.19683037641329584,
"learning_rate": 3.9219165927240465e-05,
"loss": 0.3249,
"step": 738
},
{
"epoch": 0.8850299401197604,
"grad_norm": 0.2103871555601633,
"learning_rate": 3.9196983141082524e-05,
"loss": 0.3301,
"step": 739
},
{
"epoch": 0.8862275449101796,
"grad_norm": 0.2067974619962334,
"learning_rate": 3.917480035492458e-05,
"loss": 0.3274,
"step": 740
},
{
"epoch": 0.8874251497005988,
"grad_norm": 0.20048941994982802,
"learning_rate": 3.9152617568766634e-05,
"loss": 0.3343,
"step": 741
},
{
"epoch": 0.888622754491018,
"grad_norm": 0.20115810357447014,
"learning_rate": 3.91304347826087e-05,
"loss": 0.3225,
"step": 742
},
{
"epoch": 0.8898203592814371,
"grad_norm": 0.2104761120416256,
"learning_rate": 3.910825199645076e-05,
"loss": 0.3127,
"step": 743
},
{
"epoch": 0.8910179640718563,
"grad_norm": 0.19708847371398108,
"learning_rate": 3.9086069210292816e-05,
"loss": 0.328,
"step": 744
},
{
"epoch": 0.8922155688622755,
"grad_norm": 0.20852510784967215,
"learning_rate": 3.9063886424134874e-05,
"loss": 0.3475,
"step": 745
},
{
"epoch": 0.8934131736526946,
"grad_norm": 0.19058105726051283,
"learning_rate": 3.904170363797693e-05,
"loss": 0.329,
"step": 746
},
{
"epoch": 0.8946107784431138,
"grad_norm": 0.1821979848377328,
"learning_rate": 3.901952085181899e-05,
"loss": 0.3221,
"step": 747
},
{
"epoch": 0.895808383233533,
"grad_norm": 0.20028010745766828,
"learning_rate": 3.899733806566105e-05,
"loss": 0.3528,
"step": 748
},
{
"epoch": 0.8970059880239521,
"grad_norm": 0.18736690501089687,
"learning_rate": 3.897515527950311e-05,
"loss": 0.3063,
"step": 749
},
{
"epoch": 0.8982035928143712,
"grad_norm": 0.1787621024545975,
"learning_rate": 3.8952972493345166e-05,
"loss": 0.2992,
"step": 750
},
{
"epoch": 0.8994011976047904,
"grad_norm": 0.2001726696500066,
"learning_rate": 3.8930789707187225e-05,
"loss": 0.316,
"step": 751
},
{
"epoch": 0.9005988023952096,
"grad_norm": 0.18940523143673244,
"learning_rate": 3.890860692102928e-05,
"loss": 0.3112,
"step": 752
},
{
"epoch": 0.9017964071856287,
"grad_norm": 0.19392552860425666,
"learning_rate": 3.888642413487134e-05,
"loss": 0.3227,
"step": 753
},
{
"epoch": 0.9029940119760479,
"grad_norm": 0.2131859634774514,
"learning_rate": 3.88642413487134e-05,
"loss": 0.3244,
"step": 754
},
{
"epoch": 0.9041916167664671,
"grad_norm": 0.20485560152672597,
"learning_rate": 3.884205856255546e-05,
"loss": 0.3154,
"step": 755
},
{
"epoch": 0.9053892215568863,
"grad_norm": 0.23566449390914718,
"learning_rate": 3.881987577639752e-05,
"loss": 0.3539,
"step": 756
},
{
"epoch": 0.9065868263473054,
"grad_norm": 0.22125932473644458,
"learning_rate": 3.8797692990239575e-05,
"loss": 0.3024,
"step": 757
},
{
"epoch": 0.9077844311377246,
"grad_norm": 0.25136450245744646,
"learning_rate": 3.8775510204081634e-05,
"loss": 0.3268,
"step": 758
},
{
"epoch": 0.9089820359281438,
"grad_norm": 0.19137207130444941,
"learning_rate": 3.875332741792369e-05,
"loss": 0.3015,
"step": 759
},
{
"epoch": 0.9101796407185628,
"grad_norm": 0.26010045619074273,
"learning_rate": 3.873114463176575e-05,
"loss": 0.3403,
"step": 760
},
{
"epoch": 0.911377245508982,
"grad_norm": 0.2449273538668273,
"learning_rate": 3.870896184560781e-05,
"loss": 0.3172,
"step": 761
},
{
"epoch": 0.9125748502994012,
"grad_norm": 0.19809929461866815,
"learning_rate": 3.868677905944987e-05,
"loss": 0.3146,
"step": 762
},
{
"epoch": 0.9137724550898204,
"grad_norm": 0.24865137608566187,
"learning_rate": 3.8664596273291926e-05,
"loss": 0.2929,
"step": 763
},
{
"epoch": 0.9149700598802395,
"grad_norm": 0.2155765759513919,
"learning_rate": 3.864241348713399e-05,
"loss": 0.3043,
"step": 764
},
{
"epoch": 0.9161676646706587,
"grad_norm": 0.21030745501198797,
"learning_rate": 3.862023070097604e-05,
"loss": 0.3079,
"step": 765
},
{
"epoch": 0.9173652694610779,
"grad_norm": 0.24004124089136486,
"learning_rate": 3.85980479148181e-05,
"loss": 0.3428,
"step": 766
},
{
"epoch": 0.918562874251497,
"grad_norm": 0.19360569898301122,
"learning_rate": 3.857586512866016e-05,
"loss": 0.3145,
"step": 767
},
{
"epoch": 0.9197604790419162,
"grad_norm": 0.25199443824198753,
"learning_rate": 3.855368234250222e-05,
"loss": 0.3203,
"step": 768
},
{
"epoch": 0.9209580838323354,
"grad_norm": 0.198771411020094,
"learning_rate": 3.853149955634428e-05,
"loss": 0.321,
"step": 769
},
{
"epoch": 0.9221556886227545,
"grad_norm": 0.2302885974105059,
"learning_rate": 3.8509316770186335e-05,
"loss": 0.3165,
"step": 770
},
{
"epoch": 0.9233532934131736,
"grad_norm": 0.18934156304555894,
"learning_rate": 3.848713398402839e-05,
"loss": 0.3027,
"step": 771
},
{
"epoch": 0.9245508982035928,
"grad_norm": 0.18283522520813716,
"learning_rate": 3.846495119787045e-05,
"loss": 0.3075,
"step": 772
},
{
"epoch": 0.925748502994012,
"grad_norm": 0.2365712188470767,
"learning_rate": 3.844276841171252e-05,
"loss": 0.327,
"step": 773
},
{
"epoch": 0.9269461077844311,
"grad_norm": 0.19092607382221163,
"learning_rate": 3.8420585625554575e-05,
"loss": 0.3007,
"step": 774
},
{
"epoch": 0.9281437125748503,
"grad_norm": 0.1929191200378652,
"learning_rate": 3.839840283939663e-05,
"loss": 0.324,
"step": 775
},
{
"epoch": 0.9293413173652695,
"grad_norm": 0.22050174301395198,
"learning_rate": 3.8376220053238685e-05,
"loss": 0.324,
"step": 776
},
{
"epoch": 0.9305389221556887,
"grad_norm": 0.23200969980328304,
"learning_rate": 3.8354037267080744e-05,
"loss": 0.3334,
"step": 777
},
{
"epoch": 0.9317365269461078,
"grad_norm": 0.20821388701393387,
"learning_rate": 3.833185448092281e-05,
"loss": 0.3116,
"step": 778
},
{
"epoch": 0.932934131736527,
"grad_norm": 0.21885588697727637,
"learning_rate": 3.830967169476487e-05,
"loss": 0.3138,
"step": 779
},
{
"epoch": 0.9341317365269461,
"grad_norm": 0.21032371587084386,
"learning_rate": 3.828748890860692e-05,
"loss": 0.3002,
"step": 780
},
{
"epoch": 0.9353293413173652,
"grad_norm": 0.21080358065817992,
"learning_rate": 3.826530612244898e-05,
"loss": 0.3237,
"step": 781
},
{
"epoch": 0.9365269461077844,
"grad_norm": 0.1953135780866051,
"learning_rate": 3.824312333629104e-05,
"loss": 0.3362,
"step": 782
},
{
"epoch": 0.9377245508982036,
"grad_norm": 0.21186875023488816,
"learning_rate": 3.82209405501331e-05,
"loss": 0.361,
"step": 783
},
{
"epoch": 0.9389221556886228,
"grad_norm": 0.1978641645562516,
"learning_rate": 3.819875776397516e-05,
"loss": 0.3079,
"step": 784
},
{
"epoch": 0.9401197604790419,
"grad_norm": 0.21108548319769946,
"learning_rate": 3.817657497781721e-05,
"loss": 0.313,
"step": 785
},
{
"epoch": 0.9413173652694611,
"grad_norm": 0.22201607639118554,
"learning_rate": 3.815439219165927e-05,
"loss": 0.3542,
"step": 786
},
{
"epoch": 0.9425149700598803,
"grad_norm": 0.1905354085551497,
"learning_rate": 3.8132209405501335e-05,
"loss": 0.3194,
"step": 787
},
{
"epoch": 0.9437125748502994,
"grad_norm": 0.18436103189140818,
"learning_rate": 3.811002661934339e-05,
"loss": 0.3203,
"step": 788
},
{
"epoch": 0.9449101796407186,
"grad_norm": 0.21112209483937863,
"learning_rate": 3.808784383318545e-05,
"loss": 0.3247,
"step": 789
},
{
"epoch": 0.9461077844311377,
"grad_norm": 0.1952093945035393,
"learning_rate": 3.8065661047027503e-05,
"loss": 0.341,
"step": 790
},
{
"epoch": 0.9473053892215569,
"grad_norm": 0.1804635502475908,
"learning_rate": 3.804347826086957e-05,
"loss": 0.3256,
"step": 791
},
{
"epoch": 0.948502994011976,
"grad_norm": 0.17543728509898107,
"learning_rate": 3.802129547471163e-05,
"loss": 0.31,
"step": 792
},
{
"epoch": 0.9497005988023952,
"grad_norm": 0.18890055264116806,
"learning_rate": 3.7999112688553686e-05,
"loss": 0.3228,
"step": 793
},
{
"epoch": 0.9508982035928144,
"grad_norm": 0.17765721503127072,
"learning_rate": 3.7976929902395744e-05,
"loss": 0.3054,
"step": 794
},
{
"epoch": 0.9520958083832335,
"grad_norm": 0.18326501331427,
"learning_rate": 3.7954747116237796e-05,
"loss": 0.3269,
"step": 795
},
{
"epoch": 0.9532934131736527,
"grad_norm": 0.1755807772627999,
"learning_rate": 3.793256433007986e-05,
"loss": 0.2867,
"step": 796
},
{
"epoch": 0.9544910179640719,
"grad_norm": 0.19090635077304152,
"learning_rate": 3.791038154392192e-05,
"loss": 0.3015,
"step": 797
},
{
"epoch": 0.9556886227544911,
"grad_norm": 0.1891261444534879,
"learning_rate": 3.788819875776398e-05,
"loss": 0.3199,
"step": 798
},
{
"epoch": 0.9568862275449102,
"grad_norm": 0.22122139714919675,
"learning_rate": 3.7866015971606036e-05,
"loss": 0.3313,
"step": 799
},
{
"epoch": 0.9580838323353293,
"grad_norm": 0.19833251813457223,
"learning_rate": 3.7843833185448094e-05,
"loss": 0.3243,
"step": 800
},
{
"epoch": 0.9592814371257485,
"grad_norm": 0.202084942916931,
"learning_rate": 3.782165039929015e-05,
"loss": 0.3059,
"step": 801
},
{
"epoch": 0.9604790419161676,
"grad_norm": 0.22853452716266984,
"learning_rate": 3.779946761313221e-05,
"loss": 0.3288,
"step": 802
},
{
"epoch": 0.9616766467065868,
"grad_norm": 0.21269829767904655,
"learning_rate": 3.777728482697427e-05,
"loss": 0.2935,
"step": 803
},
{
"epoch": 0.962874251497006,
"grad_norm": 0.21854781742762155,
"learning_rate": 3.775510204081633e-05,
"loss": 0.3245,
"step": 804
},
{
"epoch": 0.9640718562874252,
"grad_norm": 0.21783133062493315,
"learning_rate": 3.773291925465839e-05,
"loss": 0.3036,
"step": 805
},
{
"epoch": 0.9652694610778443,
"grad_norm": 0.1823232738597752,
"learning_rate": 3.7710736468500445e-05,
"loss": 0.3031,
"step": 806
},
{
"epoch": 0.9664670658682635,
"grad_norm": 0.22478882549017082,
"learning_rate": 3.7688553682342503e-05,
"loss": 0.3233,
"step": 807
},
{
"epoch": 0.9676646706586827,
"grad_norm": 0.20869889312882087,
"learning_rate": 3.766637089618456e-05,
"loss": 0.3234,
"step": 808
},
{
"epoch": 0.9688622754491018,
"grad_norm": 0.186560131091878,
"learning_rate": 3.764418811002662e-05,
"loss": 0.3366,
"step": 809
},
{
"epoch": 0.9700598802395209,
"grad_norm": 0.24405275359422968,
"learning_rate": 3.762200532386868e-05,
"loss": 0.327,
"step": 810
},
{
"epoch": 0.9712574850299401,
"grad_norm": 0.18311313308496674,
"learning_rate": 3.759982253771074e-05,
"loss": 0.3103,
"step": 811
},
{
"epoch": 0.9724550898203593,
"grad_norm": 0.17765143087581203,
"learning_rate": 3.7577639751552796e-05,
"loss": 0.31,
"step": 812
},
{
"epoch": 0.9736526946107784,
"grad_norm": 0.17798908786037654,
"learning_rate": 3.7555456965394854e-05,
"loss": 0.3037,
"step": 813
},
{
"epoch": 0.9748502994011976,
"grad_norm": 0.18460482184205454,
"learning_rate": 3.753327417923691e-05,
"loss": 0.3207,
"step": 814
},
{
"epoch": 0.9760479041916168,
"grad_norm": 0.20114890438607802,
"learning_rate": 3.751109139307897e-05,
"loss": 0.3285,
"step": 815
},
{
"epoch": 0.9772455089820359,
"grad_norm": 0.27028333455037035,
"learning_rate": 3.748890860692103e-05,
"loss": 0.3398,
"step": 816
},
{
"epoch": 0.9784431137724551,
"grad_norm": 0.18279136621946335,
"learning_rate": 3.746672582076309e-05,
"loss": 0.3188,
"step": 817
},
{
"epoch": 0.9796407185628743,
"grad_norm": 0.208016371983066,
"learning_rate": 3.744454303460515e-05,
"loss": 0.3227,
"step": 818
},
{
"epoch": 0.9808383233532935,
"grad_norm": 0.17925145243860893,
"learning_rate": 3.742236024844721e-05,
"loss": 0.3096,
"step": 819
},
{
"epoch": 0.9820359281437125,
"grad_norm": 0.19783559582337518,
"learning_rate": 3.740017746228926e-05,
"loss": 0.3336,
"step": 820
},
{
"epoch": 0.9832335329341317,
"grad_norm": 0.19021442012067089,
"learning_rate": 3.737799467613132e-05,
"loss": 0.3225,
"step": 821
},
{
"epoch": 0.9844311377245509,
"grad_norm": 0.19669605405145774,
"learning_rate": 3.735581188997338e-05,
"loss": 0.3234,
"step": 822
},
{
"epoch": 0.98562874251497,
"grad_norm": 0.1735301643421043,
"learning_rate": 3.7333629103815445e-05,
"loss": 0.2983,
"step": 823
},
{
"epoch": 0.9868263473053892,
"grad_norm": 0.21228496225671392,
"learning_rate": 3.7311446317657504e-05,
"loss": 0.3312,
"step": 824
},
{
"epoch": 0.9880239520958084,
"grad_norm": 0.17400628411797348,
"learning_rate": 3.7289263531499555e-05,
"loss": 0.3216,
"step": 825
},
{
"epoch": 0.9892215568862276,
"grad_norm": 0.18220212002125352,
"learning_rate": 3.7267080745341614e-05,
"loss": 0.3135,
"step": 826
},
{
"epoch": 0.9904191616766467,
"grad_norm": 0.19054981982326405,
"learning_rate": 3.724489795918368e-05,
"loss": 0.2963,
"step": 827
},
{
"epoch": 0.9916167664670659,
"grad_norm": 0.18953746853639974,
"learning_rate": 3.722271517302574e-05,
"loss": 0.3288,
"step": 828
},
{
"epoch": 0.9928143712574851,
"grad_norm": 0.1752781044981784,
"learning_rate": 3.7200532386867796e-05,
"loss": 0.3239,
"step": 829
},
{
"epoch": 0.9940119760479041,
"grad_norm": 0.19456063286248626,
"learning_rate": 3.717834960070985e-05,
"loss": 0.316,
"step": 830
},
{
"epoch": 0.9952095808383233,
"grad_norm": 0.19521416733220492,
"learning_rate": 3.7156166814551906e-05,
"loss": 0.3057,
"step": 831
},
{
"epoch": 0.9964071856287425,
"grad_norm": 0.19970755739061535,
"learning_rate": 3.713398402839397e-05,
"loss": 0.3236,
"step": 832
},
{
"epoch": 0.9976047904191617,
"grad_norm": 0.17805360960986638,
"learning_rate": 3.711180124223603e-05,
"loss": 0.3221,
"step": 833
},
{
"epoch": 0.9988023952095808,
"grad_norm": 0.2011936668854863,
"learning_rate": 3.708961845607809e-05,
"loss": 0.3249,
"step": 834
},
{
"epoch": 1.0,
"grad_norm": 0.1889738422914907,
"learning_rate": 3.706743566992014e-05,
"loss": 0.3261,
"step": 835
},
{
"epoch": 1.0011976047904192,
"grad_norm": 0.22788495987013785,
"learning_rate": 3.7045252883762205e-05,
"loss": 0.255,
"step": 836
},
{
"epoch": 1.0023952095808384,
"grad_norm": 0.18265873734696644,
"learning_rate": 3.702307009760426e-05,
"loss": 0.2464,
"step": 837
},
{
"epoch": 1.0035928143712576,
"grad_norm": 0.24341803693152755,
"learning_rate": 3.700088731144632e-05,
"loss": 0.2755,
"step": 838
},
{
"epoch": 1.0047904191616766,
"grad_norm": 0.17230525981554265,
"learning_rate": 3.697870452528838e-05,
"loss": 0.2594,
"step": 839
},
{
"epoch": 1.0059880239520957,
"grad_norm": 0.20707410931839673,
"learning_rate": 3.695652173913043e-05,
"loss": 0.2506,
"step": 840
},
{
"epoch": 1.007185628742515,
"grad_norm": 0.19492208036279524,
"learning_rate": 3.69343389529725e-05,
"loss": 0.2544,
"step": 841
},
{
"epoch": 1.0083832335329341,
"grad_norm": 0.18320733003252568,
"learning_rate": 3.6912156166814555e-05,
"loss": 0.2625,
"step": 842
},
{
"epoch": 1.0095808383233533,
"grad_norm": 0.19859077954792695,
"learning_rate": 3.6889973380656614e-05,
"loss": 0.2522,
"step": 843
},
{
"epoch": 1.0107784431137725,
"grad_norm": 0.20121621806403395,
"learning_rate": 3.686779059449867e-05,
"loss": 0.2504,
"step": 844
},
{
"epoch": 1.0119760479041917,
"grad_norm": 0.19032764755722023,
"learning_rate": 3.684560780834073e-05,
"loss": 0.2521,
"step": 845
},
{
"epoch": 1.0131736526946107,
"grad_norm": 0.20754523576854905,
"learning_rate": 3.682342502218279e-05,
"loss": 0.2766,
"step": 846
},
{
"epoch": 1.0143712574850299,
"grad_norm": 0.19056274374933926,
"learning_rate": 3.680124223602485e-05,
"loss": 0.2641,
"step": 847
},
{
"epoch": 1.015568862275449,
"grad_norm": 0.17470583827130434,
"learning_rate": 3.6779059449866906e-05,
"loss": 0.2449,
"step": 848
},
{
"epoch": 1.0167664670658683,
"grad_norm": 0.2075982181567944,
"learning_rate": 3.6756876663708964e-05,
"loss": 0.2539,
"step": 849
},
{
"epoch": 1.0179640718562875,
"grad_norm": 0.18975792265434843,
"learning_rate": 3.673469387755102e-05,
"loss": 0.2552,
"step": 850
},
{
"epoch": 1.0191616766467066,
"grad_norm": 0.20029637606873688,
"learning_rate": 3.671251109139308e-05,
"loss": 0.2626,
"step": 851
},
{
"epoch": 1.0203592814371258,
"grad_norm": 0.19754325910201997,
"learning_rate": 3.669032830523514e-05,
"loss": 0.2747,
"step": 852
},
{
"epoch": 1.0215568862275448,
"grad_norm": 0.22913814514953804,
"learning_rate": 3.66681455190772e-05,
"loss": 0.2585,
"step": 853
},
{
"epoch": 1.022754491017964,
"grad_norm": 0.21305759274344407,
"learning_rate": 3.6645962732919256e-05,
"loss": 0.263,
"step": 854
},
{
"epoch": 1.0239520958083832,
"grad_norm": 0.24518562223120285,
"learning_rate": 3.6623779946761315e-05,
"loss": 0.2792,
"step": 855
},
{
"epoch": 1.0251497005988024,
"grad_norm": 0.19839661339121345,
"learning_rate": 3.660159716060337e-05,
"loss": 0.2534,
"step": 856
},
{
"epoch": 1.0263473053892216,
"grad_norm": 0.20610652581529026,
"learning_rate": 3.657941437444543e-05,
"loss": 0.2614,
"step": 857
},
{
"epoch": 1.0275449101796408,
"grad_norm": 0.20094204434410493,
"learning_rate": 3.655723158828749e-05,
"loss": 0.2688,
"step": 858
},
{
"epoch": 1.02874251497006,
"grad_norm": 0.20014399902536592,
"learning_rate": 3.653504880212955e-05,
"loss": 0.2679,
"step": 859
},
{
"epoch": 1.029940119760479,
"grad_norm": 0.19772730500972935,
"learning_rate": 3.651286601597161e-05,
"loss": 0.2463,
"step": 860
},
{
"epoch": 1.0311377245508981,
"grad_norm": 0.22815103363054823,
"learning_rate": 3.6490683229813665e-05,
"loss": 0.2888,
"step": 861
},
{
"epoch": 1.0323353293413173,
"grad_norm": 0.17944907316124226,
"learning_rate": 3.6468500443655724e-05,
"loss": 0.2541,
"step": 862
},
{
"epoch": 1.0335329341317365,
"grad_norm": 0.2157602279833511,
"learning_rate": 3.644631765749779e-05,
"loss": 0.2705,
"step": 863
},
{
"epoch": 1.0347305389221557,
"grad_norm": 0.21111767332310152,
"learning_rate": 3.642413487133984e-05,
"loss": 0.2648,
"step": 864
},
{
"epoch": 1.035928143712575,
"grad_norm": 0.1989276308502225,
"learning_rate": 3.64019520851819e-05,
"loss": 0.26,
"step": 865
},
{
"epoch": 1.037125748502994,
"grad_norm": 0.21161054164356863,
"learning_rate": 3.637976929902396e-05,
"loss": 0.2791,
"step": 866
},
{
"epoch": 1.038323353293413,
"grad_norm": 0.20426078184334712,
"learning_rate": 3.6357586512866016e-05,
"loss": 0.2695,
"step": 867
},
{
"epoch": 1.0395209580838323,
"grad_norm": 0.20489423856075809,
"learning_rate": 3.633540372670808e-05,
"loss": 0.2609,
"step": 868
},
{
"epoch": 1.0407185628742515,
"grad_norm": 0.19367793447617573,
"learning_rate": 3.631322094055013e-05,
"loss": 0.2572,
"step": 869
},
{
"epoch": 1.0419161676646707,
"grad_norm": 0.2017313733601768,
"learning_rate": 3.629103815439219e-05,
"loss": 0.2598,
"step": 870
},
{
"epoch": 1.0431137724550898,
"grad_norm": 0.21048593181305156,
"learning_rate": 3.626885536823425e-05,
"loss": 0.2591,
"step": 871
},
{
"epoch": 1.044311377245509,
"grad_norm": 0.18225088256490518,
"learning_rate": 3.6246672582076315e-05,
"loss": 0.2626,
"step": 872
},
{
"epoch": 1.0455089820359282,
"grad_norm": 0.20541397052126517,
"learning_rate": 3.622448979591837e-05,
"loss": 0.2378,
"step": 873
},
{
"epoch": 1.0467065868263472,
"grad_norm": 0.2391290358613303,
"learning_rate": 3.6202307009760425e-05,
"loss": 0.2658,
"step": 874
},
{
"epoch": 1.0479041916167664,
"grad_norm": 0.23124472193725928,
"learning_rate": 3.618012422360248e-05,
"loss": 0.2637,
"step": 875
},
{
"epoch": 1.0491017964071856,
"grad_norm": 0.21805598749377872,
"learning_rate": 3.615794143744454e-05,
"loss": 0.2681,
"step": 876
},
{
"epoch": 1.0502994011976048,
"grad_norm": 0.20869048922275357,
"learning_rate": 3.613575865128661e-05,
"loss": 0.2792,
"step": 877
},
{
"epoch": 1.051497005988024,
"grad_norm": 0.22579140873191403,
"learning_rate": 3.6113575865128665e-05,
"loss": 0.2847,
"step": 878
},
{
"epoch": 1.0526946107784432,
"grad_norm": 0.24210944545699908,
"learning_rate": 3.609139307897072e-05,
"loss": 0.2939,
"step": 879
},
{
"epoch": 1.0538922155688624,
"grad_norm": 0.2019162541617041,
"learning_rate": 3.6069210292812775e-05,
"loss": 0.2631,
"step": 880
},
{
"epoch": 1.0550898203592813,
"grad_norm": 0.19396321928544952,
"learning_rate": 3.604702750665484e-05,
"loss": 0.2491,
"step": 881
},
{
"epoch": 1.0562874251497005,
"grad_norm": 0.22098905698078194,
"learning_rate": 3.60248447204969e-05,
"loss": 0.2462,
"step": 882
},
{
"epoch": 1.0574850299401197,
"grad_norm": 0.19806872895893604,
"learning_rate": 3.600266193433896e-05,
"loss": 0.255,
"step": 883
},
{
"epoch": 1.058682634730539,
"grad_norm": 0.2861226012875371,
"learning_rate": 3.598047914818101e-05,
"loss": 0.2855,
"step": 884
},
{
"epoch": 1.0598802395209581,
"grad_norm": 0.19292980574457883,
"learning_rate": 3.595829636202307e-05,
"loss": 0.2342,
"step": 885
},
{
"epoch": 1.0610778443113773,
"grad_norm": 0.2075053083013594,
"learning_rate": 3.593611357586513e-05,
"loss": 0.2735,
"step": 886
},
{
"epoch": 1.0622754491017965,
"grad_norm": 0.19634102113753232,
"learning_rate": 3.591393078970719e-05,
"loss": 0.2324,
"step": 887
},
{
"epoch": 1.0634730538922155,
"grad_norm": 0.21930529532945534,
"learning_rate": 3.589174800354925e-05,
"loss": 0.2597,
"step": 888
},
{
"epoch": 1.0646706586826347,
"grad_norm": 0.17109702701331458,
"learning_rate": 3.58695652173913e-05,
"loss": 0.2523,
"step": 889
},
{
"epoch": 1.0658682634730539,
"grad_norm": 0.21556415718834426,
"learning_rate": 3.5847382431233366e-05,
"loss": 0.2592,
"step": 890
},
{
"epoch": 1.067065868263473,
"grad_norm": 0.2251211369862316,
"learning_rate": 3.5825199645075425e-05,
"loss": 0.2604,
"step": 891
},
{
"epoch": 1.0682634730538922,
"grad_norm": 0.18839651345260205,
"learning_rate": 3.580301685891748e-05,
"loss": 0.2503,
"step": 892
},
{
"epoch": 1.0694610778443114,
"grad_norm": 0.20801854288195668,
"learning_rate": 3.578083407275954e-05,
"loss": 0.2815,
"step": 893
},
{
"epoch": 1.0706586826347306,
"grad_norm": 0.2270911312556433,
"learning_rate": 3.575865128660159e-05,
"loss": 0.2814,
"step": 894
},
{
"epoch": 1.0718562874251496,
"grad_norm": 0.19465475407405028,
"learning_rate": 3.573646850044366e-05,
"loss": 0.2605,
"step": 895
},
{
"epoch": 1.0730538922155688,
"grad_norm": 0.23226176872880255,
"learning_rate": 3.571428571428572e-05,
"loss": 0.2703,
"step": 896
},
{
"epoch": 1.074251497005988,
"grad_norm": 0.20579337588913182,
"learning_rate": 3.5692102928127775e-05,
"loss": 0.264,
"step": 897
},
{
"epoch": 1.0754491017964072,
"grad_norm": 0.1952564162224969,
"learning_rate": 3.5669920141969834e-05,
"loss": 0.2545,
"step": 898
},
{
"epoch": 1.0766467065868264,
"grad_norm": 0.25337149795123926,
"learning_rate": 3.564773735581189e-05,
"loss": 0.2585,
"step": 899
},
{
"epoch": 1.0778443113772456,
"grad_norm": 0.20385445018882617,
"learning_rate": 3.562555456965395e-05,
"loss": 0.2559,
"step": 900
},
{
"epoch": 1.0790419161676648,
"grad_norm": 0.19038419392115238,
"learning_rate": 3.560337178349601e-05,
"loss": 0.2427,
"step": 901
},
{
"epoch": 1.0802395209580837,
"grad_norm": 0.25552609475109284,
"learning_rate": 3.558118899733807e-05,
"loss": 0.2749,
"step": 902
},
{
"epoch": 1.081437125748503,
"grad_norm": 0.1903094986551938,
"learning_rate": 3.5559006211180126e-05,
"loss": 0.2547,
"step": 903
},
{
"epoch": 1.0826347305389221,
"grad_norm": 0.23817703484438174,
"learning_rate": 3.5536823425022184e-05,
"loss": 0.2851,
"step": 904
},
{
"epoch": 1.0838323353293413,
"grad_norm": 0.21116930820777924,
"learning_rate": 3.551464063886424e-05,
"loss": 0.271,
"step": 905
},
{
"epoch": 1.0850299401197605,
"grad_norm": 0.19798322191888704,
"learning_rate": 3.54924578527063e-05,
"loss": 0.2653,
"step": 906
},
{
"epoch": 1.0862275449101797,
"grad_norm": 0.17824043312055776,
"learning_rate": 3.547027506654836e-05,
"loss": 0.2351,
"step": 907
},
{
"epoch": 1.087425149700599,
"grad_norm": 0.21365871128146302,
"learning_rate": 3.544809228039042e-05,
"loss": 0.2774,
"step": 908
},
{
"epoch": 1.0886227544910179,
"grad_norm": 0.19515389536316946,
"learning_rate": 3.5425909494232477e-05,
"loss": 0.2663,
"step": 909
},
{
"epoch": 1.089820359281437,
"grad_norm": 0.17393034695248935,
"learning_rate": 3.5403726708074535e-05,
"loss": 0.2494,
"step": 910
},
{
"epoch": 1.0910179640718562,
"grad_norm": 0.19253419929210244,
"learning_rate": 3.538154392191659e-05,
"loss": 0.284,
"step": 911
},
{
"epoch": 1.0922155688622754,
"grad_norm": 0.19797211540356335,
"learning_rate": 3.535936113575865e-05,
"loss": 0.2607,
"step": 912
},
{
"epoch": 1.0934131736526946,
"grad_norm": 0.17041418257016872,
"learning_rate": 3.533717834960072e-05,
"loss": 0.2497,
"step": 913
},
{
"epoch": 1.0946107784431138,
"grad_norm": 0.19415893126420763,
"learning_rate": 3.531499556344277e-05,
"loss": 0.26,
"step": 914
},
{
"epoch": 1.095808383233533,
"grad_norm": 0.18909571187160537,
"learning_rate": 3.529281277728483e-05,
"loss": 0.2623,
"step": 915
},
{
"epoch": 1.097005988023952,
"grad_norm": 0.19302362207648502,
"learning_rate": 3.5270629991126885e-05,
"loss": 0.2678,
"step": 916
},
{
"epoch": 1.0982035928143712,
"grad_norm": 0.18635395147727485,
"learning_rate": 3.524844720496895e-05,
"loss": 0.2687,
"step": 917
},
{
"epoch": 1.0994011976047904,
"grad_norm": 0.1760370952877239,
"learning_rate": 3.522626441881101e-05,
"loss": 0.2397,
"step": 918
},
{
"epoch": 1.1005988023952096,
"grad_norm": 0.20658833620149647,
"learning_rate": 3.520408163265306e-05,
"loss": 0.2666,
"step": 919
},
{
"epoch": 1.1017964071856288,
"grad_norm": 0.18150456373180246,
"learning_rate": 3.518189884649512e-05,
"loss": 0.273,
"step": 920
},
{
"epoch": 1.102994011976048,
"grad_norm": 0.19654821354275157,
"learning_rate": 3.515971606033718e-05,
"loss": 0.2553,
"step": 921
},
{
"epoch": 1.1041916167664672,
"grad_norm": 0.24911722619099527,
"learning_rate": 3.513753327417924e-05,
"loss": 0.2676,
"step": 922
},
{
"epoch": 1.1053892215568861,
"grad_norm": 0.17864160758158948,
"learning_rate": 3.51153504880213e-05,
"loss": 0.2423,
"step": 923
},
{
"epoch": 1.1065868263473053,
"grad_norm": 0.18031769624358954,
"learning_rate": 3.509316770186335e-05,
"loss": 0.2557,
"step": 924
},
{
"epoch": 1.1077844311377245,
"grad_norm": 0.20437896091717098,
"learning_rate": 3.507098491570541e-05,
"loss": 0.2375,
"step": 925
},
{
"epoch": 1.1089820359281437,
"grad_norm": 0.24964264795967406,
"learning_rate": 3.5048802129547477e-05,
"loss": 0.2469,
"step": 926
},
{
"epoch": 1.110179640718563,
"grad_norm": 0.17649778256275647,
"learning_rate": 3.5026619343389535e-05,
"loss": 0.2491,
"step": 927
},
{
"epoch": 1.111377245508982,
"grad_norm": 0.20152708694560312,
"learning_rate": 3.500443655723159e-05,
"loss": 0.2758,
"step": 928
},
{
"epoch": 1.1125748502994013,
"grad_norm": 0.2187769051263318,
"learning_rate": 3.4982253771073645e-05,
"loss": 0.26,
"step": 929
},
{
"epoch": 1.1137724550898203,
"grad_norm": 0.1997026094417269,
"learning_rate": 3.4960070984915703e-05,
"loss": 0.2651,
"step": 930
},
{
"epoch": 1.1149700598802395,
"grad_norm": 0.23483693004176315,
"learning_rate": 3.493788819875777e-05,
"loss": 0.2625,
"step": 931
},
{
"epoch": 1.1161676646706586,
"grad_norm": 0.272849321062047,
"learning_rate": 3.491570541259983e-05,
"loss": 0.2584,
"step": 932
},
{
"epoch": 1.1173652694610778,
"grad_norm": 0.1939675818629627,
"learning_rate": 3.4893522626441886e-05,
"loss": 0.2784,
"step": 933
},
{
"epoch": 1.118562874251497,
"grad_norm": 0.20710747669143537,
"learning_rate": 3.487133984028394e-05,
"loss": 0.2526,
"step": 934
},
{
"epoch": 1.1197604790419162,
"grad_norm": 0.20624552961508288,
"learning_rate": 3.4849157054126e-05,
"loss": 0.2729,
"step": 935
},
{
"epoch": 1.1209580838323354,
"grad_norm": 0.21322177892369318,
"learning_rate": 3.482697426796806e-05,
"loss": 0.2966,
"step": 936
},
{
"epoch": 1.1221556886227544,
"grad_norm": 0.17653599022580355,
"learning_rate": 3.480479148181012e-05,
"loss": 0.2432,
"step": 937
},
{
"epoch": 1.1233532934131736,
"grad_norm": 0.20205397750781223,
"learning_rate": 3.478260869565218e-05,
"loss": 0.2422,
"step": 938
},
{
"epoch": 1.1245508982035928,
"grad_norm": 0.24738467968515304,
"learning_rate": 3.476042590949423e-05,
"loss": 0.256,
"step": 939
},
{
"epoch": 1.125748502994012,
"grad_norm": 0.21441864368514546,
"learning_rate": 3.4738243123336294e-05,
"loss": 0.2543,
"step": 940
},
{
"epoch": 1.1269461077844312,
"grad_norm": 0.21472227902405158,
"learning_rate": 3.471606033717835e-05,
"loss": 0.2926,
"step": 941
},
{
"epoch": 1.1281437125748504,
"grad_norm": 0.22174526303699463,
"learning_rate": 3.469387755102041e-05,
"loss": 0.2685,
"step": 942
},
{
"epoch": 1.1293413173652695,
"grad_norm": 0.19653814743670178,
"learning_rate": 3.467169476486247e-05,
"loss": 0.2518,
"step": 943
},
{
"epoch": 1.1305389221556887,
"grad_norm": 0.29394855069439696,
"learning_rate": 3.464951197870453e-05,
"loss": 0.2705,
"step": 944
},
{
"epoch": 1.1317365269461077,
"grad_norm": 0.21905198119358832,
"learning_rate": 3.462732919254659e-05,
"loss": 0.2902,
"step": 945
},
{
"epoch": 1.132934131736527,
"grad_norm": 0.21536217657094828,
"learning_rate": 3.4605146406388645e-05,
"loss": 0.2511,
"step": 946
},
{
"epoch": 1.134131736526946,
"grad_norm": 0.2720166609208309,
"learning_rate": 3.4582963620230703e-05,
"loss": 0.2579,
"step": 947
},
{
"epoch": 1.1353293413173653,
"grad_norm": 0.2098338409724728,
"learning_rate": 3.456078083407276e-05,
"loss": 0.244,
"step": 948
},
{
"epoch": 1.1365269461077845,
"grad_norm": 0.20377406340311155,
"learning_rate": 3.453859804791482e-05,
"loss": 0.2615,
"step": 949
},
{
"epoch": 1.1377245508982037,
"grad_norm": 0.2028090313863889,
"learning_rate": 3.451641526175688e-05,
"loss": 0.2484,
"step": 950
},
{
"epoch": 1.1389221556886229,
"grad_norm": 0.21840555209594542,
"learning_rate": 3.449423247559894e-05,
"loss": 0.2641,
"step": 951
},
{
"epoch": 1.1401197604790418,
"grad_norm": 0.18547848222253538,
"learning_rate": 3.4472049689440996e-05,
"loss": 0.2542,
"step": 952
},
{
"epoch": 1.141317365269461,
"grad_norm": 0.2048860246722607,
"learning_rate": 3.4449866903283054e-05,
"loss": 0.2791,
"step": 953
},
{
"epoch": 1.1425149700598802,
"grad_norm": 0.2325720305932213,
"learning_rate": 3.442768411712511e-05,
"loss": 0.2765,
"step": 954
},
{
"epoch": 1.1437125748502994,
"grad_norm": 0.19724690982862028,
"learning_rate": 3.440550133096717e-05,
"loss": 0.2506,
"step": 955
},
{
"epoch": 1.1449101796407186,
"grad_norm": 0.19537889322373725,
"learning_rate": 3.438331854480923e-05,
"loss": 0.2552,
"step": 956
},
{
"epoch": 1.1461077844311378,
"grad_norm": 0.1992364706030575,
"learning_rate": 3.436113575865129e-05,
"loss": 0.262,
"step": 957
},
{
"epoch": 1.147305389221557,
"grad_norm": 0.22336651790933973,
"learning_rate": 3.4338952972493346e-05,
"loss": 0.2689,
"step": 958
},
{
"epoch": 1.148502994011976,
"grad_norm": 0.2534198255221812,
"learning_rate": 3.4316770186335405e-05,
"loss": 0.2961,
"step": 959
},
{
"epoch": 1.1497005988023952,
"grad_norm": 0.24028957624776567,
"learning_rate": 3.429458740017746e-05,
"loss": 0.2491,
"step": 960
},
{
"epoch": 1.1508982035928144,
"grad_norm": 0.21129358550919747,
"learning_rate": 3.427240461401952e-05,
"loss": 0.2554,
"step": 961
},
{
"epoch": 1.1520958083832336,
"grad_norm": 0.19797778564259688,
"learning_rate": 3.425022182786158e-05,
"loss": 0.2715,
"step": 962
},
{
"epoch": 1.1532934131736527,
"grad_norm": 0.1901344348665796,
"learning_rate": 3.422803904170364e-05,
"loss": 0.2523,
"step": 963
},
{
"epoch": 1.154491017964072,
"grad_norm": 0.25651734674083254,
"learning_rate": 3.42058562555457e-05,
"loss": 0.2812,
"step": 964
},
{
"epoch": 1.1556886227544911,
"grad_norm": 0.22787184729125445,
"learning_rate": 3.4183673469387755e-05,
"loss": 0.2787,
"step": 965
},
{
"epoch": 1.15688622754491,
"grad_norm": 0.22072906790053945,
"learning_rate": 3.4161490683229814e-05,
"loss": 0.2504,
"step": 966
},
{
"epoch": 1.1580838323353293,
"grad_norm": 0.24902645084916908,
"learning_rate": 3.413930789707188e-05,
"loss": 0.2765,
"step": 967
},
{
"epoch": 1.1592814371257485,
"grad_norm": 0.2179141265160038,
"learning_rate": 3.411712511091393e-05,
"loss": 0.2802,
"step": 968
},
{
"epoch": 1.1604790419161677,
"grad_norm": 0.2272928584987232,
"learning_rate": 3.409494232475599e-05,
"loss": 0.2862,
"step": 969
},
{
"epoch": 1.1616766467065869,
"grad_norm": 0.2291825770249093,
"learning_rate": 3.407275953859805e-05,
"loss": 0.2468,
"step": 970
},
{
"epoch": 1.162874251497006,
"grad_norm": 0.202616565055818,
"learning_rate": 3.4050576752440106e-05,
"loss": 0.2611,
"step": 971
},
{
"epoch": 1.1640718562874253,
"grad_norm": 0.20874762276598405,
"learning_rate": 3.402839396628217e-05,
"loss": 0.269,
"step": 972
},
{
"epoch": 1.1652694610778442,
"grad_norm": 0.2140944109312408,
"learning_rate": 3.400621118012422e-05,
"loss": 0.2655,
"step": 973
},
{
"epoch": 1.1664670658682634,
"grad_norm": 0.19375146907255353,
"learning_rate": 3.398402839396628e-05,
"loss": 0.2414,
"step": 974
},
{
"epoch": 1.1676646706586826,
"grad_norm": 0.17764001079236189,
"learning_rate": 3.396184560780834e-05,
"loss": 0.246,
"step": 975
},
{
"epoch": 1.1688622754491018,
"grad_norm": 0.18678610803098034,
"learning_rate": 3.3939662821650405e-05,
"loss": 0.2601,
"step": 976
},
{
"epoch": 1.170059880239521,
"grad_norm": 0.22984309056562327,
"learning_rate": 3.391748003549246e-05,
"loss": 0.2831,
"step": 977
},
{
"epoch": 1.1712574850299402,
"grad_norm": 0.20773875269328607,
"learning_rate": 3.3895297249334515e-05,
"loss": 0.283,
"step": 978
},
{
"epoch": 1.1724550898203594,
"grad_norm": 0.19748856603201478,
"learning_rate": 3.387311446317657e-05,
"loss": 0.2574,
"step": 979
},
{
"epoch": 1.1736526946107784,
"grad_norm": 0.193096918925366,
"learning_rate": 3.385093167701863e-05,
"loss": 0.2583,
"step": 980
},
{
"epoch": 1.1748502994011976,
"grad_norm": 0.20455442509423816,
"learning_rate": 3.38287488908607e-05,
"loss": 0.2577,
"step": 981
},
{
"epoch": 1.1760479041916168,
"grad_norm": 0.20163474572378806,
"learning_rate": 3.3806566104702755e-05,
"loss": 0.2581,
"step": 982
},
{
"epoch": 1.177245508982036,
"grad_norm": 0.19904883209684757,
"learning_rate": 3.378438331854481e-05,
"loss": 0.2417,
"step": 983
},
{
"epoch": 1.1784431137724551,
"grad_norm": 0.179707961294151,
"learning_rate": 3.3762200532386865e-05,
"loss": 0.2494,
"step": 984
},
{
"epoch": 1.1796407185628743,
"grad_norm": 0.1729769708958319,
"learning_rate": 3.374001774622893e-05,
"loss": 0.2503,
"step": 985
},
{
"epoch": 1.1808383233532935,
"grad_norm": 0.21159121141717033,
"learning_rate": 3.371783496007099e-05,
"loss": 0.2572,
"step": 986
},
{
"epoch": 1.1820359281437125,
"grad_norm": 0.1915905801180326,
"learning_rate": 3.369565217391305e-05,
"loss": 0.2467,
"step": 987
},
{
"epoch": 1.1832335329341317,
"grad_norm": 0.21019137284864586,
"learning_rate": 3.36734693877551e-05,
"loss": 0.2821,
"step": 988
},
{
"epoch": 1.1844311377245509,
"grad_norm": 0.18407369449374963,
"learning_rate": 3.365128660159716e-05,
"loss": 0.2617,
"step": 989
},
{
"epoch": 1.18562874251497,
"grad_norm": 0.18434567471558988,
"learning_rate": 3.362910381543922e-05,
"loss": 0.2681,
"step": 990
},
{
"epoch": 1.1868263473053893,
"grad_norm": 0.17450080329046636,
"learning_rate": 3.360692102928128e-05,
"loss": 0.241,
"step": 991
},
{
"epoch": 1.1880239520958085,
"grad_norm": 0.18822615806422033,
"learning_rate": 3.358473824312334e-05,
"loss": 0.2602,
"step": 992
},
{
"epoch": 1.1892215568862277,
"grad_norm": 0.1850973605106648,
"learning_rate": 3.356255545696539e-05,
"loss": 0.2562,
"step": 993
},
{
"epoch": 1.1904191616766466,
"grad_norm": 0.1659334066350733,
"learning_rate": 3.3540372670807456e-05,
"loss": 0.2254,
"step": 994
},
{
"epoch": 1.1916167664670658,
"grad_norm": 0.1844205605652284,
"learning_rate": 3.3518189884649515e-05,
"loss": 0.2464,
"step": 995
},
{
"epoch": 1.192814371257485,
"grad_norm": 0.165888940180616,
"learning_rate": 3.349600709849157e-05,
"loss": 0.2358,
"step": 996
},
{
"epoch": 1.1940119760479042,
"grad_norm": 0.2642009793534568,
"learning_rate": 3.347382431233363e-05,
"loss": 0.2831,
"step": 997
},
{
"epoch": 1.1952095808383234,
"grad_norm": 0.19892595107647654,
"learning_rate": 3.345164152617568e-05,
"loss": 0.2733,
"step": 998
},
{
"epoch": 1.1964071856287426,
"grad_norm": 0.1795438472367515,
"learning_rate": 3.342945874001775e-05,
"loss": 0.2661,
"step": 999
},
{
"epoch": 1.1976047904191618,
"grad_norm": 0.21538454440892804,
"learning_rate": 3.340727595385981e-05,
"loss": 0.2701,
"step": 1000
},
{
"epoch": 1.1988023952095808,
"grad_norm": 0.19077144627496725,
"learning_rate": 3.3385093167701865e-05,
"loss": 0.2622,
"step": 1001
},
{
"epoch": 1.2,
"grad_norm": 0.19432277465608053,
"learning_rate": 3.3362910381543924e-05,
"loss": 0.2548,
"step": 1002
},
{
"epoch": 1.2011976047904191,
"grad_norm": 0.20056257570075126,
"learning_rate": 3.334072759538598e-05,
"loss": 0.2727,
"step": 1003
},
{
"epoch": 1.2023952095808383,
"grad_norm": 0.18646463085245432,
"learning_rate": 3.331854480922804e-05,
"loss": 0.2465,
"step": 1004
},
{
"epoch": 1.2035928143712575,
"grad_norm": 0.17411180417164107,
"learning_rate": 3.32963620230701e-05,
"loss": 0.2691,
"step": 1005
},
{
"epoch": 1.2047904191616767,
"grad_norm": 0.22500520460657797,
"learning_rate": 3.327417923691216e-05,
"loss": 0.2867,
"step": 1006
},
{
"epoch": 1.205988023952096,
"grad_norm": 0.16689699972347474,
"learning_rate": 3.3251996450754216e-05,
"loss": 0.2631,
"step": 1007
},
{
"epoch": 1.207185628742515,
"grad_norm": 0.18666900819863302,
"learning_rate": 3.3229813664596274e-05,
"loss": 0.2458,
"step": 1008
},
{
"epoch": 1.208383233532934,
"grad_norm": 0.1916237657812727,
"learning_rate": 3.320763087843833e-05,
"loss": 0.2488,
"step": 1009
},
{
"epoch": 1.2095808383233533,
"grad_norm": 0.18245576211880393,
"learning_rate": 3.318544809228039e-05,
"loss": 0.2524,
"step": 1010
},
{
"epoch": 1.2107784431137725,
"grad_norm": 0.20825780876506508,
"learning_rate": 3.316326530612245e-05,
"loss": 0.2721,
"step": 1011
},
{
"epoch": 1.2119760479041917,
"grad_norm": 0.20099562991318762,
"learning_rate": 3.3141082519964515e-05,
"loss": 0.2389,
"step": 1012
},
{
"epoch": 1.2131736526946109,
"grad_norm": 0.21632474675073018,
"learning_rate": 3.3118899733806566e-05,
"loss": 0.2643,
"step": 1013
},
{
"epoch": 1.21437125748503,
"grad_norm": 0.21166393799553684,
"learning_rate": 3.3096716947648625e-05,
"loss": 0.2623,
"step": 1014
},
{
"epoch": 1.215568862275449,
"grad_norm": 0.1956768615010716,
"learning_rate": 3.307453416149068e-05,
"loss": 0.2542,
"step": 1015
},
{
"epoch": 1.2167664670658682,
"grad_norm": 0.2782460486464013,
"learning_rate": 3.305235137533274e-05,
"loss": 0.2788,
"step": 1016
},
{
"epoch": 1.2179640718562874,
"grad_norm": 0.2071655454213513,
"learning_rate": 3.303016858917481e-05,
"loss": 0.2822,
"step": 1017
},
{
"epoch": 1.2191616766467066,
"grad_norm": 0.22697362893225143,
"learning_rate": 3.300798580301686e-05,
"loss": 0.2575,
"step": 1018
},
{
"epoch": 1.2203592814371258,
"grad_norm": 0.18211196988099906,
"learning_rate": 3.298580301685892e-05,
"loss": 0.2445,
"step": 1019
},
{
"epoch": 1.221556886227545,
"grad_norm": 0.18861512885773257,
"learning_rate": 3.2963620230700975e-05,
"loss": 0.2629,
"step": 1020
},
{
"epoch": 1.2227544910179642,
"grad_norm": 0.1989477513119728,
"learning_rate": 3.294143744454304e-05,
"loss": 0.2729,
"step": 1021
},
{
"epoch": 1.2239520958083832,
"grad_norm": 0.20249616140785925,
"learning_rate": 3.29192546583851e-05,
"loss": 0.2472,
"step": 1022
},
{
"epoch": 1.2251497005988023,
"grad_norm": 0.18790544190479083,
"learning_rate": 3.289707187222715e-05,
"loss": 0.2584,
"step": 1023
},
{
"epoch": 1.2263473053892215,
"grad_norm": 0.17626061138305835,
"learning_rate": 3.287488908606921e-05,
"loss": 0.2559,
"step": 1024
},
{
"epoch": 1.2275449101796407,
"grad_norm": 0.18144273659633317,
"learning_rate": 3.285270629991127e-05,
"loss": 0.2347,
"step": 1025
},
{
"epoch": 1.22874251497006,
"grad_norm": 0.21479396639571022,
"learning_rate": 3.283052351375333e-05,
"loss": 0.2769,
"step": 1026
},
{
"epoch": 1.2299401197604791,
"grad_norm": 0.18277495810310315,
"learning_rate": 3.280834072759539e-05,
"loss": 0.2591,
"step": 1027
},
{
"epoch": 1.2311377245508983,
"grad_norm": 0.17776766135035088,
"learning_rate": 3.278615794143744e-05,
"loss": 0.2547,
"step": 1028
},
{
"epoch": 1.2323353293413173,
"grad_norm": 0.17255720371035857,
"learning_rate": 3.27639751552795e-05,
"loss": 0.2365,
"step": 1029
},
{
"epoch": 1.2335329341317365,
"grad_norm": 0.1750251657224931,
"learning_rate": 3.2741792369121566e-05,
"loss": 0.2401,
"step": 1030
},
{
"epoch": 1.2347305389221557,
"grad_norm": 0.21088451421962634,
"learning_rate": 3.2719609582963625e-05,
"loss": 0.2601,
"step": 1031
},
{
"epoch": 1.2359281437125749,
"grad_norm": 0.1876490955804945,
"learning_rate": 3.269742679680568e-05,
"loss": 0.2604,
"step": 1032
},
{
"epoch": 1.237125748502994,
"grad_norm": 0.17872445305806278,
"learning_rate": 3.2675244010647735e-05,
"loss": 0.2693,
"step": 1033
},
{
"epoch": 1.2383233532934133,
"grad_norm": 0.18862527167041404,
"learning_rate": 3.265306122448979e-05,
"loss": 0.2618,
"step": 1034
},
{
"epoch": 1.2395209580838324,
"grad_norm": 0.4073764960303839,
"learning_rate": 3.263087843833186e-05,
"loss": 0.2707,
"step": 1035
},
{
"epoch": 1.2407185628742514,
"grad_norm": 0.20294592092526792,
"learning_rate": 3.260869565217392e-05,
"loss": 0.2717,
"step": 1036
},
{
"epoch": 1.2419161676646706,
"grad_norm": 0.17253669361452204,
"learning_rate": 3.2586512866015975e-05,
"loss": 0.2511,
"step": 1037
},
{
"epoch": 1.2431137724550898,
"grad_norm": 0.1808701964746184,
"learning_rate": 3.256433007985803e-05,
"loss": 0.2396,
"step": 1038
},
{
"epoch": 1.244311377245509,
"grad_norm": 0.18479541595502988,
"learning_rate": 3.254214729370009e-05,
"loss": 0.2665,
"step": 1039
},
{
"epoch": 1.2455089820359282,
"grad_norm": 0.2141052663280559,
"learning_rate": 3.251996450754215e-05,
"loss": 0.2578,
"step": 1040
},
{
"epoch": 1.2467065868263474,
"grad_norm": 0.18000750970269358,
"learning_rate": 3.249778172138421e-05,
"loss": 0.2497,
"step": 1041
},
{
"epoch": 1.2479041916167666,
"grad_norm": 0.17074450188842474,
"learning_rate": 3.247559893522627e-05,
"loss": 0.2349,
"step": 1042
},
{
"epoch": 1.2491017964071855,
"grad_norm": 0.1806384412867808,
"learning_rate": 3.245341614906832e-05,
"loss": 0.2493,
"step": 1043
},
{
"epoch": 1.2502994011976047,
"grad_norm": 0.19990357407679332,
"learning_rate": 3.2431233362910384e-05,
"loss": 0.2676,
"step": 1044
},
{
"epoch": 1.251497005988024,
"grad_norm": 0.19598058668173005,
"learning_rate": 3.240905057675244e-05,
"loss": 0.2532,
"step": 1045
},
{
"epoch": 1.2526946107784431,
"grad_norm": 0.18657772040128157,
"learning_rate": 3.23868677905945e-05,
"loss": 0.2566,
"step": 1046
},
{
"epoch": 1.2538922155688623,
"grad_norm": 0.19929972372004534,
"learning_rate": 3.236468500443656e-05,
"loss": 0.2609,
"step": 1047
},
{
"epoch": 1.2550898203592815,
"grad_norm": 0.2265675960046129,
"learning_rate": 3.234250221827862e-05,
"loss": 0.2711,
"step": 1048
},
{
"epoch": 1.2562874251497007,
"grad_norm": 0.17838431623532214,
"learning_rate": 3.2320319432120677e-05,
"loss": 0.2624,
"step": 1049
},
{
"epoch": 1.2574850299401197,
"grad_norm": 0.19560980755236124,
"learning_rate": 3.2298136645962735e-05,
"loss": 0.2544,
"step": 1050
},
{
"epoch": 1.2586826347305389,
"grad_norm": 0.2068525222503613,
"learning_rate": 3.227595385980479e-05,
"loss": 0.2755,
"step": 1051
},
{
"epoch": 1.259880239520958,
"grad_norm": 0.17775319246470705,
"learning_rate": 3.225377107364685e-05,
"loss": 0.2571,
"step": 1052
},
{
"epoch": 1.2610778443113773,
"grad_norm": 0.20121965094977648,
"learning_rate": 3.223158828748891e-05,
"loss": 0.2686,
"step": 1053
},
{
"epoch": 1.2622754491017965,
"grad_norm": 0.19343802612361544,
"learning_rate": 3.220940550133097e-05,
"loss": 0.2493,
"step": 1054
},
{
"epoch": 1.2634730538922156,
"grad_norm": 0.18688281454928837,
"learning_rate": 3.218722271517303e-05,
"loss": 0.2614,
"step": 1055
},
{
"epoch": 1.2646706586826348,
"grad_norm": 0.23207864599807537,
"learning_rate": 3.2165039929015085e-05,
"loss": 0.2835,
"step": 1056
},
{
"epoch": 1.2658682634730538,
"grad_norm": 0.16926200071380107,
"learning_rate": 3.2142857142857144e-05,
"loss": 0.2517,
"step": 1057
},
{
"epoch": 1.267065868263473,
"grad_norm": 0.181513448070909,
"learning_rate": 3.21206743566992e-05,
"loss": 0.2247,
"step": 1058
},
{
"epoch": 1.2682634730538922,
"grad_norm": 0.19283546286953723,
"learning_rate": 3.209849157054126e-05,
"loss": 0.2428,
"step": 1059
},
{
"epoch": 1.2694610778443114,
"grad_norm": 0.20929483318484188,
"learning_rate": 3.207630878438332e-05,
"loss": 0.257,
"step": 1060
},
{
"epoch": 1.2706586826347306,
"grad_norm": 0.2100705018261652,
"learning_rate": 3.205412599822538e-05,
"loss": 0.2658,
"step": 1061
},
{
"epoch": 1.2718562874251498,
"grad_norm": 0.19494480022774524,
"learning_rate": 3.2031943212067436e-05,
"loss": 0.2471,
"step": 1062
},
{
"epoch": 1.273053892215569,
"grad_norm": 0.19535106918796608,
"learning_rate": 3.2009760425909494e-05,
"loss": 0.2388,
"step": 1063
},
{
"epoch": 1.274251497005988,
"grad_norm": 0.18388059761884196,
"learning_rate": 3.198757763975155e-05,
"loss": 0.251,
"step": 1064
},
{
"epoch": 1.2754491017964071,
"grad_norm": 0.19841383162956808,
"learning_rate": 3.196539485359361e-05,
"loss": 0.2593,
"step": 1065
},
{
"epoch": 1.2766467065868263,
"grad_norm": 0.2207845360913485,
"learning_rate": 3.1943212067435677e-05,
"loss": 0.2863,
"step": 1066
},
{
"epoch": 1.2778443113772455,
"grad_norm": 0.18062089975343604,
"learning_rate": 3.192102928127773e-05,
"loss": 0.2408,
"step": 1067
},
{
"epoch": 1.2790419161676647,
"grad_norm": 0.1878546760408993,
"learning_rate": 3.1898846495119787e-05,
"loss": 0.2638,
"step": 1068
},
{
"epoch": 1.280239520958084,
"grad_norm": 0.16744786360896374,
"learning_rate": 3.1876663708961845e-05,
"loss": 0.2574,
"step": 1069
},
{
"epoch": 1.281437125748503,
"grad_norm": 0.17366791016080338,
"learning_rate": 3.1854480922803903e-05,
"loss": 0.2531,
"step": 1070
},
{
"epoch": 1.282634730538922,
"grad_norm": 0.18714455733162735,
"learning_rate": 3.183229813664597e-05,
"loss": 0.2623,
"step": 1071
},
{
"epoch": 1.2838323353293413,
"grad_norm": 0.19130159343640016,
"learning_rate": 3.181011535048802e-05,
"loss": 0.2535,
"step": 1072
},
{
"epoch": 1.2850299401197605,
"grad_norm": 0.17377687556994406,
"learning_rate": 3.178793256433008e-05,
"loss": 0.2315,
"step": 1073
},
{
"epoch": 1.2862275449101797,
"grad_norm": 0.16421807246299844,
"learning_rate": 3.176574977817214e-05,
"loss": 0.2343,
"step": 1074
},
{
"epoch": 1.2874251497005988,
"grad_norm": 0.19345583747600212,
"learning_rate": 3.17435669920142e-05,
"loss": 0.2447,
"step": 1075
},
{
"epoch": 1.288622754491018,
"grad_norm": 0.1728043535081962,
"learning_rate": 3.172138420585626e-05,
"loss": 0.2552,
"step": 1076
},
{
"epoch": 1.2898203592814372,
"grad_norm": 0.1899600228549291,
"learning_rate": 3.169920141969831e-05,
"loss": 0.2507,
"step": 1077
},
{
"epoch": 1.2910179640718562,
"grad_norm": 0.16469713367775685,
"learning_rate": 3.167701863354037e-05,
"loss": 0.256,
"step": 1078
},
{
"epoch": 1.2922155688622754,
"grad_norm": 0.1834149013714549,
"learning_rate": 3.165483584738243e-05,
"loss": 0.2574,
"step": 1079
},
{
"epoch": 1.2934131736526946,
"grad_norm": 0.21539087919346359,
"learning_rate": 3.1632653061224494e-05,
"loss": 0.2647,
"step": 1080
},
{
"epoch": 1.2946107784431138,
"grad_norm": 0.17018177987488706,
"learning_rate": 3.161047027506655e-05,
"loss": 0.2561,
"step": 1081
},
{
"epoch": 1.295808383233533,
"grad_norm": 0.1752321694984895,
"learning_rate": 3.1588287488908605e-05,
"loss": 0.241,
"step": 1082
},
{
"epoch": 1.2970059880239522,
"grad_norm": 0.19840664543587053,
"learning_rate": 3.156610470275066e-05,
"loss": 0.265,
"step": 1083
},
{
"epoch": 1.2982035928143714,
"grad_norm": 0.19867670263962559,
"learning_rate": 3.154392191659273e-05,
"loss": 0.2556,
"step": 1084
},
{
"epoch": 1.2994011976047903,
"grad_norm": 0.17380066507578285,
"learning_rate": 3.152173913043479e-05,
"loss": 0.271,
"step": 1085
},
{
"epoch": 1.3005988023952095,
"grad_norm": 0.17404326041420254,
"learning_rate": 3.1499556344276845e-05,
"loss": 0.2423,
"step": 1086
},
{
"epoch": 1.3017964071856287,
"grad_norm": 0.18263591228728293,
"learning_rate": 3.14773735581189e-05,
"loss": 0.2488,
"step": 1087
},
{
"epoch": 1.302994011976048,
"grad_norm": 0.1761542803632887,
"learning_rate": 3.1455190771960955e-05,
"loss": 0.2525,
"step": 1088
},
{
"epoch": 1.304191616766467,
"grad_norm": 0.18140919015104467,
"learning_rate": 3.143300798580302e-05,
"loss": 0.2728,
"step": 1089
},
{
"epoch": 1.3053892215568863,
"grad_norm": 0.19437687947633406,
"learning_rate": 3.141082519964508e-05,
"loss": 0.2486,
"step": 1090
},
{
"epoch": 1.3065868263473055,
"grad_norm": 0.19021173021042737,
"learning_rate": 3.138864241348714e-05,
"loss": 0.2456,
"step": 1091
},
{
"epoch": 1.3077844311377245,
"grad_norm": 0.17627745319550572,
"learning_rate": 3.136645962732919e-05,
"loss": 0.2455,
"step": 1092
},
{
"epoch": 1.3089820359281437,
"grad_norm": 0.16851402670752924,
"learning_rate": 3.1344276841171254e-05,
"loss": 0.2304,
"step": 1093
},
{
"epoch": 1.3101796407185629,
"grad_norm": 0.19432948688816548,
"learning_rate": 3.132209405501331e-05,
"loss": 0.2765,
"step": 1094
},
{
"epoch": 1.311377245508982,
"grad_norm": 0.1840409937196863,
"learning_rate": 3.129991126885537e-05,
"loss": 0.2442,
"step": 1095
},
{
"epoch": 1.3125748502994012,
"grad_norm": 0.18853411573488923,
"learning_rate": 3.127772848269743e-05,
"loss": 0.2595,
"step": 1096
},
{
"epoch": 1.3137724550898204,
"grad_norm": 0.19349076151737063,
"learning_rate": 3.125554569653949e-05,
"loss": 0.2413,
"step": 1097
},
{
"epoch": 1.3149700598802396,
"grad_norm": 0.18284231220356947,
"learning_rate": 3.1233362910381546e-05,
"loss": 0.2393,
"step": 1098
},
{
"epoch": 1.3161676646706586,
"grad_norm": 0.17223164909575683,
"learning_rate": 3.1211180124223605e-05,
"loss": 0.2403,
"step": 1099
},
{
"epoch": 1.3173652694610778,
"grad_norm": 0.22228193960332604,
"learning_rate": 3.118899733806566e-05,
"loss": 0.2658,
"step": 1100
},
{
"epoch": 1.318562874251497,
"grad_norm": 0.17105881991902408,
"learning_rate": 3.116681455190772e-05,
"loss": 0.2398,
"step": 1101
},
{
"epoch": 1.3197604790419162,
"grad_norm": 0.21855877773317778,
"learning_rate": 3.114463176574978e-05,
"loss": 0.2858,
"step": 1102
},
{
"epoch": 1.3209580838323354,
"grad_norm": 0.3424748095149493,
"learning_rate": 3.112244897959184e-05,
"loss": 0.2902,
"step": 1103
},
{
"epoch": 1.3221556886227546,
"grad_norm": 0.17987380994429872,
"learning_rate": 3.11002661934339e-05,
"loss": 0.2545,
"step": 1104
},
{
"epoch": 1.3233532934131738,
"grad_norm": 0.2864710733490976,
"learning_rate": 3.1078083407275955e-05,
"loss": 0.2744,
"step": 1105
},
{
"epoch": 1.3245508982035927,
"grad_norm": 0.17575688043478133,
"learning_rate": 3.1055900621118014e-05,
"loss": 0.2879,
"step": 1106
},
{
"epoch": 1.325748502994012,
"grad_norm": 0.15587496676418228,
"learning_rate": 3.103371783496007e-05,
"loss": 0.2298,
"step": 1107
},
{
"epoch": 1.3269461077844311,
"grad_norm": 0.1887507240876081,
"learning_rate": 3.101153504880213e-05,
"loss": 0.2603,
"step": 1108
},
{
"epoch": 1.3281437125748503,
"grad_norm": 0.19524420455802893,
"learning_rate": 3.098935226264419e-05,
"loss": 0.2501,
"step": 1109
},
{
"epoch": 1.3293413173652695,
"grad_norm": 0.1735984733077188,
"learning_rate": 3.096716947648625e-05,
"loss": 0.2602,
"step": 1110
},
{
"epoch": 1.3305389221556887,
"grad_norm": 0.17952879931299068,
"learning_rate": 3.094498669032831e-05,
"loss": 0.2605,
"step": 1111
},
{
"epoch": 1.331736526946108,
"grad_norm": 0.19544926095026602,
"learning_rate": 3.0922803904170364e-05,
"loss": 0.2599,
"step": 1112
},
{
"epoch": 1.3329341317365269,
"grad_norm": 0.17710738101903223,
"learning_rate": 3.090062111801242e-05,
"loss": 0.2389,
"step": 1113
},
{
"epoch": 1.334131736526946,
"grad_norm": 0.19286707998290997,
"learning_rate": 3.087843833185448e-05,
"loss": 0.2541,
"step": 1114
},
{
"epoch": 1.3353293413173652,
"grad_norm": 0.19025203366255014,
"learning_rate": 3.085625554569654e-05,
"loss": 0.2824,
"step": 1115
},
{
"epoch": 1.3365269461077844,
"grad_norm": 0.18470830108172046,
"learning_rate": 3.0834072759538605e-05,
"loss": 0.2447,
"step": 1116
},
{
"epoch": 1.3377245508982036,
"grad_norm": 0.16605317248352983,
"learning_rate": 3.0811889973380656e-05,
"loss": 0.2454,
"step": 1117
},
{
"epoch": 1.3389221556886228,
"grad_norm": 0.2017182434026456,
"learning_rate": 3.0789707187222715e-05,
"loss": 0.2936,
"step": 1118
},
{
"epoch": 1.340119760479042,
"grad_norm": 0.20033608797178815,
"learning_rate": 3.076752440106477e-05,
"loss": 0.2868,
"step": 1119
},
{
"epoch": 1.341317365269461,
"grad_norm": 0.17096783321801365,
"learning_rate": 3.074534161490684e-05,
"loss": 0.2562,
"step": 1120
},
{
"epoch": 1.3425149700598802,
"grad_norm": 0.17474022675656092,
"learning_rate": 3.07231588287489e-05,
"loss": 0.2738,
"step": 1121
},
{
"epoch": 1.3437125748502994,
"grad_norm": 0.19446158072221498,
"learning_rate": 3.070097604259095e-05,
"loss": 0.259,
"step": 1122
},
{
"epoch": 1.3449101796407186,
"grad_norm": 0.205732442662704,
"learning_rate": 3.067879325643301e-05,
"loss": 0.2568,
"step": 1123
},
{
"epoch": 1.3461077844311378,
"grad_norm": 0.17455821473723274,
"learning_rate": 3.0656610470275065e-05,
"loss": 0.2454,
"step": 1124
},
{
"epoch": 1.347305389221557,
"grad_norm": 0.19535321609244832,
"learning_rate": 3.063442768411713e-05,
"loss": 0.2517,
"step": 1125
},
{
"epoch": 1.3485029940119762,
"grad_norm": 0.2189415758891102,
"learning_rate": 3.061224489795919e-05,
"loss": 0.2735,
"step": 1126
},
{
"epoch": 1.3497005988023951,
"grad_norm": 0.17620582946230287,
"learning_rate": 3.059006211180124e-05,
"loss": 0.2585,
"step": 1127
},
{
"epoch": 1.3508982035928143,
"grad_norm": 0.1967693705194727,
"learning_rate": 3.05678793256433e-05,
"loss": 0.2616,
"step": 1128
},
{
"epoch": 1.3520958083832335,
"grad_norm": 0.20488249208223722,
"learning_rate": 3.0545696539485364e-05,
"loss": 0.2415,
"step": 1129
},
{
"epoch": 1.3532934131736527,
"grad_norm": 0.19248080967475356,
"learning_rate": 3.052351375332742e-05,
"loss": 0.2329,
"step": 1130
},
{
"epoch": 1.354491017964072,
"grad_norm": 0.18579549933239226,
"learning_rate": 3.0501330967169478e-05,
"loss": 0.2704,
"step": 1131
},
{
"epoch": 1.355688622754491,
"grad_norm": 0.19886277778946024,
"learning_rate": 3.0479148181011536e-05,
"loss": 0.2487,
"step": 1132
},
{
"epoch": 1.3568862275449103,
"grad_norm": 0.19476150195834302,
"learning_rate": 3.045696539485359e-05,
"loss": 0.2641,
"step": 1133
},
{
"epoch": 1.3580838323353293,
"grad_norm": 0.2062893151965033,
"learning_rate": 3.0434782608695656e-05,
"loss": 0.2535,
"step": 1134
},
{
"epoch": 1.3592814371257484,
"grad_norm": 0.1807324480197101,
"learning_rate": 3.041259982253771e-05,
"loss": 0.234,
"step": 1135
},
{
"epoch": 1.3604790419161676,
"grad_norm": 0.19961832983538416,
"learning_rate": 3.039041703637977e-05,
"loss": 0.262,
"step": 1136
},
{
"epoch": 1.3616766467065868,
"grad_norm": 0.21934385212564358,
"learning_rate": 3.0368234250221828e-05,
"loss": 0.2795,
"step": 1137
},
{
"epoch": 1.362874251497006,
"grad_norm": 0.17189124569394595,
"learning_rate": 3.034605146406389e-05,
"loss": 0.2487,
"step": 1138
},
{
"epoch": 1.3640718562874252,
"grad_norm": 0.2090333949413717,
"learning_rate": 3.032386867790595e-05,
"loss": 0.2757,
"step": 1139
},
{
"epoch": 1.3652694610778444,
"grad_norm": 0.18848619903648509,
"learning_rate": 3.0301685891748003e-05,
"loss": 0.2581,
"step": 1140
},
{
"epoch": 1.3664670658682634,
"grad_norm": 0.17774655444455395,
"learning_rate": 3.0279503105590062e-05,
"loss": 0.2359,
"step": 1141
},
{
"epoch": 1.3676646706586826,
"grad_norm": 0.22281968590322365,
"learning_rate": 3.025732031943212e-05,
"loss": 0.2692,
"step": 1142
},
{
"epoch": 1.3688622754491018,
"grad_norm": 0.19393816423235338,
"learning_rate": 3.0235137533274182e-05,
"loss": 0.25,
"step": 1143
},
{
"epoch": 1.370059880239521,
"grad_norm": 0.1832860920719974,
"learning_rate": 3.021295474711624e-05,
"loss": 0.2508,
"step": 1144
},
{
"epoch": 1.3712574850299402,
"grad_norm": 0.20619181948125376,
"learning_rate": 3.01907719609583e-05,
"loss": 0.2415,
"step": 1145
},
{
"epoch": 1.3724550898203594,
"grad_norm": 0.19684853830745586,
"learning_rate": 3.0168589174800354e-05,
"loss": 0.2634,
"step": 1146
},
{
"epoch": 1.3736526946107785,
"grad_norm": 0.1895027940150831,
"learning_rate": 3.0146406388642416e-05,
"loss": 0.2576,
"step": 1147
},
{
"epoch": 1.3748502994011975,
"grad_norm": 0.32122671797986485,
"learning_rate": 3.0124223602484474e-05,
"loss": 0.2818,
"step": 1148
},
{
"epoch": 1.3760479041916167,
"grad_norm": 0.20422805547184247,
"learning_rate": 3.0102040816326533e-05,
"loss": 0.2544,
"step": 1149
},
{
"epoch": 1.377245508982036,
"grad_norm": 0.2027458193211812,
"learning_rate": 3.007985803016859e-05,
"loss": 0.2662,
"step": 1150
},
{
"epoch": 1.378443113772455,
"grad_norm": 0.20978077097525422,
"learning_rate": 3.0057675244010646e-05,
"loss": 0.2631,
"step": 1151
},
{
"epoch": 1.3796407185628743,
"grad_norm": 0.18368515422031664,
"learning_rate": 3.003549245785271e-05,
"loss": 0.2672,
"step": 1152
},
{
"epoch": 1.3808383233532935,
"grad_norm": 0.18703183002805313,
"learning_rate": 3.0013309671694766e-05,
"loss": 0.2657,
"step": 1153
},
{
"epoch": 1.3820359281437127,
"grad_norm": 0.1920349475578725,
"learning_rate": 2.9991126885536825e-05,
"loss": 0.2509,
"step": 1154
},
{
"epoch": 1.3832335329341316,
"grad_norm": 0.17425196712365873,
"learning_rate": 2.9968944099378883e-05,
"loss": 0.2443,
"step": 1155
},
{
"epoch": 1.3844311377245508,
"grad_norm": 0.1852871966433273,
"learning_rate": 2.9946761313220945e-05,
"loss": 0.2448,
"step": 1156
},
{
"epoch": 1.38562874251497,
"grad_norm": 0.21871262584048176,
"learning_rate": 2.9924578527063003e-05,
"loss": 0.2676,
"step": 1157
},
{
"epoch": 1.3868263473053892,
"grad_norm": 0.19982016386904464,
"learning_rate": 2.990239574090506e-05,
"loss": 0.2617,
"step": 1158
},
{
"epoch": 1.3880239520958084,
"grad_norm": 0.18363390205398158,
"learning_rate": 2.9880212954747117e-05,
"loss": 0.2668,
"step": 1159
},
{
"epoch": 1.3892215568862276,
"grad_norm": 0.19785429429250417,
"learning_rate": 2.9858030168589175e-05,
"loss": 0.268,
"step": 1160
},
{
"epoch": 1.3904191616766468,
"grad_norm": 0.17354512561319907,
"learning_rate": 2.9835847382431237e-05,
"loss": 0.2616,
"step": 1161
},
{
"epoch": 1.3916167664670658,
"grad_norm": 0.17874631037739397,
"learning_rate": 2.9813664596273296e-05,
"loss": 0.2554,
"step": 1162
},
{
"epoch": 1.392814371257485,
"grad_norm": 0.17522260504783405,
"learning_rate": 2.979148181011535e-05,
"loss": 0.2532,
"step": 1163
},
{
"epoch": 1.3940119760479042,
"grad_norm": 0.18821544194247017,
"learning_rate": 2.976929902395741e-05,
"loss": 0.2715,
"step": 1164
},
{
"epoch": 1.3952095808383234,
"grad_norm": 0.18202130058752033,
"learning_rate": 2.974711623779947e-05,
"loss": 0.2494,
"step": 1165
},
{
"epoch": 1.3964071856287426,
"grad_norm": 0.20973255835360965,
"learning_rate": 2.972493345164153e-05,
"loss": 0.2535,
"step": 1166
},
{
"epoch": 1.3976047904191617,
"grad_norm": 0.15831302391024257,
"learning_rate": 2.9702750665483588e-05,
"loss": 0.2388,
"step": 1167
},
{
"epoch": 1.398802395209581,
"grad_norm": 0.22207141851443948,
"learning_rate": 2.9680567879325643e-05,
"loss": 0.274,
"step": 1168
},
{
"epoch": 1.4,
"grad_norm": 0.21140832062182355,
"learning_rate": 2.96583850931677e-05,
"loss": 0.2612,
"step": 1169
},
{
"epoch": 1.401197604790419,
"grad_norm": 0.1753297213458931,
"learning_rate": 2.9636202307009763e-05,
"loss": 0.2412,
"step": 1170
},
{
"epoch": 1.4023952095808383,
"grad_norm": 0.1840953153259694,
"learning_rate": 2.961401952085182e-05,
"loss": 0.2495,
"step": 1171
},
{
"epoch": 1.4035928143712575,
"grad_norm": 0.19802320677321048,
"learning_rate": 2.959183673469388e-05,
"loss": 0.2473,
"step": 1172
},
{
"epoch": 1.4047904191616767,
"grad_norm": 0.19785327960018692,
"learning_rate": 2.9569653948535935e-05,
"loss": 0.2721,
"step": 1173
},
{
"epoch": 1.4059880239520959,
"grad_norm": 0.17738464871077947,
"learning_rate": 2.9547471162378e-05,
"loss": 0.266,
"step": 1174
},
{
"epoch": 1.407185628742515,
"grad_norm": 0.22008089637503137,
"learning_rate": 2.9525288376220055e-05,
"loss": 0.236,
"step": 1175
},
{
"epoch": 1.408383233532934,
"grad_norm": 0.19491940039887354,
"learning_rate": 2.9503105590062114e-05,
"loss": 0.2601,
"step": 1176
},
{
"epoch": 1.4095808383233532,
"grad_norm": 0.18700384170650927,
"learning_rate": 2.9480922803904172e-05,
"loss": 0.2666,
"step": 1177
},
{
"epoch": 1.4107784431137724,
"grad_norm": 0.201022841356387,
"learning_rate": 2.9458740017746227e-05,
"loss": 0.2666,
"step": 1178
},
{
"epoch": 1.4119760479041916,
"grad_norm": 0.1867757314614112,
"learning_rate": 2.9436557231588292e-05,
"loss": 0.2485,
"step": 1179
},
{
"epoch": 1.4131736526946108,
"grad_norm": 0.20830318768243875,
"learning_rate": 2.9414374445430347e-05,
"loss": 0.2792,
"step": 1180
},
{
"epoch": 1.41437125748503,
"grad_norm": 0.18448184720489394,
"learning_rate": 2.9392191659272406e-05,
"loss": 0.2551,
"step": 1181
},
{
"epoch": 1.4155688622754492,
"grad_norm": 0.18457854852341515,
"learning_rate": 2.9370008873114464e-05,
"loss": 0.2677,
"step": 1182
},
{
"epoch": 1.4167664670658682,
"grad_norm": 0.1959973319645692,
"learning_rate": 2.9347826086956526e-05,
"loss": 0.2556,
"step": 1183
},
{
"epoch": 1.4179640718562874,
"grad_norm": 0.18073720164507096,
"learning_rate": 2.9325643300798584e-05,
"loss": 0.2448,
"step": 1184
},
{
"epoch": 1.4191616766467066,
"grad_norm": 0.20339159326143869,
"learning_rate": 2.930346051464064e-05,
"loss": 0.2558,
"step": 1185
},
{
"epoch": 1.4203592814371258,
"grad_norm": 0.20342986331974386,
"learning_rate": 2.9281277728482698e-05,
"loss": 0.2483,
"step": 1186
},
{
"epoch": 1.421556886227545,
"grad_norm": 0.17843546888738115,
"learning_rate": 2.9259094942324756e-05,
"loss": 0.2371,
"step": 1187
},
{
"epoch": 1.4227544910179641,
"grad_norm": 0.1961371924740406,
"learning_rate": 2.9236912156166818e-05,
"loss": 0.267,
"step": 1188
},
{
"epoch": 1.4239520958083833,
"grad_norm": 0.1793440076637141,
"learning_rate": 2.9214729370008877e-05,
"loss": 0.2698,
"step": 1189
},
{
"epoch": 1.4251497005988023,
"grad_norm": 0.17206432442396702,
"learning_rate": 2.919254658385093e-05,
"loss": 0.2511,
"step": 1190
},
{
"epoch": 1.4263473053892215,
"grad_norm": 0.1731028138111655,
"learning_rate": 2.917036379769299e-05,
"loss": 0.2526,
"step": 1191
},
{
"epoch": 1.4275449101796407,
"grad_norm": 0.19074777013809976,
"learning_rate": 2.9148181011535052e-05,
"loss": 0.2584,
"step": 1192
},
{
"epoch": 1.4287425149700599,
"grad_norm": 0.17020267100637534,
"learning_rate": 2.912599822537711e-05,
"loss": 0.2347,
"step": 1193
},
{
"epoch": 1.429940119760479,
"grad_norm": 0.18232945806038206,
"learning_rate": 2.910381543921917e-05,
"loss": 0.2527,
"step": 1194
},
{
"epoch": 1.4311377245508983,
"grad_norm": 0.1784374749022422,
"learning_rate": 2.9081632653061224e-05,
"loss": 0.2369,
"step": 1195
},
{
"epoch": 1.4323353293413175,
"grad_norm": 0.19834671245066368,
"learning_rate": 2.9059449866903282e-05,
"loss": 0.2636,
"step": 1196
},
{
"epoch": 1.4335329341317364,
"grad_norm": 0.17931417581073253,
"learning_rate": 2.9037267080745344e-05,
"loss": 0.2746,
"step": 1197
},
{
"epoch": 1.4347305389221556,
"grad_norm": 0.17389782851637392,
"learning_rate": 2.9015084294587402e-05,
"loss": 0.2594,
"step": 1198
},
{
"epoch": 1.4359281437125748,
"grad_norm": 0.19533408706292207,
"learning_rate": 2.899290150842946e-05,
"loss": 0.2509,
"step": 1199
},
{
"epoch": 1.437125748502994,
"grad_norm": 0.1787370108044661,
"learning_rate": 2.8970718722271516e-05,
"loss": 0.2618,
"step": 1200
},
{
"epoch": 1.4383233532934132,
"grad_norm": 0.16444228226638063,
"learning_rate": 2.894853593611358e-05,
"loss": 0.2548,
"step": 1201
},
{
"epoch": 1.4395209580838324,
"grad_norm": 0.18605382510950771,
"learning_rate": 2.8926353149955636e-05,
"loss": 0.2609,
"step": 1202
},
{
"epoch": 1.4407185628742516,
"grad_norm": 0.18056984168269574,
"learning_rate": 2.8904170363797694e-05,
"loss": 0.2611,
"step": 1203
},
{
"epoch": 1.4419161676646706,
"grad_norm": 0.191732017551859,
"learning_rate": 2.8881987577639753e-05,
"loss": 0.2496,
"step": 1204
},
{
"epoch": 1.4431137724550898,
"grad_norm": 0.16272633151907037,
"learning_rate": 2.8859804791481808e-05,
"loss": 0.2393,
"step": 1205
},
{
"epoch": 1.444311377245509,
"grad_norm": 0.16974981279296245,
"learning_rate": 2.8837622005323873e-05,
"loss": 0.2655,
"step": 1206
},
{
"epoch": 1.4455089820359281,
"grad_norm": 0.19704460669120746,
"learning_rate": 2.8815439219165928e-05,
"loss": 0.2587,
"step": 1207
},
{
"epoch": 1.4467065868263473,
"grad_norm": 0.18970824307961956,
"learning_rate": 2.8793256433007987e-05,
"loss": 0.2461,
"step": 1208
},
{
"epoch": 1.4479041916167665,
"grad_norm": 0.1792992091799711,
"learning_rate": 2.8771073646850045e-05,
"loss": 0.2542,
"step": 1209
},
{
"epoch": 1.4491017964071857,
"grad_norm": 0.16664679886136213,
"learning_rate": 2.8748890860692107e-05,
"loss": 0.2343,
"step": 1210
},
{
"epoch": 1.4502994011976047,
"grad_norm": 0.19089472239919922,
"learning_rate": 2.8726708074534165e-05,
"loss": 0.2576,
"step": 1211
},
{
"epoch": 1.451497005988024,
"grad_norm": 0.18924911805495045,
"learning_rate": 2.870452528837622e-05,
"loss": 0.2471,
"step": 1212
},
{
"epoch": 1.452694610778443,
"grad_norm": 0.161743200583376,
"learning_rate": 2.868234250221828e-05,
"loss": 0.2632,
"step": 1213
},
{
"epoch": 1.4538922155688623,
"grad_norm": 0.18709222263780179,
"learning_rate": 2.8660159716060337e-05,
"loss": 0.2647,
"step": 1214
},
{
"epoch": 1.4550898203592815,
"grad_norm": 0.1650217718475197,
"learning_rate": 2.86379769299024e-05,
"loss": 0.2464,
"step": 1215
},
{
"epoch": 1.4562874251497007,
"grad_norm": 0.17161257717244632,
"learning_rate": 2.8615794143744457e-05,
"loss": 0.2446,
"step": 1216
},
{
"epoch": 1.4574850299401199,
"grad_norm": 0.18673045134937352,
"learning_rate": 2.8593611357586512e-05,
"loss": 0.2405,
"step": 1217
},
{
"epoch": 1.4586826347305388,
"grad_norm": 0.18255808935103623,
"learning_rate": 2.857142857142857e-05,
"loss": 0.2557,
"step": 1218
},
{
"epoch": 1.459880239520958,
"grad_norm": 0.19069090914869338,
"learning_rate": 2.8549245785270633e-05,
"loss": 0.2656,
"step": 1219
},
{
"epoch": 1.4610778443113772,
"grad_norm": 0.2115001289403074,
"learning_rate": 2.852706299911269e-05,
"loss": 0.28,
"step": 1220
},
{
"epoch": 1.4622754491017964,
"grad_norm": 0.21813906939160227,
"learning_rate": 2.850488021295475e-05,
"loss": 0.2553,
"step": 1221
},
{
"epoch": 1.4634730538922156,
"grad_norm": 0.1898286294898088,
"learning_rate": 2.8482697426796805e-05,
"loss": 0.2659,
"step": 1222
},
{
"epoch": 1.4646706586826348,
"grad_norm": 0.2169242451933379,
"learning_rate": 2.8460514640638863e-05,
"loss": 0.2653,
"step": 1223
},
{
"epoch": 1.465868263473054,
"grad_norm": 0.20188001902390137,
"learning_rate": 2.8438331854480925e-05,
"loss": 0.2622,
"step": 1224
},
{
"epoch": 1.467065868263473,
"grad_norm": 0.18686166874056817,
"learning_rate": 2.8416149068322983e-05,
"loss": 0.2531,
"step": 1225
},
{
"epoch": 1.4682634730538922,
"grad_norm": 0.19190430690017118,
"learning_rate": 2.839396628216504e-05,
"loss": 0.2502,
"step": 1226
},
{
"epoch": 1.4694610778443113,
"grad_norm": 0.1737042754870274,
"learning_rate": 2.8371783496007097e-05,
"loss": 0.2477,
"step": 1227
},
{
"epoch": 1.4706586826347305,
"grad_norm": 0.1819280188422528,
"learning_rate": 2.8349600709849162e-05,
"loss": 0.2471,
"step": 1228
},
{
"epoch": 1.4718562874251497,
"grad_norm": 0.19158855135802888,
"learning_rate": 2.8327417923691217e-05,
"loss": 0.2506,
"step": 1229
},
{
"epoch": 1.473053892215569,
"grad_norm": 0.17503135697169486,
"learning_rate": 2.8305235137533275e-05,
"loss": 0.2442,
"step": 1230
},
{
"epoch": 1.4742514970059881,
"grad_norm": 0.20792591652989392,
"learning_rate": 2.8283052351375334e-05,
"loss": 0.2675,
"step": 1231
},
{
"epoch": 1.475449101796407,
"grad_norm": 0.1666976673303719,
"learning_rate": 2.826086956521739e-05,
"loss": 0.25,
"step": 1232
},
{
"epoch": 1.4766467065868263,
"grad_norm": 0.1870356170879395,
"learning_rate": 2.8238686779059454e-05,
"loss": 0.2544,
"step": 1233
},
{
"epoch": 1.4778443113772455,
"grad_norm": 0.18143190287548,
"learning_rate": 2.821650399290151e-05,
"loss": 0.2488,
"step": 1234
},
{
"epoch": 1.4790419161676647,
"grad_norm": 0.19340964636128705,
"learning_rate": 2.8194321206743567e-05,
"loss": 0.25,
"step": 1235
},
{
"epoch": 1.4802395209580839,
"grad_norm": 0.17771619023845203,
"learning_rate": 2.8172138420585626e-05,
"loss": 0.2447,
"step": 1236
},
{
"epoch": 1.481437125748503,
"grad_norm": 0.2164413672069501,
"learning_rate": 2.8149955634427688e-05,
"loss": 0.2677,
"step": 1237
},
{
"epoch": 1.4826347305389223,
"grad_norm": 0.21999705651359125,
"learning_rate": 2.8127772848269746e-05,
"loss": 0.2454,
"step": 1238
},
{
"epoch": 1.4838323353293412,
"grad_norm": 0.20115588033539697,
"learning_rate": 2.8105590062111805e-05,
"loss": 0.2532,
"step": 1239
},
{
"epoch": 1.4850299401197604,
"grad_norm": 0.199371747238992,
"learning_rate": 2.808340727595386e-05,
"loss": 0.2752,
"step": 1240
},
{
"epoch": 1.4862275449101796,
"grad_norm": 0.22854939922117745,
"learning_rate": 2.8061224489795918e-05,
"loss": 0.2533,
"step": 1241
},
{
"epoch": 1.4874251497005988,
"grad_norm": 0.19557283673668407,
"learning_rate": 2.803904170363798e-05,
"loss": 0.2407,
"step": 1242
},
{
"epoch": 1.488622754491018,
"grad_norm": 0.18709818891947222,
"learning_rate": 2.8016858917480038e-05,
"loss": 0.241,
"step": 1243
},
{
"epoch": 1.4898203592814372,
"grad_norm": 0.17858376834735615,
"learning_rate": 2.7994676131322097e-05,
"loss": 0.2448,
"step": 1244
},
{
"epoch": 1.4910179640718564,
"grad_norm": 0.22028324401069385,
"learning_rate": 2.7972493345164152e-05,
"loss": 0.2506,
"step": 1245
},
{
"epoch": 1.4922155688622754,
"grad_norm": 0.15998203216331003,
"learning_rate": 2.795031055900621e-05,
"loss": 0.2293,
"step": 1246
},
{
"epoch": 1.4934131736526945,
"grad_norm": 0.18985932150333243,
"learning_rate": 2.7928127772848272e-05,
"loss": 0.2804,
"step": 1247
},
{
"epoch": 1.4946107784431137,
"grad_norm": 0.19341198011455335,
"learning_rate": 2.790594498669033e-05,
"loss": 0.2406,
"step": 1248
},
{
"epoch": 1.495808383233533,
"grad_norm": 0.19282975262455032,
"learning_rate": 2.788376220053239e-05,
"loss": 0.2804,
"step": 1249
},
{
"epoch": 1.4970059880239521,
"grad_norm": 0.18738557998689592,
"learning_rate": 2.7861579414374444e-05,
"loss": 0.2696,
"step": 1250
},
{
"epoch": 1.4982035928143713,
"grad_norm": 0.18072666030338957,
"learning_rate": 2.783939662821651e-05,
"loss": 0.2378,
"step": 1251
},
{
"epoch": 1.4994011976047905,
"grad_norm": 0.21519950538135185,
"learning_rate": 2.7817213842058564e-05,
"loss": 0.2604,
"step": 1252
},
{
"epoch": 1.5005988023952095,
"grad_norm": 0.18706309905711418,
"learning_rate": 2.7795031055900623e-05,
"loss": 0.2658,
"step": 1253
},
{
"epoch": 1.501796407185629,
"grad_norm": 0.20839242747122358,
"learning_rate": 2.777284826974268e-05,
"loss": 0.2458,
"step": 1254
},
{
"epoch": 1.5029940119760479,
"grad_norm": 0.1967480393483631,
"learning_rate": 2.7750665483584736e-05,
"loss": 0.2619,
"step": 1255
},
{
"epoch": 1.504191616766467,
"grad_norm": 0.18918911064057692,
"learning_rate": 2.77284826974268e-05,
"loss": 0.2517,
"step": 1256
},
{
"epoch": 1.5053892215568863,
"grad_norm": 0.196214216839238,
"learning_rate": 2.7706299911268856e-05,
"loss": 0.2589,
"step": 1257
},
{
"epoch": 1.5065868263473052,
"grad_norm": 0.19732276126373444,
"learning_rate": 2.7684117125110915e-05,
"loss": 0.253,
"step": 1258
},
{
"epoch": 1.5077844311377246,
"grad_norm": 0.19997131067506901,
"learning_rate": 2.7661934338952973e-05,
"loss": 0.2745,
"step": 1259
},
{
"epoch": 1.5089820359281436,
"grad_norm": 0.19002406769647082,
"learning_rate": 2.7639751552795035e-05,
"loss": 0.2606,
"step": 1260
},
{
"epoch": 1.510179640718563,
"grad_norm": 0.19529853199516814,
"learning_rate": 2.7617568766637093e-05,
"loss": 0.2554,
"step": 1261
},
{
"epoch": 1.511377245508982,
"grad_norm": 0.17385246839572444,
"learning_rate": 2.759538598047915e-05,
"loss": 0.2364,
"step": 1262
},
{
"epoch": 1.5125748502994012,
"grad_norm": 0.1832814558271417,
"learning_rate": 2.7573203194321207e-05,
"loss": 0.2428,
"step": 1263
},
{
"epoch": 1.5137724550898204,
"grad_norm": 0.17934244483191789,
"learning_rate": 2.7551020408163265e-05,
"loss": 0.2435,
"step": 1264
},
{
"epoch": 1.5149700598802394,
"grad_norm": 0.19021177282877538,
"learning_rate": 2.7528837622005327e-05,
"loss": 0.2561,
"step": 1265
},
{
"epoch": 1.5161676646706588,
"grad_norm": 0.18013904368366543,
"learning_rate": 2.7506654835847385e-05,
"loss": 0.2819,
"step": 1266
},
{
"epoch": 1.5173652694610777,
"grad_norm": 0.22009036099619075,
"learning_rate": 2.748447204968944e-05,
"loss": 0.2697,
"step": 1267
},
{
"epoch": 1.5185628742514972,
"grad_norm": 0.18703080570404185,
"learning_rate": 2.74622892635315e-05,
"loss": 0.2511,
"step": 1268
},
{
"epoch": 1.5197604790419161,
"grad_norm": 0.16902585307093912,
"learning_rate": 2.744010647737356e-05,
"loss": 0.2342,
"step": 1269
},
{
"epoch": 1.5209580838323353,
"grad_norm": 0.19315825774482973,
"learning_rate": 2.741792369121562e-05,
"loss": 0.2462,
"step": 1270
},
{
"epoch": 1.5221556886227545,
"grad_norm": 0.18179134540208697,
"learning_rate": 2.7395740905057678e-05,
"loss": 0.2393,
"step": 1271
},
{
"epoch": 1.5233532934131735,
"grad_norm": 0.16755893790082527,
"learning_rate": 2.7373558118899733e-05,
"loss": 0.2449,
"step": 1272
},
{
"epoch": 1.524550898203593,
"grad_norm": 0.17433099812804784,
"learning_rate": 2.735137533274179e-05,
"loss": 0.259,
"step": 1273
},
{
"epoch": 1.5257485029940119,
"grad_norm": 0.19650084920763655,
"learning_rate": 2.7329192546583853e-05,
"loss": 0.2653,
"step": 1274
},
{
"epoch": 1.5269461077844313,
"grad_norm": 0.18781820102319616,
"learning_rate": 2.730700976042591e-05,
"loss": 0.2464,
"step": 1275
},
{
"epoch": 1.5281437125748503,
"grad_norm": 0.19377542053553504,
"learning_rate": 2.728482697426797e-05,
"loss": 0.2599,
"step": 1276
},
{
"epoch": 1.5293413173652695,
"grad_norm": 0.1988304743761568,
"learning_rate": 2.7262644188110025e-05,
"loss": 0.2704,
"step": 1277
},
{
"epoch": 1.5305389221556887,
"grad_norm": 0.18509143125082422,
"learning_rate": 2.724046140195209e-05,
"loss": 0.2655,
"step": 1278
},
{
"epoch": 1.5317365269461076,
"grad_norm": 0.18007173949226743,
"learning_rate": 2.7218278615794145e-05,
"loss": 0.2601,
"step": 1279
},
{
"epoch": 1.532934131736527,
"grad_norm": 0.1959131648783704,
"learning_rate": 2.7196095829636203e-05,
"loss": 0.2319,
"step": 1280
},
{
"epoch": 1.534131736526946,
"grad_norm": 0.1809114030578386,
"learning_rate": 2.7173913043478262e-05,
"loss": 0.2535,
"step": 1281
},
{
"epoch": 1.5353293413173654,
"grad_norm": 0.18293213104778552,
"learning_rate": 2.7151730257320317e-05,
"loss": 0.2533,
"step": 1282
},
{
"epoch": 1.5365269461077844,
"grad_norm": 0.19870960425826534,
"learning_rate": 2.7129547471162382e-05,
"loss": 0.2571,
"step": 1283
},
{
"epoch": 1.5377245508982036,
"grad_norm": 0.17704505798611658,
"learning_rate": 2.7107364685004437e-05,
"loss": 0.2534,
"step": 1284
},
{
"epoch": 1.5389221556886228,
"grad_norm": 0.16820374916378336,
"learning_rate": 2.7085181898846496e-05,
"loss": 0.2409,
"step": 1285
},
{
"epoch": 1.5401197604790418,
"grad_norm": 0.1832993027184732,
"learning_rate": 2.7062999112688554e-05,
"loss": 0.2631,
"step": 1286
},
{
"epoch": 1.5413173652694612,
"grad_norm": 0.19369265090297016,
"learning_rate": 2.7040816326530616e-05,
"loss": 0.256,
"step": 1287
},
{
"epoch": 1.5425149700598801,
"grad_norm": 0.16652602827508642,
"learning_rate": 2.7018633540372674e-05,
"loss": 0.2424,
"step": 1288
},
{
"epoch": 1.5437125748502996,
"grad_norm": 0.17660293524763454,
"learning_rate": 2.699645075421473e-05,
"loss": 0.2485,
"step": 1289
},
{
"epoch": 1.5449101796407185,
"grad_norm": 0.1938171613552156,
"learning_rate": 2.6974267968056788e-05,
"loss": 0.2565,
"step": 1290
},
{
"epoch": 1.5461077844311377,
"grad_norm": 0.18745163718037788,
"learning_rate": 2.6952085181898846e-05,
"loss": 0.2592,
"step": 1291
},
{
"epoch": 1.547305389221557,
"grad_norm": 0.1842805266247045,
"learning_rate": 2.6929902395740908e-05,
"loss": 0.2494,
"step": 1292
},
{
"epoch": 1.5485029940119759,
"grad_norm": 0.1740362966633568,
"learning_rate": 2.6907719609582966e-05,
"loss": 0.241,
"step": 1293
},
{
"epoch": 1.5497005988023953,
"grad_norm": 0.17326402412125694,
"learning_rate": 2.688553682342502e-05,
"loss": 0.2461,
"step": 1294
},
{
"epoch": 1.5508982035928143,
"grad_norm": 0.16544058787254162,
"learning_rate": 2.686335403726708e-05,
"loss": 0.2439,
"step": 1295
},
{
"epoch": 1.5520958083832337,
"grad_norm": 0.18076379946748655,
"learning_rate": 2.684117125110914e-05,
"loss": 0.2605,
"step": 1296
},
{
"epoch": 1.5532934131736527,
"grad_norm": 0.1676727292067849,
"learning_rate": 2.68189884649512e-05,
"loss": 0.2442,
"step": 1297
},
{
"epoch": 1.5544910179640719,
"grad_norm": 0.1755234559859168,
"learning_rate": 2.679680567879326e-05,
"loss": 0.2522,
"step": 1298
},
{
"epoch": 1.555688622754491,
"grad_norm": 0.17492891575730593,
"learning_rate": 2.6774622892635314e-05,
"loss": 0.261,
"step": 1299
},
{
"epoch": 1.55688622754491,
"grad_norm": 0.17964511611735173,
"learning_rate": 2.6752440106477372e-05,
"loss": 0.2595,
"step": 1300
},
{
"epoch": 1.5580838323353294,
"grad_norm": 0.19227773992617522,
"learning_rate": 2.6730257320319434e-05,
"loss": 0.2531,
"step": 1301
},
{
"epoch": 1.5592814371257484,
"grad_norm": 0.16767362570537972,
"learning_rate": 2.6708074534161492e-05,
"loss": 0.2455,
"step": 1302
},
{
"epoch": 1.5604790419161678,
"grad_norm": 0.19246869797809132,
"learning_rate": 2.668589174800355e-05,
"loss": 0.268,
"step": 1303
},
{
"epoch": 1.5616766467065868,
"grad_norm": 0.18495268127541142,
"learning_rate": 2.6663708961845606e-05,
"loss": 0.2576,
"step": 1304
},
{
"epoch": 1.562874251497006,
"grad_norm": 0.19461970442131696,
"learning_rate": 2.664152617568767e-05,
"loss": 0.2547,
"step": 1305
},
{
"epoch": 1.5640718562874252,
"grad_norm": 0.18246195422128939,
"learning_rate": 2.6619343389529726e-05,
"loss": 0.2586,
"step": 1306
},
{
"epoch": 1.5652694610778441,
"grad_norm": 0.23670267556915722,
"learning_rate": 2.6597160603371784e-05,
"loss": 0.2817,
"step": 1307
},
{
"epoch": 1.5664670658682636,
"grad_norm": 0.20767937208612458,
"learning_rate": 2.6574977817213843e-05,
"loss": 0.2614,
"step": 1308
},
{
"epoch": 1.5676646706586825,
"grad_norm": 0.19395887396615288,
"learning_rate": 2.6552795031055898e-05,
"loss": 0.2756,
"step": 1309
},
{
"epoch": 1.568862275449102,
"grad_norm": 0.18906223152614898,
"learning_rate": 2.6530612244897963e-05,
"loss": 0.2488,
"step": 1310
},
{
"epoch": 1.570059880239521,
"grad_norm": 0.20413687486862803,
"learning_rate": 2.6508429458740018e-05,
"loss": 0.2528,
"step": 1311
},
{
"epoch": 1.5712574850299401,
"grad_norm": 0.19522046274220883,
"learning_rate": 2.6486246672582076e-05,
"loss": 0.2646,
"step": 1312
},
{
"epoch": 1.5724550898203593,
"grad_norm": 0.18294283489492033,
"learning_rate": 2.6464063886424135e-05,
"loss": 0.2351,
"step": 1313
},
{
"epoch": 1.5736526946107783,
"grad_norm": 0.19232037100070182,
"learning_rate": 2.6441881100266197e-05,
"loss": 0.2533,
"step": 1314
},
{
"epoch": 1.5748502994011977,
"grad_norm": 0.19030607148232212,
"learning_rate": 2.6419698314108255e-05,
"loss": 0.2441,
"step": 1315
},
{
"epoch": 1.5760479041916167,
"grad_norm": 0.17815615650301855,
"learning_rate": 2.639751552795031e-05,
"loss": 0.2553,
"step": 1316
},
{
"epoch": 1.577245508982036,
"grad_norm": 0.18177527565373655,
"learning_rate": 2.637533274179237e-05,
"loss": 0.2655,
"step": 1317
},
{
"epoch": 1.578443113772455,
"grad_norm": 0.1921928211330294,
"learning_rate": 2.6353149955634427e-05,
"loss": 0.2652,
"step": 1318
},
{
"epoch": 1.5796407185628742,
"grad_norm": 0.17403100835248328,
"learning_rate": 2.633096716947649e-05,
"loss": 0.245,
"step": 1319
},
{
"epoch": 1.5808383233532934,
"grad_norm": 0.16174115195808092,
"learning_rate": 2.6308784383318547e-05,
"loss": 0.2536,
"step": 1320
},
{
"epoch": 1.5820359281437124,
"grad_norm": 0.17108184837734064,
"learning_rate": 2.6286601597160602e-05,
"loss": 0.2438,
"step": 1321
},
{
"epoch": 1.5832335329341318,
"grad_norm": 0.1782484646687252,
"learning_rate": 2.626441881100266e-05,
"loss": 0.2541,
"step": 1322
},
{
"epoch": 1.5844311377245508,
"grad_norm": 0.18384462732529316,
"learning_rate": 2.6242236024844723e-05,
"loss": 0.2455,
"step": 1323
},
{
"epoch": 1.5856287425149702,
"grad_norm": 0.1621166535330843,
"learning_rate": 2.622005323868678e-05,
"loss": 0.2375,
"step": 1324
},
{
"epoch": 1.5868263473053892,
"grad_norm": 0.18909087308509276,
"learning_rate": 2.619787045252884e-05,
"loss": 0.2768,
"step": 1325
},
{
"epoch": 1.5880239520958084,
"grad_norm": 0.17121922333711262,
"learning_rate": 2.6175687666370894e-05,
"loss": 0.2466,
"step": 1326
},
{
"epoch": 1.5892215568862276,
"grad_norm": 0.1826617447833997,
"learning_rate": 2.6153504880212953e-05,
"loss": 0.2366,
"step": 1327
},
{
"epoch": 1.5904191616766465,
"grad_norm": 0.1572487953031154,
"learning_rate": 2.6131322094055015e-05,
"loss": 0.221,
"step": 1328
},
{
"epoch": 1.591616766467066,
"grad_norm": 0.1789351338239946,
"learning_rate": 2.6109139307897073e-05,
"loss": 0.2382,
"step": 1329
},
{
"epoch": 1.592814371257485,
"grad_norm": 0.17701505974456228,
"learning_rate": 2.608695652173913e-05,
"loss": 0.2529,
"step": 1330
},
{
"epoch": 1.5940119760479043,
"grad_norm": 0.17533439000085402,
"learning_rate": 2.606477373558119e-05,
"loss": 0.2507,
"step": 1331
},
{
"epoch": 1.5952095808383233,
"grad_norm": 0.17820003705926152,
"learning_rate": 2.6042590949423252e-05,
"loss": 0.2376,
"step": 1332
},
{
"epoch": 1.5964071856287425,
"grad_norm": 0.18813271218912547,
"learning_rate": 2.6020408163265307e-05,
"loss": 0.2494,
"step": 1333
},
{
"epoch": 1.5976047904191617,
"grad_norm": 0.21017840212658676,
"learning_rate": 2.5998225377107365e-05,
"loss": 0.2602,
"step": 1334
},
{
"epoch": 1.5988023952095807,
"grad_norm": 0.206463088236004,
"learning_rate": 2.5976042590949424e-05,
"loss": 0.2688,
"step": 1335
},
{
"epoch": 1.6,
"grad_norm": 0.1712216765290739,
"learning_rate": 2.5953859804791482e-05,
"loss": 0.2513,
"step": 1336
},
{
"epoch": 1.601197604790419,
"grad_norm": 0.1859730351376408,
"learning_rate": 2.5931677018633544e-05,
"loss": 0.2551,
"step": 1337
},
{
"epoch": 1.6023952095808385,
"grad_norm": 0.17277898851016674,
"learning_rate": 2.5909494232475602e-05,
"loss": 0.2574,
"step": 1338
},
{
"epoch": 1.6035928143712574,
"grad_norm": 0.1894988250766726,
"learning_rate": 2.5887311446317657e-05,
"loss": 0.2661,
"step": 1339
},
{
"epoch": 1.6047904191616766,
"grad_norm": 0.18101451390599493,
"learning_rate": 2.5865128660159716e-05,
"loss": 0.2581,
"step": 1340
},
{
"epoch": 1.6059880239520958,
"grad_norm": 0.1700339077004057,
"learning_rate": 2.5842945874001778e-05,
"loss": 0.2491,
"step": 1341
},
{
"epoch": 1.6071856287425148,
"grad_norm": 0.16605511390845168,
"learning_rate": 2.5820763087843836e-05,
"loss": 0.2381,
"step": 1342
},
{
"epoch": 1.6083832335329342,
"grad_norm": 0.16605315779372393,
"learning_rate": 2.5798580301685894e-05,
"loss": 0.2352,
"step": 1343
},
{
"epoch": 1.6095808383233532,
"grad_norm": 0.1773570540115524,
"learning_rate": 2.577639751552795e-05,
"loss": 0.2462,
"step": 1344
},
{
"epoch": 1.6107784431137726,
"grad_norm": 0.162911532977113,
"learning_rate": 2.5754214729370008e-05,
"loss": 0.2499,
"step": 1345
},
{
"epoch": 1.6119760479041916,
"grad_norm": 0.16538645467747756,
"learning_rate": 2.573203194321207e-05,
"loss": 0.2269,
"step": 1346
},
{
"epoch": 1.6131736526946108,
"grad_norm": 0.17591912309035732,
"learning_rate": 2.5709849157054128e-05,
"loss": 0.2285,
"step": 1347
},
{
"epoch": 1.61437125748503,
"grad_norm": 0.1716497874255762,
"learning_rate": 2.5687666370896187e-05,
"loss": 0.249,
"step": 1348
},
{
"epoch": 1.615568862275449,
"grad_norm": 0.21531640726533183,
"learning_rate": 2.566548358473824e-05,
"loss": 0.2578,
"step": 1349
},
{
"epoch": 1.6167664670658684,
"grad_norm": 0.177184441065365,
"learning_rate": 2.5643300798580307e-05,
"loss": 0.2558,
"step": 1350
},
{
"epoch": 1.6179640718562873,
"grad_norm": 0.1608996014448995,
"learning_rate": 2.5621118012422362e-05,
"loss": 0.2476,
"step": 1351
},
{
"epoch": 1.6191616766467067,
"grad_norm": 0.15546937559091437,
"learning_rate": 2.559893522626442e-05,
"loss": 0.2302,
"step": 1352
},
{
"epoch": 1.6203592814371257,
"grad_norm": 0.17941330859363053,
"learning_rate": 2.557675244010648e-05,
"loss": 0.2544,
"step": 1353
},
{
"epoch": 1.621556886227545,
"grad_norm": 0.17747811724119844,
"learning_rate": 2.5554569653948534e-05,
"loss": 0.2546,
"step": 1354
},
{
"epoch": 1.622754491017964,
"grad_norm": 0.16722870909941023,
"learning_rate": 2.55323868677906e-05,
"loss": 0.2561,
"step": 1355
},
{
"epoch": 1.623952095808383,
"grad_norm": 0.1775703139781177,
"learning_rate": 2.5510204081632654e-05,
"loss": 0.2473,
"step": 1356
},
{
"epoch": 1.6251497005988025,
"grad_norm": 0.178920134317408,
"learning_rate": 2.5488021295474712e-05,
"loss": 0.2207,
"step": 1357
},
{
"epoch": 1.6263473053892215,
"grad_norm": 0.1824636351523728,
"learning_rate": 2.546583850931677e-05,
"loss": 0.2695,
"step": 1358
},
{
"epoch": 1.6275449101796409,
"grad_norm": 0.19042636962373713,
"learning_rate": 2.5443655723158833e-05,
"loss": 0.2665,
"step": 1359
},
{
"epoch": 1.6287425149700598,
"grad_norm": 0.27877569186698126,
"learning_rate": 2.542147293700089e-05,
"loss": 0.2555,
"step": 1360
},
{
"epoch": 1.629940119760479,
"grad_norm": 0.16762335507934042,
"learning_rate": 2.5399290150842946e-05,
"loss": 0.2526,
"step": 1361
},
{
"epoch": 1.6311377245508982,
"grad_norm": 0.17825721602721925,
"learning_rate": 2.5377107364685005e-05,
"loss": 0.2659,
"step": 1362
},
{
"epoch": 1.6323353293413174,
"grad_norm": 0.18976006109922694,
"learning_rate": 2.5354924578527063e-05,
"loss": 0.2606,
"step": 1363
},
{
"epoch": 1.6335329341317366,
"grad_norm": 0.18334243687937288,
"learning_rate": 2.5332741792369125e-05,
"loss": 0.2577,
"step": 1364
},
{
"epoch": 1.6347305389221556,
"grad_norm": 0.1628801751635123,
"learning_rate": 2.5310559006211183e-05,
"loss": 0.2434,
"step": 1365
},
{
"epoch": 1.635928143712575,
"grad_norm": 0.20432234789796766,
"learning_rate": 2.5288376220053238e-05,
"loss": 0.2571,
"step": 1366
},
{
"epoch": 1.637125748502994,
"grad_norm": 0.17377099855933612,
"learning_rate": 2.5266193433895297e-05,
"loss": 0.2731,
"step": 1367
},
{
"epoch": 1.6383233532934132,
"grad_norm": 0.1866904117128582,
"learning_rate": 2.524401064773736e-05,
"loss": 0.2693,
"step": 1368
},
{
"epoch": 1.6395209580838324,
"grad_norm": 0.18337588350779857,
"learning_rate": 2.5221827861579417e-05,
"loss": 0.246,
"step": 1369
},
{
"epoch": 1.6407185628742516,
"grad_norm": 0.17786037793430218,
"learning_rate": 2.5199645075421475e-05,
"loss": 0.2198,
"step": 1370
},
{
"epoch": 1.6419161676646707,
"grad_norm": 0.17773252971697304,
"learning_rate": 2.517746228926353e-05,
"loss": 0.2615,
"step": 1371
},
{
"epoch": 1.6431137724550897,
"grad_norm": 0.1779113074464276,
"learning_rate": 2.515527950310559e-05,
"loss": 0.2404,
"step": 1372
},
{
"epoch": 1.6443113772455091,
"grad_norm": 0.1981215614436777,
"learning_rate": 2.513309671694765e-05,
"loss": 0.2653,
"step": 1373
},
{
"epoch": 1.645508982035928,
"grad_norm": 0.17745760791125267,
"learning_rate": 2.511091393078971e-05,
"loss": 0.2718,
"step": 1374
},
{
"epoch": 1.6467065868263473,
"grad_norm": 0.18286555157467174,
"learning_rate": 2.5088731144631767e-05,
"loss": 0.2671,
"step": 1375
},
{
"epoch": 1.6479041916167665,
"grad_norm": 0.18479143191218517,
"learning_rate": 2.5066548358473823e-05,
"loss": 0.2251,
"step": 1376
},
{
"epoch": 1.6491017964071857,
"grad_norm": 0.1876716522720573,
"learning_rate": 2.5044365572315888e-05,
"loss": 0.2709,
"step": 1377
},
{
"epoch": 1.6502994011976049,
"grad_norm": 0.16483358261819947,
"learning_rate": 2.5022182786157943e-05,
"loss": 0.2288,
"step": 1378
},
{
"epoch": 1.6514970059880238,
"grad_norm": 0.19478336804157773,
"learning_rate": 2.5e-05,
"loss": 0.2449,
"step": 1379
},
{
"epoch": 1.6526946107784433,
"grad_norm": 0.17074765441801643,
"learning_rate": 2.497781721384206e-05,
"loss": 0.2356,
"step": 1380
},
{
"epoch": 1.6538922155688622,
"grad_norm": 0.1988066260395488,
"learning_rate": 2.4955634427684118e-05,
"loss": 0.2777,
"step": 1381
},
{
"epoch": 1.6550898203592814,
"grad_norm": 0.19187868934649338,
"learning_rate": 2.4933451641526176e-05,
"loss": 0.2425,
"step": 1382
},
{
"epoch": 1.6562874251497006,
"grad_norm": 0.18815552303994434,
"learning_rate": 2.4911268855368235e-05,
"loss": 0.2605,
"step": 1383
},
{
"epoch": 1.6574850299401198,
"grad_norm": 0.17425296730800727,
"learning_rate": 2.4889086069210293e-05,
"loss": 0.2453,
"step": 1384
},
{
"epoch": 1.658682634730539,
"grad_norm": 0.18386764415832574,
"learning_rate": 2.4866903283052355e-05,
"loss": 0.2645,
"step": 1385
},
{
"epoch": 1.659880239520958,
"grad_norm": 0.18623003250215892,
"learning_rate": 2.484472049689441e-05,
"loss": 0.2652,
"step": 1386
},
{
"epoch": 1.6610778443113774,
"grad_norm": 0.164887940998589,
"learning_rate": 2.4822537710736472e-05,
"loss": 0.2473,
"step": 1387
},
{
"epoch": 1.6622754491017964,
"grad_norm": 0.1793211171980339,
"learning_rate": 2.4800354924578527e-05,
"loss": 0.2595,
"step": 1388
},
{
"epoch": 1.6634730538922156,
"grad_norm": 0.181756532275855,
"learning_rate": 2.4778172138420585e-05,
"loss": 0.2406,
"step": 1389
},
{
"epoch": 1.6646706586826348,
"grad_norm": 0.1845809658528292,
"learning_rate": 2.4755989352262647e-05,
"loss": 0.2499,
"step": 1390
},
{
"epoch": 1.665868263473054,
"grad_norm": 0.2122033807898885,
"learning_rate": 2.4733806566104702e-05,
"loss": 0.2377,
"step": 1391
},
{
"epoch": 1.6670658682634731,
"grad_norm": 0.19861374979712562,
"learning_rate": 2.4711623779946764e-05,
"loss": 0.263,
"step": 1392
},
{
"epoch": 1.668263473053892,
"grad_norm": 0.17253365544554503,
"learning_rate": 2.468944099378882e-05,
"loss": 0.2604,
"step": 1393
},
{
"epoch": 1.6694610778443115,
"grad_norm": 0.17887382639531244,
"learning_rate": 2.466725820763088e-05,
"loss": 0.2513,
"step": 1394
},
{
"epoch": 1.6706586826347305,
"grad_norm": 0.20405084206219926,
"learning_rate": 2.464507542147294e-05,
"loss": 0.2477,
"step": 1395
},
{
"epoch": 1.6718562874251497,
"grad_norm": 0.16057139938225007,
"learning_rate": 2.4622892635314998e-05,
"loss": 0.237,
"step": 1396
},
{
"epoch": 1.6730538922155689,
"grad_norm": 0.1838566645628161,
"learning_rate": 2.4600709849157056e-05,
"loss": 0.2266,
"step": 1397
},
{
"epoch": 1.674251497005988,
"grad_norm": 0.19495027379008448,
"learning_rate": 2.457852706299911e-05,
"loss": 0.2586,
"step": 1398
},
{
"epoch": 1.6754491017964073,
"grad_norm": 0.16893322231753372,
"learning_rate": 2.4556344276841173e-05,
"loss": 0.2579,
"step": 1399
},
{
"epoch": 1.6766467065868262,
"grad_norm": 0.19697317758607466,
"learning_rate": 2.453416149068323e-05,
"loss": 0.2437,
"step": 1400
},
{
"epoch": 1.6778443113772457,
"grad_norm": 0.18304633242958093,
"learning_rate": 2.451197870452529e-05,
"loss": 0.2573,
"step": 1401
},
{
"epoch": 1.6790419161676646,
"grad_norm": 0.18027072401122293,
"learning_rate": 2.448979591836735e-05,
"loss": 0.2711,
"step": 1402
},
{
"epoch": 1.6802395209580838,
"grad_norm": 0.18063315372553004,
"learning_rate": 2.4467613132209407e-05,
"loss": 0.2577,
"step": 1403
},
{
"epoch": 1.681437125748503,
"grad_norm": 0.20708056901533498,
"learning_rate": 2.4445430346051465e-05,
"loss": 0.2715,
"step": 1404
},
{
"epoch": 1.6826347305389222,
"grad_norm": 0.18428454873635292,
"learning_rate": 2.4423247559893524e-05,
"loss": 0.2437,
"step": 1405
},
{
"epoch": 1.6838323353293414,
"grad_norm": 0.1941205509491644,
"learning_rate": 2.4401064773735582e-05,
"loss": 0.2468,
"step": 1406
},
{
"epoch": 1.6850299401197604,
"grad_norm": 0.16150448614064275,
"learning_rate": 2.437888198757764e-05,
"loss": 0.2307,
"step": 1407
},
{
"epoch": 1.6862275449101798,
"grad_norm": 0.1840126230963795,
"learning_rate": 2.43566992014197e-05,
"loss": 0.2492,
"step": 1408
},
{
"epoch": 1.6874251497005988,
"grad_norm": 0.2008307650580576,
"learning_rate": 2.4334516415261757e-05,
"loss": 0.2546,
"step": 1409
},
{
"epoch": 1.688622754491018,
"grad_norm": 0.1594106590789615,
"learning_rate": 2.4312333629103816e-05,
"loss": 0.2472,
"step": 1410
},
{
"epoch": 1.6898203592814371,
"grad_norm": 0.2064989247266305,
"learning_rate": 2.4290150842945874e-05,
"loss": 0.2613,
"step": 1411
},
{
"epoch": 1.6910179640718563,
"grad_norm": 0.1746449768750473,
"learning_rate": 2.4267968056787936e-05,
"loss": 0.2384,
"step": 1412
},
{
"epoch": 1.6922155688622755,
"grad_norm": 0.2559132115395545,
"learning_rate": 2.424578527062999e-05,
"loss": 0.2494,
"step": 1413
},
{
"epoch": 1.6934131736526945,
"grad_norm": 0.1860958708624756,
"learning_rate": 2.4223602484472053e-05,
"loss": 0.2727,
"step": 1414
},
{
"epoch": 1.694610778443114,
"grad_norm": 0.19935076513273312,
"learning_rate": 2.4201419698314108e-05,
"loss": 0.2761,
"step": 1415
},
{
"epoch": 1.695808383233533,
"grad_norm": 0.1679086581113317,
"learning_rate": 2.4179236912156166e-05,
"loss": 0.2416,
"step": 1416
},
{
"epoch": 1.697005988023952,
"grad_norm": 0.21529441334467608,
"learning_rate": 2.4157054125998228e-05,
"loss": 0.2831,
"step": 1417
},
{
"epoch": 1.6982035928143713,
"grad_norm": 0.1809670727188505,
"learning_rate": 2.4134871339840283e-05,
"loss": 0.2403,
"step": 1418
},
{
"epoch": 1.6994011976047905,
"grad_norm": 0.16416549964009036,
"learning_rate": 2.4112688553682345e-05,
"loss": 0.267,
"step": 1419
},
{
"epoch": 1.7005988023952097,
"grad_norm": 0.1779912959334911,
"learning_rate": 2.40905057675244e-05,
"loss": 0.2576,
"step": 1420
},
{
"epoch": 1.7017964071856286,
"grad_norm": 0.19480255025493726,
"learning_rate": 2.4068322981366462e-05,
"loss": 0.25,
"step": 1421
},
{
"epoch": 1.702994011976048,
"grad_norm": 0.1729812468237824,
"learning_rate": 2.404614019520852e-05,
"loss": 0.245,
"step": 1422
},
{
"epoch": 1.704191616766467,
"grad_norm": 0.19595403033274814,
"learning_rate": 2.402395740905058e-05,
"loss": 0.2645,
"step": 1423
},
{
"epoch": 1.7053892215568862,
"grad_norm": 0.17399677368740835,
"learning_rate": 2.4001774622892637e-05,
"loss": 0.2384,
"step": 1424
},
{
"epoch": 1.7065868263473054,
"grad_norm": 0.1667985393123726,
"learning_rate": 2.3979591836734696e-05,
"loss": 0.244,
"step": 1425
},
{
"epoch": 1.7077844311377246,
"grad_norm": 0.17907505217544215,
"learning_rate": 2.3957409050576754e-05,
"loss": 0.244,
"step": 1426
},
{
"epoch": 1.7089820359281438,
"grad_norm": 0.1700976906601959,
"learning_rate": 2.3935226264418812e-05,
"loss": 0.2332,
"step": 1427
},
{
"epoch": 1.7101796407185628,
"grad_norm": 0.19121122373104382,
"learning_rate": 2.391304347826087e-05,
"loss": 0.2511,
"step": 1428
},
{
"epoch": 1.7113772455089822,
"grad_norm": 0.16568694462001893,
"learning_rate": 2.389086069210293e-05,
"loss": 0.2443,
"step": 1429
},
{
"epoch": 1.7125748502994012,
"grad_norm": 0.18317532645764487,
"learning_rate": 2.3868677905944988e-05,
"loss": 0.2509,
"step": 1430
},
{
"epoch": 1.7137724550898203,
"grad_norm": 0.18100394723021546,
"learning_rate": 2.3846495119787046e-05,
"loss": 0.2263,
"step": 1431
},
{
"epoch": 1.7149700598802395,
"grad_norm": 0.17609121958466176,
"learning_rate": 2.3824312333629108e-05,
"loss": 0.2317,
"step": 1432
},
{
"epoch": 1.7161676646706587,
"grad_norm": 0.16928045699897623,
"learning_rate": 2.3802129547471163e-05,
"loss": 0.2481,
"step": 1433
},
{
"epoch": 1.717365269461078,
"grad_norm": 0.16030389216442484,
"learning_rate": 2.377994676131322e-05,
"loss": 0.2364,
"step": 1434
},
{
"epoch": 1.718562874251497,
"grad_norm": 0.17699147336069931,
"learning_rate": 2.375776397515528e-05,
"loss": 0.2458,
"step": 1435
},
{
"epoch": 1.7197604790419163,
"grad_norm": 0.1808006673598191,
"learning_rate": 2.3735581188997338e-05,
"loss": 0.2472,
"step": 1436
},
{
"epoch": 1.7209580838323353,
"grad_norm": 0.18272666667043674,
"learning_rate": 2.37133984028394e-05,
"loss": 0.2527,
"step": 1437
},
{
"epoch": 1.7221556886227545,
"grad_norm": 0.1765152722103066,
"learning_rate": 2.3691215616681455e-05,
"loss": 0.2473,
"step": 1438
},
{
"epoch": 1.7233532934131737,
"grad_norm": 0.16926979210803353,
"learning_rate": 2.3669032830523517e-05,
"loss": 0.2487,
"step": 1439
},
{
"epoch": 1.7245508982035929,
"grad_norm": 0.1687838491768147,
"learning_rate": 2.3646850044365572e-05,
"loss": 0.2379,
"step": 1440
},
{
"epoch": 1.725748502994012,
"grad_norm": 0.19417041480113606,
"learning_rate": 2.3624667258207634e-05,
"loss": 0.2542,
"step": 1441
},
{
"epoch": 1.726946107784431,
"grad_norm": 0.18043761522919255,
"learning_rate": 2.3602484472049692e-05,
"loss": 0.252,
"step": 1442
},
{
"epoch": 1.7281437125748504,
"grad_norm": 0.18404654943883786,
"learning_rate": 2.3580301685891747e-05,
"loss": 0.2352,
"step": 1443
},
{
"epoch": 1.7293413173652694,
"grad_norm": 0.18720150073601868,
"learning_rate": 2.355811889973381e-05,
"loss": 0.2458,
"step": 1444
},
{
"epoch": 1.7305389221556886,
"grad_norm": 0.1759385685805571,
"learning_rate": 2.3535936113575864e-05,
"loss": 0.2518,
"step": 1445
},
{
"epoch": 1.7317365269461078,
"grad_norm": 0.16659847031781005,
"learning_rate": 2.3513753327417926e-05,
"loss": 0.2426,
"step": 1446
},
{
"epoch": 1.732934131736527,
"grad_norm": 0.16365270486840428,
"learning_rate": 2.3491570541259984e-05,
"loss": 0.2566,
"step": 1447
},
{
"epoch": 1.7341317365269462,
"grad_norm": 0.15995469795837855,
"learning_rate": 2.3469387755102043e-05,
"loss": 0.2584,
"step": 1448
},
{
"epoch": 1.7353293413173652,
"grad_norm": 0.1887218997853253,
"learning_rate": 2.34472049689441e-05,
"loss": 0.2546,
"step": 1449
},
{
"epoch": 1.7365269461077846,
"grad_norm": 0.1701862247242305,
"learning_rate": 2.3425022182786156e-05,
"loss": 0.24,
"step": 1450
},
{
"epoch": 1.7377245508982035,
"grad_norm": 0.16853832091151583,
"learning_rate": 2.3402839396628218e-05,
"loss": 0.2407,
"step": 1451
},
{
"epoch": 1.7389221556886227,
"grad_norm": 0.18038141398242707,
"learning_rate": 2.3380656610470276e-05,
"loss": 0.2446,
"step": 1452
},
{
"epoch": 1.740119760479042,
"grad_norm": 0.18323765514669427,
"learning_rate": 2.3358473824312335e-05,
"loss": 0.2506,
"step": 1453
},
{
"epoch": 1.7413173652694611,
"grad_norm": 0.1931344852819194,
"learning_rate": 2.3336291038154393e-05,
"loss": 0.2385,
"step": 1454
},
{
"epoch": 1.7425149700598803,
"grad_norm": 0.1803318461303765,
"learning_rate": 2.3314108251996452e-05,
"loss": 0.2514,
"step": 1455
},
{
"epoch": 1.7437125748502993,
"grad_norm": 0.15587100758534478,
"learning_rate": 2.329192546583851e-05,
"loss": 0.2359,
"step": 1456
},
{
"epoch": 1.7449101796407187,
"grad_norm": 0.16688654387006174,
"learning_rate": 2.326974267968057e-05,
"loss": 0.2532,
"step": 1457
},
{
"epoch": 1.7461077844311377,
"grad_norm": 0.191214961748042,
"learning_rate": 2.3247559893522627e-05,
"loss": 0.2456,
"step": 1458
},
{
"epoch": 1.7473053892215569,
"grad_norm": 0.17331035616865076,
"learning_rate": 2.3225377107364685e-05,
"loss": 0.2501,
"step": 1459
},
{
"epoch": 1.748502994011976,
"grad_norm": 0.200512047605763,
"learning_rate": 2.3203194321206744e-05,
"loss": 0.2584,
"step": 1460
},
{
"epoch": 1.7497005988023953,
"grad_norm": 0.22796320347993143,
"learning_rate": 2.3181011535048802e-05,
"loss": 0.2822,
"step": 1461
},
{
"epoch": 1.7508982035928145,
"grad_norm": 0.15312615493257148,
"learning_rate": 2.315882874889086e-05,
"loss": 0.2317,
"step": 1462
},
{
"epoch": 1.7520958083832334,
"grad_norm": 0.1792560765653609,
"learning_rate": 2.313664596273292e-05,
"loss": 0.2498,
"step": 1463
},
{
"epoch": 1.7532934131736528,
"grad_norm": 0.17972514178020366,
"learning_rate": 2.311446317657498e-05,
"loss": 0.2474,
"step": 1464
},
{
"epoch": 1.7544910179640718,
"grad_norm": 0.18043898663995997,
"learning_rate": 2.3092280390417036e-05,
"loss": 0.2452,
"step": 1465
},
{
"epoch": 1.755688622754491,
"grad_norm": 0.15903186492829688,
"learning_rate": 2.3070097604259098e-05,
"loss": 0.2317,
"step": 1466
},
{
"epoch": 1.7568862275449102,
"grad_norm": 0.18187638878114779,
"learning_rate": 2.3047914818101153e-05,
"loss": 0.2477,
"step": 1467
},
{
"epoch": 1.7580838323353294,
"grad_norm": 0.17496850369002573,
"learning_rate": 2.302573203194321e-05,
"loss": 0.2428,
"step": 1468
},
{
"epoch": 1.7592814371257486,
"grad_norm": 0.2034816218809776,
"learning_rate": 2.3003549245785273e-05,
"loss": 0.2714,
"step": 1469
},
{
"epoch": 1.7604790419161676,
"grad_norm": 0.19016018643652358,
"learning_rate": 2.2981366459627328e-05,
"loss": 0.2636,
"step": 1470
},
{
"epoch": 1.761676646706587,
"grad_norm": 0.16928057293472507,
"learning_rate": 2.295918367346939e-05,
"loss": 0.2324,
"step": 1471
},
{
"epoch": 1.762874251497006,
"grad_norm": 0.18277501316601022,
"learning_rate": 2.293700088731145e-05,
"loss": 0.2239,
"step": 1472
},
{
"epoch": 1.7640718562874251,
"grad_norm": 0.19395287745604076,
"learning_rate": 2.2914818101153507e-05,
"loss": 0.2604,
"step": 1473
},
{
"epoch": 1.7652694610778443,
"grad_norm": 0.17516262689583717,
"learning_rate": 2.2892635314995565e-05,
"loss": 0.247,
"step": 1474
},
{
"epoch": 1.7664670658682635,
"grad_norm": 0.19809238210367652,
"learning_rate": 2.2870452528837624e-05,
"loss": 0.2659,
"step": 1475
},
{
"epoch": 1.7676646706586827,
"grad_norm": 0.1916734880520613,
"learning_rate": 2.2848269742679682e-05,
"loss": 0.2424,
"step": 1476
},
{
"epoch": 1.7688622754491017,
"grad_norm": 0.17004164267830713,
"learning_rate": 2.282608695652174e-05,
"loss": 0.2535,
"step": 1477
},
{
"epoch": 1.770059880239521,
"grad_norm": 0.18175962729278708,
"learning_rate": 2.28039041703638e-05,
"loss": 0.2517,
"step": 1478
},
{
"epoch": 1.77125748502994,
"grad_norm": 0.18993884723626234,
"learning_rate": 2.2781721384205857e-05,
"loss": 0.2463,
"step": 1479
},
{
"epoch": 1.7724550898203593,
"grad_norm": 0.18587068846643628,
"learning_rate": 2.2759538598047916e-05,
"loss": 0.2629,
"step": 1480
},
{
"epoch": 1.7736526946107785,
"grad_norm": 0.1970972827130029,
"learning_rate": 2.2737355811889974e-05,
"loss": 0.247,
"step": 1481
},
{
"epoch": 1.7748502994011977,
"grad_norm": 0.18996856585971247,
"learning_rate": 2.2715173025732033e-05,
"loss": 0.2438,
"step": 1482
},
{
"epoch": 1.7760479041916168,
"grad_norm": 0.19498992059799147,
"learning_rate": 2.269299023957409e-05,
"loss": 0.2352,
"step": 1483
},
{
"epoch": 1.7772455089820358,
"grad_norm": 0.1831549599618803,
"learning_rate": 2.2670807453416153e-05,
"loss": 0.2572,
"step": 1484
},
{
"epoch": 1.7784431137724552,
"grad_norm": 0.2029347434992285,
"learning_rate": 2.2648624667258208e-05,
"loss": 0.2577,
"step": 1485
},
{
"epoch": 1.7796407185628742,
"grad_norm": 0.18136825131621717,
"learning_rate": 2.2626441881100266e-05,
"loss": 0.267,
"step": 1486
},
{
"epoch": 1.7808383233532934,
"grad_norm": 0.19574917359731797,
"learning_rate": 2.2604259094942325e-05,
"loss": 0.2794,
"step": 1487
},
{
"epoch": 1.7820359281437126,
"grad_norm": 0.19773066617776663,
"learning_rate": 2.2582076308784383e-05,
"loss": 0.2587,
"step": 1488
},
{
"epoch": 1.7832335329341318,
"grad_norm": 0.17550007468693501,
"learning_rate": 2.2559893522626445e-05,
"loss": 0.2538,
"step": 1489
},
{
"epoch": 1.784431137724551,
"grad_norm": 0.19405308255502374,
"learning_rate": 2.25377107364685e-05,
"loss": 0.2367,
"step": 1490
},
{
"epoch": 1.78562874251497,
"grad_norm": 0.1904570669122457,
"learning_rate": 2.2515527950310562e-05,
"loss": 0.2405,
"step": 1491
},
{
"epoch": 1.7868263473053894,
"grad_norm": 0.1987915814319208,
"learning_rate": 2.2493345164152617e-05,
"loss": 0.2584,
"step": 1492
},
{
"epoch": 1.7880239520958083,
"grad_norm": 0.17439148511645305,
"learning_rate": 2.247116237799468e-05,
"loss": 0.2518,
"step": 1493
},
{
"epoch": 1.7892215568862275,
"grad_norm": 0.17240037363669355,
"learning_rate": 2.2448979591836737e-05,
"loss": 0.2273,
"step": 1494
},
{
"epoch": 1.7904191616766467,
"grad_norm": 0.17755251698367883,
"learning_rate": 2.2426796805678792e-05,
"loss": 0.2587,
"step": 1495
},
{
"epoch": 1.791616766467066,
"grad_norm": 0.23853387789287483,
"learning_rate": 2.2404614019520854e-05,
"loss": 0.2479,
"step": 1496
},
{
"epoch": 1.792814371257485,
"grad_norm": 0.18048021499769135,
"learning_rate": 2.238243123336291e-05,
"loss": 0.2562,
"step": 1497
},
{
"epoch": 1.794011976047904,
"grad_norm": 0.16664179553221917,
"learning_rate": 2.236024844720497e-05,
"loss": 0.2233,
"step": 1498
},
{
"epoch": 1.7952095808383235,
"grad_norm": 0.1906196118173108,
"learning_rate": 2.233806566104703e-05,
"loss": 0.2356,
"step": 1499
},
{
"epoch": 1.7964071856287425,
"grad_norm": 0.1921555124139323,
"learning_rate": 2.2315882874889088e-05,
"loss": 0.2539,
"step": 1500
},
{
"epoch": 1.7976047904191617,
"grad_norm": 0.1962722276800485,
"learning_rate": 2.2293700088731146e-05,
"loss": 0.2475,
"step": 1501
},
{
"epoch": 1.7988023952095809,
"grad_norm": 0.18140074345842855,
"learning_rate": 2.2271517302573205e-05,
"loss": 0.2468,
"step": 1502
},
{
"epoch": 1.8,
"grad_norm": 0.1875392090132192,
"learning_rate": 2.2249334516415263e-05,
"loss": 0.2402,
"step": 1503
},
{
"epoch": 1.8011976047904192,
"grad_norm": 0.17258530898014685,
"learning_rate": 2.222715173025732e-05,
"loss": 0.2347,
"step": 1504
},
{
"epoch": 1.8023952095808382,
"grad_norm": 0.18492938288058236,
"learning_rate": 2.220496894409938e-05,
"loss": 0.2188,
"step": 1505
},
{
"epoch": 1.8035928143712576,
"grad_norm": 0.7446503006859507,
"learning_rate": 2.2182786157941438e-05,
"loss": 0.2602,
"step": 1506
},
{
"epoch": 1.8047904191616766,
"grad_norm": 0.18007024310213887,
"learning_rate": 2.2160603371783497e-05,
"loss": 0.2475,
"step": 1507
},
{
"epoch": 1.8059880239520958,
"grad_norm": 0.16850754324773026,
"learning_rate": 2.2138420585625555e-05,
"loss": 0.2491,
"step": 1508
},
{
"epoch": 1.807185628742515,
"grad_norm": 0.18844933485367427,
"learning_rate": 2.2116237799467614e-05,
"loss": 0.2478,
"step": 1509
},
{
"epoch": 1.8083832335329342,
"grad_norm": 0.1766384494272321,
"learning_rate": 2.2094055013309672e-05,
"loss": 0.2532,
"step": 1510
},
{
"epoch": 1.8095808383233534,
"grad_norm": 0.17992450748542577,
"learning_rate": 2.2071872227151734e-05,
"loss": 0.2519,
"step": 1511
},
{
"epoch": 1.8107784431137723,
"grad_norm": 0.16090544904773224,
"learning_rate": 2.204968944099379e-05,
"loss": 0.2276,
"step": 1512
},
{
"epoch": 1.8119760479041918,
"grad_norm": 0.17421430364064414,
"learning_rate": 2.2027506654835847e-05,
"loss": 0.2569,
"step": 1513
},
{
"epoch": 1.8131736526946107,
"grad_norm": 0.17361338471958032,
"learning_rate": 2.2005323868677906e-05,
"loss": 0.2555,
"step": 1514
},
{
"epoch": 1.81437125748503,
"grad_norm": 0.16999430381060623,
"learning_rate": 2.1983141082519964e-05,
"loss": 0.2453,
"step": 1515
},
{
"epoch": 1.8155688622754491,
"grad_norm": 0.16818702676644962,
"learning_rate": 2.1960958296362026e-05,
"loss": 0.239,
"step": 1516
},
{
"epoch": 1.8167664670658683,
"grad_norm": 0.1599647093720731,
"learning_rate": 2.193877551020408e-05,
"loss": 0.2506,
"step": 1517
},
{
"epoch": 1.8179640718562875,
"grad_norm": 0.16889698267673575,
"learning_rate": 2.1916592724046143e-05,
"loss": 0.2361,
"step": 1518
},
{
"epoch": 1.8191616766467065,
"grad_norm": 0.16789545998195468,
"learning_rate": 2.1894409937888198e-05,
"loss": 0.2355,
"step": 1519
},
{
"epoch": 1.8203592814371259,
"grad_norm": 0.1577261545143947,
"learning_rate": 2.187222715173026e-05,
"loss": 0.2248,
"step": 1520
},
{
"epoch": 1.8215568862275449,
"grad_norm": 0.19379132247542025,
"learning_rate": 2.1850044365572318e-05,
"loss": 0.2684,
"step": 1521
},
{
"epoch": 1.822754491017964,
"grad_norm": 0.18329138711031784,
"learning_rate": 2.1827861579414373e-05,
"loss": 0.2416,
"step": 1522
},
{
"epoch": 1.8239520958083832,
"grad_norm": 0.16768191767271243,
"learning_rate": 2.1805678793256435e-05,
"loss": 0.2596,
"step": 1523
},
{
"epoch": 1.8251497005988024,
"grad_norm": 0.1637464316137773,
"learning_rate": 2.1783496007098493e-05,
"loss": 0.2508,
"step": 1524
},
{
"epoch": 1.8263473053892216,
"grad_norm": 0.16103497991672075,
"learning_rate": 2.1761313220940552e-05,
"loss": 0.2281,
"step": 1525
},
{
"epoch": 1.8275449101796406,
"grad_norm": 0.17001975625349683,
"learning_rate": 2.173913043478261e-05,
"loss": 0.2274,
"step": 1526
},
{
"epoch": 1.82874251497006,
"grad_norm": 0.17542876674348032,
"learning_rate": 2.171694764862467e-05,
"loss": 0.2401,
"step": 1527
},
{
"epoch": 1.829940119760479,
"grad_norm": 0.15629213843425085,
"learning_rate": 2.1694764862466727e-05,
"loss": 0.229,
"step": 1528
},
{
"epoch": 1.8311377245508982,
"grad_norm": 0.1649084012455484,
"learning_rate": 2.1672582076308785e-05,
"loss": 0.2214,
"step": 1529
},
{
"epoch": 1.8323353293413174,
"grad_norm": 0.1883156817548525,
"learning_rate": 2.1650399290150844e-05,
"loss": 0.2585,
"step": 1530
},
{
"epoch": 1.8335329341317366,
"grad_norm": 0.16424655204740632,
"learning_rate": 2.1628216503992902e-05,
"loss": 0.2542,
"step": 1531
},
{
"epoch": 1.8347305389221558,
"grad_norm": 0.16759247804627936,
"learning_rate": 2.160603371783496e-05,
"loss": 0.2405,
"step": 1532
},
{
"epoch": 1.8359281437125747,
"grad_norm": 0.16364753140856517,
"learning_rate": 2.158385093167702e-05,
"loss": 0.2568,
"step": 1533
},
{
"epoch": 1.8371257485029941,
"grad_norm": 0.1866702764773445,
"learning_rate": 2.1561668145519078e-05,
"loss": 0.2374,
"step": 1534
},
{
"epoch": 1.8383233532934131,
"grad_norm": 0.1764963699347218,
"learning_rate": 2.1539485359361136e-05,
"loss": 0.2464,
"step": 1535
},
{
"epoch": 1.8395209580838323,
"grad_norm": 0.16116132025541988,
"learning_rate": 2.1517302573203198e-05,
"loss": 0.225,
"step": 1536
},
{
"epoch": 1.8407185628742515,
"grad_norm": 0.17916031514098885,
"learning_rate": 2.1495119787045253e-05,
"loss": 0.2574,
"step": 1537
},
{
"epoch": 1.8419161676646707,
"grad_norm": 0.17884759595011784,
"learning_rate": 2.1472937000887315e-05,
"loss": 0.273,
"step": 1538
},
{
"epoch": 1.84311377245509,
"grad_norm": 0.17587156760520048,
"learning_rate": 2.145075421472937e-05,
"loss": 0.2455,
"step": 1539
},
{
"epoch": 1.8443113772455089,
"grad_norm": 0.19241070506487035,
"learning_rate": 2.1428571428571428e-05,
"loss": 0.2506,
"step": 1540
},
{
"epoch": 1.8455089820359283,
"grad_norm": 0.2120790980102968,
"learning_rate": 2.140638864241349e-05,
"loss": 0.2683,
"step": 1541
},
{
"epoch": 1.8467065868263473,
"grad_norm": 0.17314665910027618,
"learning_rate": 2.1384205856255545e-05,
"loss": 0.2477,
"step": 1542
},
{
"epoch": 1.8479041916167664,
"grad_norm": 0.18786355749229466,
"learning_rate": 2.1362023070097607e-05,
"loss": 0.2632,
"step": 1543
},
{
"epoch": 1.8491017964071856,
"grad_norm": 0.18455973930324865,
"learning_rate": 2.1339840283939662e-05,
"loss": 0.2686,
"step": 1544
},
{
"epoch": 1.8502994011976048,
"grad_norm": 0.20071098660417322,
"learning_rate": 2.1317657497781724e-05,
"loss": 0.2478,
"step": 1545
},
{
"epoch": 1.851497005988024,
"grad_norm": 0.20585344281579565,
"learning_rate": 2.1295474711623782e-05,
"loss": 0.2576,
"step": 1546
},
{
"epoch": 1.852694610778443,
"grad_norm": 0.172700683019708,
"learning_rate": 2.127329192546584e-05,
"loss": 0.2426,
"step": 1547
},
{
"epoch": 1.8538922155688624,
"grad_norm": 0.1708405640326825,
"learning_rate": 2.12511091393079e-05,
"loss": 0.2268,
"step": 1548
},
{
"epoch": 1.8550898203592814,
"grad_norm": 0.1996844928878843,
"learning_rate": 2.1228926353149954e-05,
"loss": 0.2529,
"step": 1549
},
{
"epoch": 1.8562874251497006,
"grad_norm": 0.19899140396366907,
"learning_rate": 2.1206743566992016e-05,
"loss": 0.2659,
"step": 1550
},
{
"epoch": 1.8574850299401198,
"grad_norm": 0.17244688114474435,
"learning_rate": 2.1184560780834074e-05,
"loss": 0.2443,
"step": 1551
},
{
"epoch": 1.858682634730539,
"grad_norm": 0.211483765060586,
"learning_rate": 2.1162377994676133e-05,
"loss": 0.2584,
"step": 1552
},
{
"epoch": 1.8598802395209582,
"grad_norm": 0.17605246462669613,
"learning_rate": 2.114019520851819e-05,
"loss": 0.2535,
"step": 1553
},
{
"epoch": 1.8610778443113771,
"grad_norm": 0.18292002684558967,
"learning_rate": 2.111801242236025e-05,
"loss": 0.2396,
"step": 1554
},
{
"epoch": 1.8622754491017965,
"grad_norm": 0.18323742888647135,
"learning_rate": 2.1095829636202308e-05,
"loss": 0.2416,
"step": 1555
},
{
"epoch": 1.8634730538922155,
"grad_norm": 0.20545031713957645,
"learning_rate": 2.1073646850044366e-05,
"loss": 0.2333,
"step": 1556
},
{
"epoch": 1.8646706586826347,
"grad_norm": 0.21367578001004967,
"learning_rate": 2.1051464063886425e-05,
"loss": 0.2279,
"step": 1557
},
{
"epoch": 1.865868263473054,
"grad_norm": 0.18204556633007676,
"learning_rate": 2.1029281277728483e-05,
"loss": 0.2511,
"step": 1558
},
{
"epoch": 1.867065868263473,
"grad_norm": 0.19166752149189337,
"learning_rate": 2.100709849157054e-05,
"loss": 0.2425,
"step": 1559
},
{
"epoch": 1.8682634730538923,
"grad_norm": 0.192874663385084,
"learning_rate": 2.09849157054126e-05,
"loss": 0.2396,
"step": 1560
},
{
"epoch": 1.8694610778443113,
"grad_norm": 1.7224980652944684,
"learning_rate": 2.096273291925466e-05,
"loss": 0.2567,
"step": 1561
},
{
"epoch": 1.8706586826347307,
"grad_norm": 0.17938734146251595,
"learning_rate": 2.0940550133096717e-05,
"loss": 0.2344,
"step": 1562
},
{
"epoch": 1.8718562874251496,
"grad_norm": 0.17877234895717797,
"learning_rate": 2.091836734693878e-05,
"loss": 0.2454,
"step": 1563
},
{
"epoch": 1.8730538922155688,
"grad_norm": 0.18604205858582673,
"learning_rate": 2.0896184560780834e-05,
"loss": 0.2221,
"step": 1564
},
{
"epoch": 1.874251497005988,
"grad_norm": 0.17448819154291847,
"learning_rate": 2.0874001774622896e-05,
"loss": 0.2413,
"step": 1565
},
{
"epoch": 1.8754491017964072,
"grad_norm": 0.1880983304519039,
"learning_rate": 2.085181898846495e-05,
"loss": 0.2454,
"step": 1566
},
{
"epoch": 1.8766467065868264,
"grad_norm": 0.18696498999263853,
"learning_rate": 2.082963620230701e-05,
"loss": 0.2499,
"step": 1567
},
{
"epoch": 1.8778443113772454,
"grad_norm": 0.17469042624707734,
"learning_rate": 2.080745341614907e-05,
"loss": 0.2401,
"step": 1568
},
{
"epoch": 1.8790419161676648,
"grad_norm": 0.1853271064732628,
"learning_rate": 2.0785270629991126e-05,
"loss": 0.273,
"step": 1569
},
{
"epoch": 1.8802395209580838,
"grad_norm": 0.18718264276946017,
"learning_rate": 2.0763087843833188e-05,
"loss": 0.2413,
"step": 1570
},
{
"epoch": 1.881437125748503,
"grad_norm": 0.20627611322066347,
"learning_rate": 2.0740905057675246e-05,
"loss": 0.2311,
"step": 1571
},
{
"epoch": 1.8826347305389222,
"grad_norm": 0.20999283838348554,
"learning_rate": 2.0718722271517305e-05,
"loss": 0.2526,
"step": 1572
},
{
"epoch": 1.8838323353293414,
"grad_norm": 0.17527781753053395,
"learning_rate": 2.0696539485359363e-05,
"loss": 0.2408,
"step": 1573
},
{
"epoch": 1.8850299401197605,
"grad_norm": 0.2667080531795302,
"learning_rate": 2.067435669920142e-05,
"loss": 0.2684,
"step": 1574
},
{
"epoch": 1.8862275449101795,
"grad_norm": 0.18260947579712256,
"learning_rate": 2.065217391304348e-05,
"loss": 0.2532,
"step": 1575
},
{
"epoch": 1.887425149700599,
"grad_norm": 0.19449241821181001,
"learning_rate": 2.0629991126885538e-05,
"loss": 0.2405,
"step": 1576
},
{
"epoch": 1.888622754491018,
"grad_norm": 0.22324075636184026,
"learning_rate": 2.0607808340727597e-05,
"loss": 0.2557,
"step": 1577
},
{
"epoch": 1.889820359281437,
"grad_norm": 0.16926654790067655,
"learning_rate": 2.0585625554569655e-05,
"loss": 0.2553,
"step": 1578
},
{
"epoch": 1.8910179640718563,
"grad_norm": 0.21008925875648693,
"learning_rate": 2.0563442768411714e-05,
"loss": 0.2469,
"step": 1579
},
{
"epoch": 1.8922155688622755,
"grad_norm": 0.2013192771202742,
"learning_rate": 2.0541259982253772e-05,
"loss": 0.2726,
"step": 1580
},
{
"epoch": 1.8934131736526947,
"grad_norm": 0.1822996899995872,
"learning_rate": 2.051907719609583e-05,
"loss": 0.2449,
"step": 1581
},
{
"epoch": 1.8946107784431137,
"grad_norm": 0.1788804589142345,
"learning_rate": 2.049689440993789e-05,
"loss": 0.2485,
"step": 1582
},
{
"epoch": 1.895808383233533,
"grad_norm": 0.18625930679063463,
"learning_rate": 2.047471162377995e-05,
"loss": 0.2401,
"step": 1583
},
{
"epoch": 1.897005988023952,
"grad_norm": 0.20373233500054888,
"learning_rate": 2.0452528837622006e-05,
"loss": 0.2389,
"step": 1584
},
{
"epoch": 1.8982035928143712,
"grad_norm": 0.17374767646706657,
"learning_rate": 2.0430346051464064e-05,
"loss": 0.2492,
"step": 1585
},
{
"epoch": 1.8994011976047904,
"grad_norm": 0.18249819700694636,
"learning_rate": 2.0408163265306123e-05,
"loss": 0.2324,
"step": 1586
},
{
"epoch": 1.9005988023952096,
"grad_norm": 0.20079218950552427,
"learning_rate": 2.038598047914818e-05,
"loss": 0.2534,
"step": 1587
},
{
"epoch": 1.9017964071856288,
"grad_norm": 0.16672436848501518,
"learning_rate": 2.0363797692990243e-05,
"loss": 0.2438,
"step": 1588
},
{
"epoch": 1.9029940119760478,
"grad_norm": 0.17523557052570882,
"learning_rate": 2.0341614906832298e-05,
"loss": 0.2416,
"step": 1589
},
{
"epoch": 1.9041916167664672,
"grad_norm": 0.2153649720914859,
"learning_rate": 2.031943212067436e-05,
"loss": 0.268,
"step": 1590
},
{
"epoch": 1.9053892215568862,
"grad_norm": 0.16592616603210525,
"learning_rate": 2.0297249334516415e-05,
"loss": 0.2428,
"step": 1591
},
{
"epoch": 1.9065868263473054,
"grad_norm": 0.17067081543193777,
"learning_rate": 2.0275066548358473e-05,
"loss": 0.2541,
"step": 1592
},
{
"epoch": 1.9077844311377246,
"grad_norm": 0.1932729648175747,
"learning_rate": 2.0252883762200535e-05,
"loss": 0.2305,
"step": 1593
},
{
"epoch": 1.9089820359281438,
"grad_norm": 0.18160156050762483,
"learning_rate": 2.023070097604259e-05,
"loss": 0.2565,
"step": 1594
},
{
"epoch": 1.910179640718563,
"grad_norm": 0.19051553462672977,
"learning_rate": 2.0208518189884652e-05,
"loss": 0.2464,
"step": 1595
},
{
"epoch": 1.911377245508982,
"grad_norm": 0.18004042164857798,
"learning_rate": 2.0186335403726707e-05,
"loss": 0.2375,
"step": 1596
},
{
"epoch": 1.9125748502994013,
"grad_norm": 0.16941311892094726,
"learning_rate": 2.016415261756877e-05,
"loss": 0.2515,
"step": 1597
},
{
"epoch": 1.9137724550898203,
"grad_norm": 0.1854836659130977,
"learning_rate": 2.0141969831410827e-05,
"loss": 0.2501,
"step": 1598
},
{
"epoch": 1.9149700598802395,
"grad_norm": 0.18180452501644906,
"learning_rate": 2.0119787045252885e-05,
"loss": 0.2436,
"step": 1599
},
{
"epoch": 1.9161676646706587,
"grad_norm": 0.1819322757244242,
"learning_rate": 2.0097604259094944e-05,
"loss": 0.2473,
"step": 1600
},
{
"epoch": 1.9173652694610779,
"grad_norm": 0.18041726404499492,
"learning_rate": 2.0075421472937e-05,
"loss": 0.2572,
"step": 1601
},
{
"epoch": 1.918562874251497,
"grad_norm": 0.16914456781713486,
"learning_rate": 2.005323868677906e-05,
"loss": 0.2281,
"step": 1602
},
{
"epoch": 1.919760479041916,
"grad_norm": 0.16218608721177533,
"learning_rate": 2.003105590062112e-05,
"loss": 0.2135,
"step": 1603
},
{
"epoch": 1.9209580838323355,
"grad_norm": 0.17120409899492733,
"learning_rate": 2.0008873114463178e-05,
"loss": 0.2445,
"step": 1604
},
{
"epoch": 1.9221556886227544,
"grad_norm": 0.18076564772299986,
"learning_rate": 1.9986690328305236e-05,
"loss": 0.2407,
"step": 1605
},
{
"epoch": 1.9233532934131736,
"grad_norm": 0.1675682124124539,
"learning_rate": 1.9964507542147294e-05,
"loss": 0.2327,
"step": 1606
},
{
"epoch": 1.9245508982035928,
"grad_norm": 0.18665617420363045,
"learning_rate": 1.9942324755989353e-05,
"loss": 0.2626,
"step": 1607
},
{
"epoch": 1.925748502994012,
"grad_norm": 0.19037598898996505,
"learning_rate": 1.992014196983141e-05,
"loss": 0.2605,
"step": 1608
},
{
"epoch": 1.9269461077844312,
"grad_norm": 0.17035431670979614,
"learning_rate": 1.989795918367347e-05,
"loss": 0.2503,
"step": 1609
},
{
"epoch": 1.9281437125748502,
"grad_norm": 0.1704434409999629,
"learning_rate": 1.9875776397515528e-05,
"loss": 0.234,
"step": 1610
},
{
"epoch": 1.9293413173652696,
"grad_norm": 0.1973401793003133,
"learning_rate": 1.9853593611357587e-05,
"loss": 0.237,
"step": 1611
},
{
"epoch": 1.9305389221556886,
"grad_norm": 0.18478131254683355,
"learning_rate": 1.9831410825199645e-05,
"loss": 0.2383,
"step": 1612
},
{
"epoch": 1.9317365269461078,
"grad_norm": 0.17916483416517343,
"learning_rate": 1.9809228039041703e-05,
"loss": 0.2467,
"step": 1613
},
{
"epoch": 1.932934131736527,
"grad_norm": 0.18571377779948978,
"learning_rate": 1.9787045252883762e-05,
"loss": 0.2423,
"step": 1614
},
{
"epoch": 1.9341317365269461,
"grad_norm": 0.18082318228270175,
"learning_rate": 1.9764862466725824e-05,
"loss": 0.2425,
"step": 1615
},
{
"epoch": 1.9353293413173653,
"grad_norm": 0.15609259615006404,
"learning_rate": 1.974267968056788e-05,
"loss": 0.2446,
"step": 1616
},
{
"epoch": 1.9365269461077843,
"grad_norm": 0.19466138159479826,
"learning_rate": 1.972049689440994e-05,
"loss": 0.2572,
"step": 1617
},
{
"epoch": 1.9377245508982037,
"grad_norm": 0.2034944096837654,
"learning_rate": 1.9698314108252e-05,
"loss": 0.2516,
"step": 1618
},
{
"epoch": 1.9389221556886227,
"grad_norm": 0.16655263888371802,
"learning_rate": 1.9676131322094054e-05,
"loss": 0.2477,
"step": 1619
},
{
"epoch": 1.9401197604790419,
"grad_norm": 0.1810325796087433,
"learning_rate": 1.9653948535936116e-05,
"loss": 0.2682,
"step": 1620
},
{
"epoch": 1.941317365269461,
"grad_norm": 0.1697122320242675,
"learning_rate": 1.963176574977817e-05,
"loss": 0.2555,
"step": 1621
},
{
"epoch": 1.9425149700598803,
"grad_norm": 0.17763041982408748,
"learning_rate": 1.9609582963620233e-05,
"loss": 0.2402,
"step": 1622
},
{
"epoch": 1.9437125748502995,
"grad_norm": 0.18729098869021296,
"learning_rate": 1.958740017746229e-05,
"loss": 0.2616,
"step": 1623
},
{
"epoch": 1.9449101796407184,
"grad_norm": 0.18357548710339092,
"learning_rate": 1.956521739130435e-05,
"loss": 0.2651,
"step": 1624
},
{
"epoch": 1.9461077844311379,
"grad_norm": 0.19635311907273706,
"learning_rate": 1.9543034605146408e-05,
"loss": 0.2488,
"step": 1625
},
{
"epoch": 1.9473053892215568,
"grad_norm": 0.20420528846207664,
"learning_rate": 1.9520851818988466e-05,
"loss": 0.2434,
"step": 1626
},
{
"epoch": 1.948502994011976,
"grad_norm": 0.18234255653798404,
"learning_rate": 1.9498669032830525e-05,
"loss": 0.2466,
"step": 1627
},
{
"epoch": 1.9497005988023952,
"grad_norm": 0.20335444751343837,
"learning_rate": 1.9476486246672583e-05,
"loss": 0.2487,
"step": 1628
},
{
"epoch": 1.9508982035928144,
"grad_norm": 0.19623454809327115,
"learning_rate": 1.945430346051464e-05,
"loss": 0.2365,
"step": 1629
},
{
"epoch": 1.9520958083832336,
"grad_norm": 0.18402513386873937,
"learning_rate": 1.94321206743567e-05,
"loss": 0.2432,
"step": 1630
},
{
"epoch": 1.9532934131736526,
"grad_norm": 0.17048606368517186,
"learning_rate": 1.940993788819876e-05,
"loss": 0.2246,
"step": 1631
},
{
"epoch": 1.954491017964072,
"grad_norm": 0.2137835037217273,
"learning_rate": 1.9387755102040817e-05,
"loss": 0.2741,
"step": 1632
},
{
"epoch": 1.955688622754491,
"grad_norm": 0.18824149691910855,
"learning_rate": 1.9365572315882875e-05,
"loss": 0.2532,
"step": 1633
},
{
"epoch": 1.9568862275449102,
"grad_norm": 0.1778159908620853,
"learning_rate": 1.9343389529724934e-05,
"loss": 0.2166,
"step": 1634
},
{
"epoch": 1.9580838323353293,
"grad_norm": 0.1716353914035476,
"learning_rate": 1.9321206743566996e-05,
"loss": 0.2317,
"step": 1635
},
{
"epoch": 1.9592814371257485,
"grad_norm": 0.19157824307955132,
"learning_rate": 1.929902395740905e-05,
"loss": 0.2365,
"step": 1636
},
{
"epoch": 1.9604790419161677,
"grad_norm": 0.16813053145895274,
"learning_rate": 1.927684117125111e-05,
"loss": 0.2301,
"step": 1637
},
{
"epoch": 1.9616766467065867,
"grad_norm": 0.18350909241428445,
"learning_rate": 1.9254658385093167e-05,
"loss": 0.2535,
"step": 1638
},
{
"epoch": 1.9628742514970061,
"grad_norm": 0.18496898301310682,
"learning_rate": 1.9232475598935226e-05,
"loss": 0.2393,
"step": 1639
},
{
"epoch": 1.964071856287425,
"grad_norm": 0.1822435260086062,
"learning_rate": 1.9210292812777288e-05,
"loss": 0.252,
"step": 1640
},
{
"epoch": 1.9652694610778443,
"grad_norm": 0.15577389516356482,
"learning_rate": 1.9188110026619343e-05,
"loss": 0.2384,
"step": 1641
},
{
"epoch": 1.9664670658682635,
"grad_norm": 0.1696522149778908,
"learning_rate": 1.9165927240461405e-05,
"loss": 0.2394,
"step": 1642
},
{
"epoch": 1.9676646706586827,
"grad_norm": 0.16128630448863462,
"learning_rate": 1.914374445430346e-05,
"loss": 0.2378,
"step": 1643
},
{
"epoch": 1.9688622754491019,
"grad_norm": 0.19699548112496104,
"learning_rate": 1.912156166814552e-05,
"loss": 0.2539,
"step": 1644
},
{
"epoch": 1.9700598802395208,
"grad_norm": 0.19409006916703095,
"learning_rate": 1.909937888198758e-05,
"loss": 0.2548,
"step": 1645
},
{
"epoch": 1.9712574850299402,
"grad_norm": 0.16548982279777716,
"learning_rate": 1.9077196095829635e-05,
"loss": 0.2354,
"step": 1646
},
{
"epoch": 1.9724550898203592,
"grad_norm": 0.20035588625692033,
"learning_rate": 1.9055013309671697e-05,
"loss": 0.2383,
"step": 1647
},
{
"epoch": 1.9736526946107784,
"grad_norm": 0.164312126525316,
"learning_rate": 1.9032830523513752e-05,
"loss": 0.2425,
"step": 1648
},
{
"epoch": 1.9748502994011976,
"grad_norm": 0.17031514647993318,
"learning_rate": 1.9010647737355814e-05,
"loss": 0.2424,
"step": 1649
},
{
"epoch": 1.9760479041916168,
"grad_norm": 0.16243359539570779,
"learning_rate": 1.8988464951197872e-05,
"loss": 0.2546,
"step": 1650
},
{
"epoch": 1.977245508982036,
"grad_norm": 0.17449610642460203,
"learning_rate": 1.896628216503993e-05,
"loss": 0.2386,
"step": 1651
},
{
"epoch": 1.978443113772455,
"grad_norm": 0.15732816767266422,
"learning_rate": 1.894409937888199e-05,
"loss": 0.2423,
"step": 1652
},
{
"epoch": 1.9796407185628744,
"grad_norm": 0.15611806229022346,
"learning_rate": 1.8921916592724047e-05,
"loss": 0.2374,
"step": 1653
},
{
"epoch": 1.9808383233532934,
"grad_norm": 0.15271853161345586,
"learning_rate": 1.8899733806566106e-05,
"loss": 0.2333,
"step": 1654
},
{
"epoch": 1.9820359281437125,
"grad_norm": 0.2055491864071245,
"learning_rate": 1.8877551020408164e-05,
"loss": 0.2318,
"step": 1655
},
{
"epoch": 1.9832335329341317,
"grad_norm": 0.16762609009858068,
"learning_rate": 1.8855368234250223e-05,
"loss": 0.2508,
"step": 1656
},
{
"epoch": 1.984431137724551,
"grad_norm": 0.1656636356303819,
"learning_rate": 1.883318544809228e-05,
"loss": 0.2484,
"step": 1657
},
{
"epoch": 1.9856287425149701,
"grad_norm": 0.16779030746156495,
"learning_rate": 1.881100266193434e-05,
"loss": 0.2381,
"step": 1658
},
{
"epoch": 1.986826347305389,
"grad_norm": 0.1670589753903133,
"learning_rate": 1.8788819875776398e-05,
"loss": 0.2356,
"step": 1659
},
{
"epoch": 1.9880239520958085,
"grad_norm": 0.16592634165852785,
"learning_rate": 1.8766637089618456e-05,
"loss": 0.2385,
"step": 1660
},
{
"epoch": 1.9892215568862275,
"grad_norm": 0.16133368951919108,
"learning_rate": 1.8744454303460515e-05,
"loss": 0.2372,
"step": 1661
},
{
"epoch": 1.9904191616766467,
"grad_norm": 0.15608176541565158,
"learning_rate": 1.8722271517302576e-05,
"loss": 0.2315,
"step": 1662
},
{
"epoch": 1.9916167664670659,
"grad_norm": 0.17063414771297972,
"learning_rate": 1.870008873114463e-05,
"loss": 0.2695,
"step": 1663
},
{
"epoch": 1.992814371257485,
"grad_norm": 0.1622460161968529,
"learning_rate": 1.867790594498669e-05,
"loss": 0.2434,
"step": 1664
},
{
"epoch": 1.9940119760479043,
"grad_norm": 0.16871872128533147,
"learning_rate": 1.8655723158828752e-05,
"loss": 0.2438,
"step": 1665
},
{
"epoch": 1.9952095808383232,
"grad_norm": 0.1597345469320724,
"learning_rate": 1.8633540372670807e-05,
"loss": 0.2523,
"step": 1666
},
{
"epoch": 1.9964071856287426,
"grad_norm": 0.15506884385720063,
"learning_rate": 1.861135758651287e-05,
"loss": 0.2376,
"step": 1667
},
{
"epoch": 1.9976047904191616,
"grad_norm": 0.170451581262367,
"learning_rate": 1.8589174800354924e-05,
"loss": 0.2659,
"step": 1668
},
{
"epoch": 1.9988023952095808,
"grad_norm": 0.16720752610520792,
"learning_rate": 1.8566992014196985e-05,
"loss": 0.2474,
"step": 1669
},
{
"epoch": 2.0,
"grad_norm": 0.169648280328626,
"learning_rate": 1.8544809228039044e-05,
"loss": 0.2383,
"step": 1670
},
{
"epoch": 2.001197604790419,
"grad_norm": 0.24602861288291406,
"learning_rate": 1.8522626441881102e-05,
"loss": 0.1731,
"step": 1671
},
{
"epoch": 2.0023952095808384,
"grad_norm": 0.1847202417313519,
"learning_rate": 1.850044365572316e-05,
"loss": 0.1774,
"step": 1672
},
{
"epoch": 2.0035928143712574,
"grad_norm": 0.22479611181535628,
"learning_rate": 1.8478260869565216e-05,
"loss": 0.1724,
"step": 1673
},
{
"epoch": 2.0047904191616768,
"grad_norm": 0.23017195105103844,
"learning_rate": 1.8456078083407278e-05,
"loss": 0.177,
"step": 1674
},
{
"epoch": 2.0059880239520957,
"grad_norm": 0.1960870652743473,
"learning_rate": 1.8433895297249336e-05,
"loss": 0.1773,
"step": 1675
},
{
"epoch": 2.007185628742515,
"grad_norm": 0.24583908112051184,
"learning_rate": 1.8411712511091394e-05,
"loss": 0.1803,
"step": 1676
},
{
"epoch": 2.008383233532934,
"grad_norm": 0.23232740540417454,
"learning_rate": 1.8389529724933453e-05,
"loss": 0.1672,
"step": 1677
},
{
"epoch": 2.009580838323353,
"grad_norm": 0.1719628311713271,
"learning_rate": 1.836734693877551e-05,
"loss": 0.1774,
"step": 1678
},
{
"epoch": 2.0107784431137725,
"grad_norm": 0.21024043042503496,
"learning_rate": 1.834516415261757e-05,
"loss": 0.1623,
"step": 1679
},
{
"epoch": 2.0119760479041915,
"grad_norm": 0.437266408849283,
"learning_rate": 1.8322981366459628e-05,
"loss": 0.1706,
"step": 1680
},
{
"epoch": 2.013173652694611,
"grad_norm": 0.17589824133042045,
"learning_rate": 1.8300798580301687e-05,
"loss": 0.1635,
"step": 1681
},
{
"epoch": 2.01437125748503,
"grad_norm": 0.17506945142531075,
"learning_rate": 1.8278615794143745e-05,
"loss": 0.1824,
"step": 1682
},
{
"epoch": 2.0155688622754493,
"grad_norm": 0.1999910353930593,
"learning_rate": 1.8256433007985803e-05,
"loss": 0.1893,
"step": 1683
},
{
"epoch": 2.0167664670658683,
"grad_norm": 0.19940140537011744,
"learning_rate": 1.8234250221827862e-05,
"loss": 0.1807,
"step": 1684
},
{
"epoch": 2.0179640718562872,
"grad_norm": 0.18380285293480753,
"learning_rate": 1.821206743566992e-05,
"loss": 0.1638,
"step": 1685
},
{
"epoch": 2.0191616766467066,
"grad_norm": 0.17418629038206557,
"learning_rate": 1.818988464951198e-05,
"loss": 0.1806,
"step": 1686
},
{
"epoch": 2.0203592814371256,
"grad_norm": 0.18301427050078234,
"learning_rate": 1.816770186335404e-05,
"loss": 0.167,
"step": 1687
},
{
"epoch": 2.021556886227545,
"grad_norm": 0.2021729946420112,
"learning_rate": 1.8145519077196096e-05,
"loss": 0.1886,
"step": 1688
},
{
"epoch": 2.022754491017964,
"grad_norm": 0.20024546327503015,
"learning_rate": 1.8123336291038157e-05,
"loss": 0.1607,
"step": 1689
},
{
"epoch": 2.0239520958083834,
"grad_norm": 0.17076706438838463,
"learning_rate": 1.8101153504880212e-05,
"loss": 0.1741,
"step": 1690
},
{
"epoch": 2.0251497005988024,
"grad_norm": 0.20583870227902895,
"learning_rate": 1.807897071872227e-05,
"loss": 0.1641,
"step": 1691
},
{
"epoch": 2.0263473053892214,
"grad_norm": 0.1861619495402734,
"learning_rate": 1.8056787932564333e-05,
"loss": 0.1785,
"step": 1692
},
{
"epoch": 2.027544910179641,
"grad_norm": 0.16633700475159224,
"learning_rate": 1.8034605146406388e-05,
"loss": 0.1744,
"step": 1693
},
{
"epoch": 2.0287425149700598,
"grad_norm": 0.18198305394642028,
"learning_rate": 1.801242236024845e-05,
"loss": 0.1973,
"step": 1694
},
{
"epoch": 2.029940119760479,
"grad_norm": 0.17240197035921453,
"learning_rate": 1.7990239574090505e-05,
"loss": 0.1798,
"step": 1695
},
{
"epoch": 2.031137724550898,
"grad_norm": 0.1647892222441613,
"learning_rate": 1.7968056787932566e-05,
"loss": 0.169,
"step": 1696
},
{
"epoch": 2.0323353293413176,
"grad_norm": 0.1577395194578381,
"learning_rate": 1.7945874001774625e-05,
"loss": 0.171,
"step": 1697
},
{
"epoch": 2.0335329341317365,
"grad_norm": 0.16505264873565167,
"learning_rate": 1.7923691215616683e-05,
"loss": 0.1709,
"step": 1698
},
{
"epoch": 2.0347305389221555,
"grad_norm": 0.16788295411229234,
"learning_rate": 1.790150842945874e-05,
"loss": 0.1728,
"step": 1699
},
{
"epoch": 2.035928143712575,
"grad_norm": 0.17074381301479255,
"learning_rate": 1.7879325643300797e-05,
"loss": 0.176,
"step": 1700
},
{
"epoch": 2.037125748502994,
"grad_norm": 0.18163496392898934,
"learning_rate": 1.785714285714286e-05,
"loss": 0.1819,
"step": 1701
},
{
"epoch": 2.0383233532934133,
"grad_norm": 0.1636926628393607,
"learning_rate": 1.7834960070984917e-05,
"loss": 0.1743,
"step": 1702
},
{
"epoch": 2.0395209580838323,
"grad_norm": 0.18351895817399633,
"learning_rate": 1.7812777284826975e-05,
"loss": 0.1823,
"step": 1703
},
{
"epoch": 2.0407185628742517,
"grad_norm": 0.18236613701770232,
"learning_rate": 1.7790594498669034e-05,
"loss": 0.1752,
"step": 1704
},
{
"epoch": 2.0419161676646707,
"grad_norm": 0.17736179894166776,
"learning_rate": 1.7768411712511092e-05,
"loss": 0.1902,
"step": 1705
},
{
"epoch": 2.0431137724550896,
"grad_norm": 0.17097162429318324,
"learning_rate": 1.774622892635315e-05,
"loss": 0.1739,
"step": 1706
},
{
"epoch": 2.044311377245509,
"grad_norm": 0.1652148762312668,
"learning_rate": 1.772404614019521e-05,
"loss": 0.1614,
"step": 1707
},
{
"epoch": 2.045508982035928,
"grad_norm": 0.1674892483435149,
"learning_rate": 1.7701863354037267e-05,
"loss": 0.1781,
"step": 1708
},
{
"epoch": 2.0467065868263474,
"grad_norm": 0.17949302163368835,
"learning_rate": 1.7679680567879326e-05,
"loss": 0.184,
"step": 1709
},
{
"epoch": 2.0479041916167664,
"grad_norm": 0.17793716385110372,
"learning_rate": 1.7657497781721384e-05,
"loss": 0.1808,
"step": 1710
},
{
"epoch": 2.049101796407186,
"grad_norm": 0.1702837592042184,
"learning_rate": 1.7635314995563443e-05,
"loss": 0.1782,
"step": 1711
},
{
"epoch": 2.050299401197605,
"grad_norm": 0.18971281138385174,
"learning_rate": 1.7613132209405505e-05,
"loss": 0.2017,
"step": 1712
},
{
"epoch": 2.0514970059880238,
"grad_norm": 0.16206055500906585,
"learning_rate": 1.759094942324756e-05,
"loss": 0.1757,
"step": 1713
},
{
"epoch": 2.052694610778443,
"grad_norm": 0.17337441028929695,
"learning_rate": 1.756876663708962e-05,
"loss": 0.1761,
"step": 1714
},
{
"epoch": 2.053892215568862,
"grad_norm": 0.15278519066893706,
"learning_rate": 1.7546583850931676e-05,
"loss": 0.1645,
"step": 1715
},
{
"epoch": 2.0550898203592816,
"grad_norm": 0.17531277628506142,
"learning_rate": 1.7524401064773738e-05,
"loss": 0.1761,
"step": 1716
},
{
"epoch": 2.0562874251497005,
"grad_norm": 0.16195659933786363,
"learning_rate": 1.7502218278615797e-05,
"loss": 0.1607,
"step": 1717
},
{
"epoch": 2.05748502994012,
"grad_norm": 0.17653609299382456,
"learning_rate": 1.7480035492457852e-05,
"loss": 0.1628,
"step": 1718
},
{
"epoch": 2.058682634730539,
"grad_norm": 0.20345797858593823,
"learning_rate": 1.7457852706299914e-05,
"loss": 0.1739,
"step": 1719
},
{
"epoch": 2.059880239520958,
"grad_norm": 0.1572831043623081,
"learning_rate": 1.743566992014197e-05,
"loss": 0.1622,
"step": 1720
},
{
"epoch": 2.0610778443113773,
"grad_norm": 0.17723489481739582,
"learning_rate": 1.741348713398403e-05,
"loss": 0.1732,
"step": 1721
},
{
"epoch": 2.0622754491017963,
"grad_norm": 0.17209107185736075,
"learning_rate": 1.739130434782609e-05,
"loss": 0.1928,
"step": 1722
},
{
"epoch": 2.0634730538922157,
"grad_norm": 0.18872022891001053,
"learning_rate": 1.7369121561668147e-05,
"loss": 0.1755,
"step": 1723
},
{
"epoch": 2.0646706586826347,
"grad_norm": 0.1672630013902874,
"learning_rate": 1.7346938775510206e-05,
"loss": 0.1645,
"step": 1724
},
{
"epoch": 2.065868263473054,
"grad_norm": 0.16004807108756025,
"learning_rate": 1.7324755989352264e-05,
"loss": 0.1581,
"step": 1725
},
{
"epoch": 2.067065868263473,
"grad_norm": 0.1680912338450365,
"learning_rate": 1.7302573203194323e-05,
"loss": 0.1971,
"step": 1726
},
{
"epoch": 2.068263473053892,
"grad_norm": 0.17602136546727032,
"learning_rate": 1.728039041703638e-05,
"loss": 0.1877,
"step": 1727
},
{
"epoch": 2.0694610778443114,
"grad_norm": 0.1652550404010507,
"learning_rate": 1.725820763087844e-05,
"loss": 0.1825,
"step": 1728
},
{
"epoch": 2.0706586826347304,
"grad_norm": 0.16393622680987244,
"learning_rate": 1.7236024844720498e-05,
"loss": 0.1767,
"step": 1729
},
{
"epoch": 2.07185628742515,
"grad_norm": 0.17397022225241615,
"learning_rate": 1.7213842058562556e-05,
"loss": 0.1722,
"step": 1730
},
{
"epoch": 2.073053892215569,
"grad_norm": 0.16741492548190193,
"learning_rate": 1.7191659272404615e-05,
"loss": 0.172,
"step": 1731
},
{
"epoch": 2.074251497005988,
"grad_norm": 0.1732810732917652,
"learning_rate": 1.7169476486246673e-05,
"loss": 0.1798,
"step": 1732
},
{
"epoch": 2.075449101796407,
"grad_norm": 0.17242006679800226,
"learning_rate": 1.714729370008873e-05,
"loss": 0.172,
"step": 1733
},
{
"epoch": 2.076646706586826,
"grad_norm": 0.2017021237394758,
"learning_rate": 1.712511091393079e-05,
"loss": 0.1748,
"step": 1734
},
{
"epoch": 2.0778443113772456,
"grad_norm": 0.21432249847529442,
"learning_rate": 1.710292812777285e-05,
"loss": 0.1804,
"step": 1735
},
{
"epoch": 2.0790419161676645,
"grad_norm": 0.16938272085253153,
"learning_rate": 1.7080745341614907e-05,
"loss": 0.1628,
"step": 1736
},
{
"epoch": 2.080239520958084,
"grad_norm": 0.18135901378783964,
"learning_rate": 1.7058562555456965e-05,
"loss": 0.1729,
"step": 1737
},
{
"epoch": 2.081437125748503,
"grad_norm": 0.1864554305898938,
"learning_rate": 1.7036379769299024e-05,
"loss": 0.1577,
"step": 1738
},
{
"epoch": 2.0826347305389223,
"grad_norm": 0.15544394903864064,
"learning_rate": 1.7014196983141085e-05,
"loss": 0.1697,
"step": 1739
},
{
"epoch": 2.0838323353293413,
"grad_norm": 0.1733850712308992,
"learning_rate": 1.699201419698314e-05,
"loss": 0.1795,
"step": 1740
},
{
"epoch": 2.0850299401197603,
"grad_norm": 0.1579873495276183,
"learning_rate": 1.6969831410825202e-05,
"loss": 0.1659,
"step": 1741
},
{
"epoch": 2.0862275449101797,
"grad_norm": 0.16488084633770964,
"learning_rate": 1.6947648624667257e-05,
"loss": 0.1761,
"step": 1742
},
{
"epoch": 2.0874251497005987,
"grad_norm": 0.17356593456308556,
"learning_rate": 1.6925465838509316e-05,
"loss": 0.1713,
"step": 1743
},
{
"epoch": 2.088622754491018,
"grad_norm": 0.17022326766163998,
"learning_rate": 1.6903283052351378e-05,
"loss": 0.1759,
"step": 1744
},
{
"epoch": 2.089820359281437,
"grad_norm": 0.1651426384553345,
"learning_rate": 1.6881100266193433e-05,
"loss": 0.1764,
"step": 1745
},
{
"epoch": 2.0910179640718565,
"grad_norm": 0.17530127708448454,
"learning_rate": 1.6858917480035494e-05,
"loss": 0.1767,
"step": 1746
},
{
"epoch": 2.0922155688622754,
"grad_norm": 0.17077328107484852,
"learning_rate": 1.683673469387755e-05,
"loss": 0.1713,
"step": 1747
},
{
"epoch": 2.0934131736526944,
"grad_norm": 0.2764463951983248,
"learning_rate": 1.681455190771961e-05,
"loss": 0.1913,
"step": 1748
},
{
"epoch": 2.094610778443114,
"grad_norm": 0.17318677964889367,
"learning_rate": 1.679236912156167e-05,
"loss": 0.1712,
"step": 1749
},
{
"epoch": 2.095808383233533,
"grad_norm": 0.17241351061966365,
"learning_rate": 1.6770186335403728e-05,
"loss": 0.1718,
"step": 1750
},
{
"epoch": 2.097005988023952,
"grad_norm": 0.17329728691430654,
"learning_rate": 1.6748003549245787e-05,
"loss": 0.174,
"step": 1751
},
{
"epoch": 2.098203592814371,
"grad_norm": 0.16102321333902028,
"learning_rate": 1.672582076308784e-05,
"loss": 0.1761,
"step": 1752
},
{
"epoch": 2.0994011976047906,
"grad_norm": 0.17201040957264374,
"learning_rate": 1.6703637976929903e-05,
"loss": 0.1743,
"step": 1753
},
{
"epoch": 2.1005988023952096,
"grad_norm": 0.15981277786010864,
"learning_rate": 1.6681455190771962e-05,
"loss": 0.1842,
"step": 1754
},
{
"epoch": 2.1017964071856285,
"grad_norm": 0.17415256085755168,
"learning_rate": 1.665927240461402e-05,
"loss": 0.1641,
"step": 1755
},
{
"epoch": 2.102994011976048,
"grad_norm": 0.15454902442851245,
"learning_rate": 1.663708961845608e-05,
"loss": 0.1568,
"step": 1756
},
{
"epoch": 2.104191616766467,
"grad_norm": 0.15827996259491595,
"learning_rate": 1.6614906832298137e-05,
"loss": 0.1794,
"step": 1757
},
{
"epoch": 2.1053892215568863,
"grad_norm": 0.17246445668637111,
"learning_rate": 1.6592724046140196e-05,
"loss": 0.176,
"step": 1758
},
{
"epoch": 2.1065868263473053,
"grad_norm": 0.17414794874494086,
"learning_rate": 1.6570541259982257e-05,
"loss": 0.1923,
"step": 1759
},
{
"epoch": 2.1077844311377247,
"grad_norm": 0.16716400479872845,
"learning_rate": 1.6548358473824312e-05,
"loss": 0.1763,
"step": 1760
},
{
"epoch": 2.1089820359281437,
"grad_norm": 0.17100107642675533,
"learning_rate": 1.652617568766637e-05,
"loss": 0.1728,
"step": 1761
},
{
"epoch": 2.1101796407185627,
"grad_norm": 0.175250595631819,
"learning_rate": 1.650399290150843e-05,
"loss": 0.1931,
"step": 1762
},
{
"epoch": 2.111377245508982,
"grad_norm": 0.15483799385805108,
"learning_rate": 1.6481810115350488e-05,
"loss": 0.1497,
"step": 1763
},
{
"epoch": 2.112574850299401,
"grad_norm": 0.16549298166358825,
"learning_rate": 1.645962732919255e-05,
"loss": 0.1637,
"step": 1764
},
{
"epoch": 2.1137724550898205,
"grad_norm": 0.18346015876012947,
"learning_rate": 1.6437444543034605e-05,
"loss": 0.1781,
"step": 1765
},
{
"epoch": 2.1149700598802395,
"grad_norm": 0.17509472631274445,
"learning_rate": 1.6415261756876666e-05,
"loss": 0.1806,
"step": 1766
},
{
"epoch": 2.116167664670659,
"grad_norm": 0.1608699048065402,
"learning_rate": 1.639307897071872e-05,
"loss": 0.1765,
"step": 1767
},
{
"epoch": 2.117365269461078,
"grad_norm": 0.16012492230047937,
"learning_rate": 1.6370896184560783e-05,
"loss": 0.174,
"step": 1768
},
{
"epoch": 2.118562874251497,
"grad_norm": 0.15299186823381192,
"learning_rate": 1.634871339840284e-05,
"loss": 0.1583,
"step": 1769
},
{
"epoch": 2.1197604790419162,
"grad_norm": 0.1654251835070233,
"learning_rate": 1.6326530612244897e-05,
"loss": 0.1714,
"step": 1770
},
{
"epoch": 2.120958083832335,
"grad_norm": 0.17188684877551677,
"learning_rate": 1.630434782608696e-05,
"loss": 0.1693,
"step": 1771
},
{
"epoch": 2.1221556886227546,
"grad_norm": 0.16095757053039436,
"learning_rate": 1.6282165039929014e-05,
"loss": 0.178,
"step": 1772
},
{
"epoch": 2.1233532934131736,
"grad_norm": 0.16851670389343842,
"learning_rate": 1.6259982253771075e-05,
"loss": 0.1778,
"step": 1773
},
{
"epoch": 2.124550898203593,
"grad_norm": 0.17974997290843128,
"learning_rate": 1.6237799467613134e-05,
"loss": 0.1796,
"step": 1774
},
{
"epoch": 2.125748502994012,
"grad_norm": 0.18156932716645607,
"learning_rate": 1.6215616681455192e-05,
"loss": 0.1784,
"step": 1775
},
{
"epoch": 2.126946107784431,
"grad_norm": 0.1668709913303963,
"learning_rate": 1.619343389529725e-05,
"loss": 0.177,
"step": 1776
},
{
"epoch": 2.1281437125748504,
"grad_norm": 0.16449051895800842,
"learning_rate": 1.617125110913931e-05,
"loss": 0.1615,
"step": 1777
},
{
"epoch": 2.1293413173652693,
"grad_norm": 0.16486945990669113,
"learning_rate": 1.6149068322981367e-05,
"loss": 0.1702,
"step": 1778
},
{
"epoch": 2.1305389221556887,
"grad_norm": 0.16468850784645975,
"learning_rate": 1.6126885536823426e-05,
"loss": 0.173,
"step": 1779
},
{
"epoch": 2.1317365269461077,
"grad_norm": 0.1706682153593111,
"learning_rate": 1.6104702750665484e-05,
"loss": 0.1768,
"step": 1780
},
{
"epoch": 2.132934131736527,
"grad_norm": 0.17333156127785485,
"learning_rate": 1.6082519964507543e-05,
"loss": 0.1771,
"step": 1781
},
{
"epoch": 2.134131736526946,
"grad_norm": 0.1609851987155438,
"learning_rate": 1.60603371783496e-05,
"loss": 0.1683,
"step": 1782
},
{
"epoch": 2.135329341317365,
"grad_norm": 0.17592875960233392,
"learning_rate": 1.603815439219166e-05,
"loss": 0.1653,
"step": 1783
},
{
"epoch": 2.1365269461077845,
"grad_norm": 0.1776454632567326,
"learning_rate": 1.6015971606033718e-05,
"loss": 0.1784,
"step": 1784
},
{
"epoch": 2.1377245508982035,
"grad_norm": 0.15896389908017516,
"learning_rate": 1.5993788819875776e-05,
"loss": 0.1652,
"step": 1785
},
{
"epoch": 2.138922155688623,
"grad_norm": 0.17856246602700412,
"learning_rate": 1.5971606033717838e-05,
"loss": 0.172,
"step": 1786
},
{
"epoch": 2.140119760479042,
"grad_norm": 0.20415842472282394,
"learning_rate": 1.5949423247559893e-05,
"loss": 0.1837,
"step": 1787
},
{
"epoch": 2.1413173652694613,
"grad_norm": 0.1658468987641765,
"learning_rate": 1.5927240461401952e-05,
"loss": 0.1754,
"step": 1788
},
{
"epoch": 2.1425149700598802,
"grad_norm": 0.1711592518639678,
"learning_rate": 1.590505767524401e-05,
"loss": 0.1782,
"step": 1789
},
{
"epoch": 2.143712574850299,
"grad_norm": 0.15532043198504308,
"learning_rate": 1.588287488908607e-05,
"loss": 0.1635,
"step": 1790
},
{
"epoch": 2.1449101796407186,
"grad_norm": 0.16288143949832154,
"learning_rate": 1.586069210292813e-05,
"loss": 0.1878,
"step": 1791
},
{
"epoch": 2.1461077844311376,
"grad_norm": 0.17266015234969334,
"learning_rate": 1.5838509316770185e-05,
"loss": 0.1792,
"step": 1792
},
{
"epoch": 2.147305389221557,
"grad_norm": 0.15855429487969439,
"learning_rate": 1.5816326530612247e-05,
"loss": 0.1722,
"step": 1793
},
{
"epoch": 2.148502994011976,
"grad_norm": 0.16097354748354664,
"learning_rate": 1.5794143744454302e-05,
"loss": 0.1693,
"step": 1794
},
{
"epoch": 2.1497005988023954,
"grad_norm": 0.16702387356466733,
"learning_rate": 1.5771960958296364e-05,
"loss": 0.186,
"step": 1795
},
{
"epoch": 2.1508982035928144,
"grad_norm": 0.16695192113622337,
"learning_rate": 1.5749778172138423e-05,
"loss": 0.1677,
"step": 1796
},
{
"epoch": 2.1520958083832333,
"grad_norm": 0.16521690287819452,
"learning_rate": 1.5727595385980478e-05,
"loss": 0.1638,
"step": 1797
},
{
"epoch": 2.1532934131736527,
"grad_norm": 0.16117397684957088,
"learning_rate": 1.570541259982254e-05,
"loss": 0.176,
"step": 1798
},
{
"epoch": 2.1544910179640717,
"grad_norm": 0.17113003933265034,
"learning_rate": 1.5683229813664594e-05,
"loss": 0.1779,
"step": 1799
},
{
"epoch": 2.155688622754491,
"grad_norm": 0.15573135950095388,
"learning_rate": 1.5661047027506656e-05,
"loss": 0.1759,
"step": 1800
},
{
"epoch": 2.15688622754491,
"grad_norm": 0.1629884518620977,
"learning_rate": 1.5638864241348715e-05,
"loss": 0.1748,
"step": 1801
},
{
"epoch": 2.1580838323353295,
"grad_norm": 0.15504384157784964,
"learning_rate": 1.5616681455190773e-05,
"loss": 0.1713,
"step": 1802
},
{
"epoch": 2.1592814371257485,
"grad_norm": 0.17119498348220705,
"learning_rate": 1.559449866903283e-05,
"loss": 0.1818,
"step": 1803
},
{
"epoch": 2.1604790419161675,
"grad_norm": 0.1638464267373224,
"learning_rate": 1.557231588287489e-05,
"loss": 0.1587,
"step": 1804
},
{
"epoch": 2.161676646706587,
"grad_norm": 0.1616425382662505,
"learning_rate": 1.555013309671695e-05,
"loss": 0.1756,
"step": 1805
},
{
"epoch": 2.162874251497006,
"grad_norm": 0.18075732034696607,
"learning_rate": 1.5527950310559007e-05,
"loss": 0.1697,
"step": 1806
},
{
"epoch": 2.1640718562874253,
"grad_norm": 0.16686513546122766,
"learning_rate": 1.5505767524401065e-05,
"loss": 0.183,
"step": 1807
},
{
"epoch": 2.1652694610778442,
"grad_norm": 0.17808274382373684,
"learning_rate": 1.5483584738243124e-05,
"loss": 0.1912,
"step": 1808
},
{
"epoch": 2.1664670658682637,
"grad_norm": 0.1601103782432847,
"learning_rate": 1.5461401952085182e-05,
"loss": 0.1651,
"step": 1809
},
{
"epoch": 2.1676646706586826,
"grad_norm": 0.16042481581213186,
"learning_rate": 1.543921916592724e-05,
"loss": 0.174,
"step": 1810
},
{
"epoch": 2.1688622754491016,
"grad_norm": 0.19323341819230153,
"learning_rate": 1.5417036379769302e-05,
"loss": 0.1813,
"step": 1811
},
{
"epoch": 2.170059880239521,
"grad_norm": 0.1633069168014639,
"learning_rate": 1.5394853593611357e-05,
"loss": 0.1574,
"step": 1812
},
{
"epoch": 2.17125748502994,
"grad_norm": 0.17258392485547666,
"learning_rate": 1.537267080745342e-05,
"loss": 0.1764,
"step": 1813
},
{
"epoch": 2.1724550898203594,
"grad_norm": 0.16335821422160382,
"learning_rate": 1.5350488021295474e-05,
"loss": 0.1665,
"step": 1814
},
{
"epoch": 2.1736526946107784,
"grad_norm": 0.1660483501685823,
"learning_rate": 1.5328305235137533e-05,
"loss": 0.1703,
"step": 1815
},
{
"epoch": 2.174850299401198,
"grad_norm": 0.16727750281976264,
"learning_rate": 1.5306122448979594e-05,
"loss": 0.1769,
"step": 1816
},
{
"epoch": 2.1760479041916168,
"grad_norm": 0.19123777364040392,
"learning_rate": 1.528393966282165e-05,
"loss": 0.1836,
"step": 1817
},
{
"epoch": 2.1772455089820357,
"grad_norm": 0.17536133871005105,
"learning_rate": 1.526175687666371e-05,
"loss": 0.1722,
"step": 1818
},
{
"epoch": 2.178443113772455,
"grad_norm": 0.18150364011021858,
"learning_rate": 1.5239574090505768e-05,
"loss": 0.1789,
"step": 1819
},
{
"epoch": 2.179640718562874,
"grad_norm": 0.17164899410716603,
"learning_rate": 1.5217391304347828e-05,
"loss": 0.1751,
"step": 1820
},
{
"epoch": 2.1808383233532935,
"grad_norm": 0.16754569363837338,
"learning_rate": 1.5195208518189885e-05,
"loss": 0.1665,
"step": 1821
},
{
"epoch": 2.1820359281437125,
"grad_norm": 0.15952636050303987,
"learning_rate": 1.5173025732031945e-05,
"loss": 0.1595,
"step": 1822
},
{
"epoch": 2.183233532934132,
"grad_norm": 0.1592524685759855,
"learning_rate": 1.5150842945874002e-05,
"loss": 0.1634,
"step": 1823
},
{
"epoch": 2.184431137724551,
"grad_norm": 0.3329848539613615,
"learning_rate": 1.512866015971606e-05,
"loss": 0.1844,
"step": 1824
},
{
"epoch": 2.18562874251497,
"grad_norm": 0.16411980714980548,
"learning_rate": 1.510647737355812e-05,
"loss": 0.1722,
"step": 1825
},
{
"epoch": 2.1868263473053893,
"grad_norm": 0.17195868614649903,
"learning_rate": 1.5084294587400177e-05,
"loss": 0.1696,
"step": 1826
},
{
"epoch": 2.1880239520958082,
"grad_norm": 0.16267953614733902,
"learning_rate": 1.5062111801242237e-05,
"loss": 0.1707,
"step": 1827
},
{
"epoch": 2.1892215568862277,
"grad_norm": 0.17329509061905768,
"learning_rate": 1.5039929015084296e-05,
"loss": 0.1762,
"step": 1828
},
{
"epoch": 2.1904191616766466,
"grad_norm": 0.16299541770738152,
"learning_rate": 1.5017746228926356e-05,
"loss": 0.1914,
"step": 1829
},
{
"epoch": 2.191616766467066,
"grad_norm": 0.16763496295932126,
"learning_rate": 1.4995563442768412e-05,
"loss": 0.1654,
"step": 1830
},
{
"epoch": 2.192814371257485,
"grad_norm": 0.17325818906983345,
"learning_rate": 1.4973380656610473e-05,
"loss": 0.1605,
"step": 1831
},
{
"epoch": 2.194011976047904,
"grad_norm": 0.17330713593945332,
"learning_rate": 1.495119787045253e-05,
"loss": 0.1809,
"step": 1832
},
{
"epoch": 2.1952095808383234,
"grad_norm": 0.150414368756861,
"learning_rate": 1.4929015084294588e-05,
"loss": 0.1552,
"step": 1833
},
{
"epoch": 2.1964071856287424,
"grad_norm": 0.16591258142898066,
"learning_rate": 1.4906832298136648e-05,
"loss": 0.1809,
"step": 1834
},
{
"epoch": 2.197604790419162,
"grad_norm": 0.18631296509920361,
"learning_rate": 1.4884649511978705e-05,
"loss": 0.1702,
"step": 1835
},
{
"epoch": 2.1988023952095808,
"grad_norm": 0.17130856354188903,
"learning_rate": 1.4862466725820765e-05,
"loss": 0.176,
"step": 1836
},
{
"epoch": 2.2,
"grad_norm": 0.15780251491516872,
"learning_rate": 1.4840283939662821e-05,
"loss": 0.1748,
"step": 1837
},
{
"epoch": 2.201197604790419,
"grad_norm": 0.1694165913905079,
"learning_rate": 1.4818101153504882e-05,
"loss": 0.1841,
"step": 1838
},
{
"epoch": 2.202395209580838,
"grad_norm": 0.18301028014543377,
"learning_rate": 1.479591836734694e-05,
"loss": 0.1703,
"step": 1839
},
{
"epoch": 2.2035928143712575,
"grad_norm": 0.17232321254745694,
"learning_rate": 1.4773735581189e-05,
"loss": 0.1722,
"step": 1840
},
{
"epoch": 2.2047904191616765,
"grad_norm": 0.17099663434095405,
"learning_rate": 1.4751552795031057e-05,
"loss": 0.1665,
"step": 1841
},
{
"epoch": 2.205988023952096,
"grad_norm": 0.1528881804872003,
"learning_rate": 1.4729370008873114e-05,
"loss": 0.167,
"step": 1842
},
{
"epoch": 2.207185628742515,
"grad_norm": 0.1588373249593002,
"learning_rate": 1.4707187222715174e-05,
"loss": 0.1591,
"step": 1843
},
{
"epoch": 2.2083832335329343,
"grad_norm": 0.15824829101502366,
"learning_rate": 1.4685004436557232e-05,
"loss": 0.166,
"step": 1844
},
{
"epoch": 2.2095808383233533,
"grad_norm": 0.16034107306884154,
"learning_rate": 1.4662821650399292e-05,
"loss": 0.1695,
"step": 1845
},
{
"epoch": 2.2107784431137723,
"grad_norm": 0.18546417076742658,
"learning_rate": 1.4640638864241349e-05,
"loss": 0.1622,
"step": 1846
},
{
"epoch": 2.2119760479041917,
"grad_norm": 0.184351990182473,
"learning_rate": 1.4618456078083409e-05,
"loss": 0.1833,
"step": 1847
},
{
"epoch": 2.2131736526946106,
"grad_norm": 0.1790883010405989,
"learning_rate": 1.4596273291925466e-05,
"loss": 0.1914,
"step": 1848
},
{
"epoch": 2.21437125748503,
"grad_norm": 0.16417637355982748,
"learning_rate": 1.4574090505767526e-05,
"loss": 0.1765,
"step": 1849
},
{
"epoch": 2.215568862275449,
"grad_norm": 0.16349668462428069,
"learning_rate": 1.4551907719609584e-05,
"loss": 0.1607,
"step": 1850
},
{
"epoch": 2.2167664670658684,
"grad_norm": 0.16821674209251478,
"learning_rate": 1.4529724933451641e-05,
"loss": 0.1731,
"step": 1851
},
{
"epoch": 2.2179640718562874,
"grad_norm": 0.183780882816518,
"learning_rate": 1.4507542147293701e-05,
"loss": 0.1862,
"step": 1852
},
{
"epoch": 2.2191616766467064,
"grad_norm": 0.1634034282708209,
"learning_rate": 1.4485359361135758e-05,
"loss": 0.1642,
"step": 1853
},
{
"epoch": 2.220359281437126,
"grad_norm": 0.1616751410300635,
"learning_rate": 1.4463176574977818e-05,
"loss": 0.1617,
"step": 1854
},
{
"epoch": 2.2215568862275448,
"grad_norm": 0.16902614803415386,
"learning_rate": 1.4440993788819876e-05,
"loss": 0.1782,
"step": 1855
},
{
"epoch": 2.222754491017964,
"grad_norm": 0.16359118800074995,
"learning_rate": 1.4418811002661937e-05,
"loss": 0.1674,
"step": 1856
},
{
"epoch": 2.223952095808383,
"grad_norm": 0.16111628508560094,
"learning_rate": 1.4396628216503993e-05,
"loss": 0.1699,
"step": 1857
},
{
"epoch": 2.2251497005988026,
"grad_norm": 0.1589893734693367,
"learning_rate": 1.4374445430346053e-05,
"loss": 0.1501,
"step": 1858
},
{
"epoch": 2.2263473053892215,
"grad_norm": 0.1873214366427537,
"learning_rate": 1.435226264418811e-05,
"loss": 0.1747,
"step": 1859
},
{
"epoch": 2.2275449101796405,
"grad_norm": 0.16299321933949412,
"learning_rate": 1.4330079858030169e-05,
"loss": 0.168,
"step": 1860
},
{
"epoch": 2.22874251497006,
"grad_norm": 0.1694149425130483,
"learning_rate": 1.4307897071872229e-05,
"loss": 0.1631,
"step": 1861
},
{
"epoch": 2.229940119760479,
"grad_norm": 0.1629750987711165,
"learning_rate": 1.4285714285714285e-05,
"loss": 0.1898,
"step": 1862
},
{
"epoch": 2.2311377245508983,
"grad_norm": 0.1711073418935274,
"learning_rate": 1.4263531499556346e-05,
"loss": 0.1847,
"step": 1863
},
{
"epoch": 2.2323353293413173,
"grad_norm": 0.1742008093990278,
"learning_rate": 1.4241348713398402e-05,
"loss": 0.1724,
"step": 1864
},
{
"epoch": 2.2335329341317367,
"grad_norm": 0.16866245867579907,
"learning_rate": 1.4219165927240462e-05,
"loss": 0.1889,
"step": 1865
},
{
"epoch": 2.2347305389221557,
"grad_norm": 0.15653877648803938,
"learning_rate": 1.419698314108252e-05,
"loss": 0.1731,
"step": 1866
},
{
"epoch": 2.2359281437125746,
"grad_norm": 0.16274882875978816,
"learning_rate": 1.4174800354924581e-05,
"loss": 0.1753,
"step": 1867
},
{
"epoch": 2.237125748502994,
"grad_norm": 0.18856325246079353,
"learning_rate": 1.4152617568766638e-05,
"loss": 0.1908,
"step": 1868
},
{
"epoch": 2.238323353293413,
"grad_norm": 0.16910483816041694,
"learning_rate": 1.4130434782608694e-05,
"loss": 0.1796,
"step": 1869
},
{
"epoch": 2.2395209580838324,
"grad_norm": 0.16171790353149806,
"learning_rate": 1.4108251996450755e-05,
"loss": 0.167,
"step": 1870
},
{
"epoch": 2.2407185628742514,
"grad_norm": 0.16960049226343624,
"learning_rate": 1.4086069210292813e-05,
"loss": 0.1777,
"step": 1871
},
{
"epoch": 2.241916167664671,
"grad_norm": 0.15518951439788647,
"learning_rate": 1.4063886424134873e-05,
"loss": 0.1736,
"step": 1872
},
{
"epoch": 2.24311377245509,
"grad_norm": 0.17081982503434384,
"learning_rate": 1.404170363797693e-05,
"loss": 0.1675,
"step": 1873
},
{
"epoch": 2.2443113772455088,
"grad_norm": 0.16539440664381377,
"learning_rate": 1.401952085181899e-05,
"loss": 0.1774,
"step": 1874
},
{
"epoch": 2.245508982035928,
"grad_norm": 0.17144681977169038,
"learning_rate": 1.3997338065661048e-05,
"loss": 0.1839,
"step": 1875
},
{
"epoch": 2.246706586826347,
"grad_norm": 0.15951696980779848,
"learning_rate": 1.3975155279503105e-05,
"loss": 0.1652,
"step": 1876
},
{
"epoch": 2.2479041916167666,
"grad_norm": 0.16636513670032504,
"learning_rate": 1.3952972493345165e-05,
"loss": 0.1783,
"step": 1877
},
{
"epoch": 2.2491017964071855,
"grad_norm": 0.1539192570759403,
"learning_rate": 1.3930789707187222e-05,
"loss": 0.1558,
"step": 1878
},
{
"epoch": 2.250299401197605,
"grad_norm": 0.16336983120886017,
"learning_rate": 1.3908606921029282e-05,
"loss": 0.1693,
"step": 1879
},
{
"epoch": 2.251497005988024,
"grad_norm": 0.19795793593873065,
"learning_rate": 1.388642413487134e-05,
"loss": 0.1886,
"step": 1880
},
{
"epoch": 2.252694610778443,
"grad_norm": 0.1669047727407888,
"learning_rate": 1.38642413487134e-05,
"loss": 0.1652,
"step": 1881
},
{
"epoch": 2.2538922155688623,
"grad_norm": 0.1703446937886893,
"learning_rate": 1.3842058562555457e-05,
"loss": 0.1793,
"step": 1882
},
{
"epoch": 2.2550898203592813,
"grad_norm": 0.16197951062912289,
"learning_rate": 1.3819875776397517e-05,
"loss": 0.1675,
"step": 1883
},
{
"epoch": 2.2562874251497007,
"grad_norm": 0.15507817433603593,
"learning_rate": 1.3797692990239574e-05,
"loss": 0.1792,
"step": 1884
},
{
"epoch": 2.2574850299401197,
"grad_norm": 0.16389644404136963,
"learning_rate": 1.3775510204081633e-05,
"loss": 0.1579,
"step": 1885
},
{
"epoch": 2.258682634730539,
"grad_norm": 0.1796978138988881,
"learning_rate": 1.3753327417923693e-05,
"loss": 0.1825,
"step": 1886
},
{
"epoch": 2.259880239520958,
"grad_norm": 0.1637487329834389,
"learning_rate": 1.373114463176575e-05,
"loss": 0.1681,
"step": 1887
},
{
"epoch": 2.2610778443113775,
"grad_norm": 0.1653821350692558,
"learning_rate": 1.370896184560781e-05,
"loss": 0.175,
"step": 1888
},
{
"epoch": 2.2622754491017965,
"grad_norm": 0.16717531860336354,
"learning_rate": 1.3686779059449866e-05,
"loss": 0.171,
"step": 1889
},
{
"epoch": 2.2634730538922154,
"grad_norm": 0.1721478053538159,
"learning_rate": 1.3664596273291926e-05,
"loss": 0.1892,
"step": 1890
},
{
"epoch": 2.264670658682635,
"grad_norm": 0.17230321081084915,
"learning_rate": 1.3642413487133985e-05,
"loss": 0.1812,
"step": 1891
},
{
"epoch": 2.265868263473054,
"grad_norm": 0.16796075389717502,
"learning_rate": 1.3620230700976045e-05,
"loss": 0.1772,
"step": 1892
},
{
"epoch": 2.2670658682634732,
"grad_norm": 0.16524074993154872,
"learning_rate": 1.3598047914818102e-05,
"loss": 0.1762,
"step": 1893
},
{
"epoch": 2.268263473053892,
"grad_norm": 0.18087293004485067,
"learning_rate": 1.3575865128660158e-05,
"loss": 0.1881,
"step": 1894
},
{
"epoch": 2.269461077844311,
"grad_norm": 0.17397596261444465,
"learning_rate": 1.3553682342502219e-05,
"loss": 0.1892,
"step": 1895
},
{
"epoch": 2.2706586826347306,
"grad_norm": 0.16869266820030404,
"learning_rate": 1.3531499556344277e-05,
"loss": 0.1914,
"step": 1896
},
{
"epoch": 2.2718562874251496,
"grad_norm": 0.18178141864300176,
"learning_rate": 1.3509316770186337e-05,
"loss": 0.1842,
"step": 1897
},
{
"epoch": 2.273053892215569,
"grad_norm": 0.17575907724352902,
"learning_rate": 1.3487133984028394e-05,
"loss": 0.1835,
"step": 1898
},
{
"epoch": 2.274251497005988,
"grad_norm": 0.16370074718590905,
"learning_rate": 1.3464951197870454e-05,
"loss": 0.1737,
"step": 1899
},
{
"epoch": 2.2754491017964074,
"grad_norm": 0.15599935386993632,
"learning_rate": 1.344276841171251e-05,
"loss": 0.1703,
"step": 1900
},
{
"epoch": 2.2766467065868263,
"grad_norm": 0.17793645899930705,
"learning_rate": 1.342058562555457e-05,
"loss": 0.1819,
"step": 1901
},
{
"epoch": 2.2778443113772457,
"grad_norm": 0.1784342432316549,
"learning_rate": 1.339840283939663e-05,
"loss": 0.1672,
"step": 1902
},
{
"epoch": 2.2790419161676647,
"grad_norm": 0.15072152891889096,
"learning_rate": 1.3376220053238686e-05,
"loss": 0.1563,
"step": 1903
},
{
"epoch": 2.2802395209580837,
"grad_norm": 0.1530017555398019,
"learning_rate": 1.3354037267080746e-05,
"loss": 0.1682,
"step": 1904
},
{
"epoch": 2.281437125748503,
"grad_norm": 0.15258980310188788,
"learning_rate": 1.3331854480922803e-05,
"loss": 0.1509,
"step": 1905
},
{
"epoch": 2.282634730538922,
"grad_norm": 0.157385513004231,
"learning_rate": 1.3309671694764863e-05,
"loss": 0.1586,
"step": 1906
},
{
"epoch": 2.2838323353293415,
"grad_norm": 0.15721270054284997,
"learning_rate": 1.3287488908606921e-05,
"loss": 0.1671,
"step": 1907
},
{
"epoch": 2.2850299401197605,
"grad_norm": 0.16907563748414564,
"learning_rate": 1.3265306122448982e-05,
"loss": 0.1742,
"step": 1908
},
{
"epoch": 2.2862275449101794,
"grad_norm": 0.1655071093246718,
"learning_rate": 1.3243123336291038e-05,
"loss": 0.1777,
"step": 1909
},
{
"epoch": 2.287425149700599,
"grad_norm": 0.1649556492454644,
"learning_rate": 1.3220940550133098e-05,
"loss": 0.1793,
"step": 1910
},
{
"epoch": 2.288622754491018,
"grad_norm": 0.15952348094143956,
"learning_rate": 1.3198757763975155e-05,
"loss": 0.1817,
"step": 1911
},
{
"epoch": 2.2898203592814372,
"grad_norm": 0.1842044444587944,
"learning_rate": 1.3176574977817214e-05,
"loss": 0.1722,
"step": 1912
},
{
"epoch": 2.291017964071856,
"grad_norm": 0.17089874510840528,
"learning_rate": 1.3154392191659274e-05,
"loss": 0.182,
"step": 1913
},
{
"epoch": 2.2922155688622756,
"grad_norm": 0.15777193583764476,
"learning_rate": 1.313220940550133e-05,
"loss": 0.1749,
"step": 1914
},
{
"epoch": 2.2934131736526946,
"grad_norm": 0.1671839431337055,
"learning_rate": 1.311002661934339e-05,
"loss": 0.1896,
"step": 1915
},
{
"epoch": 2.294610778443114,
"grad_norm": 0.15785489354302412,
"learning_rate": 1.3087843833185447e-05,
"loss": 0.1736,
"step": 1916
},
{
"epoch": 2.295808383233533,
"grad_norm": 0.1513178140057061,
"learning_rate": 1.3065661047027507e-05,
"loss": 0.1625,
"step": 1917
},
{
"epoch": 2.297005988023952,
"grad_norm": 0.1699754853537784,
"learning_rate": 1.3043478260869566e-05,
"loss": 0.1812,
"step": 1918
},
{
"epoch": 2.2982035928143714,
"grad_norm": 0.16577899831513068,
"learning_rate": 1.3021295474711626e-05,
"loss": 0.1772,
"step": 1919
},
{
"epoch": 2.2994011976047903,
"grad_norm": 0.15325975809066755,
"learning_rate": 1.2999112688553683e-05,
"loss": 0.1622,
"step": 1920
},
{
"epoch": 2.3005988023952098,
"grad_norm": 0.18546303383144525,
"learning_rate": 1.2976929902395741e-05,
"loss": 0.1678,
"step": 1921
},
{
"epoch": 2.3017964071856287,
"grad_norm": 0.16479414010254306,
"learning_rate": 1.2954747116237801e-05,
"loss": 0.1652,
"step": 1922
},
{
"epoch": 2.3029940119760477,
"grad_norm": 0.15704387442226617,
"learning_rate": 1.2932564330079858e-05,
"loss": 0.1656,
"step": 1923
},
{
"epoch": 2.304191616766467,
"grad_norm": 0.17977235744035613,
"learning_rate": 1.2910381543921918e-05,
"loss": 0.1788,
"step": 1924
},
{
"epoch": 2.305389221556886,
"grad_norm": 0.1628338798501438,
"learning_rate": 1.2888198757763975e-05,
"loss": 0.1642,
"step": 1925
},
{
"epoch": 2.3065868263473055,
"grad_norm": 0.3124373370894629,
"learning_rate": 1.2866015971606035e-05,
"loss": 0.2116,
"step": 1926
},
{
"epoch": 2.3077844311377245,
"grad_norm": 0.17853785970292463,
"learning_rate": 1.2843833185448093e-05,
"loss": 0.166,
"step": 1927
},
{
"epoch": 2.308982035928144,
"grad_norm": 0.17385199513410768,
"learning_rate": 1.2821650399290153e-05,
"loss": 0.1644,
"step": 1928
},
{
"epoch": 2.310179640718563,
"grad_norm": 0.8990868094172658,
"learning_rate": 1.279946761313221e-05,
"loss": 0.1912,
"step": 1929
},
{
"epoch": 2.3113772455089823,
"grad_norm": 0.17790369877876383,
"learning_rate": 1.2777284826974267e-05,
"loss": 0.1716,
"step": 1930
},
{
"epoch": 2.3125748502994012,
"grad_norm": 0.17441919238881964,
"learning_rate": 1.2755102040816327e-05,
"loss": 0.1606,
"step": 1931
},
{
"epoch": 2.31377245508982,
"grad_norm": 0.17024962721940798,
"learning_rate": 1.2732919254658385e-05,
"loss": 0.1794,
"step": 1932
},
{
"epoch": 2.3149700598802396,
"grad_norm": 0.1817629296171488,
"learning_rate": 1.2710736468500446e-05,
"loss": 0.1719,
"step": 1933
},
{
"epoch": 2.3161676646706586,
"grad_norm": 0.17839022263870447,
"learning_rate": 1.2688553682342502e-05,
"loss": 0.1821,
"step": 1934
},
{
"epoch": 2.317365269461078,
"grad_norm": 0.1825567443028695,
"learning_rate": 1.2666370896184562e-05,
"loss": 0.1797,
"step": 1935
},
{
"epoch": 2.318562874251497,
"grad_norm": 0.3151460344617281,
"learning_rate": 1.2644188110026619e-05,
"loss": 0.1764,
"step": 1936
},
{
"epoch": 2.319760479041916,
"grad_norm": 0.20552740826470184,
"learning_rate": 1.262200532386868e-05,
"loss": 0.1787,
"step": 1937
},
{
"epoch": 2.3209580838323354,
"grad_norm": 0.19035348840746188,
"learning_rate": 1.2599822537710738e-05,
"loss": 0.1783,
"step": 1938
},
{
"epoch": 2.3221556886227543,
"grad_norm": 0.17300513660174904,
"learning_rate": 1.2577639751552794e-05,
"loss": 0.1638,
"step": 1939
},
{
"epoch": 2.3233532934131738,
"grad_norm": 0.18489448369015954,
"learning_rate": 1.2555456965394855e-05,
"loss": 0.1772,
"step": 1940
},
{
"epoch": 2.3245508982035927,
"grad_norm": 0.42896328920393134,
"learning_rate": 1.2533274179236911e-05,
"loss": 0.1648,
"step": 1941
},
{
"epoch": 2.325748502994012,
"grad_norm": 0.17532631109122662,
"learning_rate": 1.2511091393078971e-05,
"loss": 0.1834,
"step": 1942
},
{
"epoch": 2.326946107784431,
"grad_norm": 0.1703329154263236,
"learning_rate": 1.248890860692103e-05,
"loss": 0.1643,
"step": 1943
},
{
"epoch": 2.3281437125748505,
"grad_norm": 0.18049528423365352,
"learning_rate": 1.2466725820763088e-05,
"loss": 0.1724,
"step": 1944
},
{
"epoch": 2.3293413173652695,
"grad_norm": 0.1791992714991953,
"learning_rate": 1.2444543034605147e-05,
"loss": 0.1814,
"step": 1945
},
{
"epoch": 2.3305389221556885,
"grad_norm": 0.17042098300847752,
"learning_rate": 1.2422360248447205e-05,
"loss": 0.1716,
"step": 1946
},
{
"epoch": 2.331736526946108,
"grad_norm": 0.1741408866717636,
"learning_rate": 1.2400177462289264e-05,
"loss": 0.1593,
"step": 1947
},
{
"epoch": 2.332934131736527,
"grad_norm": 0.16207292448077085,
"learning_rate": 1.2377994676131324e-05,
"loss": 0.1637,
"step": 1948
},
{
"epoch": 2.3341317365269463,
"grad_norm": 0.17238466953617604,
"learning_rate": 1.2355811889973382e-05,
"loss": 0.1714,
"step": 1949
},
{
"epoch": 2.3353293413173652,
"grad_norm": 0.16315776315908778,
"learning_rate": 1.233362910381544e-05,
"loss": 0.1708,
"step": 1950
},
{
"epoch": 2.336526946107784,
"grad_norm": 0.160183189419497,
"learning_rate": 1.2311446317657499e-05,
"loss": 0.1664,
"step": 1951
},
{
"epoch": 2.3377245508982036,
"grad_norm": 0.1673560481868311,
"learning_rate": 1.2289263531499556e-05,
"loss": 0.1547,
"step": 1952
},
{
"epoch": 2.3389221556886226,
"grad_norm": 0.1711282859668782,
"learning_rate": 1.2267080745341616e-05,
"loss": 0.1589,
"step": 1953
},
{
"epoch": 2.340119760479042,
"grad_norm": 0.15453629443130976,
"learning_rate": 1.2244897959183674e-05,
"loss": 0.1725,
"step": 1954
},
{
"epoch": 2.341317365269461,
"grad_norm": 0.16406542094590312,
"learning_rate": 1.2222715173025733e-05,
"loss": 0.174,
"step": 1955
},
{
"epoch": 2.3425149700598804,
"grad_norm": 0.16617039611280177,
"learning_rate": 1.2200532386867791e-05,
"loss": 0.1628,
"step": 1956
},
{
"epoch": 2.3437125748502994,
"grad_norm": 0.17643661303401986,
"learning_rate": 1.217834960070985e-05,
"loss": 0.1703,
"step": 1957
},
{
"epoch": 2.344910179640719,
"grad_norm": 0.16443558263184974,
"learning_rate": 1.2156166814551908e-05,
"loss": 0.1604,
"step": 1958
},
{
"epoch": 2.3461077844311378,
"grad_norm": 0.15744084732231847,
"learning_rate": 1.2133984028393968e-05,
"loss": 0.1512,
"step": 1959
},
{
"epoch": 2.3473053892215567,
"grad_norm": 0.16213340472172774,
"learning_rate": 1.2111801242236026e-05,
"loss": 0.1837,
"step": 1960
},
{
"epoch": 2.348502994011976,
"grad_norm": 0.17469940127328742,
"learning_rate": 1.2089618456078083e-05,
"loss": 0.1739,
"step": 1961
},
{
"epoch": 2.349700598802395,
"grad_norm": 0.16730768989719677,
"learning_rate": 1.2067435669920142e-05,
"loss": 0.1753,
"step": 1962
},
{
"epoch": 2.3508982035928145,
"grad_norm": 0.22858067801669557,
"learning_rate": 1.20452528837622e-05,
"loss": 0.1734,
"step": 1963
},
{
"epoch": 2.3520958083832335,
"grad_norm": 0.15911370784821305,
"learning_rate": 1.202307009760426e-05,
"loss": 0.1748,
"step": 1964
},
{
"epoch": 2.3532934131736525,
"grad_norm": 0.17946902933569087,
"learning_rate": 1.2000887311446319e-05,
"loss": 0.1723,
"step": 1965
},
{
"epoch": 2.354491017964072,
"grad_norm": 0.18307585057447623,
"learning_rate": 1.1978704525288377e-05,
"loss": 0.1656,
"step": 1966
},
{
"epoch": 2.355688622754491,
"grad_norm": 0.16775808818898158,
"learning_rate": 1.1956521739130435e-05,
"loss": 0.1683,
"step": 1967
},
{
"epoch": 2.3568862275449103,
"grad_norm": 0.1590095968470191,
"learning_rate": 1.1934338952972494e-05,
"loss": 0.1685,
"step": 1968
},
{
"epoch": 2.3580838323353293,
"grad_norm": 0.16883849468508869,
"learning_rate": 1.1912156166814554e-05,
"loss": 0.1573,
"step": 1969
},
{
"epoch": 2.3592814371257487,
"grad_norm": 0.16277243573011008,
"learning_rate": 1.188997338065661e-05,
"loss": 0.1655,
"step": 1970
},
{
"epoch": 2.3604790419161676,
"grad_norm": 0.16623156215997686,
"learning_rate": 1.1867790594498669e-05,
"loss": 0.1697,
"step": 1971
},
{
"epoch": 2.361676646706587,
"grad_norm": 0.17800769874744232,
"learning_rate": 1.1845607808340728e-05,
"loss": 0.1745,
"step": 1972
},
{
"epoch": 2.362874251497006,
"grad_norm": 0.16432851162741588,
"learning_rate": 1.1823425022182786e-05,
"loss": 0.1598,
"step": 1973
},
{
"epoch": 2.364071856287425,
"grad_norm": 0.15661690911183246,
"learning_rate": 1.1801242236024846e-05,
"loss": 0.1629,
"step": 1974
},
{
"epoch": 2.3652694610778444,
"grad_norm": 0.1579288649834301,
"learning_rate": 1.1779059449866905e-05,
"loss": 0.1656,
"step": 1975
},
{
"epoch": 2.3664670658682634,
"grad_norm": 0.15898922356763312,
"learning_rate": 1.1756876663708963e-05,
"loss": 0.1639,
"step": 1976
},
{
"epoch": 2.367664670658683,
"grad_norm": 0.16531806282299663,
"learning_rate": 1.1734693877551021e-05,
"loss": 0.1758,
"step": 1977
},
{
"epoch": 2.3688622754491018,
"grad_norm": 0.1579712424721319,
"learning_rate": 1.1712511091393078e-05,
"loss": 0.1579,
"step": 1978
},
{
"epoch": 2.3700598802395207,
"grad_norm": 0.155203128593749,
"learning_rate": 1.1690328305235138e-05,
"loss": 0.1669,
"step": 1979
},
{
"epoch": 2.37125748502994,
"grad_norm": 0.16625223540900394,
"learning_rate": 1.1668145519077197e-05,
"loss": 0.1678,
"step": 1980
},
{
"epoch": 2.372455089820359,
"grad_norm": 0.2018154632933105,
"learning_rate": 1.1645962732919255e-05,
"loss": 0.1689,
"step": 1981
},
{
"epoch": 2.3736526946107785,
"grad_norm": 0.18423002768860122,
"learning_rate": 1.1623779946761314e-05,
"loss": 0.1694,
"step": 1982
},
{
"epoch": 2.3748502994011975,
"grad_norm": 0.17376417019593088,
"learning_rate": 1.1601597160603372e-05,
"loss": 0.1735,
"step": 1983
},
{
"epoch": 2.376047904191617,
"grad_norm": 0.15187333623143023,
"learning_rate": 1.157941437444543e-05,
"loss": 0.1646,
"step": 1984
},
{
"epoch": 2.377245508982036,
"grad_norm": 0.16719325791763245,
"learning_rate": 1.155723158828749e-05,
"loss": 0.1813,
"step": 1985
},
{
"epoch": 2.3784431137724553,
"grad_norm": 0.1631426092163094,
"learning_rate": 1.1535048802129549e-05,
"loss": 0.1798,
"step": 1986
},
{
"epoch": 2.3796407185628743,
"grad_norm": 0.15503761318056278,
"learning_rate": 1.1512866015971606e-05,
"loss": 0.1606,
"step": 1987
},
{
"epoch": 2.3808383233532933,
"grad_norm": 0.1608262072622163,
"learning_rate": 1.1490683229813664e-05,
"loss": 0.1676,
"step": 1988
},
{
"epoch": 2.3820359281437127,
"grad_norm": 0.1637918088922538,
"learning_rate": 1.1468500443655724e-05,
"loss": 0.1682,
"step": 1989
},
{
"epoch": 2.3832335329341316,
"grad_norm": 0.17576800644518908,
"learning_rate": 1.1446317657497783e-05,
"loss": 0.1824,
"step": 1990
},
{
"epoch": 2.384431137724551,
"grad_norm": 0.1645896134969204,
"learning_rate": 1.1424134871339841e-05,
"loss": 0.1607,
"step": 1991
},
{
"epoch": 2.38562874251497,
"grad_norm": 0.16648499889958465,
"learning_rate": 1.14019520851819e-05,
"loss": 0.1736,
"step": 1992
},
{
"epoch": 2.386826347305389,
"grad_norm": 0.16951821755704877,
"learning_rate": 1.1379769299023958e-05,
"loss": 0.1776,
"step": 1993
},
{
"epoch": 2.3880239520958084,
"grad_norm": 0.15549155642489887,
"learning_rate": 1.1357586512866016e-05,
"loss": 0.1758,
"step": 1994
},
{
"epoch": 2.3892215568862274,
"grad_norm": 0.16923400648047587,
"learning_rate": 1.1335403726708076e-05,
"loss": 0.164,
"step": 1995
},
{
"epoch": 2.390419161676647,
"grad_norm": 0.15345308980896924,
"learning_rate": 1.1313220940550133e-05,
"loss": 0.1615,
"step": 1996
},
{
"epoch": 2.391616766467066,
"grad_norm": 0.16640476778957194,
"learning_rate": 1.1291038154392192e-05,
"loss": 0.1672,
"step": 1997
},
{
"epoch": 2.392814371257485,
"grad_norm": 0.1703709003819181,
"learning_rate": 1.126885536823425e-05,
"loss": 0.1911,
"step": 1998
},
{
"epoch": 2.394011976047904,
"grad_norm": 0.16188098453368777,
"learning_rate": 1.1246672582076308e-05,
"loss": 0.1774,
"step": 1999
},
{
"epoch": 2.3952095808383236,
"grad_norm": 0.1607618202743878,
"learning_rate": 1.1224489795918369e-05,
"loss": 0.1642,
"step": 2000
},
{
"epoch": 2.3964071856287426,
"grad_norm": 0.1551650146193555,
"learning_rate": 1.1202307009760427e-05,
"loss": 0.1661,
"step": 2001
},
{
"epoch": 2.3976047904191615,
"grad_norm": 0.15562462478847577,
"learning_rate": 1.1180124223602485e-05,
"loss": 0.1671,
"step": 2002
},
{
"epoch": 2.398802395209581,
"grad_norm": 0.16670913006687654,
"learning_rate": 1.1157941437444544e-05,
"loss": 0.1618,
"step": 2003
},
{
"epoch": 2.4,
"grad_norm": 0.15944140281293187,
"learning_rate": 1.1135758651286602e-05,
"loss": 0.1787,
"step": 2004
},
{
"epoch": 2.4011976047904193,
"grad_norm": 0.17460325665046517,
"learning_rate": 1.111357586512866e-05,
"loss": 0.1798,
"step": 2005
},
{
"epoch": 2.4023952095808383,
"grad_norm": 0.16295726589681367,
"learning_rate": 1.1091393078970719e-05,
"loss": 0.1619,
"step": 2006
},
{
"epoch": 2.4035928143712573,
"grad_norm": 0.1610184925225202,
"learning_rate": 1.1069210292812778e-05,
"loss": 0.1793,
"step": 2007
},
{
"epoch": 2.4047904191616767,
"grad_norm": 0.15605804813364607,
"learning_rate": 1.1047027506654836e-05,
"loss": 0.1746,
"step": 2008
},
{
"epoch": 2.4059880239520957,
"grad_norm": 0.17794911844242986,
"learning_rate": 1.1024844720496894e-05,
"loss": 0.1836,
"step": 2009
},
{
"epoch": 2.407185628742515,
"grad_norm": 0.16690964922745266,
"learning_rate": 1.1002661934338953e-05,
"loss": 0.1835,
"step": 2010
},
{
"epoch": 2.408383233532934,
"grad_norm": 0.16164806777643675,
"learning_rate": 1.0980479148181013e-05,
"loss": 0.1811,
"step": 2011
},
{
"epoch": 2.4095808383233535,
"grad_norm": 0.15621879368921757,
"learning_rate": 1.0958296362023071e-05,
"loss": 0.1678,
"step": 2012
},
{
"epoch": 2.4107784431137724,
"grad_norm": 0.15472252580102092,
"learning_rate": 1.093611357586513e-05,
"loss": 0.1582,
"step": 2013
},
{
"epoch": 2.411976047904192,
"grad_norm": 0.1629265732342378,
"learning_rate": 1.0913930789707187e-05,
"loss": 0.1674,
"step": 2014
},
{
"epoch": 2.413173652694611,
"grad_norm": 0.16871242220792243,
"learning_rate": 1.0891748003549247e-05,
"loss": 0.1765,
"step": 2015
},
{
"epoch": 2.41437125748503,
"grad_norm": 0.16367612504805815,
"learning_rate": 1.0869565217391305e-05,
"loss": 0.1559,
"step": 2016
},
{
"epoch": 2.415568862275449,
"grad_norm": 0.34839894622157513,
"learning_rate": 1.0847382431233364e-05,
"loss": 0.1716,
"step": 2017
},
{
"epoch": 2.416766467065868,
"grad_norm": 0.1607785995700165,
"learning_rate": 1.0825199645075422e-05,
"loss": 0.1729,
"step": 2018
},
{
"epoch": 2.4179640718562876,
"grad_norm": 0.16367595624593617,
"learning_rate": 1.080301685891748e-05,
"loss": 0.1781,
"step": 2019
},
{
"epoch": 2.4191616766467066,
"grad_norm": 0.16166125032247824,
"learning_rate": 1.0780834072759539e-05,
"loss": 0.1707,
"step": 2020
},
{
"epoch": 2.4203592814371255,
"grad_norm": 0.16471303621056266,
"learning_rate": 1.0758651286601599e-05,
"loss": 0.1486,
"step": 2021
},
{
"epoch": 2.421556886227545,
"grad_norm": 0.16330561559000548,
"learning_rate": 1.0736468500443657e-05,
"loss": 0.1783,
"step": 2022
},
{
"epoch": 2.422754491017964,
"grad_norm": 0.1614660763521988,
"learning_rate": 1.0714285714285714e-05,
"loss": 0.1666,
"step": 2023
},
{
"epoch": 2.4239520958083833,
"grad_norm": 0.16727420514959038,
"learning_rate": 1.0692102928127772e-05,
"loss": 0.1737,
"step": 2024
},
{
"epoch": 2.4251497005988023,
"grad_norm": 0.17152421441558344,
"learning_rate": 1.0669920141969831e-05,
"loss": 0.1816,
"step": 2025
},
{
"epoch": 2.4263473053892217,
"grad_norm": 0.15952642960473618,
"learning_rate": 1.0647737355811891e-05,
"loss": 0.1715,
"step": 2026
},
{
"epoch": 2.4275449101796407,
"grad_norm": 0.164076168440548,
"learning_rate": 1.062555456965395e-05,
"loss": 0.1822,
"step": 2027
},
{
"epoch": 2.42874251497006,
"grad_norm": 0.16934370997105447,
"learning_rate": 1.0603371783496008e-05,
"loss": 0.1702,
"step": 2028
},
{
"epoch": 2.429940119760479,
"grad_norm": 0.1552835078261528,
"learning_rate": 1.0581188997338066e-05,
"loss": 0.1651,
"step": 2029
},
{
"epoch": 2.431137724550898,
"grad_norm": 0.16584824447154348,
"learning_rate": 1.0559006211180125e-05,
"loss": 0.168,
"step": 2030
},
{
"epoch": 2.4323353293413175,
"grad_norm": 0.16194543067648237,
"learning_rate": 1.0536823425022183e-05,
"loss": 0.1601,
"step": 2031
},
{
"epoch": 2.4335329341317364,
"grad_norm": 0.19609781567434026,
"learning_rate": 1.0514640638864242e-05,
"loss": 0.1657,
"step": 2032
},
{
"epoch": 2.434730538922156,
"grad_norm": 0.16148727107051955,
"learning_rate": 1.04924578527063e-05,
"loss": 0.1507,
"step": 2033
},
{
"epoch": 2.435928143712575,
"grad_norm": 0.16668666218420028,
"learning_rate": 1.0470275066548358e-05,
"loss": 0.1813,
"step": 2034
},
{
"epoch": 2.437125748502994,
"grad_norm": 0.1668991790748943,
"learning_rate": 1.0448092280390417e-05,
"loss": 0.1669,
"step": 2035
},
{
"epoch": 2.438323353293413,
"grad_norm": 0.157660543730784,
"learning_rate": 1.0425909494232475e-05,
"loss": 0.1623,
"step": 2036
},
{
"epoch": 2.439520958083832,
"grad_norm": 0.16416661091363752,
"learning_rate": 1.0403726708074535e-05,
"loss": 0.1634,
"step": 2037
},
{
"epoch": 2.4407185628742516,
"grad_norm": 0.1609272135782453,
"learning_rate": 1.0381543921916594e-05,
"loss": 0.174,
"step": 2038
},
{
"epoch": 2.4419161676646706,
"grad_norm": 0.14788046985956813,
"learning_rate": 1.0359361135758652e-05,
"loss": 0.1654,
"step": 2039
},
{
"epoch": 2.44311377245509,
"grad_norm": 0.15878902898357256,
"learning_rate": 1.033717834960071e-05,
"loss": 0.1612,
"step": 2040
},
{
"epoch": 2.444311377245509,
"grad_norm": 0.16471597719930212,
"learning_rate": 1.0314995563442769e-05,
"loss": 0.1647,
"step": 2041
},
{
"epoch": 2.4455089820359284,
"grad_norm": 0.18918134083961344,
"learning_rate": 1.0292812777284828e-05,
"loss": 0.1759,
"step": 2042
},
{
"epoch": 2.4467065868263473,
"grad_norm": 0.18311963549717503,
"learning_rate": 1.0270629991126886e-05,
"loss": 0.1849,
"step": 2043
},
{
"epoch": 2.4479041916167663,
"grad_norm": 0.15813171286451871,
"learning_rate": 1.0248447204968944e-05,
"loss": 0.1717,
"step": 2044
},
{
"epoch": 2.4491017964071857,
"grad_norm": 0.1506073725695152,
"learning_rate": 1.0226264418811003e-05,
"loss": 0.1632,
"step": 2045
},
{
"epoch": 2.4502994011976047,
"grad_norm": 0.1565719365921749,
"learning_rate": 1.0204081632653061e-05,
"loss": 0.1636,
"step": 2046
},
{
"epoch": 2.451497005988024,
"grad_norm": 0.17233443429323753,
"learning_rate": 1.0181898846495121e-05,
"loss": 0.1587,
"step": 2047
},
{
"epoch": 2.452694610778443,
"grad_norm": 0.1614624364701864,
"learning_rate": 1.015971606033718e-05,
"loss": 0.1529,
"step": 2048
},
{
"epoch": 2.453892215568862,
"grad_norm": 0.18076739715310164,
"learning_rate": 1.0137533274179237e-05,
"loss": 0.1759,
"step": 2049
},
{
"epoch": 2.4550898203592815,
"grad_norm": 0.18142753425146557,
"learning_rate": 1.0115350488021295e-05,
"loss": 0.1716,
"step": 2050
},
{
"epoch": 2.4562874251497004,
"grad_norm": 0.16511639659742272,
"learning_rate": 1.0093167701863353e-05,
"loss": 0.1583,
"step": 2051
},
{
"epoch": 2.45748502994012,
"grad_norm": 0.16782338247727713,
"learning_rate": 1.0070984915705414e-05,
"loss": 0.1823,
"step": 2052
},
{
"epoch": 2.458682634730539,
"grad_norm": 0.16019910113964975,
"learning_rate": 1.0048802129547472e-05,
"loss": 0.1699,
"step": 2053
},
{
"epoch": 2.4598802395209582,
"grad_norm": 0.15831980834593679,
"learning_rate": 1.002661934338953e-05,
"loss": 0.1662,
"step": 2054
},
{
"epoch": 2.461077844311377,
"grad_norm": 0.1579279272761143,
"learning_rate": 1.0004436557231589e-05,
"loss": 0.1658,
"step": 2055
},
{
"epoch": 2.4622754491017966,
"grad_norm": 0.16304855076350008,
"learning_rate": 9.982253771073647e-06,
"loss": 0.1703,
"step": 2056
},
{
"epoch": 2.4634730538922156,
"grad_norm": 0.15217267977601934,
"learning_rate": 9.960070984915706e-06,
"loss": 0.1607,
"step": 2057
},
{
"epoch": 2.4646706586826346,
"grad_norm": 0.16138041780397602,
"learning_rate": 9.937888198757764e-06,
"loss": 0.1623,
"step": 2058
},
{
"epoch": 2.465868263473054,
"grad_norm": 0.15860246561269467,
"learning_rate": 9.915705412599822e-06,
"loss": 0.1798,
"step": 2059
},
{
"epoch": 2.467065868263473,
"grad_norm": 0.16259792717655958,
"learning_rate": 9.893522626441881e-06,
"loss": 0.1743,
"step": 2060
},
{
"epoch": 2.4682634730538924,
"grad_norm": 0.1689633709949378,
"learning_rate": 9.87133984028394e-06,
"loss": 0.1815,
"step": 2061
},
{
"epoch": 2.4694610778443113,
"grad_norm": 0.17886057617394743,
"learning_rate": 9.849157054126e-06,
"loss": 0.176,
"step": 2062
},
{
"epoch": 2.4706586826347303,
"grad_norm": 0.16889142240216007,
"learning_rate": 9.826974267968058e-06,
"loss": 0.1855,
"step": 2063
},
{
"epoch": 2.4718562874251497,
"grad_norm": 0.15698670944336485,
"learning_rate": 9.804791481810116e-06,
"loss": 0.1726,
"step": 2064
},
{
"epoch": 2.4730538922155687,
"grad_norm": 0.16882367004623833,
"learning_rate": 9.782608695652175e-06,
"loss": 0.1603,
"step": 2065
},
{
"epoch": 2.474251497005988,
"grad_norm": 0.18232402425301059,
"learning_rate": 9.760425909494233e-06,
"loss": 0.1731,
"step": 2066
},
{
"epoch": 2.475449101796407,
"grad_norm": 0.16033780111849633,
"learning_rate": 9.738243123336292e-06,
"loss": 0.1672,
"step": 2067
},
{
"epoch": 2.4766467065868265,
"grad_norm": 0.1630063119785867,
"learning_rate": 9.71606033717835e-06,
"loss": 0.154,
"step": 2068
},
{
"epoch": 2.4778443113772455,
"grad_norm": 0.15716456089559305,
"learning_rate": 9.693877551020408e-06,
"loss": 0.158,
"step": 2069
},
{
"epoch": 2.479041916167665,
"grad_norm": 0.15908558454690588,
"learning_rate": 9.671694764862467e-06,
"loss": 0.1702,
"step": 2070
},
{
"epoch": 2.480239520958084,
"grad_norm": 0.1672151569410849,
"learning_rate": 9.649511978704525e-06,
"loss": 0.1772,
"step": 2071
},
{
"epoch": 2.481437125748503,
"grad_norm": 0.1691360261049528,
"learning_rate": 9.627329192546584e-06,
"loss": 0.1738,
"step": 2072
},
{
"epoch": 2.4826347305389223,
"grad_norm": 0.157953695410929,
"learning_rate": 9.605146406388644e-06,
"loss": 0.1602,
"step": 2073
},
{
"epoch": 2.4838323353293412,
"grad_norm": 0.14798789060980166,
"learning_rate": 9.582963620230702e-06,
"loss": 0.1553,
"step": 2074
},
{
"epoch": 2.4850299401197606,
"grad_norm": 0.159540122672957,
"learning_rate": 9.56078083407276e-06,
"loss": 0.1708,
"step": 2075
},
{
"epoch": 2.4862275449101796,
"grad_norm": 0.15368286604232767,
"learning_rate": 9.538598047914817e-06,
"loss": 0.1465,
"step": 2076
},
{
"epoch": 2.4874251497005986,
"grad_norm": 0.16819485062789205,
"learning_rate": 9.516415261756876e-06,
"loss": 0.1786,
"step": 2077
},
{
"epoch": 2.488622754491018,
"grad_norm": 0.16462877952499455,
"learning_rate": 9.494232475598936e-06,
"loss": 0.1564,
"step": 2078
},
{
"epoch": 2.489820359281437,
"grad_norm": 0.16039930561702645,
"learning_rate": 9.472049689440994e-06,
"loss": 0.1667,
"step": 2079
},
{
"epoch": 2.4910179640718564,
"grad_norm": 0.16519331466699014,
"learning_rate": 9.449866903283053e-06,
"loss": 0.1676,
"step": 2080
},
{
"epoch": 2.4922155688622754,
"grad_norm": 0.18057020865047205,
"learning_rate": 9.427684117125111e-06,
"loss": 0.1743,
"step": 2081
},
{
"epoch": 2.4934131736526948,
"grad_norm": 0.16332984256407374,
"learning_rate": 9.40550133096717e-06,
"loss": 0.1667,
"step": 2082
},
{
"epoch": 2.4946107784431137,
"grad_norm": 0.17926198642797933,
"learning_rate": 9.383318544809228e-06,
"loss": 0.1635,
"step": 2083
},
{
"epoch": 2.495808383233533,
"grad_norm": 0.16792061140082587,
"learning_rate": 9.361135758651288e-06,
"loss": 0.1713,
"step": 2084
},
{
"epoch": 2.497005988023952,
"grad_norm": 0.17580567127258076,
"learning_rate": 9.338952972493345e-06,
"loss": 0.1765,
"step": 2085
},
{
"epoch": 2.498203592814371,
"grad_norm": 0.1601811887237646,
"learning_rate": 9.316770186335403e-06,
"loss": 0.1731,
"step": 2086
},
{
"epoch": 2.4994011976047905,
"grad_norm": 0.16733608188388543,
"learning_rate": 9.294587400177462e-06,
"loss": 0.1647,
"step": 2087
},
{
"epoch": 2.5005988023952095,
"grad_norm": 0.18890857239895875,
"learning_rate": 9.272404614019522e-06,
"loss": 0.1721,
"step": 2088
},
{
"epoch": 2.501796407185629,
"grad_norm": 0.16424248772921413,
"learning_rate": 9.25022182786158e-06,
"loss": 0.1789,
"step": 2089
},
{
"epoch": 2.502994011976048,
"grad_norm": 0.15415565499543377,
"learning_rate": 9.228039041703639e-06,
"loss": 0.1638,
"step": 2090
},
{
"epoch": 2.504191616766467,
"grad_norm": 0.17625810420412974,
"learning_rate": 9.205856255545697e-06,
"loss": 0.1847,
"step": 2091
},
{
"epoch": 2.5053892215568863,
"grad_norm": 0.15987423961854294,
"learning_rate": 9.183673469387756e-06,
"loss": 0.1683,
"step": 2092
},
{
"epoch": 2.5065868263473052,
"grad_norm": 0.1638737273760118,
"learning_rate": 9.161490683229814e-06,
"loss": 0.176,
"step": 2093
},
{
"epoch": 2.5077844311377246,
"grad_norm": 0.15690872989701127,
"learning_rate": 9.139307897071872e-06,
"loss": 0.1546,
"step": 2094
},
{
"epoch": 2.5089820359281436,
"grad_norm": 0.1518851075429766,
"learning_rate": 9.117125110913931e-06,
"loss": 0.1697,
"step": 2095
},
{
"epoch": 2.510179640718563,
"grad_norm": 0.3045305497481858,
"learning_rate": 9.09494232475599e-06,
"loss": 0.1571,
"step": 2096
},
{
"epoch": 2.511377245508982,
"grad_norm": 0.16751961058864412,
"learning_rate": 9.072759538598048e-06,
"loss": 0.1862,
"step": 2097
},
{
"epoch": 2.5125748502994014,
"grad_norm": 0.15790872182532226,
"learning_rate": 9.050576752440106e-06,
"loss": 0.1521,
"step": 2098
},
{
"epoch": 2.5137724550898204,
"grad_norm": 0.1648549549455859,
"learning_rate": 9.028393966282166e-06,
"loss": 0.1649,
"step": 2099
},
{
"epoch": 2.5149700598802394,
"grad_norm": 0.16721707112604006,
"learning_rate": 9.006211180124225e-06,
"loss": 0.1697,
"step": 2100
},
{
"epoch": 2.5161676646706588,
"grad_norm": 0.15079707573971984,
"learning_rate": 8.984028393966283e-06,
"loss": 0.1573,
"step": 2101
},
{
"epoch": 2.5173652694610777,
"grad_norm": 0.16634733637773455,
"learning_rate": 8.961845607808342e-06,
"loss": 0.1646,
"step": 2102
},
{
"epoch": 2.518562874251497,
"grad_norm": 0.1661370439547068,
"learning_rate": 8.939662821650398e-06,
"loss": 0.1806,
"step": 2103
},
{
"epoch": 2.519760479041916,
"grad_norm": 0.16153965748031562,
"learning_rate": 8.917480035492458e-06,
"loss": 0.1582,
"step": 2104
},
{
"epoch": 2.520958083832335,
"grad_norm": 0.1634006043011155,
"learning_rate": 8.895297249334517e-06,
"loss": 0.1812,
"step": 2105
},
{
"epoch": 2.5221556886227545,
"grad_norm": 0.1633838382961149,
"learning_rate": 8.873114463176575e-06,
"loss": 0.1887,
"step": 2106
},
{
"epoch": 2.5233532934131735,
"grad_norm": 0.15053675491163845,
"learning_rate": 8.850931677018634e-06,
"loss": 0.1475,
"step": 2107
},
{
"epoch": 2.524550898203593,
"grad_norm": 0.16542353541204136,
"learning_rate": 8.828748890860692e-06,
"loss": 0.1886,
"step": 2108
},
{
"epoch": 2.525748502994012,
"grad_norm": 0.1705269394456366,
"learning_rate": 8.806566104702752e-06,
"loss": 0.1741,
"step": 2109
},
{
"epoch": 2.5269461077844313,
"grad_norm": 0.15692424653253104,
"learning_rate": 8.78438331854481e-06,
"loss": 0.1658,
"step": 2110
},
{
"epoch": 2.5281437125748503,
"grad_norm": 0.16237416161286394,
"learning_rate": 8.762200532386869e-06,
"loss": 0.1664,
"step": 2111
},
{
"epoch": 2.5293413173652697,
"grad_norm": 0.16688915467798304,
"learning_rate": 8.740017746228926e-06,
"loss": 0.17,
"step": 2112
},
{
"epoch": 2.5305389221556887,
"grad_norm": 0.16397591513398255,
"learning_rate": 8.717834960070984e-06,
"loss": 0.1784,
"step": 2113
},
{
"epoch": 2.5317365269461076,
"grad_norm": 0.1588505647128213,
"learning_rate": 8.695652173913044e-06,
"loss": 0.1652,
"step": 2114
},
{
"epoch": 2.532934131736527,
"grad_norm": 0.15147808736574478,
"learning_rate": 8.673469387755103e-06,
"loss": 0.1638,
"step": 2115
},
{
"epoch": 2.534131736526946,
"grad_norm": 0.1689325837546282,
"learning_rate": 8.651286601597161e-06,
"loss": 0.171,
"step": 2116
},
{
"epoch": 2.5353293413173654,
"grad_norm": 0.15305710786405446,
"learning_rate": 8.62910381543922e-06,
"loss": 0.1509,
"step": 2117
},
{
"epoch": 2.5365269461077844,
"grad_norm": 0.16356323437468914,
"learning_rate": 8.606921029281278e-06,
"loss": 0.164,
"step": 2118
},
{
"epoch": 2.5377245508982034,
"grad_norm": 0.16218254912225308,
"learning_rate": 8.584738243123337e-06,
"loss": 0.1569,
"step": 2119
},
{
"epoch": 2.538922155688623,
"grad_norm": 0.1658680310901566,
"learning_rate": 8.562555456965395e-06,
"loss": 0.1726,
"step": 2120
},
{
"epoch": 2.5401197604790418,
"grad_norm": 0.17328516804960595,
"learning_rate": 8.540372670807453e-06,
"loss": 0.1658,
"step": 2121
},
{
"epoch": 2.541317365269461,
"grad_norm": 0.15513125336104616,
"learning_rate": 8.518189884649512e-06,
"loss": 0.1651,
"step": 2122
},
{
"epoch": 2.54251497005988,
"grad_norm": 0.16047668505604815,
"learning_rate": 8.49600709849157e-06,
"loss": 0.1644,
"step": 2123
},
{
"epoch": 2.5437125748502996,
"grad_norm": 0.16619019028201018,
"learning_rate": 8.473824312333629e-06,
"loss": 0.1733,
"step": 2124
},
{
"epoch": 2.5449101796407185,
"grad_norm": 0.15643466492214245,
"learning_rate": 8.451641526175689e-06,
"loss": 0.1725,
"step": 2125
},
{
"epoch": 2.546107784431138,
"grad_norm": 0.16358728432008346,
"learning_rate": 8.429458740017747e-06,
"loss": 0.1729,
"step": 2126
},
{
"epoch": 2.547305389221557,
"grad_norm": 0.15944731568238307,
"learning_rate": 8.407275953859806e-06,
"loss": 0.1716,
"step": 2127
},
{
"epoch": 2.548502994011976,
"grad_norm": 0.16233379919437227,
"learning_rate": 8.385093167701864e-06,
"loss": 0.1667,
"step": 2128
},
{
"epoch": 2.5497005988023953,
"grad_norm": 0.15663493158392292,
"learning_rate": 8.36291038154392e-06,
"loss": 0.1811,
"step": 2129
},
{
"epoch": 2.5508982035928143,
"grad_norm": 0.16699995019958347,
"learning_rate": 8.340727595385981e-06,
"loss": 0.1831,
"step": 2130
},
{
"epoch": 2.5520958083832337,
"grad_norm": 0.15425375791878077,
"learning_rate": 8.31854480922804e-06,
"loss": 0.1622,
"step": 2131
},
{
"epoch": 2.5532934131736527,
"grad_norm": 0.15620046950475996,
"learning_rate": 8.296362023070098e-06,
"loss": 0.1746,
"step": 2132
},
{
"epoch": 2.5544910179640716,
"grad_norm": 0.16205512641319267,
"learning_rate": 8.274179236912156e-06,
"loss": 0.1634,
"step": 2133
},
{
"epoch": 2.555688622754491,
"grad_norm": 0.14992064947480055,
"learning_rate": 8.251996450754215e-06,
"loss": 0.1664,
"step": 2134
},
{
"epoch": 2.55688622754491,
"grad_norm": 0.1718533592073993,
"learning_rate": 8.229813664596275e-06,
"loss": 0.1781,
"step": 2135
},
{
"epoch": 2.5580838323353294,
"grad_norm": 0.14839225702401662,
"learning_rate": 8.207630878438333e-06,
"loss": 0.1715,
"step": 2136
},
{
"epoch": 2.5592814371257484,
"grad_norm": 0.17032376492731222,
"learning_rate": 8.185448092280392e-06,
"loss": 0.1843,
"step": 2137
},
{
"epoch": 2.560479041916168,
"grad_norm": 0.15167012071349992,
"learning_rate": 8.163265306122448e-06,
"loss": 0.1609,
"step": 2138
},
{
"epoch": 2.561676646706587,
"grad_norm": 0.15945108836297944,
"learning_rate": 8.141082519964507e-06,
"loss": 0.1613,
"step": 2139
},
{
"epoch": 2.562874251497006,
"grad_norm": 0.16519660614606788,
"learning_rate": 8.118899733806567e-06,
"loss": 0.1773,
"step": 2140
},
{
"epoch": 2.564071856287425,
"grad_norm": 0.16102836575737786,
"learning_rate": 8.096716947648625e-06,
"loss": 0.157,
"step": 2141
},
{
"epoch": 2.565269461077844,
"grad_norm": 0.16589781009090568,
"learning_rate": 8.074534161490684e-06,
"loss": 0.1828,
"step": 2142
},
{
"epoch": 2.5664670658682636,
"grad_norm": 0.8921390434381338,
"learning_rate": 8.052351375332742e-06,
"loss": 0.1632,
"step": 2143
},
{
"epoch": 2.5676646706586825,
"grad_norm": 0.1695594760275289,
"learning_rate": 8.0301685891748e-06,
"loss": 0.1927,
"step": 2144
},
{
"epoch": 2.568862275449102,
"grad_norm": 0.16014613511017203,
"learning_rate": 8.007985803016859e-06,
"loss": 0.1629,
"step": 2145
},
{
"epoch": 2.570059880239521,
"grad_norm": 0.1590900536359354,
"learning_rate": 7.985803016858919e-06,
"loss": 0.1701,
"step": 2146
},
{
"epoch": 2.57125748502994,
"grad_norm": 0.15711138781597894,
"learning_rate": 7.963620230700976e-06,
"loss": 0.1691,
"step": 2147
},
{
"epoch": 2.5724550898203593,
"grad_norm": 0.14714088426219946,
"learning_rate": 7.941437444543034e-06,
"loss": 0.1632,
"step": 2148
},
{
"epoch": 2.5736526946107783,
"grad_norm": 0.17091774316698663,
"learning_rate": 7.919254658385093e-06,
"loss": 0.1679,
"step": 2149
},
{
"epoch": 2.5748502994011977,
"grad_norm": 0.15276290596195982,
"learning_rate": 7.897071872227151e-06,
"loss": 0.1664,
"step": 2150
},
{
"epoch": 2.5760479041916167,
"grad_norm": 0.16839062967847926,
"learning_rate": 7.874889086069211e-06,
"loss": 0.1595,
"step": 2151
},
{
"epoch": 2.577245508982036,
"grad_norm": 0.15828420931190362,
"learning_rate": 7.85270629991127e-06,
"loss": 0.1755,
"step": 2152
},
{
"epoch": 2.578443113772455,
"grad_norm": 0.1530069903042569,
"learning_rate": 7.830523513753328e-06,
"loss": 0.1641,
"step": 2153
},
{
"epoch": 2.5796407185628745,
"grad_norm": 0.1469510961056098,
"learning_rate": 7.808340727595387e-06,
"loss": 0.1486,
"step": 2154
},
{
"epoch": 2.5808383233532934,
"grad_norm": 0.1886490431526823,
"learning_rate": 7.786157941437445e-06,
"loss": 0.1661,
"step": 2155
},
{
"epoch": 2.5820359281437124,
"grad_norm": 0.16208005105059956,
"learning_rate": 7.763975155279503e-06,
"loss": 0.1704,
"step": 2156
},
{
"epoch": 2.583233532934132,
"grad_norm": 0.16829456806839582,
"learning_rate": 7.741792369121562e-06,
"loss": 0.155,
"step": 2157
},
{
"epoch": 2.584431137724551,
"grad_norm": 0.16125099686618738,
"learning_rate": 7.71960958296362e-06,
"loss": 0.1533,
"step": 2158
},
{
"epoch": 2.58562874251497,
"grad_norm": 0.1593700125483189,
"learning_rate": 7.697426796805679e-06,
"loss": 0.1545,
"step": 2159
},
{
"epoch": 2.586826347305389,
"grad_norm": 0.1602424334646817,
"learning_rate": 7.675244010647737e-06,
"loss": 0.1702,
"step": 2160
},
{
"epoch": 2.588023952095808,
"grad_norm": 0.16255748102211154,
"learning_rate": 7.653061224489797e-06,
"loss": 0.1755,
"step": 2161
},
{
"epoch": 2.5892215568862276,
"grad_norm": 0.1532425114696054,
"learning_rate": 7.630878438331856e-06,
"loss": 0.166,
"step": 2162
},
{
"epoch": 2.5904191616766465,
"grad_norm": 0.1621990994017345,
"learning_rate": 7.608695652173914e-06,
"loss": 0.1692,
"step": 2163
},
{
"epoch": 2.591616766467066,
"grad_norm": 0.15494424454466818,
"learning_rate": 7.5865128660159725e-06,
"loss": 0.1695,
"step": 2164
},
{
"epoch": 2.592814371257485,
"grad_norm": 0.16049147786731183,
"learning_rate": 7.56433007985803e-06,
"loss": 0.1748,
"step": 2165
},
{
"epoch": 2.5940119760479043,
"grad_norm": 0.16091735622969142,
"learning_rate": 7.5421472937000885e-06,
"loss": 0.182,
"step": 2166
},
{
"epoch": 2.5952095808383233,
"grad_norm": 0.16757956793375745,
"learning_rate": 7.519964507542148e-06,
"loss": 0.1745,
"step": 2167
},
{
"epoch": 2.5964071856287427,
"grad_norm": 0.16639955398110762,
"learning_rate": 7.497781721384206e-06,
"loss": 0.1887,
"step": 2168
},
{
"epoch": 2.5976047904191617,
"grad_norm": 0.1596073024150438,
"learning_rate": 7.475598935226265e-06,
"loss": 0.1704,
"step": 2169
},
{
"epoch": 2.5988023952095807,
"grad_norm": 0.1641472135057493,
"learning_rate": 7.453416149068324e-06,
"loss": 0.1748,
"step": 2170
},
{
"epoch": 2.6,
"grad_norm": 0.1564889876743691,
"learning_rate": 7.431233362910382e-06,
"loss": 0.1638,
"step": 2171
},
{
"epoch": 2.601197604790419,
"grad_norm": 0.15797506087536803,
"learning_rate": 7.409050576752441e-06,
"loss": 0.1607,
"step": 2172
},
{
"epoch": 2.6023952095808385,
"grad_norm": 0.15978704907247465,
"learning_rate": 7.3868677905945e-06,
"loss": 0.1871,
"step": 2173
},
{
"epoch": 2.6035928143712574,
"grad_norm": 0.1572890119426756,
"learning_rate": 7.364685004436557e-06,
"loss": 0.1638,
"step": 2174
},
{
"epoch": 2.6047904191616764,
"grad_norm": 0.15078464071372763,
"learning_rate": 7.342502218278616e-06,
"loss": 0.1559,
"step": 2175
},
{
"epoch": 2.605988023952096,
"grad_norm": 0.15404730923914484,
"learning_rate": 7.3203194321206745e-06,
"loss": 0.1725,
"step": 2176
},
{
"epoch": 2.607185628742515,
"grad_norm": 0.1631864458045795,
"learning_rate": 7.298136645962733e-06,
"loss": 0.1685,
"step": 2177
},
{
"epoch": 2.608383233532934,
"grad_norm": 0.16221284190757004,
"learning_rate": 7.275953859804792e-06,
"loss": 0.1654,
"step": 2178
},
{
"epoch": 2.609580838323353,
"grad_norm": 0.16049208041188825,
"learning_rate": 7.253771073646851e-06,
"loss": 0.1621,
"step": 2179
},
{
"epoch": 2.6107784431137726,
"grad_norm": 0.153001498463329,
"learning_rate": 7.231588287488909e-06,
"loss": 0.1635,
"step": 2180
},
{
"epoch": 2.6119760479041916,
"grad_norm": 0.15815079364794737,
"learning_rate": 7.209405501330968e-06,
"loss": 0.1769,
"step": 2181
},
{
"epoch": 2.613173652694611,
"grad_norm": 0.15585323919305422,
"learning_rate": 7.187222715173027e-06,
"loss": 0.1638,
"step": 2182
},
{
"epoch": 2.61437125748503,
"grad_norm": 0.1562944031417155,
"learning_rate": 7.165039929015084e-06,
"loss": 0.1699,
"step": 2183
},
{
"epoch": 2.615568862275449,
"grad_norm": 0.16031696092043235,
"learning_rate": 7.142857142857143e-06,
"loss": 0.1631,
"step": 2184
},
{
"epoch": 2.6167664670658684,
"grad_norm": 0.14943485380284013,
"learning_rate": 7.120674356699201e-06,
"loss": 0.1622,
"step": 2185
},
{
"epoch": 2.6179640718562873,
"grad_norm": 0.15755697403775804,
"learning_rate": 7.09849157054126e-06,
"loss": 0.1652,
"step": 2186
},
{
"epoch": 2.6191616766467067,
"grad_norm": 0.1903785836751996,
"learning_rate": 7.076308784383319e-06,
"loss": 0.186,
"step": 2187
},
{
"epoch": 2.6203592814371257,
"grad_norm": 0.15851845005363518,
"learning_rate": 7.054125998225377e-06,
"loss": 0.1616,
"step": 2188
},
{
"epoch": 2.6215568862275447,
"grad_norm": 0.164580928962622,
"learning_rate": 7.0319432120674365e-06,
"loss": 0.1653,
"step": 2189
},
{
"epoch": 2.622754491017964,
"grad_norm": 0.15124679278514003,
"learning_rate": 7.009760425909495e-06,
"loss": 0.1702,
"step": 2190
},
{
"epoch": 2.623952095808383,
"grad_norm": 0.1623026818996861,
"learning_rate": 6.9875776397515525e-06,
"loss": 0.1654,
"step": 2191
},
{
"epoch": 2.6251497005988025,
"grad_norm": 0.1506947328499455,
"learning_rate": 6.965394853593611e-06,
"loss": 0.1699,
"step": 2192
},
{
"epoch": 2.6263473053892215,
"grad_norm": 0.1545683839428047,
"learning_rate": 6.94321206743567e-06,
"loss": 0.1716,
"step": 2193
},
{
"epoch": 2.627544910179641,
"grad_norm": 0.15303368340037013,
"learning_rate": 6.921029281277729e-06,
"loss": 0.1593,
"step": 2194
},
{
"epoch": 2.62874251497006,
"grad_norm": 0.15902288004677945,
"learning_rate": 6.898846495119787e-06,
"loss": 0.1579,
"step": 2195
},
{
"epoch": 2.6299401197604793,
"grad_norm": 0.16371444522960188,
"learning_rate": 6.876663708961846e-06,
"loss": 0.1778,
"step": 2196
},
{
"epoch": 2.6311377245508982,
"grad_norm": 0.15607620116118565,
"learning_rate": 6.854480922803905e-06,
"loss": 0.1742,
"step": 2197
},
{
"epoch": 2.632335329341317,
"grad_norm": 0.17981373660780958,
"learning_rate": 6.832298136645963e-06,
"loss": 0.172,
"step": 2198
},
{
"epoch": 2.6335329341317366,
"grad_norm": 0.16363555643136177,
"learning_rate": 6.8101153504880225e-06,
"loss": 0.1791,
"step": 2199
},
{
"epoch": 2.6347305389221556,
"grad_norm": 0.15822769134988107,
"learning_rate": 6.787932564330079e-06,
"loss": 0.1712,
"step": 2200
},
{
"epoch": 2.635928143712575,
"grad_norm": 0.1675916201225635,
"learning_rate": 6.7657497781721385e-06,
"loss": 0.1823,
"step": 2201
},
{
"epoch": 2.637125748502994,
"grad_norm": 0.16271165013716862,
"learning_rate": 6.743566992014197e-06,
"loss": 0.1802,
"step": 2202
},
{
"epoch": 2.638323353293413,
"grad_norm": 0.17818761353146262,
"learning_rate": 6.721384205856255e-06,
"loss": 0.1645,
"step": 2203
},
{
"epoch": 2.6395209580838324,
"grad_norm": 0.16561221532955792,
"learning_rate": 6.699201419698315e-06,
"loss": 0.1632,
"step": 2204
},
{
"epoch": 2.6407185628742518,
"grad_norm": 0.15403638491182006,
"learning_rate": 6.677018633540373e-06,
"loss": 0.1716,
"step": 2205
},
{
"epoch": 2.6419161676646707,
"grad_norm": 0.15243888444983156,
"learning_rate": 6.6548358473824315e-06,
"loss": 0.1634,
"step": 2206
},
{
"epoch": 2.6431137724550897,
"grad_norm": 0.1659440086279514,
"learning_rate": 6.632653061224491e-06,
"loss": 0.1815,
"step": 2207
},
{
"epoch": 2.644311377245509,
"grad_norm": 0.15377026179154232,
"learning_rate": 6.610470275066549e-06,
"loss": 0.1706,
"step": 2208
},
{
"epoch": 2.645508982035928,
"grad_norm": 0.1571756800121783,
"learning_rate": 6.588287488908607e-06,
"loss": 0.1693,
"step": 2209
},
{
"epoch": 2.6467065868263475,
"grad_norm": 0.15209615139603194,
"learning_rate": 6.566104702750665e-06,
"loss": 0.1518,
"step": 2210
},
{
"epoch": 2.6479041916167665,
"grad_norm": 0.16868283233046674,
"learning_rate": 6.543921916592724e-06,
"loss": 0.1819,
"step": 2211
},
{
"epoch": 2.6491017964071855,
"grad_norm": 0.1560141584924181,
"learning_rate": 6.521739130434783e-06,
"loss": 0.1748,
"step": 2212
},
{
"epoch": 2.650299401197605,
"grad_norm": 0.15125101225427237,
"learning_rate": 6.499556344276841e-06,
"loss": 0.162,
"step": 2213
},
{
"epoch": 2.651497005988024,
"grad_norm": 0.16031168873175722,
"learning_rate": 6.477373558118901e-06,
"loss": 0.1681,
"step": 2214
},
{
"epoch": 2.6526946107784433,
"grad_norm": 0.1641934888301461,
"learning_rate": 6.455190771960959e-06,
"loss": 0.1833,
"step": 2215
},
{
"epoch": 2.6538922155688622,
"grad_norm": 0.16098694800348887,
"learning_rate": 6.4330079858030174e-06,
"loss": 0.1752,
"step": 2216
},
{
"epoch": 2.655089820359281,
"grad_norm": 0.1613126534263348,
"learning_rate": 6.410825199645077e-06,
"loss": 0.1631,
"step": 2217
},
{
"epoch": 2.6562874251497006,
"grad_norm": 0.1542418179279918,
"learning_rate": 6.3886424134871334e-06,
"loss": 0.1824,
"step": 2218
},
{
"epoch": 2.65748502994012,
"grad_norm": 0.15395111396624953,
"learning_rate": 6.366459627329193e-06,
"loss": 0.1675,
"step": 2219
},
{
"epoch": 2.658682634730539,
"grad_norm": 0.1603187834617355,
"learning_rate": 6.344276841171251e-06,
"loss": 0.1602,
"step": 2220
},
{
"epoch": 2.659880239520958,
"grad_norm": 0.1728534751827956,
"learning_rate": 6.3220940550133096e-06,
"loss": 0.1825,
"step": 2221
},
{
"epoch": 2.6610778443113774,
"grad_norm": 0.16976952769179626,
"learning_rate": 6.299911268855369e-06,
"loss": 0.1707,
"step": 2222
},
{
"epoch": 2.6622754491017964,
"grad_norm": 0.1884837668180829,
"learning_rate": 6.277728482697427e-06,
"loss": 0.1744,
"step": 2223
},
{
"epoch": 2.663473053892216,
"grad_norm": 0.16297162323751135,
"learning_rate": 6.255545696539486e-06,
"loss": 0.1713,
"step": 2224
},
{
"epoch": 2.6646706586826348,
"grad_norm": 0.1698260725794208,
"learning_rate": 6.233362910381544e-06,
"loss": 0.1882,
"step": 2225
},
{
"epoch": 2.6658682634730537,
"grad_norm": 0.1638795925847457,
"learning_rate": 6.2111801242236025e-06,
"loss": 0.1853,
"step": 2226
},
{
"epoch": 2.667065868263473,
"grad_norm": 0.1567428988539364,
"learning_rate": 6.188997338065662e-06,
"loss": 0.1628,
"step": 2227
},
{
"epoch": 2.668263473053892,
"grad_norm": 0.15968341489418797,
"learning_rate": 6.16681455190772e-06,
"loss": 0.1652,
"step": 2228
},
{
"epoch": 2.6694610778443115,
"grad_norm": 0.16443140911333007,
"learning_rate": 6.144631765749778e-06,
"loss": 0.1553,
"step": 2229
},
{
"epoch": 2.6706586826347305,
"grad_norm": 0.17338424620534748,
"learning_rate": 6.122448979591837e-06,
"loss": 0.1932,
"step": 2230
},
{
"epoch": 2.6718562874251495,
"grad_norm": 0.15761880811342283,
"learning_rate": 6.1002661934338955e-06,
"loss": 0.1601,
"step": 2231
},
{
"epoch": 2.673053892215569,
"grad_norm": 0.18107837159407555,
"learning_rate": 6.078083407275954e-06,
"loss": 0.1816,
"step": 2232
},
{
"epoch": 2.6742514970059883,
"grad_norm": 0.1511748619885111,
"learning_rate": 6.055900621118013e-06,
"loss": 0.1701,
"step": 2233
},
{
"epoch": 2.6754491017964073,
"grad_norm": 0.17382044842404867,
"learning_rate": 6.033717834960071e-06,
"loss": 0.1735,
"step": 2234
},
{
"epoch": 2.6766467065868262,
"grad_norm": 0.15313423446687682,
"learning_rate": 6.01153504880213e-06,
"loss": 0.1615,
"step": 2235
},
{
"epoch": 2.6778443113772457,
"grad_norm": 0.16314665765414257,
"learning_rate": 5.9893522626441885e-06,
"loss": 0.1798,
"step": 2236
},
{
"epoch": 2.6790419161676646,
"grad_norm": 0.15887218605081294,
"learning_rate": 5.967169476486247e-06,
"loss": 0.1809,
"step": 2237
},
{
"epoch": 2.680239520958084,
"grad_norm": 0.15259575027207753,
"learning_rate": 5.944986690328305e-06,
"loss": 0.1661,
"step": 2238
},
{
"epoch": 2.681437125748503,
"grad_norm": 0.15805905928500016,
"learning_rate": 5.922803904170364e-06,
"loss": 0.1767,
"step": 2239
},
{
"epoch": 2.682634730538922,
"grad_norm": 0.16602971481538856,
"learning_rate": 5.900621118012423e-06,
"loss": 0.1874,
"step": 2240
},
{
"epoch": 2.6838323353293414,
"grad_norm": 0.15756481502087633,
"learning_rate": 5.8784383318544815e-06,
"loss": 0.1644,
"step": 2241
},
{
"epoch": 2.6850299401197604,
"grad_norm": 0.15583295028275518,
"learning_rate": 5.856255545696539e-06,
"loss": 0.1649,
"step": 2242
},
{
"epoch": 2.68622754491018,
"grad_norm": 0.15299598626714542,
"learning_rate": 5.834072759538598e-06,
"loss": 0.1554,
"step": 2243
},
{
"epoch": 2.6874251497005988,
"grad_norm": 0.15332882009291351,
"learning_rate": 5.811889973380657e-06,
"loss": 0.1584,
"step": 2244
},
{
"epoch": 2.6886227544910177,
"grad_norm": 0.16329186509427193,
"learning_rate": 5.789707187222715e-06,
"loss": 0.1686,
"step": 2245
},
{
"epoch": 2.689820359281437,
"grad_norm": 0.1601040108668266,
"learning_rate": 5.7675244010647745e-06,
"loss": 0.167,
"step": 2246
},
{
"epoch": 2.6910179640718566,
"grad_norm": 0.15931716493089096,
"learning_rate": 5.745341614906832e-06,
"loss": 0.1702,
"step": 2247
},
{
"epoch": 2.6922155688622755,
"grad_norm": 0.1529484915347619,
"learning_rate": 5.723158828748891e-06,
"loss": 0.1565,
"step": 2248
},
{
"epoch": 2.6934131736526945,
"grad_norm": 0.14979269738882112,
"learning_rate": 5.70097604259095e-06,
"loss": 0.16,
"step": 2249
},
{
"epoch": 2.694610778443114,
"grad_norm": 0.1497940708184568,
"learning_rate": 5.678793256433008e-06,
"loss": 0.1497,
"step": 2250
},
{
"epoch": 2.695808383233533,
"grad_norm": 0.4352228497671906,
"learning_rate": 5.656610470275067e-06,
"loss": 0.18,
"step": 2251
},
{
"epoch": 2.6970059880239523,
"grad_norm": 0.1617978735449994,
"learning_rate": 5.634427684117125e-06,
"loss": 0.1727,
"step": 2252
},
{
"epoch": 2.6982035928143713,
"grad_norm": 0.16579491042271477,
"learning_rate": 5.612244897959184e-06,
"loss": 0.1631,
"step": 2253
},
{
"epoch": 2.6994011976047902,
"grad_norm": 0.15318212258617203,
"learning_rate": 5.590062111801243e-06,
"loss": 0.1559,
"step": 2254
},
{
"epoch": 2.7005988023952097,
"grad_norm": 0.1582671472819581,
"learning_rate": 5.567879325643301e-06,
"loss": 0.1686,
"step": 2255
},
{
"epoch": 2.7017964071856286,
"grad_norm": 0.16534483752816193,
"learning_rate": 5.5456965394853596e-06,
"loss": 0.1709,
"step": 2256
},
{
"epoch": 2.702994011976048,
"grad_norm": 0.15064546889219446,
"learning_rate": 5.523513753327418e-06,
"loss": 0.1548,
"step": 2257
},
{
"epoch": 2.704191616766467,
"grad_norm": 0.1506094355947151,
"learning_rate": 5.501330967169476e-06,
"loss": 0.1741,
"step": 2258
},
{
"epoch": 2.705389221556886,
"grad_norm": 0.1552580459536264,
"learning_rate": 5.479148181011536e-06,
"loss": 0.1502,
"step": 2259
},
{
"epoch": 2.7065868263473054,
"grad_norm": 0.15424424811427548,
"learning_rate": 5.456965394853593e-06,
"loss": 0.1655,
"step": 2260
},
{
"epoch": 2.707784431137725,
"grad_norm": 0.15326127385074345,
"learning_rate": 5.4347826086956525e-06,
"loss": 0.158,
"step": 2261
},
{
"epoch": 2.708982035928144,
"grad_norm": 0.14416109529021762,
"learning_rate": 5.412599822537711e-06,
"loss": 0.16,
"step": 2262
},
{
"epoch": 2.7101796407185628,
"grad_norm": 0.15743534122923677,
"learning_rate": 5.390417036379769e-06,
"loss": 0.1751,
"step": 2263
},
{
"epoch": 2.711377245508982,
"grad_norm": 0.154421825223699,
"learning_rate": 5.368234250221829e-06,
"loss": 0.1716,
"step": 2264
},
{
"epoch": 2.712574850299401,
"grad_norm": 0.16058258432672304,
"learning_rate": 5.346051464063886e-06,
"loss": 0.1852,
"step": 2265
},
{
"epoch": 2.7137724550898206,
"grad_norm": 0.15024879059737897,
"learning_rate": 5.3238686779059455e-06,
"loss": 0.1668,
"step": 2266
},
{
"epoch": 2.7149700598802395,
"grad_norm": 0.17969407450590577,
"learning_rate": 5.301685891748004e-06,
"loss": 0.1637,
"step": 2267
},
{
"epoch": 2.7161676646706585,
"grad_norm": 0.1463311654633755,
"learning_rate": 5.279503105590062e-06,
"loss": 0.1477,
"step": 2268
},
{
"epoch": 2.717365269461078,
"grad_norm": 0.157980066853183,
"learning_rate": 5.257320319432121e-06,
"loss": 0.1593,
"step": 2269
},
{
"epoch": 2.718562874251497,
"grad_norm": 0.16239735482843187,
"learning_rate": 5.235137533274179e-06,
"loss": 0.1803,
"step": 2270
},
{
"epoch": 2.7197604790419163,
"grad_norm": 0.152623040146997,
"learning_rate": 5.212954747116238e-06,
"loss": 0.1716,
"step": 2271
},
{
"epoch": 2.7209580838323353,
"grad_norm": 0.14937279550850685,
"learning_rate": 5.190771960958297e-06,
"loss": 0.1558,
"step": 2272
},
{
"epoch": 2.7221556886227543,
"grad_norm": 0.1500705294948104,
"learning_rate": 5.168589174800355e-06,
"loss": 0.155,
"step": 2273
},
{
"epoch": 2.7233532934131737,
"grad_norm": 0.15621507235776813,
"learning_rate": 5.146406388642414e-06,
"loss": 0.1629,
"step": 2274
},
{
"epoch": 2.724550898203593,
"grad_norm": 0.1649723147392917,
"learning_rate": 5.124223602484472e-06,
"loss": 0.1843,
"step": 2275
},
{
"epoch": 2.725748502994012,
"grad_norm": 0.15316203128334488,
"learning_rate": 5.102040816326531e-06,
"loss": 0.1602,
"step": 2276
},
{
"epoch": 2.726946107784431,
"grad_norm": 0.14999607410917354,
"learning_rate": 5.07985803016859e-06,
"loss": 0.1564,
"step": 2277
},
{
"epoch": 2.7281437125748504,
"grad_norm": 0.15324525044791265,
"learning_rate": 5.0576752440106475e-06,
"loss": 0.1661,
"step": 2278
},
{
"epoch": 2.7293413173652694,
"grad_norm": 0.14893386940326622,
"learning_rate": 5.035492457852707e-06,
"loss": 0.16,
"step": 2279
},
{
"epoch": 2.730538922155689,
"grad_norm": 0.15146185586195643,
"learning_rate": 5.013309671694765e-06,
"loss": 0.1498,
"step": 2280
},
{
"epoch": 2.731736526946108,
"grad_norm": 0.1632041780621683,
"learning_rate": 4.991126885536824e-06,
"loss": 0.1623,
"step": 2281
},
{
"epoch": 2.7329341317365268,
"grad_norm": 0.16085592485365646,
"learning_rate": 4.968944099378882e-06,
"loss": 0.1557,
"step": 2282
},
{
"epoch": 2.734131736526946,
"grad_norm": 0.1655419104685232,
"learning_rate": 4.9467613132209405e-06,
"loss": 0.1881,
"step": 2283
},
{
"epoch": 2.735329341317365,
"grad_norm": 0.16131149099187114,
"learning_rate": 4.924578527063e-06,
"loss": 0.1736,
"step": 2284
},
{
"epoch": 2.7365269461077846,
"grad_norm": 0.17396082016286837,
"learning_rate": 4.902395740905058e-06,
"loss": 0.1668,
"step": 2285
},
{
"epoch": 2.7377245508982035,
"grad_norm": 0.15242431999755487,
"learning_rate": 4.880212954747117e-06,
"loss": 0.1555,
"step": 2286
},
{
"epoch": 2.7389221556886225,
"grad_norm": 0.1518580841565282,
"learning_rate": 4.858030168589175e-06,
"loss": 0.1751,
"step": 2287
},
{
"epoch": 2.740119760479042,
"grad_norm": 0.15694265150163686,
"learning_rate": 4.8358473824312334e-06,
"loss": 0.1547,
"step": 2288
},
{
"epoch": 2.7413173652694613,
"grad_norm": 0.1523554667788311,
"learning_rate": 4.813664596273292e-06,
"loss": 0.1612,
"step": 2289
},
{
"epoch": 2.7425149700598803,
"grad_norm": 0.24847167198776163,
"learning_rate": 4.791481810115351e-06,
"loss": 0.1698,
"step": 2290
},
{
"epoch": 2.7437125748502993,
"grad_norm": 0.1594353146629418,
"learning_rate": 4.769299023957409e-06,
"loss": 0.1657,
"step": 2291
},
{
"epoch": 2.7449101796407187,
"grad_norm": 0.15437653258061684,
"learning_rate": 4.747116237799468e-06,
"loss": 0.1714,
"step": 2292
},
{
"epoch": 2.7461077844311377,
"grad_norm": 0.14853411132979533,
"learning_rate": 4.724933451641526e-06,
"loss": 0.1541,
"step": 2293
},
{
"epoch": 2.747305389221557,
"grad_norm": 0.15422209729559222,
"learning_rate": 4.702750665483585e-06,
"loss": 0.1665,
"step": 2294
},
{
"epoch": 2.748502994011976,
"grad_norm": 0.15738148818400333,
"learning_rate": 4.680567879325644e-06,
"loss": 0.1702,
"step": 2295
},
{
"epoch": 2.749700598802395,
"grad_norm": 0.1538758371930384,
"learning_rate": 4.658385093167702e-06,
"loss": 0.1761,
"step": 2296
},
{
"epoch": 2.7508982035928145,
"grad_norm": 0.15628389859431616,
"learning_rate": 4.636202307009761e-06,
"loss": 0.1663,
"step": 2297
},
{
"epoch": 2.7520958083832334,
"grad_norm": 0.14502437281061017,
"learning_rate": 4.614019520851819e-06,
"loss": 0.1574,
"step": 2298
},
{
"epoch": 2.753293413173653,
"grad_norm": 0.16233199939193133,
"learning_rate": 4.591836734693878e-06,
"loss": 0.1702,
"step": 2299
},
{
"epoch": 2.754491017964072,
"grad_norm": 0.15972310242491547,
"learning_rate": 4.569653948535936e-06,
"loss": 0.1784,
"step": 2300
},
{
"epoch": 2.755688622754491,
"grad_norm": 0.1506429159436433,
"learning_rate": 4.547471162377995e-06,
"loss": 0.1729,
"step": 2301
},
{
"epoch": 2.75688622754491,
"grad_norm": 0.15338452528227162,
"learning_rate": 4.525288376220053e-06,
"loss": 0.1736,
"step": 2302
},
{
"epoch": 2.7580838323353296,
"grad_norm": 0.1511572433742401,
"learning_rate": 4.503105590062112e-06,
"loss": 0.1652,
"step": 2303
},
{
"epoch": 2.7592814371257486,
"grad_norm": 0.15822457211596141,
"learning_rate": 4.480922803904171e-06,
"loss": 0.1636,
"step": 2304
},
{
"epoch": 2.7604790419161676,
"grad_norm": 0.14958053346530678,
"learning_rate": 4.458740017746229e-06,
"loss": 0.1694,
"step": 2305
},
{
"epoch": 2.761676646706587,
"grad_norm": 0.1460795354065669,
"learning_rate": 4.436557231588288e-06,
"loss": 0.1575,
"step": 2306
},
{
"epoch": 2.762874251497006,
"grad_norm": 0.15506015829114328,
"learning_rate": 4.414374445430346e-06,
"loss": 0.1612,
"step": 2307
},
{
"epoch": 2.7640718562874254,
"grad_norm": 0.16051238557718325,
"learning_rate": 4.392191659272405e-06,
"loss": 0.1606,
"step": 2308
},
{
"epoch": 2.7652694610778443,
"grad_norm": 0.15655756255808934,
"learning_rate": 4.370008873114463e-06,
"loss": 0.1751,
"step": 2309
},
{
"epoch": 2.7664670658682633,
"grad_norm": 0.1555229659133124,
"learning_rate": 4.347826086956522e-06,
"loss": 0.1677,
"step": 2310
},
{
"epoch": 2.7676646706586827,
"grad_norm": 0.16156035377280015,
"learning_rate": 4.325643300798581e-06,
"loss": 0.1789,
"step": 2311
},
{
"epoch": 2.7688622754491017,
"grad_norm": 0.16120359103151474,
"learning_rate": 4.303460514640639e-06,
"loss": 0.1714,
"step": 2312
},
{
"epoch": 2.770059880239521,
"grad_norm": 0.1661353321939627,
"learning_rate": 4.2812777284826975e-06,
"loss": 0.1747,
"step": 2313
},
{
"epoch": 2.77125748502994,
"grad_norm": 0.1582910330883366,
"learning_rate": 4.259094942324756e-06,
"loss": 0.1728,
"step": 2314
},
{
"epoch": 2.772455089820359,
"grad_norm": 0.15438769177462286,
"learning_rate": 4.236912156166814e-06,
"loss": 0.1542,
"step": 2315
},
{
"epoch": 2.7736526946107785,
"grad_norm": 0.15842760459293426,
"learning_rate": 4.214729370008874e-06,
"loss": 0.1658,
"step": 2316
},
{
"epoch": 2.774850299401198,
"grad_norm": 0.14721678206011019,
"learning_rate": 4.192546583850932e-06,
"loss": 0.1484,
"step": 2317
},
{
"epoch": 2.776047904191617,
"grad_norm": 0.15595292587345724,
"learning_rate": 4.1703637976929905e-06,
"loss": 0.1649,
"step": 2318
},
{
"epoch": 2.777245508982036,
"grad_norm": 0.1507099071708286,
"learning_rate": 4.148181011535049e-06,
"loss": 0.1729,
"step": 2319
},
{
"epoch": 2.7784431137724552,
"grad_norm": 0.1487095285406687,
"learning_rate": 4.125998225377107e-06,
"loss": 0.1761,
"step": 2320
},
{
"epoch": 2.779640718562874,
"grad_norm": 0.15501056113802347,
"learning_rate": 4.103815439219167e-06,
"loss": 0.173,
"step": 2321
},
{
"epoch": 2.7808383233532936,
"grad_norm": 0.15642105923534197,
"learning_rate": 4.081632653061224e-06,
"loss": 0.1667,
"step": 2322
},
{
"epoch": 2.7820359281437126,
"grad_norm": 0.14652043574891976,
"learning_rate": 4.0594498669032834e-06,
"loss": 0.1595,
"step": 2323
},
{
"epoch": 2.7832335329341316,
"grad_norm": 0.16223046288658283,
"learning_rate": 4.037267080745342e-06,
"loss": 0.1704,
"step": 2324
},
{
"epoch": 2.784431137724551,
"grad_norm": 0.15937222388994127,
"learning_rate": 4.0150842945874e-06,
"loss": 0.169,
"step": 2325
},
{
"epoch": 2.78562874251497,
"grad_norm": 0.1459637966699843,
"learning_rate": 3.9929015084294596e-06,
"loss": 0.1695,
"step": 2326
},
{
"epoch": 2.7868263473053894,
"grad_norm": 0.16032365800179044,
"learning_rate": 3.970718722271517e-06,
"loss": 0.1552,
"step": 2327
},
{
"epoch": 2.7880239520958083,
"grad_norm": 0.16636245689622084,
"learning_rate": 3.9485359361135756e-06,
"loss": 0.1769,
"step": 2328
},
{
"epoch": 2.7892215568862273,
"grad_norm": 0.15606122438031664,
"learning_rate": 3.926353149955635e-06,
"loss": 0.1608,
"step": 2329
},
{
"epoch": 2.7904191616766467,
"grad_norm": 0.15462211624675187,
"learning_rate": 3.904170363797693e-06,
"loss": 0.1634,
"step": 2330
},
{
"epoch": 2.791616766467066,
"grad_norm": 0.1544234155501237,
"learning_rate": 3.881987577639752e-06,
"loss": 0.1657,
"step": 2331
},
{
"epoch": 2.792814371257485,
"grad_norm": 0.15143467302367955,
"learning_rate": 3.85980479148181e-06,
"loss": 0.1607,
"step": 2332
},
{
"epoch": 2.794011976047904,
"grad_norm": 0.15794870273037478,
"learning_rate": 3.8376220053238685e-06,
"loss": 0.15,
"step": 2333
},
{
"epoch": 2.7952095808383235,
"grad_norm": 0.1477895496090608,
"learning_rate": 3.815439219165928e-06,
"loss": 0.1693,
"step": 2334
},
{
"epoch": 2.7964071856287425,
"grad_norm": 0.1657528236511207,
"learning_rate": 3.7932564330079862e-06,
"loss": 0.1684,
"step": 2335
},
{
"epoch": 2.797604790419162,
"grad_norm": 0.15002075018092972,
"learning_rate": 3.7710736468500443e-06,
"loss": 0.1551,
"step": 2336
},
{
"epoch": 2.798802395209581,
"grad_norm": 0.1612910909469956,
"learning_rate": 3.748890860692103e-06,
"loss": 0.1669,
"step": 2337
},
{
"epoch": 2.8,
"grad_norm": 0.1617641096027572,
"learning_rate": 3.726708074534162e-06,
"loss": 0.1704,
"step": 2338
},
{
"epoch": 2.8011976047904192,
"grad_norm": 0.15980544828434465,
"learning_rate": 3.7045252883762204e-06,
"loss": 0.1639,
"step": 2339
},
{
"epoch": 2.802395209580838,
"grad_norm": 0.2486171770143427,
"learning_rate": 3.6823425022182784e-06,
"loss": 0.1757,
"step": 2340
},
{
"epoch": 2.8035928143712576,
"grad_norm": 0.15119865335627908,
"learning_rate": 3.6601597160603372e-06,
"loss": 0.1604,
"step": 2341
},
{
"epoch": 2.8047904191616766,
"grad_norm": 0.15732866313065286,
"learning_rate": 3.637976929902396e-06,
"loss": 0.1608,
"step": 2342
},
{
"epoch": 2.8059880239520956,
"grad_norm": 0.1548031507751122,
"learning_rate": 3.6157941437444545e-06,
"loss": 0.164,
"step": 2343
},
{
"epoch": 2.807185628742515,
"grad_norm": 0.15131276012342876,
"learning_rate": 3.5936113575865134e-06,
"loss": 0.1642,
"step": 2344
},
{
"epoch": 2.8083832335329344,
"grad_norm": 0.14274141752314592,
"learning_rate": 3.5714285714285714e-06,
"loss": 0.1524,
"step": 2345
},
{
"epoch": 2.8095808383233534,
"grad_norm": 0.15274141476079933,
"learning_rate": 3.54924578527063e-06,
"loss": 0.1591,
"step": 2346
},
{
"epoch": 2.8107784431137723,
"grad_norm": 0.15968801720981932,
"learning_rate": 3.5270629991126886e-06,
"loss": 0.1686,
"step": 2347
},
{
"epoch": 2.8119760479041918,
"grad_norm": 0.15919991320351876,
"learning_rate": 3.5048802129547475e-06,
"loss": 0.177,
"step": 2348
},
{
"epoch": 2.8131736526946107,
"grad_norm": 0.1596924420731482,
"learning_rate": 3.4826974267968055e-06,
"loss": 0.1722,
"step": 2349
},
{
"epoch": 2.81437125748503,
"grad_norm": 0.1624818651080605,
"learning_rate": 3.4605146406388643e-06,
"loss": 0.1712,
"step": 2350
},
{
"epoch": 2.815568862275449,
"grad_norm": 0.1515267647397139,
"learning_rate": 3.438331854480923e-06,
"loss": 0.1605,
"step": 2351
},
{
"epoch": 2.816766467065868,
"grad_norm": 0.15267845572872016,
"learning_rate": 3.4161490683229816e-06,
"loss": 0.1626,
"step": 2352
},
{
"epoch": 2.8179640718562875,
"grad_norm": 0.15775431230457926,
"learning_rate": 3.3939662821650396e-06,
"loss": 0.171,
"step": 2353
},
{
"epoch": 2.8191616766467065,
"grad_norm": 0.16053429824098345,
"learning_rate": 3.3717834960070985e-06,
"loss": 0.1797,
"step": 2354
},
{
"epoch": 2.820359281437126,
"grad_norm": 0.15398082866386786,
"learning_rate": 3.3496007098491573e-06,
"loss": 0.1694,
"step": 2355
},
{
"epoch": 2.821556886227545,
"grad_norm": 0.15173647988779385,
"learning_rate": 3.3274179236912157e-06,
"loss": 0.1514,
"step": 2356
},
{
"epoch": 2.822754491017964,
"grad_norm": 0.14742513261857312,
"learning_rate": 3.3052351375332746e-06,
"loss": 0.1544,
"step": 2357
},
{
"epoch": 2.8239520958083832,
"grad_norm": 0.1463555561318012,
"learning_rate": 3.2830523513753326e-06,
"loss": 0.1684,
"step": 2358
},
{
"epoch": 2.8251497005988027,
"grad_norm": 0.15324452175503275,
"learning_rate": 3.2608695652173914e-06,
"loss": 0.152,
"step": 2359
},
{
"epoch": 2.8263473053892216,
"grad_norm": 0.15839309638149288,
"learning_rate": 3.2386867790594503e-06,
"loss": 0.168,
"step": 2360
},
{
"epoch": 2.8275449101796406,
"grad_norm": 0.1565424189017417,
"learning_rate": 3.2165039929015087e-06,
"loss": 0.1581,
"step": 2361
},
{
"epoch": 2.82874251497006,
"grad_norm": 0.15906800123977574,
"learning_rate": 3.1943212067435667e-06,
"loss": 0.1757,
"step": 2362
},
{
"epoch": 2.829940119760479,
"grad_norm": 0.15350569605836123,
"learning_rate": 3.1721384205856256e-06,
"loss": 0.1629,
"step": 2363
},
{
"epoch": 2.8311377245508984,
"grad_norm": 0.1542344530424924,
"learning_rate": 3.1499556344276844e-06,
"loss": 0.1667,
"step": 2364
},
{
"epoch": 2.8323353293413174,
"grad_norm": 0.1632097709817799,
"learning_rate": 3.127772848269743e-06,
"loss": 0.1615,
"step": 2365
},
{
"epoch": 2.8335329341317363,
"grad_norm": 0.2759798897286628,
"learning_rate": 3.1055900621118013e-06,
"loss": 0.1465,
"step": 2366
},
{
"epoch": 2.8347305389221558,
"grad_norm": 0.16501392209530175,
"learning_rate": 3.08340727595386e-06,
"loss": 0.1684,
"step": 2367
},
{
"epoch": 2.8359281437125747,
"grad_norm": 0.150362180915444,
"learning_rate": 3.0612244897959185e-06,
"loss": 0.1631,
"step": 2368
},
{
"epoch": 2.837125748502994,
"grad_norm": 0.15765435489883994,
"learning_rate": 3.039041703637977e-06,
"loss": 0.1585,
"step": 2369
},
{
"epoch": 2.838323353293413,
"grad_norm": 0.16672787406643091,
"learning_rate": 3.0168589174800354e-06,
"loss": 0.1782,
"step": 2370
},
{
"epoch": 2.839520958083832,
"grad_norm": 0.15126682415531673,
"learning_rate": 2.9946761313220943e-06,
"loss": 0.1505,
"step": 2371
},
{
"epoch": 2.8407185628742515,
"grad_norm": 0.17436322005457827,
"learning_rate": 2.9724933451641527e-06,
"loss": 0.1436,
"step": 2372
},
{
"epoch": 2.841916167664671,
"grad_norm": 0.15024684507969982,
"learning_rate": 2.9503105590062115e-06,
"loss": 0.1655,
"step": 2373
},
{
"epoch": 2.84311377245509,
"grad_norm": 0.16047932748256566,
"learning_rate": 2.9281277728482695e-06,
"loss": 0.1787,
"step": 2374
},
{
"epoch": 2.844311377245509,
"grad_norm": 0.15103204531919842,
"learning_rate": 2.9059449866903284e-06,
"loss": 0.163,
"step": 2375
},
{
"epoch": 2.8455089820359283,
"grad_norm": 0.15622657765674852,
"learning_rate": 2.8837622005323872e-06,
"loss": 0.1721,
"step": 2376
},
{
"epoch": 2.8467065868263473,
"grad_norm": 0.15827601175338635,
"learning_rate": 2.8615794143744457e-06,
"loss": 0.1766,
"step": 2377
},
{
"epoch": 2.8479041916167667,
"grad_norm": 0.1589667656147383,
"learning_rate": 2.839396628216504e-06,
"loss": 0.1674,
"step": 2378
},
{
"epoch": 2.8491017964071856,
"grad_norm": 0.14752924016259306,
"learning_rate": 2.8172138420585625e-06,
"loss": 0.1515,
"step": 2379
},
{
"epoch": 2.8502994011976046,
"grad_norm": 0.15741230734667452,
"learning_rate": 2.7950310559006214e-06,
"loss": 0.1613,
"step": 2380
},
{
"epoch": 2.851497005988024,
"grad_norm": 0.26759548526545046,
"learning_rate": 2.7728482697426798e-06,
"loss": 0.1802,
"step": 2381
},
{
"epoch": 2.852694610778443,
"grad_norm": 0.15800870918977497,
"learning_rate": 2.750665483584738e-06,
"loss": 0.1797,
"step": 2382
},
{
"epoch": 2.8538922155688624,
"grad_norm": 0.15629999064904856,
"learning_rate": 2.7284826974267966e-06,
"loss": 0.1549,
"step": 2383
},
{
"epoch": 2.8550898203592814,
"grad_norm": 0.158833461776742,
"learning_rate": 2.7062999112688555e-06,
"loss": 0.177,
"step": 2384
},
{
"epoch": 2.8562874251497004,
"grad_norm": 0.15079080777275036,
"learning_rate": 2.6841171251109143e-06,
"loss": 0.1724,
"step": 2385
},
{
"epoch": 2.8574850299401198,
"grad_norm": 0.15427314977687417,
"learning_rate": 2.6619343389529728e-06,
"loss": 0.1716,
"step": 2386
},
{
"epoch": 2.858682634730539,
"grad_norm": 0.15442598484907957,
"learning_rate": 2.639751552795031e-06,
"loss": 0.1801,
"step": 2387
},
{
"epoch": 2.859880239520958,
"grad_norm": 0.1645733133484981,
"learning_rate": 2.6175687666370896e-06,
"loss": 0.1567,
"step": 2388
},
{
"epoch": 2.861077844311377,
"grad_norm": 0.1541387341457926,
"learning_rate": 2.5953859804791485e-06,
"loss": 0.1623,
"step": 2389
},
{
"epoch": 2.8622754491017965,
"grad_norm": 0.16792329636467032,
"learning_rate": 2.573203194321207e-06,
"loss": 0.1825,
"step": 2390
},
{
"epoch": 2.8634730538922155,
"grad_norm": 0.15385912017072007,
"learning_rate": 2.5510204081632653e-06,
"loss": 0.1575,
"step": 2391
},
{
"epoch": 2.864670658682635,
"grad_norm": 0.15306709527427942,
"learning_rate": 2.5288376220053237e-06,
"loss": 0.163,
"step": 2392
},
{
"epoch": 2.865868263473054,
"grad_norm": 0.26459037233323646,
"learning_rate": 2.5066548358473826e-06,
"loss": 0.1695,
"step": 2393
},
{
"epoch": 2.867065868263473,
"grad_norm": 0.15617821687861033,
"learning_rate": 2.484472049689441e-06,
"loss": 0.1626,
"step": 2394
},
{
"epoch": 2.8682634730538923,
"grad_norm": 0.15168993620851076,
"learning_rate": 2.4622892635315e-06,
"loss": 0.1557,
"step": 2395
},
{
"epoch": 2.8694610778443113,
"grad_norm": 0.15973799939210573,
"learning_rate": 2.4401064773735583e-06,
"loss": 0.1969,
"step": 2396
},
{
"epoch": 2.8706586826347307,
"grad_norm": 0.16401521151559123,
"learning_rate": 2.4179236912156167e-06,
"loss": 0.1717,
"step": 2397
},
{
"epoch": 2.8718562874251496,
"grad_norm": 0.15547674935374287,
"learning_rate": 2.3957409050576756e-06,
"loss": 0.1708,
"step": 2398
},
{
"epoch": 2.8730538922155686,
"grad_norm": 0.16146558464553395,
"learning_rate": 2.373558118899734e-06,
"loss": 0.1848,
"step": 2399
},
{
"epoch": 2.874251497005988,
"grad_norm": 2.1625754264227326,
"learning_rate": 2.3513753327417924e-06,
"loss": 0.1673,
"step": 2400
},
{
"epoch": 2.8754491017964074,
"grad_norm": 0.15567931379057331,
"learning_rate": 2.329192546583851e-06,
"loss": 0.1746,
"step": 2401
},
{
"epoch": 2.8766467065868264,
"grad_norm": 0.16094181352032821,
"learning_rate": 2.3070097604259097e-06,
"loss": 0.165,
"step": 2402
},
{
"epoch": 2.8778443113772454,
"grad_norm": 0.16085932574242376,
"learning_rate": 2.284826974267968e-06,
"loss": 0.1747,
"step": 2403
},
{
"epoch": 2.879041916167665,
"grad_norm": 0.1628590008591137,
"learning_rate": 2.2626441881100266e-06,
"loss": 0.1802,
"step": 2404
},
{
"epoch": 2.8802395209580838,
"grad_norm": 0.1538531245260321,
"learning_rate": 2.2404614019520854e-06,
"loss": 0.152,
"step": 2405
},
{
"epoch": 2.881437125748503,
"grad_norm": 0.1475864091772814,
"learning_rate": 2.218278615794144e-06,
"loss": 0.1584,
"step": 2406
},
{
"epoch": 2.882634730538922,
"grad_norm": 0.1642856735828716,
"learning_rate": 2.1960958296362027e-06,
"loss": 0.1749,
"step": 2407
},
{
"epoch": 2.883832335329341,
"grad_norm": 0.1529653692736844,
"learning_rate": 2.173913043478261e-06,
"loss": 0.1677,
"step": 2408
},
{
"epoch": 2.8850299401197605,
"grad_norm": 0.22698971062747098,
"learning_rate": 2.1517302573203195e-06,
"loss": 0.163,
"step": 2409
},
{
"epoch": 2.8862275449101795,
"grad_norm": 0.14803147303120975,
"learning_rate": 2.129547471162378e-06,
"loss": 0.163,
"step": 2410
},
{
"epoch": 2.887425149700599,
"grad_norm": 0.1530048330582273,
"learning_rate": 2.107364685004437e-06,
"loss": 0.176,
"step": 2411
},
{
"epoch": 2.888622754491018,
"grad_norm": 0.15226556037686878,
"learning_rate": 2.0851818988464952e-06,
"loss": 0.1736,
"step": 2412
},
{
"epoch": 2.889820359281437,
"grad_norm": 0.14743729549583917,
"learning_rate": 2.0629991126885537e-06,
"loss": 0.1576,
"step": 2413
},
{
"epoch": 2.8910179640718563,
"grad_norm": 0.1566241447840284,
"learning_rate": 2.040816326530612e-06,
"loss": 0.1673,
"step": 2414
},
{
"epoch": 2.8922155688622757,
"grad_norm": 0.1503996267285887,
"learning_rate": 2.018633540372671e-06,
"loss": 0.1577,
"step": 2415
},
{
"epoch": 2.8934131736526947,
"grad_norm": 0.1573293421814861,
"learning_rate": 1.9964507542147298e-06,
"loss": 0.1783,
"step": 2416
},
{
"epoch": 2.8946107784431137,
"grad_norm": 0.1566617226334367,
"learning_rate": 1.9742679680567878e-06,
"loss": 0.158,
"step": 2417
},
{
"epoch": 2.895808383233533,
"grad_norm": 0.15396974147046918,
"learning_rate": 1.9520851818988466e-06,
"loss": 0.1679,
"step": 2418
},
{
"epoch": 2.897005988023952,
"grad_norm": 0.1482063743558714,
"learning_rate": 1.929902395740905e-06,
"loss": 0.1618,
"step": 2419
},
{
"epoch": 2.8982035928143715,
"grad_norm": 0.15079007981253204,
"learning_rate": 1.907719609582964e-06,
"loss": 0.1735,
"step": 2420
},
{
"epoch": 2.8994011976047904,
"grad_norm": 0.15624103588162605,
"learning_rate": 1.8855368234250221e-06,
"loss": 0.1578,
"step": 2421
},
{
"epoch": 2.9005988023952094,
"grad_norm": 0.15390984172974145,
"learning_rate": 1.863354037267081e-06,
"loss": 0.1656,
"step": 2422
},
{
"epoch": 2.901796407185629,
"grad_norm": 0.16132264647349293,
"learning_rate": 1.8411712511091392e-06,
"loss": 0.1652,
"step": 2423
},
{
"epoch": 2.902994011976048,
"grad_norm": 0.1465273793843091,
"learning_rate": 1.818988464951198e-06,
"loss": 0.1466,
"step": 2424
},
{
"epoch": 2.904191616766467,
"grad_norm": 0.16063715268350545,
"learning_rate": 1.7968056787932567e-06,
"loss": 0.1779,
"step": 2425
},
{
"epoch": 2.905389221556886,
"grad_norm": 0.14918852734547722,
"learning_rate": 1.774622892635315e-06,
"loss": 0.1553,
"step": 2426
},
{
"epoch": 2.906586826347305,
"grad_norm": 0.14615642644596394,
"learning_rate": 1.7524401064773737e-06,
"loss": 0.1712,
"step": 2427
},
{
"epoch": 2.9077844311377246,
"grad_norm": 0.14912878104624255,
"learning_rate": 1.7302573203194322e-06,
"loss": 0.1596,
"step": 2428
},
{
"epoch": 2.908982035928144,
"grad_norm": 0.14765254594533814,
"learning_rate": 1.7080745341614908e-06,
"loss": 0.1653,
"step": 2429
},
{
"epoch": 2.910179640718563,
"grad_norm": 0.15158827965389962,
"learning_rate": 1.6858917480035492e-06,
"loss": 0.1788,
"step": 2430
},
{
"epoch": 2.911377245508982,
"grad_norm": 0.1483533745261271,
"learning_rate": 1.6637089618456079e-06,
"loss": 0.1737,
"step": 2431
},
{
"epoch": 2.9125748502994013,
"grad_norm": 0.16188961432467408,
"learning_rate": 1.6415261756876663e-06,
"loss": 0.1659,
"step": 2432
},
{
"epoch": 2.9137724550898203,
"grad_norm": 0.14920376957371742,
"learning_rate": 1.6193433895297251e-06,
"loss": 0.1627,
"step": 2433
},
{
"epoch": 2.9149700598802397,
"grad_norm": 0.16438907794408017,
"learning_rate": 1.5971606033717834e-06,
"loss": 0.1658,
"step": 2434
},
{
"epoch": 2.9161676646706587,
"grad_norm": 0.15996644709341604,
"learning_rate": 1.5749778172138422e-06,
"loss": 0.153,
"step": 2435
},
{
"epoch": 2.9173652694610777,
"grad_norm": 0.14538606764643053,
"learning_rate": 1.5527950310559006e-06,
"loss": 0.1573,
"step": 2436
},
{
"epoch": 2.918562874251497,
"grad_norm": 0.1447454783869211,
"learning_rate": 1.5306122448979593e-06,
"loss": 0.1547,
"step": 2437
},
{
"epoch": 2.919760479041916,
"grad_norm": 0.14654490679145535,
"learning_rate": 1.5084294587400177e-06,
"loss": 0.1735,
"step": 2438
},
{
"epoch": 2.9209580838323355,
"grad_norm": 0.16752530759508666,
"learning_rate": 1.4862466725820763e-06,
"loss": 0.1682,
"step": 2439
},
{
"epoch": 2.9221556886227544,
"grad_norm": 0.16504001260355902,
"learning_rate": 1.4640638864241348e-06,
"loss": 0.1699,
"step": 2440
},
{
"epoch": 2.9233532934131734,
"grad_norm": 0.14429260367557692,
"learning_rate": 1.4418811002661936e-06,
"loss": 0.1622,
"step": 2441
},
{
"epoch": 2.924550898203593,
"grad_norm": 0.17635591115792582,
"learning_rate": 1.419698314108252e-06,
"loss": 0.1829,
"step": 2442
},
{
"epoch": 2.9257485029940122,
"grad_norm": 0.1493420050492301,
"learning_rate": 1.3975155279503107e-06,
"loss": 0.1572,
"step": 2443
},
{
"epoch": 2.926946107784431,
"grad_norm": 0.1577844776125497,
"learning_rate": 1.375332741792369e-06,
"loss": 0.1819,
"step": 2444
},
{
"epoch": 2.92814371257485,
"grad_norm": 0.1513933878009785,
"learning_rate": 1.3531499556344277e-06,
"loss": 0.162,
"step": 2445
},
{
"epoch": 2.9293413173652696,
"grad_norm": 0.15212494707292973,
"learning_rate": 1.3309671694764864e-06,
"loss": 0.1532,
"step": 2446
},
{
"epoch": 2.9305389221556886,
"grad_norm": 0.20552890677122546,
"learning_rate": 1.3087843833185448e-06,
"loss": 0.1641,
"step": 2447
},
{
"epoch": 2.931736526946108,
"grad_norm": 0.48726514824627415,
"learning_rate": 1.2866015971606034e-06,
"loss": 0.1681,
"step": 2448
},
{
"epoch": 2.932934131736527,
"grad_norm": 0.14763712432989942,
"learning_rate": 1.2644188110026619e-06,
"loss": 0.1606,
"step": 2449
},
{
"epoch": 2.934131736526946,
"grad_norm": 0.15101575582097196,
"learning_rate": 1.2422360248447205e-06,
"loss": 0.1551,
"step": 2450
},
{
"epoch": 2.9353293413173653,
"grad_norm": 0.15849285092073018,
"learning_rate": 1.2200532386867791e-06,
"loss": 0.178,
"step": 2451
},
{
"epoch": 2.9365269461077843,
"grad_norm": 0.15443095561721884,
"learning_rate": 1.1978704525288378e-06,
"loss": 0.1596,
"step": 2452
},
{
"epoch": 2.9377245508982037,
"grad_norm": 0.15652921527293723,
"learning_rate": 1.1756876663708962e-06,
"loss": 0.159,
"step": 2453
},
{
"epoch": 2.9389221556886227,
"grad_norm": 0.15710086881586738,
"learning_rate": 1.1535048802129548e-06,
"loss": 0.1572,
"step": 2454
},
{
"epoch": 2.9401197604790417,
"grad_norm": 0.1567008611122271,
"learning_rate": 1.1313220940550133e-06,
"loss": 0.1644,
"step": 2455
},
{
"epoch": 2.941317365269461,
"grad_norm": 0.14728098629986186,
"learning_rate": 1.109139307897072e-06,
"loss": 0.1436,
"step": 2456
},
{
"epoch": 2.9425149700598805,
"grad_norm": 0.5013978779333401,
"learning_rate": 1.0869565217391306e-06,
"loss": 0.1596,
"step": 2457
},
{
"epoch": 2.9437125748502995,
"grad_norm": 0.14726572952070655,
"learning_rate": 1.064773735581189e-06,
"loss": 0.1615,
"step": 2458
},
{
"epoch": 2.9449101796407184,
"grad_norm": 0.15474404995609142,
"learning_rate": 1.0425909494232476e-06,
"loss": 0.1624,
"step": 2459
},
{
"epoch": 2.946107784431138,
"grad_norm": 0.14608618855773164,
"learning_rate": 1.020408163265306e-06,
"loss": 0.1515,
"step": 2460
},
{
"epoch": 2.947305389221557,
"grad_norm": 0.15329347428913515,
"learning_rate": 9.982253771073649e-07,
"loss": 0.1639,
"step": 2461
},
{
"epoch": 2.9485029940119762,
"grad_norm": 0.16072572928204998,
"learning_rate": 9.760425909494233e-07,
"loss": 0.1622,
"step": 2462
},
{
"epoch": 2.949700598802395,
"grad_norm": 0.14564057637562797,
"learning_rate": 9.53859804791482e-07,
"loss": 0.1655,
"step": 2463
},
{
"epoch": 2.950898203592814,
"grad_norm": 0.14334373658928418,
"learning_rate": 9.316770186335405e-07,
"loss": 0.1584,
"step": 2464
},
{
"epoch": 2.9520958083832336,
"grad_norm": 0.15501660972801634,
"learning_rate": 9.09494232475599e-07,
"loss": 0.1666,
"step": 2465
},
{
"epoch": 2.9532934131736526,
"grad_norm": 0.14903969688030513,
"learning_rate": 8.873114463176576e-07,
"loss": 0.1764,
"step": 2466
},
{
"epoch": 2.954491017964072,
"grad_norm": 0.17224725533237156,
"learning_rate": 8.651286601597161e-07,
"loss": 0.1575,
"step": 2467
},
{
"epoch": 2.955688622754491,
"grad_norm": 0.15706473089312847,
"learning_rate": 8.429458740017746e-07,
"loss": 0.1658,
"step": 2468
},
{
"epoch": 2.95688622754491,
"grad_norm": 0.1499230090401328,
"learning_rate": 8.207630878438331e-07,
"loss": 0.1668,
"step": 2469
},
{
"epoch": 2.9580838323353293,
"grad_norm": 0.14502046666457774,
"learning_rate": 7.985803016858917e-07,
"loss": 0.1671,
"step": 2470
},
{
"epoch": 2.9592814371257488,
"grad_norm": 0.14612007378600134,
"learning_rate": 7.763975155279503e-07,
"loss": 0.1583,
"step": 2471
},
{
"epoch": 2.9604790419161677,
"grad_norm": 0.1537455844117843,
"learning_rate": 7.542147293700089e-07,
"loss": 0.1733,
"step": 2472
},
{
"epoch": 2.9616766467065867,
"grad_norm": 0.16122698939036798,
"learning_rate": 7.320319432120674e-07,
"loss": 0.1752,
"step": 2473
},
{
"epoch": 2.962874251497006,
"grad_norm": 0.1483356386465962,
"learning_rate": 7.09849157054126e-07,
"loss": 0.1689,
"step": 2474
},
{
"epoch": 2.964071856287425,
"grad_norm": 0.15534034218798232,
"learning_rate": 6.876663708961846e-07,
"loss": 0.183,
"step": 2475
},
{
"epoch": 2.9652694610778445,
"grad_norm": 0.14387663270384457,
"learning_rate": 6.654835847382432e-07,
"loss": 0.149,
"step": 2476
},
{
"epoch": 2.9664670658682635,
"grad_norm": 0.15418572934413097,
"learning_rate": 6.433007985803017e-07,
"loss": 0.1828,
"step": 2477
},
{
"epoch": 2.9676646706586824,
"grad_norm": 0.15421582612110082,
"learning_rate": 6.211180124223603e-07,
"loss": 0.1614,
"step": 2478
},
{
"epoch": 2.968862275449102,
"grad_norm": 0.15669063548762793,
"learning_rate": 5.989352262644189e-07,
"loss": 0.1591,
"step": 2479
},
{
"epoch": 2.970059880239521,
"grad_norm": 0.16491295532159322,
"learning_rate": 5.767524401064774e-07,
"loss": 0.1864,
"step": 2480
},
{
"epoch": 2.9712574850299402,
"grad_norm": 0.15375315249237606,
"learning_rate": 5.54569653948536e-07,
"loss": 0.1705,
"step": 2481
},
{
"epoch": 2.972455089820359,
"grad_norm": 0.1586911751279606,
"learning_rate": 5.323868677905945e-07,
"loss": 0.1763,
"step": 2482
},
{
"epoch": 2.973652694610778,
"grad_norm": 0.1466150066366294,
"learning_rate": 5.10204081632653e-07,
"loss": 0.1598,
"step": 2483
},
{
"epoch": 2.9748502994011976,
"grad_norm": 0.14539227261304494,
"learning_rate": 4.880212954747117e-07,
"loss": 0.1729,
"step": 2484
},
{
"epoch": 2.976047904191617,
"grad_norm": 0.1433179111656557,
"learning_rate": 4.6583850931677024e-07,
"loss": 0.1765,
"step": 2485
},
{
"epoch": 2.977245508982036,
"grad_norm": 0.1481380177019813,
"learning_rate": 4.436557231588288e-07,
"loss": 0.171,
"step": 2486
},
{
"epoch": 2.978443113772455,
"grad_norm": 0.1551766543218047,
"learning_rate": 4.214729370008873e-07,
"loss": 0.17,
"step": 2487
},
{
"epoch": 2.9796407185628744,
"grad_norm": 0.14534377680821142,
"learning_rate": 3.9929015084294584e-07,
"loss": 0.1625,
"step": 2488
},
{
"epoch": 2.9808383233532934,
"grad_norm": 0.14719585970814497,
"learning_rate": 3.771073646850044e-07,
"loss": 0.1621,
"step": 2489
},
{
"epoch": 2.9820359281437128,
"grad_norm": 0.15071952344451522,
"learning_rate": 3.54924578527063e-07,
"loss": 0.1718,
"step": 2490
},
{
"epoch": 2.9832335329341317,
"grad_norm": 0.15323507163224095,
"learning_rate": 3.327417923691216e-07,
"loss": 0.1553,
"step": 2491
},
{
"epoch": 2.9844311377245507,
"grad_norm": 0.14626544919770315,
"learning_rate": 3.1055900621118013e-07,
"loss": 0.1554,
"step": 2492
},
{
"epoch": 2.98562874251497,
"grad_norm": 0.15094373815863943,
"learning_rate": 2.883762200532387e-07,
"loss": 0.1745,
"step": 2493
},
{
"epoch": 2.986826347305389,
"grad_norm": 0.15761591147709722,
"learning_rate": 2.6619343389529724e-07,
"loss": 0.1595,
"step": 2494
},
{
"epoch": 2.9880239520958085,
"grad_norm": 0.15278489123759498,
"learning_rate": 2.4401064773735583e-07,
"loss": 0.1546,
"step": 2495
},
{
"epoch": 2.9892215568862275,
"grad_norm": 0.14783384475507502,
"learning_rate": 2.218278615794144e-07,
"loss": 0.1747,
"step": 2496
},
{
"epoch": 2.9904191616766465,
"grad_norm": 0.15430071031394896,
"learning_rate": 1.9964507542147292e-07,
"loss": 0.1513,
"step": 2497
},
{
"epoch": 2.991616766467066,
"grad_norm": 0.16287096724902003,
"learning_rate": 1.774622892635315e-07,
"loss": 0.1825,
"step": 2498
},
{
"epoch": 2.9928143712574853,
"grad_norm": 0.1493248260510814,
"learning_rate": 1.5527950310559006e-07,
"loss": 0.1581,
"step": 2499
},
{
"epoch": 2.9940119760479043,
"grad_norm": 0.1496358207908914,
"learning_rate": 1.3309671694764862e-07,
"loss": 0.1685,
"step": 2500
},
{
"epoch": 2.9952095808383232,
"grad_norm": 0.1553298717498303,
"learning_rate": 1.109139307897072e-07,
"loss": 0.181,
"step": 2501
},
{
"epoch": 2.9964071856287426,
"grad_norm": 0.14889291118121548,
"learning_rate": 8.873114463176575e-08,
"loss": 0.1563,
"step": 2502
},
{
"epoch": 2.9976047904191616,
"grad_norm": 0.14239627480049186,
"learning_rate": 6.654835847382431e-08,
"loss": 0.1688,
"step": 2503
},
{
"epoch": 2.998802395209581,
"grad_norm": 0.14622572567583675,
"learning_rate": 4.4365572315882876e-08,
"loss": 0.1584,
"step": 2504
},
{
"epoch": 3.0,
"grad_norm": 0.13936265128307954,
"learning_rate": 2.2182786157941438e-08,
"loss": 0.1514,
"step": 2505
},
{
"epoch": 3.0,
"step": 2505,
"total_flos": 2.785964053938307e+19,
"train_loss": 0.3213875060071964,
"train_runtime": 71417.1033,
"train_samples_per_second": 0.561,
"train_steps_per_second": 0.035
}
],
"logging_steps": 1,
"max_steps": 2505,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.785964053938307e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}