1355 / trainer_state.json
iamPi's picture
Add files using upload-large-folder tool
677a7e4 verified
Raw
History Blame Contribute Delete
250 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1355,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007380073800738007,
"grad_norm": 52.96585464477539,
"learning_rate": 0.0,
"loss": 7.329597473144531,
"step": 1
},
{
"epoch": 0.0014760147601476014,
"grad_norm": 42.55315017700195,
"learning_rate": 1.4e-05,
"loss": 7.168418884277344,
"step": 2
},
{
"epoch": 0.002214022140221402,
"grad_norm": 21.52372169494629,
"learning_rate": 2.8e-05,
"loss": 6.947352409362793,
"step": 3
},
{
"epoch": 0.002952029520295203,
"grad_norm": 19.89319610595703,
"learning_rate": 4.2e-05,
"loss": 6.611477851867676,
"step": 4
},
{
"epoch": 0.0036900369003690036,
"grad_norm": 12.127403259277344,
"learning_rate": 5.6e-05,
"loss": 6.690403938293457,
"step": 5
},
{
"epoch": 0.004428044280442804,
"grad_norm": 11.600789070129395,
"learning_rate": 7.000000000000001e-05,
"loss": 6.540159225463867,
"step": 6
},
{
"epoch": 0.0051660516605166054,
"grad_norm": 8.64883804321289,
"learning_rate": 8.4e-05,
"loss": 6.4675188064575195,
"step": 7
},
{
"epoch": 0.005904059040590406,
"grad_norm": 8.694304466247559,
"learning_rate": 9.800000000000001e-05,
"loss": 6.344979286193848,
"step": 8
},
{
"epoch": 0.006642066420664207,
"grad_norm": 8.474891662597656,
"learning_rate": 0.000112,
"loss": 6.483427047729492,
"step": 9
},
{
"epoch": 0.007380073800738007,
"grad_norm": 8.267909049987793,
"learning_rate": 0.000126,
"loss": 6.328839302062988,
"step": 10
},
{
"epoch": 0.008118081180811807,
"grad_norm": 6.391619682312012,
"learning_rate": 0.00014000000000000001,
"loss": 6.344330787658691,
"step": 11
},
{
"epoch": 0.008856088560885609,
"grad_norm": 4.2130842208862305,
"learning_rate": 0.000154,
"loss": 6.2279744148254395,
"step": 12
},
{
"epoch": 0.00959409594095941,
"grad_norm": 4.580661296844482,
"learning_rate": 0.000168,
"loss": 6.267205715179443,
"step": 13
},
{
"epoch": 0.010332103321033211,
"grad_norm": 4.600402355194092,
"learning_rate": 0.000182,
"loss": 6.177546501159668,
"step": 14
},
{
"epoch": 0.01107011070110701,
"grad_norm": 6.000468730926514,
"learning_rate": 0.00019600000000000002,
"loss": 6.123383522033691,
"step": 15
},
{
"epoch": 0.011808118081180811,
"grad_norm": 6.2054548263549805,
"learning_rate": 0.00020999999999999998,
"loss": 6.029158592224121,
"step": 16
},
{
"epoch": 0.012546125461254613,
"grad_norm": 5.766181945800781,
"learning_rate": 0.000224,
"loss": 6.146026611328125,
"step": 17
},
{
"epoch": 0.013284132841328414,
"grad_norm": 3.5282742977142334,
"learning_rate": 0.000238,
"loss": 6.347329139709473,
"step": 18
},
{
"epoch": 0.014022140221402213,
"grad_norm": 10.378168106079102,
"learning_rate": 0.000252,
"loss": 5.982622146606445,
"step": 19
},
{
"epoch": 0.014760147601476014,
"grad_norm": 6.26217794418335,
"learning_rate": 0.000266,
"loss": 6.232936859130859,
"step": 20
},
{
"epoch": 0.015498154981549815,
"grad_norm": 3.640542984008789,
"learning_rate": 0.00028000000000000003,
"loss": 6.282479763031006,
"step": 21
},
{
"epoch": 0.016236162361623615,
"grad_norm": 4.074864864349365,
"learning_rate": 0.000294,
"loss": 6.1197309494018555,
"step": 22
},
{
"epoch": 0.016974169741697416,
"grad_norm": 4.15755558013916,
"learning_rate": 0.000308,
"loss": 6.1190900802612305,
"step": 23
},
{
"epoch": 0.017712177121771217,
"grad_norm": 8.528851509094238,
"learning_rate": 0.000322,
"loss": 6.108499050140381,
"step": 24
},
{
"epoch": 0.01845018450184502,
"grad_norm": 4.248746395111084,
"learning_rate": 0.000336,
"loss": 5.993032932281494,
"step": 25
},
{
"epoch": 0.01918819188191882,
"grad_norm": 5.643017292022705,
"learning_rate": 0.00035,
"loss": 6.173605918884277,
"step": 26
},
{
"epoch": 0.01992619926199262,
"grad_norm": 3.0032365322113037,
"learning_rate": 0.000364,
"loss": 5.9087629318237305,
"step": 27
},
{
"epoch": 0.020664206642066422,
"grad_norm": 6.890568733215332,
"learning_rate": 0.000378,
"loss": 6.026180267333984,
"step": 28
},
{
"epoch": 0.021402214022140223,
"grad_norm": 4.55826473236084,
"learning_rate": 0.00039200000000000004,
"loss": 6.302541732788086,
"step": 29
},
{
"epoch": 0.02214022140221402,
"grad_norm": 5.366292953491211,
"learning_rate": 0.00040599999999999995,
"loss": 6.086678981781006,
"step": 30
},
{
"epoch": 0.022878228782287822,
"grad_norm": 2.9198176860809326,
"learning_rate": 0.00041999999999999996,
"loss": 6.034950256347656,
"step": 31
},
{
"epoch": 0.023616236162361623,
"grad_norm": 3.0416109561920166,
"learning_rate": 0.000434,
"loss": 5.959887981414795,
"step": 32
},
{
"epoch": 0.024354243542435424,
"grad_norm": 3.6983375549316406,
"learning_rate": 0.000448,
"loss": 5.958649635314941,
"step": 33
},
{
"epoch": 0.025092250922509225,
"grad_norm": 3.3332769870758057,
"learning_rate": 0.000462,
"loss": 6.053283214569092,
"step": 34
},
{
"epoch": 0.025830258302583026,
"grad_norm": 4.3135857582092285,
"learning_rate": 0.000476,
"loss": 5.938570499420166,
"step": 35
},
{
"epoch": 0.026568265682656828,
"grad_norm": 3.9662985801696777,
"learning_rate": 0.00049,
"loss": 6.1224799156188965,
"step": 36
},
{
"epoch": 0.02730627306273063,
"grad_norm": 3.4459118843078613,
"learning_rate": 0.000504,
"loss": 6.048614501953125,
"step": 37
},
{
"epoch": 0.028044280442804426,
"grad_norm": 4.011275768280029,
"learning_rate": 0.000518,
"loss": 6.095024108886719,
"step": 38
},
{
"epoch": 0.028782287822878228,
"grad_norm": 4.109455108642578,
"learning_rate": 0.000532,
"loss": 6.097041130065918,
"step": 39
},
{
"epoch": 0.02952029520295203,
"grad_norm": 2.0187416076660156,
"learning_rate": 0.000546,
"loss": 5.855551719665527,
"step": 40
},
{
"epoch": 0.03025830258302583,
"grad_norm": 4.543977737426758,
"learning_rate": 0.0005600000000000001,
"loss": 5.990810394287109,
"step": 41
},
{
"epoch": 0.03099630996309963,
"grad_norm": 3.6285860538482666,
"learning_rate": 0.000574,
"loss": 6.1089982986450195,
"step": 42
},
{
"epoch": 0.03173431734317343,
"grad_norm": 2.802408218383789,
"learning_rate": 0.000588,
"loss": 6.059175491333008,
"step": 43
},
{
"epoch": 0.03247232472324723,
"grad_norm": 5.055509090423584,
"learning_rate": 0.000602,
"loss": 6.0541791915893555,
"step": 44
},
{
"epoch": 0.033210332103321034,
"grad_norm": 5.420635223388672,
"learning_rate": 0.000616,
"loss": 5.914989471435547,
"step": 45
},
{
"epoch": 0.03394833948339483,
"grad_norm": 3.779264211654663,
"learning_rate": 0.00063,
"loss": 5.772123336791992,
"step": 46
},
{
"epoch": 0.03468634686346864,
"grad_norm": 4.194505214691162,
"learning_rate": 0.000644,
"loss": 6.127632141113281,
"step": 47
},
{
"epoch": 0.035424354243542434,
"grad_norm": 2.183096170425415,
"learning_rate": 0.000658,
"loss": 5.898839950561523,
"step": 48
},
{
"epoch": 0.03616236162361624,
"grad_norm": 3.0196142196655273,
"learning_rate": 0.000672,
"loss": 5.775443077087402,
"step": 49
},
{
"epoch": 0.03690036900369004,
"grad_norm": 4.503098011016846,
"learning_rate": 0.000686,
"loss": 5.992775917053223,
"step": 50
},
{
"epoch": 0.037638376383763834,
"grad_norm": 2.646671772003174,
"learning_rate": 0.0007,
"loss": 5.891811370849609,
"step": 51
},
{
"epoch": 0.03837638376383764,
"grad_norm": 4.828780651092529,
"learning_rate": 0.0006999989858164525,
"loss": 5.944026947021484,
"step": 52
},
{
"epoch": 0.03911439114391144,
"grad_norm": 5.056863784790039,
"learning_rate": 0.0006999959432716873,
"loss": 6.092121601104736,
"step": 53
},
{
"epoch": 0.03985239852398524,
"grad_norm": 2.9205923080444336,
"learning_rate": 0.0006999908723833372,
"loss": 6.128796577453613,
"step": 54
},
{
"epoch": 0.04059040590405904,
"grad_norm": 2.503229856491089,
"learning_rate": 0.0006999837731807897,
"loss": 5.857043266296387,
"step": 55
},
{
"epoch": 0.041328413284132844,
"grad_norm": 2.815605640411377,
"learning_rate": 0.0006999746457051868,
"loss": 5.79864501953125,
"step": 56
},
{
"epoch": 0.04206642066420664,
"grad_norm": 2.630692481994629,
"learning_rate": 0.0006999634900094256,
"loss": 6.038992881774902,
"step": 57
},
{
"epoch": 0.042804428044280446,
"grad_norm": 2.103322982788086,
"learning_rate": 0.0006999503061581567,
"loss": 5.8827619552612305,
"step": 58
},
{
"epoch": 0.043542435424354244,
"grad_norm": 4.4402265548706055,
"learning_rate": 0.0006999350942277852,
"loss": 6.193219184875488,
"step": 59
},
{
"epoch": 0.04428044280442804,
"grad_norm": 2.784449815750122,
"learning_rate": 0.0006999178543064694,
"loss": 5.896166801452637,
"step": 60
},
{
"epoch": 0.045018450184501846,
"grad_norm": 2.158843755722046,
"learning_rate": 0.0006998985864941203,
"loss": 5.9794487953186035,
"step": 61
},
{
"epoch": 0.045756457564575644,
"grad_norm": 3.826530933380127,
"learning_rate": 0.0006998772909024012,
"loss": 5.747754096984863,
"step": 62
},
{
"epoch": 0.04649446494464945,
"grad_norm": 1.7147290706634521,
"learning_rate": 0.0006998539676547274,
"loss": 5.834345817565918,
"step": 63
},
{
"epoch": 0.047232472324723246,
"grad_norm": 2.6357598304748535,
"learning_rate": 0.0006998286168862646,
"loss": 5.970273017883301,
"step": 64
},
{
"epoch": 0.04797047970479705,
"grad_norm": 2.0640320777893066,
"learning_rate": 0.0006998012387439294,
"loss": 6.20042610168457,
"step": 65
},
{
"epoch": 0.04870848708487085,
"grad_norm": 1.840738296508789,
"learning_rate": 0.0006997718333863869,
"loss": 5.69251823425293,
"step": 66
},
{
"epoch": 0.04944649446494465,
"grad_norm": 1.5103991031646729,
"learning_rate": 0.0006997404009840512,
"loss": 5.718031883239746,
"step": 67
},
{
"epoch": 0.05018450184501845,
"grad_norm": 2.454057455062866,
"learning_rate": 0.0006997069417190837,
"loss": 5.718637466430664,
"step": 68
},
{
"epoch": 0.05092250922509225,
"grad_norm": 1.7299764156341553,
"learning_rate": 0.0006996714557853919,
"loss": 5.874034404754639,
"step": 69
},
{
"epoch": 0.05166051660516605,
"grad_norm": 1.983879566192627,
"learning_rate": 0.0006996339433886285,
"loss": 5.866864204406738,
"step": 70
},
{
"epoch": 0.05239852398523985,
"grad_norm": 1.7243304252624512,
"learning_rate": 0.0006995944047461907,
"loss": 5.6140642166137695,
"step": 71
},
{
"epoch": 0.053136531365313655,
"grad_norm": 2.1467807292938232,
"learning_rate": 0.0006995528400872179,
"loss": 5.7456207275390625,
"step": 72
},
{
"epoch": 0.05387453874538745,
"grad_norm": 1.8860361576080322,
"learning_rate": 0.0006995092496525912,
"loss": 5.868312835693359,
"step": 73
},
{
"epoch": 0.05461254612546126,
"grad_norm": 1.9977107048034668,
"learning_rate": 0.000699463633694932,
"loss": 5.8535919189453125,
"step": 74
},
{
"epoch": 0.055350553505535055,
"grad_norm": 1.6792536973953247,
"learning_rate": 0.0006994159924785998,
"loss": 5.957564353942871,
"step": 75
},
{
"epoch": 0.05608856088560885,
"grad_norm": 1.83674955368042,
"learning_rate": 0.0006993663262796917,
"loss": 5.801642894744873,
"step": 76
},
{
"epoch": 0.05682656826568266,
"grad_norm": 1.7811754941940308,
"learning_rate": 0.0006993146353860395,
"loss": 5.8649001121521,
"step": 77
},
{
"epoch": 0.057564575645756455,
"grad_norm": 2.130631446838379,
"learning_rate": 0.0006992609200972095,
"loss": 5.959519386291504,
"step": 78
},
{
"epoch": 0.05830258302583026,
"grad_norm": 1.914402961730957,
"learning_rate": 0.0006992051807244997,
"loss": 5.6643877029418945,
"step": 79
},
{
"epoch": 0.05904059040590406,
"grad_norm": 2.480494737625122,
"learning_rate": 0.0006991474175909385,
"loss": 5.705104827880859,
"step": 80
},
{
"epoch": 0.05977859778597786,
"grad_norm": 1.6274583339691162,
"learning_rate": 0.0006990876310312825,
"loss": 5.786376953125,
"step": 81
},
{
"epoch": 0.06051660516605166,
"grad_norm": 3.1301629543304443,
"learning_rate": 0.0006990258213920147,
"loss": 5.652984142303467,
"step": 82
},
{
"epoch": 0.061254612546125464,
"grad_norm": 1.7219048738479614,
"learning_rate": 0.0006989619890313428,
"loss": 5.684242248535156,
"step": 83
},
{
"epoch": 0.06199261992619926,
"grad_norm": 2.016432046890259,
"learning_rate": 0.0006988961343191968,
"loss": 5.919116973876953,
"step": 84
},
{
"epoch": 0.06273062730627306,
"grad_norm": 1.9674099683761597,
"learning_rate": 0.0006988282576372264,
"loss": 5.706339359283447,
"step": 85
},
{
"epoch": 0.06346863468634686,
"grad_norm": 1.6389487981796265,
"learning_rate": 0.0006987583593788001,
"loss": 6.144864082336426,
"step": 86
},
{
"epoch": 0.06420664206642067,
"grad_norm": 1.9105358123779297,
"learning_rate": 0.0006986864399490014,
"loss": 5.812554359436035,
"step": 87
},
{
"epoch": 0.06494464944649446,
"grad_norm": 2.2395148277282715,
"learning_rate": 0.0006986124997646276,
"loss": 5.818288803100586,
"step": 88
},
{
"epoch": 0.06568265682656826,
"grad_norm": 1.4297045469284058,
"learning_rate": 0.0006985365392541869,
"loss": 5.988651275634766,
"step": 89
},
{
"epoch": 0.06642066420664207,
"grad_norm": 2.393372058868408,
"learning_rate": 0.0006984585588578955,
"loss": 5.834245681762695,
"step": 90
},
{
"epoch": 0.06715867158671587,
"grad_norm": 1.7424498796463013,
"learning_rate": 0.0006983785590276763,
"loss": 5.847927570343018,
"step": 91
},
{
"epoch": 0.06789667896678966,
"grad_norm": 1.7180150747299194,
"learning_rate": 0.0006982965402271549,
"loss": 5.745847702026367,
"step": 92
},
{
"epoch": 0.06863468634686347,
"grad_norm": 1.5406665802001953,
"learning_rate": 0.0006982125029316576,
"loss": 5.680943012237549,
"step": 93
},
{
"epoch": 0.06937269372693727,
"grad_norm": 1.9634002447128296,
"learning_rate": 0.0006981264476282089,
"loss": 5.660253524780273,
"step": 94
},
{
"epoch": 0.07011070110701106,
"grad_norm": 1.7053471803665161,
"learning_rate": 0.0006980383748155278,
"loss": 5.777673721313477,
"step": 95
},
{
"epoch": 0.07084870848708487,
"grad_norm": 1.8611632585525513,
"learning_rate": 0.0006979482850040258,
"loss": 5.753267288208008,
"step": 96
},
{
"epoch": 0.07158671586715867,
"grad_norm": 2.236954689025879,
"learning_rate": 0.0006978561787158036,
"loss": 5.762792587280273,
"step": 97
},
{
"epoch": 0.07232472324723248,
"grad_norm": 1.5513856410980225,
"learning_rate": 0.0006977620564846479,
"loss": 5.847312927246094,
"step": 98
},
{
"epoch": 0.07306273062730627,
"grad_norm": 1.6298314332962036,
"learning_rate": 0.0006976659188560285,
"loss": 5.568481922149658,
"step": 99
},
{
"epoch": 0.07380073800738007,
"grad_norm": 1.6806327104568481,
"learning_rate": 0.0006975677663870951,
"loss": 5.746288776397705,
"step": 100
},
{
"epoch": 0.07453874538745388,
"grad_norm": 1.5393524169921875,
"learning_rate": 0.0006974675996466741,
"loss": 5.562119960784912,
"step": 101
},
{
"epoch": 0.07527675276752767,
"grad_norm": 1.3935660123825073,
"learning_rate": 0.0006973654192152653,
"loss": 5.655695915222168,
"step": 102
},
{
"epoch": 0.07601476014760147,
"grad_norm": 1.8384559154510498,
"learning_rate": 0.0006972612256850385,
"loss": 5.717691421508789,
"step": 103
},
{
"epoch": 0.07675276752767528,
"grad_norm": 1.4056777954101562,
"learning_rate": 0.00069715501965983,
"loss": 5.4914422035217285,
"step": 104
},
{
"epoch": 0.07749077490774908,
"grad_norm": 1.5063185691833496,
"learning_rate": 0.0006970468017551393,
"loss": 5.804128170013428,
"step": 105
},
{
"epoch": 0.07822878228782287,
"grad_norm": 1.5670958757400513,
"learning_rate": 0.0006969365725981253,
"loss": 5.555459976196289,
"step": 106
},
{
"epoch": 0.07896678966789668,
"grad_norm": 1.4736913442611694,
"learning_rate": 0.000696824332827603,
"loss": 5.734355926513672,
"step": 107
},
{
"epoch": 0.07970479704797048,
"grad_norm": 1.2875981330871582,
"learning_rate": 0.0006967100830940393,
"loss": 5.615688800811768,
"step": 108
},
{
"epoch": 0.08044280442804429,
"grad_norm": 1.6725730895996094,
"learning_rate": 0.0006965938240595497,
"loss": 5.705436706542969,
"step": 109
},
{
"epoch": 0.08118081180811808,
"grad_norm": 1.2822149991989136,
"learning_rate": 0.000696475556397894,
"loss": 5.77439022064209,
"step": 110
},
{
"epoch": 0.08191881918819188,
"grad_norm": 1.5231584310531616,
"learning_rate": 0.0006963552807944731,
"loss": 5.540444374084473,
"step": 111
},
{
"epoch": 0.08265682656826569,
"grad_norm": 1.3938168287277222,
"learning_rate": 0.0006962329979463242,
"loss": 5.578408241271973,
"step": 112
},
{
"epoch": 0.08339483394833948,
"grad_norm": 1.80418062210083,
"learning_rate": 0.0006961087085621174,
"loss": 5.822021484375,
"step": 113
},
{
"epoch": 0.08413284132841328,
"grad_norm": 1.3559857606887817,
"learning_rate": 0.0006959824133621514,
"loss": 5.527395248413086,
"step": 114
},
{
"epoch": 0.08487084870848709,
"grad_norm": 1.6934373378753662,
"learning_rate": 0.0006958541130783489,
"loss": 5.64322566986084,
"step": 115
},
{
"epoch": 0.08560885608856089,
"grad_norm": 2.645036220550537,
"learning_rate": 0.0006957238084542531,
"loss": 5.786375999450684,
"step": 116
},
{
"epoch": 0.08634686346863468,
"grad_norm": 1.7617570161819458,
"learning_rate": 0.0006955915002450227,
"loss": 5.706923484802246,
"step": 117
},
{
"epoch": 0.08708487084870849,
"grad_norm": 1.4721003770828247,
"learning_rate": 0.0006954571892174282,
"loss": 5.816807746887207,
"step": 118
},
{
"epoch": 0.08782287822878229,
"grad_norm": 1.4024418592453003,
"learning_rate": 0.0006953208761498471,
"loss": 5.504622459411621,
"step": 119
},
{
"epoch": 0.08856088560885608,
"grad_norm": 1.1762746572494507,
"learning_rate": 0.0006951825618322589,
"loss": 5.638977527618408,
"step": 120
},
{
"epoch": 0.08929889298892989,
"grad_norm": 1.4858025312423706,
"learning_rate": 0.0006950422470662416,
"loss": 5.7883405685424805,
"step": 121
},
{
"epoch": 0.09003690036900369,
"grad_norm": 1.197791576385498,
"learning_rate": 0.0006948999326649661,
"loss": 5.5270586013793945,
"step": 122
},
{
"epoch": 0.0907749077490775,
"grad_norm": 1.280106782913208,
"learning_rate": 0.000694755619453192,
"loss": 5.614171504974365,
"step": 123
},
{
"epoch": 0.09151291512915129,
"grad_norm": 1.1635382175445557,
"learning_rate": 0.0006946093082672625,
"loss": 5.714271545410156,
"step": 124
},
{
"epoch": 0.09225092250922509,
"grad_norm": 1.5833303928375244,
"learning_rate": 0.0006944609999551001,
"loss": 5.534208297729492,
"step": 125
},
{
"epoch": 0.0929889298892989,
"grad_norm": 1.2109582424163818,
"learning_rate": 0.0006943106953762009,
"loss": 5.419297218322754,
"step": 126
},
{
"epoch": 0.09372693726937269,
"grad_norm": 1.551060676574707,
"learning_rate": 0.0006941583954016304,
"loss": 5.700986385345459,
"step": 127
},
{
"epoch": 0.09446494464944649,
"grad_norm": 1.1270159482955933,
"learning_rate": 0.0006940041009140178,
"loss": 5.61196231842041,
"step": 128
},
{
"epoch": 0.0952029520295203,
"grad_norm": 1.288231372833252,
"learning_rate": 0.0006938478128075513,
"loss": 5.599189758300781,
"step": 129
},
{
"epoch": 0.0959409594095941,
"grad_norm": 1.7800358533859253,
"learning_rate": 0.0006936895319879727,
"loss": 5.359455108642578,
"step": 130
},
{
"epoch": 0.09667896678966789,
"grad_norm": 1.5556919574737549,
"learning_rate": 0.0006935292593725724,
"loss": 5.530261516571045,
"step": 131
},
{
"epoch": 0.0974169741697417,
"grad_norm": 1.737862229347229,
"learning_rate": 0.0006933669958901836,
"loss": 5.362129211425781,
"step": 132
},
{
"epoch": 0.0981549815498155,
"grad_norm": 1.5239074230194092,
"learning_rate": 0.0006932027424811779,
"loss": 5.559414863586426,
"step": 133
},
{
"epoch": 0.0988929889298893,
"grad_norm": 1.206781029701233,
"learning_rate": 0.0006930365000974584,
"loss": 5.415935516357422,
"step": 134
},
{
"epoch": 0.0996309963099631,
"grad_norm": 1.5241954326629639,
"learning_rate": 0.0006928682697024555,
"loss": 5.514790058135986,
"step": 135
},
{
"epoch": 0.1003690036900369,
"grad_norm": 1.7540452480316162,
"learning_rate": 0.0006926980522711204,
"loss": 5.370218276977539,
"step": 136
},
{
"epoch": 0.1011070110701107,
"grad_norm": 1.4406752586364746,
"learning_rate": 0.0006925258487899203,
"loss": 5.334672451019287,
"step": 137
},
{
"epoch": 0.1018450184501845,
"grad_norm": 1.2946128845214844,
"learning_rate": 0.000692351660256832,
"loss": 5.602551460266113,
"step": 138
},
{
"epoch": 0.1025830258302583,
"grad_norm": 1.2579693794250488,
"learning_rate": 0.0006921754876813361,
"loss": 5.522645473480225,
"step": 139
},
{
"epoch": 0.1033210332103321,
"grad_norm": 1.2886651754379272,
"learning_rate": 0.0006919973320844118,
"loss": 5.577740669250488,
"step": 140
},
{
"epoch": 0.10405904059040591,
"grad_norm": 1.0571826696395874,
"learning_rate": 0.0006918171944985303,
"loss": 5.557397842407227,
"step": 141
},
{
"epoch": 0.1047970479704797,
"grad_norm": 1.4176267385482788,
"learning_rate": 0.0006916350759676493,
"loss": 5.38129997253418,
"step": 142
},
{
"epoch": 0.1055350553505535,
"grad_norm": 1.2939625978469849,
"learning_rate": 0.0006914509775472065,
"loss": 5.3804121017456055,
"step": 143
},
{
"epoch": 0.10627306273062731,
"grad_norm": 1.3399301767349243,
"learning_rate": 0.0006912649003041137,
"loss": 5.509670734405518,
"step": 144
},
{
"epoch": 0.1070110701107011,
"grad_norm": 1.1282126903533936,
"learning_rate": 0.000691076845316751,
"loss": 5.5377583503723145,
"step": 145
},
{
"epoch": 0.1077490774907749,
"grad_norm": 1.372504711151123,
"learning_rate": 0.00069088681367496,
"loss": 5.6342878341674805,
"step": 146
},
{
"epoch": 0.10848708487084871,
"grad_norm": 1.4673429727554321,
"learning_rate": 0.0006906948064800376,
"loss": 5.346056938171387,
"step": 147
},
{
"epoch": 0.10922509225092251,
"grad_norm": 1.4786832332611084,
"learning_rate": 0.0006905008248447296,
"loss": 5.530672073364258,
"step": 148
},
{
"epoch": 0.1099630996309963,
"grad_norm": 1.14403235912323,
"learning_rate": 0.0006903048698932245,
"loss": 5.126125812530518,
"step": 149
},
{
"epoch": 0.11070110701107011,
"grad_norm": 1.4274934530258179,
"learning_rate": 0.0006901069427611469,
"loss": 5.36081600189209,
"step": 150
},
{
"epoch": 0.11143911439114391,
"grad_norm": 1.224621295928955,
"learning_rate": 0.0006899070445955507,
"loss": 5.192722797393799,
"step": 151
},
{
"epoch": 0.1121771217712177,
"grad_norm": 1.1289647817611694,
"learning_rate": 0.0006897051765549127,
"loss": 5.438913822174072,
"step": 152
},
{
"epoch": 0.11291512915129151,
"grad_norm": 1.3115386962890625,
"learning_rate": 0.0006895013398091256,
"loss": 5.402008533477783,
"step": 153
},
{
"epoch": 0.11365313653136531,
"grad_norm": 1.4054917097091675,
"learning_rate": 0.0006892955355394918,
"loss": 5.593056678771973,
"step": 154
},
{
"epoch": 0.11439114391143912,
"grad_norm": 1.2027919292449951,
"learning_rate": 0.0006890877649387155,
"loss": 5.359673500061035,
"step": 155
},
{
"epoch": 0.11512915129151291,
"grad_norm": 1.1730295419692993,
"learning_rate": 0.0006888780292108971,
"loss": 5.578248023986816,
"step": 156
},
{
"epoch": 0.11586715867158671,
"grad_norm": 1.2120227813720703,
"learning_rate": 0.0006886663295715254,
"loss": 5.643091678619385,
"step": 157
},
{
"epoch": 0.11660516605166052,
"grad_norm": 1.2268054485321045,
"learning_rate": 0.0006884526672474704,
"loss": 5.381834030151367,
"step": 158
},
{
"epoch": 0.11734317343173432,
"grad_norm": 1.3834030628204346,
"learning_rate": 0.0006882370434769769,
"loss": 5.615821838378906,
"step": 159
},
{
"epoch": 0.11808118081180811,
"grad_norm": 1.7289725542068481,
"learning_rate": 0.0006880194595096567,
"loss": 5.346611499786377,
"step": 160
},
{
"epoch": 0.11881918819188192,
"grad_norm": 1.434497356414795,
"learning_rate": 0.0006877999166064817,
"loss": 5.427518844604492,
"step": 161
},
{
"epoch": 0.11955719557195572,
"grad_norm": 1.2287393808364868,
"learning_rate": 0.0006875784160397766,
"loss": 5.595153331756592,
"step": 162
},
{
"epoch": 0.12029520295202951,
"grad_norm": 1.327791690826416,
"learning_rate": 0.0006873549590932111,
"loss": 5.294317722320557,
"step": 163
},
{
"epoch": 0.12103321033210332,
"grad_norm": 1.358208179473877,
"learning_rate": 0.0006871295470617932,
"loss": 5.65151309967041,
"step": 164
},
{
"epoch": 0.12177121771217712,
"grad_norm": 1.1277738809585571,
"learning_rate": 0.0006869021812518607,
"loss": 5.721683979034424,
"step": 165
},
{
"epoch": 0.12250922509225093,
"grad_norm": 1.407368540763855,
"learning_rate": 0.0006866728629810749,
"loss": 5.473011016845703,
"step": 166
},
{
"epoch": 0.12324723247232472,
"grad_norm": 1.3105313777923584,
"learning_rate": 0.0006864415935784116,
"loss": 5.670052528381348,
"step": 167
},
{
"epoch": 0.12398523985239852,
"grad_norm": 1.4188215732574463,
"learning_rate": 0.0006862083743841545,
"loss": 5.493824005126953,
"step": 168
},
{
"epoch": 0.12472324723247233,
"grad_norm": 1.2717117071151733,
"learning_rate": 0.0006859732067498869,
"loss": 5.524445056915283,
"step": 169
},
{
"epoch": 0.12546125461254612,
"grad_norm": 1.1162827014923096,
"learning_rate": 0.0006857360920384839,
"loss": 5.39989709854126,
"step": 170
},
{
"epoch": 0.12619926199261994,
"grad_norm": 1.166066288948059,
"learning_rate": 0.0006854970316241045,
"loss": 5.495843887329102,
"step": 171
},
{
"epoch": 0.12693726937269373,
"grad_norm": 1.9042305946350098,
"learning_rate": 0.0006852560268921838,
"loss": 5.403502464294434,
"step": 172
},
{
"epoch": 0.12767527675276752,
"grad_norm": 1.0880268812179565,
"learning_rate": 0.0006850130792394249,
"loss": 5.439591407775879,
"step": 173
},
{
"epoch": 0.12841328413284134,
"grad_norm": 1.0691889524459839,
"learning_rate": 0.0006847681900737907,
"loss": 5.504947185516357,
"step": 174
},
{
"epoch": 0.12915129151291513,
"grad_norm": 1.2986247539520264,
"learning_rate": 0.0006845213608144958,
"loss": 5.43480920791626,
"step": 175
},
{
"epoch": 0.12988929889298892,
"grad_norm": 1.1326215267181396,
"learning_rate": 0.0006842725928919984,
"loss": 5.448299407958984,
"step": 176
},
{
"epoch": 0.13062730627306274,
"grad_norm": 1.1839748620986938,
"learning_rate": 0.0006840218877479918,
"loss": 5.370269775390625,
"step": 177
},
{
"epoch": 0.13136531365313653,
"grad_norm": 1.3466558456420898,
"learning_rate": 0.0006837692468353963,
"loss": 5.503698348999023,
"step": 178
},
{
"epoch": 0.13210332103321032,
"grad_norm": 1.2086361646652222,
"learning_rate": 0.0006835146716183503,
"loss": 5.3210554122924805,
"step": 179
},
{
"epoch": 0.13284132841328414,
"grad_norm": 1.0457011461257935,
"learning_rate": 0.0006832581635722026,
"loss": 5.430882930755615,
"step": 180
},
{
"epoch": 0.13357933579335793,
"grad_norm": 1.2964543104171753,
"learning_rate": 0.0006829997241835029,
"loss": 5.3685688972473145,
"step": 181
},
{
"epoch": 0.13431734317343175,
"grad_norm": 1.12661612033844,
"learning_rate": 0.0006827393549499941,
"loss": 5.366943359375,
"step": 182
},
{
"epoch": 0.13505535055350554,
"grad_norm": 1.4851716756820679,
"learning_rate": 0.0006824770573806029,
"loss": 5.4124755859375,
"step": 183
},
{
"epoch": 0.13579335793357933,
"grad_norm": 2.0913474559783936,
"learning_rate": 0.0006822128329954316,
"loss": 5.477243423461914,
"step": 184
},
{
"epoch": 0.13653136531365315,
"grad_norm": 1.6759217977523804,
"learning_rate": 0.0006819466833257487,
"loss": 5.315946578979492,
"step": 185
},
{
"epoch": 0.13726937269372694,
"grad_norm": 1.5114970207214355,
"learning_rate": 0.0006816786099139809,
"loss": 5.488532066345215,
"step": 186
},
{
"epoch": 0.13800738007380073,
"grad_norm": 1.229912519454956,
"learning_rate": 0.0006814086143137029,
"loss": 5.235088348388672,
"step": 187
},
{
"epoch": 0.13874538745387455,
"grad_norm": 1.1838656663894653,
"learning_rate": 0.0006811366980896299,
"loss": 5.650766372680664,
"step": 188
},
{
"epoch": 0.13948339483394834,
"grad_norm": 1.2359192371368408,
"learning_rate": 0.0006808628628176073,
"loss": 5.51072883605957,
"step": 189
},
{
"epoch": 0.14022140221402213,
"grad_norm": 1.2534209489822388,
"learning_rate": 0.0006805871100846018,
"loss": 5.4855170249938965,
"step": 190
},
{
"epoch": 0.14095940959409595,
"grad_norm": 1.1044737100601196,
"learning_rate": 0.0006803094414886932,
"loss": 5.416131973266602,
"step": 191
},
{
"epoch": 0.14169741697416974,
"grad_norm": 1.1578259468078613,
"learning_rate": 0.0006800298586390637,
"loss": 5.303211688995361,
"step": 192
},
{
"epoch": 0.14243542435424356,
"grad_norm": 1.2732160091400146,
"learning_rate": 0.0006797483631559893,
"loss": 5.596409320831299,
"step": 193
},
{
"epoch": 0.14317343173431735,
"grad_norm": 1.3185418844223022,
"learning_rate": 0.0006794649566708308,
"loss": 5.081386089324951,
"step": 194
},
{
"epoch": 0.14391143911439114,
"grad_norm": 1.2399559020996094,
"learning_rate": 0.0006791796408260233,
"loss": 5.367499828338623,
"step": 195
},
{
"epoch": 0.14464944649446496,
"grad_norm": 1.4244142770767212,
"learning_rate": 0.000678892417275068,
"loss": 5.420333385467529,
"step": 196
},
{
"epoch": 0.14538745387453875,
"grad_norm": 1.079671025276184,
"learning_rate": 0.000678603287682521,
"loss": 5.452577114105225,
"step": 197
},
{
"epoch": 0.14612546125461254,
"grad_norm": 1.2236963510513306,
"learning_rate": 0.0006783122537239852,
"loss": 5.477599143981934,
"step": 198
},
{
"epoch": 0.14686346863468636,
"grad_norm": 1.2248585224151611,
"learning_rate": 0.0006780193170860999,
"loss": 5.277920722961426,
"step": 199
},
{
"epoch": 0.14760147601476015,
"grad_norm": 1.1838936805725098,
"learning_rate": 0.0006777244794665307,
"loss": 5.3089447021484375,
"step": 200
},
{
"epoch": 0.14833948339483394,
"grad_norm": 1.0920487642288208,
"learning_rate": 0.0006774277425739603,
"loss": 5.312920570373535,
"step": 201
},
{
"epoch": 0.14907749077490776,
"grad_norm": 1.5156118869781494,
"learning_rate": 0.0006771291081280784,
"loss": 5.365443229675293,
"step": 202
},
{
"epoch": 0.14981549815498155,
"grad_norm": 1.1590790748596191,
"learning_rate": 0.0006768285778595714,
"loss": 5.726003646850586,
"step": 203
},
{
"epoch": 0.15055350553505534,
"grad_norm": 1.1078206300735474,
"learning_rate": 0.0006765261535101128,
"loss": 5.49555778503418,
"step": 204
},
{
"epoch": 0.15129151291512916,
"grad_norm": 1.1094913482666016,
"learning_rate": 0.0006762218368323528,
"loss": 5.463008880615234,
"step": 205
},
{
"epoch": 0.15202952029520295,
"grad_norm": 1.043042540550232,
"learning_rate": 0.0006759156295899086,
"loss": 5.329763889312744,
"step": 206
},
{
"epoch": 0.15276752767527677,
"grad_norm": 0.9944074153900146,
"learning_rate": 0.0006756075335573533,
"loss": 5.15687370300293,
"step": 207
},
{
"epoch": 0.15350553505535056,
"grad_norm": 1.320447564125061,
"learning_rate": 0.0006752975505202067,
"loss": 5.366092681884766,
"step": 208
},
{
"epoch": 0.15424354243542435,
"grad_norm": 0.9683417081832886,
"learning_rate": 0.0006749856822749241,
"loss": 5.286744117736816,
"step": 209
},
{
"epoch": 0.15498154981549817,
"grad_norm": 1.0429140329360962,
"learning_rate": 0.0006746719306288863,
"loss": 5.36182165145874,
"step": 210
},
{
"epoch": 0.15571955719557196,
"grad_norm": 0.9789266586303711,
"learning_rate": 0.0006743562974003891,
"loss": 5.401203155517578,
"step": 211
},
{
"epoch": 0.15645756457564575,
"grad_norm": 1.5062106847763062,
"learning_rate": 0.0006740387844186328,
"loss": 5.4269890785217285,
"step": 212
},
{
"epoch": 0.15719557195571957,
"grad_norm": 1.2152825593948364,
"learning_rate": 0.0006737193935237112,
"loss": 5.164780616760254,
"step": 213
},
{
"epoch": 0.15793357933579336,
"grad_norm": 1.0402345657348633,
"learning_rate": 0.0006733981265666012,
"loss": 5.193200588226318,
"step": 214
},
{
"epoch": 0.15867158671586715,
"grad_norm": 1.123574137687683,
"learning_rate": 0.0006730749854091528,
"loss": 5.191850185394287,
"step": 215
},
{
"epoch": 0.15940959409594097,
"grad_norm": 1.2188745737075806,
"learning_rate": 0.0006727499719240766,
"loss": 5.239185810089111,
"step": 216
},
{
"epoch": 0.16014760147601476,
"grad_norm": 1.1848610639572144,
"learning_rate": 0.0006724230879949348,
"loss": 5.381966590881348,
"step": 217
},
{
"epoch": 0.16088560885608857,
"grad_norm": 1.0746642351150513,
"learning_rate": 0.000672094335516129,
"loss": 5.212441444396973,
"step": 218
},
{
"epoch": 0.16162361623616237,
"grad_norm": 1.389511227607727,
"learning_rate": 0.0006717637163928899,
"loss": 5.391989707946777,
"step": 219
},
{
"epoch": 0.16236162361623616,
"grad_norm": 1.4301270246505737,
"learning_rate": 0.0006714312325412659,
"loss": 5.462432861328125,
"step": 220
},
{
"epoch": 0.16309963099630997,
"grad_norm": 0.9488272666931152,
"learning_rate": 0.000671096885888112,
"loss": 5.5252790451049805,
"step": 221
},
{
"epoch": 0.16383763837638377,
"grad_norm": 1.0613595247268677,
"learning_rate": 0.0006707606783710791,
"loss": 5.263217926025391,
"step": 222
},
{
"epoch": 0.16457564575645756,
"grad_norm": 1.04259192943573,
"learning_rate": 0.0006704226119386022,
"loss": 5.378625869750977,
"step": 223
},
{
"epoch": 0.16531365313653137,
"grad_norm": 1.0206512212753296,
"learning_rate": 0.0006700826885498893,
"loss": 5.315357208251953,
"step": 224
},
{
"epoch": 0.16605166051660517,
"grad_norm": 0.9734926819801331,
"learning_rate": 0.0006697409101749102,
"loss": 5.143043518066406,
"step": 225
},
{
"epoch": 0.16678966789667896,
"grad_norm": 1.2763937711715698,
"learning_rate": 0.0006693972787943851,
"loss": 5.372148513793945,
"step": 226
},
{
"epoch": 0.16752767527675277,
"grad_norm": 1.0929063558578491,
"learning_rate": 0.0006690517963997727,
"loss": 5.465537071228027,
"step": 227
},
{
"epoch": 0.16826568265682657,
"grad_norm": 1.098317265510559,
"learning_rate": 0.0006687044649932588,
"loss": 5.2183990478515625,
"step": 228
},
{
"epoch": 0.16900369003690036,
"grad_norm": 1.4758684635162354,
"learning_rate": 0.0006683552865877454,
"loss": 5.08128023147583,
"step": 229
},
{
"epoch": 0.16974169741697417,
"grad_norm": 1.425257921218872,
"learning_rate": 0.0006680042632068382,
"loss": 5.4712233543396,
"step": 230
},
{
"epoch": 0.17047970479704797,
"grad_norm": 1.0762962102890015,
"learning_rate": 0.000667651396884835,
"loss": 5.113118648529053,
"step": 231
},
{
"epoch": 0.17121771217712178,
"grad_norm": 1.028893232345581,
"learning_rate": 0.0006672966896667142,
"loss": 5.485983848571777,
"step": 232
},
{
"epoch": 0.17195571955719557,
"grad_norm": 0.9485227465629578,
"learning_rate": 0.0006669401436081229,
"loss": 5.1485090255737305,
"step": 233
},
{
"epoch": 0.17269372693726937,
"grad_norm": 1.146479606628418,
"learning_rate": 0.0006665817607753645,
"loss": 5.232944011688232,
"step": 234
},
{
"epoch": 0.17343173431734318,
"grad_norm": 1.1146334409713745,
"learning_rate": 0.0006662215432453878,
"loss": 5.381141662597656,
"step": 235
},
{
"epoch": 0.17416974169741697,
"grad_norm": 1.4399679899215698,
"learning_rate": 0.0006658594931057739,
"loss": 5.011406421661377,
"step": 236
},
{
"epoch": 0.17490774907749077,
"grad_norm": 1.268681287765503,
"learning_rate": 0.0006654956124547241,
"loss": 5.245846748352051,
"step": 237
},
{
"epoch": 0.17564575645756458,
"grad_norm": 1.001254677772522,
"learning_rate": 0.0006651299034010487,
"loss": 5.424437522888184,
"step": 238
},
{
"epoch": 0.17638376383763837,
"grad_norm": 1.2181994915008545,
"learning_rate": 0.0006647623680641542,
"loss": 5.456673622131348,
"step": 239
},
{
"epoch": 0.17712177121771217,
"grad_norm": 1.1333999633789062,
"learning_rate": 0.0006643930085740306,
"loss": 5.315772533416748,
"step": 240
},
{
"epoch": 0.17785977859778598,
"grad_norm": 1.0992910861968994,
"learning_rate": 0.0006640218270712397,
"loss": 5.436305999755859,
"step": 241
},
{
"epoch": 0.17859778597785977,
"grad_norm": 1.0746663808822632,
"learning_rate": 0.0006636488257069027,
"loss": 5.308970928192139,
"step": 242
},
{
"epoch": 0.1793357933579336,
"grad_norm": 1.0561422109603882,
"learning_rate": 0.0006632740066426873,
"loss": 5.426042079925537,
"step": 243
},
{
"epoch": 0.18007380073800738,
"grad_norm": 1.031684160232544,
"learning_rate": 0.0006628973720507951,
"loss": 5.2547478675842285,
"step": 244
},
{
"epoch": 0.18081180811808117,
"grad_norm": 1.0969058275222778,
"learning_rate": 0.0006625189241139498,
"loss": 5.28012752532959,
"step": 245
},
{
"epoch": 0.181549815498155,
"grad_norm": 1.047112226486206,
"learning_rate": 0.0006621386650253838,
"loss": 5.20250129699707,
"step": 246
},
{
"epoch": 0.18228782287822878,
"grad_norm": 0.9869337677955627,
"learning_rate": 0.0006617565969888257,
"loss": 5.25740909576416,
"step": 247
},
{
"epoch": 0.18302583025830257,
"grad_norm": 1.0927937030792236,
"learning_rate": 0.0006613727222184874,
"loss": 5.5139288902282715,
"step": 248
},
{
"epoch": 0.1837638376383764,
"grad_norm": 1.2841873168945312,
"learning_rate": 0.000660987042939052,
"loss": 5.233647346496582,
"step": 249
},
{
"epoch": 0.18450184501845018,
"grad_norm": 0.9890136122703552,
"learning_rate": 0.0006605995613856595,
"loss": 5.420958518981934,
"step": 250
},
{
"epoch": 0.18523985239852397,
"grad_norm": 0.8926162719726562,
"learning_rate": 0.0006602102798038957,
"loss": 5.308608055114746,
"step": 251
},
{
"epoch": 0.1859778597785978,
"grad_norm": 1.0019422769546509,
"learning_rate": 0.0006598192004497771,
"loss": 5.302347660064697,
"step": 252
},
{
"epoch": 0.18671586715867158,
"grad_norm": 0.8486745953559875,
"learning_rate": 0.0006594263255897396,
"loss": 5.099376678466797,
"step": 253
},
{
"epoch": 0.18745387453874537,
"grad_norm": 1.0783238410949707,
"learning_rate": 0.0006590316575006244,
"loss": 5.218788146972656,
"step": 254
},
{
"epoch": 0.1881918819188192,
"grad_norm": 0.9183611869812012,
"learning_rate": 0.0006586351984696653,
"loss": 5.240777969360352,
"step": 255
},
{
"epoch": 0.18892988929889298,
"grad_norm": 0.9513900876045227,
"learning_rate": 0.0006582369507944747,
"loss": 5.222758769989014,
"step": 256
},
{
"epoch": 0.1896678966789668,
"grad_norm": 0.9337455630302429,
"learning_rate": 0.0006578369167830314,
"loss": 5.062905311584473,
"step": 257
},
{
"epoch": 0.1904059040590406,
"grad_norm": 1.158604383468628,
"learning_rate": 0.0006574350987536662,
"loss": 5.026293754577637,
"step": 258
},
{
"epoch": 0.19114391143911438,
"grad_norm": 1.0550696849822998,
"learning_rate": 0.000657031499035049,
"loss": 5.1148905754089355,
"step": 259
},
{
"epoch": 0.1918819188191882,
"grad_norm": 0.9606300592422485,
"learning_rate": 0.0006566261199661753,
"loss": 5.163092613220215,
"step": 260
},
{
"epoch": 0.192619926199262,
"grad_norm": 1.0590009689331055,
"learning_rate": 0.0006562189638963524,
"loss": 5.3179521560668945,
"step": 261
},
{
"epoch": 0.19335793357933578,
"grad_norm": 0.9940695762634277,
"learning_rate": 0.0006558100331851859,
"loss": 5.129310607910156,
"step": 262
},
{
"epoch": 0.1940959409594096,
"grad_norm": 1.0227980613708496,
"learning_rate": 0.0006553993302025659,
"loss": 5.162182807922363,
"step": 263
},
{
"epoch": 0.1948339483394834,
"grad_norm": 1.0441575050354004,
"learning_rate": 0.0006549868573286539,
"loss": 5.2034454345703125,
"step": 264
},
{
"epoch": 0.19557195571955718,
"grad_norm": 1.1191506385803223,
"learning_rate": 0.0006545726169538681,
"loss": 4.916297435760498,
"step": 265
},
{
"epoch": 0.196309963099631,
"grad_norm": 1.1132999658584595,
"learning_rate": 0.00065415661147887,
"loss": 5.2382707595825195,
"step": 266
},
{
"epoch": 0.1970479704797048,
"grad_norm": 1.352728247642517,
"learning_rate": 0.0006537388433145504,
"loss": 5.228781700134277,
"step": 267
},
{
"epoch": 0.1977859778597786,
"grad_norm": 1.0661629438400269,
"learning_rate": 0.0006533193148820159,
"loss": 5.341499328613281,
"step": 268
},
{
"epoch": 0.1985239852398524,
"grad_norm": 1.1771162748336792,
"learning_rate": 0.0006528980286125739,
"loss": 5.339306831359863,
"step": 269
},
{
"epoch": 0.1992619926199262,
"grad_norm": 0.9680821895599365,
"learning_rate": 0.0006524749869477192,
"loss": 5.367077827453613,
"step": 270
},
{
"epoch": 0.2,
"grad_norm": 1.0213592052459717,
"learning_rate": 0.00065205019233912,
"loss": 5.1391096115112305,
"step": 271
},
{
"epoch": 0.2007380073800738,
"grad_norm": 0.8894791603088379,
"learning_rate": 0.0006516236472486032,
"loss": 5.218973159790039,
"step": 272
},
{
"epoch": 0.2014760147601476,
"grad_norm": 1.1796555519104004,
"learning_rate": 0.00065119535414814,
"loss": 5.110937118530273,
"step": 273
},
{
"epoch": 0.2022140221402214,
"grad_norm": 0.9279013872146606,
"learning_rate": 0.0006507653155198322,
"loss": 5.301558494567871,
"step": 274
},
{
"epoch": 0.2029520295202952,
"grad_norm": 0.9340477585792542,
"learning_rate": 0.000650333533855898,
"loss": 5.283820152282715,
"step": 275
},
{
"epoch": 0.203690036900369,
"grad_norm": 1.0362911224365234,
"learning_rate": 0.0006499000116586562,
"loss": 4.982748031616211,
"step": 276
},
{
"epoch": 0.2044280442804428,
"grad_norm": 1.1206884384155273,
"learning_rate": 0.0006494647514405131,
"loss": 4.973568916320801,
"step": 277
},
{
"epoch": 0.2051660516605166,
"grad_norm": 1.0366051197052002,
"learning_rate": 0.0006490277557239472,
"loss": 5.242402076721191,
"step": 278
},
{
"epoch": 0.2059040590405904,
"grad_norm": 1.0412499904632568,
"learning_rate": 0.000648589027041495,
"loss": 5.113008499145508,
"step": 279
},
{
"epoch": 0.2066420664206642,
"grad_norm": 1.0954289436340332,
"learning_rate": 0.0006481485679357359,
"loss": 5.448449611663818,
"step": 280
},
{
"epoch": 0.207380073800738,
"grad_norm": 0.9032571911811829,
"learning_rate": 0.0006477063809592778,
"loss": 4.939189910888672,
"step": 281
},
{
"epoch": 0.20811808118081182,
"grad_norm": 0.890612006187439,
"learning_rate": 0.0006472624686747421,
"loss": 5.256400108337402,
"step": 282
},
{
"epoch": 0.2088560885608856,
"grad_norm": 0.9753661751747131,
"learning_rate": 0.000646816833654749,
"loss": 5.353178024291992,
"step": 283
},
{
"epoch": 0.2095940959409594,
"grad_norm": 0.8233433365821838,
"learning_rate": 0.0006463694784819029,
"loss": 5.223405838012695,
"step": 284
},
{
"epoch": 0.21033210332103322,
"grad_norm": 1.0614573955535889,
"learning_rate": 0.0006459204057487762,
"loss": 5.132536888122559,
"step": 285
},
{
"epoch": 0.211070110701107,
"grad_norm": 1.074107050895691,
"learning_rate": 0.0006454696180578957,
"loss": 5.2558369636535645,
"step": 286
},
{
"epoch": 0.2118081180811808,
"grad_norm": 1.0157700777053833,
"learning_rate": 0.0006450171180217273,
"loss": 4.989593505859375,
"step": 287
},
{
"epoch": 0.21254612546125462,
"grad_norm": 0.886896550655365,
"learning_rate": 0.0006445629082626595,
"loss": 5.041266441345215,
"step": 288
},
{
"epoch": 0.2132841328413284,
"grad_norm": 0.8866286873817444,
"learning_rate": 0.0006441069914129903,
"loss": 5.1668171882629395,
"step": 289
},
{
"epoch": 0.2140221402214022,
"grad_norm": 0.9136367440223694,
"learning_rate": 0.0006436493701149102,
"loss": 5.044548988342285,
"step": 290
},
{
"epoch": 0.21476014760147602,
"grad_norm": 1.0716575384140015,
"learning_rate": 0.0006431900470204876,
"loss": 4.962906837463379,
"step": 291
},
{
"epoch": 0.2154981549815498,
"grad_norm": 1.0485093593597412,
"learning_rate": 0.0006427290247916537,
"loss": 5.0265655517578125,
"step": 292
},
{
"epoch": 0.21623616236162363,
"grad_norm": 0.9726313352584839,
"learning_rate": 0.0006422663061001865,
"loss": 5.10546875,
"step": 293
},
{
"epoch": 0.21697416974169742,
"grad_norm": 0.8890307545661926,
"learning_rate": 0.0006418018936276956,
"loss": 4.885697841644287,
"step": 294
},
{
"epoch": 0.2177121771217712,
"grad_norm": 1.256881594657898,
"learning_rate": 0.0006413357900656066,
"loss": 5.05020809173584,
"step": 295
},
{
"epoch": 0.21845018450184503,
"grad_norm": 0.8236335515975952,
"learning_rate": 0.0006408679981151456,
"loss": 5.077518463134766,
"step": 296
},
{
"epoch": 0.21918819188191882,
"grad_norm": 1.0636200904846191,
"learning_rate": 0.0006403985204873235,
"loss": 5.087857246398926,
"step": 297
},
{
"epoch": 0.2199261992619926,
"grad_norm": 1.0351738929748535,
"learning_rate": 0.0006399273599029202,
"loss": 5.218321800231934,
"step": 298
},
{
"epoch": 0.22066420664206643,
"grad_norm": 1.1184179782867432,
"learning_rate": 0.000639454519092469,
"loss": 5.41963529586792,
"step": 299
},
{
"epoch": 0.22140221402214022,
"grad_norm": 1.005051851272583,
"learning_rate": 0.0006389800007962404,
"loss": 5.267976760864258,
"step": 300
},
{
"epoch": 0.222140221402214,
"grad_norm": 0.8542754054069519,
"learning_rate": 0.0006385038077642268,
"loss": 5.143088340759277,
"step": 301
},
{
"epoch": 0.22287822878228783,
"grad_norm": 1.0211315155029297,
"learning_rate": 0.0006380259427561262,
"loss": 5.287484169006348,
"step": 302
},
{
"epoch": 0.22361623616236162,
"grad_norm": 0.9097702503204346,
"learning_rate": 0.000637546408541326,
"loss": 5.212584972381592,
"step": 303
},
{
"epoch": 0.2243542435424354,
"grad_norm": 1.0342856645584106,
"learning_rate": 0.0006370652078988876,
"loss": 5.081629753112793,
"step": 304
},
{
"epoch": 0.22509225092250923,
"grad_norm": 1.046463131904602,
"learning_rate": 0.0006365823436175296,
"loss": 5.043882369995117,
"step": 305
},
{
"epoch": 0.22583025830258302,
"grad_norm": 0.9955232739448547,
"learning_rate": 0.0006360978184956121,
"loss": 5.135004997253418,
"step": 306
},
{
"epoch": 0.22656826568265684,
"grad_norm": 0.8027132153511047,
"learning_rate": 0.0006356116353411203,
"loss": 5.337245941162109,
"step": 307
},
{
"epoch": 0.22730627306273063,
"grad_norm": 1.2090736627578735,
"learning_rate": 0.0006351237969716482,
"loss": 5.095905780792236,
"step": 308
},
{
"epoch": 0.22804428044280442,
"grad_norm": 1.090334177017212,
"learning_rate": 0.0006346343062143824,
"loss": 5.060598373413086,
"step": 309
},
{
"epoch": 0.22878228782287824,
"grad_norm": 1.190928339958191,
"learning_rate": 0.0006341431659060856,
"loss": 5.230974197387695,
"step": 310
},
{
"epoch": 0.22952029520295203,
"grad_norm": 1.0362082719802856,
"learning_rate": 0.0006336503788930801,
"loss": 4.835149765014648,
"step": 311
},
{
"epoch": 0.23025830258302582,
"grad_norm": 1.1221998929977417,
"learning_rate": 0.0006331559480312316,
"loss": 5.359483242034912,
"step": 312
},
{
"epoch": 0.23099630996309964,
"grad_norm": 0.8904932737350464,
"learning_rate": 0.0006326598761859323,
"loss": 5.057035446166992,
"step": 313
},
{
"epoch": 0.23173431734317343,
"grad_norm": 0.9242574572563171,
"learning_rate": 0.0006321621662320847,
"loss": 5.011726379394531,
"step": 314
},
{
"epoch": 0.23247232472324722,
"grad_norm": 1.059004306793213,
"learning_rate": 0.0006316628210540842,
"loss": 4.693303108215332,
"step": 315
},
{
"epoch": 0.23321033210332104,
"grad_norm": 1.2370541095733643,
"learning_rate": 0.0006311618435458034,
"loss": 5.188898086547852,
"step": 316
},
{
"epoch": 0.23394833948339483,
"grad_norm": 1.0214468240737915,
"learning_rate": 0.0006306592366105744,
"loss": 5.003267288208008,
"step": 317
},
{
"epoch": 0.23468634686346865,
"grad_norm": 0.9486777782440186,
"learning_rate": 0.0006301550031611726,
"loss": 4.848117828369141,
"step": 318
},
{
"epoch": 0.23542435424354244,
"grad_norm": 0.9645982980728149,
"learning_rate": 0.0006296491461197996,
"loss": 5.02429723739624,
"step": 319
},
{
"epoch": 0.23616236162361623,
"grad_norm": 1.2168879508972168,
"learning_rate": 0.0006291416684180662,
"loss": 5.0632429122924805,
"step": 320
},
{
"epoch": 0.23690036900369005,
"grad_norm": 0.8020527362823486,
"learning_rate": 0.0006286325729969753,
"loss": 4.867977142333984,
"step": 321
},
{
"epoch": 0.23763837638376384,
"grad_norm": 0.900245726108551,
"learning_rate": 0.0006281218628069054,
"loss": 4.880187511444092,
"step": 322
},
{
"epoch": 0.23837638376383763,
"grad_norm": 0.9947592616081238,
"learning_rate": 0.0006276095408075927,
"loss": 5.083812236785889,
"step": 323
},
{
"epoch": 0.23911439114391145,
"grad_norm": 0.8965998888015747,
"learning_rate": 0.0006270956099681148,
"loss": 4.908682823181152,
"step": 324
},
{
"epoch": 0.23985239852398524,
"grad_norm": 1.1312414407730103,
"learning_rate": 0.0006265800732668727,
"loss": 5.093230247497559,
"step": 325
},
{
"epoch": 0.24059040590405903,
"grad_norm": 0.925629734992981,
"learning_rate": 0.0006260629336915741,
"loss": 4.874239444732666,
"step": 326
},
{
"epoch": 0.24132841328413285,
"grad_norm": 1.307646632194519,
"learning_rate": 0.0006255441942392159,
"loss": 5.057682514190674,
"step": 327
},
{
"epoch": 0.24206642066420664,
"grad_norm": 1.0218299627304077,
"learning_rate": 0.0006250238579160666,
"loss": 5.127986907958984,
"step": 328
},
{
"epoch": 0.24280442804428043,
"grad_norm": 0.8645507097244263,
"learning_rate": 0.0006245019277376496,
"loss": 5.13686466217041,
"step": 329
},
{
"epoch": 0.24354243542435425,
"grad_norm": 0.9296532273292542,
"learning_rate": 0.0006239784067287245,
"loss": 5.124481678009033,
"step": 330
},
{
"epoch": 0.24428044280442804,
"grad_norm": 0.9728212952613831,
"learning_rate": 0.0006234532979232711,
"loss": 5.0022687911987305,
"step": 331
},
{
"epoch": 0.24501845018450186,
"grad_norm": 0.8225215077400208,
"learning_rate": 0.0006229266043644702,
"loss": 4.9633378982543945,
"step": 332
},
{
"epoch": 0.24575645756457565,
"grad_norm": 0.960574209690094,
"learning_rate": 0.0006223983291046875,
"loss": 4.844850540161133,
"step": 333
},
{
"epoch": 0.24649446494464944,
"grad_norm": 0.9003048539161682,
"learning_rate": 0.0006218684752054549,
"loss": 5.180695056915283,
"step": 334
},
{
"epoch": 0.24723247232472326,
"grad_norm": 0.9519006013870239,
"learning_rate": 0.0006213370457374527,
"loss": 4.989326477050781,
"step": 335
},
{
"epoch": 0.24797047970479705,
"grad_norm": 0.9743554592132568,
"learning_rate": 0.0006208040437804927,
"loss": 4.731540679931641,
"step": 336
},
{
"epoch": 0.24870848708487084,
"grad_norm": 0.9855546951293945,
"learning_rate": 0.0006202694724234994,
"loss": 5.105901718139648,
"step": 337
},
{
"epoch": 0.24944649446494466,
"grad_norm": 1.8261312246322632,
"learning_rate": 0.0006197333347644928,
"loss": 5.079566478729248,
"step": 338
},
{
"epoch": 0.25018450184501845,
"grad_norm": 1.058996558189392,
"learning_rate": 0.0006191956339105701,
"loss": 4.985716819763184,
"step": 339
},
{
"epoch": 0.25092250922509224,
"grad_norm": 1.018185019493103,
"learning_rate": 0.0006186563729778875,
"loss": 4.921426296234131,
"step": 340
},
{
"epoch": 0.25166051660516603,
"grad_norm": 1.154246211051941,
"learning_rate": 0.0006181155550916423,
"loss": 5.044010162353516,
"step": 341
},
{
"epoch": 0.2523985239852399,
"grad_norm": 0.9054587483406067,
"learning_rate": 0.0006175731833860554,
"loss": 4.953484535217285,
"step": 342
},
{
"epoch": 0.25313653136531367,
"grad_norm": 0.8154107928276062,
"learning_rate": 0.0006170292610043523,
"loss": 5.044363975524902,
"step": 343
},
{
"epoch": 0.25387453874538746,
"grad_norm": 1.3822500705718994,
"learning_rate": 0.0006164837910987449,
"loss": 5.227883338928223,
"step": 344
},
{
"epoch": 0.25461254612546125,
"grad_norm": 0.9022698402404785,
"learning_rate": 0.000615936776830414,
"loss": 4.860300064086914,
"step": 345
},
{
"epoch": 0.25535055350553504,
"grad_norm": 1.0594429969787598,
"learning_rate": 0.0006153882213694903,
"loss": 5.256074905395508,
"step": 346
},
{
"epoch": 0.25608856088560883,
"grad_norm": 0.9493646025657654,
"learning_rate": 0.0006148381278950362,
"loss": 4.957509994506836,
"step": 347
},
{
"epoch": 0.2568265682656827,
"grad_norm": 1.0270938873291016,
"learning_rate": 0.0006142864995950273,
"loss": 4.809982776641846,
"step": 348
},
{
"epoch": 0.25756457564575647,
"grad_norm": 1.663167953491211,
"learning_rate": 0.0006137333396663342,
"loss": 4.888598918914795,
"step": 349
},
{
"epoch": 0.25830258302583026,
"grad_norm": 0.983447253704071,
"learning_rate": 0.0006131786513147038,
"loss": 5.165590763092041,
"step": 350
},
{
"epoch": 0.25904059040590405,
"grad_norm": 0.980798065662384,
"learning_rate": 0.0006126224377547408,
"loss": 4.966999053955078,
"step": 351
},
{
"epoch": 0.25977859778597784,
"grad_norm": 1.074250340461731,
"learning_rate": 0.0006120647022098887,
"loss": 4.936653137207031,
"step": 352
},
{
"epoch": 0.2605166051660517,
"grad_norm": 0.9961046576499939,
"learning_rate": 0.0006115054479124115,
"loss": 5.0761308670043945,
"step": 353
},
{
"epoch": 0.2612546125461255,
"grad_norm": 0.937942385673523,
"learning_rate": 0.0006109446781033752,
"loss": 4.909850597381592,
"step": 354
},
{
"epoch": 0.26199261992619927,
"grad_norm": 1.0551375150680542,
"learning_rate": 0.0006103823960326283,
"loss": 4.967006683349609,
"step": 355
},
{
"epoch": 0.26273062730627306,
"grad_norm": 1.0866034030914307,
"learning_rate": 0.0006098186049587834,
"loss": 5.049051284790039,
"step": 356
},
{
"epoch": 0.26346863468634685,
"grad_norm": 1.0815985202789307,
"learning_rate": 0.0006092533081491987,
"loss": 4.931700229644775,
"step": 357
},
{
"epoch": 0.26420664206642064,
"grad_norm": 1.0863465070724487,
"learning_rate": 0.000608686508879958,
"loss": 5.032581329345703,
"step": 358
},
{
"epoch": 0.2649446494464945,
"grad_norm": 0.871529757976532,
"learning_rate": 0.000608118210435853,
"loss": 5.066904544830322,
"step": 359
},
{
"epoch": 0.2656826568265683,
"grad_norm": 0.9786545038223267,
"learning_rate": 0.0006075484161103631,
"loss": 5.073785305023193,
"step": 360
},
{
"epoch": 0.26642066420664207,
"grad_norm": 0.8924750089645386,
"learning_rate": 0.000606977129205637,
"loss": 4.997740745544434,
"step": 361
},
{
"epoch": 0.26715867158671586,
"grad_norm": 1.49006986618042,
"learning_rate": 0.0006064043530324738,
"loss": 5.006748676300049,
"step": 362
},
{
"epoch": 0.26789667896678965,
"grad_norm": 1.0208152532577515,
"learning_rate": 0.0006058300909103026,
"loss": 5.057985305786133,
"step": 363
},
{
"epoch": 0.2686346863468635,
"grad_norm": 0.8836379051208496,
"learning_rate": 0.000605254346167165,
"loss": 4.999290466308594,
"step": 364
},
{
"epoch": 0.2693726937269373,
"grad_norm": 0.8716019988059998,
"learning_rate": 0.0006046771221396938,
"loss": 5.058474540710449,
"step": 365
},
{
"epoch": 0.2701107011070111,
"grad_norm": 1.1286225318908691,
"learning_rate": 0.0006040984221730958,
"loss": 4.990628719329834,
"step": 366
},
{
"epoch": 0.27084870848708487,
"grad_norm": 1.099913477897644,
"learning_rate": 0.0006035182496211308,
"loss": 4.981925010681152,
"step": 367
},
{
"epoch": 0.27158671586715866,
"grad_norm": 1.0594391822814941,
"learning_rate": 0.0006029366078460929,
"loss": 4.859918594360352,
"step": 368
},
{
"epoch": 0.27232472324723245,
"grad_norm": 0.8651653528213501,
"learning_rate": 0.0006023535002187907,
"loss": 4.930809020996094,
"step": 369
},
{
"epoch": 0.2730627306273063,
"grad_norm": 0.9700394868850708,
"learning_rate": 0.0006017689301185279,
"loss": 4.7630720138549805,
"step": 370
},
{
"epoch": 0.2738007380073801,
"grad_norm": 0.9684885740280151,
"learning_rate": 0.000601182900933084,
"loss": 4.800506114959717,
"step": 371
},
{
"epoch": 0.2745387453874539,
"grad_norm": 1.2140804529190063,
"learning_rate": 0.0006005954160586941,
"loss": 5.034149646759033,
"step": 372
},
{
"epoch": 0.27527675276752767,
"grad_norm": 1.0811138153076172,
"learning_rate": 0.0006000064789000295,
"loss": 4.837162494659424,
"step": 373
},
{
"epoch": 0.27601476014760146,
"grad_norm": 1.328092098236084,
"learning_rate": 0.0005994160928701782,
"loss": 5.215338706970215,
"step": 374
},
{
"epoch": 0.2767527675276753,
"grad_norm": 0.9813052415847778,
"learning_rate": 0.0005988242613906248,
"loss": 5.164502143859863,
"step": 375
},
{
"epoch": 0.2774907749077491,
"grad_norm": 1.1087919473648071,
"learning_rate": 0.0005982309878912306,
"loss": 5.113296031951904,
"step": 376
},
{
"epoch": 0.2782287822878229,
"grad_norm": 1.0566635131835938,
"learning_rate": 0.000597636275810214,
"loss": 4.566821098327637,
"step": 377
},
{
"epoch": 0.2789667896678967,
"grad_norm": 1.1309762001037598,
"learning_rate": 0.0005970401285941305,
"loss": 5.184887886047363,
"step": 378
},
{
"epoch": 0.27970479704797047,
"grad_norm": 1.3037056922912598,
"learning_rate": 0.0005964425496978528,
"loss": 4.654736042022705,
"step": 379
},
{
"epoch": 0.28044280442804426,
"grad_norm": 1.0882046222686768,
"learning_rate": 0.0005958435425845504,
"loss": 4.828828811645508,
"step": 380
},
{
"epoch": 0.2811808118081181,
"grad_norm": 0.9877819418907166,
"learning_rate": 0.0005952431107256698,
"loss": 4.909351348876953,
"step": 381
},
{
"epoch": 0.2819188191881919,
"grad_norm": 1.0387706756591797,
"learning_rate": 0.0005946412576009148,
"loss": 4.700501441955566,
"step": 382
},
{
"epoch": 0.2826568265682657,
"grad_norm": 0.9511588215827942,
"learning_rate": 0.0005940379866982255,
"loss": 4.84822940826416,
"step": 383
},
{
"epoch": 0.2833948339483395,
"grad_norm": 1.4258911609649658,
"learning_rate": 0.0005934333015137585,
"loss": 4.82274055480957,
"step": 384
},
{
"epoch": 0.28413284132841327,
"grad_norm": 0.9327899217605591,
"learning_rate": 0.0005928272055518667,
"loss": 4.844176292419434,
"step": 385
},
{
"epoch": 0.2848708487084871,
"grad_norm": 0.9245155453681946,
"learning_rate": 0.0005922197023250793,
"loss": 5.153466701507568,
"step": 386
},
{
"epoch": 0.2856088560885609,
"grad_norm": 1.0576754808425903,
"learning_rate": 0.0005916107953540805,
"loss": 4.96760368347168,
"step": 387
},
{
"epoch": 0.2863468634686347,
"grad_norm": 0.8730959892272949,
"learning_rate": 0.0005910004881676898,
"loss": 4.808976650238037,
"step": 388
},
{
"epoch": 0.2870848708487085,
"grad_norm": 0.8937351107597351,
"learning_rate": 0.0005903887843028418,
"loss": 4.953003883361816,
"step": 389
},
{
"epoch": 0.2878228782287823,
"grad_norm": 0.9199606776237488,
"learning_rate": 0.0005897756873045648,
"loss": 5.063399314880371,
"step": 390
},
{
"epoch": 0.28856088560885607,
"grad_norm": 0.9909579753875732,
"learning_rate": 0.0005891612007259613,
"loss": 4.7940473556518555,
"step": 391
},
{
"epoch": 0.2892988929889299,
"grad_norm": 0.9309024214744568,
"learning_rate": 0.0005885453281281863,
"loss": 4.881161689758301,
"step": 392
},
{
"epoch": 0.2900369003690037,
"grad_norm": 1.1199829578399658,
"learning_rate": 0.0005879280730804277,
"loss": 5.138465404510498,
"step": 393
},
{
"epoch": 0.2907749077490775,
"grad_norm": 0.9535595178604126,
"learning_rate": 0.000587309439159885,
"loss": 4.985682487487793,
"step": 394
},
{
"epoch": 0.2915129151291513,
"grad_norm": 0.9754979014396667,
"learning_rate": 0.0005866894299517488,
"loss": 4.827736854553223,
"step": 395
},
{
"epoch": 0.2922509225092251,
"grad_norm": 0.9567784667015076,
"learning_rate": 0.0005860680490491798,
"loss": 4.916905879974365,
"step": 396
},
{
"epoch": 0.29298892988929887,
"grad_norm": 0.9050018191337585,
"learning_rate": 0.0005854453000532884,
"loss": 5.034615993499756,
"step": 397
},
{
"epoch": 0.2937269372693727,
"grad_norm": 0.8965482115745544,
"learning_rate": 0.0005848211865731131,
"loss": 4.918941497802734,
"step": 398
},
{
"epoch": 0.2944649446494465,
"grad_norm": 0.9906476140022278,
"learning_rate": 0.0005841957122256004,
"loss": 4.973904609680176,
"step": 399
},
{
"epoch": 0.2952029520295203,
"grad_norm": 0.9818461537361145,
"learning_rate": 0.0005835688806355835,
"loss": 4.993786811828613,
"step": 400
},
{
"epoch": 0.2959409594095941,
"grad_norm": 0.99074786901474,
"learning_rate": 0.0005829406954357611,
"loss": 5.0351457595825195,
"step": 401
},
{
"epoch": 0.2966789667896679,
"grad_norm": 0.9858592748641968,
"learning_rate": 0.0005823111602666765,
"loss": 4.854518413543701,
"step": 402
},
{
"epoch": 0.2974169741697417,
"grad_norm": 1.0143122673034668,
"learning_rate": 0.0005816802787766969,
"loss": 4.9962921142578125,
"step": 403
},
{
"epoch": 0.2981549815498155,
"grad_norm": 0.8965126276016235,
"learning_rate": 0.0005810480546219914,
"loss": 4.845615386962891,
"step": 404
},
{
"epoch": 0.2988929889298893,
"grad_norm": 0.9247124791145325,
"learning_rate": 0.0005804144914665105,
"loss": 4.576415061950684,
"step": 405
},
{
"epoch": 0.2996309963099631,
"grad_norm": 1.0036989450454712,
"learning_rate": 0.0005797795929819646,
"loss": 4.833454132080078,
"step": 406
},
{
"epoch": 0.3003690036900369,
"grad_norm": 1.1179319620132446,
"learning_rate": 0.0005791433628478031,
"loss": 5.014064311981201,
"step": 407
},
{
"epoch": 0.3011070110701107,
"grad_norm": 1.1040972471237183,
"learning_rate": 0.0005785058047511922,
"loss": 4.786684513092041,
"step": 408
},
{
"epoch": 0.3018450184501845,
"grad_norm": 0.9538096785545349,
"learning_rate": 0.0005778669223869945,
"loss": 4.815490245819092,
"step": 409
},
{
"epoch": 0.3025830258302583,
"grad_norm": 1.1620954275131226,
"learning_rate": 0.0005772267194577469,
"loss": 4.706133842468262,
"step": 410
},
{
"epoch": 0.3033210332103321,
"grad_norm": 1.137211799621582,
"learning_rate": 0.0005765851996736397,
"loss": 4.959315299987793,
"step": 411
},
{
"epoch": 0.3040590405904059,
"grad_norm": 0.9818885922431946,
"learning_rate": 0.0005759423667524947,
"loss": 4.72605037689209,
"step": 412
},
{
"epoch": 0.3047970479704797,
"grad_norm": 0.9897336959838867,
"learning_rate": 0.0005752982244197436,
"loss": 4.857034683227539,
"step": 413
},
{
"epoch": 0.30553505535055353,
"grad_norm": 0.9276419281959534,
"learning_rate": 0.0005746527764084068,
"loss": 4.825818061828613,
"step": 414
},
{
"epoch": 0.3062730627306273,
"grad_norm": 0.9956037998199463,
"learning_rate": 0.0005740060264590714,
"loss": 4.663302421569824,
"step": 415
},
{
"epoch": 0.3070110701107011,
"grad_norm": 0.9424338340759277,
"learning_rate": 0.00057335797831987,
"loss": 4.876203536987305,
"step": 416
},
{
"epoch": 0.3077490774907749,
"grad_norm": 0.9253562092781067,
"learning_rate": 0.000572708635746458,
"loss": 4.914989471435547,
"step": 417
},
{
"epoch": 0.3084870848708487,
"grad_norm": 1.0256803035736084,
"learning_rate": 0.000572058002501993,
"loss": 4.6925859451293945,
"step": 418
},
{
"epoch": 0.3092250922509225,
"grad_norm": 0.9552437663078308,
"learning_rate": 0.0005714060823571126,
"loss": 4.923905372619629,
"step": 419
},
{
"epoch": 0.30996309963099633,
"grad_norm": 0.8474723696708679,
"learning_rate": 0.0005707528790899117,
"loss": 4.9794769287109375,
"step": 420
},
{
"epoch": 0.3107011070110701,
"grad_norm": 0.9562621712684631,
"learning_rate": 0.0005700983964859219,
"loss": 4.790196418762207,
"step": 421
},
{
"epoch": 0.3114391143911439,
"grad_norm": 0.9205673336982727,
"learning_rate": 0.000569442638338089,
"loss": 4.9875407218933105,
"step": 422
},
{
"epoch": 0.3121771217712177,
"grad_norm": 0.9803183674812317,
"learning_rate": 0.0005687856084467509,
"loss": 4.777838230133057,
"step": 423
},
{
"epoch": 0.3129151291512915,
"grad_norm": 1.1361783742904663,
"learning_rate": 0.0005681273106196154,
"loss": 4.891695976257324,
"step": 424
},
{
"epoch": 0.31365313653136534,
"grad_norm": 0.937116265296936,
"learning_rate": 0.0005674677486717386,
"loss": 4.8178324699401855,
"step": 425
},
{
"epoch": 0.31439114391143913,
"grad_norm": 0.7977975606918335,
"learning_rate": 0.000566806926425503,
"loss": 4.76682710647583,
"step": 426
},
{
"epoch": 0.3151291512915129,
"grad_norm": 1.0328474044799805,
"learning_rate": 0.0005661448477105944,
"loss": 4.845677852630615,
"step": 427
},
{
"epoch": 0.3158671586715867,
"grad_norm": 0.977131187915802,
"learning_rate": 0.0005654815163639804,
"loss": 4.799696922302246,
"step": 428
},
{
"epoch": 0.3166051660516605,
"grad_norm": 1.0650629997253418,
"learning_rate": 0.0005648169362298881,
"loss": 4.999658584594727,
"step": 429
},
{
"epoch": 0.3173431734317343,
"grad_norm": 1.0764038562774658,
"learning_rate": 0.0005641511111597818,
"loss": 4.789190292358398,
"step": 430
},
{
"epoch": 0.31808118081180814,
"grad_norm": 0.8251600861549377,
"learning_rate": 0.0005634840450123405,
"loss": 4.80035400390625,
"step": 431
},
{
"epoch": 0.31881918819188193,
"grad_norm": 0.9509308338165283,
"learning_rate": 0.0005628157416534356,
"loss": 4.975335597991943,
"step": 432
},
{
"epoch": 0.3195571955719557,
"grad_norm": 0.9815534353256226,
"learning_rate": 0.000562146204956109,
"loss": 4.860112190246582,
"step": 433
},
{
"epoch": 0.3202952029520295,
"grad_norm": 1.0006123781204224,
"learning_rate": 0.0005614754388005494,
"loss": 4.970834732055664,
"step": 434
},
{
"epoch": 0.3210332103321033,
"grad_norm": 0.9528962969779968,
"learning_rate": 0.0005608034470740712,
"loss": 4.864804267883301,
"step": 435
},
{
"epoch": 0.32177121771217715,
"grad_norm": 0.9165347218513489,
"learning_rate": 0.0005601302336710914,
"loss": 4.844846725463867,
"step": 436
},
{
"epoch": 0.32250922509225094,
"grad_norm": 0.8079850673675537,
"learning_rate": 0.0005594558024931068,
"loss": 4.501960754394531,
"step": 437
},
{
"epoch": 0.32324723247232473,
"grad_norm": 1.0351104736328125,
"learning_rate": 0.000558780157448672,
"loss": 4.843764305114746,
"step": 438
},
{
"epoch": 0.3239852398523985,
"grad_norm": 1.180903673171997,
"learning_rate": 0.0005581033024533757,
"loss": 4.818984508514404,
"step": 439
},
{
"epoch": 0.3247232472324723,
"grad_norm": 1.0522313117980957,
"learning_rate": 0.0005574252414298192,
"loss": 4.750001907348633,
"step": 440
},
{
"epoch": 0.3254612546125461,
"grad_norm": 1.0479143857955933,
"learning_rate": 0.0005567459783075928,
"loss": 4.75580358505249,
"step": 441
},
{
"epoch": 0.32619926199261995,
"grad_norm": 0.9943499565124512,
"learning_rate": 0.000556065517023254,
"loss": 4.778286933898926,
"step": 442
},
{
"epoch": 0.32693726937269374,
"grad_norm": 1.0374329090118408,
"learning_rate": 0.0005553838615203031,
"loss": 4.718173027038574,
"step": 443
},
{
"epoch": 0.32767527675276753,
"grad_norm": 0.8165338635444641,
"learning_rate": 0.0005547010157491621,
"loss": 4.73118257522583,
"step": 444
},
{
"epoch": 0.3284132841328413,
"grad_norm": 0.840587854385376,
"learning_rate": 0.0005540169836671505,
"loss": 4.625949859619141,
"step": 445
},
{
"epoch": 0.3291512915129151,
"grad_norm": 0.9079518914222717,
"learning_rate": 0.0005533317692384632,
"loss": 4.873010158538818,
"step": 446
},
{
"epoch": 0.3298892988929889,
"grad_norm": 0.9068413376808167,
"learning_rate": 0.000552645376434147,
"loss": 4.96326208114624,
"step": 447
},
{
"epoch": 0.33062730627306275,
"grad_norm": 0.91420578956604,
"learning_rate": 0.0005519578092320779,
"loss": 4.897895336151123,
"step": 448
},
{
"epoch": 0.33136531365313654,
"grad_norm": 0.986501932144165,
"learning_rate": 0.0005512690716169378,
"loss": 5.1402740478515625,
"step": 449
},
{
"epoch": 0.33210332103321033,
"grad_norm": 0.9404821395874023,
"learning_rate": 0.0005505791675801916,
"loss": 4.783200740814209,
"step": 450
},
{
"epoch": 0.3328413284132841,
"grad_norm": 0.9507829546928406,
"learning_rate": 0.0005498881011200641,
"loss": 4.688559532165527,
"step": 451
},
{
"epoch": 0.3335793357933579,
"grad_norm": 0.9329268932342529,
"learning_rate": 0.0005491958762415166,
"loss": 4.877443790435791,
"step": 452
},
{
"epoch": 0.33431734317343176,
"grad_norm": 1.0837727785110474,
"learning_rate": 0.0005485024969562237,
"loss": 4.7941789627075195,
"step": 453
},
{
"epoch": 0.33505535055350555,
"grad_norm": 1.0305438041687012,
"learning_rate": 0.0005478079672825504,
"loss": 4.639592170715332,
"step": 454
},
{
"epoch": 0.33579335793357934,
"grad_norm": 0.8832004070281982,
"learning_rate": 0.0005471122912455287,
"loss": 4.873642444610596,
"step": 455
},
{
"epoch": 0.33653136531365313,
"grad_norm": 0.9681616425514221,
"learning_rate": 0.0005464154728768339,
"loss": 4.844632148742676,
"step": 456
},
{
"epoch": 0.3372693726937269,
"grad_norm": 0.9066919088363647,
"learning_rate": 0.0005457175162147614,
"loss": 4.705622673034668,
"step": 457
},
{
"epoch": 0.3380073800738007,
"grad_norm": 0.8449764251708984,
"learning_rate": 0.0005450184253042037,
"loss": 4.834818363189697,
"step": 458
},
{
"epoch": 0.33874538745387456,
"grad_norm": 1.009404182434082,
"learning_rate": 0.0005443182041966266,
"loss": 4.893503665924072,
"step": 459
},
{
"epoch": 0.33948339483394835,
"grad_norm": 0.9231082201004028,
"learning_rate": 0.0005436168569500456,
"loss": 4.946817398071289,
"step": 460
},
{
"epoch": 0.34022140221402214,
"grad_norm": 0.9415441155433655,
"learning_rate": 0.0005429143876290025,
"loss": 4.941875457763672,
"step": 461
},
{
"epoch": 0.34095940959409593,
"grad_norm": 0.8538420796394348,
"learning_rate": 0.0005422108003045423,
"loss": 4.770623207092285,
"step": 462
},
{
"epoch": 0.3416974169741697,
"grad_norm": 1.1796035766601562,
"learning_rate": 0.0005415060990541887,
"loss": 5.057588577270508,
"step": 463
},
{
"epoch": 0.34243542435424357,
"grad_norm": 0.9784668684005737,
"learning_rate": 0.0005408002879619213,
"loss": 4.873748779296875,
"step": 464
},
{
"epoch": 0.34317343173431736,
"grad_norm": 0.7987930774688721,
"learning_rate": 0.0005400933711181515,
"loss": 4.990841865539551,
"step": 465
},
{
"epoch": 0.34391143911439115,
"grad_norm": 0.9904403686523438,
"learning_rate": 0.0005393853526196988,
"loss": 4.766284942626953,
"step": 466
},
{
"epoch": 0.34464944649446494,
"grad_norm": 1.3431742191314697,
"learning_rate": 0.0005386762365697678,
"loss": 4.805080413818359,
"step": 467
},
{
"epoch": 0.34538745387453873,
"grad_norm": 0.9435102343559265,
"learning_rate": 0.0005379660270779224,
"loss": 4.853346824645996,
"step": 468
},
{
"epoch": 0.3461254612546125,
"grad_norm": 0.9755591154098511,
"learning_rate": 0.0005372547282600649,
"loss": 4.719388008117676,
"step": 469
},
{
"epoch": 0.34686346863468637,
"grad_norm": 0.8468083739280701,
"learning_rate": 0.0005365423442384097,
"loss": 4.8452301025390625,
"step": 470
},
{
"epoch": 0.34760147601476016,
"grad_norm": 0.8244897723197937,
"learning_rate": 0.0005358288791414604,
"loss": 4.7897844314575195,
"step": 471
},
{
"epoch": 0.34833948339483395,
"grad_norm": 1.2169724702835083,
"learning_rate": 0.0005351143371039861,
"loss": 4.922556400299072,
"step": 472
},
{
"epoch": 0.34907749077490774,
"grad_norm": 0.7928814888000488,
"learning_rate": 0.0005343987222669969,
"loss": 4.509468078613281,
"step": 473
},
{
"epoch": 0.34981549815498153,
"grad_norm": 0.8514037132263184,
"learning_rate": 0.0005336820387777202,
"loss": 4.7827959060668945,
"step": 474
},
{
"epoch": 0.3505535055350554,
"grad_norm": 1.0094245672225952,
"learning_rate": 0.0005329642907895766,
"loss": 4.922459602355957,
"step": 475
},
{
"epoch": 0.35129151291512917,
"grad_norm": 0.8496381044387817,
"learning_rate": 0.0005322454824621558,
"loss": 4.833901405334473,
"step": 476
},
{
"epoch": 0.35202952029520296,
"grad_norm": 0.9164729714393616,
"learning_rate": 0.0005315256179611926,
"loss": 4.579873085021973,
"step": 477
},
{
"epoch": 0.35276752767527675,
"grad_norm": 0.9636603593826294,
"learning_rate": 0.0005308047014585427,
"loss": 4.682124614715576,
"step": 478
},
{
"epoch": 0.35350553505535054,
"grad_norm": 0.8201807141304016,
"learning_rate": 0.000530082737132158,
"loss": 4.792829513549805,
"step": 479
},
{
"epoch": 0.35424354243542433,
"grad_norm": 1.1766455173492432,
"learning_rate": 0.0005293597291660638,
"loss": 4.957970142364502,
"step": 480
},
{
"epoch": 0.3549815498154982,
"grad_norm": 1.2056677341461182,
"learning_rate": 0.0005286356817503329,
"loss": 4.584798812866211,
"step": 481
},
{
"epoch": 0.35571955719557197,
"grad_norm": 0.9210802316665649,
"learning_rate": 0.0005279105990810624,
"loss": 4.629232406616211,
"step": 482
},
{
"epoch": 0.35645756457564576,
"grad_norm": 0.9124312400817871,
"learning_rate": 0.0005271844853603489,
"loss": 4.6753435134887695,
"step": 483
},
{
"epoch": 0.35719557195571955,
"grad_norm": 0.9933983087539673,
"learning_rate": 0.0005264573447962644,
"loss": 4.6301984786987305,
"step": 484
},
{
"epoch": 0.35793357933579334,
"grad_norm": 0.93276047706604,
"learning_rate": 0.0005257291816028317,
"loss": 4.541720390319824,
"step": 485
},
{
"epoch": 0.3586715867158672,
"grad_norm": 1.028709053993225,
"learning_rate": 0.000525,
"loss": 4.660837650299072,
"step": 486
},
{
"epoch": 0.359409594095941,
"grad_norm": 0.935407817363739,
"learning_rate": 0.0005242698042136208,
"loss": 4.810506820678711,
"step": 487
},
{
"epoch": 0.36014760147601477,
"grad_norm": 0.9050341844558716,
"learning_rate": 0.000523538598475423,
"loss": 4.896993637084961,
"step": 488
},
{
"epoch": 0.36088560885608856,
"grad_norm": 0.9780540466308594,
"learning_rate": 0.0005228063870229883,
"loss": 4.808036804199219,
"step": 489
},
{
"epoch": 0.36162361623616235,
"grad_norm": 1.107608675956726,
"learning_rate": 0.0005220731740997273,
"loss": 4.784989833831787,
"step": 490
},
{
"epoch": 0.36236162361623614,
"grad_norm": 1.0185608863830566,
"learning_rate": 0.0005213389639548539,
"loss": 4.635310173034668,
"step": 491
},
{
"epoch": 0.36309963099631,
"grad_norm": 0.9982399940490723,
"learning_rate": 0.0005206037608433617,
"loss": 4.810551643371582,
"step": 492
},
{
"epoch": 0.3638376383763838,
"grad_norm": 0.7861829400062561,
"learning_rate": 0.0005198675690259988,
"loss": 4.704036712646484,
"step": 493
},
{
"epoch": 0.36457564575645757,
"grad_norm": 0.935389518737793,
"learning_rate": 0.0005191303927692428,
"loss": 5.006328582763672,
"step": 494
},
{
"epoch": 0.36531365313653136,
"grad_norm": 0.9794636368751526,
"learning_rate": 0.0005183922363452768,
"loss": 4.736790180206299,
"step": 495
},
{
"epoch": 0.36605166051660515,
"grad_norm": 0.915691614151001,
"learning_rate": 0.0005176531040319643,
"loss": 4.851039409637451,
"step": 496
},
{
"epoch": 0.36678966789667894,
"grad_norm": 1.203650712966919,
"learning_rate": 0.0005169130001128246,
"loss": 4.811964988708496,
"step": 497
},
{
"epoch": 0.3675276752767528,
"grad_norm": 1.001997947692871,
"learning_rate": 0.000516171928877007,
"loss": 4.62536096572876,
"step": 498
},
{
"epoch": 0.3682656826568266,
"grad_norm": 0.9129559993743896,
"learning_rate": 0.0005154298946192679,
"loss": 4.895463466644287,
"step": 499
},
{
"epoch": 0.36900369003690037,
"grad_norm": 0.9465571045875549,
"learning_rate": 0.0005146869016399432,
"loss": 4.418019771575928,
"step": 500
},
{
"epoch": 0.36974169741697416,
"grad_norm": 1.1695398092269897,
"learning_rate": 0.0005139429542449265,
"loss": 4.949154376983643,
"step": 501
},
{
"epoch": 0.37047970479704795,
"grad_norm": 0.9523534774780273,
"learning_rate": 0.0005131980567456417,
"loss": 4.633477687835693,
"step": 502
},
{
"epoch": 0.3712177121771218,
"grad_norm": 0.9152795076370239,
"learning_rate": 0.0005124522134590188,
"loss": 4.966670989990234,
"step": 503
},
{
"epoch": 0.3719557195571956,
"grad_norm": 1.2700462341308594,
"learning_rate": 0.0005117054287074694,
"loss": 4.756679534912109,
"step": 504
},
{
"epoch": 0.3726937269372694,
"grad_norm": 1.0250760316848755,
"learning_rate": 0.0005109577068188609,
"loss": 5.008725166320801,
"step": 505
},
{
"epoch": 0.37343173431734317,
"grad_norm": 1.0058673620224,
"learning_rate": 0.0005102090521264917,
"loss": 4.794961452484131,
"step": 506
},
{
"epoch": 0.37416974169741696,
"grad_norm": 0.9521406292915344,
"learning_rate": 0.0005094594689690664,
"loss": 4.621726989746094,
"step": 507
},
{
"epoch": 0.37490774907749075,
"grad_norm": 1.1385680437088013,
"learning_rate": 0.0005087089616906701,
"loss": 4.789394378662109,
"step": 508
},
{
"epoch": 0.3756457564575646,
"grad_norm": 1.1471185684204102,
"learning_rate": 0.0005079575346407434,
"loss": 4.895359039306641,
"step": 509
},
{
"epoch": 0.3763837638376384,
"grad_norm": 0.9710941910743713,
"learning_rate": 0.0005072051921740577,
"loss": 4.706118583679199,
"step": 510
},
{
"epoch": 0.3771217712177122,
"grad_norm": 1.0084307193756104,
"learning_rate": 0.0005064519386506892,
"loss": 4.653249740600586,
"step": 511
},
{
"epoch": 0.37785977859778597,
"grad_norm": 0.8812188506126404,
"learning_rate": 0.000505697778435994,
"loss": 4.762184143066406,
"step": 512
},
{
"epoch": 0.37859778597785976,
"grad_norm": 0.902651846408844,
"learning_rate": 0.0005049427159005829,
"loss": 4.499927520751953,
"step": 513
},
{
"epoch": 0.3793357933579336,
"grad_norm": 0.9034081697463989,
"learning_rate": 0.000504186755420296,
"loss": 4.713489055633545,
"step": 514
},
{
"epoch": 0.3800738007380074,
"grad_norm": 0.9847108721733093,
"learning_rate": 0.000503429901376177,
"loss": 4.567604064941406,
"step": 515
},
{
"epoch": 0.3808118081180812,
"grad_norm": 1.009543776512146,
"learning_rate": 0.0005026721581544485,
"loss": 4.737997055053711,
"step": 516
},
{
"epoch": 0.381549815498155,
"grad_norm": 0.8869209289550781,
"learning_rate": 0.0005019135301464861,
"loss": 4.873485565185547,
"step": 517
},
{
"epoch": 0.38228782287822877,
"grad_norm": 1.083253026008606,
"learning_rate": 0.0005011540217487924,
"loss": 4.698840618133545,
"step": 518
},
{
"epoch": 0.38302583025830256,
"grad_norm": 0.9394760131835938,
"learning_rate": 0.0005003936373629732,
"loss": 4.629981994628906,
"step": 519
},
{
"epoch": 0.3837638376383764,
"grad_norm": 0.9423143863677979,
"learning_rate": 0.00049963238139571,
"loss": 4.612939834594727,
"step": 520
},
{
"epoch": 0.3845018450184502,
"grad_norm": 0.9476136565208435,
"learning_rate": 0.000498870258258736,
"loss": 4.849869728088379,
"step": 521
},
{
"epoch": 0.385239852398524,
"grad_norm": 0.9509242177009583,
"learning_rate": 0.0004981072723688098,
"loss": 4.818325996398926,
"step": 522
},
{
"epoch": 0.3859778597785978,
"grad_norm": 0.834679901599884,
"learning_rate": 0.0004973434281476899,
"loss": 4.750872611999512,
"step": 523
},
{
"epoch": 0.38671586715867157,
"grad_norm": 0.799146831035614,
"learning_rate": 0.0004965787300221089,
"loss": 4.632112503051758,
"step": 524
},
{
"epoch": 0.3874538745387454,
"grad_norm": 0.8336266875267029,
"learning_rate": 0.0004958131824237484,
"loss": 4.630362510681152,
"step": 525
},
{
"epoch": 0.3881918819188192,
"grad_norm": 0.9787878394126892,
"learning_rate": 0.0004950467897892132,
"loss": 4.63228702545166,
"step": 526
},
{
"epoch": 0.388929889298893,
"grad_norm": 1.0535902976989746,
"learning_rate": 0.0004942795565600044,
"loss": 4.849504470825195,
"step": 527
},
{
"epoch": 0.3896678966789668,
"grad_norm": 0.953353226184845,
"learning_rate": 0.0004935114871824956,
"loss": 4.9335222244262695,
"step": 528
},
{
"epoch": 0.3904059040590406,
"grad_norm": 0.9515429735183716,
"learning_rate": 0.0004927425861079057,
"loss": 4.6670451164245605,
"step": 529
},
{
"epoch": 0.39114391143911437,
"grad_norm": 1.0168793201446533,
"learning_rate": 0.0004919728577922739,
"loss": 4.654256820678711,
"step": 530
},
{
"epoch": 0.3918819188191882,
"grad_norm": 0.9733904600143433,
"learning_rate": 0.000491202306696433,
"loss": 4.637208938598633,
"step": 531
},
{
"epoch": 0.392619926199262,
"grad_norm": 0.9687494039535522,
"learning_rate": 0.0004904309372859844,
"loss": 4.994683742523193,
"step": 532
},
{
"epoch": 0.3933579335793358,
"grad_norm": 1.066312313079834,
"learning_rate": 0.0004896587540312722,
"loss": 4.801863670349121,
"step": 533
},
{
"epoch": 0.3940959409594096,
"grad_norm": 0.9010938405990601,
"learning_rate": 0.0004888857614073565,
"loss": 4.627843856811523,
"step": 534
},
{
"epoch": 0.3948339483394834,
"grad_norm": 0.9718567728996277,
"learning_rate": 0.00048811196389398823,
"loss": 4.693809509277344,
"step": 535
},
{
"epoch": 0.3955719557195572,
"grad_norm": 1.1631206274032593,
"learning_rate": 0.00048733736597558264,
"loss": 4.649688720703125,
"step": 536
},
{
"epoch": 0.396309963099631,
"grad_norm": 1.0241073369979858,
"learning_rate": 0.0004865619721411941,
"loss": 4.654960632324219,
"step": 537
},
{
"epoch": 0.3970479704797048,
"grad_norm": 0.9926833510398865,
"learning_rate": 0.0004857857868844891,
"loss": 4.601881504058838,
"step": 538
},
{
"epoch": 0.3977859778597786,
"grad_norm": 1.0237977504730225,
"learning_rate": 0.0004850088147037211,
"loss": 4.73300838470459,
"step": 539
},
{
"epoch": 0.3985239852398524,
"grad_norm": 1.060038685798645,
"learning_rate": 0.0004842310601017036,
"loss": 4.862484931945801,
"step": 540
},
{
"epoch": 0.3992619926199262,
"grad_norm": 1.2825253009796143,
"learning_rate": 0.00048345252758578484,
"loss": 4.497199058532715,
"step": 541
},
{
"epoch": 0.4,
"grad_norm": 0.9599003791809082,
"learning_rate": 0.00048267322166782123,
"loss": 4.726795673370361,
"step": 542
},
{
"epoch": 0.4007380073800738,
"grad_norm": 0.9577689170837402,
"learning_rate": 0.0004818931468641511,
"loss": 4.560695648193359,
"step": 543
},
{
"epoch": 0.4014760147601476,
"grad_norm": 0.9004948139190674,
"learning_rate": 0.0004811123076955693,
"loss": 4.900054931640625,
"step": 544
},
{
"epoch": 0.4022140221402214,
"grad_norm": 1.0973906517028809,
"learning_rate": 0.0004803307086872996,
"loss": 4.605217933654785,
"step": 545
},
{
"epoch": 0.4029520295202952,
"grad_norm": 0.9591420888900757,
"learning_rate": 0.0004795483543689701,
"loss": 4.580148696899414,
"step": 546
},
{
"epoch": 0.40369003690036903,
"grad_norm": 0.9317168593406677,
"learning_rate": 0.00047876524927458554,
"loss": 4.676855087280273,
"step": 547
},
{
"epoch": 0.4044280442804428,
"grad_norm": 0.8841069936752319,
"learning_rate": 0.0004779813979425022,
"loss": 4.510927677154541,
"step": 548
},
{
"epoch": 0.4051660516605166,
"grad_norm": 0.8826556205749512,
"learning_rate": 0.0004771968049154005,
"loss": 4.688409805297852,
"step": 549
},
{
"epoch": 0.4059040590405904,
"grad_norm": 1.0118300914764404,
"learning_rate": 0.00047641147474025973,
"loss": 4.612986087799072,
"step": 550
},
{
"epoch": 0.4066420664206642,
"grad_norm": 0.7752644419670105,
"learning_rate": 0.00047562541196833106,
"loss": 4.80881929397583,
"step": 551
},
{
"epoch": 0.407380073800738,
"grad_norm": 0.8387100100517273,
"learning_rate": 0.000474838621155111,
"loss": 4.728845596313477,
"step": 552
},
{
"epoch": 0.40811808118081183,
"grad_norm": 0.8829529285430908,
"learning_rate": 0.00047405110686031575,
"loss": 4.959627151489258,
"step": 553
},
{
"epoch": 0.4088560885608856,
"grad_norm": 0.8168521523475647,
"learning_rate": 0.000473262873647854,
"loss": 4.744089603424072,
"step": 554
},
{
"epoch": 0.4095940959409594,
"grad_norm": 0.8763787150382996,
"learning_rate": 0.000472473926085801,
"loss": 4.78373908996582,
"step": 555
},
{
"epoch": 0.4103321033210332,
"grad_norm": 0.8329116106033325,
"learning_rate": 0.00047168426874637167,
"loss": 4.739480018615723,
"step": 556
},
{
"epoch": 0.411070110701107,
"grad_norm": 0.8480055332183838,
"learning_rate": 0.0004708939062058946,
"loss": 4.604061126708984,
"step": 557
},
{
"epoch": 0.4118081180811808,
"grad_norm": 0.8095789551734924,
"learning_rate": 0.0004701028430447852,
"loss": 4.522249221801758,
"step": 558
},
{
"epoch": 0.41254612546125463,
"grad_norm": 1.0202033519744873,
"learning_rate": 0.00046931108384751897,
"loss": 4.50852632522583,
"step": 559
},
{
"epoch": 0.4132841328413284,
"grad_norm": 1.0811773538589478,
"learning_rate": 0.00046851863320260544,
"loss": 4.552791118621826,
"step": 560
},
{
"epoch": 0.4140221402214022,
"grad_norm": 1.1033447980880737,
"learning_rate": 0.00046772549570256125,
"loss": 4.458186149597168,
"step": 561
},
{
"epoch": 0.414760147601476,
"grad_norm": 0.8855783939361572,
"learning_rate": 0.00046693167594388357,
"loss": 4.8609724044799805,
"step": 562
},
{
"epoch": 0.4154981549815498,
"grad_norm": 0.8844220042228699,
"learning_rate": 0.00046613717852702345,
"loss": 4.472495079040527,
"step": 563
},
{
"epoch": 0.41623616236162364,
"grad_norm": 1.000057339668274,
"learning_rate": 0.0004653420080563592,
"loss": 4.571652412414551,
"step": 564
},
{
"epoch": 0.41697416974169743,
"grad_norm": 1.2004189491271973,
"learning_rate": 0.0004645461691401697,
"loss": 4.222049713134766,
"step": 565
},
{
"epoch": 0.4177121771217712,
"grad_norm": 0.891960859298706,
"learning_rate": 0.0004637496663906077,
"loss": 4.547060966491699,
"step": 566
},
{
"epoch": 0.418450184501845,
"grad_norm": 0.895974338054657,
"learning_rate": 0.0004629525044236733,
"loss": 4.556779861450195,
"step": 567
},
{
"epoch": 0.4191881918819188,
"grad_norm": 0.9765421152114868,
"learning_rate": 0.0004621546878591865,
"loss": 4.732317924499512,
"step": 568
},
{
"epoch": 0.4199261992619926,
"grad_norm": 0.8740888237953186,
"learning_rate": 0.00046135622132076153,
"loss": 4.561002731323242,
"step": 569
},
{
"epoch": 0.42066420664206644,
"grad_norm": 0.768431544303894,
"learning_rate": 0.00046055710943577896,
"loss": 4.428035259246826,
"step": 570
},
{
"epoch": 0.42140221402214023,
"grad_norm": 0.9561269879341125,
"learning_rate": 0.0004597573568353595,
"loss": 4.324114799499512,
"step": 571
},
{
"epoch": 0.422140221402214,
"grad_norm": 0.9126472473144531,
"learning_rate": 0.00045895696815433687,
"loss": 4.664113521575928,
"step": 572
},
{
"epoch": 0.4228782287822878,
"grad_norm": 0.8882591128349304,
"learning_rate": 0.0004581559480312316,
"loss": 4.339204788208008,
"step": 573
},
{
"epoch": 0.4236162361623616,
"grad_norm": 1.081982135772705,
"learning_rate": 0.00045735430110822303,
"loss": 4.641040802001953,
"step": 574
},
{
"epoch": 0.42435424354243545,
"grad_norm": 0.7895275950431824,
"learning_rate": 0.0004565520320311235,
"loss": 4.488674163818359,
"step": 575
},
{
"epoch": 0.42509225092250924,
"grad_norm": 0.9767966866493225,
"learning_rate": 0.0004557491454493504,
"loss": 5.026608943939209,
"step": 576
},
{
"epoch": 0.42583025830258303,
"grad_norm": 0.8868175148963928,
"learning_rate": 0.0004549456460159004,
"loss": 4.576347351074219,
"step": 577
},
{
"epoch": 0.4265682656826568,
"grad_norm": 0.8501465320587158,
"learning_rate": 0.00045414153838732135,
"loss": 4.619839668273926,
"step": 578
},
{
"epoch": 0.4273062730627306,
"grad_norm": 0.8614507913589478,
"learning_rate": 0.00045333682722368597,
"loss": 4.661761283874512,
"step": 579
},
{
"epoch": 0.4280442804428044,
"grad_norm": 1.0277959108352661,
"learning_rate": 0.0004525315171885648,
"loss": 4.562242031097412,
"step": 580
},
{
"epoch": 0.42878228782287825,
"grad_norm": 0.9864504933357239,
"learning_rate": 0.00045172561294899884,
"loss": 4.4832258224487305,
"step": 581
},
{
"epoch": 0.42952029520295204,
"grad_norm": 0.8841885924339294,
"learning_rate": 0.0004509191191754728,
"loss": 4.594321250915527,
"step": 582
},
{
"epoch": 0.43025830258302583,
"grad_norm": 0.8487964272499084,
"learning_rate": 0.00045011204054188784,
"loss": 4.805062294006348,
"step": 583
},
{
"epoch": 0.4309963099630996,
"grad_norm": 1.027441143989563,
"learning_rate": 0.0004493043817255347,
"loss": 4.6832685470581055,
"step": 584
},
{
"epoch": 0.4317343173431734,
"grad_norm": 0.9376983046531677,
"learning_rate": 0.0004484961474070665,
"loss": 4.687745094299316,
"step": 585
},
{
"epoch": 0.43247232472324726,
"grad_norm": 0.927667498588562,
"learning_rate": 0.00044768734227047146,
"loss": 4.67139196395874,
"step": 586
},
{
"epoch": 0.43321033210332105,
"grad_norm": 0.8729023933410645,
"learning_rate": 0.00044687797100304596,
"loss": 4.648367404937744,
"step": 587
},
{
"epoch": 0.43394833948339484,
"grad_norm": 0.9207971692085266,
"learning_rate": 0.0004460680382953672,
"loss": 4.687824249267578,
"step": 588
},
{
"epoch": 0.43468634686346863,
"grad_norm": 0.8276870846748352,
"learning_rate": 0.00044525754884126634,
"loss": 4.622544288635254,
"step": 589
},
{
"epoch": 0.4354243542435424,
"grad_norm": 0.9223991632461548,
"learning_rate": 0.0004444465073378007,
"loss": 4.5522003173828125,
"step": 590
},
{
"epoch": 0.4361623616236162,
"grad_norm": 1.1231549978256226,
"learning_rate": 0.00044363491848522737,
"loss": 4.543008804321289,
"step": 591
},
{
"epoch": 0.43690036900369006,
"grad_norm": 0.867957592010498,
"learning_rate": 0.00044282278698697504,
"loss": 4.716594219207764,
"step": 592
},
{
"epoch": 0.43763837638376385,
"grad_norm": 0.7886962890625,
"learning_rate": 0.0004420101175496176,
"loss": 4.6924920082092285,
"step": 593
},
{
"epoch": 0.43837638376383764,
"grad_norm": 0.8600431680679321,
"learning_rate": 0.00044119691488284644,
"loss": 4.623996257781982,
"step": 594
},
{
"epoch": 0.43911439114391143,
"grad_norm": 0.8535895347595215,
"learning_rate": 0.0004403831836994428,
"loss": 4.559450149536133,
"step": 595
},
{
"epoch": 0.4398523985239852,
"grad_norm": 0.8784546256065369,
"learning_rate": 0.00043956892871525123,
"loss": 4.410243988037109,
"step": 596
},
{
"epoch": 0.44059040590405907,
"grad_norm": 0.9997196197509766,
"learning_rate": 0.0004387541546491518,
"loss": 4.677160739898682,
"step": 597
},
{
"epoch": 0.44132841328413286,
"grad_norm": 0.9354564547538757,
"learning_rate": 0.000437938866223033,
"loss": 4.577181816101074,
"step": 598
},
{
"epoch": 0.44206642066420665,
"grad_norm": 0.8507137298583984,
"learning_rate": 0.00043712306816176365,
"loss": 4.933267593383789,
"step": 599
},
{
"epoch": 0.44280442804428044,
"grad_norm": 0.7964354753494263,
"learning_rate": 0.0004363067651931667,
"loss": 4.742018222808838,
"step": 600
},
{
"epoch": 0.44354243542435423,
"grad_norm": 0.8398452997207642,
"learning_rate": 0.0004354899620479909,
"loss": 4.496376991271973,
"step": 601
},
{
"epoch": 0.444280442804428,
"grad_norm": 0.8302538990974426,
"learning_rate": 0.00043467266345988365,
"loss": 4.4834885597229,
"step": 602
},
{
"epoch": 0.44501845018450187,
"grad_norm": 0.8685540556907654,
"learning_rate": 0.00043385487416536397,
"loss": 4.598426342010498,
"step": 603
},
{
"epoch": 0.44575645756457566,
"grad_norm": 1.008470892906189,
"learning_rate": 0.0004330365989037941,
"loss": 4.579464912414551,
"step": 604
},
{
"epoch": 0.44649446494464945,
"grad_norm": 0.9266964793205261,
"learning_rate": 0.00043221784241735315,
"loss": 4.776824474334717,
"step": 605
},
{
"epoch": 0.44723247232472324,
"grad_norm": 0.8900343775749207,
"learning_rate": 0.00043139860945100864,
"loss": 4.573504447937012,
"step": 606
},
{
"epoch": 0.44797047970479703,
"grad_norm": 0.9872782826423645,
"learning_rate": 0.0004305789047524901,
"loss": 4.563179969787598,
"step": 607
},
{
"epoch": 0.4487084870848708,
"grad_norm": 0.8987732529640198,
"learning_rate": 0.00042975873307226,
"loss": 4.483942031860352,
"step": 608
},
{
"epoch": 0.44944649446494467,
"grad_norm": 0.9567626714706421,
"learning_rate": 0.000428938099163488,
"loss": 4.630576133728027,
"step": 609
},
{
"epoch": 0.45018450184501846,
"grad_norm": 0.8059940934181213,
"learning_rate": 0.000428117007782022,
"loss": 4.429983615875244,
"step": 610
},
{
"epoch": 0.45092250922509225,
"grad_norm": 0.8970604538917542,
"learning_rate": 0.0004272954636863613,
"loss": 4.672665596008301,
"step": 611
},
{
"epoch": 0.45166051660516604,
"grad_norm": 0.9387950301170349,
"learning_rate": 0.0004264734716376287,
"loss": 4.554316520690918,
"step": 612
},
{
"epoch": 0.45239852398523983,
"grad_norm": 0.8920540809631348,
"learning_rate": 0.0004256510363995433,
"loss": 4.600342750549316,
"step": 613
},
{
"epoch": 0.4531365313653137,
"grad_norm": 1.0435482263565063,
"learning_rate": 0.0004248281627383923,
"loss": 4.5729475021362305,
"step": 614
},
{
"epoch": 0.45387453874538747,
"grad_norm": 0.8200010657310486,
"learning_rate": 0.0004240048554230039,
"loss": 4.369121551513672,
"step": 615
},
{
"epoch": 0.45461254612546126,
"grad_norm": 0.9972869157791138,
"learning_rate": 0.0004231811192247195,
"loss": 4.570677757263184,
"step": 616
},
{
"epoch": 0.45535055350553505,
"grad_norm": 0.9263824224472046,
"learning_rate": 0.00042235695891736585,
"loss": 4.355930328369141,
"step": 617
},
{
"epoch": 0.45608856088560884,
"grad_norm": 1.002906084060669,
"learning_rate": 0.00042153237927722775,
"loss": 4.620849609375,
"step": 618
},
{
"epoch": 0.45682656826568263,
"grad_norm": 0.9105566143989563,
"learning_rate": 0.00042070738508302003,
"loss": 4.353985786437988,
"step": 619
},
{
"epoch": 0.4575645756457565,
"grad_norm": 0.8016074895858765,
"learning_rate": 0.0004198819811158601,
"loss": 4.468338966369629,
"step": 620
},
{
"epoch": 0.45830258302583027,
"grad_norm": 0.8135733604431152,
"learning_rate": 0.00041905617215924,
"loss": 4.608132362365723,
"step": 621
},
{
"epoch": 0.45904059040590406,
"grad_norm": 0.9293224215507507,
"learning_rate": 0.00041822996299899906,
"loss": 4.565390586853027,
"step": 622
},
{
"epoch": 0.45977859778597785,
"grad_norm": 1.1056631803512573,
"learning_rate": 0.00041740335842329566,
"loss": 4.949249267578125,
"step": 623
},
{
"epoch": 0.46051660516605164,
"grad_norm": 0.840045154094696,
"learning_rate": 0.00041657636322257993,
"loss": 4.710245609283447,
"step": 624
},
{
"epoch": 0.4612546125461255,
"grad_norm": 0.9296345710754395,
"learning_rate": 0.0004157489821895657,
"loss": 4.73885440826416,
"step": 625
},
{
"epoch": 0.4619926199261993,
"grad_norm": 0.8654890656471252,
"learning_rate": 0.0004149212201192029,
"loss": 4.420188903808594,
"step": 626
},
{
"epoch": 0.46273062730627307,
"grad_norm": 1.0963070392608643,
"learning_rate": 0.0004140930818086497,
"loss": 4.5778985023498535,
"step": 627
},
{
"epoch": 0.46346863468634686,
"grad_norm": 0.8319039940834045,
"learning_rate": 0.00041326457205724445,
"loss": 4.544205188751221,
"step": 628
},
{
"epoch": 0.46420664206642065,
"grad_norm": 0.9679455757141113,
"learning_rate": 0.0004124356956664786,
"loss": 4.58363151550293,
"step": 629
},
{
"epoch": 0.46494464944649444,
"grad_norm": 0.9498420357704163,
"learning_rate": 0.00041160645743996803,
"loss": 4.450014114379883,
"step": 630
},
{
"epoch": 0.4656826568265683,
"grad_norm": 0.8234408497810364,
"learning_rate": 0.0004107768621834257,
"loss": 4.5670857429504395,
"step": 631
},
{
"epoch": 0.4664206642066421,
"grad_norm": 1.0177075862884521,
"learning_rate": 0.0004099469147046336,
"loss": 4.445223808288574,
"step": 632
},
{
"epoch": 0.46715867158671587,
"grad_norm": 0.7691503167152405,
"learning_rate": 0.0004091166198134151,
"loss": 4.425694465637207,
"step": 633
},
{
"epoch": 0.46789667896678966,
"grad_norm": 0.966654896736145,
"learning_rate": 0.00040828598232160696,
"loss": 4.650933265686035,
"step": 634
},
{
"epoch": 0.46863468634686345,
"grad_norm": 1.0035220384597778,
"learning_rate": 0.0004074550070430312,
"loss": 4.69790506362915,
"step": 635
},
{
"epoch": 0.4693726937269373,
"grad_norm": 0.8333247900009155,
"learning_rate": 0.0004066236987934677,
"loss": 4.4094438552856445,
"step": 636
},
{
"epoch": 0.4701107011070111,
"grad_norm": 0.9272137880325317,
"learning_rate": 0.0004057920623906257,
"loss": 4.437854766845703,
"step": 637
},
{
"epoch": 0.4708487084870849,
"grad_norm": 0.9257310628890991,
"learning_rate": 0.0004049601026541166,
"loss": 4.607282638549805,
"step": 638
},
{
"epoch": 0.47158671586715867,
"grad_norm": 0.9032636880874634,
"learning_rate": 0.0004041278244054253,
"loss": 4.529732704162598,
"step": 639
},
{
"epoch": 0.47232472324723246,
"grad_norm": 0.8978585600852966,
"learning_rate": 0.0004032952324678826,
"loss": 4.577826499938965,
"step": 640
},
{
"epoch": 0.47306273062730625,
"grad_norm": 0.9967110753059387,
"learning_rate": 0.0004024623316666376,
"loss": 4.280439376831055,
"step": 641
},
{
"epoch": 0.4738007380073801,
"grad_norm": 0.9799245595932007,
"learning_rate": 0.00040162912682862884,
"loss": 4.567631721496582,
"step": 642
},
{
"epoch": 0.4745387453874539,
"grad_norm": 0.9125800728797913,
"learning_rate": 0.00040079562278255726,
"loss": 4.556615352630615,
"step": 643
},
{
"epoch": 0.4752767527675277,
"grad_norm": 0.8560841679573059,
"learning_rate": 0.00039996182435885744,
"loss": 4.567816734313965,
"step": 644
},
{
"epoch": 0.47601476014760147,
"grad_norm": 0.8384515643119812,
"learning_rate": 0.00039912773638967053,
"loss": 4.32409143447876,
"step": 645
},
{
"epoch": 0.47675276752767526,
"grad_norm": 0.9469295144081116,
"learning_rate": 0.0003982933637088151,
"loss": 4.505819797515869,
"step": 646
},
{
"epoch": 0.4774907749077491,
"grad_norm": 0.8418838381767273,
"learning_rate": 0.0003974587111517601,
"loss": 4.288963317871094,
"step": 647
},
{
"epoch": 0.4782287822878229,
"grad_norm": 0.9017887711524963,
"learning_rate": 0.00039662378355559636,
"loss": 4.349027633666992,
"step": 648
},
{
"epoch": 0.4789667896678967,
"grad_norm": 0.9656051993370056,
"learning_rate": 0.00039578858575900857,
"loss": 4.5458478927612305,
"step": 649
},
{
"epoch": 0.4797047970479705,
"grad_norm": 0.8434630632400513,
"learning_rate": 0.0003949531226022474,
"loss": 4.536887168884277,
"step": 650
},
{
"epoch": 0.48044280442804427,
"grad_norm": 0.8146916627883911,
"learning_rate": 0.0003941173989271013,
"loss": 4.554960250854492,
"step": 651
},
{
"epoch": 0.48118081180811806,
"grad_norm": 0.8592056035995483,
"learning_rate": 0.0003932814195768687,
"loss": 4.47853422164917,
"step": 652
},
{
"epoch": 0.4819188191881919,
"grad_norm": 0.8136284351348877,
"learning_rate": 0.0003924451893963294,
"loss": 4.614603042602539,
"step": 653
},
{
"epoch": 0.4826568265682657,
"grad_norm": 0.8898813724517822,
"learning_rate": 0.0003916087132317173,
"loss": 4.604781150817871,
"step": 654
},
{
"epoch": 0.4833948339483395,
"grad_norm": 0.9451072216033936,
"learning_rate": 0.0003907719959306915,
"loss": 4.379412651062012,
"step": 655
},
{
"epoch": 0.4841328413284133,
"grad_norm": 1.0912781953811646,
"learning_rate": 0.0003899350423423087,
"loss": 4.53802490234375,
"step": 656
},
{
"epoch": 0.48487084870848707,
"grad_norm": 0.9553581476211548,
"learning_rate": 0.0003890978573169949,
"loss": 4.305476188659668,
"step": 657
},
{
"epoch": 0.48560885608856086,
"grad_norm": 0.942167341709137,
"learning_rate": 0.00038826044570651756,
"loss": 4.399786949157715,
"step": 658
},
{
"epoch": 0.4863468634686347,
"grad_norm": 0.9437850117683411,
"learning_rate": 0.00038742281236395703,
"loss": 4.361236572265625,
"step": 659
},
{
"epoch": 0.4870848708487085,
"grad_norm": 0.9068073034286499,
"learning_rate": 0.00038658496214367873,
"loss": 4.441727638244629,
"step": 660
},
{
"epoch": 0.4878228782287823,
"grad_norm": 0.9844712615013123,
"learning_rate": 0.00038574689990130513,
"loss": 4.4561309814453125,
"step": 661
},
{
"epoch": 0.4885608856088561,
"grad_norm": 0.8944956064224243,
"learning_rate": 0.00038490863049368704,
"loss": 4.5960493087768555,
"step": 662
},
{
"epoch": 0.48929889298892987,
"grad_norm": 0.8984336853027344,
"learning_rate": 0.0003840701587788765,
"loss": 4.440349578857422,
"step": 663
},
{
"epoch": 0.4900369003690037,
"grad_norm": 1.0019009113311768,
"learning_rate": 0.0003832314896160973,
"loss": 4.5855865478515625,
"step": 664
},
{
"epoch": 0.4907749077490775,
"grad_norm": 0.949760913848877,
"learning_rate": 0.00038239262786571787,
"loss": 4.4265828132629395,
"step": 665
},
{
"epoch": 0.4915129151291513,
"grad_norm": 1.0857264995574951,
"learning_rate": 0.0003815535783892229,
"loss": 4.488886833190918,
"step": 666
},
{
"epoch": 0.4922509225092251,
"grad_norm": 1.0607296228408813,
"learning_rate": 0.00038071434604918463,
"loss": 4.221587657928467,
"step": 667
},
{
"epoch": 0.4929889298892989,
"grad_norm": 1.056148648262024,
"learning_rate": 0.0003798749357092352,
"loss": 4.554340362548828,
"step": 668
},
{
"epoch": 0.49372693726937267,
"grad_norm": 0.8420839309692383,
"learning_rate": 0.00037903535223403855,
"loss": 4.401950359344482,
"step": 669
},
{
"epoch": 0.4944649446494465,
"grad_norm": 0.8287214040756226,
"learning_rate": 0.00037819560048926173,
"loss": 4.45570182800293,
"step": 670
},
{
"epoch": 0.4952029520295203,
"grad_norm": 1.0356512069702148,
"learning_rate": 0.000377355685341547,
"loss": 4.568255424499512,
"step": 671
},
{
"epoch": 0.4959409594095941,
"grad_norm": 0.9578806161880493,
"learning_rate": 0.0003765156116584837,
"loss": 4.606746673583984,
"step": 672
},
{
"epoch": 0.4966789667896679,
"grad_norm": 0.8309308886528015,
"learning_rate": 0.00037567538430857976,
"loss": 4.480656147003174,
"step": 673
},
{
"epoch": 0.4974169741697417,
"grad_norm": 0.9627919793128967,
"learning_rate": 0.0003748350081612339,
"loss": 4.540738105773926,
"step": 674
},
{
"epoch": 0.4981549815498155,
"grad_norm": 0.7901818752288818,
"learning_rate": 0.00037399448808670706,
"loss": 4.378629684448242,
"step": 675
},
{
"epoch": 0.4988929889298893,
"grad_norm": 1.1135075092315674,
"learning_rate": 0.0003731538289560941,
"loss": 4.591548442840576,
"step": 676
},
{
"epoch": 0.4996309963099631,
"grad_norm": 0.8672391772270203,
"learning_rate": 0.0003723130356412962,
"loss": 4.584698677062988,
"step": 677
},
{
"epoch": 0.5003690036900369,
"grad_norm": 0.879558265209198,
"learning_rate": 0.00037147211301499176,
"loss": 4.36656379699707,
"step": 678
},
{
"epoch": 0.5011070110701107,
"grad_norm": 0.8770106434822083,
"learning_rate": 0.0003706310659506087,
"loss": 4.566497802734375,
"step": 679
},
{
"epoch": 0.5018450184501845,
"grad_norm": 0.8778314590454102,
"learning_rate": 0.0003697898993222961,
"loss": 4.343081474304199,
"step": 680
},
{
"epoch": 0.5025830258302583,
"grad_norm": 0.9130513668060303,
"learning_rate": 0.00036894861800489614,
"loss": 4.3984694480896,
"step": 681
},
{
"epoch": 0.5033210332103321,
"grad_norm": 0.8704879879951477,
"learning_rate": 0.00036810722687391544,
"loss": 4.561816215515137,
"step": 682
},
{
"epoch": 0.5040590405904058,
"grad_norm": 1.010489821434021,
"learning_rate": 0.00036726573080549704,
"loss": 4.25577449798584,
"step": 683
},
{
"epoch": 0.5047970479704798,
"grad_norm": 0.9569144248962402,
"learning_rate": 0.0003664241346763924,
"loss": 4.4627227783203125,
"step": 684
},
{
"epoch": 0.5055350553505535,
"grad_norm": 0.8847797513008118,
"learning_rate": 0.00036558244336393236,
"loss": 4.437929153442383,
"step": 685
},
{
"epoch": 0.5062730627306273,
"grad_norm": 0.8487216830253601,
"learning_rate": 0.00036474066174599986,
"loss": 4.435924053192139,
"step": 686
},
{
"epoch": 0.5070110701107011,
"grad_norm": 0.8881345391273499,
"learning_rate": 0.00036389879470100095,
"loss": 4.873279094696045,
"step": 687
},
{
"epoch": 0.5077490774907749,
"grad_norm": 0.8549903035163879,
"learning_rate": 0.00036305684710783684,
"loss": 4.272536754608154,
"step": 688
},
{
"epoch": 0.5084870848708487,
"grad_norm": 0.906299889087677,
"learning_rate": 0.0003622148238458754,
"loss": 4.555997848510742,
"step": 689
},
{
"epoch": 0.5092250922509225,
"grad_norm": 0.922178328037262,
"learning_rate": 0.0003613727297949232,
"loss": 4.573604583740234,
"step": 690
},
{
"epoch": 0.5099630996309963,
"grad_norm": 0.8890010118484497,
"learning_rate": 0.00036053056983519706,
"loss": 4.512640953063965,
"step": 691
},
{
"epoch": 0.5107011070110701,
"grad_norm": 0.8093462586402893,
"learning_rate": 0.00035968834884729555,
"loss": 4.304255485534668,
"step": 692
},
{
"epoch": 0.5114391143911439,
"grad_norm": 0.9470013380050659,
"learning_rate": 0.00035884607171217126,
"loss": 4.261716365814209,
"step": 693
},
{
"epoch": 0.5121771217712177,
"grad_norm": 1.0242949724197388,
"learning_rate": 0.0003580037433111018,
"loss": 4.365228652954102,
"step": 694
},
{
"epoch": 0.5129151291512916,
"grad_norm": 0.8128859996795654,
"learning_rate": 0.0003571613685256623,
"loss": 4.409188270568848,
"step": 695
},
{
"epoch": 0.5136531365313654,
"grad_norm": 0.9906793236732483,
"learning_rate": 0.00035631895223769614,
"loss": 4.144466876983643,
"step": 696
},
{
"epoch": 0.5143911439114391,
"grad_norm": 0.8540819883346558,
"learning_rate": 0.0003554764993292878,
"loss": 4.1609907150268555,
"step": 697
},
{
"epoch": 0.5151291512915129,
"grad_norm": 0.8967404365539551,
"learning_rate": 0.00035463401468273365,
"loss": 4.335708141326904,
"step": 698
},
{
"epoch": 0.5158671586715867,
"grad_norm": 0.9019063115119934,
"learning_rate": 0.00035379150318051397,
"loss": 4.435550689697266,
"step": 699
},
{
"epoch": 0.5166051660516605,
"grad_norm": 0.8940041065216064,
"learning_rate": 0.00035294896970526504,
"loss": 4.551334381103516,
"step": 700
},
{
"epoch": 0.5173431734317343,
"grad_norm": 0.9932311773300171,
"learning_rate": 0.0003521064191397499,
"loss": 4.3837890625,
"step": 701
},
{
"epoch": 0.5180811808118081,
"grad_norm": 0.8308423757553101,
"learning_rate": 0.0003512638563668313,
"loss": 4.352203845977783,
"step": 702
},
{
"epoch": 0.5188191881918819,
"grad_norm": 1.0316524505615234,
"learning_rate": 0.00035042128626944203,
"loss": 4.69419527053833,
"step": 703
},
{
"epoch": 0.5195571955719557,
"grad_norm": 0.845513105392456,
"learning_rate": 0.00034957871373055796,
"loss": 4.403134346008301,
"step": 704
},
{
"epoch": 0.5202952029520295,
"grad_norm": 1.1955550909042358,
"learning_rate": 0.0003487361436331689,
"loss": 4.507143974304199,
"step": 705
},
{
"epoch": 0.5210332103321034,
"grad_norm": 0.9265637993812561,
"learning_rate": 0.0003478935808602501,
"loss": 4.529629707336426,
"step": 706
},
{
"epoch": 0.5217712177121772,
"grad_norm": 0.7645063400268555,
"learning_rate": 0.0003470510302947351,
"loss": 4.37443733215332,
"step": 707
},
{
"epoch": 0.522509225092251,
"grad_norm": 0.8971974849700928,
"learning_rate": 0.0003462084968194861,
"loss": 4.33015251159668,
"step": 708
},
{
"epoch": 0.5232472324723247,
"grad_norm": 0.813549816608429,
"learning_rate": 0.00034536598531726646,
"loss": 4.5936079025268555,
"step": 709
},
{
"epoch": 0.5239852398523985,
"grad_norm": 0.953991711139679,
"learning_rate": 0.0003445235006707122,
"loss": 4.403616905212402,
"step": 710
},
{
"epoch": 0.5247232472324723,
"grad_norm": 0.8936543464660645,
"learning_rate": 0.0003436810477623038,
"loss": 4.279123306274414,
"step": 711
},
{
"epoch": 0.5254612546125461,
"grad_norm": 0.9178246855735779,
"learning_rate": 0.00034283863147433776,
"loss": 4.134098052978516,
"step": 712
},
{
"epoch": 0.5261992619926199,
"grad_norm": 0.9482448101043701,
"learning_rate": 0.0003419962566888981,
"loss": 4.32552433013916,
"step": 713
},
{
"epoch": 0.5269372693726937,
"grad_norm": 0.787865161895752,
"learning_rate": 0.0003411539282878288,
"loss": 4.394043922424316,
"step": 714
},
{
"epoch": 0.5276752767527675,
"grad_norm": 0.9969366788864136,
"learning_rate": 0.00034031165115270444,
"loss": 4.486443996429443,
"step": 715
},
{
"epoch": 0.5284132841328413,
"grad_norm": 0.8682870268821716,
"learning_rate": 0.00033946943016480304,
"loss": 4.508628845214844,
"step": 716
},
{
"epoch": 0.5291512915129152,
"grad_norm": 0.9052722454071045,
"learning_rate": 0.0003386272702050769,
"loss": 4.4580583572387695,
"step": 717
},
{
"epoch": 0.529889298892989,
"grad_norm": 0.909764289855957,
"learning_rate": 0.00033778517615412477,
"loss": 4.225852012634277,
"step": 718
},
{
"epoch": 0.5306273062730628,
"grad_norm": 0.8234180808067322,
"learning_rate": 0.0003369431528921632,
"loss": 4.583276748657227,
"step": 719
},
{
"epoch": 0.5313653136531366,
"grad_norm": 0.8727805614471436,
"learning_rate": 0.0003361012052989992,
"loss": 4.535766124725342,
"step": 720
},
{
"epoch": 0.5321033210332103,
"grad_norm": 1.0564109086990356,
"learning_rate": 0.00033525933825400014,
"loss": 4.4810943603515625,
"step": 721
},
{
"epoch": 0.5328413284132841,
"grad_norm": 0.9636712074279785,
"learning_rate": 0.0003344175566360676,
"loss": 4.364070415496826,
"step": 722
},
{
"epoch": 0.5335793357933579,
"grad_norm": 0.9482673406600952,
"learning_rate": 0.00033357586532360765,
"loss": 4.78449821472168,
"step": 723
},
{
"epoch": 0.5343173431734317,
"grad_norm": 0.7933990955352783,
"learning_rate": 0.00033273426919450285,
"loss": 4.399996280670166,
"step": 724
},
{
"epoch": 0.5350553505535055,
"grad_norm": 0.8797454237937927,
"learning_rate": 0.0003318927731260846,
"loss": 4.585580825805664,
"step": 725
},
{
"epoch": 0.5357933579335793,
"grad_norm": 0.8431602716445923,
"learning_rate": 0.00033105138199510386,
"loss": 4.289941787719727,
"step": 726
},
{
"epoch": 0.5365313653136531,
"grad_norm": 0.8969424962997437,
"learning_rate": 0.00033021010067770396,
"loss": 4.353963375091553,
"step": 727
},
{
"epoch": 0.537269372693727,
"grad_norm": 0.7266989350318909,
"learning_rate": 0.00032936893404939135,
"loss": 4.287866592407227,
"step": 728
},
{
"epoch": 0.5380073800738008,
"grad_norm": 0.9192749261856079,
"learning_rate": 0.0003285278869850084,
"loss": 4.431896209716797,
"step": 729
},
{
"epoch": 0.5387453874538746,
"grad_norm": 0.9108367562294006,
"learning_rate": 0.0003276869643587038,
"loss": 4.330748558044434,
"step": 730
},
{
"epoch": 0.5394833948339484,
"grad_norm": 0.789059579372406,
"learning_rate": 0.000326846171043906,
"loss": 4.409814834594727,
"step": 731
},
{
"epoch": 0.5402214022140222,
"grad_norm": 0.931719183921814,
"learning_rate": 0.000326005511913293,
"loss": 4.5224928855896,
"step": 732
},
{
"epoch": 0.5409594095940959,
"grad_norm": 0.9140210747718811,
"learning_rate": 0.00032516499183876614,
"loss": 4.469390869140625,
"step": 733
},
{
"epoch": 0.5416974169741697,
"grad_norm": 0.7886836528778076,
"learning_rate": 0.0003243246156914203,
"loss": 4.169953346252441,
"step": 734
},
{
"epoch": 0.5424354243542435,
"grad_norm": 0.9898924827575684,
"learning_rate": 0.00032348438834151636,
"loss": 4.523615837097168,
"step": 735
},
{
"epoch": 0.5431734317343173,
"grad_norm": 0.9171273112297058,
"learning_rate": 0.00032264431465845307,
"loss": 4.362099647521973,
"step": 736
},
{
"epoch": 0.5439114391143911,
"grad_norm": 0.8603449463844299,
"learning_rate": 0.0003218043995107383,
"loss": 4.252144813537598,
"step": 737
},
{
"epoch": 0.5446494464944649,
"grad_norm": 0.9839322566986084,
"learning_rate": 0.0003209646477659615,
"loss": 4.401839256286621,
"step": 738
},
{
"epoch": 0.5453874538745388,
"grad_norm": 1.1770368814468384,
"learning_rate": 0.00032012506429076476,
"loss": 4.247356414794922,
"step": 739
},
{
"epoch": 0.5461254612546126,
"grad_norm": 0.8217732310295105,
"learning_rate": 0.0003192856539508155,
"loss": 4.566009521484375,
"step": 740
},
{
"epoch": 0.5468634686346864,
"grad_norm": 1.1834269762039185,
"learning_rate": 0.00031844642161077717,
"loss": 4.510600566864014,
"step": 741
},
{
"epoch": 0.5476014760147602,
"grad_norm": 0.9773359298706055,
"learning_rate": 0.0003176073721342822,
"loss": 4.310590744018555,
"step": 742
},
{
"epoch": 0.548339483394834,
"grad_norm": 0.9322510957717896,
"learning_rate": 0.00031676851038390277,
"loss": 4.397828102111816,
"step": 743
},
{
"epoch": 0.5490774907749078,
"grad_norm": 0.9611193537712097,
"learning_rate": 0.00031592984122112363,
"loss": 4.509471893310547,
"step": 744
},
{
"epoch": 0.5498154981549815,
"grad_norm": 0.8511263132095337,
"learning_rate": 0.00031509136950631295,
"loss": 4.605403900146484,
"step": 745
},
{
"epoch": 0.5505535055350553,
"grad_norm": 1.1331124305725098,
"learning_rate": 0.00031425310009869497,
"loss": 4.705798625946045,
"step": 746
},
{
"epoch": 0.5512915129151291,
"grad_norm": 0.9317970871925354,
"learning_rate": 0.0003134150378563213,
"loss": 4.538765907287598,
"step": 747
},
{
"epoch": 0.5520295202952029,
"grad_norm": 0.7060513496398926,
"learning_rate": 0.00031257718763604296,
"loss": 4.484154224395752,
"step": 748
},
{
"epoch": 0.5527675276752767,
"grad_norm": 0.9105408191680908,
"learning_rate": 0.00031173955429348254,
"loss": 4.227485656738281,
"step": 749
},
{
"epoch": 0.5535055350553506,
"grad_norm": 0.8890596628189087,
"learning_rate": 0.000310902142683005,
"loss": 4.158082008361816,
"step": 750
},
{
"epoch": 0.5542435424354244,
"grad_norm": 1.074188470840454,
"learning_rate": 0.00031006495765769135,
"loss": 4.741909980773926,
"step": 751
},
{
"epoch": 0.5549815498154982,
"grad_norm": 1.0221657752990723,
"learning_rate": 0.0003092280040693085,
"loss": 4.476526260375977,
"step": 752
},
{
"epoch": 0.555719557195572,
"grad_norm": 0.9633339643478394,
"learning_rate": 0.00030839128676828277,
"loss": 4.336530685424805,
"step": 753
},
{
"epoch": 0.5564575645756458,
"grad_norm": 1.0310927629470825,
"learning_rate": 0.0003075548106036706,
"loss": 4.326992988586426,
"step": 754
},
{
"epoch": 0.5571955719557196,
"grad_norm": 0.8011588454246521,
"learning_rate": 0.0003067185804231314,
"loss": 4.4770827293396,
"step": 755
},
{
"epoch": 0.5579335793357934,
"grad_norm": 0.921048641204834,
"learning_rate": 0.00030588260107289875,
"loss": 4.608548164367676,
"step": 756
},
{
"epoch": 0.5586715867158671,
"grad_norm": 0.9670724272727966,
"learning_rate": 0.0003050468773977527,
"loss": 4.357841491699219,
"step": 757
},
{
"epoch": 0.5594095940959409,
"grad_norm": 1.0081647634506226,
"learning_rate": 0.00030421141424099153,
"loss": 4.160003662109375,
"step": 758
},
{
"epoch": 0.5601476014760147,
"grad_norm": 0.8587222695350647,
"learning_rate": 0.0003033762164444036,
"loss": 4.5625104904174805,
"step": 759
},
{
"epoch": 0.5608856088560885,
"grad_norm": 0.9064732789993286,
"learning_rate": 0.00030254128884823995,
"loss": 4.558137893676758,
"step": 760
},
{
"epoch": 0.5616236162361624,
"grad_norm": 0.9167226552963257,
"learning_rate": 0.00030170663629118484,
"loss": 4.650042533874512,
"step": 761
},
{
"epoch": 0.5623616236162362,
"grad_norm": 0.9208563566207886,
"learning_rate": 0.0003008722636103295,
"loss": 4.311919212341309,
"step": 762
},
{
"epoch": 0.56309963099631,
"grad_norm": 0.8243905305862427,
"learning_rate": 0.0003000381756411425,
"loss": 4.592479705810547,
"step": 763
},
{
"epoch": 0.5638376383763838,
"grad_norm": 0.88048255443573,
"learning_rate": 0.00029920437721744285,
"loss": 4.383855819702148,
"step": 764
},
{
"epoch": 0.5645756457564576,
"grad_norm": 0.8309145569801331,
"learning_rate": 0.0002983708731713712,
"loss": 4.523615837097168,
"step": 765
},
{
"epoch": 0.5653136531365314,
"grad_norm": 0.9054703116416931,
"learning_rate": 0.0002975376683333625,
"loss": 4.120911121368408,
"step": 766
},
{
"epoch": 0.5660516605166052,
"grad_norm": 0.8789876103401184,
"learning_rate": 0.0002967047675321174,
"loss": 4.314697265625,
"step": 767
},
{
"epoch": 0.566789667896679,
"grad_norm": 1.055936336517334,
"learning_rate": 0.0002958721755945748,
"loss": 4.497006416320801,
"step": 768
},
{
"epoch": 0.5675276752767527,
"grad_norm": 1.1139589548110962,
"learning_rate": 0.00029503989734588345,
"loss": 4.493967056274414,
"step": 769
},
{
"epoch": 0.5682656826568265,
"grad_norm": 0.8091505169868469,
"learning_rate": 0.0002942079376093742,
"loss": 4.10081672668457,
"step": 770
},
{
"epoch": 0.5690036900369003,
"grad_norm": 0.8381765484809875,
"learning_rate": 0.00029337630120653235,
"loss": 4.278990745544434,
"step": 771
},
{
"epoch": 0.5697416974169742,
"grad_norm": 0.8964424729347229,
"learning_rate": 0.00029254499295696876,
"loss": 4.365828514099121,
"step": 772
},
{
"epoch": 0.570479704797048,
"grad_norm": 0.8812311887741089,
"learning_rate": 0.0002917140176783931,
"loss": 4.407172679901123,
"step": 773
},
{
"epoch": 0.5712177121771218,
"grad_norm": 0.9463404417037964,
"learning_rate": 0.0002908833801865849,
"loss": 4.176614761352539,
"step": 774
},
{
"epoch": 0.5719557195571956,
"grad_norm": 0.9128312468528748,
"learning_rate": 0.0002900530852953665,
"loss": 4.4936604499816895,
"step": 775
},
{
"epoch": 0.5726937269372694,
"grad_norm": 0.9788138270378113,
"learning_rate": 0.0002892231378165744,
"loss": 4.425959587097168,
"step": 776
},
{
"epoch": 0.5734317343173432,
"grad_norm": 0.8016911149024963,
"learning_rate": 0.0002883935425600321,
"loss": 4.351809024810791,
"step": 777
},
{
"epoch": 0.574169741697417,
"grad_norm": 0.8947065472602844,
"learning_rate": 0.00028756430433352146,
"loss": 4.333946228027344,
"step": 778
},
{
"epoch": 0.5749077490774908,
"grad_norm": 0.8357275724411011,
"learning_rate": 0.0002867354279427556,
"loss": 4.609579086303711,
"step": 779
},
{
"epoch": 0.5756457564575646,
"grad_norm": 0.9476321339607239,
"learning_rate": 0.0002859069181913503,
"loss": 4.475932598114014,
"step": 780
},
{
"epoch": 0.5763837638376383,
"grad_norm": 0.9456009268760681,
"learning_rate": 0.00028507877988079717,
"loss": 4.241294860839844,
"step": 781
},
{
"epoch": 0.5771217712177121,
"grad_norm": 0.7762236595153809,
"learning_rate": 0.0002842510178104343,
"loss": 4.2514777183532715,
"step": 782
},
{
"epoch": 0.5778597785977859,
"grad_norm": 0.8480483889579773,
"learning_rate": 0.00028342363677742,
"loss": 4.5362043380737305,
"step": 783
},
{
"epoch": 0.5785977859778598,
"grad_norm": 0.8248271942138672,
"learning_rate": 0.00028259664157670434,
"loss": 4.289585113525391,
"step": 784
},
{
"epoch": 0.5793357933579336,
"grad_norm": 0.9554965496063232,
"learning_rate": 0.00028177003700100093,
"loss": 4.234594345092773,
"step": 785
},
{
"epoch": 0.5800738007380074,
"grad_norm": 1.0218883752822876,
"learning_rate": 0.00028094382784076005,
"loss": 4.032539367675781,
"step": 786
},
{
"epoch": 0.5808118081180812,
"grad_norm": 0.9201107621192932,
"learning_rate": 0.00028011801888413996,
"loss": 4.4474711418151855,
"step": 787
},
{
"epoch": 0.581549815498155,
"grad_norm": 0.9545875191688538,
"learning_rate": 0.00027929261491698,
"loss": 4.290918350219727,
"step": 788
},
{
"epoch": 0.5822878228782288,
"grad_norm": 0.9154767394065857,
"learning_rate": 0.00027846762072277235,
"loss": 4.266115188598633,
"step": 789
},
{
"epoch": 0.5830258302583026,
"grad_norm": 0.9572087526321411,
"learning_rate": 0.00027764304108263425,
"loss": 4.489130973815918,
"step": 790
},
{
"epoch": 0.5837638376383764,
"grad_norm": 0.864920973777771,
"learning_rate": 0.0002768188807752806,
"loss": 4.22702693939209,
"step": 791
},
{
"epoch": 0.5845018450184502,
"grad_norm": 0.9186403751373291,
"learning_rate": 0.0002759951445769962,
"loss": 4.370454788208008,
"step": 792
},
{
"epoch": 0.5852398523985239,
"grad_norm": 0.9486933350563049,
"learning_rate": 0.00027517183726160775,
"loss": 4.345991611480713,
"step": 793
},
{
"epoch": 0.5859778597785977,
"grad_norm": 0.9103389382362366,
"learning_rate": 0.0002743489636004567,
"loss": 4.232224941253662,
"step": 794
},
{
"epoch": 0.5867158671586716,
"grad_norm": 0.9209710359573364,
"learning_rate": 0.0002735265283623713,
"loss": 3.9969122409820557,
"step": 795
},
{
"epoch": 0.5874538745387454,
"grad_norm": 1.2172404527664185,
"learning_rate": 0.00027270453631363876,
"loss": 4.3851318359375,
"step": 796
},
{
"epoch": 0.5881918819188192,
"grad_norm": 1.0857105255126953,
"learning_rate": 0.00027188299221797806,
"loss": 4.543056488037109,
"step": 797
},
{
"epoch": 0.588929889298893,
"grad_norm": 0.8917638659477234,
"learning_rate": 0.00027106190083651206,
"loss": 4.233307838439941,
"step": 798
},
{
"epoch": 0.5896678966789668,
"grad_norm": 0.9834994077682495,
"learning_rate": 0.0002702412669277401,
"loss": 4.3369035720825195,
"step": 799
},
{
"epoch": 0.5904059040590406,
"grad_norm": 0.9920309782028198,
"learning_rate": 0.00026942109524751,
"loss": 4.263988971710205,
"step": 800
},
{
"epoch": 0.5911439114391144,
"grad_norm": 0.7995727062225342,
"learning_rate": 0.00026860139054899146,
"loss": 4.237081050872803,
"step": 801
},
{
"epoch": 0.5918819188191882,
"grad_norm": 0.8966661095619202,
"learning_rate": 0.00026778215758264696,
"loss": 4.278907299041748,
"step": 802
},
{
"epoch": 0.592619926199262,
"grad_norm": 0.8927947282791138,
"learning_rate": 0.000266963401096206,
"loss": 4.3486151695251465,
"step": 803
},
{
"epoch": 0.5933579335793358,
"grad_norm": 0.7980582118034363,
"learning_rate": 0.0002661451258346361,
"loss": 4.231438636779785,
"step": 804
},
{
"epoch": 0.5940959409594095,
"grad_norm": 0.8703809380531311,
"learning_rate": 0.00026532733654011635,
"loss": 4.2430419921875,
"step": 805
},
{
"epoch": 0.5948339483394834,
"grad_norm": 1.0357931852340698,
"learning_rate": 0.00026451003795200913,
"loss": 4.256633281707764,
"step": 806
},
{
"epoch": 0.5955719557195572,
"grad_norm": 0.8626582026481628,
"learning_rate": 0.00026369323480683333,
"loss": 4.278927326202393,
"step": 807
},
{
"epoch": 0.596309963099631,
"grad_norm": 0.8148908615112305,
"learning_rate": 0.0002628769318382364,
"loss": 4.354986190795898,
"step": 808
},
{
"epoch": 0.5970479704797048,
"grad_norm": 0.7945446372032166,
"learning_rate": 0.000262061133776967,
"loss": 4.433017730712891,
"step": 809
},
{
"epoch": 0.5977859778597786,
"grad_norm": 0.8125186562538147,
"learning_rate": 0.00026124584535084825,
"loss": 4.323663711547852,
"step": 810
},
{
"epoch": 0.5985239852398524,
"grad_norm": 0.8656073808670044,
"learning_rate": 0.00026043107128474876,
"loss": 4.364239692687988,
"step": 811
},
{
"epoch": 0.5992619926199262,
"grad_norm": 0.7823298573493958,
"learning_rate": 0.00025961681630055737,
"loss": 4.095296382904053,
"step": 812
},
{
"epoch": 0.6,
"grad_norm": 0.8082625865936279,
"learning_rate": 0.00025880308511715366,
"loss": 4.251285552978516,
"step": 813
},
{
"epoch": 0.6007380073800738,
"grad_norm": 0.8128904104232788,
"learning_rate": 0.00025798988245038243,
"loss": 4.234792709350586,
"step": 814
},
{
"epoch": 0.6014760147601476,
"grad_norm": 0.9591745138168335,
"learning_rate": 0.00025717721301302495,
"loss": 4.191695213317871,
"step": 815
},
{
"epoch": 0.6022140221402214,
"grad_norm": 0.8306787014007568,
"learning_rate": 0.0002563650815147728,
"loss": 4.182519912719727,
"step": 816
},
{
"epoch": 0.6029520295202953,
"grad_norm": 1.0368632078170776,
"learning_rate": 0.0002555534926621994,
"loss": 4.357141971588135,
"step": 817
},
{
"epoch": 0.603690036900369,
"grad_norm": 0.9401784539222717,
"learning_rate": 0.00025474245115873377,
"loss": 4.2874016761779785,
"step": 818
},
{
"epoch": 0.6044280442804428,
"grad_norm": 0.9086504578590393,
"learning_rate": 0.00025393196170463286,
"loss": 4.135937690734863,
"step": 819
},
{
"epoch": 0.6051660516605166,
"grad_norm": 0.8185088634490967,
"learning_rate": 0.00025312202899695403,
"loss": 4.31793212890625,
"step": 820
},
{
"epoch": 0.6059040590405904,
"grad_norm": 0.8340873718261719,
"learning_rate": 0.00025231265772952864,
"loss": 4.332757949829102,
"step": 821
},
{
"epoch": 0.6066420664206642,
"grad_norm": 0.9770723581314087,
"learning_rate": 0.00025150385259293346,
"loss": 4.115085124969482,
"step": 822
},
{
"epoch": 0.607380073800738,
"grad_norm": 1.0393363237380981,
"learning_rate": 0.0002506956182744653,
"loss": 4.164813995361328,
"step": 823
},
{
"epoch": 0.6081180811808118,
"grad_norm": 0.9465534090995789,
"learning_rate": 0.00024988795945811215,
"loss": 4.53727912902832,
"step": 824
},
{
"epoch": 0.6088560885608856,
"grad_norm": 0.8929158449172974,
"learning_rate": 0.00024908088082452724,
"loss": 4.265376091003418,
"step": 825
},
{
"epoch": 0.6095940959409594,
"grad_norm": 0.7848824262619019,
"learning_rate": 0.00024827438705100116,
"loss": 4.300992965698242,
"step": 826
},
{
"epoch": 0.6103321033210332,
"grad_norm": 0.7737518548965454,
"learning_rate": 0.00024746848281143524,
"loss": 4.297072410583496,
"step": 827
},
{
"epoch": 0.6110701107011071,
"grad_norm": 1.0166592597961426,
"learning_rate": 0.00024666317277631403,
"loss": 4.4208478927612305,
"step": 828
},
{
"epoch": 0.6118081180811809,
"grad_norm": 0.8515886664390564,
"learning_rate": 0.00024585846161267875,
"loss": 4.542513847351074,
"step": 829
},
{
"epoch": 0.6125461254612546,
"grad_norm": 0.8427137732505798,
"learning_rate": 0.00024505435398409966,
"loss": 4.270936965942383,
"step": 830
},
{
"epoch": 0.6132841328413284,
"grad_norm": 0.811477541923523,
"learning_rate": 0.0002442508545506495,
"loss": 4.223374366760254,
"step": 831
},
{
"epoch": 0.6140221402214022,
"grad_norm": 0.9186045527458191,
"learning_rate": 0.00024344796796887656,
"loss": 4.369760036468506,
"step": 832
},
{
"epoch": 0.614760147601476,
"grad_norm": 0.809533417224884,
"learning_rate": 0.0002426456988917769,
"loss": 4.350223541259766,
"step": 833
},
{
"epoch": 0.6154981549815498,
"grad_norm": 0.8991212248802185,
"learning_rate": 0.00024184405196876844,
"loss": 4.136372089385986,
"step": 834
},
{
"epoch": 0.6162361623616236,
"grad_norm": 0.8988363742828369,
"learning_rate": 0.00024104303184566307,
"loss": 4.202424049377441,
"step": 835
},
{
"epoch": 0.6169741697416974,
"grad_norm": 1.3087947368621826,
"learning_rate": 0.00024024264316464065,
"loss": 4.428619384765625,
"step": 836
},
{
"epoch": 0.6177121771217712,
"grad_norm": 0.7776771783828735,
"learning_rate": 0.0002394428905642211,
"loss": 4.351472854614258,
"step": 837
},
{
"epoch": 0.618450184501845,
"grad_norm": 0.996083676815033,
"learning_rate": 0.00023864377867923852,
"loss": 3.9067325592041016,
"step": 838
},
{
"epoch": 0.6191881918819189,
"grad_norm": 0.8904930949211121,
"learning_rate": 0.00023784531214081348,
"loss": 4.205554008483887,
"step": 839
},
{
"epoch": 0.6199261992619927,
"grad_norm": 0.9460301399230957,
"learning_rate": 0.00023704749557632688,
"loss": 4.3381452560424805,
"step": 840
},
{
"epoch": 0.6206642066420665,
"grad_norm": 0.8847654461860657,
"learning_rate": 0.00023625033360939239,
"loss": 4.210631370544434,
"step": 841
},
{
"epoch": 0.6214022140221402,
"grad_norm": 0.9049587249755859,
"learning_rate": 0.00023545383085983034,
"loss": 4.128975868225098,
"step": 842
},
{
"epoch": 0.622140221402214,
"grad_norm": 0.881879985332489,
"learning_rate": 0.00023465799194364087,
"loss": 4.109155654907227,
"step": 843
},
{
"epoch": 0.6228782287822878,
"grad_norm": 0.9331649541854858,
"learning_rate": 0.00023386282147297657,
"loss": 4.180877685546875,
"step": 844
},
{
"epoch": 0.6236162361623616,
"grad_norm": 1.0155686140060425,
"learning_rate": 0.00023306832405611643,
"loss": 4.2506818771362305,
"step": 845
},
{
"epoch": 0.6243542435424354,
"grad_norm": 0.9788922667503357,
"learning_rate": 0.00023227450429743867,
"loss": 4.536131858825684,
"step": 846
},
{
"epoch": 0.6250922509225092,
"grad_norm": 1.0663362741470337,
"learning_rate": 0.00023148136679739453,
"loss": 4.059211730957031,
"step": 847
},
{
"epoch": 0.625830258302583,
"grad_norm": 0.8880152702331543,
"learning_rate": 0.00023068891615248102,
"loss": 4.163819313049316,
"step": 848
},
{
"epoch": 0.6265682656826568,
"grad_norm": 0.9166035056114197,
"learning_rate": 0.0002298971569552149,
"loss": 4.22659158706665,
"step": 849
},
{
"epoch": 0.6273062730627307,
"grad_norm": 1.1947702169418335,
"learning_rate": 0.00022910609379410546,
"loss": 4.3044633865356445,
"step": 850
},
{
"epoch": 0.6280442804428045,
"grad_norm": 1.251198410987854,
"learning_rate": 0.0002283157312536284,
"loss": 4.213165283203125,
"step": 851
},
{
"epoch": 0.6287822878228783,
"grad_norm": 0.9441475868225098,
"learning_rate": 0.00022752607391419904,
"loss": 4.37963342666626,
"step": 852
},
{
"epoch": 0.629520295202952,
"grad_norm": 0.8944138884544373,
"learning_rate": 0.0002267371263521461,
"loss": 4.479311943054199,
"step": 853
},
{
"epoch": 0.6302583025830258,
"grad_norm": 0.9756674766540527,
"learning_rate": 0.00022594889313968424,
"loss": 4.323942184448242,
"step": 854
},
{
"epoch": 0.6309963099630996,
"grad_norm": 0.9520359039306641,
"learning_rate": 0.00022516137884488895,
"loss": 4.259498596191406,
"step": 855
},
{
"epoch": 0.6317343173431734,
"grad_norm": 0.8389827609062195,
"learning_rate": 0.000224374588031669,
"loss": 4.353797435760498,
"step": 856
},
{
"epoch": 0.6324723247232472,
"grad_norm": 0.9523439407348633,
"learning_rate": 0.0002235885252597402,
"loss": 4.485894203186035,
"step": 857
},
{
"epoch": 0.633210332103321,
"grad_norm": 0.8450521230697632,
"learning_rate": 0.00022280319508459953,
"loss": 4.3302717208862305,
"step": 858
},
{
"epoch": 0.6339483394833948,
"grad_norm": 0.9799603819847107,
"learning_rate": 0.00022201860205749792,
"loss": 4.216465950012207,
"step": 859
},
{
"epoch": 0.6346863468634686,
"grad_norm": 0.8215528726577759,
"learning_rate": 0.00022123475072541456,
"loss": 4.218143463134766,
"step": 860
},
{
"epoch": 0.6354243542435425,
"grad_norm": 0.8392944931983948,
"learning_rate": 0.00022045164563102993,
"loss": 4.393090724945068,
"step": 861
},
{
"epoch": 0.6361623616236163,
"grad_norm": 0.9801323413848877,
"learning_rate": 0.00021966929131270053,
"loss": 4.3347978591918945,
"step": 862
},
{
"epoch": 0.6369003690036901,
"grad_norm": 1.0346145629882812,
"learning_rate": 0.00021888769230443076,
"loss": 4.304266452789307,
"step": 863
},
{
"epoch": 0.6376383763837639,
"grad_norm": 0.8837590217590332,
"learning_rate": 0.00021810685313584894,
"loss": 4.318976879119873,
"step": 864
},
{
"epoch": 0.6383763837638377,
"grad_norm": 0.9550504088401794,
"learning_rate": 0.00021732677833217884,
"loss": 4.0572285652160645,
"step": 865
},
{
"epoch": 0.6391143911439114,
"grad_norm": 0.9023411273956299,
"learning_rate": 0.00021654747241421515,
"loss": 4.210879325866699,
"step": 866
},
{
"epoch": 0.6398523985239852,
"grad_norm": 1.2458837032318115,
"learning_rate": 0.00021576893989829648,
"loss": 4.031771183013916,
"step": 867
},
{
"epoch": 0.640590405904059,
"grad_norm": 0.931896448135376,
"learning_rate": 0.00021499118529627893,
"loss": 4.238314151763916,
"step": 868
},
{
"epoch": 0.6413284132841328,
"grad_norm": 0.8535945415496826,
"learning_rate": 0.00021421421311551095,
"loss": 4.30747652053833,
"step": 869
},
{
"epoch": 0.6420664206642066,
"grad_norm": 0.8937339186668396,
"learning_rate": 0.0002134380278588059,
"loss": 4.368441581726074,
"step": 870
},
{
"epoch": 0.6428044280442804,
"grad_norm": 0.9691210985183716,
"learning_rate": 0.00021266263402441746,
"loss": 4.286958694458008,
"step": 871
},
{
"epoch": 0.6435424354243543,
"grad_norm": 0.9562344551086426,
"learning_rate": 0.00021188803610601187,
"loss": 4.331124305725098,
"step": 872
},
{
"epoch": 0.6442804428044281,
"grad_norm": 0.9085299372673035,
"learning_rate": 0.00021111423859264362,
"loss": 4.204074859619141,
"step": 873
},
{
"epoch": 0.6450184501845019,
"grad_norm": 1.0217558145523071,
"learning_rate": 0.00021034124596872776,
"loss": 4.061552047729492,
"step": 874
},
{
"epoch": 0.6457564575645757,
"grad_norm": 0.8775967359542847,
"learning_rate": 0.00020956906271401554,
"loss": 4.252497673034668,
"step": 875
},
{
"epoch": 0.6464944649446495,
"grad_norm": 0.9603700637817383,
"learning_rate": 0.00020879769330356705,
"loss": 4.17333984375,
"step": 876
},
{
"epoch": 0.6472324723247233,
"grad_norm": 0.9519745707511902,
"learning_rate": 0.0002080271422077262,
"loss": 4.414155006408691,
"step": 877
},
{
"epoch": 0.647970479704797,
"grad_norm": 0.8470144271850586,
"learning_rate": 0.00020725741389209423,
"loss": 4.405782699584961,
"step": 878
},
{
"epoch": 0.6487084870848708,
"grad_norm": 0.872512698173523,
"learning_rate": 0.00020648851281750437,
"loss": 4.448093414306641,
"step": 879
},
{
"epoch": 0.6494464944649446,
"grad_norm": 1.0624064207077026,
"learning_rate": 0.00020572044343999566,
"loss": 4.4731950759887695,
"step": 880
},
{
"epoch": 0.6501845018450184,
"grad_norm": 0.9333707094192505,
"learning_rate": 0.00020495321021078686,
"loss": 4.351503849029541,
"step": 881
},
{
"epoch": 0.6509225092250922,
"grad_norm": 0.8607699275016785,
"learning_rate": 0.00020418681757625152,
"loss": 4.024420738220215,
"step": 882
},
{
"epoch": 0.6516605166051661,
"grad_norm": 0.8372026085853577,
"learning_rate": 0.00020342126997789113,
"loss": 4.254813194274902,
"step": 883
},
{
"epoch": 0.6523985239852399,
"grad_norm": 0.8102350234985352,
"learning_rate": 0.00020265657185231017,
"loss": 4.309717178344727,
"step": 884
},
{
"epoch": 0.6531365313653137,
"grad_norm": 0.8655620217323303,
"learning_rate": 0.0002018927276311902,
"loss": 4.270059108734131,
"step": 885
},
{
"epoch": 0.6538745387453875,
"grad_norm": 0.8550220727920532,
"learning_rate": 0.00020112974174126406,
"loss": 4.238635063171387,
"step": 886
},
{
"epoch": 0.6546125461254613,
"grad_norm": 0.8815758228302002,
"learning_rate": 0.00020036761860428999,
"loss": 4.169132232666016,
"step": 887
},
{
"epoch": 0.6553505535055351,
"grad_norm": 0.9161958694458008,
"learning_rate": 0.00019960636263702692,
"loss": 4.314050674438477,
"step": 888
},
{
"epoch": 0.6560885608856089,
"grad_norm": 1.0340604782104492,
"learning_rate": 0.00019884597825120762,
"loss": 3.9258623123168945,
"step": 889
},
{
"epoch": 0.6568265682656826,
"grad_norm": 0.896084725856781,
"learning_rate": 0.000198086469853514,
"loss": 4.141365051269531,
"step": 890
},
{
"epoch": 0.6575645756457564,
"grad_norm": 0.9871026277542114,
"learning_rate": 0.00019732784184555138,
"loss": 4.212796211242676,
"step": 891
},
{
"epoch": 0.6583025830258302,
"grad_norm": 1.0540019273757935,
"learning_rate": 0.00019657009862382286,
"loss": 4.061999797821045,
"step": 892
},
{
"epoch": 0.659040590405904,
"grad_norm": 0.8863611817359924,
"learning_rate": 0.00019581324457970407,
"loss": 4.253866195678711,
"step": 893
},
{
"epoch": 0.6597785977859778,
"grad_norm": 1.1371312141418457,
"learning_rate": 0.00019505728409941711,
"loss": 4.08126163482666,
"step": 894
},
{
"epoch": 0.6605166051660517,
"grad_norm": 2.1047496795654297,
"learning_rate": 0.00019430222156400606,
"loss": 4.196209907531738,
"step": 895
},
{
"epoch": 0.6612546125461255,
"grad_norm": 0.85357266664505,
"learning_rate": 0.00019354806134931087,
"loss": 4.412619590759277,
"step": 896
},
{
"epoch": 0.6619926199261993,
"grad_norm": 1.048453450202942,
"learning_rate": 0.00019279480782594244,
"loss": 4.392220497131348,
"step": 897
},
{
"epoch": 0.6627306273062731,
"grad_norm": 0.8711747527122498,
"learning_rate": 0.00019204246535925654,
"loss": 4.262413024902344,
"step": 898
},
{
"epoch": 0.6634686346863469,
"grad_norm": 0.7952659130096436,
"learning_rate": 0.00019129103830933008,
"loss": 4.36223840713501,
"step": 899
},
{
"epoch": 0.6642066420664207,
"grad_norm": 0.8127221465110779,
"learning_rate": 0.00019054053103093366,
"loss": 4.27398681640625,
"step": 900
},
{
"epoch": 0.6649446494464945,
"grad_norm": 0.8177223801612854,
"learning_rate": 0.0001897909478735083,
"loss": 3.997640609741211,
"step": 901
},
{
"epoch": 0.6656826568265682,
"grad_norm": 1.2305352687835693,
"learning_rate": 0.00018904229318113914,
"loss": 4.09181022644043,
"step": 902
},
{
"epoch": 0.666420664206642,
"grad_norm": 0.862445056438446,
"learning_rate": 0.00018829457129253057,
"loss": 4.322624206542969,
"step": 903
},
{
"epoch": 0.6671586715867158,
"grad_norm": 0.8462716937065125,
"learning_rate": 0.00018754778654098123,
"loss": 4.413826942443848,
"step": 904
},
{
"epoch": 0.6678966789667896,
"grad_norm": 0.8606178164482117,
"learning_rate": 0.00018680194325435839,
"loss": 4.309714317321777,
"step": 905
},
{
"epoch": 0.6686346863468635,
"grad_norm": 0.8559933304786682,
"learning_rate": 0.00018605704575507347,
"loss": 4.162710189819336,
"step": 906
},
{
"epoch": 0.6693726937269373,
"grad_norm": 0.9497646689414978,
"learning_rate": 0.00018531309836005675,
"loss": 4.144913673400879,
"step": 907
},
{
"epoch": 0.6701107011070111,
"grad_norm": 0.8656502962112427,
"learning_rate": 0.00018457010538073236,
"loss": 4.23277473449707,
"step": 908
},
{
"epoch": 0.6708487084870849,
"grad_norm": 0.9220851063728333,
"learning_rate": 0.00018382807112299283,
"loss": 4.004146099090576,
"step": 909
},
{
"epoch": 0.6715867158671587,
"grad_norm": 0.8644999265670776,
"learning_rate": 0.0001830869998871755,
"loss": 4.135645389556885,
"step": 910
},
{
"epoch": 0.6723247232472325,
"grad_norm": 0.9802985787391663,
"learning_rate": 0.0001823468959680356,
"loss": 4.413508892059326,
"step": 911
},
{
"epoch": 0.6730627306273063,
"grad_norm": 0.8389285802841187,
"learning_rate": 0.0001816077636547232,
"loss": 4.484038829803467,
"step": 912
},
{
"epoch": 0.67380073800738,
"grad_norm": 0.9547582864761353,
"learning_rate": 0.00018086960723075727,
"loss": 4.3295416831970215,
"step": 913
},
{
"epoch": 0.6745387453874538,
"grad_norm": 0.8170531392097473,
"learning_rate": 0.00018013243097400128,
"loss": 4.145027160644531,
"step": 914
},
{
"epoch": 0.6752767527675276,
"grad_norm": 0.8581196665763855,
"learning_rate": 0.00017939623915663833,
"loss": 4.246807098388672,
"step": 915
},
{
"epoch": 0.6760147601476014,
"grad_norm": 0.9968565702438354,
"learning_rate": 0.000178661036045146,
"loss": 4.355518817901611,
"step": 916
},
{
"epoch": 0.6767527675276753,
"grad_norm": 1.08475923538208,
"learning_rate": 0.00017792682590027278,
"loss": 4.216618061065674,
"step": 917
},
{
"epoch": 0.6774907749077491,
"grad_norm": 0.9199729561805725,
"learning_rate": 0.00017719361297701167,
"loss": 4.03296422958374,
"step": 918
},
{
"epoch": 0.6782287822878229,
"grad_norm": 0.9441756010055542,
"learning_rate": 0.00017646140152457717,
"loss": 4.32381010055542,
"step": 919
},
{
"epoch": 0.6789667896678967,
"grad_norm": 0.8643115162849426,
"learning_rate": 0.00017573019578637913,
"loss": 4.274387359619141,
"step": 920
},
{
"epoch": 0.6797047970479705,
"grad_norm": 0.8102643489837646,
"learning_rate": 0.00017500000000000008,
"loss": 4.232758522033691,
"step": 921
},
{
"epoch": 0.6804428044280443,
"grad_norm": 1.063491702079773,
"learning_rate": 0.0001742708183971684,
"loss": 4.3541483879089355,
"step": 922
},
{
"epoch": 0.6811808118081181,
"grad_norm": 0.7408610582351685,
"learning_rate": 0.00017354265520373567,
"loss": 4.151790618896484,
"step": 923
},
{
"epoch": 0.6819188191881919,
"grad_norm": 0.7934446930885315,
"learning_rate": 0.0001728155146396511,
"loss": 4.396363258361816,
"step": 924
},
{
"epoch": 0.6826568265682657,
"grad_norm": 0.954188883304596,
"learning_rate": 0.00017208940091893756,
"loss": 3.97440767288208,
"step": 925
},
{
"epoch": 0.6833948339483394,
"grad_norm": 1.0053012371063232,
"learning_rate": 0.00017136431824966715,
"loss": 4.055703163146973,
"step": 926
},
{
"epoch": 0.6841328413284132,
"grad_norm": 0.8948765397071838,
"learning_rate": 0.00017064027083393612,
"loss": 4.3566484451293945,
"step": 927
},
{
"epoch": 0.6848708487084871,
"grad_norm": 0.8520956039428711,
"learning_rate": 0.000169917262867842,
"loss": 3.928354263305664,
"step": 928
},
{
"epoch": 0.6856088560885609,
"grad_norm": 0.8816937804222107,
"learning_rate": 0.00016919529854145745,
"loss": 4.179725170135498,
"step": 929
},
{
"epoch": 0.6863468634686347,
"grad_norm": 1.3806109428405762,
"learning_rate": 0.00016847438203880735,
"loss": 4.185024738311768,
"step": 930
},
{
"epoch": 0.6870848708487085,
"grad_norm": 0.7780953049659729,
"learning_rate": 0.00016775451753784414,
"loss": 4.327208995819092,
"step": 931
},
{
"epoch": 0.6878228782287823,
"grad_norm": 1.103068470954895,
"learning_rate": 0.00016703570921042344,
"loss": 4.4666948318481445,
"step": 932
},
{
"epoch": 0.6885608856088561,
"grad_norm": 0.8747889995574951,
"learning_rate": 0.00016631796122227983,
"loss": 4.146649360656738,
"step": 933
},
{
"epoch": 0.6892988929889299,
"grad_norm": 0.9435983896255493,
"learning_rate": 0.00016560127773300313,
"loss": 4.554599761962891,
"step": 934
},
{
"epoch": 0.6900369003690037,
"grad_norm": 0.800839364528656,
"learning_rate": 0.00016488566289601388,
"loss": 4.008693218231201,
"step": 935
},
{
"epoch": 0.6907749077490775,
"grad_norm": 0.8912091851234436,
"learning_rate": 0.00016417112085853969,
"loss": 4.274938583374023,
"step": 936
},
{
"epoch": 0.6915129151291513,
"grad_norm": 0.9369155168533325,
"learning_rate": 0.00016345765576159042,
"loss": 4.299943447113037,
"step": 937
},
{
"epoch": 0.692250922509225,
"grad_norm": 0.7870283722877502,
"learning_rate": 0.000162745271739935,
"loss": 4.334400177001953,
"step": 938
},
{
"epoch": 0.6929889298892989,
"grad_norm": 0.8504934310913086,
"learning_rate": 0.00016203397292207758,
"loss": 4.140174865722656,
"step": 939
},
{
"epoch": 0.6937269372693727,
"grad_norm": 1.016496181488037,
"learning_rate": 0.00016132376343023233,
"loss": 4.296517848968506,
"step": 940
},
{
"epoch": 0.6944649446494465,
"grad_norm": 1.214504599571228,
"learning_rate": 0.00016061464738030106,
"loss": 4.107439041137695,
"step": 941
},
{
"epoch": 0.6952029520295203,
"grad_norm": 0.9972517490386963,
"learning_rate": 0.0001599066288818485,
"loss": 4.203211307525635,
"step": 942
},
{
"epoch": 0.6959409594095941,
"grad_norm": 0.8465280532836914,
"learning_rate": 0.0001591997120380788,
"loss": 4.145995616912842,
"step": 943
},
{
"epoch": 0.6966789667896679,
"grad_norm": 0.9349222779273987,
"learning_rate": 0.00015849390094581142,
"loss": 4.01326847076416,
"step": 944
},
{
"epoch": 0.6974169741697417,
"grad_norm": 1.018925666809082,
"learning_rate": 0.0001577891996954578,
"loss": 4.1635026931762695,
"step": 945
},
{
"epoch": 0.6981549815498155,
"grad_norm": 0.7925598621368408,
"learning_rate": 0.0001570856123709975,
"loss": 4.1779022216796875,
"step": 946
},
{
"epoch": 0.6988929889298893,
"grad_norm": 0.932461142539978,
"learning_rate": 0.00015638314304995454,
"loss": 4.21356201171875,
"step": 947
},
{
"epoch": 0.6996309963099631,
"grad_norm": 0.9300697445869446,
"learning_rate": 0.00015568179580337333,
"loss": 4.0165696144104,
"step": 948
},
{
"epoch": 0.7003690036900369,
"grad_norm": 0.8354659676551819,
"learning_rate": 0.0001549815746957962,
"loss": 4.235401630401611,
"step": 949
},
{
"epoch": 0.7011070110701108,
"grad_norm": 0.926152765750885,
"learning_rate": 0.00015428248378523865,
"loss": 4.415463447570801,
"step": 950
},
{
"epoch": 0.7018450184501845,
"grad_norm": 1.0506153106689453,
"learning_rate": 0.0001535845271231662,
"loss": 4.269872665405273,
"step": 951
},
{
"epoch": 0.7025830258302583,
"grad_norm": 0.8655766248703003,
"learning_rate": 0.00015288770875447128,
"loss": 4.350858688354492,
"step": 952
},
{
"epoch": 0.7033210332103321,
"grad_norm": 0.7818317413330078,
"learning_rate": 0.00015219203271744954,
"loss": 4.015618801116943,
"step": 953
},
{
"epoch": 0.7040590405904059,
"grad_norm": 0.8752409815788269,
"learning_rate": 0.00015149750304377645,
"loss": 4.2518510818481445,
"step": 954
},
{
"epoch": 0.7047970479704797,
"grad_norm": 1.2109910249710083,
"learning_rate": 0.00015080412375848357,
"loss": 4.2393035888671875,
"step": 955
},
{
"epoch": 0.7055350553505535,
"grad_norm": 0.925682544708252,
"learning_rate": 0.00015011189887993598,
"loss": 4.126298904418945,
"step": 956
},
{
"epoch": 0.7062730627306273,
"grad_norm": 0.9237503409385681,
"learning_rate": 0.00014942083241980837,
"loss": 3.981215476989746,
"step": 957
},
{
"epoch": 0.7070110701107011,
"grad_norm": 0.9711774587631226,
"learning_rate": 0.0001487309283830623,
"loss": 4.350025177001953,
"step": 958
},
{
"epoch": 0.7077490774907749,
"grad_norm": 0.9577892422676086,
"learning_rate": 0.00014804219076792202,
"loss": 4.178315162658691,
"step": 959
},
{
"epoch": 0.7084870848708487,
"grad_norm": 0.9610137343406677,
"learning_rate": 0.00014735462356585302,
"loss": 4.048961639404297,
"step": 960
},
{
"epoch": 0.7092250922509226,
"grad_norm": 0.8772600889205933,
"learning_rate": 0.0001466682307615368,
"loss": 4.249874114990234,
"step": 961
},
{
"epoch": 0.7099630996309964,
"grad_norm": 0.8270952105522156,
"learning_rate": 0.00014598301633284952,
"loss": 4.296774387359619,
"step": 962
},
{
"epoch": 0.7107011070110701,
"grad_norm": 0.8505011796951294,
"learning_rate": 0.00014529898425083793,
"loss": 4.15446662902832,
"step": 963
},
{
"epoch": 0.7114391143911439,
"grad_norm": 0.7727055549621582,
"learning_rate": 0.00014461613847969687,
"loss": 4.255171298980713,
"step": 964
},
{
"epoch": 0.7121771217712177,
"grad_norm": 1.0215280055999756,
"learning_rate": 0.00014393448297674613,
"loss": 4.0843987464904785,
"step": 965
},
{
"epoch": 0.7129151291512915,
"grad_norm": 0.9580904841423035,
"learning_rate": 0.00014325402169240717,
"loss": 4.22476863861084,
"step": 966
},
{
"epoch": 0.7136531365313653,
"grad_norm": 0.8007642030715942,
"learning_rate": 0.0001425747585701809,
"loss": 4.2899861335754395,
"step": 967
},
{
"epoch": 0.7143911439114391,
"grad_norm": 1.048153042793274,
"learning_rate": 0.00014189669754662433,
"loss": 4.137915134429932,
"step": 968
},
{
"epoch": 0.7151291512915129,
"grad_norm": 0.9277073740959167,
"learning_rate": 0.00014121984255132812,
"loss": 4.19291877746582,
"step": 969
},
{
"epoch": 0.7158671586715867,
"grad_norm": 0.9412124752998352,
"learning_rate": 0.00014054419750689302,
"loss": 4.134371757507324,
"step": 970
},
{
"epoch": 0.7166051660516605,
"grad_norm": 0.9520360827445984,
"learning_rate": 0.0001398697663289086,
"loss": 3.9994983673095703,
"step": 971
},
{
"epoch": 0.7173431734317344,
"grad_norm": 1.108952522277832,
"learning_rate": 0.00013919655292592885,
"loss": 4.142839431762695,
"step": 972
},
{
"epoch": 0.7180811808118082,
"grad_norm": 0.882947564125061,
"learning_rate": 0.0001385245611994507,
"loss": 4.331300258636475,
"step": 973
},
{
"epoch": 0.718819188191882,
"grad_norm": 0.9011394381523132,
"learning_rate": 0.00013785379504389108,
"loss": 4.304719924926758,
"step": 974
},
{
"epoch": 0.7195571955719557,
"grad_norm": 0.9489427208900452,
"learning_rate": 0.00013718425834656427,
"loss": 3.873215675354004,
"step": 975
},
{
"epoch": 0.7202952029520295,
"grad_norm": 0.8889840841293335,
"learning_rate": 0.00013651595498765954,
"loss": 4.21721076965332,
"step": 976
},
{
"epoch": 0.7210332103321033,
"grad_norm": 0.8962631821632385,
"learning_rate": 0.0001358488888402181,
"loss": 4.268343925476074,
"step": 977
},
{
"epoch": 0.7217712177121771,
"grad_norm": 1.0096079111099243,
"learning_rate": 0.0001351830637701119,
"loss": 4.305258750915527,
"step": 978
},
{
"epoch": 0.7225092250922509,
"grad_norm": 0.8910917043685913,
"learning_rate": 0.0001345184836360196,
"loss": 4.095419883728027,
"step": 979
},
{
"epoch": 0.7232472324723247,
"grad_norm": 0.8660383224487305,
"learning_rate": 0.00013385515228940572,
"loss": 4.2480149269104,
"step": 980
},
{
"epoch": 0.7239852398523985,
"grad_norm": 0.7730628252029419,
"learning_rate": 0.00013319307357449696,
"loss": 4.004230499267578,
"step": 981
},
{
"epoch": 0.7247232472324723,
"grad_norm": 0.9015150666236877,
"learning_rate": 0.00013253225132826138,
"loss": 4.344229698181152,
"step": 982
},
{
"epoch": 0.7254612546125462,
"grad_norm": 0.8757840991020203,
"learning_rate": 0.0001318726893803847,
"loss": 4.284424781799316,
"step": 983
},
{
"epoch": 0.72619926199262,
"grad_norm": 0.8267972469329834,
"learning_rate": 0.00013121439155324918,
"loss": 3.9191102981567383,
"step": 984
},
{
"epoch": 0.7269372693726938,
"grad_norm": 0.998901903629303,
"learning_rate": 0.00013055736166191095,
"loss": 4.020920276641846,
"step": 985
},
{
"epoch": 0.7276752767527676,
"grad_norm": 0.9288577437400818,
"learning_rate": 0.00012990160351407804,
"loss": 4.161448001861572,
"step": 986
},
{
"epoch": 0.7284132841328413,
"grad_norm": 0.8598924279212952,
"learning_rate": 0.00012924712091008842,
"loss": 4.157841205596924,
"step": 987
},
{
"epoch": 0.7291512915129151,
"grad_norm": 0.8927615880966187,
"learning_rate": 0.0001285939176428874,
"loss": 4.054559230804443,
"step": 988
},
{
"epoch": 0.7298892988929889,
"grad_norm": 0.8624060750007629,
"learning_rate": 0.00012794199749800698,
"loss": 4.096704006195068,
"step": 989
},
{
"epoch": 0.7306273062730627,
"grad_norm": 0.9361541271209717,
"learning_rate": 0.00012729136425354204,
"loss": 4.233707427978516,
"step": 990
},
{
"epoch": 0.7313653136531365,
"grad_norm": 0.9343904256820679,
"learning_rate": 0.00012664202168013005,
"loss": 3.9704904556274414,
"step": 991
},
{
"epoch": 0.7321033210332103,
"grad_norm": 0.9579162001609802,
"learning_rate": 0.0001259939735409285,
"loss": 4.047106742858887,
"step": 992
},
{
"epoch": 0.7328413284132841,
"grad_norm": 0.9848127365112305,
"learning_rate": 0.0001253472235915933,
"loss": 4.055668830871582,
"step": 993
},
{
"epoch": 0.7335793357933579,
"grad_norm": 0.8801389932632446,
"learning_rate": 0.00012470177558025652,
"loss": 4.0792717933654785,
"step": 994
},
{
"epoch": 0.7343173431734318,
"grad_norm": 1.0689746141433716,
"learning_rate": 0.0001240576332475054,
"loss": 4.3891496658325195,
"step": 995
},
{
"epoch": 0.7350553505535056,
"grad_norm": 0.9340549111366272,
"learning_rate": 0.00012341480032636035,
"loss": 3.9269206523895264,
"step": 996
},
{
"epoch": 0.7357933579335794,
"grad_norm": 1.6336300373077393,
"learning_rate": 0.0001227732805422531,
"loss": 4.390302658081055,
"step": 997
},
{
"epoch": 0.7365313653136532,
"grad_norm": 1.0127570629119873,
"learning_rate": 0.00012213307761300567,
"loss": 4.110518455505371,
"step": 998
},
{
"epoch": 0.7372693726937269,
"grad_norm": 0.9814800024032593,
"learning_rate": 0.00012149419524880778,
"loss": 4.395967960357666,
"step": 999
},
{
"epoch": 0.7380073800738007,
"grad_norm": 0.8709611892700195,
"learning_rate": 0.00012085663715219694,
"loss": 4.2395758628845215,
"step": 1000
},
{
"epoch": 0.7387453874538745,
"grad_norm": 0.810457706451416,
"learning_rate": 0.00012022040701803532,
"loss": 4.192559242248535,
"step": 1001
},
{
"epoch": 0.7394833948339483,
"grad_norm": 0.8743382692337036,
"learning_rate": 0.00011958550853348949,
"loss": 4.053243637084961,
"step": 1002
},
{
"epoch": 0.7402214022140221,
"grad_norm": 1.0082160234451294,
"learning_rate": 0.0001189519453780086,
"loss": 4.024561405181885,
"step": 1003
},
{
"epoch": 0.7409594095940959,
"grad_norm": 0.9494944214820862,
"learning_rate": 0.00011831972122330317,
"loss": 4.133411407470703,
"step": 1004
},
{
"epoch": 0.7416974169741697,
"grad_norm": 0.729489266872406,
"learning_rate": 0.00011768883973332351,
"loss": 4.2208356857299805,
"step": 1005
},
{
"epoch": 0.7424354243542436,
"grad_norm": 0.8364970684051514,
"learning_rate": 0.000117059304564239,
"loss": 4.144787788391113,
"step": 1006
},
{
"epoch": 0.7431734317343174,
"grad_norm": 1.0048389434814453,
"learning_rate": 0.00011643111936441654,
"loss": 4.1646552085876465,
"step": 1007
},
{
"epoch": 0.7439114391143912,
"grad_norm": 0.8014469742774963,
"learning_rate": 0.00011580428777439973,
"loss": 4.183121681213379,
"step": 1008
},
{
"epoch": 0.744649446494465,
"grad_norm": 1.1298073530197144,
"learning_rate": 0.00011517881342688705,
"loss": 4.2498016357421875,
"step": 1009
},
{
"epoch": 0.7453874538745388,
"grad_norm": 0.9686313271522522,
"learning_rate": 0.00011455469994671158,
"loss": 4.157444000244141,
"step": 1010
},
{
"epoch": 0.7461254612546125,
"grad_norm": 0.7569875121116638,
"learning_rate": 0.00011393195095082015,
"loss": 4.179769515991211,
"step": 1011
},
{
"epoch": 0.7468634686346863,
"grad_norm": 0.9126372933387756,
"learning_rate": 0.00011331057004825114,
"loss": 4.2508544921875,
"step": 1012
},
{
"epoch": 0.7476014760147601,
"grad_norm": 0.9252088069915771,
"learning_rate": 0.00011269056084011492,
"loss": 4.427289009094238,
"step": 1013
},
{
"epoch": 0.7483394833948339,
"grad_norm": 0.8704126477241516,
"learning_rate": 0.00011207192691957224,
"loss": 4.120467185974121,
"step": 1014
},
{
"epoch": 0.7490774907749077,
"grad_norm": 0.7337223291397095,
"learning_rate": 0.00011145467187181378,
"loss": 4.24467658996582,
"step": 1015
},
{
"epoch": 0.7498154981549815,
"grad_norm": 1.0976858139038086,
"learning_rate": 0.0001108387992740388,
"loss": 4.356447696685791,
"step": 1016
},
{
"epoch": 0.7505535055350554,
"grad_norm": 0.9983663558959961,
"learning_rate": 0.00011022431269543517,
"loss": 4.160353660583496,
"step": 1017
},
{
"epoch": 0.7512915129151292,
"grad_norm": 1.2688814401626587,
"learning_rate": 0.00010961121569715825,
"loss": 4.209506988525391,
"step": 1018
},
{
"epoch": 0.752029520295203,
"grad_norm": 0.8176230788230896,
"learning_rate": 0.00010899951183231028,
"loss": 4.172100067138672,
"step": 1019
},
{
"epoch": 0.7527675276752768,
"grad_norm": 0.8766177892684937,
"learning_rate": 0.00010838920464591952,
"loss": 4.03950834274292,
"step": 1020
},
{
"epoch": 0.7535055350553506,
"grad_norm": 0.8611599802970886,
"learning_rate": 0.00010778029767492066,
"loss": 4.484358787536621,
"step": 1021
},
{
"epoch": 0.7542435424354244,
"grad_norm": 0.8686861395835876,
"learning_rate": 0.00010717279444813325,
"loss": 4.179934501647949,
"step": 1022
},
{
"epoch": 0.7549815498154981,
"grad_norm": 0.8060294985771179,
"learning_rate": 0.00010656669848624154,
"loss": 4.116765975952148,
"step": 1023
},
{
"epoch": 0.7557195571955719,
"grad_norm": 0.9301735162734985,
"learning_rate": 0.0001059620133017745,
"loss": 3.9561753273010254,
"step": 1024
},
{
"epoch": 0.7564575645756457,
"grad_norm": 0.8732739686965942,
"learning_rate": 0.00010535874239908514,
"loss": 4.087579250335693,
"step": 1025
},
{
"epoch": 0.7571955719557195,
"grad_norm": 0.8588765859603882,
"learning_rate": 0.00010475688927433018,
"loss": 4.3742876052856445,
"step": 1026
},
{
"epoch": 0.7579335793357933,
"grad_norm": 0.7727426290512085,
"learning_rate": 0.0001041564574154497,
"loss": 4.053977012634277,
"step": 1027
},
{
"epoch": 0.7586715867158672,
"grad_norm": 0.7998579144477844,
"learning_rate": 0.00010355745030214725,
"loss": 4.124699592590332,
"step": 1028
},
{
"epoch": 0.759409594095941,
"grad_norm": 0.8935081362724304,
"learning_rate": 0.00010295987140586949,
"loss": 4.136198997497559,
"step": 1029
},
{
"epoch": 0.7601476014760148,
"grad_norm": 0.8178191184997559,
"learning_rate": 0.00010236372418978614,
"loss": 4.050296783447266,
"step": 1030
},
{
"epoch": 0.7608856088560886,
"grad_norm": 0.8848076462745667,
"learning_rate": 0.00010176901210876947,
"loss": 3.9550304412841797,
"step": 1031
},
{
"epoch": 0.7616236162361624,
"grad_norm": 0.9924454689025879,
"learning_rate": 0.00010117573860937533,
"loss": 4.258056640625,
"step": 1032
},
{
"epoch": 0.7623616236162362,
"grad_norm": 1.1687978506088257,
"learning_rate": 0.00010058390712982184,
"loss": 4.050140380859375,
"step": 1033
},
{
"epoch": 0.76309963099631,
"grad_norm": 0.8403159379959106,
"learning_rate": 9.999352109997051e-05,
"loss": 4.150047302246094,
"step": 1034
},
{
"epoch": 0.7638376383763837,
"grad_norm": 0.92585289478302,
"learning_rate": 9.940458394130595e-05,
"loss": 3.9567012786865234,
"step": 1035
},
{
"epoch": 0.7645756457564575,
"grad_norm": 0.9140007495880127,
"learning_rate": 9.881709906691602e-05,
"loss": 4.100074291229248,
"step": 1036
},
{
"epoch": 0.7653136531365313,
"grad_norm": 0.9550725817680359,
"learning_rate": 9.823106988147217e-05,
"loss": 4.270690441131592,
"step": 1037
},
{
"epoch": 0.7660516605166051,
"grad_norm": 1.0491443872451782,
"learning_rate": 9.764649978120944e-05,
"loss": 4.158552169799805,
"step": 1038
},
{
"epoch": 0.766789667896679,
"grad_norm": 0.827170729637146,
"learning_rate": 9.706339215390715e-05,
"loss": 4.432864189147949,
"step": 1039
},
{
"epoch": 0.7675276752767528,
"grad_norm": 1.0315954685211182,
"learning_rate": 9.64817503788692e-05,
"loss": 4.234312534332275,
"step": 1040
},
{
"epoch": 0.7682656826568266,
"grad_norm": 0.9796032905578613,
"learning_rate": 9.590157782690429e-05,
"loss": 3.9558591842651367,
"step": 1041
},
{
"epoch": 0.7690036900369004,
"grad_norm": 1.082369327545166,
"learning_rate": 9.532287786030617e-05,
"loss": 4.016860485076904,
"step": 1042
},
{
"epoch": 0.7697416974169742,
"grad_norm": 0.9409294724464417,
"learning_rate": 9.474565383283518e-05,
"loss": 4.121254920959473,
"step": 1043
},
{
"epoch": 0.770479704797048,
"grad_norm": 0.9006357192993164,
"learning_rate": 9.416990908969736e-05,
"loss": 4.089673042297363,
"step": 1044
},
{
"epoch": 0.7712177121771218,
"grad_norm": 1.0219764709472656,
"learning_rate": 9.359564696752622e-05,
"loss": 3.96942138671875,
"step": 1045
},
{
"epoch": 0.7719557195571956,
"grad_norm": 0.9810526967048645,
"learning_rate": 9.302287079436289e-05,
"loss": 3.9760637283325195,
"step": 1046
},
{
"epoch": 0.7726937269372693,
"grad_norm": 0.9141161441802979,
"learning_rate": 9.245158388963689e-05,
"loss": 4.305903434753418,
"step": 1047
},
{
"epoch": 0.7734317343173431,
"grad_norm": 2.4086737632751465,
"learning_rate": 9.188178956414705e-05,
"loss": 4.438955307006836,
"step": 1048
},
{
"epoch": 0.7741697416974169,
"grad_norm": 0.9075452089309692,
"learning_rate": 9.131349112004189e-05,
"loss": 4.143951416015625,
"step": 1049
},
{
"epoch": 0.7749077490774908,
"grad_norm": 0.8627065420150757,
"learning_rate": 9.074669185080134e-05,
"loss": 4.145493984222412,
"step": 1050
},
{
"epoch": 0.7756457564575646,
"grad_norm": 0.9488852620124817,
"learning_rate": 9.018139504121653e-05,
"loss": 4.0962677001953125,
"step": 1051
},
{
"epoch": 0.7763837638376384,
"grad_norm": 0.8987521529197693,
"learning_rate": 8.96176039673717e-05,
"loss": 4.037075996398926,
"step": 1052
},
{
"epoch": 0.7771217712177122,
"grad_norm": 1.5608737468719482,
"learning_rate": 8.905532189662476e-05,
"loss": 4.093520164489746,
"step": 1053
},
{
"epoch": 0.777859778597786,
"grad_norm": 1.0437077283859253,
"learning_rate": 8.849455208758849e-05,
"loss": 4.453344821929932,
"step": 1054
},
{
"epoch": 0.7785977859778598,
"grad_norm": 0.9098041653633118,
"learning_rate": 8.793529779011133e-05,
"loss": 3.896477699279785,
"step": 1055
},
{
"epoch": 0.7793357933579336,
"grad_norm": 0.7569055557250977,
"learning_rate": 8.737756224525918e-05,
"loss": 4.115358352661133,
"step": 1056
},
{
"epoch": 0.7800738007380074,
"grad_norm": 1.0293289422988892,
"learning_rate": 8.68213486852961e-05,
"loss": 4.121119976043701,
"step": 1057
},
{
"epoch": 0.7808118081180812,
"grad_norm": 0.8127309083938599,
"learning_rate": 8.626666033366578e-05,
"loss": 4.106558799743652,
"step": 1058
},
{
"epoch": 0.7815498154981549,
"grad_norm": 0.9031323790550232,
"learning_rate": 8.57135004049728e-05,
"loss": 3.9452483654022217,
"step": 1059
},
{
"epoch": 0.7822878228782287,
"grad_norm": 1.2205437421798706,
"learning_rate": 8.516187210496385e-05,
"loss": 3.8204894065856934,
"step": 1060
},
{
"epoch": 0.7830258302583026,
"grad_norm": 0.907437801361084,
"learning_rate": 8.461177863050975e-05,
"loss": 4.430585861206055,
"step": 1061
},
{
"epoch": 0.7837638376383764,
"grad_norm": 0.8979167342185974,
"learning_rate": 8.406322316958601e-05,
"loss": 4.146002292633057,
"step": 1062
},
{
"epoch": 0.7845018450184502,
"grad_norm": 0.9116492867469788,
"learning_rate": 8.351620890125513e-05,
"loss": 4.052881240844727,
"step": 1063
},
{
"epoch": 0.785239852398524,
"grad_norm": 1.1355615854263306,
"learning_rate": 8.297073899564777e-05,
"loss": 4.160739898681641,
"step": 1064
},
{
"epoch": 0.7859778597785978,
"grad_norm": 0.8989261984825134,
"learning_rate": 8.242681661394466e-05,
"loss": 3.9555885791778564,
"step": 1065
},
{
"epoch": 0.7867158671586716,
"grad_norm": 1.0729719400405884,
"learning_rate": 8.188444490835773e-05,
"loss": 4.048243999481201,
"step": 1066
},
{
"epoch": 0.7874538745387454,
"grad_norm": 0.9004929661750793,
"learning_rate": 8.134362702211263e-05,
"loss": 4.261412143707275,
"step": 1067
},
{
"epoch": 0.7881918819188192,
"grad_norm": 0.7723477482795715,
"learning_rate": 8.080436608942988e-05,
"loss": 3.9241394996643066,
"step": 1068
},
{
"epoch": 0.788929889298893,
"grad_norm": 0.833265483379364,
"learning_rate": 8.026666523550708e-05,
"loss": 4.336735248565674,
"step": 1069
},
{
"epoch": 0.7896678966789668,
"grad_norm": 0.9609919190406799,
"learning_rate": 7.973052757650058e-05,
"loss": 3.9808225631713867,
"step": 1070
},
{
"epoch": 0.7904059040590405,
"grad_norm": 1.0244325399398804,
"learning_rate": 7.919595621950728e-05,
"loss": 4.093958854675293,
"step": 1071
},
{
"epoch": 0.7911439114391144,
"grad_norm": 0.7694634199142456,
"learning_rate": 7.866295426254735e-05,
"loss": 3.9361343383789062,
"step": 1072
},
{
"epoch": 0.7918819188191882,
"grad_norm": 0.8412328958511353,
"learning_rate": 7.813152479454516e-05,
"loss": 4.3025431632995605,
"step": 1073
},
{
"epoch": 0.792619926199262,
"grad_norm": 0.9007997512817383,
"learning_rate": 7.760167089531244e-05,
"loss": 4.1600799560546875,
"step": 1074
},
{
"epoch": 0.7933579335793358,
"grad_norm": 0.8552005887031555,
"learning_rate": 7.707339563552973e-05,
"loss": 3.9373395442962646,
"step": 1075
},
{
"epoch": 0.7940959409594096,
"grad_norm": 0.9131635427474976,
"learning_rate": 7.654670207672905e-05,
"loss": 4.242855072021484,
"step": 1076
},
{
"epoch": 0.7948339483394834,
"grad_norm": 0.8459916710853577,
"learning_rate": 7.602159327127555e-05,
"loss": 4.222464084625244,
"step": 1077
},
{
"epoch": 0.7955719557195572,
"grad_norm": 0.9173424243927002,
"learning_rate": 7.549807226235051e-05,
"loss": 4.072568416595459,
"step": 1078
},
{
"epoch": 0.796309963099631,
"grad_norm": 0.9082213640213013,
"learning_rate": 7.497614208393341e-05,
"loss": 3.9589667320251465,
"step": 1079
},
{
"epoch": 0.7970479704797048,
"grad_norm": 0.8568102717399597,
"learning_rate": 7.44558057607843e-05,
"loss": 4.217202663421631,
"step": 1080
},
{
"epoch": 0.7977859778597786,
"grad_norm": 0.9027300477027893,
"learning_rate": 7.393706630842592e-05,
"loss": 4.339812278747559,
"step": 1081
},
{
"epoch": 0.7985239852398524,
"grad_norm": 0.8236647844314575,
"learning_rate": 7.341992673312733e-05,
"loss": 3.9794492721557617,
"step": 1082
},
{
"epoch": 0.7992619926199263,
"grad_norm": 1.059795618057251,
"learning_rate": 7.290439003188531e-05,
"loss": 4.107804298400879,
"step": 1083
},
{
"epoch": 0.8,
"grad_norm": 1.0308328866958618,
"learning_rate": 7.239045919240731e-05,
"loss": 4.0905232429504395,
"step": 1084
},
{
"epoch": 0.8007380073800738,
"grad_norm": 0.9931803345680237,
"learning_rate": 7.187813719309466e-05,
"loss": 3.8613810539245605,
"step": 1085
},
{
"epoch": 0.8014760147601476,
"grad_norm": 0.9674167037010193,
"learning_rate": 7.136742700302469e-05,
"loss": 4.229313850402832,
"step": 1086
},
{
"epoch": 0.8022140221402214,
"grad_norm": 1.1589391231536865,
"learning_rate": 7.085833158193391e-05,
"loss": 4.372422695159912,
"step": 1087
},
{
"epoch": 0.8029520295202952,
"grad_norm": 1.1077316999435425,
"learning_rate": 7.035085388020041e-05,
"loss": 4.1049089431762695,
"step": 1088
},
{
"epoch": 0.803690036900369,
"grad_norm": 0.9430045485496521,
"learning_rate": 6.984499683882739e-05,
"loss": 4.282869338989258,
"step": 1089
},
{
"epoch": 0.8044280442804428,
"grad_norm": 1.254410982131958,
"learning_rate": 6.934076338942564e-05,
"loss": 3.9536659717559814,
"step": 1090
},
{
"epoch": 0.8051660516605166,
"grad_norm": 0.8754069209098816,
"learning_rate": 6.883815645419675e-05,
"loss": 4.139862060546875,
"step": 1091
},
{
"epoch": 0.8059040590405904,
"grad_norm": 0.9515761733055115,
"learning_rate": 6.833717894591579e-05,
"loss": 4.331487655639648,
"step": 1092
},
{
"epoch": 0.8066420664206642,
"grad_norm": 1.1361658573150635,
"learning_rate": 6.783783376791533e-05,
"loss": 4.143629550933838,
"step": 1093
},
{
"epoch": 0.8073800738007381,
"grad_norm": 0.8871273398399353,
"learning_rate": 6.734012381406767e-05,
"loss": 4.211644172668457,
"step": 1094
},
{
"epoch": 0.8081180811808119,
"grad_norm": 0.8796087503433228,
"learning_rate": 6.684405196876843e-05,
"loss": 4.109099864959717,
"step": 1095
},
{
"epoch": 0.8088560885608856,
"grad_norm": 1.0282338857650757,
"learning_rate": 6.634962110691991e-05,
"loss": 3.9217135906219482,
"step": 1096
},
{
"epoch": 0.8095940959409594,
"grad_norm": 0.8852423429489136,
"learning_rate": 6.585683409391441e-05,
"loss": 3.826831579208374,
"step": 1097
},
{
"epoch": 0.8103321033210332,
"grad_norm": 1.1207947731018066,
"learning_rate": 6.536569378561766e-05,
"loss": 4.236572265625,
"step": 1098
},
{
"epoch": 0.811070110701107,
"grad_norm": 0.7631810307502747,
"learning_rate": 6.487620302835181e-05,
"loss": 4.135857582092285,
"step": 1099
},
{
"epoch": 0.8118081180811808,
"grad_norm": 1.0373399257659912,
"learning_rate": 6.438836465887968e-05,
"loss": 3.926546096801758,
"step": 1100
},
{
"epoch": 0.8125461254612546,
"grad_norm": 0.8193474411964417,
"learning_rate": 6.390218150438787e-05,
"loss": 4.056336402893066,
"step": 1101
},
{
"epoch": 0.8132841328413284,
"grad_norm": 0.8076398968696594,
"learning_rate": 6.341765638247046e-05,
"loss": 4.038424968719482,
"step": 1102
},
{
"epoch": 0.8140221402214022,
"grad_norm": 0.9038758873939514,
"learning_rate": 6.29347921011124e-05,
"loss": 4.076757431030273,
"step": 1103
},
{
"epoch": 0.814760147601476,
"grad_norm": 1.0241302251815796,
"learning_rate": 6.245359145867404e-05,
"loss": 4.188800811767578,
"step": 1104
},
{
"epoch": 0.8154981549815498,
"grad_norm": 0.8670378923416138,
"learning_rate": 6.197405724387391e-05,
"loss": 3.7736902236938477,
"step": 1105
},
{
"epoch": 0.8162361623616237,
"grad_norm": 0.8043569922447205,
"learning_rate": 6.149619223577322e-05,
"loss": 4.0094099044799805,
"step": 1106
},
{
"epoch": 0.8169741697416975,
"grad_norm": 1.0722813606262207,
"learning_rate": 6.101999920375964e-05,
"loss": 4.505285263061523,
"step": 1107
},
{
"epoch": 0.8177121771217712,
"grad_norm": 0.8136195540428162,
"learning_rate": 6.054548090753103e-05,
"loss": 3.993842840194702,
"step": 1108
},
{
"epoch": 0.818450184501845,
"grad_norm": 0.8687028288841248,
"learning_rate": 6.0072640097079836e-05,
"loss": 4.127281188964844,
"step": 1109
},
{
"epoch": 0.8191881918819188,
"grad_norm": 0.879191517829895,
"learning_rate": 5.960147951267643e-05,
"loss": 4.027138710021973,
"step": 1110
},
{
"epoch": 0.8199261992619926,
"grad_norm": 0.8649862408638,
"learning_rate": 5.913200188485442e-05,
"loss": 4.080497741699219,
"step": 1111
},
{
"epoch": 0.8206642066420664,
"grad_norm": 0.9337714314460754,
"learning_rate": 5.866420993439344e-05,
"loss": 4.245942115783691,
"step": 1112
},
{
"epoch": 0.8214022140221402,
"grad_norm": 0.8696949481964111,
"learning_rate": 5.81981063723045e-05,
"loss": 4.227627754211426,
"step": 1113
},
{
"epoch": 0.822140221402214,
"grad_norm": 0.9521300792694092,
"learning_rate": 5.773369389981347e-05,
"loss": 4.130904197692871,
"step": 1114
},
{
"epoch": 0.8228782287822878,
"grad_norm": 1.0789848566055298,
"learning_rate": 5.7270975208346306e-05,
"loss": 4.207403182983398,
"step": 1115
},
{
"epoch": 0.8236162361623616,
"grad_norm": 0.8551551103591919,
"learning_rate": 5.680995297951237e-05,
"loss": 4.2299041748046875,
"step": 1116
},
{
"epoch": 0.8243542435424355,
"grad_norm": 0.790813684463501,
"learning_rate": 5.635062988508984e-05,
"loss": 4.201531410217285,
"step": 1117
},
{
"epoch": 0.8250922509225093,
"grad_norm": 0.7844054698944092,
"learning_rate": 5.5893008587009665e-05,
"loss": 3.9883697032928467,
"step": 1118
},
{
"epoch": 0.825830258302583,
"grad_norm": 0.8120241165161133,
"learning_rate": 5.543709173734044e-05,
"loss": 3.9854788780212402,
"step": 1119
},
{
"epoch": 0.8265682656826568,
"grad_norm": 1.1635088920593262,
"learning_rate": 5.498288197827285e-05,
"loss": 3.948390007019043,
"step": 1120
},
{
"epoch": 0.8273062730627306,
"grad_norm": 0.8426750898361206,
"learning_rate": 5.4530381942104213e-05,
"loss": 4.034334182739258,
"step": 1121
},
{
"epoch": 0.8280442804428044,
"grad_norm": 0.8258629441261292,
"learning_rate": 5.4079594251223894e-05,
"loss": 4.009230613708496,
"step": 1122
},
{
"epoch": 0.8287822878228782,
"grad_norm": 0.8874958157539368,
"learning_rate": 5.363052151809721e-05,
"loss": 3.9225668907165527,
"step": 1123
},
{
"epoch": 0.829520295202952,
"grad_norm": 0.9092878103256226,
"learning_rate": 5.318316634525092e-05,
"loss": 4.106935977935791,
"step": 1124
},
{
"epoch": 0.8302583025830258,
"grad_norm": 1.0611941814422607,
"learning_rate": 5.273753132525793e-05,
"loss": 4.086188793182373,
"step": 1125
},
{
"epoch": 0.8309963099630996,
"grad_norm": 0.9324346780776978,
"learning_rate": 5.229361904072231e-05,
"loss": 4.163631916046143,
"step": 1126
},
{
"epoch": 0.8317343173431734,
"grad_norm": 0.901092529296875,
"learning_rate": 5.1851432064264184e-05,
"loss": 3.8213887214660645,
"step": 1127
},
{
"epoch": 0.8324723247232473,
"grad_norm": 0.8883295655250549,
"learning_rate": 5.141097295850506e-05,
"loss": 4.020335674285889,
"step": 1128
},
{
"epoch": 0.8332103321033211,
"grad_norm": 0.8910163044929504,
"learning_rate": 5.0972244276052794e-05,
"loss": 3.904737949371338,
"step": 1129
},
{
"epoch": 0.8339483394833949,
"grad_norm": 0.9039924144744873,
"learning_rate": 5.053524855948689e-05,
"loss": 3.9267964363098145,
"step": 1130
},
{
"epoch": 0.8346863468634687,
"grad_norm": 0.7226197123527527,
"learning_rate": 5.0099988341343834e-05,
"loss": 4.004914283752441,
"step": 1131
},
{
"epoch": 0.8354243542435424,
"grad_norm": 1.0319029092788696,
"learning_rate": 4.966646614410193e-05,
"loss": 3.922898769378662,
"step": 1132
},
{
"epoch": 0.8361623616236162,
"grad_norm": 0.8679114580154419,
"learning_rate": 4.92346844801677e-05,
"loss": 4.130770206451416,
"step": 1133
},
{
"epoch": 0.83690036900369,
"grad_norm": 1.1186548471450806,
"learning_rate": 4.8804645851860066e-05,
"loss": 4.051120758056641,
"step": 1134
},
{
"epoch": 0.8376383763837638,
"grad_norm": 0.9294642806053162,
"learning_rate": 4.8376352751396885e-05,
"loss": 4.042642593383789,
"step": 1135
},
{
"epoch": 0.8383763837638376,
"grad_norm": 0.9107891917228699,
"learning_rate": 4.794980766087991e-05,
"loss": 4.207566261291504,
"step": 1136
},
{
"epoch": 0.8391143911439114,
"grad_norm": 0.8764515519142151,
"learning_rate": 4.752501305228076e-05,
"loss": 3.926863670349121,
"step": 1137
},
{
"epoch": 0.8398523985239852,
"grad_norm": 0.7595376372337341,
"learning_rate": 4.7101971387426126e-05,
"loss": 4.053175926208496,
"step": 1138
},
{
"epoch": 0.8405904059040591,
"grad_norm": 1.015899419784546,
"learning_rate": 4.668068511798407e-05,
"loss": 4.323257923126221,
"step": 1139
},
{
"epoch": 0.8413284132841329,
"grad_norm": 0.7909930348396301,
"learning_rate": 4.62611566854495e-05,
"loss": 4.0056915283203125,
"step": 1140
},
{
"epoch": 0.8420664206642067,
"grad_norm": 0.9827620387077332,
"learning_rate": 4.5843388521130024e-05,
"loss": 4.075970649719238,
"step": 1141
},
{
"epoch": 0.8428044280442805,
"grad_norm": 0.8181502223014832,
"learning_rate": 4.5427383046131974e-05,
"loss": 4.204850673675537,
"step": 1142
},
{
"epoch": 0.8435424354243543,
"grad_norm": 0.917636513710022,
"learning_rate": 4.5013142671346035e-05,
"loss": 4.204797744750977,
"step": 1143
},
{
"epoch": 0.844280442804428,
"grad_norm": 0.8521405458450317,
"learning_rate": 4.46006697974341e-05,
"loss": 3.8248231410980225,
"step": 1144
},
{
"epoch": 0.8450184501845018,
"grad_norm": 0.9880871176719666,
"learning_rate": 4.41899668148142e-05,
"loss": 4.135077476501465,
"step": 1145
},
{
"epoch": 0.8457564575645756,
"grad_norm": 0.7729653120040894,
"learning_rate": 4.3781036103647625e-05,
"loss": 4.0869975090026855,
"step": 1146
},
{
"epoch": 0.8464944649446494,
"grad_norm": 0.8864187598228455,
"learning_rate": 4.337388003382462e-05,
"loss": 3.949108600616455,
"step": 1147
},
{
"epoch": 0.8472324723247232,
"grad_norm": 0.7802934646606445,
"learning_rate": 4.296850096495096e-05,
"loss": 4.134548664093018,
"step": 1148
},
{
"epoch": 0.847970479704797,
"grad_norm": 1.1349821090698242,
"learning_rate": 4.2564901246333816e-05,
"loss": 3.8663113117218018,
"step": 1149
},
{
"epoch": 0.8487084870848709,
"grad_norm": 0.8297522068023682,
"learning_rate": 4.216308321696862e-05,
"loss": 4.069552421569824,
"step": 1150
},
{
"epoch": 0.8494464944649447,
"grad_norm": 0.8287118077278137,
"learning_rate": 4.1763049205525295e-05,
"loss": 4.17302131652832,
"step": 1151
},
{
"epoch": 0.8501845018450185,
"grad_norm": 0.9213622212409973,
"learning_rate": 4.136480153033484e-05,
"loss": 3.975867748260498,
"step": 1152
},
{
"epoch": 0.8509225092250923,
"grad_norm": 0.937636137008667,
"learning_rate": 4.096834249937555e-05,
"loss": 4.308503150939941,
"step": 1153
},
{
"epoch": 0.8516605166051661,
"grad_norm": 0.894312858581543,
"learning_rate": 4.0573674410260384e-05,
"loss": 4.0722808837890625,
"step": 1154
},
{
"epoch": 0.8523985239852399,
"grad_norm": 0.8254060745239258,
"learning_rate": 4.0180799550222964e-05,
"loss": 4.237331390380859,
"step": 1155
},
{
"epoch": 0.8531365313653136,
"grad_norm": 0.9793713092803955,
"learning_rate": 3.9789720196104374e-05,
"loss": 3.960724115371704,
"step": 1156
},
{
"epoch": 0.8538745387453874,
"grad_norm": 0.9438844323158264,
"learning_rate": 3.940043861434043e-05,
"loss": 4.011446952819824,
"step": 1157
},
{
"epoch": 0.8546125461254612,
"grad_norm": 0.9776595234870911,
"learning_rate": 3.901295706094806e-05,
"loss": 4.202037334442139,
"step": 1158
},
{
"epoch": 0.855350553505535,
"grad_norm": 0.8546213507652283,
"learning_rate": 3.862727778151262e-05,
"loss": 4.176602363586426,
"step": 1159
},
{
"epoch": 0.8560885608856088,
"grad_norm": 0.9939232468605042,
"learning_rate": 3.8243403011174406e-05,
"loss": 4.394288063049316,
"step": 1160
},
{
"epoch": 0.8568265682656827,
"grad_norm": 0.8461161851882935,
"learning_rate": 3.786133497461622e-05,
"loss": 4.105259895324707,
"step": 1161
},
{
"epoch": 0.8575645756457565,
"grad_norm": 0.8759531378746033,
"learning_rate": 3.748107588605018e-05,
"loss": 3.866830348968506,
"step": 1162
},
{
"epoch": 0.8583025830258303,
"grad_norm": 0.9277933239936829,
"learning_rate": 3.710262794920493e-05,
"loss": 4.112336158752441,
"step": 1163
},
{
"epoch": 0.8590405904059041,
"grad_norm": 0.8277254104614258,
"learning_rate": 3.672599335731272e-05,
"loss": 4.080126762390137,
"step": 1164
},
{
"epoch": 0.8597785977859779,
"grad_norm": 0.9238406419754028,
"learning_rate": 3.635117429309721e-05,
"loss": 3.9586308002471924,
"step": 1165
},
{
"epoch": 0.8605166051660517,
"grad_norm": 0.8009730577468872,
"learning_rate": 3.597817292876031e-05,
"loss": 4.31672477722168,
"step": 1166
},
{
"epoch": 0.8612546125461255,
"grad_norm": 0.9162194728851318,
"learning_rate": 3.560699142596952e-05,
"loss": 4.007983684539795,
"step": 1167
},
{
"epoch": 0.8619926199261992,
"grad_norm": 0.809906542301178,
"learning_rate": 3.523763193584591e-05,
"loss": 4.362383842468262,
"step": 1168
},
{
"epoch": 0.862730627306273,
"grad_norm": 1.0050405263900757,
"learning_rate": 3.487009659895132e-05,
"loss": 3.949605941772461,
"step": 1169
},
{
"epoch": 0.8634686346863468,
"grad_norm": 0.821631669998169,
"learning_rate": 3.4504387545276056e-05,
"loss": 4.222439765930176,
"step": 1170
},
{
"epoch": 0.8642066420664206,
"grad_norm": 1.0776225328445435,
"learning_rate": 3.414050689422626e-05,
"loss": 4.083227157592773,
"step": 1171
},
{
"epoch": 0.8649446494464945,
"grad_norm": 0.9266446232795715,
"learning_rate": 3.3778456754612195e-05,
"loss": 3.8666300773620605,
"step": 1172
},
{
"epoch": 0.8656826568265683,
"grad_norm": 0.7478688359260559,
"learning_rate": 3.341823922463545e-05,
"loss": 3.9161956310272217,
"step": 1173
},
{
"epoch": 0.8664206642066421,
"grad_norm": 0.9101294875144958,
"learning_rate": 3.305985639187726e-05,
"loss": 4.048511505126953,
"step": 1174
},
{
"epoch": 0.8671586715867159,
"grad_norm": 0.9844819903373718,
"learning_rate": 3.270331033328581e-05,
"loss": 4.01615571975708,
"step": 1175
},
{
"epoch": 0.8678966789667897,
"grad_norm": 0.8775573968887329,
"learning_rate": 3.2348603115165085e-05,
"loss": 4.202104568481445,
"step": 1176
},
{
"epoch": 0.8686346863468635,
"grad_norm": 0.8407407999038696,
"learning_rate": 3.199573679316183e-05,
"loss": 4.121450424194336,
"step": 1177
},
{
"epoch": 0.8693726937269373,
"grad_norm": 0.9224722981452942,
"learning_rate": 3.164471341225457e-05,
"loss": 3.914332389831543,
"step": 1178
},
{
"epoch": 0.870110701107011,
"grad_norm": 0.8708797693252563,
"learning_rate": 3.1295535006741184e-05,
"loss": 3.9288840293884277,
"step": 1179
},
{
"epoch": 0.8708487084870848,
"grad_norm": 0.8619300127029419,
"learning_rate": 3.0948203600227365e-05,
"loss": 4.033664226531982,
"step": 1180
},
{
"epoch": 0.8715867158671586,
"grad_norm": 1.0247814655303955,
"learning_rate": 3.060272120561491e-05,
"loss": 3.908498764038086,
"step": 1181
},
{
"epoch": 0.8723247232472324,
"grad_norm": 0.8571381568908691,
"learning_rate": 3.0259089825089657e-05,
"loss": 3.937492847442627,
"step": 1182
},
{
"epoch": 0.8730627306273063,
"grad_norm": 0.8049359917640686,
"learning_rate": 2.9917311450110688e-05,
"loss": 4.154782295227051,
"step": 1183
},
{
"epoch": 0.8738007380073801,
"grad_norm": 0.8404570817947388,
"learning_rate": 2.9577388061397813e-05,
"loss": 3.8062617778778076,
"step": 1184
},
{
"epoch": 0.8745387453874539,
"grad_norm": 0.8313830494880676,
"learning_rate": 2.92393216289209e-05,
"loss": 4.034999847412109,
"step": 1185
},
{
"epoch": 0.8752767527675277,
"grad_norm": 0.8629732131958008,
"learning_rate": 2.8903114111887997e-05,
"loss": 3.9214658737182617,
"step": 1186
},
{
"epoch": 0.8760147601476015,
"grad_norm": 0.788813054561615,
"learning_rate": 2.8568767458734206e-05,
"loss": 4.004258155822754,
"step": 1187
},
{
"epoch": 0.8767527675276753,
"grad_norm": 1.0045006275177002,
"learning_rate": 2.8236283607110122e-05,
"loss": 4.084541320800781,
"step": 1188
},
{
"epoch": 0.8774907749077491,
"grad_norm": 0.9397458434104919,
"learning_rate": 2.7905664483871018e-05,
"loss": 4.225802421569824,
"step": 1189
},
{
"epoch": 0.8782287822878229,
"grad_norm": 0.9364494681358337,
"learning_rate": 2.757691200506522e-05,
"loss": 4.048999786376953,
"step": 1190
},
{
"epoch": 0.8789667896678967,
"grad_norm": 0.9341748952865601,
"learning_rate": 2.7250028075923393e-05,
"loss": 4.081840515136719,
"step": 1191
},
{
"epoch": 0.8797047970479704,
"grad_norm": 1.1266894340515137,
"learning_rate": 2.6925014590847357e-05,
"loss": 4.127097129821777,
"step": 1192
},
{
"epoch": 0.8804428044280442,
"grad_norm": 1.646851658821106,
"learning_rate": 2.660187343339872e-05,
"loss": 3.9492740631103516,
"step": 1193
},
{
"epoch": 0.8811808118081181,
"grad_norm": 0.8900485038757324,
"learning_rate": 2.628060647628891e-05,
"loss": 4.004465103149414,
"step": 1194
},
{
"epoch": 0.8819188191881919,
"grad_norm": 1.2839107513427734,
"learning_rate": 2.596121558136723e-05,
"loss": 3.9589195251464844,
"step": 1195
},
{
"epoch": 0.8826568265682657,
"grad_norm": 0.9112879037857056,
"learning_rate": 2.564370259961085e-05,
"loss": 3.956997871398926,
"step": 1196
},
{
"epoch": 0.8833948339483395,
"grad_norm": 0.9632598161697388,
"learning_rate": 2.532806937111368e-05,
"loss": 4.068366050720215,
"step": 1197
},
{
"epoch": 0.8841328413284133,
"grad_norm": 1.0053609609603882,
"learning_rate": 2.5014317725075963e-05,
"loss": 4.128815650939941,
"step": 1198
},
{
"epoch": 0.8848708487084871,
"grad_norm": 0.9752780199050903,
"learning_rate": 2.470244947979335e-05,
"loss": 4.243814468383789,
"step": 1199
},
{
"epoch": 0.8856088560885609,
"grad_norm": 0.8026096820831299,
"learning_rate": 2.439246644264672e-05,
"loss": 3.8507800102233887,
"step": 1200
},
{
"epoch": 0.8863468634686347,
"grad_norm": 1.0604981184005737,
"learning_rate": 2.4084370410091432e-05,
"loss": 4.058777332305908,
"step": 1201
},
{
"epoch": 0.8870848708487085,
"grad_norm": 0.9318236708641052,
"learning_rate": 2.377816316764712e-05,
"loss": 3.807260751724243,
"step": 1202
},
{
"epoch": 0.8878228782287823,
"grad_norm": 0.902949333190918,
"learning_rate": 2.347384648988722e-05,
"loss": 4.102638244628906,
"step": 1203
},
{
"epoch": 0.888560885608856,
"grad_norm": 0.8091539144515991,
"learning_rate": 2.317142214042854e-05,
"loss": 4.0851216316223145,
"step": 1204
},
{
"epoch": 0.8892988929889298,
"grad_norm": 0.9384903907775879,
"learning_rate": 2.28708918719216e-05,
"loss": 3.8504605293273926,
"step": 1205
},
{
"epoch": 0.8900369003690037,
"grad_norm": 1.5000700950622559,
"learning_rate": 2.2572257426039673e-05,
"loss": 3.7018003463745117,
"step": 1206
},
{
"epoch": 0.8907749077490775,
"grad_norm": 0.853367269039154,
"learning_rate": 2.2275520533469324e-05,
"loss": 4.134353160858154,
"step": 1207
},
{
"epoch": 0.8915129151291513,
"grad_norm": 0.9626766443252563,
"learning_rate": 2.1980682913900136e-05,
"loss": 3.798292636871338,
"step": 1208
},
{
"epoch": 0.8922509225092251,
"grad_norm": 1.0532819032669067,
"learning_rate": 2.1687746276014825e-05,
"loss": 3.908432960510254,
"step": 1209
},
{
"epoch": 0.8929889298892989,
"grad_norm": 0.9355469346046448,
"learning_rate": 2.1396712317479066e-05,
"loss": 3.97414493560791,
"step": 1210
},
{
"epoch": 0.8937269372693727,
"grad_norm": 0.9721041917800903,
"learning_rate": 2.110758272493209e-05,
"loss": 4.170253753662109,
"step": 1211
},
{
"epoch": 0.8944649446494465,
"grad_norm": 1.4950439929962158,
"learning_rate": 2.082035917397661e-05,
"loss": 3.9789202213287354,
"step": 1212
},
{
"epoch": 0.8952029520295203,
"grad_norm": 0.7959940433502197,
"learning_rate": 2.05350433291692e-05,
"loss": 3.8894667625427246,
"step": 1213
},
{
"epoch": 0.8959409594095941,
"grad_norm": 0.885735034942627,
"learning_rate": 2.0251636844010645e-05,
"loss": 4.206930637359619,
"step": 1214
},
{
"epoch": 0.8966789667896679,
"grad_norm": 0.8489437103271484,
"learning_rate": 1.997014136093635e-05,
"loss": 4.1217241287231445,
"step": 1215
},
{
"epoch": 0.8974169741697416,
"grad_norm": 0.9283955693244934,
"learning_rate": 1.9690558511306816e-05,
"loss": 4.022772789001465,
"step": 1216
},
{
"epoch": 0.8981549815498155,
"grad_norm": 1.0239266157150269,
"learning_rate": 1.9412889915398164e-05,
"loss": 3.9056153297424316,
"step": 1217
},
{
"epoch": 0.8988929889298893,
"grad_norm": 0.8964755535125732,
"learning_rate": 1.91371371823928e-05,
"loss": 4.042991638183594,
"step": 1218
},
{
"epoch": 0.8996309963099631,
"grad_norm": 0.8521272540092468,
"learning_rate": 1.88633019103701e-05,
"loss": 4.2974958419799805,
"step": 1219
},
{
"epoch": 0.9003690036900369,
"grad_norm": 0.8455408215522766,
"learning_rate": 1.859138568629708e-05,
"loss": 4.03305721282959,
"step": 1220
},
{
"epoch": 0.9011070110701107,
"grad_norm": 0.9025142788887024,
"learning_rate": 1.832139008601918e-05,
"loss": 4.064189434051514,
"step": 1221
},
{
"epoch": 0.9018450184501845,
"grad_norm": 0.789932131767273,
"learning_rate": 1.8053316674251256e-05,
"loss": 3.8885226249694824,
"step": 1222
},
{
"epoch": 0.9025830258302583,
"grad_norm": 0.8709638118743896,
"learning_rate": 1.7787167004568416e-05,
"loss": 4.0818986892700195,
"step": 1223
},
{
"epoch": 0.9033210332103321,
"grad_norm": 1.0169392824172974,
"learning_rate": 1.75229426193971e-05,
"loss": 4.12254524230957,
"step": 1224
},
{
"epoch": 0.9040590405904059,
"grad_norm": 0.9385191798210144,
"learning_rate": 1.7260645050005903e-05,
"loss": 3.894554853439331,
"step": 1225
},
{
"epoch": 0.9047970479704797,
"grad_norm": 0.87216717004776,
"learning_rate": 1.7000275816497063e-05,
"loss": 4.0138773918151855,
"step": 1226
},
{
"epoch": 0.9055350553505535,
"grad_norm": 0.8227195143699646,
"learning_rate": 1.6741836427797447e-05,
"loss": 3.842376708984375,
"step": 1227
},
{
"epoch": 0.9062730627306274,
"grad_norm": 0.9171870350837708,
"learning_rate": 1.6485328381649667e-05,
"loss": 4.184534072875977,
"step": 1228
},
{
"epoch": 0.9070110701107011,
"grad_norm": 0.8216118216514587,
"learning_rate": 1.6230753164603735e-05,
"loss": 3.9486520290374756,
"step": 1229
},
{
"epoch": 0.9077490774907749,
"grad_norm": 0.8233117461204529,
"learning_rate": 1.597811225200816e-05,
"loss": 4.167961597442627,
"step": 1230
},
{
"epoch": 0.9084870848708487,
"grad_norm": 0.9360010623931885,
"learning_rate": 1.5727407108001634e-05,
"loss": 4.118611812591553,
"step": 1231
},
{
"epoch": 0.9092250922509225,
"grad_norm": 0.8383175730705261,
"learning_rate": 1.5478639185504255e-05,
"loss": 4.2346062660217285,
"step": 1232
},
{
"epoch": 0.9099630996309963,
"grad_norm": 0.7830789685249329,
"learning_rate": 1.52318099262094e-05,
"loss": 4.022680759429932,
"step": 1233
},
{
"epoch": 0.9107011070110701,
"grad_norm": 0.8860730528831482,
"learning_rate": 1.4986920760575173e-05,
"loss": 3.8851001262664795,
"step": 1234
},
{
"epoch": 0.9114391143911439,
"grad_norm": 1.0096216201782227,
"learning_rate": 1.4743973107816294e-05,
"loss": 4.16072940826416,
"step": 1235
},
{
"epoch": 0.9121771217712177,
"grad_norm": 0.8702698945999146,
"learning_rate": 1.4502968375895542e-05,
"loss": 4.074400901794434,
"step": 1236
},
{
"epoch": 0.9129151291512915,
"grad_norm": 1.0048964023590088,
"learning_rate": 1.4263907961516103e-05,
"loss": 4.206517219543457,
"step": 1237
},
{
"epoch": 0.9136531365313653,
"grad_norm": 1.0056480169296265,
"learning_rate": 1.40267932501131e-05,
"loss": 4.110833644866943,
"step": 1238
},
{
"epoch": 0.9143911439114392,
"grad_norm": 0.9925635457038879,
"learning_rate": 1.379162561584547e-05,
"loss": 3.903393507003784,
"step": 1239
},
{
"epoch": 0.915129151291513,
"grad_norm": 1.1309577226638794,
"learning_rate": 1.3558406421588386e-05,
"loss": 4.20203971862793,
"step": 1240
},
{
"epoch": 0.9158671586715867,
"grad_norm": 0.9794964790344238,
"learning_rate": 1.332713701892514e-05,
"loss": 4.138725280761719,
"step": 1241
},
{
"epoch": 0.9166051660516605,
"grad_norm": 0.9882869720458984,
"learning_rate": 1.3097818748139284e-05,
"loss": 3.934995174407959,
"step": 1242
},
{
"epoch": 0.9173431734317343,
"grad_norm": 0.9639918804168701,
"learning_rate": 1.2870452938206834e-05,
"loss": 3.992349147796631,
"step": 1243
},
{
"epoch": 0.9180811808118081,
"grad_norm": 0.761677086353302,
"learning_rate": 1.2645040906788873e-05,
"loss": 4.091512680053711,
"step": 1244
},
{
"epoch": 0.9188191881918819,
"grad_norm": 0.8653919100761414,
"learning_rate": 1.2421583960223403e-05,
"loss": 4.175684452056885,
"step": 1245
},
{
"epoch": 0.9195571955719557,
"grad_norm": 0.8463162779808044,
"learning_rate": 1.22000833935183e-05,
"loss": 3.7140493392944336,
"step": 1246
},
{
"epoch": 0.9202952029520295,
"grad_norm": 1.5709336996078491,
"learning_rate": 1.1980540490343322e-05,
"loss": 4.196260452270508,
"step": 1247
},
{
"epoch": 0.9210332103321033,
"grad_norm": 1.0555191040039062,
"learning_rate": 1.1762956523023177e-05,
"loss": 4.01348876953125,
"step": 1248
},
{
"epoch": 0.9217712177121771,
"grad_norm": 0.8740063309669495,
"learning_rate": 1.1547332752529649e-05,
"loss": 4.2362470626831055,
"step": 1249
},
{
"epoch": 0.922509225092251,
"grad_norm": 0.7975241541862488,
"learning_rate": 1.1333670428474634e-05,
"loss": 3.9546077251434326,
"step": 1250
},
{
"epoch": 0.9232472324723248,
"grad_norm": 0.9999665021896362,
"learning_rate": 1.1121970789102842e-05,
"loss": 4.26607608795166,
"step": 1251
},
{
"epoch": 0.9239852398523986,
"grad_norm": 0.8974668979644775,
"learning_rate": 1.0912235061284481e-05,
"loss": 3.8792271614074707,
"step": 1252
},
{
"epoch": 0.9247232472324723,
"grad_norm": 0.9566179513931274,
"learning_rate": 1.0704464460508312e-05,
"loss": 3.9453883171081543,
"step": 1253
},
{
"epoch": 0.9254612546125461,
"grad_norm": 0.8908551335334778,
"learning_rate": 1.0498660190874298e-05,
"loss": 4.036525726318359,
"step": 1254
},
{
"epoch": 0.9261992619926199,
"grad_norm": 0.8145546317100525,
"learning_rate": 1.0294823445087275e-05,
"loss": 4.188867568969727,
"step": 1255
},
{
"epoch": 0.9269372693726937,
"grad_norm": 0.8675177097320557,
"learning_rate": 1.0092955404449255e-05,
"loss": 4.099850654602051,
"step": 1256
},
{
"epoch": 0.9276752767527675,
"grad_norm": 0.8431053757667542,
"learning_rate": 9.893057238853053e-06,
"loss": 4.123414039611816,
"step": 1257
},
{
"epoch": 0.9284132841328413,
"grad_norm": 0.8683810830116272,
"learning_rate": 9.69513010677545e-06,
"loss": 4.246320724487305,
"step": 1258
},
{
"epoch": 0.9291512915129151,
"grad_norm": 0.8188611268997192,
"learning_rate": 9.499175155270433e-06,
"loss": 4.140353202819824,
"step": 1259
},
{
"epoch": 0.9298892988929889,
"grad_norm": 0.9163283705711365,
"learning_rate": 9.30519351996243e-06,
"loss": 4.289680480957031,
"step": 1260
},
{
"epoch": 0.9306273062730628,
"grad_norm": 0.9404510855674744,
"learning_rate": 9.113186325039935e-06,
"loss": 3.925518035888672,
"step": 1261
},
{
"epoch": 0.9313653136531366,
"grad_norm": 1.0115693807601929,
"learning_rate": 8.923154683248873e-06,
"loss": 4.255781173706055,
"step": 1262
},
{
"epoch": 0.9321033210332104,
"grad_norm": 0.9950158596038818,
"learning_rate": 8.735099695886261e-06,
"loss": 4.2205657958984375,
"step": 1263
},
{
"epoch": 0.9328413284132842,
"grad_norm": 0.886117160320282,
"learning_rate": 8.549022452793597e-06,
"loss": 4.172645568847656,
"step": 1264
},
{
"epoch": 0.933579335793358,
"grad_norm": 0.8960427045822144,
"learning_rate": 8.364924032350728e-06,
"loss": 4.060420513153076,
"step": 1265
},
{
"epoch": 0.9343173431734317,
"grad_norm": 0.9799672961235046,
"learning_rate": 8.18280550146967e-06,
"loss": 4.113704681396484,
"step": 1266
},
{
"epoch": 0.9350553505535055,
"grad_norm": 1.0600509643554688,
"learning_rate": 8.002667915588191e-06,
"loss": 4.008590221405029,
"step": 1267
},
{
"epoch": 0.9357933579335793,
"grad_norm": 1.0815836191177368,
"learning_rate": 7.824512318663873e-06,
"loss": 3.9697742462158203,
"step": 1268
},
{
"epoch": 0.9365313653136531,
"grad_norm": 0.8676935434341431,
"learning_rate": 7.648339743168008e-06,
"loss": 3.9918062686920166,
"step": 1269
},
{
"epoch": 0.9372693726937269,
"grad_norm": 0.8141337633132935,
"learning_rate": 7.474151210079654e-06,
"loss": 4.000406742095947,
"step": 1270
},
{
"epoch": 0.9380073800738007,
"grad_norm": 0.9304051995277405,
"learning_rate": 7.301947728879571e-06,
"loss": 4.1023969650268555,
"step": 1271
},
{
"epoch": 0.9387453874538746,
"grad_norm": 0.8569762110710144,
"learning_rate": 7.131730297544547e-06,
"loss": 3.9308419227600098,
"step": 1272
},
{
"epoch": 0.9394833948339484,
"grad_norm": 0.9410969018936157,
"learning_rate": 6.963499902541575e-06,
"loss": 4.188969612121582,
"step": 1273
},
{
"epoch": 0.9402214022140222,
"grad_norm": 1.1418845653533936,
"learning_rate": 6.7972575188220975e-06,
"loss": 3.789651870727539,
"step": 1274
},
{
"epoch": 0.940959409594096,
"grad_norm": 0.8757267594337463,
"learning_rate": 6.633004109816293e-06,
"loss": 4.139651298522949,
"step": 1275
},
{
"epoch": 0.9416974169741698,
"grad_norm": 0.794495165348053,
"learning_rate": 6.4707406274276015e-06,
"loss": 4.01513671875,
"step": 1276
},
{
"epoch": 0.9424354243542435,
"grad_norm": 0.8835275769233704,
"learning_rate": 6.310468012027321e-06,
"loss": 4.235984802246094,
"step": 1277
},
{
"epoch": 0.9431734317343173,
"grad_norm": 0.8845841884613037,
"learning_rate": 6.152187192448738e-06,
"loss": 4.170893669128418,
"step": 1278
},
{
"epoch": 0.9439114391143911,
"grad_norm": 0.8521038293838501,
"learning_rate": 5.995899085982198e-06,
"loss": 4.143123626708984,
"step": 1279
},
{
"epoch": 0.9446494464944649,
"grad_norm": 0.8681446313858032,
"learning_rate": 5.841604598369543e-06,
"loss": 3.9806013107299805,
"step": 1280
},
{
"epoch": 0.9453874538745387,
"grad_norm": 0.8543822765350342,
"learning_rate": 5.689304623799063e-06,
"loss": 4.092264175415039,
"step": 1281
},
{
"epoch": 0.9461254612546125,
"grad_norm": 0.8498107194900513,
"learning_rate": 5.5390000448999e-06,
"loss": 3.961348056793213,
"step": 1282
},
{
"epoch": 0.9468634686346864,
"grad_norm": 0.9987668991088867,
"learning_rate": 5.390691732737501e-06,
"loss": 3.853841781616211,
"step": 1283
},
{
"epoch": 0.9476014760147602,
"grad_norm": 0.8435112833976746,
"learning_rate": 5.244380546808064e-06,
"loss": 4.074121475219727,
"step": 1284
},
{
"epoch": 0.948339483394834,
"grad_norm": 0.9243870377540588,
"learning_rate": 5.100067335033909e-06,
"loss": 3.9349491596221924,
"step": 1285
},
{
"epoch": 0.9490774907749078,
"grad_norm": 0.9198288917541504,
"learning_rate": 4.957752933758391e-06,
"loss": 4.085498332977295,
"step": 1286
},
{
"epoch": 0.9498154981549816,
"grad_norm": 1.0337499380111694,
"learning_rate": 4.817438167741045e-06,
"loss": 4.015393257141113,
"step": 1287
},
{
"epoch": 0.9505535055350554,
"grad_norm": 0.9356955289840698,
"learning_rate": 4.679123850152955e-06,
"loss": 4.079366683959961,
"step": 1288
},
{
"epoch": 0.9512915129151291,
"grad_norm": 0.8707476854324341,
"learning_rate": 4.542810782571749e-06,
"loss": 3.9717764854431152,
"step": 1289
},
{
"epoch": 0.9520295202952029,
"grad_norm": 0.8522350788116455,
"learning_rate": 4.4084997549773184e-06,
"loss": 3.9818825721740723,
"step": 1290
},
{
"epoch": 0.9527675276752767,
"grad_norm": 0.9296215176582336,
"learning_rate": 4.276191545747004e-06,
"loss": 4.0599365234375,
"step": 1291
},
{
"epoch": 0.9535055350553505,
"grad_norm": 0.9785073399543762,
"learning_rate": 4.145886921651165e-06,
"loss": 3.9496049880981445,
"step": 1292
},
{
"epoch": 0.9542435424354243,
"grad_norm": 1.0265014171600342,
"learning_rate": 4.017586637848669e-06,
"loss": 3.9062700271606445,
"step": 1293
},
{
"epoch": 0.9549815498154982,
"grad_norm": 0.8512267470359802,
"learning_rate": 3.891291437882544e-06,
"loss": 3.8862087726593018,
"step": 1294
},
{
"epoch": 0.955719557195572,
"grad_norm": 0.9458624124526978,
"learning_rate": 3.7670020536757775e-06,
"loss": 4.076284408569336,
"step": 1295
},
{
"epoch": 0.9564575645756458,
"grad_norm": 1.0091397762298584,
"learning_rate": 3.6447192055269694e-06,
"loss": 4.171298503875732,
"step": 1296
},
{
"epoch": 0.9571955719557196,
"grad_norm": 1.085514783859253,
"learning_rate": 3.5244436021060143e-06,
"loss": 4.2273736000061035,
"step": 1297
},
{
"epoch": 0.9579335793357934,
"grad_norm": 0.9626381397247314,
"learning_rate": 3.4061759404503734e-06,
"loss": 3.9600830078125,
"step": 1298
},
{
"epoch": 0.9586715867158672,
"grad_norm": 0.892805814743042,
"learning_rate": 3.2899169059607216e-06,
"loss": 4.168842315673828,
"step": 1299
},
{
"epoch": 0.959409594095941,
"grad_norm": 1.0419422388076782,
"learning_rate": 3.1756671723969843e-06,
"loss": 4.045411109924316,
"step": 1300
},
{
"epoch": 0.9601476014760147,
"grad_norm": 0.9553176760673523,
"learning_rate": 3.0634274018746466e-06,
"loss": 4.090331077575684,
"step": 1301
},
{
"epoch": 0.9608856088560885,
"grad_norm": 0.7899879813194275,
"learning_rate": 2.9531982448607108e-06,
"loss": 4.1696577072143555,
"step": 1302
},
{
"epoch": 0.9616236162361623,
"grad_norm": 0.8720894455909729,
"learning_rate": 2.8449803401700445e-06,
"loss": 4.093484878540039,
"step": 1303
},
{
"epoch": 0.9623616236162361,
"grad_norm": 0.93953937292099,
"learning_rate": 2.738774314961534e-06,
"loss": 4.138238430023193,
"step": 1304
},
{
"epoch": 0.9630996309963099,
"grad_norm": 0.8301063179969788,
"learning_rate": 2.6345807847347413e-06,
"loss": 4.250385761260986,
"step": 1305
},
{
"epoch": 0.9638376383763838,
"grad_norm": 0.9756575226783752,
"learning_rate": 2.532400353325903e-06,
"loss": 4.020941734313965,
"step": 1306
},
{
"epoch": 0.9645756457564576,
"grad_norm": 0.955294132232666,
"learning_rate": 2.4322336129049384e-06,
"loss": 4.259998321533203,
"step": 1307
},
{
"epoch": 0.9653136531365314,
"grad_norm": 0.8463285565376282,
"learning_rate": 2.3340811439715223e-06,
"loss": 4.142585754394531,
"step": 1308
},
{
"epoch": 0.9660516605166052,
"grad_norm": 1.4179046154022217,
"learning_rate": 2.237943515352098e-06,
"loss": 3.9881856441497803,
"step": 1309
},
{
"epoch": 0.966789667896679,
"grad_norm": 0.9249223470687866,
"learning_rate": 2.1438212841963734e-06,
"loss": 4.063835144042969,
"step": 1310
},
{
"epoch": 0.9675276752767528,
"grad_norm": 0.8510631322860718,
"learning_rate": 2.051714995974141e-06,
"loss": 3.82594895362854,
"step": 1311
},
{
"epoch": 0.9682656826568266,
"grad_norm": 1.0183271169662476,
"learning_rate": 1.9616251844722042e-06,
"loss": 4.1191205978393555,
"step": 1312
},
{
"epoch": 0.9690036900369003,
"grad_norm": 0.9534389972686768,
"learning_rate": 1.873552371791115e-06,
"loss": 4.130201816558838,
"step": 1313
},
{
"epoch": 0.9697416974169741,
"grad_norm": 0.9956035017967224,
"learning_rate": 1.7874970683423364e-06,
"loss": 3.9721202850341797,
"step": 1314
},
{
"epoch": 0.9704797047970479,
"grad_norm": 0.8220584988594055,
"learning_rate": 1.703459772845095e-06,
"loss": 4.076860427856445,
"step": 1315
},
{
"epoch": 0.9712177121771217,
"grad_norm": 0.9613221287727356,
"learning_rate": 1.6214409723236623e-06,
"loss": 3.937884569168091,
"step": 1316
},
{
"epoch": 0.9719557195571956,
"grad_norm": 0.8680334687232971,
"learning_rate": 1.5414411421044382e-06,
"loss": 4.201882362365723,
"step": 1317
},
{
"epoch": 0.9726937269372694,
"grad_norm": 0.759444534778595,
"learning_rate": 1.4634607458131555e-06,
"loss": 4.007597923278809,
"step": 1318
},
{
"epoch": 0.9734317343173432,
"grad_norm": 1.0436582565307617,
"learning_rate": 1.387500235372352e-06,
"loss": 4.142274379730225,
"step": 1319
},
{
"epoch": 0.974169741697417,
"grad_norm": 0.9300816059112549,
"learning_rate": 1.3135600509985745e-06,
"loss": 4.145612716674805,
"step": 1320
},
{
"epoch": 0.9749077490774908,
"grad_norm": 0.9446290731430054,
"learning_rate": 1.2416406211999298e-06,
"loss": 4.090052604675293,
"step": 1321
},
{
"epoch": 0.9756457564575646,
"grad_norm": 0.8639572262763977,
"learning_rate": 1.171742362773559e-06,
"loss": 3.974188804626465,
"step": 1322
},
{
"epoch": 0.9763837638376384,
"grad_norm": 0.8869412541389465,
"learning_rate": 1.1038656808032675e-06,
"loss": 3.864497184753418,
"step": 1323
},
{
"epoch": 0.9771217712177122,
"grad_norm": 1.0420503616333008,
"learning_rate": 1.0380109686571549e-06,
"loss": 4.054100036621094,
"step": 1324
},
{
"epoch": 0.977859778597786,
"grad_norm": 0.8223108053207397,
"learning_rate": 9.74178607985282e-07,
"loss": 3.966116428375244,
"step": 1325
},
{
"epoch": 0.9785977859778597,
"grad_norm": 0.9738601446151733,
"learning_rate": 9.123689687175751e-07,
"loss": 4.062865257263184,
"step": 1326
},
{
"epoch": 0.9793357933579335,
"grad_norm": 0.9468348026275635,
"learning_rate": 8.525824090615308e-07,
"loss": 3.845522165298462,
"step": 1327
},
{
"epoch": 0.9800738007380074,
"grad_norm": 0.9388923048973083,
"learning_rate": 7.948192755002747e-07,
"loss": 4.0565595626831055,
"step": 1328
},
{
"epoch": 0.9808118081180812,
"grad_norm": 0.8884272575378418,
"learning_rate": 7.390799027904627e-07,
"loss": 3.9518604278564453,
"step": 1329
},
{
"epoch": 0.981549815498155,
"grad_norm": 0.8875818252563477,
"learning_rate": 6.85364613960493e-07,
"loss": 4.188823223114014,
"step": 1330
},
{
"epoch": 0.9822878228782288,
"grad_norm": 0.7383257150650024,
"learning_rate": 6.336737203083698e-07,
"loss": 4.029225826263428,
"step": 1331
},
{
"epoch": 0.9830258302583026,
"grad_norm": 0.8472557663917542,
"learning_rate": 5.840075214001095e-07,
"loss": 4.239529609680176,
"step": 1332
},
{
"epoch": 0.9837638376383764,
"grad_norm": 0.8474605679512024,
"learning_rate": 5.363663050679535e-07,
"loss": 4.032186508178711,
"step": 1333
},
{
"epoch": 0.9845018450184502,
"grad_norm": 1.059869647026062,
"learning_rate": 4.90750347408736e-07,
"loss": 4.1005859375,
"step": 1334
},
{
"epoch": 0.985239852398524,
"grad_norm": 0.8023120760917664,
"learning_rate": 4.4715991278213576e-07,
"loss": 3.901467800140381,
"step": 1335
},
{
"epoch": 0.9859778597785978,
"grad_norm": 0.8805201053619385,
"learning_rate": 4.0559525380935435e-07,
"loss": 3.9754929542541504,
"step": 1336
},
{
"epoch": 0.9867158671586715,
"grad_norm": 0.844008207321167,
"learning_rate": 3.660566113714847e-07,
"loss": 4.112626552581787,
"step": 1337
},
{
"epoch": 0.9874538745387453,
"grad_norm": 0.8604761958122253,
"learning_rate": 3.2854421460815075e-07,
"loss": 3.6430814266204834,
"step": 1338
},
{
"epoch": 0.9881918819188192,
"grad_norm": 0.8547300100326538,
"learning_rate": 2.930582809162641e-07,
"loss": 4.121878623962402,
"step": 1339
},
{
"epoch": 0.988929889298893,
"grad_norm": 0.9670364260673523,
"learning_rate": 2.5959901594870273e-07,
"loss": 3.9410769939422607,
"step": 1340
},
{
"epoch": 0.9896678966789668,
"grad_norm": 1.0530565977096558,
"learning_rate": 2.281666136130678e-07,
"loss": 3.989541530609131,
"step": 1341
},
{
"epoch": 0.9904059040590406,
"grad_norm": 1.0222362279891968,
"learning_rate": 1.9876125607067309e-07,
"loss": 4.04118537902832,
"step": 1342
},
{
"epoch": 0.9911439114391144,
"grad_norm": 0.7860491275787354,
"learning_rate": 1.713831137353794e-07,
"loss": 3.8432321548461914,
"step": 1343
},
{
"epoch": 0.9918819188191882,
"grad_norm": 0.8329381942749023,
"learning_rate": 1.460323452727008e-07,
"loss": 3.9724719524383545,
"step": 1344
},
{
"epoch": 0.992619926199262,
"grad_norm": 0.8542125225067139,
"learning_rate": 1.2270909759879432e-07,
"loss": 4.1086506843566895,
"step": 1345
},
{
"epoch": 0.9933579335793358,
"grad_norm": 0.9141987562179565,
"learning_rate": 1.0141350587972164e-07,
"loss": 3.7987635135650635,
"step": 1346
},
{
"epoch": 0.9940959409594096,
"grad_norm": 0.8679295778274536,
"learning_rate": 8.214569353055534e-08,
"loss": 4.131080627441406,
"step": 1347
},
{
"epoch": 0.9948339483394834,
"grad_norm": 0.9177278876304626,
"learning_rate": 6.490577221467953e-08,
"loss": 4.434652328491211,
"step": 1348
},
{
"epoch": 0.9955719557195571,
"grad_norm": 0.8580910563468933,
"learning_rate": 4.9693841843245764e-08,
"loss": 4.102374076843262,
"step": 1349
},
{
"epoch": 0.996309963099631,
"grad_norm": 0.9112648367881775,
"learning_rate": 3.6509990574473684e-08,
"loss": 4.2107343673706055,
"step": 1350
},
{
"epoch": 0.9970479704797048,
"grad_norm": 1.0079255104064941,
"learning_rate": 2.535429481318463e-08,
"loss": 3.846949577331543,
"step": 1351
},
{
"epoch": 0.9977859778597786,
"grad_norm": 0.7972182035446167,
"learning_rate": 1.622681921033542e-08,
"loss": 3.9979095458984375,
"step": 1352
},
{
"epoch": 0.9985239852398524,
"grad_norm": 0.8567417860031128,
"learning_rate": 9.127616662746307e-09,
"loss": 3.994305372238159,
"step": 1353
},
{
"epoch": 0.9992619926199262,
"grad_norm": 0.8455564379692078,
"learning_rate": 4.0567283126347055e-09,
"loss": 4.195075988769531,
"step": 1354
},
{
"epoch": 1.0,
"grad_norm": 0.7844815254211426,
"learning_rate": 1.0141835475374616e-09,
"loss": 4.3078813552856445,
"step": 1355
}
],
"logging_steps": 1,
"max_steps": 1355,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.572106671245492e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}