ErrorAI's picture
Training in progress, step 1595, checkpoint
08f6257 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.09190170263029011,
"eval_steps": 500,
"global_step": 1595,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 5.761862233873988e-05,
"grad_norm": 1.8894768953323364,
"learning_rate": 2e-05,
"loss": 2.5792,
"step": 1
},
{
"epoch": 0.00011523724467747976,
"grad_norm": 2.1264586448669434,
"learning_rate": 4e-05,
"loss": 3.2333,
"step": 2
},
{
"epoch": 0.00017285586701621964,
"grad_norm": 2.0583252906799316,
"learning_rate": 6e-05,
"loss": 3.3241,
"step": 3
},
{
"epoch": 0.0002304744893549595,
"grad_norm": 2.128039598464966,
"learning_rate": 8e-05,
"loss": 3.3622,
"step": 4
},
{
"epoch": 0.0002880931116936994,
"grad_norm": 2.9162023067474365,
"learning_rate": 0.0001,
"loss": 4.0966,
"step": 5
},
{
"epoch": 0.0003457117340324393,
"grad_norm": 3.0308210849761963,
"learning_rate": 9.999990240100085e-05,
"loss": 3.2622,
"step": 6
},
{
"epoch": 0.0004033303563711792,
"grad_norm": 2.849175214767456,
"learning_rate": 9.999960960438442e-05,
"loss": 3.5116,
"step": 7
},
{
"epoch": 0.000460948978709919,
"grad_norm": 2.4498026371002197,
"learning_rate": 9.999912161129376e-05,
"loss": 2.6775,
"step": 8
},
{
"epoch": 0.0005185676010486589,
"grad_norm": 2.5766794681549072,
"learning_rate": 9.999843842363401e-05,
"loss": 3.6129,
"step": 9
},
{
"epoch": 0.0005761862233873988,
"grad_norm": 3.0015687942504883,
"learning_rate": 9.999756004407229e-05,
"loss": 2.879,
"step": 10
},
{
"epoch": 0.0006338048457261387,
"grad_norm": 3.681455373764038,
"learning_rate": 9.999648647603774e-05,
"loss": 3.8898,
"step": 11
},
{
"epoch": 0.0006914234680648786,
"grad_norm": 3.9608843326568604,
"learning_rate": 9.999521772372156e-05,
"loss": 3.6782,
"step": 12
},
{
"epoch": 0.0007490420904036184,
"grad_norm": 3.7332823276519775,
"learning_rate": 9.99937537920769e-05,
"loss": 3.3551,
"step": 13
},
{
"epoch": 0.0008066607127423584,
"grad_norm": 4.441276550292969,
"learning_rate": 9.999209468681885e-05,
"loss": 4.4083,
"step": 14
},
{
"epoch": 0.0008642793350810982,
"grad_norm": 4.141984462738037,
"learning_rate": 9.999024041442456e-05,
"loss": 3.7971,
"step": 15
},
{
"epoch": 0.000921897957419838,
"grad_norm": 4.230754852294922,
"learning_rate": 9.998819098213297e-05,
"loss": 3.0315,
"step": 16
},
{
"epoch": 0.0009795165797585779,
"grad_norm": 6.226734161376953,
"learning_rate": 9.998594639794501e-05,
"loss": 3.6898,
"step": 17
},
{
"epoch": 0.0010371352020973178,
"grad_norm": 4.347520351409912,
"learning_rate": 9.998350667062346e-05,
"loss": 2.8989,
"step": 18
},
{
"epoch": 0.0010947538244360578,
"grad_norm": 4.9136128425598145,
"learning_rate": 9.998087180969289e-05,
"loss": 3.3558,
"step": 19
},
{
"epoch": 0.0011523724467747975,
"grad_norm": 5.32551383972168,
"learning_rate": 9.997804182543973e-05,
"loss": 4.5467,
"step": 20
},
{
"epoch": 0.0012099910691135375,
"grad_norm": 3.864988088607788,
"learning_rate": 9.997501672891207e-05,
"loss": 3.0544,
"step": 21
},
{
"epoch": 0.0012676096914522774,
"grad_norm": 4.165402412414551,
"learning_rate": 9.997179653191982e-05,
"loss": 3.4233,
"step": 22
},
{
"epoch": 0.0013252283137910172,
"grad_norm": 3.2870264053344727,
"learning_rate": 9.996838124703447e-05,
"loss": 2.8965,
"step": 23
},
{
"epoch": 0.0013828469361297571,
"grad_norm": 3.6975154876708984,
"learning_rate": 9.996477088758916e-05,
"loss": 3.3888,
"step": 24
},
{
"epoch": 0.001440465558468497,
"grad_norm": 3.3743979930877686,
"learning_rate": 9.99609654676786e-05,
"loss": 2.9486,
"step": 25
},
{
"epoch": 0.0014980841808072368,
"grad_norm": 3.1180832386016846,
"learning_rate": 9.995696500215898e-05,
"loss": 2.6393,
"step": 26
},
{
"epoch": 0.0015557028031459768,
"grad_norm": 3.0562663078308105,
"learning_rate": 9.995276950664796e-05,
"loss": 3.0492,
"step": 27
},
{
"epoch": 0.0016133214254847167,
"grad_norm": 3.6573245525360107,
"learning_rate": 9.99483789975246e-05,
"loss": 3.0763,
"step": 28
},
{
"epoch": 0.0016709400478234564,
"grad_norm": 3.5239665508270264,
"learning_rate": 9.994379349192926e-05,
"loss": 3.1312,
"step": 29
},
{
"epoch": 0.0017285586701621964,
"grad_norm": 3.163259983062744,
"learning_rate": 9.993901300776359e-05,
"loss": 2.6612,
"step": 30
},
{
"epoch": 0.0017861772925009364,
"grad_norm": 3.437716484069824,
"learning_rate": 9.993403756369038e-05,
"loss": 3.3271,
"step": 31
},
{
"epoch": 0.001843795914839676,
"grad_norm": 3.744062900543213,
"learning_rate": 9.992886717913356e-05,
"loss": 3.169,
"step": 32
},
{
"epoch": 0.001901414537178416,
"grad_norm": 4.297734260559082,
"learning_rate": 9.992350187427815e-05,
"loss": 3.1753,
"step": 33
},
{
"epoch": 0.0019590331595171558,
"grad_norm": 3.4629929065704346,
"learning_rate": 9.991794167007004e-05,
"loss": 3.0395,
"step": 34
},
{
"epoch": 0.002016651781855896,
"grad_norm": 3.6565771102905273,
"learning_rate": 9.991218658821608e-05,
"loss": 2.8464,
"step": 35
},
{
"epoch": 0.0020742704041946357,
"grad_norm": 4.069650173187256,
"learning_rate": 9.990623665118384e-05,
"loss": 2.9321,
"step": 36
},
{
"epoch": 0.0021318890265333754,
"grad_norm": 4.492208957672119,
"learning_rate": 9.990009188220167e-05,
"loss": 3.1554,
"step": 37
},
{
"epoch": 0.0021895076488721156,
"grad_norm": 4.2307329177856445,
"learning_rate": 9.989375230525848e-05,
"loss": 3.2979,
"step": 38
},
{
"epoch": 0.0022471262712108553,
"grad_norm": 4.012269973754883,
"learning_rate": 9.988721794510374e-05,
"loss": 3.2915,
"step": 39
},
{
"epoch": 0.002304744893549595,
"grad_norm": 3.905738115310669,
"learning_rate": 9.988048882724732e-05,
"loss": 3.3131,
"step": 40
},
{
"epoch": 0.0023623635158883352,
"grad_norm": 4.168063163757324,
"learning_rate": 9.987356497795943e-05,
"loss": 3.2027,
"step": 41
},
{
"epoch": 0.002419982138227075,
"grad_norm": 4.151683807373047,
"learning_rate": 9.986644642427051e-05,
"loss": 3.4948,
"step": 42
},
{
"epoch": 0.0024776007605658147,
"grad_norm": 4.244364261627197,
"learning_rate": 9.985913319397109e-05,
"loss": 3.1722,
"step": 43
},
{
"epoch": 0.002535219382904555,
"grad_norm": 4.407247066497803,
"learning_rate": 9.985162531561174e-05,
"loss": 3.2968,
"step": 44
},
{
"epoch": 0.0025928380052432946,
"grad_norm": 4.8157758712768555,
"learning_rate": 9.984392281850293e-05,
"loss": 3.5925,
"step": 45
},
{
"epoch": 0.0026504566275820343,
"grad_norm": 5.310198783874512,
"learning_rate": 9.983602573271485e-05,
"loss": 3.5636,
"step": 46
},
{
"epoch": 0.0027080752499207745,
"grad_norm": 5.916202068328857,
"learning_rate": 9.982793408907747e-05,
"loss": 3.862,
"step": 47
},
{
"epoch": 0.0027656938722595142,
"grad_norm": 5.015003204345703,
"learning_rate": 9.981964791918021e-05,
"loss": 3.1259,
"step": 48
},
{
"epoch": 0.002823312494598254,
"grad_norm": 4.756880283355713,
"learning_rate": 9.981116725537194e-05,
"loss": 2.6775,
"step": 49
},
{
"epoch": 0.002880931116936994,
"grad_norm": 6.32490873336792,
"learning_rate": 9.980249213076084e-05,
"loss": 3.059,
"step": 50
},
{
"epoch": 0.002938549739275734,
"grad_norm": 2.6211915016174316,
"learning_rate": 9.979362257921427e-05,
"loss": 2.3427,
"step": 51
},
{
"epoch": 0.0029961683616144736,
"grad_norm": 2.643610715866089,
"learning_rate": 9.978455863535859e-05,
"loss": 2.3558,
"step": 52
},
{
"epoch": 0.003053786983953214,
"grad_norm": 2.3332371711730957,
"learning_rate": 9.977530033457905e-05,
"loss": 2.5307,
"step": 53
},
{
"epoch": 0.0031114056062919535,
"grad_norm": 2.3695240020751953,
"learning_rate": 9.97658477130197e-05,
"loss": 2.4889,
"step": 54
},
{
"epoch": 0.0031690242286306933,
"grad_norm": 2.4462778568267822,
"learning_rate": 9.97562008075832e-05,
"loss": 2.3148,
"step": 55
},
{
"epoch": 0.0032266428509694334,
"grad_norm": 2.4437859058380127,
"learning_rate": 9.97463596559307e-05,
"loss": 2.7512,
"step": 56
},
{
"epoch": 0.003284261473308173,
"grad_norm": 2.3088741302490234,
"learning_rate": 9.973632429648165e-05,
"loss": 3.4269,
"step": 57
},
{
"epoch": 0.003341880095646913,
"grad_norm": 2.081118106842041,
"learning_rate": 9.972609476841367e-05,
"loss": 2.3737,
"step": 58
},
{
"epoch": 0.003399498717985653,
"grad_norm": 2.6344411373138428,
"learning_rate": 9.971567111166246e-05,
"loss": 2.8487,
"step": 59
},
{
"epoch": 0.003457117340324393,
"grad_norm": 2.481696367263794,
"learning_rate": 9.970505336692153e-05,
"loss": 2.6414,
"step": 60
},
{
"epoch": 0.0035147359626631325,
"grad_norm": 2.6268556118011475,
"learning_rate": 9.969424157564215e-05,
"loss": 2.4161,
"step": 61
},
{
"epoch": 0.0035723545850018727,
"grad_norm": 2.807784080505371,
"learning_rate": 9.968323578003311e-05,
"loss": 2.6353,
"step": 62
},
{
"epoch": 0.0036299732073406124,
"grad_norm": 2.64215350151062,
"learning_rate": 9.967203602306061e-05,
"loss": 2.7837,
"step": 63
},
{
"epoch": 0.003687591829679352,
"grad_norm": 2.6400628089904785,
"learning_rate": 9.966064234844804e-05,
"loss": 2.6833,
"step": 64
},
{
"epoch": 0.0037452104520180923,
"grad_norm": 2.4792282581329346,
"learning_rate": 9.964905480067586e-05,
"loss": 2.54,
"step": 65
},
{
"epoch": 0.003802829074356832,
"grad_norm": 2.542963743209839,
"learning_rate": 9.963727342498137e-05,
"loss": 2.2336,
"step": 66
},
{
"epoch": 0.003860447696695572,
"grad_norm": 3.173456907272339,
"learning_rate": 9.96252982673586e-05,
"loss": 3.2126,
"step": 67
},
{
"epoch": 0.0039180663190343116,
"grad_norm": 2.9265830516815186,
"learning_rate": 9.961312937455811e-05,
"loss": 2.6366,
"step": 68
},
{
"epoch": 0.003975684941373052,
"grad_norm": 3.8010411262512207,
"learning_rate": 9.960076679408674e-05,
"loss": 3.7591,
"step": 69
},
{
"epoch": 0.004033303563711792,
"grad_norm": 3.1719210147857666,
"learning_rate": 9.958821057420754e-05,
"loss": 3.1925,
"step": 70
},
{
"epoch": 0.004090922186050531,
"grad_norm": 3.8704843521118164,
"learning_rate": 9.957546076393943e-05,
"loss": 3.1536,
"step": 71
},
{
"epoch": 0.004148540808389271,
"grad_norm": 4.0394206047058105,
"learning_rate": 9.956251741305722e-05,
"loss": 2.8024,
"step": 72
},
{
"epoch": 0.0042061594307280115,
"grad_norm": 3.980168581008911,
"learning_rate": 9.954938057209121e-05,
"loss": 3.2307,
"step": 73
},
{
"epoch": 0.004263778053066751,
"grad_norm": 4.318230628967285,
"learning_rate": 9.953605029232711e-05,
"loss": 3.267,
"step": 74
},
{
"epoch": 0.004321396675405491,
"grad_norm": 3.24552845954895,
"learning_rate": 9.952252662580579e-05,
"loss": 2.5186,
"step": 75
},
{
"epoch": 0.004379015297744231,
"grad_norm": 2.9519386291503906,
"learning_rate": 9.950880962532309e-05,
"loss": 2.6051,
"step": 76
},
{
"epoch": 0.0044366339200829705,
"grad_norm": 3.490110397338867,
"learning_rate": 9.949489934442966e-05,
"loss": 3.2147,
"step": 77
},
{
"epoch": 0.004494252542421711,
"grad_norm": 3.4307446479797363,
"learning_rate": 9.948079583743067e-05,
"loss": 2.8519,
"step": 78
},
{
"epoch": 0.004551871164760451,
"grad_norm": 3.4160594940185547,
"learning_rate": 9.946649915938562e-05,
"loss": 3.0261,
"step": 79
},
{
"epoch": 0.00460948978709919,
"grad_norm": 3.2356300354003906,
"learning_rate": 9.94520093661082e-05,
"loss": 2.7251,
"step": 80
},
{
"epoch": 0.00466710840943793,
"grad_norm": 3.441582679748535,
"learning_rate": 9.943732651416597e-05,
"loss": 2.7831,
"step": 81
},
{
"epoch": 0.0047247270317766705,
"grad_norm": 3.8920841217041016,
"learning_rate": 9.942245066088021e-05,
"loss": 2.8823,
"step": 82
},
{
"epoch": 0.00478234565411541,
"grad_norm": 3.640735387802124,
"learning_rate": 9.940738186432565e-05,
"loss": 3.2526,
"step": 83
},
{
"epoch": 0.00483996427645415,
"grad_norm": 3.713043451309204,
"learning_rate": 9.939212018333023e-05,
"loss": 2.6078,
"step": 84
},
{
"epoch": 0.00489758289879289,
"grad_norm": 3.653604745864868,
"learning_rate": 9.937666567747501e-05,
"loss": 2.806,
"step": 85
},
{
"epoch": 0.004955201521131629,
"grad_norm": 3.635037422180176,
"learning_rate": 9.936101840709372e-05,
"loss": 2.8679,
"step": 86
},
{
"epoch": 0.0050128201434703696,
"grad_norm": 3.9301633834838867,
"learning_rate": 9.934517843327269e-05,
"loss": 3.2838,
"step": 87
},
{
"epoch": 0.00507043876580911,
"grad_norm": 3.7741904258728027,
"learning_rate": 9.932914581785052e-05,
"loss": 2.4931,
"step": 88
},
{
"epoch": 0.005128057388147849,
"grad_norm": 3.7472801208496094,
"learning_rate": 9.931292062341793e-05,
"loss": 2.7034,
"step": 89
},
{
"epoch": 0.005185676010486589,
"grad_norm": 4.327390193939209,
"learning_rate": 9.92965029133174e-05,
"loss": 3.1266,
"step": 90
},
{
"epoch": 0.005243294632825329,
"grad_norm": 3.712338924407959,
"learning_rate": 9.927989275164305e-05,
"loss": 2.4149,
"step": 91
},
{
"epoch": 0.005300913255164069,
"grad_norm": 4.686365604400635,
"learning_rate": 9.926309020324025e-05,
"loss": 3.0106,
"step": 92
},
{
"epoch": 0.005358531877502809,
"grad_norm": 5.173581123352051,
"learning_rate": 9.924609533370551e-05,
"loss": 3.336,
"step": 93
},
{
"epoch": 0.005416150499841549,
"grad_norm": 4.213613510131836,
"learning_rate": 9.922890820938608e-05,
"loss": 2.7551,
"step": 94
},
{
"epoch": 0.005473769122180288,
"grad_norm": 4.356773376464844,
"learning_rate": 9.921152889737984e-05,
"loss": 2.8447,
"step": 95
},
{
"epoch": 0.0055313877445190285,
"grad_norm": 5.500849723815918,
"learning_rate": 9.919395746553493e-05,
"loss": 2.4939,
"step": 96
},
{
"epoch": 0.005589006366857769,
"grad_norm": 4.978060245513916,
"learning_rate": 9.917619398244949e-05,
"loss": 2.5015,
"step": 97
},
{
"epoch": 0.005646624989196508,
"grad_norm": 5.546128273010254,
"learning_rate": 9.915823851747144e-05,
"loss": 2.8716,
"step": 98
},
{
"epoch": 0.005704243611535248,
"grad_norm": 5.834244251251221,
"learning_rate": 9.914009114069824e-05,
"loss": 3.063,
"step": 99
},
{
"epoch": 0.005761862233873988,
"grad_norm": 6.950061798095703,
"learning_rate": 9.912175192297648e-05,
"loss": 2.7757,
"step": 100
},
{
"epoch": 0.005819480856212728,
"grad_norm": 1.8343175649642944,
"learning_rate": 9.910322093590177e-05,
"loss": 1.9624,
"step": 101
},
{
"epoch": 0.005877099478551468,
"grad_norm": 2.0297634601593018,
"learning_rate": 9.908449825181829e-05,
"loss": 2.0552,
"step": 102
},
{
"epoch": 0.005934718100890208,
"grad_norm": 2.003079891204834,
"learning_rate": 9.90655839438187e-05,
"loss": 2.4062,
"step": 103
},
{
"epoch": 0.005992336723228947,
"grad_norm": 2.204677104949951,
"learning_rate": 9.90464780857437e-05,
"loss": 2.5236,
"step": 104
},
{
"epoch": 0.006049955345567687,
"grad_norm": 2.188649892807007,
"learning_rate": 9.902718075218176e-05,
"loss": 2.658,
"step": 105
},
{
"epoch": 0.006107573967906428,
"grad_norm": 2.1146440505981445,
"learning_rate": 9.90076920184689e-05,
"loss": 2.0561,
"step": 106
},
{
"epoch": 0.006165192590245167,
"grad_norm": 2.3634395599365234,
"learning_rate": 9.898801196068839e-05,
"loss": 2.5247,
"step": 107
},
{
"epoch": 0.006222811212583907,
"grad_norm": 2.4181549549102783,
"learning_rate": 9.896814065567036e-05,
"loss": 2.778,
"step": 108
},
{
"epoch": 0.006280429834922647,
"grad_norm": 2.7379238605499268,
"learning_rate": 9.89480781809916e-05,
"loss": 2.8723,
"step": 109
},
{
"epoch": 0.0063380484572613865,
"grad_norm": 3.001932144165039,
"learning_rate": 9.89278246149752e-05,
"loss": 3.555,
"step": 110
},
{
"epoch": 0.006395667079600127,
"grad_norm": 2.9997830390930176,
"learning_rate": 9.890738003669029e-05,
"loss": 2.9381,
"step": 111
},
{
"epoch": 0.006453285701938867,
"grad_norm": 2.6716039180755615,
"learning_rate": 9.888674452595166e-05,
"loss": 2.0351,
"step": 112
},
{
"epoch": 0.006510904324277606,
"grad_norm": 2.534346103668213,
"learning_rate": 9.886591816331954e-05,
"loss": 1.9597,
"step": 113
},
{
"epoch": 0.006568522946616346,
"grad_norm": 3.33077073097229,
"learning_rate": 9.88449010300992e-05,
"loss": 3.1284,
"step": 114
},
{
"epoch": 0.0066261415689550865,
"grad_norm": 3.4588513374328613,
"learning_rate": 9.882369320834069e-05,
"loss": 3.2817,
"step": 115
},
{
"epoch": 0.006683760191293826,
"grad_norm": 3.3376035690307617,
"learning_rate": 9.880229478083849e-05,
"loss": 2.8016,
"step": 116
},
{
"epoch": 0.006741378813632566,
"grad_norm": 4.0341596603393555,
"learning_rate": 9.878070583113123e-05,
"loss": 3.1014,
"step": 117
},
{
"epoch": 0.006798997435971306,
"grad_norm": 4.1889472007751465,
"learning_rate": 9.875892644350128e-05,
"loss": 3.0304,
"step": 118
},
{
"epoch": 0.006856616058310045,
"grad_norm": 3.9455606937408447,
"learning_rate": 9.87369567029745e-05,
"loss": 2.5678,
"step": 119
},
{
"epoch": 0.006914234680648786,
"grad_norm": 4.282383441925049,
"learning_rate": 9.87147966953199e-05,
"loss": 3.1004,
"step": 120
},
{
"epoch": 0.006971853302987526,
"grad_norm": 4.943022727966309,
"learning_rate": 9.869244650704923e-05,
"loss": 3.2788,
"step": 121
},
{
"epoch": 0.007029471925326265,
"grad_norm": 3.714052438735962,
"learning_rate": 9.866990622541677e-05,
"loss": 2.6955,
"step": 122
},
{
"epoch": 0.007087090547665005,
"grad_norm": 3.9254062175750732,
"learning_rate": 9.864717593841883e-05,
"loss": 2.6311,
"step": 123
},
{
"epoch": 0.007144709170003745,
"grad_norm": 3.8025972843170166,
"learning_rate": 9.862425573479357e-05,
"loss": 3.1083,
"step": 124
},
{
"epoch": 0.007202327792342485,
"grad_norm": 3.8078110218048096,
"learning_rate": 9.860114570402054e-05,
"loss": 2.8676,
"step": 125
},
{
"epoch": 0.007259946414681225,
"grad_norm": 2.980020046234131,
"learning_rate": 9.857784593632038e-05,
"loss": 2.1847,
"step": 126
},
{
"epoch": 0.007317565037019965,
"grad_norm": 3.267343759536743,
"learning_rate": 9.855435652265446e-05,
"loss": 2.2152,
"step": 127
},
{
"epoch": 0.007375183659358704,
"grad_norm": 3.8258607387542725,
"learning_rate": 9.853067755472446e-05,
"loss": 2.7722,
"step": 128
},
{
"epoch": 0.0074328022816974445,
"grad_norm": 3.919753074645996,
"learning_rate": 9.85068091249722e-05,
"loss": 2.911,
"step": 129
},
{
"epoch": 0.007490420904036185,
"grad_norm": 3.7234082221984863,
"learning_rate": 9.848275132657903e-05,
"loss": 2.7702,
"step": 130
},
{
"epoch": 0.007548039526374924,
"grad_norm": 3.76906681060791,
"learning_rate": 9.845850425346563e-05,
"loss": 2.5896,
"step": 131
},
{
"epoch": 0.007605658148713664,
"grad_norm": 4.047483444213867,
"learning_rate": 9.84340680002916e-05,
"loss": 2.7983,
"step": 132
},
{
"epoch": 0.007663276771052404,
"grad_norm": 4.183981418609619,
"learning_rate": 9.840944266245511e-05,
"loss": 2.822,
"step": 133
},
{
"epoch": 0.007720895393391144,
"grad_norm": 4.168220520019531,
"learning_rate": 9.838462833609248e-05,
"loss": 2.6539,
"step": 134
},
{
"epoch": 0.007778514015729884,
"grad_norm": 4.284755229949951,
"learning_rate": 9.835962511807786e-05,
"loss": 2.8802,
"step": 135
},
{
"epoch": 0.007836132638068623,
"grad_norm": 5.03453254699707,
"learning_rate": 9.83344331060228e-05,
"loss": 3.2416,
"step": 136
},
{
"epoch": 0.007893751260407363,
"grad_norm": 4.05511474609375,
"learning_rate": 9.830905239827593e-05,
"loss": 2.4886,
"step": 137
},
{
"epoch": 0.007951369882746103,
"grad_norm": 4.093688011169434,
"learning_rate": 9.828348309392247e-05,
"loss": 2.6646,
"step": 138
},
{
"epoch": 0.008008988505084844,
"grad_norm": 4.352829456329346,
"learning_rate": 9.825772529278401e-05,
"loss": 2.9977,
"step": 139
},
{
"epoch": 0.008066607127423584,
"grad_norm": 4.768601894378662,
"learning_rate": 9.823177909541794e-05,
"loss": 2.795,
"step": 140
},
{
"epoch": 0.008124225749762324,
"grad_norm": 5.152933597564697,
"learning_rate": 9.820564460311718e-05,
"loss": 3.5057,
"step": 141
},
{
"epoch": 0.008181844372101062,
"grad_norm": 4.4145636558532715,
"learning_rate": 9.817932191790978e-05,
"loss": 2.9607,
"step": 142
},
{
"epoch": 0.008239462994439803,
"grad_norm": 4.511148929595947,
"learning_rate": 9.815281114255841e-05,
"loss": 2.6359,
"step": 143
},
{
"epoch": 0.008297081616778543,
"grad_norm": 4.265496253967285,
"learning_rate": 9.812611238056009e-05,
"loss": 2.4778,
"step": 144
},
{
"epoch": 0.008354700239117283,
"grad_norm": 5.055487632751465,
"learning_rate": 9.809922573614569e-05,
"loss": 2.7199,
"step": 145
},
{
"epoch": 0.008412318861456023,
"grad_norm": 5.503633499145508,
"learning_rate": 9.807215131427965e-05,
"loss": 2.7841,
"step": 146
},
{
"epoch": 0.008469937483794763,
"grad_norm": 4.971359729766846,
"learning_rate": 9.804488922065937e-05,
"loss": 2.0263,
"step": 147
},
{
"epoch": 0.008527556106133502,
"grad_norm": 6.217228889465332,
"learning_rate": 9.801743956171501e-05,
"loss": 2.9825,
"step": 148
},
{
"epoch": 0.008585174728472242,
"grad_norm": 5.770811080932617,
"learning_rate": 9.798980244460893e-05,
"loss": 2.2995,
"step": 149
},
{
"epoch": 0.008642793350810982,
"grad_norm": 6.941093921661377,
"learning_rate": 9.796197797723532e-05,
"loss": 2.9117,
"step": 150
},
{
"epoch": 0.008700411973149722,
"grad_norm": 1.8242151737213135,
"learning_rate": 9.79339662682198e-05,
"loss": 2.0028,
"step": 151
},
{
"epoch": 0.008758030595488462,
"grad_norm": 1.891627550125122,
"learning_rate": 9.790576742691895e-05,
"loss": 2.1467,
"step": 152
},
{
"epoch": 0.008815649217827203,
"grad_norm": 2.101698875427246,
"learning_rate": 9.787738156341992e-05,
"loss": 2.3466,
"step": 153
},
{
"epoch": 0.008873267840165941,
"grad_norm": 2.0690152645111084,
"learning_rate": 9.784880878854e-05,
"loss": 2.0513,
"step": 154
},
{
"epoch": 0.008930886462504681,
"grad_norm": 2.461887836456299,
"learning_rate": 9.782004921382612e-05,
"loss": 2.4875,
"step": 155
},
{
"epoch": 0.008988505084843421,
"grad_norm": 2.3017547130584717,
"learning_rate": 9.779110295155456e-05,
"loss": 2.2959,
"step": 156
},
{
"epoch": 0.009046123707182161,
"grad_norm": 2.422527551651001,
"learning_rate": 9.776197011473033e-05,
"loss": 2.163,
"step": 157
},
{
"epoch": 0.009103742329520902,
"grad_norm": 2.4951605796813965,
"learning_rate": 9.773265081708687e-05,
"loss": 2.2408,
"step": 158
},
{
"epoch": 0.009161360951859642,
"grad_norm": 2.622241735458374,
"learning_rate": 9.770314517308554e-05,
"loss": 2.3679,
"step": 159
},
{
"epoch": 0.00921897957419838,
"grad_norm": 2.9750092029571533,
"learning_rate": 9.767345329791522e-05,
"loss": 3.003,
"step": 160
},
{
"epoch": 0.00927659819653712,
"grad_norm": 2.5526888370513916,
"learning_rate": 9.764357530749178e-05,
"loss": 2.405,
"step": 161
},
{
"epoch": 0.00933421681887586,
"grad_norm": 3.0718305110931396,
"learning_rate": 9.761351131845768e-05,
"loss": 2.6643,
"step": 162
},
{
"epoch": 0.0093918354412146,
"grad_norm": 3.5400822162628174,
"learning_rate": 9.758326144818155e-05,
"loss": 3.3399,
"step": 163
},
{
"epoch": 0.009449454063553341,
"grad_norm": 3.133429527282715,
"learning_rate": 9.755282581475769e-05,
"loss": 2.4404,
"step": 164
},
{
"epoch": 0.009507072685892081,
"grad_norm": 3.485311269760132,
"learning_rate": 9.752220453700556e-05,
"loss": 2.4284,
"step": 165
},
{
"epoch": 0.00956469130823082,
"grad_norm": 3.5158541202545166,
"learning_rate": 9.749139773446943e-05,
"loss": 2.6258,
"step": 166
},
{
"epoch": 0.00962230993056956,
"grad_norm": 3.68678617477417,
"learning_rate": 9.74604055274178e-05,
"loss": 2.9991,
"step": 167
},
{
"epoch": 0.0096799285529083,
"grad_norm": 3.9318833351135254,
"learning_rate": 9.742922803684302e-05,
"loss": 2.8223,
"step": 168
},
{
"epoch": 0.00973754717524704,
"grad_norm": 3.950181484222412,
"learning_rate": 9.739786538446076e-05,
"loss": 2.7101,
"step": 169
},
{
"epoch": 0.00979516579758578,
"grad_norm": 4.023955821990967,
"learning_rate": 9.736631769270957e-05,
"loss": 2.6821,
"step": 170
},
{
"epoch": 0.00985278441992452,
"grad_norm": 3.3458447456359863,
"learning_rate": 9.733458508475037e-05,
"loss": 2.2959,
"step": 171
},
{
"epoch": 0.009910403042263259,
"grad_norm": 4.930412292480469,
"learning_rate": 9.730266768446598e-05,
"loss": 2.9204,
"step": 172
},
{
"epoch": 0.009968021664601999,
"grad_norm": 3.9783055782318115,
"learning_rate": 9.727056561646066e-05,
"loss": 2.9848,
"step": 173
},
{
"epoch": 0.010025640286940739,
"grad_norm": 3.118838310241699,
"learning_rate": 9.723827900605962e-05,
"loss": 2.0156,
"step": 174
},
{
"epoch": 0.01008325890927948,
"grad_norm": 3.872734546661377,
"learning_rate": 9.720580797930845e-05,
"loss": 3.6258,
"step": 175
},
{
"epoch": 0.01014087753161822,
"grad_norm": 3.6694111824035645,
"learning_rate": 9.717315266297277e-05,
"loss": 2.5537,
"step": 176
},
{
"epoch": 0.01019849615395696,
"grad_norm": 4.3984479904174805,
"learning_rate": 9.714031318453764e-05,
"loss": 3.4154,
"step": 177
},
{
"epoch": 0.010256114776295698,
"grad_norm": 4.380631923675537,
"learning_rate": 9.710728967220704e-05,
"loss": 2.8647,
"step": 178
},
{
"epoch": 0.010313733398634438,
"grad_norm": 3.477525472640991,
"learning_rate": 9.707408225490344e-05,
"loss": 2.0669,
"step": 179
},
{
"epoch": 0.010371352020973178,
"grad_norm": 4.659182071685791,
"learning_rate": 9.704069106226727e-05,
"loss": 3.2767,
"step": 180
},
{
"epoch": 0.010428970643311919,
"grad_norm": 4.009170055389404,
"learning_rate": 9.700711622465643e-05,
"loss": 2.8133,
"step": 181
},
{
"epoch": 0.010486589265650659,
"grad_norm": 4.426009178161621,
"learning_rate": 9.697335787314573e-05,
"loss": 3.0768,
"step": 182
},
{
"epoch": 0.010544207887989399,
"grad_norm": 3.9654734134674072,
"learning_rate": 9.693941613952642e-05,
"loss": 2.6296,
"step": 183
},
{
"epoch": 0.010601826510328137,
"grad_norm": 4.665240287780762,
"learning_rate": 9.690529115630567e-05,
"loss": 3.0582,
"step": 184
},
{
"epoch": 0.010659445132666878,
"grad_norm": 5.048788547515869,
"learning_rate": 9.687098305670605e-05,
"loss": 3.1614,
"step": 185
},
{
"epoch": 0.010717063755005618,
"grad_norm": 4.283369064331055,
"learning_rate": 9.6836491974665e-05,
"loss": 2.3588,
"step": 186
},
{
"epoch": 0.010774682377344358,
"grad_norm": 4.863002777099609,
"learning_rate": 9.680181804483434e-05,
"loss": 2.9541,
"step": 187
},
{
"epoch": 0.010832300999683098,
"grad_norm": 5.098318099975586,
"learning_rate": 9.676696140257969e-05,
"loss": 3.0377,
"step": 188
},
{
"epoch": 0.010889919622021838,
"grad_norm": 4.282346725463867,
"learning_rate": 9.673192218398e-05,
"loss": 2.3892,
"step": 189
},
{
"epoch": 0.010947538244360577,
"grad_norm": 4.450390338897705,
"learning_rate": 9.669670052582695e-05,
"loss": 2.6212,
"step": 190
},
{
"epoch": 0.011005156866699317,
"grad_norm": 5.0615105628967285,
"learning_rate": 9.66612965656245e-05,
"loss": 3.0058,
"step": 191
},
{
"epoch": 0.011062775489038057,
"grad_norm": 5.281835556030273,
"learning_rate": 9.662571044158831e-05,
"loss": 2.7308,
"step": 192
},
{
"epoch": 0.011120394111376797,
"grad_norm": 5.784670829772949,
"learning_rate": 9.658994229264514e-05,
"loss": 2.4602,
"step": 193
},
{
"epoch": 0.011178012733715537,
"grad_norm": 4.999934196472168,
"learning_rate": 9.655399225843245e-05,
"loss": 2.3398,
"step": 194
},
{
"epoch": 0.011235631356054277,
"grad_norm": 5.754208087921143,
"learning_rate": 9.651786047929773e-05,
"loss": 2.5287,
"step": 195
},
{
"epoch": 0.011293249978393016,
"grad_norm": 5.426493167877197,
"learning_rate": 9.648154709629798e-05,
"loss": 2.2349,
"step": 196
},
{
"epoch": 0.011350868600731756,
"grad_norm": 5.01149320602417,
"learning_rate": 9.644505225119922e-05,
"loss": 2.5316,
"step": 197
},
{
"epoch": 0.011408487223070496,
"grad_norm": 5.903430938720703,
"learning_rate": 9.640837608647583e-05,
"loss": 2.7452,
"step": 198
},
{
"epoch": 0.011466105845409236,
"grad_norm": 6.292691230773926,
"learning_rate": 9.637151874531014e-05,
"loss": 2.7084,
"step": 199
},
{
"epoch": 0.011523724467747977,
"grad_norm": 7.137519359588623,
"learning_rate": 9.633448037159167e-05,
"loss": 2.4228,
"step": 200
},
{
"epoch": 0.011581343090086717,
"grad_norm": 1.7003270387649536,
"learning_rate": 9.62972611099168e-05,
"loss": 1.6855,
"step": 201
},
{
"epoch": 0.011638961712425455,
"grad_norm": 4.47566556930542,
"learning_rate": 9.625986110558801e-05,
"loss": 2.2081,
"step": 202
},
{
"epoch": 0.011696580334764195,
"grad_norm": 2.09238338470459,
"learning_rate": 9.622228050461343e-05,
"loss": 2.0333,
"step": 203
},
{
"epoch": 0.011754198957102936,
"grad_norm": 2.22066068649292,
"learning_rate": 9.618451945370622e-05,
"loss": 2.1817,
"step": 204
},
{
"epoch": 0.011811817579441676,
"grad_norm": 2.3827202320098877,
"learning_rate": 9.614657810028402e-05,
"loss": 2.2157,
"step": 205
},
{
"epoch": 0.011869436201780416,
"grad_norm": 2.484971523284912,
"learning_rate": 9.610845659246834e-05,
"loss": 2.4739,
"step": 206
},
{
"epoch": 0.011927054824119156,
"grad_norm": 2.639516592025757,
"learning_rate": 9.607015507908401e-05,
"loss": 2.6742,
"step": 207
},
{
"epoch": 0.011984673446457894,
"grad_norm": 2.6520190238952637,
"learning_rate": 9.603167370965865e-05,
"loss": 2.6892,
"step": 208
},
{
"epoch": 0.012042292068796635,
"grad_norm": 2.614192008972168,
"learning_rate": 9.599301263442192e-05,
"loss": 1.819,
"step": 209
},
{
"epoch": 0.012099910691135375,
"grad_norm": 2.7750024795532227,
"learning_rate": 9.595417200430516e-05,
"loss": 2.1509,
"step": 210
},
{
"epoch": 0.012157529313474115,
"grad_norm": 3.437013864517212,
"learning_rate": 9.591515197094064e-05,
"loss": 2.9428,
"step": 211
},
{
"epoch": 0.012215147935812855,
"grad_norm": 2.887683629989624,
"learning_rate": 9.587595268666099e-05,
"loss": 2.2205,
"step": 212
},
{
"epoch": 0.012272766558151595,
"grad_norm": 3.6182291507720947,
"learning_rate": 9.583657430449862e-05,
"loss": 3.6458,
"step": 213
},
{
"epoch": 0.012330385180490334,
"grad_norm": 3.5853371620178223,
"learning_rate": 9.579701697818519e-05,
"loss": 2.6575,
"step": 214
},
{
"epoch": 0.012388003802829074,
"grad_norm": 3.5563466548919678,
"learning_rate": 9.575728086215092e-05,
"loss": 3.2549,
"step": 215
},
{
"epoch": 0.012445622425167814,
"grad_norm": 3.733488082885742,
"learning_rate": 9.571736611152402e-05,
"loss": 2.8883,
"step": 216
},
{
"epoch": 0.012503241047506554,
"grad_norm": 4.015854835510254,
"learning_rate": 9.567727288213005e-05,
"loss": 3.2983,
"step": 217
},
{
"epoch": 0.012560859669845294,
"grad_norm": 4.0755085945129395,
"learning_rate": 9.563700133049139e-05,
"loss": 2.8935,
"step": 218
},
{
"epoch": 0.012618478292184035,
"grad_norm": 4.986447811126709,
"learning_rate": 9.559655161382657e-05,
"loss": 2.9864,
"step": 219
},
{
"epoch": 0.012676096914522773,
"grad_norm": 4.987060546875,
"learning_rate": 9.555592389004966e-05,
"loss": 2.7582,
"step": 220
},
{
"epoch": 0.012733715536861513,
"grad_norm": 3.952512502670288,
"learning_rate": 9.551511831776965e-05,
"loss": 2.5057,
"step": 221
},
{
"epoch": 0.012791334159200253,
"grad_norm": 4.00080680847168,
"learning_rate": 9.547413505628991e-05,
"loss": 2.1339,
"step": 222
},
{
"epoch": 0.012848952781538994,
"grad_norm": 4.07069206237793,
"learning_rate": 9.543297426560739e-05,
"loss": 2.3093,
"step": 223
},
{
"epoch": 0.012906571403877734,
"grad_norm": 3.894016742706299,
"learning_rate": 9.53916361064122e-05,
"loss": 2.7302,
"step": 224
},
{
"epoch": 0.012964190026216474,
"grad_norm": 3.509031295776367,
"learning_rate": 9.535012074008687e-05,
"loss": 2.5606,
"step": 225
},
{
"epoch": 0.013021808648555212,
"grad_norm": 4.635789394378662,
"learning_rate": 9.53084283287057e-05,
"loss": 3.213,
"step": 226
},
{
"epoch": 0.013079427270893952,
"grad_norm": 4.012856960296631,
"learning_rate": 9.526655903503423e-05,
"loss": 2.1775,
"step": 227
},
{
"epoch": 0.013137045893232693,
"grad_norm": 3.929569721221924,
"learning_rate": 9.522451302252847e-05,
"loss": 2.8141,
"step": 228
},
{
"epoch": 0.013194664515571433,
"grad_norm": 4.124006271362305,
"learning_rate": 9.518229045533438e-05,
"loss": 2.6621,
"step": 229
},
{
"epoch": 0.013252283137910173,
"grad_norm": 4.289190292358398,
"learning_rate": 9.513989149828718e-05,
"loss": 2.8283,
"step": 230
},
{
"epoch": 0.013309901760248913,
"grad_norm": 4.206380844116211,
"learning_rate": 9.50973163169107e-05,
"loss": 2.326,
"step": 231
},
{
"epoch": 0.013367520382587652,
"grad_norm": 4.703325271606445,
"learning_rate": 9.505456507741675e-05,
"loss": 2.8075,
"step": 232
},
{
"epoch": 0.013425139004926392,
"grad_norm": 4.1731061935424805,
"learning_rate": 9.501163794670444e-05,
"loss": 2.2397,
"step": 233
},
{
"epoch": 0.013482757627265132,
"grad_norm": 5.0699143409729,
"learning_rate": 9.496853509235958e-05,
"loss": 3.0914,
"step": 234
},
{
"epoch": 0.013540376249603872,
"grad_norm": 4.813347816467285,
"learning_rate": 9.492525668265399e-05,
"loss": 2.8118,
"step": 235
},
{
"epoch": 0.013597994871942612,
"grad_norm": 4.370533466339111,
"learning_rate": 9.488180288654485e-05,
"loss": 2.7526,
"step": 236
},
{
"epoch": 0.013655613494281352,
"grad_norm": 4.899754047393799,
"learning_rate": 9.483817387367403e-05,
"loss": 2.4702,
"step": 237
},
{
"epoch": 0.01371323211662009,
"grad_norm": 5.122339248657227,
"learning_rate": 9.479436981436746e-05,
"loss": 3.3428,
"step": 238
},
{
"epoch": 0.013770850738958831,
"grad_norm": 5.317091941833496,
"learning_rate": 9.475039087963442e-05,
"loss": 2.8145,
"step": 239
},
{
"epoch": 0.013828469361297571,
"grad_norm": 5.0421953201293945,
"learning_rate": 9.470623724116692e-05,
"loss": 2.71,
"step": 240
},
{
"epoch": 0.013886087983636311,
"grad_norm": 4.841738224029541,
"learning_rate": 9.4661909071339e-05,
"loss": 2.3204,
"step": 241
},
{
"epoch": 0.013943706605975052,
"grad_norm": 4.821130752563477,
"learning_rate": 9.461740654320607e-05,
"loss": 2.7815,
"step": 242
},
{
"epoch": 0.014001325228313792,
"grad_norm": 5.082713603973389,
"learning_rate": 9.45727298305042e-05,
"loss": 2.7365,
"step": 243
},
{
"epoch": 0.01405894385065253,
"grad_norm": 6.211714267730713,
"learning_rate": 9.452787910764948e-05,
"loss": 2.5038,
"step": 244
},
{
"epoch": 0.01411656247299127,
"grad_norm": 5.550928592681885,
"learning_rate": 9.448285454973738e-05,
"loss": 2.1507,
"step": 245
},
{
"epoch": 0.01417418109533001,
"grad_norm": 5.829604625701904,
"learning_rate": 9.443765633254191e-05,
"loss": 2.7516,
"step": 246
},
{
"epoch": 0.01423179971766875,
"grad_norm": 5.887080192565918,
"learning_rate": 9.439228463251515e-05,
"loss": 2.4283,
"step": 247
},
{
"epoch": 0.01428941834000749,
"grad_norm": 7.108898162841797,
"learning_rate": 9.434673962678638e-05,
"loss": 3.1279,
"step": 248
},
{
"epoch": 0.014347036962346231,
"grad_norm": 5.921107292175293,
"learning_rate": 9.430102149316146e-05,
"loss": 2.0765,
"step": 249
},
{
"epoch": 0.01440465558468497,
"grad_norm": 7.07598352432251,
"learning_rate": 9.425513041012219e-05,
"loss": 2.3393,
"step": 250
},
{
"epoch": 0.01446227420702371,
"grad_norm": 1.9601157903671265,
"learning_rate": 9.420906655682553e-05,
"loss": 1.6525,
"step": 251
},
{
"epoch": 0.01451989282936245,
"grad_norm": 2.1932880878448486,
"learning_rate": 9.416283011310286e-05,
"loss": 1.9939,
"step": 252
},
{
"epoch": 0.01457751145170119,
"grad_norm": 2.458268880844116,
"learning_rate": 9.411642125945945e-05,
"loss": 2.1391,
"step": 253
},
{
"epoch": 0.01463513007403993,
"grad_norm": 2.6609833240509033,
"learning_rate": 9.406984017707361e-05,
"loss": 2.28,
"step": 254
},
{
"epoch": 0.01469274869637867,
"grad_norm": 2.335190773010254,
"learning_rate": 9.402308704779599e-05,
"loss": 1.9871,
"step": 255
},
{
"epoch": 0.014750367318717409,
"grad_norm": 2.8070573806762695,
"learning_rate": 9.397616205414896e-05,
"loss": 2.65,
"step": 256
},
{
"epoch": 0.014807985941056149,
"grad_norm": 2.656156539916992,
"learning_rate": 9.392906537932582e-05,
"loss": 2.2612,
"step": 257
},
{
"epoch": 0.014865604563394889,
"grad_norm": 3.367865562438965,
"learning_rate": 9.388179720719008e-05,
"loss": 3.0792,
"step": 258
},
{
"epoch": 0.01492322318573363,
"grad_norm": 3.022843360900879,
"learning_rate": 9.38343577222748e-05,
"loss": 2.8359,
"step": 259
},
{
"epoch": 0.01498084180807237,
"grad_norm": 3.2513225078582764,
"learning_rate": 9.378674710978185e-05,
"loss": 2.5597,
"step": 260
},
{
"epoch": 0.01503846043041111,
"grad_norm": 2.675574779510498,
"learning_rate": 9.373896555558113e-05,
"loss": 1.8233,
"step": 261
},
{
"epoch": 0.015096079052749848,
"grad_norm": 3.866176128387451,
"learning_rate": 9.369101324620992e-05,
"loss": 2.5793,
"step": 262
},
{
"epoch": 0.015153697675088588,
"grad_norm": 3.9211225509643555,
"learning_rate": 9.364289036887213e-05,
"loss": 2.9929,
"step": 263
},
{
"epoch": 0.015211316297427328,
"grad_norm": 3.8750314712524414,
"learning_rate": 9.359459711143752e-05,
"loss": 2.9654,
"step": 264
},
{
"epoch": 0.015268934919766068,
"grad_norm": 4.259275913238525,
"learning_rate": 9.354613366244108e-05,
"loss": 2.8893,
"step": 265
},
{
"epoch": 0.015326553542104809,
"grad_norm": 4.7938618659973145,
"learning_rate": 9.349750021108211e-05,
"loss": 3.1871,
"step": 266
},
{
"epoch": 0.015384172164443549,
"grad_norm": 3.7825448513031006,
"learning_rate": 9.344869694722372e-05,
"loss": 2.2743,
"step": 267
},
{
"epoch": 0.015441790786782287,
"grad_norm": 5.235022068023682,
"learning_rate": 9.339972406139185e-05,
"loss": 3.0404,
"step": 268
},
{
"epoch": 0.015499409409121027,
"grad_norm": 3.8863797187805176,
"learning_rate": 9.335058174477471e-05,
"loss": 2.6207,
"step": 269
},
{
"epoch": 0.015557028031459768,
"grad_norm": 4.3296966552734375,
"learning_rate": 9.330127018922194e-05,
"loss": 3.2352,
"step": 270
},
{
"epoch": 0.015614646653798508,
"grad_norm": 3.8498082160949707,
"learning_rate": 9.325178958724386e-05,
"loss": 2.2887,
"step": 271
},
{
"epoch": 0.015672265276137246,
"grad_norm": 3.570688009262085,
"learning_rate": 9.320214013201078e-05,
"loss": 2.495,
"step": 272
},
{
"epoch": 0.015729883898475988,
"grad_norm": 4.38509464263916,
"learning_rate": 9.315232201735217e-05,
"loss": 2.3466,
"step": 273
},
{
"epoch": 0.015787502520814727,
"grad_norm": 3.247440814971924,
"learning_rate": 9.310233543775597e-05,
"loss": 2.0713,
"step": 274
},
{
"epoch": 0.01584512114315347,
"grad_norm": 3.9758951663970947,
"learning_rate": 9.305218058836778e-05,
"loss": 2.2145,
"step": 275
},
{
"epoch": 0.015902739765492207,
"grad_norm": 4.604752540588379,
"learning_rate": 9.300185766499012e-05,
"loss": 2.6974,
"step": 276
},
{
"epoch": 0.015960358387830945,
"grad_norm": 4.320189952850342,
"learning_rate": 9.295136686408166e-05,
"loss": 2.1342,
"step": 277
},
{
"epoch": 0.016017977010169687,
"grad_norm": 4.354703426361084,
"learning_rate": 9.290070838275649e-05,
"loss": 2.6596,
"step": 278
},
{
"epoch": 0.016075595632508426,
"grad_norm": 4.107254981994629,
"learning_rate": 9.284988241878326e-05,
"loss": 2.3398,
"step": 279
},
{
"epoch": 0.016133214254847168,
"grad_norm": 3.9640181064605713,
"learning_rate": 9.279888917058452e-05,
"loss": 2.134,
"step": 280
},
{
"epoch": 0.016190832877185906,
"grad_norm": 4.547280311584473,
"learning_rate": 9.274772883723587e-05,
"loss": 2.5721,
"step": 281
},
{
"epoch": 0.016248451499524648,
"grad_norm": 4.294773578643799,
"learning_rate": 9.26964016184652e-05,
"loss": 2.1551,
"step": 282
},
{
"epoch": 0.016306070121863386,
"grad_norm": 4.612068176269531,
"learning_rate": 9.264490771465191e-05,
"loss": 2.8071,
"step": 283
},
{
"epoch": 0.016363688744202125,
"grad_norm": 4.3436384201049805,
"learning_rate": 9.259324732682615e-05,
"loss": 2.5786,
"step": 284
},
{
"epoch": 0.016421307366540867,
"grad_norm": 5.532114505767822,
"learning_rate": 9.254142065666801e-05,
"loss": 3.0383,
"step": 285
},
{
"epoch": 0.016478925988879605,
"grad_norm": 3.9752144813537598,
"learning_rate": 9.248942790650672e-05,
"loss": 1.821,
"step": 286
},
{
"epoch": 0.016536544611218347,
"grad_norm": 4.935574054718018,
"learning_rate": 9.243726927931991e-05,
"loss": 2.646,
"step": 287
},
{
"epoch": 0.016594163233557085,
"grad_norm": 5.788394451141357,
"learning_rate": 9.238494497873273e-05,
"loss": 3.1577,
"step": 288
},
{
"epoch": 0.016651781855895824,
"grad_norm": 4.732761859893799,
"learning_rate": 9.233245520901723e-05,
"loss": 2.1881,
"step": 289
},
{
"epoch": 0.016709400478234566,
"grad_norm": 6.236813068389893,
"learning_rate": 9.22798001750913e-05,
"loss": 2.9188,
"step": 290
},
{
"epoch": 0.016767019100573304,
"grad_norm": 5.6987833976745605,
"learning_rate": 9.222698008251813e-05,
"loss": 2.394,
"step": 291
},
{
"epoch": 0.016824637722912046,
"grad_norm": 5.997603416442871,
"learning_rate": 9.217399513750524e-05,
"loss": 2.3094,
"step": 292
},
{
"epoch": 0.016882256345250785,
"grad_norm": 6.196407318115234,
"learning_rate": 9.21208455469037e-05,
"loss": 2.5313,
"step": 293
},
{
"epoch": 0.016939874967589526,
"grad_norm": 5.318828105926514,
"learning_rate": 9.206753151820742e-05,
"loss": 2.0242,
"step": 294
},
{
"epoch": 0.016997493589928265,
"grad_norm": 5.003267765045166,
"learning_rate": 9.201405325955221e-05,
"loss": 2.1246,
"step": 295
},
{
"epoch": 0.017055112212267003,
"grad_norm": 7.6371941566467285,
"learning_rate": 9.196041097971508e-05,
"loss": 3.0433,
"step": 296
},
{
"epoch": 0.017112730834605745,
"grad_norm": 6.163952827453613,
"learning_rate": 9.190660488811331e-05,
"loss": 2.0825,
"step": 297
},
{
"epoch": 0.017170349456944484,
"grad_norm": 6.5632524490356445,
"learning_rate": 9.185263519480377e-05,
"loss": 2.21,
"step": 298
},
{
"epoch": 0.017227968079283226,
"grad_norm": 6.606075286865234,
"learning_rate": 9.179850211048193e-05,
"loss": 2.0958,
"step": 299
},
{
"epoch": 0.017285586701621964,
"grad_norm": 7.430078506469727,
"learning_rate": 9.174420584648123e-05,
"loss": 1.7992,
"step": 300
},
{
"epoch": 0.017343205323960702,
"grad_norm": 1.8099000453948975,
"learning_rate": 9.168974661477205e-05,
"loss": 1.6688,
"step": 301
},
{
"epoch": 0.017400823946299444,
"grad_norm": 2.2371535301208496,
"learning_rate": 9.163512462796114e-05,
"loss": 1.8612,
"step": 302
},
{
"epoch": 0.017458442568638183,
"grad_norm": 2.3684723377227783,
"learning_rate": 9.158034009929046e-05,
"loss": 1.6848,
"step": 303
},
{
"epoch": 0.017516061190976925,
"grad_norm": 2.5595788955688477,
"learning_rate": 9.152539324263667e-05,
"loss": 2.1883,
"step": 304
},
{
"epoch": 0.017573679813315663,
"grad_norm": 2.551753282546997,
"learning_rate": 9.14702842725101e-05,
"loss": 1.8682,
"step": 305
},
{
"epoch": 0.017631298435654405,
"grad_norm": 2.7641727924346924,
"learning_rate": 9.141501340405394e-05,
"loss": 2.4364,
"step": 306
},
{
"epoch": 0.017688917057993143,
"grad_norm": 2.860710859298706,
"learning_rate": 9.135958085304344e-05,
"loss": 1.9544,
"step": 307
},
{
"epoch": 0.017746535680331882,
"grad_norm": 3.2336370944976807,
"learning_rate": 9.13039868358851e-05,
"loss": 2.507,
"step": 308
},
{
"epoch": 0.017804154302670624,
"grad_norm": 3.377209424972534,
"learning_rate": 9.12482315696157e-05,
"loss": 2.6772,
"step": 309
},
{
"epoch": 0.017861772925009362,
"grad_norm": 2.9247360229492188,
"learning_rate": 9.119231527190158e-05,
"loss": 2.1298,
"step": 310
},
{
"epoch": 0.017919391547348104,
"grad_norm": 3.6007273197174072,
"learning_rate": 9.113623816103773e-05,
"loss": 2.931,
"step": 311
},
{
"epoch": 0.017977010169686843,
"grad_norm": 3.4026882648468018,
"learning_rate": 9.108000045594695e-05,
"loss": 2.3958,
"step": 312
},
{
"epoch": 0.01803462879202558,
"grad_norm": 3.8160836696624756,
"learning_rate": 9.102360237617899e-05,
"loss": 2.3448,
"step": 313
},
{
"epoch": 0.018092247414364323,
"grad_norm": 4.231561183929443,
"learning_rate": 9.096704414190969e-05,
"loss": 2.3933,
"step": 314
},
{
"epoch": 0.01814986603670306,
"grad_norm": 4.625421047210693,
"learning_rate": 9.091032597394012e-05,
"loss": 2.7722,
"step": 315
},
{
"epoch": 0.018207484659041803,
"grad_norm": 3.672778367996216,
"learning_rate": 9.085344809369575e-05,
"loss": 2.6004,
"step": 316
},
{
"epoch": 0.01826510328138054,
"grad_norm": 4.438925743103027,
"learning_rate": 9.079641072322556e-05,
"loss": 2.7003,
"step": 317
},
{
"epoch": 0.018322721903719284,
"grad_norm": 4.589539051055908,
"learning_rate": 9.073921408520115e-05,
"loss": 2.6641,
"step": 318
},
{
"epoch": 0.018380340526058022,
"grad_norm": 4.870488166809082,
"learning_rate": 9.068185840291588e-05,
"loss": 2.9457,
"step": 319
},
{
"epoch": 0.01843795914839676,
"grad_norm": 4.230988025665283,
"learning_rate": 9.062434390028407e-05,
"loss": 2.8592,
"step": 320
},
{
"epoch": 0.018495577770735502,
"grad_norm": 5.677562236785889,
"learning_rate": 9.056667080184003e-05,
"loss": 3.0967,
"step": 321
},
{
"epoch": 0.01855319639307424,
"grad_norm": 4.379852294921875,
"learning_rate": 9.050883933273722e-05,
"loss": 2.1904,
"step": 322
},
{
"epoch": 0.018610815015412983,
"grad_norm": 4.194091796875,
"learning_rate": 9.045084971874738e-05,
"loss": 2.5451,
"step": 323
},
{
"epoch": 0.01866843363775172,
"grad_norm": 3.8151166439056396,
"learning_rate": 9.039270218625964e-05,
"loss": 2.2764,
"step": 324
},
{
"epoch": 0.01872605226009046,
"grad_norm": 4.090627193450928,
"learning_rate": 9.033439696227965e-05,
"loss": 1.7737,
"step": 325
},
{
"epoch": 0.0187836708824292,
"grad_norm": 4.154264450073242,
"learning_rate": 9.027593427442867e-05,
"loss": 2.2204,
"step": 326
},
{
"epoch": 0.01884128950476794,
"grad_norm": 4.772518634796143,
"learning_rate": 9.021731435094268e-05,
"loss": 2.8687,
"step": 327
},
{
"epoch": 0.018898908127106682,
"grad_norm": 4.55942440032959,
"learning_rate": 9.015853742067152e-05,
"loss": 2.4891,
"step": 328
},
{
"epoch": 0.01895652674944542,
"grad_norm": 4.553669452667236,
"learning_rate": 9.009960371307798e-05,
"loss": 3.0723,
"step": 329
},
{
"epoch": 0.019014145371784162,
"grad_norm": 4.261239051818848,
"learning_rate": 9.004051345823689e-05,
"loss": 2.0938,
"step": 330
},
{
"epoch": 0.0190717639941229,
"grad_norm": 4.823665618896484,
"learning_rate": 8.998126688683422e-05,
"loss": 2.3527,
"step": 331
},
{
"epoch": 0.01912938261646164,
"grad_norm": 4.412746906280518,
"learning_rate": 8.992186423016625e-05,
"loss": 2.1589,
"step": 332
},
{
"epoch": 0.01918700123880038,
"grad_norm": 4.935317516326904,
"learning_rate": 8.986230572013855e-05,
"loss": 2.5959,
"step": 333
},
{
"epoch": 0.01924461986113912,
"grad_norm": 4.818447589874268,
"learning_rate": 8.980259158926516e-05,
"loss": 2.2899,
"step": 334
},
{
"epoch": 0.01930223848347786,
"grad_norm": 5.081963539123535,
"learning_rate": 8.974272207066767e-05,
"loss": 2.5,
"step": 335
},
{
"epoch": 0.0193598571058166,
"grad_norm": 5.195316791534424,
"learning_rate": 8.968269739807427e-05,
"loss": 2.8518,
"step": 336
},
{
"epoch": 0.019417475728155338,
"grad_norm": 5.709744453430176,
"learning_rate": 8.962251780581887e-05,
"loss": 2.93,
"step": 337
},
{
"epoch": 0.01947509435049408,
"grad_norm": 6.777749538421631,
"learning_rate": 8.956218352884022e-05,
"loss": 2.4954,
"step": 338
},
{
"epoch": 0.01953271297283282,
"grad_norm": 5.281974792480469,
"learning_rate": 8.95016948026809e-05,
"loss": 2.9115,
"step": 339
},
{
"epoch": 0.01959033159517156,
"grad_norm": 4.8840508460998535,
"learning_rate": 8.944105186348646e-05,
"loss": 2.5028,
"step": 340
},
{
"epoch": 0.0196479502175103,
"grad_norm": 5.254883289337158,
"learning_rate": 8.938025494800454e-05,
"loss": 2.399,
"step": 341
},
{
"epoch": 0.01970556883984904,
"grad_norm": 5.181918621063232,
"learning_rate": 8.931930429358384e-05,
"loss": 2.2228,
"step": 342
},
{
"epoch": 0.01976318746218778,
"grad_norm": 5.61979866027832,
"learning_rate": 8.925820013817329e-05,
"loss": 2.3166,
"step": 343
},
{
"epoch": 0.019820806084526518,
"grad_norm": 5.614563941955566,
"learning_rate": 8.919694272032107e-05,
"loss": 2.0534,
"step": 344
},
{
"epoch": 0.01987842470686526,
"grad_norm": 5.811087131500244,
"learning_rate": 8.913553227917367e-05,
"loss": 2.6265,
"step": 345
},
{
"epoch": 0.019936043329203998,
"grad_norm": 6.630307197570801,
"learning_rate": 8.9073969054475e-05,
"loss": 2.223,
"step": 346
},
{
"epoch": 0.01999366195154274,
"grad_norm": 6.211430549621582,
"learning_rate": 8.901225328656542e-05,
"loss": 2.108,
"step": 347
},
{
"epoch": 0.020051280573881478,
"grad_norm": 7.201671600341797,
"learning_rate": 8.895038521638084e-05,
"loss": 2.2399,
"step": 348
},
{
"epoch": 0.020108899196220217,
"grad_norm": 7.742012023925781,
"learning_rate": 8.88883650854517e-05,
"loss": 2.4472,
"step": 349
},
{
"epoch": 0.02016651781855896,
"grad_norm": 8.512450218200684,
"learning_rate": 8.882619313590212e-05,
"loss": 2.0485,
"step": 350
},
{
"epoch": 0.020224136440897697,
"grad_norm": 4.9366559982299805,
"learning_rate": 8.876386961044891e-05,
"loss": 2.0633,
"step": 351
},
{
"epoch": 0.02028175506323644,
"grad_norm": 2.304917097091675,
"learning_rate": 8.87013947524006e-05,
"loss": 1.595,
"step": 352
},
{
"epoch": 0.020339373685575177,
"grad_norm": 2.6259117126464844,
"learning_rate": 8.863876880565656e-05,
"loss": 1.9893,
"step": 353
},
{
"epoch": 0.02039699230791392,
"grad_norm": 2.9849984645843506,
"learning_rate": 8.857599201470596e-05,
"loss": 2.4722,
"step": 354
},
{
"epoch": 0.020454610930252658,
"grad_norm": 2.8593645095825195,
"learning_rate": 8.851306462462688e-05,
"loss": 2.1195,
"step": 355
},
{
"epoch": 0.020512229552591396,
"grad_norm": 3.100651502609253,
"learning_rate": 8.844998688108535e-05,
"loss": 2.5124,
"step": 356
},
{
"epoch": 0.020569848174930138,
"grad_norm": 3.6389474868774414,
"learning_rate": 8.83867590303343e-05,
"loss": 2.7296,
"step": 357
},
{
"epoch": 0.020627466797268876,
"grad_norm": 3.55713152885437,
"learning_rate": 8.83233813192128e-05,
"loss": 2.9306,
"step": 358
},
{
"epoch": 0.02068508541960762,
"grad_norm": 3.298729181289673,
"learning_rate": 8.825985399514487e-05,
"loss": 2.4454,
"step": 359
},
{
"epoch": 0.020742704041946357,
"grad_norm": 3.5470151901245117,
"learning_rate": 8.819617730613862e-05,
"loss": 2.7344,
"step": 360
},
{
"epoch": 0.020800322664285095,
"grad_norm": 3.965665817260742,
"learning_rate": 8.813235150078531e-05,
"loss": 2.8588,
"step": 361
},
{
"epoch": 0.020857941286623837,
"grad_norm": 3.5389463901519775,
"learning_rate": 8.806837682825835e-05,
"loss": 2.6798,
"step": 362
},
{
"epoch": 0.020915559908962576,
"grad_norm": 3.342438220977783,
"learning_rate": 8.800425353831226e-05,
"loss": 2.2356,
"step": 363
},
{
"epoch": 0.020973178531301317,
"grad_norm": 3.7508444786071777,
"learning_rate": 8.793998188128183e-05,
"loss": 2.6085,
"step": 364
},
{
"epoch": 0.021030797153640056,
"grad_norm": 4.195164680480957,
"learning_rate": 8.787556210808101e-05,
"loss": 2.972,
"step": 365
},
{
"epoch": 0.021088415775978798,
"grad_norm": 4.329472064971924,
"learning_rate": 8.781099447020203e-05,
"loss": 2.5874,
"step": 366
},
{
"epoch": 0.021146034398317536,
"grad_norm": 4.064694404602051,
"learning_rate": 8.774627921971436e-05,
"loss": 2.1788,
"step": 367
},
{
"epoch": 0.021203653020656275,
"grad_norm": 4.8341898918151855,
"learning_rate": 8.768141660926375e-05,
"loss": 3.5546,
"step": 368
},
{
"epoch": 0.021261271642995017,
"grad_norm": 5.299094200134277,
"learning_rate": 8.761640689207123e-05,
"loss": 2.3979,
"step": 369
},
{
"epoch": 0.021318890265333755,
"grad_norm": 4.132681369781494,
"learning_rate": 8.755125032193214e-05,
"loss": 2.1,
"step": 370
},
{
"epoch": 0.021376508887672497,
"grad_norm": 5.212367534637451,
"learning_rate": 8.748594715321512e-05,
"loss": 3.3294,
"step": 371
},
{
"epoch": 0.021434127510011235,
"grad_norm": 3.6085150241851807,
"learning_rate": 8.742049764086114e-05,
"loss": 2.0759,
"step": 372
},
{
"epoch": 0.021491746132349974,
"grad_norm": 4.469633102416992,
"learning_rate": 8.735490204038243e-05,
"loss": 2.7305,
"step": 373
},
{
"epoch": 0.021549364754688716,
"grad_norm": 4.170452117919922,
"learning_rate": 8.728916060786162e-05,
"loss": 2.0771,
"step": 374
},
{
"epoch": 0.021606983377027454,
"grad_norm": 4.10450553894043,
"learning_rate": 8.722327359995064e-05,
"loss": 2.5309,
"step": 375
},
{
"epoch": 0.021664601999366196,
"grad_norm": 4.606810569763184,
"learning_rate": 8.715724127386972e-05,
"loss": 2.6873,
"step": 376
},
{
"epoch": 0.021722220621704934,
"grad_norm": 4.157223701477051,
"learning_rate": 8.709106388740642e-05,
"loss": 2.4526,
"step": 377
},
{
"epoch": 0.021779839244043676,
"grad_norm": 3.954803705215454,
"learning_rate": 8.702474169891459e-05,
"loss": 2.0582,
"step": 378
},
{
"epoch": 0.021837457866382415,
"grad_norm": 4.902177810668945,
"learning_rate": 8.695827496731342e-05,
"loss": 1.9463,
"step": 379
},
{
"epoch": 0.021895076488721153,
"grad_norm": 4.323647975921631,
"learning_rate": 8.689166395208636e-05,
"loss": 2.3647,
"step": 380
},
{
"epoch": 0.021952695111059895,
"grad_norm": 4.937097072601318,
"learning_rate": 8.682490891328016e-05,
"loss": 3.3896,
"step": 381
},
{
"epoch": 0.022010313733398634,
"grad_norm": 5.051680088043213,
"learning_rate": 8.675801011150381e-05,
"loss": 2.639,
"step": 382
},
{
"epoch": 0.022067932355737375,
"grad_norm": 3.990417242050171,
"learning_rate": 8.669096780792753e-05,
"loss": 1.8079,
"step": 383
},
{
"epoch": 0.022125550978076114,
"grad_norm": 5.31498384475708,
"learning_rate": 8.662378226428183e-05,
"loss": 2.5187,
"step": 384
},
{
"epoch": 0.022183169600414852,
"grad_norm": 4.851081848144531,
"learning_rate": 8.655645374285637e-05,
"loss": 1.8444,
"step": 385
},
{
"epoch": 0.022240788222753594,
"grad_norm": 4.884163856506348,
"learning_rate": 8.6488982506499e-05,
"loss": 1.988,
"step": 386
},
{
"epoch": 0.022298406845092333,
"grad_norm": 4.876925945281982,
"learning_rate": 8.64213688186147e-05,
"loss": 2.3561,
"step": 387
},
{
"epoch": 0.022356025467431075,
"grad_norm": 5.017867565155029,
"learning_rate": 8.635361294316464e-05,
"loss": 2.2884,
"step": 388
},
{
"epoch": 0.022413644089769813,
"grad_norm": 5.819447994232178,
"learning_rate": 8.628571514466501e-05,
"loss": 2.345,
"step": 389
},
{
"epoch": 0.022471262712108555,
"grad_norm": 5.401952743530273,
"learning_rate": 8.621767568818613e-05,
"loss": 2.132,
"step": 390
},
{
"epoch": 0.022528881334447293,
"grad_norm": 7.321330547332764,
"learning_rate": 8.61494948393513e-05,
"loss": 3.0322,
"step": 391
},
{
"epoch": 0.022586499956786032,
"grad_norm": 7.665639400482178,
"learning_rate": 8.608117286433582e-05,
"loss": 3.0676,
"step": 392
},
{
"epoch": 0.022644118579124774,
"grad_norm": 6.097929000854492,
"learning_rate": 8.601271002986595e-05,
"loss": 2.1266,
"step": 393
},
{
"epoch": 0.022701737201463512,
"grad_norm": 5.82421875,
"learning_rate": 8.594410660321786e-05,
"loss": 1.8135,
"step": 394
},
{
"epoch": 0.022759355823802254,
"grad_norm": 7.916313648223877,
"learning_rate": 8.587536285221656e-05,
"loss": 2.6945,
"step": 395
},
{
"epoch": 0.022816974446140992,
"grad_norm": 6.374413967132568,
"learning_rate": 8.580647904523493e-05,
"loss": 2.5481,
"step": 396
},
{
"epoch": 0.02287459306847973,
"grad_norm": 7.443820953369141,
"learning_rate": 8.573745545119257e-05,
"loss": 2.7293,
"step": 397
},
{
"epoch": 0.022932211690818473,
"grad_norm": 7.652746200561523,
"learning_rate": 8.566829233955485e-05,
"loss": 2.3907,
"step": 398
},
{
"epoch": 0.02298983031315721,
"grad_norm": 8.252161026000977,
"learning_rate": 8.559898998033178e-05,
"loss": 2.0131,
"step": 399
},
{
"epoch": 0.023047448935495953,
"grad_norm": 7.583081245422363,
"learning_rate": 8.552954864407699e-05,
"loss": 1.9353,
"step": 400
},
{
"epoch": 0.02310506755783469,
"grad_norm": 2.136597156524658,
"learning_rate": 8.545996860188668e-05,
"loss": 2.2714,
"step": 401
},
{
"epoch": 0.023162686180173434,
"grad_norm": 2.3141865730285645,
"learning_rate": 8.539025012539854e-05,
"loss": 2.1436,
"step": 402
},
{
"epoch": 0.023220304802512172,
"grad_norm": 2.5093986988067627,
"learning_rate": 8.532039348679073e-05,
"loss": 2.2187,
"step": 403
},
{
"epoch": 0.02327792342485091,
"grad_norm": 2.6145665645599365,
"learning_rate": 8.525039895878078e-05,
"loss": 1.9852,
"step": 404
},
{
"epoch": 0.023335542047189652,
"grad_norm": 2.691318988800049,
"learning_rate": 8.518026681462448e-05,
"loss": 1.8517,
"step": 405
},
{
"epoch": 0.02339316066952839,
"grad_norm": 2.894033432006836,
"learning_rate": 8.510999732811495e-05,
"loss": 2.3184,
"step": 406
},
{
"epoch": 0.023450779291867133,
"grad_norm": 2.834489345550537,
"learning_rate": 8.503959077358143e-05,
"loss": 2.4516,
"step": 407
},
{
"epoch": 0.02350839791420587,
"grad_norm": 3.279219388961792,
"learning_rate": 8.496904742588832e-05,
"loss": 2.058,
"step": 408
},
{
"epoch": 0.023566016536544613,
"grad_norm": 3.042360305786133,
"learning_rate": 8.4898367560434e-05,
"loss": 2.3367,
"step": 409
},
{
"epoch": 0.02362363515888335,
"grad_norm": 3.081303358078003,
"learning_rate": 8.482755145314986e-05,
"loss": 1.8629,
"step": 410
},
{
"epoch": 0.02368125378122209,
"grad_norm": 3.7365081310272217,
"learning_rate": 8.475659938049912e-05,
"loss": 2.6789,
"step": 411
},
{
"epoch": 0.02373887240356083,
"grad_norm": 3.6360349655151367,
"learning_rate": 8.468551161947584e-05,
"loss": 2.3029,
"step": 412
},
{
"epoch": 0.02379649102589957,
"grad_norm": 4.085989475250244,
"learning_rate": 8.46142884476038e-05,
"loss": 2.9578,
"step": 413
},
{
"epoch": 0.023854109648238312,
"grad_norm": 4.064683437347412,
"learning_rate": 8.454293014293539e-05,
"loss": 3.1592,
"step": 414
},
{
"epoch": 0.02391172827057705,
"grad_norm": 3.98429274559021,
"learning_rate": 8.44714369840506e-05,
"loss": 2.4153,
"step": 415
},
{
"epoch": 0.02396934689291579,
"grad_norm": 4.855700969696045,
"learning_rate": 8.439980925005586e-05,
"loss": 2.6038,
"step": 416
},
{
"epoch": 0.02402696551525453,
"grad_norm": 4.366487979888916,
"learning_rate": 8.432804722058296e-05,
"loss": 2.5389,
"step": 417
},
{
"epoch": 0.02408458413759327,
"grad_norm": 4.151554584503174,
"learning_rate": 8.4256151175788e-05,
"loss": 2.9281,
"step": 418
},
{
"epoch": 0.02414220275993201,
"grad_norm": 4.878279685974121,
"learning_rate": 8.418412139635025e-05,
"loss": 2.6943,
"step": 419
},
{
"epoch": 0.02419982138227075,
"grad_norm": 4.620284080505371,
"learning_rate": 8.41119581634711e-05,
"loss": 1.9838,
"step": 420
},
{
"epoch": 0.02425744000460949,
"grad_norm": 4.405956268310547,
"learning_rate": 8.403966175887292e-05,
"loss": 2.6235,
"step": 421
},
{
"epoch": 0.02431505862694823,
"grad_norm": 5.052657127380371,
"learning_rate": 8.396723246479797e-05,
"loss": 2.3465,
"step": 422
},
{
"epoch": 0.02437267724928697,
"grad_norm": 4.088731288909912,
"learning_rate": 8.389467056400732e-05,
"loss": 1.7849,
"step": 423
},
{
"epoch": 0.02443029587162571,
"grad_norm": 4.706538677215576,
"learning_rate": 8.382197633977973e-05,
"loss": 2.1348,
"step": 424
},
{
"epoch": 0.02448791449396445,
"grad_norm": 4.79817008972168,
"learning_rate": 8.374915007591053e-05,
"loss": 2.7331,
"step": 425
},
{
"epoch": 0.02454553311630319,
"grad_norm": 4.493376731872559,
"learning_rate": 8.367619205671054e-05,
"loss": 2.5587,
"step": 426
},
{
"epoch": 0.02460315173864193,
"grad_norm": 4.494572162628174,
"learning_rate": 8.360310256700497e-05,
"loss": 2.4358,
"step": 427
},
{
"epoch": 0.024660770360980667,
"grad_norm": 4.108890056610107,
"learning_rate": 8.352988189213222e-05,
"loss": 2.1585,
"step": 428
},
{
"epoch": 0.02471838898331941,
"grad_norm": 5.368739128112793,
"learning_rate": 8.345653031794292e-05,
"loss": 2.8628,
"step": 429
},
{
"epoch": 0.024776007605658148,
"grad_norm": 6.097105026245117,
"learning_rate": 8.338304813079865e-05,
"loss": 2.6127,
"step": 430
},
{
"epoch": 0.02483362622799689,
"grad_norm": 4.863004684448242,
"learning_rate": 8.330943561757092e-05,
"loss": 2.2945,
"step": 431
},
{
"epoch": 0.024891244850335628,
"grad_norm": 5.4962077140808105,
"learning_rate": 8.323569306564005e-05,
"loss": 2.4823,
"step": 432
},
{
"epoch": 0.02494886347267437,
"grad_norm": 5.401646614074707,
"learning_rate": 8.316182076289401e-05,
"loss": 2.3703,
"step": 433
},
{
"epoch": 0.02500648209501311,
"grad_norm": 6.308868408203125,
"learning_rate": 8.30878189977273e-05,
"loss": 3.3262,
"step": 434
},
{
"epoch": 0.025064100717351847,
"grad_norm": 5.612480640411377,
"learning_rate": 8.301368805903988e-05,
"loss": 1.9546,
"step": 435
},
{
"epoch": 0.02512171933969059,
"grad_norm": 5.437137126922607,
"learning_rate": 8.293942823623593e-05,
"loss": 1.918,
"step": 436
},
{
"epoch": 0.025179337962029327,
"grad_norm": 6.038562297821045,
"learning_rate": 8.286503981922283e-05,
"loss": 2.4432,
"step": 437
},
{
"epoch": 0.02523695658436807,
"grad_norm": 5.992568492889404,
"learning_rate": 8.279052309841e-05,
"loss": 3.0598,
"step": 438
},
{
"epoch": 0.025294575206706808,
"grad_norm": 5.9145941734313965,
"learning_rate": 8.271587836470775e-05,
"loss": 2.3731,
"step": 439
},
{
"epoch": 0.025352193829045546,
"grad_norm": 5.385437488555908,
"learning_rate": 8.264110590952609e-05,
"loss": 2.0939,
"step": 440
},
{
"epoch": 0.025409812451384288,
"grad_norm": 5.642551422119141,
"learning_rate": 8.256620602477372e-05,
"loss": 2.4044,
"step": 441
},
{
"epoch": 0.025467431073723026,
"grad_norm": 5.5099077224731445,
"learning_rate": 8.249117900285676e-05,
"loss": 2.1055,
"step": 442
},
{
"epoch": 0.02552504969606177,
"grad_norm": 6.118095397949219,
"learning_rate": 8.241602513667774e-05,
"loss": 2.3521,
"step": 443
},
{
"epoch": 0.025582668318400507,
"grad_norm": 5.9850850105285645,
"learning_rate": 8.234074471963434e-05,
"loss": 1.9222,
"step": 444
},
{
"epoch": 0.02564028694073925,
"grad_norm": 7.145914077758789,
"learning_rate": 8.226533804561827e-05,
"loss": 2.4491,
"step": 445
},
{
"epoch": 0.025697905563077987,
"grad_norm": 7.012725830078125,
"learning_rate": 8.218980540901418e-05,
"loss": 2.4023,
"step": 446
},
{
"epoch": 0.025755524185416726,
"grad_norm": 6.856975555419922,
"learning_rate": 8.211414710469845e-05,
"loss": 2.2663,
"step": 447
},
{
"epoch": 0.025813142807755467,
"grad_norm": 6.3824357986450195,
"learning_rate": 8.203836342803807e-05,
"loss": 1.9154,
"step": 448
},
{
"epoch": 0.025870761430094206,
"grad_norm": 7.734108924865723,
"learning_rate": 8.19624546748895e-05,
"loss": 2.1792,
"step": 449
},
{
"epoch": 0.025928380052432948,
"grad_norm": 7.385306358337402,
"learning_rate": 8.188642114159747e-05,
"loss": 1.6915,
"step": 450
},
{
"epoch": 0.025985998674771686,
"grad_norm": 1.7791303396224976,
"learning_rate": 8.181026312499383e-05,
"loss": 1.6307,
"step": 451
},
{
"epoch": 0.026043617297110425,
"grad_norm": 2.2404377460479736,
"learning_rate": 8.173398092239646e-05,
"loss": 1.7771,
"step": 452
},
{
"epoch": 0.026101235919449167,
"grad_norm": 2.0696511268615723,
"learning_rate": 8.165757483160798e-05,
"loss": 1.4744,
"step": 453
},
{
"epoch": 0.026158854541787905,
"grad_norm": 2.1022167205810547,
"learning_rate": 8.158104515091476e-05,
"loss": 1.1879,
"step": 454
},
{
"epoch": 0.026216473164126647,
"grad_norm": 2.3837785720825195,
"learning_rate": 8.150439217908556e-05,
"loss": 1.438,
"step": 455
},
{
"epoch": 0.026274091786465385,
"grad_norm": 2.5221476554870605,
"learning_rate": 8.142761621537055e-05,
"loss": 1.812,
"step": 456
},
{
"epoch": 0.026331710408804127,
"grad_norm": 2.6316726207733154,
"learning_rate": 8.13507175595e-05,
"loss": 1.6436,
"step": 457
},
{
"epoch": 0.026389329031142866,
"grad_norm": 3.3228557109832764,
"learning_rate": 8.12736965116832e-05,
"loss": 2.2098,
"step": 458
},
{
"epoch": 0.026446947653481604,
"grad_norm": 3.061286687850952,
"learning_rate": 8.11965533726072e-05,
"loss": 2.0349,
"step": 459
},
{
"epoch": 0.026504566275820346,
"grad_norm": 3.6266422271728516,
"learning_rate": 8.11192884434358e-05,
"loss": 2.3822,
"step": 460
},
{
"epoch": 0.026562184898159084,
"grad_norm": 3.2216854095458984,
"learning_rate": 8.104190202580812e-05,
"loss": 1.9171,
"step": 461
},
{
"epoch": 0.026619803520497826,
"grad_norm": 3.5767996311187744,
"learning_rate": 8.096439442183766e-05,
"loss": 2.3181,
"step": 462
},
{
"epoch": 0.026677422142836565,
"grad_norm": 3.7908284664154053,
"learning_rate": 8.0886765934111e-05,
"loss": 2.5111,
"step": 463
},
{
"epoch": 0.026735040765175303,
"grad_norm": 4.937187194824219,
"learning_rate": 8.080901686568665e-05,
"loss": 2.9298,
"step": 464
},
{
"epoch": 0.026792659387514045,
"grad_norm": 4.011484622955322,
"learning_rate": 8.073114752009387e-05,
"loss": 2.6274,
"step": 465
},
{
"epoch": 0.026850278009852784,
"grad_norm": 4.1421003341674805,
"learning_rate": 8.065315820133146e-05,
"loss": 2.4794,
"step": 466
},
{
"epoch": 0.026907896632191525,
"grad_norm": 4.228821754455566,
"learning_rate": 8.05750492138666e-05,
"loss": 2.7021,
"step": 467
},
{
"epoch": 0.026965515254530264,
"grad_norm": 4.556756496429443,
"learning_rate": 8.049682086263365e-05,
"loss": 2.9866,
"step": 468
},
{
"epoch": 0.027023133876869006,
"grad_norm": 5.285801887512207,
"learning_rate": 8.041847345303297e-05,
"loss": 2.299,
"step": 469
},
{
"epoch": 0.027080752499207744,
"grad_norm": 4.734602451324463,
"learning_rate": 8.034000729092968e-05,
"loss": 2.6673,
"step": 470
},
{
"epoch": 0.027138371121546483,
"grad_norm": 4.674554347991943,
"learning_rate": 8.026142268265256e-05,
"loss": 2.185,
"step": 471
},
{
"epoch": 0.027195989743885225,
"grad_norm": 6.341719150543213,
"learning_rate": 8.018271993499278e-05,
"loss": 2.5202,
"step": 472
},
{
"epoch": 0.027253608366223963,
"grad_norm": 5.137526512145996,
"learning_rate": 8.01038993552027e-05,
"loss": 2.662,
"step": 473
},
{
"epoch": 0.027311226988562705,
"grad_norm": 4.619840145111084,
"learning_rate": 8.00249612509947e-05,
"loss": 2.3164,
"step": 474
},
{
"epoch": 0.027368845610901443,
"grad_norm": 4.761074542999268,
"learning_rate": 7.994590593054001e-05,
"loss": 2.3709,
"step": 475
},
{
"epoch": 0.02742646423324018,
"grad_norm": 4.570230007171631,
"learning_rate": 7.986673370246742e-05,
"loss": 2.0409,
"step": 476
},
{
"epoch": 0.027484082855578924,
"grad_norm": 4.904114246368408,
"learning_rate": 7.978744487586214e-05,
"loss": 2.2348,
"step": 477
},
{
"epoch": 0.027541701477917662,
"grad_norm": 5.0561418533325195,
"learning_rate": 7.970803976026457e-05,
"loss": 1.8822,
"step": 478
},
{
"epoch": 0.027599320100256404,
"grad_norm": 6.338671684265137,
"learning_rate": 7.962851866566912e-05,
"loss": 2.9682,
"step": 479
},
{
"epoch": 0.027656938722595142,
"grad_norm": 5.3526716232299805,
"learning_rate": 7.954888190252292e-05,
"loss": 1.9048,
"step": 480
},
{
"epoch": 0.027714557344933884,
"grad_norm": 5.184986114501953,
"learning_rate": 7.946912978172474e-05,
"loss": 2.0768,
"step": 481
},
{
"epoch": 0.027772175967272623,
"grad_norm": 5.866233825683594,
"learning_rate": 7.938926261462366e-05,
"loss": 2.2475,
"step": 482
},
{
"epoch": 0.02782979458961136,
"grad_norm": 5.420923709869385,
"learning_rate": 7.93092807130179e-05,
"loss": 2.2374,
"step": 483
},
{
"epoch": 0.027887413211950103,
"grad_norm": 5.821062088012695,
"learning_rate": 7.922918438915361e-05,
"loss": 2.5666,
"step": 484
},
{
"epoch": 0.02794503183428884,
"grad_norm": 5.090494155883789,
"learning_rate": 7.91489739557236e-05,
"loss": 2.1862,
"step": 485
},
{
"epoch": 0.028002650456627583,
"grad_norm": 4.969261169433594,
"learning_rate": 7.906864972586624e-05,
"loss": 1.9363,
"step": 486
},
{
"epoch": 0.028060269078966322,
"grad_norm": 5.715576171875,
"learning_rate": 7.898821201316407e-05,
"loss": 1.9066,
"step": 487
},
{
"epoch": 0.02811788770130506,
"grad_norm": 5.48051643371582,
"learning_rate": 7.890766113164271e-05,
"loss": 2.1585,
"step": 488
},
{
"epoch": 0.028175506323643802,
"grad_norm": 7.4682135581970215,
"learning_rate": 7.882699739576959e-05,
"loss": 2.1458,
"step": 489
},
{
"epoch": 0.02823312494598254,
"grad_norm": 6.144767761230469,
"learning_rate": 7.874622112045269e-05,
"loss": 2.046,
"step": 490
},
{
"epoch": 0.028290743568321283,
"grad_norm": 6.479434013366699,
"learning_rate": 7.866533262103936e-05,
"loss": 1.6658,
"step": 491
},
{
"epoch": 0.02834836219066002,
"grad_norm": 7.613353252410889,
"learning_rate": 7.858433221331508e-05,
"loss": 2.2348,
"step": 492
},
{
"epoch": 0.028405980812998763,
"grad_norm": 6.663170337677002,
"learning_rate": 7.850322021350215e-05,
"loss": 1.9413,
"step": 493
},
{
"epoch": 0.0284635994353375,
"grad_norm": 6.2071533203125,
"learning_rate": 7.842199693825862e-05,
"loss": 1.5813,
"step": 494
},
{
"epoch": 0.02852121805767624,
"grad_norm": 6.498309135437012,
"learning_rate": 7.83406627046769e-05,
"loss": 1.7169,
"step": 495
},
{
"epoch": 0.02857883668001498,
"grad_norm": 7.530585765838623,
"learning_rate": 7.825921783028255e-05,
"loss": 1.7682,
"step": 496
},
{
"epoch": 0.02863645530235372,
"grad_norm": 7.18717622756958,
"learning_rate": 7.817766263303313e-05,
"loss": 1.8239,
"step": 497
},
{
"epoch": 0.028694073924692462,
"grad_norm": 8.100095748901367,
"learning_rate": 7.809599743131684e-05,
"loss": 2.1308,
"step": 498
},
{
"epoch": 0.0287516925470312,
"grad_norm": 9.867273330688477,
"learning_rate": 7.801422254395138e-05,
"loss": 2.2406,
"step": 499
},
{
"epoch": 0.02880931116936994,
"grad_norm": 8.283629417419434,
"learning_rate": 7.793233829018262e-05,
"loss": 1.4668,
"step": 500
},
{
"epoch": 0.02886692979170868,
"grad_norm": 2.057152509689331,
"learning_rate": 7.785034498968344e-05,
"loss": 1.5885,
"step": 501
},
{
"epoch": 0.02892454841404742,
"grad_norm": 2.754242420196533,
"learning_rate": 7.776824296255236e-05,
"loss": 2.0332,
"step": 502
},
{
"epoch": 0.02898216703638616,
"grad_norm": 2.3909084796905518,
"learning_rate": 7.768603252931243e-05,
"loss": 1.9647,
"step": 503
},
{
"epoch": 0.0290397856587249,
"grad_norm": 2.4323692321777344,
"learning_rate": 7.760371401090988e-05,
"loss": 1.4163,
"step": 504
},
{
"epoch": 0.02909740428106364,
"grad_norm": 2.638990879058838,
"learning_rate": 7.752128772871292e-05,
"loss": 2.0129,
"step": 505
},
{
"epoch": 0.02915502290340238,
"grad_norm": 3.085878372192383,
"learning_rate": 7.743875400451047e-05,
"loss": 2.394,
"step": 506
},
{
"epoch": 0.02921264152574112,
"grad_norm": 3.337479829788208,
"learning_rate": 7.735611316051084e-05,
"loss": 2.1309,
"step": 507
},
{
"epoch": 0.02927026014807986,
"grad_norm": 3.2174227237701416,
"learning_rate": 7.727336551934062e-05,
"loss": 2.4035,
"step": 508
},
{
"epoch": 0.0293278787704186,
"grad_norm": 3.7815494537353516,
"learning_rate": 7.719051140404327e-05,
"loss": 2.5671,
"step": 509
},
{
"epoch": 0.02938549739275734,
"grad_norm": 3.437389373779297,
"learning_rate": 7.710755113807794e-05,
"loss": 1.7228,
"step": 510
},
{
"epoch": 0.02944311601509608,
"grad_norm": 3.9019417762756348,
"learning_rate": 7.702448504531819e-05,
"loss": 2.23,
"step": 511
},
{
"epoch": 0.029500734637434817,
"grad_norm": 3.4205315113067627,
"learning_rate": 7.694131345005071e-05,
"loss": 1.9548,
"step": 512
},
{
"epoch": 0.02955835325977356,
"grad_norm": 4.597849369049072,
"learning_rate": 7.685803667697411e-05,
"loss": 2.6663,
"step": 513
},
{
"epoch": 0.029615971882112298,
"grad_norm": 4.4238691329956055,
"learning_rate": 7.677465505119754e-05,
"loss": 2.5071,
"step": 514
},
{
"epoch": 0.02967359050445104,
"grad_norm": 4.790035724639893,
"learning_rate": 7.669116889823955e-05,
"loss": 2.8143,
"step": 515
},
{
"epoch": 0.029731209126789778,
"grad_norm": 5.080414295196533,
"learning_rate": 7.660757854402671e-05,
"loss": 2.8679,
"step": 516
},
{
"epoch": 0.02978882774912852,
"grad_norm": 4.337615489959717,
"learning_rate": 7.652388431489248e-05,
"loss": 2.3524,
"step": 517
},
{
"epoch": 0.02984644637146726,
"grad_norm": 4.978126525878906,
"learning_rate": 7.644008653757571e-05,
"loss": 2.5467,
"step": 518
},
{
"epoch": 0.029904064993805997,
"grad_norm": 5.917097091674805,
"learning_rate": 7.635618553921962e-05,
"loss": 2.8325,
"step": 519
},
{
"epoch": 0.02996168361614474,
"grad_norm": 5.449569225311279,
"learning_rate": 7.627218164737031e-05,
"loss": 2.4075,
"step": 520
},
{
"epoch": 0.030019302238483477,
"grad_norm": 6.542447090148926,
"learning_rate": 7.618807518997563e-05,
"loss": 2.3709,
"step": 521
},
{
"epoch": 0.03007692086082222,
"grad_norm": 4.634735107421875,
"learning_rate": 7.610386649538384e-05,
"loss": 2.241,
"step": 522
},
{
"epoch": 0.030134539483160958,
"grad_norm": 4.284361839294434,
"learning_rate": 7.601955589234227e-05,
"loss": 1.8165,
"step": 523
},
{
"epoch": 0.030192158105499696,
"grad_norm": 4.661323547363281,
"learning_rate": 7.593514370999617e-05,
"loss": 2.3002,
"step": 524
},
{
"epoch": 0.030249776727838438,
"grad_norm": 4.856345176696777,
"learning_rate": 7.585063027788731e-05,
"loss": 1.8948,
"step": 525
},
{
"epoch": 0.030307395350177176,
"grad_norm": 4.846848964691162,
"learning_rate": 7.576601592595274e-05,
"loss": 2.205,
"step": 526
},
{
"epoch": 0.030365013972515918,
"grad_norm": 4.802872657775879,
"learning_rate": 7.568130098452351e-05,
"loss": 2.4187,
"step": 527
},
{
"epoch": 0.030422632594854657,
"grad_norm": 5.776866912841797,
"learning_rate": 7.559648578432338e-05,
"loss": 2.2555,
"step": 528
},
{
"epoch": 0.0304802512171934,
"grad_norm": 4.875247478485107,
"learning_rate": 7.551157065646746e-05,
"loss": 1.8963,
"step": 529
},
{
"epoch": 0.030537869839532137,
"grad_norm": 4.606689929962158,
"learning_rate": 7.542655593246103e-05,
"loss": 1.9575,
"step": 530
},
{
"epoch": 0.030595488461870875,
"grad_norm": 5.17656135559082,
"learning_rate": 7.534144194419817e-05,
"loss": 2.7424,
"step": 531
},
{
"epoch": 0.030653107084209617,
"grad_norm": 5.2001953125,
"learning_rate": 7.525622902396047e-05,
"loss": 1.8472,
"step": 532
},
{
"epoch": 0.030710725706548356,
"grad_norm": 5.093332290649414,
"learning_rate": 7.517091750441576e-05,
"loss": 2.0608,
"step": 533
},
{
"epoch": 0.030768344328887098,
"grad_norm": 5.426489353179932,
"learning_rate": 7.508550771861683e-05,
"loss": 2.8537,
"step": 534
},
{
"epoch": 0.030825962951225836,
"grad_norm": 5.734989643096924,
"learning_rate": 7.500000000000001e-05,
"loss": 2.6248,
"step": 535
},
{
"epoch": 0.030883581573564575,
"grad_norm": 6.2550153732299805,
"learning_rate": 7.491439468238403e-05,
"loss": 2.3276,
"step": 536
},
{
"epoch": 0.030941200195903316,
"grad_norm": 5.578352928161621,
"learning_rate": 7.482869209996867e-05,
"loss": 1.8248,
"step": 537
},
{
"epoch": 0.030998818818242055,
"grad_norm": 6.46681022644043,
"learning_rate": 7.474289258733332e-05,
"loss": 2.5873,
"step": 538
},
{
"epoch": 0.031056437440580797,
"grad_norm": 5.788388729095459,
"learning_rate": 7.465699647943586e-05,
"loss": 2.1507,
"step": 539
},
{
"epoch": 0.031114056062919535,
"grad_norm": 5.865661144256592,
"learning_rate": 7.457100411161128e-05,
"loss": 2.7711,
"step": 540
},
{
"epoch": 0.031171674685258277,
"grad_norm": 8.030291557312012,
"learning_rate": 7.44849158195703e-05,
"loss": 2.2632,
"step": 541
},
{
"epoch": 0.031229293307597016,
"grad_norm": 6.441694736480713,
"learning_rate": 7.43987319393982e-05,
"loss": 1.937,
"step": 542
},
{
"epoch": 0.031286911929935754,
"grad_norm": 8.080358505249023,
"learning_rate": 7.431245280755336e-05,
"loss": 2.7956,
"step": 543
},
{
"epoch": 0.03134453055227449,
"grad_norm": 6.134660720825195,
"learning_rate": 7.422607876086612e-05,
"loss": 1.9173,
"step": 544
},
{
"epoch": 0.03140214917461324,
"grad_norm": 7.2595038414001465,
"learning_rate": 7.413961013653726e-05,
"loss": 2.155,
"step": 545
},
{
"epoch": 0.031459767796951976,
"grad_norm": 7.13598108291626,
"learning_rate": 7.405304727213681e-05,
"loss": 1.7464,
"step": 546
},
{
"epoch": 0.031517386419290715,
"grad_norm": 6.867408752441406,
"learning_rate": 7.396639050560275e-05,
"loss": 1.8846,
"step": 547
},
{
"epoch": 0.03157500504162945,
"grad_norm": 7.415482521057129,
"learning_rate": 7.387964017523964e-05,
"loss": 2.1983,
"step": 548
},
{
"epoch": 0.03163262366396819,
"grad_norm": 7.816307544708252,
"learning_rate": 7.379279661971727e-05,
"loss": 1.4733,
"step": 549
},
{
"epoch": 0.03169024228630694,
"grad_norm": 7.897280216217041,
"learning_rate": 7.370586017806942e-05,
"loss": 1.404,
"step": 550
},
{
"epoch": 0.031747860908645675,
"grad_norm": 2.0481984615325928,
"learning_rate": 7.361883118969247e-05,
"loss": 1.7871,
"step": 551
},
{
"epoch": 0.031805479530984414,
"grad_norm": 2.2823524475097656,
"learning_rate": 7.353170999434412e-05,
"loss": 1.6266,
"step": 552
},
{
"epoch": 0.03186309815332315,
"grad_norm": 3.518951177597046,
"learning_rate": 7.3444496932142e-05,
"loss": 2.0591,
"step": 553
},
{
"epoch": 0.03192071677566189,
"grad_norm": 2.357578754425049,
"learning_rate": 7.335719234356246e-05,
"loss": 1.556,
"step": 554
},
{
"epoch": 0.031978335398000636,
"grad_norm": 2.791245937347412,
"learning_rate": 7.326979656943906e-05,
"loss": 1.738,
"step": 555
},
{
"epoch": 0.032035954020339374,
"grad_norm": 2.9653449058532715,
"learning_rate": 7.318230995096145e-05,
"loss": 1.8685,
"step": 556
},
{
"epoch": 0.03209357264267811,
"grad_norm": 3.912933588027954,
"learning_rate": 7.309473282967387e-05,
"loss": 2.368,
"step": 557
},
{
"epoch": 0.03215119126501685,
"grad_norm": 3.572033405303955,
"learning_rate": 7.30070655474739e-05,
"loss": 2.6526,
"step": 558
},
{
"epoch": 0.0322088098873556,
"grad_norm": 3.5761570930480957,
"learning_rate": 7.291930844661109e-05,
"loss": 2.4113,
"step": 559
},
{
"epoch": 0.032266428509694335,
"grad_norm": 3.8226797580718994,
"learning_rate": 7.283146186968565e-05,
"loss": 2.3685,
"step": 560
},
{
"epoch": 0.032324047132033074,
"grad_norm": 3.4981765747070312,
"learning_rate": 7.274352615964712e-05,
"loss": 2.0116,
"step": 561
},
{
"epoch": 0.03238166575437181,
"grad_norm": 4.051041126251221,
"learning_rate": 7.265550165979297e-05,
"loss": 2.2767,
"step": 562
},
{
"epoch": 0.03243928437671055,
"grad_norm": 3.901902437210083,
"learning_rate": 7.256738871376732e-05,
"loss": 1.8931,
"step": 563
},
{
"epoch": 0.032496902999049296,
"grad_norm": 4.695768356323242,
"learning_rate": 7.24791876655596e-05,
"loss": 2.8632,
"step": 564
},
{
"epoch": 0.032554521621388034,
"grad_norm": 3.891411781311035,
"learning_rate": 7.239089885950316e-05,
"loss": 1.5615,
"step": 565
},
{
"epoch": 0.03261214024372677,
"grad_norm": 4.418416976928711,
"learning_rate": 7.230252264027397e-05,
"loss": 1.9359,
"step": 566
},
{
"epoch": 0.03266975886606551,
"grad_norm": 4.310144424438477,
"learning_rate": 7.221405935288925e-05,
"loss": 1.9014,
"step": 567
},
{
"epoch": 0.03272737748840425,
"grad_norm": 5.806147575378418,
"learning_rate": 7.212550934270613e-05,
"loss": 2.921,
"step": 568
},
{
"epoch": 0.032784996110742995,
"grad_norm": 5.473169803619385,
"learning_rate": 7.203687295542032e-05,
"loss": 3.0081,
"step": 569
},
{
"epoch": 0.03284261473308173,
"grad_norm": 5.095858097076416,
"learning_rate": 7.19481505370647e-05,
"loss": 2.3119,
"step": 570
},
{
"epoch": 0.03290023335542047,
"grad_norm": 5.125074863433838,
"learning_rate": 7.185934243400806e-05,
"loss": 2.2663,
"step": 571
},
{
"epoch": 0.03295785197775921,
"grad_norm": 4.869948863983154,
"learning_rate": 7.177044899295368e-05,
"loss": 2.2135,
"step": 572
},
{
"epoch": 0.03301547060009795,
"grad_norm": 6.300400257110596,
"learning_rate": 7.168147056093797e-05,
"loss": 2.5526,
"step": 573
},
{
"epoch": 0.033073089222436694,
"grad_norm": 5.132833480834961,
"learning_rate": 7.15924074853292e-05,
"loss": 1.764,
"step": 574
},
{
"epoch": 0.03313070784477543,
"grad_norm": 5.472684383392334,
"learning_rate": 7.150326011382604e-05,
"loss": 2.1775,
"step": 575
},
{
"epoch": 0.03318832646711417,
"grad_norm": 4.476589202880859,
"learning_rate": 7.141402879445624e-05,
"loss": 1.8789,
"step": 576
},
{
"epoch": 0.03324594508945291,
"grad_norm": 3.766589879989624,
"learning_rate": 7.132471387557532e-05,
"loss": 1.3423,
"step": 577
},
{
"epoch": 0.03330356371179165,
"grad_norm": 5.912728309631348,
"learning_rate": 7.123531570586514e-05,
"loss": 2.4923,
"step": 578
},
{
"epoch": 0.03336118233413039,
"grad_norm": 4.823023796081543,
"learning_rate": 7.114583463433259e-05,
"loss": 2.0984,
"step": 579
},
{
"epoch": 0.03341880095646913,
"grad_norm": 5.755313396453857,
"learning_rate": 7.105627101030817e-05,
"loss": 2.253,
"step": 580
},
{
"epoch": 0.03347641957880787,
"grad_norm": 6.248511791229248,
"learning_rate": 7.096662518344468e-05,
"loss": 2.3171,
"step": 581
},
{
"epoch": 0.03353403820114661,
"grad_norm": 6.492971420288086,
"learning_rate": 7.087689750371587e-05,
"loss": 2.5124,
"step": 582
},
{
"epoch": 0.033591656823485354,
"grad_norm": 5.695863246917725,
"learning_rate": 7.078708832141497e-05,
"loss": 2.3419,
"step": 583
},
{
"epoch": 0.03364927544582409,
"grad_norm": 5.818359851837158,
"learning_rate": 7.069719798715346e-05,
"loss": 2.2373,
"step": 584
},
{
"epoch": 0.03370689406816283,
"grad_norm": 5.269465923309326,
"learning_rate": 7.060722685185961e-05,
"loss": 1.7463,
"step": 585
},
{
"epoch": 0.03376451269050157,
"grad_norm": 5.422830581665039,
"learning_rate": 7.051717526677711e-05,
"loss": 1.669,
"step": 586
},
{
"epoch": 0.03382213131284031,
"grad_norm": 5.423870086669922,
"learning_rate": 7.042704358346375e-05,
"loss": 1.8324,
"step": 587
},
{
"epoch": 0.03387974993517905,
"grad_norm": 5.801548480987549,
"learning_rate": 7.033683215379002e-05,
"loss": 1.7256,
"step": 588
},
{
"epoch": 0.03393736855751779,
"grad_norm": 6.18574333190918,
"learning_rate": 7.024654132993772e-05,
"loss": 1.572,
"step": 589
},
{
"epoch": 0.03399498717985653,
"grad_norm": 6.6976518630981445,
"learning_rate": 7.015617146439863e-05,
"loss": 2.2439,
"step": 590
},
{
"epoch": 0.03405260580219527,
"grad_norm": 7.734624862670898,
"learning_rate": 7.006572290997304e-05,
"loss": 2.0599,
"step": 591
},
{
"epoch": 0.03411022442453401,
"grad_norm": 6.55422830581665,
"learning_rate": 6.997519601976855e-05,
"loss": 1.6887,
"step": 592
},
{
"epoch": 0.03416784304687275,
"grad_norm": 7.435171604156494,
"learning_rate": 6.988459114719849e-05,
"loss": 1.944,
"step": 593
},
{
"epoch": 0.03422546166921149,
"grad_norm": 6.782978057861328,
"learning_rate": 6.979390864598062e-05,
"loss": 2.0181,
"step": 594
},
{
"epoch": 0.03428308029155023,
"grad_norm": 7.154095649719238,
"learning_rate": 6.970314887013584e-05,
"loss": 1.8208,
"step": 595
},
{
"epoch": 0.03434069891388897,
"grad_norm": 8.411824226379395,
"learning_rate": 6.961231217398668e-05,
"loss": 2.0538,
"step": 596
},
{
"epoch": 0.034398317536227706,
"grad_norm": 7.1375017166137695,
"learning_rate": 6.952139891215593e-05,
"loss": 1.6647,
"step": 597
},
{
"epoch": 0.03445593615856645,
"grad_norm": 8.589274406433105,
"learning_rate": 6.943040943956535e-05,
"loss": 1.7166,
"step": 598
},
{
"epoch": 0.03451355478090519,
"grad_norm": 9.486370086669922,
"learning_rate": 6.93393441114342e-05,
"loss": 1.5137,
"step": 599
},
{
"epoch": 0.03457117340324393,
"grad_norm": 10.819038391113281,
"learning_rate": 6.924820328327786e-05,
"loss": 1.9878,
"step": 600
},
{
"epoch": 0.034628792025582666,
"grad_norm": 3.152501344680786,
"learning_rate": 6.915698731090648e-05,
"loss": 1.452,
"step": 601
},
{
"epoch": 0.034686410647921405,
"grad_norm": 2.2664787769317627,
"learning_rate": 6.906569655042357e-05,
"loss": 1.54,
"step": 602
},
{
"epoch": 0.03474402927026015,
"grad_norm": 2.385829210281372,
"learning_rate": 6.897433135822461e-05,
"loss": 1.4365,
"step": 603
},
{
"epoch": 0.03480164789259889,
"grad_norm": 2.7845754623413086,
"learning_rate": 6.888289209099565e-05,
"loss": 2.164,
"step": 604
},
{
"epoch": 0.03485926651493763,
"grad_norm": 2.4939215183258057,
"learning_rate": 6.879137910571191e-05,
"loss": 1.6181,
"step": 605
},
{
"epoch": 0.034916885137276366,
"grad_norm": 3.5086007118225098,
"learning_rate": 6.869979275963643e-05,
"loss": 2.2493,
"step": 606
},
{
"epoch": 0.03497450375961511,
"grad_norm": 3.3141884803771973,
"learning_rate": 6.860813341031866e-05,
"loss": 2.211,
"step": 607
},
{
"epoch": 0.03503212238195385,
"grad_norm": 3.3828322887420654,
"learning_rate": 6.8516401415593e-05,
"loss": 2.3733,
"step": 608
},
{
"epoch": 0.03508974100429259,
"grad_norm": 3.814128875732422,
"learning_rate": 6.842459713357752e-05,
"loss": 2.4819,
"step": 609
},
{
"epoch": 0.035147359626631326,
"grad_norm": 3.229886770248413,
"learning_rate": 6.833272092267241e-05,
"loss": 1.5972,
"step": 610
},
{
"epoch": 0.035204978248970065,
"grad_norm": 3.7008516788482666,
"learning_rate": 6.824077314155877e-05,
"loss": 2.0268,
"step": 611
},
{
"epoch": 0.03526259687130881,
"grad_norm": 4.110735893249512,
"learning_rate": 6.814875414919701e-05,
"loss": 2.5064,
"step": 612
},
{
"epoch": 0.03532021549364755,
"grad_norm": 3.7989509105682373,
"learning_rate": 6.805666430482564e-05,
"loss": 1.9167,
"step": 613
},
{
"epoch": 0.03537783411598629,
"grad_norm": 3.6298511028289795,
"learning_rate": 6.79645039679597e-05,
"loss": 2.2267,
"step": 614
},
{
"epoch": 0.035435452738325025,
"grad_norm": 4.288212776184082,
"learning_rate": 6.787227349838947e-05,
"loss": 2.5939,
"step": 615
},
{
"epoch": 0.035493071360663764,
"grad_norm": 4.232641696929932,
"learning_rate": 6.777997325617898e-05,
"loss": 1.9211,
"step": 616
},
{
"epoch": 0.03555068998300251,
"grad_norm": 4.338483810424805,
"learning_rate": 6.768760360166471e-05,
"loss": 2.3078,
"step": 617
},
{
"epoch": 0.03560830860534125,
"grad_norm": 5.502989292144775,
"learning_rate": 6.759516489545408e-05,
"loss": 2.2158,
"step": 618
},
{
"epoch": 0.035665927227679986,
"grad_norm": 4.979348659515381,
"learning_rate": 6.750265749842409e-05,
"loss": 2.1872,
"step": 619
},
{
"epoch": 0.035723545850018724,
"grad_norm": 5.02641487121582,
"learning_rate": 6.741008177171995e-05,
"loss": 2.1891,
"step": 620
},
{
"epoch": 0.03578116447235746,
"grad_norm": 5.421124458312988,
"learning_rate": 6.731743807675355e-05,
"loss": 2.6276,
"step": 621
},
{
"epoch": 0.03583878309469621,
"grad_norm": 5.692699909210205,
"learning_rate": 6.722472677520219e-05,
"loss": 2.1076,
"step": 622
},
{
"epoch": 0.03589640171703495,
"grad_norm": 4.913735866546631,
"learning_rate": 6.713194822900706e-05,
"loss": 2.049,
"step": 623
},
{
"epoch": 0.035954020339373685,
"grad_norm": 4.316449165344238,
"learning_rate": 6.703910280037193e-05,
"loss": 1.4175,
"step": 624
},
{
"epoch": 0.036011638961712424,
"grad_norm": 4.753008842468262,
"learning_rate": 6.694619085176159e-05,
"loss": 2.3358,
"step": 625
},
{
"epoch": 0.03606925758405116,
"grad_norm": 4.488182067871094,
"learning_rate": 6.685321274590059e-05,
"loss": 1.7114,
"step": 626
},
{
"epoch": 0.03612687620638991,
"grad_norm": 7.066396713256836,
"learning_rate": 6.676016884577173e-05,
"loss": 2.4835,
"step": 627
},
{
"epoch": 0.036184494828728646,
"grad_norm": 5.638205528259277,
"learning_rate": 6.666705951461468e-05,
"loss": 2.2354,
"step": 628
},
{
"epoch": 0.036242113451067384,
"grad_norm": 6.0546345710754395,
"learning_rate": 6.657388511592452e-05,
"loss": 1.9222,
"step": 629
},
{
"epoch": 0.03629973207340612,
"grad_norm": 5.70328950881958,
"learning_rate": 6.64806460134504e-05,
"loss": 2.4855,
"step": 630
},
{
"epoch": 0.03635735069574487,
"grad_norm": 6.03857421875,
"learning_rate": 6.638734257119401e-05,
"loss": 2.5251,
"step": 631
},
{
"epoch": 0.036414969318083606,
"grad_norm": 5.663034915924072,
"learning_rate": 6.62939751534083e-05,
"loss": 2.2052,
"step": 632
},
{
"epoch": 0.036472587940422345,
"grad_norm": 5.329866886138916,
"learning_rate": 6.620054412459588e-05,
"loss": 1.7303,
"step": 633
},
{
"epoch": 0.03653020656276108,
"grad_norm": 7.024685859680176,
"learning_rate": 6.610704984950778e-05,
"loss": 2.6048,
"step": 634
},
{
"epoch": 0.03658782518509982,
"grad_norm": 5.490304946899414,
"learning_rate": 6.601349269314188e-05,
"loss": 2.0676,
"step": 635
},
{
"epoch": 0.03664544380743857,
"grad_norm": 6.266791820526123,
"learning_rate": 6.591987302074161e-05,
"loss": 2.0658,
"step": 636
},
{
"epoch": 0.036703062429777306,
"grad_norm": 6.760843753814697,
"learning_rate": 6.582619119779439e-05,
"loss": 2.3506,
"step": 637
},
{
"epoch": 0.036760681052116044,
"grad_norm": 6.093375205993652,
"learning_rate": 6.573244759003032e-05,
"loss": 1.9029,
"step": 638
},
{
"epoch": 0.03681829967445478,
"grad_norm": 6.246356964111328,
"learning_rate": 6.56386425634207e-05,
"loss": 2.155,
"step": 639
},
{
"epoch": 0.03687591829679352,
"grad_norm": 5.68143367767334,
"learning_rate": 6.554477648417657e-05,
"loss": 1.6199,
"step": 640
},
{
"epoch": 0.036933536919132266,
"grad_norm": 7.0699462890625,
"learning_rate": 6.545084971874738e-05,
"loss": 2.1567,
"step": 641
},
{
"epoch": 0.036991155541471005,
"grad_norm": 7.515526294708252,
"learning_rate": 6.535686263381944e-05,
"loss": 1.9459,
"step": 642
},
{
"epoch": 0.03704877416380974,
"grad_norm": 8.385588645935059,
"learning_rate": 6.526281559631458e-05,
"loss": 2.1835,
"step": 643
},
{
"epoch": 0.03710639278614848,
"grad_norm": 5.948019027709961,
"learning_rate": 6.516870897338863e-05,
"loss": 1.4861,
"step": 644
},
{
"epoch": 0.03716401140848722,
"grad_norm": 7.62818717956543,
"learning_rate": 6.507454313243015e-05,
"loss": 1.8994,
"step": 645
},
{
"epoch": 0.037221630030825965,
"grad_norm": 9.67705249786377,
"learning_rate": 6.498031844105876e-05,
"loss": 2.7569,
"step": 646
},
{
"epoch": 0.037279248653164704,
"grad_norm": 8.240128517150879,
"learning_rate": 6.48860352671239e-05,
"loss": 1.4679,
"step": 647
},
{
"epoch": 0.03733686727550344,
"grad_norm": 9.882831573486328,
"learning_rate": 6.47916939787033e-05,
"loss": 1.7527,
"step": 648
},
{
"epoch": 0.03739448589784218,
"grad_norm": 10.188104629516602,
"learning_rate": 6.469729494410158e-05,
"loss": 1.9761,
"step": 649
},
{
"epoch": 0.03745210452018092,
"grad_norm": 11.413439750671387,
"learning_rate": 6.460283853184879e-05,
"loss": 1.4839,
"step": 650
},
{
"epoch": 0.037509723142519664,
"grad_norm": 2.5987234115600586,
"learning_rate": 6.450832511069897e-05,
"loss": 2.0529,
"step": 651
},
{
"epoch": 0.0375673417648584,
"grad_norm": 2.6533713340759277,
"learning_rate": 6.441375504962877e-05,
"loss": 1.5965,
"step": 652
},
{
"epoch": 0.03762496038719714,
"grad_norm": 2.695966958999634,
"learning_rate": 6.431912871783586e-05,
"loss": 1.6689,
"step": 653
},
{
"epoch": 0.03768257900953588,
"grad_norm": 2.7431528568267822,
"learning_rate": 6.42244464847377e-05,
"loss": 1.9135,
"step": 654
},
{
"epoch": 0.037740197631874625,
"grad_norm": 3.1441702842712402,
"learning_rate": 6.412970871996995e-05,
"loss": 1.4693,
"step": 655
},
{
"epoch": 0.037797816254213364,
"grad_norm": 3.565042018890381,
"learning_rate": 6.403491579338499e-05,
"loss": 2.2943,
"step": 656
},
{
"epoch": 0.0378554348765521,
"grad_norm": 3.5387003421783447,
"learning_rate": 6.394006807505067e-05,
"loss": 1.7283,
"step": 657
},
{
"epoch": 0.03791305349889084,
"grad_norm": 3.4925148487091064,
"learning_rate": 6.384516593524865e-05,
"loss": 1.9741,
"step": 658
},
{
"epoch": 0.03797067212122958,
"grad_norm": 3.06790828704834,
"learning_rate": 6.37502097444731e-05,
"loss": 1.3718,
"step": 659
},
{
"epoch": 0.038028290743568324,
"grad_norm": 4.412354469299316,
"learning_rate": 6.365519987342917e-05,
"loss": 2.289,
"step": 660
},
{
"epoch": 0.03808590936590706,
"grad_norm": 3.799346446990967,
"learning_rate": 6.356013669303162e-05,
"loss": 1.9566,
"step": 661
},
{
"epoch": 0.0381435279882458,
"grad_norm": 4.237002372741699,
"learning_rate": 6.346502057440327e-05,
"loss": 2.1939,
"step": 662
},
{
"epoch": 0.03820114661058454,
"grad_norm": 4.741201877593994,
"learning_rate": 6.336985188887366e-05,
"loss": 2.0036,
"step": 663
},
{
"epoch": 0.03825876523292328,
"grad_norm": 5.16001558303833,
"learning_rate": 6.327463100797755e-05,
"loss": 2.5171,
"step": 664
},
{
"epoch": 0.03831638385526202,
"grad_norm": 5.431231498718262,
"learning_rate": 6.317935830345338e-05,
"loss": 2.7797,
"step": 665
},
{
"epoch": 0.03837400247760076,
"grad_norm": 4.70787239074707,
"learning_rate": 6.308403414724205e-05,
"loss": 1.9478,
"step": 666
},
{
"epoch": 0.0384316210999395,
"grad_norm": 6.642090320587158,
"learning_rate": 6.298865891148518e-05,
"loss": 2.7287,
"step": 667
},
{
"epoch": 0.03848923972227824,
"grad_norm": 6.608595371246338,
"learning_rate": 6.289323296852393e-05,
"loss": 2.1449,
"step": 668
},
{
"epoch": 0.03854685834461698,
"grad_norm": 5.075540065765381,
"learning_rate": 6.279775669089733e-05,
"loss": 1.7336,
"step": 669
},
{
"epoch": 0.03860447696695572,
"grad_norm": 3.6733851432800293,
"learning_rate": 6.270223045134096e-05,
"loss": 1.1928,
"step": 670
},
{
"epoch": 0.03866209558929446,
"grad_norm": 5.309694290161133,
"learning_rate": 6.260665462278544e-05,
"loss": 2.4931,
"step": 671
},
{
"epoch": 0.0387197142116332,
"grad_norm": 5.460869312286377,
"learning_rate": 6.251102957835493e-05,
"loss": 2.0893,
"step": 672
},
{
"epoch": 0.03877733283397194,
"grad_norm": 5.319972038269043,
"learning_rate": 6.241535569136584e-05,
"loss": 2.0432,
"step": 673
},
{
"epoch": 0.038834951456310676,
"grad_norm": 4.768582820892334,
"learning_rate": 6.231963333532515e-05,
"loss": 1.7141,
"step": 674
},
{
"epoch": 0.03889257007864942,
"grad_norm": 4.945594787597656,
"learning_rate": 6.222386288392913e-05,
"loss": 1.7553,
"step": 675
},
{
"epoch": 0.03895018870098816,
"grad_norm": 5.558775901794434,
"learning_rate": 6.212804471106178e-05,
"loss": 1.7735,
"step": 676
},
{
"epoch": 0.0390078073233269,
"grad_norm": 5.731858730316162,
"learning_rate": 6.203217919079342e-05,
"loss": 1.9911,
"step": 677
},
{
"epoch": 0.03906542594566564,
"grad_norm": 6.50372314453125,
"learning_rate": 6.193626669737919e-05,
"loss": 2.3704,
"step": 678
},
{
"epoch": 0.03912304456800438,
"grad_norm": 5.412557601928711,
"learning_rate": 6.184030760525764e-05,
"loss": 2.041,
"step": 679
},
{
"epoch": 0.03918066319034312,
"grad_norm": 5.677639484405518,
"learning_rate": 6.174430228904919e-05,
"loss": 1.9809,
"step": 680
},
{
"epoch": 0.03923828181268186,
"grad_norm": 5.83066463470459,
"learning_rate": 6.164825112355477e-05,
"loss": 1.9982,
"step": 681
},
{
"epoch": 0.0392959004350206,
"grad_norm": 6.060110569000244,
"learning_rate": 6.155215448375432e-05,
"loss": 2.1433,
"step": 682
},
{
"epoch": 0.039353519057359336,
"grad_norm": 6.152813911437988,
"learning_rate": 6.145601274480521e-05,
"loss": 2.3143,
"step": 683
},
{
"epoch": 0.03941113767969808,
"grad_norm": 5.819025993347168,
"learning_rate": 6.1359826282041e-05,
"loss": 1.7178,
"step": 684
},
{
"epoch": 0.03946875630203682,
"grad_norm": 5.957003116607666,
"learning_rate": 6.126359547096975e-05,
"loss": 1.8914,
"step": 685
},
{
"epoch": 0.03952637492437556,
"grad_norm": 6.525628089904785,
"learning_rate": 6.116732068727271e-05,
"loss": 2.3946,
"step": 686
},
{
"epoch": 0.0395839935467143,
"grad_norm": 6.202828884124756,
"learning_rate": 6.107100230680279e-05,
"loss": 1.887,
"step": 687
},
{
"epoch": 0.039641612169053035,
"grad_norm": 7.1572136878967285,
"learning_rate": 6.0974640705583067e-05,
"loss": 2.4102,
"step": 688
},
{
"epoch": 0.03969923079139178,
"grad_norm": 6.172741889953613,
"learning_rate": 6.0878236259805396e-05,
"loss": 2.0151,
"step": 689
},
{
"epoch": 0.03975684941373052,
"grad_norm": 5.8097920417785645,
"learning_rate": 6.078178934582885e-05,
"loss": 1.8821,
"step": 690
},
{
"epoch": 0.03981446803606926,
"grad_norm": 6.6389007568359375,
"learning_rate": 6.068530034017835e-05,
"loss": 1.4544,
"step": 691
},
{
"epoch": 0.039872086658407996,
"grad_norm": 6.814727306365967,
"learning_rate": 6.0588769619543086e-05,
"loss": 1.9099,
"step": 692
},
{
"epoch": 0.039929705280746734,
"grad_norm": 6.089106559753418,
"learning_rate": 6.049219756077514e-05,
"loss": 1.3365,
"step": 693
},
{
"epoch": 0.03998732390308548,
"grad_norm": 8.091950416564941,
"learning_rate": 6.0395584540887963e-05,
"loss": 2.1427,
"step": 694
},
{
"epoch": 0.04004494252542422,
"grad_norm": 8.129999160766602,
"learning_rate": 6.029893093705492e-05,
"loss": 2.7911,
"step": 695
},
{
"epoch": 0.040102561147762956,
"grad_norm": 8.245060920715332,
"learning_rate": 6.020223712660781e-05,
"loss": 1.9638,
"step": 696
},
{
"epoch": 0.040160179770101695,
"grad_norm": 7.976203918457031,
"learning_rate": 6.010550348703538e-05,
"loss": 1.7136,
"step": 697
},
{
"epoch": 0.04021779839244043,
"grad_norm": 8.391220092773438,
"learning_rate": 6.00087303959819e-05,
"loss": 1.6179,
"step": 698
},
{
"epoch": 0.04027541701477918,
"grad_norm": 10.28807544708252,
"learning_rate": 5.991191823124565e-05,
"loss": 2.1265,
"step": 699
},
{
"epoch": 0.04033303563711792,
"grad_norm": 9.866781234741211,
"learning_rate": 5.981506737077744e-05,
"loss": 1.1792,
"step": 700
},
{
"epoch": 0.040390654259456656,
"grad_norm": 2.2414968013763428,
"learning_rate": 5.971817819267913e-05,
"loss": 1.4382,
"step": 701
},
{
"epoch": 0.040448272881795394,
"grad_norm": 2.4282901287078857,
"learning_rate": 5.962125107520223e-05,
"loss": 1.4688,
"step": 702
},
{
"epoch": 0.04050589150413414,
"grad_norm": 2.4506328105926514,
"learning_rate": 5.952428639674632e-05,
"loss": 1.2702,
"step": 703
},
{
"epoch": 0.04056351012647288,
"grad_norm": 2.9661779403686523,
"learning_rate": 5.942728453585759e-05,
"loss": 1.9094,
"step": 704
},
{
"epoch": 0.040621128748811616,
"grad_norm": 3.062246799468994,
"learning_rate": 5.9330245871227454e-05,
"loss": 1.8171,
"step": 705
},
{
"epoch": 0.040678747371150355,
"grad_norm": 3.2363357543945312,
"learning_rate": 5.923317078169096e-05,
"loss": 1.527,
"step": 706
},
{
"epoch": 0.04073636599348909,
"grad_norm": 3.971209764480591,
"learning_rate": 5.9136059646225375e-05,
"loss": 2.4536,
"step": 707
},
{
"epoch": 0.04079398461582784,
"grad_norm": 4.222631931304932,
"learning_rate": 5.903891284394868e-05,
"loss": 2.2317,
"step": 708
},
{
"epoch": 0.04085160323816658,
"grad_norm": 3.7410428524017334,
"learning_rate": 5.8941730754118116e-05,
"loss": 2.0362,
"step": 709
},
{
"epoch": 0.040909221860505315,
"grad_norm": 3.920046806335449,
"learning_rate": 5.884451375612865e-05,
"loss": 1.7865,
"step": 710
},
{
"epoch": 0.040966840482844054,
"grad_norm": 4.062409400939941,
"learning_rate": 5.874726222951157e-05,
"loss": 2.2063,
"step": 711
},
{
"epoch": 0.04102445910518279,
"grad_norm": 3.557114839553833,
"learning_rate": 5.864997655393293e-05,
"loss": 1.4106,
"step": 712
},
{
"epoch": 0.04108207772752154,
"grad_norm": 4.949502944946289,
"learning_rate": 5.855265710919211e-05,
"loss": 2.3281,
"step": 713
},
{
"epoch": 0.041139696349860276,
"grad_norm": 4.479298114776611,
"learning_rate": 5.845530427522034e-05,
"loss": 2.0265,
"step": 714
},
{
"epoch": 0.041197314972199015,
"grad_norm": 4.422474384307861,
"learning_rate": 5.835791843207916e-05,
"loss": 1.7982,
"step": 715
},
{
"epoch": 0.04125493359453775,
"grad_norm": 5.5531487464904785,
"learning_rate": 5.826049995995905e-05,
"loss": 2.2113,
"step": 716
},
{
"epoch": 0.04131255221687649,
"grad_norm": 4.909703254699707,
"learning_rate": 5.8163049239177784e-05,
"loss": 2.1528,
"step": 717
},
{
"epoch": 0.04137017083921524,
"grad_norm": 5.808315753936768,
"learning_rate": 5.8065566650179084e-05,
"loss": 2.0995,
"step": 718
},
{
"epoch": 0.041427789461553975,
"grad_norm": 6.395657062530518,
"learning_rate": 5.7968052573531084e-05,
"loss": 2.522,
"step": 719
},
{
"epoch": 0.041485408083892714,
"grad_norm": 4.918295383453369,
"learning_rate": 5.787050738992482e-05,
"loss": 2.073,
"step": 720
},
{
"epoch": 0.04154302670623145,
"grad_norm": 5.549355506896973,
"learning_rate": 5.7772931480172795e-05,
"loss": 1.6686,
"step": 721
},
{
"epoch": 0.04160064532857019,
"grad_norm": 5.22744607925415,
"learning_rate": 5.7675325225207455e-05,
"loss": 1.3958,
"step": 722
},
{
"epoch": 0.041658263950908936,
"grad_norm": 5.747786998748779,
"learning_rate": 5.757768900607972e-05,
"loss": 1.8335,
"step": 723
},
{
"epoch": 0.041715882573247674,
"grad_norm": 5.3394365310668945,
"learning_rate": 5.748002320395746e-05,
"loss": 2.0888,
"step": 724
},
{
"epoch": 0.04177350119558641,
"grad_norm": 6.7308030128479,
"learning_rate": 5.738232820012407e-05,
"loss": 2.0689,
"step": 725
},
{
"epoch": 0.04183111981792515,
"grad_norm": 5.232996463775635,
"learning_rate": 5.7284604375976913e-05,
"loss": 1.6801,
"step": 726
},
{
"epoch": 0.0418887384402639,
"grad_norm": 5.374958515167236,
"learning_rate": 5.718685211302592e-05,
"loss": 1.9967,
"step": 727
},
{
"epoch": 0.041946357062602635,
"grad_norm": 5.936054706573486,
"learning_rate": 5.7089071792892e-05,
"loss": 1.8651,
"step": 728
},
{
"epoch": 0.04200397568494137,
"grad_norm": 6.371306419372559,
"learning_rate": 5.699126379730559e-05,
"loss": 2.7011,
"step": 729
},
{
"epoch": 0.04206159430728011,
"grad_norm": 5.820079803466797,
"learning_rate": 5.6893428508105225e-05,
"loss": 2.0199,
"step": 730
},
{
"epoch": 0.04211921292961885,
"grad_norm": 6.234492301940918,
"learning_rate": 5.6795566307235915e-05,
"loss": 1.7485,
"step": 731
},
{
"epoch": 0.042176831551957596,
"grad_norm": 5.691740036010742,
"learning_rate": 5.669767757674781e-05,
"loss": 2.0969,
"step": 732
},
{
"epoch": 0.042234450174296334,
"grad_norm": 6.614067554473877,
"learning_rate": 5.6599762698794554e-05,
"loss": 2.0251,
"step": 733
},
{
"epoch": 0.04229206879663507,
"grad_norm": 6.4943528175354,
"learning_rate": 5.650182205563197e-05,
"loss": 2.1946,
"step": 734
},
{
"epoch": 0.04234968741897381,
"grad_norm": 5.918944358825684,
"learning_rate": 5.640385602961634e-05,
"loss": 1.9158,
"step": 735
},
{
"epoch": 0.04240730604131255,
"grad_norm": 7.2071075439453125,
"learning_rate": 5.630586500320317e-05,
"loss": 1.7421,
"step": 736
},
{
"epoch": 0.042464924663651295,
"grad_norm": 6.350695610046387,
"learning_rate": 5.620784935894547e-05,
"loss": 1.7505,
"step": 737
},
{
"epoch": 0.04252254328599003,
"grad_norm": 5.942063808441162,
"learning_rate": 5.610980947949239e-05,
"loss": 1.3805,
"step": 738
},
{
"epoch": 0.04258016190832877,
"grad_norm": 7.537656307220459,
"learning_rate": 5.601174574758771e-05,
"loss": 1.8985,
"step": 739
},
{
"epoch": 0.04263778053066751,
"grad_norm": 7.9401092529296875,
"learning_rate": 5.5913658546068295e-05,
"loss": 2.085,
"step": 740
},
{
"epoch": 0.04269539915300625,
"grad_norm": 7.036352157592773,
"learning_rate": 5.581554825786267e-05,
"loss": 1.505,
"step": 741
},
{
"epoch": 0.042753017775344994,
"grad_norm": 8.461213111877441,
"learning_rate": 5.571741526598946e-05,
"loss": 2.4647,
"step": 742
},
{
"epoch": 0.04281063639768373,
"grad_norm": 7.788630485534668,
"learning_rate": 5.5619259953555945e-05,
"loss": 1.9391,
"step": 743
},
{
"epoch": 0.04286825502002247,
"grad_norm": 8.280106544494629,
"learning_rate": 5.552108270375654e-05,
"loss": 1.6975,
"step": 744
},
{
"epoch": 0.04292587364236121,
"grad_norm": 7.514588356018066,
"learning_rate": 5.5422883899871284e-05,
"loss": 1.3445,
"step": 745
},
{
"epoch": 0.04298349226469995,
"grad_norm": 8.3132905960083,
"learning_rate": 5.532466392526438e-05,
"loss": 1.3476,
"step": 746
},
{
"epoch": 0.04304111088703869,
"grad_norm": 9.681577682495117,
"learning_rate": 5.522642316338268e-05,
"loss": 1.8898,
"step": 747
},
{
"epoch": 0.04309872950937743,
"grad_norm": 7.265344142913818,
"learning_rate": 5.512816199775419e-05,
"loss": 1.155,
"step": 748
},
{
"epoch": 0.04315634813171617,
"grad_norm": 9.20679759979248,
"learning_rate": 5.5029880811986544e-05,
"loss": 1.4141,
"step": 749
},
{
"epoch": 0.04321396675405491,
"grad_norm": 10.745830535888672,
"learning_rate": 5.493157998976559e-05,
"loss": 1.2941,
"step": 750
},
{
"epoch": 0.043271585376393654,
"grad_norm": 2.210343360900879,
"learning_rate": 5.483325991485379e-05,
"loss": 1.7011,
"step": 751
},
{
"epoch": 0.04332920399873239,
"grad_norm": 2.6614551544189453,
"learning_rate": 5.473492097108877e-05,
"loss": 1.5224,
"step": 752
},
{
"epoch": 0.04338682262107113,
"grad_norm": 3.0926589965820312,
"learning_rate": 5.463656354238184e-05,
"loss": 1.9224,
"step": 753
},
{
"epoch": 0.04344444124340987,
"grad_norm": 2.831848382949829,
"learning_rate": 5.453818801271645e-05,
"loss": 1.7133,
"step": 754
},
{
"epoch": 0.04350205986574861,
"grad_norm": 3.15789532661438,
"learning_rate": 5.4439794766146746e-05,
"loss": 1.6763,
"step": 755
},
{
"epoch": 0.04355967848808735,
"grad_norm": 3.2974679470062256,
"learning_rate": 5.434138418679602e-05,
"loss": 1.6811,
"step": 756
},
{
"epoch": 0.04361729711042609,
"grad_norm": 3.381722927093506,
"learning_rate": 5.424295665885523e-05,
"loss": 1.3443,
"step": 757
},
{
"epoch": 0.04367491573276483,
"grad_norm": 3.899580478668213,
"learning_rate": 5.414451256658149e-05,
"loss": 2.1862,
"step": 758
},
{
"epoch": 0.04373253435510357,
"grad_norm": 4.60366153717041,
"learning_rate": 5.404605229429664e-05,
"loss": 2.3095,
"step": 759
},
{
"epoch": 0.043790152977442307,
"grad_norm": 4.542125701904297,
"learning_rate": 5.39475762263856e-05,
"loss": 1.9147,
"step": 760
},
{
"epoch": 0.04384777159978105,
"grad_norm": 4.373798370361328,
"learning_rate": 5.384908474729501e-05,
"loss": 2.3165,
"step": 761
},
{
"epoch": 0.04390539022211979,
"grad_norm": 5.214663982391357,
"learning_rate": 5.3750578241531655e-05,
"loss": 2.1573,
"step": 762
},
{
"epoch": 0.04396300884445853,
"grad_norm": 4.394557476043701,
"learning_rate": 5.365205709366099e-05,
"loss": 1.85,
"step": 763
},
{
"epoch": 0.04402062746679727,
"grad_norm": 4.53248405456543,
"learning_rate": 5.355352168830565e-05,
"loss": 1.5839,
"step": 764
},
{
"epoch": 0.044078246089136006,
"grad_norm": 5.163122177124023,
"learning_rate": 5.34549724101439e-05,
"loss": 2.2099,
"step": 765
},
{
"epoch": 0.04413586471147475,
"grad_norm": 4.584179878234863,
"learning_rate": 5.335640964390818e-05,
"loss": 1.8161,
"step": 766
},
{
"epoch": 0.04419348333381349,
"grad_norm": 5.658686637878418,
"learning_rate": 5.325783377438357e-05,
"loss": 1.7721,
"step": 767
},
{
"epoch": 0.04425110195615223,
"grad_norm": 5.498464107513428,
"learning_rate": 5.315924518640634e-05,
"loss": 2.2003,
"step": 768
},
{
"epoch": 0.044308720578490966,
"grad_norm": 5.744102478027344,
"learning_rate": 5.306064426486237e-05,
"loss": 2.4985,
"step": 769
},
{
"epoch": 0.044366339200829705,
"grad_norm": 6.669624328613281,
"learning_rate": 5.296203139468572e-05,
"loss": 2.7098,
"step": 770
},
{
"epoch": 0.04442395782316845,
"grad_norm": 7.279657363891602,
"learning_rate": 5.286340696085709e-05,
"loss": 2.5139,
"step": 771
},
{
"epoch": 0.04448157644550719,
"grad_norm": 7.794008255004883,
"learning_rate": 5.276477134840231e-05,
"loss": 1.9609,
"step": 772
},
{
"epoch": 0.04453919506784593,
"grad_norm": 6.10450553894043,
"learning_rate": 5.266612494239088e-05,
"loss": 2.069,
"step": 773
},
{
"epoch": 0.044596813690184665,
"grad_norm": 6.048048496246338,
"learning_rate": 5.2567468127934406e-05,
"loss": 1.6962,
"step": 774
},
{
"epoch": 0.04465443231252341,
"grad_norm": 4.730360984802246,
"learning_rate": 5.246880129018516e-05,
"loss": 1.3367,
"step": 775
},
{
"epoch": 0.04471205093486215,
"grad_norm": 6.04632043838501,
"learning_rate": 5.2370124814334495e-05,
"loss": 1.9,
"step": 776
},
{
"epoch": 0.04476966955720089,
"grad_norm": 5.302633285522461,
"learning_rate": 5.227143908561145e-05,
"loss": 1.3987,
"step": 777
},
{
"epoch": 0.044827288179539626,
"grad_norm": 5.701929569244385,
"learning_rate": 5.2172744489281145e-05,
"loss": 1.7886,
"step": 778
},
{
"epoch": 0.044884906801878365,
"grad_norm": 6.474079608917236,
"learning_rate": 5.207404141064334e-05,
"loss": 2.1494,
"step": 779
},
{
"epoch": 0.04494252542421711,
"grad_norm": 6.047230243682861,
"learning_rate": 5.197533023503089e-05,
"loss": 2.034,
"step": 780
},
{
"epoch": 0.04500014404655585,
"grad_norm": 7.410451889038086,
"learning_rate": 5.187661134780829e-05,
"loss": 2.4321,
"step": 781
},
{
"epoch": 0.04505776266889459,
"grad_norm": 7.621194362640381,
"learning_rate": 5.177788513437013e-05,
"loss": 1.7415,
"step": 782
},
{
"epoch": 0.045115381291233325,
"grad_norm": 6.933146953582764,
"learning_rate": 5.1679151980139564e-05,
"loss": 1.8105,
"step": 783
},
{
"epoch": 0.045172999913572064,
"grad_norm": 6.192483425140381,
"learning_rate": 5.158041227056689e-05,
"loss": 1.96,
"step": 784
},
{
"epoch": 0.04523061853591081,
"grad_norm": 6.742107391357422,
"learning_rate": 5.148166639112799e-05,
"loss": 1.8433,
"step": 785
},
{
"epoch": 0.04528823715824955,
"grad_norm": 7.034168720245361,
"learning_rate": 5.1382914727322815e-05,
"loss": 1.6494,
"step": 786
},
{
"epoch": 0.045345855780588286,
"grad_norm": 8.540271759033203,
"learning_rate": 5.128415766467392e-05,
"loss": 1.8065,
"step": 787
},
{
"epoch": 0.045403474402927024,
"grad_norm": 6.65033483505249,
"learning_rate": 5.118539558872488e-05,
"loss": 1.3827,
"step": 788
},
{
"epoch": 0.04546109302526576,
"grad_norm": 7.016667366027832,
"learning_rate": 5.1086628885038946e-05,
"loss": 1.6569,
"step": 789
},
{
"epoch": 0.04551871164760451,
"grad_norm": 7.630545139312744,
"learning_rate": 5.0987857939197324e-05,
"loss": 1.7671,
"step": 790
},
{
"epoch": 0.04557633026994325,
"grad_norm": 7.821313858032227,
"learning_rate": 5.0889083136797875e-05,
"loss": 2.0906,
"step": 791
},
{
"epoch": 0.045633948892281985,
"grad_norm": 7.7678022384643555,
"learning_rate": 5.079030486345345e-05,
"loss": 1.7571,
"step": 792
},
{
"epoch": 0.04569156751462072,
"grad_norm": 7.932411193847656,
"learning_rate": 5.0691523504790474e-05,
"loss": 1.6625,
"step": 793
},
{
"epoch": 0.04574918613695946,
"grad_norm": 7.604271411895752,
"learning_rate": 5.0592739446447425e-05,
"loss": 1.4029,
"step": 794
},
{
"epoch": 0.04580680475929821,
"grad_norm": 8.554786682128906,
"learning_rate": 5.049395307407329e-05,
"loss": 1.7333,
"step": 795
},
{
"epoch": 0.045864423381636946,
"grad_norm": 8.877863883972168,
"learning_rate": 5.039516477332614e-05,
"loss": 1.6434,
"step": 796
},
{
"epoch": 0.045922042003975684,
"grad_norm": 10.24761962890625,
"learning_rate": 5.029637492987153e-05,
"loss": 1.9937,
"step": 797
},
{
"epoch": 0.04597966062631442,
"grad_norm": 9.682822227478027,
"learning_rate": 5.0197583929381066e-05,
"loss": 1.5388,
"step": 798
},
{
"epoch": 0.04603727924865317,
"grad_norm": 9.582611083984375,
"learning_rate": 5.009879215753085e-05,
"loss": 1.8009,
"step": 799
},
{
"epoch": 0.046094897870991906,
"grad_norm": 10.857091903686523,
"learning_rate": 5e-05,
"loss": 1.1765,
"step": 800
},
{
"epoch": 0.046152516493330645,
"grad_norm": 2.3776369094848633,
"learning_rate": 4.990120784246917e-05,
"loss": 1.6999,
"step": 801
},
{
"epoch": 0.04621013511566938,
"grad_norm": 2.6144723892211914,
"learning_rate": 4.980241607061894e-05,
"loss": 1.7165,
"step": 802
},
{
"epoch": 0.04626775373800812,
"grad_norm": 3.149681806564331,
"learning_rate": 4.970362507012848e-05,
"loss": 1.7084,
"step": 803
},
{
"epoch": 0.04632537236034687,
"grad_norm": 2.9937732219696045,
"learning_rate": 4.960483522667387e-05,
"loss": 1.5925,
"step": 804
},
{
"epoch": 0.046382990982685605,
"grad_norm": 3.3111348152160645,
"learning_rate": 4.950604692592672e-05,
"loss": 1.7231,
"step": 805
},
{
"epoch": 0.046440609605024344,
"grad_norm": 3.3941500186920166,
"learning_rate": 4.940726055355259e-05,
"loss": 1.5797,
"step": 806
},
{
"epoch": 0.04649822822736308,
"grad_norm": 4.20961856842041,
"learning_rate": 4.9308476495209544e-05,
"loss": 2.0995,
"step": 807
},
{
"epoch": 0.04655584684970182,
"grad_norm": 3.890392780303955,
"learning_rate": 4.920969513654656e-05,
"loss": 1.9789,
"step": 808
},
{
"epoch": 0.046613465472040566,
"grad_norm": 3.589712619781494,
"learning_rate": 4.911091686320213e-05,
"loss": 1.4081,
"step": 809
},
{
"epoch": 0.046671084094379305,
"grad_norm": 3.8748233318328857,
"learning_rate": 4.901214206080269e-05,
"loss": 1.6858,
"step": 810
},
{
"epoch": 0.04672870271671804,
"grad_norm": 4.754543304443359,
"learning_rate": 4.891337111496107e-05,
"loss": 2.3168,
"step": 811
},
{
"epoch": 0.04678632133905678,
"grad_norm": 4.800637722015381,
"learning_rate": 4.881460441127513e-05,
"loss": 2.3802,
"step": 812
},
{
"epoch": 0.04684393996139552,
"grad_norm": 4.492969512939453,
"learning_rate": 4.87158423353261e-05,
"loss": 1.8456,
"step": 813
},
{
"epoch": 0.046901558583734265,
"grad_norm": 4.266040325164795,
"learning_rate": 4.86170852726772e-05,
"loss": 1.6628,
"step": 814
},
{
"epoch": 0.046959177206073004,
"grad_norm": 6.2340192794799805,
"learning_rate": 4.851833360887201e-05,
"loss": 2.4876,
"step": 815
},
{
"epoch": 0.04701679582841174,
"grad_norm": 5.668268203735352,
"learning_rate": 4.841958772943311e-05,
"loss": 2.4621,
"step": 816
},
{
"epoch": 0.04707441445075048,
"grad_norm": 5.882351875305176,
"learning_rate": 4.8320848019860454e-05,
"loss": 2.2458,
"step": 817
},
{
"epoch": 0.047132033073089226,
"grad_norm": 6.402505397796631,
"learning_rate": 4.8222114865629886e-05,
"loss": 2.5436,
"step": 818
},
{
"epoch": 0.047189651695427964,
"grad_norm": 5.669656276702881,
"learning_rate": 4.8123388652191715e-05,
"loss": 2.0122,
"step": 819
},
{
"epoch": 0.0472472703177667,
"grad_norm": 5.495424747467041,
"learning_rate": 4.802466976496911e-05,
"loss": 2.0122,
"step": 820
},
{
"epoch": 0.04730488894010544,
"grad_norm": 6.105950832366943,
"learning_rate": 4.7925958589356675e-05,
"loss": 2.0106,
"step": 821
},
{
"epoch": 0.04736250756244418,
"grad_norm": 6.784826755523682,
"learning_rate": 4.782725551071886e-05,
"loss": 2.2259,
"step": 822
},
{
"epoch": 0.047420126184782925,
"grad_norm": 5.131880760192871,
"learning_rate": 4.7728560914388566e-05,
"loss": 1.9704,
"step": 823
},
{
"epoch": 0.04747774480712166,
"grad_norm": 5.995504379272461,
"learning_rate": 4.762987518566551e-05,
"loss": 2.1256,
"step": 824
},
{
"epoch": 0.0475353634294604,
"grad_norm": 6.144118309020996,
"learning_rate": 4.7531198709814854e-05,
"loss": 1.8394,
"step": 825
},
{
"epoch": 0.04759298205179914,
"grad_norm": 5.513754367828369,
"learning_rate": 4.7432531872065605e-05,
"loss": 1.8083,
"step": 826
},
{
"epoch": 0.04765060067413788,
"grad_norm": 6.709282398223877,
"learning_rate": 4.7333875057609126e-05,
"loss": 2.1379,
"step": 827
},
{
"epoch": 0.047708219296476624,
"grad_norm": 7.151832103729248,
"learning_rate": 4.7235228651597694e-05,
"loss": 2.2792,
"step": 828
},
{
"epoch": 0.04776583791881536,
"grad_norm": 5.135676860809326,
"learning_rate": 4.713659303914292e-05,
"loss": 1.348,
"step": 829
},
{
"epoch": 0.0478234565411541,
"grad_norm": 7.304006576538086,
"learning_rate": 4.703796860531429e-05,
"loss": 2.339,
"step": 830
},
{
"epoch": 0.04788107516349284,
"grad_norm": 6.4769463539123535,
"learning_rate": 4.693935573513764e-05,
"loss": 1.6826,
"step": 831
},
{
"epoch": 0.04793869378583158,
"grad_norm": 6.738263130187988,
"learning_rate": 4.684075481359368e-05,
"loss": 1.6978,
"step": 832
},
{
"epoch": 0.04799631240817032,
"grad_norm": 5.58188009262085,
"learning_rate": 4.674216622561644e-05,
"loss": 1.5867,
"step": 833
},
{
"epoch": 0.04805393103050906,
"grad_norm": 6.230498313903809,
"learning_rate": 4.664359035609183e-05,
"loss": 1.6449,
"step": 834
},
{
"epoch": 0.0481115496528478,
"grad_norm": 7.621301174163818,
"learning_rate": 4.654502758985611e-05,
"loss": 1.8639,
"step": 835
},
{
"epoch": 0.04816916827518654,
"grad_norm": 7.118239402770996,
"learning_rate": 4.644647831169435e-05,
"loss": 1.8299,
"step": 836
},
{
"epoch": 0.04822678689752528,
"grad_norm": 8.021576881408691,
"learning_rate": 4.6347942906339015e-05,
"loss": 1.475,
"step": 837
},
{
"epoch": 0.04828440551986402,
"grad_norm": 8.040701866149902,
"learning_rate": 4.624942175846835e-05,
"loss": 1.5401,
"step": 838
},
{
"epoch": 0.04834202414220276,
"grad_norm": 8.714917182922363,
"learning_rate": 4.615091525270501e-05,
"loss": 2.2524,
"step": 839
},
{
"epoch": 0.0483996427645415,
"grad_norm": 7.86495304107666,
"learning_rate": 4.6052423773614404e-05,
"loss": 2.0126,
"step": 840
},
{
"epoch": 0.04845726138688024,
"grad_norm": 8.5813627243042,
"learning_rate": 4.595394770570337e-05,
"loss": 1.6492,
"step": 841
},
{
"epoch": 0.04851488000921898,
"grad_norm": 7.526153564453125,
"learning_rate": 4.585548743341851e-05,
"loss": 1.3797,
"step": 842
},
{
"epoch": 0.04857249863155772,
"grad_norm": 7.555289268493652,
"learning_rate": 4.575704334114478e-05,
"loss": 1.4509,
"step": 843
},
{
"epoch": 0.04863011725389646,
"grad_norm": 8.157431602478027,
"learning_rate": 4.5658615813204e-05,
"loss": 1.4594,
"step": 844
},
{
"epoch": 0.0486877358762352,
"grad_norm": 8.48002815246582,
"learning_rate": 4.5560205233853266e-05,
"loss": 1.3395,
"step": 845
},
{
"epoch": 0.04874535449857394,
"grad_norm": 8.465682983398438,
"learning_rate": 4.546181198728357e-05,
"loss": 1.6884,
"step": 846
},
{
"epoch": 0.04880297312091268,
"grad_norm": 9.142701148986816,
"learning_rate": 4.5363436457618174e-05,
"loss": 1.5224,
"step": 847
},
{
"epoch": 0.04886059174325142,
"grad_norm": 9.824236869812012,
"learning_rate": 4.5265079028911244e-05,
"loss": 1.2396,
"step": 848
},
{
"epoch": 0.04891821036559016,
"grad_norm": 10.611145973205566,
"learning_rate": 4.516674008514623e-05,
"loss": 1.4435,
"step": 849
},
{
"epoch": 0.0489758289879289,
"grad_norm": 10.943769454956055,
"learning_rate": 4.506842001023442e-05,
"loss": 0.9759,
"step": 850
},
{
"epoch": 0.049033447610267636,
"grad_norm": 2.414503335952759,
"learning_rate": 4.497011918801347e-05,
"loss": 1.4807,
"step": 851
},
{
"epoch": 0.04909106623260638,
"grad_norm": 3.1250979900360107,
"learning_rate": 4.487183800224583e-05,
"loss": 1.9241,
"step": 852
},
{
"epoch": 0.04914868485494512,
"grad_norm": 3.0261995792388916,
"learning_rate": 4.477357683661734e-05,
"loss": 1.4566,
"step": 853
},
{
"epoch": 0.04920630347728386,
"grad_norm": 3.0255727767944336,
"learning_rate": 4.467533607473563e-05,
"loss": 1.399,
"step": 854
},
{
"epoch": 0.0492639220996226,
"grad_norm": 3.5942564010620117,
"learning_rate": 4.4577116100128735e-05,
"loss": 2.0399,
"step": 855
},
{
"epoch": 0.049321540721961335,
"grad_norm": 3.998257637023926,
"learning_rate": 4.447891729624347e-05,
"loss": 2.2571,
"step": 856
},
{
"epoch": 0.04937915934430008,
"grad_norm": 3.902076244354248,
"learning_rate": 4.4380740046444066e-05,
"loss": 1.6864,
"step": 857
},
{
"epoch": 0.04943677796663882,
"grad_norm": 3.9801523685455322,
"learning_rate": 4.4282584734010554e-05,
"loss": 1.6063,
"step": 858
},
{
"epoch": 0.04949439658897756,
"grad_norm": 4.512144565582275,
"learning_rate": 4.418445174213734e-05,
"loss": 2.057,
"step": 859
},
{
"epoch": 0.049552015211316296,
"grad_norm": 4.7217864990234375,
"learning_rate": 4.4086341453931716e-05,
"loss": 2.1109,
"step": 860
},
{
"epoch": 0.049609633833655034,
"grad_norm": 4.4047369956970215,
"learning_rate": 4.39882542524123e-05,
"loss": 1.8576,
"step": 861
},
{
"epoch": 0.04966725245599378,
"grad_norm": 5.217447280883789,
"learning_rate": 4.389019052050762e-05,
"loss": 1.7991,
"step": 862
},
{
"epoch": 0.04972487107833252,
"grad_norm": 4.7313737869262695,
"learning_rate": 4.379215064105454e-05,
"loss": 2.0482,
"step": 863
},
{
"epoch": 0.049782489700671256,
"grad_norm": 5.718055248260498,
"learning_rate": 4.369413499679684e-05,
"loss": 2.4613,
"step": 864
},
{
"epoch": 0.049840108323009995,
"grad_norm": 6.294350624084473,
"learning_rate": 4.3596143970383664e-05,
"loss": 3.0555,
"step": 865
},
{
"epoch": 0.04989772694534874,
"grad_norm": 5.583549499511719,
"learning_rate": 4.3498177944368044e-05,
"loss": 1.7864,
"step": 866
},
{
"epoch": 0.04995534556768748,
"grad_norm": 6.036923408508301,
"learning_rate": 4.340023730120545e-05,
"loss": 1.8766,
"step": 867
},
{
"epoch": 0.05001296419002622,
"grad_norm": 6.711114883422852,
"learning_rate": 4.330232242325221e-05,
"loss": 2.4449,
"step": 868
},
{
"epoch": 0.050070582812364955,
"grad_norm": 5.418303966522217,
"learning_rate": 4.3204433692764096e-05,
"loss": 1.5403,
"step": 869
},
{
"epoch": 0.050128201434703694,
"grad_norm": 7.412199020385742,
"learning_rate": 4.3106571491894786e-05,
"loss": 2.0685,
"step": 870
},
{
"epoch": 0.05018582005704244,
"grad_norm": 10.131074905395508,
"learning_rate": 4.3008736202694414e-05,
"loss": 3.293,
"step": 871
},
{
"epoch": 0.05024343867938118,
"grad_norm": 5.617569446563721,
"learning_rate": 4.2910928207108005e-05,
"loss": 1.2099,
"step": 872
},
{
"epoch": 0.050301057301719916,
"grad_norm": 6.808053016662598,
"learning_rate": 4.281314788697408e-05,
"loss": 2.0044,
"step": 873
},
{
"epoch": 0.050358675924058655,
"grad_norm": 6.922706604003906,
"learning_rate": 4.271539562402309e-05,
"loss": 2.1682,
"step": 874
},
{
"epoch": 0.05041629454639739,
"grad_norm": 7.202422142028809,
"learning_rate": 4.2617671799875944e-05,
"loss": 2.2611,
"step": 875
},
{
"epoch": 0.05047391316873614,
"grad_norm": 6.542942523956299,
"learning_rate": 4.2519976796042554e-05,
"loss": 1.4059,
"step": 876
},
{
"epoch": 0.05053153179107488,
"grad_norm": 5.763952732086182,
"learning_rate": 4.242231099392029e-05,
"loss": 1.495,
"step": 877
},
{
"epoch": 0.050589150413413615,
"grad_norm": 7.699655532836914,
"learning_rate": 4.232467477479255e-05,
"loss": 2.405,
"step": 878
},
{
"epoch": 0.050646769035752354,
"grad_norm": 6.270684242248535,
"learning_rate": 4.222706851982721e-05,
"loss": 1.573,
"step": 879
},
{
"epoch": 0.05070438765809109,
"grad_norm": 7.307977676391602,
"learning_rate": 4.212949261007519e-05,
"loss": 1.8651,
"step": 880
},
{
"epoch": 0.05076200628042984,
"grad_norm": 7.067574977874756,
"learning_rate": 4.203194742646893e-05,
"loss": 1.7772,
"step": 881
},
{
"epoch": 0.050819624902768576,
"grad_norm": 6.7210564613342285,
"learning_rate": 4.193443334982093e-05,
"loss": 1.5807,
"step": 882
},
{
"epoch": 0.050877243525107314,
"grad_norm": 6.2213921546936035,
"learning_rate": 4.1836950760822235e-05,
"loss": 1.3346,
"step": 883
},
{
"epoch": 0.05093486214744605,
"grad_norm": 8.103436470031738,
"learning_rate": 4.1739500040040964e-05,
"loss": 2.0769,
"step": 884
},
{
"epoch": 0.05099248076978479,
"grad_norm": 7.473268985748291,
"learning_rate": 4.1642081567920846e-05,
"loss": 1.672,
"step": 885
},
{
"epoch": 0.05105009939212354,
"grad_norm": 6.710503578186035,
"learning_rate": 4.154469572477967e-05,
"loss": 1.5179,
"step": 886
},
{
"epoch": 0.051107718014462275,
"grad_norm": 7.717504024505615,
"learning_rate": 4.144734289080791e-05,
"loss": 1.4799,
"step": 887
},
{
"epoch": 0.05116533663680101,
"grad_norm": 7.528170585632324,
"learning_rate": 4.1350023446067084e-05,
"loss": 1.4937,
"step": 888
},
{
"epoch": 0.05122295525913975,
"grad_norm": 8.319198608398438,
"learning_rate": 4.125273777048845e-05,
"loss": 2.0336,
"step": 889
},
{
"epoch": 0.0512805738814785,
"grad_norm": 6.443837642669678,
"learning_rate": 4.115548624387137e-05,
"loss": 1.4709,
"step": 890
},
{
"epoch": 0.051338192503817236,
"grad_norm": 8.02401065826416,
"learning_rate": 4.1058269245881896e-05,
"loss": 1.2308,
"step": 891
},
{
"epoch": 0.051395811126155974,
"grad_norm": 8.2232666015625,
"learning_rate": 4.0961087156051334e-05,
"loss": 1.6498,
"step": 892
},
{
"epoch": 0.05145342974849471,
"grad_norm": 11.053596496582031,
"learning_rate": 4.086394035377463e-05,
"loss": 1.7836,
"step": 893
},
{
"epoch": 0.05151104837083345,
"grad_norm": 7.8841376304626465,
"learning_rate": 4.076682921830906e-05,
"loss": 1.3964,
"step": 894
},
{
"epoch": 0.051568666993172196,
"grad_norm": 8.288019180297852,
"learning_rate": 4.066975412877255e-05,
"loss": 1.2453,
"step": 895
},
{
"epoch": 0.051626285615510935,
"grad_norm": 9.921595573425293,
"learning_rate": 4.0572715464142416e-05,
"loss": 1.9608,
"step": 896
},
{
"epoch": 0.05168390423784967,
"grad_norm": 9.58582878112793,
"learning_rate": 4.04757136032537e-05,
"loss": 1.6641,
"step": 897
},
{
"epoch": 0.05174152286018841,
"grad_norm": 9.662057876586914,
"learning_rate": 4.037874892479778e-05,
"loss": 1.3649,
"step": 898
},
{
"epoch": 0.05179914148252715,
"grad_norm": 11.919939994812012,
"learning_rate": 4.028182180732088e-05,
"loss": 1.5265,
"step": 899
},
{
"epoch": 0.051856760104865895,
"grad_norm": 10.740790367126465,
"learning_rate": 4.0184932629222575e-05,
"loss": 1.6014,
"step": 900
},
{
"epoch": 0.051914378727204634,
"grad_norm": 2.332369327545166,
"learning_rate": 4.0088081768754365e-05,
"loss": 1.4444,
"step": 901
},
{
"epoch": 0.05197199734954337,
"grad_norm": 2.783017873764038,
"learning_rate": 3.99912696040181e-05,
"loss": 1.9566,
"step": 902
},
{
"epoch": 0.05202961597188211,
"grad_norm": 2.6983227729797363,
"learning_rate": 3.9894496512964635e-05,
"loss": 1.4608,
"step": 903
},
{
"epoch": 0.05208723459422085,
"grad_norm": 3.0581393241882324,
"learning_rate": 3.97977628733922e-05,
"loss": 1.7021,
"step": 904
},
{
"epoch": 0.052144853216559595,
"grad_norm": 3.1544852256774902,
"learning_rate": 3.970106906294509e-05,
"loss": 1.5047,
"step": 905
},
{
"epoch": 0.05220247183889833,
"grad_norm": 3.3631703853607178,
"learning_rate": 3.960441545911204e-05,
"loss": 1.8593,
"step": 906
},
{
"epoch": 0.05226009046123707,
"grad_norm": 4.259178161621094,
"learning_rate": 3.950780243922486e-05,
"loss": 1.7893,
"step": 907
},
{
"epoch": 0.05231770908357581,
"grad_norm": 3.513627290725708,
"learning_rate": 3.9411230380456925e-05,
"loss": 1.1336,
"step": 908
},
{
"epoch": 0.05237532770591455,
"grad_norm": 4.283318996429443,
"learning_rate": 3.9314699659821666e-05,
"loss": 1.9772,
"step": 909
},
{
"epoch": 0.052432946328253294,
"grad_norm": 4.256377696990967,
"learning_rate": 3.921821065417116e-05,
"loss": 1.5408,
"step": 910
},
{
"epoch": 0.05249056495059203,
"grad_norm": 4.703672885894775,
"learning_rate": 3.9121763740194616e-05,
"loss": 2.2098,
"step": 911
},
{
"epoch": 0.05254818357293077,
"grad_norm": 5.5464372634887695,
"learning_rate": 3.9025359294416945e-05,
"loss": 2.4128,
"step": 912
},
{
"epoch": 0.05260580219526951,
"grad_norm": 5.426605224609375,
"learning_rate": 3.892899769319722e-05,
"loss": 2.3157,
"step": 913
},
{
"epoch": 0.052663420817608254,
"grad_norm": 4.229480266571045,
"learning_rate": 3.8832679312727296e-05,
"loss": 1.1878,
"step": 914
},
{
"epoch": 0.05272103943994699,
"grad_norm": 5.823680877685547,
"learning_rate": 3.873640452903026e-05,
"loss": 2.5378,
"step": 915
},
{
"epoch": 0.05277865806228573,
"grad_norm": 4.739530563354492,
"learning_rate": 3.8640173717959007e-05,
"loss": 1.9838,
"step": 916
},
{
"epoch": 0.05283627668462447,
"grad_norm": 6.260214328765869,
"learning_rate": 3.854398725519479e-05,
"loss": 1.9614,
"step": 917
},
{
"epoch": 0.05289389530696321,
"grad_norm": 6.466116428375244,
"learning_rate": 3.8447845516245695e-05,
"loss": 2.1905,
"step": 918
},
{
"epoch": 0.052951513929301953,
"grad_norm": 6.437125205993652,
"learning_rate": 3.835174887644523e-05,
"loss": 1.9177,
"step": 919
},
{
"epoch": 0.05300913255164069,
"grad_norm": 6.781651496887207,
"learning_rate": 3.825569771095082e-05,
"loss": 2.4333,
"step": 920
},
{
"epoch": 0.05306675117397943,
"grad_norm": 7.853660583496094,
"learning_rate": 3.815969239474238e-05,
"loss": 1.9977,
"step": 921
},
{
"epoch": 0.05312436979631817,
"grad_norm": 5.589831829071045,
"learning_rate": 3.8063733302620826e-05,
"loss": 1.8375,
"step": 922
},
{
"epoch": 0.05318198841865691,
"grad_norm": 4.707467079162598,
"learning_rate": 3.796782080920659e-05,
"loss": 1.3119,
"step": 923
},
{
"epoch": 0.05323960704099565,
"grad_norm": 4.886472702026367,
"learning_rate": 3.787195528893823e-05,
"loss": 1.5678,
"step": 924
},
{
"epoch": 0.05329722566333439,
"grad_norm": 6.364172458648682,
"learning_rate": 3.777613711607087e-05,
"loss": 1.7858,
"step": 925
},
{
"epoch": 0.05335484428567313,
"grad_norm": 6.775904178619385,
"learning_rate": 3.768036666467486e-05,
"loss": 1.7451,
"step": 926
},
{
"epoch": 0.05341246290801187,
"grad_norm": 5.326086044311523,
"learning_rate": 3.758464430863417e-05,
"loss": 1.1407,
"step": 927
},
{
"epoch": 0.053470081530350606,
"grad_norm": 6.35353946685791,
"learning_rate": 3.748897042164508e-05,
"loss": 2.0648,
"step": 928
},
{
"epoch": 0.05352770015268935,
"grad_norm": 6.439608097076416,
"learning_rate": 3.7393345377214586e-05,
"loss": 1.5234,
"step": 929
},
{
"epoch": 0.05358531877502809,
"grad_norm": 6.532753944396973,
"learning_rate": 3.729776954865905e-05,
"loss": 1.7337,
"step": 930
},
{
"epoch": 0.05364293739736683,
"grad_norm": 5.889612674713135,
"learning_rate": 3.720224330910268e-05,
"loss": 1.4711,
"step": 931
},
{
"epoch": 0.05370055601970557,
"grad_norm": 7.272073745727539,
"learning_rate": 3.7106767031476075e-05,
"loss": 1.7975,
"step": 932
},
{
"epoch": 0.053758174642044305,
"grad_norm": 7.865191459655762,
"learning_rate": 3.701134108851483e-05,
"loss": 2.1245,
"step": 933
},
{
"epoch": 0.05381579326438305,
"grad_norm": 6.3021111488342285,
"learning_rate": 3.691596585275797e-05,
"loss": 1.6541,
"step": 934
},
{
"epoch": 0.05387341188672179,
"grad_norm": 6.885900974273682,
"learning_rate": 3.682064169654663e-05,
"loss": 1.6203,
"step": 935
},
{
"epoch": 0.05393103050906053,
"grad_norm": 7.885501861572266,
"learning_rate": 3.672536899202247e-05,
"loss": 1.9761,
"step": 936
},
{
"epoch": 0.053988649131399266,
"grad_norm": 7.633294582366943,
"learning_rate": 3.663014811112634e-05,
"loss": 1.3759,
"step": 937
},
{
"epoch": 0.05404626775373801,
"grad_norm": 6.931453704833984,
"learning_rate": 3.6534979425596743e-05,
"loss": 1.5243,
"step": 938
},
{
"epoch": 0.05410388637607675,
"grad_norm": 8.72075080871582,
"learning_rate": 3.6439863306968395e-05,
"loss": 2.237,
"step": 939
},
{
"epoch": 0.05416150499841549,
"grad_norm": 10.649337768554688,
"learning_rate": 3.6344800126570844e-05,
"loss": 2.0054,
"step": 940
},
{
"epoch": 0.05421912362075423,
"grad_norm": 7.299565315246582,
"learning_rate": 3.6249790255526915e-05,
"loss": 1.406,
"step": 941
},
{
"epoch": 0.054276742243092965,
"grad_norm": 10.546122550964355,
"learning_rate": 3.615483406475137e-05,
"loss": 1.5258,
"step": 942
},
{
"epoch": 0.05433436086543171,
"grad_norm": 8.8644380569458,
"learning_rate": 3.605993192494934e-05,
"loss": 1.5541,
"step": 943
},
{
"epoch": 0.05439197948777045,
"grad_norm": 8.58125114440918,
"learning_rate": 3.5965084206615014e-05,
"loss": 1.5751,
"step": 944
},
{
"epoch": 0.05444959811010919,
"grad_norm": 8.127228736877441,
"learning_rate": 3.587029128003006e-05,
"loss": 1.3735,
"step": 945
},
{
"epoch": 0.054507216732447926,
"grad_norm": 10.732064247131348,
"learning_rate": 3.577555351526229e-05,
"loss": 1.7643,
"step": 946
},
{
"epoch": 0.054564835354786664,
"grad_norm": 9.98898696899414,
"learning_rate": 3.5680871282164144e-05,
"loss": 1.2092,
"step": 947
},
{
"epoch": 0.05462245397712541,
"grad_norm": 10.777913093566895,
"learning_rate": 3.5586244950371247e-05,
"loss": 1.6657,
"step": 948
},
{
"epoch": 0.05468007259946415,
"grad_norm": 9.558432579040527,
"learning_rate": 3.549167488930103e-05,
"loss": 1.4431,
"step": 949
},
{
"epoch": 0.05473769122180289,
"grad_norm": 14.252156257629395,
"learning_rate": 3.539716146815122e-05,
"loss": 1.1596,
"step": 950
},
{
"epoch": 0.054795309844141625,
"grad_norm": 2.5063233375549316,
"learning_rate": 3.5302705055898425e-05,
"loss": 1.7395,
"step": 951
},
{
"epoch": 0.05485292846648036,
"grad_norm": 2.74783992767334,
"learning_rate": 3.52083060212967e-05,
"loss": 1.3586,
"step": 952
},
{
"epoch": 0.05491054708881911,
"grad_norm": 3.001479387283325,
"learning_rate": 3.5113964732876106e-05,
"loss": 2.0306,
"step": 953
},
{
"epoch": 0.05496816571115785,
"grad_norm": 3.1495397090911865,
"learning_rate": 3.5019681558941254e-05,
"loss": 1.8057,
"step": 954
},
{
"epoch": 0.055025784333496586,
"grad_norm": 3.73935604095459,
"learning_rate": 3.492545686756986e-05,
"loss": 2.0167,
"step": 955
},
{
"epoch": 0.055083402955835324,
"grad_norm": 3.437436580657959,
"learning_rate": 3.4831291026611365e-05,
"loss": 1.3825,
"step": 956
},
{
"epoch": 0.05514102157817406,
"grad_norm": 3.4579591751098633,
"learning_rate": 3.473718440368544e-05,
"loss": 1.5541,
"step": 957
},
{
"epoch": 0.05519864020051281,
"grad_norm": 4.052521228790283,
"learning_rate": 3.464313736618058e-05,
"loss": 2.0655,
"step": 958
},
{
"epoch": 0.055256258822851546,
"grad_norm": 4.834951877593994,
"learning_rate": 3.4549150281252636e-05,
"loss": 2.0104,
"step": 959
},
{
"epoch": 0.055313877445190285,
"grad_norm": 4.843097686767578,
"learning_rate": 3.4455223515823446e-05,
"loss": 2.1078,
"step": 960
},
{
"epoch": 0.05537149606752902,
"grad_norm": 5.0431108474731445,
"learning_rate": 3.4361357436579316e-05,
"loss": 2.3052,
"step": 961
},
{
"epoch": 0.05542911468986777,
"grad_norm": 4.58592414855957,
"learning_rate": 3.426755240996969e-05,
"loss": 1.7864,
"step": 962
},
{
"epoch": 0.05548673331220651,
"grad_norm": 4.117121696472168,
"learning_rate": 3.417380880220563e-05,
"loss": 1.1863,
"step": 963
},
{
"epoch": 0.055544351934545245,
"grad_norm": 5.660807132720947,
"learning_rate": 3.4080126979258406e-05,
"loss": 1.8763,
"step": 964
},
{
"epoch": 0.055601970556883984,
"grad_norm": 5.17064094543457,
"learning_rate": 3.3986507306858125e-05,
"loss": 1.9112,
"step": 965
},
{
"epoch": 0.05565958917922272,
"grad_norm": 5.565435409545898,
"learning_rate": 3.3892950150492234e-05,
"loss": 1.9436,
"step": 966
},
{
"epoch": 0.05571720780156147,
"grad_norm": 5.954512119293213,
"learning_rate": 3.379945587540414e-05,
"loss": 2.6872,
"step": 967
},
{
"epoch": 0.055774826423900206,
"grad_norm": 6.699615001678467,
"learning_rate": 3.3706024846591715e-05,
"loss": 2.2461,
"step": 968
},
{
"epoch": 0.055832445046238945,
"grad_norm": 5.078495025634766,
"learning_rate": 3.361265742880599e-05,
"loss": 1.2964,
"step": 969
},
{
"epoch": 0.05589006366857768,
"grad_norm": 6.249274730682373,
"learning_rate": 3.351935398654961e-05,
"loss": 2.1967,
"step": 970
},
{
"epoch": 0.05594768229091642,
"grad_norm": 8.249637603759766,
"learning_rate": 3.342611488407549e-05,
"loss": 2.783,
"step": 971
},
{
"epoch": 0.05600530091325517,
"grad_norm": 7.026087284088135,
"learning_rate": 3.3332940485385336e-05,
"loss": 1.8789,
"step": 972
},
{
"epoch": 0.056062919535593905,
"grad_norm": 6.553802490234375,
"learning_rate": 3.323983115422827e-05,
"loss": 1.7054,
"step": 973
},
{
"epoch": 0.056120538157932644,
"grad_norm": 6.306643486022949,
"learning_rate": 3.314678725409942e-05,
"loss": 1.5129,
"step": 974
},
{
"epoch": 0.05617815678027138,
"grad_norm": 5.8464179039001465,
"learning_rate": 3.3053809148238426e-05,
"loss": 1.559,
"step": 975
},
{
"epoch": 0.05623577540261012,
"grad_norm": 7.461513042449951,
"learning_rate": 3.296089719962809e-05,
"loss": 1.7222,
"step": 976
},
{
"epoch": 0.056293394024948866,
"grad_norm": 5.434988021850586,
"learning_rate": 3.286805177099293e-05,
"loss": 1.2945,
"step": 977
},
{
"epoch": 0.056351012647287604,
"grad_norm": 6.823791980743408,
"learning_rate": 3.2775273224797825e-05,
"loss": 1.7969,
"step": 978
},
{
"epoch": 0.05640863126962634,
"grad_norm": 5.9947428703308105,
"learning_rate": 3.268256192324647e-05,
"loss": 1.3397,
"step": 979
},
{
"epoch": 0.05646624989196508,
"grad_norm": 7.0553202629089355,
"learning_rate": 3.258991822828007e-05,
"loss": 1.4833,
"step": 980
},
{
"epoch": 0.05652386851430382,
"grad_norm": 7.974583148956299,
"learning_rate": 3.249734250157592e-05,
"loss": 1.7152,
"step": 981
},
{
"epoch": 0.056581487136642565,
"grad_norm": 6.147339820861816,
"learning_rate": 3.240483510454594e-05,
"loss": 1.4128,
"step": 982
},
{
"epoch": 0.056639105758981304,
"grad_norm": 7.629016876220703,
"learning_rate": 3.231239639833531e-05,
"loss": 1.9712,
"step": 983
},
{
"epoch": 0.05669672438132004,
"grad_norm": 7.414737224578857,
"learning_rate": 3.222002674382103e-05,
"loss": 1.9627,
"step": 984
},
{
"epoch": 0.05675434300365878,
"grad_norm": 6.9845452308654785,
"learning_rate": 3.212772650161056e-05,
"loss": 1.3646,
"step": 985
},
{
"epoch": 0.056811961625997526,
"grad_norm": 7.116814136505127,
"learning_rate": 3.20354960320403e-05,
"loss": 1.3854,
"step": 986
},
{
"epoch": 0.056869580248336264,
"grad_norm": 7.610520839691162,
"learning_rate": 3.1943335695174365e-05,
"loss": 1.5902,
"step": 987
},
{
"epoch": 0.056927198870675,
"grad_norm": 7.150577545166016,
"learning_rate": 3.185124585080299e-05,
"loss": 1.7962,
"step": 988
},
{
"epoch": 0.05698481749301374,
"grad_norm": 7.860030174255371,
"learning_rate": 3.175922685844125e-05,
"loss": 1.3278,
"step": 989
},
{
"epoch": 0.05704243611535248,
"grad_norm": 7.933725833892822,
"learning_rate": 3.16672790773276e-05,
"loss": 1.3638,
"step": 990
},
{
"epoch": 0.057100054737691225,
"grad_norm": 8.873092651367188,
"learning_rate": 3.15754028664225e-05,
"loss": 1.8343,
"step": 991
},
{
"epoch": 0.05715767336002996,
"grad_norm": 8.709065437316895,
"learning_rate": 3.1483598584407006e-05,
"loss": 1.5662,
"step": 992
},
{
"epoch": 0.0572152919823687,
"grad_norm": 7.3778791427612305,
"learning_rate": 3.1391866589681346e-05,
"loss": 1.0271,
"step": 993
},
{
"epoch": 0.05727291060470744,
"grad_norm": 10.030227661132812,
"learning_rate": 3.130020724036357e-05,
"loss": 1.8624,
"step": 994
},
{
"epoch": 0.05733052922704618,
"grad_norm": 9.539932250976562,
"learning_rate": 3.12086208942881e-05,
"loss": 1.8922,
"step": 995
},
{
"epoch": 0.057388147849384924,
"grad_norm": 9.47962474822998,
"learning_rate": 3.1117107909004364e-05,
"loss": 1.8324,
"step": 996
},
{
"epoch": 0.05744576647172366,
"grad_norm": 10.077855110168457,
"learning_rate": 3.10256686417754e-05,
"loss": 1.3001,
"step": 997
},
{
"epoch": 0.0575033850940624,
"grad_norm": 11.914607048034668,
"learning_rate": 3.093430344957643e-05,
"loss": 1.5453,
"step": 998
},
{
"epoch": 0.05756100371640114,
"grad_norm": 10.913701057434082,
"learning_rate": 3.084301268909353e-05,
"loss": 1.6776,
"step": 999
},
{
"epoch": 0.05761862233873988,
"grad_norm": 10.420541763305664,
"learning_rate": 3.075179671672216e-05,
"loss": 1.0552,
"step": 1000
},
{
"epoch": 0.05767624096107862,
"grad_norm": 2.658548593521118,
"learning_rate": 3.0660655888565825e-05,
"loss": 1.6705,
"step": 1001
},
{
"epoch": 0.05773385958341736,
"grad_norm": 2.5908820629119873,
"learning_rate": 3.0569590560434666e-05,
"loss": 1.4523,
"step": 1002
},
{
"epoch": 0.0577914782057561,
"grad_norm": 3.7232770919799805,
"learning_rate": 3.0478601087844096e-05,
"loss": 2.3762,
"step": 1003
},
{
"epoch": 0.05784909682809484,
"grad_norm": 3.3812122344970703,
"learning_rate": 3.038768782601335e-05,
"loss": 1.7777,
"step": 1004
},
{
"epoch": 0.05790671545043358,
"grad_norm": 2.975947618484497,
"learning_rate": 3.0296851129864168e-05,
"loss": 1.198,
"step": 1005
},
{
"epoch": 0.05796433407277232,
"grad_norm": 4.245443344116211,
"learning_rate": 3.0206091354019393e-05,
"loss": 2.2089,
"step": 1006
},
{
"epoch": 0.05802195269511106,
"grad_norm": 3.833874464035034,
"learning_rate": 3.0115408852801535e-05,
"loss": 1.4623,
"step": 1007
},
{
"epoch": 0.0580795713174498,
"grad_norm": 4.235438346862793,
"learning_rate": 3.0024803980231462e-05,
"loss": 1.5174,
"step": 1008
},
{
"epoch": 0.05813718993978854,
"grad_norm": 5.473639488220215,
"learning_rate": 2.9934277090026964e-05,
"loss": 2.5038,
"step": 1009
},
{
"epoch": 0.05819480856212728,
"grad_norm": 4.578322887420654,
"learning_rate": 2.98438285356014e-05,
"loss": 1.6425,
"step": 1010
},
{
"epoch": 0.05825242718446602,
"grad_norm": 4.82335901260376,
"learning_rate": 2.97534586700623e-05,
"loss": 2.1112,
"step": 1011
},
{
"epoch": 0.05831004580680476,
"grad_norm": 4.744386672973633,
"learning_rate": 2.9663167846209998e-05,
"loss": 1.5347,
"step": 1012
},
{
"epoch": 0.0583676644291435,
"grad_norm": 5.088996410369873,
"learning_rate": 2.9572956416536267e-05,
"loss": 1.7957,
"step": 1013
},
{
"epoch": 0.05842528305148224,
"grad_norm": 5.181789875030518,
"learning_rate": 2.94828247332229e-05,
"loss": 1.8015,
"step": 1014
},
{
"epoch": 0.05848290167382098,
"grad_norm": 6.109447002410889,
"learning_rate": 2.9392773148140408e-05,
"loss": 2.1003,
"step": 1015
},
{
"epoch": 0.05854052029615972,
"grad_norm": 5.849494457244873,
"learning_rate": 2.930280201284654e-05,
"loss": 1.7359,
"step": 1016
},
{
"epoch": 0.05859813891849846,
"grad_norm": 6.307842254638672,
"learning_rate": 2.9212911678585043e-05,
"loss": 1.525,
"step": 1017
},
{
"epoch": 0.0586557575408372,
"grad_norm": 5.742959976196289,
"learning_rate": 2.912310249628415e-05,
"loss": 1.3262,
"step": 1018
},
{
"epoch": 0.058713376163175936,
"grad_norm": 6.012917518615723,
"learning_rate": 2.9033374816555338e-05,
"loss": 1.9182,
"step": 1019
},
{
"epoch": 0.05877099478551468,
"grad_norm": 6.901543617248535,
"learning_rate": 2.894372898969186e-05,
"loss": 2.0538,
"step": 1020
},
{
"epoch": 0.05882861340785342,
"grad_norm": 6.39532995223999,
"learning_rate": 2.885416536566744e-05,
"loss": 1.8529,
"step": 1021
},
{
"epoch": 0.05888623203019216,
"grad_norm": 5.355409622192383,
"learning_rate": 2.876468429413487e-05,
"loss": 1.273,
"step": 1022
},
{
"epoch": 0.058943850652530896,
"grad_norm": 7.023514747619629,
"learning_rate": 2.8675286124424693e-05,
"loss": 1.5327,
"step": 1023
},
{
"epoch": 0.059001469274869635,
"grad_norm": 6.53577184677124,
"learning_rate": 2.858597120554378e-05,
"loss": 2.0575,
"step": 1024
},
{
"epoch": 0.05905908789720838,
"grad_norm": 7.434081554412842,
"learning_rate": 2.8496739886173995e-05,
"loss": 1.7331,
"step": 1025
},
{
"epoch": 0.05911670651954712,
"grad_norm": 7.439203262329102,
"learning_rate": 2.8407592514670812e-05,
"loss": 1.8409,
"step": 1026
},
{
"epoch": 0.05917432514188586,
"grad_norm": 6.7967000007629395,
"learning_rate": 2.8318529439062035e-05,
"loss": 1.4355,
"step": 1027
},
{
"epoch": 0.059231943764224596,
"grad_norm": 7.580361366271973,
"learning_rate": 2.822955100704634e-05,
"loss": 2.057,
"step": 1028
},
{
"epoch": 0.059289562386563334,
"grad_norm": 6.600391387939453,
"learning_rate": 2.8140657565991958e-05,
"loss": 1.7373,
"step": 1029
},
{
"epoch": 0.05934718100890208,
"grad_norm": 7.101778507232666,
"learning_rate": 2.805184946293532e-05,
"loss": 1.8316,
"step": 1030
},
{
"epoch": 0.05940479963124082,
"grad_norm": 6.378438472747803,
"learning_rate": 2.7963127044579697e-05,
"loss": 1.5304,
"step": 1031
},
{
"epoch": 0.059462418253579556,
"grad_norm": 7.539405345916748,
"learning_rate": 2.787449065729388e-05,
"loss": 1.6764,
"step": 1032
},
{
"epoch": 0.059520036875918295,
"grad_norm": 8.427040100097656,
"learning_rate": 2.7785940647110763e-05,
"loss": 1.9864,
"step": 1033
},
{
"epoch": 0.05957765549825704,
"grad_norm": 7.389402866363525,
"learning_rate": 2.7697477359726047e-05,
"loss": 1.4137,
"step": 1034
},
{
"epoch": 0.05963527412059578,
"grad_norm": 8.637269973754883,
"learning_rate": 2.7609101140496863e-05,
"loss": 1.7589,
"step": 1035
},
{
"epoch": 0.05969289274293452,
"grad_norm": 8.280218124389648,
"learning_rate": 2.752081233444041e-05,
"loss": 1.7994,
"step": 1036
},
{
"epoch": 0.059750511365273255,
"grad_norm": 7.711471080780029,
"learning_rate": 2.743261128623269e-05,
"loss": 1.3804,
"step": 1037
},
{
"epoch": 0.059808129987611994,
"grad_norm": 9.472826957702637,
"learning_rate": 2.7344498340207053e-05,
"loss": 1.6227,
"step": 1038
},
{
"epoch": 0.05986574860995074,
"grad_norm": 8.125463485717773,
"learning_rate": 2.72564738403529e-05,
"loss": 1.077,
"step": 1039
},
{
"epoch": 0.05992336723228948,
"grad_norm": 9.9483003616333,
"learning_rate": 2.716853813031435e-05,
"loss": 1.8788,
"step": 1040
},
{
"epoch": 0.059980985854628216,
"grad_norm": 8.082881927490234,
"learning_rate": 2.708069155338892e-05,
"loss": 1.1196,
"step": 1041
},
{
"epoch": 0.060038604476966954,
"grad_norm": 12.251283645629883,
"learning_rate": 2.699293445252612e-05,
"loss": 2.2599,
"step": 1042
},
{
"epoch": 0.06009622309930569,
"grad_norm": 8.400115966796875,
"learning_rate": 2.6905267170326143e-05,
"loss": 1.2582,
"step": 1043
},
{
"epoch": 0.06015384172164444,
"grad_norm": 8.842986106872559,
"learning_rate": 2.6817690049038568e-05,
"loss": 1.3316,
"step": 1044
},
{
"epoch": 0.06021146034398318,
"grad_norm": 8.60519790649414,
"learning_rate": 2.6730203430560947e-05,
"loss": 0.9415,
"step": 1045
},
{
"epoch": 0.060269078966321915,
"grad_norm": 11.412800788879395,
"learning_rate": 2.6642807656437562e-05,
"loss": 1.9491,
"step": 1046
},
{
"epoch": 0.060326697588660654,
"grad_norm": 11.116186141967773,
"learning_rate": 2.6555503067858013e-05,
"loss": 1.3879,
"step": 1047
},
{
"epoch": 0.06038431621099939,
"grad_norm": 11.536842346191406,
"learning_rate": 2.646829000565591e-05,
"loss": 1.1674,
"step": 1048
},
{
"epoch": 0.06044193483333814,
"grad_norm": 8.568710327148438,
"learning_rate": 2.6381168810307533e-05,
"loss": 0.5686,
"step": 1049
},
{
"epoch": 0.060499553455676876,
"grad_norm": 15.451824188232422,
"learning_rate": 2.6294139821930597e-05,
"loss": 1.5875,
"step": 1050
},
{
"epoch": 0.060557172078015614,
"grad_norm": 2.566513776779175,
"learning_rate": 2.6207203380282747e-05,
"loss": 1.8438,
"step": 1051
},
{
"epoch": 0.06061479070035435,
"grad_norm": 2.6996757984161377,
"learning_rate": 2.6120359824760388e-05,
"loss": 1.3146,
"step": 1052
},
{
"epoch": 0.06067240932269309,
"grad_norm": 2.9751899242401123,
"learning_rate": 2.603360949439727e-05,
"loss": 1.098,
"step": 1053
},
{
"epoch": 0.060730027945031836,
"grad_norm": 3.1023969650268555,
"learning_rate": 2.59469527278632e-05,
"loss": 1.202,
"step": 1054
},
{
"epoch": 0.060787646567370575,
"grad_norm": 3.937774658203125,
"learning_rate": 2.5860389863462765e-05,
"loss": 1.6293,
"step": 1055
},
{
"epoch": 0.06084526518970931,
"grad_norm": 5.0030341148376465,
"learning_rate": 2.5773921239133896e-05,
"loss": 1.7704,
"step": 1056
},
{
"epoch": 0.06090288381204805,
"grad_norm": 3.9582436084747314,
"learning_rate": 2.5687547192446647e-05,
"loss": 1.3183,
"step": 1057
},
{
"epoch": 0.0609605024343868,
"grad_norm": 4.461833953857422,
"learning_rate": 2.5601268060601813e-05,
"loss": 1.7695,
"step": 1058
},
{
"epoch": 0.061018121056725536,
"grad_norm": 4.458665370941162,
"learning_rate": 2.5515084180429716e-05,
"loss": 2.2326,
"step": 1059
},
{
"epoch": 0.061075739679064274,
"grad_norm": 4.708508014678955,
"learning_rate": 2.542899588838875e-05,
"loss": 1.5088,
"step": 1060
},
{
"epoch": 0.06113335830140301,
"grad_norm": 5.549221515655518,
"learning_rate": 2.5343003520564158e-05,
"loss": 2.204,
"step": 1061
},
{
"epoch": 0.06119097692374175,
"grad_norm": 4.788711071014404,
"learning_rate": 2.5257107412666708e-05,
"loss": 1.3883,
"step": 1062
},
{
"epoch": 0.061248595546080496,
"grad_norm": 5.306790351867676,
"learning_rate": 2.5171307900031345e-05,
"loss": 1.67,
"step": 1063
},
{
"epoch": 0.061306214168419235,
"grad_norm": 4.873342514038086,
"learning_rate": 2.508560531761597e-05,
"loss": 1.7895,
"step": 1064
},
{
"epoch": 0.06136383279075797,
"grad_norm": 5.944344997406006,
"learning_rate": 2.500000000000001e-05,
"loss": 1.9887,
"step": 1065
},
{
"epoch": 0.06142145141309671,
"grad_norm": 6.517125606536865,
"learning_rate": 2.4914492281383205e-05,
"loss": 1.938,
"step": 1066
},
{
"epoch": 0.06147907003543545,
"grad_norm": 6.3307366371154785,
"learning_rate": 2.4829082495584242e-05,
"loss": 2.5917,
"step": 1067
},
{
"epoch": 0.061536688657774195,
"grad_norm": 6.442718029022217,
"learning_rate": 2.4743770976039544e-05,
"loss": 1.7924,
"step": 1068
},
{
"epoch": 0.061594307280112934,
"grad_norm": 6.157473087310791,
"learning_rate": 2.465855805580185e-05,
"loss": 1.8482,
"step": 1069
},
{
"epoch": 0.06165192590245167,
"grad_norm": 7.638395309448242,
"learning_rate": 2.4573444067538986e-05,
"loss": 2.3381,
"step": 1070
},
{
"epoch": 0.06170954452479041,
"grad_norm": 8.15482234954834,
"learning_rate": 2.448842934353256e-05,
"loss": 2.0042,
"step": 1071
},
{
"epoch": 0.06176716314712915,
"grad_norm": 7.776936054229736,
"learning_rate": 2.440351421567663e-05,
"loss": 1.5819,
"step": 1072
},
{
"epoch": 0.061824781769467894,
"grad_norm": 6.04269552230835,
"learning_rate": 2.4318699015476493e-05,
"loss": 1.1962,
"step": 1073
},
{
"epoch": 0.06188240039180663,
"grad_norm": 6.812227249145508,
"learning_rate": 2.4233984074047273e-05,
"loss": 1.5325,
"step": 1074
},
{
"epoch": 0.06194001901414537,
"grad_norm": 6.410821914672852,
"learning_rate": 2.414936972211272e-05,
"loss": 1.9043,
"step": 1075
},
{
"epoch": 0.06199763763648411,
"grad_norm": 5.87167501449585,
"learning_rate": 2.406485629000386e-05,
"loss": 1.0289,
"step": 1076
},
{
"epoch": 0.06205525625882285,
"grad_norm": 6.149838924407959,
"learning_rate": 2.3980444107657747e-05,
"loss": 1.5482,
"step": 1077
},
{
"epoch": 0.062112874881161594,
"grad_norm": 6.023894786834717,
"learning_rate": 2.3896133504616185e-05,
"loss": 1.0708,
"step": 1078
},
{
"epoch": 0.06217049350350033,
"grad_norm": 6.564477920532227,
"learning_rate": 2.3811924810024384e-05,
"loss": 1.2915,
"step": 1079
},
{
"epoch": 0.06222811212583907,
"grad_norm": 8.50880241394043,
"learning_rate": 2.3727818352629712e-05,
"loss": 2.1412,
"step": 1080
},
{
"epoch": 0.06228573074817781,
"grad_norm": 7.971811294555664,
"learning_rate": 2.3643814460780394e-05,
"loss": 1.7807,
"step": 1081
},
{
"epoch": 0.062343349370516554,
"grad_norm": 7.353085994720459,
"learning_rate": 2.35599134624243e-05,
"loss": 1.2867,
"step": 1082
},
{
"epoch": 0.06240096799285529,
"grad_norm": 6.922592639923096,
"learning_rate": 2.347611568510754e-05,
"loss": 1.1274,
"step": 1083
},
{
"epoch": 0.06245858661519403,
"grad_norm": 9.213545799255371,
"learning_rate": 2.33924214559733e-05,
"loss": 1.886,
"step": 1084
},
{
"epoch": 0.06251620523753278,
"grad_norm": 7.079417705535889,
"learning_rate": 2.3308831101760486e-05,
"loss": 1.4452,
"step": 1085
},
{
"epoch": 0.06257382385987151,
"grad_norm": 8.823144912719727,
"learning_rate": 2.322534494880248e-05,
"loss": 1.5925,
"step": 1086
},
{
"epoch": 0.06263144248221025,
"grad_norm": 9.065716743469238,
"learning_rate": 2.3141963323025916e-05,
"loss": 1.537,
"step": 1087
},
{
"epoch": 0.06268906110454898,
"grad_norm": 8.086875915527344,
"learning_rate": 2.3058686549949306e-05,
"loss": 1.2424,
"step": 1088
},
{
"epoch": 0.06274667972688773,
"grad_norm": 9.3955659866333,
"learning_rate": 2.2975514954681838e-05,
"loss": 1.8639,
"step": 1089
},
{
"epoch": 0.06280429834922648,
"grad_norm": 8.69000244140625,
"learning_rate": 2.289244886192207e-05,
"loss": 1.445,
"step": 1090
},
{
"epoch": 0.06286191697156521,
"grad_norm": 9.602193832397461,
"learning_rate": 2.2809488595956745e-05,
"loss": 1.6213,
"step": 1091
},
{
"epoch": 0.06291953559390395,
"grad_norm": 7.488504886627197,
"learning_rate": 2.2726634480659403e-05,
"loss": 1.0232,
"step": 1092
},
{
"epoch": 0.06297715421624268,
"grad_norm": 10.171971321105957,
"learning_rate": 2.264388683948918e-05,
"loss": 1.5033,
"step": 1093
},
{
"epoch": 0.06303477283858143,
"grad_norm": 10.572342872619629,
"learning_rate": 2.2561245995489566e-05,
"loss": 1.5857,
"step": 1094
},
{
"epoch": 0.06309239146092017,
"grad_norm": 10.64458179473877,
"learning_rate": 2.247871227128709e-05,
"loss": 1.5704,
"step": 1095
},
{
"epoch": 0.0631500100832589,
"grad_norm": 8.843093872070312,
"learning_rate": 2.2396285989090134e-05,
"loss": 1.0939,
"step": 1096
},
{
"epoch": 0.06320762870559765,
"grad_norm": 11.950446128845215,
"learning_rate": 2.2313967470687593e-05,
"loss": 1.4868,
"step": 1097
},
{
"epoch": 0.06326524732793638,
"grad_norm": 10.60393238067627,
"learning_rate": 2.2231757037447664e-05,
"loss": 1.0224,
"step": 1098
},
{
"epoch": 0.06332286595027513,
"grad_norm": 9.963017463684082,
"learning_rate": 2.2149655010316573e-05,
"loss": 1.263,
"step": 1099
},
{
"epoch": 0.06338048457261387,
"grad_norm": 12.921998023986816,
"learning_rate": 2.2067661709817383e-05,
"loss": 1.6227,
"step": 1100
},
{
"epoch": 0.0634381031949526,
"grad_norm": 2.471111536026001,
"learning_rate": 2.1985777456048633e-05,
"loss": 1.4481,
"step": 1101
},
{
"epoch": 0.06349572181729135,
"grad_norm": 2.563838243484497,
"learning_rate": 2.1904002568683173e-05,
"loss": 1.3153,
"step": 1102
},
{
"epoch": 0.06355334043963008,
"grad_norm": 2.9359986782073975,
"learning_rate": 2.1822337366966898e-05,
"loss": 1.3924,
"step": 1103
},
{
"epoch": 0.06361095906196883,
"grad_norm": 3.37689471244812,
"learning_rate": 2.1740782169717455e-05,
"loss": 1.7816,
"step": 1104
},
{
"epoch": 0.06366857768430757,
"grad_norm": 3.576246976852417,
"learning_rate": 2.1659337295323118e-05,
"loss": 1.4992,
"step": 1105
},
{
"epoch": 0.0637261963066463,
"grad_norm": 3.4290308952331543,
"learning_rate": 2.1578003061741388e-05,
"loss": 1.2094,
"step": 1106
},
{
"epoch": 0.06378381492898505,
"grad_norm": 4.7343549728393555,
"learning_rate": 2.149677978649786e-05,
"loss": 1.853,
"step": 1107
},
{
"epoch": 0.06384143355132378,
"grad_norm": 4.07674503326416,
"learning_rate": 2.1415667786684952e-05,
"loss": 1.4323,
"step": 1108
},
{
"epoch": 0.06389905217366253,
"grad_norm": 4.4910430908203125,
"learning_rate": 2.1334667378960644e-05,
"loss": 1.7015,
"step": 1109
},
{
"epoch": 0.06395667079600127,
"grad_norm": 4.675705432891846,
"learning_rate": 2.125377887954732e-05,
"loss": 2.1419,
"step": 1110
},
{
"epoch": 0.06401428941834,
"grad_norm": 5.743031024932861,
"learning_rate": 2.1173002604230425e-05,
"loss": 2.2031,
"step": 1111
},
{
"epoch": 0.06407190804067875,
"grad_norm": 5.196958065032959,
"learning_rate": 2.1092338868357302e-05,
"loss": 2.089,
"step": 1112
},
{
"epoch": 0.06412952666301748,
"grad_norm": 5.74540376663208,
"learning_rate": 2.1011787986835934e-05,
"loss": 1.8015,
"step": 1113
},
{
"epoch": 0.06418714528535623,
"grad_norm": 6.175201892852783,
"learning_rate": 2.093135027413377e-05,
"loss": 2.084,
"step": 1114
},
{
"epoch": 0.06424476390769497,
"grad_norm": 6.138916492462158,
"learning_rate": 2.0851026044276406e-05,
"loss": 1.4566,
"step": 1115
},
{
"epoch": 0.0643023825300337,
"grad_norm": 5.053371906280518,
"learning_rate": 2.0770815610846418e-05,
"loss": 1.4747,
"step": 1116
},
{
"epoch": 0.06436000115237245,
"grad_norm": 6.439314842224121,
"learning_rate": 2.0690719286982125e-05,
"loss": 2.0671,
"step": 1117
},
{
"epoch": 0.0644176197747112,
"grad_norm": 7.404539585113525,
"learning_rate": 2.061073738537635e-05,
"loss": 2.1802,
"step": 1118
},
{
"epoch": 0.06447523839704992,
"grad_norm": 6.88596773147583,
"learning_rate": 2.0530870218275273e-05,
"loss": 2.0121,
"step": 1119
},
{
"epoch": 0.06453285701938867,
"grad_norm": 7.417169094085693,
"learning_rate": 2.0451118097477094e-05,
"loss": 1.9572,
"step": 1120
},
{
"epoch": 0.0645904756417274,
"grad_norm": 7.945450782775879,
"learning_rate": 2.0371481334330912e-05,
"loss": 2.0368,
"step": 1121
},
{
"epoch": 0.06464809426406615,
"grad_norm": 8.752837181091309,
"learning_rate": 2.0291960239735436e-05,
"loss": 2.2519,
"step": 1122
},
{
"epoch": 0.06470571288640489,
"grad_norm": 5.975473403930664,
"learning_rate": 2.0212555124137866e-05,
"loss": 1.8471,
"step": 1123
},
{
"epoch": 0.06476333150874362,
"grad_norm": 6.761491775512695,
"learning_rate": 2.0133266297532587e-05,
"loss": 1.274,
"step": 1124
},
{
"epoch": 0.06482095013108237,
"grad_norm": 6.702766418457031,
"learning_rate": 2.005409406946e-05,
"loss": 1.3058,
"step": 1125
},
{
"epoch": 0.0648785687534211,
"grad_norm": 6.298551559448242,
"learning_rate": 1.9975038749005314e-05,
"loss": 1.4427,
"step": 1126
},
{
"epoch": 0.06493618737575985,
"grad_norm": 6.6965155601501465,
"learning_rate": 1.9896100644797317e-05,
"loss": 1.4374,
"step": 1127
},
{
"epoch": 0.06499380599809859,
"grad_norm": 6.686706066131592,
"learning_rate": 1.9817280065007244e-05,
"loss": 1.4331,
"step": 1128
},
{
"epoch": 0.06505142462043732,
"grad_norm": 6.5872650146484375,
"learning_rate": 1.973857731734746e-05,
"loss": 1.251,
"step": 1129
},
{
"epoch": 0.06510904324277607,
"grad_norm": 7.749248027801514,
"learning_rate": 1.9659992709070345e-05,
"loss": 1.9952,
"step": 1130
},
{
"epoch": 0.0651666618651148,
"grad_norm": 6.611239910125732,
"learning_rate": 1.958152654696705e-05,
"loss": 1.3524,
"step": 1131
},
{
"epoch": 0.06522428048745355,
"grad_norm": 6.317110061645508,
"learning_rate": 1.950317913736636e-05,
"loss": 1.4627,
"step": 1132
},
{
"epoch": 0.06528189910979229,
"grad_norm": 8.021782875061035,
"learning_rate": 1.942495078613341e-05,
"loss": 1.5733,
"step": 1133
},
{
"epoch": 0.06533951773213102,
"grad_norm": 7.358135223388672,
"learning_rate": 1.9346841798668553e-05,
"loss": 1.2235,
"step": 1134
},
{
"epoch": 0.06539713635446977,
"grad_norm": 9.270915985107422,
"learning_rate": 1.9268852479906147e-05,
"loss": 1.6454,
"step": 1135
},
{
"epoch": 0.0654547549768085,
"grad_norm": 8.690032958984375,
"learning_rate": 1.919098313431335e-05,
"loss": 1.4711,
"step": 1136
},
{
"epoch": 0.06551237359914724,
"grad_norm": 10.629873275756836,
"learning_rate": 1.9113234065889014e-05,
"loss": 1.7747,
"step": 1137
},
{
"epoch": 0.06556999222148599,
"grad_norm": 8.434743881225586,
"learning_rate": 1.9035605578162364e-05,
"loss": 1.4637,
"step": 1138
},
{
"epoch": 0.06562761084382472,
"grad_norm": 9.257351875305176,
"learning_rate": 1.8958097974191907e-05,
"loss": 1.168,
"step": 1139
},
{
"epoch": 0.06568522946616347,
"grad_norm": 10.043082237243652,
"learning_rate": 1.888071155656421e-05,
"loss": 1.8828,
"step": 1140
},
{
"epoch": 0.0657428480885022,
"grad_norm": 9.596894264221191,
"learning_rate": 1.8803446627392797e-05,
"loss": 2.1446,
"step": 1141
},
{
"epoch": 0.06580046671084094,
"grad_norm": 10.21119499206543,
"learning_rate": 1.872630348831682e-05,
"loss": 1.4045,
"step": 1142
},
{
"epoch": 0.06585808533317969,
"grad_norm": 11.253876686096191,
"learning_rate": 1.8649282440500015e-05,
"loss": 2.2187,
"step": 1143
},
{
"epoch": 0.06591570395551842,
"grad_norm": 10.968828201293945,
"learning_rate": 1.8572383784629477e-05,
"loss": 1.6181,
"step": 1144
},
{
"epoch": 0.06597332257785717,
"grad_norm": 11.836702346801758,
"learning_rate": 1.849560782091445e-05,
"loss": 1.7544,
"step": 1145
},
{
"epoch": 0.0660309412001959,
"grad_norm": 10.151345252990723,
"learning_rate": 1.8418954849085263e-05,
"loss": 0.895,
"step": 1146
},
{
"epoch": 0.06608855982253464,
"grad_norm": 8.884134292602539,
"learning_rate": 1.834242516839203e-05,
"loss": 0.7486,
"step": 1147
},
{
"epoch": 0.06614617844487339,
"grad_norm": 10.85288143157959,
"learning_rate": 1.826601907760357e-05,
"loss": 1.7473,
"step": 1148
},
{
"epoch": 0.06620379706721212,
"grad_norm": 10.78147029876709,
"learning_rate": 1.8189736875006185e-05,
"loss": 1.2058,
"step": 1149
},
{
"epoch": 0.06626141568955086,
"grad_norm": 11.327497482299805,
"learning_rate": 1.811357885840254e-05,
"loss": 0.8104,
"step": 1150
},
{
"epoch": 0.0663190343118896,
"grad_norm": 2.801546335220337,
"learning_rate": 1.8037545325110504e-05,
"loss": 1.5449,
"step": 1151
},
{
"epoch": 0.06637665293422834,
"grad_norm": 3.1385998725891113,
"learning_rate": 1.796163657196194e-05,
"loss": 1.8724,
"step": 1152
},
{
"epoch": 0.06643427155656709,
"grad_norm": 3.4718220233917236,
"learning_rate": 1.788585289530158e-05,
"loss": 1.5506,
"step": 1153
},
{
"epoch": 0.06649189017890582,
"grad_norm": 3.5483028888702393,
"learning_rate": 1.781019459098584e-05,
"loss": 1.5494,
"step": 1154
},
{
"epoch": 0.06654950880124456,
"grad_norm": 4.259626388549805,
"learning_rate": 1.7734661954381754e-05,
"loss": 1.6906,
"step": 1155
},
{
"epoch": 0.0666071274235833,
"grad_norm": 4.30315637588501,
"learning_rate": 1.7659255280365682e-05,
"loss": 1.3328,
"step": 1156
},
{
"epoch": 0.06666474604592204,
"grad_norm": 4.78853702545166,
"learning_rate": 1.7583974863322274e-05,
"loss": 1.8545,
"step": 1157
},
{
"epoch": 0.06672236466826079,
"grad_norm": 5.5761332511901855,
"learning_rate": 1.7508820997143254e-05,
"loss": 2.0578,
"step": 1158
},
{
"epoch": 0.06677998329059952,
"grad_norm": 5.004186630249023,
"learning_rate": 1.7433793975226298e-05,
"loss": 1.9804,
"step": 1159
},
{
"epoch": 0.06683760191293826,
"grad_norm": 5.428592681884766,
"learning_rate": 1.7358894090473925e-05,
"loss": 1.3237,
"step": 1160
},
{
"epoch": 0.066895220535277,
"grad_norm": 5.216967582702637,
"learning_rate": 1.728412163529227e-05,
"loss": 1.6406,
"step": 1161
},
{
"epoch": 0.06695283915761574,
"grad_norm": 5.5881028175354,
"learning_rate": 1.720947690159001e-05,
"loss": 1.4753,
"step": 1162
},
{
"epoch": 0.06701045777995449,
"grad_norm": 6.607051372528076,
"learning_rate": 1.713496018077717e-05,
"loss": 1.7072,
"step": 1163
},
{
"epoch": 0.06706807640229322,
"grad_norm": 6.841838359832764,
"learning_rate": 1.7060571763764087e-05,
"loss": 2.2575,
"step": 1164
},
{
"epoch": 0.06712569502463196,
"grad_norm": 6.977603912353516,
"learning_rate": 1.6986311940960147e-05,
"loss": 1.6719,
"step": 1165
},
{
"epoch": 0.06718331364697071,
"grad_norm": 5.838938236236572,
"learning_rate": 1.6912181002272713e-05,
"loss": 1.5608,
"step": 1166
},
{
"epoch": 0.06724093226930944,
"grad_norm": 7.533050060272217,
"learning_rate": 1.6838179237106016e-05,
"loss": 2.5848,
"step": 1167
},
{
"epoch": 0.06729855089164818,
"grad_norm": 6.3877434730529785,
"learning_rate": 1.6764306934359965e-05,
"loss": 1.7937,
"step": 1168
},
{
"epoch": 0.06735616951398692,
"grad_norm": 6.379162788391113,
"learning_rate": 1.66905643824291e-05,
"loss": 1.6522,
"step": 1169
},
{
"epoch": 0.06741378813632566,
"grad_norm": 6.743338108062744,
"learning_rate": 1.661695186920138e-05,
"loss": 1.7207,
"step": 1170
},
{
"epoch": 0.0674714067586644,
"grad_norm": 7.4394917488098145,
"learning_rate": 1.6543469682057106e-05,
"loss": 1.4863,
"step": 1171
},
{
"epoch": 0.06752902538100314,
"grad_norm": 7.081120491027832,
"learning_rate": 1.6470118107867778e-05,
"loss": 1.764,
"step": 1172
},
{
"epoch": 0.06758664400334188,
"grad_norm": 5.379045486450195,
"learning_rate": 1.6396897432995044e-05,
"loss": 1.2481,
"step": 1173
},
{
"epoch": 0.06764426262568062,
"grad_norm": 5.985071659088135,
"learning_rate": 1.6323807943289467e-05,
"loss": 1.3555,
"step": 1174
},
{
"epoch": 0.06770188124801936,
"grad_norm": 6.669476509094238,
"learning_rate": 1.6250849924089484e-05,
"loss": 1.4287,
"step": 1175
},
{
"epoch": 0.0677594998703581,
"grad_norm": 6.918170928955078,
"learning_rate": 1.6178023660220294e-05,
"loss": 1.6579,
"step": 1176
},
{
"epoch": 0.06781711849269684,
"grad_norm": 5.43526554107666,
"learning_rate": 1.6105329435992682e-05,
"loss": 1.0396,
"step": 1177
},
{
"epoch": 0.06787473711503558,
"grad_norm": 6.803481101989746,
"learning_rate": 1.6032767535202043e-05,
"loss": 1.1211,
"step": 1178
},
{
"epoch": 0.06793235573737431,
"grad_norm": 5.908268928527832,
"learning_rate": 1.5960338241127093e-05,
"loss": 1.169,
"step": 1179
},
{
"epoch": 0.06798997435971306,
"grad_norm": 6.827920436859131,
"learning_rate": 1.5888041836528915e-05,
"loss": 1.6863,
"step": 1180
},
{
"epoch": 0.0680475929820518,
"grad_norm": 8.643022537231445,
"learning_rate": 1.581587860364977e-05,
"loss": 1.3782,
"step": 1181
},
{
"epoch": 0.06810521160439054,
"grad_norm": 7.143157005310059,
"learning_rate": 1.5743848824212014e-05,
"loss": 1.5509,
"step": 1182
},
{
"epoch": 0.06816283022672928,
"grad_norm": 9.800533294677734,
"learning_rate": 1.567195277941706e-05,
"loss": 1.4908,
"step": 1183
},
{
"epoch": 0.06822044884906801,
"grad_norm": 10.031082153320312,
"learning_rate": 1.560019074994416e-05,
"loss": 1.7424,
"step": 1184
},
{
"epoch": 0.06827806747140676,
"grad_norm": 8.485883712768555,
"learning_rate": 1.552856301594942e-05,
"loss": 1.3979,
"step": 1185
},
{
"epoch": 0.0683356860937455,
"grad_norm": 7.949470043182373,
"learning_rate": 1.5457069857064623e-05,
"loss": 1.5791,
"step": 1186
},
{
"epoch": 0.06839330471608424,
"grad_norm": 8.45607852935791,
"learning_rate": 1.5385711552396227e-05,
"loss": 1.1772,
"step": 1187
},
{
"epoch": 0.06845092333842298,
"grad_norm": 6.658246040344238,
"learning_rate": 1.5314488380524182e-05,
"loss": 0.7639,
"step": 1188
},
{
"epoch": 0.06850854196076171,
"grad_norm": 8.69353199005127,
"learning_rate": 1.5243400619500903e-05,
"loss": 1.3904,
"step": 1189
},
{
"epoch": 0.06856616058310046,
"grad_norm": 9.834623336791992,
"learning_rate": 1.5172448546850165e-05,
"loss": 1.2404,
"step": 1190
},
{
"epoch": 0.0686237792054392,
"grad_norm": 11.461020469665527,
"learning_rate": 1.5101632439565998e-05,
"loss": 1.5777,
"step": 1191
},
{
"epoch": 0.06868139782777793,
"grad_norm": 9.637110710144043,
"learning_rate": 1.503095257411169e-05,
"loss": 1.4293,
"step": 1192
},
{
"epoch": 0.06873901645011668,
"grad_norm": 11.56747817993164,
"learning_rate": 1.4960409226418576e-05,
"loss": 1.0775,
"step": 1193
},
{
"epoch": 0.06879663507245541,
"grad_norm": 8.425179481506348,
"learning_rate": 1.4890002671885072e-05,
"loss": 0.7696,
"step": 1194
},
{
"epoch": 0.06885425369479416,
"grad_norm": 10.401522636413574,
"learning_rate": 1.4819733185375534e-05,
"loss": 1.54,
"step": 1195
},
{
"epoch": 0.0689118723171329,
"grad_norm": 12.076608657836914,
"learning_rate": 1.4749601041219247e-05,
"loss": 1.5604,
"step": 1196
},
{
"epoch": 0.06896949093947163,
"grad_norm": 11.82298755645752,
"learning_rate": 1.4679606513209283e-05,
"loss": 1.3343,
"step": 1197
},
{
"epoch": 0.06902710956181038,
"grad_norm": 11.132453918457031,
"learning_rate": 1.4609749874601463e-05,
"loss": 1.531,
"step": 1198
},
{
"epoch": 0.06908472818414911,
"grad_norm": 11.432952880859375,
"learning_rate": 1.4540031398113335e-05,
"loss": 0.893,
"step": 1199
},
{
"epoch": 0.06914234680648786,
"grad_norm": 13.103074073791504,
"learning_rate": 1.4470451355923027e-05,
"loss": 0.942,
"step": 1200
},
{
"epoch": 0.0691999654288266,
"grad_norm": 2.807366132736206,
"learning_rate": 1.4401010019668226e-05,
"loss": 1.4142,
"step": 1201
},
{
"epoch": 0.06925758405116533,
"grad_norm": 3.289900541305542,
"learning_rate": 1.4331707660445155e-05,
"loss": 1.5771,
"step": 1202
},
{
"epoch": 0.06931520267350408,
"grad_norm": 3.8797430992126465,
"learning_rate": 1.4262544548807432e-05,
"loss": 1.6511,
"step": 1203
},
{
"epoch": 0.06937282129584281,
"grad_norm": 3.5421643257141113,
"learning_rate": 1.4193520954765083e-05,
"loss": 1.5534,
"step": 1204
},
{
"epoch": 0.06943043991818156,
"grad_norm": 3.7102088928222656,
"learning_rate": 1.4124637147783432e-05,
"loss": 1.5172,
"step": 1205
},
{
"epoch": 0.0694880585405203,
"grad_norm": 4.117602348327637,
"learning_rate": 1.4055893396782143e-05,
"loss": 1.6133,
"step": 1206
},
{
"epoch": 0.06954567716285903,
"grad_norm": 4.356093406677246,
"learning_rate": 1.3987289970134049e-05,
"loss": 1.725,
"step": 1207
},
{
"epoch": 0.06960329578519778,
"grad_norm": 4.699815273284912,
"learning_rate": 1.3918827135664187e-05,
"loss": 1.537,
"step": 1208
},
{
"epoch": 0.06966091440753651,
"grad_norm": 4.650105953216553,
"learning_rate": 1.3850505160648709e-05,
"loss": 1.4926,
"step": 1209
},
{
"epoch": 0.06971853302987525,
"grad_norm": 5.107658863067627,
"learning_rate": 1.378232431181386e-05,
"loss": 1.7179,
"step": 1210
},
{
"epoch": 0.069776151652214,
"grad_norm": 4.739175319671631,
"learning_rate": 1.371428485533498e-05,
"loss": 1.4587,
"step": 1211
},
{
"epoch": 0.06983377027455273,
"grad_norm": 5.423766613006592,
"learning_rate": 1.3646387056835368e-05,
"loss": 1.6938,
"step": 1212
},
{
"epoch": 0.06989138889689148,
"grad_norm": 4.972559452056885,
"learning_rate": 1.3578631181385305e-05,
"loss": 1.4671,
"step": 1213
},
{
"epoch": 0.06994900751923022,
"grad_norm": 4.769227027893066,
"learning_rate": 1.3511017493501005e-05,
"loss": 1.5672,
"step": 1214
},
{
"epoch": 0.07000662614156895,
"grad_norm": 6.4741034507751465,
"learning_rate": 1.3443546257143624e-05,
"loss": 2.4878,
"step": 1215
},
{
"epoch": 0.0700642447639077,
"grad_norm": 6.461265563964844,
"learning_rate": 1.337621773571816e-05,
"loss": 2.1616,
"step": 1216
},
{
"epoch": 0.07012186338624643,
"grad_norm": 6.686141490936279,
"learning_rate": 1.3309032192072463e-05,
"loss": 1.8657,
"step": 1217
},
{
"epoch": 0.07017948200858518,
"grad_norm": 6.7380828857421875,
"learning_rate": 1.3241989888496204e-05,
"loss": 1.9427,
"step": 1218
},
{
"epoch": 0.07023710063092392,
"grad_norm": 6.249390602111816,
"learning_rate": 1.3175091086719832e-05,
"loss": 1.2004,
"step": 1219
},
{
"epoch": 0.07029471925326265,
"grad_norm": 6.980522155761719,
"learning_rate": 1.3108336047913633e-05,
"loss": 2.0509,
"step": 1220
},
{
"epoch": 0.0703523378756014,
"grad_norm": 9.115934371948242,
"learning_rate": 1.304172503268658e-05,
"loss": 2.3806,
"step": 1221
},
{
"epoch": 0.07040995649794013,
"grad_norm": 6.846525192260742,
"learning_rate": 1.297525830108542e-05,
"loss": 1.7004,
"step": 1222
},
{
"epoch": 0.07046757512027887,
"grad_norm": 6.642870903015137,
"learning_rate": 1.29089361125936e-05,
"loss": 1.2044,
"step": 1223
},
{
"epoch": 0.07052519374261762,
"grad_norm": 6.452547073364258,
"learning_rate": 1.2842758726130283e-05,
"loss": 1.5094,
"step": 1224
},
{
"epoch": 0.07058281236495635,
"grad_norm": 5.2349534034729,
"learning_rate": 1.277672640004936e-05,
"loss": 1.0245,
"step": 1225
},
{
"epoch": 0.0706404309872951,
"grad_norm": 6.4428181648254395,
"learning_rate": 1.2710839392138386e-05,
"loss": 1.4617,
"step": 1226
},
{
"epoch": 0.07069804960963383,
"grad_norm": 8.161385536193848,
"learning_rate": 1.2645097959617585e-05,
"loss": 1.5619,
"step": 1227
},
{
"epoch": 0.07075566823197257,
"grad_norm": 7.000401973724365,
"learning_rate": 1.2579502359138872e-05,
"loss": 1.3092,
"step": 1228
},
{
"epoch": 0.07081328685431132,
"grad_norm": 9.008089065551758,
"learning_rate": 1.251405284678488e-05,
"loss": 1.5806,
"step": 1229
},
{
"epoch": 0.07087090547665005,
"grad_norm": 7.795567989349365,
"learning_rate": 1.2448749678067856e-05,
"loss": 1.3938,
"step": 1230
},
{
"epoch": 0.0709285240989888,
"grad_norm": 6.975660800933838,
"learning_rate": 1.238359310792877e-05,
"loss": 1.2721,
"step": 1231
},
{
"epoch": 0.07098614272132753,
"grad_norm": 8.188170433044434,
"learning_rate": 1.2318583390736254e-05,
"loss": 1.572,
"step": 1232
},
{
"epoch": 0.07104376134366627,
"grad_norm": 7.512996673583984,
"learning_rate": 1.2253720780285639e-05,
"loss": 1.3905,
"step": 1233
},
{
"epoch": 0.07110137996600502,
"grad_norm": 8.138297080993652,
"learning_rate": 1.218900552979797e-05,
"loss": 1.621,
"step": 1234
},
{
"epoch": 0.07115899858834375,
"grad_norm": 7.63006591796875,
"learning_rate": 1.2124437891918993e-05,
"loss": 1.459,
"step": 1235
},
{
"epoch": 0.0712166172106825,
"grad_norm": 8.417518615722656,
"learning_rate": 1.206001811871818e-05,
"loss": 1.322,
"step": 1236
},
{
"epoch": 0.07127423583302123,
"grad_norm": 8.257303237915039,
"learning_rate": 1.1995746461687734e-05,
"loss": 1.3934,
"step": 1237
},
{
"epoch": 0.07133185445535997,
"grad_norm": 9.29442024230957,
"learning_rate": 1.1931623171741652e-05,
"loss": 1.727,
"step": 1238
},
{
"epoch": 0.07138947307769872,
"grad_norm": 8.721556663513184,
"learning_rate": 1.186764849921468e-05,
"loss": 1.3479,
"step": 1239
},
{
"epoch": 0.07144709170003745,
"grad_norm": 9.90758991241455,
"learning_rate": 1.1803822693861378e-05,
"loss": 1.537,
"step": 1240
},
{
"epoch": 0.0715047103223762,
"grad_norm": 7.325432777404785,
"learning_rate": 1.174014600485514e-05,
"loss": 0.6619,
"step": 1241
},
{
"epoch": 0.07156232894471493,
"grad_norm": 10.085389137268066,
"learning_rate": 1.1676618680787189e-05,
"loss": 1.4413,
"step": 1242
},
{
"epoch": 0.07161994756705367,
"grad_norm": 9.418313980102539,
"learning_rate": 1.1613240969665685e-05,
"loss": 1.2035,
"step": 1243
},
{
"epoch": 0.07167756618939242,
"grad_norm": 9.015664100646973,
"learning_rate": 1.1550013118914666e-05,
"loss": 1.152,
"step": 1244
},
{
"epoch": 0.07173518481173115,
"grad_norm": 8.91705322265625,
"learning_rate": 1.1486935375373126e-05,
"loss": 1.0203,
"step": 1245
},
{
"epoch": 0.0717928034340699,
"grad_norm": 8.304533958435059,
"learning_rate": 1.1424007985294032e-05,
"loss": 1.0657,
"step": 1246
},
{
"epoch": 0.07185042205640862,
"grad_norm": 10.287208557128906,
"learning_rate": 1.1361231194343436e-05,
"loss": 1.0867,
"step": 1247
},
{
"epoch": 0.07190804067874737,
"grad_norm": 10.810500144958496,
"learning_rate": 1.1298605247599392e-05,
"loss": 1.3672,
"step": 1248
},
{
"epoch": 0.07196565930108612,
"grad_norm": 10.38150691986084,
"learning_rate": 1.1236130389551092e-05,
"loss": 1.2666,
"step": 1249
},
{
"epoch": 0.07202327792342485,
"grad_norm": 14.021896362304688,
"learning_rate": 1.1173806864097886e-05,
"loss": 1.1838,
"step": 1250
},
{
"epoch": 0.07208089654576359,
"grad_norm": 2.665013313293457,
"learning_rate": 1.1111634914548297e-05,
"loss": 1.7504,
"step": 1251
},
{
"epoch": 0.07213851516810232,
"grad_norm": 2.8992791175842285,
"learning_rate": 1.1049614783619162e-05,
"loss": 1.3094,
"step": 1252
},
{
"epoch": 0.07219613379044107,
"grad_norm": 3.9074079990386963,
"learning_rate": 1.0987746713434576e-05,
"loss": 1.8809,
"step": 1253
},
{
"epoch": 0.07225375241277981,
"grad_norm": 3.3540804386138916,
"learning_rate": 1.0926030945525007e-05,
"loss": 1.2304,
"step": 1254
},
{
"epoch": 0.07231137103511855,
"grad_norm": 4.333744525909424,
"learning_rate": 1.0864467720826343e-05,
"loss": 1.8107,
"step": 1255
},
{
"epoch": 0.07236898965745729,
"grad_norm": 4.874741554260254,
"learning_rate": 1.080305727967893e-05,
"loss": 2.1448,
"step": 1256
},
{
"epoch": 0.07242660827979604,
"grad_norm": 5.173941135406494,
"learning_rate": 1.0741799861826706e-05,
"loss": 1.9073,
"step": 1257
},
{
"epoch": 0.07248422690213477,
"grad_norm": 3.9350931644439697,
"learning_rate": 1.0680695706416161e-05,
"loss": 1.3388,
"step": 1258
},
{
"epoch": 0.07254184552447351,
"grad_norm": 4.4014787673950195,
"learning_rate": 1.0619745051995472e-05,
"loss": 1.608,
"step": 1259
},
{
"epoch": 0.07259946414681225,
"grad_norm": 5.42172908782959,
"learning_rate": 1.0558948136513535e-05,
"loss": 1.6292,
"step": 1260
},
{
"epoch": 0.07265708276915099,
"grad_norm": 5.181097030639648,
"learning_rate": 1.0498305197319115e-05,
"loss": 1.5061,
"step": 1261
},
{
"epoch": 0.07271470139148974,
"grad_norm": 5.174496173858643,
"learning_rate": 1.0437816471159789e-05,
"loss": 1.799,
"step": 1262
},
{
"epoch": 0.07277232001382847,
"grad_norm": 5.354062557220459,
"learning_rate": 1.0377482194181132e-05,
"loss": 1.7505,
"step": 1263
},
{
"epoch": 0.07282993863616721,
"grad_norm": 4.44288969039917,
"learning_rate": 1.0317302601925743e-05,
"loss": 1.3323,
"step": 1264
},
{
"epoch": 0.07288755725850594,
"grad_norm": 5.833633899688721,
"learning_rate": 1.0257277929332332e-05,
"loss": 1.555,
"step": 1265
},
{
"epoch": 0.07294517588084469,
"grad_norm": 5.348058700561523,
"learning_rate": 1.0197408410734838e-05,
"loss": 1.7067,
"step": 1266
},
{
"epoch": 0.07300279450318344,
"grad_norm": 6.784201622009277,
"learning_rate": 1.0137694279861454e-05,
"loss": 1.5719,
"step": 1267
},
{
"epoch": 0.07306041312552217,
"grad_norm": 8.142943382263184,
"learning_rate": 1.0078135769833758e-05,
"loss": 2.0864,
"step": 1268
},
{
"epoch": 0.07311803174786091,
"grad_norm": 8.11693286895752,
"learning_rate": 1.0018733113165773e-05,
"loss": 2.5427,
"step": 1269
},
{
"epoch": 0.07317565037019964,
"grad_norm": 8.465521812438965,
"learning_rate": 9.959486541763119e-06,
"loss": 2.0763,
"step": 1270
},
{
"epoch": 0.07323326899253839,
"grad_norm": 8.063743591308594,
"learning_rate": 9.900396286922026e-06,
"loss": 1.742,
"step": 1271
},
{
"epoch": 0.07329088761487713,
"grad_norm": 8.543492317199707,
"learning_rate": 9.841462579328486e-06,
"loss": 1.9328,
"step": 1272
},
{
"epoch": 0.07334850623721587,
"grad_norm": 6.320174694061279,
"learning_rate": 9.782685649057333e-06,
"loss": 1.604,
"step": 1273
},
{
"epoch": 0.07340612485955461,
"grad_norm": 7.231295108795166,
"learning_rate": 9.72406572557133e-06,
"loss": 1.4475,
"step": 1274
},
{
"epoch": 0.07346374348189334,
"grad_norm": 6.566737174987793,
"learning_rate": 9.66560303772035e-06,
"loss": 1.4634,
"step": 1275
},
{
"epoch": 0.07352136210423209,
"grad_norm": 5.912583351135254,
"learning_rate": 9.607297813740362e-06,
"loss": 1.1079,
"step": 1276
},
{
"epoch": 0.07357898072657083,
"grad_norm": 8.524259567260742,
"learning_rate": 9.549150281252633e-06,
"loss": 1.7306,
"step": 1277
},
{
"epoch": 0.07363659934890956,
"grad_norm": 7.437958717346191,
"learning_rate": 9.491160667262783e-06,
"loss": 1.8297,
"step": 1278
},
{
"epoch": 0.07369421797124831,
"grad_norm": 6.986851692199707,
"learning_rate": 9.433329198159974e-06,
"loss": 1.3439,
"step": 1279
},
{
"epoch": 0.07375183659358704,
"grad_norm": 7.04397439956665,
"learning_rate": 9.375656099715934e-06,
"loss": 1.4192,
"step": 1280
},
{
"epoch": 0.07380945521592579,
"grad_norm": 6.16534423828125,
"learning_rate": 9.31814159708413e-06,
"loss": 1.01,
"step": 1281
},
{
"epoch": 0.07386707383826453,
"grad_norm": 7.240321159362793,
"learning_rate": 9.26078591479887e-06,
"loss": 1.2074,
"step": 1282
},
{
"epoch": 0.07392469246060326,
"grad_norm": 7.419429302215576,
"learning_rate": 9.203589276774439e-06,
"loss": 1.5331,
"step": 1283
},
{
"epoch": 0.07398231108294201,
"grad_norm": 6.982340335845947,
"learning_rate": 9.146551906304241e-06,
"loss": 0.9526,
"step": 1284
},
{
"epoch": 0.07403992970528074,
"grad_norm": 7.959641933441162,
"learning_rate": 9.08967402605988e-06,
"loss": 1.0421,
"step": 1285
},
{
"epoch": 0.07409754832761949,
"grad_norm": 10.079961776733398,
"learning_rate": 9.03295585809032e-06,
"loss": 1.605,
"step": 1286
},
{
"epoch": 0.07415516694995823,
"grad_norm": 8.764575004577637,
"learning_rate": 8.976397623821003e-06,
"loss": 1.2824,
"step": 1287
},
{
"epoch": 0.07421278557229696,
"grad_norm": 11.325631141662598,
"learning_rate": 8.919999544053036e-06,
"loss": 2.0689,
"step": 1288
},
{
"epoch": 0.07427040419463571,
"grad_norm": 10.685650825500488,
"learning_rate": 8.86376183896226e-06,
"loss": 1.2928,
"step": 1289
},
{
"epoch": 0.07432802281697444,
"grad_norm": 9.600011825561523,
"learning_rate": 8.80768472809842e-06,
"loss": 1.1182,
"step": 1290
},
{
"epoch": 0.07438564143931319,
"grad_norm": 10.983346939086914,
"learning_rate": 8.751768430384305e-06,
"loss": 1.8124,
"step": 1291
},
{
"epoch": 0.07444326006165193,
"grad_norm": 7.787347793579102,
"learning_rate": 8.696013164114902e-06,
"loss": 0.9067,
"step": 1292
},
{
"epoch": 0.07450087868399066,
"grad_norm": 8.734658241271973,
"learning_rate": 8.640419146956557e-06,
"loss": 1.0239,
"step": 1293
},
{
"epoch": 0.07455849730632941,
"grad_norm": 9.973542213439941,
"learning_rate": 8.584986595946071e-06,
"loss": 1.475,
"step": 1294
},
{
"epoch": 0.07461611592866814,
"grad_norm": 10.29810619354248,
"learning_rate": 8.529715727489912e-06,
"loss": 1.3585,
"step": 1295
},
{
"epoch": 0.07467373455100688,
"grad_norm": 12.083988189697266,
"learning_rate": 8.474606757363334e-06,
"loss": 0.9257,
"step": 1296
},
{
"epoch": 0.07473135317334563,
"grad_norm": 12.64985466003418,
"learning_rate": 8.419659900709536e-06,
"loss": 1.8098,
"step": 1297
},
{
"epoch": 0.07478897179568436,
"grad_norm": 11.525604248046875,
"learning_rate": 8.364875372038876e-06,
"loss": 1.9413,
"step": 1298
},
{
"epoch": 0.0748465904180231,
"grad_norm": 10.950992584228516,
"learning_rate": 8.310253385227946e-06,
"loss": 0.9996,
"step": 1299
},
{
"epoch": 0.07490420904036184,
"grad_norm": 14.034361839294434,
"learning_rate": 8.255794153518798e-06,
"loss": 1.1626,
"step": 1300
},
{
"epoch": 0.07496182766270058,
"grad_norm": 2.737600326538086,
"learning_rate": 8.201497889518073e-06,
"loss": 1.761,
"step": 1301
},
{
"epoch": 0.07501944628503933,
"grad_norm": 3.1791279315948486,
"learning_rate": 8.147364805196239e-06,
"loss": 1.5846,
"step": 1302
},
{
"epoch": 0.07507706490737806,
"grad_norm": 3.298542022705078,
"learning_rate": 8.093395111886687e-06,
"loss": 1.5926,
"step": 1303
},
{
"epoch": 0.0751346835297168,
"grad_norm": 3.450136661529541,
"learning_rate": 8.039589020284926e-06,
"loss": 1.4735,
"step": 1304
},
{
"epoch": 0.07519230215205555,
"grad_norm": 3.677147626876831,
"learning_rate": 7.985946740447791e-06,
"loss": 1.347,
"step": 1305
},
{
"epoch": 0.07524992077439428,
"grad_norm": 3.89573335647583,
"learning_rate": 7.932468481792582e-06,
"loss": 1.4001,
"step": 1306
},
{
"epoch": 0.07530753939673303,
"grad_norm": 4.762203216552734,
"learning_rate": 7.879154453096304e-06,
"loss": 1.5896,
"step": 1307
},
{
"epoch": 0.07536515801907176,
"grad_norm": 4.7118024826049805,
"learning_rate": 7.826004862494773e-06,
"loss": 1.7291,
"step": 1308
},
{
"epoch": 0.0754227766414105,
"grad_norm": 4.669412612915039,
"learning_rate": 7.773019917481872e-06,
"loss": 1.4086,
"step": 1309
},
{
"epoch": 0.07548039526374925,
"grad_norm": 6.52652645111084,
"learning_rate": 7.720199824908692e-06,
"loss": 2.7607,
"step": 1310
},
{
"epoch": 0.07553801388608798,
"grad_norm": 5.618553161621094,
"learning_rate": 7.667544790982778e-06,
"loss": 1.7376,
"step": 1311
},
{
"epoch": 0.07559563250842673,
"grad_norm": 5.568121910095215,
"learning_rate": 7.615055021267265e-06,
"loss": 1.4153,
"step": 1312
},
{
"epoch": 0.07565325113076546,
"grad_norm": 6.0162787437438965,
"learning_rate": 7.562730720680112e-06,
"loss": 1.5431,
"step": 1313
},
{
"epoch": 0.0757108697531042,
"grad_norm": 7.300206184387207,
"learning_rate": 7.510572093493295e-06,
"loss": 2.0336,
"step": 1314
},
{
"epoch": 0.07576848837544295,
"grad_norm": 7.871939659118652,
"learning_rate": 7.458579343331995e-06,
"loss": 1.9898,
"step": 1315
},
{
"epoch": 0.07582610699778168,
"grad_norm": 7.020273208618164,
"learning_rate": 7.40675267317385e-06,
"loss": 1.3814,
"step": 1316
},
{
"epoch": 0.07588372562012043,
"grad_norm": 8.185589790344238,
"learning_rate": 7.3550922853480915e-06,
"loss": 1.5838,
"step": 1317
},
{
"epoch": 0.07594134424245916,
"grad_norm": 7.924199104309082,
"learning_rate": 7.30359838153481e-06,
"loss": 1.6532,
"step": 1318
},
{
"epoch": 0.0759989628647979,
"grad_norm": 8.18741226196289,
"learning_rate": 7.252271162764129e-06,
"loss": 2.0839,
"step": 1319
},
{
"epoch": 0.07605658148713665,
"grad_norm": 7.6883697509765625,
"learning_rate": 7.2011108294154804e-06,
"loss": 1.6609,
"step": 1320
},
{
"epoch": 0.07611420010947538,
"grad_norm": 7.047170639038086,
"learning_rate": 7.150117581216748e-06,
"loss": 1.6234,
"step": 1321
},
{
"epoch": 0.07617181873181413,
"grad_norm": 4.978269577026367,
"learning_rate": 7.099291617243526e-06,
"loss": 0.6277,
"step": 1322
},
{
"epoch": 0.07622943735415286,
"grad_norm": 6.817296981811523,
"learning_rate": 7.048633135918347e-06,
"loss": 1.7294,
"step": 1323
},
{
"epoch": 0.0762870559764916,
"grad_norm": 6.551976680755615,
"learning_rate": 6.998142335009883e-06,
"loss": 1.5629,
"step": 1324
},
{
"epoch": 0.07634467459883035,
"grad_norm": 7.358117580413818,
"learning_rate": 6.947819411632223e-06,
"loss": 1.6529,
"step": 1325
},
{
"epoch": 0.07640229322116908,
"grad_norm": 6.99992036819458,
"learning_rate": 6.897664562244027e-06,
"loss": 1.0696,
"step": 1326
},
{
"epoch": 0.07645991184350782,
"grad_norm": 9.310277938842773,
"learning_rate": 6.8476779826478265e-06,
"loss": 2.0143,
"step": 1327
},
{
"epoch": 0.07651753046584656,
"grad_norm": 8.040164947509766,
"learning_rate": 6.797859867989226e-06,
"loss": 1.2877,
"step": 1328
},
{
"epoch": 0.0765751490881853,
"grad_norm": 8.650226593017578,
"learning_rate": 6.748210412756134e-06,
"loss": 1.5993,
"step": 1329
},
{
"epoch": 0.07663276771052405,
"grad_norm": 7.13109016418457,
"learning_rate": 6.698729810778065e-06,
"loss": 1.3774,
"step": 1330
},
{
"epoch": 0.07669038633286278,
"grad_norm": 6.740497589111328,
"learning_rate": 6.649418255225298e-06,
"loss": 1.1087,
"step": 1331
},
{
"epoch": 0.07674800495520152,
"grad_norm": 7.251396656036377,
"learning_rate": 6.600275938608164e-06,
"loss": 1.5068,
"step": 1332
},
{
"epoch": 0.07680562357754026,
"grad_norm": 8.886542320251465,
"learning_rate": 6.551303052776292e-06,
"loss": 1.4498,
"step": 1333
},
{
"epoch": 0.076863242199879,
"grad_norm": 9.089426040649414,
"learning_rate": 6.502499788917893e-06,
"loss": 1.6564,
"step": 1334
},
{
"epoch": 0.07692086082221775,
"grad_norm": 9.233247756958008,
"learning_rate": 6.45386633755894e-06,
"loss": 1.2177,
"step": 1335
},
{
"epoch": 0.07697847944455648,
"grad_norm": 9.834328651428223,
"learning_rate": 6.405402888562484e-06,
"loss": 1.6067,
"step": 1336
},
{
"epoch": 0.07703609806689522,
"grad_norm": 8.686698913574219,
"learning_rate": 6.357109631127889e-06,
"loss": 1.3768,
"step": 1337
},
{
"epoch": 0.07709371668923395,
"grad_norm": 9.639391899108887,
"learning_rate": 6.308986753790081e-06,
"loss": 1.3579,
"step": 1338
},
{
"epoch": 0.0771513353115727,
"grad_norm": 9.124120712280273,
"learning_rate": 6.261034444418879e-06,
"loss": 1.6292,
"step": 1339
},
{
"epoch": 0.07720895393391145,
"grad_norm": 9.01993465423584,
"learning_rate": 6.213252890218163e-06,
"loss": 1.2972,
"step": 1340
},
{
"epoch": 0.07726657255625018,
"grad_norm": 9.506366729736328,
"learning_rate": 6.165642277725203e-06,
"loss": 1.4636,
"step": 1341
},
{
"epoch": 0.07732419117858892,
"grad_norm": 10.157084465026855,
"learning_rate": 6.118202792809924e-06,
"loss": 1.5999,
"step": 1342
},
{
"epoch": 0.07738180980092765,
"grad_norm": 9.804509162902832,
"learning_rate": 6.07093462067419e-06,
"loss": 1.3275,
"step": 1343
},
{
"epoch": 0.0774394284232664,
"grad_norm": 11.456414222717285,
"learning_rate": 6.023837945851041e-06,
"loss": 1.8872,
"step": 1344
},
{
"epoch": 0.07749704704560514,
"grad_norm": 7.661109447479248,
"learning_rate": 5.976912952204017e-06,
"loss": 1.0861,
"step": 1345
},
{
"epoch": 0.07755466566794388,
"grad_norm": 10.093178749084473,
"learning_rate": 5.9301598229264064e-06,
"loss": 1.2754,
"step": 1346
},
{
"epoch": 0.07761228429028262,
"grad_norm": 13.415855407714844,
"learning_rate": 5.883578740540546e-06,
"loss": 1.4099,
"step": 1347
},
{
"epoch": 0.07766990291262135,
"grad_norm": 14.411483764648438,
"learning_rate": 5.837169886897143e-06,
"loss": 1.3181,
"step": 1348
},
{
"epoch": 0.0777275215349601,
"grad_norm": 12.322956085205078,
"learning_rate": 5.79093344317449e-06,
"loss": 0.9336,
"step": 1349
},
{
"epoch": 0.07778514015729884,
"grad_norm": 9.327947616577148,
"learning_rate": 5.74486958987781e-06,
"loss": 0.7117,
"step": 1350
},
{
"epoch": 0.07784275877963757,
"grad_norm": 2.5703728199005127,
"learning_rate": 5.698978506838532e-06,
"loss": 1.5084,
"step": 1351
},
{
"epoch": 0.07790037740197632,
"grad_norm": 3.1344716548919678,
"learning_rate": 5.653260373213632e-06,
"loss": 1.5168,
"step": 1352
},
{
"epoch": 0.07795799602431507,
"grad_norm": 3.3435142040252686,
"learning_rate": 5.607715367484861e-06,
"loss": 1.6171,
"step": 1353
},
{
"epoch": 0.0780156146466538,
"grad_norm": 3.791179895401001,
"learning_rate": 5.562343667458098e-06,
"loss": 1.7181,
"step": 1354
},
{
"epoch": 0.07807323326899254,
"grad_norm": 3.7208218574523926,
"learning_rate": 5.51714545026264e-06,
"loss": 1.2988,
"step": 1355
},
{
"epoch": 0.07813085189133127,
"grad_norm": 4.257368564605713,
"learning_rate": 5.472120892350513e-06,
"loss": 1.5394,
"step": 1356
},
{
"epoch": 0.07818847051367002,
"grad_norm": 4.81605339050293,
"learning_rate": 5.4272701694958076e-06,
"loss": 1.8318,
"step": 1357
},
{
"epoch": 0.07824608913600876,
"grad_norm": 4.604612827301025,
"learning_rate": 5.382593456793933e-06,
"loss": 1.5295,
"step": 1358
},
{
"epoch": 0.0783037077583475,
"grad_norm": 5.25562047958374,
"learning_rate": 5.338090928660999e-06,
"loss": 1.814,
"step": 1359
},
{
"epoch": 0.07836132638068624,
"grad_norm": 4.781506538391113,
"learning_rate": 5.293762758833071e-06,
"loss": 1.4253,
"step": 1360
},
{
"epoch": 0.07841894500302497,
"grad_norm": 5.485918045043945,
"learning_rate": 5.249609120365578e-06,
"loss": 1.5135,
"step": 1361
},
{
"epoch": 0.07847656362536372,
"grad_norm": 5.671662330627441,
"learning_rate": 5.205630185632548e-06,
"loss": 1.6113,
"step": 1362
},
{
"epoch": 0.07853418224770246,
"grad_norm": 6.093698501586914,
"learning_rate": 5.16182612632598e-06,
"loss": 1.9681,
"step": 1363
},
{
"epoch": 0.0785918008700412,
"grad_norm": 6.609533309936523,
"learning_rate": 5.1181971134551646e-06,
"loss": 1.2897,
"step": 1364
},
{
"epoch": 0.07864941949237994,
"grad_norm": 5.9721150398254395,
"learning_rate": 5.074743317346009e-06,
"loss": 1.3921,
"step": 1365
},
{
"epoch": 0.07870703811471867,
"grad_norm": 7.347040176391602,
"learning_rate": 5.031464907640421e-06,
"loss": 1.6847,
"step": 1366
},
{
"epoch": 0.07876465673705742,
"grad_norm": 7.19246244430542,
"learning_rate": 4.988362053295564e-06,
"loss": 1.771,
"step": 1367
},
{
"epoch": 0.07882227535939616,
"grad_norm": 7.701037883758545,
"learning_rate": 4.945434922583259e-06,
"loss": 1.8111,
"step": 1368
},
{
"epoch": 0.0788798939817349,
"grad_norm": 7.5752387046813965,
"learning_rate": 4.902683683089304e-06,
"loss": 1.6451,
"step": 1369
},
{
"epoch": 0.07893751260407364,
"grad_norm": 8.2559175491333,
"learning_rate": 4.860108501712824e-06,
"loss": 1.8287,
"step": 1370
},
{
"epoch": 0.07899513122641237,
"grad_norm": 7.228907108306885,
"learning_rate": 4.817709544665628e-06,
"loss": 1.3888,
"step": 1371
},
{
"epoch": 0.07905274984875112,
"grad_norm": 8.455028533935547,
"learning_rate": 4.775486977471549e-06,
"loss": 2.2663,
"step": 1372
},
{
"epoch": 0.07911036847108986,
"grad_norm": 8.277305603027344,
"learning_rate": 4.733440964965791e-06,
"loss": 2.1231,
"step": 1373
},
{
"epoch": 0.0791679870934286,
"grad_norm": 6.262785911560059,
"learning_rate": 4.691571671294298e-06,
"loss": 1.1385,
"step": 1374
},
{
"epoch": 0.07922560571576734,
"grad_norm": 7.3416218757629395,
"learning_rate": 4.649879259913137e-06,
"loss": 1.2804,
"step": 1375
},
{
"epoch": 0.07928322433810607,
"grad_norm": 6.713423252105713,
"learning_rate": 4.608363893587803e-06,
"loss": 1.4359,
"step": 1376
},
{
"epoch": 0.07934084296044482,
"grad_norm": 4.854036331176758,
"learning_rate": 4.567025734392622e-06,
"loss": 0.7511,
"step": 1377
},
{
"epoch": 0.07939846158278356,
"grad_norm": 7.6707305908203125,
"learning_rate": 4.525864943710112e-06,
"loss": 1.658,
"step": 1378
},
{
"epoch": 0.07945608020512229,
"grad_norm": 8.423785209655762,
"learning_rate": 4.484881682230341e-06,
"loss": 1.394,
"step": 1379
},
{
"epoch": 0.07951369882746104,
"grad_norm": 7.887045860290527,
"learning_rate": 4.4440761099503455e-06,
"loss": 1.4721,
"step": 1380
},
{
"epoch": 0.07957131744979977,
"grad_norm": 7.276421070098877,
"learning_rate": 4.403448386173437e-06,
"loss": 1.5326,
"step": 1381
},
{
"epoch": 0.07962893607213851,
"grad_norm": 6.97625732421875,
"learning_rate": 4.362998669508617e-06,
"loss": 0.8795,
"step": 1382
},
{
"epoch": 0.07968655469447726,
"grad_norm": 8.794151306152344,
"learning_rate": 4.322727117869951e-06,
"loss": 1.7497,
"step": 1383
},
{
"epoch": 0.07974417331681599,
"grad_norm": 7.412423610687256,
"learning_rate": 4.28263388847599e-06,
"loss": 1.1107,
"step": 1384
},
{
"epoch": 0.07980179193915474,
"grad_norm": 8.788695335388184,
"learning_rate": 4.242719137849077e-06,
"loss": 1.882,
"step": 1385
},
{
"epoch": 0.07985941056149347,
"grad_norm": 8.702564239501953,
"learning_rate": 4.2029830218148105e-06,
"loss": 1.5547,
"step": 1386
},
{
"epoch": 0.07991702918383221,
"grad_norm": 9.078214645385742,
"learning_rate": 4.163425695501388e-06,
"loss": 1.483,
"step": 1387
},
{
"epoch": 0.07997464780617096,
"grad_norm": 7.934226036071777,
"learning_rate": 4.124047313339025e-06,
"loss": 0.7694,
"step": 1388
},
{
"epoch": 0.08003226642850969,
"grad_norm": 9.265207290649414,
"learning_rate": 4.0848480290593625e-06,
"loss": 1.3706,
"step": 1389
},
{
"epoch": 0.08008988505084844,
"grad_norm": 9.881461143493652,
"learning_rate": 4.045827995694834e-06,
"loss": 1.5904,
"step": 1390
},
{
"epoch": 0.08014750367318717,
"grad_norm": 12.029980659484863,
"learning_rate": 4.00698736557808e-06,
"loss": 1.5482,
"step": 1391
},
{
"epoch": 0.08020512229552591,
"grad_norm": 9.23487663269043,
"learning_rate": 3.968326290341362e-06,
"loss": 1.1902,
"step": 1392
},
{
"epoch": 0.08026274091786466,
"grad_norm": 10.065555572509766,
"learning_rate": 3.929844920915987e-06,
"loss": 1.3536,
"step": 1393
},
{
"epoch": 0.08032035954020339,
"grad_norm": 10.576550483703613,
"learning_rate": 3.891543407531673e-06,
"loss": 1.2158,
"step": 1394
},
{
"epoch": 0.08037797816254214,
"grad_norm": 9.110869407653809,
"learning_rate": 3.853421899715992e-06,
"loss": 1.0718,
"step": 1395
},
{
"epoch": 0.08043559678488087,
"grad_norm": 12.036208152770996,
"learning_rate": 3.815480546293787e-06,
"loss": 1.3889,
"step": 1396
},
{
"epoch": 0.08049321540721961,
"grad_norm": 14.209784507751465,
"learning_rate": 3.7777194953865667e-06,
"loss": 1.3228,
"step": 1397
},
{
"epoch": 0.08055083402955836,
"grad_norm": 10.392521858215332,
"learning_rate": 3.740138894411993e-06,
"loss": 1.0992,
"step": 1398
},
{
"epoch": 0.08060845265189709,
"grad_norm": 11.944876670837402,
"learning_rate": 3.702738890083207e-06,
"loss": 0.9713,
"step": 1399
},
{
"epoch": 0.08066607127423583,
"grad_norm": 15.17037582397461,
"learning_rate": 3.6655196284083317e-06,
"loss": 0.9288,
"step": 1400
},
{
"epoch": 0.08072368989657458,
"grad_norm": 2.623278856277466,
"learning_rate": 3.628481254689875e-06,
"loss": 1.3525,
"step": 1401
},
{
"epoch": 0.08078130851891331,
"grad_norm": 3.0454962253570557,
"learning_rate": 3.5916239135241612e-06,
"loss": 1.4847,
"step": 1402
},
{
"epoch": 0.08083892714125206,
"grad_norm": 3.5481057167053223,
"learning_rate": 3.5549477488007854e-06,
"loss": 1.3441,
"step": 1403
},
{
"epoch": 0.08089654576359079,
"grad_norm": 3.8285813331604004,
"learning_rate": 3.5184529037020186e-06,
"loss": 1.4872,
"step": 1404
},
{
"epoch": 0.08095416438592953,
"grad_norm": 4.4077229499816895,
"learning_rate": 3.4821395207022766e-06,
"loss": 1.8172,
"step": 1405
},
{
"epoch": 0.08101178300826828,
"grad_norm": 4.109917163848877,
"learning_rate": 3.4460077415675474e-06,
"loss": 1.3182,
"step": 1406
},
{
"epoch": 0.08106940163060701,
"grad_norm": 4.91995906829834,
"learning_rate": 3.4100577073548634e-06,
"loss": 1.3844,
"step": 1407
},
{
"epoch": 0.08112702025294576,
"grad_norm": 4.943761825561523,
"learning_rate": 3.3742895584117085e-06,
"loss": 1.587,
"step": 1408
},
{
"epoch": 0.08118463887528449,
"grad_norm": 5.181413173675537,
"learning_rate": 3.3387034343755065e-06,
"loss": 2.059,
"step": 1409
},
{
"epoch": 0.08124225749762323,
"grad_norm": 4.918468475341797,
"learning_rate": 3.303299474173066e-06,
"loss": 1.7195,
"step": 1410
},
{
"epoch": 0.08129987611996198,
"grad_norm": 7.011109828948975,
"learning_rate": 3.2680778160200155e-06,
"loss": 2.3435,
"step": 1411
},
{
"epoch": 0.08135749474230071,
"grad_norm": 7.465047836303711,
"learning_rate": 3.233038597420318e-06,
"loss": 2.3545,
"step": 1412
},
{
"epoch": 0.08141511336463945,
"grad_norm": 6.575809001922607,
"learning_rate": 3.198181955165669e-06,
"loss": 2.1609,
"step": 1413
},
{
"epoch": 0.08147273198697819,
"grad_norm": 6.265475273132324,
"learning_rate": 3.1635080253350046e-06,
"loss": 1.6717,
"step": 1414
},
{
"epoch": 0.08153035060931693,
"grad_norm": 6.190795421600342,
"learning_rate": 3.1290169432939553e-06,
"loss": 1.7938,
"step": 1415
},
{
"epoch": 0.08158796923165568,
"grad_norm": 7.629157066345215,
"learning_rate": 3.0947088436943326e-06,
"loss": 2.2331,
"step": 1416
},
{
"epoch": 0.08164558785399441,
"grad_norm": 8.784008026123047,
"learning_rate": 3.060583860473587e-06,
"loss": 1.9761,
"step": 1417
},
{
"epoch": 0.08170320647633315,
"grad_norm": 8.13729476928711,
"learning_rate": 3.0266421268542735e-06,
"loss": 1.9917,
"step": 1418
},
{
"epoch": 0.08176082509867189,
"grad_norm": 9.353556632995605,
"learning_rate": 2.9928837753435746e-06,
"loss": 2.3454,
"step": 1419
},
{
"epoch": 0.08181844372101063,
"grad_norm": 7.279959678649902,
"learning_rate": 2.9593089377327245e-06,
"loss": 1.5294,
"step": 1420
},
{
"epoch": 0.08187606234334938,
"grad_norm": 6.368862628936768,
"learning_rate": 2.9259177450965682e-06,
"loss": 1.2804,
"step": 1421
},
{
"epoch": 0.08193368096568811,
"grad_norm": 5.7955451011657715,
"learning_rate": 2.8927103277929746e-06,
"loss": 1.1459,
"step": 1422
},
{
"epoch": 0.08199129958802685,
"grad_norm": 8.391079902648926,
"learning_rate": 2.8596868154623703e-06,
"loss": 1.4252,
"step": 1423
},
{
"epoch": 0.08204891821036558,
"grad_norm": 6.314506530761719,
"learning_rate": 2.826847337027222e-06,
"loss": 1.2634,
"step": 1424
},
{
"epoch": 0.08210653683270433,
"grad_norm": 7.440786361694336,
"learning_rate": 2.794192020691544e-06,
"loss": 1.4699,
"step": 1425
},
{
"epoch": 0.08216415545504308,
"grad_norm": 8.44675350189209,
"learning_rate": 2.7617209939403866e-06,
"loss": 1.736,
"step": 1426
},
{
"epoch": 0.0822217740773818,
"grad_norm": 8.489884376525879,
"learning_rate": 2.7294343835393368e-06,
"loss": 1.4334,
"step": 1427
},
{
"epoch": 0.08227939269972055,
"grad_norm": 6.695461750030518,
"learning_rate": 2.6973323155340234e-06,
"loss": 1.3815,
"step": 1428
},
{
"epoch": 0.08233701132205928,
"grad_norm": 7.661706447601318,
"learning_rate": 2.665414915249631e-06,
"loss": 1.2681,
"step": 1429
},
{
"epoch": 0.08239462994439803,
"grad_norm": 7.512377738952637,
"learning_rate": 2.6336823072904304e-06,
"loss": 1.4611,
"step": 1430
},
{
"epoch": 0.08245224856673677,
"grad_norm": 7.898604393005371,
"learning_rate": 2.6021346155392423e-06,
"loss": 1.3854,
"step": 1431
},
{
"epoch": 0.0825098671890755,
"grad_norm": 7.8414483070373535,
"learning_rate": 2.570771963156987e-06,
"loss": 1.167,
"step": 1432
},
{
"epoch": 0.08256748581141425,
"grad_norm": 8.268306732177734,
"learning_rate": 2.539594472582213e-06,
"loss": 1.5366,
"step": 1433
},
{
"epoch": 0.08262510443375298,
"grad_norm": 7.2767157554626465,
"learning_rate": 2.50860226553058e-06,
"loss": 1.0255,
"step": 1434
},
{
"epoch": 0.08268272305609173,
"grad_norm": 10.800278663635254,
"learning_rate": 2.4777954629944477e-06,
"loss": 1.574,
"step": 1435
},
{
"epoch": 0.08274034167843047,
"grad_norm": 9.119339942932129,
"learning_rate": 2.4471741852423237e-06,
"loss": 1.508,
"step": 1436
},
{
"epoch": 0.0827979603007692,
"grad_norm": 9.476216316223145,
"learning_rate": 2.416738551818454e-06,
"loss": 1.2083,
"step": 1437
},
{
"epoch": 0.08285557892310795,
"grad_norm": 10.253376007080078,
"learning_rate": 2.386488681542326e-06,
"loss": 1.2887,
"step": 1438
},
{
"epoch": 0.08291319754544668,
"grad_norm": 9.97877025604248,
"learning_rate": 2.3564246925082357e-06,
"loss": 1.4069,
"step": 1439
},
{
"epoch": 0.08297081616778543,
"grad_norm": 8.539217948913574,
"learning_rate": 2.3265467020847866e-06,
"loss": 0.9612,
"step": 1440
},
{
"epoch": 0.08302843479012417,
"grad_norm": 9.88011360168457,
"learning_rate": 2.2968548269144574e-06,
"loss": 1.3134,
"step": 1441
},
{
"epoch": 0.0830860534124629,
"grad_norm": 10.596741676330566,
"learning_rate": 2.2673491829131364e-06,
"loss": 1.3288,
"step": 1442
},
{
"epoch": 0.08314367203480165,
"grad_norm": 11.159172058105469,
"learning_rate": 2.238029885269677e-06,
"loss": 1.2029,
"step": 1443
},
{
"epoch": 0.08320129065714038,
"grad_norm": 7.516232490539551,
"learning_rate": 2.2088970484454517e-06,
"loss": 0.6245,
"step": 1444
},
{
"epoch": 0.08325890927947913,
"grad_norm": 11.081012725830078,
"learning_rate": 2.179950786173879e-06,
"loss": 1.3358,
"step": 1445
},
{
"epoch": 0.08331652790181787,
"grad_norm": 10.059147834777832,
"learning_rate": 2.1511912114600188e-06,
"loss": 0.7193,
"step": 1446
},
{
"epoch": 0.0833741465241566,
"grad_norm": 16.58409309387207,
"learning_rate": 2.122618436580082e-06,
"loss": 1.741,
"step": 1447
},
{
"epoch": 0.08343176514649535,
"grad_norm": 8.316960334777832,
"learning_rate": 2.0942325730810565e-06,
"loss": 0.5042,
"step": 1448
},
{
"epoch": 0.0834893837688341,
"grad_norm": 12.206513404846191,
"learning_rate": 2.066033731780209e-06,
"loss": 0.667,
"step": 1449
},
{
"epoch": 0.08354700239117283,
"grad_norm": 11.896441459655762,
"learning_rate": 2.038022022764685e-06,
"loss": 1.025,
"step": 1450
},
{
"epoch": 0.08360462101351157,
"grad_norm": 2.629281759262085,
"learning_rate": 2.01019755539108e-06,
"loss": 1.3759,
"step": 1451
},
{
"epoch": 0.0836622396358503,
"grad_norm": 2.7282769680023193,
"learning_rate": 1.9825604382849916e-06,
"loss": 0.9396,
"step": 1452
},
{
"epoch": 0.08371985825818905,
"grad_norm": 2.902930498123169,
"learning_rate": 1.9551107793406355e-06,
"loss": 1.092,
"step": 1453
},
{
"epoch": 0.0837774768805278,
"grad_norm": 3.3719232082366943,
"learning_rate": 1.927848685720368e-06,
"loss": 1.508,
"step": 1454
},
{
"epoch": 0.08383509550286652,
"grad_norm": 3.994154930114746,
"learning_rate": 1.9007742638543102e-06,
"loss": 1.4944,
"step": 1455
},
{
"epoch": 0.08389271412520527,
"grad_norm": 3.5956053733825684,
"learning_rate": 1.8738876194399236e-06,
"loss": 1.4522,
"step": 1456
},
{
"epoch": 0.083950332747544,
"grad_norm": 4.752785682678223,
"learning_rate": 1.8471888574415951e-06,
"loss": 1.773,
"step": 1457
},
{
"epoch": 0.08400795136988275,
"grad_norm": 4.907639026641846,
"learning_rate": 1.820678082090227e-06,
"loss": 1.6043,
"step": 1458
},
{
"epoch": 0.08406556999222149,
"grad_norm": 5.568052291870117,
"learning_rate": 1.794355396882813e-06,
"loss": 1.6794,
"step": 1459
},
{
"epoch": 0.08412318861456022,
"grad_norm": 5.442253589630127,
"learning_rate": 1.7682209045820686e-06,
"loss": 1.6744,
"step": 1460
},
{
"epoch": 0.08418080723689897,
"grad_norm": 5.181339740753174,
"learning_rate": 1.7422747072160017e-06,
"loss": 1.4881,
"step": 1461
},
{
"epoch": 0.0842384258592377,
"grad_norm": 6.357906818389893,
"learning_rate": 1.7165169060775365e-06,
"loss": 2.0846,
"step": 1462
},
{
"epoch": 0.08429604448157645,
"grad_norm": 6.581882476806641,
"learning_rate": 1.6909476017240912e-06,
"loss": 1.8633,
"step": 1463
},
{
"epoch": 0.08435366310391519,
"grad_norm": 5.985304832458496,
"learning_rate": 1.665566893977205e-06,
"loss": 1.4172,
"step": 1464
},
{
"epoch": 0.08441128172625392,
"grad_norm": 5.509626388549805,
"learning_rate": 1.6403748819221466e-06,
"loss": 1.6604,
"step": 1465
},
{
"epoch": 0.08446890034859267,
"grad_norm": 6.375392913818359,
"learning_rate": 1.6153716639075222e-06,
"loss": 1.5872,
"step": 1466
},
{
"epoch": 0.0845265189709314,
"grad_norm": 7.897721290588379,
"learning_rate": 1.5905573375449012e-06,
"loss": 1.8599,
"step": 1467
},
{
"epoch": 0.08458413759327015,
"grad_norm": 7.933893203735352,
"learning_rate": 1.5659319997084089e-06,
"loss": 1.5581,
"step": 1468
},
{
"epoch": 0.08464175621560889,
"grad_norm": 7.137598037719727,
"learning_rate": 1.5414957465343882e-06,
"loss": 1.1623,
"step": 1469
},
{
"epoch": 0.08469937483794762,
"grad_norm": 7.819648742675781,
"learning_rate": 1.5172486734209789e-06,
"loss": 1.8684,
"step": 1470
},
{
"epoch": 0.08475699346028637,
"grad_norm": 9.98184585571289,
"learning_rate": 1.4931908750278e-06,
"loss": 2.351,
"step": 1471
},
{
"epoch": 0.0848146120826251,
"grad_norm": 12.534150123596191,
"learning_rate": 1.4693224452755282e-06,
"loss": 1.692,
"step": 1472
},
{
"epoch": 0.08487223070496384,
"grad_norm": 7.534951686859131,
"learning_rate": 1.4456434773455541e-06,
"loss": 1.7114,
"step": 1473
},
{
"epoch": 0.08492984932730259,
"grad_norm": 7.719982624053955,
"learning_rate": 1.4221540636796205e-06,
"loss": 1.985,
"step": 1474
},
{
"epoch": 0.08498746794964132,
"grad_norm": 6.58493185043335,
"learning_rate": 1.3988542959794627e-06,
"loss": 1.5008,
"step": 1475
},
{
"epoch": 0.08504508657198007,
"grad_norm": 6.852606296539307,
"learning_rate": 1.3757442652064357e-06,
"loss": 1.356,
"step": 1476
},
{
"epoch": 0.0851027051943188,
"grad_norm": 8.046666145324707,
"learning_rate": 1.3528240615811816e-06,
"loss": 1.6529,
"step": 1477
},
{
"epoch": 0.08516032381665754,
"grad_norm": 6.943844795227051,
"learning_rate": 1.3300937745832521e-06,
"loss": 1.2587,
"step": 1478
},
{
"epoch": 0.08521794243899629,
"grad_norm": 7.486252307891846,
"learning_rate": 1.3075534929507693e-06,
"loss": 1.2292,
"step": 1479
},
{
"epoch": 0.08527556106133502,
"grad_norm": 7.63762903213501,
"learning_rate": 1.2852033046801104e-06,
"loss": 1.5982,
"step": 1480
},
{
"epoch": 0.08533317968367377,
"grad_norm": 7.90939474105835,
"learning_rate": 1.2630432970255013e-06,
"loss": 1.2343,
"step": 1481
},
{
"epoch": 0.0853907983060125,
"grad_norm": 7.681085109710693,
"learning_rate": 1.2410735564987341e-06,
"loss": 1.2635,
"step": 1482
},
{
"epoch": 0.08544841692835124,
"grad_norm": 8.854255676269531,
"learning_rate": 1.2192941688687843e-06,
"loss": 1.4468,
"step": 1483
},
{
"epoch": 0.08550603555068999,
"grad_norm": 7.650599479675293,
"learning_rate": 1.1977052191615156e-06,
"loss": 1.4939,
"step": 1484
},
{
"epoch": 0.08556365417302872,
"grad_norm": 7.889368534088135,
"learning_rate": 1.1763067916593262e-06,
"loss": 1.0372,
"step": 1485
},
{
"epoch": 0.08562127279536746,
"grad_norm": 7.41569709777832,
"learning_rate": 1.1550989699008142e-06,
"loss": 0.9774,
"step": 1486
},
{
"epoch": 0.0856788914177062,
"grad_norm": 7.5624895095825195,
"learning_rate": 1.1340818366804729e-06,
"loss": 1.2269,
"step": 1487
},
{
"epoch": 0.08573651004004494,
"grad_norm": 7.56338357925415,
"learning_rate": 1.1132554740483415e-06,
"loss": 0.9766,
"step": 1488
},
{
"epoch": 0.08579412866238369,
"grad_norm": 9.721120834350586,
"learning_rate": 1.0926199633097157e-06,
"loss": 0.9299,
"step": 1489
},
{
"epoch": 0.08585174728472242,
"grad_norm": 10.775286674499512,
"learning_rate": 1.0721753850247984e-06,
"loss": 1.8108,
"step": 1490
},
{
"epoch": 0.08590936590706116,
"grad_norm": 8.620316505432129,
"learning_rate": 1.0519218190084056e-06,
"loss": 0.9155,
"step": 1491
},
{
"epoch": 0.0859669845293999,
"grad_norm": 10.515030860900879,
"learning_rate": 1.03185934432965e-06,
"loss": 1.5124,
"step": 1492
},
{
"epoch": 0.08602460315173864,
"grad_norm": 10.365598678588867,
"learning_rate": 1.0119880393116176e-06,
"loss": 1.0672,
"step": 1493
},
{
"epoch": 0.08608222177407739,
"grad_norm": 11.100306510925293,
"learning_rate": 9.923079815310988e-07,
"loss": 1.2601,
"step": 1494
},
{
"epoch": 0.08613984039641612,
"grad_norm": 13.2669038772583,
"learning_rate": 9.728192478182574e-07,
"loss": 1.7943,
"step": 1495
},
{
"epoch": 0.08619745901875486,
"grad_norm": 9.610386848449707,
"learning_rate": 9.535219142563168e-07,
"loss": 0.9692,
"step": 1496
},
{
"epoch": 0.08625507764109361,
"grad_norm": 8.165685653686523,
"learning_rate": 9.344160561812921e-07,
"loss": 0.6033,
"step": 1497
},
{
"epoch": 0.08631269626343234,
"grad_norm": 12.818842887878418,
"learning_rate": 9.155017481817075e-07,
"loss": 1.0877,
"step": 1498
},
{
"epoch": 0.08637031488577109,
"grad_norm": 11.861927032470703,
"learning_rate": 8.967790640982465e-07,
"loss": 1.1281,
"step": 1499
},
{
"epoch": 0.08642793350810982,
"grad_norm": 13.807953834533691,
"learning_rate": 8.782480770235247e-07,
"loss": 1.2611,
"step": 1500
},
{
"epoch": 0.08648555213044856,
"grad_norm": 2.5032715797424316,
"learning_rate": 8.599088593017723e-07,
"loss": 1.279,
"step": 1501
},
{
"epoch": 0.08654317075278731,
"grad_norm": 2.9346187114715576,
"learning_rate": 8.417614825285636e-07,
"loss": 1.2721,
"step": 1502
},
{
"epoch": 0.08660078937512604,
"grad_norm": 3.4006011486053467,
"learning_rate": 8.238060175505269e-07,
"loss": 1.4945,
"step": 1503
},
{
"epoch": 0.08665840799746478,
"grad_norm": 3.5172901153564453,
"learning_rate": 8.060425344650846e-07,
"loss": 1.4068,
"step": 1504
},
{
"epoch": 0.08671602661980352,
"grad_norm": 4.262792587280273,
"learning_rate": 7.884711026201585e-07,
"loss": 1.5875,
"step": 1505
},
{
"epoch": 0.08677364524214226,
"grad_norm": 4.958654880523682,
"learning_rate": 7.710917906139204e-07,
"loss": 1.7626,
"step": 1506
},
{
"epoch": 0.086831263864481,
"grad_norm": 5.197454452514648,
"learning_rate": 7.53904666294497e-07,
"loss": 1.6268,
"step": 1507
},
{
"epoch": 0.08688888248681974,
"grad_norm": 4.851822853088379,
"learning_rate": 7.369097967597493e-07,
"loss": 1.7482,
"step": 1508
},
{
"epoch": 0.08694650110915848,
"grad_norm": 4.628675937652588,
"learning_rate": 7.201072483569549e-07,
"loss": 1.4172,
"step": 1509
},
{
"epoch": 0.08700411973149721,
"grad_norm": 5.297687530517578,
"learning_rate": 7.034970866825974e-07,
"loss": 2.0734,
"step": 1510
},
{
"epoch": 0.08706173835383596,
"grad_norm": 5.987782955169678,
"learning_rate": 6.870793765820782e-07,
"loss": 1.4782,
"step": 1511
},
{
"epoch": 0.0871193569761747,
"grad_norm": 5.462611675262451,
"learning_rate": 6.708541821494829e-07,
"loss": 1.2892,
"step": 1512
},
{
"epoch": 0.08717697559851344,
"grad_norm": 6.447831630706787,
"learning_rate": 6.548215667273206e-07,
"loss": 1.9342,
"step": 1513
},
{
"epoch": 0.08723459422085218,
"grad_norm": 5.775749683380127,
"learning_rate": 6.389815929062848e-07,
"loss": 1.1052,
"step": 1514
},
{
"epoch": 0.08729221284319091,
"grad_norm": 7.711545467376709,
"learning_rate": 6.233343225249933e-07,
"loss": 2.4737,
"step": 1515
},
{
"epoch": 0.08734983146552966,
"grad_norm": 5.973711967468262,
"learning_rate": 6.078798166697541e-07,
"loss": 1.5227,
"step": 1516
},
{
"epoch": 0.0874074500878684,
"grad_norm": 7.564472675323486,
"learning_rate": 5.92618135674361e-07,
"loss": 2.2625,
"step": 1517
},
{
"epoch": 0.08746506871020714,
"grad_norm": 7.887570381164551,
"learning_rate": 5.775493391197928e-07,
"loss": 2.1692,
"step": 1518
},
{
"epoch": 0.08752268733254588,
"grad_norm": 7.080876350402832,
"learning_rate": 5.626734858340255e-07,
"loss": 1.786,
"step": 1519
},
{
"epoch": 0.08758030595488461,
"grad_norm": 8.339112281799316,
"learning_rate": 5.479906338917984e-07,
"loss": 1.7675,
"step": 1520
},
{
"epoch": 0.08763792457722336,
"grad_norm": 8.264595031738281,
"learning_rate": 5.335008406143815e-07,
"loss": 1.7349,
"step": 1521
},
{
"epoch": 0.0876955431995621,
"grad_norm": 7.565638542175293,
"learning_rate": 5.192041625693478e-07,
"loss": 1.9572,
"step": 1522
},
{
"epoch": 0.08775316182190084,
"grad_norm": 8.93736457824707,
"learning_rate": 5.051006555703453e-07,
"loss": 2.1832,
"step": 1523
},
{
"epoch": 0.08781078044423958,
"grad_norm": 6.717231750488281,
"learning_rate": 4.911903746769142e-07,
"loss": 1.4777,
"step": 1524
},
{
"epoch": 0.08786839906657831,
"grad_norm": 6.877925395965576,
"learning_rate": 4.774733741942206e-07,
"loss": 1.4908,
"step": 1525
},
{
"epoch": 0.08792601768891706,
"grad_norm": 7.157272815704346,
"learning_rate": 4.639497076728949e-07,
"loss": 1.1946,
"step": 1526
},
{
"epoch": 0.0879836363112558,
"grad_norm": 6.139166355133057,
"learning_rate": 4.5061942790879386e-07,
"loss": 0.9741,
"step": 1527
},
{
"epoch": 0.08804125493359453,
"grad_norm": 7.109872817993164,
"learning_rate": 4.3748258694278344e-07,
"loss": 1.0362,
"step": 1528
},
{
"epoch": 0.08809887355593328,
"grad_norm": 7.474804878234863,
"learning_rate": 4.2453923606057265e-07,
"loss": 1.8015,
"step": 1529
},
{
"epoch": 0.08815649217827201,
"grad_norm": 7.79063606262207,
"learning_rate": 4.1178942579248036e-07,
"loss": 1.317,
"step": 1530
},
{
"epoch": 0.08821411080061076,
"grad_norm": 8.327803611755371,
"learning_rate": 3.992332059132631e-07,
"loss": 1.7084,
"step": 1531
},
{
"epoch": 0.0882717294229495,
"grad_norm": 7.449394226074219,
"learning_rate": 3.86870625441893e-07,
"loss": 1.3077,
"step": 1532
},
{
"epoch": 0.08832934804528823,
"grad_norm": 8.847639083862305,
"learning_rate": 3.747017326413971e-07,
"loss": 1.5461,
"step": 1533
},
{
"epoch": 0.08838696666762698,
"grad_norm": 9.207019805908203,
"learning_rate": 3.627265750186348e-07,
"loss": 1.2763,
"step": 1534
},
{
"epoch": 0.08844458528996571,
"grad_norm": 9.685409545898438,
"learning_rate": 3.5094519932415417e-07,
"loss": 1.4493,
"step": 1535
},
{
"epoch": 0.08850220391230446,
"grad_norm": 7.347187042236328,
"learning_rate": 3.3935765155196366e-07,
"loss": 1.0995,
"step": 1536
},
{
"epoch": 0.0885598225346432,
"grad_norm": 8.703095436096191,
"learning_rate": 3.2796397693939385e-07,
"loss": 1.139,
"step": 1537
},
{
"epoch": 0.08861744115698193,
"grad_norm": 9.044896125793457,
"learning_rate": 3.167642199668863e-07,
"loss": 1.54,
"step": 1538
},
{
"epoch": 0.08867505977932068,
"grad_norm": 9.896312713623047,
"learning_rate": 3.0575842435785486e-07,
"loss": 1.4508,
"step": 1539
},
{
"epoch": 0.08873267840165941,
"grad_norm": 9.210082054138184,
"learning_rate": 2.9494663307847447e-07,
"loss": 1.2747,
"step": 1540
},
{
"epoch": 0.08879029702399815,
"grad_norm": 9.259896278381348,
"learning_rate": 2.843288883375539e-07,
"loss": 1.3469,
"step": 1541
},
{
"epoch": 0.0888479156463369,
"grad_norm": 11.880096435546875,
"learning_rate": 2.7390523158633554e-07,
"loss": 1.4855,
"step": 1542
},
{
"epoch": 0.08890553426867563,
"grad_norm": 8.968679428100586,
"learning_rate": 2.6367570351836237e-07,
"loss": 1.1652,
"step": 1543
},
{
"epoch": 0.08896315289101438,
"grad_norm": 8.963898658752441,
"learning_rate": 2.536403440693003e-07,
"loss": 0.8292,
"step": 1544
},
{
"epoch": 0.08902077151335312,
"grad_norm": 13.518234252929688,
"learning_rate": 2.437991924167937e-07,
"loss": 0.9534,
"step": 1545
},
{
"epoch": 0.08907839013569185,
"grad_norm": 10.741979598999023,
"learning_rate": 2.341522869803048e-07,
"loss": 1.1406,
"step": 1546
},
{
"epoch": 0.0891360087580306,
"grad_norm": 12.254292488098145,
"learning_rate": 2.2469966542096322e-07,
"loss": 1.6535,
"step": 1547
},
{
"epoch": 0.08919362738036933,
"grad_norm": 10.460444450378418,
"learning_rate": 2.1544136464142772e-07,
"loss": 0.8327,
"step": 1548
},
{
"epoch": 0.08925124600270808,
"grad_norm": 10.744197845458984,
"learning_rate": 2.0637742078573607e-07,
"loss": 1.0415,
"step": 1549
},
{
"epoch": 0.08930886462504682,
"grad_norm": 12.140249252319336,
"learning_rate": 1.975078692391552e-07,
"loss": 0.7828,
"step": 1550
},
{
"epoch": 0.08936648324738555,
"grad_norm": 3.117053747177124,
"learning_rate": 1.8883274462806467e-07,
"loss": 1.8182,
"step": 1551
},
{
"epoch": 0.0894241018697243,
"grad_norm": 2.633333444595337,
"learning_rate": 1.8035208081980115e-07,
"loss": 1.2614,
"step": 1552
},
{
"epoch": 0.08948172049206303,
"grad_norm": 2.9836959838867188,
"learning_rate": 1.7206591092253642e-07,
"loss": 1.4255,
"step": 1553
},
{
"epoch": 0.08953933911440178,
"grad_norm": 3.157785415649414,
"learning_rate": 1.6397426728514964e-07,
"loss": 1.5868,
"step": 1554
},
{
"epoch": 0.08959695773674052,
"grad_norm": 3.6719095706939697,
"learning_rate": 1.560771814970885e-07,
"loss": 1.0872,
"step": 1555
},
{
"epoch": 0.08965457635907925,
"grad_norm": 3.6305088996887207,
"learning_rate": 1.4837468438826386e-07,
"loss": 1.2175,
"step": 1556
},
{
"epoch": 0.089712194981418,
"grad_norm": 4.5752387046813965,
"learning_rate": 1.4086680602891643e-07,
"loss": 1.6879,
"step": 1557
},
{
"epoch": 0.08976981360375673,
"grad_norm": 4.913380146026611,
"learning_rate": 1.335535757294948e-07,
"loss": 2.0423,
"step": 1558
},
{
"epoch": 0.08982743222609547,
"grad_norm": 3.961810827255249,
"learning_rate": 1.264350220405719e-07,
"loss": 0.8754,
"step": 1559
},
{
"epoch": 0.08988505084843422,
"grad_norm": 5.088049411773682,
"learning_rate": 1.195111727526843e-07,
"loss": 1.5104,
"step": 1560
},
{
"epoch": 0.08994266947077295,
"grad_norm": 5.504665851593018,
"learning_rate": 1.1278205489626547e-07,
"loss": 1.6067,
"step": 1561
},
{
"epoch": 0.0900002880931117,
"grad_norm": 5.033830165863037,
"learning_rate": 1.0624769474152363e-07,
"loss": 1.2763,
"step": 1562
},
{
"epoch": 0.09005790671545043,
"grad_norm": 5.962503910064697,
"learning_rate": 9.99081177983363e-08,
"loss": 1.5867,
"step": 1563
},
{
"epoch": 0.09011552533778917,
"grad_norm": 6.680840969085693,
"learning_rate": 9.376334881616156e-08,
"loss": 2.1862,
"step": 1564
},
{
"epoch": 0.09017314396012792,
"grad_norm": 5.870455265045166,
"learning_rate": 8.781341178393244e-08,
"loss": 1.0543,
"step": 1565
},
{
"epoch": 0.09023076258246665,
"grad_norm": 7.709200382232666,
"learning_rate": 8.20583299299571e-08,
"loss": 2.4783,
"step": 1566
},
{
"epoch": 0.0902883812048054,
"grad_norm": 6.48782205581665,
"learning_rate": 7.649812572185222e-08,
"loss": 1.6228,
"step": 1567
},
{
"epoch": 0.09034599982714413,
"grad_norm": 7.876181125640869,
"learning_rate": 7.113282086643191e-08,
"loss": 1.925,
"step": 1568
},
{
"epoch": 0.09040361844948287,
"grad_norm": 7.428340911865234,
"learning_rate": 6.596243630963006e-08,
"loss": 1.8479,
"step": 1569
},
{
"epoch": 0.09046123707182162,
"grad_norm": 7.5233049392700195,
"learning_rate": 6.098699223641702e-08,
"loss": 1.7916,
"step": 1570
},
{
"epoch": 0.09051885569416035,
"grad_norm": 9.212076187133789,
"learning_rate": 5.620650807073857e-08,
"loss": 1.9623,
"step": 1571
},
{
"epoch": 0.0905764743164991,
"grad_norm": 6.473432540893555,
"learning_rate": 5.162100247539936e-08,
"loss": 1.5251,
"step": 1572
},
{
"epoch": 0.09063409293883783,
"grad_norm": 6.715102672576904,
"learning_rate": 4.723049335204066e-08,
"loss": 1.7737,
"step": 1573
},
{
"epoch": 0.09069171156117657,
"grad_norm": 7.750906944274902,
"learning_rate": 4.303499784102383e-08,
"loss": 1.3005,
"step": 1574
},
{
"epoch": 0.09074933018351532,
"grad_norm": 8.47797679901123,
"learning_rate": 3.9034532321408076e-08,
"loss": 1.9916,
"step": 1575
},
{
"epoch": 0.09080694880585405,
"grad_norm": 5.759380340576172,
"learning_rate": 3.522911241083948e-08,
"loss": 1.3836,
"step": 1576
},
{
"epoch": 0.0908645674281928,
"grad_norm": 7.086587905883789,
"learning_rate": 3.161875296553429e-08,
"loss": 1.2335,
"step": 1577
},
{
"epoch": 0.09092218605053153,
"grad_norm": 7.697216987609863,
"learning_rate": 2.820346808018459e-08,
"loss": 1.378,
"step": 1578
},
{
"epoch": 0.09097980467287027,
"grad_norm": 8.39234733581543,
"learning_rate": 2.4983271087924974e-08,
"loss": 1.4514,
"step": 1579
},
{
"epoch": 0.09103742329520902,
"grad_norm": 8.911478996276855,
"learning_rate": 2.1958174560282595e-08,
"loss": 1.3778,
"step": 1580
},
{
"epoch": 0.09109504191754775,
"grad_norm": 7.91729211807251,
"learning_rate": 1.9128190307105e-08,
"loss": 1.4632,
"step": 1581
},
{
"epoch": 0.0911526605398865,
"grad_norm": 9.351404190063477,
"learning_rate": 1.6493329376549016e-08,
"loss": 1.702,
"step": 1582
},
{
"epoch": 0.09121027916222522,
"grad_norm": 9.39632797241211,
"learning_rate": 1.4053602054991955e-08,
"loss": 1.5477,
"step": 1583
},
{
"epoch": 0.09126789778456397,
"grad_norm": 7.607429504394531,
"learning_rate": 1.1809017867037141e-08,
"loss": 1.0051,
"step": 1584
},
{
"epoch": 0.09132551640690272,
"grad_norm": 9.332201957702637,
"learning_rate": 9.75958557545842e-09,
"loss": 1.5067,
"step": 1585
},
{
"epoch": 0.09138313502924145,
"grad_norm": 9.724040985107422,
"learning_rate": 7.905313181150176e-09,
"loss": 1.3267,
"step": 1586
},
{
"epoch": 0.09144075365158019,
"grad_norm": 8.812676429748535,
"learning_rate": 6.246207923116254e-09,
"loss": 1.4376,
"step": 1587
},
{
"epoch": 0.09149837227391892,
"grad_norm": 8.088887214660645,
"learning_rate": 4.782276278442188e-09,
"loss": 1.0595,
"step": 1588
},
{
"epoch": 0.09155599089625767,
"grad_norm": 12.923358917236328,
"learning_rate": 3.513523962256349e-09,
"loss": 2.117,
"step": 1589
},
{
"epoch": 0.09161360951859641,
"grad_norm": 10.379292488098145,
"learning_rate": 2.4399559277132888e-09,
"loss": 1.2511,
"step": 1590
},
{
"epoch": 0.09167122814093515,
"grad_norm": 8.650908470153809,
"learning_rate": 1.5615763659881933e-09,
"loss": 0.6839,
"step": 1591
},
{
"epoch": 0.09172884676327389,
"grad_norm": 9.632695198059082,
"learning_rate": 8.783887062324692e-10,
"loss": 1.3018,
"step": 1592
},
{
"epoch": 0.09178646538561264,
"grad_norm": 11.59156608581543,
"learning_rate": 3.903956155848487e-10,
"loss": 1.3564,
"step": 1593
},
{
"epoch": 0.09184408400795137,
"grad_norm": 9.620840072631836,
"learning_rate": 9.759899914918435e-11,
"loss": 1.0929,
"step": 1594
},
{
"epoch": 0.09190170263029011,
"grad_norm": 10.271053314208984,
"learning_rate": 0.0,
"loss": 1.1551,
"step": 1595
}
],
"logging_steps": 1,
"max_steps": 1595,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 399,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.659603942692094e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}