{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2618,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003819709702062643,
"grad_norm": 19.90881904294145,
"learning_rate": 2.531645569620253e-08,
"loss": 1.299,
"step": 1
},
{
"epoch": 0.0007639419404125286,
"grad_norm": 15.414554054540094,
"learning_rate": 5.063291139240506e-08,
"loss": 1.3119,
"step": 2
},
{
"epoch": 0.001145912910618793,
"grad_norm": 16.05082822290993,
"learning_rate": 7.59493670886076e-08,
"loss": 1.3474,
"step": 3
},
{
"epoch": 0.0015278838808250573,
"grad_norm": 16.802557597929273,
"learning_rate": 1.0126582278481012e-07,
"loss": 1.2602,
"step": 4
},
{
"epoch": 0.0019098548510313217,
"grad_norm": 17.30879067784603,
"learning_rate": 1.2658227848101266e-07,
"loss": 1.3413,
"step": 5
},
{
"epoch": 0.002291825821237586,
"grad_norm": 20.77717323976014,
"learning_rate": 1.518987341772152e-07,
"loss": 1.2759,
"step": 6
},
{
"epoch": 0.00267379679144385,
"grad_norm": 17.559417298661096,
"learning_rate": 1.7721518987341772e-07,
"loss": 1.298,
"step": 7
},
{
"epoch": 0.0030557677616501145,
"grad_norm": 16.28982557516589,
"learning_rate": 2.0253164556962025e-07,
"loss": 1.349,
"step": 8
},
{
"epoch": 0.003437738731856379,
"grad_norm": 17.474868678501934,
"learning_rate": 2.2784810126582277e-07,
"loss": 1.3612,
"step": 9
},
{
"epoch": 0.0038197097020626434,
"grad_norm": 16.69254255107241,
"learning_rate": 2.5316455696202533e-07,
"loss": 1.2958,
"step": 10
},
{
"epoch": 0.004201680672268907,
"grad_norm": 14.759522064675997,
"learning_rate": 2.7848101265822783e-07,
"loss": 1.2975,
"step": 11
},
{
"epoch": 0.004583651642475172,
"grad_norm": 17.79667586296148,
"learning_rate": 3.037974683544304e-07,
"loss": 1.355,
"step": 12
},
{
"epoch": 0.004965622612681436,
"grad_norm": 16.62840589636226,
"learning_rate": 3.291139240506329e-07,
"loss": 1.3086,
"step": 13
},
{
"epoch": 0.0053475935828877,
"grad_norm": 15.772125524972015,
"learning_rate": 3.5443037974683544e-07,
"loss": 1.3046,
"step": 14
},
{
"epoch": 0.005729564553093965,
"grad_norm": 16.065515009444724,
"learning_rate": 3.79746835443038e-07,
"loss": 1.3281,
"step": 15
},
{
"epoch": 0.006111535523300229,
"grad_norm": 14.081835334511972,
"learning_rate": 4.050632911392405e-07,
"loss": 1.2608,
"step": 16
},
{
"epoch": 0.006493506493506494,
"grad_norm": 16.12517742296732,
"learning_rate": 4.3037974683544305e-07,
"loss": 1.2687,
"step": 17
},
{
"epoch": 0.006875477463712758,
"grad_norm": 15.024902136863233,
"learning_rate": 4.5569620253164555e-07,
"loss": 1.2874,
"step": 18
},
{
"epoch": 0.007257448433919022,
"grad_norm": 14.541324064699918,
"learning_rate": 4.81012658227848e-07,
"loss": 1.304,
"step": 19
},
{
"epoch": 0.007639419404125287,
"grad_norm": 14.444473500731197,
"learning_rate": 5.063291139240507e-07,
"loss": 1.2624,
"step": 20
},
{
"epoch": 0.008021390374331552,
"grad_norm": 13.532356409889397,
"learning_rate": 5.31645569620253e-07,
"loss": 1.2321,
"step": 21
},
{
"epoch": 0.008403361344537815,
"grad_norm": 14.040344008725162,
"learning_rate": 5.569620253164557e-07,
"loss": 1.2684,
"step": 22
},
{
"epoch": 0.00878533231474408,
"grad_norm": 15.065514072853418,
"learning_rate": 5.822784810126582e-07,
"loss": 1.2832,
"step": 23
},
{
"epoch": 0.009167303284950344,
"grad_norm": 14.095590912422155,
"learning_rate": 6.075949367088608e-07,
"loss": 1.1906,
"step": 24
},
{
"epoch": 0.009549274255156608,
"grad_norm": 13.52193472131181,
"learning_rate": 6.329113924050633e-07,
"loss": 1.2071,
"step": 25
},
{
"epoch": 0.009931245225362872,
"grad_norm": 15.26263801289482,
"learning_rate": 6.582278481012658e-07,
"loss": 1.1598,
"step": 26
},
{
"epoch": 0.010313216195569137,
"grad_norm": 12.160578875535336,
"learning_rate": 6.835443037974683e-07,
"loss": 1.1637,
"step": 27
},
{
"epoch": 0.0106951871657754,
"grad_norm": 10.342684824764603,
"learning_rate": 7.088607594936709e-07,
"loss": 1.1077,
"step": 28
},
{
"epoch": 0.011077158135981665,
"grad_norm": 15.632943983667545,
"learning_rate": 7.341772151898734e-07,
"loss": 1.1507,
"step": 29
},
{
"epoch": 0.01145912910618793,
"grad_norm": 10.079604573711133,
"learning_rate": 7.59493670886076e-07,
"loss": 1.0396,
"step": 30
},
{
"epoch": 0.011841100076394193,
"grad_norm": 13.611386236363026,
"learning_rate": 7.848101265822784e-07,
"loss": 1.0173,
"step": 31
},
{
"epoch": 0.012223071046600458,
"grad_norm": 10.957950016902126,
"learning_rate": 8.10126582278481e-07,
"loss": 0.9992,
"step": 32
},
{
"epoch": 0.012605042016806723,
"grad_norm": 30.773927277956396,
"learning_rate": 8.354430379746835e-07,
"loss": 1.024,
"step": 33
},
{
"epoch": 0.012987012987012988,
"grad_norm": 9.213361959330522,
"learning_rate": 8.607594936708861e-07,
"loss": 0.96,
"step": 34
},
{
"epoch": 0.013368983957219251,
"grad_norm": 9.531077209283136,
"learning_rate": 8.860759493670885e-07,
"loss": 1.0749,
"step": 35
},
{
"epoch": 0.013750954927425516,
"grad_norm": 9.863104906307168,
"learning_rate": 9.113924050632911e-07,
"loss": 1.0182,
"step": 36
},
{
"epoch": 0.01413292589763178,
"grad_norm": 9.525224203343395,
"learning_rate": 9.367088607594936e-07,
"loss": 0.9304,
"step": 37
},
{
"epoch": 0.014514896867838044,
"grad_norm": 9.908163004382338,
"learning_rate": 9.62025316455696e-07,
"loss": 0.9193,
"step": 38
},
{
"epoch": 0.014896867838044309,
"grad_norm": 5.240327212589582,
"learning_rate": 9.873417721518988e-07,
"loss": 0.8683,
"step": 39
},
{
"epoch": 0.015278838808250574,
"grad_norm": 7.817860585016493,
"learning_rate": 1.0126582278481013e-06,
"loss": 0.8277,
"step": 40
},
{
"epoch": 0.01566080977845684,
"grad_norm": 7.212486723359385,
"learning_rate": 1.0379746835443038e-06,
"loss": 0.85,
"step": 41
},
{
"epoch": 0.016042780748663103,
"grad_norm": 12.541315607408132,
"learning_rate": 1.063291139240506e-06,
"loss": 0.9004,
"step": 42
},
{
"epoch": 0.016424751718869365,
"grad_norm": 6.587171853997535,
"learning_rate": 1.0886075949367088e-06,
"loss": 0.8266,
"step": 43
},
{
"epoch": 0.01680672268907563,
"grad_norm": 37.57512879193886,
"learning_rate": 1.1139240506329113e-06,
"loss": 0.8639,
"step": 44
},
{
"epoch": 0.017188693659281894,
"grad_norm": 4.644959514031565,
"learning_rate": 1.139240506329114e-06,
"loss": 0.8066,
"step": 45
},
{
"epoch": 0.01757066462948816,
"grad_norm": 4.6286805207695325,
"learning_rate": 1.1645569620253163e-06,
"loss": 0.8066,
"step": 46
},
{
"epoch": 0.017952635599694424,
"grad_norm": 7.835888478790431,
"learning_rate": 1.1898734177215188e-06,
"loss": 0.8543,
"step": 47
},
{
"epoch": 0.01833460656990069,
"grad_norm": 5.422444896928901,
"learning_rate": 1.2151898734177215e-06,
"loss": 0.8331,
"step": 48
},
{
"epoch": 0.01871657754010695,
"grad_norm": 5.460802978188565,
"learning_rate": 1.240506329113924e-06,
"loss": 0.8651,
"step": 49
},
{
"epoch": 0.019098548510313215,
"grad_norm": 6.5678961432295475,
"learning_rate": 1.2658227848101265e-06,
"loss": 0.9134,
"step": 50
},
{
"epoch": 0.01948051948051948,
"grad_norm": 4.9487683359009536,
"learning_rate": 1.291139240506329e-06,
"loss": 0.806,
"step": 51
},
{
"epoch": 0.019862490450725745,
"grad_norm": 4.4837558296364355,
"learning_rate": 1.3164556962025315e-06,
"loss": 0.8113,
"step": 52
},
{
"epoch": 0.02024446142093201,
"grad_norm": 5.344312043183558,
"learning_rate": 1.3417721518987342e-06,
"loss": 0.8086,
"step": 53
},
{
"epoch": 0.020626432391138275,
"grad_norm": 5.246700443604725,
"learning_rate": 1.3670886075949365e-06,
"loss": 0.7082,
"step": 54
},
{
"epoch": 0.02100840336134454,
"grad_norm": 5.941401610969792,
"learning_rate": 1.3924050632911392e-06,
"loss": 0.7367,
"step": 55
},
{
"epoch": 0.0213903743315508,
"grad_norm": 5.542256214458787,
"learning_rate": 1.4177215189873418e-06,
"loss": 0.725,
"step": 56
},
{
"epoch": 0.021772345301757066,
"grad_norm": 4.930892825499901,
"learning_rate": 1.4430379746835443e-06,
"loss": 0.7891,
"step": 57
},
{
"epoch": 0.02215431627196333,
"grad_norm": 9.097002648121126,
"learning_rate": 1.4683544303797468e-06,
"loss": 0.8028,
"step": 58
},
{
"epoch": 0.022536287242169595,
"grad_norm": 11.795657973005843,
"learning_rate": 1.4936708860759493e-06,
"loss": 0.8097,
"step": 59
},
{
"epoch": 0.02291825821237586,
"grad_norm": 3.8313472673735274,
"learning_rate": 1.518987341772152e-06,
"loss": 0.6951,
"step": 60
},
{
"epoch": 0.023300229182582125,
"grad_norm": 4.034761647826669,
"learning_rate": 1.5443037974683545e-06,
"loss": 0.7105,
"step": 61
},
{
"epoch": 0.023682200152788387,
"grad_norm": 6.038362262794577,
"learning_rate": 1.5696202531645568e-06,
"loss": 0.7587,
"step": 62
},
{
"epoch": 0.02406417112299465,
"grad_norm": 3.1079730758909925,
"learning_rate": 1.5949367088607595e-06,
"loss": 0.6828,
"step": 63
},
{
"epoch": 0.024446142093200916,
"grad_norm": 3.7055669255608863,
"learning_rate": 1.620253164556962e-06,
"loss": 0.702,
"step": 64
},
{
"epoch": 0.02482811306340718,
"grad_norm": 11.676516661636496,
"learning_rate": 1.6455696202531647e-06,
"loss": 0.7052,
"step": 65
},
{
"epoch": 0.025210084033613446,
"grad_norm": 4.025514648722259,
"learning_rate": 1.670886075949367e-06,
"loss": 0.661,
"step": 66
},
{
"epoch": 0.02559205500381971,
"grad_norm": 4.656351795473404,
"learning_rate": 1.6962025316455695e-06,
"loss": 0.7484,
"step": 67
},
{
"epoch": 0.025974025974025976,
"grad_norm": 4.1084827411397,
"learning_rate": 1.7215189873417722e-06,
"loss": 0.6608,
"step": 68
},
{
"epoch": 0.026355996944232237,
"grad_norm": 4.129035383112947,
"learning_rate": 1.7468354430379747e-06,
"loss": 0.6493,
"step": 69
},
{
"epoch": 0.026737967914438502,
"grad_norm": 3.504125310927204,
"learning_rate": 1.772151898734177e-06,
"loss": 0.6193,
"step": 70
},
{
"epoch": 0.027119938884644767,
"grad_norm": 3.6925426579271314,
"learning_rate": 1.7974683544303797e-06,
"loss": 0.6431,
"step": 71
},
{
"epoch": 0.02750190985485103,
"grad_norm": 6.683623338936127,
"learning_rate": 1.8227848101265822e-06,
"loss": 0.739,
"step": 72
},
{
"epoch": 0.027883880825057297,
"grad_norm": 3.5029780150790186,
"learning_rate": 1.848101265822785e-06,
"loss": 0.6587,
"step": 73
},
{
"epoch": 0.02826585179526356,
"grad_norm": 5.370596801515218,
"learning_rate": 1.8734177215189872e-06,
"loss": 0.7036,
"step": 74
},
{
"epoch": 0.028647822765469823,
"grad_norm": 4.983202118065457,
"learning_rate": 1.8987341772151897e-06,
"loss": 0.6879,
"step": 75
},
{
"epoch": 0.029029793735676088,
"grad_norm": 11.685843678353526,
"learning_rate": 1.924050632911392e-06,
"loss": 0.6488,
"step": 76
},
{
"epoch": 0.029411764705882353,
"grad_norm": 4.392795361950916,
"learning_rate": 1.949367088607595e-06,
"loss": 0.7056,
"step": 77
},
{
"epoch": 0.029793735676088617,
"grad_norm": 3.9315009070892764,
"learning_rate": 1.9746835443037976e-06,
"loss": 0.6494,
"step": 78
},
{
"epoch": 0.030175706646294882,
"grad_norm": 5.141383767356884,
"learning_rate": 2e-06,
"loss": 0.6544,
"step": 79
},
{
"epoch": 0.030557677616501147,
"grad_norm": 4.018343651117365,
"learning_rate": 1.9999992345015908e-06,
"loss": 0.6952,
"step": 80
},
{
"epoch": 0.030939648586707412,
"grad_norm": 5.591851744780808,
"learning_rate": 1.999996938007535e-06,
"loss": 0.6603,
"step": 81
},
{
"epoch": 0.03132161955691368,
"grad_norm": 6.005561034856619,
"learning_rate": 1.999993110521348e-06,
"loss": 0.6373,
"step": 82
},
{
"epoch": 0.03170359052711994,
"grad_norm": 7.102355842806267,
"learning_rate": 1.9999877520488907e-06,
"loss": 0.6101,
"step": 83
},
{
"epoch": 0.03208556149732621,
"grad_norm": 2.867411767320501,
"learning_rate": 1.9999808625983663e-06,
"loss": 0.593,
"step": 84
},
{
"epoch": 0.032467532467532464,
"grad_norm": 4.83662234283247,
"learning_rate": 1.999972442180323e-06,
"loss": 0.586,
"step": 85
},
{
"epoch": 0.03284950343773873,
"grad_norm": 5.198461145382523,
"learning_rate": 1.9999624908076514e-06,
"loss": 0.6789,
"step": 86
},
{
"epoch": 0.033231474407944994,
"grad_norm": 2.8686757091820683,
"learning_rate": 1.999951008495588e-06,
"loss": 0.5732,
"step": 87
},
{
"epoch": 0.03361344537815126,
"grad_norm": 3.1867941404051,
"learning_rate": 1.9999379952617116e-06,
"loss": 0.6728,
"step": 88
},
{
"epoch": 0.033995416348357524,
"grad_norm": 4.819026326282661,
"learning_rate": 1.9999234511259463e-06,
"loss": 0.6788,
"step": 89
},
{
"epoch": 0.03437738731856379,
"grad_norm": 5.624234579635901,
"learning_rate": 1.999907376110558e-06,
"loss": 0.6142,
"step": 90
},
{
"epoch": 0.034759358288770054,
"grad_norm": 27.028974037702127,
"learning_rate": 1.9998897702401585e-06,
"loss": 0.5891,
"step": 91
},
{
"epoch": 0.03514132925897632,
"grad_norm": 6.062503354980905,
"learning_rate": 1.999870633541701e-06,
"loss": 0.6348,
"step": 92
},
{
"epoch": 0.03552330022918258,
"grad_norm": 4.277480684633243,
"learning_rate": 1.999849966044485e-06,
"loss": 0.616,
"step": 93
},
{
"epoch": 0.03590527119938885,
"grad_norm": 7.1085637623688935,
"learning_rate": 1.999827767780152e-06,
"loss": 0.7117,
"step": 94
},
{
"epoch": 0.03628724216959511,
"grad_norm": 4.056766891368591,
"learning_rate": 1.999804038782687e-06,
"loss": 0.5858,
"step": 95
},
{
"epoch": 0.03666921313980138,
"grad_norm": 4.674918715366667,
"learning_rate": 1.99977877908842e-06,
"loss": 0.6298,
"step": 96
},
{
"epoch": 0.03705118411000764,
"grad_norm": 3.300022908261171,
"learning_rate": 1.999751988736023e-06,
"loss": 0.5515,
"step": 97
},
{
"epoch": 0.0374331550802139,
"grad_norm": 8.740377754714698,
"learning_rate": 1.9997236677665115e-06,
"loss": 0.6847,
"step": 98
},
{
"epoch": 0.037815126050420166,
"grad_norm": 7.098187800477883,
"learning_rate": 1.999693816223245e-06,
"loss": 0.6553,
"step": 99
},
{
"epoch": 0.03819709702062643,
"grad_norm": 4.937245961472043,
"learning_rate": 1.9996624341519268e-06,
"loss": 0.6398,
"step": 100
},
{
"epoch": 0.038579067990832695,
"grad_norm": 8.475917815171096,
"learning_rate": 1.999629521600602e-06,
"loss": 0.5671,
"step": 101
},
{
"epoch": 0.03896103896103896,
"grad_norm": 3.174133239967434,
"learning_rate": 1.9995950786196597e-06,
"loss": 0.5323,
"step": 102
},
{
"epoch": 0.039343009931245225,
"grad_norm": 3.0409122423678983,
"learning_rate": 1.999559105261833e-06,
"loss": 0.6296,
"step": 103
},
{
"epoch": 0.03972498090145149,
"grad_norm": 2.886981911151555,
"learning_rate": 1.9995216015821954e-06,
"loss": 0.5191,
"step": 104
},
{
"epoch": 0.040106951871657755,
"grad_norm": 6.822287483611468,
"learning_rate": 1.9994825676381657e-06,
"loss": 0.5861,
"step": 105
},
{
"epoch": 0.04048892284186402,
"grad_norm": 6.200193148891678,
"learning_rate": 1.999442003489505e-06,
"loss": 0.5414,
"step": 106
},
{
"epoch": 0.040870893812070284,
"grad_norm": 5.169789028690033,
"learning_rate": 1.999399909198316e-06,
"loss": 0.6444,
"step": 107
},
{
"epoch": 0.04125286478227655,
"grad_norm": 7.759781519596946,
"learning_rate": 1.9993562848290463e-06,
"loss": 0.5599,
"step": 108
},
{
"epoch": 0.041634835752482814,
"grad_norm": 4.421136398577513,
"learning_rate": 1.9993111304484836e-06,
"loss": 0.6536,
"step": 109
},
{
"epoch": 0.04201680672268908,
"grad_norm": 3.823541000198925,
"learning_rate": 1.9992644461257595e-06,
"loss": 0.6581,
"step": 110
},
{
"epoch": 0.04239877769289534,
"grad_norm": 5.516049899957531,
"learning_rate": 1.999216231932347e-06,
"loss": 0.6663,
"step": 111
},
{
"epoch": 0.0427807486631016,
"grad_norm": 5.5740814564410375,
"learning_rate": 1.9991664879420628e-06,
"loss": 0.5795,
"step": 112
},
{
"epoch": 0.04316271963330787,
"grad_norm": 3.437123292810565,
"learning_rate": 1.999115214231064e-06,
"loss": 0.6331,
"step": 113
},
{
"epoch": 0.04354469060351413,
"grad_norm": 3.7632584467840404,
"learning_rate": 1.9990624108778517e-06,
"loss": 0.6531,
"step": 114
},
{
"epoch": 0.043926661573720396,
"grad_norm": 4.182331120505753,
"learning_rate": 1.999008077963266e-06,
"loss": 0.5852,
"step": 115
},
{
"epoch": 0.04430863254392666,
"grad_norm": 9.300499481983667,
"learning_rate": 1.9989522155704913e-06,
"loss": 0.5871,
"step": 116
},
{
"epoch": 0.044690603514132926,
"grad_norm": 3.344891067734144,
"learning_rate": 1.9988948237850526e-06,
"loss": 0.5699,
"step": 117
},
{
"epoch": 0.04507257448433919,
"grad_norm": 3.326565572079444,
"learning_rate": 1.9988359026948167e-06,
"loss": 0.5923,
"step": 118
},
{
"epoch": 0.045454545454545456,
"grad_norm": 4.082860223491362,
"learning_rate": 1.9987754523899915e-06,
"loss": 0.6253,
"step": 119
},
{
"epoch": 0.04583651642475172,
"grad_norm": 12.324222314245391,
"learning_rate": 1.998713472963126e-06,
"loss": 0.6368,
"step": 120
},
{
"epoch": 0.046218487394957986,
"grad_norm": 3.325231836297131,
"learning_rate": 1.998649964509111e-06,
"loss": 0.585,
"step": 121
},
{
"epoch": 0.04660045836516425,
"grad_norm": 2.9732772511513574,
"learning_rate": 1.9985849271251774e-06,
"loss": 0.5832,
"step": 122
},
{
"epoch": 0.046982429335370515,
"grad_norm": 4.5992463698133035,
"learning_rate": 1.9985183609108972e-06,
"loss": 0.6219,
"step": 123
},
{
"epoch": 0.04736440030557677,
"grad_norm": 3.4205990364099876,
"learning_rate": 1.9984502659681836e-06,
"loss": 0.5899,
"step": 124
},
{
"epoch": 0.04774637127578304,
"grad_norm": 6.047943198436655,
"learning_rate": 1.9983806424012887e-06,
"loss": 0.5719,
"step": 125
},
{
"epoch": 0.0481283422459893,
"grad_norm": 5.793217662490507,
"learning_rate": 1.9983094903168067e-06,
"loss": 0.6094,
"step": 126
},
{
"epoch": 0.04851031321619557,
"grad_norm": 5.8062953058676525,
"learning_rate": 1.998236809823671e-06,
"loss": 0.6127,
"step": 127
},
{
"epoch": 0.04889228418640183,
"grad_norm": 2.4470363943804783,
"learning_rate": 1.9981626010331558e-06,
"loss": 0.5495,
"step": 128
},
{
"epoch": 0.0492742551566081,
"grad_norm": 4.783628150040865,
"learning_rate": 1.9980868640588736e-06,
"loss": 0.5533,
"step": 129
},
{
"epoch": 0.04965622612681436,
"grad_norm": 3.291120740165707,
"learning_rate": 1.9980095990167776e-06,
"loss": 0.6196,
"step": 130
},
{
"epoch": 0.05003819709702063,
"grad_norm": 3.8172581712282656,
"learning_rate": 1.997930806025161e-06,
"loss": 0.6545,
"step": 131
},
{
"epoch": 0.05042016806722689,
"grad_norm": 5.998885200857126,
"learning_rate": 1.9978504852046553e-06,
"loss": 0.6009,
"step": 132
},
{
"epoch": 0.05080213903743316,
"grad_norm": 6.035103071609391,
"learning_rate": 1.997768636678231e-06,
"loss": 0.6362,
"step": 133
},
{
"epoch": 0.05118411000763942,
"grad_norm": 10.3639480840429,
"learning_rate": 1.9976852605711986e-06,
"loss": 0.6109,
"step": 134
},
{
"epoch": 0.05156608097784569,
"grad_norm": 4.230015798354289,
"learning_rate": 1.9976003570112055e-06,
"loss": 0.6676,
"step": 135
},
{
"epoch": 0.05194805194805195,
"grad_norm": 2.870529329674749,
"learning_rate": 1.9975139261282406e-06,
"loss": 0.5568,
"step": 136
},
{
"epoch": 0.05233002291825821,
"grad_norm": 3.979837111136933,
"learning_rate": 1.9974259680546276e-06,
"loss": 0.5828,
"step": 137
},
{
"epoch": 0.052711993888464474,
"grad_norm": 15.324396689162327,
"learning_rate": 1.997336482925031e-06,
"loss": 0.6132,
"step": 138
},
{
"epoch": 0.05309396485867074,
"grad_norm": 14.502950425377572,
"learning_rate": 1.997245470876452e-06,
"loss": 0.5564,
"step": 139
},
{
"epoch": 0.053475935828877004,
"grad_norm": 2.8972805850967966,
"learning_rate": 1.99715293204823e-06,
"loss": 0.623,
"step": 140
},
{
"epoch": 0.05385790679908327,
"grad_norm": 3.9661984150677165,
"learning_rate": 1.997058866582041e-06,
"loss": 0.5244,
"step": 141
},
{
"epoch": 0.054239877769289534,
"grad_norm": 4.6071033044010505,
"learning_rate": 1.9969632746218997e-06,
"loss": 0.5434,
"step": 142
},
{
"epoch": 0.0546218487394958,
"grad_norm": 4.86413877573829,
"learning_rate": 1.996866156314157e-06,
"loss": 0.6728,
"step": 143
},
{
"epoch": 0.05500381970970206,
"grad_norm": 3.424906514115404,
"learning_rate": 1.9967675118075e-06,
"loss": 0.5608,
"step": 144
},
{
"epoch": 0.05538579067990833,
"grad_norm": 4.464717444097961,
"learning_rate": 1.996667341252953e-06,
"loss": 0.6112,
"step": 145
},
{
"epoch": 0.05576776165011459,
"grad_norm": 20.909201379759565,
"learning_rate": 1.9965656448038783e-06,
"loss": 0.5953,
"step": 146
},
{
"epoch": 0.05614973262032086,
"grad_norm": 3.2733693957476073,
"learning_rate": 1.9964624226159714e-06,
"loss": 0.6261,
"step": 147
},
{
"epoch": 0.05653170359052712,
"grad_norm": 4.553438973985725,
"learning_rate": 1.9963576748472655e-06,
"loss": 0.5586,
"step": 148
},
{
"epoch": 0.05691367456073339,
"grad_norm": 3.557327038420317,
"learning_rate": 1.99625140165813e-06,
"loss": 0.5531,
"step": 149
},
{
"epoch": 0.057295645530939646,
"grad_norm": 2.805073373457496,
"learning_rate": 1.996143603211267e-06,
"loss": 0.5827,
"step": 150
},
{
"epoch": 0.05767761650114591,
"grad_norm": 3.950929404040752,
"learning_rate": 1.9960342796717174e-06,
"loss": 0.6139,
"step": 151
},
{
"epoch": 0.058059587471352175,
"grad_norm": 6.241356658055268,
"learning_rate": 1.9959234312068546e-06,
"loss": 0.6696,
"step": 152
},
{
"epoch": 0.05844155844155844,
"grad_norm": 4.220303162328757,
"learning_rate": 1.9958110579863866e-06,
"loss": 0.6114,
"step": 153
},
{
"epoch": 0.058823529411764705,
"grad_norm": 3.696778967888018,
"learning_rate": 1.995697160182357e-06,
"loss": 0.5757,
"step": 154
},
{
"epoch": 0.05920550038197097,
"grad_norm": 4.342641450025558,
"learning_rate": 1.9955817379691426e-06,
"loss": 0.6365,
"step": 155
},
{
"epoch": 0.059587471352177235,
"grad_norm": 3.6988467805046774,
"learning_rate": 1.9954647915234554e-06,
"loss": 0.5526,
"step": 156
},
{
"epoch": 0.0599694423223835,
"grad_norm": 2.7467797298493184,
"learning_rate": 1.9953463210243386e-06,
"loss": 0.5721,
"step": 157
},
{
"epoch": 0.060351413292589765,
"grad_norm": 3.722374343040301,
"learning_rate": 1.9952263266531716e-06,
"loss": 0.6569,
"step": 158
},
{
"epoch": 0.06073338426279603,
"grad_norm": 9.533663873007717,
"learning_rate": 1.9951048085936645e-06,
"loss": 0.6318,
"step": 159
},
{
"epoch": 0.061115355233002294,
"grad_norm": 11.718931031261699,
"learning_rate": 1.994981767031861e-06,
"loss": 0.6005,
"step": 160
},
{
"epoch": 0.06149732620320856,
"grad_norm": 4.375012655934481,
"learning_rate": 1.994857202156138e-06,
"loss": 0.5409,
"step": 161
},
{
"epoch": 0.061879297173414824,
"grad_norm": 5.120000733640627,
"learning_rate": 1.9947311141572035e-06,
"loss": 0.5635,
"step": 162
},
{
"epoch": 0.06226126814362108,
"grad_norm": 3.1000408920305285,
"learning_rate": 1.994603503228098e-06,
"loss": 0.5502,
"step": 163
},
{
"epoch": 0.06264323911382735,
"grad_norm": 8.835872288121367,
"learning_rate": 1.994474369564193e-06,
"loss": 0.6048,
"step": 164
},
{
"epoch": 0.06302521008403361,
"grad_norm": 8.273839587331405,
"learning_rate": 1.9943437133631922e-06,
"loss": 0.6539,
"step": 165
},
{
"epoch": 0.06340718105423988,
"grad_norm": 3.2608658274114237,
"learning_rate": 1.9942115348251295e-06,
"loss": 0.5477,
"step": 166
},
{
"epoch": 0.06378915202444614,
"grad_norm": 5.0788247296052,
"learning_rate": 1.99407783415237e-06,
"loss": 0.5973,
"step": 167
},
{
"epoch": 0.06417112299465241,
"grad_norm": 3.9254811565905534,
"learning_rate": 1.9939426115496094e-06,
"loss": 0.5931,
"step": 168
},
{
"epoch": 0.06455309396485867,
"grad_norm": 3.148494731000133,
"learning_rate": 1.9938058672238726e-06,
"loss": 0.5681,
"step": 169
},
{
"epoch": 0.06493506493506493,
"grad_norm": 3.4191069878832065,
"learning_rate": 1.9936676013845146e-06,
"loss": 0.5172,
"step": 170
},
{
"epoch": 0.0653170359052712,
"grad_norm": 3.9378468200785846,
"learning_rate": 1.99352781424322e-06,
"loss": 0.6518,
"step": 171
},
{
"epoch": 0.06569900687547746,
"grad_norm": 3.3159019691177036,
"learning_rate": 1.9933865060140025e-06,
"loss": 0.5513,
"step": 172
},
{
"epoch": 0.06608097784568373,
"grad_norm": 3.5786485272769575,
"learning_rate": 1.993243676913205e-06,
"loss": 0.5336,
"step": 173
},
{
"epoch": 0.06646294881588999,
"grad_norm": 4.860049758383945,
"learning_rate": 1.9930993271594982e-06,
"loss": 0.6294,
"step": 174
},
{
"epoch": 0.06684491978609626,
"grad_norm": 12.43362515923018,
"learning_rate": 1.9929534569738807e-06,
"loss": 0.5987,
"step": 175
},
{
"epoch": 0.06722689075630252,
"grad_norm": 3.527533600699027,
"learning_rate": 1.9928060665796797e-06,
"loss": 0.567,
"step": 176
},
{
"epoch": 0.06760886172650879,
"grad_norm": 8.845312363617492,
"learning_rate": 1.9926571562025493e-06,
"loss": 0.5734,
"step": 177
},
{
"epoch": 0.06799083269671505,
"grad_norm": 3.0944989407876946,
"learning_rate": 1.992506726070471e-06,
"loss": 0.5748,
"step": 178
},
{
"epoch": 0.06837280366692132,
"grad_norm": 4.535983087618347,
"learning_rate": 1.9923547764137523e-06,
"loss": 0.6393,
"step": 179
},
{
"epoch": 0.06875477463712758,
"grad_norm": 4.910034811953476,
"learning_rate": 1.9922013074650286e-06,
"loss": 0.5729,
"step": 180
},
{
"epoch": 0.06913674560733385,
"grad_norm": 4.222067308854296,
"learning_rate": 1.9920463194592593e-06,
"loss": 0.6011,
"step": 181
},
{
"epoch": 0.06951871657754011,
"grad_norm": 5.264384268024057,
"learning_rate": 1.991889812633731e-06,
"loss": 0.642,
"step": 182
},
{
"epoch": 0.06990068754774637,
"grad_norm": 18.552090762107316,
"learning_rate": 1.9917317872280553e-06,
"loss": 0.5649,
"step": 183
},
{
"epoch": 0.07028265851795264,
"grad_norm": 5.4812524878829345,
"learning_rate": 1.9915722434841686e-06,
"loss": 0.5436,
"step": 184
},
{
"epoch": 0.0706646294881589,
"grad_norm": 11.41145507933353,
"learning_rate": 1.9914111816463314e-06,
"loss": 0.5642,
"step": 185
},
{
"epoch": 0.07104660045836517,
"grad_norm": 15.827321504847037,
"learning_rate": 1.9912486019611292e-06,
"loss": 0.5328,
"step": 186
},
{
"epoch": 0.07142857142857142,
"grad_norm": 4.019121003233206,
"learning_rate": 1.9910845046774713e-06,
"loss": 0.5228,
"step": 187
},
{
"epoch": 0.0718105423987777,
"grad_norm": 3.914912354546862,
"learning_rate": 1.9909188900465898e-06,
"loss": 0.5933,
"step": 188
},
{
"epoch": 0.07219251336898395,
"grad_norm": 3.7225401404614473,
"learning_rate": 1.99075175832204e-06,
"loss": 0.6174,
"step": 189
},
{
"epoch": 0.07257448433919023,
"grad_norm": 12.833339342812874,
"learning_rate": 1.9905831097597005e-06,
"loss": 0.5013,
"step": 190
},
{
"epoch": 0.07295645530939648,
"grad_norm": 9.315651786262373,
"learning_rate": 1.9904129446177708e-06,
"loss": 0.5632,
"step": 191
},
{
"epoch": 0.07333842627960276,
"grad_norm": 4.717241219168216,
"learning_rate": 1.9902412631567742e-06,
"loss": 0.5613,
"step": 192
},
{
"epoch": 0.07372039724980901,
"grad_norm": 2.717188166571018,
"learning_rate": 1.9900680656395542e-06,
"loss": 0.5021,
"step": 193
},
{
"epoch": 0.07410236822001529,
"grad_norm": 5.396732534352068,
"learning_rate": 1.9898933523312752e-06,
"loss": 0.5678,
"step": 194
},
{
"epoch": 0.07448433919022154,
"grad_norm": 20.29200499142414,
"learning_rate": 1.989717123499423e-06,
"loss": 0.6235,
"step": 195
},
{
"epoch": 0.0748663101604278,
"grad_norm": 2.994983335593462,
"learning_rate": 1.989539379413804e-06,
"loss": 0.5642,
"step": 196
},
{
"epoch": 0.07524828113063407,
"grad_norm": 8.29189563991693,
"learning_rate": 1.989360120346543e-06,
"loss": 0.5757,
"step": 197
},
{
"epoch": 0.07563025210084033,
"grad_norm": 3.5744702141039215,
"learning_rate": 1.9891793465720854e-06,
"loss": 0.6116,
"step": 198
},
{
"epoch": 0.0760122230710466,
"grad_norm": 8.170124572497699,
"learning_rate": 1.9889970583671948e-06,
"loss": 0.6318,
"step": 199
},
{
"epoch": 0.07639419404125286,
"grad_norm": 6.5109797482671485,
"learning_rate": 1.9888132560109544e-06,
"loss": 0.6488,
"step": 200
},
{
"epoch": 0.07677616501145913,
"grad_norm": 3.1475254836277102,
"learning_rate": 1.988627939784765e-06,
"loss": 0.5516,
"step": 201
},
{
"epoch": 0.07715813598166539,
"grad_norm": 5.7077183037786,
"learning_rate": 1.988441109972345e-06,
"loss": 0.5135,
"step": 202
},
{
"epoch": 0.07754010695187166,
"grad_norm": 6.557122804854992,
"learning_rate": 1.9882527668597305e-06,
"loss": 0.5773,
"step": 203
},
{
"epoch": 0.07792207792207792,
"grad_norm": 8.966667288445084,
"learning_rate": 1.9880629107352737e-06,
"loss": 0.5242,
"step": 204
},
{
"epoch": 0.07830404889228419,
"grad_norm": 3.7233013320834623,
"learning_rate": 1.987871541889644e-06,
"loss": 0.5339,
"step": 205
},
{
"epoch": 0.07868601986249045,
"grad_norm": 6.417418420872459,
"learning_rate": 1.9876786606158265e-06,
"loss": 0.5242,
"step": 206
},
{
"epoch": 0.07906799083269672,
"grad_norm": 18.824464210105184,
"learning_rate": 1.987484267209122e-06,
"loss": 0.5333,
"step": 207
},
{
"epoch": 0.07944996180290298,
"grad_norm": 3.4759810800181126,
"learning_rate": 1.987288361967146e-06,
"loss": 0.5681,
"step": 208
},
{
"epoch": 0.07983193277310924,
"grad_norm": 3.588775053287736,
"learning_rate": 1.9870909451898286e-06,
"loss": 0.5514,
"step": 209
},
{
"epoch": 0.08021390374331551,
"grad_norm": 4.125026639003856,
"learning_rate": 1.986892017179415e-06,
"loss": 0.5599,
"step": 210
},
{
"epoch": 0.08059587471352177,
"grad_norm": 7.433563585133112,
"learning_rate": 1.986691578240462e-06,
"loss": 0.596,
"step": 211
},
{
"epoch": 0.08097784568372804,
"grad_norm": 4.075158651509923,
"learning_rate": 1.9864896286798422e-06,
"loss": 0.5647,
"step": 212
},
{
"epoch": 0.0813598166539343,
"grad_norm": 4.435622115339765,
"learning_rate": 1.9862861688067393e-06,
"loss": 0.5473,
"step": 213
},
{
"epoch": 0.08174178762414057,
"grad_norm": 6.400752274134116,
"learning_rate": 1.98608119893265e-06,
"loss": 0.5784,
"step": 214
},
{
"epoch": 0.08212375859434683,
"grad_norm": 4.034841024017193,
"learning_rate": 1.985874719371382e-06,
"loss": 0.5381,
"step": 215
},
{
"epoch": 0.0825057295645531,
"grad_norm": 5.558573234345964,
"learning_rate": 1.985666730439055e-06,
"loss": 0.5173,
"step": 216
},
{
"epoch": 0.08288770053475936,
"grad_norm": 2.0565756193889047,
"learning_rate": 1.9854572324541e-06,
"loss": 0.5066,
"step": 217
},
{
"epoch": 0.08326967150496563,
"grad_norm": 3.3119596339453716,
"learning_rate": 1.985246225737257e-06,
"loss": 0.5819,
"step": 218
},
{
"epoch": 0.08365164247517189,
"grad_norm": 6.043976155196966,
"learning_rate": 1.9850337106115766e-06,
"loss": 0.574,
"step": 219
},
{
"epoch": 0.08403361344537816,
"grad_norm": 2.3257096761329574,
"learning_rate": 1.9848196874024194e-06,
"loss": 0.5431,
"step": 220
},
{
"epoch": 0.08441558441558442,
"grad_norm": 2.611978681503251,
"learning_rate": 1.9846041564374543e-06,
"loss": 0.601,
"step": 221
},
{
"epoch": 0.08479755538579067,
"grad_norm": 3.938394541202129,
"learning_rate": 1.984387118046657e-06,
"loss": 0.5749,
"step": 222
},
{
"epoch": 0.08517952635599695,
"grad_norm": 3.8099196644027216,
"learning_rate": 1.9841685725623146e-06,
"loss": 0.5683,
"step": 223
},
{
"epoch": 0.0855614973262032,
"grad_norm": 4.270888241466328,
"learning_rate": 1.9839485203190184e-06,
"loss": 0.5362,
"step": 224
},
{
"epoch": 0.08594346829640948,
"grad_norm": 4.391700020088666,
"learning_rate": 1.983726961653668e-06,
"loss": 0.6054,
"step": 225
},
{
"epoch": 0.08632543926661573,
"grad_norm": 6.2550070608764585,
"learning_rate": 1.9835038969054692e-06,
"loss": 0.6173,
"step": 226
},
{
"epoch": 0.086707410236822,
"grad_norm": 2.771427521994671,
"learning_rate": 1.983279326415933e-06,
"loss": 0.5435,
"step": 227
},
{
"epoch": 0.08708938120702826,
"grad_norm": 5.470910999557675,
"learning_rate": 1.983053250528876e-06,
"loss": 0.4756,
"step": 228
},
{
"epoch": 0.08747135217723453,
"grad_norm": 6.526521212142111,
"learning_rate": 1.9828256695904202e-06,
"loss": 0.5879,
"step": 229
},
{
"epoch": 0.08785332314744079,
"grad_norm": 3.6873310706476903,
"learning_rate": 1.982596583948991e-06,
"loss": 0.5784,
"step": 230
},
{
"epoch": 0.08823529411764706,
"grad_norm": 17.41280102282283,
"learning_rate": 1.9823659939553177e-06,
"loss": 0.5494,
"step": 231
},
{
"epoch": 0.08861726508785332,
"grad_norm": 4.966211935572487,
"learning_rate": 1.9821338999624334e-06,
"loss": 0.5255,
"step": 232
},
{
"epoch": 0.0889992360580596,
"grad_norm": 4.187964987938789,
"learning_rate": 1.9819003023256724e-06,
"loss": 0.5995,
"step": 233
},
{
"epoch": 0.08938120702826585,
"grad_norm": 2.9009546668814865,
"learning_rate": 1.9816652014026726e-06,
"loss": 0.6022,
"step": 234
},
{
"epoch": 0.08976317799847211,
"grad_norm": 3.639504554196423,
"learning_rate": 1.9814285975533726e-06,
"loss": 0.4918,
"step": 235
},
{
"epoch": 0.09014514896867838,
"grad_norm": 4.062515207488995,
"learning_rate": 1.981190491140012e-06,
"loss": 0.591,
"step": 236
},
{
"epoch": 0.09052711993888464,
"grad_norm": 3.4989959301090203,
"learning_rate": 1.9809508825271307e-06,
"loss": 0.4974,
"step": 237
},
{
"epoch": 0.09090909090909091,
"grad_norm": 2.8663762427604973,
"learning_rate": 1.9807097720815695e-06,
"loss": 0.5101,
"step": 238
},
{
"epoch": 0.09129106187929717,
"grad_norm": 4.90634071902155,
"learning_rate": 1.980467160172467e-06,
"loss": 0.5322,
"step": 239
},
{
"epoch": 0.09167303284950344,
"grad_norm": 5.428428181782822,
"learning_rate": 1.980223047171262e-06,
"loss": 0.5178,
"step": 240
},
{
"epoch": 0.0920550038197097,
"grad_norm": 5.96934579408797,
"learning_rate": 1.97997743345169e-06,
"loss": 0.4557,
"step": 241
},
{
"epoch": 0.09243697478991597,
"grad_norm": 3.1404125093300475,
"learning_rate": 1.9797303193897853e-06,
"loss": 0.6178,
"step": 242
},
{
"epoch": 0.09281894576012223,
"grad_norm": 4.493186466405451,
"learning_rate": 1.979481705363878e-06,
"loss": 0.5436,
"step": 243
},
{
"epoch": 0.0932009167303285,
"grad_norm": 3.033561739318456,
"learning_rate": 1.9792315917545966e-06,
"loss": 0.5967,
"step": 244
},
{
"epoch": 0.09358288770053476,
"grad_norm": 2.9593338705670558,
"learning_rate": 1.978979978944863e-06,
"loss": 0.4987,
"step": 245
},
{
"epoch": 0.09396485867074103,
"grad_norm": 3.74521861090976,
"learning_rate": 1.9787268673198968e-06,
"loss": 0.5345,
"step": 246
},
{
"epoch": 0.09434682964094729,
"grad_norm": 4.703434116446447,
"learning_rate": 1.97847225726721e-06,
"loss": 0.5386,
"step": 247
},
{
"epoch": 0.09472880061115355,
"grad_norm": 4.338722545992488,
"learning_rate": 1.97821614917661e-06,
"loss": 0.5508,
"step": 248
},
{
"epoch": 0.09511077158135982,
"grad_norm": 3.3068457311915957,
"learning_rate": 1.977958543440199e-06,
"loss": 0.5321,
"step": 249
},
{
"epoch": 0.09549274255156608,
"grad_norm": 7.6890680175286725,
"learning_rate": 1.977699440452368e-06,
"loss": 0.5812,
"step": 250
},
{
"epoch": 0.09587471352177235,
"grad_norm": 5.039749423839696,
"learning_rate": 1.9774388406098046e-06,
"loss": 0.5594,
"step": 251
},
{
"epoch": 0.0962566844919786,
"grad_norm": 9.378502926954392,
"learning_rate": 1.9771767443114856e-06,
"loss": 0.5639,
"step": 252
},
{
"epoch": 0.09663865546218488,
"grad_norm": 5.953497760251802,
"learning_rate": 1.9769131519586804e-06,
"loss": 0.5354,
"step": 253
},
{
"epoch": 0.09702062643239114,
"grad_norm": 3.3592755873335056,
"learning_rate": 1.976648063954947e-06,
"loss": 0.5193,
"step": 254
},
{
"epoch": 0.09740259740259741,
"grad_norm": 10.155837325369578,
"learning_rate": 1.9763814807061354e-06,
"loss": 0.5577,
"step": 255
},
{
"epoch": 0.09778456837280367,
"grad_norm": 3.7192213821503093,
"learning_rate": 1.9761134026203823e-06,
"loss": 0.5794,
"step": 256
},
{
"epoch": 0.09816653934300994,
"grad_norm": 3.2022524038735525,
"learning_rate": 1.975843830108116e-06,
"loss": 0.5756,
"step": 257
},
{
"epoch": 0.0985485103132162,
"grad_norm": 2.954946222885205,
"learning_rate": 1.97557276358205e-06,
"loss": 0.5548,
"step": 258
},
{
"epoch": 0.09893048128342247,
"grad_norm": 6.3254586604717975,
"learning_rate": 1.9753002034571864e-06,
"loss": 0.5414,
"step": 259
},
{
"epoch": 0.09931245225362872,
"grad_norm": 6.168737429198778,
"learning_rate": 1.9750261501508146e-06,
"loss": 0.6532,
"step": 260
},
{
"epoch": 0.09969442322383498,
"grad_norm": 2.3671571449627655,
"learning_rate": 1.974750604082509e-06,
"loss": 0.5148,
"step": 261
},
{
"epoch": 0.10007639419404125,
"grad_norm": 3.069161204383331,
"learning_rate": 1.9744735656741294e-06,
"loss": 0.6238,
"step": 262
},
{
"epoch": 0.10045836516424751,
"grad_norm": 3.25395270581985,
"learning_rate": 1.9741950353498208e-06,
"loss": 0.4785,
"step": 263
},
{
"epoch": 0.10084033613445378,
"grad_norm": 3.6118907652227628,
"learning_rate": 1.9739150135360126e-06,
"loss": 0.5878,
"step": 264
},
{
"epoch": 0.10122230710466004,
"grad_norm": 2.958467227272678,
"learning_rate": 1.973633500661417e-06,
"loss": 0.5537,
"step": 265
},
{
"epoch": 0.10160427807486631,
"grad_norm": 3.495534041343745,
"learning_rate": 1.9733504971570297e-06,
"loss": 0.5092,
"step": 266
},
{
"epoch": 0.10198624904507257,
"grad_norm": 4.3395578802723405,
"learning_rate": 1.9730660034561275e-06,
"loss": 0.5533,
"step": 267
},
{
"epoch": 0.10236822001527884,
"grad_norm": 5.357259140150383,
"learning_rate": 1.97278001999427e-06,
"loss": 0.5497,
"step": 268
},
{
"epoch": 0.1027501909854851,
"grad_norm": 12.330903021774263,
"learning_rate": 1.9724925472092967e-06,
"loss": 0.6312,
"step": 269
},
{
"epoch": 0.10313216195569137,
"grad_norm": 5.008650457149163,
"learning_rate": 1.9722035855413275e-06,
"loss": 0.6209,
"step": 270
},
{
"epoch": 0.10351413292589763,
"grad_norm": 8.70082828754419,
"learning_rate": 1.971913135432762e-06,
"loss": 0.541,
"step": 271
},
{
"epoch": 0.1038961038961039,
"grad_norm": 4.3423176215549075,
"learning_rate": 1.971621197328278e-06,
"loss": 0.5866,
"step": 272
},
{
"epoch": 0.10427807486631016,
"grad_norm": 4.755627538602661,
"learning_rate": 1.971327771674832e-06,
"loss": 0.6044,
"step": 273
},
{
"epoch": 0.10466004583651642,
"grad_norm": 3.407967193816134,
"learning_rate": 1.9710328589216576e-06,
"loss": 0.5433,
"step": 274
},
{
"epoch": 0.10504201680672269,
"grad_norm": 2.4416142840249395,
"learning_rate": 1.9707364595202657e-06,
"loss": 0.4781,
"step": 275
},
{
"epoch": 0.10542398777692895,
"grad_norm": 5.02202128886637,
"learning_rate": 1.9704385739244427e-06,
"loss": 0.5189,
"step": 276
},
{
"epoch": 0.10580595874713522,
"grad_norm": 3.696189900415905,
"learning_rate": 1.9701392025902504e-06,
"loss": 0.5763,
"step": 277
},
{
"epoch": 0.10618792971734148,
"grad_norm": 5.409243605057394,
"learning_rate": 1.9698383459760253e-06,
"loss": 0.5131,
"step": 278
},
{
"epoch": 0.10656990068754775,
"grad_norm": 11.832202446687807,
"learning_rate": 1.9695360045423778e-06,
"loss": 0.4832,
"step": 279
},
{
"epoch": 0.10695187165775401,
"grad_norm": 9.10805760479928,
"learning_rate": 1.969232178752192e-06,
"loss": 0.4996,
"step": 280
},
{
"epoch": 0.10733384262796028,
"grad_norm": 7.3407876873468645,
"learning_rate": 1.968926869070624e-06,
"loss": 0.5288,
"step": 281
},
{
"epoch": 0.10771581359816654,
"grad_norm": 2.3010355675392824,
"learning_rate": 1.9686200759651023e-06,
"loss": 0.4662,
"step": 282
},
{
"epoch": 0.10809778456837281,
"grad_norm": 3.3916106572355864,
"learning_rate": 1.9683117999053253e-06,
"loss": 0.5179,
"step": 283
},
{
"epoch": 0.10847975553857907,
"grad_norm": 5.6361615135843275,
"learning_rate": 1.9680020413632638e-06,
"loss": 0.5758,
"step": 284
},
{
"epoch": 0.10886172650878534,
"grad_norm": 5.188466938832083,
"learning_rate": 1.967690800813156e-06,
"loss": 0.5608,
"step": 285
},
{
"epoch": 0.1092436974789916,
"grad_norm": 2.9205838185447686,
"learning_rate": 1.9673780787315115e-06,
"loss": 0.5626,
"step": 286
},
{
"epoch": 0.10962566844919786,
"grad_norm": 6.5632102507293215,
"learning_rate": 1.967063875597106e-06,
"loss": 0.5899,
"step": 287
},
{
"epoch": 0.11000763941940413,
"grad_norm": 8.610035941457873,
"learning_rate": 1.966748191890983e-06,
"loss": 0.5404,
"step": 288
},
{
"epoch": 0.11038961038961038,
"grad_norm": 3.3221648613789347,
"learning_rate": 1.9664310280964544e-06,
"loss": 0.5609,
"step": 289
},
{
"epoch": 0.11077158135981666,
"grad_norm": 2.9946952305092736,
"learning_rate": 1.9661123846990962e-06,
"loss": 0.487,
"step": 290
},
{
"epoch": 0.11115355233002291,
"grad_norm": 6.61554377599513,
"learning_rate": 1.9657922621867504e-06,
"loss": 0.5585,
"step": 291
},
{
"epoch": 0.11153552330022919,
"grad_norm": 2.9463233379442144,
"learning_rate": 1.965470661049524e-06,
"loss": 0.6021,
"step": 292
},
{
"epoch": 0.11191749427043544,
"grad_norm": 7.9703423929212835,
"learning_rate": 1.965147581779787e-06,
"loss": 0.5475,
"step": 293
},
{
"epoch": 0.11229946524064172,
"grad_norm": 2.8612057281120924,
"learning_rate": 1.964823024872173e-06,
"loss": 0.5855,
"step": 294
},
{
"epoch": 0.11268143621084797,
"grad_norm": 2.1630491598249604,
"learning_rate": 1.9644969908235776e-06,
"loss": 0.4495,
"step": 295
},
{
"epoch": 0.11306340718105425,
"grad_norm": 3.5789900025276857,
"learning_rate": 1.964169480133158e-06,
"loss": 0.6358,
"step": 296
},
{
"epoch": 0.1134453781512605,
"grad_norm": 2.8373987767544913,
"learning_rate": 1.963840493302331e-06,
"loss": 0.5472,
"step": 297
},
{
"epoch": 0.11382734912146678,
"grad_norm": 7.808336378861631,
"learning_rate": 1.963510030834775e-06,
"loss": 0.6182,
"step": 298
},
{
"epoch": 0.11420932009167303,
"grad_norm": 4.073657677829145,
"learning_rate": 1.963178093236428e-06,
"loss": 0.606,
"step": 299
},
{
"epoch": 0.11459129106187929,
"grad_norm": 3.8323245034578877,
"learning_rate": 1.962844681015484e-06,
"loss": 0.4772,
"step": 300
},
{
"epoch": 0.11497326203208556,
"grad_norm": 2.3060243380962335,
"learning_rate": 1.962509794682397e-06,
"loss": 0.4921,
"step": 301
},
{
"epoch": 0.11535523300229182,
"grad_norm": 2.409944953358685,
"learning_rate": 1.962173434749876e-06,
"loss": 0.4867,
"step": 302
},
{
"epoch": 0.11573720397249809,
"grad_norm": 4.736990510231778,
"learning_rate": 1.9618356017328875e-06,
"loss": 0.6127,
"step": 303
},
{
"epoch": 0.11611917494270435,
"grad_norm": 13.532790760092732,
"learning_rate": 1.961496296148653e-06,
"loss": 0.5324,
"step": 304
},
{
"epoch": 0.11650114591291062,
"grad_norm": 7.960055537091021,
"learning_rate": 1.961155518516648e-06,
"loss": 0.5629,
"step": 305
},
{
"epoch": 0.11688311688311688,
"grad_norm": 5.04926063772735,
"learning_rate": 1.960813269358602e-06,
"loss": 0.518,
"step": 306
},
{
"epoch": 0.11726508785332315,
"grad_norm": 2.965213442633116,
"learning_rate": 1.960469549198497e-06,
"loss": 0.5874,
"step": 307
},
{
"epoch": 0.11764705882352941,
"grad_norm": 8.225939704122897,
"learning_rate": 1.960124358562568e-06,
"loss": 0.4595,
"step": 308
},
{
"epoch": 0.11802902979373568,
"grad_norm": 5.646811578255499,
"learning_rate": 1.9597776979793007e-06,
"loss": 0.6004,
"step": 309
},
{
"epoch": 0.11841100076394194,
"grad_norm": 3.776921618778041,
"learning_rate": 1.9594295679794314e-06,
"loss": 0.616,
"step": 310
},
{
"epoch": 0.11879297173414821,
"grad_norm": 4.691043925903622,
"learning_rate": 1.9590799690959456e-06,
"loss": 0.5223,
"step": 311
},
{
"epoch": 0.11917494270435447,
"grad_norm": 7.166107995372442,
"learning_rate": 1.9587289018640787e-06,
"loss": 0.7085,
"step": 312
},
{
"epoch": 0.11955691367456073,
"grad_norm": 15.214836688871152,
"learning_rate": 1.9583763668213126e-06,
"loss": 0.5985,
"step": 313
},
{
"epoch": 0.119938884644767,
"grad_norm": 5.312919105323761,
"learning_rate": 1.9580223645073786e-06,
"loss": 0.5971,
"step": 314
},
{
"epoch": 0.12032085561497326,
"grad_norm": 7.7691966008692726,
"learning_rate": 1.9576668954642518e-06,
"loss": 0.5043,
"step": 315
},
{
"epoch": 0.12070282658517953,
"grad_norm": 20.610762040077248,
"learning_rate": 1.9573099602361553e-06,
"loss": 0.5065,
"step": 316
},
{
"epoch": 0.12108479755538579,
"grad_norm": 29.086197245117923,
"learning_rate": 1.9569515593695548e-06,
"loss": 0.6144,
"step": 317
},
{
"epoch": 0.12146676852559206,
"grad_norm": 33.12029190501601,
"learning_rate": 1.9565916934131618e-06,
"loss": 0.5545,
"step": 318
},
{
"epoch": 0.12184873949579832,
"grad_norm": 3.7108467677188957,
"learning_rate": 1.956230362917929e-06,
"loss": 0.5478,
"step": 319
},
{
"epoch": 0.12223071046600459,
"grad_norm": 2.7764343487978542,
"learning_rate": 1.955867568437053e-06,
"loss": 0.5677,
"step": 320
},
{
"epoch": 0.12261268143621085,
"grad_norm": 9.477033271988223,
"learning_rate": 1.955503310525971e-06,
"loss": 0.5512,
"step": 321
},
{
"epoch": 0.12299465240641712,
"grad_norm": 4.3968460001669705,
"learning_rate": 1.9551375897423604e-06,
"loss": 0.5451,
"step": 322
},
{
"epoch": 0.12337662337662338,
"grad_norm": 15.561330195325665,
"learning_rate": 1.9547704066461382e-06,
"loss": 0.5541,
"step": 323
},
{
"epoch": 0.12375859434682965,
"grad_norm": 4.550270318340944,
"learning_rate": 1.9544017617994617e-06,
"loss": 0.5589,
"step": 324
},
{
"epoch": 0.1241405653170359,
"grad_norm": 2.8993973240496076,
"learning_rate": 1.9540316557667236e-06,
"loss": 0.5785,
"step": 325
},
{
"epoch": 0.12452253628724216,
"grad_norm": 4.873813049680309,
"learning_rate": 1.9536600891145557e-06,
"loss": 0.621,
"step": 326
},
{
"epoch": 0.12490450725744844,
"grad_norm": 4.381877155485381,
"learning_rate": 1.9532870624118256e-06,
"loss": 0.5463,
"step": 327
},
{
"epoch": 0.1252864782276547,
"grad_norm": 3.1981786582566847,
"learning_rate": 1.9529125762296356e-06,
"loss": 0.5076,
"step": 328
},
{
"epoch": 0.12566844919786097,
"grad_norm": 4.331387448900339,
"learning_rate": 1.952536631141323e-06,
"loss": 0.6154,
"step": 329
},
{
"epoch": 0.12605042016806722,
"grad_norm": 3.473212725915271,
"learning_rate": 1.9521592277224587e-06,
"loss": 0.5971,
"step": 330
},
{
"epoch": 0.12643239113827348,
"grad_norm": 8.63087057459746,
"learning_rate": 1.9517803665508457e-06,
"loss": 0.5196,
"step": 331
},
{
"epoch": 0.12681436210847977,
"grad_norm": 4.474812207067407,
"learning_rate": 1.9514000482065196e-06,
"loss": 0.5319,
"step": 332
},
{
"epoch": 0.12719633307868602,
"grad_norm": 3.494968501203762,
"learning_rate": 1.951018273271747e-06,
"loss": 0.5551,
"step": 333
},
{
"epoch": 0.12757830404889228,
"grad_norm": 2.739301748152415,
"learning_rate": 1.950635042331023e-06,
"loss": 0.4947,
"step": 334
},
{
"epoch": 0.12796027501909854,
"grad_norm": 2.156954452463087,
"learning_rate": 1.950250355971074e-06,
"loss": 0.5026,
"step": 335
},
{
"epoch": 0.12834224598930483,
"grad_norm": 2.921994242594192,
"learning_rate": 1.9498642147808527e-06,
"loss": 0.5793,
"step": 336
},
{
"epoch": 0.12872421695951108,
"grad_norm": 3.7642621680466335,
"learning_rate": 1.949476619351541e-06,
"loss": 0.5749,
"step": 337
},
{
"epoch": 0.12910618792971734,
"grad_norm": 13.229444503504032,
"learning_rate": 1.949087570276545e-06,
"loss": 0.5863,
"step": 338
},
{
"epoch": 0.1294881588999236,
"grad_norm": 6.610583772086692,
"learning_rate": 1.948697068151499e-06,
"loss": 0.5771,
"step": 339
},
{
"epoch": 0.12987012987012986,
"grad_norm": 2.662384200035279,
"learning_rate": 1.94830511357426e-06,
"loss": 0.462,
"step": 340
},
{
"epoch": 0.13025210084033614,
"grad_norm": 9.227368925480821,
"learning_rate": 1.9479117071449085e-06,
"loss": 0.5419,
"step": 341
},
{
"epoch": 0.1306340718105424,
"grad_norm": 6.245643052478665,
"learning_rate": 1.9475168494657496e-06,
"loss": 0.5623,
"step": 342
},
{
"epoch": 0.13101604278074866,
"grad_norm": 3.1169872496295317,
"learning_rate": 1.9471205411413082e-06,
"loss": 0.5091,
"step": 343
},
{
"epoch": 0.13139801375095492,
"grad_norm": 3.0664080058582632,
"learning_rate": 1.9467227827783316e-06,
"loss": 0.4964,
"step": 344
},
{
"epoch": 0.1317799847211612,
"grad_norm": 6.616028945372317,
"learning_rate": 1.9463235749857863e-06,
"loss": 0.5546,
"step": 345
},
{
"epoch": 0.13216195569136746,
"grad_norm": 10.19707015916137,
"learning_rate": 1.945922918374859e-06,
"loss": 0.4688,
"step": 346
},
{
"epoch": 0.13254392666157372,
"grad_norm": 5.984892791306968,
"learning_rate": 1.9455208135589527e-06,
"loss": 0.5519,
"step": 347
},
{
"epoch": 0.13292589763177998,
"grad_norm": 2.835298431212049,
"learning_rate": 1.9451172611536887e-06,
"loss": 0.5318,
"step": 348
},
{
"epoch": 0.13330786860198626,
"grad_norm": 4.4156176487630265,
"learning_rate": 1.944712261776905e-06,
"loss": 0.5398,
"step": 349
},
{
"epoch": 0.13368983957219252,
"grad_norm": 3.4902594137991043,
"learning_rate": 1.9443058160486537e-06,
"loss": 0.5508,
"step": 350
},
{
"epoch": 0.13407181054239878,
"grad_norm": 2.8430946741850014,
"learning_rate": 1.943897924591203e-06,
"loss": 0.5059,
"step": 351
},
{
"epoch": 0.13445378151260504,
"grad_norm": 27.0985585658399,
"learning_rate": 1.943488588029032e-06,
"loss": 0.5883,
"step": 352
},
{
"epoch": 0.1348357524828113,
"grad_norm": 3.540536403271325,
"learning_rate": 1.9430778069888346e-06,
"loss": 0.544,
"step": 353
},
{
"epoch": 0.13521772345301758,
"grad_norm": 3.9935274512439083,
"learning_rate": 1.942665582099515e-06,
"loss": 0.5443,
"step": 354
},
{
"epoch": 0.13559969442322384,
"grad_norm": 8.228728108484132,
"learning_rate": 1.942251913992188e-06,
"loss": 0.5741,
"step": 355
},
{
"epoch": 0.1359816653934301,
"grad_norm": 2.3745527365165198,
"learning_rate": 1.9418368033001787e-06,
"loss": 0.5113,
"step": 356
},
{
"epoch": 0.13636363636363635,
"grad_norm": 5.445201095086467,
"learning_rate": 1.9414202506590197e-06,
"loss": 0.5209,
"step": 357
},
{
"epoch": 0.13674560733384264,
"grad_norm": 3.103070791430745,
"learning_rate": 1.941002256706452e-06,
"loss": 0.5854,
"step": 358
},
{
"epoch": 0.1371275783040489,
"grad_norm": 5.735410039753363,
"learning_rate": 1.9405828220824233e-06,
"loss": 0.5015,
"step": 359
},
{
"epoch": 0.13750954927425516,
"grad_norm": 44.47727501508621,
"learning_rate": 1.9401619474290863e-06,
"loss": 0.5022,
"step": 360
},
{
"epoch": 0.1378915202444614,
"grad_norm": 11.54466660420791,
"learning_rate": 1.939739633390799e-06,
"loss": 0.5026,
"step": 361
},
{
"epoch": 0.1382734912146677,
"grad_norm": 2.727692184162092,
"learning_rate": 1.939315880614122e-06,
"loss": 0.512,
"step": 362
},
{
"epoch": 0.13865546218487396,
"grad_norm": 5.387638675900171,
"learning_rate": 1.9388906897478206e-06,
"loss": 0.4948,
"step": 363
},
{
"epoch": 0.13903743315508021,
"grad_norm": 5.4738762897403035,
"learning_rate": 1.9384640614428603e-06,
"loss": 0.568,
"step": 364
},
{
"epoch": 0.13941940412528647,
"grad_norm": 3.137853676264805,
"learning_rate": 1.9380359963524073e-06,
"loss": 0.455,
"step": 365
},
{
"epoch": 0.13980137509549273,
"grad_norm": 2.85450795255527,
"learning_rate": 1.9376064951318286e-06,
"loss": 0.574,
"step": 366
},
{
"epoch": 0.14018334606569902,
"grad_norm": 8.305956887878198,
"learning_rate": 1.9371755584386884e-06,
"loss": 0.5438,
"step": 367
},
{
"epoch": 0.14056531703590527,
"grad_norm": 4.809269321089941,
"learning_rate": 1.93674318693275e-06,
"loss": 0.4855,
"step": 368
},
{
"epoch": 0.14094728800611153,
"grad_norm": 4.453335651466671,
"learning_rate": 1.9363093812759723e-06,
"loss": 0.5861,
"step": 369
},
{
"epoch": 0.1413292589763178,
"grad_norm": 3.2978567378812635,
"learning_rate": 1.935874142132511e-06,
"loss": 0.5345,
"step": 370
},
{
"epoch": 0.14171122994652408,
"grad_norm": 12.008265469941955,
"learning_rate": 1.9354374701687153e-06,
"loss": 0.5366,
"step": 371
},
{
"epoch": 0.14209320091673033,
"grad_norm": 3.2919020301413893,
"learning_rate": 1.9349993660531286e-06,
"loss": 0.5114,
"step": 372
},
{
"epoch": 0.1424751718869366,
"grad_norm": 7.145023652793651,
"learning_rate": 1.9345598304564875e-06,
"loss": 0.5006,
"step": 373
},
{
"epoch": 0.14285714285714285,
"grad_norm": 2.6546012908244814,
"learning_rate": 1.934118864051719e-06,
"loss": 0.5025,
"step": 374
},
{
"epoch": 0.14323911382734913,
"grad_norm": 4.844608194906241,
"learning_rate": 1.9336764675139416e-06,
"loss": 0.4897,
"step": 375
},
{
"epoch": 0.1436210847975554,
"grad_norm": 12.286804180963175,
"learning_rate": 1.933232641520463e-06,
"loss": 0.5046,
"step": 376
},
{
"epoch": 0.14400305576776165,
"grad_norm": 5.710044727414116,
"learning_rate": 1.932787386750779e-06,
"loss": 0.6171,
"step": 377
},
{
"epoch": 0.1443850267379679,
"grad_norm": 7.339897212556863,
"learning_rate": 1.932340703886573e-06,
"loss": 0.5777,
"step": 378
},
{
"epoch": 0.14476699770817417,
"grad_norm": 4.745725943014659,
"learning_rate": 1.931892593611716e-06,
"loss": 0.508,
"step": 379
},
{
"epoch": 0.14514896867838045,
"grad_norm": 3.736133374274852,
"learning_rate": 1.931443056612263e-06,
"loss": 0.4604,
"step": 380
},
{
"epoch": 0.1455309396485867,
"grad_norm": 10.799583523666772,
"learning_rate": 1.9309920935764536e-06,
"loss": 0.5162,
"step": 381
},
{
"epoch": 0.14591291061879297,
"grad_norm": 3.286063954330593,
"learning_rate": 1.9305397051947108e-06,
"loss": 0.5461,
"step": 382
},
{
"epoch": 0.14629488158899923,
"grad_norm": 4.010526947754956,
"learning_rate": 1.9300858921596395e-06,
"loss": 0.4637,
"step": 383
},
{
"epoch": 0.1466768525592055,
"grad_norm": 9.445646971706344,
"learning_rate": 1.9296306551660266e-06,
"loss": 0.5238,
"step": 384
},
{
"epoch": 0.14705882352941177,
"grad_norm": 9.813123565905315,
"learning_rate": 1.9291739949108382e-06,
"loss": 0.5323,
"step": 385
},
{
"epoch": 0.14744079449961803,
"grad_norm": 3.120586727864197,
"learning_rate": 1.9287159120932198e-06,
"loss": 0.5204,
"step": 386
},
{
"epoch": 0.14782276546982429,
"grad_norm": 7.671023206343543,
"learning_rate": 1.928256407414494e-06,
"loss": 0.5225,
"step": 387
},
{
"epoch": 0.14820473644003057,
"grad_norm": 3.7529804673105693,
"learning_rate": 1.9277954815781623e-06,
"loss": 0.5061,
"step": 388
},
{
"epoch": 0.14858670741023683,
"grad_norm": 5.585820882674302,
"learning_rate": 1.9273331352899e-06,
"loss": 0.5897,
"step": 389
},
{
"epoch": 0.1489686783804431,
"grad_norm": 3.2965758857033327,
"learning_rate": 1.9268693692575576e-06,
"loss": 0.5668,
"step": 390
},
{
"epoch": 0.14935064935064934,
"grad_norm": 4.062527614549751,
"learning_rate": 1.9264041841911595e-06,
"loss": 0.4788,
"step": 391
},
{
"epoch": 0.1497326203208556,
"grad_norm": 4.527958514536294,
"learning_rate": 1.925937580802903e-06,
"loss": 0.5355,
"step": 392
},
{
"epoch": 0.1501145912910619,
"grad_norm": 4.559973018919794,
"learning_rate": 1.9254695598071557e-06,
"loss": 0.5363,
"step": 393
},
{
"epoch": 0.15049656226126815,
"grad_norm": 3.0412227798466955,
"learning_rate": 1.925000121920457e-06,
"loss": 0.5067,
"step": 394
},
{
"epoch": 0.1508785332314744,
"grad_norm": 4.231912383154748,
"learning_rate": 1.924529267861514e-06,
"loss": 0.5168,
"step": 395
},
{
"epoch": 0.15126050420168066,
"grad_norm": 5.197316808761914,
"learning_rate": 1.9240569983512036e-06,
"loss": 0.5945,
"step": 396
},
{
"epoch": 0.15164247517188695,
"grad_norm": 4.14321526210149,
"learning_rate": 1.9235833141125685e-06,
"loss": 0.4682,
"step": 397
},
{
"epoch": 0.1520244461420932,
"grad_norm": 3.7461263120755057,
"learning_rate": 1.9231082158708177e-06,
"loss": 0.6016,
"step": 398
},
{
"epoch": 0.15240641711229946,
"grad_norm": 2.480897417284355,
"learning_rate": 1.9226317043533252e-06,
"loss": 0.5446,
"step": 399
},
{
"epoch": 0.15278838808250572,
"grad_norm": 8.07438667998488,
"learning_rate": 1.922153780289629e-06,
"loss": 0.462,
"step": 400
},
{
"epoch": 0.153170359052712,
"grad_norm": 3.3341835643001883,
"learning_rate": 1.9216744444114283e-06,
"loss": 0.5137,
"step": 401
},
{
"epoch": 0.15355233002291827,
"grad_norm": 12.924968914010819,
"learning_rate": 1.921193697452586e-06,
"loss": 0.5115,
"step": 402
},
{
"epoch": 0.15393430099312452,
"grad_norm": 3.88799749897571,
"learning_rate": 1.9207115401491236e-06,
"loss": 0.5732,
"step": 403
},
{
"epoch": 0.15431627196333078,
"grad_norm": 5.63628842857517,
"learning_rate": 1.920227973239222e-06,
"loss": 0.5443,
"step": 404
},
{
"epoch": 0.15469824293353704,
"grad_norm": 11.214095099098154,
"learning_rate": 1.919742997463221e-06,
"loss": 0.5328,
"step": 405
},
{
"epoch": 0.15508021390374332,
"grad_norm": 5.566051428810673,
"learning_rate": 1.919256613563617e-06,
"loss": 0.4916,
"step": 406
},
{
"epoch": 0.15546218487394958,
"grad_norm": 3.3238053815734765,
"learning_rate": 1.9187688222850625e-06,
"loss": 0.5405,
"step": 407
},
{
"epoch": 0.15584415584415584,
"grad_norm": 3.1790419527835585,
"learning_rate": 1.9182796243743637e-06,
"loss": 0.5485,
"step": 408
},
{
"epoch": 0.1562261268143621,
"grad_norm": 3.220273652822294,
"learning_rate": 1.917789020580482e-06,
"loss": 0.5343,
"step": 409
},
{
"epoch": 0.15660809778456838,
"grad_norm": 4.181974969495614,
"learning_rate": 1.917297011654529e-06,
"loss": 0.5917,
"step": 410
},
{
"epoch": 0.15699006875477464,
"grad_norm": 3.804677452137214,
"learning_rate": 1.9168035983497697e-06,
"loss": 0.4721,
"step": 411
},
{
"epoch": 0.1573720397249809,
"grad_norm": 4.355811094333278,
"learning_rate": 1.9163087814216184e-06,
"loss": 0.5569,
"step": 412
},
{
"epoch": 0.15775401069518716,
"grad_norm": 4.541557617112437,
"learning_rate": 1.9158125616276375e-06,
"loss": 0.5277,
"step": 413
},
{
"epoch": 0.15813598166539344,
"grad_norm": 5.070373471399588,
"learning_rate": 1.9153149397275384e-06,
"loss": 0.5818,
"step": 414
},
{
"epoch": 0.1585179526355997,
"grad_norm": 3.3582128161192233,
"learning_rate": 1.9148159164831785e-06,
"loss": 0.5603,
"step": 415
},
{
"epoch": 0.15889992360580596,
"grad_norm": 6.345325700372508,
"learning_rate": 1.9143154926585612e-06,
"loss": 0.5772,
"step": 416
},
{
"epoch": 0.15928189457601222,
"grad_norm": 18.705450602040244,
"learning_rate": 1.9138136690198334e-06,
"loss": 0.5301,
"step": 417
},
{
"epoch": 0.15966386554621848,
"grad_norm": 32.11250304588509,
"learning_rate": 1.9133104463352852e-06,
"loss": 0.5438,
"step": 418
},
{
"epoch": 0.16004583651642476,
"grad_norm": 3.606136169430874,
"learning_rate": 1.9128058253753495e-06,
"loss": 0.467,
"step": 419
},
{
"epoch": 0.16042780748663102,
"grad_norm": 3.241408005700225,
"learning_rate": 1.9122998069125995e-06,
"loss": 0.5028,
"step": 420
},
{
"epoch": 0.16080977845683728,
"grad_norm": 3.518264294880338,
"learning_rate": 1.911792391721747e-06,
"loss": 0.5709,
"step": 421
},
{
"epoch": 0.16119174942704353,
"grad_norm": 3.966087908470135,
"learning_rate": 1.911283580579644e-06,
"loss": 0.5232,
"step": 422
},
{
"epoch": 0.16157372039724982,
"grad_norm": 8.417923174743748,
"learning_rate": 1.910773374265278e-06,
"loss": 0.5512,
"step": 423
},
{
"epoch": 0.16195569136745608,
"grad_norm": 5.490712115215511,
"learning_rate": 1.910261773559774e-06,
"loss": 0.5249,
"step": 424
},
{
"epoch": 0.16233766233766234,
"grad_norm": 5.775193829980829,
"learning_rate": 1.90974877924639e-06,
"loss": 0.5097,
"step": 425
},
{
"epoch": 0.1627196333078686,
"grad_norm": 2.6822332745102724,
"learning_rate": 1.9092343921105193e-06,
"loss": 0.5452,
"step": 426
},
{
"epoch": 0.16310160427807488,
"grad_norm": 2.5693849411860046,
"learning_rate": 1.908718612939687e-06,
"loss": 0.5138,
"step": 427
},
{
"epoch": 0.16348357524828114,
"grad_norm": 3.9807799406529933,
"learning_rate": 1.90820144252355e-06,
"loss": 0.5094,
"step": 428
},
{
"epoch": 0.1638655462184874,
"grad_norm": 6.894858559350513,
"learning_rate": 1.907682881653893e-06,
"loss": 0.5386,
"step": 429
},
{
"epoch": 0.16424751718869365,
"grad_norm": 9.704522067639532,
"learning_rate": 1.9071629311246325e-06,
"loss": 0.5815,
"step": 430
},
{
"epoch": 0.1646294881588999,
"grad_norm": 4.038760905743012,
"learning_rate": 1.90664159173181e-06,
"loss": 0.584,
"step": 431
},
{
"epoch": 0.1650114591291062,
"grad_norm": 3.933972091240762,
"learning_rate": 1.9061188642735955e-06,
"loss": 0.5936,
"step": 432
},
{
"epoch": 0.16539343009931246,
"grad_norm": 2.809066574314389,
"learning_rate": 1.905594749550282e-06,
"loss": 0.5166,
"step": 433
},
{
"epoch": 0.1657754010695187,
"grad_norm": 3.232605764012116,
"learning_rate": 1.9050692483642884e-06,
"loss": 0.5989,
"step": 434
},
{
"epoch": 0.16615737203972497,
"grad_norm": 2.779376649403726,
"learning_rate": 1.9045423615201549e-06,
"loss": 0.5631,
"step": 435
},
{
"epoch": 0.16653934300993126,
"grad_norm": 6.427025295782284,
"learning_rate": 1.9040140898245437e-06,
"loss": 0.5532,
"step": 436
},
{
"epoch": 0.16692131398013751,
"grad_norm": 3.2510627416912663,
"learning_rate": 1.9034844340862368e-06,
"loss": 0.5343,
"step": 437
},
{
"epoch": 0.16730328495034377,
"grad_norm": 3.1443297662844194,
"learning_rate": 1.902953395116136e-06,
"loss": 0.5134,
"step": 438
},
{
"epoch": 0.16768525592055003,
"grad_norm": 4.539213723683322,
"learning_rate": 1.9024209737272597e-06,
"loss": 0.5408,
"step": 439
},
{
"epoch": 0.16806722689075632,
"grad_norm": 11.775446749973153,
"learning_rate": 1.9018871707347435e-06,
"loss": 0.4969,
"step": 440
},
{
"epoch": 0.16844919786096257,
"grad_norm": 4.384039093799763,
"learning_rate": 1.9013519869558386e-06,
"loss": 0.5977,
"step": 441
},
{
"epoch": 0.16883116883116883,
"grad_norm": 7.162891708265767,
"learning_rate": 1.900815423209909e-06,
"loss": 0.5363,
"step": 442
},
{
"epoch": 0.1692131398013751,
"grad_norm": 3.56315041439479,
"learning_rate": 1.9002774803184322e-06,
"loss": 0.4719,
"step": 443
},
{
"epoch": 0.16959511077158135,
"grad_norm": 3.969115647981566,
"learning_rate": 1.8997381591049972e-06,
"loss": 0.5716,
"step": 444
},
{
"epoch": 0.16997708174178763,
"grad_norm": 9.170649989885177,
"learning_rate": 1.8991974603953034e-06,
"loss": 0.5123,
"step": 445
},
{
"epoch": 0.1703590527119939,
"grad_norm": 3.0650147223282116,
"learning_rate": 1.8986553850171583e-06,
"loss": 0.6062,
"step": 446
},
{
"epoch": 0.17074102368220015,
"grad_norm": 7.195397239965812,
"learning_rate": 1.8981119338004775e-06,
"loss": 0.5529,
"step": 447
},
{
"epoch": 0.1711229946524064,
"grad_norm": 6.176705315024795,
"learning_rate": 1.897567107577284e-06,
"loss": 0.4855,
"step": 448
},
{
"epoch": 0.1715049656226127,
"grad_norm": 7.689916301252105,
"learning_rate": 1.8970209071817035e-06,
"loss": 0.514,
"step": 449
},
{
"epoch": 0.17188693659281895,
"grad_norm": 2.510661764320135,
"learning_rate": 1.8964733334499684e-06,
"loss": 0.5258,
"step": 450
},
{
"epoch": 0.1722689075630252,
"grad_norm": 4.630483174397124,
"learning_rate": 1.8959243872204115e-06,
"loss": 0.5394,
"step": 451
},
{
"epoch": 0.17265087853323147,
"grad_norm": 5.331327229967195,
"learning_rate": 1.8953740693334686e-06,
"loss": 0.5786,
"step": 452
},
{
"epoch": 0.17303284950343775,
"grad_norm": 2.6657247574130567,
"learning_rate": 1.8948223806316737e-06,
"loss": 0.4973,
"step": 453
},
{
"epoch": 0.173414820473644,
"grad_norm": 7.0345595478062135,
"learning_rate": 1.894269321959661e-06,
"loss": 0.5408,
"step": 454
},
{
"epoch": 0.17379679144385027,
"grad_norm": 4.786102808716031,
"learning_rate": 1.8937148941641613e-06,
"loss": 0.4833,
"step": 455
},
{
"epoch": 0.17417876241405653,
"grad_norm": 3.6696330883093613,
"learning_rate": 1.8931590980940022e-06,
"loss": 0.4945,
"step": 456
},
{
"epoch": 0.17456073338426278,
"grad_norm": 8.668385378720208,
"learning_rate": 1.8926019346001052e-06,
"loss": 0.5804,
"step": 457
},
{
"epoch": 0.17494270435446907,
"grad_norm": 21.438661768324913,
"learning_rate": 1.8920434045354861e-06,
"loss": 0.6104,
"step": 458
},
{
"epoch": 0.17532467532467533,
"grad_norm": 4.64360218036425,
"learning_rate": 1.8914835087552528e-06,
"loss": 0.5346,
"step": 459
},
{
"epoch": 0.17570664629488159,
"grad_norm": 13.424051118396385,
"learning_rate": 1.8909222481166036e-06,
"loss": 0.4748,
"step": 460
},
{
"epoch": 0.17608861726508784,
"grad_norm": 6.281611519049626,
"learning_rate": 1.8903596234788268e-06,
"loss": 0.5342,
"step": 461
},
{
"epoch": 0.17647058823529413,
"grad_norm": 6.22921812160575,
"learning_rate": 1.8897956357032993e-06,
"loss": 0.628,
"step": 462
},
{
"epoch": 0.1768525592055004,
"grad_norm": 5.1035629491397625,
"learning_rate": 1.8892302856534843e-06,
"loss": 0.5212,
"step": 463
},
{
"epoch": 0.17723453017570664,
"grad_norm": 3.0915992711189064,
"learning_rate": 1.888663574194931e-06,
"loss": 0.534,
"step": 464
},
{
"epoch": 0.1776165011459129,
"grad_norm": 5.332723908181321,
"learning_rate": 1.8880955021952726e-06,
"loss": 0.5906,
"step": 465
},
{
"epoch": 0.1779984721161192,
"grad_norm": 3.0869913876672905,
"learning_rate": 1.8875260705242262e-06,
"loss": 0.6028,
"step": 466
},
{
"epoch": 0.17838044308632545,
"grad_norm": 2.6777228858601188,
"learning_rate": 1.8869552800535891e-06,
"loss": 0.5192,
"step": 467
},
{
"epoch": 0.1787624140565317,
"grad_norm": 7.132223781890689,
"learning_rate": 1.8863831316572401e-06,
"loss": 0.5569,
"step": 468
},
{
"epoch": 0.17914438502673796,
"grad_norm": 3.0366710097530087,
"learning_rate": 1.8858096262111365e-06,
"loss": 0.5749,
"step": 469
},
{
"epoch": 0.17952635599694422,
"grad_norm": 4.359613446921773,
"learning_rate": 1.8852347645933134e-06,
"loss": 0.5339,
"step": 470
},
{
"epoch": 0.1799083269671505,
"grad_norm": 3.0313288425668476,
"learning_rate": 1.8846585476838817e-06,
"loss": 0.51,
"step": 471
},
{
"epoch": 0.18029029793735676,
"grad_norm": 4.623850202032284,
"learning_rate": 1.8840809763650283e-06,
"loss": 0.5301,
"step": 472
},
{
"epoch": 0.18067226890756302,
"grad_norm": 4.412433502531477,
"learning_rate": 1.8835020515210125e-06,
"loss": 0.5456,
"step": 473
},
{
"epoch": 0.18105423987776928,
"grad_norm": 7.180970352151755,
"learning_rate": 1.8829217740381667e-06,
"loss": 0.5878,
"step": 474
},
{
"epoch": 0.18143621084797557,
"grad_norm": 3.2580271139855563,
"learning_rate": 1.8823401448048938e-06,
"loss": 0.4877,
"step": 475
},
{
"epoch": 0.18181818181818182,
"grad_norm": 6.005766550636256,
"learning_rate": 1.8817571647116662e-06,
"loss": 0.5422,
"step": 476
},
{
"epoch": 0.18220015278838808,
"grad_norm": 63.296707768024426,
"learning_rate": 1.8811728346510249e-06,
"loss": 0.5151,
"step": 477
},
{
"epoch": 0.18258212375859434,
"grad_norm": 4.7969886037725065,
"learning_rate": 1.8805871555175769e-06,
"loss": 0.5801,
"step": 478
},
{
"epoch": 0.18296409472880062,
"grad_norm": 3.870191837934829,
"learning_rate": 1.8800001282079953e-06,
"loss": 0.6167,
"step": 479
},
{
"epoch": 0.18334606569900688,
"grad_norm": 5.9456808869839834,
"learning_rate": 1.8794117536210172e-06,
"loss": 0.5733,
"step": 480
},
{
"epoch": 0.18372803666921314,
"grad_norm": 2.677736985974354,
"learning_rate": 1.878822032657442e-06,
"loss": 0.4856,
"step": 481
},
{
"epoch": 0.1841100076394194,
"grad_norm": 2.003203767851191,
"learning_rate": 1.878230966220131e-06,
"loss": 0.4177,
"step": 482
},
{
"epoch": 0.18449197860962566,
"grad_norm": 4.958590655184132,
"learning_rate": 1.8776385552140044e-06,
"loss": 0.4863,
"step": 483
},
{
"epoch": 0.18487394957983194,
"grad_norm": 3.148442186053413,
"learning_rate": 1.877044800546042e-06,
"loss": 0.4769,
"step": 484
},
{
"epoch": 0.1852559205500382,
"grad_norm": 3.8142357273417606,
"learning_rate": 1.8764497031252801e-06,
"loss": 0.4691,
"step": 485
},
{
"epoch": 0.18563789152024446,
"grad_norm": 8.575634805488457,
"learning_rate": 1.8758532638628114e-06,
"loss": 0.5217,
"step": 486
},
{
"epoch": 0.18601986249045072,
"grad_norm": 4.300124100036272,
"learning_rate": 1.875255483671782e-06,
"loss": 0.5795,
"step": 487
},
{
"epoch": 0.186401833460657,
"grad_norm": 2.579583524654158,
"learning_rate": 1.8746563634673915e-06,
"loss": 0.4782,
"step": 488
},
{
"epoch": 0.18678380443086326,
"grad_norm": 3.701604778764327,
"learning_rate": 1.8740559041668912e-06,
"loss": 0.5805,
"step": 489
},
{
"epoch": 0.18716577540106952,
"grad_norm": 2.6956484266740404,
"learning_rate": 1.8734541066895825e-06,
"loss": 0.5391,
"step": 490
},
{
"epoch": 0.18754774637127578,
"grad_norm": 2.436956916352227,
"learning_rate": 1.8728509719568154e-06,
"loss": 0.5029,
"step": 491
},
{
"epoch": 0.18792971734148206,
"grad_norm": 3.796654944502864,
"learning_rate": 1.872246500891987e-06,
"loss": 0.4712,
"step": 492
},
{
"epoch": 0.18831168831168832,
"grad_norm": 2.9952676449872095,
"learning_rate": 1.8716406944205407e-06,
"loss": 0.5643,
"step": 493
},
{
"epoch": 0.18869365928189458,
"grad_norm": 5.071645117174178,
"learning_rate": 1.8710335534699644e-06,
"loss": 0.5572,
"step": 494
},
{
"epoch": 0.18907563025210083,
"grad_norm": 3.2218956223207593,
"learning_rate": 1.8704250789697888e-06,
"loss": 0.5891,
"step": 495
},
{
"epoch": 0.1894576012223071,
"grad_norm": 2.567742013094073,
"learning_rate": 1.8698152718515865e-06,
"loss": 0.5372,
"step": 496
},
{
"epoch": 0.18983957219251338,
"grad_norm": 2.894313808043004,
"learning_rate": 1.8692041330489702e-06,
"loss": 0.5649,
"step": 497
},
{
"epoch": 0.19022154316271964,
"grad_norm": 3.9977337336849113,
"learning_rate": 1.8685916634975915e-06,
"loss": 0.6096,
"step": 498
},
{
"epoch": 0.1906035141329259,
"grad_norm": 4.36211085391409,
"learning_rate": 1.8679778641351396e-06,
"loss": 0.5662,
"step": 499
},
{
"epoch": 0.19098548510313215,
"grad_norm": 2.824022282196409,
"learning_rate": 1.867362735901339e-06,
"loss": 0.5705,
"step": 500
},
{
"epoch": 0.19136745607333844,
"grad_norm": 3.853603764532641,
"learning_rate": 1.8667462797379488e-06,
"loss": 0.5318,
"step": 501
},
{
"epoch": 0.1917494270435447,
"grad_norm": 5.2244234015604984,
"learning_rate": 1.8661284965887623e-06,
"loss": 0.451,
"step": 502
},
{
"epoch": 0.19213139801375095,
"grad_norm": 3.6467105779523474,
"learning_rate": 1.8655093873996026e-06,
"loss": 0.4862,
"step": 503
},
{
"epoch": 0.1925133689839572,
"grad_norm": 4.761577705016175,
"learning_rate": 1.8648889531183248e-06,
"loss": 0.4919,
"step": 504
},
{
"epoch": 0.1928953399541635,
"grad_norm": 2.5269159027313837,
"learning_rate": 1.8642671946948108e-06,
"loss": 0.5126,
"step": 505
},
{
"epoch": 0.19327731092436976,
"grad_norm": 3.6966310858711693,
"learning_rate": 1.8636441130809718e-06,
"loss": 0.5381,
"step": 506
},
{
"epoch": 0.193659281894576,
"grad_norm": 2.9896407474985973,
"learning_rate": 1.863019709230743e-06,
"loss": 0.5316,
"step": 507
},
{
"epoch": 0.19404125286478227,
"grad_norm": 2.818002574138244,
"learning_rate": 1.8623939841000853e-06,
"loss": 0.4808,
"step": 508
},
{
"epoch": 0.19442322383498853,
"grad_norm": 3.2154224870415122,
"learning_rate": 1.8617669386469812e-06,
"loss": 0.5802,
"step": 509
},
{
"epoch": 0.19480519480519481,
"grad_norm": 3.6329119766814633,
"learning_rate": 1.861138573831436e-06,
"loss": 0.5505,
"step": 510
},
{
"epoch": 0.19518716577540107,
"grad_norm": 2.809782100245695,
"learning_rate": 1.8605088906154735e-06,
"loss": 0.463,
"step": 511
},
{
"epoch": 0.19556913674560733,
"grad_norm": 5.423524792453049,
"learning_rate": 1.8598778899631376e-06,
"loss": 0.5187,
"step": 512
},
{
"epoch": 0.1959511077158136,
"grad_norm": 3.1713155248240663,
"learning_rate": 1.8592455728404873e-06,
"loss": 0.4596,
"step": 513
},
{
"epoch": 0.19633307868601987,
"grad_norm": 2.996096261985183,
"learning_rate": 1.8586119402155993e-06,
"loss": 0.5526,
"step": 514
},
{
"epoch": 0.19671504965622613,
"grad_norm": 5.79015390194803,
"learning_rate": 1.8579769930585619e-06,
"loss": 0.576,
"step": 515
},
{
"epoch": 0.1970970206264324,
"grad_norm": 3.9655195684002087,
"learning_rate": 1.8573407323414779e-06,
"loss": 0.6321,
"step": 516
},
{
"epoch": 0.19747899159663865,
"grad_norm": 8.441146227044339,
"learning_rate": 1.85670315903846e-06,
"loss": 0.586,
"step": 517
},
{
"epoch": 0.19786096256684493,
"grad_norm": 17.00964825679305,
"learning_rate": 1.8560642741256314e-06,
"loss": 0.4568,
"step": 518
},
{
"epoch": 0.1982429335370512,
"grad_norm": 3.084747706798306,
"learning_rate": 1.8554240785811226e-06,
"loss": 0.5206,
"step": 519
},
{
"epoch": 0.19862490450725745,
"grad_norm": 3.4741332461100276,
"learning_rate": 1.8547825733850711e-06,
"loss": 0.577,
"step": 520
},
{
"epoch": 0.1990068754774637,
"grad_norm": 3.4594383129448287,
"learning_rate": 1.854139759519619e-06,
"loss": 0.5612,
"step": 521
},
{
"epoch": 0.19938884644766997,
"grad_norm": 2.9282239639784917,
"learning_rate": 1.8534956379689124e-06,
"loss": 0.5305,
"step": 522
},
{
"epoch": 0.19977081741787625,
"grad_norm": 6.784944014323334,
"learning_rate": 1.8528502097190994e-06,
"loss": 0.5725,
"step": 523
},
{
"epoch": 0.2001527883880825,
"grad_norm": 2.2673706400022087,
"learning_rate": 1.8522034757583287e-06,
"loss": 0.496,
"step": 524
},
{
"epoch": 0.20053475935828877,
"grad_norm": 3.050042571012531,
"learning_rate": 1.851555437076748e-06,
"loss": 0.4564,
"step": 525
},
{
"epoch": 0.20091673032849502,
"grad_norm": 4.446629028729372,
"learning_rate": 1.8509060946665019e-06,
"loss": 0.5623,
"step": 526
},
{
"epoch": 0.2012987012987013,
"grad_norm": 3.1836775296072672,
"learning_rate": 1.850255449521732e-06,
"loss": 0.5551,
"step": 527
},
{
"epoch": 0.20168067226890757,
"grad_norm": 3.322504159555205,
"learning_rate": 1.8496035026385742e-06,
"loss": 0.5071,
"step": 528
},
{
"epoch": 0.20206264323911383,
"grad_norm": 26.70160743717929,
"learning_rate": 1.8489502550151565e-06,
"loss": 0.5149,
"step": 529
},
{
"epoch": 0.20244461420932008,
"grad_norm": 2.873326692598321,
"learning_rate": 1.8482957076515995e-06,
"loss": 0.4605,
"step": 530
},
{
"epoch": 0.20282658517952637,
"grad_norm": 13.38960869623681,
"learning_rate": 1.8476398615500126e-06,
"loss": 0.5483,
"step": 531
},
{
"epoch": 0.20320855614973263,
"grad_norm": 5.120750671092757,
"learning_rate": 1.8469827177144945e-06,
"loss": 0.6108,
"step": 532
},
{
"epoch": 0.20359052711993889,
"grad_norm": 6.1165279573398985,
"learning_rate": 1.8463242771511302e-06,
"loss": 0.5603,
"step": 533
},
{
"epoch": 0.20397249809014514,
"grad_norm": 5.845196697566236,
"learning_rate": 1.8456645408679901e-06,
"loss": 0.4849,
"step": 534
},
{
"epoch": 0.2043544690603514,
"grad_norm": 6.374199389125064,
"learning_rate": 1.8450035098751284e-06,
"loss": 0.5738,
"step": 535
},
{
"epoch": 0.2047364400305577,
"grad_norm": 2.969692491461521,
"learning_rate": 1.8443411851845815e-06,
"loss": 0.4948,
"step": 536
},
{
"epoch": 0.20511841100076394,
"grad_norm": 4.83667398940346,
"learning_rate": 1.8436775678103662e-06,
"loss": 0.6144,
"step": 537
},
{
"epoch": 0.2055003819709702,
"grad_norm": 3.2292434566996824,
"learning_rate": 1.8430126587684784e-06,
"loss": 0.5715,
"step": 538
},
{
"epoch": 0.20588235294117646,
"grad_norm": 2.105787012901775,
"learning_rate": 1.8423464590768922e-06,
"loss": 0.5088,
"step": 539
},
{
"epoch": 0.20626432391138275,
"grad_norm": 2.6848990053540946,
"learning_rate": 1.8416789697555571e-06,
"loss": 0.5096,
"step": 540
},
{
"epoch": 0.206646294881589,
"grad_norm": 3.0525532670931645,
"learning_rate": 1.841010191826397e-06,
"loss": 0.5034,
"step": 541
},
{
"epoch": 0.20702826585179526,
"grad_norm": 3.006624669051693,
"learning_rate": 1.8403401263133087e-06,
"loss": 0.5712,
"step": 542
},
{
"epoch": 0.20741023682200152,
"grad_norm": 12.853137756580841,
"learning_rate": 1.8396687742421605e-06,
"loss": 0.5399,
"step": 543
},
{
"epoch": 0.2077922077922078,
"grad_norm": 5.52397493327756,
"learning_rate": 1.8389961366407904e-06,
"loss": 0.5681,
"step": 544
},
{
"epoch": 0.20817417876241406,
"grad_norm": 4.301081096217388,
"learning_rate": 1.8383222145390045e-06,
"loss": 0.4553,
"step": 545
},
{
"epoch": 0.20855614973262032,
"grad_norm": 9.972029144703678,
"learning_rate": 1.8376470089685748e-06,
"loss": 0.4972,
"step": 546
},
{
"epoch": 0.20893812070282658,
"grad_norm": 2.956531873075386,
"learning_rate": 1.8369705209632397e-06,
"loss": 0.4763,
"step": 547
},
{
"epoch": 0.20932009167303284,
"grad_norm": 2.352517812933045,
"learning_rate": 1.8362927515586993e-06,
"loss": 0.4823,
"step": 548
},
{
"epoch": 0.20970206264323912,
"grad_norm": 4.984880824898305,
"learning_rate": 1.8356137017926169e-06,
"loss": 0.5279,
"step": 549
},
{
"epoch": 0.21008403361344538,
"grad_norm": 8.689006523073017,
"learning_rate": 1.834933372704616e-06,
"loss": 0.5201,
"step": 550
},
{
"epoch": 0.21046600458365164,
"grad_norm": 2.6869978648037134,
"learning_rate": 1.834251765336277e-06,
"loss": 0.5057,
"step": 551
},
{
"epoch": 0.2108479755538579,
"grad_norm": 6.246534246290759,
"learning_rate": 1.83356888073114e-06,
"loss": 0.6429,
"step": 552
},
{
"epoch": 0.21122994652406418,
"grad_norm": 4.424921302293031,
"learning_rate": 1.8328847199346983e-06,
"loss": 0.513,
"step": 553
},
{
"epoch": 0.21161191749427044,
"grad_norm": 2.506864073735516,
"learning_rate": 1.8321992839944002e-06,
"loss": 0.512,
"step": 554
},
{
"epoch": 0.2119938884644767,
"grad_norm": 4.147884646333317,
"learning_rate": 1.831512573959646e-06,
"loss": 0.5281,
"step": 555
},
{
"epoch": 0.21237585943468296,
"grad_norm": 6.453459918356094,
"learning_rate": 1.8308245908817862e-06,
"loss": 0.6072,
"step": 556
},
{
"epoch": 0.21275783040488924,
"grad_norm": 3.227311642376684,
"learning_rate": 1.830135335814121e-06,
"loss": 0.5904,
"step": 557
},
{
"epoch": 0.2131398013750955,
"grad_norm": 6.966546708900706,
"learning_rate": 1.829444809811898e-06,
"loss": 0.5652,
"step": 558
},
{
"epoch": 0.21352177234530176,
"grad_norm": 4.746858728601705,
"learning_rate": 1.8287530139323098e-06,
"loss": 0.5912,
"step": 559
},
{
"epoch": 0.21390374331550802,
"grad_norm": 3.7259365079328033,
"learning_rate": 1.8280599492344937e-06,
"loss": 0.5546,
"step": 560
},
{
"epoch": 0.21428571428571427,
"grad_norm": 3.347921431679426,
"learning_rate": 1.82736561677953e-06,
"loss": 0.4817,
"step": 561
},
{
"epoch": 0.21466768525592056,
"grad_norm": 4.556644147705862,
"learning_rate": 1.8266700176304388e-06,
"loss": 0.5861,
"step": 562
},
{
"epoch": 0.21504965622612682,
"grad_norm": 3.1934469152204796,
"learning_rate": 1.825973152852181e-06,
"loss": 0.4972,
"step": 563
},
{
"epoch": 0.21543162719633308,
"grad_norm": 6.385370127198636,
"learning_rate": 1.825275023511654e-06,
"loss": 0.5343,
"step": 564
},
{
"epoch": 0.21581359816653933,
"grad_norm": 2.3151167357495517,
"learning_rate": 1.8245756306776911e-06,
"loss": 0.4989,
"step": 565
},
{
"epoch": 0.21619556913674562,
"grad_norm": 10.555445276500569,
"learning_rate": 1.8238749754210611e-06,
"loss": 0.5253,
"step": 566
},
{
"epoch": 0.21657754010695188,
"grad_norm": 7.1724974034380775,
"learning_rate": 1.8231730588144652e-06,
"loss": 0.5069,
"step": 567
},
{
"epoch": 0.21695951107715813,
"grad_norm": 4.575058249729483,
"learning_rate": 1.8224698819325348e-06,
"loss": 0.5316,
"step": 568
},
{
"epoch": 0.2173414820473644,
"grad_norm": 4.002746245635046,
"learning_rate": 1.8217654458518318e-06,
"loss": 0.4499,
"step": 569
},
{
"epoch": 0.21772345301757068,
"grad_norm": 3.948887241448565,
"learning_rate": 1.8210597516508457e-06,
"loss": 0.4636,
"step": 570
},
{
"epoch": 0.21810542398777694,
"grad_norm": 3.3365032713673455,
"learning_rate": 1.820352800409992e-06,
"loss": 0.484,
"step": 571
},
{
"epoch": 0.2184873949579832,
"grad_norm": 4.252239694976291,
"learning_rate": 1.8196445932116106e-06,
"loss": 0.491,
"step": 572
},
{
"epoch": 0.21886936592818945,
"grad_norm": 3.7309930962337203,
"learning_rate": 1.8189351311399647e-06,
"loss": 0.531,
"step": 573
},
{
"epoch": 0.2192513368983957,
"grad_norm": 5.44261846473984,
"learning_rate": 1.8182244152812384e-06,
"loss": 0.4911,
"step": 574
},
{
"epoch": 0.219633307868602,
"grad_norm": 3.287807998242394,
"learning_rate": 1.8175124467235351e-06,
"loss": 0.4891,
"step": 575
},
{
"epoch": 0.22001527883880825,
"grad_norm": 4.594516974830945,
"learning_rate": 1.8167992265568772e-06,
"loss": 0.5013,
"step": 576
},
{
"epoch": 0.2203972498090145,
"grad_norm": 3.3754014701183963,
"learning_rate": 1.816084755873202e-06,
"loss": 0.5385,
"step": 577
},
{
"epoch": 0.22077922077922077,
"grad_norm": 6.383964843549181,
"learning_rate": 1.8153690357663618e-06,
"loss": 0.5765,
"step": 578
},
{
"epoch": 0.22116119174942706,
"grad_norm": 2.609056317491272,
"learning_rate": 1.8146520673321217e-06,
"loss": 0.5439,
"step": 579
},
{
"epoch": 0.2215431627196333,
"grad_norm": 3.213598585954573,
"learning_rate": 1.8139338516681584e-06,
"loss": 0.561,
"step": 580
},
{
"epoch": 0.22192513368983957,
"grad_norm": 5.179533809421901,
"learning_rate": 1.8132143898740578e-06,
"loss": 0.5489,
"step": 581
},
{
"epoch": 0.22230710466004583,
"grad_norm": 9.17495571569845,
"learning_rate": 1.8124936830513131e-06,
"loss": 0.4893,
"step": 582
},
{
"epoch": 0.22268907563025211,
"grad_norm": 3.861478481777529,
"learning_rate": 1.8117717323033247e-06,
"loss": 0.5399,
"step": 583
},
{
"epoch": 0.22307104660045837,
"grad_norm": 17.525616277161284,
"learning_rate": 1.811048538735397e-06,
"loss": 0.5589,
"step": 584
},
{
"epoch": 0.22345301757066463,
"grad_norm": 9.15611393827099,
"learning_rate": 1.8103241034547363e-06,
"loss": 0.58,
"step": 585
},
{
"epoch": 0.2238349885408709,
"grad_norm": 13.153208089545293,
"learning_rate": 1.8095984275704516e-06,
"loss": 0.5168,
"step": 586
},
{
"epoch": 0.22421695951107715,
"grad_norm": 5.251421468073455,
"learning_rate": 1.8088715121935497e-06,
"loss": 0.5016,
"step": 587
},
{
"epoch": 0.22459893048128343,
"grad_norm": 3.6958396485074796,
"learning_rate": 1.8081433584369363e-06,
"loss": 0.4577,
"step": 588
},
{
"epoch": 0.2249809014514897,
"grad_norm": 3.265559498367038,
"learning_rate": 1.807413967415412e-06,
"loss": 0.4782,
"step": 589
},
{
"epoch": 0.22536287242169595,
"grad_norm": 7.799514923143363,
"learning_rate": 1.806683340245672e-06,
"loss": 0.4997,
"step": 590
},
{
"epoch": 0.2257448433919022,
"grad_norm": 4.542184627206501,
"learning_rate": 1.805951478046305e-06,
"loss": 0.4958,
"step": 591
},
{
"epoch": 0.2261268143621085,
"grad_norm": 3.198561665660559,
"learning_rate": 1.8052183819377889e-06,
"loss": 0.5066,
"step": 592
},
{
"epoch": 0.22650878533231475,
"grad_norm": 3.285441933069285,
"learning_rate": 1.8044840530424922e-06,
"loss": 0.5231,
"step": 593
},
{
"epoch": 0.226890756302521,
"grad_norm": 2.437769468942543,
"learning_rate": 1.803748492484669e-06,
"loss": 0.4666,
"step": 594
},
{
"epoch": 0.22727272727272727,
"grad_norm": 6.185410715981308,
"learning_rate": 1.8030117013904614e-06,
"loss": 0.5438,
"step": 595
},
{
"epoch": 0.22765469824293355,
"grad_norm": 4.448274163968093,
"learning_rate": 1.8022736808878935e-06,
"loss": 0.5467,
"step": 596
},
{
"epoch": 0.2280366692131398,
"grad_norm": 3.3611937875245914,
"learning_rate": 1.8015344321068725e-06,
"loss": 0.5204,
"step": 597
},
{
"epoch": 0.22841864018334607,
"grad_norm": 4.302668649886635,
"learning_rate": 1.800793956179186e-06,
"loss": 0.5256,
"step": 598
},
{
"epoch": 0.22880061115355232,
"grad_norm": 4.160541159727651,
"learning_rate": 1.8000522542385003e-06,
"loss": 0.4741,
"step": 599
},
{
"epoch": 0.22918258212375858,
"grad_norm": 2.9709820967481644,
"learning_rate": 1.7993093274203587e-06,
"loss": 0.6171,
"step": 600
},
{
"epoch": 0.22956455309396487,
"grad_norm": 2.65482208122776,
"learning_rate": 1.7985651768621795e-06,
"loss": 0.5183,
"step": 601
},
{
"epoch": 0.22994652406417113,
"grad_norm": 3.1309167619949796,
"learning_rate": 1.7978198037032556e-06,
"loss": 0.5214,
"step": 602
},
{
"epoch": 0.23032849503437738,
"grad_norm": 4.786873785109864,
"learning_rate": 1.7970732090847501e-06,
"loss": 0.5158,
"step": 603
},
{
"epoch": 0.23071046600458364,
"grad_norm": 3.56544383535294,
"learning_rate": 1.7963253941496973e-06,
"loss": 0.5161,
"step": 604
},
{
"epoch": 0.23109243697478993,
"grad_norm": 2.9671537454160832,
"learning_rate": 1.7955763600429994e-06,
"loss": 0.5702,
"step": 605
},
{
"epoch": 0.23147440794499619,
"grad_norm": 3.4057549644612157,
"learning_rate": 1.7948261079114256e-06,
"loss": 0.4706,
"step": 606
},
{
"epoch": 0.23185637891520244,
"grad_norm": 3.421956791631102,
"learning_rate": 1.794074638903609e-06,
"loss": 0.4638,
"step": 607
},
{
"epoch": 0.2322383498854087,
"grad_norm": 4.060003714618489,
"learning_rate": 1.7933219541700466e-06,
"loss": 0.5237,
"step": 608
},
{
"epoch": 0.232620320855615,
"grad_norm": 3.345537066191882,
"learning_rate": 1.7925680548630964e-06,
"loss": 0.549,
"step": 609
},
{
"epoch": 0.23300229182582124,
"grad_norm": 2.6400782835864427,
"learning_rate": 1.7918129421369757e-06,
"loss": 0.4747,
"step": 610
},
{
"epoch": 0.2333842627960275,
"grad_norm": 10.017074793252261,
"learning_rate": 1.7910566171477598e-06,
"loss": 0.4876,
"step": 611
},
{
"epoch": 0.23376623376623376,
"grad_norm": 6.217327964146683,
"learning_rate": 1.7902990810533794e-06,
"loss": 0.4992,
"step": 612
},
{
"epoch": 0.23414820473644002,
"grad_norm": 3.137084300891758,
"learning_rate": 1.7895403350136202e-06,
"loss": 0.5136,
"step": 613
},
{
"epoch": 0.2345301757066463,
"grad_norm": 3.560640966628178,
"learning_rate": 1.7887803801901203e-06,
"loss": 0.4693,
"step": 614
},
{
"epoch": 0.23491214667685256,
"grad_norm": 3.234048960558977,
"learning_rate": 1.7880192177463673e-06,
"loss": 0.4843,
"step": 615
},
{
"epoch": 0.23529411764705882,
"grad_norm": 3.6336304860358797,
"learning_rate": 1.7872568488476993e-06,
"loss": 0.4352,
"step": 616
},
{
"epoch": 0.23567608861726508,
"grad_norm": 5.110350004291673,
"learning_rate": 1.7864932746613001e-06,
"loss": 0.5919,
"step": 617
},
{
"epoch": 0.23605805958747136,
"grad_norm": 2.9444363443075736,
"learning_rate": 1.7857284963561997e-06,
"loss": 0.4616,
"step": 618
},
{
"epoch": 0.23644003055767762,
"grad_norm": 4.154488116453747,
"learning_rate": 1.7849625151032712e-06,
"loss": 0.5212,
"step": 619
},
{
"epoch": 0.23682200152788388,
"grad_norm": 3.6500330910954366,
"learning_rate": 1.7841953320752292e-06,
"loss": 0.5011,
"step": 620
},
{
"epoch": 0.23720397249809014,
"grad_norm": 2.851279009859859,
"learning_rate": 1.7834269484466287e-06,
"loss": 0.5079,
"step": 621
},
{
"epoch": 0.23758594346829642,
"grad_norm": 8.11545498157721,
"learning_rate": 1.7826573653938626e-06,
"loss": 0.496,
"step": 622
},
{
"epoch": 0.23796791443850268,
"grad_norm": 2.1317456264414694,
"learning_rate": 1.7818865840951598e-06,
"loss": 0.4544,
"step": 623
},
{
"epoch": 0.23834988540870894,
"grad_norm": 2.9181580549057813,
"learning_rate": 1.7811146057305847e-06,
"loss": 0.4882,
"step": 624
},
{
"epoch": 0.2387318563789152,
"grad_norm": 6.282079208223222,
"learning_rate": 1.780341431482033e-06,
"loss": 0.5373,
"step": 625
},
{
"epoch": 0.23911382734912145,
"grad_norm": 2.718832511733045,
"learning_rate": 1.7795670625332325e-06,
"loss": 0.4645,
"step": 626
},
{
"epoch": 0.23949579831932774,
"grad_norm": 2.7269521874649874,
"learning_rate": 1.7787915000697389e-06,
"loss": 0.4627,
"step": 627
},
{
"epoch": 0.239877769289534,
"grad_norm": 5.112524310187623,
"learning_rate": 1.7780147452789368e-06,
"loss": 0.5181,
"step": 628
},
{
"epoch": 0.24025974025974026,
"grad_norm": 2.7695910787876876,
"learning_rate": 1.7772367993500348e-06,
"loss": 0.5327,
"step": 629
},
{
"epoch": 0.24064171122994651,
"grad_norm": 2.6145267906202285,
"learning_rate": 1.7764576634740656e-06,
"loss": 0.5574,
"step": 630
},
{
"epoch": 0.2410236822001528,
"grad_norm": 3.233765828743376,
"learning_rate": 1.7756773388438838e-06,
"loss": 0.521,
"step": 631
},
{
"epoch": 0.24140565317035906,
"grad_norm": 6.696009320726811,
"learning_rate": 1.7748958266541642e-06,
"loss": 0.508,
"step": 632
},
{
"epoch": 0.24178762414056532,
"grad_norm": 39.56183412074627,
"learning_rate": 1.7741131281013992e-06,
"loss": 0.481,
"step": 633
},
{
"epoch": 0.24216959511077157,
"grad_norm": 3.869094625881142,
"learning_rate": 1.7733292443838978e-06,
"loss": 0.5563,
"step": 634
},
{
"epoch": 0.24255156608097786,
"grad_norm": 21.41941056342672,
"learning_rate": 1.7725441767017837e-06,
"loss": 0.4881,
"step": 635
},
{
"epoch": 0.24293353705118412,
"grad_norm": 15.102471638374096,
"learning_rate": 1.7717579262569925e-06,
"loss": 0.5545,
"step": 636
},
{
"epoch": 0.24331550802139038,
"grad_norm": 3.070865106455782,
"learning_rate": 1.770970494253272e-06,
"loss": 0.5642,
"step": 637
},
{
"epoch": 0.24369747899159663,
"grad_norm": 4.240853022191859,
"learning_rate": 1.7701818818961774e-06,
"loss": 0.5046,
"step": 638
},
{
"epoch": 0.2440794499618029,
"grad_norm": 3.8398989956706755,
"learning_rate": 1.7693920903930714e-06,
"loss": 0.4598,
"step": 639
},
{
"epoch": 0.24446142093200918,
"grad_norm": 2.6480456412643107,
"learning_rate": 1.7686011209531233e-06,
"loss": 0.5043,
"step": 640
},
{
"epoch": 0.24484339190221543,
"grad_norm": 3.600797780452132,
"learning_rate": 1.7678089747873042e-06,
"loss": 0.5166,
"step": 641
},
{
"epoch": 0.2452253628724217,
"grad_norm": 3.057904111996566,
"learning_rate": 1.7670156531083875e-06,
"loss": 0.4996,
"step": 642
},
{
"epoch": 0.24560733384262795,
"grad_norm": 3.4477591809238732,
"learning_rate": 1.7662211571309457e-06,
"loss": 0.6209,
"step": 643
},
{
"epoch": 0.24598930481283424,
"grad_norm": 3.562378985300235,
"learning_rate": 1.7654254880713504e-06,
"loss": 0.5704,
"step": 644
},
{
"epoch": 0.2463712757830405,
"grad_norm": 3.6440398708100474,
"learning_rate": 1.764628647147768e-06,
"loss": 0.5646,
"step": 645
},
{
"epoch": 0.24675324675324675,
"grad_norm": 3.1071802553619663,
"learning_rate": 1.7638306355801591e-06,
"loss": 0.5275,
"step": 646
},
{
"epoch": 0.247135217723453,
"grad_norm": 6.557389815257588,
"learning_rate": 1.7630314545902776e-06,
"loss": 0.5619,
"step": 647
},
{
"epoch": 0.2475171886936593,
"grad_norm": 2.7462182181993424,
"learning_rate": 1.7622311054016663e-06,
"loss": 0.5185,
"step": 648
},
{
"epoch": 0.24789915966386555,
"grad_norm": 5.974295269074101,
"learning_rate": 1.7614295892396577e-06,
"loss": 0.6382,
"step": 649
},
{
"epoch": 0.2482811306340718,
"grad_norm": 3.112336954489829,
"learning_rate": 1.7606269073313705e-06,
"loss": 0.5138,
"step": 650
},
{
"epoch": 0.24866310160427807,
"grad_norm": 3.740381602615543,
"learning_rate": 1.7598230609057078e-06,
"loss": 0.4806,
"step": 651
},
{
"epoch": 0.24904507257448433,
"grad_norm": 4.65790554328188,
"learning_rate": 1.7590180511933564e-06,
"loss": 0.4916,
"step": 652
},
{
"epoch": 0.2494270435446906,
"grad_norm": 2.7493165403420243,
"learning_rate": 1.7582118794267834e-06,
"loss": 0.5118,
"step": 653
},
{
"epoch": 0.24980901451489687,
"grad_norm": 3.3105503913880514,
"learning_rate": 1.757404546840235e-06,
"loss": 0.5298,
"step": 654
},
{
"epoch": 0.25019098548510316,
"grad_norm": 12.455689339904634,
"learning_rate": 1.7565960546697353e-06,
"loss": 0.5318,
"step": 655
},
{
"epoch": 0.2505729564553094,
"grad_norm": 3.380636142736357,
"learning_rate": 1.7557864041530828e-06,
"loss": 0.4663,
"step": 656
},
{
"epoch": 0.2509549274255157,
"grad_norm": 3.815668309210232,
"learning_rate": 1.7549755965298497e-06,
"loss": 0.5581,
"step": 657
},
{
"epoch": 0.25133689839572193,
"grad_norm": 4.378975343400351,
"learning_rate": 1.7541636330413807e-06,
"loss": 0.4936,
"step": 658
},
{
"epoch": 0.2517188693659282,
"grad_norm": 6.528924785964309,
"learning_rate": 1.7533505149307887e-06,
"loss": 0.5414,
"step": 659
},
{
"epoch": 0.25210084033613445,
"grad_norm": 3.4142135055928167,
"learning_rate": 1.752536243442955e-06,
"loss": 0.568,
"step": 660
},
{
"epoch": 0.2524828113063407,
"grad_norm": 2.8288694978964974,
"learning_rate": 1.7517208198245266e-06,
"loss": 0.5267,
"step": 661
},
{
"epoch": 0.25286478227654696,
"grad_norm": 3.50959890251818,
"learning_rate": 1.7509042453239146e-06,
"loss": 0.5019,
"step": 662
},
{
"epoch": 0.2532467532467532,
"grad_norm": 2.7771422738572618,
"learning_rate": 1.7500865211912923e-06,
"loss": 0.4467,
"step": 663
},
{
"epoch": 0.25362872421695953,
"grad_norm": 2.5295925264464367,
"learning_rate": 1.7492676486785923e-06,
"loss": 0.5013,
"step": 664
},
{
"epoch": 0.2540106951871658,
"grad_norm": 4.8645822646400925,
"learning_rate": 1.7484476290395058e-06,
"loss": 0.6675,
"step": 665
},
{
"epoch": 0.25439266615737205,
"grad_norm": 2.7906824801853958,
"learning_rate": 1.7476264635294803e-06,
"loss": 0.4964,
"step": 666
},
{
"epoch": 0.2547746371275783,
"grad_norm": 4.426841802219747,
"learning_rate": 1.7468041534057176e-06,
"loss": 0.5779,
"step": 667
},
{
"epoch": 0.25515660809778457,
"grad_norm": 2.7424287930246862,
"learning_rate": 1.745980699927172e-06,
"loss": 0.4458,
"step": 668
},
{
"epoch": 0.2555385790679908,
"grad_norm": 3.006140806996059,
"learning_rate": 1.7451561043545481e-06,
"loss": 0.5388,
"step": 669
},
{
"epoch": 0.2559205500381971,
"grad_norm": 3.042148103787358,
"learning_rate": 1.7443303679502993e-06,
"loss": 0.5555,
"step": 670
},
{
"epoch": 0.25630252100840334,
"grad_norm": 6.668821228222795,
"learning_rate": 1.743503491978625e-06,
"loss": 0.4967,
"step": 671
},
{
"epoch": 0.25668449197860965,
"grad_norm": 2.5308707340238827,
"learning_rate": 1.74267547770547e-06,
"loss": 0.4663,
"step": 672
},
{
"epoch": 0.2570664629488159,
"grad_norm": 6.833924393698803,
"learning_rate": 1.7418463263985213e-06,
"loss": 0.5645,
"step": 673
},
{
"epoch": 0.25744843391902217,
"grad_norm": 3.248878235436747,
"learning_rate": 1.741016039327207e-06,
"loss": 0.5378,
"step": 674
},
{
"epoch": 0.2578304048892284,
"grad_norm": 12.379869478911926,
"learning_rate": 1.7401846177626937e-06,
"loss": 0.4715,
"step": 675
},
{
"epoch": 0.2582123758594347,
"grad_norm": 2.4700767549860916,
"learning_rate": 1.7393520629778858e-06,
"loss": 0.4435,
"step": 676
},
{
"epoch": 0.25859434682964094,
"grad_norm": 3.8376182240242347,
"learning_rate": 1.7385183762474216e-06,
"loss": 0.5057,
"step": 677
},
{
"epoch": 0.2589763177998472,
"grad_norm": 2.787561200875686,
"learning_rate": 1.737683558847673e-06,
"loss": 0.4756,
"step": 678
},
{
"epoch": 0.25935828877005346,
"grad_norm": 8.0937580576913,
"learning_rate": 1.7368476120567425e-06,
"loss": 0.4516,
"step": 679
},
{
"epoch": 0.2597402597402597,
"grad_norm": 3.348677722488532,
"learning_rate": 1.7360105371544624e-06,
"loss": 0.534,
"step": 680
},
{
"epoch": 0.26012223071046603,
"grad_norm": 4.392537977655502,
"learning_rate": 1.735172335422391e-06,
"loss": 0.5828,
"step": 681
},
{
"epoch": 0.2605042016806723,
"grad_norm": 4.871306831406301,
"learning_rate": 1.7343330081438134e-06,
"loss": 0.5305,
"step": 682
},
{
"epoch": 0.26088617265087855,
"grad_norm": 2.0945191210539447,
"learning_rate": 1.7334925566037364e-06,
"loss": 0.4465,
"step": 683
},
{
"epoch": 0.2612681436210848,
"grad_norm": 2.4815453087895363,
"learning_rate": 1.7326509820888891e-06,
"loss": 0.5161,
"step": 684
},
{
"epoch": 0.26165011459129106,
"grad_norm": 2.512152878408946,
"learning_rate": 1.7318082858877189e-06,
"loss": 0.4902,
"step": 685
},
{
"epoch": 0.2620320855614973,
"grad_norm": 3.8314532509519004,
"learning_rate": 1.7309644692903908e-06,
"loss": 0.5136,
"step": 686
},
{
"epoch": 0.2624140565317036,
"grad_norm": 4.003857938523139,
"learning_rate": 1.7301195335887861e-06,
"loss": 0.5135,
"step": 687
},
{
"epoch": 0.26279602750190983,
"grad_norm": 2.336318230756018,
"learning_rate": 1.7292734800764983e-06,
"loss": 0.4443,
"step": 688
},
{
"epoch": 0.2631779984721161,
"grad_norm": 4.912046668250144,
"learning_rate": 1.7284263100488325e-06,
"loss": 0.5218,
"step": 689
},
{
"epoch": 0.2635599694423224,
"grad_norm": 14.649714625917312,
"learning_rate": 1.7275780248028035e-06,
"loss": 0.5415,
"step": 690
},
{
"epoch": 0.26394194041252866,
"grad_norm": 4.778500699991443,
"learning_rate": 1.7267286256371334e-06,
"loss": 0.5475,
"step": 691
},
{
"epoch": 0.2643239113827349,
"grad_norm": 5.334807436364755,
"learning_rate": 1.7258781138522494e-06,
"loss": 0.486,
"step": 692
},
{
"epoch": 0.2647058823529412,
"grad_norm": 4.407848861156866,
"learning_rate": 1.7250264907502823e-06,
"loss": 0.4618,
"step": 693
},
{
"epoch": 0.26508785332314744,
"grad_norm": 4.081137578513586,
"learning_rate": 1.7241737576350646e-06,
"loss": 0.5714,
"step": 694
},
{
"epoch": 0.2654698242933537,
"grad_norm": 3.458527890022539,
"learning_rate": 1.7233199158121278e-06,
"loss": 0.5386,
"step": 695
},
{
"epoch": 0.26585179526355995,
"grad_norm": 3.6226981960725078,
"learning_rate": 1.722464966588701e-06,
"loss": 0.5704,
"step": 696
},
{
"epoch": 0.2662337662337662,
"grad_norm": 2.633192513546539,
"learning_rate": 1.7216089112737092e-06,
"loss": 0.4667,
"step": 697
},
{
"epoch": 0.2666157372039725,
"grad_norm": 24.569139675810856,
"learning_rate": 1.7207517511777697e-06,
"loss": 0.4881,
"step": 698
},
{
"epoch": 0.2669977081741788,
"grad_norm": 3.7580265708859746,
"learning_rate": 1.719893487613192e-06,
"loss": 0.568,
"step": 699
},
{
"epoch": 0.26737967914438504,
"grad_norm": 4.400795314770478,
"learning_rate": 1.7190341218939753e-06,
"loss": 0.5632,
"step": 700
},
{
"epoch": 0.2677616501145913,
"grad_norm": 7.344189586259203,
"learning_rate": 1.7181736553358053e-06,
"loss": 0.5036,
"step": 701
},
{
"epoch": 0.26814362108479756,
"grad_norm": 2.778695505776415,
"learning_rate": 1.717312089256054e-06,
"loss": 0.5119,
"step": 702
},
{
"epoch": 0.2685255920550038,
"grad_norm": 2.5695491741039467,
"learning_rate": 1.7164494249737759e-06,
"loss": 0.5112,
"step": 703
},
{
"epoch": 0.2689075630252101,
"grad_norm": 2.8934171667829185,
"learning_rate": 1.715585663809708e-06,
"loss": 0.4718,
"step": 704
},
{
"epoch": 0.26928953399541633,
"grad_norm": 2.6161453326453,
"learning_rate": 1.7147208070862646e-06,
"loss": 0.5047,
"step": 705
},
{
"epoch": 0.2696715049656226,
"grad_norm": 3.8219439204812944,
"learning_rate": 1.7138548561275398e-06,
"loss": 0.5148,
"step": 706
},
{
"epoch": 0.2700534759358289,
"grad_norm": 2.1503328382135853,
"learning_rate": 1.7129878122593016e-06,
"loss": 0.5314,
"step": 707
},
{
"epoch": 0.27043544690603516,
"grad_norm": 3.646190022749743,
"learning_rate": 1.712119676808991e-06,
"loss": 0.5311,
"step": 708
},
{
"epoch": 0.2708174178762414,
"grad_norm": 3.731396032506048,
"learning_rate": 1.7112504511057205e-06,
"loss": 0.5884,
"step": 709
},
{
"epoch": 0.2711993888464477,
"grad_norm": 3.6777917230456723,
"learning_rate": 1.7103801364802725e-06,
"loss": 0.5793,
"step": 710
},
{
"epoch": 0.27158135981665393,
"grad_norm": 6.13373394566763,
"learning_rate": 1.7095087342650953e-06,
"loss": 0.5185,
"step": 711
},
{
"epoch": 0.2719633307868602,
"grad_norm": 3.087001723579745,
"learning_rate": 1.7086362457943032e-06,
"loss": 0.5009,
"step": 712
},
{
"epoch": 0.27234530175706645,
"grad_norm": 4.490686392373228,
"learning_rate": 1.7077626724036733e-06,
"loss": 0.5339,
"step": 713
},
{
"epoch": 0.2727272727272727,
"grad_norm": 3.005739224446104,
"learning_rate": 1.7068880154306436e-06,
"loss": 0.5423,
"step": 714
},
{
"epoch": 0.27310924369747897,
"grad_norm": 4.784083760333966,
"learning_rate": 1.7060122762143113e-06,
"loss": 0.5006,
"step": 715
},
{
"epoch": 0.2734912146676853,
"grad_norm": 2.508657710501213,
"learning_rate": 1.70513545609543e-06,
"loss": 0.5682,
"step": 716
},
{
"epoch": 0.27387318563789154,
"grad_norm": 4.199766413295021,
"learning_rate": 1.704257556416409e-06,
"loss": 0.5516,
"step": 717
},
{
"epoch": 0.2742551566080978,
"grad_norm": 5.212747405600585,
"learning_rate": 1.7033785785213097e-06,
"loss": 0.5135,
"step": 718
},
{
"epoch": 0.27463712757830405,
"grad_norm": 3.1721537708110477,
"learning_rate": 1.7024985237558442e-06,
"loss": 0.5187,
"step": 719
},
{
"epoch": 0.2750190985485103,
"grad_norm": 3.2955829455322654,
"learning_rate": 1.701617393467374e-06,
"loss": 0.5187,
"step": 720
},
{
"epoch": 0.27540106951871657,
"grad_norm": 4.975453907210425,
"learning_rate": 1.7007351890049066e-06,
"loss": 0.5139,
"step": 721
},
{
"epoch": 0.2757830404889228,
"grad_norm": 3.4265908223186003,
"learning_rate": 1.6998519117190939e-06,
"loss": 0.5373,
"step": 722
},
{
"epoch": 0.2761650114591291,
"grad_norm": 3.750942277195497,
"learning_rate": 1.6989675629622311e-06,
"loss": 0.5224,
"step": 723
},
{
"epoch": 0.2765469824293354,
"grad_norm": 2.9167247213499623,
"learning_rate": 1.698082144088253e-06,
"loss": 0.4503,
"step": 724
},
{
"epoch": 0.27692895339954166,
"grad_norm": 2.6892296224397443,
"learning_rate": 1.6971956564527331e-06,
"loss": 0.4308,
"step": 725
},
{
"epoch": 0.2773109243697479,
"grad_norm": 7.484679803225303,
"learning_rate": 1.6963081014128814e-06,
"loss": 0.5805,
"step": 726
},
{
"epoch": 0.27769289533995417,
"grad_norm": 2.8744376077945795,
"learning_rate": 1.6954194803275418e-06,
"loss": 0.5175,
"step": 727
},
{
"epoch": 0.27807486631016043,
"grad_norm": 2.939035122326872,
"learning_rate": 1.6945297945571898e-06,
"loss": 0.5184,
"step": 728
},
{
"epoch": 0.2784568372803667,
"grad_norm": 3.3676310306978636,
"learning_rate": 1.6936390454639323e-06,
"loss": 0.507,
"step": 729
},
{
"epoch": 0.27883880825057294,
"grad_norm": 7.253458488294697,
"learning_rate": 1.6927472344115027e-06,
"loss": 0.5281,
"step": 730
},
{
"epoch": 0.2792207792207792,
"grad_norm": 3.159544953754157,
"learning_rate": 1.6918543627652615e-06,
"loss": 0.516,
"step": 731
},
{
"epoch": 0.27960275019098546,
"grad_norm": 4.138952498409637,
"learning_rate": 1.6909604318921918e-06,
"loss": 0.4688,
"step": 732
},
{
"epoch": 0.2799847211611918,
"grad_norm": 2.741705159701366,
"learning_rate": 1.6900654431608992e-06,
"loss": 0.4839,
"step": 733
},
{
"epoch": 0.28036669213139803,
"grad_norm": 3.693179714945185,
"learning_rate": 1.6891693979416081e-06,
"loss": 0.51,
"step": 734
},
{
"epoch": 0.2807486631016043,
"grad_norm": 7.452946357187165,
"learning_rate": 1.688272297606162e-06,
"loss": 0.4816,
"step": 735
},
{
"epoch": 0.28113063407181055,
"grad_norm": 2.742131074634169,
"learning_rate": 1.6873741435280175e-06,
"loss": 0.4426,
"step": 736
},
{
"epoch": 0.2815126050420168,
"grad_norm": 5.630665693685824,
"learning_rate": 1.686474937082246e-06,
"loss": 0.548,
"step": 737
},
{
"epoch": 0.28189457601222306,
"grad_norm": 5.688789138069565,
"learning_rate": 1.68557467964553e-06,
"loss": 0.5106,
"step": 738
},
{
"epoch": 0.2822765469824293,
"grad_norm": 2.4631885801224636,
"learning_rate": 1.6846733725961605e-06,
"loss": 0.4238,
"step": 739
},
{
"epoch": 0.2826585179526356,
"grad_norm": 3.5667080177526422,
"learning_rate": 1.6837710173140359e-06,
"loss": 0.5505,
"step": 740
},
{
"epoch": 0.28304048892284184,
"grad_norm": 11.929409510968739,
"learning_rate": 1.6828676151806587e-06,
"loss": 0.504,
"step": 741
},
{
"epoch": 0.28342245989304815,
"grad_norm": 4.103107051996335,
"learning_rate": 1.6819631675791355e-06,
"loss": 0.494,
"step": 742
},
{
"epoch": 0.2838044308632544,
"grad_norm": 3.9061323742785876,
"learning_rate": 1.6810576758941726e-06,
"loss": 0.471,
"step": 743
},
{
"epoch": 0.28418640183346067,
"grad_norm": 3.0871420882267007,
"learning_rate": 1.6801511415120743e-06,
"loss": 0.4401,
"step": 744
},
{
"epoch": 0.2845683728036669,
"grad_norm": 5.957436850330277,
"learning_rate": 1.6792435658207422e-06,
"loss": 0.5614,
"step": 745
},
{
"epoch": 0.2849503437738732,
"grad_norm": 7.907512720107014,
"learning_rate": 1.6783349502096718e-06,
"loss": 0.5234,
"step": 746
},
{
"epoch": 0.28533231474407944,
"grad_norm": 7.7846738390269765,
"learning_rate": 1.6774252960699508e-06,
"loss": 0.5755,
"step": 747
},
{
"epoch": 0.2857142857142857,
"grad_norm": 4.268104953943963,
"learning_rate": 1.6765146047942569e-06,
"loss": 0.5197,
"step": 748
},
{
"epoch": 0.28609625668449196,
"grad_norm": 4.400981542332326,
"learning_rate": 1.6756028777768546e-06,
"loss": 0.5216,
"step": 749
},
{
"epoch": 0.28647822765469827,
"grad_norm": 3.237959983040218,
"learning_rate": 1.6746901164135964e-06,
"loss": 0.4327,
"step": 750
},
{
"epoch": 0.2868601986249045,
"grad_norm": 2.4956321020896826,
"learning_rate": 1.6737763221019165e-06,
"loss": 0.4209,
"step": 751
},
{
"epoch": 0.2872421695951108,
"grad_norm": 3.3785218467080993,
"learning_rate": 1.6728614962408307e-06,
"loss": 0.5133,
"step": 752
},
{
"epoch": 0.28762414056531704,
"grad_norm": 5.755907594154302,
"learning_rate": 1.671945640230935e-06,
"loss": 0.493,
"step": 753
},
{
"epoch": 0.2880061115355233,
"grad_norm": 3.5371156438993467,
"learning_rate": 1.6710287554744018e-06,
"loss": 0.5648,
"step": 754
},
{
"epoch": 0.28838808250572956,
"grad_norm": 3.31050461562023,
"learning_rate": 1.670110843374979e-06,
"loss": 0.5592,
"step": 755
},
{
"epoch": 0.2887700534759358,
"grad_norm": 10.456411989986606,
"learning_rate": 1.669191905337987e-06,
"loss": 0.517,
"step": 756
},
{
"epoch": 0.2891520244461421,
"grad_norm": 2.042811323137261,
"learning_rate": 1.6682719427703162e-06,
"loss": 0.4311,
"step": 757
},
{
"epoch": 0.28953399541634833,
"grad_norm": 3.7313229869118527,
"learning_rate": 1.6673509570804277e-06,
"loss": 0.5495,
"step": 758
},
{
"epoch": 0.28991596638655465,
"grad_norm": 5.091712054727465,
"learning_rate": 1.6664289496783469e-06,
"loss": 0.5404,
"step": 759
},
{
"epoch": 0.2902979373567609,
"grad_norm": 3.7509496079574665,
"learning_rate": 1.6655059219756642e-06,
"loss": 0.4976,
"step": 760
},
{
"epoch": 0.29067990832696716,
"grad_norm": 3.0937726229305142,
"learning_rate": 1.6645818753855323e-06,
"loss": 0.4678,
"step": 761
},
{
"epoch": 0.2910618792971734,
"grad_norm": 3.323105883693273,
"learning_rate": 1.6636568113226634e-06,
"loss": 0.4866,
"step": 762
},
{
"epoch": 0.2914438502673797,
"grad_norm": 5.253451440192973,
"learning_rate": 1.662730731203328e-06,
"loss": 0.4973,
"step": 763
},
{
"epoch": 0.29182582123758594,
"grad_norm": 3.4256082230957503,
"learning_rate": 1.661803636445351e-06,
"loss": 0.536,
"step": 764
},
{
"epoch": 0.2922077922077922,
"grad_norm": 3.3306256815135216,
"learning_rate": 1.6608755284681126e-06,
"loss": 0.5426,
"step": 765
},
{
"epoch": 0.29258976317799845,
"grad_norm": 2.9356322251185865,
"learning_rate": 1.6599464086925426e-06,
"loss": 0.5344,
"step": 766
},
{
"epoch": 0.2929717341482047,
"grad_norm": 4.155899142256312,
"learning_rate": 1.65901627854112e-06,
"loss": 0.5349,
"step": 767
},
{
"epoch": 0.293353705118411,
"grad_norm": 3.777614719641604,
"learning_rate": 1.658085139437872e-06,
"loss": 0.494,
"step": 768
},
{
"epoch": 0.2937356760886173,
"grad_norm": 2.5142543924089114,
"learning_rate": 1.6571529928083692e-06,
"loss": 0.4697,
"step": 769
},
{
"epoch": 0.29411764705882354,
"grad_norm": 6.722985418682869,
"learning_rate": 1.6562198400797252e-06,
"loss": 0.5333,
"step": 770
},
{
"epoch": 0.2944996180290298,
"grad_norm": 3.631912374117992,
"learning_rate": 1.6552856826805935e-06,
"loss": 0.5664,
"step": 771
},
{
"epoch": 0.29488158899923606,
"grad_norm": 4.989274098547631,
"learning_rate": 1.6543505220411663e-06,
"loss": 0.5367,
"step": 772
},
{
"epoch": 0.2952635599694423,
"grad_norm": 2.228577586192082,
"learning_rate": 1.6534143595931717e-06,
"loss": 0.5349,
"step": 773
},
{
"epoch": 0.29564553093964857,
"grad_norm": 2.6656508541994577,
"learning_rate": 1.6524771967698711e-06,
"loss": 0.5048,
"step": 774
},
{
"epoch": 0.29602750190985483,
"grad_norm": 3.3576646899531513,
"learning_rate": 1.6515390350060584e-06,
"loss": 0.4657,
"step": 775
},
{
"epoch": 0.29640947288006114,
"grad_norm": 3.3575492714076565,
"learning_rate": 1.650599875738056e-06,
"loss": 0.5392,
"step": 776
},
{
"epoch": 0.2967914438502674,
"grad_norm": 5.6684435372916155,
"learning_rate": 1.6496597204037135e-06,
"loss": 0.4961,
"step": 777
},
{
"epoch": 0.29717341482047366,
"grad_norm": 4.442816807184914,
"learning_rate": 1.6487185704424057e-06,
"loss": 0.5325,
"step": 778
},
{
"epoch": 0.2975553857906799,
"grad_norm": 4.592388725625374,
"learning_rate": 1.6477764272950307e-06,
"loss": 0.4357,
"step": 779
},
{
"epoch": 0.2979373567608862,
"grad_norm": 4.286297730878364,
"learning_rate": 1.6468332924040062e-06,
"loss": 0.5542,
"step": 780
},
{
"epoch": 0.29831932773109243,
"grad_norm": 2.275160007815433,
"learning_rate": 1.645889167213269e-06,
"loss": 0.518,
"step": 781
},
{
"epoch": 0.2987012987012987,
"grad_norm": 2.535647558636044,
"learning_rate": 1.6449440531682717e-06,
"loss": 0.5267,
"step": 782
},
{
"epoch": 0.29908326967150495,
"grad_norm": 4.43586450645513,
"learning_rate": 1.6439979517159808e-06,
"loss": 0.5018,
"step": 783
},
{
"epoch": 0.2994652406417112,
"grad_norm": 3.189161112968492,
"learning_rate": 1.6430508643048743e-06,
"loss": 0.4726,
"step": 784
},
{
"epoch": 0.2998472116119175,
"grad_norm": 3.1935773386697597,
"learning_rate": 1.6421027923849408e-06,
"loss": 0.5276,
"step": 785
},
{
"epoch": 0.3002291825821238,
"grad_norm": 10.751918205851151,
"learning_rate": 1.641153737407675e-06,
"loss": 0.4862,
"step": 786
},
{
"epoch": 0.30061115355233003,
"grad_norm": 3.0434845781199065,
"learning_rate": 1.6402037008260768e-06,
"loss": 0.5054,
"step": 787
},
{
"epoch": 0.3009931245225363,
"grad_norm": 3.2685180753421568,
"learning_rate": 1.6392526840946492e-06,
"loss": 0.5713,
"step": 788
},
{
"epoch": 0.30137509549274255,
"grad_norm": 3.0040361376646034,
"learning_rate": 1.6383006886693962e-06,
"loss": 0.4681,
"step": 789
},
{
"epoch": 0.3017570664629488,
"grad_norm": 3.635855546494789,
"learning_rate": 1.6373477160078197e-06,
"loss": 0.5384,
"step": 790
},
{
"epoch": 0.30213903743315507,
"grad_norm": 4.5175144684161825,
"learning_rate": 1.6363937675689174e-06,
"loss": 0.5288,
"step": 791
},
{
"epoch": 0.3025210084033613,
"grad_norm": 12.922747266951877,
"learning_rate": 1.6354388448131818e-06,
"loss": 0.5918,
"step": 792
},
{
"epoch": 0.3029029793735676,
"grad_norm": 3.686851028164734,
"learning_rate": 1.6344829492025962e-06,
"loss": 0.612,
"step": 793
},
{
"epoch": 0.3032849503437739,
"grad_norm": 7.826182578996057,
"learning_rate": 1.633526082200634e-06,
"loss": 0.5536,
"step": 794
},
{
"epoch": 0.30366692131398015,
"grad_norm": 2.7716060805059484,
"learning_rate": 1.6325682452722556e-06,
"loss": 0.5195,
"step": 795
},
{
"epoch": 0.3040488922841864,
"grad_norm": 3.1362624859125665,
"learning_rate": 1.6316094398839062e-06,
"loss": 0.5409,
"step": 796
},
{
"epoch": 0.30443086325439267,
"grad_norm": 10.90252915912184,
"learning_rate": 1.6306496675035132e-06,
"loss": 0.569,
"step": 797
},
{
"epoch": 0.3048128342245989,
"grad_norm": 3.266629007159501,
"learning_rate": 1.629688929600486e-06,
"loss": 0.4988,
"step": 798
},
{
"epoch": 0.3051948051948052,
"grad_norm": 4.016902341607713,
"learning_rate": 1.6287272276457112e-06,
"loss": 0.4002,
"step": 799
},
{
"epoch": 0.30557677616501144,
"grad_norm": 2.5154528680969337,
"learning_rate": 1.6277645631115506e-06,
"loss": 0.5111,
"step": 800
},
{
"epoch": 0.3059587471352177,
"grad_norm": 9.075085882863707,
"learning_rate": 1.6268009374718411e-06,
"loss": 0.4927,
"step": 801
},
{
"epoch": 0.306340718105424,
"grad_norm": 31.66511136078232,
"learning_rate": 1.6258363522018908e-06,
"loss": 0.5165,
"step": 802
},
{
"epoch": 0.3067226890756303,
"grad_norm": 5.0751500129820695,
"learning_rate": 1.624870808778476e-06,
"loss": 0.474,
"step": 803
},
{
"epoch": 0.30710466004583653,
"grad_norm": 2.236087119497167,
"learning_rate": 1.6239043086798411e-06,
"loss": 0.4722,
"step": 804
},
{
"epoch": 0.3074866310160428,
"grad_norm": 5.903012956063016,
"learning_rate": 1.6229368533856947e-06,
"loss": 0.5124,
"step": 805
},
{
"epoch": 0.30786860198624905,
"grad_norm": 14.118675520926457,
"learning_rate": 1.6219684443772075e-06,
"loss": 0.533,
"step": 806
},
{
"epoch": 0.3082505729564553,
"grad_norm": 3.7142239375936987,
"learning_rate": 1.6209990831370105e-06,
"loss": 0.5129,
"step": 807
},
{
"epoch": 0.30863254392666156,
"grad_norm": 3.8244519908044703,
"learning_rate": 1.6200287711491928e-06,
"loss": 0.5359,
"step": 808
},
{
"epoch": 0.3090145148968678,
"grad_norm": 3.3849363817604634,
"learning_rate": 1.6190575098992993e-06,
"loss": 0.5272,
"step": 809
},
{
"epoch": 0.3093964858670741,
"grad_norm": 4.1533543275022256,
"learning_rate": 1.6180853008743278e-06,
"loss": 0.512,
"step": 810
},
{
"epoch": 0.3097784568372804,
"grad_norm": 4.30937018622683,
"learning_rate": 1.6171121455627268e-06,
"loss": 0.4175,
"step": 811
},
{
"epoch": 0.31016042780748665,
"grad_norm": 3.549926399740357,
"learning_rate": 1.6161380454543943e-06,
"loss": 0.5586,
"step": 812
},
{
"epoch": 0.3105423987776929,
"grad_norm": 3.5404138591865766,
"learning_rate": 1.6151630020406742e-06,
"loss": 0.518,
"step": 813
},
{
"epoch": 0.31092436974789917,
"grad_norm": 3.093654062052044,
"learning_rate": 1.6141870168143551e-06,
"loss": 0.5309,
"step": 814
},
{
"epoch": 0.3113063407181054,
"grad_norm": 2.648167452438263,
"learning_rate": 1.6132100912696673e-06,
"loss": 0.5085,
"step": 815
},
{
"epoch": 0.3116883116883117,
"grad_norm": 13.440120277339844,
"learning_rate": 1.612232226902281e-06,
"loss": 0.5151,
"step": 816
},
{
"epoch": 0.31207028265851794,
"grad_norm": 9.911688500332652,
"learning_rate": 1.6112534252093028e-06,
"loss": 0.5441,
"step": 817
},
{
"epoch": 0.3124522536287242,
"grad_norm": 4.568940753889595,
"learning_rate": 1.610273687689275e-06,
"loss": 0.4445,
"step": 818
},
{
"epoch": 0.31283422459893045,
"grad_norm": 3.757739018041412,
"learning_rate": 1.6092930158421733e-06,
"loss": 0.4919,
"step": 819
},
{
"epoch": 0.31321619556913677,
"grad_norm": 2.7005901111787938,
"learning_rate": 1.6083114111694025e-06,
"loss": 0.5493,
"step": 820
},
{
"epoch": 0.313598166539343,
"grad_norm": 5.329346610981711,
"learning_rate": 1.6073288751737968e-06,
"loss": 0.5071,
"step": 821
},
{
"epoch": 0.3139801375095493,
"grad_norm": 4.155621924910232,
"learning_rate": 1.606345409359615e-06,
"loss": 0.511,
"step": 822
},
{
"epoch": 0.31436210847975554,
"grad_norm": 3.2160415173643697,
"learning_rate": 1.6053610152325407e-06,
"loss": 0.5149,
"step": 823
},
{
"epoch": 0.3147440794499618,
"grad_norm": 4.498833992646186,
"learning_rate": 1.6043756942996781e-06,
"loss": 0.4367,
"step": 824
},
{
"epoch": 0.31512605042016806,
"grad_norm": 2.667390368969184,
"learning_rate": 1.6033894480695503e-06,
"loss": 0.5029,
"step": 825
},
{
"epoch": 0.3155080213903743,
"grad_norm": 2.7056494777003834,
"learning_rate": 1.6024022780520967e-06,
"loss": 0.5443,
"step": 826
},
{
"epoch": 0.3158899923605806,
"grad_norm": 4.119909442418713,
"learning_rate": 1.6014141857586723e-06,
"loss": 0.4948,
"step": 827
},
{
"epoch": 0.3162719633307869,
"grad_norm": 3.0422282372245544,
"learning_rate": 1.6004251727020427e-06,
"loss": 0.4634,
"step": 828
},
{
"epoch": 0.31665393430099315,
"grad_norm": 2.3102004429656255,
"learning_rate": 1.599435240396384e-06,
"loss": 0.4902,
"step": 829
},
{
"epoch": 0.3170359052711994,
"grad_norm": 5.010024744168456,
"learning_rate": 1.598444390357279e-06,
"loss": 0.5581,
"step": 830
},
{
"epoch": 0.31741787624140566,
"grad_norm": 4.281690511868047,
"learning_rate": 1.5974526241017168e-06,
"loss": 0.5059,
"step": 831
},
{
"epoch": 0.3177998472116119,
"grad_norm": 2.7057434095712356,
"learning_rate": 1.5964599431480876e-06,
"loss": 0.5423,
"step": 832
},
{
"epoch": 0.3181818181818182,
"grad_norm": 3.2463832233266796,
"learning_rate": 1.595466349016183e-06,
"loss": 0.5701,
"step": 833
},
{
"epoch": 0.31856378915202443,
"grad_norm": 4.658997201978724,
"learning_rate": 1.5944718432271924e-06,
"loss": 0.6041,
"step": 834
},
{
"epoch": 0.3189457601222307,
"grad_norm": 4.653531281239861,
"learning_rate": 1.5934764273037011e-06,
"loss": 0.5121,
"step": 835
},
{
"epoch": 0.31932773109243695,
"grad_norm": 4.039817069755923,
"learning_rate": 1.5924801027696879e-06,
"loss": 0.4844,
"step": 836
},
{
"epoch": 0.31970970206264326,
"grad_norm": 5.172647458886336,
"learning_rate": 1.5914828711505221e-06,
"loss": 0.5569,
"step": 837
},
{
"epoch": 0.3200916730328495,
"grad_norm": 4.281902465737346,
"learning_rate": 1.5904847339729627e-06,
"loss": 0.5001,
"step": 838
},
{
"epoch": 0.3204736440030558,
"grad_norm": 8.404705127449969,
"learning_rate": 1.5894856927651538e-06,
"loss": 0.5092,
"step": 839
},
{
"epoch": 0.32085561497326204,
"grad_norm": 2.973031951264967,
"learning_rate": 1.5884857490566248e-06,
"loss": 0.5156,
"step": 840
},
{
"epoch": 0.3212375859434683,
"grad_norm": 2.3748881103612334,
"learning_rate": 1.587484904378286e-06,
"loss": 0.5132,
"step": 841
},
{
"epoch": 0.32161955691367455,
"grad_norm": 3.582807394186353,
"learning_rate": 1.586483160262428e-06,
"loss": 0.5619,
"step": 842
},
{
"epoch": 0.3220015278838808,
"grad_norm": 2.8054071099189954,
"learning_rate": 1.585480518242717e-06,
"loss": 0.5356,
"step": 843
},
{
"epoch": 0.32238349885408707,
"grad_norm": 2.0739297348896106,
"learning_rate": 1.5844769798541957e-06,
"loss": 0.434,
"step": 844
},
{
"epoch": 0.3227654698242933,
"grad_norm": 2.7875432362127843,
"learning_rate": 1.5834725466332777e-06,
"loss": 0.4314,
"step": 845
},
{
"epoch": 0.32314744079449964,
"grad_norm": 8.036151517702406,
"learning_rate": 1.5824672201177469e-06,
"loss": 0.524,
"step": 846
},
{
"epoch": 0.3235294117647059,
"grad_norm": 2.9892156832631587,
"learning_rate": 1.5814610018467552e-06,
"loss": 0.6175,
"step": 847
},
{
"epoch": 0.32391138273491216,
"grad_norm": 4.311205923674658,
"learning_rate": 1.5804538933608194e-06,
"loss": 0.5605,
"step": 848
},
{
"epoch": 0.3242933537051184,
"grad_norm": 3.5892677119039305,
"learning_rate": 1.5794458962018195e-06,
"loss": 0.4768,
"step": 849
},
{
"epoch": 0.3246753246753247,
"grad_norm": 6.4505241661083845,
"learning_rate": 1.5784370119129964e-06,
"loss": 0.5404,
"step": 850
},
{
"epoch": 0.32505729564553093,
"grad_norm": 12.166502490348956,
"learning_rate": 1.5774272420389481e-06,
"loss": 0.5269,
"step": 851
},
{
"epoch": 0.3254392666157372,
"grad_norm": 11.495508400688843,
"learning_rate": 1.576416588125629e-06,
"loss": 0.4747,
"step": 852
},
{
"epoch": 0.32582123758594345,
"grad_norm": 3.701605295686685,
"learning_rate": 1.5754050517203477e-06,
"loss": 0.5328,
"step": 853
},
{
"epoch": 0.32620320855614976,
"grad_norm": 15.817995551065785,
"learning_rate": 1.5743926343717626e-06,
"loss": 0.5315,
"step": 854
},
{
"epoch": 0.326585179526356,
"grad_norm": 13.590005349057503,
"learning_rate": 1.5733793376298818e-06,
"loss": 0.5502,
"step": 855
},
{
"epoch": 0.3269671504965623,
"grad_norm": 3.0797220814221093,
"learning_rate": 1.572365163046059e-06,
"loss": 0.5087,
"step": 856
},
{
"epoch": 0.32734912146676853,
"grad_norm": 11.583847119214154,
"learning_rate": 1.571350112172993e-06,
"loss": 0.4712,
"step": 857
},
{
"epoch": 0.3277310924369748,
"grad_norm": 3.2961430613576517,
"learning_rate": 1.5703341865647225e-06,
"loss": 0.501,
"step": 858
},
{
"epoch": 0.32811306340718105,
"grad_norm": 3.928057328770376,
"learning_rate": 1.569317387776627e-06,
"loss": 0.5554,
"step": 859
},
{
"epoch": 0.3284950343773873,
"grad_norm": 2.5322774096420075,
"learning_rate": 1.5682997173654217e-06,
"loss": 0.5155,
"step": 860
},
{
"epoch": 0.32887700534759357,
"grad_norm": 6.961657500082668,
"learning_rate": 1.567281176889158e-06,
"loss": 0.5751,
"step": 861
},
{
"epoch": 0.3292589763177998,
"grad_norm": 2.686658068544792,
"learning_rate": 1.5662617679072166e-06,
"loss": 0.4305,
"step": 862
},
{
"epoch": 0.32964094728800614,
"grad_norm": 3.691449670890614,
"learning_rate": 1.56524149198031e-06,
"loss": 0.4606,
"step": 863
},
{
"epoch": 0.3300229182582124,
"grad_norm": 5.90480604880744,
"learning_rate": 1.5642203506704777e-06,
"loss": 0.5178,
"step": 864
},
{
"epoch": 0.33040488922841865,
"grad_norm": 3.246819425219357,
"learning_rate": 1.5631983455410835e-06,
"loss": 0.5048,
"step": 865
},
{
"epoch": 0.3307868601986249,
"grad_norm": 2.9229315626683285,
"learning_rate": 1.562175478156814e-06,
"loss": 0.5188,
"step": 866
},
{
"epoch": 0.33116883116883117,
"grad_norm": 3.9641201967098465,
"learning_rate": 1.5611517500836757e-06,
"loss": 0.4967,
"step": 867
},
{
"epoch": 0.3315508021390374,
"grad_norm": 2.964095577159658,
"learning_rate": 1.5601271628889939e-06,
"loss": 0.4629,
"step": 868
},
{
"epoch": 0.3319327731092437,
"grad_norm": 3.108639008409921,
"learning_rate": 1.5591017181414072e-06,
"loss": 0.5235,
"step": 869
},
{
"epoch": 0.33231474407944994,
"grad_norm": 5.145992404038292,
"learning_rate": 1.558075417410869e-06,
"loss": 0.4824,
"step": 870
},
{
"epoch": 0.3326967150496562,
"grad_norm": 4.293286297651328,
"learning_rate": 1.557048262268642e-06,
"loss": 0.5072,
"step": 871
},
{
"epoch": 0.3330786860198625,
"grad_norm": 2.4399864551955286,
"learning_rate": 1.556020254287298e-06,
"loss": 0.5023,
"step": 872
},
{
"epoch": 0.33346065699006877,
"grad_norm": 3.949450729997129,
"learning_rate": 1.5549913950407132e-06,
"loss": 0.5172,
"step": 873
},
{
"epoch": 0.33384262796027503,
"grad_norm": 3.399165386574725,
"learning_rate": 1.5539616861040688e-06,
"loss": 0.5144,
"step": 874
},
{
"epoch": 0.3342245989304813,
"grad_norm": 4.669292406099275,
"learning_rate": 1.552931129053845e-06,
"loss": 0.6242,
"step": 875
},
{
"epoch": 0.33460656990068754,
"grad_norm": 3.1373804904522458,
"learning_rate": 1.5518997254678217e-06,
"loss": 0.5548,
"step": 876
},
{
"epoch": 0.3349885408708938,
"grad_norm": 2.916976585751487,
"learning_rate": 1.5508674769250748e-06,
"loss": 0.5318,
"step": 877
},
{
"epoch": 0.33537051184110006,
"grad_norm": 3.5908802384512413,
"learning_rate": 1.549834385005973e-06,
"loss": 0.4623,
"step": 878
},
{
"epoch": 0.3357524828113063,
"grad_norm": 2.200974095180301,
"learning_rate": 1.5488004512921768e-06,
"loss": 0.4454,
"step": 879
},
{
"epoch": 0.33613445378151263,
"grad_norm": 3.3527657355943656,
"learning_rate": 1.547765677366636e-06,
"loss": 0.427,
"step": 880
},
{
"epoch": 0.3365164247517189,
"grad_norm": 2.837928974571468,
"learning_rate": 1.5467300648135859e-06,
"loss": 0.4713,
"step": 881
},
{
"epoch": 0.33689839572192515,
"grad_norm": 2.788964382737859,
"learning_rate": 1.5456936152185458e-06,
"loss": 0.4874,
"step": 882
},
{
"epoch": 0.3372803666921314,
"grad_norm": 6.609913165528445,
"learning_rate": 1.5446563301683169e-06,
"loss": 0.4809,
"step": 883
},
{
"epoch": 0.33766233766233766,
"grad_norm": 3.0894351341308135,
"learning_rate": 1.5436182112509793e-06,
"loss": 0.4754,
"step": 884
},
{
"epoch": 0.3380443086325439,
"grad_norm": 2.98238262427399,
"learning_rate": 1.5425792600558898e-06,
"loss": 0.5204,
"step": 885
},
{
"epoch": 0.3384262796027502,
"grad_norm": 3.3134083797259413,
"learning_rate": 1.541539478173679e-06,
"loss": 0.461,
"step": 886
},
{
"epoch": 0.33880825057295644,
"grad_norm": 3.0428586325349434,
"learning_rate": 1.5404988671962505e-06,
"loss": 0.5149,
"step": 887
},
{
"epoch": 0.3391902215431627,
"grad_norm": 8.556822675299031,
"learning_rate": 1.5394574287167756e-06,
"loss": 0.5548,
"step": 888
},
{
"epoch": 0.339572192513369,
"grad_norm": 40.331980616781514,
"learning_rate": 1.5384151643296935e-06,
"loss": 0.5335,
"step": 889
},
{
"epoch": 0.33995416348357527,
"grad_norm": 3.2769661613654697,
"learning_rate": 1.5373720756307078e-06,
"loss": 0.5156,
"step": 890
},
{
"epoch": 0.3403361344537815,
"grad_norm": 2.630930679330716,
"learning_rate": 1.536328164216784e-06,
"loss": 0.5501,
"step": 891
},
{
"epoch": 0.3407181054239878,
"grad_norm": 3.487737259280104,
"learning_rate": 1.5352834316861472e-06,
"loss": 0.4984,
"step": 892
},
{
"epoch": 0.34110007639419404,
"grad_norm": 3.1969075890967336,
"learning_rate": 1.534237879638279e-06,
"loss": 0.4516,
"step": 893
},
{
"epoch": 0.3414820473644003,
"grad_norm": 4.640794216980847,
"learning_rate": 1.5331915096739172e-06,
"loss": 0.4934,
"step": 894
},
{
"epoch": 0.34186401833460656,
"grad_norm": 3.252307864765291,
"learning_rate": 1.53214432339505e-06,
"loss": 0.5368,
"step": 895
},
{
"epoch": 0.3422459893048128,
"grad_norm": 5.048618425090819,
"learning_rate": 1.5310963224049172e-06,
"loss": 0.4647,
"step": 896
},
{
"epoch": 0.3426279602750191,
"grad_norm": 6.500653003128015,
"learning_rate": 1.5300475083080038e-06,
"loss": 0.5108,
"step": 897
},
{
"epoch": 0.3430099312452254,
"grad_norm": 6.8570957671235915,
"learning_rate": 1.528997882710042e-06,
"loss": 0.4578,
"step": 898
},
{
"epoch": 0.34339190221543164,
"grad_norm": 2.578823000754853,
"learning_rate": 1.5279474472180045e-06,
"loss": 0.4267,
"step": 899
},
{
"epoch": 0.3437738731856379,
"grad_norm": 3.414024136450465,
"learning_rate": 1.526896203440105e-06,
"loss": 0.5203,
"step": 900
},
{
"epoch": 0.34415584415584416,
"grad_norm": 8.992620514875538,
"learning_rate": 1.525844152985794e-06,
"loss": 0.5287,
"step": 901
},
{
"epoch": 0.3445378151260504,
"grad_norm": 4.945730735463921,
"learning_rate": 1.5247912974657579e-06,
"loss": 0.6097,
"step": 902
},
{
"epoch": 0.3449197860962567,
"grad_norm": 5.514250273126038,
"learning_rate": 1.523737638491915e-06,
"loss": 0.5202,
"step": 903
},
{
"epoch": 0.34530175706646293,
"grad_norm": 3.6791103306942485,
"learning_rate": 1.5226831776774139e-06,
"loss": 0.4856,
"step": 904
},
{
"epoch": 0.3456837280366692,
"grad_norm": 4.689131774070413,
"learning_rate": 1.5216279166366306e-06,
"loss": 0.5033,
"step": 905
},
{
"epoch": 0.3460656990068755,
"grad_norm": 2.189339157882484,
"learning_rate": 1.5205718569851665e-06,
"loss": 0.4472,
"step": 906
},
{
"epoch": 0.34644766997708176,
"grad_norm": 6.16605398997605,
"learning_rate": 1.5195150003398455e-06,
"loss": 0.5039,
"step": 907
},
{
"epoch": 0.346829640947288,
"grad_norm": 110.6126110619747,
"learning_rate": 1.5184573483187116e-06,
"loss": 0.4873,
"step": 908
},
{
"epoch": 0.3472116119174943,
"grad_norm": 2.7540819070670244,
"learning_rate": 1.517398902541027e-06,
"loss": 0.5104,
"step": 909
},
{
"epoch": 0.34759358288770054,
"grad_norm": 3.353857432293008,
"learning_rate": 1.5163396646272686e-06,
"loss": 0.568,
"step": 910
},
{
"epoch": 0.3479755538579068,
"grad_norm": 3.8221680503464,
"learning_rate": 1.5152796361991264e-06,
"loss": 0.4608,
"step": 911
},
{
"epoch": 0.34835752482811305,
"grad_norm": 5.973897087213475,
"learning_rate": 1.5142188188795005e-06,
"loss": 0.4826,
"step": 912
},
{
"epoch": 0.3487394957983193,
"grad_norm": 3.3970328884940275,
"learning_rate": 1.5131572142924989e-06,
"loss": 0.5158,
"step": 913
},
{
"epoch": 0.34912146676852557,
"grad_norm": 4.76313072091096,
"learning_rate": 1.5120948240634347e-06,
"loss": 0.4796,
"step": 914
},
{
"epoch": 0.3495034377387319,
"grad_norm": 5.967721554053427,
"learning_rate": 1.511031649818824e-06,
"loss": 0.5212,
"step": 915
},
{
"epoch": 0.34988540870893814,
"grad_norm": 2.9216360097531178,
"learning_rate": 1.5099676931863834e-06,
"loss": 0.5179,
"step": 916
},
{
"epoch": 0.3502673796791444,
"grad_norm": 3.172000569956292,
"learning_rate": 1.5089029557950271e-06,
"loss": 0.5656,
"step": 917
},
{
"epoch": 0.35064935064935066,
"grad_norm": 5.813299529779351,
"learning_rate": 1.507837439274864e-06,
"loss": 0.4995,
"step": 918
},
{
"epoch": 0.3510313216195569,
"grad_norm": 3.7781237809507098,
"learning_rate": 1.5067711452571975e-06,
"loss": 0.5843,
"step": 919
},
{
"epoch": 0.35141329258976317,
"grad_norm": 10.782452205742599,
"learning_rate": 1.5057040753745195e-06,
"loss": 0.5142,
"step": 920
},
{
"epoch": 0.35179526355996943,
"grad_norm": 5.7483669629282454,
"learning_rate": 1.5046362312605111e-06,
"loss": 0.5085,
"step": 921
},
{
"epoch": 0.3521772345301757,
"grad_norm": 4.458336973237563,
"learning_rate": 1.5035676145500381e-06,
"loss": 0.5518,
"step": 922
},
{
"epoch": 0.35255920550038194,
"grad_norm": 3.4207203325004905,
"learning_rate": 1.5024982268791492e-06,
"loss": 0.5429,
"step": 923
},
{
"epoch": 0.35294117647058826,
"grad_norm": 2.7591171419666174,
"learning_rate": 1.5014280698850738e-06,
"loss": 0.4842,
"step": 924
},
{
"epoch": 0.3533231474407945,
"grad_norm": 2.209637927985218,
"learning_rate": 1.5003571452062188e-06,
"loss": 0.4312,
"step": 925
},
{
"epoch": 0.3537051184110008,
"grad_norm": 4.7613159279196005,
"learning_rate": 1.499285454482166e-06,
"loss": 0.5084,
"step": 926
},
{
"epoch": 0.35408708938120703,
"grad_norm": 3.4608843679091894,
"learning_rate": 1.498212999353671e-06,
"loss": 0.5299,
"step": 927
},
{
"epoch": 0.3544690603514133,
"grad_norm": 2.9484392042079395,
"learning_rate": 1.497139781462659e-06,
"loss": 0.5075,
"step": 928
},
{
"epoch": 0.35485103132161955,
"grad_norm": 4.041471908677798,
"learning_rate": 1.496065802452223e-06,
"loss": 0.4838,
"step": 929
},
{
"epoch": 0.3552330022918258,
"grad_norm": 3.0586178857981374,
"learning_rate": 1.4949910639666217e-06,
"loss": 0.464,
"step": 930
},
{
"epoch": 0.35561497326203206,
"grad_norm": 8.57318431884878,
"learning_rate": 1.4939155676512765e-06,
"loss": 0.5569,
"step": 931
},
{
"epoch": 0.3559969442322384,
"grad_norm": 2.91974056190333,
"learning_rate": 1.4928393151527682e-06,
"loss": 0.5208,
"step": 932
},
{
"epoch": 0.35637891520244463,
"grad_norm": 4.689793265209578,
"learning_rate": 1.4917623081188367e-06,
"loss": 0.4852,
"step": 933
},
{
"epoch": 0.3567608861726509,
"grad_norm": 43.24004926210291,
"learning_rate": 1.4906845481983756e-06,
"loss": 0.5409,
"step": 934
},
{
"epoch": 0.35714285714285715,
"grad_norm": 2.663567979947614,
"learning_rate": 1.4896060370414327e-06,
"loss": 0.4745,
"step": 935
},
{
"epoch": 0.3575248281130634,
"grad_norm": 5.681770625260996,
"learning_rate": 1.4885267762992046e-06,
"loss": 0.5177,
"step": 936
},
{
"epoch": 0.35790679908326967,
"grad_norm": 2.8746557940398936,
"learning_rate": 1.4874467676240361e-06,
"loss": 0.4514,
"step": 937
},
{
"epoch": 0.3582887700534759,
"grad_norm": 2.72188447367986,
"learning_rate": 1.4863660126694172e-06,
"loss": 0.4785,
"step": 938
},
{
"epoch": 0.3586707410236822,
"grad_norm": 3.6175411817892997,
"learning_rate": 1.4852845130899801e-06,
"loss": 0.4918,
"step": 939
},
{
"epoch": 0.35905271199388844,
"grad_norm": 20.16523000109139,
"learning_rate": 1.4842022705414975e-06,
"loss": 0.5141,
"step": 940
},
{
"epoch": 0.35943468296409475,
"grad_norm": 3.31480666789411,
"learning_rate": 1.4831192866808789e-06,
"loss": 0.575,
"step": 941
},
{
"epoch": 0.359816653934301,
"grad_norm": 5.126295586451847,
"learning_rate": 1.4820355631661695e-06,
"loss": 0.4575,
"step": 942
},
{
"epoch": 0.36019862490450727,
"grad_norm": 4.398695016233855,
"learning_rate": 1.4809511016565467e-06,
"loss": 0.4898,
"step": 943
},
{
"epoch": 0.3605805958747135,
"grad_norm": 5.322409252746599,
"learning_rate": 1.4798659038123174e-06,
"loss": 0.4963,
"step": 944
},
{
"epoch": 0.3609625668449198,
"grad_norm": 2.9183001547754595,
"learning_rate": 1.478779971294916e-06,
"loss": 0.4457,
"step": 945
},
{
"epoch": 0.36134453781512604,
"grad_norm": 36.832544737198184,
"learning_rate": 1.4776933057669015e-06,
"loss": 0.5173,
"step": 946
},
{
"epoch": 0.3617265087853323,
"grad_norm": 3.408698119711173,
"learning_rate": 1.4766059088919556e-06,
"loss": 0.563,
"step": 947
},
{
"epoch": 0.36210847975553856,
"grad_norm": 7.428213130895087,
"learning_rate": 1.4755177823348796e-06,
"loss": 0.4943,
"step": 948
},
{
"epoch": 0.3624904507257448,
"grad_norm": 4.738013482670962,
"learning_rate": 1.4744289277615915e-06,
"loss": 0.5229,
"step": 949
},
{
"epoch": 0.36287242169595113,
"grad_norm": 2.378619303414155,
"learning_rate": 1.4733393468391246e-06,
"loss": 0.4536,
"step": 950
},
{
"epoch": 0.3632543926661574,
"grad_norm": 3.403223882172209,
"learning_rate": 1.4722490412356234e-06,
"loss": 0.5731,
"step": 951
},
{
"epoch": 0.36363636363636365,
"grad_norm": 4.240397305405459,
"learning_rate": 1.4711580126203425e-06,
"loss": 0.5746,
"step": 952
},
{
"epoch": 0.3640183346065699,
"grad_norm": 2.5189168596943823,
"learning_rate": 1.4700662626636432e-06,
"loss": 0.4387,
"step": 953
},
{
"epoch": 0.36440030557677616,
"grad_norm": 6.257987655553435,
"learning_rate": 1.4689737930369916e-06,
"loss": 0.455,
"step": 954
},
{
"epoch": 0.3647822765469824,
"grad_norm": 2.5872762152510487,
"learning_rate": 1.4678806054129546e-06,
"loss": 0.4462,
"step": 955
},
{
"epoch": 0.3651642475171887,
"grad_norm": 4.1215541610904936,
"learning_rate": 1.4667867014651992e-06,
"loss": 0.5034,
"step": 956
},
{
"epoch": 0.36554621848739494,
"grad_norm": 4.878479213331288,
"learning_rate": 1.465692082868489e-06,
"loss": 0.6026,
"step": 957
},
{
"epoch": 0.36592818945760125,
"grad_norm": 6.856687402344972,
"learning_rate": 1.4645967512986817e-06,
"loss": 0.5123,
"step": 958
},
{
"epoch": 0.3663101604278075,
"grad_norm": 5.736416942272037,
"learning_rate": 1.463500708432726e-06,
"loss": 0.5408,
"step": 959
},
{
"epoch": 0.36669213139801377,
"grad_norm": 7.295068052144105,
"learning_rate": 1.46240395594866e-06,
"loss": 0.4872,
"step": 960
},
{
"epoch": 0.36707410236822,
"grad_norm": 6.789232985077266,
"learning_rate": 1.461306495525609e-06,
"loss": 0.5294,
"step": 961
},
{
"epoch": 0.3674560733384263,
"grad_norm": 10.419922076787683,
"learning_rate": 1.4602083288437807e-06,
"loss": 0.5054,
"step": 962
},
{
"epoch": 0.36783804430863254,
"grad_norm": 3.8318471946405115,
"learning_rate": 1.459109457584465e-06,
"loss": 0.6145,
"step": 963
},
{
"epoch": 0.3682200152788388,
"grad_norm": 2.7453447380279865,
"learning_rate": 1.4580098834300306e-06,
"loss": 0.4706,
"step": 964
},
{
"epoch": 0.36860198624904505,
"grad_norm": 3.3957056908352534,
"learning_rate": 1.4569096080639217e-06,
"loss": 0.4816,
"step": 965
},
{
"epoch": 0.3689839572192513,
"grad_norm": 21.643808173296733,
"learning_rate": 1.4558086331706566e-06,
"loss": 0.5029,
"step": 966
},
{
"epoch": 0.3693659281894576,
"grad_norm": 3.509279777143902,
"learning_rate": 1.4547069604358237e-06,
"loss": 0.4814,
"step": 967
},
{
"epoch": 0.3697478991596639,
"grad_norm": 2.7867056631866935,
"learning_rate": 1.4536045915460815e-06,
"loss": 0.4967,
"step": 968
},
{
"epoch": 0.37012987012987014,
"grad_norm": 5.711653548074173,
"learning_rate": 1.4525015281891525e-06,
"loss": 0.543,
"step": 969
},
{
"epoch": 0.3705118411000764,
"grad_norm": 3.753851754021124,
"learning_rate": 1.451397772053824e-06,
"loss": 0.4908,
"step": 970
},
{
"epoch": 0.37089381207028266,
"grad_norm": 3.0791284718021577,
"learning_rate": 1.450293324829942e-06,
"loss": 0.4725,
"step": 971
},
{
"epoch": 0.3712757830404889,
"grad_norm": 3.5159822093311144,
"learning_rate": 1.4491881882084122e-06,
"loss": 0.5371,
"step": 972
},
{
"epoch": 0.3716577540106952,
"grad_norm": 3.6907461066883953,
"learning_rate": 1.4480823638811957e-06,
"loss": 0.5091,
"step": 973
},
{
"epoch": 0.37203972498090143,
"grad_norm": 18.50413628464325,
"learning_rate": 1.4469758535413056e-06,
"loss": 0.5464,
"step": 974
},
{
"epoch": 0.3724216959511077,
"grad_norm": 37.857841620925036,
"learning_rate": 1.4458686588828055e-06,
"loss": 0.4892,
"step": 975
},
{
"epoch": 0.372803666921314,
"grad_norm": 4.062087723456443,
"learning_rate": 1.4447607816008073e-06,
"loss": 0.4883,
"step": 976
},
{
"epoch": 0.37318563789152026,
"grad_norm": 4.0502899507248475,
"learning_rate": 1.4436522233914676e-06,
"loss": 0.5115,
"step": 977
},
{
"epoch": 0.3735676088617265,
"grad_norm": 3.2830982872044614,
"learning_rate": 1.442542985951985e-06,
"loss": 0.5604,
"step": 978
},
{
"epoch": 0.3739495798319328,
"grad_norm": 3.7574384597214383,
"learning_rate": 1.4414330709805988e-06,
"loss": 0.4615,
"step": 979
},
{
"epoch": 0.37433155080213903,
"grad_norm": 9.626314969376834,
"learning_rate": 1.4403224801765856e-06,
"loss": 0.4969,
"step": 980
},
{
"epoch": 0.3747135217723453,
"grad_norm": 3.031009660999545,
"learning_rate": 1.439211215240256e-06,
"loss": 0.5262,
"step": 981
},
{
"epoch": 0.37509549274255155,
"grad_norm": 3.6146467716884856,
"learning_rate": 1.438099277872953e-06,
"loss": 0.5456,
"step": 982
},
{
"epoch": 0.3754774637127578,
"grad_norm": 2.5938956320763737,
"learning_rate": 1.4369866697770494e-06,
"loss": 0.5533,
"step": 983
},
{
"epoch": 0.3758594346829641,
"grad_norm": 6.142448242177804,
"learning_rate": 1.435873392655945e-06,
"loss": 0.4812,
"step": 984
},
{
"epoch": 0.3762414056531704,
"grad_norm": 3.5063299893044184,
"learning_rate": 1.4347594482140629e-06,
"loss": 0.4996,
"step": 985
},
{
"epoch": 0.37662337662337664,
"grad_norm": 3.1597222773438127,
"learning_rate": 1.4336448381568484e-06,
"loss": 0.4993,
"step": 986
},
{
"epoch": 0.3770053475935829,
"grad_norm": 4.8775854732586295,
"learning_rate": 1.4325295641907666e-06,
"loss": 0.4694,
"step": 987
},
{
"epoch": 0.37738731856378915,
"grad_norm": 13.788910507294815,
"learning_rate": 1.4314136280232983e-06,
"loss": 0.4994,
"step": 988
},
{
"epoch": 0.3777692895339954,
"grad_norm": 5.68951446399639,
"learning_rate": 1.4302970313629375e-06,
"loss": 0.5316,
"step": 989
},
{
"epoch": 0.37815126050420167,
"grad_norm": 2.690273908691655,
"learning_rate": 1.429179775919191e-06,
"loss": 0.5368,
"step": 990
},
{
"epoch": 0.3785332314744079,
"grad_norm": 3.22293202334982,
"learning_rate": 1.428061863402573e-06,
"loss": 0.4405,
"step": 991
},
{
"epoch": 0.3789152024446142,
"grad_norm": 2.495156170749678,
"learning_rate": 1.426943295524604e-06,
"loss": 0.4883,
"step": 992
},
{
"epoch": 0.3792971734148205,
"grad_norm": 7.075696664168943,
"learning_rate": 1.4258240739978073e-06,
"loss": 0.5525,
"step": 993
},
{
"epoch": 0.37967914438502676,
"grad_norm": 3.803537398333798,
"learning_rate": 1.4247042005357086e-06,
"loss": 0.5246,
"step": 994
},
{
"epoch": 0.380061115355233,
"grad_norm": 6.842575378629883,
"learning_rate": 1.4235836768528297e-06,
"loss": 0.5811,
"step": 995
},
{
"epoch": 0.3804430863254393,
"grad_norm": 3.0681376523619655,
"learning_rate": 1.4224625046646895e-06,
"loss": 0.5135,
"step": 996
},
{
"epoch": 0.38082505729564553,
"grad_norm": 3.347087827176987,
"learning_rate": 1.4213406856877983e-06,
"loss": 0.4539,
"step": 997
},
{
"epoch": 0.3812070282658518,
"grad_norm": 4.27287433479762,
"learning_rate": 1.420218221639658e-06,
"loss": 0.5221,
"step": 998
},
{
"epoch": 0.38158899923605805,
"grad_norm": 3.439342338737529,
"learning_rate": 1.4190951142387574e-06,
"loss": 0.5051,
"step": 999
},
{
"epoch": 0.3819709702062643,
"grad_norm": 3.194297131410048,
"learning_rate": 1.4179713652045701e-06,
"loss": 0.4774,
"step": 1000
},
{
"epoch": 0.38235294117647056,
"grad_norm": 13.77060808359035,
"learning_rate": 1.416846976257552e-06,
"loss": 0.3837,
"step": 1001
},
{
"epoch": 0.3827349121466769,
"grad_norm": 4.67459855390606,
"learning_rate": 1.4157219491191402e-06,
"loss": 0.4918,
"step": 1002
},
{
"epoch": 0.38311688311688313,
"grad_norm": 5.012892064717471,
"learning_rate": 1.4145962855117463e-06,
"loss": 0.5793,
"step": 1003
},
{
"epoch": 0.3834988540870894,
"grad_norm": 3.2392189698830522,
"learning_rate": 1.4134699871587583e-06,
"loss": 0.4695,
"step": 1004
},
{
"epoch": 0.38388082505729565,
"grad_norm": 3.4936570085767307,
"learning_rate": 1.4123430557845355e-06,
"loss": 0.4915,
"step": 1005
},
{
"epoch": 0.3842627960275019,
"grad_norm": 4.30960732257421,
"learning_rate": 1.4112154931144065e-06,
"loss": 0.5364,
"step": 1006
},
{
"epoch": 0.38464476699770817,
"grad_norm": 4.6293362264783475,
"learning_rate": 1.4100873008746654e-06,
"loss": 0.4841,
"step": 1007
},
{
"epoch": 0.3850267379679144,
"grad_norm": 4.359299187255675,
"learning_rate": 1.4089584807925712e-06,
"loss": 0.5463,
"step": 1008
},
{
"epoch": 0.3854087089381207,
"grad_norm": 5.72992461473088,
"learning_rate": 1.4078290345963442e-06,
"loss": 0.4207,
"step": 1009
},
{
"epoch": 0.385790679908327,
"grad_norm": 3.166354780239239,
"learning_rate": 1.4066989640151628e-06,
"loss": 0.5181,
"step": 1010
},
{
"epoch": 0.38617265087853325,
"grad_norm": 12.031522401908607,
"learning_rate": 1.4055682707791613e-06,
"loss": 0.5968,
"step": 1011
},
{
"epoch": 0.3865546218487395,
"grad_norm": 5.0095346697800265,
"learning_rate": 1.4044369566194272e-06,
"loss": 0.5062,
"step": 1012
},
{
"epoch": 0.38693659281894577,
"grad_norm": 3.876436159087894,
"learning_rate": 1.4033050232679994e-06,
"loss": 0.5436,
"step": 1013
},
{
"epoch": 0.387318563789152,
"grad_norm": 5.328253283162843,
"learning_rate": 1.4021724724578643e-06,
"loss": 0.5046,
"step": 1014
},
{
"epoch": 0.3877005347593583,
"grad_norm": 3.774429591348757,
"learning_rate": 1.4010393059229531e-06,
"loss": 0.4722,
"step": 1015
},
{
"epoch": 0.38808250572956454,
"grad_norm": 3.1480818323642943,
"learning_rate": 1.3999055253981404e-06,
"loss": 0.5092,
"step": 1016
},
{
"epoch": 0.3884644766997708,
"grad_norm": 3.365454476940935,
"learning_rate": 1.3987711326192407e-06,
"loss": 0.5091,
"step": 1017
},
{
"epoch": 0.38884644766997706,
"grad_norm": 4.863633592183512,
"learning_rate": 1.3976361293230057e-06,
"loss": 0.533,
"step": 1018
},
{
"epoch": 0.38922841864018337,
"grad_norm": 6.886090786327392,
"learning_rate": 1.3965005172471216e-06,
"loss": 0.4529,
"step": 1019
},
{
"epoch": 0.38961038961038963,
"grad_norm": 7.3423065455983325,
"learning_rate": 1.3953642981302069e-06,
"loss": 0.5695,
"step": 1020
},
{
"epoch": 0.3899923605805959,
"grad_norm": 5.385929885807556,
"learning_rate": 1.39422747371181e-06,
"loss": 0.5426,
"step": 1021
},
{
"epoch": 0.39037433155080214,
"grad_norm": 2.5061006448252496,
"learning_rate": 1.3930900457324049e-06,
"loss": 0.4652,
"step": 1022
},
{
"epoch": 0.3907563025210084,
"grad_norm": 3.441807437463952,
"learning_rate": 1.39195201593339e-06,
"loss": 0.4943,
"step": 1023
},
{
"epoch": 0.39113827349121466,
"grad_norm": 2.7967035679368677,
"learning_rate": 1.390813386057086e-06,
"loss": 0.4768,
"step": 1024
},
{
"epoch": 0.3915202444614209,
"grad_norm": 12.060161384187719,
"learning_rate": 1.3896741578467312e-06,
"loss": 0.5359,
"step": 1025
},
{
"epoch": 0.3919022154316272,
"grad_norm": 4.632740176202578,
"learning_rate": 1.3885343330464806e-06,
"loss": 0.4465,
"step": 1026
},
{
"epoch": 0.39228418640183343,
"grad_norm": 5.177448822725735,
"learning_rate": 1.3873939134014018e-06,
"loss": 0.4365,
"step": 1027
},
{
"epoch": 0.39266615737203975,
"grad_norm": 3.291902676662862,
"learning_rate": 1.3862529006574746e-06,
"loss": 0.5519,
"step": 1028
},
{
"epoch": 0.393048128342246,
"grad_norm": 18.431443968539902,
"learning_rate": 1.385111296561585e-06,
"loss": 0.5076,
"step": 1029
},
{
"epoch": 0.39343009931245226,
"grad_norm": 5.846749288635415,
"learning_rate": 1.3839691028615256e-06,
"loss": 0.4993,
"step": 1030
},
{
"epoch": 0.3938120702826585,
"grad_norm": 2.518950053819085,
"learning_rate": 1.3828263213059915e-06,
"loss": 0.5063,
"step": 1031
},
{
"epoch": 0.3941940412528648,
"grad_norm": 4.014978335278681,
"learning_rate": 1.3816829536445772e-06,
"loss": 0.5106,
"step": 1032
},
{
"epoch": 0.39457601222307104,
"grad_norm": 2.3272006995407195,
"learning_rate": 1.380539001627775e-06,
"loss": 0.443,
"step": 1033
},
{
"epoch": 0.3949579831932773,
"grad_norm": 3.199604096797627,
"learning_rate": 1.3793944670069722e-06,
"loss": 0.5355,
"step": 1034
},
{
"epoch": 0.39533995416348355,
"grad_norm": 16.726178083364527,
"learning_rate": 1.3782493515344475e-06,
"loss": 0.3989,
"step": 1035
},
{
"epoch": 0.39572192513368987,
"grad_norm": 4.009218609638227,
"learning_rate": 1.377103656963369e-06,
"loss": 0.5018,
"step": 1036
},
{
"epoch": 0.3961038961038961,
"grad_norm": 4.280555960177796,
"learning_rate": 1.3759573850477912e-06,
"loss": 0.4962,
"step": 1037
},
{
"epoch": 0.3964858670741024,
"grad_norm": 4.8440714534516465,
"learning_rate": 1.3748105375426529e-06,
"loss": 0.5608,
"step": 1038
},
{
"epoch": 0.39686783804430864,
"grad_norm": 4.633871486245415,
"learning_rate": 1.3736631162037742e-06,
"loss": 0.518,
"step": 1039
},
{
"epoch": 0.3972498090145149,
"grad_norm": 4.724392552772769,
"learning_rate": 1.3725151227878535e-06,
"loss": 0.5363,
"step": 1040
},
{
"epoch": 0.39763177998472116,
"grad_norm": 3.818997639249547,
"learning_rate": 1.3713665590524646e-06,
"loss": 0.5116,
"step": 1041
},
{
"epoch": 0.3980137509549274,
"grad_norm": 4.386142417291277,
"learning_rate": 1.3702174267560556e-06,
"loss": 0.5294,
"step": 1042
},
{
"epoch": 0.3983957219251337,
"grad_norm": 4.4357329961026934,
"learning_rate": 1.369067727657944e-06,
"loss": 0.5314,
"step": 1043
},
{
"epoch": 0.39877769289533993,
"grad_norm": 21.471149918132554,
"learning_rate": 1.3679174635183153e-06,
"loss": 0.5344,
"step": 1044
},
{
"epoch": 0.39915966386554624,
"grad_norm": 2.8010450710974797,
"learning_rate": 1.3667666360982203e-06,
"loss": 0.4522,
"step": 1045
},
{
"epoch": 0.3995416348357525,
"grad_norm": 3.5143617209461953,
"learning_rate": 1.3656152471595728e-06,
"loss": 0.4183,
"step": 1046
},
{
"epoch": 0.39992360580595876,
"grad_norm": 8.181944527126483,
"learning_rate": 1.3644632984651448e-06,
"loss": 0.4511,
"step": 1047
},
{
"epoch": 0.400305576776165,
"grad_norm": 5.379745216859139,
"learning_rate": 1.3633107917785664e-06,
"loss": 0.421,
"step": 1048
},
{
"epoch": 0.4006875477463713,
"grad_norm": 2.4402346546106646,
"learning_rate": 1.3621577288643216e-06,
"loss": 0.4628,
"step": 1049
},
{
"epoch": 0.40106951871657753,
"grad_norm": 3.9850365443403932,
"learning_rate": 1.361004111487746e-06,
"loss": 0.4997,
"step": 1050
},
{
"epoch": 0.4014514896867838,
"grad_norm": 2.704222372248545,
"learning_rate": 1.3598499414150246e-06,
"loss": 0.5521,
"step": 1051
},
{
"epoch": 0.40183346065699005,
"grad_norm": 2.8845884083437605,
"learning_rate": 1.3586952204131872e-06,
"loss": 0.5612,
"step": 1052
},
{
"epoch": 0.4022154316271963,
"grad_norm": 8.402444586867139,
"learning_rate": 1.3575399502501087e-06,
"loss": 0.5217,
"step": 1053
},
{
"epoch": 0.4025974025974026,
"grad_norm": 10.742726804172774,
"learning_rate": 1.356384132694504e-06,
"loss": 0.5248,
"step": 1054
},
{
"epoch": 0.4029793735676089,
"grad_norm": 6.63035505780104,
"learning_rate": 1.3552277695159263e-06,
"loss": 0.4687,
"step": 1055
},
{
"epoch": 0.40336134453781514,
"grad_norm": 3.0647578675508713,
"learning_rate": 1.3540708624847627e-06,
"loss": 0.4879,
"step": 1056
},
{
"epoch": 0.4037433155080214,
"grad_norm": 3.245881096555954,
"learning_rate": 1.3529134133722357e-06,
"loss": 0.4494,
"step": 1057
},
{
"epoch": 0.40412528647822765,
"grad_norm": 3.162026694383143,
"learning_rate": 1.3517554239503954e-06,
"loss": 0.477,
"step": 1058
},
{
"epoch": 0.4045072574484339,
"grad_norm": 2.3227326186227257,
"learning_rate": 1.3505968959921203e-06,
"loss": 0.4746,
"step": 1059
},
{
"epoch": 0.40488922841864017,
"grad_norm": 2.766756314398787,
"learning_rate": 1.3494378312711127e-06,
"loss": 0.5027,
"step": 1060
},
{
"epoch": 0.4052711993888464,
"grad_norm": 3.7607550125463973,
"learning_rate": 1.348278231561897e-06,
"loss": 0.4389,
"step": 1061
},
{
"epoch": 0.40565317035905274,
"grad_norm": 7.409561698492238,
"learning_rate": 1.3471180986398171e-06,
"loss": 0.5558,
"step": 1062
},
{
"epoch": 0.406035141329259,
"grad_norm": 3.782115767604086,
"learning_rate": 1.3459574342810323e-06,
"loss": 0.482,
"step": 1063
},
{
"epoch": 0.40641711229946526,
"grad_norm": 4.512926298981771,
"learning_rate": 1.3447962402625162e-06,
"loss": 0.5108,
"step": 1064
},
{
"epoch": 0.4067990832696715,
"grad_norm": 4.105292005167865,
"learning_rate": 1.3436345183620534e-06,
"loss": 0.5059,
"step": 1065
},
{
"epoch": 0.40718105423987777,
"grad_norm": 6.221311275182224,
"learning_rate": 1.3424722703582361e-06,
"loss": 0.4603,
"step": 1066
},
{
"epoch": 0.40756302521008403,
"grad_norm": 4.414167751227817,
"learning_rate": 1.3413094980304624e-06,
"loss": 0.5248,
"step": 1067
},
{
"epoch": 0.4079449961802903,
"grad_norm": 12.937308649413591,
"learning_rate": 1.340146203158933e-06,
"loss": 0.4323,
"step": 1068
},
{
"epoch": 0.40832696715049654,
"grad_norm": 5.121224260075271,
"learning_rate": 1.338982387524649e-06,
"loss": 0.5008,
"step": 1069
},
{
"epoch": 0.4087089381207028,
"grad_norm": 3.972389452208772,
"learning_rate": 1.337818052909408e-06,
"loss": 0.5284,
"step": 1070
},
{
"epoch": 0.4090909090909091,
"grad_norm": 3.4747430919143008,
"learning_rate": 1.3366532010958026e-06,
"loss": 0.5428,
"step": 1071
},
{
"epoch": 0.4094728800611154,
"grad_norm": 2.7257266623536758,
"learning_rate": 1.3354878338672175e-06,
"loss": 0.4877,
"step": 1072
},
{
"epoch": 0.40985485103132163,
"grad_norm": 3.5251937039902557,
"learning_rate": 1.3343219530078262e-06,
"loss": 0.5642,
"step": 1073
},
{
"epoch": 0.4102368220015279,
"grad_norm": 2.655070455453984,
"learning_rate": 1.3331555603025883e-06,
"loss": 0.505,
"step": 1074
},
{
"epoch": 0.41061879297173415,
"grad_norm": 3.4577485873396485,
"learning_rate": 1.3319886575372473e-06,
"loss": 0.4826,
"step": 1075
},
{
"epoch": 0.4110007639419404,
"grad_norm": 2.8641674357455105,
"learning_rate": 1.3308212464983281e-06,
"loss": 0.495,
"step": 1076
},
{
"epoch": 0.41138273491214666,
"grad_norm": 5.807472208222438,
"learning_rate": 1.3296533289731331e-06,
"loss": 0.5409,
"step": 1077
},
{
"epoch": 0.4117647058823529,
"grad_norm": 4.120319959181346,
"learning_rate": 1.3284849067497397e-06,
"loss": 0.513,
"step": 1078
},
{
"epoch": 0.4121466768525592,
"grad_norm": 2.579678150552606,
"learning_rate": 1.3273159816169996e-06,
"loss": 0.4294,
"step": 1079
},
{
"epoch": 0.4125286478227655,
"grad_norm": 2.268653454252923,
"learning_rate": 1.3261465553645328e-06,
"loss": 0.432,
"step": 1080
},
{
"epoch": 0.41291061879297175,
"grad_norm": 3.025464669627552,
"learning_rate": 1.3249766297827273e-06,
"loss": 0.4897,
"step": 1081
},
{
"epoch": 0.413292589763178,
"grad_norm": 2.581761258324958,
"learning_rate": 1.3238062066627355e-06,
"loss": 0.488,
"step": 1082
},
{
"epoch": 0.41367456073338427,
"grad_norm": 5.55222952808423,
"learning_rate": 1.3226352877964715e-06,
"loss": 0.5,
"step": 1083
},
{
"epoch": 0.4140565317035905,
"grad_norm": 3.507013858142906,
"learning_rate": 1.3214638749766084e-06,
"loss": 0.4684,
"step": 1084
},
{
"epoch": 0.4144385026737968,
"grad_norm": 8.364119968396363,
"learning_rate": 1.3202919699965756e-06,
"loss": 0.5217,
"step": 1085
},
{
"epoch": 0.41482047364400304,
"grad_norm": 6.241126374949574,
"learning_rate": 1.3191195746505555e-06,
"loss": 0.5791,
"step": 1086
},
{
"epoch": 0.4152024446142093,
"grad_norm": 5.717811993856367,
"learning_rate": 1.3179466907334815e-06,
"loss": 0.5686,
"step": 1087
},
{
"epoch": 0.4155844155844156,
"grad_norm": 16.33849161999853,
"learning_rate": 1.316773320041036e-06,
"loss": 0.4645,
"step": 1088
},
{
"epoch": 0.41596638655462187,
"grad_norm": 4.365623914056201,
"learning_rate": 1.315599464369645e-06,
"loss": 0.5383,
"step": 1089
},
{
"epoch": 0.4163483575248281,
"grad_norm": 6.688011636580966,
"learning_rate": 1.3144251255164784e-06,
"loss": 0.4655,
"step": 1090
},
{
"epoch": 0.4167303284950344,
"grad_norm": 6.08767403439254,
"learning_rate": 1.313250305279445e-06,
"loss": 0.491,
"step": 1091
},
{
"epoch": 0.41711229946524064,
"grad_norm": 4.5512808754921465,
"learning_rate": 1.3120750054571904e-06,
"loss": 0.4797,
"step": 1092
},
{
"epoch": 0.4174942704354469,
"grad_norm": 3.6174836414142613,
"learning_rate": 1.3108992278490955e-06,
"loss": 0.5254,
"step": 1093
},
{
"epoch": 0.41787624140565316,
"grad_norm": 2.5776694158723545,
"learning_rate": 1.309722974255272e-06,
"loss": 0.4617,
"step": 1094
},
{
"epoch": 0.4182582123758594,
"grad_norm": 3.392763958571057,
"learning_rate": 1.30854624647656e-06,
"loss": 0.5418,
"step": 1095
},
{
"epoch": 0.4186401833460657,
"grad_norm": 4.80249052847419,
"learning_rate": 1.3073690463145265e-06,
"loss": 0.511,
"step": 1096
},
{
"epoch": 0.419022154316272,
"grad_norm": 2.063868288679112,
"learning_rate": 1.3061913755714608e-06,
"loss": 0.4279,
"step": 1097
},
{
"epoch": 0.41940412528647825,
"grad_norm": 4.44752038172148,
"learning_rate": 1.3050132360503734e-06,
"loss": 0.5067,
"step": 1098
},
{
"epoch": 0.4197860962566845,
"grad_norm": 2.43079310683215,
"learning_rate": 1.3038346295549917e-06,
"loss": 0.5036,
"step": 1099
},
{
"epoch": 0.42016806722689076,
"grad_norm": 3.087986254246443,
"learning_rate": 1.3026555578897593e-06,
"loss": 0.4194,
"step": 1100
},
{
"epoch": 0.420550038197097,
"grad_norm": 2.861068330916895,
"learning_rate": 1.3014760228598301e-06,
"loss": 0.4778,
"step": 1101
},
{
"epoch": 0.4209320091673033,
"grad_norm": 3.1273940487669885,
"learning_rate": 1.3002960262710692e-06,
"loss": 0.4957,
"step": 1102
},
{
"epoch": 0.42131398013750954,
"grad_norm": 11.09705122642753,
"learning_rate": 1.2991155699300475e-06,
"loss": 0.4412,
"step": 1103
},
{
"epoch": 0.4216959511077158,
"grad_norm": 2.7211661339062307,
"learning_rate": 1.2979346556440395e-06,
"loss": 0.5002,
"step": 1104
},
{
"epoch": 0.42207792207792205,
"grad_norm": 2.7058097762325817,
"learning_rate": 1.296753285221022e-06,
"loss": 0.4455,
"step": 1105
},
{
"epoch": 0.42245989304812837,
"grad_norm": 3.33956849912416,
"learning_rate": 1.2955714604696691e-06,
"loss": 0.4393,
"step": 1106
},
{
"epoch": 0.4228418640183346,
"grad_norm": 4.30714280374409,
"learning_rate": 1.2943891831993501e-06,
"loss": 0.5359,
"step": 1107
},
{
"epoch": 0.4232238349885409,
"grad_norm": 3.737944346749512,
"learning_rate": 1.2932064552201284e-06,
"loss": 0.5078,
"step": 1108
},
{
"epoch": 0.42360580595874714,
"grad_norm": 5.440523550465423,
"learning_rate": 1.2920232783427566e-06,
"loss": 0.4403,
"step": 1109
},
{
"epoch": 0.4239877769289534,
"grad_norm": 3.729701007104042,
"learning_rate": 1.2908396543786746e-06,
"loss": 0.4342,
"step": 1110
},
{
"epoch": 0.42436974789915966,
"grad_norm": 15.43994590063205,
"learning_rate": 1.289655585140007e-06,
"loss": 0.4321,
"step": 1111
},
{
"epoch": 0.4247517188693659,
"grad_norm": 6.587078251439573,
"learning_rate": 1.28847107243956e-06,
"loss": 0.495,
"step": 1112
},
{
"epoch": 0.42513368983957217,
"grad_norm": 2.9102958858974572,
"learning_rate": 1.2872861180908193e-06,
"loss": 0.4504,
"step": 1113
},
{
"epoch": 0.4255156608097785,
"grad_norm": 3.945596990949721,
"learning_rate": 1.2861007239079452e-06,
"loss": 0.5611,
"step": 1114
},
{
"epoch": 0.42589763177998474,
"grad_norm": 11.583149332286014,
"learning_rate": 1.2849148917057734e-06,
"loss": 0.6082,
"step": 1115
},
{
"epoch": 0.426279602750191,
"grad_norm": 3.2093573724227946,
"learning_rate": 1.2837286232998086e-06,
"loss": 0.5386,
"step": 1116
},
{
"epoch": 0.42666157372039726,
"grad_norm": 4.195111563739433,
"learning_rate": 1.2825419205062242e-06,
"loss": 0.4815,
"step": 1117
},
{
"epoch": 0.4270435446906035,
"grad_norm": 4.241364490014807,
"learning_rate": 1.2813547851418587e-06,
"loss": 0.484,
"step": 1118
},
{
"epoch": 0.4274255156608098,
"grad_norm": 12.638732356072397,
"learning_rate": 1.2801672190242118e-06,
"loss": 0.506,
"step": 1119
},
{
"epoch": 0.42780748663101603,
"grad_norm": 3.4114092408838244,
"learning_rate": 1.2789792239714442e-06,
"loss": 0.4662,
"step": 1120
},
{
"epoch": 0.4281894576012223,
"grad_norm": 5.375163307961225,
"learning_rate": 1.2777908018023721e-06,
"loss": 0.5754,
"step": 1121
},
{
"epoch": 0.42857142857142855,
"grad_norm": 4.0498153971710344,
"learning_rate": 1.2766019543364666e-06,
"loss": 0.5991,
"step": 1122
},
{
"epoch": 0.42895339954163486,
"grad_norm": 5.283353271896474,
"learning_rate": 1.2754126833938484e-06,
"loss": 0.4709,
"step": 1123
},
{
"epoch": 0.4293353705118411,
"grad_norm": 2.9423837625564113,
"learning_rate": 1.2742229907952883e-06,
"loss": 0.4637,
"step": 1124
},
{
"epoch": 0.4297173414820474,
"grad_norm": 4.802382213000816,
"learning_rate": 1.2730328783622023e-06,
"loss": 0.5025,
"step": 1125
},
{
"epoch": 0.43009931245225363,
"grad_norm": 4.729613483707908,
"learning_rate": 1.2718423479166476e-06,
"loss": 0.5789,
"step": 1126
},
{
"epoch": 0.4304812834224599,
"grad_norm": 5.930344033965177,
"learning_rate": 1.270651401281323e-06,
"loss": 0.4907,
"step": 1127
},
{
"epoch": 0.43086325439266615,
"grad_norm": 6.399249174579611,
"learning_rate": 1.2694600402795643e-06,
"loss": 0.5434,
"step": 1128
},
{
"epoch": 0.4312452253628724,
"grad_norm": 4.725874325597204,
"learning_rate": 1.2682682667353414e-06,
"loss": 0.5078,
"step": 1129
},
{
"epoch": 0.43162719633307867,
"grad_norm": 3.127784542081372,
"learning_rate": 1.267076082473255e-06,
"loss": 0.5306,
"step": 1130
},
{
"epoch": 0.4320091673032849,
"grad_norm": 3.8951466721721224,
"learning_rate": 1.2658834893185364e-06,
"loss": 0.5297,
"step": 1131
},
{
"epoch": 0.43239113827349124,
"grad_norm": 5.319632220306428,
"learning_rate": 1.2646904890970419e-06,
"loss": 0.5037,
"step": 1132
},
{
"epoch": 0.4327731092436975,
"grad_norm": 12.156586498156457,
"learning_rate": 1.26349708363525e-06,
"loss": 0.4712,
"step": 1133
},
{
"epoch": 0.43315508021390375,
"grad_norm": 3.6579913095297862,
"learning_rate": 1.2623032747602618e-06,
"loss": 0.5496,
"step": 1134
},
{
"epoch": 0.43353705118411,
"grad_norm": 34.59162241017071,
"learning_rate": 1.2611090642997941e-06,
"loss": 0.482,
"step": 1135
},
{
"epoch": 0.43391902215431627,
"grad_norm": 4.465393830779184,
"learning_rate": 1.25991445408218e-06,
"loss": 0.4429,
"step": 1136
},
{
"epoch": 0.4343009931245225,
"grad_norm": 5.2710706198103106,
"learning_rate": 1.2587194459363634e-06,
"loss": 0.4629,
"step": 1137
},
{
"epoch": 0.4346829640947288,
"grad_norm": 5.0416569180088775,
"learning_rate": 1.2575240416918981e-06,
"loss": 0.5584,
"step": 1138
},
{
"epoch": 0.43506493506493504,
"grad_norm": 4.160767116481659,
"learning_rate": 1.2563282431789446e-06,
"loss": 0.5367,
"step": 1139
},
{
"epoch": 0.43544690603514136,
"grad_norm": 2.4278904417735303,
"learning_rate": 1.255132052228266e-06,
"loss": 0.4322,
"step": 1140
},
{
"epoch": 0.4358288770053476,
"grad_norm": 2.4236151260570815,
"learning_rate": 1.2539354706712273e-06,
"loss": 0.4969,
"step": 1141
},
{
"epoch": 0.4362108479755539,
"grad_norm": 12.539260411009332,
"learning_rate": 1.2527385003397906e-06,
"loss": 0.5147,
"step": 1142
},
{
"epoch": 0.43659281894576013,
"grad_norm": 4.294317064309806,
"learning_rate": 1.2515411430665142e-06,
"loss": 0.5108,
"step": 1143
},
{
"epoch": 0.4369747899159664,
"grad_norm": 7.562652888743942,
"learning_rate": 1.2503434006845481e-06,
"loss": 0.537,
"step": 1144
},
{
"epoch": 0.43735676088617265,
"grad_norm": 3.4712632895110094,
"learning_rate": 1.2491452750276317e-06,
"loss": 0.4542,
"step": 1145
},
{
"epoch": 0.4377387318563789,
"grad_norm": 5.924873661240008,
"learning_rate": 1.2479467679300922e-06,
"loss": 0.4853,
"step": 1146
},
{
"epoch": 0.43812070282658516,
"grad_norm": 5.020685434497532,
"learning_rate": 1.2467478812268395e-06,
"loss": 0.4446,
"step": 1147
},
{
"epoch": 0.4385026737967914,
"grad_norm": 4.134680076848503,
"learning_rate": 1.2455486167533657e-06,
"loss": 0.5461,
"step": 1148
},
{
"epoch": 0.43888464476699773,
"grad_norm": 3.7379112252874798,
"learning_rate": 1.2443489763457412e-06,
"loss": 0.4621,
"step": 1149
},
{
"epoch": 0.439266615737204,
"grad_norm": 3.151380948128957,
"learning_rate": 1.243148961840611e-06,
"loss": 0.4803,
"step": 1150
},
{
"epoch": 0.43964858670741025,
"grad_norm": 3.8402582005785755,
"learning_rate": 1.241948575075194e-06,
"loss": 0.4318,
"step": 1151
},
{
"epoch": 0.4400305576776165,
"grad_norm": 4.3539310487694785,
"learning_rate": 1.2407478178872778e-06,
"loss": 0.5722,
"step": 1152
},
{
"epoch": 0.44041252864782277,
"grad_norm": 85.6797164911384,
"learning_rate": 1.2395466921152186e-06,
"loss": 0.4572,
"step": 1153
},
{
"epoch": 0.440794499618029,
"grad_norm": 2.984376733858304,
"learning_rate": 1.2383451995979358e-06,
"loss": 0.4593,
"step": 1154
},
{
"epoch": 0.4411764705882353,
"grad_norm": 26.434417170354024,
"learning_rate": 1.2371433421749111e-06,
"loss": 0.5017,
"step": 1155
},
{
"epoch": 0.44155844155844154,
"grad_norm": 4.189293243485018,
"learning_rate": 1.2359411216861834e-06,
"loss": 0.4995,
"step": 1156
},
{
"epoch": 0.4419404125286478,
"grad_norm": 3.477942627441066,
"learning_rate": 1.2347385399723493e-06,
"loss": 0.4496,
"step": 1157
},
{
"epoch": 0.4423223834988541,
"grad_norm": 4.821052532779218,
"learning_rate": 1.2335355988745576e-06,
"loss": 0.5236,
"step": 1158
},
{
"epoch": 0.44270435446906037,
"grad_norm": 1.9972719363055844,
"learning_rate": 1.2323323002345067e-06,
"loss": 0.4232,
"step": 1159
},
{
"epoch": 0.4430863254392666,
"grad_norm": 5.052703891846589,
"learning_rate": 1.2311286458944433e-06,
"loss": 0.6035,
"step": 1160
},
{
"epoch": 0.4434682964094729,
"grad_norm": 3.4310540089602064,
"learning_rate": 1.229924637697158e-06,
"loss": 0.5135,
"step": 1161
},
{
"epoch": 0.44385026737967914,
"grad_norm": 11.968302863287425,
"learning_rate": 1.2287202774859845e-06,
"loss": 0.5268,
"step": 1162
},
{
"epoch": 0.4442322383498854,
"grad_norm": 6.062066545185635,
"learning_rate": 1.2275155671047936e-06,
"loss": 0.5321,
"step": 1163
},
{
"epoch": 0.44461420932009166,
"grad_norm": 3.0589899077634497,
"learning_rate": 1.226310508397993e-06,
"loss": 0.5545,
"step": 1164
},
{
"epoch": 0.4449961802902979,
"grad_norm": 3.0982702971013514,
"learning_rate": 1.2251051032105244e-06,
"loss": 0.5269,
"step": 1165
},
{
"epoch": 0.44537815126050423,
"grad_norm": 4.013047995670636,
"learning_rate": 1.2238993533878589e-06,
"loss": 0.5273,
"step": 1166
},
{
"epoch": 0.4457601222307105,
"grad_norm": 7.539763111242043,
"learning_rate": 1.2226932607759955e-06,
"loss": 0.5206,
"step": 1167
},
{
"epoch": 0.44614209320091675,
"grad_norm": 2.9559753054331424,
"learning_rate": 1.2214868272214585e-06,
"loss": 0.538,
"step": 1168
},
{
"epoch": 0.446524064171123,
"grad_norm": 4.4700657838149676,
"learning_rate": 1.2202800545712935e-06,
"loss": 0.5165,
"step": 1169
},
{
"epoch": 0.44690603514132926,
"grad_norm": 3.4528057031524275,
"learning_rate": 1.219072944673066e-06,
"loss": 0.5843,
"step": 1170
},
{
"epoch": 0.4472880061115355,
"grad_norm": 12.836652899377839,
"learning_rate": 1.2178654993748567e-06,
"loss": 0.5116,
"step": 1171
},
{
"epoch": 0.4476699770817418,
"grad_norm": 3.953361297445188,
"learning_rate": 1.2166577205252615e-06,
"loss": 0.4924,
"step": 1172
},
{
"epoch": 0.44805194805194803,
"grad_norm": 3.5127667403238614,
"learning_rate": 1.2154496099733854e-06,
"loss": 0.4392,
"step": 1173
},
{
"epoch": 0.4484339190221543,
"grad_norm": 2.6671312447026296,
"learning_rate": 1.2142411695688414e-06,
"loss": 0.5055,
"step": 1174
},
{
"epoch": 0.4488158899923606,
"grad_norm": 3.137496881253067,
"learning_rate": 1.2130324011617488e-06,
"loss": 0.5346,
"step": 1175
},
{
"epoch": 0.44919786096256686,
"grad_norm": 8.708672027129488,
"learning_rate": 1.2118233066027277e-06,
"loss": 0.4389,
"step": 1176
},
{
"epoch": 0.4495798319327731,
"grad_norm": 3.3420365876445635,
"learning_rate": 1.210613887742898e-06,
"loss": 0.5132,
"step": 1177
},
{
"epoch": 0.4499618029029794,
"grad_norm": 5.556852945541378,
"learning_rate": 1.2094041464338761e-06,
"loss": 0.4858,
"step": 1178
},
{
"epoch": 0.45034377387318564,
"grad_norm": 3.6150252098952342,
"learning_rate": 1.2081940845277725e-06,
"loss": 0.4491,
"step": 1179
},
{
"epoch": 0.4507257448433919,
"grad_norm": 2.339849832497659,
"learning_rate": 1.2069837038771876e-06,
"loss": 0.4443,
"step": 1180
},
{
"epoch": 0.45110771581359815,
"grad_norm": 4.6405994630916,
"learning_rate": 1.205773006335211e-06,
"loss": 0.5103,
"step": 1181
},
{
"epoch": 0.4514896867838044,
"grad_norm": 3.5040573388223177,
"learning_rate": 1.204561993755416e-06,
"loss": 0.5253,
"step": 1182
},
{
"epoch": 0.45187165775401067,
"grad_norm": 4.02406379001709,
"learning_rate": 1.2033506679918594e-06,
"loss": 0.4484,
"step": 1183
},
{
"epoch": 0.452253628724217,
"grad_norm": 3.388381395227733,
"learning_rate": 1.2021390308990768e-06,
"loss": 0.5967,
"step": 1184
},
{
"epoch": 0.45263559969442324,
"grad_norm": 6.444932583069334,
"learning_rate": 1.2009270843320816e-06,
"loss": 0.4168,
"step": 1185
},
{
"epoch": 0.4530175706646295,
"grad_norm": 3.593703564283377,
"learning_rate": 1.199714830146359e-06,
"loss": 0.4991,
"step": 1186
},
{
"epoch": 0.45339954163483576,
"grad_norm": 4.728487675258837,
"learning_rate": 1.198502270197867e-06,
"loss": 0.4914,
"step": 1187
},
{
"epoch": 0.453781512605042,
"grad_norm": 6.306712452964065,
"learning_rate": 1.1972894063430308e-06,
"loss": 0.461,
"step": 1188
},
{
"epoch": 0.4541634835752483,
"grad_norm": 7.151973114117311,
"learning_rate": 1.1960762404387413e-06,
"loss": 0.4825,
"step": 1189
},
{
"epoch": 0.45454545454545453,
"grad_norm": 10.55885615899538,
"learning_rate": 1.194862774342351e-06,
"loss": 0.4894,
"step": 1190
},
{
"epoch": 0.4549274255156608,
"grad_norm": 3.9313808891061663,
"learning_rate": 1.1936490099116735e-06,
"loss": 0.4786,
"step": 1191
},
{
"epoch": 0.4553093964858671,
"grad_norm": 3.476901809122921,
"learning_rate": 1.192434949004978e-06,
"loss": 0.5184,
"step": 1192
},
{
"epoch": 0.45569136745607336,
"grad_norm": 5.668372530085146,
"learning_rate": 1.1912205934809876e-06,
"loss": 0.5004,
"step": 1193
},
{
"epoch": 0.4560733384262796,
"grad_norm": 6.4296870499289405,
"learning_rate": 1.190005945198877e-06,
"loss": 0.5397,
"step": 1194
},
{
"epoch": 0.4564553093964859,
"grad_norm": 4.725008296529506,
"learning_rate": 1.1887910060182692e-06,
"loss": 0.4638,
"step": 1195
},
{
"epoch": 0.45683728036669213,
"grad_norm": 2.7978312243428793,
"learning_rate": 1.1875757777992313e-06,
"loss": 0.5007,
"step": 1196
},
{
"epoch": 0.4572192513368984,
"grad_norm": 2.4020090664757068,
"learning_rate": 1.1863602624022742e-06,
"loss": 0.4408,
"step": 1197
},
{
"epoch": 0.45760122230710465,
"grad_norm": 4.360547237284794,
"learning_rate": 1.1851444616883486e-06,
"loss": 0.4635,
"step": 1198
},
{
"epoch": 0.4579831932773109,
"grad_norm": 2.9512084782993777,
"learning_rate": 1.183928377518841e-06,
"loss": 0.5623,
"step": 1199
},
{
"epoch": 0.45836516424751717,
"grad_norm": 6.979742443985152,
"learning_rate": 1.1827120117555728e-06,
"loss": 0.5724,
"step": 1200
},
{
"epoch": 0.4587471352177235,
"grad_norm": 5.730731417502142,
"learning_rate": 1.181495366260796e-06,
"loss": 0.4598,
"step": 1201
},
{
"epoch": 0.45912910618792974,
"grad_norm": 3.0871943137221187,
"learning_rate": 1.1802784428971903e-06,
"loss": 0.542,
"step": 1202
},
{
"epoch": 0.459511077158136,
"grad_norm": 2.8116100600100378,
"learning_rate": 1.1790612435278627e-06,
"loss": 0.4651,
"step": 1203
},
{
"epoch": 0.45989304812834225,
"grad_norm": 5.698967179605214,
"learning_rate": 1.1778437700163407e-06,
"loss": 0.5649,
"step": 1204
},
{
"epoch": 0.4602750190985485,
"grad_norm": 3.1755623138697575,
"learning_rate": 1.1766260242265725e-06,
"loss": 0.5049,
"step": 1205
},
{
"epoch": 0.46065699006875477,
"grad_norm": 5.599555390928325,
"learning_rate": 1.175408008022923e-06,
"loss": 0.4122,
"step": 1206
},
{
"epoch": 0.461038961038961,
"grad_norm": 3.580170190977131,
"learning_rate": 1.1741897232701713e-06,
"loss": 0.5239,
"step": 1207
},
{
"epoch": 0.4614209320091673,
"grad_norm": 14.971991557910343,
"learning_rate": 1.1729711718335075e-06,
"loss": 0.479,
"step": 1208
},
{
"epoch": 0.46180290297937354,
"grad_norm": 2.880404350454197,
"learning_rate": 1.1717523555785303e-06,
"loss": 0.4886,
"step": 1209
},
{
"epoch": 0.46218487394957986,
"grad_norm": 7.781190797017161,
"learning_rate": 1.1705332763712427e-06,
"loss": 0.5012,
"step": 1210
},
{
"epoch": 0.4625668449197861,
"grad_norm": 2.4434443535930597,
"learning_rate": 1.1693139360780517e-06,
"loss": 0.4754,
"step": 1211
},
{
"epoch": 0.46294881588999237,
"grad_norm": 45.20097004657449,
"learning_rate": 1.168094336565763e-06,
"loss": 0.4677,
"step": 1212
},
{
"epoch": 0.46333078686019863,
"grad_norm": 2.145571295650126,
"learning_rate": 1.1668744797015799e-06,
"loss": 0.4429,
"step": 1213
},
{
"epoch": 0.4637127578304049,
"grad_norm": 2.751742226788619,
"learning_rate": 1.1656543673530993e-06,
"loss": 0.4968,
"step": 1214
},
{
"epoch": 0.46409472880061114,
"grad_norm": 2.909178017099473,
"learning_rate": 1.1644340013883094e-06,
"loss": 0.4326,
"step": 1215
},
{
"epoch": 0.4644766997708174,
"grad_norm": 5.284530413482173,
"learning_rate": 1.1632133836755864e-06,
"loss": 0.4681,
"step": 1216
},
{
"epoch": 0.46485867074102366,
"grad_norm": 2.763154155064458,
"learning_rate": 1.1619925160836924e-06,
"loss": 0.538,
"step": 1217
},
{
"epoch": 0.46524064171123,
"grad_norm": 4.0912500612091005,
"learning_rate": 1.1607714004817716e-06,
"loss": 0.5096,
"step": 1218
},
{
"epoch": 0.46562261268143623,
"grad_norm": 3.987704457481841,
"learning_rate": 1.1595500387393479e-06,
"loss": 0.4758,
"step": 1219
},
{
"epoch": 0.4660045836516425,
"grad_norm": 6.550222662482566,
"learning_rate": 1.1583284327263225e-06,
"loss": 0.4638,
"step": 1220
},
{
"epoch": 0.46638655462184875,
"grad_norm": 2.4034258022797865,
"learning_rate": 1.1571065843129707e-06,
"loss": 0.4397,
"step": 1221
},
{
"epoch": 0.466768525592055,
"grad_norm": 3.6897113611530905,
"learning_rate": 1.1558844953699378e-06,
"loss": 0.5763,
"step": 1222
},
{
"epoch": 0.46715049656226126,
"grad_norm": 4.339232174448057,
"learning_rate": 1.1546621677682381e-06,
"loss": 0.5573,
"step": 1223
},
{
"epoch": 0.4675324675324675,
"grad_norm": 3.203860802966495,
"learning_rate": 1.1534396033792523e-06,
"loss": 0.4413,
"step": 1224
},
{
"epoch": 0.4679144385026738,
"grad_norm": 2.628311025178851,
"learning_rate": 1.1522168040747216e-06,
"loss": 0.5194,
"step": 1225
},
{
"epoch": 0.46829640947288004,
"grad_norm": 32.48635309573544,
"learning_rate": 1.1509937717267482e-06,
"loss": 0.5325,
"step": 1226
},
{
"epoch": 0.46867838044308635,
"grad_norm": 6.698122924808716,
"learning_rate": 1.1497705082077903e-06,
"loss": 0.5143,
"step": 1227
},
{
"epoch": 0.4690603514132926,
"grad_norm": 2.9475576684385243,
"learning_rate": 1.1485470153906612e-06,
"loss": 0.4947,
"step": 1228
},
{
"epoch": 0.46944232238349887,
"grad_norm": 3.3350127815414723,
"learning_rate": 1.147323295148524e-06,
"loss": 0.4642,
"step": 1229
},
{
"epoch": 0.4698242933537051,
"grad_norm": 8.905333506931377,
"learning_rate": 1.1460993493548905e-06,
"loss": 0.5154,
"step": 1230
},
{
"epoch": 0.4702062643239114,
"grad_norm": 2.5257033406933744,
"learning_rate": 1.144875179883618e-06,
"loss": 0.4871,
"step": 1231
},
{
"epoch": 0.47058823529411764,
"grad_norm": 4.53327072940148,
"learning_rate": 1.143650788608906e-06,
"loss": 0.4972,
"step": 1232
},
{
"epoch": 0.4709702062643239,
"grad_norm": 5.160150274947944,
"learning_rate": 1.1424261774052939e-06,
"loss": 0.5123,
"step": 1233
},
{
"epoch": 0.47135217723453016,
"grad_norm": 3.0497075066239168,
"learning_rate": 1.1412013481476571e-06,
"loss": 0.5342,
"step": 1234
},
{
"epoch": 0.4717341482047364,
"grad_norm": 3.352396281245782,
"learning_rate": 1.1399763027112056e-06,
"loss": 0.4841,
"step": 1235
},
{
"epoch": 0.4721161191749427,
"grad_norm": 2.4010700114984025,
"learning_rate": 1.13875104297148e-06,
"loss": 0.4476,
"step": 1236
},
{
"epoch": 0.472498090145149,
"grad_norm": 2.1444364956763433,
"learning_rate": 1.137525570804349e-06,
"loss": 0.4792,
"step": 1237
},
{
"epoch": 0.47288006111535524,
"grad_norm": 2.4997522044099676,
"learning_rate": 1.1362998880860065e-06,
"loss": 0.4552,
"step": 1238
},
{
"epoch": 0.4732620320855615,
"grad_norm": 11.869869458123512,
"learning_rate": 1.135073996692969e-06,
"loss": 0.3982,
"step": 1239
},
{
"epoch": 0.47364400305576776,
"grad_norm": 11.08603971046374,
"learning_rate": 1.1338478985020726e-06,
"loss": 0.5119,
"step": 1240
},
{
"epoch": 0.474025974025974,
"grad_norm": 3.5696420483847313,
"learning_rate": 1.1326215953904691e-06,
"loss": 0.4499,
"step": 1241
},
{
"epoch": 0.4744079449961803,
"grad_norm": 4.197385123479996,
"learning_rate": 1.131395089235625e-06,
"loss": 0.4386,
"step": 1242
},
{
"epoch": 0.47478991596638653,
"grad_norm": 4.329632163315956,
"learning_rate": 1.1301683819153173e-06,
"loss": 0.4617,
"step": 1243
},
{
"epoch": 0.47517188693659285,
"grad_norm": 2.4072597666151028,
"learning_rate": 1.1289414753076312e-06,
"loss": 0.4296,
"step": 1244
},
{
"epoch": 0.4755538579067991,
"grad_norm": 4.025047848026513,
"learning_rate": 1.1277143712909562e-06,
"loss": 0.4778,
"step": 1245
},
{
"epoch": 0.47593582887700536,
"grad_norm": 4.91052834287783,
"learning_rate": 1.1264870717439854e-06,
"loss": 0.4833,
"step": 1246
},
{
"epoch": 0.4763177998472116,
"grad_norm": 2.992133174312151,
"learning_rate": 1.1252595785457103e-06,
"loss": 0.44,
"step": 1247
},
{
"epoch": 0.4766997708174179,
"grad_norm": 3.6402161256952534,
"learning_rate": 1.1240318935754186e-06,
"loss": 0.4952,
"step": 1248
},
{
"epoch": 0.47708174178762414,
"grad_norm": 4.552869776517533,
"learning_rate": 1.1228040187126925e-06,
"loss": 0.5271,
"step": 1249
},
{
"epoch": 0.4774637127578304,
"grad_norm": 3.9200079953681772,
"learning_rate": 1.1215759558374046e-06,
"loss": 0.6368,
"step": 1250
},
{
"epoch": 0.47784568372803665,
"grad_norm": 3.1230412445338374,
"learning_rate": 1.120347706829715e-06,
"loss": 0.4492,
"step": 1251
},
{
"epoch": 0.4782276546982429,
"grad_norm": 3.102490869262427,
"learning_rate": 1.1191192735700694e-06,
"loss": 0.4686,
"step": 1252
},
{
"epoch": 0.4786096256684492,
"grad_norm": 4.476682357490567,
"learning_rate": 1.1178906579391944e-06,
"loss": 0.5489,
"step": 1253
},
{
"epoch": 0.4789915966386555,
"grad_norm": 2.8135125812778616,
"learning_rate": 1.1166618618180975e-06,
"loss": 0.5074,
"step": 1254
},
{
"epoch": 0.47937356760886174,
"grad_norm": 40.753479628748224,
"learning_rate": 1.1154328870880615e-06,
"loss": 0.5075,
"step": 1255
},
{
"epoch": 0.479755538579068,
"grad_norm": 12.111904935457673,
"learning_rate": 1.1142037356306422e-06,
"loss": 0.531,
"step": 1256
},
{
"epoch": 0.48013750954927426,
"grad_norm": 17.359688140923335,
"learning_rate": 1.112974409327667e-06,
"loss": 0.5295,
"step": 1257
},
{
"epoch": 0.4805194805194805,
"grad_norm": 5.006833632043221,
"learning_rate": 1.111744910061231e-06,
"loss": 0.5058,
"step": 1258
},
{
"epoch": 0.48090145148968677,
"grad_norm": 4.87277750041395,
"learning_rate": 1.1105152397136927e-06,
"loss": 0.5186,
"step": 1259
},
{
"epoch": 0.48128342245989303,
"grad_norm": 5.03236301201575,
"learning_rate": 1.1092854001676743e-06,
"loss": 0.4766,
"step": 1260
},
{
"epoch": 0.4816653934300993,
"grad_norm": 3.9862712010217707,
"learning_rate": 1.108055393306056e-06,
"loss": 0.4787,
"step": 1261
},
{
"epoch": 0.4820473644003056,
"grad_norm": 4.075868898495065,
"learning_rate": 1.106825221011974e-06,
"loss": 0.5061,
"step": 1262
},
{
"epoch": 0.48242933537051186,
"grad_norm": 3.3826587745482035,
"learning_rate": 1.1055948851688186e-06,
"loss": 0.4608,
"step": 1263
},
{
"epoch": 0.4828113063407181,
"grad_norm": 2.7926168169643404,
"learning_rate": 1.10436438766023e-06,
"loss": 0.4725,
"step": 1264
},
{
"epoch": 0.4831932773109244,
"grad_norm": 4.227679252388823,
"learning_rate": 1.1031337303700958e-06,
"loss": 0.6015,
"step": 1265
},
{
"epoch": 0.48357524828113063,
"grad_norm": 3.187908709583996,
"learning_rate": 1.1019029151825488e-06,
"loss": 0.5895,
"step": 1266
},
{
"epoch": 0.4839572192513369,
"grad_norm": 4.074916916382239,
"learning_rate": 1.1006719439819624e-06,
"loss": 0.5196,
"step": 1267
},
{
"epoch": 0.48433919022154315,
"grad_norm": 8.929845403345944,
"learning_rate": 1.0994408186529504e-06,
"loss": 0.5266,
"step": 1268
},
{
"epoch": 0.4847211611917494,
"grad_norm": 5.391798819468841,
"learning_rate": 1.0982095410803613e-06,
"loss": 0.6122,
"step": 1269
},
{
"epoch": 0.4851031321619557,
"grad_norm": 3.9477864337213378,
"learning_rate": 1.0969781131492774e-06,
"loss": 0.5211,
"step": 1270
},
{
"epoch": 0.485485103132162,
"grad_norm": 4.2021316314091015,
"learning_rate": 1.0957465367450106e-06,
"loss": 0.5463,
"step": 1271
},
{
"epoch": 0.48586707410236823,
"grad_norm": 4.042478751794095,
"learning_rate": 1.0945148137531007e-06,
"loss": 0.4624,
"step": 1272
},
{
"epoch": 0.4862490450725745,
"grad_norm": 4.628945355798975,
"learning_rate": 1.0932829460593115e-06,
"loss": 0.4688,
"step": 1273
},
{
"epoch": 0.48663101604278075,
"grad_norm": 4.84585991908986,
"learning_rate": 1.092050935549629e-06,
"loss": 0.454,
"step": 1274
},
{
"epoch": 0.487012987012987,
"grad_norm": 3.569829003665894,
"learning_rate": 1.0908187841102565e-06,
"loss": 0.4872,
"step": 1275
},
{
"epoch": 0.48739495798319327,
"grad_norm": 3.1539513977942235,
"learning_rate": 1.089586493627615e-06,
"loss": 0.4405,
"step": 1276
},
{
"epoch": 0.4877769289533995,
"grad_norm": 3.707910848260399,
"learning_rate": 1.0883540659883366e-06,
"loss": 0.5597,
"step": 1277
},
{
"epoch": 0.4881588999236058,
"grad_norm": 3.7495327953119078,
"learning_rate": 1.0871215030792642e-06,
"loss": 0.4309,
"step": 1278
},
{
"epoch": 0.4885408708938121,
"grad_norm": 3.587824538383655,
"learning_rate": 1.0858888067874477e-06,
"loss": 0.4526,
"step": 1279
},
{
"epoch": 0.48892284186401835,
"grad_norm": 2.260993818632954,
"learning_rate": 1.0846559790001413e-06,
"loss": 0.4388,
"step": 1280
},
{
"epoch": 0.4893048128342246,
"grad_norm": 3.2144353104907557,
"learning_rate": 1.0834230216048004e-06,
"loss": 0.5239,
"step": 1281
},
{
"epoch": 0.48968678380443087,
"grad_norm": 2.9974050303973296,
"learning_rate": 1.0821899364890788e-06,
"loss": 0.5605,
"step": 1282
},
{
"epoch": 0.4900687547746371,
"grad_norm": 4.173980296608258,
"learning_rate": 1.0809567255408258e-06,
"loss": 0.5095,
"step": 1283
},
{
"epoch": 0.4904507257448434,
"grad_norm": 2.116436796307261,
"learning_rate": 1.079723390648084e-06,
"loss": 0.424,
"step": 1284
},
{
"epoch": 0.49083269671504964,
"grad_norm": 3.6930461747499774,
"learning_rate": 1.0784899336990844e-06,
"loss": 0.5205,
"step": 1285
},
{
"epoch": 0.4912146676852559,
"grad_norm": 4.540039955232487,
"learning_rate": 1.077256356582246e-06,
"loss": 0.5158,
"step": 1286
},
{
"epoch": 0.49159663865546216,
"grad_norm": 6.271353181611177,
"learning_rate": 1.0760226611861714e-06,
"loss": 0.499,
"step": 1287
},
{
"epoch": 0.4919786096256685,
"grad_norm": 3.8796764423218373,
"learning_rate": 1.0747888493996447e-06,
"loss": 0.5119,
"step": 1288
},
{
"epoch": 0.49236058059587473,
"grad_norm": 6.360416011728165,
"learning_rate": 1.0735549231116271e-06,
"loss": 0.4952,
"step": 1289
},
{
"epoch": 0.492742551566081,
"grad_norm": 3.70411516225405,
"learning_rate": 1.0723208842112566e-06,
"loss": 0.459,
"step": 1290
},
{
"epoch": 0.49312452253628725,
"grad_norm": 5.632813688501988,
"learning_rate": 1.0710867345878423e-06,
"loss": 0.4223,
"step": 1291
},
{
"epoch": 0.4935064935064935,
"grad_norm": 3.2511274203262106,
"learning_rate": 1.0698524761308633e-06,
"loss": 0.4402,
"step": 1292
},
{
"epoch": 0.49388846447669976,
"grad_norm": 3.8371267082582534,
"learning_rate": 1.0686181107299657e-06,
"loss": 0.4357,
"step": 1293
},
{
"epoch": 0.494270435446906,
"grad_norm": 5.165651106582903,
"learning_rate": 1.067383640274959e-06,
"loss": 0.4468,
"step": 1294
},
{
"epoch": 0.4946524064171123,
"grad_norm": 13.071303533813465,
"learning_rate": 1.0661490666558135e-06,
"loss": 0.4457,
"step": 1295
},
{
"epoch": 0.4950343773873186,
"grad_norm": 5.755798614832232,
"learning_rate": 1.0649143917626572e-06,
"loss": 0.4874,
"step": 1296
},
{
"epoch": 0.49541634835752485,
"grad_norm": 7.817980213058826,
"learning_rate": 1.0636796174857735e-06,
"loss": 0.5073,
"step": 1297
},
{
"epoch": 0.4957983193277311,
"grad_norm": 130.82970679290145,
"learning_rate": 1.0624447457155982e-06,
"loss": 0.4831,
"step": 1298
},
{
"epoch": 0.49618029029793737,
"grad_norm": 3.885116213979448,
"learning_rate": 1.061209778342716e-06,
"loss": 0.5214,
"step": 1299
},
{
"epoch": 0.4965622612681436,
"grad_norm": 3.878921326386595,
"learning_rate": 1.0599747172578579e-06,
"loss": 0.4541,
"step": 1300
},
{
"epoch": 0.4969442322383499,
"grad_norm": 2.816069956951671,
"learning_rate": 1.0587395643518982e-06,
"loss": 0.4576,
"step": 1301
},
{
"epoch": 0.49732620320855614,
"grad_norm": 3.421688209743285,
"learning_rate": 1.0575043215158525e-06,
"loss": 0.4258,
"step": 1302
},
{
"epoch": 0.4977081741787624,
"grad_norm": 3.6723025808360616,
"learning_rate": 1.0562689906408737e-06,
"loss": 0.4629,
"step": 1303
},
{
"epoch": 0.49809014514896865,
"grad_norm": 2.9379519686044517,
"learning_rate": 1.0550335736182491e-06,
"loss": 0.4804,
"step": 1304
},
{
"epoch": 0.49847211611917497,
"grad_norm": 3.487784717027458,
"learning_rate": 1.0537980723393982e-06,
"loss": 0.594,
"step": 1305
},
{
"epoch": 0.4988540870893812,
"grad_norm": 4.133689843056842,
"learning_rate": 1.05256248869587e-06,
"loss": 0.4509,
"step": 1306
},
{
"epoch": 0.4992360580595875,
"grad_norm": 2.9438677833420535,
"learning_rate": 1.0513268245793385e-06,
"loss": 0.4707,
"step": 1307
},
{
"epoch": 0.49961802902979374,
"grad_norm": 4.23654014930591,
"learning_rate": 1.050091081881602e-06,
"loss": 0.5688,
"step": 1308
},
{
"epoch": 0.5,
"grad_norm": 2.8769089170784894,
"learning_rate": 1.0488552624945785e-06,
"loss": 0.5302,
"step": 1309
},
{
"epoch": 0.5003819709702063,
"grad_norm": 2.4982081250306845,
"learning_rate": 1.0476193683103036e-06,
"loss": 0.5239,
"step": 1310
},
{
"epoch": 0.5007639419404125,
"grad_norm": 2.7328193722764405,
"learning_rate": 1.0463834012209275e-06,
"loss": 0.5111,
"step": 1311
},
{
"epoch": 0.5011459129106188,
"grad_norm": 14.462404041377377,
"learning_rate": 1.0451473631187114e-06,
"loss": 0.5698,
"step": 1312
},
{
"epoch": 0.501527883880825,
"grad_norm": 3.8057059746418886,
"learning_rate": 1.043911255896026e-06,
"loss": 0.5128,
"step": 1313
},
{
"epoch": 0.5019098548510313,
"grad_norm": 7.550989422236823,
"learning_rate": 1.0426750814453478e-06,
"loss": 0.5066,
"step": 1314
},
{
"epoch": 0.5022918258212375,
"grad_norm": 21.794350606158684,
"learning_rate": 1.0414388416592553e-06,
"loss": 0.4053,
"step": 1315
},
{
"epoch": 0.5026737967914439,
"grad_norm": 4.964648111005842,
"learning_rate": 1.0402025384304282e-06,
"loss": 0.525,
"step": 1316
},
{
"epoch": 0.5030557677616501,
"grad_norm": 2.704461353663344,
"learning_rate": 1.0389661736516427e-06,
"loss": 0.4453,
"step": 1317
},
{
"epoch": 0.5034377387318564,
"grad_norm": 3.596541442397453,
"learning_rate": 1.0377297492157694e-06,
"loss": 0.4711,
"step": 1318
},
{
"epoch": 0.5038197097020627,
"grad_norm": 2.6982636562854565,
"learning_rate": 1.03649326701577e-06,
"loss": 0.4962,
"step": 1319
},
{
"epoch": 0.5042016806722689,
"grad_norm": 3.5758546196602956,
"learning_rate": 1.035256728944695e-06,
"loss": 0.5319,
"step": 1320
},
{
"epoch": 0.5045836516424752,
"grad_norm": 2.7338650139648855,
"learning_rate": 1.0340201368956801e-06,
"loss": 0.5238,
"step": 1321
},
{
"epoch": 0.5049656226126814,
"grad_norm": 9.011380745224324,
"learning_rate": 1.0327834927619438e-06,
"loss": 0.4911,
"step": 1322
},
{
"epoch": 0.5053475935828877,
"grad_norm": 2.3127010295672226,
"learning_rate": 1.0315467984367843e-06,
"loss": 0.4739,
"step": 1323
},
{
"epoch": 0.5057295645530939,
"grad_norm": 2.873514483067639,
"learning_rate": 1.030310055813577e-06,
"loss": 0.4333,
"step": 1324
},
{
"epoch": 0.5061115355233002,
"grad_norm": 2.529448768846022,
"learning_rate": 1.0290732667857705e-06,
"loss": 0.4595,
"step": 1325
},
{
"epoch": 0.5064935064935064,
"grad_norm": 5.525960825885752,
"learning_rate": 1.027836433246885e-06,
"loss": 0.4809,
"step": 1326
},
{
"epoch": 0.5068754774637128,
"grad_norm": 3.4943325241715852,
"learning_rate": 1.0265995570905087e-06,
"loss": 0.5377,
"step": 1327
},
{
"epoch": 0.5072574484339191,
"grad_norm": 7.5645140214821165,
"learning_rate": 1.0253626402102954e-06,
"loss": 0.5336,
"step": 1328
},
{
"epoch": 0.5076394194041253,
"grad_norm": 3.2696463413539187,
"learning_rate": 1.0241256844999604e-06,
"loss": 0.4364,
"step": 1329
},
{
"epoch": 0.5080213903743316,
"grad_norm": 6.9584160075582595,
"learning_rate": 1.0228886918532791e-06,
"loss": 0.4351,
"step": 1330
},
{
"epoch": 0.5084033613445378,
"grad_norm": 3.003922678403774,
"learning_rate": 1.0216516641640835e-06,
"loss": 0.4882,
"step": 1331
},
{
"epoch": 0.5087853323147441,
"grad_norm": 2.932152882801731,
"learning_rate": 1.020414603326259e-06,
"loss": 0.46,
"step": 1332
},
{
"epoch": 0.5091673032849503,
"grad_norm": 2.381377960281441,
"learning_rate": 1.0191775112337419e-06,
"loss": 0.5098,
"step": 1333
},
{
"epoch": 0.5095492742551566,
"grad_norm": 4.816521334707846,
"learning_rate": 1.0179403897805156e-06,
"loss": 0.5134,
"step": 1334
},
{
"epoch": 0.5099312452253628,
"grad_norm": 5.752920350007644,
"learning_rate": 1.0167032408606103e-06,
"loss": 0.4723,
"step": 1335
},
{
"epoch": 0.5103132161955691,
"grad_norm": 3.292341479456508,
"learning_rate": 1.0154660663680962e-06,
"loss": 0.4752,
"step": 1336
},
{
"epoch": 0.5106951871657754,
"grad_norm": 3.017397605540056,
"learning_rate": 1.0142288681970835e-06,
"loss": 0.4906,
"step": 1337
},
{
"epoch": 0.5110771581359816,
"grad_norm": 3.553899535961395,
"learning_rate": 1.0129916482417187e-06,
"loss": 0.4463,
"step": 1338
},
{
"epoch": 0.511459129106188,
"grad_norm": 2.8101660274095868,
"learning_rate": 1.011754408396182e-06,
"loss": 0.5076,
"step": 1339
},
{
"epoch": 0.5118411000763942,
"grad_norm": 3.1656049296367486,
"learning_rate": 1.0105171505546834e-06,
"loss": 0.5324,
"step": 1340
},
{
"epoch": 0.5122230710466005,
"grad_norm": 4.898764704252674,
"learning_rate": 1.0092798766114609e-06,
"loss": 0.5099,
"step": 1341
},
{
"epoch": 0.5126050420168067,
"grad_norm": 14.139169491621754,
"learning_rate": 1.0080425884607766e-06,
"loss": 0.5188,
"step": 1342
},
{
"epoch": 0.512987012987013,
"grad_norm": 3.505703606474342,
"learning_rate": 1.006805287996915e-06,
"loss": 0.5271,
"step": 1343
},
{
"epoch": 0.5133689839572193,
"grad_norm": 13.945862624979869,
"learning_rate": 1.0055679771141793e-06,
"loss": 0.5674,
"step": 1344
},
{
"epoch": 0.5137509549274255,
"grad_norm": 3.131944114131963,
"learning_rate": 1.0043306577068882e-06,
"loss": 0.5225,
"step": 1345
},
{
"epoch": 0.5141329258976318,
"grad_norm": 2.372018227771199,
"learning_rate": 1.003093331669374e-06,
"loss": 0.4533,
"step": 1346
},
{
"epoch": 0.514514896867838,
"grad_norm": 3.2988067230203337,
"learning_rate": 1.001856000895979e-06,
"loss": 0.4922,
"step": 1347
},
{
"epoch": 0.5148968678380443,
"grad_norm": 4.938589303779806,
"learning_rate": 1.0006186672810522e-06,
"loss": 0.462,
"step": 1348
},
{
"epoch": 0.5152788388082505,
"grad_norm": 4.655576701126217,
"learning_rate": 9.99381332718948e-07,
"loss": 0.53,
"step": 1349
},
{
"epoch": 0.5156608097784569,
"grad_norm": 3.1018231048204505,
"learning_rate": 9.981439991040212e-07,
"loss": 0.4452,
"step": 1350
},
{
"epoch": 0.516042780748663,
"grad_norm": 3.6955327676526917,
"learning_rate": 9.96906668330626e-07,
"loss": 0.5178,
"step": 1351
},
{
"epoch": 0.5164247517188694,
"grad_norm": 7.2642960843934,
"learning_rate": 9.95669342293112e-07,
"loss": 0.4705,
"step": 1352
},
{
"epoch": 0.5168067226890757,
"grad_norm": 4.536834020213876,
"learning_rate": 9.944320228858208e-07,
"loss": 0.5322,
"step": 1353
},
{
"epoch": 0.5171886936592819,
"grad_norm": 3.5410384700532886,
"learning_rate": 9.93194712003085e-07,
"loss": 0.5278,
"step": 1354
},
{
"epoch": 0.5175706646294882,
"grad_norm": 294.2683369886722,
"learning_rate": 9.919574115392235e-07,
"loss": 0.5625,
"step": 1355
},
{
"epoch": 0.5179526355996944,
"grad_norm": 3.182000528960174,
"learning_rate": 9.907201233885392e-07,
"loss": 0.4204,
"step": 1356
},
{
"epoch": 0.5183346065699007,
"grad_norm": 3.1946900622632635,
"learning_rate": 9.894828494453167e-07,
"loss": 0.4726,
"step": 1357
},
{
"epoch": 0.5187165775401069,
"grad_norm": 5.528384455651845,
"learning_rate": 9.88245591603818e-07,
"loss": 0.4554,
"step": 1358
},
{
"epoch": 0.5190985485103132,
"grad_norm": 5.4773425084696665,
"learning_rate": 9.870083517582812e-07,
"loss": 0.5441,
"step": 1359
},
{
"epoch": 0.5194805194805194,
"grad_norm": 7.220271179898539,
"learning_rate": 9.857711318029169e-07,
"loss": 0.4582,
"step": 1360
},
{
"epoch": 0.5198624904507257,
"grad_norm": 4.784686919361763,
"learning_rate": 9.845339336319042e-07,
"loss": 0.5349,
"step": 1361
},
{
"epoch": 0.5202444614209321,
"grad_norm": 3.110800022530207,
"learning_rate": 9.832967591393896e-07,
"loss": 0.4794,
"step": 1362
},
{
"epoch": 0.5206264323911383,
"grad_norm": 2.950061284333217,
"learning_rate": 9.820596102194844e-07,
"loss": 0.5041,
"step": 1363
},
{
"epoch": 0.5210084033613446,
"grad_norm": 20.463942406841767,
"learning_rate": 9.808224887662582e-07,
"loss": 0.5719,
"step": 1364
},
{
"epoch": 0.5213903743315508,
"grad_norm": 17.791314544277856,
"learning_rate": 9.79585396673741e-07,
"loss": 0.5372,
"step": 1365
},
{
"epoch": 0.5217723453017571,
"grad_norm": 10.673350794026165,
"learning_rate": 9.783483358359164e-07,
"loss": 0.488,
"step": 1366
},
{
"epoch": 0.5221543162719633,
"grad_norm": 6.272582296984684,
"learning_rate": 9.771113081467208e-07,
"loss": 0.4951,
"step": 1367
},
{
"epoch": 0.5225362872421696,
"grad_norm": 78.6440162561484,
"learning_rate": 9.7587431550004e-07,
"loss": 0.4793,
"step": 1368
},
{
"epoch": 0.5229182582123758,
"grad_norm": 3.165213133124479,
"learning_rate": 9.746373597897048e-07,
"loss": 0.444,
"step": 1369
},
{
"epoch": 0.5233002291825821,
"grad_norm": 3.1184183716641827,
"learning_rate": 9.734004429094912e-07,
"loss": 0.5683,
"step": 1370
},
{
"epoch": 0.5236822001527884,
"grad_norm": 4.836415402427561,
"learning_rate": 9.721635667531152e-07,
"loss": 0.537,
"step": 1371
},
{
"epoch": 0.5240641711229946,
"grad_norm": 3.646586245251106,
"learning_rate": 9.709267332142296e-07,
"loss": 0.5344,
"step": 1372
},
{
"epoch": 0.524446142093201,
"grad_norm": 25.03775753954483,
"learning_rate": 9.696899441864232e-07,
"loss": 0.5304,
"step": 1373
},
{
"epoch": 0.5248281130634072,
"grad_norm": 2.469508639188065,
"learning_rate": 9.684532015632158e-07,
"loss": 0.501,
"step": 1374
},
{
"epoch": 0.5252100840336135,
"grad_norm": 6.275377375689046,
"learning_rate": 9.672165072380564e-07,
"loss": 0.4518,
"step": 1375
},
{
"epoch": 0.5255920550038197,
"grad_norm": 3.739219533904625,
"learning_rate": 9.659798631043202e-07,
"loss": 0.4769,
"step": 1376
},
{
"epoch": 0.525974025974026,
"grad_norm": 3.2028836058005745,
"learning_rate": 9.647432710553051e-07,
"loss": 0.5084,
"step": 1377
},
{
"epoch": 0.5263559969442322,
"grad_norm": 2.832547689308899,
"learning_rate": 9.6350673298423e-07,
"loss": 0.5256,
"step": 1378
},
{
"epoch": 0.5267379679144385,
"grad_norm": 4.074197073360397,
"learning_rate": 9.622702507842307e-07,
"loss": 0.5739,
"step": 1379
},
{
"epoch": 0.5271199388846448,
"grad_norm": 2.733260110623376,
"learning_rate": 9.610338263483572e-07,
"loss": 0.4274,
"step": 1380
},
{
"epoch": 0.527501909854851,
"grad_norm": 4.257279848392443,
"learning_rate": 9.597974615695717e-07,
"loss": 0.4869,
"step": 1381
},
{
"epoch": 0.5278838808250573,
"grad_norm": 3.6805018016990534,
"learning_rate": 9.585611583407446e-07,
"loss": 0.4395,
"step": 1382
},
{
"epoch": 0.5282658517952635,
"grad_norm": 2.7505903636646405,
"learning_rate": 9.573249185546523e-07,
"loss": 0.4637,
"step": 1383
},
{
"epoch": 0.5286478227654698,
"grad_norm": 2.860561770977799,
"learning_rate": 9.560887441039738e-07,
"loss": 0.4548,
"step": 1384
},
{
"epoch": 0.529029793735676,
"grad_norm": 8.831501484101013,
"learning_rate": 9.548526368812887e-07,
"loss": 0.5577,
"step": 1385
},
{
"epoch": 0.5294117647058824,
"grad_norm": 5.0263480248609955,
"learning_rate": 9.536165987790727e-07,
"loss": 0.414,
"step": 1386
},
{
"epoch": 0.5297937356760886,
"grad_norm": 2.6563122746837617,
"learning_rate": 9.523806316896964e-07,
"loss": 0.5009,
"step": 1387
},
{
"epoch": 0.5301757066462949,
"grad_norm": 2.7201589095964285,
"learning_rate": 9.511447375054214e-07,
"loss": 0.438,
"step": 1388
},
{
"epoch": 0.5305576776165012,
"grad_norm": 5.460884031932415,
"learning_rate": 9.499089181183979e-07,
"loss": 0.4489,
"step": 1389
},
{
"epoch": 0.5309396485867074,
"grad_norm": 3.5615191551239747,
"learning_rate": 9.486731754206616e-07,
"loss": 0.5108,
"step": 1390
},
{
"epoch": 0.5313216195569137,
"grad_norm": 9.30636166134041,
"learning_rate": 9.474375113041302e-07,
"loss": 0.4708,
"step": 1391
},
{
"epoch": 0.5317035905271199,
"grad_norm": 5.449374160647926,
"learning_rate": 9.462019276606017e-07,
"loss": 0.5523,
"step": 1392
},
{
"epoch": 0.5320855614973262,
"grad_norm": 3.49056858716858,
"learning_rate": 9.449664263817512e-07,
"loss": 0.5377,
"step": 1393
},
{
"epoch": 0.5324675324675324,
"grad_norm": 2.902207041979767,
"learning_rate": 9.437310093591263e-07,
"loss": 0.4954,
"step": 1394
},
{
"epoch": 0.5328495034377387,
"grad_norm": 4.677500426124243,
"learning_rate": 9.424956784841473e-07,
"loss": 0.522,
"step": 1395
},
{
"epoch": 0.533231474407945,
"grad_norm": 13.018420099517328,
"learning_rate": 9.412604356481018e-07,
"loss": 0.5356,
"step": 1396
},
{
"epoch": 0.5336134453781513,
"grad_norm": 6.175162131849886,
"learning_rate": 9.400252827421421e-07,
"loss": 0.4901,
"step": 1397
},
{
"epoch": 0.5339954163483576,
"grad_norm": 7.725050355203537,
"learning_rate": 9.387902216572841e-07,
"loss": 0.5219,
"step": 1398
},
{
"epoch": 0.5343773873185638,
"grad_norm": 8.072550646440144,
"learning_rate": 9.375552542844016e-07,
"loss": 0.5152,
"step": 1399
},
{
"epoch": 0.5347593582887701,
"grad_norm": 2.4221302927270743,
"learning_rate": 9.363203825142262e-07,
"loss": 0.4452,
"step": 1400
},
{
"epoch": 0.5351413292589763,
"grad_norm": 3.2307294108351177,
"learning_rate": 9.350856082373429e-07,
"loss": 0.5134,
"step": 1401
},
{
"epoch": 0.5355233002291826,
"grad_norm": 5.350677308214525,
"learning_rate": 9.338509333441865e-07,
"loss": 0.5055,
"step": 1402
},
{
"epoch": 0.5359052711993888,
"grad_norm": 5.664433410289456,
"learning_rate": 9.326163597250407e-07,
"loss": 0.5005,
"step": 1403
},
{
"epoch": 0.5362872421695951,
"grad_norm": 3.102982845081864,
"learning_rate": 9.313818892700343e-07,
"loss": 0.5239,
"step": 1404
},
{
"epoch": 0.5366692131398014,
"grad_norm": 26.69471541348305,
"learning_rate": 9.301475238691365e-07,
"loss": 0.4663,
"step": 1405
},
{
"epoch": 0.5370511841100076,
"grad_norm": 8.868438039533663,
"learning_rate": 9.289132654121576e-07,
"loss": 0.4662,
"step": 1406
},
{
"epoch": 0.5374331550802139,
"grad_norm": 22.121579017983922,
"learning_rate": 9.276791157887436e-07,
"loss": 0.5445,
"step": 1407
},
{
"epoch": 0.5378151260504201,
"grad_norm": 51.58676075333544,
"learning_rate": 9.264450768883727e-07,
"loss": 0.5196,
"step": 1408
},
{
"epoch": 0.5381970970206265,
"grad_norm": 4.527370473189713,
"learning_rate": 9.252111506003557e-07,
"loss": 0.5098,
"step": 1409
},
{
"epoch": 0.5385790679908327,
"grad_norm": 4.090202605819511,
"learning_rate": 9.239773388138285e-07,
"loss": 0.495,
"step": 1410
},
{
"epoch": 0.538961038961039,
"grad_norm": 8.164285139505568,
"learning_rate": 9.227436434177539e-07,
"loss": 0.575,
"step": 1411
},
{
"epoch": 0.5393430099312452,
"grad_norm": 4.586672847633152,
"learning_rate": 9.215100663009158e-07,
"loss": 0.499,
"step": 1412
},
{
"epoch": 0.5397249809014515,
"grad_norm": 9.803378384591609,
"learning_rate": 9.202766093519162e-07,
"loss": 0.4822,
"step": 1413
},
{
"epoch": 0.5401069518716578,
"grad_norm": 3.0597382175916668,
"learning_rate": 9.190432744591739e-07,
"loss": 0.5386,
"step": 1414
},
{
"epoch": 0.540488922841864,
"grad_norm": 4.2626191363582775,
"learning_rate": 9.178100635109214e-07,
"loss": 0.5004,
"step": 1415
},
{
"epoch": 0.5408708938120703,
"grad_norm": 3.103706429369607,
"learning_rate": 9.165769783951995e-07,
"loss": 0.5113,
"step": 1416
},
{
"epoch": 0.5412528647822765,
"grad_norm": 3.0638124770428274,
"learning_rate": 9.153440209998589e-07,
"loss": 0.4859,
"step": 1417
},
{
"epoch": 0.5416348357524828,
"grad_norm": 4.055884011812646,
"learning_rate": 9.141111932125524e-07,
"loss": 0.4632,
"step": 1418
},
{
"epoch": 0.542016806722689,
"grad_norm": 4.701495297937109,
"learning_rate": 9.128784969207358e-07,
"loss": 0.4976,
"step": 1419
},
{
"epoch": 0.5423987776928954,
"grad_norm": 4.138380618201808,
"learning_rate": 9.116459340116637e-07,
"loss": 0.4545,
"step": 1420
},
{
"epoch": 0.5427807486631016,
"grad_norm": 3.306842464831278,
"learning_rate": 9.104135063723851e-07,
"loss": 0.5213,
"step": 1421
},
{
"epoch": 0.5431627196333079,
"grad_norm": 2.428190100888985,
"learning_rate": 9.091812158897432e-07,
"loss": 0.4542,
"step": 1422
},
{
"epoch": 0.5435446906035142,
"grad_norm": 5.055143900923019,
"learning_rate": 9.079490644503714e-07,
"loss": 0.4704,
"step": 1423
},
{
"epoch": 0.5439266615737204,
"grad_norm": 3.442884686476501,
"learning_rate": 9.067170539406884e-07,
"loss": 0.4659,
"step": 1424
},
{
"epoch": 0.5443086325439267,
"grad_norm": 4.6387174303407726,
"learning_rate": 9.054851862468994e-07,
"loss": 0.4954,
"step": 1425
},
{
"epoch": 0.5446906035141329,
"grad_norm": 3.018966390578636,
"learning_rate": 9.042534632549897e-07,
"loss": 0.4842,
"step": 1426
},
{
"epoch": 0.5450725744843392,
"grad_norm": 12.512900108390705,
"learning_rate": 9.030218868507227e-07,
"loss": 0.5299,
"step": 1427
},
{
"epoch": 0.5454545454545454,
"grad_norm": 3.2874335207763608,
"learning_rate": 9.017904589196389e-07,
"loss": 0.4564,
"step": 1428
},
{
"epoch": 0.5458365164247517,
"grad_norm": 3.7038858926146236,
"learning_rate": 9.005591813470497e-07,
"loss": 0.4896,
"step": 1429
},
{
"epoch": 0.5462184873949579,
"grad_norm": 5.546433583514546,
"learning_rate": 8.993280560180376e-07,
"loss": 0.4317,
"step": 1430
},
{
"epoch": 0.5466004583651642,
"grad_norm": 3.5277981501703106,
"learning_rate": 8.980970848174515e-07,
"loss": 0.4678,
"step": 1431
},
{
"epoch": 0.5469824293353706,
"grad_norm": 4.576434077551453,
"learning_rate": 8.968662696299041e-07,
"loss": 0.5186,
"step": 1432
},
{
"epoch": 0.5473644003055768,
"grad_norm": 3.52536301654518,
"learning_rate": 8.956356123397701e-07,
"loss": 0.5332,
"step": 1433
},
{
"epoch": 0.5477463712757831,
"grad_norm": 4.764198850876418,
"learning_rate": 8.944051148311816e-07,
"loss": 0.5026,
"step": 1434
},
{
"epoch": 0.5481283422459893,
"grad_norm": 3.0442696758604746,
"learning_rate": 8.931747789880262e-07,
"loss": 0.5365,
"step": 1435
},
{
"epoch": 0.5485103132161956,
"grad_norm": 4.766070575325928,
"learning_rate": 8.919446066939441e-07,
"loss": 0.5468,
"step": 1436
},
{
"epoch": 0.5488922841864018,
"grad_norm": 9.341427804610916,
"learning_rate": 8.907145998323256e-07,
"loss": 0.4737,
"step": 1437
},
{
"epoch": 0.5492742551566081,
"grad_norm": 2.878781065183578,
"learning_rate": 8.894847602863073e-07,
"loss": 0.4587,
"step": 1438
},
{
"epoch": 0.5496562261268143,
"grad_norm": 2.9168697517060957,
"learning_rate": 8.882550899387692e-07,
"loss": 0.5013,
"step": 1439
},
{
"epoch": 0.5500381970970206,
"grad_norm": 2.440854640917449,
"learning_rate": 8.870255906723329e-07,
"loss": 0.4878,
"step": 1440
},
{
"epoch": 0.5504201680672269,
"grad_norm": 8.104615867570066,
"learning_rate": 8.857962643693578e-07,
"loss": 0.4979,
"step": 1441
},
{
"epoch": 0.5508021390374331,
"grad_norm": 4.602071585566376,
"learning_rate": 8.845671129119386e-07,
"loss": 0.4816,
"step": 1442
},
{
"epoch": 0.5511841100076394,
"grad_norm": 8.216271780031185,
"learning_rate": 8.833381381819024e-07,
"loss": 0.497,
"step": 1443
},
{
"epoch": 0.5515660809778457,
"grad_norm": 3.464263767454113,
"learning_rate": 8.821093420608053e-07,
"loss": 0.4295,
"step": 1444
},
{
"epoch": 0.551948051948052,
"grad_norm": 2.981673706352068,
"learning_rate": 8.808807264299308e-07,
"loss": 0.491,
"step": 1445
},
{
"epoch": 0.5523300229182582,
"grad_norm": 7.051563396141812,
"learning_rate": 8.796522931702849e-07,
"loss": 0.438,
"step": 1446
},
{
"epoch": 0.5527119938884645,
"grad_norm": 2.536412119372569,
"learning_rate": 8.784240441625952e-07,
"loss": 0.4295,
"step": 1447
},
{
"epoch": 0.5530939648586708,
"grad_norm": 3.0183629329186408,
"learning_rate": 8.771959812873074e-07,
"loss": 0.5156,
"step": 1448
},
{
"epoch": 0.553475935828877,
"grad_norm": 4.3370251554163906,
"learning_rate": 8.759681064245813e-07,
"loss": 0.4973,
"step": 1449
},
{
"epoch": 0.5538579067990833,
"grad_norm": 3.2729161328047645,
"learning_rate": 8.747404214542901e-07,
"loss": 0.4833,
"step": 1450
},
{
"epoch": 0.5542398777692895,
"grad_norm": 16.05139909702016,
"learning_rate": 8.735129282560146e-07,
"loss": 0.5091,
"step": 1451
},
{
"epoch": 0.5546218487394958,
"grad_norm": 2.8023413754370843,
"learning_rate": 8.722856287090436e-07,
"loss": 0.5034,
"step": 1452
},
{
"epoch": 0.555003819709702,
"grad_norm": 5.684720848689829,
"learning_rate": 8.710585246923692e-07,
"loss": 0.4611,
"step": 1453
},
{
"epoch": 0.5553857906799083,
"grad_norm": 3.150330041187914,
"learning_rate": 8.698316180846828e-07,
"loss": 0.5577,
"step": 1454
},
{
"epoch": 0.5557677616501145,
"grad_norm": 10.348445852541706,
"learning_rate": 8.686049107643749e-07,
"loss": 0.4978,
"step": 1455
},
{
"epoch": 0.5561497326203209,
"grad_norm": 4.9387801550507575,
"learning_rate": 8.673784046095311e-07,
"loss": 0.5367,
"step": 1456
},
{
"epoch": 0.5565317035905272,
"grad_norm": 2.9088678231936083,
"learning_rate": 8.661521014979276e-07,
"loss": 0.4076,
"step": 1457
},
{
"epoch": 0.5569136745607334,
"grad_norm": 4.516298244096117,
"learning_rate": 8.649260033070307e-07,
"loss": 0.5148,
"step": 1458
},
{
"epoch": 0.5572956455309397,
"grad_norm": 2.5243440271555135,
"learning_rate": 8.637001119139936e-07,
"loss": 0.5405,
"step": 1459
},
{
"epoch": 0.5576776165011459,
"grad_norm": 7.64792718611458,
"learning_rate": 8.624744291956509e-07,
"loss": 0.5286,
"step": 1460
},
{
"epoch": 0.5580595874713522,
"grad_norm": 4.003528589093513,
"learning_rate": 8.612489570285202e-07,
"loss": 0.4901,
"step": 1461
},
{
"epoch": 0.5584415584415584,
"grad_norm": 14.095823837542111,
"learning_rate": 8.600236972887944e-07,
"loss": 0.5361,
"step": 1462
},
{
"epoch": 0.5588235294117647,
"grad_norm": 3.0189204387992765,
"learning_rate": 8.587986518523427e-07,
"loss": 0.5517,
"step": 1463
},
{
"epoch": 0.5592055003819709,
"grad_norm": 5.732847087739884,
"learning_rate": 8.575738225947062e-07,
"loss": 0.5817,
"step": 1464
},
{
"epoch": 0.5595874713521772,
"grad_norm": 3.222690739509161,
"learning_rate": 8.563492113910938e-07,
"loss": 0.5404,
"step": 1465
},
{
"epoch": 0.5599694423223835,
"grad_norm": 4.7202010508339365,
"learning_rate": 8.551248201163817e-07,
"loss": 0.5231,
"step": 1466
},
{
"epoch": 0.5603514132925898,
"grad_norm": 3.7328063733693546,
"learning_rate": 8.539006506451096e-07,
"loss": 0.5047,
"step": 1467
},
{
"epoch": 0.5607333842627961,
"grad_norm": 5.0152413524387995,
"learning_rate": 8.52676704851476e-07,
"loss": 0.439,
"step": 1468
},
{
"epoch": 0.5611153552330023,
"grad_norm": 4.647406415540796,
"learning_rate": 8.514529846093387e-07,
"loss": 0.5324,
"step": 1469
},
{
"epoch": 0.5614973262032086,
"grad_norm": 4.6967314628587316,
"learning_rate": 8.502294917922097e-07,
"loss": 0.535,
"step": 1470
},
{
"epoch": 0.5618792971734148,
"grad_norm": 2.7305578834315853,
"learning_rate": 8.49006228273252e-07,
"loss": 0.5271,
"step": 1471
},
{
"epoch": 0.5622612681436211,
"grad_norm": 3.0620635795628792,
"learning_rate": 8.477831959252787e-07,
"loss": 0.517,
"step": 1472
},
{
"epoch": 0.5626432391138273,
"grad_norm": 2.5787028218139314,
"learning_rate": 8.465603966207479e-07,
"loss": 0.5051,
"step": 1473
},
{
"epoch": 0.5630252100840336,
"grad_norm": 4.592223182467329,
"learning_rate": 8.453378322317616e-07,
"loss": 0.4816,
"step": 1474
},
{
"epoch": 0.5634071810542399,
"grad_norm": 5.89203087141326,
"learning_rate": 8.441155046300625e-07,
"loss": 0.5154,
"step": 1475
},
{
"epoch": 0.5637891520244461,
"grad_norm": 3.435600065604833,
"learning_rate": 8.428934156870295e-07,
"loss": 0.4965,
"step": 1476
},
{
"epoch": 0.5641711229946524,
"grad_norm": 4.253808352654484,
"learning_rate": 8.416715672736774e-07,
"loss": 0.4058,
"step": 1477
},
{
"epoch": 0.5645530939648586,
"grad_norm": 2.9270004141326784,
"learning_rate": 8.404499612606524e-07,
"loss": 0.5441,
"step": 1478
},
{
"epoch": 0.564935064935065,
"grad_norm": 3.7766543211689836,
"learning_rate": 8.392285995182287e-07,
"loss": 0.5681,
"step": 1479
},
{
"epoch": 0.5653170359052712,
"grad_norm": 4.8245193053143485,
"learning_rate": 8.380074839163081e-07,
"loss": 0.4787,
"step": 1480
},
{
"epoch": 0.5656990068754775,
"grad_norm": 45.03903991958585,
"learning_rate": 8.367866163244137e-07,
"loss": 0.5358,
"step": 1481
},
{
"epoch": 0.5660809778456837,
"grad_norm": 2.9278826442105803,
"learning_rate": 8.355659986116906e-07,
"loss": 0.4661,
"step": 1482
},
{
"epoch": 0.56646294881589,
"grad_norm": 4.0805391091478205,
"learning_rate": 8.343456326469008e-07,
"loss": 0.5673,
"step": 1483
},
{
"epoch": 0.5668449197860963,
"grad_norm": 2.307755692837947,
"learning_rate": 8.331255202984201e-07,
"loss": 0.4078,
"step": 1484
},
{
"epoch": 0.5672268907563025,
"grad_norm": 4.828143841675473,
"learning_rate": 8.319056634342371e-07,
"loss": 0.5266,
"step": 1485
},
{
"epoch": 0.5676088617265088,
"grad_norm": 3.561454752977373,
"learning_rate": 8.306860639219487e-07,
"loss": 0.4747,
"step": 1486
},
{
"epoch": 0.567990832696715,
"grad_norm": 5.006142807023533,
"learning_rate": 8.294667236287574e-07,
"loss": 0.5029,
"step": 1487
},
{
"epoch": 0.5683728036669213,
"grad_norm": 7.7399372091191285,
"learning_rate": 8.282476444214699e-07,
"loss": 0.5099,
"step": 1488
},
{
"epoch": 0.5687547746371275,
"grad_norm": 4.596170322745298,
"learning_rate": 8.270288281664924e-07,
"loss": 0.4914,
"step": 1489
},
{
"epoch": 0.5691367456073338,
"grad_norm": 3.191470484566011,
"learning_rate": 8.258102767298287e-07,
"loss": 0.6249,
"step": 1490
},
{
"epoch": 0.56951871657754,
"grad_norm": 2.7721116349072727,
"learning_rate": 8.245919919770771e-07,
"loss": 0.4786,
"step": 1491
},
{
"epoch": 0.5699006875477464,
"grad_norm": 10.296186342257238,
"learning_rate": 8.233739757734278e-07,
"loss": 0.473,
"step": 1492
},
{
"epoch": 0.5702826585179527,
"grad_norm": 8.707234167735932,
"learning_rate": 8.221562299836596e-07,
"loss": 0.5692,
"step": 1493
},
{
"epoch": 0.5706646294881589,
"grad_norm": 7.119849279027865,
"learning_rate": 8.209387564721375e-07,
"loss": 0.5222,
"step": 1494
},
{
"epoch": 0.5710466004583652,
"grad_norm": 3.271163811284525,
"learning_rate": 8.197215571028097e-07,
"loss": 0.4784,
"step": 1495
},
{
"epoch": 0.5714285714285714,
"grad_norm": 3.474129399231328,
"learning_rate": 8.185046337392041e-07,
"loss": 0.486,
"step": 1496
},
{
"epoch": 0.5718105423987777,
"grad_norm": 7.702458839905247,
"learning_rate": 8.172879882444272e-07,
"loss": 0.466,
"step": 1497
},
{
"epoch": 0.5721925133689839,
"grad_norm": 2.3986718979238626,
"learning_rate": 8.160716224811587e-07,
"loss": 0.4286,
"step": 1498
},
{
"epoch": 0.5725744843391902,
"grad_norm": 2.074505714324364,
"learning_rate": 8.148555383116513e-07,
"loss": 0.4688,
"step": 1499
},
{
"epoch": 0.5729564553093965,
"grad_norm": 3.4524797955782005,
"learning_rate": 8.136397375977257e-07,
"loss": 0.5067,
"step": 1500
},
{
"epoch": 0.5733384262796027,
"grad_norm": 4.116027578701245,
"learning_rate": 8.124242222007688e-07,
"loss": 0.441,
"step": 1501
},
{
"epoch": 0.573720397249809,
"grad_norm": 7.8122892836416336,
"learning_rate": 8.112089939817311e-07,
"loss": 0.524,
"step": 1502
},
{
"epoch": 0.5741023682200153,
"grad_norm": 2.8876612966433948,
"learning_rate": 8.099940548011229e-07,
"loss": 0.4565,
"step": 1503
},
{
"epoch": 0.5744843391902216,
"grad_norm": 2.944620818559759,
"learning_rate": 8.087794065190121e-07,
"loss": 0.4721,
"step": 1504
},
{
"epoch": 0.5748663101604278,
"grad_norm": 3.3277066539267253,
"learning_rate": 8.075650509950222e-07,
"loss": 0.4696,
"step": 1505
},
{
"epoch": 0.5752482811306341,
"grad_norm": 10.689471168670488,
"learning_rate": 8.063509900883263e-07,
"loss": 0.493,
"step": 1506
},
{
"epoch": 0.5756302521008403,
"grad_norm": 2.7181430629970813,
"learning_rate": 8.051372256576487e-07,
"loss": 0.4505,
"step": 1507
},
{
"epoch": 0.5760122230710466,
"grad_norm": 3.3918780563240767,
"learning_rate": 8.03923759561259e-07,
"loss": 0.4509,
"step": 1508
},
{
"epoch": 0.5763941940412529,
"grad_norm": 4.488221465490028,
"learning_rate": 8.027105936569693e-07,
"loss": 0.4735,
"step": 1509
},
{
"epoch": 0.5767761650114591,
"grad_norm": 3.135011104593734,
"learning_rate": 8.014977298021329e-07,
"loss": 0.4989,
"step": 1510
},
{
"epoch": 0.5771581359816654,
"grad_norm": 5.773036750506569,
"learning_rate": 8.002851698536412e-07,
"loss": 0.5428,
"step": 1511
},
{
"epoch": 0.5775401069518716,
"grad_norm": 5.036394063568907,
"learning_rate": 7.990729156679185e-07,
"loss": 0.5348,
"step": 1512
},
{
"epoch": 0.577922077922078,
"grad_norm": 3.6336645286975044,
"learning_rate": 7.978609691009232e-07,
"loss": 0.4651,
"step": 1513
},
{
"epoch": 0.5783040488922842,
"grad_norm": 3.638075424158412,
"learning_rate": 7.966493320081408e-07,
"loss": 0.5198,
"step": 1514
},
{
"epoch": 0.5786860198624905,
"grad_norm": 6.900345004815312,
"learning_rate": 7.95438006244584e-07,
"loss": 0.5217,
"step": 1515
},
{
"epoch": 0.5790679908326967,
"grad_norm": 3.9858375649847186,
"learning_rate": 7.942269936647893e-07,
"loss": 0.5086,
"step": 1516
},
{
"epoch": 0.579449961802903,
"grad_norm": 3.8240778432496136,
"learning_rate": 7.930162961228122e-07,
"loss": 0.441,
"step": 1517
},
{
"epoch": 0.5798319327731093,
"grad_norm": 3.4529948281678147,
"learning_rate": 7.918059154722273e-07,
"loss": 0.4703,
"step": 1518
},
{
"epoch": 0.5802139037433155,
"grad_norm": 3.939480421094444,
"learning_rate": 7.905958535661239e-07,
"loss": 0.5244,
"step": 1519
},
{
"epoch": 0.5805958747135218,
"grad_norm": 2.1763912574345516,
"learning_rate": 7.89386112257102e-07,
"loss": 0.4816,
"step": 1520
},
{
"epoch": 0.580977845683728,
"grad_norm": 2.6665121976680437,
"learning_rate": 7.881766933972722e-07,
"loss": 0.4728,
"step": 1521
},
{
"epoch": 0.5813598166539343,
"grad_norm": 2.777402183353908,
"learning_rate": 7.869675988382514e-07,
"loss": 0.4844,
"step": 1522
},
{
"epoch": 0.5817417876241405,
"grad_norm": 3.588539547594645,
"learning_rate": 7.857588304311584e-07,
"loss": 0.4338,
"step": 1523
},
{
"epoch": 0.5821237585943468,
"grad_norm": 4.32282386026526,
"learning_rate": 7.84550390026615e-07,
"loss": 0.4775,
"step": 1524
},
{
"epoch": 0.582505729564553,
"grad_norm": 2.931593889458715,
"learning_rate": 7.833422794747386e-07,
"loss": 0.4569,
"step": 1525
},
{
"epoch": 0.5828877005347594,
"grad_norm": 13.694014402650298,
"learning_rate": 7.82134500625143e-07,
"loss": 0.4663,
"step": 1526
},
{
"epoch": 0.5832696715049657,
"grad_norm": 8.00099392041001,
"learning_rate": 7.809270553269344e-07,
"loss": 0.5448,
"step": 1527
},
{
"epoch": 0.5836516424751719,
"grad_norm": 4.911152661343212,
"learning_rate": 7.797199454287065e-07,
"loss": 0.5383,
"step": 1528
},
{
"epoch": 0.5840336134453782,
"grad_norm": 3.2819713397230283,
"learning_rate": 7.785131727785414e-07,
"loss": 0.4545,
"step": 1529
},
{
"epoch": 0.5844155844155844,
"grad_norm": 3.2090755873591106,
"learning_rate": 7.773067392240047e-07,
"loss": 0.4786,
"step": 1530
},
{
"epoch": 0.5847975553857907,
"grad_norm": 2.841300924397596,
"learning_rate": 7.761006466121412e-07,
"loss": 0.4764,
"step": 1531
},
{
"epoch": 0.5851795263559969,
"grad_norm": 2.924587185056088,
"learning_rate": 7.748948967894754e-07,
"loss": 0.4458,
"step": 1532
},
{
"epoch": 0.5855614973262032,
"grad_norm": 2.535663564324216,
"learning_rate": 7.73689491602007e-07,
"loss": 0.4888,
"step": 1533
},
{
"epoch": 0.5859434682964094,
"grad_norm": 6.512219722601765,
"learning_rate": 7.724844328952064e-07,
"loss": 0.4775,
"step": 1534
},
{
"epoch": 0.5863254392666157,
"grad_norm": 4.974082818782232,
"learning_rate": 7.712797225140158e-07,
"loss": 0.4751,
"step": 1535
},
{
"epoch": 0.586707410236822,
"grad_norm": 4.7343204793893845,
"learning_rate": 7.700753623028418e-07,
"loss": 0.4987,
"step": 1536
},
{
"epoch": 0.5870893812070282,
"grad_norm": 2.799580518820704,
"learning_rate": 7.688713541055567e-07,
"loss": 0.4883,
"step": 1537
},
{
"epoch": 0.5874713521772346,
"grad_norm": 3.2897436593561826,
"learning_rate": 7.676676997654936e-07,
"loss": 0.495,
"step": 1538
},
{
"epoch": 0.5878533231474408,
"grad_norm": 9.50610746090765,
"learning_rate": 7.664644011254425e-07,
"loss": 0.5041,
"step": 1539
},
{
"epoch": 0.5882352941176471,
"grad_norm": 3.969125093077795,
"learning_rate": 7.652614600276504e-07,
"loss": 0.542,
"step": 1540
},
{
"epoch": 0.5886172650878533,
"grad_norm": 6.534517882111242,
"learning_rate": 7.640588783138165e-07,
"loss": 0.52,
"step": 1541
},
{
"epoch": 0.5889992360580596,
"grad_norm": 3.4968680728699995,
"learning_rate": 7.62856657825089e-07,
"loss": 0.4587,
"step": 1542
},
{
"epoch": 0.5893812070282658,
"grad_norm": 3.5279074975526816,
"learning_rate": 7.616548004020642e-07,
"loss": 0.5227,
"step": 1543
},
{
"epoch": 0.5897631779984721,
"grad_norm": 4.029187714342275,
"learning_rate": 7.604533078847815e-07,
"loss": 0.4126,
"step": 1544
},
{
"epoch": 0.5901451489686784,
"grad_norm": 6.315149413504649,
"learning_rate": 7.592521821127222e-07,
"loss": 0.4608,
"step": 1545
},
{
"epoch": 0.5905271199388846,
"grad_norm": 3.0511915016198174,
"learning_rate": 7.580514249248064e-07,
"loss": 0.5158,
"step": 1546
},
{
"epoch": 0.5909090909090909,
"grad_norm": 4.288807788346786,
"learning_rate": 7.568510381593891e-07,
"loss": 0.5374,
"step": 1547
},
{
"epoch": 0.5912910618792971,
"grad_norm": 2.9619735092858104,
"learning_rate": 7.556510236542591e-07,
"loss": 0.5043,
"step": 1548
},
{
"epoch": 0.5916730328495035,
"grad_norm": 3.853484044967723,
"learning_rate": 7.544513832466343e-07,
"loss": 0.554,
"step": 1549
},
{
"epoch": 0.5920550038197097,
"grad_norm": 3.4507767530399764,
"learning_rate": 7.532521187731607e-07,
"loss": 0.4707,
"step": 1550
},
{
"epoch": 0.592436974789916,
"grad_norm": 2.8676870221969106,
"learning_rate": 7.520532320699079e-07,
"loss": 0.4643,
"step": 1551
},
{
"epoch": 0.5928189457601223,
"grad_norm": 3.4993934273529423,
"learning_rate": 7.508547249723683e-07,
"loss": 0.484,
"step": 1552
},
{
"epoch": 0.5932009167303285,
"grad_norm": 4.974887413665064,
"learning_rate": 7.49656599315452e-07,
"loss": 0.4409,
"step": 1553
},
{
"epoch": 0.5935828877005348,
"grad_norm": 3.0087772046730805,
"learning_rate": 7.484588569334857e-07,
"loss": 0.4802,
"step": 1554
},
{
"epoch": 0.593964858670741,
"grad_norm": 3.556085540496316,
"learning_rate": 7.472614996602094e-07,
"loss": 0.458,
"step": 1555
},
{
"epoch": 0.5943468296409473,
"grad_norm": 2.4299546193349637,
"learning_rate": 7.460645293287727e-07,
"loss": 0.509,
"step": 1556
},
{
"epoch": 0.5947288006111535,
"grad_norm": 3.3761103144904068,
"learning_rate": 7.448679477717339e-07,
"loss": 0.4775,
"step": 1557
},
{
"epoch": 0.5951107715813598,
"grad_norm": 3.189320341493309,
"learning_rate": 7.436717568210555e-07,
"loss": 0.5356,
"step": 1558
},
{
"epoch": 0.595492742551566,
"grad_norm": 3.013388009140299,
"learning_rate": 7.424759583081016e-07,
"loss": 0.4728,
"step": 1559
},
{
"epoch": 0.5958747135217723,
"grad_norm": 3.046578423463988,
"learning_rate": 7.412805540636366e-07,
"loss": 0.5452,
"step": 1560
},
{
"epoch": 0.5962566844919787,
"grad_norm": 2.2952147418741977,
"learning_rate": 7.4008554591782e-07,
"loss": 0.4767,
"step": 1561
},
{
"epoch": 0.5966386554621849,
"grad_norm": 7.486369396821942,
"learning_rate": 7.388909357002056e-07,
"loss": 0.473,
"step": 1562
},
{
"epoch": 0.5970206264323912,
"grad_norm": 3.1043359849327836,
"learning_rate": 7.376967252397384e-07,
"loss": 0.4619,
"step": 1563
},
{
"epoch": 0.5974025974025974,
"grad_norm": 4.387374665190372,
"learning_rate": 7.365029163647498e-07,
"loss": 0.4387,
"step": 1564
},
{
"epoch": 0.5977845683728037,
"grad_norm": 5.097455794718711,
"learning_rate": 7.353095109029586e-07,
"loss": 0.4869,
"step": 1565
},
{
"epoch": 0.5981665393430099,
"grad_norm": 4.110723581933108,
"learning_rate": 7.341165106814635e-07,
"loss": 0.5648,
"step": 1566
},
{
"epoch": 0.5985485103132162,
"grad_norm": 3.0250820237041167,
"learning_rate": 7.329239175267447e-07,
"loss": 0.4695,
"step": 1567
},
{
"epoch": 0.5989304812834224,
"grad_norm": 2.9943913573090417,
"learning_rate": 7.31731733264659e-07,
"loss": 0.4694,
"step": 1568
},
{
"epoch": 0.5993124522536287,
"grad_norm": 4.117308546393739,
"learning_rate": 7.305399597204357e-07,
"loss": 0.435,
"step": 1569
},
{
"epoch": 0.599694423223835,
"grad_norm": 2.55250388950113,
"learning_rate": 7.293485987186768e-07,
"loss": 0.4719,
"step": 1570
},
{
"epoch": 0.6000763941940412,
"grad_norm": 5.6760675149597395,
"learning_rate": 7.281576520833527e-07,
"loss": 0.4344,
"step": 1571
},
{
"epoch": 0.6004583651642476,
"grad_norm": 2.6623917208637913,
"learning_rate": 7.26967121637798e-07,
"loss": 0.4687,
"step": 1572
},
{
"epoch": 0.6008403361344538,
"grad_norm": 4.649288674104955,
"learning_rate": 7.257770092047113e-07,
"loss": 0.4796,
"step": 1573
},
{
"epoch": 0.6012223071046601,
"grad_norm": 4.608891882942033,
"learning_rate": 7.245873166061516e-07,
"loss": 0.5011,
"step": 1574
},
{
"epoch": 0.6016042780748663,
"grad_norm": 4.050761013211949,
"learning_rate": 7.233980456635336e-07,
"loss": 0.526,
"step": 1575
},
{
"epoch": 0.6019862490450726,
"grad_norm": 2.9653108536504873,
"learning_rate": 7.222091981976279e-07,
"loss": 0.5106,
"step": 1576
},
{
"epoch": 0.6023682200152788,
"grad_norm": 4.626014268253724,
"learning_rate": 7.210207760285559e-07,
"loss": 0.551,
"step": 1577
},
{
"epoch": 0.6027501909854851,
"grad_norm": 4.259716690883014,
"learning_rate": 7.198327809757881e-07,
"loss": 0.505,
"step": 1578
},
{
"epoch": 0.6031321619556914,
"grad_norm": 6.6601277749781325,
"learning_rate": 7.186452148581416e-07,
"loss": 0.5592,
"step": 1579
},
{
"epoch": 0.6035141329258976,
"grad_norm": 3.968490857184638,
"learning_rate": 7.174580794937757e-07,
"loss": 0.4628,
"step": 1580
},
{
"epoch": 0.6038961038961039,
"grad_norm": 6.682935705440368,
"learning_rate": 7.162713767001913e-07,
"loss": 0.5647,
"step": 1581
},
{
"epoch": 0.6042780748663101,
"grad_norm": 3.108425341167812,
"learning_rate": 7.150851082942269e-07,
"loss": 0.4292,
"step": 1582
},
{
"epoch": 0.6046600458365164,
"grad_norm": 3.615216176361843,
"learning_rate": 7.138992760920547e-07,
"loss": 0.4771,
"step": 1583
},
{
"epoch": 0.6050420168067226,
"grad_norm": 2.41401307253289,
"learning_rate": 7.127138819091807e-07,
"loss": 0.4679,
"step": 1584
},
{
"epoch": 0.605423987776929,
"grad_norm": 2.975265304410035,
"learning_rate": 7.115289275604399e-07,
"loss": 0.5113,
"step": 1585
},
{
"epoch": 0.6058059587471352,
"grad_norm": 2.4670213978756057,
"learning_rate": 7.103444148599928e-07,
"loss": 0.4444,
"step": 1586
},
{
"epoch": 0.6061879297173415,
"grad_norm": 2.721859542053092,
"learning_rate": 7.091603456213256e-07,
"loss": 0.494,
"step": 1587
},
{
"epoch": 0.6065699006875478,
"grad_norm": 3.1899384268225637,
"learning_rate": 7.079767216572435e-07,
"loss": 0.5085,
"step": 1588
},
{
"epoch": 0.606951871657754,
"grad_norm": 5.541235083933573,
"learning_rate": 7.067935447798715e-07,
"loss": 0.4959,
"step": 1589
},
{
"epoch": 0.6073338426279603,
"grad_norm": 2.276351070925125,
"learning_rate": 7.056108168006501e-07,
"loss": 0.416,
"step": 1590
},
{
"epoch": 0.6077158135981665,
"grad_norm": 5.658392749534633,
"learning_rate": 7.044285395303311e-07,
"loss": 0.5796,
"step": 1591
},
{
"epoch": 0.6080977845683728,
"grad_norm": 6.379675852008344,
"learning_rate": 7.032467147789777e-07,
"loss": 0.5095,
"step": 1592
},
{
"epoch": 0.608479755538579,
"grad_norm": 4.131299850365157,
"learning_rate": 7.020653443559603e-07,
"loss": 0.5488,
"step": 1593
},
{
"epoch": 0.6088617265087853,
"grad_norm": 2.9944028762199797,
"learning_rate": 7.008844300699524e-07,
"loss": 0.4771,
"step": 1594
},
{
"epoch": 0.6092436974789915,
"grad_norm": 5.575470624436518,
"learning_rate": 6.997039737289306e-07,
"loss": 0.4635,
"step": 1595
},
{
"epoch": 0.6096256684491979,
"grad_norm": 3.8486467232113863,
"learning_rate": 6.9852397714017e-07,
"loss": 0.4916,
"step": 1596
},
{
"epoch": 0.6100076394194042,
"grad_norm": 11.050978921728946,
"learning_rate": 6.973444421102407e-07,
"loss": 0.4811,
"step": 1597
},
{
"epoch": 0.6103896103896104,
"grad_norm": 4.822958194483858,
"learning_rate": 6.961653704450083e-07,
"loss": 0.5356,
"step": 1598
},
{
"epoch": 0.6107715813598167,
"grad_norm": 2.817834665524706,
"learning_rate": 6.949867639496266e-07,
"loss": 0.4756,
"step": 1599
},
{
"epoch": 0.6111535523300229,
"grad_norm": 3.1957213238441144,
"learning_rate": 6.938086244285389e-07,
"loss": 0.4073,
"step": 1600
},
{
"epoch": 0.6115355233002292,
"grad_norm": 5.636868095973053,
"learning_rate": 6.926309536854736e-07,
"loss": 0.4167,
"step": 1601
},
{
"epoch": 0.6119174942704354,
"grad_norm": 30.090375746305387,
"learning_rate": 6.914537535234398e-07,
"loss": 0.4582,
"step": 1602
},
{
"epoch": 0.6122994652406417,
"grad_norm": 5.184037798420374,
"learning_rate": 6.902770257447281e-07,
"loss": 0.4662,
"step": 1603
},
{
"epoch": 0.612681436210848,
"grad_norm": 3.671632410757147,
"learning_rate": 6.891007721509044e-07,
"loss": 0.4836,
"step": 1604
},
{
"epoch": 0.6130634071810542,
"grad_norm": 2.950958986784177,
"learning_rate": 6.879249945428096e-07,
"loss": 0.5001,
"step": 1605
},
{
"epoch": 0.6134453781512605,
"grad_norm": 4.3157545193073,
"learning_rate": 6.867496947205552e-07,
"loss": 0.5153,
"step": 1606
},
{
"epoch": 0.6138273491214667,
"grad_norm": 2.250749577401553,
"learning_rate": 6.855748744835215e-07,
"loss": 0.4348,
"step": 1607
},
{
"epoch": 0.6142093200916731,
"grad_norm": 2.2917709378525064,
"learning_rate": 6.844005356303548e-07,
"loss": 0.4505,
"step": 1608
},
{
"epoch": 0.6145912910618793,
"grad_norm": 2.438149237331104,
"learning_rate": 6.83226679958964e-07,
"loss": 0.43,
"step": 1609
},
{
"epoch": 0.6149732620320856,
"grad_norm": 3.02391998302188,
"learning_rate": 6.820533092665184e-07,
"loss": 0.4346,
"step": 1610
},
{
"epoch": 0.6153552330022918,
"grad_norm": 4.366893675828903,
"learning_rate": 6.808804253494447e-07,
"loss": 0.4641,
"step": 1611
},
{
"epoch": 0.6157372039724981,
"grad_norm": 3.2359788318982434,
"learning_rate": 6.797080300034246e-07,
"loss": 0.4959,
"step": 1612
},
{
"epoch": 0.6161191749427044,
"grad_norm": 4.715978801891636,
"learning_rate": 6.785361250233916e-07,
"loss": 0.5739,
"step": 1613
},
{
"epoch": 0.6165011459129106,
"grad_norm": 2.038495378912477,
"learning_rate": 6.773647122035282e-07,
"loss": 0.4311,
"step": 1614
},
{
"epoch": 0.6168831168831169,
"grad_norm": 13.318932449928443,
"learning_rate": 6.761937933372646e-07,
"loss": 0.4765,
"step": 1615
},
{
"epoch": 0.6172650878533231,
"grad_norm": 3.3038029184889157,
"learning_rate": 6.750233702172725e-07,
"loss": 0.4834,
"step": 1616
},
{
"epoch": 0.6176470588235294,
"grad_norm": 3.961278587601363,
"learning_rate": 6.738534446354671e-07,
"loss": 0.517,
"step": 1617
},
{
"epoch": 0.6180290297937356,
"grad_norm": 3.3010476421667527,
"learning_rate": 6.726840183830005e-07,
"loss": 0.4803,
"step": 1618
},
{
"epoch": 0.618411000763942,
"grad_norm": 4.1461864571109555,
"learning_rate": 6.7151509325026e-07,
"loss": 0.504,
"step": 1619
},
{
"epoch": 0.6187929717341482,
"grad_norm": 6.0874786021177,
"learning_rate": 6.703466710268672e-07,
"loss": 0.4243,
"step": 1620
},
{
"epoch": 0.6191749427043545,
"grad_norm": 2.844071999977886,
"learning_rate": 6.691787535016719e-07,
"loss": 0.5363,
"step": 1621
},
{
"epoch": 0.6195569136745608,
"grad_norm": 7.493734331551172,
"learning_rate": 6.680113424627525e-07,
"loss": 0.4752,
"step": 1622
},
{
"epoch": 0.619938884644767,
"grad_norm": 5.034722454254198,
"learning_rate": 6.668444396974118e-07,
"loss": 0.4612,
"step": 1623
},
{
"epoch": 0.6203208556149733,
"grad_norm": 4.666207523543107,
"learning_rate": 6.656780469921739e-07,
"loss": 0.4451,
"step": 1624
},
{
"epoch": 0.6207028265851795,
"grad_norm": 4.324324403295682,
"learning_rate": 6.645121661327823e-07,
"loss": 0.5515,
"step": 1625
},
{
"epoch": 0.6210847975553858,
"grad_norm": 10.74233453054578,
"learning_rate": 6.633467989041974e-07,
"loss": 0.4634,
"step": 1626
},
{
"epoch": 0.621466768525592,
"grad_norm": 3.9368757976763886,
"learning_rate": 6.621819470905919e-07,
"loss": 0.4891,
"step": 1627
},
{
"epoch": 0.6218487394957983,
"grad_norm": 4.195890839175386,
"learning_rate": 6.610176124753512e-07,
"loss": 0.5334,
"step": 1628
},
{
"epoch": 0.6222307104660045,
"grad_norm": 7.033789239854464,
"learning_rate": 6.598537968410669e-07,
"loss": 0.4981,
"step": 1629
},
{
"epoch": 0.6226126814362108,
"grad_norm": 3.670753713023111,
"learning_rate": 6.586905019695374e-07,
"loss": 0.4785,
"step": 1630
},
{
"epoch": 0.6229946524064172,
"grad_norm": 6.79374173212643,
"learning_rate": 6.575277296417641e-07,
"loss": 0.5383,
"step": 1631
},
{
"epoch": 0.6233766233766234,
"grad_norm": 2.961589738049803,
"learning_rate": 6.563654816379467e-07,
"loss": 0.5909,
"step": 1632
},
{
"epoch": 0.6237585943468297,
"grad_norm": 4.548882294306666,
"learning_rate": 6.552037597374835e-07,
"loss": 0.528,
"step": 1633
},
{
"epoch": 0.6241405653170359,
"grad_norm": 5.588629780486591,
"learning_rate": 6.540425657189679e-07,
"loss": 0.5701,
"step": 1634
},
{
"epoch": 0.6245225362872422,
"grad_norm": 3.2222767903382583,
"learning_rate": 6.52881901360183e-07,
"loss": 0.5252,
"step": 1635
},
{
"epoch": 0.6249045072574484,
"grad_norm": 2.708765525352287,
"learning_rate": 6.517217684381027e-07,
"loss": 0.4311,
"step": 1636
},
{
"epoch": 0.6252864782276547,
"grad_norm": 2.4373598394053366,
"learning_rate": 6.505621687288874e-07,
"loss": 0.4887,
"step": 1637
},
{
"epoch": 0.6256684491978609,
"grad_norm": 2.35629298170874,
"learning_rate": 6.494031040078796e-07,
"loss": 0.4235,
"step": 1638
},
{
"epoch": 0.6260504201680672,
"grad_norm": 5.006114811608024,
"learning_rate": 6.482445760496047e-07,
"loss": 0.4431,
"step": 1639
},
{
"epoch": 0.6264323911382735,
"grad_norm": 3.9758100753984205,
"learning_rate": 6.470865866277643e-07,
"loss": 0.4922,
"step": 1640
},
{
"epoch": 0.6268143621084797,
"grad_norm": 3.942533295869188,
"learning_rate": 6.459291375152371e-07,
"loss": 0.5151,
"step": 1641
},
{
"epoch": 0.627196333078686,
"grad_norm": 5.7439273904282935,
"learning_rate": 6.447722304840742e-07,
"loss": 0.4903,
"step": 1642
},
{
"epoch": 0.6275783040488923,
"grad_norm": 6.767150703832732,
"learning_rate": 6.436158673054959e-07,
"loss": 0.5221,
"step": 1643
},
{
"epoch": 0.6279602750190986,
"grad_norm": 3.4573716609857184,
"learning_rate": 6.424600497498909e-07,
"loss": 0.4903,
"step": 1644
},
{
"epoch": 0.6283422459893048,
"grad_norm": 4.3338546356819245,
"learning_rate": 6.413047795868128e-07,
"loss": 0.5131,
"step": 1645
},
{
"epoch": 0.6287242169595111,
"grad_norm": 2.337049188668566,
"learning_rate": 6.401500585849755e-07,
"loss": 0.4417,
"step": 1646
},
{
"epoch": 0.6291061879297173,
"grad_norm": 4.623304772516744,
"learning_rate": 6.389958885122537e-07,
"loss": 0.513,
"step": 1647
},
{
"epoch": 0.6294881588999236,
"grad_norm": 2.825474625036937,
"learning_rate": 6.378422711356784e-07,
"loss": 0.5328,
"step": 1648
},
{
"epoch": 0.6298701298701299,
"grad_norm": 3.183544201518079,
"learning_rate": 6.366892082214335e-07,
"loss": 0.4569,
"step": 1649
},
{
"epoch": 0.6302521008403361,
"grad_norm": 3.6534350262559063,
"learning_rate": 6.355367015348554e-07,
"loss": 0.5391,
"step": 1650
},
{
"epoch": 0.6306340718105424,
"grad_norm": 3.38465233341402,
"learning_rate": 6.343847528404272e-07,
"loss": 0.5551,
"step": 1651
},
{
"epoch": 0.6310160427807486,
"grad_norm": 5.682440315836746,
"learning_rate": 6.332333639017793e-07,
"loss": 0.4981,
"step": 1652
},
{
"epoch": 0.631398013750955,
"grad_norm": 3.1677497208065373,
"learning_rate": 6.320825364816849e-07,
"loss": 0.5261,
"step": 1653
},
{
"epoch": 0.6317799847211611,
"grad_norm": 4.803410013797171,
"learning_rate": 6.309322723420562e-07,
"loss": 0.5549,
"step": 1654
},
{
"epoch": 0.6321619556913675,
"grad_norm": 5.027427394401916,
"learning_rate": 6.297825732439443e-07,
"loss": 0.5482,
"step": 1655
},
{
"epoch": 0.6325439266615738,
"grad_norm": 3.8853463499253422,
"learning_rate": 6.286334409475355e-07,
"loss": 0.501,
"step": 1656
},
{
"epoch": 0.63292589763178,
"grad_norm": 3.8428389865824224,
"learning_rate": 6.274848772121466e-07,
"loss": 0.5422,
"step": 1657
},
{
"epoch": 0.6333078686019863,
"grad_norm": 2.731374604475085,
"learning_rate": 6.263368837962257e-07,
"loss": 0.455,
"step": 1658
},
{
"epoch": 0.6336898395721925,
"grad_norm": 3.4802534529869664,
"learning_rate": 6.251894624573471e-07,
"loss": 0.473,
"step": 1659
},
{
"epoch": 0.6340718105423988,
"grad_norm": 2.9929345370113176,
"learning_rate": 6.240426149522089e-07,
"loss": 0.5272,
"step": 1660
},
{
"epoch": 0.634453781512605,
"grad_norm": 3.861558519836529,
"learning_rate": 6.228963430366314e-07,
"loss": 0.533,
"step": 1661
},
{
"epoch": 0.6348357524828113,
"grad_norm": 4.272779799163776,
"learning_rate": 6.217506484655525e-07,
"loss": 0.5985,
"step": 1662
},
{
"epoch": 0.6352177234530175,
"grad_norm": 4.963715926584021,
"learning_rate": 6.206055329930277e-07,
"loss": 0.4597,
"step": 1663
},
{
"epoch": 0.6355996944232238,
"grad_norm": 7.03579269952758,
"learning_rate": 6.19460998372225e-07,
"loss": 0.449,
"step": 1664
},
{
"epoch": 0.6359816653934302,
"grad_norm": 3.429746839420197,
"learning_rate": 6.18317046355423e-07,
"loss": 0.4666,
"step": 1665
},
{
"epoch": 0.6363636363636364,
"grad_norm": 4.840340168121579,
"learning_rate": 6.171736786940086e-07,
"loss": 0.4849,
"step": 1666
},
{
"epoch": 0.6367456073338427,
"grad_norm": 3.248146554075291,
"learning_rate": 6.160308971384743e-07,
"loss": 0.4846,
"step": 1667
},
{
"epoch": 0.6371275783040489,
"grad_norm": 6.126038353964562,
"learning_rate": 6.148887034384151e-07,
"loss": 0.4926,
"step": 1668
},
{
"epoch": 0.6375095492742552,
"grad_norm": 3.4897863205009316,
"learning_rate": 6.137470993425255e-07,
"loss": 0.4311,
"step": 1669
},
{
"epoch": 0.6378915202444614,
"grad_norm": 3.7539254782155225,
"learning_rate": 6.12606086598598e-07,
"loss": 0.4978,
"step": 1670
},
{
"epoch": 0.6382734912146677,
"grad_norm": 3.6834259647704295,
"learning_rate": 6.114656669535195e-07,
"loss": 0.4303,
"step": 1671
},
{
"epoch": 0.6386554621848739,
"grad_norm": 2.463752930446396,
"learning_rate": 6.103258421532688e-07,
"loss": 0.4979,
"step": 1672
},
{
"epoch": 0.6390374331550802,
"grad_norm": 5.572518032338645,
"learning_rate": 6.091866139429141e-07,
"loss": 0.5014,
"step": 1673
},
{
"epoch": 0.6394194041252865,
"grad_norm": 3.4711321697402036,
"learning_rate": 6.080479840666099e-07,
"loss": 0.4709,
"step": 1674
},
{
"epoch": 0.6398013750954927,
"grad_norm": 13.368561355412897,
"learning_rate": 6.069099542675955e-07,
"loss": 0.5457,
"step": 1675
},
{
"epoch": 0.640183346065699,
"grad_norm": 3.860804853293634,
"learning_rate": 6.057725262881901e-07,
"loss": 0.5455,
"step": 1676
},
{
"epoch": 0.6405653170359052,
"grad_norm": 13.287050371437967,
"learning_rate": 6.046357018697927e-07,
"loss": 0.4535,
"step": 1677
},
{
"epoch": 0.6409472880061116,
"grad_norm": 3.2079302137607826,
"learning_rate": 6.034994827528785e-07,
"loss": 0.4981,
"step": 1678
},
{
"epoch": 0.6413292589763178,
"grad_norm": 2.829754729501561,
"learning_rate": 6.023638706769943e-07,
"loss": 0.4729,
"step": 1679
},
{
"epoch": 0.6417112299465241,
"grad_norm": 2.022832894555702,
"learning_rate": 6.012288673807595e-07,
"loss": 0.4926,
"step": 1680
},
{
"epoch": 0.6420932009167303,
"grad_norm": 2.778956787238978,
"learning_rate": 6.000944746018596e-07,
"loss": 0.4604,
"step": 1681
},
{
"epoch": 0.6424751718869366,
"grad_norm": 3.7374003607417228,
"learning_rate": 5.989606940770469e-07,
"loss": 0.4772,
"step": 1682
},
{
"epoch": 0.6428571428571429,
"grad_norm": 2.7171406940786635,
"learning_rate": 5.97827527542136e-07,
"loss": 0.4831,
"step": 1683
},
{
"epoch": 0.6432391138273491,
"grad_norm": 8.127558067294217,
"learning_rate": 5.966949767320004e-07,
"loss": 0.5173,
"step": 1684
},
{
"epoch": 0.6436210847975554,
"grad_norm": 3.480736121288153,
"learning_rate": 5.955630433805726e-07,
"loss": 0.4912,
"step": 1685
},
{
"epoch": 0.6440030557677616,
"grad_norm": 3.8094185592179697,
"learning_rate": 5.944317292208389e-07,
"loss": 0.5787,
"step": 1686
},
{
"epoch": 0.6443850267379679,
"grad_norm": 2.405101421994293,
"learning_rate": 5.933010359848374e-07,
"loss": 0.4991,
"step": 1687
},
{
"epoch": 0.6447669977081741,
"grad_norm": 2.7304298777313787,
"learning_rate": 5.921709654036556e-07,
"loss": 0.5103,
"step": 1688
},
{
"epoch": 0.6451489686783805,
"grad_norm": 4.805287774289694,
"learning_rate": 5.910415192074288e-07,
"loss": 0.5227,
"step": 1689
},
{
"epoch": 0.6455309396485867,
"grad_norm": 4.1700339428573745,
"learning_rate": 5.899126991253347e-07,
"loss": 0.5112,
"step": 1690
},
{
"epoch": 0.645912910618793,
"grad_norm": 10.044750000664395,
"learning_rate": 5.887845068855939e-07,
"loss": 0.5563,
"step": 1691
},
{
"epoch": 0.6462948815889993,
"grad_norm": 3.469159628822302,
"learning_rate": 5.876569442154644e-07,
"loss": 0.4598,
"step": 1692
},
{
"epoch": 0.6466768525592055,
"grad_norm": 10.365553823123852,
"learning_rate": 5.865300128412415e-07,
"loss": 0.4509,
"step": 1693
},
{
"epoch": 0.6470588235294118,
"grad_norm": 4.032808280921916,
"learning_rate": 5.85403714488254e-07,
"loss": 0.4741,
"step": 1694
},
{
"epoch": 0.647440794499618,
"grad_norm": 5.11652523829025,
"learning_rate": 5.8427805088086e-07,
"loss": 0.4334,
"step": 1695
},
{
"epoch": 0.6478227654698243,
"grad_norm": 4.621337953148375,
"learning_rate": 5.831530237424477e-07,
"loss": 0.4143,
"step": 1696
},
{
"epoch": 0.6482047364400305,
"grad_norm": 3.557162790697937,
"learning_rate": 5.820286347954302e-07,
"loss": 0.4195,
"step": 1697
},
{
"epoch": 0.6485867074102368,
"grad_norm": 4.624049852239171,
"learning_rate": 5.809048857612427e-07,
"loss": 0.4501,
"step": 1698
},
{
"epoch": 0.648968678380443,
"grad_norm": 3.929897503465146,
"learning_rate": 5.797817783603418e-07,
"loss": 0.505,
"step": 1699
},
{
"epoch": 0.6493506493506493,
"grad_norm": 3.1054123771508833,
"learning_rate": 5.786593143122016e-07,
"loss": 0.4928,
"step": 1700
},
{
"epoch": 0.6497326203208557,
"grad_norm": 3.6791158790205816,
"learning_rate": 5.775374953353105e-07,
"loss": 0.4264,
"step": 1701
},
{
"epoch": 0.6501145912910619,
"grad_norm": 3.7861539692813144,
"learning_rate": 5.764163231471704e-07,
"loss": 0.5176,
"step": 1702
},
{
"epoch": 0.6504965622612682,
"grad_norm": 3.1921624515477176,
"learning_rate": 5.752957994642915e-07,
"loss": 0.4663,
"step": 1703
},
{
"epoch": 0.6508785332314744,
"grad_norm": 2.7243804580628055,
"learning_rate": 5.741759260021925e-07,
"loss": 0.4736,
"step": 1704
},
{
"epoch": 0.6512605042016807,
"grad_norm": 2.054999205122801,
"learning_rate": 5.730567044753964e-07,
"loss": 0.4609,
"step": 1705
},
{
"epoch": 0.6516424751718869,
"grad_norm": 4.172156644093354,
"learning_rate": 5.719381365974272e-07,
"loss": 0.4597,
"step": 1706
},
{
"epoch": 0.6520244461420932,
"grad_norm": 2.973154086619092,
"learning_rate": 5.708202240808088e-07,
"loss": 0.5187,
"step": 1707
},
{
"epoch": 0.6524064171122995,
"grad_norm": 29.985463499899154,
"learning_rate": 5.697029686370625e-07,
"loss": 0.4641,
"step": 1708
},
{
"epoch": 0.6527883880825057,
"grad_norm": 4.452631031963895,
"learning_rate": 5.685863719767019e-07,
"loss": 0.54,
"step": 1709
},
{
"epoch": 0.653170359052712,
"grad_norm": 70.46307864789232,
"learning_rate": 5.674704358092331e-07,
"loss": 0.5315,
"step": 1710
},
{
"epoch": 0.6535523300229182,
"grad_norm": 3.2095941246382793,
"learning_rate": 5.663551618431516e-07,
"loss": 0.4697,
"step": 1711
},
{
"epoch": 0.6539343009931246,
"grad_norm": 2.7202164393697963,
"learning_rate": 5.652405517859372e-07,
"loss": 0.5036,
"step": 1712
},
{
"epoch": 0.6543162719633308,
"grad_norm": 6.483848952905334,
"learning_rate": 5.641266073440553e-07,
"loss": 0.4534,
"step": 1713
},
{
"epoch": 0.6546982429335371,
"grad_norm": 3.8732081112384886,
"learning_rate": 5.630133302229505e-07,
"loss": 0.4985,
"step": 1714
},
{
"epoch": 0.6550802139037433,
"grad_norm": 3.4979767733986313,
"learning_rate": 5.619007221270468e-07,
"loss": 0.5346,
"step": 1715
},
{
"epoch": 0.6554621848739496,
"grad_norm": 4.709621876577418,
"learning_rate": 5.607887847597443e-07,
"loss": 0.4209,
"step": 1716
},
{
"epoch": 0.6558441558441559,
"grad_norm": 5.503269999420802,
"learning_rate": 5.596775198234145e-07,
"loss": 0.5286,
"step": 1717
},
{
"epoch": 0.6562261268143621,
"grad_norm": 7.195878519228084,
"learning_rate": 5.585669290194009e-07,
"loss": 0.4672,
"step": 1718
},
{
"epoch": 0.6566080977845684,
"grad_norm": 2.365938759646021,
"learning_rate": 5.574570140480151e-07,
"loss": 0.4023,
"step": 1719
},
{
"epoch": 0.6569900687547746,
"grad_norm": 6.8990648285289655,
"learning_rate": 5.563477766085325e-07,
"loss": 0.4698,
"step": 1720
},
{
"epoch": 0.6573720397249809,
"grad_norm": 3.2125838369759263,
"learning_rate": 5.552392183991928e-07,
"loss": 0.4959,
"step": 1721
},
{
"epoch": 0.6577540106951871,
"grad_norm": 5.146019162532558,
"learning_rate": 5.541313411171944e-07,
"loss": 0.4607,
"step": 1722
},
{
"epoch": 0.6581359816653934,
"grad_norm": 3.421511033601826,
"learning_rate": 5.530241464586944e-07,
"loss": 0.4918,
"step": 1723
},
{
"epoch": 0.6585179526355996,
"grad_norm": 2.058183092172318,
"learning_rate": 5.519176361188043e-07,
"loss": 0.4409,
"step": 1724
},
{
"epoch": 0.658899923605806,
"grad_norm": 2.885911311024911,
"learning_rate": 5.508118117915874e-07,
"loss": 0.5268,
"step": 1725
},
{
"epoch": 0.6592818945760123,
"grad_norm": 2.7223463912778105,
"learning_rate": 5.497066751700577e-07,
"loss": 0.3902,
"step": 1726
},
{
"epoch": 0.6596638655462185,
"grad_norm": 4.6121095889796635,
"learning_rate": 5.486022279461762e-07,
"loss": 0.5185,
"step": 1727
},
{
"epoch": 0.6600458365164248,
"grad_norm": 5.173898079109986,
"learning_rate": 5.474984718108471e-07,
"loss": 0.4709,
"step": 1728
},
{
"epoch": 0.660427807486631,
"grad_norm": 2.7477603466877434,
"learning_rate": 5.463954084539181e-07,
"loss": 0.4943,
"step": 1729
},
{
"epoch": 0.6608097784568373,
"grad_norm": 4.093441308970811,
"learning_rate": 5.45293039564176e-07,
"loss": 0.5402,
"step": 1730
},
{
"epoch": 0.6611917494270435,
"grad_norm": 3.9339067289199883,
"learning_rate": 5.441913668293434e-07,
"loss": 0.4398,
"step": 1731
},
{
"epoch": 0.6615737203972498,
"grad_norm": 10.141199390696602,
"learning_rate": 5.430903919360783e-07,
"loss": 0.5275,
"step": 1732
},
{
"epoch": 0.661955691367456,
"grad_norm": 2.5647842976467383,
"learning_rate": 5.419901165699693e-07,
"loss": 0.4711,
"step": 1733
},
{
"epoch": 0.6623376623376623,
"grad_norm": 2.449797827042685,
"learning_rate": 5.408905424155345e-07,
"loss": 0.4447,
"step": 1734
},
{
"epoch": 0.6627196333078686,
"grad_norm": 2.885197674622287,
"learning_rate": 5.397916711562194e-07,
"loss": 0.4709,
"step": 1735
},
{
"epoch": 0.6631016042780749,
"grad_norm": 3.632198309075157,
"learning_rate": 5.38693504474391e-07,
"loss": 0.4712,
"step": 1736
},
{
"epoch": 0.6634835752482812,
"grad_norm": 3.786664182131556,
"learning_rate": 5.375960440513396e-07,
"loss": 0.5153,
"step": 1737
},
{
"epoch": 0.6638655462184874,
"grad_norm": 2.7266680685160916,
"learning_rate": 5.364992915672741e-07,
"loss": 0.5478,
"step": 1738
},
{
"epoch": 0.6642475171886937,
"grad_norm": 5.075336631040644,
"learning_rate": 5.354032487013182e-07,
"loss": 0.5136,
"step": 1739
},
{
"epoch": 0.6646294881588999,
"grad_norm": 4.790177542890202,
"learning_rate": 5.343079171315106e-07,
"loss": 0.4897,
"step": 1740
},
{
"epoch": 0.6650114591291062,
"grad_norm": 4.956713166422229,
"learning_rate": 5.332132985348006e-07,
"loss": 0.5102,
"step": 1741
},
{
"epoch": 0.6653934300993124,
"grad_norm": 3.7351581764257396,
"learning_rate": 5.32119394587045e-07,
"loss": 0.4859,
"step": 1742
},
{
"epoch": 0.6657754010695187,
"grad_norm": 2.3918201666827517,
"learning_rate": 5.310262069630083e-07,
"loss": 0.4508,
"step": 1743
},
{
"epoch": 0.666157372039725,
"grad_norm": 2.618140032137314,
"learning_rate": 5.299337373363563e-07,
"loss": 0.4534,
"step": 1744
},
{
"epoch": 0.6665393430099312,
"grad_norm": 9.82913689806301,
"learning_rate": 5.28841987379657e-07,
"loss": 0.4735,
"step": 1745
},
{
"epoch": 0.6669213139801375,
"grad_norm": 3.2431944892498827,
"learning_rate": 5.277509587643764e-07,
"loss": 0.4929,
"step": 1746
},
{
"epoch": 0.6673032849503437,
"grad_norm": 6.875034105299384,
"learning_rate": 5.266606531608752e-07,
"loss": 0.4888,
"step": 1747
},
{
"epoch": 0.6676852559205501,
"grad_norm": 2.84407410286601,
"learning_rate": 5.255710722384084e-07,
"loss": 0.4616,
"step": 1748
},
{
"epoch": 0.6680672268907563,
"grad_norm": 8.091849851885527,
"learning_rate": 5.244822176651203e-07,
"loss": 0.5232,
"step": 1749
},
{
"epoch": 0.6684491978609626,
"grad_norm": 4.373080856545653,
"learning_rate": 5.233940911080442e-07,
"loss": 0.4334,
"step": 1750
},
{
"epoch": 0.6688311688311688,
"grad_norm": 3.210487779087646,
"learning_rate": 5.223066942330987e-07,
"loss": 0.571,
"step": 1751
},
{
"epoch": 0.6692131398013751,
"grad_norm": 2.9220362455277082,
"learning_rate": 5.212200287050841e-07,
"loss": 0.4912,
"step": 1752
},
{
"epoch": 0.6695951107715814,
"grad_norm": 4.521842055951422,
"learning_rate": 5.201340961876828e-07,
"loss": 0.551,
"step": 1753
},
{
"epoch": 0.6699770817417876,
"grad_norm": 4.503953851845752,
"learning_rate": 5.190488983434532e-07,
"loss": 0.477,
"step": 1754
},
{
"epoch": 0.6703590527119939,
"grad_norm": 2.6664984094438418,
"learning_rate": 5.179644368338305e-07,
"loss": 0.5084,
"step": 1755
},
{
"epoch": 0.6707410236822001,
"grad_norm": 3.8997783146483314,
"learning_rate": 5.16880713319121e-07,
"loss": 0.5086,
"step": 1756
},
{
"epoch": 0.6711229946524064,
"grad_norm": 3.3464111874272717,
"learning_rate": 5.157977294585026e-07,
"loss": 0.5331,
"step": 1757
},
{
"epoch": 0.6715049656226126,
"grad_norm": 2.6820956918396814,
"learning_rate": 5.147154869100201e-07,
"loss": 0.4768,
"step": 1758
},
{
"epoch": 0.671886936592819,
"grad_norm": 42.88709117437861,
"learning_rate": 5.136339873305831e-07,
"loss": 0.4841,
"step": 1759
},
{
"epoch": 0.6722689075630253,
"grad_norm": 5.276124406268703,
"learning_rate": 5.125532323759643e-07,
"loss": 0.4874,
"step": 1760
},
{
"epoch": 0.6726508785332315,
"grad_norm": 3.229214292636012,
"learning_rate": 5.114732237007957e-07,
"loss": 0.4697,
"step": 1761
},
{
"epoch": 0.6730328495034378,
"grad_norm": 2.4631829837493293,
"learning_rate": 5.103939629585674e-07,
"loss": 0.4387,
"step": 1762
},
{
"epoch": 0.673414820473644,
"grad_norm": 3.009261835474479,
"learning_rate": 5.093154518016245e-07,
"loss": 0.4609,
"step": 1763
},
{
"epoch": 0.6737967914438503,
"grad_norm": 4.0578299577285915,
"learning_rate": 5.082376918811635e-07,
"loss": 0.4755,
"step": 1764
},
{
"epoch": 0.6741787624140565,
"grad_norm": 41.392065884095885,
"learning_rate": 5.071606848472321e-07,
"loss": 0.5212,
"step": 1765
},
{
"epoch": 0.6745607333842628,
"grad_norm": 8.192046614990442,
"learning_rate": 5.060844323487238e-07,
"loss": 0.4903,
"step": 1766
},
{
"epoch": 0.674942704354469,
"grad_norm": 4.99344291784436,
"learning_rate": 5.050089360333782e-07,
"loss": 0.5053,
"step": 1767
},
{
"epoch": 0.6753246753246753,
"grad_norm": 9.403045030732164,
"learning_rate": 5.039341975477773e-07,
"loss": 0.5111,
"step": 1768
},
{
"epoch": 0.6757066462948816,
"grad_norm": 3.869313946071402,
"learning_rate": 5.028602185373413e-07,
"loss": 0.5385,
"step": 1769
},
{
"epoch": 0.6760886172650878,
"grad_norm": 3.2039531425470464,
"learning_rate": 5.017870006463292e-07,
"loss": 0.4524,
"step": 1770
},
{
"epoch": 0.6764705882352942,
"grad_norm": 3.348277780697525,
"learning_rate": 5.007145455178343e-07,
"loss": 0.5328,
"step": 1771
},
{
"epoch": 0.6768525592055004,
"grad_norm": 3.535283899258063,
"learning_rate": 4.996428547937814e-07,
"loss": 0.4704,
"step": 1772
},
{
"epoch": 0.6772345301757067,
"grad_norm": 2.9033639422271107,
"learning_rate": 4.985719301149261e-07,
"loss": 0.5007,
"step": 1773
},
{
"epoch": 0.6776165011459129,
"grad_norm": 2.718338904911866,
"learning_rate": 4.975017731208508e-07,
"loss": 0.4864,
"step": 1774
},
{
"epoch": 0.6779984721161192,
"grad_norm": 2.5266371179234235,
"learning_rate": 4.964323854499619e-07,
"loss": 0.4287,
"step": 1775
},
{
"epoch": 0.6783804430863254,
"grad_norm": 2.9392030294015625,
"learning_rate": 4.953637687394891e-07,
"loss": 0.5271,
"step": 1776
},
{
"epoch": 0.6787624140565317,
"grad_norm": 5.303562101025649,
"learning_rate": 4.942959246254807e-07,
"loss": 0.4197,
"step": 1777
},
{
"epoch": 0.679144385026738,
"grad_norm": 3.775507561159994,
"learning_rate": 4.932288547428026e-07,
"loss": 0.4597,
"step": 1778
},
{
"epoch": 0.6795263559969442,
"grad_norm": 3.6320980633601905,
"learning_rate": 4.921625607251362e-07,
"loss": 0.4709,
"step": 1779
},
{
"epoch": 0.6799083269671505,
"grad_norm": 2.104047258371886,
"learning_rate": 4.910970442049732e-07,
"loss": 0.4822,
"step": 1780
},
{
"epoch": 0.6802902979373567,
"grad_norm": 3.5998342666828607,
"learning_rate": 4.900323068136165e-07,
"loss": 0.5084,
"step": 1781
},
{
"epoch": 0.680672268907563,
"grad_norm": 3.882898421757521,
"learning_rate": 4.889683501811761e-07,
"loss": 0.4657,
"step": 1782
},
{
"epoch": 0.6810542398777693,
"grad_norm": 2.623050039734026,
"learning_rate": 4.879051759365653e-07,
"loss": 0.4837,
"step": 1783
},
{
"epoch": 0.6814362108479756,
"grad_norm": 3.2165127332094583,
"learning_rate": 4.868427857075013e-07,
"loss": 0.4393,
"step": 1784
},
{
"epoch": 0.6818181818181818,
"grad_norm": 2.7033619904930237,
"learning_rate": 4.857811811204996e-07,
"loss": 0.4995,
"step": 1785
},
{
"epoch": 0.6822001527883881,
"grad_norm": 4.353397989819022,
"learning_rate": 4.847203638008735e-07,
"loss": 0.5109,
"step": 1786
},
{
"epoch": 0.6825821237585944,
"grad_norm": 2.745760901165819,
"learning_rate": 4.836603353727316e-07,
"loss": 0.4754,
"step": 1787
},
{
"epoch": 0.6829640947288006,
"grad_norm": 7.346443776679196,
"learning_rate": 4.826010974589731e-07,
"loss": 0.4906,
"step": 1788
},
{
"epoch": 0.6833460656990069,
"grad_norm": 2.3502944826903223,
"learning_rate": 4.815426516812883e-07,
"loss": 0.4692,
"step": 1789
},
{
"epoch": 0.6837280366692131,
"grad_norm": 3.211455722844151,
"learning_rate": 4.804849996601547e-07,
"loss": 0.4798,
"step": 1790
},
{
"epoch": 0.6841100076394194,
"grad_norm": 2.3266255130897493,
"learning_rate": 4.794281430148336e-07,
"loss": 0.4615,
"step": 1791
},
{
"epoch": 0.6844919786096256,
"grad_norm": 3.901880544682035,
"learning_rate": 4.783720833633692e-07,
"loss": 0.4488,
"step": 1792
},
{
"epoch": 0.6848739495798319,
"grad_norm": 11.162179353153887,
"learning_rate": 4.773168223225861e-07,
"loss": 0.4631,
"step": 1793
},
{
"epoch": 0.6852559205500381,
"grad_norm": 4.62060167380066,
"learning_rate": 4.7626236150808487e-07,
"loss": 0.4421,
"step": 1794
},
{
"epoch": 0.6856378915202445,
"grad_norm": 2.661341939327642,
"learning_rate": 4.752087025342422e-07,
"loss": 0.4958,
"step": 1795
},
{
"epoch": 0.6860198624904508,
"grad_norm": 3.355248941702287,
"learning_rate": 4.741558470142061e-07,
"loss": 0.4309,
"step": 1796
},
{
"epoch": 0.686401833460657,
"grad_norm": 2.581711751984442,
"learning_rate": 4.731037965598952e-07,
"loss": 0.4278,
"step": 1797
},
{
"epoch": 0.6867838044308633,
"grad_norm": 2.9895776914154153,
"learning_rate": 4.7205255278199584e-07,
"loss": 0.492,
"step": 1798
},
{
"epoch": 0.6871657754010695,
"grad_norm": 7.478761437708342,
"learning_rate": 4.710021172899582e-07,
"loss": 0.4672,
"step": 1799
},
{
"epoch": 0.6875477463712758,
"grad_norm": 3.3459084056641393,
"learning_rate": 4.6995249169199604e-07,
"loss": 0.4796,
"step": 1800
},
{
"epoch": 0.687929717341482,
"grad_norm": 3.2181859858157433,
"learning_rate": 4.689036775950832e-07,
"loss": 0.4503,
"step": 1801
},
{
"epoch": 0.6883116883116883,
"grad_norm": 4.442300903603122,
"learning_rate": 4.6785567660494987e-07,
"loss": 0.6058,
"step": 1802
},
{
"epoch": 0.6886936592818945,
"grad_norm": 4.49731712760092,
"learning_rate": 4.668084903260827e-07,
"loss": 0.5145,
"step": 1803
},
{
"epoch": 0.6890756302521008,
"grad_norm": 3.0556267963416066,
"learning_rate": 4.657621203617209e-07,
"loss": 0.4599,
"step": 1804
},
{
"epoch": 0.6894576012223071,
"grad_norm": 4.244715633338391,
"learning_rate": 4.6471656831385285e-07,
"loss": 0.462,
"step": 1805
},
{
"epoch": 0.6898395721925134,
"grad_norm": 4.128236164165927,
"learning_rate": 4.636718357832161e-07,
"loss": 0.5331,
"step": 1806
},
{
"epoch": 0.6902215431627197,
"grad_norm": 4.0861345040191255,
"learning_rate": 4.626279243692922e-07,
"loss": 0.4439,
"step": 1807
},
{
"epoch": 0.6906035141329259,
"grad_norm": 5.750611707089236,
"learning_rate": 4.6158483567030635e-07,
"loss": 0.4304,
"step": 1808
},
{
"epoch": 0.6909854851031322,
"grad_norm": 7.196354964764559,
"learning_rate": 4.605425712832246e-07,
"loss": 0.4915,
"step": 1809
},
{
"epoch": 0.6913674560733384,
"grad_norm": 5.987041811737305,
"learning_rate": 4.595011328037496e-07,
"loss": 0.5501,
"step": 1810
},
{
"epoch": 0.6917494270435447,
"grad_norm": 4.934071242682319,
"learning_rate": 4.584605218263207e-07,
"loss": 0.58,
"step": 1811
},
{
"epoch": 0.692131398013751,
"grad_norm": 3.3954147958369374,
"learning_rate": 4.5742073994411045e-07,
"loss": 0.5062,
"step": 1812
},
{
"epoch": 0.6925133689839572,
"grad_norm": 2.2667132893887794,
"learning_rate": 4.563817887490207e-07,
"loss": 0.4208,
"step": 1813
},
{
"epoch": 0.6928953399541635,
"grad_norm": 3.929797491140602,
"learning_rate": 4.5534366983168293e-07,
"loss": 0.4252,
"step": 1814
},
{
"epoch": 0.6932773109243697,
"grad_norm": 3.022047260374716,
"learning_rate": 4.5430638478145434e-07,
"loss": 0.5903,
"step": 1815
},
{
"epoch": 0.693659281894576,
"grad_norm": 6.51145142891472,
"learning_rate": 4.532699351864141e-07,
"loss": 0.4997,
"step": 1816
},
{
"epoch": 0.6940412528647822,
"grad_norm": 2.4956649196707836,
"learning_rate": 4.5223432263336404e-07,
"loss": 0.4496,
"step": 1817
},
{
"epoch": 0.6944232238349886,
"grad_norm": 116.55753178803175,
"learning_rate": 4.5119954870782305e-07,
"loss": 0.4883,
"step": 1818
},
{
"epoch": 0.6948051948051948,
"grad_norm": 3.5845588926175225,
"learning_rate": 4.5016561499402703e-07,
"loss": 0.4719,
"step": 1819
},
{
"epoch": 0.6951871657754011,
"grad_norm": 4.438982485708564,
"learning_rate": 4.4913252307492556e-07,
"loss": 0.4869,
"step": 1820
},
{
"epoch": 0.6955691367456074,
"grad_norm": 3.5818159150843094,
"learning_rate": 4.4810027453217834e-07,
"loss": 0.4822,
"step": 1821
},
{
"epoch": 0.6959511077158136,
"grad_norm": 2.593057013601486,
"learning_rate": 4.47068870946155e-07,
"loss": 0.4946,
"step": 1822
},
{
"epoch": 0.6963330786860199,
"grad_norm": 7.773289890571107,
"learning_rate": 4.460383138959315e-07,
"loss": 0.5188,
"step": 1823
},
{
"epoch": 0.6967150496562261,
"grad_norm": 4.718899974607956,
"learning_rate": 4.4500860495928663e-07,
"loss": 0.4838,
"step": 1824
},
{
"epoch": 0.6970970206264324,
"grad_norm": 3.8665984652103607,
"learning_rate": 4.439797457127019e-07,
"loss": 0.4639,
"step": 1825
},
{
"epoch": 0.6974789915966386,
"grad_norm": 4.622011857988872,
"learning_rate": 4.42951737731358e-07,
"loss": 0.456,
"step": 1826
},
{
"epoch": 0.6978609625668449,
"grad_norm": 6.110123882313781,
"learning_rate": 4.4192458258913103e-07,
"loss": 0.5425,
"step": 1827
},
{
"epoch": 0.6982429335370511,
"grad_norm": 28.584952836114386,
"learning_rate": 4.408982818585929e-07,
"loss": 0.4163,
"step": 1828
},
{
"epoch": 0.6986249045072574,
"grad_norm": 3.081629631190153,
"learning_rate": 4.398728371110063e-07,
"loss": 0.4653,
"step": 1829
},
{
"epoch": 0.6990068754774638,
"grad_norm": 3.8023416610110368,
"learning_rate": 4.38848249916324e-07,
"loss": 0.5274,
"step": 1830
},
{
"epoch": 0.69938884644767,
"grad_norm": 13.68679754109722,
"learning_rate": 4.378245218431862e-07,
"loss": 0.4907,
"step": 1831
},
{
"epoch": 0.6997708174178763,
"grad_norm": 2.989676329168659,
"learning_rate": 4.368016544589166e-07,
"loss": 0.5026,
"step": 1832
},
{
"epoch": 0.7001527883880825,
"grad_norm": 2.456891595424161,
"learning_rate": 4.357796493295222e-07,
"loss": 0.4148,
"step": 1833
},
{
"epoch": 0.7005347593582888,
"grad_norm": 3.01597433449375,
"learning_rate": 4.3475850801969007e-07,
"loss": 0.4704,
"step": 1834
},
{
"epoch": 0.700916730328495,
"grad_norm": 2.6339512449755786,
"learning_rate": 4.3373823209278336e-07,
"loss": 0.5132,
"step": 1835
},
{
"epoch": 0.7012987012987013,
"grad_norm": 2.9970556970320157,
"learning_rate": 4.32718823110842e-07,
"loss": 0.5798,
"step": 1836
},
{
"epoch": 0.7016806722689075,
"grad_norm": 2.2711103632944103,
"learning_rate": 4.31700282634578e-07,
"loss": 0.4762,
"step": 1837
},
{
"epoch": 0.7020626432391138,
"grad_norm": 3.0551038729760784,
"learning_rate": 4.306826122233729e-07,
"loss": 0.5223,
"step": 1838
},
{
"epoch": 0.7024446142093201,
"grad_norm": 6.796489619512551,
"learning_rate": 4.2966581343527765e-07,
"loss": 0.5409,
"step": 1839
},
{
"epoch": 0.7028265851795263,
"grad_norm": 4.240579878349909,
"learning_rate": 4.2864988782700716e-07,
"loss": 0.3891,
"step": 1840
},
{
"epoch": 0.7032085561497327,
"grad_norm": 5.563576383242488,
"learning_rate": 4.276348369539408e-07,
"loss": 0.4891,
"step": 1841
},
{
"epoch": 0.7035905271199389,
"grad_norm": 2.566057792322101,
"learning_rate": 4.266206623701183e-07,
"loss": 0.4866,
"step": 1842
},
{
"epoch": 0.7039724980901452,
"grad_norm": 3.7630909631063134,
"learning_rate": 4.256073656282373e-07,
"loss": 0.4461,
"step": 1843
},
{
"epoch": 0.7043544690603514,
"grad_norm": 3.0635270757473827,
"learning_rate": 4.2459494827965213e-07,
"loss": 0.4605,
"step": 1844
},
{
"epoch": 0.7047364400305577,
"grad_norm": 4.764068162210865,
"learning_rate": 4.2358341187437085e-07,
"loss": 0.4986,
"step": 1845
},
{
"epoch": 0.7051184110007639,
"grad_norm": 2.8256747931679063,
"learning_rate": 4.2257275796105184e-07,
"loss": 0.4251,
"step": 1846
},
{
"epoch": 0.7055003819709702,
"grad_norm": 3.9070473949332203,
"learning_rate": 4.2156298808700374e-07,
"loss": 0.5431,
"step": 1847
},
{
"epoch": 0.7058823529411765,
"grad_norm": 3.836471191964947,
"learning_rate": 4.205541037981802e-07,
"loss": 0.4255,
"step": 1848
},
{
"epoch": 0.7062643239113827,
"grad_norm": 6.280578101022665,
"learning_rate": 4.1954610663918046e-07,
"loss": 0.4583,
"step": 1849
},
{
"epoch": 0.706646294881589,
"grad_norm": 5.874313472902486,
"learning_rate": 4.18538998153245e-07,
"loss": 0.4825,
"step": 1850
},
{
"epoch": 0.7070282658517952,
"grad_norm": 2.722705730108688,
"learning_rate": 4.175327798822531e-07,
"loss": 0.4786,
"step": 1851
},
{
"epoch": 0.7074102368220015,
"grad_norm": 4.555491048283832,
"learning_rate": 4.1652745336672224e-07,
"loss": 0.4718,
"step": 1852
},
{
"epoch": 0.7077922077922078,
"grad_norm": 2.684682727559695,
"learning_rate": 4.1552302014580433e-07,
"loss": 0.4009,
"step": 1853
},
{
"epoch": 0.7081741787624141,
"grad_norm": 2.820889863379384,
"learning_rate": 4.1451948175728267e-07,
"loss": 0.4717,
"step": 1854
},
{
"epoch": 0.7085561497326203,
"grad_norm": 2.6265538016576713,
"learning_rate": 4.135168397375718e-07,
"loss": 0.429,
"step": 1855
},
{
"epoch": 0.7089381207028266,
"grad_norm": 3.7741584067315914,
"learning_rate": 4.125150956217138e-07,
"loss": 0.5093,
"step": 1856
},
{
"epoch": 0.7093200916730329,
"grad_norm": 5.30058124522937,
"learning_rate": 4.1151425094337513e-07,
"loss": 0.499,
"step": 1857
},
{
"epoch": 0.7097020626432391,
"grad_norm": 5.4902094944883055,
"learning_rate": 4.1051430723484623e-07,
"loss": 0.4655,
"step": 1858
},
{
"epoch": 0.7100840336134454,
"grad_norm": 6.162461259611622,
"learning_rate": 4.0951526602703735e-07,
"loss": 0.4572,
"step": 1859
},
{
"epoch": 0.7104660045836516,
"grad_norm": 3.500319460088784,
"learning_rate": 4.085171288494774e-07,
"loss": 0.5129,
"step": 1860
},
{
"epoch": 0.7108479755538579,
"grad_norm": 2.7923411056600984,
"learning_rate": 4.07519897230312e-07,
"loss": 0.5116,
"step": 1861
},
{
"epoch": 0.7112299465240641,
"grad_norm": 3.094231599936001,
"learning_rate": 4.0652357269629857e-07,
"loss": 0.4578,
"step": 1862
},
{
"epoch": 0.7116119174942704,
"grad_norm": 4.952507106482637,
"learning_rate": 4.055281567728076e-07,
"loss": 0.4675,
"step": 1863
},
{
"epoch": 0.7119938884644768,
"grad_norm": 3.158969980474859,
"learning_rate": 4.0453365098381695e-07,
"loss": 0.4729,
"step": 1864
},
{
"epoch": 0.712375859434683,
"grad_norm": 4.0128858021274425,
"learning_rate": 4.035400568519122e-07,
"loss": 0.5799,
"step": 1865
},
{
"epoch": 0.7127578304048893,
"grad_norm": 5.726579023569035,
"learning_rate": 4.0254737589828323e-07,
"loss": 0.5642,
"step": 1866
},
{
"epoch": 0.7131398013750955,
"grad_norm": 4.8738224391570135,
"learning_rate": 4.015556096427206e-07,
"loss": 0.4978,
"step": 1867
},
{
"epoch": 0.7135217723453018,
"grad_norm": 1.9393008560746685,
"learning_rate": 4.0056475960361615e-07,
"loss": 0.4016,
"step": 1868
},
{
"epoch": 0.713903743315508,
"grad_norm": 11.109760924089572,
"learning_rate": 3.9957482729795735e-07,
"loss": 0.5351,
"step": 1869
},
{
"epoch": 0.7142857142857143,
"grad_norm": 3.5030857014942542,
"learning_rate": 3.9858581424132766e-07,
"loss": 0.5276,
"step": 1870
},
{
"epoch": 0.7146676852559205,
"grad_norm": 18.94949727094923,
"learning_rate": 3.975977219479033e-07,
"loss": 0.4927,
"step": 1871
},
{
"epoch": 0.7150496562261268,
"grad_norm": 2.985867695957091,
"learning_rate": 3.966105519304499e-07,
"loss": 0.4805,
"step": 1872
},
{
"epoch": 0.7154316271963331,
"grad_norm": 11.239709251982658,
"learning_rate": 3.956243057003222e-07,
"loss": 0.4813,
"step": 1873
},
{
"epoch": 0.7158135981665393,
"grad_norm": 2.3920035236201294,
"learning_rate": 3.946389847674594e-07,
"loss": 0.4808,
"step": 1874
},
{
"epoch": 0.7161955691367456,
"grad_norm": 3.606251827873003,
"learning_rate": 3.936545906403853e-07,
"loss": 0.5318,
"step": 1875
},
{
"epoch": 0.7165775401069518,
"grad_norm": 3.0718763403473455,
"learning_rate": 3.9267112482620344e-07,
"loss": 0.4164,
"step": 1876
},
{
"epoch": 0.7169595110771582,
"grad_norm": 7.259618461181073,
"learning_rate": 3.9168858883059743e-07,
"loss": 0.444,
"step": 1877
},
{
"epoch": 0.7173414820473644,
"grad_norm": 12.93592963046385,
"learning_rate": 3.90706984157827e-07,
"loss": 0.5208,
"step": 1878
},
{
"epoch": 0.7177234530175707,
"grad_norm": 2.4143723426987047,
"learning_rate": 3.8972631231072493e-07,
"loss": 0.4594,
"step": 1879
},
{
"epoch": 0.7181054239877769,
"grad_norm": 3.2086077522932284,
"learning_rate": 3.8874657479069763e-07,
"loss": 0.5383,
"step": 1880
},
{
"epoch": 0.7184873949579832,
"grad_norm": 10.33731462373807,
"learning_rate": 3.8776777309771934e-07,
"loss": 0.4971,
"step": 1881
},
{
"epoch": 0.7188693659281895,
"grad_norm": 3.9050481213749215,
"learning_rate": 3.867899087303326e-07,
"loss": 0.5348,
"step": 1882
},
{
"epoch": 0.7192513368983957,
"grad_norm": 28.55017424878132,
"learning_rate": 3.85812983185645e-07,
"loss": 0.4986,
"step": 1883
},
{
"epoch": 0.719633307868602,
"grad_norm": 8.08468107153252,
"learning_rate": 3.848369979593259e-07,
"loss": 0.4773,
"step": 1884
},
{
"epoch": 0.7200152788388082,
"grad_norm": 3.407078286876332,
"learning_rate": 3.838619545456059e-07,
"loss": 0.4837,
"step": 1885
},
{
"epoch": 0.7203972498090145,
"grad_norm": 3.383788660732898,
"learning_rate": 3.8288785443727357e-07,
"loss": 0.5097,
"step": 1886
},
{
"epoch": 0.7207792207792207,
"grad_norm": 2.5435503955321086,
"learning_rate": 3.8191469912567243e-07,
"loss": 0.4634,
"step": 1887
},
{
"epoch": 0.721161191749427,
"grad_norm": 3.0284976642181256,
"learning_rate": 3.8094249010070047e-07,
"loss": 0.4529,
"step": 1888
},
{
"epoch": 0.7215431627196333,
"grad_norm": 2.422343762556238,
"learning_rate": 3.799712288508071e-07,
"loss": 0.5196,
"step": 1889
},
{
"epoch": 0.7219251336898396,
"grad_norm": 7.258011180896353,
"learning_rate": 3.790009168629895e-07,
"loss": 0.4744,
"step": 1890
},
{
"epoch": 0.7223071046600459,
"grad_norm": 3.660883946533744,
"learning_rate": 3.7803155562279276e-07,
"loss": 0.4803,
"step": 1891
},
{
"epoch": 0.7226890756302521,
"grad_norm": 2.473442719558735,
"learning_rate": 3.770631466143054e-07,
"loss": 0.4593,
"step": 1892
},
{
"epoch": 0.7230710466004584,
"grad_norm": 5.950510676950638,
"learning_rate": 3.7609569132015863e-07,
"loss": 0.494,
"step": 1893
},
{
"epoch": 0.7234530175706646,
"grad_norm": 2.237081011958015,
"learning_rate": 3.7512919122152397e-07,
"loss": 0.4394,
"step": 1894
},
{
"epoch": 0.7238349885408709,
"grad_norm": 2.7709843570858714,
"learning_rate": 3.741636477981092e-07,
"loss": 0.5006,
"step": 1895
},
{
"epoch": 0.7242169595110771,
"grad_norm": 2.6252764983751344,
"learning_rate": 3.7319906252815857e-07,
"loss": 0.4693,
"step": 1896
},
{
"epoch": 0.7245989304812834,
"grad_norm": 5.617419755723076,
"learning_rate": 3.722354368884495e-07,
"loss": 0.4652,
"step": 1897
},
{
"epoch": 0.7249809014514896,
"grad_norm": 3.5361263292557354,
"learning_rate": 3.71272772354289e-07,
"loss": 0.5213,
"step": 1898
},
{
"epoch": 0.725362872421696,
"grad_norm": 2.5949928089139176,
"learning_rate": 3.703110703995137e-07,
"loss": 0.5325,
"step": 1899
},
{
"epoch": 0.7257448433919023,
"grad_norm": 19.664846071662925,
"learning_rate": 3.693503324964867e-07,
"loss": 0.5319,
"step": 1900
},
{
"epoch": 0.7261268143621085,
"grad_norm": 4.070155434112447,
"learning_rate": 3.68390560116094e-07,
"loss": 0.4306,
"step": 1901
},
{
"epoch": 0.7265087853323148,
"grad_norm": 13.811969565605493,
"learning_rate": 3.674317547277447e-07,
"loss": 0.5101,
"step": 1902
},
{
"epoch": 0.726890756302521,
"grad_norm": 14.984568645747874,
"learning_rate": 3.66473917799366e-07,
"loss": 0.4773,
"step": 1903
},
{
"epoch": 0.7272727272727273,
"grad_norm": 2.608868304564764,
"learning_rate": 3.655170507974037e-07,
"loss": 0.4502,
"step": 1904
},
{
"epoch": 0.7276546982429335,
"grad_norm": 3.160544844419424,
"learning_rate": 3.645611551868184e-07,
"loss": 0.5006,
"step": 1905
},
{
"epoch": 0.7280366692131398,
"grad_norm": 4.278119381822793,
"learning_rate": 3.636062324310826e-07,
"loss": 0.4901,
"step": 1906
},
{
"epoch": 0.728418640183346,
"grad_norm": 4.951157730675554,
"learning_rate": 3.626522839921803e-07,
"loss": 0.4416,
"step": 1907
},
{
"epoch": 0.7288006111535523,
"grad_norm": 4.634081370664527,
"learning_rate": 3.6169931133060385e-07,
"loss": 0.3973,
"step": 1908
},
{
"epoch": 0.7291825821237586,
"grad_norm": 3.7413225541810857,
"learning_rate": 3.607473159053507e-07,
"loss": 0.4926,
"step": 1909
},
{
"epoch": 0.7295645530939648,
"grad_norm": 3.690256429087379,
"learning_rate": 3.597962991739235e-07,
"loss": 0.4893,
"step": 1910
},
{
"epoch": 0.7299465240641712,
"grad_norm": 2.7298792210542415,
"learning_rate": 3.588462625923252e-07,
"loss": 0.4719,
"step": 1911
},
{
"epoch": 0.7303284950343774,
"grad_norm": 9.765728465247214,
"learning_rate": 3.5789720761505913e-07,
"loss": 0.5138,
"step": 1912
},
{
"epoch": 0.7307104660045837,
"grad_norm": 2.328736820943766,
"learning_rate": 3.5694913569512565e-07,
"loss": 0.475,
"step": 1913
},
{
"epoch": 0.7310924369747899,
"grad_norm": 2.562188291710903,
"learning_rate": 3.560020482840194e-07,
"loss": 0.5071,
"step": 1914
},
{
"epoch": 0.7314744079449962,
"grad_norm": 3.682707847406333,
"learning_rate": 3.5505594683172824e-07,
"loss": 0.478,
"step": 1915
},
{
"epoch": 0.7318563789152025,
"grad_norm": 16.26499522607614,
"learning_rate": 3.54110832786731e-07,
"loss": 0.5525,
"step": 1916
},
{
"epoch": 0.7322383498854087,
"grad_norm": 6.995416001335222,
"learning_rate": 3.5316670759599366e-07,
"loss": 0.4483,
"step": 1917
},
{
"epoch": 0.732620320855615,
"grad_norm": 27.820129981944024,
"learning_rate": 3.5222357270496906e-07,
"loss": 0.4273,
"step": 1918
},
{
"epoch": 0.7330022918258212,
"grad_norm": 3.6945885753820793,
"learning_rate": 3.512814295575942e-07,
"loss": 0.5197,
"step": 1919
},
{
"epoch": 0.7333842627960275,
"grad_norm": 2.9068072278616364,
"learning_rate": 3.5034027959628653e-07,
"loss": 0.5252,
"step": 1920
},
{
"epoch": 0.7337662337662337,
"grad_norm": 5.906982103662624,
"learning_rate": 3.494001242619442e-07,
"loss": 0.469,
"step": 1921
},
{
"epoch": 0.73414820473644,
"grad_norm": 4.279419872340727,
"learning_rate": 3.484609649939415e-07,
"loss": 0.466,
"step": 1922
},
{
"epoch": 0.7345301757066462,
"grad_norm": 9.103811203425218,
"learning_rate": 3.475228032301286e-07,
"loss": 0.542,
"step": 1923
},
{
"epoch": 0.7349121466768526,
"grad_norm": 2.407127382987992,
"learning_rate": 3.465856404068285e-07,
"loss": 0.4808,
"step": 1924
},
{
"epoch": 0.7352941176470589,
"grad_norm": 7.60242894892141,
"learning_rate": 3.456494779588337e-07,
"loss": 0.5326,
"step": 1925
},
{
"epoch": 0.7356760886172651,
"grad_norm": 2.572693667995342,
"learning_rate": 3.447143173194065e-07,
"loss": 0.4897,
"step": 1926
},
{
"epoch": 0.7360580595874714,
"grad_norm": 3.622211923018903,
"learning_rate": 3.43780159920275e-07,
"loss": 0.5363,
"step": 1927
},
{
"epoch": 0.7364400305576776,
"grad_norm": 3.0667032122854327,
"learning_rate": 3.4284700719163064e-07,
"loss": 0.4805,
"step": 1928
},
{
"epoch": 0.7368220015278839,
"grad_norm": 3.450568284892199,
"learning_rate": 3.419148605621276e-07,
"loss": 0.5734,
"step": 1929
},
{
"epoch": 0.7372039724980901,
"grad_norm": 2.5215067484094273,
"learning_rate": 3.4098372145887975e-07,
"loss": 0.478,
"step": 1930
},
{
"epoch": 0.7375859434682964,
"grad_norm": 2.739279615526753,
"learning_rate": 3.400535913074574e-07,
"loss": 0.595,
"step": 1931
},
{
"epoch": 0.7379679144385026,
"grad_norm": 2.549133107179413,
"learning_rate": 3.391244715318875e-07,
"loss": 0.4223,
"step": 1932
},
{
"epoch": 0.7383498854087089,
"grad_norm": 3.3906537717385126,
"learning_rate": 3.3819636355464875e-07,
"loss": 0.4583,
"step": 1933
},
{
"epoch": 0.7387318563789153,
"grad_norm": 3.1669669812165693,
"learning_rate": 3.3726926879667207e-07,
"loss": 0.4301,
"step": 1934
},
{
"epoch": 0.7391138273491215,
"grad_norm": 3.609455837206172,
"learning_rate": 3.363431886773367e-07,
"loss": 0.4509,
"step": 1935
},
{
"epoch": 0.7394957983193278,
"grad_norm": 3.2263686183138103,
"learning_rate": 3.354181246144677e-07,
"loss": 0.47,
"step": 1936
},
{
"epoch": 0.739877769289534,
"grad_norm": 7.8689314970635325,
"learning_rate": 3.3449407802433573e-07,
"loss": 0.4766,
"step": 1937
},
{
"epoch": 0.7402597402597403,
"grad_norm": 12.297816660666351,
"learning_rate": 3.3357105032165323e-07,
"loss": 0.4759,
"step": 1938
},
{
"epoch": 0.7406417112299465,
"grad_norm": 3.278150809156143,
"learning_rate": 3.326490429195723e-07,
"loss": 0.5303,
"step": 1939
},
{
"epoch": 0.7410236822001528,
"grad_norm": 2.5949186267966082,
"learning_rate": 3.317280572296834e-07,
"loss": 0.527,
"step": 1940
},
{
"epoch": 0.741405653170359,
"grad_norm": 2.1671563182970455,
"learning_rate": 3.308080946620133e-07,
"loss": 0.4465,
"step": 1941
},
{
"epoch": 0.7417876241405653,
"grad_norm": 4.0924980474366155,
"learning_rate": 3.298891566250209e-07,
"loss": 0.4535,
"step": 1942
},
{
"epoch": 0.7421695951107716,
"grad_norm": 3.707669815731349,
"learning_rate": 3.289712445255981e-07,
"loss": 0.508,
"step": 1943
},
{
"epoch": 0.7425515660809778,
"grad_norm": 3.200538361880574,
"learning_rate": 3.280543597690648e-07,
"loss": 0.4451,
"step": 1944
},
{
"epoch": 0.7429335370511841,
"grad_norm": 5.354158493938534,
"learning_rate": 3.27138503759169e-07,
"loss": 0.48,
"step": 1945
},
{
"epoch": 0.7433155080213903,
"grad_norm": 4.059813880081117,
"learning_rate": 3.262236778980836e-07,
"loss": 0.5653,
"step": 1946
},
{
"epoch": 0.7436974789915967,
"grad_norm": 10.018457891659642,
"learning_rate": 3.2530988358640334e-07,
"loss": 0.4526,
"step": 1947
},
{
"epoch": 0.7440794499618029,
"grad_norm": 6.077309375315518,
"learning_rate": 3.2439712222314496e-07,
"loss": 0.5287,
"step": 1948
},
{
"epoch": 0.7444614209320092,
"grad_norm": 7.0200332373569,
"learning_rate": 3.2348539520574337e-07,
"loss": 0.5588,
"step": 1949
},
{
"epoch": 0.7448433919022154,
"grad_norm": 3.051416350497911,
"learning_rate": 3.2257470393004903e-07,
"loss": 0.5365,
"step": 1950
},
{
"epoch": 0.7452253628724217,
"grad_norm": 3.286316707945846,
"learning_rate": 3.2166504979032794e-07,
"loss": 0.4837,
"step": 1951
},
{
"epoch": 0.745607333842628,
"grad_norm": 8.761969558075574,
"learning_rate": 3.207564341792578e-07,
"loss": 0.5069,
"step": 1952
},
{
"epoch": 0.7459893048128342,
"grad_norm": 8.10687240149731,
"learning_rate": 3.1984885848792564e-07,
"loss": 0.517,
"step": 1953
},
{
"epoch": 0.7463712757830405,
"grad_norm": 2.5829528950320664,
"learning_rate": 3.1894232410582754e-07,
"loss": 0.4683,
"step": 1954
},
{
"epoch": 0.7467532467532467,
"grad_norm": 3.2280729471211895,
"learning_rate": 3.180368324208643e-07,
"loss": 0.4826,
"step": 1955
},
{
"epoch": 0.747135217723453,
"grad_norm": 19.541963030290834,
"learning_rate": 3.17132384819341e-07,
"loss": 0.4297,
"step": 1956
},
{
"epoch": 0.7475171886936592,
"grad_norm": 3.293546129592615,
"learning_rate": 3.162289826859643e-07,
"loss": 0.5001,
"step": 1957
},
{
"epoch": 0.7478991596638656,
"grad_norm": 3.293069144037459,
"learning_rate": 3.153266274038395e-07,
"loss": 0.4919,
"step": 1958
},
{
"epoch": 0.7482811306340718,
"grad_norm": 4.082598515187843,
"learning_rate": 3.144253203544699e-07,
"loss": 0.5036,
"step": 1959
},
{
"epoch": 0.7486631016042781,
"grad_norm": 3.542666640119538,
"learning_rate": 3.1352506291775396e-07,
"loss": 0.4648,
"step": 1960
},
{
"epoch": 0.7490450725744844,
"grad_norm": 6.080114343806355,
"learning_rate": 3.126258564719825e-07,
"loss": 0.4833,
"step": 1961
},
{
"epoch": 0.7494270435446906,
"grad_norm": 6.1238966887725805,
"learning_rate": 3.1172770239383795e-07,
"loss": 0.4712,
"step": 1962
},
{
"epoch": 0.7498090145148969,
"grad_norm": 3.0678578727874464,
"learning_rate": 3.1083060205839164e-07,
"loss": 0.4988,
"step": 1963
},
{
"epoch": 0.7501909854851031,
"grad_norm": 7.899514482543266,
"learning_rate": 3.0993455683910073e-07,
"loss": 0.4609,
"step": 1964
},
{
"epoch": 0.7505729564553094,
"grad_norm": 9.342075315504848,
"learning_rate": 3.0903956810780817e-07,
"loss": 0.4689,
"step": 1965
},
{
"epoch": 0.7509549274255156,
"grad_norm": 2.56279037396652,
"learning_rate": 3.081456372347384e-07,
"loss": 0.4273,
"step": 1966
},
{
"epoch": 0.7513368983957219,
"grad_norm": 2.618297306043059,
"learning_rate": 3.0725276558849687e-07,
"loss": 0.4186,
"step": 1967
},
{
"epoch": 0.7517188693659282,
"grad_norm": 4.148327631141519,
"learning_rate": 3.063609545360676e-07,
"loss": 0.5607,
"step": 1968
},
{
"epoch": 0.7521008403361344,
"grad_norm": 4.97633562270342,
"learning_rate": 3.0547020544280987e-07,
"loss": 0.4812,
"step": 1969
},
{
"epoch": 0.7524828113063408,
"grad_norm": 9.0842543595599,
"learning_rate": 3.045805196724581e-07,
"loss": 0.5206,
"step": 1970
},
{
"epoch": 0.752864782276547,
"grad_norm": 9.27585635939645,
"learning_rate": 3.036918985871185e-07,
"loss": 0.4541,
"step": 1971
},
{
"epoch": 0.7532467532467533,
"grad_norm": 6.796792142983438,
"learning_rate": 3.028043435472667e-07,
"loss": 0.486,
"step": 1972
},
{
"epoch": 0.7536287242169595,
"grad_norm": 2.1624545717546257,
"learning_rate": 3.019178559117471e-07,
"loss": 0.4359,
"step": 1973
},
{
"epoch": 0.7540106951871658,
"grad_norm": 9.147224778529468,
"learning_rate": 3.010324370377689e-07,
"loss": 0.4562,
"step": 1974
},
{
"epoch": 0.754392666157372,
"grad_norm": 6.290289585365308,
"learning_rate": 3.001480882809059e-07,
"loss": 0.457,
"step": 1975
},
{
"epoch": 0.7547746371275783,
"grad_norm": 2.2374087017338513,
"learning_rate": 2.992648109950935e-07,
"loss": 0.4598,
"step": 1976
},
{
"epoch": 0.7551566080977846,
"grad_norm": 6.827094524701705,
"learning_rate": 2.9838260653262584e-07,
"loss": 0.5682,
"step": 1977
},
{
"epoch": 0.7555385790679908,
"grad_norm": 4.82841612141328,
"learning_rate": 2.975014762441558e-07,
"loss": 0.5364,
"step": 1978
},
{
"epoch": 0.7559205500381971,
"grad_norm": 4.100694903188989,
"learning_rate": 2.966214214786903e-07,
"loss": 0.5031,
"step": 1979
},
{
"epoch": 0.7563025210084033,
"grad_norm": 5.721139123760344,
"learning_rate": 2.9574244358359066e-07,
"loss": 0.4956,
"step": 1980
},
{
"epoch": 0.7566844919786097,
"grad_norm": 3.0348483174093523,
"learning_rate": 2.9486454390456983e-07,
"loss": 0.458,
"step": 1981
},
{
"epoch": 0.7570664629488159,
"grad_norm": 2.071255659160732,
"learning_rate": 2.939877237856886e-07,
"loss": 0.4417,
"step": 1982
},
{
"epoch": 0.7574484339190222,
"grad_norm": 2.5377244096701643,
"learning_rate": 2.931119845693565e-07,
"loss": 0.4835,
"step": 1983
},
{
"epoch": 0.7578304048892284,
"grad_norm": 2.491320364019066,
"learning_rate": 2.9223732759632667e-07,
"loss": 0.5483,
"step": 1984
},
{
"epoch": 0.7582123758594347,
"grad_norm": 3.524489564802955,
"learning_rate": 2.913637542056967e-07,
"loss": 0.4658,
"step": 1985
},
{
"epoch": 0.758594346829641,
"grad_norm": 5.875291912515906,
"learning_rate": 2.904912657349049e-07,
"loss": 0.5105,
"step": 1986
},
{
"epoch": 0.7589763177998472,
"grad_norm": 5.744010687005276,
"learning_rate": 2.8961986351972767e-07,
"loss": 0.3959,
"step": 1987
},
{
"epoch": 0.7593582887700535,
"grad_norm": 3.1503978340744427,
"learning_rate": 2.887495488942796e-07,
"loss": 0.49,
"step": 1988
},
{
"epoch": 0.7597402597402597,
"grad_norm": 4.9894448806439495,
"learning_rate": 2.8788032319100917e-07,
"loss": 0.4775,
"step": 1989
},
{
"epoch": 0.760122230710466,
"grad_norm": 2.886379728803835,
"learning_rate": 2.8701218774069836e-07,
"loss": 0.5524,
"step": 1990
},
{
"epoch": 0.7605042016806722,
"grad_norm": 2.894689146151311,
"learning_rate": 2.8614514387246015e-07,
"loss": 0.4567,
"step": 1991
},
{
"epoch": 0.7608861726508785,
"grad_norm": 4.866779262098827,
"learning_rate": 2.8527919291373526e-07,
"loss": 0.5063,
"step": 1992
},
{
"epoch": 0.7612681436210847,
"grad_norm": 3.4928860508117783,
"learning_rate": 2.844143361902924e-07,
"loss": 0.4714,
"step": 1993
},
{
"epoch": 0.7616501145912911,
"grad_norm": 4.847558397239765,
"learning_rate": 2.8355057502622413e-07,
"loss": 0.4903,
"step": 1994
},
{
"epoch": 0.7620320855614974,
"grad_norm": 1.8542567356578314,
"learning_rate": 2.826879107439464e-07,
"loss": 0.4106,
"step": 1995
},
{
"epoch": 0.7624140565317036,
"grad_norm": 3.959019907877066,
"learning_rate": 2.8182634466419485e-07,
"loss": 0.4876,
"step": 1996
},
{
"epoch": 0.7627960275019099,
"grad_norm": 3.897130491646812,
"learning_rate": 2.809658781060249e-07,
"loss": 0.4747,
"step": 1997
},
{
"epoch": 0.7631779984721161,
"grad_norm": 4.19546163163991,
"learning_rate": 2.801065123868083e-07,
"loss": 0.5102,
"step": 1998
},
{
"epoch": 0.7635599694423224,
"grad_norm": 13.123256650379632,
"learning_rate": 2.792482488222306e-07,
"loss": 0.5024,
"step": 1999
},
{
"epoch": 0.7639419404125286,
"grad_norm": 8.484215470641198,
"learning_rate": 2.78391088726291e-07,
"loss": 0.4743,
"step": 2000
},
{
"epoch": 0.7643239113827349,
"grad_norm": 2.5750063915205663,
"learning_rate": 2.7753503341129915e-07,
"loss": 0.4796,
"step": 2001
},
{
"epoch": 0.7647058823529411,
"grad_norm": 3.2179233263308844,
"learning_rate": 2.766800841878723e-07,
"loss": 0.5152,
"step": 2002
},
{
"epoch": 0.7650878533231474,
"grad_norm": 2.818601799258001,
"learning_rate": 2.7582624236493533e-07,
"loss": 0.467,
"step": 2003
},
{
"epoch": 0.7654698242933538,
"grad_norm": 3.4044326671523257,
"learning_rate": 2.7497350924971786e-07,
"loss": 0.4662,
"step": 2004
},
{
"epoch": 0.76585179526356,
"grad_norm": 17.145403912805843,
"learning_rate": 2.741218861477507e-07,
"loss": 0.4662,
"step": 2005
},
{
"epoch": 0.7662337662337663,
"grad_norm": 3.3533992472377405,
"learning_rate": 2.7327137436286687e-07,
"loss": 0.549,
"step": 2006
},
{
"epoch": 0.7666157372039725,
"grad_norm": 20.158085491971196,
"learning_rate": 2.7242197519719654e-07,
"loss": 0.5545,
"step": 2007
},
{
"epoch": 0.7669977081741788,
"grad_norm": 3.1064468642848824,
"learning_rate": 2.7157368995116737e-07,
"loss": 0.4057,
"step": 2008
},
{
"epoch": 0.767379679144385,
"grad_norm": 4.373884424352218,
"learning_rate": 2.7072651992350193e-07,
"loss": 0.5852,
"step": 2009
},
{
"epoch": 0.7677616501145913,
"grad_norm": 4.214662447583189,
"learning_rate": 2.698804664112139e-07,
"loss": 0.5052,
"step": 2010
},
{
"epoch": 0.7681436210847975,
"grad_norm": 2.3640521615152594,
"learning_rate": 2.6903553070960907e-07,
"loss": 0.4282,
"step": 2011
},
{
"epoch": 0.7685255920550038,
"grad_norm": 2.821646391524377,
"learning_rate": 2.6819171411228146e-07,
"loss": 0.5568,
"step": 2012
},
{
"epoch": 0.7689075630252101,
"grad_norm": 2.8535141415958574,
"learning_rate": 2.673490179111111e-07,
"loss": 0.4406,
"step": 2013
},
{
"epoch": 0.7692895339954163,
"grad_norm": 8.09229993906319,
"learning_rate": 2.665074433962634e-07,
"loss": 0.4413,
"step": 2014
},
{
"epoch": 0.7696715049656226,
"grad_norm": 2.8451998217945587,
"learning_rate": 2.656669918561866e-07,
"loss": 0.5313,
"step": 2015
},
{
"epoch": 0.7700534759358288,
"grad_norm": 5.017701230366891,
"learning_rate": 2.6482766457760883e-07,
"loss": 0.4855,
"step": 2016
},
{
"epoch": 0.7704354469060352,
"grad_norm": 3.0709257455045815,
"learning_rate": 2.639894628455379e-07,
"loss": 0.5046,
"step": 2017
},
{
"epoch": 0.7708174178762414,
"grad_norm": 2.9626905006442636,
"learning_rate": 2.631523879432576e-07,
"loss": 0.5317,
"step": 2018
},
{
"epoch": 0.7711993888464477,
"grad_norm": 7.163042586264671,
"learning_rate": 2.62316441152327e-07,
"loss": 0.5906,
"step": 2019
},
{
"epoch": 0.771581359816654,
"grad_norm": 5.0311784903125,
"learning_rate": 2.6148162375257855e-07,
"loss": 0.5227,
"step": 2020
},
{
"epoch": 0.7719633307868602,
"grad_norm": 4.102688596161408,
"learning_rate": 2.606479370221142e-07,
"loss": 0.4829,
"step": 2021
},
{
"epoch": 0.7723453017570665,
"grad_norm": 3.418560369710601,
"learning_rate": 2.5981538223730616e-07,
"loss": 0.4852,
"step": 2022
},
{
"epoch": 0.7727272727272727,
"grad_norm": 7.237199469607544,
"learning_rate": 2.5898396067279327e-07,
"loss": 0.4842,
"step": 2023
},
{
"epoch": 0.773109243697479,
"grad_norm": 5.8815534297234215,
"learning_rate": 2.581536736014789e-07,
"loss": 0.4698,
"step": 2024
},
{
"epoch": 0.7734912146676852,
"grad_norm": 5.286618171866562,
"learning_rate": 2.5732452229453005e-07,
"loss": 0.5295,
"step": 2025
},
{
"epoch": 0.7738731856378915,
"grad_norm": 2.64018006585823,
"learning_rate": 2.5649650802137513e-07,
"loss": 0.5107,
"step": 2026
},
{
"epoch": 0.7742551566080977,
"grad_norm": 5.39635291634368,
"learning_rate": 2.556696320497007e-07,
"loss": 0.5189,
"step": 2027
},
{
"epoch": 0.774637127578304,
"grad_norm": 4.327051391697355,
"learning_rate": 2.5484389564545194e-07,
"loss": 0.4771,
"step": 2028
},
{
"epoch": 0.7750190985485104,
"grad_norm": 3.6198992330991437,
"learning_rate": 2.54019300072828e-07,
"loss": 0.4664,
"step": 2029
},
{
"epoch": 0.7754010695187166,
"grad_norm": 3.4612319276877876,
"learning_rate": 2.5319584659428227e-07,
"loss": 0.4549,
"step": 2030
},
{
"epoch": 0.7757830404889229,
"grad_norm": 9.185387121029617,
"learning_rate": 2.5237353647051986e-07,
"loss": 0.4215,
"step": 2031
},
{
"epoch": 0.7761650114591291,
"grad_norm": 3.0793205141834337,
"learning_rate": 2.515523709604943e-07,
"loss": 0.4781,
"step": 2032
},
{
"epoch": 0.7765469824293354,
"grad_norm": 2.415691831688847,
"learning_rate": 2.507323513214077e-07,
"loss": 0.4111,
"step": 2033
},
{
"epoch": 0.7769289533995416,
"grad_norm": 2.9556289683823596,
"learning_rate": 2.4991347880870773e-07,
"loss": 0.4735,
"step": 2034
},
{
"epoch": 0.7773109243697479,
"grad_norm": 5.721939413184182,
"learning_rate": 2.490957546760851e-07,
"loss": 0.5053,
"step": 2035
},
{
"epoch": 0.7776928953399541,
"grad_norm": 2.8081377018686142,
"learning_rate": 2.4827918017547345e-07,
"loss": 0.4053,
"step": 2036
},
{
"epoch": 0.7780748663101604,
"grad_norm": 3.827787653477836,
"learning_rate": 2.474637565570451e-07,
"loss": 0.5339,
"step": 2037
},
{
"epoch": 0.7784568372803667,
"grad_norm": 2.4848945346733946,
"learning_rate": 2.4664948506921126e-07,
"loss": 0.48,
"step": 2038
},
{
"epoch": 0.778838808250573,
"grad_norm": 2.9962827306471205,
"learning_rate": 2.458363669586194e-07,
"loss": 0.4947,
"step": 2039
},
{
"epoch": 0.7792207792207793,
"grad_norm": 2.347916499410763,
"learning_rate": 2.450244034701501e-07,
"loss": 0.4814,
"step": 2040
},
{
"epoch": 0.7796027501909855,
"grad_norm": 2.5839724639312465,
"learning_rate": 2.442135958469171e-07,
"loss": 0.4388,
"step": 2041
},
{
"epoch": 0.7799847211611918,
"grad_norm": 6.96172510132702,
"learning_rate": 2.4340394533026486e-07,
"loss": 0.5038,
"step": 2042
},
{
"epoch": 0.780366692131398,
"grad_norm": 2.1074636866882495,
"learning_rate": 2.425954531597649e-07,
"loss": 0.4702,
"step": 2043
},
{
"epoch": 0.7807486631016043,
"grad_norm": 3.6538983777379612,
"learning_rate": 2.4178812057321653e-07,
"loss": 0.4599,
"step": 2044
},
{
"epoch": 0.7811306340718105,
"grad_norm": 2.7110070952816017,
"learning_rate": 2.4098194880664356e-07,
"loss": 0.4598,
"step": 2045
},
{
"epoch": 0.7815126050420168,
"grad_norm": 4.346035006886647,
"learning_rate": 2.4017693909429206e-07,
"loss": 0.5548,
"step": 2046
},
{
"epoch": 0.7818945760122231,
"grad_norm": 3.963055626489219,
"learning_rate": 2.393730926686297e-07,
"loss": 0.5198,
"step": 2047
},
{
"epoch": 0.7822765469824293,
"grad_norm": 8.215642411040255,
"learning_rate": 2.3857041076034236e-07,
"loss": 0.478,
"step": 2048
},
{
"epoch": 0.7826585179526356,
"grad_norm": 6.317693663913666,
"learning_rate": 2.3776889459833373e-07,
"loss": 0.4781,
"step": 2049
},
{
"epoch": 0.7830404889228418,
"grad_norm": 11.445087980882196,
"learning_rate": 2.3696854540972267e-07,
"loss": 0.4477,
"step": 2050
},
{
"epoch": 0.7834224598930482,
"grad_norm": 2.485869771208182,
"learning_rate": 2.361693644198408e-07,
"loss": 0.4807,
"step": 2051
},
{
"epoch": 0.7838044308632544,
"grad_norm": 3.104809165870729,
"learning_rate": 2.3537135285223199e-07,
"loss": 0.5194,
"step": 2052
},
{
"epoch": 0.7841864018334607,
"grad_norm": 6.563255132353687,
"learning_rate": 2.3457451192864962e-07,
"loss": 0.3995,
"step": 2053
},
{
"epoch": 0.7845683728036669,
"grad_norm": 4.457138302228237,
"learning_rate": 2.3377884286905414e-07,
"loss": 0.4693,
"step": 2054
},
{
"epoch": 0.7849503437738732,
"grad_norm": 4.312405424580879,
"learning_rate": 2.329843468916124e-07,
"loss": 0.4626,
"step": 2055
},
{
"epoch": 0.7853323147440795,
"grad_norm": 2.4779569058573796,
"learning_rate": 2.3219102521269575e-07,
"loss": 0.4662,
"step": 2056
},
{
"epoch": 0.7857142857142857,
"grad_norm": 3.203547257496715,
"learning_rate": 2.313988790468765e-07,
"loss": 0.5714,
"step": 2057
},
{
"epoch": 0.786096256684492,
"grad_norm": 3.6702308504670587,
"learning_rate": 2.3060790960692845e-07,
"loss": 0.4362,
"step": 2058
},
{
"epoch": 0.7864782276546982,
"grad_norm": 2.528143424070905,
"learning_rate": 2.2981811810382269e-07,
"loss": 0.4191,
"step": 2059
},
{
"epoch": 0.7868601986249045,
"grad_norm": 8.621735007155152,
"learning_rate": 2.2902950574672798e-07,
"loss": 0.4654,
"step": 2060
},
{
"epoch": 0.7872421695951107,
"grad_norm": 4.121940405291025,
"learning_rate": 2.282420737430073e-07,
"loss": 0.566,
"step": 2061
},
{
"epoch": 0.787624140565317,
"grad_norm": 2.8941965091486344,
"learning_rate": 2.2745582329821623e-07,
"loss": 0.5333,
"step": 2062
},
{
"epoch": 0.7880061115355232,
"grad_norm": 8.905160944846878,
"learning_rate": 2.2667075561610195e-07,
"loss": 0.4494,
"step": 2063
},
{
"epoch": 0.7883880825057296,
"grad_norm": 3.245421182359227,
"learning_rate": 2.258868718986008e-07,
"loss": 0.4727,
"step": 2064
},
{
"epoch": 0.7887700534759359,
"grad_norm": 3.994712407307103,
"learning_rate": 2.2510417334583566e-07,
"loss": 0.5717,
"step": 2065
},
{
"epoch": 0.7891520244461421,
"grad_norm": 3.1308390080703536,
"learning_rate": 2.2432266115611588e-07,
"loss": 0.5019,
"step": 2066
},
{
"epoch": 0.7895339954163484,
"grad_norm": 9.174614866376384,
"learning_rate": 2.2354233652593436e-07,
"loss": 0.4767,
"step": 2067
},
{
"epoch": 0.7899159663865546,
"grad_norm": 3.332817881843333,
"learning_rate": 2.2276320064996513e-07,
"loss": 0.5531,
"step": 2068
},
{
"epoch": 0.7902979373567609,
"grad_norm": 3.1221313508461694,
"learning_rate": 2.2198525472106322e-07,
"loss": 0.4997,
"step": 2069
},
{
"epoch": 0.7906799083269671,
"grad_norm": 6.25659098508917,
"learning_rate": 2.212084999302609e-07,
"loss": 0.5339,
"step": 2070
},
{
"epoch": 0.7910618792971734,
"grad_norm": 2.378343334180352,
"learning_rate": 2.204329374667675e-07,
"loss": 0.4792,
"step": 2071
},
{
"epoch": 0.7914438502673797,
"grad_norm": 5.407938710117125,
"learning_rate": 2.1965856851796704e-07,
"loss": 0.4508,
"step": 2072
},
{
"epoch": 0.7918258212375859,
"grad_norm": 8.650676035914138,
"learning_rate": 2.1888539426941534e-07,
"loss": 0.5357,
"step": 2073
},
{
"epoch": 0.7922077922077922,
"grad_norm": 3.320964515680207,
"learning_rate": 2.181134159048399e-07,
"loss": 0.54,
"step": 2074
},
{
"epoch": 0.7925897631779985,
"grad_norm": 4.3796081323275855,
"learning_rate": 2.1734263460613745e-07,
"loss": 0.4396,
"step": 2075
},
{
"epoch": 0.7929717341482048,
"grad_norm": 4.6250797467380185,
"learning_rate": 2.1657305155337114e-07,
"loss": 0.515,
"step": 2076
},
{
"epoch": 0.793353705118411,
"grad_norm": 3.2669468591765125,
"learning_rate": 2.158046679247706e-07,
"loss": 0.4935,
"step": 2077
},
{
"epoch": 0.7937356760886173,
"grad_norm": 3.32173143735186,
"learning_rate": 2.150374848967288e-07,
"loss": 0.3964,
"step": 2078
},
{
"epoch": 0.7941176470588235,
"grad_norm": 4.002395870563238,
"learning_rate": 2.142715036438001e-07,
"loss": 0.4846,
"step": 2079
},
{
"epoch": 0.7944996180290298,
"grad_norm": 3.262119711105562,
"learning_rate": 2.1350672533869985e-07,
"loss": 0.4709,
"step": 2080
},
{
"epoch": 0.7948815889992361,
"grad_norm": 3.897308700584841,
"learning_rate": 2.1274315115230069e-07,
"loss": 0.5574,
"step": 2081
},
{
"epoch": 0.7952635599694423,
"grad_norm": 2.788895366333766,
"learning_rate": 2.1198078225363248e-07,
"loss": 0.4567,
"step": 2082
},
{
"epoch": 0.7956455309396486,
"grad_norm": 4.428889893647171,
"learning_rate": 2.1121961980987991e-07,
"loss": 0.4818,
"step": 2083
},
{
"epoch": 0.7960275019098548,
"grad_norm": 3.4304173580488047,
"learning_rate": 2.1045966498637968e-07,
"loss": 0.4832,
"step": 2084
},
{
"epoch": 0.7964094728800611,
"grad_norm": 2.7090643157430634,
"learning_rate": 2.0970091894662046e-07,
"loss": 0.4742,
"step": 2085
},
{
"epoch": 0.7967914438502673,
"grad_norm": 3.498421884744891,
"learning_rate": 2.0894338285224032e-07,
"loss": 0.4831,
"step": 2086
},
{
"epoch": 0.7971734148204737,
"grad_norm": 2.554999201557372,
"learning_rate": 2.0818705786302414e-07,
"loss": 0.4128,
"step": 2087
},
{
"epoch": 0.7975553857906799,
"grad_norm": 5.764550803661119,
"learning_rate": 2.0743194513690354e-07,
"loss": 0.4518,
"step": 2088
},
{
"epoch": 0.7979373567608862,
"grad_norm": 4.392621968809952,
"learning_rate": 2.066780458299532e-07,
"loss": 0.5366,
"step": 2089
},
{
"epoch": 0.7983193277310925,
"grad_norm": 27.23762782223962,
"learning_rate": 2.059253610963908e-07,
"loss": 0.5096,
"step": 2090
},
{
"epoch": 0.7987012987012987,
"grad_norm": 4.917678268346897,
"learning_rate": 2.051738920885745e-07,
"loss": 0.4983,
"step": 2091
},
{
"epoch": 0.799083269671505,
"grad_norm": 2.4843286909212985,
"learning_rate": 2.0442363995700053e-07,
"loss": 0.4707,
"step": 2092
},
{
"epoch": 0.7994652406417112,
"grad_norm": 3.1580430450112864,
"learning_rate": 2.0367460585030294e-07,
"loss": 0.3834,
"step": 2093
},
{
"epoch": 0.7998472116119175,
"grad_norm": 4.74167956034697,
"learning_rate": 2.0292679091525e-07,
"loss": 0.5594,
"step": 2094
},
{
"epoch": 0.8002291825821237,
"grad_norm": 5.6498499573703365,
"learning_rate": 2.0218019629674444e-07,
"loss": 0.5196,
"step": 2095
},
{
"epoch": 0.80061115355233,
"grad_norm": 4.527159164195184,
"learning_rate": 2.0143482313782046e-07,
"loss": 0.5576,
"step": 2096
},
{
"epoch": 0.8009931245225362,
"grad_norm": 5.130664753102986,
"learning_rate": 2.0069067257964133e-07,
"loss": 0.5389,
"step": 2097
},
{
"epoch": 0.8013750954927426,
"grad_norm": 2.3508514319389784,
"learning_rate": 1.9994774576149986e-07,
"loss": 0.4141,
"step": 2098
},
{
"epoch": 0.8017570664629489,
"grad_norm": 4.036901095606672,
"learning_rate": 1.9920604382081396e-07,
"loss": 0.4771,
"step": 2099
},
{
"epoch": 0.8021390374331551,
"grad_norm": 3.8505341704800937,
"learning_rate": 1.984655678931274e-07,
"loss": 0.5119,
"step": 2100
},
{
"epoch": 0.8025210084033614,
"grad_norm": 4.315212828257233,
"learning_rate": 1.9772631911210658e-07,
"loss": 0.5262,
"step": 2101
},
{
"epoch": 0.8029029793735676,
"grad_norm": 4.179122202495466,
"learning_rate": 1.9698829860953869e-07,
"loss": 0.4772,
"step": 2102
},
{
"epoch": 0.8032849503437739,
"grad_norm": 3.510354812278512,
"learning_rate": 1.9625150751533105e-07,
"loss": 0.4846,
"step": 2103
},
{
"epoch": 0.8036669213139801,
"grad_norm": 5.192771915402893,
"learning_rate": 1.9551594695750807e-07,
"loss": 0.5724,
"step": 2104
},
{
"epoch": 0.8040488922841864,
"grad_norm": 2.8914671355912898,
"learning_rate": 1.9478161806221094e-07,
"loss": 0.5198,
"step": 2105
},
{
"epoch": 0.8044308632543926,
"grad_norm": 3.013881239807992,
"learning_rate": 1.9404852195369515e-07,
"loss": 0.4438,
"step": 2106
},
{
"epoch": 0.8048128342245989,
"grad_norm": 3.4213101296477197,
"learning_rate": 1.9331665975432775e-07,
"loss": 0.4714,
"step": 2107
},
{
"epoch": 0.8051948051948052,
"grad_norm": 2.855202248672021,
"learning_rate": 1.9258603258458827e-07,
"loss": 0.4218,
"step": 2108
},
{
"epoch": 0.8055767761650114,
"grad_norm": 4.939004677299009,
"learning_rate": 1.918566415630638e-07,
"loss": 0.4966,
"step": 2109
},
{
"epoch": 0.8059587471352178,
"grad_norm": 4.876125661218717,
"learning_rate": 1.9112848780645018e-07,
"loss": 0.5304,
"step": 2110
},
{
"epoch": 0.806340718105424,
"grad_norm": 5.349902870683962,
"learning_rate": 1.9040157242954856e-07,
"loss": 0.4977,
"step": 2111
},
{
"epoch": 0.8067226890756303,
"grad_norm": 6.427472086996999,
"learning_rate": 1.8967589654526362e-07,
"loss": 0.5665,
"step": 2112
},
{
"epoch": 0.8071046600458365,
"grad_norm": 3.9644914565815865,
"learning_rate": 1.8895146126460337e-07,
"loss": 0.5475,
"step": 2113
},
{
"epoch": 0.8074866310160428,
"grad_norm": 2.6214119049961675,
"learning_rate": 1.8822826769667533e-07,
"loss": 0.4595,
"step": 2114
},
{
"epoch": 0.807868601986249,
"grad_norm": 3.4275301139405765,
"learning_rate": 1.875063169486869e-07,
"loss": 0.4387,
"step": 2115
},
{
"epoch": 0.8082505729564553,
"grad_norm": 2.957960275521017,
"learning_rate": 1.8678561012594253e-07,
"loss": 0.5228,
"step": 2116
},
{
"epoch": 0.8086325439266616,
"grad_norm": 4.5705638000310715,
"learning_rate": 1.8606614833184165e-07,
"loss": 0.5427,
"step": 2117
},
{
"epoch": 0.8090145148968678,
"grad_norm": 2.8072984602726283,
"learning_rate": 1.853479326678783e-07,
"loss": 0.4979,
"step": 2118
},
{
"epoch": 0.8093964858670741,
"grad_norm": 3.268001934700151,
"learning_rate": 1.8463096423363843e-07,
"loss": 0.5171,
"step": 2119
},
{
"epoch": 0.8097784568372803,
"grad_norm": 2.32727482347449,
"learning_rate": 1.8391524412679805e-07,
"loss": 0.4387,
"step": 2120
},
{
"epoch": 0.8101604278074866,
"grad_norm": 2.3928853103257683,
"learning_rate": 1.832007734431229e-07,
"loss": 0.3816,
"step": 2121
},
{
"epoch": 0.8105423987776929,
"grad_norm": 3.522836164648356,
"learning_rate": 1.824875532764647e-07,
"loss": 0.4504,
"step": 2122
},
{
"epoch": 0.8109243697478992,
"grad_norm": 3.8543287249884086,
"learning_rate": 1.8177558471876164e-07,
"loss": 0.5192,
"step": 2123
},
{
"epoch": 0.8113063407181055,
"grad_norm": 2.82058060067478,
"learning_rate": 1.8106486886003547e-07,
"loss": 0.4373,
"step": 2124
},
{
"epoch": 0.8116883116883117,
"grad_norm": 5.33102966199271,
"learning_rate": 1.8035540678838946e-07,
"loss": 0.4829,
"step": 2125
},
{
"epoch": 0.812070282658518,
"grad_norm": 11.516447534177683,
"learning_rate": 1.7964719959000808e-07,
"loss": 0.5049,
"step": 2126
},
{
"epoch": 0.8124522536287242,
"grad_norm": 4.790046217132607,
"learning_rate": 1.7894024834915443e-07,
"loss": 0.4592,
"step": 2127
},
{
"epoch": 0.8128342245989305,
"grad_norm": 7.356390091789118,
"learning_rate": 1.7823455414816812e-07,
"loss": 0.4138,
"step": 2128
},
{
"epoch": 0.8132161955691367,
"grad_norm": 2.2861859979657995,
"learning_rate": 1.775301180674651e-07,
"loss": 0.4342,
"step": 2129
},
{
"epoch": 0.813598166539343,
"grad_norm": 7.537853643424543,
"learning_rate": 1.76826941185535e-07,
"loss": 0.5356,
"step": 2130
},
{
"epoch": 0.8139801375095492,
"grad_norm": 3.444331655734697,
"learning_rate": 1.7612502457893874e-07,
"loss": 0.4874,
"step": 2131
},
{
"epoch": 0.8143621084797555,
"grad_norm": 2.696003558115096,
"learning_rate": 1.7542436932230897e-07,
"loss": 0.4492,
"step": 2132
},
{
"epoch": 0.8147440794499619,
"grad_norm": 2.7325781405086365,
"learning_rate": 1.7472497648834627e-07,
"loss": 0.4839,
"step": 2133
},
{
"epoch": 0.8151260504201681,
"grad_norm": 2.4953850542257876,
"learning_rate": 1.74026847147819e-07,
"loss": 0.4183,
"step": 2134
},
{
"epoch": 0.8155080213903744,
"grad_norm": 2.698104289850833,
"learning_rate": 1.733299823695612e-07,
"loss": 0.4834,
"step": 2135
},
{
"epoch": 0.8158899923605806,
"grad_norm": 5.766516311048943,
"learning_rate": 1.726343832204702e-07,
"loss": 0.5258,
"step": 2136
},
{
"epoch": 0.8162719633307869,
"grad_norm": 4.263091493052819,
"learning_rate": 1.7194005076550633e-07,
"loss": 0.4654,
"step": 2137
},
{
"epoch": 0.8166539343009931,
"grad_norm": 3.6493299151046643,
"learning_rate": 1.712469860676905e-07,
"loss": 0.5148,
"step": 2138
},
{
"epoch": 0.8170359052711994,
"grad_norm": 3.118503435110606,
"learning_rate": 1.7055519018810215e-07,
"loss": 0.5313,
"step": 2139
},
{
"epoch": 0.8174178762414056,
"grad_norm": 2.856259832257614,
"learning_rate": 1.6986466418587875e-07,
"loss": 0.4324,
"step": 2140
},
{
"epoch": 0.8177998472116119,
"grad_norm": 2.6322054967660637,
"learning_rate": 1.6917540911821383e-07,
"loss": 0.4694,
"step": 2141
},
{
"epoch": 0.8181818181818182,
"grad_norm": 4.1708143673350095,
"learning_rate": 1.6848742604035405e-07,
"loss": 0.5139,
"step": 2142
},
{
"epoch": 0.8185637891520244,
"grad_norm": 2.6858118054559075,
"learning_rate": 1.6780071600559985e-07,
"loss": 0.4807,
"step": 2143
},
{
"epoch": 0.8189457601222307,
"grad_norm": 3.452723265801165,
"learning_rate": 1.6711528006530162e-07,
"loss": 0.485,
"step": 2144
},
{
"epoch": 0.819327731092437,
"grad_norm": 3.348365409723057,
"learning_rate": 1.6643111926885988e-07,
"loss": 0.5307,
"step": 2145
},
{
"epoch": 0.8197097020626433,
"grad_norm": 4.175205747051993,
"learning_rate": 1.657482346637229e-07,
"loss": 0.4897,
"step": 2146
},
{
"epoch": 0.8200916730328495,
"grad_norm": 2.627351449164976,
"learning_rate": 1.6506662729538424e-07,
"loss": 0.4675,
"step": 2147
},
{
"epoch": 0.8204736440030558,
"grad_norm": 4.364209282487767,
"learning_rate": 1.643862982073828e-07,
"loss": 0.4748,
"step": 2148
},
{
"epoch": 0.820855614973262,
"grad_norm": 3.8648444535189705,
"learning_rate": 1.637072484413008e-07,
"loss": 0.4898,
"step": 2149
},
{
"epoch": 0.8212375859434683,
"grad_norm": 2.4741715164345157,
"learning_rate": 1.6302947903676045e-07,
"loss": 0.5016,
"step": 2150
},
{
"epoch": 0.8216195569136746,
"grad_norm": 8.096634033181392,
"learning_rate": 1.6235299103142507e-07,
"loss": 0.53,
"step": 2151
},
{
"epoch": 0.8220015278838808,
"grad_norm": 10.094345497672295,
"learning_rate": 1.6167778546099563e-07,
"loss": 0.5067,
"step": 2152
},
{
"epoch": 0.8223834988540871,
"grad_norm": 3.0033457948105493,
"learning_rate": 1.6100386335920945e-07,
"loss": 0.4624,
"step": 2153
},
{
"epoch": 0.8227654698242933,
"grad_norm": 3.6237989363735474,
"learning_rate": 1.6033122575783943e-07,
"loss": 0.4402,
"step": 2154
},
{
"epoch": 0.8231474407944996,
"grad_norm": 27.114977190810855,
"learning_rate": 1.596598736866912e-07,
"loss": 0.5047,
"step": 2155
},
{
"epoch": 0.8235294117647058,
"grad_norm": 8.097836516523152,
"learning_rate": 1.5898980817360296e-07,
"loss": 0.5523,
"step": 2156
},
{
"epoch": 0.8239113827349122,
"grad_norm": 6.138753598509105,
"learning_rate": 1.58321030244443e-07,
"loss": 0.5264,
"step": 2157
},
{
"epoch": 0.8242933537051184,
"grad_norm": 5.54119380250345,
"learning_rate": 1.5765354092310767e-07,
"loss": 0.5002,
"step": 2158
},
{
"epoch": 0.8246753246753247,
"grad_norm": 12.383001202449798,
"learning_rate": 1.5698734123152147e-07,
"loss": 0.4764,
"step": 2159
},
{
"epoch": 0.825057295645531,
"grad_norm": 6.763350972657245,
"learning_rate": 1.5632243218963405e-07,
"loss": 0.5253,
"step": 2160
},
{
"epoch": 0.8254392666157372,
"grad_norm": 7.899539027451731,
"learning_rate": 1.5565881481541855e-07,
"loss": 0.5055,
"step": 2161
},
{
"epoch": 0.8258212375859435,
"grad_norm": 5.708250468763933,
"learning_rate": 1.5499649012487158e-07,
"loss": 0.5095,
"step": 2162
},
{
"epoch": 0.8262032085561497,
"grad_norm": 12.871641057671368,
"learning_rate": 1.5433545913200975e-07,
"loss": 0.4852,
"step": 2163
},
{
"epoch": 0.826585179526356,
"grad_norm": 3.844853650524388,
"learning_rate": 1.5367572284886966e-07,
"loss": 0.4783,
"step": 2164
},
{
"epoch": 0.8269671504965622,
"grad_norm": 3.5393461249782376,
"learning_rate": 1.5301728228550547e-07,
"loss": 0.4546,
"step": 2165
},
{
"epoch": 0.8273491214667685,
"grad_norm": 3.9628215025790277,
"learning_rate": 1.523601384499873e-07,
"loss": 0.4962,
"step": 2166
},
{
"epoch": 0.8277310924369747,
"grad_norm": 3.320017741891724,
"learning_rate": 1.5170429234840042e-07,
"loss": 0.5417,
"step": 2167
},
{
"epoch": 0.828113063407181,
"grad_norm": 16.613667129121023,
"learning_rate": 1.5104974498484345e-07,
"loss": 0.5198,
"step": 2168
},
{
"epoch": 0.8284950343773874,
"grad_norm": 2.97697112468617,
"learning_rate": 1.5039649736142578e-07,
"loss": 0.54,
"step": 2169
},
{
"epoch": 0.8288770053475936,
"grad_norm": 3.968470149842816,
"learning_rate": 1.4974455047826784e-07,
"loss": 0.4708,
"step": 2170
},
{
"epoch": 0.8292589763177999,
"grad_norm": 13.993826599883949,
"learning_rate": 1.490939053334982e-07,
"loss": 0.5484,
"step": 2171
},
{
"epoch": 0.8296409472880061,
"grad_norm": 4.0595859874577975,
"learning_rate": 1.4844456292325212e-07,
"loss": 0.5645,
"step": 2172
},
{
"epoch": 0.8300229182582124,
"grad_norm": 2.9777732873389056,
"learning_rate": 1.4779652424167131e-07,
"loss": 0.5123,
"step": 2173
},
{
"epoch": 0.8304048892284186,
"grad_norm": 5.623326166041766,
"learning_rate": 1.4714979028090058e-07,
"loss": 0.4902,
"step": 2174
},
{
"epoch": 0.8307868601986249,
"grad_norm": 3.638460408651059,
"learning_rate": 1.465043620310875e-07,
"loss": 0.5674,
"step": 2175
},
{
"epoch": 0.8311688311688312,
"grad_norm": 16.00436964062797,
"learning_rate": 1.4586024048038103e-07,
"loss": 0.6007,
"step": 2176
},
{
"epoch": 0.8315508021390374,
"grad_norm": 9.547989664149393,
"learning_rate": 1.4521742661492885e-07,
"loss": 0.4768,
"step": 2177
},
{
"epoch": 0.8319327731092437,
"grad_norm": 3.880657659745083,
"learning_rate": 1.4457592141887708e-07,
"loss": 0.5351,
"step": 2178
},
{
"epoch": 0.8323147440794499,
"grad_norm": 2.504159388877166,
"learning_rate": 1.4393572587436843e-07,
"loss": 0.4805,
"step": 2179
},
{
"epoch": 0.8326967150496563,
"grad_norm": 2.413992644051076,
"learning_rate": 1.4329684096153972e-07,
"loss": 0.4288,
"step": 2180
},
{
"epoch": 0.8330786860198625,
"grad_norm": 2.805884193807672,
"learning_rate": 1.4265926765852187e-07,
"loss": 0.4767,
"step": 2181
},
{
"epoch": 0.8334606569900688,
"grad_norm": 2.6599816498305864,
"learning_rate": 1.42023006941438e-07,
"loss": 0.4093,
"step": 2182
},
{
"epoch": 0.833842627960275,
"grad_norm": 2.268768752495317,
"learning_rate": 1.413880597844007e-07,
"loss": 0.426,
"step": 2183
},
{
"epoch": 0.8342245989304813,
"grad_norm": 3.1786372741174933,
"learning_rate": 1.4075442715951246e-07,
"loss": 0.4236,
"step": 2184
},
{
"epoch": 0.8346065699006876,
"grad_norm": 4.764909929435903,
"learning_rate": 1.4012211003686236e-07,
"loss": 0.5414,
"step": 2185
},
{
"epoch": 0.8349885408708938,
"grad_norm": 10.893544216950978,
"learning_rate": 1.3949110938452613e-07,
"loss": 0.5049,
"step": 2186
},
{
"epoch": 0.8353705118411001,
"grad_norm": 7.152163635201567,
"learning_rate": 1.388614261685641e-07,
"loss": 0.4803,
"step": 2187
},
{
"epoch": 0.8357524828113063,
"grad_norm": 3.5881330377029164,
"learning_rate": 1.3823306135301872e-07,
"loss": 0.5535,
"step": 2188
},
{
"epoch": 0.8361344537815126,
"grad_norm": 2.51608097849988,
"learning_rate": 1.3760601589991472e-07,
"loss": 0.4877,
"step": 2189
},
{
"epoch": 0.8365164247517188,
"grad_norm": 6.759430257728035,
"learning_rate": 1.36980290769257e-07,
"loss": 0.5013,
"step": 2190
},
{
"epoch": 0.8368983957219251,
"grad_norm": 5.091071275038226,
"learning_rate": 1.3635588691902822e-07,
"loss": 0.4787,
"step": 2191
},
{
"epoch": 0.8372803666921314,
"grad_norm": 2.7856178151388478,
"learning_rate": 1.3573280530518893e-07,
"loss": 0.5048,
"step": 2192
},
{
"epoch": 0.8376623376623377,
"grad_norm": 2.4746294744651887,
"learning_rate": 1.351110468816754e-07,
"loss": 0.5198,
"step": 2193
},
{
"epoch": 0.838044308632544,
"grad_norm": 33.71405414753952,
"learning_rate": 1.344906126003973e-07,
"loss": 0.468,
"step": 2194
},
{
"epoch": 0.8384262796027502,
"grad_norm": 7.024288440007473,
"learning_rate": 1.3387150341123798e-07,
"loss": 0.5526,
"step": 2195
},
{
"epoch": 0.8388082505729565,
"grad_norm": 6.049475591050513,
"learning_rate": 1.3325372026205116e-07,
"loss": 0.4301,
"step": 2196
},
{
"epoch": 0.8391902215431627,
"grad_norm": 10.611540554361598,
"learning_rate": 1.3263726409866116e-07,
"loss": 0.4779,
"step": 2197
},
{
"epoch": 0.839572192513369,
"grad_norm": 3.0621823518575115,
"learning_rate": 1.3202213586486056e-07,
"loss": 0.5016,
"step": 2198
},
{
"epoch": 0.8399541634835752,
"grad_norm": 3.275246324988664,
"learning_rate": 1.3140833650240834e-07,
"loss": 0.5119,
"step": 2199
},
{
"epoch": 0.8403361344537815,
"grad_norm": 3.3609199868920148,
"learning_rate": 1.3079586695102963e-07,
"loss": 0.5135,
"step": 2200
},
{
"epoch": 0.8407181054239877,
"grad_norm": 3.3231780812489666,
"learning_rate": 1.3018472814841342e-07,
"loss": 0.4663,
"step": 2201
},
{
"epoch": 0.841100076394194,
"grad_norm": 4.4956394515998515,
"learning_rate": 1.2957492103021107e-07,
"loss": 0.4903,
"step": 2202
},
{
"epoch": 0.8414820473644004,
"grad_norm": 3.852529019099927,
"learning_rate": 1.2896644653003552e-07,
"loss": 0.4153,
"step": 2203
},
{
"epoch": 0.8418640183346066,
"grad_norm": 4.721021829716811,
"learning_rate": 1.2835930557945906e-07,
"loss": 0.4437,
"step": 2204
},
{
"epoch": 0.8422459893048129,
"grad_norm": 4.45443689471161,
"learning_rate": 1.277534991080128e-07,
"loss": 0.4884,
"step": 2205
},
{
"epoch": 0.8426279602750191,
"grad_norm": 2.8545799117331105,
"learning_rate": 1.271490280431845e-07,
"loss": 0.5027,
"step": 2206
},
{
"epoch": 0.8430099312452254,
"grad_norm": 3.1618830266696367,
"learning_rate": 1.265458933104172e-07,
"loss": 0.5227,
"step": 2207
},
{
"epoch": 0.8433919022154316,
"grad_norm": 5.868702228977397,
"learning_rate": 1.259440958331086e-07,
"loss": 0.5185,
"step": 2208
},
{
"epoch": 0.8437738731856379,
"grad_norm": 11.923531533986598,
"learning_rate": 1.2534363653260838e-07,
"loss": 0.4696,
"step": 2209
},
{
"epoch": 0.8441558441558441,
"grad_norm": 4.096587385166496,
"learning_rate": 1.2474451632821792e-07,
"loss": 0.496,
"step": 2210
},
{
"epoch": 0.8445378151260504,
"grad_norm": 3.9900336904878166,
"learning_rate": 1.2414673613718863e-07,
"loss": 0.5002,
"step": 2211
},
{
"epoch": 0.8449197860962567,
"grad_norm": 3.21690791572575,
"learning_rate": 1.2355029687471963e-07,
"loss": 0.517,
"step": 2212
},
{
"epoch": 0.8453017570664629,
"grad_norm": 2.998421983038558,
"learning_rate": 1.2295519945395806e-07,
"loss": 0.4889,
"step": 2213
},
{
"epoch": 0.8456837280366692,
"grad_norm": 4.306846266548668,
"learning_rate": 1.2236144478599553e-07,
"loss": 0.4833,
"step": 2214
},
{
"epoch": 0.8460656990068754,
"grad_norm": 3.816892740943075,
"learning_rate": 1.2176903377986903e-07,
"loss": 0.5117,
"step": 2215
},
{
"epoch": 0.8464476699770818,
"grad_norm": 3.27557051439516,
"learning_rate": 1.2117796734255793e-07,
"loss": 0.5633,
"step": 2216
},
{
"epoch": 0.846829640947288,
"grad_norm": 2.809194476163277,
"learning_rate": 1.2058824637898267e-07,
"loss": 0.4319,
"step": 2217
},
{
"epoch": 0.8472116119174943,
"grad_norm": 2.565263632594187,
"learning_rate": 1.1999987179200466e-07,
"loss": 0.4182,
"step": 2218
},
{
"epoch": 0.8475935828877005,
"grad_norm": 4.393436093843852,
"learning_rate": 1.1941284448242306e-07,
"loss": 0.5012,
"step": 2219
},
{
"epoch": 0.8479755538579068,
"grad_norm": 3.784435251005329,
"learning_rate": 1.1882716534897507e-07,
"loss": 0.496,
"step": 2220
},
{
"epoch": 0.8483575248281131,
"grad_norm": 8.747122275021068,
"learning_rate": 1.1824283528833379e-07,
"loss": 0.5423,
"step": 2221
},
{
"epoch": 0.8487394957983193,
"grad_norm": 4.644657697010408,
"learning_rate": 1.1765985519510624e-07,
"loss": 0.5356,
"step": 2222
},
{
"epoch": 0.8491214667685256,
"grad_norm": 2.098684965570757,
"learning_rate": 1.1707822596183337e-07,
"loss": 0.4655,
"step": 2223
},
{
"epoch": 0.8495034377387318,
"grad_norm": 3.0204783803502404,
"learning_rate": 1.1649794847898754e-07,
"loss": 0.4898,
"step": 2224
},
{
"epoch": 0.8498854087089381,
"grad_norm": 14.77076289760896,
"learning_rate": 1.1591902363497175e-07,
"loss": 0.5915,
"step": 2225
},
{
"epoch": 0.8502673796791443,
"grad_norm": 2.2247545000489946,
"learning_rate": 1.1534145231611836e-07,
"loss": 0.4969,
"step": 2226
},
{
"epoch": 0.8506493506493507,
"grad_norm": 4.198150069950216,
"learning_rate": 1.147652354066867e-07,
"loss": 0.5177,
"step": 2227
},
{
"epoch": 0.851031321619557,
"grad_norm": 4.011138143417836,
"learning_rate": 1.1419037378886364e-07,
"loss": 0.4546,
"step": 2228
},
{
"epoch": 0.8514132925897632,
"grad_norm": 4.1242405856658,
"learning_rate": 1.1361686834275997e-07,
"loss": 0.5162,
"step": 2229
},
{
"epoch": 0.8517952635599695,
"grad_norm": 3.33567545055266,
"learning_rate": 1.1304471994641085e-07,
"loss": 0.5543,
"step": 2230
},
{
"epoch": 0.8521772345301757,
"grad_norm": 3.3591483538343927,
"learning_rate": 1.1247392947577395e-07,
"loss": 0.4238,
"step": 2231
},
{
"epoch": 0.852559205500382,
"grad_norm": 3.2108515377110924,
"learning_rate": 1.1190449780472722e-07,
"loss": 0.4729,
"step": 2232
},
{
"epoch": 0.8529411764705882,
"grad_norm": 11.40142858082818,
"learning_rate": 1.1133642580506886e-07,
"loss": 0.4736,
"step": 2233
},
{
"epoch": 0.8533231474407945,
"grad_norm": 3.1885378611447446,
"learning_rate": 1.1076971434651571e-07,
"loss": 0.4514,
"step": 2234
},
{
"epoch": 0.8537051184110007,
"grad_norm": 6.454722532760631,
"learning_rate": 1.1020436429670066e-07,
"loss": 0.463,
"step": 2235
},
{
"epoch": 0.854087089381207,
"grad_norm": 6.051133144528017,
"learning_rate": 1.096403765211732e-07,
"loss": 0.4592,
"step": 2236
},
{
"epoch": 0.8544690603514133,
"grad_norm": 9.702170781782433,
"learning_rate": 1.0907775188339652e-07,
"loss": 0.4529,
"step": 2237
},
{
"epoch": 0.8548510313216195,
"grad_norm": 6.04570371091639,
"learning_rate": 1.0851649124474727e-07,
"loss": 0.4939,
"step": 2238
},
{
"epoch": 0.8552330022918259,
"grad_norm": 5.28690485826026,
"learning_rate": 1.0795659546451397e-07,
"loss": 0.5272,
"step": 2239
},
{
"epoch": 0.8556149732620321,
"grad_norm": 9.375052196894691,
"learning_rate": 1.0739806539989482e-07,
"loss": 0.5103,
"step": 2240
},
{
"epoch": 0.8559969442322384,
"grad_norm": 7.951196817714195,
"learning_rate": 1.0684090190599782e-07,
"loss": 0.5101,
"step": 2241
},
{
"epoch": 0.8563789152024446,
"grad_norm": 7.003244801403864,
"learning_rate": 1.0628510583583861e-07,
"loss": 0.5662,
"step": 2242
},
{
"epoch": 0.8567608861726509,
"grad_norm": 11.099276113974952,
"learning_rate": 1.0573067804033897e-07,
"loss": 0.5318,
"step": 2243
},
{
"epoch": 0.8571428571428571,
"grad_norm": 2.2709048820976707,
"learning_rate": 1.0517761936832615e-07,
"loss": 0.4368,
"step": 2244
},
{
"epoch": 0.8575248281130634,
"grad_norm": 7.724425718241674,
"learning_rate": 1.0462593066653159e-07,
"loss": 0.4265,
"step": 2245
},
{
"epoch": 0.8579067990832697,
"grad_norm": 8.808875882128211,
"learning_rate": 1.0407561277958831e-07,
"loss": 0.5229,
"step": 2246
},
{
"epoch": 0.8582887700534759,
"grad_norm": 4.23873315916548,
"learning_rate": 1.0352666655003173e-07,
"loss": 0.516,
"step": 2247
},
{
"epoch": 0.8586707410236822,
"grad_norm": 4.203252767377129,
"learning_rate": 1.0297909281829642e-07,
"loss": 0.4807,
"step": 2248
},
{
"epoch": 0.8590527119938884,
"grad_norm": 4.602668002547525,
"learning_rate": 1.0243289242271625e-07,
"loss": 0.476,
"step": 2249
},
{
"epoch": 0.8594346829640948,
"grad_norm": 2.7565613852101833,
"learning_rate": 1.0188806619952239e-07,
"loss": 0.4661,
"step": 2250
},
{
"epoch": 0.859816653934301,
"grad_norm": 2.9586330341043285,
"learning_rate": 1.0134461498284175e-07,
"loss": 0.4072,
"step": 2251
},
{
"epoch": 0.8601986249045073,
"grad_norm": 3.921729259810394,
"learning_rate": 1.0080253960469653e-07,
"loss": 0.5747,
"step": 2252
},
{
"epoch": 0.8605805958747135,
"grad_norm": 24.577549995932177,
"learning_rate": 1.0026184089500266e-07,
"loss": 0.5181,
"step": 2253
},
{
"epoch": 0.8609625668449198,
"grad_norm": 2.887900020199325,
"learning_rate": 9.972251968156775e-08,
"loss": 0.4497,
"step": 2254
},
{
"epoch": 0.8613445378151261,
"grad_norm": 4.525304080726074,
"learning_rate": 9.918457679009095e-08,
"loss": 0.5317,
"step": 2255
},
{
"epoch": 0.8617265087853323,
"grad_norm": 3.461849105116977,
"learning_rate": 9.864801304416159e-08,
"loss": 0.5421,
"step": 2256
},
{
"epoch": 0.8621084797555386,
"grad_norm": 2.8242327267596985,
"learning_rate": 9.811282926525632e-08,
"loss": 0.4628,
"step": 2257
},
{
"epoch": 0.8624904507257448,
"grad_norm": 9.00792987972709,
"learning_rate": 9.757902627274039e-08,
"loss": 0.505,
"step": 2258
},
{
"epoch": 0.8628724216959511,
"grad_norm": 5.880509030524464,
"learning_rate": 9.704660488386418e-08,
"loss": 0.4353,
"step": 2259
},
{
"epoch": 0.8632543926661573,
"grad_norm": 4.774488398492021,
"learning_rate": 9.651556591376309e-08,
"loss": 0.4097,
"step": 2260
},
{
"epoch": 0.8636363636363636,
"grad_norm": 3.6326302819898615,
"learning_rate": 9.598591017545643e-08,
"loss": 0.4939,
"step": 2261
},
{
"epoch": 0.8640183346065698,
"grad_norm": 2.4374436794116012,
"learning_rate": 9.545763847984512e-08,
"loss": 0.5212,
"step": 2262
},
{
"epoch": 0.8644003055767762,
"grad_norm": 2.223777079210593,
"learning_rate": 9.493075163571152e-08,
"loss": 0.4479,
"step": 2263
},
{
"epoch": 0.8647822765469825,
"grad_norm": 3.53955584612186,
"learning_rate": 9.440525044971793e-08,
"loss": 0.4763,
"step": 2264
},
{
"epoch": 0.8651642475171887,
"grad_norm": 2.9697819295899017,
"learning_rate": 9.388113572640454e-08,
"loss": 0.4184,
"step": 2265
},
{
"epoch": 0.865546218487395,
"grad_norm": 4.924061622038229,
"learning_rate": 9.335840826818975e-08,
"loss": 0.5132,
"step": 2266
},
{
"epoch": 0.8659281894576012,
"grad_norm": 3.511432459698299,
"learning_rate": 9.283706887536769e-08,
"loss": 0.4636,
"step": 2267
},
{
"epoch": 0.8663101604278075,
"grad_norm": 2.5156321800382826,
"learning_rate": 9.23171183461069e-08,
"loss": 0.4564,
"step": 2268
},
{
"epoch": 0.8666921313980137,
"grad_norm": 8.531918185130868,
"learning_rate": 9.179855747645027e-08,
"loss": 0.462,
"step": 2269
},
{
"epoch": 0.86707410236822,
"grad_norm": 2.6380802463987316,
"learning_rate": 9.128138706031274e-08,
"loss": 0.4548,
"step": 2270
},
{
"epoch": 0.8674560733384262,
"grad_norm": 4.445704232933789,
"learning_rate": 9.07656078894805e-08,
"loss": 0.5486,
"step": 2271
},
{
"epoch": 0.8678380443086325,
"grad_norm": 2.1578780349871587,
"learning_rate": 9.025122075361013e-08,
"loss": 0.4434,
"step": 2272
},
{
"epoch": 0.8682200152788389,
"grad_norm": 2.6670098332020467,
"learning_rate": 8.973822644022632e-08,
"loss": 0.4345,
"step": 2273
},
{
"epoch": 0.868601986249045,
"grad_norm": 7.2891934149937345,
"learning_rate": 8.922662573472195e-08,
"loss": 0.4511,
"step": 2274
},
{
"epoch": 0.8689839572192514,
"grad_norm": 10.840324811059782,
"learning_rate": 8.871641942035612e-08,
"loss": 0.5316,
"step": 2275
},
{
"epoch": 0.8693659281894576,
"grad_norm": 4.289385323121571,
"learning_rate": 8.820760827825292e-08,
"loss": 0.5083,
"step": 2276
},
{
"epoch": 0.8697478991596639,
"grad_norm": 3.1582635618624133,
"learning_rate": 8.770019308740051e-08,
"loss": 0.51,
"step": 2277
},
{
"epoch": 0.8701298701298701,
"grad_norm": 1.8803307145675334,
"learning_rate": 8.719417462465039e-08,
"loss": 0.4284,
"step": 2278
},
{
"epoch": 0.8705118411000764,
"grad_norm": 3.5424445332825427,
"learning_rate": 8.668955366471465e-08,
"loss": 0.458,
"step": 2279
},
{
"epoch": 0.8708938120702827,
"grad_norm": 3.201396165183148,
"learning_rate": 8.618633098016681e-08,
"loss": 0.4927,
"step": 2280
},
{
"epoch": 0.8712757830404889,
"grad_norm": 3.991953266601079,
"learning_rate": 8.568450734143873e-08,
"loss": 0.4885,
"step": 2281
},
{
"epoch": 0.8716577540106952,
"grad_norm": 3.4953614222608214,
"learning_rate": 8.518408351682127e-08,
"loss": 0.5084,
"step": 2282
},
{
"epoch": 0.8720397249809014,
"grad_norm": 2.695167087812339,
"learning_rate": 8.468506027246158e-08,
"loss": 0.4777,
"step": 2283
},
{
"epoch": 0.8724216959511077,
"grad_norm": 5.30059850738619,
"learning_rate": 8.418743837236242e-08,
"loss": 0.5029,
"step": 2284
},
{
"epoch": 0.872803666921314,
"grad_norm": 2.629519582190331,
"learning_rate": 8.369121857838157e-08,
"loss": 0.4694,
"step": 2285
},
{
"epoch": 0.8731856378915203,
"grad_norm": 4.258188402714665,
"learning_rate": 8.319640165023012e-08,
"loss": 0.4264,
"step": 2286
},
{
"epoch": 0.8735676088617265,
"grad_norm": 2.4821274197746366,
"learning_rate": 8.270298834547085e-08,
"loss": 0.4497,
"step": 2287
},
{
"epoch": 0.8739495798319328,
"grad_norm": 6.887109932801548,
"learning_rate": 8.221097941951816e-08,
"loss": 0.4585,
"step": 2288
},
{
"epoch": 0.8743315508021391,
"grad_norm": 3.5491839840353103,
"learning_rate": 8.172037562563605e-08,
"loss": 0.4904,
"step": 2289
},
{
"epoch": 0.8747135217723453,
"grad_norm": 3.3348486996500757,
"learning_rate": 8.123117771493737e-08,
"loss": 0.4956,
"step": 2290
},
{
"epoch": 0.8750954927425516,
"grad_norm": 5.276977593463025,
"learning_rate": 8.074338643638279e-08,
"loss": 0.4255,
"step": 2291
},
{
"epoch": 0.8754774637127578,
"grad_norm": 5.252023258479506,
"learning_rate": 8.025700253677892e-08,
"loss": 0.5391,
"step": 2292
},
{
"epoch": 0.8758594346829641,
"grad_norm": 2.187421377334062,
"learning_rate": 7.977202676077799e-08,
"loss": 0.4243,
"step": 2293
},
{
"epoch": 0.8762414056531703,
"grad_norm": 3.491067074766581,
"learning_rate": 7.928845985087662e-08,
"loss": 0.5148,
"step": 2294
},
{
"epoch": 0.8766233766233766,
"grad_norm": 4.314321061348468,
"learning_rate": 7.880630254741394e-08,
"loss": 0.5677,
"step": 2295
},
{
"epoch": 0.8770053475935828,
"grad_norm": 2.5728758575647257,
"learning_rate": 7.832555558857135e-08,
"loss": 0.4648,
"step": 2296
},
{
"epoch": 0.8773873185637892,
"grad_norm": 17.279851999637817,
"learning_rate": 7.784621971037108e-08,
"loss": 0.5449,
"step": 2297
},
{
"epoch": 0.8777692895339955,
"grad_norm": 10.228330033165268,
"learning_rate": 7.736829564667447e-08,
"loss": 0.4463,
"step": 2298
},
{
"epoch": 0.8781512605042017,
"grad_norm": 4.060117251246413,
"learning_rate": 7.689178412918218e-08,
"loss": 0.4704,
"step": 2299
},
{
"epoch": 0.878533231474408,
"grad_norm": 5.799277070801896,
"learning_rate": 7.641668588743133e-08,
"loss": 0.4838,
"step": 2300
},
{
"epoch": 0.8789152024446142,
"grad_norm": 3.719374887698917,
"learning_rate": 7.594300164879619e-08,
"loss": 0.5071,
"step": 2301
},
{
"epoch": 0.8792971734148205,
"grad_norm": 4.264362033842296,
"learning_rate": 7.547073213848577e-08,
"loss": 0.4403,
"step": 2302
},
{
"epoch": 0.8796791443850267,
"grad_norm": 3.788827425478075,
"learning_rate": 7.4999878079543e-08,
"loss": 0.5339,
"step": 2303
},
{
"epoch": 0.880061115355233,
"grad_norm": 2.5289529401638777,
"learning_rate": 7.453044019284405e-08,
"loss": 0.4423,
"step": 2304
},
{
"epoch": 0.8804430863254392,
"grad_norm": 7.002007214800676,
"learning_rate": 7.40624191970971e-08,
"loss": 0.4629,
"step": 2305
},
{
"epoch": 0.8808250572956455,
"grad_norm": 5.266495507291039,
"learning_rate": 7.359581580884033e-08,
"loss": 0.5676,
"step": 2306
},
{
"epoch": 0.8812070282658518,
"grad_norm": 3.8260975120477334,
"learning_rate": 7.31306307424423e-08,
"loss": 0.5122,
"step": 2307
},
{
"epoch": 0.881588999236058,
"grad_norm": 2.790345430405287,
"learning_rate": 7.266686471009997e-08,
"loss": 0.4735,
"step": 2308
},
{
"epoch": 0.8819709702062644,
"grad_norm": 3.684760773280003,
"learning_rate": 7.220451842183739e-08,
"loss": 0.5038,
"step": 2309
},
{
"epoch": 0.8823529411764706,
"grad_norm": 4.563314359100101,
"learning_rate": 7.174359258550556e-08,
"loss": 0.5461,
"step": 2310
},
{
"epoch": 0.8827349121466769,
"grad_norm": 2.5504725475804335,
"learning_rate": 7.12840879067802e-08,
"loss": 0.4883,
"step": 2311
},
{
"epoch": 0.8831168831168831,
"grad_norm": 5.159420179728155,
"learning_rate": 7.082600508916159e-08,
"loss": 0.5941,
"step": 2312
},
{
"epoch": 0.8834988540870894,
"grad_norm": 11.219695075561999,
"learning_rate": 7.036934483397317e-08,
"loss": 0.476,
"step": 2313
},
{
"epoch": 0.8838808250572956,
"grad_norm": 2.978056435882148,
"learning_rate": 6.991410784036022e-08,
"loss": 0.477,
"step": 2314
},
{
"epoch": 0.8842627960275019,
"grad_norm": 3.7975254482400365,
"learning_rate": 6.946029480528903e-08,
"loss": 0.4614,
"step": 2315
},
{
"epoch": 0.8846447669977082,
"grad_norm": 3.4134180584954406,
"learning_rate": 6.900790642354637e-08,
"loss": 0.4966,
"step": 2316
},
{
"epoch": 0.8850267379679144,
"grad_norm": 4.643651136566239,
"learning_rate": 6.855694338773688e-08,
"loss": 0.5225,
"step": 2317
},
{
"epoch": 0.8854087089381207,
"grad_norm": 9.136809850536642,
"learning_rate": 6.810740638828383e-08,
"loss": 0.4469,
"step": 2318
},
{
"epoch": 0.8857906799083269,
"grad_norm": 3.1500664562294083,
"learning_rate": 6.76592961134268e-08,
"loss": 0.5334,
"step": 2319
},
{
"epoch": 0.8861726508785333,
"grad_norm": 2.409148369848989,
"learning_rate": 6.721261324922112e-08,
"loss": 0.3789,
"step": 2320
},
{
"epoch": 0.8865546218487395,
"grad_norm": 2.8271574701763527,
"learning_rate": 6.67673584795373e-08,
"loss": 0.5126,
"step": 2321
},
{
"epoch": 0.8869365928189458,
"grad_norm": 2.8112882228861777,
"learning_rate": 6.632353248605837e-08,
"loss": 0.5374,
"step": 2322
},
{
"epoch": 0.887318563789152,
"grad_norm": 2.1807020495214937,
"learning_rate": 6.588113594828093e-08,
"loss": 0.4594,
"step": 2323
},
{
"epoch": 0.8877005347593583,
"grad_norm": 4.0306723914894675,
"learning_rate": 6.544016954351239e-08,
"loss": 0.431,
"step": 2324
},
{
"epoch": 0.8880825057295646,
"grad_norm": 3.8868049027748692,
"learning_rate": 6.500063394687106e-08,
"loss": 0.4137,
"step": 2325
},
{
"epoch": 0.8884644766997708,
"grad_norm": 6.682039265354064,
"learning_rate": 6.456252983128474e-08,
"loss": 0.4696,
"step": 2326
},
{
"epoch": 0.8888464476699771,
"grad_norm": 3.4499785260092697,
"learning_rate": 6.412585786748903e-08,
"loss": 0.5297,
"step": 2327
},
{
"epoch": 0.8892284186401833,
"grad_norm": 2.429085558496027,
"learning_rate": 6.369061872402759e-08,
"loss": 0.513,
"step": 2328
},
{
"epoch": 0.8896103896103896,
"grad_norm": 3.9376933833626175,
"learning_rate": 6.325681306725005e-08,
"loss": 0.4788,
"step": 2329
},
{
"epoch": 0.8899923605805958,
"grad_norm": 3.2597699671375877,
"learning_rate": 6.282444156131151e-08,
"loss": 0.4658,
"step": 2330
},
{
"epoch": 0.8903743315508021,
"grad_norm": 4.26811354289429,
"learning_rate": 6.239350486817152e-08,
"loss": 0.5007,
"step": 2331
},
{
"epoch": 0.8907563025210085,
"grad_norm": 4.2206368649504755,
"learning_rate": 6.196400364759247e-08,
"loss": 0.4902,
"step": 2332
},
{
"epoch": 0.8911382734912147,
"grad_norm": 2.4907328780717592,
"learning_rate": 6.153593855713968e-08,
"loss": 0.4511,
"step": 2333
},
{
"epoch": 0.891520244461421,
"grad_norm": 4.959565850081995,
"learning_rate": 6.110931025217925e-08,
"loss": 0.5045,
"step": 2334
},
{
"epoch": 0.8919022154316272,
"grad_norm": 3.7572745391689524,
"learning_rate": 6.068411938587781e-08,
"loss": 0.5677,
"step": 2335
},
{
"epoch": 0.8922841864018335,
"grad_norm": 2.379380511225663,
"learning_rate": 6.02603666092013e-08,
"loss": 0.4285,
"step": 2336
},
{
"epoch": 0.8926661573720397,
"grad_norm": 3.754647796988103,
"learning_rate": 5.983805257091368e-08,
"loss": 0.5302,
"step": 2337
},
{
"epoch": 0.893048128342246,
"grad_norm": 2.848286049389822,
"learning_rate": 5.941717791757672e-08,
"loss": 0.4725,
"step": 2338
},
{
"epoch": 0.8934300993124522,
"grad_norm": 3.033769690010149,
"learning_rate": 5.899774329354779e-08,
"loss": 0.5222,
"step": 2339
},
{
"epoch": 0.8938120702826585,
"grad_norm": 2.822997726176448,
"learning_rate": 5.857974934098009e-08,
"loss": 0.4834,
"step": 2340
},
{
"epoch": 0.8941940412528648,
"grad_norm": 3.6218212785099846,
"learning_rate": 5.816319669982128e-08,
"loss": 0.4322,
"step": 2341
},
{
"epoch": 0.894576012223071,
"grad_norm": 4.100612733001243,
"learning_rate": 5.774808600781189e-08,
"loss": 0.4262,
"step": 2342
},
{
"epoch": 0.8949579831932774,
"grad_norm": 2.564049036298038,
"learning_rate": 5.733441790048521e-08,
"loss": 0.4038,
"step": 2343
},
{
"epoch": 0.8953399541634836,
"grad_norm": 3.9446403466513287,
"learning_rate": 5.692219301116552e-08,
"loss": 0.4825,
"step": 2344
},
{
"epoch": 0.8957219251336899,
"grad_norm": 6.785105325773704,
"learning_rate": 5.651141197096798e-08,
"loss": 0.5957,
"step": 2345
},
{
"epoch": 0.8961038961038961,
"grad_norm": 2.694455153342699,
"learning_rate": 5.61020754087973e-08,
"loss": 0.476,
"step": 2346
},
{
"epoch": 0.8964858670741024,
"grad_norm": 3.815755349635915,
"learning_rate": 5.5694183951346065e-08,
"loss": 0.4931,
"step": 2347
},
{
"epoch": 0.8968678380443086,
"grad_norm": 3.8630788349567897,
"learning_rate": 5.528773822309496e-08,
"loss": 0.5034,
"step": 2348
},
{
"epoch": 0.8972498090145149,
"grad_norm": 2.5044630949597626,
"learning_rate": 5.488273884631123e-08,
"loss": 0.5415,
"step": 2349
},
{
"epoch": 0.8976317799847212,
"grad_norm": 2.867664534297458,
"learning_rate": 5.447918644104743e-08,
"loss": 0.4707,
"step": 2350
},
{
"epoch": 0.8980137509549274,
"grad_norm": 8.48950132856148,
"learning_rate": 5.407708162514113e-08,
"loss": 0.498,
"step": 2351
},
{
"epoch": 0.8983957219251337,
"grad_norm": 2.34402164869529,
"learning_rate": 5.3676425014213435e-08,
"loss": 0.4552,
"step": 2352
},
{
"epoch": 0.8987776928953399,
"grad_norm": 4.1371745573301615,
"learning_rate": 5.327721722166834e-08,
"loss": 0.5665,
"step": 2353
},
{
"epoch": 0.8991596638655462,
"grad_norm": 6.504312039213924,
"learning_rate": 5.287945885869194e-08,
"loss": 0.4734,
"step": 2354
},
{
"epoch": 0.8995416348357524,
"grad_norm": 2.9021662899236684,
"learning_rate": 5.248315053425056e-08,
"loss": 0.4604,
"step": 2355
},
{
"epoch": 0.8999236058059588,
"grad_norm": 3.403797598625929,
"learning_rate": 5.208829285509142e-08,
"loss": 0.5411,
"step": 2356
},
{
"epoch": 0.900305576776165,
"grad_norm": 4.783477891165306,
"learning_rate": 5.169488642574027e-08,
"loss": 0.4629,
"step": 2357
},
{
"epoch": 0.9006875477463713,
"grad_norm": 15.68911867533447,
"learning_rate": 5.130293184850099e-08,
"loss": 0.5054,
"step": 2358
},
{
"epoch": 0.9010695187165776,
"grad_norm": 2.5332595127468616,
"learning_rate": 5.091242972345478e-08,
"loss": 0.5008,
"step": 2359
},
{
"epoch": 0.9014514896867838,
"grad_norm": 8.588767382940686,
"learning_rate": 5.052338064845929e-08,
"loss": 0.4966,
"step": 2360
},
{
"epoch": 0.9018334606569901,
"grad_norm": 4.865329868153571,
"learning_rate": 5.0135785219147296e-08,
"loss": 0.466,
"step": 2361
},
{
"epoch": 0.9022154316271963,
"grad_norm": 3.3066273732746083,
"learning_rate": 4.974964402892634e-08,
"loss": 0.4862,
"step": 2362
},
{
"epoch": 0.9025974025974026,
"grad_norm": 3.3019950317215727,
"learning_rate": 4.936495766897708e-08,
"loss": 0.4352,
"step": 2363
},
{
"epoch": 0.9029793735676088,
"grad_norm": 4.445642650292908,
"learning_rate": 4.898172672825318e-08,
"loss": 0.4258,
"step": 2364
},
{
"epoch": 0.9033613445378151,
"grad_norm": 2.6308787434531125,
"learning_rate": 4.859995179348031e-08,
"loss": 0.4671,
"step": 2365
},
{
"epoch": 0.9037433155080213,
"grad_norm": 4.461380943240074,
"learning_rate": 4.821963344915425e-08,
"loss": 0.45,
"step": 2366
},
{
"epoch": 0.9041252864782277,
"grad_norm": 2.569926207230875,
"learning_rate": 4.7840772277541355e-08,
"loss": 0.5161,
"step": 2367
},
{
"epoch": 0.904507257448434,
"grad_norm": 2.310180671077372,
"learning_rate": 4.746336885867708e-08,
"loss": 0.413,
"step": 2368
},
{
"epoch": 0.9048892284186402,
"grad_norm": 8.008831838428597,
"learning_rate": 4.708742377036445e-08,
"loss": 0.4887,
"step": 2369
},
{
"epoch": 0.9052711993888465,
"grad_norm": 2.417115548070838,
"learning_rate": 4.6712937588174516e-08,
"loss": 0.4353,
"step": 2370
},
{
"epoch": 0.9056531703590527,
"grad_norm": 4.394918894147626,
"learning_rate": 4.633991088544431e-08,
"loss": 0.4839,
"step": 2371
},
{
"epoch": 0.906035141329259,
"grad_norm": 8.02100161280262,
"learning_rate": 4.5968344233276555e-08,
"loss": 0.4704,
"step": 2372
},
{
"epoch": 0.9064171122994652,
"grad_norm": 3.9031007453137305,
"learning_rate": 4.5598238200538656e-08,
"loss": 0.4826,
"step": 2373
},
{
"epoch": 0.9067990832696715,
"grad_norm": 2.4618887290175016,
"learning_rate": 4.522959335386156e-08,
"loss": 0.4246,
"step": 2374
},
{
"epoch": 0.9071810542398777,
"grad_norm": 3.934186892254672,
"learning_rate": 4.4862410257639596e-08,
"loss": 0.4209,
"step": 2375
},
{
"epoch": 0.907563025210084,
"grad_norm": 2.1542872213974675,
"learning_rate": 4.449668947402896e-08,
"loss": 0.3929,
"step": 2376
},
{
"epoch": 0.9079449961802903,
"grad_norm": 18.797820507369316,
"learning_rate": 4.413243156294666e-08,
"loss": 0.4449,
"step": 2377
},
{
"epoch": 0.9083269671504965,
"grad_norm": 2.8407513122567387,
"learning_rate": 4.376963708207071e-08,
"loss": 0.4631,
"step": 2378
},
{
"epoch": 0.9087089381207029,
"grad_norm": 4.291609007601337,
"learning_rate": 4.340830658683825e-08,
"loss": 0.52,
"step": 2379
},
{
"epoch": 0.9090909090909091,
"grad_norm": 4.264413087780691,
"learning_rate": 4.304844063044499e-08,
"loss": 0.577,
"step": 2380
},
{
"epoch": 0.9094728800611154,
"grad_norm": 3.5747033080813173,
"learning_rate": 4.2690039763844667e-08,
"loss": 0.5238,
"step": 2381
},
{
"epoch": 0.9098548510313216,
"grad_norm": 3.5116154000371353,
"learning_rate": 4.233310453574801e-08,
"loss": 0.5185,
"step": 2382
},
{
"epoch": 0.9102368220015279,
"grad_norm": 3.3241750674383623,
"learning_rate": 4.197763549262146e-08,
"loss": 0.5087,
"step": 2383
},
{
"epoch": 0.9106187929717342,
"grad_norm": 3.5558252293960857,
"learning_rate": 4.1623633178687114e-08,
"loss": 0.5488,
"step": 2384
},
{
"epoch": 0.9110007639419404,
"grad_norm": 3.4242856695089157,
"learning_rate": 4.1271098135921336e-08,
"loss": 0.5749,
"step": 2385
},
{
"epoch": 0.9113827349121467,
"grad_norm": 2.8263395122975896,
"learning_rate": 4.092003090405416e-08,
"loss": 0.428,
"step": 2386
},
{
"epoch": 0.9117647058823529,
"grad_norm": 4.024724825063178,
"learning_rate": 4.0570432020568644e-08,
"loss": 0.4923,
"step": 2387
},
{
"epoch": 0.9121466768525592,
"grad_norm": 2.4570053679729487,
"learning_rate": 4.0222302020699094e-08,
"loss": 0.4676,
"step": 2388
},
{
"epoch": 0.9125286478227654,
"grad_norm": 4.360719142970035,
"learning_rate": 3.987564143743172e-08,
"loss": 0.5316,
"step": 2389
},
{
"epoch": 0.9129106187929718,
"grad_norm": 3.028029807023603,
"learning_rate": 3.953045080150297e-08,
"loss": 0.4971,
"step": 2390
},
{
"epoch": 0.913292589763178,
"grad_norm": 2.3050412394820627,
"learning_rate": 3.9186730641398215e-08,
"loss": 0.4644,
"step": 2391
},
{
"epoch": 0.9136745607333843,
"grad_norm": 2.8659318912563894,
"learning_rate": 3.8844481483352064e-08,
"loss": 0.4844,
"step": 2392
},
{
"epoch": 0.9140565317035906,
"grad_norm": 6.253440099952343,
"learning_rate": 3.8503703851347045e-08,
"loss": 0.5149,
"step": 2393
},
{
"epoch": 0.9144385026737968,
"grad_norm": 5.154526787048037,
"learning_rate": 3.8164398267112374e-08,
"loss": 0.5132,
"step": 2394
},
{
"epoch": 0.9148204736440031,
"grad_norm": 2.1481475788873787,
"learning_rate": 3.782656525012407e-08,
"loss": 0.4361,
"step": 2395
},
{
"epoch": 0.9152024446142093,
"grad_norm": 2.763346783105214,
"learning_rate": 3.7490205317603166e-08,
"loss": 0.5156,
"step": 2396
},
{
"epoch": 0.9155844155844156,
"grad_norm": 2.8926375420505344,
"learning_rate": 3.715531898451574e-08,
"loss": 0.4157,
"step": 2397
},
{
"epoch": 0.9159663865546218,
"grad_norm": 3.4496840499009234,
"learning_rate": 3.6821906763572e-08,
"loss": 0.4903,
"step": 2398
},
{
"epoch": 0.9163483575248281,
"grad_norm": 3.4267602997255486,
"learning_rate": 3.648996916522451e-08,
"loss": 0.4465,
"step": 2399
},
{
"epoch": 0.9167303284950343,
"grad_norm": 5.860122366156432,
"learning_rate": 3.6159506697668873e-08,
"loss": 0.5308,
"step": 2400
},
{
"epoch": 0.9171122994652406,
"grad_norm": 5.488387961195745,
"learning_rate": 3.5830519866842157e-08,
"loss": 0.5484,
"step": 2401
},
{
"epoch": 0.917494270435447,
"grad_norm": 6.794836685398453,
"learning_rate": 3.550300917642213e-08,
"loss": 0.4806,
"step": 2402
},
{
"epoch": 0.9178762414056532,
"grad_norm": 2.6093841180604693,
"learning_rate": 3.517697512782658e-08,
"loss": 0.4599,
"step": 2403
},
{
"epoch": 0.9182582123758595,
"grad_norm": 3.4504804735195282,
"learning_rate": 3.4852418220212566e-08,
"loss": 0.5402,
"step": 2404
},
{
"epoch": 0.9186401833460657,
"grad_norm": 4.185650752063594,
"learning_rate": 3.4529338950475714e-08,
"loss": 0.4815,
"step": 2405
},
{
"epoch": 0.919022154316272,
"grad_norm": 3.9855108336709373,
"learning_rate": 3.420773781324937e-08,
"loss": 0.4591,
"step": 2406
},
{
"epoch": 0.9194041252864782,
"grad_norm": 4.145701142888078,
"learning_rate": 3.388761530090378e-08,
"loss": 0.5677,
"step": 2407
},
{
"epoch": 0.9197860962566845,
"grad_norm": 6.238351276020026,
"learning_rate": 3.356897190354557e-08,
"loss": 0.488,
"step": 2408
},
{
"epoch": 0.9201680672268907,
"grad_norm": 4.5878782856218,
"learning_rate": 3.3251808109016956e-08,
"loss": 0.599,
"step": 2409
},
{
"epoch": 0.920550038197097,
"grad_norm": 2.9796613949389337,
"learning_rate": 3.293612440289428e-08,
"loss": 0.468,
"step": 2410
},
{
"epoch": 0.9209320091673033,
"grad_norm": 38.85951380119334,
"learning_rate": 3.2621921268488504e-08,
"loss": 0.5484,
"step": 2411
},
{
"epoch": 0.9213139801375095,
"grad_norm": 2.4367627848014224,
"learning_rate": 3.230919918684372e-08,
"loss": 0.4052,
"step": 2412
},
{
"epoch": 0.9216959511077158,
"grad_norm": 4.128488958335345,
"learning_rate": 3.1997958636736265e-08,
"loss": 0.5357,
"step": 2413
},
{
"epoch": 0.922077922077922,
"grad_norm": 3.62015265467357,
"learning_rate": 3.1688200094674656e-08,
"loss": 0.5516,
"step": 2414
},
{
"epoch": 0.9224598930481284,
"grad_norm": 5.609850730563149,
"learning_rate": 3.1379924034897866e-08,
"loss": 0.4584,
"step": 2415
},
{
"epoch": 0.9228418640183346,
"grad_norm": 4.925225282573639,
"learning_rate": 3.107313092937591e-08,
"loss": 0.4784,
"step": 2416
},
{
"epoch": 0.9232238349885409,
"grad_norm": 3.614076871714596,
"learning_rate": 3.0767821247807966e-08,
"loss": 0.5702,
"step": 2417
},
{
"epoch": 0.9236058059587471,
"grad_norm": 2.3233578959478702,
"learning_rate": 3.0463995457622125e-08,
"loss": 0.4334,
"step": 2418
},
{
"epoch": 0.9239877769289534,
"grad_norm": 2.735136462427685,
"learning_rate": 3.016165402397475e-08,
"loss": 0.4939,
"step": 2419
},
{
"epoch": 0.9243697478991597,
"grad_norm": 14.909691725186427,
"learning_rate": 2.986079740974967e-08,
"loss": 0.5128,
"step": 2420
},
{
"epoch": 0.9247517188693659,
"grad_norm": 3.937721610666776,
"learning_rate": 2.9561426075557115e-08,
"loss": 0.5306,
"step": 2421
},
{
"epoch": 0.9251336898395722,
"grad_norm": 3.9909072151582774,
"learning_rate": 2.9263540479734006e-08,
"loss": 0.5413,
"step": 2422
},
{
"epoch": 0.9255156608097784,
"grad_norm": 2.098663773977231,
"learning_rate": 2.8967141078342196e-08,
"loss": 0.4502,
"step": 2423
},
{
"epoch": 0.9258976317799847,
"grad_norm": 2.7573703745421296,
"learning_rate": 2.8672228325167912e-08,
"loss": 0.4884,
"step": 2424
},
{
"epoch": 0.926279602750191,
"grad_norm": 6.424686498232581,
"learning_rate": 2.8378802671722103e-08,
"loss": 0.5137,
"step": 2425
},
{
"epoch": 0.9266615737203973,
"grad_norm": 7.560061974499517,
"learning_rate": 2.8086864567238078e-08,
"loss": 0.4843,
"step": 2426
},
{
"epoch": 0.9270435446906035,
"grad_norm": 6.3387544092100185,
"learning_rate": 2.7796414458672314e-08,
"loss": 0.5097,
"step": 2427
},
{
"epoch": 0.9274255156608098,
"grad_norm": 3.6039281338462255,
"learning_rate": 2.7507452790703213e-08,
"loss": 0.4869,
"step": 2428
},
{
"epoch": 0.9278074866310161,
"grad_norm": 5.806052774033399,
"learning_rate": 2.7219980005729783e-08,
"loss": 0.4672,
"step": 2429
},
{
"epoch": 0.9281894576012223,
"grad_norm": 7.564785636649657,
"learning_rate": 2.6933996543872183e-08,
"loss": 0.4386,
"step": 2430
},
{
"epoch": 0.9285714285714286,
"grad_norm": 2.9170232799721747,
"learning_rate": 2.6649502842970296e-08,
"loss": 0.4851,
"step": 2431
},
{
"epoch": 0.9289533995416348,
"grad_norm": 9.392568327180424,
"learning_rate": 2.6366499338582815e-08,
"loss": 0.5296,
"step": 2432
},
{
"epoch": 0.9293353705118411,
"grad_norm": 9.70506303380789,
"learning_rate": 2.6084986463987268e-08,
"loss": 0.4244,
"step": 2433
},
{
"epoch": 0.9297173414820473,
"grad_norm": 13.748575755941072,
"learning_rate": 2.5804964650179006e-08,
"loss": 0.5472,
"step": 2434
},
{
"epoch": 0.9300993124522536,
"grad_norm": 6.1287489440569605,
"learning_rate": 2.5526434325870538e-08,
"loss": 0.5373,
"step": 2435
},
{
"epoch": 0.93048128342246,
"grad_norm": 2.167781902789066,
"learning_rate": 2.524939591749109e-08,
"loss": 0.4751,
"step": 2436
},
{
"epoch": 0.9308632543926662,
"grad_norm": 7.266627876871788,
"learning_rate": 2.497384984918516e-08,
"loss": 0.5423,
"step": 2437
},
{
"epoch": 0.9312452253628725,
"grad_norm": 3.263820326616446,
"learning_rate": 2.469979654281329e-08,
"loss": 0.4727,
"step": 2438
},
{
"epoch": 0.9316271963330787,
"grad_norm": 3.5820381946347752,
"learning_rate": 2.4427236417949972e-08,
"loss": 0.5531,
"step": 2439
},
{
"epoch": 0.932009167303285,
"grad_norm": 8.255211815573727,
"learning_rate": 2.4156169891884072e-08,
"loss": 0.4857,
"step": 2440
},
{
"epoch": 0.9323911382734912,
"grad_norm": 3.131970430778959,
"learning_rate": 2.3886597379617513e-08,
"loss": 0.45,
"step": 2441
},
{
"epoch": 0.9327731092436975,
"grad_norm": 5.276546664852998,
"learning_rate": 2.3618519293864823e-08,
"loss": 0.4804,
"step": 2442
},
{
"epoch": 0.9331550802139037,
"grad_norm": 7.611226617433435,
"learning_rate": 2.3351936045052924e-08,
"loss": 0.4964,
"step": 2443
},
{
"epoch": 0.93353705118411,
"grad_norm": 2.066917669950662,
"learning_rate": 2.308684804131966e-08,
"loss": 0.4592,
"step": 2444
},
{
"epoch": 0.9339190221543163,
"grad_norm": 69.162277438671,
"learning_rate": 2.2823255688514176e-08,
"loss": 0.4307,
"step": 2445
},
{
"epoch": 0.9343009931245225,
"grad_norm": 4.138543166981627,
"learning_rate": 2.256115939019543e-08,
"loss": 0.5291,
"step": 2446
},
{
"epoch": 0.9346829640947288,
"grad_norm": 2.53167006255338,
"learning_rate": 2.230055954763199e-08,
"loss": 0.4888,
"step": 2447
},
{
"epoch": 0.935064935064935,
"grad_norm": 4.663611449231072,
"learning_rate": 2.2041456559801496e-08,
"loss": 0.552,
"step": 2448
},
{
"epoch": 0.9354469060351414,
"grad_norm": 2.829414766827518,
"learning_rate": 2.1783850823389515e-08,
"loss": 0.5144,
"step": 2449
},
{
"epoch": 0.9358288770053476,
"grad_norm": 2.5354145568379356,
"learning_rate": 2.15277427327899e-08,
"loss": 0.5085,
"step": 2450
},
{
"epoch": 0.9362108479755539,
"grad_norm": 2.3139431168002016,
"learning_rate": 2.1273132680103334e-08,
"loss": 0.4271,
"step": 2451
},
{
"epoch": 0.9365928189457601,
"grad_norm": 3.6244433465779653,
"learning_rate": 2.102002105513678e-08,
"loss": 0.5293,
"step": 2452
},
{
"epoch": 0.9369747899159664,
"grad_norm": 2.5781479249341968,
"learning_rate": 2.076840824540349e-08,
"loss": 0.4442,
"step": 2453
},
{
"epoch": 0.9373567608861727,
"grad_norm": 5.273646909202582,
"learning_rate": 2.0518294636121757e-08,
"loss": 0.5527,
"step": 2454
},
{
"epoch": 0.9377387318563789,
"grad_norm": 5.390575416829436,
"learning_rate": 2.0269680610214833e-08,
"loss": 0.4569,
"step": 2455
},
{
"epoch": 0.9381207028265852,
"grad_norm": 2.6922649557728886,
"learning_rate": 2.0022566548310027e-08,
"loss": 0.4581,
"step": 2456
},
{
"epoch": 0.9385026737967914,
"grad_norm": 6.359944613369677,
"learning_rate": 1.9776952828737926e-08,
"loss": 0.4757,
"step": 2457
},
{
"epoch": 0.9388846447669977,
"grad_norm": 4.587885863519702,
"learning_rate": 1.9532839827532732e-08,
"loss": 0.485,
"step": 2458
},
{
"epoch": 0.9392666157372039,
"grad_norm": 9.028109372763168,
"learning_rate": 1.929022791843038e-08,
"loss": 0.5032,
"step": 2459
},
{
"epoch": 0.9396485867074102,
"grad_norm": 4.07780994271552,
"learning_rate": 1.904911747286908e-08,
"loss": 0.4723,
"step": 2460
},
{
"epoch": 0.9400305576776165,
"grad_norm": 7.248527924705579,
"learning_rate": 1.880950885998811e-08,
"loss": 0.5223,
"step": 2461
},
{
"epoch": 0.9404125286478228,
"grad_norm": 21.084309021441634,
"learning_rate": 1.857140244662747e-08,
"loss": 0.4618,
"step": 2462
},
{
"epoch": 0.9407944996180291,
"grad_norm": 9.320765763275892,
"learning_rate": 1.833479859732734e-08,
"loss": 0.4479,
"step": 2463
},
{
"epoch": 0.9411764705882353,
"grad_norm": 2.95765530893394,
"learning_rate": 1.8099697674327508e-08,
"loss": 0.5336,
"step": 2464
},
{
"epoch": 0.9415584415584416,
"grad_norm": 5.30798966055467,
"learning_rate": 1.786610003756661e-08,
"loss": 0.473,
"step": 2465
},
{
"epoch": 0.9419404125286478,
"grad_norm": 5.338861028871045,
"learning_rate": 1.7634006044682126e-08,
"loss": 0.5285,
"step": 2466
},
{
"epoch": 0.9423223834988541,
"grad_norm": 5.805591085748861,
"learning_rate": 1.7403416051008924e-08,
"loss": 0.5057,
"step": 2467
},
{
"epoch": 0.9427043544690603,
"grad_norm": 2.9144284769885775,
"learning_rate": 1.717433040957972e-08,
"loss": 0.4936,
"step": 2468
},
{
"epoch": 0.9430863254392666,
"grad_norm": 24.451455797558253,
"learning_rate": 1.6946749471123956e-08,
"loss": 0.4861,
"step": 2469
},
{
"epoch": 0.9434682964094728,
"grad_norm": 12.971234620764871,
"learning_rate": 1.6720673584067148e-08,
"loss": 0.4059,
"step": 2470
},
{
"epoch": 0.9438502673796791,
"grad_norm": 18.01743851371368,
"learning_rate": 1.6496103094530757e-08,
"loss": 0.5127,
"step": 2471
},
{
"epoch": 0.9442322383498855,
"grad_norm": 2.9056555653541305,
"learning_rate": 1.627303834633187e-08,
"loss": 0.446,
"step": 2472
},
{
"epoch": 0.9446142093200917,
"grad_norm": 2.551777757341228,
"learning_rate": 1.6051479680981415e-08,
"loss": 0.4613,
"step": 2473
},
{
"epoch": 0.944996180290298,
"grad_norm": 3.003276545289939,
"learning_rate": 1.5831427437685173e-08,
"loss": 0.5248,
"step": 2474
},
{
"epoch": 0.9453781512605042,
"grad_norm": 5.323459704857999,
"learning_rate": 1.561288195334265e-08,
"loss": 0.532,
"step": 2475
},
{
"epoch": 0.9457601222307105,
"grad_norm": 2.365400516468899,
"learning_rate": 1.5395843562545974e-08,
"loss": 0.4618,
"step": 2476
},
{
"epoch": 0.9461420932009167,
"grad_norm": 2.755227125455112,
"learning_rate": 1.5180312597580458e-08,
"loss": 0.3999,
"step": 2477
},
{
"epoch": 0.946524064171123,
"grad_norm": 3.2326623623248794,
"learning_rate": 1.4966289388423147e-08,
"loss": 0.4357,
"step": 2478
},
{
"epoch": 0.9469060351413292,
"grad_norm": 2.361174127926857,
"learning_rate": 1.4753774262743046e-08,
"loss": 0.4554,
"step": 2479
},
{
"epoch": 0.9472880061115355,
"grad_norm": 4.378556170360985,
"learning_rate": 1.4542767545900003e-08,
"loss": 0.4378,
"step": 2480
},
{
"epoch": 0.9476699770817418,
"grad_norm": 2.2364556283777164,
"learning_rate": 1.4333269560944717e-08,
"loss": 0.4661,
"step": 2481
},
{
"epoch": 0.948051948051948,
"grad_norm": 2.563590121932103,
"learning_rate": 1.4125280628617952e-08,
"loss": 0.4635,
"step": 2482
},
{
"epoch": 0.9484339190221543,
"grad_norm": 3.3298514105520134,
"learning_rate": 1.3918801067349995e-08,
"loss": 0.4859,
"step": 2483
},
{
"epoch": 0.9488158899923606,
"grad_norm": 5.8126286577642015,
"learning_rate": 1.3713831193260528e-08,
"loss": 0.5104,
"step": 2484
},
{
"epoch": 0.9491978609625669,
"grad_norm": 2.400524311988587,
"learning_rate": 1.3510371320157643e-08,
"loss": 0.4765,
"step": 2485
},
{
"epoch": 0.9495798319327731,
"grad_norm": 2.1295617001631304,
"learning_rate": 1.3308421759537836e-08,
"loss": 0.4082,
"step": 2486
},
{
"epoch": 0.9499618029029794,
"grad_norm": 13.243672852391542,
"learning_rate": 1.3107982820585228e-08,
"loss": 0.4391,
"step": 2487
},
{
"epoch": 0.9503437738731857,
"grad_norm": 3.0429698587661154,
"learning_rate": 1.2909054810171239e-08,
"loss": 0.4903,
"step": 2488
},
{
"epoch": 0.9507257448433919,
"grad_norm": 4.626317767425915,
"learning_rate": 1.2711638032854021e-08,
"loss": 0.4923,
"step": 2489
},
{
"epoch": 0.9511077158135982,
"grad_norm": 6.637464390802047,
"learning_rate": 1.251573279087792e-08,
"loss": 0.4712,
"step": 2490
},
{
"epoch": 0.9514896867838044,
"grad_norm": 3.5719726736167736,
"learning_rate": 1.2321339384173345e-08,
"loss": 0.5081,
"step": 2491
},
{
"epoch": 0.9518716577540107,
"grad_norm": 4.561259347019035,
"learning_rate": 1.21284581103559e-08,
"loss": 0.5352,
"step": 2492
},
{
"epoch": 0.9522536287242169,
"grad_norm": 2.8215706918666403,
"learning_rate": 1.1937089264726253e-08,
"loss": 0.519,
"step": 2493
},
{
"epoch": 0.9526355996944232,
"grad_norm": 3.378391343951543,
"learning_rate": 1.1747233140269596e-08,
"loss": 0.4295,
"step": 2494
},
{
"epoch": 0.9530175706646294,
"grad_norm": 8.946458315475008,
"learning_rate": 1.1558890027654866e-08,
"loss": 0.4447,
"step": 2495
},
{
"epoch": 0.9533995416348358,
"grad_norm": 7.700140610935818,
"learning_rate": 1.1372060215234847e-08,
"loss": 0.4444,
"step": 2496
},
{
"epoch": 0.9537815126050421,
"grad_norm": 6.912722171712339,
"learning_rate": 1.118674398904551e-08,
"loss": 0.4639,
"step": 2497
},
{
"epoch": 0.9541634835752483,
"grad_norm": 4.024897980877189,
"learning_rate": 1.100294163280513e-08,
"loss": 0.492,
"step": 2498
},
{
"epoch": 0.9545454545454546,
"grad_norm": 3.0992206704953342,
"learning_rate": 1.0820653427914828e-08,
"loss": 0.522,
"step": 2499
},
{
"epoch": 0.9549274255156608,
"grad_norm": 2.855578844605935,
"learning_rate": 1.063987965345703e-08,
"loss": 0.4911,
"step": 2500
},
{
"epoch": 0.9553093964858671,
"grad_norm": 3.906976426151052,
"learning_rate": 1.04606205861959e-08,
"loss": 0.4569,
"step": 2501
},
{
"epoch": 0.9556913674560733,
"grad_norm": 12.941475800622458,
"learning_rate": 1.0282876500576688e-08,
"loss": 0.4352,
"step": 2502
},
{
"epoch": 0.9560733384262796,
"grad_norm": 4.644900257875716,
"learning_rate": 1.01066476687246e-08,
"loss": 0.4578,
"step": 2503
},
{
"epoch": 0.9564553093964858,
"grad_norm": 2.668015599505941,
"learning_rate": 9.931934360445814e-09,
"loss": 0.4355,
"step": 2504
},
{
"epoch": 0.9568372803666921,
"grad_norm": 2.806844667156648,
"learning_rate": 9.758736843225701e-09,
"loss": 0.4629,
"step": 2505
},
{
"epoch": 0.9572192513368984,
"grad_norm": 2.660779902021113,
"learning_rate": 9.587055382229037e-09,
"loss": 0.5203,
"step": 2506
},
{
"epoch": 0.9576012223071046,
"grad_norm": 3.800974420517706,
"learning_rate": 9.41689024029968e-09,
"loss": 0.4719,
"step": 2507
},
{
"epoch": 0.957983193277311,
"grad_norm": 3.0800978990708643,
"learning_rate": 9.248241677960011e-09,
"loss": 0.4703,
"step": 2508
},
{
"epoch": 0.9583651642475172,
"grad_norm": 2.978333812158537,
"learning_rate": 9.08110995341016e-09,
"loss": 0.4924,
"step": 2509
},
{
"epoch": 0.9587471352177235,
"grad_norm": 2.790216095881215,
"learning_rate": 8.915495322528555e-09,
"loss": 0.4961,
"step": 2510
},
{
"epoch": 0.9591291061879297,
"grad_norm": 3.960282013008847,
"learning_rate": 8.751398038870484e-09,
"loss": 0.5025,
"step": 2511
},
{
"epoch": 0.959511077158136,
"grad_norm": 3.166325201504291,
"learning_rate": 8.588818353668537e-09,
"loss": 0.453,
"step": 2512
},
{
"epoch": 0.9598930481283422,
"grad_norm": 19.411603373034804,
"learning_rate": 8.427756515831497e-09,
"loss": 0.4555,
"step": 2513
},
{
"epoch": 0.9602750190985485,
"grad_norm": 2.721687001904413,
"learning_rate": 8.26821277194467e-09,
"loss": 0.4848,
"step": 2514
},
{
"epoch": 0.9606569900687548,
"grad_norm": 2.762646240823191,
"learning_rate": 8.110187366268894e-09,
"loss": 0.5042,
"step": 2515
},
{
"epoch": 0.961038961038961,
"grad_norm": 2.817165214716127,
"learning_rate": 7.953680540740748e-09,
"loss": 0.4613,
"step": 2516
},
{
"epoch": 0.9614209320091673,
"grad_norm": 7.43002937648401,
"learning_rate": 7.798692534971451e-09,
"loss": 0.4693,
"step": 2517
},
{
"epoch": 0.9618029029793735,
"grad_norm": 3.180631330964851,
"learning_rate": 7.645223586247418e-09,
"loss": 0.5238,
"step": 2518
},
{
"epoch": 0.9621848739495799,
"grad_norm": 2.856222981196358,
"learning_rate": 7.493273929528921e-09,
"loss": 0.4452,
"step": 2519
},
{
"epoch": 0.9625668449197861,
"grad_norm": 3.3447643860752265,
"learning_rate": 7.3428437974504265e-09,
"loss": 0.5306,
"step": 2520
},
{
"epoch": 0.9629488158899924,
"grad_norm": 2.3267905770269666,
"learning_rate": 7.193933420320042e-09,
"loss": 0.4516,
"step": 2521
},
{
"epoch": 0.9633307868601986,
"grad_norm": 2.7302756172093074,
"learning_rate": 7.046543026118956e-09,
"loss": 0.4803,
"step": 2522
},
{
"epoch": 0.9637127578304049,
"grad_norm": 4.923203717473156,
"learning_rate": 6.900672840501554e-09,
"loss": 0.6031,
"step": 2523
},
{
"epoch": 0.9640947288006112,
"grad_norm": 3.8328762563622543,
"learning_rate": 6.7563230867946354e-09,
"loss": 0.4466,
"step": 2524
},
{
"epoch": 0.9644766997708174,
"grad_norm": 4.211498973825368,
"learning_rate": 6.613493985997088e-09,
"loss": 0.4968,
"step": 2525
},
{
"epoch": 0.9648586707410237,
"grad_norm": 5.64224813345612,
"learning_rate": 6.47218575677988e-09,
"loss": 0.4485,
"step": 2526
},
{
"epoch": 0.9652406417112299,
"grad_norm": 3.6256739132444364,
"learning_rate": 6.332398615485512e-09,
"loss": 0.4546,
"step": 2527
},
{
"epoch": 0.9656226126814362,
"grad_norm": 3.114141972804922,
"learning_rate": 6.194132776127458e-09,
"loss": 0.4622,
"step": 2528
},
{
"epoch": 0.9660045836516424,
"grad_norm": 4.335451632477308,
"learning_rate": 6.057388450390499e-09,
"loss": 0.4512,
"step": 2529
},
{
"epoch": 0.9663865546218487,
"grad_norm": 3.6689875192180774,
"learning_rate": 5.922165847629612e-09,
"loss": 0.4927,
"step": 2530
},
{
"epoch": 0.966768525592055,
"grad_norm": 2.7010211520935528,
"learning_rate": 5.788465174870194e-09,
"loss": 0.4484,
"step": 2531
},
{
"epoch": 0.9671504965622613,
"grad_norm": 3.6118256042660954,
"learning_rate": 5.656286636807728e-09,
"loss": 0.4991,
"step": 2532
},
{
"epoch": 0.9675324675324676,
"grad_norm": 4.550237561723368,
"learning_rate": 5.525630435806894e-09,
"loss": 0.5044,
"step": 2533
},
{
"epoch": 0.9679144385026738,
"grad_norm": 3.809819785626597,
"learning_rate": 5.396496771902015e-09,
"loss": 0.5116,
"step": 2534
},
{
"epoch": 0.9682964094728801,
"grad_norm": 2.018702332806629,
"learning_rate": 5.268885842796389e-09,
"loss": 0.4236,
"step": 2535
},
{
"epoch": 0.9686783804430863,
"grad_norm": 3.0965862496642207,
"learning_rate": 5.142797843861846e-09,
"loss": 0.47,
"step": 2536
},
{
"epoch": 0.9690603514132926,
"grad_norm": 5.721837411629174,
"learning_rate": 5.018232968138747e-09,
"loss": 0.6053,
"step": 2537
},
{
"epoch": 0.9694423223834988,
"grad_norm": 4.538074720461138,
"learning_rate": 4.895191406335542e-09,
"loss": 0.4949,
"step": 2538
},
{
"epoch": 0.9698242933537051,
"grad_norm": 3.284799260650926,
"learning_rate": 4.773673346828322e-09,
"loss": 0.496,
"step": 2539
},
{
"epoch": 0.9702062643239114,
"grad_norm": 2.30954834543498,
"learning_rate": 4.653678975661157e-09,
"loss": 0.4054,
"step": 2540
},
{
"epoch": 0.9705882352941176,
"grad_norm": 5.735682125808379,
"learning_rate": 4.535208476544761e-09,
"loss": 0.5455,
"step": 2541
},
{
"epoch": 0.970970206264324,
"grad_norm": 3.7993614694320303,
"learning_rate": 4.418262030857156e-09,
"loss": 0.4997,
"step": 2542
},
{
"epoch": 0.9713521772345302,
"grad_norm": 2.7825335207587245,
"learning_rate": 4.302839817643122e-09,
"loss": 0.4377,
"step": 2543
},
{
"epoch": 0.9717341482047365,
"grad_norm": 2.7985021160529144,
"learning_rate": 4.188942013613417e-09,
"loss": 0.5475,
"step": 2544
},
{
"epoch": 0.9721161191749427,
"grad_norm": 6.054199223017417,
"learning_rate": 4.076568793145552e-09,
"loss": 0.4502,
"step": 2545
},
{
"epoch": 0.972498090145149,
"grad_norm": 5.005797345035652,
"learning_rate": 3.9657203282823514e-09,
"loss": 0.4847,
"step": 2546
},
{
"epoch": 0.9728800611153552,
"grad_norm": 3.542144976610882,
"learning_rate": 3.856396788732508e-09,
"loss": 0.4225,
"step": 2547
},
{
"epoch": 0.9732620320855615,
"grad_norm": 3.9331468206663573,
"learning_rate": 3.748598341870135e-09,
"loss": 0.4686,
"step": 2548
},
{
"epoch": 0.9736440030557678,
"grad_norm": 11.047515167994217,
"learning_rate": 3.6423251527341048e-09,
"loss": 0.4378,
"step": 2549
},
{
"epoch": 0.974025974025974,
"grad_norm": 5.641008244986063,
"learning_rate": 3.5375773840284895e-09,
"loss": 0.4458,
"step": 2550
},
{
"epoch": 0.9744079449961803,
"grad_norm": 3.1768114309293574,
"learning_rate": 3.434355196121674e-09,
"loss": 0.4294,
"step": 2551
},
{
"epoch": 0.9747899159663865,
"grad_norm": 3.326041714498732,
"learning_rate": 3.3326587470465793e-09,
"loss": 0.4936,
"step": 2552
},
{
"epoch": 0.9751718869365928,
"grad_norm": 3.192592176693493,
"learning_rate": 3.2324881925001045e-09,
"loss": 0.5045,
"step": 2553
},
{
"epoch": 0.975553857906799,
"grad_norm": 3.6882386753493313,
"learning_rate": 3.1338436858431293e-09,
"loss": 0.4618,
"step": 2554
},
{
"epoch": 0.9759358288770054,
"grad_norm": 3.1630091731444105,
"learning_rate": 3.036725378099958e-09,
"loss": 0.5647,
"step": 2555
},
{
"epoch": 0.9763177998472116,
"grad_norm": 6.080592480427539,
"learning_rate": 2.941133417958541e-09,
"loss": 0.4768,
"step": 2556
},
{
"epoch": 0.9766997708174179,
"grad_norm": 3.32881963748884,
"learning_rate": 2.8470679517696995e-09,
"loss": 0.4456,
"step": 2557
},
{
"epoch": 0.9770817417876242,
"grad_norm": 2.825770502568057,
"learning_rate": 2.7545291235475665e-09,
"loss": 0.4687,
"step": 2558
},
{
"epoch": 0.9774637127578304,
"grad_norm": 2.8682663422456565,
"learning_rate": 2.663517074968591e-09,
"loss": 0.4346,
"step": 2559
},
{
"epoch": 0.9778456837280367,
"grad_norm": 20.475308532466034,
"learning_rate": 2.5740319453720906e-09,
"loss": 0.5423,
"step": 2560
},
{
"epoch": 0.9782276546982429,
"grad_norm": 3.427236872657874,
"learning_rate": 2.4860738717593643e-09,
"loss": 0.4925,
"step": 2561
},
{
"epoch": 0.9786096256684492,
"grad_norm": 3.4960090273367253,
"learning_rate": 2.399642988794137e-09,
"loss": 0.4906,
"step": 2562
},
{
"epoch": 0.9789915966386554,
"grad_norm": 5.310310660843343,
"learning_rate": 2.314739428801671e-09,
"loss": 0.5329,
"step": 2563
},
{
"epoch": 0.9793735676088617,
"grad_norm": 2.1641671491311025,
"learning_rate": 2.2313633217689865e-09,
"loss": 0.4817,
"step": 2564
},
{
"epoch": 0.9797555385790679,
"grad_norm": 7.750415967420114,
"learning_rate": 2.1495147953448643e-09,
"loss": 0.5322,
"step": 2565
},
{
"epoch": 0.9801375095492743,
"grad_norm": 2.076980851759081,
"learning_rate": 2.0691939748389566e-09,
"loss": 0.4361,
"step": 2566
},
{
"epoch": 0.9805194805194806,
"grad_norm": 4.796938970992404,
"learning_rate": 1.990400983222229e-09,
"loss": 0.4756,
"step": 2567
},
{
"epoch": 0.9809014514896868,
"grad_norm": 6.541046842142587,
"learning_rate": 1.9131359411265203e-09,
"loss": 0.4655,
"step": 2568
},
{
"epoch": 0.9812834224598931,
"grad_norm": 5.938506466582482,
"learning_rate": 1.8373989668443168e-09,
"loss": 0.5141,
"step": 2569
},
{
"epoch": 0.9816653934300993,
"grad_norm": 2.8652369163931986,
"learning_rate": 1.7631901763287549e-09,
"loss": 0.5437,
"step": 2570
},
{
"epoch": 0.9820473644003056,
"grad_norm": 2.910245040907176,
"learning_rate": 1.6905096831931753e-09,
"loss": 0.4538,
"step": 2571
},
{
"epoch": 0.9824293353705118,
"grad_norm": 303.4846717609501,
"learning_rate": 1.6193575987112352e-09,
"loss": 0.4642,
"step": 2572
},
{
"epoch": 0.9828113063407181,
"grad_norm": 3.269168545125339,
"learning_rate": 1.5497340318165742e-09,
"loss": 0.4603,
"step": 2573
},
{
"epoch": 0.9831932773109243,
"grad_norm": 2.640937522483087,
"learning_rate": 1.4816390891025931e-09,
"loss": 0.4574,
"step": 2574
},
{
"epoch": 0.9835752482811306,
"grad_norm": 3.57904681889891,
"learning_rate": 1.4150728748224538e-09,
"loss": 0.5048,
"step": 2575
},
{
"epoch": 0.983957219251337,
"grad_norm": 8.684873939885174,
"learning_rate": 1.3500354908888566e-09,
"loss": 0.5121,
"step": 2576
},
{
"epoch": 0.9843391902215431,
"grad_norm": 2.807742880348149,
"learning_rate": 1.286527036873819e-09,
"loss": 0.4514,
"step": 2577
},
{
"epoch": 0.9847211611917495,
"grad_norm": 2.6363709403478466,
"learning_rate": 1.224547610008453e-09,
"loss": 0.4982,
"step": 2578
},
{
"epoch": 0.9851031321619557,
"grad_norm": 3.1684012952039726,
"learning_rate": 1.164097305183298e-09,
"loss": 0.4583,
"step": 2579
},
{
"epoch": 0.985485103132162,
"grad_norm": 3.590684132000082,
"learning_rate": 1.1051762149473232e-09,
"loss": 0.4964,
"step": 2580
},
{
"epoch": 0.9858670741023682,
"grad_norm": 5.004510927683761,
"learning_rate": 1.0477844295087024e-09,
"loss": 0.4452,
"step": 2581
},
{
"epoch": 0.9862490450725745,
"grad_norm": 7.36797219813988,
"learning_rate": 9.919220367340387e-10,
"loss": 0.5761,
"step": 2582
},
{
"epoch": 0.9866310160427807,
"grad_norm": 3.418088354979299,
"learning_rate": 9.375891221484743e-10,
"loss": 0.4862,
"step": 2583
},
{
"epoch": 0.987012987012987,
"grad_norm": 2.8783008805563166,
"learning_rate": 8.847857689355809e-10,
"loss": 0.4573,
"step": 2584
},
{
"epoch": 0.9873949579831933,
"grad_norm": 13.812348241958448,
"learning_rate": 8.335120579370247e-10,
"loss": 0.4834,
"step": 2585
},
{
"epoch": 0.9877769289533995,
"grad_norm": 3.035692731524948,
"learning_rate": 7.837680676526792e-10,
"loss": 0.4293,
"step": 2586
},
{
"epoch": 0.9881588999236058,
"grad_norm": 2.311185049015733,
"learning_rate": 7.355538742406242e-10,
"loss": 0.4276,
"step": 2587
},
{
"epoch": 0.988540870893812,
"grad_norm": 2.3430547457905457,
"learning_rate": 6.88869551516369e-10,
"loss": 0.4485,
"step": 2588
},
{
"epoch": 0.9889228418640184,
"grad_norm": 4.04539151835123,
"learning_rate": 6.437151709536292e-10,
"loss": 0.5413,
"step": 2589
},
{
"epoch": 0.9893048128342246,
"grad_norm": 3.344931739067561,
"learning_rate": 6.000908016836614e-10,
"loss": 0.4889,
"step": 2590
},
{
"epoch": 0.9896867838044309,
"grad_norm": 5.715271667567572,
"learning_rate": 5.579965104951511e-10,
"loss": 0.4916,
"step": 2591
},
{
"epoch": 0.9900687547746372,
"grad_norm": 4.474119824184043,
"learning_rate": 5.174323618343246e-10,
"loss": 0.4628,
"step": 2592
},
{
"epoch": 0.9904507257448434,
"grad_norm": 4.58984054764715,
"learning_rate": 4.783984178047263e-10,
"loss": 0.4496,
"step": 2593
},
{
"epoch": 0.9908326967150497,
"grad_norm": 3.537106032864748,
"learning_rate": 4.40894738167219e-10,
"loss": 0.5855,
"step": 2594
},
{
"epoch": 0.9912146676852559,
"grad_norm": 3.2765152812486344,
"learning_rate": 4.0492138033998424e-10,
"loss": 0.4051,
"step": 2595
},
{
"epoch": 0.9915966386554622,
"grad_norm": 4.1733591811192685,
"learning_rate": 3.7047839939785553e-10,
"loss": 0.5353,
"step": 2596
},
{
"epoch": 0.9919786096256684,
"grad_norm": 4.696537622938952,
"learning_rate": 3.3756584807309587e-10,
"loss": 0.4364,
"step": 2597
},
{
"epoch": 0.9923605805958747,
"grad_norm": 3.76697874842984,
"learning_rate": 3.061837767547315e-10,
"loss": 0.5534,
"step": 2598
},
{
"epoch": 0.9927425515660809,
"grad_norm": 8.74608643405226,
"learning_rate": 2.7633223348844106e-10,
"loss": 0.4994,
"step": 2599
},
{
"epoch": 0.9931245225362872,
"grad_norm": 4.006165578795337,
"learning_rate": 2.480112639769993e-10,
"loss": 0.5504,
"step": 2600
},
{
"epoch": 0.9935064935064936,
"grad_norm": 3.9404386333333754,
"learning_rate": 2.2122091157972256e-10,
"loss": 0.5143,
"step": 2601
},
{
"epoch": 0.9938884644766998,
"grad_norm": 10.682839455672944,
"learning_rate": 1.9596121731257908e-10,
"loss": 0.4989,
"step": 2602
},
{
"epoch": 0.9942704354469061,
"grad_norm": 2.76581270014255,
"learning_rate": 1.7223221984785651e-10,
"loss": 0.4919,
"step": 2603
},
{
"epoch": 0.9946524064171123,
"grad_norm": 2.909695663450389,
"learning_rate": 1.500339555148278e-10,
"loss": 0.4611,
"step": 2604
},
{
"epoch": 0.9950343773873186,
"grad_norm": 4.261689227706372,
"learning_rate": 1.2936645829886294e-10,
"loss": 0.4687,
"step": 2605
},
{
"epoch": 0.9954163483575248,
"grad_norm": 2.7347150907423297,
"learning_rate": 1.1022975984176231e-10,
"loss": 0.4518,
"step": 2606
},
{
"epoch": 0.9957983193277311,
"grad_norm": 5.4011893230672,
"learning_rate": 9.262388944186739e-11,
"loss": 0.5702,
"step": 2607
},
{
"epoch": 0.9961802902979373,
"grad_norm": 3.4713061472027165,
"learning_rate": 7.65488740537279e-11,
"loss": 0.4385,
"step": 2608
},
{
"epoch": 0.9965622612681436,
"grad_norm": 3.823133193566733,
"learning_rate": 6.200473828810171e-11,
"loss": 0.4967,
"step": 2609
},
{
"epoch": 0.9969442322383499,
"grad_norm": 2.8124484286478255,
"learning_rate": 4.8991504411843856e-11,
"loss": 0.448,
"step": 2610
},
{
"epoch": 0.9973262032085561,
"grad_norm": 8.56790054183414,
"learning_rate": 3.7509192348461614e-11,
"loss": 0.4716,
"step": 2611
},
{
"epoch": 0.9977081741787625,
"grad_norm": 3.489183065122739,
"learning_rate": 2.7557819677115345e-11,
"loss": 0.4173,
"step": 2612
},
{
"epoch": 0.9980901451489687,
"grad_norm": 3.064351005201259,
"learning_rate": 1.9137401633506635e-11,
"loss": 0.4869,
"step": 2613
},
{
"epoch": 0.998472116119175,
"grad_norm": 3.0529889006171773,
"learning_rate": 1.2247951109101151e-11,
"loss": 0.4295,
"step": 2614
},
{
"epoch": 0.9988540870893812,
"grad_norm": 11.94148717007962,
"learning_rate": 6.889478651794789e-12,
"loss": 0.4622,
"step": 2615
},
{
"epoch": 0.9992360580595875,
"grad_norm": 13.14822182317399,
"learning_rate": 3.0619924651364982e-12,
"loss": 0.5448,
"step": 2616
},
{
"epoch": 0.9996180290297937,
"grad_norm": 6.143156160558923,
"learning_rate": 7.654984093274919e-13,
"loss": 0.4924,
"step": 2617
},
{
"epoch": 1.0,
"grad_norm": 2.676839745191836,
"learning_rate": 0.0,
"loss": 0.5148,
"step": 2618
},
{
"epoch": 1.0,
"step": 2618,
"total_flos": 1946642581422080.0,
"train_loss": 0.5203552902473394,
"train_runtime": 30094.9628,
"train_samples_per_second": 22.269,
"train_steps_per_second": 0.087
}
],
"logging_steps": 1.0,
"max_steps": 2618,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1946642581422080.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}