hello4 / 075 /trainer_state.json
hoang14's picture
Upload folder using huggingface_hub
d92afdd verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7512341390280469,
"eval_steps": 500,
"global_step": 1284,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005850733170000365,
"grad_norm": 0.3784801935906415,
"learning_rate": 7.999998479354072e-06,
"loss": 1.1815,
"step": 1
},
{
"epoch": 0.001170146634000073,
"grad_norm": 0.10369689761982695,
"learning_rate": 7.999993917417574e-06,
"loss": 1.0085,
"step": 2
},
{
"epoch": 0.0017552199510001097,
"grad_norm": 0.08165678751844849,
"learning_rate": 7.99998631419436e-06,
"loss": 0.934,
"step": 3
},
{
"epoch": 0.002340293268000146,
"grad_norm": 0.06369675215202451,
"learning_rate": 7.999975669690855e-06,
"loss": 1.0323,
"step": 4
},
{
"epoch": 0.002925366585000183,
"grad_norm": 0.08076101376684344,
"learning_rate": 7.999961983916048e-06,
"loss": 1.0676,
"step": 5
},
{
"epoch": 0.0035104399020002195,
"grad_norm": 0.06288930819751326,
"learning_rate": 7.999945256881502e-06,
"loss": 1.0421,
"step": 6
},
{
"epoch": 0.004095513219000256,
"grad_norm": 0.06851051488443692,
"learning_rate": 7.999925488601348e-06,
"loss": 0.9544,
"step": 7
},
{
"epoch": 0.004680586536000292,
"grad_norm": 0.06007707301663692,
"learning_rate": 7.999902679092287e-06,
"loss": 1.0349,
"step": 8
},
{
"epoch": 0.005265659853000329,
"grad_norm": 0.07674940091671953,
"learning_rate": 7.99987682837359e-06,
"loss": 1.1809,
"step": 9
},
{
"epoch": 0.005850733170000366,
"grad_norm": 0.06356726822529891,
"learning_rate": 7.999847936467093e-06,
"loss": 0.9502,
"step": 10
},
{
"epoch": 0.006435806487000402,
"grad_norm": 0.060093989725189235,
"learning_rate": 7.999816003397203e-06,
"loss": 1.0007,
"step": 11
},
{
"epoch": 0.007020879804000439,
"grad_norm": 0.04797598057222403,
"learning_rate": 7.999781029190902e-06,
"loss": 1.0316,
"step": 12
},
{
"epoch": 0.007605953121000476,
"grad_norm": 0.08827709210231136,
"learning_rate": 7.999743013877734e-06,
"loss": 1.0688,
"step": 13
},
{
"epoch": 0.008191026438000511,
"grad_norm": 0.04993529689396162,
"learning_rate": 7.999701957489811e-06,
"loss": 0.9695,
"step": 14
},
{
"epoch": 0.008776099755000549,
"grad_norm": 0.053685150458545855,
"learning_rate": 7.999657860061823e-06,
"loss": 1.0826,
"step": 15
},
{
"epoch": 0.009361173072000585,
"grad_norm": 0.04357397086005245,
"learning_rate": 7.99961072163102e-06,
"loss": 0.9788,
"step": 16
},
{
"epoch": 0.009946246389000622,
"grad_norm": 0.051838589957396514,
"learning_rate": 7.999560542237226e-06,
"loss": 1.0222,
"step": 17
},
{
"epoch": 0.010531319706000658,
"grad_norm": 0.04948384582310689,
"learning_rate": 7.999507321922833e-06,
"loss": 0.9882,
"step": 18
},
{
"epoch": 0.011116393023000696,
"grad_norm": 0.05488935540485584,
"learning_rate": 7.9994510607328e-06,
"loss": 1.0124,
"step": 19
},
{
"epoch": 0.011701466340000731,
"grad_norm": 0.04632979465330071,
"learning_rate": 7.999391758714659e-06,
"loss": 0.9646,
"step": 20
},
{
"epoch": 0.012286539657000767,
"grad_norm": 0.04102257708769348,
"learning_rate": 7.999329415918508e-06,
"loss": 0.8946,
"step": 21
},
{
"epoch": 0.012871612974000805,
"grad_norm": 0.045466366302330634,
"learning_rate": 7.999264032397013e-06,
"loss": 0.9973,
"step": 22
},
{
"epoch": 0.01345668629100084,
"grad_norm": 0.04440538659368297,
"learning_rate": 7.99919560820541e-06,
"loss": 1.0596,
"step": 23
},
{
"epoch": 0.014041759608000878,
"grad_norm": 0.042500986542824803,
"learning_rate": 7.999124143401507e-06,
"loss": 0.9303,
"step": 24
},
{
"epoch": 0.014626832925000914,
"grad_norm": 0.05180121412922999,
"learning_rate": 7.999049638045675e-06,
"loss": 0.9412,
"step": 25
},
{
"epoch": 0.015211906242000951,
"grad_norm": 0.06441302150178038,
"learning_rate": 7.998972092200859e-06,
"loss": 0.9986,
"step": 26
},
{
"epoch": 0.015796979559000987,
"grad_norm": 0.05297417453471626,
"learning_rate": 7.998891505932565e-06,
"loss": 1.0701,
"step": 27
},
{
"epoch": 0.016382052876001023,
"grad_norm": 0.04305692283062352,
"learning_rate": 7.998807879308878e-06,
"loss": 0.9823,
"step": 28
},
{
"epoch": 0.016967126193001062,
"grad_norm": 0.048331091733781785,
"learning_rate": 7.998721212400443e-06,
"loss": 0.9145,
"step": 29
},
{
"epoch": 0.017552199510001098,
"grad_norm": 0.04249327256576877,
"learning_rate": 7.998631505280477e-06,
"loss": 0.9469,
"step": 30
},
{
"epoch": 0.018137272827001134,
"grad_norm": 0.042672714299481426,
"learning_rate": 7.998538758024765e-06,
"loss": 0.9814,
"step": 31
},
{
"epoch": 0.01872234614400117,
"grad_norm": 0.04643044969291229,
"learning_rate": 7.998442970711661e-06,
"loss": 0.9342,
"step": 32
},
{
"epoch": 0.019307419461001205,
"grad_norm": 0.042411147754236,
"learning_rate": 7.998344143422087e-06,
"loss": 0.8851,
"step": 33
},
{
"epoch": 0.019892492778001244,
"grad_norm": 0.04407093546490425,
"learning_rate": 7.998242276239527e-06,
"loss": 1.0026,
"step": 34
},
{
"epoch": 0.02047756609500128,
"grad_norm": 0.044039695433035646,
"learning_rate": 7.998137369250046e-06,
"loss": 0.9637,
"step": 35
},
{
"epoch": 0.021062639412001316,
"grad_norm": 0.04258725681727216,
"learning_rate": 7.998029422542267e-06,
"loss": 0.9249,
"step": 36
},
{
"epoch": 0.021647712729001352,
"grad_norm": 0.045486820543060684,
"learning_rate": 7.997918436207383e-06,
"loss": 1.005,
"step": 37
},
{
"epoch": 0.02223278604600139,
"grad_norm": 0.03843585783476988,
"learning_rate": 7.997804410339156e-06,
"loss": 0.9815,
"step": 38
},
{
"epoch": 0.022817859363001427,
"grad_norm": 0.05544665751989288,
"learning_rate": 7.997687345033915e-06,
"loss": 0.94,
"step": 39
},
{
"epoch": 0.023402932680001463,
"grad_norm": 0.041440042889966715,
"learning_rate": 7.99756724039056e-06,
"loss": 0.9337,
"step": 40
},
{
"epoch": 0.0239880059970015,
"grad_norm": 0.040659761048442974,
"learning_rate": 7.997444096510552e-06,
"loss": 0.897,
"step": 41
},
{
"epoch": 0.024573079314001534,
"grad_norm": 0.06541743477477353,
"learning_rate": 7.997317913497925e-06,
"loss": 0.9325,
"step": 42
},
{
"epoch": 0.025158152631001574,
"grad_norm": 0.04101226832374059,
"learning_rate": 7.997188691459279e-06,
"loss": 0.9323,
"step": 43
},
{
"epoch": 0.02574322594800161,
"grad_norm": 0.03954326036578227,
"learning_rate": 7.997056430503782e-06,
"loss": 0.8793,
"step": 44
},
{
"epoch": 0.026328299265001645,
"grad_norm": 0.0396268421897809,
"learning_rate": 7.996921130743165e-06,
"loss": 0.9849,
"step": 45
},
{
"epoch": 0.02691337258200168,
"grad_norm": 0.03855187149117414,
"learning_rate": 7.996782792291732e-06,
"loss": 0.9328,
"step": 46
},
{
"epoch": 0.02749844589900172,
"grad_norm": 0.03791722568157901,
"learning_rate": 7.996641415266355e-06,
"loss": 0.9293,
"step": 47
},
{
"epoch": 0.028083519216001756,
"grad_norm": 0.04497262810695708,
"learning_rate": 7.996496999786465e-06,
"loss": 0.9884,
"step": 48
},
{
"epoch": 0.02866859253300179,
"grad_norm": 0.041089653577607424,
"learning_rate": 7.996349545974065e-06,
"loss": 0.9305,
"step": 49
},
{
"epoch": 0.029253665850001827,
"grad_norm": 0.04162057876282252,
"learning_rate": 7.996199053953729e-06,
"loss": 1.0771,
"step": 50
},
{
"epoch": 0.029838739167001863,
"grad_norm": 0.045060287442874644,
"learning_rate": 7.996045523852587e-06,
"loss": 0.9819,
"step": 51
},
{
"epoch": 0.030423812484001903,
"grad_norm": 0.03950486400361273,
"learning_rate": 7.995888955800346e-06,
"loss": 0.8911,
"step": 52
},
{
"epoch": 0.03100888580100194,
"grad_norm": 0.040638774400984866,
"learning_rate": 7.995729349929275e-06,
"loss": 0.9289,
"step": 53
},
{
"epoch": 0.031593959118001974,
"grad_norm": 0.04034434081592363,
"learning_rate": 7.995566706374206e-06,
"loss": 0.9529,
"step": 54
},
{
"epoch": 0.03217903243500201,
"grad_norm": 0.04315614004870732,
"learning_rate": 7.995401025272545e-06,
"loss": 0.9554,
"step": 55
},
{
"epoch": 0.032764105752002046,
"grad_norm": 0.03862522293897866,
"learning_rate": 7.995232306764258e-06,
"loss": 0.9604,
"step": 56
},
{
"epoch": 0.03334917906900208,
"grad_norm": 0.042952485422922036,
"learning_rate": 7.995060550991879e-06,
"loss": 0.9731,
"step": 57
},
{
"epoch": 0.033934252386002124,
"grad_norm": 0.04399639647859174,
"learning_rate": 7.994885758100508e-06,
"loss": 0.868,
"step": 58
},
{
"epoch": 0.03451932570300216,
"grad_norm": 0.03794998627596821,
"learning_rate": 7.99470792823781e-06,
"loss": 0.9176,
"step": 59
},
{
"epoch": 0.035104399020002196,
"grad_norm": 0.046410465067380204,
"learning_rate": 7.99452706155402e-06,
"loss": 0.9456,
"step": 60
},
{
"epoch": 0.03568947233700223,
"grad_norm": 0.04773216743183266,
"learning_rate": 7.994343158201927e-06,
"loss": 1.0049,
"step": 61
},
{
"epoch": 0.03627454565400227,
"grad_norm": 0.04395331205981261,
"learning_rate": 7.994156218336901e-06,
"loss": 0.9197,
"step": 62
},
{
"epoch": 0.0368596189710023,
"grad_norm": 0.04353152162367831,
"learning_rate": 7.993966242116865e-06,
"loss": 0.8978,
"step": 63
},
{
"epoch": 0.03744469228800234,
"grad_norm": 0.041195331266407585,
"learning_rate": 7.993773229702312e-06,
"loss": 0.965,
"step": 64
},
{
"epoch": 0.038029765605002375,
"grad_norm": 0.050163343033375746,
"learning_rate": 7.993577181256304e-06,
"loss": 0.8876,
"step": 65
},
{
"epoch": 0.03861483892200241,
"grad_norm": 0.06660490581550026,
"learning_rate": 7.993378096944456e-06,
"loss": 0.9082,
"step": 66
},
{
"epoch": 0.03919991223900245,
"grad_norm": 0.03984407119317513,
"learning_rate": 7.99317597693496e-06,
"loss": 0.9233,
"step": 67
},
{
"epoch": 0.03978498555600249,
"grad_norm": 0.04866369054757869,
"learning_rate": 7.992970821398567e-06,
"loss": 0.9349,
"step": 68
},
{
"epoch": 0.040370058873002525,
"grad_norm": 0.04990567711730111,
"learning_rate": 7.99276263050859e-06,
"loss": 0.985,
"step": 69
},
{
"epoch": 0.04095513219000256,
"grad_norm": 0.17411986574945643,
"learning_rate": 7.992551404440914e-06,
"loss": 1.0208,
"step": 70
},
{
"epoch": 0.041540205507002596,
"grad_norm": 0.04353986815925501,
"learning_rate": 7.992337143373981e-06,
"loss": 0.9633,
"step": 71
},
{
"epoch": 0.04212527882400263,
"grad_norm": 0.05318272939257367,
"learning_rate": 7.9921198474888e-06,
"loss": 1.0015,
"step": 72
},
{
"epoch": 0.04271035214100267,
"grad_norm": 0.059411931897509304,
"learning_rate": 7.991899516968942e-06,
"loss": 1.0224,
"step": 73
},
{
"epoch": 0.043295425458002704,
"grad_norm": 0.05746696657441689,
"learning_rate": 7.991676152000545e-06,
"loss": 0.9817,
"step": 74
},
{
"epoch": 0.04388049877500274,
"grad_norm": 0.03994063377968752,
"learning_rate": 7.991449752772307e-06,
"loss": 0.899,
"step": 75
},
{
"epoch": 0.04446557209200278,
"grad_norm": 0.05114807327866483,
"learning_rate": 7.991220319475492e-06,
"loss": 0.9627,
"step": 76
},
{
"epoch": 0.04505064540900282,
"grad_norm": 0.043413594203031196,
"learning_rate": 7.990987852303923e-06,
"loss": 0.9385,
"step": 77
},
{
"epoch": 0.045635718726002854,
"grad_norm": 0.044141893159488445,
"learning_rate": 7.990752351453994e-06,
"loss": 0.9214,
"step": 78
},
{
"epoch": 0.04622079204300289,
"grad_norm": 0.07098639926950194,
"learning_rate": 7.990513817124652e-06,
"loss": 0.9762,
"step": 79
},
{
"epoch": 0.046805865360002925,
"grad_norm": 0.04487096138718826,
"learning_rate": 7.990272249517416e-06,
"loss": 0.9379,
"step": 80
},
{
"epoch": 0.04739093867700296,
"grad_norm": 0.040488382771263605,
"learning_rate": 7.990027648836359e-06,
"loss": 0.9563,
"step": 81
},
{
"epoch": 0.047976011994003,
"grad_norm": 0.04352730030611419,
"learning_rate": 7.989780015288123e-06,
"loss": 0.9488,
"step": 82
},
{
"epoch": 0.04856108531100303,
"grad_norm": 0.04413441845817798,
"learning_rate": 7.98952934908191e-06,
"loss": 1.0336,
"step": 83
},
{
"epoch": 0.04914615862800307,
"grad_norm": 0.04193745680850997,
"learning_rate": 7.989275650429482e-06,
"loss": 0.8785,
"step": 84
},
{
"epoch": 0.04973123194500311,
"grad_norm": 0.04475381091812719,
"learning_rate": 7.989018919545165e-06,
"loss": 0.9443,
"step": 85
},
{
"epoch": 0.05031630526200315,
"grad_norm": 0.04234754821679888,
"learning_rate": 7.988759156645845e-06,
"loss": 0.9564,
"step": 86
},
{
"epoch": 0.05090137857900318,
"grad_norm": 0.054324472784088765,
"learning_rate": 7.988496361950972e-06,
"loss": 0.9824,
"step": 87
},
{
"epoch": 0.05148645189600322,
"grad_norm": 0.051967709927586946,
"learning_rate": 7.988230535682556e-06,
"loss": 0.914,
"step": 88
},
{
"epoch": 0.052071525213003254,
"grad_norm": 0.03996756018971234,
"learning_rate": 7.987961678065169e-06,
"loss": 0.9421,
"step": 89
},
{
"epoch": 0.05265659853000329,
"grad_norm": 0.07556612961446735,
"learning_rate": 7.987689789325939e-06,
"loss": 0.9791,
"step": 90
},
{
"epoch": 0.053241671847003326,
"grad_norm": 0.04557130060143385,
"learning_rate": 7.987414869694562e-06,
"loss": 0.9318,
"step": 91
},
{
"epoch": 0.05382674516400336,
"grad_norm": 0.0424700455078842,
"learning_rate": 7.98713691940329e-06,
"loss": 0.9745,
"step": 92
},
{
"epoch": 0.0544118184810034,
"grad_norm": 0.07879025661937863,
"learning_rate": 7.986855938686935e-06,
"loss": 0.9614,
"step": 93
},
{
"epoch": 0.05499689179800344,
"grad_norm": 0.04227267907542001,
"learning_rate": 7.986571927782871e-06,
"loss": 0.9317,
"step": 94
},
{
"epoch": 0.055581965115003476,
"grad_norm": 0.042275976377898476,
"learning_rate": 7.986284886931033e-06,
"loss": 0.8982,
"step": 95
},
{
"epoch": 0.05616703843200351,
"grad_norm": 0.0457764180229334,
"learning_rate": 7.985994816373913e-06,
"loss": 0.8803,
"step": 96
},
{
"epoch": 0.05675211174900355,
"grad_norm": 0.04305545472943831,
"learning_rate": 7.985701716356565e-06,
"loss": 0.9786,
"step": 97
},
{
"epoch": 0.05733718506600358,
"grad_norm": 0.045580685701444475,
"learning_rate": 7.985405587126597e-06,
"loss": 0.9036,
"step": 98
},
{
"epoch": 0.05792225838300362,
"grad_norm": 0.0442896526405325,
"learning_rate": 7.985106428934183e-06,
"loss": 0.9871,
"step": 99
},
{
"epoch": 0.058507331700003655,
"grad_norm": 0.04396583794537118,
"learning_rate": 7.984804242032051e-06,
"loss": 1.0145,
"step": 100
},
{
"epoch": 0.05909240501700369,
"grad_norm": 0.04488641679254391,
"learning_rate": 7.984499026675494e-06,
"loss": 0.9673,
"step": 101
},
{
"epoch": 0.05967747833400373,
"grad_norm": 0.0402258783966036,
"learning_rate": 7.984190783122351e-06,
"loss": 1.0458,
"step": 102
},
{
"epoch": 0.06026255165100377,
"grad_norm": 0.048502182555142354,
"learning_rate": 7.983879511633036e-06,
"loss": 0.8879,
"step": 103
},
{
"epoch": 0.060847624968003805,
"grad_norm": 0.04818474096979939,
"learning_rate": 7.983565212470504e-06,
"loss": 0.9467,
"step": 104
},
{
"epoch": 0.06143269828500384,
"grad_norm": 0.04176125713160911,
"learning_rate": 7.983247885900283e-06,
"loss": 0.9266,
"step": 105
},
{
"epoch": 0.06201777160200388,
"grad_norm": 0.04212530605033032,
"learning_rate": 7.982927532190447e-06,
"loss": 0.9179,
"step": 106
},
{
"epoch": 0.06260284491900392,
"grad_norm": 0.037198964746819226,
"learning_rate": 7.982604151611633e-06,
"loss": 1.0018,
"step": 107
},
{
"epoch": 0.06318791823600395,
"grad_norm": 0.04090799839826158,
"learning_rate": 7.982277744437035e-06,
"loss": 0.8756,
"step": 108
},
{
"epoch": 0.06377299155300399,
"grad_norm": 0.03857422273289164,
"learning_rate": 7.981948310942402e-06,
"loss": 0.8855,
"step": 109
},
{
"epoch": 0.06435806487000402,
"grad_norm": 0.05241752689478532,
"learning_rate": 7.981615851406039e-06,
"loss": 0.8862,
"step": 110
},
{
"epoch": 0.06494313818700406,
"grad_norm": 0.04086973369607326,
"learning_rate": 7.981280366108814e-06,
"loss": 0.9221,
"step": 111
},
{
"epoch": 0.06552821150400409,
"grad_norm": 0.03931043694281754,
"learning_rate": 7.98094185533414e-06,
"loss": 0.9417,
"step": 112
},
{
"epoch": 0.06611328482100413,
"grad_norm": 0.04169878931857935,
"learning_rate": 7.980600319367995e-06,
"loss": 0.958,
"step": 113
},
{
"epoch": 0.06669835813800416,
"grad_norm": 0.044230077430854955,
"learning_rate": 7.980255758498908e-06,
"loss": 0.9265,
"step": 114
},
{
"epoch": 0.0672834314550042,
"grad_norm": 0.04488148180330816,
"learning_rate": 7.979908173017968e-06,
"loss": 0.8908,
"step": 115
},
{
"epoch": 0.06786850477200425,
"grad_norm": 0.042038370275589085,
"learning_rate": 7.979557563218815e-06,
"loss": 0.8961,
"step": 116
},
{
"epoch": 0.06845357808900428,
"grad_norm": 0.04747064970378541,
"learning_rate": 7.979203929397646e-06,
"loss": 1.0609,
"step": 117
},
{
"epoch": 0.06903865140600432,
"grad_norm": 0.04392999493678844,
"learning_rate": 7.97884727185321e-06,
"loss": 0.9001,
"step": 118
},
{
"epoch": 0.06962372472300435,
"grad_norm": 0.040693633998808994,
"learning_rate": 7.978487590886814e-06,
"loss": 0.8562,
"step": 119
},
{
"epoch": 0.07020879804000439,
"grad_norm": 0.08337676841807191,
"learning_rate": 7.978124886802316e-06,
"loss": 0.9344,
"step": 120
},
{
"epoch": 0.07079387135700442,
"grad_norm": 0.03951441645023813,
"learning_rate": 7.977759159906134e-06,
"loss": 0.9182,
"step": 121
},
{
"epoch": 0.07137894467400446,
"grad_norm": 0.04427536962304041,
"learning_rate": 7.977390410507229e-06,
"loss": 0.9079,
"step": 122
},
{
"epoch": 0.07196401799100449,
"grad_norm": 0.047402666476443076,
"learning_rate": 7.977018638917126e-06,
"loss": 0.9442,
"step": 123
},
{
"epoch": 0.07254909130800453,
"grad_norm": 0.07806155365092546,
"learning_rate": 7.976643845449897e-06,
"loss": 0.9453,
"step": 124
},
{
"epoch": 0.07313416462500458,
"grad_norm": 0.04187989433422361,
"learning_rate": 7.97626603042217e-06,
"loss": 0.9762,
"step": 125
},
{
"epoch": 0.0737192379420046,
"grad_norm": 0.038153971027990764,
"learning_rate": 7.975885194153125e-06,
"loss": 0.9377,
"step": 126
},
{
"epoch": 0.07430431125900465,
"grad_norm": 0.04398811005912341,
"learning_rate": 7.975501336964492e-06,
"loss": 0.9655,
"step": 127
},
{
"epoch": 0.07488938457600468,
"grad_norm": 0.04310577216463342,
"learning_rate": 7.975114459180555e-06,
"loss": 0.9082,
"step": 128
},
{
"epoch": 0.07547445789300472,
"grad_norm": 0.04552741227747631,
"learning_rate": 7.97472456112815e-06,
"loss": 0.8667,
"step": 129
},
{
"epoch": 0.07605953121000475,
"grad_norm": 0.0406705187810207,
"learning_rate": 7.974331643136666e-06,
"loss": 0.9286,
"step": 130
},
{
"epoch": 0.07664460452700479,
"grad_norm": 0.16819457832404855,
"learning_rate": 7.973935705538039e-06,
"loss": 0.9724,
"step": 131
},
{
"epoch": 0.07722967784400482,
"grad_norm": 0.04524796652654962,
"learning_rate": 7.973536748666756e-06,
"loss": 0.961,
"step": 132
},
{
"epoch": 0.07781475116100486,
"grad_norm": 0.051171072191227115,
"learning_rate": 7.973134772859862e-06,
"loss": 0.9513,
"step": 133
},
{
"epoch": 0.0783998244780049,
"grad_norm": 0.044436153110071305,
"learning_rate": 7.972729778456946e-06,
"loss": 0.9363,
"step": 134
},
{
"epoch": 0.07898489779500494,
"grad_norm": 0.04485419817375143,
"learning_rate": 7.97232176580015e-06,
"loss": 0.8583,
"step": 135
},
{
"epoch": 0.07956997111200498,
"grad_norm": 0.044866876801222304,
"learning_rate": 7.971910735234161e-06,
"loss": 0.9859,
"step": 136
},
{
"epoch": 0.080155044429005,
"grad_norm": 0.03934716109284772,
"learning_rate": 7.971496687106219e-06,
"loss": 0.8592,
"step": 137
},
{
"epoch": 0.08074011774600505,
"grad_norm": 0.041852675369480244,
"learning_rate": 7.971079621766117e-06,
"loss": 0.9353,
"step": 138
},
{
"epoch": 0.08132519106300508,
"grad_norm": 0.3430745253576026,
"learning_rate": 7.97065953956619e-06,
"loss": 0.9602,
"step": 139
},
{
"epoch": 0.08191026438000512,
"grad_norm": 0.44783710464834237,
"learning_rate": 7.970236440861327e-06,
"loss": 0.9833,
"step": 140
},
{
"epoch": 0.08249533769700515,
"grad_norm": 0.29750847371388817,
"learning_rate": 7.96981032600896e-06,
"loss": 0.8244,
"step": 141
},
{
"epoch": 0.08308041101400519,
"grad_norm": 0.169443611740874,
"learning_rate": 7.969381195369076e-06,
"loss": 0.8983,
"step": 142
},
{
"epoch": 0.08366548433100524,
"grad_norm": 0.04111262427570532,
"learning_rate": 7.968949049304204e-06,
"loss": 0.9552,
"step": 143
},
{
"epoch": 0.08425055764800526,
"grad_norm": 0.04087231642049412,
"learning_rate": 7.968513888179421e-06,
"loss": 0.9051,
"step": 144
},
{
"epoch": 0.08483563096500531,
"grad_norm": 0.05663350951363164,
"learning_rate": 7.968075712362356e-06,
"loss": 0.8366,
"step": 145
},
{
"epoch": 0.08542070428200534,
"grad_norm": 0.04202895629977615,
"learning_rate": 7.967634522223179e-06,
"loss": 0.8115,
"step": 146
},
{
"epoch": 0.08600577759900538,
"grad_norm": 0.04238111014919017,
"learning_rate": 7.96719031813461e-06,
"loss": 0.9086,
"step": 147
},
{
"epoch": 0.08659085091600541,
"grad_norm": 0.04226501020383857,
"learning_rate": 7.966743100471913e-06,
"loss": 0.9286,
"step": 148
},
{
"epoch": 0.08717592423300545,
"grad_norm": 0.046325797442375215,
"learning_rate": 7.9662928696129e-06,
"loss": 0.9393,
"step": 149
},
{
"epoch": 0.08776099755000548,
"grad_norm": 0.05532068772615188,
"learning_rate": 7.965839625937926e-06,
"loss": 0.9202,
"step": 150
},
{
"epoch": 0.08834607086700552,
"grad_norm": 0.039360829289514745,
"learning_rate": 7.965383369829894e-06,
"loss": 0.908,
"step": 151
},
{
"epoch": 0.08893114418400556,
"grad_norm": 0.04965498407233415,
"learning_rate": 7.964924101674252e-06,
"loss": 0.9406,
"step": 152
},
{
"epoch": 0.0895162175010056,
"grad_norm": 0.04177674200968805,
"learning_rate": 7.964461821858987e-06,
"loss": 0.8933,
"step": 153
},
{
"epoch": 0.09010129081800564,
"grad_norm": 0.04711456314429998,
"learning_rate": 7.963996530774639e-06,
"loss": 1.0111,
"step": 154
},
{
"epoch": 0.09068636413500566,
"grad_norm": 0.045100723994096155,
"learning_rate": 7.963528228814285e-06,
"loss": 0.9806,
"step": 155
},
{
"epoch": 0.09127143745200571,
"grad_norm": 0.09448573408012474,
"learning_rate": 7.96305691637355e-06,
"loss": 0.9142,
"step": 156
},
{
"epoch": 0.09185651076900574,
"grad_norm": 0.04297907772876167,
"learning_rate": 7.962582593850596e-06,
"loss": 0.8852,
"step": 157
},
{
"epoch": 0.09244158408600578,
"grad_norm": 0.05151683390684187,
"learning_rate": 7.962105261646138e-06,
"loss": 0.9975,
"step": 158
},
{
"epoch": 0.09302665740300581,
"grad_norm": 0.07242957126413647,
"learning_rate": 7.961624920163423e-06,
"loss": 0.9196,
"step": 159
},
{
"epoch": 0.09361173072000585,
"grad_norm": 0.08840439035290122,
"learning_rate": 7.961141569808248e-06,
"loss": 0.92,
"step": 160
},
{
"epoch": 0.0941968040370059,
"grad_norm": 0.11047843778949552,
"learning_rate": 7.960655210988948e-06,
"loss": 0.9452,
"step": 161
},
{
"epoch": 0.09478187735400592,
"grad_norm": 0.04275273565605597,
"learning_rate": 7.960165844116399e-06,
"loss": 0.9641,
"step": 162
},
{
"epoch": 0.09536695067100597,
"grad_norm": 0.05662855927390397,
"learning_rate": 7.959673469604025e-06,
"loss": 0.9354,
"step": 163
},
{
"epoch": 0.095952023988006,
"grad_norm": 0.04769002643125012,
"learning_rate": 7.959178087867779e-06,
"loss": 0.9087,
"step": 164
},
{
"epoch": 0.09653709730500604,
"grad_norm": 0.050744023655463925,
"learning_rate": 7.958679699326164e-06,
"loss": 0.9561,
"step": 165
},
{
"epoch": 0.09712217062200607,
"grad_norm": 0.058451109924341715,
"learning_rate": 7.958178304400222e-06,
"loss": 0.9881,
"step": 166
},
{
"epoch": 0.09770724393900611,
"grad_norm": 0.04338399471073774,
"learning_rate": 7.95767390351353e-06,
"loss": 0.9705,
"step": 167
},
{
"epoch": 0.09829231725600614,
"grad_norm": 0.04901291931441639,
"learning_rate": 7.95716649709221e-06,
"loss": 0.9229,
"step": 168
},
{
"epoch": 0.09887739057300618,
"grad_norm": 0.044284928725944674,
"learning_rate": 7.95665608556492e-06,
"loss": 0.9493,
"step": 169
},
{
"epoch": 0.09946246389000622,
"grad_norm": 0.04914804115067185,
"learning_rate": 7.956142669362855e-06,
"loss": 0.9879,
"step": 170
},
{
"epoch": 0.10004753720700625,
"grad_norm": 0.04087870881565583,
"learning_rate": 7.955626248919752e-06,
"loss": 0.9435,
"step": 171
},
{
"epoch": 0.1006326105240063,
"grad_norm": 0.04938743203389444,
"learning_rate": 7.955106824671888e-06,
"loss": 0.8813,
"step": 172
},
{
"epoch": 0.10121768384100632,
"grad_norm": 0.04440441336302364,
"learning_rate": 7.95458439705807e-06,
"loss": 0.954,
"step": 173
},
{
"epoch": 0.10180275715800637,
"grad_norm": 0.04052619663535755,
"learning_rate": 7.954058966519649e-06,
"loss": 0.8759,
"step": 174
},
{
"epoch": 0.1023878304750064,
"grad_norm": 0.04315943430366373,
"learning_rate": 7.953530533500507e-06,
"loss": 0.8621,
"step": 175
},
{
"epoch": 0.10297290379200644,
"grad_norm": 0.07065437183104553,
"learning_rate": 7.952999098447072e-06,
"loss": 0.9796,
"step": 176
},
{
"epoch": 0.10355797710900647,
"grad_norm": 0.04286710979601013,
"learning_rate": 7.952464661808297e-06,
"loss": 0.9187,
"step": 177
},
{
"epoch": 0.10414305042600651,
"grad_norm": 0.044637305084684484,
"learning_rate": 7.951927224035678e-06,
"loss": 0.8772,
"step": 178
},
{
"epoch": 0.10472812374300655,
"grad_norm": 0.04369651860095562,
"learning_rate": 7.951386785583244e-06,
"loss": 0.8969,
"step": 179
},
{
"epoch": 0.10531319706000658,
"grad_norm": 0.05048322425196143,
"learning_rate": 7.950843346907559e-06,
"loss": 0.8907,
"step": 180
},
{
"epoch": 0.10589827037700662,
"grad_norm": 0.03884181381210202,
"learning_rate": 7.95029690846772e-06,
"loss": 0.931,
"step": 181
},
{
"epoch": 0.10648334369400665,
"grad_norm": 0.05060670376656713,
"learning_rate": 7.949747470725362e-06,
"loss": 0.8624,
"step": 182
},
{
"epoch": 0.1070684170110067,
"grad_norm": 0.04162121727818916,
"learning_rate": 7.949195034144653e-06,
"loss": 0.9141,
"step": 183
},
{
"epoch": 0.10765349032800672,
"grad_norm": 0.04729330740672752,
"learning_rate": 7.94863959919229e-06,
"loss": 0.921,
"step": 184
},
{
"epoch": 0.10823856364500677,
"grad_norm": 0.04459204791909808,
"learning_rate": 7.948081166337509e-06,
"loss": 0.8993,
"step": 185
},
{
"epoch": 0.1088236369620068,
"grad_norm": 0.04645986236352756,
"learning_rate": 7.947519736052075e-06,
"loss": 0.9158,
"step": 186
},
{
"epoch": 0.10940871027900684,
"grad_norm": 0.04275967646092167,
"learning_rate": 7.946955308810285e-06,
"loss": 0.9387,
"step": 187
},
{
"epoch": 0.10999378359600688,
"grad_norm": 0.07421648959437231,
"learning_rate": 7.94638788508897e-06,
"loss": 0.8497,
"step": 188
},
{
"epoch": 0.11057885691300691,
"grad_norm": 0.04955170539289863,
"learning_rate": 7.945817465367493e-06,
"loss": 0.8525,
"step": 189
},
{
"epoch": 0.11116393023000695,
"grad_norm": 0.12013696366466048,
"learning_rate": 7.945244050127744e-06,
"loss": 0.9616,
"step": 190
},
{
"epoch": 0.11174900354700698,
"grad_norm": 0.040829512145039985,
"learning_rate": 7.944667639854148e-06,
"loss": 0.8344,
"step": 191
},
{
"epoch": 0.11233407686400702,
"grad_norm": 0.05068364591326354,
"learning_rate": 7.944088235033657e-06,
"loss": 0.9403,
"step": 192
},
{
"epoch": 0.11291915018100705,
"grad_norm": 0.0499939002215986,
"learning_rate": 7.943505836155753e-06,
"loss": 0.9475,
"step": 193
},
{
"epoch": 0.1135042234980071,
"grad_norm": 0.05407026250866459,
"learning_rate": 7.94292044371245e-06,
"loss": 0.9101,
"step": 194
},
{
"epoch": 0.11408929681500712,
"grad_norm": 0.06417314751489454,
"learning_rate": 7.94233205819829e-06,
"loss": 0.8787,
"step": 195
},
{
"epoch": 0.11467437013200717,
"grad_norm": 0.03871241656337873,
"learning_rate": 7.941740680110343e-06,
"loss": 0.9059,
"step": 196
},
{
"epoch": 0.11525944344900721,
"grad_norm": 0.041501526897382096,
"learning_rate": 7.941146309948205e-06,
"loss": 0.8946,
"step": 197
},
{
"epoch": 0.11584451676600724,
"grad_norm": 0.037938175729775744,
"learning_rate": 7.940548948214005e-06,
"loss": 0.8534,
"step": 198
},
{
"epoch": 0.11642959008300728,
"grad_norm": 0.038697315108935856,
"learning_rate": 7.939948595412394e-06,
"loss": 0.9671,
"step": 199
},
{
"epoch": 0.11701466340000731,
"grad_norm": 0.040128164685532784,
"learning_rate": 7.939345252050552e-06,
"loss": 0.9452,
"step": 200
},
{
"epoch": 0.11759973671700735,
"grad_norm": 0.04159138776071103,
"learning_rate": 7.938738918638187e-06,
"loss": 0.9525,
"step": 201
},
{
"epoch": 0.11818481003400738,
"grad_norm": 0.044356876333774484,
"learning_rate": 7.93812959568753e-06,
"loss": 0.8863,
"step": 202
},
{
"epoch": 0.11876988335100742,
"grad_norm": 0.04110203273326361,
"learning_rate": 7.93751728371334e-06,
"loss": 0.8585,
"step": 203
},
{
"epoch": 0.11935495666800745,
"grad_norm": 0.03907623380015084,
"learning_rate": 7.9369019832329e-06,
"loss": 0.8901,
"step": 204
},
{
"epoch": 0.1199400299850075,
"grad_norm": 0.06841747716076531,
"learning_rate": 7.936283694766016e-06,
"loss": 0.9382,
"step": 205
},
{
"epoch": 0.12052510330200754,
"grad_norm": 0.03864267029597622,
"learning_rate": 7.935662418835023e-06,
"loss": 0.9445,
"step": 206
},
{
"epoch": 0.12111017661900757,
"grad_norm": 0.04187477888552647,
"learning_rate": 7.935038155964775e-06,
"loss": 0.949,
"step": 207
},
{
"epoch": 0.12169524993600761,
"grad_norm": 0.03844046778107278,
"learning_rate": 7.934410906682653e-06,
"loss": 0.8185,
"step": 208
},
{
"epoch": 0.12228032325300764,
"grad_norm": 0.03894778215959397,
"learning_rate": 7.933780671518558e-06,
"loss": 0.8226,
"step": 209
},
{
"epoch": 0.12286539657000768,
"grad_norm": 0.05316470088962357,
"learning_rate": 7.933147451004914e-06,
"loss": 0.9149,
"step": 210
},
{
"epoch": 0.12345046988700771,
"grad_norm": 0.04054196015487159,
"learning_rate": 7.932511245676669e-06,
"loss": 0.907,
"step": 211
},
{
"epoch": 0.12403554320400775,
"grad_norm": 0.04197248747401694,
"learning_rate": 7.931872056071292e-06,
"loss": 0.8974,
"step": 212
},
{
"epoch": 0.12462061652100778,
"grad_norm": 0.04107563523275673,
"learning_rate": 7.931229882728771e-06,
"loss": 0.8758,
"step": 213
},
{
"epoch": 0.12520568983800784,
"grad_norm": 0.04898006391812857,
"learning_rate": 7.930584726191616e-06,
"loss": 0.9015,
"step": 214
},
{
"epoch": 0.12579076315500787,
"grad_norm": 0.053828637223887026,
"learning_rate": 7.92993658700486e-06,
"loss": 0.9095,
"step": 215
},
{
"epoch": 0.1263758364720079,
"grad_norm": 0.041596264391364365,
"learning_rate": 7.929285465716051e-06,
"loss": 0.9324,
"step": 216
},
{
"epoch": 0.12696090978900793,
"grad_norm": 0.04270202089537964,
"learning_rate": 7.928631362875258e-06,
"loss": 0.9712,
"step": 217
},
{
"epoch": 0.12754598310600798,
"grad_norm": 0.04508600982672278,
"learning_rate": 7.927974279035069e-06,
"loss": 0.8526,
"step": 218
},
{
"epoch": 0.128131056423008,
"grad_norm": 0.04210505656160864,
"learning_rate": 7.927314214750592e-06,
"loss": 0.8483,
"step": 219
},
{
"epoch": 0.12871612974000804,
"grad_norm": 0.04228456764136032,
"learning_rate": 7.926651170579451e-06,
"loss": 1.0049,
"step": 220
},
{
"epoch": 0.12930120305700807,
"grad_norm": 0.039866451292078504,
"learning_rate": 7.92598514708179e-06,
"loss": 0.9421,
"step": 221
},
{
"epoch": 0.12988627637400812,
"grad_norm": 0.037613132725661406,
"learning_rate": 7.925316144820263e-06,
"loss": 0.9474,
"step": 222
},
{
"epoch": 0.13047134969100815,
"grad_norm": 0.04168086524600805,
"learning_rate": 7.92464416436005e-06,
"loss": 0.9058,
"step": 223
},
{
"epoch": 0.13105642300800818,
"grad_norm": 0.039621964711338775,
"learning_rate": 7.923969206268839e-06,
"loss": 0.9086,
"step": 224
},
{
"epoch": 0.13164149632500824,
"grad_norm": 0.05209282643387043,
"learning_rate": 7.923291271116838e-06,
"loss": 0.9298,
"step": 225
},
{
"epoch": 0.13222656964200827,
"grad_norm": 0.044258185244179175,
"learning_rate": 7.92261035947677e-06,
"loss": 0.8925,
"step": 226
},
{
"epoch": 0.1328116429590083,
"grad_norm": 0.039844108391859055,
"learning_rate": 7.92192647192387e-06,
"loss": 0.8392,
"step": 227
},
{
"epoch": 0.13339671627600833,
"grad_norm": 0.044744651789733476,
"learning_rate": 7.92123960903589e-06,
"loss": 0.9329,
"step": 228
},
{
"epoch": 0.13398178959300838,
"grad_norm": 0.04494192821446448,
"learning_rate": 7.92054977139309e-06,
"loss": 0.8606,
"step": 229
},
{
"epoch": 0.1345668629100084,
"grad_norm": 0.05863838322698434,
"learning_rate": 7.919856959578252e-06,
"loss": 0.9302,
"step": 230
},
{
"epoch": 0.13515193622700844,
"grad_norm": 0.04085984556832983,
"learning_rate": 7.919161174176663e-06,
"loss": 0.9993,
"step": 231
},
{
"epoch": 0.1357370095440085,
"grad_norm": 0.04500674877406008,
"learning_rate": 7.918462415776125e-06,
"loss": 0.9377,
"step": 232
},
{
"epoch": 0.13632208286100853,
"grad_norm": 0.039173528958006375,
"learning_rate": 7.917760684966955e-06,
"loss": 0.813,
"step": 233
},
{
"epoch": 0.13690715617800855,
"grad_norm": 0.04298295272921228,
"learning_rate": 7.91705598234197e-06,
"loss": 0.8952,
"step": 234
},
{
"epoch": 0.13749222949500858,
"grad_norm": 0.04028408438122686,
"learning_rate": 7.916348308496513e-06,
"loss": 0.9051,
"step": 235
},
{
"epoch": 0.13807730281200864,
"grad_norm": 0.06743695497866435,
"learning_rate": 7.915637664028423e-06,
"loss": 0.9475,
"step": 236
},
{
"epoch": 0.13866237612900867,
"grad_norm": 0.03949575625475006,
"learning_rate": 7.914924049538061e-06,
"loss": 0.9316,
"step": 237
},
{
"epoch": 0.1392474494460087,
"grad_norm": 0.04030623921903529,
"learning_rate": 7.914207465628284e-06,
"loss": 0.885,
"step": 238
},
{
"epoch": 0.13983252276300873,
"grad_norm": 0.047528726409039795,
"learning_rate": 7.91348791290447e-06,
"loss": 0.918,
"step": 239
},
{
"epoch": 0.14041759608000878,
"grad_norm": 0.04466991571391728,
"learning_rate": 7.912765391974496e-06,
"loss": 0.9305,
"step": 240
},
{
"epoch": 0.1410026693970088,
"grad_norm": 0.04186868219039162,
"learning_rate": 7.912039903448752e-06,
"loss": 0.9415,
"step": 241
},
{
"epoch": 0.14158774271400884,
"grad_norm": 0.05801571080351748,
"learning_rate": 7.91131144794013e-06,
"loss": 0.8787,
"step": 242
},
{
"epoch": 0.1421728160310089,
"grad_norm": 0.045088484806881386,
"learning_rate": 7.910580026064038e-06,
"loss": 1.0604,
"step": 243
},
{
"epoch": 0.14275788934800893,
"grad_norm": 0.06574863742707004,
"learning_rate": 7.909845638438377e-06,
"loss": 0.9216,
"step": 244
},
{
"epoch": 0.14334296266500896,
"grad_norm": 0.0372095564143721,
"learning_rate": 7.909108285683563e-06,
"loss": 0.9336,
"step": 245
},
{
"epoch": 0.14392803598200898,
"grad_norm": 0.04397627871472844,
"learning_rate": 7.908367968422515e-06,
"loss": 0.9261,
"step": 246
},
{
"epoch": 0.14451310929900904,
"grad_norm": 0.04276892267104892,
"learning_rate": 7.907624687280654e-06,
"loss": 0.8673,
"step": 247
},
{
"epoch": 0.14509818261600907,
"grad_norm": 0.045304865664725656,
"learning_rate": 7.906878442885907e-06,
"loss": 0.9475,
"step": 248
},
{
"epoch": 0.1456832559330091,
"grad_norm": 0.04370076931511423,
"learning_rate": 7.906129235868702e-06,
"loss": 0.8433,
"step": 249
},
{
"epoch": 0.14626832925000915,
"grad_norm": 0.0672588807453601,
"learning_rate": 7.905377066861973e-06,
"loss": 0.9124,
"step": 250
},
{
"epoch": 0.14685340256700918,
"grad_norm": 0.046912566764935076,
"learning_rate": 7.904621936501156e-06,
"loss": 0.9044,
"step": 251
},
{
"epoch": 0.1474384758840092,
"grad_norm": 0.07278415346178851,
"learning_rate": 7.903863845424185e-06,
"loss": 0.901,
"step": 252
},
{
"epoch": 0.14802354920100924,
"grad_norm": 0.04218918671820467,
"learning_rate": 7.9031027942715e-06,
"loss": 0.9019,
"step": 253
},
{
"epoch": 0.1486086225180093,
"grad_norm": 0.03858325311022567,
"learning_rate": 7.90233878368604e-06,
"loss": 0.8601,
"step": 254
},
{
"epoch": 0.14919369583500933,
"grad_norm": 0.04209886442727145,
"learning_rate": 7.90157181431324e-06,
"loss": 0.8681,
"step": 255
},
{
"epoch": 0.14977876915200936,
"grad_norm": 0.049400812285918,
"learning_rate": 7.90080188680104e-06,
"loss": 0.9494,
"step": 256
},
{
"epoch": 0.15036384246900938,
"grad_norm": 0.047941941474994906,
"learning_rate": 7.900029001799882e-06,
"loss": 0.8439,
"step": 257
},
{
"epoch": 0.15094891578600944,
"grad_norm": 0.042101153780900284,
"learning_rate": 7.899253159962694e-06,
"loss": 0.917,
"step": 258
},
{
"epoch": 0.15153398910300947,
"grad_norm": 0.040583990176968884,
"learning_rate": 7.898474361944915e-06,
"loss": 0.9328,
"step": 259
},
{
"epoch": 0.1521190624200095,
"grad_norm": 0.04000469028913075,
"learning_rate": 7.897692608404474e-06,
"loss": 0.9454,
"step": 260
},
{
"epoch": 0.15270413573700956,
"grad_norm": 0.04819787772789072,
"learning_rate": 7.8969079000018e-06,
"loss": 1.0061,
"step": 261
},
{
"epoch": 0.15328920905400958,
"grad_norm": 0.04670811295251141,
"learning_rate": 7.896120237399817e-06,
"loss": 0.8678,
"step": 262
},
{
"epoch": 0.1538742823710096,
"grad_norm": 0.03876277230266676,
"learning_rate": 7.895329621263945e-06,
"loss": 0.9465,
"step": 263
},
{
"epoch": 0.15445935568800964,
"grad_norm": 0.06917560168966953,
"learning_rate": 7.894536052262098e-06,
"loss": 0.9114,
"step": 264
},
{
"epoch": 0.1550444290050097,
"grad_norm": 0.052197549375990714,
"learning_rate": 7.893739531064688e-06,
"loss": 0.8966,
"step": 265
},
{
"epoch": 0.15562950232200973,
"grad_norm": 0.05089520469658166,
"learning_rate": 7.892940058344615e-06,
"loss": 0.8403,
"step": 266
},
{
"epoch": 0.15621457563900976,
"grad_norm": 0.04200303069403226,
"learning_rate": 7.89213763477728e-06,
"loss": 0.7954,
"step": 267
},
{
"epoch": 0.1567996489560098,
"grad_norm": 0.04045997821257577,
"learning_rate": 7.89133226104057e-06,
"loss": 0.9484,
"step": 268
},
{
"epoch": 0.15738472227300984,
"grad_norm": 0.04355833968928891,
"learning_rate": 7.890523937814872e-06,
"loss": 0.8871,
"step": 269
},
{
"epoch": 0.15796979559000987,
"grad_norm": 0.038980722774793516,
"learning_rate": 7.889712665783055e-06,
"loss": 0.8242,
"step": 270
},
{
"epoch": 0.1585548689070099,
"grad_norm": 0.0417431555190703,
"learning_rate": 7.888898445630486e-06,
"loss": 0.918,
"step": 271
},
{
"epoch": 0.15913994222400996,
"grad_norm": 0.04420422353440596,
"learning_rate": 7.888081278045022e-06,
"loss": 0.9345,
"step": 272
},
{
"epoch": 0.15972501554100998,
"grad_norm": 0.043453256463112454,
"learning_rate": 7.88726116371701e-06,
"loss": 0.8439,
"step": 273
},
{
"epoch": 0.16031008885801,
"grad_norm": 0.04018335492133053,
"learning_rate": 7.88643810333928e-06,
"loss": 0.9024,
"step": 274
},
{
"epoch": 0.16089516217501004,
"grad_norm": 0.03885288803364117,
"learning_rate": 7.885612097607161e-06,
"loss": 0.9005,
"step": 275
},
{
"epoch": 0.1614802354920101,
"grad_norm": 0.03940189643105726,
"learning_rate": 7.884783147218464e-06,
"loss": 0.8726,
"step": 276
},
{
"epoch": 0.16206530880901013,
"grad_norm": 0.04322545289358093,
"learning_rate": 7.88395125287349e-06,
"loss": 0.9309,
"step": 277
},
{
"epoch": 0.16265038212601016,
"grad_norm": 0.03991615308835092,
"learning_rate": 7.883116415275022e-06,
"loss": 0.9319,
"step": 278
},
{
"epoch": 0.1632354554430102,
"grad_norm": 0.04774591346437184,
"learning_rate": 7.882278635128339e-06,
"loss": 0.9976,
"step": 279
},
{
"epoch": 0.16382052876001024,
"grad_norm": 0.043726280435400645,
"learning_rate": 7.881437913141196e-06,
"loss": 0.9041,
"step": 280
},
{
"epoch": 0.16440560207701027,
"grad_norm": 0.05349577122745657,
"learning_rate": 7.880594250023842e-06,
"loss": 0.9109,
"step": 281
},
{
"epoch": 0.1649906753940103,
"grad_norm": 0.04664425873865413,
"learning_rate": 7.879747646489002e-06,
"loss": 0.8872,
"step": 282
},
{
"epoch": 0.16557574871101036,
"grad_norm": 0.04689991827127376,
"learning_rate": 7.878898103251891e-06,
"loss": 0.946,
"step": 283
},
{
"epoch": 0.16616082202801039,
"grad_norm": 0.04419788368438809,
"learning_rate": 7.87804562103021e-06,
"loss": 0.8699,
"step": 284
},
{
"epoch": 0.16674589534501041,
"grad_norm": 0.0498909584325992,
"learning_rate": 7.877190200544131e-06,
"loss": 0.8396,
"step": 285
},
{
"epoch": 0.16733096866201047,
"grad_norm": 0.04446094526551824,
"learning_rate": 7.876331842516323e-06,
"loss": 0.887,
"step": 286
},
{
"epoch": 0.1679160419790105,
"grad_norm": 0.06094680175561847,
"learning_rate": 7.875470547671926e-06,
"loss": 0.8834,
"step": 287
},
{
"epoch": 0.16850111529601053,
"grad_norm": 0.038876474999689326,
"learning_rate": 7.874606316738566e-06,
"loss": 0.8975,
"step": 288
},
{
"epoch": 0.16908618861301056,
"grad_norm": 0.04076135396799628,
"learning_rate": 7.873739150446349e-06,
"loss": 0.9094,
"step": 289
},
{
"epoch": 0.16967126193001061,
"grad_norm": 0.04242085203333459,
"learning_rate": 7.872869049527855e-06,
"loss": 0.9346,
"step": 290
},
{
"epoch": 0.17025633524701064,
"grad_norm": 0.04192270928126719,
"learning_rate": 7.871996014718154e-06,
"loss": 0.916,
"step": 291
},
{
"epoch": 0.17084140856401067,
"grad_norm": 0.06296131776401025,
"learning_rate": 7.871120046754787e-06,
"loss": 0.7869,
"step": 292
},
{
"epoch": 0.1714264818810107,
"grad_norm": 0.04208658542590707,
"learning_rate": 7.870241146377773e-06,
"loss": 0.863,
"step": 293
},
{
"epoch": 0.17201155519801076,
"grad_norm": 0.04221040890826715,
"learning_rate": 7.869359314329613e-06,
"loss": 0.8125,
"step": 294
},
{
"epoch": 0.17259662851501079,
"grad_norm": 0.044062682914531,
"learning_rate": 7.868474551355277e-06,
"loss": 0.8283,
"step": 295
},
{
"epoch": 0.17318170183201081,
"grad_norm": 0.053136044791769796,
"learning_rate": 7.867586858202221e-06,
"loss": 0.9321,
"step": 296
},
{
"epoch": 0.17376677514901087,
"grad_norm": 0.041998835388755755,
"learning_rate": 7.866696235620367e-06,
"loss": 0.9435,
"step": 297
},
{
"epoch": 0.1743518484660109,
"grad_norm": 0.04432061620173052,
"learning_rate": 7.865802684362119e-06,
"loss": 0.944,
"step": 298
},
{
"epoch": 0.17493692178301093,
"grad_norm": 0.03816132379303917,
"learning_rate": 7.864906205182347e-06,
"loss": 0.9222,
"step": 299
},
{
"epoch": 0.17552199510001096,
"grad_norm": 0.04061878988742196,
"learning_rate": 7.864006798838405e-06,
"loss": 0.9344,
"step": 300
},
{
"epoch": 0.17610706841701101,
"grad_norm": 0.038725999488975066,
"learning_rate": 7.863104466090113e-06,
"loss": 0.9477,
"step": 301
},
{
"epoch": 0.17669214173401104,
"grad_norm": 0.0675670307551766,
"learning_rate": 7.862199207699763e-06,
"loss": 0.8939,
"step": 302
},
{
"epoch": 0.17727721505101107,
"grad_norm": 0.04866290151842816,
"learning_rate": 7.861291024432122e-06,
"loss": 0.944,
"step": 303
},
{
"epoch": 0.17786228836801113,
"grad_norm": 0.04231401792052211,
"learning_rate": 7.860379917054426e-06,
"loss": 0.9108,
"step": 304
},
{
"epoch": 0.17844736168501116,
"grad_norm": 0.21645206187990054,
"learning_rate": 7.859465886336381e-06,
"loss": 0.9328,
"step": 305
},
{
"epoch": 0.1790324350020112,
"grad_norm": 0.04212212891416765,
"learning_rate": 7.858548933050162e-06,
"loss": 0.9755,
"step": 306
},
{
"epoch": 0.17961750831901122,
"grad_norm": 0.047331820432207364,
"learning_rate": 7.857629057970417e-06,
"loss": 0.8702,
"step": 307
},
{
"epoch": 0.18020258163601127,
"grad_norm": 0.040260553510288316,
"learning_rate": 7.856706261874258e-06,
"loss": 0.8934,
"step": 308
},
{
"epoch": 0.1807876549530113,
"grad_norm": 0.04326897093604986,
"learning_rate": 7.855780545541264e-06,
"loss": 0.877,
"step": 309
},
{
"epoch": 0.18137272827001133,
"grad_norm": 0.03936456606263684,
"learning_rate": 7.854851909753487e-06,
"loss": 0.9206,
"step": 310
},
{
"epoch": 0.18195780158701136,
"grad_norm": 0.040529920237504666,
"learning_rate": 7.853920355295438e-06,
"loss": 0.8469,
"step": 311
},
{
"epoch": 0.18254287490401142,
"grad_norm": 0.07984199172508148,
"learning_rate": 7.852985882954102e-06,
"loss": 0.856,
"step": 312
},
{
"epoch": 0.18312794822101144,
"grad_norm": 0.04341308255910565,
"learning_rate": 7.85204849351892e-06,
"loss": 0.8975,
"step": 313
},
{
"epoch": 0.18371302153801147,
"grad_norm": 0.05163482815007264,
"learning_rate": 7.851108187781802e-06,
"loss": 0.8516,
"step": 314
},
{
"epoch": 0.18429809485501153,
"grad_norm": 0.04363599571939765,
"learning_rate": 7.850164966537124e-06,
"loss": 0.9088,
"step": 315
},
{
"epoch": 0.18488316817201156,
"grad_norm": 0.043024652012164384,
"learning_rate": 7.84921883058172e-06,
"loss": 0.8291,
"step": 316
},
{
"epoch": 0.1854682414890116,
"grad_norm": 0.041659608271311335,
"learning_rate": 7.848269780714892e-06,
"loss": 0.9719,
"step": 317
},
{
"epoch": 0.18605331480601162,
"grad_norm": 0.04557805725835434,
"learning_rate": 7.847317817738394e-06,
"loss": 0.9638,
"step": 318
},
{
"epoch": 0.18663838812301167,
"grad_norm": 0.04388413396186285,
"learning_rate": 7.846362942456455e-06,
"loss": 0.93,
"step": 319
},
{
"epoch": 0.1872234614400117,
"grad_norm": 0.04052950822615268,
"learning_rate": 7.845405155675752e-06,
"loss": 0.8951,
"step": 320
},
{
"epoch": 0.18780853475701173,
"grad_norm": 0.03852369264523642,
"learning_rate": 7.844444458205428e-06,
"loss": 0.8521,
"step": 321
},
{
"epoch": 0.1883936080740118,
"grad_norm": 0.045799254500923765,
"learning_rate": 7.843480850857083e-06,
"loss": 0.8966,
"step": 322
},
{
"epoch": 0.18897868139101182,
"grad_norm": 0.04373024374645557,
"learning_rate": 7.842514334444776e-06,
"loss": 0.973,
"step": 323
},
{
"epoch": 0.18956375470801184,
"grad_norm": 0.037372310076302304,
"learning_rate": 7.841544909785022e-06,
"loss": 1.0232,
"step": 324
},
{
"epoch": 0.19014882802501187,
"grad_norm": 0.044911079932608254,
"learning_rate": 7.840572577696798e-06,
"loss": 0.8351,
"step": 325
},
{
"epoch": 0.19073390134201193,
"grad_norm": 0.04820487217400144,
"learning_rate": 7.839597339001529e-06,
"loss": 0.9381,
"step": 326
},
{
"epoch": 0.19131897465901196,
"grad_norm": 0.04460622903204633,
"learning_rate": 7.8386191945231e-06,
"loss": 0.9047,
"step": 327
},
{
"epoch": 0.191904047976012,
"grad_norm": 0.04344227987010105,
"learning_rate": 7.837638145087855e-06,
"loss": 0.8882,
"step": 328
},
{
"epoch": 0.19248912129301204,
"grad_norm": 0.041162516007716705,
"learning_rate": 7.836654191524583e-06,
"loss": 0.888,
"step": 329
},
{
"epoch": 0.19307419461001207,
"grad_norm": 0.050217278727892056,
"learning_rate": 7.835667334664533e-06,
"loss": 0.9425,
"step": 330
},
{
"epoch": 0.1936592679270121,
"grad_norm": 0.045239661384039016,
"learning_rate": 7.834677575341407e-06,
"loss": 0.845,
"step": 331
},
{
"epoch": 0.19424434124401213,
"grad_norm": 0.04069762643444013,
"learning_rate": 7.833684914391354e-06,
"loss": 0.9045,
"step": 332
},
{
"epoch": 0.1948294145610122,
"grad_norm": 0.03659391253836006,
"learning_rate": 7.832689352652978e-06,
"loss": 0.8415,
"step": 333
},
{
"epoch": 0.19541448787801222,
"grad_norm": 0.0478253523475305,
"learning_rate": 7.831690890967332e-06,
"loss": 0.9023,
"step": 334
},
{
"epoch": 0.19599956119501225,
"grad_norm": 0.03931532297283958,
"learning_rate": 7.830689530177923e-06,
"loss": 0.8757,
"step": 335
},
{
"epoch": 0.19658463451201227,
"grad_norm": 0.04480666786944768,
"learning_rate": 7.8296852711307e-06,
"loss": 0.8393,
"step": 336
},
{
"epoch": 0.19716970782901233,
"grad_norm": 0.0734058976406723,
"learning_rate": 7.828678114674066e-06,
"loss": 0.9038,
"step": 337
},
{
"epoch": 0.19775478114601236,
"grad_norm": 0.03967258990394233,
"learning_rate": 7.827668061658871e-06,
"loss": 0.8009,
"step": 338
},
{
"epoch": 0.1983398544630124,
"grad_norm": 0.0418553799297778,
"learning_rate": 7.82665511293841e-06,
"loss": 0.8865,
"step": 339
},
{
"epoch": 0.19892492778001244,
"grad_norm": 0.0384561013270465,
"learning_rate": 7.825639269368426e-06,
"loss": 0.872,
"step": 340
},
{
"epoch": 0.19951000109701247,
"grad_norm": 0.04182210062250734,
"learning_rate": 7.824620531807106e-06,
"loss": 0.8974,
"step": 341
},
{
"epoch": 0.2000950744140125,
"grad_norm": 0.0402445680829306,
"learning_rate": 7.823598901115085e-06,
"loss": 0.8017,
"step": 342
},
{
"epoch": 0.20068014773101253,
"grad_norm": 0.03976795416941979,
"learning_rate": 7.822574378155436e-06,
"loss": 0.9298,
"step": 343
},
{
"epoch": 0.2012652210480126,
"grad_norm": 0.043031163568613286,
"learning_rate": 7.821546963793683e-06,
"loss": 0.9508,
"step": 344
},
{
"epoch": 0.20185029436501262,
"grad_norm": 0.0485982687560341,
"learning_rate": 7.82051665889779e-06,
"loss": 0.8536,
"step": 345
},
{
"epoch": 0.20243536768201265,
"grad_norm": 0.04149899672912665,
"learning_rate": 7.819483464338156e-06,
"loss": 0.8767,
"step": 346
},
{
"epoch": 0.2030204409990127,
"grad_norm": 0.0971131140419449,
"learning_rate": 7.818447380987634e-06,
"loss": 0.9271,
"step": 347
},
{
"epoch": 0.20360551431601273,
"grad_norm": 0.04183099462603424,
"learning_rate": 7.817408409721506e-06,
"loss": 0.8362,
"step": 348
},
{
"epoch": 0.20419058763301276,
"grad_norm": 0.03719681766847752,
"learning_rate": 7.8163665514175e-06,
"loss": 0.8544,
"step": 349
},
{
"epoch": 0.2047756609500128,
"grad_norm": 0.08061763199476867,
"learning_rate": 7.815321806955782e-06,
"loss": 0.8335,
"step": 350
},
{
"epoch": 0.20536073426701285,
"grad_norm": 0.053441055551360056,
"learning_rate": 7.814274177218955e-06,
"loss": 0.8602,
"step": 351
},
{
"epoch": 0.20594580758401287,
"grad_norm": 0.1775803458323184,
"learning_rate": 7.81322366309206e-06,
"loss": 0.8998,
"step": 352
},
{
"epoch": 0.2065308809010129,
"grad_norm": 0.0637002971995377,
"learning_rate": 7.812170265462573e-06,
"loss": 0.9737,
"step": 353
},
{
"epoch": 0.20711595421801293,
"grad_norm": 0.04274639590829472,
"learning_rate": 7.81111398522041e-06,
"loss": 1.009,
"step": 354
},
{
"epoch": 0.207701027535013,
"grad_norm": 0.041282961278004064,
"learning_rate": 7.81005482325792e-06,
"loss": 0.9035,
"step": 355
},
{
"epoch": 0.20828610085201302,
"grad_norm": 0.04195399372218902,
"learning_rate": 7.808992780469889e-06,
"loss": 0.9128,
"step": 356
},
{
"epoch": 0.20887117416901305,
"grad_norm": 0.03698717701423449,
"learning_rate": 7.807927857753527e-06,
"loss": 0.8562,
"step": 357
},
{
"epoch": 0.2094562474860131,
"grad_norm": 0.1081591722204418,
"learning_rate": 7.80686005600849e-06,
"loss": 0.8931,
"step": 358
},
{
"epoch": 0.21004132080301313,
"grad_norm": 0.04152721891763337,
"learning_rate": 7.80578937613686e-06,
"loss": 0.8633,
"step": 359
},
{
"epoch": 0.21062639412001316,
"grad_norm": 0.04200660652374662,
"learning_rate": 7.804715819043148e-06,
"loss": 0.8773,
"step": 360
},
{
"epoch": 0.2112114674370132,
"grad_norm": 0.04405111681557889,
"learning_rate": 7.803639385634302e-06,
"loss": 0.8587,
"step": 361
},
{
"epoch": 0.21179654075401325,
"grad_norm": 0.04571477197647962,
"learning_rate": 7.802560076819694e-06,
"loss": 0.8334,
"step": 362
},
{
"epoch": 0.21238161407101327,
"grad_norm": 0.041607118224879065,
"learning_rate": 7.80147789351113e-06,
"loss": 0.8739,
"step": 363
},
{
"epoch": 0.2129666873880133,
"grad_norm": 0.06194034993642153,
"learning_rate": 7.800392836622838e-06,
"loss": 0.8956,
"step": 364
},
{
"epoch": 0.21355176070501336,
"grad_norm": 0.03798806008002444,
"learning_rate": 7.79930490707148e-06,
"loss": 0.8966,
"step": 365
},
{
"epoch": 0.2141368340220134,
"grad_norm": 0.04739784630899101,
"learning_rate": 7.798214105776146e-06,
"loss": 0.9552,
"step": 366
},
{
"epoch": 0.21472190733901342,
"grad_norm": 0.04497114482691456,
"learning_rate": 7.797120433658343e-06,
"loss": 0.8666,
"step": 367
},
{
"epoch": 0.21530698065601345,
"grad_norm": 0.04176901867071411,
"learning_rate": 7.796023891642011e-06,
"loss": 0.9051,
"step": 368
},
{
"epoch": 0.2158920539730135,
"grad_norm": 0.056698801623745465,
"learning_rate": 7.794924480653513e-06,
"loss": 0.8745,
"step": 369
},
{
"epoch": 0.21647712729001353,
"grad_norm": 0.04267067239376988,
"learning_rate": 7.793822201621633e-06,
"loss": 0.9129,
"step": 370
},
{
"epoch": 0.21706220060701356,
"grad_norm": 0.03999203881276761,
"learning_rate": 7.79271705547758e-06,
"loss": 0.8814,
"step": 371
},
{
"epoch": 0.2176472739240136,
"grad_norm": 0.04206496870855173,
"learning_rate": 7.79160904315499e-06,
"loss": 0.8936,
"step": 372
},
{
"epoch": 0.21823234724101365,
"grad_norm": 0.040567992793796616,
"learning_rate": 7.79049816558991e-06,
"loss": 0.8961,
"step": 373
},
{
"epoch": 0.21881742055801368,
"grad_norm": 0.04165915715538525,
"learning_rate": 7.789384423720815e-06,
"loss": 0.901,
"step": 374
},
{
"epoch": 0.2194024938750137,
"grad_norm": 0.04311161567240108,
"learning_rate": 7.788267818488597e-06,
"loss": 0.8571,
"step": 375
},
{
"epoch": 0.21998756719201376,
"grad_norm": 0.04090984120457054,
"learning_rate": 7.78714835083657e-06,
"loss": 0.879,
"step": 376
},
{
"epoch": 0.2205726405090138,
"grad_norm": 0.03723653818234615,
"learning_rate": 7.786026021710462e-06,
"loss": 0.8687,
"step": 377
},
{
"epoch": 0.22115771382601382,
"grad_norm": 0.040732056413017595,
"learning_rate": 7.78490083205842e-06,
"loss": 0.9033,
"step": 378
},
{
"epoch": 0.22174278714301385,
"grad_norm": 0.03755340315603773,
"learning_rate": 7.783772782831008e-06,
"loss": 0.8919,
"step": 379
},
{
"epoch": 0.2223278604600139,
"grad_norm": 0.04513035465018611,
"learning_rate": 7.782641874981207e-06,
"loss": 0.8766,
"step": 380
},
{
"epoch": 0.22291293377701393,
"grad_norm": 0.042927929340526826,
"learning_rate": 7.78150810946441e-06,
"loss": 0.8692,
"step": 381
},
{
"epoch": 0.22349800709401396,
"grad_norm": 0.03548281821425231,
"learning_rate": 7.780371487238428e-06,
"loss": 0.7295,
"step": 382
},
{
"epoch": 0.22408308041101402,
"grad_norm": 0.03978070402906236,
"learning_rate": 7.779232009263484e-06,
"loss": 0.8555,
"step": 383
},
{
"epoch": 0.22466815372801405,
"grad_norm": 0.036709565386030436,
"learning_rate": 7.778089676502209e-06,
"loss": 0.7492,
"step": 384
},
{
"epoch": 0.22525322704501408,
"grad_norm": 0.04547788495521431,
"learning_rate": 7.776944489919649e-06,
"loss": 0.8334,
"step": 385
},
{
"epoch": 0.2258383003620141,
"grad_norm": 0.04220442842369723,
"learning_rate": 7.775796450483267e-06,
"loss": 0.8244,
"step": 386
},
{
"epoch": 0.22642337367901416,
"grad_norm": 0.04778060193840243,
"learning_rate": 7.774645559162927e-06,
"loss": 0.8511,
"step": 387
},
{
"epoch": 0.2270084469960142,
"grad_norm": 0.05014682109980806,
"learning_rate": 7.773491816930904e-06,
"loss": 0.8334,
"step": 388
},
{
"epoch": 0.22759352031301422,
"grad_norm": 0.04001986651909544,
"learning_rate": 7.772335224761886e-06,
"loss": 0.8224,
"step": 389
},
{
"epoch": 0.22817859363001425,
"grad_norm": 0.040094874463681825,
"learning_rate": 7.771175783632966e-06,
"loss": 0.9069,
"step": 390
},
{
"epoch": 0.2287636669470143,
"grad_norm": 0.03678306586668835,
"learning_rate": 7.770013494523641e-06,
"loss": 0.8758,
"step": 391
},
{
"epoch": 0.22934874026401433,
"grad_norm": 0.04444193067206238,
"learning_rate": 7.768848358415819e-06,
"loss": 0.946,
"step": 392
},
{
"epoch": 0.22993381358101436,
"grad_norm": 0.042386218395158284,
"learning_rate": 7.767680376293811e-06,
"loss": 0.8395,
"step": 393
},
{
"epoch": 0.23051888689801442,
"grad_norm": 0.051874603825963005,
"learning_rate": 7.766509549144332e-06,
"loss": 0.8867,
"step": 394
},
{
"epoch": 0.23110396021501445,
"grad_norm": 0.04115935591406729,
"learning_rate": 7.765335877956498e-06,
"loss": 0.8509,
"step": 395
},
{
"epoch": 0.23168903353201448,
"grad_norm": 0.045377388296697053,
"learning_rate": 7.764159363721833e-06,
"loss": 0.8783,
"step": 396
},
{
"epoch": 0.2322741068490145,
"grad_norm": 0.03845331275197764,
"learning_rate": 7.762980007434261e-06,
"loss": 0.8721,
"step": 397
},
{
"epoch": 0.23285918016601456,
"grad_norm": 0.06130607399623932,
"learning_rate": 7.761797810090103e-06,
"loss": 0.896,
"step": 398
},
{
"epoch": 0.2334442534830146,
"grad_norm": 0.0653248890033106,
"learning_rate": 7.760612772688086e-06,
"loss": 0.9239,
"step": 399
},
{
"epoch": 0.23402932680001462,
"grad_norm": 0.04386903271065406,
"learning_rate": 7.759424896229329e-06,
"loss": 0.9055,
"step": 400
},
{
"epoch": 0.23461440011701468,
"grad_norm": 0.05112057938888221,
"learning_rate": 7.758234181717359e-06,
"loss": 0.8179,
"step": 401
},
{
"epoch": 0.2351994734340147,
"grad_norm": 0.03919834965990091,
"learning_rate": 7.757040630158094e-06,
"loss": 0.9131,
"step": 402
},
{
"epoch": 0.23578454675101473,
"grad_norm": 0.04080472273046829,
"learning_rate": 7.75584424255985e-06,
"loss": 0.8772,
"step": 403
},
{
"epoch": 0.23636962006801476,
"grad_norm": 0.036834620421617906,
"learning_rate": 7.754645019933338e-06,
"loss": 0.8155,
"step": 404
},
{
"epoch": 0.23695469338501482,
"grad_norm": 0.03654317750178369,
"learning_rate": 7.753442963291668e-06,
"loss": 0.8346,
"step": 405
},
{
"epoch": 0.23753976670201485,
"grad_norm": 0.04519798929105044,
"learning_rate": 7.752238073650339e-06,
"loss": 0.912,
"step": 406
},
{
"epoch": 0.23812484001901488,
"grad_norm": 0.03826695253269902,
"learning_rate": 7.751030352027246e-06,
"loss": 0.8772,
"step": 407
},
{
"epoch": 0.2387099133360149,
"grad_norm": 0.04486993387188878,
"learning_rate": 7.749819799442676e-06,
"loss": 0.8826,
"step": 408
},
{
"epoch": 0.23929498665301496,
"grad_norm": 0.0776681654995442,
"learning_rate": 7.74860641691931e-06,
"loss": 0.8987,
"step": 409
},
{
"epoch": 0.239880059970015,
"grad_norm": 0.05169098865704706,
"learning_rate": 7.747390205482216e-06,
"loss": 0.7904,
"step": 410
},
{
"epoch": 0.24046513328701502,
"grad_norm": 0.042053672534972886,
"learning_rate": 7.746171166158855e-06,
"loss": 0.9188,
"step": 411
},
{
"epoch": 0.24105020660401508,
"grad_norm": 0.044614916290635534,
"learning_rate": 7.744949299979071e-06,
"loss": 0.9118,
"step": 412
},
{
"epoch": 0.2416352799210151,
"grad_norm": 0.04594381480411999,
"learning_rate": 7.743724607975105e-06,
"loss": 0.8547,
"step": 413
},
{
"epoch": 0.24222035323801513,
"grad_norm": 0.037664202042716706,
"learning_rate": 7.742497091181578e-06,
"loss": 0.8446,
"step": 414
},
{
"epoch": 0.24280542655501516,
"grad_norm": 0.04794778502796113,
"learning_rate": 7.741266750635502e-06,
"loss": 0.897,
"step": 415
},
{
"epoch": 0.24339049987201522,
"grad_norm": 0.04188595026753968,
"learning_rate": 7.740033587376275e-06,
"loss": 0.9061,
"step": 416
},
{
"epoch": 0.24397557318901525,
"grad_norm": 0.04407359629305258,
"learning_rate": 7.738797602445671e-06,
"loss": 0.9146,
"step": 417
},
{
"epoch": 0.24456064650601528,
"grad_norm": 0.04077311126531583,
"learning_rate": 7.73755879688786e-06,
"loss": 0.8515,
"step": 418
},
{
"epoch": 0.24514571982301533,
"grad_norm": 0.055784116703384304,
"learning_rate": 7.736317171749385e-06,
"loss": 0.851,
"step": 419
},
{
"epoch": 0.24573079314001536,
"grad_norm": 0.049743142106565615,
"learning_rate": 7.735072728079179e-06,
"loss": 0.8718,
"step": 420
},
{
"epoch": 0.2463158664570154,
"grad_norm": 0.04470234941233142,
"learning_rate": 7.73382546692855e-06,
"loss": 0.9624,
"step": 421
},
{
"epoch": 0.24690093977401542,
"grad_norm": 0.040010409109375616,
"learning_rate": 7.732575389351187e-06,
"loss": 0.8925,
"step": 422
},
{
"epoch": 0.24748601309101548,
"grad_norm": 0.04513853855792064,
"learning_rate": 7.731322496403161e-06,
"loss": 0.8163,
"step": 423
},
{
"epoch": 0.2480710864080155,
"grad_norm": 0.04305001532204258,
"learning_rate": 7.730066789142922e-06,
"loss": 0.818,
"step": 424
},
{
"epoch": 0.24865615972501554,
"grad_norm": 0.0381328984157233,
"learning_rate": 7.728808268631291e-06,
"loss": 0.8655,
"step": 425
},
{
"epoch": 0.24924123304201556,
"grad_norm": 0.04141777979618345,
"learning_rate": 7.727546935931473e-06,
"loss": 0.8447,
"step": 426
},
{
"epoch": 0.24982630635901562,
"grad_norm": 0.0441532278076972,
"learning_rate": 7.726282792109049e-06,
"loss": 0.7839,
"step": 427
},
{
"epoch": 0.2504113796760157,
"grad_norm": 0.04369415131884431,
"learning_rate": 7.725015838231966e-06,
"loss": 0.8375,
"step": 428
},
{
"epoch": 0.2509964529930157,
"grad_norm": 0.04101978650087937,
"learning_rate": 7.723746075370553e-06,
"loss": 0.8001,
"step": 429
},
{
"epoch": 0.25158152631001574,
"grad_norm": 0.04974984131766339,
"learning_rate": 7.722473504597512e-06,
"loss": 0.8914,
"step": 430
},
{
"epoch": 0.25216659962701576,
"grad_norm": 0.042172544165836386,
"learning_rate": 7.721198126987914e-06,
"loss": 0.8266,
"step": 431
},
{
"epoch": 0.2527516729440158,
"grad_norm": 0.04137736709510405,
"learning_rate": 7.719919943619202e-06,
"loss": 0.9043,
"step": 432
},
{
"epoch": 0.2533367462610158,
"grad_norm": 0.04296090636863713,
"learning_rate": 7.718638955571187e-06,
"loss": 0.8638,
"step": 433
},
{
"epoch": 0.25392181957801585,
"grad_norm": 0.04617407215473608,
"learning_rate": 7.717355163926054e-06,
"loss": 0.8223,
"step": 434
},
{
"epoch": 0.2545068928950159,
"grad_norm": 0.05470709039494341,
"learning_rate": 7.716068569768357e-06,
"loss": 0.8702,
"step": 435
},
{
"epoch": 0.25509196621201596,
"grad_norm": 0.05481374522257052,
"learning_rate": 7.714779174185011e-06,
"loss": 0.8644,
"step": 436
},
{
"epoch": 0.255677039529016,
"grad_norm": 0.047154937968022254,
"learning_rate": 7.713486978265303e-06,
"loss": 0.8656,
"step": 437
},
{
"epoch": 0.256262112846016,
"grad_norm": 0.04351286538068476,
"learning_rate": 7.712191983100885e-06,
"loss": 0.9139,
"step": 438
},
{
"epoch": 0.25684718616301605,
"grad_norm": 0.036225679985398614,
"learning_rate": 7.710894189785773e-06,
"loss": 0.8589,
"step": 439
},
{
"epoch": 0.2574322594800161,
"grad_norm": 0.036619323150318614,
"learning_rate": 7.709593599416346e-06,
"loss": 0.8355,
"step": 440
},
{
"epoch": 0.2580173327970161,
"grad_norm": 0.052556029410115106,
"learning_rate": 7.708290213091348e-06,
"loss": 0.9407,
"step": 441
},
{
"epoch": 0.25860240611401614,
"grad_norm": 0.04045383405114572,
"learning_rate": 7.706984031911884e-06,
"loss": 0.8224,
"step": 442
},
{
"epoch": 0.2591874794310162,
"grad_norm": 0.03850958419485924,
"learning_rate": 7.705675056981419e-06,
"loss": 0.8289,
"step": 443
},
{
"epoch": 0.25977255274801625,
"grad_norm": 0.042025364903943374,
"learning_rate": 7.704363289405782e-06,
"loss": 0.8295,
"step": 444
},
{
"epoch": 0.2603576260650163,
"grad_norm": 0.18295957147270098,
"learning_rate": 7.703048730293156e-06,
"loss": 0.8591,
"step": 445
},
{
"epoch": 0.2609426993820163,
"grad_norm": 0.08559741878328085,
"learning_rate": 7.701731380754086e-06,
"loss": 0.8841,
"step": 446
},
{
"epoch": 0.26152777269901634,
"grad_norm": 0.04030996945096956,
"learning_rate": 7.700411241901473e-06,
"loss": 0.927,
"step": 447
},
{
"epoch": 0.26211284601601637,
"grad_norm": 0.04080969851872162,
"learning_rate": 7.699088314850574e-06,
"loss": 0.8448,
"step": 448
},
{
"epoch": 0.2626979193330164,
"grad_norm": 0.0378951860436084,
"learning_rate": 7.697762600719002e-06,
"loss": 0.8806,
"step": 449
},
{
"epoch": 0.2632829926500165,
"grad_norm": 0.03623506747860633,
"learning_rate": 7.696434100626727e-06,
"loss": 0.8899,
"step": 450
},
{
"epoch": 0.2638680659670165,
"grad_norm": 0.03949638769998007,
"learning_rate": 7.695102815696068e-06,
"loss": 0.896,
"step": 451
},
{
"epoch": 0.26445313928401654,
"grad_norm": 0.039703852605046996,
"learning_rate": 7.6937687470517e-06,
"loss": 0.9092,
"step": 452
},
{
"epoch": 0.26503821260101657,
"grad_norm": 0.04067815287205374,
"learning_rate": 7.692431895820648e-06,
"loss": 0.8948,
"step": 453
},
{
"epoch": 0.2656232859180166,
"grad_norm": 0.0498733499092084,
"learning_rate": 7.691092263132289e-06,
"loss": 0.8407,
"step": 454
},
{
"epoch": 0.2662083592350166,
"grad_norm": 0.06147495248603913,
"learning_rate": 7.689749850118347e-06,
"loss": 0.8343,
"step": 455
},
{
"epoch": 0.26679343255201665,
"grad_norm": 0.2184373267208327,
"learning_rate": 7.6884046579129e-06,
"loss": 0.909,
"step": 456
},
{
"epoch": 0.26737850586901674,
"grad_norm": 0.038431478494613806,
"learning_rate": 7.68705668765237e-06,
"loss": 0.8685,
"step": 457
},
{
"epoch": 0.26796357918601676,
"grad_norm": 0.15936476995911328,
"learning_rate": 7.685705940475523e-06,
"loss": 0.8176,
"step": 458
},
{
"epoch": 0.2685486525030168,
"grad_norm": 0.04683404608346142,
"learning_rate": 7.68435241752348e-06,
"loss": 0.8523,
"step": 459
},
{
"epoch": 0.2691337258200168,
"grad_norm": 0.04603467291785015,
"learning_rate": 7.6829961199397e-06,
"loss": 0.8134,
"step": 460
},
{
"epoch": 0.26971879913701685,
"grad_norm": 0.0502568206105835,
"learning_rate": 7.681637048869985e-06,
"loss": 0.9043,
"step": 461
},
{
"epoch": 0.2703038724540169,
"grad_norm": 0.03828249024856938,
"learning_rate": 7.680275205462485e-06,
"loss": 0.843,
"step": 462
},
{
"epoch": 0.2708889457710169,
"grad_norm": 0.043272538310479496,
"learning_rate": 7.67891059086769e-06,
"loss": 0.9456,
"step": 463
},
{
"epoch": 0.271474019088017,
"grad_norm": 0.04015211847803301,
"learning_rate": 7.67754320623843e-06,
"loss": 0.9414,
"step": 464
},
{
"epoch": 0.272059092405017,
"grad_norm": 0.04064423849996686,
"learning_rate": 7.676173052729877e-06,
"loss": 0.8705,
"step": 465
},
{
"epoch": 0.27264416572201705,
"grad_norm": 0.04969071039366313,
"learning_rate": 7.67480013149954e-06,
"loss": 0.8922,
"step": 466
},
{
"epoch": 0.2732292390390171,
"grad_norm": 0.05260125052428257,
"learning_rate": 7.67342444370727e-06,
"loss": 0.9733,
"step": 467
},
{
"epoch": 0.2738143123560171,
"grad_norm": 0.040268950739110694,
"learning_rate": 7.672045990515248e-06,
"loss": 0.8806,
"step": 468
},
{
"epoch": 0.27439938567301714,
"grad_norm": 0.042577022398831814,
"learning_rate": 7.670664773088e-06,
"loss": 0.9561,
"step": 469
},
{
"epoch": 0.27498445899001717,
"grad_norm": 0.04618878912895806,
"learning_rate": 7.669280792592383e-06,
"loss": 0.8403,
"step": 470
},
{
"epoch": 0.2755695323070172,
"grad_norm": 0.04343593914167416,
"learning_rate": 7.667894050197583e-06,
"loss": 0.8355,
"step": 471
},
{
"epoch": 0.2761546056240173,
"grad_norm": 0.040556959908718666,
"learning_rate": 7.66650454707513e-06,
"loss": 0.8608,
"step": 472
},
{
"epoch": 0.2767396789410173,
"grad_norm": 0.054697588596535354,
"learning_rate": 7.665112284398881e-06,
"loss": 0.839,
"step": 473
},
{
"epoch": 0.27732475225801734,
"grad_norm": 0.04393307328991595,
"learning_rate": 7.66371726334502e-06,
"loss": 0.8494,
"step": 474
},
{
"epoch": 0.27790982557501737,
"grad_norm": 0.04275309783082017,
"learning_rate": 7.662319485092067e-06,
"loss": 0.8259,
"step": 475
},
{
"epoch": 0.2784948988920174,
"grad_norm": 0.03953402161699839,
"learning_rate": 7.66091895082087e-06,
"loss": 0.7773,
"step": 476
},
{
"epoch": 0.2790799722090174,
"grad_norm": 0.03860846601474072,
"learning_rate": 7.659515661714608e-06,
"loss": 0.7962,
"step": 477
},
{
"epoch": 0.27966504552601745,
"grad_norm": 0.05366294387360709,
"learning_rate": 7.658109618958779e-06,
"loss": 0.9233,
"step": 478
},
{
"epoch": 0.28025011884301754,
"grad_norm": 0.04420018919513911,
"learning_rate": 7.656700823741216e-06,
"loss": 0.8156,
"step": 479
},
{
"epoch": 0.28083519216001757,
"grad_norm": 0.039049327120388326,
"learning_rate": 7.655289277252074e-06,
"loss": 0.8856,
"step": 480
},
{
"epoch": 0.2814202654770176,
"grad_norm": 0.26425632822736106,
"learning_rate": 7.653874980683828e-06,
"loss": 0.9503,
"step": 481
},
{
"epoch": 0.2820053387940176,
"grad_norm": 0.04289481203658443,
"learning_rate": 7.652457935231285e-06,
"loss": 0.8937,
"step": 482
},
{
"epoch": 0.28259041211101765,
"grad_norm": 0.04293700432214028,
"learning_rate": 7.651038142091568e-06,
"loss": 0.8571,
"step": 483
},
{
"epoch": 0.2831754854280177,
"grad_norm": 0.038244633724887706,
"learning_rate": 7.649615602464123e-06,
"loss": 0.7906,
"step": 484
},
{
"epoch": 0.2837605587450177,
"grad_norm": 0.03944291933937196,
"learning_rate": 7.648190317550717e-06,
"loss": 0.926,
"step": 485
},
{
"epoch": 0.2843456320620178,
"grad_norm": 0.10913228375108804,
"learning_rate": 7.646762288555433e-06,
"loss": 0.8592,
"step": 486
},
{
"epoch": 0.2849307053790178,
"grad_norm": 0.038286199243376444,
"learning_rate": 7.645331516684676e-06,
"loss": 0.8418,
"step": 487
},
{
"epoch": 0.28551577869601785,
"grad_norm": 0.043890441209433355,
"learning_rate": 7.643898003147167e-06,
"loss": 0.9614,
"step": 488
},
{
"epoch": 0.2861008520130179,
"grad_norm": 0.0382147078144481,
"learning_rate": 7.642461749153943e-06,
"loss": 0.83,
"step": 489
},
{
"epoch": 0.2866859253300179,
"grad_norm": 0.10202631306826099,
"learning_rate": 7.641022755918357e-06,
"loss": 0.9484,
"step": 490
},
{
"epoch": 0.28727099864701794,
"grad_norm": 0.04017584960145629,
"learning_rate": 7.639581024656072e-06,
"loss": 0.8278,
"step": 491
},
{
"epoch": 0.28785607196401797,
"grad_norm": 0.05571278346236126,
"learning_rate": 7.638136556585071e-06,
"loss": 0.9093,
"step": 492
},
{
"epoch": 0.28844114528101805,
"grad_norm": 0.04145030654143878,
"learning_rate": 7.636689352925643e-06,
"loss": 0.913,
"step": 493
},
{
"epoch": 0.2890262185980181,
"grad_norm": 0.05058778086388885,
"learning_rate": 7.635239414900393e-06,
"loss": 0.9366,
"step": 494
},
{
"epoch": 0.2896112919150181,
"grad_norm": 0.047476857954303966,
"learning_rate": 7.63378674373423e-06,
"loss": 0.8528,
"step": 495
},
{
"epoch": 0.29019636523201814,
"grad_norm": 0.043805250682478876,
"learning_rate": 7.632331340654377e-06,
"loss": 0.9953,
"step": 496
},
{
"epoch": 0.29078143854901817,
"grad_norm": 0.04691649904039108,
"learning_rate": 7.630873206890365e-06,
"loss": 0.8893,
"step": 497
},
{
"epoch": 0.2913665118660182,
"grad_norm": 0.061366723803107565,
"learning_rate": 7.629412343674026e-06,
"loss": 0.8895,
"step": 498
},
{
"epoch": 0.2919515851830182,
"grad_norm": 0.052047839285183604,
"learning_rate": 7.627948752239508e-06,
"loss": 0.8322,
"step": 499
},
{
"epoch": 0.2925366585000183,
"grad_norm": 0.04858401908225131,
"learning_rate": 7.6264824338232515e-06,
"loss": 0.7962,
"step": 500
},
{
"epoch": 0.29312173181701834,
"grad_norm": 0.05167192214353387,
"learning_rate": 7.625013389664012e-06,
"loss": 0.8178,
"step": 501
},
{
"epoch": 0.29370680513401837,
"grad_norm": 0.04571347435775933,
"learning_rate": 7.623541621002841e-06,
"loss": 0.9531,
"step": 502
},
{
"epoch": 0.2942918784510184,
"grad_norm": 0.05170428092011692,
"learning_rate": 7.622067129083092e-06,
"loss": 0.863,
"step": 503
},
{
"epoch": 0.2948769517680184,
"grad_norm": 0.04769020808960418,
"learning_rate": 7.620589915150423e-06,
"loss": 0.8693,
"step": 504
},
{
"epoch": 0.29546202508501845,
"grad_norm": 0.036447168096671895,
"learning_rate": 7.619109980452789e-06,
"loss": 0.8263,
"step": 505
},
{
"epoch": 0.2960470984020185,
"grad_norm": 0.04697422924818202,
"learning_rate": 7.617627326240441e-06,
"loss": 0.9127,
"step": 506
},
{
"epoch": 0.29663217171901857,
"grad_norm": 0.05026264821034648,
"learning_rate": 7.6161419537659345e-06,
"loss": 0.9163,
"step": 507
},
{
"epoch": 0.2972172450360186,
"grad_norm": 0.044605972655997896,
"learning_rate": 7.614653864284114e-06,
"loss": 0.8998,
"step": 508
},
{
"epoch": 0.2978023183530186,
"grad_norm": 0.06437245062396214,
"learning_rate": 7.613163059052123e-06,
"loss": 0.8039,
"step": 509
},
{
"epoch": 0.29838739167001865,
"grad_norm": 0.04554747767372419,
"learning_rate": 7.611669539329398e-06,
"loss": 0.8662,
"step": 510
},
{
"epoch": 0.2989724649870187,
"grad_norm": 0.0802330744693163,
"learning_rate": 7.610173306377671e-06,
"loss": 0.8343,
"step": 511
},
{
"epoch": 0.2995575383040187,
"grad_norm": 0.04192153945182111,
"learning_rate": 7.608674361460963e-06,
"loss": 0.8983,
"step": 512
},
{
"epoch": 0.30014261162101874,
"grad_norm": 0.04902593536059904,
"learning_rate": 7.607172705845589e-06,
"loss": 0.9242,
"step": 513
},
{
"epoch": 0.30072768493801877,
"grad_norm": 0.05436876049704265,
"learning_rate": 7.605668340800153e-06,
"loss": 0.834,
"step": 514
},
{
"epoch": 0.30131275825501885,
"grad_norm": 0.04047347680328035,
"learning_rate": 7.604161267595545e-06,
"loss": 0.8359,
"step": 515
},
{
"epoch": 0.3018978315720189,
"grad_norm": 0.03848909199643286,
"learning_rate": 7.602651487504946e-06,
"loss": 0.8126,
"step": 516
},
{
"epoch": 0.3024829048890189,
"grad_norm": 0.046207806514092946,
"learning_rate": 7.601139001803825e-06,
"loss": 0.869,
"step": 517
},
{
"epoch": 0.30306797820601894,
"grad_norm": 0.03852808749767389,
"learning_rate": 7.5996238117699344e-06,
"loss": 0.9808,
"step": 518
},
{
"epoch": 0.30365305152301897,
"grad_norm": 0.04005257123878126,
"learning_rate": 7.5981059186833114e-06,
"loss": 0.7767,
"step": 519
},
{
"epoch": 0.304238124840019,
"grad_norm": 0.05159848088030845,
"learning_rate": 7.596585323826277e-06,
"loss": 0.8932,
"step": 520
},
{
"epoch": 0.304823198157019,
"grad_norm": 0.040391024202221286,
"learning_rate": 7.595062028483434e-06,
"loss": 0.9093,
"step": 521
},
{
"epoch": 0.3054082714740191,
"grad_norm": 0.040428764785829324,
"learning_rate": 7.593536033941669e-06,
"loss": 0.9087,
"step": 522
},
{
"epoch": 0.30599334479101914,
"grad_norm": 0.09158364294178328,
"learning_rate": 7.592007341490145e-06,
"loss": 0.8601,
"step": 523
},
{
"epoch": 0.30657841810801917,
"grad_norm": 0.04136317840574456,
"learning_rate": 7.590475952420309e-06,
"loss": 0.8953,
"step": 524
},
{
"epoch": 0.3071634914250192,
"grad_norm": 0.06396409974499655,
"learning_rate": 7.588941868025881e-06,
"loss": 0.8297,
"step": 525
},
{
"epoch": 0.3077485647420192,
"grad_norm": 0.040333070246341814,
"learning_rate": 7.587405089602862e-06,
"loss": 0.7719,
"step": 526
},
{
"epoch": 0.30833363805901925,
"grad_norm": 0.04148918223122552,
"learning_rate": 7.585865618449528e-06,
"loss": 0.8007,
"step": 527
},
{
"epoch": 0.3089187113760193,
"grad_norm": 0.048132396423435676,
"learning_rate": 7.584323455866427e-06,
"loss": 0.8579,
"step": 528
},
{
"epoch": 0.30950378469301937,
"grad_norm": 0.04328208405834297,
"learning_rate": 7.582778603156387e-06,
"loss": 0.8071,
"step": 529
},
{
"epoch": 0.3100888580100194,
"grad_norm": 0.04318172969759895,
"learning_rate": 7.5812310616245e-06,
"loss": 0.83,
"step": 530
},
{
"epoch": 0.3106739313270194,
"grad_norm": 0.03768227605900526,
"learning_rate": 7.579680832578137e-06,
"loss": 0.8344,
"step": 531
},
{
"epoch": 0.31125900464401945,
"grad_norm": 0.04790982152353994,
"learning_rate": 7.578127917326936e-06,
"loss": 0.8974,
"step": 532
},
{
"epoch": 0.3118440779610195,
"grad_norm": 0.038002618502432514,
"learning_rate": 7.576572317182805e-06,
"loss": 0.792,
"step": 533
},
{
"epoch": 0.3124291512780195,
"grad_norm": 0.036571897121817694,
"learning_rate": 7.575014033459921e-06,
"loss": 0.8418,
"step": 534
},
{
"epoch": 0.31301422459501954,
"grad_norm": 0.03670017559323694,
"learning_rate": 7.573453067474724e-06,
"loss": 0.8834,
"step": 535
},
{
"epoch": 0.3135992979120196,
"grad_norm": 0.03621350431630049,
"learning_rate": 7.5718894205459284e-06,
"loss": 0.8549,
"step": 536
},
{
"epoch": 0.31418437122901965,
"grad_norm": 0.04296675197248163,
"learning_rate": 7.570323093994503e-06,
"loss": 0.7894,
"step": 537
},
{
"epoch": 0.3147694445460197,
"grad_norm": 0.044503765611357125,
"learning_rate": 7.568754089143688e-06,
"loss": 0.8655,
"step": 538
},
{
"epoch": 0.3153545178630197,
"grad_norm": 0.03715684392321602,
"learning_rate": 7.5671824073189845e-06,
"loss": 0.8697,
"step": 539
},
{
"epoch": 0.31593959118001974,
"grad_norm": 0.04056871817206291,
"learning_rate": 7.5656080498481535e-06,
"loss": 0.8803,
"step": 540
},
{
"epoch": 0.31652466449701977,
"grad_norm": 0.08089424036325778,
"learning_rate": 7.564031018061219e-06,
"loss": 0.8098,
"step": 541
},
{
"epoch": 0.3171097378140198,
"grad_norm": 0.04027917264460558,
"learning_rate": 7.562451313290459e-06,
"loss": 0.8939,
"step": 542
},
{
"epoch": 0.3176948111310199,
"grad_norm": 0.042852152415068484,
"learning_rate": 7.560868936870418e-06,
"loss": 0.8901,
"step": 543
},
{
"epoch": 0.3182798844480199,
"grad_norm": 0.05886634493495861,
"learning_rate": 7.559283890137889e-06,
"loss": 0.9286,
"step": 544
},
{
"epoch": 0.31886495776501994,
"grad_norm": 0.04216130498342499,
"learning_rate": 7.557696174431927e-06,
"loss": 0.8311,
"step": 545
},
{
"epoch": 0.31945003108201997,
"grad_norm": 0.053197981320438834,
"learning_rate": 7.556105791093838e-06,
"loss": 0.8952,
"step": 546
},
{
"epoch": 0.32003510439902,
"grad_norm": 0.038738558056749535,
"learning_rate": 7.554512741467183e-06,
"loss": 0.87,
"step": 547
},
{
"epoch": 0.32062017771602,
"grad_norm": 0.04255317774837396,
"learning_rate": 7.552917026897778e-06,
"loss": 0.8386,
"step": 548
},
{
"epoch": 0.32120525103302006,
"grad_norm": 0.06530091195329979,
"learning_rate": 7.551318648733684e-06,
"loss": 0.8862,
"step": 549
},
{
"epoch": 0.3217903243500201,
"grad_norm": 0.04640020878919504,
"learning_rate": 7.549717608325219e-06,
"loss": 0.8615,
"step": 550
},
{
"epoch": 0.32237539766702017,
"grad_norm": 0.038491606071055146,
"learning_rate": 7.548113907024948e-06,
"loss": 0.8581,
"step": 551
},
{
"epoch": 0.3229604709840202,
"grad_norm": 0.05470482648340031,
"learning_rate": 7.54650754618768e-06,
"loss": 0.8104,
"step": 552
},
{
"epoch": 0.3235455443010202,
"grad_norm": 0.04194329994008872,
"learning_rate": 7.544898527170475e-06,
"loss": 0.7725,
"step": 553
},
{
"epoch": 0.32413061761802026,
"grad_norm": 0.059518370147140835,
"learning_rate": 7.543286851332641e-06,
"loss": 0.8814,
"step": 554
},
{
"epoch": 0.3247156909350203,
"grad_norm": 0.041939649514720574,
"learning_rate": 7.5416725200357215e-06,
"loss": 0.8785,
"step": 555
},
{
"epoch": 0.3253007642520203,
"grad_norm": 0.038119584353072236,
"learning_rate": 7.540055534643512e-06,
"loss": 0.8787,
"step": 556
},
{
"epoch": 0.32588583756902034,
"grad_norm": 0.35043742170562575,
"learning_rate": 7.538435896522048e-06,
"loss": 0.9425,
"step": 557
},
{
"epoch": 0.3264709108860204,
"grad_norm": 0.037804212753556964,
"learning_rate": 7.536813607039603e-06,
"loss": 0.9275,
"step": 558
},
{
"epoch": 0.32705598420302046,
"grad_norm": 0.04797991190424691,
"learning_rate": 7.535188667566693e-06,
"loss": 0.8349,
"step": 559
},
{
"epoch": 0.3276410575200205,
"grad_norm": 0.07131490446157496,
"learning_rate": 7.533561079476073e-06,
"loss": 0.7922,
"step": 560
},
{
"epoch": 0.3282261308370205,
"grad_norm": 0.04273451205950798,
"learning_rate": 7.531930844142734e-06,
"loss": 0.9063,
"step": 561
},
{
"epoch": 0.32881120415402054,
"grad_norm": 0.04761650491169362,
"learning_rate": 7.5302979629439044e-06,
"loss": 0.874,
"step": 562
},
{
"epoch": 0.32939627747102057,
"grad_norm": 0.053621412028226095,
"learning_rate": 7.528662437259048e-06,
"loss": 0.8908,
"step": 563
},
{
"epoch": 0.3299813507880206,
"grad_norm": 0.08172735146274197,
"learning_rate": 7.527024268469862e-06,
"loss": 0.7873,
"step": 564
},
{
"epoch": 0.3305664241050207,
"grad_norm": 0.045936897060612714,
"learning_rate": 7.525383457960277e-06,
"loss": 0.8582,
"step": 565
},
{
"epoch": 0.3311514974220207,
"grad_norm": 0.043038357564032,
"learning_rate": 7.523740007116453e-06,
"loss": 0.8864,
"step": 566
},
{
"epoch": 0.33173657073902074,
"grad_norm": 0.0595338473222242,
"learning_rate": 7.5220939173267855e-06,
"loss": 0.8398,
"step": 567
},
{
"epoch": 0.33232164405602077,
"grad_norm": 0.03876298862367819,
"learning_rate": 7.520445189981897e-06,
"loss": 0.7608,
"step": 568
},
{
"epoch": 0.3329067173730208,
"grad_norm": 0.04137701061941132,
"learning_rate": 7.518793826474636e-06,
"loss": 0.9109,
"step": 569
},
{
"epoch": 0.33349179069002083,
"grad_norm": 0.04054950689452163,
"learning_rate": 7.517139828200079e-06,
"loss": 0.9109,
"step": 570
},
{
"epoch": 0.33407686400702086,
"grad_norm": 0.045942175432546556,
"learning_rate": 7.5154831965555315e-06,
"loss": 0.8569,
"step": 571
},
{
"epoch": 0.33466193732402094,
"grad_norm": 0.0412369088341243,
"learning_rate": 7.51382393294052e-06,
"loss": 0.837,
"step": 572
},
{
"epoch": 0.33524701064102097,
"grad_norm": 0.03984087759584128,
"learning_rate": 7.5121620387567955e-06,
"loss": 0.8173,
"step": 573
},
{
"epoch": 0.335832083958021,
"grad_norm": 0.03855595716614889,
"learning_rate": 7.510497515408333e-06,
"loss": 0.7942,
"step": 574
},
{
"epoch": 0.33641715727502103,
"grad_norm": 0.04123539453768172,
"learning_rate": 7.508830364301327e-06,
"loss": 0.9384,
"step": 575
},
{
"epoch": 0.33700223059202106,
"grad_norm": 0.047137253838708155,
"learning_rate": 7.507160586844191e-06,
"loss": 0.8449,
"step": 576
},
{
"epoch": 0.3375873039090211,
"grad_norm": 0.03987110874806033,
"learning_rate": 7.50548818444756e-06,
"loss": 0.8678,
"step": 577
},
{
"epoch": 0.3381723772260211,
"grad_norm": 0.05156972953667277,
"learning_rate": 7.503813158524284e-06,
"loss": 0.8369,
"step": 578
},
{
"epoch": 0.3387574505430212,
"grad_norm": 0.05429963825180294,
"learning_rate": 7.502135510489432e-06,
"loss": 0.9386,
"step": 579
},
{
"epoch": 0.33934252386002123,
"grad_norm": 0.04563390307441152,
"learning_rate": 7.500455241760284e-06,
"loss": 0.7892,
"step": 580
},
{
"epoch": 0.33992759717702126,
"grad_norm": 0.03772537720365855,
"learning_rate": 7.4987723537563395e-06,
"loss": 0.8171,
"step": 581
},
{
"epoch": 0.3405126704940213,
"grad_norm": 0.14591090698165343,
"learning_rate": 7.497086847899305e-06,
"loss": 0.9374,
"step": 582
},
{
"epoch": 0.3410977438110213,
"grad_norm": 0.04274276349409338,
"learning_rate": 7.495398725613103e-06,
"loss": 0.8886,
"step": 583
},
{
"epoch": 0.34168281712802134,
"grad_norm": 0.045419293797600416,
"learning_rate": 7.4937079883238644e-06,
"loss": 0.8874,
"step": 584
},
{
"epoch": 0.34226789044502137,
"grad_norm": 0.039369245868335276,
"learning_rate": 7.4920146374599305e-06,
"loss": 0.8515,
"step": 585
},
{
"epoch": 0.3428529637620214,
"grad_norm": 0.041038361870929824,
"learning_rate": 7.490318674451848e-06,
"loss": 0.8352,
"step": 586
},
{
"epoch": 0.3434380370790215,
"grad_norm": 0.03546313415290626,
"learning_rate": 7.488620100732373e-06,
"loss": 0.8126,
"step": 587
},
{
"epoch": 0.3440231103960215,
"grad_norm": 0.047686593783821035,
"learning_rate": 7.486918917736467e-06,
"loss": 0.8438,
"step": 588
},
{
"epoch": 0.34460818371302154,
"grad_norm": 0.03860874121220213,
"learning_rate": 7.485215126901294e-06,
"loss": 0.796,
"step": 589
},
{
"epoch": 0.34519325703002157,
"grad_norm": 0.04131878299287184,
"learning_rate": 7.483508729666222e-06,
"loss": 0.8787,
"step": 590
},
{
"epoch": 0.3457783303470216,
"grad_norm": 0.0626727371222094,
"learning_rate": 7.481799727472821e-06,
"loss": 0.8556,
"step": 591
},
{
"epoch": 0.34636340366402163,
"grad_norm": 0.046182038395419095,
"learning_rate": 7.480088121764862e-06,
"loss": 0.8362,
"step": 592
},
{
"epoch": 0.34694847698102166,
"grad_norm": 0.04057968397212824,
"learning_rate": 7.478373913988314e-06,
"loss": 0.8382,
"step": 593
},
{
"epoch": 0.34753355029802174,
"grad_norm": 0.04060993045831877,
"learning_rate": 7.476657105591347e-06,
"loss": 0.844,
"step": 594
},
{
"epoch": 0.34811862361502177,
"grad_norm": 0.042757904869809,
"learning_rate": 7.474937698024326e-06,
"loss": 0.8323,
"step": 595
},
{
"epoch": 0.3487036969320218,
"grad_norm": 0.042358138791525404,
"learning_rate": 7.4732156927398134e-06,
"loss": 0.8055,
"step": 596
},
{
"epoch": 0.34928877024902183,
"grad_norm": 0.04569798776629126,
"learning_rate": 7.4714910911925614e-06,
"loss": 0.7941,
"step": 597
},
{
"epoch": 0.34987384356602186,
"grad_norm": 0.0438874811170573,
"learning_rate": 7.469763894839523e-06,
"loss": 0.919,
"step": 598
},
{
"epoch": 0.3504589168830219,
"grad_norm": 0.06653725512945165,
"learning_rate": 7.468034105139836e-06,
"loss": 0.8574,
"step": 599
},
{
"epoch": 0.3510439902000219,
"grad_norm": 0.04995025638554342,
"learning_rate": 7.466301723554835e-06,
"loss": 0.8496,
"step": 600
},
{
"epoch": 0.351629063517022,
"grad_norm": 0.04283576163163911,
"learning_rate": 7.46456675154804e-06,
"loss": 0.8757,
"step": 601
},
{
"epoch": 0.35221413683402203,
"grad_norm": 0.039849657265063225,
"learning_rate": 7.462829190585162e-06,
"loss": 0.8945,
"step": 602
},
{
"epoch": 0.35279921015102206,
"grad_norm": 0.0440391834022003,
"learning_rate": 7.461089042134098e-06,
"loss": 0.8571,
"step": 603
},
{
"epoch": 0.3533842834680221,
"grad_norm": 0.043132549255425784,
"learning_rate": 7.45934630766493e-06,
"loss": 0.8668,
"step": 604
},
{
"epoch": 0.3539693567850221,
"grad_norm": 0.04249197374690922,
"learning_rate": 7.4576009886499285e-06,
"loss": 0.8797,
"step": 605
},
{
"epoch": 0.35455443010202214,
"grad_norm": 0.05324832438497396,
"learning_rate": 7.455853086563542e-06,
"loss": 0.9684,
"step": 606
},
{
"epoch": 0.3551395034190222,
"grad_norm": 0.04904764924574238,
"learning_rate": 7.454102602882405e-06,
"loss": 0.7753,
"step": 607
},
{
"epoch": 0.35572457673602226,
"grad_norm": 0.039014384352298336,
"learning_rate": 7.452349539085334e-06,
"loss": 0.8561,
"step": 608
},
{
"epoch": 0.3563096500530223,
"grad_norm": 0.03934058564150346,
"learning_rate": 7.4505938966533175e-06,
"loss": 0.8438,
"step": 609
},
{
"epoch": 0.3568947233700223,
"grad_norm": 0.05279799201688822,
"learning_rate": 7.448835677069536e-06,
"loss": 0.8912,
"step": 610
},
{
"epoch": 0.35747979668702234,
"grad_norm": 0.04001456755669222,
"learning_rate": 7.447074881819332e-06,
"loss": 0.8553,
"step": 611
},
{
"epoch": 0.3580648700040224,
"grad_norm": 0.042425970420024564,
"learning_rate": 7.445311512390233e-06,
"loss": 0.8327,
"step": 612
},
{
"epoch": 0.3586499433210224,
"grad_norm": 0.061113675404079935,
"learning_rate": 7.443545570271942e-06,
"loss": 0.8842,
"step": 613
},
{
"epoch": 0.35923501663802243,
"grad_norm": 0.046390573358408176,
"learning_rate": 7.44177705695633e-06,
"loss": 0.869,
"step": 614
},
{
"epoch": 0.3598200899550225,
"grad_norm": 0.04323809663682179,
"learning_rate": 7.440005973937445e-06,
"loss": 0.9184,
"step": 615
},
{
"epoch": 0.36040516327202254,
"grad_norm": 0.04915044460354856,
"learning_rate": 7.4382323227115e-06,
"loss": 0.8376,
"step": 616
},
{
"epoch": 0.3609902365890226,
"grad_norm": 0.0385546720018955,
"learning_rate": 7.436456104776885e-06,
"loss": 0.8515,
"step": 617
},
{
"epoch": 0.3615753099060226,
"grad_norm": 0.042848988320383044,
"learning_rate": 7.4346773216341545e-06,
"loss": 0.867,
"step": 618
},
{
"epoch": 0.36216038322302263,
"grad_norm": 0.04584623182025803,
"learning_rate": 7.432895974786029e-06,
"loss": 0.8974,
"step": 619
},
{
"epoch": 0.36274545654002266,
"grad_norm": 0.04282342241688822,
"learning_rate": 7.431112065737397e-06,
"loss": 0.8623,
"step": 620
},
{
"epoch": 0.3633305298570227,
"grad_norm": 0.04076113650735267,
"learning_rate": 7.429325595995311e-06,
"loss": 0.8682,
"step": 621
},
{
"epoch": 0.3639156031740227,
"grad_norm": 0.05126027130856797,
"learning_rate": 7.427536567068985e-06,
"loss": 0.906,
"step": 622
},
{
"epoch": 0.3645006764910228,
"grad_norm": 0.049318751888490414,
"learning_rate": 7.4257449804697975e-06,
"loss": 0.8397,
"step": 623
},
{
"epoch": 0.36508574980802283,
"grad_norm": 0.043254726523200684,
"learning_rate": 7.423950837711287e-06,
"loss": 0.8622,
"step": 624
},
{
"epoch": 0.36567082312502286,
"grad_norm": 0.05038943712513383,
"learning_rate": 7.422154140309151e-06,
"loss": 0.8775,
"step": 625
},
{
"epoch": 0.3662558964420229,
"grad_norm": 0.04134166981874639,
"learning_rate": 7.420354889781245e-06,
"loss": 0.8226,
"step": 626
},
{
"epoch": 0.3668409697590229,
"grad_norm": 0.03866147658091302,
"learning_rate": 7.418553087647582e-06,
"loss": 0.8664,
"step": 627
},
{
"epoch": 0.36742604307602295,
"grad_norm": 0.038894973852294755,
"learning_rate": 7.416748735430332e-06,
"loss": 0.8796,
"step": 628
},
{
"epoch": 0.368011116393023,
"grad_norm": 0.05664602547094577,
"learning_rate": 7.4149418346538144e-06,
"loss": 0.9451,
"step": 629
},
{
"epoch": 0.36859618971002306,
"grad_norm": 0.03682659768264277,
"learning_rate": 7.413132386844507e-06,
"loss": 0.7877,
"step": 630
},
{
"epoch": 0.3691812630270231,
"grad_norm": 0.04070102583713315,
"learning_rate": 7.411320393531038e-06,
"loss": 0.8647,
"step": 631
},
{
"epoch": 0.3697663363440231,
"grad_norm": 0.037583772238652974,
"learning_rate": 7.4095058562441835e-06,
"loss": 0.8984,
"step": 632
},
{
"epoch": 0.37035140966102315,
"grad_norm": 0.03745618837783022,
"learning_rate": 7.407688776516873e-06,
"loss": 0.8077,
"step": 633
},
{
"epoch": 0.3709364829780232,
"grad_norm": 0.04719274188930425,
"learning_rate": 7.405869155884178e-06,
"loss": 0.7846,
"step": 634
},
{
"epoch": 0.3715215562950232,
"grad_norm": 0.04726483533088948,
"learning_rate": 7.404046995883322e-06,
"loss": 0.9625,
"step": 635
},
{
"epoch": 0.37210662961202323,
"grad_norm": 0.03718098121534363,
"learning_rate": 7.402222298053672e-06,
"loss": 0.7673,
"step": 636
},
{
"epoch": 0.3726917029290233,
"grad_norm": 0.041699609984716224,
"learning_rate": 7.400395063936738e-06,
"loss": 0.8846,
"step": 637
},
{
"epoch": 0.37327677624602335,
"grad_norm": 0.03781340458428561,
"learning_rate": 7.3985652950761734e-06,
"loss": 0.8589,
"step": 638
},
{
"epoch": 0.3738618495630234,
"grad_norm": 0.05169204738245716,
"learning_rate": 7.396732993017774e-06,
"loss": 0.8737,
"step": 639
},
{
"epoch": 0.3744469228800234,
"grad_norm": 0.040199437093993116,
"learning_rate": 7.394898159309474e-06,
"loss": 0.8432,
"step": 640
},
{
"epoch": 0.37503199619702343,
"grad_norm": 0.04129820957000115,
"learning_rate": 7.393060795501346e-06,
"loss": 0.8363,
"step": 641
},
{
"epoch": 0.37561706951402346,
"grad_norm": 0.04346950641443829,
"learning_rate": 7.391220903145602e-06,
"loss": 0.8008,
"step": 642
},
{
"epoch": 0.3762021428310235,
"grad_norm": 0.043967535702394124,
"learning_rate": 7.389378483796589e-06,
"loss": 0.8695,
"step": 643
},
{
"epoch": 0.3767872161480236,
"grad_norm": 0.037870985542036356,
"learning_rate": 7.387533539010789e-06,
"loss": 0.7874,
"step": 644
},
{
"epoch": 0.3773722894650236,
"grad_norm": 0.0394925511025262,
"learning_rate": 7.385686070346818e-06,
"loss": 0.8427,
"step": 645
},
{
"epoch": 0.37795736278202363,
"grad_norm": 0.03932506659821688,
"learning_rate": 7.383836079365423e-06,
"loss": 0.8773,
"step": 646
},
{
"epoch": 0.37854243609902366,
"grad_norm": 0.04174103140864924,
"learning_rate": 7.381983567629482e-06,
"loss": 0.8532,
"step": 647
},
{
"epoch": 0.3791275094160237,
"grad_norm": 0.03777546260763163,
"learning_rate": 7.380128536704003e-06,
"loss": 0.842,
"step": 648
},
{
"epoch": 0.3797125827330237,
"grad_norm": 0.038903962254009807,
"learning_rate": 7.378270988156122e-06,
"loss": 0.9141,
"step": 649
},
{
"epoch": 0.38029765605002375,
"grad_norm": 0.03931172416456551,
"learning_rate": 7.376410923555104e-06,
"loss": 0.8382,
"step": 650
},
{
"epoch": 0.38088272936702383,
"grad_norm": 0.03830193944287262,
"learning_rate": 7.374548344472336e-06,
"loss": 0.913,
"step": 651
},
{
"epoch": 0.38146780268402386,
"grad_norm": 0.05689973145188307,
"learning_rate": 7.372683252481333e-06,
"loss": 0.9233,
"step": 652
},
{
"epoch": 0.3820528760010239,
"grad_norm": 0.11927191175958904,
"learning_rate": 7.370815649157728e-06,
"loss": 0.8497,
"step": 653
},
{
"epoch": 0.3826379493180239,
"grad_norm": 0.04879685015775887,
"learning_rate": 7.36894553607928e-06,
"loss": 0.8902,
"step": 654
},
{
"epoch": 0.38322302263502395,
"grad_norm": 0.038545163591213454,
"learning_rate": 7.3670729148258655e-06,
"loss": 0.8101,
"step": 655
},
{
"epoch": 0.383808095952024,
"grad_norm": 0.07018595700302209,
"learning_rate": 7.365197786979483e-06,
"loss": 0.8732,
"step": 656
},
{
"epoch": 0.384393169269024,
"grad_norm": 0.03883816516578689,
"learning_rate": 7.3633201541242465e-06,
"loss": 0.9438,
"step": 657
},
{
"epoch": 0.3849782425860241,
"grad_norm": 0.045681207579253334,
"learning_rate": 7.3614400178463834e-06,
"loss": 0.8083,
"step": 658
},
{
"epoch": 0.3855633159030241,
"grad_norm": 0.03536545477300677,
"learning_rate": 7.359557379734242e-06,
"loss": 0.7559,
"step": 659
},
{
"epoch": 0.38614838922002415,
"grad_norm": 0.03989482911665265,
"learning_rate": 7.357672241378282e-06,
"loss": 0.8969,
"step": 660
},
{
"epoch": 0.3867334625370242,
"grad_norm": 0.03858611847647411,
"learning_rate": 7.355784604371071e-06,
"loss": 0.835,
"step": 661
},
{
"epoch": 0.3873185358540242,
"grad_norm": 0.04523699088789738,
"learning_rate": 7.353894470307294e-06,
"loss": 0.8641,
"step": 662
},
{
"epoch": 0.38790360917102423,
"grad_norm": 0.041661369856111206,
"learning_rate": 7.352001840783741e-06,
"loss": 0.834,
"step": 663
},
{
"epoch": 0.38848868248802426,
"grad_norm": 0.041538283343954825,
"learning_rate": 7.3501067173993115e-06,
"loss": 0.9114,
"step": 664
},
{
"epoch": 0.3890737558050243,
"grad_norm": 0.039602491788351755,
"learning_rate": 7.348209101755012e-06,
"loss": 0.8479,
"step": 665
},
{
"epoch": 0.3896588291220244,
"grad_norm": 0.04077290004316095,
"learning_rate": 7.346308995453956e-06,
"loss": 0.8511,
"step": 666
},
{
"epoch": 0.3902439024390244,
"grad_norm": 0.044595088605505394,
"learning_rate": 7.344406400101358e-06,
"loss": 0.8121,
"step": 667
},
{
"epoch": 0.39082897575602443,
"grad_norm": 0.04520053665259006,
"learning_rate": 7.342501317304538e-06,
"loss": 0.8916,
"step": 668
},
{
"epoch": 0.39141404907302446,
"grad_norm": 0.04048302305644283,
"learning_rate": 7.340593748672915e-06,
"loss": 0.9092,
"step": 669
},
{
"epoch": 0.3919991223900245,
"grad_norm": 0.04037233961111267,
"learning_rate": 7.33868369581801e-06,
"loss": 0.881,
"step": 670
},
{
"epoch": 0.3925841957070245,
"grad_norm": 0.05051075945293654,
"learning_rate": 7.336771160353441e-06,
"loss": 0.8109,
"step": 671
},
{
"epoch": 0.39316926902402455,
"grad_norm": 0.049551953529107955,
"learning_rate": 7.334856143894927e-06,
"loss": 0.9017,
"step": 672
},
{
"epoch": 0.39375434234102463,
"grad_norm": 0.03993075169550292,
"learning_rate": 7.332938648060276e-06,
"loss": 0.8542,
"step": 673
},
{
"epoch": 0.39433941565802466,
"grad_norm": 0.038451189858571745,
"learning_rate": 7.331018674469396e-06,
"loss": 0.8112,
"step": 674
},
{
"epoch": 0.3949244889750247,
"grad_norm": 0.0505499896286193,
"learning_rate": 7.329096224744291e-06,
"loss": 0.8813,
"step": 675
},
{
"epoch": 0.3955095622920247,
"grad_norm": 0.071098260548373,
"learning_rate": 7.3271713005090494e-06,
"loss": 0.7823,
"step": 676
},
{
"epoch": 0.39609463560902475,
"grad_norm": 0.039554913835090445,
"learning_rate": 7.325243903389853e-06,
"loss": 0.8741,
"step": 677
},
{
"epoch": 0.3966797089260248,
"grad_norm": 0.03860100123135944,
"learning_rate": 7.323314035014974e-06,
"loss": 0.8042,
"step": 678
},
{
"epoch": 0.3972647822430248,
"grad_norm": 0.043534126452199624,
"learning_rate": 7.321381697014771e-06,
"loss": 0.8062,
"step": 679
},
{
"epoch": 0.3978498555600249,
"grad_norm": 0.04429511697906778,
"learning_rate": 7.319446891021693e-06,
"loss": 0.8726,
"step": 680
},
{
"epoch": 0.3984349288770249,
"grad_norm": 0.03812425577668118,
"learning_rate": 7.317509618670267e-06,
"loss": 0.8136,
"step": 681
},
{
"epoch": 0.39902000219402495,
"grad_norm": 0.07345134546444418,
"learning_rate": 7.315569881597106e-06,
"loss": 0.8483,
"step": 682
},
{
"epoch": 0.399605075511025,
"grad_norm": 0.038808207078641584,
"learning_rate": 7.313627681440909e-06,
"loss": 0.9122,
"step": 683
},
{
"epoch": 0.400190148828025,
"grad_norm": 0.03882381554371386,
"learning_rate": 7.311683019842453e-06,
"loss": 0.8767,
"step": 684
},
{
"epoch": 0.40077522214502503,
"grad_norm": 0.09532542796696841,
"learning_rate": 7.309735898444593e-06,
"loss": 0.817,
"step": 685
},
{
"epoch": 0.40136029546202506,
"grad_norm": 0.0401829342578114,
"learning_rate": 7.307786318892265e-06,
"loss": 0.9071,
"step": 686
},
{
"epoch": 0.40194536877902515,
"grad_norm": 0.03903463743661068,
"learning_rate": 7.305834282832478e-06,
"loss": 0.9161,
"step": 687
},
{
"epoch": 0.4025304420960252,
"grad_norm": 0.03813349508001005,
"learning_rate": 7.303879791914321e-06,
"loss": 0.7688,
"step": 688
},
{
"epoch": 0.4031155154130252,
"grad_norm": 0.03801357430065186,
"learning_rate": 7.301922847788953e-06,
"loss": 0.7473,
"step": 689
},
{
"epoch": 0.40370058873002523,
"grad_norm": 0.0391358430740653,
"learning_rate": 7.299963452109607e-06,
"loss": 0.84,
"step": 690
},
{
"epoch": 0.40428566204702526,
"grad_norm": 0.04237700699339236,
"learning_rate": 7.298001606531588e-06,
"loss": 0.7707,
"step": 691
},
{
"epoch": 0.4048707353640253,
"grad_norm": 0.04484671718594054,
"learning_rate": 7.296037312712267e-06,
"loss": 0.8456,
"step": 692
},
{
"epoch": 0.4054558086810253,
"grad_norm": 0.04025195813747376,
"learning_rate": 7.2940705723110895e-06,
"loss": 0.8882,
"step": 693
},
{
"epoch": 0.4060408819980254,
"grad_norm": 0.2399660293834239,
"learning_rate": 7.292101386989561e-06,
"loss": 0.8086,
"step": 694
},
{
"epoch": 0.40662595531502543,
"grad_norm": 0.03485270510298816,
"learning_rate": 7.290129758411258e-06,
"loss": 0.8997,
"step": 695
},
{
"epoch": 0.40721102863202546,
"grad_norm": 0.03819929313036679,
"learning_rate": 7.288155688241819e-06,
"loss": 0.9212,
"step": 696
},
{
"epoch": 0.4077961019490255,
"grad_norm": 0.03510738864060292,
"learning_rate": 7.286179178148942e-06,
"loss": 0.8754,
"step": 697
},
{
"epoch": 0.4083811752660255,
"grad_norm": 0.043632341643109356,
"learning_rate": 7.284200229802391e-06,
"loss": 0.7717,
"step": 698
},
{
"epoch": 0.40896624858302555,
"grad_norm": 0.04089651755929757,
"learning_rate": 7.28221884487399e-06,
"loss": 0.88,
"step": 699
},
{
"epoch": 0.4095513219000256,
"grad_norm": 0.037780685942199126,
"learning_rate": 7.280235025037616e-06,
"loss": 0.8197,
"step": 700
},
{
"epoch": 0.4101363952170256,
"grad_norm": 0.03655073966270369,
"learning_rate": 7.27824877196921e-06,
"loss": 0.8279,
"step": 701
},
{
"epoch": 0.4107214685340257,
"grad_norm": 0.05600910511927575,
"learning_rate": 7.2762600873467624e-06,
"loss": 0.8476,
"step": 702
},
{
"epoch": 0.4113065418510257,
"grad_norm": 0.034951743393564536,
"learning_rate": 7.274268972850321e-06,
"loss": 0.7609,
"step": 703
},
{
"epoch": 0.41189161516802575,
"grad_norm": 0.09106547558232417,
"learning_rate": 7.272275430161988e-06,
"loss": 0.8996,
"step": 704
},
{
"epoch": 0.4124766884850258,
"grad_norm": 0.051359026093647085,
"learning_rate": 7.270279460965912e-06,
"loss": 0.9052,
"step": 705
},
{
"epoch": 0.4130617618020258,
"grad_norm": 0.03905089647983441,
"learning_rate": 7.268281066948296e-06,
"loss": 0.918,
"step": 706
},
{
"epoch": 0.41364683511902584,
"grad_norm": 0.04206925775516373,
"learning_rate": 7.2662802497973875e-06,
"loss": 0.8053,
"step": 707
},
{
"epoch": 0.41423190843602586,
"grad_norm": 0.040474218310822804,
"learning_rate": 7.264277011203488e-06,
"loss": 0.891,
"step": 708
},
{
"epoch": 0.41481698175302595,
"grad_norm": 0.0442335071782456,
"learning_rate": 7.262271352858936e-06,
"loss": 0.8593,
"step": 709
},
{
"epoch": 0.415402055070026,
"grad_norm": 0.047122148951545366,
"learning_rate": 7.26026327645812e-06,
"loss": 0.844,
"step": 710
},
{
"epoch": 0.415987128387026,
"grad_norm": 0.03850054120717149,
"learning_rate": 7.258252783697469e-06,
"loss": 0.7795,
"step": 711
},
{
"epoch": 0.41657220170402603,
"grad_norm": 0.050566843036294336,
"learning_rate": 7.2562398762754554e-06,
"loss": 0.7794,
"step": 712
},
{
"epoch": 0.41715727502102606,
"grad_norm": 0.058106581796564256,
"learning_rate": 7.254224555892587e-06,
"loss": 0.9735,
"step": 713
},
{
"epoch": 0.4177423483380261,
"grad_norm": 0.03831983478639148,
"learning_rate": 7.252206824251416e-06,
"loss": 0.7832,
"step": 714
},
{
"epoch": 0.4183274216550261,
"grad_norm": 0.0431158884889512,
"learning_rate": 7.250186683056527e-06,
"loss": 0.7865,
"step": 715
},
{
"epoch": 0.4189124949720262,
"grad_norm": 0.046081855255802225,
"learning_rate": 7.248164134014544e-06,
"loss": 0.8512,
"step": 716
},
{
"epoch": 0.41949756828902623,
"grad_norm": 0.04502443182272067,
"learning_rate": 7.246139178834119e-06,
"loss": 0.853,
"step": 717
},
{
"epoch": 0.42008264160602626,
"grad_norm": 0.045461255479903595,
"learning_rate": 7.244111819225946e-06,
"loss": 0.8262,
"step": 718
},
{
"epoch": 0.4206677149230263,
"grad_norm": 0.038558457568846585,
"learning_rate": 7.24208205690274e-06,
"loss": 0.7975,
"step": 719
},
{
"epoch": 0.4212527882400263,
"grad_norm": 0.04654917549136812,
"learning_rate": 7.240049893579256e-06,
"loss": 0.8849,
"step": 720
},
{
"epoch": 0.42183786155702635,
"grad_norm": 0.04307894480353439,
"learning_rate": 7.238015330972268e-06,
"loss": 0.805,
"step": 721
},
{
"epoch": 0.4224229348740264,
"grad_norm": 0.04281636756195477,
"learning_rate": 7.235978370800583e-06,
"loss": 0.8471,
"step": 722
},
{
"epoch": 0.42300800819102646,
"grad_norm": 0.04507551309506065,
"learning_rate": 7.233939014785032e-06,
"loss": 0.8468,
"step": 723
},
{
"epoch": 0.4235930815080265,
"grad_norm": 0.06918610285412721,
"learning_rate": 7.2318972646484685e-06,
"loss": 0.8655,
"step": 724
},
{
"epoch": 0.4241781548250265,
"grad_norm": 0.04721791843997281,
"learning_rate": 7.229853122115772e-06,
"loss": 0.7927,
"step": 725
},
{
"epoch": 0.42476322814202655,
"grad_norm": 0.041834283551617495,
"learning_rate": 7.227806588913838e-06,
"loss": 0.8712,
"step": 726
},
{
"epoch": 0.4253483014590266,
"grad_norm": 0.05505118490346748,
"learning_rate": 7.225757666771585e-06,
"loss": 0.8584,
"step": 727
},
{
"epoch": 0.4259333747760266,
"grad_norm": 0.047773194756069906,
"learning_rate": 7.223706357419951e-06,
"loss": 0.7893,
"step": 728
},
{
"epoch": 0.42651844809302664,
"grad_norm": 0.04822106540139186,
"learning_rate": 7.221652662591887e-06,
"loss": 0.8277,
"step": 729
},
{
"epoch": 0.4271035214100267,
"grad_norm": 0.03969595772236796,
"learning_rate": 7.219596584022363e-06,
"loss": 0.8394,
"step": 730
},
{
"epoch": 0.42768859472702675,
"grad_norm": 0.0422480401855194,
"learning_rate": 7.217538123448359e-06,
"loss": 0.8094,
"step": 731
},
{
"epoch": 0.4282736680440268,
"grad_norm": 0.0394391284081235,
"learning_rate": 7.215477282608871e-06,
"loss": 0.8847,
"step": 732
},
{
"epoch": 0.4288587413610268,
"grad_norm": 0.05956332380443304,
"learning_rate": 7.213414063244903e-06,
"loss": 0.8427,
"step": 733
},
{
"epoch": 0.42944381467802684,
"grad_norm": 0.03974032496622027,
"learning_rate": 7.21134846709947e-06,
"loss": 0.7986,
"step": 734
},
{
"epoch": 0.43002888799502687,
"grad_norm": 0.061979287274211155,
"learning_rate": 7.209280495917594e-06,
"loss": 0.8178,
"step": 735
},
{
"epoch": 0.4306139613120269,
"grad_norm": 0.04099124105139921,
"learning_rate": 7.2072101514463045e-06,
"loss": 0.7936,
"step": 736
},
{
"epoch": 0.4311990346290269,
"grad_norm": 0.04101203671080088,
"learning_rate": 7.205137435434634e-06,
"loss": 0.8607,
"step": 737
},
{
"epoch": 0.431784107946027,
"grad_norm": 0.04063679329152861,
"learning_rate": 7.203062349633622e-06,
"loss": 0.8066,
"step": 738
},
{
"epoch": 0.43236918126302704,
"grad_norm": 0.045567608816787585,
"learning_rate": 7.200984895796305e-06,
"loss": 0.8558,
"step": 739
},
{
"epoch": 0.43295425458002706,
"grad_norm": 0.0514210080543997,
"learning_rate": 7.198905075677726e-06,
"loss": 0.7855,
"step": 740
},
{
"epoch": 0.4335393278970271,
"grad_norm": 0.0404180379498117,
"learning_rate": 7.196822891034922e-06,
"loss": 0.9028,
"step": 741
},
{
"epoch": 0.4341244012140271,
"grad_norm": 0.03964126441343688,
"learning_rate": 7.1947383436269295e-06,
"loss": 0.872,
"step": 742
},
{
"epoch": 0.43470947453102715,
"grad_norm": 0.04863222203139086,
"learning_rate": 7.192651435214781e-06,
"loss": 0.9288,
"step": 743
},
{
"epoch": 0.4352945478480272,
"grad_norm": 0.04412474170590896,
"learning_rate": 7.190562167561505e-06,
"loss": 0.836,
"step": 744
},
{
"epoch": 0.43587962116502726,
"grad_norm": 0.16048273545352948,
"learning_rate": 7.188470542432119e-06,
"loss": 0.7639,
"step": 745
},
{
"epoch": 0.4364646944820273,
"grad_norm": 0.04767484164607536,
"learning_rate": 7.1863765615936375e-06,
"loss": 0.8481,
"step": 746
},
{
"epoch": 0.4370497677990273,
"grad_norm": 0.06989126022408862,
"learning_rate": 7.184280226815061e-06,
"loss": 0.8569,
"step": 747
},
{
"epoch": 0.43763484111602735,
"grad_norm": 0.041900678395983416,
"learning_rate": 7.18218153986738e-06,
"loss": 0.931,
"step": 748
},
{
"epoch": 0.4382199144330274,
"grad_norm": 0.04216846873986442,
"learning_rate": 7.180080502523572e-06,
"loss": 0.8646,
"step": 749
},
{
"epoch": 0.4388049877500274,
"grad_norm": 0.050264349955050885,
"learning_rate": 7.177977116558601e-06,
"loss": 0.8199,
"step": 750
},
{
"epoch": 0.43939006106702744,
"grad_norm": 0.03843848162959958,
"learning_rate": 7.175871383749415e-06,
"loss": 0.8097,
"step": 751
},
{
"epoch": 0.4399751343840275,
"grad_norm": 0.05545552486681674,
"learning_rate": 7.173763305874942e-06,
"loss": 0.9036,
"step": 752
},
{
"epoch": 0.44056020770102755,
"grad_norm": 0.040372444436672136,
"learning_rate": 7.1716528847160944e-06,
"loss": 0.7861,
"step": 753
},
{
"epoch": 0.4411452810180276,
"grad_norm": 0.03823936143241982,
"learning_rate": 7.169540122055764e-06,
"loss": 0.7976,
"step": 754
},
{
"epoch": 0.4417303543350276,
"grad_norm": 0.040595268808395715,
"learning_rate": 7.167425019678817e-06,
"loss": 0.8007,
"step": 755
},
{
"epoch": 0.44231542765202764,
"grad_norm": 0.04580612084615541,
"learning_rate": 7.1653075793721e-06,
"loss": 0.741,
"step": 756
},
{
"epoch": 0.44290050096902767,
"grad_norm": 0.044516769340694984,
"learning_rate": 7.163187802924435e-06,
"loss": 0.911,
"step": 757
},
{
"epoch": 0.4434855742860277,
"grad_norm": 0.04502988747564978,
"learning_rate": 7.161065692126614e-06,
"loss": 0.8775,
"step": 758
},
{
"epoch": 0.4440706476030278,
"grad_norm": 0.041288645539491076,
"learning_rate": 7.1589412487714055e-06,
"loss": 0.8393,
"step": 759
},
{
"epoch": 0.4446557209200278,
"grad_norm": 0.04219916867709284,
"learning_rate": 7.156814474653542e-06,
"loss": 0.8203,
"step": 760
},
{
"epoch": 0.44524079423702784,
"grad_norm": 0.04510176879579754,
"learning_rate": 7.154685371569736e-06,
"loss": 0.7882,
"step": 761
},
{
"epoch": 0.44582586755402787,
"grad_norm": 0.03956297558604167,
"learning_rate": 7.152553941318655e-06,
"loss": 0.8313,
"step": 762
},
{
"epoch": 0.4464109408710279,
"grad_norm": 0.03781666670962381,
"learning_rate": 7.15042018570094e-06,
"loss": 0.9057,
"step": 763
},
{
"epoch": 0.4469960141880279,
"grad_norm": 0.04278185971588543,
"learning_rate": 7.148284106519195e-06,
"loss": 0.842,
"step": 764
},
{
"epoch": 0.44758108750502795,
"grad_norm": 0.03726277158845805,
"learning_rate": 7.1461457055779875e-06,
"loss": 0.8003,
"step": 765
},
{
"epoch": 0.44816616082202804,
"grad_norm": 0.044396475499465786,
"learning_rate": 7.144004984683844e-06,
"loss": 0.8393,
"step": 766
},
{
"epoch": 0.44875123413902807,
"grad_norm": 0.03484609580213948,
"learning_rate": 7.141861945645254e-06,
"loss": 0.8255,
"step": 767
},
{
"epoch": 0.4493363074560281,
"grad_norm": 0.04524092857108677,
"learning_rate": 7.139716590272663e-06,
"loss": 0.7811,
"step": 768
},
{
"epoch": 0.4499213807730281,
"grad_norm": 0.03734702819768394,
"learning_rate": 7.1375689203784755e-06,
"loss": 0.8644,
"step": 769
},
{
"epoch": 0.45050645409002815,
"grad_norm": 0.05019144085295421,
"learning_rate": 7.135418937777049e-06,
"loss": 0.9044,
"step": 770
},
{
"epoch": 0.4510915274070282,
"grad_norm": 0.04029746203545264,
"learning_rate": 7.133266644284696e-06,
"loss": 0.7769,
"step": 771
},
{
"epoch": 0.4516766007240282,
"grad_norm": 0.04262838461849833,
"learning_rate": 7.131112041719681e-06,
"loss": 0.9143,
"step": 772
},
{
"epoch": 0.45226167404102824,
"grad_norm": 0.043876149904780545,
"learning_rate": 7.1289551319022195e-06,
"loss": 0.8828,
"step": 773
},
{
"epoch": 0.4528467473580283,
"grad_norm": 0.04711071987260168,
"learning_rate": 7.126795916654477e-06,
"loss": 0.7762,
"step": 774
},
{
"epoch": 0.45343182067502835,
"grad_norm": 0.04116549252293238,
"learning_rate": 7.124634397800565e-06,
"loss": 0.7778,
"step": 775
},
{
"epoch": 0.4540168939920284,
"grad_norm": 0.0393006386390472,
"learning_rate": 7.1224705771665405e-06,
"loss": 0.8465,
"step": 776
},
{
"epoch": 0.4546019673090284,
"grad_norm": 0.03826784736833335,
"learning_rate": 7.120304456580408e-06,
"loss": 0.8359,
"step": 777
},
{
"epoch": 0.45518704062602844,
"grad_norm": 0.04985454473324124,
"learning_rate": 7.118136037872112e-06,
"loss": 0.8552,
"step": 778
},
{
"epoch": 0.45577211394302847,
"grad_norm": 0.04012812213905606,
"learning_rate": 7.115965322873541e-06,
"loss": 0.8249,
"step": 779
},
{
"epoch": 0.4563571872600285,
"grad_norm": 0.03736935441616661,
"learning_rate": 7.113792313418522e-06,
"loss": 0.8399,
"step": 780
},
{
"epoch": 0.4569422605770286,
"grad_norm": 0.036669437710784104,
"learning_rate": 7.1116170113428194e-06,
"loss": 0.79,
"step": 781
},
{
"epoch": 0.4575273338940286,
"grad_norm": 0.03882415136861196,
"learning_rate": 7.109439418484137e-06,
"loss": 0.8016,
"step": 782
},
{
"epoch": 0.45811240721102864,
"grad_norm": 0.0452764493144211,
"learning_rate": 7.107259536682111e-06,
"loss": 0.8138,
"step": 783
},
{
"epoch": 0.45869748052802867,
"grad_norm": 0.04729394522582173,
"learning_rate": 7.105077367778313e-06,
"loss": 0.7596,
"step": 784
},
{
"epoch": 0.4592825538450287,
"grad_norm": 0.05019146434388651,
"learning_rate": 7.102892913616248e-06,
"loss": 0.8015,
"step": 785
},
{
"epoch": 0.4598676271620287,
"grad_norm": 0.039099244411788574,
"learning_rate": 7.100706176041348e-06,
"loss": 0.8098,
"step": 786
},
{
"epoch": 0.46045270047902875,
"grad_norm": 0.03895433813179543,
"learning_rate": 7.098517156900978e-06,
"loss": 0.8851,
"step": 787
},
{
"epoch": 0.46103777379602884,
"grad_norm": 0.03731969942911145,
"learning_rate": 7.096325858044427e-06,
"loss": 0.7721,
"step": 788
},
{
"epoch": 0.46162284711302887,
"grad_norm": 0.04097182225913861,
"learning_rate": 7.094132281322912e-06,
"loss": 0.8223,
"step": 789
},
{
"epoch": 0.4622079204300289,
"grad_norm": 0.03696891399238777,
"learning_rate": 7.091936428589576e-06,
"loss": 0.8938,
"step": 790
},
{
"epoch": 0.4627929937470289,
"grad_norm": 0.03792585935917287,
"learning_rate": 7.089738301699479e-06,
"loss": 0.8393,
"step": 791
},
{
"epoch": 0.46337806706402895,
"grad_norm": 0.10218168425909542,
"learning_rate": 7.087537902509607e-06,
"loss": 0.8016,
"step": 792
},
{
"epoch": 0.463963140381029,
"grad_norm": 0.03989943757222258,
"learning_rate": 7.085335232878865e-06,
"loss": 0.7431,
"step": 793
},
{
"epoch": 0.464548213698029,
"grad_norm": 0.03900627784304605,
"learning_rate": 7.083130294668076e-06,
"loss": 0.8431,
"step": 794
},
{
"epoch": 0.4651332870150291,
"grad_norm": 0.05897148270799823,
"learning_rate": 7.080923089739978e-06,
"loss": 0.83,
"step": 795
},
{
"epoch": 0.4657183603320291,
"grad_norm": 0.03888063452127548,
"learning_rate": 7.078713619959228e-06,
"loss": 0.7915,
"step": 796
},
{
"epoch": 0.46630343364902915,
"grad_norm": 0.04604225990759911,
"learning_rate": 7.076501887192387e-06,
"loss": 0.8419,
"step": 797
},
{
"epoch": 0.4668885069660292,
"grad_norm": 0.040295564919243125,
"learning_rate": 7.074287893307941e-06,
"loss": 0.8764,
"step": 798
},
{
"epoch": 0.4674735802830292,
"grad_norm": 0.03769433658091701,
"learning_rate": 7.072071640176274e-06,
"loss": 0.8566,
"step": 799
},
{
"epoch": 0.46805865360002924,
"grad_norm": 0.03728814454795478,
"learning_rate": 7.069853129669688e-06,
"loss": 0.771,
"step": 800
},
{
"epoch": 0.46864372691702927,
"grad_norm": 0.03547834831253728,
"learning_rate": 7.067632363662386e-06,
"loss": 0.7874,
"step": 801
},
{
"epoch": 0.46922880023402935,
"grad_norm": 0.04116629646593246,
"learning_rate": 7.065409344030479e-06,
"loss": 0.8579,
"step": 802
},
{
"epoch": 0.4698138735510294,
"grad_norm": 0.04284181743182457,
"learning_rate": 7.063184072651981e-06,
"loss": 0.874,
"step": 803
},
{
"epoch": 0.4703989468680294,
"grad_norm": 0.5453076047222728,
"learning_rate": 7.060956551406807e-06,
"loss": 0.8275,
"step": 804
},
{
"epoch": 0.47098402018502944,
"grad_norm": 0.04285257240888046,
"learning_rate": 7.058726782176778e-06,
"loss": 0.8748,
"step": 805
},
{
"epoch": 0.47156909350202947,
"grad_norm": 0.0426593418584649,
"learning_rate": 7.056494766845606e-06,
"loss": 0.805,
"step": 806
},
{
"epoch": 0.4721541668190295,
"grad_norm": 0.03749564999202563,
"learning_rate": 7.05426050729891e-06,
"loss": 0.8289,
"step": 807
},
{
"epoch": 0.4727392401360295,
"grad_norm": 0.044556182570763894,
"learning_rate": 7.052024005424194e-06,
"loss": 0.8757,
"step": 808
},
{
"epoch": 0.4733243134530296,
"grad_norm": 0.04281051498943111,
"learning_rate": 7.049785263110867e-06,
"loss": 0.8838,
"step": 809
},
{
"epoch": 0.47390938677002964,
"grad_norm": 0.04188599336409731,
"learning_rate": 7.047544282250223e-06,
"loss": 0.8371,
"step": 810
},
{
"epoch": 0.47449446008702967,
"grad_norm": 0.04079367612711138,
"learning_rate": 7.045301064735451e-06,
"loss": 0.8383,
"step": 811
},
{
"epoch": 0.4750795334040297,
"grad_norm": 0.037411483434431944,
"learning_rate": 7.0430556124616294e-06,
"loss": 0.7866,
"step": 812
},
{
"epoch": 0.4756646067210297,
"grad_norm": 0.0497395887833391,
"learning_rate": 7.040807927325723e-06,
"loss": 0.836,
"step": 813
},
{
"epoch": 0.47624968003802975,
"grad_norm": 0.039340018189295584,
"learning_rate": 7.038558011226583e-06,
"loss": 0.7925,
"step": 814
},
{
"epoch": 0.4768347533550298,
"grad_norm": 0.04067266509967937,
"learning_rate": 7.036305866064947e-06,
"loss": 0.8246,
"step": 815
},
{
"epoch": 0.4774198266720298,
"grad_norm": 0.03756080114515082,
"learning_rate": 7.0340514937434334e-06,
"loss": 0.8091,
"step": 816
},
{
"epoch": 0.4780048999890299,
"grad_norm": 0.04009793142877897,
"learning_rate": 7.031794896166544e-06,
"loss": 0.8367,
"step": 817
},
{
"epoch": 0.4785899733060299,
"grad_norm": 0.045719600380512315,
"learning_rate": 7.029536075240659e-06,
"loss": 0.8698,
"step": 818
},
{
"epoch": 0.47917504662302995,
"grad_norm": 0.06146708297478866,
"learning_rate": 7.0272750328740394e-06,
"loss": 0.7769,
"step": 819
},
{
"epoch": 0.47976011994003,
"grad_norm": 0.0379549936030894,
"learning_rate": 7.025011770976821e-06,
"loss": 0.8307,
"step": 820
},
{
"epoch": 0.48034519325703,
"grad_norm": 0.046629898254568806,
"learning_rate": 7.022746291461013e-06,
"loss": 0.8296,
"step": 821
},
{
"epoch": 0.48093026657403004,
"grad_norm": 0.04903726722961758,
"learning_rate": 7.020478596240503e-06,
"loss": 0.8578,
"step": 822
},
{
"epoch": 0.48151533989103007,
"grad_norm": 0.044482172504589565,
"learning_rate": 7.018208687231045e-06,
"loss": 0.8339,
"step": 823
},
{
"epoch": 0.48210041320803015,
"grad_norm": 0.04652606756149258,
"learning_rate": 7.015936566350267e-06,
"loss": 0.8629,
"step": 824
},
{
"epoch": 0.4826854865250302,
"grad_norm": 0.03952788329765694,
"learning_rate": 7.013662235517661e-06,
"loss": 0.8851,
"step": 825
},
{
"epoch": 0.4832705598420302,
"grad_norm": 0.04737073432575187,
"learning_rate": 7.011385696654594e-06,
"loss": 0.8662,
"step": 826
},
{
"epoch": 0.48385563315903024,
"grad_norm": 0.03751881453871839,
"learning_rate": 7.0091069516842915e-06,
"loss": 0.8559,
"step": 827
},
{
"epoch": 0.48444070647603027,
"grad_norm": 0.03716116535695275,
"learning_rate": 7.006826002531843e-06,
"loss": 0.7718,
"step": 828
},
{
"epoch": 0.4850257797930303,
"grad_norm": 0.056266031939443406,
"learning_rate": 7.004542851124203e-06,
"loss": 0.7714,
"step": 829
},
{
"epoch": 0.4856108531100303,
"grad_norm": 0.05535556673447491,
"learning_rate": 7.0022574993901865e-06,
"loss": 0.8676,
"step": 830
},
{
"epoch": 0.4861959264270304,
"grad_norm": 0.03845486724104416,
"learning_rate": 6.999969949260464e-06,
"loss": 0.7885,
"step": 831
},
{
"epoch": 0.48678099974403044,
"grad_norm": 0.04062056962499875,
"learning_rate": 6.99768020266757e-06,
"loss": 0.8322,
"step": 832
},
{
"epoch": 0.48736607306103047,
"grad_norm": 0.06424079693979534,
"learning_rate": 6.995388261545884e-06,
"loss": 0.905,
"step": 833
},
{
"epoch": 0.4879511463780305,
"grad_norm": 0.04236693639090112,
"learning_rate": 6.993094127831649e-06,
"loss": 0.8726,
"step": 834
},
{
"epoch": 0.4885362196950305,
"grad_norm": 0.038818701632739046,
"learning_rate": 6.990797803462955e-06,
"loss": 0.7753,
"step": 835
},
{
"epoch": 0.48912129301203056,
"grad_norm": 0.05513690717963214,
"learning_rate": 6.988499290379746e-06,
"loss": 0.888,
"step": 836
},
{
"epoch": 0.4897063663290306,
"grad_norm": 0.040849030445044794,
"learning_rate": 6.986198590523812e-06,
"loss": 0.8466,
"step": 837
},
{
"epoch": 0.49029143964603067,
"grad_norm": 0.040919168417802985,
"learning_rate": 6.983895705838793e-06,
"loss": 0.8589,
"step": 838
},
{
"epoch": 0.4908765129630307,
"grad_norm": 0.038496795188997325,
"learning_rate": 6.9815906382701725e-06,
"loss": 0.862,
"step": 839
},
{
"epoch": 0.4914615862800307,
"grad_norm": 0.043613843698726576,
"learning_rate": 6.97928338976528e-06,
"loss": 0.8702,
"step": 840
},
{
"epoch": 0.49204665959703076,
"grad_norm": 0.03941143092937186,
"learning_rate": 6.9769739622732855e-06,
"loss": 0.7831,
"step": 841
},
{
"epoch": 0.4926317329140308,
"grad_norm": 0.06878143755782574,
"learning_rate": 6.974662357745203e-06,
"loss": 0.9225,
"step": 842
},
{
"epoch": 0.4932168062310308,
"grad_norm": 0.05850718491585764,
"learning_rate": 6.972348578133881e-06,
"loss": 0.8781,
"step": 843
},
{
"epoch": 0.49380187954803084,
"grad_norm": 0.039118395131995005,
"learning_rate": 6.9700326253940095e-06,
"loss": 0.7985,
"step": 844
},
{
"epoch": 0.4943869528650309,
"grad_norm": 0.057835827757031166,
"learning_rate": 6.967714501482114e-06,
"loss": 0.781,
"step": 845
},
{
"epoch": 0.49497202618203096,
"grad_norm": 0.05588852958063585,
"learning_rate": 6.965394208356551e-06,
"loss": 0.8423,
"step": 846
},
{
"epoch": 0.495557099499031,
"grad_norm": 0.039374530415438855,
"learning_rate": 6.9630717479775145e-06,
"loss": 0.8456,
"step": 847
},
{
"epoch": 0.496142172816031,
"grad_norm": 0.0358193083222334,
"learning_rate": 6.960747122307025e-06,
"loss": 0.7992,
"step": 848
},
{
"epoch": 0.49672724613303104,
"grad_norm": 0.03788661691723569,
"learning_rate": 6.9584203333089325e-06,
"loss": 0.8037,
"step": 849
},
{
"epoch": 0.49731231945003107,
"grad_norm": 0.04201649774003793,
"learning_rate": 6.956091382948918e-06,
"loss": 0.8882,
"step": 850
},
{
"epoch": 0.4978973927670311,
"grad_norm": 0.04434150316820075,
"learning_rate": 6.953760273194487e-06,
"loss": 0.8166,
"step": 851
},
{
"epoch": 0.49848246608403113,
"grad_norm": 0.07426388002691353,
"learning_rate": 6.951427006014967e-06,
"loss": 0.8424,
"step": 852
},
{
"epoch": 0.4990675394010312,
"grad_norm": 0.037544290713932796,
"learning_rate": 6.949091583381511e-06,
"loss": 0.7957,
"step": 853
},
{
"epoch": 0.49965261271803124,
"grad_norm": 0.05489041840699949,
"learning_rate": 6.946754007267091e-06,
"loss": 0.9257,
"step": 854
},
{
"epoch": 0.5002376860350313,
"grad_norm": 0.042595883415080985,
"learning_rate": 6.944414279646499e-06,
"loss": 0.8277,
"step": 855
},
{
"epoch": 0.5008227593520314,
"grad_norm": 0.08019312917038988,
"learning_rate": 6.942072402496345e-06,
"loss": 0.9115,
"step": 856
},
{
"epoch": 0.5014078326690313,
"grad_norm": 0.05086333614690917,
"learning_rate": 6.9397283777950545e-06,
"loss": 0.9041,
"step": 857
},
{
"epoch": 0.5019929059860314,
"grad_norm": 0.04214362484163065,
"learning_rate": 6.937382207522867e-06,
"loss": 0.8299,
"step": 858
},
{
"epoch": 0.5025779793030314,
"grad_norm": 0.04083088848742396,
"learning_rate": 6.935033893661835e-06,
"loss": 0.8356,
"step": 859
},
{
"epoch": 0.5031630526200315,
"grad_norm": 0.04687863044141472,
"learning_rate": 6.932683438195821e-06,
"loss": 0.865,
"step": 860
},
{
"epoch": 0.5037481259370314,
"grad_norm": 0.04280142790393131,
"learning_rate": 6.9303308431105e-06,
"loss": 0.8862,
"step": 861
},
{
"epoch": 0.5043331992540315,
"grad_norm": 0.04180226845557804,
"learning_rate": 6.92797611039335e-06,
"loss": 0.8046,
"step": 862
},
{
"epoch": 0.5049182725710316,
"grad_norm": 0.035262278854158474,
"learning_rate": 6.925619242033656e-06,
"loss": 0.8197,
"step": 863
},
{
"epoch": 0.5055033458880316,
"grad_norm": 0.03754079468139451,
"learning_rate": 6.92326024002251e-06,
"loss": 0.8101,
"step": 864
},
{
"epoch": 0.5060884192050317,
"grad_norm": 0.039581024469053656,
"learning_rate": 6.9208991063528045e-06,
"loss": 0.8607,
"step": 865
},
{
"epoch": 0.5066734925220316,
"grad_norm": 0.0371798589671245,
"learning_rate": 6.918535843019233e-06,
"loss": 0.8102,
"step": 866
},
{
"epoch": 0.5072585658390317,
"grad_norm": 0.037768636271352095,
"learning_rate": 6.916170452018288e-06,
"loss": 0.8418,
"step": 867
},
{
"epoch": 0.5078436391560317,
"grad_norm": 0.10093759593186688,
"learning_rate": 6.913802935348258e-06,
"loss": 0.8629,
"step": 868
},
{
"epoch": 0.5084287124730318,
"grad_norm": 0.03663052824269648,
"learning_rate": 6.911433295009232e-06,
"loss": 0.8162,
"step": 869
},
{
"epoch": 0.5090137857900318,
"grad_norm": 0.04855772810380649,
"learning_rate": 6.909061533003088e-06,
"loss": 0.8616,
"step": 870
},
{
"epoch": 0.5095988591070318,
"grad_norm": 0.043219673612599416,
"learning_rate": 6.906687651333498e-06,
"loss": 0.7216,
"step": 871
},
{
"epoch": 0.5101839324240319,
"grad_norm": 0.042711861232268646,
"learning_rate": 6.904311652005925e-06,
"loss": 0.7547,
"step": 872
},
{
"epoch": 0.5107690057410319,
"grad_norm": 0.05574403716352006,
"learning_rate": 6.9019335370276225e-06,
"loss": 0.795,
"step": 873
},
{
"epoch": 0.511354079058032,
"grad_norm": 0.06264056674635722,
"learning_rate": 6.899553308407629e-06,
"loss": 0.85,
"step": 874
},
{
"epoch": 0.511939152375032,
"grad_norm": 0.04875354726056701,
"learning_rate": 6.89717096815677e-06,
"loss": 0.8407,
"step": 875
},
{
"epoch": 0.512524225692032,
"grad_norm": 0.038870649597744084,
"learning_rate": 6.894786518287653e-06,
"loss": 0.8076,
"step": 876
},
{
"epoch": 0.513109299009032,
"grad_norm": 0.05118848362688543,
"learning_rate": 6.8923999608146705e-06,
"loss": 0.8363,
"step": 877
},
{
"epoch": 0.5136943723260321,
"grad_norm": 0.0376112846207315,
"learning_rate": 6.890011297753994e-06,
"loss": 0.7743,
"step": 878
},
{
"epoch": 0.5142794456430322,
"grad_norm": 0.06896727299920731,
"learning_rate": 6.887620531123574e-06,
"loss": 0.8359,
"step": 879
},
{
"epoch": 0.5148645189600322,
"grad_norm": 0.03963337869499397,
"learning_rate": 6.885227662943136e-06,
"loss": 0.8206,
"step": 880
},
{
"epoch": 0.5154495922770322,
"grad_norm": 0.04870453554962347,
"learning_rate": 6.882832695234186e-06,
"loss": 0.8189,
"step": 881
},
{
"epoch": 0.5160346655940322,
"grad_norm": 0.03835351064840454,
"learning_rate": 6.880435630019998e-06,
"loss": 0.8973,
"step": 882
},
{
"epoch": 0.5166197389110323,
"grad_norm": 0.058777196577975345,
"learning_rate": 6.8780364693256224e-06,
"loss": 0.8733,
"step": 883
},
{
"epoch": 0.5172048122280323,
"grad_norm": 0.04023477251259275,
"learning_rate": 6.875635215177878e-06,
"loss": 0.7674,
"step": 884
},
{
"epoch": 0.5177898855450324,
"grad_norm": 0.10180465827282513,
"learning_rate": 6.873231869605351e-06,
"loss": 0.8238,
"step": 885
},
{
"epoch": 0.5183749588620324,
"grad_norm": 0.03697008145978301,
"learning_rate": 6.870826434638396e-06,
"loss": 0.8113,
"step": 886
},
{
"epoch": 0.5189600321790324,
"grad_norm": 0.03765250989596081,
"learning_rate": 6.868418912309133e-06,
"loss": 0.8409,
"step": 887
},
{
"epoch": 0.5195451054960325,
"grad_norm": 0.03863967736287724,
"learning_rate": 6.866009304651444e-06,
"loss": 0.7622,
"step": 888
},
{
"epoch": 0.5201301788130325,
"grad_norm": 0.04809769182496589,
"learning_rate": 6.8635976137009735e-06,
"loss": 0.838,
"step": 889
},
{
"epoch": 0.5207152521300326,
"grad_norm": 0.04589536972341525,
"learning_rate": 6.861183841495127e-06,
"loss": 0.9291,
"step": 890
},
{
"epoch": 0.5213003254470325,
"grad_norm": 0.04495994452664241,
"learning_rate": 6.858767990073066e-06,
"loss": 0.9015,
"step": 891
},
{
"epoch": 0.5218853987640326,
"grad_norm": 0.03736417738835852,
"learning_rate": 6.856350061475712e-06,
"loss": 0.7575,
"step": 892
},
{
"epoch": 0.5224704720810327,
"grad_norm": 0.039650470820273634,
"learning_rate": 6.853930057745735e-06,
"loss": 0.7939,
"step": 893
},
{
"epoch": 0.5230555453980327,
"grad_norm": 0.04072420592259055,
"learning_rate": 6.8515079809275656e-06,
"loss": 0.8026,
"step": 894
},
{
"epoch": 0.5236406187150328,
"grad_norm": 0.04022954757166665,
"learning_rate": 6.849083833067381e-06,
"loss": 0.7891,
"step": 895
},
{
"epoch": 0.5242256920320327,
"grad_norm": 0.04135409431135059,
"learning_rate": 6.846657616213109e-06,
"loss": 0.758,
"step": 896
},
{
"epoch": 0.5248107653490328,
"grad_norm": 0.03960521359539724,
"learning_rate": 6.844229332414427e-06,
"loss": 0.8038,
"step": 897
},
{
"epoch": 0.5253958386660328,
"grad_norm": 0.03765896158372491,
"learning_rate": 6.841798983722755e-06,
"loss": 0.7877,
"step": 898
},
{
"epoch": 0.5259809119830329,
"grad_norm": 0.058839179066597226,
"learning_rate": 6.839366572191262e-06,
"loss": 0.7523,
"step": 899
},
{
"epoch": 0.526565985300033,
"grad_norm": 0.040054239033310936,
"learning_rate": 6.836932099874856e-06,
"loss": 0.8321,
"step": 900
},
{
"epoch": 0.5271510586170329,
"grad_norm": 0.0627577307413516,
"learning_rate": 6.834495568830187e-06,
"loss": 0.8389,
"step": 901
},
{
"epoch": 0.527736131934033,
"grad_norm": 0.036542541933322875,
"learning_rate": 6.832056981115644e-06,
"loss": 0.8108,
"step": 902
},
{
"epoch": 0.528321205251033,
"grad_norm": 0.039969851332150676,
"learning_rate": 6.8296163387913545e-06,
"loss": 0.8144,
"step": 903
},
{
"epoch": 0.5289062785680331,
"grad_norm": 0.04091122066442238,
"learning_rate": 6.827173643919181e-06,
"loss": 0.8865,
"step": 904
},
{
"epoch": 0.529491351885033,
"grad_norm": 0.04077179519179353,
"learning_rate": 6.824728898562721e-06,
"loss": 0.8353,
"step": 905
},
{
"epoch": 0.5300764252020331,
"grad_norm": 0.045472282387591885,
"learning_rate": 6.822282104787305e-06,
"loss": 0.8175,
"step": 906
},
{
"epoch": 0.5306614985190332,
"grad_norm": 0.03919660004429457,
"learning_rate": 6.819833264659988e-06,
"loss": 0.7968,
"step": 907
},
{
"epoch": 0.5312465718360332,
"grad_norm": 0.0550475630156648,
"learning_rate": 6.81738238024956e-06,
"loss": 0.8005,
"step": 908
},
{
"epoch": 0.5318316451530333,
"grad_norm": 0.04044516211789697,
"learning_rate": 6.814929453626538e-06,
"loss": 0.8056,
"step": 909
},
{
"epoch": 0.5324167184700332,
"grad_norm": 0.04512920702041598,
"learning_rate": 6.81247448686316e-06,
"loss": 0.8061,
"step": 910
},
{
"epoch": 0.5330017917870333,
"grad_norm": 0.06440782132528662,
"learning_rate": 6.810017482033392e-06,
"loss": 0.8471,
"step": 911
},
{
"epoch": 0.5335868651040333,
"grad_norm": 0.03695317579916004,
"learning_rate": 6.8075584412129205e-06,
"loss": 0.8222,
"step": 912
},
{
"epoch": 0.5341719384210334,
"grad_norm": 0.03666660940321042,
"learning_rate": 6.805097366479148e-06,
"loss": 0.7822,
"step": 913
},
{
"epoch": 0.5347570117380335,
"grad_norm": 0.055445109178676184,
"learning_rate": 6.802634259911201e-06,
"loss": 0.844,
"step": 914
},
{
"epoch": 0.5353420850550334,
"grad_norm": 0.05279543181613761,
"learning_rate": 6.800169123589919e-06,
"loss": 0.7463,
"step": 915
},
{
"epoch": 0.5359271583720335,
"grad_norm": 0.041859624286024466,
"learning_rate": 6.797701959597859e-06,
"loss": 0.8604,
"step": 916
},
{
"epoch": 0.5365122316890335,
"grad_norm": 0.0383422427552055,
"learning_rate": 6.795232770019286e-06,
"loss": 0.7703,
"step": 917
},
{
"epoch": 0.5370973050060336,
"grad_norm": 0.043028959290976825,
"learning_rate": 6.7927615569401815e-06,
"loss": 0.8212,
"step": 918
},
{
"epoch": 0.5376823783230336,
"grad_norm": 0.03804962859282499,
"learning_rate": 6.790288322448235e-06,
"loss": 0.8366,
"step": 919
},
{
"epoch": 0.5382674516400336,
"grad_norm": 0.039087833943377275,
"learning_rate": 6.787813068632843e-06,
"loss": 0.7831,
"step": 920
},
{
"epoch": 0.5388525249570337,
"grad_norm": 0.04141823524439823,
"learning_rate": 6.785335797585107e-06,
"loss": 0.8828,
"step": 921
},
{
"epoch": 0.5394375982740337,
"grad_norm": 0.05007681264530146,
"learning_rate": 6.782856511397835e-06,
"loss": 0.8005,
"step": 922
},
{
"epoch": 0.5400226715910338,
"grad_norm": 0.0358413269090128,
"learning_rate": 6.780375212165535e-06,
"loss": 0.8488,
"step": 923
},
{
"epoch": 0.5406077449080338,
"grad_norm": 0.039662947137159635,
"learning_rate": 6.777891901984417e-06,
"loss": 0.8269,
"step": 924
},
{
"epoch": 0.5411928182250338,
"grad_norm": 0.03708918191821332,
"learning_rate": 6.775406582952389e-06,
"loss": 0.8561,
"step": 925
},
{
"epoch": 0.5417778915420338,
"grad_norm": 0.0512337126087228,
"learning_rate": 6.772919257169059e-06,
"loss": 0.8225,
"step": 926
},
{
"epoch": 0.5423629648590339,
"grad_norm": 0.15259566476683822,
"learning_rate": 6.770429926735727e-06,
"loss": 0.7892,
"step": 927
},
{
"epoch": 0.542948038176034,
"grad_norm": 0.036454692983525445,
"learning_rate": 6.767938593755386e-06,
"loss": 0.87,
"step": 928
},
{
"epoch": 0.543533111493034,
"grad_norm": 0.08503045559657693,
"learning_rate": 6.765445260332723e-06,
"loss": 0.878,
"step": 929
},
{
"epoch": 0.544118184810034,
"grad_norm": 0.04291403088850566,
"learning_rate": 6.7629499285741155e-06,
"loss": 0.8633,
"step": 930
},
{
"epoch": 0.544703258127034,
"grad_norm": 0.04417233325326183,
"learning_rate": 6.7604526005876265e-06,
"loss": 0.7777,
"step": 931
},
{
"epoch": 0.5452883314440341,
"grad_norm": 0.04685514307801011,
"learning_rate": 6.7579532784830075e-06,
"loss": 0.8233,
"step": 932
},
{
"epoch": 0.5458734047610341,
"grad_norm": 0.04216821721923777,
"learning_rate": 6.755451964371696e-06,
"loss": 0.8055,
"step": 933
},
{
"epoch": 0.5464584780780342,
"grad_norm": 0.05899439548668003,
"learning_rate": 6.752948660366807e-06,
"loss": 0.7423,
"step": 934
},
{
"epoch": 0.5470435513950342,
"grad_norm": 0.039547439339394544,
"learning_rate": 6.750443368583141e-06,
"loss": 0.7959,
"step": 935
},
{
"epoch": 0.5476286247120342,
"grad_norm": 0.046082967838638036,
"learning_rate": 6.747936091137179e-06,
"loss": 0.7691,
"step": 936
},
{
"epoch": 0.5482136980290343,
"grad_norm": 0.04724571857592231,
"learning_rate": 6.745426830147074e-06,
"loss": 0.7716,
"step": 937
},
{
"epoch": 0.5487987713460343,
"grad_norm": 0.049126540248396626,
"learning_rate": 6.74291558773266e-06,
"loss": 0.8051,
"step": 938
},
{
"epoch": 0.5493838446630344,
"grad_norm": 0.038162530207834874,
"learning_rate": 6.740402366015442e-06,
"loss": 0.8182,
"step": 939
},
{
"epoch": 0.5499689179800343,
"grad_norm": 0.07362395009585133,
"learning_rate": 6.737887167118597e-06,
"loss": 0.8025,
"step": 940
},
{
"epoch": 0.5505539912970344,
"grad_norm": 0.03855880618683095,
"learning_rate": 6.735369993166977e-06,
"loss": 0.8257,
"step": 941
},
{
"epoch": 0.5511390646140344,
"grad_norm": 0.04403565508981498,
"learning_rate": 6.732850846287096e-06,
"loss": 0.7377,
"step": 942
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.039173991395047666,
"learning_rate": 6.730329728607137e-06,
"loss": 0.8063,
"step": 943
},
{
"epoch": 0.5523092112480346,
"grad_norm": 0.042531239336748856,
"learning_rate": 6.72780664225695e-06,
"loss": 0.8237,
"step": 944
},
{
"epoch": 0.5528942845650345,
"grad_norm": 0.039715480086954504,
"learning_rate": 6.725281589368046e-06,
"loss": 0.8229,
"step": 945
},
{
"epoch": 0.5534793578820346,
"grad_norm": 0.04910369460983757,
"learning_rate": 6.722754572073599e-06,
"loss": 0.8503,
"step": 946
},
{
"epoch": 0.5540644311990346,
"grad_norm": 0.03827200257320983,
"learning_rate": 6.720225592508439e-06,
"loss": 0.9069,
"step": 947
},
{
"epoch": 0.5546495045160347,
"grad_norm": 0.06511818499208381,
"learning_rate": 6.7176946528090585e-06,
"loss": 0.9065,
"step": 948
},
{
"epoch": 0.5552345778330346,
"grad_norm": 0.03694464014581456,
"learning_rate": 6.715161755113604e-06,
"loss": 0.8588,
"step": 949
},
{
"epoch": 0.5558196511500347,
"grad_norm": 0.03684150463907197,
"learning_rate": 6.712626901561876e-06,
"loss": 0.8272,
"step": 950
},
{
"epoch": 0.5564047244670348,
"grad_norm": 0.04096608730346328,
"learning_rate": 6.710090094295323e-06,
"loss": 0.9031,
"step": 951
},
{
"epoch": 0.5569897977840348,
"grad_norm": 0.04008860398814125,
"learning_rate": 6.707551335457054e-06,
"loss": 0.8452,
"step": 952
},
{
"epoch": 0.5575748711010349,
"grad_norm": 0.06233285275932101,
"learning_rate": 6.705010627191816e-06,
"loss": 0.8813,
"step": 953
},
{
"epoch": 0.5581599444180348,
"grad_norm": 0.04077134247799943,
"learning_rate": 6.7024679716460114e-06,
"loss": 0.8493,
"step": 954
},
{
"epoch": 0.5587450177350349,
"grad_norm": 0.046352641350363474,
"learning_rate": 6.699923370967682e-06,
"loss": 0.8309,
"step": 955
},
{
"epoch": 0.5593300910520349,
"grad_norm": 0.041407416461248854,
"learning_rate": 6.6973768273065145e-06,
"loss": 0.8149,
"step": 956
},
{
"epoch": 0.559915164369035,
"grad_norm": 0.03903915190219945,
"learning_rate": 6.694828342813839e-06,
"loss": 0.8669,
"step": 957
},
{
"epoch": 0.5605002376860351,
"grad_norm": 0.05659630453049191,
"learning_rate": 6.692277919642623e-06,
"loss": 0.8291,
"step": 958
},
{
"epoch": 0.561085311003035,
"grad_norm": 0.048419920107990906,
"learning_rate": 6.6897255599474705e-06,
"loss": 0.7891,
"step": 959
},
{
"epoch": 0.5616703843200351,
"grad_norm": 0.04526622141241147,
"learning_rate": 6.6871712658846255e-06,
"loss": 0.82,
"step": 960
},
{
"epoch": 0.5622554576370351,
"grad_norm": 0.05616992844910552,
"learning_rate": 6.684615039611963e-06,
"loss": 0.819,
"step": 961
},
{
"epoch": 0.5628405309540352,
"grad_norm": 0.05533671965109322,
"learning_rate": 6.682056883288993e-06,
"loss": 0.8278,
"step": 962
},
{
"epoch": 0.5634256042710352,
"grad_norm": 0.04723246972770138,
"learning_rate": 6.679496799076853e-06,
"loss": 0.8255,
"step": 963
},
{
"epoch": 0.5640106775880352,
"grad_norm": 0.04762545006180037,
"learning_rate": 6.67693478913831e-06,
"loss": 0.8867,
"step": 964
},
{
"epoch": 0.5645957509050353,
"grad_norm": 0.04368900157724721,
"learning_rate": 6.674370855637759e-06,
"loss": 0.8527,
"step": 965
},
{
"epoch": 0.5651808242220353,
"grad_norm": 0.04639910967539116,
"learning_rate": 6.671805000741221e-06,
"loss": 0.8147,
"step": 966
},
{
"epoch": 0.5657658975390354,
"grad_norm": 0.04122171566933528,
"learning_rate": 6.6692372266163365e-06,
"loss": 0.8176,
"step": 967
},
{
"epoch": 0.5663509708560354,
"grad_norm": 0.04160364438101482,
"learning_rate": 6.666667535432371e-06,
"loss": 0.8588,
"step": 968
},
{
"epoch": 0.5669360441730354,
"grad_norm": 0.04172707626334754,
"learning_rate": 6.664095929360207e-06,
"loss": 0.8315,
"step": 969
},
{
"epoch": 0.5675211174900354,
"grad_norm": 0.04646561250778188,
"learning_rate": 6.661522410572346e-06,
"loss": 0.7923,
"step": 970
},
{
"epoch": 0.5681061908070355,
"grad_norm": 0.04188298952966753,
"learning_rate": 6.658946981242906e-06,
"loss": 0.7711,
"step": 971
},
{
"epoch": 0.5686912641240356,
"grad_norm": 0.03908796787695422,
"learning_rate": 6.656369643547617e-06,
"loss": 0.8856,
"step": 972
},
{
"epoch": 0.5692763374410356,
"grad_norm": 0.042788844509774104,
"learning_rate": 6.653790399663823e-06,
"loss": 0.7808,
"step": 973
},
{
"epoch": 0.5698614107580356,
"grad_norm": 0.05201703823511508,
"learning_rate": 6.651209251770478e-06,
"loss": 0.8618,
"step": 974
},
{
"epoch": 0.5704464840750356,
"grad_norm": 0.04585801993937681,
"learning_rate": 6.648626202048144e-06,
"loss": 0.8373,
"step": 975
},
{
"epoch": 0.5710315573920357,
"grad_norm": 0.040661578495326846,
"learning_rate": 6.646041252678989e-06,
"loss": 0.7641,
"step": 976
},
{
"epoch": 0.5716166307090357,
"grad_norm": 0.035144726376477184,
"learning_rate": 6.643454405846788e-06,
"loss": 0.7272,
"step": 977
},
{
"epoch": 0.5722017040260358,
"grad_norm": 0.04251759434748344,
"learning_rate": 6.640865663736917e-06,
"loss": 0.8063,
"step": 978
},
{
"epoch": 0.5727867773430358,
"grad_norm": 0.037521500861554015,
"learning_rate": 6.638275028536356e-06,
"loss": 0.7797,
"step": 979
},
{
"epoch": 0.5733718506600358,
"grad_norm": 0.041622067012318506,
"learning_rate": 6.6356825024336784e-06,
"loss": 0.8679,
"step": 980
},
{
"epoch": 0.5739569239770359,
"grad_norm": 0.04290995619424604,
"learning_rate": 6.63308808761906e-06,
"loss": 0.8477,
"step": 981
},
{
"epoch": 0.5745419972940359,
"grad_norm": 0.0429619795472441,
"learning_rate": 6.630491786284273e-06,
"loss": 0.8145,
"step": 982
},
{
"epoch": 0.575127070611036,
"grad_norm": 0.03771407315818989,
"learning_rate": 6.6278936006226795e-06,
"loss": 0.796,
"step": 983
},
{
"epoch": 0.5757121439280359,
"grad_norm": 0.0720666677058357,
"learning_rate": 6.625293532829236e-06,
"loss": 0.7856,
"step": 984
},
{
"epoch": 0.576297217245036,
"grad_norm": 0.05035693118475441,
"learning_rate": 6.622691585100488e-06,
"loss": 0.8212,
"step": 985
},
{
"epoch": 0.5768822905620361,
"grad_norm": 0.0661002129965687,
"learning_rate": 6.620087759634569e-06,
"loss": 0.8248,
"step": 986
},
{
"epoch": 0.5774673638790361,
"grad_norm": 0.043048564438655845,
"learning_rate": 6.617482058631201e-06,
"loss": 0.7865,
"step": 987
},
{
"epoch": 0.5780524371960362,
"grad_norm": 0.04716626817210766,
"learning_rate": 6.614874484291688e-06,
"loss": 0.7806,
"step": 988
},
{
"epoch": 0.5786375105130361,
"grad_norm": 0.043537127489980836,
"learning_rate": 6.612265038818915e-06,
"loss": 0.8248,
"step": 989
},
{
"epoch": 0.5792225838300362,
"grad_norm": 0.036528767071679374,
"learning_rate": 6.609653724417354e-06,
"loss": 0.8464,
"step": 990
},
{
"epoch": 0.5798076571470362,
"grad_norm": 0.041985098940740775,
"learning_rate": 6.6070405432930495e-06,
"loss": 0.8371,
"step": 991
},
{
"epoch": 0.5803927304640363,
"grad_norm": 0.037661629039364965,
"learning_rate": 6.604425497653627e-06,
"loss": 0.8133,
"step": 992
},
{
"epoch": 0.5809778037810364,
"grad_norm": 0.04709785126617865,
"learning_rate": 6.6018085897082845e-06,
"loss": 0.7926,
"step": 993
},
{
"epoch": 0.5815628770980363,
"grad_norm": 0.04920382998315035,
"learning_rate": 6.5991898216677945e-06,
"loss": 0.8511,
"step": 994
},
{
"epoch": 0.5821479504150364,
"grad_norm": 0.0384231269239162,
"learning_rate": 6.596569195744502e-06,
"loss": 0.8767,
"step": 995
},
{
"epoch": 0.5827330237320364,
"grad_norm": 0.0469334210563224,
"learning_rate": 6.59394671415232e-06,
"loss": 0.8445,
"step": 996
},
{
"epoch": 0.5833180970490365,
"grad_norm": 0.07961405813390293,
"learning_rate": 6.591322379106728e-06,
"loss": 0.7951,
"step": 997
},
{
"epoch": 0.5839031703660365,
"grad_norm": 0.039593707752364676,
"learning_rate": 6.588696192824775e-06,
"loss": 0.8325,
"step": 998
},
{
"epoch": 0.5844882436830365,
"grad_norm": 0.04184294201712494,
"learning_rate": 6.5860681575250706e-06,
"loss": 0.8599,
"step": 999
},
{
"epoch": 0.5850733170000366,
"grad_norm": 0.10237775503964221,
"learning_rate": 6.5834382754277885e-06,
"loss": 0.8159,
"step": 1000
},
{
"epoch": 0.5856583903170366,
"grad_norm": 0.03881481319706578,
"learning_rate": 6.580806548754661e-06,
"loss": 0.7969,
"step": 1001
},
{
"epoch": 0.5862434636340367,
"grad_norm": 0.043457660890875695,
"learning_rate": 6.578172979728979e-06,
"loss": 0.8012,
"step": 1002
},
{
"epoch": 0.5868285369510367,
"grad_norm": 0.037934756068100706,
"learning_rate": 6.5755375705755924e-06,
"loss": 0.8248,
"step": 1003
},
{
"epoch": 0.5874136102680367,
"grad_norm": 0.12610177824785532,
"learning_rate": 6.572900323520901e-06,
"loss": 0.7342,
"step": 1004
},
{
"epoch": 0.5879986835850367,
"grad_norm": 0.03661390969286209,
"learning_rate": 6.570261240792861e-06,
"loss": 0.7682,
"step": 1005
},
{
"epoch": 0.5885837569020368,
"grad_norm": 0.06798106192365762,
"learning_rate": 6.5676203246209785e-06,
"loss": 0.793,
"step": 1006
},
{
"epoch": 0.5891688302190369,
"grad_norm": 0.05106297642369706,
"learning_rate": 6.564977577236309e-06,
"loss": 0.881,
"step": 1007
},
{
"epoch": 0.5897539035360368,
"grad_norm": 0.04384540012407079,
"learning_rate": 6.5623330008714505e-06,
"loss": 0.7453,
"step": 1008
},
{
"epoch": 0.5903389768530369,
"grad_norm": 0.10758927483483961,
"learning_rate": 6.559686597760555e-06,
"loss": 0.8367,
"step": 1009
},
{
"epoch": 0.5909240501700369,
"grad_norm": 0.04079443571028819,
"learning_rate": 6.557038370139307e-06,
"loss": 0.8972,
"step": 1010
},
{
"epoch": 0.591509123487037,
"grad_norm": 0.04314093124710063,
"learning_rate": 6.554388320244943e-06,
"loss": 0.8316,
"step": 1011
},
{
"epoch": 0.592094196804037,
"grad_norm": 0.037939172015033606,
"learning_rate": 6.5517364503162315e-06,
"loss": 0.8587,
"step": 1012
},
{
"epoch": 0.592679270121037,
"grad_norm": 0.08510073848850286,
"learning_rate": 6.549082762593481e-06,
"loss": 0.8269,
"step": 1013
},
{
"epoch": 0.5932643434380371,
"grad_norm": 0.04708547345385738,
"learning_rate": 6.546427259318535e-06,
"loss": 0.8102,
"step": 1014
},
{
"epoch": 0.5938494167550371,
"grad_norm": 0.053444431590032856,
"learning_rate": 6.543769942734772e-06,
"loss": 0.7774,
"step": 1015
},
{
"epoch": 0.5944344900720372,
"grad_norm": 0.04005959184610443,
"learning_rate": 6.541110815087104e-06,
"loss": 0.7808,
"step": 1016
},
{
"epoch": 0.5950195633890372,
"grad_norm": 0.05077795167298267,
"learning_rate": 6.538449878621966e-06,
"loss": 0.8758,
"step": 1017
},
{
"epoch": 0.5956046367060372,
"grad_norm": 0.03833355241397989,
"learning_rate": 6.535787135587331e-06,
"loss": 0.8331,
"step": 1018
},
{
"epoch": 0.5961897100230372,
"grad_norm": 0.053233926527690635,
"learning_rate": 6.533122588232689e-06,
"loss": 0.8339,
"step": 1019
},
{
"epoch": 0.5967747833400373,
"grad_norm": 0.044734695814882965,
"learning_rate": 6.530456238809062e-06,
"loss": 0.7773,
"step": 1020
},
{
"epoch": 0.5973598566570373,
"grad_norm": 0.05575094862913427,
"learning_rate": 6.527788089568987e-06,
"loss": 0.8303,
"step": 1021
},
{
"epoch": 0.5979449299740374,
"grad_norm": 0.04961953370443646,
"learning_rate": 6.525118142766527e-06,
"loss": 0.7392,
"step": 1022
},
{
"epoch": 0.5985300032910374,
"grad_norm": 0.09625684788536548,
"learning_rate": 6.522446400657264e-06,
"loss": 0.9054,
"step": 1023
},
{
"epoch": 0.5991150766080374,
"grad_norm": 0.05229897404990849,
"learning_rate": 6.519772865498291e-06,
"loss": 0.7961,
"step": 1024
},
{
"epoch": 0.5997001499250375,
"grad_norm": 0.043742861050743945,
"learning_rate": 6.51709753954822e-06,
"loss": 0.8543,
"step": 1025
},
{
"epoch": 0.6002852232420375,
"grad_norm": 0.04421257742872319,
"learning_rate": 6.514420425067179e-06,
"loss": 0.9387,
"step": 1026
},
{
"epoch": 0.6008702965590376,
"grad_norm": 0.05074707736420169,
"learning_rate": 6.511741524316798e-06,
"loss": 0.6896,
"step": 1027
},
{
"epoch": 0.6014553698760375,
"grad_norm": 0.038800133730654246,
"learning_rate": 6.509060839560223e-06,
"loss": 0.7429,
"step": 1028
},
{
"epoch": 0.6020404431930376,
"grad_norm": 0.0672245152896617,
"learning_rate": 6.506378373062107e-06,
"loss": 0.8246,
"step": 1029
},
{
"epoch": 0.6026255165100377,
"grad_norm": 0.04853749164209948,
"learning_rate": 6.503694127088604e-06,
"loss": 0.8006,
"step": 1030
},
{
"epoch": 0.6032105898270377,
"grad_norm": 0.0423510296147158,
"learning_rate": 6.501008103907376e-06,
"loss": 0.8068,
"step": 1031
},
{
"epoch": 0.6037956631440378,
"grad_norm": 0.04168402528442862,
"learning_rate": 6.498320305787583e-06,
"loss": 0.747,
"step": 1032
},
{
"epoch": 0.6043807364610377,
"grad_norm": 0.05536933839665471,
"learning_rate": 6.495630734999885e-06,
"loss": 0.8214,
"step": 1033
},
{
"epoch": 0.6049658097780378,
"grad_norm": 0.03706453349529763,
"learning_rate": 6.4929393938164425e-06,
"loss": 0.7507,
"step": 1034
},
{
"epoch": 0.6055508830950378,
"grad_norm": 0.03569665816245634,
"learning_rate": 6.490246284510907e-06,
"loss": 0.8255,
"step": 1035
},
{
"epoch": 0.6061359564120379,
"grad_norm": 0.04630173604608656,
"learning_rate": 6.487551409358428e-06,
"loss": 0.8046,
"step": 1036
},
{
"epoch": 0.606721029729038,
"grad_norm": 0.04723153392312322,
"learning_rate": 6.4848547706356444e-06,
"loss": 0.8256,
"step": 1037
},
{
"epoch": 0.6073061030460379,
"grad_norm": 0.03827183486478635,
"learning_rate": 6.482156370620683e-06,
"loss": 0.7563,
"step": 1038
},
{
"epoch": 0.607891176363038,
"grad_norm": 0.0388015622476578,
"learning_rate": 6.479456211593165e-06,
"loss": 0.7176,
"step": 1039
},
{
"epoch": 0.608476249680038,
"grad_norm": 0.03999779366433317,
"learning_rate": 6.476754295834191e-06,
"loss": 0.8224,
"step": 1040
},
{
"epoch": 0.6090613229970381,
"grad_norm": 0.039892311683218856,
"learning_rate": 6.47405062562635e-06,
"loss": 0.8236,
"step": 1041
},
{
"epoch": 0.609646396314038,
"grad_norm": 0.03973227428998665,
"learning_rate": 6.471345203253711e-06,
"loss": 0.8184,
"step": 1042
},
{
"epoch": 0.6102314696310381,
"grad_norm": 0.04150106217436074,
"learning_rate": 6.468638031001823e-06,
"loss": 0.8804,
"step": 1043
},
{
"epoch": 0.6108165429480382,
"grad_norm": 0.04239329779716534,
"learning_rate": 6.465929111157714e-06,
"loss": 0.7935,
"step": 1044
},
{
"epoch": 0.6114016162650382,
"grad_norm": 0.05192716135637802,
"learning_rate": 6.463218446009888e-06,
"loss": 0.8526,
"step": 1045
},
{
"epoch": 0.6119866895820383,
"grad_norm": 0.04095824415870296,
"learning_rate": 6.4605060378483255e-06,
"loss": 0.901,
"step": 1046
},
{
"epoch": 0.6125717628990383,
"grad_norm": 0.03546640975061806,
"learning_rate": 6.457791888964478e-06,
"loss": 0.7811,
"step": 1047
},
{
"epoch": 0.6131568362160383,
"grad_norm": 0.040296634693903335,
"learning_rate": 6.455076001651265e-06,
"loss": 0.7403,
"step": 1048
},
{
"epoch": 0.6137419095330383,
"grad_norm": 0.039361319712739894,
"learning_rate": 6.452358378203079e-06,
"loss": 0.8359,
"step": 1049
},
{
"epoch": 0.6143269828500384,
"grad_norm": 0.04449717698536887,
"learning_rate": 6.449639020915777e-06,
"loss": 0.7877,
"step": 1050
},
{
"epoch": 0.6149120561670385,
"grad_norm": 0.05075898809213348,
"learning_rate": 6.446917932086681e-06,
"loss": 0.7867,
"step": 1051
},
{
"epoch": 0.6154971294840385,
"grad_norm": 0.05427508854998823,
"learning_rate": 6.444195114014573e-06,
"loss": 0.9079,
"step": 1052
},
{
"epoch": 0.6160822028010385,
"grad_norm": 0.039038596627602784,
"learning_rate": 6.441470568999704e-06,
"loss": 0.8449,
"step": 1053
},
{
"epoch": 0.6166672761180385,
"grad_norm": 0.05495139920274083,
"learning_rate": 6.438744299343774e-06,
"loss": 0.8195,
"step": 1054
},
{
"epoch": 0.6172523494350386,
"grad_norm": 0.04036928882191201,
"learning_rate": 6.436016307349947e-06,
"loss": 0.9269,
"step": 1055
},
{
"epoch": 0.6178374227520386,
"grad_norm": 0.03982823893586514,
"learning_rate": 6.4332865953228395e-06,
"loss": 0.7358,
"step": 1056
},
{
"epoch": 0.6184224960690387,
"grad_norm": 0.06481474497909782,
"learning_rate": 6.430555165568521e-06,
"loss": 0.7267,
"step": 1057
},
{
"epoch": 0.6190075693860387,
"grad_norm": 0.05875784906821891,
"learning_rate": 6.427822020394512e-06,
"loss": 0.8439,
"step": 1058
},
{
"epoch": 0.6195926427030387,
"grad_norm": 0.040087218682744265,
"learning_rate": 6.425087162109781e-06,
"loss": 0.7822,
"step": 1059
},
{
"epoch": 0.6201777160200388,
"grad_norm": 0.04225947062502592,
"learning_rate": 6.422350593024747e-06,
"loss": 0.8365,
"step": 1060
},
{
"epoch": 0.6207627893370388,
"grad_norm": 0.03505701993198131,
"learning_rate": 6.419612315451275e-06,
"loss": 0.7465,
"step": 1061
},
{
"epoch": 0.6213478626540389,
"grad_norm": 0.03617054781781428,
"learning_rate": 6.4168723317026655e-06,
"loss": 0.7628,
"step": 1062
},
{
"epoch": 0.6219329359710388,
"grad_norm": 0.04149790973744168,
"learning_rate": 6.414130644093669e-06,
"loss": 0.7846,
"step": 1063
},
{
"epoch": 0.6225180092880389,
"grad_norm": 0.08318622079767567,
"learning_rate": 6.411387254940473e-06,
"loss": 0.8865,
"step": 1064
},
{
"epoch": 0.623103082605039,
"grad_norm": 0.04061744209355914,
"learning_rate": 6.4086421665607e-06,
"loss": 0.7673,
"step": 1065
},
{
"epoch": 0.623688155922039,
"grad_norm": 0.04460594650043864,
"learning_rate": 6.405895381273411e-06,
"loss": 0.7735,
"step": 1066
},
{
"epoch": 0.624273229239039,
"grad_norm": 0.05550280064698044,
"learning_rate": 6.403146901399098e-06,
"loss": 0.8569,
"step": 1067
},
{
"epoch": 0.624858302556039,
"grad_norm": 0.04163827309254167,
"learning_rate": 6.400396729259685e-06,
"loss": 0.8951,
"step": 1068
},
{
"epoch": 0.6254433758730391,
"grad_norm": 0.1509844863070891,
"learning_rate": 6.39764486717853e-06,
"loss": 0.8082,
"step": 1069
},
{
"epoch": 0.6260284491900391,
"grad_norm": 0.04492131767454413,
"learning_rate": 6.394891317480412e-06,
"loss": 0.7518,
"step": 1070
},
{
"epoch": 0.6266135225070392,
"grad_norm": 0.0471963162082604,
"learning_rate": 6.39213608249154e-06,
"loss": 0.8127,
"step": 1071
},
{
"epoch": 0.6271985958240393,
"grad_norm": 0.038627758596647896,
"learning_rate": 6.389379164539545e-06,
"loss": 0.7781,
"step": 1072
},
{
"epoch": 0.6277836691410392,
"grad_norm": 0.04265598384978139,
"learning_rate": 6.386620565953482e-06,
"loss": 0.7698,
"step": 1073
},
{
"epoch": 0.6283687424580393,
"grad_norm": 0.040866964196317926,
"learning_rate": 6.383860289063821e-06,
"loss": 0.741,
"step": 1074
},
{
"epoch": 0.6289538157750393,
"grad_norm": 0.04488242542194653,
"learning_rate": 6.3810983362024575e-06,
"loss": 0.8618,
"step": 1075
},
{
"epoch": 0.6295388890920394,
"grad_norm": 0.04181264245748804,
"learning_rate": 6.3783347097026935e-06,
"loss": 0.8318,
"step": 1076
},
{
"epoch": 0.6301239624090393,
"grad_norm": 0.057019285327405245,
"learning_rate": 6.375569411899253e-06,
"loss": 0.822,
"step": 1077
},
{
"epoch": 0.6307090357260394,
"grad_norm": 0.04075452383662032,
"learning_rate": 6.3728024451282675e-06,
"loss": 0.7854,
"step": 1078
},
{
"epoch": 0.6312941090430395,
"grad_norm": 0.04369789370291459,
"learning_rate": 6.37003381172728e-06,
"loss": 0.7976,
"step": 1079
},
{
"epoch": 0.6318791823600395,
"grad_norm": 0.05849968444823343,
"learning_rate": 6.367263514035242e-06,
"loss": 0.8309,
"step": 1080
},
{
"epoch": 0.6324642556770396,
"grad_norm": 0.07398315736379607,
"learning_rate": 6.364491554392508e-06,
"loss": 0.8695,
"step": 1081
},
{
"epoch": 0.6330493289940395,
"grad_norm": 0.14149329637182187,
"learning_rate": 6.36171793514084e-06,
"loss": 0.9035,
"step": 1082
},
{
"epoch": 0.6336344023110396,
"grad_norm": 0.09431782156363094,
"learning_rate": 6.358942658623402e-06,
"loss": 0.7978,
"step": 1083
},
{
"epoch": 0.6342194756280396,
"grad_norm": 0.11444686544474032,
"learning_rate": 6.356165727184753e-06,
"loss": 0.8075,
"step": 1084
},
{
"epoch": 0.6348045489450397,
"grad_norm": 0.07683296171985984,
"learning_rate": 6.353387143170856e-06,
"loss": 0.7619,
"step": 1085
},
{
"epoch": 0.6353896222620398,
"grad_norm": 0.03611601798693106,
"learning_rate": 6.3506069089290705e-06,
"loss": 0.7886,
"step": 1086
},
{
"epoch": 0.6359746955790397,
"grad_norm": 0.055915745635355715,
"learning_rate": 6.3478250268081435e-06,
"loss": 0.8511,
"step": 1087
},
{
"epoch": 0.6365597688960398,
"grad_norm": 0.048663447635497695,
"learning_rate": 6.34504149915822e-06,
"loss": 0.9001,
"step": 1088
},
{
"epoch": 0.6371448422130398,
"grad_norm": 0.04226309960545497,
"learning_rate": 6.342256328330833e-06,
"loss": 0.7945,
"step": 1089
},
{
"epoch": 0.6377299155300399,
"grad_norm": 0.0425747978871558,
"learning_rate": 6.339469516678903e-06,
"loss": 0.7945,
"step": 1090
},
{
"epoch": 0.6383149888470399,
"grad_norm": 0.049540193290765706,
"learning_rate": 6.33668106655674e-06,
"loss": 0.8424,
"step": 1091
},
{
"epoch": 0.6389000621640399,
"grad_norm": 0.04361089778932384,
"learning_rate": 6.333890980320033e-06,
"loss": 0.8775,
"step": 1092
},
{
"epoch": 0.6394851354810399,
"grad_norm": 0.04952042923998371,
"learning_rate": 6.331099260325858e-06,
"loss": 0.7921,
"step": 1093
},
{
"epoch": 0.64007020879804,
"grad_norm": 0.08510241121137069,
"learning_rate": 6.32830590893267e-06,
"loss": 0.8449,
"step": 1094
},
{
"epoch": 0.6406552821150401,
"grad_norm": 0.052581634750892337,
"learning_rate": 6.325510928500298e-06,
"loss": 0.8504,
"step": 1095
},
{
"epoch": 0.64124035543204,
"grad_norm": 0.06971844661189186,
"learning_rate": 6.322714321389955e-06,
"loss": 0.8049,
"step": 1096
},
{
"epoch": 0.6418254287490401,
"grad_norm": 0.0425225993758099,
"learning_rate": 6.319916089964221e-06,
"loss": 0.7374,
"step": 1097
},
{
"epoch": 0.6424105020660401,
"grad_norm": 0.04070882005149654,
"learning_rate": 6.317116236587052e-06,
"loss": 0.747,
"step": 1098
},
{
"epoch": 0.6429955753830402,
"grad_norm": 0.04748840753374928,
"learning_rate": 6.314314763623775e-06,
"loss": 0.7728,
"step": 1099
},
{
"epoch": 0.6435806487000402,
"grad_norm": 0.08783073493232298,
"learning_rate": 6.31151167344108e-06,
"loss": 0.8031,
"step": 1100
},
{
"epoch": 0.6441657220170403,
"grad_norm": 0.12757918242470023,
"learning_rate": 6.308706968407029e-06,
"loss": 0.8275,
"step": 1101
},
{
"epoch": 0.6447507953340403,
"grad_norm": 0.04265066246559783,
"learning_rate": 6.305900650891045e-06,
"loss": 0.8143,
"step": 1102
},
{
"epoch": 0.6453358686510403,
"grad_norm": 0.05532213512913742,
"learning_rate": 6.303092723263917e-06,
"loss": 0.7623,
"step": 1103
},
{
"epoch": 0.6459209419680404,
"grad_norm": 0.05575276568946241,
"learning_rate": 6.300283187897788e-06,
"loss": 0.7578,
"step": 1104
},
{
"epoch": 0.6465060152850404,
"grad_norm": 0.04004488402472709,
"learning_rate": 6.297472047166164e-06,
"loss": 0.8287,
"step": 1105
},
{
"epoch": 0.6470910886020405,
"grad_norm": 0.07040462606702534,
"learning_rate": 6.294659303443907e-06,
"loss": 0.802,
"step": 1106
},
{
"epoch": 0.6476761619190404,
"grad_norm": 0.04991605035702622,
"learning_rate": 6.291844959107231e-06,
"loss": 0.78,
"step": 1107
},
{
"epoch": 0.6482612352360405,
"grad_norm": 0.04810590450360042,
"learning_rate": 6.289029016533705e-06,
"loss": 0.7531,
"step": 1108
},
{
"epoch": 0.6488463085530406,
"grad_norm": 0.04521072775635856,
"learning_rate": 6.286211478102243e-06,
"loss": 0.773,
"step": 1109
},
{
"epoch": 0.6494313818700406,
"grad_norm": 0.04141710815776832,
"learning_rate": 6.283392346193114e-06,
"loss": 0.813,
"step": 1110
},
{
"epoch": 0.6500164551870407,
"grad_norm": 0.043835473627139195,
"learning_rate": 6.280571623187929e-06,
"loss": 0.793,
"step": 1111
},
{
"epoch": 0.6506015285040406,
"grad_norm": 0.05669397811371989,
"learning_rate": 6.277749311469643e-06,
"loss": 0.7317,
"step": 1112
},
{
"epoch": 0.6511866018210407,
"grad_norm": 0.08646975004658393,
"learning_rate": 6.274925413422558e-06,
"loss": 0.8447,
"step": 1113
},
{
"epoch": 0.6517716751380407,
"grad_norm": 0.039267068048581634,
"learning_rate": 6.272099931432308e-06,
"loss": 0.8171,
"step": 1114
},
{
"epoch": 0.6523567484550408,
"grad_norm": 0.04626551882560527,
"learning_rate": 6.2692728678858705e-06,
"loss": 0.7719,
"step": 1115
},
{
"epoch": 0.6529418217720409,
"grad_norm": 0.08786584886983898,
"learning_rate": 6.26644422517156e-06,
"loss": 0.7863,
"step": 1116
},
{
"epoch": 0.6535268950890408,
"grad_norm": 0.07658703901409003,
"learning_rate": 6.26361400567902e-06,
"loss": 0.8893,
"step": 1117
},
{
"epoch": 0.6541119684060409,
"grad_norm": 0.03799609243525315,
"learning_rate": 6.2607822117992326e-06,
"loss": 0.7575,
"step": 1118
},
{
"epoch": 0.6546970417230409,
"grad_norm": 0.04335037636508533,
"learning_rate": 6.257948845924505e-06,
"loss": 0.7564,
"step": 1119
},
{
"epoch": 0.655282115040041,
"grad_norm": 0.04285395891669155,
"learning_rate": 6.2551139104484755e-06,
"loss": 0.8482,
"step": 1120
},
{
"epoch": 0.6558671883570409,
"grad_norm": 0.03461985415221269,
"learning_rate": 6.252277407766103e-06,
"loss": 0.7411,
"step": 1121
},
{
"epoch": 0.656452261674041,
"grad_norm": 0.06081410937727396,
"learning_rate": 6.249439340273679e-06,
"loss": 0.8753,
"step": 1122
},
{
"epoch": 0.6570373349910411,
"grad_norm": 0.08982630298329526,
"learning_rate": 6.246599710368809e-06,
"loss": 0.7514,
"step": 1123
},
{
"epoch": 0.6576224083080411,
"grad_norm": 0.04564349868186195,
"learning_rate": 6.243758520450423e-06,
"loss": 0.7989,
"step": 1124
},
{
"epoch": 0.6582074816250412,
"grad_norm": 0.043054553052619716,
"learning_rate": 6.240915772918768e-06,
"loss": 0.7454,
"step": 1125
},
{
"epoch": 0.6587925549420411,
"grad_norm": 0.03677134636684232,
"learning_rate": 6.238071470175405e-06,
"loss": 0.8528,
"step": 1126
},
{
"epoch": 0.6593776282590412,
"grad_norm": 0.04172483135818754,
"learning_rate": 6.235225614623212e-06,
"loss": 0.812,
"step": 1127
},
{
"epoch": 0.6599627015760412,
"grad_norm": 0.05277877562040755,
"learning_rate": 6.232378208666376e-06,
"loss": 0.8283,
"step": 1128
},
{
"epoch": 0.6605477748930413,
"grad_norm": 0.04454598931616706,
"learning_rate": 6.229529254710396e-06,
"loss": 0.8537,
"step": 1129
},
{
"epoch": 0.6611328482100414,
"grad_norm": 0.047900592019452154,
"learning_rate": 6.226678755162076e-06,
"loss": 0.825,
"step": 1130
},
{
"epoch": 0.6617179215270413,
"grad_norm": 0.05963183612092722,
"learning_rate": 6.223826712429529e-06,
"loss": 0.8042,
"step": 1131
},
{
"epoch": 0.6623029948440414,
"grad_norm": 0.046383510242271296,
"learning_rate": 6.220973128922168e-06,
"loss": 0.865,
"step": 1132
},
{
"epoch": 0.6628880681610414,
"grad_norm": 0.04417415078522508,
"learning_rate": 6.218118007050713e-06,
"loss": 0.8235,
"step": 1133
},
{
"epoch": 0.6634731414780415,
"grad_norm": 0.04207639503924024,
"learning_rate": 6.215261349227178e-06,
"loss": 0.7858,
"step": 1134
},
{
"epoch": 0.6640582147950415,
"grad_norm": 0.04649797867374506,
"learning_rate": 6.212403157864878e-06,
"loss": 0.868,
"step": 1135
},
{
"epoch": 0.6646432881120415,
"grad_norm": 0.09407948533657494,
"learning_rate": 6.209543435378422e-06,
"loss": 0.8818,
"step": 1136
},
{
"epoch": 0.6652283614290416,
"grad_norm": 0.037478109364168094,
"learning_rate": 6.206682184183712e-06,
"loss": 0.812,
"step": 1137
},
{
"epoch": 0.6658134347460416,
"grad_norm": 0.05288676232169846,
"learning_rate": 6.203819406697945e-06,
"loss": 0.7548,
"step": 1138
},
{
"epoch": 0.6663985080630417,
"grad_norm": 0.04535893610318677,
"learning_rate": 6.200955105339603e-06,
"loss": 0.8772,
"step": 1139
},
{
"epoch": 0.6669835813800417,
"grad_norm": 0.04773693536693857,
"learning_rate": 6.198089282528456e-06,
"loss": 0.7763,
"step": 1140
},
{
"epoch": 0.6675686546970417,
"grad_norm": 0.0518520434245037,
"learning_rate": 6.195221940685563e-06,
"loss": 0.7668,
"step": 1141
},
{
"epoch": 0.6681537280140417,
"grad_norm": 0.05462892968699057,
"learning_rate": 6.192353082233263e-06,
"loss": 0.7096,
"step": 1142
},
{
"epoch": 0.6687388013310418,
"grad_norm": 0.04122645056649732,
"learning_rate": 6.189482709595177e-06,
"loss": 0.7839,
"step": 1143
},
{
"epoch": 0.6693238746480419,
"grad_norm": 0.058891294040750164,
"learning_rate": 6.186610825196204e-06,
"loss": 0.7504,
"step": 1144
},
{
"epoch": 0.6699089479650419,
"grad_norm": 0.05296752593762354,
"learning_rate": 6.183737431462524e-06,
"loss": 0.7591,
"step": 1145
},
{
"epoch": 0.6704940212820419,
"grad_norm": 0.04766421657837364,
"learning_rate": 6.180862530821588e-06,
"loss": 0.742,
"step": 1146
},
{
"epoch": 0.6710790945990419,
"grad_norm": 0.0915137633830507,
"learning_rate": 6.177986125702121e-06,
"loss": 0.8167,
"step": 1147
},
{
"epoch": 0.671664167916042,
"grad_norm": 0.04064619759463224,
"learning_rate": 6.17510821853412e-06,
"loss": 0.755,
"step": 1148
},
{
"epoch": 0.672249241233042,
"grad_norm": 0.03962465637676519,
"learning_rate": 6.17222881174885e-06,
"loss": 0.7952,
"step": 1149
},
{
"epoch": 0.6728343145500421,
"grad_norm": 0.0475379390885668,
"learning_rate": 6.169347907778846e-06,
"loss": 0.7889,
"step": 1150
},
{
"epoch": 0.6734193878670421,
"grad_norm": 0.04260205050833479,
"learning_rate": 6.166465509057902e-06,
"loss": 0.7094,
"step": 1151
},
{
"epoch": 0.6740044611840421,
"grad_norm": 0.0452503260334743,
"learning_rate": 6.163581618021079e-06,
"loss": 0.8137,
"step": 1152
},
{
"epoch": 0.6745895345010422,
"grad_norm": 0.049045892950316486,
"learning_rate": 6.1606962371046975e-06,
"loss": 0.7476,
"step": 1153
},
{
"epoch": 0.6751746078180422,
"grad_norm": 0.1839546472224546,
"learning_rate": 6.157809368746337e-06,
"loss": 0.7341,
"step": 1154
},
{
"epoch": 0.6757596811350423,
"grad_norm": 0.04737636667405579,
"learning_rate": 6.154921015384833e-06,
"loss": 0.8772,
"step": 1155
},
{
"epoch": 0.6763447544520422,
"grad_norm": 0.039827421277940374,
"learning_rate": 6.152031179460276e-06,
"loss": 0.8184,
"step": 1156
},
{
"epoch": 0.6769298277690423,
"grad_norm": 0.03980204662807788,
"learning_rate": 6.14913986341401e-06,
"loss": 0.8504,
"step": 1157
},
{
"epoch": 0.6775149010860424,
"grad_norm": 0.06377067252825656,
"learning_rate": 6.146247069688627e-06,
"loss": 0.8496,
"step": 1158
},
{
"epoch": 0.6780999744030424,
"grad_norm": 0.5555285996614837,
"learning_rate": 6.14335280072797e-06,
"loss": 0.8131,
"step": 1159
},
{
"epoch": 0.6786850477200425,
"grad_norm": 0.03909895393006111,
"learning_rate": 6.140457058977125e-06,
"loss": 0.7756,
"step": 1160
},
{
"epoch": 0.6792701210370424,
"grad_norm": 0.03969052355186135,
"learning_rate": 6.137559846882426e-06,
"loss": 0.8209,
"step": 1161
},
{
"epoch": 0.6798551943540425,
"grad_norm": 0.04103151559227765,
"learning_rate": 6.134661166891445e-06,
"loss": 0.8015,
"step": 1162
},
{
"epoch": 0.6804402676710425,
"grad_norm": 0.05136493470713414,
"learning_rate": 6.131761021453e-06,
"loss": 0.8234,
"step": 1163
},
{
"epoch": 0.6810253409880426,
"grad_norm": 0.04189728013078323,
"learning_rate": 6.128859413017141e-06,
"loss": 0.7964,
"step": 1164
},
{
"epoch": 0.6816104143050427,
"grad_norm": 0.04337964675631273,
"learning_rate": 6.1259563440351564e-06,
"loss": 0.8434,
"step": 1165
},
{
"epoch": 0.6821954876220426,
"grad_norm": 0.09451825900326687,
"learning_rate": 6.123051816959569e-06,
"loss": 0.8254,
"step": 1166
},
{
"epoch": 0.6827805609390427,
"grad_norm": 0.045248262569469115,
"learning_rate": 6.120145834244133e-06,
"loss": 0.7613,
"step": 1167
},
{
"epoch": 0.6833656342560427,
"grad_norm": 0.03837960459677211,
"learning_rate": 6.117238398343831e-06,
"loss": 0.7958,
"step": 1168
},
{
"epoch": 0.6839507075730428,
"grad_norm": 0.05242659218061473,
"learning_rate": 6.114329511714876e-06,
"loss": 0.8356,
"step": 1169
},
{
"epoch": 0.6845357808900427,
"grad_norm": 0.06860429447590187,
"learning_rate": 6.111419176814704e-06,
"loss": 0.802,
"step": 1170
},
{
"epoch": 0.6851208542070428,
"grad_norm": 0.04508456559483142,
"learning_rate": 6.108507396101975e-06,
"loss": 0.8444,
"step": 1171
},
{
"epoch": 0.6857059275240428,
"grad_norm": 0.06393026794280061,
"learning_rate": 6.105594172036572e-06,
"loss": 0.7585,
"step": 1172
},
{
"epoch": 0.6862910008410429,
"grad_norm": 0.05435654372858688,
"learning_rate": 6.102679507079597e-06,
"loss": 0.9379,
"step": 1173
},
{
"epoch": 0.686876074158043,
"grad_norm": 0.057231278163684694,
"learning_rate": 6.099763403693366e-06,
"loss": 0.8431,
"step": 1174
},
{
"epoch": 0.6874611474750429,
"grad_norm": 0.0414940103750233,
"learning_rate": 6.096845864341415e-06,
"loss": 0.8247,
"step": 1175
},
{
"epoch": 0.688046220792043,
"grad_norm": 0.03753965623304419,
"learning_rate": 6.09392689148849e-06,
"loss": 0.7384,
"step": 1176
},
{
"epoch": 0.688631294109043,
"grad_norm": 0.0445855037853274,
"learning_rate": 6.09100648760055e-06,
"loss": 0.8913,
"step": 1177
},
{
"epoch": 0.6892163674260431,
"grad_norm": 0.03616019636034817,
"learning_rate": 6.08808465514476e-06,
"loss": 0.7741,
"step": 1178
},
{
"epoch": 0.6898014407430431,
"grad_norm": 0.03958351633767278,
"learning_rate": 6.085161396589493e-06,
"loss": 0.6991,
"step": 1179
},
{
"epoch": 0.6903865140600431,
"grad_norm": 0.03958973007993701,
"learning_rate": 6.082236714404331e-06,
"loss": 0.8455,
"step": 1180
},
{
"epoch": 0.6909715873770432,
"grad_norm": 0.04165937145938639,
"learning_rate": 6.079310611060052e-06,
"loss": 0.8072,
"step": 1181
},
{
"epoch": 0.6915566606940432,
"grad_norm": 0.045951396419511376,
"learning_rate": 6.07638308902864e-06,
"loss": 0.8313,
"step": 1182
},
{
"epoch": 0.6921417340110433,
"grad_norm": 0.04752344885301804,
"learning_rate": 6.073454150783274e-06,
"loss": 0.8802,
"step": 1183
},
{
"epoch": 0.6927268073280433,
"grad_norm": 0.05117696052721477,
"learning_rate": 6.070523798798329e-06,
"loss": 0.9472,
"step": 1184
},
{
"epoch": 0.6933118806450433,
"grad_norm": 0.040955259256951086,
"learning_rate": 6.06759203554938e-06,
"loss": 0.8126,
"step": 1185
},
{
"epoch": 0.6938969539620433,
"grad_norm": 0.034916342462867116,
"learning_rate": 6.064658863513186e-06,
"loss": 0.7091,
"step": 1186
},
{
"epoch": 0.6944820272790434,
"grad_norm": 0.042140933363091035,
"learning_rate": 6.061724285167704e-06,
"loss": 0.8323,
"step": 1187
},
{
"epoch": 0.6950671005960435,
"grad_norm": 0.04413794065710716,
"learning_rate": 6.058788302992072e-06,
"loss": 0.8419,
"step": 1188
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.042514536823805356,
"learning_rate": 6.055850919466621e-06,
"loss": 0.8863,
"step": 1189
},
{
"epoch": 0.6962372472300435,
"grad_norm": 0.04885967786149231,
"learning_rate": 6.05291213707286e-06,
"loss": 0.7658,
"step": 1190
},
{
"epoch": 0.6968223205470435,
"grad_norm": 0.04978541890148103,
"learning_rate": 6.0499719582934815e-06,
"loss": 0.7496,
"step": 1191
},
{
"epoch": 0.6974073938640436,
"grad_norm": 0.05231245753746865,
"learning_rate": 6.047030385612362e-06,
"loss": 0.784,
"step": 1192
},
{
"epoch": 0.6979924671810436,
"grad_norm": 0.04028385347084005,
"learning_rate": 6.0440874215145465e-06,
"loss": 0.6913,
"step": 1193
},
{
"epoch": 0.6985775404980437,
"grad_norm": 0.062297108276536735,
"learning_rate": 6.041143068486264e-06,
"loss": 0.7337,
"step": 1194
},
{
"epoch": 0.6991626138150437,
"grad_norm": 0.03928043528877265,
"learning_rate": 6.038197329014914e-06,
"loss": 0.8415,
"step": 1195
},
{
"epoch": 0.6997476871320437,
"grad_norm": 0.046537841954999465,
"learning_rate": 6.035250205589064e-06,
"loss": 0.7873,
"step": 1196
},
{
"epoch": 0.7003327604490438,
"grad_norm": 0.042222747639775425,
"learning_rate": 6.032301700698458e-06,
"loss": 0.7629,
"step": 1197
},
{
"epoch": 0.7009178337660438,
"grad_norm": 0.05930943404435324,
"learning_rate": 6.029351816833998e-06,
"loss": 0.8273,
"step": 1198
},
{
"epoch": 0.7015029070830439,
"grad_norm": 0.05874167359504905,
"learning_rate": 6.026400556487758e-06,
"loss": 0.7159,
"step": 1199
},
{
"epoch": 0.7020879804000438,
"grad_norm": 0.052410757851244154,
"learning_rate": 6.023447922152972e-06,
"loss": 0.7887,
"step": 1200
},
{
"epoch": 0.7026730537170439,
"grad_norm": 0.03785761302792468,
"learning_rate": 6.020493916324037e-06,
"loss": 0.7942,
"step": 1201
},
{
"epoch": 0.703258127034044,
"grad_norm": 0.0392431216621593,
"learning_rate": 6.017538541496503e-06,
"loss": 0.7885,
"step": 1202
},
{
"epoch": 0.703843200351044,
"grad_norm": 0.08892449717880838,
"learning_rate": 6.014581800167085e-06,
"loss": 0.8001,
"step": 1203
},
{
"epoch": 0.7044282736680441,
"grad_norm": 0.06747014549686459,
"learning_rate": 6.011623694833644e-06,
"loss": 0.727,
"step": 1204
},
{
"epoch": 0.705013346985044,
"grad_norm": 0.04543757611973289,
"learning_rate": 6.008664227995198e-06,
"loss": 0.8129,
"step": 1205
},
{
"epoch": 0.7055984203020441,
"grad_norm": 0.039104300131473785,
"learning_rate": 6.005703402151916e-06,
"loss": 0.8098,
"step": 1206
},
{
"epoch": 0.7061834936190441,
"grad_norm": 0.038265510416551914,
"learning_rate": 6.0027412198051114e-06,
"loss": 0.8279,
"step": 1207
},
{
"epoch": 0.7067685669360442,
"grad_norm": 0.03588239116909633,
"learning_rate": 5.999777683457247e-06,
"loss": 0.8096,
"step": 1208
},
{
"epoch": 0.7073536402530443,
"grad_norm": 0.03971284808692058,
"learning_rate": 5.996812795611928e-06,
"loss": 0.8331,
"step": 1209
},
{
"epoch": 0.7079387135700442,
"grad_norm": 0.03963917599976568,
"learning_rate": 5.9938465587739e-06,
"loss": 0.7908,
"step": 1210
},
{
"epoch": 0.7085237868870443,
"grad_norm": 0.045273875457725724,
"learning_rate": 5.990878975449051e-06,
"loss": 0.8111,
"step": 1211
},
{
"epoch": 0.7091088602040443,
"grad_norm": 0.04337467569823838,
"learning_rate": 5.9879100481444055e-06,
"loss": 0.8222,
"step": 1212
},
{
"epoch": 0.7096939335210444,
"grad_norm": 0.039603085337812066,
"learning_rate": 5.984939779368122e-06,
"loss": 0.7721,
"step": 1213
},
{
"epoch": 0.7102790068380443,
"grad_norm": 0.04156772105291293,
"learning_rate": 5.981968171629494e-06,
"loss": 0.8197,
"step": 1214
},
{
"epoch": 0.7108640801550444,
"grad_norm": 0.04856521644044076,
"learning_rate": 5.978995227438944e-06,
"loss": 0.8677,
"step": 1215
},
{
"epoch": 0.7114491534720445,
"grad_norm": 0.03956445076057253,
"learning_rate": 5.976020949308027e-06,
"loss": 0.7877,
"step": 1216
},
{
"epoch": 0.7120342267890445,
"grad_norm": 0.04862552856496891,
"learning_rate": 5.973045339749422e-06,
"loss": 0.8109,
"step": 1217
},
{
"epoch": 0.7126193001060446,
"grad_norm": 0.045295501112575506,
"learning_rate": 5.970068401276935e-06,
"loss": 0.791,
"step": 1218
},
{
"epoch": 0.7132043734230445,
"grad_norm": 0.03916519269752596,
"learning_rate": 5.967090136405491e-06,
"loss": 0.8131,
"step": 1219
},
{
"epoch": 0.7137894467400446,
"grad_norm": 0.038383406339616045,
"learning_rate": 5.96411054765114e-06,
"loss": 0.8102,
"step": 1220
},
{
"epoch": 0.7143745200570446,
"grad_norm": 0.06824384503876316,
"learning_rate": 5.961129637531047e-06,
"loss": 0.8104,
"step": 1221
},
{
"epoch": 0.7149595933740447,
"grad_norm": 0.07316382930858244,
"learning_rate": 5.958147408563497e-06,
"loss": 0.7862,
"step": 1222
},
{
"epoch": 0.7155446666910448,
"grad_norm": 0.04737080928686243,
"learning_rate": 5.9551638632678835e-06,
"loss": 0.8513,
"step": 1223
},
{
"epoch": 0.7161297400080447,
"grad_norm": 0.0374937969952467,
"learning_rate": 5.952179004164718e-06,
"loss": 0.8002,
"step": 1224
},
{
"epoch": 0.7167148133250448,
"grad_norm": 0.04053689173262939,
"learning_rate": 5.949192833775618e-06,
"loss": 0.7657,
"step": 1225
},
{
"epoch": 0.7172998866420448,
"grad_norm": 0.0398778756693423,
"learning_rate": 5.946205354623312e-06,
"loss": 0.7725,
"step": 1226
},
{
"epoch": 0.7178849599590449,
"grad_norm": 0.04506339056501446,
"learning_rate": 5.943216569231629e-06,
"loss": 0.7931,
"step": 1227
},
{
"epoch": 0.7184700332760449,
"grad_norm": 0.04982830890931998,
"learning_rate": 5.940226480125508e-06,
"loss": 0.8265,
"step": 1228
},
{
"epoch": 0.719055106593045,
"grad_norm": 0.0397312539469508,
"learning_rate": 5.937235089830984e-06,
"loss": 0.769,
"step": 1229
},
{
"epoch": 0.719640179910045,
"grad_norm": 0.03837231212563197,
"learning_rate": 5.934242400875195e-06,
"loss": 0.8259,
"step": 1230
},
{
"epoch": 0.720225253227045,
"grad_norm": 0.03612472247568365,
"learning_rate": 5.931248415786371e-06,
"loss": 0.7637,
"step": 1231
},
{
"epoch": 0.7208103265440451,
"grad_norm": 0.03537723098243959,
"learning_rate": 5.928253137093844e-06,
"loss": 0.738,
"step": 1232
},
{
"epoch": 0.7213953998610451,
"grad_norm": 0.0345658269866166,
"learning_rate": 5.925256567328036e-06,
"loss": 0.7422,
"step": 1233
},
{
"epoch": 0.7219804731780451,
"grad_norm": 0.041550218114552985,
"learning_rate": 5.922258709020456e-06,
"loss": 0.8158,
"step": 1234
},
{
"epoch": 0.7225655464950451,
"grad_norm": 0.04738915180347003,
"learning_rate": 5.919259564703705e-06,
"loss": 0.7693,
"step": 1235
},
{
"epoch": 0.7231506198120452,
"grad_norm": 0.22798373109574022,
"learning_rate": 5.916259136911472e-06,
"loss": 0.7964,
"step": 1236
},
{
"epoch": 0.7237356931290453,
"grad_norm": 0.03960961306940275,
"learning_rate": 5.913257428178526e-06,
"loss": 0.8529,
"step": 1237
},
{
"epoch": 0.7243207664460453,
"grad_norm": 0.03762982041843176,
"learning_rate": 5.910254441040723e-06,
"loss": 0.7978,
"step": 1238
},
{
"epoch": 0.7249058397630453,
"grad_norm": 0.03967585434913458,
"learning_rate": 5.907250178034994e-06,
"loss": 0.7715,
"step": 1239
},
{
"epoch": 0.7254909130800453,
"grad_norm": 0.035499686476307656,
"learning_rate": 5.904244641699352e-06,
"loss": 0.7821,
"step": 1240
},
{
"epoch": 0.7260759863970454,
"grad_norm": 0.04146813756606358,
"learning_rate": 5.9012378345728824e-06,
"loss": 0.7832,
"step": 1241
},
{
"epoch": 0.7266610597140454,
"grad_norm": 0.035835229174829814,
"learning_rate": 5.8982297591957465e-06,
"loss": 0.7951,
"step": 1242
},
{
"epoch": 0.7272461330310455,
"grad_norm": 0.040752453651612,
"learning_rate": 5.8952204181091775e-06,
"loss": 0.7898,
"step": 1243
},
{
"epoch": 0.7278312063480454,
"grad_norm": 0.04254122433979048,
"learning_rate": 5.8922098138554745e-06,
"loss": 0.8384,
"step": 1244
},
{
"epoch": 0.7284162796650455,
"grad_norm": 0.048324438293500575,
"learning_rate": 5.889197948978008e-06,
"loss": 0.7273,
"step": 1245
},
{
"epoch": 0.7290013529820456,
"grad_norm": 0.04876106635437417,
"learning_rate": 5.886184826021208e-06,
"loss": 0.7078,
"step": 1246
},
{
"epoch": 0.7295864262990456,
"grad_norm": 0.047909628199116906,
"learning_rate": 5.883170447530575e-06,
"loss": 0.7293,
"step": 1247
},
{
"epoch": 0.7301714996160457,
"grad_norm": 0.04759781973779755,
"learning_rate": 5.880154816052666e-06,
"loss": 0.8114,
"step": 1248
},
{
"epoch": 0.7307565729330456,
"grad_norm": 0.03579815989872979,
"learning_rate": 5.8771379341350905e-06,
"loss": 0.7843,
"step": 1249
},
{
"epoch": 0.7313416462500457,
"grad_norm": 0.048181696474966534,
"learning_rate": 5.874119804326525e-06,
"loss": 0.85,
"step": 1250
},
{
"epoch": 0.7319267195670457,
"grad_norm": 0.040629567336692485,
"learning_rate": 5.871100429176694e-06,
"loss": 0.8121,
"step": 1251
},
{
"epoch": 0.7325117928840458,
"grad_norm": 0.039483264479736525,
"learning_rate": 5.8680798112363784e-06,
"loss": 0.8365,
"step": 1252
},
{
"epoch": 0.7330968662010459,
"grad_norm": 0.05089919878146447,
"learning_rate": 5.865057953057401e-06,
"loss": 0.8076,
"step": 1253
},
{
"epoch": 0.7336819395180458,
"grad_norm": 0.035397442355980586,
"learning_rate": 5.862034857192642e-06,
"loss": 0.7798,
"step": 1254
},
{
"epoch": 0.7342670128350459,
"grad_norm": 0.03687328400855783,
"learning_rate": 5.859010526196021e-06,
"loss": 0.7859,
"step": 1255
},
{
"epoch": 0.7348520861520459,
"grad_norm": 0.06502490332457671,
"learning_rate": 5.855984962622504e-06,
"loss": 0.7927,
"step": 1256
},
{
"epoch": 0.735437159469046,
"grad_norm": 0.04157588146277028,
"learning_rate": 5.852958169028094e-06,
"loss": 0.7907,
"step": 1257
},
{
"epoch": 0.736022232786046,
"grad_norm": 0.04254658384010035,
"learning_rate": 5.849930147969839e-06,
"loss": 0.7903,
"step": 1258
},
{
"epoch": 0.736607306103046,
"grad_norm": 0.04207256800472794,
"learning_rate": 5.846900902005822e-06,
"loss": 0.7673,
"step": 1259
},
{
"epoch": 0.7371923794200461,
"grad_norm": 0.03928632402273387,
"learning_rate": 5.843870433695156e-06,
"loss": 0.7548,
"step": 1260
},
{
"epoch": 0.7377774527370461,
"grad_norm": 0.039954700127276935,
"learning_rate": 5.8408387455979946e-06,
"loss": 0.7986,
"step": 1261
},
{
"epoch": 0.7383625260540462,
"grad_norm": 0.03783285103775797,
"learning_rate": 5.837805840275515e-06,
"loss": 0.8107,
"step": 1262
},
{
"epoch": 0.7389475993710461,
"grad_norm": 0.040841957568584206,
"learning_rate": 5.834771720289929e-06,
"loss": 0.8005,
"step": 1263
},
{
"epoch": 0.7395326726880462,
"grad_norm": 0.044981013127008924,
"learning_rate": 5.831736388204467e-06,
"loss": 0.7682,
"step": 1264
},
{
"epoch": 0.7401177460050462,
"grad_norm": 0.038332859477558344,
"learning_rate": 5.828699846583389e-06,
"loss": 0.7548,
"step": 1265
},
{
"epoch": 0.7407028193220463,
"grad_norm": 0.04385966631432292,
"learning_rate": 5.825662097991978e-06,
"loss": 0.8131,
"step": 1266
},
{
"epoch": 0.7412878926390464,
"grad_norm": 0.038909792748055394,
"learning_rate": 5.82262314499653e-06,
"loss": 0.8741,
"step": 1267
},
{
"epoch": 0.7418729659560463,
"grad_norm": 0.05612505051259805,
"learning_rate": 5.8195829901643655e-06,
"loss": 0.8022,
"step": 1268
},
{
"epoch": 0.7424580392730464,
"grad_norm": 0.041230289913411614,
"learning_rate": 5.816541636063816e-06,
"loss": 0.7588,
"step": 1269
},
{
"epoch": 0.7430431125900464,
"grad_norm": 0.053472936644260796,
"learning_rate": 5.813499085264229e-06,
"loss": 0.733,
"step": 1270
},
{
"epoch": 0.7436281859070465,
"grad_norm": 0.05949960419609776,
"learning_rate": 5.8104553403359586e-06,
"loss": 0.8247,
"step": 1271
},
{
"epoch": 0.7442132592240465,
"grad_norm": 0.04317467554051099,
"learning_rate": 5.807410403850371e-06,
"loss": 0.8015,
"step": 1272
},
{
"epoch": 0.7447983325410465,
"grad_norm": 0.040008771003069384,
"learning_rate": 5.804364278379842e-06,
"loss": 0.7975,
"step": 1273
},
{
"epoch": 0.7453834058580466,
"grad_norm": 0.039926332581406196,
"learning_rate": 5.801316966497744e-06,
"loss": 0.7404,
"step": 1274
},
{
"epoch": 0.7459684791750466,
"grad_norm": 0.03922271741776474,
"learning_rate": 5.798268470778461e-06,
"loss": 0.8157,
"step": 1275
},
{
"epoch": 0.7465535524920467,
"grad_norm": 0.03804854554685112,
"learning_rate": 5.795218793797367e-06,
"loss": 0.6987,
"step": 1276
},
{
"epoch": 0.7471386258090467,
"grad_norm": 0.03876336199264868,
"learning_rate": 5.792167938130842e-06,
"loss": 0.8456,
"step": 1277
},
{
"epoch": 0.7477236991260467,
"grad_norm": 0.03771896812051095,
"learning_rate": 5.78911590635626e-06,
"loss": 0.726,
"step": 1278
},
{
"epoch": 0.7483087724430467,
"grad_norm": 0.036735705010982965,
"learning_rate": 5.786062701051983e-06,
"loss": 0.9274,
"step": 1279
},
{
"epoch": 0.7488938457600468,
"grad_norm": 0.03943361612412777,
"learning_rate": 5.783008324797375e-06,
"loss": 0.7146,
"step": 1280
},
{
"epoch": 0.7494789190770469,
"grad_norm": 0.03647462654976768,
"learning_rate": 5.779952780172777e-06,
"loss": 0.7589,
"step": 1281
},
{
"epoch": 0.7500639923940469,
"grad_norm": 0.03462458239120716,
"learning_rate": 5.776896069759528e-06,
"loss": 0.7397,
"step": 1282
},
{
"epoch": 0.750649065711047,
"grad_norm": 0.03495722370436699,
"learning_rate": 5.773838196139946e-06,
"loss": 0.6993,
"step": 1283
},
{
"epoch": 0.7512341390280469,
"grad_norm": 0.05321435758446983,
"learning_rate": 5.770779161897329e-06,
"loss": 0.8397,
"step": 1284
}
],
"logging_steps": 1,
"max_steps": 3418,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 428,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5326977607139328.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}