{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7920341394025604,
"eval_steps": 348,
"global_step": 1392,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005689900426742532,
"grad_norm": 0.922553300857544,
"learning_rate": 2e-05,
"loss": 1.7225,
"step": 1
},
{
"epoch": 0.0005689900426742532,
"eval_loss": 1.6560131311416626,
"eval_runtime": 17.2854,
"eval_samples_per_second": 42.811,
"eval_steps_per_second": 21.405,
"step": 1
},
{
"epoch": 0.0011379800853485065,
"grad_norm": 1.0872293710708618,
"learning_rate": 4e-05,
"loss": 1.7777,
"step": 2
},
{
"epoch": 0.0017069701280227596,
"grad_norm": 1.0032234191894531,
"learning_rate": 6e-05,
"loss": 1.6594,
"step": 3
},
{
"epoch": 0.002275960170697013,
"grad_norm": 0.9296952486038208,
"learning_rate": 8e-05,
"loss": 1.6329,
"step": 4
},
{
"epoch": 0.002844950213371266,
"grad_norm": 0.8549262881278992,
"learning_rate": 0.0001,
"loss": 1.6946,
"step": 5
},
{
"epoch": 0.0034139402560455193,
"grad_norm": 0.7175059914588928,
"learning_rate": 0.00012,
"loss": 1.605,
"step": 6
},
{
"epoch": 0.003982930298719772,
"grad_norm": 0.729087233543396,
"learning_rate": 0.00014,
"loss": 1.7539,
"step": 7
},
{
"epoch": 0.004551920341394026,
"grad_norm": 0.7559539675712585,
"learning_rate": 0.00016,
"loss": 1.7079,
"step": 8
},
{
"epoch": 0.005120910384068279,
"grad_norm": 0.9097371101379395,
"learning_rate": 0.00018,
"loss": 1.4693,
"step": 9
},
{
"epoch": 0.005689900426742532,
"grad_norm": 0.7562863230705261,
"learning_rate": 0.0002,
"loss": 1.7192,
"step": 10
},
{
"epoch": 0.006258890469416785,
"grad_norm": 0.8033550381660461,
"learning_rate": 0.00019999974162322295,
"loss": 1.6699,
"step": 11
},
{
"epoch": 0.0068278805120910386,
"grad_norm": 0.6270872950553894,
"learning_rate": 0.00019999896649422697,
"loss": 1.7042,
"step": 12
},
{
"epoch": 0.007396870554765292,
"grad_norm": 0.6003552079200745,
"learning_rate": 0.00019999767461701748,
"loss": 1.672,
"step": 13
},
{
"epoch": 0.007965860597439544,
"grad_norm": 0.5751997232437134,
"learning_rate": 0.00019999586599827042,
"loss": 1.5727,
"step": 14
},
{
"epoch": 0.008534850640113799,
"grad_norm": 0.5488961338996887,
"learning_rate": 0.00019999354064733184,
"loss": 1.6477,
"step": 15
},
{
"epoch": 0.009103840682788052,
"grad_norm": 0.4690549671649933,
"learning_rate": 0.00019999069857621807,
"loss": 1.4063,
"step": 16
},
{
"epoch": 0.009672830725462305,
"grad_norm": 0.5245763659477234,
"learning_rate": 0.00019998733979961563,
"loss": 1.649,
"step": 17
},
{
"epoch": 0.010241820768136558,
"grad_norm": 0.4962601661682129,
"learning_rate": 0.0001999834643348811,
"loss": 1.5558,
"step": 18
},
{
"epoch": 0.010810810810810811,
"grad_norm": 0.5009298324584961,
"learning_rate": 0.0001999790722020411,
"loss": 1.6178,
"step": 19
},
{
"epoch": 0.011379800853485065,
"grad_norm": 0.5524196028709412,
"learning_rate": 0.00019997416342379208,
"loss": 1.6133,
"step": 20
},
{
"epoch": 0.011948790896159318,
"grad_norm": 0.48095259070396423,
"learning_rate": 0.00019996873802550043,
"loss": 1.4158,
"step": 21
},
{
"epoch": 0.01251778093883357,
"grad_norm": 0.5575169324874878,
"learning_rate": 0.00019996279603520196,
"loss": 1.7057,
"step": 22
},
{
"epoch": 0.013086770981507824,
"grad_norm": 0.5423071384429932,
"learning_rate": 0.00019995633748360223,
"loss": 1.5661,
"step": 23
},
{
"epoch": 0.013655761024182077,
"grad_norm": 0.49561819434165955,
"learning_rate": 0.00019994936240407598,
"loss": 1.4119,
"step": 24
},
{
"epoch": 0.01422475106685633,
"grad_norm": 0.4862682521343231,
"learning_rate": 0.00019994187083266716,
"loss": 1.519,
"step": 25
},
{
"epoch": 0.014793741109530583,
"grad_norm": 0.5174720883369446,
"learning_rate": 0.0001999338628080888,
"loss": 1.3668,
"step": 26
},
{
"epoch": 0.015362731152204837,
"grad_norm": 0.5306721329689026,
"learning_rate": 0.0001999253383717226,
"loss": 1.6097,
"step": 27
},
{
"epoch": 0.015931721194879088,
"grad_norm": 0.5307742357254028,
"learning_rate": 0.00019991629756761886,
"loss": 1.7738,
"step": 28
},
{
"epoch": 0.016500711237553343,
"grad_norm": 0.6086705327033997,
"learning_rate": 0.00019990674044249634,
"loss": 1.7079,
"step": 29
},
{
"epoch": 0.017069701280227598,
"grad_norm": 0.5047173500061035,
"learning_rate": 0.00019989666704574175,
"loss": 1.6998,
"step": 30
},
{
"epoch": 0.01763869132290185,
"grad_norm": 0.5041013360023499,
"learning_rate": 0.00019988607742940978,
"loss": 1.7047,
"step": 31
},
{
"epoch": 0.018207681365576104,
"grad_norm": 0.4694116413593292,
"learning_rate": 0.00019987497164822263,
"loss": 1.3058,
"step": 32
},
{
"epoch": 0.018776671408250355,
"grad_norm": 0.5069786310195923,
"learning_rate": 0.0001998633497595698,
"loss": 1.6603,
"step": 33
},
{
"epoch": 0.01934566145092461,
"grad_norm": 0.4877070486545563,
"learning_rate": 0.0001998512118235078,
"loss": 1.5145,
"step": 34
},
{
"epoch": 0.01991465149359886,
"grad_norm": 0.5028818845748901,
"learning_rate": 0.0001998385579027599,
"loss": 1.5016,
"step": 35
},
{
"epoch": 0.020483641536273117,
"grad_norm": 0.4918319880962372,
"learning_rate": 0.00019982538806271566,
"loss": 1.5468,
"step": 36
},
{
"epoch": 0.021052631578947368,
"grad_norm": 0.5177620649337769,
"learning_rate": 0.00019981170237143067,
"loss": 1.5555,
"step": 37
},
{
"epoch": 0.021621621621621623,
"grad_norm": 0.49115803837776184,
"learning_rate": 0.00019979750089962629,
"loss": 1.592,
"step": 38
},
{
"epoch": 0.022190611664295874,
"grad_norm": 0.5621944069862366,
"learning_rate": 0.00019978278372068906,
"loss": 1.6697,
"step": 39
},
{
"epoch": 0.02275960170697013,
"grad_norm": 0.49260076880455017,
"learning_rate": 0.00019976755091067054,
"loss": 1.4688,
"step": 40
},
{
"epoch": 0.02332859174964438,
"grad_norm": 0.4910222589969635,
"learning_rate": 0.00019975180254828688,
"loss": 1.462,
"step": 41
},
{
"epoch": 0.023897581792318635,
"grad_norm": 0.5017576217651367,
"learning_rate": 0.0001997355387149182,
"loss": 1.6558,
"step": 42
},
{
"epoch": 0.024466571834992887,
"grad_norm": 0.5089415907859802,
"learning_rate": 0.00019971875949460852,
"loss": 1.6412,
"step": 43
},
{
"epoch": 0.02503556187766714,
"grad_norm": 0.4794662594795227,
"learning_rate": 0.00019970146497406505,
"loss": 1.6011,
"step": 44
},
{
"epoch": 0.025604551920341393,
"grad_norm": 0.5046934485435486,
"learning_rate": 0.00019968365524265777,
"loss": 1.6675,
"step": 45
},
{
"epoch": 0.026173541963015648,
"grad_norm": 0.4993690550327301,
"learning_rate": 0.0001996653303924192,
"loss": 1.6735,
"step": 46
},
{
"epoch": 0.0267425320056899,
"grad_norm": 0.48856502771377563,
"learning_rate": 0.00019964649051804355,
"loss": 1.5536,
"step": 47
},
{
"epoch": 0.027311522048364154,
"grad_norm": 0.4920005202293396,
"learning_rate": 0.0001996271357168866,
"loss": 1.6204,
"step": 48
},
{
"epoch": 0.027880512091038406,
"grad_norm": 0.5342410802841187,
"learning_rate": 0.00019960726608896502,
"loss": 1.719,
"step": 49
},
{
"epoch": 0.02844950213371266,
"grad_norm": 0.5041580200195312,
"learning_rate": 0.00019958688173695572,
"loss": 1.7053,
"step": 50
},
{
"epoch": 0.029018492176386912,
"grad_norm": 0.5237680077552795,
"learning_rate": 0.00019956598276619562,
"loss": 1.5091,
"step": 51
},
{
"epoch": 0.029587482219061167,
"grad_norm": 0.4911646246910095,
"learning_rate": 0.0001995445692846809,
"loss": 1.6085,
"step": 52
},
{
"epoch": 0.030156472261735418,
"grad_norm": 0.520005464553833,
"learning_rate": 0.00019952264140306645,
"loss": 1.4782,
"step": 53
},
{
"epoch": 0.030725462304409673,
"grad_norm": 0.49788954854011536,
"learning_rate": 0.0001995001992346654,
"loss": 1.4905,
"step": 54
},
{
"epoch": 0.031294452347083924,
"grad_norm": 0.5043379664421082,
"learning_rate": 0.00019947724289544845,
"loss": 1.6566,
"step": 55
},
{
"epoch": 0.031863442389758176,
"grad_norm": 0.5547715425491333,
"learning_rate": 0.00019945377250404328,
"loss": 1.7227,
"step": 56
},
{
"epoch": 0.032432432432432434,
"grad_norm": 0.5288915634155273,
"learning_rate": 0.000199429788181734,
"loss": 1.5921,
"step": 57
},
{
"epoch": 0.033001422475106686,
"grad_norm": 0.5353677868843079,
"learning_rate": 0.00019940529005246048,
"loss": 1.5371,
"step": 58
},
{
"epoch": 0.03357041251778094,
"grad_norm": 0.520143449306488,
"learning_rate": 0.00019938027824281757,
"loss": 1.6308,
"step": 59
},
{
"epoch": 0.034139402560455195,
"grad_norm": 0.50368732213974,
"learning_rate": 0.0001993547528820548,
"loss": 1.4645,
"step": 60
},
{
"epoch": 0.03470839260312945,
"grad_norm": 0.5326752066612244,
"learning_rate": 0.0001993287141020753,
"loss": 1.5832,
"step": 61
},
{
"epoch": 0.0352773826458037,
"grad_norm": 0.48568812012672424,
"learning_rate": 0.00019930216203743544,
"loss": 1.4137,
"step": 62
},
{
"epoch": 0.03584637268847795,
"grad_norm": 0.4832801818847656,
"learning_rate": 0.0001992750968253439,
"loss": 1.4713,
"step": 63
},
{
"epoch": 0.03641536273115221,
"grad_norm": 0.49059394001960754,
"learning_rate": 0.00019924751860566118,
"loss": 1.6009,
"step": 64
},
{
"epoch": 0.03698435277382646,
"grad_norm": 0.5292865633964539,
"learning_rate": 0.0001992194275208987,
"loss": 1.6339,
"step": 65
},
{
"epoch": 0.03755334281650071,
"grad_norm": 0.520621120929718,
"learning_rate": 0.00019919082371621811,
"loss": 1.7033,
"step": 66
},
{
"epoch": 0.03812233285917496,
"grad_norm": 0.5552493929862976,
"learning_rate": 0.0001991617073394306,
"loss": 1.5704,
"step": 67
},
{
"epoch": 0.03869132290184922,
"grad_norm": 0.5199451446533203,
"learning_rate": 0.0001991320785409961,
"loss": 1.6266,
"step": 68
},
{
"epoch": 0.03926031294452347,
"grad_norm": 0.540593147277832,
"learning_rate": 0.0001991019374740225,
"loss": 1.7327,
"step": 69
},
{
"epoch": 0.03982930298719772,
"grad_norm": 0.5305120348930359,
"learning_rate": 0.00019907128429426477,
"loss": 1.6544,
"step": 70
},
{
"epoch": 0.040398293029871975,
"grad_norm": 0.5247764587402344,
"learning_rate": 0.00019904011916012433,
"loss": 1.429,
"step": 71
},
{
"epoch": 0.04096728307254623,
"grad_norm": 0.500156819820404,
"learning_rate": 0.00019900844223264813,
"loss": 1.6106,
"step": 72
},
{
"epoch": 0.041536273115220484,
"grad_norm": 0.49794986844062805,
"learning_rate": 0.00019897625367552784,
"loss": 1.5322,
"step": 73
},
{
"epoch": 0.042105263157894736,
"grad_norm": 0.5475789308547974,
"learning_rate": 0.00019894355365509894,
"loss": 1.4882,
"step": 74
},
{
"epoch": 0.04267425320056899,
"grad_norm": 0.5272343158721924,
"learning_rate": 0.00019891034234033995,
"loss": 1.5119,
"step": 75
},
{
"epoch": 0.043243243243243246,
"grad_norm": 0.4892237186431885,
"learning_rate": 0.00019887661990287153,
"loss": 1.5567,
"step": 76
},
{
"epoch": 0.0438122332859175,
"grad_norm": 0.528414249420166,
"learning_rate": 0.00019884238651695556,
"loss": 1.7716,
"step": 77
},
{
"epoch": 0.04438122332859175,
"grad_norm": 0.5159140229225159,
"learning_rate": 0.00019880764235949427,
"loss": 1.6873,
"step": 78
},
{
"epoch": 0.044950213371266,
"grad_norm": 0.5157197713851929,
"learning_rate": 0.0001987723876100294,
"loss": 1.5196,
"step": 79
},
{
"epoch": 0.04551920341394026,
"grad_norm": 0.518205463886261,
"learning_rate": 0.00019873662245074102,
"loss": 1.5238,
"step": 80
},
{
"epoch": 0.04608819345661451,
"grad_norm": 0.5316376090049744,
"learning_rate": 0.00019870034706644693,
"loss": 1.4913,
"step": 81
},
{
"epoch": 0.04665718349928876,
"grad_norm": 0.5020834803581238,
"learning_rate": 0.00019866356164460145,
"loss": 1.4051,
"step": 82
},
{
"epoch": 0.04722617354196301,
"grad_norm": 0.4912559986114502,
"learning_rate": 0.00019862626637529455,
"loss": 1.4947,
"step": 83
},
{
"epoch": 0.04779516358463727,
"grad_norm": 0.5261936187744141,
"learning_rate": 0.00019858846145125086,
"loss": 1.659,
"step": 84
},
{
"epoch": 0.04836415362731152,
"grad_norm": 0.5002409815788269,
"learning_rate": 0.00019855014706782867,
"loss": 1.4743,
"step": 85
},
{
"epoch": 0.048933143669985774,
"grad_norm": 0.5293824672698975,
"learning_rate": 0.0001985113234230189,
"loss": 1.5796,
"step": 86
},
{
"epoch": 0.049502133712660025,
"grad_norm": 0.49084582924842834,
"learning_rate": 0.00019847199071744415,
"loss": 1.6052,
"step": 87
},
{
"epoch": 0.05007112375533428,
"grad_norm": 0.5251219868659973,
"learning_rate": 0.00019843214915435758,
"loss": 1.7684,
"step": 88
},
{
"epoch": 0.050640113798008535,
"grad_norm": 0.5003427267074585,
"learning_rate": 0.0001983917989396418,
"loss": 1.5715,
"step": 89
},
{
"epoch": 0.051209103840682786,
"grad_norm": 0.5283729434013367,
"learning_rate": 0.0001983509402818081,
"loss": 1.5396,
"step": 90
},
{
"epoch": 0.051778093883357044,
"grad_norm": 0.49652016162872314,
"learning_rate": 0.00019830957339199494,
"loss": 1.5353,
"step": 91
},
{
"epoch": 0.052347083926031296,
"grad_norm": 0.49297675490379333,
"learning_rate": 0.00019826769848396727,
"loss": 1.5012,
"step": 92
},
{
"epoch": 0.05291607396870555,
"grad_norm": 0.5100125670433044,
"learning_rate": 0.0001982253157741151,
"loss": 1.6194,
"step": 93
},
{
"epoch": 0.0534850640113798,
"grad_norm": 0.5218221545219421,
"learning_rate": 0.00019818242548145265,
"loss": 1.6505,
"step": 94
},
{
"epoch": 0.05405405405405406,
"grad_norm": 0.5490546226501465,
"learning_rate": 0.000198139027827617,
"loss": 1.498,
"step": 95
},
{
"epoch": 0.05462304409672831,
"grad_norm": 0.5228062868118286,
"learning_rate": 0.00019809512303686706,
"loss": 1.4592,
"step": 96
},
{
"epoch": 0.05519203413940256,
"grad_norm": 0.49827295541763306,
"learning_rate": 0.00019805071133608242,
"loss": 1.6593,
"step": 97
},
{
"epoch": 0.05576102418207681,
"grad_norm": 0.5081865191459656,
"learning_rate": 0.0001980057929547621,
"loss": 1.4226,
"step": 98
},
{
"epoch": 0.05633001422475107,
"grad_norm": 0.5018671751022339,
"learning_rate": 0.00019796036812502347,
"loss": 1.4995,
"step": 99
},
{
"epoch": 0.05689900426742532,
"grad_norm": 0.5807016491889954,
"learning_rate": 0.00019791443708160094,
"loss": 1.7405,
"step": 100
},
{
"epoch": 0.05746799431009957,
"grad_norm": 0.5095066428184509,
"learning_rate": 0.00019786800006184473,
"loss": 1.4908,
"step": 101
},
{
"epoch": 0.058036984352773824,
"grad_norm": 0.5552268028259277,
"learning_rate": 0.00019782105730571992,
"loss": 1.5289,
"step": 102
},
{
"epoch": 0.05860597439544808,
"grad_norm": 0.47026970982551575,
"learning_rate": 0.00019777360905580478,
"loss": 1.3497,
"step": 103
},
{
"epoch": 0.059174964438122334,
"grad_norm": 0.5475593209266663,
"learning_rate": 0.00019772565555728984,
"loss": 1.6329,
"step": 104
},
{
"epoch": 0.059743954480796585,
"grad_norm": 0.5217400789260864,
"learning_rate": 0.00019767719705797657,
"loss": 1.6181,
"step": 105
},
{
"epoch": 0.060312944523470836,
"grad_norm": 0.5143265128135681,
"learning_rate": 0.00019762823380827592,
"loss": 1.6369,
"step": 106
},
{
"epoch": 0.060881934566145095,
"grad_norm": 0.501568615436554,
"learning_rate": 0.0001975787660612072,
"loss": 1.6871,
"step": 107
},
{
"epoch": 0.061450924608819346,
"grad_norm": 0.47950610518455505,
"learning_rate": 0.00019752879407239685,
"loss": 1.4494,
"step": 108
},
{
"epoch": 0.0620199146514936,
"grad_norm": 0.5488466024398804,
"learning_rate": 0.0001974783181000768,
"loss": 1.6457,
"step": 109
},
{
"epoch": 0.06258890469416785,
"grad_norm": 0.5165080428123474,
"learning_rate": 0.0001974273384050835,
"loss": 1.5463,
"step": 110
},
{
"epoch": 0.06315789473684211,
"grad_norm": 0.5002058744430542,
"learning_rate": 0.0001973758552508563,
"loss": 1.4333,
"step": 111
},
{
"epoch": 0.06372688477951635,
"grad_norm": 0.4927598237991333,
"learning_rate": 0.00019732386890343624,
"loss": 1.5576,
"step": 112
},
{
"epoch": 0.06429587482219061,
"grad_norm": 0.5156055688858032,
"learning_rate": 0.0001972713796314646,
"loss": 1.4821,
"step": 113
},
{
"epoch": 0.06486486486486487,
"grad_norm": 0.5108924508094788,
"learning_rate": 0.0001972183877061816,
"loss": 1.502,
"step": 114
},
{
"epoch": 0.06543385490753911,
"grad_norm": 0.5052126049995422,
"learning_rate": 0.00019716489340142483,
"loss": 1.7285,
"step": 115
},
{
"epoch": 0.06600284495021337,
"grad_norm": 0.5034211874008179,
"learning_rate": 0.00019711089699362807,
"loss": 1.4148,
"step": 116
},
{
"epoch": 0.06657183499288763,
"grad_norm": 0.5284733772277832,
"learning_rate": 0.00019705639876181969,
"loss": 1.5979,
"step": 117
},
{
"epoch": 0.06714082503556187,
"grad_norm": 0.5434923768043518,
"learning_rate": 0.0001970013989876212,
"loss": 1.6856,
"step": 118
},
{
"epoch": 0.06770981507823613,
"grad_norm": 0.48895972967147827,
"learning_rate": 0.00019694589795524588,
"loss": 1.5305,
"step": 119
},
{
"epoch": 0.06827880512091039,
"grad_norm": 0.5481955409049988,
"learning_rate": 0.00019688989595149732,
"loss": 1.473,
"step": 120
},
{
"epoch": 0.06884779516358464,
"grad_norm": 0.47966116666793823,
"learning_rate": 0.00019683339326576781,
"loss": 1.1899,
"step": 121
},
{
"epoch": 0.0694167852062589,
"grad_norm": 0.5007337927818298,
"learning_rate": 0.00019677639019003706,
"loss": 1.4747,
"step": 122
},
{
"epoch": 0.06998577524893314,
"grad_norm": 0.5798030495643616,
"learning_rate": 0.00019671888701887046,
"loss": 1.5881,
"step": 123
},
{
"epoch": 0.0705547652916074,
"grad_norm": 0.5382363200187683,
"learning_rate": 0.0001966608840494177,
"loss": 1.6345,
"step": 124
},
{
"epoch": 0.07112375533428165,
"grad_norm": 0.5181685090065002,
"learning_rate": 0.00019660238158141112,
"loss": 1.48,
"step": 125
},
{
"epoch": 0.0716927453769559,
"grad_norm": 0.5349889993667603,
"learning_rate": 0.0001965433799171644,
"loss": 1.5679,
"step": 126
},
{
"epoch": 0.07226173541963016,
"grad_norm": 0.496991902589798,
"learning_rate": 0.00019648387936157068,
"loss": 1.5596,
"step": 127
},
{
"epoch": 0.07283072546230442,
"grad_norm": 0.5177836418151855,
"learning_rate": 0.0001964238802221012,
"loss": 1.3765,
"step": 128
},
{
"epoch": 0.07339971550497866,
"grad_norm": 0.5253962874412537,
"learning_rate": 0.00019636338280880366,
"loss": 1.7268,
"step": 129
},
{
"epoch": 0.07396870554765292,
"grad_norm": 0.5878409743309021,
"learning_rate": 0.00019630238743430058,
"loss": 1.5933,
"step": 130
},
{
"epoch": 0.07453769559032716,
"grad_norm": 0.5072840452194214,
"learning_rate": 0.00019624089441378775,
"loss": 1.3819,
"step": 131
},
{
"epoch": 0.07510668563300142,
"grad_norm": 0.5567812323570251,
"learning_rate": 0.0001961789040650325,
"loss": 1.5582,
"step": 132
},
{
"epoch": 0.07567567567567568,
"grad_norm": 0.48109254240989685,
"learning_rate": 0.00019611641670837219,
"loss": 1.4227,
"step": 133
},
{
"epoch": 0.07624466571834992,
"grad_norm": 0.5404167175292969,
"learning_rate": 0.00019605343266671245,
"loss": 1.6807,
"step": 134
},
{
"epoch": 0.07681365576102418,
"grad_norm": 0.47476792335510254,
"learning_rate": 0.00019598995226552556,
"loss": 1.3462,
"step": 135
},
{
"epoch": 0.07738264580369844,
"grad_norm": 0.4884220361709595,
"learning_rate": 0.0001959259758328487,
"loss": 1.5956,
"step": 136
},
{
"epoch": 0.07795163584637269,
"grad_norm": 0.5190904140472412,
"learning_rate": 0.00019586150369928245,
"loss": 1.6685,
"step": 137
},
{
"epoch": 0.07852062588904694,
"grad_norm": 0.513028621673584,
"learning_rate": 0.0001957965361979888,
"loss": 1.7023,
"step": 138
},
{
"epoch": 0.07908961593172119,
"grad_norm": 0.4926295578479767,
"learning_rate": 0.00019573107366468962,
"loss": 1.4606,
"step": 139
},
{
"epoch": 0.07965860597439545,
"grad_norm": 0.5009914636611938,
"learning_rate": 0.00019566511643766485,
"loss": 1.5636,
"step": 140
},
{
"epoch": 0.0802275960170697,
"grad_norm": 0.54355388879776,
"learning_rate": 0.00019559866485775084,
"loss": 1.681,
"step": 141
},
{
"epoch": 0.08079658605974395,
"grad_norm": 0.5059416890144348,
"learning_rate": 0.00019553171926833853,
"loss": 1.6193,
"step": 142
},
{
"epoch": 0.08136557610241821,
"grad_norm": 0.5309209227561951,
"learning_rate": 0.00019546428001537155,
"loss": 1.5552,
"step": 143
},
{
"epoch": 0.08193456614509247,
"grad_norm": 0.4913862943649292,
"learning_rate": 0.0001953963474473447,
"loss": 1.5506,
"step": 144
},
{
"epoch": 0.08250355618776671,
"grad_norm": 0.5331928133964539,
"learning_rate": 0.0001953279219153019,
"loss": 1.7152,
"step": 145
},
{
"epoch": 0.08307254623044097,
"grad_norm": 0.5169084072113037,
"learning_rate": 0.00019525900377283457,
"loss": 1.6177,
"step": 146
},
{
"epoch": 0.08364153627311523,
"grad_norm": 0.5159075856208801,
"learning_rate": 0.00019518959337607957,
"loss": 1.5652,
"step": 147
},
{
"epoch": 0.08421052631578947,
"grad_norm": 0.5606206655502319,
"learning_rate": 0.0001951196910837177,
"loss": 1.6821,
"step": 148
},
{
"epoch": 0.08477951635846373,
"grad_norm": 0.47890591621398926,
"learning_rate": 0.0001950492972569715,
"loss": 1.5041,
"step": 149
},
{
"epoch": 0.08534850640113797,
"grad_norm": 0.5077673196792603,
"learning_rate": 0.0001949784122596035,
"loss": 1.5837,
"step": 150
},
{
"epoch": 0.08591749644381223,
"grad_norm": 0.5021458268165588,
"learning_rate": 0.00019490703645791454,
"loss": 1.5813,
"step": 151
},
{
"epoch": 0.08648648648648649,
"grad_norm": 0.5000331997871399,
"learning_rate": 0.00019483517022074156,
"loss": 1.5686,
"step": 152
},
{
"epoch": 0.08705547652916074,
"grad_norm": 0.5121405124664307,
"learning_rate": 0.0001947628139194559,
"loss": 1.4329,
"step": 153
},
{
"epoch": 0.087624466571835,
"grad_norm": 0.5058543682098389,
"learning_rate": 0.00019468996792796137,
"loss": 1.36,
"step": 154
},
{
"epoch": 0.08819345661450925,
"grad_norm": 0.5810546875,
"learning_rate": 0.00019461663262269213,
"loss": 1.3764,
"step": 155
},
{
"epoch": 0.0887624466571835,
"grad_norm": 0.5015589594841003,
"learning_rate": 0.00019454280838261106,
"loss": 1.4966,
"step": 156
},
{
"epoch": 0.08933143669985776,
"grad_norm": 0.5284256339073181,
"learning_rate": 0.0001944684955892075,
"loss": 1.4944,
"step": 157
},
{
"epoch": 0.089900426742532,
"grad_norm": 0.49957889318466187,
"learning_rate": 0.0001943936946264955,
"loss": 1.4641,
"step": 158
},
{
"epoch": 0.09046941678520626,
"grad_norm": 0.5073912143707275,
"learning_rate": 0.00019431840588101157,
"loss": 1.3371,
"step": 159
},
{
"epoch": 0.09103840682788052,
"grad_norm": 0.5323196649551392,
"learning_rate": 0.00019424262974181313,
"loss": 1.5312,
"step": 160
},
{
"epoch": 0.09160739687055476,
"grad_norm": 0.5276457071304321,
"learning_rate": 0.00019416636660047595,
"loss": 1.64,
"step": 161
},
{
"epoch": 0.09217638691322902,
"grad_norm": 0.49499741196632385,
"learning_rate": 0.0001940896168510926,
"loss": 1.3689,
"step": 162
},
{
"epoch": 0.09274537695590328,
"grad_norm": 0.5169721245765686,
"learning_rate": 0.00019401238089027017,
"loss": 1.5352,
"step": 163
},
{
"epoch": 0.09331436699857752,
"grad_norm": 0.48859354853630066,
"learning_rate": 0.0001939346591171281,
"loss": 1.4584,
"step": 164
},
{
"epoch": 0.09388335704125178,
"grad_norm": 0.5150989890098572,
"learning_rate": 0.00019385645193329654,
"loss": 1.5178,
"step": 165
},
{
"epoch": 0.09445234708392602,
"grad_norm": 0.48626863956451416,
"learning_rate": 0.00019377775974291383,
"loss": 1.3689,
"step": 166
},
{
"epoch": 0.09502133712660028,
"grad_norm": 0.5352733731269836,
"learning_rate": 0.0001936985829526247,
"loss": 1.5953,
"step": 167
},
{
"epoch": 0.09559032716927454,
"grad_norm": 0.5061799883842468,
"learning_rate": 0.00019361892197157797,
"loss": 1.6339,
"step": 168
},
{
"epoch": 0.09615931721194879,
"grad_norm": 0.5095758438110352,
"learning_rate": 0.0001935387772114246,
"loss": 1.5116,
"step": 169
},
{
"epoch": 0.09672830725462304,
"grad_norm": 0.4948934316635132,
"learning_rate": 0.00019345814908631556,
"loss": 1.3963,
"step": 170
},
{
"epoch": 0.0972972972972973,
"grad_norm": 0.5632720589637756,
"learning_rate": 0.0001933770380128995,
"loss": 1.618,
"step": 171
},
{
"epoch": 0.09786628733997155,
"grad_norm": 0.5013827681541443,
"learning_rate": 0.00019329544441032076,
"loss": 1.4847,
"step": 172
},
{
"epoch": 0.0984352773826458,
"grad_norm": 0.512117326259613,
"learning_rate": 0.0001932133687002172,
"loss": 1.4346,
"step": 173
},
{
"epoch": 0.09900426742532005,
"grad_norm": 0.5385090708732605,
"learning_rate": 0.00019313081130671798,
"loss": 1.6694,
"step": 174
},
{
"epoch": 0.09957325746799431,
"grad_norm": 0.5616840720176697,
"learning_rate": 0.00019304777265644133,
"loss": 1.5638,
"step": 175
},
{
"epoch": 0.10014224751066857,
"grad_norm": 0.5222409963607788,
"learning_rate": 0.0001929642531784925,
"loss": 1.6203,
"step": 176
},
{
"epoch": 0.10071123755334281,
"grad_norm": 0.5733211040496826,
"learning_rate": 0.00019288025330446126,
"loss": 1.6952,
"step": 177
},
{
"epoch": 0.10128022759601707,
"grad_norm": 0.5625792741775513,
"learning_rate": 0.00019279577346842,
"loss": 1.6639,
"step": 178
},
{
"epoch": 0.10184921763869133,
"grad_norm": 0.5778010487556458,
"learning_rate": 0.0001927108141069213,
"loss": 1.5719,
"step": 179
},
{
"epoch": 0.10241820768136557,
"grad_norm": 0.5034694671630859,
"learning_rate": 0.00019262537565899564,
"loss": 1.4461,
"step": 180
},
{
"epoch": 0.10298719772403983,
"grad_norm": 0.5446426272392273,
"learning_rate": 0.0001925394585661492,
"loss": 1.4904,
"step": 181
},
{
"epoch": 0.10355618776671409,
"grad_norm": 0.47503742575645447,
"learning_rate": 0.00019245306327236172,
"loss": 1.5012,
"step": 182
},
{
"epoch": 0.10412517780938833,
"grad_norm": 0.5337246656417847,
"learning_rate": 0.00019236619022408387,
"loss": 1.4175,
"step": 183
},
{
"epoch": 0.10469416785206259,
"grad_norm": 0.5157039165496826,
"learning_rate": 0.00019227883987023523,
"loss": 1.6435,
"step": 184
},
{
"epoch": 0.10526315789473684,
"grad_norm": 0.5278623700141907,
"learning_rate": 0.00019219101266220188,
"loss": 1.6746,
"step": 185
},
{
"epoch": 0.1058321479374111,
"grad_norm": 0.4916015565395355,
"learning_rate": 0.000192102709053834,
"loss": 1.4584,
"step": 186
},
{
"epoch": 0.10640113798008535,
"grad_norm": 0.5512337684631348,
"learning_rate": 0.00019201392950144363,
"loss": 1.6313,
"step": 187
},
{
"epoch": 0.1069701280227596,
"grad_norm": 0.506673276424408,
"learning_rate": 0.0001919246744638023,
"loss": 1.4842,
"step": 188
},
{
"epoch": 0.10753911806543386,
"grad_norm": 0.49428772926330566,
"learning_rate": 0.00019183494440213857,
"loss": 1.4246,
"step": 189
},
{
"epoch": 0.10810810810810811,
"grad_norm": 0.5020580887794495,
"learning_rate": 0.0001917447397801357,
"loss": 1.6966,
"step": 190
},
{
"epoch": 0.10867709815078236,
"grad_norm": 0.5004864931106567,
"learning_rate": 0.00019165406106392928,
"loss": 1.3144,
"step": 191
},
{
"epoch": 0.10924608819345662,
"grad_norm": 0.47853466868400574,
"learning_rate": 0.00019156290872210488,
"loss": 1.3321,
"step": 192
},
{
"epoch": 0.10981507823613086,
"grad_norm": 0.4940144121646881,
"learning_rate": 0.00019147128322569533,
"loss": 1.2719,
"step": 193
},
{
"epoch": 0.11038406827880512,
"grad_norm": 0.5355538725852966,
"learning_rate": 0.00019137918504817878,
"loss": 1.4551,
"step": 194
},
{
"epoch": 0.11095305832147938,
"grad_norm": 0.5604861378669739,
"learning_rate": 0.00019128661466547576,
"loss": 1.6109,
"step": 195
},
{
"epoch": 0.11152204836415362,
"grad_norm": 0.5061023235321045,
"learning_rate": 0.000191193572555947,
"loss": 1.511,
"step": 196
},
{
"epoch": 0.11209103840682788,
"grad_norm": 0.5125574469566345,
"learning_rate": 0.0001911000592003909,
"loss": 1.4209,
"step": 197
},
{
"epoch": 0.11266002844950214,
"grad_norm": 0.5150197744369507,
"learning_rate": 0.00019100607508204114,
"loss": 1.6323,
"step": 198
},
{
"epoch": 0.11322901849217638,
"grad_norm": 0.5164692997932434,
"learning_rate": 0.0001909116206865639,
"loss": 1.5086,
"step": 199
},
{
"epoch": 0.11379800853485064,
"grad_norm": 0.5399172306060791,
"learning_rate": 0.00019081669650205564,
"loss": 1.5051,
"step": 200
},
{
"epoch": 0.11436699857752489,
"grad_norm": 0.49494683742523193,
"learning_rate": 0.0001907213030190405,
"loss": 1.5123,
"step": 201
},
{
"epoch": 0.11493598862019914,
"grad_norm": 0.5344505906105042,
"learning_rate": 0.00019062544073046768,
"loss": 1.5364,
"step": 202
},
{
"epoch": 0.1155049786628734,
"grad_norm": 0.5201467871665955,
"learning_rate": 0.00019052911013170892,
"loss": 1.5027,
"step": 203
},
{
"epoch": 0.11607396870554765,
"grad_norm": 0.5991513729095459,
"learning_rate": 0.00019043231172055603,
"loss": 1.6402,
"step": 204
},
{
"epoch": 0.1166429587482219,
"grad_norm": 0.5526711940765381,
"learning_rate": 0.00019033504599721827,
"loss": 1.6166,
"step": 205
},
{
"epoch": 0.11721194879089616,
"grad_norm": 0.493965208530426,
"learning_rate": 0.00019023731346431972,
"loss": 1.3099,
"step": 206
},
{
"epoch": 0.11778093883357041,
"grad_norm": 0.5043678879737854,
"learning_rate": 0.00019013911462689668,
"loss": 1.3328,
"step": 207
},
{
"epoch": 0.11834992887624467,
"grad_norm": 0.518515944480896,
"learning_rate": 0.00019004044999239517,
"loss": 1.453,
"step": 208
},
{
"epoch": 0.11891891891891893,
"grad_norm": 0.547725260257721,
"learning_rate": 0.00018994132007066816,
"loss": 1.552,
"step": 209
},
{
"epoch": 0.11948790896159317,
"grad_norm": 0.5498734712600708,
"learning_rate": 0.0001898417253739731,
"loss": 1.6076,
"step": 210
},
{
"epoch": 0.12005689900426743,
"grad_norm": 0.5087684392929077,
"learning_rate": 0.00018974166641696908,
"loss": 1.3459,
"step": 211
},
{
"epoch": 0.12062588904694167,
"grad_norm": 0.49864476919174194,
"learning_rate": 0.00018964114371671428,
"loss": 1.502,
"step": 212
},
{
"epoch": 0.12119487908961593,
"grad_norm": 0.49818646907806396,
"learning_rate": 0.0001895401577926634,
"loss": 1.5047,
"step": 213
},
{
"epoch": 0.12176386913229019,
"grad_norm": 0.5151641964912415,
"learning_rate": 0.00018943870916666476,
"loss": 1.5276,
"step": 214
},
{
"epoch": 0.12233285917496443,
"grad_norm": 0.5294698476791382,
"learning_rate": 0.00018933679836295777,
"loss": 1.4735,
"step": 215
},
{
"epoch": 0.12290184921763869,
"grad_norm": 0.5169737339019775,
"learning_rate": 0.0001892344259081701,
"loss": 1.6458,
"step": 216
},
{
"epoch": 0.12347083926031295,
"grad_norm": 0.5262957811355591,
"learning_rate": 0.000189131592331315,
"loss": 1.6239,
"step": 217
},
{
"epoch": 0.1240398293029872,
"grad_norm": 0.5043689012527466,
"learning_rate": 0.00018902829816378876,
"loss": 1.5785,
"step": 218
},
{
"epoch": 0.12460881934566145,
"grad_norm": 0.5032008290290833,
"learning_rate": 0.00018892454393936754,
"loss": 1.4075,
"step": 219
},
{
"epoch": 0.1251778093883357,
"grad_norm": 0.5261518359184265,
"learning_rate": 0.00018882033019420504,
"loss": 1.4251,
"step": 220
},
{
"epoch": 0.12574679943100997,
"grad_norm": 0.5519723296165466,
"learning_rate": 0.00018871565746682949,
"loss": 1.6654,
"step": 221
},
{
"epoch": 0.12631578947368421,
"grad_norm": 0.5465745329856873,
"learning_rate": 0.0001886105262981409,
"loss": 1.5489,
"step": 222
},
{
"epoch": 0.12688477951635846,
"grad_norm": 0.6040769219398499,
"learning_rate": 0.00018850493723140835,
"loss": 1.6205,
"step": 223
},
{
"epoch": 0.1274537695590327,
"grad_norm": 0.5207870006561279,
"learning_rate": 0.0001883988908122671,
"loss": 1.5843,
"step": 224
},
{
"epoch": 0.12802275960170698,
"grad_norm": 0.5130170583724976,
"learning_rate": 0.00018829238758871574,
"loss": 1.5384,
"step": 225
},
{
"epoch": 0.12859174964438122,
"grad_norm": 0.5100380182266235,
"learning_rate": 0.00018818542811111354,
"loss": 1.5026,
"step": 226
},
{
"epoch": 0.12916073968705546,
"grad_norm": 0.5047493577003479,
"learning_rate": 0.00018807801293217735,
"loss": 1.4774,
"step": 227
},
{
"epoch": 0.12972972972972974,
"grad_norm": 0.5392350554466248,
"learning_rate": 0.0001879701426069789,
"loss": 1.2986,
"step": 228
},
{
"epoch": 0.13029871977240398,
"grad_norm": 0.4927089810371399,
"learning_rate": 0.00018786181769294203,
"loss": 1.3298,
"step": 229
},
{
"epoch": 0.13086770981507823,
"grad_norm": 0.5079994797706604,
"learning_rate": 0.0001877530387498395,
"loss": 1.4027,
"step": 230
},
{
"epoch": 0.1314366998577525,
"grad_norm": 0.5074231624603271,
"learning_rate": 0.00018764380633979035,
"loss": 1.6176,
"step": 231
},
{
"epoch": 0.13200568990042674,
"grad_norm": 0.5501790642738342,
"learning_rate": 0.00018753412102725698,
"loss": 1.3795,
"step": 232
},
{
"epoch": 0.132574679943101,
"grad_norm": 0.5117084383964539,
"learning_rate": 0.00018742398337904213,
"loss": 1.4731,
"step": 233
},
{
"epoch": 0.13314366998577526,
"grad_norm": 0.5027900338172913,
"learning_rate": 0.00018731339396428607,
"loss": 1.5399,
"step": 234
},
{
"epoch": 0.1337126600284495,
"grad_norm": 0.5187605619430542,
"learning_rate": 0.00018720235335446342,
"loss": 1.5111,
"step": 235
},
{
"epoch": 0.13428165007112375,
"grad_norm": 0.5272188782691956,
"learning_rate": 0.00018709086212338058,
"loss": 1.5717,
"step": 236
},
{
"epoch": 0.13485064011379802,
"grad_norm": 0.5339289903640747,
"learning_rate": 0.00018697892084717238,
"loss": 1.4529,
"step": 237
},
{
"epoch": 0.13541963015647226,
"grad_norm": 0.5382213592529297,
"learning_rate": 0.00018686653010429937,
"loss": 1.5727,
"step": 238
},
{
"epoch": 0.1359886201991465,
"grad_norm": 0.5148522257804871,
"learning_rate": 0.00018675369047554475,
"loss": 1.5683,
"step": 239
},
{
"epoch": 0.13655761024182078,
"grad_norm": 0.5300989747047424,
"learning_rate": 0.00018664040254401121,
"loss": 1.6485,
"step": 240
},
{
"epoch": 0.13712660028449503,
"grad_norm": 0.5400955080986023,
"learning_rate": 0.00018652666689511824,
"loss": 1.5095,
"step": 241
},
{
"epoch": 0.13769559032716927,
"grad_norm": 0.49695253372192383,
"learning_rate": 0.0001864124841165988,
"loss": 1.3692,
"step": 242
},
{
"epoch": 0.13826458036984351,
"grad_norm": 0.5431788563728333,
"learning_rate": 0.00018629785479849656,
"loss": 1.5774,
"step": 243
},
{
"epoch": 0.1388335704125178,
"grad_norm": 0.5125901103019714,
"learning_rate": 0.00018618277953316245,
"loss": 1.3545,
"step": 244
},
{
"epoch": 0.13940256045519203,
"grad_norm": 0.5172457695007324,
"learning_rate": 0.0001860672589152521,
"loss": 1.5196,
"step": 245
},
{
"epoch": 0.13997155049786628,
"grad_norm": 0.5287220478057861,
"learning_rate": 0.00018595129354172235,
"loss": 1.7279,
"step": 246
},
{
"epoch": 0.14054054054054055,
"grad_norm": 0.5728311538696289,
"learning_rate": 0.00018583488401182843,
"loss": 1.5514,
"step": 247
},
{
"epoch": 0.1411095305832148,
"grad_norm": 0.5267804861068726,
"learning_rate": 0.0001857180309271207,
"loss": 1.5115,
"step": 248
},
{
"epoch": 0.14167852062588904,
"grad_norm": 0.5459727644920349,
"learning_rate": 0.00018560073489144166,
"loss": 1.5057,
"step": 249
},
{
"epoch": 0.1422475106685633,
"grad_norm": 0.5065287947654724,
"learning_rate": 0.00018548299651092269,
"loss": 1.4906,
"step": 250
},
{
"epoch": 0.14281650071123755,
"grad_norm": 0.5647059082984924,
"learning_rate": 0.00018536481639398107,
"loss": 1.5447,
"step": 251
},
{
"epoch": 0.1433854907539118,
"grad_norm": 0.5164194703102112,
"learning_rate": 0.00018524619515131679,
"loss": 1.6922,
"step": 252
},
{
"epoch": 0.14395448079658607,
"grad_norm": 0.5288499593734741,
"learning_rate": 0.0001851271333959093,
"loss": 1.5596,
"step": 253
},
{
"epoch": 0.14452347083926032,
"grad_norm": 0.509348452091217,
"learning_rate": 0.00018500763174301448,
"loss": 1.6263,
"step": 254
},
{
"epoch": 0.14509246088193456,
"grad_norm": 0.5377824902534485,
"learning_rate": 0.00018488769081016133,
"loss": 1.4711,
"step": 255
},
{
"epoch": 0.14566145092460883,
"grad_norm": 0.5068728923797607,
"learning_rate": 0.00018476731121714894,
"loss": 1.6706,
"step": 256
},
{
"epoch": 0.14623044096728308,
"grad_norm": 0.5097038745880127,
"learning_rate": 0.0001846464935860431,
"loss": 1.5841,
"step": 257
},
{
"epoch": 0.14679943100995732,
"grad_norm": 0.5391016006469727,
"learning_rate": 0.0001845252385411732,
"loss": 1.6935,
"step": 258
},
{
"epoch": 0.14736842105263157,
"grad_norm": 0.5154038667678833,
"learning_rate": 0.00018440354670912906,
"loss": 1.3827,
"step": 259
},
{
"epoch": 0.14793741109530584,
"grad_norm": 0.5789750814437866,
"learning_rate": 0.00018428141871875743,
"loss": 1.545,
"step": 260
},
{
"epoch": 0.14850640113798008,
"grad_norm": 0.5456128716468811,
"learning_rate": 0.00018415885520115915,
"loss": 1.5359,
"step": 261
},
{
"epoch": 0.14907539118065433,
"grad_norm": 0.6158856749534607,
"learning_rate": 0.00018403585678968551,
"loss": 1.7601,
"step": 262
},
{
"epoch": 0.1496443812233286,
"grad_norm": 0.4721933603286743,
"learning_rate": 0.00018391242411993516,
"loss": 1.3328,
"step": 263
},
{
"epoch": 0.15021337126600284,
"grad_norm": 0.5242535471916199,
"learning_rate": 0.00018378855782975084,
"loss": 1.3359,
"step": 264
},
{
"epoch": 0.1507823613086771,
"grad_norm": 0.5116239190101624,
"learning_rate": 0.000183664258559216,
"loss": 1.218,
"step": 265
},
{
"epoch": 0.15135135135135136,
"grad_norm": 0.5715349316596985,
"learning_rate": 0.0001835395269506515,
"loss": 1.7737,
"step": 266
},
{
"epoch": 0.1519203413940256,
"grad_norm": 0.5294284224510193,
"learning_rate": 0.0001834143636486124,
"loss": 1.7273,
"step": 267
},
{
"epoch": 0.15248933143669985,
"grad_norm": 0.5225195288658142,
"learning_rate": 0.0001832887692998845,
"loss": 1.5397,
"step": 268
},
{
"epoch": 0.15305832147937412,
"grad_norm": 0.5032251477241516,
"learning_rate": 0.00018316274455348105,
"loss": 1.4483,
"step": 269
},
{
"epoch": 0.15362731152204837,
"grad_norm": 0.5733814835548401,
"learning_rate": 0.00018303629006063943,
"loss": 1.5798,
"step": 270
},
{
"epoch": 0.1541963015647226,
"grad_norm": 0.5273986458778381,
"learning_rate": 0.0001829094064748177,
"loss": 1.6515,
"step": 271
},
{
"epoch": 0.15476529160739688,
"grad_norm": 0.563911497592926,
"learning_rate": 0.00018278209445169135,
"loss": 1.6408,
"step": 272
},
{
"epoch": 0.15533428165007113,
"grad_norm": 0.5052376985549927,
"learning_rate": 0.00018265435464914973,
"loss": 1.3572,
"step": 273
},
{
"epoch": 0.15590327169274537,
"grad_norm": 0.5052018761634827,
"learning_rate": 0.0001825261877272928,
"loss": 1.5019,
"step": 274
},
{
"epoch": 0.15647226173541964,
"grad_norm": 0.4795508086681366,
"learning_rate": 0.00018239759434842773,
"loss": 1.0659,
"step": 275
},
{
"epoch": 0.1570412517780939,
"grad_norm": 0.5224232077598572,
"learning_rate": 0.00018226857517706537,
"loss": 1.6048,
"step": 276
},
{
"epoch": 0.15761024182076813,
"grad_norm": 0.5337119698524475,
"learning_rate": 0.00018213913087991685,
"loss": 1.4884,
"step": 277
},
{
"epoch": 0.15817923186344238,
"grad_norm": 0.48973479866981506,
"learning_rate": 0.0001820092621258902,
"loss": 1.3599,
"step": 278
},
{
"epoch": 0.15874822190611665,
"grad_norm": 0.4995887577533722,
"learning_rate": 0.0001818789695860868,
"loss": 1.5088,
"step": 279
},
{
"epoch": 0.1593172119487909,
"grad_norm": 0.513390064239502,
"learning_rate": 0.00018174825393379798,
"loss": 1.5376,
"step": 280
},
{
"epoch": 0.15988620199146514,
"grad_norm": 0.5285114645957947,
"learning_rate": 0.00018161711584450152,
"loss": 1.706,
"step": 281
},
{
"epoch": 0.1604551920341394,
"grad_norm": 0.5384095907211304,
"learning_rate": 0.00018148555599585816,
"loss": 1.474,
"step": 282
},
{
"epoch": 0.16102418207681365,
"grad_norm": 0.5326551795005798,
"learning_rate": 0.0001813535750677081,
"loss": 1.4764,
"step": 283
},
{
"epoch": 0.1615931721194879,
"grad_norm": 0.538357675075531,
"learning_rate": 0.0001812211737420675,
"loss": 1.7382,
"step": 284
},
{
"epoch": 0.16216216216216217,
"grad_norm": 0.5192847847938538,
"learning_rate": 0.00018108835270312488,
"loss": 1.5809,
"step": 285
},
{
"epoch": 0.16273115220483642,
"grad_norm": 0.5059441328048706,
"learning_rate": 0.00018095511263723768,
"loss": 1.3315,
"step": 286
},
{
"epoch": 0.16330014224751066,
"grad_norm": 0.542091429233551,
"learning_rate": 0.00018082145423292868,
"loss": 1.394,
"step": 287
},
{
"epoch": 0.16386913229018493,
"grad_norm": 0.5587398409843445,
"learning_rate": 0.00018068737818088248,
"loss": 1.5478,
"step": 288
},
{
"epoch": 0.16443812233285918,
"grad_norm": 0.5091587901115417,
"learning_rate": 0.00018055288517394174,
"loss": 1.4298,
"step": 289
},
{
"epoch": 0.16500711237553342,
"grad_norm": 0.5347201228141785,
"learning_rate": 0.00018041797590710398,
"loss": 1.4504,
"step": 290
},
{
"epoch": 0.1655761024182077,
"grad_norm": 0.5370376110076904,
"learning_rate": 0.00018028265107751756,
"loss": 1.6061,
"step": 291
},
{
"epoch": 0.16614509246088194,
"grad_norm": 0.5322532057762146,
"learning_rate": 0.00018014691138447834,
"loss": 1.5102,
"step": 292
},
{
"epoch": 0.16671408250355618,
"grad_norm": 0.4970771074295044,
"learning_rate": 0.00018001075752942605,
"loss": 1.3017,
"step": 293
},
{
"epoch": 0.16728307254623045,
"grad_norm": 0.5143032670021057,
"learning_rate": 0.00017987419021594053,
"loss": 1.5115,
"step": 294
},
{
"epoch": 0.1678520625889047,
"grad_norm": 0.4978564977645874,
"learning_rate": 0.00017973721014973823,
"loss": 1.33,
"step": 295
},
{
"epoch": 0.16842105263157894,
"grad_norm": 0.5085217356681824,
"learning_rate": 0.00017959981803866856,
"loss": 1.3251,
"step": 296
},
{
"epoch": 0.1689900426742532,
"grad_norm": 0.522738516330719,
"learning_rate": 0.0001794620145927101,
"loss": 1.3305,
"step": 297
},
{
"epoch": 0.16955903271692746,
"grad_norm": 0.506791353225708,
"learning_rate": 0.00017932380052396702,
"loss": 1.5626,
"step": 298
},
{
"epoch": 0.1701280227596017,
"grad_norm": 0.541067898273468,
"learning_rate": 0.0001791851765466655,
"loss": 1.6446,
"step": 299
},
{
"epoch": 0.17069701280227595,
"grad_norm": 0.5105940103530884,
"learning_rate": 0.0001790461433771498,
"loss": 1.5842,
"step": 300
},
{
"epoch": 0.17126600284495022,
"grad_norm": 0.49997130036354065,
"learning_rate": 0.00017890670173387885,
"loss": 1.5844,
"step": 301
},
{
"epoch": 0.17183499288762447,
"grad_norm": 0.5258059501647949,
"learning_rate": 0.00017876685233742226,
"loss": 1.5576,
"step": 302
},
{
"epoch": 0.1724039829302987,
"grad_norm": 0.5664198398590088,
"learning_rate": 0.00017862659591045673,
"loss": 1.4313,
"step": 303
},
{
"epoch": 0.17297297297297298,
"grad_norm": 0.5197086930274963,
"learning_rate": 0.00017848593317776234,
"loss": 1.4374,
"step": 304
},
{
"epoch": 0.17354196301564723,
"grad_norm": 0.5377213954925537,
"learning_rate": 0.0001783448648662188,
"loss": 1.3973,
"step": 305
},
{
"epoch": 0.17411095305832147,
"grad_norm": 0.4912850260734558,
"learning_rate": 0.00017820339170480156,
"loss": 1.3055,
"step": 306
},
{
"epoch": 0.17467994310099574,
"grad_norm": 0.5148215293884277,
"learning_rate": 0.00017806151442457827,
"loss": 1.5493,
"step": 307
},
{
"epoch": 0.17524893314367,
"grad_norm": 0.5305980443954468,
"learning_rate": 0.0001779192337587048,
"loss": 1.6176,
"step": 308
},
{
"epoch": 0.17581792318634423,
"grad_norm": 0.5322251319885254,
"learning_rate": 0.0001777765504424215,
"loss": 1.6621,
"step": 309
},
{
"epoch": 0.1763869132290185,
"grad_norm": 0.5405860543251038,
"learning_rate": 0.00017763346521304955,
"loss": 1.5951,
"step": 310
},
{
"epoch": 0.17695590327169275,
"grad_norm": 0.5762712359428406,
"learning_rate": 0.00017748997880998691,
"loss": 1.4609,
"step": 311
},
{
"epoch": 0.177524893314367,
"grad_norm": 0.5313809514045715,
"learning_rate": 0.0001773460919747047,
"loss": 1.4488,
"step": 312
},
{
"epoch": 0.17809388335704124,
"grad_norm": 0.5385677814483643,
"learning_rate": 0.00017720180545074322,
"loss": 1.5543,
"step": 313
},
{
"epoch": 0.1786628733997155,
"grad_norm": 0.5349786877632141,
"learning_rate": 0.00017705711998370824,
"loss": 1.5848,
"step": 314
},
{
"epoch": 0.17923186344238975,
"grad_norm": 0.5395460724830627,
"learning_rate": 0.00017691203632126706,
"loss": 1.5344,
"step": 315
},
{
"epoch": 0.179800853485064,
"grad_norm": 0.5073065757751465,
"learning_rate": 0.0001767665552131446,
"loss": 1.4227,
"step": 316
},
{
"epoch": 0.18036984352773827,
"grad_norm": 0.5242070555686951,
"learning_rate": 0.00017662067741111974,
"loss": 1.5054,
"step": 317
},
{
"epoch": 0.18093883357041252,
"grad_norm": 0.5271447896957397,
"learning_rate": 0.00017647440366902117,
"loss": 1.5675,
"step": 318
},
{
"epoch": 0.18150782361308676,
"grad_norm": 0.5302979946136475,
"learning_rate": 0.00017632773474272363,
"loss": 1.4631,
"step": 319
},
{
"epoch": 0.18207681365576103,
"grad_norm": 0.5438220500946045,
"learning_rate": 0.00017618067139014404,
"loss": 1.4737,
"step": 320
},
{
"epoch": 0.18264580369843528,
"grad_norm": 0.5002385377883911,
"learning_rate": 0.0001760332143712375,
"loss": 1.3976,
"step": 321
},
{
"epoch": 0.18321479374110952,
"grad_norm": 0.5478991866111755,
"learning_rate": 0.00017588536444799338,
"loss": 1.527,
"step": 322
},
{
"epoch": 0.1837837837837838,
"grad_norm": 0.5406285524368286,
"learning_rate": 0.0001757371223844314,
"loss": 1.4453,
"step": 323
},
{
"epoch": 0.18435277382645804,
"grad_norm": 0.5226593613624573,
"learning_rate": 0.00017558848894659771,
"loss": 1.5309,
"step": 324
},
{
"epoch": 0.18492176386913228,
"grad_norm": 0.5488921999931335,
"learning_rate": 0.0001754394649025609,
"loss": 1.6993,
"step": 325
},
{
"epoch": 0.18549075391180656,
"grad_norm": 0.5268238186836243,
"learning_rate": 0.000175290051022408,
"loss": 1.4578,
"step": 326
},
{
"epoch": 0.1860597439544808,
"grad_norm": 0.5236526727676392,
"learning_rate": 0.00017514024807824055,
"loss": 1.5276,
"step": 327
},
{
"epoch": 0.18662873399715504,
"grad_norm": 0.5280612707138062,
"learning_rate": 0.00017499005684417057,
"loss": 1.5191,
"step": 328
},
{
"epoch": 0.18719772403982932,
"grad_norm": 0.5311048030853271,
"learning_rate": 0.0001748394780963166,
"loss": 1.6317,
"step": 329
},
{
"epoch": 0.18776671408250356,
"grad_norm": 0.5343871712684631,
"learning_rate": 0.0001746885126127997,
"loss": 1.6759,
"step": 330
},
{
"epoch": 0.1883357041251778,
"grad_norm": 0.5824495553970337,
"learning_rate": 0.00017453716117373937,
"loss": 1.5064,
"step": 331
},
{
"epoch": 0.18890469416785205,
"grad_norm": 0.5165912508964539,
"learning_rate": 0.0001743854245612495,
"loss": 1.413,
"step": 332
},
{
"epoch": 0.18947368421052632,
"grad_norm": 0.5721679329872131,
"learning_rate": 0.0001742333035594345,
"loss": 1.3518,
"step": 333
},
{
"epoch": 0.19004267425320057,
"grad_norm": 0.5547354817390442,
"learning_rate": 0.00017408079895438498,
"loss": 1.7325,
"step": 334
},
{
"epoch": 0.1906116642958748,
"grad_norm": 0.5567200779914856,
"learning_rate": 0.00017392791153417398,
"loss": 1.6179,
"step": 335
},
{
"epoch": 0.19118065433854908,
"grad_norm": 0.5186401009559631,
"learning_rate": 0.00017377464208885265,
"loss": 1.3499,
"step": 336
},
{
"epoch": 0.19174964438122333,
"grad_norm": 0.5111268758773804,
"learning_rate": 0.00017362099141044626,
"loss": 1.2942,
"step": 337
},
{
"epoch": 0.19231863442389757,
"grad_norm": 0.5359705090522766,
"learning_rate": 0.0001734669602929502,
"loss": 1.552,
"step": 338
},
{
"epoch": 0.19288762446657184,
"grad_norm": 0.5835704803466797,
"learning_rate": 0.0001733125495323257,
"loss": 1.3161,
"step": 339
},
{
"epoch": 0.1934566145092461,
"grad_norm": 0.5223122835159302,
"learning_rate": 0.00017315775992649584,
"loss": 1.5189,
"step": 340
},
{
"epoch": 0.19402560455192033,
"grad_norm": 0.5331559777259827,
"learning_rate": 0.0001730025922753415,
"loss": 1.7263,
"step": 341
},
{
"epoch": 0.1945945945945946,
"grad_norm": 0.54593425989151,
"learning_rate": 0.00017284704738069698,
"loss": 1.5158,
"step": 342
},
{
"epoch": 0.19516358463726885,
"grad_norm": 0.5385016202926636,
"learning_rate": 0.000172691126046346,
"loss": 1.5762,
"step": 343
},
{
"epoch": 0.1957325746799431,
"grad_norm": 0.4981791079044342,
"learning_rate": 0.00017253482907801773,
"loss": 1.3606,
"step": 344
},
{
"epoch": 0.19630156472261737,
"grad_norm": 0.5046445727348328,
"learning_rate": 0.00017237815728338217,
"loss": 1.382,
"step": 345
},
{
"epoch": 0.1968705547652916,
"grad_norm": 0.5692354440689087,
"learning_rate": 0.00017222111147204645,
"loss": 1.6214,
"step": 346
},
{
"epoch": 0.19743954480796586,
"grad_norm": 0.5191353559494019,
"learning_rate": 0.00017206369245555036,
"loss": 1.459,
"step": 347
},
{
"epoch": 0.1980085348506401,
"grad_norm": 0.5159747004508972,
"learning_rate": 0.0001719059010473623,
"loss": 1.6057,
"step": 348
},
{
"epoch": 0.1980085348506401,
"eval_loss": 1.506325602531433,
"eval_runtime": 16.4362,
"eval_samples_per_second": 45.023,
"eval_steps_per_second": 22.511,
"step": 348
},
{
"epoch": 0.19857752489331437,
"grad_norm": 0.5306143164634705,
"learning_rate": 0.00017174773806287496,
"loss": 1.5776,
"step": 349
},
{
"epoch": 0.19914651493598862,
"grad_norm": 0.5569584369659424,
"learning_rate": 0.00017158920431940117,
"loss": 1.5926,
"step": 350
},
{
"epoch": 0.19971550497866286,
"grad_norm": 0.5538038611412048,
"learning_rate": 0.0001714303006361697,
"loss": 1.6146,
"step": 351
},
{
"epoch": 0.20028449502133713,
"grad_norm": 0.5369197130203247,
"learning_rate": 0.00017127102783432097,
"loss": 1.514,
"step": 352
},
{
"epoch": 0.20085348506401138,
"grad_norm": 0.6111621856689453,
"learning_rate": 0.00017111138673690283,
"loss": 1.3508,
"step": 353
},
{
"epoch": 0.20142247510668562,
"grad_norm": 0.5350061655044556,
"learning_rate": 0.0001709513781688664,
"loss": 1.5506,
"step": 354
},
{
"epoch": 0.2019914651493599,
"grad_norm": 0.5226223468780518,
"learning_rate": 0.00017079100295706154,
"loss": 1.55,
"step": 355
},
{
"epoch": 0.20256045519203414,
"grad_norm": 0.5834634304046631,
"learning_rate": 0.0001706302619302329,
"loss": 1.6025,
"step": 356
},
{
"epoch": 0.20312944523470838,
"grad_norm": 0.564756453037262,
"learning_rate": 0.0001704691559190155,
"loss": 1.5174,
"step": 357
},
{
"epoch": 0.20369843527738266,
"grad_norm": 0.5217262506484985,
"learning_rate": 0.00017030768575593025,
"loss": 1.4321,
"step": 358
},
{
"epoch": 0.2042674253200569,
"grad_norm": 0.5270060896873474,
"learning_rate": 0.0001701458522753801,
"loss": 1.6006,
"step": 359
},
{
"epoch": 0.20483641536273114,
"grad_norm": 0.5722881555557251,
"learning_rate": 0.00016998365631364527,
"loss": 1.7025,
"step": 360
},
{
"epoch": 0.20540540540540542,
"grad_norm": 0.5267907977104187,
"learning_rate": 0.00016982109870887908,
"loss": 1.5108,
"step": 361
},
{
"epoch": 0.20597439544807966,
"grad_norm": 0.5428017973899841,
"learning_rate": 0.00016965818030110382,
"loss": 1.6343,
"step": 362
},
{
"epoch": 0.2065433854907539,
"grad_norm": 0.5151480436325073,
"learning_rate": 0.0001694949019322061,
"loss": 1.5242,
"step": 363
},
{
"epoch": 0.20711237553342818,
"grad_norm": 0.5217251181602478,
"learning_rate": 0.00016933126444593273,
"loss": 1.54,
"step": 364
},
{
"epoch": 0.20768136557610242,
"grad_norm": 0.5215661525726318,
"learning_rate": 0.00016916726868788622,
"loss": 1.5131,
"step": 365
},
{
"epoch": 0.20825035561877667,
"grad_norm": 0.5087475776672363,
"learning_rate": 0.00016900291550552048,
"loss": 1.6782,
"step": 366
},
{
"epoch": 0.2088193456614509,
"grad_norm": 0.5366347432136536,
"learning_rate": 0.0001688382057481364,
"loss": 1.5821,
"step": 367
},
{
"epoch": 0.20938833570412518,
"grad_norm": 0.5469174385070801,
"learning_rate": 0.00016867314026687753,
"loss": 1.8795,
"step": 368
},
{
"epoch": 0.20995732574679943,
"grad_norm": 0.5702829957008362,
"learning_rate": 0.00016850771991472563,
"loss": 1.4382,
"step": 369
},
{
"epoch": 0.21052631578947367,
"grad_norm": 0.5792803764343262,
"learning_rate": 0.0001683419455464962,
"loss": 1.6934,
"step": 370
},
{
"epoch": 0.21109530583214794,
"grad_norm": 0.5423445701599121,
"learning_rate": 0.0001681758180188342,
"loss": 1.5408,
"step": 371
},
{
"epoch": 0.2116642958748222,
"grad_norm": 0.5211445093154907,
"learning_rate": 0.00016800933819020956,
"loss": 1.5354,
"step": 372
},
{
"epoch": 0.21223328591749643,
"grad_norm": 0.5631567239761353,
"learning_rate": 0.0001678425069209127,
"loss": 1.6356,
"step": 373
},
{
"epoch": 0.2128022759601707,
"grad_norm": 0.5736171007156372,
"learning_rate": 0.0001676753250730501,
"loss": 1.6202,
"step": 374
},
{
"epoch": 0.21337126600284495,
"grad_norm": 0.5194095373153687,
"learning_rate": 0.00016750779351053994,
"loss": 1.4419,
"step": 375
},
{
"epoch": 0.2139402560455192,
"grad_norm": 0.5220928192138672,
"learning_rate": 0.0001673399130991075,
"loss": 1.4182,
"step": 376
},
{
"epoch": 0.21450924608819347,
"grad_norm": 0.5223848819732666,
"learning_rate": 0.00016717168470628077,
"loss": 1.5831,
"step": 377
},
{
"epoch": 0.2150782361308677,
"grad_norm": 0.5400263071060181,
"learning_rate": 0.00016700310920138596,
"loss": 1.579,
"step": 378
},
{
"epoch": 0.21564722617354196,
"grad_norm": 0.5276429653167725,
"learning_rate": 0.00016683418745554299,
"loss": 1.4674,
"step": 379
},
{
"epoch": 0.21621621621621623,
"grad_norm": 0.5498270392417908,
"learning_rate": 0.000166664920341661,
"loss": 1.8171,
"step": 380
},
{
"epoch": 0.21678520625889047,
"grad_norm": 0.5207138657569885,
"learning_rate": 0.00016649530873443375,
"loss": 1.3337,
"step": 381
},
{
"epoch": 0.21735419630156472,
"grad_norm": 0.5555972456932068,
"learning_rate": 0.00016632535351033533,
"loss": 1.5634,
"step": 382
},
{
"epoch": 0.217923186344239,
"grad_norm": 0.5569733381271362,
"learning_rate": 0.00016615505554761533,
"loss": 1.6649,
"step": 383
},
{
"epoch": 0.21849217638691323,
"grad_norm": 0.5526515245437622,
"learning_rate": 0.00016598441572629458,
"loss": 1.2708,
"step": 384
},
{
"epoch": 0.21906116642958748,
"grad_norm": 0.5405237674713135,
"learning_rate": 0.0001658134349281604,
"loss": 1.5085,
"step": 385
},
{
"epoch": 0.21963015647226172,
"grad_norm": 0.5164327621459961,
"learning_rate": 0.00016564211403676213,
"loss": 1.4096,
"step": 386
},
{
"epoch": 0.220199146514936,
"grad_norm": 0.535915195941925,
"learning_rate": 0.0001654704539374066,
"loss": 1.5407,
"step": 387
},
{
"epoch": 0.22076813655761024,
"grad_norm": 0.5589139461517334,
"learning_rate": 0.0001652984555171534,
"loss": 1.5837,
"step": 388
},
{
"epoch": 0.22133712660028448,
"grad_norm": 0.5141209959983826,
"learning_rate": 0.00016512611966481056,
"loss": 1.377,
"step": 389
},
{
"epoch": 0.22190611664295876,
"grad_norm": 0.514789879322052,
"learning_rate": 0.00016495344727092973,
"loss": 1.5191,
"step": 390
},
{
"epoch": 0.222475106685633,
"grad_norm": 0.5353395342826843,
"learning_rate": 0.00016478043922780157,
"loss": 1.5026,
"step": 391
},
{
"epoch": 0.22304409672830725,
"grad_norm": 0.5318089127540588,
"learning_rate": 0.00016460709642945133,
"loss": 1.5277,
"step": 392
},
{
"epoch": 0.22361308677098152,
"grad_norm": 0.5722904205322266,
"learning_rate": 0.00016443341977163408,
"loss": 1.3433,
"step": 393
},
{
"epoch": 0.22418207681365576,
"grad_norm": 0.542008101940155,
"learning_rate": 0.0001642594101518301,
"loss": 1.5241,
"step": 394
},
{
"epoch": 0.22475106685633,
"grad_norm": 0.5351589918136597,
"learning_rate": 0.00016408506846924035,
"loss": 1.6335,
"step": 395
},
{
"epoch": 0.22532005689900428,
"grad_norm": 0.5150931477546692,
"learning_rate": 0.00016391039562478157,
"loss": 1.5412,
"step": 396
},
{
"epoch": 0.22588904694167852,
"grad_norm": 0.5498356819152832,
"learning_rate": 0.00016373539252108202,
"loss": 1.5062,
"step": 397
},
{
"epoch": 0.22645803698435277,
"grad_norm": 0.5373052358627319,
"learning_rate": 0.0001635600600624763,
"loss": 1.6658,
"step": 398
},
{
"epoch": 0.22702702702702704,
"grad_norm": 0.5198200941085815,
"learning_rate": 0.00016338439915500127,
"loss": 1.3554,
"step": 399
},
{
"epoch": 0.22759601706970128,
"grad_norm": 0.5517953038215637,
"learning_rate": 0.00016320841070639083,
"loss": 1.5403,
"step": 400
},
{
"epoch": 0.22816500711237553,
"grad_norm": 0.5407613515853882,
"learning_rate": 0.00016303209562607154,
"loss": 1.5033,
"step": 401
},
{
"epoch": 0.22873399715504977,
"grad_norm": 0.5271732211112976,
"learning_rate": 0.00016285545482515792,
"loss": 1.4554,
"step": 402
},
{
"epoch": 0.22930298719772405,
"grad_norm": 0.5387139916419983,
"learning_rate": 0.0001626784892164475,
"loss": 1.7347,
"step": 403
},
{
"epoch": 0.2298719772403983,
"grad_norm": 0.5222678780555725,
"learning_rate": 0.00016250119971441637,
"loss": 1.4489,
"step": 404
},
{
"epoch": 0.23044096728307253,
"grad_norm": 0.5498174428939819,
"learning_rate": 0.00016232358723521436,
"loss": 1.6047,
"step": 405
},
{
"epoch": 0.2310099573257468,
"grad_norm": 0.5119244456291199,
"learning_rate": 0.0001621456526966603,
"loss": 1.5818,
"step": 406
},
{
"epoch": 0.23157894736842105,
"grad_norm": 0.5584565997123718,
"learning_rate": 0.00016196739701823716,
"loss": 1.6863,
"step": 407
},
{
"epoch": 0.2321479374110953,
"grad_norm": 0.5125292539596558,
"learning_rate": 0.00016178882112108752,
"loss": 1.4137,
"step": 408
},
{
"epoch": 0.23271692745376957,
"grad_norm": 0.518551230430603,
"learning_rate": 0.00016160992592800872,
"loss": 1.304,
"step": 409
},
{
"epoch": 0.2332859174964438,
"grad_norm": 0.5396437048912048,
"learning_rate": 0.00016143071236344797,
"loss": 1.6118,
"step": 410
},
{
"epoch": 0.23385490753911806,
"grad_norm": 0.6036053895950317,
"learning_rate": 0.0001612511813534978,
"loss": 1.5618,
"step": 411
},
{
"epoch": 0.23442389758179233,
"grad_norm": 0.5274645686149597,
"learning_rate": 0.00016107133382589105,
"loss": 1.5238,
"step": 412
},
{
"epoch": 0.23499288762446657,
"grad_norm": 0.5649259090423584,
"learning_rate": 0.00016089117070999616,
"loss": 1.4841,
"step": 413
},
{
"epoch": 0.23556187766714082,
"grad_norm": 0.5350419282913208,
"learning_rate": 0.0001607106929368125,
"loss": 1.4252,
"step": 414
},
{
"epoch": 0.2361308677098151,
"grad_norm": 0.5421844124794006,
"learning_rate": 0.00016052990143896535,
"loss": 1.3899,
"step": 415
},
{
"epoch": 0.23669985775248933,
"grad_norm": 0.5462636947631836,
"learning_rate": 0.0001603487971507012,
"loss": 1.6417,
"step": 416
},
{
"epoch": 0.23726884779516358,
"grad_norm": 0.564430832862854,
"learning_rate": 0.00016016738100788297,
"loss": 1.6418,
"step": 417
},
{
"epoch": 0.23783783783783785,
"grad_norm": 0.5399342179298401,
"learning_rate": 0.00015998565394798492,
"loss": 1.3624,
"step": 418
},
{
"epoch": 0.2384068278805121,
"grad_norm": 0.5136001706123352,
"learning_rate": 0.00015980361691008815,
"loss": 1.3956,
"step": 419
},
{
"epoch": 0.23897581792318634,
"grad_norm": 0.5325256586074829,
"learning_rate": 0.00015962127083487548,
"loss": 1.2396,
"step": 420
},
{
"epoch": 0.23954480796586058,
"grad_norm": 0.5132279396057129,
"learning_rate": 0.00015943861666462675,
"loss": 1.4461,
"step": 421
},
{
"epoch": 0.24011379800853486,
"grad_norm": 0.5597640872001648,
"learning_rate": 0.0001592556553432139,
"loss": 1.5031,
"step": 422
},
{
"epoch": 0.2406827880512091,
"grad_norm": 0.5563086271286011,
"learning_rate": 0.00015907238781609606,
"loss": 1.4839,
"step": 423
},
{
"epoch": 0.24125177809388335,
"grad_norm": 0.557904839515686,
"learning_rate": 0.00015888881503031468,
"loss": 1.6277,
"step": 424
},
{
"epoch": 0.24182076813655762,
"grad_norm": 0.5795301198959351,
"learning_rate": 0.00015870493793448864,
"loss": 1.4073,
"step": 425
},
{
"epoch": 0.24238975817923186,
"grad_norm": 0.5133345127105713,
"learning_rate": 0.00015852075747880938,
"loss": 1.3689,
"step": 426
},
{
"epoch": 0.2429587482219061,
"grad_norm": 0.5455712676048279,
"learning_rate": 0.00015833627461503595,
"loss": 1.6118,
"step": 427
},
{
"epoch": 0.24352773826458038,
"grad_norm": 0.5585681796073914,
"learning_rate": 0.00015815149029649013,
"loss": 1.5628,
"step": 428
},
{
"epoch": 0.24409672830725462,
"grad_norm": 0.5475082397460938,
"learning_rate": 0.0001579664054780514,
"loss": 1.5907,
"step": 429
},
{
"epoch": 0.24466571834992887,
"grad_norm": 0.530405580997467,
"learning_rate": 0.0001577810211161522,
"loss": 1.5324,
"step": 430
},
{
"epoch": 0.24523470839260314,
"grad_norm": 0.5662998557090759,
"learning_rate": 0.00015759533816877275,
"loss": 1.2456,
"step": 431
},
{
"epoch": 0.24580369843527738,
"grad_norm": 0.6249381303787231,
"learning_rate": 0.0001574093575954363,
"loss": 1.4694,
"step": 432
},
{
"epoch": 0.24637268847795163,
"grad_norm": 0.5382659435272217,
"learning_rate": 0.00015722308035720408,
"loss": 1.6025,
"step": 433
},
{
"epoch": 0.2469416785206259,
"grad_norm": 0.5415714383125305,
"learning_rate": 0.00015703650741667036,
"loss": 1.3643,
"step": 434
},
{
"epoch": 0.24751066856330015,
"grad_norm": 0.540256917476654,
"learning_rate": 0.0001568496397379574,
"loss": 1.4577,
"step": 435
},
{
"epoch": 0.2480796586059744,
"grad_norm": 0.5126465559005737,
"learning_rate": 0.0001566624782867106,
"loss": 1.5512,
"step": 436
},
{
"epoch": 0.24864864864864866,
"grad_norm": 0.5520801544189453,
"learning_rate": 0.0001564750240300934,
"loss": 1.6545,
"step": 437
},
{
"epoch": 0.2492176386913229,
"grad_norm": 0.5290027260780334,
"learning_rate": 0.00015628727793678233,
"loss": 1.5391,
"step": 438
},
{
"epoch": 0.24978662873399715,
"grad_norm": 0.5835967659950256,
"learning_rate": 0.00015609924097696203,
"loss": 1.4657,
"step": 439
},
{
"epoch": 0.2503556187766714,
"grad_norm": 0.5586689710617065,
"learning_rate": 0.00015591091412232012,
"loss": 1.5222,
"step": 440
},
{
"epoch": 0.25092460881934564,
"grad_norm": 0.5292929410934448,
"learning_rate": 0.00015572229834604235,
"loss": 1.4726,
"step": 441
},
{
"epoch": 0.25149359886201994,
"grad_norm": 0.5165523290634155,
"learning_rate": 0.00015553339462280748,
"loss": 1.4154,
"step": 442
},
{
"epoch": 0.2520625889046942,
"grad_norm": 0.5475851893424988,
"learning_rate": 0.00015534420392878211,
"loss": 1.5885,
"step": 443
},
{
"epoch": 0.25263157894736843,
"grad_norm": 0.5540974736213684,
"learning_rate": 0.00015515472724161598,
"loss": 1.4529,
"step": 444
},
{
"epoch": 0.2532005689900427,
"grad_norm": 0.5251240730285645,
"learning_rate": 0.00015496496554043653,
"loss": 1.3794,
"step": 445
},
{
"epoch": 0.2537695590327169,
"grad_norm": 0.5751416683197021,
"learning_rate": 0.00015477491980584417,
"loss": 1.5417,
"step": 446
},
{
"epoch": 0.25433854907539116,
"grad_norm": 0.5411546230316162,
"learning_rate": 0.00015458459101990693,
"loss": 1.6787,
"step": 447
},
{
"epoch": 0.2549075391180654,
"grad_norm": 0.5817191004753113,
"learning_rate": 0.00015439398016615558,
"loss": 1.5382,
"step": 448
},
{
"epoch": 0.2554765291607397,
"grad_norm": 0.505901038646698,
"learning_rate": 0.00015420308822957848,
"loss": 1.3885,
"step": 449
},
{
"epoch": 0.25604551920341395,
"grad_norm": 0.5091856718063354,
"learning_rate": 0.00015401191619661658,
"loss": 1.4067,
"step": 450
},
{
"epoch": 0.2566145092460882,
"grad_norm": 0.5677408576011658,
"learning_rate": 0.00015382046505515803,
"loss": 1.5578,
"step": 451
},
{
"epoch": 0.25718349928876244,
"grad_norm": 0.5270281434059143,
"learning_rate": 0.00015362873579453348,
"loss": 1.3921,
"step": 452
},
{
"epoch": 0.2577524893314367,
"grad_norm": 0.5784454345703125,
"learning_rate": 0.00015343672940551067,
"loss": 1.5433,
"step": 453
},
{
"epoch": 0.25832147937411093,
"grad_norm": 0.5490661859512329,
"learning_rate": 0.00015324444688028947,
"loss": 1.4543,
"step": 454
},
{
"epoch": 0.25889046941678523,
"grad_norm": 0.5555963516235352,
"learning_rate": 0.00015305188921249665,
"loss": 1.3882,
"step": 455
},
{
"epoch": 0.2594594594594595,
"grad_norm": 0.5918729305267334,
"learning_rate": 0.0001528590573971808,
"loss": 1.6544,
"step": 456
},
{
"epoch": 0.2600284495021337,
"grad_norm": 0.5301398038864136,
"learning_rate": 0.00015266595243080714,
"loss": 1.6201,
"step": 457
},
{
"epoch": 0.26059743954480796,
"grad_norm": 0.5327576994895935,
"learning_rate": 0.0001524725753112525,
"loss": 1.6861,
"step": 458
},
{
"epoch": 0.2611664295874822,
"grad_norm": 0.5090361833572388,
"learning_rate": 0.00015227892703780003,
"loss": 1.2298,
"step": 459
},
{
"epoch": 0.26173541963015645,
"grad_norm": 0.5667193531990051,
"learning_rate": 0.00015208500861113401,
"loss": 1.4061,
"step": 460
},
{
"epoch": 0.26230440967283075,
"grad_norm": 0.5170226097106934,
"learning_rate": 0.00015189082103333484,
"loss": 1.3402,
"step": 461
},
{
"epoch": 0.262873399715505,
"grad_norm": 0.5260865688323975,
"learning_rate": 0.0001516963653078737,
"loss": 1.4571,
"step": 462
},
{
"epoch": 0.26344238975817924,
"grad_norm": 0.5484414100646973,
"learning_rate": 0.00015150164243960752,
"loss": 1.4822,
"step": 463
},
{
"epoch": 0.2640113798008535,
"grad_norm": 0.5555655360221863,
"learning_rate": 0.00015130665343477358,
"loss": 1.4383,
"step": 464
},
{
"epoch": 0.26458036984352773,
"grad_norm": 0.5628737211227417,
"learning_rate": 0.0001511113993009845,
"loss": 1.6092,
"step": 465
},
{
"epoch": 0.265149359886202,
"grad_norm": 0.5401899814605713,
"learning_rate": 0.00015091588104722297,
"loss": 1.4347,
"step": 466
},
{
"epoch": 0.2657183499288762,
"grad_norm": 0.5575911998748779,
"learning_rate": 0.00015072009968383656,
"loss": 1.6627,
"step": 467
},
{
"epoch": 0.2662873399715505,
"grad_norm": 0.539851725101471,
"learning_rate": 0.00015052405622253235,
"loss": 1.5648,
"step": 468
},
{
"epoch": 0.26685633001422476,
"grad_norm": 0.5497231483459473,
"learning_rate": 0.00015032775167637193,
"loss": 1.5671,
"step": 469
},
{
"epoch": 0.267425320056899,
"grad_norm": 0.5294174551963806,
"learning_rate": 0.00015013118705976602,
"loss": 1.4519,
"step": 470
},
{
"epoch": 0.26799431009957325,
"grad_norm": 0.5508366227149963,
"learning_rate": 0.00014993436338846925,
"loss": 1.2089,
"step": 471
},
{
"epoch": 0.2685633001422475,
"grad_norm": 0.530941903591156,
"learning_rate": 0.00014973728167957498,
"loss": 1.2298,
"step": 472
},
{
"epoch": 0.26913229018492174,
"grad_norm": 0.572995126247406,
"learning_rate": 0.00014953994295150986,
"loss": 1.5102,
"step": 473
},
{
"epoch": 0.26970128022759604,
"grad_norm": 0.5313156843185425,
"learning_rate": 0.00014934234822402883,
"loss": 1.3345,
"step": 474
},
{
"epoch": 0.2702702702702703,
"grad_norm": 0.5710895657539368,
"learning_rate": 0.0001491444985182097,
"loss": 1.4461,
"step": 475
},
{
"epoch": 0.27083926031294453,
"grad_norm": 0.5655211210250854,
"learning_rate": 0.00014894639485644784,
"loss": 1.6591,
"step": 476
},
{
"epoch": 0.2714082503556188,
"grad_norm": 0.5507573485374451,
"learning_rate": 0.00014874803826245089,
"loss": 1.3442,
"step": 477
},
{
"epoch": 0.271977240398293,
"grad_norm": 0.5628292560577393,
"learning_rate": 0.00014854942976123367,
"loss": 1.6926,
"step": 478
},
{
"epoch": 0.27254623044096726,
"grad_norm": 0.5278828740119934,
"learning_rate": 0.00014835057037911268,
"loss": 1.3193,
"step": 479
},
{
"epoch": 0.27311522048364156,
"grad_norm": 0.550122857093811,
"learning_rate": 0.0001481514611437008,
"loss": 1.4085,
"step": 480
},
{
"epoch": 0.2736842105263158,
"grad_norm": 0.5174803733825684,
"learning_rate": 0.00014795210308390211,
"loss": 1.2066,
"step": 481
},
{
"epoch": 0.27425320056899005,
"grad_norm": 0.5421956777572632,
"learning_rate": 0.00014775249722990646,
"loss": 1.4261,
"step": 482
},
{
"epoch": 0.2748221906116643,
"grad_norm": 0.5158098936080933,
"learning_rate": 0.00014755264461318416,
"loss": 1.277,
"step": 483
},
{
"epoch": 0.27539118065433854,
"grad_norm": 0.5564343929290771,
"learning_rate": 0.0001473525462664808,
"loss": 1.5075,
"step": 484
},
{
"epoch": 0.2759601706970128,
"grad_norm": 0.5485411882400513,
"learning_rate": 0.0001471522032238116,
"loss": 1.4847,
"step": 485
},
{
"epoch": 0.27652916073968703,
"grad_norm": 0.5449703931808472,
"learning_rate": 0.00014695161652045641,
"loss": 1.6162,
"step": 486
},
{
"epoch": 0.27709815078236133,
"grad_norm": 0.5641449093818665,
"learning_rate": 0.00014675078719295415,
"loss": 1.3614,
"step": 487
},
{
"epoch": 0.2776671408250356,
"grad_norm": 0.5554978251457214,
"learning_rate": 0.00014654971627909747,
"loss": 1.5019,
"step": 488
},
{
"epoch": 0.2782361308677098,
"grad_norm": 0.5530039668083191,
"learning_rate": 0.0001463484048179275,
"loss": 1.5116,
"step": 489
},
{
"epoch": 0.27880512091038406,
"grad_norm": 0.5324894189834595,
"learning_rate": 0.00014614685384972835,
"loss": 1.3575,
"step": 490
},
{
"epoch": 0.2793741109530583,
"grad_norm": 0.5472353100776672,
"learning_rate": 0.0001459450644160218,
"loss": 1.5364,
"step": 491
},
{
"epoch": 0.27994310099573255,
"grad_norm": 0.5706241130828857,
"learning_rate": 0.00014574303755956195,
"loss": 1.5958,
"step": 492
},
{
"epoch": 0.28051209103840685,
"grad_norm": 0.5553603768348694,
"learning_rate": 0.00014554077432432975,
"loss": 1.5664,
"step": 493
},
{
"epoch": 0.2810810810810811,
"grad_norm": 0.542325496673584,
"learning_rate": 0.00014533827575552766,
"loss": 1.4275,
"step": 494
},
{
"epoch": 0.28165007112375534,
"grad_norm": 0.6180648803710938,
"learning_rate": 0.00014513554289957424,
"loss": 1.3948,
"step": 495
},
{
"epoch": 0.2822190611664296,
"grad_norm": 0.6009839177131653,
"learning_rate": 0.0001449325768040987,
"loss": 1.6545,
"step": 496
},
{
"epoch": 0.28278805120910383,
"grad_norm": 0.58924800157547,
"learning_rate": 0.00014472937851793557,
"loss": 1.3284,
"step": 497
},
{
"epoch": 0.2833570412517781,
"grad_norm": 0.5391841530799866,
"learning_rate": 0.0001445259490911192,
"loss": 1.3593,
"step": 498
},
{
"epoch": 0.2839260312944524,
"grad_norm": 0.562134325504303,
"learning_rate": 0.0001443222895748784,
"loss": 1.4458,
"step": 499
},
{
"epoch": 0.2844950213371266,
"grad_norm": 0.5663224458694458,
"learning_rate": 0.000144118401021631,
"loss": 1.5136,
"step": 500
},
{
"epoch": 0.28506401137980086,
"grad_norm": 0.5762481689453125,
"learning_rate": 0.00014391428448497825,
"loss": 1.5841,
"step": 501
},
{
"epoch": 0.2856330014224751,
"grad_norm": 0.5568172931671143,
"learning_rate": 0.00014370994101969967,
"loss": 1.5863,
"step": 502
},
{
"epoch": 0.28620199146514935,
"grad_norm": 0.5461404323577881,
"learning_rate": 0.00014350537168174738,
"loss": 1.4175,
"step": 503
},
{
"epoch": 0.2867709815078236,
"grad_norm": 0.5522152781486511,
"learning_rate": 0.00014330057752824068,
"loss": 1.5865,
"step": 504
},
{
"epoch": 0.28733997155049784,
"grad_norm": 0.5333879590034485,
"learning_rate": 0.00014309555961746067,
"loss": 1.4804,
"step": 505
},
{
"epoch": 0.28790896159317214,
"grad_norm": 0.5656757354736328,
"learning_rate": 0.00014289031900884463,
"loss": 1.4009,
"step": 506
},
{
"epoch": 0.2884779516358464,
"grad_norm": 0.55275559425354,
"learning_rate": 0.00014268485676298078,
"loss": 1.3477,
"step": 507
},
{
"epoch": 0.28904694167852063,
"grad_norm": 0.5528755784034729,
"learning_rate": 0.00014247917394160254,
"loss": 1.6965,
"step": 508
},
{
"epoch": 0.2896159317211949,
"grad_norm": 0.5423591732978821,
"learning_rate": 0.00014227327160758316,
"loss": 1.3725,
"step": 509
},
{
"epoch": 0.2901849217638691,
"grad_norm": 0.5610995292663574,
"learning_rate": 0.00014206715082493032,
"loss": 1.5135,
"step": 510
},
{
"epoch": 0.29075391180654336,
"grad_norm": 0.550565242767334,
"learning_rate": 0.00014186081265878047,
"loss": 1.2824,
"step": 511
},
{
"epoch": 0.29132290184921766,
"grad_norm": 0.5238208174705505,
"learning_rate": 0.00014165425817539343,
"loss": 1.3519,
"step": 512
},
{
"epoch": 0.2918918918918919,
"grad_norm": 0.5561342835426331,
"learning_rate": 0.00014144748844214684,
"loss": 1.4381,
"step": 513
},
{
"epoch": 0.29246088193456615,
"grad_norm": 0.5522477030754089,
"learning_rate": 0.0001412405045275306,
"loss": 1.5873,
"step": 514
},
{
"epoch": 0.2930298719772404,
"grad_norm": 0.5491191744804382,
"learning_rate": 0.0001410333075011415,
"loss": 1.4527,
"step": 515
},
{
"epoch": 0.29359886201991464,
"grad_norm": 0.5521331429481506,
"learning_rate": 0.00014082589843367752,
"loss": 1.6342,
"step": 516
},
{
"epoch": 0.2941678520625889,
"grad_norm": 0.5632197856903076,
"learning_rate": 0.0001406182783969324,
"loss": 1.4758,
"step": 517
},
{
"epoch": 0.29473684210526313,
"grad_norm": 0.5883782505989075,
"learning_rate": 0.00014041044846379,
"loss": 1.4963,
"step": 518
},
{
"epoch": 0.29530583214793743,
"grad_norm": 0.5621269941329956,
"learning_rate": 0.00014020240970821893,
"loss": 1.6292,
"step": 519
},
{
"epoch": 0.2958748221906117,
"grad_norm": 0.5850755572319031,
"learning_rate": 0.00013999416320526685,
"loss": 1.5853,
"step": 520
},
{
"epoch": 0.2964438122332859,
"grad_norm": 0.5468763113021851,
"learning_rate": 0.00013978571003105502,
"loss": 1.4112,
"step": 521
},
{
"epoch": 0.29701280227596016,
"grad_norm": 0.5954291820526123,
"learning_rate": 0.00013957705126277253,
"loss": 1.4785,
"step": 522
},
{
"epoch": 0.2975817923186344,
"grad_norm": 0.5438716411590576,
"learning_rate": 0.00013936818797867102,
"loss": 1.6543,
"step": 523
},
{
"epoch": 0.29815078236130865,
"grad_norm": 0.5444651246070862,
"learning_rate": 0.00013915912125805893,
"loss": 1.5327,
"step": 524
},
{
"epoch": 0.29871977240398295,
"grad_norm": 0.5755301117897034,
"learning_rate": 0.00013894985218129602,
"loss": 1.5734,
"step": 525
},
{
"epoch": 0.2992887624466572,
"grad_norm": 0.5267385244369507,
"learning_rate": 0.0001387403818297876,
"loss": 1.5172,
"step": 526
},
{
"epoch": 0.29985775248933144,
"grad_norm": 0.5721412301063538,
"learning_rate": 0.00013853071128597924,
"loss": 1.617,
"step": 527
},
{
"epoch": 0.3004267425320057,
"grad_norm": 0.547497570514679,
"learning_rate": 0.00013832084163335084,
"loss": 1.4242,
"step": 528
},
{
"epoch": 0.30099573257467993,
"grad_norm": 0.5331338047981262,
"learning_rate": 0.00013811077395641135,
"loss": 1.2921,
"step": 529
},
{
"epoch": 0.3015647226173542,
"grad_norm": 0.5468523502349854,
"learning_rate": 0.00013790050934069296,
"loss": 1.3264,
"step": 530
},
{
"epoch": 0.3021337126600285,
"grad_norm": 0.538796067237854,
"learning_rate": 0.00013769004887274547,
"loss": 1.4284,
"step": 531
},
{
"epoch": 0.3027027027027027,
"grad_norm": 0.5727618932723999,
"learning_rate": 0.0001374793936401309,
"loss": 1.509,
"step": 532
},
{
"epoch": 0.30327169274537696,
"grad_norm": 0.5127109289169312,
"learning_rate": 0.00013726854473141765,
"loss": 1.3145,
"step": 533
},
{
"epoch": 0.3038406827880512,
"grad_norm": 0.5412492156028748,
"learning_rate": 0.00013705750323617495,
"loss": 1.4385,
"step": 534
},
{
"epoch": 0.30440967283072545,
"grad_norm": 0.6073004603385925,
"learning_rate": 0.0001368462702449672,
"loss": 1.585,
"step": 535
},
{
"epoch": 0.3049786628733997,
"grad_norm": 0.6075984239578247,
"learning_rate": 0.00013663484684934836,
"loss": 1.6782,
"step": 536
},
{
"epoch": 0.30554765291607394,
"grad_norm": 0.5950874090194702,
"learning_rate": 0.0001364232341418564,
"loss": 1.6634,
"step": 537
},
{
"epoch": 0.30611664295874824,
"grad_norm": 0.5442619323730469,
"learning_rate": 0.00013621143321600746,
"loss": 1.6321,
"step": 538
},
{
"epoch": 0.3066856330014225,
"grad_norm": 0.5568251609802246,
"learning_rate": 0.00013599944516629045,
"loss": 1.3718,
"step": 539
},
{
"epoch": 0.30725462304409673,
"grad_norm": 0.5321120023727417,
"learning_rate": 0.00013578727108816104,
"loss": 1.3387,
"step": 540
},
{
"epoch": 0.307823613086771,
"grad_norm": 0.6142572164535522,
"learning_rate": 0.00013557491207803635,
"loss": 1.4013,
"step": 541
},
{
"epoch": 0.3083926031294452,
"grad_norm": 0.5809832811355591,
"learning_rate": 0.0001353623692332891,
"loss": 1.2896,
"step": 542
},
{
"epoch": 0.30896159317211946,
"grad_norm": 0.5262885689735413,
"learning_rate": 0.00013514964365224206,
"loss": 1.4799,
"step": 543
},
{
"epoch": 0.30953058321479376,
"grad_norm": 0.5609673261642456,
"learning_rate": 0.00013493673643416218,
"loss": 1.461,
"step": 544
},
{
"epoch": 0.310099573257468,
"grad_norm": 0.5489050149917603,
"learning_rate": 0.0001347236486792551,
"loss": 1.3912,
"step": 545
},
{
"epoch": 0.31066856330014225,
"grad_norm": 0.55717533826828,
"learning_rate": 0.0001345103814886593,
"loss": 1.4207,
"step": 546
},
{
"epoch": 0.3112375533428165,
"grad_norm": 0.5326306819915771,
"learning_rate": 0.00013429693596444067,
"loss": 1.563,
"step": 547
},
{
"epoch": 0.31180654338549074,
"grad_norm": 0.5783535838127136,
"learning_rate": 0.00013408331320958648,
"loss": 1.4829,
"step": 548
},
{
"epoch": 0.312375533428165,
"grad_norm": 0.5628453493118286,
"learning_rate": 0.00013386951432799987,
"loss": 1.4815,
"step": 549
},
{
"epoch": 0.3129445234708393,
"grad_norm": 0.5468215346336365,
"learning_rate": 0.00013365554042449427,
"loss": 1.3575,
"step": 550
},
{
"epoch": 0.31351351351351353,
"grad_norm": 0.5711040496826172,
"learning_rate": 0.00013344139260478732,
"loss": 1.5833,
"step": 551
},
{
"epoch": 0.3140825035561878,
"grad_norm": 0.5313072204589844,
"learning_rate": 0.00013322707197549555,
"loss": 1.5447,
"step": 552
},
{
"epoch": 0.314651493598862,
"grad_norm": 0.6006999015808105,
"learning_rate": 0.00013301257964412844,
"loss": 1.747,
"step": 553
},
{
"epoch": 0.31522048364153626,
"grad_norm": 0.6007615923881531,
"learning_rate": 0.00013279791671908268,
"loss": 1.5486,
"step": 554
},
{
"epoch": 0.3157894736842105,
"grad_norm": 0.553854763507843,
"learning_rate": 0.00013258308430963664,
"loss": 1.4473,
"step": 555
},
{
"epoch": 0.31635846372688475,
"grad_norm": 0.5920282006263733,
"learning_rate": 0.00013236808352594433,
"loss": 1.4883,
"step": 556
},
{
"epoch": 0.31692745376955905,
"grad_norm": 0.5819621682167053,
"learning_rate": 0.00013215291547903006,
"loss": 1.4925,
"step": 557
},
{
"epoch": 0.3174964438122333,
"grad_norm": 0.5728132128715515,
"learning_rate": 0.0001319375812807823,
"loss": 1.3921,
"step": 558
},
{
"epoch": 0.31806543385490754,
"grad_norm": 0.6309751868247986,
"learning_rate": 0.0001317220820439481,
"loss": 1.6893,
"step": 559
},
{
"epoch": 0.3186344238975818,
"grad_norm": 0.5545490384101868,
"learning_rate": 0.00013150641888212756,
"loss": 1.4053,
"step": 560
},
{
"epoch": 0.31920341394025603,
"grad_norm": 0.5476984977722168,
"learning_rate": 0.00013129059290976767,
"loss": 1.3499,
"step": 561
},
{
"epoch": 0.3197724039829303,
"grad_norm": 0.5255653262138367,
"learning_rate": 0.00013107460524215678,
"loss": 1.318,
"step": 562
},
{
"epoch": 0.3203413940256046,
"grad_norm": 0.649142861366272,
"learning_rate": 0.0001308584569954189,
"loss": 1.6503,
"step": 563
},
{
"epoch": 0.3209103840682788,
"grad_norm": 0.5934924483299255,
"learning_rate": 0.0001306421492865077,
"loss": 1.5933,
"step": 564
},
{
"epoch": 0.32147937411095306,
"grad_norm": 0.5277055501937866,
"learning_rate": 0.00013042568323320107,
"loss": 1.4174,
"step": 565
},
{
"epoch": 0.3220483641536273,
"grad_norm": 0.5566196441650391,
"learning_rate": 0.00013020905995409497,
"loss": 1.4713,
"step": 566
},
{
"epoch": 0.32261735419630155,
"grad_norm": 0.5719363689422607,
"learning_rate": 0.00012999228056859784,
"loss": 1.5238,
"step": 567
},
{
"epoch": 0.3231863442389758,
"grad_norm": 0.5720301866531372,
"learning_rate": 0.00012977534619692494,
"loss": 1.5374,
"step": 568
},
{
"epoch": 0.3237553342816501,
"grad_norm": 0.5727265477180481,
"learning_rate": 0.0001295582579600923,
"loss": 1.4789,
"step": 569
},
{
"epoch": 0.32432432432432434,
"grad_norm": 0.5553936958312988,
"learning_rate": 0.00012934101697991115,
"loss": 1.2535,
"step": 570
},
{
"epoch": 0.3248933143669986,
"grad_norm": 0.5490901470184326,
"learning_rate": 0.00012912362437898192,
"loss": 1.4513,
"step": 571
},
{
"epoch": 0.32546230440967283,
"grad_norm": 0.5691761374473572,
"learning_rate": 0.0001289060812806886,
"loss": 1.5947,
"step": 572
},
{
"epoch": 0.3260312944523471,
"grad_norm": 0.5883947610855103,
"learning_rate": 0.00012868838880919294,
"loss": 1.3175,
"step": 573
},
{
"epoch": 0.3266002844950213,
"grad_norm": 0.5340852737426758,
"learning_rate": 0.00012847054808942847,
"loss": 1.1903,
"step": 574
},
{
"epoch": 0.32716927453769556,
"grad_norm": 0.5509372353553772,
"learning_rate": 0.0001282525602470949,
"loss": 1.5289,
"step": 575
},
{
"epoch": 0.32773826458036986,
"grad_norm": 0.5860341191291809,
"learning_rate": 0.00012803442640865208,
"loss": 1.6618,
"step": 576
},
{
"epoch": 0.3283072546230441,
"grad_norm": 0.540502667427063,
"learning_rate": 0.00012781614770131442,
"loss": 1.5062,
"step": 577
},
{
"epoch": 0.32887624466571835,
"grad_norm": 0.5500742793083191,
"learning_rate": 0.00012759772525304492,
"loss": 1.6137,
"step": 578
},
{
"epoch": 0.3294452347083926,
"grad_norm": 0.550717830657959,
"learning_rate": 0.00012737916019254933,
"loss": 1.6204,
"step": 579
},
{
"epoch": 0.33001422475106684,
"grad_norm": 0.5424780249595642,
"learning_rate": 0.00012716045364927035,
"loss": 1.3499,
"step": 580
},
{
"epoch": 0.3305832147937411,
"grad_norm": 0.5449280142784119,
"learning_rate": 0.0001269416067533818,
"loss": 1.518,
"step": 581
},
{
"epoch": 0.3311522048364154,
"grad_norm": 0.5500824451446533,
"learning_rate": 0.0001267226206357828,
"loss": 1.6019,
"step": 582
},
{
"epoch": 0.33172119487908963,
"grad_norm": 0.5455232262611389,
"learning_rate": 0.00012650349642809197,
"loss": 1.5048,
"step": 583
},
{
"epoch": 0.3322901849217639,
"grad_norm": 0.5600374937057495,
"learning_rate": 0.00012628423526264134,
"loss": 1.4539,
"step": 584
},
{
"epoch": 0.3328591749644381,
"grad_norm": 0.5611444115638733,
"learning_rate": 0.0001260648382724708,
"loss": 1.4871,
"step": 585
},
{
"epoch": 0.33342816500711236,
"grad_norm": 0.5722511410713196,
"learning_rate": 0.00012584530659132215,
"loss": 1.4491,
"step": 586
},
{
"epoch": 0.3339971550497866,
"grad_norm": 0.5913495421409607,
"learning_rate": 0.00012562564135363313,
"loss": 1.136,
"step": 587
},
{
"epoch": 0.3345661450924609,
"grad_norm": 0.578739583492279,
"learning_rate": 0.00012540584369453162,
"loss": 1.3503,
"step": 588
},
{
"epoch": 0.33513513513513515,
"grad_norm": 0.5618348717689514,
"learning_rate": 0.00012518591474982985,
"loss": 1.5827,
"step": 589
},
{
"epoch": 0.3357041251778094,
"grad_norm": 0.5958595871925354,
"learning_rate": 0.00012496585565601853,
"loss": 1.6305,
"step": 590
},
{
"epoch": 0.33627311522048364,
"grad_norm": 0.5362867116928101,
"learning_rate": 0.00012474566755026073,
"loss": 1.416,
"step": 591
},
{
"epoch": 0.3368421052631579,
"grad_norm": 0.5598848462104797,
"learning_rate": 0.00012452535157038641,
"loss": 1.4456,
"step": 592
},
{
"epoch": 0.33741109530583213,
"grad_norm": 0.5422506332397461,
"learning_rate": 0.00012430490885488617,
"loss": 1.3472,
"step": 593
},
{
"epoch": 0.3379800853485064,
"grad_norm": 0.5901892781257629,
"learning_rate": 0.00012408434054290561,
"loss": 1.5748,
"step": 594
},
{
"epoch": 0.3385490753911807,
"grad_norm": 0.5219245553016663,
"learning_rate": 0.00012386364777423932,
"loss": 1.3369,
"step": 595
},
{
"epoch": 0.3391180654338549,
"grad_norm": 0.5885049104690552,
"learning_rate": 0.00012364283168932495,
"loss": 1.5212,
"step": 596
},
{
"epoch": 0.33968705547652916,
"grad_norm": 0.5666311383247375,
"learning_rate": 0.0001234218934292376,
"loss": 1.5041,
"step": 597
},
{
"epoch": 0.3402560455192034,
"grad_norm": 0.6065592765808105,
"learning_rate": 0.0001232008341356835,
"loss": 1.5489,
"step": 598
},
{
"epoch": 0.34082503556187765,
"grad_norm": 0.6251218914985657,
"learning_rate": 0.0001229796549509944,
"loss": 1.5043,
"step": 599
},
{
"epoch": 0.3413940256045519,
"grad_norm": 0.562077522277832,
"learning_rate": 0.00012275835701812163,
"loss": 1.547,
"step": 600
},
{
"epoch": 0.3419630156472262,
"grad_norm": 0.5375682711601257,
"learning_rate": 0.00012253694148063013,
"loss": 1.3999,
"step": 601
},
{
"epoch": 0.34253200568990044,
"grad_norm": 0.583003044128418,
"learning_rate": 0.0001223154094826925,
"loss": 1.641,
"step": 602
},
{
"epoch": 0.3431009957325747,
"grad_norm": 0.619719922542572,
"learning_rate": 0.00012209376216908328,
"loss": 1.5772,
"step": 603
},
{
"epoch": 0.34366998577524893,
"grad_norm": 0.5548385977745056,
"learning_rate": 0.00012187200068517277,
"loss": 1.4802,
"step": 604
},
{
"epoch": 0.3442389758179232,
"grad_norm": 0.5717220902442932,
"learning_rate": 0.00012165012617692143,
"loss": 1.533,
"step": 605
},
{
"epoch": 0.3448079658605974,
"grad_norm": 0.5915637016296387,
"learning_rate": 0.00012142813979087356,
"loss": 1.4618,
"step": 606
},
{
"epoch": 0.34537695590327167,
"grad_norm": 0.5780906081199646,
"learning_rate": 0.00012120604267415172,
"loss": 1.428,
"step": 607
},
{
"epoch": 0.34594594594594597,
"grad_norm": 0.6107869744300842,
"learning_rate": 0.0001209838359744507,
"loss": 1.6056,
"step": 608
},
{
"epoch": 0.3465149359886202,
"grad_norm": 0.5807276368141174,
"learning_rate": 0.0001207615208400315,
"loss": 1.4344,
"step": 609
},
{
"epoch": 0.34708392603129445,
"grad_norm": 0.5761096477508545,
"learning_rate": 0.00012053909841971547,
"loss": 1.6409,
"step": 610
},
{
"epoch": 0.3476529160739687,
"grad_norm": 0.5648180246353149,
"learning_rate": 0.00012031656986287835,
"loss": 1.5207,
"step": 611
},
{
"epoch": 0.34822190611664294,
"grad_norm": 0.5846616625785828,
"learning_rate": 0.00012009393631944439,
"loss": 1.709,
"step": 612
},
{
"epoch": 0.3487908961593172,
"grad_norm": 0.5779747366905212,
"learning_rate": 0.00011987119893988035,
"loss": 1.5626,
"step": 613
},
{
"epoch": 0.3493598862019915,
"grad_norm": 0.5634474158287048,
"learning_rate": 0.00011964835887518955,
"loss": 1.645,
"step": 614
},
{
"epoch": 0.34992887624466573,
"grad_norm": 0.5536413788795471,
"learning_rate": 0.00011942541727690593,
"loss": 1.4927,
"step": 615
},
{
"epoch": 0.35049786628734,
"grad_norm": 0.5312451720237732,
"learning_rate": 0.00011920237529708811,
"loss": 1.3328,
"step": 616
},
{
"epoch": 0.3510668563300142,
"grad_norm": 0.5960412621498108,
"learning_rate": 0.00011897923408831346,
"loss": 1.5827,
"step": 617
},
{
"epoch": 0.35163584637268847,
"grad_norm": 0.598399817943573,
"learning_rate": 0.00011875599480367215,
"loss": 1.5477,
"step": 618
},
{
"epoch": 0.3522048364153627,
"grad_norm": 0.517993688583374,
"learning_rate": 0.00011853265859676108,
"loss": 1.3741,
"step": 619
},
{
"epoch": 0.352773826458037,
"grad_norm": 0.5564917922019958,
"learning_rate": 0.00011830922662167803,
"loss": 1.3112,
"step": 620
},
{
"epoch": 0.35334281650071125,
"grad_norm": 0.5626814961433411,
"learning_rate": 0.00011808570003301566,
"loss": 1.5272,
"step": 621
},
{
"epoch": 0.3539118065433855,
"grad_norm": 0.6245387196540833,
"learning_rate": 0.00011786207998585559,
"loss": 1.433,
"step": 622
},
{
"epoch": 0.35448079658605974,
"grad_norm": 0.5711420178413391,
"learning_rate": 0.00011763836763576237,
"loss": 1.4975,
"step": 623
},
{
"epoch": 0.355049786628734,
"grad_norm": 0.5550587177276611,
"learning_rate": 0.00011741456413877749,
"loss": 1.3973,
"step": 624
},
{
"epoch": 0.35561877667140823,
"grad_norm": 0.583817183971405,
"learning_rate": 0.00011719067065141352,
"loss": 1.4535,
"step": 625
},
{
"epoch": 0.3561877667140825,
"grad_norm": 0.5912776589393616,
"learning_rate": 0.00011696668833064795,
"loss": 1.5161,
"step": 626
},
{
"epoch": 0.3567567567567568,
"grad_norm": 0.615287184715271,
"learning_rate": 0.0001167426183339174,
"loss": 1.6331,
"step": 627
},
{
"epoch": 0.357325746799431,
"grad_norm": 0.5431495308876038,
"learning_rate": 0.00011651846181911161,
"loss": 1.5279,
"step": 628
},
{
"epoch": 0.35789473684210527,
"grad_norm": 0.5510687232017517,
"learning_rate": 0.00011629421994456723,
"loss": 1.5859,
"step": 629
},
{
"epoch": 0.3584637268847795,
"grad_norm": 0.5746335983276367,
"learning_rate": 0.0001160698938690622,
"loss": 1.4053,
"step": 630
},
{
"epoch": 0.35903271692745375,
"grad_norm": 0.5783334374427795,
"learning_rate": 0.00011584548475180943,
"loss": 1.6259,
"step": 631
},
{
"epoch": 0.359601706970128,
"grad_norm": 0.5857696533203125,
"learning_rate": 0.00011562099375245108,
"loss": 1.4625,
"step": 632
},
{
"epoch": 0.3601706970128023,
"grad_norm": 0.580596387386322,
"learning_rate": 0.00011539642203105232,
"loss": 1.511,
"step": 633
},
{
"epoch": 0.36073968705547654,
"grad_norm": 0.5730242729187012,
"learning_rate": 0.00011517177074809546,
"loss": 1.6307,
"step": 634
},
{
"epoch": 0.3613086770981508,
"grad_norm": 0.567469596862793,
"learning_rate": 0.0001149470410644741,
"loss": 1.5477,
"step": 635
},
{
"epoch": 0.36187766714082503,
"grad_norm": 0.5704171061515808,
"learning_rate": 0.00011472223414148675,
"loss": 1.4716,
"step": 636
},
{
"epoch": 0.3624466571834993,
"grad_norm": 0.5398246645927429,
"learning_rate": 0.00011449735114083127,
"loss": 1.6304,
"step": 637
},
{
"epoch": 0.3630156472261735,
"grad_norm": 0.5576680898666382,
"learning_rate": 0.0001142723932245985,
"loss": 1.4775,
"step": 638
},
{
"epoch": 0.3635846372688478,
"grad_norm": 0.5728341341018677,
"learning_rate": 0.00011404736155526645,
"loss": 1.6101,
"step": 639
},
{
"epoch": 0.36415362731152207,
"grad_norm": 0.54744553565979,
"learning_rate": 0.00011382225729569436,
"loss": 1.2536,
"step": 640
},
{
"epoch": 0.3647226173541963,
"grad_norm": 0.5593659281730652,
"learning_rate": 0.00011359708160911641,
"loss": 1.4138,
"step": 641
},
{
"epoch": 0.36529160739687055,
"grad_norm": 0.5415304899215698,
"learning_rate": 0.00011337183565913599,
"loss": 1.5221,
"step": 642
},
{
"epoch": 0.3658605974395448,
"grad_norm": 0.5653886198997498,
"learning_rate": 0.00011314652060971955,
"loss": 1.5221,
"step": 643
},
{
"epoch": 0.36642958748221904,
"grad_norm": 0.5842243432998657,
"learning_rate": 0.00011292113762519061,
"loss": 1.501,
"step": 644
},
{
"epoch": 0.3669985775248933,
"grad_norm": 0.5919954180717468,
"learning_rate": 0.00011269568787022376,
"loss": 1.5444,
"step": 645
},
{
"epoch": 0.3675675675675676,
"grad_norm": 0.5867476463317871,
"learning_rate": 0.00011247017250983865,
"loss": 1.4897,
"step": 646
},
{
"epoch": 0.36813655761024183,
"grad_norm": 0.5661168098449707,
"learning_rate": 0.00011224459270939384,
"loss": 1.3373,
"step": 647
},
{
"epoch": 0.3687055476529161,
"grad_norm": 0.5516852736473083,
"learning_rate": 0.00011201894963458106,
"loss": 1.6209,
"step": 648
},
{
"epoch": 0.3692745376955903,
"grad_norm": 0.615533709526062,
"learning_rate": 0.00011179324445141883,
"loss": 1.369,
"step": 649
},
{
"epoch": 0.36984352773826457,
"grad_norm": 0.5543255805969238,
"learning_rate": 0.00011156747832624679,
"loss": 1.3172,
"step": 650
},
{
"epoch": 0.3704125177809388,
"grad_norm": 0.5759336352348328,
"learning_rate": 0.00011134165242571938,
"loss": 1.5896,
"step": 651
},
{
"epoch": 0.3709815078236131,
"grad_norm": 0.5587149858474731,
"learning_rate": 0.00011111576791679994,
"loss": 1.5963,
"step": 652
},
{
"epoch": 0.37155049786628735,
"grad_norm": 0.5666396617889404,
"learning_rate": 0.00011088982596675475,
"loss": 1.5253,
"step": 653
},
{
"epoch": 0.3721194879089616,
"grad_norm": 0.5888431668281555,
"learning_rate": 0.00011066382774314683,
"loss": 1.4419,
"step": 654
},
{
"epoch": 0.37268847795163584,
"grad_norm": 0.5519063472747803,
"learning_rate": 0.00011043777441383006,
"loss": 1.5396,
"step": 655
},
{
"epoch": 0.3732574679943101,
"grad_norm": 0.5812383890151978,
"learning_rate": 0.00011021166714694297,
"loss": 1.2045,
"step": 656
},
{
"epoch": 0.37382645803698433,
"grad_norm": 0.5881744623184204,
"learning_rate": 0.000109985507110903,
"loss": 1.4078,
"step": 657
},
{
"epoch": 0.37439544807965863,
"grad_norm": 0.5681930184364319,
"learning_rate": 0.00010975929547440016,
"loss": 1.4739,
"step": 658
},
{
"epoch": 0.3749644381223329,
"grad_norm": 0.5596330165863037,
"learning_rate": 0.0001095330334063911,
"loss": 1.4085,
"step": 659
},
{
"epoch": 0.3755334281650071,
"grad_norm": 0.5785601139068604,
"learning_rate": 0.00010930672207609306,
"loss": 1.4087,
"step": 660
},
{
"epoch": 0.37610241820768137,
"grad_norm": 0.5467891097068787,
"learning_rate": 0.00010908036265297794,
"loss": 1.6924,
"step": 661
},
{
"epoch": 0.3766714082503556,
"grad_norm": 0.5449764132499695,
"learning_rate": 0.00010885395630676607,
"loss": 1.5254,
"step": 662
},
{
"epoch": 0.37724039829302985,
"grad_norm": 0.5570394396781921,
"learning_rate": 0.00010862750420742031,
"loss": 1.4218,
"step": 663
},
{
"epoch": 0.3778093883357041,
"grad_norm": 0.5946861505508423,
"learning_rate": 0.00010840100752513996,
"loss": 1.6474,
"step": 664
},
{
"epoch": 0.3783783783783784,
"grad_norm": 0.545051097869873,
"learning_rate": 0.00010817446743035462,
"loss": 1.459,
"step": 665
},
{
"epoch": 0.37894736842105264,
"grad_norm": 0.5713635683059692,
"learning_rate": 0.00010794788509371829,
"loss": 1.44,
"step": 666
},
{
"epoch": 0.3795163584637269,
"grad_norm": 0.5865978598594666,
"learning_rate": 0.00010772126168610325,
"loss": 1.5968,
"step": 667
},
{
"epoch": 0.38008534850640113,
"grad_norm": 0.5625496506690979,
"learning_rate": 0.00010749459837859408,
"loss": 1.4018,
"step": 668
},
{
"epoch": 0.3806543385490754,
"grad_norm": 0.5960560441017151,
"learning_rate": 0.00010726789634248137,
"loss": 1.5808,
"step": 669
},
{
"epoch": 0.3812233285917496,
"grad_norm": 0.6137279868125916,
"learning_rate": 0.00010704115674925604,
"loss": 1.212,
"step": 670
},
{
"epoch": 0.3817923186344239,
"grad_norm": 0.5478764772415161,
"learning_rate": 0.00010681438077060291,
"loss": 1.4701,
"step": 671
},
{
"epoch": 0.38236130867709817,
"grad_norm": 0.6135146021842957,
"learning_rate": 0.000106587569578395,
"loss": 1.5428,
"step": 672
},
{
"epoch": 0.3829302987197724,
"grad_norm": 0.5707561373710632,
"learning_rate": 0.00010636072434468714,
"loss": 1.5299,
"step": 673
},
{
"epoch": 0.38349928876244666,
"grad_norm": 0.529769778251648,
"learning_rate": 0.00010613384624171016,
"loss": 1.4161,
"step": 674
},
{
"epoch": 0.3840682788051209,
"grad_norm": 0.5672623515129089,
"learning_rate": 0.00010590693644186474,
"loss": 1.5084,
"step": 675
},
{
"epoch": 0.38463726884779514,
"grad_norm": 0.5277720093727112,
"learning_rate": 0.00010567999611771528,
"loss": 1.2255,
"step": 676
},
{
"epoch": 0.38520625889046944,
"grad_norm": 0.5478918552398682,
"learning_rate": 0.00010545302644198405,
"loss": 1.3878,
"step": 677
},
{
"epoch": 0.3857752489331437,
"grad_norm": 0.5412498712539673,
"learning_rate": 0.00010522602858754487,
"loss": 1.5586,
"step": 678
},
{
"epoch": 0.38634423897581793,
"grad_norm": 0.5770754814147949,
"learning_rate": 0.00010499900372741718,
"loss": 1.3127,
"step": 679
},
{
"epoch": 0.3869132290184922,
"grad_norm": 0.5917402505874634,
"learning_rate": 0.00010477195303476011,
"loss": 1.3799,
"step": 680
},
{
"epoch": 0.3874822190611664,
"grad_norm": 0.5400240421295166,
"learning_rate": 0.00010454487768286612,
"loss": 1.2999,
"step": 681
},
{
"epoch": 0.38805120910384067,
"grad_norm": 0.5468504428863525,
"learning_rate": 0.00010431777884515514,
"loss": 1.3114,
"step": 682
},
{
"epoch": 0.3886201991465149,
"grad_norm": 0.5608039498329163,
"learning_rate": 0.00010409065769516856,
"loss": 1.3888,
"step": 683
},
{
"epoch": 0.3891891891891892,
"grad_norm": 0.5961167216300964,
"learning_rate": 0.00010386351540656292,
"loss": 1.5431,
"step": 684
},
{
"epoch": 0.38975817923186346,
"grad_norm": 0.5718376040458679,
"learning_rate": 0.00010363635315310414,
"loss": 1.521,
"step": 685
},
{
"epoch": 0.3903271692745377,
"grad_norm": 0.5798651576042175,
"learning_rate": 0.00010340917210866118,
"loss": 1.519,
"step": 686
},
{
"epoch": 0.39089615931721194,
"grad_norm": 0.5611982941627502,
"learning_rate": 0.00010318197344720018,
"loss": 1.499,
"step": 687
},
{
"epoch": 0.3914651493598862,
"grad_norm": 0.571074366569519,
"learning_rate": 0.00010295475834277831,
"loss": 1.4738,
"step": 688
},
{
"epoch": 0.39203413940256043,
"grad_norm": 0.5722329020500183,
"learning_rate": 0.00010272752796953766,
"loss": 1.6584,
"step": 689
},
{
"epoch": 0.39260312944523473,
"grad_norm": 0.5674881935119629,
"learning_rate": 0.00010250028350169931,
"loss": 1.5507,
"step": 690
},
{
"epoch": 0.393172119487909,
"grad_norm": 0.5546680688858032,
"learning_rate": 0.00010227302611355712,
"loss": 1.297,
"step": 691
},
{
"epoch": 0.3937411095305832,
"grad_norm": 0.5614904165267944,
"learning_rate": 0.00010204575697947168,
"loss": 1.4416,
"step": 692
},
{
"epoch": 0.39431009957325747,
"grad_norm": 0.5829195380210876,
"learning_rate": 0.00010181847727386433,
"loss": 1.5031,
"step": 693
},
{
"epoch": 0.3948790896159317,
"grad_norm": 0.5744046568870544,
"learning_rate": 0.00010159118817121105,
"loss": 1.4576,
"step": 694
},
{
"epoch": 0.39544807965860596,
"grad_norm": 0.572902262210846,
"learning_rate": 0.00010136389084603637,
"loss": 1.5078,
"step": 695
},
{
"epoch": 0.3960170697012802,
"grad_norm": 0.5696277618408203,
"learning_rate": 0.00010113658647290723,
"loss": 1.4636,
"step": 696
},
{
"epoch": 0.3960170697012802,
"eval_loss": 1.4791862964630127,
"eval_runtime": 15.3322,
"eval_samples_per_second": 48.265,
"eval_steps_per_second": 24.132,
"step": 696
},
{
"epoch": 0.3965860597439545,
"grad_norm": 0.5474138855934143,
"learning_rate": 0.0001009092762264271,
"loss": 1.4683,
"step": 697
},
{
"epoch": 0.39715504978662874,
"grad_norm": 0.6160016059875488,
"learning_rate": 0.00010068196128122975,
"loss": 1.6705,
"step": 698
},
{
"epoch": 0.397724039829303,
"grad_norm": 0.5745415687561035,
"learning_rate": 0.00010045464281197327,
"loss": 1.5104,
"step": 699
},
{
"epoch": 0.39829302987197723,
"grad_norm": 0.5802525281906128,
"learning_rate": 0.0001002273219933339,
"loss": 1.4029,
"step": 700
},
{
"epoch": 0.3988620199146515,
"grad_norm": 0.5592519044876099,
"learning_rate": 0.0001,
"loss": 1.6325,
"step": 701
},
{
"epoch": 0.3994310099573257,
"grad_norm": 0.6051873564720154,
"learning_rate": 9.977267800666613e-05,
"loss": 1.688,
"step": 702
},
{
"epoch": 0.4,
"grad_norm": 0.5836036205291748,
"learning_rate": 9.954535718802675e-05,
"loss": 1.3107,
"step": 703
},
{
"epoch": 0.40056899004267427,
"grad_norm": 0.5733322501182556,
"learning_rate": 9.931803871877028e-05,
"loss": 1.7469,
"step": 704
},
{
"epoch": 0.4011379800853485,
"grad_norm": 0.5718969106674194,
"learning_rate": 9.909072377357294e-05,
"loss": 1.3822,
"step": 705
},
{
"epoch": 0.40170697012802276,
"grad_norm": 0.5877561569213867,
"learning_rate": 9.88634135270928e-05,
"loss": 1.6344,
"step": 706
},
{
"epoch": 0.402275960170697,
"grad_norm": 0.5636436939239502,
"learning_rate": 9.863610915396365e-05,
"loss": 1.5552,
"step": 707
},
{
"epoch": 0.40284495021337124,
"grad_norm": 0.5809296369552612,
"learning_rate": 9.840881182878895e-05,
"loss": 1.3633,
"step": 708
},
{
"epoch": 0.40341394025604554,
"grad_norm": 0.5500168204307556,
"learning_rate": 9.81815227261357e-05,
"loss": 1.4063,
"step": 709
},
{
"epoch": 0.4039829302987198,
"grad_norm": 0.5806904435157776,
"learning_rate": 9.795424302052836e-05,
"loss": 1.5629,
"step": 710
},
{
"epoch": 0.40455192034139403,
"grad_norm": 0.5868257880210876,
"learning_rate": 9.77269738864429e-05,
"loss": 1.3655,
"step": 711
},
{
"epoch": 0.4051209103840683,
"grad_norm": 0.5417432188987732,
"learning_rate": 9.749971649830071e-05,
"loss": 1.4914,
"step": 712
},
{
"epoch": 0.4056899004267425,
"grad_norm": 0.6012546420097351,
"learning_rate": 9.727247203046234e-05,
"loss": 1.5365,
"step": 713
},
{
"epoch": 0.40625889046941677,
"grad_norm": 0.5691578388214111,
"learning_rate": 9.704524165722174e-05,
"loss": 1.5959,
"step": 714
},
{
"epoch": 0.406827880512091,
"grad_norm": 0.5487850904464722,
"learning_rate": 9.681802655279986e-05,
"loss": 1.4469,
"step": 715
},
{
"epoch": 0.4073968705547653,
"grad_norm": 0.6280918121337891,
"learning_rate": 9.659082789133884e-05,
"loss": 1.338,
"step": 716
},
{
"epoch": 0.40796586059743956,
"grad_norm": 0.5909377932548523,
"learning_rate": 9.63636468468959e-05,
"loss": 1.6272,
"step": 717
},
{
"epoch": 0.4085348506401138,
"grad_norm": 0.6044595837593079,
"learning_rate": 9.613648459343708e-05,
"loss": 1.5717,
"step": 718
},
{
"epoch": 0.40910384068278804,
"grad_norm": 0.5833640098571777,
"learning_rate": 9.590934230483149e-05,
"loss": 1.4213,
"step": 719
},
{
"epoch": 0.4096728307254623,
"grad_norm": 0.6057854890823364,
"learning_rate": 9.568222115484488e-05,
"loss": 1.4861,
"step": 720
},
{
"epoch": 0.41024182076813653,
"grad_norm": 0.5813032984733582,
"learning_rate": 9.54551223171339e-05,
"loss": 1.5329,
"step": 721
},
{
"epoch": 0.41081081081081083,
"grad_norm": 0.5498741865158081,
"learning_rate": 9.522804696523991e-05,
"loss": 1.4457,
"step": 722
},
{
"epoch": 0.4113798008534851,
"grad_norm": 0.5357645750045776,
"learning_rate": 9.500099627258282e-05,
"loss": 1.2792,
"step": 723
},
{
"epoch": 0.4119487908961593,
"grad_norm": 0.5478993654251099,
"learning_rate": 9.477397141245519e-05,
"loss": 1.5071,
"step": 724
},
{
"epoch": 0.41251778093883357,
"grad_norm": 0.5776642560958862,
"learning_rate": 9.454697355801598e-05,
"loss": 1.3664,
"step": 725
},
{
"epoch": 0.4130867709815078,
"grad_norm": 0.6283994913101196,
"learning_rate": 9.432000388228473e-05,
"loss": 1.3994,
"step": 726
},
{
"epoch": 0.41365576102418206,
"grad_norm": 0.6153956651687622,
"learning_rate": 9.409306355813529e-05,
"loss": 1.2524,
"step": 727
},
{
"epoch": 0.41422475106685636,
"grad_norm": 0.5952728986740112,
"learning_rate": 9.386615375828984e-05,
"loss": 1.5941,
"step": 728
},
{
"epoch": 0.4147937411095306,
"grad_norm": 0.5799689292907715,
"learning_rate": 9.36392756553129e-05,
"loss": 1.3113,
"step": 729
},
{
"epoch": 0.41536273115220484,
"grad_norm": 0.5933107733726501,
"learning_rate": 9.341243042160503e-05,
"loss": 1.6378,
"step": 730
},
{
"epoch": 0.4159317211948791,
"grad_norm": 0.5808780789375305,
"learning_rate": 9.318561922939711e-05,
"loss": 1.663,
"step": 731
},
{
"epoch": 0.41650071123755333,
"grad_norm": 0.5661304593086243,
"learning_rate": 9.295884325074398e-05,
"loss": 1.4145,
"step": 732
},
{
"epoch": 0.4170697012802276,
"grad_norm": 0.577038049697876,
"learning_rate": 9.273210365751862e-05,
"loss": 1.4288,
"step": 733
},
{
"epoch": 0.4176386913229018,
"grad_norm": 0.5904839038848877,
"learning_rate": 9.250540162140597e-05,
"loss": 1.5257,
"step": 734
},
{
"epoch": 0.4182076813655761,
"grad_norm": 0.5645294785499573,
"learning_rate": 9.227873831389677e-05,
"loss": 1.4073,
"step": 735
},
{
"epoch": 0.41877667140825037,
"grad_norm": 0.5541549921035767,
"learning_rate": 9.205211490628173e-05,
"loss": 1.3965,
"step": 736
},
{
"epoch": 0.4193456614509246,
"grad_norm": 0.6137387752532959,
"learning_rate": 9.18255325696454e-05,
"loss": 1.3849,
"step": 737
},
{
"epoch": 0.41991465149359886,
"grad_norm": 0.588316798210144,
"learning_rate": 9.159899247486004e-05,
"loss": 1.4989,
"step": 738
},
{
"epoch": 0.4204836415362731,
"grad_norm": 0.567848265171051,
"learning_rate": 9.13724957925797e-05,
"loss": 1.4989,
"step": 739
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.554695188999176,
"learning_rate": 9.114604369323395e-05,
"loss": 1.4509,
"step": 740
},
{
"epoch": 0.42162162162162165,
"grad_norm": 0.5507339835166931,
"learning_rate": 9.091963734702208e-05,
"loss": 1.364,
"step": 741
},
{
"epoch": 0.4221906116642959,
"grad_norm": 0.569786012172699,
"learning_rate": 9.069327792390695e-05,
"loss": 1.5775,
"step": 742
},
{
"epoch": 0.42275960170697013,
"grad_norm": 0.563234806060791,
"learning_rate": 9.046696659360894e-05,
"loss": 1.4557,
"step": 743
},
{
"epoch": 0.4233285917496444,
"grad_norm": 0.5537723302841187,
"learning_rate": 9.024070452559986e-05,
"loss": 1.443,
"step": 744
},
{
"epoch": 0.4238975817923186,
"grad_norm": 0.5786699056625366,
"learning_rate": 9.001449288909702e-05,
"loss": 1.2683,
"step": 745
},
{
"epoch": 0.42446657183499287,
"grad_norm": 0.55182945728302,
"learning_rate": 8.978833285305705e-05,
"loss": 1.4565,
"step": 746
},
{
"epoch": 0.42503556187766717,
"grad_norm": 0.5818150043487549,
"learning_rate": 8.956222558616998e-05,
"loss": 1.6502,
"step": 747
},
{
"epoch": 0.4256045519203414,
"grad_norm": 0.6044638752937317,
"learning_rate": 8.933617225685319e-05,
"loss": 1.4631,
"step": 748
},
{
"epoch": 0.42617354196301566,
"grad_norm": 0.568188488483429,
"learning_rate": 8.91101740332453e-05,
"loss": 1.5997,
"step": 749
},
{
"epoch": 0.4267425320056899,
"grad_norm": 0.5530648231506348,
"learning_rate": 8.888423208320008e-05,
"loss": 1.174,
"step": 750
},
{
"epoch": 0.42731152204836415,
"grad_norm": 0.5782289505004883,
"learning_rate": 8.865834757428064e-05,
"loss": 1.5198,
"step": 751
},
{
"epoch": 0.4278805120910384,
"grad_norm": 0.5685307383537292,
"learning_rate": 8.843252167375322e-05,
"loss": 1.5545,
"step": 752
},
{
"epoch": 0.42844950213371263,
"grad_norm": 0.5832937359809875,
"learning_rate": 8.820675554858115e-05,
"loss": 1.5776,
"step": 753
},
{
"epoch": 0.42901849217638693,
"grad_norm": 0.6279184818267822,
"learning_rate": 8.7981050365419e-05,
"loss": 1.5975,
"step": 754
},
{
"epoch": 0.4295874822190612,
"grad_norm": 0.5440697073936462,
"learning_rate": 8.775540729060618e-05,
"loss": 1.3772,
"step": 755
},
{
"epoch": 0.4301564722617354,
"grad_norm": 0.6341460347175598,
"learning_rate": 8.752982749016139e-05,
"loss": 1.573,
"step": 756
},
{
"epoch": 0.43072546230440967,
"grad_norm": 0.5840321779251099,
"learning_rate": 8.730431212977625e-05,
"loss": 1.5051,
"step": 757
},
{
"epoch": 0.4312944523470839,
"grad_norm": 0.5965592265129089,
"learning_rate": 8.70788623748094e-05,
"loss": 1.5323,
"step": 758
},
{
"epoch": 0.43186344238975816,
"grad_norm": 0.5905702710151672,
"learning_rate": 8.68534793902805e-05,
"loss": 1.4051,
"step": 759
},
{
"epoch": 0.43243243243243246,
"grad_norm": 0.5640906691551208,
"learning_rate": 8.662816434086404e-05,
"loss": 1.6614,
"step": 760
},
{
"epoch": 0.4330014224751067,
"grad_norm": 0.5574825406074524,
"learning_rate": 8.64029183908836e-05,
"loss": 1.3464,
"step": 761
},
{
"epoch": 0.43357041251778095,
"grad_norm": 0.5866842865943909,
"learning_rate": 8.617774270430566e-05,
"loss": 1.4531,
"step": 762
},
{
"epoch": 0.4341394025604552,
"grad_norm": 0.6260978579521179,
"learning_rate": 8.595263844473353e-05,
"loss": 1.4005,
"step": 763
},
{
"epoch": 0.43470839260312943,
"grad_norm": 0.5732872486114502,
"learning_rate": 8.572760677540154e-05,
"loss": 1.366,
"step": 764
},
{
"epoch": 0.4352773826458037,
"grad_norm": 0.5682139992713928,
"learning_rate": 8.550264885916877e-05,
"loss": 1.359,
"step": 765
},
{
"epoch": 0.435846372688478,
"grad_norm": 0.5898922085762024,
"learning_rate": 8.527776585851328e-05,
"loss": 1.5197,
"step": 766
},
{
"epoch": 0.4364153627311522,
"grad_norm": 0.5902604460716248,
"learning_rate": 8.505295893552594e-05,
"loss": 1.4844,
"step": 767
},
{
"epoch": 0.43698435277382647,
"grad_norm": 0.6057772040367126,
"learning_rate": 8.482822925190452e-05,
"loss": 1.5739,
"step": 768
},
{
"epoch": 0.4375533428165007,
"grad_norm": 0.5546793341636658,
"learning_rate": 8.460357796894773e-05,
"loss": 1.4748,
"step": 769
},
{
"epoch": 0.43812233285917496,
"grad_norm": 0.5493602156639099,
"learning_rate": 8.437900624754895e-05,
"loss": 1.3922,
"step": 770
},
{
"epoch": 0.4386913229018492,
"grad_norm": 0.5499581098556519,
"learning_rate": 8.415451524819058e-05,
"loss": 1.574,
"step": 771
},
{
"epoch": 0.43926031294452345,
"grad_norm": 0.5515440702438354,
"learning_rate": 8.393010613093781e-05,
"loss": 1.3672,
"step": 772
},
{
"epoch": 0.43982930298719775,
"grad_norm": 0.5613058805465698,
"learning_rate": 8.370578005543278e-05,
"loss": 1.3815,
"step": 773
},
{
"epoch": 0.440398293029872,
"grad_norm": 0.5643707513809204,
"learning_rate": 8.348153818088844e-05,
"loss": 1.5947,
"step": 774
},
{
"epoch": 0.44096728307254623,
"grad_norm": 0.6310828924179077,
"learning_rate": 8.325738166608263e-05,
"loss": 1.5413,
"step": 775
},
{
"epoch": 0.4415362731152205,
"grad_norm": 0.6655511856079102,
"learning_rate": 8.303331166935209e-05,
"loss": 1.5198,
"step": 776
},
{
"epoch": 0.4421052631578947,
"grad_norm": 0.5539633631706238,
"learning_rate": 8.280932934858652e-05,
"loss": 1.4308,
"step": 777
},
{
"epoch": 0.44267425320056897,
"grad_norm": 0.5974248647689819,
"learning_rate": 8.25854358612225e-05,
"loss": 1.537,
"step": 778
},
{
"epoch": 0.44324324324324327,
"grad_norm": 0.5987525582313538,
"learning_rate": 8.236163236423767e-05,
"loss": 1.5318,
"step": 779
},
{
"epoch": 0.4438122332859175,
"grad_norm": 0.5623188018798828,
"learning_rate": 8.213792001414445e-05,
"loss": 1.6016,
"step": 780
},
{
"epoch": 0.44438122332859176,
"grad_norm": 0.5642153024673462,
"learning_rate": 8.191429996698436e-05,
"loss": 1.4452,
"step": 781
},
{
"epoch": 0.444950213371266,
"grad_norm": 0.6042040586471558,
"learning_rate": 8.1690773378322e-05,
"loss": 1.6325,
"step": 782
},
{
"epoch": 0.44551920341394025,
"grad_norm": 0.5777531862258911,
"learning_rate": 8.146734140323896e-05,
"loss": 1.6388,
"step": 783
},
{
"epoch": 0.4460881934566145,
"grad_norm": 0.5600481629371643,
"learning_rate": 8.124400519632788e-05,
"loss": 1.5077,
"step": 784
},
{
"epoch": 0.4466571834992888,
"grad_norm": 0.5644223690032959,
"learning_rate": 8.102076591168655e-05,
"loss": 1.4056,
"step": 785
},
{
"epoch": 0.44722617354196303,
"grad_norm": 0.6023853421211243,
"learning_rate": 8.079762470291191e-05,
"loss": 1.713,
"step": 786
},
{
"epoch": 0.4477951635846373,
"grad_norm": 0.5626102685928345,
"learning_rate": 8.05745827230941e-05,
"loss": 1.5125,
"step": 787
},
{
"epoch": 0.4483641536273115,
"grad_norm": 0.5824998617172241,
"learning_rate": 8.035164112481048e-05,
"loss": 1.4695,
"step": 788
},
{
"epoch": 0.44893314366998577,
"grad_norm": 0.5714951157569885,
"learning_rate": 8.01288010601197e-05,
"loss": 1.4452,
"step": 789
},
{
"epoch": 0.44950213371266,
"grad_norm": 0.5934897065162659,
"learning_rate": 7.990606368055564e-05,
"loss": 1.5389,
"step": 790
},
{
"epoch": 0.45007112375533426,
"grad_norm": 0.5794687867164612,
"learning_rate": 7.968343013712167e-05,
"loss": 1.4127,
"step": 791
},
{
"epoch": 0.45064011379800856,
"grad_norm": 0.5628656148910522,
"learning_rate": 7.946090158028455e-05,
"loss": 1.4798,
"step": 792
},
{
"epoch": 0.4512091038406828,
"grad_norm": 0.5794563293457031,
"learning_rate": 7.923847915996851e-05,
"loss": 1.5584,
"step": 793
},
{
"epoch": 0.45177809388335705,
"grad_norm": 0.5685121417045593,
"learning_rate": 7.901616402554933e-05,
"loss": 1.51,
"step": 794
},
{
"epoch": 0.4523470839260313,
"grad_norm": 0.568209171295166,
"learning_rate": 7.87939573258483e-05,
"loss": 1.5588,
"step": 795
},
{
"epoch": 0.45291607396870553,
"grad_norm": 0.5683977603912354,
"learning_rate": 7.857186020912647e-05,
"loss": 1.4482,
"step": 796
},
{
"epoch": 0.4534850640113798,
"grad_norm": 0.5802903771400452,
"learning_rate": 7.834987382307861e-05,
"loss": 1.5827,
"step": 797
},
{
"epoch": 0.4540540540540541,
"grad_norm": 0.5780924558639526,
"learning_rate": 7.812799931482721e-05,
"loss": 1.4595,
"step": 798
},
{
"epoch": 0.4546230440967283,
"grad_norm": 0.5929847359657288,
"learning_rate": 7.790623783091677e-05,
"loss": 1.5512,
"step": 799
},
{
"epoch": 0.45519203413940257,
"grad_norm": 0.5519236326217651,
"learning_rate": 7.768459051730752e-05,
"loss": 1.4239,
"step": 800
},
{
"epoch": 0.4557610241820768,
"grad_norm": 0.5426004528999329,
"learning_rate": 7.74630585193699e-05,
"loss": 1.3005,
"step": 801
},
{
"epoch": 0.45633001422475106,
"grad_norm": 0.6065943241119385,
"learning_rate": 7.724164298187838e-05,
"loss": 1.3966,
"step": 802
},
{
"epoch": 0.4568990042674253,
"grad_norm": 0.5971605777740479,
"learning_rate": 7.70203450490056e-05,
"loss": 1.5944,
"step": 803
},
{
"epoch": 0.45746799431009955,
"grad_norm": 0.5548596978187561,
"learning_rate": 7.679916586431654e-05,
"loss": 1.4323,
"step": 804
},
{
"epoch": 0.45803698435277385,
"grad_norm": 0.5478107929229736,
"learning_rate": 7.657810657076243e-05,
"loss": 1.3819,
"step": 805
},
{
"epoch": 0.4586059743954481,
"grad_norm": 0.5837447047233582,
"learning_rate": 7.635716831067505e-05,
"loss": 1.3941,
"step": 806
},
{
"epoch": 0.45917496443812233,
"grad_norm": 0.5920546650886536,
"learning_rate": 7.613635222576072e-05,
"loss": 1.5395,
"step": 807
},
{
"epoch": 0.4597439544807966,
"grad_norm": 0.6047683358192444,
"learning_rate": 7.59156594570944e-05,
"loss": 1.4169,
"step": 808
},
{
"epoch": 0.4603129445234708,
"grad_norm": 0.5774646401405334,
"learning_rate": 7.569509114511386e-05,
"loss": 1.5108,
"step": 809
},
{
"epoch": 0.46088193456614507,
"grad_norm": 0.5855366587638855,
"learning_rate": 7.547464842961362e-05,
"loss": 1.6545,
"step": 810
},
{
"epoch": 0.46145092460881937,
"grad_norm": 0.5752539038658142,
"learning_rate": 7.52543324497393e-05,
"loss": 1.6431,
"step": 811
},
{
"epoch": 0.4620199146514936,
"grad_norm": 0.5689989328384399,
"learning_rate": 7.503414434398151e-05,
"loss": 1.2883,
"step": 812
},
{
"epoch": 0.46258890469416786,
"grad_norm": 0.6341901421546936,
"learning_rate": 7.481408525017013e-05,
"loss": 1.4223,
"step": 813
},
{
"epoch": 0.4631578947368421,
"grad_norm": 0.6005092263221741,
"learning_rate": 7.459415630546842e-05,
"loss": 1.5522,
"step": 814
},
{
"epoch": 0.46372688477951635,
"grad_norm": 0.6249240636825562,
"learning_rate": 7.437435864636691e-05,
"loss": 1.5459,
"step": 815
},
{
"epoch": 0.4642958748221906,
"grad_norm": 0.5745651125907898,
"learning_rate": 7.415469340867787e-05,
"loss": 1.6287,
"step": 816
},
{
"epoch": 0.4648648648648649,
"grad_norm": 0.5915263891220093,
"learning_rate": 7.393516172752919e-05,
"loss": 1.4738,
"step": 817
},
{
"epoch": 0.46543385490753914,
"grad_norm": 0.5895527601242065,
"learning_rate": 7.371576473735867e-05,
"loss": 1.6939,
"step": 818
},
{
"epoch": 0.4660028449502134,
"grad_norm": 0.5770692825317383,
"learning_rate": 7.349650357190807e-05,
"loss": 1.4264,
"step": 819
},
{
"epoch": 0.4665718349928876,
"grad_norm": 0.6085241436958313,
"learning_rate": 7.327737936421721e-05,
"loss": 1.5019,
"step": 820
},
{
"epoch": 0.46714082503556187,
"grad_norm": 0.5652032494544983,
"learning_rate": 7.305839324661823e-05,
"loss": 1.3324,
"step": 821
},
{
"epoch": 0.4677098150782361,
"grad_norm": 0.5609267950057983,
"learning_rate": 7.283954635072968e-05,
"loss": 1.3902,
"step": 822
},
{
"epoch": 0.46827880512091036,
"grad_norm": 0.5592348575592041,
"learning_rate": 7.262083980745069e-05,
"loss": 1.4362,
"step": 823
},
{
"epoch": 0.46884779516358466,
"grad_norm": 0.5790618658065796,
"learning_rate": 7.240227474695509e-05,
"loss": 1.4753,
"step": 824
},
{
"epoch": 0.4694167852062589,
"grad_norm": 0.5804809927940369,
"learning_rate": 7.218385229868559e-05,
"loss": 1.2719,
"step": 825
},
{
"epoch": 0.46998577524893315,
"grad_norm": 0.5487887859344482,
"learning_rate": 7.196557359134794e-05,
"loss": 1.3212,
"step": 826
},
{
"epoch": 0.4705547652916074,
"grad_norm": 0.5842025876045227,
"learning_rate": 7.174743975290513e-05,
"loss": 1.5622,
"step": 827
},
{
"epoch": 0.47112375533428164,
"grad_norm": 0.580644428730011,
"learning_rate": 7.152945191057154e-05,
"loss": 1.4567,
"step": 828
},
{
"epoch": 0.4716927453769559,
"grad_norm": 0.5735095739364624,
"learning_rate": 7.131161119080712e-05,
"loss": 1.4547,
"step": 829
},
{
"epoch": 0.4722617354196302,
"grad_norm": 0.5592243671417236,
"learning_rate": 7.109391871931142e-05,
"loss": 1.3144,
"step": 830
},
{
"epoch": 0.4728307254623044,
"grad_norm": 0.581495463848114,
"learning_rate": 7.087637562101813e-05,
"loss": 1.5145,
"step": 831
},
{
"epoch": 0.47339971550497867,
"grad_norm": 0.5653107762336731,
"learning_rate": 7.065898302008886e-05,
"loss": 1.388,
"step": 832
},
{
"epoch": 0.4739687055476529,
"grad_norm": 0.5776169300079346,
"learning_rate": 7.04417420399077e-05,
"loss": 1.5059,
"step": 833
},
{
"epoch": 0.47453769559032716,
"grad_norm": 0.556419312953949,
"learning_rate": 7.02246538030751e-05,
"loss": 1.3933,
"step": 834
},
{
"epoch": 0.4751066856330014,
"grad_norm": 0.5605750679969788,
"learning_rate": 7.000771943140218e-05,
"loss": 1.4677,
"step": 835
},
{
"epoch": 0.4756756756756757,
"grad_norm": 0.5609278678894043,
"learning_rate": 6.979094004590507e-05,
"loss": 1.4526,
"step": 836
},
{
"epoch": 0.47624466571834995,
"grad_norm": 0.5990177392959595,
"learning_rate": 6.957431676679896e-05,
"loss": 1.6215,
"step": 837
},
{
"epoch": 0.4768136557610242,
"grad_norm": 0.5737520456314087,
"learning_rate": 6.935785071349228e-05,
"loss": 1.3985,
"step": 838
},
{
"epoch": 0.47738264580369844,
"grad_norm": 0.5521170496940613,
"learning_rate": 6.914154300458115e-05,
"loss": 1.6527,
"step": 839
},
{
"epoch": 0.4779516358463727,
"grad_norm": 0.5809024572372437,
"learning_rate": 6.892539475784326e-05,
"loss": 1.5697,
"step": 840
},
{
"epoch": 0.4785206258890469,
"grad_norm": 0.6158897876739502,
"learning_rate": 6.870940709023237e-05,
"loss": 1.48,
"step": 841
},
{
"epoch": 0.47908961593172117,
"grad_norm": 0.5950735807418823,
"learning_rate": 6.849358111787246e-05,
"loss": 1.3335,
"step": 842
},
{
"epoch": 0.47965860597439547,
"grad_norm": 0.5788929462432861,
"learning_rate": 6.82779179560519e-05,
"loss": 1.4746,
"step": 843
},
{
"epoch": 0.4802275960170697,
"grad_norm": 0.6169467568397522,
"learning_rate": 6.806241871921777e-05,
"loss": 1.2997,
"step": 844
},
{
"epoch": 0.48079658605974396,
"grad_norm": 0.5850261449813843,
"learning_rate": 6.784708452096998e-05,
"loss": 1.2293,
"step": 845
},
{
"epoch": 0.4813655761024182,
"grad_norm": 0.5514947772026062,
"learning_rate": 6.763191647405568e-05,
"loss": 1.3825,
"step": 846
},
{
"epoch": 0.48193456614509245,
"grad_norm": 0.5753430128097534,
"learning_rate": 6.741691569036338e-05,
"loss": 1.5195,
"step": 847
},
{
"epoch": 0.4825035561877667,
"grad_norm": 0.5876197814941406,
"learning_rate": 6.720208328091732e-05,
"loss": 1.4453,
"step": 848
},
{
"epoch": 0.483072546230441,
"grad_norm": 0.5744032859802246,
"learning_rate": 6.69874203558716e-05,
"loss": 1.4914,
"step": 849
},
{
"epoch": 0.48364153627311524,
"grad_norm": 0.5800637006759644,
"learning_rate": 6.677292802450447e-05,
"loss": 1.4932,
"step": 850
},
{
"epoch": 0.4842105263157895,
"grad_norm": 0.5554024577140808,
"learning_rate": 6.655860739521271e-05,
"loss": 1.1795,
"step": 851
},
{
"epoch": 0.4847795163584637,
"grad_norm": 0.5711913704872131,
"learning_rate": 6.634445957550577e-05,
"loss": 1.486,
"step": 852
},
{
"epoch": 0.48534850640113797,
"grad_norm": 0.5684107542037964,
"learning_rate": 6.613048567200013e-05,
"loss": 1.3984,
"step": 853
},
{
"epoch": 0.4859174964438122,
"grad_norm": 0.5672001242637634,
"learning_rate": 6.591668679041359e-05,
"loss": 1.4811,
"step": 854
},
{
"epoch": 0.4864864864864865,
"grad_norm": 0.5804989337921143,
"learning_rate": 6.570306403555937e-05,
"loss": 1.3624,
"step": 855
},
{
"epoch": 0.48705547652916076,
"grad_norm": 0.6067745089530945,
"learning_rate": 6.548961851134072e-05,
"loss": 1.4192,
"step": 856
},
{
"epoch": 0.487624466571835,
"grad_norm": 0.576329231262207,
"learning_rate": 6.527635132074493e-05,
"loss": 1.6314,
"step": 857
},
{
"epoch": 0.48819345661450925,
"grad_norm": 0.5863393545150757,
"learning_rate": 6.506326356583781e-05,
"loss": 1.5669,
"step": 858
},
{
"epoch": 0.4887624466571835,
"grad_norm": 0.6074771285057068,
"learning_rate": 6.485035634775796e-05,
"loss": 1.3334,
"step": 859
},
{
"epoch": 0.48933143669985774,
"grad_norm": 0.5837851166725159,
"learning_rate": 6.463763076671091e-05,
"loss": 1.607,
"step": 860
},
{
"epoch": 0.489900426742532,
"grad_norm": 0.5989742875099182,
"learning_rate": 6.442508792196369e-05,
"loss": 1.4518,
"step": 861
},
{
"epoch": 0.4904694167852063,
"grad_norm": 0.5692201852798462,
"learning_rate": 6.4212728911839e-05,
"loss": 1.3878,
"step": 862
},
{
"epoch": 0.4910384068278805,
"grad_norm": 0.6134719252586365,
"learning_rate": 6.400055483370957e-05,
"loss": 1.5154,
"step": 863
},
{
"epoch": 0.49160739687055477,
"grad_norm": 0.5494038462638855,
"learning_rate": 6.378856678399255e-05,
"loss": 1.2968,
"step": 864
},
{
"epoch": 0.492176386913229,
"grad_norm": 0.5780492424964905,
"learning_rate": 6.357676585814366e-05,
"loss": 1.5766,
"step": 865
},
{
"epoch": 0.49274537695590326,
"grad_norm": 0.5398704409599304,
"learning_rate": 6.336515315065168e-05,
"loss": 1.4446,
"step": 866
},
{
"epoch": 0.4933143669985775,
"grad_norm": 0.5509852170944214,
"learning_rate": 6.315372975503285e-05,
"loss": 1.4465,
"step": 867
},
{
"epoch": 0.4938833570412518,
"grad_norm": 0.6671035885810852,
"learning_rate": 6.294249676382508e-05,
"loss": 1.706,
"step": 868
},
{
"epoch": 0.49445234708392605,
"grad_norm": 0.579408586025238,
"learning_rate": 6.273145526858236e-05,
"loss": 1.5695,
"step": 869
},
{
"epoch": 0.4950213371266003,
"grad_norm": 0.571058988571167,
"learning_rate": 6.252060635986911e-05,
"loss": 1.3541,
"step": 870
},
{
"epoch": 0.49559032716927454,
"grad_norm": 0.5792422890663147,
"learning_rate": 6.230995112725454e-05,
"loss": 1.4329,
"step": 871
},
{
"epoch": 0.4961593172119488,
"grad_norm": 0.5893927216529846,
"learning_rate": 6.209949065930706e-05,
"loss": 1.4674,
"step": 872
},
{
"epoch": 0.496728307254623,
"grad_norm": 0.5954142212867737,
"learning_rate": 6.188922604358865e-05,
"loss": 1.4462,
"step": 873
},
{
"epoch": 0.4972972972972973,
"grad_norm": 0.6741952896118164,
"learning_rate": 6.16791583666492e-05,
"loss": 1.6458,
"step": 874
},
{
"epoch": 0.49786628733997157,
"grad_norm": 0.6125763654708862,
"learning_rate": 6.146928871402081e-05,
"loss": 1.5387,
"step": 875
},
{
"epoch": 0.4984352773826458,
"grad_norm": 0.5839952230453491,
"learning_rate": 6.12596181702124e-05,
"loss": 1.6821,
"step": 876
},
{
"epoch": 0.49900426742532006,
"grad_norm": 0.5859706401824951,
"learning_rate": 6.1050147818704e-05,
"loss": 1.4713,
"step": 877
},
{
"epoch": 0.4995732574679943,
"grad_norm": 0.5910811424255371,
"learning_rate": 6.0840878741941057e-05,
"loss": 1.59,
"step": 878
},
{
"epoch": 0.5001422475106686,
"grad_norm": 0.6297405958175659,
"learning_rate": 6.063181202132901e-05,
"loss": 1.5881,
"step": 879
},
{
"epoch": 0.5007112375533428,
"grad_norm": 0.5714183449745178,
"learning_rate": 6.0422948737227504e-05,
"loss": 1.5894,
"step": 880
},
{
"epoch": 0.5012802275960171,
"grad_norm": 0.5969492197036743,
"learning_rate": 6.0214289968945004e-05,
"loss": 1.6697,
"step": 881
},
{
"epoch": 0.5018492176386913,
"grad_norm": 0.5817530155181885,
"learning_rate": 6.000583679473315e-05,
"loss": 1.5806,
"step": 882
},
{
"epoch": 0.5024182076813656,
"grad_norm": 0.5869944095611572,
"learning_rate": 5.979759029178107e-05,
"loss": 1.4565,
"step": 883
},
{
"epoch": 0.5029871977240399,
"grad_norm": 0.5745888948440552,
"learning_rate": 5.958955153621004e-05,
"loss": 1.5645,
"step": 884
},
{
"epoch": 0.5035561877667141,
"grad_norm": 0.549628734588623,
"learning_rate": 5.938172160306765e-05,
"loss": 1.5017,
"step": 885
},
{
"epoch": 0.5041251778093884,
"grad_norm": 0.5471094250679016,
"learning_rate": 5.9174101566322504e-05,
"loss": 1.2781,
"step": 886
},
{
"epoch": 0.5046941678520626,
"grad_norm": 0.5772054195404053,
"learning_rate": 5.896669249885851e-05,
"loss": 1.386,
"step": 887
},
{
"epoch": 0.5052631578947369,
"grad_norm": 0.6143761873245239,
"learning_rate": 5.875949547246939e-05,
"loss": 1.5432,
"step": 888
},
{
"epoch": 0.505832147937411,
"grad_norm": 0.5768917202949524,
"learning_rate": 5.8552511557853204e-05,
"loss": 1.6945,
"step": 889
},
{
"epoch": 0.5064011379800853,
"grad_norm": 0.5644556283950806,
"learning_rate": 5.8345741824606617e-05,
"loss": 1.5163,
"step": 890
},
{
"epoch": 0.5069701280227596,
"grad_norm": 0.6083329916000366,
"learning_rate": 5.813918734121955e-05,
"loss": 1.7979,
"step": 891
},
{
"epoch": 0.5075391180654338,
"grad_norm": 0.5543102025985718,
"learning_rate": 5.7932849175069705e-05,
"loss": 1.5558,
"step": 892
},
{
"epoch": 0.5081081081081081,
"grad_norm": 0.6090741753578186,
"learning_rate": 5.7726728392416874e-05,
"loss": 1.6233,
"step": 893
},
{
"epoch": 0.5086770981507823,
"grad_norm": 0.556496798992157,
"learning_rate": 5.7520826058397525e-05,
"loss": 1.5755,
"step": 894
},
{
"epoch": 0.5092460881934566,
"grad_norm": 0.6258504986763,
"learning_rate": 5.731514323701927e-05,
"loss": 1.6054,
"step": 895
},
{
"epoch": 0.5098150782361308,
"grad_norm": 0.6283307671546936,
"learning_rate": 5.7109680991155364e-05,
"loss": 1.8276,
"step": 896
},
{
"epoch": 0.5103840682788051,
"grad_norm": 0.5817832946777344,
"learning_rate": 5.690444038253935e-05,
"loss": 1.6388,
"step": 897
},
{
"epoch": 0.5109530583214794,
"grad_norm": 0.5892955660820007,
"learning_rate": 5.669942247175933e-05,
"loss": 1.2641,
"step": 898
},
{
"epoch": 0.5115220483641536,
"grad_norm": 0.5834968686103821,
"learning_rate": 5.649462831825265e-05,
"loss": 1.4207,
"step": 899
},
{
"epoch": 0.5120910384068279,
"grad_norm": 0.5753495693206787,
"learning_rate": 5.629005898030035e-05,
"loss": 1.4724,
"step": 900
},
{
"epoch": 0.5126600284495021,
"grad_norm": 0.6050419211387634,
"learning_rate": 5.608571551502175e-05,
"loss": 1.7189,
"step": 901
},
{
"epoch": 0.5132290184921764,
"grad_norm": 0.5946124196052551,
"learning_rate": 5.588159897836902e-05,
"loss": 1.3803,
"step": 902
},
{
"epoch": 0.5137980085348507,
"grad_norm": 0.5731397867202759,
"learning_rate": 5.56777104251216e-05,
"loss": 1.7426,
"step": 903
},
{
"epoch": 0.5143669985775249,
"grad_norm": 0.5813397169113159,
"learning_rate": 5.5474050908880814e-05,
"loss": 1.4898,
"step": 904
},
{
"epoch": 0.5149359886201992,
"grad_norm": 0.5610973834991455,
"learning_rate": 5.5270621482064465e-05,
"loss": 1.4937,
"step": 905
},
{
"epoch": 0.5155049786628734,
"grad_norm": 0.5550079941749573,
"learning_rate": 5.50674231959013e-05,
"loss": 1.3543,
"step": 906
},
{
"epoch": 0.5160739687055477,
"grad_norm": 0.596593976020813,
"learning_rate": 5.4864457100425783e-05,
"loss": 1.5856,
"step": 907
},
{
"epoch": 0.5166429587482219,
"grad_norm": 0.6018926501274109,
"learning_rate": 5.4661724244472355e-05,
"loss": 1.5092,
"step": 908
},
{
"epoch": 0.5172119487908962,
"grad_norm": 0.6650524735450745,
"learning_rate": 5.4459225675670264e-05,
"loss": 1.7059,
"step": 909
},
{
"epoch": 0.5177809388335705,
"grad_norm": 0.5858013033866882,
"learning_rate": 5.425696244043807e-05,
"loss": 1.4591,
"step": 910
},
{
"epoch": 0.5183499288762446,
"grad_norm": 0.555473268032074,
"learning_rate": 5.405493558397824e-05,
"loss": 1.401,
"step": 911
},
{
"epoch": 0.518918918918919,
"grad_norm": 0.6246885061264038,
"learning_rate": 5.385314615027168e-05,
"loss": 1.4415,
"step": 912
},
{
"epoch": 0.5194879089615931,
"grad_norm": 0.608062207698822,
"learning_rate": 5.365159518207252e-05,
"loss": 1.4239,
"step": 913
},
{
"epoch": 0.5200568990042674,
"grad_norm": 0.5979565382003784,
"learning_rate": 5.345028372090256e-05,
"loss": 1.4656,
"step": 914
},
{
"epoch": 0.5206258890469416,
"grad_norm": 0.6553084254264832,
"learning_rate": 5.324921280704589e-05,
"loss": 1.4609,
"step": 915
},
{
"epoch": 0.5211948790896159,
"grad_norm": 0.5839146971702576,
"learning_rate": 5.304838347954363e-05,
"loss": 1.5546,
"step": 916
},
{
"epoch": 0.5217638691322902,
"grad_norm": 0.5618466734886169,
"learning_rate": 5.284779677618841e-05,
"loss": 1.4078,
"step": 917
},
{
"epoch": 0.5223328591749644,
"grad_norm": 0.6020224690437317,
"learning_rate": 5.264745373351923e-05,
"loss": 1.568,
"step": 918
},
{
"epoch": 0.5229018492176387,
"grad_norm": 0.6049513220787048,
"learning_rate": 5.244735538681584e-05,
"loss": 1.3196,
"step": 919
},
{
"epoch": 0.5234708392603129,
"grad_norm": 0.5781171917915344,
"learning_rate": 5.224750277009358e-05,
"loss": 1.5366,
"step": 920
},
{
"epoch": 0.5240398293029872,
"grad_norm": 0.6478269696235657,
"learning_rate": 5.204789691609793e-05,
"loss": 1.5281,
"step": 921
},
{
"epoch": 0.5246088193456615,
"grad_norm": 0.598915696144104,
"learning_rate": 5.184853885629921e-05,
"loss": 1.5734,
"step": 922
},
{
"epoch": 0.5251778093883357,
"grad_norm": 0.589694619178772,
"learning_rate": 5.1649429620887334e-05,
"loss": 1.4307,
"step": 923
},
{
"epoch": 0.52574679943101,
"grad_norm": 0.5483283996582031,
"learning_rate": 5.145057023876634e-05,
"loss": 1.4334,
"step": 924
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.5908382534980774,
"learning_rate": 5.125196173754914e-05,
"loss": 1.588,
"step": 925
},
{
"epoch": 0.5268847795163585,
"grad_norm": 0.5898739695549011,
"learning_rate": 5.105360514355222e-05,
"loss": 1.5685,
"step": 926
},
{
"epoch": 0.5274537695590327,
"grad_norm": 0.6079673171043396,
"learning_rate": 5.0855501481790305e-05,
"loss": 1.4421,
"step": 927
},
{
"epoch": 0.528022759601707,
"grad_norm": 0.5824552178382874,
"learning_rate": 5.0657651775971146e-05,
"loss": 1.3472,
"step": 928
},
{
"epoch": 0.5285917496443813,
"grad_norm": 0.5997583866119385,
"learning_rate": 5.046005704849015e-05,
"loss": 1.6292,
"step": 929
},
{
"epoch": 0.5291607396870555,
"grad_norm": 0.5740709900856018,
"learning_rate": 5.026271832042506e-05,
"loss": 1.4085,
"step": 930
},
{
"epoch": 0.5297297297297298,
"grad_norm": 0.5683955550193787,
"learning_rate": 5.0065636611530767e-05,
"loss": 1.4722,
"step": 931
},
{
"epoch": 0.530298719772404,
"grad_norm": 0.5909097790718079,
"learning_rate": 4.986881294023397e-05,
"loss": 1.5688,
"step": 932
},
{
"epoch": 0.5308677098150782,
"grad_norm": 0.5723986029624939,
"learning_rate": 4.967224832362807e-05,
"loss": 1.718,
"step": 933
},
{
"epoch": 0.5314366998577524,
"grad_norm": 0.6397773623466492,
"learning_rate": 4.947594377746769e-05,
"loss": 1.5896,
"step": 934
},
{
"epoch": 0.5320056899004267,
"grad_norm": 0.6130902171134949,
"learning_rate": 4.9279900316163466e-05,
"loss": 1.5974,
"step": 935
},
{
"epoch": 0.532574679943101,
"grad_norm": 0.5888193845748901,
"learning_rate": 4.908411895277704e-05,
"loss": 1.569,
"step": 936
},
{
"epoch": 0.5331436699857752,
"grad_norm": 0.5966805219650269,
"learning_rate": 4.8888600699015496e-05,
"loss": 1.4014,
"step": 937
},
{
"epoch": 0.5337126600284495,
"grad_norm": 0.6131336092948914,
"learning_rate": 4.869334656522644e-05,
"loss": 1.5619,
"step": 938
},
{
"epoch": 0.5342816500711237,
"grad_norm": 0.5846887826919556,
"learning_rate": 4.849835756039254e-05,
"loss": 1.5674,
"step": 939
},
{
"epoch": 0.534850640113798,
"grad_norm": 0.5879199504852295,
"learning_rate": 4.830363469212631e-05,
"loss": 1.6148,
"step": 940
},
{
"epoch": 0.5354196301564723,
"grad_norm": 0.6081675887107849,
"learning_rate": 4.8109178966665194e-05,
"loss": 1.5329,
"step": 941
},
{
"epoch": 0.5359886201991465,
"grad_norm": 0.5982802510261536,
"learning_rate": 4.791499138886603e-05,
"loss": 1.5198,
"step": 942
},
{
"epoch": 0.5365576102418208,
"grad_norm": 0.5899128913879395,
"learning_rate": 4.7721072962199975e-05,
"loss": 1.331,
"step": 943
},
{
"epoch": 0.537126600284495,
"grad_norm": 0.6289139986038208,
"learning_rate": 4.7527424688747535e-05,
"loss": 1.3543,
"step": 944
},
{
"epoch": 0.5376955903271693,
"grad_norm": 0.5747124552726746,
"learning_rate": 4.733404756919287e-05,
"loss": 1.2679,
"step": 945
},
{
"epoch": 0.5382645803698435,
"grad_norm": 0.5888437032699585,
"learning_rate": 4.7140942602819236e-05,
"loss": 1.3506,
"step": 946
},
{
"epoch": 0.5388335704125178,
"grad_norm": 0.6044580936431885,
"learning_rate": 4.694811078750338e-05,
"loss": 1.5955,
"step": 947
},
{
"epoch": 0.5394025604551921,
"grad_norm": 0.6149877905845642,
"learning_rate": 4.6755553119710524e-05,
"loss": 1.5836,
"step": 948
},
{
"epoch": 0.5399715504978663,
"grad_norm": 0.6135841012001038,
"learning_rate": 4.656327059448937e-05,
"loss": 1.4659,
"step": 949
},
{
"epoch": 0.5405405405405406,
"grad_norm": 0.5868760943412781,
"learning_rate": 4.637126420546653e-05,
"loss": 1.3821,
"step": 950
},
{
"epoch": 0.5411095305832148,
"grad_norm": 0.6109480261802673,
"learning_rate": 4.6179534944842e-05,
"loss": 1.5173,
"step": 951
},
{
"epoch": 0.5416785206258891,
"grad_norm": 0.6133657693862915,
"learning_rate": 4.5988083803383464e-05,
"loss": 1.6325,
"step": 952
},
{
"epoch": 0.5422475106685632,
"grad_norm": 0.593211829662323,
"learning_rate": 4.57969117704215e-05,
"loss": 1.361,
"step": 953
},
{
"epoch": 0.5428165007112375,
"grad_norm": 0.5881854891777039,
"learning_rate": 4.560601983384447e-05,
"loss": 1.3796,
"step": 954
},
{
"epoch": 0.5433854907539118,
"grad_norm": 0.65924471616745,
"learning_rate": 4.5415408980093096e-05,
"loss": 1.5899,
"step": 955
},
{
"epoch": 0.543954480796586,
"grad_norm": 0.6201925277709961,
"learning_rate": 4.522508019415587e-05,
"loss": 1.536,
"step": 956
},
{
"epoch": 0.5445234708392603,
"grad_norm": 0.5619149208068848,
"learning_rate": 4.50350344595635e-05,
"loss": 1.3624,
"step": 957
},
{
"epoch": 0.5450924608819345,
"grad_norm": 0.5680489540100098,
"learning_rate": 4.484527275838404e-05,
"loss": 1.4247,
"step": 958
},
{
"epoch": 0.5456614509246088,
"grad_norm": 0.5449238419532776,
"learning_rate": 4.4655796071217937e-05,
"loss": 1.3423,
"step": 959
},
{
"epoch": 0.5462304409672831,
"grad_norm": 0.6032193899154663,
"learning_rate": 4.446660537719256e-05,
"loss": 1.6294,
"step": 960
},
{
"epoch": 0.5467994310099573,
"grad_norm": 0.5516905784606934,
"learning_rate": 4.427770165395766e-05,
"loss": 1.3738,
"step": 961
},
{
"epoch": 0.5473684210526316,
"grad_norm": 0.6235291361808777,
"learning_rate": 4.4089085877679904e-05,
"loss": 1.4602,
"step": 962
},
{
"epoch": 0.5479374110953058,
"grad_norm": 0.6051345467567444,
"learning_rate": 4.3900759023037974e-05,
"loss": 1.3761,
"step": 963
},
{
"epoch": 0.5485064011379801,
"grad_norm": 0.5858922600746155,
"learning_rate": 4.3712722063217693e-05,
"loss": 1.5158,
"step": 964
},
{
"epoch": 0.5490753911806543,
"grad_norm": 0.5914279818534851,
"learning_rate": 4.3524975969906636e-05,
"loss": 1.3333,
"step": 965
},
{
"epoch": 0.5496443812233286,
"grad_norm": 0.5849418044090271,
"learning_rate": 4.3337521713289407e-05,
"loss": 1.5459,
"step": 966
},
{
"epoch": 0.5502133712660029,
"grad_norm": 0.5740037560462952,
"learning_rate": 4.315036026204262e-05,
"loss": 1.3858,
"step": 967
},
{
"epoch": 0.5507823613086771,
"grad_norm": 0.5611101984977722,
"learning_rate": 4.296349258332967e-05,
"loss": 1.3895,
"step": 968
},
{
"epoch": 0.5513513513513514,
"grad_norm": 0.585473895072937,
"learning_rate": 4.277691964279594e-05,
"loss": 1.2682,
"step": 969
},
{
"epoch": 0.5519203413940256,
"grad_norm": 0.6113364100456238,
"learning_rate": 4.259064240456374e-05,
"loss": 1.4292,
"step": 970
},
{
"epoch": 0.5524893314366999,
"grad_norm": 0.6335917115211487,
"learning_rate": 4.2404661831227276e-05,
"loss": 1.4529,
"step": 971
},
{
"epoch": 0.5530583214793741,
"grad_norm": 0.574226975440979,
"learning_rate": 4.2218978883847835e-05,
"loss": 1.5254,
"step": 972
},
{
"epoch": 0.5536273115220484,
"grad_norm": 0.5865671038627625,
"learning_rate": 4.203359452194863e-05,
"loss": 1.5265,
"step": 973
},
{
"epoch": 0.5541963015647227,
"grad_norm": 0.5852011442184448,
"learning_rate": 4.184850970350992e-05,
"loss": 1.5834,
"step": 974
},
{
"epoch": 0.5547652916073968,
"grad_norm": 0.6045235395431519,
"learning_rate": 4.166372538496408e-05,
"loss": 1.3905,
"step": 975
},
{
"epoch": 0.5553342816500711,
"grad_norm": 0.558691143989563,
"learning_rate": 4.147924252119063e-05,
"loss": 1.5088,
"step": 976
},
{
"epoch": 0.5559032716927453,
"grad_norm": 0.5623577237129211,
"learning_rate": 4.129506206551138e-05,
"loss": 1.3502,
"step": 977
},
{
"epoch": 0.5564722617354196,
"grad_norm": 0.5946846604347229,
"learning_rate": 4.1111184969685354e-05,
"loss": 1.3884,
"step": 978
},
{
"epoch": 0.5570412517780939,
"grad_norm": 0.5882412195205688,
"learning_rate": 4.0927612183903976e-05,
"loss": 1.542,
"step": 979
},
{
"epoch": 0.5576102418207681,
"grad_norm": 0.577912449836731,
"learning_rate": 4.0744344656786124e-05,
"loss": 1.324,
"step": 980
},
{
"epoch": 0.5581792318634424,
"grad_norm": 0.5644152164459229,
"learning_rate": 4.056138333537326e-05,
"loss": 1.2746,
"step": 981
},
{
"epoch": 0.5587482219061166,
"grad_norm": 0.6058292984962463,
"learning_rate": 4.037872916512455e-05,
"loss": 1.5404,
"step": 982
},
{
"epoch": 0.5593172119487909,
"grad_norm": 0.6061570644378662,
"learning_rate": 4.019638308991189e-05,
"loss": 1.3896,
"step": 983
},
{
"epoch": 0.5598862019914651,
"grad_norm": 0.6102644205093384,
"learning_rate": 4.0014346052015114e-05,
"loss": 1.5365,
"step": 984
},
{
"epoch": 0.5604551920341394,
"grad_norm": 0.5747568011283875,
"learning_rate": 3.983261899211708e-05,
"loss": 1.4337,
"step": 985
},
{
"epoch": 0.5610241820768137,
"grad_norm": 0.5756990909576416,
"learning_rate": 3.965120284929878e-05,
"loss": 1.4752,
"step": 986
},
{
"epoch": 0.5615931721194879,
"grad_norm": 0.570568323135376,
"learning_rate": 3.947009856103465e-05,
"loss": 1.4064,
"step": 987
},
{
"epoch": 0.5621621621621622,
"grad_norm": 0.6102871298789978,
"learning_rate": 3.928930706318752e-05,
"loss": 1.5697,
"step": 988
},
{
"epoch": 0.5627311522048364,
"grad_norm": 0.555619478225708,
"learning_rate": 3.910882929000387e-05,
"loss": 1.2905,
"step": 989
},
{
"epoch": 0.5633001422475107,
"grad_norm": 0.6053213477134705,
"learning_rate": 3.892866617410901e-05,
"loss": 1.4823,
"step": 990
},
{
"epoch": 0.5638691322901849,
"grad_norm": 0.5635027289390564,
"learning_rate": 3.874881864650224e-05,
"loss": 1.2325,
"step": 991
},
{
"epoch": 0.5644381223328592,
"grad_norm": 0.6095726490020752,
"learning_rate": 3.8569287636552024e-05,
"loss": 1.5359,
"step": 992
},
{
"epoch": 0.5650071123755335,
"grad_norm": 0.5644766092300415,
"learning_rate": 3.839007407199129e-05,
"loss": 1.277,
"step": 993
},
{
"epoch": 0.5655761024182077,
"grad_norm": 0.5609472393989563,
"learning_rate": 3.821117887891249e-05,
"loss": 1.2394,
"step": 994
},
{
"epoch": 0.566145092460882,
"grad_norm": 0.6164161562919617,
"learning_rate": 3.803260298176288e-05,
"loss": 1.5458,
"step": 995
},
{
"epoch": 0.5667140825035561,
"grad_norm": 0.6040405631065369,
"learning_rate": 3.7854347303339754e-05,
"loss": 1.2356,
"step": 996
},
{
"epoch": 0.5672830725462304,
"grad_norm": 0.6196702718734741,
"learning_rate": 3.767641276478563e-05,
"loss": 1.5923,
"step": 997
},
{
"epoch": 0.5678520625889047,
"grad_norm": 0.5526005029678345,
"learning_rate": 3.749880028558364e-05,
"loss": 1.5057,
"step": 998
},
{
"epoch": 0.5684210526315789,
"grad_norm": 0.5806797742843628,
"learning_rate": 3.732151078355253e-05,
"loss": 1.5355,
"step": 999
},
{
"epoch": 0.5689900426742532,
"grad_norm": 0.5680354237556458,
"learning_rate": 3.7144545174842115e-05,
"loss": 1.4381,
"step": 1000
},
{
"epoch": 0.5695590327169274,
"grad_norm": 0.5921180248260498,
"learning_rate": 3.6967904373928475e-05,
"loss": 1.3444,
"step": 1001
},
{
"epoch": 0.5701280227596017,
"grad_norm": 0.5849342942237854,
"learning_rate": 3.6791589293609184e-05,
"loss": 1.3836,
"step": 1002
},
{
"epoch": 0.5706970128022759,
"grad_norm": 0.5548643469810486,
"learning_rate": 3.661560084499874e-05,
"loss": 1.4809,
"step": 1003
},
{
"epoch": 0.5712660028449502,
"grad_norm": 0.5976467132568359,
"learning_rate": 3.64399399375237e-05,
"loss": 1.4543,
"step": 1004
},
{
"epoch": 0.5718349928876245,
"grad_norm": 0.588699996471405,
"learning_rate": 3.6264607478918037e-05,
"loss": 1.4448,
"step": 1005
},
{
"epoch": 0.5724039829302987,
"grad_norm": 0.5786314606666565,
"learning_rate": 3.608960437521844e-05,
"loss": 1.769,
"step": 1006
},
{
"epoch": 0.572972972972973,
"grad_norm": 0.6124690771102905,
"learning_rate": 3.591493153075966e-05,
"loss": 1.6527,
"step": 1007
},
{
"epoch": 0.5735419630156472,
"grad_norm": 0.5587359070777893,
"learning_rate": 3.5740589848169894e-05,
"loss": 1.2819,
"step": 1008
},
{
"epoch": 0.5741109530583215,
"grad_norm": 0.6170410513877869,
"learning_rate": 3.556658022836594e-05,
"loss": 1.5858,
"step": 1009
},
{
"epoch": 0.5746799431009957,
"grad_norm": 0.5927881002426147,
"learning_rate": 3.5392903570548694e-05,
"loss": 1.6321,
"step": 1010
},
{
"epoch": 0.57524893314367,
"grad_norm": 0.5902583599090576,
"learning_rate": 3.521956077219847e-05,
"loss": 1.5162,
"step": 1011
},
{
"epoch": 0.5758179231863443,
"grad_norm": 0.6113704442977905,
"learning_rate": 3.504655272907028e-05,
"loss": 1.6929,
"step": 1012
},
{
"epoch": 0.5763869132290185,
"grad_norm": 0.5586623549461365,
"learning_rate": 3.4873880335189427e-05,
"loss": 1.3555,
"step": 1013
},
{
"epoch": 0.5769559032716928,
"grad_norm": 0.5992634296417236,
"learning_rate": 3.470154448284659e-05,
"loss": 1.6901,
"step": 1014
},
{
"epoch": 0.577524893314367,
"grad_norm": 0.5722742676734924,
"learning_rate": 3.452954606259343e-05,
"loss": 1.386,
"step": 1015
},
{
"epoch": 0.5780938833570413,
"grad_norm": 0.6090911030769348,
"learning_rate": 3.435788596323789e-05,
"loss": 1.528,
"step": 1016
},
{
"epoch": 0.5786628733997156,
"grad_norm": 0.5943465828895569,
"learning_rate": 3.41865650718396e-05,
"loss": 1.4567,
"step": 1017
},
{
"epoch": 0.5792318634423897,
"grad_norm": 0.5948119163513184,
"learning_rate": 3.4015584273705425e-05,
"loss": 1.4926,
"step": 1018
},
{
"epoch": 0.579800853485064,
"grad_norm": 0.6115890741348267,
"learning_rate": 3.384494445238471e-05,
"loss": 1.4113,
"step": 1019
},
{
"epoch": 0.5803698435277382,
"grad_norm": 0.5682458281517029,
"learning_rate": 3.367464648966471e-05,
"loss": 1.514,
"step": 1020
},
{
"epoch": 0.5809388335704125,
"grad_norm": 0.5994877219200134,
"learning_rate": 3.350469126556627e-05,
"loss": 1.495,
"step": 1021
},
{
"epoch": 0.5815078236130867,
"grad_norm": 0.5887535810470581,
"learning_rate": 3.333507965833905e-05,
"loss": 1.6428,
"step": 1022
},
{
"epoch": 0.582076813655761,
"grad_norm": 0.5758301615715027,
"learning_rate": 3.316581254445701e-05,
"loss": 1.4076,
"step": 1023
},
{
"epoch": 0.5826458036984353,
"grad_norm": 0.6117954850196838,
"learning_rate": 3.299689079861408e-05,
"loss": 1.4471,
"step": 1024
},
{
"epoch": 0.5832147937411095,
"grad_norm": 0.6079879999160767,
"learning_rate": 3.2828315293719245e-05,
"loss": 1.485,
"step": 1025
},
{
"epoch": 0.5837837837837838,
"grad_norm": 0.5936009287834167,
"learning_rate": 3.266008690089253e-05,
"loss": 1.6109,
"step": 1026
},
{
"epoch": 0.584352773826458,
"grad_norm": 0.5736754536628723,
"learning_rate": 3.24922064894601e-05,
"loss": 1.4451,
"step": 1027
},
{
"epoch": 0.5849217638691323,
"grad_norm": 0.5830667018890381,
"learning_rate": 3.23246749269499e-05,
"loss": 1.499,
"step": 1028
},
{
"epoch": 0.5854907539118065,
"grad_norm": 0.5929978489875793,
"learning_rate": 3.2157493079087343e-05,
"loss": 1.5964,
"step": 1029
},
{
"epoch": 0.5860597439544808,
"grad_norm": 0.5748528242111206,
"learning_rate": 3.1990661809790445e-05,
"loss": 1.3425,
"step": 1030
},
{
"epoch": 0.5866287339971551,
"grad_norm": 0.6261157393455505,
"learning_rate": 3.18241819811658e-05,
"loss": 1.4458,
"step": 1031
},
{
"epoch": 0.5871977240398293,
"grad_norm": 0.5736514925956726,
"learning_rate": 3.165805445350383e-05,
"loss": 1.3948,
"step": 1032
},
{
"epoch": 0.5877667140825036,
"grad_norm": 0.6165857911109924,
"learning_rate": 3.149228008527437e-05,
"loss": 1.6043,
"step": 1033
},
{
"epoch": 0.5883357041251778,
"grad_norm": 0.6109797954559326,
"learning_rate": 3.132685973312251e-05,
"loss": 1.5376,
"step": 1034
},
{
"epoch": 0.5889046941678521,
"grad_norm": 0.5716987252235413,
"learning_rate": 3.116179425186361e-05,
"loss": 1.3554,
"step": 1035
},
{
"epoch": 0.5894736842105263,
"grad_norm": 0.6563665866851807,
"learning_rate": 3.099708449447956e-05,
"loss": 1.4934,
"step": 1036
},
{
"epoch": 0.5900426742532006,
"grad_norm": 0.6072697043418884,
"learning_rate": 3.083273131211382e-05,
"loss": 1.3181,
"step": 1037
},
{
"epoch": 0.5906116642958749,
"grad_norm": 0.5769975781440735,
"learning_rate": 3.066873555406727e-05,
"loss": 1.5376,
"step": 1038
},
{
"epoch": 0.591180654338549,
"grad_norm": 0.58552485704422,
"learning_rate": 3.0505098067793937e-05,
"loss": 1.3483,
"step": 1039
},
{
"epoch": 0.5917496443812233,
"grad_norm": 0.6377474069595337,
"learning_rate": 3.0341819698896202e-05,
"loss": 1.6044,
"step": 1040
},
{
"epoch": 0.5923186344238975,
"grad_norm": 0.5746393203735352,
"learning_rate": 3.017890129112094e-05,
"loss": 1.5081,
"step": 1041
},
{
"epoch": 0.5928876244665718,
"grad_norm": 0.5879509449005127,
"learning_rate": 3.0016343686354775e-05,
"loss": 1.7884,
"step": 1042
},
{
"epoch": 0.5934566145092461,
"grad_norm": 0.5871498584747314,
"learning_rate": 2.9854147724619886e-05,
"loss": 1.4425,
"step": 1043
},
{
"epoch": 0.5940256045519203,
"grad_norm": 0.6417199373245239,
"learning_rate": 2.9692314244069764e-05,
"loss": 1.4729,
"step": 1044
},
{
"epoch": 0.5940256045519203,
"eval_loss": 1.4645270109176636,
"eval_runtime": 16.2716,
"eval_samples_per_second": 45.478,
"eval_steps_per_second": 22.739,
"step": 1044
},
{
"epoch": 0.5945945945945946,
"grad_norm": 0.5834308862686157,
"learning_rate": 2.9530844080984565e-05,
"loss": 1.4174,
"step": 1045
},
{
"epoch": 0.5951635846372688,
"grad_norm": 0.5811535120010376,
"learning_rate": 2.9369738069767107e-05,
"loss": 1.2859,
"step": 1046
},
{
"epoch": 0.5957325746799431,
"grad_norm": 0.6040303707122803,
"learning_rate": 2.920899704293849e-05,
"loss": 1.7526,
"step": 1047
},
{
"epoch": 0.5963015647226173,
"grad_norm": 0.5936810970306396,
"learning_rate": 2.9048621831133616e-05,
"loss": 1.3031,
"step": 1048
},
{
"epoch": 0.5968705547652916,
"grad_norm": 0.5825332999229431,
"learning_rate": 2.8888613263097153e-05,
"loss": 1.3483,
"step": 1049
},
{
"epoch": 0.5974395448079659,
"grad_norm": 0.6082255244255066,
"learning_rate": 2.8728972165679067e-05,
"loss": 1.528,
"step": 1050
},
{
"epoch": 0.5980085348506401,
"grad_norm": 0.594572126865387,
"learning_rate": 2.8569699363830316e-05,
"loss": 1.5789,
"step": 1051
},
{
"epoch": 0.5985775248933144,
"grad_norm": 0.6006420850753784,
"learning_rate": 2.8410795680598846e-05,
"loss": 1.5638,
"step": 1052
},
{
"epoch": 0.5991465149359886,
"grad_norm": 0.5715523958206177,
"learning_rate": 2.825226193712507e-05,
"loss": 1.5222,
"step": 1053
},
{
"epoch": 0.5997155049786629,
"grad_norm": 0.5750184059143066,
"learning_rate": 2.8094098952637692e-05,
"loss": 1.5154,
"step": 1054
},
{
"epoch": 0.6002844950213371,
"grad_norm": 0.5836694240570068,
"learning_rate": 2.793630754444967e-05,
"loss": 1.4624,
"step": 1055
},
{
"epoch": 0.6008534850640114,
"grad_norm": 0.5644353628158569,
"learning_rate": 2.7778888527953572e-05,
"loss": 1.564,
"step": 1056
},
{
"epoch": 0.6014224751066857,
"grad_norm": 0.6327478885650635,
"learning_rate": 2.762184271661785e-05,
"loss": 1.4707,
"step": 1057
},
{
"epoch": 0.6019914651493599,
"grad_norm": 0.5783342719078064,
"learning_rate": 2.746517092198231e-05,
"loss": 1.4888,
"step": 1058
},
{
"epoch": 0.6025604551920342,
"grad_norm": 0.5796740651130676,
"learning_rate": 2.730887395365397e-05,
"loss": 1.5201,
"step": 1059
},
{
"epoch": 0.6031294452347084,
"grad_norm": 0.5543321967124939,
"learning_rate": 2.715295261930306e-05,
"loss": 1.4378,
"step": 1060
},
{
"epoch": 0.6036984352773827,
"grad_norm": 0.6244597434997559,
"learning_rate": 2.699740772465851e-05,
"loss": 1.4242,
"step": 1061
},
{
"epoch": 0.604267425320057,
"grad_norm": 0.5890554785728455,
"learning_rate": 2.6842240073504165e-05,
"loss": 1.4732,
"step": 1062
},
{
"epoch": 0.6048364153627311,
"grad_norm": 0.5934953689575195,
"learning_rate": 2.668745046767436e-05,
"loss": 1.517,
"step": 1063
},
{
"epoch": 0.6054054054054054,
"grad_norm": 0.5716105103492737,
"learning_rate": 2.6533039707049834e-05,
"loss": 1.2859,
"step": 1064
},
{
"epoch": 0.6059743954480796,
"grad_norm": 0.5798661708831787,
"learning_rate": 2.63790085895538e-05,
"loss": 1.5015,
"step": 1065
},
{
"epoch": 0.6065433854907539,
"grad_norm": 0.600385844707489,
"learning_rate": 2.6225357911147385e-05,
"loss": 1.4027,
"step": 1066
},
{
"epoch": 0.6071123755334281,
"grad_norm": 0.5749977231025696,
"learning_rate": 2.6072088465826038e-05,
"loss": 1.5876,
"step": 1067
},
{
"epoch": 0.6076813655761024,
"grad_norm": 0.5585724711418152,
"learning_rate": 2.591920104561503e-05,
"loss": 1.4756,
"step": 1068
},
{
"epoch": 0.6082503556187767,
"grad_norm": 0.5597994327545166,
"learning_rate": 2.5766696440565496e-05,
"loss": 1.4621,
"step": 1069
},
{
"epoch": 0.6088193456614509,
"grad_norm": 0.5746705532073975,
"learning_rate": 2.5614575438750522e-05,
"loss": 1.1686,
"step": 1070
},
{
"epoch": 0.6093883357041252,
"grad_norm": 0.6218065023422241,
"learning_rate": 2.546283882626065e-05,
"loss": 1.4789,
"step": 1071
},
{
"epoch": 0.6099573257467994,
"grad_norm": 0.6003706455230713,
"learning_rate": 2.5311487387200306e-05,
"loss": 1.3938,
"step": 1072
},
{
"epoch": 0.6105263157894737,
"grad_norm": 0.6234976649284363,
"learning_rate": 2.516052190368341e-05,
"loss": 1.4399,
"step": 1073
},
{
"epoch": 0.6110953058321479,
"grad_norm": 0.5926810503005981,
"learning_rate": 2.500994315582943e-05,
"loss": 1.3032,
"step": 1074
},
{
"epoch": 0.6116642958748222,
"grad_norm": 0.5906057953834534,
"learning_rate": 2.485975192175949e-05,
"loss": 1.2748,
"step": 1075
},
{
"epoch": 0.6122332859174965,
"grad_norm": 0.5761781334877014,
"learning_rate": 2.4709948977592034e-05,
"loss": 1.4486,
"step": 1076
},
{
"epoch": 0.6128022759601707,
"grad_norm": 0.6170504093170166,
"learning_rate": 2.4560535097439108e-05,
"loss": 1.5943,
"step": 1077
},
{
"epoch": 0.613371266002845,
"grad_norm": 0.6018140912055969,
"learning_rate": 2.4411511053402302e-05,
"loss": 1.5996,
"step": 1078
},
{
"epoch": 0.6139402560455192,
"grad_norm": 0.5538153052330017,
"learning_rate": 2.4262877615568626e-05,
"loss": 1.4874,
"step": 1079
},
{
"epoch": 0.6145092460881935,
"grad_norm": 0.5843609571456909,
"learning_rate": 2.411463555200667e-05,
"loss": 1.269,
"step": 1080
},
{
"epoch": 0.6150782361308678,
"grad_norm": 0.5559793710708618,
"learning_rate": 2.3966785628762546e-05,
"loss": 1.5796,
"step": 1081
},
{
"epoch": 0.615647226173542,
"grad_norm": 0.5636264085769653,
"learning_rate": 2.381932860985596e-05,
"loss": 1.2805,
"step": 1082
},
{
"epoch": 0.6162162162162163,
"grad_norm": 0.6079363226890564,
"learning_rate": 2.3672265257276383e-05,
"loss": 1.5295,
"step": 1083
},
{
"epoch": 0.6167852062588904,
"grad_norm": 0.6165335178375244,
"learning_rate": 2.352559633097885e-05,
"loss": 1.5551,
"step": 1084
},
{
"epoch": 0.6173541963015647,
"grad_norm": 0.6137623190879822,
"learning_rate": 2.337932258888028e-05,
"loss": 1.4585,
"step": 1085
},
{
"epoch": 0.6179231863442389,
"grad_norm": 0.5806836485862732,
"learning_rate": 2.3233444786855407e-05,
"loss": 1.5539,
"step": 1086
},
{
"epoch": 0.6184921763869132,
"grad_norm": 0.5768011212348938,
"learning_rate": 2.308796367873296e-05,
"loss": 1.4415,
"step": 1087
},
{
"epoch": 0.6190611664295875,
"grad_norm": 0.5644312500953674,
"learning_rate": 2.294288001629177e-05,
"loss": 1.4668,
"step": 1088
},
{
"epoch": 0.6196301564722617,
"grad_norm": 0.5748885869979858,
"learning_rate": 2.2798194549256792e-05,
"loss": 1.3066,
"step": 1089
},
{
"epoch": 0.620199146514936,
"grad_norm": 0.5609626770019531,
"learning_rate": 2.2653908025295323e-05,
"loss": 1.3779,
"step": 1090
},
{
"epoch": 0.6207681365576102,
"grad_norm": 0.5691306591033936,
"learning_rate": 2.251002119001312e-05,
"loss": 1.442,
"step": 1091
},
{
"epoch": 0.6213371266002845,
"grad_norm": 0.589314877986908,
"learning_rate": 2.2366534786950467e-05,
"loss": 1.4482,
"step": 1092
},
{
"epoch": 0.6219061166429587,
"grad_norm": 0.5820268392562866,
"learning_rate": 2.222344955757851e-05,
"loss": 1.4195,
"step": 1093
},
{
"epoch": 0.622475106685633,
"grad_norm": 0.6211294531822205,
"learning_rate": 2.2080766241295235e-05,
"loss": 1.549,
"step": 1094
},
{
"epoch": 0.6230440967283073,
"grad_norm": 0.6313804984092712,
"learning_rate": 2.1938485575421752e-05,
"loss": 1.6662,
"step": 1095
},
{
"epoch": 0.6236130867709815,
"grad_norm": 0.5776501297950745,
"learning_rate": 2.1796608295198462e-05,
"loss": 1.3551,
"step": 1096
},
{
"epoch": 0.6241820768136558,
"grad_norm": 0.5959988236427307,
"learning_rate": 2.165513513378121e-05,
"loss": 1.4321,
"step": 1097
},
{
"epoch": 0.62475106685633,
"grad_norm": 0.5878854393959045,
"learning_rate": 2.1514066822237665e-05,
"loss": 1.428,
"step": 1098
},
{
"epoch": 0.6253200568990043,
"grad_norm": 0.5653113722801208,
"learning_rate": 2.137340408954329e-05,
"loss": 1.3464,
"step": 1099
},
{
"epoch": 0.6258890469416786,
"grad_norm": 0.5969840884208679,
"learning_rate": 2.1233147662577767e-05,
"loss": 1.4497,
"step": 1100
},
{
"epoch": 0.6264580369843528,
"grad_norm": 0.5675022602081299,
"learning_rate": 2.1093298266121165e-05,
"loss": 1.4289,
"step": 1101
},
{
"epoch": 0.6270270270270271,
"grad_norm": 0.6396809816360474,
"learning_rate": 2.0953856622850176e-05,
"loss": 1.4908,
"step": 1102
},
{
"epoch": 0.6275960170697013,
"grad_norm": 0.5843429565429688,
"learning_rate": 2.081482345333452e-05,
"loss": 1.6213,
"step": 1103
},
{
"epoch": 0.6281650071123756,
"grad_norm": 0.5792785882949829,
"learning_rate": 2.0676199476033e-05,
"loss": 1.57,
"step": 1104
},
{
"epoch": 0.6287339971550497,
"grad_norm": 0.6015857458114624,
"learning_rate": 2.053798540728995e-05,
"loss": 1.5818,
"step": 1105
},
{
"epoch": 0.629302987197724,
"grad_norm": 0.5723267197608948,
"learning_rate": 2.0400181961331478e-05,
"loss": 1.3799,
"step": 1106
},
{
"epoch": 0.6298719772403983,
"grad_norm": 0.6322827339172363,
"learning_rate": 2.0262789850261798e-05,
"loss": 1.4456,
"step": 1107
},
{
"epoch": 0.6304409672830725,
"grad_norm": 0.6475574970245361,
"learning_rate": 2.012580978405949e-05,
"loss": 1.6081,
"step": 1108
},
{
"epoch": 0.6310099573257468,
"grad_norm": 0.5577263832092285,
"learning_rate": 1.9989242470573975e-05,
"loss": 1.319,
"step": 1109
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.5767825245857239,
"learning_rate": 1.9853088615521663e-05,
"loss": 1.1708,
"step": 1110
},
{
"epoch": 0.6321479374110953,
"grad_norm": 0.5954525470733643,
"learning_rate": 1.9717348922482458e-05,
"loss": 1.3891,
"step": 1111
},
{
"epoch": 0.6327169274537695,
"grad_norm": 0.6102060079574585,
"learning_rate": 1.9582024092896033e-05,
"loss": 1.3531,
"step": 1112
},
{
"epoch": 0.6332859174964438,
"grad_norm": 0.5929975509643555,
"learning_rate": 1.9447114826058233e-05,
"loss": 1.5927,
"step": 1113
},
{
"epoch": 0.6338549075391181,
"grad_norm": 0.5874947905540466,
"learning_rate": 1.931262181911754e-05,
"loss": 1.4828,
"step": 1114
},
{
"epoch": 0.6344238975817923,
"grad_norm": 0.5605891346931458,
"learning_rate": 1.9178545767071322e-05,
"loss": 1.655,
"step": 1115
},
{
"epoch": 0.6349928876244666,
"grad_norm": 0.5770267248153687,
"learning_rate": 1.9044887362762343e-05,
"loss": 1.3424,
"step": 1116
},
{
"epoch": 0.6355618776671408,
"grad_norm": 0.639620840549469,
"learning_rate": 1.8911647296875147e-05,
"loss": 1.3701,
"step": 1117
},
{
"epoch": 0.6361308677098151,
"grad_norm": 0.6051335334777832,
"learning_rate": 1.87788262579325e-05,
"loss": 1.3458,
"step": 1118
},
{
"epoch": 0.6366998577524894,
"grad_norm": 0.5964054465293884,
"learning_rate": 1.8646424932291896e-05,
"loss": 1.5532,
"step": 1119
},
{
"epoch": 0.6372688477951636,
"grad_norm": 0.6007367968559265,
"learning_rate": 1.851444400414185e-05,
"loss": 1.5373,
"step": 1120
},
{
"epoch": 0.6378378378378379,
"grad_norm": 0.5970923900604248,
"learning_rate": 1.8382884155498514e-05,
"loss": 1.5256,
"step": 1121
},
{
"epoch": 0.6384068278805121,
"grad_norm": 0.6180837750434875,
"learning_rate": 1.8251746066202058e-05,
"loss": 1.4781,
"step": 1122
},
{
"epoch": 0.6389758179231864,
"grad_norm": 0.6046369671821594,
"learning_rate": 1.812103041391322e-05,
"loss": 1.4899,
"step": 1123
},
{
"epoch": 0.6395448079658606,
"grad_norm": 0.5703504085540771,
"learning_rate": 1.799073787410982e-05,
"loss": 1.5633,
"step": 1124
},
{
"epoch": 0.6401137980085349,
"grad_norm": 0.6019449830055237,
"learning_rate": 1.786086912008316e-05,
"loss": 1.3685,
"step": 1125
},
{
"epoch": 0.6406827880512092,
"grad_norm": 0.5852835774421692,
"learning_rate": 1.773142482293464e-05,
"loss": 1.5065,
"step": 1126
},
{
"epoch": 0.6412517780938833,
"grad_norm": 0.5664365887641907,
"learning_rate": 1.7602405651572275e-05,
"loss": 1.5823,
"step": 1127
},
{
"epoch": 0.6418207681365576,
"grad_norm": 0.5778409242630005,
"learning_rate": 1.747381227270718e-05,
"loss": 1.4294,
"step": 1128
},
{
"epoch": 0.6423897581792318,
"grad_norm": 0.5901049375534058,
"learning_rate": 1.734564535085028e-05,
"loss": 1.3996,
"step": 1129
},
{
"epoch": 0.6429587482219061,
"grad_norm": 0.6099653244018555,
"learning_rate": 1.721790554830869e-05,
"loss": 1.5873,
"step": 1130
},
{
"epoch": 0.6435277382645803,
"grad_norm": 0.5981472730636597,
"learning_rate": 1.7090593525182287e-05,
"loss": 1.5958,
"step": 1131
},
{
"epoch": 0.6440967283072546,
"grad_norm": 0.6043581366539001,
"learning_rate": 1.6963709939360585e-05,
"loss": 1.561,
"step": 1132
},
{
"epoch": 0.6446657183499289,
"grad_norm": 0.6230269074440002,
"learning_rate": 1.6837255446518964e-05,
"loss": 1.4484,
"step": 1133
},
{
"epoch": 0.6452347083926031,
"grad_norm": 0.579458475112915,
"learning_rate": 1.671123070011551e-05,
"loss": 1.597,
"step": 1134
},
{
"epoch": 0.6458036984352774,
"grad_norm": 0.5982540845870972,
"learning_rate": 1.6585636351387635e-05,
"loss": 1.5299,
"step": 1135
},
{
"epoch": 0.6463726884779516,
"grad_norm": 0.6335290670394897,
"learning_rate": 1.646047304934851e-05,
"loss": 1.6529,
"step": 1136
},
{
"epoch": 0.6469416785206259,
"grad_norm": 0.580467164516449,
"learning_rate": 1.6335741440784035e-05,
"loss": 1.5459,
"step": 1137
},
{
"epoch": 0.6475106685633002,
"grad_norm": 0.5840801000595093,
"learning_rate": 1.621144217024918e-05,
"loss": 1.3808,
"step": 1138
},
{
"epoch": 0.6480796586059744,
"grad_norm": 0.592555582523346,
"learning_rate": 1.608757588006483e-05,
"loss": 1.5013,
"step": 1139
},
{
"epoch": 0.6486486486486487,
"grad_norm": 0.5938240885734558,
"learning_rate": 1.596414321031452e-05,
"loss": 1.3971,
"step": 1140
},
{
"epoch": 0.6492176386913229,
"grad_norm": 0.5719125270843506,
"learning_rate": 1.5841144798840855e-05,
"loss": 1.4372,
"step": 1141
},
{
"epoch": 0.6497866287339972,
"grad_norm": 0.6199617981910706,
"learning_rate": 1.5718581281242572e-05,
"loss": 1.6019,
"step": 1142
},
{
"epoch": 0.6503556187766714,
"grad_norm": 0.594205379486084,
"learning_rate": 1.5596453290870982e-05,
"loss": 1.5322,
"step": 1143
},
{
"epoch": 0.6509246088193457,
"grad_norm": 0.6198005676269531,
"learning_rate": 1.5474761458826793e-05,
"loss": 1.4777,
"step": 1144
},
{
"epoch": 0.65149359886202,
"grad_norm": 0.567058265209198,
"learning_rate": 1.5353506413956932e-05,
"loss": 1.5108,
"step": 1145
},
{
"epoch": 0.6520625889046942,
"grad_norm": 0.5950532555580139,
"learning_rate": 1.5232688782851068e-05,
"loss": 1.5038,
"step": 1146
},
{
"epoch": 0.6526315789473685,
"grad_norm": 0.6178238987922668,
"learning_rate": 1.511230918983867e-05,
"loss": 1.5458,
"step": 1147
},
{
"epoch": 0.6532005689900426,
"grad_norm": 0.5962685346603394,
"learning_rate": 1.4992368256985546e-05,
"loss": 1.432,
"step": 1148
},
{
"epoch": 0.6537695590327169,
"grad_norm": 0.5979620218276978,
"learning_rate": 1.4872866604090696e-05,
"loss": 1.5035,
"step": 1149
},
{
"epoch": 0.6543385490753911,
"grad_norm": 0.5819264650344849,
"learning_rate": 1.475380484868325e-05,
"loss": 1.4169,
"step": 1150
},
{
"epoch": 0.6549075391180654,
"grad_norm": 0.6469606757164001,
"learning_rate": 1.4635183606018943e-05,
"loss": 1.4442,
"step": 1151
},
{
"epoch": 0.6554765291607397,
"grad_norm": 0.5789610147476196,
"learning_rate": 1.451700348907734e-05,
"loss": 1.122,
"step": 1152
},
{
"epoch": 0.6560455192034139,
"grad_norm": 0.5797150135040283,
"learning_rate": 1.4399265108558379e-05,
"loss": 1.5795,
"step": 1153
},
{
"epoch": 0.6566145092460882,
"grad_norm": 0.6014512777328491,
"learning_rate": 1.4281969072879298e-05,
"loss": 1.3673,
"step": 1154
},
{
"epoch": 0.6571834992887624,
"grad_norm": 0.566061794757843,
"learning_rate": 1.4165115988171596e-05,
"loss": 1.4255,
"step": 1155
},
{
"epoch": 0.6577524893314367,
"grad_norm": 0.599322497844696,
"learning_rate": 1.4048706458277672e-05,
"loss": 1.4538,
"step": 1156
},
{
"epoch": 0.658321479374111,
"grad_norm": 0.6258883476257324,
"learning_rate": 1.3932741084747913e-05,
"loss": 1.5197,
"step": 1157
},
{
"epoch": 0.6588904694167852,
"grad_norm": 0.603754460811615,
"learning_rate": 1.3817220466837566e-05,
"loss": 1.5596,
"step": 1158
},
{
"epoch": 0.6594594594594595,
"grad_norm": 0.5680553317070007,
"learning_rate": 1.3702145201503458e-05,
"loss": 1.3882,
"step": 1159
},
{
"epoch": 0.6600284495021337,
"grad_norm": 0.6317921280860901,
"learning_rate": 1.3587515883401202e-05,
"loss": 1.4051,
"step": 1160
},
{
"epoch": 0.660597439544808,
"grad_norm": 0.5998348593711853,
"learning_rate": 1.3473333104881792e-05,
"loss": 1.5309,
"step": 1161
},
{
"epoch": 0.6611664295874822,
"grad_norm": 0.5694242715835571,
"learning_rate": 1.3359597455988803e-05,
"loss": 1.4933,
"step": 1162
},
{
"epoch": 0.6617354196301565,
"grad_norm": 0.6193349361419678,
"learning_rate": 1.3246309524455291e-05,
"loss": 1.5781,
"step": 1163
},
{
"epoch": 0.6623044096728308,
"grad_norm": 0.5579991340637207,
"learning_rate": 1.3133469895700634e-05,
"loss": 1.3616,
"step": 1164
},
{
"epoch": 0.662873399715505,
"grad_norm": 0.5790702104568481,
"learning_rate": 1.3021079152827631e-05,
"loss": 1.3994,
"step": 1165
},
{
"epoch": 0.6634423897581793,
"grad_norm": 0.5730209946632385,
"learning_rate": 1.2909137876619448e-05,
"loss": 1.3269,
"step": 1166
},
{
"epoch": 0.6640113798008535,
"grad_norm": 0.6066268086433411,
"learning_rate": 1.2797646645536566e-05,
"loss": 1.6239,
"step": 1167
},
{
"epoch": 0.6645803698435278,
"grad_norm": 0.649182140827179,
"learning_rate": 1.2686606035713944e-05,
"loss": 1.7304,
"step": 1168
},
{
"epoch": 0.6651493598862019,
"grad_norm": 0.6383649110794067,
"learning_rate": 1.2576016620957853e-05,
"loss": 1.4477,
"step": 1169
},
{
"epoch": 0.6657183499288762,
"grad_norm": 0.5763673782348633,
"learning_rate": 1.2465878972743028e-05,
"loss": 1.4846,
"step": 1170
},
{
"epoch": 0.6662873399715505,
"grad_norm": 0.5865679383277893,
"learning_rate": 1.2356193660209681e-05,
"loss": 1.5687,
"step": 1171
},
{
"epoch": 0.6668563300142247,
"grad_norm": 0.5898412466049194,
"learning_rate": 1.2246961250160527e-05,
"loss": 1.5227,
"step": 1172
},
{
"epoch": 0.667425320056899,
"grad_norm": 0.6910015344619751,
"learning_rate": 1.2138182307057987e-05,
"loss": 1.245,
"step": 1173
},
{
"epoch": 0.6679943100995732,
"grad_norm": 0.5660498142242432,
"learning_rate": 1.2029857393021094e-05,
"loss": 1.2887,
"step": 1174
},
{
"epoch": 0.6685633001422475,
"grad_norm": 0.5966072082519531,
"learning_rate": 1.1921987067822672e-05,
"loss": 1.3417,
"step": 1175
},
{
"epoch": 0.6691322901849218,
"grad_norm": 0.5854772329330444,
"learning_rate": 1.1814571888886483e-05,
"loss": 1.474,
"step": 1176
},
{
"epoch": 0.669701280227596,
"grad_norm": 0.6000230312347412,
"learning_rate": 1.1707612411284253e-05,
"loss": 1.4276,
"step": 1177
},
{
"epoch": 0.6702702702702703,
"grad_norm": 0.5757988691329956,
"learning_rate": 1.1601109187732928e-05,
"loss": 1.5459,
"step": 1178
},
{
"epoch": 0.6708392603129445,
"grad_norm": 0.5930068492889404,
"learning_rate": 1.149506276859167e-05,
"loss": 1.4149,
"step": 1179
},
{
"epoch": 0.6714082503556188,
"grad_norm": 0.5741011500358582,
"learning_rate": 1.1389473701859121e-05,
"loss": 1.2504,
"step": 1180
},
{
"epoch": 0.671977240398293,
"grad_norm": 0.588571310043335,
"learning_rate": 1.1284342533170545e-05,
"loss": 1.6301,
"step": 1181
},
{
"epoch": 0.6725462304409673,
"grad_norm": 0.5500454306602478,
"learning_rate": 1.1179669805794968e-05,
"loss": 1.4952,
"step": 1182
},
{
"epoch": 0.6731152204836416,
"grad_norm": 0.5811514854431152,
"learning_rate": 1.1075456060632472e-05,
"loss": 1.447,
"step": 1183
},
{
"epoch": 0.6736842105263158,
"grad_norm": 0.6315092444419861,
"learning_rate": 1.0971701836211268e-05,
"loss": 1.5707,
"step": 1184
},
{
"epoch": 0.6742532005689901,
"grad_norm": 0.6309195756912231,
"learning_rate": 1.0868407668684998e-05,
"loss": 1.2443,
"step": 1185
},
{
"epoch": 0.6748221906116643,
"grad_norm": 0.5840954780578613,
"learning_rate": 1.0765574091829933e-05,
"loss": 1.4682,
"step": 1186
},
{
"epoch": 0.6753911806543386,
"grad_norm": 0.6113706827163696,
"learning_rate": 1.0663201637042252e-05,
"loss": 1.4292,
"step": 1187
},
{
"epoch": 0.6759601706970128,
"grad_norm": 0.6047906279563904,
"learning_rate": 1.0561290833335224e-05,
"loss": 1.627,
"step": 1188
},
{
"epoch": 0.676529160739687,
"grad_norm": 0.5755859613418579,
"learning_rate": 1.04598422073366e-05,
"loss": 1.3449,
"step": 1189
},
{
"epoch": 0.6770981507823614,
"grad_norm": 0.5938130021095276,
"learning_rate": 1.0358856283285722e-05,
"loss": 1.389,
"step": 1190
},
{
"epoch": 0.6776671408250355,
"grad_norm": 0.5975162386894226,
"learning_rate": 1.0258333583030955e-05,
"loss": 1.4868,
"step": 1191
},
{
"epoch": 0.6782361308677098,
"grad_norm": 0.6362975239753723,
"learning_rate": 1.0158274626026931e-05,
"loss": 1.6409,
"step": 1192
},
{
"epoch": 0.678805120910384,
"grad_norm": 0.6175844669342041,
"learning_rate": 1.0058679929331827e-05,
"loss": 1.4914,
"step": 1193
},
{
"epoch": 0.6793741109530583,
"grad_norm": 0.5870533585548401,
"learning_rate": 9.959550007604835e-06,
"loss": 1.3655,
"step": 1194
},
{
"epoch": 0.6799431009957326,
"grad_norm": 0.5993149280548096,
"learning_rate": 9.860885373103324e-06,
"loss": 1.4203,
"step": 1195
},
{
"epoch": 0.6805120910384068,
"grad_norm": 0.5798912048339844,
"learning_rate": 9.7626865356803e-06,
"loss": 1.4378,
"step": 1196
},
{
"epoch": 0.6810810810810811,
"grad_norm": 0.5729113221168518,
"learning_rate": 9.664954002781745e-06,
"loss": 1.4054,
"step": 1197
},
{
"epoch": 0.6816500711237553,
"grad_norm": 0.6329131126403809,
"learning_rate": 9.567688279443964e-06,
"loss": 1.4381,
"step": 1198
},
{
"epoch": 0.6822190611664296,
"grad_norm": 0.6088592410087585,
"learning_rate": 9.4708898682911e-06,
"loss": 1.4094,
"step": 1199
},
{
"epoch": 0.6827880512091038,
"grad_norm": 0.5889382362365723,
"learning_rate": 9.374559269532346e-06,
"loss": 1.5365,
"step": 1200
},
{
"epoch": 0.6833570412517781,
"grad_norm": 0.6487043499946594,
"learning_rate": 9.27869698095951e-06,
"loss": 1.4747,
"step": 1201
},
{
"epoch": 0.6839260312944524,
"grad_norm": 0.6006666421890259,
"learning_rate": 9.183303497944361e-06,
"loss": 1.3953,
"step": 1202
},
{
"epoch": 0.6844950213371266,
"grad_norm": 0.5925318002700806,
"learning_rate": 9.088379313436113e-06,
"loss": 1.5679,
"step": 1203
},
{
"epoch": 0.6850640113798009,
"grad_norm": 0.5964149832725525,
"learning_rate": 8.993924917958874e-06,
"loss": 1.4872,
"step": 1204
},
{
"epoch": 0.6856330014224751,
"grad_norm": 0.5567532777786255,
"learning_rate": 8.899940799609096e-06,
"loss": 1.3922,
"step": 1205
},
{
"epoch": 0.6862019914651494,
"grad_norm": 0.5803432464599609,
"learning_rate": 8.806427444053033e-06,
"loss": 1.319,
"step": 1206
},
{
"epoch": 0.6867709815078236,
"grad_norm": 0.583640456199646,
"learning_rate": 8.713385334524283e-06,
"loss": 1.4564,
"step": 1207
},
{
"epoch": 0.6873399715504979,
"grad_norm": 0.6316723227500916,
"learning_rate": 8.620814951821232e-06,
"loss": 1.4586,
"step": 1208
},
{
"epoch": 0.6879089615931722,
"grad_norm": 0.5926545262336731,
"learning_rate": 8.528716774304658e-06,
"loss": 1.5008,
"step": 1209
},
{
"epoch": 0.6884779516358464,
"grad_norm": 0.5738364458084106,
"learning_rate": 8.43709127789517e-06,
"loss": 1.3766,
"step": 1210
},
{
"epoch": 0.6890469416785207,
"grad_norm": 0.5985202193260193,
"learning_rate": 8.345938936070718e-06,
"loss": 1.5175,
"step": 1211
},
{
"epoch": 0.6896159317211948,
"grad_norm": 0.6196452379226685,
"learning_rate": 8.255260219864324e-06,
"loss": 1.6161,
"step": 1212
},
{
"epoch": 0.6901849217638691,
"grad_norm": 0.6303586959838867,
"learning_rate": 8.16505559786146e-06,
"loss": 1.6054,
"step": 1213
},
{
"epoch": 0.6907539118065433,
"grad_norm": 0.5856702327728271,
"learning_rate": 8.07532553619772e-06,
"loss": 1.5131,
"step": 1214
},
{
"epoch": 0.6913229018492176,
"grad_norm": 0.5996472239494324,
"learning_rate": 7.986070498556397e-06,
"loss": 1.3462,
"step": 1215
},
{
"epoch": 0.6918918918918919,
"grad_norm": 0.6016293168067932,
"learning_rate": 7.897290946166037e-06,
"loss": 1.3177,
"step": 1216
},
{
"epoch": 0.6924608819345661,
"grad_norm": 0.5805103182792664,
"learning_rate": 7.808987337798158e-06,
"loss": 1.1701,
"step": 1217
},
{
"epoch": 0.6930298719772404,
"grad_norm": 0.5823555588722229,
"learning_rate": 7.721160129764792e-06,
"loss": 1.275,
"step": 1218
},
{
"epoch": 0.6935988620199146,
"grad_norm": 0.5881485939025879,
"learning_rate": 7.633809775916135e-06,
"loss": 1.3304,
"step": 1219
},
{
"epoch": 0.6941678520625889,
"grad_norm": 0.610157310962677,
"learning_rate": 7.546936727638298e-06,
"loss": 1.325,
"step": 1220
},
{
"epoch": 0.6947368421052632,
"grad_norm": 0.6015896797180176,
"learning_rate": 7.460541433850788e-06,
"loss": 1.4739,
"step": 1221
},
{
"epoch": 0.6953058321479374,
"grad_norm": 0.6073941588401794,
"learning_rate": 7.374624341004388e-06,
"loss": 1.6308,
"step": 1222
},
{
"epoch": 0.6958748221906117,
"grad_norm": 0.5897748470306396,
"learning_rate": 7.289185893078721e-06,
"loss": 1.4808,
"step": 1223
},
{
"epoch": 0.6964438122332859,
"grad_norm": 0.6318244338035583,
"learning_rate": 7.204226531579994e-06,
"loss": 1.5134,
"step": 1224
},
{
"epoch": 0.6970128022759602,
"grad_norm": 0.5809091329574585,
"learning_rate": 7.119746695538765e-06,
"loss": 1.4117,
"step": 1225
},
{
"epoch": 0.6975817923186344,
"grad_norm": 0.6108141541481018,
"learning_rate": 7.0357468215075275e-06,
"loss": 1.3201,
"step": 1226
},
{
"epoch": 0.6981507823613087,
"grad_norm": 0.566813051700592,
"learning_rate": 6.952227343558671e-06,
"loss": 1.502,
"step": 1227
},
{
"epoch": 0.698719772403983,
"grad_norm": 0.5999999046325684,
"learning_rate": 6.869188693282036e-06,
"loss": 1.3958,
"step": 1228
},
{
"epoch": 0.6992887624466572,
"grad_norm": 0.6325690150260925,
"learning_rate": 6.786631299782797e-06,
"loss": 1.4682,
"step": 1229
},
{
"epoch": 0.6998577524893315,
"grad_norm": 0.5865020155906677,
"learning_rate": 6.704555589679262e-06,
"loss": 1.4662,
"step": 1230
},
{
"epoch": 0.7004267425320057,
"grad_norm": 0.5978051424026489,
"learning_rate": 6.622961987100518e-06,
"loss": 1.4549,
"step": 1231
},
{
"epoch": 0.70099573257468,
"grad_norm": 0.6172093152999878,
"learning_rate": 6.541850913684444e-06,
"loss": 1.52,
"step": 1232
},
{
"epoch": 0.7015647226173541,
"grad_norm": 0.6080772280693054,
"learning_rate": 6.461222788575394e-06,
"loss": 1.5765,
"step": 1233
},
{
"epoch": 0.7021337126600284,
"grad_norm": 0.6048703193664551,
"learning_rate": 6.3810780284220495e-06,
"loss": 1.6723,
"step": 1234
},
{
"epoch": 0.7027027027027027,
"grad_norm": 0.5950552225112915,
"learning_rate": 6.301417047375347e-06,
"loss": 1.4492,
"step": 1235
},
{
"epoch": 0.7032716927453769,
"grad_norm": 0.5936811566352844,
"learning_rate": 6.222240257086176e-06,
"loss": 1.4721,
"step": 1236
},
{
"epoch": 0.7038406827880512,
"grad_norm": 0.5990265011787415,
"learning_rate": 6.143548066703475e-06,
"loss": 1.3644,
"step": 1237
},
{
"epoch": 0.7044096728307254,
"grad_norm": 0.5738005638122559,
"learning_rate": 6.065340882871906e-06,
"loss": 1.4847,
"step": 1238
},
{
"epoch": 0.7049786628733997,
"grad_norm": 0.5998217463493347,
"learning_rate": 5.9876191097298475e-06,
"loss": 1.4917,
"step": 1239
},
{
"epoch": 0.705547652916074,
"grad_norm": 0.5693299174308777,
"learning_rate": 5.910383148907395e-06,
"loss": 1.3934,
"step": 1240
},
{
"epoch": 0.7061166429587482,
"grad_norm": 0.5998255014419556,
"learning_rate": 5.8336333995240526e-06,
"loss": 1.6348,
"step": 1241
},
{
"epoch": 0.7066856330014225,
"grad_norm": 0.6056730151176453,
"learning_rate": 5.757370258186889e-06,
"loss": 1.4748,
"step": 1242
},
{
"epoch": 0.7072546230440967,
"grad_norm": 0.6141417622566223,
"learning_rate": 5.6815941189884315e-06,
"loss": 1.4371,
"step": 1243
},
{
"epoch": 0.707823613086771,
"grad_norm": 0.5924522280693054,
"learning_rate": 5.606305373504528e-06,
"loss": 1.4896,
"step": 1244
},
{
"epoch": 0.7083926031294452,
"grad_norm": 0.5907067656517029,
"learning_rate": 5.5315044107925094e-06,
"loss": 1.5258,
"step": 1245
},
{
"epoch": 0.7089615931721195,
"grad_norm": 0.5856897234916687,
"learning_rate": 5.457191617388957e-06,
"loss": 1.3751,
"step": 1246
},
{
"epoch": 0.7095305832147938,
"grad_norm": 0.5863030552864075,
"learning_rate": 5.383367377307857e-06,
"loss": 1.2607,
"step": 1247
},
{
"epoch": 0.710099573257468,
"grad_norm": 0.5891332626342773,
"learning_rate": 5.310032072038651e-06,
"loss": 1.3852,
"step": 1248
},
{
"epoch": 0.7106685633001423,
"grad_norm": 0.5775113701820374,
"learning_rate": 5.237186080544098e-06,
"loss": 1.5867,
"step": 1249
},
{
"epoch": 0.7112375533428165,
"grad_norm": 0.5843526721000671,
"learning_rate": 5.164829779258451e-06,
"loss": 1.5694,
"step": 1250
},
{
"epoch": 0.7118065433854908,
"grad_norm": 0.7019409537315369,
"learning_rate": 5.092963542085483e-06,
"loss": 1.4444,
"step": 1251
},
{
"epoch": 0.712375533428165,
"grad_norm": 0.6279569864273071,
"learning_rate": 5.021587740396505e-06,
"loss": 1.5798,
"step": 1252
},
{
"epoch": 0.7129445234708393,
"grad_norm": 0.6179226040840149,
"learning_rate": 4.950702743028535e-06,
"loss": 1.3976,
"step": 1253
},
{
"epoch": 0.7135135135135136,
"grad_norm": 0.5874016284942627,
"learning_rate": 4.880308916282305e-06,
"loss": 1.5384,
"step": 1254
},
{
"epoch": 0.7140825035561877,
"grad_norm": 0.5691651701927185,
"learning_rate": 4.810406623920427e-06,
"loss": 1.3594,
"step": 1255
},
{
"epoch": 0.714651493598862,
"grad_norm": 0.5660080909729004,
"learning_rate": 4.740996227165462e-06,
"loss": 1.5635,
"step": 1256
},
{
"epoch": 0.7152204836415362,
"grad_norm": 0.6053207516670227,
"learning_rate": 4.672078084698095e-06,
"loss": 1.5981,
"step": 1257
},
{
"epoch": 0.7157894736842105,
"grad_norm": 0.5976085066795349,
"learning_rate": 4.603652552655302e-06,
"loss": 1.5909,
"step": 1258
},
{
"epoch": 0.7163584637268848,
"grad_norm": 0.547666609287262,
"learning_rate": 4.53571998462845e-06,
"loss": 1.3622,
"step": 1259
},
{
"epoch": 0.716927453769559,
"grad_norm": 0.6441154479980469,
"learning_rate": 4.468280731661489e-06,
"loss": 1.4626,
"step": 1260
},
{
"epoch": 0.7174964438122333,
"grad_norm": 0.6044400930404663,
"learning_rate": 4.4013351422491635e-06,
"loss": 1.5432,
"step": 1261
},
{
"epoch": 0.7180654338549075,
"grad_norm": 0.5658133029937744,
"learning_rate": 4.334883562335157e-06,
"loss": 1.4528,
"step": 1262
},
{
"epoch": 0.7186344238975818,
"grad_norm": 0.6291137933731079,
"learning_rate": 4.268926335310408e-06,
"loss": 1.2975,
"step": 1263
},
{
"epoch": 0.719203413940256,
"grad_norm": 0.5724123120307922,
"learning_rate": 4.20346380201122e-06,
"loss": 1.223,
"step": 1264
},
{
"epoch": 0.7197724039829303,
"grad_norm": 0.6324455142021179,
"learning_rate": 4.138496300717565e-06,
"loss": 1.3516,
"step": 1265
},
{
"epoch": 0.7203413940256046,
"grad_norm": 0.5916242599487305,
"learning_rate": 4.0740241671513025e-06,
"loss": 1.6546,
"step": 1266
},
{
"epoch": 0.7209103840682788,
"grad_norm": 0.5780736804008484,
"learning_rate": 4.010047734474454e-06,
"loss": 1.4467,
"step": 1267
},
{
"epoch": 0.7214793741109531,
"grad_norm": 0.580437958240509,
"learning_rate": 3.946567333287566e-06,
"loss": 1.2151,
"step": 1268
},
{
"epoch": 0.7220483641536273,
"grad_norm": 0.631999135017395,
"learning_rate": 3.883583291627823e-06,
"loss": 1.6731,
"step": 1269
},
{
"epoch": 0.7226173541963016,
"grad_norm": 0.5912725329399109,
"learning_rate": 3.821095934967511e-06,
"loss": 1.5419,
"step": 1270
},
{
"epoch": 0.7231863442389758,
"grad_norm": 0.5841814279556274,
"learning_rate": 3.759105586212275e-06,
"loss": 1.36,
"step": 1271
},
{
"epoch": 0.7237553342816501,
"grad_norm": 0.620486319065094,
"learning_rate": 3.6976125656994376e-06,
"loss": 1.4474,
"step": 1272
},
{
"epoch": 0.7243243243243244,
"grad_norm": 0.5620819330215454,
"learning_rate": 3.6366171911963455e-06,
"loss": 1.3565,
"step": 1273
},
{
"epoch": 0.7248933143669986,
"grad_norm": 0.6318161487579346,
"learning_rate": 3.576119777898812e-06,
"loss": 1.5721,
"step": 1274
},
{
"epoch": 0.7254623044096729,
"grad_norm": 0.5643869638442993,
"learning_rate": 3.516120638429332e-06,
"loss": 1.3681,
"step": 1275
},
{
"epoch": 0.726031294452347,
"grad_norm": 0.5829715132713318,
"learning_rate": 3.4566200828356157e-06,
"loss": 1.3699,
"step": 1276
},
{
"epoch": 0.7266002844950213,
"grad_norm": 0.5623791813850403,
"learning_rate": 3.397618418588877e-06,
"loss": 1.3686,
"step": 1277
},
{
"epoch": 0.7271692745376956,
"grad_norm": 0.5907699465751648,
"learning_rate": 3.3391159505823165e-06,
"loss": 1.5019,
"step": 1278
},
{
"epoch": 0.7277382645803698,
"grad_norm": 0.5887323617935181,
"learning_rate": 3.2811129811295416e-06,
"loss": 1.5161,
"step": 1279
},
{
"epoch": 0.7283072546230441,
"grad_norm": 0.6375420093536377,
"learning_rate": 3.2236098099629353e-06,
"loss": 1.53,
"step": 1280
},
{
"epoch": 0.7288762446657183,
"grad_norm": 0.569848358631134,
"learning_rate": 3.16660673423218e-06,
"loss": 1.5462,
"step": 1281
},
{
"epoch": 0.7294452347083926,
"grad_norm": 0.5773903727531433,
"learning_rate": 3.1101040485027043e-06,
"loss": 1.4332,
"step": 1282
},
{
"epoch": 0.7300142247510668,
"grad_norm": 0.5759513974189758,
"learning_rate": 3.0541020447541256e-06,
"loss": 1.4906,
"step": 1283
},
{
"epoch": 0.7305832147937411,
"grad_norm": 0.5894652009010315,
"learning_rate": 2.99860101237881e-06,
"loss": 1.3007,
"step": 1284
},
{
"epoch": 0.7311522048364154,
"grad_norm": 0.5720746517181396,
"learning_rate": 2.9436012381803156e-06,
"loss": 1.5254,
"step": 1285
},
{
"epoch": 0.7317211948790896,
"grad_norm": 0.6133726239204407,
"learning_rate": 2.8891030063719183e-06,
"loss": 1.6029,
"step": 1286
},
{
"epoch": 0.7322901849217639,
"grad_norm": 0.6293920874595642,
"learning_rate": 2.8351065985751766e-06,
"loss": 1.5918,
"step": 1287
},
{
"epoch": 0.7328591749644381,
"grad_norm": 0.5941974520683289,
"learning_rate": 2.7816122938184255e-06,
"loss": 1.43,
"step": 1288
},
{
"epoch": 0.7334281650071124,
"grad_norm": 0.5790094137191772,
"learning_rate": 2.7286203685354063e-06,
"loss": 1.4635,
"step": 1289
},
{
"epoch": 0.7339971550497866,
"grad_norm": 0.593591570854187,
"learning_rate": 2.6761310965637833e-06,
"loss": 1.554,
"step": 1290
},
{
"epoch": 0.7345661450924609,
"grad_norm": 0.6287367939949036,
"learning_rate": 2.62414474914372e-06,
"loss": 1.2736,
"step": 1291
},
{
"epoch": 0.7351351351351352,
"grad_norm": 0.586243748664856,
"learning_rate": 2.5726615949165254e-06,
"loss": 1.6281,
"step": 1292
},
{
"epoch": 0.7357041251778094,
"grad_norm": 0.6094790697097778,
"learning_rate": 2.5216818999232117e-06,
"loss": 1.4495,
"step": 1293
},
{
"epoch": 0.7362731152204837,
"grad_norm": 0.5789740681648254,
"learning_rate": 2.4712059276031816e-06,
"loss": 1.6063,
"step": 1294
},
{
"epoch": 0.7368421052631579,
"grad_norm": 0.5999897122383118,
"learning_rate": 2.421233938792811e-06,
"loss": 1.3805,
"step": 1295
},
{
"epoch": 0.7374110953058322,
"grad_norm": 0.5815314054489136,
"learning_rate": 2.3717661917241117e-06,
"loss": 1.4289,
"step": 1296
},
{
"epoch": 0.7379800853485065,
"grad_norm": 0.5862295031547546,
"learning_rate": 2.322802942023461e-06,
"loss": 1.4672,
"step": 1297
},
{
"epoch": 0.7385490753911806,
"grad_norm": 0.5870852470397949,
"learning_rate": 2.2743444427101525e-06,
"loss": 1.5368,
"step": 1298
},
{
"epoch": 0.7391180654338549,
"grad_norm": 0.5981742143630981,
"learning_rate": 2.2263909441952226e-06,
"loss": 1.4996,
"step": 1299
},
{
"epoch": 0.7396870554765291,
"grad_norm": 0.6381643414497375,
"learning_rate": 2.178942694280095e-06,
"loss": 1.4773,
"step": 1300
},
{
"epoch": 0.7402560455192034,
"grad_norm": 0.5861015915870667,
"learning_rate": 2.1319999381552604e-06,
"loss": 1.3885,
"step": 1301
},
{
"epoch": 0.7408250355618776,
"grad_norm": 0.5836819410324097,
"learning_rate": 2.0855629183990867e-06,
"loss": 1.4594,
"step": 1302
},
{
"epoch": 0.7413940256045519,
"grad_norm": 0.57367342710495,
"learning_rate": 2.039631874976533e-06,
"loss": 1.5536,
"step": 1303
},
{
"epoch": 0.7419630156472262,
"grad_norm": 0.6635316014289856,
"learning_rate": 1.9942070452378836e-06,
"loss": 1.3837,
"step": 1304
},
{
"epoch": 0.7425320056899004,
"grad_norm": 0.6087902784347534,
"learning_rate": 1.9492886639175922e-06,
"loss": 1.5232,
"step": 1305
},
{
"epoch": 0.7431009957325747,
"grad_norm": 0.5868977308273315,
"learning_rate": 1.9048769631329399e-06,
"loss": 1.5394,
"step": 1306
},
{
"epoch": 0.7436699857752489,
"grad_norm": 0.5506064891815186,
"learning_rate": 1.8609721723830132e-06,
"loss": 1.3222,
"step": 1307
},
{
"epoch": 0.7442389758179232,
"grad_norm": 0.6256322860717773,
"learning_rate": 1.8175745185473714e-06,
"loss": 1.6126,
"step": 1308
},
{
"epoch": 0.7448079658605974,
"grad_norm": 0.6551912426948547,
"learning_rate": 1.774684225884904e-06,
"loss": 1.6678,
"step": 1309
},
{
"epoch": 0.7453769559032717,
"grad_norm": 0.6695655584335327,
"learning_rate": 1.7323015160327638e-06,
"loss": 1.4653,
"step": 1310
},
{
"epoch": 0.745945945945946,
"grad_norm": 0.5701092481613159,
"learning_rate": 1.690426608005069e-06,
"loss": 1.4619,
"step": 1311
},
{
"epoch": 0.7465149359886202,
"grad_norm": 0.5779337882995605,
"learning_rate": 1.6490597181919254e-06,
"loss": 1.3819,
"step": 1312
},
{
"epoch": 0.7470839260312945,
"grad_norm": 0.6129727959632874,
"learning_rate": 1.6082010603582053e-06,
"loss": 1.5916,
"step": 1313
},
{
"epoch": 0.7476529160739687,
"grad_norm": 0.627040445804596,
"learning_rate": 1.567850845642449e-06,
"loss": 1.4437,
"step": 1314
},
{
"epoch": 0.748221906116643,
"grad_norm": 0.5893445611000061,
"learning_rate": 1.5280092825558645e-06,
"loss": 1.4348,
"step": 1315
},
{
"epoch": 0.7487908961593173,
"grad_norm": 0.5637263059616089,
"learning_rate": 1.4886765769811072e-06,
"loss": 1.4235,
"step": 1316
},
{
"epoch": 0.7493598862019915,
"grad_norm": 0.5983802080154419,
"learning_rate": 1.4498529321713584e-06,
"loss": 1.5322,
"step": 1317
},
{
"epoch": 0.7499288762446658,
"grad_norm": 0.6314471364021301,
"learning_rate": 1.4115385487491583e-06,
"loss": 1.511,
"step": 1318
},
{
"epoch": 0.7504978662873399,
"grad_norm": 0.6288767457008362,
"learning_rate": 1.3737336247054644e-06,
"loss": 1.5245,
"step": 1319
},
{
"epoch": 0.7510668563300142,
"grad_norm": 0.5633329153060913,
"learning_rate": 1.3364383553985726e-06,
"loss": 1.5002,
"step": 1320
},
{
"epoch": 0.7516358463726884,
"grad_norm": 0.5696760416030884,
"learning_rate": 1.2996529335530749e-06,
"loss": 1.4833,
"step": 1321
},
{
"epoch": 0.7522048364153627,
"grad_norm": 0.6070805191993713,
"learning_rate": 1.2633775492589816e-06,
"loss": 1.3631,
"step": 1322
},
{
"epoch": 0.752773826458037,
"grad_norm": 0.6544724702835083,
"learning_rate": 1.2276123899706227e-06,
"loss": 1.3451,
"step": 1323
},
{
"epoch": 0.7533428165007112,
"grad_norm": 0.5900003910064697,
"learning_rate": 1.1923576405057258e-06,
"loss": 1.4344,
"step": 1324
},
{
"epoch": 0.7539118065433855,
"grad_norm": 0.5852407217025757,
"learning_rate": 1.1576134830444619e-06,
"loss": 1.3403,
"step": 1325
},
{
"epoch": 0.7544807965860597,
"grad_norm": 0.557827353477478,
"learning_rate": 1.1233800971285013e-06,
"loss": 1.2692,
"step": 1326
},
{
"epoch": 0.755049786628734,
"grad_norm": 0.5728473663330078,
"learning_rate": 1.0896576596600705e-06,
"loss": 1.2999,
"step": 1327
},
{
"epoch": 0.7556187766714082,
"grad_norm": 0.6241177320480347,
"learning_rate": 1.0564463449010852e-06,
"loss": 1.3683,
"step": 1328
},
{
"epoch": 0.7561877667140825,
"grad_norm": 0.5648365020751953,
"learning_rate": 1.0237463244721747e-06,
"loss": 1.4297,
"step": 1329
},
{
"epoch": 0.7567567567567568,
"grad_norm": 0.590801477432251,
"learning_rate": 9.915577673518695e-07,
"loss": 1.5818,
"step": 1330
},
{
"epoch": 0.757325746799431,
"grad_norm": 0.6130539178848267,
"learning_rate": 9.59880839875682e-07,
"loss": 1.2326,
"step": 1331
},
{
"epoch": 0.7578947368421053,
"grad_norm": 0.5840870141983032,
"learning_rate": 9.287157057352502e-07,
"loss": 1.4967,
"step": 1332
},
{
"epoch": 0.7584637268847795,
"grad_norm": 0.5912277102470398,
"learning_rate": 8.980625259775277e-07,
"loss": 1.5008,
"step": 1333
},
{
"epoch": 0.7590327169274538,
"grad_norm": 0.5769196152687073,
"learning_rate": 8.679214590039064e-07,
"loss": 1.29,
"step": 1334
},
{
"epoch": 0.7596017069701281,
"grad_norm": 0.6438156366348267,
"learning_rate": 8.382926605694064e-07,
"loss": 1.4775,
"step": 1335
},
{
"epoch": 0.7601706970128023,
"grad_norm": 0.5996107459068298,
"learning_rate": 8.091762837819094e-07,
"loss": 1.5507,
"step": 1336
},
{
"epoch": 0.7607396870554766,
"grad_norm": 0.6058631539344788,
"learning_rate": 7.80572479101327e-07,
"loss": 1.4726,
"step": 1337
},
{
"epoch": 0.7613086770981508,
"grad_norm": 0.5997673273086548,
"learning_rate": 7.524813943388331e-07,
"loss": 1.5108,
"step": 1338
},
{
"epoch": 0.761877667140825,
"grad_norm": 0.5843853950500488,
"learning_rate": 7.249031746561108e-07,
"loss": 1.4569,
"step": 1339
},
{
"epoch": 0.7624466571834992,
"grad_norm": 0.5930002927780151,
"learning_rate": 6.978379625645959e-07,
"loss": 1.4858,
"step": 1340
},
{
"epoch": 0.7630156472261735,
"grad_norm": 0.5973467826843262,
"learning_rate": 6.712858979247116e-07,
"loss": 1.5819,
"step": 1341
},
{
"epoch": 0.7635846372688478,
"grad_norm": 0.601449191570282,
"learning_rate": 6.452471179452135e-07,
"loss": 1.6227,
"step": 1342
},
{
"epoch": 0.764153627311522,
"grad_norm": 0.5814241170883179,
"learning_rate": 6.197217571824232e-07,
"loss": 1.3806,
"step": 1343
},
{
"epoch": 0.7647226173541963,
"grad_norm": 0.5642287731170654,
"learning_rate": 5.947099475395402e-07,
"loss": 1.1583,
"step": 1344
},
{
"epoch": 0.7652916073968705,
"grad_norm": 0.5667275190353394,
"learning_rate": 5.702118182659866e-07,
"loss": 1.5422,
"step": 1345
},
{
"epoch": 0.7658605974395448,
"grad_norm": 0.5716063976287842,
"learning_rate": 5.462274959567193e-07,
"loss": 1.4454,
"step": 1346
},
{
"epoch": 0.766429587482219,
"grad_norm": 0.5906545519828796,
"learning_rate": 5.227571045515633e-07,
"loss": 1.4336,
"step": 1347
},
{
"epoch": 0.7669985775248933,
"grad_norm": 0.5766403079032898,
"learning_rate": 4.998007653346126e-07,
"loss": 1.3452,
"step": 1348
},
{
"epoch": 0.7675675675675676,
"grad_norm": 0.5666573643684387,
"learning_rate": 4.773585969335636e-07,
"loss": 1.4239,
"step": 1349
},
{
"epoch": 0.7681365576102418,
"grad_norm": 0.586741030216217,
"learning_rate": 4.554307153191273e-07,
"loss": 1.4837,
"step": 1350
},
{
"epoch": 0.7687055476529161,
"grad_norm": 0.5627419948577881,
"learning_rate": 4.340172338043846e-07,
"loss": 1.1588,
"step": 1351
},
{
"epoch": 0.7692745376955903,
"grad_norm": 0.6001150608062744,
"learning_rate": 4.131182630442876e-07,
"loss": 1.6122,
"step": 1352
},
{
"epoch": 0.7698435277382646,
"grad_norm": 0.6114206314086914,
"learning_rate": 3.9273391103499257e-07,
"loss": 1.493,
"step": 1353
},
{
"epoch": 0.7704125177809389,
"grad_norm": 0.5789377689361572,
"learning_rate": 3.728642831133833e-07,
"loss": 1.4742,
"step": 1354
},
{
"epoch": 0.7709815078236131,
"grad_norm": 0.6265093684196472,
"learning_rate": 3.5350948195645993e-07,
"loss": 1.4284,
"step": 1355
},
{
"epoch": 0.7715504978662874,
"grad_norm": 0.6009383201599121,
"learning_rate": 3.3466960758082867e-07,
"loss": 1.3373,
"step": 1356
},
{
"epoch": 0.7721194879089616,
"grad_norm": 0.5908554196357727,
"learning_rate": 3.163447573422351e-07,
"loss": 1.4147,
"step": 1357
},
{
"epoch": 0.7726884779516359,
"grad_norm": 0.614896297454834,
"learning_rate": 2.985350259349762e-07,
"loss": 1.3802,
"step": 1358
},
{
"epoch": 0.77325746799431,
"grad_norm": 0.6063140630722046,
"learning_rate": 2.812405053914891e-07,
"loss": 1.66,
"step": 1359
},
{
"epoch": 0.7738264580369844,
"grad_norm": 0.5757827758789062,
"learning_rate": 2.644612850818073e-07,
"loss": 1.4361,
"step": 1360
},
{
"epoch": 0.7743954480796587,
"grad_norm": 0.5920888185501099,
"learning_rate": 2.481974517131502e-07,
"loss": 1.3681,
"step": 1361
},
{
"epoch": 0.7749644381223328,
"grad_norm": 0.5669330358505249,
"learning_rate": 2.324490893294673e-07,
"loss": 1.5391,
"step": 1362
},
{
"epoch": 0.7755334281650071,
"grad_norm": 0.5745005011558533,
"learning_rate": 2.172162793109611e-07,
"loss": 1.3118,
"step": 1363
},
{
"epoch": 0.7761024182076813,
"grad_norm": 0.5584404468536377,
"learning_rate": 2.0249910037374308e-07,
"loss": 1.4001,
"step": 1364
},
{
"epoch": 0.7766714082503556,
"grad_norm": 0.6618481874465942,
"learning_rate": 1.8829762856933387e-07,
"loss": 1.4373,
"step": 1365
},
{
"epoch": 0.7772403982930298,
"grad_norm": 0.5788954496383667,
"learning_rate": 1.7461193728436353e-07,
"loss": 1.5453,
"step": 1366
},
{
"epoch": 0.7778093883357041,
"grad_norm": 0.6013731956481934,
"learning_rate": 1.614420972401165e-07,
"loss": 1.5133,
"step": 1367
},
{
"epoch": 0.7783783783783784,
"grad_norm": 0.622704029083252,
"learning_rate": 1.4878817649220944e-07,
"loss": 1.4024,
"step": 1368
},
{
"epoch": 0.7789473684210526,
"grad_norm": 0.5754665732383728,
"learning_rate": 1.36650240430225e-07,
"loss": 1.2369,
"step": 1369
},
{
"epoch": 0.7795163584637269,
"grad_norm": 0.5812003016471863,
"learning_rate": 1.250283517774009e-07,
"loss": 1.5473,
"step": 1370
},
{
"epoch": 0.7800853485064011,
"grad_norm": 0.6443690657615662,
"learning_rate": 1.1392257059023026e-07,
"loss": 1.5182,
"step": 1371
},
{
"epoch": 0.7806543385490754,
"grad_norm": 0.5886285901069641,
"learning_rate": 1.0333295425825063e-07,
"loss": 1.6711,
"step": 1372
},
{
"epoch": 0.7812233285917497,
"grad_norm": 0.5866546034812927,
"learning_rate": 9.325955750367766e-08,
"loss": 1.326,
"step": 1373
},
{
"epoch": 0.7817923186344239,
"grad_norm": 0.5975569486618042,
"learning_rate": 8.370243238113862e-08,
"loss": 1.522,
"step": 1374
},
{
"epoch": 0.7823613086770982,
"grad_norm": 0.5648385286331177,
"learning_rate": 7.466162827742817e-08,
"loss": 1.4345,
"step": 1375
},
{
"epoch": 0.7829302987197724,
"grad_norm": 0.6075783371925354,
"learning_rate": 6.61371919112197e-08,
"loss": 1.4661,
"step": 1376
},
{
"epoch": 0.7834992887624467,
"grad_norm": 0.6267833113670349,
"learning_rate": 5.812916733284324e-08,
"loss": 1.5151,
"step": 1377
},
{
"epoch": 0.7840682788051209,
"grad_norm": 0.5820707082748413,
"learning_rate": 5.063759592404127e-08,
"loss": 1.4866,
"step": 1378
},
{
"epoch": 0.7846372688477952,
"grad_norm": 0.5778554677963257,
"learning_rate": 4.366251639777996e-08,
"loss": 1.3435,
"step": 1379
},
{
"epoch": 0.7852062588904695,
"grad_norm": 0.5690316557884216,
"learning_rate": 3.720396479803823e-08,
"loss": 1.4896,
"step": 1380
},
{
"epoch": 0.7857752489331437,
"grad_norm": 0.5569249391555786,
"learning_rate": 3.126197449959678e-08,
"loss": 1.358,
"step": 1381
},
{
"epoch": 0.786344238975818,
"grad_norm": 0.6170508861541748,
"learning_rate": 2.5836576207916018e-08,
"loss": 1.4965,
"step": 1382
},
{
"epoch": 0.7869132290184921,
"grad_norm": 0.5334784388542175,
"learning_rate": 2.092779795892508e-08,
"loss": 1.2593,
"step": 1383
},
{
"epoch": 0.7874822190611664,
"grad_norm": 0.6333361864089966,
"learning_rate": 1.6535665118910802e-08,
"loss": 1.3485,
"step": 1384
},
{
"epoch": 0.7880512091038406,
"grad_norm": 0.6247609853744507,
"learning_rate": 1.2660200384384536e-08,
"loss": 1.4111,
"step": 1385
},
{
"epoch": 0.7886201991465149,
"grad_norm": 0.5764187574386597,
"learning_rate": 9.301423781926666e-09,
"loss": 1.201,
"step": 1386
},
{
"epoch": 0.7891891891891892,
"grad_norm": 0.5923343896865845,
"learning_rate": 6.459352668164442e-09,
"loss": 1.5227,
"step": 1387
},
{
"epoch": 0.7897581792318634,
"grad_norm": 0.578344464302063,
"learning_rate": 4.134001729583226e-09,
"loss": 1.3055,
"step": 1388
},
{
"epoch": 0.7903271692745377,
"grad_norm": 0.595220148563385,
"learning_rate": 2.3253829825153894e-09,
"loss": 1.5065,
"step": 1389
},
{
"epoch": 0.7908961593172119,
"grad_norm": 0.5846388339996338,
"learning_rate": 1.033505773062604e-09,
"loss": 1.5331,
"step": 1390
},
{
"epoch": 0.7914651493598862,
"grad_norm": 0.5945976376533508,
"learning_rate": 2.5837677706253003e-10,
"loss": 1.5147,
"step": 1391
},
{
"epoch": 0.7920341394025604,
"grad_norm": 0.6109771728515625,
"learning_rate": 0.0,
"loss": 1.508,
"step": 1392
},
{
"epoch": 0.7920341394025604,
"eval_loss": 1.461509108543396,
"eval_runtime": 16.4544,
"eval_samples_per_second": 44.973,
"eval_steps_per_second": 22.486,
"step": 1392
}
],
"logging_steps": 1,
"max_steps": 1392,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 348,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.474815762936627e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}