{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.000575926281436,
"eval_steps": 326,
"global_step": 1303,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007679017085813016,
"grad_norm": 1.3421963453292847,
"learning_rate": 2e-05,
"loss": 1.819,
"step": 1
},
{
"epoch": 0.0007679017085813016,
"eval_loss": 1.6877703666687012,
"eval_runtime": 19.7388,
"eval_samples_per_second": 27.813,
"eval_steps_per_second": 13.932,
"step": 1
},
{
"epoch": 0.0015358034171626032,
"grad_norm": 1.0465058088302612,
"learning_rate": 4e-05,
"loss": 1.2514,
"step": 2
},
{
"epoch": 0.002303705125743905,
"grad_norm": 1.2153908014297485,
"learning_rate": 6e-05,
"loss": 1.6968,
"step": 3
},
{
"epoch": 0.0030716068343252063,
"grad_norm": 1.4489881992340088,
"learning_rate": 8e-05,
"loss": 1.6527,
"step": 4
},
{
"epoch": 0.003839508542906508,
"grad_norm": 1.383280873298645,
"learning_rate": 0.0001,
"loss": 1.7778,
"step": 5
},
{
"epoch": 0.00460741025148781,
"grad_norm": 1.1447311639785767,
"learning_rate": 0.00012,
"loss": 1.5621,
"step": 6
},
{
"epoch": 0.005375311960069111,
"grad_norm": 1.3640100955963135,
"learning_rate": 0.00014,
"loss": 1.5738,
"step": 7
},
{
"epoch": 0.006143213668650413,
"grad_norm": 1.419996738433838,
"learning_rate": 0.00016,
"loss": 1.3716,
"step": 8
},
{
"epoch": 0.006911115377231714,
"grad_norm": 1.2156635522842407,
"learning_rate": 0.00018,
"loss": 1.129,
"step": 9
},
{
"epoch": 0.007679017085813016,
"grad_norm": 1.6312865018844604,
"learning_rate": 0.0002,
"loss": 1.4983,
"step": 10
},
{
"epoch": 0.008446918794394317,
"grad_norm": 1.8767600059509277,
"learning_rate": 0.00019999970482981582,
"loss": 1.1722,
"step": 11
},
{
"epoch": 0.00921482050297562,
"grad_norm": 2.1750783920288086,
"learning_rate": 0.0001999988193210057,
"loss": 1.689,
"step": 12
},
{
"epoch": 0.00998272221155692,
"grad_norm": 2.1488802433013916,
"learning_rate": 0.00019999734347879723,
"loss": 1.3221,
"step": 13
},
{
"epoch": 0.010750623920138222,
"grad_norm": 2.133239269256592,
"learning_rate": 0.0001999952773119029,
"loss": 1.4366,
"step": 14
},
{
"epoch": 0.011518525628719524,
"grad_norm": 1.99974524974823,
"learning_rate": 0.00019999262083252007,
"loss": 1.2169,
"step": 15
},
{
"epoch": 0.012286427337300825,
"grad_norm": 1.5727359056472778,
"learning_rate": 0.00019998937405633105,
"loss": 1.1514,
"step": 16
},
{
"epoch": 0.013054329045882128,
"grad_norm": 2.484224557876587,
"learning_rate": 0.00019998553700250284,
"loss": 1.6751,
"step": 17
},
{
"epoch": 0.013822230754463429,
"grad_norm": 2.1434097290039062,
"learning_rate": 0.00019998110969368717,
"loss": 1.7869,
"step": 18
},
{
"epoch": 0.01459013246304473,
"grad_norm": 1.8864972591400146,
"learning_rate": 0.00019997609215602019,
"loss": 1.3473,
"step": 19
},
{
"epoch": 0.015358034171626032,
"grad_norm": 1.396660327911377,
"learning_rate": 0.00019997048441912246,
"loss": 1.0329,
"step": 20
},
{
"epoch": 0.016125935880207335,
"grad_norm": 1.8384391069412231,
"learning_rate": 0.0001999642865160987,
"loss": 1.5533,
"step": 21
},
{
"epoch": 0.016893837588788634,
"grad_norm": 1.6881628036499023,
"learning_rate": 0.0001999574984835377,
"loss": 1.4798,
"step": 22
},
{
"epoch": 0.017661739297369937,
"grad_norm": 1.6988723278045654,
"learning_rate": 0.00019995012036151186,
"loss": 1.3521,
"step": 23
},
{
"epoch": 0.01842964100595124,
"grad_norm": 1.5781642198562622,
"learning_rate": 0.00019994215219357728,
"loss": 1.1237,
"step": 24
},
{
"epoch": 0.01919754271453254,
"grad_norm": 1.8065123558044434,
"learning_rate": 0.00019993359402677323,
"loss": 1.4728,
"step": 25
},
{
"epoch": 0.01996544442311384,
"grad_norm": 1.6945093870162964,
"learning_rate": 0.00019992444591162206,
"loss": 1.2896,
"step": 26
},
{
"epoch": 0.020733346131695144,
"grad_norm": 1.4968225955963135,
"learning_rate": 0.00019991470790212877,
"loss": 1.3827,
"step": 27
},
{
"epoch": 0.021501247840276443,
"grad_norm": 1.4860451221466064,
"learning_rate": 0.00019990438005578075,
"loss": 1.4381,
"step": 28
},
{
"epoch": 0.022269149548857746,
"grad_norm": 1.543802261352539,
"learning_rate": 0.00019989346243354746,
"loss": 1.3484,
"step": 29
},
{
"epoch": 0.02303705125743905,
"grad_norm": 1.5513710975646973,
"learning_rate": 0.00019988195509988005,
"loss": 1.3842,
"step": 30
},
{
"epoch": 0.023804952966020348,
"grad_norm": 1.6369807720184326,
"learning_rate": 0.00019986985812271092,
"loss": 1.242,
"step": 31
},
{
"epoch": 0.02457285467460165,
"grad_norm": 1.7637338638305664,
"learning_rate": 0.00019985717157345345,
"loss": 1.1865,
"step": 32
},
{
"epoch": 0.025340756383182953,
"grad_norm": 1.4395427703857422,
"learning_rate": 0.00019984389552700144,
"loss": 1.2769,
"step": 33
},
{
"epoch": 0.026108658091764256,
"grad_norm": 1.8801194429397583,
"learning_rate": 0.0001998300300617287,
"loss": 1.7346,
"step": 34
},
{
"epoch": 0.026876559800345555,
"grad_norm": 1.7689722776412964,
"learning_rate": 0.00019981557525948875,
"loss": 1.4345,
"step": 35
},
{
"epoch": 0.027644461508926858,
"grad_norm": 1.1703026294708252,
"learning_rate": 0.00019980053120561411,
"loss": 0.8503,
"step": 36
},
{
"epoch": 0.02841236321750816,
"grad_norm": 1.4274190664291382,
"learning_rate": 0.00019978489798891584,
"loss": 1.2453,
"step": 37
},
{
"epoch": 0.02918026492608946,
"grad_norm": 1.5258653163909912,
"learning_rate": 0.00019976867570168318,
"loss": 1.2976,
"step": 38
},
{
"epoch": 0.029948166634670762,
"grad_norm": 1.3230305910110474,
"learning_rate": 0.00019975186443968286,
"loss": 1.419,
"step": 39
},
{
"epoch": 0.030716068343252065,
"grad_norm": 1.4891058206558228,
"learning_rate": 0.0001997344643021585,
"loss": 1.2045,
"step": 40
},
{
"epoch": 0.031483970051833364,
"grad_norm": 1.3065136671066284,
"learning_rate": 0.00019971647539183013,
"loss": 1.2106,
"step": 41
},
{
"epoch": 0.03225187176041467,
"grad_norm": 1.5086220502853394,
"learning_rate": 0.00019969789781489362,
"loss": 1.4317,
"step": 42
},
{
"epoch": 0.03301977346899597,
"grad_norm": 1.7055323123931885,
"learning_rate": 0.00019967873168101984,
"loss": 1.5407,
"step": 43
},
{
"epoch": 0.03378767517757727,
"grad_norm": 1.1143574714660645,
"learning_rate": 0.00019965897710335422,
"loss": 0.9124,
"step": 44
},
{
"epoch": 0.034555576886158575,
"grad_norm": 1.7933096885681152,
"learning_rate": 0.00019963863419851605,
"loss": 1.5175,
"step": 45
},
{
"epoch": 0.035323478594739874,
"grad_norm": 1.963930368423462,
"learning_rate": 0.00019961770308659767,
"loss": 1.4704,
"step": 46
},
{
"epoch": 0.03609138030332117,
"grad_norm": 1.2341890335083008,
"learning_rate": 0.00019959618389116387,
"loss": 0.8867,
"step": 47
},
{
"epoch": 0.03685928201190248,
"grad_norm": 1.3576570749282837,
"learning_rate": 0.0001995740767392512,
"loss": 1.5176,
"step": 48
},
{
"epoch": 0.03762718372048378,
"grad_norm": 1.2808408737182617,
"learning_rate": 0.0001995513817613671,
"loss": 1.076,
"step": 49
},
{
"epoch": 0.03839508542906508,
"grad_norm": 1.597029209136963,
"learning_rate": 0.00019952809909148914,
"loss": 1.7766,
"step": 50
},
{
"epoch": 0.039162987137646384,
"grad_norm": 1.302276849746704,
"learning_rate": 0.0001995042288670643,
"loss": 1.1456,
"step": 51
},
{
"epoch": 0.03993088884622768,
"grad_norm": 1.564570665359497,
"learning_rate": 0.00019947977122900822,
"loss": 1.428,
"step": 52
},
{
"epoch": 0.04069879055480898,
"grad_norm": 1.4207552671432495,
"learning_rate": 0.0001994547263217042,
"loss": 1.4306,
"step": 53
},
{
"epoch": 0.04146669226339029,
"grad_norm": 1.530429720878601,
"learning_rate": 0.00019942909429300238,
"loss": 1.1629,
"step": 54
},
{
"epoch": 0.04223459397197159,
"grad_norm": 1.1206613779067993,
"learning_rate": 0.00019940287529421902,
"loss": 1.2482,
"step": 55
},
{
"epoch": 0.04300249568055289,
"grad_norm": 1.2022866010665894,
"learning_rate": 0.00019937606948013548,
"loss": 1.208,
"step": 56
},
{
"epoch": 0.04377039738913419,
"grad_norm": 1.7056037187576294,
"learning_rate": 0.00019934867700899722,
"loss": 1.2333,
"step": 57
},
{
"epoch": 0.04453829909771549,
"grad_norm": 1.274591088294983,
"learning_rate": 0.00019932069804251312,
"loss": 0.8844,
"step": 58
},
{
"epoch": 0.04530620080629679,
"grad_norm": 1.6336207389831543,
"learning_rate": 0.0001992921327458543,
"loss": 1.37,
"step": 59
},
{
"epoch": 0.0460741025148781,
"grad_norm": 1.6932519674301147,
"learning_rate": 0.00019926298128765323,
"loss": 1.4952,
"step": 60
},
{
"epoch": 0.046842004223459396,
"grad_norm": 1.6268481016159058,
"learning_rate": 0.00019923324384000276,
"loss": 1.4606,
"step": 61
},
{
"epoch": 0.047609905932040696,
"grad_norm": 1.5920450687408447,
"learning_rate": 0.00019920292057845499,
"loss": 1.4074,
"step": 62
},
{
"epoch": 0.048377807640622,
"grad_norm": 1.5211634635925293,
"learning_rate": 0.00019917201168202043,
"loss": 1.4462,
"step": 63
},
{
"epoch": 0.0491457093492033,
"grad_norm": 1.4284402132034302,
"learning_rate": 0.00019914051733316678,
"loss": 0.8046,
"step": 64
},
{
"epoch": 0.0499136110577846,
"grad_norm": 1.6174309253692627,
"learning_rate": 0.00019910843771781783,
"loss": 1.5238,
"step": 65
},
{
"epoch": 0.050681512766365906,
"grad_norm": 1.4781291484832764,
"learning_rate": 0.00019907577302535255,
"loss": 1.3418,
"step": 66
},
{
"epoch": 0.051449414474947205,
"grad_norm": 1.597865343093872,
"learning_rate": 0.00019904252344860382,
"loss": 1.1921,
"step": 67
},
{
"epoch": 0.05221731618352851,
"grad_norm": 1.5983515977859497,
"learning_rate": 0.00019900868918385726,
"loss": 1.5806,
"step": 68
},
{
"epoch": 0.05298521789210981,
"grad_norm": 2.1672117710113525,
"learning_rate": 0.00019897427043085022,
"loss": 1.5998,
"step": 69
},
{
"epoch": 0.05375311960069111,
"grad_norm": 1.3775156736373901,
"learning_rate": 0.0001989392673927705,
"loss": 1.2913,
"step": 70
},
{
"epoch": 0.054521021309272416,
"grad_norm": 1.5657135248184204,
"learning_rate": 0.00019890368027625517,
"loss": 1.1062,
"step": 71
},
{
"epoch": 0.055288923017853715,
"grad_norm": 1.1934847831726074,
"learning_rate": 0.00019886750929138934,
"loss": 1.126,
"step": 72
},
{
"epoch": 0.056056824726435014,
"grad_norm": 1.964847207069397,
"learning_rate": 0.0001988307546517049,
"loss": 1.5169,
"step": 73
},
{
"epoch": 0.05682472643501632,
"grad_norm": 1.7718520164489746,
"learning_rate": 0.00019879341657417935,
"loss": 1.1245,
"step": 74
},
{
"epoch": 0.05759262814359762,
"grad_norm": 1.3038172721862793,
"learning_rate": 0.00019875549527923449,
"loss": 1.0887,
"step": 75
},
{
"epoch": 0.05836052985217892,
"grad_norm": 1.4829697608947754,
"learning_rate": 0.00019871699099073493,
"loss": 1.2666,
"step": 76
},
{
"epoch": 0.059128431560760225,
"grad_norm": 1.5347778797149658,
"learning_rate": 0.0001986779039359871,
"loss": 1.2203,
"step": 77
},
{
"epoch": 0.059896333269341524,
"grad_norm": 1.6270524263381958,
"learning_rate": 0.00019863823434573762,
"loss": 1.4289,
"step": 78
},
{
"epoch": 0.060664234977922823,
"grad_norm": 1.3492023944854736,
"learning_rate": 0.00019859798245417217,
"loss": 1.2448,
"step": 79
},
{
"epoch": 0.06143213668650413,
"grad_norm": 1.7291414737701416,
"learning_rate": 0.0001985571484989138,
"loss": 1.2516,
"step": 80
},
{
"epoch": 0.06220003839508543,
"grad_norm": 1.7101045846939087,
"learning_rate": 0.00019851573272102195,
"loss": 1.2677,
"step": 81
},
{
"epoch": 0.06296794010366673,
"grad_norm": 1.4374966621398926,
"learning_rate": 0.0001984737353649906,
"loss": 1.1178,
"step": 82
},
{
"epoch": 0.06373584181224803,
"grad_norm": 1.5133461952209473,
"learning_rate": 0.00019843115667874707,
"loss": 1.46,
"step": 83
},
{
"epoch": 0.06450374352082934,
"grad_norm": 1.594156265258789,
"learning_rate": 0.00019838799691365065,
"loss": 1.2635,
"step": 84
},
{
"epoch": 0.06527164522941063,
"grad_norm": 1.4688767194747925,
"learning_rate": 0.00019834425632449075,
"loss": 1.1498,
"step": 85
},
{
"epoch": 0.06603954693799194,
"grad_norm": 1.7303012609481812,
"learning_rate": 0.00019829993516948577,
"loss": 1.775,
"step": 86
},
{
"epoch": 0.06680744864657324,
"grad_norm": 1.2842633724212646,
"learning_rate": 0.00019825503371028136,
"loss": 1.1091,
"step": 87
},
{
"epoch": 0.06757535035515454,
"grad_norm": 1.4563853740692139,
"learning_rate": 0.000198209552211949,
"loss": 1.4345,
"step": 88
},
{
"epoch": 0.06834325206373584,
"grad_norm": 1.344085693359375,
"learning_rate": 0.00019816349094298427,
"loss": 1.4728,
"step": 89
},
{
"epoch": 0.06911115377231715,
"grad_norm": 1.4741275310516357,
"learning_rate": 0.0001981168501753055,
"loss": 1.4337,
"step": 90
},
{
"epoch": 0.06987905548089844,
"grad_norm": 1.4946739673614502,
"learning_rate": 0.0001980696301842519,
"loss": 1.4202,
"step": 91
},
{
"epoch": 0.07064695718947975,
"grad_norm": 1.2858799695968628,
"learning_rate": 0.00019802183124858222,
"loss": 0.9181,
"step": 92
},
{
"epoch": 0.07141485889806105,
"grad_norm": 1.789942979812622,
"learning_rate": 0.00019797345365047284,
"loss": 1.4772,
"step": 93
},
{
"epoch": 0.07218276060664235,
"grad_norm": 1.4283826351165771,
"learning_rate": 0.0001979244976755162,
"loss": 1.3747,
"step": 94
},
{
"epoch": 0.07295066231522365,
"grad_norm": 1.5858755111694336,
"learning_rate": 0.00019787496361271925,
"loss": 1.5726,
"step": 95
},
{
"epoch": 0.07371856402380496,
"grad_norm": 1.5366917848587036,
"learning_rate": 0.00019782485175450155,
"loss": 1.4846,
"step": 96
},
{
"epoch": 0.07448646573238625,
"grad_norm": 1.363489031791687,
"learning_rate": 0.0001977741623966936,
"loss": 1.211,
"step": 97
},
{
"epoch": 0.07525436744096756,
"grad_norm": 1.6271450519561768,
"learning_rate": 0.00019772289583853514,
"loss": 1.51,
"step": 98
},
{
"epoch": 0.07602226914954886,
"grad_norm": 1.333284854888916,
"learning_rate": 0.00019767105238267338,
"loss": 1.2552,
"step": 99
},
{
"epoch": 0.07679017085813016,
"grad_norm": 1.5619887113571167,
"learning_rate": 0.00019761863233516117,
"loss": 1.4492,
"step": 100
},
{
"epoch": 0.07755807256671146,
"grad_norm": 1.3601068258285522,
"learning_rate": 0.0001975656360054552,
"loss": 1.2376,
"step": 101
},
{
"epoch": 0.07832597427529277,
"grad_norm": 1.6772176027297974,
"learning_rate": 0.0001975120637064142,
"loss": 1.4527,
"step": 102
},
{
"epoch": 0.07909387598387406,
"grad_norm": 1.5059335231781006,
"learning_rate": 0.00019745791575429705,
"loss": 1.3531,
"step": 103
},
{
"epoch": 0.07986177769245537,
"grad_norm": 1.6139791011810303,
"learning_rate": 0.00019740319246876106,
"loss": 1.5811,
"step": 104
},
{
"epoch": 0.08062967940103667,
"grad_norm": 1.6082731485366821,
"learning_rate": 0.00019734789417285976,
"loss": 1.3414,
"step": 105
},
{
"epoch": 0.08139758110961796,
"grad_norm": 1.5934547185897827,
"learning_rate": 0.0001972920211930414,
"loss": 1.3688,
"step": 106
},
{
"epoch": 0.08216548281819927,
"grad_norm": 1.2157509326934814,
"learning_rate": 0.0001972355738591467,
"loss": 0.8838,
"step": 107
},
{
"epoch": 0.08293338452678058,
"grad_norm": 1.6874890327453613,
"learning_rate": 0.00019717855250440705,
"loss": 1.5832,
"step": 108
},
{
"epoch": 0.08370128623536187,
"grad_norm": 1.4265177249908447,
"learning_rate": 0.00019712095746544255,
"loss": 1.3194,
"step": 109
},
{
"epoch": 0.08446918794394317,
"grad_norm": 1.457571268081665,
"learning_rate": 0.00019706278908225992,
"loss": 1.3425,
"step": 110
},
{
"epoch": 0.08523708965252448,
"grad_norm": 1.6059311628341675,
"learning_rate": 0.00019700404769825068,
"loss": 1.7976,
"step": 111
},
{
"epoch": 0.08600499136110577,
"grad_norm": 1.3891575336456299,
"learning_rate": 0.00019694473366018887,
"loss": 1.3154,
"step": 112
},
{
"epoch": 0.08677289306968708,
"grad_norm": 1.5132336616516113,
"learning_rate": 0.00019688484731822923,
"loss": 1.4888,
"step": 113
},
{
"epoch": 0.08754079477826839,
"grad_norm": 1.5129386186599731,
"learning_rate": 0.00019682438902590498,
"loss": 1.279,
"step": 114
},
{
"epoch": 0.08830869648684968,
"grad_norm": 1.4669959545135498,
"learning_rate": 0.0001967633591401259,
"loss": 1.1884,
"step": 115
},
{
"epoch": 0.08907659819543098,
"grad_norm": 1.3678569793701172,
"learning_rate": 0.000196701758021176,
"loss": 1.0954,
"step": 116
},
{
"epoch": 0.08984449990401229,
"grad_norm": 1.3512712717056274,
"learning_rate": 0.00019663958603271148,
"loss": 1.1583,
"step": 117
},
{
"epoch": 0.09061240161259358,
"grad_norm": 1.2493406534194946,
"learning_rate": 0.0001965768435417588,
"loss": 1.1847,
"step": 118
},
{
"epoch": 0.09138030332117489,
"grad_norm": 1.6582459211349487,
"learning_rate": 0.00019651353091871215,
"loss": 1.4876,
"step": 119
},
{
"epoch": 0.0921482050297562,
"grad_norm": 1.5519031286239624,
"learning_rate": 0.00019644964853733152,
"loss": 1.5264,
"step": 120
},
{
"epoch": 0.09291610673833749,
"grad_norm": 1.9167393445968628,
"learning_rate": 0.0001963851967747404,
"loss": 1.6469,
"step": 121
},
{
"epoch": 0.09368400844691879,
"grad_norm": 1.7159210443496704,
"learning_rate": 0.00019632017601142355,
"loss": 1.4076,
"step": 122
},
{
"epoch": 0.0944519101555001,
"grad_norm": 1.5562142133712769,
"learning_rate": 0.00019625458663122478,
"loss": 1.0584,
"step": 123
},
{
"epoch": 0.09521981186408139,
"grad_norm": 1.5296403169631958,
"learning_rate": 0.00019618842902134465,
"loss": 1.3064,
"step": 124
},
{
"epoch": 0.0959877135726627,
"grad_norm": 1.6723419427871704,
"learning_rate": 0.00019612170357233836,
"loss": 1.4708,
"step": 125
},
{
"epoch": 0.096755615281244,
"grad_norm": 1.6125822067260742,
"learning_rate": 0.00019605441067811302,
"loss": 1.7615,
"step": 126
},
{
"epoch": 0.0975235169898253,
"grad_norm": 1.4108672142028809,
"learning_rate": 0.00019598655073592585,
"loss": 1.447,
"step": 127
},
{
"epoch": 0.0982914186984066,
"grad_norm": 1.373512864112854,
"learning_rate": 0.0001959181241463814,
"loss": 1.1841,
"step": 128
},
{
"epoch": 0.09905932040698791,
"grad_norm": 1.5692287683486938,
"learning_rate": 0.00019584913131342953,
"loss": 1.4497,
"step": 129
},
{
"epoch": 0.0998272221155692,
"grad_norm": 1.8311026096343994,
"learning_rate": 0.0001957795726443628,
"loss": 1.425,
"step": 130
},
{
"epoch": 0.1005951238241505,
"grad_norm": 1.5279914140701294,
"learning_rate": 0.000195709448549814,
"loss": 1.2395,
"step": 131
},
{
"epoch": 0.10136302553273181,
"grad_norm": 1.5802501440048218,
"learning_rate": 0.00019563875944375407,
"loss": 1.3774,
"step": 132
},
{
"epoch": 0.10213092724131312,
"grad_norm": 1.3959014415740967,
"learning_rate": 0.0001955675057434893,
"loss": 1.3567,
"step": 133
},
{
"epoch": 0.10289882894989441,
"grad_norm": 1.426458716392517,
"learning_rate": 0.00019549568786965903,
"loss": 1.26,
"step": 134
},
{
"epoch": 0.10366673065847572,
"grad_norm": 1.413362979888916,
"learning_rate": 0.00019542330624623322,
"loss": 1.3663,
"step": 135
},
{
"epoch": 0.10443463236705702,
"grad_norm": 1.4533791542053223,
"learning_rate": 0.00019535036130050975,
"loss": 1.3983,
"step": 136
},
{
"epoch": 0.10520253407563832,
"grad_norm": 1.3483699560165405,
"learning_rate": 0.00019527685346311212,
"loss": 1.0326,
"step": 137
},
{
"epoch": 0.10597043578421962,
"grad_norm": 1.591539740562439,
"learning_rate": 0.0001952027831679867,
"loss": 1.2265,
"step": 138
},
{
"epoch": 0.10673833749280093,
"grad_norm": 1.777915120124817,
"learning_rate": 0.00019512815085240046,
"loss": 1.6348,
"step": 139
},
{
"epoch": 0.10750623920138222,
"grad_norm": 1.4367990493774414,
"learning_rate": 0.000195052956956938,
"loss": 1.3012,
"step": 140
},
{
"epoch": 0.10827414090996353,
"grad_norm": 1.5832383632659912,
"learning_rate": 0.00019497720192549926,
"loss": 1.1717,
"step": 141
},
{
"epoch": 0.10904204261854483,
"grad_norm": 1.6121189594268799,
"learning_rate": 0.00019490088620529678,
"loss": 1.5101,
"step": 142
},
{
"epoch": 0.10980994432712612,
"grad_norm": 1.6623026132583618,
"learning_rate": 0.00019482401024685308,
"loss": 1.3055,
"step": 143
},
{
"epoch": 0.11057784603570743,
"grad_norm": 1.2664202451705933,
"learning_rate": 0.0001947465745039979,
"loss": 1.0602,
"step": 144
},
{
"epoch": 0.11134574774428874,
"grad_norm": 1.318278431892395,
"learning_rate": 0.0001946685794338658,
"loss": 1.285,
"step": 145
},
{
"epoch": 0.11211364945287003,
"grad_norm": 1.7018437385559082,
"learning_rate": 0.00019459002549689308,
"loss": 1.3439,
"step": 146
},
{
"epoch": 0.11288155116145134,
"grad_norm": 1.629821538925171,
"learning_rate": 0.0001945109131568154,
"loss": 1.2974,
"step": 147
},
{
"epoch": 0.11364945287003264,
"grad_norm": 1.416452407836914,
"learning_rate": 0.00019443124288066475,
"loss": 1.1124,
"step": 148
},
{
"epoch": 0.11441735457861393,
"grad_norm": 1.6793434619903564,
"learning_rate": 0.00019435101513876703,
"loss": 1.3027,
"step": 149
},
{
"epoch": 0.11518525628719524,
"grad_norm": 1.465809941291809,
"learning_rate": 0.00019427023040473896,
"loss": 0.9204,
"step": 150
},
{
"epoch": 0.11595315799577655,
"grad_norm": 1.7174264192581177,
"learning_rate": 0.0001941888891554854,
"loss": 1.2307,
"step": 151
},
{
"epoch": 0.11672105970435784,
"grad_norm": 1.3627086877822876,
"learning_rate": 0.00019410699187119663,
"loss": 1.0926,
"step": 152
},
{
"epoch": 0.11748896141293914,
"grad_norm": 2.059274911880493,
"learning_rate": 0.00019402453903534533,
"loss": 1.5285,
"step": 153
},
{
"epoch": 0.11825686312152045,
"grad_norm": 1.3840800523757935,
"learning_rate": 0.0001939415311346839,
"loss": 1.2504,
"step": 154
},
{
"epoch": 0.11902476483010174,
"grad_norm": 1.6135592460632324,
"learning_rate": 0.0001938579686592415,
"loss": 1.3955,
"step": 155
},
{
"epoch": 0.11979266653868305,
"grad_norm": 1.4203635454177856,
"learning_rate": 0.00019377385210232113,
"loss": 1.2271,
"step": 156
},
{
"epoch": 0.12056056824726435,
"grad_norm": 1.4303590059280396,
"learning_rate": 0.0001936891819604968,
"loss": 1.5135,
"step": 157
},
{
"epoch": 0.12132846995584565,
"grad_norm": 1.3626837730407715,
"learning_rate": 0.00019360395873361055,
"loss": 1.1232,
"step": 158
},
{
"epoch": 0.12209637166442695,
"grad_norm": 1.2136698961257935,
"learning_rate": 0.00019351818292476946,
"loss": 1.2211,
"step": 159
},
{
"epoch": 0.12286427337300826,
"grad_norm": 1.463807463645935,
"learning_rate": 0.00019343185504034277,
"loss": 1.2924,
"step": 160
},
{
"epoch": 0.12363217508158955,
"grad_norm": 1.2477118968963623,
"learning_rate": 0.0001933449755899588,
"loss": 1.2391,
"step": 161
},
{
"epoch": 0.12440007679017086,
"grad_norm": 1.2382277250289917,
"learning_rate": 0.0001932575450865021,
"loss": 1.0537,
"step": 162
},
{
"epoch": 0.12516797849875216,
"grad_norm": 1.1444644927978516,
"learning_rate": 0.00019316956404611012,
"loss": 1.0164,
"step": 163
},
{
"epoch": 0.12593588020733346,
"grad_norm": 1.7438474893569946,
"learning_rate": 0.00019308103298817052,
"loss": 1.3299,
"step": 164
},
{
"epoch": 0.12670378191591478,
"grad_norm": 1.3742320537567139,
"learning_rate": 0.00019299195243531792,
"loss": 1.1848,
"step": 165
},
{
"epoch": 0.12747168362449607,
"grad_norm": 1.456499695777893,
"learning_rate": 0.00019290232291343067,
"loss": 1.5438,
"step": 166
},
{
"epoch": 0.12823958533307736,
"grad_norm": 1.0693509578704834,
"learning_rate": 0.0001928121449516281,
"loss": 1.0623,
"step": 167
},
{
"epoch": 0.12900748704165868,
"grad_norm": 1.7221637964248657,
"learning_rate": 0.00019272141908226707,
"loss": 1.1941,
"step": 168
},
{
"epoch": 0.12977538875023997,
"grad_norm": 1.3603307008743286,
"learning_rate": 0.0001926301458409391,
"loss": 1.0901,
"step": 169
},
{
"epoch": 0.13054329045882127,
"grad_norm": 1.498059868812561,
"learning_rate": 0.00019253832576646688,
"loss": 1.4565,
"step": 170
},
{
"epoch": 0.13131119216740259,
"grad_norm": 1.5251699686050415,
"learning_rate": 0.00019244595940090143,
"loss": 1.1852,
"step": 171
},
{
"epoch": 0.13207909387598388,
"grad_norm": 1.6394065618515015,
"learning_rate": 0.00019235304728951866,
"loss": 1.3357,
"step": 172
},
{
"epoch": 0.13284699558456517,
"grad_norm": 1.3979147672653198,
"learning_rate": 0.00019225958998081633,
"loss": 1.2176,
"step": 173
},
{
"epoch": 0.1336148972931465,
"grad_norm": 1.5247663259506226,
"learning_rate": 0.0001921655880265106,
"loss": 1.4776,
"step": 174
},
{
"epoch": 0.13438279900172778,
"grad_norm": 1.7664347887039185,
"learning_rate": 0.00019207104198153295,
"loss": 1.3425,
"step": 175
},
{
"epoch": 0.13515070071030907,
"grad_norm": 1.719008445739746,
"learning_rate": 0.0001919759524040269,
"loss": 1.5569,
"step": 176
},
{
"epoch": 0.1359186024188904,
"grad_norm": 1.5756323337554932,
"learning_rate": 0.0001918803198553446,
"loss": 1.4662,
"step": 177
},
{
"epoch": 0.1366865041274717,
"grad_norm": 1.1478333473205566,
"learning_rate": 0.00019178414490004356,
"loss": 0.8951,
"step": 178
},
{
"epoch": 0.13745440583605298,
"grad_norm": 1.549656629562378,
"learning_rate": 0.00019168742810588335,
"loss": 1.1626,
"step": 179
},
{
"epoch": 0.1382223075446343,
"grad_norm": 2.0804789066314697,
"learning_rate": 0.00019159017004382234,
"loss": 1.1341,
"step": 180
},
{
"epoch": 0.1389902092532156,
"grad_norm": 1.3882943391799927,
"learning_rate": 0.00019149237128801404,
"loss": 1.0807,
"step": 181
},
{
"epoch": 0.13975811096179688,
"grad_norm": 1.5854548215866089,
"learning_rate": 0.000191394032415804,
"loss": 1.2928,
"step": 182
},
{
"epoch": 0.1405260126703782,
"grad_norm": 1.5255122184753418,
"learning_rate": 0.00019129515400772635,
"loss": 1.2089,
"step": 183
},
{
"epoch": 0.1412939143789595,
"grad_norm": 1.3562302589416504,
"learning_rate": 0.00019119573664750018,
"loss": 1.2578,
"step": 184
},
{
"epoch": 0.1420618160875408,
"grad_norm": 1.4164509773254395,
"learning_rate": 0.00019109578092202628,
"loss": 1.1642,
"step": 185
},
{
"epoch": 0.1428297177961221,
"grad_norm": 1.4175788164138794,
"learning_rate": 0.00019099528742138371,
"loss": 1.2517,
"step": 186
},
{
"epoch": 0.1435976195047034,
"grad_norm": 1.5696830749511719,
"learning_rate": 0.00019089425673882615,
"loss": 1.4897,
"step": 187
},
{
"epoch": 0.1443655212132847,
"grad_norm": 1.4855685234069824,
"learning_rate": 0.0001907926894707785,
"loss": 1.4277,
"step": 188
},
{
"epoch": 0.145133422921866,
"grad_norm": 1.6295796632766724,
"learning_rate": 0.00019069058621683336,
"loss": 1.3268,
"step": 189
},
{
"epoch": 0.1459013246304473,
"grad_norm": 1.5414029359817505,
"learning_rate": 0.0001905879475797474,
"loss": 1.4513,
"step": 190
},
{
"epoch": 0.1466692263390286,
"grad_norm": 1.3137727975845337,
"learning_rate": 0.00019048477416543801,
"loss": 1.1609,
"step": 191
},
{
"epoch": 0.14743712804760992,
"grad_norm": 1.5763883590698242,
"learning_rate": 0.00019038106658297944,
"loss": 1.2035,
"step": 192
},
{
"epoch": 0.1482050297561912,
"grad_norm": 1.3297104835510254,
"learning_rate": 0.00019027682544459947,
"loss": 1.2346,
"step": 193
},
{
"epoch": 0.1489729314647725,
"grad_norm": 1.568937063217163,
"learning_rate": 0.00019017205136567556,
"loss": 1.3332,
"step": 194
},
{
"epoch": 0.14974083317335382,
"grad_norm": 1.6871936321258545,
"learning_rate": 0.00019006674496473144,
"loss": 1.0715,
"step": 195
},
{
"epoch": 0.1505087348819351,
"grad_norm": 1.6157925128936768,
"learning_rate": 0.00018996090686343328,
"loss": 1.4985,
"step": 196
},
{
"epoch": 0.1512766365905164,
"grad_norm": 1.384244680404663,
"learning_rate": 0.0001898545376865861,
"loss": 0.9407,
"step": 197
},
{
"epoch": 0.15204453829909773,
"grad_norm": 1.5199958086013794,
"learning_rate": 0.00018974763806213013,
"loss": 1.459,
"step": 198
},
{
"epoch": 0.15281244000767902,
"grad_norm": 1.6747719049453735,
"learning_rate": 0.000189640208621137,
"loss": 1.475,
"step": 199
},
{
"epoch": 0.1535803417162603,
"grad_norm": 1.3858054876327515,
"learning_rate": 0.00018953224999780605,
"loss": 1.1565,
"step": 200
},
{
"epoch": 0.15434824342484163,
"grad_norm": 1.5445752143859863,
"learning_rate": 0.00018942376282946066,
"loss": 1.3588,
"step": 201
},
{
"epoch": 0.15511614513342292,
"grad_norm": 1.3727281093597412,
"learning_rate": 0.0001893147477565443,
"loss": 1.1978,
"step": 202
},
{
"epoch": 0.15588404684200421,
"grad_norm": 1.5093921422958374,
"learning_rate": 0.000189205205422617,
"loss": 1.2175,
"step": 203
},
{
"epoch": 0.15665194855058553,
"grad_norm": 1.3368033170700073,
"learning_rate": 0.0001890951364743514,
"loss": 1.0925,
"step": 204
},
{
"epoch": 0.15741985025916683,
"grad_norm": 1.3794496059417725,
"learning_rate": 0.00018898454156152886,
"loss": 1.3221,
"step": 205
},
{
"epoch": 0.15818775196774812,
"grad_norm": 1.8431724309921265,
"learning_rate": 0.0001888734213370359,
"loss": 1.4075,
"step": 206
},
{
"epoch": 0.15895565367632944,
"grad_norm": 1.592666745185852,
"learning_rate": 0.00018876177645685998,
"loss": 1.2406,
"step": 207
},
{
"epoch": 0.15972355538491073,
"grad_norm": 1.7013722658157349,
"learning_rate": 0.00018864960758008592,
"loss": 1.4999,
"step": 208
},
{
"epoch": 0.16049145709349202,
"grad_norm": 1.318893313407898,
"learning_rate": 0.00018853691536889188,
"loss": 0.9244,
"step": 209
},
{
"epoch": 0.16125935880207334,
"grad_norm": 1.1515171527862549,
"learning_rate": 0.0001884237004885455,
"loss": 0.9195,
"step": 210
},
{
"epoch": 0.16202726051065464,
"grad_norm": 1.6677429676055908,
"learning_rate": 0.0001883099636073999,
"loss": 1.315,
"step": 211
},
{
"epoch": 0.16279516221923593,
"grad_norm": 1.5073673725128174,
"learning_rate": 0.0001881957053968898,
"loss": 1.18,
"step": 212
},
{
"epoch": 0.16356306392781725,
"grad_norm": 1.6688820123672485,
"learning_rate": 0.00018808092653152753,
"loss": 1.4562,
"step": 213
},
{
"epoch": 0.16433096563639854,
"grad_norm": 1.4027730226516724,
"learning_rate": 0.00018796562768889913,
"loss": 1.2261,
"step": 214
},
{
"epoch": 0.16509886734497983,
"grad_norm": 1.699299931526184,
"learning_rate": 0.0001878498095496601,
"loss": 1.4926,
"step": 215
},
{
"epoch": 0.16586676905356115,
"grad_norm": 1.1611884832382202,
"learning_rate": 0.00018773347279753177,
"loss": 0.9826,
"step": 216
},
{
"epoch": 0.16663467076214244,
"grad_norm": 1.4166936874389648,
"learning_rate": 0.00018761661811929686,
"loss": 1.142,
"step": 217
},
{
"epoch": 0.16740257247072374,
"grad_norm": 1.652862310409546,
"learning_rate": 0.00018749924620479585,
"loss": 1.3107,
"step": 218
},
{
"epoch": 0.16817047417930506,
"grad_norm": 1.301758885383606,
"learning_rate": 0.0001873813577469224,
"loss": 1.0521,
"step": 219
},
{
"epoch": 0.16893837588788635,
"grad_norm": 1.6529386043548584,
"learning_rate": 0.0001872629534416197,
"loss": 1.1955,
"step": 220
},
{
"epoch": 0.16970627759646764,
"grad_norm": 1.254197359085083,
"learning_rate": 0.0001871440339878762,
"loss": 0.974,
"step": 221
},
{
"epoch": 0.17047417930504896,
"grad_norm": 1.7299309968948364,
"learning_rate": 0.0001870246000877214,
"loss": 1.4206,
"step": 222
},
{
"epoch": 0.17124208101363025,
"grad_norm": 1.4324923753738403,
"learning_rate": 0.00018690465244622183,
"loss": 1.1965,
"step": 223
},
{
"epoch": 0.17200998272221155,
"grad_norm": 1.4682673215866089,
"learning_rate": 0.00018678419177147685,
"loss": 1.2639,
"step": 224
},
{
"epoch": 0.17277788443079287,
"grad_norm": 1.4695028066635132,
"learning_rate": 0.0001866632187746145,
"loss": 1.2862,
"step": 225
},
{
"epoch": 0.17354578613937416,
"grad_norm": 1.2502456903457642,
"learning_rate": 0.00018654173416978714,
"loss": 1.3128,
"step": 226
},
{
"epoch": 0.17431368784795545,
"grad_norm": 1.443822979927063,
"learning_rate": 0.0001864197386741674,
"loss": 1.3217,
"step": 227
},
{
"epoch": 0.17508158955653677,
"grad_norm": 1.5240435600280762,
"learning_rate": 0.00018629723300794408,
"loss": 1.2936,
"step": 228
},
{
"epoch": 0.17584949126511806,
"grad_norm": 1.5708985328674316,
"learning_rate": 0.00018617421789431747,
"loss": 1.3001,
"step": 229
},
{
"epoch": 0.17661739297369936,
"grad_norm": 1.595597743988037,
"learning_rate": 0.0001860506940594955,
"loss": 1.2869,
"step": 230
},
{
"epoch": 0.17738529468228068,
"grad_norm": 1.3937422037124634,
"learning_rate": 0.00018592666223268917,
"loss": 1.1357,
"step": 231
},
{
"epoch": 0.17815319639086197,
"grad_norm": 1.6847139596939087,
"learning_rate": 0.00018580212314610846,
"loss": 1.6054,
"step": 232
},
{
"epoch": 0.17892109809944326,
"grad_norm": 2.011245012283325,
"learning_rate": 0.0001856770775349579,
"loss": 1.4489,
"step": 233
},
{
"epoch": 0.17968899980802458,
"grad_norm": 1.3544971942901611,
"learning_rate": 0.00018555152613743215,
"loss": 1.3519,
"step": 234
},
{
"epoch": 0.18045690151660587,
"grad_norm": 1.729085922241211,
"learning_rate": 0.00018542546969471183,
"loss": 1.5642,
"step": 235
},
{
"epoch": 0.18122480322518716,
"grad_norm": 1.8843064308166504,
"learning_rate": 0.00018529890895095902,
"loss": 1.7392,
"step": 236
},
{
"epoch": 0.18199270493376848,
"grad_norm": 1.3016268014907837,
"learning_rate": 0.00018517184465331288,
"loss": 1.0011,
"step": 237
},
{
"epoch": 0.18276060664234978,
"grad_norm": 1.5241748094558716,
"learning_rate": 0.00018504427755188521,
"loss": 1.0784,
"step": 238
},
{
"epoch": 0.18352850835093107,
"grad_norm": 1.42753267288208,
"learning_rate": 0.00018491620839975617,
"loss": 1.1811,
"step": 239
},
{
"epoch": 0.1842964100595124,
"grad_norm": 1.427199125289917,
"learning_rate": 0.00018478763795296962,
"loss": 1.0278,
"step": 240
},
{
"epoch": 0.18506431176809368,
"grad_norm": 1.4727647304534912,
"learning_rate": 0.0001846585669705288,
"loss": 1.4351,
"step": 241
},
{
"epoch": 0.18583221347667497,
"grad_norm": 1.6001156568527222,
"learning_rate": 0.00018452899621439182,
"loss": 1.4974,
"step": 242
},
{
"epoch": 0.1866001151852563,
"grad_norm": 1.8062998056411743,
"learning_rate": 0.00018439892644946722,
"loss": 1.4885,
"step": 243
},
{
"epoch": 0.18736801689383759,
"grad_norm": 1.3706297874450684,
"learning_rate": 0.00018426835844360929,
"loss": 1.2593,
"step": 244
},
{
"epoch": 0.18813591860241888,
"grad_norm": 1.557486653327942,
"learning_rate": 0.00018413729296761364,
"loss": 1.3712,
"step": 245
},
{
"epoch": 0.1889038203110002,
"grad_norm": 1.640881896018982,
"learning_rate": 0.00018400573079521278,
"loss": 1.4834,
"step": 246
},
{
"epoch": 0.1896717220195815,
"grad_norm": 1.6011335849761963,
"learning_rate": 0.0001838736727030712,
"loss": 1.2738,
"step": 247
},
{
"epoch": 0.19043962372816278,
"grad_norm": 1.2537946701049805,
"learning_rate": 0.00018374111947078124,
"loss": 1.0421,
"step": 248
},
{
"epoch": 0.1912075254367441,
"grad_norm": 1.5533615350723267,
"learning_rate": 0.00018360807188085807,
"loss": 1.2964,
"step": 249
},
{
"epoch": 0.1919754271453254,
"grad_norm": 1.967961072921753,
"learning_rate": 0.00018347453071873536,
"loss": 1.7348,
"step": 250
},
{
"epoch": 0.1927433288539067,
"grad_norm": 1.4344468116760254,
"learning_rate": 0.00018334049677276045,
"loss": 1.1252,
"step": 251
},
{
"epoch": 0.193511230562488,
"grad_norm": 1.6143999099731445,
"learning_rate": 0.0001832059708341899,
"loss": 1.3829,
"step": 252
},
{
"epoch": 0.1942791322710693,
"grad_norm": 1.8929922580718994,
"learning_rate": 0.00018307095369718456,
"loss": 1.4921,
"step": 253
},
{
"epoch": 0.1950470339796506,
"grad_norm": 1.6425096988677979,
"learning_rate": 0.00018293544615880517,
"loss": 1.4332,
"step": 254
},
{
"epoch": 0.1958149356882319,
"grad_norm": 1.1571861505508423,
"learning_rate": 0.00018279944901900737,
"loss": 0.9519,
"step": 255
},
{
"epoch": 0.1965828373968132,
"grad_norm": 1.438349723815918,
"learning_rate": 0.00018266296308063718,
"loss": 1.2061,
"step": 256
},
{
"epoch": 0.1973507391053945,
"grad_norm": 1.3200489282608032,
"learning_rate": 0.00018252598914942622,
"loss": 1.4497,
"step": 257
},
{
"epoch": 0.19811864081397582,
"grad_norm": 1.596771240234375,
"learning_rate": 0.00018238852803398689,
"loss": 1.4469,
"step": 258
},
{
"epoch": 0.1988865425225571,
"grad_norm": 1.5184094905853271,
"learning_rate": 0.00018225058054580765,
"loss": 1.2613,
"step": 259
},
{
"epoch": 0.1996544442311384,
"grad_norm": 1.5872782468795776,
"learning_rate": 0.0001821121474992482,
"loss": 1.2036,
"step": 260
},
{
"epoch": 0.20042234593971972,
"grad_norm": 1.4941065311431885,
"learning_rate": 0.00018197322971153467,
"loss": 1.4401,
"step": 261
},
{
"epoch": 0.201190247648301,
"grad_norm": 1.490402102470398,
"learning_rate": 0.0001818338280027549,
"loss": 1.18,
"step": 262
},
{
"epoch": 0.20195814935688233,
"grad_norm": 1.4398747682571411,
"learning_rate": 0.00018169394319585345,
"loss": 1.4196,
"step": 263
},
{
"epoch": 0.20272605106546362,
"grad_norm": 1.7845875024795532,
"learning_rate": 0.00018155357611662672,
"loss": 1.7554,
"step": 264
},
{
"epoch": 0.20349395277404492,
"grad_norm": 1.4204550981521606,
"learning_rate": 0.0001814127275937183,
"loss": 1.1294,
"step": 265
},
{
"epoch": 0.20426185448262624,
"grad_norm": 1.406785011291504,
"learning_rate": 0.0001812713984586139,
"loss": 1.4107,
"step": 266
},
{
"epoch": 0.20502975619120753,
"grad_norm": 1.5216752290725708,
"learning_rate": 0.00018112958954563646,
"loss": 1.2575,
"step": 267
},
{
"epoch": 0.20579765789978882,
"grad_norm": 1.3387800455093384,
"learning_rate": 0.00018098730169194117,
"loss": 1.2524,
"step": 268
},
{
"epoch": 0.20656555960837014,
"grad_norm": 1.4416996240615845,
"learning_rate": 0.00018084453573751072,
"loss": 1.1916,
"step": 269
},
{
"epoch": 0.20733346131695143,
"grad_norm": 1.2977286577224731,
"learning_rate": 0.00018070129252515014,
"loss": 1.2981,
"step": 270
},
{
"epoch": 0.20810136302553273,
"grad_norm": 1.4423656463623047,
"learning_rate": 0.00018055757290048202,
"loss": 1.3222,
"step": 271
},
{
"epoch": 0.20886926473411405,
"grad_norm": 1.6373261213302612,
"learning_rate": 0.00018041337771194121,
"loss": 1.4657,
"step": 272
},
{
"epoch": 0.20963716644269534,
"grad_norm": 1.9930970668792725,
"learning_rate": 0.0001802687078107702,
"loss": 1.2641,
"step": 273
},
{
"epoch": 0.21040506815127663,
"grad_norm": 1.6621267795562744,
"learning_rate": 0.0001801235640510138,
"loss": 1.4616,
"step": 274
},
{
"epoch": 0.21117296985985795,
"grad_norm": 1.6404649019241333,
"learning_rate": 0.0001799779472895142,
"loss": 1.5701,
"step": 275
},
{
"epoch": 0.21194087156843924,
"grad_norm": 1.4635223150253296,
"learning_rate": 0.00017983185838590587,
"loss": 1.2226,
"step": 276
},
{
"epoch": 0.21270877327702054,
"grad_norm": 1.485813021659851,
"learning_rate": 0.0001796852982026107,
"loss": 1.5223,
"step": 277
},
{
"epoch": 0.21347667498560186,
"grad_norm": 1.6244434118270874,
"learning_rate": 0.00017953826760483255,
"loss": 1.189,
"step": 278
},
{
"epoch": 0.21424457669418315,
"grad_norm": 1.3490244150161743,
"learning_rate": 0.00017939076746055239,
"loss": 1.1965,
"step": 279
},
{
"epoch": 0.21501247840276444,
"grad_norm": 1.5216923952102661,
"learning_rate": 0.00017924279864052313,
"loss": 1.26,
"step": 280
},
{
"epoch": 0.21578038011134576,
"grad_norm": 1.4194529056549072,
"learning_rate": 0.00017909436201826444,
"loss": 1.1278,
"step": 281
},
{
"epoch": 0.21654828181992705,
"grad_norm": 1.5075751543045044,
"learning_rate": 0.00017894545847005764,
"loss": 1.2345,
"step": 282
},
{
"epoch": 0.21731618352850834,
"grad_norm": 1.5216305255889893,
"learning_rate": 0.00017879608887494045,
"loss": 1.2505,
"step": 283
},
{
"epoch": 0.21808408523708966,
"grad_norm": 1.0526906251907349,
"learning_rate": 0.00017864625411470193,
"loss": 0.9264,
"step": 284
},
{
"epoch": 0.21885198694567096,
"grad_norm": 1.7272154092788696,
"learning_rate": 0.00017849595507387714,
"loss": 1.3899,
"step": 285
},
{
"epoch": 0.21961988865425225,
"grad_norm": 1.4222748279571533,
"learning_rate": 0.00017834519263974197,
"loss": 1.2786,
"step": 286
},
{
"epoch": 0.22038779036283357,
"grad_norm": 1.5858385562896729,
"learning_rate": 0.00017819396770230793,
"loss": 1.58,
"step": 287
},
{
"epoch": 0.22115569207141486,
"grad_norm": 1.35157310962677,
"learning_rate": 0.0001780422811543169,
"loss": 1.0841,
"step": 288
},
{
"epoch": 0.22192359377999615,
"grad_norm": 1.4709627628326416,
"learning_rate": 0.00017789013389123582,
"loss": 1.3991,
"step": 289
},
{
"epoch": 0.22269149548857747,
"grad_norm": 1.6085643768310547,
"learning_rate": 0.00017773752681125133,
"loss": 1.2382,
"step": 290
},
{
"epoch": 0.22345939719715877,
"grad_norm": 1.9695161581039429,
"learning_rate": 0.00017758446081526472,
"loss": 1.408,
"step": 291
},
{
"epoch": 0.22422729890574006,
"grad_norm": 2.029320001602173,
"learning_rate": 0.00017743093680688628,
"loss": 1.4736,
"step": 292
},
{
"epoch": 0.22499520061432138,
"grad_norm": 1.4890549182891846,
"learning_rate": 0.00017727695569243025,
"loss": 1.1405,
"step": 293
},
{
"epoch": 0.22576310232290267,
"grad_norm": 1.3257640600204468,
"learning_rate": 0.00017712251838090929,
"loss": 1.0384,
"step": 294
},
{
"epoch": 0.22653100403148396,
"grad_norm": 1.1012932062149048,
"learning_rate": 0.00017696762578402918,
"loss": 0.7738,
"step": 295
},
{
"epoch": 0.22729890574006528,
"grad_norm": 1.72528874874115,
"learning_rate": 0.0001768122788161835,
"loss": 1.4075,
"step": 296
},
{
"epoch": 0.22806680744864657,
"grad_norm": 1.4007856845855713,
"learning_rate": 0.00017665647839444808,
"loss": 0.9999,
"step": 297
},
{
"epoch": 0.22883470915722787,
"grad_norm": 1.8628685474395752,
"learning_rate": 0.0001765002254385757,
"loss": 1.5956,
"step": 298
},
{
"epoch": 0.2296026108658092,
"grad_norm": 1.5395143032073975,
"learning_rate": 0.0001763435208709906,
"loss": 1.286,
"step": 299
},
{
"epoch": 0.23037051257439048,
"grad_norm": 1.555917739868164,
"learning_rate": 0.00017618636561678316,
"loss": 1.7136,
"step": 300
},
{
"epoch": 0.23113841428297177,
"grad_norm": 1.3042889833450317,
"learning_rate": 0.0001760287606037043,
"loss": 1.0877,
"step": 301
},
{
"epoch": 0.2319063159915531,
"grad_norm": 1.0165395736694336,
"learning_rate": 0.00017587070676215993,
"loss": 0.983,
"step": 302
},
{
"epoch": 0.23267421770013438,
"grad_norm": 1.264906406402588,
"learning_rate": 0.0001757122050252058,
"loss": 1.1153,
"step": 303
},
{
"epoch": 0.23344211940871568,
"grad_norm": 1.2855709791183472,
"learning_rate": 0.0001755532563285416,
"loss": 1.3091,
"step": 304
},
{
"epoch": 0.234210021117297,
"grad_norm": 1.8462400436401367,
"learning_rate": 0.0001753938616105056,
"loss": 1.4876,
"step": 305
},
{
"epoch": 0.2349779228258783,
"grad_norm": 1.6739815473556519,
"learning_rate": 0.0001752340218120693,
"loss": 1.4114,
"step": 306
},
{
"epoch": 0.23574582453445958,
"grad_norm": 1.4647902250289917,
"learning_rate": 0.00017507373787683142,
"loss": 1.2797,
"step": 307
},
{
"epoch": 0.2365137262430409,
"grad_norm": 1.4707120656967163,
"learning_rate": 0.00017491301075101278,
"loss": 1.3396,
"step": 308
},
{
"epoch": 0.2372816279516222,
"grad_norm": 1.462719202041626,
"learning_rate": 0.0001747518413834505,
"loss": 1.2757,
"step": 309
},
{
"epoch": 0.23804952966020348,
"grad_norm": 1.5493544340133667,
"learning_rate": 0.0001745902307255924,
"loss": 1.2215,
"step": 310
},
{
"epoch": 0.2388174313687848,
"grad_norm": 1.4878123998641968,
"learning_rate": 0.00017442817973149145,
"loss": 1.3161,
"step": 311
},
{
"epoch": 0.2395853330773661,
"grad_norm": 1.3560563325881958,
"learning_rate": 0.0001742656893578001,
"loss": 0.7846,
"step": 312
},
{
"epoch": 0.2403532347859474,
"grad_norm": 1.3380439281463623,
"learning_rate": 0.00017410276056376456,
"loss": 1.1383,
"step": 313
},
{
"epoch": 0.2411211364945287,
"grad_norm": 1.7162578105926514,
"learning_rate": 0.00017393939431121933,
"loss": 1.7719,
"step": 314
},
{
"epoch": 0.24188903820311,
"grad_norm": 1.621044397354126,
"learning_rate": 0.00017377559156458132,
"loss": 1.5056,
"step": 315
},
{
"epoch": 0.2426569399116913,
"grad_norm": 1.2746073007583618,
"learning_rate": 0.00017361135329084428,
"loss": 1.0386,
"step": 316
},
{
"epoch": 0.24342484162027261,
"grad_norm": 1.3122072219848633,
"learning_rate": 0.00017344668045957305,
"loss": 0.932,
"step": 317
},
{
"epoch": 0.2441927433288539,
"grad_norm": 1.3992735147476196,
"learning_rate": 0.0001732815740428978,
"loss": 1.2038,
"step": 318
},
{
"epoch": 0.2449606450374352,
"grad_norm": 1.61997389793396,
"learning_rate": 0.00017311603501550838,
"loss": 1.5754,
"step": 319
},
{
"epoch": 0.24572854674601652,
"grad_norm": 1.5020617246627808,
"learning_rate": 0.00017295006435464848,
"loss": 1.197,
"step": 320
},
{
"epoch": 0.2464964484545978,
"grad_norm": 1.4431666135787964,
"learning_rate": 0.00017278366304010993,
"loss": 1.5506,
"step": 321
},
{
"epoch": 0.2472643501631791,
"grad_norm": 1.6864901781082153,
"learning_rate": 0.00017261683205422687,
"loss": 1.478,
"step": 322
},
{
"epoch": 0.24803225187176042,
"grad_norm": 1.3386764526367188,
"learning_rate": 0.00017244957238186993,
"loss": 1.258,
"step": 323
},
{
"epoch": 0.24880015358034172,
"grad_norm": 1.4950274229049683,
"learning_rate": 0.00017228188501044043,
"loss": 1.2203,
"step": 324
},
{
"epoch": 0.249568055288923,
"grad_norm": 1.4726964235305786,
"learning_rate": 0.00017211377092986476,
"loss": 1.3879,
"step": 325
},
{
"epoch": 0.2503359569975043,
"grad_norm": 1.3446931838989258,
"learning_rate": 0.00017194523113258804,
"loss": 1.3336,
"step": 326
},
{
"epoch": 0.2503359569975043,
"eval_loss": 1.2999707460403442,
"eval_runtime": 19.7696,
"eval_samples_per_second": 27.77,
"eval_steps_per_second": 13.91,
"step": 326
},
{
"epoch": 0.25110385870608565,
"grad_norm": 1.1212903261184692,
"learning_rate": 0.00017177626661356884,
"loss": 0.9078,
"step": 327
},
{
"epoch": 0.2518717604146669,
"grad_norm": 1.5891238451004028,
"learning_rate": 0.0001716068783702729,
"loss": 1.3183,
"step": 328
},
{
"epoch": 0.25263966212324823,
"grad_norm": 1.8708879947662354,
"learning_rate": 0.00017143706740266733,
"loss": 1.6728,
"step": 329
},
{
"epoch": 0.25340756383182955,
"grad_norm": 1.304468035697937,
"learning_rate": 0.00017126683471321494,
"loss": 1.0729,
"step": 330
},
{
"epoch": 0.2541754655404108,
"grad_norm": 2.041584014892578,
"learning_rate": 0.00017109618130686793,
"loss": 1.3812,
"step": 331
},
{
"epoch": 0.25494336724899214,
"grad_norm": 1.474676251411438,
"learning_rate": 0.00017092510819106228,
"loss": 1.2994,
"step": 332
},
{
"epoch": 0.25571126895757346,
"grad_norm": 2.100346088409424,
"learning_rate": 0.00017075361637571164,
"loss": 1.3583,
"step": 333
},
{
"epoch": 0.2564791706661547,
"grad_norm": 1.4596396684646606,
"learning_rate": 0.00017058170687320144,
"loss": 1.3277,
"step": 334
},
{
"epoch": 0.25724707237473604,
"grad_norm": 1.605150818824768,
"learning_rate": 0.00017040938069838284,
"loss": 1.2923,
"step": 335
},
{
"epoch": 0.25801497408331736,
"grad_norm": 1.727765679359436,
"learning_rate": 0.00017023663886856681,
"loss": 1.4675,
"step": 336
},
{
"epoch": 0.2587828757918986,
"grad_norm": 1.5267651081085205,
"learning_rate": 0.0001700634824035182,
"loss": 1.3337,
"step": 337
},
{
"epoch": 0.25955077750047995,
"grad_norm": 1.5880826711654663,
"learning_rate": 0.00016988991232544943,
"loss": 1.2193,
"step": 338
},
{
"epoch": 0.26031867920906127,
"grad_norm": 1.7486482858657837,
"learning_rate": 0.00016971592965901472,
"loss": 1.3199,
"step": 339
},
{
"epoch": 0.26108658091764253,
"grad_norm": 1.4829312562942505,
"learning_rate": 0.00016954153543130405,
"loss": 1.2846,
"step": 340
},
{
"epoch": 0.26185448262622385,
"grad_norm": 1.501550316810608,
"learning_rate": 0.00016936673067183695,
"loss": 1.2895,
"step": 341
},
{
"epoch": 0.26262238433480517,
"grad_norm": 1.6724597215652466,
"learning_rate": 0.00016919151641255642,
"loss": 1.7143,
"step": 342
},
{
"epoch": 0.26339028604338643,
"grad_norm": 1.6089798212051392,
"learning_rate": 0.00016901589368782303,
"loss": 1.4535,
"step": 343
},
{
"epoch": 0.26415818775196775,
"grad_norm": 1.5275441408157349,
"learning_rate": 0.00016883986353440856,
"loss": 1.2774,
"step": 344
},
{
"epoch": 0.2649260894605491,
"grad_norm": 1.6726583242416382,
"learning_rate": 0.0001686634269914901,
"loss": 1.3747,
"step": 345
},
{
"epoch": 0.26569399116913034,
"grad_norm": 1.340077519416809,
"learning_rate": 0.00016848658510064377,
"loss": 1.0331,
"step": 346
},
{
"epoch": 0.26646189287771166,
"grad_norm": 1.2696207761764526,
"learning_rate": 0.00016830933890583865,
"loss": 1.107,
"step": 347
},
{
"epoch": 0.267229794586293,
"grad_norm": 1.505295753479004,
"learning_rate": 0.00016813168945343062,
"loss": 1.3706,
"step": 348
},
{
"epoch": 0.26799769629487424,
"grad_norm": 1.503833293914795,
"learning_rate": 0.000167953637792156,
"loss": 1.3591,
"step": 349
},
{
"epoch": 0.26876559800345556,
"grad_norm": 1.5831080675125122,
"learning_rate": 0.00016777518497312576,
"loss": 1.4128,
"step": 350
},
{
"epoch": 0.2695334997120369,
"grad_norm": 1.567592978477478,
"learning_rate": 0.00016759633204981885,
"loss": 1.3437,
"step": 351
},
{
"epoch": 0.27030140142061815,
"grad_norm": 1.4218353033065796,
"learning_rate": 0.00016741708007807625,
"loss": 1.1095,
"step": 352
},
{
"epoch": 0.27106930312919947,
"grad_norm": 1.5181939601898193,
"learning_rate": 0.0001672374301160948,
"loss": 1.3159,
"step": 353
},
{
"epoch": 0.2718372048377808,
"grad_norm": 1.3197153806686401,
"learning_rate": 0.00016705738322442067,
"loss": 1.2836,
"step": 354
},
{
"epoch": 0.27260510654636205,
"grad_norm": 1.5604197978973389,
"learning_rate": 0.0001668769404659434,
"loss": 1.525,
"step": 355
},
{
"epoch": 0.2733730082549434,
"grad_norm": 1.5216556787490845,
"learning_rate": 0.00016669610290588938,
"loss": 1.2548,
"step": 356
},
{
"epoch": 0.2741409099635247,
"grad_norm": 1.519769310951233,
"learning_rate": 0.00016651487161181575,
"loss": 1.4625,
"step": 357
},
{
"epoch": 0.27490881167210596,
"grad_norm": 1.584656834602356,
"learning_rate": 0.00016633324765360404,
"loss": 1.2258,
"step": 358
},
{
"epoch": 0.2756767133806873,
"grad_norm": 1.3976715803146362,
"learning_rate": 0.00016615123210345374,
"loss": 1.0869,
"step": 359
},
{
"epoch": 0.2764446150892686,
"grad_norm": 1.6249608993530273,
"learning_rate": 0.00016596882603587613,
"loss": 1.3358,
"step": 360
},
{
"epoch": 0.27721251679784986,
"grad_norm": 1.8045907020568848,
"learning_rate": 0.00016578603052768787,
"loss": 1.4165,
"step": 361
},
{
"epoch": 0.2779804185064312,
"grad_norm": 2.2819437980651855,
"learning_rate": 0.00016560284665800463,
"loss": 2.0465,
"step": 362
},
{
"epoch": 0.2787483202150125,
"grad_norm": 1.4787830114364624,
"learning_rate": 0.00016541927550823475,
"loss": 1.4912,
"step": 363
},
{
"epoch": 0.27951622192359377,
"grad_norm": 1.8598263263702393,
"learning_rate": 0.00016523531816207285,
"loss": 1.4541,
"step": 364
},
{
"epoch": 0.2802841236321751,
"grad_norm": 1.5561397075653076,
"learning_rate": 0.00016505097570549334,
"loss": 1.0468,
"step": 365
},
{
"epoch": 0.2810520253407564,
"grad_norm": 1.2687749862670898,
"learning_rate": 0.00016486624922674423,
"loss": 1.0835,
"step": 366
},
{
"epoch": 0.28181992704933767,
"grad_norm": 1.1787006855010986,
"learning_rate": 0.0001646811398163405,
"loss": 1.1177,
"step": 367
},
{
"epoch": 0.282587828757919,
"grad_norm": 1.8415557146072388,
"learning_rate": 0.00016449564856705763,
"loss": 1.5507,
"step": 368
},
{
"epoch": 0.2833557304665003,
"grad_norm": 1.6068737506866455,
"learning_rate": 0.00016430977657392543,
"loss": 1.1337,
"step": 369
},
{
"epoch": 0.2841236321750816,
"grad_norm": 1.4251381158828735,
"learning_rate": 0.00016412352493422132,
"loss": 1.1206,
"step": 370
},
{
"epoch": 0.2848915338836629,
"grad_norm": 1.5951745510101318,
"learning_rate": 0.00016393689474746383,
"loss": 1.3409,
"step": 371
},
{
"epoch": 0.2856594355922442,
"grad_norm": 1.8638418912887573,
"learning_rate": 0.00016374988711540634,
"loss": 1.671,
"step": 372
},
{
"epoch": 0.2864273373008255,
"grad_norm": 1.7180877923965454,
"learning_rate": 0.00016356250314203044,
"loss": 1.2214,
"step": 373
},
{
"epoch": 0.2871952390094068,
"grad_norm": 1.66769540309906,
"learning_rate": 0.00016337474393353932,
"loss": 1.3264,
"step": 374
},
{
"epoch": 0.2879631407179881,
"grad_norm": 1.2491950988769531,
"learning_rate": 0.00016318661059835135,
"loss": 1.0643,
"step": 375
},
{
"epoch": 0.2887310424265694,
"grad_norm": 1.6886341571807861,
"learning_rate": 0.0001629981042470936,
"loss": 1.4799,
"step": 376
},
{
"epoch": 0.2894989441351507,
"grad_norm": 1.3660120964050293,
"learning_rate": 0.00016280922599259517,
"loss": 1.0789,
"step": 377
},
{
"epoch": 0.290266845843732,
"grad_norm": 1.5407626628875732,
"learning_rate": 0.00016261997694988064,
"loss": 1.1531,
"step": 378
},
{
"epoch": 0.2910347475523133,
"grad_norm": 1.512680172920227,
"learning_rate": 0.00016243035823616347,
"loss": 1.3475,
"step": 379
},
{
"epoch": 0.2918026492608946,
"grad_norm": 1.6978341341018677,
"learning_rate": 0.0001622403709708395,
"loss": 1.4456,
"step": 380
},
{
"epoch": 0.29257055096947593,
"grad_norm": 1.3576867580413818,
"learning_rate": 0.00016205001627548019,
"loss": 1.2833,
"step": 381
},
{
"epoch": 0.2933384526780572,
"grad_norm": 1.6945931911468506,
"learning_rate": 0.0001618592952738263,
"loss": 1.2796,
"step": 382
},
{
"epoch": 0.2941063543866385,
"grad_norm": 1.592065691947937,
"learning_rate": 0.00016166820909178074,
"loss": 1.4813,
"step": 383
},
{
"epoch": 0.29487425609521983,
"grad_norm": 1.5409668684005737,
"learning_rate": 0.00016147675885740242,
"loss": 1.3686,
"step": 384
},
{
"epoch": 0.2956421578038011,
"grad_norm": 1.6272224187850952,
"learning_rate": 0.00016128494570089944,
"loss": 1.6073,
"step": 385
},
{
"epoch": 0.2964100595123824,
"grad_norm": 1.212295651435852,
"learning_rate": 0.0001610927707546222,
"loss": 1.1079,
"step": 386
},
{
"epoch": 0.29717796122096374,
"grad_norm": 1.5491012334823608,
"learning_rate": 0.00016090023515305703,
"loss": 1.1266,
"step": 387
},
{
"epoch": 0.297945862929545,
"grad_norm": 1.0519541501998901,
"learning_rate": 0.0001607073400328193,
"loss": 0.9496,
"step": 388
},
{
"epoch": 0.2987137646381263,
"grad_norm": 1.6982554197311401,
"learning_rate": 0.00016051408653264675,
"loss": 1.4332,
"step": 389
},
{
"epoch": 0.29948166634670764,
"grad_norm": 1.4598078727722168,
"learning_rate": 0.00016032047579339287,
"loss": 1.1593,
"step": 390
},
{
"epoch": 0.3002495680552889,
"grad_norm": 1.446446180343628,
"learning_rate": 0.00016012650895801995,
"loss": 1.3672,
"step": 391
},
{
"epoch": 0.3010174697638702,
"grad_norm": 1.4834871292114258,
"learning_rate": 0.00015993218717159254,
"loss": 1.2994,
"step": 392
},
{
"epoch": 0.30178537147245155,
"grad_norm": 1.433243989944458,
"learning_rate": 0.00015973751158127058,
"loss": 1.083,
"step": 393
},
{
"epoch": 0.3025532731810328,
"grad_norm": 1.1852916479110718,
"learning_rate": 0.00015954248333630266,
"loss": 0.9979,
"step": 394
},
{
"epoch": 0.30332117488961413,
"grad_norm": 1.4138494729995728,
"learning_rate": 0.0001593471035880193,
"loss": 0.9835,
"step": 395
},
{
"epoch": 0.30408907659819545,
"grad_norm": 1.609925389289856,
"learning_rate": 0.00015915137348982596,
"loss": 1.2299,
"step": 396
},
{
"epoch": 0.3048569783067767,
"grad_norm": 1.60574471950531,
"learning_rate": 0.00015895529419719643,
"loss": 1.3246,
"step": 397
},
{
"epoch": 0.30562488001535804,
"grad_norm": 1.3476864099502563,
"learning_rate": 0.00015875886686766597,
"loss": 1.1798,
"step": 398
},
{
"epoch": 0.30639278172393936,
"grad_norm": 1.5925241708755493,
"learning_rate": 0.00015856209266082436,
"loss": 0.8936,
"step": 399
},
{
"epoch": 0.3071606834325206,
"grad_norm": 1.489413857460022,
"learning_rate": 0.0001583649727383092,
"loss": 1.1598,
"step": 400
},
{
"epoch": 0.30792858514110194,
"grad_norm": 1.3467170000076294,
"learning_rate": 0.00015816750826379896,
"loss": 1.0321,
"step": 401
},
{
"epoch": 0.30869648684968326,
"grad_norm": 1.8774082660675049,
"learning_rate": 0.00015796970040300612,
"loss": 1.3866,
"step": 402
},
{
"epoch": 0.3094643885582645,
"grad_norm": 1.35391104221344,
"learning_rate": 0.0001577715503236704,
"loss": 0.9064,
"step": 403
},
{
"epoch": 0.31023229026684584,
"grad_norm": 1.7187341451644897,
"learning_rate": 0.00015757305919555164,
"loss": 1.3035,
"step": 404
},
{
"epoch": 0.31100019197542716,
"grad_norm": 1.5372244119644165,
"learning_rate": 0.00015737422819042313,
"loss": 1.1807,
"step": 405
},
{
"epoch": 0.31176809368400843,
"grad_norm": 1.558975100517273,
"learning_rate": 0.00015717505848206455,
"loss": 1.273,
"step": 406
},
{
"epoch": 0.31253599539258975,
"grad_norm": 1.8539949655532837,
"learning_rate": 0.00015697555124625508,
"loss": 1.3312,
"step": 407
},
{
"epoch": 0.31330389710117107,
"grad_norm": 1.5257269144058228,
"learning_rate": 0.00015677570766076652,
"loss": 1.2561,
"step": 408
},
{
"epoch": 0.31407179880975233,
"grad_norm": 1.7116575241088867,
"learning_rate": 0.0001565755289053562,
"loss": 1.4553,
"step": 409
},
{
"epoch": 0.31483970051833365,
"grad_norm": 1.2616760730743408,
"learning_rate": 0.00015637501616176005,
"loss": 1.0677,
"step": 410
},
{
"epoch": 0.315607602226915,
"grad_norm": 1.6886520385742188,
"learning_rate": 0.00015617417061368586,
"loss": 1.3973,
"step": 411
},
{
"epoch": 0.31637550393549624,
"grad_norm": 1.5470880270004272,
"learning_rate": 0.0001559729934468059,
"loss": 1.2439,
"step": 412
},
{
"epoch": 0.31714340564407756,
"grad_norm": 1.5241177082061768,
"learning_rate": 0.0001557714858487502,
"loss": 1.1146,
"step": 413
},
{
"epoch": 0.3179113073526589,
"grad_norm": 1.266050100326538,
"learning_rate": 0.00015556964900909952,
"loss": 0.9626,
"step": 414
},
{
"epoch": 0.31867920906124014,
"grad_norm": 1.4324322938919067,
"learning_rate": 0.00015536748411937814,
"loss": 1.424,
"step": 415
},
{
"epoch": 0.31944711076982146,
"grad_norm": 1.3910094499588013,
"learning_rate": 0.00015516499237304703,
"loss": 1.0697,
"step": 416
},
{
"epoch": 0.3202150124784028,
"grad_norm": 1.579870343208313,
"learning_rate": 0.00015496217496549673,
"loss": 0.9477,
"step": 417
},
{
"epoch": 0.32098291418698405,
"grad_norm": 1.598908543586731,
"learning_rate": 0.00015475903309404023,
"loss": 1.3701,
"step": 418
},
{
"epoch": 0.32175081589556537,
"grad_norm": 1.9276705980300903,
"learning_rate": 0.00015455556795790603,
"loss": 1.268,
"step": 419
},
{
"epoch": 0.3225187176041467,
"grad_norm": 1.4769033193588257,
"learning_rate": 0.000154351780758231,
"loss": 1.315,
"step": 420
},
{
"epoch": 0.32328661931272795,
"grad_norm": 1.5065767765045166,
"learning_rate": 0.00015414767269805317,
"loss": 1.2895,
"step": 421
},
{
"epoch": 0.32405452102130927,
"grad_norm": 1.3060064315795898,
"learning_rate": 0.00015394324498230487,
"loss": 1.3985,
"step": 422
},
{
"epoch": 0.3248224227298906,
"grad_norm": 1.3480304479599,
"learning_rate": 0.00015373849881780542,
"loss": 1.1568,
"step": 423
},
{
"epoch": 0.32559032443847186,
"grad_norm": 1.3555710315704346,
"learning_rate": 0.00015353343541325406,
"loss": 1.2957,
"step": 424
},
{
"epoch": 0.3263582261470532,
"grad_norm": 1.3848627805709839,
"learning_rate": 0.00015332805597922285,
"loss": 1.1681,
"step": 425
},
{
"epoch": 0.3271261278556345,
"grad_norm": 1.3289916515350342,
"learning_rate": 0.00015312236172814955,
"loss": 1.0471,
"step": 426
},
{
"epoch": 0.32789402956421576,
"grad_norm": 1.392135739326477,
"learning_rate": 0.0001529163538743303,
"loss": 1.3595,
"step": 427
},
{
"epoch": 0.3286619312727971,
"grad_norm": 1.2764785289764404,
"learning_rate": 0.00015271003363391268,
"loss": 0.9874,
"step": 428
},
{
"epoch": 0.3294298329813784,
"grad_norm": 1.879889965057373,
"learning_rate": 0.00015250340222488826,
"loss": 1.4683,
"step": 429
},
{
"epoch": 0.33019773468995967,
"grad_norm": 1.6945278644561768,
"learning_rate": 0.00015229646086708574,
"loss": 1.2251,
"step": 430
},
{
"epoch": 0.330965636398541,
"grad_norm": 1.3778647184371948,
"learning_rate": 0.0001520892107821635,
"loss": 1.1494,
"step": 431
},
{
"epoch": 0.3317335381071223,
"grad_norm": 1.1476179361343384,
"learning_rate": 0.0001518816531936024,
"loss": 1.13,
"step": 432
},
{
"epoch": 0.33250143981570357,
"grad_norm": 1.8296000957489014,
"learning_rate": 0.0001516737893266987,
"loss": 1.4141,
"step": 433
},
{
"epoch": 0.3332693415242849,
"grad_norm": 1.7736458778381348,
"learning_rate": 0.00015146562040855676,
"loss": 1.0404,
"step": 434
},
{
"epoch": 0.3340372432328662,
"grad_norm": 2.155344009399414,
"learning_rate": 0.00015125714766808167,
"loss": 1.567,
"step": 435
},
{
"epoch": 0.3348051449414475,
"grad_norm": 1.3648680448532104,
"learning_rate": 0.00015104837233597223,
"loss": 1.1135,
"step": 436
},
{
"epoch": 0.3355730466500288,
"grad_norm": 1.4443109035491943,
"learning_rate": 0.00015083929564471343,
"loss": 1.2181,
"step": 437
},
{
"epoch": 0.3363409483586101,
"grad_norm": 1.52615225315094,
"learning_rate": 0.00015062991882856946,
"loss": 1.2578,
"step": 438
},
{
"epoch": 0.3371088500671914,
"grad_norm": 1.5846048593521118,
"learning_rate": 0.00015042024312357616,
"loss": 1.2116,
"step": 439
},
{
"epoch": 0.3378767517757727,
"grad_norm": 1.6490528583526611,
"learning_rate": 0.00015021026976753385,
"loss": 1.3501,
"step": 440
},
{
"epoch": 0.338644653484354,
"grad_norm": 1.5763674974441528,
"learning_rate": 0.00015000000000000001,
"loss": 1.2794,
"step": 441
},
{
"epoch": 0.3394125551929353,
"grad_norm": 1.7455716133117676,
"learning_rate": 0.000149789435062282,
"loss": 1.3979,
"step": 442
},
{
"epoch": 0.3401804569015166,
"grad_norm": 1.2413816452026367,
"learning_rate": 0.00014957857619742957,
"loss": 1.0325,
"step": 443
},
{
"epoch": 0.3409483586100979,
"grad_norm": 1.4034167528152466,
"learning_rate": 0.0001493674246502278,
"loss": 1.0691,
"step": 444
},
{
"epoch": 0.3417162603186792,
"grad_norm": 1.4105448722839355,
"learning_rate": 0.00014915598166718945,
"loss": 1.4555,
"step": 445
},
{
"epoch": 0.3424841620272605,
"grad_norm": 1.4534122943878174,
"learning_rate": 0.00014894424849654783,
"loss": 1.2776,
"step": 446
},
{
"epoch": 0.34325206373584183,
"grad_norm": 1.3057913780212402,
"learning_rate": 0.00014873222638824937,
"loss": 1.121,
"step": 447
},
{
"epoch": 0.3440199654444231,
"grad_norm": 1.4911497831344604,
"learning_rate": 0.0001485199165939461,
"loss": 1.0317,
"step": 448
},
{
"epoch": 0.3447878671530044,
"grad_norm": 1.4716236591339111,
"learning_rate": 0.00014830732036698845,
"loss": 1.1425,
"step": 449
},
{
"epoch": 0.34555576886158573,
"grad_norm": 1.5776805877685547,
"learning_rate": 0.0001480944389624178,
"loss": 1.325,
"step": 450
},
{
"epoch": 0.346323670570167,
"grad_norm": 1.466970443725586,
"learning_rate": 0.00014788127363695897,
"loss": 1.4133,
"step": 451
},
{
"epoch": 0.3470915722787483,
"grad_norm": 1.6664904356002808,
"learning_rate": 0.00014766782564901298,
"loss": 1.358,
"step": 452
},
{
"epoch": 0.34785947398732964,
"grad_norm": 1.333856225013733,
"learning_rate": 0.00014745409625864942,
"loss": 1.1728,
"step": 453
},
{
"epoch": 0.3486273756959109,
"grad_norm": 1.8161249160766602,
"learning_rate": 0.0001472400867275992,
"loss": 1.2516,
"step": 454
},
{
"epoch": 0.3493952774044922,
"grad_norm": 1.9915804862976074,
"learning_rate": 0.00014702579831924698,
"loss": 1.6048,
"step": 455
},
{
"epoch": 0.35016317911307354,
"grad_norm": 1.4382892847061157,
"learning_rate": 0.00014681123229862367,
"loss": 1.2205,
"step": 456
},
{
"epoch": 0.3509310808216548,
"grad_norm": 1.7863125801086426,
"learning_rate": 0.0001465963899323992,
"loss": 1.3791,
"step": 457
},
{
"epoch": 0.3516989825302361,
"grad_norm": 1.5871827602386475,
"learning_rate": 0.00014638127248887473,
"loss": 1.2077,
"step": 458
},
{
"epoch": 0.35246688423881745,
"grad_norm": 1.621598720550537,
"learning_rate": 0.00014616588123797535,
"loss": 1.2283,
"step": 459
},
{
"epoch": 0.3532347859473987,
"grad_norm": 1.6450941562652588,
"learning_rate": 0.0001459502174512426,
"loss": 1.3417,
"step": 460
},
{
"epoch": 0.35400268765598003,
"grad_norm": 1.8434786796569824,
"learning_rate": 0.0001457342824018269,
"loss": 1.4919,
"step": 461
},
{
"epoch": 0.35477058936456135,
"grad_norm": 1.5148212909698486,
"learning_rate": 0.00014551807736447995,
"loss": 1.4935,
"step": 462
},
{
"epoch": 0.3555384910731426,
"grad_norm": 1.7087806463241577,
"learning_rate": 0.0001453016036155474,
"loss": 1.506,
"step": 463
},
{
"epoch": 0.35630639278172394,
"grad_norm": 1.463679552078247,
"learning_rate": 0.00014508486243296122,
"loss": 1.1589,
"step": 464
},
{
"epoch": 0.35707429449030526,
"grad_norm": 1.46501886844635,
"learning_rate": 0.00014486785509623202,
"loss": 1.5455,
"step": 465
},
{
"epoch": 0.3578421961988865,
"grad_norm": 1.5746129751205444,
"learning_rate": 0.00014465058288644174,
"loss": 1.4495,
"step": 466
},
{
"epoch": 0.35861009790746784,
"grad_norm": 1.6184135675430298,
"learning_rate": 0.00014443304708623597,
"loss": 1.0957,
"step": 467
},
{
"epoch": 0.35937799961604916,
"grad_norm": 1.584116816520691,
"learning_rate": 0.00014421524897981637,
"loss": 1.339,
"step": 468
},
{
"epoch": 0.3601459013246304,
"grad_norm": 1.3535794019699097,
"learning_rate": 0.00014399718985293297,
"loss": 1.0564,
"step": 469
},
{
"epoch": 0.36091380303321174,
"grad_norm": 1.1871492862701416,
"learning_rate": 0.00014377887099287698,
"loss": 0.9705,
"step": 470
},
{
"epoch": 0.36168170474179306,
"grad_norm": 1.4988235235214233,
"learning_rate": 0.00014356029368847264,
"loss": 1.4004,
"step": 471
},
{
"epoch": 0.36244960645037433,
"grad_norm": 1.3582288026809692,
"learning_rate": 0.0001433414592300701,
"loss": 1.1406,
"step": 472
},
{
"epoch": 0.36321750815895565,
"grad_norm": 1.7595349550247192,
"learning_rate": 0.00014312236890953744,
"loss": 1.3519,
"step": 473
},
{
"epoch": 0.36398540986753697,
"grad_norm": 1.5235776901245117,
"learning_rate": 0.00014290302402025334,
"loss": 1.0064,
"step": 474
},
{
"epoch": 0.36475331157611823,
"grad_norm": 2.060851812362671,
"learning_rate": 0.00014268342585709913,
"loss": 1.1825,
"step": 475
},
{
"epoch": 0.36552121328469955,
"grad_norm": 1.6202908754348755,
"learning_rate": 0.00014246357571645152,
"loss": 1.3926,
"step": 476
},
{
"epoch": 0.3662891149932809,
"grad_norm": 1.436496376991272,
"learning_rate": 0.00014224347489617456,
"loss": 1.4027,
"step": 477
},
{
"epoch": 0.36705701670186214,
"grad_norm": 1.5660499334335327,
"learning_rate": 0.00014202312469561228,
"loss": 1.2506,
"step": 478
},
{
"epoch": 0.36782491841044346,
"grad_norm": 1.473960041999817,
"learning_rate": 0.00014180252641558084,
"loss": 1.2378,
"step": 479
},
{
"epoch": 0.3685928201190248,
"grad_norm": 1.4252207279205322,
"learning_rate": 0.00014158168135836095,
"loss": 1.1854,
"step": 480
},
{
"epoch": 0.36936072182760604,
"grad_norm": 1.6253414154052734,
"learning_rate": 0.00014136059082769017,
"loss": 1.5745,
"step": 481
},
{
"epoch": 0.37012862353618736,
"grad_norm": 1.7292263507843018,
"learning_rate": 0.00014113925612875512,
"loss": 1.41,
"step": 482
},
{
"epoch": 0.3708965252447687,
"grad_norm": 1.7278509140014648,
"learning_rate": 0.00014091767856818388,
"loss": 1.3054,
"step": 483
},
{
"epoch": 0.37166442695334995,
"grad_norm": 1.8640563488006592,
"learning_rate": 0.00014069585945403822,
"loss": 1.5207,
"step": 484
},
{
"epoch": 0.37243232866193127,
"grad_norm": 1.6371948719024658,
"learning_rate": 0.00014047380009580594,
"loss": 1.3498,
"step": 485
},
{
"epoch": 0.3732002303705126,
"grad_norm": 1.2843165397644043,
"learning_rate": 0.00014025150180439308,
"loss": 1.0629,
"step": 486
},
{
"epoch": 0.37396813207909385,
"grad_norm": 1.539959192276001,
"learning_rate": 0.00014002896589211618,
"loss": 1.2718,
"step": 487
},
{
"epoch": 0.37473603378767517,
"grad_norm": 1.709976315498352,
"learning_rate": 0.00013980619367269455,
"loss": 1.1173,
"step": 488
},
{
"epoch": 0.3755039354962565,
"grad_norm": 1.6751576662063599,
"learning_rate": 0.00013958318646124259,
"loss": 1.5058,
"step": 489
},
{
"epoch": 0.37627183720483776,
"grad_norm": 1.8899919986724854,
"learning_rate": 0.0001393599455742618,
"loss": 1.5131,
"step": 490
},
{
"epoch": 0.3770397389134191,
"grad_norm": 1.3949605226516724,
"learning_rate": 0.00013913647232963332,
"loss": 1.194,
"step": 491
},
{
"epoch": 0.3778076406220004,
"grad_norm": 1.3396071195602417,
"learning_rate": 0.00013891276804660991,
"loss": 1.178,
"step": 492
},
{
"epoch": 0.37857554233058166,
"grad_norm": 1.8949428796768188,
"learning_rate": 0.00013868883404580823,
"loss": 1.5164,
"step": 493
},
{
"epoch": 0.379343444039163,
"grad_norm": 1.6502594947814941,
"learning_rate": 0.00013846467164920116,
"loss": 1.1968,
"step": 494
},
{
"epoch": 0.3801113457477443,
"grad_norm": 1.6355159282684326,
"learning_rate": 0.00013824028218010977,
"loss": 1.4489,
"step": 495
},
{
"epoch": 0.38087924745632556,
"grad_norm": 1.509320855140686,
"learning_rate": 0.00013801566696319562,
"loss": 1.169,
"step": 496
},
{
"epoch": 0.3816471491649069,
"grad_norm": 1.206089735031128,
"learning_rate": 0.0001377908273244531,
"loss": 0.9691,
"step": 497
},
{
"epoch": 0.3824150508734882,
"grad_norm": 1.6482902765274048,
"learning_rate": 0.0001375657645912014,
"loss": 1.0815,
"step": 498
},
{
"epoch": 0.38318295258206947,
"grad_norm": 1.538040280342102,
"learning_rate": 0.0001373404800920765,
"loss": 1.528,
"step": 499
},
{
"epoch": 0.3839508542906508,
"grad_norm": 1.4498200416564941,
"learning_rate": 0.00013711497515702398,
"loss": 1.2253,
"step": 500
},
{
"epoch": 0.3847187559992321,
"grad_norm": 1.4636632204055786,
"learning_rate": 0.0001368892511172903,
"loss": 1.3644,
"step": 501
},
{
"epoch": 0.3854866577078134,
"grad_norm": 1.5408172607421875,
"learning_rate": 0.0001366633093054157,
"loss": 1.3672,
"step": 502
},
{
"epoch": 0.3862545594163947,
"grad_norm": 1.4577535390853882,
"learning_rate": 0.00013643715105522589,
"loss": 1.2762,
"step": 503
},
{
"epoch": 0.387022461124976,
"grad_norm": 1.4348138570785522,
"learning_rate": 0.0001362107777018243,
"loss": 1.1911,
"step": 504
},
{
"epoch": 0.3877903628335573,
"grad_norm": 1.8007947206497192,
"learning_rate": 0.0001359841905815842,
"loss": 1.6611,
"step": 505
},
{
"epoch": 0.3885582645421386,
"grad_norm": 1.3817611932754517,
"learning_rate": 0.00013575739103214088,
"loss": 0.949,
"step": 506
},
{
"epoch": 0.3893261662507199,
"grad_norm": 1.5751131772994995,
"learning_rate": 0.0001355303803923836,
"loss": 1.4595,
"step": 507
},
{
"epoch": 0.3900940679593012,
"grad_norm": 1.4235345125198364,
"learning_rate": 0.00013530316000244782,
"loss": 1.1157,
"step": 508
},
{
"epoch": 0.3908619696678825,
"grad_norm": 1.4169321060180664,
"learning_rate": 0.0001350757312037072,
"loss": 1.1267,
"step": 509
},
{
"epoch": 0.3916298713764638,
"grad_norm": 1.4583044052124023,
"learning_rate": 0.00013484809533876582,
"loss": 1.0842,
"step": 510
},
{
"epoch": 0.3923977730850451,
"grad_norm": 1.3082489967346191,
"learning_rate": 0.00013462025375145,
"loss": 1.1793,
"step": 511
},
{
"epoch": 0.3931656747936264,
"grad_norm": 1.9846649169921875,
"learning_rate": 0.00013439220778680067,
"loss": 1.7083,
"step": 512
},
{
"epoch": 0.3939335765022077,
"grad_norm": 1.6533915996551514,
"learning_rate": 0.00013416395879106515,
"loss": 1.3144,
"step": 513
},
{
"epoch": 0.394701478210789,
"grad_norm": 1.5912500619888306,
"learning_rate": 0.00013393550811168948,
"loss": 1.3611,
"step": 514
},
{
"epoch": 0.3954693799193703,
"grad_norm": 1.2825086116790771,
"learning_rate": 0.00013370685709731015,
"loss": 1.2001,
"step": 515
},
{
"epoch": 0.39623728162795163,
"grad_norm": 1.493638515472412,
"learning_rate": 0.00013347800709774652,
"loss": 1.125,
"step": 516
},
{
"epoch": 0.3970051833365329,
"grad_norm": 1.3047711849212646,
"learning_rate": 0.0001332489594639924,
"loss": 1.031,
"step": 517
},
{
"epoch": 0.3977730850451142,
"grad_norm": 1.6641656160354614,
"learning_rate": 0.00013301971554820853,
"loss": 1.3513,
"step": 518
},
{
"epoch": 0.39854098675369554,
"grad_norm": 1.4950916767120361,
"learning_rate": 0.00013279027670371426,
"loss": 1.3485,
"step": 519
},
{
"epoch": 0.3993088884622768,
"grad_norm": 1.5947059392929077,
"learning_rate": 0.00013256064428497966,
"loss": 1.3895,
"step": 520
},
{
"epoch": 0.4000767901708581,
"grad_norm": 1.307298183441162,
"learning_rate": 0.00013233081964761766,
"loss": 1.3044,
"step": 521
},
{
"epoch": 0.40084469187943944,
"grad_norm": 1.5660213232040405,
"learning_rate": 0.0001321008041483758,
"loss": 1.6157,
"step": 522
},
{
"epoch": 0.40161259358802076,
"grad_norm": 1.3612569570541382,
"learning_rate": 0.0001318705991451285,
"loss": 1.226,
"step": 523
},
{
"epoch": 0.402380495296602,
"grad_norm": 1.351832628250122,
"learning_rate": 0.00013164020599686882,
"loss": 1.0547,
"step": 524
},
{
"epoch": 0.40314839700518335,
"grad_norm": 1.3935625553131104,
"learning_rate": 0.00013140962606370048,
"loss": 1.2035,
"step": 525
},
{
"epoch": 0.40391629871376467,
"grad_norm": 1.1423629522323608,
"learning_rate": 0.0001311788607068299,
"loss": 0.7527,
"step": 526
},
{
"epoch": 0.40468420042234593,
"grad_norm": 1.5600942373275757,
"learning_rate": 0.00013094791128855814,
"loss": 1.0371,
"step": 527
},
{
"epoch": 0.40545210213092725,
"grad_norm": 1.6167805194854736,
"learning_rate": 0.0001307167791722729,
"loss": 1.2223,
"step": 528
},
{
"epoch": 0.40622000383950857,
"grad_norm": 1.5059945583343506,
"learning_rate": 0.00013048546572244036,
"loss": 1.1128,
"step": 529
},
{
"epoch": 0.40698790554808983,
"grad_norm": 1.4017764329910278,
"learning_rate": 0.0001302539723045971,
"loss": 1.1994,
"step": 530
},
{
"epoch": 0.40775580725667115,
"grad_norm": 1.4649596214294434,
"learning_rate": 0.00013002230028534234,
"loss": 1.3015,
"step": 531
},
{
"epoch": 0.4085237089652525,
"grad_norm": 1.5319805145263672,
"learning_rate": 0.00012979045103232945,
"loss": 1.1543,
"step": 532
},
{
"epoch": 0.40929161067383374,
"grad_norm": 1.2714585065841675,
"learning_rate": 0.00012955842591425818,
"loss": 1.067,
"step": 533
},
{
"epoch": 0.41005951238241506,
"grad_norm": 1.4179991483688354,
"learning_rate": 0.00012932622630086648,
"loss": 1.247,
"step": 534
},
{
"epoch": 0.4108274140909964,
"grad_norm": 1.4584649801254272,
"learning_rate": 0.0001290938535629224,
"loss": 1.0901,
"step": 535
},
{
"epoch": 0.41159531579957764,
"grad_norm": 1.2720510959625244,
"learning_rate": 0.00012886130907221603,
"loss": 0.9964,
"step": 536
},
{
"epoch": 0.41236321750815896,
"grad_norm": 1.5172308683395386,
"learning_rate": 0.00012862859420155134,
"loss": 1.1805,
"step": 537
},
{
"epoch": 0.4131311192167403,
"grad_norm": 1.5134514570236206,
"learning_rate": 0.00012839571032473814,
"loss": 1.2426,
"step": 538
},
{
"epoch": 0.41389902092532155,
"grad_norm": 1.5913889408111572,
"learning_rate": 0.00012816265881658405,
"loss": 1.3287,
"step": 539
},
{
"epoch": 0.41466692263390287,
"grad_norm": 1.4839410781860352,
"learning_rate": 0.00012792944105288612,
"loss": 1.2787,
"step": 540
},
{
"epoch": 0.4154348243424842,
"grad_norm": 1.7110356092453003,
"learning_rate": 0.000127696058410423,
"loss": 1.6284,
"step": 541
},
{
"epoch": 0.41620272605106545,
"grad_norm": 1.369132161140442,
"learning_rate": 0.00012746251226694662,
"loss": 1.3544,
"step": 542
},
{
"epoch": 0.4169706277596468,
"grad_norm": 1.293350338935852,
"learning_rate": 0.00012722880400117413,
"loss": 1.1775,
"step": 543
},
{
"epoch": 0.4177385294682281,
"grad_norm": 1.5114126205444336,
"learning_rate": 0.00012699493499277983,
"loss": 1.4465,
"step": 544
},
{
"epoch": 0.41850643117680936,
"grad_norm": 1.5067218542099,
"learning_rate": 0.00012676090662238682,
"loss": 1.3107,
"step": 545
},
{
"epoch": 0.4192743328853907,
"grad_norm": 1.2761341333389282,
"learning_rate": 0.00012652672027155904,
"loss": 1.0406,
"step": 546
},
{
"epoch": 0.420042234593972,
"grad_norm": 1.493939757347107,
"learning_rate": 0.00012629237732279314,
"loss": 1.2623,
"step": 547
},
{
"epoch": 0.42081013630255326,
"grad_norm": 1.484665870666504,
"learning_rate": 0.0001260578791595101,
"loss": 1.459,
"step": 548
},
{
"epoch": 0.4215780380111346,
"grad_norm": 1.2791699171066284,
"learning_rate": 0.00012582322716604718,
"loss": 0.9361,
"step": 549
},
{
"epoch": 0.4223459397197159,
"grad_norm": 1.5707510709762573,
"learning_rate": 0.0001255884227276499,
"loss": 1.1258,
"step": 550
},
{
"epoch": 0.42311384142829717,
"grad_norm": 1.6234004497528076,
"learning_rate": 0.0001253534672304636,
"loss": 1.2735,
"step": 551
},
{
"epoch": 0.4238817431368785,
"grad_norm": 1.4539543390274048,
"learning_rate": 0.00012511836206152545,
"loss": 1.2575,
"step": 552
},
{
"epoch": 0.4246496448454598,
"grad_norm": 1.4913365840911865,
"learning_rate": 0.00012488310860875622,
"loss": 1.4831,
"step": 553
},
{
"epoch": 0.42541754655404107,
"grad_norm": 1.9435288906097412,
"learning_rate": 0.0001246477082609519,
"loss": 0.9954,
"step": 554
},
{
"epoch": 0.4261854482626224,
"grad_norm": 1.5525418519973755,
"learning_rate": 0.00012441216240777585,
"loss": 1.2844,
"step": 555
},
{
"epoch": 0.4269533499712037,
"grad_norm": 1.238305926322937,
"learning_rate": 0.0001241764724397503,
"loss": 0.8637,
"step": 556
},
{
"epoch": 0.427721251679785,
"grad_norm": 1.7352509498596191,
"learning_rate": 0.00012394063974824828,
"loss": 1.3207,
"step": 557
},
{
"epoch": 0.4284891533883663,
"grad_norm": 1.5795037746429443,
"learning_rate": 0.00012370466572548538,
"loss": 1.2913,
"step": 558
},
{
"epoch": 0.4292570550969476,
"grad_norm": 1.6189255714416504,
"learning_rate": 0.0001234685517645115,
"loss": 1.154,
"step": 559
},
{
"epoch": 0.4300249568055289,
"grad_norm": 1.5668927431106567,
"learning_rate": 0.00012323229925920273,
"loss": 1.4944,
"step": 560
},
{
"epoch": 0.4307928585141102,
"grad_norm": 1.6585174798965454,
"learning_rate": 0.00012299590960425288,
"loss": 1.2406,
"step": 561
},
{
"epoch": 0.4315607602226915,
"grad_norm": 1.6430768966674805,
"learning_rate": 0.00012275938419516552,
"loss": 1.2662,
"step": 562
},
{
"epoch": 0.4323286619312728,
"grad_norm": 1.5714935064315796,
"learning_rate": 0.0001225227244282457,
"loss": 1.2358,
"step": 563
},
{
"epoch": 0.4330965636398541,
"grad_norm": 1.551619291305542,
"learning_rate": 0.00012228593170059151,
"loss": 1.0941,
"step": 564
},
{
"epoch": 0.4338644653484354,
"grad_norm": 1.6610519886016846,
"learning_rate": 0.000122049007410086,
"loss": 1.3556,
"step": 565
},
{
"epoch": 0.4346323670570167,
"grad_norm": 1.4311244487762451,
"learning_rate": 0.00012181195295538895,
"loss": 1.1336,
"step": 566
},
{
"epoch": 0.435400268765598,
"grad_norm": 1.7137714624404907,
"learning_rate": 0.00012157476973592842,
"loss": 1.4885,
"step": 567
},
{
"epoch": 0.43616817047417933,
"grad_norm": 1.6225979328155518,
"learning_rate": 0.00012133745915189278,
"loss": 1.216,
"step": 568
},
{
"epoch": 0.4369360721827606,
"grad_norm": 1.5688353776931763,
"learning_rate": 0.00012110002260422218,
"loss": 1.2242,
"step": 569
},
{
"epoch": 0.4377039738913419,
"grad_norm": 1.724080204963684,
"learning_rate": 0.00012086246149460038,
"loss": 1.2706,
"step": 570
},
{
"epoch": 0.43847187559992323,
"grad_norm": 1.465526819229126,
"learning_rate": 0.00012062477722544656,
"loss": 1.0762,
"step": 571
},
{
"epoch": 0.4392397773085045,
"grad_norm": 1.3275350332260132,
"learning_rate": 0.00012038697119990687,
"loss": 1.2327,
"step": 572
},
{
"epoch": 0.4400076790170858,
"grad_norm": 1.5525063276290894,
"learning_rate": 0.00012014904482184633,
"loss": 1.3994,
"step": 573
},
{
"epoch": 0.44077558072566714,
"grad_norm": 1.318934679031372,
"learning_rate": 0.00011991099949584032,
"loss": 1.4836,
"step": 574
},
{
"epoch": 0.4415434824342484,
"grad_norm": 1.376668930053711,
"learning_rate": 0.00011967283662716653,
"loss": 1.2234,
"step": 575
},
{
"epoch": 0.4423113841428297,
"grad_norm": 1.4669725894927979,
"learning_rate": 0.00011943455762179654,
"loss": 1.3136,
"step": 576
},
{
"epoch": 0.44307928585141104,
"grad_norm": 1.3037981986999512,
"learning_rate": 0.00011919616388638748,
"loss": 1.2784,
"step": 577
},
{
"epoch": 0.4438471875599923,
"grad_norm": 1.8770321607589722,
"learning_rate": 0.0001189576568282738,
"loss": 1.4478,
"step": 578
},
{
"epoch": 0.4446150892685736,
"grad_norm": 1.633165717124939,
"learning_rate": 0.00011871903785545897,
"loss": 1.4224,
"step": 579
},
{
"epoch": 0.44538299097715495,
"grad_norm": 1.5755568742752075,
"learning_rate": 0.00011848030837660709,
"loss": 1.2581,
"step": 580
},
{
"epoch": 0.4461508926857362,
"grad_norm": 1.5348775386810303,
"learning_rate": 0.00011824146980103467,
"loss": 1.0031,
"step": 581
},
{
"epoch": 0.44691879439431753,
"grad_norm": 1.3276475667953491,
"learning_rate": 0.00011800252353870224,
"loss": 1.0511,
"step": 582
},
{
"epoch": 0.44768669610289885,
"grad_norm": 2.070185661315918,
"learning_rate": 0.00011776347100020602,
"loss": 1.4314,
"step": 583
},
{
"epoch": 0.4484545978114801,
"grad_norm": 1.82218337059021,
"learning_rate": 0.00011752431359676968,
"loss": 1.5193,
"step": 584
},
{
"epoch": 0.44922249952006144,
"grad_norm": 1.479628086090088,
"learning_rate": 0.00011728505274023584,
"loss": 1.1324,
"step": 585
},
{
"epoch": 0.44999040122864276,
"grad_norm": 1.273962378501892,
"learning_rate": 0.00011704568984305802,
"loss": 1.0844,
"step": 586
},
{
"epoch": 0.450758302937224,
"grad_norm": 1.537895917892456,
"learning_rate": 0.00011680622631829197,
"loss": 1.2109,
"step": 587
},
{
"epoch": 0.45152620464580534,
"grad_norm": 1.5234670639038086,
"learning_rate": 0.00011656666357958751,
"loss": 1.3014,
"step": 588
},
{
"epoch": 0.45229410635438666,
"grad_norm": 1.6489896774291992,
"learning_rate": 0.00011632700304118032,
"loss": 1.4389,
"step": 589
},
{
"epoch": 0.4530620080629679,
"grad_norm": 1.419973373413086,
"learning_rate": 0.0001160872461178832,
"loss": 1.2462,
"step": 590
},
{
"epoch": 0.45382990977154924,
"grad_norm": 1.0888590812683105,
"learning_rate": 0.00011584739422507804,
"loss": 0.9298,
"step": 591
},
{
"epoch": 0.45459781148013056,
"grad_norm": 1.7613029479980469,
"learning_rate": 0.00011560744877870748,
"loss": 1.3252,
"step": 592
},
{
"epoch": 0.45536571318871183,
"grad_norm": 1.5147452354431152,
"learning_rate": 0.00011536741119526628,
"loss": 1.3277,
"step": 593
},
{
"epoch": 0.45613361489729315,
"grad_norm": 1.3303625583648682,
"learning_rate": 0.00011512728289179323,
"loss": 1.1962,
"step": 594
},
{
"epoch": 0.45690151660587447,
"grad_norm": 1.591471552848816,
"learning_rate": 0.00011488706528586261,
"loss": 1.324,
"step": 595
},
{
"epoch": 0.45766941831445573,
"grad_norm": 1.9963793754577637,
"learning_rate": 0.00011464675979557593,
"loss": 1.4472,
"step": 596
},
{
"epoch": 0.45843732002303705,
"grad_norm": 1.6944851875305176,
"learning_rate": 0.00011440636783955356,
"loss": 1.3831,
"step": 597
},
{
"epoch": 0.4592052217316184,
"grad_norm": 1.851412296295166,
"learning_rate": 0.00011416589083692619,
"loss": 1.3932,
"step": 598
},
{
"epoch": 0.45997312344019964,
"grad_norm": 1.5847994089126587,
"learning_rate": 0.00011392533020732666,
"loss": 1.1381,
"step": 599
},
{
"epoch": 0.46074102514878096,
"grad_norm": 1.6723082065582275,
"learning_rate": 0.00011368468737088148,
"loss": 1.5375,
"step": 600
},
{
"epoch": 0.4615089268573623,
"grad_norm": 1.5877834558486938,
"learning_rate": 0.00011344396374820244,
"loss": 1.3328,
"step": 601
},
{
"epoch": 0.46227682856594354,
"grad_norm": 1.797132968902588,
"learning_rate": 0.0001132031607603783,
"loss": 1.2852,
"step": 602
},
{
"epoch": 0.46304473027452486,
"grad_norm": 1.5253082513809204,
"learning_rate": 0.0001129622798289663,
"loss": 1.2521,
"step": 603
},
{
"epoch": 0.4638126319831062,
"grad_norm": 1.8899868726730347,
"learning_rate": 0.00011272132237598376,
"loss": 1.3255,
"step": 604
},
{
"epoch": 0.46458053369168745,
"grad_norm": 1.6422898769378662,
"learning_rate": 0.00011248028982389989,
"loss": 1.1699,
"step": 605
},
{
"epoch": 0.46534843540026877,
"grad_norm": 1.5534974336624146,
"learning_rate": 0.00011223918359562708,
"loss": 1.2355,
"step": 606
},
{
"epoch": 0.4661163371088501,
"grad_norm": 1.4911137819290161,
"learning_rate": 0.00011199800511451273,
"loss": 1.0746,
"step": 607
},
{
"epoch": 0.46688423881743135,
"grad_norm": 1.4544899463653564,
"learning_rate": 0.0001117567558043308,
"loss": 1.1927,
"step": 608
},
{
"epoch": 0.46765214052601267,
"grad_norm": 1.744524598121643,
"learning_rate": 0.00011151543708927335,
"loss": 1.6512,
"step": 609
},
{
"epoch": 0.468420042234594,
"grad_norm": 1.4004900455474854,
"learning_rate": 0.00011127405039394216,
"loss": 1.0693,
"step": 610
},
{
"epoch": 0.46918794394317526,
"grad_norm": 1.452285885810852,
"learning_rate": 0.00011103259714334034,
"loss": 1.0925,
"step": 611
},
{
"epoch": 0.4699558456517566,
"grad_norm": 1.1670202016830444,
"learning_rate": 0.00011079107876286387,
"loss": 0.9021,
"step": 612
},
{
"epoch": 0.4707237473603379,
"grad_norm": 1.809617042541504,
"learning_rate": 0.0001105494966782933,
"loss": 1.3879,
"step": 613
},
{
"epoch": 0.47149164906891916,
"grad_norm": 1.4755923748016357,
"learning_rate": 0.0001103078523157852,
"loss": 1.2053,
"step": 614
},
{
"epoch": 0.4722595507775005,
"grad_norm": 2.111751079559326,
"learning_rate": 0.00011006614710186372,
"loss": 1.5057,
"step": 615
},
{
"epoch": 0.4730274524860818,
"grad_norm": 1.3958059549331665,
"learning_rate": 0.00010982438246341238,
"loss": 1.196,
"step": 616
},
{
"epoch": 0.47379535419466307,
"grad_norm": 1.6343685388565063,
"learning_rate": 0.00010958255982766538,
"loss": 1.2389,
"step": 617
},
{
"epoch": 0.4745632559032444,
"grad_norm": 1.746578335762024,
"learning_rate": 0.00010934068062219945,
"loss": 1.3113,
"step": 618
},
{
"epoch": 0.4753311576118257,
"grad_norm": 1.474244475364685,
"learning_rate": 0.0001090987462749251,
"loss": 1.2434,
"step": 619
},
{
"epoch": 0.47609905932040697,
"grad_norm": 1.774789810180664,
"learning_rate": 0.00010885675821407844,
"loss": 1.3742,
"step": 620
},
{
"epoch": 0.4768669610289883,
"grad_norm": 1.5622092485427856,
"learning_rate": 0.00010861471786821275,
"loss": 1.2501,
"step": 621
},
{
"epoch": 0.4776348627375696,
"grad_norm": 1.8582671880722046,
"learning_rate": 0.00010837262666618983,
"loss": 1.3851,
"step": 622
},
{
"epoch": 0.4784027644461509,
"grad_norm": 1.37471342086792,
"learning_rate": 0.00010813048603717182,
"loss": 1.2147,
"step": 623
},
{
"epoch": 0.4791706661547322,
"grad_norm": 1.9024494886398315,
"learning_rate": 0.0001078882974106126,
"loss": 1.4406,
"step": 624
},
{
"epoch": 0.4799385678633135,
"grad_norm": 1.6293984651565552,
"learning_rate": 0.00010764606221624933,
"loss": 1.2317,
"step": 625
},
{
"epoch": 0.4807064695718948,
"grad_norm": 1.6357662677764893,
"learning_rate": 0.00010740378188409426,
"loss": 1.1184,
"step": 626
},
{
"epoch": 0.4814743712804761,
"grad_norm": 1.6468091011047363,
"learning_rate": 0.00010716145784442593,
"loss": 1.2806,
"step": 627
},
{
"epoch": 0.4822422729890574,
"grad_norm": 1.6838487386703491,
"learning_rate": 0.00010691909152778094,
"loss": 1.313,
"step": 628
},
{
"epoch": 0.4830101746976387,
"grad_norm": 1.7010256052017212,
"learning_rate": 0.00010667668436494558,
"loss": 1.2907,
"step": 629
},
{
"epoch": 0.48377807640622,
"grad_norm": 1.6259071826934814,
"learning_rate": 0.00010643423778694712,
"loss": 1.6014,
"step": 630
},
{
"epoch": 0.4845459781148013,
"grad_norm": 1.5683881044387817,
"learning_rate": 0.0001061917532250456,
"loss": 1.3454,
"step": 631
},
{
"epoch": 0.4853138798233826,
"grad_norm": 1.4420804977416992,
"learning_rate": 0.00010594923211072532,
"loss": 1.4059,
"step": 632
},
{
"epoch": 0.4860817815319639,
"grad_norm": 1.7529085874557495,
"learning_rate": 0.00010570667587568626,
"loss": 1.3871,
"step": 633
},
{
"epoch": 0.48684968324054523,
"grad_norm": 1.5442010164260864,
"learning_rate": 0.00010546408595183578,
"loss": 1.0737,
"step": 634
},
{
"epoch": 0.4876175849491265,
"grad_norm": 1.6395798921585083,
"learning_rate": 0.00010522146377128021,
"loss": 1.2025,
"step": 635
},
{
"epoch": 0.4883854866577078,
"grad_norm": 1.6505008935928345,
"learning_rate": 0.00010497881076631615,
"loss": 1.231,
"step": 636
},
{
"epoch": 0.48915338836628913,
"grad_norm": 1.7394682168960571,
"learning_rate": 0.00010473612836942226,
"loss": 1.6761,
"step": 637
},
{
"epoch": 0.4899212900748704,
"grad_norm": 1.410757303237915,
"learning_rate": 0.00010449341801325073,
"loss": 1.2553,
"step": 638
},
{
"epoch": 0.4906891917834517,
"grad_norm": 1.3731025457382202,
"learning_rate": 0.00010425068113061873,
"loss": 1.4168,
"step": 639
},
{
"epoch": 0.49145709349203304,
"grad_norm": 1.296189546585083,
"learning_rate": 0.00010400791915450009,
"loss": 1.105,
"step": 640
},
{
"epoch": 0.4922249952006143,
"grad_norm": 1.17586350440979,
"learning_rate": 0.00010376513351801673,
"loss": 0.9808,
"step": 641
},
{
"epoch": 0.4929928969091956,
"grad_norm": 2.6915199756622314,
"learning_rate": 0.00010352232565443032,
"loss": 1.4701,
"step": 642
},
{
"epoch": 0.49376079861777694,
"grad_norm": 1.942378044128418,
"learning_rate": 0.00010327949699713366,
"loss": 1.3895,
"step": 643
},
{
"epoch": 0.4945287003263582,
"grad_norm": 1.7854747772216797,
"learning_rate": 0.00010303664897964232,
"loss": 1.1927,
"step": 644
},
{
"epoch": 0.4952966020349395,
"grad_norm": 1.4492199420928955,
"learning_rate": 0.00010279378303558624,
"loss": 1.4889,
"step": 645
},
{
"epoch": 0.49606450374352085,
"grad_norm": 1.6698827743530273,
"learning_rate": 0.00010255090059870107,
"loss": 1.4768,
"step": 646
},
{
"epoch": 0.4968324054521021,
"grad_norm": 1.533857822418213,
"learning_rate": 0.00010230800310281992,
"loss": 1.3059,
"step": 647
},
{
"epoch": 0.49760030716068343,
"grad_norm": 1.816888689994812,
"learning_rate": 0.00010206509198186476,
"loss": 1.5369,
"step": 648
},
{
"epoch": 0.49836820886926475,
"grad_norm": 1.3209645748138428,
"learning_rate": 0.00010182216866983796,
"loss": 1.2222,
"step": 649
},
{
"epoch": 0.499136110577846,
"grad_norm": 1.441152811050415,
"learning_rate": 0.00010157923460081394,
"loss": 1.4053,
"step": 650
},
{
"epoch": 0.49990401228642733,
"grad_norm": 1.3601536750793457,
"learning_rate": 0.00010133629120893055,
"loss": 1.3098,
"step": 651
},
{
"epoch": 0.5006719139950087,
"grad_norm": 1.6354866027832031,
"learning_rate": 0.00010109333992838072,
"loss": 1.2891,
"step": 652
},
{
"epoch": 0.5006719139950087,
"eval_loss": 1.273740291595459,
"eval_runtime": 19.7598,
"eval_samples_per_second": 27.784,
"eval_steps_per_second": 13.917,
"step": 652
},
{
"epoch": 0.50143981570359,
"grad_norm": 1.54447603225708,
"learning_rate": 0.00010085038219340393,
"loss": 1.296,
"step": 653
},
{
"epoch": 0.5022077174121713,
"grad_norm": 1.8366080522537231,
"learning_rate": 0.00010060741943827776,
"loss": 1.6111,
"step": 654
},
{
"epoch": 0.5029756191207525,
"grad_norm": 1.22926926612854,
"learning_rate": 0.00010036445309730944,
"loss": 1.0101,
"step": 655
},
{
"epoch": 0.5037435208293338,
"grad_norm": 1.0615041255950928,
"learning_rate": 0.00010012148460482738,
"loss": 0.8333,
"step": 656
},
{
"epoch": 0.5045114225379151,
"grad_norm": 1.7094695568084717,
"learning_rate": 9.987851539517262e-05,
"loss": 1.094,
"step": 657
},
{
"epoch": 0.5052793242464965,
"grad_norm": 1.4576027393341064,
"learning_rate": 9.963554690269058e-05,
"loss": 0.957,
"step": 658
},
{
"epoch": 0.5060472259550778,
"grad_norm": 1.4120879173278809,
"learning_rate": 9.939258056172225e-05,
"loss": 1.1924,
"step": 659
},
{
"epoch": 0.5068151276636591,
"grad_norm": 1.4560344219207764,
"learning_rate": 9.914961780659609e-05,
"loss": 1.0673,
"step": 660
},
{
"epoch": 0.5075830293722403,
"grad_norm": 1.4893819093704224,
"learning_rate": 9.890666007161929e-05,
"loss": 1.1459,
"step": 661
},
{
"epoch": 0.5083509310808216,
"grad_norm": 1.5299136638641357,
"learning_rate": 9.866370879106947e-05,
"loss": 1.2983,
"step": 662
},
{
"epoch": 0.509118832789403,
"grad_norm": 1.6000056266784668,
"learning_rate": 9.84207653991861e-05,
"loss": 1.0902,
"step": 663
},
{
"epoch": 0.5098867344979843,
"grad_norm": 1.5117030143737793,
"learning_rate": 9.817783133016206e-05,
"loss": 1.2475,
"step": 664
},
{
"epoch": 0.5106546362065656,
"grad_norm": 1.7616103887557983,
"learning_rate": 9.793490801813528e-05,
"loss": 1.4153,
"step": 665
},
{
"epoch": 0.5114225379151469,
"grad_norm": 1.6097863912582397,
"learning_rate": 9.769199689718009e-05,
"loss": 1.3362,
"step": 666
},
{
"epoch": 0.5121904396237281,
"grad_norm": 1.431277871131897,
"learning_rate": 9.744909940129895e-05,
"loss": 1.2254,
"step": 667
},
{
"epoch": 0.5129583413323094,
"grad_norm": 1.4436259269714355,
"learning_rate": 9.720621696441378e-05,
"loss": 1.1335,
"step": 668
},
{
"epoch": 0.5137262430408908,
"grad_norm": 1.7994132041931152,
"learning_rate": 9.69633510203577e-05,
"loss": 1.3912,
"step": 669
},
{
"epoch": 0.5144941447494721,
"grad_norm": 1.828708529472351,
"learning_rate": 9.672050300286636e-05,
"loss": 1.5837,
"step": 670
},
{
"epoch": 0.5152620464580534,
"grad_norm": 1.592246651649475,
"learning_rate": 9.64776743455697e-05,
"loss": 1.3176,
"step": 671
},
{
"epoch": 0.5160299481666347,
"grad_norm": 1.7357617616653442,
"learning_rate": 9.623486648198326e-05,
"loss": 1.47,
"step": 672
},
{
"epoch": 0.5167978498752159,
"grad_norm": 1.6944161653518677,
"learning_rate": 9.599208084549993e-05,
"loss": 1.4586,
"step": 673
},
{
"epoch": 0.5175657515837973,
"grad_norm": 1.827109456062317,
"learning_rate": 9.574931886938128e-05,
"loss": 1.2451,
"step": 674
},
{
"epoch": 0.5183336532923786,
"grad_norm": 1.3678934574127197,
"learning_rate": 9.550658198674931e-05,
"loss": 1.2403,
"step": 675
},
{
"epoch": 0.5191015550009599,
"grad_norm": 0.994289755821228,
"learning_rate": 9.526387163057777e-05,
"loss": 0.8628,
"step": 676
},
{
"epoch": 0.5198694567095412,
"grad_norm": 1.3589410781860352,
"learning_rate": 9.502118923368388e-05,
"loss": 0.8397,
"step": 677
},
{
"epoch": 0.5206373584181225,
"grad_norm": 1.3971327543258667,
"learning_rate": 9.477853622871984e-05,
"loss": 1.1653,
"step": 678
},
{
"epoch": 0.5214052601267037,
"grad_norm": 1.3670016527175903,
"learning_rate": 9.453591404816423e-05,
"loss": 1.0544,
"step": 679
},
{
"epoch": 0.5221731618352851,
"grad_norm": 1.3910092115402222,
"learning_rate": 9.429332412431377e-05,
"loss": 1.2027,
"step": 680
},
{
"epoch": 0.5229410635438664,
"grad_norm": 1.4565633535385132,
"learning_rate": 9.405076788927469e-05,
"loss": 1.3962,
"step": 681
},
{
"epoch": 0.5237089652524477,
"grad_norm": 1.4770212173461914,
"learning_rate": 9.380824677495441e-05,
"loss": 1.3789,
"step": 682
},
{
"epoch": 0.524476866961029,
"grad_norm": 1.5948761701583862,
"learning_rate": 9.356576221305289e-05,
"loss": 1.4457,
"step": 683
},
{
"epoch": 0.5252447686696103,
"grad_norm": 1.4264615774154663,
"learning_rate": 9.332331563505444e-05,
"loss": 1.0041,
"step": 684
},
{
"epoch": 0.5260126703781915,
"grad_norm": 1.5794204473495483,
"learning_rate": 9.308090847221905e-05,
"loss": 1.3674,
"step": 685
},
{
"epoch": 0.5267805720867729,
"grad_norm": 1.5819778442382812,
"learning_rate": 9.283854215557409e-05,
"loss": 1.5201,
"step": 686
},
{
"epoch": 0.5275484737953542,
"grad_norm": 1.3187499046325684,
"learning_rate": 9.259621811590578e-05,
"loss": 1.1523,
"step": 687
},
{
"epoch": 0.5283163755039355,
"grad_norm": 1.585579514503479,
"learning_rate": 9.235393778375068e-05,
"loss": 1.2612,
"step": 688
},
{
"epoch": 0.5290842772125168,
"grad_norm": 1.7242546081542969,
"learning_rate": 9.211170258938747e-05,
"loss": 1.374,
"step": 689
},
{
"epoch": 0.5298521789210981,
"grad_norm": 1.4706342220306396,
"learning_rate": 9.18695139628282e-05,
"loss": 1.264,
"step": 690
},
{
"epoch": 0.5306200806296794,
"grad_norm": 1.520382046699524,
"learning_rate": 9.162737333381019e-05,
"loss": 1.4948,
"step": 691
},
{
"epoch": 0.5313879823382607,
"grad_norm": 1.598055362701416,
"learning_rate": 9.138528213178727e-05,
"loss": 1.4628,
"step": 692
},
{
"epoch": 0.532155884046842,
"grad_norm": 1.2860671281814575,
"learning_rate": 9.11432417859216e-05,
"loss": 1.0033,
"step": 693
},
{
"epoch": 0.5329237857554233,
"grad_norm": 1.4247864484786987,
"learning_rate": 9.090125372507492e-05,
"loss": 1.1321,
"step": 694
},
{
"epoch": 0.5336916874640046,
"grad_norm": 1.3131632804870605,
"learning_rate": 9.065931937780059e-05,
"loss": 0.9775,
"step": 695
},
{
"epoch": 0.534459589172586,
"grad_norm": 1.3943092823028564,
"learning_rate": 9.041744017233462e-05,
"loss": 1.1079,
"step": 696
},
{
"epoch": 0.5352274908811672,
"grad_norm": 1.3534505367279053,
"learning_rate": 9.017561753658764e-05,
"loss": 0.9166,
"step": 697
},
{
"epoch": 0.5359953925897485,
"grad_norm": 1.6207268238067627,
"learning_rate": 8.993385289813627e-05,
"loss": 1.1386,
"step": 698
},
{
"epoch": 0.5367632942983298,
"grad_norm": 1.4252740144729614,
"learning_rate": 8.969214768421483e-05,
"loss": 1.2191,
"step": 699
},
{
"epoch": 0.5375311960069111,
"grad_norm": 1.3587523698806763,
"learning_rate": 8.945050332170672e-05,
"loss": 1.1997,
"step": 700
},
{
"epoch": 0.5382990977154924,
"grad_norm": 1.596205234527588,
"learning_rate": 8.920892123713614e-05,
"loss": 1.1552,
"step": 701
},
{
"epoch": 0.5390669994240738,
"grad_norm": 1.779982089996338,
"learning_rate": 8.89674028566597e-05,
"loss": 1.46,
"step": 702
},
{
"epoch": 0.539834901132655,
"grad_norm": 1.493944525718689,
"learning_rate": 8.872594960605785e-05,
"loss": 1.238,
"step": 703
},
{
"epoch": 0.5406028028412363,
"grad_norm": 1.5526620149612427,
"learning_rate": 8.848456291072666e-05,
"loss": 1.3413,
"step": 704
},
{
"epoch": 0.5413707045498176,
"grad_norm": 1.4706637859344482,
"learning_rate": 8.82432441956692e-05,
"loss": 1.2897,
"step": 705
},
{
"epoch": 0.5421386062583989,
"grad_norm": 1.2133674621582031,
"learning_rate": 8.80019948854873e-05,
"loss": 0.9593,
"step": 706
},
{
"epoch": 0.5429065079669803,
"grad_norm": 1.2873600721359253,
"learning_rate": 8.776081640437294e-05,
"loss": 1.118,
"step": 707
},
{
"epoch": 0.5436744096755616,
"grad_norm": 1.5350018739700317,
"learning_rate": 8.751971017610012e-05,
"loss": 1.3241,
"step": 708
},
{
"epoch": 0.5444423113841428,
"grad_norm": 1.4526116847991943,
"learning_rate": 8.727867762401623e-05,
"loss": 1.2734,
"step": 709
},
{
"epoch": 0.5452102130927241,
"grad_norm": 1.551858901977539,
"learning_rate": 8.703772017103372e-05,
"loss": 1.3445,
"step": 710
},
{
"epoch": 0.5459781148013054,
"grad_norm": 1.0594290494918823,
"learning_rate": 8.679683923962174e-05,
"loss": 0.891,
"step": 711
},
{
"epoch": 0.5467460165098867,
"grad_norm": 1.4673049449920654,
"learning_rate": 8.655603625179759e-05,
"loss": 1.3731,
"step": 712
},
{
"epoch": 0.5475139182184681,
"grad_norm": 1.2594002485275269,
"learning_rate": 8.631531262911857e-05,
"loss": 1.0889,
"step": 713
},
{
"epoch": 0.5482818199270494,
"grad_norm": 1.3199065923690796,
"learning_rate": 8.607466979267338e-05,
"loss": 1.1011,
"step": 714
},
{
"epoch": 0.5490497216356306,
"grad_norm": 1.752083659172058,
"learning_rate": 8.583410916307386e-05,
"loss": 1.4096,
"step": 715
},
{
"epoch": 0.5498176233442119,
"grad_norm": 1.4408888816833496,
"learning_rate": 8.559363216044647e-05,
"loss": 1.2785,
"step": 716
},
{
"epoch": 0.5505855250527932,
"grad_norm": 1.881710410118103,
"learning_rate": 8.53532402044241e-05,
"loss": 1.854,
"step": 717
},
{
"epoch": 0.5513534267613746,
"grad_norm": 1.3627890348434448,
"learning_rate": 8.51129347141374e-05,
"loss": 1.097,
"step": 718
},
{
"epoch": 0.5521213284699559,
"grad_norm": 1.4654157161712646,
"learning_rate": 8.487271710820681e-05,
"loss": 1.3027,
"step": 719
},
{
"epoch": 0.5528892301785372,
"grad_norm": 1.7268285751342773,
"learning_rate": 8.463258880473373e-05,
"loss": 1.3451,
"step": 720
},
{
"epoch": 0.5536571318871184,
"grad_norm": 1.112960934638977,
"learning_rate": 8.439255122129254e-05,
"loss": 0.9304,
"step": 721
},
{
"epoch": 0.5544250335956997,
"grad_norm": 1.4320957660675049,
"learning_rate": 8.415260577492195e-05,
"loss": 1.2068,
"step": 722
},
{
"epoch": 0.555192935304281,
"grad_norm": 1.6056063175201416,
"learning_rate": 8.391275388211684e-05,
"loss": 1.1656,
"step": 723
},
{
"epoch": 0.5559608370128624,
"grad_norm": 1.415808081626892,
"learning_rate": 8.367299695881973e-05,
"loss": 1.3507,
"step": 724
},
{
"epoch": 0.5567287387214437,
"grad_norm": 1.5225753784179688,
"learning_rate": 8.34333364204125e-05,
"loss": 1.4994,
"step": 725
},
{
"epoch": 0.557496640430025,
"grad_norm": 1.4271262884140015,
"learning_rate": 8.319377368170808e-05,
"loss": 1.2742,
"step": 726
},
{
"epoch": 0.5582645421386062,
"grad_norm": 1.217391848564148,
"learning_rate": 8.295431015694202e-05,
"loss": 1.0471,
"step": 727
},
{
"epoch": 0.5590324438471875,
"grad_norm": 1.5793488025665283,
"learning_rate": 8.271494725976418e-05,
"loss": 1.3258,
"step": 728
},
{
"epoch": 0.5598003455557689,
"grad_norm": 1.4085965156555176,
"learning_rate": 8.247568640323036e-05,
"loss": 1.2015,
"step": 729
},
{
"epoch": 0.5605682472643502,
"grad_norm": 1.4623831510543823,
"learning_rate": 8.223652899979402e-05,
"loss": 1.1587,
"step": 730
},
{
"epoch": 0.5613361489729315,
"grad_norm": 1.187821626663208,
"learning_rate": 8.199747646129775e-05,
"loss": 1.0068,
"step": 731
},
{
"epoch": 0.5621040506815128,
"grad_norm": 1.485092282295227,
"learning_rate": 8.175853019896534e-05,
"loss": 1.3343,
"step": 732
},
{
"epoch": 0.562871952390094,
"grad_norm": 1.3452789783477783,
"learning_rate": 8.15196916233929e-05,
"loss": 1.2402,
"step": 733
},
{
"epoch": 0.5636398540986753,
"grad_norm": 1.7040382623672485,
"learning_rate": 8.128096214454105e-05,
"loss": 1.1712,
"step": 734
},
{
"epoch": 0.5644077558072567,
"grad_norm": 1.5438884496688843,
"learning_rate": 8.104234317172621e-05,
"loss": 1.1095,
"step": 735
},
{
"epoch": 0.565175657515838,
"grad_norm": 1.173060417175293,
"learning_rate": 8.080383611361254e-05,
"loss": 0.9693,
"step": 736
},
{
"epoch": 0.5659435592244193,
"grad_norm": 1.4293849468231201,
"learning_rate": 8.056544237820351e-05,
"loss": 1.0731,
"step": 737
},
{
"epoch": 0.5667114609330006,
"grad_norm": 1.4654078483581543,
"learning_rate": 8.03271633728335e-05,
"loss": 1.4003,
"step": 738
},
{
"epoch": 0.5674793626415818,
"grad_norm": 1.4996401071548462,
"learning_rate": 8.008900050415973e-05,
"loss": 1.117,
"step": 739
},
{
"epoch": 0.5682472643501632,
"grad_norm": 1.5051236152648926,
"learning_rate": 7.985095517815371e-05,
"loss": 1.5012,
"step": 740
},
{
"epoch": 0.5690151660587445,
"grad_norm": 1.4953557252883911,
"learning_rate": 7.961302880009314e-05,
"loss": 1.2279,
"step": 741
},
{
"epoch": 0.5697830677673258,
"grad_norm": 1.3291648626327515,
"learning_rate": 7.937522277455343e-05,
"loss": 1.1494,
"step": 742
},
{
"epoch": 0.5705509694759071,
"grad_norm": 1.496016025543213,
"learning_rate": 7.913753850539964e-05,
"loss": 1.2484,
"step": 743
},
{
"epoch": 0.5713188711844884,
"grad_norm": 1.4508578777313232,
"learning_rate": 7.889997739577783e-05,
"loss": 1.1512,
"step": 744
},
{
"epoch": 0.5720867728930696,
"grad_norm": 1.3858839273452759,
"learning_rate": 7.866254084810724e-05,
"loss": 1.3323,
"step": 745
},
{
"epoch": 0.572854674601651,
"grad_norm": 1.4042750597000122,
"learning_rate": 7.842523026407159e-05,
"loss": 0.9763,
"step": 746
},
{
"epoch": 0.5736225763102323,
"grad_norm": 1.4263511896133423,
"learning_rate": 7.818804704461108e-05,
"loss": 1.1,
"step": 747
},
{
"epoch": 0.5743904780188136,
"grad_norm": 1.4217872619628906,
"learning_rate": 7.795099258991404e-05,
"loss": 1.001,
"step": 748
},
{
"epoch": 0.5751583797273949,
"grad_norm": 1.5163894891738892,
"learning_rate": 7.771406829940852e-05,
"loss": 1.1633,
"step": 749
},
{
"epoch": 0.5759262814359762,
"grad_norm": 1.423274278640747,
"learning_rate": 7.747727557175434e-05,
"loss": 1.4725,
"step": 750
},
{
"epoch": 0.5766941831445574,
"grad_norm": 1.5022209882736206,
"learning_rate": 7.724061580483449e-05,
"loss": 1.1757,
"step": 751
},
{
"epoch": 0.5774620848531388,
"grad_norm": 1.3035588264465332,
"learning_rate": 7.700409039574717e-05,
"loss": 1.0345,
"step": 752
},
{
"epoch": 0.5782299865617201,
"grad_norm": 1.6852291822433472,
"learning_rate": 7.676770074079732e-05,
"loss": 1.1958,
"step": 753
},
{
"epoch": 0.5789978882703014,
"grad_norm": 1.4031083583831787,
"learning_rate": 7.653144823548852e-05,
"loss": 1.3684,
"step": 754
},
{
"epoch": 0.5797657899788827,
"grad_norm": 1.4951286315917969,
"learning_rate": 7.62953342745146e-05,
"loss": 1.4351,
"step": 755
},
{
"epoch": 0.580533691687464,
"grad_norm": 1.273272156715393,
"learning_rate": 7.605936025175174e-05,
"loss": 1.1404,
"step": 756
},
{
"epoch": 0.5813015933960453,
"grad_norm": 1.4025629758834839,
"learning_rate": 7.582352756024971e-05,
"loss": 1.4564,
"step": 757
},
{
"epoch": 0.5820694951046266,
"grad_norm": 1.344117283821106,
"learning_rate": 7.558783759222417e-05,
"loss": 1.1624,
"step": 758
},
{
"epoch": 0.5828373968132079,
"grad_norm": 1.3355436325073242,
"learning_rate": 7.535229173904811e-05,
"loss": 1.043,
"step": 759
},
{
"epoch": 0.5836052985217892,
"grad_norm": 1.2501863241195679,
"learning_rate": 7.511689139124382e-05,
"loss": 1.2641,
"step": 760
},
{
"epoch": 0.5843732002303705,
"grad_norm": 1.3736534118652344,
"learning_rate": 7.488163793847458e-05,
"loss": 1.1726,
"step": 761
},
{
"epoch": 0.5851411019389519,
"grad_norm": 1.429854154586792,
"learning_rate": 7.464653276953643e-05,
"loss": 1.2585,
"step": 762
},
{
"epoch": 0.5859090036475331,
"grad_norm": 1.7089576721191406,
"learning_rate": 7.441157727235015e-05,
"loss": 1.5036,
"step": 763
},
{
"epoch": 0.5866769053561144,
"grad_norm": 1.3912054300308228,
"learning_rate": 7.417677283395284e-05,
"loss": 1.0993,
"step": 764
},
{
"epoch": 0.5874448070646957,
"grad_norm": 1.7586991786956787,
"learning_rate": 7.394212084048995e-05,
"loss": 1.2544,
"step": 765
},
{
"epoch": 0.588212708773277,
"grad_norm": 1.2912665605545044,
"learning_rate": 7.370762267720685e-05,
"loss": 1.0896,
"step": 766
},
{
"epoch": 0.5889806104818583,
"grad_norm": 1.5114494562149048,
"learning_rate": 7.347327972844096e-05,
"loss": 1.1424,
"step": 767
},
{
"epoch": 0.5897485121904397,
"grad_norm": 1.5202648639678955,
"learning_rate": 7.323909337761317e-05,
"loss": 1.2709,
"step": 768
},
{
"epoch": 0.5905164138990209,
"grad_norm": 1.5682910680770874,
"learning_rate": 7.30050650072202e-05,
"loss": 1.5064,
"step": 769
},
{
"epoch": 0.5912843156076022,
"grad_norm": 1.7039157152175903,
"learning_rate": 7.277119599882586e-05,
"loss": 1.1595,
"step": 770
},
{
"epoch": 0.5920522173161835,
"grad_norm": 6.636146068572998,
"learning_rate": 7.25374877330534e-05,
"loss": 0.9431,
"step": 771
},
{
"epoch": 0.5928201190247648,
"grad_norm": 1.5151164531707764,
"learning_rate": 7.230394158957705e-05,
"loss": 1.2479,
"step": 772
},
{
"epoch": 0.5935880207333462,
"grad_norm": 1.6289341449737549,
"learning_rate": 7.20705589471139e-05,
"loss": 1.2524,
"step": 773
},
{
"epoch": 0.5943559224419275,
"grad_norm": 1.5332694053649902,
"learning_rate": 7.1837341183416e-05,
"loss": 1.4389,
"step": 774
},
{
"epoch": 0.5951238241505087,
"grad_norm": 1.4293265342712402,
"learning_rate": 7.160428967526187e-05,
"loss": 1.1761,
"step": 775
},
{
"epoch": 0.59589172585909,
"grad_norm": 1.1657414436340332,
"learning_rate": 7.137140579844871e-05,
"loss": 0.9015,
"step": 776
},
{
"epoch": 0.5966596275676713,
"grad_norm": 1.196415662765503,
"learning_rate": 7.1138690927784e-05,
"loss": 1.0494,
"step": 777
},
{
"epoch": 0.5974275292762526,
"grad_norm": 1.522498369216919,
"learning_rate": 7.090614643707762e-05,
"loss": 1.2778,
"step": 778
},
{
"epoch": 0.598195430984834,
"grad_norm": 1.7265686988830566,
"learning_rate": 7.067377369913352e-05,
"loss": 1.3857,
"step": 779
},
{
"epoch": 0.5989633326934153,
"grad_norm": 1.781242847442627,
"learning_rate": 7.044157408574185e-05,
"loss": 1.8548,
"step": 780
},
{
"epoch": 0.5997312344019965,
"grad_norm": 1.2553101778030396,
"learning_rate": 7.020954896767058e-05,
"loss": 1.0755,
"step": 781
},
{
"epoch": 0.6004991361105778,
"grad_norm": 1.6484979391098022,
"learning_rate": 6.997769971465769e-05,
"loss": 1.3693,
"step": 782
},
{
"epoch": 0.6012670378191591,
"grad_norm": 2.0107421875,
"learning_rate": 6.974602769540289e-05,
"loss": 1.3331,
"step": 783
},
{
"epoch": 0.6020349395277405,
"grad_norm": 1.945434331893921,
"learning_rate": 6.951453427755968e-05,
"loss": 1.468,
"step": 784
},
{
"epoch": 0.6028028412363218,
"grad_norm": 1.637454628944397,
"learning_rate": 6.928322082772712e-05,
"loss": 1.3608,
"step": 785
},
{
"epoch": 0.6035707429449031,
"grad_norm": 1.5328805446624756,
"learning_rate": 6.905208871144187e-05,
"loss": 1.2353,
"step": 786
},
{
"epoch": 0.6043386446534843,
"grad_norm": 1.452931523323059,
"learning_rate": 6.882113929317015e-05,
"loss": 1.1362,
"step": 787
},
{
"epoch": 0.6051065463620656,
"grad_norm": 1.3005062341690063,
"learning_rate": 6.859037393629957e-05,
"loss": 1.0392,
"step": 788
},
{
"epoch": 0.6058744480706469,
"grad_norm": 1.3638523817062378,
"learning_rate": 6.835979400313122e-05,
"loss": 1.2327,
"step": 789
},
{
"epoch": 0.6066423497792283,
"grad_norm": 1.7218124866485596,
"learning_rate": 6.81294008548715e-05,
"loss": 1.3321,
"step": 790
},
{
"epoch": 0.6074102514878096,
"grad_norm": 1.0565749406814575,
"learning_rate": 6.789919585162423e-05,
"loss": 0.9172,
"step": 791
},
{
"epoch": 0.6081781531963909,
"grad_norm": 1.1628111600875854,
"learning_rate": 6.766918035238237e-05,
"loss": 1.0269,
"step": 792
},
{
"epoch": 0.6089460549049721,
"grad_norm": 1.5882484912872314,
"learning_rate": 6.743935571502038e-05,
"loss": 1.281,
"step": 793
},
{
"epoch": 0.6097139566135534,
"grad_norm": 1.0573451519012451,
"learning_rate": 6.720972329628577e-05,
"loss": 0.681,
"step": 794
},
{
"epoch": 0.6104818583221348,
"grad_norm": 1.6153432130813599,
"learning_rate": 6.698028445179148e-05,
"loss": 1.3266,
"step": 795
},
{
"epoch": 0.6112497600307161,
"grad_norm": 1.3120388984680176,
"learning_rate": 6.675104053600763e-05,
"loss": 1.101,
"step": 796
},
{
"epoch": 0.6120176617392974,
"grad_norm": 1.544906735420227,
"learning_rate": 6.65219929022535e-05,
"loss": 1.2774,
"step": 797
},
{
"epoch": 0.6127855634478787,
"grad_norm": 1.5639188289642334,
"learning_rate": 6.629314290268987e-05,
"loss": 1.301,
"step": 798
},
{
"epoch": 0.6135534651564599,
"grad_norm": 1.116890549659729,
"learning_rate": 6.606449188831057e-05,
"loss": 1.0978,
"step": 799
},
{
"epoch": 0.6143213668650412,
"grad_norm": 1.459924578666687,
"learning_rate": 6.583604120893488e-05,
"loss": 1.1193,
"step": 800
},
{
"epoch": 0.6150892685736226,
"grad_norm": 1.2557390928268433,
"learning_rate": 6.560779221319938e-05,
"loss": 1.0979,
"step": 801
},
{
"epoch": 0.6158571702822039,
"grad_norm": 1.281129240989685,
"learning_rate": 6.537974624855003e-05,
"loss": 1.3004,
"step": 802
},
{
"epoch": 0.6166250719907852,
"grad_norm": 1.4236611127853394,
"learning_rate": 6.51519046612342e-05,
"loss": 1.1254,
"step": 803
},
{
"epoch": 0.6173929736993665,
"grad_norm": 1.4232025146484375,
"learning_rate": 6.492426879629282e-05,
"loss": 1.0886,
"step": 804
},
{
"epoch": 0.6181608754079477,
"grad_norm": 1.569633960723877,
"learning_rate": 6.46968399975522e-05,
"loss": 1.1388,
"step": 805
},
{
"epoch": 0.618928777116529,
"grad_norm": 1.7293319702148438,
"learning_rate": 6.446961960761643e-05,
"loss": 1.4027,
"step": 806
},
{
"epoch": 0.6196966788251104,
"grad_norm": 1.5421292781829834,
"learning_rate": 6.424260896785914e-05,
"loss": 1.2518,
"step": 807
},
{
"epoch": 0.6204645805336917,
"grad_norm": 1.5703529119491577,
"learning_rate": 6.40158094184158e-05,
"loss": 1.2429,
"step": 808
},
{
"epoch": 0.621232482242273,
"grad_norm": 1.575032353401184,
"learning_rate": 6.378922229817575e-05,
"loss": 1.3882,
"step": 809
},
{
"epoch": 0.6220003839508543,
"grad_norm": 1.1647250652313232,
"learning_rate": 6.356284894477412e-05,
"loss": 1.031,
"step": 810
},
{
"epoch": 0.6227682856594355,
"grad_norm": 1.8725273609161377,
"learning_rate": 6.333669069458432e-05,
"loss": 1.4888,
"step": 811
},
{
"epoch": 0.6235361873680169,
"grad_norm": 1.4924973249435425,
"learning_rate": 6.311074888270971e-05,
"loss": 1.3342,
"step": 812
},
{
"epoch": 0.6243040890765982,
"grad_norm": 1.2791497707366943,
"learning_rate": 6.288502484297607e-05,
"loss": 1.1041,
"step": 813
},
{
"epoch": 0.6250719907851795,
"grad_norm": 1.1811076402664185,
"learning_rate": 6.265951990792347e-05,
"loss": 0.838,
"step": 814
},
{
"epoch": 0.6258398924937608,
"grad_norm": 1.6005576848983765,
"learning_rate": 6.243423540879865e-05,
"loss": 1.3403,
"step": 815
},
{
"epoch": 0.6266077942023421,
"grad_norm": 1.2689980268478394,
"learning_rate": 6.220917267554686e-05,
"loss": 1.0595,
"step": 816
},
{
"epoch": 0.6273756959109233,
"grad_norm": 1.4566655158996582,
"learning_rate": 6.198433303680439e-05,
"loss": 1.2665,
"step": 817
},
{
"epoch": 0.6281435976195047,
"grad_norm": 1.9303637742996216,
"learning_rate": 6.175971781989025e-05,
"loss": 0.9127,
"step": 818
},
{
"epoch": 0.628911499328086,
"grad_norm": 1.7557438611984253,
"learning_rate": 6.153532835079886e-05,
"loss": 1.4541,
"step": 819
},
{
"epoch": 0.6296794010366673,
"grad_norm": 1.5808215141296387,
"learning_rate": 6.131116595419178e-05,
"loss": 1.4305,
"step": 820
},
{
"epoch": 0.6304473027452486,
"grad_norm": 1.6107852458953857,
"learning_rate": 6.108723195339011e-05,
"loss": 1.1872,
"step": 821
},
{
"epoch": 0.63121520445383,
"grad_norm": 1.4639952182769775,
"learning_rate": 6.086352767036673e-05,
"loss": 0.8872,
"step": 822
},
{
"epoch": 0.6319831061624112,
"grad_norm": 1.5953764915466309,
"learning_rate": 6.064005442573824e-05,
"loss": 1.3455,
"step": 823
},
{
"epoch": 0.6327510078709925,
"grad_norm": 1.7272642850875854,
"learning_rate": 6.041681353875746e-05,
"loss": 1.273,
"step": 824
},
{
"epoch": 0.6335189095795738,
"grad_norm": 1.5354080200195312,
"learning_rate": 6.019380632730546e-05,
"loss": 1.1075,
"step": 825
},
{
"epoch": 0.6342868112881551,
"grad_norm": 1.4213950634002686,
"learning_rate": 5.997103410788385e-05,
"loss": 1.2971,
"step": 826
},
{
"epoch": 0.6350547129967364,
"grad_norm": 1.8925020694732666,
"learning_rate": 5.9748498195606925e-05,
"loss": 1.3272,
"step": 827
},
{
"epoch": 0.6358226147053178,
"grad_norm": 1.3079675436019897,
"learning_rate": 5.952619990419408e-05,
"loss": 1.1373,
"step": 828
},
{
"epoch": 0.636590516413899,
"grad_norm": 1.5753669738769531,
"learning_rate": 5.9304140545961784e-05,
"loss": 1.0857,
"step": 829
},
{
"epoch": 0.6373584181224803,
"grad_norm": 1.3841052055358887,
"learning_rate": 5.9082321431816156e-05,
"loss": 1.2683,
"step": 830
},
{
"epoch": 0.6381263198310616,
"grad_norm": 1.3311750888824463,
"learning_rate": 5.88607438712449e-05,
"loss": 0.9628,
"step": 831
},
{
"epoch": 0.6388942215396429,
"grad_norm": 1.4135347604751587,
"learning_rate": 5.863940917230986e-05,
"loss": 1.0973,
"step": 832
},
{
"epoch": 0.6396621232482242,
"grad_norm": 1.665837049484253,
"learning_rate": 5.841831864163909e-05,
"loss": 1.5469,
"step": 833
},
{
"epoch": 0.6404300249568056,
"grad_norm": 1.3354551792144775,
"learning_rate": 5.8197473584419184e-05,
"loss": 1.1408,
"step": 834
},
{
"epoch": 0.6411979266653869,
"grad_norm": 1.3838506937026978,
"learning_rate": 5.7976875304387756e-05,
"loss": 1.0322,
"step": 835
},
{
"epoch": 0.6419658283739681,
"grad_norm": 1.6376174688339233,
"learning_rate": 5.7756525103825474e-05,
"loss": 1.4984,
"step": 836
},
{
"epoch": 0.6427337300825494,
"grad_norm": 1.2751094102859497,
"learning_rate": 5.753642428354852e-05,
"loss": 1.2019,
"step": 837
},
{
"epoch": 0.6435016317911307,
"grad_norm": 1.3204684257507324,
"learning_rate": 5.731657414290085e-05,
"loss": 1.1769,
"step": 838
},
{
"epoch": 0.644269533499712,
"grad_norm": 1.444831132888794,
"learning_rate": 5.7096975979746704e-05,
"loss": 1.2601,
"step": 839
},
{
"epoch": 0.6450374352082934,
"grad_norm": 1.5533994436264038,
"learning_rate": 5.687763109046255e-05,
"loss": 1.2037,
"step": 840
},
{
"epoch": 0.6458053369168747,
"grad_norm": 1.3148772716522217,
"learning_rate": 5.665854076992991e-05,
"loss": 1.0888,
"step": 841
},
{
"epoch": 0.6465732386254559,
"grad_norm": 1.465082049369812,
"learning_rate": 5.643970631152735e-05,
"loss": 1.2246,
"step": 842
},
{
"epoch": 0.6473411403340372,
"grad_norm": 1.5530979633331299,
"learning_rate": 5.622112900712304e-05,
"loss": 1.4689,
"step": 843
},
{
"epoch": 0.6481090420426185,
"grad_norm": 1.1694343090057373,
"learning_rate": 5.600281014706703e-05,
"loss": 0.9216,
"step": 844
},
{
"epoch": 0.6488769437511999,
"grad_norm": 1.592970371246338,
"learning_rate": 5.57847510201837e-05,
"loss": 1.1406,
"step": 845
},
{
"epoch": 0.6496448454597812,
"grad_norm": 1.7581945657730103,
"learning_rate": 5.556695291376406e-05,
"loss": 1.4523,
"step": 846
},
{
"epoch": 0.6504127471683625,
"grad_norm": 1.7311755418777466,
"learning_rate": 5.5349417113558254e-05,
"loss": 1.3606,
"step": 847
},
{
"epoch": 0.6511806488769437,
"grad_norm": 1.291117548942566,
"learning_rate": 5.5132144903768e-05,
"loss": 1.0444,
"step": 848
},
{
"epoch": 0.651948550585525,
"grad_norm": 1.4802870750427246,
"learning_rate": 5.491513756703881e-05,
"loss": 1.1835,
"step": 849
},
{
"epoch": 0.6527164522941064,
"grad_norm": 1.0915478467941284,
"learning_rate": 5.46983963844526e-05,
"loss": 0.9936,
"step": 850
},
{
"epoch": 0.6534843540026877,
"grad_norm": 1.842965006828308,
"learning_rate": 5.448192263552006e-05,
"loss": 1.2762,
"step": 851
},
{
"epoch": 0.654252255711269,
"grad_norm": 1.5491669178009033,
"learning_rate": 5.426571759817314e-05,
"loss": 1.2144,
"step": 852
},
{
"epoch": 0.6550201574198503,
"grad_norm": 1.6982448101043701,
"learning_rate": 5.4049782548757386e-05,
"loss": 1.0843,
"step": 853
},
{
"epoch": 0.6557880591284315,
"grad_norm": 1.5886989831924438,
"learning_rate": 5.383411876202464e-05,
"loss": 1.393,
"step": 854
},
{
"epoch": 0.6565559608370128,
"grad_norm": 1.3143181800842285,
"learning_rate": 5.36187275111253e-05,
"loss": 1.1188,
"step": 855
},
{
"epoch": 0.6573238625455942,
"grad_norm": 1.499030590057373,
"learning_rate": 5.340361006760082e-05,
"loss": 1.1549,
"step": 856
},
{
"epoch": 0.6580917642541755,
"grad_norm": 1.2520569562911987,
"learning_rate": 5.318876770137634e-05,
"loss": 1.1459,
"step": 857
},
{
"epoch": 0.6588596659627568,
"grad_norm": 1.5099189281463623,
"learning_rate": 5.297420168075307e-05,
"loss": 1.2704,
"step": 858
},
{
"epoch": 0.6596275676713381,
"grad_norm": 1.6787750720977783,
"learning_rate": 5.275991327240082e-05,
"loss": 1.2595,
"step": 859
},
{
"epoch": 0.6603954693799193,
"grad_norm": 1.695313811302185,
"learning_rate": 5.254590374135058e-05,
"loss": 1.3029,
"step": 860
},
{
"epoch": 0.6611633710885007,
"grad_norm": 1.2656611204147339,
"learning_rate": 5.233217435098707e-05,
"loss": 1.2773,
"step": 861
},
{
"epoch": 0.661931272797082,
"grad_norm": 1.3123670816421509,
"learning_rate": 5.2118726363041036e-05,
"loss": 1.653,
"step": 862
},
{
"epoch": 0.6626991745056633,
"grad_norm": 1.6967337131500244,
"learning_rate": 5.190556103758223e-05,
"loss": 1.356,
"step": 863
},
{
"epoch": 0.6634670762142446,
"grad_norm": 1.0755648612976074,
"learning_rate": 5.1692679633011564e-05,
"loss": 0.8091,
"step": 864
},
{
"epoch": 0.6642349779228259,
"grad_norm": 1.0523223876953125,
"learning_rate": 5.148008340605392e-05,
"loss": 0.854,
"step": 865
},
{
"epoch": 0.6650028796314071,
"grad_norm": 1.3313195705413818,
"learning_rate": 5.1267773611750624e-05,
"loss": 1.0723,
"step": 866
},
{
"epoch": 0.6657707813399885,
"grad_norm": 1.5703046321868896,
"learning_rate": 5.1055751503452144e-05,
"loss": 1.3341,
"step": 867
},
{
"epoch": 0.6665386830485698,
"grad_norm": 1.6033220291137695,
"learning_rate": 5.0844018332810594e-05,
"loss": 1.4145,
"step": 868
},
{
"epoch": 0.6673065847571511,
"grad_norm": 1.1940321922302246,
"learning_rate": 5.0632575349772225e-05,
"loss": 1.0971,
"step": 869
},
{
"epoch": 0.6680744864657324,
"grad_norm": 1.534477710723877,
"learning_rate": 5.0421423802570454e-05,
"loss": 1.2956,
"step": 870
},
{
"epoch": 0.6688423881743137,
"grad_norm": 1.457879900932312,
"learning_rate": 5.0210564937718055e-05,
"loss": 1.3399,
"step": 871
},
{
"epoch": 0.669610289882895,
"grad_norm": 1.6395093202590942,
"learning_rate": 5.000000000000002e-05,
"loss": 1.2267,
"step": 872
},
{
"epoch": 0.6703781915914763,
"grad_norm": 1.403594970703125,
"learning_rate": 4.978973023246616e-05,
"loss": 1.2561,
"step": 873
},
{
"epoch": 0.6711460933000576,
"grad_norm": 1.3514751195907593,
"learning_rate": 4.957975687642389e-05,
"loss": 0.8501,
"step": 874
},
{
"epoch": 0.6719139950086389,
"grad_norm": 1.7166774272918701,
"learning_rate": 4.937008117143055e-05,
"loss": 1.2769,
"step": 875
},
{
"epoch": 0.6726818967172202,
"grad_norm": 1.637560486793518,
"learning_rate": 4.9160704355286577e-05,
"loss": 1.2707,
"step": 876
},
{
"epoch": 0.6734497984258015,
"grad_norm": 1.7437028884887695,
"learning_rate": 4.895162766402781e-05,
"loss": 1.356,
"step": 877
},
{
"epoch": 0.6742177001343828,
"grad_norm": 1.4267345666885376,
"learning_rate": 4.8742852331918364e-05,
"loss": 1.2037,
"step": 878
},
{
"epoch": 0.6749856018429641,
"grad_norm": 2.0313918590545654,
"learning_rate": 4.8534379591443246e-05,
"loss": 1.5035,
"step": 879
},
{
"epoch": 0.6757535035515454,
"grad_norm": 1.4802058935165405,
"learning_rate": 4.8326210673301284e-05,
"loss": 1.1481,
"step": 880
},
{
"epoch": 0.6765214052601267,
"grad_norm": 1.6689741611480713,
"learning_rate": 4.811834680639765e-05,
"loss": 1.2703,
"step": 881
},
{
"epoch": 0.677289306968708,
"grad_norm": 1.475077748298645,
"learning_rate": 4.791078921783653e-05,
"loss": 1.0892,
"step": 882
},
{
"epoch": 0.6780572086772894,
"grad_norm": 1.7580574750900269,
"learning_rate": 4.770353913291428e-05,
"loss": 1.2977,
"step": 883
},
{
"epoch": 0.6788251103858706,
"grad_norm": 1.482071876525879,
"learning_rate": 4.749659777511177e-05,
"loss": 1.2857,
"step": 884
},
{
"epoch": 0.6795930120944519,
"grad_norm": 1.2742493152618408,
"learning_rate": 4.728996636608738e-05,
"loss": 1.0448,
"step": 885
},
{
"epoch": 0.6803609138030332,
"grad_norm": 1.8095487356185913,
"learning_rate": 4.708364612566969e-05,
"loss": 1.4402,
"step": 886
},
{
"epoch": 0.6811288155116145,
"grad_norm": 1.4991480112075806,
"learning_rate": 4.6877638271850485e-05,
"loss": 1.0895,
"step": 887
},
{
"epoch": 0.6818967172201958,
"grad_norm": 1.588757038116455,
"learning_rate": 4.667194402077714e-05,
"loss": 1.472,
"step": 888
},
{
"epoch": 0.6826646189287772,
"grad_norm": 1.5407018661499023,
"learning_rate": 4.646656458674595e-05,
"loss": 1.0996,
"step": 889
},
{
"epoch": 0.6834325206373584,
"grad_norm": 1.4947824478149414,
"learning_rate": 4.62615011821946e-05,
"loss": 1.1829,
"step": 890
},
{
"epoch": 0.6842004223459397,
"grad_norm": 1.6118358373641968,
"learning_rate": 4.6056755017695155e-05,
"loss": 1.26,
"step": 891
},
{
"epoch": 0.684968324054521,
"grad_norm": 1.4456740617752075,
"learning_rate": 4.585232730194682e-05,
"loss": 1.1356,
"step": 892
},
{
"epoch": 0.6857362257631023,
"grad_norm": 1.421918272972107,
"learning_rate": 4.5648219241769054e-05,
"loss": 1.1096,
"step": 893
},
{
"epoch": 0.6865041274716837,
"grad_norm": 1.182092547416687,
"learning_rate": 4.5444432042093996e-05,
"loss": 0.9751,
"step": 894
},
{
"epoch": 0.687272029180265,
"grad_norm": 1.425451636314392,
"learning_rate": 4.524096690595978e-05,
"loss": 1.3076,
"step": 895
},
{
"epoch": 0.6880399308888462,
"grad_norm": 1.116645097732544,
"learning_rate": 4.5037825034503304e-05,
"loss": 0.7194,
"step": 896
},
{
"epoch": 0.6888078325974275,
"grad_norm": 1.3570024967193604,
"learning_rate": 4.4835007626953e-05,
"loss": 0.9747,
"step": 897
},
{
"epoch": 0.6895757343060088,
"grad_norm": 1.3995678424835205,
"learning_rate": 4.4632515880621894e-05,
"loss": 1.0773,
"step": 898
},
{
"epoch": 0.6903436360145901,
"grad_norm": 1.5802603960037231,
"learning_rate": 4.443035099090048e-05,
"loss": 1.3399,
"step": 899
},
{
"epoch": 0.6911115377231715,
"grad_norm": 1.563367247581482,
"learning_rate": 4.4228514151249825e-05,
"loss": 1.0937,
"step": 900
},
{
"epoch": 0.6918794394317528,
"grad_norm": 1.6263221502304077,
"learning_rate": 4.4027006553194115e-05,
"loss": 1.258,
"step": 901
},
{
"epoch": 0.692647341140334,
"grad_norm": 1.682242751121521,
"learning_rate": 4.3825829386314166e-05,
"loss": 1.3231,
"step": 902
},
{
"epoch": 0.6934152428489153,
"grad_norm": 1.3335479497909546,
"learning_rate": 4.362498383823996e-05,
"loss": 1.0193,
"step": 903
},
{
"epoch": 0.6941831445574966,
"grad_norm": 1.619031548500061,
"learning_rate": 4.342447109464385e-05,
"loss": 1.2696,
"step": 904
},
{
"epoch": 0.694951046266078,
"grad_norm": 1.4861383438110352,
"learning_rate": 4.322429233923351e-05,
"loss": 1.3105,
"step": 905
},
{
"epoch": 0.6957189479746593,
"grad_norm": 1.4006625413894653,
"learning_rate": 4.3024448753744925e-05,
"loss": 1.3573,
"step": 906
},
{
"epoch": 0.6964868496832406,
"grad_norm": 1.2410006523132324,
"learning_rate": 4.282494151793548e-05,
"loss": 0.9566,
"step": 907
},
{
"epoch": 0.6972547513918218,
"grad_norm": 1.0770704746246338,
"learning_rate": 4.2625771809576874e-05,
"loss": 0.7789,
"step": 908
},
{
"epoch": 0.6980226531004031,
"grad_norm": 1.2837942838668823,
"learning_rate": 4.242694080444837e-05,
"loss": 1.1047,
"step": 909
},
{
"epoch": 0.6987905548089844,
"grad_norm": 1.1040459871292114,
"learning_rate": 4.2228449676329616e-05,
"loss": 0.9763,
"step": 910
},
{
"epoch": 0.6995584565175658,
"grad_norm": 1.6829538345336914,
"learning_rate": 4.2030299596993883e-05,
"loss": 1.4803,
"step": 911
},
{
"epoch": 0.7003263582261471,
"grad_norm": 1.4828580617904663,
"learning_rate": 4.1832491736201077e-05,
"loss": 1.2561,
"step": 912
},
{
"epoch": 0.7010942599347284,
"grad_norm": 1.568756341934204,
"learning_rate": 4.1635027261690827e-05,
"loss": 1.4303,
"step": 913
},
{
"epoch": 0.7018621616433096,
"grad_norm": 1.3178819417953491,
"learning_rate": 4.143790733917564e-05,
"loss": 0.8952,
"step": 914
},
{
"epoch": 0.7026300633518909,
"grad_norm": 1.4817698001861572,
"learning_rate": 4.124113313233404e-05,
"loss": 1.3478,
"step": 915
},
{
"epoch": 0.7033979650604723,
"grad_norm": 1.4555271863937378,
"learning_rate": 4.1044705802803574e-05,
"loss": 1.3088,
"step": 916
},
{
"epoch": 0.7041658667690536,
"grad_norm": 1.4719526767730713,
"learning_rate": 4.084862651017406e-05,
"loss": 1.2125,
"step": 917
},
{
"epoch": 0.7049337684776349,
"grad_norm": 1.2816683053970337,
"learning_rate": 4.065289641198073e-05,
"loss": 1.208,
"step": 918
},
{
"epoch": 0.7057016701862162,
"grad_norm": 1.3525464534759521,
"learning_rate": 4.045751666369736e-05,
"loss": 1.1444,
"step": 919
},
{
"epoch": 0.7064695718947974,
"grad_norm": 1.4733887910842896,
"learning_rate": 4.026248841872946e-05,
"loss": 1.2619,
"step": 920
},
{
"epoch": 0.7072374736033787,
"grad_norm": 1.3341400623321533,
"learning_rate": 4.006781282840748e-05,
"loss": 1.153,
"step": 921
},
{
"epoch": 0.7080053753119601,
"grad_norm": 1.5136420726776123,
"learning_rate": 3.987349104198007e-05,
"loss": 1.1891,
"step": 922
},
{
"epoch": 0.7087732770205414,
"grad_norm": 1.2748438119888306,
"learning_rate": 3.9679524206607156e-05,
"loss": 1.0631,
"step": 923
},
{
"epoch": 0.7095411787291227,
"grad_norm": 1.4721794128417969,
"learning_rate": 3.948591346735325e-05,
"loss": 1.2841,
"step": 924
},
{
"epoch": 0.710309080437704,
"grad_norm": 1.9709136486053467,
"learning_rate": 3.929265996718072e-05,
"loss": 1.478,
"step": 925
},
{
"epoch": 0.7110769821462852,
"grad_norm": 2.1271793842315674,
"learning_rate": 3.9099764846943e-05,
"loss": 1.1028,
"step": 926
},
{
"epoch": 0.7118448838548666,
"grad_norm": 1.5257985591888428,
"learning_rate": 3.89072292453778e-05,
"loss": 0.9435,
"step": 927
},
{
"epoch": 0.7126127855634479,
"grad_norm": 1.2246668338775635,
"learning_rate": 3.871505429910057e-05,
"loss": 0.9961,
"step": 928
},
{
"epoch": 0.7133806872720292,
"grad_norm": 1.3091059923171997,
"learning_rate": 3.85232411425976e-05,
"loss": 1.1994,
"step": 929
},
{
"epoch": 0.7141485889806105,
"grad_norm": 1.6306805610656738,
"learning_rate": 3.833179090821929e-05,
"loss": 1.4055,
"step": 930
},
{
"epoch": 0.7149164906891918,
"grad_norm": 1.6316722631454468,
"learning_rate": 3.814070472617375e-05,
"loss": 1.4363,
"step": 931
},
{
"epoch": 0.715684392397773,
"grad_norm": 1.474591612815857,
"learning_rate": 3.794998372451981e-05,
"loss": 1.1251,
"step": 932
},
{
"epoch": 0.7164522941063544,
"grad_norm": 1.4235069751739502,
"learning_rate": 3.775962902916056e-05,
"loss": 1.0631,
"step": 933
},
{
"epoch": 0.7172201958149357,
"grad_norm": 1.580439567565918,
"learning_rate": 3.756964176383655e-05,
"loss": 1.3518,
"step": 934
},
{
"epoch": 0.717988097523517,
"grad_norm": 1.3070342540740967,
"learning_rate": 3.7380023050119415e-05,
"loss": 1.2074,
"step": 935
},
{
"epoch": 0.7187559992320983,
"grad_norm": 1.4719895124435425,
"learning_rate": 3.7190774007404835e-05,
"loss": 1.0298,
"step": 936
},
{
"epoch": 0.7195239009406796,
"grad_norm": 1.440446376800537,
"learning_rate": 3.700189575290641e-05,
"loss": 1.1669,
"step": 937
},
{
"epoch": 0.7202918026492608,
"grad_norm": 1.3623183965682983,
"learning_rate": 3.681338940164868e-05,
"loss": 1.1463,
"step": 938
},
{
"epoch": 0.7210597043578422,
"grad_norm": 1.303728461265564,
"learning_rate": 3.6625256066460735e-05,
"loss": 1.0379,
"step": 939
},
{
"epoch": 0.7218276060664235,
"grad_norm": 1.3376387357711792,
"learning_rate": 3.6437496857969566e-05,
"loss": 1.2042,
"step": 940
},
{
"epoch": 0.7225955077750048,
"grad_norm": 1.6355212926864624,
"learning_rate": 3.625011288459365e-05,
"loss": 1.3353,
"step": 941
},
{
"epoch": 0.7233634094835861,
"grad_norm": 1.3309329748153687,
"learning_rate": 3.606310525253621e-05,
"loss": 0.765,
"step": 942
},
{
"epoch": 0.7241313111921674,
"grad_norm": 1.7754614353179932,
"learning_rate": 3.5876475065778715e-05,
"loss": 1.1589,
"step": 943
},
{
"epoch": 0.7248992129007487,
"grad_norm": 1.2315130233764648,
"learning_rate": 3.5690223426074576e-05,
"loss": 0.9012,
"step": 944
},
{
"epoch": 0.72566711460933,
"grad_norm": 1.3348158597946167,
"learning_rate": 3.550435143294238e-05,
"loss": 1.0467,
"step": 945
},
{
"epoch": 0.7264350163179113,
"grad_norm": 1.4577149152755737,
"learning_rate": 3.531886018365954e-05,
"loss": 1.0583,
"step": 946
},
{
"epoch": 0.7272029180264926,
"grad_norm": 1.927155613899231,
"learning_rate": 3.513375077325575e-05,
"loss": 1.2384,
"step": 947
},
{
"epoch": 0.7279708197350739,
"grad_norm": 2.0631394386291504,
"learning_rate": 3.4949024294506674e-05,
"loss": 1.3989,
"step": 948
},
{
"epoch": 0.7287387214436553,
"grad_norm": 1.7409682273864746,
"learning_rate": 3.476468183792716e-05,
"loss": 1.3022,
"step": 949
},
{
"epoch": 0.7295066231522365,
"grad_norm": 1.0105476379394531,
"learning_rate": 3.458072449176525e-05,
"loss": 0.902,
"step": 950
},
{
"epoch": 0.7302745248608178,
"grad_norm": 1.6116336584091187,
"learning_rate": 3.439715334199538e-05,
"loss": 1.2317,
"step": 951
},
{
"epoch": 0.7310424265693991,
"grad_norm": 1.6682941913604736,
"learning_rate": 3.4213969472312154e-05,
"loss": 1.4236,
"step": 952
},
{
"epoch": 0.7318103282779804,
"grad_norm": 1.2750146389007568,
"learning_rate": 3.403117396412391e-05,
"loss": 0.8671,
"step": 953
},
{
"epoch": 0.7325782299865617,
"grad_norm": 1.2994352579116821,
"learning_rate": 3.384876789654631e-05,
"loss": 1.1212,
"step": 954
},
{
"epoch": 0.7333461316951431,
"grad_norm": 2.186424732208252,
"learning_rate": 3.366675234639601e-05,
"loss": 1.4304,
"step": 955
},
{
"epoch": 0.7341140334037243,
"grad_norm": 1.4062567949295044,
"learning_rate": 3.348512838818425e-05,
"loss": 1.1381,
"step": 956
},
{
"epoch": 0.7348819351123056,
"grad_norm": 1.4242571592330933,
"learning_rate": 3.3303897094110636e-05,
"loss": 1.0323,
"step": 957
},
{
"epoch": 0.7356498368208869,
"grad_norm": 1.2858667373657227,
"learning_rate": 3.3123059534056634e-05,
"loss": 1.0643,
"step": 958
},
{
"epoch": 0.7364177385294682,
"grad_norm": 1.6064081192016602,
"learning_rate": 3.294261677557935e-05,
"loss": 1.2303,
"step": 959
},
{
"epoch": 0.7371856402380496,
"grad_norm": 1.5934802293777466,
"learning_rate": 3.2762569883905205e-05,
"loss": 1.2093,
"step": 960
},
{
"epoch": 0.7379535419466309,
"grad_norm": 1.4172946214675903,
"learning_rate": 3.258291992192377e-05,
"loss": 1.2843,
"step": 961
},
{
"epoch": 0.7387214436552121,
"grad_norm": 1.6268061399459839,
"learning_rate": 3.240366795018117e-05,
"loss": 1.4062,
"step": 962
},
{
"epoch": 0.7394893453637934,
"grad_norm": 1.4560757875442505,
"learning_rate": 3.222481502687425e-05,
"loss": 1.3195,
"step": 963
},
{
"epoch": 0.7402572470723747,
"grad_norm": 1.716341257095337,
"learning_rate": 3.2046362207844e-05,
"loss": 1.3937,
"step": 964
},
{
"epoch": 0.741025148780956,
"grad_norm": 1.4211591482162476,
"learning_rate": 3.1868310546569424e-05,
"loss": 1.3429,
"step": 965
},
{
"epoch": 0.7417930504895374,
"grad_norm": 1.4726101160049438,
"learning_rate": 3.1690661094161364e-05,
"loss": 1.2199,
"step": 966
},
{
"epoch": 0.7425609521981187,
"grad_norm": 1.6528246402740479,
"learning_rate": 3.151341489935627e-05,
"loss": 1.4508,
"step": 967
},
{
"epoch": 0.7433288539066999,
"grad_norm": 1.4311929941177368,
"learning_rate": 3.133657300850995e-05,
"loss": 1.3215,
"step": 968
},
{
"epoch": 0.7440967556152812,
"grad_norm": 1.6301379203796387,
"learning_rate": 3.116013646559146e-05,
"loss": 1.1288,
"step": 969
},
{
"epoch": 0.7448646573238625,
"grad_norm": 1.50910222530365,
"learning_rate": 3.0984106312177e-05,
"loss": 1.139,
"step": 970
},
{
"epoch": 0.7456325590324439,
"grad_norm": 1.4490171670913696,
"learning_rate": 3.0808483587443595e-05,
"loss": 1.1863,
"step": 971
},
{
"epoch": 0.7464004607410252,
"grad_norm": 1.835550308227539,
"learning_rate": 3.063326932816307e-05,
"loss": 1.2192,
"step": 972
},
{
"epoch": 0.7471683624496065,
"grad_norm": 1.5930982828140259,
"learning_rate": 3.045846456869592e-05,
"loss": 1.148,
"step": 973
},
{
"epoch": 0.7479362641581877,
"grad_norm": 1.7530690431594849,
"learning_rate": 3.0284070340985295e-05,
"loss": 1.2803,
"step": 974
},
{
"epoch": 0.748704165866769,
"grad_norm": 1.397155523300171,
"learning_rate": 3.011008767455059e-05,
"loss": 1.2601,
"step": 975
},
{
"epoch": 0.7494720675753503,
"grad_norm": 1.3860681056976318,
"learning_rate": 2.9936517596481818e-05,
"loss": 1.314,
"step": 976
},
{
"epoch": 0.7502399692839317,
"grad_norm": 1.445178747177124,
"learning_rate": 2.9763361131433208e-05,
"loss": 0.9815,
"step": 977
},
{
"epoch": 0.751007870992513,
"grad_norm": 1.291642427444458,
"learning_rate": 2.9590619301617183e-05,
"loss": 1.0057,
"step": 978
},
{
"epoch": 0.751007870992513,
"eval_loss": 1.251602292060852,
"eval_runtime": 19.752,
"eval_samples_per_second": 27.795,
"eval_steps_per_second": 13.923,
"step": 978
},
{
"epoch": 0.7517757727010943,
"grad_norm": 1.2883607149124146,
"learning_rate": 2.9418293126798603e-05,
"loss": 1.1005,
"step": 979
},
{
"epoch": 0.7525436744096755,
"grad_norm": 1.434813141822815,
"learning_rate": 2.9246383624288387e-05,
"loss": 1.201,
"step": 980
},
{
"epoch": 0.7533115761182568,
"grad_norm": 2.126904249191284,
"learning_rate": 2.9074891808937753e-05,
"loss": 1.4774,
"step": 981
},
{
"epoch": 0.7540794778268382,
"grad_norm": 1.5914756059646606,
"learning_rate": 2.8903818693132077e-05,
"loss": 1.3087,
"step": 982
},
{
"epoch": 0.7548473795354195,
"grad_norm": 1.577763557434082,
"learning_rate": 2.873316528678507e-05,
"loss": 1.3263,
"step": 983
},
{
"epoch": 0.7556152812440008,
"grad_norm": 1.5671124458312988,
"learning_rate": 2.856293259733266e-05,
"loss": 1.2893,
"step": 984
},
{
"epoch": 0.7563831829525821,
"grad_norm": 1.6029667854309082,
"learning_rate": 2.8393121629727138e-05,
"loss": 1.1126,
"step": 985
},
{
"epoch": 0.7571510846611633,
"grad_norm": 1.4934439659118652,
"learning_rate": 2.8223733386431185e-05,
"loss": 1.2581,
"step": 986
},
{
"epoch": 0.7579189863697446,
"grad_norm": 1.327414870262146,
"learning_rate": 2.8054768867411974e-05,
"loss": 1.2253,
"step": 987
},
{
"epoch": 0.758686888078326,
"grad_norm": 1.5715781450271606,
"learning_rate": 2.788622907013526e-05,
"loss": 1.1577,
"step": 988
},
{
"epoch": 0.7594547897869073,
"grad_norm": 1.4220985174179077,
"learning_rate": 2.7718114989559552e-05,
"loss": 1.1358,
"step": 989
},
{
"epoch": 0.7602226914954886,
"grad_norm": 1.4818079471588135,
"learning_rate": 2.7550427618130127e-05,
"loss": 1.2783,
"step": 990
},
{
"epoch": 0.7609905932040699,
"grad_norm": 1.394391655921936,
"learning_rate": 2.738316794577315e-05,
"loss": 1.2429,
"step": 991
},
{
"epoch": 0.7617584949126511,
"grad_norm": 1.7591447830200195,
"learning_rate": 2.7216336959890076e-05,
"loss": 1.5355,
"step": 992
},
{
"epoch": 0.7625263966212324,
"grad_norm": 1.226349949836731,
"learning_rate": 2.704993564535152e-05,
"loss": 1.0599,
"step": 993
},
{
"epoch": 0.7632942983298138,
"grad_norm": 1.3571503162384033,
"learning_rate": 2.688396498449164e-05,
"loss": 0.9554,
"step": 994
},
{
"epoch": 0.7640622000383951,
"grad_norm": 1.546478509902954,
"learning_rate": 2.671842595710219e-05,
"loss": 1.2609,
"step": 995
},
{
"epoch": 0.7648301017469764,
"grad_norm": 1.304118275642395,
"learning_rate": 2.655331954042699e-05,
"loss": 1.1655,
"step": 996
},
{
"epoch": 0.7655980034555577,
"grad_norm": 1.4672322273254395,
"learning_rate": 2.638864670915572e-05,
"loss": 1.1936,
"step": 997
},
{
"epoch": 0.7663659051641389,
"grad_norm": 1.1511002779006958,
"learning_rate": 2.622440843541869e-05,
"loss": 1.1561,
"step": 998
},
{
"epoch": 0.7671338068727203,
"grad_norm": 1.617389440536499,
"learning_rate": 2.6060605688780694e-05,
"loss": 1.2096,
"step": 999
},
{
"epoch": 0.7679017085813016,
"grad_norm": 1.7287838459014893,
"learning_rate": 2.5897239436235466e-05,
"loss": 1.3722,
"step": 1000
},
{
"epoch": 0.7686696102898829,
"grad_norm": 1.4887206554412842,
"learning_rate": 2.5734310642199943e-05,
"loss": 1.2082,
"step": 1001
},
{
"epoch": 0.7694375119984642,
"grad_norm": 1.1259337663650513,
"learning_rate": 2.557182026850855e-05,
"loss": 0.937,
"step": 1002
},
{
"epoch": 0.7702054137070455,
"grad_norm": 1.7888356447219849,
"learning_rate": 2.5409769274407637e-05,
"loss": 1.2036,
"step": 1003
},
{
"epoch": 0.7709733154156267,
"grad_norm": 1.3678765296936035,
"learning_rate": 2.524815861654952e-05,
"loss": 1.183,
"step": 1004
},
{
"epoch": 0.7717412171242081,
"grad_norm": 1.4759341478347778,
"learning_rate": 2.5086989248987248e-05,
"loss": 1.4874,
"step": 1005
},
{
"epoch": 0.7725091188327894,
"grad_norm": 1.6217724084854126,
"learning_rate": 2.492626212316862e-05,
"loss": 1.3213,
"step": 1006
},
{
"epoch": 0.7732770205413707,
"grad_norm": 1.735228180885315,
"learning_rate": 2.476597818793075e-05,
"loss": 1.4892,
"step": 1007
},
{
"epoch": 0.774044922249952,
"grad_norm": 1.6505582332611084,
"learning_rate": 2.460613838949437e-05,
"loss": 1.3946,
"step": 1008
},
{
"epoch": 0.7748128239585333,
"grad_norm": 1.5245342254638672,
"learning_rate": 2.444674367145845e-05,
"loss": 1.3344,
"step": 1009
},
{
"epoch": 0.7755807256671146,
"grad_norm": 1.4345464706420898,
"learning_rate": 2.4287794974794208e-05,
"loss": 1.304,
"step": 1010
},
{
"epoch": 0.7763486273756959,
"grad_norm": 1.120750069618225,
"learning_rate": 2.4129293237840066e-05,
"loss": 1.031,
"step": 1011
},
{
"epoch": 0.7771165290842772,
"grad_norm": 1.4984389543533325,
"learning_rate": 2.397123939629574e-05,
"loss": 1.4389,
"step": 1012
},
{
"epoch": 0.7778844307928585,
"grad_norm": 1.4427404403686523,
"learning_rate": 2.3813634383216853e-05,
"loss": 1.1667,
"step": 1013
},
{
"epoch": 0.7786523325014398,
"grad_norm": 1.6551741361618042,
"learning_rate": 2.3656479129009422e-05,
"loss": 1.3006,
"step": 1014
},
{
"epoch": 0.7794202342100212,
"grad_norm": 1.3143059015274048,
"learning_rate": 2.3499774561424327e-05,
"loss": 1.134,
"step": 1015
},
{
"epoch": 0.7801881359186024,
"grad_norm": 1.7605934143066406,
"learning_rate": 2.3343521605551967e-05,
"loss": 1.1622,
"step": 1016
},
{
"epoch": 0.7809560376271837,
"grad_norm": 1.5233832597732544,
"learning_rate": 2.3187721183816503e-05,
"loss": 1.2939,
"step": 1017
},
{
"epoch": 0.781723939335765,
"grad_norm": 1.3118071556091309,
"learning_rate": 2.303237421597082e-05,
"loss": 0.9272,
"step": 1018
},
{
"epoch": 0.7824918410443463,
"grad_norm": 1.559906005859375,
"learning_rate": 2.2877481619090734e-05,
"loss": 1.4749,
"step": 1019
},
{
"epoch": 0.7832597427529276,
"grad_norm": 1.6087397336959839,
"learning_rate": 2.2723044307569775e-05,
"loss": 1.2059,
"step": 1020
},
{
"epoch": 0.784027644461509,
"grad_norm": 1.5750290155410767,
"learning_rate": 2.2569063193113716e-05,
"loss": 1.2115,
"step": 1021
},
{
"epoch": 0.7847955461700902,
"grad_norm": 1.4662597179412842,
"learning_rate": 2.2415539184735323e-05,
"loss": 0.9787,
"step": 1022
},
{
"epoch": 0.7855634478786715,
"grad_norm": 1.5807639360427856,
"learning_rate": 2.2262473188748667e-05,
"loss": 1.5,
"step": 1023
},
{
"epoch": 0.7863313495872528,
"grad_norm": 1.4712680578231812,
"learning_rate": 2.210986610876421e-05,
"loss": 1.1427,
"step": 1024
},
{
"epoch": 0.7870992512958341,
"grad_norm": 1.7284588813781738,
"learning_rate": 2.1957718845683106e-05,
"loss": 0.9901,
"step": 1025
},
{
"epoch": 0.7878671530044155,
"grad_norm": 1.7499277591705322,
"learning_rate": 2.180603229769208e-05,
"loss": 1.7659,
"step": 1026
},
{
"epoch": 0.7886350547129968,
"grad_norm": 2.04580020904541,
"learning_rate": 2.1654807360258068e-05,
"loss": 1.4249,
"step": 1027
},
{
"epoch": 0.789402956421578,
"grad_norm": 1.8796188831329346,
"learning_rate": 2.15040449261229e-05,
"loss": 1.4763,
"step": 1028
},
{
"epoch": 0.7901708581301593,
"grad_norm": 1.7241255044937134,
"learning_rate": 2.1353745885298103e-05,
"loss": 1.4569,
"step": 1029
},
{
"epoch": 0.7909387598387406,
"grad_norm": 1.2481935024261475,
"learning_rate": 2.120391112505955e-05,
"loss": 1.1742,
"step": 1030
},
{
"epoch": 0.7917066615473219,
"grad_norm": 1.440290093421936,
"learning_rate": 2.1054541529942374e-05,
"loss": 1.243,
"step": 1031
},
{
"epoch": 0.7924745632559033,
"grad_norm": 1.8699524402618408,
"learning_rate": 2.090563798173557e-05,
"loss": 1.3004,
"step": 1032
},
{
"epoch": 0.7932424649644846,
"grad_norm": 1.7997124195098877,
"learning_rate": 2.0757201359476884e-05,
"loss": 1.5915,
"step": 1033
},
{
"epoch": 0.7940103666730658,
"grad_norm": 1.4160553216934204,
"learning_rate": 2.060923253944761e-05,
"loss": 1.3386,
"step": 1034
},
{
"epoch": 0.7947782683816471,
"grad_norm": 1.4589799642562866,
"learning_rate": 2.0461732395167475e-05,
"loss": 1.3019,
"step": 1035
},
{
"epoch": 0.7955461700902284,
"grad_norm": 1.8283833265304565,
"learning_rate": 2.03147017973893e-05,
"loss": 1.3423,
"step": 1036
},
{
"epoch": 0.7963140717988098,
"grad_norm": 1.4732909202575684,
"learning_rate": 2.0168141614094126e-05,
"loss": 1.2183,
"step": 1037
},
{
"epoch": 0.7970819735073911,
"grad_norm": 1.5215457677841187,
"learning_rate": 2.0022052710485874e-05,
"loss": 1.2761,
"step": 1038
},
{
"epoch": 0.7978498752159724,
"grad_norm": 1.812894344329834,
"learning_rate": 1.9876435948986228e-05,
"loss": 1.4416,
"step": 1039
},
{
"epoch": 0.7986177769245536,
"grad_norm": 1.212895154953003,
"learning_rate": 1.973129218922981e-05,
"loss": 1.0733,
"step": 1040
},
{
"epoch": 0.7993856786331349,
"grad_norm": 1.5380189418792725,
"learning_rate": 1.95866222880588e-05,
"loss": 1.4282,
"step": 1041
},
{
"epoch": 0.8001535803417162,
"grad_norm": 1.671745777130127,
"learning_rate": 1.9442427099518022e-05,
"loss": 1.4183,
"step": 1042
},
{
"epoch": 0.8009214820502976,
"grad_norm": 1.6439645290374756,
"learning_rate": 1.9298707474849843e-05,
"loss": 1.2435,
"step": 1043
},
{
"epoch": 0.8016893837588789,
"grad_norm": 1.3178609609603882,
"learning_rate": 1.9155464262489298e-05,
"loss": 0.843,
"step": 1044
},
{
"epoch": 0.8024572854674602,
"grad_norm": 1.7350953817367554,
"learning_rate": 1.9012698308058852e-05,
"loss": 1.4906,
"step": 1045
},
{
"epoch": 0.8032251871760415,
"grad_norm": 1.2103064060211182,
"learning_rate": 1.8870410454363573e-05,
"loss": 0.9144,
"step": 1046
},
{
"epoch": 0.8039930888846227,
"grad_norm": 1.1046971082687378,
"learning_rate": 1.872860154138608e-05,
"loss": 0.9619,
"step": 1047
},
{
"epoch": 0.804760990593204,
"grad_norm": 1.832051157951355,
"learning_rate": 1.858727240628171e-05,
"loss": 1.4815,
"step": 1048
},
{
"epoch": 0.8055288923017854,
"grad_norm": 1.1102668046951294,
"learning_rate": 1.8446423883373286e-05,
"loss": 0.9905,
"step": 1049
},
{
"epoch": 0.8062967940103667,
"grad_norm": 1.155604600906372,
"learning_rate": 1.8306056804146575e-05,
"loss": 1.0172,
"step": 1050
},
{
"epoch": 0.807064695718948,
"grad_norm": 1.4365675449371338,
"learning_rate": 1.816617199724512e-05,
"loss": 1.3202,
"step": 1051
},
{
"epoch": 0.8078325974275293,
"grad_norm": 1.2660962343215942,
"learning_rate": 1.8026770288465323e-05,
"loss": 1.2372,
"step": 1052
},
{
"epoch": 0.8086004991361105,
"grad_norm": 1.484471082687378,
"learning_rate": 1.7887852500751822e-05,
"loss": 1.2489,
"step": 1053
},
{
"epoch": 0.8093684008446919,
"grad_norm": 1.62258780002594,
"learning_rate": 1.7749419454192373e-05,
"loss": 1.3631,
"step": 1054
},
{
"epoch": 0.8101363025532732,
"grad_norm": 1.2742317914962769,
"learning_rate": 1.7611471966013127e-05,
"loss": 1.0303,
"step": 1055
},
{
"epoch": 0.8109042042618545,
"grad_norm": 1.770555853843689,
"learning_rate": 1.7474010850573775e-05,
"loss": 1.4317,
"step": 1056
},
{
"epoch": 0.8116721059704358,
"grad_norm": 1.3370784521102905,
"learning_rate": 1.7337036919362827e-05,
"loss": 1.1577,
"step": 1057
},
{
"epoch": 0.8124400076790171,
"grad_norm": 1.4367464780807495,
"learning_rate": 1.7200550980992647e-05,
"loss": 1.3085,
"step": 1058
},
{
"epoch": 0.8132079093875983,
"grad_norm": 1.363452434539795,
"learning_rate": 1.706455384119485e-05,
"loss": 1.1423,
"step": 1059
},
{
"epoch": 0.8139758110961797,
"grad_norm": 1.3414223194122314,
"learning_rate": 1.6929046302815443e-05,
"loss": 1.2153,
"step": 1060
},
{
"epoch": 0.814743712804761,
"grad_norm": 1.577257513999939,
"learning_rate": 1.6794029165810133e-05,
"loss": 1.357,
"step": 1061
},
{
"epoch": 0.8155116145133423,
"grad_norm": 1.594196081161499,
"learning_rate": 1.665950322723957e-05,
"loss": 1.2597,
"step": 1062
},
{
"epoch": 0.8162795162219236,
"grad_norm": 1.5210363864898682,
"learning_rate": 1.652546928126466e-05,
"loss": 1.2459,
"step": 1063
},
{
"epoch": 0.817047417930505,
"grad_norm": 1.3744914531707764,
"learning_rate": 1.6391928119141965e-05,
"loss": 1.236,
"step": 1064
},
{
"epoch": 0.8178153196390862,
"grad_norm": 1.56074857711792,
"learning_rate": 1.625888052921878e-05,
"loss": 1.2514,
"step": 1065
},
{
"epoch": 0.8185832213476675,
"grad_norm": 1.3938485383987427,
"learning_rate": 1.612632729692881e-05,
"loss": 1.3724,
"step": 1066
},
{
"epoch": 0.8193511230562488,
"grad_norm": 1.2004398107528687,
"learning_rate": 1.599426920478726e-05,
"loss": 0.8796,
"step": 1067
},
{
"epoch": 0.8201190247648301,
"grad_norm": 1.6214113235473633,
"learning_rate": 1.586270703238637e-05,
"loss": 1.3691,
"step": 1068
},
{
"epoch": 0.8208869264734114,
"grad_norm": 1.7719411849975586,
"learning_rate": 1.573164155639073e-05,
"loss": 1.5033,
"step": 1069
},
{
"epoch": 0.8216548281819928,
"grad_norm": 1.2365126609802246,
"learning_rate": 1.560107355053282e-05,
"loss": 1.1611,
"step": 1070
},
{
"epoch": 0.822422729890574,
"grad_norm": 1.804200291633606,
"learning_rate": 1.5471003785608184e-05,
"loss": 1.2228,
"step": 1071
},
{
"epoch": 0.8231906315991553,
"grad_norm": 3.12967586517334,
"learning_rate": 1.534143302947123e-05,
"loss": 1.0455,
"step": 1072
},
{
"epoch": 0.8239585333077366,
"grad_norm": 1.7391802072525024,
"learning_rate": 1.5212362047030427e-05,
"loss": 1.4632,
"step": 1073
},
{
"epoch": 0.8247264350163179,
"grad_norm": 1.4144318103790283,
"learning_rate": 1.5083791600243857e-05,
"loss": 1.2948,
"step": 1074
},
{
"epoch": 0.8254943367248992,
"grad_norm": 1.2028521299362183,
"learning_rate": 1.4955722448114807e-05,
"loss": 0.9203,
"step": 1075
},
{
"epoch": 0.8262622384334806,
"grad_norm": 1.4866068363189697,
"learning_rate": 1.4828155346687123e-05,
"loss": 1.3486,
"step": 1076
},
{
"epoch": 0.8270301401420618,
"grad_norm": 1.471641182899475,
"learning_rate": 1.4701091049040994e-05,
"loss": 1.1633,
"step": 1077
},
{
"epoch": 0.8277980418506431,
"grad_norm": 1.3013348579406738,
"learning_rate": 1.4574530305288158e-05,
"loss": 0.9819,
"step": 1078
},
{
"epoch": 0.8285659435592244,
"grad_norm": 1.5000224113464355,
"learning_rate": 1.4448473862567857e-05,
"loss": 1.1458,
"step": 1079
},
{
"epoch": 0.8293338452678057,
"grad_norm": 1.4229687452316284,
"learning_rate": 1.4322922465042132e-05,
"loss": 1.3797,
"step": 1080
},
{
"epoch": 0.8301017469763871,
"grad_norm": 1.3966771364212036,
"learning_rate": 1.4197876853891557e-05,
"loss": 1.4436,
"step": 1081
},
{
"epoch": 0.8308696486849684,
"grad_norm": 1.3111258745193481,
"learning_rate": 1.4073337767310834e-05,
"loss": 1.175,
"step": 1082
},
{
"epoch": 0.8316375503935496,
"grad_norm": 1.4146270751953125,
"learning_rate": 1.3949305940504541e-05,
"loss": 1.1575,
"step": 1083
},
{
"epoch": 0.8324054521021309,
"grad_norm": 1.2625914812088013,
"learning_rate": 1.3825782105682527e-05,
"loss": 0.984,
"step": 1084
},
{
"epoch": 0.8331733538107122,
"grad_norm": 1.2037335634231567,
"learning_rate": 1.3702766992055927e-05,
"loss": 1.0908,
"step": 1085
},
{
"epoch": 0.8339412555192935,
"grad_norm": 1.384108304977417,
"learning_rate": 1.3580261325832578e-05,
"loss": 1.4079,
"step": 1086
},
{
"epoch": 0.8347091572278749,
"grad_norm": 1.0640181303024292,
"learning_rate": 1.3458265830212891e-05,
"loss": 0.68,
"step": 1087
},
{
"epoch": 0.8354770589364562,
"grad_norm": 1.6578110456466675,
"learning_rate": 1.333678122538553e-05,
"loss": 1.1659,
"step": 1088
},
{
"epoch": 0.8362449606450374,
"grad_norm": 1.3299002647399902,
"learning_rate": 1.321580822852313e-05,
"loss": 1.1961,
"step": 1089
},
{
"epoch": 0.8370128623536187,
"grad_norm": 1.5801900625228882,
"learning_rate": 1.3095347553778193e-05,
"loss": 1.3358,
"step": 1090
},
{
"epoch": 0.8377807640622,
"grad_norm": 1.6045485734939575,
"learning_rate": 1.2975399912278608e-05,
"loss": 1.3452,
"step": 1091
},
{
"epoch": 0.8385486657707814,
"grad_norm": 1.3222181797027588,
"learning_rate": 1.2855966012123822e-05,
"loss": 1.0153,
"step": 1092
},
{
"epoch": 0.8393165674793627,
"grad_norm": 1.5065910816192627,
"learning_rate": 1.2737046558380305e-05,
"loss": 1.3556,
"step": 1093
},
{
"epoch": 0.840084469187944,
"grad_norm": 1.6371604204177856,
"learning_rate": 1.2618642253077628e-05,
"loss": 1.3203,
"step": 1094
},
{
"epoch": 0.8408523708965252,
"grad_norm": 1.801613688468933,
"learning_rate": 1.2500753795204157e-05,
"loss": 1.272,
"step": 1095
},
{
"epoch": 0.8416202726051065,
"grad_norm": 1.6735864877700806,
"learning_rate": 1.2383381880703138e-05,
"loss": 1.2709,
"step": 1096
},
{
"epoch": 0.8423881743136878,
"grad_norm": 1.3746118545532227,
"learning_rate": 1.2266527202468248e-05,
"loss": 1.164,
"step": 1097
},
{
"epoch": 0.8431560760222692,
"grad_norm": 1.5882354974746704,
"learning_rate": 1.2150190450339915e-05,
"loss": 0.8384,
"step": 1098
},
{
"epoch": 0.8439239777308505,
"grad_norm": 1.492803931236267,
"learning_rate": 1.2034372311100905e-05,
"loss": 1.4302,
"step": 1099
},
{
"epoch": 0.8446918794394318,
"grad_norm": 1.868151307106018,
"learning_rate": 1.1919073468472475e-05,
"loss": 1.24,
"step": 1100
},
{
"epoch": 0.845459781148013,
"grad_norm": 1.3878445625305176,
"learning_rate": 1.1804294603110222e-05,
"loss": 1.1959,
"step": 1101
},
{
"epoch": 0.8462276828565943,
"grad_norm": 1.6517834663391113,
"learning_rate": 1.1690036392600112e-05,
"loss": 1.4946,
"step": 1102
},
{
"epoch": 0.8469955845651757,
"grad_norm": 1.5959718227386475,
"learning_rate": 1.1576299511454513e-05,
"loss": 1.3349,
"step": 1103
},
{
"epoch": 0.847763486273757,
"grad_norm": 1.3100310564041138,
"learning_rate": 1.1463084631108101e-05,
"loss": 1.0446,
"step": 1104
},
{
"epoch": 0.8485313879823383,
"grad_norm": 1.5334558486938477,
"learning_rate": 1.135039241991408e-05,
"loss": 1.0935,
"step": 1105
},
{
"epoch": 0.8492992896909196,
"grad_norm": 1.306308627128601,
"learning_rate": 1.1238223543140024e-05,
"loss": 1.0424,
"step": 1106
},
{
"epoch": 0.8500671913995008,
"grad_norm": 1.5370351076126099,
"learning_rate": 1.1126578662964115e-05,
"loss": 1.1695,
"step": 1107
},
{
"epoch": 0.8508350931080821,
"grad_norm": 1.5034894943237305,
"learning_rate": 1.1015458438471116e-05,
"loss": 1.3281,
"step": 1108
},
{
"epoch": 0.8516029948166635,
"grad_norm": 1.815669059753418,
"learning_rate": 1.0904863525648633e-05,
"loss": 1.3144,
"step": 1109
},
{
"epoch": 0.8523708965252448,
"grad_norm": 1.2194162607192993,
"learning_rate": 1.0794794577383016e-05,
"loss": 1.1239,
"step": 1110
},
{
"epoch": 0.8531387982338261,
"grad_norm": 1.3530551195144653,
"learning_rate": 1.0685252243455712e-05,
"loss": 1.1957,
"step": 1111
},
{
"epoch": 0.8539066999424074,
"grad_norm": 1.437690019607544,
"learning_rate": 1.0576237170539383e-05,
"loss": 1.1088,
"step": 1112
},
{
"epoch": 0.8546746016509886,
"grad_norm": 1.7501755952835083,
"learning_rate": 1.0467750002193944e-05,
"loss": 1.3246,
"step": 1113
},
{
"epoch": 0.85544250335957,
"grad_norm": 1.587377905845642,
"learning_rate": 1.0359791378863005e-05,
"loss": 1.2558,
"step": 1114
},
{
"epoch": 0.8562104050681513,
"grad_norm": 1.8887481689453125,
"learning_rate": 1.025236193786987e-05,
"loss": 1.523,
"step": 1115
},
{
"epoch": 0.8569783067767326,
"grad_norm": 1.4728237390518188,
"learning_rate": 1.014546231341391e-05,
"loss": 1.4592,
"step": 1116
},
{
"epoch": 0.8577462084853139,
"grad_norm": 1.5659328699111938,
"learning_rate": 1.0039093136566735e-05,
"loss": 1.2452,
"step": 1117
},
{
"epoch": 0.8585141101938952,
"grad_norm": 1.501523733139038,
"learning_rate": 9.933255035268574e-06,
"loss": 1.1862,
"step": 1118
},
{
"epoch": 0.8592820119024764,
"grad_norm": 1.4726933240890503,
"learning_rate": 9.827948634324447e-06,
"loss": 1.1691,
"step": 1119
},
{
"epoch": 0.8600499136110578,
"grad_norm": 1.601131558418274,
"learning_rate": 9.72317455540055e-06,
"loss": 1.2678,
"step": 1120
},
{
"epoch": 0.8608178153196391,
"grad_norm": 1.5712275505065918,
"learning_rate": 9.61893341702056e-06,
"loss": 1.1582,
"step": 1121
},
{
"epoch": 0.8615857170282204,
"grad_norm": 1.4145914316177368,
"learning_rate": 9.515225834562003e-06,
"loss": 0.9806,
"step": 1122
},
{
"epoch": 0.8623536187368017,
"grad_norm": 1.4771418571472168,
"learning_rate": 9.412052420252605e-06,
"loss": 1.233,
"step": 1123
},
{
"epoch": 0.863121520445383,
"grad_norm": 1.4225813150405884,
"learning_rate": 9.309413783166654e-06,
"loss": 0.8999,
"step": 1124
},
{
"epoch": 0.8638894221539642,
"grad_norm": 1.6784931421279907,
"learning_rate": 9.207310529221525e-06,
"loss": 1.2208,
"step": 1125
},
{
"epoch": 0.8646573238625456,
"grad_norm": 1.05604887008667,
"learning_rate": 9.10574326117386e-06,
"loss": 0.9784,
"step": 1126
},
{
"epoch": 0.8654252255711269,
"grad_norm": 1.5101516246795654,
"learning_rate": 9.004712578616304e-06,
"loss": 1.2459,
"step": 1127
},
{
"epoch": 0.8661931272797082,
"grad_norm": 1.5375887155532837,
"learning_rate": 8.90421907797374e-06,
"loss": 1.3941,
"step": 1128
},
{
"epoch": 0.8669610289882895,
"grad_norm": 1.382124423980713,
"learning_rate": 8.804263352499864e-06,
"loss": 1.0871,
"step": 1129
},
{
"epoch": 0.8677289306968708,
"grad_norm": 1.4245071411132812,
"learning_rate": 8.70484599227367e-06,
"loss": 1.0823,
"step": 1130
},
{
"epoch": 0.8684968324054521,
"grad_norm": 1.651361107826233,
"learning_rate": 8.605967584195995e-06,
"loss": 1.3007,
"step": 1131
},
{
"epoch": 0.8692647341140334,
"grad_norm": 1.8151158094406128,
"learning_rate": 8.507628711985983e-06,
"loss": 1.3263,
"step": 1132
},
{
"epoch": 0.8700326358226147,
"grad_norm": 1.6930426359176636,
"learning_rate": 8.409829956177684e-06,
"loss": 1.2812,
"step": 1133
},
{
"epoch": 0.870800537531196,
"grad_norm": 1.6178615093231201,
"learning_rate": 8.312571894116649e-06,
"loss": 1.2922,
"step": 1134
},
{
"epoch": 0.8715684392397773,
"grad_norm": 1.7102075815200806,
"learning_rate": 8.215855099956472e-06,
"loss": 1.1281,
"step": 1135
},
{
"epoch": 0.8723363409483587,
"grad_norm": 1.4845244884490967,
"learning_rate": 8.119680144655428e-06,
"loss": 1.0213,
"step": 1136
},
{
"epoch": 0.8731042426569399,
"grad_norm": 1.4863834381103516,
"learning_rate": 8.024047595973095e-06,
"loss": 1.0862,
"step": 1137
},
{
"epoch": 0.8738721443655212,
"grad_norm": 1.2989436388015747,
"learning_rate": 7.92895801846707e-06,
"loss": 1.052,
"step": 1138
},
{
"epoch": 0.8746400460741025,
"grad_norm": 1.3424791097640991,
"learning_rate": 7.834411973489419e-06,
"loss": 1.3913,
"step": 1139
},
{
"epoch": 0.8754079477826838,
"grad_norm": 1.5543662309646606,
"learning_rate": 7.740410019183697e-06,
"loss": 1.4759,
"step": 1140
},
{
"epoch": 0.8761758494912651,
"grad_norm": 1.499265193939209,
"learning_rate": 7.646952710481336e-06,
"loss": 1.3668,
"step": 1141
},
{
"epoch": 0.8769437511998465,
"grad_norm": 1.6394531726837158,
"learning_rate": 7.554040599098588e-06,
"loss": 1.1713,
"step": 1142
},
{
"epoch": 0.8777116529084277,
"grad_norm": 1.4457104206085205,
"learning_rate": 7.461674233533123e-06,
"loss": 1.3674,
"step": 1143
},
{
"epoch": 0.878479554617009,
"grad_norm": 1.468809723854065,
"learning_rate": 7.369854159060929e-06,
"loss": 1.0444,
"step": 1144
},
{
"epoch": 0.8792474563255903,
"grad_norm": 1.610374093055725,
"learning_rate": 7.278580917732913e-06,
"loss": 1.5037,
"step": 1145
},
{
"epoch": 0.8800153580341716,
"grad_norm": 1.3369286060333252,
"learning_rate": 7.187855048371917e-06,
"loss": 1.129,
"step": 1146
},
{
"epoch": 0.880783259742753,
"grad_norm": 1.4274189472198486,
"learning_rate": 7.097677086569343e-06,
"loss": 1.2629,
"step": 1147
},
{
"epoch": 0.8815511614513343,
"grad_norm": 1.4373105764389038,
"learning_rate": 7.008047564682119e-06,
"loss": 1.2244,
"step": 1148
},
{
"epoch": 0.8823190631599155,
"grad_norm": 1.5060944557189941,
"learning_rate": 6.91896701182948e-06,
"loss": 1.3279,
"step": 1149
},
{
"epoch": 0.8830869648684968,
"grad_norm": 1.1090878248214722,
"learning_rate": 6.83043595388988e-06,
"loss": 0.9791,
"step": 1150
},
{
"epoch": 0.8838548665770781,
"grad_norm": 1.6525593996047974,
"learning_rate": 6.742454913497942e-06,
"loss": 1.2751,
"step": 1151
},
{
"epoch": 0.8846227682856594,
"grad_norm": 1.5659713745117188,
"learning_rate": 6.6550244100412e-06,
"loss": 1.3544,
"step": 1152
},
{
"epoch": 0.8853906699942408,
"grad_norm": 1.4881879091262817,
"learning_rate": 6.568144959657263e-06,
"loss": 1.1873,
"step": 1153
},
{
"epoch": 0.8861585717028221,
"grad_norm": 1.8786675930023193,
"learning_rate": 6.481817075230567e-06,
"loss": 1.3176,
"step": 1154
},
{
"epoch": 0.8869264734114033,
"grad_norm": 1.4211797714233398,
"learning_rate": 6.396041266389474e-06,
"loss": 1.3475,
"step": 1155
},
{
"epoch": 0.8876943751199846,
"grad_norm": 1.4349972009658813,
"learning_rate": 6.3108180395031965e-06,
"loss": 1.1019,
"step": 1156
},
{
"epoch": 0.8884622768285659,
"grad_norm": 1.3336567878723145,
"learning_rate": 6.22614789767888e-06,
"loss": 1.1098,
"step": 1157
},
{
"epoch": 0.8892301785371473,
"grad_norm": 1.4133919477462769,
"learning_rate": 6.142031340758525e-06,
"loss": 1.2118,
"step": 1158
},
{
"epoch": 0.8899980802457286,
"grad_norm": 1.3893505334854126,
"learning_rate": 6.058468865316102e-06,
"loss": 1.1315,
"step": 1159
},
{
"epoch": 0.8907659819543099,
"grad_norm": 1.786585807800293,
"learning_rate": 5.975460964654689e-06,
"loss": 1.6215,
"step": 1160
},
{
"epoch": 0.8915338836628911,
"grad_norm": 1.3746907711029053,
"learning_rate": 5.8930081288034014e-06,
"loss": 1.0703,
"step": 1161
},
{
"epoch": 0.8923017853714724,
"grad_norm": 1.420802116394043,
"learning_rate": 5.8111108445146116e-06,
"loss": 1.0426,
"step": 1162
},
{
"epoch": 0.8930696870800537,
"grad_norm": 1.5010699033737183,
"learning_rate": 5.72976959526107e-06,
"loss": 1.3698,
"step": 1163
},
{
"epoch": 0.8938375887886351,
"grad_norm": 1.3752045631408691,
"learning_rate": 5.648984861232986e-06,
"loss": 1.2014,
"step": 1164
},
{
"epoch": 0.8946054904972164,
"grad_norm": 1.5743210315704346,
"learning_rate": 5.568757119335244e-06,
"loss": 1.4007,
"step": 1165
},
{
"epoch": 0.8953733922057977,
"grad_norm": 1.4705897569656372,
"learning_rate": 5.489086843184632e-06,
"loss": 1.0485,
"step": 1166
},
{
"epoch": 0.8961412939143789,
"grad_norm": 1.4664305448532104,
"learning_rate": 5.40997450310693e-06,
"loss": 1.3765,
"step": 1167
},
{
"epoch": 0.8969091956229602,
"grad_norm": 1.6887331008911133,
"learning_rate": 5.3314205661342155e-06,
"loss": 1.3474,
"step": 1168
},
{
"epoch": 0.8976770973315416,
"grad_norm": 1.5927088260650635,
"learning_rate": 5.253425496002084e-06,
"loss": 1.2915,
"step": 1169
},
{
"epoch": 0.8984449990401229,
"grad_norm": 1.3032876253128052,
"learning_rate": 5.175989753146948e-06,
"loss": 1.2463,
"step": 1170
},
{
"epoch": 0.8992129007487042,
"grad_norm": 1.2626043558120728,
"learning_rate": 5.099113794703225e-06,
"loss": 1.2223,
"step": 1171
},
{
"epoch": 0.8999808024572855,
"grad_norm": 1.5485360622406006,
"learning_rate": 5.0227980745007345e-06,
"loss": 1.4437,
"step": 1172
},
{
"epoch": 0.9007487041658667,
"grad_norm": 1.3756585121154785,
"learning_rate": 4.947043043062016e-06,
"loss": 1.3702,
"step": 1173
},
{
"epoch": 0.901516605874448,
"grad_norm": 1.281550645828247,
"learning_rate": 4.87184914759955e-06,
"loss": 0.8686,
"step": 1174
},
{
"epoch": 0.9022845075830294,
"grad_norm": 1.7812248468399048,
"learning_rate": 4.7972168320132845e-06,
"loss": 1.2599,
"step": 1175
},
{
"epoch": 0.9030524092916107,
"grad_norm": 1.186799168586731,
"learning_rate": 4.7231465368879124e-06,
"loss": 1.0582,
"step": 1176
},
{
"epoch": 0.903820311000192,
"grad_norm": 1.4105154275894165,
"learning_rate": 4.649638699490266e-06,
"loss": 1.1345,
"step": 1177
},
{
"epoch": 0.9045882127087733,
"grad_norm": 1.4764769077301025,
"learning_rate": 4.576693753766792e-06,
"loss": 1.3989,
"step": 1178
},
{
"epoch": 0.9053561144173545,
"grad_norm": 2.1660075187683105,
"learning_rate": 4.5043121303409595e-06,
"loss": 1.1665,
"step": 1179
},
{
"epoch": 0.9061240161259358,
"grad_norm": 1.262603521347046,
"learning_rate": 4.432494256510711e-06,
"loss": 0.9425,
"step": 1180
},
{
"epoch": 0.9068919178345172,
"grad_norm": 1.8108893632888794,
"learning_rate": 4.361240556245938e-06,
"loss": 1.2851,
"step": 1181
},
{
"epoch": 0.9076598195430985,
"grad_norm": 1.5858032703399658,
"learning_rate": 4.290551450185986e-06,
"loss": 1.4577,
"step": 1182
},
{
"epoch": 0.9084277212516798,
"grad_norm": 1.2271817922592163,
"learning_rate": 4.220427355637224e-06,
"loss": 1.1701,
"step": 1183
},
{
"epoch": 0.9091956229602611,
"grad_norm": 1.280598759651184,
"learning_rate": 4.150868686570464e-06,
"loss": 1.2309,
"step": 1184
},
{
"epoch": 0.9099635246688423,
"grad_norm": 1.40516996383667,
"learning_rate": 4.081875853618588e-06,
"loss": 1.0201,
"step": 1185
},
{
"epoch": 0.9107314263774237,
"grad_norm": 1.4448282718658447,
"learning_rate": 4.013449264074187e-06,
"loss": 1.1834,
"step": 1186
},
{
"epoch": 0.911499328086005,
"grad_norm": 1.2964657545089722,
"learning_rate": 3.945589321886989e-06,
"loss": 1.0199,
"step": 1187
},
{
"epoch": 0.9122672297945863,
"grad_norm": 1.3470900058746338,
"learning_rate": 3.878296427661676e-06,
"loss": 1.0585,
"step": 1188
},
{
"epoch": 0.9130351315031676,
"grad_norm": 1.3308759927749634,
"learning_rate": 3.8115709786553435e-06,
"loss": 1.3752,
"step": 1189
},
{
"epoch": 0.9138030332117489,
"grad_norm": 1.4458931684494019,
"learning_rate": 3.7454133687752524e-06,
"loss": 1.1349,
"step": 1190
},
{
"epoch": 0.9145709349203301,
"grad_norm": 1.471204400062561,
"learning_rate": 3.6798239885764806e-06,
"loss": 1.126,
"step": 1191
},
{
"epoch": 0.9153388366289115,
"grad_norm": 1.403232216835022,
"learning_rate": 3.614803225259622e-06,
"loss": 1.1377,
"step": 1192
},
{
"epoch": 0.9161067383374928,
"grad_norm": 1.5859801769256592,
"learning_rate": 3.550351462668489e-06,
"loss": 1.2045,
"step": 1193
},
{
"epoch": 0.9168746400460741,
"grad_norm": 1.2398918867111206,
"learning_rate": 3.4864690812878688e-06,
"loss": 1.111,
"step": 1194
},
{
"epoch": 0.9176425417546554,
"grad_norm": 1.455277681350708,
"learning_rate": 3.4231564582412167e-06,
"loss": 1.1314,
"step": 1195
},
{
"epoch": 0.9184104434632367,
"grad_norm": 1.5992294549942017,
"learning_rate": 3.3604139672885227e-06,
"loss": 1.2585,
"step": 1196
},
{
"epoch": 0.919178345171818,
"grad_norm": 1.6772078275680542,
"learning_rate": 3.298241978824046e-06,
"loss": 1.0393,
"step": 1197
},
{
"epoch": 0.9199462468803993,
"grad_norm": 1.202988862991333,
"learning_rate": 3.2366408598741072e-06,
"loss": 1.0763,
"step": 1198
},
{
"epoch": 0.9207141485889806,
"grad_norm": 1.5782155990600586,
"learning_rate": 3.175610974095011e-06,
"loss": 1.3341,
"step": 1199
},
{
"epoch": 0.9214820502975619,
"grad_norm": 1.611743450164795,
"learning_rate": 3.115152681770783e-06,
"loss": 1.0767,
"step": 1200
},
{
"epoch": 0.9222499520061432,
"grad_norm": 1.6371206045150757,
"learning_rate": 3.055266339811147e-06,
"loss": 0.7947,
"step": 1201
},
{
"epoch": 0.9230178537147246,
"grad_norm": 1.335860252380371,
"learning_rate": 2.9959523017493386e-06,
"loss": 1.2236,
"step": 1202
},
{
"epoch": 0.9237857554233058,
"grad_norm": 1.4622019529342651,
"learning_rate": 2.9372109177400854e-06,
"loss": 1.5577,
"step": 1203
},
{
"epoch": 0.9245536571318871,
"grad_norm": 1.4906941652297974,
"learning_rate": 2.8790425345574745e-06,
"loss": 1.1759,
"step": 1204
},
{
"epoch": 0.9253215588404684,
"grad_norm": 1.2302792072296143,
"learning_rate": 2.821447495592977e-06,
"loss": 0.7965,
"step": 1205
},
{
"epoch": 0.9260894605490497,
"grad_norm": 1.4680209159851074,
"learning_rate": 2.7644261408533155e-06,
"loss": 1.213,
"step": 1206
},
{
"epoch": 0.926857362257631,
"grad_norm": 1.737641453742981,
"learning_rate": 2.707978806958611e-06,
"loss": 1.1734,
"step": 1207
},
{
"epoch": 0.9276252639662124,
"grad_norm": 1.386087417602539,
"learning_rate": 2.6521058271402386e-06,
"loss": 1.3075,
"step": 1208
},
{
"epoch": 0.9283931656747936,
"grad_norm": 1.5818560123443604,
"learning_rate": 2.596807531238965e-06,
"loss": 1.3811,
"step": 1209
},
{
"epoch": 0.9291610673833749,
"grad_norm": 1.2463423013687134,
"learning_rate": 2.542084245702947e-06,
"loss": 1.2488,
"step": 1210
},
{
"epoch": 0.9299289690919562,
"grad_norm": 1.1760039329528809,
"learning_rate": 2.487936293585813e-06,
"loss": 0.9825,
"step": 1211
},
{
"epoch": 0.9306968708005375,
"grad_norm": 1.2284053564071655,
"learning_rate": 2.4343639945448306e-06,
"loss": 1.12,
"step": 1212
},
{
"epoch": 0.9314647725091189,
"grad_norm": 1.4763203859329224,
"learning_rate": 2.3813676648388517e-06,
"loss": 1.1244,
"step": 1213
},
{
"epoch": 0.9322326742177002,
"grad_norm": 1.6483160257339478,
"learning_rate": 2.3289476173266376e-06,
"loss": 1.3215,
"step": 1214
},
{
"epoch": 0.9330005759262814,
"grad_norm": 1.625486135482788,
"learning_rate": 2.2771041614648825e-06,
"loss": 1.578,
"step": 1215
},
{
"epoch": 0.9337684776348627,
"grad_norm": 1.7281734943389893,
"learning_rate": 2.2258376033064354e-06,
"loss": 1.3752,
"step": 1216
},
{
"epoch": 0.934536379343444,
"grad_norm": 1.1822775602340698,
"learning_rate": 2.1751482454984706e-06,
"loss": 0.9411,
"step": 1217
},
{
"epoch": 0.9353042810520253,
"grad_norm": 1.5272676944732666,
"learning_rate": 2.1250363872807655e-06,
"loss": 1.3861,
"step": 1218
},
{
"epoch": 0.9360721827606067,
"grad_norm": 1.6605676412582397,
"learning_rate": 2.0755023244838136e-06,
"loss": 1.5562,
"step": 1219
},
{
"epoch": 0.936840084469188,
"grad_norm": 1.4125683307647705,
"learning_rate": 2.026546349527181e-06,
"loss": 1.0978,
"step": 1220
},
{
"epoch": 0.9376079861777692,
"grad_norm": 1.3512035608291626,
"learning_rate": 1.978168751417786e-06,
"loss": 0.9322,
"step": 1221
},
{
"epoch": 0.9383758878863505,
"grad_norm": 1.7064318656921387,
"learning_rate": 1.93036981574809e-06,
"loss": 1.2816,
"step": 1222
},
{
"epoch": 0.9391437895949318,
"grad_norm": 1.2211792469024658,
"learning_rate": 1.8831498246945189e-06,
"loss": 1.0071,
"step": 1223
},
{
"epoch": 0.9399116913035132,
"grad_norm": 1.3540048599243164,
"learning_rate": 1.836509057015734e-06,
"loss": 1.202,
"step": 1224
},
{
"epoch": 0.9406795930120945,
"grad_norm": 1.3373445272445679,
"learning_rate": 1.7904477880510307e-06,
"loss": 1.142,
"step": 1225
},
{
"epoch": 0.9414474947206758,
"grad_norm": 1.38016939163208,
"learning_rate": 1.7449662897186414e-06,
"loss": 1.3193,
"step": 1226
},
{
"epoch": 0.942215396429257,
"grad_norm": 1.3662525415420532,
"learning_rate": 1.7000648305142364e-06,
"loss": 1.0969,
"step": 1227
},
{
"epoch": 0.9429832981378383,
"grad_norm": 1.7097259759902954,
"learning_rate": 1.655743675509258e-06,
"loss": 1.4585,
"step": 1228
},
{
"epoch": 0.9437511998464196,
"grad_norm": 1.177868127822876,
"learning_rate": 1.6120030863493674e-06,
"loss": 0.9733,
"step": 1229
},
{
"epoch": 0.944519101555001,
"grad_norm": 1.481663465499878,
"learning_rate": 1.5688433212529107e-06,
"loss": 1.0466,
"step": 1230
},
{
"epoch": 0.9452870032635823,
"grad_norm": 1.6859853267669678,
"learning_rate": 1.5262646350094334e-06,
"loss": 1.2808,
"step": 1231
},
{
"epoch": 0.9460549049721636,
"grad_norm": 1.7730293273925781,
"learning_rate": 1.4842672789780798e-06,
"loss": 1.2563,
"step": 1232
},
{
"epoch": 0.9468228066807448,
"grad_norm": 1.5149153470993042,
"learning_rate": 1.4428515010861955e-06,
"loss": 1.249,
"step": 1233
},
{
"epoch": 0.9475907083893261,
"grad_norm": 1.8611276149749756,
"learning_rate": 1.4020175458278607e-06,
"loss": 1.5504,
"step": 1234
},
{
"epoch": 0.9483586100979075,
"grad_norm": 1.5275269746780396,
"learning_rate": 1.3617656542623813e-06,
"loss": 1.3417,
"step": 1235
},
{
"epoch": 0.9491265118064888,
"grad_norm": 1.3223843574523926,
"learning_rate": 1.322096064012912e-06,
"loss": 1.1907,
"step": 1236
},
{
"epoch": 0.9498944135150701,
"grad_norm": 1.4529744386672974,
"learning_rate": 1.2830090092650904e-06,
"loss": 1.16,
"step": 1237
},
{
"epoch": 0.9506623152236514,
"grad_norm": 1.4799784421920776,
"learning_rate": 1.244504720765549e-06,
"loss": 1.3583,
"step": 1238
},
{
"epoch": 0.9514302169322326,
"grad_norm": 1.4964150190353394,
"learning_rate": 1.20658342582064e-06,
"loss": 1.2563,
"step": 1239
},
{
"epoch": 0.9521981186408139,
"grad_norm": 1.318679690361023,
"learning_rate": 1.1692453482951115e-06,
"loss": 1.1699,
"step": 1240
},
{
"epoch": 0.9529660203493953,
"grad_norm": 1.2729301452636719,
"learning_rate": 1.1324907086106895e-06,
"loss": 1.0768,
"step": 1241
},
{
"epoch": 0.9537339220579766,
"grad_norm": 1.4914193153381348,
"learning_rate": 1.096319723744843e-06,
"loss": 0.9835,
"step": 1242
},
{
"epoch": 0.9545018237665579,
"grad_norm": 1.4499318599700928,
"learning_rate": 1.0607326072295087e-06,
"loss": 1.1102,
"step": 1243
},
{
"epoch": 0.9552697254751392,
"grad_norm": 1.4721513986587524,
"learning_rate": 1.0257295691497914e-06,
"loss": 1.234,
"step": 1244
},
{
"epoch": 0.9560376271837204,
"grad_norm": 1.553381323814392,
"learning_rate": 9.913108161427543e-07,
"loss": 1.2512,
"step": 1245
},
{
"epoch": 0.9568055288923017,
"grad_norm": 1.384883165359497,
"learning_rate": 9.57476551396197e-07,
"loss": 1.2325,
"step": 1246
},
{
"epoch": 0.9575734306008831,
"grad_norm": 1.6136502027511597,
"learning_rate": 9.242269746474575e-07,
"loss": 1.4892,
"step": 1247
},
{
"epoch": 0.9583413323094644,
"grad_norm": 1.5573700666427612,
"learning_rate": 8.915622821821789e-07,
"loss": 1.447,
"step": 1248
},
{
"epoch": 0.9591092340180457,
"grad_norm": 1.47049880027771,
"learning_rate": 8.594826668332445e-07,
"loss": 1.1597,
"step": 1249
},
{
"epoch": 0.959877135726627,
"grad_norm": 0.9694231748580933,
"learning_rate": 8.279883179795666e-07,
"loss": 0.8528,
"step": 1250
},
{
"epoch": 0.9606450374352083,
"grad_norm": 1.28829026222229,
"learning_rate": 7.970794215450106e-07,
"loss": 1.1936,
"step": 1251
},
{
"epoch": 0.9614129391437896,
"grad_norm": 1.6050745248794556,
"learning_rate": 7.667561599972505e-07,
"loss": 1.3697,
"step": 1252
},
{
"epoch": 0.9621808408523709,
"grad_norm": 1.1561931371688843,
"learning_rate": 7.370187123467708e-07,
"loss": 0.9132,
"step": 1253
},
{
"epoch": 0.9629487425609522,
"grad_norm": 1.2682461738586426,
"learning_rate": 7.078672541456999e-07,
"loss": 1.1176,
"step": 1254
},
{
"epoch": 0.9637166442695335,
"grad_norm": 1.4032613039016724,
"learning_rate": 6.793019574868775e-07,
"loss": 1.2812,
"step": 1255
},
{
"epoch": 0.9644845459781148,
"grad_norm": 1.855252981185913,
"learning_rate": 6.513229910027896e-07,
"loss": 1.4774,
"step": 1256
},
{
"epoch": 0.9652524476866962,
"grad_norm": 1.4534977674484253,
"learning_rate": 6.239305198645462e-07,
"loss": 1.097,
"step": 1257
},
{
"epoch": 0.9660203493952774,
"grad_norm": 1.4636932611465454,
"learning_rate": 5.971247057809826e-07,
"loss": 1.2459,
"step": 1258
},
{
"epoch": 0.9667882511038587,
"grad_norm": 1.318543553352356,
"learning_rate": 5.709057069976265e-07,
"loss": 1.342,
"step": 1259
},
{
"epoch": 0.96755615281244,
"grad_norm": 1.2237138748168945,
"learning_rate": 5.452736782958323e-07,
"loss": 1.0311,
"step": 1260
},
{
"epoch": 0.9683240545210213,
"grad_norm": 1.3968307971954346,
"learning_rate": 5.20228770991793e-07,
"loss": 1.2152,
"step": 1261
},
{
"epoch": 0.9690919562296026,
"grad_norm": 1.2888987064361572,
"learning_rate": 4.957711329357073e-07,
"loss": 0.9803,
"step": 1262
},
{
"epoch": 0.969859857938184,
"grad_norm": 1.2455861568450928,
"learning_rate": 4.7190090851090274e-07,
"loss": 0.814,
"step": 1263
},
{
"epoch": 0.9706277596467652,
"grad_norm": 1.263824462890625,
"learning_rate": 4.4861823863292516e-07,
"loss": 0.9433,
"step": 1264
},
{
"epoch": 0.9713956613553465,
"grad_norm": 1.63260817527771,
"learning_rate": 4.259232607487951e-07,
"loss": 1.4563,
"step": 1265
},
{
"epoch": 0.9721635630639278,
"grad_norm": 1.3201172351837158,
"learning_rate": 4.038161088361192e-07,
"loss": 1.0101,
"step": 1266
},
{
"epoch": 0.9729314647725091,
"grad_norm": 1.456137776374817,
"learning_rate": 3.8229691340234684e-07,
"loss": 1.3377,
"step": 1267
},
{
"epoch": 0.9736993664810905,
"grad_norm": 1.9307949542999268,
"learning_rate": 3.613658014839594e-07,
"loss": 1.5946,
"step": 1268
},
{
"epoch": 0.9744672681896718,
"grad_norm": 1.0594274997711182,
"learning_rate": 3.4102289664578177e-07,
"loss": 1.0772,
"step": 1269
},
{
"epoch": 0.975235169898253,
"grad_norm": 1.3033182621002197,
"learning_rate": 3.212683189801724e-07,
"loss": 1.2004,
"step": 1270
},
{
"epoch": 0.9760030716068343,
"grad_norm": 1.574930191040039,
"learning_rate": 3.021021851063899e-07,
"loss": 1.266,
"step": 1271
},
{
"epoch": 0.9767709733154156,
"grad_norm": 1.5406841039657593,
"learning_rate": 2.8352460816986057e-07,
"loss": 1.0827,
"step": 1272
},
{
"epoch": 0.977538875023997,
"grad_norm": 1.1407910585403442,
"learning_rate": 2.6553569784152357e-07,
"loss": 0.8456,
"step": 1273
},
{
"epoch": 0.9783067767325783,
"grad_norm": 1.4787224531173706,
"learning_rate": 2.481355603171531e-07,
"loss": 1.3375,
"step": 1274
},
{
"epoch": 0.9790746784411596,
"grad_norm": 1.514699935913086,
"learning_rate": 2.3132429831682622e-07,
"loss": 1.4017,
"step": 1275
},
{
"epoch": 0.9798425801497408,
"grad_norm": 1.372117519378662,
"learning_rate": 2.1510201108416728e-07,
"loss": 1.1619,
"step": 1276
},
{
"epoch": 0.9806104818583221,
"grad_norm": 1.2351239919662476,
"learning_rate": 1.9946879438592636e-07,
"loss": 0.945,
"step": 1277
},
{
"epoch": 0.9813783835669034,
"grad_norm": 1.4396629333496094,
"learning_rate": 1.8442474051125757e-07,
"loss": 0.9939,
"step": 1278
},
{
"epoch": 0.9821462852754848,
"grad_norm": 1.9772976636886597,
"learning_rate": 1.6996993827129715e-07,
"loss": 1.4349,
"step": 1279
},
{
"epoch": 0.9829141869840661,
"grad_norm": 1.2860651016235352,
"learning_rate": 1.561044729985861e-07,
"loss": 0.9433,
"step": 1280
},
{
"epoch": 0.9836820886926474,
"grad_norm": 1.496124505996704,
"learning_rate": 1.428284265465596e-07,
"loss": 1.2376,
"step": 1281
},
{
"epoch": 0.9844499904012286,
"grad_norm": 1.2294336557388306,
"learning_rate": 1.3014187728906945e-07,
"loss": 0.9644,
"step": 1282
},
{
"epoch": 0.9852178921098099,
"grad_norm": 1.182978630065918,
"learning_rate": 1.1804490011995129e-07,
"loss": 1.2047,
"step": 1283
},
{
"epoch": 0.9859857938183912,
"grad_norm": 1.6320767402648926,
"learning_rate": 1.0653756645252477e-07,
"loss": 1.4767,
"step": 1284
},
{
"epoch": 0.9867536955269726,
"grad_norm": 1.482164978981018,
"learning_rate": 9.561994421924958e-08,
"loss": 1.21,
"step": 1285
},
{
"epoch": 0.9875215972355539,
"grad_norm": 1.3324884176254272,
"learning_rate": 8.529209787123682e-08,
"loss": 1.0876,
"step": 1286
},
{
"epoch": 0.9882894989441352,
"grad_norm": 1.530920386314392,
"learning_rate": 7.555408837794931e-08,
"loss": 1.1581,
"step": 1287
},
{
"epoch": 0.9890574006527164,
"grad_norm": 1.4899532794952393,
"learning_rate": 6.640597322677967e-08,
"loss": 1.3396,
"step": 1288
},
{
"epoch": 0.9898253023612977,
"grad_norm": 1.2358452081680298,
"learning_rate": 5.784780642275056e-08,
"loss": 1.0957,
"step": 1289
},
{
"epoch": 0.990593204069879,
"grad_norm": 1.703139305114746,
"learning_rate": 4.9879638488159465e-08,
"loss": 1.2339,
"step": 1290
},
{
"epoch": 0.9913611057784604,
"grad_norm": 1.3851401805877686,
"learning_rate": 4.2501516462334356e-08,
"loss": 1.4007,
"step": 1291
},
{
"epoch": 0.9921290074870417,
"grad_norm": 1.52871572971344,
"learning_rate": 3.5713483901300696e-08,
"loss": 1.3904,
"step": 1292
},
{
"epoch": 0.992896909195623,
"grad_norm": 1.9627262353897095,
"learning_rate": 2.9515580877559346e-08,
"loss": 1.2959,
"step": 1293
},
{
"epoch": 0.9936648109042042,
"grad_norm": 2.1106338500976562,
"learning_rate": 2.3907843979831257e-08,
"loss": 1.7045,
"step": 1294
},
{
"epoch": 0.9944327126127855,
"grad_norm": 1.3870468139648438,
"learning_rate": 1.8890306312846495e-08,
"loss": 1.2606,
"step": 1295
},
{
"epoch": 0.9952006143213669,
"grad_norm": 1.4199986457824707,
"learning_rate": 1.446299749716662e-08,
"loss": 1.3145,
"step": 1296
},
{
"epoch": 0.9959685160299482,
"grad_norm": 1.398864984512329,
"learning_rate": 1.0625943668973736e-08,
"loss": 1.3875,
"step": 1297
},
{
"epoch": 0.9967364177385295,
"grad_norm": 1.4852126836776733,
"learning_rate": 7.379167479948379e-09,
"loss": 1.1144,
"step": 1298
},
{
"epoch": 0.9975043194471108,
"grad_norm": 1.3623526096343994,
"learning_rate": 4.722688097125172e-09,
"loss": 1.3668,
"step": 1299
},
{
"epoch": 0.998272221155692,
"grad_norm": 1.426559329032898,
"learning_rate": 2.656521202770712e-09,
"loss": 1.2487,
"step": 1300
},
{
"epoch": 0.9990401228642733,
"grad_norm": 1.8108794689178467,
"learning_rate": 1.1806789942947484e-09,
"loss": 1.5596,
"step": 1301
},
{
"epoch": 0.9998080245728547,
"grad_norm": 1.4854652881622314,
"learning_rate": 2.9517018420577305e-10,
"loss": 1.0427,
"step": 1302
},
{
"epoch": 1.000575926281436,
"grad_norm": 2.9019205570220947,
"learning_rate": 0.0,
"loss": 2.1779,
"step": 1303
}
],
"logging_steps": 1,
"max_steps": 1303,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 326,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.5981287804174336e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}