{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.999092650468797,
"eval_steps": 500,
"global_step": 2973,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010081661457808247,
"grad_norm": 34.57471518068361,
"learning_rate": 5e-05,
"loss": 3.9088,
"step": 1
},
{
"epoch": 0.0020163322915616494,
"grad_norm": 14.984387011888629,
"learning_rate": 5e-05,
"loss": 3.2615,
"step": 2
},
{
"epoch": 0.003024498437342474,
"grad_norm": 10.049184651061143,
"learning_rate": 5e-05,
"loss": 2.9787,
"step": 3
},
{
"epoch": 0.004032664583123299,
"grad_norm": 8.838958962379248,
"learning_rate": 5e-05,
"loss": 2.8635,
"step": 4
},
{
"epoch": 0.005040830728904123,
"grad_norm": 3.9611442119429787,
"learning_rate": 5e-05,
"loss": 2.751,
"step": 5
},
{
"epoch": 0.006048996874684948,
"grad_norm": 3.9695757524166546,
"learning_rate": 5e-05,
"loss": 2.7094,
"step": 6
},
{
"epoch": 0.007057163020465773,
"grad_norm": 4.086711126397493,
"learning_rate": 5e-05,
"loss": 2.67,
"step": 7
},
{
"epoch": 0.008065329166246598,
"grad_norm": 4.107991896963143,
"learning_rate": 5e-05,
"loss": 2.617,
"step": 8
},
{
"epoch": 0.009073495312027422,
"grad_norm": 4.144442971978443,
"learning_rate": 5e-05,
"loss": 2.5949,
"step": 9
},
{
"epoch": 0.010081661457808247,
"grad_norm": 4.106646166791991,
"learning_rate": 5e-05,
"loss": 2.5656,
"step": 10
},
{
"epoch": 0.011089827603589071,
"grad_norm": 4.053417964656737,
"learning_rate": 5e-05,
"loss": 2.5501,
"step": 11
},
{
"epoch": 0.012097993749369896,
"grad_norm": 4.136404620418808,
"learning_rate": 5e-05,
"loss": 2.5117,
"step": 12
},
{
"epoch": 0.01310615989515072,
"grad_norm": 3.996802290533662,
"learning_rate": 5e-05,
"loss": 2.4996,
"step": 13
},
{
"epoch": 0.014114326040931546,
"grad_norm": 3.849705821582526,
"learning_rate": 5e-05,
"loss": 2.487,
"step": 14
},
{
"epoch": 0.01512249218671237,
"grad_norm": 3.876905025177348,
"learning_rate": 5e-05,
"loss": 2.4751,
"step": 15
},
{
"epoch": 0.016130658332493195,
"grad_norm": 4.174026769641786,
"learning_rate": 5e-05,
"loss": 2.4676,
"step": 16
},
{
"epoch": 0.017138824478274018,
"grad_norm": 3.6416371605800775,
"learning_rate": 5e-05,
"loss": 2.4028,
"step": 17
},
{
"epoch": 0.018146990624054844,
"grad_norm": 3.632584282733268,
"learning_rate": 5e-05,
"loss": 2.4173,
"step": 18
},
{
"epoch": 0.01915515676983567,
"grad_norm": 3.6069769034580834,
"learning_rate": 5e-05,
"loss": 2.4006,
"step": 19
},
{
"epoch": 0.020163322915616493,
"grad_norm": 3.544857323595724,
"learning_rate": 5e-05,
"loss": 2.3923,
"step": 20
},
{
"epoch": 0.02117148906139732,
"grad_norm": 3.5114294914477626,
"learning_rate": 5e-05,
"loss": 2.3706,
"step": 21
},
{
"epoch": 0.022179655207178142,
"grad_norm": 3.4607286551586376,
"learning_rate": 5e-05,
"loss": 2.3337,
"step": 22
},
{
"epoch": 0.02318782135295897,
"grad_norm": 3.5128484012447716,
"learning_rate": 5e-05,
"loss": 2.3594,
"step": 23
},
{
"epoch": 0.02419598749873979,
"grad_norm": 3.4759482001503335,
"learning_rate": 5e-05,
"loss": 2.3424,
"step": 24
},
{
"epoch": 0.025204153644520617,
"grad_norm": 3.4342708313075407,
"learning_rate": 5e-05,
"loss": 2.321,
"step": 25
},
{
"epoch": 0.02621231979030144,
"grad_norm": 3.4635287305691396,
"learning_rate": 5e-05,
"loss": 2.2965,
"step": 26
},
{
"epoch": 0.027220485936082266,
"grad_norm": 3.5484975768174545,
"learning_rate": 5e-05,
"loss": 2.2829,
"step": 27
},
{
"epoch": 0.028228652081863093,
"grad_norm": 3.445709870621474,
"learning_rate": 5e-05,
"loss": 2.2953,
"step": 28
},
{
"epoch": 0.029236818227643915,
"grad_norm": 3.5550881065258517,
"learning_rate": 5e-05,
"loss": 2.2736,
"step": 29
},
{
"epoch": 0.03024498437342474,
"grad_norm": 3.453768977491902,
"learning_rate": 5e-05,
"loss": 2.2509,
"step": 30
},
{
"epoch": 0.03125315051920557,
"grad_norm": 3.288221262065957,
"learning_rate": 5e-05,
"loss": 2.2539,
"step": 31
},
{
"epoch": 0.03226131666498639,
"grad_norm": 3.2957588848132113,
"learning_rate": 5e-05,
"loss": 2.222,
"step": 32
},
{
"epoch": 0.03326948281076721,
"grad_norm": 3.326504022688874,
"learning_rate": 5e-05,
"loss": 2.2299,
"step": 33
},
{
"epoch": 0.034277648956548036,
"grad_norm": 3.3869099538130536,
"learning_rate": 5e-05,
"loss": 2.2075,
"step": 34
},
{
"epoch": 0.035285815102328866,
"grad_norm": 3.3039093489375793,
"learning_rate": 5e-05,
"loss": 2.2039,
"step": 35
},
{
"epoch": 0.03629398124810969,
"grad_norm": 3.372398687640437,
"learning_rate": 5e-05,
"loss": 2.1876,
"step": 36
},
{
"epoch": 0.03730214739389051,
"grad_norm": 3.3305428300095614,
"learning_rate": 5e-05,
"loss": 2.1848,
"step": 37
},
{
"epoch": 0.03831031353967134,
"grad_norm": 3.153488406520241,
"learning_rate": 5e-05,
"loss": 2.168,
"step": 38
},
{
"epoch": 0.039318479685452164,
"grad_norm": 3.1899907953902136,
"learning_rate": 5e-05,
"loss": 2.1406,
"step": 39
},
{
"epoch": 0.040326645831232986,
"grad_norm": 3.1966061633238367,
"learning_rate": 5e-05,
"loss": 2.1458,
"step": 40
},
{
"epoch": 0.04133481197701381,
"grad_norm": 3.1424092213443884,
"learning_rate": 5e-05,
"loss": 2.1308,
"step": 41
},
{
"epoch": 0.04234297812279464,
"grad_norm": 3.1232443108155943,
"learning_rate": 5e-05,
"loss": 2.1274,
"step": 42
},
{
"epoch": 0.04335114426857546,
"grad_norm": 3.2221627259743633,
"learning_rate": 5e-05,
"loss": 2.1014,
"step": 43
},
{
"epoch": 0.044359310414356284,
"grad_norm": 3.1413246825252155,
"learning_rate": 5e-05,
"loss": 2.1125,
"step": 44
},
{
"epoch": 0.04536747656013711,
"grad_norm": 3.1011534555071405,
"learning_rate": 5e-05,
"loss": 2.0817,
"step": 45
},
{
"epoch": 0.04637564270591794,
"grad_norm": 3.047593153180324,
"learning_rate": 5e-05,
"loss": 2.0662,
"step": 46
},
{
"epoch": 0.04738380885169876,
"grad_norm": 3.0099009196968742,
"learning_rate": 5e-05,
"loss": 2.0795,
"step": 47
},
{
"epoch": 0.04839197499747958,
"grad_norm": 2.8797605142116502,
"learning_rate": 5e-05,
"loss": 2.064,
"step": 48
},
{
"epoch": 0.04940014114326041,
"grad_norm": 2.9135674114693804,
"learning_rate": 5e-05,
"loss": 2.0572,
"step": 49
},
{
"epoch": 0.050408307289041235,
"grad_norm": 2.854983324411203,
"learning_rate": 5e-05,
"loss": 2.0437,
"step": 50
},
{
"epoch": 0.05141647343482206,
"grad_norm": 2.6836463449195183,
"learning_rate": 5e-05,
"loss": 2.0318,
"step": 51
},
{
"epoch": 0.05242463958060288,
"grad_norm": 2.6293906040436927,
"learning_rate": 5e-05,
"loss": 2.0319,
"step": 52
},
{
"epoch": 0.05343280572638371,
"grad_norm": 2.5302507817152065,
"learning_rate": 5e-05,
"loss": 2.0341,
"step": 53
},
{
"epoch": 0.05444097187216453,
"grad_norm": 2.365010314655601,
"learning_rate": 5e-05,
"loss": 2.0192,
"step": 54
},
{
"epoch": 0.055449138017945356,
"grad_norm": 2.2898929454078036,
"learning_rate": 5e-05,
"loss": 2.0123,
"step": 55
},
{
"epoch": 0.056457304163726185,
"grad_norm": 2.1619235754274055,
"learning_rate": 5e-05,
"loss": 1.9955,
"step": 56
},
{
"epoch": 0.05746547030950701,
"grad_norm": 1.9924556971488956,
"learning_rate": 5e-05,
"loss": 1.9743,
"step": 57
},
{
"epoch": 0.05847363645528783,
"grad_norm": 1.8642154895286722,
"learning_rate": 5e-05,
"loss": 1.965,
"step": 58
},
{
"epoch": 0.05948180260106865,
"grad_norm": 2.5911398208880607,
"learning_rate": 5e-05,
"loss": 2.0067,
"step": 59
},
{
"epoch": 0.06048996874684948,
"grad_norm": 1.639201916804658,
"learning_rate": 5e-05,
"loss": 1.9476,
"step": 60
},
{
"epoch": 0.061498134892630306,
"grad_norm": 1.4719820152496543,
"learning_rate": 5e-05,
"loss": 1.9788,
"step": 61
},
{
"epoch": 0.06250630103841114,
"grad_norm": 1.334249704262187,
"learning_rate": 5e-05,
"loss": 1.9746,
"step": 62
},
{
"epoch": 0.06351446718419196,
"grad_norm": 1.2045528079644199,
"learning_rate": 5e-05,
"loss": 1.9384,
"step": 63
},
{
"epoch": 0.06452263332997278,
"grad_norm": 1.1167108623471675,
"learning_rate": 5e-05,
"loss": 1.9608,
"step": 64
},
{
"epoch": 0.0655307994757536,
"grad_norm": 1.0221312125696673,
"learning_rate": 5e-05,
"loss": 1.9638,
"step": 65
},
{
"epoch": 0.06653896562153443,
"grad_norm": 0.8737146038745541,
"learning_rate": 5e-05,
"loss": 1.9518,
"step": 66
},
{
"epoch": 0.06754713176731525,
"grad_norm": 0.7942690648353735,
"learning_rate": 5e-05,
"loss": 1.9362,
"step": 67
},
{
"epoch": 0.06855529791309607,
"grad_norm": 0.6836365100733637,
"learning_rate": 5e-05,
"loss": 1.9519,
"step": 68
},
{
"epoch": 0.06956346405887691,
"grad_norm": 0.6076738169177845,
"learning_rate": 5e-05,
"loss": 1.9278,
"step": 69
},
{
"epoch": 0.07057163020465773,
"grad_norm": 0.5459271299084082,
"learning_rate": 5e-05,
"loss": 1.9116,
"step": 70
},
{
"epoch": 0.07157979635043855,
"grad_norm": 0.48476007136077964,
"learning_rate": 5e-05,
"loss": 1.9356,
"step": 71
},
{
"epoch": 0.07258796249621938,
"grad_norm": 0.431119236964475,
"learning_rate": 5e-05,
"loss": 1.8993,
"step": 72
},
{
"epoch": 0.0735961286420002,
"grad_norm": 0.39233195771215057,
"learning_rate": 5e-05,
"loss": 1.9308,
"step": 73
},
{
"epoch": 0.07460429478778102,
"grad_norm": 0.3525945811491024,
"learning_rate": 5e-05,
"loss": 1.9144,
"step": 74
},
{
"epoch": 0.07561246093356185,
"grad_norm": 0.3147607611067091,
"learning_rate": 5e-05,
"loss": 1.9147,
"step": 75
},
{
"epoch": 0.07662062707934268,
"grad_norm": 0.29445808394590856,
"learning_rate": 5e-05,
"loss": 1.9116,
"step": 76
},
{
"epoch": 0.0776287932251235,
"grad_norm": 0.26730669596893064,
"learning_rate": 5e-05,
"loss": 1.938,
"step": 77
},
{
"epoch": 0.07863695937090433,
"grad_norm": 0.2541501927237772,
"learning_rate": 5e-05,
"loss": 1.9046,
"step": 78
},
{
"epoch": 0.07964512551668515,
"grad_norm": 0.2402658098277575,
"learning_rate": 5e-05,
"loss": 1.9239,
"step": 79
},
{
"epoch": 0.08065329166246597,
"grad_norm": 0.22676350053618127,
"learning_rate": 5e-05,
"loss": 1.8973,
"step": 80
},
{
"epoch": 0.0816614578082468,
"grad_norm": 0.21692458062593906,
"learning_rate": 5e-05,
"loss": 1.9027,
"step": 81
},
{
"epoch": 0.08266962395402762,
"grad_norm": 0.20753970513804731,
"learning_rate": 5e-05,
"loss": 1.8857,
"step": 82
},
{
"epoch": 0.08367779009980846,
"grad_norm": 0.19995124003410258,
"learning_rate": 5e-05,
"loss": 1.8915,
"step": 83
},
{
"epoch": 0.08468595624558928,
"grad_norm": 0.18993359513015598,
"learning_rate": 5e-05,
"loss": 1.897,
"step": 84
},
{
"epoch": 0.0856941223913701,
"grad_norm": 0.189296051381214,
"learning_rate": 5e-05,
"loss": 1.903,
"step": 85
},
{
"epoch": 0.08670228853715092,
"grad_norm": 0.1848921313288835,
"learning_rate": 5e-05,
"loss": 1.9063,
"step": 86
},
{
"epoch": 0.08771045468293175,
"grad_norm": 0.24718165656407112,
"learning_rate": 5e-05,
"loss": 1.8911,
"step": 87
},
{
"epoch": 0.08871862082871257,
"grad_norm": 0.17748622791573787,
"learning_rate": 5e-05,
"loss": 1.8771,
"step": 88
},
{
"epoch": 0.08972678697449339,
"grad_norm": 0.17315877460124424,
"learning_rate": 5e-05,
"loss": 1.8772,
"step": 89
},
{
"epoch": 0.09073495312027421,
"grad_norm": 0.16519480154282987,
"learning_rate": 5e-05,
"loss": 1.8926,
"step": 90
},
{
"epoch": 0.09174311926605505,
"grad_norm": 0.1646588806987109,
"learning_rate": 5e-05,
"loss": 1.8871,
"step": 91
},
{
"epoch": 0.09275128541183587,
"grad_norm": 0.16399339737997176,
"learning_rate": 5e-05,
"loss": 1.8774,
"step": 92
},
{
"epoch": 0.0937594515576167,
"grad_norm": 0.1669428196999267,
"learning_rate": 5e-05,
"loss": 1.8834,
"step": 93
},
{
"epoch": 0.09476761770339752,
"grad_norm": 1.0261114242764373,
"learning_rate": 5e-05,
"loss": 1.8693,
"step": 94
},
{
"epoch": 0.09577578384917834,
"grad_norm": 0.1719496356360831,
"learning_rate": 5e-05,
"loss": 1.886,
"step": 95
},
{
"epoch": 0.09678394999495916,
"grad_norm": 0.2790856954528175,
"learning_rate": 5e-05,
"loss": 1.8651,
"step": 96
},
{
"epoch": 0.09779211614073999,
"grad_norm": 0.16174313169788473,
"learning_rate": 5e-05,
"loss": 1.886,
"step": 97
},
{
"epoch": 0.09880028228652082,
"grad_norm": 0.15585221456592352,
"learning_rate": 5e-05,
"loss": 1.8844,
"step": 98
},
{
"epoch": 0.09980844843230165,
"grad_norm": 0.16364893420163687,
"learning_rate": 5e-05,
"loss": 1.8588,
"step": 99
},
{
"epoch": 0.10081661457808247,
"grad_norm": 0.15674565320396686,
"learning_rate": 5e-05,
"loss": 1.8484,
"step": 100
},
{
"epoch": 0.10182478072386329,
"grad_norm": 0.16912516357614416,
"learning_rate": 5e-05,
"loss": 1.8667,
"step": 101
},
{
"epoch": 0.10283294686964412,
"grad_norm": 0.17191340384387088,
"learning_rate": 5e-05,
"loss": 1.8787,
"step": 102
},
{
"epoch": 0.10384111301542494,
"grad_norm": 0.16516975348246232,
"learning_rate": 5e-05,
"loss": 1.8583,
"step": 103
},
{
"epoch": 0.10484927916120576,
"grad_norm": 1.9242450149861634,
"learning_rate": 5e-05,
"loss": 1.9162,
"step": 104
},
{
"epoch": 0.1058574453069866,
"grad_norm": 0.16888127293984725,
"learning_rate": 5e-05,
"loss": 1.8751,
"step": 105
},
{
"epoch": 0.10686561145276742,
"grad_norm": 0.15854633983815475,
"learning_rate": 5e-05,
"loss": 1.8563,
"step": 106
},
{
"epoch": 0.10787377759854824,
"grad_norm": 0.16888919013834458,
"learning_rate": 5e-05,
"loss": 1.8511,
"step": 107
},
{
"epoch": 0.10888194374432907,
"grad_norm": 0.15933324172053995,
"learning_rate": 5e-05,
"loss": 1.87,
"step": 108
},
{
"epoch": 0.10989010989010989,
"grad_norm": 0.16397568361062448,
"learning_rate": 5e-05,
"loss": 1.8442,
"step": 109
},
{
"epoch": 0.11089827603589071,
"grad_norm": 0.16652309668195364,
"learning_rate": 5e-05,
"loss": 1.8439,
"step": 110
},
{
"epoch": 0.11190644218167153,
"grad_norm": 0.15799876365905016,
"learning_rate": 5e-05,
"loss": 1.8642,
"step": 111
},
{
"epoch": 0.11291460832745237,
"grad_norm": 0.15119742114523702,
"learning_rate": 5e-05,
"loss": 1.8374,
"step": 112
},
{
"epoch": 0.1139227744732332,
"grad_norm": 0.17103775998292917,
"learning_rate": 5e-05,
"loss": 1.865,
"step": 113
},
{
"epoch": 0.11493094061901402,
"grad_norm": 0.1683642083482424,
"learning_rate": 5e-05,
"loss": 1.8518,
"step": 114
},
{
"epoch": 0.11593910676479484,
"grad_norm": 0.16671673058838882,
"learning_rate": 5e-05,
"loss": 1.8554,
"step": 115
},
{
"epoch": 0.11694727291057566,
"grad_norm": 0.14987402234479336,
"learning_rate": 5e-05,
"loss": 1.8631,
"step": 116
},
{
"epoch": 0.11795543905635648,
"grad_norm": 0.15527353891827372,
"learning_rate": 5e-05,
"loss": 1.8453,
"step": 117
},
{
"epoch": 0.1189636052021373,
"grad_norm": 0.1679615027458173,
"learning_rate": 5e-05,
"loss": 1.8625,
"step": 118
},
{
"epoch": 0.11997177134791814,
"grad_norm": 0.224122986925736,
"learning_rate": 5e-05,
"loss": 1.8657,
"step": 119
},
{
"epoch": 0.12097993749369897,
"grad_norm": 0.1562370758852964,
"learning_rate": 5e-05,
"loss": 1.8477,
"step": 120
},
{
"epoch": 0.12198810363947979,
"grad_norm": 0.18240341523398404,
"learning_rate": 5e-05,
"loss": 1.8515,
"step": 121
},
{
"epoch": 0.12299626978526061,
"grad_norm": 0.15759149835129863,
"learning_rate": 5e-05,
"loss": 1.8476,
"step": 122
},
{
"epoch": 0.12400443593104143,
"grad_norm": 0.16577117834023483,
"learning_rate": 5e-05,
"loss": 1.8364,
"step": 123
},
{
"epoch": 0.12501260207682227,
"grad_norm": 0.15114077783588442,
"learning_rate": 5e-05,
"loss": 1.8534,
"step": 124
},
{
"epoch": 0.12602076822260308,
"grad_norm": 0.17320420801320205,
"learning_rate": 5e-05,
"loss": 1.8533,
"step": 125
},
{
"epoch": 0.12702893436838392,
"grad_norm": 0.1426802208439398,
"learning_rate": 5e-05,
"loss": 1.8492,
"step": 126
},
{
"epoch": 0.12803710051416473,
"grad_norm": 0.1427745438880488,
"learning_rate": 5e-05,
"loss": 1.8515,
"step": 127
},
{
"epoch": 0.12904526665994556,
"grad_norm": 0.14985816706678418,
"learning_rate": 5e-05,
"loss": 1.8359,
"step": 128
},
{
"epoch": 0.13005343280572637,
"grad_norm": 0.14340309531623066,
"learning_rate": 5e-05,
"loss": 1.8503,
"step": 129
},
{
"epoch": 0.1310615989515072,
"grad_norm": 0.1435206282235358,
"learning_rate": 5e-05,
"loss": 1.8517,
"step": 130
},
{
"epoch": 0.13206976509728804,
"grad_norm": 0.14058170113730545,
"learning_rate": 5e-05,
"loss": 1.8429,
"step": 131
},
{
"epoch": 0.13307793124306885,
"grad_norm": 0.14306330526901823,
"learning_rate": 5e-05,
"loss": 1.8301,
"step": 132
},
{
"epoch": 0.1340860973888497,
"grad_norm": 0.14477090717659785,
"learning_rate": 5e-05,
"loss": 1.8601,
"step": 133
},
{
"epoch": 0.1350942635346305,
"grad_norm": 0.14319080650257207,
"learning_rate": 5e-05,
"loss": 1.8437,
"step": 134
},
{
"epoch": 0.13610242968041134,
"grad_norm": 0.14034122909510496,
"learning_rate": 5e-05,
"loss": 1.8259,
"step": 135
},
{
"epoch": 0.13711059582619214,
"grad_norm": 0.13808662342530606,
"learning_rate": 5e-05,
"loss": 1.8434,
"step": 136
},
{
"epoch": 0.13811876197197298,
"grad_norm": 0.15111039758004002,
"learning_rate": 5e-05,
"loss": 1.8507,
"step": 137
},
{
"epoch": 0.13912692811775382,
"grad_norm": 0.1365342831509653,
"learning_rate": 5e-05,
"loss": 1.8358,
"step": 138
},
{
"epoch": 0.14013509426353463,
"grad_norm": 0.13875807647032345,
"learning_rate": 5e-05,
"loss": 1.8397,
"step": 139
},
{
"epoch": 0.14114326040931546,
"grad_norm": 0.14253547281592827,
"learning_rate": 5e-05,
"loss": 1.8313,
"step": 140
},
{
"epoch": 0.14215142655509627,
"grad_norm": 0.14753781033459742,
"learning_rate": 5e-05,
"loss": 1.8487,
"step": 141
},
{
"epoch": 0.1431595927008771,
"grad_norm": 0.13160128302364027,
"learning_rate": 5e-05,
"loss": 1.8184,
"step": 142
},
{
"epoch": 0.14416775884665792,
"grad_norm": 0.14333676268082823,
"learning_rate": 5e-05,
"loss": 1.8261,
"step": 143
},
{
"epoch": 0.14517592499243875,
"grad_norm": 0.1430079366271181,
"learning_rate": 5e-05,
"loss": 1.8191,
"step": 144
},
{
"epoch": 0.1461840911382196,
"grad_norm": 0.14215288025785802,
"learning_rate": 5e-05,
"loss": 1.8243,
"step": 145
},
{
"epoch": 0.1471922572840004,
"grad_norm": 0.5239016479752525,
"learning_rate": 5e-05,
"loss": 1.8419,
"step": 146
},
{
"epoch": 0.14820042342978124,
"grad_norm": 0.14824799807909486,
"learning_rate": 5e-05,
"loss": 1.8268,
"step": 147
},
{
"epoch": 0.14920858957556205,
"grad_norm": 0.1430248626948606,
"learning_rate": 5e-05,
"loss": 1.8267,
"step": 148
},
{
"epoch": 0.15021675572134288,
"grad_norm": 0.14496122414220788,
"learning_rate": 5e-05,
"loss": 1.8427,
"step": 149
},
{
"epoch": 0.1512249218671237,
"grad_norm": 0.1473177263441529,
"learning_rate": 5e-05,
"loss": 1.8355,
"step": 150
},
{
"epoch": 0.15223308801290453,
"grad_norm": 0.14231490267928462,
"learning_rate": 5e-05,
"loss": 1.805,
"step": 151
},
{
"epoch": 0.15324125415868536,
"grad_norm": 0.1418245299909162,
"learning_rate": 5e-05,
"loss": 1.8183,
"step": 152
},
{
"epoch": 0.15424942030446617,
"grad_norm": 0.13753317806858326,
"learning_rate": 5e-05,
"loss": 1.8147,
"step": 153
},
{
"epoch": 0.155257586450247,
"grad_norm": 0.13914926431077226,
"learning_rate": 5e-05,
"loss": 1.8119,
"step": 154
},
{
"epoch": 0.15626575259602782,
"grad_norm": 0.1435782070136882,
"learning_rate": 5e-05,
"loss": 1.8165,
"step": 155
},
{
"epoch": 0.15727391874180865,
"grad_norm": 0.14305520914603162,
"learning_rate": 5e-05,
"loss": 1.7945,
"step": 156
},
{
"epoch": 0.15828208488758946,
"grad_norm": 0.15121415486642223,
"learning_rate": 5e-05,
"loss": 1.8184,
"step": 157
},
{
"epoch": 0.1592902510333703,
"grad_norm": 0.1455230658958776,
"learning_rate": 5e-05,
"loss": 1.8347,
"step": 158
},
{
"epoch": 0.16029841717915114,
"grad_norm": 0.38203744961546876,
"learning_rate": 5e-05,
"loss": 1.8339,
"step": 159
},
{
"epoch": 0.16130658332493195,
"grad_norm": 0.15216313167006948,
"learning_rate": 5e-05,
"loss": 1.8241,
"step": 160
},
{
"epoch": 0.16231474947071278,
"grad_norm": 0.14241802216869767,
"learning_rate": 5e-05,
"loss": 1.8183,
"step": 161
},
{
"epoch": 0.1633229156164936,
"grad_norm": 0.16701614066822063,
"learning_rate": 5e-05,
"loss": 1.8266,
"step": 162
},
{
"epoch": 0.16433108176227443,
"grad_norm": 0.14130437355543407,
"learning_rate": 5e-05,
"loss": 1.823,
"step": 163
},
{
"epoch": 0.16533924790805524,
"grad_norm": 0.14144400494428122,
"learning_rate": 5e-05,
"loss": 1.8102,
"step": 164
},
{
"epoch": 0.16634741405383607,
"grad_norm": 0.19337899524551297,
"learning_rate": 5e-05,
"loss": 1.8124,
"step": 165
},
{
"epoch": 0.1673555801996169,
"grad_norm": 0.14367033024459142,
"learning_rate": 5e-05,
"loss": 1.8215,
"step": 166
},
{
"epoch": 0.16836374634539772,
"grad_norm": 0.14832836884198922,
"learning_rate": 5e-05,
"loss": 1.8168,
"step": 167
},
{
"epoch": 0.16937191249117856,
"grad_norm": 0.1465143571682479,
"learning_rate": 5e-05,
"loss": 1.8256,
"step": 168
},
{
"epoch": 0.17038007863695936,
"grad_norm": 0.14542201739695612,
"learning_rate": 5e-05,
"loss": 1.801,
"step": 169
},
{
"epoch": 0.1713882447827402,
"grad_norm": 0.13892745937383433,
"learning_rate": 5e-05,
"loss": 1.8057,
"step": 170
},
{
"epoch": 0.172396410928521,
"grad_norm": 0.14274131608703794,
"learning_rate": 5e-05,
"loss": 1.8079,
"step": 171
},
{
"epoch": 0.17340457707430185,
"grad_norm": 0.1401424607261012,
"learning_rate": 5e-05,
"loss": 1.8055,
"step": 172
},
{
"epoch": 0.17441274322008268,
"grad_norm": 0.15769308735320056,
"learning_rate": 5e-05,
"loss": 1.7996,
"step": 173
},
{
"epoch": 0.1754209093658635,
"grad_norm": 0.14070813717782912,
"learning_rate": 5e-05,
"loss": 1.8039,
"step": 174
},
{
"epoch": 0.17642907551164433,
"grad_norm": 0.1578385284864902,
"learning_rate": 5e-05,
"loss": 1.8177,
"step": 175
},
{
"epoch": 0.17743724165742514,
"grad_norm": 0.15862029917030843,
"learning_rate": 5e-05,
"loss": 1.8211,
"step": 176
},
{
"epoch": 0.17844540780320597,
"grad_norm": 0.15757584940286892,
"learning_rate": 5e-05,
"loss": 1.8311,
"step": 177
},
{
"epoch": 0.17945357394898678,
"grad_norm": 0.15109490816160354,
"learning_rate": 5e-05,
"loss": 1.8189,
"step": 178
},
{
"epoch": 0.18046174009476762,
"grad_norm": 0.16394550016574783,
"learning_rate": 5e-05,
"loss": 1.8033,
"step": 179
},
{
"epoch": 0.18146990624054843,
"grad_norm": 0.15861740081769296,
"learning_rate": 5e-05,
"loss": 1.8019,
"step": 180
},
{
"epoch": 0.18247807238632927,
"grad_norm": 0.14363097977546618,
"learning_rate": 5e-05,
"loss": 1.8044,
"step": 181
},
{
"epoch": 0.1834862385321101,
"grad_norm": 0.15924066731744524,
"learning_rate": 5e-05,
"loss": 1.7934,
"step": 182
},
{
"epoch": 0.1844944046778909,
"grad_norm": 0.1409238970199551,
"learning_rate": 5e-05,
"loss": 1.8236,
"step": 183
},
{
"epoch": 0.18550257082367175,
"grad_norm": 0.14448492106367192,
"learning_rate": 5e-05,
"loss": 1.7832,
"step": 184
},
{
"epoch": 0.18651073696945256,
"grad_norm": 0.13579799309864551,
"learning_rate": 5e-05,
"loss": 1.8104,
"step": 185
},
{
"epoch": 0.1875189031152334,
"grad_norm": 0.1519137140895342,
"learning_rate": 5e-05,
"loss": 1.8095,
"step": 186
},
{
"epoch": 0.1885270692610142,
"grad_norm": 0.14284624647289126,
"learning_rate": 5e-05,
"loss": 1.8111,
"step": 187
},
{
"epoch": 0.18953523540679504,
"grad_norm": 0.13804963935004147,
"learning_rate": 5e-05,
"loss": 1.7897,
"step": 188
},
{
"epoch": 0.19054340155257588,
"grad_norm": 0.13890536326955857,
"learning_rate": 5e-05,
"loss": 1.7911,
"step": 189
},
{
"epoch": 0.19155156769835668,
"grad_norm": 0.1454820197216778,
"learning_rate": 5e-05,
"loss": 1.8,
"step": 190
},
{
"epoch": 0.19255973384413752,
"grad_norm": 0.14265790459239303,
"learning_rate": 5e-05,
"loss": 1.8011,
"step": 191
},
{
"epoch": 0.19356789998991833,
"grad_norm": 0.15184150116266387,
"learning_rate": 5e-05,
"loss": 1.8358,
"step": 192
},
{
"epoch": 0.19457606613569917,
"grad_norm": 0.13887936401292228,
"learning_rate": 5e-05,
"loss": 1.8029,
"step": 193
},
{
"epoch": 0.19558423228147997,
"grad_norm": 0.15123788408580607,
"learning_rate": 5e-05,
"loss": 1.7918,
"step": 194
},
{
"epoch": 0.1965923984272608,
"grad_norm": 0.4184962428270207,
"learning_rate": 5e-05,
"loss": 1.8033,
"step": 195
},
{
"epoch": 0.19760056457304165,
"grad_norm": 0.13421178586714527,
"learning_rate": 5e-05,
"loss": 1.8121,
"step": 196
},
{
"epoch": 0.19860873071882246,
"grad_norm": 0.14770772432026583,
"learning_rate": 5e-05,
"loss": 1.7971,
"step": 197
},
{
"epoch": 0.1996168968646033,
"grad_norm": 0.14383182989087254,
"learning_rate": 5e-05,
"loss": 1.8008,
"step": 198
},
{
"epoch": 0.2006250630103841,
"grad_norm": 0.1384392103492628,
"learning_rate": 5e-05,
"loss": 1.8115,
"step": 199
},
{
"epoch": 0.20163322915616494,
"grad_norm": 0.14726904874212857,
"learning_rate": 5e-05,
"loss": 1.8155,
"step": 200
},
{
"epoch": 0.20264139530194575,
"grad_norm": 0.1384824989118434,
"learning_rate": 5e-05,
"loss": 1.8092,
"step": 201
},
{
"epoch": 0.20364956144772658,
"grad_norm": 0.13775347706113975,
"learning_rate": 5e-05,
"loss": 1.793,
"step": 202
},
{
"epoch": 0.20465772759350742,
"grad_norm": 0.15774933663999155,
"learning_rate": 5e-05,
"loss": 1.806,
"step": 203
},
{
"epoch": 0.20566589373928823,
"grad_norm": 0.14301341548219315,
"learning_rate": 5e-05,
"loss": 1.8072,
"step": 204
},
{
"epoch": 0.20667405988506907,
"grad_norm": 0.169861283670675,
"learning_rate": 5e-05,
"loss": 1.7983,
"step": 205
},
{
"epoch": 0.20768222603084988,
"grad_norm": 0.15504709124693639,
"learning_rate": 5e-05,
"loss": 1.8048,
"step": 206
},
{
"epoch": 0.2086903921766307,
"grad_norm": 0.14436742876219333,
"learning_rate": 5e-05,
"loss": 1.7936,
"step": 207
},
{
"epoch": 0.20969855832241152,
"grad_norm": 1.2062437482583328,
"learning_rate": 5e-05,
"loss": 1.8324,
"step": 208
},
{
"epoch": 0.21070672446819236,
"grad_norm": 0.1649150229959874,
"learning_rate": 5e-05,
"loss": 1.7906,
"step": 209
},
{
"epoch": 0.2117148906139732,
"grad_norm": 0.1460932612643998,
"learning_rate": 5e-05,
"loss": 1.8009,
"step": 210
},
{
"epoch": 0.212723056759754,
"grad_norm": 0.14251437275063011,
"learning_rate": 5e-05,
"loss": 1.7927,
"step": 211
},
{
"epoch": 0.21373122290553484,
"grad_norm": 0.15037931584645792,
"learning_rate": 5e-05,
"loss": 1.7999,
"step": 212
},
{
"epoch": 0.21473938905131565,
"grad_norm": 0.1446855518280459,
"learning_rate": 5e-05,
"loss": 1.7884,
"step": 213
},
{
"epoch": 0.21574755519709649,
"grad_norm": 0.14747066125003186,
"learning_rate": 5e-05,
"loss": 1.7736,
"step": 214
},
{
"epoch": 0.2167557213428773,
"grad_norm": 0.14478801075472833,
"learning_rate": 5e-05,
"loss": 1.7958,
"step": 215
},
{
"epoch": 0.21776388748865813,
"grad_norm": 0.14675607719308872,
"learning_rate": 5e-05,
"loss": 1.784,
"step": 216
},
{
"epoch": 0.21877205363443897,
"grad_norm": 0.13905433311433602,
"learning_rate": 5e-05,
"loss": 1.781,
"step": 217
},
{
"epoch": 0.21978021978021978,
"grad_norm": 0.15427823427776496,
"learning_rate": 5e-05,
"loss": 1.7826,
"step": 218
},
{
"epoch": 0.2207883859260006,
"grad_norm": 0.13866445119988918,
"learning_rate": 5e-05,
"loss": 1.7772,
"step": 219
},
{
"epoch": 0.22179655207178142,
"grad_norm": 0.15729430183656884,
"learning_rate": 5e-05,
"loss": 1.8089,
"step": 220
},
{
"epoch": 0.22280471821756226,
"grad_norm": 0.1384346165139127,
"learning_rate": 5e-05,
"loss": 1.8027,
"step": 221
},
{
"epoch": 0.22381288436334307,
"grad_norm": 0.15170585753242535,
"learning_rate": 5e-05,
"loss": 1.793,
"step": 222
},
{
"epoch": 0.2248210505091239,
"grad_norm": 0.14604589286191189,
"learning_rate": 5e-05,
"loss": 1.7886,
"step": 223
},
{
"epoch": 0.22582921665490474,
"grad_norm": 0.15226581106025078,
"learning_rate": 5e-05,
"loss": 1.7866,
"step": 224
},
{
"epoch": 0.22683738280068555,
"grad_norm": 0.4148206066745425,
"learning_rate": 5e-05,
"loss": 1.8154,
"step": 225
},
{
"epoch": 0.2278455489464664,
"grad_norm": 0.154524327856525,
"learning_rate": 5e-05,
"loss": 1.7785,
"step": 226
},
{
"epoch": 0.2288537150922472,
"grad_norm": 0.14509919795107487,
"learning_rate": 5e-05,
"loss": 1.789,
"step": 227
},
{
"epoch": 0.22986188123802803,
"grad_norm": 0.14848451528795917,
"learning_rate": 5e-05,
"loss": 1.803,
"step": 228
},
{
"epoch": 0.23087004738380884,
"grad_norm": 0.1619137027086449,
"learning_rate": 5e-05,
"loss": 1.793,
"step": 229
},
{
"epoch": 0.23187821352958968,
"grad_norm": 0.15893201293250522,
"learning_rate": 5e-05,
"loss": 1.7825,
"step": 230
},
{
"epoch": 0.23288637967537051,
"grad_norm": 0.18006927954065935,
"learning_rate": 5e-05,
"loss": 1.7925,
"step": 231
},
{
"epoch": 0.23389454582115132,
"grad_norm": 0.21554580188759084,
"learning_rate": 5e-05,
"loss": 1.7699,
"step": 232
},
{
"epoch": 0.23490271196693216,
"grad_norm": 0.17549267236851238,
"learning_rate": 5e-05,
"loss": 1.7641,
"step": 233
},
{
"epoch": 0.23591087811271297,
"grad_norm": 0.15014379853354953,
"learning_rate": 5e-05,
"loss": 1.8095,
"step": 234
},
{
"epoch": 0.2369190442584938,
"grad_norm": 0.1445848774035969,
"learning_rate": 5e-05,
"loss": 1.7858,
"step": 235
},
{
"epoch": 0.2379272104042746,
"grad_norm": 0.1606246837780276,
"learning_rate": 5e-05,
"loss": 1.7918,
"step": 236
},
{
"epoch": 0.23893537655005545,
"grad_norm": 0.15316695958989793,
"learning_rate": 5e-05,
"loss": 1.7798,
"step": 237
},
{
"epoch": 0.2399435426958363,
"grad_norm": 0.16374005732964989,
"learning_rate": 5e-05,
"loss": 1.7959,
"step": 238
},
{
"epoch": 0.2409517088416171,
"grad_norm": 0.14355882787135935,
"learning_rate": 5e-05,
"loss": 1.7788,
"step": 239
},
{
"epoch": 0.24195987498739793,
"grad_norm": 0.14247097062180922,
"learning_rate": 5e-05,
"loss": 1.7731,
"step": 240
},
{
"epoch": 0.24296804113317874,
"grad_norm": 0.13912036094349547,
"learning_rate": 5e-05,
"loss": 1.7861,
"step": 241
},
{
"epoch": 0.24397620727895958,
"grad_norm": 0.16920052818228684,
"learning_rate": 5e-05,
"loss": 1.7764,
"step": 242
},
{
"epoch": 0.2449843734247404,
"grad_norm": 0.1455022483791817,
"learning_rate": 5e-05,
"loss": 1.783,
"step": 243
},
{
"epoch": 0.24599253957052122,
"grad_norm": 0.15542518289693488,
"learning_rate": 5e-05,
"loss": 1.8016,
"step": 244
},
{
"epoch": 0.24700070571630206,
"grad_norm": 0.15524505680074863,
"learning_rate": 5e-05,
"loss": 1.7909,
"step": 245
},
{
"epoch": 0.24800887186208287,
"grad_norm": 0.1516919173418789,
"learning_rate": 5e-05,
"loss": 1.7882,
"step": 246
},
{
"epoch": 0.2490170380078637,
"grad_norm": 0.1668727384239558,
"learning_rate": 5e-05,
"loss": 1.7863,
"step": 247
},
{
"epoch": 0.25002520415364454,
"grad_norm": 0.17266897431534142,
"learning_rate": 5e-05,
"loss": 1.7668,
"step": 248
},
{
"epoch": 0.25103337029942535,
"grad_norm": 0.1531271105909299,
"learning_rate": 5e-05,
"loss": 1.7835,
"step": 249
},
{
"epoch": 0.25204153644520616,
"grad_norm": 0.15085666027016767,
"learning_rate": 5e-05,
"loss": 1.8045,
"step": 250
},
{
"epoch": 0.25304970259098697,
"grad_norm": 0.15682683621010082,
"learning_rate": 5e-05,
"loss": 1.7693,
"step": 251
},
{
"epoch": 0.25405786873676783,
"grad_norm": 0.15144607833613863,
"learning_rate": 5e-05,
"loss": 1.7774,
"step": 252
},
{
"epoch": 0.25506603488254864,
"grad_norm": 0.14001585494695262,
"learning_rate": 5e-05,
"loss": 1.7901,
"step": 253
},
{
"epoch": 0.25607420102832945,
"grad_norm": 0.19707640367045973,
"learning_rate": 5e-05,
"loss": 1.7838,
"step": 254
},
{
"epoch": 0.2570823671741103,
"grad_norm": 0.12846973999730532,
"learning_rate": 5e-05,
"loss": 1.7763,
"step": 255
},
{
"epoch": 0.2580905333198911,
"grad_norm": 0.16735151403436016,
"learning_rate": 5e-05,
"loss": 1.7713,
"step": 256
},
{
"epoch": 0.25909869946567193,
"grad_norm": 0.14797938918139564,
"learning_rate": 5e-05,
"loss": 1.7742,
"step": 257
},
{
"epoch": 0.26010686561145274,
"grad_norm": 0.15168862915568712,
"learning_rate": 5e-05,
"loss": 1.7794,
"step": 258
},
{
"epoch": 0.2611150317572336,
"grad_norm": 0.16227121267503694,
"learning_rate": 5e-05,
"loss": 1.77,
"step": 259
},
{
"epoch": 0.2621231979030144,
"grad_norm": 0.14066196762298472,
"learning_rate": 5e-05,
"loss": 1.7882,
"step": 260
},
{
"epoch": 0.2631313640487952,
"grad_norm": 0.13869952828234983,
"learning_rate": 5e-05,
"loss": 1.782,
"step": 261
},
{
"epoch": 0.2641395301945761,
"grad_norm": 0.14487517640755312,
"learning_rate": 5e-05,
"loss": 1.779,
"step": 262
},
{
"epoch": 0.2651476963403569,
"grad_norm": 0.13417492252768634,
"learning_rate": 5e-05,
"loss": 1.771,
"step": 263
},
{
"epoch": 0.2661558624861377,
"grad_norm": 0.14848309582974037,
"learning_rate": 5e-05,
"loss": 1.7873,
"step": 264
},
{
"epoch": 0.2671640286319185,
"grad_norm": 0.14221777255093823,
"learning_rate": 5e-05,
"loss": 1.789,
"step": 265
},
{
"epoch": 0.2681721947776994,
"grad_norm": 0.13934641054898725,
"learning_rate": 5e-05,
"loss": 1.7813,
"step": 266
},
{
"epoch": 0.2691803609234802,
"grad_norm": 0.1470259671529886,
"learning_rate": 5e-05,
"loss": 1.7584,
"step": 267
},
{
"epoch": 0.270188527069261,
"grad_norm": 0.13951054708411406,
"learning_rate": 5e-05,
"loss": 1.7562,
"step": 268
},
{
"epoch": 0.27119669321504186,
"grad_norm": 0.14853290147291082,
"learning_rate": 5e-05,
"loss": 1.7704,
"step": 269
},
{
"epoch": 0.27220485936082267,
"grad_norm": 0.14895311614357434,
"learning_rate": 5e-05,
"loss": 1.7874,
"step": 270
},
{
"epoch": 0.2732130255066035,
"grad_norm": 0.13590434471079565,
"learning_rate": 5e-05,
"loss": 1.7796,
"step": 271
},
{
"epoch": 0.2742211916523843,
"grad_norm": 0.14848968529004114,
"learning_rate": 5e-05,
"loss": 1.7733,
"step": 272
},
{
"epoch": 0.27522935779816515,
"grad_norm": 0.14128642904473834,
"learning_rate": 5e-05,
"loss": 1.7761,
"step": 273
},
{
"epoch": 0.27623752394394596,
"grad_norm": 0.15714325279208918,
"learning_rate": 5e-05,
"loss": 1.7793,
"step": 274
},
{
"epoch": 0.27724569008972677,
"grad_norm": 0.13785114250535732,
"learning_rate": 5e-05,
"loss": 1.7724,
"step": 275
},
{
"epoch": 0.27825385623550764,
"grad_norm": 0.14777719343647677,
"learning_rate": 5e-05,
"loss": 1.7776,
"step": 276
},
{
"epoch": 0.27926202238128844,
"grad_norm": 0.15031370981427467,
"learning_rate": 5e-05,
"loss": 1.7702,
"step": 277
},
{
"epoch": 0.28027018852706925,
"grad_norm": 0.13880569847429872,
"learning_rate": 5e-05,
"loss": 1.7712,
"step": 278
},
{
"epoch": 0.28127835467285006,
"grad_norm": 0.14874303568984948,
"learning_rate": 5e-05,
"loss": 1.7668,
"step": 279
},
{
"epoch": 0.2822865208186309,
"grad_norm": 0.14816035016590623,
"learning_rate": 5e-05,
"loss": 1.7576,
"step": 280
},
{
"epoch": 0.28329468696441173,
"grad_norm": 0.14049002413491998,
"learning_rate": 5e-05,
"loss": 1.7693,
"step": 281
},
{
"epoch": 0.28430285311019254,
"grad_norm": 0.1510130790384099,
"learning_rate": 5e-05,
"loss": 1.7484,
"step": 282
},
{
"epoch": 0.2853110192559734,
"grad_norm": 0.13918487318352804,
"learning_rate": 5e-05,
"loss": 1.7642,
"step": 283
},
{
"epoch": 0.2863191854017542,
"grad_norm": 0.1474578079181453,
"learning_rate": 5e-05,
"loss": 1.7744,
"step": 284
},
{
"epoch": 0.287327351547535,
"grad_norm": 0.13820771228831047,
"learning_rate": 5e-05,
"loss": 1.7672,
"step": 285
},
{
"epoch": 0.28833551769331583,
"grad_norm": 0.14351030264796166,
"learning_rate": 5e-05,
"loss": 1.7582,
"step": 286
},
{
"epoch": 0.2893436838390967,
"grad_norm": 0.15670700153087316,
"learning_rate": 5e-05,
"loss": 1.7629,
"step": 287
},
{
"epoch": 0.2903518499848775,
"grad_norm": 0.1783938883378467,
"learning_rate": 5e-05,
"loss": 1.7827,
"step": 288
},
{
"epoch": 0.2913600161306583,
"grad_norm": 0.15333916211025533,
"learning_rate": 5e-05,
"loss": 1.7636,
"step": 289
},
{
"epoch": 0.2923681822764392,
"grad_norm": 0.13236694666147217,
"learning_rate": 5e-05,
"loss": 1.7682,
"step": 290
},
{
"epoch": 0.29337634842222,
"grad_norm": 0.13938856624825205,
"learning_rate": 5e-05,
"loss": 1.768,
"step": 291
},
{
"epoch": 0.2943845145680008,
"grad_norm": 0.1314885900094198,
"learning_rate": 5e-05,
"loss": 1.7932,
"step": 292
},
{
"epoch": 0.2953926807137816,
"grad_norm": 0.15583014570979986,
"learning_rate": 5e-05,
"loss": 1.758,
"step": 293
},
{
"epoch": 0.2964008468595625,
"grad_norm": 0.14067581924618947,
"learning_rate": 5e-05,
"loss": 1.7721,
"step": 294
},
{
"epoch": 0.2974090130053433,
"grad_norm": 0.15970423037745704,
"learning_rate": 5e-05,
"loss": 1.7731,
"step": 295
},
{
"epoch": 0.2984171791511241,
"grad_norm": 0.13752711701400588,
"learning_rate": 5e-05,
"loss": 1.7674,
"step": 296
},
{
"epoch": 0.29942534529690495,
"grad_norm": 0.1521446918249182,
"learning_rate": 5e-05,
"loss": 1.7721,
"step": 297
},
{
"epoch": 0.30043351144268576,
"grad_norm": 0.1475193365401531,
"learning_rate": 5e-05,
"loss": 1.7457,
"step": 298
},
{
"epoch": 0.30144167758846657,
"grad_norm": 0.16275272355252648,
"learning_rate": 5e-05,
"loss": 1.7625,
"step": 299
},
{
"epoch": 0.3024498437342474,
"grad_norm": 0.15182041598893675,
"learning_rate": 5e-05,
"loss": 1.7597,
"step": 300
},
{
"epoch": 0.30345800988002825,
"grad_norm": 0.14884368791325303,
"learning_rate": 5e-05,
"loss": 1.7472,
"step": 301
},
{
"epoch": 0.30446617602580905,
"grad_norm": 0.1670052966040775,
"learning_rate": 5e-05,
"loss": 1.765,
"step": 302
},
{
"epoch": 0.30547434217158986,
"grad_norm": 0.1463624528999074,
"learning_rate": 5e-05,
"loss": 1.786,
"step": 303
},
{
"epoch": 0.3064825083173707,
"grad_norm": 0.16637180035101126,
"learning_rate": 5e-05,
"loss": 1.7483,
"step": 304
},
{
"epoch": 0.30749067446315154,
"grad_norm": 0.14187795345800958,
"learning_rate": 5e-05,
"loss": 1.7453,
"step": 305
},
{
"epoch": 0.30849884060893235,
"grad_norm": 0.15031420779115814,
"learning_rate": 5e-05,
"loss": 1.7623,
"step": 306
},
{
"epoch": 0.30950700675471315,
"grad_norm": 0.14881086214174263,
"learning_rate": 5e-05,
"loss": 1.7562,
"step": 307
},
{
"epoch": 0.310515172900494,
"grad_norm": 0.13856355863719944,
"learning_rate": 5e-05,
"loss": 1.77,
"step": 308
},
{
"epoch": 0.3115233390462748,
"grad_norm": 0.154670739980791,
"learning_rate": 5e-05,
"loss": 1.7881,
"step": 309
},
{
"epoch": 0.31253150519205564,
"grad_norm": 0.12865970113488273,
"learning_rate": 5e-05,
"loss": 1.7808,
"step": 310
},
{
"epoch": 0.3135396713378365,
"grad_norm": 0.13399014677184057,
"learning_rate": 5e-05,
"loss": 1.7543,
"step": 311
},
{
"epoch": 0.3145478374836173,
"grad_norm": 0.14027126764064923,
"learning_rate": 5e-05,
"loss": 1.7619,
"step": 312
},
{
"epoch": 0.3155560036293981,
"grad_norm": 0.13197916406773966,
"learning_rate": 5e-05,
"loss": 1.7532,
"step": 313
},
{
"epoch": 0.3165641697751789,
"grad_norm": 0.13246622547168005,
"learning_rate": 5e-05,
"loss": 1.7724,
"step": 314
},
{
"epoch": 0.3175723359209598,
"grad_norm": 0.15951916864488466,
"learning_rate": 5e-05,
"loss": 1.766,
"step": 315
},
{
"epoch": 0.3185805020667406,
"grad_norm": 0.1348760333121085,
"learning_rate": 5e-05,
"loss": 1.7621,
"step": 316
},
{
"epoch": 0.3195886682125214,
"grad_norm": 0.1519377070479792,
"learning_rate": 5e-05,
"loss": 1.765,
"step": 317
},
{
"epoch": 0.3205968343583023,
"grad_norm": 0.14818327854333438,
"learning_rate": 5e-05,
"loss": 1.7636,
"step": 318
},
{
"epoch": 0.3216050005040831,
"grad_norm": 0.14522476140759424,
"learning_rate": 5e-05,
"loss": 1.7479,
"step": 319
},
{
"epoch": 0.3226131666498639,
"grad_norm": 0.14944731912062306,
"learning_rate": 5e-05,
"loss": 1.7668,
"step": 320
},
{
"epoch": 0.3236213327956447,
"grad_norm": 0.15818576872525297,
"learning_rate": 5e-05,
"loss": 1.7709,
"step": 321
},
{
"epoch": 0.32462949894142556,
"grad_norm": 0.13986279703460075,
"learning_rate": 5e-05,
"loss": 1.7724,
"step": 322
},
{
"epoch": 0.3256376650872064,
"grad_norm": 0.1488733535408475,
"learning_rate": 5e-05,
"loss": 1.748,
"step": 323
},
{
"epoch": 0.3266458312329872,
"grad_norm": 0.14710962194981353,
"learning_rate": 5e-05,
"loss": 1.7522,
"step": 324
},
{
"epoch": 0.32765399737876805,
"grad_norm": 0.15431100217374796,
"learning_rate": 5e-05,
"loss": 1.7528,
"step": 325
},
{
"epoch": 0.32866216352454886,
"grad_norm": 0.13109425651027415,
"learning_rate": 5e-05,
"loss": 1.781,
"step": 326
},
{
"epoch": 0.32967032967032966,
"grad_norm": 0.16057027582711905,
"learning_rate": 5e-05,
"loss": 1.7629,
"step": 327
},
{
"epoch": 0.3306784958161105,
"grad_norm": 0.1685848412347586,
"learning_rate": 5e-05,
"loss": 1.7537,
"step": 328
},
{
"epoch": 0.33168666196189134,
"grad_norm": 0.14191134015434137,
"learning_rate": 5e-05,
"loss": 1.7649,
"step": 329
},
{
"epoch": 0.33269482810767215,
"grad_norm": 0.14552531083316922,
"learning_rate": 5e-05,
"loss": 1.7454,
"step": 330
},
{
"epoch": 0.33370299425345296,
"grad_norm": 0.13436840358793842,
"learning_rate": 5e-05,
"loss": 1.7607,
"step": 331
},
{
"epoch": 0.3347111603992338,
"grad_norm": 0.1437049850393499,
"learning_rate": 5e-05,
"loss": 1.759,
"step": 332
},
{
"epoch": 0.33571932654501463,
"grad_norm": 0.1479474964632027,
"learning_rate": 5e-05,
"loss": 1.7513,
"step": 333
},
{
"epoch": 0.33672749269079544,
"grad_norm": 0.13190734076173918,
"learning_rate": 5e-05,
"loss": 1.7508,
"step": 334
},
{
"epoch": 0.33773565883657625,
"grad_norm": 0.1388043068065826,
"learning_rate": 5e-05,
"loss": 1.7516,
"step": 335
},
{
"epoch": 0.3387438249823571,
"grad_norm": 0.13435510062113534,
"learning_rate": 5e-05,
"loss": 1.7699,
"step": 336
},
{
"epoch": 0.3397519911281379,
"grad_norm": 0.14201154202574512,
"learning_rate": 5e-05,
"loss": 1.7474,
"step": 337
},
{
"epoch": 0.34076015727391873,
"grad_norm": 0.1295896147377835,
"learning_rate": 5e-05,
"loss": 1.7569,
"step": 338
},
{
"epoch": 0.3417683234196996,
"grad_norm": 0.14909601316736315,
"learning_rate": 5e-05,
"loss": 1.7646,
"step": 339
},
{
"epoch": 0.3427764895654804,
"grad_norm": 0.12623421831242762,
"learning_rate": 5e-05,
"loss": 1.7574,
"step": 340
},
{
"epoch": 0.3437846557112612,
"grad_norm": 0.1536075709940801,
"learning_rate": 5e-05,
"loss": 1.7522,
"step": 341
},
{
"epoch": 0.344792821857042,
"grad_norm": 0.13677884574284213,
"learning_rate": 5e-05,
"loss": 1.7663,
"step": 342
},
{
"epoch": 0.3458009880028229,
"grad_norm": 0.1452552313969236,
"learning_rate": 5e-05,
"loss": 1.7519,
"step": 343
},
{
"epoch": 0.3468091541486037,
"grad_norm": 0.13100920298589408,
"learning_rate": 5e-05,
"loss": 1.7351,
"step": 344
},
{
"epoch": 0.3478173202943845,
"grad_norm": 0.1529118284017465,
"learning_rate": 5e-05,
"loss": 1.7425,
"step": 345
},
{
"epoch": 0.34882548644016537,
"grad_norm": 0.1449911362454515,
"learning_rate": 5e-05,
"loss": 1.7642,
"step": 346
},
{
"epoch": 0.3498336525859462,
"grad_norm": 0.13800055137890146,
"learning_rate": 5e-05,
"loss": 1.7562,
"step": 347
},
{
"epoch": 0.350841818731727,
"grad_norm": 0.13597864974899437,
"learning_rate": 5e-05,
"loss": 1.7589,
"step": 348
},
{
"epoch": 0.3518499848775078,
"grad_norm": 0.1364345014185089,
"learning_rate": 5e-05,
"loss": 1.7617,
"step": 349
},
{
"epoch": 0.35285815102328866,
"grad_norm": 0.13446515013689875,
"learning_rate": 5e-05,
"loss": 1.7451,
"step": 350
},
{
"epoch": 0.35386631716906947,
"grad_norm": 0.13355960968251984,
"learning_rate": 5e-05,
"loss": 1.7484,
"step": 351
},
{
"epoch": 0.3548744833148503,
"grad_norm": 0.1421313311554251,
"learning_rate": 5e-05,
"loss": 1.7596,
"step": 352
},
{
"epoch": 0.35588264946063114,
"grad_norm": 0.14566126575906166,
"learning_rate": 5e-05,
"loss": 1.7312,
"step": 353
},
{
"epoch": 0.35689081560641195,
"grad_norm": 0.13035111170688612,
"learning_rate": 5e-05,
"loss": 1.7614,
"step": 354
},
{
"epoch": 0.35789898175219276,
"grad_norm": 0.13286517726961428,
"learning_rate": 5e-05,
"loss": 1.7472,
"step": 355
},
{
"epoch": 0.35890714789797357,
"grad_norm": 0.1372842264280202,
"learning_rate": 5e-05,
"loss": 1.7767,
"step": 356
},
{
"epoch": 0.35991531404375443,
"grad_norm": 0.14233910976123346,
"learning_rate": 5e-05,
"loss": 1.7592,
"step": 357
},
{
"epoch": 0.36092348018953524,
"grad_norm": 0.13669422347097734,
"learning_rate": 5e-05,
"loss": 1.7538,
"step": 358
},
{
"epoch": 0.36193164633531605,
"grad_norm": 0.15205769492979604,
"learning_rate": 5e-05,
"loss": 1.7576,
"step": 359
},
{
"epoch": 0.36293981248109686,
"grad_norm": 0.13378904395785118,
"learning_rate": 5e-05,
"loss": 1.7626,
"step": 360
},
{
"epoch": 0.3639479786268777,
"grad_norm": 0.3089538056627945,
"learning_rate": 5e-05,
"loss": 1.7493,
"step": 361
},
{
"epoch": 0.36495614477265853,
"grad_norm": 0.14212608488794012,
"learning_rate": 5e-05,
"loss": 1.743,
"step": 362
},
{
"epoch": 0.36596431091843934,
"grad_norm": 0.14052901555742894,
"learning_rate": 5e-05,
"loss": 1.7326,
"step": 363
},
{
"epoch": 0.3669724770642202,
"grad_norm": 0.1278029391584442,
"learning_rate": 5e-05,
"loss": 1.7438,
"step": 364
},
{
"epoch": 0.367980643210001,
"grad_norm": 0.14825199869813654,
"learning_rate": 5e-05,
"loss": 1.7622,
"step": 365
},
{
"epoch": 0.3689888093557818,
"grad_norm": 0.1463739970423256,
"learning_rate": 5e-05,
"loss": 1.7574,
"step": 366
},
{
"epoch": 0.36999697550156263,
"grad_norm": 0.33813722142883335,
"learning_rate": 5e-05,
"loss": 1.7545,
"step": 367
},
{
"epoch": 0.3710051416473435,
"grad_norm": 0.14054743655697394,
"learning_rate": 5e-05,
"loss": 1.7437,
"step": 368
},
{
"epoch": 0.3720133077931243,
"grad_norm": 0.1515425002596235,
"learning_rate": 5e-05,
"loss": 1.7612,
"step": 369
},
{
"epoch": 0.3730214739389051,
"grad_norm": 0.1389451504765318,
"learning_rate": 5e-05,
"loss": 1.7674,
"step": 370
},
{
"epoch": 0.374029640084686,
"grad_norm": 0.13701439256551207,
"learning_rate": 5e-05,
"loss": 1.7611,
"step": 371
},
{
"epoch": 0.3750378062304668,
"grad_norm": 0.14830435767997738,
"learning_rate": 5e-05,
"loss": 1.7508,
"step": 372
},
{
"epoch": 0.3760459723762476,
"grad_norm": 0.1302612126588965,
"learning_rate": 5e-05,
"loss": 1.7452,
"step": 373
},
{
"epoch": 0.3770541385220284,
"grad_norm": 0.15597780892702984,
"learning_rate": 5e-05,
"loss": 1.751,
"step": 374
},
{
"epoch": 0.37806230466780927,
"grad_norm": 0.14754988055242355,
"learning_rate": 5e-05,
"loss": 1.7384,
"step": 375
},
{
"epoch": 0.3790704708135901,
"grad_norm": 0.1314239062985928,
"learning_rate": 5e-05,
"loss": 1.7456,
"step": 376
},
{
"epoch": 0.3800786369593709,
"grad_norm": 0.14764554065106597,
"learning_rate": 5e-05,
"loss": 1.7498,
"step": 377
},
{
"epoch": 0.38108680310515175,
"grad_norm": 0.14732407564585398,
"learning_rate": 5e-05,
"loss": 1.7508,
"step": 378
},
{
"epoch": 0.38209496925093256,
"grad_norm": 0.12259986325733822,
"learning_rate": 5e-05,
"loss": 1.7387,
"step": 379
},
{
"epoch": 0.38310313539671337,
"grad_norm": 0.2808281947042048,
"learning_rate": 5e-05,
"loss": 1.7471,
"step": 380
},
{
"epoch": 0.3841113015424942,
"grad_norm": 0.1460713637825732,
"learning_rate": 5e-05,
"loss": 1.745,
"step": 381
},
{
"epoch": 0.38511946768827504,
"grad_norm": 0.14483673080907972,
"learning_rate": 5e-05,
"loss": 1.7376,
"step": 382
},
{
"epoch": 0.38612763383405585,
"grad_norm": 0.1378264320137719,
"learning_rate": 5e-05,
"loss": 1.7477,
"step": 383
},
{
"epoch": 0.38713579997983666,
"grad_norm": 0.15073976260859417,
"learning_rate": 5e-05,
"loss": 1.737,
"step": 384
},
{
"epoch": 0.3881439661256175,
"grad_norm": 0.16818259732146923,
"learning_rate": 5e-05,
"loss": 1.7322,
"step": 385
},
{
"epoch": 0.38915213227139833,
"grad_norm": 0.14184910393165215,
"learning_rate": 5e-05,
"loss": 1.7523,
"step": 386
},
{
"epoch": 0.39016029841717914,
"grad_norm": 0.1426597749017869,
"learning_rate": 5e-05,
"loss": 1.7265,
"step": 387
},
{
"epoch": 0.39116846456295995,
"grad_norm": 0.15016411676683258,
"learning_rate": 5e-05,
"loss": 1.7492,
"step": 388
},
{
"epoch": 0.3921766307087408,
"grad_norm": 0.1563835678153886,
"learning_rate": 5e-05,
"loss": 1.7418,
"step": 389
},
{
"epoch": 0.3931847968545216,
"grad_norm": 0.14052651716274392,
"learning_rate": 5e-05,
"loss": 1.7375,
"step": 390
},
{
"epoch": 0.39419296300030243,
"grad_norm": 0.15568693064666192,
"learning_rate": 5e-05,
"loss": 1.741,
"step": 391
},
{
"epoch": 0.3952011291460833,
"grad_norm": 0.1451434834091193,
"learning_rate": 5e-05,
"loss": 1.7385,
"step": 392
},
{
"epoch": 0.3962092952918641,
"grad_norm": 0.1544580671517428,
"learning_rate": 5e-05,
"loss": 1.7523,
"step": 393
},
{
"epoch": 0.3972174614376449,
"grad_norm": 0.1381900723590304,
"learning_rate": 5e-05,
"loss": 1.7372,
"step": 394
},
{
"epoch": 0.3982256275834257,
"grad_norm": 0.15968284750095027,
"learning_rate": 5e-05,
"loss": 1.7543,
"step": 395
},
{
"epoch": 0.3992337937292066,
"grad_norm": 0.14776907712047604,
"learning_rate": 5e-05,
"loss": 1.7394,
"step": 396
},
{
"epoch": 0.4002419598749874,
"grad_norm": 0.13501018733973855,
"learning_rate": 5e-05,
"loss": 1.7518,
"step": 397
},
{
"epoch": 0.4012501260207682,
"grad_norm": 0.16874153226114574,
"learning_rate": 5e-05,
"loss": 1.7176,
"step": 398
},
{
"epoch": 0.40225829216654907,
"grad_norm": 0.14973959531484612,
"learning_rate": 5e-05,
"loss": 1.7389,
"step": 399
},
{
"epoch": 0.4032664583123299,
"grad_norm": 0.1547127773297765,
"learning_rate": 5e-05,
"loss": 1.7472,
"step": 400
},
{
"epoch": 0.4042746244581107,
"grad_norm": 0.1460619660702291,
"learning_rate": 5e-05,
"loss": 1.7224,
"step": 401
},
{
"epoch": 0.4052827906038915,
"grad_norm": 0.1478494609390841,
"learning_rate": 5e-05,
"loss": 1.7697,
"step": 402
},
{
"epoch": 0.40629095674967236,
"grad_norm": 0.1398560289935615,
"learning_rate": 5e-05,
"loss": 1.7312,
"step": 403
},
{
"epoch": 0.40729912289545317,
"grad_norm": 0.13341445069107552,
"learning_rate": 5e-05,
"loss": 1.7499,
"step": 404
},
{
"epoch": 0.408307289041234,
"grad_norm": 0.138920341930684,
"learning_rate": 5e-05,
"loss": 1.7369,
"step": 405
},
{
"epoch": 0.40931545518701484,
"grad_norm": 0.1406715694146648,
"learning_rate": 5e-05,
"loss": 1.7555,
"step": 406
},
{
"epoch": 0.41032362133279565,
"grad_norm": 0.13978239406309753,
"learning_rate": 5e-05,
"loss": 1.7327,
"step": 407
},
{
"epoch": 0.41133178747857646,
"grad_norm": 0.138769156475813,
"learning_rate": 5e-05,
"loss": 1.7332,
"step": 408
},
{
"epoch": 0.41233995362435727,
"grad_norm": 0.13985747952648767,
"learning_rate": 5e-05,
"loss": 1.7225,
"step": 409
},
{
"epoch": 0.41334811977013813,
"grad_norm": 0.15206432665256198,
"learning_rate": 5e-05,
"loss": 1.7441,
"step": 410
},
{
"epoch": 0.41435628591591894,
"grad_norm": 0.13933462078819422,
"learning_rate": 5e-05,
"loss": 1.7413,
"step": 411
},
{
"epoch": 0.41536445206169975,
"grad_norm": 2.2257681112592564,
"learning_rate": 5e-05,
"loss": 1.7312,
"step": 412
},
{
"epoch": 0.4163726182074806,
"grad_norm": 0.1690367229814024,
"learning_rate": 5e-05,
"loss": 1.7225,
"step": 413
},
{
"epoch": 0.4173807843532614,
"grad_norm": 0.14142371640158033,
"learning_rate": 5e-05,
"loss": 1.7328,
"step": 414
},
{
"epoch": 0.41838895049904223,
"grad_norm": 0.1443718391541402,
"learning_rate": 5e-05,
"loss": 1.7357,
"step": 415
},
{
"epoch": 0.41939711664482304,
"grad_norm": 0.2948974879165241,
"learning_rate": 5e-05,
"loss": 1.7537,
"step": 416
},
{
"epoch": 0.4204052827906039,
"grad_norm": 0.1502743223563147,
"learning_rate": 5e-05,
"loss": 1.7405,
"step": 417
},
{
"epoch": 0.4214134489363847,
"grad_norm": 0.14613986823609737,
"learning_rate": 5e-05,
"loss": 1.7577,
"step": 418
},
{
"epoch": 0.4224216150821655,
"grad_norm": 0.13894542829285816,
"learning_rate": 5e-05,
"loss": 1.7496,
"step": 419
},
{
"epoch": 0.4234297812279464,
"grad_norm": 0.14043550419038592,
"learning_rate": 5e-05,
"loss": 1.7474,
"step": 420
},
{
"epoch": 0.4244379473737272,
"grad_norm": 0.13780554695644626,
"learning_rate": 5e-05,
"loss": 1.7456,
"step": 421
},
{
"epoch": 0.425446113519508,
"grad_norm": 0.14639228010470762,
"learning_rate": 5e-05,
"loss": 1.7354,
"step": 422
},
{
"epoch": 0.4264542796652888,
"grad_norm": 0.1520050370518254,
"learning_rate": 5e-05,
"loss": 1.7448,
"step": 423
},
{
"epoch": 0.4274624458110697,
"grad_norm": 0.13683884365296195,
"learning_rate": 5e-05,
"loss": 1.7166,
"step": 424
},
{
"epoch": 0.4284706119568505,
"grad_norm": 0.15001649638954673,
"learning_rate": 5e-05,
"loss": 1.7414,
"step": 425
},
{
"epoch": 0.4294787781026313,
"grad_norm": 0.14911716164608302,
"learning_rate": 5e-05,
"loss": 1.743,
"step": 426
},
{
"epoch": 0.43048694424841216,
"grad_norm": 0.14543327208617476,
"learning_rate": 5e-05,
"loss": 1.7389,
"step": 427
},
{
"epoch": 0.43149511039419297,
"grad_norm": 0.14680781392232245,
"learning_rate": 5e-05,
"loss": 1.7493,
"step": 428
},
{
"epoch": 0.4325032765399738,
"grad_norm": 0.14236841418985777,
"learning_rate": 5e-05,
"loss": 1.7366,
"step": 429
},
{
"epoch": 0.4335114426857546,
"grad_norm": 0.14485011489323618,
"learning_rate": 5e-05,
"loss": 1.7171,
"step": 430
},
{
"epoch": 0.43451960883153545,
"grad_norm": 0.13241397122387577,
"learning_rate": 5e-05,
"loss": 1.7482,
"step": 431
},
{
"epoch": 0.43552777497731626,
"grad_norm": 0.14853569815804848,
"learning_rate": 5e-05,
"loss": 1.7347,
"step": 432
},
{
"epoch": 0.43653594112309707,
"grad_norm": 0.13704108362000308,
"learning_rate": 5e-05,
"loss": 1.731,
"step": 433
},
{
"epoch": 0.43754410726887794,
"grad_norm": 0.13536978394353463,
"learning_rate": 5e-05,
"loss": 1.7324,
"step": 434
},
{
"epoch": 0.43855227341465874,
"grad_norm": 0.12979908901630022,
"learning_rate": 5e-05,
"loss": 1.7284,
"step": 435
},
{
"epoch": 0.43956043956043955,
"grad_norm": 0.1470395254667769,
"learning_rate": 5e-05,
"loss": 1.7364,
"step": 436
},
{
"epoch": 0.44056860570622036,
"grad_norm": 0.13171786834848215,
"learning_rate": 5e-05,
"loss": 1.7358,
"step": 437
},
{
"epoch": 0.4415767718520012,
"grad_norm": 0.12371696720195713,
"learning_rate": 5e-05,
"loss": 1.7304,
"step": 438
},
{
"epoch": 0.44258493799778204,
"grad_norm": 0.1396213895694086,
"learning_rate": 5e-05,
"loss": 1.7408,
"step": 439
},
{
"epoch": 0.44359310414356284,
"grad_norm": 0.14030549983014554,
"learning_rate": 5e-05,
"loss": 1.7274,
"step": 440
},
{
"epoch": 0.4446012702893437,
"grad_norm": 0.13146485100443162,
"learning_rate": 5e-05,
"loss": 1.7235,
"step": 441
},
{
"epoch": 0.4456094364351245,
"grad_norm": 0.14614864845330183,
"learning_rate": 5e-05,
"loss": 1.7316,
"step": 442
},
{
"epoch": 0.4466176025809053,
"grad_norm": 0.14869530126493308,
"learning_rate": 5e-05,
"loss": 1.7198,
"step": 443
},
{
"epoch": 0.44762576872668614,
"grad_norm": 0.14476843782545282,
"learning_rate": 5e-05,
"loss": 1.711,
"step": 444
},
{
"epoch": 0.448633934872467,
"grad_norm": 0.14384381110954358,
"learning_rate": 5e-05,
"loss": 1.735,
"step": 445
},
{
"epoch": 0.4496421010182478,
"grad_norm": 0.14075247779244654,
"learning_rate": 5e-05,
"loss": 1.7334,
"step": 446
},
{
"epoch": 0.4506502671640286,
"grad_norm": 0.13764214124217602,
"learning_rate": 5e-05,
"loss": 1.7288,
"step": 447
},
{
"epoch": 0.4516584333098095,
"grad_norm": 0.14818845579532672,
"learning_rate": 5e-05,
"loss": 1.7412,
"step": 448
},
{
"epoch": 0.4526665994555903,
"grad_norm": 0.15111207644428082,
"learning_rate": 5e-05,
"loss": 1.7358,
"step": 449
},
{
"epoch": 0.4536747656013711,
"grad_norm": 0.14168477020342377,
"learning_rate": 5e-05,
"loss": 1.7449,
"step": 450
},
{
"epoch": 0.4546829317471519,
"grad_norm": 0.15078242479582551,
"learning_rate": 5e-05,
"loss": 1.7211,
"step": 451
},
{
"epoch": 0.4556910978929328,
"grad_norm": 0.1385439837802104,
"learning_rate": 5e-05,
"loss": 1.739,
"step": 452
},
{
"epoch": 0.4566992640387136,
"grad_norm": 0.14591800685044598,
"learning_rate": 5e-05,
"loss": 1.725,
"step": 453
},
{
"epoch": 0.4577074301844944,
"grad_norm": 0.22199306850298647,
"learning_rate": 5e-05,
"loss": 1.7465,
"step": 454
},
{
"epoch": 0.45871559633027525,
"grad_norm": 0.15203439655834225,
"learning_rate": 5e-05,
"loss": 1.7268,
"step": 455
},
{
"epoch": 0.45972376247605606,
"grad_norm": 0.13798415129149033,
"learning_rate": 5e-05,
"loss": 1.7438,
"step": 456
},
{
"epoch": 0.4607319286218369,
"grad_norm": 0.13188851417331895,
"learning_rate": 5e-05,
"loss": 1.7236,
"step": 457
},
{
"epoch": 0.4617400947676177,
"grad_norm": 0.1334541786269807,
"learning_rate": 5e-05,
"loss": 1.7299,
"step": 458
},
{
"epoch": 0.46274826091339855,
"grad_norm": 0.1377200829622698,
"learning_rate": 5e-05,
"loss": 1.7099,
"step": 459
},
{
"epoch": 0.46375642705917935,
"grad_norm": 0.14092160238477452,
"learning_rate": 5e-05,
"loss": 1.7491,
"step": 460
},
{
"epoch": 0.46476459320496016,
"grad_norm": 0.12847342176262316,
"learning_rate": 5e-05,
"loss": 1.7287,
"step": 461
},
{
"epoch": 0.46577275935074103,
"grad_norm": 0.15559256378324404,
"learning_rate": 5e-05,
"loss": 1.7376,
"step": 462
},
{
"epoch": 0.46678092549652184,
"grad_norm": 0.12551572964931354,
"learning_rate": 5e-05,
"loss": 1.753,
"step": 463
},
{
"epoch": 0.46778909164230265,
"grad_norm": 0.1364430200172465,
"learning_rate": 5e-05,
"loss": 1.7317,
"step": 464
},
{
"epoch": 0.46879725778808345,
"grad_norm": 0.14685795918261627,
"learning_rate": 5e-05,
"loss": 1.739,
"step": 465
},
{
"epoch": 0.4698054239338643,
"grad_norm": 0.13843596201019037,
"learning_rate": 5e-05,
"loss": 1.7327,
"step": 466
},
{
"epoch": 0.47081359007964513,
"grad_norm": 0.14978074414352593,
"learning_rate": 5e-05,
"loss": 1.7122,
"step": 467
},
{
"epoch": 0.47182175622542594,
"grad_norm": 0.1343468710676364,
"learning_rate": 5e-05,
"loss": 1.717,
"step": 468
},
{
"epoch": 0.4728299223712068,
"grad_norm": 0.12935824246473474,
"learning_rate": 5e-05,
"loss": 1.7288,
"step": 469
},
{
"epoch": 0.4738380885169876,
"grad_norm": 0.13786240231667163,
"learning_rate": 5e-05,
"loss": 1.7252,
"step": 470
},
{
"epoch": 0.4748462546627684,
"grad_norm": 0.14555494126848026,
"learning_rate": 5e-05,
"loss": 1.7274,
"step": 471
},
{
"epoch": 0.4758544208085492,
"grad_norm": 0.14024877099742716,
"learning_rate": 5e-05,
"loss": 1.7278,
"step": 472
},
{
"epoch": 0.4768625869543301,
"grad_norm": 0.1409529955461895,
"learning_rate": 5e-05,
"loss": 1.7283,
"step": 473
},
{
"epoch": 0.4778707531001109,
"grad_norm": 0.13352675400260733,
"learning_rate": 5e-05,
"loss": 1.7334,
"step": 474
},
{
"epoch": 0.4788789192458917,
"grad_norm": 0.13271967028157422,
"learning_rate": 5e-05,
"loss": 1.73,
"step": 475
},
{
"epoch": 0.4798870853916726,
"grad_norm": 0.14269298630641805,
"learning_rate": 5e-05,
"loss": 1.7339,
"step": 476
},
{
"epoch": 0.4808952515374534,
"grad_norm": 0.13466898722756104,
"learning_rate": 5e-05,
"loss": 1.7362,
"step": 477
},
{
"epoch": 0.4819034176832342,
"grad_norm": 0.14709845539406763,
"learning_rate": 5e-05,
"loss": 1.7369,
"step": 478
},
{
"epoch": 0.482911583829015,
"grad_norm": 0.8605202681074835,
"learning_rate": 5e-05,
"loss": 1.7396,
"step": 479
},
{
"epoch": 0.48391974997479587,
"grad_norm": 0.1408289867387891,
"learning_rate": 5e-05,
"loss": 1.7486,
"step": 480
},
{
"epoch": 0.4849279161205767,
"grad_norm": 0.13874114391890766,
"learning_rate": 5e-05,
"loss": 1.7298,
"step": 481
},
{
"epoch": 0.4859360822663575,
"grad_norm": 0.13518417994710039,
"learning_rate": 5e-05,
"loss": 1.7425,
"step": 482
},
{
"epoch": 0.48694424841213835,
"grad_norm": 0.13536099460502093,
"learning_rate": 5e-05,
"loss": 1.7366,
"step": 483
},
{
"epoch": 0.48795241455791916,
"grad_norm": 0.14571584731434253,
"learning_rate": 5e-05,
"loss": 1.7179,
"step": 484
},
{
"epoch": 0.48896058070369997,
"grad_norm": 0.1532694552886724,
"learning_rate": 5e-05,
"loss": 1.7261,
"step": 485
},
{
"epoch": 0.4899687468494808,
"grad_norm": 0.1488328657164774,
"learning_rate": 5e-05,
"loss": 1.7295,
"step": 486
},
{
"epoch": 0.49097691299526164,
"grad_norm": 0.16219320768165912,
"learning_rate": 5e-05,
"loss": 1.7513,
"step": 487
},
{
"epoch": 0.49198507914104245,
"grad_norm": 0.1567614713551956,
"learning_rate": 5e-05,
"loss": 1.7395,
"step": 488
},
{
"epoch": 0.49299324528682326,
"grad_norm": 0.14386584627809132,
"learning_rate": 5e-05,
"loss": 1.7374,
"step": 489
},
{
"epoch": 0.4940014114326041,
"grad_norm": 0.1477080265489417,
"learning_rate": 5e-05,
"loss": 1.7299,
"step": 490
},
{
"epoch": 0.49500957757838493,
"grad_norm": 0.15832395917512415,
"learning_rate": 5e-05,
"loss": 1.7102,
"step": 491
},
{
"epoch": 0.49601774372416574,
"grad_norm": 0.14645390760965665,
"learning_rate": 5e-05,
"loss": 1.7303,
"step": 492
},
{
"epoch": 0.49702590986994655,
"grad_norm": 0.15695111138785844,
"learning_rate": 5e-05,
"loss": 1.7365,
"step": 493
},
{
"epoch": 0.4980340760157274,
"grad_norm": 0.15125086541771027,
"learning_rate": 5e-05,
"loss": 1.7329,
"step": 494
},
{
"epoch": 0.4990422421615082,
"grad_norm": 0.15968163890366974,
"learning_rate": 5e-05,
"loss": 1.7357,
"step": 495
},
{
"epoch": 0.5000504083072891,
"grad_norm": 0.16114977122007354,
"learning_rate": 5e-05,
"loss": 1.7425,
"step": 496
},
{
"epoch": 0.5010585744530699,
"grad_norm": 0.15366693494225167,
"learning_rate": 5e-05,
"loss": 1.7324,
"step": 497
},
{
"epoch": 0.5020667405988507,
"grad_norm": 0.1702223823593283,
"learning_rate": 5e-05,
"loss": 1.7431,
"step": 498
},
{
"epoch": 0.5030749067446315,
"grad_norm": 0.13707825885720634,
"learning_rate": 5e-05,
"loss": 1.727,
"step": 499
},
{
"epoch": 0.5040830728904123,
"grad_norm": 0.17011916766699697,
"learning_rate": 5e-05,
"loss": 1.7499,
"step": 500
},
{
"epoch": 0.5050912390361931,
"grad_norm": 0.13836910508259959,
"learning_rate": 5e-05,
"loss": 1.7163,
"step": 501
},
{
"epoch": 0.5060994051819739,
"grad_norm": 0.15250504212866517,
"learning_rate": 5e-05,
"loss": 1.7036,
"step": 502
},
{
"epoch": 0.5071075713277549,
"grad_norm": 0.14003029207685944,
"learning_rate": 5e-05,
"loss": 1.7243,
"step": 503
},
{
"epoch": 0.5081157374735357,
"grad_norm": 0.1402206682632946,
"learning_rate": 5e-05,
"loss": 1.7233,
"step": 504
},
{
"epoch": 0.5091239036193165,
"grad_norm": 0.1386346284282238,
"learning_rate": 5e-05,
"loss": 1.7264,
"step": 505
},
{
"epoch": 0.5101320697650973,
"grad_norm": 0.1501064183846461,
"learning_rate": 5e-05,
"loss": 1.703,
"step": 506
},
{
"epoch": 0.5111402359108781,
"grad_norm": 0.13375944136661508,
"learning_rate": 5e-05,
"loss": 1.7252,
"step": 507
},
{
"epoch": 0.5121484020566589,
"grad_norm": 0.1290192933245119,
"learning_rate": 5e-05,
"loss": 1.745,
"step": 508
},
{
"epoch": 0.5131565682024397,
"grad_norm": 0.13695047818203324,
"learning_rate": 5e-05,
"loss": 1.7053,
"step": 509
},
{
"epoch": 0.5141647343482206,
"grad_norm": 0.14082092939847418,
"learning_rate": 5e-05,
"loss": 1.7329,
"step": 510
},
{
"epoch": 0.5151729004940014,
"grad_norm": 0.13646757680134972,
"learning_rate": 5e-05,
"loss": 1.721,
"step": 511
},
{
"epoch": 0.5161810666397822,
"grad_norm": 0.13407540360293163,
"learning_rate": 5e-05,
"loss": 1.7281,
"step": 512
},
{
"epoch": 0.5171892327855631,
"grad_norm": 0.13098469909538138,
"learning_rate": 5e-05,
"loss": 1.7262,
"step": 513
},
{
"epoch": 0.5181973989313439,
"grad_norm": 0.14710547224222956,
"learning_rate": 5e-05,
"loss": 1.7274,
"step": 514
},
{
"epoch": 0.5192055650771247,
"grad_norm": 0.13328204864391038,
"learning_rate": 5e-05,
"loss": 1.7219,
"step": 515
},
{
"epoch": 0.5202137312229055,
"grad_norm": 0.13775879403810146,
"learning_rate": 5e-05,
"loss": 1.7106,
"step": 516
},
{
"epoch": 0.5212218973686864,
"grad_norm": 0.1361065570686086,
"learning_rate": 5e-05,
"loss": 1.7091,
"step": 517
},
{
"epoch": 0.5222300635144672,
"grad_norm": 0.14499957520034,
"learning_rate": 5e-05,
"loss": 1.7251,
"step": 518
},
{
"epoch": 0.523238229660248,
"grad_norm": 0.12979459429272397,
"learning_rate": 5e-05,
"loss": 1.725,
"step": 519
},
{
"epoch": 0.5242463958060288,
"grad_norm": 0.14007157320777774,
"learning_rate": 5e-05,
"loss": 1.7235,
"step": 520
},
{
"epoch": 0.5252545619518096,
"grad_norm": 0.14159905194394676,
"learning_rate": 5e-05,
"loss": 1.7328,
"step": 521
},
{
"epoch": 0.5262627280975904,
"grad_norm": 0.13713095256059554,
"learning_rate": 5e-05,
"loss": 1.7318,
"step": 522
},
{
"epoch": 0.5272708942433713,
"grad_norm": 0.13585308519517678,
"learning_rate": 5e-05,
"loss": 1.7107,
"step": 523
},
{
"epoch": 0.5282790603891522,
"grad_norm": 0.13652232276260395,
"learning_rate": 5e-05,
"loss": 1.7026,
"step": 524
},
{
"epoch": 0.529287226534933,
"grad_norm": 0.14269519168803854,
"learning_rate": 5e-05,
"loss": 1.7168,
"step": 525
},
{
"epoch": 0.5302953926807138,
"grad_norm": 0.1546096414052513,
"learning_rate": 5e-05,
"loss": 1.7075,
"step": 526
},
{
"epoch": 0.5313035588264946,
"grad_norm": 0.1451610095802245,
"learning_rate": 5e-05,
"loss": 1.7191,
"step": 527
},
{
"epoch": 0.5323117249722754,
"grad_norm": 0.14790875937903392,
"learning_rate": 5e-05,
"loss": 1.731,
"step": 528
},
{
"epoch": 0.5333198911180562,
"grad_norm": 0.13845540973359793,
"learning_rate": 5e-05,
"loss": 1.712,
"step": 529
},
{
"epoch": 0.534328057263837,
"grad_norm": 0.14603408754761985,
"learning_rate": 5e-05,
"loss": 1.716,
"step": 530
},
{
"epoch": 0.535336223409618,
"grad_norm": 0.14745894068649418,
"learning_rate": 5e-05,
"loss": 1.7208,
"step": 531
},
{
"epoch": 0.5363443895553988,
"grad_norm": 0.15007467357406096,
"learning_rate": 5e-05,
"loss": 1.7197,
"step": 532
},
{
"epoch": 0.5373525557011796,
"grad_norm": 0.1371057758498084,
"learning_rate": 5e-05,
"loss": 1.7045,
"step": 533
},
{
"epoch": 0.5383607218469604,
"grad_norm": 0.1359949109326032,
"learning_rate": 5e-05,
"loss": 1.728,
"step": 534
},
{
"epoch": 0.5393688879927412,
"grad_norm": 0.13925029366834532,
"learning_rate": 5e-05,
"loss": 1.7302,
"step": 535
},
{
"epoch": 0.540377054138522,
"grad_norm": 0.13327284315989982,
"learning_rate": 5e-05,
"loss": 1.7336,
"step": 536
},
{
"epoch": 0.5413852202843028,
"grad_norm": 0.14631809213996722,
"learning_rate": 5e-05,
"loss": 1.7125,
"step": 537
},
{
"epoch": 0.5423933864300837,
"grad_norm": 0.15397077312479548,
"learning_rate": 5e-05,
"loss": 1.7171,
"step": 538
},
{
"epoch": 0.5434015525758645,
"grad_norm": 0.13392329130753536,
"learning_rate": 5e-05,
"loss": 1.7127,
"step": 539
},
{
"epoch": 0.5444097187216453,
"grad_norm": 0.13337174252821263,
"learning_rate": 5e-05,
"loss": 1.7155,
"step": 540
},
{
"epoch": 0.5454178848674262,
"grad_norm": 0.13116751901128795,
"learning_rate": 5e-05,
"loss": 1.7024,
"step": 541
},
{
"epoch": 0.546426051013207,
"grad_norm": 0.1428951010708818,
"learning_rate": 5e-05,
"loss": 1.7333,
"step": 542
},
{
"epoch": 0.5474342171589878,
"grad_norm": 0.1290258219479217,
"learning_rate": 5e-05,
"loss": 1.7159,
"step": 543
},
{
"epoch": 0.5484423833047686,
"grad_norm": 0.1253949130299628,
"learning_rate": 5e-05,
"loss": 1.7108,
"step": 544
},
{
"epoch": 0.5494505494505495,
"grad_norm": 0.13389518594876162,
"learning_rate": 5e-05,
"loss": 1.7207,
"step": 545
},
{
"epoch": 0.5504587155963303,
"grad_norm": 0.12847899268177035,
"learning_rate": 5e-05,
"loss": 1.7199,
"step": 546
},
{
"epoch": 0.5514668817421111,
"grad_norm": 0.1349692531199703,
"learning_rate": 5e-05,
"loss": 1.7218,
"step": 547
},
{
"epoch": 0.5524750478878919,
"grad_norm": 0.13660970361600422,
"learning_rate": 5e-05,
"loss": 1.7012,
"step": 548
},
{
"epoch": 0.5534832140336727,
"grad_norm": 0.1289091568097243,
"learning_rate": 5e-05,
"loss": 1.7101,
"step": 549
},
{
"epoch": 0.5544913801794535,
"grad_norm": 0.1267210192380341,
"learning_rate": 5e-05,
"loss": 1.748,
"step": 550
},
{
"epoch": 0.5554995463252344,
"grad_norm": 0.5082915304718543,
"learning_rate": 5e-05,
"loss": 1.7317,
"step": 551
},
{
"epoch": 0.5565077124710153,
"grad_norm": 0.13703430635922975,
"learning_rate": 5e-05,
"loss": 1.7122,
"step": 552
},
{
"epoch": 0.5575158786167961,
"grad_norm": 0.14737679979743612,
"learning_rate": 5e-05,
"loss": 1.7129,
"step": 553
},
{
"epoch": 0.5585240447625769,
"grad_norm": 0.13721196520311135,
"learning_rate": 5e-05,
"loss": 1.7061,
"step": 554
},
{
"epoch": 0.5595322109083577,
"grad_norm": 0.13752293899101511,
"learning_rate": 5e-05,
"loss": 1.7129,
"step": 555
},
{
"epoch": 0.5605403770541385,
"grad_norm": 0.14405745219156493,
"learning_rate": 5e-05,
"loss": 1.732,
"step": 556
},
{
"epoch": 0.5615485431999193,
"grad_norm": 0.14145457824757338,
"learning_rate": 5e-05,
"loss": 1.7129,
"step": 557
},
{
"epoch": 0.5625567093457001,
"grad_norm": 0.2815416558938452,
"learning_rate": 5e-05,
"loss": 1.7132,
"step": 558
},
{
"epoch": 0.563564875491481,
"grad_norm": 0.1348175656894975,
"learning_rate": 5e-05,
"loss": 1.7062,
"step": 559
},
{
"epoch": 0.5645730416372619,
"grad_norm": 0.1404056458141734,
"learning_rate": 5e-05,
"loss": 1.7041,
"step": 560
},
{
"epoch": 0.5655812077830427,
"grad_norm": 0.13557060086657308,
"learning_rate": 5e-05,
"loss": 1.6929,
"step": 561
},
{
"epoch": 0.5665893739288235,
"grad_norm": 0.1814667125480638,
"learning_rate": 5e-05,
"loss": 1.7188,
"step": 562
},
{
"epoch": 0.5675975400746043,
"grad_norm": 0.1495980796266418,
"learning_rate": 5e-05,
"loss": 1.7131,
"step": 563
},
{
"epoch": 0.5686057062203851,
"grad_norm": 0.1426252612271266,
"learning_rate": 5e-05,
"loss": 1.7111,
"step": 564
},
{
"epoch": 0.5696138723661659,
"grad_norm": 0.14109547917090415,
"learning_rate": 5e-05,
"loss": 1.7157,
"step": 565
},
{
"epoch": 0.5706220385119468,
"grad_norm": 1.1846012501687457,
"learning_rate": 5e-05,
"loss": 1.7389,
"step": 566
},
{
"epoch": 0.5716302046577276,
"grad_norm": 0.13348962336896375,
"learning_rate": 5e-05,
"loss": 1.7395,
"step": 567
},
{
"epoch": 0.5726383708035084,
"grad_norm": 0.14031380805805546,
"learning_rate": 5e-05,
"loss": 1.712,
"step": 568
},
{
"epoch": 0.5736465369492892,
"grad_norm": 0.1310314007639987,
"learning_rate": 5e-05,
"loss": 1.693,
"step": 569
},
{
"epoch": 0.57465470309507,
"grad_norm": 0.13576282604410486,
"learning_rate": 5e-05,
"loss": 1.7147,
"step": 570
},
{
"epoch": 0.5756628692408509,
"grad_norm": 0.13663564305484593,
"learning_rate": 5e-05,
"loss": 1.7218,
"step": 571
},
{
"epoch": 0.5766710353866317,
"grad_norm": 0.13310876341309824,
"learning_rate": 5e-05,
"loss": 1.7115,
"step": 572
},
{
"epoch": 0.5776792015324126,
"grad_norm": 0.13371585466789873,
"learning_rate": 5e-05,
"loss": 1.6953,
"step": 573
},
{
"epoch": 0.5786873676781934,
"grad_norm": 0.1325972537789156,
"learning_rate": 5e-05,
"loss": 1.7171,
"step": 574
},
{
"epoch": 0.5796955338239742,
"grad_norm": 0.1399994172395994,
"learning_rate": 5e-05,
"loss": 1.7164,
"step": 575
},
{
"epoch": 0.580703699969755,
"grad_norm": 0.13203487633668445,
"learning_rate": 5e-05,
"loss": 1.7071,
"step": 576
},
{
"epoch": 0.5817118661155358,
"grad_norm": 0.14386726865520158,
"learning_rate": 5e-05,
"loss": 1.7,
"step": 577
},
{
"epoch": 0.5827200322613166,
"grad_norm": 0.13018151439861664,
"learning_rate": 5e-05,
"loss": 1.729,
"step": 578
},
{
"epoch": 0.5837281984070974,
"grad_norm": 0.1447246872750649,
"learning_rate": 5e-05,
"loss": 1.6932,
"step": 579
},
{
"epoch": 0.5847363645528784,
"grad_norm": 0.16595954582029865,
"learning_rate": 5e-05,
"loss": 1.6853,
"step": 580
},
{
"epoch": 0.5857445306986592,
"grad_norm": 0.13139675694006203,
"learning_rate": 5e-05,
"loss": 1.6956,
"step": 581
},
{
"epoch": 0.58675269684444,
"grad_norm": 0.1296186802645693,
"learning_rate": 5e-05,
"loss": 1.704,
"step": 582
},
{
"epoch": 0.5877608629902208,
"grad_norm": 0.1417163740036467,
"learning_rate": 5e-05,
"loss": 1.7366,
"step": 583
},
{
"epoch": 0.5887690291360016,
"grad_norm": 0.1393769542097642,
"learning_rate": 5e-05,
"loss": 1.7082,
"step": 584
},
{
"epoch": 0.5897771952817824,
"grad_norm": 0.14036329264215688,
"learning_rate": 5e-05,
"loss": 1.7113,
"step": 585
},
{
"epoch": 0.5907853614275632,
"grad_norm": 0.1301775902839359,
"learning_rate": 5e-05,
"loss": 1.7102,
"step": 586
},
{
"epoch": 0.5917935275733441,
"grad_norm": 0.1390680768441475,
"learning_rate": 5e-05,
"loss": 1.7178,
"step": 587
},
{
"epoch": 0.592801693719125,
"grad_norm": 1.234403190380023,
"learning_rate": 5e-05,
"loss": 1.7226,
"step": 588
},
{
"epoch": 0.5938098598649058,
"grad_norm": 0.13833101198973996,
"learning_rate": 5e-05,
"loss": 1.7067,
"step": 589
},
{
"epoch": 0.5948180260106866,
"grad_norm": 0.14494669886852304,
"learning_rate": 5e-05,
"loss": 1.7266,
"step": 590
},
{
"epoch": 0.5958261921564674,
"grad_norm": 0.14362347468568545,
"learning_rate": 5e-05,
"loss": 1.7143,
"step": 591
},
{
"epoch": 0.5968343583022482,
"grad_norm": 0.1461984401259311,
"learning_rate": 5e-05,
"loss": 1.712,
"step": 592
},
{
"epoch": 0.597842524448029,
"grad_norm": 0.1739881660017132,
"learning_rate": 5e-05,
"loss": 1.7098,
"step": 593
},
{
"epoch": 0.5988506905938099,
"grad_norm": 0.18425300681672835,
"learning_rate": 5e-05,
"loss": 1.7064,
"step": 594
},
{
"epoch": 0.5998588567395907,
"grad_norm": 0.19078389638041413,
"learning_rate": 5e-05,
"loss": 1.7178,
"step": 595
},
{
"epoch": 0.6008670228853715,
"grad_norm": 0.1873949916072263,
"learning_rate": 5e-05,
"loss": 1.6959,
"step": 596
},
{
"epoch": 0.6018751890311523,
"grad_norm": 0.17344176857191784,
"learning_rate": 5e-05,
"loss": 1.7149,
"step": 597
},
{
"epoch": 0.6028833551769331,
"grad_norm": 0.17090514987747174,
"learning_rate": 5e-05,
"loss": 1.7267,
"step": 598
},
{
"epoch": 0.603891521322714,
"grad_norm": 0.1487087124109567,
"learning_rate": 5e-05,
"loss": 1.6872,
"step": 599
},
{
"epoch": 0.6048996874684948,
"grad_norm": 0.2528381347556544,
"learning_rate": 5e-05,
"loss": 1.7156,
"step": 600
},
{
"epoch": 0.6059078536142757,
"grad_norm": 0.15234832402505802,
"learning_rate": 5e-05,
"loss": 1.7009,
"step": 601
},
{
"epoch": 0.6069160197600565,
"grad_norm": 0.14617071918285268,
"learning_rate": 5e-05,
"loss": 1.7271,
"step": 602
},
{
"epoch": 0.6079241859058373,
"grad_norm": 0.1464277003567868,
"learning_rate": 5e-05,
"loss": 1.7059,
"step": 603
},
{
"epoch": 0.6089323520516181,
"grad_norm": 0.1364383676186627,
"learning_rate": 5e-05,
"loss": 1.699,
"step": 604
},
{
"epoch": 0.6099405181973989,
"grad_norm": 0.13348764051783354,
"learning_rate": 5e-05,
"loss": 1.6955,
"step": 605
},
{
"epoch": 0.6109486843431797,
"grad_norm": 0.14467069260355409,
"learning_rate": 5e-05,
"loss": 1.7234,
"step": 606
},
{
"epoch": 0.6119568504889605,
"grad_norm": 0.14430553492028114,
"learning_rate": 5e-05,
"loss": 1.7044,
"step": 607
},
{
"epoch": 0.6129650166347415,
"grad_norm": 0.13383413430124796,
"learning_rate": 5e-05,
"loss": 1.7153,
"step": 608
},
{
"epoch": 0.6139731827805223,
"grad_norm": 0.35332289652423776,
"learning_rate": 5e-05,
"loss": 1.7271,
"step": 609
},
{
"epoch": 0.6149813489263031,
"grad_norm": 0.1347172057729184,
"learning_rate": 5e-05,
"loss": 1.7054,
"step": 610
},
{
"epoch": 0.6159895150720839,
"grad_norm": 0.14344071966912086,
"learning_rate": 5e-05,
"loss": 1.7372,
"step": 611
},
{
"epoch": 0.6169976812178647,
"grad_norm": 0.13636203998882243,
"learning_rate": 5e-05,
"loss": 1.7011,
"step": 612
},
{
"epoch": 0.6180058473636455,
"grad_norm": 0.1330854184385751,
"learning_rate": 5e-05,
"loss": 1.6987,
"step": 613
},
{
"epoch": 0.6190140135094263,
"grad_norm": 0.13300020535045284,
"learning_rate": 5e-05,
"loss": 1.705,
"step": 614
},
{
"epoch": 0.6200221796552072,
"grad_norm": 0.13095614639583578,
"learning_rate": 5e-05,
"loss": 1.7164,
"step": 615
},
{
"epoch": 0.621030345800988,
"grad_norm": 0.13408249920884896,
"learning_rate": 5e-05,
"loss": 1.7114,
"step": 616
},
{
"epoch": 0.6220385119467688,
"grad_norm": 0.13287977802166426,
"learning_rate": 5e-05,
"loss": 1.7052,
"step": 617
},
{
"epoch": 0.6230466780925497,
"grad_norm": 0.13438176270298807,
"learning_rate": 5e-05,
"loss": 1.6987,
"step": 618
},
{
"epoch": 0.6240548442383305,
"grad_norm": 0.14213719214134174,
"learning_rate": 5e-05,
"loss": 1.7098,
"step": 619
},
{
"epoch": 0.6250630103841113,
"grad_norm": 0.14563144439803705,
"learning_rate": 5e-05,
"loss": 1.7003,
"step": 620
},
{
"epoch": 0.6260711765298921,
"grad_norm": 0.13777788141797628,
"learning_rate": 5e-05,
"loss": 1.6947,
"step": 621
},
{
"epoch": 0.627079342675673,
"grad_norm": 0.14480162592645623,
"learning_rate": 5e-05,
"loss": 1.7073,
"step": 622
},
{
"epoch": 0.6280875088214538,
"grad_norm": 0.1580924816537373,
"learning_rate": 5e-05,
"loss": 1.7055,
"step": 623
},
{
"epoch": 0.6290956749672346,
"grad_norm": 0.14248189883225654,
"learning_rate": 5e-05,
"loss": 1.6967,
"step": 624
},
{
"epoch": 0.6301038411130154,
"grad_norm": 0.1460039384954323,
"learning_rate": 5e-05,
"loss": 1.7049,
"step": 625
},
{
"epoch": 0.6311120072587962,
"grad_norm": 0.14093391631694235,
"learning_rate": 5e-05,
"loss": 1.7084,
"step": 626
},
{
"epoch": 0.632120173404577,
"grad_norm": 0.13217177954523113,
"learning_rate": 5e-05,
"loss": 1.7006,
"step": 627
},
{
"epoch": 0.6331283395503579,
"grad_norm": 0.14346490701814205,
"learning_rate": 5e-05,
"loss": 1.706,
"step": 628
},
{
"epoch": 0.6341365056961388,
"grad_norm": 0.15624821587275614,
"learning_rate": 5e-05,
"loss": 1.7058,
"step": 629
},
{
"epoch": 0.6351446718419196,
"grad_norm": 0.13155447156335595,
"learning_rate": 5e-05,
"loss": 1.7206,
"step": 630
},
{
"epoch": 0.6361528379877004,
"grad_norm": 1.0827383527162002,
"learning_rate": 5e-05,
"loss": 1.7073,
"step": 631
},
{
"epoch": 0.6371610041334812,
"grad_norm": 0.1374997471183784,
"learning_rate": 5e-05,
"loss": 1.7009,
"step": 632
},
{
"epoch": 0.638169170279262,
"grad_norm": 0.3334907532113421,
"learning_rate": 5e-05,
"loss": 1.7053,
"step": 633
},
{
"epoch": 0.6391773364250428,
"grad_norm": 0.13738631554336547,
"learning_rate": 5e-05,
"loss": 1.6925,
"step": 634
},
{
"epoch": 0.6401855025708236,
"grad_norm": 0.14421445661445198,
"learning_rate": 5e-05,
"loss": 1.684,
"step": 635
},
{
"epoch": 0.6411936687166045,
"grad_norm": 0.18704625439878989,
"learning_rate": 5e-05,
"loss": 1.7159,
"step": 636
},
{
"epoch": 0.6422018348623854,
"grad_norm": 0.15492677537903046,
"learning_rate": 5e-05,
"loss": 1.7143,
"step": 637
},
{
"epoch": 0.6432100010081662,
"grad_norm": 0.14329772207472202,
"learning_rate": 5e-05,
"loss": 1.6936,
"step": 638
},
{
"epoch": 0.644218167153947,
"grad_norm": 0.14506569626786983,
"learning_rate": 5e-05,
"loss": 1.7204,
"step": 639
},
{
"epoch": 0.6452263332997278,
"grad_norm": 0.14492261935555906,
"learning_rate": 5e-05,
"loss": 1.7256,
"step": 640
},
{
"epoch": 0.6462344994455086,
"grad_norm": 0.1444154532219128,
"learning_rate": 5e-05,
"loss": 1.7007,
"step": 641
},
{
"epoch": 0.6472426655912894,
"grad_norm": 0.1564687421107206,
"learning_rate": 5e-05,
"loss": 1.7277,
"step": 642
},
{
"epoch": 0.6482508317370703,
"grad_norm": 0.13974139251562445,
"learning_rate": 5e-05,
"loss": 1.7195,
"step": 643
},
{
"epoch": 0.6492589978828511,
"grad_norm": 0.15722678783316818,
"learning_rate": 5e-05,
"loss": 1.7142,
"step": 644
},
{
"epoch": 0.6502671640286319,
"grad_norm": 0.13965193592073247,
"learning_rate": 5e-05,
"loss": 1.7061,
"step": 645
},
{
"epoch": 0.6512753301744127,
"grad_norm": 0.16062017984617108,
"learning_rate": 5e-05,
"loss": 1.699,
"step": 646
},
{
"epoch": 0.6522834963201936,
"grad_norm": 0.13834940905057014,
"learning_rate": 5e-05,
"loss": 1.6782,
"step": 647
},
{
"epoch": 0.6532916624659744,
"grad_norm": 0.14378694887801025,
"learning_rate": 5e-05,
"loss": 1.7134,
"step": 648
},
{
"epoch": 0.6542998286117552,
"grad_norm": 0.1403877743768562,
"learning_rate": 5e-05,
"loss": 1.7375,
"step": 649
},
{
"epoch": 0.6553079947575361,
"grad_norm": 0.14834934143675046,
"learning_rate": 5e-05,
"loss": 1.7174,
"step": 650
},
{
"epoch": 0.6563161609033169,
"grad_norm": 0.13349134651059139,
"learning_rate": 5e-05,
"loss": 1.6888,
"step": 651
},
{
"epoch": 0.6573243270490977,
"grad_norm": 0.14300986280595238,
"learning_rate": 5e-05,
"loss": 1.6887,
"step": 652
},
{
"epoch": 0.6583324931948785,
"grad_norm": 0.1366229209171474,
"learning_rate": 5e-05,
"loss": 1.6866,
"step": 653
},
{
"epoch": 0.6593406593406593,
"grad_norm": 0.13129142830945212,
"learning_rate": 5e-05,
"loss": 1.6929,
"step": 654
},
{
"epoch": 0.6603488254864401,
"grad_norm": 0.15534033844039313,
"learning_rate": 5e-05,
"loss": 1.7266,
"step": 655
},
{
"epoch": 0.661356991632221,
"grad_norm": 0.13585701733479483,
"learning_rate": 5e-05,
"loss": 1.7184,
"step": 656
},
{
"epoch": 0.6623651577780019,
"grad_norm": 0.14604955910857845,
"learning_rate": 5e-05,
"loss": 1.7064,
"step": 657
},
{
"epoch": 0.6633733239237827,
"grad_norm": 0.13294919069862188,
"learning_rate": 5e-05,
"loss": 1.7013,
"step": 658
},
{
"epoch": 0.6643814900695635,
"grad_norm": 0.14828211853774284,
"learning_rate": 5e-05,
"loss": 1.7178,
"step": 659
},
{
"epoch": 0.6653896562153443,
"grad_norm": 0.14322058740077032,
"learning_rate": 5e-05,
"loss": 1.7017,
"step": 660
},
{
"epoch": 0.6663978223611251,
"grad_norm": 0.22694947426265846,
"learning_rate": 5e-05,
"loss": 1.7233,
"step": 661
},
{
"epoch": 0.6674059885069059,
"grad_norm": 0.13369181995377313,
"learning_rate": 5e-05,
"loss": 1.7122,
"step": 662
},
{
"epoch": 0.6684141546526867,
"grad_norm": 0.12961955869168454,
"learning_rate": 5e-05,
"loss": 1.7103,
"step": 663
},
{
"epoch": 0.6694223207984676,
"grad_norm": 0.12740690105639293,
"learning_rate": 5e-05,
"loss": 1.6991,
"step": 664
},
{
"epoch": 0.6704304869442484,
"grad_norm": 0.1395080769602082,
"learning_rate": 5e-05,
"loss": 1.7003,
"step": 665
},
{
"epoch": 0.6714386530900293,
"grad_norm": 0.13266459395217106,
"learning_rate": 5e-05,
"loss": 1.6896,
"step": 666
},
{
"epoch": 0.6724468192358101,
"grad_norm": 0.13792503447640894,
"learning_rate": 5e-05,
"loss": 1.7094,
"step": 667
},
{
"epoch": 0.6734549853815909,
"grad_norm": 0.13549499942407675,
"learning_rate": 5e-05,
"loss": 1.6852,
"step": 668
},
{
"epoch": 0.6744631515273717,
"grad_norm": 0.1304887647842567,
"learning_rate": 5e-05,
"loss": 1.7224,
"step": 669
},
{
"epoch": 0.6754713176731525,
"grad_norm": 0.13803972632497452,
"learning_rate": 5e-05,
"loss": 1.7074,
"step": 670
},
{
"epoch": 0.6764794838189334,
"grad_norm": 0.15238968656547802,
"learning_rate": 5e-05,
"loss": 1.697,
"step": 671
},
{
"epoch": 0.6774876499647142,
"grad_norm": 0.13650396460895298,
"learning_rate": 5e-05,
"loss": 1.7006,
"step": 672
},
{
"epoch": 0.678495816110495,
"grad_norm": 0.15406427775258108,
"learning_rate": 5e-05,
"loss": 1.7225,
"step": 673
},
{
"epoch": 0.6795039822562758,
"grad_norm": 0.14975427688081136,
"learning_rate": 5e-05,
"loss": 1.6938,
"step": 674
},
{
"epoch": 0.6805121484020566,
"grad_norm": 0.14574699614799233,
"learning_rate": 5e-05,
"loss": 1.6909,
"step": 675
},
{
"epoch": 0.6815203145478375,
"grad_norm": 0.140769429118802,
"learning_rate": 5e-05,
"loss": 1.7179,
"step": 676
},
{
"epoch": 0.6825284806936183,
"grad_norm": 0.1397379615230004,
"learning_rate": 5e-05,
"loss": 1.7137,
"step": 677
},
{
"epoch": 0.6835366468393992,
"grad_norm": 0.14148911779317347,
"learning_rate": 5e-05,
"loss": 1.6874,
"step": 678
},
{
"epoch": 0.68454481298518,
"grad_norm": 0.13796021307017373,
"learning_rate": 5e-05,
"loss": 1.7047,
"step": 679
},
{
"epoch": 0.6855529791309608,
"grad_norm": 0.12981325688396536,
"learning_rate": 5e-05,
"loss": 1.6835,
"step": 680
},
{
"epoch": 0.6865611452767416,
"grad_norm": 0.1520733919033312,
"learning_rate": 5e-05,
"loss": 1.6752,
"step": 681
},
{
"epoch": 0.6875693114225224,
"grad_norm": 0.13925368484326953,
"learning_rate": 5e-05,
"loss": 1.73,
"step": 682
},
{
"epoch": 0.6885774775683032,
"grad_norm": 0.15191330782704446,
"learning_rate": 5e-05,
"loss": 1.7128,
"step": 683
},
{
"epoch": 0.689585643714084,
"grad_norm": 0.1393558696693607,
"learning_rate": 5e-05,
"loss": 1.7002,
"step": 684
},
{
"epoch": 0.690593809859865,
"grad_norm": 0.16282864818947926,
"learning_rate": 5e-05,
"loss": 1.6996,
"step": 685
},
{
"epoch": 0.6916019760056458,
"grad_norm": 0.13659644056226283,
"learning_rate": 5e-05,
"loss": 1.6925,
"step": 686
},
{
"epoch": 0.6926101421514266,
"grad_norm": 0.14230355233928796,
"learning_rate": 5e-05,
"loss": 1.7016,
"step": 687
},
{
"epoch": 0.6936183082972074,
"grad_norm": 0.1320721416125541,
"learning_rate": 5e-05,
"loss": 1.6923,
"step": 688
},
{
"epoch": 0.6946264744429882,
"grad_norm": 0.12974182615487503,
"learning_rate": 5e-05,
"loss": 1.6961,
"step": 689
},
{
"epoch": 0.695634640588769,
"grad_norm": 0.14088384510278604,
"learning_rate": 5e-05,
"loss": 1.7033,
"step": 690
},
{
"epoch": 0.6966428067345498,
"grad_norm": 0.12974104848865461,
"learning_rate": 5e-05,
"loss": 1.6946,
"step": 691
},
{
"epoch": 0.6976509728803307,
"grad_norm": 0.1350755538708987,
"learning_rate": 5e-05,
"loss": 1.6926,
"step": 692
},
{
"epoch": 0.6986591390261115,
"grad_norm": 0.13778448675665025,
"learning_rate": 5e-05,
"loss": 1.7203,
"step": 693
},
{
"epoch": 0.6996673051718924,
"grad_norm": 0.17101586020957985,
"learning_rate": 5e-05,
"loss": 1.6938,
"step": 694
},
{
"epoch": 0.7006754713176732,
"grad_norm": 0.1303067376289026,
"learning_rate": 5e-05,
"loss": 1.7038,
"step": 695
},
{
"epoch": 0.701683637463454,
"grad_norm": 0.13399992160593918,
"learning_rate": 5e-05,
"loss": 1.7003,
"step": 696
},
{
"epoch": 0.7026918036092348,
"grad_norm": 0.1382522486144173,
"learning_rate": 5e-05,
"loss": 1.6986,
"step": 697
},
{
"epoch": 0.7036999697550156,
"grad_norm": 0.30559753438026604,
"learning_rate": 5e-05,
"loss": 1.7017,
"step": 698
},
{
"epoch": 0.7047081359007965,
"grad_norm": 0.13451228910548593,
"learning_rate": 5e-05,
"loss": 1.6905,
"step": 699
},
{
"epoch": 0.7057163020465773,
"grad_norm": 0.1231524957978636,
"learning_rate": 5e-05,
"loss": 1.6833,
"step": 700
},
{
"epoch": 0.7067244681923581,
"grad_norm": 0.13457654769947636,
"learning_rate": 5e-05,
"loss": 1.7098,
"step": 701
},
{
"epoch": 0.7077326343381389,
"grad_norm": 0.13286181727814403,
"learning_rate": 5e-05,
"loss": 1.6984,
"step": 702
},
{
"epoch": 0.7087408004839197,
"grad_norm": 0.13158152592049696,
"learning_rate": 5e-05,
"loss": 1.6898,
"step": 703
},
{
"epoch": 0.7097489666297006,
"grad_norm": 0.12393794270845451,
"learning_rate": 5e-05,
"loss": 1.7019,
"step": 704
},
{
"epoch": 0.7107571327754814,
"grad_norm": 0.8791329643142535,
"learning_rate": 5e-05,
"loss": 1.7017,
"step": 705
},
{
"epoch": 0.7117652989212623,
"grad_norm": 0.13766657632572252,
"learning_rate": 5e-05,
"loss": 1.7083,
"step": 706
},
{
"epoch": 0.7127734650670431,
"grad_norm": 0.1314801794244582,
"learning_rate": 5e-05,
"loss": 1.7014,
"step": 707
},
{
"epoch": 0.7137816312128239,
"grad_norm": 0.14301185853218212,
"learning_rate": 5e-05,
"loss": 1.6946,
"step": 708
},
{
"epoch": 0.7147897973586047,
"grad_norm": 0.13315850713688443,
"learning_rate": 5e-05,
"loss": 1.6916,
"step": 709
},
{
"epoch": 0.7157979635043855,
"grad_norm": 0.15050185104963668,
"learning_rate": 5e-05,
"loss": 1.6951,
"step": 710
},
{
"epoch": 0.7168061296501663,
"grad_norm": 0.193887861942083,
"learning_rate": 5e-05,
"loss": 1.7016,
"step": 711
},
{
"epoch": 0.7178142957959471,
"grad_norm": 0.184705631693118,
"learning_rate": 5e-05,
"loss": 1.6808,
"step": 712
},
{
"epoch": 0.7188224619417279,
"grad_norm": 0.20131120279722708,
"learning_rate": 5e-05,
"loss": 1.6846,
"step": 713
},
{
"epoch": 0.7198306280875089,
"grad_norm": 0.1645196401547296,
"learning_rate": 5e-05,
"loss": 1.6853,
"step": 714
},
{
"epoch": 0.7208387942332897,
"grad_norm": 0.15217136145669521,
"learning_rate": 5e-05,
"loss": 1.6905,
"step": 715
},
{
"epoch": 0.7218469603790705,
"grad_norm": 0.14669369727694134,
"learning_rate": 5e-05,
"loss": 1.7173,
"step": 716
},
{
"epoch": 0.7228551265248513,
"grad_norm": 0.1518741725825213,
"learning_rate": 5e-05,
"loss": 1.6945,
"step": 717
},
{
"epoch": 0.7238632926706321,
"grad_norm": 0.15882664040360764,
"learning_rate": 5e-05,
"loss": 1.6833,
"step": 718
},
{
"epoch": 0.7248714588164129,
"grad_norm": 1.1379498482193364,
"learning_rate": 5e-05,
"loss": 1.6763,
"step": 719
},
{
"epoch": 0.7258796249621937,
"grad_norm": 0.1586630734368667,
"learning_rate": 5e-05,
"loss": 1.6993,
"step": 720
},
{
"epoch": 0.7268877911079746,
"grad_norm": 0.16714817199638557,
"learning_rate": 5e-05,
"loss": 1.7066,
"step": 721
},
{
"epoch": 0.7278959572537554,
"grad_norm": 0.1439676312445262,
"learning_rate": 5e-05,
"loss": 1.7123,
"step": 722
},
{
"epoch": 0.7289041233995363,
"grad_norm": 0.14412259743914776,
"learning_rate": 5e-05,
"loss": 1.6992,
"step": 723
},
{
"epoch": 0.7299122895453171,
"grad_norm": 0.1444786558037397,
"learning_rate": 5e-05,
"loss": 1.6805,
"step": 724
},
{
"epoch": 0.7309204556910979,
"grad_norm": 0.12930306224525773,
"learning_rate": 5e-05,
"loss": 1.6876,
"step": 725
},
{
"epoch": 0.7319286218368787,
"grad_norm": 0.1406707248655597,
"learning_rate": 5e-05,
"loss": 1.7009,
"step": 726
},
{
"epoch": 0.7329367879826595,
"grad_norm": 0.14693857868721935,
"learning_rate": 5e-05,
"loss": 1.696,
"step": 727
},
{
"epoch": 0.7339449541284404,
"grad_norm": 0.14013055752478618,
"learning_rate": 5e-05,
"loss": 1.7079,
"step": 728
},
{
"epoch": 0.7349531202742212,
"grad_norm": 0.44162578171698896,
"learning_rate": 5e-05,
"loss": 1.7047,
"step": 729
},
{
"epoch": 0.735961286420002,
"grad_norm": 0.144202453186501,
"learning_rate": 5e-05,
"loss": 1.6722,
"step": 730
},
{
"epoch": 0.7369694525657828,
"grad_norm": 0.20350906402966454,
"learning_rate": 5e-05,
"loss": 1.6882,
"step": 731
},
{
"epoch": 0.7379776187115636,
"grad_norm": 0.13670367287551335,
"learning_rate": 5e-05,
"loss": 1.6952,
"step": 732
},
{
"epoch": 0.7389857848573445,
"grad_norm": 0.13733218254706195,
"learning_rate": 5e-05,
"loss": 1.707,
"step": 733
},
{
"epoch": 0.7399939510031253,
"grad_norm": 0.15097417904303445,
"learning_rate": 5e-05,
"loss": 1.7045,
"step": 734
},
{
"epoch": 0.7410021171489062,
"grad_norm": 0.13762128360800496,
"learning_rate": 5e-05,
"loss": 1.7033,
"step": 735
},
{
"epoch": 0.742010283294687,
"grad_norm": 0.13213047670959707,
"learning_rate": 5e-05,
"loss": 1.6971,
"step": 736
},
{
"epoch": 0.7430184494404678,
"grad_norm": 0.1499348434975417,
"learning_rate": 5e-05,
"loss": 1.6975,
"step": 737
},
{
"epoch": 0.7440266155862486,
"grad_norm": 0.13723445349671687,
"learning_rate": 5e-05,
"loss": 1.6922,
"step": 738
},
{
"epoch": 0.7450347817320294,
"grad_norm": 0.14545498886021488,
"learning_rate": 5e-05,
"loss": 1.6788,
"step": 739
},
{
"epoch": 0.7460429478778102,
"grad_norm": 0.1287235990420584,
"learning_rate": 5e-05,
"loss": 1.6955,
"step": 740
},
{
"epoch": 0.747051114023591,
"grad_norm": 0.1455471683094355,
"learning_rate": 5e-05,
"loss": 1.704,
"step": 741
},
{
"epoch": 0.748059280169372,
"grad_norm": 0.14617565645976455,
"learning_rate": 5e-05,
"loss": 1.6939,
"step": 742
},
{
"epoch": 0.7490674463151528,
"grad_norm": 0.1346326602528038,
"learning_rate": 5e-05,
"loss": 1.7025,
"step": 743
},
{
"epoch": 0.7500756124609336,
"grad_norm": 0.15520918731889405,
"learning_rate": 5e-05,
"loss": 1.6939,
"step": 744
},
{
"epoch": 0.7510837786067144,
"grad_norm": 0.13569763381362607,
"learning_rate": 5e-05,
"loss": 1.6966,
"step": 745
},
{
"epoch": 0.7520919447524952,
"grad_norm": 0.1299510283700129,
"learning_rate": 5e-05,
"loss": 1.6987,
"step": 746
},
{
"epoch": 0.753100110898276,
"grad_norm": 0.13419663557532094,
"learning_rate": 5e-05,
"loss": 1.6932,
"step": 747
},
{
"epoch": 0.7541082770440568,
"grad_norm": 0.13169849020289098,
"learning_rate": 5e-05,
"loss": 1.7071,
"step": 748
},
{
"epoch": 0.7551164431898377,
"grad_norm": 0.12522318521189696,
"learning_rate": 5e-05,
"loss": 1.6883,
"step": 749
},
{
"epoch": 0.7561246093356185,
"grad_norm": 0.12835934020684264,
"learning_rate": 5e-05,
"loss": 1.6758,
"step": 750
},
{
"epoch": 0.7571327754813993,
"grad_norm": 0.13498286500128723,
"learning_rate": 5e-05,
"loss": 1.6985,
"step": 751
},
{
"epoch": 0.7581409416271802,
"grad_norm": 0.13419888046777306,
"learning_rate": 5e-05,
"loss": 1.6846,
"step": 752
},
{
"epoch": 0.759149107772961,
"grad_norm": 0.12600233145443784,
"learning_rate": 5e-05,
"loss": 1.6979,
"step": 753
},
{
"epoch": 0.7601572739187418,
"grad_norm": 0.12934232671574464,
"learning_rate": 5e-05,
"loss": 1.6878,
"step": 754
},
{
"epoch": 0.7611654400645226,
"grad_norm": 0.1950926929344581,
"learning_rate": 5e-05,
"loss": 1.7178,
"step": 755
},
{
"epoch": 0.7621736062103035,
"grad_norm": 0.12859037517359037,
"learning_rate": 5e-05,
"loss": 1.7016,
"step": 756
},
{
"epoch": 0.7631817723560843,
"grad_norm": 0.1409670994846807,
"learning_rate": 5e-05,
"loss": 1.6854,
"step": 757
},
{
"epoch": 0.7641899385018651,
"grad_norm": 0.12288058164990474,
"learning_rate": 5e-05,
"loss": 1.6938,
"step": 758
},
{
"epoch": 0.7651981046476459,
"grad_norm": 0.12523243562044944,
"learning_rate": 5e-05,
"loss": 1.7088,
"step": 759
},
{
"epoch": 0.7662062707934267,
"grad_norm": 0.12400867326960456,
"learning_rate": 5e-05,
"loss": 1.6961,
"step": 760
},
{
"epoch": 0.7672144369392075,
"grad_norm": 0.12733361317641823,
"learning_rate": 5e-05,
"loss": 1.6895,
"step": 761
},
{
"epoch": 0.7682226030849884,
"grad_norm": 0.12401980643230025,
"learning_rate": 5e-05,
"loss": 1.6871,
"step": 762
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.13258386366930572,
"learning_rate": 5e-05,
"loss": 1.7151,
"step": 763
},
{
"epoch": 0.7702389353765501,
"grad_norm": 0.13554333774066002,
"learning_rate": 5e-05,
"loss": 1.6828,
"step": 764
},
{
"epoch": 0.7712471015223309,
"grad_norm": 0.1298590698259134,
"learning_rate": 5e-05,
"loss": 1.6817,
"step": 765
},
{
"epoch": 0.7722552676681117,
"grad_norm": 0.13225875098703646,
"learning_rate": 5e-05,
"loss": 1.7172,
"step": 766
},
{
"epoch": 0.7732634338138925,
"grad_norm": 0.12372816362157169,
"learning_rate": 5e-05,
"loss": 1.697,
"step": 767
},
{
"epoch": 0.7742715999596733,
"grad_norm": 0.8145033326229155,
"learning_rate": 5e-05,
"loss": 1.7296,
"step": 768
},
{
"epoch": 0.7752797661054541,
"grad_norm": 0.13559791454225872,
"learning_rate": 5e-05,
"loss": 1.6963,
"step": 769
},
{
"epoch": 0.776287932251235,
"grad_norm": 0.1308197405902908,
"learning_rate": 5e-05,
"loss": 1.6977,
"step": 770
},
{
"epoch": 0.7772960983970159,
"grad_norm": 0.13209728678813237,
"learning_rate": 5e-05,
"loss": 1.6858,
"step": 771
},
{
"epoch": 0.7783042645427967,
"grad_norm": 0.1278801420116914,
"learning_rate": 5e-05,
"loss": 1.6837,
"step": 772
},
{
"epoch": 0.7793124306885775,
"grad_norm": 0.13513888346210853,
"learning_rate": 5e-05,
"loss": 1.6887,
"step": 773
},
{
"epoch": 0.7803205968343583,
"grad_norm": 0.13570837058732832,
"learning_rate": 5e-05,
"loss": 1.694,
"step": 774
},
{
"epoch": 0.7813287629801391,
"grad_norm": 0.13434724641320867,
"learning_rate": 5e-05,
"loss": 1.6922,
"step": 775
},
{
"epoch": 0.7823369291259199,
"grad_norm": 0.12411873442488698,
"learning_rate": 5e-05,
"loss": 1.6832,
"step": 776
},
{
"epoch": 0.7833450952717008,
"grad_norm": 0.13282243905863508,
"learning_rate": 5e-05,
"loss": 1.6787,
"step": 777
},
{
"epoch": 0.7843532614174816,
"grad_norm": 0.14016303702305682,
"learning_rate": 5e-05,
"loss": 1.7033,
"step": 778
},
{
"epoch": 0.7853614275632624,
"grad_norm": 0.12715810408217135,
"learning_rate": 5e-05,
"loss": 1.698,
"step": 779
},
{
"epoch": 0.7863695937090432,
"grad_norm": 1.6383771081899785,
"learning_rate": 5e-05,
"loss": 1.6946,
"step": 780
},
{
"epoch": 0.787377759854824,
"grad_norm": 0.1555175450872356,
"learning_rate": 5e-05,
"loss": 1.6939,
"step": 781
},
{
"epoch": 0.7883859260006049,
"grad_norm": 0.12270274039737394,
"learning_rate": 5e-05,
"loss": 1.6934,
"step": 782
},
{
"epoch": 0.7893940921463857,
"grad_norm": 0.14328346799244598,
"learning_rate": 5e-05,
"loss": 1.6772,
"step": 783
},
{
"epoch": 0.7904022582921666,
"grad_norm": 0.1344035866062604,
"learning_rate": 5e-05,
"loss": 1.6731,
"step": 784
},
{
"epoch": 0.7914104244379474,
"grad_norm": 0.12957396945165556,
"learning_rate": 5e-05,
"loss": 1.701,
"step": 785
},
{
"epoch": 0.7924185905837282,
"grad_norm": 0.134892447245696,
"learning_rate": 5e-05,
"loss": 1.6861,
"step": 786
},
{
"epoch": 0.793426756729509,
"grad_norm": 0.13944146412945518,
"learning_rate": 5e-05,
"loss": 1.6878,
"step": 787
},
{
"epoch": 0.7944349228752898,
"grad_norm": 0.13517076378309315,
"learning_rate": 5e-05,
"loss": 1.6799,
"step": 788
},
{
"epoch": 0.7954430890210706,
"grad_norm": 0.13052895695360262,
"learning_rate": 5e-05,
"loss": 1.6729,
"step": 789
},
{
"epoch": 0.7964512551668514,
"grad_norm": 0.16070051048503994,
"learning_rate": 5e-05,
"loss": 1.6913,
"step": 790
},
{
"epoch": 0.7974594213126324,
"grad_norm": 0.13432850463263618,
"learning_rate": 5e-05,
"loss": 1.6768,
"step": 791
},
{
"epoch": 0.7984675874584132,
"grad_norm": 0.14983096856734876,
"learning_rate": 5e-05,
"loss": 1.6807,
"step": 792
},
{
"epoch": 0.799475753604194,
"grad_norm": 0.14499086970343855,
"learning_rate": 5e-05,
"loss": 1.6788,
"step": 793
},
{
"epoch": 0.8004839197499748,
"grad_norm": 0.14034782106855784,
"learning_rate": 5e-05,
"loss": 1.7087,
"step": 794
},
{
"epoch": 0.8014920858957556,
"grad_norm": 0.138379791092538,
"learning_rate": 5e-05,
"loss": 1.6776,
"step": 795
},
{
"epoch": 0.8025002520415364,
"grad_norm": 0.1482595466004464,
"learning_rate": 5e-05,
"loss": 1.6909,
"step": 796
},
{
"epoch": 0.8035084181873172,
"grad_norm": 0.13548313971169001,
"learning_rate": 5e-05,
"loss": 1.6666,
"step": 797
},
{
"epoch": 0.8045165843330981,
"grad_norm": 0.13487479450679285,
"learning_rate": 5e-05,
"loss": 1.689,
"step": 798
},
{
"epoch": 0.805524750478879,
"grad_norm": 0.1432363848206779,
"learning_rate": 5e-05,
"loss": 1.697,
"step": 799
},
{
"epoch": 0.8065329166246598,
"grad_norm": 0.13896327594504643,
"learning_rate": 5e-05,
"loss": 1.6911,
"step": 800
},
{
"epoch": 0.8075410827704406,
"grad_norm": 0.14338629590554086,
"learning_rate": 5e-05,
"loss": 1.6869,
"step": 801
},
{
"epoch": 0.8085492489162214,
"grad_norm": 0.13281813618885874,
"learning_rate": 5e-05,
"loss": 1.6861,
"step": 802
},
{
"epoch": 0.8095574150620022,
"grad_norm": 0.139985059403575,
"learning_rate": 5e-05,
"loss": 1.6827,
"step": 803
},
{
"epoch": 0.810565581207783,
"grad_norm": 0.14363687868955394,
"learning_rate": 5e-05,
"loss": 1.6707,
"step": 804
},
{
"epoch": 0.8115737473535639,
"grad_norm": 0.1336932116903534,
"learning_rate": 5e-05,
"loss": 1.6922,
"step": 805
},
{
"epoch": 0.8125819134993447,
"grad_norm": 0.1529549893245701,
"learning_rate": 5e-05,
"loss": 1.6766,
"step": 806
},
{
"epoch": 0.8135900796451255,
"grad_norm": 0.13672318865512173,
"learning_rate": 5e-05,
"loss": 1.6953,
"step": 807
},
{
"epoch": 0.8145982457909063,
"grad_norm": 0.14190254697613094,
"learning_rate": 5e-05,
"loss": 1.6598,
"step": 808
},
{
"epoch": 0.8156064119366871,
"grad_norm": 0.13913502141445533,
"learning_rate": 5e-05,
"loss": 1.687,
"step": 809
},
{
"epoch": 0.816614578082468,
"grad_norm": 0.13641964764170883,
"learning_rate": 5e-05,
"loss": 1.7055,
"step": 810
},
{
"epoch": 0.8176227442282488,
"grad_norm": 0.13599664312690396,
"learning_rate": 5e-05,
"loss": 1.705,
"step": 811
},
{
"epoch": 0.8186309103740297,
"grad_norm": 0.15018714562444313,
"learning_rate": 5e-05,
"loss": 1.6857,
"step": 812
},
{
"epoch": 0.8196390765198105,
"grad_norm": 0.14301860812865722,
"learning_rate": 5e-05,
"loss": 1.682,
"step": 813
},
{
"epoch": 0.8206472426655913,
"grad_norm": 0.1344232562677402,
"learning_rate": 5e-05,
"loss": 1.6966,
"step": 814
},
{
"epoch": 0.8216554088113721,
"grad_norm": 0.14041565750131477,
"learning_rate": 5e-05,
"loss": 1.6887,
"step": 815
},
{
"epoch": 0.8226635749571529,
"grad_norm": 0.13764254491162634,
"learning_rate": 5e-05,
"loss": 1.6734,
"step": 816
},
{
"epoch": 0.8236717411029337,
"grad_norm": 0.48603813362470627,
"learning_rate": 5e-05,
"loss": 1.7112,
"step": 817
},
{
"epoch": 0.8246799072487145,
"grad_norm": 0.15630406069565908,
"learning_rate": 5e-05,
"loss": 1.702,
"step": 818
},
{
"epoch": 0.8256880733944955,
"grad_norm": 0.14342007772006957,
"learning_rate": 5e-05,
"loss": 1.6846,
"step": 819
},
{
"epoch": 0.8266962395402763,
"grad_norm": 0.12685666364606676,
"learning_rate": 5e-05,
"loss": 1.6745,
"step": 820
},
{
"epoch": 0.8277044056860571,
"grad_norm": 0.14328976861961018,
"learning_rate": 5e-05,
"loss": 1.676,
"step": 821
},
{
"epoch": 0.8287125718318379,
"grad_norm": 0.14792477595331482,
"learning_rate": 5e-05,
"loss": 1.6945,
"step": 822
},
{
"epoch": 0.8297207379776187,
"grad_norm": 0.12890024243258635,
"learning_rate": 5e-05,
"loss": 1.6946,
"step": 823
},
{
"epoch": 0.8307289041233995,
"grad_norm": 0.1394381946621567,
"learning_rate": 5e-05,
"loss": 1.6897,
"step": 824
},
{
"epoch": 0.8317370702691803,
"grad_norm": 0.14871567892964102,
"learning_rate": 5e-05,
"loss": 1.6665,
"step": 825
},
{
"epoch": 0.8327452364149612,
"grad_norm": 0.1346517356681281,
"learning_rate": 5e-05,
"loss": 1.6892,
"step": 826
},
{
"epoch": 0.833753402560742,
"grad_norm": 0.13954790270024078,
"learning_rate": 5e-05,
"loss": 1.6878,
"step": 827
},
{
"epoch": 0.8347615687065228,
"grad_norm": 0.14054961982092284,
"learning_rate": 5e-05,
"loss": 1.667,
"step": 828
},
{
"epoch": 0.8357697348523037,
"grad_norm": 0.13206695580210953,
"learning_rate": 5e-05,
"loss": 1.6757,
"step": 829
},
{
"epoch": 0.8367779009980845,
"grad_norm": 0.15778739105529885,
"learning_rate": 5e-05,
"loss": 1.692,
"step": 830
},
{
"epoch": 0.8377860671438653,
"grad_norm": 0.1373991062407869,
"learning_rate": 5e-05,
"loss": 1.6739,
"step": 831
},
{
"epoch": 0.8387942332896461,
"grad_norm": 0.2576531409786061,
"learning_rate": 5e-05,
"loss": 1.6695,
"step": 832
},
{
"epoch": 0.839802399435427,
"grad_norm": 0.15106020481865556,
"learning_rate": 5e-05,
"loss": 1.6805,
"step": 833
},
{
"epoch": 0.8408105655812078,
"grad_norm": 0.14046305460650463,
"learning_rate": 5e-05,
"loss": 1.6777,
"step": 834
},
{
"epoch": 0.8418187317269886,
"grad_norm": 0.14720245880174745,
"learning_rate": 5e-05,
"loss": 1.6836,
"step": 835
},
{
"epoch": 0.8428268978727694,
"grad_norm": 0.1578072566386783,
"learning_rate": 5e-05,
"loss": 1.6861,
"step": 836
},
{
"epoch": 0.8438350640185502,
"grad_norm": 0.1518940101101013,
"learning_rate": 5e-05,
"loss": 1.6878,
"step": 837
},
{
"epoch": 0.844843230164331,
"grad_norm": 0.13998500777852416,
"learning_rate": 5e-05,
"loss": 1.6832,
"step": 838
},
{
"epoch": 0.8458513963101119,
"grad_norm": 0.13904704518884228,
"learning_rate": 5e-05,
"loss": 1.6753,
"step": 839
},
{
"epoch": 0.8468595624558928,
"grad_norm": 0.14081647055828106,
"learning_rate": 5e-05,
"loss": 1.6812,
"step": 840
},
{
"epoch": 0.8478677286016736,
"grad_norm": 0.15399662388279747,
"learning_rate": 5e-05,
"loss": 1.6726,
"step": 841
},
{
"epoch": 0.8488758947474544,
"grad_norm": 0.24482482359606256,
"learning_rate": 5e-05,
"loss": 1.6894,
"step": 842
},
{
"epoch": 0.8498840608932352,
"grad_norm": 0.1381996934428476,
"learning_rate": 5e-05,
"loss": 1.6787,
"step": 843
},
{
"epoch": 0.850892227039016,
"grad_norm": 0.12942528029338027,
"learning_rate": 5e-05,
"loss": 1.674,
"step": 844
},
{
"epoch": 0.8519003931847968,
"grad_norm": 0.13627903853259218,
"learning_rate": 5e-05,
"loss": 1.676,
"step": 845
},
{
"epoch": 0.8529085593305776,
"grad_norm": 0.1336712149207386,
"learning_rate": 5e-05,
"loss": 1.6817,
"step": 846
},
{
"epoch": 0.8539167254763586,
"grad_norm": 0.1367092325582646,
"learning_rate": 5e-05,
"loss": 1.6621,
"step": 847
},
{
"epoch": 0.8549248916221394,
"grad_norm": 0.1347323160292146,
"learning_rate": 5e-05,
"loss": 1.7124,
"step": 848
},
{
"epoch": 0.8559330577679202,
"grad_norm": 0.13560405221175614,
"learning_rate": 5e-05,
"loss": 1.6861,
"step": 849
},
{
"epoch": 0.856941223913701,
"grad_norm": 0.13449548817890208,
"learning_rate": 5e-05,
"loss": 1.6826,
"step": 850
},
{
"epoch": 0.8579493900594818,
"grad_norm": 0.1341433652220611,
"learning_rate": 5e-05,
"loss": 1.6864,
"step": 851
},
{
"epoch": 0.8589575562052626,
"grad_norm": 0.14825925731848053,
"learning_rate": 5e-05,
"loss": 1.6864,
"step": 852
},
{
"epoch": 0.8599657223510434,
"grad_norm": 0.1304576882873733,
"learning_rate": 5e-05,
"loss": 1.7001,
"step": 853
},
{
"epoch": 0.8609738884968243,
"grad_norm": 0.13574394501767972,
"learning_rate": 5e-05,
"loss": 1.6918,
"step": 854
},
{
"epoch": 0.8619820546426051,
"grad_norm": 0.13884970149183168,
"learning_rate": 5e-05,
"loss": 1.6875,
"step": 855
},
{
"epoch": 0.8629902207883859,
"grad_norm": 0.1362435489981324,
"learning_rate": 5e-05,
"loss": 1.6945,
"step": 856
},
{
"epoch": 0.8639983869341668,
"grad_norm": 0.13528485619923905,
"learning_rate": 5e-05,
"loss": 1.6828,
"step": 857
},
{
"epoch": 0.8650065530799476,
"grad_norm": 0.13432004891402732,
"learning_rate": 5e-05,
"loss": 1.6852,
"step": 858
},
{
"epoch": 0.8660147192257284,
"grad_norm": 0.1242110461943383,
"learning_rate": 5e-05,
"loss": 1.6859,
"step": 859
},
{
"epoch": 0.8670228853715092,
"grad_norm": 0.13526165386716868,
"learning_rate": 5e-05,
"loss": 1.6882,
"step": 860
},
{
"epoch": 0.8680310515172901,
"grad_norm": 0.13723528079790265,
"learning_rate": 5e-05,
"loss": 1.6742,
"step": 861
},
{
"epoch": 0.8690392176630709,
"grad_norm": 0.14152025186993977,
"learning_rate": 5e-05,
"loss": 1.6824,
"step": 862
},
{
"epoch": 0.8700473838088517,
"grad_norm": 0.13593092657670974,
"learning_rate": 5e-05,
"loss": 1.6877,
"step": 863
},
{
"epoch": 0.8710555499546325,
"grad_norm": 0.1237240839204448,
"learning_rate": 5e-05,
"loss": 1.6653,
"step": 864
},
{
"epoch": 0.8720637161004133,
"grad_norm": 0.12457362639962367,
"learning_rate": 5e-05,
"loss": 1.6741,
"step": 865
},
{
"epoch": 0.8730718822461941,
"grad_norm": 0.13769038207724557,
"learning_rate": 5e-05,
"loss": 1.6866,
"step": 866
},
{
"epoch": 0.874080048391975,
"grad_norm": 0.22819354832540836,
"learning_rate": 5e-05,
"loss": 1.6865,
"step": 867
},
{
"epoch": 0.8750882145377559,
"grad_norm": 0.12812321433260723,
"learning_rate": 5e-05,
"loss": 1.6629,
"step": 868
},
{
"epoch": 0.8760963806835367,
"grad_norm": 0.13193468553242307,
"learning_rate": 5e-05,
"loss": 1.6795,
"step": 869
},
{
"epoch": 0.8771045468293175,
"grad_norm": 0.1339735770465933,
"learning_rate": 5e-05,
"loss": 1.6642,
"step": 870
},
{
"epoch": 0.8781127129750983,
"grad_norm": 0.13219641454052525,
"learning_rate": 5e-05,
"loss": 1.6824,
"step": 871
},
{
"epoch": 0.8791208791208791,
"grad_norm": 0.1278981257624631,
"learning_rate": 5e-05,
"loss": 1.7017,
"step": 872
},
{
"epoch": 0.8801290452666599,
"grad_norm": 0.1268463958373461,
"learning_rate": 5e-05,
"loss": 1.685,
"step": 873
},
{
"epoch": 0.8811372114124407,
"grad_norm": 0.12262984135503795,
"learning_rate": 5e-05,
"loss": 1.6832,
"step": 874
},
{
"epoch": 0.8821453775582216,
"grad_norm": 0.12807704116710825,
"learning_rate": 5e-05,
"loss": 1.6715,
"step": 875
},
{
"epoch": 0.8831535437040025,
"grad_norm": 0.13384246213045484,
"learning_rate": 5e-05,
"loss": 1.6796,
"step": 876
},
{
"epoch": 0.8841617098497833,
"grad_norm": 0.13271946030978699,
"learning_rate": 5e-05,
"loss": 1.6792,
"step": 877
},
{
"epoch": 0.8851698759955641,
"grad_norm": 0.13002797068913113,
"learning_rate": 5e-05,
"loss": 1.6826,
"step": 878
},
{
"epoch": 0.8861780421413449,
"grad_norm": 0.12387813054000466,
"learning_rate": 5e-05,
"loss": 1.6764,
"step": 879
},
{
"epoch": 0.8871862082871257,
"grad_norm": 0.1539348827112057,
"learning_rate": 5e-05,
"loss": 1.681,
"step": 880
},
{
"epoch": 0.8881943744329065,
"grad_norm": 0.14191475073212179,
"learning_rate": 5e-05,
"loss": 1.7038,
"step": 881
},
{
"epoch": 0.8892025405786874,
"grad_norm": 0.12626208410619427,
"learning_rate": 5e-05,
"loss": 1.6734,
"step": 882
},
{
"epoch": 0.8902107067244682,
"grad_norm": 0.1537939707678987,
"learning_rate": 5e-05,
"loss": 1.6878,
"step": 883
},
{
"epoch": 0.891218872870249,
"grad_norm": 0.14006513053904576,
"learning_rate": 5e-05,
"loss": 1.6784,
"step": 884
},
{
"epoch": 0.8922270390160298,
"grad_norm": 0.1323359573323714,
"learning_rate": 5e-05,
"loss": 1.6753,
"step": 885
},
{
"epoch": 0.8932352051618107,
"grad_norm": 0.127135232877454,
"learning_rate": 5e-05,
"loss": 1.6703,
"step": 886
},
{
"epoch": 0.8942433713075915,
"grad_norm": 0.14395160246089883,
"learning_rate": 5e-05,
"loss": 1.6791,
"step": 887
},
{
"epoch": 0.8952515374533723,
"grad_norm": 0.13213800028351733,
"learning_rate": 5e-05,
"loss": 1.6977,
"step": 888
},
{
"epoch": 0.8962597035991532,
"grad_norm": 0.1339578277536838,
"learning_rate": 5e-05,
"loss": 1.6655,
"step": 889
},
{
"epoch": 0.897267869744934,
"grad_norm": 0.13843289199372555,
"learning_rate": 5e-05,
"loss": 1.6807,
"step": 890
},
{
"epoch": 0.8982760358907148,
"grad_norm": 0.15034227589113325,
"learning_rate": 5e-05,
"loss": 1.684,
"step": 891
},
{
"epoch": 0.8992842020364956,
"grad_norm": 0.1295931007457488,
"learning_rate": 5e-05,
"loss": 1.6821,
"step": 892
},
{
"epoch": 0.9002923681822764,
"grad_norm": 0.13591806645658097,
"learning_rate": 5e-05,
"loss": 1.6728,
"step": 893
},
{
"epoch": 0.9013005343280572,
"grad_norm": 0.1438591830321329,
"learning_rate": 5e-05,
"loss": 1.6698,
"step": 894
},
{
"epoch": 0.902308700473838,
"grad_norm": 0.1271279095390623,
"learning_rate": 5e-05,
"loss": 1.6668,
"step": 895
},
{
"epoch": 0.903316866619619,
"grad_norm": 0.13381633362697323,
"learning_rate": 5e-05,
"loss": 1.6692,
"step": 896
},
{
"epoch": 0.9043250327653998,
"grad_norm": 0.1365519873037108,
"learning_rate": 5e-05,
"loss": 1.6771,
"step": 897
},
{
"epoch": 0.9053331989111806,
"grad_norm": 0.12756904544167164,
"learning_rate": 5e-05,
"loss": 1.6862,
"step": 898
},
{
"epoch": 0.9063413650569614,
"grad_norm": 0.14850169861234608,
"learning_rate": 5e-05,
"loss": 1.6716,
"step": 899
},
{
"epoch": 0.9073495312027422,
"grad_norm": 0.13581227089444428,
"learning_rate": 5e-05,
"loss": 1.6805,
"step": 900
},
{
"epoch": 0.908357697348523,
"grad_norm": 0.12471119236144047,
"learning_rate": 5e-05,
"loss": 1.6618,
"step": 901
},
{
"epoch": 0.9093658634943038,
"grad_norm": 0.15506755587406426,
"learning_rate": 5e-05,
"loss": 1.6787,
"step": 902
},
{
"epoch": 0.9103740296400847,
"grad_norm": 0.13513790109556142,
"learning_rate": 5e-05,
"loss": 1.6708,
"step": 903
},
{
"epoch": 0.9113821957858655,
"grad_norm": 0.15277615577202727,
"learning_rate": 5e-05,
"loss": 1.6595,
"step": 904
},
{
"epoch": 0.9123903619316464,
"grad_norm": 0.1350891914749887,
"learning_rate": 5e-05,
"loss": 1.6567,
"step": 905
},
{
"epoch": 0.9133985280774272,
"grad_norm": 0.14602516176664662,
"learning_rate": 5e-05,
"loss": 1.6749,
"step": 906
},
{
"epoch": 0.914406694223208,
"grad_norm": 0.14463663430798326,
"learning_rate": 5e-05,
"loss": 1.6732,
"step": 907
},
{
"epoch": 0.9154148603689888,
"grad_norm": 0.14699091093716773,
"learning_rate": 5e-05,
"loss": 1.6604,
"step": 908
},
{
"epoch": 0.9164230265147696,
"grad_norm": 0.13215022110277264,
"learning_rate": 5e-05,
"loss": 1.6741,
"step": 909
},
{
"epoch": 0.9174311926605505,
"grad_norm": 0.13050745363963512,
"learning_rate": 5e-05,
"loss": 1.6608,
"step": 910
},
{
"epoch": 0.9184393588063313,
"grad_norm": 0.1466525773007831,
"learning_rate": 5e-05,
"loss": 1.6616,
"step": 911
},
{
"epoch": 0.9194475249521121,
"grad_norm": 0.13347369745114426,
"learning_rate": 5e-05,
"loss": 1.6709,
"step": 912
},
{
"epoch": 0.9204556910978929,
"grad_norm": 0.14407041271422674,
"learning_rate": 5e-05,
"loss": 1.6736,
"step": 913
},
{
"epoch": 0.9214638572436737,
"grad_norm": 0.13888242220869906,
"learning_rate": 5e-05,
"loss": 1.6949,
"step": 914
},
{
"epoch": 0.9224720233894546,
"grad_norm": 0.14262053965466828,
"learning_rate": 5e-05,
"loss": 1.6734,
"step": 915
},
{
"epoch": 0.9234801895352354,
"grad_norm": 0.1409772264763286,
"learning_rate": 5e-05,
"loss": 1.6678,
"step": 916
},
{
"epoch": 0.9244883556810163,
"grad_norm": 0.14527396885120117,
"learning_rate": 5e-05,
"loss": 1.6761,
"step": 917
},
{
"epoch": 0.9254965218267971,
"grad_norm": 0.12740510773335295,
"learning_rate": 5e-05,
"loss": 1.6708,
"step": 918
},
{
"epoch": 0.9265046879725779,
"grad_norm": 0.139395581998477,
"learning_rate": 5e-05,
"loss": 1.6613,
"step": 919
},
{
"epoch": 0.9275128541183587,
"grad_norm": 0.13417759421161327,
"learning_rate": 5e-05,
"loss": 1.6753,
"step": 920
},
{
"epoch": 0.9285210202641395,
"grad_norm": 0.13555609704891103,
"learning_rate": 5e-05,
"loss": 1.6738,
"step": 921
},
{
"epoch": 0.9295291864099203,
"grad_norm": 0.12829038107875804,
"learning_rate": 5e-05,
"loss": 1.6684,
"step": 922
},
{
"epoch": 0.9305373525557011,
"grad_norm": 0.13296107854572745,
"learning_rate": 5e-05,
"loss": 1.6726,
"step": 923
},
{
"epoch": 0.9315455187014821,
"grad_norm": 0.14241035335107388,
"learning_rate": 5e-05,
"loss": 1.6709,
"step": 924
},
{
"epoch": 0.9325536848472629,
"grad_norm": 0.1271240766685691,
"learning_rate": 5e-05,
"loss": 1.6766,
"step": 925
},
{
"epoch": 0.9335618509930437,
"grad_norm": 0.1354859254660614,
"learning_rate": 5e-05,
"loss": 1.6715,
"step": 926
},
{
"epoch": 0.9345700171388245,
"grad_norm": 0.12889632841887344,
"learning_rate": 5e-05,
"loss": 1.68,
"step": 927
},
{
"epoch": 0.9355781832846053,
"grad_norm": 0.12917740851963883,
"learning_rate": 5e-05,
"loss": 1.6793,
"step": 928
},
{
"epoch": 0.9365863494303861,
"grad_norm": 0.1327484530362986,
"learning_rate": 5e-05,
"loss": 1.6586,
"step": 929
},
{
"epoch": 0.9375945155761669,
"grad_norm": 0.12967479905068346,
"learning_rate": 5e-05,
"loss": 1.6829,
"step": 930
},
{
"epoch": 0.9386026817219478,
"grad_norm": 0.12850564347803076,
"learning_rate": 5e-05,
"loss": 1.666,
"step": 931
},
{
"epoch": 0.9396108478677286,
"grad_norm": 0.12869497807470315,
"learning_rate": 5e-05,
"loss": 1.6911,
"step": 932
},
{
"epoch": 0.9406190140135094,
"grad_norm": 0.13662082311676438,
"learning_rate": 5e-05,
"loss": 1.674,
"step": 933
},
{
"epoch": 0.9416271801592903,
"grad_norm": 0.1382953464962334,
"learning_rate": 5e-05,
"loss": 1.6647,
"step": 934
},
{
"epoch": 0.9426353463050711,
"grad_norm": 0.13350088079608952,
"learning_rate": 5e-05,
"loss": 1.6832,
"step": 935
},
{
"epoch": 0.9436435124508519,
"grad_norm": 0.1431935916731277,
"learning_rate": 5e-05,
"loss": 1.6748,
"step": 936
},
{
"epoch": 0.9446516785966327,
"grad_norm": 0.14180022265326894,
"learning_rate": 5e-05,
"loss": 1.6641,
"step": 937
},
{
"epoch": 0.9456598447424136,
"grad_norm": 0.1272013668604564,
"learning_rate": 5e-05,
"loss": 1.6853,
"step": 938
},
{
"epoch": 0.9466680108881944,
"grad_norm": 0.13326949088898338,
"learning_rate": 5e-05,
"loss": 1.666,
"step": 939
},
{
"epoch": 0.9476761770339752,
"grad_norm": 0.1475715105954654,
"learning_rate": 5e-05,
"loss": 1.6828,
"step": 940
},
{
"epoch": 0.948684343179756,
"grad_norm": 0.14083105254743475,
"learning_rate": 5e-05,
"loss": 1.6784,
"step": 941
},
{
"epoch": 0.9496925093255368,
"grad_norm": 0.13511643953253086,
"learning_rate": 5e-05,
"loss": 1.6815,
"step": 942
},
{
"epoch": 0.9507006754713176,
"grad_norm": 0.12796079103971297,
"learning_rate": 5e-05,
"loss": 1.6817,
"step": 943
},
{
"epoch": 0.9517088416170985,
"grad_norm": 0.16362744096426632,
"learning_rate": 5e-05,
"loss": 1.6836,
"step": 944
},
{
"epoch": 0.9527170077628794,
"grad_norm": 0.12797064422723695,
"learning_rate": 5e-05,
"loss": 1.6751,
"step": 945
},
{
"epoch": 0.9537251739086602,
"grad_norm": 0.1434444700945595,
"learning_rate": 5e-05,
"loss": 1.6834,
"step": 946
},
{
"epoch": 0.954733340054441,
"grad_norm": 0.1321562433293951,
"learning_rate": 5e-05,
"loss": 1.6654,
"step": 947
},
{
"epoch": 0.9557415062002218,
"grad_norm": 0.1350527789374817,
"learning_rate": 5e-05,
"loss": 1.6668,
"step": 948
},
{
"epoch": 0.9567496723460026,
"grad_norm": 0.14156980572384642,
"learning_rate": 5e-05,
"loss": 1.6734,
"step": 949
},
{
"epoch": 0.9577578384917834,
"grad_norm": 0.14013712544503423,
"learning_rate": 5e-05,
"loss": 1.6842,
"step": 950
},
{
"epoch": 0.9587660046375642,
"grad_norm": 0.13222246222944062,
"learning_rate": 5e-05,
"loss": 1.6779,
"step": 951
},
{
"epoch": 0.9597741707833451,
"grad_norm": 0.13904740589623618,
"learning_rate": 5e-05,
"loss": 1.6672,
"step": 952
},
{
"epoch": 0.960782336929126,
"grad_norm": 0.12712158954742464,
"learning_rate": 5e-05,
"loss": 1.6727,
"step": 953
},
{
"epoch": 0.9617905030749068,
"grad_norm": 0.1288804401638552,
"learning_rate": 5e-05,
"loss": 1.6632,
"step": 954
},
{
"epoch": 0.9627986692206876,
"grad_norm": 0.13295930679196813,
"learning_rate": 5e-05,
"loss": 1.6839,
"step": 955
},
{
"epoch": 0.9638068353664684,
"grad_norm": 0.23028443790162464,
"learning_rate": 5e-05,
"loss": 1.6842,
"step": 956
},
{
"epoch": 0.9648150015122492,
"grad_norm": 0.1397117984780913,
"learning_rate": 5e-05,
"loss": 1.6865,
"step": 957
},
{
"epoch": 0.96582316765803,
"grad_norm": 0.13497797279155332,
"learning_rate": 5e-05,
"loss": 1.6864,
"step": 958
},
{
"epoch": 0.9668313338038109,
"grad_norm": 0.12916303558347642,
"learning_rate": 5e-05,
"loss": 1.6901,
"step": 959
},
{
"epoch": 0.9678394999495917,
"grad_norm": 0.23211777447315532,
"learning_rate": 5e-05,
"loss": 1.6766,
"step": 960
},
{
"epoch": 0.9688476660953725,
"grad_norm": 0.13601563623084056,
"learning_rate": 5e-05,
"loss": 1.6753,
"step": 961
},
{
"epoch": 0.9698558322411533,
"grad_norm": 0.137289477966096,
"learning_rate": 5e-05,
"loss": 1.6737,
"step": 962
},
{
"epoch": 0.9708639983869342,
"grad_norm": 0.13667594781630565,
"learning_rate": 5e-05,
"loss": 1.6804,
"step": 963
},
{
"epoch": 0.971872164532715,
"grad_norm": 0.13576064908436217,
"learning_rate": 5e-05,
"loss": 1.6603,
"step": 964
},
{
"epoch": 0.9728803306784958,
"grad_norm": 0.132798732372051,
"learning_rate": 5e-05,
"loss": 1.6828,
"step": 965
},
{
"epoch": 0.9738884968242767,
"grad_norm": 0.13208449355289498,
"learning_rate": 5e-05,
"loss": 1.6744,
"step": 966
},
{
"epoch": 0.9748966629700575,
"grad_norm": 0.13585942411581226,
"learning_rate": 5e-05,
"loss": 1.6777,
"step": 967
},
{
"epoch": 0.9759048291158383,
"grad_norm": 0.13548184798449628,
"learning_rate": 5e-05,
"loss": 1.6881,
"step": 968
},
{
"epoch": 0.9769129952616191,
"grad_norm": 0.1392166913735763,
"learning_rate": 5e-05,
"loss": 1.6675,
"step": 969
},
{
"epoch": 0.9779211614073999,
"grad_norm": 0.13739517699713566,
"learning_rate": 5e-05,
"loss": 1.6725,
"step": 970
},
{
"epoch": 0.9789293275531807,
"grad_norm": 0.1325157842600348,
"learning_rate": 5e-05,
"loss": 1.7046,
"step": 971
},
{
"epoch": 0.9799374936989615,
"grad_norm": 0.14491654836379084,
"learning_rate": 5e-05,
"loss": 1.6907,
"step": 972
},
{
"epoch": 0.9809456598447425,
"grad_norm": 0.1350018683671611,
"learning_rate": 5e-05,
"loss": 1.6893,
"step": 973
},
{
"epoch": 0.9819538259905233,
"grad_norm": 0.1380573133150687,
"learning_rate": 5e-05,
"loss": 1.6686,
"step": 974
},
{
"epoch": 0.9829619921363041,
"grad_norm": 0.15639160382515796,
"learning_rate": 5e-05,
"loss": 1.6831,
"step": 975
},
{
"epoch": 0.9839701582820849,
"grad_norm": 0.13129130265567285,
"learning_rate": 5e-05,
"loss": 1.6852,
"step": 976
},
{
"epoch": 0.9849783244278657,
"grad_norm": 0.1378835095946666,
"learning_rate": 5e-05,
"loss": 1.6809,
"step": 977
},
{
"epoch": 0.9859864905736465,
"grad_norm": 0.15323167789285774,
"learning_rate": 5e-05,
"loss": 1.6584,
"step": 978
},
{
"epoch": 0.9869946567194273,
"grad_norm": 0.13120916627174922,
"learning_rate": 5e-05,
"loss": 1.6746,
"step": 979
},
{
"epoch": 0.9880028228652082,
"grad_norm": 0.16658089074004762,
"learning_rate": 5e-05,
"loss": 1.6883,
"step": 980
},
{
"epoch": 0.989010989010989,
"grad_norm": 0.12480946878755692,
"learning_rate": 5e-05,
"loss": 1.6667,
"step": 981
},
{
"epoch": 0.9900191551567699,
"grad_norm": 0.15011152635478692,
"learning_rate": 5e-05,
"loss": 1.6713,
"step": 982
},
{
"epoch": 0.9910273213025507,
"grad_norm": 0.14336382061066752,
"learning_rate": 5e-05,
"loss": 1.6682,
"step": 983
},
{
"epoch": 0.9920354874483315,
"grad_norm": 0.15560552997255717,
"learning_rate": 5e-05,
"loss": 1.6554,
"step": 984
},
{
"epoch": 0.9930436535941123,
"grad_norm": 0.25239196240528766,
"learning_rate": 5e-05,
"loss": 1.6708,
"step": 985
},
{
"epoch": 0.9940518197398931,
"grad_norm": 0.15375826976754078,
"learning_rate": 5e-05,
"loss": 1.6722,
"step": 986
},
{
"epoch": 0.995059985885674,
"grad_norm": 0.13908301421214161,
"learning_rate": 5e-05,
"loss": 1.6638,
"step": 987
},
{
"epoch": 0.9960681520314548,
"grad_norm": 0.13376466586132293,
"learning_rate": 5e-05,
"loss": 1.6708,
"step": 988
},
{
"epoch": 0.9970763181772356,
"grad_norm": 0.14674096140221835,
"learning_rate": 5e-05,
"loss": 1.6638,
"step": 989
},
{
"epoch": 0.9980844843230164,
"grad_norm": 0.14637311521906743,
"learning_rate": 5e-05,
"loss": 1.6977,
"step": 990
},
{
"epoch": 0.9990926504687972,
"grad_norm": 0.1432639711689218,
"learning_rate": 5e-05,
"loss": 1.6693,
"step": 991
},
{
"epoch": 1.0010081661457808,
"grad_norm": 0.21860288289168814,
"learning_rate": 5e-05,
"loss": 3.2927,
"step": 992
},
{
"epoch": 1.0020163322915616,
"grad_norm": 0.14091034776903996,
"learning_rate": 5e-05,
"loss": 1.6465,
"step": 993
},
{
"epoch": 1.0030244984373424,
"grad_norm": 0.15840691147102967,
"learning_rate": 5e-05,
"loss": 1.6415,
"step": 994
},
{
"epoch": 1.0040326645831232,
"grad_norm": 0.1419834757811408,
"learning_rate": 5e-05,
"loss": 1.6695,
"step": 995
},
{
"epoch": 1.005040830728904,
"grad_norm": 0.15445174116810362,
"learning_rate": 5e-05,
"loss": 1.6508,
"step": 996
},
{
"epoch": 1.0060489968746849,
"grad_norm": 0.15506560418976248,
"learning_rate": 5e-05,
"loss": 1.6608,
"step": 997
},
{
"epoch": 1.0070571630204657,
"grad_norm": 0.15772043212937356,
"learning_rate": 5e-05,
"loss": 1.6549,
"step": 998
},
{
"epoch": 1.0080653291662467,
"grad_norm": 0.14809733736088554,
"learning_rate": 5e-05,
"loss": 1.6756,
"step": 999
},
{
"epoch": 1.0090734953120275,
"grad_norm": 0.15527053504041188,
"learning_rate": 5e-05,
"loss": 1.6443,
"step": 1000
},
{
"epoch": 1.0100816614578083,
"grad_norm": 0.13339301695926947,
"learning_rate": 5e-05,
"loss": 1.6502,
"step": 1001
},
{
"epoch": 1.0110898276035891,
"grad_norm": 0.16069903566032434,
"learning_rate": 5e-05,
"loss": 1.6386,
"step": 1002
},
{
"epoch": 1.01209799374937,
"grad_norm": 0.145353827020301,
"learning_rate": 5e-05,
"loss": 1.6708,
"step": 1003
},
{
"epoch": 1.0131061598951507,
"grad_norm": 0.15847490082912677,
"learning_rate": 5e-05,
"loss": 1.6668,
"step": 1004
},
{
"epoch": 1.0141143260409315,
"grad_norm": 0.1527946599717766,
"learning_rate": 5e-05,
"loss": 1.6544,
"step": 1005
},
{
"epoch": 1.0151224921867124,
"grad_norm": 0.14568390303961978,
"learning_rate": 5e-05,
"loss": 1.653,
"step": 1006
},
{
"epoch": 1.0161306583324932,
"grad_norm": 0.15026544585100637,
"learning_rate": 5e-05,
"loss": 1.6631,
"step": 1007
},
{
"epoch": 1.017138824478274,
"grad_norm": 0.14649278087306322,
"learning_rate": 5e-05,
"loss": 1.67,
"step": 1008
},
{
"epoch": 1.0181469906240548,
"grad_norm": 0.13992710750397244,
"learning_rate": 5e-05,
"loss": 1.6659,
"step": 1009
},
{
"epoch": 1.0191551567698356,
"grad_norm": 0.134375044039903,
"learning_rate": 5e-05,
"loss": 1.6758,
"step": 1010
},
{
"epoch": 1.0201633229156164,
"grad_norm": 0.14737843192197858,
"learning_rate": 5e-05,
"loss": 1.6422,
"step": 1011
},
{
"epoch": 1.0211714890613974,
"grad_norm": 0.13773473373848963,
"learning_rate": 5e-05,
"loss": 1.664,
"step": 1012
},
{
"epoch": 1.0221796552071782,
"grad_norm": 0.15945433650936014,
"learning_rate": 5e-05,
"loss": 1.652,
"step": 1013
},
{
"epoch": 1.023187821352959,
"grad_norm": 0.15989378386570163,
"learning_rate": 5e-05,
"loss": 1.6789,
"step": 1014
},
{
"epoch": 1.0241959874987399,
"grad_norm": 0.15474749283228387,
"learning_rate": 5e-05,
"loss": 1.6484,
"step": 1015
},
{
"epoch": 1.0252041536445207,
"grad_norm": 0.14454939561179925,
"learning_rate": 5e-05,
"loss": 1.6549,
"step": 1016
},
{
"epoch": 1.0262123197903015,
"grad_norm": 0.1429342231425721,
"learning_rate": 5e-05,
"loss": 1.6575,
"step": 1017
},
{
"epoch": 1.0272204859360823,
"grad_norm": 0.1472418787822263,
"learning_rate": 5e-05,
"loss": 1.6575,
"step": 1018
},
{
"epoch": 1.028228652081863,
"grad_norm": 0.15058269545560304,
"learning_rate": 5e-05,
"loss": 1.6466,
"step": 1019
},
{
"epoch": 1.029236818227644,
"grad_norm": 0.14338434142901946,
"learning_rate": 5e-05,
"loss": 1.6367,
"step": 1020
},
{
"epoch": 1.0302449843734247,
"grad_norm": 0.14739578480466062,
"learning_rate": 5e-05,
"loss": 1.6462,
"step": 1021
},
{
"epoch": 1.0312531505192055,
"grad_norm": 0.14596335620916118,
"learning_rate": 5e-05,
"loss": 1.6519,
"step": 1022
},
{
"epoch": 1.0322613166649863,
"grad_norm": 0.14344758953412376,
"learning_rate": 5e-05,
"loss": 1.6361,
"step": 1023
},
{
"epoch": 1.0332694828107671,
"grad_norm": 0.14371231242831609,
"learning_rate": 5e-05,
"loss": 1.652,
"step": 1024
},
{
"epoch": 1.034277648956548,
"grad_norm": 0.14003131085381088,
"learning_rate": 5e-05,
"loss": 1.6816,
"step": 1025
},
{
"epoch": 1.0352858151023288,
"grad_norm": 0.1378014474462088,
"learning_rate": 5e-05,
"loss": 1.6555,
"step": 1026
},
{
"epoch": 1.0362939812481098,
"grad_norm": 0.13383360458955793,
"learning_rate": 5e-05,
"loss": 1.6475,
"step": 1027
},
{
"epoch": 1.0373021473938906,
"grad_norm": 0.1494803160243675,
"learning_rate": 5e-05,
"loss": 1.6562,
"step": 1028
},
{
"epoch": 1.0383103135396714,
"grad_norm": 0.1357968016925739,
"learning_rate": 5e-05,
"loss": 1.6377,
"step": 1029
},
{
"epoch": 1.0393184796854522,
"grad_norm": 0.1389382336016073,
"learning_rate": 5e-05,
"loss": 1.6594,
"step": 1030
},
{
"epoch": 1.040326645831233,
"grad_norm": 0.1406360279058917,
"learning_rate": 5e-05,
"loss": 1.6465,
"step": 1031
},
{
"epoch": 1.0413348119770138,
"grad_norm": 0.1398760422398223,
"learning_rate": 5e-05,
"loss": 1.6609,
"step": 1032
},
{
"epoch": 1.0423429781227946,
"grad_norm": 0.1551391224014159,
"learning_rate": 5e-05,
"loss": 1.6569,
"step": 1033
},
{
"epoch": 1.0433511442685754,
"grad_norm": 0.13814685402174295,
"learning_rate": 5e-05,
"loss": 1.6478,
"step": 1034
},
{
"epoch": 1.0443593104143563,
"grad_norm": 0.13526430253748103,
"learning_rate": 5e-05,
"loss": 1.6464,
"step": 1035
},
{
"epoch": 1.045367476560137,
"grad_norm": 0.15377599703718353,
"learning_rate": 5e-05,
"loss": 1.6533,
"step": 1036
},
{
"epoch": 1.0463756427059179,
"grad_norm": 0.14272580240194616,
"learning_rate": 5e-05,
"loss": 1.6464,
"step": 1037
},
{
"epoch": 1.0473838088516987,
"grad_norm": 0.1425037845577125,
"learning_rate": 5e-05,
"loss": 1.6561,
"step": 1038
},
{
"epoch": 1.0483919749974795,
"grad_norm": 0.1382158099797001,
"learning_rate": 5e-05,
"loss": 1.6556,
"step": 1039
},
{
"epoch": 1.0494001411432605,
"grad_norm": 0.13446219082640498,
"learning_rate": 5e-05,
"loss": 1.6477,
"step": 1040
},
{
"epoch": 1.0504083072890413,
"grad_norm": 0.14002570935710634,
"learning_rate": 5e-05,
"loss": 1.6603,
"step": 1041
},
{
"epoch": 1.0514164734348221,
"grad_norm": 0.14680327184585512,
"learning_rate": 5e-05,
"loss": 1.6463,
"step": 1042
},
{
"epoch": 1.052424639580603,
"grad_norm": 0.13807927181398283,
"learning_rate": 5e-05,
"loss": 1.6606,
"step": 1043
},
{
"epoch": 1.0534328057263838,
"grad_norm": 0.14002355585431073,
"learning_rate": 5e-05,
"loss": 1.6559,
"step": 1044
},
{
"epoch": 1.0544409718721646,
"grad_norm": 0.13712234265374093,
"learning_rate": 5e-05,
"loss": 1.6568,
"step": 1045
},
{
"epoch": 1.0554491380179454,
"grad_norm": 0.1478330543967965,
"learning_rate": 5e-05,
"loss": 1.6487,
"step": 1046
},
{
"epoch": 1.0564573041637262,
"grad_norm": 0.140280334541867,
"learning_rate": 5e-05,
"loss": 1.657,
"step": 1047
},
{
"epoch": 1.057465470309507,
"grad_norm": 0.14612153616751714,
"learning_rate": 5e-05,
"loss": 1.6685,
"step": 1048
},
{
"epoch": 1.0584736364552878,
"grad_norm": 0.13553689622586162,
"learning_rate": 5e-05,
"loss": 1.6424,
"step": 1049
},
{
"epoch": 1.0594818026010686,
"grad_norm": 0.14257587085611279,
"learning_rate": 5e-05,
"loss": 1.6423,
"step": 1050
},
{
"epoch": 1.0604899687468494,
"grad_norm": 0.13249030300586634,
"learning_rate": 5e-05,
"loss": 1.6588,
"step": 1051
},
{
"epoch": 1.0614981348926302,
"grad_norm": 0.14948997388776752,
"learning_rate": 5e-05,
"loss": 1.6514,
"step": 1052
},
{
"epoch": 1.062506301038411,
"grad_norm": 0.13413211819187493,
"learning_rate": 5e-05,
"loss": 1.6243,
"step": 1053
},
{
"epoch": 1.0635144671841918,
"grad_norm": 0.13517249636027376,
"learning_rate": 5e-05,
"loss": 1.6718,
"step": 1054
},
{
"epoch": 1.0645226333299729,
"grad_norm": 0.1425324839674816,
"learning_rate": 5e-05,
"loss": 1.634,
"step": 1055
},
{
"epoch": 1.0655307994757537,
"grad_norm": 0.14421737540989363,
"learning_rate": 5e-05,
"loss": 1.6601,
"step": 1056
},
{
"epoch": 1.0665389656215345,
"grad_norm": 0.13949099479682,
"learning_rate": 5e-05,
"loss": 1.6584,
"step": 1057
},
{
"epoch": 1.0675471317673153,
"grad_norm": 0.13543143787477846,
"learning_rate": 5e-05,
"loss": 1.6264,
"step": 1058
},
{
"epoch": 1.0685552979130961,
"grad_norm": 0.1308536607672446,
"learning_rate": 5e-05,
"loss": 1.6483,
"step": 1059
},
{
"epoch": 1.069563464058877,
"grad_norm": 0.15395546567197096,
"learning_rate": 5e-05,
"loss": 1.6537,
"step": 1060
},
{
"epoch": 1.0705716302046577,
"grad_norm": 0.13590652212696186,
"learning_rate": 5e-05,
"loss": 1.6489,
"step": 1061
},
{
"epoch": 1.0715797963504385,
"grad_norm": 0.1369179871265128,
"learning_rate": 5e-05,
"loss": 1.6499,
"step": 1062
},
{
"epoch": 1.0725879624962193,
"grad_norm": 0.13868179923924345,
"learning_rate": 5e-05,
"loss": 1.6568,
"step": 1063
},
{
"epoch": 1.0735961286420002,
"grad_norm": 0.1418740879249988,
"learning_rate": 5e-05,
"loss": 1.6507,
"step": 1064
},
{
"epoch": 1.074604294787781,
"grad_norm": 0.1398126841174943,
"learning_rate": 5e-05,
"loss": 1.6492,
"step": 1065
},
{
"epoch": 1.0756124609335618,
"grad_norm": 0.14638179326062736,
"learning_rate": 5e-05,
"loss": 1.6467,
"step": 1066
},
{
"epoch": 1.0766206270793426,
"grad_norm": 0.12995279427717285,
"learning_rate": 5e-05,
"loss": 1.6449,
"step": 1067
},
{
"epoch": 1.0776287932251236,
"grad_norm": 0.14547801601785154,
"learning_rate": 5e-05,
"loss": 1.6355,
"step": 1068
},
{
"epoch": 1.0786369593709044,
"grad_norm": 0.14421057898446202,
"learning_rate": 5e-05,
"loss": 1.6374,
"step": 1069
},
{
"epoch": 1.0796451255166852,
"grad_norm": 0.14129351206800517,
"learning_rate": 5e-05,
"loss": 1.652,
"step": 1070
},
{
"epoch": 1.080653291662466,
"grad_norm": 0.13777294504511903,
"learning_rate": 5e-05,
"loss": 1.6619,
"step": 1071
},
{
"epoch": 1.0816614578082469,
"grad_norm": 0.14301808646954006,
"learning_rate": 5e-05,
"loss": 1.6333,
"step": 1072
},
{
"epoch": 1.0826696239540277,
"grad_norm": 0.1455323994765,
"learning_rate": 5e-05,
"loss": 1.63,
"step": 1073
},
{
"epoch": 1.0836777900998085,
"grad_norm": 0.13488078036821274,
"learning_rate": 5e-05,
"loss": 1.6476,
"step": 1074
},
{
"epoch": 1.0846859562455893,
"grad_norm": 0.12922264908060607,
"learning_rate": 5e-05,
"loss": 1.6562,
"step": 1075
},
{
"epoch": 1.08569412239137,
"grad_norm": 0.13245289137103436,
"learning_rate": 5e-05,
"loss": 1.638,
"step": 1076
},
{
"epoch": 1.086702288537151,
"grad_norm": 0.13467066705212152,
"learning_rate": 5e-05,
"loss": 1.6432,
"step": 1077
},
{
"epoch": 1.0877104546829317,
"grad_norm": 0.13683554315901364,
"learning_rate": 5e-05,
"loss": 1.6418,
"step": 1078
},
{
"epoch": 1.0887186208287125,
"grad_norm": 0.13087668906495806,
"learning_rate": 5e-05,
"loss": 1.6453,
"step": 1079
},
{
"epoch": 1.0897267869744933,
"grad_norm": 0.12813283034951103,
"learning_rate": 5e-05,
"loss": 1.6664,
"step": 1080
},
{
"epoch": 1.0907349531202741,
"grad_norm": 0.13206635982114845,
"learning_rate": 5e-05,
"loss": 1.6612,
"step": 1081
},
{
"epoch": 1.091743119266055,
"grad_norm": 0.1298970210608696,
"learning_rate": 5e-05,
"loss": 1.6357,
"step": 1082
},
{
"epoch": 1.092751285411836,
"grad_norm": 0.13177936167224702,
"learning_rate": 5e-05,
"loss": 1.6533,
"step": 1083
},
{
"epoch": 1.0937594515576168,
"grad_norm": 0.1370794107339465,
"learning_rate": 5e-05,
"loss": 1.6573,
"step": 1084
},
{
"epoch": 1.0947676177033976,
"grad_norm": 0.12985926535902795,
"learning_rate": 5e-05,
"loss": 1.6597,
"step": 1085
},
{
"epoch": 1.0957757838491784,
"grad_norm": 0.14365813461674878,
"learning_rate": 5e-05,
"loss": 1.6415,
"step": 1086
},
{
"epoch": 1.0967839499949592,
"grad_norm": 0.13831040201343497,
"learning_rate": 5e-05,
"loss": 1.6427,
"step": 1087
},
{
"epoch": 1.09779211614074,
"grad_norm": 0.12700324637807814,
"learning_rate": 5e-05,
"loss": 1.6635,
"step": 1088
},
{
"epoch": 1.0988002822865208,
"grad_norm": 0.14530007715183632,
"learning_rate": 5e-05,
"loss": 1.6552,
"step": 1089
},
{
"epoch": 1.0998084484323016,
"grad_norm": 0.14358509176757844,
"learning_rate": 5e-05,
"loss": 1.65,
"step": 1090
},
{
"epoch": 1.1008166145780824,
"grad_norm": 0.12220911593027503,
"learning_rate": 5e-05,
"loss": 1.6603,
"step": 1091
},
{
"epoch": 1.1018247807238633,
"grad_norm": 0.1383717567406863,
"learning_rate": 5e-05,
"loss": 1.6642,
"step": 1092
},
{
"epoch": 1.102832946869644,
"grad_norm": 0.13425833878331841,
"learning_rate": 5e-05,
"loss": 1.6413,
"step": 1093
},
{
"epoch": 1.1038411130154249,
"grad_norm": 0.12995091320791363,
"learning_rate": 5e-05,
"loss": 1.6429,
"step": 1094
},
{
"epoch": 1.1048492791612057,
"grad_norm": 0.13727248059012334,
"learning_rate": 5e-05,
"loss": 1.643,
"step": 1095
},
{
"epoch": 1.1058574453069867,
"grad_norm": 0.1451092239957977,
"learning_rate": 5e-05,
"loss": 1.6235,
"step": 1096
},
{
"epoch": 1.1068656114527675,
"grad_norm": 1.1003212392254569,
"learning_rate": 5e-05,
"loss": 1.6607,
"step": 1097
},
{
"epoch": 1.1078737775985483,
"grad_norm": 0.1487788294386961,
"learning_rate": 5e-05,
"loss": 1.6637,
"step": 1098
},
{
"epoch": 1.1088819437443291,
"grad_norm": 0.1348509108073677,
"learning_rate": 5e-05,
"loss": 1.6633,
"step": 1099
},
{
"epoch": 1.10989010989011,
"grad_norm": 0.13451834596455475,
"learning_rate": 5e-05,
"loss": 1.6503,
"step": 1100
},
{
"epoch": 1.1108982760358908,
"grad_norm": 0.1434915625519777,
"learning_rate": 5e-05,
"loss": 1.6441,
"step": 1101
},
{
"epoch": 1.1119064421816716,
"grad_norm": 0.13247439246408826,
"learning_rate": 5e-05,
"loss": 1.6452,
"step": 1102
},
{
"epoch": 1.1129146083274524,
"grad_norm": 0.15483809847029564,
"learning_rate": 5e-05,
"loss": 1.6334,
"step": 1103
},
{
"epoch": 1.1139227744732332,
"grad_norm": 0.15969160070618565,
"learning_rate": 5e-05,
"loss": 1.6429,
"step": 1104
},
{
"epoch": 1.114930940619014,
"grad_norm": 0.1628238357590171,
"learning_rate": 5e-05,
"loss": 1.6608,
"step": 1105
},
{
"epoch": 1.1159391067647948,
"grad_norm": 0.15930542337028844,
"learning_rate": 5e-05,
"loss": 1.6488,
"step": 1106
},
{
"epoch": 1.1169472729105756,
"grad_norm": 0.15466370593275638,
"learning_rate": 5e-05,
"loss": 1.6532,
"step": 1107
},
{
"epoch": 1.1179554390563564,
"grad_norm": 0.14010912938299824,
"learning_rate": 5e-05,
"loss": 1.6573,
"step": 1108
},
{
"epoch": 1.1189636052021372,
"grad_norm": 0.16390153898300394,
"learning_rate": 5e-05,
"loss": 1.6594,
"step": 1109
},
{
"epoch": 1.119971771347918,
"grad_norm": 0.14116244076539533,
"learning_rate": 5e-05,
"loss": 1.644,
"step": 1110
},
{
"epoch": 1.120979937493699,
"grad_norm": 0.13648666647020574,
"learning_rate": 5e-05,
"loss": 1.6513,
"step": 1111
},
{
"epoch": 1.1219881036394799,
"grad_norm": 0.140123096773187,
"learning_rate": 5e-05,
"loss": 1.6301,
"step": 1112
},
{
"epoch": 1.1229962697852607,
"grad_norm": 0.13187345151623706,
"learning_rate": 5e-05,
"loss": 1.644,
"step": 1113
},
{
"epoch": 1.1240044359310415,
"grad_norm": 0.13763479057234174,
"learning_rate": 5e-05,
"loss": 1.6425,
"step": 1114
},
{
"epoch": 1.1250126020768223,
"grad_norm": 0.13396232089072882,
"learning_rate": 5e-05,
"loss": 1.6681,
"step": 1115
},
{
"epoch": 1.126020768222603,
"grad_norm": 0.13322825877954908,
"learning_rate": 5e-05,
"loss": 1.659,
"step": 1116
},
{
"epoch": 1.127028934368384,
"grad_norm": 0.13579297620215142,
"learning_rate": 5e-05,
"loss": 1.6437,
"step": 1117
},
{
"epoch": 1.1280371005141647,
"grad_norm": 0.1341591695038578,
"learning_rate": 5e-05,
"loss": 1.6507,
"step": 1118
},
{
"epoch": 1.1290452666599455,
"grad_norm": 0.13741517387144328,
"learning_rate": 5e-05,
"loss": 1.6442,
"step": 1119
},
{
"epoch": 1.1300534328057263,
"grad_norm": 0.14068768012069355,
"learning_rate": 5e-05,
"loss": 1.6465,
"step": 1120
},
{
"epoch": 1.1310615989515072,
"grad_norm": 0.14004222531844976,
"learning_rate": 5e-05,
"loss": 1.6438,
"step": 1121
},
{
"epoch": 1.132069765097288,
"grad_norm": 0.1430222089343432,
"learning_rate": 5e-05,
"loss": 1.6294,
"step": 1122
},
{
"epoch": 1.1330779312430688,
"grad_norm": 0.1354002130699085,
"learning_rate": 5e-05,
"loss": 1.6235,
"step": 1123
},
{
"epoch": 1.1340860973888498,
"grad_norm": 0.14706699771800255,
"learning_rate": 5e-05,
"loss": 1.6524,
"step": 1124
},
{
"epoch": 1.1350942635346306,
"grad_norm": 0.14384265748302816,
"learning_rate": 5e-05,
"loss": 1.6484,
"step": 1125
},
{
"epoch": 1.1361024296804114,
"grad_norm": 0.1328015268784059,
"learning_rate": 5e-05,
"loss": 1.6241,
"step": 1126
},
{
"epoch": 1.1371105958261922,
"grad_norm": 0.1398676330492913,
"learning_rate": 5e-05,
"loss": 1.6441,
"step": 1127
},
{
"epoch": 1.138118761971973,
"grad_norm": 0.14489176249712743,
"learning_rate": 5e-05,
"loss": 1.6501,
"step": 1128
},
{
"epoch": 1.1391269281177538,
"grad_norm": 0.15427806273758912,
"learning_rate": 5e-05,
"loss": 1.6382,
"step": 1129
},
{
"epoch": 1.1401350942635347,
"grad_norm": 0.1353320676919305,
"learning_rate": 5e-05,
"loss": 1.6316,
"step": 1130
},
{
"epoch": 1.1411432604093155,
"grad_norm": 0.1459498336653144,
"learning_rate": 5e-05,
"loss": 1.6762,
"step": 1131
},
{
"epoch": 1.1421514265550963,
"grad_norm": 0.15798013402714647,
"learning_rate": 5e-05,
"loss": 1.6511,
"step": 1132
},
{
"epoch": 1.143159592700877,
"grad_norm": 0.15079687525350177,
"learning_rate": 5e-05,
"loss": 1.6349,
"step": 1133
},
{
"epoch": 1.144167758846658,
"grad_norm": 0.1750724581326861,
"learning_rate": 5e-05,
"loss": 1.6552,
"step": 1134
},
{
"epoch": 1.1451759249924387,
"grad_norm": 0.13556870245122188,
"learning_rate": 5e-05,
"loss": 1.6489,
"step": 1135
},
{
"epoch": 1.1461840911382195,
"grad_norm": 0.1319776262053859,
"learning_rate": 5e-05,
"loss": 1.6505,
"step": 1136
},
{
"epoch": 1.1471922572840003,
"grad_norm": 0.13323668607471617,
"learning_rate": 5e-05,
"loss": 1.6707,
"step": 1137
},
{
"epoch": 1.1482004234297811,
"grad_norm": 0.13656918475459365,
"learning_rate": 5e-05,
"loss": 1.6283,
"step": 1138
},
{
"epoch": 1.149208589575562,
"grad_norm": 0.12451944847688946,
"learning_rate": 5e-05,
"loss": 1.6379,
"step": 1139
},
{
"epoch": 1.150216755721343,
"grad_norm": 0.12773660059098071,
"learning_rate": 5e-05,
"loss": 1.644,
"step": 1140
},
{
"epoch": 1.1512249218671238,
"grad_norm": 0.12345976774900896,
"learning_rate": 5e-05,
"loss": 1.6386,
"step": 1141
},
{
"epoch": 1.1522330880129046,
"grad_norm": 0.1380893224066923,
"learning_rate": 5e-05,
"loss": 1.6667,
"step": 1142
},
{
"epoch": 1.1532412541586854,
"grad_norm": 0.140157865219376,
"learning_rate": 5e-05,
"loss": 1.635,
"step": 1143
},
{
"epoch": 1.1542494203044662,
"grad_norm": 0.13727630744096145,
"learning_rate": 5e-05,
"loss": 1.6424,
"step": 1144
},
{
"epoch": 1.155257586450247,
"grad_norm": 0.1324099666783387,
"learning_rate": 5e-05,
"loss": 1.6373,
"step": 1145
},
{
"epoch": 1.1562657525960278,
"grad_norm": 0.13882259979347136,
"learning_rate": 5e-05,
"loss": 1.6591,
"step": 1146
},
{
"epoch": 1.1572739187418086,
"grad_norm": 0.13098459974691126,
"learning_rate": 5e-05,
"loss": 1.6447,
"step": 1147
},
{
"epoch": 1.1582820848875894,
"grad_norm": 0.1431528108442056,
"learning_rate": 5e-05,
"loss": 1.6467,
"step": 1148
},
{
"epoch": 1.1592902510333702,
"grad_norm": 0.13353399419959994,
"learning_rate": 5e-05,
"loss": 1.6259,
"step": 1149
},
{
"epoch": 1.160298417179151,
"grad_norm": 0.14073095816973716,
"learning_rate": 5e-05,
"loss": 1.6524,
"step": 1150
},
{
"epoch": 1.1613065833249319,
"grad_norm": 0.12967149139945466,
"learning_rate": 5e-05,
"loss": 1.636,
"step": 1151
},
{
"epoch": 1.162314749470713,
"grad_norm": 0.1404076923142028,
"learning_rate": 5e-05,
"loss": 1.6388,
"step": 1152
},
{
"epoch": 1.1633229156164937,
"grad_norm": 0.12890108550224091,
"learning_rate": 5e-05,
"loss": 1.6419,
"step": 1153
},
{
"epoch": 1.1643310817622745,
"grad_norm": 0.14108978798383223,
"learning_rate": 5e-05,
"loss": 1.6519,
"step": 1154
},
{
"epoch": 1.1653392479080553,
"grad_norm": 0.12324326033477677,
"learning_rate": 5e-05,
"loss": 1.6378,
"step": 1155
},
{
"epoch": 1.1663474140538361,
"grad_norm": 0.13123027082856784,
"learning_rate": 5e-05,
"loss": 1.6271,
"step": 1156
},
{
"epoch": 1.167355580199617,
"grad_norm": 0.13632941963813844,
"learning_rate": 5e-05,
"loss": 1.6449,
"step": 1157
},
{
"epoch": 1.1683637463453977,
"grad_norm": 0.14274588260292997,
"learning_rate": 5e-05,
"loss": 1.6289,
"step": 1158
},
{
"epoch": 1.1693719124911786,
"grad_norm": 0.1388212405939215,
"learning_rate": 5e-05,
"loss": 1.651,
"step": 1159
},
{
"epoch": 1.1703800786369594,
"grad_norm": 0.13913045853970632,
"learning_rate": 5e-05,
"loss": 1.6128,
"step": 1160
},
{
"epoch": 1.1713882447827402,
"grad_norm": 0.12953138641069079,
"learning_rate": 5e-05,
"loss": 1.6716,
"step": 1161
},
{
"epoch": 1.172396410928521,
"grad_norm": 0.13383463327732076,
"learning_rate": 5e-05,
"loss": 1.6353,
"step": 1162
},
{
"epoch": 1.1734045770743018,
"grad_norm": 0.13282072248663965,
"learning_rate": 5e-05,
"loss": 1.6457,
"step": 1163
},
{
"epoch": 1.1744127432200826,
"grad_norm": 0.1256500519520948,
"learning_rate": 5e-05,
"loss": 1.6458,
"step": 1164
},
{
"epoch": 1.1754209093658634,
"grad_norm": 0.13370852595416396,
"learning_rate": 5e-05,
"loss": 1.6396,
"step": 1165
},
{
"epoch": 1.1764290755116442,
"grad_norm": 0.14088261443154418,
"learning_rate": 5e-05,
"loss": 1.6178,
"step": 1166
},
{
"epoch": 1.177437241657425,
"grad_norm": 0.1278315762332314,
"learning_rate": 5e-05,
"loss": 1.6467,
"step": 1167
},
{
"epoch": 1.178445407803206,
"grad_norm": 0.1366827463160475,
"learning_rate": 5e-05,
"loss": 1.6612,
"step": 1168
},
{
"epoch": 1.1794535739489869,
"grad_norm": 0.13220181447257495,
"learning_rate": 5e-05,
"loss": 1.6393,
"step": 1169
},
{
"epoch": 1.1804617400947677,
"grad_norm": 0.13957285481272086,
"learning_rate": 5e-05,
"loss": 1.6442,
"step": 1170
},
{
"epoch": 1.1814699062405485,
"grad_norm": 0.13381450080318658,
"learning_rate": 5e-05,
"loss": 1.6409,
"step": 1171
},
{
"epoch": 1.1824780723863293,
"grad_norm": 0.1331024616666049,
"learning_rate": 5e-05,
"loss": 1.6297,
"step": 1172
},
{
"epoch": 1.18348623853211,
"grad_norm": 0.13425932581575892,
"learning_rate": 5e-05,
"loss": 1.646,
"step": 1173
},
{
"epoch": 1.184494404677891,
"grad_norm": 0.1419448930914817,
"learning_rate": 5e-05,
"loss": 1.6333,
"step": 1174
},
{
"epoch": 1.1855025708236717,
"grad_norm": 0.12587029419536305,
"learning_rate": 5e-05,
"loss": 1.6451,
"step": 1175
},
{
"epoch": 1.1865107369694525,
"grad_norm": 0.12990741754999835,
"learning_rate": 5e-05,
"loss": 1.6377,
"step": 1176
},
{
"epoch": 1.1875189031152333,
"grad_norm": 0.13028002070390035,
"learning_rate": 5e-05,
"loss": 1.6448,
"step": 1177
},
{
"epoch": 1.1885270692610141,
"grad_norm": 0.1361939415163281,
"learning_rate": 5e-05,
"loss": 1.6432,
"step": 1178
},
{
"epoch": 1.189535235406795,
"grad_norm": 0.13811927402862617,
"learning_rate": 5e-05,
"loss": 1.6421,
"step": 1179
},
{
"epoch": 1.190543401552576,
"grad_norm": 0.13087126034252194,
"learning_rate": 5e-05,
"loss": 1.6426,
"step": 1180
},
{
"epoch": 1.1915515676983568,
"grad_norm": 0.14294244646531867,
"learning_rate": 5e-05,
"loss": 1.6554,
"step": 1181
},
{
"epoch": 1.1925597338441376,
"grad_norm": 0.14034413475200178,
"learning_rate": 5e-05,
"loss": 1.6532,
"step": 1182
},
{
"epoch": 1.1935678999899184,
"grad_norm": 0.13177978129089846,
"learning_rate": 5e-05,
"loss": 1.6269,
"step": 1183
},
{
"epoch": 1.1945760661356992,
"grad_norm": 0.12164759452371639,
"learning_rate": 5e-05,
"loss": 1.6411,
"step": 1184
},
{
"epoch": 1.19558423228148,
"grad_norm": 0.13393061368735346,
"learning_rate": 5e-05,
"loss": 1.6504,
"step": 1185
},
{
"epoch": 1.1965923984272608,
"grad_norm": 0.12922097478855823,
"learning_rate": 5e-05,
"loss": 1.644,
"step": 1186
},
{
"epoch": 1.1976005645730416,
"grad_norm": 0.13474046194928005,
"learning_rate": 5e-05,
"loss": 1.6357,
"step": 1187
},
{
"epoch": 1.1986087307188225,
"grad_norm": 0.13416139424027906,
"learning_rate": 5e-05,
"loss": 1.6253,
"step": 1188
},
{
"epoch": 1.1996168968646033,
"grad_norm": 0.12424536100362801,
"learning_rate": 5e-05,
"loss": 1.6597,
"step": 1189
},
{
"epoch": 1.200625063010384,
"grad_norm": 0.12859940727054653,
"learning_rate": 5e-05,
"loss": 1.6473,
"step": 1190
},
{
"epoch": 1.2016332291561649,
"grad_norm": 0.13774315044583574,
"learning_rate": 5e-05,
"loss": 1.6433,
"step": 1191
},
{
"epoch": 1.2026413953019457,
"grad_norm": 0.13474605371401327,
"learning_rate": 5e-05,
"loss": 1.6429,
"step": 1192
},
{
"epoch": 1.2036495614477265,
"grad_norm": 0.13137818581832644,
"learning_rate": 5e-05,
"loss": 1.6361,
"step": 1193
},
{
"epoch": 1.2046577275935073,
"grad_norm": 0.12998129292792446,
"learning_rate": 5e-05,
"loss": 1.642,
"step": 1194
},
{
"epoch": 1.2056658937392881,
"grad_norm": 0.13364766964304525,
"learning_rate": 5e-05,
"loss": 1.647,
"step": 1195
},
{
"epoch": 1.2066740598850692,
"grad_norm": 0.13387780726266185,
"learning_rate": 5e-05,
"loss": 1.639,
"step": 1196
},
{
"epoch": 1.20768222603085,
"grad_norm": 0.1262397535621686,
"learning_rate": 5e-05,
"loss": 1.6516,
"step": 1197
},
{
"epoch": 1.2086903921766308,
"grad_norm": 0.1335534592115057,
"learning_rate": 5e-05,
"loss": 1.6357,
"step": 1198
},
{
"epoch": 1.2096985583224116,
"grad_norm": 0.12213304785010605,
"learning_rate": 5e-05,
"loss": 1.6424,
"step": 1199
},
{
"epoch": 1.2107067244681924,
"grad_norm": 0.1301733256476672,
"learning_rate": 5e-05,
"loss": 1.6374,
"step": 1200
},
{
"epoch": 1.2117148906139732,
"grad_norm": 0.1264707123675731,
"learning_rate": 5e-05,
"loss": 1.6634,
"step": 1201
},
{
"epoch": 1.212723056759754,
"grad_norm": 0.13464550491855465,
"learning_rate": 5e-05,
"loss": 1.6452,
"step": 1202
},
{
"epoch": 1.2137312229055348,
"grad_norm": 0.12740550424329222,
"learning_rate": 5e-05,
"loss": 1.6259,
"step": 1203
},
{
"epoch": 1.2147393890513156,
"grad_norm": 0.12745955103663922,
"learning_rate": 5e-05,
"loss": 1.6616,
"step": 1204
},
{
"epoch": 1.2157475551970964,
"grad_norm": 0.1388644513951216,
"learning_rate": 5e-05,
"loss": 1.6387,
"step": 1205
},
{
"epoch": 1.2167557213428772,
"grad_norm": 0.1341359780867019,
"learning_rate": 5e-05,
"loss": 1.6458,
"step": 1206
},
{
"epoch": 1.217763887488658,
"grad_norm": 0.22124542084376875,
"learning_rate": 5e-05,
"loss": 1.6437,
"step": 1207
},
{
"epoch": 1.218772053634439,
"grad_norm": 0.13394553567218737,
"learning_rate": 5e-05,
"loss": 1.6443,
"step": 1208
},
{
"epoch": 1.2197802197802199,
"grad_norm": 0.13108124495407641,
"learning_rate": 5e-05,
"loss": 1.6375,
"step": 1209
},
{
"epoch": 1.2207883859260007,
"grad_norm": 0.13289777245013853,
"learning_rate": 5e-05,
"loss": 1.6245,
"step": 1210
},
{
"epoch": 1.2217965520717815,
"grad_norm": 0.13453450266809466,
"learning_rate": 5e-05,
"loss": 1.6338,
"step": 1211
},
{
"epoch": 1.2228047182175623,
"grad_norm": 0.17614553222683538,
"learning_rate": 5e-05,
"loss": 1.6254,
"step": 1212
},
{
"epoch": 1.2238128843633431,
"grad_norm": 0.12999419428479778,
"learning_rate": 5e-05,
"loss": 1.6345,
"step": 1213
},
{
"epoch": 1.224821050509124,
"grad_norm": 0.13534104691368284,
"learning_rate": 5e-05,
"loss": 1.6466,
"step": 1214
},
{
"epoch": 1.2258292166549047,
"grad_norm": 0.13500513832725639,
"learning_rate": 5e-05,
"loss": 1.6463,
"step": 1215
},
{
"epoch": 1.2268373828006855,
"grad_norm": 0.13460460013548567,
"learning_rate": 5e-05,
"loss": 1.6284,
"step": 1216
},
{
"epoch": 1.2278455489464664,
"grad_norm": 0.14239914142179805,
"learning_rate": 5e-05,
"loss": 1.6321,
"step": 1217
},
{
"epoch": 1.2288537150922472,
"grad_norm": 0.13003403900073976,
"learning_rate": 5e-05,
"loss": 1.6315,
"step": 1218
},
{
"epoch": 1.229861881238028,
"grad_norm": 0.1375130522018261,
"learning_rate": 5e-05,
"loss": 1.6507,
"step": 1219
},
{
"epoch": 1.2308700473838088,
"grad_norm": 0.14339474655375617,
"learning_rate": 5e-05,
"loss": 1.6321,
"step": 1220
},
{
"epoch": 1.2318782135295896,
"grad_norm": 0.9032408125998735,
"learning_rate": 5e-05,
"loss": 1.632,
"step": 1221
},
{
"epoch": 1.2328863796753704,
"grad_norm": 0.1474587794305201,
"learning_rate": 5e-05,
"loss": 1.6427,
"step": 1222
},
{
"epoch": 1.2338945458211512,
"grad_norm": 0.14814555142158561,
"learning_rate": 5e-05,
"loss": 1.6494,
"step": 1223
},
{
"epoch": 1.2349027119669322,
"grad_norm": 0.14395647101199663,
"learning_rate": 5e-05,
"loss": 1.6268,
"step": 1224
},
{
"epoch": 1.235910878112713,
"grad_norm": 0.13929910981835994,
"learning_rate": 5e-05,
"loss": 1.6295,
"step": 1225
},
{
"epoch": 1.2369190442584939,
"grad_norm": 0.14002983850498224,
"learning_rate": 5e-05,
"loss": 1.6522,
"step": 1226
},
{
"epoch": 1.2379272104042747,
"grad_norm": 0.1353192537450001,
"learning_rate": 5e-05,
"loss": 1.6309,
"step": 1227
},
{
"epoch": 1.2389353765500555,
"grad_norm": 0.13292267565626098,
"learning_rate": 5e-05,
"loss": 1.6428,
"step": 1228
},
{
"epoch": 1.2399435426958363,
"grad_norm": 0.13456391714773153,
"learning_rate": 5e-05,
"loss": 1.629,
"step": 1229
},
{
"epoch": 1.240951708841617,
"grad_norm": 0.13788714692286827,
"learning_rate": 5e-05,
"loss": 1.6346,
"step": 1230
},
{
"epoch": 1.241959874987398,
"grad_norm": 0.14423261316710512,
"learning_rate": 5e-05,
"loss": 1.6448,
"step": 1231
},
{
"epoch": 1.2429680411331787,
"grad_norm": 0.14447654635989376,
"learning_rate": 5e-05,
"loss": 1.6462,
"step": 1232
},
{
"epoch": 1.2439762072789595,
"grad_norm": 0.1434544966920315,
"learning_rate": 5e-05,
"loss": 1.649,
"step": 1233
},
{
"epoch": 1.2449843734247403,
"grad_norm": 0.14777426191966908,
"learning_rate": 5e-05,
"loss": 1.6367,
"step": 1234
},
{
"epoch": 1.2459925395705211,
"grad_norm": 0.13612570962263057,
"learning_rate": 5e-05,
"loss": 1.6166,
"step": 1235
},
{
"epoch": 1.2470007057163022,
"grad_norm": 0.14476607599552396,
"learning_rate": 5e-05,
"loss": 1.6263,
"step": 1236
},
{
"epoch": 1.248008871862083,
"grad_norm": 0.13145764552676129,
"learning_rate": 5e-05,
"loss": 1.6451,
"step": 1237
},
{
"epoch": 1.2490170380078638,
"grad_norm": 0.1490958446557475,
"learning_rate": 5e-05,
"loss": 1.6182,
"step": 1238
},
{
"epoch": 1.2500252041536446,
"grad_norm": 0.13808203154322748,
"learning_rate": 5e-05,
"loss": 1.6477,
"step": 1239
},
{
"epoch": 1.2510333702994254,
"grad_norm": 0.14138072513679126,
"learning_rate": 5e-05,
"loss": 1.6399,
"step": 1240
},
{
"epoch": 1.2520415364452062,
"grad_norm": 0.1320433504886423,
"learning_rate": 5e-05,
"loss": 1.6212,
"step": 1241
},
{
"epoch": 1.253049702590987,
"grad_norm": 0.13903842537780303,
"learning_rate": 5e-05,
"loss": 1.646,
"step": 1242
},
{
"epoch": 1.2540578687367678,
"grad_norm": 0.2562763748818742,
"learning_rate": 5e-05,
"loss": 1.6453,
"step": 1243
},
{
"epoch": 1.2550660348825486,
"grad_norm": 0.13135629946561656,
"learning_rate": 5e-05,
"loss": 1.6308,
"step": 1244
},
{
"epoch": 1.2560742010283295,
"grad_norm": 0.14459808940104862,
"learning_rate": 5e-05,
"loss": 1.6379,
"step": 1245
},
{
"epoch": 1.2570823671741103,
"grad_norm": 0.14002530789490344,
"learning_rate": 5e-05,
"loss": 1.6452,
"step": 1246
},
{
"epoch": 1.258090533319891,
"grad_norm": 0.13156584118231135,
"learning_rate": 5e-05,
"loss": 1.6396,
"step": 1247
},
{
"epoch": 1.2590986994656719,
"grad_norm": 0.1446357476831878,
"learning_rate": 5e-05,
"loss": 1.641,
"step": 1248
},
{
"epoch": 1.2601068656114527,
"grad_norm": 0.13668026201833627,
"learning_rate": 5e-05,
"loss": 1.654,
"step": 1249
},
{
"epoch": 1.2611150317572335,
"grad_norm": 0.14431276066509344,
"learning_rate": 5e-05,
"loss": 1.6429,
"step": 1250
},
{
"epoch": 1.2621231979030143,
"grad_norm": 0.13805388076215672,
"learning_rate": 5e-05,
"loss": 1.6422,
"step": 1251
},
{
"epoch": 1.2631313640487951,
"grad_norm": 0.127891326472375,
"learning_rate": 5e-05,
"loss": 1.6512,
"step": 1252
},
{
"epoch": 1.2641395301945761,
"grad_norm": 0.1380651670587077,
"learning_rate": 5e-05,
"loss": 1.6556,
"step": 1253
},
{
"epoch": 1.265147696340357,
"grad_norm": 0.13354225518347246,
"learning_rate": 5e-05,
"loss": 1.6314,
"step": 1254
},
{
"epoch": 1.2661558624861378,
"grad_norm": 0.14441013224418936,
"learning_rate": 5e-05,
"loss": 1.6374,
"step": 1255
},
{
"epoch": 1.2671640286319186,
"grad_norm": 0.1467779285068228,
"learning_rate": 5e-05,
"loss": 1.6457,
"step": 1256
},
{
"epoch": 1.2681721947776994,
"grad_norm": 0.18403393179481492,
"learning_rate": 5e-05,
"loss": 1.6244,
"step": 1257
},
{
"epoch": 1.2691803609234802,
"grad_norm": 0.14105230329683727,
"learning_rate": 5e-05,
"loss": 1.6263,
"step": 1258
},
{
"epoch": 1.270188527069261,
"grad_norm": 0.14021432110623025,
"learning_rate": 5e-05,
"loss": 1.6344,
"step": 1259
},
{
"epoch": 1.2711966932150418,
"grad_norm": 0.14733366499856676,
"learning_rate": 5e-05,
"loss": 1.6422,
"step": 1260
},
{
"epoch": 1.2722048593608226,
"grad_norm": 0.12082105925908934,
"learning_rate": 5e-05,
"loss": 1.6357,
"step": 1261
},
{
"epoch": 1.2732130255066034,
"grad_norm": 0.15655495903198283,
"learning_rate": 5e-05,
"loss": 1.641,
"step": 1262
},
{
"epoch": 1.2742211916523842,
"grad_norm": 0.128474136803076,
"learning_rate": 5e-05,
"loss": 1.6574,
"step": 1263
},
{
"epoch": 1.2752293577981653,
"grad_norm": 0.14881569196732872,
"learning_rate": 5e-05,
"loss": 1.626,
"step": 1264
},
{
"epoch": 1.276237523943946,
"grad_norm": 0.13311171207901276,
"learning_rate": 5e-05,
"loss": 1.6443,
"step": 1265
},
{
"epoch": 1.2772456900897269,
"grad_norm": 0.1252689256815893,
"learning_rate": 5e-05,
"loss": 1.6238,
"step": 1266
},
{
"epoch": 1.2782538562355077,
"grad_norm": 0.13341970118068097,
"learning_rate": 5e-05,
"loss": 1.6411,
"step": 1267
},
{
"epoch": 1.2792620223812885,
"grad_norm": 0.1324244420868155,
"learning_rate": 5e-05,
"loss": 1.6365,
"step": 1268
},
{
"epoch": 1.2802701885270693,
"grad_norm": 0.12881476074167852,
"learning_rate": 5e-05,
"loss": 1.652,
"step": 1269
},
{
"epoch": 1.2812783546728501,
"grad_norm": 0.12856209242490413,
"learning_rate": 5e-05,
"loss": 1.6406,
"step": 1270
},
{
"epoch": 1.282286520818631,
"grad_norm": 0.13208309716233654,
"learning_rate": 5e-05,
"loss": 1.6298,
"step": 1271
},
{
"epoch": 1.2832946869644117,
"grad_norm": 0.13939720107304857,
"learning_rate": 5e-05,
"loss": 1.6312,
"step": 1272
},
{
"epoch": 1.2843028531101925,
"grad_norm": 0.14248500778548934,
"learning_rate": 5e-05,
"loss": 1.6518,
"step": 1273
},
{
"epoch": 1.2853110192559734,
"grad_norm": 0.13631582079163687,
"learning_rate": 5e-05,
"loss": 1.6503,
"step": 1274
},
{
"epoch": 1.2863191854017542,
"grad_norm": 0.5103305312002676,
"learning_rate": 5e-05,
"loss": 1.6421,
"step": 1275
},
{
"epoch": 1.287327351547535,
"grad_norm": 0.12896301140990832,
"learning_rate": 5e-05,
"loss": 1.6398,
"step": 1276
},
{
"epoch": 1.2883355176933158,
"grad_norm": 0.12142844514497131,
"learning_rate": 5e-05,
"loss": 1.6239,
"step": 1277
},
{
"epoch": 1.2893436838390966,
"grad_norm": 0.13477045845515837,
"learning_rate": 5e-05,
"loss": 1.6259,
"step": 1278
},
{
"epoch": 1.2903518499848774,
"grad_norm": 0.13139882642042439,
"learning_rate": 5e-05,
"loss": 1.6465,
"step": 1279
},
{
"epoch": 1.2913600161306582,
"grad_norm": 0.1351573799385729,
"learning_rate": 5e-05,
"loss": 1.6385,
"step": 1280
},
{
"epoch": 1.2923681822764392,
"grad_norm": 0.13532575707387254,
"learning_rate": 5e-05,
"loss": 1.655,
"step": 1281
},
{
"epoch": 1.29337634842222,
"grad_norm": 0.15211481123741347,
"learning_rate": 5e-05,
"loss": 1.6241,
"step": 1282
},
{
"epoch": 1.2943845145680009,
"grad_norm": 0.14059638838644756,
"learning_rate": 5e-05,
"loss": 1.6411,
"step": 1283
},
{
"epoch": 1.2953926807137817,
"grad_norm": 0.1478517568247696,
"learning_rate": 5e-05,
"loss": 1.6185,
"step": 1284
},
{
"epoch": 1.2964008468595625,
"grad_norm": 0.13518147900413588,
"learning_rate": 5e-05,
"loss": 1.6549,
"step": 1285
},
{
"epoch": 1.2974090130053433,
"grad_norm": 0.14127263362808326,
"learning_rate": 5e-05,
"loss": 1.6169,
"step": 1286
},
{
"epoch": 1.298417179151124,
"grad_norm": 0.1435161140567047,
"learning_rate": 5e-05,
"loss": 1.6505,
"step": 1287
},
{
"epoch": 1.299425345296905,
"grad_norm": 0.262970067626453,
"learning_rate": 5e-05,
"loss": 1.6175,
"step": 1288
},
{
"epoch": 1.3004335114426857,
"grad_norm": 0.14386872816206583,
"learning_rate": 5e-05,
"loss": 1.6399,
"step": 1289
},
{
"epoch": 1.3014416775884665,
"grad_norm": 0.13310896853865345,
"learning_rate": 5e-05,
"loss": 1.6423,
"step": 1290
},
{
"epoch": 1.3024498437342473,
"grad_norm": 0.12947071002773367,
"learning_rate": 5e-05,
"loss": 1.6238,
"step": 1291
},
{
"epoch": 1.3034580098800284,
"grad_norm": 0.13778793067032488,
"learning_rate": 5e-05,
"loss": 1.6463,
"step": 1292
},
{
"epoch": 1.3044661760258092,
"grad_norm": 0.12863210061676175,
"learning_rate": 5e-05,
"loss": 1.6515,
"step": 1293
},
{
"epoch": 1.30547434217159,
"grad_norm": 0.1400329870431008,
"learning_rate": 5e-05,
"loss": 1.6384,
"step": 1294
},
{
"epoch": 1.3064825083173708,
"grad_norm": 0.14907669819312522,
"learning_rate": 5e-05,
"loss": 1.6547,
"step": 1295
},
{
"epoch": 1.3074906744631516,
"grad_norm": 0.13952657031737695,
"learning_rate": 5e-05,
"loss": 1.6494,
"step": 1296
},
{
"epoch": 1.3084988406089324,
"grad_norm": 0.14408484354195883,
"learning_rate": 5e-05,
"loss": 1.6359,
"step": 1297
},
{
"epoch": 1.3095070067547132,
"grad_norm": 0.14741323470832812,
"learning_rate": 5e-05,
"loss": 1.6414,
"step": 1298
},
{
"epoch": 1.310515172900494,
"grad_norm": 0.14470601610286588,
"learning_rate": 5e-05,
"loss": 1.6524,
"step": 1299
},
{
"epoch": 1.3115233390462748,
"grad_norm": 0.13677564741215217,
"learning_rate": 5e-05,
"loss": 1.6182,
"step": 1300
},
{
"epoch": 1.3125315051920556,
"grad_norm": 0.1509685257674073,
"learning_rate": 5e-05,
"loss": 1.6505,
"step": 1301
},
{
"epoch": 1.3135396713378364,
"grad_norm": 0.1379807694079571,
"learning_rate": 5e-05,
"loss": 1.6241,
"step": 1302
},
{
"epoch": 1.3145478374836173,
"grad_norm": 0.14359330330139006,
"learning_rate": 5e-05,
"loss": 1.6485,
"step": 1303
},
{
"epoch": 1.315556003629398,
"grad_norm": 0.1380525441991341,
"learning_rate": 5e-05,
"loss": 1.6367,
"step": 1304
},
{
"epoch": 1.3165641697751789,
"grad_norm": 0.12872879401596335,
"learning_rate": 5e-05,
"loss": 1.6273,
"step": 1305
},
{
"epoch": 1.3175723359209597,
"grad_norm": 0.15181805868172427,
"learning_rate": 5e-05,
"loss": 1.6312,
"step": 1306
},
{
"epoch": 1.3185805020667405,
"grad_norm": 0.13761906701342136,
"learning_rate": 5e-05,
"loss": 1.6308,
"step": 1307
},
{
"epoch": 1.3195886682125213,
"grad_norm": 0.13196832773553202,
"learning_rate": 5e-05,
"loss": 1.6312,
"step": 1308
},
{
"epoch": 1.3205968343583023,
"grad_norm": 0.14610514452766105,
"learning_rate": 5e-05,
"loss": 1.6492,
"step": 1309
},
{
"epoch": 1.3216050005040831,
"grad_norm": 0.13794310427945244,
"learning_rate": 5e-05,
"loss": 1.6377,
"step": 1310
},
{
"epoch": 1.322613166649864,
"grad_norm": 0.13378727396876383,
"learning_rate": 5e-05,
"loss": 1.626,
"step": 1311
},
{
"epoch": 1.3236213327956448,
"grad_norm": 0.13461401880559,
"learning_rate": 5e-05,
"loss": 1.6367,
"step": 1312
},
{
"epoch": 1.3246294989414256,
"grad_norm": 0.1466088079580074,
"learning_rate": 5e-05,
"loss": 1.6206,
"step": 1313
},
{
"epoch": 1.3256376650872064,
"grad_norm": 0.12716896573606204,
"learning_rate": 5e-05,
"loss": 1.6464,
"step": 1314
},
{
"epoch": 1.3266458312329872,
"grad_norm": 0.14687168001753023,
"learning_rate": 5e-05,
"loss": 1.6417,
"step": 1315
},
{
"epoch": 1.327653997378768,
"grad_norm": 0.1472725540417992,
"learning_rate": 5e-05,
"loss": 1.6463,
"step": 1316
},
{
"epoch": 1.3286621635245488,
"grad_norm": 0.2813113312514839,
"learning_rate": 5e-05,
"loss": 1.6317,
"step": 1317
},
{
"epoch": 1.3296703296703296,
"grad_norm": 0.14714981053972961,
"learning_rate": 5e-05,
"loss": 1.6495,
"step": 1318
},
{
"epoch": 1.3306784958161104,
"grad_norm": 0.14529120808295362,
"learning_rate": 5e-05,
"loss": 1.6332,
"step": 1319
},
{
"epoch": 1.3316866619618914,
"grad_norm": 0.1266252330698273,
"learning_rate": 5e-05,
"loss": 1.6266,
"step": 1320
},
{
"epoch": 1.3326948281076723,
"grad_norm": 0.13448730108032922,
"learning_rate": 5e-05,
"loss": 1.6386,
"step": 1321
},
{
"epoch": 1.333702994253453,
"grad_norm": 0.14425479757866708,
"learning_rate": 5e-05,
"loss": 1.6537,
"step": 1322
},
{
"epoch": 1.3347111603992339,
"grad_norm": 0.1367216965708987,
"learning_rate": 5e-05,
"loss": 1.6413,
"step": 1323
},
{
"epoch": 1.3357193265450147,
"grad_norm": 0.13294074465376157,
"learning_rate": 5e-05,
"loss": 1.623,
"step": 1324
},
{
"epoch": 1.3367274926907955,
"grad_norm": 0.13725476866885875,
"learning_rate": 5e-05,
"loss": 1.6438,
"step": 1325
},
{
"epoch": 1.3377356588365763,
"grad_norm": 0.13523299845821615,
"learning_rate": 5e-05,
"loss": 1.6393,
"step": 1326
},
{
"epoch": 1.3387438249823571,
"grad_norm": 0.12945209128402962,
"learning_rate": 5e-05,
"loss": 1.6244,
"step": 1327
},
{
"epoch": 1.339751991128138,
"grad_norm": 0.1418573980815489,
"learning_rate": 5e-05,
"loss": 1.6448,
"step": 1328
},
{
"epoch": 1.3407601572739187,
"grad_norm": 0.1439901458961193,
"learning_rate": 5e-05,
"loss": 1.6268,
"step": 1329
},
{
"epoch": 1.3417683234196995,
"grad_norm": 0.13282615605008152,
"learning_rate": 5e-05,
"loss": 1.625,
"step": 1330
},
{
"epoch": 1.3427764895654803,
"grad_norm": 0.14182252901536777,
"learning_rate": 5e-05,
"loss": 1.6522,
"step": 1331
},
{
"epoch": 1.3437846557112612,
"grad_norm": 0.1322659099290378,
"learning_rate": 5e-05,
"loss": 1.6484,
"step": 1332
},
{
"epoch": 1.344792821857042,
"grad_norm": 0.14218642066492232,
"learning_rate": 5e-05,
"loss": 1.621,
"step": 1333
},
{
"epoch": 1.3458009880028228,
"grad_norm": 0.14964017176346037,
"learning_rate": 5e-05,
"loss": 1.6405,
"step": 1334
},
{
"epoch": 1.3468091541486036,
"grad_norm": 0.1285209866495888,
"learning_rate": 5e-05,
"loss": 1.6267,
"step": 1335
},
{
"epoch": 1.3478173202943844,
"grad_norm": 0.13753938035954444,
"learning_rate": 5e-05,
"loss": 1.645,
"step": 1336
},
{
"epoch": 1.3488254864401654,
"grad_norm": 0.13202352215553254,
"learning_rate": 5e-05,
"loss": 1.6327,
"step": 1337
},
{
"epoch": 1.3498336525859462,
"grad_norm": 0.13901724623518463,
"learning_rate": 5e-05,
"loss": 1.6554,
"step": 1338
},
{
"epoch": 1.350841818731727,
"grad_norm": 0.1295242891524851,
"learning_rate": 5e-05,
"loss": 1.6303,
"step": 1339
},
{
"epoch": 1.3518499848775078,
"grad_norm": 0.12942845751350548,
"learning_rate": 5e-05,
"loss": 1.644,
"step": 1340
},
{
"epoch": 1.3528581510232887,
"grad_norm": 0.1423410306591456,
"learning_rate": 5e-05,
"loss": 1.63,
"step": 1341
},
{
"epoch": 1.3538663171690695,
"grad_norm": 0.13302313269191274,
"learning_rate": 5e-05,
"loss": 1.6114,
"step": 1342
},
{
"epoch": 1.3548744833148503,
"grad_norm": 0.12861793129843543,
"learning_rate": 5e-05,
"loss": 1.6396,
"step": 1343
},
{
"epoch": 1.355882649460631,
"grad_norm": 0.1402971078531711,
"learning_rate": 5e-05,
"loss": 1.6351,
"step": 1344
},
{
"epoch": 1.356890815606412,
"grad_norm": 0.14298016010767073,
"learning_rate": 5e-05,
"loss": 1.6259,
"step": 1345
},
{
"epoch": 1.3578989817521927,
"grad_norm": 0.12968525696099564,
"learning_rate": 5e-05,
"loss": 1.6244,
"step": 1346
},
{
"epoch": 1.3589071478979735,
"grad_norm": 0.13672510109372069,
"learning_rate": 5e-05,
"loss": 1.6124,
"step": 1347
},
{
"epoch": 1.3599153140437545,
"grad_norm": 0.1369770021713587,
"learning_rate": 5e-05,
"loss": 1.6213,
"step": 1348
},
{
"epoch": 1.3609234801895354,
"grad_norm": 0.14972171375276808,
"learning_rate": 5e-05,
"loss": 1.6504,
"step": 1349
},
{
"epoch": 1.3619316463353162,
"grad_norm": 0.13548138877666627,
"learning_rate": 5e-05,
"loss": 1.6246,
"step": 1350
},
{
"epoch": 1.362939812481097,
"grad_norm": 0.12731452256887532,
"learning_rate": 5e-05,
"loss": 1.6348,
"step": 1351
},
{
"epoch": 1.3639479786268778,
"grad_norm": 0.13106154830975245,
"learning_rate": 5e-05,
"loss": 1.6252,
"step": 1352
},
{
"epoch": 1.3649561447726586,
"grad_norm": 0.12384538188385992,
"learning_rate": 5e-05,
"loss": 1.6318,
"step": 1353
},
{
"epoch": 1.3659643109184394,
"grad_norm": 0.124405294327545,
"learning_rate": 5e-05,
"loss": 1.6235,
"step": 1354
},
{
"epoch": 1.3669724770642202,
"grad_norm": 0.12918504823168622,
"learning_rate": 5e-05,
"loss": 1.6256,
"step": 1355
},
{
"epoch": 1.367980643210001,
"grad_norm": 0.12337217263498097,
"learning_rate": 5e-05,
"loss": 1.6166,
"step": 1356
},
{
"epoch": 1.3689888093557818,
"grad_norm": 0.12962612029855872,
"learning_rate": 5e-05,
"loss": 1.6329,
"step": 1357
},
{
"epoch": 1.3699969755015626,
"grad_norm": 0.12086261928461686,
"learning_rate": 5e-05,
"loss": 1.6468,
"step": 1358
},
{
"epoch": 1.3710051416473434,
"grad_norm": 0.12948482979477746,
"learning_rate": 5e-05,
"loss": 1.6402,
"step": 1359
},
{
"epoch": 1.3720133077931242,
"grad_norm": 0.125839711380269,
"learning_rate": 5e-05,
"loss": 1.6469,
"step": 1360
},
{
"epoch": 1.373021473938905,
"grad_norm": 0.12573772766040806,
"learning_rate": 5e-05,
"loss": 1.6168,
"step": 1361
},
{
"epoch": 1.3740296400846859,
"grad_norm": 0.1290446046069406,
"learning_rate": 5e-05,
"loss": 1.6276,
"step": 1362
},
{
"epoch": 1.3750378062304667,
"grad_norm": 0.13146037117405457,
"learning_rate": 5e-05,
"loss": 1.6368,
"step": 1363
},
{
"epoch": 1.3760459723762475,
"grad_norm": 0.12755209258029462,
"learning_rate": 5e-05,
"loss": 1.6457,
"step": 1364
},
{
"epoch": 1.3770541385220283,
"grad_norm": 0.13647382863573795,
"learning_rate": 5e-05,
"loss": 1.6253,
"step": 1365
},
{
"epoch": 1.3780623046678093,
"grad_norm": 0.12246578130156244,
"learning_rate": 5e-05,
"loss": 1.6187,
"step": 1366
},
{
"epoch": 1.3790704708135901,
"grad_norm": 0.12975737421478123,
"learning_rate": 5e-05,
"loss": 1.6388,
"step": 1367
},
{
"epoch": 1.380078636959371,
"grad_norm": 0.12116689228695646,
"learning_rate": 5e-05,
"loss": 1.6268,
"step": 1368
},
{
"epoch": 1.3810868031051518,
"grad_norm": 0.13078737733785906,
"learning_rate": 5e-05,
"loss": 1.624,
"step": 1369
},
{
"epoch": 1.3820949692509326,
"grad_norm": 0.13726230514420107,
"learning_rate": 5e-05,
"loss": 1.6562,
"step": 1370
},
{
"epoch": 1.3831031353967134,
"grad_norm": 0.1260785537305176,
"learning_rate": 5e-05,
"loss": 1.6444,
"step": 1371
},
{
"epoch": 1.3841113015424942,
"grad_norm": 0.13315635100403078,
"learning_rate": 5e-05,
"loss": 1.6475,
"step": 1372
},
{
"epoch": 1.385119467688275,
"grad_norm": 0.13844883164175295,
"learning_rate": 5e-05,
"loss": 1.6258,
"step": 1373
},
{
"epoch": 1.3861276338340558,
"grad_norm": 0.12683794742559432,
"learning_rate": 5e-05,
"loss": 1.6413,
"step": 1374
},
{
"epoch": 1.3871357999798366,
"grad_norm": 0.12598536946678948,
"learning_rate": 5e-05,
"loss": 1.629,
"step": 1375
},
{
"epoch": 1.3881439661256176,
"grad_norm": 0.12784468233003735,
"learning_rate": 5e-05,
"loss": 1.6074,
"step": 1376
},
{
"epoch": 1.3891521322713984,
"grad_norm": 0.1312115590212781,
"learning_rate": 5e-05,
"loss": 1.6326,
"step": 1377
},
{
"epoch": 1.3901602984171793,
"grad_norm": 0.12983030111730282,
"learning_rate": 5e-05,
"loss": 1.6264,
"step": 1378
},
{
"epoch": 1.39116846456296,
"grad_norm": 0.21725924278257416,
"learning_rate": 5e-05,
"loss": 1.6359,
"step": 1379
},
{
"epoch": 1.3921766307087409,
"grad_norm": 0.1332941952033477,
"learning_rate": 5e-05,
"loss": 1.6322,
"step": 1380
},
{
"epoch": 1.3931847968545217,
"grad_norm": 0.13413967271869495,
"learning_rate": 5e-05,
"loss": 1.6144,
"step": 1381
},
{
"epoch": 1.3941929630003025,
"grad_norm": 0.14430992905252235,
"learning_rate": 5e-05,
"loss": 1.6407,
"step": 1382
},
{
"epoch": 1.3952011291460833,
"grad_norm": 0.13923694142317142,
"learning_rate": 5e-05,
"loss": 1.6267,
"step": 1383
},
{
"epoch": 1.396209295291864,
"grad_norm": 0.12797099248284885,
"learning_rate": 5e-05,
"loss": 1.6018,
"step": 1384
},
{
"epoch": 1.397217461437645,
"grad_norm": 0.1307895540629942,
"learning_rate": 5e-05,
"loss": 1.6366,
"step": 1385
},
{
"epoch": 1.3982256275834257,
"grad_norm": 0.12571320908006497,
"learning_rate": 5e-05,
"loss": 1.62,
"step": 1386
},
{
"epoch": 1.3992337937292065,
"grad_norm": 0.3542423664590062,
"learning_rate": 5e-05,
"loss": 1.6325,
"step": 1387
},
{
"epoch": 1.4002419598749873,
"grad_norm": 0.12876602312218682,
"learning_rate": 5e-05,
"loss": 1.6416,
"step": 1388
},
{
"epoch": 1.4012501260207681,
"grad_norm": 0.13824360520518839,
"learning_rate": 5e-05,
"loss": 1.6345,
"step": 1389
},
{
"epoch": 1.402258292166549,
"grad_norm": 0.121799646242387,
"learning_rate": 5e-05,
"loss": 1.6179,
"step": 1390
},
{
"epoch": 1.4032664583123298,
"grad_norm": 0.13664100958490472,
"learning_rate": 5e-05,
"loss": 1.641,
"step": 1391
},
{
"epoch": 1.4042746244581106,
"grad_norm": 0.12576756467632957,
"learning_rate": 5e-05,
"loss": 1.6237,
"step": 1392
},
{
"epoch": 1.4052827906038914,
"grad_norm": 0.1439018295437482,
"learning_rate": 5e-05,
"loss": 1.6273,
"step": 1393
},
{
"epoch": 1.4062909567496724,
"grad_norm": 0.13003815870667002,
"learning_rate": 5e-05,
"loss": 1.6219,
"step": 1394
},
{
"epoch": 1.4072991228954532,
"grad_norm": 0.13687971905369478,
"learning_rate": 5e-05,
"loss": 1.6197,
"step": 1395
},
{
"epoch": 1.408307289041234,
"grad_norm": 0.1375315943365244,
"learning_rate": 5e-05,
"loss": 1.6364,
"step": 1396
},
{
"epoch": 1.4093154551870148,
"grad_norm": 0.13327720464715445,
"learning_rate": 5e-05,
"loss": 1.623,
"step": 1397
},
{
"epoch": 1.4103236213327957,
"grad_norm": 0.13341972541206865,
"learning_rate": 5e-05,
"loss": 1.6217,
"step": 1398
},
{
"epoch": 1.4113317874785765,
"grad_norm": 0.13173002538223347,
"learning_rate": 5e-05,
"loss": 1.6118,
"step": 1399
},
{
"epoch": 1.4123399536243573,
"grad_norm": 0.1440735510707323,
"learning_rate": 5e-05,
"loss": 1.6423,
"step": 1400
},
{
"epoch": 1.413348119770138,
"grad_norm": 0.12973621264990803,
"learning_rate": 5e-05,
"loss": 1.6091,
"step": 1401
},
{
"epoch": 1.4143562859159189,
"grad_norm": 0.1387503285921373,
"learning_rate": 5e-05,
"loss": 1.6255,
"step": 1402
},
{
"epoch": 1.4153644520616997,
"grad_norm": 0.1287204365133802,
"learning_rate": 5e-05,
"loss": 1.6251,
"step": 1403
},
{
"epoch": 1.4163726182074807,
"grad_norm": 0.13824194326191094,
"learning_rate": 5e-05,
"loss": 1.6326,
"step": 1404
},
{
"epoch": 1.4173807843532615,
"grad_norm": 0.14030055845752487,
"learning_rate": 5e-05,
"loss": 1.6248,
"step": 1405
},
{
"epoch": 1.4183889504990423,
"grad_norm": 0.1327154607182318,
"learning_rate": 5e-05,
"loss": 1.6165,
"step": 1406
},
{
"epoch": 1.4193971166448232,
"grad_norm": 0.14178972226561212,
"learning_rate": 5e-05,
"loss": 1.614,
"step": 1407
},
{
"epoch": 1.420405282790604,
"grad_norm": 0.12894624684755449,
"learning_rate": 5e-05,
"loss": 1.6351,
"step": 1408
},
{
"epoch": 1.4214134489363848,
"grad_norm": 0.15528603445217812,
"learning_rate": 5e-05,
"loss": 1.6523,
"step": 1409
},
{
"epoch": 1.4224216150821656,
"grad_norm": 0.1353781562647246,
"learning_rate": 5e-05,
"loss": 1.6179,
"step": 1410
},
{
"epoch": 1.4234297812279464,
"grad_norm": 0.13026314546525541,
"learning_rate": 5e-05,
"loss": 1.6151,
"step": 1411
},
{
"epoch": 1.4244379473737272,
"grad_norm": 0.13180261150357153,
"learning_rate": 5e-05,
"loss": 1.6429,
"step": 1412
},
{
"epoch": 1.425446113519508,
"grad_norm": 0.13287377471543904,
"learning_rate": 5e-05,
"loss": 1.6339,
"step": 1413
},
{
"epoch": 1.4264542796652888,
"grad_norm": 0.12721440160192868,
"learning_rate": 5e-05,
"loss": 1.6333,
"step": 1414
},
{
"epoch": 1.4274624458110696,
"grad_norm": 0.12732334827178118,
"learning_rate": 5e-05,
"loss": 1.6236,
"step": 1415
},
{
"epoch": 1.4284706119568504,
"grad_norm": 0.13937418837251475,
"learning_rate": 5e-05,
"loss": 1.6365,
"step": 1416
},
{
"epoch": 1.4294787781026312,
"grad_norm": 0.13917627454938042,
"learning_rate": 5e-05,
"loss": 1.6504,
"step": 1417
},
{
"epoch": 1.430486944248412,
"grad_norm": 0.13390358153558804,
"learning_rate": 5e-05,
"loss": 1.6468,
"step": 1418
},
{
"epoch": 1.4314951103941929,
"grad_norm": 0.12976181090918484,
"learning_rate": 5e-05,
"loss": 1.6382,
"step": 1419
},
{
"epoch": 1.4325032765399737,
"grad_norm": 0.13271624982794075,
"learning_rate": 5e-05,
"loss": 1.6171,
"step": 1420
},
{
"epoch": 1.4335114426857545,
"grad_norm": 0.1705925536645093,
"learning_rate": 5e-05,
"loss": 1.6127,
"step": 1421
},
{
"epoch": 1.4345196088315355,
"grad_norm": 0.13810164939076577,
"learning_rate": 5e-05,
"loss": 1.6375,
"step": 1422
},
{
"epoch": 1.4355277749773163,
"grad_norm": 0.15063906545851247,
"learning_rate": 5e-05,
"loss": 1.6106,
"step": 1423
},
{
"epoch": 1.4365359411230971,
"grad_norm": 0.13828434307491985,
"learning_rate": 5e-05,
"loss": 1.6278,
"step": 1424
},
{
"epoch": 1.437544107268878,
"grad_norm": 0.12320067761427174,
"learning_rate": 5e-05,
"loss": 1.6611,
"step": 1425
},
{
"epoch": 1.4385522734146587,
"grad_norm": 0.1473590960932424,
"learning_rate": 5e-05,
"loss": 1.6326,
"step": 1426
},
{
"epoch": 1.4395604395604396,
"grad_norm": 0.1429247807137141,
"learning_rate": 5e-05,
"loss": 1.6419,
"step": 1427
},
{
"epoch": 1.4405686057062204,
"grad_norm": 0.15239036053934374,
"learning_rate": 5e-05,
"loss": 1.6406,
"step": 1428
},
{
"epoch": 1.4415767718520012,
"grad_norm": 0.14066773498783475,
"learning_rate": 5e-05,
"loss": 1.6305,
"step": 1429
},
{
"epoch": 1.442584937997782,
"grad_norm": 0.15385686605084753,
"learning_rate": 5e-05,
"loss": 1.6213,
"step": 1430
},
{
"epoch": 1.4435931041435628,
"grad_norm": 0.13581356581239723,
"learning_rate": 5e-05,
"loss": 1.6371,
"step": 1431
},
{
"epoch": 1.4446012702893438,
"grad_norm": 0.15507768261137825,
"learning_rate": 5e-05,
"loss": 1.6354,
"step": 1432
},
{
"epoch": 1.4456094364351246,
"grad_norm": 0.13224979284586927,
"learning_rate": 5e-05,
"loss": 1.6474,
"step": 1433
},
{
"epoch": 1.4466176025809054,
"grad_norm": 0.1382288574532829,
"learning_rate": 5e-05,
"loss": 1.6344,
"step": 1434
},
{
"epoch": 1.4476257687266862,
"grad_norm": 0.13584124436539982,
"learning_rate": 5e-05,
"loss": 1.6154,
"step": 1435
},
{
"epoch": 1.448633934872467,
"grad_norm": 0.13671329188776624,
"learning_rate": 5e-05,
"loss": 1.6231,
"step": 1436
},
{
"epoch": 1.4496421010182479,
"grad_norm": 0.13666691741442263,
"learning_rate": 5e-05,
"loss": 1.6311,
"step": 1437
},
{
"epoch": 1.4506502671640287,
"grad_norm": 0.13064295993930966,
"learning_rate": 5e-05,
"loss": 1.6414,
"step": 1438
},
{
"epoch": 1.4516584333098095,
"grad_norm": 0.1443609737427036,
"learning_rate": 5e-05,
"loss": 1.639,
"step": 1439
},
{
"epoch": 1.4526665994555903,
"grad_norm": 0.1263187296916393,
"learning_rate": 5e-05,
"loss": 1.6262,
"step": 1440
},
{
"epoch": 1.453674765601371,
"grad_norm": 0.1404759867837024,
"learning_rate": 5e-05,
"loss": 1.6179,
"step": 1441
},
{
"epoch": 1.454682931747152,
"grad_norm": 0.13926923655101497,
"learning_rate": 5e-05,
"loss": 1.6274,
"step": 1442
},
{
"epoch": 1.4556910978929327,
"grad_norm": 0.14531337277887243,
"learning_rate": 5e-05,
"loss": 1.6167,
"step": 1443
},
{
"epoch": 1.4566992640387135,
"grad_norm": 0.14131640008793925,
"learning_rate": 5e-05,
"loss": 1.626,
"step": 1444
},
{
"epoch": 1.4577074301844943,
"grad_norm": 0.13671686790875545,
"learning_rate": 5e-05,
"loss": 1.6365,
"step": 1445
},
{
"epoch": 1.4587155963302751,
"grad_norm": 0.15206688858491618,
"learning_rate": 5e-05,
"loss": 1.6255,
"step": 1446
},
{
"epoch": 1.459723762476056,
"grad_norm": 0.13153412143553034,
"learning_rate": 5e-05,
"loss": 1.6154,
"step": 1447
},
{
"epoch": 1.4607319286218368,
"grad_norm": 0.14583017658822006,
"learning_rate": 5e-05,
"loss": 1.6436,
"step": 1448
},
{
"epoch": 1.4617400947676176,
"grad_norm": 0.13210222622246628,
"learning_rate": 5e-05,
"loss": 1.6285,
"step": 1449
},
{
"epoch": 1.4627482609133986,
"grad_norm": 0.1377997855343899,
"learning_rate": 5e-05,
"loss": 1.6456,
"step": 1450
},
{
"epoch": 1.4637564270591794,
"grad_norm": 0.15205802244172517,
"learning_rate": 5e-05,
"loss": 1.6382,
"step": 1451
},
{
"epoch": 1.4647645932049602,
"grad_norm": 0.12713100923946807,
"learning_rate": 5e-05,
"loss": 1.6229,
"step": 1452
},
{
"epoch": 1.465772759350741,
"grad_norm": 0.14136573990700596,
"learning_rate": 5e-05,
"loss": 1.6351,
"step": 1453
},
{
"epoch": 1.4667809254965218,
"grad_norm": 0.13839984348763215,
"learning_rate": 5e-05,
"loss": 1.6235,
"step": 1454
},
{
"epoch": 1.4677890916423026,
"grad_norm": 0.13224270697703103,
"learning_rate": 5e-05,
"loss": 1.6127,
"step": 1455
},
{
"epoch": 1.4687972577880835,
"grad_norm": 0.13995029322894012,
"learning_rate": 5e-05,
"loss": 1.6212,
"step": 1456
},
{
"epoch": 1.4698054239338643,
"grad_norm": 0.1343268792694107,
"learning_rate": 5e-05,
"loss": 1.6363,
"step": 1457
},
{
"epoch": 1.470813590079645,
"grad_norm": 0.13654059463569093,
"learning_rate": 5e-05,
"loss": 1.633,
"step": 1458
},
{
"epoch": 1.4718217562254259,
"grad_norm": 0.13691595619242897,
"learning_rate": 5e-05,
"loss": 1.6474,
"step": 1459
},
{
"epoch": 1.472829922371207,
"grad_norm": 0.1408875473463007,
"learning_rate": 5e-05,
"loss": 1.63,
"step": 1460
},
{
"epoch": 1.4738380885169877,
"grad_norm": 0.14665577417113682,
"learning_rate": 5e-05,
"loss": 1.6092,
"step": 1461
},
{
"epoch": 1.4748462546627685,
"grad_norm": 0.13392872202872624,
"learning_rate": 5e-05,
"loss": 1.6277,
"step": 1462
},
{
"epoch": 1.4758544208085493,
"grad_norm": 0.13318288146628018,
"learning_rate": 5e-05,
"loss": 1.6309,
"step": 1463
},
{
"epoch": 1.4768625869543301,
"grad_norm": 0.14756630254079095,
"learning_rate": 5e-05,
"loss": 1.62,
"step": 1464
},
{
"epoch": 1.477870753100111,
"grad_norm": 0.14478366008638208,
"learning_rate": 5e-05,
"loss": 1.6137,
"step": 1465
},
{
"epoch": 1.4788789192458918,
"grad_norm": 0.1387406186352145,
"learning_rate": 5e-05,
"loss": 1.6236,
"step": 1466
},
{
"epoch": 1.4798870853916726,
"grad_norm": 0.14007315709845755,
"learning_rate": 5e-05,
"loss": 1.6437,
"step": 1467
},
{
"epoch": 1.4808952515374534,
"grad_norm": 0.14807940769005204,
"learning_rate": 5e-05,
"loss": 1.6185,
"step": 1468
},
{
"epoch": 1.4819034176832342,
"grad_norm": 0.13015841439441841,
"learning_rate": 5e-05,
"loss": 1.64,
"step": 1469
},
{
"epoch": 1.482911583829015,
"grad_norm": 0.13423951958450828,
"learning_rate": 5e-05,
"loss": 1.6196,
"step": 1470
},
{
"epoch": 1.4839197499747958,
"grad_norm": 0.15393810166949654,
"learning_rate": 5e-05,
"loss": 1.6009,
"step": 1471
},
{
"epoch": 1.4849279161205766,
"grad_norm": 0.13589415860343146,
"learning_rate": 5e-05,
"loss": 1.6219,
"step": 1472
},
{
"epoch": 1.4859360822663574,
"grad_norm": 0.14671625526864973,
"learning_rate": 5e-05,
"loss": 1.6211,
"step": 1473
},
{
"epoch": 1.4869442484121382,
"grad_norm": 0.1353917088551739,
"learning_rate": 5e-05,
"loss": 1.6267,
"step": 1474
},
{
"epoch": 1.487952414557919,
"grad_norm": 0.14107952075322783,
"learning_rate": 5e-05,
"loss": 1.6234,
"step": 1475
},
{
"epoch": 1.4889605807036999,
"grad_norm": 0.13491768117614777,
"learning_rate": 5e-05,
"loss": 1.619,
"step": 1476
},
{
"epoch": 1.4899687468494807,
"grad_norm": 0.19622895258627024,
"learning_rate": 5e-05,
"loss": 1.6411,
"step": 1477
},
{
"epoch": 1.4909769129952617,
"grad_norm": 0.13347997431566885,
"learning_rate": 5e-05,
"loss": 1.6306,
"step": 1478
},
{
"epoch": 1.4919850791410425,
"grad_norm": 0.1416800687994707,
"learning_rate": 5e-05,
"loss": 1.6341,
"step": 1479
},
{
"epoch": 1.4929932452868233,
"grad_norm": 0.13591874359876954,
"learning_rate": 5e-05,
"loss": 1.6441,
"step": 1480
},
{
"epoch": 1.4940014114326041,
"grad_norm": 0.1408069374827294,
"learning_rate": 5e-05,
"loss": 1.621,
"step": 1481
},
{
"epoch": 1.495009577578385,
"grad_norm": 0.14029513798691906,
"learning_rate": 5e-05,
"loss": 1.6326,
"step": 1482
},
{
"epoch": 1.4960177437241657,
"grad_norm": 0.15447150259956755,
"learning_rate": 5e-05,
"loss": 1.6438,
"step": 1483
},
{
"epoch": 1.4970259098699465,
"grad_norm": 0.15816124844116827,
"learning_rate": 5e-05,
"loss": 1.6183,
"step": 1484
},
{
"epoch": 1.4980340760157274,
"grad_norm": 0.12696159571269544,
"learning_rate": 5e-05,
"loss": 1.6361,
"step": 1485
},
{
"epoch": 1.4990422421615082,
"grad_norm": 0.13687446317253593,
"learning_rate": 5e-05,
"loss": 1.6362,
"step": 1486
},
{
"epoch": 1.5000504083072892,
"grad_norm": 0.19177985637700964,
"learning_rate": 5e-05,
"loss": 1.6092,
"step": 1487
},
{
"epoch": 1.50105857445307,
"grad_norm": 0.13815553297856198,
"learning_rate": 5e-05,
"loss": 1.6483,
"step": 1488
},
{
"epoch": 1.5020667405988508,
"grad_norm": 0.12867409727512602,
"learning_rate": 5e-05,
"loss": 1.6232,
"step": 1489
},
{
"epoch": 1.5030749067446316,
"grad_norm": 0.13876479312318743,
"learning_rate": 5e-05,
"loss": 1.6283,
"step": 1490
},
{
"epoch": 1.5040830728904124,
"grad_norm": 0.1380968512865023,
"learning_rate": 5e-05,
"loss": 1.6088,
"step": 1491
},
{
"epoch": 1.5050912390361932,
"grad_norm": 0.1387886106002393,
"learning_rate": 5e-05,
"loss": 1.6195,
"step": 1492
},
{
"epoch": 1.506099405181974,
"grad_norm": 0.13155001387555398,
"learning_rate": 5e-05,
"loss": 1.6282,
"step": 1493
},
{
"epoch": 1.5071075713277549,
"grad_norm": 0.14474568242324626,
"learning_rate": 5e-05,
"loss": 1.6174,
"step": 1494
},
{
"epoch": 1.5081157374735357,
"grad_norm": 0.1328261700337089,
"learning_rate": 5e-05,
"loss": 1.6371,
"step": 1495
},
{
"epoch": 1.5091239036193165,
"grad_norm": 0.13791902147936075,
"learning_rate": 5e-05,
"loss": 1.635,
"step": 1496
},
{
"epoch": 1.5101320697650973,
"grad_norm": 0.13810690850494428,
"learning_rate": 5e-05,
"loss": 1.6131,
"step": 1497
},
{
"epoch": 1.511140235910878,
"grad_norm": 0.13120526016158554,
"learning_rate": 5e-05,
"loss": 1.6164,
"step": 1498
},
{
"epoch": 1.512148402056659,
"grad_norm": 0.1287974235018955,
"learning_rate": 5e-05,
"loss": 1.6268,
"step": 1499
},
{
"epoch": 1.5131565682024397,
"grad_norm": 0.1378623308637562,
"learning_rate": 5e-05,
"loss": 1.6315,
"step": 1500
},
{
"epoch": 1.5141647343482205,
"grad_norm": 0.15571650444651786,
"learning_rate": 5e-05,
"loss": 1.6158,
"step": 1501
},
{
"epoch": 1.5151729004940013,
"grad_norm": 0.15196148508385474,
"learning_rate": 5e-05,
"loss": 1.6127,
"step": 1502
},
{
"epoch": 1.5161810666397821,
"grad_norm": 0.1291006666802701,
"learning_rate": 5e-05,
"loss": 1.6228,
"step": 1503
},
{
"epoch": 1.517189232785563,
"grad_norm": 0.1288597453694645,
"learning_rate": 5e-05,
"loss": 1.6281,
"step": 1504
},
{
"epoch": 1.5181973989313438,
"grad_norm": 0.13822665412671015,
"learning_rate": 5e-05,
"loss": 1.627,
"step": 1505
},
{
"epoch": 1.5192055650771246,
"grad_norm": 0.22934071247722015,
"learning_rate": 5e-05,
"loss": 1.64,
"step": 1506
},
{
"epoch": 1.5202137312229054,
"grad_norm": 0.1336341986385085,
"learning_rate": 5e-05,
"loss": 1.6038,
"step": 1507
},
{
"epoch": 1.5212218973686864,
"grad_norm": 0.14920723438783143,
"learning_rate": 5e-05,
"loss": 1.6242,
"step": 1508
},
{
"epoch": 1.5222300635144672,
"grad_norm": 0.1377984398583919,
"learning_rate": 5e-05,
"loss": 1.6109,
"step": 1509
},
{
"epoch": 1.523238229660248,
"grad_norm": 0.14474709932915958,
"learning_rate": 5e-05,
"loss": 1.6369,
"step": 1510
},
{
"epoch": 1.5242463958060288,
"grad_norm": 0.14597802367199372,
"learning_rate": 5e-05,
"loss": 1.6076,
"step": 1511
},
{
"epoch": 1.5252545619518096,
"grad_norm": 0.1835711819944948,
"learning_rate": 5e-05,
"loss": 1.6065,
"step": 1512
},
{
"epoch": 1.5262627280975904,
"grad_norm": 0.13796861080831385,
"learning_rate": 5e-05,
"loss": 1.6382,
"step": 1513
},
{
"epoch": 1.5272708942433713,
"grad_norm": 0.13180750342058548,
"learning_rate": 5e-05,
"loss": 1.6229,
"step": 1514
},
{
"epoch": 1.5282790603891523,
"grad_norm": 0.13830723804472403,
"learning_rate": 5e-05,
"loss": 1.6165,
"step": 1515
},
{
"epoch": 1.529287226534933,
"grad_norm": 0.14674607489303773,
"learning_rate": 5e-05,
"loss": 1.6209,
"step": 1516
},
{
"epoch": 1.530295392680714,
"grad_norm": 0.13829861312181452,
"learning_rate": 5e-05,
"loss": 1.6404,
"step": 1517
},
{
"epoch": 1.5313035588264947,
"grad_norm": 0.13531408327464506,
"learning_rate": 5e-05,
"loss": 1.6321,
"step": 1518
},
{
"epoch": 1.5323117249722755,
"grad_norm": 0.12793478847755976,
"learning_rate": 5e-05,
"loss": 1.6109,
"step": 1519
},
{
"epoch": 1.5333198911180563,
"grad_norm": 0.1366784633044243,
"learning_rate": 5e-05,
"loss": 1.6404,
"step": 1520
},
{
"epoch": 1.5343280572638371,
"grad_norm": 0.1283164088827875,
"learning_rate": 5e-05,
"loss": 1.6128,
"step": 1521
},
{
"epoch": 1.535336223409618,
"grad_norm": 0.1292907360896931,
"learning_rate": 5e-05,
"loss": 1.6265,
"step": 1522
},
{
"epoch": 1.5363443895553988,
"grad_norm": 0.12656329603863192,
"learning_rate": 5e-05,
"loss": 1.6251,
"step": 1523
},
{
"epoch": 1.5373525557011796,
"grad_norm": 0.1316052445071765,
"learning_rate": 5e-05,
"loss": 1.609,
"step": 1524
},
{
"epoch": 1.5383607218469604,
"grad_norm": 0.14252214471740876,
"learning_rate": 5e-05,
"loss": 1.6338,
"step": 1525
},
{
"epoch": 1.5393688879927412,
"grad_norm": 0.14870741117949973,
"learning_rate": 5e-05,
"loss": 1.6356,
"step": 1526
},
{
"epoch": 1.540377054138522,
"grad_norm": 0.13577012776225475,
"learning_rate": 5e-05,
"loss": 1.6212,
"step": 1527
},
{
"epoch": 1.5413852202843028,
"grad_norm": 0.17114483738951686,
"learning_rate": 5e-05,
"loss": 1.6024,
"step": 1528
},
{
"epoch": 1.5423933864300836,
"grad_norm": 0.14737308412560612,
"learning_rate": 5e-05,
"loss": 1.6286,
"step": 1529
},
{
"epoch": 1.5434015525758644,
"grad_norm": 0.1295264825941411,
"learning_rate": 5e-05,
"loss": 1.6222,
"step": 1530
},
{
"epoch": 1.5444097187216452,
"grad_norm": 0.14018320799858966,
"learning_rate": 5e-05,
"loss": 1.6295,
"step": 1531
},
{
"epoch": 1.545417884867426,
"grad_norm": 0.14609731368477788,
"learning_rate": 5e-05,
"loss": 1.6143,
"step": 1532
},
{
"epoch": 1.5464260510132068,
"grad_norm": 0.14330539697029224,
"learning_rate": 5e-05,
"loss": 1.6307,
"step": 1533
},
{
"epoch": 1.5474342171589877,
"grad_norm": 0.15111461104455878,
"learning_rate": 5e-05,
"loss": 1.6433,
"step": 1534
},
{
"epoch": 1.5484423833047685,
"grad_norm": 0.15868830823996302,
"learning_rate": 5e-05,
"loss": 1.6026,
"step": 1535
},
{
"epoch": 1.5494505494505495,
"grad_norm": 0.13936815230644864,
"learning_rate": 5e-05,
"loss": 1.6121,
"step": 1536
},
{
"epoch": 1.5504587155963303,
"grad_norm": 0.15699741936829822,
"learning_rate": 5e-05,
"loss": 1.6285,
"step": 1537
},
{
"epoch": 1.5514668817421111,
"grad_norm": 0.13811556094647703,
"learning_rate": 5e-05,
"loss": 1.621,
"step": 1538
},
{
"epoch": 1.552475047887892,
"grad_norm": 0.15020339457186493,
"learning_rate": 5e-05,
"loss": 1.6175,
"step": 1539
},
{
"epoch": 1.5534832140336727,
"grad_norm": 0.1369692638804312,
"learning_rate": 5e-05,
"loss": 1.6112,
"step": 1540
},
{
"epoch": 1.5544913801794535,
"grad_norm": 0.13858737929742926,
"learning_rate": 5e-05,
"loss": 1.6377,
"step": 1541
},
{
"epoch": 1.5554995463252344,
"grad_norm": 0.16480903243687473,
"learning_rate": 5e-05,
"loss": 1.6418,
"step": 1542
},
{
"epoch": 1.5565077124710154,
"grad_norm": 0.14958634326370873,
"learning_rate": 5e-05,
"loss": 1.6139,
"step": 1543
},
{
"epoch": 1.5575158786167962,
"grad_norm": 0.15298580974284656,
"learning_rate": 5e-05,
"loss": 1.6128,
"step": 1544
},
{
"epoch": 1.558524044762577,
"grad_norm": 0.13583289639406054,
"learning_rate": 5e-05,
"loss": 1.6103,
"step": 1545
},
{
"epoch": 1.5595322109083578,
"grad_norm": 0.1503120470346654,
"learning_rate": 5e-05,
"loss": 1.6166,
"step": 1546
},
{
"epoch": 1.5605403770541386,
"grad_norm": 0.1383639822102947,
"learning_rate": 5e-05,
"loss": 1.6147,
"step": 1547
},
{
"epoch": 1.5615485431999194,
"grad_norm": 0.14685023146454493,
"learning_rate": 5e-05,
"loss": 1.6219,
"step": 1548
},
{
"epoch": 1.5625567093457002,
"grad_norm": 0.1873494126741415,
"learning_rate": 5e-05,
"loss": 1.6167,
"step": 1549
},
{
"epoch": 1.563564875491481,
"grad_norm": 0.13611446482024203,
"learning_rate": 5e-05,
"loss": 1.6246,
"step": 1550
},
{
"epoch": 1.5645730416372619,
"grad_norm": 0.15741117634697316,
"learning_rate": 5e-05,
"loss": 1.619,
"step": 1551
},
{
"epoch": 1.5655812077830427,
"grad_norm": 0.147224170977167,
"learning_rate": 5e-05,
"loss": 1.6095,
"step": 1552
},
{
"epoch": 1.5665893739288235,
"grad_norm": 0.1577262032399655,
"learning_rate": 5e-05,
"loss": 1.6179,
"step": 1553
},
{
"epoch": 1.5675975400746043,
"grad_norm": 0.14858674477219846,
"learning_rate": 5e-05,
"loss": 1.6023,
"step": 1554
},
{
"epoch": 1.568605706220385,
"grad_norm": 0.13688153875311357,
"learning_rate": 5e-05,
"loss": 1.6226,
"step": 1555
},
{
"epoch": 1.569613872366166,
"grad_norm": 0.15426478611617334,
"learning_rate": 5e-05,
"loss": 1.6111,
"step": 1556
},
{
"epoch": 1.5706220385119467,
"grad_norm": 0.13637312594724105,
"learning_rate": 5e-05,
"loss": 1.6286,
"step": 1557
},
{
"epoch": 1.5716302046577275,
"grad_norm": 0.1668337456049804,
"learning_rate": 5e-05,
"loss": 1.6216,
"step": 1558
},
{
"epoch": 1.5726383708035083,
"grad_norm": 0.13939660345064128,
"learning_rate": 5e-05,
"loss": 1.6185,
"step": 1559
},
{
"epoch": 1.5736465369492891,
"grad_norm": 0.5399345687431912,
"learning_rate": 5e-05,
"loss": 1.6361,
"step": 1560
},
{
"epoch": 1.57465470309507,
"grad_norm": 0.1521584080169154,
"learning_rate": 5e-05,
"loss": 1.6053,
"step": 1561
},
{
"epoch": 1.5756628692408507,
"grad_norm": 0.13935279315517085,
"learning_rate": 5e-05,
"loss": 1.6247,
"step": 1562
},
{
"epoch": 1.5766710353866316,
"grad_norm": 0.13475960826393785,
"learning_rate": 5e-05,
"loss": 1.6336,
"step": 1563
},
{
"epoch": 1.5776792015324126,
"grad_norm": 0.13736336068453353,
"learning_rate": 5e-05,
"loss": 1.6266,
"step": 1564
},
{
"epoch": 1.5786873676781934,
"grad_norm": 0.14005434359348037,
"learning_rate": 5e-05,
"loss": 1.6098,
"step": 1565
},
{
"epoch": 1.5796955338239742,
"grad_norm": 0.1409911202784141,
"learning_rate": 5e-05,
"loss": 1.6524,
"step": 1566
},
{
"epoch": 1.580703699969755,
"grad_norm": 0.14734554570710956,
"learning_rate": 5e-05,
"loss": 1.6165,
"step": 1567
},
{
"epoch": 1.5817118661155358,
"grad_norm": 0.13213475979971628,
"learning_rate": 5e-05,
"loss": 1.6408,
"step": 1568
},
{
"epoch": 1.5827200322613166,
"grad_norm": 0.14975068515074358,
"learning_rate": 5e-05,
"loss": 1.6084,
"step": 1569
},
{
"epoch": 1.5837281984070974,
"grad_norm": 0.6523063563005193,
"learning_rate": 5e-05,
"loss": 1.6404,
"step": 1570
},
{
"epoch": 1.5847363645528785,
"grad_norm": 0.14186737586646578,
"learning_rate": 5e-05,
"loss": 1.6075,
"step": 1571
},
{
"epoch": 1.5857445306986593,
"grad_norm": 0.30669830762249223,
"learning_rate": 5e-05,
"loss": 1.6185,
"step": 1572
},
{
"epoch": 1.58675269684444,
"grad_norm": 0.1390697291644904,
"learning_rate": 5e-05,
"loss": 1.6216,
"step": 1573
},
{
"epoch": 1.587760862990221,
"grad_norm": 0.1345479360720834,
"learning_rate": 5e-05,
"loss": 1.6267,
"step": 1574
},
{
"epoch": 1.5887690291360017,
"grad_norm": 0.1335398629343587,
"learning_rate": 5e-05,
"loss": 1.6126,
"step": 1575
},
{
"epoch": 1.5897771952817825,
"grad_norm": 0.47436425719803493,
"learning_rate": 5e-05,
"loss": 1.6169,
"step": 1576
},
{
"epoch": 1.5907853614275633,
"grad_norm": 0.14245577008322435,
"learning_rate": 5e-05,
"loss": 1.6284,
"step": 1577
},
{
"epoch": 1.5917935275733441,
"grad_norm": 0.14938912417150504,
"learning_rate": 5e-05,
"loss": 1.6299,
"step": 1578
},
{
"epoch": 1.592801693719125,
"grad_norm": 0.14006970763748072,
"learning_rate": 5e-05,
"loss": 1.6089,
"step": 1579
},
{
"epoch": 1.5938098598649058,
"grad_norm": 0.15149158584758154,
"learning_rate": 5e-05,
"loss": 1.608,
"step": 1580
},
{
"epoch": 1.5948180260106866,
"grad_norm": 0.13996334020731915,
"learning_rate": 5e-05,
"loss": 1.6207,
"step": 1581
},
{
"epoch": 1.5958261921564674,
"grad_norm": 0.14878426743491646,
"learning_rate": 5e-05,
"loss": 1.6129,
"step": 1582
},
{
"epoch": 1.5968343583022482,
"grad_norm": 0.13983441814569922,
"learning_rate": 5e-05,
"loss": 1.6256,
"step": 1583
},
{
"epoch": 1.597842524448029,
"grad_norm": 0.14514902618278047,
"learning_rate": 5e-05,
"loss": 1.5978,
"step": 1584
},
{
"epoch": 1.5988506905938098,
"grad_norm": 0.144016085291176,
"learning_rate": 5e-05,
"loss": 1.6025,
"step": 1585
},
{
"epoch": 1.5998588567395906,
"grad_norm": 0.14685717799417497,
"learning_rate": 5e-05,
"loss": 1.6203,
"step": 1586
},
{
"epoch": 1.6008670228853714,
"grad_norm": 0.14838433929728054,
"learning_rate": 5e-05,
"loss": 1.6076,
"step": 1587
},
{
"epoch": 1.6018751890311522,
"grad_norm": 0.13310027592461138,
"learning_rate": 5e-05,
"loss": 1.6222,
"step": 1588
},
{
"epoch": 1.602883355176933,
"grad_norm": 0.14697520402464495,
"learning_rate": 5e-05,
"loss": 1.6227,
"step": 1589
},
{
"epoch": 1.6038915213227138,
"grad_norm": 0.134026752065162,
"learning_rate": 5e-05,
"loss": 1.6324,
"step": 1590
},
{
"epoch": 1.6048996874684947,
"grad_norm": 0.14411651340829987,
"learning_rate": 5e-05,
"loss": 1.6212,
"step": 1591
},
{
"epoch": 1.6059078536142757,
"grad_norm": 0.14008440924202942,
"learning_rate": 5e-05,
"loss": 1.6208,
"step": 1592
},
{
"epoch": 1.6069160197600565,
"grad_norm": 0.14065237992088672,
"learning_rate": 5e-05,
"loss": 1.6188,
"step": 1593
},
{
"epoch": 1.6079241859058373,
"grad_norm": 0.13042442625793854,
"learning_rate": 5e-05,
"loss": 1.63,
"step": 1594
},
{
"epoch": 1.608932352051618,
"grad_norm": 0.1550178866652208,
"learning_rate": 5e-05,
"loss": 1.6094,
"step": 1595
},
{
"epoch": 1.609940518197399,
"grad_norm": 0.13287478123944418,
"learning_rate": 5e-05,
"loss": 1.6104,
"step": 1596
},
{
"epoch": 1.6109486843431797,
"grad_norm": 0.1390379550303179,
"learning_rate": 5e-05,
"loss": 1.6173,
"step": 1597
},
{
"epoch": 1.6119568504889605,
"grad_norm": 0.14050846371572615,
"learning_rate": 5e-05,
"loss": 1.6283,
"step": 1598
},
{
"epoch": 1.6129650166347416,
"grad_norm": 0.12452266302553723,
"learning_rate": 5e-05,
"loss": 1.6104,
"step": 1599
},
{
"epoch": 1.6139731827805224,
"grad_norm": 0.14720611562795238,
"learning_rate": 5e-05,
"loss": 1.6209,
"step": 1600
},
{
"epoch": 1.6149813489263032,
"grad_norm": 0.13214543696834904,
"learning_rate": 5e-05,
"loss": 1.6249,
"step": 1601
},
{
"epoch": 1.615989515072084,
"grad_norm": 0.13514507336848264,
"learning_rate": 5e-05,
"loss": 1.6063,
"step": 1602
},
{
"epoch": 1.6169976812178648,
"grad_norm": 0.13479055340954935,
"learning_rate": 5e-05,
"loss": 1.637,
"step": 1603
},
{
"epoch": 1.6180058473636456,
"grad_norm": 0.13861806828091428,
"learning_rate": 5e-05,
"loss": 1.6089,
"step": 1604
},
{
"epoch": 1.6190140135094264,
"grad_norm": 0.14778093009386095,
"learning_rate": 5e-05,
"loss": 1.618,
"step": 1605
},
{
"epoch": 1.6200221796552072,
"grad_norm": 0.14198742984621807,
"learning_rate": 5e-05,
"loss": 1.6253,
"step": 1606
},
{
"epoch": 1.621030345800988,
"grad_norm": 0.13665966814805347,
"learning_rate": 5e-05,
"loss": 1.6155,
"step": 1607
},
{
"epoch": 1.6220385119467688,
"grad_norm": 0.1420508367016213,
"learning_rate": 5e-05,
"loss": 1.6206,
"step": 1608
},
{
"epoch": 1.6230466780925497,
"grad_norm": 0.14238222987888366,
"learning_rate": 5e-05,
"loss": 1.6232,
"step": 1609
},
{
"epoch": 1.6240548442383305,
"grad_norm": 0.1379466487700634,
"learning_rate": 5e-05,
"loss": 1.6015,
"step": 1610
},
{
"epoch": 1.6250630103841113,
"grad_norm": 0.15179244846068082,
"learning_rate": 5e-05,
"loss": 1.6232,
"step": 1611
},
{
"epoch": 1.626071176529892,
"grad_norm": 0.13271666919390243,
"learning_rate": 5e-05,
"loss": 1.6165,
"step": 1612
},
{
"epoch": 1.627079342675673,
"grad_norm": 0.14790413368011668,
"learning_rate": 5e-05,
"loss": 1.6188,
"step": 1613
},
{
"epoch": 1.6280875088214537,
"grad_norm": 0.14767363930567415,
"learning_rate": 5e-05,
"loss": 1.6158,
"step": 1614
},
{
"epoch": 1.6290956749672345,
"grad_norm": 0.13613544304024167,
"learning_rate": 5e-05,
"loss": 1.6316,
"step": 1615
},
{
"epoch": 1.6301038411130153,
"grad_norm": 0.1425243634985558,
"learning_rate": 5e-05,
"loss": 1.6186,
"step": 1616
},
{
"epoch": 1.6311120072587961,
"grad_norm": 0.13375330006671063,
"learning_rate": 5e-05,
"loss": 1.617,
"step": 1617
},
{
"epoch": 1.632120173404577,
"grad_norm": 0.22112118602632483,
"learning_rate": 5e-05,
"loss": 1.6065,
"step": 1618
},
{
"epoch": 1.6331283395503577,
"grad_norm": 0.13804098585417388,
"learning_rate": 5e-05,
"loss": 1.6121,
"step": 1619
},
{
"epoch": 1.6341365056961388,
"grad_norm": 0.14314107543990023,
"learning_rate": 5e-05,
"loss": 1.6357,
"step": 1620
},
{
"epoch": 1.6351446718419196,
"grad_norm": 0.13844475251859784,
"learning_rate": 5e-05,
"loss": 1.6261,
"step": 1621
},
{
"epoch": 1.6361528379877004,
"grad_norm": 0.14077933123926825,
"learning_rate": 5e-05,
"loss": 1.617,
"step": 1622
},
{
"epoch": 1.6371610041334812,
"grad_norm": 0.13847095038208737,
"learning_rate": 5e-05,
"loss": 1.6096,
"step": 1623
},
{
"epoch": 1.638169170279262,
"grad_norm": 0.14235824859571994,
"learning_rate": 5e-05,
"loss": 1.6336,
"step": 1624
},
{
"epoch": 1.6391773364250428,
"grad_norm": 0.145577748481187,
"learning_rate": 5e-05,
"loss": 1.6253,
"step": 1625
},
{
"epoch": 1.6401855025708236,
"grad_norm": 0.15603712171954626,
"learning_rate": 5e-05,
"loss": 1.6177,
"step": 1626
},
{
"epoch": 1.6411936687166047,
"grad_norm": 0.13651075788038194,
"learning_rate": 5e-05,
"loss": 1.6221,
"step": 1627
},
{
"epoch": 1.6422018348623855,
"grad_norm": 0.1380322696697574,
"learning_rate": 5e-05,
"loss": 1.623,
"step": 1628
},
{
"epoch": 1.6432100010081663,
"grad_norm": 0.14918368429745976,
"learning_rate": 5e-05,
"loss": 1.624,
"step": 1629
},
{
"epoch": 1.644218167153947,
"grad_norm": 0.11732167310285535,
"learning_rate": 5e-05,
"loss": 1.5998,
"step": 1630
},
{
"epoch": 1.645226333299728,
"grad_norm": 0.14002340488796944,
"learning_rate": 5e-05,
"loss": 1.6209,
"step": 1631
},
{
"epoch": 1.6462344994455087,
"grad_norm": 0.13742206757706413,
"learning_rate": 5e-05,
"loss": 1.6137,
"step": 1632
},
{
"epoch": 1.6472426655912895,
"grad_norm": 0.13878053780251967,
"learning_rate": 5e-05,
"loss": 1.6176,
"step": 1633
},
{
"epoch": 1.6482508317370703,
"grad_norm": 0.13538113449588376,
"learning_rate": 5e-05,
"loss": 1.6382,
"step": 1634
},
{
"epoch": 1.6492589978828511,
"grad_norm": 0.1372021043904172,
"learning_rate": 5e-05,
"loss": 1.6249,
"step": 1635
},
{
"epoch": 1.650267164028632,
"grad_norm": 0.14008214908108707,
"learning_rate": 5e-05,
"loss": 1.6091,
"step": 1636
},
{
"epoch": 1.6512753301744127,
"grad_norm": 0.1339866493894695,
"learning_rate": 5e-05,
"loss": 1.5994,
"step": 1637
},
{
"epoch": 1.6522834963201936,
"grad_norm": 0.13181194279202782,
"learning_rate": 5e-05,
"loss": 1.6418,
"step": 1638
},
{
"epoch": 1.6532916624659744,
"grad_norm": 0.14351324455593178,
"learning_rate": 5e-05,
"loss": 1.6024,
"step": 1639
},
{
"epoch": 1.6542998286117552,
"grad_norm": 0.12816439188392217,
"learning_rate": 5e-05,
"loss": 1.6424,
"step": 1640
},
{
"epoch": 1.655307994757536,
"grad_norm": 0.13510192746334643,
"learning_rate": 5e-05,
"loss": 1.6066,
"step": 1641
},
{
"epoch": 1.6563161609033168,
"grad_norm": 0.13959090945226782,
"learning_rate": 5e-05,
"loss": 1.6378,
"step": 1642
},
{
"epoch": 1.6573243270490976,
"grad_norm": 0.14054462649724678,
"learning_rate": 5e-05,
"loss": 1.6342,
"step": 1643
},
{
"epoch": 1.6583324931948784,
"grad_norm": 0.24272243831428125,
"learning_rate": 5e-05,
"loss": 1.6091,
"step": 1644
},
{
"epoch": 1.6593406593406592,
"grad_norm": 0.13496532677224876,
"learning_rate": 5e-05,
"loss": 1.6224,
"step": 1645
},
{
"epoch": 1.66034882548644,
"grad_norm": 0.13530180754992427,
"learning_rate": 5e-05,
"loss": 1.5929,
"step": 1646
},
{
"epoch": 1.6613569916322208,
"grad_norm": 0.1402470630054397,
"learning_rate": 5e-05,
"loss": 1.5957,
"step": 1647
},
{
"epoch": 1.6623651577780019,
"grad_norm": 0.1339642900610824,
"learning_rate": 5e-05,
"loss": 1.6135,
"step": 1648
},
{
"epoch": 1.6633733239237827,
"grad_norm": 0.15291952746349996,
"learning_rate": 5e-05,
"loss": 1.6113,
"step": 1649
},
{
"epoch": 1.6643814900695635,
"grad_norm": 0.1439871519251173,
"learning_rate": 5e-05,
"loss": 1.617,
"step": 1650
},
{
"epoch": 1.6653896562153443,
"grad_norm": 0.13197615212474387,
"learning_rate": 5e-05,
"loss": 1.6232,
"step": 1651
},
{
"epoch": 1.666397822361125,
"grad_norm": 0.13639699783186127,
"learning_rate": 5e-05,
"loss": 1.618,
"step": 1652
},
{
"epoch": 1.667405988506906,
"grad_norm": 0.13605034216960754,
"learning_rate": 5e-05,
"loss": 1.6288,
"step": 1653
},
{
"epoch": 1.6684141546526867,
"grad_norm": 0.13680563856675576,
"learning_rate": 5e-05,
"loss": 1.6284,
"step": 1654
},
{
"epoch": 1.6694223207984678,
"grad_norm": 0.14294132013022695,
"learning_rate": 5e-05,
"loss": 1.6033,
"step": 1655
},
{
"epoch": 1.6704304869442486,
"grad_norm": 0.14258491267969362,
"learning_rate": 5e-05,
"loss": 1.624,
"step": 1656
},
{
"epoch": 1.6714386530900294,
"grad_norm": 0.1425290155938703,
"learning_rate": 5e-05,
"loss": 1.6037,
"step": 1657
},
{
"epoch": 1.6724468192358102,
"grad_norm": 0.12627279779042835,
"learning_rate": 5e-05,
"loss": 1.6149,
"step": 1658
},
{
"epoch": 1.673454985381591,
"grad_norm": 0.1343344246316048,
"learning_rate": 5e-05,
"loss": 1.6097,
"step": 1659
},
{
"epoch": 1.6744631515273718,
"grad_norm": 0.12793350365768266,
"learning_rate": 5e-05,
"loss": 1.617,
"step": 1660
},
{
"epoch": 1.6754713176731526,
"grad_norm": 0.13886742282540715,
"learning_rate": 5e-05,
"loss": 1.6132,
"step": 1661
},
{
"epoch": 1.6764794838189334,
"grad_norm": 0.14001713897764617,
"learning_rate": 5e-05,
"loss": 1.6154,
"step": 1662
},
{
"epoch": 1.6774876499647142,
"grad_norm": 0.135753035141293,
"learning_rate": 5e-05,
"loss": 1.6166,
"step": 1663
},
{
"epoch": 1.678495816110495,
"grad_norm": 0.13147689252282455,
"learning_rate": 5e-05,
"loss": 1.6074,
"step": 1664
},
{
"epoch": 1.6795039822562758,
"grad_norm": 0.13514270183722293,
"learning_rate": 5e-05,
"loss": 1.6204,
"step": 1665
},
{
"epoch": 1.6805121484020566,
"grad_norm": 0.13207538462664556,
"learning_rate": 5e-05,
"loss": 1.6158,
"step": 1666
},
{
"epoch": 1.6815203145478375,
"grad_norm": 0.14653678599732686,
"learning_rate": 5e-05,
"loss": 1.6035,
"step": 1667
},
{
"epoch": 1.6825284806936183,
"grad_norm": 0.1371827365018962,
"learning_rate": 5e-05,
"loss": 1.6138,
"step": 1668
},
{
"epoch": 1.683536646839399,
"grad_norm": 0.13723733155590662,
"learning_rate": 5e-05,
"loss": 1.6442,
"step": 1669
},
{
"epoch": 1.6845448129851799,
"grad_norm": 0.1401309125942649,
"learning_rate": 5e-05,
"loss": 1.605,
"step": 1670
},
{
"epoch": 1.6855529791309607,
"grad_norm": 0.14698602949806877,
"learning_rate": 5e-05,
"loss": 1.6145,
"step": 1671
},
{
"epoch": 1.6865611452767415,
"grad_norm": 0.14460773794736487,
"learning_rate": 5e-05,
"loss": 1.6079,
"step": 1672
},
{
"epoch": 1.6875693114225223,
"grad_norm": 0.1376051842545434,
"learning_rate": 5e-05,
"loss": 1.6377,
"step": 1673
},
{
"epoch": 1.6885774775683031,
"grad_norm": 0.13423153255852358,
"learning_rate": 5e-05,
"loss": 1.6093,
"step": 1674
},
{
"epoch": 1.689585643714084,
"grad_norm": 0.1385968781092482,
"learning_rate": 5e-05,
"loss": 1.5947,
"step": 1675
},
{
"epoch": 1.690593809859865,
"grad_norm": 0.1461167654309264,
"learning_rate": 5e-05,
"loss": 1.6221,
"step": 1676
},
{
"epoch": 1.6916019760056458,
"grad_norm": 0.12582225692106638,
"learning_rate": 5e-05,
"loss": 1.6135,
"step": 1677
},
{
"epoch": 1.6926101421514266,
"grad_norm": 0.1438589812956846,
"learning_rate": 5e-05,
"loss": 1.6041,
"step": 1678
},
{
"epoch": 1.6936183082972074,
"grad_norm": 0.13943583979636195,
"learning_rate": 5e-05,
"loss": 1.6045,
"step": 1679
},
{
"epoch": 1.6946264744429882,
"grad_norm": 0.13360275717336678,
"learning_rate": 5e-05,
"loss": 1.618,
"step": 1680
},
{
"epoch": 1.695634640588769,
"grad_norm": 0.12441487879737671,
"learning_rate": 5e-05,
"loss": 1.6249,
"step": 1681
},
{
"epoch": 1.6966428067345498,
"grad_norm": 0.1339772033533686,
"learning_rate": 5e-05,
"loss": 1.6203,
"step": 1682
},
{
"epoch": 1.6976509728803308,
"grad_norm": 0.13386807917707239,
"learning_rate": 5e-05,
"loss": 1.6183,
"step": 1683
},
{
"epoch": 1.6986591390261117,
"grad_norm": 0.129651625611091,
"learning_rate": 5e-05,
"loss": 1.615,
"step": 1684
},
{
"epoch": 1.6996673051718925,
"grad_norm": 0.12428352736785793,
"learning_rate": 5e-05,
"loss": 1.6281,
"step": 1685
},
{
"epoch": 1.7006754713176733,
"grad_norm": 0.1363843489132686,
"learning_rate": 5e-05,
"loss": 1.6204,
"step": 1686
},
{
"epoch": 1.701683637463454,
"grad_norm": 0.14359773549748206,
"learning_rate": 5e-05,
"loss": 1.6011,
"step": 1687
},
{
"epoch": 1.7026918036092349,
"grad_norm": 0.13322068817698787,
"learning_rate": 5e-05,
"loss": 1.6377,
"step": 1688
},
{
"epoch": 1.7036999697550157,
"grad_norm": 0.12744174180057552,
"learning_rate": 5e-05,
"loss": 1.6319,
"step": 1689
},
{
"epoch": 1.7047081359007965,
"grad_norm": 0.12545956427102473,
"learning_rate": 5e-05,
"loss": 1.6244,
"step": 1690
},
{
"epoch": 1.7057163020465773,
"grad_norm": 0.13246626345885423,
"learning_rate": 5e-05,
"loss": 1.6319,
"step": 1691
},
{
"epoch": 1.7067244681923581,
"grad_norm": 0.13204181957812078,
"learning_rate": 5e-05,
"loss": 1.6016,
"step": 1692
},
{
"epoch": 1.707732634338139,
"grad_norm": 0.12492646517845629,
"learning_rate": 5e-05,
"loss": 1.6029,
"step": 1693
},
{
"epoch": 1.7087408004839197,
"grad_norm": 0.1392980997156786,
"learning_rate": 5e-05,
"loss": 1.606,
"step": 1694
},
{
"epoch": 1.7097489666297006,
"grad_norm": 0.1330684199861111,
"learning_rate": 5e-05,
"loss": 1.612,
"step": 1695
},
{
"epoch": 1.7107571327754814,
"grad_norm": 0.14132626784333496,
"learning_rate": 5e-05,
"loss": 1.5997,
"step": 1696
},
{
"epoch": 1.7117652989212622,
"grad_norm": 0.1407849034511313,
"learning_rate": 5e-05,
"loss": 1.6333,
"step": 1697
},
{
"epoch": 1.712773465067043,
"grad_norm": 0.1321531853957436,
"learning_rate": 5e-05,
"loss": 1.6038,
"step": 1698
},
{
"epoch": 1.7137816312128238,
"grad_norm": 0.14485102927415872,
"learning_rate": 5e-05,
"loss": 1.6048,
"step": 1699
},
{
"epoch": 1.7147897973586046,
"grad_norm": 0.13069749492979196,
"learning_rate": 5e-05,
"loss": 1.6327,
"step": 1700
},
{
"epoch": 1.7157979635043854,
"grad_norm": 0.13630816111779237,
"learning_rate": 5e-05,
"loss": 1.6112,
"step": 1701
},
{
"epoch": 1.7168061296501662,
"grad_norm": 0.14237455896465825,
"learning_rate": 5e-05,
"loss": 1.6248,
"step": 1702
},
{
"epoch": 1.717814295795947,
"grad_norm": 0.13763466168468277,
"learning_rate": 5e-05,
"loss": 1.6197,
"step": 1703
},
{
"epoch": 1.7188224619417278,
"grad_norm": 0.1374930902892298,
"learning_rate": 5e-05,
"loss": 1.6218,
"step": 1704
},
{
"epoch": 1.7198306280875089,
"grad_norm": 0.13512322205932592,
"learning_rate": 5e-05,
"loss": 1.6268,
"step": 1705
},
{
"epoch": 1.7208387942332897,
"grad_norm": 0.1452506652798437,
"learning_rate": 5e-05,
"loss": 1.6203,
"step": 1706
},
{
"epoch": 1.7218469603790705,
"grad_norm": 0.1557032826185054,
"learning_rate": 5e-05,
"loss": 1.6126,
"step": 1707
},
{
"epoch": 1.7228551265248513,
"grad_norm": 0.13340855639180935,
"learning_rate": 5e-05,
"loss": 1.6194,
"step": 1708
},
{
"epoch": 1.723863292670632,
"grad_norm": 0.1736722982841165,
"learning_rate": 5e-05,
"loss": 1.6177,
"step": 1709
},
{
"epoch": 1.724871458816413,
"grad_norm": 0.13838096398331612,
"learning_rate": 5e-05,
"loss": 1.6258,
"step": 1710
},
{
"epoch": 1.7258796249621937,
"grad_norm": 0.1340817051050624,
"learning_rate": 5e-05,
"loss": 1.6222,
"step": 1711
},
{
"epoch": 1.7268877911079747,
"grad_norm": 0.1549435001901365,
"learning_rate": 5e-05,
"loss": 1.6046,
"step": 1712
},
{
"epoch": 1.7278959572537556,
"grad_norm": 0.12750772195502852,
"learning_rate": 5e-05,
"loss": 1.6148,
"step": 1713
},
{
"epoch": 1.7289041233995364,
"grad_norm": 0.1457235372622354,
"learning_rate": 5e-05,
"loss": 1.6334,
"step": 1714
},
{
"epoch": 1.7299122895453172,
"grad_norm": 0.1319619921628885,
"learning_rate": 5e-05,
"loss": 1.6232,
"step": 1715
},
{
"epoch": 1.730920455691098,
"grad_norm": 0.13299977467266605,
"learning_rate": 5e-05,
"loss": 1.6075,
"step": 1716
},
{
"epoch": 1.7319286218368788,
"grad_norm": 0.13998386630533857,
"learning_rate": 5e-05,
"loss": 1.6111,
"step": 1717
},
{
"epoch": 1.7329367879826596,
"grad_norm": 0.13255509324875403,
"learning_rate": 5e-05,
"loss": 1.6104,
"step": 1718
},
{
"epoch": 1.7339449541284404,
"grad_norm": 0.13387268249492904,
"learning_rate": 5e-05,
"loss": 1.6204,
"step": 1719
},
{
"epoch": 1.7349531202742212,
"grad_norm": 0.1413802578936938,
"learning_rate": 5e-05,
"loss": 1.5923,
"step": 1720
},
{
"epoch": 1.735961286420002,
"grad_norm": 0.13701754337130248,
"learning_rate": 5e-05,
"loss": 1.6029,
"step": 1721
},
{
"epoch": 1.7369694525657828,
"grad_norm": 0.1380475535571739,
"learning_rate": 5e-05,
"loss": 1.5915,
"step": 1722
},
{
"epoch": 1.7379776187115636,
"grad_norm": 0.5622851151853336,
"learning_rate": 5e-05,
"loss": 1.6169,
"step": 1723
},
{
"epoch": 1.7389857848573445,
"grad_norm": 0.13527175858738594,
"learning_rate": 5e-05,
"loss": 1.6033,
"step": 1724
},
{
"epoch": 1.7399939510031253,
"grad_norm": 0.13601583456425725,
"learning_rate": 5e-05,
"loss": 1.5897,
"step": 1725
},
{
"epoch": 1.741002117148906,
"grad_norm": 0.12879872996292097,
"learning_rate": 5e-05,
"loss": 1.633,
"step": 1726
},
{
"epoch": 1.7420102832946869,
"grad_norm": 0.1428323936794853,
"learning_rate": 5e-05,
"loss": 1.6072,
"step": 1727
},
{
"epoch": 1.7430184494404677,
"grad_norm": 0.19285827074287154,
"learning_rate": 5e-05,
"loss": 1.6181,
"step": 1728
},
{
"epoch": 1.7440266155862485,
"grad_norm": 0.19549095939112598,
"learning_rate": 5e-05,
"loss": 1.5952,
"step": 1729
},
{
"epoch": 1.7450347817320293,
"grad_norm": 0.13241128621801906,
"learning_rate": 5e-05,
"loss": 1.6142,
"step": 1730
},
{
"epoch": 1.7460429478778101,
"grad_norm": 0.1402339809583584,
"learning_rate": 5e-05,
"loss": 1.584,
"step": 1731
},
{
"epoch": 1.747051114023591,
"grad_norm": 0.14004271275884192,
"learning_rate": 5e-05,
"loss": 1.6304,
"step": 1732
},
{
"epoch": 1.748059280169372,
"grad_norm": 0.14445552177426974,
"learning_rate": 5e-05,
"loss": 1.5978,
"step": 1733
},
{
"epoch": 1.7490674463151528,
"grad_norm": 0.14003088790753546,
"learning_rate": 5e-05,
"loss": 1.6014,
"step": 1734
},
{
"epoch": 1.7500756124609336,
"grad_norm": 0.13228066300335042,
"learning_rate": 5e-05,
"loss": 1.6289,
"step": 1735
},
{
"epoch": 1.7510837786067144,
"grad_norm": 0.15457189168732346,
"learning_rate": 5e-05,
"loss": 1.613,
"step": 1736
},
{
"epoch": 1.7520919447524952,
"grad_norm": 0.13633629639002504,
"learning_rate": 5e-05,
"loss": 1.627,
"step": 1737
},
{
"epoch": 1.753100110898276,
"grad_norm": 0.14369857847598344,
"learning_rate": 5e-05,
"loss": 1.6029,
"step": 1738
},
{
"epoch": 1.7541082770440568,
"grad_norm": 0.14664263803679667,
"learning_rate": 5e-05,
"loss": 1.6201,
"step": 1739
},
{
"epoch": 1.7551164431898378,
"grad_norm": 0.14355866363102157,
"learning_rate": 5e-05,
"loss": 1.6216,
"step": 1740
},
{
"epoch": 1.7561246093356186,
"grad_norm": 0.1466730945121939,
"learning_rate": 5e-05,
"loss": 1.6003,
"step": 1741
},
{
"epoch": 1.7571327754813995,
"grad_norm": 0.13314431110264674,
"learning_rate": 5e-05,
"loss": 1.6125,
"step": 1742
},
{
"epoch": 1.7581409416271803,
"grad_norm": 0.15711319563822906,
"learning_rate": 5e-05,
"loss": 1.6124,
"step": 1743
},
{
"epoch": 1.759149107772961,
"grad_norm": 1.2638567147756596,
"learning_rate": 5e-05,
"loss": 1.5946,
"step": 1744
},
{
"epoch": 1.7601572739187419,
"grad_norm": 0.14049558238092832,
"learning_rate": 5e-05,
"loss": 1.6199,
"step": 1745
},
{
"epoch": 1.7611654400645227,
"grad_norm": 0.15174858113188466,
"learning_rate": 5e-05,
"loss": 1.6235,
"step": 1746
},
{
"epoch": 1.7621736062103035,
"grad_norm": 0.14413750895344504,
"learning_rate": 5e-05,
"loss": 1.6208,
"step": 1747
},
{
"epoch": 1.7631817723560843,
"grad_norm": 0.15508494984696386,
"learning_rate": 5e-05,
"loss": 1.6084,
"step": 1748
},
{
"epoch": 1.7641899385018651,
"grad_norm": 0.15498485653169553,
"learning_rate": 5e-05,
"loss": 1.5952,
"step": 1749
},
{
"epoch": 1.765198104647646,
"grad_norm": 0.16604683843727644,
"learning_rate": 5e-05,
"loss": 1.6142,
"step": 1750
},
{
"epoch": 1.7662062707934267,
"grad_norm": 0.17563345232700797,
"learning_rate": 5e-05,
"loss": 1.5914,
"step": 1751
},
{
"epoch": 1.7672144369392075,
"grad_norm": 0.15092209589409813,
"learning_rate": 5e-05,
"loss": 1.616,
"step": 1752
},
{
"epoch": 1.7682226030849884,
"grad_norm": 0.17913311190933373,
"learning_rate": 5e-05,
"loss": 1.6192,
"step": 1753
},
{
"epoch": 1.7692307692307692,
"grad_norm": 0.1710531241250294,
"learning_rate": 5e-05,
"loss": 1.6162,
"step": 1754
},
{
"epoch": 1.77023893537655,
"grad_norm": 0.16638506208920983,
"learning_rate": 5e-05,
"loss": 1.6334,
"step": 1755
},
{
"epoch": 1.7712471015223308,
"grad_norm": 0.18761824583553158,
"learning_rate": 5e-05,
"loss": 1.6142,
"step": 1756
},
{
"epoch": 1.7722552676681116,
"grad_norm": 0.14892043726691043,
"learning_rate": 5e-05,
"loss": 1.6317,
"step": 1757
},
{
"epoch": 1.7732634338138924,
"grad_norm": 0.16535088344152474,
"learning_rate": 5e-05,
"loss": 1.6128,
"step": 1758
},
{
"epoch": 1.7742715999596732,
"grad_norm": 0.15060919472661713,
"learning_rate": 5e-05,
"loss": 1.6075,
"step": 1759
},
{
"epoch": 1.775279766105454,
"grad_norm": 0.1685283379818972,
"learning_rate": 5e-05,
"loss": 1.6045,
"step": 1760
},
{
"epoch": 1.776287932251235,
"grad_norm": 0.1457182770606618,
"learning_rate": 5e-05,
"loss": 1.6248,
"step": 1761
},
{
"epoch": 1.7772960983970159,
"grad_norm": 0.14486683958980914,
"learning_rate": 5e-05,
"loss": 1.6043,
"step": 1762
},
{
"epoch": 1.7783042645427967,
"grad_norm": 0.13696686191142463,
"learning_rate": 5e-05,
"loss": 1.6395,
"step": 1763
},
{
"epoch": 1.7793124306885775,
"grad_norm": 0.16153417016077473,
"learning_rate": 5e-05,
"loss": 1.6063,
"step": 1764
},
{
"epoch": 1.7803205968343583,
"grad_norm": 0.12761827086520675,
"learning_rate": 5e-05,
"loss": 1.6002,
"step": 1765
},
{
"epoch": 1.781328762980139,
"grad_norm": 0.1528242449301567,
"learning_rate": 5e-05,
"loss": 1.6141,
"step": 1766
},
{
"epoch": 1.78233692912592,
"grad_norm": 0.14813788393804353,
"learning_rate": 5e-05,
"loss": 1.6057,
"step": 1767
},
{
"epoch": 1.783345095271701,
"grad_norm": 0.13362860876896798,
"learning_rate": 5e-05,
"loss": 1.6029,
"step": 1768
},
{
"epoch": 1.7843532614174817,
"grad_norm": 0.14148302977868854,
"learning_rate": 5e-05,
"loss": 1.6254,
"step": 1769
},
{
"epoch": 1.7853614275632625,
"grad_norm": 0.3091291743457628,
"learning_rate": 5e-05,
"loss": 1.6312,
"step": 1770
},
{
"epoch": 1.7863695937090434,
"grad_norm": 0.13735284354244087,
"learning_rate": 5e-05,
"loss": 1.6013,
"step": 1771
},
{
"epoch": 1.7873777598548242,
"grad_norm": 0.12693953022906115,
"learning_rate": 5e-05,
"loss": 1.5964,
"step": 1772
},
{
"epoch": 1.788385926000605,
"grad_norm": 0.12719899555292422,
"learning_rate": 5e-05,
"loss": 1.6171,
"step": 1773
},
{
"epoch": 1.7893940921463858,
"grad_norm": 0.12635154500382387,
"learning_rate": 5e-05,
"loss": 1.6087,
"step": 1774
},
{
"epoch": 1.7904022582921666,
"grad_norm": 0.13868520829220343,
"learning_rate": 5e-05,
"loss": 1.6173,
"step": 1775
},
{
"epoch": 1.7914104244379474,
"grad_norm": 0.1391173368990605,
"learning_rate": 5e-05,
"loss": 1.6213,
"step": 1776
},
{
"epoch": 1.7924185905837282,
"grad_norm": 0.1376872401819976,
"learning_rate": 5e-05,
"loss": 1.6016,
"step": 1777
},
{
"epoch": 1.793426756729509,
"grad_norm": 0.15909193130638846,
"learning_rate": 5e-05,
"loss": 1.6022,
"step": 1778
},
{
"epoch": 1.7944349228752898,
"grad_norm": 0.13145339806752976,
"learning_rate": 5e-05,
"loss": 1.5977,
"step": 1779
},
{
"epoch": 1.7954430890210706,
"grad_norm": 0.13830608154808835,
"learning_rate": 5e-05,
"loss": 1.6146,
"step": 1780
},
{
"epoch": 1.7964512551668514,
"grad_norm": 0.13587870252501869,
"learning_rate": 5e-05,
"loss": 1.612,
"step": 1781
},
{
"epoch": 1.7974594213126323,
"grad_norm": 0.14346041025356132,
"learning_rate": 5e-05,
"loss": 1.6336,
"step": 1782
},
{
"epoch": 1.798467587458413,
"grad_norm": 0.13668262983065077,
"learning_rate": 5e-05,
"loss": 1.6003,
"step": 1783
},
{
"epoch": 1.7994757536041939,
"grad_norm": 0.15063006748789715,
"learning_rate": 5e-05,
"loss": 1.6111,
"step": 1784
},
{
"epoch": 1.8004839197499747,
"grad_norm": 0.1358547525997899,
"learning_rate": 5e-05,
"loss": 1.615,
"step": 1785
},
{
"epoch": 1.8014920858957555,
"grad_norm": 0.12987109911965108,
"learning_rate": 5e-05,
"loss": 1.6107,
"step": 1786
},
{
"epoch": 1.8025002520415363,
"grad_norm": 0.15381766777406494,
"learning_rate": 5e-05,
"loss": 1.6209,
"step": 1787
},
{
"epoch": 1.803508418187317,
"grad_norm": 0.13625928451952748,
"learning_rate": 5e-05,
"loss": 1.6131,
"step": 1788
},
{
"epoch": 1.8045165843330981,
"grad_norm": 0.14903057932506056,
"learning_rate": 5e-05,
"loss": 1.62,
"step": 1789
},
{
"epoch": 1.805524750478879,
"grad_norm": 0.1363950109586751,
"learning_rate": 5e-05,
"loss": 1.5978,
"step": 1790
},
{
"epoch": 1.8065329166246598,
"grad_norm": 0.12663592216992495,
"learning_rate": 5e-05,
"loss": 1.6146,
"step": 1791
},
{
"epoch": 1.8075410827704406,
"grad_norm": 0.13673734077949914,
"learning_rate": 5e-05,
"loss": 1.6285,
"step": 1792
},
{
"epoch": 1.8085492489162214,
"grad_norm": 0.1987012161353432,
"learning_rate": 5e-05,
"loss": 1.6161,
"step": 1793
},
{
"epoch": 1.8095574150620022,
"grad_norm": 0.12952797334927602,
"learning_rate": 5e-05,
"loss": 1.6171,
"step": 1794
},
{
"epoch": 1.810565581207783,
"grad_norm": 0.12897445209773659,
"learning_rate": 5e-05,
"loss": 1.6033,
"step": 1795
},
{
"epoch": 1.811573747353564,
"grad_norm": 0.12947253799660324,
"learning_rate": 5e-05,
"loss": 1.583,
"step": 1796
},
{
"epoch": 1.8125819134993448,
"grad_norm": 0.1343054463682107,
"learning_rate": 5e-05,
"loss": 1.6088,
"step": 1797
},
{
"epoch": 1.8135900796451256,
"grad_norm": 0.1390809767000595,
"learning_rate": 5e-05,
"loss": 1.6049,
"step": 1798
},
{
"epoch": 1.8145982457909065,
"grad_norm": 1.1247567635889086,
"learning_rate": 5e-05,
"loss": 1.6084,
"step": 1799
},
{
"epoch": 1.8156064119366873,
"grad_norm": 0.13972004841516494,
"learning_rate": 5e-05,
"loss": 1.6027,
"step": 1800
},
{
"epoch": 1.816614578082468,
"grad_norm": 0.12266920923306984,
"learning_rate": 5e-05,
"loss": 1.5949,
"step": 1801
},
{
"epoch": 1.8176227442282489,
"grad_norm": 0.13573438818370204,
"learning_rate": 5e-05,
"loss": 1.5959,
"step": 1802
},
{
"epoch": 1.8186309103740297,
"grad_norm": 0.13976539408124178,
"learning_rate": 5e-05,
"loss": 1.6162,
"step": 1803
},
{
"epoch": 1.8196390765198105,
"grad_norm": 0.15254799912538275,
"learning_rate": 5e-05,
"loss": 1.5973,
"step": 1804
},
{
"epoch": 1.8206472426655913,
"grad_norm": 0.1384953584615067,
"learning_rate": 5e-05,
"loss": 1.6354,
"step": 1805
},
{
"epoch": 1.8216554088113721,
"grad_norm": 0.1422730990739598,
"learning_rate": 5e-05,
"loss": 1.602,
"step": 1806
},
{
"epoch": 1.822663574957153,
"grad_norm": 0.15360377620657606,
"learning_rate": 5e-05,
"loss": 1.5962,
"step": 1807
},
{
"epoch": 1.8236717411029337,
"grad_norm": 0.14850922203359646,
"learning_rate": 5e-05,
"loss": 1.6307,
"step": 1808
},
{
"epoch": 1.8246799072487145,
"grad_norm": 0.15150459907289224,
"learning_rate": 5e-05,
"loss": 1.6181,
"step": 1809
},
{
"epoch": 1.8256880733944953,
"grad_norm": 0.15088788156019828,
"learning_rate": 5e-05,
"loss": 1.5938,
"step": 1810
},
{
"epoch": 1.8266962395402762,
"grad_norm": 0.15492817542755252,
"learning_rate": 5e-05,
"loss": 1.6048,
"step": 1811
},
{
"epoch": 1.827704405686057,
"grad_norm": 0.14112165642917768,
"learning_rate": 5e-05,
"loss": 1.6073,
"step": 1812
},
{
"epoch": 1.8287125718318378,
"grad_norm": 0.13706017286835598,
"learning_rate": 5e-05,
"loss": 1.6075,
"step": 1813
},
{
"epoch": 1.8297207379776186,
"grad_norm": 0.1385283772912929,
"learning_rate": 5e-05,
"loss": 1.6211,
"step": 1814
},
{
"epoch": 1.8307289041233994,
"grad_norm": 0.1404996606919889,
"learning_rate": 5e-05,
"loss": 1.6087,
"step": 1815
},
{
"epoch": 1.8317370702691802,
"grad_norm": 0.13800608084793464,
"learning_rate": 5e-05,
"loss": 1.6078,
"step": 1816
},
{
"epoch": 1.8327452364149612,
"grad_norm": 0.13881745887222013,
"learning_rate": 5e-05,
"loss": 1.6336,
"step": 1817
},
{
"epoch": 1.833753402560742,
"grad_norm": 0.137211197141056,
"learning_rate": 5e-05,
"loss": 1.6274,
"step": 1818
},
{
"epoch": 1.8347615687065228,
"grad_norm": 0.13123746536610786,
"learning_rate": 5e-05,
"loss": 1.6398,
"step": 1819
},
{
"epoch": 1.8357697348523037,
"grad_norm": 0.13812430669099415,
"learning_rate": 5e-05,
"loss": 1.6013,
"step": 1820
},
{
"epoch": 1.8367779009980845,
"grad_norm": 0.13585642799638906,
"learning_rate": 5e-05,
"loss": 1.6024,
"step": 1821
},
{
"epoch": 1.8377860671438653,
"grad_norm": 0.14276337338880293,
"learning_rate": 5e-05,
"loss": 1.6071,
"step": 1822
},
{
"epoch": 1.838794233289646,
"grad_norm": 0.1371532484515092,
"learning_rate": 5e-05,
"loss": 1.6214,
"step": 1823
},
{
"epoch": 1.8398023994354271,
"grad_norm": 0.14658292784641913,
"learning_rate": 5e-05,
"loss": 1.6019,
"step": 1824
},
{
"epoch": 1.840810565581208,
"grad_norm": 0.151520889065639,
"learning_rate": 5e-05,
"loss": 1.6075,
"step": 1825
},
{
"epoch": 1.8418187317269887,
"grad_norm": 0.13216226198024741,
"learning_rate": 5e-05,
"loss": 1.6125,
"step": 1826
},
{
"epoch": 1.8428268978727695,
"grad_norm": 0.14248039055169562,
"learning_rate": 5e-05,
"loss": 1.6328,
"step": 1827
},
{
"epoch": 1.8438350640185504,
"grad_norm": 0.13790247147421328,
"learning_rate": 5e-05,
"loss": 1.5826,
"step": 1828
},
{
"epoch": 1.8448432301643312,
"grad_norm": 0.14759698549401534,
"learning_rate": 5e-05,
"loss": 1.6195,
"step": 1829
},
{
"epoch": 1.845851396310112,
"grad_norm": 0.13045200667392873,
"learning_rate": 5e-05,
"loss": 1.606,
"step": 1830
},
{
"epoch": 1.8468595624558928,
"grad_norm": 0.32465101968739074,
"learning_rate": 5e-05,
"loss": 1.6092,
"step": 1831
},
{
"epoch": 1.8478677286016736,
"grad_norm": 0.1398935855602046,
"learning_rate": 5e-05,
"loss": 1.6139,
"step": 1832
},
{
"epoch": 1.8488758947474544,
"grad_norm": 0.1338958194359356,
"learning_rate": 5e-05,
"loss": 1.6117,
"step": 1833
},
{
"epoch": 1.8498840608932352,
"grad_norm": 0.14382819664190047,
"learning_rate": 5e-05,
"loss": 1.6057,
"step": 1834
},
{
"epoch": 1.850892227039016,
"grad_norm": 0.14351491833839605,
"learning_rate": 5e-05,
"loss": 1.6086,
"step": 1835
},
{
"epoch": 1.8519003931847968,
"grad_norm": 0.13662427769391197,
"learning_rate": 5e-05,
"loss": 1.6218,
"step": 1836
},
{
"epoch": 1.8529085593305776,
"grad_norm": 0.13890383974711884,
"learning_rate": 5e-05,
"loss": 1.6166,
"step": 1837
},
{
"epoch": 1.8539167254763584,
"grad_norm": 0.13513145904149312,
"learning_rate": 5e-05,
"loss": 1.5881,
"step": 1838
},
{
"epoch": 1.8549248916221392,
"grad_norm": 0.13594723016421367,
"learning_rate": 5e-05,
"loss": 1.6008,
"step": 1839
},
{
"epoch": 1.85593305776792,
"grad_norm": 0.1378923959889255,
"learning_rate": 5e-05,
"loss": 1.6138,
"step": 1840
},
{
"epoch": 1.8569412239137009,
"grad_norm": 0.14780320989735776,
"learning_rate": 5e-05,
"loss": 1.5961,
"step": 1841
},
{
"epoch": 1.8579493900594817,
"grad_norm": 0.14025512215005298,
"learning_rate": 5e-05,
"loss": 1.5891,
"step": 1842
},
{
"epoch": 1.8589575562052625,
"grad_norm": 0.13720223676278484,
"learning_rate": 5e-05,
"loss": 1.5996,
"step": 1843
},
{
"epoch": 1.8599657223510433,
"grad_norm": 0.13394687034017358,
"learning_rate": 5e-05,
"loss": 1.5953,
"step": 1844
},
{
"epoch": 1.8609738884968243,
"grad_norm": 0.13660579030003928,
"learning_rate": 5e-05,
"loss": 1.6247,
"step": 1845
},
{
"epoch": 1.8619820546426051,
"grad_norm": 0.15443892739189163,
"learning_rate": 5e-05,
"loss": 1.6047,
"step": 1846
},
{
"epoch": 1.862990220788386,
"grad_norm": 0.13569253914401672,
"learning_rate": 5e-05,
"loss": 1.5989,
"step": 1847
},
{
"epoch": 1.8639983869341668,
"grad_norm": 0.14797438502778215,
"learning_rate": 5e-05,
"loss": 1.5955,
"step": 1848
},
{
"epoch": 1.8650065530799476,
"grad_norm": 0.1421115324462668,
"learning_rate": 5e-05,
"loss": 1.6031,
"step": 1849
},
{
"epoch": 1.8660147192257284,
"grad_norm": 0.1333814678847669,
"learning_rate": 5e-05,
"loss": 1.6313,
"step": 1850
},
{
"epoch": 1.8670228853715092,
"grad_norm": 0.14198072611997878,
"learning_rate": 5e-05,
"loss": 1.6246,
"step": 1851
},
{
"epoch": 1.8680310515172902,
"grad_norm": 0.12894781507480482,
"learning_rate": 5e-05,
"loss": 1.5984,
"step": 1852
},
{
"epoch": 1.869039217663071,
"grad_norm": 0.13553779775058764,
"learning_rate": 5e-05,
"loss": 1.6331,
"step": 1853
},
{
"epoch": 1.8700473838088518,
"grad_norm": 0.1488801216335878,
"learning_rate": 5e-05,
"loss": 1.6056,
"step": 1854
},
{
"epoch": 1.8710555499546326,
"grad_norm": 0.1340319579502652,
"learning_rate": 5e-05,
"loss": 1.5931,
"step": 1855
},
{
"epoch": 1.8720637161004134,
"grad_norm": 0.15078566624864237,
"learning_rate": 5e-05,
"loss": 1.6172,
"step": 1856
},
{
"epoch": 1.8730718822461943,
"grad_norm": 0.14431215810257134,
"learning_rate": 5e-05,
"loss": 1.6202,
"step": 1857
},
{
"epoch": 1.874080048391975,
"grad_norm": 0.13057775295180585,
"learning_rate": 5e-05,
"loss": 1.6009,
"step": 1858
},
{
"epoch": 1.8750882145377559,
"grad_norm": 0.13694652840155766,
"learning_rate": 5e-05,
"loss": 1.6063,
"step": 1859
},
{
"epoch": 1.8760963806835367,
"grad_norm": 0.13929146625825486,
"learning_rate": 5e-05,
"loss": 1.5999,
"step": 1860
},
{
"epoch": 1.8771045468293175,
"grad_norm": 0.13962543869508612,
"learning_rate": 5e-05,
"loss": 1.6071,
"step": 1861
},
{
"epoch": 1.8781127129750983,
"grad_norm": 0.1294264670653415,
"learning_rate": 5e-05,
"loss": 1.597,
"step": 1862
},
{
"epoch": 1.879120879120879,
"grad_norm": 0.1392941337579169,
"learning_rate": 5e-05,
"loss": 1.6109,
"step": 1863
},
{
"epoch": 1.88012904526666,
"grad_norm": 0.1375911580117992,
"learning_rate": 5e-05,
"loss": 1.6063,
"step": 1864
},
{
"epoch": 1.8811372114124407,
"grad_norm": 0.13057281302682908,
"learning_rate": 5e-05,
"loss": 1.5869,
"step": 1865
},
{
"epoch": 1.8821453775582215,
"grad_norm": 0.13465492897476905,
"learning_rate": 5e-05,
"loss": 1.6111,
"step": 1866
},
{
"epoch": 1.8831535437040023,
"grad_norm": 0.1352567853452641,
"learning_rate": 5e-05,
"loss": 1.6065,
"step": 1867
},
{
"epoch": 1.8841617098497832,
"grad_norm": 0.13265304956653806,
"learning_rate": 5e-05,
"loss": 1.6004,
"step": 1868
},
{
"epoch": 1.885169875995564,
"grad_norm": 0.134585810919123,
"learning_rate": 5e-05,
"loss": 1.6098,
"step": 1869
},
{
"epoch": 1.8861780421413448,
"grad_norm": 0.1378781672247223,
"learning_rate": 5e-05,
"loss": 1.6134,
"step": 1870
},
{
"epoch": 1.8871862082871256,
"grad_norm": 0.6687407616313615,
"learning_rate": 5e-05,
"loss": 1.6211,
"step": 1871
},
{
"epoch": 1.8881943744329064,
"grad_norm": 0.14131248756584672,
"learning_rate": 5e-05,
"loss": 1.6081,
"step": 1872
},
{
"epoch": 1.8892025405786874,
"grad_norm": 0.14944089262776455,
"learning_rate": 5e-05,
"loss": 1.6127,
"step": 1873
},
{
"epoch": 1.8902107067244682,
"grad_norm": 0.14994048389580897,
"learning_rate": 5e-05,
"loss": 1.6082,
"step": 1874
},
{
"epoch": 1.891218872870249,
"grad_norm": 0.14204566054703277,
"learning_rate": 5e-05,
"loss": 1.5939,
"step": 1875
},
{
"epoch": 1.8922270390160298,
"grad_norm": 0.14563930123363145,
"learning_rate": 5e-05,
"loss": 1.6285,
"step": 1876
},
{
"epoch": 1.8932352051618107,
"grad_norm": 0.14602409024075935,
"learning_rate": 5e-05,
"loss": 1.6162,
"step": 1877
},
{
"epoch": 1.8942433713075915,
"grad_norm": 0.3250170638850309,
"learning_rate": 5e-05,
"loss": 1.6326,
"step": 1878
},
{
"epoch": 1.8952515374533723,
"grad_norm": 0.1480123135093311,
"learning_rate": 5e-05,
"loss": 1.6074,
"step": 1879
},
{
"epoch": 1.8962597035991533,
"grad_norm": 0.13564589267677152,
"learning_rate": 5e-05,
"loss": 1.6165,
"step": 1880
},
{
"epoch": 1.897267869744934,
"grad_norm": 0.14598987026919524,
"learning_rate": 5e-05,
"loss": 1.6241,
"step": 1881
},
{
"epoch": 1.898276035890715,
"grad_norm": 0.15340222651431212,
"learning_rate": 5e-05,
"loss": 1.6038,
"step": 1882
},
{
"epoch": 1.8992842020364957,
"grad_norm": 0.14660842376268418,
"learning_rate": 5e-05,
"loss": 1.6137,
"step": 1883
},
{
"epoch": 1.9002923681822765,
"grad_norm": 0.14523481586952683,
"learning_rate": 5e-05,
"loss": 1.6114,
"step": 1884
},
{
"epoch": 1.9013005343280573,
"grad_norm": 0.139366252487723,
"learning_rate": 5e-05,
"loss": 1.6189,
"step": 1885
},
{
"epoch": 1.9023087004738382,
"grad_norm": 0.14047698934688582,
"learning_rate": 5e-05,
"loss": 1.5822,
"step": 1886
},
{
"epoch": 1.903316866619619,
"grad_norm": 0.1429688256910901,
"learning_rate": 5e-05,
"loss": 1.6005,
"step": 1887
},
{
"epoch": 1.9043250327653998,
"grad_norm": 0.1449894299784549,
"learning_rate": 5e-05,
"loss": 1.619,
"step": 1888
},
{
"epoch": 1.9053331989111806,
"grad_norm": 0.136258179982917,
"learning_rate": 5e-05,
"loss": 1.6177,
"step": 1889
},
{
"epoch": 1.9063413650569614,
"grad_norm": 0.14993956007154408,
"learning_rate": 5e-05,
"loss": 1.6151,
"step": 1890
},
{
"epoch": 1.9073495312027422,
"grad_norm": 0.18176454707772344,
"learning_rate": 5e-05,
"loss": 1.6147,
"step": 1891
},
{
"epoch": 1.908357697348523,
"grad_norm": 0.13697270459679123,
"learning_rate": 5e-05,
"loss": 1.5944,
"step": 1892
},
{
"epoch": 1.9093658634943038,
"grad_norm": 0.14866596724221887,
"learning_rate": 5e-05,
"loss": 1.6019,
"step": 1893
},
{
"epoch": 1.9103740296400846,
"grad_norm": 0.13384771033533666,
"learning_rate": 5e-05,
"loss": 1.6016,
"step": 1894
},
{
"epoch": 1.9113821957858654,
"grad_norm": 0.1443979145146113,
"learning_rate": 5e-05,
"loss": 1.6118,
"step": 1895
},
{
"epoch": 1.9123903619316462,
"grad_norm": 0.14963383206603662,
"learning_rate": 5e-05,
"loss": 1.6152,
"step": 1896
},
{
"epoch": 1.913398528077427,
"grad_norm": 0.13098848994842957,
"learning_rate": 5e-05,
"loss": 1.6132,
"step": 1897
},
{
"epoch": 1.9144066942232079,
"grad_norm": 0.1420448647911459,
"learning_rate": 5e-05,
"loss": 1.6152,
"step": 1898
},
{
"epoch": 1.9154148603689887,
"grad_norm": 0.1402422345987282,
"learning_rate": 5e-05,
"loss": 1.6258,
"step": 1899
},
{
"epoch": 1.9164230265147695,
"grad_norm": 0.14282419116361197,
"learning_rate": 5e-05,
"loss": 1.5993,
"step": 1900
},
{
"epoch": 1.9174311926605505,
"grad_norm": 0.12900876538976672,
"learning_rate": 5e-05,
"loss": 1.5947,
"step": 1901
},
{
"epoch": 1.9184393588063313,
"grad_norm": 0.13602993676987202,
"learning_rate": 5e-05,
"loss": 1.5963,
"step": 1902
},
{
"epoch": 1.9194475249521121,
"grad_norm": 0.13091353897349692,
"learning_rate": 5e-05,
"loss": 1.6103,
"step": 1903
},
{
"epoch": 1.920455691097893,
"grad_norm": 0.13147459168353956,
"learning_rate": 5e-05,
"loss": 1.5896,
"step": 1904
},
{
"epoch": 1.9214638572436737,
"grad_norm": 0.1278349356366095,
"learning_rate": 5e-05,
"loss": 1.585,
"step": 1905
},
{
"epoch": 1.9224720233894546,
"grad_norm": 0.13556216612236158,
"learning_rate": 5e-05,
"loss": 1.5997,
"step": 1906
},
{
"epoch": 1.9234801895352354,
"grad_norm": 0.13786564534521312,
"learning_rate": 5e-05,
"loss": 1.6049,
"step": 1907
},
{
"epoch": 1.9244883556810164,
"grad_norm": 0.13361588168189525,
"learning_rate": 5e-05,
"loss": 1.6127,
"step": 1908
},
{
"epoch": 1.9254965218267972,
"grad_norm": 0.12569201204576966,
"learning_rate": 5e-05,
"loss": 1.5928,
"step": 1909
},
{
"epoch": 1.926504687972578,
"grad_norm": 0.152867731337792,
"learning_rate": 5e-05,
"loss": 1.6042,
"step": 1910
},
{
"epoch": 1.9275128541183588,
"grad_norm": 0.13575798302417755,
"learning_rate": 5e-05,
"loss": 1.5977,
"step": 1911
},
{
"epoch": 1.9285210202641396,
"grad_norm": 0.1398337969996516,
"learning_rate": 5e-05,
"loss": 1.6014,
"step": 1912
},
{
"epoch": 1.9295291864099204,
"grad_norm": 0.12623522408596025,
"learning_rate": 5e-05,
"loss": 1.6017,
"step": 1913
},
{
"epoch": 1.9305373525557012,
"grad_norm": 0.13172330015779796,
"learning_rate": 5e-05,
"loss": 1.5824,
"step": 1914
},
{
"epoch": 1.931545518701482,
"grad_norm": 0.13863466103596936,
"learning_rate": 5e-05,
"loss": 1.6083,
"step": 1915
},
{
"epoch": 1.9325536848472629,
"grad_norm": 0.1289112695594299,
"learning_rate": 5e-05,
"loss": 1.6057,
"step": 1916
},
{
"epoch": 1.9335618509930437,
"grad_norm": 0.125376155275176,
"learning_rate": 5e-05,
"loss": 1.6015,
"step": 1917
},
{
"epoch": 1.9345700171388245,
"grad_norm": 0.1269564764984299,
"learning_rate": 5e-05,
"loss": 1.6147,
"step": 1918
},
{
"epoch": 1.9355781832846053,
"grad_norm": 0.12566810938900913,
"learning_rate": 5e-05,
"loss": 1.5841,
"step": 1919
},
{
"epoch": 1.936586349430386,
"grad_norm": 0.13117779231765533,
"learning_rate": 5e-05,
"loss": 1.5935,
"step": 1920
},
{
"epoch": 1.937594515576167,
"grad_norm": 0.13386327063924972,
"learning_rate": 5e-05,
"loss": 1.5995,
"step": 1921
},
{
"epoch": 1.9386026817219477,
"grad_norm": 0.1257136559733915,
"learning_rate": 5e-05,
"loss": 1.6112,
"step": 1922
},
{
"epoch": 1.9396108478677285,
"grad_norm": 0.13537512034613913,
"learning_rate": 5e-05,
"loss": 1.5972,
"step": 1923
},
{
"epoch": 1.9406190140135093,
"grad_norm": 0.1253377735548057,
"learning_rate": 5e-05,
"loss": 1.606,
"step": 1924
},
{
"epoch": 1.9416271801592901,
"grad_norm": 0.13805307841774989,
"learning_rate": 5e-05,
"loss": 1.6166,
"step": 1925
},
{
"epoch": 1.942635346305071,
"grad_norm": 0.14387345444798735,
"learning_rate": 5e-05,
"loss": 1.6043,
"step": 1926
},
{
"epoch": 1.9436435124508518,
"grad_norm": 0.14135640883511918,
"learning_rate": 5e-05,
"loss": 1.5987,
"step": 1927
},
{
"epoch": 1.9446516785966326,
"grad_norm": 0.137130239912801,
"learning_rate": 5e-05,
"loss": 1.6048,
"step": 1928
},
{
"epoch": 1.9456598447424136,
"grad_norm": 0.13528422418643588,
"learning_rate": 5e-05,
"loss": 1.5905,
"step": 1929
},
{
"epoch": 1.9466680108881944,
"grad_norm": 0.13217969456041886,
"learning_rate": 5e-05,
"loss": 1.5956,
"step": 1930
},
{
"epoch": 1.9476761770339752,
"grad_norm": 0.12844860444043849,
"learning_rate": 5e-05,
"loss": 1.5986,
"step": 1931
},
{
"epoch": 1.948684343179756,
"grad_norm": 0.13198107001437961,
"learning_rate": 5e-05,
"loss": 1.5972,
"step": 1932
},
{
"epoch": 1.9496925093255368,
"grad_norm": 0.14030130422292156,
"learning_rate": 5e-05,
"loss": 1.5947,
"step": 1933
},
{
"epoch": 1.9507006754713176,
"grad_norm": 0.13982594899557532,
"learning_rate": 5e-05,
"loss": 1.6082,
"step": 1934
},
{
"epoch": 1.9517088416170985,
"grad_norm": 0.13591588212366842,
"learning_rate": 5e-05,
"loss": 1.6009,
"step": 1935
},
{
"epoch": 1.9527170077628795,
"grad_norm": 0.14041955247040697,
"learning_rate": 5e-05,
"loss": 1.6068,
"step": 1936
},
{
"epoch": 1.9537251739086603,
"grad_norm": 0.12783435122430417,
"learning_rate": 5e-05,
"loss": 1.5945,
"step": 1937
},
{
"epoch": 1.954733340054441,
"grad_norm": 0.15003800100073741,
"learning_rate": 5e-05,
"loss": 1.5921,
"step": 1938
},
{
"epoch": 1.955741506200222,
"grad_norm": 0.13003233132983819,
"learning_rate": 5e-05,
"loss": 1.6035,
"step": 1939
},
{
"epoch": 1.9567496723460027,
"grad_norm": 0.13691493601420102,
"learning_rate": 5e-05,
"loss": 1.602,
"step": 1940
},
{
"epoch": 1.9577578384917835,
"grad_norm": 0.14181799516668003,
"learning_rate": 5e-05,
"loss": 1.615,
"step": 1941
},
{
"epoch": 1.9587660046375643,
"grad_norm": 0.12521176786439234,
"learning_rate": 5e-05,
"loss": 1.595,
"step": 1942
},
{
"epoch": 1.9597741707833451,
"grad_norm": 0.12916795703523762,
"learning_rate": 5e-05,
"loss": 1.6053,
"step": 1943
},
{
"epoch": 1.960782336929126,
"grad_norm": 0.13398015324125567,
"learning_rate": 5e-05,
"loss": 1.6093,
"step": 1944
},
{
"epoch": 1.9617905030749068,
"grad_norm": 0.12958030427407444,
"learning_rate": 5e-05,
"loss": 1.6057,
"step": 1945
},
{
"epoch": 1.9627986692206876,
"grad_norm": 0.12561514967589582,
"learning_rate": 5e-05,
"loss": 1.5809,
"step": 1946
},
{
"epoch": 1.9638068353664684,
"grad_norm": 0.13248125250714624,
"learning_rate": 5e-05,
"loss": 1.6067,
"step": 1947
},
{
"epoch": 1.9648150015122492,
"grad_norm": 0.14501357641489096,
"learning_rate": 5e-05,
"loss": 1.6124,
"step": 1948
},
{
"epoch": 1.96582316765803,
"grad_norm": 0.15592280691143984,
"learning_rate": 5e-05,
"loss": 1.6014,
"step": 1949
},
{
"epoch": 1.9668313338038108,
"grad_norm": 0.134906763274516,
"learning_rate": 5e-05,
"loss": 1.6183,
"step": 1950
},
{
"epoch": 1.9678394999495916,
"grad_norm": 0.14157220397428136,
"learning_rate": 5e-05,
"loss": 1.6093,
"step": 1951
},
{
"epoch": 1.9688476660953724,
"grad_norm": 0.14414118076516377,
"learning_rate": 5e-05,
"loss": 1.6088,
"step": 1952
},
{
"epoch": 1.9698558322411532,
"grad_norm": 0.13162906710490183,
"learning_rate": 5e-05,
"loss": 1.5969,
"step": 1953
},
{
"epoch": 1.970863998386934,
"grad_norm": 0.13250441324273587,
"learning_rate": 5e-05,
"loss": 1.6031,
"step": 1954
},
{
"epoch": 1.9718721645327149,
"grad_norm": 0.24250705772782222,
"learning_rate": 5e-05,
"loss": 1.6015,
"step": 1955
},
{
"epoch": 1.9728803306784957,
"grad_norm": 0.13645893773608062,
"learning_rate": 5e-05,
"loss": 1.6159,
"step": 1956
},
{
"epoch": 1.9738884968242767,
"grad_norm": 0.1371558712887575,
"learning_rate": 5e-05,
"loss": 1.6047,
"step": 1957
},
{
"epoch": 1.9748966629700575,
"grad_norm": 0.12471125980392224,
"learning_rate": 5e-05,
"loss": 1.5932,
"step": 1958
},
{
"epoch": 1.9759048291158383,
"grad_norm": 0.13982098444969587,
"learning_rate": 5e-05,
"loss": 1.595,
"step": 1959
},
{
"epoch": 1.9769129952616191,
"grad_norm": 0.13725684179927516,
"learning_rate": 5e-05,
"loss": 1.6089,
"step": 1960
},
{
"epoch": 1.9779211614074,
"grad_norm": 0.13982441728850534,
"learning_rate": 5e-05,
"loss": 1.619,
"step": 1961
},
{
"epoch": 1.9789293275531807,
"grad_norm": 0.15335757017626273,
"learning_rate": 5e-05,
"loss": 1.5956,
"step": 1962
},
{
"epoch": 1.9799374936989615,
"grad_norm": 0.1312542054924984,
"learning_rate": 5e-05,
"loss": 1.603,
"step": 1963
},
{
"epoch": 1.9809456598447426,
"grad_norm": 0.12820080032328318,
"learning_rate": 5e-05,
"loss": 1.6009,
"step": 1964
},
{
"epoch": 1.9819538259905234,
"grad_norm": 0.12787509611016895,
"learning_rate": 5e-05,
"loss": 1.6012,
"step": 1965
},
{
"epoch": 1.9829619921363042,
"grad_norm": 0.13478549723799105,
"learning_rate": 5e-05,
"loss": 1.5921,
"step": 1966
},
{
"epoch": 1.983970158282085,
"grad_norm": 0.12801241537455618,
"learning_rate": 5e-05,
"loss": 1.5773,
"step": 1967
},
{
"epoch": 1.9849783244278658,
"grad_norm": 0.13642931742379077,
"learning_rate": 5e-05,
"loss": 1.6076,
"step": 1968
},
{
"epoch": 1.9859864905736466,
"grad_norm": 0.13228520189020773,
"learning_rate": 5e-05,
"loss": 1.594,
"step": 1969
},
{
"epoch": 1.9869946567194274,
"grad_norm": 0.15214148479625955,
"learning_rate": 5e-05,
"loss": 1.6005,
"step": 1970
},
{
"epoch": 1.9880028228652082,
"grad_norm": 0.12704794193194915,
"learning_rate": 5e-05,
"loss": 1.5948,
"step": 1971
},
{
"epoch": 1.989010989010989,
"grad_norm": 0.14032856285895748,
"learning_rate": 5e-05,
"loss": 1.6169,
"step": 1972
},
{
"epoch": 1.9900191551567699,
"grad_norm": 0.1379011923261903,
"learning_rate": 5e-05,
"loss": 1.6093,
"step": 1973
},
{
"epoch": 1.9910273213025507,
"grad_norm": 0.15400627838079264,
"learning_rate": 5e-05,
"loss": 1.5877,
"step": 1974
},
{
"epoch": 1.9920354874483315,
"grad_norm": 0.13931815971769526,
"learning_rate": 5e-05,
"loss": 1.6181,
"step": 1975
},
{
"epoch": 1.9930436535941123,
"grad_norm": 0.14901278041832924,
"learning_rate": 5e-05,
"loss": 1.6117,
"step": 1976
},
{
"epoch": 1.994051819739893,
"grad_norm": 0.13392496239472437,
"learning_rate": 5e-05,
"loss": 1.5965,
"step": 1977
},
{
"epoch": 1.995059985885674,
"grad_norm": 0.14729085153751564,
"learning_rate": 5e-05,
"loss": 1.5922,
"step": 1978
},
{
"epoch": 1.9960681520314547,
"grad_norm": 0.1441581729480837,
"learning_rate": 5e-05,
"loss": 1.5931,
"step": 1979
},
{
"epoch": 1.9970763181772355,
"grad_norm": 0.14997561173196974,
"learning_rate": 5e-05,
"loss": 1.5888,
"step": 1980
},
{
"epoch": 1.9980844843230163,
"grad_norm": 0.14243833152383623,
"learning_rate": 5e-05,
"loss": 1.6121,
"step": 1981
},
{
"epoch": 1.9990926504687971,
"grad_norm": 0.13679601875916828,
"learning_rate": 5e-05,
"loss": 1.6121,
"step": 1982
},
{
"epoch": 2.001008166145781,
"grad_norm": 0.15239737325547434,
"learning_rate": 5e-05,
"loss": 3.149,
"step": 1983
},
{
"epoch": 2.0020163322915616,
"grad_norm": 0.1414434123590037,
"learning_rate": 5e-05,
"loss": 1.5731,
"step": 1984
},
{
"epoch": 2.0030244984373424,
"grad_norm": 0.14044131087245215,
"learning_rate": 5e-05,
"loss": 1.5719,
"step": 1985
},
{
"epoch": 2.0040326645831232,
"grad_norm": 0.14938263143092181,
"learning_rate": 5e-05,
"loss": 1.5797,
"step": 1986
},
{
"epoch": 2.005040830728904,
"grad_norm": 0.1480385858459359,
"learning_rate": 5e-05,
"loss": 1.5864,
"step": 1987
},
{
"epoch": 2.006048996874685,
"grad_norm": 0.15105987898215928,
"learning_rate": 5e-05,
"loss": 1.5857,
"step": 1988
},
{
"epoch": 2.0070571630204657,
"grad_norm": 0.15277805692282542,
"learning_rate": 5e-05,
"loss": 1.5877,
"step": 1989
},
{
"epoch": 2.0080653291662465,
"grad_norm": 0.14136226071223026,
"learning_rate": 5e-05,
"loss": 1.5922,
"step": 1990
},
{
"epoch": 2.0090734953120273,
"grad_norm": 0.15906443661641592,
"learning_rate": 5e-05,
"loss": 1.5765,
"step": 1991
},
{
"epoch": 2.010081661457808,
"grad_norm": 0.1390828662294288,
"learning_rate": 5e-05,
"loss": 1.5749,
"step": 1992
},
{
"epoch": 2.011089827603589,
"grad_norm": 0.13479860947934347,
"learning_rate": 5e-05,
"loss": 1.5758,
"step": 1993
},
{
"epoch": 2.0120979937493697,
"grad_norm": 0.14011583458011628,
"learning_rate": 5e-05,
"loss": 1.5849,
"step": 1994
},
{
"epoch": 2.0131061598951505,
"grad_norm": 0.14109946386905808,
"learning_rate": 5e-05,
"loss": 1.599,
"step": 1995
},
{
"epoch": 2.0141143260409313,
"grad_norm": 0.13679672267236298,
"learning_rate": 5e-05,
"loss": 1.594,
"step": 1996
},
{
"epoch": 2.0151224921867126,
"grad_norm": 0.14025024019966215,
"learning_rate": 5e-05,
"loss": 1.5715,
"step": 1997
},
{
"epoch": 2.0161306583324934,
"grad_norm": 0.14492555269221222,
"learning_rate": 5e-05,
"loss": 1.5849,
"step": 1998
},
{
"epoch": 2.017138824478274,
"grad_norm": 0.14225952288247473,
"learning_rate": 5e-05,
"loss": 1.5875,
"step": 1999
},
{
"epoch": 2.018146990624055,
"grad_norm": 0.12931382989804227,
"learning_rate": 5e-05,
"loss": 1.5862,
"step": 2000
},
{
"epoch": 2.019155156769836,
"grad_norm": 0.12570288310495115,
"learning_rate": 5e-05,
"loss": 1.582,
"step": 2001
},
{
"epoch": 2.0201633229156166,
"grad_norm": 0.128123634753698,
"learning_rate": 5e-05,
"loss": 1.5896,
"step": 2002
},
{
"epoch": 2.0211714890613974,
"grad_norm": 0.13997309593233415,
"learning_rate": 5e-05,
"loss": 1.5859,
"step": 2003
},
{
"epoch": 2.0221796552071782,
"grad_norm": 0.1277717156019842,
"learning_rate": 5e-05,
"loss": 1.5649,
"step": 2004
},
{
"epoch": 2.023187821352959,
"grad_norm": 0.1364969063859019,
"learning_rate": 5e-05,
"loss": 1.5711,
"step": 2005
},
{
"epoch": 2.02419598749874,
"grad_norm": 0.13159486148668112,
"learning_rate": 5e-05,
"loss": 1.5834,
"step": 2006
},
{
"epoch": 2.0252041536445207,
"grad_norm": 0.1400695599752123,
"learning_rate": 5e-05,
"loss": 1.5679,
"step": 2007
},
{
"epoch": 2.0262123197903015,
"grad_norm": 0.12494151796163276,
"learning_rate": 5e-05,
"loss": 1.5795,
"step": 2008
},
{
"epoch": 2.0272204859360823,
"grad_norm": 0.12971624165572987,
"learning_rate": 5e-05,
"loss": 1.5857,
"step": 2009
},
{
"epoch": 2.028228652081863,
"grad_norm": 0.14127100423090322,
"learning_rate": 5e-05,
"loss": 1.58,
"step": 2010
},
{
"epoch": 2.029236818227644,
"grad_norm": 0.12546115316106868,
"learning_rate": 5e-05,
"loss": 1.6,
"step": 2011
},
{
"epoch": 2.0302449843734247,
"grad_norm": 0.14428275735853666,
"learning_rate": 5e-05,
"loss": 1.5982,
"step": 2012
},
{
"epoch": 2.0312531505192055,
"grad_norm": 0.1332295837228725,
"learning_rate": 5e-05,
"loss": 1.5935,
"step": 2013
},
{
"epoch": 2.0322613166649863,
"grad_norm": 0.13597233433162517,
"learning_rate": 5e-05,
"loss": 1.5897,
"step": 2014
},
{
"epoch": 2.033269482810767,
"grad_norm": 0.13374651915633368,
"learning_rate": 5e-05,
"loss": 1.582,
"step": 2015
},
{
"epoch": 2.034277648956548,
"grad_norm": 1.0718429567439263,
"learning_rate": 5e-05,
"loss": 1.6054,
"step": 2016
},
{
"epoch": 2.0352858151023288,
"grad_norm": 0.1454753303776003,
"learning_rate": 5e-05,
"loss": 1.5744,
"step": 2017
},
{
"epoch": 2.0362939812481096,
"grad_norm": 1.3192993490814344,
"learning_rate": 5e-05,
"loss": 1.5846,
"step": 2018
},
{
"epoch": 2.0373021473938904,
"grad_norm": 0.15829490474116634,
"learning_rate": 5e-05,
"loss": 1.5645,
"step": 2019
},
{
"epoch": 2.038310313539671,
"grad_norm": 0.4166200931020022,
"learning_rate": 5e-05,
"loss": 1.5748,
"step": 2020
},
{
"epoch": 2.039318479685452,
"grad_norm": 0.16462236982161826,
"learning_rate": 5e-05,
"loss": 1.6045,
"step": 2021
},
{
"epoch": 2.040326645831233,
"grad_norm": 0.18848958751308395,
"learning_rate": 5e-05,
"loss": 1.5723,
"step": 2022
},
{
"epoch": 2.0413348119770136,
"grad_norm": 2.8691517912384406,
"learning_rate": 5e-05,
"loss": 1.5825,
"step": 2023
},
{
"epoch": 2.042342978122795,
"grad_norm": 0.1831129706751049,
"learning_rate": 5e-05,
"loss": 1.5909,
"step": 2024
},
{
"epoch": 2.0433511442685757,
"grad_norm": 0.1542464053659099,
"learning_rate": 5e-05,
"loss": 1.5815,
"step": 2025
},
{
"epoch": 2.0443593104143565,
"grad_norm": 0.16951371372735333,
"learning_rate": 5e-05,
"loss": 1.5849,
"step": 2026
},
{
"epoch": 2.0453674765601373,
"grad_norm": 0.15075713521771286,
"learning_rate": 5e-05,
"loss": 1.5829,
"step": 2027
},
{
"epoch": 2.046375642705918,
"grad_norm": 0.1710363964481069,
"learning_rate": 5e-05,
"loss": 1.5661,
"step": 2028
},
{
"epoch": 2.047383808851699,
"grad_norm": 0.1407350500918785,
"learning_rate": 5e-05,
"loss": 1.5794,
"step": 2029
},
{
"epoch": 2.0483919749974797,
"grad_norm": 0.12745889682392728,
"learning_rate": 5e-05,
"loss": 1.5856,
"step": 2030
},
{
"epoch": 2.0494001411432605,
"grad_norm": 0.1548959761618227,
"learning_rate": 5e-05,
"loss": 1.571,
"step": 2031
},
{
"epoch": 2.0504083072890413,
"grad_norm": 0.13805558786072203,
"learning_rate": 5e-05,
"loss": 1.5865,
"step": 2032
},
{
"epoch": 2.051416473434822,
"grad_norm": 0.13469138055910054,
"learning_rate": 5e-05,
"loss": 1.5706,
"step": 2033
},
{
"epoch": 2.052424639580603,
"grad_norm": 0.13611480781385657,
"learning_rate": 5e-05,
"loss": 1.5763,
"step": 2034
},
{
"epoch": 2.0534328057263838,
"grad_norm": 0.12987461003973538,
"learning_rate": 5e-05,
"loss": 1.574,
"step": 2035
},
{
"epoch": 2.0544409718721646,
"grad_norm": 0.1394014787871454,
"learning_rate": 5e-05,
"loss": 1.6029,
"step": 2036
},
{
"epoch": 2.0554491380179454,
"grad_norm": 0.12930904952960037,
"learning_rate": 5e-05,
"loss": 1.5956,
"step": 2037
},
{
"epoch": 2.056457304163726,
"grad_norm": 0.12879520510508266,
"learning_rate": 5e-05,
"loss": 1.5755,
"step": 2038
},
{
"epoch": 2.057465470309507,
"grad_norm": 0.134482974278635,
"learning_rate": 5e-05,
"loss": 1.5903,
"step": 2039
},
{
"epoch": 2.058473636455288,
"grad_norm": 0.12654255401460732,
"learning_rate": 5e-05,
"loss": 1.5734,
"step": 2040
},
{
"epoch": 2.0594818026010686,
"grad_norm": 0.13279043549099975,
"learning_rate": 5e-05,
"loss": 1.575,
"step": 2041
},
{
"epoch": 2.0604899687468494,
"grad_norm": 0.1333429239635304,
"learning_rate": 5e-05,
"loss": 1.5925,
"step": 2042
},
{
"epoch": 2.0614981348926302,
"grad_norm": 0.1418866674864204,
"learning_rate": 5e-05,
"loss": 1.5896,
"step": 2043
},
{
"epoch": 2.062506301038411,
"grad_norm": 0.14434314231466377,
"learning_rate": 5e-05,
"loss": 1.588,
"step": 2044
},
{
"epoch": 2.063514467184192,
"grad_norm": 0.1344900582718443,
"learning_rate": 5e-05,
"loss": 1.5736,
"step": 2045
},
{
"epoch": 2.0645226333299727,
"grad_norm": 0.12807147404779445,
"learning_rate": 5e-05,
"loss": 1.5801,
"step": 2046
},
{
"epoch": 2.0655307994757535,
"grad_norm": 0.13979396715382578,
"learning_rate": 5e-05,
"loss": 1.5837,
"step": 2047
},
{
"epoch": 2.0665389656215343,
"grad_norm": 0.14404907785468482,
"learning_rate": 5e-05,
"loss": 1.6047,
"step": 2048
},
{
"epoch": 2.067547131767315,
"grad_norm": 0.13668043965508236,
"learning_rate": 5e-05,
"loss": 1.6051,
"step": 2049
},
{
"epoch": 2.068555297913096,
"grad_norm": 0.1351624797762825,
"learning_rate": 5e-05,
"loss": 1.5896,
"step": 2050
},
{
"epoch": 2.0695634640588767,
"grad_norm": 0.13984443389880155,
"learning_rate": 5e-05,
"loss": 1.5854,
"step": 2051
},
{
"epoch": 2.0705716302046575,
"grad_norm": 0.16648618218296007,
"learning_rate": 5e-05,
"loss": 1.5876,
"step": 2052
},
{
"epoch": 2.0715797963504388,
"grad_norm": 0.13088261845123292,
"learning_rate": 5e-05,
"loss": 1.5868,
"step": 2053
},
{
"epoch": 2.0725879624962196,
"grad_norm": 0.13745554033609456,
"learning_rate": 5e-05,
"loss": 1.5746,
"step": 2054
},
{
"epoch": 2.0735961286420004,
"grad_norm": 0.13532858724224328,
"learning_rate": 5e-05,
"loss": 1.5877,
"step": 2055
},
{
"epoch": 2.074604294787781,
"grad_norm": 0.15855367646124624,
"learning_rate": 5e-05,
"loss": 1.5802,
"step": 2056
},
{
"epoch": 2.075612460933562,
"grad_norm": 0.12206776230979091,
"learning_rate": 5e-05,
"loss": 1.5795,
"step": 2057
},
{
"epoch": 2.076620627079343,
"grad_norm": 0.13082081367498316,
"learning_rate": 5e-05,
"loss": 1.5909,
"step": 2058
},
{
"epoch": 2.0776287932251236,
"grad_norm": 0.14240441263897932,
"learning_rate": 5e-05,
"loss": 1.5802,
"step": 2059
},
{
"epoch": 2.0786369593709044,
"grad_norm": 0.12811284544223986,
"learning_rate": 5e-05,
"loss": 1.5766,
"step": 2060
},
{
"epoch": 2.0796451255166852,
"grad_norm": 4.17170817033231,
"learning_rate": 5e-05,
"loss": 1.5797,
"step": 2061
},
{
"epoch": 2.080653291662466,
"grad_norm": 0.1721950101628573,
"learning_rate": 5e-05,
"loss": 1.587,
"step": 2062
},
{
"epoch": 2.081661457808247,
"grad_norm": 0.13185518863717982,
"learning_rate": 5e-05,
"loss": 1.5774,
"step": 2063
},
{
"epoch": 2.0826696239540277,
"grad_norm": 0.146560571379586,
"learning_rate": 5e-05,
"loss": 1.5938,
"step": 2064
},
{
"epoch": 2.0836777900998085,
"grad_norm": 0.14846207117290025,
"learning_rate": 5e-05,
"loss": 1.5758,
"step": 2065
},
{
"epoch": 2.0846859562455893,
"grad_norm": 0.1311640077841521,
"learning_rate": 5e-05,
"loss": 1.5809,
"step": 2066
},
{
"epoch": 2.08569412239137,
"grad_norm": 0.13241479202761453,
"learning_rate": 5e-05,
"loss": 1.6085,
"step": 2067
},
{
"epoch": 2.086702288537151,
"grad_norm": 0.1312992668660464,
"learning_rate": 5e-05,
"loss": 1.5771,
"step": 2068
},
{
"epoch": 2.0877104546829317,
"grad_norm": 0.1352480938058842,
"learning_rate": 5e-05,
"loss": 1.5829,
"step": 2069
},
{
"epoch": 2.0887186208287125,
"grad_norm": 0.1333541128455232,
"learning_rate": 5e-05,
"loss": 1.6096,
"step": 2070
},
{
"epoch": 2.0897267869744933,
"grad_norm": 0.14180592326583852,
"learning_rate": 5e-05,
"loss": 1.581,
"step": 2071
},
{
"epoch": 2.090734953120274,
"grad_norm": 0.13239894577254888,
"learning_rate": 5e-05,
"loss": 1.5804,
"step": 2072
},
{
"epoch": 2.091743119266055,
"grad_norm": 0.12376712319547249,
"learning_rate": 5e-05,
"loss": 1.5701,
"step": 2073
},
{
"epoch": 2.0927512854118357,
"grad_norm": 0.12850324197745347,
"learning_rate": 5e-05,
"loss": 1.5788,
"step": 2074
},
{
"epoch": 2.0937594515576166,
"grad_norm": 0.13756263568523414,
"learning_rate": 5e-05,
"loss": 1.595,
"step": 2075
},
{
"epoch": 2.0947676177033974,
"grad_norm": 0.13632894139290583,
"learning_rate": 5e-05,
"loss": 1.5792,
"step": 2076
},
{
"epoch": 2.095775783849178,
"grad_norm": 0.1377732804776425,
"learning_rate": 5e-05,
"loss": 1.5925,
"step": 2077
},
{
"epoch": 2.096783949994959,
"grad_norm": 0.1265264861915451,
"learning_rate": 5e-05,
"loss": 1.5803,
"step": 2078
},
{
"epoch": 2.09779211614074,
"grad_norm": 0.1373811074844198,
"learning_rate": 5e-05,
"loss": 1.5895,
"step": 2079
},
{
"epoch": 2.098800282286521,
"grad_norm": 0.16079023863605688,
"learning_rate": 5e-05,
"loss": 1.5761,
"step": 2080
},
{
"epoch": 2.099808448432302,
"grad_norm": 0.1287996846512774,
"learning_rate": 5e-05,
"loss": 1.5884,
"step": 2081
},
{
"epoch": 2.1008166145780827,
"grad_norm": 0.13967937800715108,
"learning_rate": 5e-05,
"loss": 1.584,
"step": 2082
},
{
"epoch": 2.1018247807238635,
"grad_norm": 0.20316101494970074,
"learning_rate": 5e-05,
"loss": 1.5841,
"step": 2083
},
{
"epoch": 2.1028329468696443,
"grad_norm": 0.1351942953234323,
"learning_rate": 5e-05,
"loss": 1.5961,
"step": 2084
},
{
"epoch": 2.103841113015425,
"grad_norm": 0.1349643106789439,
"learning_rate": 5e-05,
"loss": 1.6004,
"step": 2085
},
{
"epoch": 2.104849279161206,
"grad_norm": 0.13766524752091427,
"learning_rate": 5e-05,
"loss": 1.5861,
"step": 2086
},
{
"epoch": 2.1058574453069867,
"grad_norm": 0.14504842619366753,
"learning_rate": 5e-05,
"loss": 1.569,
"step": 2087
},
{
"epoch": 2.1068656114527675,
"grad_norm": 0.1397972647089506,
"learning_rate": 5e-05,
"loss": 1.577,
"step": 2088
},
{
"epoch": 2.1078737775985483,
"grad_norm": 0.14666718158867315,
"learning_rate": 5e-05,
"loss": 1.5732,
"step": 2089
},
{
"epoch": 2.108881943744329,
"grad_norm": 0.14788272692590154,
"learning_rate": 5e-05,
"loss": 1.5743,
"step": 2090
},
{
"epoch": 2.10989010989011,
"grad_norm": 0.1359914376560995,
"learning_rate": 5e-05,
"loss": 1.5776,
"step": 2091
},
{
"epoch": 2.1108982760358908,
"grad_norm": 0.13778006607100088,
"learning_rate": 5e-05,
"loss": 1.5765,
"step": 2092
},
{
"epoch": 2.1119064421816716,
"grad_norm": 0.1368601787177404,
"learning_rate": 5e-05,
"loss": 1.5898,
"step": 2093
},
{
"epoch": 2.1129146083274524,
"grad_norm": 0.1388394060190795,
"learning_rate": 5e-05,
"loss": 1.5891,
"step": 2094
},
{
"epoch": 2.113922774473233,
"grad_norm": 0.14575647068642428,
"learning_rate": 5e-05,
"loss": 1.5689,
"step": 2095
},
{
"epoch": 2.114930940619014,
"grad_norm": 0.14607699343754782,
"learning_rate": 5e-05,
"loss": 1.5795,
"step": 2096
},
{
"epoch": 2.115939106764795,
"grad_norm": 0.1391717279301115,
"learning_rate": 5e-05,
"loss": 1.5871,
"step": 2097
},
{
"epoch": 2.1169472729105756,
"grad_norm": 0.1325220561356466,
"learning_rate": 5e-05,
"loss": 1.5864,
"step": 2098
},
{
"epoch": 2.1179554390563564,
"grad_norm": 0.1518234847245551,
"learning_rate": 5e-05,
"loss": 1.577,
"step": 2099
},
{
"epoch": 2.1189636052021372,
"grad_norm": 0.13544712006560455,
"learning_rate": 5e-05,
"loss": 1.5939,
"step": 2100
},
{
"epoch": 2.119971771347918,
"grad_norm": 0.1349862559901404,
"learning_rate": 5e-05,
"loss": 1.6002,
"step": 2101
},
{
"epoch": 2.120979937493699,
"grad_norm": 0.13453836426341287,
"learning_rate": 5e-05,
"loss": 1.5763,
"step": 2102
},
{
"epoch": 2.1219881036394797,
"grad_norm": 2.960637713596343,
"learning_rate": 5e-05,
"loss": 1.6011,
"step": 2103
},
{
"epoch": 2.1229962697852605,
"grad_norm": 0.15059737751300753,
"learning_rate": 5e-05,
"loss": 1.5764,
"step": 2104
},
{
"epoch": 2.1240044359310413,
"grad_norm": 0.12980284644941475,
"learning_rate": 5e-05,
"loss": 1.5908,
"step": 2105
},
{
"epoch": 2.125012602076822,
"grad_norm": 0.15228697770189628,
"learning_rate": 5e-05,
"loss": 1.5925,
"step": 2106
},
{
"epoch": 2.126020768222603,
"grad_norm": 0.12991652337590776,
"learning_rate": 5e-05,
"loss": 1.5722,
"step": 2107
},
{
"epoch": 2.1270289343683837,
"grad_norm": 0.13422596335523498,
"learning_rate": 5e-05,
"loss": 1.5842,
"step": 2108
},
{
"epoch": 2.128037100514165,
"grad_norm": 0.14033677209306625,
"learning_rate": 5e-05,
"loss": 1.6051,
"step": 2109
},
{
"epoch": 2.1290452666599458,
"grad_norm": 0.13554909833181647,
"learning_rate": 5e-05,
"loss": 1.5684,
"step": 2110
},
{
"epoch": 2.1300534328057266,
"grad_norm": 0.137669348086702,
"learning_rate": 5e-05,
"loss": 1.5865,
"step": 2111
},
{
"epoch": 2.1310615989515074,
"grad_norm": 0.12888124635141174,
"learning_rate": 5e-05,
"loss": 1.5723,
"step": 2112
},
{
"epoch": 2.132069765097288,
"grad_norm": 0.14405589627337975,
"learning_rate": 5e-05,
"loss": 1.5749,
"step": 2113
},
{
"epoch": 2.133077931243069,
"grad_norm": 0.13261819566358374,
"learning_rate": 5e-05,
"loss": 1.5961,
"step": 2114
},
{
"epoch": 2.13408609738885,
"grad_norm": 0.15633574445165777,
"learning_rate": 5e-05,
"loss": 1.5877,
"step": 2115
},
{
"epoch": 2.1350942635346306,
"grad_norm": 0.143612355437984,
"learning_rate": 5e-05,
"loss": 1.5826,
"step": 2116
},
{
"epoch": 2.1361024296804114,
"grad_norm": 0.13224621619113583,
"learning_rate": 5e-05,
"loss": 1.5885,
"step": 2117
},
{
"epoch": 2.1371105958261922,
"grad_norm": 0.1449013360115979,
"learning_rate": 5e-05,
"loss": 1.5805,
"step": 2118
},
{
"epoch": 2.138118761971973,
"grad_norm": 0.13555979751611189,
"learning_rate": 5e-05,
"loss": 1.5847,
"step": 2119
},
{
"epoch": 2.139126928117754,
"grad_norm": 0.14961995183462212,
"learning_rate": 5e-05,
"loss": 1.5903,
"step": 2120
},
{
"epoch": 2.1401350942635347,
"grad_norm": 0.13816884057531062,
"learning_rate": 5e-05,
"loss": 1.5834,
"step": 2121
},
{
"epoch": 2.1411432604093155,
"grad_norm": 0.12696546427922376,
"learning_rate": 5e-05,
"loss": 1.5897,
"step": 2122
},
{
"epoch": 2.1421514265550963,
"grad_norm": 0.1364021704158851,
"learning_rate": 5e-05,
"loss": 1.5808,
"step": 2123
},
{
"epoch": 2.143159592700877,
"grad_norm": 0.32059154960425695,
"learning_rate": 5e-05,
"loss": 1.5731,
"step": 2124
},
{
"epoch": 2.144167758846658,
"grad_norm": 0.1286960221199563,
"learning_rate": 5e-05,
"loss": 1.5722,
"step": 2125
},
{
"epoch": 2.1451759249924387,
"grad_norm": 0.13168440073637036,
"learning_rate": 5e-05,
"loss": 1.5784,
"step": 2126
},
{
"epoch": 2.1461840911382195,
"grad_norm": 0.12958498432639048,
"learning_rate": 5e-05,
"loss": 1.5662,
"step": 2127
},
{
"epoch": 2.1471922572840003,
"grad_norm": 0.12674408462329906,
"learning_rate": 5e-05,
"loss": 1.576,
"step": 2128
},
{
"epoch": 2.148200423429781,
"grad_norm": 0.1285194149227695,
"learning_rate": 5e-05,
"loss": 1.5878,
"step": 2129
},
{
"epoch": 2.149208589575562,
"grad_norm": 0.13258417774564543,
"learning_rate": 5e-05,
"loss": 1.5715,
"step": 2130
},
{
"epoch": 2.1502167557213427,
"grad_norm": 0.13196317225101575,
"learning_rate": 5e-05,
"loss": 1.5752,
"step": 2131
},
{
"epoch": 2.1512249218671236,
"grad_norm": 0.1327938664530659,
"learning_rate": 5e-05,
"loss": 1.5596,
"step": 2132
},
{
"epoch": 2.1522330880129044,
"grad_norm": 0.12022858158050774,
"learning_rate": 5e-05,
"loss": 1.607,
"step": 2133
},
{
"epoch": 2.153241254158685,
"grad_norm": 0.13503719807505016,
"learning_rate": 5e-05,
"loss": 1.5919,
"step": 2134
},
{
"epoch": 2.154249420304466,
"grad_norm": 0.13245288331895266,
"learning_rate": 5e-05,
"loss": 1.5653,
"step": 2135
},
{
"epoch": 2.1552575864502472,
"grad_norm": 0.1312546045670137,
"learning_rate": 5e-05,
"loss": 1.5931,
"step": 2136
},
{
"epoch": 2.1562657525960276,
"grad_norm": 0.13183928326778555,
"learning_rate": 5e-05,
"loss": 1.5893,
"step": 2137
},
{
"epoch": 2.157273918741809,
"grad_norm": 0.1317393731662259,
"learning_rate": 5e-05,
"loss": 1.5663,
"step": 2138
},
{
"epoch": 2.1582820848875897,
"grad_norm": 0.13923135753150415,
"learning_rate": 5e-05,
"loss": 1.6045,
"step": 2139
},
{
"epoch": 2.1592902510333705,
"grad_norm": 0.13848524773596155,
"learning_rate": 5e-05,
"loss": 1.5747,
"step": 2140
},
{
"epoch": 2.1602984171791513,
"grad_norm": 0.13263746512720395,
"learning_rate": 5e-05,
"loss": 1.5734,
"step": 2141
},
{
"epoch": 2.161306583324932,
"grad_norm": 0.13110665556573656,
"learning_rate": 5e-05,
"loss": 1.5906,
"step": 2142
},
{
"epoch": 2.162314749470713,
"grad_norm": 0.13151151142333048,
"learning_rate": 5e-05,
"loss": 1.5988,
"step": 2143
},
{
"epoch": 2.1633229156164937,
"grad_norm": 0.13200152470504317,
"learning_rate": 5e-05,
"loss": 1.5876,
"step": 2144
},
{
"epoch": 2.1643310817622745,
"grad_norm": 0.13533006578271542,
"learning_rate": 5e-05,
"loss": 1.5894,
"step": 2145
},
{
"epoch": 2.1653392479080553,
"grad_norm": 0.1284499300796628,
"learning_rate": 5e-05,
"loss": 1.5815,
"step": 2146
},
{
"epoch": 2.166347414053836,
"grad_norm": 0.13632214074228208,
"learning_rate": 5e-05,
"loss": 1.5804,
"step": 2147
},
{
"epoch": 2.167355580199617,
"grad_norm": 0.12791435267801157,
"learning_rate": 5e-05,
"loss": 1.5779,
"step": 2148
},
{
"epoch": 2.1683637463453977,
"grad_norm": 0.1473984814477361,
"learning_rate": 5e-05,
"loss": 1.5879,
"step": 2149
},
{
"epoch": 2.1693719124911786,
"grad_norm": 0.13749322099010552,
"learning_rate": 5e-05,
"loss": 1.5867,
"step": 2150
},
{
"epoch": 2.1703800786369594,
"grad_norm": 0.1461564315809883,
"learning_rate": 5e-05,
"loss": 1.5771,
"step": 2151
},
{
"epoch": 2.17138824478274,
"grad_norm": 0.1458481299707068,
"learning_rate": 5e-05,
"loss": 1.5897,
"step": 2152
},
{
"epoch": 2.172396410928521,
"grad_norm": 0.1498709317955309,
"learning_rate": 5e-05,
"loss": 1.5626,
"step": 2153
},
{
"epoch": 2.173404577074302,
"grad_norm": 0.1492508722779085,
"learning_rate": 5e-05,
"loss": 1.6004,
"step": 2154
},
{
"epoch": 2.1744127432200826,
"grad_norm": 0.13424273422951521,
"learning_rate": 5e-05,
"loss": 1.579,
"step": 2155
},
{
"epoch": 2.1754209093658634,
"grad_norm": 0.13508285975796092,
"learning_rate": 5e-05,
"loss": 1.572,
"step": 2156
},
{
"epoch": 2.176429075511644,
"grad_norm": 0.14287579327034905,
"learning_rate": 5e-05,
"loss": 1.565,
"step": 2157
},
{
"epoch": 2.177437241657425,
"grad_norm": 0.15647576630368834,
"learning_rate": 5e-05,
"loss": 1.601,
"step": 2158
},
{
"epoch": 2.178445407803206,
"grad_norm": 0.13642422731103188,
"learning_rate": 5e-05,
"loss": 1.5608,
"step": 2159
},
{
"epoch": 2.1794535739489866,
"grad_norm": 0.15150847011968074,
"learning_rate": 5e-05,
"loss": 1.58,
"step": 2160
},
{
"epoch": 2.1804617400947675,
"grad_norm": 0.13834478868061467,
"learning_rate": 5e-05,
"loss": 1.5742,
"step": 2161
},
{
"epoch": 2.1814699062405483,
"grad_norm": 0.1329070692603562,
"learning_rate": 5e-05,
"loss": 1.5851,
"step": 2162
},
{
"epoch": 2.182478072386329,
"grad_norm": 0.1442442344587271,
"learning_rate": 5e-05,
"loss": 1.5689,
"step": 2163
},
{
"epoch": 2.18348623853211,
"grad_norm": 0.13450680290698083,
"learning_rate": 5e-05,
"loss": 1.5923,
"step": 2164
},
{
"epoch": 2.184494404677891,
"grad_norm": 0.13365578601135725,
"learning_rate": 5e-05,
"loss": 1.586,
"step": 2165
},
{
"epoch": 2.185502570823672,
"grad_norm": 0.13122827567534617,
"learning_rate": 5e-05,
"loss": 1.5887,
"step": 2166
},
{
"epoch": 2.1865107369694528,
"grad_norm": 0.1324478292442316,
"learning_rate": 5e-05,
"loss": 1.5651,
"step": 2167
},
{
"epoch": 2.1875189031152336,
"grad_norm": 0.13095148860474917,
"learning_rate": 5e-05,
"loss": 1.57,
"step": 2168
},
{
"epoch": 2.1885270692610144,
"grad_norm": 0.13487743576121505,
"learning_rate": 5e-05,
"loss": 1.5847,
"step": 2169
},
{
"epoch": 2.189535235406795,
"grad_norm": 0.13851027435124633,
"learning_rate": 5e-05,
"loss": 1.5942,
"step": 2170
},
{
"epoch": 2.190543401552576,
"grad_norm": 0.13806832538791874,
"learning_rate": 5e-05,
"loss": 1.5841,
"step": 2171
},
{
"epoch": 2.191551567698357,
"grad_norm": 0.136314059375051,
"learning_rate": 5e-05,
"loss": 1.567,
"step": 2172
},
{
"epoch": 2.1925597338441376,
"grad_norm": 0.13767914337656698,
"learning_rate": 5e-05,
"loss": 1.5817,
"step": 2173
},
{
"epoch": 2.1935678999899184,
"grad_norm": 0.13590218518992947,
"learning_rate": 5e-05,
"loss": 1.5909,
"step": 2174
},
{
"epoch": 2.194576066135699,
"grad_norm": 0.140979551198686,
"learning_rate": 5e-05,
"loss": 1.5849,
"step": 2175
},
{
"epoch": 2.19558423228148,
"grad_norm": 0.1351539370773555,
"learning_rate": 5e-05,
"loss": 1.5887,
"step": 2176
},
{
"epoch": 2.196592398427261,
"grad_norm": 0.13522254246517385,
"learning_rate": 5e-05,
"loss": 1.5631,
"step": 2177
},
{
"epoch": 2.1976005645730416,
"grad_norm": 0.1341256457939,
"learning_rate": 5e-05,
"loss": 1.5681,
"step": 2178
},
{
"epoch": 2.1986087307188225,
"grad_norm": 0.182496231321001,
"learning_rate": 5e-05,
"loss": 1.5728,
"step": 2179
},
{
"epoch": 2.1996168968646033,
"grad_norm": 0.1301106186108056,
"learning_rate": 5e-05,
"loss": 1.5915,
"step": 2180
},
{
"epoch": 2.200625063010384,
"grad_norm": 0.24496089985012215,
"learning_rate": 5e-05,
"loss": 1.5783,
"step": 2181
},
{
"epoch": 2.201633229156165,
"grad_norm": 0.13970245980834814,
"learning_rate": 5e-05,
"loss": 1.5774,
"step": 2182
},
{
"epoch": 2.2026413953019457,
"grad_norm": 0.13368084450749423,
"learning_rate": 5e-05,
"loss": 1.5732,
"step": 2183
},
{
"epoch": 2.2036495614477265,
"grad_norm": 0.14027167656602788,
"learning_rate": 5e-05,
"loss": 1.5801,
"step": 2184
},
{
"epoch": 2.2046577275935073,
"grad_norm": 0.12608760050047663,
"learning_rate": 5e-05,
"loss": 1.565,
"step": 2185
},
{
"epoch": 2.205665893739288,
"grad_norm": 0.1400159673728019,
"learning_rate": 5e-05,
"loss": 1.591,
"step": 2186
},
{
"epoch": 2.206674059885069,
"grad_norm": 0.1280770249793632,
"learning_rate": 5e-05,
"loss": 1.5874,
"step": 2187
},
{
"epoch": 2.2076822260308497,
"grad_norm": 0.13005190751973192,
"learning_rate": 5e-05,
"loss": 1.5717,
"step": 2188
},
{
"epoch": 2.2086903921766305,
"grad_norm": 0.13149320702928272,
"learning_rate": 5e-05,
"loss": 1.5892,
"step": 2189
},
{
"epoch": 2.2096985583224114,
"grad_norm": 0.13944948801880389,
"learning_rate": 5e-05,
"loss": 1.5683,
"step": 2190
},
{
"epoch": 2.210706724468192,
"grad_norm": 0.1347679049234339,
"learning_rate": 5e-05,
"loss": 1.5784,
"step": 2191
},
{
"epoch": 2.2117148906139734,
"grad_norm": 0.13761469202598953,
"learning_rate": 5e-05,
"loss": 1.5801,
"step": 2192
},
{
"epoch": 2.212723056759754,
"grad_norm": 0.1359688017286691,
"learning_rate": 5e-05,
"loss": 1.5811,
"step": 2193
},
{
"epoch": 2.213731222905535,
"grad_norm": 0.13710284930832534,
"learning_rate": 5e-05,
"loss": 1.578,
"step": 2194
},
{
"epoch": 2.214739389051316,
"grad_norm": 0.13307043588957998,
"learning_rate": 5e-05,
"loss": 1.5851,
"step": 2195
},
{
"epoch": 2.2157475551970967,
"grad_norm": 0.13932056597633424,
"learning_rate": 5e-05,
"loss": 1.5768,
"step": 2196
},
{
"epoch": 2.2167557213428775,
"grad_norm": 0.14029489437072842,
"learning_rate": 5e-05,
"loss": 1.5786,
"step": 2197
},
{
"epoch": 2.2177638874886583,
"grad_norm": 0.12511079910671596,
"learning_rate": 5e-05,
"loss": 1.5635,
"step": 2198
},
{
"epoch": 2.218772053634439,
"grad_norm": 0.14343716835451112,
"learning_rate": 5e-05,
"loss": 1.5872,
"step": 2199
},
{
"epoch": 2.21978021978022,
"grad_norm": 0.2178821073538235,
"learning_rate": 5e-05,
"loss": 1.5571,
"step": 2200
},
{
"epoch": 2.2207883859260007,
"grad_norm": 0.13920646180211935,
"learning_rate": 5e-05,
"loss": 1.5638,
"step": 2201
},
{
"epoch": 2.2217965520717815,
"grad_norm": 0.14362100346423048,
"learning_rate": 5e-05,
"loss": 1.5828,
"step": 2202
},
{
"epoch": 2.2228047182175623,
"grad_norm": 0.14703335119452382,
"learning_rate": 5e-05,
"loss": 1.5693,
"step": 2203
},
{
"epoch": 2.223812884363343,
"grad_norm": 0.14195761111941657,
"learning_rate": 5e-05,
"loss": 1.5727,
"step": 2204
},
{
"epoch": 2.224821050509124,
"grad_norm": 0.13970507344162816,
"learning_rate": 5e-05,
"loss": 1.5999,
"step": 2205
},
{
"epoch": 2.2258292166549047,
"grad_norm": 0.14852447157811244,
"learning_rate": 5e-05,
"loss": 1.5775,
"step": 2206
},
{
"epoch": 2.2268373828006855,
"grad_norm": 0.1363828903790539,
"learning_rate": 5e-05,
"loss": 1.583,
"step": 2207
},
{
"epoch": 2.2278455489464664,
"grad_norm": 0.1481544737328289,
"learning_rate": 5e-05,
"loss": 1.5798,
"step": 2208
},
{
"epoch": 2.228853715092247,
"grad_norm": 0.1374534164490761,
"learning_rate": 5e-05,
"loss": 1.5645,
"step": 2209
},
{
"epoch": 2.229861881238028,
"grad_norm": 0.163200552421793,
"learning_rate": 5e-05,
"loss": 1.5766,
"step": 2210
},
{
"epoch": 2.230870047383809,
"grad_norm": 0.12232954664324697,
"learning_rate": 5e-05,
"loss": 1.5907,
"step": 2211
},
{
"epoch": 2.2318782135295896,
"grad_norm": 0.13950948739812444,
"learning_rate": 5e-05,
"loss": 1.5693,
"step": 2212
},
{
"epoch": 2.2328863796753704,
"grad_norm": 0.14155762951252868,
"learning_rate": 5e-05,
"loss": 1.5744,
"step": 2213
},
{
"epoch": 2.233894545821151,
"grad_norm": 0.1472663258665249,
"learning_rate": 5e-05,
"loss": 1.5624,
"step": 2214
},
{
"epoch": 2.234902711966932,
"grad_norm": 0.1395352568904782,
"learning_rate": 5e-05,
"loss": 1.5837,
"step": 2215
},
{
"epoch": 2.235910878112713,
"grad_norm": 0.14155823128431386,
"learning_rate": 5e-05,
"loss": 1.5628,
"step": 2216
},
{
"epoch": 2.2369190442584936,
"grad_norm": 0.1252450124685863,
"learning_rate": 5e-05,
"loss": 1.5732,
"step": 2217
},
{
"epoch": 2.2379272104042744,
"grad_norm": 0.14270458613439232,
"learning_rate": 5e-05,
"loss": 1.5672,
"step": 2218
},
{
"epoch": 2.2389353765500553,
"grad_norm": 0.1327344509151647,
"learning_rate": 5e-05,
"loss": 1.5768,
"step": 2219
},
{
"epoch": 2.239943542695836,
"grad_norm": 0.14289844364572002,
"learning_rate": 5e-05,
"loss": 1.5655,
"step": 2220
},
{
"epoch": 2.2409517088416173,
"grad_norm": 0.13075263019525016,
"learning_rate": 5e-05,
"loss": 1.5631,
"step": 2221
},
{
"epoch": 2.241959874987398,
"grad_norm": 0.1353210741302557,
"learning_rate": 5e-05,
"loss": 1.5826,
"step": 2222
},
{
"epoch": 2.242968041133179,
"grad_norm": 0.12712135635024222,
"learning_rate": 5e-05,
"loss": 1.5672,
"step": 2223
},
{
"epoch": 2.2439762072789597,
"grad_norm": 0.13005000485849497,
"learning_rate": 5e-05,
"loss": 1.5779,
"step": 2224
},
{
"epoch": 2.2449843734247406,
"grad_norm": 0.13477545800901453,
"learning_rate": 5e-05,
"loss": 1.5872,
"step": 2225
},
{
"epoch": 2.2459925395705214,
"grad_norm": 0.1455016663994394,
"learning_rate": 5e-05,
"loss": 1.5901,
"step": 2226
},
{
"epoch": 2.247000705716302,
"grad_norm": 0.13429071804096004,
"learning_rate": 5e-05,
"loss": 1.5639,
"step": 2227
},
{
"epoch": 2.248008871862083,
"grad_norm": 0.13385857234037593,
"learning_rate": 5e-05,
"loss": 1.5959,
"step": 2228
},
{
"epoch": 2.249017038007864,
"grad_norm": 0.13026162883941678,
"learning_rate": 5e-05,
"loss": 1.5955,
"step": 2229
},
{
"epoch": 2.2500252041536446,
"grad_norm": 0.15064310024726288,
"learning_rate": 5e-05,
"loss": 1.5859,
"step": 2230
},
{
"epoch": 2.2510333702994254,
"grad_norm": 0.13267969070820512,
"learning_rate": 5e-05,
"loss": 1.5738,
"step": 2231
},
{
"epoch": 2.252041536445206,
"grad_norm": 0.12875095223195912,
"learning_rate": 5e-05,
"loss": 1.5912,
"step": 2232
},
{
"epoch": 2.253049702590987,
"grad_norm": 0.1496068264954723,
"learning_rate": 5e-05,
"loss": 1.576,
"step": 2233
},
{
"epoch": 2.254057868736768,
"grad_norm": 0.13575255873767209,
"learning_rate": 5e-05,
"loss": 1.5781,
"step": 2234
},
{
"epoch": 2.2550660348825486,
"grad_norm": 0.1418476006839431,
"learning_rate": 5e-05,
"loss": 1.5879,
"step": 2235
},
{
"epoch": 2.2560742010283295,
"grad_norm": 0.1390998352061481,
"learning_rate": 5e-05,
"loss": 1.5647,
"step": 2236
},
{
"epoch": 2.2570823671741103,
"grad_norm": 0.14615130491475525,
"learning_rate": 5e-05,
"loss": 1.584,
"step": 2237
},
{
"epoch": 2.258090533319891,
"grad_norm": 0.12856984501618252,
"learning_rate": 5e-05,
"loss": 1.5756,
"step": 2238
},
{
"epoch": 2.259098699465672,
"grad_norm": 0.13665089453090834,
"learning_rate": 5e-05,
"loss": 1.5582,
"step": 2239
},
{
"epoch": 2.2601068656114527,
"grad_norm": 1.3474519747649263,
"learning_rate": 5e-05,
"loss": 1.5816,
"step": 2240
},
{
"epoch": 2.2611150317572335,
"grad_norm": 0.1448758915526553,
"learning_rate": 5e-05,
"loss": 1.5757,
"step": 2241
},
{
"epoch": 2.2621231979030143,
"grad_norm": 0.14092104981267023,
"learning_rate": 5e-05,
"loss": 1.5724,
"step": 2242
},
{
"epoch": 2.263131364048795,
"grad_norm": 0.1509143463685949,
"learning_rate": 5e-05,
"loss": 1.5728,
"step": 2243
},
{
"epoch": 2.264139530194576,
"grad_norm": 0.14023101809057317,
"learning_rate": 5e-05,
"loss": 1.5741,
"step": 2244
},
{
"epoch": 2.2651476963403567,
"grad_norm": 0.1484884492305928,
"learning_rate": 5e-05,
"loss": 1.5545,
"step": 2245
},
{
"epoch": 2.2661558624861375,
"grad_norm": 0.14920882778947953,
"learning_rate": 5e-05,
"loss": 1.5876,
"step": 2246
},
{
"epoch": 2.2671640286319183,
"grad_norm": 0.1632290484686048,
"learning_rate": 5e-05,
"loss": 1.5732,
"step": 2247
},
{
"epoch": 2.2681721947776996,
"grad_norm": 0.14978755710589833,
"learning_rate": 5e-05,
"loss": 1.6006,
"step": 2248
},
{
"epoch": 2.26918036092348,
"grad_norm": 0.14689789735003894,
"learning_rate": 5e-05,
"loss": 1.5757,
"step": 2249
},
{
"epoch": 2.270188527069261,
"grad_norm": 0.15144881512451783,
"learning_rate": 5e-05,
"loss": 1.5879,
"step": 2250
},
{
"epoch": 2.271196693215042,
"grad_norm": 0.15866554211155684,
"learning_rate": 5e-05,
"loss": 1.567,
"step": 2251
},
{
"epoch": 2.272204859360823,
"grad_norm": 0.14616524485747734,
"learning_rate": 5e-05,
"loss": 1.5915,
"step": 2252
},
{
"epoch": 2.2732130255066036,
"grad_norm": 0.13541258512111323,
"learning_rate": 5e-05,
"loss": 1.5909,
"step": 2253
},
{
"epoch": 2.2742211916523845,
"grad_norm": 0.14557208487414966,
"learning_rate": 5e-05,
"loss": 1.5753,
"step": 2254
},
{
"epoch": 2.2752293577981653,
"grad_norm": 0.14355832149078251,
"learning_rate": 5e-05,
"loss": 1.5777,
"step": 2255
},
{
"epoch": 2.276237523943946,
"grad_norm": 0.15534461022920001,
"learning_rate": 5e-05,
"loss": 1.582,
"step": 2256
},
{
"epoch": 2.277245690089727,
"grad_norm": 0.14040643758465315,
"learning_rate": 5e-05,
"loss": 1.5937,
"step": 2257
},
{
"epoch": 2.2782538562355077,
"grad_norm": 0.14217759030812158,
"learning_rate": 5e-05,
"loss": 1.5939,
"step": 2258
},
{
"epoch": 2.2792620223812885,
"grad_norm": 0.15321054452281707,
"learning_rate": 5e-05,
"loss": 1.5897,
"step": 2259
},
{
"epoch": 2.2802701885270693,
"grad_norm": 0.13411534025001054,
"learning_rate": 5e-05,
"loss": 1.571,
"step": 2260
},
{
"epoch": 2.28127835467285,
"grad_norm": 0.1264814479446074,
"learning_rate": 5e-05,
"loss": 1.5781,
"step": 2261
},
{
"epoch": 2.282286520818631,
"grad_norm": 0.13892914007499751,
"learning_rate": 5e-05,
"loss": 1.5753,
"step": 2262
},
{
"epoch": 2.2832946869644117,
"grad_norm": 0.13891896623161473,
"learning_rate": 5e-05,
"loss": 1.5754,
"step": 2263
},
{
"epoch": 2.2843028531101925,
"grad_norm": 0.13986597276910606,
"learning_rate": 5e-05,
"loss": 1.5846,
"step": 2264
},
{
"epoch": 2.2853110192559734,
"grad_norm": 0.1313651130008528,
"learning_rate": 5e-05,
"loss": 1.583,
"step": 2265
},
{
"epoch": 2.286319185401754,
"grad_norm": 0.14180149149457943,
"learning_rate": 5e-05,
"loss": 1.6002,
"step": 2266
},
{
"epoch": 2.287327351547535,
"grad_norm": 0.13664033183968566,
"learning_rate": 5e-05,
"loss": 1.565,
"step": 2267
},
{
"epoch": 2.288335517693316,
"grad_norm": 0.13106617230412848,
"learning_rate": 5e-05,
"loss": 1.5653,
"step": 2268
},
{
"epoch": 2.2893436838390966,
"grad_norm": 0.14110105058274033,
"learning_rate": 5e-05,
"loss": 1.5739,
"step": 2269
},
{
"epoch": 2.2903518499848774,
"grad_norm": 0.13246989225005226,
"learning_rate": 5e-05,
"loss": 1.5812,
"step": 2270
},
{
"epoch": 2.291360016130658,
"grad_norm": 0.12690842300537472,
"learning_rate": 5e-05,
"loss": 1.6001,
"step": 2271
},
{
"epoch": 2.292368182276439,
"grad_norm": 0.14061559722453193,
"learning_rate": 5e-05,
"loss": 1.5666,
"step": 2272
},
{
"epoch": 2.29337634842222,
"grad_norm": 0.13319571985220693,
"learning_rate": 5e-05,
"loss": 1.5843,
"step": 2273
},
{
"epoch": 2.2943845145680006,
"grad_norm": 0.12788817214469078,
"learning_rate": 5e-05,
"loss": 1.5789,
"step": 2274
},
{
"epoch": 2.2953926807137814,
"grad_norm": 0.12997493129548418,
"learning_rate": 5e-05,
"loss": 1.5625,
"step": 2275
},
{
"epoch": 2.2964008468595623,
"grad_norm": 0.13560915101486962,
"learning_rate": 5e-05,
"loss": 1.572,
"step": 2276
},
{
"epoch": 2.2974090130053435,
"grad_norm": 0.13228064336220804,
"learning_rate": 5e-05,
"loss": 1.5789,
"step": 2277
},
{
"epoch": 2.298417179151124,
"grad_norm": 0.1425843624741558,
"learning_rate": 5e-05,
"loss": 1.6008,
"step": 2278
},
{
"epoch": 2.299425345296905,
"grad_norm": 0.14131677728244713,
"learning_rate": 5e-05,
"loss": 1.5733,
"step": 2279
},
{
"epoch": 2.300433511442686,
"grad_norm": 0.14384822881880258,
"learning_rate": 5e-05,
"loss": 1.5723,
"step": 2280
},
{
"epoch": 2.3014416775884667,
"grad_norm": 0.13621106913387487,
"learning_rate": 5e-05,
"loss": 1.5816,
"step": 2281
},
{
"epoch": 2.3024498437342475,
"grad_norm": 0.18150249302366323,
"learning_rate": 5e-05,
"loss": 1.5868,
"step": 2282
},
{
"epoch": 2.3034580098800284,
"grad_norm": 0.13024559329255764,
"learning_rate": 5e-05,
"loss": 1.561,
"step": 2283
},
{
"epoch": 2.304466176025809,
"grad_norm": 0.14221434659857535,
"learning_rate": 5e-05,
"loss": 1.5815,
"step": 2284
},
{
"epoch": 2.30547434217159,
"grad_norm": 0.1243982975878934,
"learning_rate": 5e-05,
"loss": 1.5692,
"step": 2285
},
{
"epoch": 2.306482508317371,
"grad_norm": 0.14314562098995504,
"learning_rate": 5e-05,
"loss": 1.5928,
"step": 2286
},
{
"epoch": 2.3074906744631516,
"grad_norm": 0.1343242688971022,
"learning_rate": 5e-05,
"loss": 1.5871,
"step": 2287
},
{
"epoch": 2.3084988406089324,
"grad_norm": 0.1321173628116035,
"learning_rate": 5e-05,
"loss": 1.5944,
"step": 2288
},
{
"epoch": 2.309507006754713,
"grad_norm": 0.13453862969642263,
"learning_rate": 5e-05,
"loss": 1.5907,
"step": 2289
},
{
"epoch": 2.310515172900494,
"grad_norm": 0.13687930001775503,
"learning_rate": 5e-05,
"loss": 1.5894,
"step": 2290
},
{
"epoch": 2.311523339046275,
"grad_norm": 0.13215284755773643,
"learning_rate": 5e-05,
"loss": 1.5834,
"step": 2291
},
{
"epoch": 2.3125315051920556,
"grad_norm": 0.13579965157318924,
"learning_rate": 5e-05,
"loss": 1.5779,
"step": 2292
},
{
"epoch": 2.3135396713378364,
"grad_norm": 0.14328135334944211,
"learning_rate": 5e-05,
"loss": 1.5758,
"step": 2293
},
{
"epoch": 2.3145478374836173,
"grad_norm": 0.1325118478849696,
"learning_rate": 5e-05,
"loss": 1.5829,
"step": 2294
},
{
"epoch": 2.315556003629398,
"grad_norm": 0.1380410269853047,
"learning_rate": 5e-05,
"loss": 1.5791,
"step": 2295
},
{
"epoch": 2.316564169775179,
"grad_norm": 0.13875953603401303,
"learning_rate": 5e-05,
"loss": 1.5506,
"step": 2296
},
{
"epoch": 2.3175723359209597,
"grad_norm": 0.13467458290717993,
"learning_rate": 5e-05,
"loss": 1.5693,
"step": 2297
},
{
"epoch": 2.3185805020667405,
"grad_norm": 0.140163645245547,
"learning_rate": 5e-05,
"loss": 1.5794,
"step": 2298
},
{
"epoch": 2.3195886682125213,
"grad_norm": 0.1453337264089796,
"learning_rate": 5e-05,
"loss": 1.5605,
"step": 2299
},
{
"epoch": 2.320596834358302,
"grad_norm": 0.13839240934781297,
"learning_rate": 5e-05,
"loss": 1.5904,
"step": 2300
},
{
"epoch": 2.321605000504083,
"grad_norm": 0.14108100710419977,
"learning_rate": 5e-05,
"loss": 1.5721,
"step": 2301
},
{
"epoch": 2.3226131666498637,
"grad_norm": 0.1292487879613601,
"learning_rate": 5e-05,
"loss": 1.5786,
"step": 2302
},
{
"epoch": 2.3236213327956445,
"grad_norm": 0.1415731273313686,
"learning_rate": 5e-05,
"loss": 1.5663,
"step": 2303
},
{
"epoch": 2.324629498941426,
"grad_norm": 0.13192875328689405,
"learning_rate": 5e-05,
"loss": 1.5763,
"step": 2304
},
{
"epoch": 2.325637665087206,
"grad_norm": 0.13791037594694341,
"learning_rate": 5e-05,
"loss": 1.5748,
"step": 2305
},
{
"epoch": 2.3266458312329874,
"grad_norm": 0.132297965228094,
"learning_rate": 5e-05,
"loss": 1.5697,
"step": 2306
},
{
"epoch": 2.327653997378768,
"grad_norm": 0.12873243615415655,
"learning_rate": 5e-05,
"loss": 1.5615,
"step": 2307
},
{
"epoch": 2.328662163524549,
"grad_norm": 0.1565315534359242,
"learning_rate": 5e-05,
"loss": 1.5715,
"step": 2308
},
{
"epoch": 2.32967032967033,
"grad_norm": 0.22566782353722953,
"learning_rate": 5e-05,
"loss": 1.5741,
"step": 2309
},
{
"epoch": 2.3306784958161106,
"grad_norm": 0.14956726587127525,
"learning_rate": 5e-05,
"loss": 1.5733,
"step": 2310
},
{
"epoch": 2.3316866619618914,
"grad_norm": 0.15605951198053955,
"learning_rate": 5e-05,
"loss": 1.586,
"step": 2311
},
{
"epoch": 2.3326948281076723,
"grad_norm": 0.1407105414222044,
"learning_rate": 5e-05,
"loss": 1.5723,
"step": 2312
},
{
"epoch": 2.333702994253453,
"grad_norm": 0.15265911696540452,
"learning_rate": 5e-05,
"loss": 1.5642,
"step": 2313
},
{
"epoch": 2.334711160399234,
"grad_norm": 0.1381536437205612,
"learning_rate": 5e-05,
"loss": 1.5601,
"step": 2314
},
{
"epoch": 2.3357193265450147,
"grad_norm": 0.15228291932338708,
"learning_rate": 5e-05,
"loss": 1.5706,
"step": 2315
},
{
"epoch": 2.3367274926907955,
"grad_norm": 0.1306924937598232,
"learning_rate": 5e-05,
"loss": 1.5827,
"step": 2316
},
{
"epoch": 2.3377356588365763,
"grad_norm": 0.14001665585417195,
"learning_rate": 5e-05,
"loss": 1.5958,
"step": 2317
},
{
"epoch": 2.338743824982357,
"grad_norm": 0.13735707283706902,
"learning_rate": 5e-05,
"loss": 1.5848,
"step": 2318
},
{
"epoch": 2.339751991128138,
"grad_norm": 0.1556551079360177,
"learning_rate": 5e-05,
"loss": 1.5659,
"step": 2319
},
{
"epoch": 2.3407601572739187,
"grad_norm": 0.12912763501066377,
"learning_rate": 5e-05,
"loss": 1.5754,
"step": 2320
},
{
"epoch": 2.3417683234196995,
"grad_norm": 0.1387010095308894,
"learning_rate": 5e-05,
"loss": 1.5826,
"step": 2321
},
{
"epoch": 2.3427764895654803,
"grad_norm": 0.13707015278373597,
"learning_rate": 5e-05,
"loss": 1.5722,
"step": 2322
},
{
"epoch": 2.343784655711261,
"grad_norm": 0.13755459849656163,
"learning_rate": 5e-05,
"loss": 1.599,
"step": 2323
},
{
"epoch": 2.344792821857042,
"grad_norm": 0.1477816867261747,
"learning_rate": 5e-05,
"loss": 1.5929,
"step": 2324
},
{
"epoch": 2.3458009880028228,
"grad_norm": 0.14646723031107797,
"learning_rate": 5e-05,
"loss": 1.5688,
"step": 2325
},
{
"epoch": 2.3468091541486036,
"grad_norm": 0.13740302126433548,
"learning_rate": 5e-05,
"loss": 1.5705,
"step": 2326
},
{
"epoch": 2.3478173202943844,
"grad_norm": 0.14019264086801358,
"learning_rate": 5e-05,
"loss": 1.5734,
"step": 2327
},
{
"epoch": 2.348825486440165,
"grad_norm": 0.1467986736371059,
"learning_rate": 5e-05,
"loss": 1.5707,
"step": 2328
},
{
"epoch": 2.349833652585946,
"grad_norm": 0.14612680323301838,
"learning_rate": 5e-05,
"loss": 1.5849,
"step": 2329
},
{
"epoch": 2.350841818731727,
"grad_norm": 0.12608156150784167,
"learning_rate": 5e-05,
"loss": 1.5938,
"step": 2330
},
{
"epoch": 2.3518499848775076,
"grad_norm": 0.1396137084828096,
"learning_rate": 5e-05,
"loss": 1.5776,
"step": 2331
},
{
"epoch": 2.3528581510232884,
"grad_norm": 0.14697256462380895,
"learning_rate": 5e-05,
"loss": 1.5785,
"step": 2332
},
{
"epoch": 2.3538663171690697,
"grad_norm": 0.13318598995909164,
"learning_rate": 5e-05,
"loss": 1.5649,
"step": 2333
},
{
"epoch": 2.35487448331485,
"grad_norm": 0.13167374592526118,
"learning_rate": 5e-05,
"loss": 1.5588,
"step": 2334
},
{
"epoch": 2.3558826494606313,
"grad_norm": 0.13698685253970463,
"learning_rate": 5e-05,
"loss": 1.5496,
"step": 2335
},
{
"epoch": 2.356890815606412,
"grad_norm": 0.1452425046413725,
"learning_rate": 5e-05,
"loss": 1.5836,
"step": 2336
},
{
"epoch": 2.357898981752193,
"grad_norm": 0.1376201655979563,
"learning_rate": 5e-05,
"loss": 1.5726,
"step": 2337
},
{
"epoch": 2.3589071478979737,
"grad_norm": 0.1319396160325784,
"learning_rate": 5e-05,
"loss": 1.5648,
"step": 2338
},
{
"epoch": 2.3599153140437545,
"grad_norm": 0.13551922943268416,
"learning_rate": 5e-05,
"loss": 1.5738,
"step": 2339
},
{
"epoch": 2.3609234801895354,
"grad_norm": 0.13680512805774192,
"learning_rate": 5e-05,
"loss": 1.5667,
"step": 2340
},
{
"epoch": 2.361931646335316,
"grad_norm": 0.12812856649089038,
"learning_rate": 5e-05,
"loss": 1.5845,
"step": 2341
},
{
"epoch": 2.362939812481097,
"grad_norm": 0.13206644043547325,
"learning_rate": 5e-05,
"loss": 1.5612,
"step": 2342
},
{
"epoch": 2.3639479786268778,
"grad_norm": 0.13412558436594751,
"learning_rate": 5e-05,
"loss": 1.5779,
"step": 2343
},
{
"epoch": 2.3649561447726586,
"grad_norm": 0.13473004496883761,
"learning_rate": 5e-05,
"loss": 1.5746,
"step": 2344
},
{
"epoch": 2.3659643109184394,
"grad_norm": 0.13206568313033137,
"learning_rate": 5e-05,
"loss": 1.5781,
"step": 2345
},
{
"epoch": 2.36697247706422,
"grad_norm": 0.1486456193954694,
"learning_rate": 5e-05,
"loss": 1.568,
"step": 2346
},
{
"epoch": 2.367980643210001,
"grad_norm": 0.12754711366765203,
"learning_rate": 5e-05,
"loss": 1.5649,
"step": 2347
},
{
"epoch": 2.368988809355782,
"grad_norm": 0.13566146603000886,
"learning_rate": 5e-05,
"loss": 1.5655,
"step": 2348
},
{
"epoch": 2.3699969755015626,
"grad_norm": 0.13956288232984534,
"learning_rate": 5e-05,
"loss": 1.5731,
"step": 2349
},
{
"epoch": 2.3710051416473434,
"grad_norm": 0.1642712298546713,
"learning_rate": 5e-05,
"loss": 1.5786,
"step": 2350
},
{
"epoch": 2.3720133077931242,
"grad_norm": 0.1371958047601129,
"learning_rate": 5e-05,
"loss": 1.5597,
"step": 2351
},
{
"epoch": 2.373021473938905,
"grad_norm": 0.1431570975130207,
"learning_rate": 5e-05,
"loss": 1.5628,
"step": 2352
},
{
"epoch": 2.374029640084686,
"grad_norm": 0.12426697149592261,
"learning_rate": 5e-05,
"loss": 1.5779,
"step": 2353
},
{
"epoch": 2.3750378062304667,
"grad_norm": 0.14237237175042702,
"learning_rate": 5e-05,
"loss": 1.5786,
"step": 2354
},
{
"epoch": 2.3760459723762475,
"grad_norm": 0.1318323948497074,
"learning_rate": 5e-05,
"loss": 1.5625,
"step": 2355
},
{
"epoch": 2.3770541385220283,
"grad_norm": 0.13119991131073985,
"learning_rate": 5e-05,
"loss": 1.5647,
"step": 2356
},
{
"epoch": 2.378062304667809,
"grad_norm": 0.14500573229210184,
"learning_rate": 5e-05,
"loss": 1.5727,
"step": 2357
},
{
"epoch": 2.37907047081359,
"grad_norm": 0.13472149296359828,
"learning_rate": 5e-05,
"loss": 1.5687,
"step": 2358
},
{
"epoch": 2.3800786369593707,
"grad_norm": 0.13195202941241038,
"learning_rate": 5e-05,
"loss": 1.5926,
"step": 2359
},
{
"epoch": 2.381086803105152,
"grad_norm": 0.14237660486568385,
"learning_rate": 5e-05,
"loss": 1.5744,
"step": 2360
},
{
"epoch": 2.3820949692509323,
"grad_norm": 0.13706216336194096,
"learning_rate": 5e-05,
"loss": 1.5752,
"step": 2361
},
{
"epoch": 2.3831031353967136,
"grad_norm": 0.1492832019493269,
"learning_rate": 5e-05,
"loss": 1.5751,
"step": 2362
},
{
"epoch": 2.384111301542494,
"grad_norm": 0.13284967563267536,
"learning_rate": 5e-05,
"loss": 1.5828,
"step": 2363
},
{
"epoch": 2.385119467688275,
"grad_norm": 0.13781931262747582,
"learning_rate": 5e-05,
"loss": 1.5615,
"step": 2364
},
{
"epoch": 2.386127633834056,
"grad_norm": 0.13612007614009056,
"learning_rate": 5e-05,
"loss": 1.5774,
"step": 2365
},
{
"epoch": 2.387135799979837,
"grad_norm": 0.1447708769772133,
"learning_rate": 5e-05,
"loss": 1.59,
"step": 2366
},
{
"epoch": 2.3881439661256176,
"grad_norm": 0.13170398607372086,
"learning_rate": 5e-05,
"loss": 1.5639,
"step": 2367
},
{
"epoch": 2.3891521322713984,
"grad_norm": 0.14621589964890505,
"learning_rate": 5e-05,
"loss": 1.5792,
"step": 2368
},
{
"epoch": 2.3901602984171793,
"grad_norm": 0.1511773599153227,
"learning_rate": 5e-05,
"loss": 1.5746,
"step": 2369
},
{
"epoch": 2.39116846456296,
"grad_norm": 0.1458467606603414,
"learning_rate": 5e-05,
"loss": 1.5833,
"step": 2370
},
{
"epoch": 2.392176630708741,
"grad_norm": 0.14927910226780375,
"learning_rate": 5e-05,
"loss": 1.5585,
"step": 2371
},
{
"epoch": 2.3931847968545217,
"grad_norm": 0.1449109010627941,
"learning_rate": 5e-05,
"loss": 1.5763,
"step": 2372
},
{
"epoch": 2.3941929630003025,
"grad_norm": 0.13109405942336458,
"learning_rate": 5e-05,
"loss": 1.5581,
"step": 2373
},
{
"epoch": 2.3952011291460833,
"grad_norm": 0.13934861021734576,
"learning_rate": 5e-05,
"loss": 1.5674,
"step": 2374
},
{
"epoch": 2.396209295291864,
"grad_norm": 0.1446961784361111,
"learning_rate": 5e-05,
"loss": 1.5415,
"step": 2375
},
{
"epoch": 2.397217461437645,
"grad_norm": 0.13755035427062254,
"learning_rate": 5e-05,
"loss": 1.5687,
"step": 2376
},
{
"epoch": 2.3982256275834257,
"grad_norm": 0.12921600400332672,
"learning_rate": 5e-05,
"loss": 1.5843,
"step": 2377
},
{
"epoch": 2.3992337937292065,
"grad_norm": 0.12599830025256673,
"learning_rate": 5e-05,
"loss": 1.5836,
"step": 2378
},
{
"epoch": 2.4002419598749873,
"grad_norm": 0.14284986000034275,
"learning_rate": 5e-05,
"loss": 1.5692,
"step": 2379
},
{
"epoch": 2.401250126020768,
"grad_norm": 0.13384043727676295,
"learning_rate": 5e-05,
"loss": 1.5778,
"step": 2380
},
{
"epoch": 2.402258292166549,
"grad_norm": 0.12888164481531164,
"learning_rate": 5e-05,
"loss": 1.5788,
"step": 2381
},
{
"epoch": 2.4032664583123298,
"grad_norm": 0.13001360124841055,
"learning_rate": 5e-05,
"loss": 1.564,
"step": 2382
},
{
"epoch": 2.4042746244581106,
"grad_norm": 0.13855183595836684,
"learning_rate": 5e-05,
"loss": 1.5777,
"step": 2383
},
{
"epoch": 2.4052827906038914,
"grad_norm": 0.1355775003901884,
"learning_rate": 5e-05,
"loss": 1.5773,
"step": 2384
},
{
"epoch": 2.406290956749672,
"grad_norm": 0.13842090636379575,
"learning_rate": 5e-05,
"loss": 1.577,
"step": 2385
},
{
"epoch": 2.407299122895453,
"grad_norm": 0.1312234964077852,
"learning_rate": 5e-05,
"loss": 1.5805,
"step": 2386
},
{
"epoch": 2.408307289041234,
"grad_norm": 0.1480537482351519,
"learning_rate": 5e-05,
"loss": 1.5641,
"step": 2387
},
{
"epoch": 2.4093154551870146,
"grad_norm": 0.13563346503461496,
"learning_rate": 5e-05,
"loss": 1.5812,
"step": 2388
},
{
"epoch": 2.410323621332796,
"grad_norm": 0.1374414779066698,
"learning_rate": 5e-05,
"loss": 1.5775,
"step": 2389
},
{
"epoch": 2.4113317874785762,
"grad_norm": 0.13960415581308322,
"learning_rate": 5e-05,
"loss": 1.5545,
"step": 2390
},
{
"epoch": 2.4123399536243575,
"grad_norm": 0.13359670238450724,
"learning_rate": 5e-05,
"loss": 1.5834,
"step": 2391
},
{
"epoch": 2.4133481197701383,
"grad_norm": 0.1392195241675204,
"learning_rate": 5e-05,
"loss": 1.5594,
"step": 2392
},
{
"epoch": 2.414356285915919,
"grad_norm": 0.13247489393452228,
"learning_rate": 5e-05,
"loss": 1.5781,
"step": 2393
},
{
"epoch": 2.4153644520617,
"grad_norm": 0.1497906517108831,
"learning_rate": 5e-05,
"loss": 1.5744,
"step": 2394
},
{
"epoch": 2.4163726182074807,
"grad_norm": 0.12838007271692908,
"learning_rate": 5e-05,
"loss": 1.5749,
"step": 2395
},
{
"epoch": 2.4173807843532615,
"grad_norm": 0.1407089734768055,
"learning_rate": 5e-05,
"loss": 1.5792,
"step": 2396
},
{
"epoch": 2.4183889504990423,
"grad_norm": 0.1393752593452367,
"learning_rate": 5e-05,
"loss": 1.5811,
"step": 2397
},
{
"epoch": 2.419397116644823,
"grad_norm": 0.12964122325258895,
"learning_rate": 5e-05,
"loss": 1.5627,
"step": 2398
},
{
"epoch": 2.420405282790604,
"grad_norm": 0.13460702024917695,
"learning_rate": 5e-05,
"loss": 1.5689,
"step": 2399
},
{
"epoch": 2.4214134489363848,
"grad_norm": 0.13878826918050213,
"learning_rate": 5e-05,
"loss": 1.5736,
"step": 2400
},
{
"epoch": 2.4224216150821656,
"grad_norm": 0.12934976735747655,
"learning_rate": 5e-05,
"loss": 1.562,
"step": 2401
},
{
"epoch": 2.4234297812279464,
"grad_norm": 0.13563528648016107,
"learning_rate": 5e-05,
"loss": 1.584,
"step": 2402
},
{
"epoch": 2.424437947373727,
"grad_norm": 0.1635163479489948,
"learning_rate": 5e-05,
"loss": 1.5644,
"step": 2403
},
{
"epoch": 2.425446113519508,
"grad_norm": 0.1362603873202254,
"learning_rate": 5e-05,
"loss": 1.5709,
"step": 2404
},
{
"epoch": 2.426454279665289,
"grad_norm": 0.13705910067232868,
"learning_rate": 5e-05,
"loss": 1.5599,
"step": 2405
},
{
"epoch": 2.4274624458110696,
"grad_norm": 0.13238627727596283,
"learning_rate": 5e-05,
"loss": 1.567,
"step": 2406
},
{
"epoch": 2.4284706119568504,
"grad_norm": 0.1424503110347935,
"learning_rate": 5e-05,
"loss": 1.5827,
"step": 2407
},
{
"epoch": 2.4294787781026312,
"grad_norm": 0.14481999094444797,
"learning_rate": 5e-05,
"loss": 1.5802,
"step": 2408
},
{
"epoch": 2.430486944248412,
"grad_norm": 0.1254927464881309,
"learning_rate": 5e-05,
"loss": 1.5666,
"step": 2409
},
{
"epoch": 2.431495110394193,
"grad_norm": 0.14871450999559535,
"learning_rate": 5e-05,
"loss": 1.5794,
"step": 2410
},
{
"epoch": 2.4325032765399737,
"grad_norm": 0.13294327553010796,
"learning_rate": 5e-05,
"loss": 1.5715,
"step": 2411
},
{
"epoch": 2.4335114426857545,
"grad_norm": 0.13348530146232257,
"learning_rate": 5e-05,
"loss": 1.5764,
"step": 2412
},
{
"epoch": 2.4345196088315353,
"grad_norm": 0.14339091781597899,
"learning_rate": 5e-05,
"loss": 1.5721,
"step": 2413
},
{
"epoch": 2.435527774977316,
"grad_norm": 0.15100964315181148,
"learning_rate": 5e-05,
"loss": 1.5742,
"step": 2414
},
{
"epoch": 2.436535941123097,
"grad_norm": 0.1291649549852917,
"learning_rate": 5e-05,
"loss": 1.5665,
"step": 2415
},
{
"epoch": 2.437544107268878,
"grad_norm": 0.13241803442644284,
"learning_rate": 5e-05,
"loss": 1.5589,
"step": 2416
},
{
"epoch": 2.4385522734146585,
"grad_norm": 0.13515219086853045,
"learning_rate": 5e-05,
"loss": 1.5817,
"step": 2417
},
{
"epoch": 2.4395604395604398,
"grad_norm": 0.13471099156870195,
"learning_rate": 5e-05,
"loss": 1.5648,
"step": 2418
},
{
"epoch": 2.44056860570622,
"grad_norm": 0.1392251071384398,
"learning_rate": 5e-05,
"loss": 1.5876,
"step": 2419
},
{
"epoch": 2.4415767718520014,
"grad_norm": 0.1410794306172876,
"learning_rate": 5e-05,
"loss": 1.5932,
"step": 2420
},
{
"epoch": 2.442584937997782,
"grad_norm": 0.1362769330250836,
"learning_rate": 5e-05,
"loss": 1.5854,
"step": 2421
},
{
"epoch": 2.443593104143563,
"grad_norm": 0.14631288657866592,
"learning_rate": 5e-05,
"loss": 1.5527,
"step": 2422
},
{
"epoch": 2.444601270289344,
"grad_norm": 0.13269401985059945,
"learning_rate": 5e-05,
"loss": 1.5845,
"step": 2423
},
{
"epoch": 2.4456094364351246,
"grad_norm": 0.1319557868081523,
"learning_rate": 5e-05,
"loss": 1.5667,
"step": 2424
},
{
"epoch": 2.4466176025809054,
"grad_norm": 0.12597772042087557,
"learning_rate": 5e-05,
"loss": 1.5695,
"step": 2425
},
{
"epoch": 2.4476257687266862,
"grad_norm": 0.13815243282087905,
"learning_rate": 5e-05,
"loss": 1.5816,
"step": 2426
},
{
"epoch": 2.448633934872467,
"grad_norm": 0.13713330689169553,
"learning_rate": 5e-05,
"loss": 1.572,
"step": 2427
},
{
"epoch": 2.449642101018248,
"grad_norm": 0.13859041649550563,
"learning_rate": 5e-05,
"loss": 1.5953,
"step": 2428
},
{
"epoch": 2.4506502671640287,
"grad_norm": 0.13508535172669156,
"learning_rate": 5e-05,
"loss": 1.5721,
"step": 2429
},
{
"epoch": 2.4516584333098095,
"grad_norm": 0.1463530271146982,
"learning_rate": 5e-05,
"loss": 1.5834,
"step": 2430
},
{
"epoch": 2.4526665994555903,
"grad_norm": 0.13224226734969893,
"learning_rate": 5e-05,
"loss": 1.5764,
"step": 2431
},
{
"epoch": 2.453674765601371,
"grad_norm": 0.1607786000995468,
"learning_rate": 5e-05,
"loss": 1.5888,
"step": 2432
},
{
"epoch": 2.454682931747152,
"grad_norm": 0.1408107276695577,
"learning_rate": 5e-05,
"loss": 1.5759,
"step": 2433
},
{
"epoch": 2.4556910978929327,
"grad_norm": 0.17113821583538205,
"learning_rate": 5e-05,
"loss": 1.5892,
"step": 2434
},
{
"epoch": 2.4566992640387135,
"grad_norm": 0.1336457346672259,
"learning_rate": 5e-05,
"loss": 1.5703,
"step": 2435
},
{
"epoch": 2.4577074301844943,
"grad_norm": 0.14812625323333414,
"learning_rate": 5e-05,
"loss": 1.552,
"step": 2436
},
{
"epoch": 2.458715596330275,
"grad_norm": 0.13290094043888356,
"learning_rate": 5e-05,
"loss": 1.5769,
"step": 2437
},
{
"epoch": 2.459723762476056,
"grad_norm": 0.13961256613566742,
"learning_rate": 5e-05,
"loss": 1.5871,
"step": 2438
},
{
"epoch": 2.4607319286218368,
"grad_norm": 0.13177722231521807,
"learning_rate": 5e-05,
"loss": 1.5921,
"step": 2439
},
{
"epoch": 2.4617400947676176,
"grad_norm": 0.14231658931889407,
"learning_rate": 5e-05,
"loss": 1.567,
"step": 2440
},
{
"epoch": 2.4627482609133984,
"grad_norm": 0.13476284327765373,
"learning_rate": 5e-05,
"loss": 1.5865,
"step": 2441
},
{
"epoch": 2.463756427059179,
"grad_norm": 0.13970460473325771,
"learning_rate": 5e-05,
"loss": 1.5655,
"step": 2442
},
{
"epoch": 2.46476459320496,
"grad_norm": 0.13205775238552916,
"learning_rate": 5e-05,
"loss": 1.579,
"step": 2443
},
{
"epoch": 2.465772759350741,
"grad_norm": 0.13888961499112318,
"learning_rate": 5e-05,
"loss": 1.5736,
"step": 2444
},
{
"epoch": 2.466780925496522,
"grad_norm": 0.12906724404483103,
"learning_rate": 5e-05,
"loss": 1.5602,
"step": 2445
},
{
"epoch": 2.4677890916423024,
"grad_norm": 0.13215069256460463,
"learning_rate": 5e-05,
"loss": 1.5707,
"step": 2446
},
{
"epoch": 2.4687972577880837,
"grad_norm": 0.13378752606561972,
"learning_rate": 5e-05,
"loss": 1.5813,
"step": 2447
},
{
"epoch": 2.4698054239338645,
"grad_norm": 0.12885498076034543,
"learning_rate": 5e-05,
"loss": 1.5834,
"step": 2448
},
{
"epoch": 2.4708135900796453,
"grad_norm": 0.13188478896564354,
"learning_rate": 5e-05,
"loss": 1.5708,
"step": 2449
},
{
"epoch": 2.471821756225426,
"grad_norm": 0.14245357066832004,
"learning_rate": 5e-05,
"loss": 1.5777,
"step": 2450
},
{
"epoch": 2.472829922371207,
"grad_norm": 0.12803504022023557,
"learning_rate": 5e-05,
"loss": 1.5649,
"step": 2451
},
{
"epoch": 2.4738380885169877,
"grad_norm": 0.13782634262350107,
"learning_rate": 5e-05,
"loss": 1.5625,
"step": 2452
},
{
"epoch": 2.4748462546627685,
"grad_norm": 0.13296958894649374,
"learning_rate": 5e-05,
"loss": 1.5771,
"step": 2453
},
{
"epoch": 2.4758544208085493,
"grad_norm": 0.12638621012304266,
"learning_rate": 5e-05,
"loss": 1.5841,
"step": 2454
},
{
"epoch": 2.47686258695433,
"grad_norm": 0.1332807458477833,
"learning_rate": 5e-05,
"loss": 1.5927,
"step": 2455
},
{
"epoch": 2.477870753100111,
"grad_norm": 0.1289022244439646,
"learning_rate": 5e-05,
"loss": 1.5829,
"step": 2456
},
{
"epoch": 2.4788789192458918,
"grad_norm": 0.14266754208759228,
"learning_rate": 5e-05,
"loss": 1.5741,
"step": 2457
},
{
"epoch": 2.4798870853916726,
"grad_norm": 0.13739273922110581,
"learning_rate": 5e-05,
"loss": 1.5577,
"step": 2458
},
{
"epoch": 2.4808952515374534,
"grad_norm": 0.14045495223205887,
"learning_rate": 5e-05,
"loss": 1.5534,
"step": 2459
},
{
"epoch": 2.481903417683234,
"grad_norm": 0.14154710496839273,
"learning_rate": 5e-05,
"loss": 1.5755,
"step": 2460
},
{
"epoch": 2.482911583829015,
"grad_norm": 0.12835695587273255,
"learning_rate": 5e-05,
"loss": 1.5675,
"step": 2461
},
{
"epoch": 2.483919749974796,
"grad_norm": 0.12933419582659292,
"learning_rate": 5e-05,
"loss": 1.5685,
"step": 2462
},
{
"epoch": 2.4849279161205766,
"grad_norm": 0.130298325020088,
"learning_rate": 5e-05,
"loss": 1.5529,
"step": 2463
},
{
"epoch": 2.4859360822663574,
"grad_norm": 0.12967199709240884,
"learning_rate": 5e-05,
"loss": 1.5681,
"step": 2464
},
{
"epoch": 2.4869442484121382,
"grad_norm": 0.13052105938370967,
"learning_rate": 5e-05,
"loss": 1.5588,
"step": 2465
},
{
"epoch": 2.487952414557919,
"grad_norm": 0.13926608795875886,
"learning_rate": 5e-05,
"loss": 1.57,
"step": 2466
},
{
"epoch": 2.4889605807037,
"grad_norm": 0.13527261030795035,
"learning_rate": 5e-05,
"loss": 1.562,
"step": 2467
},
{
"epoch": 2.4899687468494807,
"grad_norm": 0.13615561502455836,
"learning_rate": 5e-05,
"loss": 1.5685,
"step": 2468
},
{
"epoch": 2.4909769129952615,
"grad_norm": 0.13182708641555427,
"learning_rate": 5e-05,
"loss": 1.5588,
"step": 2469
},
{
"epoch": 2.4919850791410423,
"grad_norm": 0.14280294347540126,
"learning_rate": 5e-05,
"loss": 1.5682,
"step": 2470
},
{
"epoch": 2.492993245286823,
"grad_norm": 0.1398245999275893,
"learning_rate": 5e-05,
"loss": 1.5499,
"step": 2471
},
{
"epoch": 2.4940014114326043,
"grad_norm": 0.14408179071191235,
"learning_rate": 5e-05,
"loss": 1.5684,
"step": 2472
},
{
"epoch": 2.4950095775783847,
"grad_norm": 0.1482559154904685,
"learning_rate": 5e-05,
"loss": 1.5685,
"step": 2473
},
{
"epoch": 2.496017743724166,
"grad_norm": 0.12983878701279014,
"learning_rate": 5e-05,
"loss": 1.5616,
"step": 2474
},
{
"epoch": 2.4970259098699463,
"grad_norm": 0.1350990230454604,
"learning_rate": 5e-05,
"loss": 1.5686,
"step": 2475
},
{
"epoch": 2.4980340760157276,
"grad_norm": 0.1517745861472744,
"learning_rate": 5e-05,
"loss": 1.5649,
"step": 2476
},
{
"epoch": 2.4990422421615084,
"grad_norm": 0.1296415782702942,
"learning_rate": 5e-05,
"loss": 1.5809,
"step": 2477
},
{
"epoch": 2.500050408307289,
"grad_norm": 0.14410943255216555,
"learning_rate": 5e-05,
"loss": 1.5739,
"step": 2478
},
{
"epoch": 2.50105857445307,
"grad_norm": 0.1330701724321787,
"learning_rate": 5e-05,
"loss": 1.5388,
"step": 2479
},
{
"epoch": 2.502066740598851,
"grad_norm": 0.13484529044587992,
"learning_rate": 5e-05,
"loss": 1.5706,
"step": 2480
},
{
"epoch": 2.5030749067446316,
"grad_norm": 0.14585530598199864,
"learning_rate": 5e-05,
"loss": 1.5628,
"step": 2481
},
{
"epoch": 2.5040830728904124,
"grad_norm": 0.14335110959236813,
"learning_rate": 5e-05,
"loss": 1.5791,
"step": 2482
},
{
"epoch": 2.5050912390361932,
"grad_norm": 0.14823280767430108,
"learning_rate": 5e-05,
"loss": 1.566,
"step": 2483
},
{
"epoch": 2.506099405181974,
"grad_norm": 0.14556611452022872,
"learning_rate": 5e-05,
"loss": 1.5653,
"step": 2484
},
{
"epoch": 2.507107571327755,
"grad_norm": 0.13454439849791208,
"learning_rate": 5e-05,
"loss": 1.56,
"step": 2485
},
{
"epoch": 2.5081157374735357,
"grad_norm": 0.13839984587712512,
"learning_rate": 5e-05,
"loss": 1.5565,
"step": 2486
},
{
"epoch": 2.5091239036193165,
"grad_norm": 0.13307537522190815,
"learning_rate": 5e-05,
"loss": 1.5671,
"step": 2487
},
{
"epoch": 2.5101320697650973,
"grad_norm": 0.13959579904141645,
"learning_rate": 5e-05,
"loss": 1.5797,
"step": 2488
},
{
"epoch": 2.511140235910878,
"grad_norm": 0.13005573539881465,
"learning_rate": 5e-05,
"loss": 1.5768,
"step": 2489
},
{
"epoch": 2.512148402056659,
"grad_norm": 0.13276083136899364,
"learning_rate": 5e-05,
"loss": 1.5707,
"step": 2490
},
{
"epoch": 2.5131565682024397,
"grad_norm": 0.12732790377512698,
"learning_rate": 5e-05,
"loss": 1.5637,
"step": 2491
},
{
"epoch": 2.5141647343482205,
"grad_norm": 0.13703947337554104,
"learning_rate": 5e-05,
"loss": 1.5608,
"step": 2492
},
{
"epoch": 2.5151729004940013,
"grad_norm": 0.1369690453430113,
"learning_rate": 5e-05,
"loss": 1.5871,
"step": 2493
},
{
"epoch": 2.516181066639782,
"grad_norm": 0.13733318758079022,
"learning_rate": 5e-05,
"loss": 1.5803,
"step": 2494
},
{
"epoch": 2.517189232785563,
"grad_norm": 0.14136033097026432,
"learning_rate": 5e-05,
"loss": 1.5658,
"step": 2495
},
{
"epoch": 2.5181973989313438,
"grad_norm": 0.14034198910723664,
"learning_rate": 5e-05,
"loss": 1.561,
"step": 2496
},
{
"epoch": 2.5192055650771246,
"grad_norm": 0.14047149813923593,
"learning_rate": 5e-05,
"loss": 1.5775,
"step": 2497
},
{
"epoch": 2.5202137312229054,
"grad_norm": 0.1361565028785915,
"learning_rate": 5e-05,
"loss": 1.5864,
"step": 2498
},
{
"epoch": 2.5212218973686866,
"grad_norm": 0.1346932459707488,
"learning_rate": 5e-05,
"loss": 1.5628,
"step": 2499
},
{
"epoch": 2.522230063514467,
"grad_norm": 0.13712538669966598,
"learning_rate": 5e-05,
"loss": 1.5732,
"step": 2500
},
{
"epoch": 2.5232382296602482,
"grad_norm": 0.13199711949758308,
"learning_rate": 5e-05,
"loss": 1.5574,
"step": 2501
},
{
"epoch": 2.5242463958060286,
"grad_norm": 0.1306387580998687,
"learning_rate": 5e-05,
"loss": 1.5614,
"step": 2502
},
{
"epoch": 2.52525456195181,
"grad_norm": 0.1302641243617534,
"learning_rate": 5e-05,
"loss": 1.5607,
"step": 2503
},
{
"epoch": 2.5262627280975902,
"grad_norm": 0.13309453441297842,
"learning_rate": 5e-05,
"loss": 1.5731,
"step": 2504
},
{
"epoch": 2.5272708942433715,
"grad_norm": 0.1402063808794923,
"learning_rate": 5e-05,
"loss": 1.5924,
"step": 2505
},
{
"epoch": 2.5282790603891523,
"grad_norm": 0.13066719388767034,
"learning_rate": 5e-05,
"loss": 1.5675,
"step": 2506
},
{
"epoch": 2.529287226534933,
"grad_norm": 0.1314587822264092,
"learning_rate": 5e-05,
"loss": 1.5551,
"step": 2507
},
{
"epoch": 2.530295392680714,
"grad_norm": 0.13270831067662986,
"learning_rate": 5e-05,
"loss": 1.5629,
"step": 2508
},
{
"epoch": 2.5313035588264947,
"grad_norm": 0.13610270267142832,
"learning_rate": 5e-05,
"loss": 1.5789,
"step": 2509
},
{
"epoch": 2.5323117249722755,
"grad_norm": 0.1420301959642942,
"learning_rate": 5e-05,
"loss": 1.5786,
"step": 2510
},
{
"epoch": 2.5333198911180563,
"grad_norm": 0.1320847830419652,
"learning_rate": 5e-05,
"loss": 1.5542,
"step": 2511
},
{
"epoch": 2.534328057263837,
"grad_norm": 0.13115439787293542,
"learning_rate": 5e-05,
"loss": 1.5868,
"step": 2512
},
{
"epoch": 2.535336223409618,
"grad_norm": 0.13671059366359536,
"learning_rate": 5e-05,
"loss": 1.5749,
"step": 2513
},
{
"epoch": 2.5363443895553988,
"grad_norm": 0.15170920625959478,
"learning_rate": 5e-05,
"loss": 1.5785,
"step": 2514
},
{
"epoch": 2.5373525557011796,
"grad_norm": 0.1385479032372868,
"learning_rate": 5e-05,
"loss": 1.5806,
"step": 2515
},
{
"epoch": 2.5383607218469604,
"grad_norm": 0.15501047413228763,
"learning_rate": 5e-05,
"loss": 1.5737,
"step": 2516
},
{
"epoch": 2.539368887992741,
"grad_norm": 0.14382396943316664,
"learning_rate": 5e-05,
"loss": 1.5747,
"step": 2517
},
{
"epoch": 2.540377054138522,
"grad_norm": 0.13810111650121543,
"learning_rate": 5e-05,
"loss": 1.5718,
"step": 2518
},
{
"epoch": 2.541385220284303,
"grad_norm": 0.14698439268982416,
"learning_rate": 5e-05,
"loss": 1.5738,
"step": 2519
},
{
"epoch": 2.5423933864300836,
"grad_norm": 0.13239747259177995,
"learning_rate": 5e-05,
"loss": 1.5693,
"step": 2520
},
{
"epoch": 2.5434015525758644,
"grad_norm": 0.1392786276751499,
"learning_rate": 5e-05,
"loss": 1.5598,
"step": 2521
},
{
"epoch": 2.5444097187216452,
"grad_norm": 0.13066731381474933,
"learning_rate": 5e-05,
"loss": 1.5432,
"step": 2522
},
{
"epoch": 2.545417884867426,
"grad_norm": 0.14006779722474544,
"learning_rate": 5e-05,
"loss": 1.5616,
"step": 2523
},
{
"epoch": 2.546426051013207,
"grad_norm": 0.14015375699552457,
"learning_rate": 5e-05,
"loss": 1.5757,
"step": 2524
},
{
"epoch": 2.5474342171589877,
"grad_norm": 0.1287079586458338,
"learning_rate": 5e-05,
"loss": 1.5612,
"step": 2525
},
{
"epoch": 2.5484423833047685,
"grad_norm": 0.1380581924188533,
"learning_rate": 5e-05,
"loss": 1.5612,
"step": 2526
},
{
"epoch": 2.5494505494505493,
"grad_norm": 0.1461047231472578,
"learning_rate": 5e-05,
"loss": 1.5813,
"step": 2527
},
{
"epoch": 2.5504587155963305,
"grad_norm": 0.13094559552009488,
"learning_rate": 5e-05,
"loss": 1.5576,
"step": 2528
},
{
"epoch": 2.551466881742111,
"grad_norm": 0.14877667456952287,
"learning_rate": 5e-05,
"loss": 1.5803,
"step": 2529
},
{
"epoch": 2.552475047887892,
"grad_norm": 0.1410988956684791,
"learning_rate": 5e-05,
"loss": 1.5676,
"step": 2530
},
{
"epoch": 2.5534832140336725,
"grad_norm": 0.1346136915835775,
"learning_rate": 5e-05,
"loss": 1.5645,
"step": 2531
},
{
"epoch": 2.5544913801794538,
"grad_norm": 0.13677299683014227,
"learning_rate": 5e-05,
"loss": 1.5723,
"step": 2532
},
{
"epoch": 2.555499546325234,
"grad_norm": 0.1381562967508932,
"learning_rate": 5e-05,
"loss": 1.572,
"step": 2533
},
{
"epoch": 2.5565077124710154,
"grad_norm": 0.12280272126633365,
"learning_rate": 5e-05,
"loss": 1.5681,
"step": 2534
},
{
"epoch": 2.557515878616796,
"grad_norm": 0.12742988769491065,
"learning_rate": 5e-05,
"loss": 1.584,
"step": 2535
},
{
"epoch": 2.558524044762577,
"grad_norm": 0.13510718950092368,
"learning_rate": 5e-05,
"loss": 1.5764,
"step": 2536
},
{
"epoch": 2.559532210908358,
"grad_norm": 0.12533625305806456,
"learning_rate": 5e-05,
"loss": 1.5665,
"step": 2537
},
{
"epoch": 2.5605403770541386,
"grad_norm": 0.1285836841448671,
"learning_rate": 5e-05,
"loss": 1.5705,
"step": 2538
},
{
"epoch": 2.5615485431999194,
"grad_norm": 0.1348571888933907,
"learning_rate": 5e-05,
"loss": 1.5556,
"step": 2539
},
{
"epoch": 2.5625567093457002,
"grad_norm": 0.1325268099316684,
"learning_rate": 5e-05,
"loss": 1.5766,
"step": 2540
},
{
"epoch": 2.563564875491481,
"grad_norm": 0.13932827541236092,
"learning_rate": 5e-05,
"loss": 1.581,
"step": 2541
},
{
"epoch": 2.564573041637262,
"grad_norm": 0.24623788216863024,
"learning_rate": 5e-05,
"loss": 1.5761,
"step": 2542
},
{
"epoch": 2.5655812077830427,
"grad_norm": 0.14270394122091418,
"learning_rate": 5e-05,
"loss": 1.5712,
"step": 2543
},
{
"epoch": 2.5665893739288235,
"grad_norm": 0.26287780505488345,
"learning_rate": 5e-05,
"loss": 1.5688,
"step": 2544
},
{
"epoch": 2.5675975400746043,
"grad_norm": 0.13643131361751182,
"learning_rate": 5e-05,
"loss": 1.5749,
"step": 2545
},
{
"epoch": 2.568605706220385,
"grad_norm": 0.14293860347761278,
"learning_rate": 5e-05,
"loss": 1.5597,
"step": 2546
},
{
"epoch": 2.569613872366166,
"grad_norm": 0.1439622809758673,
"learning_rate": 5e-05,
"loss": 1.57,
"step": 2547
},
{
"epoch": 2.5706220385119467,
"grad_norm": 0.1360551242225786,
"learning_rate": 5e-05,
"loss": 1.5813,
"step": 2548
},
{
"epoch": 2.5716302046577275,
"grad_norm": 0.1433763087264025,
"learning_rate": 5e-05,
"loss": 1.5737,
"step": 2549
},
{
"epoch": 2.5726383708035083,
"grad_norm": 0.14349978498686156,
"learning_rate": 5e-05,
"loss": 1.5873,
"step": 2550
},
{
"epoch": 2.573646536949289,
"grad_norm": 0.13563939190053786,
"learning_rate": 5e-05,
"loss": 1.565,
"step": 2551
},
{
"epoch": 2.57465470309507,
"grad_norm": 0.1400727636727626,
"learning_rate": 5e-05,
"loss": 1.5659,
"step": 2552
},
{
"epoch": 2.5756628692408507,
"grad_norm": 0.46677268806303335,
"learning_rate": 5e-05,
"loss": 1.5834,
"step": 2553
},
{
"epoch": 2.5766710353866316,
"grad_norm": 0.14369253983364566,
"learning_rate": 5e-05,
"loss": 1.5558,
"step": 2554
},
{
"epoch": 2.577679201532413,
"grad_norm": 0.14498641663098719,
"learning_rate": 5e-05,
"loss": 1.5652,
"step": 2555
},
{
"epoch": 2.578687367678193,
"grad_norm": 0.13760310777721238,
"learning_rate": 5e-05,
"loss": 1.581,
"step": 2556
},
{
"epoch": 2.5796955338239744,
"grad_norm": 0.14467222241686664,
"learning_rate": 5e-05,
"loss": 1.5752,
"step": 2557
},
{
"epoch": 2.580703699969755,
"grad_norm": 0.1361819067350698,
"learning_rate": 5e-05,
"loss": 1.5776,
"step": 2558
},
{
"epoch": 2.581711866115536,
"grad_norm": 0.13332450522409992,
"learning_rate": 5e-05,
"loss": 1.5606,
"step": 2559
},
{
"epoch": 2.5827200322613164,
"grad_norm": 0.14004076082903943,
"learning_rate": 5e-05,
"loss": 1.5679,
"step": 2560
},
{
"epoch": 2.5837281984070977,
"grad_norm": 0.17716816167241073,
"learning_rate": 5e-05,
"loss": 1.5808,
"step": 2561
},
{
"epoch": 2.5847363645528785,
"grad_norm": 0.135531215421772,
"learning_rate": 5e-05,
"loss": 1.5717,
"step": 2562
},
{
"epoch": 2.5857445306986593,
"grad_norm": 0.14131718191338816,
"learning_rate": 5e-05,
"loss": 1.5629,
"step": 2563
},
{
"epoch": 2.58675269684444,
"grad_norm": 0.15566287819098393,
"learning_rate": 5e-05,
"loss": 1.5772,
"step": 2564
},
{
"epoch": 2.587760862990221,
"grad_norm": 0.1486242781666469,
"learning_rate": 5e-05,
"loss": 1.5629,
"step": 2565
},
{
"epoch": 2.5887690291360017,
"grad_norm": 0.13751406947742945,
"learning_rate": 5e-05,
"loss": 1.5636,
"step": 2566
},
{
"epoch": 2.5897771952817825,
"grad_norm": 0.15458590089287258,
"learning_rate": 5e-05,
"loss": 1.5714,
"step": 2567
},
{
"epoch": 2.5907853614275633,
"grad_norm": 0.13822881305270032,
"learning_rate": 5e-05,
"loss": 1.5801,
"step": 2568
},
{
"epoch": 2.591793527573344,
"grad_norm": 0.14331964687266405,
"learning_rate": 5e-05,
"loss": 1.5846,
"step": 2569
},
{
"epoch": 2.592801693719125,
"grad_norm": 0.15434411395135197,
"learning_rate": 5e-05,
"loss": 1.57,
"step": 2570
},
{
"epoch": 2.5938098598649058,
"grad_norm": 0.14346789950395492,
"learning_rate": 5e-05,
"loss": 1.5553,
"step": 2571
},
{
"epoch": 2.5948180260106866,
"grad_norm": 0.3649511832228748,
"learning_rate": 5e-05,
"loss": 1.571,
"step": 2572
},
{
"epoch": 2.5958261921564674,
"grad_norm": 0.23624873083863812,
"learning_rate": 5e-05,
"loss": 1.5614,
"step": 2573
},
{
"epoch": 2.596834358302248,
"grad_norm": 0.13868172024332132,
"learning_rate": 5e-05,
"loss": 1.5732,
"step": 2574
},
{
"epoch": 2.597842524448029,
"grad_norm": 0.19472869370683973,
"learning_rate": 5e-05,
"loss": 1.5739,
"step": 2575
},
{
"epoch": 2.59885069059381,
"grad_norm": 0.15021863097825597,
"learning_rate": 5e-05,
"loss": 1.5746,
"step": 2576
},
{
"epoch": 2.5998588567395906,
"grad_norm": 0.1723890646573745,
"learning_rate": 5e-05,
"loss": 1.5868,
"step": 2577
},
{
"epoch": 2.6008670228853714,
"grad_norm": 0.14318449183244025,
"learning_rate": 5e-05,
"loss": 1.5693,
"step": 2578
},
{
"epoch": 2.6018751890311522,
"grad_norm": 0.14758726018313456,
"learning_rate": 5e-05,
"loss": 1.5837,
"step": 2579
},
{
"epoch": 2.602883355176933,
"grad_norm": 0.18145968707420648,
"learning_rate": 5e-05,
"loss": 1.5929,
"step": 2580
},
{
"epoch": 2.603891521322714,
"grad_norm": 0.15104416134896115,
"learning_rate": 5e-05,
"loss": 1.5777,
"step": 2581
},
{
"epoch": 2.6048996874684947,
"grad_norm": 0.1578694401415007,
"learning_rate": 5e-05,
"loss": 1.5815,
"step": 2582
},
{
"epoch": 2.6059078536142755,
"grad_norm": 0.18166720384742438,
"learning_rate": 5e-05,
"loss": 1.5623,
"step": 2583
},
{
"epoch": 2.6069160197600567,
"grad_norm": 0.15697367202164383,
"learning_rate": 5e-05,
"loss": 1.5641,
"step": 2584
},
{
"epoch": 2.607924185905837,
"grad_norm": 0.14997661568258938,
"learning_rate": 5e-05,
"loss": 1.5638,
"step": 2585
},
{
"epoch": 2.6089323520516183,
"grad_norm": 0.1669423778875084,
"learning_rate": 5e-05,
"loss": 1.5763,
"step": 2586
},
{
"epoch": 2.6099405181973987,
"grad_norm": 0.18036491079980102,
"learning_rate": 5e-05,
"loss": 1.5621,
"step": 2587
},
{
"epoch": 2.61094868434318,
"grad_norm": 0.13695247542006733,
"learning_rate": 5e-05,
"loss": 1.5736,
"step": 2588
},
{
"epoch": 2.6119568504889603,
"grad_norm": 0.17108760911033974,
"learning_rate": 5e-05,
"loss": 1.5776,
"step": 2589
},
{
"epoch": 2.6129650166347416,
"grad_norm": 0.15743505516329523,
"learning_rate": 5e-05,
"loss": 1.561,
"step": 2590
},
{
"epoch": 2.6139731827805224,
"grad_norm": 0.15546759277401703,
"learning_rate": 5e-05,
"loss": 1.5809,
"step": 2591
},
{
"epoch": 2.614981348926303,
"grad_norm": 0.1525456111524391,
"learning_rate": 5e-05,
"loss": 1.5649,
"step": 2592
},
{
"epoch": 2.615989515072084,
"grad_norm": 0.1590829570184181,
"learning_rate": 5e-05,
"loss": 1.5632,
"step": 2593
},
{
"epoch": 2.616997681217865,
"grad_norm": 0.15097999238651005,
"learning_rate": 5e-05,
"loss": 1.5692,
"step": 2594
},
{
"epoch": 2.6180058473636456,
"grad_norm": 0.15281298093347742,
"learning_rate": 5e-05,
"loss": 1.564,
"step": 2595
},
{
"epoch": 2.6190140135094264,
"grad_norm": 0.15336242452735435,
"learning_rate": 5e-05,
"loss": 1.5464,
"step": 2596
},
{
"epoch": 2.6200221796552072,
"grad_norm": 0.15028356582825605,
"learning_rate": 5e-05,
"loss": 1.5682,
"step": 2597
},
{
"epoch": 2.621030345800988,
"grad_norm": 0.14572672233718242,
"learning_rate": 5e-05,
"loss": 1.5601,
"step": 2598
},
{
"epoch": 2.622038511946769,
"grad_norm": 0.14094923929247996,
"learning_rate": 5e-05,
"loss": 1.5762,
"step": 2599
},
{
"epoch": 2.6230466780925497,
"grad_norm": 0.14403490955925488,
"learning_rate": 5e-05,
"loss": 1.5861,
"step": 2600
},
{
"epoch": 2.6240548442383305,
"grad_norm": 0.15008485743918354,
"learning_rate": 5e-05,
"loss": 1.576,
"step": 2601
},
{
"epoch": 2.6250630103841113,
"grad_norm": 0.14787246043739002,
"learning_rate": 5e-05,
"loss": 1.5764,
"step": 2602
},
{
"epoch": 2.626071176529892,
"grad_norm": 0.15241722364084348,
"learning_rate": 5e-05,
"loss": 1.5547,
"step": 2603
},
{
"epoch": 2.627079342675673,
"grad_norm": 0.1502994733216112,
"learning_rate": 5e-05,
"loss": 1.5685,
"step": 2604
},
{
"epoch": 2.6280875088214537,
"grad_norm": 0.16074365492554207,
"learning_rate": 5e-05,
"loss": 1.5758,
"step": 2605
},
{
"epoch": 2.6290956749672345,
"grad_norm": 0.14430981056083628,
"learning_rate": 5e-05,
"loss": 1.5736,
"step": 2606
},
{
"epoch": 2.6301038411130153,
"grad_norm": 0.16123995936065388,
"learning_rate": 5e-05,
"loss": 1.5663,
"step": 2607
},
{
"epoch": 2.631112007258796,
"grad_norm": 0.14231002853104405,
"learning_rate": 5e-05,
"loss": 1.5559,
"step": 2608
},
{
"epoch": 2.632120173404577,
"grad_norm": 0.14228451160268388,
"learning_rate": 5e-05,
"loss": 1.5378,
"step": 2609
},
{
"epoch": 2.6331283395503577,
"grad_norm": 0.14162038187228881,
"learning_rate": 5e-05,
"loss": 1.5657,
"step": 2610
},
{
"epoch": 2.634136505696139,
"grad_norm": 0.16034654380517346,
"learning_rate": 5e-05,
"loss": 1.5607,
"step": 2611
},
{
"epoch": 2.6351446718419194,
"grad_norm": 0.1406556428280142,
"learning_rate": 5e-05,
"loss": 1.5773,
"step": 2612
},
{
"epoch": 2.6361528379877006,
"grad_norm": 0.1574208300546788,
"learning_rate": 5e-05,
"loss": 1.5687,
"step": 2613
},
{
"epoch": 2.637161004133481,
"grad_norm": 0.129992555684169,
"learning_rate": 5e-05,
"loss": 1.5761,
"step": 2614
},
{
"epoch": 2.6381691702792622,
"grad_norm": 0.1479122083673606,
"learning_rate": 5e-05,
"loss": 1.5715,
"step": 2615
},
{
"epoch": 2.6391773364250426,
"grad_norm": 0.17401025097129166,
"learning_rate": 5e-05,
"loss": 1.5764,
"step": 2616
},
{
"epoch": 2.640185502570824,
"grad_norm": 0.16327524485219438,
"learning_rate": 5e-05,
"loss": 1.5734,
"step": 2617
},
{
"epoch": 2.6411936687166047,
"grad_norm": 0.15186897128683868,
"learning_rate": 5e-05,
"loss": 1.5752,
"step": 2618
},
{
"epoch": 2.6422018348623855,
"grad_norm": 0.14122002915712664,
"learning_rate": 5e-05,
"loss": 1.5752,
"step": 2619
},
{
"epoch": 2.6432100010081663,
"grad_norm": 0.14625103956516458,
"learning_rate": 5e-05,
"loss": 1.5548,
"step": 2620
},
{
"epoch": 2.644218167153947,
"grad_norm": 0.15319911711521672,
"learning_rate": 5e-05,
"loss": 1.5782,
"step": 2621
},
{
"epoch": 2.645226333299728,
"grad_norm": 0.14080663787259234,
"learning_rate": 5e-05,
"loss": 1.5309,
"step": 2622
},
{
"epoch": 2.6462344994455087,
"grad_norm": 0.14200037232407361,
"learning_rate": 5e-05,
"loss": 1.5609,
"step": 2623
},
{
"epoch": 2.6472426655912895,
"grad_norm": 0.14548049077056605,
"learning_rate": 5e-05,
"loss": 1.5662,
"step": 2624
},
{
"epoch": 2.6482508317370703,
"grad_norm": 0.14588858561313403,
"learning_rate": 5e-05,
"loss": 1.5715,
"step": 2625
},
{
"epoch": 2.649258997882851,
"grad_norm": 0.13422997913312334,
"learning_rate": 5e-05,
"loss": 1.5576,
"step": 2626
},
{
"epoch": 2.650267164028632,
"grad_norm": 0.14373272226483658,
"learning_rate": 5e-05,
"loss": 1.5654,
"step": 2627
},
{
"epoch": 2.6512753301744127,
"grad_norm": 0.14397101488478398,
"learning_rate": 5e-05,
"loss": 1.5482,
"step": 2628
},
{
"epoch": 2.6522834963201936,
"grad_norm": 0.1399424865762421,
"learning_rate": 5e-05,
"loss": 1.5737,
"step": 2629
},
{
"epoch": 2.6532916624659744,
"grad_norm": 0.14886678980576779,
"learning_rate": 5e-05,
"loss": 1.5883,
"step": 2630
},
{
"epoch": 2.654299828611755,
"grad_norm": 0.14653892263537838,
"learning_rate": 5e-05,
"loss": 1.5725,
"step": 2631
},
{
"epoch": 2.655307994757536,
"grad_norm": 0.13657992074184186,
"learning_rate": 5e-05,
"loss": 1.5558,
"step": 2632
},
{
"epoch": 2.656316160903317,
"grad_norm": 0.16181280045845536,
"learning_rate": 5e-05,
"loss": 1.5665,
"step": 2633
},
{
"epoch": 2.6573243270490976,
"grad_norm": 0.15163898001388915,
"learning_rate": 5e-05,
"loss": 1.5636,
"step": 2634
},
{
"epoch": 2.6583324931948784,
"grad_norm": 0.14391052900139392,
"learning_rate": 5e-05,
"loss": 1.5716,
"step": 2635
},
{
"epoch": 2.659340659340659,
"grad_norm": 0.15441727511277034,
"learning_rate": 5e-05,
"loss": 1.5658,
"step": 2636
},
{
"epoch": 2.66034882548644,
"grad_norm": 0.18434934741540565,
"learning_rate": 5e-05,
"loss": 1.5583,
"step": 2637
},
{
"epoch": 2.661356991632221,
"grad_norm": 0.14675093820621574,
"learning_rate": 5e-05,
"loss": 1.5822,
"step": 2638
},
{
"epoch": 2.6623651577780016,
"grad_norm": 0.17292648886603113,
"learning_rate": 5e-05,
"loss": 1.5895,
"step": 2639
},
{
"epoch": 2.663373323923783,
"grad_norm": 0.13899788061020074,
"learning_rate": 5e-05,
"loss": 1.5786,
"step": 2640
},
{
"epoch": 2.6643814900695633,
"grad_norm": 0.16613231489632996,
"learning_rate": 5e-05,
"loss": 1.5556,
"step": 2641
},
{
"epoch": 2.6653896562153445,
"grad_norm": 0.17077450676770634,
"learning_rate": 5e-05,
"loss": 1.5697,
"step": 2642
},
{
"epoch": 2.666397822361125,
"grad_norm": 0.1403577426269955,
"learning_rate": 5e-05,
"loss": 1.5632,
"step": 2643
},
{
"epoch": 2.667405988506906,
"grad_norm": 0.13213661449923972,
"learning_rate": 5e-05,
"loss": 1.5613,
"step": 2644
},
{
"epoch": 2.6684141546526865,
"grad_norm": 0.14106671427970552,
"learning_rate": 5e-05,
"loss": 1.5725,
"step": 2645
},
{
"epoch": 2.6694223207984678,
"grad_norm": 0.1366834656968961,
"learning_rate": 5e-05,
"loss": 1.5797,
"step": 2646
},
{
"epoch": 2.6704304869442486,
"grad_norm": 0.14767004453703217,
"learning_rate": 5e-05,
"loss": 1.5517,
"step": 2647
},
{
"epoch": 2.6714386530900294,
"grad_norm": 0.13292170771715223,
"learning_rate": 5e-05,
"loss": 1.5689,
"step": 2648
},
{
"epoch": 2.67244681923581,
"grad_norm": 0.14250366847001242,
"learning_rate": 5e-05,
"loss": 1.5439,
"step": 2649
},
{
"epoch": 2.673454985381591,
"grad_norm": 0.13990853172352147,
"learning_rate": 5e-05,
"loss": 1.5605,
"step": 2650
},
{
"epoch": 2.674463151527372,
"grad_norm": 0.1342108354881665,
"learning_rate": 5e-05,
"loss": 1.5474,
"step": 2651
},
{
"epoch": 2.6754713176731526,
"grad_norm": 0.14095351494547906,
"learning_rate": 5e-05,
"loss": 1.562,
"step": 2652
},
{
"epoch": 2.6764794838189334,
"grad_norm": 0.13632871187159845,
"learning_rate": 5e-05,
"loss": 1.5788,
"step": 2653
},
{
"epoch": 2.6774876499647142,
"grad_norm": 0.13741013059016396,
"learning_rate": 5e-05,
"loss": 1.5809,
"step": 2654
},
{
"epoch": 2.678495816110495,
"grad_norm": 0.14104148902284483,
"learning_rate": 5e-05,
"loss": 1.568,
"step": 2655
},
{
"epoch": 2.679503982256276,
"grad_norm": 0.13634681877896998,
"learning_rate": 5e-05,
"loss": 1.5826,
"step": 2656
},
{
"epoch": 2.6805121484020566,
"grad_norm": 0.1400690608553844,
"learning_rate": 5e-05,
"loss": 1.5479,
"step": 2657
},
{
"epoch": 2.6815203145478375,
"grad_norm": 0.1381298076752533,
"learning_rate": 5e-05,
"loss": 1.5753,
"step": 2658
},
{
"epoch": 2.6825284806936183,
"grad_norm": 0.14096150667236787,
"learning_rate": 5e-05,
"loss": 1.5556,
"step": 2659
},
{
"epoch": 2.683536646839399,
"grad_norm": 0.15006887442688122,
"learning_rate": 5e-05,
"loss": 1.5747,
"step": 2660
},
{
"epoch": 2.68454481298518,
"grad_norm": 0.1378134811649124,
"learning_rate": 5e-05,
"loss": 1.5687,
"step": 2661
},
{
"epoch": 2.6855529791309607,
"grad_norm": 0.2270710658723629,
"learning_rate": 5e-05,
"loss": 1.5771,
"step": 2662
},
{
"epoch": 2.6865611452767415,
"grad_norm": 0.14058235952025902,
"learning_rate": 5e-05,
"loss": 1.5646,
"step": 2663
},
{
"epoch": 2.6875693114225223,
"grad_norm": 0.12594815732088815,
"learning_rate": 5e-05,
"loss": 1.5576,
"step": 2664
},
{
"epoch": 2.688577477568303,
"grad_norm": 0.1399517021609818,
"learning_rate": 5e-05,
"loss": 1.5728,
"step": 2665
},
{
"epoch": 2.689585643714084,
"grad_norm": 0.13174605200545916,
"learning_rate": 5e-05,
"loss": 1.566,
"step": 2666
},
{
"epoch": 2.690593809859865,
"grad_norm": 0.13746031870272224,
"learning_rate": 5e-05,
"loss": 1.569,
"step": 2667
},
{
"epoch": 2.6916019760056455,
"grad_norm": 4.560783413286865,
"learning_rate": 5e-05,
"loss": 1.5996,
"step": 2668
},
{
"epoch": 2.692610142151427,
"grad_norm": 0.1599729642198486,
"learning_rate": 5e-05,
"loss": 1.5783,
"step": 2669
},
{
"epoch": 2.693618308297207,
"grad_norm": 0.1328849115559514,
"learning_rate": 5e-05,
"loss": 1.5763,
"step": 2670
},
{
"epoch": 2.6946264744429884,
"grad_norm": 0.1576681364906905,
"learning_rate": 5e-05,
"loss": 1.5723,
"step": 2671
},
{
"epoch": 2.695634640588769,
"grad_norm": 0.1366040880089965,
"learning_rate": 5e-05,
"loss": 1.5641,
"step": 2672
},
{
"epoch": 2.69664280673455,
"grad_norm": 0.1580270504056786,
"learning_rate": 5e-05,
"loss": 1.5447,
"step": 2673
},
{
"epoch": 2.697650972880331,
"grad_norm": 0.13771171876035967,
"learning_rate": 5e-05,
"loss": 1.5708,
"step": 2674
},
{
"epoch": 2.6986591390261117,
"grad_norm": 0.1517424456471761,
"learning_rate": 5e-05,
"loss": 1.5494,
"step": 2675
},
{
"epoch": 2.6996673051718925,
"grad_norm": 0.14995179976865108,
"learning_rate": 5e-05,
"loss": 1.5595,
"step": 2676
},
{
"epoch": 2.7006754713176733,
"grad_norm": 2.1810950431818146,
"learning_rate": 5e-05,
"loss": 1.5817,
"step": 2677
},
{
"epoch": 2.701683637463454,
"grad_norm": 0.1798172607390676,
"learning_rate": 5e-05,
"loss": 1.5635,
"step": 2678
},
{
"epoch": 2.702691803609235,
"grad_norm": 0.16109832316524236,
"learning_rate": 5e-05,
"loss": 1.5825,
"step": 2679
},
{
"epoch": 2.7036999697550157,
"grad_norm": 0.14862178006767085,
"learning_rate": 5e-05,
"loss": 1.5512,
"step": 2680
},
{
"epoch": 2.7047081359007965,
"grad_norm": 0.14919683751945761,
"learning_rate": 5e-05,
"loss": 1.5653,
"step": 2681
},
{
"epoch": 2.7057163020465773,
"grad_norm": 0.20775257275377015,
"learning_rate": 5e-05,
"loss": 1.5673,
"step": 2682
},
{
"epoch": 2.706724468192358,
"grad_norm": 0.15438174577601985,
"learning_rate": 5e-05,
"loss": 1.5799,
"step": 2683
},
{
"epoch": 2.707732634338139,
"grad_norm": 0.15569951049593259,
"learning_rate": 5e-05,
"loss": 1.5618,
"step": 2684
},
{
"epoch": 2.7087408004839197,
"grad_norm": 0.15064037867202598,
"learning_rate": 5e-05,
"loss": 1.5648,
"step": 2685
},
{
"epoch": 2.7097489666297006,
"grad_norm": 0.15781744910859977,
"learning_rate": 5e-05,
"loss": 1.5626,
"step": 2686
},
{
"epoch": 2.7107571327754814,
"grad_norm": 0.15466431610953948,
"learning_rate": 5e-05,
"loss": 1.5609,
"step": 2687
},
{
"epoch": 2.711765298921262,
"grad_norm": 0.15814654807557524,
"learning_rate": 5e-05,
"loss": 1.5655,
"step": 2688
},
{
"epoch": 2.712773465067043,
"grad_norm": 0.13949634738027647,
"learning_rate": 5e-05,
"loss": 1.5831,
"step": 2689
},
{
"epoch": 2.713781631212824,
"grad_norm": 0.1406618538641033,
"learning_rate": 5e-05,
"loss": 1.5663,
"step": 2690
},
{
"epoch": 2.7147897973586046,
"grad_norm": 0.13191748568761202,
"learning_rate": 5e-05,
"loss": 1.5647,
"step": 2691
},
{
"epoch": 2.7157979635043854,
"grad_norm": 0.5074854253225494,
"learning_rate": 5e-05,
"loss": 1.5717,
"step": 2692
},
{
"epoch": 2.716806129650166,
"grad_norm": 0.13776438838613422,
"learning_rate": 5e-05,
"loss": 1.5749,
"step": 2693
},
{
"epoch": 2.717814295795947,
"grad_norm": 0.16485203741006418,
"learning_rate": 5e-05,
"loss": 1.5693,
"step": 2694
},
{
"epoch": 2.718822461941728,
"grad_norm": 0.1398154057360336,
"learning_rate": 5e-05,
"loss": 1.5709,
"step": 2695
},
{
"epoch": 2.719830628087509,
"grad_norm": 0.1322842540538691,
"learning_rate": 5e-05,
"loss": 1.5669,
"step": 2696
},
{
"epoch": 2.7208387942332894,
"grad_norm": 0.14578755614140504,
"learning_rate": 5e-05,
"loss": 1.5893,
"step": 2697
},
{
"epoch": 2.7218469603790707,
"grad_norm": 0.13249217510021813,
"learning_rate": 5e-05,
"loss": 1.5695,
"step": 2698
},
{
"epoch": 2.722855126524851,
"grad_norm": 0.13230498419415554,
"learning_rate": 5e-05,
"loss": 1.5581,
"step": 2699
},
{
"epoch": 2.7238632926706323,
"grad_norm": 0.140920778481646,
"learning_rate": 5e-05,
"loss": 1.5528,
"step": 2700
},
{
"epoch": 2.7248714588164127,
"grad_norm": 0.1433753585777024,
"learning_rate": 5e-05,
"loss": 1.5704,
"step": 2701
},
{
"epoch": 2.725879624962194,
"grad_norm": 0.1336399638319816,
"learning_rate": 5e-05,
"loss": 1.593,
"step": 2702
},
{
"epoch": 2.7268877911079747,
"grad_norm": 0.14560257221000472,
"learning_rate": 5e-05,
"loss": 1.5345,
"step": 2703
},
{
"epoch": 2.7278959572537556,
"grad_norm": 0.1836365416353928,
"learning_rate": 5e-05,
"loss": 1.5812,
"step": 2704
},
{
"epoch": 2.7289041233995364,
"grad_norm": 0.14009232696115306,
"learning_rate": 5e-05,
"loss": 1.5655,
"step": 2705
},
{
"epoch": 2.729912289545317,
"grad_norm": 0.15768817911278674,
"learning_rate": 5e-05,
"loss": 1.5618,
"step": 2706
},
{
"epoch": 2.730920455691098,
"grad_norm": 0.1357757946680182,
"learning_rate": 5e-05,
"loss": 1.5679,
"step": 2707
},
{
"epoch": 2.731928621836879,
"grad_norm": 0.1361176788290324,
"learning_rate": 5e-05,
"loss": 1.557,
"step": 2708
},
{
"epoch": 2.7329367879826596,
"grad_norm": 0.1552287534850647,
"learning_rate": 5e-05,
"loss": 1.567,
"step": 2709
},
{
"epoch": 2.7339449541284404,
"grad_norm": 0.8550126356500881,
"learning_rate": 5e-05,
"loss": 1.573,
"step": 2710
},
{
"epoch": 2.734953120274221,
"grad_norm": 0.14590212131755376,
"learning_rate": 5e-05,
"loss": 1.5705,
"step": 2711
},
{
"epoch": 2.735961286420002,
"grad_norm": 0.14077732180407562,
"learning_rate": 5e-05,
"loss": 1.5711,
"step": 2712
},
{
"epoch": 2.736969452565783,
"grad_norm": 0.14271742070831292,
"learning_rate": 5e-05,
"loss": 1.5651,
"step": 2713
},
{
"epoch": 2.7379776187115636,
"grad_norm": 0.13449509255344788,
"learning_rate": 5e-05,
"loss": 1.5657,
"step": 2714
},
{
"epoch": 2.7389857848573445,
"grad_norm": 0.1428486321025379,
"learning_rate": 5e-05,
"loss": 1.5692,
"step": 2715
},
{
"epoch": 2.7399939510031253,
"grad_norm": 0.14030467155716347,
"learning_rate": 5e-05,
"loss": 1.5568,
"step": 2716
},
{
"epoch": 2.741002117148906,
"grad_norm": 0.13805285727527963,
"learning_rate": 5e-05,
"loss": 1.5836,
"step": 2717
},
{
"epoch": 2.742010283294687,
"grad_norm": 0.14366377411261483,
"learning_rate": 5e-05,
"loss": 1.545,
"step": 2718
},
{
"epoch": 2.7430184494404677,
"grad_norm": 0.14521750353616375,
"learning_rate": 5e-05,
"loss": 1.5728,
"step": 2719
},
{
"epoch": 2.7440266155862485,
"grad_norm": 0.15205719509608184,
"learning_rate": 5e-05,
"loss": 1.5549,
"step": 2720
},
{
"epoch": 2.7450347817320293,
"grad_norm": 0.15011156624584093,
"learning_rate": 5e-05,
"loss": 1.5611,
"step": 2721
},
{
"epoch": 2.74604294787781,
"grad_norm": 0.13791270226630295,
"learning_rate": 5e-05,
"loss": 1.5597,
"step": 2722
},
{
"epoch": 2.747051114023591,
"grad_norm": 0.13822033117826582,
"learning_rate": 5e-05,
"loss": 1.5685,
"step": 2723
},
{
"epoch": 2.7480592801693717,
"grad_norm": 0.13309371437546486,
"learning_rate": 5e-05,
"loss": 1.5737,
"step": 2724
},
{
"epoch": 2.749067446315153,
"grad_norm": 0.13956376666855283,
"learning_rate": 5e-05,
"loss": 1.5533,
"step": 2725
},
{
"epoch": 2.7500756124609333,
"grad_norm": 0.14242752187426724,
"learning_rate": 5e-05,
"loss": 1.5695,
"step": 2726
},
{
"epoch": 2.7510837786067146,
"grad_norm": 0.13778114699423819,
"learning_rate": 5e-05,
"loss": 1.5566,
"step": 2727
},
{
"epoch": 2.752091944752495,
"grad_norm": 0.1252140442866245,
"learning_rate": 5e-05,
"loss": 1.5619,
"step": 2728
},
{
"epoch": 2.753100110898276,
"grad_norm": 0.14450735002596674,
"learning_rate": 5e-05,
"loss": 1.5588,
"step": 2729
},
{
"epoch": 2.7541082770440566,
"grad_norm": 0.12444546171527433,
"learning_rate": 5e-05,
"loss": 1.5662,
"step": 2730
},
{
"epoch": 2.755116443189838,
"grad_norm": 0.141004291675775,
"learning_rate": 5e-05,
"loss": 1.572,
"step": 2731
},
{
"epoch": 2.7561246093356186,
"grad_norm": 0.13199288201421328,
"learning_rate": 5e-05,
"loss": 1.5861,
"step": 2732
},
{
"epoch": 2.7571327754813995,
"grad_norm": 0.12793988338331708,
"learning_rate": 5e-05,
"loss": 1.5785,
"step": 2733
},
{
"epoch": 2.7581409416271803,
"grad_norm": 0.14342057001903644,
"learning_rate": 5e-05,
"loss": 1.5567,
"step": 2734
},
{
"epoch": 2.759149107772961,
"grad_norm": 0.14450779133652514,
"learning_rate": 5e-05,
"loss": 1.5822,
"step": 2735
},
{
"epoch": 2.760157273918742,
"grad_norm": 0.1405320308190688,
"learning_rate": 5e-05,
"loss": 1.5595,
"step": 2736
},
{
"epoch": 2.7611654400645227,
"grad_norm": 0.1286823927500477,
"learning_rate": 5e-05,
"loss": 1.5511,
"step": 2737
},
{
"epoch": 2.7621736062103035,
"grad_norm": 0.15254126540044513,
"learning_rate": 5e-05,
"loss": 1.5599,
"step": 2738
},
{
"epoch": 2.7631817723560843,
"grad_norm": 0.1332062281330069,
"learning_rate": 5e-05,
"loss": 1.563,
"step": 2739
},
{
"epoch": 2.764189938501865,
"grad_norm": 0.1307457427111258,
"learning_rate": 5e-05,
"loss": 1.5617,
"step": 2740
},
{
"epoch": 2.765198104647646,
"grad_norm": 0.13375355501400343,
"learning_rate": 5e-05,
"loss": 1.5766,
"step": 2741
},
{
"epoch": 2.7662062707934267,
"grad_norm": 0.13102102704849145,
"learning_rate": 5e-05,
"loss": 1.5598,
"step": 2742
},
{
"epoch": 2.7672144369392075,
"grad_norm": 0.1344654670681869,
"learning_rate": 5e-05,
"loss": 1.5709,
"step": 2743
},
{
"epoch": 2.7682226030849884,
"grad_norm": 0.135632915147248,
"learning_rate": 5e-05,
"loss": 1.5623,
"step": 2744
},
{
"epoch": 2.769230769230769,
"grad_norm": 0.14105458578703645,
"learning_rate": 5e-05,
"loss": 1.5459,
"step": 2745
},
{
"epoch": 2.77023893537655,
"grad_norm": 0.1330217189753598,
"learning_rate": 5e-05,
"loss": 1.5593,
"step": 2746
},
{
"epoch": 2.771247101522331,
"grad_norm": 0.13021194087012497,
"learning_rate": 5e-05,
"loss": 1.5572,
"step": 2747
},
{
"epoch": 2.7722552676681116,
"grad_norm": 0.1300297855746897,
"learning_rate": 5e-05,
"loss": 1.5658,
"step": 2748
},
{
"epoch": 2.7732634338138924,
"grad_norm": 0.1370134739303623,
"learning_rate": 5e-05,
"loss": 1.5621,
"step": 2749
},
{
"epoch": 2.774271599959673,
"grad_norm": 0.13816320034814938,
"learning_rate": 5e-05,
"loss": 1.5893,
"step": 2750
},
{
"epoch": 2.775279766105454,
"grad_norm": 0.14272754344183877,
"learning_rate": 5e-05,
"loss": 1.5436,
"step": 2751
},
{
"epoch": 2.7762879322512353,
"grad_norm": 0.13315789617751309,
"learning_rate": 5e-05,
"loss": 1.5773,
"step": 2752
},
{
"epoch": 2.7772960983970156,
"grad_norm": 0.14393952562946477,
"learning_rate": 5e-05,
"loss": 1.5576,
"step": 2753
},
{
"epoch": 2.778304264542797,
"grad_norm": 0.14032685806148001,
"learning_rate": 5e-05,
"loss": 1.567,
"step": 2754
},
{
"epoch": 2.7793124306885773,
"grad_norm": 0.15032876140994053,
"learning_rate": 5e-05,
"loss": 1.5859,
"step": 2755
},
{
"epoch": 2.7803205968343585,
"grad_norm": 0.14094821098163882,
"learning_rate": 5e-05,
"loss": 1.5663,
"step": 2756
},
{
"epoch": 2.781328762980139,
"grad_norm": 0.14430135424273705,
"learning_rate": 5e-05,
"loss": 1.5515,
"step": 2757
},
{
"epoch": 2.78233692912592,
"grad_norm": 0.14519553615256653,
"learning_rate": 5e-05,
"loss": 1.5588,
"step": 2758
},
{
"epoch": 2.783345095271701,
"grad_norm": 0.21843890586032347,
"learning_rate": 5e-05,
"loss": 1.5949,
"step": 2759
},
{
"epoch": 2.7843532614174817,
"grad_norm": 0.14087284318724133,
"learning_rate": 5e-05,
"loss": 1.5634,
"step": 2760
},
{
"epoch": 2.7853614275632625,
"grad_norm": 0.13897524686225557,
"learning_rate": 5e-05,
"loss": 1.5929,
"step": 2761
},
{
"epoch": 2.7863695937090434,
"grad_norm": 0.1403214060798171,
"learning_rate": 5e-05,
"loss": 1.5688,
"step": 2762
},
{
"epoch": 2.787377759854824,
"grad_norm": 0.13466657363854218,
"learning_rate": 5e-05,
"loss": 1.5429,
"step": 2763
},
{
"epoch": 2.788385926000605,
"grad_norm": 0.14004040045303778,
"learning_rate": 5e-05,
"loss": 1.5561,
"step": 2764
},
{
"epoch": 2.789394092146386,
"grad_norm": 0.12908514859637493,
"learning_rate": 5e-05,
"loss": 1.567,
"step": 2765
},
{
"epoch": 2.7904022582921666,
"grad_norm": 0.13545587394709968,
"learning_rate": 5e-05,
"loss": 1.5735,
"step": 2766
},
{
"epoch": 2.7914104244379474,
"grad_norm": 0.14099444514004147,
"learning_rate": 5e-05,
"loss": 1.5572,
"step": 2767
},
{
"epoch": 2.792418590583728,
"grad_norm": 0.12851123246323942,
"learning_rate": 5e-05,
"loss": 1.5647,
"step": 2768
},
{
"epoch": 2.793426756729509,
"grad_norm": 0.1294860650724102,
"learning_rate": 5e-05,
"loss": 1.5632,
"step": 2769
},
{
"epoch": 2.79443492287529,
"grad_norm": 0.1545242303048526,
"learning_rate": 5e-05,
"loss": 1.5623,
"step": 2770
},
{
"epoch": 2.7954430890210706,
"grad_norm": 0.13293580747047254,
"learning_rate": 5e-05,
"loss": 1.5631,
"step": 2771
},
{
"epoch": 2.7964512551668514,
"grad_norm": 0.1677966111788506,
"learning_rate": 5e-05,
"loss": 1.5661,
"step": 2772
},
{
"epoch": 2.7974594213126323,
"grad_norm": 0.1373898025220244,
"learning_rate": 5e-05,
"loss": 1.5724,
"step": 2773
},
{
"epoch": 2.798467587458413,
"grad_norm": 0.15485397737946918,
"learning_rate": 5e-05,
"loss": 1.5709,
"step": 2774
},
{
"epoch": 2.799475753604194,
"grad_norm": 0.13026482998146938,
"learning_rate": 5e-05,
"loss": 1.5582,
"step": 2775
},
{
"epoch": 2.8004839197499747,
"grad_norm": 0.14900467029658052,
"learning_rate": 5e-05,
"loss": 1.569,
"step": 2776
},
{
"epoch": 2.8014920858957555,
"grad_norm": 0.15753731898029472,
"learning_rate": 5e-05,
"loss": 1.5671,
"step": 2777
},
{
"epoch": 2.8025002520415363,
"grad_norm": 0.14340045158388026,
"learning_rate": 5e-05,
"loss": 1.5746,
"step": 2778
},
{
"epoch": 2.803508418187317,
"grad_norm": 0.13741597876550288,
"learning_rate": 5e-05,
"loss": 1.582,
"step": 2779
},
{
"epoch": 2.804516584333098,
"grad_norm": 0.15596253708049596,
"learning_rate": 5e-05,
"loss": 1.5475,
"step": 2780
},
{
"epoch": 2.805524750478879,
"grad_norm": 0.14072537992748982,
"learning_rate": 5e-05,
"loss": 1.5766,
"step": 2781
},
{
"epoch": 2.8065329166246595,
"grad_norm": 0.14150135394284288,
"learning_rate": 5e-05,
"loss": 1.57,
"step": 2782
},
{
"epoch": 2.807541082770441,
"grad_norm": 0.16124329516191444,
"learning_rate": 5e-05,
"loss": 1.5729,
"step": 2783
},
{
"epoch": 2.808549248916221,
"grad_norm": 0.1409731887973358,
"learning_rate": 5e-05,
"loss": 1.5762,
"step": 2784
},
{
"epoch": 2.8095574150620024,
"grad_norm": 0.12994746549707634,
"learning_rate": 5e-05,
"loss": 1.5647,
"step": 2785
},
{
"epoch": 2.8105655812077828,
"grad_norm": 0.1337149306330427,
"learning_rate": 5e-05,
"loss": 1.5589,
"step": 2786
},
{
"epoch": 2.811573747353564,
"grad_norm": 0.14527439997833774,
"learning_rate": 5e-05,
"loss": 1.5571,
"step": 2787
},
{
"epoch": 2.812581913499345,
"grad_norm": 0.14729537346535954,
"learning_rate": 5e-05,
"loss": 1.5582,
"step": 2788
},
{
"epoch": 2.8135900796451256,
"grad_norm": 0.13523666939952828,
"learning_rate": 5e-05,
"loss": 1.5811,
"step": 2789
},
{
"epoch": 2.8145982457909065,
"grad_norm": 0.13471718692012805,
"learning_rate": 5e-05,
"loss": 1.56,
"step": 2790
},
{
"epoch": 2.8156064119366873,
"grad_norm": 0.14241905055575318,
"learning_rate": 5e-05,
"loss": 1.5552,
"step": 2791
},
{
"epoch": 2.816614578082468,
"grad_norm": 0.15117704580718153,
"learning_rate": 5e-05,
"loss": 1.5667,
"step": 2792
},
{
"epoch": 2.817622744228249,
"grad_norm": 0.129569133240046,
"learning_rate": 5e-05,
"loss": 1.5584,
"step": 2793
},
{
"epoch": 2.8186309103740297,
"grad_norm": 0.13326536091885646,
"learning_rate": 5e-05,
"loss": 1.5528,
"step": 2794
},
{
"epoch": 2.8196390765198105,
"grad_norm": 0.14180855374306933,
"learning_rate": 5e-05,
"loss": 1.5659,
"step": 2795
},
{
"epoch": 2.8206472426655913,
"grad_norm": 0.13591751704326935,
"learning_rate": 5e-05,
"loss": 1.5587,
"step": 2796
},
{
"epoch": 2.821655408811372,
"grad_norm": 0.12936260419202925,
"learning_rate": 5e-05,
"loss": 1.5539,
"step": 2797
},
{
"epoch": 2.822663574957153,
"grad_norm": 0.13430709757077128,
"learning_rate": 5e-05,
"loss": 1.5658,
"step": 2798
},
{
"epoch": 2.8236717411029337,
"grad_norm": 0.13930112727317112,
"learning_rate": 5e-05,
"loss": 1.5566,
"step": 2799
},
{
"epoch": 2.8246799072487145,
"grad_norm": 0.1394216962524837,
"learning_rate": 5e-05,
"loss": 1.5691,
"step": 2800
},
{
"epoch": 2.8256880733944953,
"grad_norm": 0.13870544987039013,
"learning_rate": 5e-05,
"loss": 1.5585,
"step": 2801
},
{
"epoch": 2.826696239540276,
"grad_norm": 0.13573910551114407,
"learning_rate": 5e-05,
"loss": 1.5679,
"step": 2802
},
{
"epoch": 2.827704405686057,
"grad_norm": 0.13141232855559276,
"learning_rate": 5e-05,
"loss": 1.5571,
"step": 2803
},
{
"epoch": 2.8287125718318378,
"grad_norm": 0.14085927498398315,
"learning_rate": 5e-05,
"loss": 1.5725,
"step": 2804
},
{
"epoch": 2.8297207379776186,
"grad_norm": 0.1495486355365083,
"learning_rate": 5e-05,
"loss": 1.5754,
"step": 2805
},
{
"epoch": 2.8307289041233994,
"grad_norm": 0.14377995563695223,
"learning_rate": 5e-05,
"loss": 1.5577,
"step": 2806
},
{
"epoch": 2.83173707026918,
"grad_norm": 0.13733139961069687,
"learning_rate": 5e-05,
"loss": 1.5534,
"step": 2807
},
{
"epoch": 2.8327452364149615,
"grad_norm": 0.1459807185548558,
"learning_rate": 5e-05,
"loss": 1.5683,
"step": 2808
},
{
"epoch": 2.833753402560742,
"grad_norm": 0.14123278862792107,
"learning_rate": 5e-05,
"loss": 1.5641,
"step": 2809
},
{
"epoch": 2.834761568706523,
"grad_norm": 0.1302876133381825,
"learning_rate": 5e-05,
"loss": 1.567,
"step": 2810
},
{
"epoch": 2.8357697348523034,
"grad_norm": 0.12946097333437906,
"learning_rate": 5e-05,
"loss": 1.5448,
"step": 2811
},
{
"epoch": 2.8367779009980847,
"grad_norm": 0.141413346599211,
"learning_rate": 5e-05,
"loss": 1.5502,
"step": 2812
},
{
"epoch": 2.837786067143865,
"grad_norm": 0.1373517445844322,
"learning_rate": 5e-05,
"loss": 1.5538,
"step": 2813
},
{
"epoch": 2.8387942332896463,
"grad_norm": 0.14637065916716574,
"learning_rate": 5e-05,
"loss": 1.5619,
"step": 2814
},
{
"epoch": 2.839802399435427,
"grad_norm": 0.1441806648238271,
"learning_rate": 5e-05,
"loss": 1.5678,
"step": 2815
},
{
"epoch": 2.840810565581208,
"grad_norm": 0.14384298061843082,
"learning_rate": 5e-05,
"loss": 1.5414,
"step": 2816
},
{
"epoch": 2.8418187317269887,
"grad_norm": 0.15100690742938513,
"learning_rate": 5e-05,
"loss": 1.5895,
"step": 2817
},
{
"epoch": 2.8428268978727695,
"grad_norm": 0.14521702309872028,
"learning_rate": 5e-05,
"loss": 1.5636,
"step": 2818
},
{
"epoch": 2.8438350640185504,
"grad_norm": 0.1442222471812924,
"learning_rate": 5e-05,
"loss": 1.5601,
"step": 2819
},
{
"epoch": 2.844843230164331,
"grad_norm": 0.14102393840045072,
"learning_rate": 5e-05,
"loss": 1.5619,
"step": 2820
},
{
"epoch": 2.845851396310112,
"grad_norm": 0.14387069683105316,
"learning_rate": 5e-05,
"loss": 1.5604,
"step": 2821
},
{
"epoch": 2.846859562455893,
"grad_norm": 0.14830391380316718,
"learning_rate": 5e-05,
"loss": 1.5597,
"step": 2822
},
{
"epoch": 2.8478677286016736,
"grad_norm": 0.1390966393194733,
"learning_rate": 5e-05,
"loss": 1.5673,
"step": 2823
},
{
"epoch": 2.8488758947474544,
"grad_norm": 0.13823246164976574,
"learning_rate": 5e-05,
"loss": 1.5549,
"step": 2824
},
{
"epoch": 2.849884060893235,
"grad_norm": 0.15290443253208844,
"learning_rate": 5e-05,
"loss": 1.5464,
"step": 2825
},
{
"epoch": 2.850892227039016,
"grad_norm": 0.14540803890446127,
"learning_rate": 5e-05,
"loss": 1.5564,
"step": 2826
},
{
"epoch": 2.851900393184797,
"grad_norm": 0.14501993953342943,
"learning_rate": 5e-05,
"loss": 1.5617,
"step": 2827
},
{
"epoch": 2.8529085593305776,
"grad_norm": 0.14398340860081874,
"learning_rate": 5e-05,
"loss": 1.5658,
"step": 2828
},
{
"epoch": 2.8539167254763584,
"grad_norm": 0.14518128848972808,
"learning_rate": 5e-05,
"loss": 1.557,
"step": 2829
},
{
"epoch": 2.8549248916221392,
"grad_norm": 0.14454368726953212,
"learning_rate": 5e-05,
"loss": 1.576,
"step": 2830
},
{
"epoch": 2.85593305776792,
"grad_norm": 0.14806003244528992,
"learning_rate": 5e-05,
"loss": 1.5659,
"step": 2831
},
{
"epoch": 2.856941223913701,
"grad_norm": 0.1438396358137218,
"learning_rate": 5e-05,
"loss": 1.5661,
"step": 2832
},
{
"epoch": 2.8579493900594817,
"grad_norm": 0.14870498477600272,
"learning_rate": 5e-05,
"loss": 1.5676,
"step": 2833
},
{
"epoch": 2.8589575562052625,
"grad_norm": 0.13352249252236617,
"learning_rate": 5e-05,
"loss": 1.561,
"step": 2834
},
{
"epoch": 2.8599657223510433,
"grad_norm": 0.1398136261294875,
"learning_rate": 5e-05,
"loss": 1.565,
"step": 2835
},
{
"epoch": 2.860973888496824,
"grad_norm": 0.15341641930861216,
"learning_rate": 5e-05,
"loss": 1.5592,
"step": 2836
},
{
"epoch": 2.8619820546426054,
"grad_norm": 0.1435822023700197,
"learning_rate": 5e-05,
"loss": 1.5623,
"step": 2837
},
{
"epoch": 2.8629902207883857,
"grad_norm": 0.13842838786544373,
"learning_rate": 5e-05,
"loss": 1.548,
"step": 2838
},
{
"epoch": 2.863998386934167,
"grad_norm": 0.14097645285321597,
"learning_rate": 5e-05,
"loss": 1.5694,
"step": 2839
},
{
"epoch": 2.8650065530799473,
"grad_norm": 0.14074603728496404,
"learning_rate": 5e-05,
"loss": 1.5753,
"step": 2840
},
{
"epoch": 2.8660147192257286,
"grad_norm": 0.136340337143158,
"learning_rate": 5e-05,
"loss": 1.5656,
"step": 2841
},
{
"epoch": 2.867022885371509,
"grad_norm": 0.1661909550319213,
"learning_rate": 5e-05,
"loss": 1.5535,
"step": 2842
},
{
"epoch": 2.86803105151729,
"grad_norm": 0.14300362069607922,
"learning_rate": 5e-05,
"loss": 1.5716,
"step": 2843
},
{
"epoch": 2.869039217663071,
"grad_norm": 0.13424444297100016,
"learning_rate": 5e-05,
"loss": 1.5575,
"step": 2844
},
{
"epoch": 2.870047383808852,
"grad_norm": 0.14593598021094323,
"learning_rate": 5e-05,
"loss": 1.5708,
"step": 2845
},
{
"epoch": 2.8710555499546326,
"grad_norm": 0.13718997873999128,
"learning_rate": 5e-05,
"loss": 1.5716,
"step": 2846
},
{
"epoch": 2.8720637161004134,
"grad_norm": 0.14756508713246896,
"learning_rate": 5e-05,
"loss": 1.5479,
"step": 2847
},
{
"epoch": 2.8730718822461943,
"grad_norm": 0.13240813416747257,
"learning_rate": 5e-05,
"loss": 1.5709,
"step": 2848
},
{
"epoch": 2.874080048391975,
"grad_norm": 0.14818471917119877,
"learning_rate": 5e-05,
"loss": 1.5827,
"step": 2849
},
{
"epoch": 2.875088214537756,
"grad_norm": 0.1622234890061836,
"learning_rate": 5e-05,
"loss": 1.5567,
"step": 2850
},
{
"epoch": 2.8760963806835367,
"grad_norm": 0.14840209944205146,
"learning_rate": 5e-05,
"loss": 1.5647,
"step": 2851
},
{
"epoch": 2.8771045468293175,
"grad_norm": 0.13686801156144324,
"learning_rate": 5e-05,
"loss": 1.5519,
"step": 2852
},
{
"epoch": 2.8781127129750983,
"grad_norm": 0.1390584020109862,
"learning_rate": 5e-05,
"loss": 1.5478,
"step": 2853
},
{
"epoch": 2.879120879120879,
"grad_norm": 0.1358182096960893,
"learning_rate": 5e-05,
"loss": 1.5484,
"step": 2854
},
{
"epoch": 2.88012904526666,
"grad_norm": 0.398577333773907,
"learning_rate": 5e-05,
"loss": 1.5786,
"step": 2855
},
{
"epoch": 2.8811372114124407,
"grad_norm": 0.1331042940921803,
"learning_rate": 5e-05,
"loss": 1.58,
"step": 2856
},
{
"epoch": 2.8821453775582215,
"grad_norm": 0.13902477299710286,
"learning_rate": 5e-05,
"loss": 1.5654,
"step": 2857
},
{
"epoch": 2.8831535437040023,
"grad_norm": 0.13660096150095175,
"learning_rate": 5e-05,
"loss": 1.5622,
"step": 2858
},
{
"epoch": 2.884161709849783,
"grad_norm": 0.1324123575740831,
"learning_rate": 5e-05,
"loss": 1.5634,
"step": 2859
},
{
"epoch": 2.885169875995564,
"grad_norm": 0.1419985727170632,
"learning_rate": 5e-05,
"loss": 1.5446,
"step": 2860
},
{
"epoch": 2.8861780421413448,
"grad_norm": 0.14450454707140364,
"learning_rate": 5e-05,
"loss": 1.5588,
"step": 2861
},
{
"epoch": 2.8871862082871256,
"grad_norm": 0.13556264678370267,
"learning_rate": 5e-05,
"loss": 1.5616,
"step": 2862
},
{
"epoch": 2.8881943744329064,
"grad_norm": 0.13924940223913126,
"learning_rate": 5e-05,
"loss": 1.5582,
"step": 2863
},
{
"epoch": 2.8892025405786876,
"grad_norm": 0.14489037756646053,
"learning_rate": 5e-05,
"loss": 1.5547,
"step": 2864
},
{
"epoch": 2.890210706724468,
"grad_norm": 0.1364069870015726,
"learning_rate": 5e-05,
"loss": 1.5613,
"step": 2865
},
{
"epoch": 2.8912188728702493,
"grad_norm": 0.1413793239639967,
"learning_rate": 5e-05,
"loss": 1.5625,
"step": 2866
},
{
"epoch": 2.8922270390160296,
"grad_norm": 0.14912428347663212,
"learning_rate": 5e-05,
"loss": 1.5564,
"step": 2867
},
{
"epoch": 2.893235205161811,
"grad_norm": 0.1538535083338703,
"learning_rate": 5e-05,
"loss": 1.5599,
"step": 2868
},
{
"epoch": 2.8942433713075912,
"grad_norm": 0.14106606384804676,
"learning_rate": 5e-05,
"loss": 1.5505,
"step": 2869
},
{
"epoch": 2.8952515374533725,
"grad_norm": 0.14601342368943243,
"learning_rate": 5e-05,
"loss": 1.5743,
"step": 2870
},
{
"epoch": 2.8962597035991533,
"grad_norm": 0.1359969686343452,
"learning_rate": 5e-05,
"loss": 1.5752,
"step": 2871
},
{
"epoch": 2.897267869744934,
"grad_norm": 0.1321428989584743,
"learning_rate": 5e-05,
"loss": 1.5609,
"step": 2872
},
{
"epoch": 2.898276035890715,
"grad_norm": 0.13621152268777725,
"learning_rate": 5e-05,
"loss": 1.5699,
"step": 2873
},
{
"epoch": 2.8992842020364957,
"grad_norm": 0.14578823774326305,
"learning_rate": 5e-05,
"loss": 1.5666,
"step": 2874
},
{
"epoch": 2.9002923681822765,
"grad_norm": 0.13560289941208062,
"learning_rate": 5e-05,
"loss": 1.5631,
"step": 2875
},
{
"epoch": 2.9013005343280573,
"grad_norm": 0.6578459661435414,
"learning_rate": 5e-05,
"loss": 1.5493,
"step": 2876
},
{
"epoch": 2.902308700473838,
"grad_norm": 0.139381900708492,
"learning_rate": 5e-05,
"loss": 1.5577,
"step": 2877
},
{
"epoch": 2.903316866619619,
"grad_norm": 0.13559165116802954,
"learning_rate": 5e-05,
"loss": 1.5588,
"step": 2878
},
{
"epoch": 2.9043250327653998,
"grad_norm": 0.15146327521944022,
"learning_rate": 5e-05,
"loss": 1.5661,
"step": 2879
},
{
"epoch": 2.9053331989111806,
"grad_norm": 0.13804288601464923,
"learning_rate": 5e-05,
"loss": 1.5452,
"step": 2880
},
{
"epoch": 2.9063413650569614,
"grad_norm": 0.14744411336595978,
"learning_rate": 5e-05,
"loss": 1.5637,
"step": 2881
},
{
"epoch": 2.907349531202742,
"grad_norm": 0.14237854990959567,
"learning_rate": 5e-05,
"loss": 1.5553,
"step": 2882
},
{
"epoch": 2.908357697348523,
"grad_norm": 0.15023980689854555,
"learning_rate": 5e-05,
"loss": 1.5424,
"step": 2883
},
{
"epoch": 2.909365863494304,
"grad_norm": 0.1429507112014247,
"learning_rate": 5e-05,
"loss": 1.5445,
"step": 2884
},
{
"epoch": 2.9103740296400846,
"grad_norm": 0.13597749204024304,
"learning_rate": 5e-05,
"loss": 1.5802,
"step": 2885
},
{
"epoch": 2.9113821957858654,
"grad_norm": 0.14491507670987977,
"learning_rate": 5e-05,
"loss": 1.562,
"step": 2886
},
{
"epoch": 2.9123903619316462,
"grad_norm": 0.13456355698926403,
"learning_rate": 5e-05,
"loss": 1.5596,
"step": 2887
},
{
"epoch": 2.913398528077427,
"grad_norm": 0.15444933801070784,
"learning_rate": 5e-05,
"loss": 1.5605,
"step": 2888
},
{
"epoch": 2.914406694223208,
"grad_norm": 0.1371525313971729,
"learning_rate": 5e-05,
"loss": 1.5667,
"step": 2889
},
{
"epoch": 2.9154148603689887,
"grad_norm": 0.14412880734806927,
"learning_rate": 5e-05,
"loss": 1.5656,
"step": 2890
},
{
"epoch": 2.9164230265147695,
"grad_norm": 0.14034209949997412,
"learning_rate": 5e-05,
"loss": 1.5588,
"step": 2891
},
{
"epoch": 2.9174311926605503,
"grad_norm": 0.12952729294769288,
"learning_rate": 5e-05,
"loss": 1.5442,
"step": 2892
},
{
"epoch": 2.9184393588063315,
"grad_norm": 0.1398162399293511,
"learning_rate": 5e-05,
"loss": 1.5717,
"step": 2893
},
{
"epoch": 2.919447524952112,
"grad_norm": 0.1329999216346244,
"learning_rate": 5e-05,
"loss": 1.5395,
"step": 2894
},
{
"epoch": 2.920455691097893,
"grad_norm": 0.15674089864365628,
"learning_rate": 5e-05,
"loss": 1.5437,
"step": 2895
},
{
"epoch": 2.9214638572436735,
"grad_norm": 0.13429643000428865,
"learning_rate": 5e-05,
"loss": 1.5494,
"step": 2896
},
{
"epoch": 2.9224720233894548,
"grad_norm": 0.1332516748670609,
"learning_rate": 5e-05,
"loss": 1.5691,
"step": 2897
},
{
"epoch": 2.923480189535235,
"grad_norm": 0.15368020425840034,
"learning_rate": 5e-05,
"loss": 1.5516,
"step": 2898
},
{
"epoch": 2.9244883556810164,
"grad_norm": 0.13508568799212733,
"learning_rate": 5e-05,
"loss": 1.5496,
"step": 2899
},
{
"epoch": 2.925496521826797,
"grad_norm": 0.14579709546992245,
"learning_rate": 5e-05,
"loss": 1.5559,
"step": 2900
},
{
"epoch": 2.926504687972578,
"grad_norm": 0.15426626925981565,
"learning_rate": 5e-05,
"loss": 1.5659,
"step": 2901
},
{
"epoch": 2.927512854118359,
"grad_norm": 0.138471559025861,
"learning_rate": 5e-05,
"loss": 1.5656,
"step": 2902
},
{
"epoch": 2.9285210202641396,
"grad_norm": 0.155357177378076,
"learning_rate": 5e-05,
"loss": 1.5688,
"step": 2903
},
{
"epoch": 2.9295291864099204,
"grad_norm": 0.137418601181882,
"learning_rate": 5e-05,
"loss": 1.559,
"step": 2904
},
{
"epoch": 2.9305373525557012,
"grad_norm": 0.15213283592124305,
"learning_rate": 5e-05,
"loss": 1.5697,
"step": 2905
},
{
"epoch": 2.931545518701482,
"grad_norm": 0.1371356248123586,
"learning_rate": 5e-05,
"loss": 1.5536,
"step": 2906
},
{
"epoch": 2.932553684847263,
"grad_norm": 0.15585636460909322,
"learning_rate": 5e-05,
"loss": 1.5784,
"step": 2907
},
{
"epoch": 2.9335618509930437,
"grad_norm": 0.13432776199282834,
"learning_rate": 5e-05,
"loss": 1.5633,
"step": 2908
},
{
"epoch": 2.9345700171388245,
"grad_norm": 0.15101484572955937,
"learning_rate": 5e-05,
"loss": 1.5638,
"step": 2909
},
{
"epoch": 2.9355781832846053,
"grad_norm": 0.13284986895435724,
"learning_rate": 5e-05,
"loss": 1.5536,
"step": 2910
},
{
"epoch": 2.936586349430386,
"grad_norm": 0.15239448643115522,
"learning_rate": 5e-05,
"loss": 1.5542,
"step": 2911
},
{
"epoch": 2.937594515576167,
"grad_norm": 0.13304948631549568,
"learning_rate": 5e-05,
"loss": 1.5515,
"step": 2912
},
{
"epoch": 2.9386026817219477,
"grad_norm": 0.1493914552614863,
"learning_rate": 5e-05,
"loss": 1.5786,
"step": 2913
},
{
"epoch": 2.9396108478677285,
"grad_norm": 0.141104588952366,
"learning_rate": 5e-05,
"loss": 1.5541,
"step": 2914
},
{
"epoch": 2.9406190140135093,
"grad_norm": 0.14744388875991352,
"learning_rate": 5e-05,
"loss": 1.5695,
"step": 2915
},
{
"epoch": 2.94162718015929,
"grad_norm": 0.15106287768765167,
"learning_rate": 5e-05,
"loss": 1.5635,
"step": 2916
},
{
"epoch": 2.942635346305071,
"grad_norm": 0.13633154398328548,
"learning_rate": 5e-05,
"loss": 1.5568,
"step": 2917
},
{
"epoch": 2.9436435124508518,
"grad_norm": 0.14206620322234342,
"learning_rate": 5e-05,
"loss": 1.5712,
"step": 2918
},
{
"epoch": 2.9446516785966326,
"grad_norm": 0.1400371750496543,
"learning_rate": 5e-05,
"loss": 1.5481,
"step": 2919
},
{
"epoch": 2.945659844742414,
"grad_norm": 0.15361701555835644,
"learning_rate": 5e-05,
"loss": 1.5517,
"step": 2920
},
{
"epoch": 2.946668010888194,
"grad_norm": 0.1395143142535918,
"learning_rate": 5e-05,
"loss": 1.5563,
"step": 2921
},
{
"epoch": 2.9476761770339754,
"grad_norm": 0.15238197270206633,
"learning_rate": 5e-05,
"loss": 1.5611,
"step": 2922
},
{
"epoch": 2.948684343179756,
"grad_norm": 0.13791269422161265,
"learning_rate": 5e-05,
"loss": 1.5445,
"step": 2923
},
{
"epoch": 2.949692509325537,
"grad_norm": 0.1527120115790887,
"learning_rate": 5e-05,
"loss": 1.5591,
"step": 2924
},
{
"epoch": 2.9507006754713174,
"grad_norm": 0.14875816521276122,
"learning_rate": 5e-05,
"loss": 1.5391,
"step": 2925
},
{
"epoch": 2.9517088416170987,
"grad_norm": 0.14956494592977318,
"learning_rate": 5e-05,
"loss": 1.548,
"step": 2926
},
{
"epoch": 2.9527170077628795,
"grad_norm": 0.1323459526950249,
"learning_rate": 5e-05,
"loss": 1.5546,
"step": 2927
},
{
"epoch": 2.9537251739086603,
"grad_norm": 0.16534355366989031,
"learning_rate": 5e-05,
"loss": 1.5373,
"step": 2928
},
{
"epoch": 2.954733340054441,
"grad_norm": 0.1345830722048253,
"learning_rate": 5e-05,
"loss": 1.5526,
"step": 2929
},
{
"epoch": 2.955741506200222,
"grad_norm": 0.14475119660699556,
"learning_rate": 5e-05,
"loss": 1.5604,
"step": 2930
},
{
"epoch": 2.9567496723460027,
"grad_norm": 0.1423458248073331,
"learning_rate": 5e-05,
"loss": 1.5737,
"step": 2931
},
{
"epoch": 2.9577578384917835,
"grad_norm": 0.12953891111963645,
"learning_rate": 5e-05,
"loss": 1.5623,
"step": 2932
},
{
"epoch": 2.9587660046375643,
"grad_norm": 0.14369391910038792,
"learning_rate": 5e-05,
"loss": 1.537,
"step": 2933
},
{
"epoch": 2.959774170783345,
"grad_norm": 0.15086425876239956,
"learning_rate": 5e-05,
"loss": 1.5727,
"step": 2934
},
{
"epoch": 2.960782336929126,
"grad_norm": 0.14151325112581856,
"learning_rate": 5e-05,
"loss": 1.5504,
"step": 2935
},
{
"epoch": 2.9617905030749068,
"grad_norm": 0.14019801752830394,
"learning_rate": 5e-05,
"loss": 1.5711,
"step": 2936
},
{
"epoch": 2.9627986692206876,
"grad_norm": 0.1453666753568266,
"learning_rate": 5e-05,
"loss": 1.566,
"step": 2937
},
{
"epoch": 2.9638068353664684,
"grad_norm": 0.13919807108999072,
"learning_rate": 5e-05,
"loss": 1.5651,
"step": 2938
},
{
"epoch": 2.964815001512249,
"grad_norm": 0.14464003186645194,
"learning_rate": 5e-05,
"loss": 1.5845,
"step": 2939
},
{
"epoch": 2.96582316765803,
"grad_norm": 0.13620309551680643,
"learning_rate": 5e-05,
"loss": 1.5598,
"step": 2940
},
{
"epoch": 2.966831333803811,
"grad_norm": 0.1372206763034652,
"learning_rate": 5e-05,
"loss": 1.5578,
"step": 2941
},
{
"epoch": 2.9678394999495916,
"grad_norm": 0.13886626014581938,
"learning_rate": 5e-05,
"loss": 1.5649,
"step": 2942
},
{
"epoch": 2.9688476660953724,
"grad_norm": 0.13428686365047957,
"learning_rate": 5e-05,
"loss": 1.5704,
"step": 2943
},
{
"epoch": 2.9698558322411532,
"grad_norm": 0.12948797335444784,
"learning_rate": 5e-05,
"loss": 1.5725,
"step": 2944
},
{
"epoch": 2.970863998386934,
"grad_norm": 0.12769456210733446,
"learning_rate": 5e-05,
"loss": 1.5541,
"step": 2945
},
{
"epoch": 2.971872164532715,
"grad_norm": 0.14488293202297234,
"learning_rate": 5e-05,
"loss": 1.5609,
"step": 2946
},
{
"epoch": 2.9728803306784957,
"grad_norm": 0.14137326637650713,
"learning_rate": 5e-05,
"loss": 1.5581,
"step": 2947
},
{
"epoch": 2.9738884968242765,
"grad_norm": 0.1338660797211836,
"learning_rate": 5e-05,
"loss": 1.5614,
"step": 2948
},
{
"epoch": 2.9748966629700577,
"grad_norm": 0.1379123131883918,
"learning_rate": 5e-05,
"loss": 1.5642,
"step": 2949
},
{
"epoch": 2.975904829115838,
"grad_norm": 0.1340410935395909,
"learning_rate": 5e-05,
"loss": 1.5668,
"step": 2950
},
{
"epoch": 2.9769129952616193,
"grad_norm": 0.12855729642664465,
"learning_rate": 5e-05,
"loss": 1.5513,
"step": 2951
},
{
"epoch": 2.9779211614073997,
"grad_norm": 0.13071208212474497,
"learning_rate": 5e-05,
"loss": 1.5522,
"step": 2952
},
{
"epoch": 2.978929327553181,
"grad_norm": 0.13305329821521164,
"learning_rate": 5e-05,
"loss": 1.5668,
"step": 2953
},
{
"epoch": 2.9799374936989613,
"grad_norm": 0.13376568327004498,
"learning_rate": 5e-05,
"loss": 1.558,
"step": 2954
},
{
"epoch": 2.9809456598447426,
"grad_norm": 0.13084931939094915,
"learning_rate": 5e-05,
"loss": 1.5575,
"step": 2955
},
{
"epoch": 2.9819538259905234,
"grad_norm": 0.13867891937239454,
"learning_rate": 5e-05,
"loss": 1.5598,
"step": 2956
},
{
"epoch": 2.982961992136304,
"grad_norm": 0.1397205924366696,
"learning_rate": 5e-05,
"loss": 1.5663,
"step": 2957
},
{
"epoch": 2.983970158282085,
"grad_norm": 0.13969260296990604,
"learning_rate": 5e-05,
"loss": 1.5606,
"step": 2958
},
{
"epoch": 2.984978324427866,
"grad_norm": 0.1631492011661443,
"learning_rate": 5e-05,
"loss": 1.5398,
"step": 2959
},
{
"epoch": 2.9859864905736466,
"grad_norm": 0.15119413820443203,
"learning_rate": 5e-05,
"loss": 1.5569,
"step": 2960
},
{
"epoch": 2.9869946567194274,
"grad_norm": 0.14863919142032006,
"learning_rate": 5e-05,
"loss": 1.5451,
"step": 2961
},
{
"epoch": 2.9880028228652082,
"grad_norm": 0.13687840450793468,
"learning_rate": 5e-05,
"loss": 1.5598,
"step": 2962
},
{
"epoch": 2.989010989010989,
"grad_norm": 0.1493181266930825,
"learning_rate": 5e-05,
"loss": 1.5572,
"step": 2963
},
{
"epoch": 2.99001915515677,
"grad_norm": 0.20032135827358913,
"learning_rate": 5e-05,
"loss": 1.5583,
"step": 2964
},
{
"epoch": 2.9910273213025507,
"grad_norm": 0.13696879150194943,
"learning_rate": 5e-05,
"loss": 1.569,
"step": 2965
},
{
"epoch": 2.9920354874483315,
"grad_norm": 0.14187831421826821,
"learning_rate": 5e-05,
"loss": 1.5625,
"step": 2966
},
{
"epoch": 2.9930436535941123,
"grad_norm": 0.14273354843205222,
"learning_rate": 5e-05,
"loss": 1.5632,
"step": 2967
},
{
"epoch": 2.994051819739893,
"grad_norm": 0.1372115086061227,
"learning_rate": 5e-05,
"loss": 1.5637,
"step": 2968
},
{
"epoch": 2.995059985885674,
"grad_norm": 0.13929386398909455,
"learning_rate": 5e-05,
"loss": 1.5568,
"step": 2969
},
{
"epoch": 2.9960681520314547,
"grad_norm": 0.14597924434479242,
"learning_rate": 5e-05,
"loss": 1.5658,
"step": 2970
},
{
"epoch": 2.9970763181772355,
"grad_norm": 0.13204270313325644,
"learning_rate": 5e-05,
"loss": 1.552,
"step": 2971
},
{
"epoch": 2.9980844843230163,
"grad_norm": 0.1490654908890621,
"learning_rate": 5e-05,
"loss": 1.57,
"step": 2972
},
{
"epoch": 2.999092650468797,
"grad_norm": 0.14445886974569572,
"learning_rate": 5e-05,
"loss": 1.5441,
"step": 2973
},
{
"epoch": 2.999092650468797,
"step": 2973,
"total_flos": 1045643124342784.0,
"train_loss": 1.6583996424962084,
"train_runtime": 54127.1912,
"train_samples_per_second": 10.994,
"train_steps_per_second": 0.055
}
],
"logging_steps": 1,
"max_steps": 2973,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1045643124342784.0,
"train_batch_size": 10,
"trial_name": null,
"trial_params": null
}