{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1491,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002012072434607646,
"grad_norm": 12.363840103149414,
"learning_rate": 0.0,
"loss": 1.2598,
"step": 1
},
{
"epoch": 0.004024144869215292,
"grad_norm": 12.238037109375,
"learning_rate": 6.666666666666668e-08,
"loss": 1.2856,
"step": 2
},
{
"epoch": 0.006036217303822937,
"grad_norm": 12.225936889648438,
"learning_rate": 1.3333333333333336e-07,
"loss": 1.3008,
"step": 3
},
{
"epoch": 0.008048289738430584,
"grad_norm": 11.544827461242676,
"learning_rate": 2.0000000000000002e-07,
"loss": 1.2326,
"step": 4
},
{
"epoch": 0.01006036217303823,
"grad_norm": 12.333775520324707,
"learning_rate": 2.666666666666667e-07,
"loss": 1.2295,
"step": 5
},
{
"epoch": 0.012072434607645875,
"grad_norm": 12.383609771728516,
"learning_rate": 3.3333333333333335e-07,
"loss": 1.2722,
"step": 6
},
{
"epoch": 0.014084507042253521,
"grad_norm": 11.465510368347168,
"learning_rate": 4.0000000000000003e-07,
"loss": 1.1603,
"step": 7
},
{
"epoch": 0.01609657947686117,
"grad_norm": 12.0714750289917,
"learning_rate": 4.666666666666667e-07,
"loss": 1.2336,
"step": 8
},
{
"epoch": 0.018108651911468814,
"grad_norm": 11.47994327545166,
"learning_rate": 5.333333333333335e-07,
"loss": 1.2026,
"step": 9
},
{
"epoch": 0.02012072434607646,
"grad_norm": 10.58621883392334,
"learning_rate": 6.000000000000001e-07,
"loss": 1.1641,
"step": 10
},
{
"epoch": 0.022132796780684104,
"grad_norm": 10.854904174804688,
"learning_rate": 6.666666666666667e-07,
"loss": 1.2054,
"step": 11
},
{
"epoch": 0.02414486921529175,
"grad_norm": 10.539307594299316,
"learning_rate": 7.333333333333334e-07,
"loss": 1.1917,
"step": 12
},
{
"epoch": 0.026156941649899398,
"grad_norm": 8.997591972351074,
"learning_rate": 8.000000000000001e-07,
"loss": 1.1712,
"step": 13
},
{
"epoch": 0.028169014084507043,
"grad_norm": 8.63853931427002,
"learning_rate": 8.666666666666668e-07,
"loss": 1.1661,
"step": 14
},
{
"epoch": 0.030181086519114688,
"grad_norm": 8.33820915222168,
"learning_rate": 9.333333333333334e-07,
"loss": 1.1192,
"step": 15
},
{
"epoch": 0.03219315895372234,
"grad_norm": 8.019930839538574,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.1022,
"step": 16
},
{
"epoch": 0.03420523138832998,
"grad_norm": 6.020583629608154,
"learning_rate": 1.066666666666667e-06,
"loss": 1.0401,
"step": 17
},
{
"epoch": 0.03621730382293763,
"grad_norm": 5.38732385635376,
"learning_rate": 1.1333333333333334e-06,
"loss": 0.9759,
"step": 18
},
{
"epoch": 0.03822937625754527,
"grad_norm": 5.1723833084106445,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.96,
"step": 19
},
{
"epoch": 0.04024144869215292,
"grad_norm": 5.272335052490234,
"learning_rate": 1.2666666666666669e-06,
"loss": 0.9716,
"step": 20
},
{
"epoch": 0.04225352112676056,
"grad_norm": 4.846181869506836,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.942,
"step": 21
},
{
"epoch": 0.04426559356136821,
"grad_norm": 4.733026504516602,
"learning_rate": 1.4000000000000001e-06,
"loss": 0.9423,
"step": 22
},
{
"epoch": 0.04627766599597585,
"grad_norm": 4.3946990966796875,
"learning_rate": 1.4666666666666669e-06,
"loss": 0.9051,
"step": 23
},
{
"epoch": 0.0482897384305835,
"grad_norm": 4.081869125366211,
"learning_rate": 1.5333333333333334e-06,
"loss": 0.838,
"step": 24
},
{
"epoch": 0.05030181086519115,
"grad_norm": 4.333906173706055,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.8396,
"step": 25
},
{
"epoch": 0.052313883299798795,
"grad_norm": 4.418334007263184,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.8865,
"step": 26
},
{
"epoch": 0.05432595573440644,
"grad_norm": 3.9022629261016846,
"learning_rate": 1.7333333333333336e-06,
"loss": 0.8417,
"step": 27
},
{
"epoch": 0.056338028169014086,
"grad_norm": 3.718716621398926,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.792,
"step": 28
},
{
"epoch": 0.05835010060362173,
"grad_norm": 3.3578712940216064,
"learning_rate": 1.8666666666666669e-06,
"loss": 0.8466,
"step": 29
},
{
"epoch": 0.060362173038229376,
"grad_norm": 3.447502374649048,
"learning_rate": 1.9333333333333336e-06,
"loss": 0.8104,
"step": 30
},
{
"epoch": 0.06237424547283702,
"grad_norm": 3.520570993423462,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.8379,
"step": 31
},
{
"epoch": 0.06438631790744467,
"grad_norm": 3.2361371517181396,
"learning_rate": 2.0666666666666666e-06,
"loss": 0.8015,
"step": 32
},
{
"epoch": 0.06639839034205232,
"grad_norm": 3.283750534057617,
"learning_rate": 2.133333333333334e-06,
"loss": 0.7935,
"step": 33
},
{
"epoch": 0.06841046277665996,
"grad_norm": 3.19575834274292,
"learning_rate": 2.2e-06,
"loss": 0.7224,
"step": 34
},
{
"epoch": 0.07042253521126761,
"grad_norm": 3.1006929874420166,
"learning_rate": 2.266666666666667e-06,
"loss": 0.7664,
"step": 35
},
{
"epoch": 0.07243460764587525,
"grad_norm": 3.1626148223876953,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.7612,
"step": 36
},
{
"epoch": 0.0744466800804829,
"grad_norm": 3.043454170227051,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.7359,
"step": 37
},
{
"epoch": 0.07645875251509054,
"grad_norm": 2.9314467906951904,
"learning_rate": 2.466666666666667e-06,
"loss": 0.7721,
"step": 38
},
{
"epoch": 0.07847082494969819,
"grad_norm": 3.1366055011749268,
"learning_rate": 2.5333333333333338e-06,
"loss": 0.719,
"step": 39
},
{
"epoch": 0.08048289738430583,
"grad_norm": 3.107473850250244,
"learning_rate": 2.6e-06,
"loss": 0.8246,
"step": 40
},
{
"epoch": 0.08249496981891348,
"grad_norm": 2.6806769371032715,
"learning_rate": 2.666666666666667e-06,
"loss": 0.6973,
"step": 41
},
{
"epoch": 0.08450704225352113,
"grad_norm": 2.670893430709839,
"learning_rate": 2.7333333333333336e-06,
"loss": 0.7593,
"step": 42
},
{
"epoch": 0.08651911468812877,
"grad_norm": 2.6718692779541016,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.7396,
"step": 43
},
{
"epoch": 0.08853118712273642,
"grad_norm": 2.9941153526306152,
"learning_rate": 2.866666666666667e-06,
"loss": 0.7654,
"step": 44
},
{
"epoch": 0.09054325955734406,
"grad_norm": 2.6897857189178467,
"learning_rate": 2.9333333333333338e-06,
"loss": 0.6612,
"step": 45
},
{
"epoch": 0.0925553319919517,
"grad_norm": 2.886622667312622,
"learning_rate": 3e-06,
"loss": 0.7474,
"step": 46
},
{
"epoch": 0.09456740442655935,
"grad_norm": 2.6397299766540527,
"learning_rate": 3.066666666666667e-06,
"loss": 0.704,
"step": 47
},
{
"epoch": 0.096579476861167,
"grad_norm": 2.64058780670166,
"learning_rate": 3.133333333333334e-06,
"loss": 0.6708,
"step": 48
},
{
"epoch": 0.09859154929577464,
"grad_norm": 3.1715197563171387,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.6346,
"step": 49
},
{
"epoch": 0.1006036217303823,
"grad_norm": 2.7641634941101074,
"learning_rate": 3.266666666666667e-06,
"loss": 0.6568,
"step": 50
},
{
"epoch": 0.10261569416498995,
"grad_norm": 2.6137845516204834,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.6276,
"step": 51
},
{
"epoch": 0.10462776659959759,
"grad_norm": 2.7980453968048096,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.6974,
"step": 52
},
{
"epoch": 0.10663983903420524,
"grad_norm": 2.5735130310058594,
"learning_rate": 3.4666666666666672e-06,
"loss": 0.7257,
"step": 53
},
{
"epoch": 0.10865191146881288,
"grad_norm": 2.5075342655181885,
"learning_rate": 3.5333333333333335e-06,
"loss": 0.6615,
"step": 54
},
{
"epoch": 0.11066398390342053,
"grad_norm": 2.779794454574585,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.642,
"step": 55
},
{
"epoch": 0.11267605633802817,
"grad_norm": 2.7019553184509277,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.6855,
"step": 56
},
{
"epoch": 0.11468812877263582,
"grad_norm": 2.685800075531006,
"learning_rate": 3.7333333333333337e-06,
"loss": 0.7083,
"step": 57
},
{
"epoch": 0.11670020120724346,
"grad_norm": 2.5412144660949707,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.768,
"step": 58
},
{
"epoch": 0.11871227364185111,
"grad_norm": 2.727508783340454,
"learning_rate": 3.866666666666667e-06,
"loss": 0.6644,
"step": 59
},
{
"epoch": 0.12072434607645875,
"grad_norm": 2.6290087699890137,
"learning_rate": 3.9333333333333335e-06,
"loss": 0.7471,
"step": 60
},
{
"epoch": 0.1227364185110664,
"grad_norm": 2.714343547821045,
"learning_rate": 4.000000000000001e-06,
"loss": 0.7178,
"step": 61
},
{
"epoch": 0.12474849094567404,
"grad_norm": 2.59692645072937,
"learning_rate": 4.066666666666667e-06,
"loss": 0.6653,
"step": 62
},
{
"epoch": 0.1267605633802817,
"grad_norm": 2.5672385692596436,
"learning_rate": 4.133333333333333e-06,
"loss": 0.6388,
"step": 63
},
{
"epoch": 0.12877263581488935,
"grad_norm": 2.3973758220672607,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.6807,
"step": 64
},
{
"epoch": 0.13078470824949698,
"grad_norm": 2.5640907287597656,
"learning_rate": 4.266666666666668e-06,
"loss": 0.6079,
"step": 65
},
{
"epoch": 0.13279678068410464,
"grad_norm": 2.399198055267334,
"learning_rate": 4.333333333333334e-06,
"loss": 0.6361,
"step": 66
},
{
"epoch": 0.13480885311871227,
"grad_norm": 2.62172794342041,
"learning_rate": 4.4e-06,
"loss": 0.6594,
"step": 67
},
{
"epoch": 0.13682092555331993,
"grad_norm": 2.631462574005127,
"learning_rate": 4.4666666666666665e-06,
"loss": 0.6447,
"step": 68
},
{
"epoch": 0.13883299798792756,
"grad_norm": 2.4494845867156982,
"learning_rate": 4.533333333333334e-06,
"loss": 0.5988,
"step": 69
},
{
"epoch": 0.14084507042253522,
"grad_norm": 2.6119790077209473,
"learning_rate": 4.600000000000001e-06,
"loss": 0.6501,
"step": 70
},
{
"epoch": 0.14285714285714285,
"grad_norm": 2.604640483856201,
"learning_rate": 4.666666666666667e-06,
"loss": 0.6714,
"step": 71
},
{
"epoch": 0.1448692152917505,
"grad_norm": 2.5536398887634277,
"learning_rate": 4.7333333333333335e-06,
"loss": 0.649,
"step": 72
},
{
"epoch": 0.14688128772635814,
"grad_norm": 2.456615686416626,
"learning_rate": 4.800000000000001e-06,
"loss": 0.6269,
"step": 73
},
{
"epoch": 0.1488933601609658,
"grad_norm": 2.5996391773223877,
"learning_rate": 4.866666666666667e-06,
"loss": 0.6658,
"step": 74
},
{
"epoch": 0.15090543259557343,
"grad_norm": 2.334995985031128,
"learning_rate": 4.933333333333334e-06,
"loss": 0.579,
"step": 75
},
{
"epoch": 0.1529175050301811,
"grad_norm": 2.5622453689575195,
"learning_rate": 5e-06,
"loss": 0.6378,
"step": 76
},
{
"epoch": 0.15492957746478872,
"grad_norm": 2.5427086353302,
"learning_rate": 5.0666666666666676e-06,
"loss": 0.6418,
"step": 77
},
{
"epoch": 0.15694164989939638,
"grad_norm": 2.5094008445739746,
"learning_rate": 5.133333333333334e-06,
"loss": 0.6471,
"step": 78
},
{
"epoch": 0.158953722334004,
"grad_norm": 2.7548811435699463,
"learning_rate": 5.2e-06,
"loss": 0.6631,
"step": 79
},
{
"epoch": 0.16096579476861167,
"grad_norm": 2.6184520721435547,
"learning_rate": 5.2666666666666665e-06,
"loss": 0.6779,
"step": 80
},
{
"epoch": 0.16297786720321933,
"grad_norm": 2.7182962894439697,
"learning_rate": 5.333333333333334e-06,
"loss": 0.6641,
"step": 81
},
{
"epoch": 0.16498993963782696,
"grad_norm": 2.5088016986846924,
"learning_rate": 5.400000000000001e-06,
"loss": 0.6771,
"step": 82
},
{
"epoch": 0.16700201207243462,
"grad_norm": 2.573153495788574,
"learning_rate": 5.466666666666667e-06,
"loss": 0.6399,
"step": 83
},
{
"epoch": 0.16901408450704225,
"grad_norm": 2.4727790355682373,
"learning_rate": 5.533333333333334e-06,
"loss": 0.6326,
"step": 84
},
{
"epoch": 0.1710261569416499,
"grad_norm": 2.5456035137176514,
"learning_rate": 5.600000000000001e-06,
"loss": 0.648,
"step": 85
},
{
"epoch": 0.17303822937625754,
"grad_norm": 2.272167444229126,
"learning_rate": 5.666666666666667e-06,
"loss": 0.5667,
"step": 86
},
{
"epoch": 0.1750503018108652,
"grad_norm": 2.5209145545959473,
"learning_rate": 5.733333333333334e-06,
"loss": 0.6212,
"step": 87
},
{
"epoch": 0.17706237424547283,
"grad_norm": 2.570265054702759,
"learning_rate": 5.8e-06,
"loss": 0.6653,
"step": 88
},
{
"epoch": 0.1790744466800805,
"grad_norm": 2.527291774749756,
"learning_rate": 5.8666666666666675e-06,
"loss": 0.6653,
"step": 89
},
{
"epoch": 0.18108651911468812,
"grad_norm": 2.5215442180633545,
"learning_rate": 5.933333333333335e-06,
"loss": 0.6097,
"step": 90
},
{
"epoch": 0.18309859154929578,
"grad_norm": 2.7476773262023926,
"learning_rate": 6e-06,
"loss": 0.6701,
"step": 91
},
{
"epoch": 0.1851106639839034,
"grad_norm": 2.5635337829589844,
"learning_rate": 6.066666666666667e-06,
"loss": 0.6136,
"step": 92
},
{
"epoch": 0.18712273641851107,
"grad_norm": 2.61013126373291,
"learning_rate": 6.133333333333334e-06,
"loss": 0.6636,
"step": 93
},
{
"epoch": 0.1891348088531187,
"grad_norm": 2.596705198287964,
"learning_rate": 6.200000000000001e-06,
"loss": 0.6145,
"step": 94
},
{
"epoch": 0.19114688128772636,
"grad_norm": 2.6027705669403076,
"learning_rate": 6.266666666666668e-06,
"loss": 0.6379,
"step": 95
},
{
"epoch": 0.193158953722334,
"grad_norm": 2.587643623352051,
"learning_rate": 6.333333333333333e-06,
"loss": 0.6574,
"step": 96
},
{
"epoch": 0.19517102615694165,
"grad_norm": 2.42325758934021,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.6293,
"step": 97
},
{
"epoch": 0.19718309859154928,
"grad_norm": 2.4672398567199707,
"learning_rate": 6.466666666666667e-06,
"loss": 0.5981,
"step": 98
},
{
"epoch": 0.19919517102615694,
"grad_norm": 2.478847026824951,
"learning_rate": 6.533333333333334e-06,
"loss": 0.6141,
"step": 99
},
{
"epoch": 0.2012072434607646,
"grad_norm": 2.40338397026062,
"learning_rate": 6.600000000000001e-06,
"loss": 0.6513,
"step": 100
},
{
"epoch": 0.20321931589537223,
"grad_norm": 2.523690700531006,
"learning_rate": 6.666666666666667e-06,
"loss": 0.6469,
"step": 101
},
{
"epoch": 0.2052313883299799,
"grad_norm": 2.5886921882629395,
"learning_rate": 6.733333333333334e-06,
"loss": 0.6215,
"step": 102
},
{
"epoch": 0.20724346076458752,
"grad_norm": 2.5619165897369385,
"learning_rate": 6.800000000000001e-06,
"loss": 0.645,
"step": 103
},
{
"epoch": 0.20925553319919518,
"grad_norm": 2.5696005821228027,
"learning_rate": 6.866666666666667e-06,
"loss": 0.6091,
"step": 104
},
{
"epoch": 0.2112676056338028,
"grad_norm": 2.725149393081665,
"learning_rate": 6.9333333333333344e-06,
"loss": 0.697,
"step": 105
},
{
"epoch": 0.21327967806841047,
"grad_norm": 2.7614009380340576,
"learning_rate": 7e-06,
"loss": 0.6294,
"step": 106
},
{
"epoch": 0.2152917505030181,
"grad_norm": 2.488131523132324,
"learning_rate": 7.066666666666667e-06,
"loss": 0.625,
"step": 107
},
{
"epoch": 0.21730382293762576,
"grad_norm": 2.4684252738952637,
"learning_rate": 7.133333333333334e-06,
"loss": 0.6594,
"step": 108
},
{
"epoch": 0.2193158953722334,
"grad_norm": 2.5597586631774902,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.6692,
"step": 109
},
{
"epoch": 0.22132796780684105,
"grad_norm": 2.429131031036377,
"learning_rate": 7.266666666666668e-06,
"loss": 0.6566,
"step": 110
},
{
"epoch": 0.22334004024144868,
"grad_norm": 2.4478707313537598,
"learning_rate": 7.333333333333333e-06,
"loss": 0.6316,
"step": 111
},
{
"epoch": 0.22535211267605634,
"grad_norm": 2.4292643070220947,
"learning_rate": 7.4e-06,
"loss": 0.6437,
"step": 112
},
{
"epoch": 0.22736418511066397,
"grad_norm": 2.5762760639190674,
"learning_rate": 7.4666666666666675e-06,
"loss": 0.6179,
"step": 113
},
{
"epoch": 0.22937625754527163,
"grad_norm": 2.5746638774871826,
"learning_rate": 7.533333333333334e-06,
"loss": 0.6415,
"step": 114
},
{
"epoch": 0.23138832997987926,
"grad_norm": 2.685413122177124,
"learning_rate": 7.600000000000001e-06,
"loss": 0.6635,
"step": 115
},
{
"epoch": 0.23340040241448692,
"grad_norm": 2.37715744972229,
"learning_rate": 7.666666666666667e-06,
"loss": 0.6124,
"step": 116
},
{
"epoch": 0.23541247484909456,
"grad_norm": 2.478545904159546,
"learning_rate": 7.733333333333334e-06,
"loss": 0.6226,
"step": 117
},
{
"epoch": 0.23742454728370221,
"grad_norm": 2.634754180908203,
"learning_rate": 7.800000000000002e-06,
"loss": 0.6772,
"step": 118
},
{
"epoch": 0.23943661971830985,
"grad_norm": 2.674330949783325,
"learning_rate": 7.866666666666667e-06,
"loss": 0.657,
"step": 119
},
{
"epoch": 0.2414486921529175,
"grad_norm": 2.785724401473999,
"learning_rate": 7.933333333333334e-06,
"loss": 0.5971,
"step": 120
},
{
"epoch": 0.24346076458752516,
"grad_norm": 2.7215487957000732,
"learning_rate": 8.000000000000001e-06,
"loss": 0.6228,
"step": 121
},
{
"epoch": 0.2454728370221328,
"grad_norm": 2.535701274871826,
"learning_rate": 8.066666666666667e-06,
"loss": 0.6479,
"step": 122
},
{
"epoch": 0.24748490945674045,
"grad_norm": 2.7002458572387695,
"learning_rate": 8.133333333333334e-06,
"loss": 0.6286,
"step": 123
},
{
"epoch": 0.24949698189134809,
"grad_norm": 2.5765464305877686,
"learning_rate": 8.2e-06,
"loss": 0.5909,
"step": 124
},
{
"epoch": 0.2515090543259557,
"grad_norm": 2.7955777645111084,
"learning_rate": 8.266666666666667e-06,
"loss": 0.6271,
"step": 125
},
{
"epoch": 0.2535211267605634,
"grad_norm": 2.583167552947998,
"learning_rate": 8.333333333333334e-06,
"loss": 0.6438,
"step": 126
},
{
"epoch": 0.25553319919517103,
"grad_norm": 2.5065181255340576,
"learning_rate": 8.400000000000001e-06,
"loss": 0.6421,
"step": 127
},
{
"epoch": 0.2575452716297787,
"grad_norm": 2.5843505859375,
"learning_rate": 8.466666666666668e-06,
"loss": 0.6277,
"step": 128
},
{
"epoch": 0.2595573440643863,
"grad_norm": 2.729172468185425,
"learning_rate": 8.533333333333335e-06,
"loss": 0.6338,
"step": 129
},
{
"epoch": 0.26156941649899396,
"grad_norm": 2.566673755645752,
"learning_rate": 8.6e-06,
"loss": 0.6624,
"step": 130
},
{
"epoch": 0.2635814889336016,
"grad_norm": 2.436913251876831,
"learning_rate": 8.666666666666668e-06,
"loss": 0.5772,
"step": 131
},
{
"epoch": 0.2655935613682093,
"grad_norm": 2.3920977115631104,
"learning_rate": 8.733333333333333e-06,
"loss": 0.6181,
"step": 132
},
{
"epoch": 0.2676056338028169,
"grad_norm": 2.6135761737823486,
"learning_rate": 8.8e-06,
"loss": 0.6439,
"step": 133
},
{
"epoch": 0.26961770623742454,
"grad_norm": 2.5263290405273438,
"learning_rate": 8.866666666666668e-06,
"loss": 0.6164,
"step": 134
},
{
"epoch": 0.2716297786720322,
"grad_norm": 2.5991945266723633,
"learning_rate": 8.933333333333333e-06,
"loss": 0.6352,
"step": 135
},
{
"epoch": 0.27364185110663986,
"grad_norm": 2.720930576324463,
"learning_rate": 9e-06,
"loss": 0.6283,
"step": 136
},
{
"epoch": 0.27565392354124746,
"grad_norm": 2.5199596881866455,
"learning_rate": 9.066666666666667e-06,
"loss": 0.6374,
"step": 137
},
{
"epoch": 0.2776659959758551,
"grad_norm": 2.7811954021453857,
"learning_rate": 9.133333333333335e-06,
"loss": 0.6282,
"step": 138
},
{
"epoch": 0.2796780684104628,
"grad_norm": 2.6453726291656494,
"learning_rate": 9.200000000000002e-06,
"loss": 0.6371,
"step": 139
},
{
"epoch": 0.28169014084507044,
"grad_norm": 2.6417572498321533,
"learning_rate": 9.266666666666667e-06,
"loss": 0.6413,
"step": 140
},
{
"epoch": 0.2837022132796781,
"grad_norm": 2.5948872566223145,
"learning_rate": 9.333333333333334e-06,
"loss": 0.6444,
"step": 141
},
{
"epoch": 0.2857142857142857,
"grad_norm": 2.462456464767456,
"learning_rate": 9.4e-06,
"loss": 0.6641,
"step": 142
},
{
"epoch": 0.28772635814889336,
"grad_norm": 2.4424092769622803,
"learning_rate": 9.466666666666667e-06,
"loss": 0.611,
"step": 143
},
{
"epoch": 0.289738430583501,
"grad_norm": 2.3720812797546387,
"learning_rate": 9.533333333333334e-06,
"loss": 0.5783,
"step": 144
},
{
"epoch": 0.2917505030181087,
"grad_norm": 2.604189872741699,
"learning_rate": 9.600000000000001e-06,
"loss": 0.6717,
"step": 145
},
{
"epoch": 0.2937625754527163,
"grad_norm": 2.4482827186584473,
"learning_rate": 9.666666666666667e-06,
"loss": 0.6396,
"step": 146
},
{
"epoch": 0.29577464788732394,
"grad_norm": 2.364368200302124,
"learning_rate": 9.733333333333334e-06,
"loss": 0.6,
"step": 147
},
{
"epoch": 0.2977867203219316,
"grad_norm": 2.4040334224700928,
"learning_rate": 9.800000000000001e-06,
"loss": 0.6545,
"step": 148
},
{
"epoch": 0.29979879275653926,
"grad_norm": 2.6192374229431152,
"learning_rate": 9.866666666666668e-06,
"loss": 0.6403,
"step": 149
},
{
"epoch": 0.30181086519114686,
"grad_norm": 2.255182981491089,
"learning_rate": 9.933333333333334e-06,
"loss": 0.6026,
"step": 150
},
{
"epoch": 0.3038229376257545,
"grad_norm": 2.4689016342163086,
"learning_rate": 1e-05,
"loss": 0.6061,
"step": 151
},
{
"epoch": 0.3058350100603622,
"grad_norm": 2.5769577026367188,
"learning_rate": 9.999986279118938e-06,
"loss": 0.6053,
"step": 152
},
{
"epoch": 0.30784708249496984,
"grad_norm": 2.4138309955596924,
"learning_rate": 9.999945116551056e-06,
"loss": 0.6489,
"step": 153
},
{
"epoch": 0.30985915492957744,
"grad_norm": 2.449880599975586,
"learning_rate": 9.999876512522269e-06,
"loss": 0.6062,
"step": 154
},
{
"epoch": 0.3118712273641851,
"grad_norm": 2.315631866455078,
"learning_rate": 9.9997804674091e-06,
"loss": 0.5896,
"step": 155
},
{
"epoch": 0.31388329979879276,
"grad_norm": 2.8500030040740967,
"learning_rate": 9.999656981738679e-06,
"loss": 0.6558,
"step": 156
},
{
"epoch": 0.3158953722334004,
"grad_norm": 2.3704442977905273,
"learning_rate": 9.999506056188736e-06,
"loss": 0.6401,
"step": 157
},
{
"epoch": 0.317907444668008,
"grad_norm": 2.323148488998413,
"learning_rate": 9.999327691587609e-06,
"loss": 0.6136,
"step": 158
},
{
"epoch": 0.3199195171026157,
"grad_norm": 2.4699552059173584,
"learning_rate": 9.99912188891422e-06,
"loss": 0.6254,
"step": 159
},
{
"epoch": 0.32193158953722334,
"grad_norm": 2.135148525238037,
"learning_rate": 9.99888864929809e-06,
"loss": 0.6152,
"step": 160
},
{
"epoch": 0.323943661971831,
"grad_norm": 2.483369827270508,
"learning_rate": 9.998627974019322e-06,
"loss": 0.5779,
"step": 161
},
{
"epoch": 0.32595573440643866,
"grad_norm": 2.4658560752868652,
"learning_rate": 9.99833986450859e-06,
"loss": 0.6304,
"step": 162
},
{
"epoch": 0.32796780684104626,
"grad_norm": 2.4425296783447266,
"learning_rate": 9.99802432234714e-06,
"loss": 0.6123,
"step": 163
},
{
"epoch": 0.3299798792756539,
"grad_norm": 2.291313648223877,
"learning_rate": 9.997681349266782e-06,
"loss": 0.6,
"step": 164
},
{
"epoch": 0.3319919517102616,
"grad_norm": 2.4326376914978027,
"learning_rate": 9.997310947149872e-06,
"loss": 0.5919,
"step": 165
},
{
"epoch": 0.33400402414486924,
"grad_norm": 2.344089984893799,
"learning_rate": 9.996913118029306e-06,
"loss": 0.6441,
"step": 166
},
{
"epoch": 0.33601609657947684,
"grad_norm": 2.360903024673462,
"learning_rate": 9.996487864088512e-06,
"loss": 0.6209,
"step": 167
},
{
"epoch": 0.3380281690140845,
"grad_norm": 2.3726160526275635,
"learning_rate": 9.996035187661433e-06,
"loss": 0.5881,
"step": 168
},
{
"epoch": 0.34004024144869216,
"grad_norm": 2.3640172481536865,
"learning_rate": 9.995555091232516e-06,
"loss": 0.6319,
"step": 169
},
{
"epoch": 0.3420523138832998,
"grad_norm": 2.4159719944000244,
"learning_rate": 9.9950475774367e-06,
"loss": 0.6403,
"step": 170
},
{
"epoch": 0.3440643863179074,
"grad_norm": 2.472817897796631,
"learning_rate": 9.994512649059401e-06,
"loss": 0.592,
"step": 171
},
{
"epoch": 0.3460764587525151,
"grad_norm": 2.400581121444702,
"learning_rate": 9.99395030903649e-06,
"loss": 0.6134,
"step": 172
},
{
"epoch": 0.34808853118712274,
"grad_norm": 2.537536382675171,
"learning_rate": 9.993360560454293e-06,
"loss": 0.6273,
"step": 173
},
{
"epoch": 0.3501006036217304,
"grad_norm": 2.3792333602905273,
"learning_rate": 9.992743406549556e-06,
"loss": 0.6072,
"step": 174
},
{
"epoch": 0.352112676056338,
"grad_norm": 2.3866188526153564,
"learning_rate": 9.992098850709434e-06,
"loss": 0.6412,
"step": 175
},
{
"epoch": 0.35412474849094566,
"grad_norm": 2.3259477615356445,
"learning_rate": 9.99142689647148e-06,
"loss": 0.6485,
"step": 176
},
{
"epoch": 0.3561368209255533,
"grad_norm": 2.6365954875946045,
"learning_rate": 9.990727547523616e-06,
"loss": 0.6391,
"step": 177
},
{
"epoch": 0.358148893360161,
"grad_norm": 2.523137331008911,
"learning_rate": 9.990000807704114e-06,
"loss": 0.6138,
"step": 178
},
{
"epoch": 0.36016096579476864,
"grad_norm": 2.461246967315674,
"learning_rate": 9.989246681001577e-06,
"loss": 0.6051,
"step": 179
},
{
"epoch": 0.36217303822937624,
"grad_norm": 2.1713852882385254,
"learning_rate": 9.988465171554921e-06,
"loss": 0.5876,
"step": 180
},
{
"epoch": 0.3641851106639839,
"grad_norm": 2.6124799251556396,
"learning_rate": 9.987656283653344e-06,
"loss": 0.6481,
"step": 181
},
{
"epoch": 0.36619718309859156,
"grad_norm": 2.4890964031219482,
"learning_rate": 9.986820021736306e-06,
"loss": 0.6064,
"step": 182
},
{
"epoch": 0.3682092555331992,
"grad_norm": 2.3688108921051025,
"learning_rate": 9.985956390393511e-06,
"loss": 0.6046,
"step": 183
},
{
"epoch": 0.3702213279678068,
"grad_norm": 2.48551344871521,
"learning_rate": 9.985065394364869e-06,
"loss": 0.6489,
"step": 184
},
{
"epoch": 0.3722334004024145,
"grad_norm": 2.2956740856170654,
"learning_rate": 9.984147038540482e-06,
"loss": 0.6279,
"step": 185
},
{
"epoch": 0.37424547283702214,
"grad_norm": 2.439910411834717,
"learning_rate": 9.983201327960607e-06,
"loss": 0.6137,
"step": 186
},
{
"epoch": 0.3762575452716298,
"grad_norm": 2.5276641845703125,
"learning_rate": 9.982228267815644e-06,
"loss": 0.6155,
"step": 187
},
{
"epoch": 0.3782696177062374,
"grad_norm": 2.286393165588379,
"learning_rate": 9.981227863446082e-06,
"loss": 0.5831,
"step": 188
},
{
"epoch": 0.38028169014084506,
"grad_norm": 2.3950138092041016,
"learning_rate": 9.980200120342499e-06,
"loss": 0.6125,
"step": 189
},
{
"epoch": 0.3822937625754527,
"grad_norm": 2.266212224960327,
"learning_rate": 9.979145044145506e-06,
"loss": 0.6074,
"step": 190
},
{
"epoch": 0.3843058350100604,
"grad_norm": 2.353178024291992,
"learning_rate": 9.978062640645737e-06,
"loss": 0.6534,
"step": 191
},
{
"epoch": 0.386317907444668,
"grad_norm": 2.330751657485962,
"learning_rate": 9.976952915783804e-06,
"loss": 0.6001,
"step": 192
},
{
"epoch": 0.38832997987927564,
"grad_norm": 2.3842129707336426,
"learning_rate": 9.975815875650265e-06,
"loss": 0.5859,
"step": 193
},
{
"epoch": 0.3903420523138833,
"grad_norm": 2.396599292755127,
"learning_rate": 9.9746515264856e-06,
"loss": 0.6097,
"step": 194
},
{
"epoch": 0.39235412474849096,
"grad_norm": 2.323148250579834,
"learning_rate": 9.973459874680167e-06,
"loss": 0.6033,
"step": 195
},
{
"epoch": 0.39436619718309857,
"grad_norm": 2.5869362354278564,
"learning_rate": 9.972240926774167e-06,
"loss": 0.5759,
"step": 196
},
{
"epoch": 0.3963782696177062,
"grad_norm": 2.671992540359497,
"learning_rate": 9.970994689457623e-06,
"loss": 0.5771,
"step": 197
},
{
"epoch": 0.3983903420523139,
"grad_norm": 2.518411159515381,
"learning_rate": 9.969721169570319e-06,
"loss": 0.6368,
"step": 198
},
{
"epoch": 0.40040241448692154,
"grad_norm": 2.2937800884246826,
"learning_rate": 9.968420374101782e-06,
"loss": 0.6316,
"step": 199
},
{
"epoch": 0.4024144869215292,
"grad_norm": 2.3399505615234375,
"learning_rate": 9.967092310191237e-06,
"loss": 0.602,
"step": 200
},
{
"epoch": 0.4044265593561368,
"grad_norm": 2.2086527347564697,
"learning_rate": 9.965736985127568e-06,
"loss": 0.5983,
"step": 201
},
{
"epoch": 0.40643863179074446,
"grad_norm": 2.1889898777008057,
"learning_rate": 9.964354406349272e-06,
"loss": 0.6179,
"step": 202
},
{
"epoch": 0.4084507042253521,
"grad_norm": 2.510423421859741,
"learning_rate": 9.962944581444433e-06,
"loss": 0.607,
"step": 203
},
{
"epoch": 0.4104627766599598,
"grad_norm": 2.23941707611084,
"learning_rate": 9.961507518150666e-06,
"loss": 0.6072,
"step": 204
},
{
"epoch": 0.4124748490945674,
"grad_norm": 2.304394483566284,
"learning_rate": 9.960043224355081e-06,
"loss": 0.56,
"step": 205
},
{
"epoch": 0.41448692152917505,
"grad_norm": 2.3394389152526855,
"learning_rate": 9.958551708094237e-06,
"loss": 0.5895,
"step": 206
},
{
"epoch": 0.4164989939637827,
"grad_norm": 2.3602826595306396,
"learning_rate": 9.9570329775541e-06,
"loss": 0.6176,
"step": 207
},
{
"epoch": 0.41851106639839036,
"grad_norm": 2.3634259700775146,
"learning_rate": 9.955487041070003e-06,
"loss": 0.6063,
"step": 208
},
{
"epoch": 0.42052313883299797,
"grad_norm": 2.2722222805023193,
"learning_rate": 9.953913907126584e-06,
"loss": 0.5682,
"step": 209
},
{
"epoch": 0.4225352112676056,
"grad_norm": 2.2542028427124023,
"learning_rate": 9.952313584357763e-06,
"loss": 0.6419,
"step": 210
},
{
"epoch": 0.4245472837022133,
"grad_norm": 2.452004909515381,
"learning_rate": 9.95068608154667e-06,
"loss": 0.5832,
"step": 211
},
{
"epoch": 0.42655935613682094,
"grad_norm": 2.302015542984009,
"learning_rate": 9.949031407625616e-06,
"loss": 0.6485,
"step": 212
},
{
"epoch": 0.42857142857142855,
"grad_norm": 2.347144842147827,
"learning_rate": 9.947349571676037e-06,
"loss": 0.6086,
"step": 213
},
{
"epoch": 0.4305835010060362,
"grad_norm": 2.3000502586364746,
"learning_rate": 9.945640582928438e-06,
"loss": 0.5909,
"step": 214
},
{
"epoch": 0.43259557344064387,
"grad_norm": 2.3875091075897217,
"learning_rate": 9.943904450762351e-06,
"loss": 0.6138,
"step": 215
},
{
"epoch": 0.4346076458752515,
"grad_norm": 2.4650800228118896,
"learning_rate": 9.942141184706286e-06,
"loss": 0.5952,
"step": 216
},
{
"epoch": 0.43661971830985913,
"grad_norm": 2.5489614009857178,
"learning_rate": 9.940350794437663e-06,
"loss": 0.6147,
"step": 217
},
{
"epoch": 0.4386317907444668,
"grad_norm": 2.565382957458496,
"learning_rate": 9.938533289782778e-06,
"loss": 0.6167,
"step": 218
},
{
"epoch": 0.44064386317907445,
"grad_norm": 2.7737607955932617,
"learning_rate": 9.936688680716737e-06,
"loss": 0.6235,
"step": 219
},
{
"epoch": 0.4426559356136821,
"grad_norm": 2.4090120792388916,
"learning_rate": 9.934816977363404e-06,
"loss": 0.6042,
"step": 220
},
{
"epoch": 0.44466800804828976,
"grad_norm": 2.378023147583008,
"learning_rate": 9.932918189995345e-06,
"loss": 0.5705,
"step": 221
},
{
"epoch": 0.44668008048289737,
"grad_norm": 2.137075424194336,
"learning_rate": 9.930992329033777e-06,
"loss": 0.5828,
"step": 222
},
{
"epoch": 0.448692152917505,
"grad_norm": 2.5460665225982666,
"learning_rate": 9.929039405048502e-06,
"loss": 0.6181,
"step": 223
},
{
"epoch": 0.4507042253521127,
"grad_norm": 2.4987435340881348,
"learning_rate": 9.927059428757857e-06,
"loss": 0.5926,
"step": 224
},
{
"epoch": 0.45271629778672035,
"grad_norm": 2.2808916568756104,
"learning_rate": 9.925052411028646e-06,
"loss": 0.5412,
"step": 225
},
{
"epoch": 0.45472837022132795,
"grad_norm": 2.6912238597869873,
"learning_rate": 9.923018362876093e-06,
"loss": 0.6123,
"step": 226
},
{
"epoch": 0.4567404426559356,
"grad_norm": 2.254833459854126,
"learning_rate": 9.920957295463772e-06,
"loss": 0.5876,
"step": 227
},
{
"epoch": 0.45875251509054327,
"grad_norm": 2.572598695755005,
"learning_rate": 9.918869220103542e-06,
"loss": 0.6207,
"step": 228
},
{
"epoch": 0.4607645875251509,
"grad_norm": 2.448763608932495,
"learning_rate": 9.916754148255501e-06,
"loss": 0.6001,
"step": 229
},
{
"epoch": 0.46277665995975853,
"grad_norm": 2.2648446559906006,
"learning_rate": 9.914612091527908e-06,
"loss": 0.6357,
"step": 230
},
{
"epoch": 0.4647887323943662,
"grad_norm": 2.6665232181549072,
"learning_rate": 9.912443061677125e-06,
"loss": 0.5835,
"step": 231
},
{
"epoch": 0.46680080482897385,
"grad_norm": 2.5652852058410645,
"learning_rate": 9.91024707060755e-06,
"loss": 0.5807,
"step": 232
},
{
"epoch": 0.4688128772635815,
"grad_norm": 2.583991289138794,
"learning_rate": 9.90802413037156e-06,
"loss": 0.5885,
"step": 233
},
{
"epoch": 0.4708249496981891,
"grad_norm": 2.55000638961792,
"learning_rate": 9.905774253169433e-06,
"loss": 0.5982,
"step": 234
},
{
"epoch": 0.47283702213279677,
"grad_norm": 2.4194183349609375,
"learning_rate": 9.903497451349286e-06,
"loss": 0.5808,
"step": 235
},
{
"epoch": 0.47484909456740443,
"grad_norm": 2.1875178813934326,
"learning_rate": 9.901193737407011e-06,
"loss": 0.6064,
"step": 236
},
{
"epoch": 0.4768611670020121,
"grad_norm": 2.475219488143921,
"learning_rate": 9.898863123986203e-06,
"loss": 0.6258,
"step": 237
},
{
"epoch": 0.4788732394366197,
"grad_norm": 2.3584680557250977,
"learning_rate": 9.896505623878088e-06,
"loss": 0.5774,
"step": 238
},
{
"epoch": 0.48088531187122735,
"grad_norm": 2.2012126445770264,
"learning_rate": 9.89412125002146e-06,
"loss": 0.5675,
"step": 239
},
{
"epoch": 0.482897384305835,
"grad_norm": 2.501800537109375,
"learning_rate": 9.8917100155026e-06,
"loss": 0.5434,
"step": 240
},
{
"epoch": 0.48490945674044267,
"grad_norm": 2.6421823501586914,
"learning_rate": 9.889271933555214e-06,
"loss": 0.6171,
"step": 241
},
{
"epoch": 0.4869215291750503,
"grad_norm": 2.3850109577178955,
"learning_rate": 9.886807017560356e-06,
"loss": 0.5794,
"step": 242
},
{
"epoch": 0.48893360160965793,
"grad_norm": 2.1486642360687256,
"learning_rate": 9.884315281046352e-06,
"loss": 0.5469,
"step": 243
},
{
"epoch": 0.4909456740442656,
"grad_norm": 2.2797324657440186,
"learning_rate": 9.881796737688732e-06,
"loss": 0.5792,
"step": 244
},
{
"epoch": 0.49295774647887325,
"grad_norm": 2.6412172317504883,
"learning_rate": 9.879251401310148e-06,
"loss": 0.6719,
"step": 245
},
{
"epoch": 0.4949698189134809,
"grad_norm": 2.5547709465026855,
"learning_rate": 9.876679285880304e-06,
"loss": 0.6059,
"step": 246
},
{
"epoch": 0.4969818913480885,
"grad_norm": 2.522813320159912,
"learning_rate": 9.874080405515874e-06,
"loss": 0.6261,
"step": 247
},
{
"epoch": 0.49899396378269617,
"grad_norm": 2.457695960998535,
"learning_rate": 9.871454774480433e-06,
"loss": 0.6169,
"step": 248
},
{
"epoch": 0.5010060362173038,
"grad_norm": 2.424248456954956,
"learning_rate": 9.868802407184367e-06,
"loss": 0.5937,
"step": 249
},
{
"epoch": 0.5030181086519114,
"grad_norm": 2.377311944961548,
"learning_rate": 9.866123318184803e-06,
"loss": 0.5936,
"step": 250
},
{
"epoch": 0.5050301810865191,
"grad_norm": 2.4924814701080322,
"learning_rate": 9.863417522185525e-06,
"loss": 0.5936,
"step": 251
},
{
"epoch": 0.5070422535211268,
"grad_norm": 2.4591898918151855,
"learning_rate": 9.860685034036897e-06,
"loss": 0.611,
"step": 252
},
{
"epoch": 0.5090543259557344,
"grad_norm": 2.0778732299804688,
"learning_rate": 9.857925868735774e-06,
"loss": 0.5892,
"step": 253
},
{
"epoch": 0.5110663983903421,
"grad_norm": 2.6331255435943604,
"learning_rate": 9.855140041425428e-06,
"loss": 0.6307,
"step": 254
},
{
"epoch": 0.5130784708249497,
"grad_norm": 2.3365707397460938,
"learning_rate": 9.852327567395463e-06,
"loss": 0.5972,
"step": 255
},
{
"epoch": 0.5150905432595574,
"grad_norm": 2.3462042808532715,
"learning_rate": 9.84948846208173e-06,
"loss": 0.6209,
"step": 256
},
{
"epoch": 0.5171026156941649,
"grad_norm": 2.4540977478027344,
"learning_rate": 9.846622741066232e-06,
"loss": 0.6274,
"step": 257
},
{
"epoch": 0.5191146881287726,
"grad_norm": 2.2751944065093994,
"learning_rate": 9.843730420077061e-06,
"loss": 0.6026,
"step": 258
},
{
"epoch": 0.5211267605633803,
"grad_norm": 2.3845276832580566,
"learning_rate": 9.840811514988294e-06,
"loss": 0.6102,
"step": 259
},
{
"epoch": 0.5231388329979879,
"grad_norm": 2.46744966506958,
"learning_rate": 9.83786604181991e-06,
"loss": 0.6107,
"step": 260
},
{
"epoch": 0.5251509054325956,
"grad_norm": 2.3754844665527344,
"learning_rate": 9.834894016737705e-06,
"loss": 0.5667,
"step": 261
},
{
"epoch": 0.5271629778672032,
"grad_norm": 2.2515339851379395,
"learning_rate": 9.831895456053197e-06,
"loss": 0.6242,
"step": 262
},
{
"epoch": 0.5291750503018109,
"grad_norm": 2.5688209533691406,
"learning_rate": 9.828870376223546e-06,
"loss": 0.5932,
"step": 263
},
{
"epoch": 0.5311871227364185,
"grad_norm": 2.4301135540008545,
"learning_rate": 9.825818793851456e-06,
"loss": 0.616,
"step": 264
},
{
"epoch": 0.5331991951710262,
"grad_norm": 2.3733065128326416,
"learning_rate": 9.822740725685087e-06,
"loss": 0.5439,
"step": 265
},
{
"epoch": 0.5352112676056338,
"grad_norm": 2.2880313396453857,
"learning_rate": 9.819636188617961e-06,
"loss": 0.592,
"step": 266
},
{
"epoch": 0.5372233400402414,
"grad_norm": 2.3435163497924805,
"learning_rate": 9.81650519968887e-06,
"loss": 0.5901,
"step": 267
},
{
"epoch": 0.5392354124748491,
"grad_norm": 2.272224187850952,
"learning_rate": 9.81334777608179e-06,
"loss": 0.5828,
"step": 268
},
{
"epoch": 0.5412474849094567,
"grad_norm": 2.3393688201904297,
"learning_rate": 9.810163935125768e-06,
"loss": 0.6356,
"step": 269
},
{
"epoch": 0.5432595573440644,
"grad_norm": 2.552076578140259,
"learning_rate": 9.806953694294849e-06,
"loss": 0.5929,
"step": 270
},
{
"epoch": 0.545271629778672,
"grad_norm": 2.2416205406188965,
"learning_rate": 9.803717071207965e-06,
"loss": 0.611,
"step": 271
},
{
"epoch": 0.5472837022132797,
"grad_norm": 2.4686906337738037,
"learning_rate": 9.800454083628845e-06,
"loss": 0.6189,
"step": 272
},
{
"epoch": 0.5492957746478874,
"grad_norm": 2.3410513401031494,
"learning_rate": 9.797164749465915e-06,
"loss": 0.6166,
"step": 273
},
{
"epoch": 0.5513078470824949,
"grad_norm": 2.240062713623047,
"learning_rate": 9.793849086772198e-06,
"loss": 0.6308,
"step": 274
},
{
"epoch": 0.5533199195171026,
"grad_norm": 1.9945600032806396,
"learning_rate": 9.790507113745222e-06,
"loss": 0.5439,
"step": 275
},
{
"epoch": 0.5553319919517102,
"grad_norm": 2.338785409927368,
"learning_rate": 9.787138848726912e-06,
"loss": 0.5622,
"step": 276
},
{
"epoch": 0.5573440643863179,
"grad_norm": 2.2766361236572266,
"learning_rate": 9.783744310203492e-06,
"loss": 0.6072,
"step": 277
},
{
"epoch": 0.5593561368209256,
"grad_norm": 2.300431489944458,
"learning_rate": 9.780323516805386e-06,
"loss": 0.5599,
"step": 278
},
{
"epoch": 0.5613682092555332,
"grad_norm": 2.407275676727295,
"learning_rate": 9.776876487307115e-06,
"loss": 0.5904,
"step": 279
},
{
"epoch": 0.5633802816901409,
"grad_norm": 2.198110342025757,
"learning_rate": 9.77340324062719e-06,
"loss": 0.5778,
"step": 280
},
{
"epoch": 0.5653923541247485,
"grad_norm": 2.2225024700164795,
"learning_rate": 9.769903795828016e-06,
"loss": 0.5909,
"step": 281
},
{
"epoch": 0.5674044265593562,
"grad_norm": 2.542376756668091,
"learning_rate": 9.766378172115775e-06,
"loss": 0.5907,
"step": 282
},
{
"epoch": 0.5694164989939637,
"grad_norm": 2.7032480239868164,
"learning_rate": 9.76282638884034e-06,
"loss": 0.6323,
"step": 283
},
{
"epoch": 0.5714285714285714,
"grad_norm": 2.1654727458953857,
"learning_rate": 9.75924846549514e-06,
"loss": 0.582,
"step": 284
},
{
"epoch": 0.5734406438631791,
"grad_norm": 2.6309971809387207,
"learning_rate": 9.755644421717083e-06,
"loss": 0.5845,
"step": 285
},
{
"epoch": 0.5754527162977867,
"grad_norm": 2.3880655765533447,
"learning_rate": 9.752014277286433e-06,
"loss": 0.5996,
"step": 286
},
{
"epoch": 0.5774647887323944,
"grad_norm": 2.295335054397583,
"learning_rate": 9.7483580521267e-06,
"loss": 0.5756,
"step": 287
},
{
"epoch": 0.579476861167002,
"grad_norm": 2.394613742828369,
"learning_rate": 9.744675766304538e-06,
"loss": 0.6449,
"step": 288
},
{
"epoch": 0.5814889336016097,
"grad_norm": 2.123344898223877,
"learning_rate": 9.740967440029628e-06,
"loss": 0.5853,
"step": 289
},
{
"epoch": 0.5835010060362174,
"grad_norm": 2.532409191131592,
"learning_rate": 9.737233093654572e-06,
"loss": 0.6253,
"step": 290
},
{
"epoch": 0.5855130784708249,
"grad_norm": 2.29142689704895,
"learning_rate": 9.733472747674779e-06,
"loss": 0.6021,
"step": 291
},
{
"epoch": 0.5875251509054326,
"grad_norm": 2.3614251613616943,
"learning_rate": 9.729686422728353e-06,
"loss": 0.5971,
"step": 292
},
{
"epoch": 0.5895372233400402,
"grad_norm": 2.237323522567749,
"learning_rate": 9.725874139595978e-06,
"loss": 0.5917,
"step": 293
},
{
"epoch": 0.5915492957746479,
"grad_norm": 2.2730329036712646,
"learning_rate": 9.722035919200812e-06,
"loss": 0.6119,
"step": 294
},
{
"epoch": 0.5935613682092555,
"grad_norm": 1.9858745336532593,
"learning_rate": 9.718171782608355e-06,
"loss": 0.5934,
"step": 295
},
{
"epoch": 0.5955734406438632,
"grad_norm": 2.39754581451416,
"learning_rate": 9.714281751026356e-06,
"loss": 0.5964,
"step": 296
},
{
"epoch": 0.5975855130784709,
"grad_norm": 2.244943857192993,
"learning_rate": 9.710365845804675e-06,
"loss": 0.5375,
"step": 297
},
{
"epoch": 0.5995975855130785,
"grad_norm": 2.48502254486084,
"learning_rate": 9.706424088435183e-06,
"loss": 0.6355,
"step": 298
},
{
"epoch": 0.6016096579476862,
"grad_norm": 2.49822735786438,
"learning_rate": 9.702456500551632e-06,
"loss": 0.5974,
"step": 299
},
{
"epoch": 0.6036217303822937,
"grad_norm": 2.8203203678131104,
"learning_rate": 9.698463103929542e-06,
"loss": 0.5476,
"step": 300
},
{
"epoch": 0.6056338028169014,
"grad_norm": 2.2995455265045166,
"learning_rate": 9.694443920486083e-06,
"loss": 0.5746,
"step": 301
},
{
"epoch": 0.607645875251509,
"grad_norm": 2.167100429534912,
"learning_rate": 9.690398972279949e-06,
"loss": 0.5653,
"step": 302
},
{
"epoch": 0.6096579476861167,
"grad_norm": 2.3564436435699463,
"learning_rate": 9.686328281511241e-06,
"loss": 0.5767,
"step": 303
},
{
"epoch": 0.6116700201207244,
"grad_norm": 2.2094357013702393,
"learning_rate": 9.682231870521347e-06,
"loss": 0.6548,
"step": 304
},
{
"epoch": 0.613682092555332,
"grad_norm": 2.4704389572143555,
"learning_rate": 9.67810976179281e-06,
"loss": 0.5766,
"step": 305
},
{
"epoch": 0.6156941649899397,
"grad_norm": 2.2543351650238037,
"learning_rate": 9.673961977949219e-06,
"loss": 0.6256,
"step": 306
},
{
"epoch": 0.6177062374245473,
"grad_norm": 2.216660737991333,
"learning_rate": 9.669788541755072e-06,
"loss": 0.5912,
"step": 307
},
{
"epoch": 0.6197183098591549,
"grad_norm": 2.1589713096618652,
"learning_rate": 9.665589476115657e-06,
"loss": 0.5898,
"step": 308
},
{
"epoch": 0.6217303822937625,
"grad_norm": 2.309406042098999,
"learning_rate": 9.661364804076927e-06,
"loss": 0.6137,
"step": 309
},
{
"epoch": 0.6237424547283702,
"grad_norm": 2.363293409347534,
"learning_rate": 9.657114548825372e-06,
"loss": 0.6052,
"step": 310
},
{
"epoch": 0.6257545271629779,
"grad_norm": 2.509986162185669,
"learning_rate": 9.652838733687888e-06,
"loss": 0.5869,
"step": 311
},
{
"epoch": 0.6277665995975855,
"grad_norm": 2.23989200592041,
"learning_rate": 9.648537382131659e-06,
"loss": 0.5552,
"step": 312
},
{
"epoch": 0.6297786720321932,
"grad_norm": 2.3576748371124268,
"learning_rate": 9.644210517764014e-06,
"loss": 0.5931,
"step": 313
},
{
"epoch": 0.6317907444668008,
"grad_norm": 2.3847086429595947,
"learning_rate": 9.639858164332314e-06,
"loss": 0.5895,
"step": 314
},
{
"epoch": 0.6338028169014085,
"grad_norm": 2.3280272483825684,
"learning_rate": 9.635480345723805e-06,
"loss": 0.566,
"step": 315
},
{
"epoch": 0.635814889336016,
"grad_norm": 2.1712679862976074,
"learning_rate": 9.631077085965501e-06,
"loss": 0.6073,
"step": 316
},
{
"epoch": 0.6378269617706237,
"grad_norm": 2.112177610397339,
"learning_rate": 9.626648409224041e-06,
"loss": 0.5855,
"step": 317
},
{
"epoch": 0.6398390342052314,
"grad_norm": 2.2860617637634277,
"learning_rate": 9.622194339805565e-06,
"loss": 0.6272,
"step": 318
},
{
"epoch": 0.641851106639839,
"grad_norm": 2.424269437789917,
"learning_rate": 9.617714902155576e-06,
"loss": 0.6146,
"step": 319
},
{
"epoch": 0.6438631790744467,
"grad_norm": 2.1368589401245117,
"learning_rate": 9.613210120858805e-06,
"loss": 0.6157,
"step": 320
},
{
"epoch": 0.6458752515090543,
"grad_norm": 2.490374803543091,
"learning_rate": 9.608680020639081e-06,
"loss": 0.6139,
"step": 321
},
{
"epoch": 0.647887323943662,
"grad_norm": 2.3180062770843506,
"learning_rate": 9.60412462635919e-06,
"loss": 0.6013,
"step": 322
},
{
"epoch": 0.6498993963782697,
"grad_norm": 2.4402894973754883,
"learning_rate": 9.599543963020741e-06,
"loss": 0.6116,
"step": 323
},
{
"epoch": 0.6519114688128773,
"grad_norm": 2.1556742191314697,
"learning_rate": 9.594938055764029e-06,
"loss": 0.5712,
"step": 324
},
{
"epoch": 0.6539235412474849,
"grad_norm": 2.4181032180786133,
"learning_rate": 9.590306929867896e-06,
"loss": 0.6334,
"step": 325
},
{
"epoch": 0.6559356136820925,
"grad_norm": 2.138808250427246,
"learning_rate": 9.585650610749593e-06,
"loss": 0.6156,
"step": 326
},
{
"epoch": 0.6579476861167002,
"grad_norm": 2.266510248184204,
"learning_rate": 9.580969123964641e-06,
"loss": 0.5878,
"step": 327
},
{
"epoch": 0.6599597585513078,
"grad_norm": 2.302675485610962,
"learning_rate": 9.576262495206689e-06,
"loss": 0.5439,
"step": 328
},
{
"epoch": 0.6619718309859155,
"grad_norm": 2.0406110286712646,
"learning_rate": 9.571530750307374e-06,
"loss": 0.5612,
"step": 329
},
{
"epoch": 0.6639839034205232,
"grad_norm": 2.294686794281006,
"learning_rate": 9.56677391523618e-06,
"loss": 0.6101,
"step": 330
},
{
"epoch": 0.6659959758551308,
"grad_norm": 2.3122353553771973,
"learning_rate": 9.561992016100293e-06,
"loss": 0.5615,
"step": 331
},
{
"epoch": 0.6680080482897385,
"grad_norm": 2.389636754989624,
"learning_rate": 9.557185079144463e-06,
"loss": 0.5509,
"step": 332
},
{
"epoch": 0.670020120724346,
"grad_norm": 2.0459258556365967,
"learning_rate": 9.552353130750852e-06,
"loss": 0.5769,
"step": 333
},
{
"epoch": 0.6720321931589537,
"grad_norm": 2.269744396209717,
"learning_rate": 9.547496197438896e-06,
"loss": 0.6115,
"step": 334
},
{
"epoch": 0.6740442655935613,
"grad_norm": 2.3433408737182617,
"learning_rate": 9.542614305865158e-06,
"loss": 0.5611,
"step": 335
},
{
"epoch": 0.676056338028169,
"grad_norm": 2.231168270111084,
"learning_rate": 9.53770748282318e-06,
"loss": 0.5728,
"step": 336
},
{
"epoch": 0.6780684104627767,
"grad_norm": 2.2339141368865967,
"learning_rate": 9.532775755243334e-06,
"loss": 0.5661,
"step": 337
},
{
"epoch": 0.6800804828973843,
"grad_norm": 2.384350061416626,
"learning_rate": 9.527819150192681e-06,
"loss": 0.65,
"step": 338
},
{
"epoch": 0.682092555331992,
"grad_norm": 2.395918369293213,
"learning_rate": 9.522837694874814e-06,
"loss": 0.6252,
"step": 339
},
{
"epoch": 0.6841046277665996,
"grad_norm": 2.5676770210266113,
"learning_rate": 9.517831416629717e-06,
"loss": 0.5988,
"step": 340
},
{
"epoch": 0.6861167002012073,
"grad_norm": 2.204547166824341,
"learning_rate": 9.512800342933608e-06,
"loss": 0.5708,
"step": 341
},
{
"epoch": 0.6881287726358148,
"grad_norm": 2.5879533290863037,
"learning_rate": 9.507744501398794e-06,
"loss": 0.6058,
"step": 342
},
{
"epoch": 0.6901408450704225,
"grad_norm": 2.14680814743042,
"learning_rate": 9.502663919773516e-06,
"loss": 0.5931,
"step": 343
},
{
"epoch": 0.6921529175050302,
"grad_norm": 2.6863231658935547,
"learning_rate": 9.497558625941794e-06,
"loss": 0.6241,
"step": 344
},
{
"epoch": 0.6941649899396378,
"grad_norm": 2.3913419246673584,
"learning_rate": 9.492428647923281e-06,
"loss": 0.591,
"step": 345
},
{
"epoch": 0.6961770623742455,
"grad_norm": 2.2461934089660645,
"learning_rate": 9.487274013873104e-06,
"loss": 0.5122,
"step": 346
},
{
"epoch": 0.6981891348088531,
"grad_norm": 2.091630697250366,
"learning_rate": 9.482094752081711e-06,
"loss": 0.5787,
"step": 347
},
{
"epoch": 0.7002012072434608,
"grad_norm": 2.184694290161133,
"learning_rate": 9.47689089097472e-06,
"loss": 0.5839,
"step": 348
},
{
"epoch": 0.7022132796780685,
"grad_norm": 2.175163984298706,
"learning_rate": 9.471662459112747e-06,
"loss": 0.5782,
"step": 349
},
{
"epoch": 0.704225352112676,
"grad_norm": 2.343888521194458,
"learning_rate": 9.466409485191275e-06,
"loss": 0.5909,
"step": 350
},
{
"epoch": 0.7062374245472837,
"grad_norm": 2.230376958847046,
"learning_rate": 9.461131998040473e-06,
"loss": 0.5791,
"step": 351
},
{
"epoch": 0.7082494969818913,
"grad_norm": 2.165900707244873,
"learning_rate": 9.455830026625053e-06,
"loss": 0.5488,
"step": 352
},
{
"epoch": 0.710261569416499,
"grad_norm": 2.151834487915039,
"learning_rate": 9.450503600044102e-06,
"loss": 0.5556,
"step": 353
},
{
"epoch": 0.7122736418511066,
"grad_norm": 2.06109619140625,
"learning_rate": 9.445152747530922e-06,
"loss": 0.5415,
"step": 354
},
{
"epoch": 0.7142857142857143,
"grad_norm": 2.3980467319488525,
"learning_rate": 9.439777498452883e-06,
"loss": 0.5612,
"step": 355
},
{
"epoch": 0.716297786720322,
"grad_norm": 2.2379043102264404,
"learning_rate": 9.434377882311244e-06,
"loss": 0.6302,
"step": 356
},
{
"epoch": 0.7183098591549296,
"grad_norm": 2.3387439250946045,
"learning_rate": 9.428953928741002e-06,
"loss": 0.6122,
"step": 357
},
{
"epoch": 0.7203219315895373,
"grad_norm": 2.465433359146118,
"learning_rate": 9.423505667510724e-06,
"loss": 0.5993,
"step": 358
},
{
"epoch": 0.7223340040241448,
"grad_norm": 2.0558369159698486,
"learning_rate": 9.41803312852239e-06,
"loss": 0.5871,
"step": 359
},
{
"epoch": 0.7243460764587525,
"grad_norm": 2.2847893238067627,
"learning_rate": 9.41253634181122e-06,
"loss": 0.5773,
"step": 360
},
{
"epoch": 0.7263581488933601,
"grad_norm": 2.137911319732666,
"learning_rate": 9.40701533754552e-06,
"loss": 0.6051,
"step": 361
},
{
"epoch": 0.7283702213279678,
"grad_norm": 1.9390573501586914,
"learning_rate": 9.401470146026504e-06,
"loss": 0.5161,
"step": 362
},
{
"epoch": 0.7303822937625755,
"grad_norm": 2.056952714920044,
"learning_rate": 9.39590079768814e-06,
"loss": 0.5652,
"step": 363
},
{
"epoch": 0.7323943661971831,
"grad_norm": 2.426093339920044,
"learning_rate": 9.390307323096972e-06,
"loss": 0.5756,
"step": 364
},
{
"epoch": 0.7344064386317908,
"grad_norm": 2.0200657844543457,
"learning_rate": 9.384689752951961e-06,
"loss": 0.5601,
"step": 365
},
{
"epoch": 0.7364185110663984,
"grad_norm": 1.9603573083877563,
"learning_rate": 9.379048118084312e-06,
"loss": 0.5586,
"step": 366
},
{
"epoch": 0.738430583501006,
"grad_norm": 2.151219129562378,
"learning_rate": 9.373382449457305e-06,
"loss": 0.5368,
"step": 367
},
{
"epoch": 0.7404426559356136,
"grad_norm": 2.253244161605835,
"learning_rate": 9.367692778166126e-06,
"loss": 0.563,
"step": 368
},
{
"epoch": 0.7424547283702213,
"grad_norm": 2.4091432094573975,
"learning_rate": 9.361979135437697e-06,
"loss": 0.5909,
"step": 369
},
{
"epoch": 0.744466800804829,
"grad_norm": 2.0590202808380127,
"learning_rate": 9.356241552630503e-06,
"loss": 0.5424,
"step": 370
},
{
"epoch": 0.7464788732394366,
"grad_norm": 2.2783074378967285,
"learning_rate": 9.350480061234419e-06,
"loss": 0.6102,
"step": 371
},
{
"epoch": 0.7484909456740443,
"grad_norm": 2.28292179107666,
"learning_rate": 9.344694692870541e-06,
"loss": 0.5819,
"step": 372
},
{
"epoch": 0.7505030181086519,
"grad_norm": 2.3239924907684326,
"learning_rate": 9.338885479291012e-06,
"loss": 0.5518,
"step": 373
},
{
"epoch": 0.7525150905432596,
"grad_norm": 2.2188453674316406,
"learning_rate": 9.333052452378838e-06,
"loss": 0.5808,
"step": 374
},
{
"epoch": 0.7545271629778671,
"grad_norm": 2.173330783843994,
"learning_rate": 9.32719564414773e-06,
"loss": 0.6589,
"step": 375
},
{
"epoch": 0.7565392354124748,
"grad_norm": 2.2564475536346436,
"learning_rate": 9.321315086741916e-06,
"loss": 0.5818,
"step": 376
},
{
"epoch": 0.7585513078470825,
"grad_norm": 2.1442489624023438,
"learning_rate": 9.315410812435967e-06,
"loss": 0.6017,
"step": 377
},
{
"epoch": 0.7605633802816901,
"grad_norm": 2.2043070793151855,
"learning_rate": 9.30948285363462e-06,
"loss": 0.5714,
"step": 378
},
{
"epoch": 0.7625754527162978,
"grad_norm": 2.286181688308716,
"learning_rate": 9.303531242872606e-06,
"loss": 0.577,
"step": 379
},
{
"epoch": 0.7645875251509054,
"grad_norm": 2.649578094482422,
"learning_rate": 9.297556012814457e-06,
"loss": 0.6219,
"step": 380
},
{
"epoch": 0.7665995975855131,
"grad_norm": 2.3334577083587646,
"learning_rate": 9.291557196254342e-06,
"loss": 0.627,
"step": 381
},
{
"epoch": 0.7686116700201208,
"grad_norm": 2.107356548309326,
"learning_rate": 9.285534826115884e-06,
"loss": 0.5891,
"step": 382
},
{
"epoch": 0.7706237424547284,
"grad_norm": 2.133880138397217,
"learning_rate": 9.279488935451971e-06,
"loss": 0.5658,
"step": 383
},
{
"epoch": 0.772635814889336,
"grad_norm": 2.4783966541290283,
"learning_rate": 9.27341955744458e-06,
"loss": 0.5775,
"step": 384
},
{
"epoch": 0.7746478873239436,
"grad_norm": 2.5501651763916016,
"learning_rate": 9.2673267254046e-06,
"loss": 0.5742,
"step": 385
},
{
"epoch": 0.7766599597585513,
"grad_norm": 2.2829442024230957,
"learning_rate": 9.261210472771637e-06,
"loss": 0.5579,
"step": 386
},
{
"epoch": 0.778672032193159,
"grad_norm": 2.3803324699401855,
"learning_rate": 9.255070833113845e-06,
"loss": 0.6267,
"step": 387
},
{
"epoch": 0.7806841046277666,
"grad_norm": 2.5065324306488037,
"learning_rate": 9.248907840127726e-06,
"loss": 0.5967,
"step": 388
},
{
"epoch": 0.7826961770623743,
"grad_norm": 2.320683240890503,
"learning_rate": 9.24272152763796e-06,
"loss": 0.5925,
"step": 389
},
{
"epoch": 0.7847082494969819,
"grad_norm": 2.3530187606811523,
"learning_rate": 9.236511929597206e-06,
"loss": 0.6105,
"step": 390
},
{
"epoch": 0.7867203219315896,
"grad_norm": 2.618340253829956,
"learning_rate": 9.230279080085933e-06,
"loss": 0.5969,
"step": 391
},
{
"epoch": 0.7887323943661971,
"grad_norm": 2.411909580230713,
"learning_rate": 9.224023013312212e-06,
"loss": 0.5556,
"step": 392
},
{
"epoch": 0.7907444668008048,
"grad_norm": 2.401766061782837,
"learning_rate": 9.217743763611545e-06,
"loss": 0.5826,
"step": 393
},
{
"epoch": 0.7927565392354124,
"grad_norm": 2.151867151260376,
"learning_rate": 9.211441365446661e-06,
"loss": 0.598,
"step": 394
},
{
"epoch": 0.7947686116700201,
"grad_norm": 2.0797793865203857,
"learning_rate": 9.20511585340735e-06,
"loss": 0.578,
"step": 395
},
{
"epoch": 0.7967806841046278,
"grad_norm": 2.3202261924743652,
"learning_rate": 9.198767262210244e-06,
"loss": 0.5966,
"step": 396
},
{
"epoch": 0.7987927565392354,
"grad_norm": 2.244210720062256,
"learning_rate": 9.192395626698656e-06,
"loss": 0.5522,
"step": 397
},
{
"epoch": 0.8008048289738431,
"grad_norm": 2.1149046421051025,
"learning_rate": 9.186000981842362e-06,
"loss": 0.5579,
"step": 398
},
{
"epoch": 0.8028169014084507,
"grad_norm": 2.2352654933929443,
"learning_rate": 9.17958336273743e-06,
"loss": 0.5321,
"step": 399
},
{
"epoch": 0.8048289738430584,
"grad_norm": 2.6794004440307617,
"learning_rate": 9.173142804606012e-06,
"loss": 0.5584,
"step": 400
},
{
"epoch": 0.806841046277666,
"grad_norm": 1.9585459232330322,
"learning_rate": 9.166679342796162e-06,
"loss": 0.5313,
"step": 401
},
{
"epoch": 0.8088531187122736,
"grad_norm": 2.3576083183288574,
"learning_rate": 9.160193012781639e-06,
"loss": 0.617,
"step": 402
},
{
"epoch": 0.8108651911468813,
"grad_norm": 2.1958634853363037,
"learning_rate": 9.153683850161706e-06,
"loss": 0.6003,
"step": 403
},
{
"epoch": 0.8128772635814889,
"grad_norm": 2.409407615661621,
"learning_rate": 9.147151890660942e-06,
"loss": 0.5722,
"step": 404
},
{
"epoch": 0.8148893360160966,
"grad_norm": 2.3817203044891357,
"learning_rate": 9.140597170129041e-06,
"loss": 0.6051,
"step": 405
},
{
"epoch": 0.8169014084507042,
"grad_norm": 2.1336405277252197,
"learning_rate": 9.13401972454062e-06,
"loss": 0.5703,
"step": 406
},
{
"epoch": 0.8189134808853119,
"grad_norm": 2.2689437866210938,
"learning_rate": 9.12741958999502e-06,
"loss": 0.5767,
"step": 407
},
{
"epoch": 0.8209255533199196,
"grad_norm": 2.151379346847534,
"learning_rate": 9.120796802716104e-06,
"loss": 0.5539,
"step": 408
},
{
"epoch": 0.8229376257545271,
"grad_norm": 1.9756191968917847,
"learning_rate": 9.114151399052064e-06,
"loss": 0.5481,
"step": 409
},
{
"epoch": 0.8249496981891348,
"grad_norm": 2.424356698989868,
"learning_rate": 9.107483415475216e-06,
"loss": 0.6311,
"step": 410
},
{
"epoch": 0.8269617706237424,
"grad_norm": 2.0553550720214844,
"learning_rate": 9.100792888581803e-06,
"loss": 0.5733,
"step": 411
},
{
"epoch": 0.8289738430583501,
"grad_norm": 2.2919304370880127,
"learning_rate": 9.094079855091797e-06,
"loss": 0.5902,
"step": 412
},
{
"epoch": 0.8309859154929577,
"grad_norm": 2.2795591354370117,
"learning_rate": 9.08734435184869e-06,
"loss": 0.5339,
"step": 413
},
{
"epoch": 0.8329979879275654,
"grad_norm": 2.2266199588775635,
"learning_rate": 9.080586415819296e-06,
"loss": 0.5724,
"step": 414
},
{
"epoch": 0.8350100603621731,
"grad_norm": 2.197139263153076,
"learning_rate": 9.073806084093556e-06,
"loss": 0.5668,
"step": 415
},
{
"epoch": 0.8370221327967807,
"grad_norm": 2.386579751968384,
"learning_rate": 9.067003393884313e-06,
"loss": 0.6091,
"step": 416
},
{
"epoch": 0.8390342052313883,
"grad_norm": 2.1007778644561768,
"learning_rate": 9.06017838252713e-06,
"loss": 0.5447,
"step": 417
},
{
"epoch": 0.8410462776659959,
"grad_norm": 2.3940844535827637,
"learning_rate": 9.053331087480075e-06,
"loss": 0.613,
"step": 418
},
{
"epoch": 0.8430583501006036,
"grad_norm": 2.0589396953582764,
"learning_rate": 9.046461546323519e-06,
"loss": 0.523,
"step": 419
},
{
"epoch": 0.8450704225352113,
"grad_norm": 2.245084047317505,
"learning_rate": 9.039569796759921e-06,
"loss": 0.5571,
"step": 420
},
{
"epoch": 0.8470824949698189,
"grad_norm": 2.283914804458618,
"learning_rate": 9.032655876613636e-06,
"loss": 0.5937,
"step": 421
},
{
"epoch": 0.8490945674044266,
"grad_norm": 2.2793750762939453,
"learning_rate": 9.02571982383069e-06,
"loss": 0.5811,
"step": 422
},
{
"epoch": 0.8511066398390342,
"grad_norm": 2.318835735321045,
"learning_rate": 9.018761676478585e-06,
"loss": 0.5851,
"step": 423
},
{
"epoch": 0.8531187122736419,
"grad_norm": 2.1775121688842773,
"learning_rate": 9.01178147274609e-06,
"loss": 0.5939,
"step": 424
},
{
"epoch": 0.8551307847082495,
"grad_norm": 2.144890308380127,
"learning_rate": 9.00477925094302e-06,
"loss": 0.5606,
"step": 425
},
{
"epoch": 0.8571428571428571,
"grad_norm": 2.165470838546753,
"learning_rate": 8.997755049500037e-06,
"loss": 0.6005,
"step": 426
},
{
"epoch": 0.8591549295774648,
"grad_norm": 2.3857879638671875,
"learning_rate": 8.990708906968431e-06,
"loss": 0.6083,
"step": 427
},
{
"epoch": 0.8611670020120724,
"grad_norm": 2.0287764072418213,
"learning_rate": 8.98364086201992e-06,
"loss": 0.5549,
"step": 428
},
{
"epoch": 0.8631790744466801,
"grad_norm": 2.002955436706543,
"learning_rate": 8.976550953446426e-06,
"loss": 0.5845,
"step": 429
},
{
"epoch": 0.8651911468812877,
"grad_norm": 2.124072551727295,
"learning_rate": 8.969439220159861e-06,
"loss": 0.5631,
"step": 430
},
{
"epoch": 0.8672032193158954,
"grad_norm": 2.3265137672424316,
"learning_rate": 8.962305701191927e-06,
"loss": 0.5627,
"step": 431
},
{
"epoch": 0.869215291750503,
"grad_norm": 2.0870211124420166,
"learning_rate": 8.955150435693889e-06,
"loss": 0.5217,
"step": 432
},
{
"epoch": 0.8712273641851107,
"grad_norm": 2.349735975265503,
"learning_rate": 8.947973462936366e-06,
"loss": 0.5817,
"step": 433
},
{
"epoch": 0.8732394366197183,
"grad_norm": 2.4168457984924316,
"learning_rate": 8.940774822309116e-06,
"loss": 0.5642,
"step": 434
},
{
"epoch": 0.8752515090543259,
"grad_norm": 2.447883367538452,
"learning_rate": 8.933554553320813e-06,
"loss": 0.588,
"step": 435
},
{
"epoch": 0.8772635814889336,
"grad_norm": 2.1601593494415283,
"learning_rate": 8.926312695598837e-06,
"loss": 0.6093,
"step": 436
},
{
"epoch": 0.8792756539235412,
"grad_norm": 2.3886845111846924,
"learning_rate": 8.919049288889058e-06,
"loss": 0.5617,
"step": 437
},
{
"epoch": 0.8812877263581489,
"grad_norm": 2.295163154602051,
"learning_rate": 8.911764373055612e-06,
"loss": 0.5183,
"step": 438
},
{
"epoch": 0.8832997987927566,
"grad_norm": 2.1284451484680176,
"learning_rate": 8.904457988080682e-06,
"loss": 0.5466,
"step": 439
},
{
"epoch": 0.8853118712273642,
"grad_norm": 2.3324074745178223,
"learning_rate": 8.897130174064285e-06,
"loss": 0.5525,
"step": 440
},
{
"epoch": 0.8873239436619719,
"grad_norm": 2.279731035232544,
"learning_rate": 8.889780971224047e-06,
"loss": 0.6048,
"step": 441
},
{
"epoch": 0.8893360160965795,
"grad_norm": 2.274237632751465,
"learning_rate": 8.882410419894983e-06,
"loss": 0.5566,
"step": 442
},
{
"epoch": 0.8913480885311871,
"grad_norm": 2.2570290565490723,
"learning_rate": 8.875018560529275e-06,
"loss": 0.5492,
"step": 443
},
{
"epoch": 0.8933601609657947,
"grad_norm": 2.310661554336548,
"learning_rate": 8.867605433696056e-06,
"loss": 0.5782,
"step": 444
},
{
"epoch": 0.8953722334004024,
"grad_norm": 2.328352212905884,
"learning_rate": 8.860171080081174e-06,
"loss": 0.6308,
"step": 445
},
{
"epoch": 0.89738430583501,
"grad_norm": 2.168409585952759,
"learning_rate": 8.852715540486986e-06,
"loss": 0.5418,
"step": 446
},
{
"epoch": 0.8993963782696177,
"grad_norm": 2.200997829437256,
"learning_rate": 8.845238855832117e-06,
"loss": 0.6063,
"step": 447
},
{
"epoch": 0.9014084507042254,
"grad_norm": 2.295320987701416,
"learning_rate": 8.837741067151251e-06,
"loss": 0.5874,
"step": 448
},
{
"epoch": 0.903420523138833,
"grad_norm": 2.168964385986328,
"learning_rate": 8.83022221559489e-06,
"loss": 0.5664,
"step": 449
},
{
"epoch": 0.9054325955734407,
"grad_norm": 2.1303305625915527,
"learning_rate": 8.822682342429147e-06,
"loss": 0.5336,
"step": 450
},
{
"epoch": 0.9074446680080482,
"grad_norm": 2.489168882369995,
"learning_rate": 8.8151214890355e-06,
"loss": 0.6388,
"step": 451
},
{
"epoch": 0.9094567404426559,
"grad_norm": 2.106583595275879,
"learning_rate": 8.807539696910574e-06,
"loss": 0.5871,
"step": 452
},
{
"epoch": 0.9114688128772636,
"grad_norm": 2.0476789474487305,
"learning_rate": 8.79993700766592e-06,
"loss": 0.5506,
"step": 453
},
{
"epoch": 0.9134808853118712,
"grad_norm": 2.1992383003234863,
"learning_rate": 8.792313463027777e-06,
"loss": 0.5737,
"step": 454
},
{
"epoch": 0.9154929577464789,
"grad_norm": 2.2196712493896484,
"learning_rate": 8.784669104836842e-06,
"loss": 0.5607,
"step": 455
},
{
"epoch": 0.9175050301810865,
"grad_norm": 2.1786701679229736,
"learning_rate": 8.777003975048048e-06,
"loss": 0.5975,
"step": 456
},
{
"epoch": 0.9195171026156942,
"grad_norm": 2.178668260574341,
"learning_rate": 8.76931811573033e-06,
"loss": 0.5721,
"step": 457
},
{
"epoch": 0.9215291750503019,
"grad_norm": 2.1133759021759033,
"learning_rate": 8.761611569066388e-06,
"loss": 0.5687,
"step": 458
},
{
"epoch": 0.9235412474849095,
"grad_norm": 2.18926739692688,
"learning_rate": 8.753884377352472e-06,
"loss": 0.5927,
"step": 459
},
{
"epoch": 0.9255533199195171,
"grad_norm": 2.5471627712249756,
"learning_rate": 8.74613658299813e-06,
"loss": 0.5743,
"step": 460
},
{
"epoch": 0.9275653923541247,
"grad_norm": 2.43631649017334,
"learning_rate": 8.738368228525988e-06,
"loss": 0.6036,
"step": 461
},
{
"epoch": 0.9295774647887324,
"grad_norm": 2.3415586948394775,
"learning_rate": 8.730579356571514e-06,
"loss": 0.5686,
"step": 462
},
{
"epoch": 0.93158953722334,
"grad_norm": 2.2325901985168457,
"learning_rate": 8.72277000988278e-06,
"loss": 0.5742,
"step": 463
},
{
"epoch": 0.9336016096579477,
"grad_norm": 2.3566064834594727,
"learning_rate": 8.714940231320237e-06,
"loss": 0.6196,
"step": 464
},
{
"epoch": 0.9356136820925554,
"grad_norm": 2.469269275665283,
"learning_rate": 8.707090063856466e-06,
"loss": 0.5786,
"step": 465
},
{
"epoch": 0.937625754527163,
"grad_norm": 2.319122552871704,
"learning_rate": 8.699219550575954e-06,
"loss": 0.5886,
"step": 466
},
{
"epoch": 0.9396378269617707,
"grad_norm": 2.2840166091918945,
"learning_rate": 8.691328734674851e-06,
"loss": 0.5377,
"step": 467
},
{
"epoch": 0.9416498993963782,
"grad_norm": 2.2281320095062256,
"learning_rate": 8.683417659460735e-06,
"loss": 0.5468,
"step": 468
},
{
"epoch": 0.9436619718309859,
"grad_norm": 2.3349030017852783,
"learning_rate": 8.675486368352376e-06,
"loss": 0.6274,
"step": 469
},
{
"epoch": 0.9456740442655935,
"grad_norm": 1.9517689943313599,
"learning_rate": 8.667534904879495e-06,
"loss": 0.536,
"step": 470
},
{
"epoch": 0.9476861167002012,
"grad_norm": 2.1241378784179688,
"learning_rate": 8.659563312682524e-06,
"loss": 0.5384,
"step": 471
},
{
"epoch": 0.9496981891348089,
"grad_norm": 2.210144281387329,
"learning_rate": 8.651571635512372e-06,
"loss": 0.5456,
"step": 472
},
{
"epoch": 0.9517102615694165,
"grad_norm": 2.253452777862549,
"learning_rate": 8.64355991723018e-06,
"loss": 0.5862,
"step": 473
},
{
"epoch": 0.9537223340040242,
"grad_norm": 2.534611225128174,
"learning_rate": 8.635528201807079e-06,
"loss": 0.6127,
"step": 474
},
{
"epoch": 0.9557344064386318,
"grad_norm": 2.063807725906372,
"learning_rate": 8.627476533323957e-06,
"loss": 0.5489,
"step": 475
},
{
"epoch": 0.9577464788732394,
"grad_norm": 2.166027784347534,
"learning_rate": 8.619404955971208e-06,
"loss": 0.5602,
"step": 476
},
{
"epoch": 0.959758551307847,
"grad_norm": 2.0670714378356934,
"learning_rate": 8.61131351404849e-06,
"loss": 0.561,
"step": 477
},
{
"epoch": 0.9617706237424547,
"grad_norm": 2.023287057876587,
"learning_rate": 8.603202251964492e-06,
"loss": 0.5245,
"step": 478
},
{
"epoch": 0.9637826961770624,
"grad_norm": 2.208113431930542,
"learning_rate": 8.595071214236675e-06,
"loss": 0.5625,
"step": 479
},
{
"epoch": 0.96579476861167,
"grad_norm": 2.1182444095611572,
"learning_rate": 8.586920445491043e-06,
"loss": 0.5861,
"step": 480
},
{
"epoch": 0.9678068410462777,
"grad_norm": 2.0620083808898926,
"learning_rate": 8.578749990461884e-06,
"loss": 0.5696,
"step": 481
},
{
"epoch": 0.9698189134808853,
"grad_norm": 2.276942014694214,
"learning_rate": 8.570559893991537e-06,
"loss": 0.5385,
"step": 482
},
{
"epoch": 0.971830985915493,
"grad_norm": 2.8296422958374023,
"learning_rate": 8.562350201030139e-06,
"loss": 0.5484,
"step": 483
},
{
"epoch": 0.9738430583501007,
"grad_norm": 2.4141933917999268,
"learning_rate": 8.554120956635375e-06,
"loss": 0.5563,
"step": 484
},
{
"epoch": 0.9758551307847082,
"grad_norm": 2.302938938140869,
"learning_rate": 8.54587220597224e-06,
"loss": 0.6235,
"step": 485
},
{
"epoch": 0.9778672032193159,
"grad_norm": 2.2329790592193604,
"learning_rate": 8.537603994312786e-06,
"loss": 0.5669,
"step": 486
},
{
"epoch": 0.9798792756539235,
"grad_norm": 2.3101489543914795,
"learning_rate": 8.52931636703587e-06,
"loss": 0.5698,
"step": 487
},
{
"epoch": 0.9818913480885312,
"grad_norm": 2.620720386505127,
"learning_rate": 8.521009369626914e-06,
"loss": 0.5333,
"step": 488
},
{
"epoch": 0.9839034205231388,
"grad_norm": 2.2651710510253906,
"learning_rate": 8.512683047677644e-06,
"loss": 0.5524,
"step": 489
},
{
"epoch": 0.9859154929577465,
"grad_norm": 2.12550687789917,
"learning_rate": 8.504337446885854e-06,
"loss": 0.5665,
"step": 490
},
{
"epoch": 0.9879275653923542,
"grad_norm": 2.010809898376465,
"learning_rate": 8.495972613055137e-06,
"loss": 0.5295,
"step": 491
},
{
"epoch": 0.9899396378269618,
"grad_norm": 2.183659553527832,
"learning_rate": 8.487588592094652e-06,
"loss": 0.5685,
"step": 492
},
{
"epoch": 0.9919517102615694,
"grad_norm": 2.26654314994812,
"learning_rate": 8.47918543001886e-06,
"loss": 0.5872,
"step": 493
},
{
"epoch": 0.993963782696177,
"grad_norm": 2.483935832977295,
"learning_rate": 8.470763172947276e-06,
"loss": 0.5938,
"step": 494
},
{
"epoch": 0.9959758551307847,
"grad_norm": 2.136721611022949,
"learning_rate": 8.462321867104217e-06,
"loss": 0.5819,
"step": 495
},
{
"epoch": 0.9979879275653923,
"grad_norm": 2.082050323486328,
"learning_rate": 8.453861558818542e-06,
"loss": 0.5132,
"step": 496
},
{
"epoch": 1.0,
"grad_norm": 2.1009018421173096,
"learning_rate": 8.445382294523406e-06,
"loss": 0.5328,
"step": 497
},
{
"epoch": 1.0020120724346075,
"grad_norm": 2.176445722579956,
"learning_rate": 8.436884120755997e-06,
"loss": 0.4829,
"step": 498
},
{
"epoch": 1.0040241448692153,
"grad_norm": 2.335286855697632,
"learning_rate": 8.428367084157292e-06,
"loss": 0.4892,
"step": 499
},
{
"epoch": 1.0060362173038229,
"grad_norm": 2.0593619346618652,
"learning_rate": 8.419831231471785e-06,
"loss": 0.4445,
"step": 500
},
{
"epoch": 1.0080482897384306,
"grad_norm": 2.074493169784546,
"learning_rate": 8.411276609547246e-06,
"loss": 0.4707,
"step": 501
},
{
"epoch": 1.0100603621730382,
"grad_norm": 1.8737417459487915,
"learning_rate": 8.402703265334455e-06,
"loss": 0.4441,
"step": 502
},
{
"epoch": 1.012072434607646,
"grad_norm": 2.2214300632476807,
"learning_rate": 8.394111245886948e-06,
"loss": 0.4426,
"step": 503
},
{
"epoch": 1.0140845070422535,
"grad_norm": 2.098071575164795,
"learning_rate": 8.385500598360752e-06,
"loss": 0.4542,
"step": 504
},
{
"epoch": 1.0160965794768613,
"grad_norm": 2.084216356277466,
"learning_rate": 8.376871370014139e-06,
"loss": 0.4747,
"step": 505
},
{
"epoch": 1.0181086519114688,
"grad_norm": 2.022986888885498,
"learning_rate": 8.368223608207351e-06,
"loss": 0.4475,
"step": 506
},
{
"epoch": 1.0201207243460764,
"grad_norm": 2.2105493545532227,
"learning_rate": 8.359557360402357e-06,
"loss": 0.4508,
"step": 507
},
{
"epoch": 1.0221327967806841,
"grad_norm": 2.076406240463257,
"learning_rate": 8.350872674162578e-06,
"loss": 0.4252,
"step": 508
},
{
"epoch": 1.0241448692152917,
"grad_norm": 2.2363109588623047,
"learning_rate": 8.34216959715263e-06,
"loss": 0.4534,
"step": 509
},
{
"epoch": 1.0261569416498995,
"grad_norm": 2.0087409019470215,
"learning_rate": 8.333448177138071e-06,
"loss": 0.4703,
"step": 510
},
{
"epoch": 1.028169014084507,
"grad_norm": 2.0913915634155273,
"learning_rate": 8.324708461985124e-06,
"loss": 0.4365,
"step": 511
},
{
"epoch": 1.0301810865191148,
"grad_norm": 2.1394097805023193,
"learning_rate": 8.315950499660427e-06,
"loss": 0.4716,
"step": 512
},
{
"epoch": 1.0321931589537223,
"grad_norm": 2.2066264152526855,
"learning_rate": 8.307174338230765e-06,
"loss": 0.4548,
"step": 513
},
{
"epoch": 1.0342052313883299,
"grad_norm": 2.1786088943481445,
"learning_rate": 8.298380025862805e-06,
"loss": 0.4606,
"step": 514
},
{
"epoch": 1.0362173038229376,
"grad_norm": 2.3795361518859863,
"learning_rate": 8.28956761082283e-06,
"loss": 0.4754,
"step": 515
},
{
"epoch": 1.0382293762575452,
"grad_norm": 2.1223013401031494,
"learning_rate": 8.280737141476482e-06,
"loss": 0.4541,
"step": 516
},
{
"epoch": 1.040241448692153,
"grad_norm": 2.176300287246704,
"learning_rate": 8.271888666288488e-06,
"loss": 0.4617,
"step": 517
},
{
"epoch": 1.0422535211267605,
"grad_norm": 1.9915233850479126,
"learning_rate": 8.263022233822397e-06,
"loss": 0.4617,
"step": 518
},
{
"epoch": 1.0442655935613683,
"grad_norm": 2.0911736488342285,
"learning_rate": 8.254137892740318e-06,
"loss": 0.4702,
"step": 519
},
{
"epoch": 1.0462776659959758,
"grad_norm": 2.0248148441314697,
"learning_rate": 8.245235691802644e-06,
"loss": 0.4635,
"step": 520
},
{
"epoch": 1.0482897384305836,
"grad_norm": 2.0968503952026367,
"learning_rate": 8.23631567986779e-06,
"loss": 0.4747,
"step": 521
},
{
"epoch": 1.0503018108651911,
"grad_norm": 2.070502281188965,
"learning_rate": 8.227377905891927e-06,
"loss": 0.4537,
"step": 522
},
{
"epoch": 1.0523138832997987,
"grad_norm": 2.0351507663726807,
"learning_rate": 8.218422418928709e-06,
"loss": 0.4757,
"step": 523
},
{
"epoch": 1.0543259557344065,
"grad_norm": 2.0005311965942383,
"learning_rate": 8.209449268129003e-06,
"loss": 0.4605,
"step": 524
},
{
"epoch": 1.056338028169014,
"grad_norm": 2.127006769180298,
"learning_rate": 8.200458502740623e-06,
"loss": 0.4664,
"step": 525
},
{
"epoch": 1.0583501006036218,
"grad_norm": 2.3169891834259033,
"learning_rate": 8.191450172108058e-06,
"loss": 0.4957,
"step": 526
},
{
"epoch": 1.0603621730382293,
"grad_norm": 2.0168895721435547,
"learning_rate": 8.182424325672203e-06,
"loss": 0.49,
"step": 527
},
{
"epoch": 1.062374245472837,
"grad_norm": 2.439521551132202,
"learning_rate": 8.173381012970084e-06,
"loss": 0.4864,
"step": 528
},
{
"epoch": 1.0643863179074446,
"grad_norm": 2.3614089488983154,
"learning_rate": 8.164320283634585e-06,
"loss": 0.4545,
"step": 529
},
{
"epoch": 1.0663983903420524,
"grad_norm": 2.2037999629974365,
"learning_rate": 8.155242187394184e-06,
"loss": 0.4369,
"step": 530
},
{
"epoch": 1.06841046277666,
"grad_norm": 2.243170976638794,
"learning_rate": 8.146146774072674e-06,
"loss": 0.4901,
"step": 531
},
{
"epoch": 1.0704225352112675,
"grad_norm": 2.1171209812164307,
"learning_rate": 8.137034093588885e-06,
"loss": 0.4677,
"step": 532
},
{
"epoch": 1.0724346076458753,
"grad_norm": 2.155569314956665,
"learning_rate": 8.127904195956424e-06,
"loss": 0.43,
"step": 533
},
{
"epoch": 1.0744466800804828,
"grad_norm": 2.3345916271209717,
"learning_rate": 8.118757131283383e-06,
"loss": 0.4634,
"step": 534
},
{
"epoch": 1.0764587525150906,
"grad_norm": 2.1813671588897705,
"learning_rate": 8.109592949772076e-06,
"loss": 0.4629,
"step": 535
},
{
"epoch": 1.0784708249496981,
"grad_norm": 2.235050916671753,
"learning_rate": 8.100411701718765e-06,
"loss": 0.5095,
"step": 536
},
{
"epoch": 1.080482897384306,
"grad_norm": 2.0694808959960938,
"learning_rate": 8.091213437513371e-06,
"loss": 0.4165,
"step": 537
},
{
"epoch": 1.0824949698189135,
"grad_norm": 2.163832426071167,
"learning_rate": 8.081998207639212e-06,
"loss": 0.4883,
"step": 538
},
{
"epoch": 1.084507042253521,
"grad_norm": 2.0511605739593506,
"learning_rate": 8.072766062672717e-06,
"loss": 0.4735,
"step": 539
},
{
"epoch": 1.0865191146881288,
"grad_norm": 2.378787040710449,
"learning_rate": 8.06351705328315e-06,
"loss": 0.4798,
"step": 540
},
{
"epoch": 1.0885311871227363,
"grad_norm": 2.1771721839904785,
"learning_rate": 8.054251230232333e-06,
"loss": 0.461,
"step": 541
},
{
"epoch": 1.090543259557344,
"grad_norm": 2.158625602722168,
"learning_rate": 8.044968644374373e-06,
"loss": 0.4469,
"step": 542
},
{
"epoch": 1.0925553319919517,
"grad_norm": 2.060878038406372,
"learning_rate": 8.035669346655368e-06,
"loss": 0.4245,
"step": 543
},
{
"epoch": 1.0945674044265594,
"grad_norm": 2.228379487991333,
"learning_rate": 8.026353388113142e-06,
"loss": 0.4839,
"step": 544
},
{
"epoch": 1.096579476861167,
"grad_norm": 2.0745017528533936,
"learning_rate": 8.017020819876962e-06,
"loss": 0.4298,
"step": 545
},
{
"epoch": 1.0985915492957747,
"grad_norm": 2.1419124603271484,
"learning_rate": 8.007671693167248e-06,
"loss": 0.4674,
"step": 546
},
{
"epoch": 1.1006036217303823,
"grad_norm": 2.2269890308380127,
"learning_rate": 7.998306059295302e-06,
"loss": 0.4667,
"step": 547
},
{
"epoch": 1.10261569416499,
"grad_norm": 2.07487416267395,
"learning_rate": 7.988923969663027e-06,
"loss": 0.4672,
"step": 548
},
{
"epoch": 1.1046277665995976,
"grad_norm": 2.1029868125915527,
"learning_rate": 7.979525475762634e-06,
"loss": 0.4545,
"step": 549
},
{
"epoch": 1.1066398390342052,
"grad_norm": 2.18890380859375,
"learning_rate": 7.97011062917637e-06,
"loss": 0.4577,
"step": 550
},
{
"epoch": 1.108651911468813,
"grad_norm": 2.4978983402252197,
"learning_rate": 7.960679481576233e-06,
"loss": 0.4757,
"step": 551
},
{
"epoch": 1.1106639839034205,
"grad_norm": 2.099303722381592,
"learning_rate": 7.951232084723685e-06,
"loss": 0.4564,
"step": 552
},
{
"epoch": 1.1126760563380282,
"grad_norm": 2.0341546535491943,
"learning_rate": 7.941768490469368e-06,
"loss": 0.4653,
"step": 553
},
{
"epoch": 1.1146881287726358,
"grad_norm": 2.0823800563812256,
"learning_rate": 7.932288750752819e-06,
"loss": 0.4906,
"step": 554
},
{
"epoch": 1.1167002012072436,
"grad_norm": 2.0323636531829834,
"learning_rate": 7.922792917602197e-06,
"loss": 0.4837,
"step": 555
},
{
"epoch": 1.118712273641851,
"grad_norm": 1.9934216737747192,
"learning_rate": 7.913281043133978e-06,
"loss": 0.4657,
"step": 556
},
{
"epoch": 1.1207243460764587,
"grad_norm": 2.1831841468811035,
"learning_rate": 7.903753179552682e-06,
"loss": 0.4731,
"step": 557
},
{
"epoch": 1.1227364185110664,
"grad_norm": 2.1525766849517822,
"learning_rate": 7.89420937915058e-06,
"loss": 0.4437,
"step": 558
},
{
"epoch": 1.124748490945674,
"grad_norm": 2.4499902725219727,
"learning_rate": 7.884649694307413e-06,
"loss": 0.466,
"step": 559
},
{
"epoch": 1.1267605633802817,
"grad_norm": 2.279303789138794,
"learning_rate": 7.875074177490103e-06,
"loss": 0.4554,
"step": 560
},
{
"epoch": 1.1287726358148893,
"grad_norm": 2.1323020458221436,
"learning_rate": 7.86548288125246e-06,
"loss": 0.4735,
"step": 561
},
{
"epoch": 1.130784708249497,
"grad_norm": 2.1366891860961914,
"learning_rate": 7.855875858234894e-06,
"loss": 0.4721,
"step": 562
},
{
"epoch": 1.1327967806841046,
"grad_norm": 2.020785331726074,
"learning_rate": 7.846253161164138e-06,
"loss": 0.4888,
"step": 563
},
{
"epoch": 1.1348088531187122,
"grad_norm": 2.11442494392395,
"learning_rate": 7.836614842852942e-06,
"loss": 0.4809,
"step": 564
},
{
"epoch": 1.13682092555332,
"grad_norm": 2.2528202533721924,
"learning_rate": 7.826960956199796e-06,
"loss": 0.4726,
"step": 565
},
{
"epoch": 1.1388329979879275,
"grad_norm": 2.0524942874908447,
"learning_rate": 7.817291554188628e-06,
"loss": 0.4295,
"step": 566
},
{
"epoch": 1.1408450704225352,
"grad_norm": 2.4182207584381104,
"learning_rate": 7.80760668988853e-06,
"loss": 0.4897,
"step": 567
},
{
"epoch": 1.1428571428571428,
"grad_norm": 2.186673879623413,
"learning_rate": 7.797906416453445e-06,
"loss": 0.4761,
"step": 568
},
{
"epoch": 1.1448692152917506,
"grad_norm": 2.0501601696014404,
"learning_rate": 7.788190787121896e-06,
"loss": 0.4596,
"step": 569
},
{
"epoch": 1.1468812877263581,
"grad_norm": 2.1505749225616455,
"learning_rate": 7.778459855216678e-06,
"loss": 0.4727,
"step": 570
},
{
"epoch": 1.1488933601609659,
"grad_norm": 2.1751537322998047,
"learning_rate": 7.768713674144578e-06,
"loss": 0.4699,
"step": 571
},
{
"epoch": 1.1509054325955734,
"grad_norm": 2.1614253520965576,
"learning_rate": 7.758952297396068e-06,
"loss": 0.4347,
"step": 572
},
{
"epoch": 1.1529175050301812,
"grad_norm": 2.0868771076202393,
"learning_rate": 7.749175778545026e-06,
"loss": 0.4835,
"step": 573
},
{
"epoch": 1.1549295774647887,
"grad_norm": 2.064375638961792,
"learning_rate": 7.739384171248436e-06,
"loss": 0.4815,
"step": 574
},
{
"epoch": 1.1569416498993963,
"grad_norm": 2.0885112285614014,
"learning_rate": 7.729577529246084e-06,
"loss": 0.4406,
"step": 575
},
{
"epoch": 1.158953722334004,
"grad_norm": 2.109877347946167,
"learning_rate": 7.719755906360282e-06,
"loss": 0.4575,
"step": 576
},
{
"epoch": 1.1609657947686116,
"grad_norm": 2.359646797180176,
"learning_rate": 7.709919356495555e-06,
"loss": 0.4899,
"step": 577
},
{
"epoch": 1.1629778672032194,
"grad_norm": 2.143345355987549,
"learning_rate": 7.700067933638357e-06,
"loss": 0.4668,
"step": 578
},
{
"epoch": 1.164989939637827,
"grad_norm": 2.231412649154663,
"learning_rate": 7.690201691856768e-06,
"loss": 0.4682,
"step": 579
},
{
"epoch": 1.1670020120724347,
"grad_norm": 2.0817251205444336,
"learning_rate": 7.6803206853002e-06,
"loss": 0.4536,
"step": 580
},
{
"epoch": 1.1690140845070423,
"grad_norm": 2.1597044467926025,
"learning_rate": 7.670424968199099e-06,
"loss": 0.4748,
"step": 581
},
{
"epoch": 1.1710261569416498,
"grad_norm": 2.033923864364624,
"learning_rate": 7.660514594864648e-06,
"loss": 0.4566,
"step": 582
},
{
"epoch": 1.1730382293762576,
"grad_norm": 1.9592833518981934,
"learning_rate": 7.650589619688468e-06,
"loss": 0.4569,
"step": 583
},
{
"epoch": 1.1750503018108651,
"grad_norm": 1.9300081729888916,
"learning_rate": 7.640650097142322e-06,
"loss": 0.4589,
"step": 584
},
{
"epoch": 1.1770623742454729,
"grad_norm": 2.2498085498809814,
"learning_rate": 7.630696081777813e-06,
"loss": 0.4447,
"step": 585
},
{
"epoch": 1.1790744466800804,
"grad_norm": 2.153975009918213,
"learning_rate": 7.620727628226081e-06,
"loss": 0.4664,
"step": 586
},
{
"epoch": 1.1810865191146882,
"grad_norm": 2.1976206302642822,
"learning_rate": 7.610744791197518e-06,
"loss": 0.4821,
"step": 587
},
{
"epoch": 1.1830985915492958,
"grad_norm": 2.1182820796966553,
"learning_rate": 7.6007476254814495e-06,
"loss": 0.4904,
"step": 588
},
{
"epoch": 1.1851106639839033,
"grad_norm": 2.017240524291992,
"learning_rate": 7.590736185945843e-06,
"loss": 0.4514,
"step": 589
},
{
"epoch": 1.187122736418511,
"grad_norm": 2.5631942749023438,
"learning_rate": 7.580710527537008e-06,
"loss": 0.4769,
"step": 590
},
{
"epoch": 1.1891348088531186,
"grad_norm": 2.0534706115722656,
"learning_rate": 7.570670705279291e-06,
"loss": 0.4648,
"step": 591
},
{
"epoch": 1.1911468812877264,
"grad_norm": 2.2206432819366455,
"learning_rate": 7.560616774274775e-06,
"loss": 0.5011,
"step": 592
},
{
"epoch": 1.193158953722334,
"grad_norm": 2.055204153060913,
"learning_rate": 7.550548789702979e-06,
"loss": 0.4658,
"step": 593
},
{
"epoch": 1.1951710261569417,
"grad_norm": 2.200359582901001,
"learning_rate": 7.540466806820545e-06,
"loss": 0.4657,
"step": 594
},
{
"epoch": 1.1971830985915493,
"grad_norm": 1.9172611236572266,
"learning_rate": 7.5303708809609514e-06,
"loss": 0.4538,
"step": 595
},
{
"epoch": 1.199195171026157,
"grad_norm": 2.0303874015808105,
"learning_rate": 7.520261067534198e-06,
"loss": 0.4757,
"step": 596
},
{
"epoch": 1.2012072434607646,
"grad_norm": 2.161301851272583,
"learning_rate": 7.510137422026502e-06,
"loss": 0.4639,
"step": 597
},
{
"epoch": 1.2032193158953723,
"grad_norm": 2.2369987964630127,
"learning_rate": 7.500000000000001e-06,
"loss": 0.4943,
"step": 598
},
{
"epoch": 1.20523138832998,
"grad_norm": 2.028968095779419,
"learning_rate": 7.489848857092436e-06,
"loss": 0.4322,
"step": 599
},
{
"epoch": 1.2072434607645874,
"grad_norm": 2.4104745388031006,
"learning_rate": 7.479684049016859e-06,
"loss": 0.4957,
"step": 600
},
{
"epoch": 1.2092555331991952,
"grad_norm": 1.9772114753723145,
"learning_rate": 7.469505631561318e-06,
"loss": 0.4543,
"step": 601
},
{
"epoch": 1.2112676056338028,
"grad_norm": 2.124650239944458,
"learning_rate": 7.459313660588557e-06,
"loss": 0.4756,
"step": 602
},
{
"epoch": 1.2132796780684105,
"grad_norm": 2.071934461593628,
"learning_rate": 7.449108192035701e-06,
"loss": 0.4617,
"step": 603
},
{
"epoch": 1.215291750503018,
"grad_norm": 2.0079896450042725,
"learning_rate": 7.4388892819139625e-06,
"loss": 0.4301,
"step": 604
},
{
"epoch": 1.2173038229376258,
"grad_norm": 2.316357374191284,
"learning_rate": 7.428656986308318e-06,
"loss": 0.4605,
"step": 605
},
{
"epoch": 1.2193158953722334,
"grad_norm": 1.9327945709228516,
"learning_rate": 7.4184113613772134e-06,
"loss": 0.4604,
"step": 606
},
{
"epoch": 1.221327967806841,
"grad_norm": 1.9896938800811768,
"learning_rate": 7.408152463352249e-06,
"loss": 0.4764,
"step": 607
},
{
"epoch": 1.2233400402414487,
"grad_norm": 2.1156723499298096,
"learning_rate": 7.397880348537873e-06,
"loss": 0.4775,
"step": 608
},
{
"epoch": 1.2253521126760563,
"grad_norm": 1.9545114040374756,
"learning_rate": 7.387595073311072e-06,
"loss": 0.4509,
"step": 609
},
{
"epoch": 1.227364185110664,
"grad_norm": 1.8793998956680298,
"learning_rate": 7.3772966941210585e-06,
"loss": 0.4614,
"step": 610
},
{
"epoch": 1.2293762575452716,
"grad_norm": 2.2358322143554688,
"learning_rate": 7.366985267488971e-06,
"loss": 0.4615,
"step": 611
},
{
"epoch": 1.2313883299798793,
"grad_norm": 1.9540613889694214,
"learning_rate": 7.356660850007551e-06,
"loss": 0.4375,
"step": 612
},
{
"epoch": 1.233400402414487,
"grad_norm": 2.101412057876587,
"learning_rate": 7.346323498340839e-06,
"loss": 0.4818,
"step": 613
},
{
"epoch": 1.2354124748490944,
"grad_norm": 2.190537214279175,
"learning_rate": 7.335973269223865e-06,
"loss": 0.4889,
"step": 614
},
{
"epoch": 1.2374245472837022,
"grad_norm": 2.2181079387664795,
"learning_rate": 7.325610219462336e-06,
"loss": 0.4504,
"step": 615
},
{
"epoch": 1.2394366197183098,
"grad_norm": 2.1350760459899902,
"learning_rate": 7.3152344059323165e-06,
"loss": 0.4696,
"step": 616
},
{
"epoch": 1.2414486921529175,
"grad_norm": 2.065744161605835,
"learning_rate": 7.304845885579933e-06,
"loss": 0.4286,
"step": 617
},
{
"epoch": 1.243460764587525,
"grad_norm": 2.2795002460479736,
"learning_rate": 7.294444715421043e-06,
"loss": 0.4949,
"step": 618
},
{
"epoch": 1.2454728370221329,
"grad_norm": 2.2813029289245605,
"learning_rate": 7.284030952540937e-06,
"loss": 0.4731,
"step": 619
},
{
"epoch": 1.2474849094567404,
"grad_norm": 1.9561939239501953,
"learning_rate": 7.273604654094012e-06,
"loss": 0.4693,
"step": 620
},
{
"epoch": 1.2494969818913482,
"grad_norm": 1.989205241203308,
"learning_rate": 7.2631658773034715e-06,
"loss": 0.4582,
"step": 621
},
{
"epoch": 1.2515090543259557,
"grad_norm": 1.8016010522842407,
"learning_rate": 7.252714679461001e-06,
"loss": 0.4541,
"step": 622
},
{
"epoch": 1.2535211267605635,
"grad_norm": 2.0992701053619385,
"learning_rate": 7.2422511179264555e-06,
"loss": 0.4958,
"step": 623
},
{
"epoch": 1.255533199195171,
"grad_norm": 2.151954412460327,
"learning_rate": 7.231775250127551e-06,
"loss": 0.4732,
"step": 624
},
{
"epoch": 1.2575452716297786,
"grad_norm": 2.354834794998169,
"learning_rate": 7.221287133559537e-06,
"loss": 0.4532,
"step": 625
},
{
"epoch": 1.2595573440643864,
"grad_norm": 2.047466993331909,
"learning_rate": 7.2107868257849e-06,
"loss": 0.4551,
"step": 626
},
{
"epoch": 1.261569416498994,
"grad_norm": 2.151855945587158,
"learning_rate": 7.200274384433026e-06,
"loss": 0.4619,
"step": 627
},
{
"epoch": 1.2635814889336017,
"grad_norm": 2.0850412845611572,
"learning_rate": 7.189749867199899e-06,
"loss": 0.4991,
"step": 628
},
{
"epoch": 1.2655935613682092,
"grad_norm": 1.9346874952316284,
"learning_rate": 7.1792133318477775e-06,
"loss": 0.4493,
"step": 629
},
{
"epoch": 1.267605633802817,
"grad_norm": 1.9817110300064087,
"learning_rate": 7.1686648362048824e-06,
"loss": 0.4444,
"step": 630
},
{
"epoch": 1.2696177062374245,
"grad_norm": 2.3106513023376465,
"learning_rate": 7.1581044381650735e-06,
"loss": 0.4995,
"step": 631
},
{
"epoch": 1.271629778672032,
"grad_norm": 2.014252185821533,
"learning_rate": 7.14753219568754e-06,
"loss": 0.4491,
"step": 632
},
{
"epoch": 1.2736418511066399,
"grad_norm": 2.0986545085906982,
"learning_rate": 7.136948166796472e-06,
"loss": 0.4533,
"step": 633
},
{
"epoch": 1.2756539235412474,
"grad_norm": 2.028027296066284,
"learning_rate": 7.126352409580749e-06,
"loss": 0.474,
"step": 634
},
{
"epoch": 1.2776659959758552,
"grad_norm": 2.0538876056671143,
"learning_rate": 7.115744982193624e-06,
"loss": 0.4543,
"step": 635
},
{
"epoch": 1.2796780684104627,
"grad_norm": 2.1324546337127686,
"learning_rate": 7.105125942852396e-06,
"loss": 0.4947,
"step": 636
},
{
"epoch": 1.2816901408450705,
"grad_norm": 2.01959490776062,
"learning_rate": 7.094495349838093e-06,
"loss": 0.4451,
"step": 637
},
{
"epoch": 1.283702213279678,
"grad_norm": 1.945053219795227,
"learning_rate": 7.083853261495159e-06,
"loss": 0.4766,
"step": 638
},
{
"epoch": 1.2857142857142856,
"grad_norm": 2.1535494327545166,
"learning_rate": 7.073199736231123e-06,
"loss": 0.4911,
"step": 639
},
{
"epoch": 1.2877263581488934,
"grad_norm": 2.027019739151001,
"learning_rate": 7.062534832516288e-06,
"loss": 0.4701,
"step": 640
},
{
"epoch": 1.2897384305835011,
"grad_norm": 2.15201473236084,
"learning_rate": 7.051858608883404e-06,
"loss": 0.4968,
"step": 641
},
{
"epoch": 1.2917505030181087,
"grad_norm": 2.152102470397949,
"learning_rate": 7.041171123927347e-06,
"loss": 0.451,
"step": 642
},
{
"epoch": 1.2937625754527162,
"grad_norm": 2.2539751529693604,
"learning_rate": 7.0304724363048025e-06,
"loss": 0.4791,
"step": 643
},
{
"epoch": 1.295774647887324,
"grad_norm": 2.0571231842041016,
"learning_rate": 7.019762604733939e-06,
"loss": 0.4843,
"step": 644
},
{
"epoch": 1.2977867203219315,
"grad_norm": 2.099419355392456,
"learning_rate": 7.009041687994085e-06,
"loss": 0.465,
"step": 645
},
{
"epoch": 1.2997987927565393,
"grad_norm": 2.1120150089263916,
"learning_rate": 6.998309744925411e-06,
"loss": 0.4451,
"step": 646
},
{
"epoch": 1.3018108651911469,
"grad_norm": 1.9553170204162598,
"learning_rate": 6.987566834428605e-06,
"loss": 0.4525,
"step": 647
},
{
"epoch": 1.3038229376257546,
"grad_norm": 1.9628238677978516,
"learning_rate": 6.97681301546454e-06,
"loss": 0.4738,
"step": 648
},
{
"epoch": 1.3058350100603622,
"grad_norm": 1.9138386249542236,
"learning_rate": 6.9660483470539704e-06,
"loss": 0.4732,
"step": 649
},
{
"epoch": 1.3078470824949697,
"grad_norm": 2.1095831394195557,
"learning_rate": 6.955272888277188e-06,
"loss": 0.5139,
"step": 650
},
{
"epoch": 1.3098591549295775,
"grad_norm": 2.0262861251831055,
"learning_rate": 6.944486698273704e-06,
"loss": 0.4743,
"step": 651
},
{
"epoch": 1.311871227364185,
"grad_norm": 2.467956304550171,
"learning_rate": 6.933689836241939e-06,
"loss": 0.4598,
"step": 652
},
{
"epoch": 1.3138832997987928,
"grad_norm": 2.182114601135254,
"learning_rate": 6.92288236143887e-06,
"loss": 0.4858,
"step": 653
},
{
"epoch": 1.3158953722334004,
"grad_norm": 2.1629250049591064,
"learning_rate": 6.912064333179729e-06,
"loss": 0.4857,
"step": 654
},
{
"epoch": 1.3179074446680081,
"grad_norm": 2.0808186531066895,
"learning_rate": 6.901235810837668e-06,
"loss": 0.4631,
"step": 655
},
{
"epoch": 1.3199195171026157,
"grad_norm": 2.05938720703125,
"learning_rate": 6.890396853843436e-06,
"loss": 0.4958,
"step": 656
},
{
"epoch": 1.3219315895372232,
"grad_norm": 2.0860953330993652,
"learning_rate": 6.879547521685046e-06,
"loss": 0.4936,
"step": 657
},
{
"epoch": 1.323943661971831,
"grad_norm": 2.298236131668091,
"learning_rate": 6.868687873907458e-06,
"loss": 0.4549,
"step": 658
},
{
"epoch": 1.3259557344064388,
"grad_norm": 2.3687760829925537,
"learning_rate": 6.857817970112246e-06,
"loss": 0.4538,
"step": 659
},
{
"epoch": 1.3279678068410463,
"grad_norm": 2.0305449962615967,
"learning_rate": 6.846937869957272e-06,
"loss": 0.448,
"step": 660
},
{
"epoch": 1.3299798792756539,
"grad_norm": 2.1987497806549072,
"learning_rate": 6.836047633156361e-06,
"loss": 0.5089,
"step": 661
},
{
"epoch": 1.3319919517102616,
"grad_norm": 1.9728593826293945,
"learning_rate": 6.8251473194789695e-06,
"loss": 0.4767,
"step": 662
},
{
"epoch": 1.3340040241448692,
"grad_norm": 2.0280892848968506,
"learning_rate": 6.814236988749863e-06,
"loss": 0.4816,
"step": 663
},
{
"epoch": 1.3360160965794767,
"grad_norm": 2.024660587310791,
"learning_rate": 6.8033167008487784e-06,
"loss": 0.4562,
"step": 664
},
{
"epoch": 1.3380281690140845,
"grad_norm": 2.0069921016693115,
"learning_rate": 6.792386515710106e-06,
"loss": 0.4399,
"step": 665
},
{
"epoch": 1.3400402414486923,
"grad_norm": 2.357219696044922,
"learning_rate": 6.7814464933225535e-06,
"loss": 0.4681,
"step": 666
},
{
"epoch": 1.3420523138832998,
"grad_norm": 2.1096150875091553,
"learning_rate": 6.77049669372882e-06,
"loss": 0.44,
"step": 667
},
{
"epoch": 1.3440643863179074,
"grad_norm": 2.167057991027832,
"learning_rate": 6.759537177025263e-06,
"loss": 0.4421,
"step": 668
},
{
"epoch": 1.3460764587525151,
"grad_norm": 2.035834789276123,
"learning_rate": 6.748568003361576e-06,
"loss": 0.448,
"step": 669
},
{
"epoch": 1.3480885311871227,
"grad_norm": 2.0616490840911865,
"learning_rate": 6.737589232940445e-06,
"loss": 0.4103,
"step": 670
},
{
"epoch": 1.3501006036217305,
"grad_norm": 1.953667163848877,
"learning_rate": 6.726600926017234e-06,
"loss": 0.456,
"step": 671
},
{
"epoch": 1.352112676056338,
"grad_norm": 1.9617230892181396,
"learning_rate": 6.715603142899645e-06,
"loss": 0.4652,
"step": 672
},
{
"epoch": 1.3541247484909458,
"grad_norm": 2.0986597537994385,
"learning_rate": 6.704595943947385e-06,
"loss": 0.4459,
"step": 673
},
{
"epoch": 1.3561368209255533,
"grad_norm": 2.0416882038116455,
"learning_rate": 6.693579389571844e-06,
"loss": 0.4903,
"step": 674
},
{
"epoch": 1.3581488933601609,
"grad_norm": 2.0102877616882324,
"learning_rate": 6.682553540235754e-06,
"loss": 0.4337,
"step": 675
},
{
"epoch": 1.3601609657947686,
"grad_norm": 2.2172913551330566,
"learning_rate": 6.671518456452859e-06,
"loss": 0.4606,
"step": 676
},
{
"epoch": 1.3621730382293762,
"grad_norm": 2.233868360519409,
"learning_rate": 6.6604741987875905e-06,
"loss": 0.448,
"step": 677
},
{
"epoch": 1.364185110663984,
"grad_norm": 1.9892909526824951,
"learning_rate": 6.649420827854729e-06,
"loss": 0.4605,
"step": 678
},
{
"epoch": 1.3661971830985915,
"grad_norm": 2.053473711013794,
"learning_rate": 6.638358404319064e-06,
"loss": 0.4642,
"step": 679
},
{
"epoch": 1.3682092555331993,
"grad_norm": 1.8882447481155396,
"learning_rate": 6.62728698889508e-06,
"loss": 0.4396,
"step": 680
},
{
"epoch": 1.3702213279678068,
"grad_norm": 2.1912496089935303,
"learning_rate": 6.616206642346603e-06,
"loss": 0.4522,
"step": 681
},
{
"epoch": 1.3722334004024144,
"grad_norm": 2.0848143100738525,
"learning_rate": 6.605117425486483e-06,
"loss": 0.4698,
"step": 682
},
{
"epoch": 1.3742454728370221,
"grad_norm": 2.0212464332580566,
"learning_rate": 6.594019399176246e-06,
"loss": 0.4888,
"step": 683
},
{
"epoch": 1.37625754527163,
"grad_norm": 2.030343532562256,
"learning_rate": 6.582912624325777e-06,
"loss": 0.4524,
"step": 684
},
{
"epoch": 1.3782696177062375,
"grad_norm": 2.1226394176483154,
"learning_rate": 6.571797161892965e-06,
"loss": 0.5117,
"step": 685
},
{
"epoch": 1.380281690140845,
"grad_norm": 2.0747764110565186,
"learning_rate": 6.5606730728833904e-06,
"loss": 0.4656,
"step": 686
},
{
"epoch": 1.3822937625754528,
"grad_norm": 2.0734431743621826,
"learning_rate": 6.549540418349969e-06,
"loss": 0.4867,
"step": 687
},
{
"epoch": 1.3843058350100603,
"grad_norm": 2.2219855785369873,
"learning_rate": 6.538399259392637e-06,
"loss": 0.4542,
"step": 688
},
{
"epoch": 1.3863179074446679,
"grad_norm": 2.252269983291626,
"learning_rate": 6.527249657157998e-06,
"loss": 0.4614,
"step": 689
},
{
"epoch": 1.3883299798792756,
"grad_norm": 1.9988418817520142,
"learning_rate": 6.516091672839e-06,
"loss": 0.442,
"step": 690
},
{
"epoch": 1.3903420523138834,
"grad_norm": 2.1525936126708984,
"learning_rate": 6.504925367674595e-06,
"loss": 0.5083,
"step": 691
},
{
"epoch": 1.392354124748491,
"grad_norm": 2.121413230895996,
"learning_rate": 6.4937508029493965e-06,
"loss": 0.4407,
"step": 692
},
{
"epoch": 1.3943661971830985,
"grad_norm": 1.963167667388916,
"learning_rate": 6.482568039993356e-06,
"loss": 0.4743,
"step": 693
},
{
"epoch": 1.3963782696177063,
"grad_norm": 2.271878242492676,
"learning_rate": 6.471377140181419e-06,
"loss": 0.4536,
"step": 694
},
{
"epoch": 1.3983903420523138,
"grad_norm": 2.22976016998291,
"learning_rate": 6.4601781649331885e-06,
"loss": 0.4729,
"step": 695
},
{
"epoch": 1.4004024144869216,
"grad_norm": 1.9575120210647583,
"learning_rate": 6.4489711757125814e-06,
"loss": 0.4685,
"step": 696
},
{
"epoch": 1.4024144869215291,
"grad_norm": 2.0964162349700928,
"learning_rate": 6.437756234027512e-06,
"loss": 0.5093,
"step": 697
},
{
"epoch": 1.404426559356137,
"grad_norm": 2.1892035007476807,
"learning_rate": 6.4265334014295284e-06,
"loss": 0.5002,
"step": 698
},
{
"epoch": 1.4064386317907445,
"grad_norm": 2.233177423477173,
"learning_rate": 6.415302739513492e-06,
"loss": 0.4899,
"step": 699
},
{
"epoch": 1.408450704225352,
"grad_norm": 1.9493423700332642,
"learning_rate": 6.40406430991723e-06,
"loss": 0.4715,
"step": 700
},
{
"epoch": 1.4104627766599598,
"grad_norm": 2.131361246109009,
"learning_rate": 6.392818174321213e-06,
"loss": 0.491,
"step": 701
},
{
"epoch": 1.4124748490945673,
"grad_norm": 1.9579322338104248,
"learning_rate": 6.3815643944481866e-06,
"loss": 0.439,
"step": 702
},
{
"epoch": 1.414486921529175,
"grad_norm": 1.9951404333114624,
"learning_rate": 6.370303032062869e-06,
"loss": 0.4235,
"step": 703
},
{
"epoch": 1.4164989939637826,
"grad_norm": 2.19970965385437,
"learning_rate": 6.359034148971581e-06,
"loss": 0.4866,
"step": 704
},
{
"epoch": 1.4185110663983904,
"grad_norm": 1.9917786121368408,
"learning_rate": 6.347757807021926e-06,
"loss": 0.4717,
"step": 705
},
{
"epoch": 1.420523138832998,
"grad_norm": 1.9985647201538086,
"learning_rate": 6.336474068102444e-06,
"loss": 0.4991,
"step": 706
},
{
"epoch": 1.4225352112676055,
"grad_norm": 2.0368287563323975,
"learning_rate": 6.325182994142267e-06,
"loss": 0.4417,
"step": 707
},
{
"epoch": 1.4245472837022133,
"grad_norm": 1.9710001945495605,
"learning_rate": 6.3138846471107925e-06,
"loss": 0.46,
"step": 708
},
{
"epoch": 1.426559356136821,
"grad_norm": 2.040895462036133,
"learning_rate": 6.302579089017328e-06,
"loss": 0.4903,
"step": 709
},
{
"epoch": 1.4285714285714286,
"grad_norm": 2.1418488025665283,
"learning_rate": 6.291266381910761e-06,
"loss": 0.4745,
"step": 710
},
{
"epoch": 1.4305835010060362,
"grad_norm": 1.9608333110809326,
"learning_rate": 6.279946587879216e-06,
"loss": 0.4524,
"step": 711
},
{
"epoch": 1.432595573440644,
"grad_norm": 2.0412542819976807,
"learning_rate": 6.268619769049713e-06,
"loss": 0.4441,
"step": 712
},
{
"epoch": 1.4346076458752515,
"grad_norm": 1.8654680252075195,
"learning_rate": 6.2572859875878225e-06,
"loss": 0.4107,
"step": 713
},
{
"epoch": 1.436619718309859,
"grad_norm": 1.9783178567886353,
"learning_rate": 6.245945305697335e-06,
"loss": 0.4386,
"step": 714
},
{
"epoch": 1.4386317907444668,
"grad_norm": 2.1955788135528564,
"learning_rate": 6.234597785619906e-06,
"loss": 0.4611,
"step": 715
},
{
"epoch": 1.4406438631790746,
"grad_norm": 1.9734547138214111,
"learning_rate": 6.223243489634727e-06,
"loss": 0.4506,
"step": 716
},
{
"epoch": 1.442655935613682,
"grad_norm": 2.019916534423828,
"learning_rate": 6.211882480058175e-06,
"loss": 0.4369,
"step": 717
},
{
"epoch": 1.4446680080482897,
"grad_norm": 2.066774368286133,
"learning_rate": 6.200514819243476e-06,
"loss": 0.472,
"step": 718
},
{
"epoch": 1.4466800804828974,
"grad_norm": 2.1636836528778076,
"learning_rate": 6.189140569580356e-06,
"loss": 0.4954,
"step": 719
},
{
"epoch": 1.448692152917505,
"grad_norm": 1.956506371498108,
"learning_rate": 6.1777597934947084e-06,
"loss": 0.4858,
"step": 720
},
{
"epoch": 1.4507042253521127,
"grad_norm": 2.098059892654419,
"learning_rate": 6.166372553448241e-06,
"loss": 0.4979,
"step": 721
},
{
"epoch": 1.4527162977867203,
"grad_norm": 2.1118078231811523,
"learning_rate": 6.154978911938143e-06,
"loss": 0.4482,
"step": 722
},
{
"epoch": 1.454728370221328,
"grad_norm": 2.1067488193511963,
"learning_rate": 6.143578931496732e-06,
"loss": 0.4546,
"step": 723
},
{
"epoch": 1.4567404426559356,
"grad_norm": 2.073150396347046,
"learning_rate": 6.132172674691119e-06,
"loss": 0.4523,
"step": 724
},
{
"epoch": 1.4587525150905432,
"grad_norm": 1.9528892040252686,
"learning_rate": 6.120760204122862e-06,
"loss": 0.4583,
"step": 725
},
{
"epoch": 1.460764587525151,
"grad_norm": 2.218679189682007,
"learning_rate": 6.109341582427621e-06,
"loss": 0.4744,
"step": 726
},
{
"epoch": 1.4627766599597585,
"grad_norm": 2.0021162033081055,
"learning_rate": 6.097916872274815e-06,
"loss": 0.4482,
"step": 727
},
{
"epoch": 1.4647887323943662,
"grad_norm": 2.2345480918884277,
"learning_rate": 6.086486136367281e-06,
"loss": 0.4799,
"step": 728
},
{
"epoch": 1.4668008048289738,
"grad_norm": 2.0748777389526367,
"learning_rate": 6.075049437440927e-06,
"loss": 0.4432,
"step": 729
},
{
"epoch": 1.4688128772635816,
"grad_norm": 2.083968162536621,
"learning_rate": 6.063606838264384e-06,
"loss": 0.4438,
"step": 730
},
{
"epoch": 1.470824949698189,
"grad_norm": 1.9462379217147827,
"learning_rate": 6.0521584016386735e-06,
"loss": 0.4895,
"step": 731
},
{
"epoch": 1.4728370221327967,
"grad_norm": 2.2787818908691406,
"learning_rate": 6.040704190396847e-06,
"loss": 0.4758,
"step": 732
},
{
"epoch": 1.4748490945674044,
"grad_norm": 2.0492944717407227,
"learning_rate": 6.029244267403652e-06,
"loss": 0.4756,
"step": 733
},
{
"epoch": 1.4768611670020122,
"grad_norm": 1.994223952293396,
"learning_rate": 6.0177786955551874e-06,
"loss": 0.4213,
"step": 734
},
{
"epoch": 1.4788732394366197,
"grad_norm": 2.0081369876861572,
"learning_rate": 6.006307537778552e-06,
"loss": 0.4307,
"step": 735
},
{
"epoch": 1.4808853118712273,
"grad_norm": 1.9906424283981323,
"learning_rate": 5.9948308570315e-06,
"loss": 0.4815,
"step": 736
},
{
"epoch": 1.482897384305835,
"grad_norm": 2.073981761932373,
"learning_rate": 5.983348716302101e-06,
"loss": 0.4892,
"step": 737
},
{
"epoch": 1.4849094567404426,
"grad_norm": 2.121760606765747,
"learning_rate": 5.97186117860839e-06,
"loss": 0.4734,
"step": 738
},
{
"epoch": 1.4869215291750504,
"grad_norm": 2.0857138633728027,
"learning_rate": 5.960368306998023e-06,
"loss": 0.4555,
"step": 739
},
{
"epoch": 1.488933601609658,
"grad_norm": 2.2429444789886475,
"learning_rate": 5.948870164547932e-06,
"loss": 0.4985,
"step": 740
},
{
"epoch": 1.4909456740442657,
"grad_norm": 2.1200976371765137,
"learning_rate": 5.9373668143639694e-06,
"loss": 0.4807,
"step": 741
},
{
"epoch": 1.4929577464788732,
"grad_norm": 2.1874051094055176,
"learning_rate": 5.92585831958058e-06,
"loss": 0.4907,
"step": 742
},
{
"epoch": 1.4949698189134808,
"grad_norm": 1.9364540576934814,
"learning_rate": 5.914344743360435e-06,
"loss": 0.4441,
"step": 743
},
{
"epoch": 1.4969818913480886,
"grad_norm": 2.2439324855804443,
"learning_rate": 5.902826148894102e-06,
"loss": 0.4845,
"step": 744
},
{
"epoch": 1.4989939637826961,
"grad_norm": 2.184908390045166,
"learning_rate": 5.891302599399686e-06,
"loss": 0.4523,
"step": 745
},
{
"epoch": 1.5010060362173037,
"grad_norm": 2.3075244426727295,
"learning_rate": 5.8797741581224866e-06,
"loss": 0.4966,
"step": 746
},
{
"epoch": 1.5030181086519114,
"grad_norm": 2.011004686355591,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.4264,
"step": 747
},
{
"epoch": 1.5050301810865192,
"grad_norm": 2.2199621200561523,
"learning_rate": 5.856702853334833e-06,
"loss": 0.4877,
"step": 748
},
{
"epoch": 1.5070422535211268,
"grad_norm": 2.1460952758789062,
"learning_rate": 5.845160116447833e-06,
"loss": 0.4548,
"step": 749
},
{
"epoch": 1.5090543259557343,
"grad_norm": 2.155158758163452,
"learning_rate": 5.833612741024256e-06,
"loss": 0.4729,
"step": 750
},
{
"epoch": 1.511066398390342,
"grad_norm": 2.3106179237365723,
"learning_rate": 5.8220607904401725e-06,
"loss": 0.5055,
"step": 751
},
{
"epoch": 1.5130784708249498,
"grad_norm": 2.1805429458618164,
"learning_rate": 5.810504328096756e-06,
"loss": 0.4714,
"step": 752
},
{
"epoch": 1.5150905432595574,
"grad_norm": 1.9099804162979126,
"learning_rate": 5.798943417419944e-06,
"loss": 0.4072,
"step": 753
},
{
"epoch": 1.517102615694165,
"grad_norm": 2.059183359146118,
"learning_rate": 5.78737812186009e-06,
"loss": 0.4982,
"step": 754
},
{
"epoch": 1.5191146881287727,
"grad_norm": 2.180112600326538,
"learning_rate": 5.775808504891612e-06,
"loss": 0.4974,
"step": 755
},
{
"epoch": 1.5211267605633803,
"grad_norm": 2.0823423862457275,
"learning_rate": 5.764234630012643e-06,
"loss": 0.4779,
"step": 756
},
{
"epoch": 1.5231388329979878,
"grad_norm": 2.170238733291626,
"learning_rate": 5.752656560744692e-06,
"loss": 0.4495,
"step": 757
},
{
"epoch": 1.5251509054325956,
"grad_norm": 2.0095527172088623,
"learning_rate": 5.741074360632278e-06,
"loss": 0.4173,
"step": 758
},
{
"epoch": 1.5271629778672033,
"grad_norm": 2.175075054168701,
"learning_rate": 5.729488093242601e-06,
"loss": 0.4744,
"step": 759
},
{
"epoch": 1.529175050301811,
"grad_norm": 2.083327293395996,
"learning_rate": 5.717897822165179e-06,
"loss": 0.4455,
"step": 760
},
{
"epoch": 1.5311871227364184,
"grad_norm": 1.970665454864502,
"learning_rate": 5.706303611011502e-06,
"loss": 0.4756,
"step": 761
},
{
"epoch": 1.5331991951710262,
"grad_norm": 2.1470654010772705,
"learning_rate": 5.694705523414691e-06,
"loss": 0.4607,
"step": 762
},
{
"epoch": 1.5352112676056338,
"grad_norm": 2.2010574340820312,
"learning_rate": 5.6831036230291345e-06,
"loss": 0.4645,
"step": 763
},
{
"epoch": 1.5372233400402413,
"grad_norm": 1.93354332447052,
"learning_rate": 5.671497973530152e-06,
"loss": 0.4472,
"step": 764
},
{
"epoch": 1.539235412474849,
"grad_norm": 2.1269092559814453,
"learning_rate": 5.659888638613638e-06,
"loss": 0.4577,
"step": 765
},
{
"epoch": 1.5412474849094568,
"grad_norm": 2.004490852355957,
"learning_rate": 5.648275681995716e-06,
"loss": 0.4566,
"step": 766
},
{
"epoch": 1.5432595573440644,
"grad_norm": 1.9603842496871948,
"learning_rate": 5.636659167412381e-06,
"loss": 0.4608,
"step": 767
},
{
"epoch": 1.545271629778672,
"grad_norm": 2.082428216934204,
"learning_rate": 5.625039158619161e-06,
"loss": 0.4735,
"step": 768
},
{
"epoch": 1.5472837022132797,
"grad_norm": 2.371439218521118,
"learning_rate": 5.613415719390759e-06,
"loss": 0.4786,
"step": 769
},
{
"epoch": 1.5492957746478875,
"grad_norm": 2.142385482788086,
"learning_rate": 5.601788913520706e-06,
"loss": 0.484,
"step": 770
},
{
"epoch": 1.5513078470824948,
"grad_norm": 2.027139663696289,
"learning_rate": 5.590158804821011e-06,
"loss": 0.4389,
"step": 771
},
{
"epoch": 1.5533199195171026,
"grad_norm": 2.114689588546753,
"learning_rate": 5.578525457121807e-06,
"loss": 0.4502,
"step": 772
},
{
"epoch": 1.5553319919517103,
"grad_norm": 2.133058786392212,
"learning_rate": 5.566888934271007e-06,
"loss": 0.4906,
"step": 773
},
{
"epoch": 1.557344064386318,
"grad_norm": 1.9754748344421387,
"learning_rate": 5.5552493001339535e-06,
"loss": 0.4646,
"step": 774
},
{
"epoch": 1.5593561368209254,
"grad_norm": 2.156743288040161,
"learning_rate": 5.543606618593053e-06,
"loss": 0.4571,
"step": 775
},
{
"epoch": 1.5613682092555332,
"grad_norm": 2.0975053310394287,
"learning_rate": 5.531960953547452e-06,
"loss": 0.47,
"step": 776
},
{
"epoch": 1.563380281690141,
"grad_norm": 2.1429851055145264,
"learning_rate": 5.520312368912661e-06,
"loss": 0.4562,
"step": 777
},
{
"epoch": 1.5653923541247485,
"grad_norm": 2.181736946105957,
"learning_rate": 5.508660928620216e-06,
"loss": 0.4377,
"step": 778
},
{
"epoch": 1.567404426559356,
"grad_norm": 2.0732667446136475,
"learning_rate": 5.497006696617333e-06,
"loss": 0.4527,
"step": 779
},
{
"epoch": 1.5694164989939638,
"grad_norm": 2.0271811485290527,
"learning_rate": 5.485349736866541e-06,
"loss": 0.4733,
"step": 780
},
{
"epoch": 1.5714285714285714,
"grad_norm": 2.1071932315826416,
"learning_rate": 5.473690113345343e-06,
"loss": 0.4597,
"step": 781
},
{
"epoch": 1.573440643863179,
"grad_norm": 2.325165033340454,
"learning_rate": 5.462027890045862e-06,
"loss": 0.4859,
"step": 782
},
{
"epoch": 1.5754527162977867,
"grad_norm": 2.015083074569702,
"learning_rate": 5.450363130974492e-06,
"loss": 0.4725,
"step": 783
},
{
"epoch": 1.5774647887323945,
"grad_norm": 2.0369396209716797,
"learning_rate": 5.438695900151537e-06,
"loss": 0.4506,
"step": 784
},
{
"epoch": 1.579476861167002,
"grad_norm": 1.943955898284912,
"learning_rate": 5.427026261610877e-06,
"loss": 0.4499,
"step": 785
},
{
"epoch": 1.5814889336016096,
"grad_norm": 2.0148682594299316,
"learning_rate": 5.4153542793995985e-06,
"loss": 0.4854,
"step": 786
},
{
"epoch": 1.5835010060362174,
"grad_norm": 1.9239193201065063,
"learning_rate": 5.403680017577653e-06,
"loss": 0.429,
"step": 787
},
{
"epoch": 1.585513078470825,
"grad_norm": 2.050187587738037,
"learning_rate": 5.392003540217505e-06,
"loss": 0.4537,
"step": 788
},
{
"epoch": 1.5875251509054324,
"grad_norm": 1.947845697402954,
"learning_rate": 5.380324911403776e-06,
"loss": 0.4409,
"step": 789
},
{
"epoch": 1.5895372233400402,
"grad_norm": 1.9879707098007202,
"learning_rate": 5.368644195232896e-06,
"loss": 0.4574,
"step": 790
},
{
"epoch": 1.591549295774648,
"grad_norm": 2.045276165008545,
"learning_rate": 5.356961455812754e-06,
"loss": 0.462,
"step": 791
},
{
"epoch": 1.5935613682092555,
"grad_norm": 1.8802539110183716,
"learning_rate": 5.34527675726234e-06,
"loss": 0.4382,
"step": 792
},
{
"epoch": 1.595573440643863,
"grad_norm": 2.2450199127197266,
"learning_rate": 5.3335901637113985e-06,
"loss": 0.4532,
"step": 793
},
{
"epoch": 1.5975855130784709,
"grad_norm": 1.9864826202392578,
"learning_rate": 5.321901739300074e-06,
"loss": 0.4517,
"step": 794
},
{
"epoch": 1.5995975855130786,
"grad_norm": 2.0585172176361084,
"learning_rate": 5.310211548178556e-06,
"loss": 0.4497,
"step": 795
},
{
"epoch": 1.6016096579476862,
"grad_norm": 2.0852487087249756,
"learning_rate": 5.298519654506736e-06,
"loss": 0.4839,
"step": 796
},
{
"epoch": 1.6036217303822937,
"grad_norm": 1.9984678030014038,
"learning_rate": 5.286826122453847e-06,
"loss": 0.4271,
"step": 797
},
{
"epoch": 1.6056338028169015,
"grad_norm": 2.0208373069763184,
"learning_rate": 5.275131016198112e-06,
"loss": 0.4755,
"step": 798
},
{
"epoch": 1.607645875251509,
"grad_norm": 2.085428237915039,
"learning_rate": 5.2634343999263985e-06,
"loss": 0.4494,
"step": 799
},
{
"epoch": 1.6096579476861166,
"grad_norm": 2.018244981765747,
"learning_rate": 5.251736337833857e-06,
"loss": 0.4527,
"step": 800
},
{
"epoch": 1.6116700201207244,
"grad_norm": 2.217186450958252,
"learning_rate": 5.2400368941235745e-06,
"loss": 0.4691,
"step": 801
},
{
"epoch": 1.6136820925553321,
"grad_norm": 2.0559592247009277,
"learning_rate": 5.228336133006223e-06,
"loss": 0.4481,
"step": 802
},
{
"epoch": 1.6156941649899397,
"grad_norm": 1.9424960613250732,
"learning_rate": 5.216634118699701e-06,
"loss": 0.4291,
"step": 803
},
{
"epoch": 1.6177062374245472,
"grad_norm": 2.2018039226531982,
"learning_rate": 5.20493091542879e-06,
"loss": 0.4898,
"step": 804
},
{
"epoch": 1.619718309859155,
"grad_norm": 2.1593589782714844,
"learning_rate": 5.193226587424793e-06,
"loss": 0.4734,
"step": 805
},
{
"epoch": 1.6217303822937625,
"grad_norm": 2.275895118713379,
"learning_rate": 5.181521198925183e-06,
"loss": 0.4686,
"step": 806
},
{
"epoch": 1.62374245472837,
"grad_norm": 2.2770731449127197,
"learning_rate": 5.169814814173263e-06,
"loss": 0.5107,
"step": 807
},
{
"epoch": 1.6257545271629779,
"grad_norm": 2.013920545578003,
"learning_rate": 5.158107497417795e-06,
"loss": 0.4326,
"step": 808
},
{
"epoch": 1.6277665995975856,
"grad_norm": 2.1957645416259766,
"learning_rate": 5.14639931291266e-06,
"loss": 0.4449,
"step": 809
},
{
"epoch": 1.6297786720321932,
"grad_norm": 2.270573854446411,
"learning_rate": 5.134690324916502e-06,
"loss": 0.4456,
"step": 810
},
{
"epoch": 1.6317907444668007,
"grad_norm": 2.1294291019439697,
"learning_rate": 5.122980597692372e-06,
"loss": 0.461,
"step": 811
},
{
"epoch": 1.6338028169014085,
"grad_norm": 2.094083309173584,
"learning_rate": 5.11127019550738e-06,
"loss": 0.4692,
"step": 812
},
{
"epoch": 1.635814889336016,
"grad_norm": 2.167789936065674,
"learning_rate": 5.099559182632342e-06,
"loss": 0.4748,
"step": 813
},
{
"epoch": 1.6378269617706236,
"grad_norm": 2.0614733695983887,
"learning_rate": 5.087847623341421e-06,
"loss": 0.4512,
"step": 814
},
{
"epoch": 1.6398390342052314,
"grad_norm": 2.1112565994262695,
"learning_rate": 5.076135581911784e-06,
"loss": 0.4708,
"step": 815
},
{
"epoch": 1.6418511066398391,
"grad_norm": 2.078747272491455,
"learning_rate": 5.0644231226232434e-06,
"loss": 0.4351,
"step": 816
},
{
"epoch": 1.6438631790744467,
"grad_norm": 2.212445020675659,
"learning_rate": 5.052710309757899e-06,
"loss": 0.484,
"step": 817
},
{
"epoch": 1.6458752515090542,
"grad_norm": 2.132495880126953,
"learning_rate": 5.040997207599798e-06,
"loss": 0.456,
"step": 818
},
{
"epoch": 1.647887323943662,
"grad_norm": 2.0021562576293945,
"learning_rate": 5.029283880434575e-06,
"loss": 0.4747,
"step": 819
},
{
"epoch": 1.6498993963782698,
"grad_norm": 2.115347146987915,
"learning_rate": 5.0175703925490936e-06,
"loss": 0.457,
"step": 820
},
{
"epoch": 1.6519114688128773,
"grad_norm": 1.9082791805267334,
"learning_rate": 5.005856808231108e-06,
"loss": 0.4414,
"step": 821
},
{
"epoch": 1.6539235412474849,
"grad_norm": 1.9503601789474487,
"learning_rate": 4.994143191768893e-06,
"loss": 0.4342,
"step": 822
},
{
"epoch": 1.6559356136820926,
"grad_norm": 2.0434417724609375,
"learning_rate": 4.982429607450907e-06,
"loss": 0.4592,
"step": 823
},
{
"epoch": 1.6579476861167002,
"grad_norm": 2.0412988662719727,
"learning_rate": 4.970716119565427e-06,
"loss": 0.4432,
"step": 824
},
{
"epoch": 1.6599597585513077,
"grad_norm": 1.9840989112854004,
"learning_rate": 4.959002792400205e-06,
"loss": 0.4636,
"step": 825
},
{
"epoch": 1.6619718309859155,
"grad_norm": 2.2154717445373535,
"learning_rate": 4.947289690242103e-06,
"loss": 0.5052,
"step": 826
},
{
"epoch": 1.6639839034205233,
"grad_norm": 2.267805337905884,
"learning_rate": 4.935576877376759e-06,
"loss": 0.4632,
"step": 827
},
{
"epoch": 1.6659959758551308,
"grad_norm": 2.170832872390747,
"learning_rate": 4.9238644180882175e-06,
"loss": 0.4847,
"step": 828
},
{
"epoch": 1.6680080482897384,
"grad_norm": 2.076737403869629,
"learning_rate": 4.91215237665858e-06,
"loss": 0.4341,
"step": 829
},
{
"epoch": 1.6700201207243461,
"grad_norm": 1.9692955017089844,
"learning_rate": 4.900440817367661e-06,
"loss": 0.456,
"step": 830
},
{
"epoch": 1.6720321931589537,
"grad_norm": 1.9987684488296509,
"learning_rate": 4.88872980449262e-06,
"loss": 0.4358,
"step": 831
},
{
"epoch": 1.6740442655935612,
"grad_norm": 2.1247968673706055,
"learning_rate": 4.877019402307629e-06,
"loss": 0.461,
"step": 832
},
{
"epoch": 1.676056338028169,
"grad_norm": 2.2850656509399414,
"learning_rate": 4.8653096750835e-06,
"loss": 0.4332,
"step": 833
},
{
"epoch": 1.6780684104627768,
"grad_norm": 2.0370311737060547,
"learning_rate": 4.853600687087342e-06,
"loss": 0.4627,
"step": 834
},
{
"epoch": 1.6800804828973843,
"grad_norm": 2.027519702911377,
"learning_rate": 4.841892502582206e-06,
"loss": 0.4236,
"step": 835
},
{
"epoch": 1.6820925553319919,
"grad_norm": 2.3669791221618652,
"learning_rate": 4.830185185826739e-06,
"loss": 0.5049,
"step": 836
},
{
"epoch": 1.6841046277665996,
"grad_norm": 2.097409963607788,
"learning_rate": 4.818478801074818e-06,
"loss": 0.4571,
"step": 837
},
{
"epoch": 1.6861167002012074,
"grad_norm": 1.9327471256256104,
"learning_rate": 4.806773412575211e-06,
"loss": 0.4383,
"step": 838
},
{
"epoch": 1.6881287726358147,
"grad_norm": 1.9600250720977783,
"learning_rate": 4.795069084571211e-06,
"loss": 0.4242,
"step": 839
},
{
"epoch": 1.6901408450704225,
"grad_norm": 2.109365224838257,
"learning_rate": 4.7833658813002995e-06,
"loss": 0.4403,
"step": 840
},
{
"epoch": 1.6921529175050303,
"grad_norm": 2.030219793319702,
"learning_rate": 4.7716638669937784e-06,
"loss": 0.4917,
"step": 841
},
{
"epoch": 1.6941649899396378,
"grad_norm": 1.9466344118118286,
"learning_rate": 4.759963105876428e-06,
"loss": 0.4385,
"step": 842
},
{
"epoch": 1.6961770623742454,
"grad_norm": 2.284513473510742,
"learning_rate": 4.748263662166145e-06,
"loss": 0.4599,
"step": 843
},
{
"epoch": 1.6981891348088531,
"grad_norm": 2.1511683464050293,
"learning_rate": 4.736565600073602e-06,
"loss": 0.4905,
"step": 844
},
{
"epoch": 1.700201207243461,
"grad_norm": 2.027404308319092,
"learning_rate": 4.724868983801889e-06,
"loss": 0.464,
"step": 845
},
{
"epoch": 1.7022132796780685,
"grad_norm": 2.0017733573913574,
"learning_rate": 4.713173877546155e-06,
"loss": 0.4683,
"step": 846
},
{
"epoch": 1.704225352112676,
"grad_norm": 2.067063331604004,
"learning_rate": 4.701480345493266e-06,
"loss": 0.4882,
"step": 847
},
{
"epoch": 1.7062374245472838,
"grad_norm": 2.250122547149658,
"learning_rate": 4.689788451821445e-06,
"loss": 0.4518,
"step": 848
},
{
"epoch": 1.7082494969818913,
"grad_norm": 1.9627032279968262,
"learning_rate": 4.678098260699928e-06,
"loss": 0.4553,
"step": 849
},
{
"epoch": 1.7102615694164989,
"grad_norm": 2.173635959625244,
"learning_rate": 4.666409836288603e-06,
"loss": 0.454,
"step": 850
},
{
"epoch": 1.7122736418511066,
"grad_norm": 2.029207944869995,
"learning_rate": 4.654723242737661e-06,
"loss": 0.4599,
"step": 851
},
{
"epoch": 1.7142857142857144,
"grad_norm": 2.130281925201416,
"learning_rate": 4.643038544187246e-06,
"loss": 0.4592,
"step": 852
},
{
"epoch": 1.716297786720322,
"grad_norm": 1.9927594661712646,
"learning_rate": 4.631355804767106e-06,
"loss": 0.481,
"step": 853
},
{
"epoch": 1.7183098591549295,
"grad_norm": 2.2229726314544678,
"learning_rate": 4.619675088596226e-06,
"loss": 0.4815,
"step": 854
},
{
"epoch": 1.7203219315895373,
"grad_norm": 2.119466781616211,
"learning_rate": 4.607996459782498e-06,
"loss": 0.4562,
"step": 855
},
{
"epoch": 1.7223340040241448,
"grad_norm": 2.228403329849243,
"learning_rate": 4.596319982422348e-06,
"loss": 0.4897,
"step": 856
},
{
"epoch": 1.7243460764587524,
"grad_norm": 2.0544867515563965,
"learning_rate": 4.584645720600403e-06,
"loss": 0.4711,
"step": 857
},
{
"epoch": 1.7263581488933601,
"grad_norm": 1.99813711643219,
"learning_rate": 4.572973738389124e-06,
"loss": 0.4732,
"step": 858
},
{
"epoch": 1.728370221327968,
"grad_norm": 2.0984408855438232,
"learning_rate": 4.561304099848464e-06,
"loss": 0.4765,
"step": 859
},
{
"epoch": 1.7303822937625755,
"grad_norm": 2.145559310913086,
"learning_rate": 4.549636869025511e-06,
"loss": 0.4586,
"step": 860
},
{
"epoch": 1.732394366197183,
"grad_norm": 2.286984443664551,
"learning_rate": 4.5379721099541385e-06,
"loss": 0.4723,
"step": 861
},
{
"epoch": 1.7344064386317908,
"grad_norm": 1.8818014860153198,
"learning_rate": 4.526309886654659e-06,
"loss": 0.4366,
"step": 862
},
{
"epoch": 1.7364185110663986,
"grad_norm": 1.8989081382751465,
"learning_rate": 4.514650263133461e-06,
"loss": 0.4338,
"step": 863
},
{
"epoch": 1.7384305835010059,
"grad_norm": 2.0375852584838867,
"learning_rate": 4.502993303382669e-06,
"loss": 0.4514,
"step": 864
},
{
"epoch": 1.7404426559356136,
"grad_norm": 2.0181658267974854,
"learning_rate": 4.491339071379783e-06,
"loss": 0.4441,
"step": 865
},
{
"epoch": 1.7424547283702214,
"grad_norm": 2.303046941757202,
"learning_rate": 4.47968763108734e-06,
"loss": 0.4722,
"step": 866
},
{
"epoch": 1.744466800804829,
"grad_norm": 2.083085298538208,
"learning_rate": 4.46803904645255e-06,
"loss": 0.4504,
"step": 867
},
{
"epoch": 1.7464788732394365,
"grad_norm": 2.0313520431518555,
"learning_rate": 4.4563933814069475e-06,
"loss": 0.4545,
"step": 868
},
{
"epoch": 1.7484909456740443,
"grad_norm": 2.080291748046875,
"learning_rate": 4.444750699866047e-06,
"loss": 0.444,
"step": 869
},
{
"epoch": 1.750503018108652,
"grad_norm": 2.0764756202697754,
"learning_rate": 4.433111065728992e-06,
"loss": 0.4334,
"step": 870
},
{
"epoch": 1.7525150905432596,
"grad_norm": 1.9911096096038818,
"learning_rate": 4.4214745428781946e-06,
"loss": 0.4635,
"step": 871
},
{
"epoch": 1.7545271629778671,
"grad_norm": 2.1145546436309814,
"learning_rate": 4.409841195178991e-06,
"loss": 0.4674,
"step": 872
},
{
"epoch": 1.756539235412475,
"grad_norm": 2.2299070358276367,
"learning_rate": 4.3982110864792956e-06,
"loss": 0.4466,
"step": 873
},
{
"epoch": 1.7585513078470825,
"grad_norm": 2.0294342041015625,
"learning_rate": 4.386584280609242e-06,
"loss": 0.4633,
"step": 874
},
{
"epoch": 1.76056338028169,
"grad_norm": 1.9908781051635742,
"learning_rate": 4.37496084138084e-06,
"loss": 0.433,
"step": 875
},
{
"epoch": 1.7625754527162978,
"grad_norm": 2.0967750549316406,
"learning_rate": 4.363340832587621e-06,
"loss": 0.4488,
"step": 876
},
{
"epoch": 1.7645875251509056,
"grad_norm": 1.912702202796936,
"learning_rate": 4.351724318004286e-06,
"loss": 0.4405,
"step": 877
},
{
"epoch": 1.766599597585513,
"grad_norm": 1.9199731349945068,
"learning_rate": 4.340111361386361e-06,
"loss": 0.4658,
"step": 878
},
{
"epoch": 1.7686116700201207,
"grad_norm": 2.046421527862549,
"learning_rate": 4.328502026469849e-06,
"loss": 0.4595,
"step": 879
},
{
"epoch": 1.7706237424547284,
"grad_norm": 2.1802546977996826,
"learning_rate": 4.316896376970866e-06,
"loss": 0.4886,
"step": 880
},
{
"epoch": 1.772635814889336,
"grad_norm": 2.084728240966797,
"learning_rate": 4.305294476585312e-06,
"loss": 0.4269,
"step": 881
},
{
"epoch": 1.7746478873239435,
"grad_norm": 2.1468799114227295,
"learning_rate": 4.293696388988498e-06,
"loss": 0.4661,
"step": 882
},
{
"epoch": 1.7766599597585513,
"grad_norm": 1.940514087677002,
"learning_rate": 4.282102177834822e-06,
"loss": 0.4365,
"step": 883
},
{
"epoch": 1.778672032193159,
"grad_norm": 2.1592133045196533,
"learning_rate": 4.2705119067574006e-06,
"loss": 0.4283,
"step": 884
},
{
"epoch": 1.7806841046277666,
"grad_norm": 2.060865879058838,
"learning_rate": 4.258925639367723e-06,
"loss": 0.433,
"step": 885
},
{
"epoch": 1.7826961770623742,
"grad_norm": 1.940653681755066,
"learning_rate": 4.2473434392553115e-06,
"loss": 0.4262,
"step": 886
},
{
"epoch": 1.784708249496982,
"grad_norm": 2.010108709335327,
"learning_rate": 4.235765369987358e-06,
"loss": 0.4684,
"step": 887
},
{
"epoch": 1.7867203219315897,
"grad_norm": 1.950357437133789,
"learning_rate": 4.224191495108391e-06,
"loss": 0.4145,
"step": 888
},
{
"epoch": 1.788732394366197,
"grad_norm": 2.0455737113952637,
"learning_rate": 4.212621878139912e-06,
"loss": 0.4411,
"step": 889
},
{
"epoch": 1.7907444668008048,
"grad_norm": 1.9487494230270386,
"learning_rate": 4.201056582580059e-06,
"loss": 0.4615,
"step": 890
},
{
"epoch": 1.7927565392354126,
"grad_norm": 2.0646378993988037,
"learning_rate": 4.189495671903246e-06,
"loss": 0.4776,
"step": 891
},
{
"epoch": 1.79476861167002,
"grad_norm": 1.9958351850509644,
"learning_rate": 4.177939209559828e-06,
"loss": 0.4467,
"step": 892
},
{
"epoch": 1.7967806841046277,
"grad_norm": 2.160418748855591,
"learning_rate": 4.1663872589757445e-06,
"loss": 0.4512,
"step": 893
},
{
"epoch": 1.7987927565392354,
"grad_norm": 2.015880584716797,
"learning_rate": 4.154839883552169e-06,
"loss": 0.4536,
"step": 894
},
{
"epoch": 1.8008048289738432,
"grad_norm": 2.0357439517974854,
"learning_rate": 4.143297146665167e-06,
"loss": 0.4459,
"step": 895
},
{
"epoch": 1.8028169014084507,
"grad_norm": 2.141810178756714,
"learning_rate": 4.131759111665349e-06,
"loss": 0.4381,
"step": 896
},
{
"epoch": 1.8048289738430583,
"grad_norm": 1.9679553508758545,
"learning_rate": 4.120225841877515e-06,
"loss": 0.4573,
"step": 897
},
{
"epoch": 1.806841046277666,
"grad_norm": 2.099289894104004,
"learning_rate": 4.108697400600316e-06,
"loss": 0.4425,
"step": 898
},
{
"epoch": 1.8088531187122736,
"grad_norm": 1.9328632354736328,
"learning_rate": 4.0971738511059e-06,
"loss": 0.4328,
"step": 899
},
{
"epoch": 1.8108651911468812,
"grad_norm": 1.991387128829956,
"learning_rate": 4.085655256639565e-06,
"loss": 0.4432,
"step": 900
},
{
"epoch": 1.812877263581489,
"grad_norm": 1.9772350788116455,
"learning_rate": 4.074141680419422e-06,
"loss": 0.4606,
"step": 901
},
{
"epoch": 1.8148893360160967,
"grad_norm": 2.063642978668213,
"learning_rate": 4.062633185636031e-06,
"loss": 0.4283,
"step": 902
},
{
"epoch": 1.8169014084507042,
"grad_norm": 2.158418655395508,
"learning_rate": 4.051129835452071e-06,
"loss": 0.4539,
"step": 903
},
{
"epoch": 1.8189134808853118,
"grad_norm": 2.029050827026367,
"learning_rate": 4.039631693001976e-06,
"loss": 0.4085,
"step": 904
},
{
"epoch": 1.8209255533199196,
"grad_norm": 2.1008615493774414,
"learning_rate": 4.028138821391611e-06,
"loss": 0.4713,
"step": 905
},
{
"epoch": 1.8229376257545271,
"grad_norm": 2.0045077800750732,
"learning_rate": 4.016651283697901e-06,
"loss": 0.4516,
"step": 906
},
{
"epoch": 1.8249496981891347,
"grad_norm": 1.9948246479034424,
"learning_rate": 4.005169142968503e-06,
"loss": 0.4344,
"step": 907
},
{
"epoch": 1.8269617706237424,
"grad_norm": 2.154141664505005,
"learning_rate": 3.99369246222145e-06,
"loss": 0.4837,
"step": 908
},
{
"epoch": 1.8289738430583502,
"grad_norm": 2.0279178619384766,
"learning_rate": 3.982221304444813e-06,
"loss": 0.4489,
"step": 909
},
{
"epoch": 1.8309859154929577,
"grad_norm": 2.0363872051239014,
"learning_rate": 3.970755732596349e-06,
"loss": 0.4596,
"step": 910
},
{
"epoch": 1.8329979879275653,
"grad_norm": 2.062293767929077,
"learning_rate": 3.959295809603155e-06,
"loss": 0.4652,
"step": 911
},
{
"epoch": 1.835010060362173,
"grad_norm": 1.976284384727478,
"learning_rate": 3.947841598361329e-06,
"loss": 0.4415,
"step": 912
},
{
"epoch": 1.8370221327967808,
"grad_norm": 2.208650827407837,
"learning_rate": 3.936393161735616e-06,
"loss": 0.4848,
"step": 913
},
{
"epoch": 1.8390342052313882,
"grad_norm": 2.186150550842285,
"learning_rate": 3.924950562559074e-06,
"loss": 0.4438,
"step": 914
},
{
"epoch": 1.841046277665996,
"grad_norm": 2.255722999572754,
"learning_rate": 3.91351386363272e-06,
"loss": 0.452,
"step": 915
},
{
"epoch": 1.8430583501006037,
"grad_norm": 2.2457187175750732,
"learning_rate": 3.902083127725186e-06,
"loss": 0.4987,
"step": 916
},
{
"epoch": 1.8450704225352113,
"grad_norm": 2.1687653064727783,
"learning_rate": 3.890658417572379e-06,
"loss": 0.484,
"step": 917
},
{
"epoch": 1.8470824949698188,
"grad_norm": 2.064213752746582,
"learning_rate": 3.879239795877139e-06,
"loss": 0.4827,
"step": 918
},
{
"epoch": 1.8490945674044266,
"grad_norm": 2.2020626068115234,
"learning_rate": 3.867827325308882e-06,
"loss": 0.4244,
"step": 919
},
{
"epoch": 1.8511066398390343,
"grad_norm": 2.0802299976348877,
"learning_rate": 3.8564210685032695e-06,
"loss": 0.4633,
"step": 920
},
{
"epoch": 1.8531187122736419,
"grad_norm": 2.077509641647339,
"learning_rate": 3.845021088061858e-06,
"loss": 0.4269,
"step": 921
},
{
"epoch": 1.8551307847082494,
"grad_norm": 2.064271926879883,
"learning_rate": 3.83362744655176e-06,
"loss": 0.4551,
"step": 922
},
{
"epoch": 1.8571428571428572,
"grad_norm": 1.8204706907272339,
"learning_rate": 3.822240206505293e-06,
"loss": 0.4576,
"step": 923
},
{
"epoch": 1.8591549295774648,
"grad_norm": 2.13497257232666,
"learning_rate": 3.810859430419646e-06,
"loss": 0.4544,
"step": 924
},
{
"epoch": 1.8611670020120723,
"grad_norm": 2.3285865783691406,
"learning_rate": 3.799485180756526e-06,
"loss": 0.4876,
"step": 925
},
{
"epoch": 1.86317907444668,
"grad_norm": 2.118795394897461,
"learning_rate": 3.788117519941825e-06,
"loss": 0.4532,
"step": 926
},
{
"epoch": 1.8651911468812878,
"grad_norm": 2.0889439582824707,
"learning_rate": 3.776756510365275e-06,
"loss": 0.448,
"step": 927
},
{
"epoch": 1.8672032193158954,
"grad_norm": 2.229954957962036,
"learning_rate": 3.765402214380095e-06,
"loss": 0.4849,
"step": 928
},
{
"epoch": 1.869215291750503,
"grad_norm": 2.2357869148254395,
"learning_rate": 3.7540546943026677e-06,
"loss": 0.431,
"step": 929
},
{
"epoch": 1.8712273641851107,
"grad_norm": 2.2264294624328613,
"learning_rate": 3.7427140124121774e-06,
"loss": 0.4364,
"step": 930
},
{
"epoch": 1.8732394366197183,
"grad_norm": 1.928287148475647,
"learning_rate": 3.731380230950288e-06,
"loss": 0.4132,
"step": 931
},
{
"epoch": 1.8752515090543258,
"grad_norm": 2.287692070007324,
"learning_rate": 3.720053412120784e-06,
"loss": 0.4247,
"step": 932
},
{
"epoch": 1.8772635814889336,
"grad_norm": 2.039332151412964,
"learning_rate": 3.7087336180892395e-06,
"loss": 0.4352,
"step": 933
},
{
"epoch": 1.8792756539235413,
"grad_norm": 2.0014684200286865,
"learning_rate": 3.6974209109826724e-06,
"loss": 0.4743,
"step": 934
},
{
"epoch": 1.881287726358149,
"grad_norm": 2.0897486209869385,
"learning_rate": 3.686115352889209e-06,
"loss": 0.4952,
"step": 935
},
{
"epoch": 1.8832997987927564,
"grad_norm": 2.2058608531951904,
"learning_rate": 3.674817005857735e-06,
"loss": 0.4366,
"step": 936
},
{
"epoch": 1.8853118712273642,
"grad_norm": 2.0209622383117676,
"learning_rate": 3.663525931897559e-06,
"loss": 0.4689,
"step": 937
},
{
"epoch": 1.887323943661972,
"grad_norm": 2.1546790599823,
"learning_rate": 3.6522421929780746e-06,
"loss": 0.4743,
"step": 938
},
{
"epoch": 1.8893360160965795,
"grad_norm": 2.0966379642486572,
"learning_rate": 3.6409658510284208e-06,
"loss": 0.4299,
"step": 939
},
{
"epoch": 1.891348088531187,
"grad_norm": 1.9026216268539429,
"learning_rate": 3.6296969679371325e-06,
"loss": 0.4367,
"step": 940
},
{
"epoch": 1.8933601609657948,
"grad_norm": 1.863969326019287,
"learning_rate": 3.6184356055518143e-06,
"loss": 0.423,
"step": 941
},
{
"epoch": 1.8953722334004024,
"grad_norm": 2.201620101928711,
"learning_rate": 3.6071818256787906e-06,
"loss": 0.4244,
"step": 942
},
{
"epoch": 1.89738430583501,
"grad_norm": 2.143632650375366,
"learning_rate": 3.595935690082769e-06,
"loss": 0.4744,
"step": 943
},
{
"epoch": 1.8993963782696177,
"grad_norm": 2.0650856494903564,
"learning_rate": 3.5846972604865103e-06,
"loss": 0.4696,
"step": 944
},
{
"epoch": 1.9014084507042255,
"grad_norm": 2.024775743484497,
"learning_rate": 3.5734665985704732e-06,
"loss": 0.4614,
"step": 945
},
{
"epoch": 1.903420523138833,
"grad_norm": 2.026299476623535,
"learning_rate": 3.56224376597249e-06,
"loss": 0.4446,
"step": 946
},
{
"epoch": 1.9054325955734406,
"grad_norm": 1.9642820358276367,
"learning_rate": 3.551028824287418e-06,
"loss": 0.4592,
"step": 947
},
{
"epoch": 1.9074446680080483,
"grad_norm": 2.067823648452759,
"learning_rate": 3.5398218350668136e-06,
"loss": 0.4665,
"step": 948
},
{
"epoch": 1.909456740442656,
"grad_norm": 1.9692931175231934,
"learning_rate": 3.528622859818582e-06,
"loss": 0.4287,
"step": 949
},
{
"epoch": 1.9114688128772634,
"grad_norm": 2.2274701595306396,
"learning_rate": 3.517431960006645e-06,
"loss": 0.4307,
"step": 950
},
{
"epoch": 1.9134808853118712,
"grad_norm": 1.988458514213562,
"learning_rate": 3.506249197050604e-06,
"loss": 0.434,
"step": 951
},
{
"epoch": 1.915492957746479,
"grad_norm": 2.008466958999634,
"learning_rate": 3.495074632325407e-06,
"loss": 0.4262,
"step": 952
},
{
"epoch": 1.9175050301810865,
"grad_norm": 2.0033493041992188,
"learning_rate": 3.4839083271610007e-06,
"loss": 0.4252,
"step": 953
},
{
"epoch": 1.919517102615694,
"grad_norm": 1.943579077720642,
"learning_rate": 3.472750342842003e-06,
"loss": 0.4361,
"step": 954
},
{
"epoch": 1.9215291750503019,
"grad_norm": 2.111849069595337,
"learning_rate": 3.461600740607366e-06,
"loss": 0.46,
"step": 955
},
{
"epoch": 1.9235412474849096,
"grad_norm": 2.095845937728882,
"learning_rate": 3.4504595816500318e-06,
"loss": 0.4766,
"step": 956
},
{
"epoch": 1.925553319919517,
"grad_norm": 1.9449495077133179,
"learning_rate": 3.4393269271166117e-06,
"loss": 0.4518,
"step": 957
},
{
"epoch": 1.9275653923541247,
"grad_norm": 2.125091552734375,
"learning_rate": 3.4282028381070366e-06,
"loss": 0.4408,
"step": 958
},
{
"epoch": 1.9295774647887325,
"grad_norm": 2.1160356998443604,
"learning_rate": 3.4170873756742263e-06,
"loss": 0.4429,
"step": 959
},
{
"epoch": 1.93158953722334,
"grad_norm": 1.942185640335083,
"learning_rate": 3.405980600823754e-06,
"loss": 0.4197,
"step": 960
},
{
"epoch": 1.9336016096579476,
"grad_norm": 2.013988733291626,
"learning_rate": 3.3948825745135196e-06,
"loss": 0.4262,
"step": 961
},
{
"epoch": 1.9356136820925554,
"grad_norm": 2.161773443222046,
"learning_rate": 3.383793357653398e-06,
"loss": 0.48,
"step": 962
},
{
"epoch": 1.9376257545271631,
"grad_norm": 2.071415662765503,
"learning_rate": 3.372713011104922e-06,
"loss": 0.4106,
"step": 963
},
{
"epoch": 1.9396378269617707,
"grad_norm": 2.229872465133667,
"learning_rate": 3.361641595680937e-06,
"loss": 0.4894,
"step": 964
},
{
"epoch": 1.9416498993963782,
"grad_norm": 1.8888545036315918,
"learning_rate": 3.350579172145273e-06,
"loss": 0.4299,
"step": 965
},
{
"epoch": 1.943661971830986,
"grad_norm": 1.9907033443450928,
"learning_rate": 3.3395258012124103e-06,
"loss": 0.4731,
"step": 966
},
{
"epoch": 1.9456740442655935,
"grad_norm": 2.101872682571411,
"learning_rate": 3.3284815435471423e-06,
"loss": 0.4627,
"step": 967
},
{
"epoch": 1.947686116700201,
"grad_norm": 1.9270325899124146,
"learning_rate": 3.3174464597642497e-06,
"loss": 0.4535,
"step": 968
},
{
"epoch": 1.9496981891348089,
"grad_norm": 2.015502691268921,
"learning_rate": 3.306420610428157e-06,
"loss": 0.4303,
"step": 969
},
{
"epoch": 1.9517102615694166,
"grad_norm": 2.1776444911956787,
"learning_rate": 3.295404056052616e-06,
"loss": 0.4968,
"step": 970
},
{
"epoch": 1.9537223340040242,
"grad_norm": 2.200979232788086,
"learning_rate": 3.284396857100357e-06,
"loss": 0.4782,
"step": 971
},
{
"epoch": 1.9557344064386317,
"grad_norm": 2.23836612701416,
"learning_rate": 3.273399073982768e-06,
"loss": 0.4682,
"step": 972
},
{
"epoch": 1.9577464788732395,
"grad_norm": 2.088153839111328,
"learning_rate": 3.2624107670595567e-06,
"loss": 0.4557,
"step": 973
},
{
"epoch": 1.959758551307847,
"grad_norm": 2.098024606704712,
"learning_rate": 3.251431996638427e-06,
"loss": 0.4578,
"step": 974
},
{
"epoch": 1.9617706237424546,
"grad_norm": 1.946381688117981,
"learning_rate": 3.2404628229747386e-06,
"loss": 0.4263,
"step": 975
},
{
"epoch": 1.9637826961770624,
"grad_norm": 2.1083738803863525,
"learning_rate": 3.2295033062711823e-06,
"loss": 0.4683,
"step": 976
},
{
"epoch": 1.9657947686116701,
"grad_norm": 1.9927233457565308,
"learning_rate": 3.2185535066774477e-06,
"loss": 0.4478,
"step": 977
},
{
"epoch": 1.9678068410462777,
"grad_norm": 2.067403793334961,
"learning_rate": 3.2076134842898955e-06,
"loss": 0.4404,
"step": 978
},
{
"epoch": 1.9698189134808852,
"grad_norm": 2.1482181549072266,
"learning_rate": 3.1966832991512232e-06,
"loss": 0.4626,
"step": 979
},
{
"epoch": 1.971830985915493,
"grad_norm": 2.094623327255249,
"learning_rate": 3.1857630112501397e-06,
"loss": 0.4116,
"step": 980
},
{
"epoch": 1.9738430583501008,
"grad_norm": 2.0925683975219727,
"learning_rate": 3.174852680521032e-06,
"loss": 0.4319,
"step": 981
},
{
"epoch": 1.975855130784708,
"grad_norm": 2.095231056213379,
"learning_rate": 3.16395236684364e-06,
"loss": 0.447,
"step": 982
},
{
"epoch": 1.9778672032193159,
"grad_norm": 1.9157521724700928,
"learning_rate": 3.1530621300427294e-06,
"loss": 0.4593,
"step": 983
},
{
"epoch": 1.9798792756539236,
"grad_norm": 2.173527956008911,
"learning_rate": 3.1421820298877554e-06,
"loss": 0.4772,
"step": 984
},
{
"epoch": 1.9818913480885312,
"grad_norm": 2.125617742538452,
"learning_rate": 3.131312126092544e-06,
"loss": 0.439,
"step": 985
},
{
"epoch": 1.9839034205231387,
"grad_norm": 2.0524990558624268,
"learning_rate": 3.1204524783149546e-06,
"loss": 0.4433,
"step": 986
},
{
"epoch": 1.9859154929577465,
"grad_norm": 2.243478298187256,
"learning_rate": 3.1096031461565656e-06,
"loss": 0.4445,
"step": 987
},
{
"epoch": 1.9879275653923543,
"grad_norm": 2.210712194442749,
"learning_rate": 3.098764189162332e-06,
"loss": 0.4589,
"step": 988
},
{
"epoch": 1.9899396378269618,
"grad_norm": 2.2478127479553223,
"learning_rate": 3.087935666820273e-06,
"loss": 0.4519,
"step": 989
},
{
"epoch": 1.9919517102615694,
"grad_norm": 2.1954751014709473,
"learning_rate": 3.0771176385611318e-06,
"loss": 0.4132,
"step": 990
},
{
"epoch": 1.9939637826961771,
"grad_norm": 2.0039350986480713,
"learning_rate": 3.0663101637580626e-06,
"loss": 0.4582,
"step": 991
},
{
"epoch": 1.9959758551307847,
"grad_norm": 1.9488987922668457,
"learning_rate": 3.055513301726296e-06,
"loss": 0.4492,
"step": 992
},
{
"epoch": 1.9979879275653922,
"grad_norm": 1.9810996055603027,
"learning_rate": 3.044727111722815e-06,
"loss": 0.4541,
"step": 993
},
{
"epoch": 2.0,
"grad_norm": 2.0250370502471924,
"learning_rate": 3.03395165294603e-06,
"loss": 0.4507,
"step": 994
},
{
"epoch": 2.0020120724346078,
"grad_norm": 1.8620500564575195,
"learning_rate": 3.02318698453546e-06,
"loss": 0.351,
"step": 995
},
{
"epoch": 2.004024144869215,
"grad_norm": 1.8618104457855225,
"learning_rate": 3.0124331655713966e-06,
"loss": 0.3471,
"step": 996
},
{
"epoch": 2.006036217303823,
"grad_norm": 1.7663768529891968,
"learning_rate": 3.0016902550745896e-06,
"loss": 0.3199,
"step": 997
},
{
"epoch": 2.0080482897384306,
"grad_norm": 1.7676547765731812,
"learning_rate": 2.990958312005916e-06,
"loss": 0.3247,
"step": 998
},
{
"epoch": 2.0100603621730384,
"grad_norm": 1.8144701719284058,
"learning_rate": 2.980237395266061e-06,
"loss": 0.3384,
"step": 999
},
{
"epoch": 2.0120724346076457,
"grad_norm": 1.9220832586288452,
"learning_rate": 2.9695275636951983e-06,
"loss": 0.3346,
"step": 1000
},
{
"epoch": 2.0140845070422535,
"grad_norm": 1.9769055843353271,
"learning_rate": 2.958828876072654e-06,
"loss": 0.3497,
"step": 1001
},
{
"epoch": 2.0160965794768613,
"grad_norm": 1.8415639400482178,
"learning_rate": 2.9481413911165984e-06,
"loss": 0.3224,
"step": 1002
},
{
"epoch": 2.0181086519114686,
"grad_norm": 1.7870392799377441,
"learning_rate": 2.9374651674837128e-06,
"loss": 0.2993,
"step": 1003
},
{
"epoch": 2.0201207243460764,
"grad_norm": 1.8392082452774048,
"learning_rate": 2.9268002637688788e-06,
"loss": 0.2891,
"step": 1004
},
{
"epoch": 2.022132796780684,
"grad_norm": 2.053039073944092,
"learning_rate": 2.9161467385048425e-06,
"loss": 0.3221,
"step": 1005
},
{
"epoch": 2.024144869215292,
"grad_norm": 2.1062474250793457,
"learning_rate": 2.9055046501619088e-06,
"loss": 0.3138,
"step": 1006
},
{
"epoch": 2.0261569416498992,
"grad_norm": 2.1327919960021973,
"learning_rate": 2.894874057147606e-06,
"loss": 0.3319,
"step": 1007
},
{
"epoch": 2.028169014084507,
"grad_norm": 2.227560520172119,
"learning_rate": 2.8842550178063777e-06,
"loss": 0.3259,
"step": 1008
},
{
"epoch": 2.0301810865191148,
"grad_norm": 2.0540666580200195,
"learning_rate": 2.8736475904192516e-06,
"loss": 0.3203,
"step": 1009
},
{
"epoch": 2.0321931589537225,
"grad_norm": 2.228684663772583,
"learning_rate": 2.863051833203531e-06,
"loss": 0.3381,
"step": 1010
},
{
"epoch": 2.03420523138833,
"grad_norm": 2.0458016395568848,
"learning_rate": 2.852467804312463e-06,
"loss": 0.3369,
"step": 1011
},
{
"epoch": 2.0362173038229376,
"grad_norm": 2.0923287868499756,
"learning_rate": 2.841895561834927e-06,
"loss": 0.314,
"step": 1012
},
{
"epoch": 2.0382293762575454,
"grad_norm": 1.8492389917373657,
"learning_rate": 2.8313351637951196e-06,
"loss": 0.3044,
"step": 1013
},
{
"epoch": 2.0402414486921527,
"grad_norm": 1.9469666481018066,
"learning_rate": 2.8207866681522233e-06,
"loss": 0.3365,
"step": 1014
},
{
"epoch": 2.0422535211267605,
"grad_norm": 1.9745899438858032,
"learning_rate": 2.810250132800103e-06,
"loss": 0.3094,
"step": 1015
},
{
"epoch": 2.0442655935613683,
"grad_norm": 2.1182546615600586,
"learning_rate": 2.7997256155669737e-06,
"loss": 0.3388,
"step": 1016
},
{
"epoch": 2.046277665995976,
"grad_norm": 1.8506799936294556,
"learning_rate": 2.7892131742151007e-06,
"loss": 0.3286,
"step": 1017
},
{
"epoch": 2.0482897384305834,
"grad_norm": 1.8766345977783203,
"learning_rate": 2.778712866440464e-06,
"loss": 0.3427,
"step": 1018
},
{
"epoch": 2.050301810865191,
"grad_norm": 2.0145444869995117,
"learning_rate": 2.7682247498724536e-06,
"loss": 0.362,
"step": 1019
},
{
"epoch": 2.052313883299799,
"grad_norm": 1.8069449663162231,
"learning_rate": 2.7577488820735465e-06,
"loss": 0.3251,
"step": 1020
},
{
"epoch": 2.0543259557344062,
"grad_norm": 1.952471375465393,
"learning_rate": 2.7472853205389997e-06,
"loss": 0.3323,
"step": 1021
},
{
"epoch": 2.056338028169014,
"grad_norm": 1.8935775756835938,
"learning_rate": 2.736834122696529e-06,
"loss": 0.3165,
"step": 1022
},
{
"epoch": 2.058350100603622,
"grad_norm": 2.1522717475891113,
"learning_rate": 2.7263953459059888e-06,
"loss": 0.3245,
"step": 1023
},
{
"epoch": 2.0603621730382295,
"grad_norm": 1.9437185525894165,
"learning_rate": 2.715969047459066e-06,
"loss": 0.3144,
"step": 1024
},
{
"epoch": 2.062374245472837,
"grad_norm": 2.134711980819702,
"learning_rate": 2.705555284578958e-06,
"loss": 0.3343,
"step": 1025
},
{
"epoch": 2.0643863179074446,
"grad_norm": 1.7659924030303955,
"learning_rate": 2.6951541144200676e-06,
"loss": 0.3019,
"step": 1026
},
{
"epoch": 2.0663983903420524,
"grad_norm": 2.057668685913086,
"learning_rate": 2.6847655940676843e-06,
"loss": 0.3173,
"step": 1027
},
{
"epoch": 2.0684104627766597,
"grad_norm": 1.9060618877410889,
"learning_rate": 2.6743897805376672e-06,
"loss": 0.3124,
"step": 1028
},
{
"epoch": 2.0704225352112675,
"grad_norm": 1.781306505203247,
"learning_rate": 2.664026730776136e-06,
"loss": 0.2955,
"step": 1029
},
{
"epoch": 2.0724346076458753,
"grad_norm": 1.9370532035827637,
"learning_rate": 2.6536765016591626e-06,
"loss": 0.3154,
"step": 1030
},
{
"epoch": 2.074446680080483,
"grad_norm": 1.8034547567367554,
"learning_rate": 2.64333914999245e-06,
"loss": 0.313,
"step": 1031
},
{
"epoch": 2.0764587525150904,
"grad_norm": 1.9127081632614136,
"learning_rate": 2.63301473251103e-06,
"loss": 0.325,
"step": 1032
},
{
"epoch": 2.078470824949698,
"grad_norm": 1.9056042432785034,
"learning_rate": 2.622703305878941e-06,
"loss": 0.312,
"step": 1033
},
{
"epoch": 2.080482897384306,
"grad_norm": 1.922743797302246,
"learning_rate": 2.6124049266889296e-06,
"loss": 0.3124,
"step": 1034
},
{
"epoch": 2.0824949698189137,
"grad_norm": 1.9337576627731323,
"learning_rate": 2.6021196514621283e-06,
"loss": 0.3069,
"step": 1035
},
{
"epoch": 2.084507042253521,
"grad_norm": 2.0789332389831543,
"learning_rate": 2.5918475366477536e-06,
"loss": 0.3276,
"step": 1036
},
{
"epoch": 2.086519114688129,
"grad_norm": 1.9439971446990967,
"learning_rate": 2.5815886386227882e-06,
"loss": 0.3067,
"step": 1037
},
{
"epoch": 2.0885311871227366,
"grad_norm": 1.9567536115646362,
"learning_rate": 2.5713430136916828e-06,
"loss": 0.3264,
"step": 1038
},
{
"epoch": 2.090543259557344,
"grad_norm": 1.9910478591918945,
"learning_rate": 2.5611107180860395e-06,
"loss": 0.3422,
"step": 1039
},
{
"epoch": 2.0925553319919517,
"grad_norm": 1.9096462726593018,
"learning_rate": 2.5508918079643e-06,
"loss": 0.3087,
"step": 1040
},
{
"epoch": 2.0945674044265594,
"grad_norm": 1.9407165050506592,
"learning_rate": 2.540686339411446e-06,
"loss": 0.3083,
"step": 1041
},
{
"epoch": 2.096579476861167,
"grad_norm": 2.0691964626312256,
"learning_rate": 2.530494368438683e-06,
"loss": 0.32,
"step": 1042
},
{
"epoch": 2.0985915492957745,
"grad_norm": 1.9105881452560425,
"learning_rate": 2.520315950983141e-06,
"loss": 0.3293,
"step": 1043
},
{
"epoch": 2.1006036217303823,
"grad_norm": 2.024200916290283,
"learning_rate": 2.5101511429075654e-06,
"loss": 0.3245,
"step": 1044
},
{
"epoch": 2.10261569416499,
"grad_norm": 1.9861524105072021,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.3235,
"step": 1045
},
{
"epoch": 2.1046277665995974,
"grad_norm": 2.02134370803833,
"learning_rate": 2.489862577973498e-06,
"loss": 0.3244,
"step": 1046
},
{
"epoch": 2.106639839034205,
"grad_norm": 1.9421387910842896,
"learning_rate": 2.4797389324658037e-06,
"loss": 0.3215,
"step": 1047
},
{
"epoch": 2.108651911468813,
"grad_norm": 2.0010151863098145,
"learning_rate": 2.4696291190390494e-06,
"loss": 0.3157,
"step": 1048
},
{
"epoch": 2.1106639839034207,
"grad_norm": 1.8834630250930786,
"learning_rate": 2.459533193179457e-06,
"loss": 0.31,
"step": 1049
},
{
"epoch": 2.112676056338028,
"grad_norm": 1.9237369298934937,
"learning_rate": 2.4494512102970247e-06,
"loss": 0.3048,
"step": 1050
},
{
"epoch": 2.114688128772636,
"grad_norm": 1.9493533372879028,
"learning_rate": 2.4393832257252253e-06,
"loss": 0.3194,
"step": 1051
},
{
"epoch": 2.1167002012072436,
"grad_norm": 1.905882716178894,
"learning_rate": 2.42932929472071e-06,
"loss": 0.3211,
"step": 1052
},
{
"epoch": 2.118712273641851,
"grad_norm": 2.0549941062927246,
"learning_rate": 2.4192894724629943e-06,
"loss": 0.3362,
"step": 1053
},
{
"epoch": 2.1207243460764587,
"grad_norm": 2.161760091781616,
"learning_rate": 2.4092638140541586e-06,
"loss": 0.3293,
"step": 1054
},
{
"epoch": 2.1227364185110664,
"grad_norm": 1.8797554969787598,
"learning_rate": 2.399252374518551e-06,
"loss": 0.2915,
"step": 1055
},
{
"epoch": 2.124748490945674,
"grad_norm": 1.9617598056793213,
"learning_rate": 2.3892552088024827e-06,
"loss": 0.3111,
"step": 1056
},
{
"epoch": 2.1267605633802815,
"grad_norm": 1.9987127780914307,
"learning_rate": 2.3792723717739197e-06,
"loss": 0.319,
"step": 1057
},
{
"epoch": 2.1287726358148893,
"grad_norm": 1.9436331987380981,
"learning_rate": 2.3693039182221907e-06,
"loss": 0.3167,
"step": 1058
},
{
"epoch": 2.130784708249497,
"grad_norm": 1.9609178304672241,
"learning_rate": 2.3593499028576793e-06,
"loss": 0.3181,
"step": 1059
},
{
"epoch": 2.132796780684105,
"grad_norm": 1.8295328617095947,
"learning_rate": 2.349410380311532e-06,
"loss": 0.3098,
"step": 1060
},
{
"epoch": 2.134808853118712,
"grad_norm": 1.9270719289779663,
"learning_rate": 2.3394854051353534e-06,
"loss": 0.3213,
"step": 1061
},
{
"epoch": 2.13682092555332,
"grad_norm": 1.8924223184585571,
"learning_rate": 2.329575031800903e-06,
"loss": 0.3164,
"step": 1062
},
{
"epoch": 2.1388329979879277,
"grad_norm": 2.010369062423706,
"learning_rate": 2.319679314699801e-06,
"loss": 0.3217,
"step": 1063
},
{
"epoch": 2.140845070422535,
"grad_norm": 1.8954033851623535,
"learning_rate": 2.3097983081432334e-06,
"loss": 0.3241,
"step": 1064
},
{
"epoch": 2.142857142857143,
"grad_norm": 2.061217784881592,
"learning_rate": 2.299932066361643e-06,
"loss": 0.3274,
"step": 1065
},
{
"epoch": 2.1448692152917506,
"grad_norm": 2.0788443088531494,
"learning_rate": 2.290080643504446e-06,
"loss": 0.3092,
"step": 1066
},
{
"epoch": 2.1468812877263583,
"grad_norm": 1.9684137105941772,
"learning_rate": 2.2802440936397203e-06,
"loss": 0.3227,
"step": 1067
},
{
"epoch": 2.1488933601609657,
"grad_norm": 1.9991480112075806,
"learning_rate": 2.2704224707539164e-06,
"loss": 0.3041,
"step": 1068
},
{
"epoch": 2.1509054325955734,
"grad_norm": 1.926109790802002,
"learning_rate": 2.2606158287515662e-06,
"loss": 0.3132,
"step": 1069
},
{
"epoch": 2.152917505030181,
"grad_norm": 1.8808412551879883,
"learning_rate": 2.250824221454976e-06,
"loss": 0.302,
"step": 1070
},
{
"epoch": 2.1549295774647885,
"grad_norm": 2.0052027702331543,
"learning_rate": 2.2410477026039335e-06,
"loss": 0.3344,
"step": 1071
},
{
"epoch": 2.1569416498993963,
"grad_norm": 1.9723577499389648,
"learning_rate": 2.2312863258554236e-06,
"loss": 0.3377,
"step": 1072
},
{
"epoch": 2.158953722334004,
"grad_norm": 1.9339603185653687,
"learning_rate": 2.221540144783323e-06,
"loss": 0.3195,
"step": 1073
},
{
"epoch": 2.160965794768612,
"grad_norm": 1.9883408546447754,
"learning_rate": 2.211809212878106e-06,
"loss": 0.3129,
"step": 1074
},
{
"epoch": 2.162977867203219,
"grad_norm": 1.8902078866958618,
"learning_rate": 2.2020935835465567e-06,
"loss": 0.31,
"step": 1075
},
{
"epoch": 2.164989939637827,
"grad_norm": 2.026864528656006,
"learning_rate": 2.1923933101114713e-06,
"loss": 0.3211,
"step": 1076
},
{
"epoch": 2.1670020120724347,
"grad_norm": 1.993359923362732,
"learning_rate": 2.182708445811371e-06,
"loss": 0.3295,
"step": 1077
},
{
"epoch": 2.169014084507042,
"grad_norm": 1.8539053201675415,
"learning_rate": 2.1730390438002056e-06,
"loss": 0.2943,
"step": 1078
},
{
"epoch": 2.17102615694165,
"grad_norm": 1.9406895637512207,
"learning_rate": 2.1633851571470595e-06,
"loss": 0.3138,
"step": 1079
},
{
"epoch": 2.1730382293762576,
"grad_norm": 1.890519142150879,
"learning_rate": 2.1537468388358645e-06,
"loss": 0.3266,
"step": 1080
},
{
"epoch": 2.1750503018108653,
"grad_norm": 1.8422409296035767,
"learning_rate": 2.1441241417651072e-06,
"loss": 0.3042,
"step": 1081
},
{
"epoch": 2.1770623742454727,
"grad_norm": 2.003852128982544,
"learning_rate": 2.134517118747541e-06,
"loss": 0.3173,
"step": 1082
},
{
"epoch": 2.1790744466800804,
"grad_norm": 1.9138869047164917,
"learning_rate": 2.1249258225098974e-06,
"loss": 0.3044,
"step": 1083
},
{
"epoch": 2.181086519114688,
"grad_norm": 2.0287420749664307,
"learning_rate": 2.1153503056925872e-06,
"loss": 0.3354,
"step": 1084
},
{
"epoch": 2.183098591549296,
"grad_norm": 1.9738901853561401,
"learning_rate": 2.1057906208494204e-06,
"loss": 0.3211,
"step": 1085
},
{
"epoch": 2.1851106639839033,
"grad_norm": 1.9624329805374146,
"learning_rate": 2.09624682044732e-06,
"loss": 0.3349,
"step": 1086
},
{
"epoch": 2.187122736418511,
"grad_norm": 2.0174593925476074,
"learning_rate": 2.086718956866024e-06,
"loss": 0.313,
"step": 1087
},
{
"epoch": 2.189134808853119,
"grad_norm": 2.0925774574279785,
"learning_rate": 2.0772070823978034e-06,
"loss": 0.3098,
"step": 1088
},
{
"epoch": 2.191146881287726,
"grad_norm": 1.9644815921783447,
"learning_rate": 2.06771124924718e-06,
"loss": 0.3177,
"step": 1089
},
{
"epoch": 2.193158953722334,
"grad_norm": 1.8980356454849243,
"learning_rate": 2.0582315095306343e-06,
"loss": 0.3178,
"step": 1090
},
{
"epoch": 2.1951710261569417,
"grad_norm": 1.9412862062454224,
"learning_rate": 2.0487679152763173e-06,
"loss": 0.3026,
"step": 1091
},
{
"epoch": 2.1971830985915495,
"grad_norm": 1.9344528913497925,
"learning_rate": 2.0393205184237687e-06,
"loss": 0.339,
"step": 1092
},
{
"epoch": 2.199195171026157,
"grad_norm": 1.8947501182556152,
"learning_rate": 2.0298893708236307e-06,
"loss": 0.3275,
"step": 1093
},
{
"epoch": 2.2012072434607646,
"grad_norm": 1.942131757736206,
"learning_rate": 2.0204745242373665e-06,
"loss": 0.3351,
"step": 1094
},
{
"epoch": 2.2032193158953723,
"grad_norm": 1.9295439720153809,
"learning_rate": 2.011076030336974e-06,
"loss": 0.3167,
"step": 1095
},
{
"epoch": 2.20523138832998,
"grad_norm": 2.072382926940918,
"learning_rate": 2.0016939407046987e-06,
"loss": 0.3532,
"step": 1096
},
{
"epoch": 2.2072434607645874,
"grad_norm": 2.0070250034332275,
"learning_rate": 1.992328306832755e-06,
"loss": 0.3542,
"step": 1097
},
{
"epoch": 2.209255533199195,
"grad_norm": 1.749671220779419,
"learning_rate": 1.9829791801230398e-06,
"loss": 0.2876,
"step": 1098
},
{
"epoch": 2.211267605633803,
"grad_norm": 1.9479471445083618,
"learning_rate": 1.9736466118868573e-06,
"loss": 0.342,
"step": 1099
},
{
"epoch": 2.2132796780684103,
"grad_norm": 1.8488900661468506,
"learning_rate": 1.9643306533446332e-06,
"loss": 0.3147,
"step": 1100
},
{
"epoch": 2.215291750503018,
"grad_norm": 2.0866432189941406,
"learning_rate": 1.9550313556256294e-06,
"loss": 0.3121,
"step": 1101
},
{
"epoch": 2.217303822937626,
"grad_norm": 1.9821211099624634,
"learning_rate": 1.945748769767667e-06,
"loss": 0.3254,
"step": 1102
},
{
"epoch": 2.219315895372233,
"grad_norm": 1.9831209182739258,
"learning_rate": 1.9364829467168522e-06,
"loss": 0.3243,
"step": 1103
},
{
"epoch": 2.221327967806841,
"grad_norm": 1.8965582847595215,
"learning_rate": 1.927233937327285e-06,
"loss": 0.319,
"step": 1104
},
{
"epoch": 2.2233400402414487,
"grad_norm": 1.8991901874542236,
"learning_rate": 1.9180017923607884e-06,
"loss": 0.3179,
"step": 1105
},
{
"epoch": 2.2253521126760565,
"grad_norm": 1.933734655380249,
"learning_rate": 1.9087865624866297e-06,
"loss": 0.3194,
"step": 1106
},
{
"epoch": 2.227364185110664,
"grad_norm": 1.9359025955200195,
"learning_rate": 1.8995882982812352e-06,
"loss": 0.3141,
"step": 1107
},
{
"epoch": 2.2293762575452716,
"grad_norm": 2.0395710468292236,
"learning_rate": 1.8904070502279242e-06,
"loss": 0.329,
"step": 1108
},
{
"epoch": 2.2313883299798793,
"grad_norm": 1.9852657318115234,
"learning_rate": 1.8812428687166195e-06,
"loss": 0.3519,
"step": 1109
},
{
"epoch": 2.233400402414487,
"grad_norm": 2.0795342922210693,
"learning_rate": 1.8720958040435772e-06,
"loss": 0.3267,
"step": 1110
},
{
"epoch": 2.2354124748490944,
"grad_norm": 2.028374433517456,
"learning_rate": 1.8629659064111138e-06,
"loss": 0.3324,
"step": 1111
},
{
"epoch": 2.237424547283702,
"grad_norm": 1.9261505603790283,
"learning_rate": 1.8538532259273272e-06,
"loss": 0.3209,
"step": 1112
},
{
"epoch": 2.23943661971831,
"grad_norm": 2.0454418659210205,
"learning_rate": 1.844757812605817e-06,
"loss": 0.2846,
"step": 1113
},
{
"epoch": 2.2414486921529173,
"grad_norm": 1.9863117933273315,
"learning_rate": 1.8356797163654172e-06,
"loss": 0.3121,
"step": 1114
},
{
"epoch": 2.243460764587525,
"grad_norm": 2.1988511085510254,
"learning_rate": 1.8266189870299184e-06,
"loss": 0.3031,
"step": 1115
},
{
"epoch": 2.245472837022133,
"grad_norm": 2.0179038047790527,
"learning_rate": 1.8175756743277967e-06,
"loss": 0.3246,
"step": 1116
},
{
"epoch": 2.2474849094567406,
"grad_norm": 1.9999439716339111,
"learning_rate": 1.8085498278919421e-06,
"loss": 0.3066,
"step": 1117
},
{
"epoch": 2.249496981891348,
"grad_norm": 1.915867567062378,
"learning_rate": 1.7995414972593784e-06,
"loss": 0.3303,
"step": 1118
},
{
"epoch": 2.2515090543259557,
"grad_norm": 2.091055154800415,
"learning_rate": 1.7905507318709997e-06,
"loss": 0.3478,
"step": 1119
},
{
"epoch": 2.2535211267605635,
"grad_norm": 1.8792387247085571,
"learning_rate": 1.7815775810712921e-06,
"loss": 0.3155,
"step": 1120
},
{
"epoch": 2.2555331991951713,
"grad_norm": 2.0227322578430176,
"learning_rate": 1.772622094108074e-06,
"loss": 0.3273,
"step": 1121
},
{
"epoch": 2.2575452716297786,
"grad_norm": 1.9454227685928345,
"learning_rate": 1.7636843201322106e-06,
"loss": 0.3376,
"step": 1122
},
{
"epoch": 2.2595573440643864,
"grad_norm": 1.826862096786499,
"learning_rate": 1.754764308197358e-06,
"loss": 0.3242,
"step": 1123
},
{
"epoch": 2.261569416498994,
"grad_norm": 1.9834349155426025,
"learning_rate": 1.7458621072596827e-06,
"loss": 0.318,
"step": 1124
},
{
"epoch": 2.2635814889336014,
"grad_norm": 1.9824222326278687,
"learning_rate": 1.7369777661776032e-06,
"loss": 0.3243,
"step": 1125
},
{
"epoch": 2.265593561368209,
"grad_norm": 1.8404629230499268,
"learning_rate": 1.728111333711514e-06,
"loss": 0.3094,
"step": 1126
},
{
"epoch": 2.267605633802817,
"grad_norm": 1.9138398170471191,
"learning_rate": 1.7192628585235188e-06,
"loss": 0.3125,
"step": 1127
},
{
"epoch": 2.2696177062374243,
"grad_norm": 1.917009949684143,
"learning_rate": 1.7104323891771697e-06,
"loss": 0.3205,
"step": 1128
},
{
"epoch": 2.271629778672032,
"grad_norm": 1.910467267036438,
"learning_rate": 1.7016199741371958e-06,
"loss": 0.3063,
"step": 1129
},
{
"epoch": 2.27364185110664,
"grad_norm": 2.0068464279174805,
"learning_rate": 1.6928256617692357e-06,
"loss": 0.3219,
"step": 1130
},
{
"epoch": 2.2756539235412476,
"grad_norm": 1.9971808195114136,
"learning_rate": 1.6840495003395741e-06,
"loss": 0.3223,
"step": 1131
},
{
"epoch": 2.277665995975855,
"grad_norm": 1.9968491792678833,
"learning_rate": 1.6752915380148772e-06,
"loss": 0.3146,
"step": 1132
},
{
"epoch": 2.2796780684104627,
"grad_norm": 1.8720755577087402,
"learning_rate": 1.6665518228619316e-06,
"loss": 0.3043,
"step": 1133
},
{
"epoch": 2.2816901408450705,
"grad_norm": 2.0129830837249756,
"learning_rate": 1.6578304028473703e-06,
"loss": 0.3109,
"step": 1134
},
{
"epoch": 2.2837022132796783,
"grad_norm": 1.853187084197998,
"learning_rate": 1.6491273258374241e-06,
"loss": 0.3086,
"step": 1135
},
{
"epoch": 2.2857142857142856,
"grad_norm": 1.9343267679214478,
"learning_rate": 1.6404426395976446e-06,
"loss": 0.3179,
"step": 1136
},
{
"epoch": 2.2877263581488934,
"grad_norm": 1.9186265468597412,
"learning_rate": 1.6317763917926494e-06,
"loss": 0.3252,
"step": 1137
},
{
"epoch": 2.289738430583501,
"grad_norm": 1.889265775680542,
"learning_rate": 1.6231286299858635e-06,
"loss": 0.3049,
"step": 1138
},
{
"epoch": 2.2917505030181085,
"grad_norm": 2.0032553672790527,
"learning_rate": 1.6144994016392484e-06,
"loss": 0.3246,
"step": 1139
},
{
"epoch": 2.2937625754527162,
"grad_norm": 1.9866799116134644,
"learning_rate": 1.6058887541130541e-06,
"loss": 0.3323,
"step": 1140
},
{
"epoch": 2.295774647887324,
"grad_norm": 2.0110700130462646,
"learning_rate": 1.5972967346655449e-06,
"loss": 0.3232,
"step": 1141
},
{
"epoch": 2.2977867203219318,
"grad_norm": 1.9062737226486206,
"learning_rate": 1.5887233904527548e-06,
"loss": 0.3115,
"step": 1142
},
{
"epoch": 2.299798792756539,
"grad_norm": 2.0070598125457764,
"learning_rate": 1.5801687685282169e-06,
"loss": 0.3148,
"step": 1143
},
{
"epoch": 2.301810865191147,
"grad_norm": 1.9191631078720093,
"learning_rate": 1.5716329158427097e-06,
"loss": 0.3244,
"step": 1144
},
{
"epoch": 2.3038229376257546,
"grad_norm": 1.8915290832519531,
"learning_rate": 1.5631158792440027e-06,
"loss": 0.3026,
"step": 1145
},
{
"epoch": 2.3058350100603624,
"grad_norm": 1.9323855638504028,
"learning_rate": 1.5546177054765954e-06,
"loss": 0.2986,
"step": 1146
},
{
"epoch": 2.3078470824949697,
"grad_norm": 1.9170725345611572,
"learning_rate": 1.546138441181459e-06,
"loss": 0.3278,
"step": 1147
},
{
"epoch": 2.3098591549295775,
"grad_norm": 1.9867016077041626,
"learning_rate": 1.5376781328957846e-06,
"loss": 0.3282,
"step": 1148
},
{
"epoch": 2.3118712273641853,
"grad_norm": 1.8710124492645264,
"learning_rate": 1.5292368270527259e-06,
"loss": 0.3168,
"step": 1149
},
{
"epoch": 2.3138832997987926,
"grad_norm": 1.8784232139587402,
"learning_rate": 1.5208145699811417e-06,
"loss": 0.3163,
"step": 1150
},
{
"epoch": 2.3158953722334004,
"grad_norm": 2.079653024673462,
"learning_rate": 1.5124114079053492e-06,
"loss": 0.3268,
"step": 1151
},
{
"epoch": 2.317907444668008,
"grad_norm": 1.9615051746368408,
"learning_rate": 1.5040273869448652e-06,
"loss": 0.3163,
"step": 1152
},
{
"epoch": 2.3199195171026155,
"grad_norm": 2.029083251953125,
"learning_rate": 1.4956625531141495e-06,
"loss": 0.3118,
"step": 1153
},
{
"epoch": 2.3219315895372232,
"grad_norm": 1.926914930343628,
"learning_rate": 1.4873169523223568e-06,
"loss": 0.3132,
"step": 1154
},
{
"epoch": 2.323943661971831,
"grad_norm": 2.0502724647521973,
"learning_rate": 1.4789906303730888e-06,
"loss": 0.338,
"step": 1155
},
{
"epoch": 2.3259557344064388,
"grad_norm": 2.0026066303253174,
"learning_rate": 1.470683632964131e-06,
"loss": 0.3235,
"step": 1156
},
{
"epoch": 2.327967806841046,
"grad_norm": 1.9444767236709595,
"learning_rate": 1.462396005687216e-06,
"loss": 0.3239,
"step": 1157
},
{
"epoch": 2.329979879275654,
"grad_norm": 1.9651098251342773,
"learning_rate": 1.4541277940277604e-06,
"loss": 0.3336,
"step": 1158
},
{
"epoch": 2.3319919517102616,
"grad_norm": 2.0174999237060547,
"learning_rate": 1.4458790433646264e-06,
"loss": 0.3276,
"step": 1159
},
{
"epoch": 2.3340040241448694,
"grad_norm": 1.976993203163147,
"learning_rate": 1.4376497989698635e-06,
"loss": 0.321,
"step": 1160
},
{
"epoch": 2.3360160965794767,
"grad_norm": 1.8561229705810547,
"learning_rate": 1.4294401060084634e-06,
"loss": 0.297,
"step": 1161
},
{
"epoch": 2.3380281690140845,
"grad_norm": 1.98836350440979,
"learning_rate": 1.4212500095381176e-06,
"loss": 0.3096,
"step": 1162
},
{
"epoch": 2.3400402414486923,
"grad_norm": 2.002023935317993,
"learning_rate": 1.4130795545089588e-06,
"loss": 0.3252,
"step": 1163
},
{
"epoch": 2.3420523138832996,
"grad_norm": 1.9276113510131836,
"learning_rate": 1.4049287857633264e-06,
"loss": 0.3416,
"step": 1164
},
{
"epoch": 2.3440643863179074,
"grad_norm": 1.998106598854065,
"learning_rate": 1.3967977480355106e-06,
"loss": 0.2949,
"step": 1165
},
{
"epoch": 2.346076458752515,
"grad_norm": 1.9151923656463623,
"learning_rate": 1.388686485951512e-06,
"loss": 0.3093,
"step": 1166
},
{
"epoch": 2.348088531187123,
"grad_norm": 1.9476468563079834,
"learning_rate": 1.3805950440287936e-06,
"loss": 0.3013,
"step": 1167
},
{
"epoch": 2.3501006036217302,
"grad_norm": 1.9365510940551758,
"learning_rate": 1.3725234666760428e-06,
"loss": 0.3257,
"step": 1168
},
{
"epoch": 2.352112676056338,
"grad_norm": 1.9782971143722534,
"learning_rate": 1.3644717981929213e-06,
"loss": 0.3212,
"step": 1169
},
{
"epoch": 2.3541247484909458,
"grad_norm": 1.8265795707702637,
"learning_rate": 1.356440082769822e-06,
"loss": 0.2976,
"step": 1170
},
{
"epoch": 2.3561368209255535,
"grad_norm": 1.9859201908111572,
"learning_rate": 1.3484283644876289e-06,
"loss": 0.3246,
"step": 1171
},
{
"epoch": 2.358148893360161,
"grad_norm": 1.8772772550582886,
"learning_rate": 1.3404366873174778e-06,
"loss": 0.3079,
"step": 1172
},
{
"epoch": 2.3601609657947686,
"grad_norm": 1.969179391860962,
"learning_rate": 1.3324650951205064e-06,
"loss": 0.3113,
"step": 1173
},
{
"epoch": 2.3621730382293764,
"grad_norm": 2.0032851696014404,
"learning_rate": 1.3245136316476253e-06,
"loss": 0.3253,
"step": 1174
},
{
"epoch": 2.3641851106639837,
"grad_norm": 1.8898330926895142,
"learning_rate": 1.3165823405392668e-06,
"loss": 0.3076,
"step": 1175
},
{
"epoch": 2.3661971830985915,
"grad_norm": 1.8925584554672241,
"learning_rate": 1.3086712653251504e-06,
"loss": 0.3034,
"step": 1176
},
{
"epoch": 2.3682092555331993,
"grad_norm": 1.8813642263412476,
"learning_rate": 1.3007804494240478e-06,
"loss": 0.3083,
"step": 1177
},
{
"epoch": 2.3702213279678066,
"grad_norm": 2.067413568496704,
"learning_rate": 1.2929099361435348e-06,
"loss": 0.3329,
"step": 1178
},
{
"epoch": 2.3722334004024144,
"grad_norm": 1.9133418798446655,
"learning_rate": 1.2850597686797644e-06,
"loss": 0.306,
"step": 1179
},
{
"epoch": 2.374245472837022,
"grad_norm": 2.065636157989502,
"learning_rate": 1.2772299901172198e-06,
"loss": 0.3425,
"step": 1180
},
{
"epoch": 2.37625754527163,
"grad_norm": 1.886622428894043,
"learning_rate": 1.2694206434284878e-06,
"loss": 0.3182,
"step": 1181
},
{
"epoch": 2.3782696177062372,
"grad_norm": 1.9201165437698364,
"learning_rate": 1.261631771474014e-06,
"loss": 0.317,
"step": 1182
},
{
"epoch": 2.380281690140845,
"grad_norm": 1.94356369972229,
"learning_rate": 1.2538634170018727e-06,
"loss": 0.3114,
"step": 1183
},
{
"epoch": 2.3822937625754528,
"grad_norm": 2.046518564224243,
"learning_rate": 1.246115622647529e-06,
"loss": 0.3231,
"step": 1184
},
{
"epoch": 2.3843058350100605,
"grad_norm": 1.8998048305511475,
"learning_rate": 1.2383884309336114e-06,
"loss": 0.3334,
"step": 1185
},
{
"epoch": 2.386317907444668,
"grad_norm": 2.0530076026916504,
"learning_rate": 1.2306818842696716e-06,
"loss": 0.3336,
"step": 1186
},
{
"epoch": 2.3883299798792756,
"grad_norm": 1.8843131065368652,
"learning_rate": 1.222996024951953e-06,
"loss": 0.324,
"step": 1187
},
{
"epoch": 2.3903420523138834,
"grad_norm": 2.0479204654693604,
"learning_rate": 1.21533089516316e-06,
"loss": 0.3317,
"step": 1188
},
{
"epoch": 2.3923541247484907,
"grad_norm": 1.8047560453414917,
"learning_rate": 1.2076865369722246e-06,
"loss": 0.2889,
"step": 1189
},
{
"epoch": 2.3943661971830985,
"grad_norm": 1.9316056966781616,
"learning_rate": 1.2000629923340801e-06,
"loss": 0.3171,
"step": 1190
},
{
"epoch": 2.3963782696177063,
"grad_norm": 1.7541041374206543,
"learning_rate": 1.1924603030894277e-06,
"loss": 0.2852,
"step": 1191
},
{
"epoch": 2.398390342052314,
"grad_norm": 2.0493853092193604,
"learning_rate": 1.184878510964504e-06,
"loss": 0.3269,
"step": 1192
},
{
"epoch": 2.4004024144869214,
"grad_norm": 1.8844995498657227,
"learning_rate": 1.1773176575708544e-06,
"loss": 0.335,
"step": 1193
},
{
"epoch": 2.402414486921529,
"grad_norm": 2.019207715988159,
"learning_rate": 1.1697777844051105e-06,
"loss": 0.3085,
"step": 1194
},
{
"epoch": 2.404426559356137,
"grad_norm": 1.9953314065933228,
"learning_rate": 1.1622589328487505e-06,
"loss": 0.3153,
"step": 1195
},
{
"epoch": 2.4064386317907447,
"grad_norm": 1.9246023893356323,
"learning_rate": 1.1547611441678836e-06,
"loss": 0.3251,
"step": 1196
},
{
"epoch": 2.408450704225352,
"grad_norm": 1.9564388990402222,
"learning_rate": 1.1472844595130145e-06,
"loss": 0.3349,
"step": 1197
},
{
"epoch": 2.41046277665996,
"grad_norm": 1.972939372062683,
"learning_rate": 1.1398289199188262e-06,
"loss": 0.298,
"step": 1198
},
{
"epoch": 2.4124748490945676,
"grad_norm": 1.9610313177108765,
"learning_rate": 1.132394566303946e-06,
"loss": 0.3106,
"step": 1199
},
{
"epoch": 2.414486921529175,
"grad_norm": 1.8735629320144653,
"learning_rate": 1.124981439470726e-06,
"loss": 0.3173,
"step": 1200
},
{
"epoch": 2.4164989939637826,
"grad_norm": 2.084070920944214,
"learning_rate": 1.1175895801050185e-06,
"loss": 0.3334,
"step": 1201
},
{
"epoch": 2.4185110663983904,
"grad_norm": 1.9063400030136108,
"learning_rate": 1.110219028775954e-06,
"loss": 0.3069,
"step": 1202
},
{
"epoch": 2.4205231388329977,
"grad_norm": 2.0351479053497314,
"learning_rate": 1.1028698259357162e-06,
"loss": 0.3225,
"step": 1203
},
{
"epoch": 2.4225352112676055,
"grad_norm": 2.048466682434082,
"learning_rate": 1.09554201191932e-06,
"loss": 0.3249,
"step": 1204
},
{
"epoch": 2.4245472837022133,
"grad_norm": 1.9268667697906494,
"learning_rate": 1.0882356269443912e-06,
"loss": 0.3009,
"step": 1205
},
{
"epoch": 2.426559356136821,
"grad_norm": 1.935354232788086,
"learning_rate": 1.080950711110943e-06,
"loss": 0.312,
"step": 1206
},
{
"epoch": 2.4285714285714284,
"grad_norm": 2.054149627685547,
"learning_rate": 1.0736873044011632e-06,
"loss": 0.3133,
"step": 1207
},
{
"epoch": 2.430583501006036,
"grad_norm": 1.9512964487075806,
"learning_rate": 1.066445446679189e-06,
"loss": 0.3184,
"step": 1208
},
{
"epoch": 2.432595573440644,
"grad_norm": 1.9467830657958984,
"learning_rate": 1.0592251776908857e-06,
"loss": 0.3043,
"step": 1209
},
{
"epoch": 2.4346076458752517,
"grad_norm": 1.9755560159683228,
"learning_rate": 1.052026537063634e-06,
"loss": 0.3321,
"step": 1210
},
{
"epoch": 2.436619718309859,
"grad_norm": 2.049520492553711,
"learning_rate": 1.044849564306112e-06,
"loss": 0.337,
"step": 1211
},
{
"epoch": 2.438631790744467,
"grad_norm": 2.0248889923095703,
"learning_rate": 1.037694298808074e-06,
"loss": 0.3206,
"step": 1212
},
{
"epoch": 2.4406438631790746,
"grad_norm": 1.9970359802246094,
"learning_rate": 1.03056077984014e-06,
"loss": 0.3217,
"step": 1213
},
{
"epoch": 2.442655935613682,
"grad_norm": 1.858817219734192,
"learning_rate": 1.023449046553575e-06,
"loss": 0.3107,
"step": 1214
},
{
"epoch": 2.4446680080482897,
"grad_norm": 1.7835618257522583,
"learning_rate": 1.0163591379800796e-06,
"loss": 0.3038,
"step": 1215
},
{
"epoch": 2.4466800804828974,
"grad_norm": 2.1031856536865234,
"learning_rate": 1.0092910930315698e-06,
"loss": 0.3268,
"step": 1216
},
{
"epoch": 2.448692152917505,
"grad_norm": 1.9893072843551636,
"learning_rate": 1.002244950499966e-06,
"loss": 0.3081,
"step": 1217
},
{
"epoch": 2.4507042253521125,
"grad_norm": 1.9361686706542969,
"learning_rate": 9.952207490569816e-07,
"loss": 0.3101,
"step": 1218
},
{
"epoch": 2.4527162977867203,
"grad_norm": 1.8221616744995117,
"learning_rate": 9.882185272539107e-07,
"loss": 0.3039,
"step": 1219
},
{
"epoch": 2.454728370221328,
"grad_norm": 1.901294469833374,
"learning_rate": 9.81238323521415e-07,
"loss": 0.3056,
"step": 1220
},
{
"epoch": 2.456740442655936,
"grad_norm": 1.874755620956421,
"learning_rate": 9.742801761693122e-07,
"loss": 0.2944,
"step": 1221
},
{
"epoch": 2.458752515090543,
"grad_norm": 1.930405855178833,
"learning_rate": 9.673441233863661e-07,
"loss": 0.3138,
"step": 1222
},
{
"epoch": 2.460764587525151,
"grad_norm": 1.8764375448226929,
"learning_rate": 9.604302032400787e-07,
"loss": 0.316,
"step": 1223
},
{
"epoch": 2.4627766599597587,
"grad_norm": 1.952162265777588,
"learning_rate": 9.535384536764807e-07,
"loss": 0.3254,
"step": 1224
},
{
"epoch": 2.464788732394366,
"grad_norm": 1.8980810642242432,
"learning_rate": 9.466689125199247e-07,
"loss": 0.3177,
"step": 1225
},
{
"epoch": 2.466800804828974,
"grad_norm": 1.9399605989456177,
"learning_rate": 9.39821617472872e-07,
"loss": 0.3204,
"step": 1226
},
{
"epoch": 2.4688128772635816,
"grad_norm": 1.874392032623291,
"learning_rate": 9.329966061156887e-07,
"loss": 0.3042,
"step": 1227
},
{
"epoch": 2.470824949698189,
"grad_norm": 2.0169479846954346,
"learning_rate": 9.261939159064465e-07,
"loss": 0.3132,
"step": 1228
},
{
"epoch": 2.4728370221327967,
"grad_norm": 1.8879144191741943,
"learning_rate": 9.194135841807028e-07,
"loss": 0.3095,
"step": 1229
},
{
"epoch": 2.4748490945674044,
"grad_norm": 1.959427833557129,
"learning_rate": 9.12655648151311e-07,
"loss": 0.3175,
"step": 1230
},
{
"epoch": 2.476861167002012,
"grad_norm": 1.9600000381469727,
"learning_rate": 9.059201449082045e-07,
"loss": 0.315,
"step": 1231
},
{
"epoch": 2.4788732394366195,
"grad_norm": 1.9241917133331299,
"learning_rate": 8.992071114181977e-07,
"loss": 0.3196,
"step": 1232
},
{
"epoch": 2.4808853118712273,
"grad_norm": 2.018326997756958,
"learning_rate": 8.925165845247858e-07,
"loss": 0.3501,
"step": 1233
},
{
"epoch": 2.482897384305835,
"grad_norm": 1.9944074153900146,
"learning_rate": 8.858486009479384e-07,
"loss": 0.3331,
"step": 1234
},
{
"epoch": 2.484909456740443,
"grad_norm": 1.9525293111801147,
"learning_rate": 8.792031972838966e-07,
"loss": 0.3029,
"step": 1235
},
{
"epoch": 2.48692152917505,
"grad_norm": 1.9734416007995605,
"learning_rate": 8.7258041000498e-07,
"loss": 0.3137,
"step": 1236
},
{
"epoch": 2.488933601609658,
"grad_norm": 2.0141916275024414,
"learning_rate": 8.659802754593805e-07,
"loss": 0.3307,
"step": 1237
},
{
"epoch": 2.4909456740442657,
"grad_norm": 1.8858426809310913,
"learning_rate": 8.594028298709605e-07,
"loss": 0.3147,
"step": 1238
},
{
"epoch": 2.492957746478873,
"grad_norm": 1.9975107908248901,
"learning_rate": 8.528481093390606e-07,
"loss": 0.3344,
"step": 1239
},
{
"epoch": 2.494969818913481,
"grad_norm": 1.8693078756332397,
"learning_rate": 8.463161498382949e-07,
"loss": 0.3246,
"step": 1240
},
{
"epoch": 2.4969818913480886,
"grad_norm": 1.9392619132995605,
"learning_rate": 8.398069872183607e-07,
"loss": 0.2963,
"step": 1241
},
{
"epoch": 2.4989939637826963,
"grad_norm": 2.0875697135925293,
"learning_rate": 8.333206572038377e-07,
"loss": 0.3388,
"step": 1242
},
{
"epoch": 2.5010060362173037,
"grad_norm": 1.840063214302063,
"learning_rate": 8.268571953939897e-07,
"loss": 0.3095,
"step": 1243
},
{
"epoch": 2.5030181086519114,
"grad_norm": 2.0050125122070312,
"learning_rate": 8.204166372625727e-07,
"loss": 0.3182,
"step": 1244
},
{
"epoch": 2.505030181086519,
"grad_norm": 1.988363265991211,
"learning_rate": 8.139990181576391e-07,
"loss": 0.3323,
"step": 1245
},
{
"epoch": 2.507042253521127,
"grad_norm": 1.9448785781860352,
"learning_rate": 8.07604373301345e-07,
"loss": 0.3304,
"step": 1246
},
{
"epoch": 2.5090543259557343,
"grad_norm": 1.9935352802276611,
"learning_rate": 8.012327377897561e-07,
"loss": 0.3036,
"step": 1247
},
{
"epoch": 2.511066398390342,
"grad_norm": 1.865787386894226,
"learning_rate": 7.948841465926533e-07,
"loss": 0.3221,
"step": 1248
},
{
"epoch": 2.51307847082495,
"grad_norm": 1.9329603910446167,
"learning_rate": 7.885586345533397e-07,
"loss": 0.2934,
"step": 1249
},
{
"epoch": 2.515090543259557,
"grad_norm": 1.9562501907348633,
"learning_rate": 7.822562363884584e-07,
"loss": 0.333,
"step": 1250
},
{
"epoch": 2.517102615694165,
"grad_norm": 2.2125632762908936,
"learning_rate": 7.759769866877892e-07,
"loss": 0.3409,
"step": 1251
},
{
"epoch": 2.5191146881287727,
"grad_norm": 1.965734601020813,
"learning_rate": 7.697209199140676e-07,
"loss": 0.2914,
"step": 1252
},
{
"epoch": 2.52112676056338,
"grad_norm": 1.9423799514770508,
"learning_rate": 7.634880704027936e-07,
"loss": 0.3089,
"step": 1253
},
{
"epoch": 2.523138832997988,
"grad_norm": 1.93148672580719,
"learning_rate": 7.572784723620424e-07,
"loss": 0.297,
"step": 1254
},
{
"epoch": 2.5251509054325956,
"grad_norm": 1.876718282699585,
"learning_rate": 7.510921598722765e-07,
"loss": 0.3018,
"step": 1255
},
{
"epoch": 2.5271629778672033,
"grad_norm": 1.8500628471374512,
"learning_rate": 7.449291668861575e-07,
"loss": 0.3041,
"step": 1256
},
{
"epoch": 2.529175050301811,
"grad_norm": 1.909126877784729,
"learning_rate": 7.387895272283635e-07,
"loss": 0.3115,
"step": 1257
},
{
"epoch": 2.5311871227364184,
"grad_norm": 1.8048720359802246,
"learning_rate": 7.326732745954001e-07,
"loss": 0.2716,
"step": 1258
},
{
"epoch": 2.533199195171026,
"grad_norm": 2.0273842811584473,
"learning_rate": 7.265804425554202e-07,
"loss": 0.3362,
"step": 1259
},
{
"epoch": 2.535211267605634,
"grad_norm": 1.8919765949249268,
"learning_rate": 7.205110645480307e-07,
"loss": 0.3133,
"step": 1260
},
{
"epoch": 2.5372233400402413,
"grad_norm": 1.9173896312713623,
"learning_rate": 7.144651738841174e-07,
"loss": 0.2987,
"step": 1261
},
{
"epoch": 2.539235412474849,
"grad_norm": 1.8675278425216675,
"learning_rate": 7.084428037456587e-07,
"loss": 0.3081,
"step": 1262
},
{
"epoch": 2.541247484909457,
"grad_norm": 2.023757219314575,
"learning_rate": 7.024439871855448e-07,
"loss": 0.3106,
"step": 1263
},
{
"epoch": 2.543259557344064,
"grad_norm": 1.903003215789795,
"learning_rate": 6.96468757127396e-07,
"loss": 0.3097,
"step": 1264
},
{
"epoch": 2.545271629778672,
"grad_norm": 2.010345458984375,
"learning_rate": 6.905171463653798e-07,
"loss": 0.3248,
"step": 1265
},
{
"epoch": 2.5472837022132797,
"grad_norm": 1.883948802947998,
"learning_rate": 6.845891875640331e-07,
"loss": 0.3153,
"step": 1266
},
{
"epoch": 2.5492957746478875,
"grad_norm": 1.9149082899093628,
"learning_rate": 6.786849132580841e-07,
"loss": 0.3049,
"step": 1267
},
{
"epoch": 2.551307847082495,
"grad_norm": 2.012042284011841,
"learning_rate": 6.728043558522706e-07,
"loss": 0.3291,
"step": 1268
},
{
"epoch": 2.5533199195171026,
"grad_norm": 1.913818359375,
"learning_rate": 6.669475476211628e-07,
"loss": 0.3248,
"step": 1269
},
{
"epoch": 2.5553319919517103,
"grad_norm": 1.8596763610839844,
"learning_rate": 6.611145207089897e-07,
"loss": 0.2996,
"step": 1270
},
{
"epoch": 2.557344064386318,
"grad_norm": 1.9084898233413696,
"learning_rate": 6.55305307129459e-07,
"loss": 0.2944,
"step": 1271
},
{
"epoch": 2.5593561368209254,
"grad_norm": 1.9268581867218018,
"learning_rate": 6.49519938765582e-07,
"loss": 0.3169,
"step": 1272
},
{
"epoch": 2.561368209255533,
"grad_norm": 2.001847505569458,
"learning_rate": 6.437584473694991e-07,
"loss": 0.3339,
"step": 1273
},
{
"epoch": 2.563380281690141,
"grad_norm": 2.047466993331909,
"learning_rate": 6.380208645623037e-07,
"loss": 0.3148,
"step": 1274
},
{
"epoch": 2.5653923541247483,
"grad_norm": 2.0780773162841797,
"learning_rate": 6.323072218338739e-07,
"loss": 0.3154,
"step": 1275
},
{
"epoch": 2.567404426559356,
"grad_norm": 1.877504825592041,
"learning_rate": 6.266175505426958e-07,
"loss": 0.306,
"step": 1276
},
{
"epoch": 2.569416498993964,
"grad_norm": 1.825515627861023,
"learning_rate": 6.209518819156895e-07,
"loss": 0.3002,
"step": 1277
},
{
"epoch": 2.571428571428571,
"grad_norm": 1.8743566274642944,
"learning_rate": 6.15310247048041e-07,
"loss": 0.3089,
"step": 1278
},
{
"epoch": 2.573440643863179,
"grad_norm": 1.9637446403503418,
"learning_rate": 6.096926769030298e-07,
"loss": 0.3118,
"step": 1279
},
{
"epoch": 2.5754527162977867,
"grad_norm": 2.0054385662078857,
"learning_rate": 6.040992023118624e-07,
"loss": 0.302,
"step": 1280
},
{
"epoch": 2.5774647887323945,
"grad_norm": 1.9758704900741577,
"learning_rate": 5.985298539734973e-07,
"loss": 0.3302,
"step": 1281
},
{
"epoch": 2.5794768611670023,
"grad_norm": 1.9158987998962402,
"learning_rate": 5.929846624544821e-07,
"loss": 0.3166,
"step": 1282
},
{
"epoch": 2.5814889336016096,
"grad_norm": 1.9502754211425781,
"learning_rate": 5.874636581887804e-07,
"loss": 0.3229,
"step": 1283
},
{
"epoch": 2.5835010060362174,
"grad_norm": 1.9610239267349243,
"learning_rate": 5.819668714776122e-07,
"loss": 0.3149,
"step": 1284
},
{
"epoch": 2.585513078470825,
"grad_norm": 1.9487340450286865,
"learning_rate": 5.76494332489278e-07,
"loss": 0.3048,
"step": 1285
},
{
"epoch": 2.5875251509054324,
"grad_norm": 1.97187340259552,
"learning_rate": 5.710460712589993e-07,
"loss": 0.3304,
"step": 1286
},
{
"epoch": 2.58953722334004,
"grad_norm": 1.9745339155197144,
"learning_rate": 5.656221176887572e-07,
"loss": 0.3121,
"step": 1287
},
{
"epoch": 2.591549295774648,
"grad_norm": 1.8480572700500488,
"learning_rate": 5.602225015471175e-07,
"loss": 0.3001,
"step": 1288
},
{
"epoch": 2.5935613682092553,
"grad_norm": 2.0272209644317627,
"learning_rate": 5.548472524690784e-07,
"loss": 0.3463,
"step": 1289
},
{
"epoch": 2.595573440643863,
"grad_norm": 2.1198689937591553,
"learning_rate": 5.494963999559011e-07,
"loss": 0.328,
"step": 1290
},
{
"epoch": 2.597585513078471,
"grad_norm": 1.9094234704971313,
"learning_rate": 5.441699733749479e-07,
"loss": 0.3346,
"step": 1291
},
{
"epoch": 2.5995975855130786,
"grad_norm": 1.8595271110534668,
"learning_rate": 5.388680019595266e-07,
"loss": 0.3198,
"step": 1292
},
{
"epoch": 2.6016096579476864,
"grad_norm": 1.8942431211471558,
"learning_rate": 5.335905148087256e-07,
"loss": 0.3185,
"step": 1293
},
{
"epoch": 2.6036217303822937,
"grad_norm": 1.82722008228302,
"learning_rate": 5.283375408872538e-07,
"loss": 0.2988,
"step": 1294
},
{
"epoch": 2.6056338028169015,
"grad_norm": 1.8183109760284424,
"learning_rate": 5.231091090252832e-07,
"loss": 0.3016,
"step": 1295
},
{
"epoch": 2.6076458752515093,
"grad_norm": 1.949315071105957,
"learning_rate": 5.179052479182889e-07,
"loss": 0.2915,
"step": 1296
},
{
"epoch": 2.6096579476861166,
"grad_norm": 1.9181135892868042,
"learning_rate": 5.127259861268974e-07,
"loss": 0.2865,
"step": 1297
},
{
"epoch": 2.6116700201207244,
"grad_norm": 1.8018304109573364,
"learning_rate": 5.075713520767201e-07,
"loss": 0.2975,
"step": 1298
},
{
"epoch": 2.613682092555332,
"grad_norm": 1.929329514503479,
"learning_rate": 5.024413740582074e-07,
"loss": 0.3304,
"step": 1299
},
{
"epoch": 2.6156941649899395,
"grad_norm": 1.8946964740753174,
"learning_rate": 4.973360802264859e-07,
"loss": 0.2947,
"step": 1300
},
{
"epoch": 2.6177062374245472,
"grad_norm": 1.8121057748794556,
"learning_rate": 4.922554986012068e-07,
"loss": 0.3002,
"step": 1301
},
{
"epoch": 2.619718309859155,
"grad_norm": 1.8261134624481201,
"learning_rate": 4.871996570663934e-07,
"loss": 0.274,
"step": 1302
},
{
"epoch": 2.6217303822937623,
"grad_norm": 1.8316187858581543,
"learning_rate": 4.82168583370285e-07,
"loss": 0.2878,
"step": 1303
},
{
"epoch": 2.62374245472837,
"grad_norm": 1.8134701251983643,
"learning_rate": 4.771623051251878e-07,
"loss": 0.2951,
"step": 1304
},
{
"epoch": 2.625754527162978,
"grad_norm": 1.8709946870803833,
"learning_rate": 4.721808498073205e-07,
"loss": 0.3039,
"step": 1305
},
{
"epoch": 2.6277665995975856,
"grad_norm": 2.038022518157959,
"learning_rate": 4.6722424475666715e-07,
"loss": 0.3066,
"step": 1306
},
{
"epoch": 2.6297786720321934,
"grad_norm": 1.8829350471496582,
"learning_rate": 4.622925171768211e-07,
"loss": 0.3043,
"step": 1307
},
{
"epoch": 2.6317907444668007,
"grad_norm": 1.8870759010314941,
"learning_rate": 4.57385694134842e-07,
"loss": 0.3039,
"step": 1308
},
{
"epoch": 2.6338028169014085,
"grad_norm": 2.1094441413879395,
"learning_rate": 4.5250380256110335e-07,
"loss": 0.3452,
"step": 1309
},
{
"epoch": 2.6358148893360163,
"grad_norm": 1.867954134941101,
"learning_rate": 4.476468692491487e-07,
"loss": 0.2878,
"step": 1310
},
{
"epoch": 2.6378269617706236,
"grad_norm": 1.9955129623413086,
"learning_rate": 4.428149208555388e-07,
"loss": 0.2993,
"step": 1311
},
{
"epoch": 2.6398390342052314,
"grad_norm": 1.785326361656189,
"learning_rate": 4.380079838997087e-07,
"loss": 0.3052,
"step": 1312
},
{
"epoch": 2.641851106639839,
"grad_norm": 1.8974698781967163,
"learning_rate": 4.3322608476382255e-07,
"loss": 0.2985,
"step": 1313
},
{
"epoch": 2.6438631790744465,
"grad_norm": 1.9238228797912598,
"learning_rate": 4.2846924969262736e-07,
"loss": 0.3157,
"step": 1314
},
{
"epoch": 2.6458752515090542,
"grad_norm": 2.059622287750244,
"learning_rate": 4.237375047933118e-07,
"loss": 0.3192,
"step": 1315
},
{
"epoch": 2.647887323943662,
"grad_norm": 1.8804223537445068,
"learning_rate": 4.190308760353595e-07,
"loss": 0.3101,
"step": 1316
},
{
"epoch": 2.6498993963782698,
"grad_norm": 1.9766371250152588,
"learning_rate": 4.1434938925040804e-07,
"loss": 0.3165,
"step": 1317
},
{
"epoch": 2.6519114688128775,
"grad_norm": 1.8625285625457764,
"learning_rate": 4.0969307013210445e-07,
"loss": 0.3115,
"step": 1318
},
{
"epoch": 2.653923541247485,
"grad_norm": 1.8935160636901855,
"learning_rate": 4.050619442359721e-07,
"loss": 0.2961,
"step": 1319
},
{
"epoch": 2.6559356136820926,
"grad_norm": 2.0412039756774902,
"learning_rate": 4.004560369792593e-07,
"loss": 0.3169,
"step": 1320
},
{
"epoch": 2.6579476861167004,
"grad_norm": 1.8851536512374878,
"learning_rate": 3.958753736408105e-07,
"loss": 0.3136,
"step": 1321
},
{
"epoch": 2.6599597585513077,
"grad_norm": 1.9968008995056152,
"learning_rate": 3.91319979360919e-07,
"loss": 0.302,
"step": 1322
},
{
"epoch": 2.6619718309859155,
"grad_norm": 1.869197964668274,
"learning_rate": 3.867898791411956e-07,
"loss": 0.3141,
"step": 1323
},
{
"epoch": 2.6639839034205233,
"grad_norm": 1.8277980089187622,
"learning_rate": 3.822850978444254e-07,
"loss": 0.3149,
"step": 1324
},
{
"epoch": 2.6659959758551306,
"grad_norm": 1.792640209197998,
"learning_rate": 3.778056601944358e-07,
"loss": 0.3035,
"step": 1325
},
{
"epoch": 2.6680080482897384,
"grad_norm": 1.92755126953125,
"learning_rate": 3.733515907759594e-07,
"loss": 0.3227,
"step": 1326
},
{
"epoch": 2.670020120724346,
"grad_norm": 1.9215903282165527,
"learning_rate": 3.6892291403449963e-07,
"loss": 0.3126,
"step": 1327
},
{
"epoch": 2.6720321931589535,
"grad_norm": 2.0002219676971436,
"learning_rate": 3.645196542761953e-07,
"loss": 0.3399,
"step": 1328
},
{
"epoch": 2.6740442655935612,
"grad_norm": 1.9246290922164917,
"learning_rate": 3.6014183566768725e-07,
"loss": 0.3389,
"step": 1329
},
{
"epoch": 2.676056338028169,
"grad_norm": 1.8840545415878296,
"learning_rate": 3.557894822359864e-07,
"loss": 0.3063,
"step": 1330
},
{
"epoch": 2.6780684104627768,
"grad_norm": 1.9435675144195557,
"learning_rate": 3.5146261786834225e-07,
"loss": 0.3238,
"step": 1331
},
{
"epoch": 2.6800804828973845,
"grad_norm": 1.9354828596115112,
"learning_rate": 3.471612663121121e-07,
"loss": 0.3235,
"step": 1332
},
{
"epoch": 2.682092555331992,
"grad_norm": 1.8880016803741455,
"learning_rate": 3.428854511746293e-07,
"loss": 0.3195,
"step": 1333
},
{
"epoch": 2.6841046277665996,
"grad_norm": 1.8282817602157593,
"learning_rate": 3.386351959230738e-07,
"loss": 0.3048,
"step": 1334
},
{
"epoch": 2.6861167002012074,
"grad_norm": 1.8626350164413452,
"learning_rate": 3.344105238843437e-07,
"loss": 0.3023,
"step": 1335
},
{
"epoch": 2.6881287726358147,
"grad_norm": 1.9268531799316406,
"learning_rate": 3.302114582449295e-07,
"loss": 0.3143,
"step": 1336
},
{
"epoch": 2.6901408450704225,
"grad_norm": 1.9087598323822021,
"learning_rate": 3.2603802205078195e-07,
"loss": 0.3051,
"step": 1337
},
{
"epoch": 2.6921529175050303,
"grad_norm": 1.9388471841812134,
"learning_rate": 3.2189023820719034e-07,
"loss": 0.294,
"step": 1338
},
{
"epoch": 2.6941649899396376,
"grad_norm": 1.8099417686462402,
"learning_rate": 3.177681294786539e-07,
"loss": 0.2976,
"step": 1339
},
{
"epoch": 2.6961770623742454,
"grad_norm": 2.025315761566162,
"learning_rate": 3.136717184887589e-07,
"loss": 0.301,
"step": 1340
},
{
"epoch": 2.698189134808853,
"grad_norm": 1.9770852327346802,
"learning_rate": 3.0960102772005174e-07,
"loss": 0.3199,
"step": 1341
},
{
"epoch": 2.700201207243461,
"grad_norm": 1.8802844285964966,
"learning_rate": 3.055560795139173e-07,
"loss": 0.3248,
"step": 1342
},
{
"epoch": 2.7022132796780687,
"grad_norm": 1.8507022857666016,
"learning_rate": 3.015368960704584e-07,
"loss": 0.2858,
"step": 1343
},
{
"epoch": 2.704225352112676,
"grad_norm": 2.052767753601074,
"learning_rate": 2.975434994483689e-07,
"loss": 0.3219,
"step": 1344
},
{
"epoch": 2.7062374245472838,
"grad_norm": 1.999880313873291,
"learning_rate": 2.9357591156481793e-07,
"loss": 0.3258,
"step": 1345
},
{
"epoch": 2.7082494969818915,
"grad_norm": 1.8796782493591309,
"learning_rate": 2.896341541953257e-07,
"loss": 0.3131,
"step": 1346
},
{
"epoch": 2.710261569416499,
"grad_norm": 1.8701180219650269,
"learning_rate": 2.85718248973646e-07,
"loss": 0.285,
"step": 1347
},
{
"epoch": 2.7122736418511066,
"grad_norm": 1.8917667865753174,
"learning_rate": 2.8182821739164534e-07,
"loss": 0.3163,
"step": 1348
},
{
"epoch": 2.7142857142857144,
"grad_norm": 1.933946132659912,
"learning_rate": 2.779640807991896e-07,
"loss": 0.3079,
"step": 1349
},
{
"epoch": 2.7162977867203217,
"grad_norm": 2.010690689086914,
"learning_rate": 2.74125860404022e-07,
"loss": 0.3163,
"step": 1350
},
{
"epoch": 2.7183098591549295,
"grad_norm": 1.8380995988845825,
"learning_rate": 2.7031357727164865e-07,
"loss": 0.3043,
"step": 1351
},
{
"epoch": 2.7203219315895373,
"grad_norm": 2.11596417427063,
"learning_rate": 2.665272523252216e-07,
"loss": 0.3343,
"step": 1352
},
{
"epoch": 2.7223340040241446,
"grad_norm": 2.0510504245758057,
"learning_rate": 2.627669063454291e-07,
"loss": 0.3328,
"step": 1353
},
{
"epoch": 2.7243460764587524,
"grad_norm": 1.8645933866500854,
"learning_rate": 2.5903255997037246e-07,
"loss": 0.3116,
"step": 1354
},
{
"epoch": 2.72635814889336,
"grad_norm": 1.8328444957733154,
"learning_rate": 2.553242336954631e-07,
"loss": 0.3054,
"step": 1355
},
{
"epoch": 2.728370221327968,
"grad_norm": 1.905352234840393,
"learning_rate": 2.516419478733012e-07,
"loss": 0.3156,
"step": 1356
},
{
"epoch": 2.7303822937625757,
"grad_norm": 1.8690829277038574,
"learning_rate": 2.479857227135685e-07,
"loss": 0.3138,
"step": 1357
},
{
"epoch": 2.732394366197183,
"grad_norm": 1.8393657207489014,
"learning_rate": 2.443555782829188e-07,
"loss": 0.3068,
"step": 1358
},
{
"epoch": 2.734406438631791,
"grad_norm": 1.9217677116394043,
"learning_rate": 2.407515345048622e-07,
"loss": 0.3097,
"step": 1359
},
{
"epoch": 2.7364185110663986,
"grad_norm": 2.0007879734039307,
"learning_rate": 2.3717361115966343e-07,
"loss": 0.2843,
"step": 1360
},
{
"epoch": 2.738430583501006,
"grad_norm": 1.9894534349441528,
"learning_rate": 2.3362182788422395e-07,
"loss": 0.3149,
"step": 1361
},
{
"epoch": 2.7404426559356136,
"grad_norm": 1.8293792009353638,
"learning_rate": 2.300962041719851e-07,
"loss": 0.2966,
"step": 1362
},
{
"epoch": 2.7424547283702214,
"grad_norm": 1.9846625328063965,
"learning_rate": 2.2659675937281078e-07,
"loss": 0.299,
"step": 1363
},
{
"epoch": 2.7444668008048287,
"grad_norm": 2.0236048698425293,
"learning_rate": 2.2312351269288712e-07,
"loss": 0.3109,
"step": 1364
},
{
"epoch": 2.7464788732394365,
"grad_norm": 1.8248317241668701,
"learning_rate": 2.1967648319461577e-07,
"loss": 0.3068,
"step": 1365
},
{
"epoch": 2.7484909456740443,
"grad_norm": 1.9960497617721558,
"learning_rate": 2.1625568979651012e-07,
"loss": 0.3365,
"step": 1366
},
{
"epoch": 2.750503018108652,
"grad_norm": 1.8922979831695557,
"learning_rate": 2.1286115127308992e-07,
"loss": 0.3029,
"step": 1367
},
{
"epoch": 2.75251509054326,
"grad_norm": 1.9048439264297485,
"learning_rate": 2.0949288625477903e-07,
"loss": 0.3167,
"step": 1368
},
{
"epoch": 2.754527162977867,
"grad_norm": 1.855001449584961,
"learning_rate": 2.061509132278028e-07,
"loss": 0.3012,
"step": 1369
},
{
"epoch": 2.756539235412475,
"grad_norm": 1.9068505764007568,
"learning_rate": 2.028352505340858e-07,
"loss": 0.328,
"step": 1370
},
{
"epoch": 2.7585513078470827,
"grad_norm": 2.014235734939575,
"learning_rate": 1.9954591637115495e-07,
"loss": 0.3387,
"step": 1371
},
{
"epoch": 2.76056338028169,
"grad_norm": 1.9344955682754517,
"learning_rate": 1.9628292879203482e-07,
"loss": 0.3176,
"step": 1372
},
{
"epoch": 2.762575452716298,
"grad_norm": 1.9466530084609985,
"learning_rate": 1.9304630570515182e-07,
"loss": 0.3062,
"step": 1373
},
{
"epoch": 2.7645875251509056,
"grad_norm": 1.8726989030838013,
"learning_rate": 1.8983606487423255e-07,
"loss": 0.3215,
"step": 1374
},
{
"epoch": 2.766599597585513,
"grad_norm": 2.013495445251465,
"learning_rate": 1.866522239182117e-07,
"loss": 0.3394,
"step": 1375
},
{
"epoch": 2.7686116700201207,
"grad_norm": 1.9069247245788574,
"learning_rate": 1.8349480031112977e-07,
"loss": 0.3259,
"step": 1376
},
{
"epoch": 2.7706237424547284,
"grad_norm": 1.7492789030075073,
"learning_rate": 1.8036381138204051e-07,
"loss": 0.2815,
"step": 1377
},
{
"epoch": 2.7726358148893357,
"grad_norm": 2.041621685028076,
"learning_rate": 1.7725927431491375e-07,
"loss": 0.3234,
"step": 1378
},
{
"epoch": 2.7746478873239435,
"grad_norm": 2.031970977783203,
"learning_rate": 1.7418120614854427e-07,
"loss": 0.3093,
"step": 1379
},
{
"epoch": 2.7766599597585513,
"grad_norm": 1.9473472833633423,
"learning_rate": 1.711296237764548e-07,
"loss": 0.3081,
"step": 1380
},
{
"epoch": 2.778672032193159,
"grad_norm": 2.1944499015808105,
"learning_rate": 1.6810454394680431e-07,
"loss": 0.3473,
"step": 1381
},
{
"epoch": 2.780684104627767,
"grad_norm": 1.859397292137146,
"learning_rate": 1.6510598326229645e-07,
"loss": 0.2997,
"step": 1382
},
{
"epoch": 2.782696177062374,
"grad_norm": 1.9514036178588867,
"learning_rate": 1.6213395818009016e-07,
"loss": 0.3157,
"step": 1383
},
{
"epoch": 2.784708249496982,
"grad_norm": 2.0373504161834717,
"learning_rate": 1.5918848501170647e-07,
"loss": 0.326,
"step": 1384
},
{
"epoch": 2.7867203219315897,
"grad_norm": 1.9372187852859497,
"learning_rate": 1.5626957992293966e-07,
"loss": 0.317,
"step": 1385
},
{
"epoch": 2.788732394366197,
"grad_norm": 1.8588268756866455,
"learning_rate": 1.5337725893376954e-07,
"loss": 0.2764,
"step": 1386
},
{
"epoch": 2.790744466800805,
"grad_norm": 1.865725040435791,
"learning_rate": 1.505115379182731e-07,
"loss": 0.3216,
"step": 1387
},
{
"epoch": 2.7927565392354126,
"grad_norm": 1.9945579767227173,
"learning_rate": 1.47672432604537e-07,
"loss": 0.3149,
"step": 1388
},
{
"epoch": 2.79476861167002,
"grad_norm": 1.7939214706420898,
"learning_rate": 1.4485995857457246e-07,
"loss": 0.2956,
"step": 1389
},
{
"epoch": 2.7967806841046277,
"grad_norm": 1.8982107639312744,
"learning_rate": 1.420741312642282e-07,
"loss": 0.3121,
"step": 1390
},
{
"epoch": 2.7987927565392354,
"grad_norm": 1.9184291362762451,
"learning_rate": 1.3931496596310545e-07,
"loss": 0.3006,
"step": 1391
},
{
"epoch": 2.800804828973843,
"grad_norm": 1.929681658744812,
"learning_rate": 1.3658247781447642e-07,
"loss": 0.308,
"step": 1392
},
{
"epoch": 2.802816901408451,
"grad_norm": 2.064938545227051,
"learning_rate": 1.338766818151982e-07,
"loss": 0.3255,
"step": 1393
},
{
"epoch": 2.8048289738430583,
"grad_norm": 1.8805738687515259,
"learning_rate": 1.3119759281563392e-07,
"loss": 0.2883,
"step": 1394
},
{
"epoch": 2.806841046277666,
"grad_norm": 2.030398368835449,
"learning_rate": 1.2854522551956738e-07,
"loss": 0.3209,
"step": 1395
},
{
"epoch": 2.808853118712274,
"grad_norm": 1.8628559112548828,
"learning_rate": 1.2591959448412628e-07,
"loss": 0.2927,
"step": 1396
},
{
"epoch": 2.810865191146881,
"grad_norm": 1.9574239253997803,
"learning_rate": 1.2332071411969792e-07,
"loss": 0.3308,
"step": 1397
},
{
"epoch": 2.812877263581489,
"grad_norm": 2.0328168869018555,
"learning_rate": 1.2074859868985377e-07,
"loss": 0.3143,
"step": 1398
},
{
"epoch": 2.8148893360160967,
"grad_norm": 1.9412139654159546,
"learning_rate": 1.1820326231126944e-07,
"loss": 0.3297,
"step": 1399
},
{
"epoch": 2.816901408450704,
"grad_norm": 1.9176777601242065,
"learning_rate": 1.1568471895364863e-07,
"loss": 0.3229,
"step": 1400
},
{
"epoch": 2.818913480885312,
"grad_norm": 1.95015549659729,
"learning_rate": 1.1319298243964549e-07,
"loss": 0.3056,
"step": 1401
},
{
"epoch": 2.8209255533199196,
"grad_norm": 1.9641227722167969,
"learning_rate": 1.107280664447874e-07,
"loss": 0.3415,
"step": 1402
},
{
"epoch": 2.822937625754527,
"grad_norm": 1.8761005401611328,
"learning_rate": 1.082899844974017e-07,
"loss": 0.3046,
"step": 1403
},
{
"epoch": 2.8249496981891347,
"grad_norm": 1.9387608766555786,
"learning_rate": 1.0587874997854186e-07,
"loss": 0.2972,
"step": 1404
},
{
"epoch": 2.8269617706237424,
"grad_norm": 2.00655460357666,
"learning_rate": 1.0349437612191259e-07,
"loss": 0.3216,
"step": 1405
},
{
"epoch": 2.82897384305835,
"grad_norm": 1.9893320798873901,
"learning_rate": 1.0113687601379818e-07,
"loss": 0.3208,
"step": 1406
},
{
"epoch": 2.830985915492958,
"grad_norm": 1.9536393880844116,
"learning_rate": 9.880626259298976e-08,
"loss": 0.2988,
"step": 1407
},
{
"epoch": 2.8329979879275653,
"grad_norm": 1.9395415782928467,
"learning_rate": 9.650254865071428e-08,
"loss": 0.3084,
"step": 1408
},
{
"epoch": 2.835010060362173,
"grad_norm": 1.9450585842132568,
"learning_rate": 9.422574683056795e-08,
"loss": 0.3429,
"step": 1409
},
{
"epoch": 2.837022132796781,
"grad_norm": 1.9292974472045898,
"learning_rate": 9.197586962843952e-08,
"loss": 0.3018,
"step": 1410
},
{
"epoch": 2.839034205231388,
"grad_norm": 1.8370075225830078,
"learning_rate": 8.975292939244928e-08,
"loss": 0.308,
"step": 1411
},
{
"epoch": 2.841046277665996,
"grad_norm": 2.0159215927124023,
"learning_rate": 8.755693832287581e-08,
"loss": 0.3139,
"step": 1412
},
{
"epoch": 2.8430583501006037,
"grad_norm": 1.8302010297775269,
"learning_rate": 8.538790847209211e-08,
"loss": 0.3057,
"step": 1413
},
{
"epoch": 2.845070422535211,
"grad_norm": 1.9254567623138428,
"learning_rate": 8.324585174449895e-08,
"loss": 0.3305,
"step": 1414
},
{
"epoch": 2.847082494969819,
"grad_norm": 1.7892004251480103,
"learning_rate": 8.11307798964589e-08,
"loss": 0.2847,
"step": 1415
},
{
"epoch": 2.8490945674044266,
"grad_norm": 1.957893967628479,
"learning_rate": 7.90427045362302e-08,
"loss": 0.308,
"step": 1416
},
{
"epoch": 2.8511066398390343,
"grad_norm": 1.836108684539795,
"learning_rate": 7.698163712390683e-08,
"loss": 0.3072,
"step": 1417
},
{
"epoch": 2.853118712273642,
"grad_norm": 1.8814616203308105,
"learning_rate": 7.494758897135412e-08,
"loss": 0.2938,
"step": 1418
},
{
"epoch": 2.8551307847082494,
"grad_norm": 2.026705265045166,
"learning_rate": 7.294057124214438e-08,
"loss": 0.3311,
"step": 1419
},
{
"epoch": 2.857142857142857,
"grad_norm": 1.9151105880737305,
"learning_rate": 7.096059495149855e-08,
"loss": 0.3127,
"step": 1420
},
{
"epoch": 2.859154929577465,
"grad_norm": 1.9340089559555054,
"learning_rate": 6.900767096622352e-08,
"loss": 0.3123,
"step": 1421
},
{
"epoch": 2.8611670020120723,
"grad_norm": 1.9273947477340698,
"learning_rate": 6.708181000465552e-08,
"loss": 0.3069,
"step": 1422
},
{
"epoch": 2.86317907444668,
"grad_norm": 1.8585259914398193,
"learning_rate": 6.518302263659737e-08,
"loss": 0.3073,
"step": 1423
},
{
"epoch": 2.865191146881288,
"grad_norm": 1.9229656457901,
"learning_rate": 6.331131928326407e-08,
"loss": 0.3171,
"step": 1424
},
{
"epoch": 2.867203219315895,
"grad_norm": 1.8940155506134033,
"learning_rate": 6.146671021722284e-08,
"loss": 0.3228,
"step": 1425
},
{
"epoch": 2.869215291750503,
"grad_norm": 1.772291660308838,
"learning_rate": 5.964920556233767e-08,
"loss": 0.2821,
"step": 1426
},
{
"epoch": 2.8712273641851107,
"grad_norm": 1.874535083770752,
"learning_rate": 5.785881529371595e-08,
"loss": 0.2926,
"step": 1427
},
{
"epoch": 2.873239436619718,
"grad_norm": 2.0020828247070312,
"learning_rate": 5.609554923764915e-08,
"loss": 0.3234,
"step": 1428
},
{
"epoch": 2.875251509054326,
"grad_norm": 2.036130666732788,
"learning_rate": 5.435941707156389e-08,
"loss": 0.3074,
"step": 1429
},
{
"epoch": 2.8772635814889336,
"grad_norm": 1.8147860765457153,
"learning_rate": 5.265042832396428e-08,
"loss": 0.2758,
"step": 1430
},
{
"epoch": 2.8792756539235413,
"grad_norm": 2.0033209323883057,
"learning_rate": 5.0968592374384116e-08,
"loss": 0.3158,
"step": 1431
},
{
"epoch": 2.881287726358149,
"grad_norm": 2.0302250385284424,
"learning_rate": 4.931391845333089e-08,
"loss": 0.3167,
"step": 1432
},
{
"epoch": 2.8832997987927564,
"grad_norm": 1.8428484201431274,
"learning_rate": 4.768641564223852e-08,
"loss": 0.3046,
"step": 1433
},
{
"epoch": 2.885311871227364,
"grad_norm": 2.1346139907836914,
"learning_rate": 4.608609287341581e-08,
"loss": 0.3182,
"step": 1434
},
{
"epoch": 2.887323943661972,
"grad_norm": 1.9685754776000977,
"learning_rate": 4.451295892999863e-08,
"loss": 0.3058,
"step": 1435
},
{
"epoch": 2.8893360160965793,
"grad_norm": 1.896419882774353,
"learning_rate": 4.296702244590056e-08,
"loss": 0.2972,
"step": 1436
},
{
"epoch": 2.891348088531187,
"grad_norm": 1.8687481880187988,
"learning_rate": 4.144829190576516e-08,
"loss": 0.3066,
"step": 1437
},
{
"epoch": 2.893360160965795,
"grad_norm": 1.9098145961761475,
"learning_rate": 3.99567756449204e-08,
"loss": 0.3276,
"step": 1438
},
{
"epoch": 2.895372233400402,
"grad_norm": 1.8980722427368164,
"learning_rate": 3.84924818493343e-08,
"loss": 0.2977,
"step": 1439
},
{
"epoch": 2.89738430583501,
"grad_norm": 1.9447804689407349,
"learning_rate": 3.705541855556716e-08,
"loss": 0.31,
"step": 1440
},
{
"epoch": 2.8993963782696177,
"grad_norm": 1.932647943496704,
"learning_rate": 3.5645593650728284e-08,
"loss": 0.3065,
"step": 1441
},
{
"epoch": 2.9014084507042255,
"grad_norm": 1.9486619234085083,
"learning_rate": 3.426301487243433e-08,
"loss": 0.3083,
"step": 1442
},
{
"epoch": 2.9034205231388333,
"grad_norm": 2.015017032623291,
"learning_rate": 3.290768980876324e-08,
"loss": 0.3284,
"step": 1443
},
{
"epoch": 2.9054325955734406,
"grad_norm": 1.9345711469650269,
"learning_rate": 3.157962589821872e-08,
"loss": 0.3237,
"step": 1444
},
{
"epoch": 2.9074446680080483,
"grad_norm": 2.0234079360961914,
"learning_rate": 3.027883042968249e-08,
"loss": 0.3112,
"step": 1445
},
{
"epoch": 2.909456740442656,
"grad_norm": 1.8842613697052002,
"learning_rate": 2.9005310542378205e-08,
"loss": 0.3024,
"step": 1446
},
{
"epoch": 2.9114688128772634,
"grad_norm": 1.8669836521148682,
"learning_rate": 2.77590732258326e-08,
"loss": 0.3031,
"step": 1447
},
{
"epoch": 2.913480885311871,
"grad_norm": 2.0102365016937256,
"learning_rate": 2.6540125319834964e-08,
"loss": 0.3102,
"step": 1448
},
{
"epoch": 2.915492957746479,
"grad_norm": 1.9279073476791382,
"learning_rate": 2.5348473514400507e-08,
"loss": 0.2971,
"step": 1449
},
{
"epoch": 2.9175050301810863,
"grad_norm": 1.921225666999817,
"learning_rate": 2.4184124349734828e-08,
"loss": 0.3094,
"step": 1450
},
{
"epoch": 2.919517102615694,
"grad_norm": 1.9262641668319702,
"learning_rate": 2.3047084216196724e-08,
"loss": 0.3069,
"step": 1451
},
{
"epoch": 2.921529175050302,
"grad_norm": 1.8994249105453491,
"learning_rate": 2.1937359354262665e-08,
"loss": 0.3096,
"step": 1452
},
{
"epoch": 2.9235412474849096,
"grad_norm": 2.079052686691284,
"learning_rate": 2.085495585449404e-08,
"loss": 0.3253,
"step": 1453
},
{
"epoch": 2.925553319919517,
"grad_norm": 1.9570258855819702,
"learning_rate": 1.979987965750274e-08,
"loss": 0.3455,
"step": 1454
},
{
"epoch": 2.9275653923541247,
"grad_norm": 2.058948040008545,
"learning_rate": 1.8772136553918408e-08,
"loss": 0.331,
"step": 1455
},
{
"epoch": 2.9295774647887325,
"grad_norm": 2.0447700023651123,
"learning_rate": 1.7771732184357905e-08,
"loss": 0.3452,
"step": 1456
},
{
"epoch": 2.9315895372233403,
"grad_norm": 2.081425428390503,
"learning_rate": 1.679867203939256e-08,
"loss": 0.3349,
"step": 1457
},
{
"epoch": 2.9336016096579476,
"grad_norm": 1.8499003648757935,
"learning_rate": 1.5852961459519868e-08,
"loss": 0.2985,
"step": 1458
},
{
"epoch": 2.9356136820925554,
"grad_norm": 1.9242502450942993,
"learning_rate": 1.4934605635132383e-08,
"loss": 0.3154,
"step": 1459
},
{
"epoch": 2.937625754527163,
"grad_norm": 1.93385648727417,
"learning_rate": 1.4043609606489983e-08,
"loss": 0.299,
"step": 1460
},
{
"epoch": 2.9396378269617705,
"grad_norm": 1.9221806526184082,
"learning_rate": 1.3179978263694326e-08,
"loss": 0.3123,
"step": 1461
},
{
"epoch": 2.941649899396378,
"grad_norm": 2.0381548404693604,
"learning_rate": 1.2343716346657209e-08,
"loss": 0.3252,
"step": 1462
},
{
"epoch": 2.943661971830986,
"grad_norm": 1.926993489265442,
"learning_rate": 1.1534828445080027e-08,
"loss": 0.3015,
"step": 1463
},
{
"epoch": 2.9456740442655933,
"grad_norm": 1.840437412261963,
"learning_rate": 1.0753318998423246e-08,
"loss": 0.3145,
"step": 1464
},
{
"epoch": 2.947686116700201,
"grad_norm": 1.8412384986877441,
"learning_rate": 9.999192295886973e-09,
"loss": 0.3011,
"step": 1465
},
{
"epoch": 2.949698189134809,
"grad_norm": 1.8052462339401245,
"learning_rate": 9.272452476384308e-09,
"loss": 0.282,
"step": 1466
},
{
"epoch": 2.9517102615694166,
"grad_norm": 1.972144603729248,
"learning_rate": 8.5731035285197e-09,
"loss": 0.3058,
"step": 1467
},
{
"epoch": 2.9537223340040244,
"grad_norm": 1.8558961153030396,
"learning_rate": 7.90114929056618e-09,
"loss": 0.3157,
"step": 1468
},
{
"epoch": 2.9557344064386317,
"grad_norm": 2.027374029159546,
"learning_rate": 7.256593450444827e-09,
"loss": 0.3317,
"step": 1469
},
{
"epoch": 2.9577464788732395,
"grad_norm": 1.951428771018982,
"learning_rate": 6.639439545707005e-09,
"loss": 0.3011,
"step": 1470
},
{
"epoch": 2.9597585513078473,
"grad_norm": 1.7829604148864746,
"learning_rate": 6.04969096350938e-09,
"loss": 0.2914,
"step": 1471
},
{
"epoch": 2.9617706237424546,
"grad_norm": 1.9716224670410156,
"learning_rate": 5.487350940600044e-09,
"loss": 0.3362,
"step": 1472
},
{
"epoch": 2.9637826961770624,
"grad_norm": 2.0185909271240234,
"learning_rate": 4.952422563300197e-09,
"loss": 0.2978,
"step": 1473
},
{
"epoch": 2.96579476861167,
"grad_norm": 1.8780219554901123,
"learning_rate": 4.444908767484712e-09,
"loss": 0.2806,
"step": 1474
},
{
"epoch": 2.9678068410462775,
"grad_norm": 1.9690665006637573,
"learning_rate": 3.964812338567714e-09,
"loss": 0.3135,
"step": 1475
},
{
"epoch": 2.9698189134808852,
"grad_norm": 1.9470399618148804,
"learning_rate": 3.5121359114886898e-09,
"loss": 0.318,
"step": 1476
},
{
"epoch": 2.971830985915493,
"grad_norm": 1.9868518114089966,
"learning_rate": 3.0868819706947327e-09,
"loss": 0.3146,
"step": 1477
},
{
"epoch": 2.9738430583501008,
"grad_norm": 1.8380684852600098,
"learning_rate": 2.6890528501288814e-09,
"loss": 0.2889,
"step": 1478
},
{
"epoch": 2.975855130784708,
"grad_norm": 1.9578129053115845,
"learning_rate": 2.3186507332184636e-09,
"loss": 0.3003,
"step": 1479
},
{
"epoch": 2.977867203219316,
"grad_norm": 1.9266585111618042,
"learning_rate": 1.9756776528601085e-09,
"loss": 0.324,
"step": 1480
},
{
"epoch": 2.9798792756539236,
"grad_norm": 1.8439252376556396,
"learning_rate": 1.660135491411974e-09,
"loss": 0.3137,
"step": 1481
},
{
"epoch": 2.9818913480885314,
"grad_norm": 1.9398472309112549,
"learning_rate": 1.3720259806793146e-09,
"loss": 0.2902,
"step": 1482
},
{
"epoch": 2.9839034205231387,
"grad_norm": 1.9513535499572754,
"learning_rate": 1.111350701909486e-09,
"loss": 0.3255,
"step": 1483
},
{
"epoch": 2.9859154929577465,
"grad_norm": 1.8612955808639526,
"learning_rate": 8.781110857802866e-10,
"loss": 0.3181,
"step": 1484
},
{
"epoch": 2.9879275653923543,
"grad_norm": 1.9021211862564087,
"learning_rate": 6.723084123921864e-10,
"loss": 0.297,
"step": 1485
},
{
"epoch": 2.9899396378269616,
"grad_norm": 1.824216604232788,
"learning_rate": 4.939438112638861e-10,
"loss": 0.3023,
"step": 1486
},
{
"epoch": 2.9919517102615694,
"grad_norm": 1.8582955598831177,
"learning_rate": 3.430182613223254e-10,
"loss": 0.3129,
"step": 1487
},
{
"epoch": 2.993963782696177,
"grad_norm": 1.8397884368896484,
"learning_rate": 2.1953259090101708e-10,
"loss": 0.3229,
"step": 1488
},
{
"epoch": 2.9959758551307845,
"grad_norm": 1.9026719331741333,
"learning_rate": 1.2348747773172075e-10,
"loss": 0.2996,
"step": 1489
},
{
"epoch": 2.9979879275653922,
"grad_norm": 1.9118090867996216,
"learning_rate": 5.488344894444275e-11,
"loss": 0.3106,
"step": 1490
},
{
"epoch": 3.0,
"grad_norm": 1.7826815843582153,
"learning_rate": 1.3720881062440073e-11,
"loss": 0.2744,
"step": 1491
}
],
"logging_steps": 1,
"max_steps": 1491,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.5071875383559193e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}