{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.091220068415051,
"eval_steps": 500,
"global_step": 900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004561003420752566,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 2.0969,
"step": 1
},
{
"epoch": 0.009122006841505131,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 2.1202,
"step": 2
},
{
"epoch": 0.013683010262257697,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 1.9863,
"step": 3
},
{
"epoch": 0.018244013683010263,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 2.1056,
"step": 4
},
{
"epoch": 0.02280501710376283,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 2.1682,
"step": 5
},
{
"epoch": 0.027366020524515394,
"grad_norm": 126.99530029296875,
"learning_rate": 0.0,
"loss": 2.16,
"step": 6
},
{
"epoch": 0.03192702394526796,
"grad_norm": 136.298583984375,
"learning_rate": 3.0303030303030305e-06,
"loss": 2.1097,
"step": 7
},
{
"epoch": 0.036488027366020526,
"grad_norm": 118.9027328491211,
"learning_rate": 6.060606060606061e-06,
"loss": 2.0874,
"step": 8
},
{
"epoch": 0.04104903078677309,
"grad_norm": 95.98336791992188,
"learning_rate": 9.090909090909091e-06,
"loss": 2.0198,
"step": 9
},
{
"epoch": 0.04561003420752566,
"grad_norm": 91.16958618164062,
"learning_rate": 1.2121212121212122e-05,
"loss": 1.9522,
"step": 10
},
{
"epoch": 0.05017103762827822,
"grad_norm": 66.85772705078125,
"learning_rate": 1.5151515151515153e-05,
"loss": 1.6778,
"step": 11
},
{
"epoch": 0.05473204104903079,
"grad_norm": 53.472843170166016,
"learning_rate": 1.8181818181818182e-05,
"loss": 1.4714,
"step": 12
},
{
"epoch": 0.059293044469783354,
"grad_norm": 49.19029235839844,
"learning_rate": 2.1212121212121215e-05,
"loss": 1.294,
"step": 13
},
{
"epoch": 0.06385404789053592,
"grad_norm": 50.0140266418457,
"learning_rate": 2.4242424242424244e-05,
"loss": 1.4804,
"step": 14
},
{
"epoch": 0.06841505131128849,
"grad_norm": 46.8694953918457,
"learning_rate": 2.7272727272727273e-05,
"loss": 1.1335,
"step": 15
},
{
"epoch": 0.07297605473204105,
"grad_norm": 43.30156326293945,
"learning_rate": 3.0303030303030306e-05,
"loss": 1.2229,
"step": 16
},
{
"epoch": 0.07753705815279362,
"grad_norm": 51.07203674316406,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.2547,
"step": 17
},
{
"epoch": 0.08209806157354618,
"grad_norm": 62.249114990234375,
"learning_rate": 3.6363636363636364e-05,
"loss": 1.0567,
"step": 18
},
{
"epoch": 0.08665906499429875,
"grad_norm": 47.2119026184082,
"learning_rate": 3.939393939393939e-05,
"loss": 0.9491,
"step": 19
},
{
"epoch": 0.09122006841505131,
"grad_norm": 42.218753814697266,
"learning_rate": 4.242424242424243e-05,
"loss": 0.8794,
"step": 20
},
{
"epoch": 0.09578107183580388,
"grad_norm": 42.218753814697266,
"learning_rate": 4.545454545454546e-05,
"loss": 1.1536,
"step": 21
},
{
"epoch": 0.10034207525655645,
"grad_norm": 44.523101806640625,
"learning_rate": 4.545454545454546e-05,
"loss": 0.7646,
"step": 22
},
{
"epoch": 0.10490307867730901,
"grad_norm": 63.81179428100586,
"learning_rate": 4.848484848484849e-05,
"loss": 0.9237,
"step": 23
},
{
"epoch": 0.10946408209806158,
"grad_norm": 46.185333251953125,
"learning_rate": 5.151515151515152e-05,
"loss": 0.9475,
"step": 24
},
{
"epoch": 0.11402508551881414,
"grad_norm": 50.09680938720703,
"learning_rate": 5.4545454545454546e-05,
"loss": 0.6559,
"step": 25
},
{
"epoch": 0.11858608893956671,
"grad_norm": 57.93541717529297,
"learning_rate": 5.757575757575758e-05,
"loss": 0.9655,
"step": 26
},
{
"epoch": 0.12314709236031927,
"grad_norm": 44.12418746948242,
"learning_rate": 6.060606060606061e-05,
"loss": 0.991,
"step": 27
},
{
"epoch": 0.12770809578107184,
"grad_norm": 55.63926315307617,
"learning_rate": 6.363636363636364e-05,
"loss": 0.9469,
"step": 28
},
{
"epoch": 0.1322690992018244,
"grad_norm": 52.04874038696289,
"learning_rate": 6.666666666666667e-05,
"loss": 0.765,
"step": 29
},
{
"epoch": 0.13683010262257697,
"grad_norm": 78.61589813232422,
"learning_rate": 6.96969696969697e-05,
"loss": 0.8077,
"step": 30
},
{
"epoch": 0.14139110604332952,
"grad_norm": 78.61589813232422,
"learning_rate": 7.272727272727273e-05,
"loss": 0.6946,
"step": 31
},
{
"epoch": 0.1459521094640821,
"grad_norm": 45.660404205322266,
"learning_rate": 7.272727272727273e-05,
"loss": 0.8566,
"step": 32
},
{
"epoch": 0.15051311288483465,
"grad_norm": 45.660404205322266,
"learning_rate": 7.575757575757576e-05,
"loss": 0.9645,
"step": 33
},
{
"epoch": 0.15507411630558723,
"grad_norm": 45.660404205322266,
"learning_rate": 7.575757575757576e-05,
"loss": 0.8577,
"step": 34
},
{
"epoch": 0.15963511972633979,
"grad_norm": 44.082706451416016,
"learning_rate": 7.575757575757576e-05,
"loss": 0.6715,
"step": 35
},
{
"epoch": 0.16419612314709237,
"grad_norm": 723.3299560546875,
"learning_rate": 7.878787878787879e-05,
"loss": 0.9595,
"step": 36
},
{
"epoch": 0.16875712656784492,
"grad_norm": 102.72968292236328,
"learning_rate": 8.181818181818183e-05,
"loss": 0.8705,
"step": 37
},
{
"epoch": 0.1733181299885975,
"grad_norm": 51.577972412109375,
"learning_rate": 8.484848484848486e-05,
"loss": 0.9124,
"step": 38
},
{
"epoch": 0.17787913340935005,
"grad_norm": 79.64832305908203,
"learning_rate": 8.787878787878789e-05,
"loss": 0.8608,
"step": 39
},
{
"epoch": 0.18244013683010263,
"grad_norm": 74.03942108154297,
"learning_rate": 9.090909090909092e-05,
"loss": 0.7678,
"step": 40
},
{
"epoch": 0.18700114025085518,
"grad_norm": 75.6192855834961,
"learning_rate": 9.393939393939395e-05,
"loss": 0.8841,
"step": 41
},
{
"epoch": 0.19156214367160776,
"grad_norm": 151.26239013671875,
"learning_rate": 9.696969696969698e-05,
"loss": 0.6354,
"step": 42
},
{
"epoch": 0.1961231470923603,
"grad_norm": 63.19050598144531,
"learning_rate": 0.0001,
"loss": 1.0635,
"step": 43
},
{
"epoch": 0.2006841505131129,
"grad_norm": 69.78765869140625,
"learning_rate": 9.999978327420663e-05,
"loss": 0.5772,
"step": 44
},
{
"epoch": 0.20524515393386544,
"grad_norm": 74.76192474365234,
"learning_rate": 9.99991330987053e-05,
"loss": 0.8419,
"step": 45
},
{
"epoch": 0.20980615735461802,
"grad_norm": 65.3372802734375,
"learning_rate": 9.999804947913241e-05,
"loss": 0.7743,
"step": 46
},
{
"epoch": 0.21436716077537057,
"grad_norm": 84.05085754394531,
"learning_rate": 9.999653242488188e-05,
"loss": 0.8496,
"step": 47
},
{
"epoch": 0.21892816419612315,
"grad_norm": 45.334293365478516,
"learning_rate": 9.999458194910512e-05,
"loss": 0.779,
"step": 48
},
{
"epoch": 0.2234891676168757,
"grad_norm": 59.37651443481445,
"learning_rate": 9.999219806871085e-05,
"loss": 0.7776,
"step": 49
},
{
"epoch": 0.22805017103762829,
"grad_norm": 44.242713928222656,
"learning_rate": 9.998938080436503e-05,
"loss": 0.7422,
"step": 50
},
{
"epoch": 0.23261117445838084,
"grad_norm": 76.85882568359375,
"learning_rate": 9.998613018049059e-05,
"loss": 0.8527,
"step": 51
},
{
"epoch": 0.23717217787913342,
"grad_norm": 53.421348571777344,
"learning_rate": 9.99824462252673e-05,
"loss": 0.6793,
"step": 52
},
{
"epoch": 0.24173318129988597,
"grad_norm": 74.96648406982422,
"learning_rate": 9.997832897063148e-05,
"loss": 0.645,
"step": 53
},
{
"epoch": 0.24629418472063855,
"grad_norm": 36.90277099609375,
"learning_rate": 9.997377845227576e-05,
"loss": 0.6156,
"step": 54
},
{
"epoch": 0.2508551881413911,
"grad_norm": 45.15210723876953,
"learning_rate": 9.996879470964868e-05,
"loss": 0.689,
"step": 55
},
{
"epoch": 0.2554161915621437,
"grad_norm": 84.45231628417969,
"learning_rate": 9.996337778595453e-05,
"loss": 1.1516,
"step": 56
},
{
"epoch": 0.25997719498289623,
"grad_norm": 74.82112121582031,
"learning_rate": 9.995752772815274e-05,
"loss": 0.7793,
"step": 57
},
{
"epoch": 0.2645381984036488,
"grad_norm": 46.81865692138672,
"learning_rate": 9.995124458695768e-05,
"loss": 0.677,
"step": 58
},
{
"epoch": 0.2690992018244014,
"grad_norm": 50.14031219482422,
"learning_rate": 9.994452841683808e-05,
"loss": 0.7934,
"step": 59
},
{
"epoch": 0.27366020524515394,
"grad_norm": 70.68441772460938,
"learning_rate": 9.993737927601663e-05,
"loss": 0.694,
"step": 60
},
{
"epoch": 0.2782212086659065,
"grad_norm": 41.65163040161133,
"learning_rate": 9.992979722646948e-05,
"loss": 0.6657,
"step": 61
},
{
"epoch": 0.28278221208665905,
"grad_norm": 43.47587203979492,
"learning_rate": 9.992178233392564e-05,
"loss": 0.6069,
"step": 62
},
{
"epoch": 0.28734321550741165,
"grad_norm": 46.3960075378418,
"learning_rate": 9.991333466786648e-05,
"loss": 0.7959,
"step": 63
},
{
"epoch": 0.2919042189281642,
"grad_norm": 36.781761169433594,
"learning_rate": 9.990445430152507e-05,
"loss": 0.6845,
"step": 64
},
{
"epoch": 0.29646522234891676,
"grad_norm": 47.75035095214844,
"learning_rate": 9.989514131188559e-05,
"loss": 0.9177,
"step": 65
},
{
"epoch": 0.3010262257696693,
"grad_norm": 45.957950592041016,
"learning_rate": 9.988539577968265e-05,
"loss": 0.7967,
"step": 66
},
{
"epoch": 0.3055872291904219,
"grad_norm": 26.79014015197754,
"learning_rate": 9.987521778940058e-05,
"loss": 0.4885,
"step": 67
},
{
"epoch": 0.31014823261117447,
"grad_norm": 46.98130416870117,
"learning_rate": 9.986460742927271e-05,
"loss": 0.7922,
"step": 68
},
{
"epoch": 0.314709236031927,
"grad_norm": 29.98710823059082,
"learning_rate": 9.985356479128056e-05,
"loss": 0.6239,
"step": 69
},
{
"epoch": 0.31927023945267957,
"grad_norm": 29.05475616455078,
"learning_rate": 9.984208997115312e-05,
"loss": 0.5977,
"step": 70
},
{
"epoch": 0.3238312428734322,
"grad_norm": 30.07978057861328,
"learning_rate": 9.9830183068366e-05,
"loss": 0.6691,
"step": 71
},
{
"epoch": 0.32839224629418473,
"grad_norm": 44.7830810546875,
"learning_rate": 9.981784418614048e-05,
"loss": 0.8664,
"step": 72
},
{
"epoch": 0.3329532497149373,
"grad_norm": 36.961578369140625,
"learning_rate": 9.980507343144273e-05,
"loss": 0.6482,
"step": 73
},
{
"epoch": 0.33751425313568983,
"grad_norm": 40.433258056640625,
"learning_rate": 9.979187091498284e-05,
"loss": 0.7933,
"step": 74
},
{
"epoch": 0.34207525655644244,
"grad_norm": 21.32088279724121,
"learning_rate": 9.977823675121383e-05,
"loss": 0.562,
"step": 75
},
{
"epoch": 0.346636259977195,
"grad_norm": 36.999900817871094,
"learning_rate": 9.97641710583307e-05,
"loss": 0.7949,
"step": 76
},
{
"epoch": 0.35119726339794755,
"grad_norm": 34.943058013916016,
"learning_rate": 9.974967395826941e-05,
"loss": 0.6669,
"step": 77
},
{
"epoch": 0.3557582668187001,
"grad_norm": 40.3181037902832,
"learning_rate": 9.973474557670575e-05,
"loss": 0.65,
"step": 78
},
{
"epoch": 0.3603192702394527,
"grad_norm": 26.141529083251953,
"learning_rate": 9.971938604305435e-05,
"loss": 0.5017,
"step": 79
},
{
"epoch": 0.36488027366020526,
"grad_norm": 27.214019775390625,
"learning_rate": 9.970359549046749e-05,
"loss": 0.5175,
"step": 80
},
{
"epoch": 0.3694412770809578,
"grad_norm": 30.315534591674805,
"learning_rate": 9.968737405583396e-05,
"loss": 0.6422,
"step": 81
},
{
"epoch": 0.37400228050171036,
"grad_norm": 38.341400146484375,
"learning_rate": 9.967072187977795e-05,
"loss": 0.5456,
"step": 82
},
{
"epoch": 0.37856328392246297,
"grad_norm": 24.291261672973633,
"learning_rate": 9.965363910665761e-05,
"loss": 0.512,
"step": 83
},
{
"epoch": 0.3831242873432155,
"grad_norm": 27.774852752685547,
"learning_rate": 9.963612588456412e-05,
"loss": 0.5651,
"step": 84
},
{
"epoch": 0.38768529076396807,
"grad_norm": 28.017744064331055,
"learning_rate": 9.961818236532012e-05,
"loss": 0.5831,
"step": 85
},
{
"epoch": 0.3922462941847206,
"grad_norm": 52.88800811767578,
"learning_rate": 9.959980870447854e-05,
"loss": 0.609,
"step": 86
},
{
"epoch": 0.39680729760547323,
"grad_norm": 41.79977035522461,
"learning_rate": 9.958100506132127e-05,
"loss": 0.9048,
"step": 87
},
{
"epoch": 0.4013683010262258,
"grad_norm": 22.837493896484375,
"learning_rate": 9.956177159885765e-05,
"loss": 0.526,
"step": 88
},
{
"epoch": 0.40592930444697833,
"grad_norm": 32.65010070800781,
"learning_rate": 9.954210848382318e-05,
"loss": 0.7481,
"step": 89
},
{
"epoch": 0.4104903078677309,
"grad_norm": 27.46462631225586,
"learning_rate": 9.952201588667804e-05,
"loss": 0.56,
"step": 90
},
{
"epoch": 0.4150513112884835,
"grad_norm": 24.279617309570312,
"learning_rate": 9.950149398160562e-05,
"loss": 0.568,
"step": 91
},
{
"epoch": 0.41961231470923605,
"grad_norm": 27.96613311767578,
"learning_rate": 9.94805429465109e-05,
"loss": 0.5389,
"step": 92
},
{
"epoch": 0.4241733181299886,
"grad_norm": 29.904308319091797,
"learning_rate": 9.945916296301913e-05,
"loss": 0.6791,
"step": 93
},
{
"epoch": 0.42873432155074115,
"grad_norm": 28.36408042907715,
"learning_rate": 9.943735421647404e-05,
"loss": 0.6191,
"step": 94
},
{
"epoch": 0.43329532497149376,
"grad_norm": 28.37791633605957,
"learning_rate": 9.941511689593633e-05,
"loss": 0.6578,
"step": 95
},
{
"epoch": 0.4378563283922463,
"grad_norm": 29.4257869720459,
"learning_rate": 9.939245119418207e-05,
"loss": 0.7219,
"step": 96
},
{
"epoch": 0.44241733181299886,
"grad_norm": 31.684606552124023,
"learning_rate": 9.936935730770093e-05,
"loss": 0.7971,
"step": 97
},
{
"epoch": 0.4469783352337514,
"grad_norm": 41.304290771484375,
"learning_rate": 9.934583543669453e-05,
"loss": 0.5916,
"step": 98
},
{
"epoch": 0.45153933865450396,
"grad_norm": 45.91229248046875,
"learning_rate": 9.932188578507476e-05,
"loss": 0.6905,
"step": 99
},
{
"epoch": 0.45610034207525657,
"grad_norm": 18.211179733276367,
"learning_rate": 9.929750856046187e-05,
"loss": 0.4074,
"step": 100
},
{
"epoch": 0.4606613454960091,
"grad_norm": 22.817419052124023,
"learning_rate": 9.92727039741828e-05,
"loss": 0.5474,
"step": 101
},
{
"epoch": 0.4652223489167617,
"grad_norm": 17.524913787841797,
"learning_rate": 9.924747224126932e-05,
"loss": 0.3943,
"step": 102
},
{
"epoch": 0.4697833523375142,
"grad_norm": 29.555734634399414,
"learning_rate": 9.922181358045607e-05,
"loss": 0.4851,
"step": 103
},
{
"epoch": 0.47434435575826683,
"grad_norm": 17.645509719848633,
"learning_rate": 9.919572821417886e-05,
"loss": 0.4485,
"step": 104
},
{
"epoch": 0.4789053591790194,
"grad_norm": 27.566722869873047,
"learning_rate": 9.916921636857253e-05,
"loss": 0.5754,
"step": 105
},
{
"epoch": 0.48346636259977194,
"grad_norm": 23.578996658325195,
"learning_rate": 9.91422782734691e-05,
"loss": 0.5258,
"step": 106
},
{
"epoch": 0.4880273660205245,
"grad_norm": 82.11543273925781,
"learning_rate": 9.911491416239578e-05,
"loss": 0.6878,
"step": 107
},
{
"epoch": 0.4925883694412771,
"grad_norm": 23.49557876586914,
"learning_rate": 9.908712427257291e-05,
"loss": 0.6356,
"step": 108
},
{
"epoch": 0.49714937286202965,
"grad_norm": 35.3658447265625,
"learning_rate": 9.905890884491195e-05,
"loss": 0.5214,
"step": 109
},
{
"epoch": 0.5017103762827823,
"grad_norm": 31.83234214782715,
"learning_rate": 9.903026812401333e-05,
"loss": 0.4928,
"step": 110
},
{
"epoch": 0.5062713797035348,
"grad_norm": 27.706256866455078,
"learning_rate": 9.900120235816435e-05,
"loss": 0.7318,
"step": 111
},
{
"epoch": 0.5108323831242874,
"grad_norm": 34.21133041381836,
"learning_rate": 9.897171179933707e-05,
"loss": 0.4351,
"step": 112
},
{
"epoch": 0.5153933865450399,
"grad_norm": 27.63327407836914,
"learning_rate": 9.894179670318606e-05,
"loss": 0.4016,
"step": 113
},
{
"epoch": 0.5199543899657925,
"grad_norm": 19.61988639831543,
"learning_rate": 9.891145732904627e-05,
"loss": 0.4805,
"step": 114
},
{
"epoch": 0.5245153933865451,
"grad_norm": 32.44860076904297,
"learning_rate": 9.88806939399307e-05,
"loss": 0.6809,
"step": 115
},
{
"epoch": 0.5290763968072976,
"grad_norm": 32.945152282714844,
"learning_rate": 9.884950680252811e-05,
"loss": 0.7838,
"step": 116
},
{
"epoch": 0.5336374002280502,
"grad_norm": 26.530765533447266,
"learning_rate": 9.881789618720081e-05,
"loss": 0.6608,
"step": 117
},
{
"epoch": 0.5381984036488028,
"grad_norm": 29.559206008911133,
"learning_rate": 9.878586236798222e-05,
"loss": 0.6232,
"step": 118
},
{
"epoch": 0.5427594070695553,
"grad_norm": 18.236879348754883,
"learning_rate": 9.875340562257453e-05,
"loss": 0.4325,
"step": 119
},
{
"epoch": 0.5473204104903079,
"grad_norm": 16.536705017089844,
"learning_rate": 9.872052623234632e-05,
"loss": 0.3459,
"step": 120
},
{
"epoch": 0.5518814139110604,
"grad_norm": 32.01934051513672,
"learning_rate": 9.868722448233004e-05,
"loss": 0.6023,
"step": 121
},
{
"epoch": 0.556442417331813,
"grad_norm": 19.95067024230957,
"learning_rate": 9.865350066121961e-05,
"loss": 0.4983,
"step": 122
},
{
"epoch": 0.5610034207525656,
"grad_norm": 17.09720802307129,
"learning_rate": 9.861935506136793e-05,
"loss": 0.5208,
"step": 123
},
{
"epoch": 0.5655644241733181,
"grad_norm": 40.02044677734375,
"learning_rate": 9.85847879787843e-05,
"loss": 0.4965,
"step": 124
},
{
"epoch": 0.5701254275940707,
"grad_norm": 24.024381637573242,
"learning_rate": 9.854979971313182e-05,
"loss": 0.6857,
"step": 125
},
{
"epoch": 0.5746864310148233,
"grad_norm": 23.170162200927734,
"learning_rate": 9.85143905677249e-05,
"loss": 0.4628,
"step": 126
},
{
"epoch": 0.5792474344355758,
"grad_norm": 32.29564666748047,
"learning_rate": 9.847856084952653e-05,
"loss": 0.5976,
"step": 127
},
{
"epoch": 0.5838084378563284,
"grad_norm": 22.218772888183594,
"learning_rate": 9.844231086914571e-05,
"loss": 0.4436,
"step": 128
},
{
"epoch": 0.5883694412770809,
"grad_norm": 16.03778648376465,
"learning_rate": 9.84056409408346e-05,
"loss": 0.4686,
"step": 129
},
{
"epoch": 0.5929304446978335,
"grad_norm": 23.267412185668945,
"learning_rate": 9.836855138248605e-05,
"loss": 0.5099,
"step": 130
},
{
"epoch": 0.5974914481185861,
"grad_norm": 38.55447006225586,
"learning_rate": 9.833104251563056e-05,
"loss": 0.6403,
"step": 131
},
{
"epoch": 0.6020524515393386,
"grad_norm": 23.0700740814209,
"learning_rate": 9.829311466543373e-05,
"loss": 0.4528,
"step": 132
},
{
"epoch": 0.6066134549600912,
"grad_norm": 18.824100494384766,
"learning_rate": 9.825476816069326e-05,
"loss": 0.5267,
"step": 133
},
{
"epoch": 0.6111744583808438,
"grad_norm": 30.157243728637695,
"learning_rate": 9.821600333383625e-05,
"loss": 0.5274,
"step": 134
},
{
"epoch": 0.6157354618015963,
"grad_norm": 17.902389526367188,
"learning_rate": 9.817682052091618e-05,
"loss": 0.4983,
"step": 135
},
{
"epoch": 0.6202964652223489,
"grad_norm": 23.980289459228516,
"learning_rate": 9.813722006161013e-05,
"loss": 0.5766,
"step": 136
},
{
"epoch": 0.6248574686431014,
"grad_norm": 18.97156524658203,
"learning_rate": 9.809720229921572e-05,
"loss": 0.4777,
"step": 137
},
{
"epoch": 0.629418472063854,
"grad_norm": 21.262067794799805,
"learning_rate": 9.805676758064821e-05,
"loss": 0.6721,
"step": 138
},
{
"epoch": 0.6339794754846066,
"grad_norm": 17.919252395629883,
"learning_rate": 9.801591625643745e-05,
"loss": 0.3766,
"step": 139
},
{
"epoch": 0.6385404789053591,
"grad_norm": 14.64062213897705,
"learning_rate": 9.797464868072488e-05,
"loss": 0.4098,
"step": 140
},
{
"epoch": 0.6431014823261118,
"grad_norm": 24.484783172607422,
"learning_rate": 9.79329652112604e-05,
"loss": 0.6009,
"step": 141
},
{
"epoch": 0.6476624857468644,
"grad_norm": 23.56806182861328,
"learning_rate": 9.789086620939936e-05,
"loss": 0.5756,
"step": 142
},
{
"epoch": 0.6522234891676169,
"grad_norm": 35.71012496948242,
"learning_rate": 9.784835204009932e-05,
"loss": 0.6669,
"step": 143
},
{
"epoch": 0.6567844925883695,
"grad_norm": 25.027629852294922,
"learning_rate": 9.780542307191698e-05,
"loss": 0.7502,
"step": 144
},
{
"epoch": 0.661345496009122,
"grad_norm": 20.506362915039062,
"learning_rate": 9.77620796770049e-05,
"loss": 0.5575,
"step": 145
},
{
"epoch": 0.6659064994298746,
"grad_norm": 18.51320457458496,
"learning_rate": 9.771832223110839e-05,
"loss": 0.4005,
"step": 146
},
{
"epoch": 0.6704675028506272,
"grad_norm": 17.1466007232666,
"learning_rate": 9.76741511135621e-05,
"loss": 0.513,
"step": 147
},
{
"epoch": 0.6750285062713797,
"grad_norm": 17.755672454833984,
"learning_rate": 9.762956670728685e-05,
"loss": 0.5208,
"step": 148
},
{
"epoch": 0.6795895096921323,
"grad_norm": 17.082569122314453,
"learning_rate": 9.758456939878629e-05,
"loss": 0.5182,
"step": 149
},
{
"epoch": 0.6841505131128849,
"grad_norm": 19.6417179107666,
"learning_rate": 9.753915957814352e-05,
"loss": 0.6026,
"step": 150
},
{
"epoch": 0.6887115165336374,
"grad_norm": 16.423038482666016,
"learning_rate": 9.74933376390177e-05,
"loss": 0.4535,
"step": 151
},
{
"epoch": 0.69327251995439,
"grad_norm": 12.587494850158691,
"learning_rate": 9.744710397864067e-05,
"loss": 0.3239,
"step": 152
},
{
"epoch": 0.6978335233751425,
"grad_norm": 16.246538162231445,
"learning_rate": 9.740045899781352e-05,
"loss": 0.4221,
"step": 153
},
{
"epoch": 0.7023945267958951,
"grad_norm": 20.31894302368164,
"learning_rate": 9.735340310090307e-05,
"loss": 0.3357,
"step": 154
},
{
"epoch": 0.7069555302166477,
"grad_norm": 39.6623420715332,
"learning_rate": 9.730593669583836e-05,
"loss": 0.5047,
"step": 155
},
{
"epoch": 0.7115165336374002,
"grad_norm": 16.25356101989746,
"learning_rate": 9.725806019410717e-05,
"loss": 0.5985,
"step": 156
},
{
"epoch": 0.7160775370581528,
"grad_norm": 18.692333221435547,
"learning_rate": 9.720977401075242e-05,
"loss": 0.4652,
"step": 157
},
{
"epoch": 0.7206385404789054,
"grad_norm": 15.554421424865723,
"learning_rate": 9.716107856436855e-05,
"loss": 0.3915,
"step": 158
},
{
"epoch": 0.7251995438996579,
"grad_norm": 14.215270042419434,
"learning_rate": 9.711197427709796e-05,
"loss": 0.4865,
"step": 159
},
{
"epoch": 0.7297605473204105,
"grad_norm": 21.14404296875,
"learning_rate": 9.706246157462726e-05,
"loss": 0.4058,
"step": 160
},
{
"epoch": 0.734321550741163,
"grad_norm": 24.879043579101562,
"learning_rate": 9.701254088618362e-05,
"loss": 0.4697,
"step": 161
},
{
"epoch": 0.7388825541619156,
"grad_norm": 20.374792098999023,
"learning_rate": 9.696221264453109e-05,
"loss": 0.3389,
"step": 162
},
{
"epoch": 0.7434435575826682,
"grad_norm": 21.55912208557129,
"learning_rate": 9.69114772859668e-05,
"loss": 0.5669,
"step": 163
},
{
"epoch": 0.7480045610034207,
"grad_norm": 13.084688186645508,
"learning_rate": 9.686033525031719e-05,
"loss": 0.3422,
"step": 164
},
{
"epoch": 0.7525655644241733,
"grad_norm": 16.64484405517578,
"learning_rate": 9.680878698093417e-05,
"loss": 0.5166,
"step": 165
},
{
"epoch": 0.7571265678449259,
"grad_norm": 18.273216247558594,
"learning_rate": 9.675683292469132e-05,
"loss": 0.5568,
"step": 166
},
{
"epoch": 0.7616875712656784,
"grad_norm": 26.35822868347168,
"learning_rate": 9.670447353198e-05,
"loss": 0.6115,
"step": 167
},
{
"epoch": 0.766248574686431,
"grad_norm": 18.36164665222168,
"learning_rate": 9.665170925670548e-05,
"loss": 0.3441,
"step": 168
},
{
"epoch": 0.7708095781071835,
"grad_norm": 17.316587448120117,
"learning_rate": 9.659854055628291e-05,
"loss": 0.451,
"step": 169
},
{
"epoch": 0.7753705815279361,
"grad_norm": 23.489328384399414,
"learning_rate": 9.654496789163345e-05,
"loss": 0.5535,
"step": 170
},
{
"epoch": 0.7799315849486887,
"grad_norm": 19.27407455444336,
"learning_rate": 9.649099172718021e-05,
"loss": 0.5019,
"step": 171
},
{
"epoch": 0.7844925883694412,
"grad_norm": 11.64967155456543,
"learning_rate": 9.643661253084431e-05,
"loss": 0.3258,
"step": 172
},
{
"epoch": 0.7890535917901939,
"grad_norm": 19.625473022460938,
"learning_rate": 9.638183077404069e-05,
"loss": 0.3288,
"step": 173
},
{
"epoch": 0.7936145952109465,
"grad_norm": 18.685232162475586,
"learning_rate": 9.632664693167416e-05,
"loss": 0.3644,
"step": 174
},
{
"epoch": 0.798175598631699,
"grad_norm": 26.637842178344727,
"learning_rate": 9.627106148213522e-05,
"loss": 0.644,
"step": 175
},
{
"epoch": 0.8027366020524516,
"grad_norm": 23.631614685058594,
"learning_rate": 9.621507490729585e-05,
"loss": 0.3727,
"step": 176
},
{
"epoch": 0.8072976054732041,
"grad_norm": 15.275004386901855,
"learning_rate": 9.615868769250546e-05,
"loss": 0.3924,
"step": 177
},
{
"epoch": 0.8118586088939567,
"grad_norm": 15.47768497467041,
"learning_rate": 9.610190032658663e-05,
"loss": 0.4487,
"step": 178
},
{
"epoch": 0.8164196123147093,
"grad_norm": 14.71181583404541,
"learning_rate": 9.604471330183083e-05,
"loss": 0.32,
"step": 179
},
{
"epoch": 0.8209806157354618,
"grad_norm": 9.841964721679688,
"learning_rate": 9.598712711399416e-05,
"loss": 0.2505,
"step": 180
},
{
"epoch": 0.8255416191562144,
"grad_norm": 14.165433883666992,
"learning_rate": 9.592914226229314e-05,
"loss": 0.4393,
"step": 181
},
{
"epoch": 0.830102622576967,
"grad_norm": 23.092065811157227,
"learning_rate": 9.587075924940028e-05,
"loss": 0.4625,
"step": 182
},
{
"epoch": 0.8346636259977195,
"grad_norm": 17.662593841552734,
"learning_rate": 9.581197858143978e-05,
"loss": 0.4665,
"step": 183
},
{
"epoch": 0.8392246294184721,
"grad_norm": 16.949716567993164,
"learning_rate": 9.575280076798309e-05,
"loss": 0.4784,
"step": 184
},
{
"epoch": 0.8437856328392246,
"grad_norm": 18.952489852905273,
"learning_rate": 9.569322632204458e-05,
"loss": 0.3888,
"step": 185
},
{
"epoch": 0.8483466362599772,
"grad_norm": 16.350954055786133,
"learning_rate": 9.563325576007701e-05,
"loss": 0.3935,
"step": 186
},
{
"epoch": 0.8529076396807298,
"grad_norm": 16.852394104003906,
"learning_rate": 9.557288960196707e-05,
"loss": 0.4025,
"step": 187
},
{
"epoch": 0.8574686431014823,
"grad_norm": 14.821525573730469,
"learning_rate": 9.551212837103092e-05,
"loss": 0.3752,
"step": 188
},
{
"epoch": 0.8620296465222349,
"grad_norm": 20.433399200439453,
"learning_rate": 9.545097259400958e-05,
"loss": 0.3219,
"step": 189
},
{
"epoch": 0.8665906499429875,
"grad_norm": 11.883235931396484,
"learning_rate": 9.538942280106443e-05,
"loss": 0.3892,
"step": 190
},
{
"epoch": 0.87115165336374,
"grad_norm": 14.423933029174805,
"learning_rate": 9.53274795257726e-05,
"loss": 0.3798,
"step": 191
},
{
"epoch": 0.8757126567844926,
"grad_norm": 15.010958671569824,
"learning_rate": 9.526514330512225e-05,
"loss": 0.3801,
"step": 192
},
{
"epoch": 0.8802736602052451,
"grad_norm": 21.800418853759766,
"learning_rate": 9.520241467950811e-05,
"loss": 0.4404,
"step": 193
},
{
"epoch": 0.8848346636259977,
"grad_norm": 15.904304504394531,
"learning_rate": 9.513929419272662e-05,
"loss": 0.3278,
"step": 194
},
{
"epoch": 0.8893956670467503,
"grad_norm": 10.985480308532715,
"learning_rate": 9.507578239197126e-05,
"loss": 0.2883,
"step": 195
},
{
"epoch": 0.8939566704675028,
"grad_norm": 10.487696647644043,
"learning_rate": 9.501187982782785e-05,
"loss": 0.2636,
"step": 196
},
{
"epoch": 0.8985176738882554,
"grad_norm": 19.759944915771484,
"learning_rate": 9.494758705426978e-05,
"loss": 0.3749,
"step": 197
},
{
"epoch": 0.9030786773090079,
"grad_norm": 17.322166442871094,
"learning_rate": 9.48829046286531e-05,
"loss": 0.4068,
"step": 198
},
{
"epoch": 0.9076396807297605,
"grad_norm": 15.3864107131958,
"learning_rate": 9.481783311171183e-05,
"loss": 0.3576,
"step": 199
},
{
"epoch": 0.9122006841505131,
"grad_norm": 13.966897964477539,
"learning_rate": 9.475237306755302e-05,
"loss": 0.4239,
"step": 200
},
{
"epoch": 0.9167616875712656,
"grad_norm": 14.596879005432129,
"learning_rate": 9.468652506365187e-05,
"loss": 0.3637,
"step": 201
},
{
"epoch": 0.9213226909920182,
"grad_norm": 20.099353790283203,
"learning_rate": 9.46202896708468e-05,
"loss": 0.5008,
"step": 202
},
{
"epoch": 0.9258836944127709,
"grad_norm": 14.773473739624023,
"learning_rate": 9.455366746333454e-05,
"loss": 0.3506,
"step": 203
},
{
"epoch": 0.9304446978335233,
"grad_norm": 18.689729690551758,
"learning_rate": 9.448665901866514e-05,
"loss": 0.4078,
"step": 204
},
{
"epoch": 0.935005701254276,
"grad_norm": 13.453817367553711,
"learning_rate": 9.441926491773691e-05,
"loss": 0.3253,
"step": 205
},
{
"epoch": 0.9395667046750285,
"grad_norm": 14.93052864074707,
"learning_rate": 9.435148574479144e-05,
"loss": 0.3576,
"step": 206
},
{
"epoch": 0.9441277080957811,
"grad_norm": 11.697999000549316,
"learning_rate": 9.428332208740857e-05,
"loss": 0.3115,
"step": 207
},
{
"epoch": 0.9486887115165337,
"grad_norm": 13.518777847290039,
"learning_rate": 9.421477453650118e-05,
"loss": 0.364,
"step": 208
},
{
"epoch": 0.9532497149372862,
"grad_norm": 10.434165000915527,
"learning_rate": 9.414584368631019e-05,
"loss": 0.2677,
"step": 209
},
{
"epoch": 0.9578107183580388,
"grad_norm": 16.765907287597656,
"learning_rate": 9.407653013439928e-05,
"loss": 0.5504,
"step": 210
},
{
"epoch": 0.9623717217787914,
"grad_norm": 10.962894439697266,
"learning_rate": 9.400683448164987e-05,
"loss": 0.2913,
"step": 211
},
{
"epoch": 0.9669327251995439,
"grad_norm": 27.222328186035156,
"learning_rate": 9.393675733225578e-05,
"loss": 0.6258,
"step": 212
},
{
"epoch": 0.9714937286202965,
"grad_norm": 17.89396095275879,
"learning_rate": 9.386629929371804e-05,
"loss": 0.3468,
"step": 213
},
{
"epoch": 0.976054732041049,
"grad_norm": 11.917913436889648,
"learning_rate": 9.379546097683962e-05,
"loss": 0.3384,
"step": 214
},
{
"epoch": 0.9806157354618016,
"grad_norm": 16.30259895324707,
"learning_rate": 9.372424299572013e-05,
"loss": 0.4395,
"step": 215
},
{
"epoch": 0.9851767388825542,
"grad_norm": 19.039505004882812,
"learning_rate": 9.365264596775051e-05,
"loss": 0.4235,
"step": 216
},
{
"epoch": 0.9897377423033067,
"grad_norm": 21.45336151123047,
"learning_rate": 9.35806705136077e-05,
"loss": 0.3146,
"step": 217
},
{
"epoch": 0.9942987457240593,
"grad_norm": 13.630745887756348,
"learning_rate": 9.350831725724916e-05,
"loss": 0.3927,
"step": 218
},
{
"epoch": 0.9988597491448119,
"grad_norm": 13.76926326751709,
"learning_rate": 9.343558682590756e-05,
"loss": 0.3581,
"step": 219
},
{
"epoch": 1.0,
"grad_norm": 14.069113731384277,
"learning_rate": 9.336247985008534e-05,
"loss": 0.2267,
"step": 220
},
{
"epoch": 1.0045610034207526,
"grad_norm": 9.834161758422852,
"learning_rate": 9.328899696354918e-05,
"loss": 0.2113,
"step": 221
},
{
"epoch": 1.0091220068415052,
"grad_norm": 8.437716484069824,
"learning_rate": 9.321513880332458e-05,
"loss": 0.2404,
"step": 222
},
{
"epoch": 1.0136830102622576,
"grad_norm": 10.78850269317627,
"learning_rate": 9.314090600969024e-05,
"loss": 0.1706,
"step": 223
},
{
"epoch": 1.0182440136830102,
"grad_norm": 10.366409301757812,
"learning_rate": 9.306629922617261e-05,
"loss": 0.2395,
"step": 224
},
{
"epoch": 1.0228050171037628,
"grad_norm": 14.871321678161621,
"learning_rate": 9.29913190995403e-05,
"loss": 0.2993,
"step": 225
},
{
"epoch": 1.0273660205245154,
"grad_norm": 12.021495819091797,
"learning_rate": 9.291596627979836e-05,
"loss": 0.2149,
"step": 226
},
{
"epoch": 1.031927023945268,
"grad_norm": 14.372687339782715,
"learning_rate": 9.284024142018281e-05,
"loss": 0.2743,
"step": 227
},
{
"epoch": 1.0364880273660204,
"grad_norm": 16.323156356811523,
"learning_rate": 9.276414517715484e-05,
"loss": 0.343,
"step": 228
},
{
"epoch": 1.041049030786773,
"grad_norm": 14.0962495803833,
"learning_rate": 9.268767821039521e-05,
"loss": 0.2017,
"step": 229
},
{
"epoch": 1.0456100342075256,
"grad_norm": 13.36721420288086,
"learning_rate": 9.261084118279847e-05,
"loss": 0.2844,
"step": 230
},
{
"epoch": 1.0501710376282782,
"grad_norm": 16.017093658447266,
"learning_rate": 9.253363476046725e-05,
"loss": 0.2139,
"step": 231
},
{
"epoch": 1.0547320410490308,
"grad_norm": 13.20304012298584,
"learning_rate": 9.245605961270649e-05,
"loss": 0.1957,
"step": 232
},
{
"epoch": 1.0592930444697835,
"grad_norm": 11.656867027282715,
"learning_rate": 9.23781164120176e-05,
"loss": 0.2862,
"step": 233
},
{
"epoch": 1.0638540478905358,
"grad_norm": 22.73741340637207,
"learning_rate": 9.229980583409266e-05,
"loss": 0.5163,
"step": 234
},
{
"epoch": 1.0684150513112884,
"grad_norm": 13.677047729492188,
"learning_rate": 9.222112855780856e-05,
"loss": 0.304,
"step": 235
},
{
"epoch": 1.072976054732041,
"grad_norm": 18.445669174194336,
"learning_rate": 9.214208526522109e-05,
"loss": 0.4152,
"step": 236
},
{
"epoch": 1.0775370581527937,
"grad_norm": 7.721029758453369,
"learning_rate": 9.206267664155907e-05,
"loss": 0.1688,
"step": 237
},
{
"epoch": 1.0820980615735463,
"grad_norm": 14.50108528137207,
"learning_rate": 9.198290337521838e-05,
"loss": 0.3409,
"step": 238
},
{
"epoch": 1.0866590649942987,
"grad_norm": 9.616985321044922,
"learning_rate": 9.190276615775599e-05,
"loss": 0.212,
"step": 239
},
{
"epoch": 1.0912200684150513,
"grad_norm": 10.23328685760498,
"learning_rate": 9.182226568388401e-05,
"loss": 0.2361,
"step": 240
},
{
"epoch": 1.0957810718358039,
"grad_norm": 8.819774627685547,
"learning_rate": 9.174140265146356e-05,
"loss": 0.2378,
"step": 241
},
{
"epoch": 1.1003420752565565,
"grad_norm": 9.800360679626465,
"learning_rate": 9.166017776149887e-05,
"loss": 0.1975,
"step": 242
},
{
"epoch": 1.104903078677309,
"grad_norm": 14.380069732666016,
"learning_rate": 9.157859171813107e-05,
"loss": 0.1747,
"step": 243
},
{
"epoch": 1.1094640820980617,
"grad_norm": 11.026459693908691,
"learning_rate": 9.149664522863217e-05,
"loss": 0.2154,
"step": 244
},
{
"epoch": 1.114025085518814,
"grad_norm": 14.39684009552002,
"learning_rate": 9.141433900339887e-05,
"loss": 0.2274,
"step": 245
},
{
"epoch": 1.1185860889395667,
"grad_norm": 12.926016807556152,
"learning_rate": 9.133167375594647e-05,
"loss": 0.2368,
"step": 246
},
{
"epoch": 1.1231470923603193,
"grad_norm": 11.235928535461426,
"learning_rate": 9.12486502029026e-05,
"loss": 0.1921,
"step": 247
},
{
"epoch": 1.127708095781072,
"grad_norm": 8.581406593322754,
"learning_rate": 9.11652690640011e-05,
"loss": 0.1573,
"step": 248
},
{
"epoch": 1.1322690992018245,
"grad_norm": 12.511028289794922,
"learning_rate": 9.10815310620757e-05,
"loss": 0.2574,
"step": 249
},
{
"epoch": 1.1368301026225769,
"grad_norm": 13.827507019042969,
"learning_rate": 9.099743692305379e-05,
"loss": 0.2751,
"step": 250
},
{
"epoch": 1.1413911060433295,
"grad_norm": 16.066164016723633,
"learning_rate": 9.091298737595014e-05,
"loss": 0.2848,
"step": 251
},
{
"epoch": 1.145952109464082,
"grad_norm": 19.021018981933594,
"learning_rate": 9.082818315286055e-05,
"loss": 0.2962,
"step": 252
},
{
"epoch": 1.1505131128848347,
"grad_norm": 10.119819641113281,
"learning_rate": 9.074302498895552e-05,
"loss": 0.2158,
"step": 253
},
{
"epoch": 1.1550741163055873,
"grad_norm": 11.308869361877441,
"learning_rate": 9.065751362247388e-05,
"loss": 0.2406,
"step": 254
},
{
"epoch": 1.1596351197263397,
"grad_norm": 13.046134948730469,
"learning_rate": 9.057164979471635e-05,
"loss": 0.2534,
"step": 255
},
{
"epoch": 1.1641961231470923,
"grad_norm": 12.656744003295898,
"learning_rate": 9.048543425003923e-05,
"loss": 0.2888,
"step": 256
},
{
"epoch": 1.168757126567845,
"grad_norm": 11.619269371032715,
"learning_rate": 9.039886773584779e-05,
"loss": 0.2209,
"step": 257
},
{
"epoch": 1.1733181299885975,
"grad_norm": 9.45288372039795,
"learning_rate": 9.031195100258987e-05,
"loss": 0.1455,
"step": 258
},
{
"epoch": 1.1778791334093501,
"grad_norm": 8.505230903625488,
"learning_rate": 9.02246848037494e-05,
"loss": 0.1666,
"step": 259
},
{
"epoch": 1.1824401368301025,
"grad_norm": 18.330678939819336,
"learning_rate": 9.013706989583983e-05,
"loss": 0.2517,
"step": 260
},
{
"epoch": 1.1870011402508551,
"grad_norm": 14.25640869140625,
"learning_rate": 9.00491070383976e-05,
"loss": 0.3547,
"step": 261
},
{
"epoch": 1.1915621436716077,
"grad_norm": 9.561500549316406,
"learning_rate": 8.996079699397547e-05,
"loss": 0.2168,
"step": 262
},
{
"epoch": 1.1961231470923603,
"grad_norm": 12.185916900634766,
"learning_rate": 8.987214052813604e-05,
"loss": 0.1639,
"step": 263
},
{
"epoch": 1.200684150513113,
"grad_norm": 10.289037704467773,
"learning_rate": 8.978313840944503e-05,
"loss": 0.1805,
"step": 264
},
{
"epoch": 1.2052451539338653,
"grad_norm": 10.741447448730469,
"learning_rate": 8.969379140946464e-05,
"loss": 0.2754,
"step": 265
},
{
"epoch": 1.209806157354618,
"grad_norm": 10.507063865661621,
"learning_rate": 8.960410030274681e-05,
"loss": 0.2606,
"step": 266
},
{
"epoch": 1.2143671607753705,
"grad_norm": 15.26578140258789,
"learning_rate": 8.951406586682662e-05,
"loss": 0.3271,
"step": 267
},
{
"epoch": 1.2189281641961232,
"grad_norm": 12.20109748840332,
"learning_rate": 8.942368888221545e-05,
"loss": 0.2345,
"step": 268
},
{
"epoch": 1.2234891676168758,
"grad_norm": 9.984328269958496,
"learning_rate": 8.933297013239424e-05,
"loss": 0.1968,
"step": 269
},
{
"epoch": 1.2280501710376284,
"grad_norm": 12.456114768981934,
"learning_rate": 8.924191040380671e-05,
"loss": 0.2624,
"step": 270
},
{
"epoch": 1.2326111744583808,
"grad_norm": 11.822625160217285,
"learning_rate": 8.915051048585256e-05,
"loss": 0.2642,
"step": 271
},
{
"epoch": 1.2371721778791334,
"grad_norm": 14.391879081726074,
"learning_rate": 8.905877117088054e-05,
"loss": 0.2378,
"step": 272
},
{
"epoch": 1.241733181299886,
"grad_norm": 10.977947235107422,
"learning_rate": 8.896669325418172e-05,
"loss": 0.2302,
"step": 273
},
{
"epoch": 1.2462941847206386,
"grad_norm": 10.284666061401367,
"learning_rate": 8.887427753398248e-05,
"loss": 0.2304,
"step": 274
},
{
"epoch": 1.2508551881413912,
"grad_norm": 12.455047607421875,
"learning_rate": 8.87815248114376e-05,
"loss": 0.2586,
"step": 275
},
{
"epoch": 1.2554161915621438,
"grad_norm": 8.457530975341797,
"learning_rate": 8.868843589062339e-05,
"loss": 0.1605,
"step": 276
},
{
"epoch": 1.2599771949828962,
"grad_norm": 9.678699493408203,
"learning_rate": 8.859501157853066e-05,
"loss": 0.1834,
"step": 277
},
{
"epoch": 1.2645381984036488,
"grad_norm": 10.811609268188477,
"learning_rate": 8.850125268505774e-05,
"loss": 0.2116,
"step": 278
},
{
"epoch": 1.2690992018244014,
"grad_norm": 17.398542404174805,
"learning_rate": 8.840716002300347e-05,
"loss": 0.2112,
"step": 279
},
{
"epoch": 1.273660205245154,
"grad_norm": 20.152509689331055,
"learning_rate": 8.831273440806009e-05,
"loss": 0.2475,
"step": 280
},
{
"epoch": 1.2782212086659066,
"grad_norm": 9.431918144226074,
"learning_rate": 8.821797665880625e-05,
"loss": 0.1543,
"step": 281
},
{
"epoch": 1.282782212086659,
"grad_norm": 16.22393798828125,
"learning_rate": 8.812288759669994e-05,
"loss": 0.2396,
"step": 282
},
{
"epoch": 1.2873432155074116,
"grad_norm": 22.462894439697266,
"learning_rate": 8.802746804607118e-05,
"loss": 0.4583,
"step": 283
},
{
"epoch": 1.2919042189281642,
"grad_norm": 9.348506927490234,
"learning_rate": 8.793171883411515e-05,
"loss": 0.1537,
"step": 284
},
{
"epoch": 1.2964652223489168,
"grad_norm": 11.54738998413086,
"learning_rate": 8.783564079088477e-05,
"loss": 0.1724,
"step": 285
},
{
"epoch": 1.3010262257696694,
"grad_norm": 14.7079439163208,
"learning_rate": 8.773923474928365e-05,
"loss": 0.2363,
"step": 286
},
{
"epoch": 1.3055872291904218,
"grad_norm": 10.920402526855469,
"learning_rate": 8.764250154505885e-05,
"loss": 0.2441,
"step": 287
},
{
"epoch": 1.3101482326111744,
"grad_norm": 13.201237678527832,
"learning_rate": 8.754544201679353e-05,
"loss": 0.2623,
"step": 288
},
{
"epoch": 1.314709236031927,
"grad_norm": 8.716439247131348,
"learning_rate": 8.744805700589989e-05,
"loss": 0.2039,
"step": 289
},
{
"epoch": 1.3192702394526796,
"grad_norm": 18.450355529785156,
"learning_rate": 8.735034735661162e-05,
"loss": 0.2247,
"step": 290
},
{
"epoch": 1.3238312428734322,
"grad_norm": 9.224489212036133,
"learning_rate": 8.725231391597681e-05,
"loss": 0.2552,
"step": 291
},
{
"epoch": 1.3283922462941846,
"grad_norm": 6.751110553741455,
"learning_rate": 8.715395753385048e-05,
"loss": 0.1663,
"step": 292
},
{
"epoch": 1.3329532497149372,
"grad_norm": 12.301689147949219,
"learning_rate": 8.705527906288718e-05,
"loss": 0.2175,
"step": 293
},
{
"epoch": 1.3375142531356898,
"grad_norm": 10.624826431274414,
"learning_rate": 8.695627935853373e-05,
"loss": 0.1919,
"step": 294
},
{
"epoch": 1.3420752565564424,
"grad_norm": 11.931551933288574,
"learning_rate": 8.68569592790217e-05,
"loss": 0.2923,
"step": 295
},
{
"epoch": 1.346636259977195,
"grad_norm": 10.668767929077148,
"learning_rate": 8.675731968536002e-05,
"loss": 0.2388,
"step": 296
},
{
"epoch": 1.3511972633979474,
"grad_norm": 12.659479141235352,
"learning_rate": 8.66573614413275e-05,
"loss": 0.2413,
"step": 297
},
{
"epoch": 1.3557582668187,
"grad_norm": 17.342519760131836,
"learning_rate": 8.655708541346533e-05,
"loss": 0.2613,
"step": 298
},
{
"epoch": 1.3603192702394526,
"grad_norm": 11.727388381958008,
"learning_rate": 8.645649247106955e-05,
"loss": 0.2109,
"step": 299
},
{
"epoch": 1.3648802736602053,
"grad_norm": 12.136419296264648,
"learning_rate": 8.635558348618359e-05,
"loss": 0.2467,
"step": 300
},
{
"epoch": 1.3694412770809579,
"grad_norm": 10.73682975769043,
"learning_rate": 8.625435933359062e-05,
"loss": 0.1937,
"step": 301
},
{
"epoch": 1.3740022805017102,
"grad_norm": 11.996789932250977,
"learning_rate": 8.615282089080609e-05,
"loss": 0.2655,
"step": 302
},
{
"epoch": 1.378563283922463,
"grad_norm": 15.302202224731445,
"learning_rate": 8.605096903806991e-05,
"loss": 0.2487,
"step": 303
},
{
"epoch": 1.3831242873432155,
"grad_norm": 9.55948543548584,
"learning_rate": 8.594880465833908e-05,
"loss": 0.1708,
"step": 304
},
{
"epoch": 1.387685290763968,
"grad_norm": 12.158270835876465,
"learning_rate": 8.584632863727982e-05,
"loss": 0.2452,
"step": 305
},
{
"epoch": 1.3922462941847207,
"grad_norm": 8.76557731628418,
"learning_rate": 8.574354186326001e-05,
"loss": 0.193,
"step": 306
},
{
"epoch": 1.3968072976054733,
"grad_norm": 9.248729705810547,
"learning_rate": 8.564044522734147e-05,
"loss": 0.2264,
"step": 307
},
{
"epoch": 1.401368301026226,
"grad_norm": 9.079157829284668,
"learning_rate": 8.55370396232722e-05,
"loss": 0.1809,
"step": 308
},
{
"epoch": 1.4059293044469783,
"grad_norm": 9.482975959777832,
"learning_rate": 8.543332594747865e-05,
"loss": 0.1772,
"step": 309
},
{
"epoch": 1.4104903078677309,
"grad_norm": 9.198452949523926,
"learning_rate": 8.532930509905799e-05,
"loss": 0.2047,
"step": 310
},
{
"epoch": 1.4150513112884835,
"grad_norm": 8.783437728881836,
"learning_rate": 8.522497797977024e-05,
"loss": 0.2247,
"step": 311
},
{
"epoch": 1.419612314709236,
"grad_norm": 14.365094184875488,
"learning_rate": 8.512034549403053e-05,
"loss": 0.2208,
"step": 312
},
{
"epoch": 1.4241733181299887,
"grad_norm": 14.370891571044922,
"learning_rate": 8.501540854890118e-05,
"loss": 0.2326,
"step": 313
},
{
"epoch": 1.428734321550741,
"grad_norm": 10.391968727111816,
"learning_rate": 8.491016805408387e-05,
"loss": 0.1751,
"step": 314
},
{
"epoch": 1.4332953249714937,
"grad_norm": 11.512178421020508,
"learning_rate": 8.480462492191186e-05,
"loss": 0.2978,
"step": 315
},
{
"epoch": 1.4378563283922463,
"grad_norm": 12.58578872680664,
"learning_rate": 8.469878006734185e-05,
"loss": 0.2706,
"step": 316
},
{
"epoch": 1.442417331812999,
"grad_norm": 8.530269622802734,
"learning_rate": 8.459263440794627e-05,
"loss": 0.1755,
"step": 317
},
{
"epoch": 1.4469783352337515,
"grad_norm": 8.248932838439941,
"learning_rate": 8.448618886390522e-05,
"loss": 0.1483,
"step": 318
},
{
"epoch": 1.451539338654504,
"grad_norm": 18.134685516357422,
"learning_rate": 8.437944435799848e-05,
"loss": 0.1938,
"step": 319
},
{
"epoch": 1.4561003420752565,
"grad_norm": 8.072942733764648,
"learning_rate": 8.427240181559754e-05,
"loss": 0.1573,
"step": 320
},
{
"epoch": 1.4606613454960091,
"grad_norm": 11.139336585998535,
"learning_rate": 8.416506216465765e-05,
"loss": 0.2272,
"step": 321
},
{
"epoch": 1.4652223489167617,
"grad_norm": 12.186053276062012,
"learning_rate": 8.405742633570961e-05,
"loss": 0.1716,
"step": 322
},
{
"epoch": 1.4697833523375143,
"grad_norm": 10.145633697509766,
"learning_rate": 8.394949526185185e-05,
"loss": 0.1913,
"step": 323
},
{
"epoch": 1.4743443557582667,
"grad_norm": 9.266544342041016,
"learning_rate": 8.384126987874228e-05,
"loss": 0.1642,
"step": 324
},
{
"epoch": 1.4789053591790193,
"grad_norm": 11.050301551818848,
"learning_rate": 8.373275112459016e-05,
"loss": 0.2253,
"step": 325
},
{
"epoch": 1.483466362599772,
"grad_norm": 8.425420761108398,
"learning_rate": 8.362393994014805e-05,
"loss": 0.1826,
"step": 326
},
{
"epoch": 1.4880273660205245,
"grad_norm": 14.498648643493652,
"learning_rate": 8.35148372687035e-05,
"loss": 0.2432,
"step": 327
},
{
"epoch": 1.4925883694412772,
"grad_norm": 14.982590675354004,
"learning_rate": 8.340544405607111e-05,
"loss": 0.1724,
"step": 328
},
{
"epoch": 1.4971493728620295,
"grad_norm": 11.120532035827637,
"learning_rate": 8.329576125058406e-05,
"loss": 0.1461,
"step": 329
},
{
"epoch": 1.5017103762827824,
"grad_norm": 17.683189392089844,
"learning_rate": 8.318578980308609e-05,
"loss": 0.3342,
"step": 330
},
{
"epoch": 1.5062713797035348,
"grad_norm": 17.191091537475586,
"learning_rate": 8.307553066692314e-05,
"loss": 0.2188,
"step": 331
},
{
"epoch": 1.5108323831242874,
"grad_norm": 8.582752227783203,
"learning_rate": 8.29649847979352e-05,
"loss": 0.1141,
"step": 332
},
{
"epoch": 1.51539338654504,
"grad_norm": 9.13406753540039,
"learning_rate": 8.28541531544479e-05,
"loss": 0.1767,
"step": 333
},
{
"epoch": 1.5199543899657924,
"grad_norm": 8.726181030273438,
"learning_rate": 8.274303669726426e-05,
"loss": 0.1348,
"step": 334
},
{
"epoch": 1.5245153933865452,
"grad_norm": 10.707447052001953,
"learning_rate": 8.263163638965639e-05,
"loss": 0.2005,
"step": 335
},
{
"epoch": 1.5290763968072976,
"grad_norm": 12.0310640335083,
"learning_rate": 8.25199531973571e-05,
"loss": 0.1985,
"step": 336
},
{
"epoch": 1.5336374002280502,
"grad_norm": 9.672492027282715,
"learning_rate": 8.24079880885515e-05,
"loss": 0.2014,
"step": 337
},
{
"epoch": 1.5381984036488028,
"grad_norm": 9.297097206115723,
"learning_rate": 8.22957420338687e-05,
"loss": 0.1302,
"step": 338
},
{
"epoch": 1.5427594070695552,
"grad_norm": 25.988061904907227,
"learning_rate": 8.218321600637329e-05,
"loss": 0.2899,
"step": 339
},
{
"epoch": 1.547320410490308,
"grad_norm": 9.74842643737793,
"learning_rate": 8.2070410981557e-05,
"loss": 0.1612,
"step": 340
},
{
"epoch": 1.5518814139110604,
"grad_norm": 10.73891544342041,
"learning_rate": 8.195732793733014e-05,
"loss": 0.2282,
"step": 341
},
{
"epoch": 1.556442417331813,
"grad_norm": 15.269837379455566,
"learning_rate": 8.184396785401322e-05,
"loss": 0.1585,
"step": 342
},
{
"epoch": 1.5610034207525656,
"grad_norm": 7.805790901184082,
"learning_rate": 8.173033171432841e-05,
"loss": 0.1117,
"step": 343
},
{
"epoch": 1.565564424173318,
"grad_norm": 9.819446563720703,
"learning_rate": 8.1616420503391e-05,
"loss": 0.2143,
"step": 344
},
{
"epoch": 1.5701254275940708,
"grad_norm": 8.949931144714355,
"learning_rate": 8.15022352087009e-05,
"loss": 0.2139,
"step": 345
},
{
"epoch": 1.5746864310148232,
"grad_norm": 14.177704811096191,
"learning_rate": 8.138777682013403e-05,
"loss": 0.2733,
"step": 346
},
{
"epoch": 1.5792474344355758,
"grad_norm": 10.694663047790527,
"learning_rate": 8.127304632993382e-05,
"loss": 0.1532,
"step": 347
},
{
"epoch": 1.5838084378563284,
"grad_norm": 14.421151161193848,
"learning_rate": 8.115804473270253e-05,
"loss": 0.1349,
"step": 348
},
{
"epoch": 1.5883694412770808,
"grad_norm": 9.572623252868652,
"learning_rate": 8.104277302539264e-05,
"loss": 0.1852,
"step": 349
},
{
"epoch": 1.5929304446978336,
"grad_norm": 8.018699645996094,
"learning_rate": 8.092723220729825e-05,
"loss": 0.1398,
"step": 350
},
{
"epoch": 1.597491448118586,
"grad_norm": 10.331695556640625,
"learning_rate": 8.081142328004637e-05,
"loss": 0.1678,
"step": 351
},
{
"epoch": 1.6020524515393386,
"grad_norm": 8.879880905151367,
"learning_rate": 8.069534724758827e-05,
"loss": 0.1527,
"step": 352
},
{
"epoch": 1.6066134549600912,
"grad_norm": 11.865134239196777,
"learning_rate": 8.057900511619076e-05,
"loss": 0.174,
"step": 353
},
{
"epoch": 1.6111744583808438,
"grad_norm": 20.736913681030273,
"learning_rate": 8.046239789442749e-05,
"loss": 0.14,
"step": 354
},
{
"epoch": 1.6157354618015964,
"grad_norm": 8.29340648651123,
"learning_rate": 8.034552659317012e-05,
"loss": 0.1924,
"step": 355
},
{
"epoch": 1.6202964652223488,
"grad_norm": 14.969886779785156,
"learning_rate": 8.02283922255797e-05,
"loss": 0.1776,
"step": 356
},
{
"epoch": 1.6248574686431014,
"grad_norm": 41.689517974853516,
"learning_rate": 8.011099580709778e-05,
"loss": 0.1337,
"step": 357
},
{
"epoch": 1.629418472063854,
"grad_norm": 9.815425872802734,
"learning_rate": 7.999333835543763e-05,
"loss": 0.1959,
"step": 358
},
{
"epoch": 1.6339794754846066,
"grad_norm": 12.40318775177002,
"learning_rate": 7.987542089057542e-05,
"loss": 0.1968,
"step": 359
},
{
"epoch": 1.6385404789053593,
"grad_norm": 8.287771224975586,
"learning_rate": 7.975724443474143e-05,
"loss": 0.1082,
"step": 360
},
{
"epoch": 1.6431014823261116,
"grad_norm": 9.289151191711426,
"learning_rate": 7.963881001241107e-05,
"loss": 0.1176,
"step": 361
},
{
"epoch": 1.6476624857468645,
"grad_norm": 12.972766876220703,
"learning_rate": 7.952011865029614e-05,
"loss": 0.2185,
"step": 362
},
{
"epoch": 1.6522234891676169,
"grad_norm": 11.908880233764648,
"learning_rate": 7.940117137733579e-05,
"loss": 0.177,
"step": 363
},
{
"epoch": 1.6567844925883695,
"grad_norm": 8.70804500579834,
"learning_rate": 7.928196922468772e-05,
"loss": 0.143,
"step": 364
},
{
"epoch": 1.661345496009122,
"grad_norm": 11.03876781463623,
"learning_rate": 7.916251322571918e-05,
"loss": 0.1837,
"step": 365
},
{
"epoch": 1.6659064994298745,
"grad_norm": 12.949993133544922,
"learning_rate": 7.904280441599801e-05,
"loss": 0.1652,
"step": 366
},
{
"epoch": 1.6704675028506273,
"grad_norm": 10.857973098754883,
"learning_rate": 7.892284383328367e-05,
"loss": 0.1575,
"step": 367
},
{
"epoch": 1.6750285062713797,
"grad_norm": 8.548442840576172,
"learning_rate": 7.88026325175183e-05,
"loss": 0.125,
"step": 368
},
{
"epoch": 1.6795895096921323,
"grad_norm": 7.094759464263916,
"learning_rate": 7.868217151081755e-05,
"loss": 0.132,
"step": 369
},
{
"epoch": 1.6841505131128849,
"grad_norm": 8.124651908874512,
"learning_rate": 7.856146185746175e-05,
"loss": 0.164,
"step": 370
},
{
"epoch": 1.6887115165336373,
"grad_norm": 10.263216018676758,
"learning_rate": 7.844050460388671e-05,
"loss": 0.1476,
"step": 371
},
{
"epoch": 1.69327251995439,
"grad_norm": 12.262899398803711,
"learning_rate": 7.831930079867469e-05,
"loss": 0.1952,
"step": 372
},
{
"epoch": 1.6978335233751425,
"grad_norm": 8.35619831085205,
"learning_rate": 7.819785149254532e-05,
"loss": 0.1429,
"step": 373
},
{
"epoch": 1.702394526795895,
"grad_norm": 10.857168197631836,
"learning_rate": 7.807615773834652e-05,
"loss": 0.1307,
"step": 374
},
{
"epoch": 1.7069555302166477,
"grad_norm": 7.174655914306641,
"learning_rate": 7.795422059104527e-05,
"loss": 0.1304,
"step": 375
},
{
"epoch": 1.7115165336374,
"grad_norm": 14.329642295837402,
"learning_rate": 7.78320411077186e-05,
"loss": 0.1997,
"step": 376
},
{
"epoch": 1.716077537058153,
"grad_norm": 8.973917961120605,
"learning_rate": 7.77096203475443e-05,
"loss": 0.1583,
"step": 377
},
{
"epoch": 1.7206385404789053,
"grad_norm": 6.451292514801025,
"learning_rate": 7.758695937179185e-05,
"loss": 0.1201,
"step": 378
},
{
"epoch": 1.725199543899658,
"grad_norm": 6.503537178039551,
"learning_rate": 7.746405924381313e-05,
"loss": 0.0973,
"step": 379
},
{
"epoch": 1.7297605473204105,
"grad_norm": 6.6744608879089355,
"learning_rate": 7.734092102903323e-05,
"loss": 0.1019,
"step": 380
},
{
"epoch": 1.734321550741163,
"grad_norm": 9.03148365020752,
"learning_rate": 7.721754579494127e-05,
"loss": 0.1592,
"step": 381
},
{
"epoch": 1.7388825541619157,
"grad_norm": 7.3868632316589355,
"learning_rate": 7.709393461108107e-05,
"loss": 0.1457,
"step": 382
},
{
"epoch": 1.7434435575826681,
"grad_norm": 11.139805793762207,
"learning_rate": 7.697008854904191e-05,
"loss": 0.1512,
"step": 383
},
{
"epoch": 1.7480045610034207,
"grad_norm": 9.616064071655273,
"learning_rate": 7.68460086824492e-05,
"loss": 0.194,
"step": 384
},
{
"epoch": 1.7525655644241733,
"grad_norm": 8.999774932861328,
"learning_rate": 7.672169608695525e-05,
"loss": 0.1654,
"step": 385
},
{
"epoch": 1.757126567844926,
"grad_norm": 12.37429141998291,
"learning_rate": 7.659715184022994e-05,
"loss": 0.2122,
"step": 386
},
{
"epoch": 1.7616875712656785,
"grad_norm": 9.472933769226074,
"learning_rate": 7.647237702195123e-05,
"loss": 0.1587,
"step": 387
},
{
"epoch": 1.766248574686431,
"grad_norm": 10.54593563079834,
"learning_rate": 7.634737271379603e-05,
"loss": 0.2103,
"step": 388
},
{
"epoch": 1.7708095781071835,
"grad_norm": 6.688052654266357,
"learning_rate": 7.622213999943062e-05,
"loss": 0.0989,
"step": 389
},
{
"epoch": 1.7753705815279361,
"grad_norm": 10.251477241516113,
"learning_rate": 7.609667996450141e-05,
"loss": 0.2219,
"step": 390
},
{
"epoch": 1.7799315849486887,
"grad_norm": 6.267465591430664,
"learning_rate": 7.59709936966254e-05,
"loss": 0.0958,
"step": 391
},
{
"epoch": 1.7844925883694414,
"grad_norm": 8.224940299987793,
"learning_rate": 7.584508228538085e-05,
"loss": 0.1312,
"step": 392
},
{
"epoch": 1.7890535917901937,
"grad_norm": 12.063385963439941,
"learning_rate": 7.571894682229775e-05,
"loss": 0.1833,
"step": 393
},
{
"epoch": 1.7936145952109466,
"grad_norm": 8.511308670043945,
"learning_rate": 7.559258840084848e-05,
"loss": 0.1442,
"step": 394
},
{
"epoch": 1.798175598631699,
"grad_norm": 8.30827522277832,
"learning_rate": 7.546600811643816e-05,
"loss": 0.1438,
"step": 395
},
{
"epoch": 1.8027366020524516,
"grad_norm": 11.280699729919434,
"learning_rate": 7.533920706639531e-05,
"loss": 0.2558,
"step": 396
},
{
"epoch": 1.8072976054732042,
"grad_norm": 6.188623905181885,
"learning_rate": 7.521218634996226e-05,
"loss": 0.1072,
"step": 397
},
{
"epoch": 1.8118586088939566,
"grad_norm": 15.961888313293457,
"learning_rate": 7.508494706828564e-05,
"loss": 0.1619,
"step": 398
},
{
"epoch": 1.8164196123147094,
"grad_norm": 9.33893871307373,
"learning_rate": 7.49574903244068e-05,
"loss": 0.1995,
"step": 399
},
{
"epoch": 1.8209806157354618,
"grad_norm": 12.080733299255371,
"learning_rate": 7.482981722325232e-05,
"loss": 0.1647,
"step": 400
},
{
"epoch": 1.8255416191562144,
"grad_norm": 8.13494873046875,
"learning_rate": 7.470192887162435e-05,
"loss": 0.1278,
"step": 401
},
{
"epoch": 1.830102622576967,
"grad_norm": 9.168209075927734,
"learning_rate": 7.457382637819108e-05,
"loss": 0.1244,
"step": 402
},
{
"epoch": 1.8346636259977194,
"grad_norm": 8.094377517700195,
"learning_rate": 7.444551085347707e-05,
"loss": 0.1066,
"step": 403
},
{
"epoch": 1.8392246294184722,
"grad_norm": 9.4691743850708,
"learning_rate": 7.43169834098537e-05,
"loss": 0.1378,
"step": 404
},
{
"epoch": 1.8437856328392246,
"grad_norm": 9.721671104431152,
"learning_rate": 7.418824516152943e-05,
"loss": 0.1247,
"step": 405
},
{
"epoch": 1.8483466362599772,
"grad_norm": 8.888439178466797,
"learning_rate": 7.405929722454026e-05,
"loss": 0.1256,
"step": 406
},
{
"epoch": 1.8529076396807298,
"grad_norm": 7.914821147918701,
"learning_rate": 7.393014071673992e-05,
"loss": 0.1293,
"step": 407
},
{
"epoch": 1.8574686431014822,
"grad_norm": 10.728132247924805,
"learning_rate": 7.380077675779027e-05,
"loss": 0.1738,
"step": 408
},
{
"epoch": 1.862029646522235,
"grad_norm": 13.084773063659668,
"learning_rate": 7.36712064691516e-05,
"loss": 0.1478,
"step": 409
},
{
"epoch": 1.8665906499429874,
"grad_norm": 6.1122660636901855,
"learning_rate": 7.354143097407283e-05,
"loss": 0.084,
"step": 410
},
{
"epoch": 1.87115165336374,
"grad_norm": 16.69349479675293,
"learning_rate": 7.341145139758185e-05,
"loss": 0.1161,
"step": 411
},
{
"epoch": 1.8757126567844926,
"grad_norm": 18.03197479248047,
"learning_rate": 7.328126886647575e-05,
"loss": 0.2555,
"step": 412
},
{
"epoch": 1.880273660205245,
"grad_norm": 5.904569149017334,
"learning_rate": 7.315088450931103e-05,
"loss": 0.0978,
"step": 413
},
{
"epoch": 1.8848346636259978,
"grad_norm": 8.850961685180664,
"learning_rate": 7.302029945639377e-05,
"loss": 0.1636,
"step": 414
},
{
"epoch": 1.8893956670467502,
"grad_norm": 8.666600227355957,
"learning_rate": 7.288951483976998e-05,
"loss": 0.1544,
"step": 415
},
{
"epoch": 1.8939566704675028,
"grad_norm": 8.048266410827637,
"learning_rate": 7.275853179321565e-05,
"loss": 0.1148,
"step": 416
},
{
"epoch": 1.8985176738882554,
"grad_norm": 9.665177345275879,
"learning_rate": 7.262735145222696e-05,
"loss": 0.1452,
"step": 417
},
{
"epoch": 1.9030786773090078,
"grad_norm": 6.529131889343262,
"learning_rate": 7.249597495401043e-05,
"loss": 0.0976,
"step": 418
},
{
"epoch": 1.9076396807297606,
"grad_norm": 6.697221755981445,
"learning_rate": 7.236440343747313e-05,
"loss": 0.1207,
"step": 419
},
{
"epoch": 1.912200684150513,
"grad_norm": 7.641704559326172,
"learning_rate": 7.223263804321269e-05,
"loss": 0.1102,
"step": 420
},
{
"epoch": 1.9167616875712656,
"grad_norm": 5.448543071746826,
"learning_rate": 7.21006799135075e-05,
"loss": 0.0969,
"step": 421
},
{
"epoch": 1.9213226909920182,
"grad_norm": 12.550832748413086,
"learning_rate": 7.196853019230676e-05,
"loss": 0.1629,
"step": 422
},
{
"epoch": 1.9258836944127709,
"grad_norm": 7.084536552429199,
"learning_rate": 7.183619002522062e-05,
"loss": 0.1378,
"step": 423
},
{
"epoch": 1.9304446978335235,
"grad_norm": 8.91976547241211,
"learning_rate": 7.170366055951017e-05,
"loss": 0.1177,
"step": 424
},
{
"epoch": 1.9350057012542758,
"grad_norm": 9.238527297973633,
"learning_rate": 7.157094294407756e-05,
"loss": 0.1522,
"step": 425
},
{
"epoch": 1.9395667046750285,
"grad_norm": 8.187129974365234,
"learning_rate": 7.143803832945601e-05,
"loss": 0.1134,
"step": 426
},
{
"epoch": 1.944127708095781,
"grad_norm": 7.621769905090332,
"learning_rate": 7.130494786779987e-05,
"loss": 0.1011,
"step": 427
},
{
"epoch": 1.9486887115165337,
"grad_norm": 11.57784652709961,
"learning_rate": 7.117167271287453e-05,
"loss": 0.1254,
"step": 428
},
{
"epoch": 1.9532497149372863,
"grad_norm": 8.105171203613281,
"learning_rate": 7.103821402004654e-05,
"loss": 0.0994,
"step": 429
},
{
"epoch": 1.9578107183580387,
"grad_norm": 10.137523651123047,
"learning_rate": 7.090457294627358e-05,
"loss": 0.0976,
"step": 430
},
{
"epoch": 1.9623717217787915,
"grad_norm": 5.640718936920166,
"learning_rate": 7.077075065009433e-05,
"loss": 0.0887,
"step": 431
},
{
"epoch": 1.9669327251995439,
"grad_norm": 10.016772270202637,
"learning_rate": 7.063674829161853e-05,
"loss": 0.1036,
"step": 432
},
{
"epoch": 1.9714937286202965,
"grad_norm": 8.870481491088867,
"learning_rate": 7.050256703251688e-05,
"loss": 0.0973,
"step": 433
},
{
"epoch": 1.976054732041049,
"grad_norm": 7.390217304229736,
"learning_rate": 7.036820803601099e-05,
"loss": 0.0966,
"step": 434
},
{
"epoch": 1.9806157354618015,
"grad_norm": 9.348631858825684,
"learning_rate": 7.023367246686323e-05,
"loss": 0.1119,
"step": 435
},
{
"epoch": 1.9851767388825543,
"grad_norm": 7.239314556121826,
"learning_rate": 7.009896149136674e-05,
"loss": 0.1167,
"step": 436
},
{
"epoch": 1.9897377423033067,
"grad_norm": 7.269038200378418,
"learning_rate": 6.996407627733526e-05,
"loss": 0.123,
"step": 437
},
{
"epoch": 1.9942987457240593,
"grad_norm": 10.885858535766602,
"learning_rate": 6.982901799409294e-05,
"loss": 0.1401,
"step": 438
},
{
"epoch": 1.998859749144812,
"grad_norm": 13.753951072692871,
"learning_rate": 6.969378781246436e-05,
"loss": 0.0994,
"step": 439
},
{
"epoch": 2.0,
"grad_norm": 21.25026512145996,
"learning_rate": 6.955838690476426e-05,
"loss": 0.1222,
"step": 440
},
{
"epoch": 2.0045610034207524,
"grad_norm": 6.62643575668335,
"learning_rate": 6.942281644478739e-05,
"loss": 0.0698,
"step": 441
},
{
"epoch": 2.009122006841505,
"grad_norm": 7.465953826904297,
"learning_rate": 6.928707760779838e-05,
"loss": 0.0795,
"step": 442
},
{
"epoch": 2.0136830102622576,
"grad_norm": 7.87470006942749,
"learning_rate": 6.915117157052149e-05,
"loss": 0.0733,
"step": 443
},
{
"epoch": 2.0182440136830104,
"grad_norm": 7.7158966064453125,
"learning_rate": 6.90150995111305e-05,
"loss": 0.0761,
"step": 444
},
{
"epoch": 2.022805017103763,
"grad_norm": 9.388237953186035,
"learning_rate": 6.887886260923842e-05,
"loss": 0.111,
"step": 445
},
{
"epoch": 2.027366020524515,
"grad_norm": 10.703797340393066,
"learning_rate": 6.874246204588724e-05,
"loss": 0.1158,
"step": 446
},
{
"epoch": 2.031927023945268,
"grad_norm": 6.636610507965088,
"learning_rate": 6.860589900353778e-05,
"loss": 0.078,
"step": 447
},
{
"epoch": 2.0364880273660204,
"grad_norm": 6.754958629608154,
"learning_rate": 6.84691746660594e-05,
"loss": 0.0676,
"step": 448
},
{
"epoch": 2.0410490307867732,
"grad_norm": 8.061761856079102,
"learning_rate": 6.833229021871974e-05,
"loss": 0.0781,
"step": 449
},
{
"epoch": 2.0456100342075256,
"grad_norm": 5.99964714050293,
"learning_rate": 6.819524684817438e-05,
"loss": 0.0645,
"step": 450
},
{
"epoch": 2.050171037628278,
"grad_norm": 6.639948844909668,
"learning_rate": 6.805804574245666e-05,
"loss": 0.0721,
"step": 451
},
{
"epoch": 2.054732041049031,
"grad_norm": 6.918362140655518,
"learning_rate": 6.792068809096734e-05,
"loss": 0.1027,
"step": 452
},
{
"epoch": 2.0592930444697832,
"grad_norm": 6.9611616134643555,
"learning_rate": 6.778317508446423e-05,
"loss": 0.0902,
"step": 453
},
{
"epoch": 2.063854047890536,
"grad_norm": 5.7177019119262695,
"learning_rate": 6.764550791505197e-05,
"loss": 0.0544,
"step": 454
},
{
"epoch": 2.0684150513112884,
"grad_norm": 7.697108745574951,
"learning_rate": 6.750768777617162e-05,
"loss": 0.0673,
"step": 455
},
{
"epoch": 2.072976054732041,
"grad_norm": 3.653858184814453,
"learning_rate": 6.736971586259033e-05,
"loss": 0.0413,
"step": 456
},
{
"epoch": 2.0775370581527937,
"grad_norm": 6.587297439575195,
"learning_rate": 6.723159337039097e-05,
"loss": 0.0537,
"step": 457
},
{
"epoch": 2.082098061573546,
"grad_norm": 5.920407295227051,
"learning_rate": 6.709332149696185e-05,
"loss": 0.0555,
"step": 458
},
{
"epoch": 2.086659064994299,
"grad_norm": 5.50054407119751,
"learning_rate": 6.695490144098621e-05,
"loss": 0.0756,
"step": 459
},
{
"epoch": 2.0912200684150513,
"grad_norm": 8.171920776367188,
"learning_rate": 6.681633440243194e-05,
"loss": 0.0817,
"step": 460
},
{
"epoch": 2.095781071835804,
"grad_norm": 7.142725944519043,
"learning_rate": 6.667762158254104e-05,
"loss": 0.0579,
"step": 461
},
{
"epoch": 2.1003420752565565,
"grad_norm": 6.230417251586914,
"learning_rate": 6.653876418381937e-05,
"loss": 0.0778,
"step": 462
},
{
"epoch": 2.104903078677309,
"grad_norm": 7.222645282745361,
"learning_rate": 6.639976341002614e-05,
"loss": 0.0471,
"step": 463
},
{
"epoch": 2.1094640820980617,
"grad_norm": 8.968223571777344,
"learning_rate": 6.626062046616345e-05,
"loss": 0.0631,
"step": 464
},
{
"epoch": 2.114025085518814,
"grad_norm": 6.115957736968994,
"learning_rate": 6.612133655846592e-05,
"loss": 0.0605,
"step": 465
},
{
"epoch": 2.118586088939567,
"grad_norm": 12.084288597106934,
"learning_rate": 6.598191289439016e-05,
"loss": 0.1068,
"step": 466
},
{
"epoch": 2.1231470923603193,
"grad_norm": 6.62805700302124,
"learning_rate": 6.584235068260432e-05,
"loss": 0.0812,
"step": 467
},
{
"epoch": 2.1277080957810717,
"grad_norm": 9.849166870117188,
"learning_rate": 6.570265113297764e-05,
"loss": 0.0972,
"step": 468
},
{
"epoch": 2.1322690992018245,
"grad_norm": 8.004566192626953,
"learning_rate": 6.556281545656999e-05,
"loss": 0.0602,
"step": 469
},
{
"epoch": 2.136830102622577,
"grad_norm": 5.589608192443848,
"learning_rate": 6.542284486562124e-05,
"loss": 0.0537,
"step": 470
},
{
"epoch": 2.1413911060433297,
"grad_norm": 5.782744884490967,
"learning_rate": 6.528274057354092e-05,
"loss": 0.071,
"step": 471
},
{
"epoch": 2.145952109464082,
"grad_norm": 5.03806734085083,
"learning_rate": 6.514250379489753e-05,
"loss": 0.052,
"step": 472
},
{
"epoch": 2.1505131128848345,
"grad_norm": 8.631730079650879,
"learning_rate": 6.500213574540823e-05,
"loss": 0.0711,
"step": 473
},
{
"epoch": 2.1550741163055873,
"grad_norm": 3.648717164993286,
"learning_rate": 6.486163764192806e-05,
"loss": 0.0558,
"step": 474
},
{
"epoch": 2.1596351197263397,
"grad_norm": 5.878966808319092,
"learning_rate": 6.472101070243952e-05,
"loss": 0.0377,
"step": 475
},
{
"epoch": 2.1641961231470925,
"grad_norm": 6.6274919509887695,
"learning_rate": 6.458025614604203e-05,
"loss": 0.063,
"step": 476
},
{
"epoch": 2.168757126567845,
"grad_norm": 5.117002964019775,
"learning_rate": 6.44393751929413e-05,
"loss": 0.0675,
"step": 477
},
{
"epoch": 2.1733181299885973,
"grad_norm": 4.451428413391113,
"learning_rate": 6.429836906443879e-05,
"loss": 0.0437,
"step": 478
},
{
"epoch": 2.17787913340935,
"grad_norm": 7.2544755935668945,
"learning_rate": 6.415723898292112e-05,
"loss": 0.0816,
"step": 479
},
{
"epoch": 2.1824401368301025,
"grad_norm": 7.115444183349609,
"learning_rate": 6.401598617184939e-05,
"loss": 0.0632,
"step": 480
},
{
"epoch": 2.1870011402508553,
"grad_norm": 6.341275215148926,
"learning_rate": 6.387461185574874e-05,
"loss": 0.045,
"step": 481
},
{
"epoch": 2.1915621436716077,
"grad_norm": 4.9018025398254395,
"learning_rate": 6.373311726019763e-05,
"loss": 0.0449,
"step": 482
},
{
"epoch": 2.19612314709236,
"grad_norm": 6.2423906326293945,
"learning_rate": 6.359150361181715e-05,
"loss": 0.0609,
"step": 483
},
{
"epoch": 2.200684150513113,
"grad_norm": 7.57888126373291,
"learning_rate": 6.344977213826054e-05,
"loss": 0.094,
"step": 484
},
{
"epoch": 2.2052451539338653,
"grad_norm": 7.0582475662231445,
"learning_rate": 6.330792406820242e-05,
"loss": 0.0598,
"step": 485
},
{
"epoch": 2.209806157354618,
"grad_norm": 6.263000011444092,
"learning_rate": 6.316596063132822e-05,
"loss": 0.0594,
"step": 486
},
{
"epoch": 2.2143671607753705,
"grad_norm": 5.4891862869262695,
"learning_rate": 6.302388305832351e-05,
"loss": 0.0512,
"step": 487
},
{
"epoch": 2.2189281641961234,
"grad_norm": 7.57410192489624,
"learning_rate": 6.288169258086322e-05,
"loss": 0.0746,
"step": 488
},
{
"epoch": 2.2234891676168758,
"grad_norm": 7.4229631423950195,
"learning_rate": 6.273939043160118e-05,
"loss": 0.0609,
"step": 489
},
{
"epoch": 2.228050171037628,
"grad_norm": 12.584153175354004,
"learning_rate": 6.259697784415918e-05,
"loss": 0.1267,
"step": 490
},
{
"epoch": 2.232611174458381,
"grad_norm": 8.015351295471191,
"learning_rate": 6.245445605311649e-05,
"loss": 0.0611,
"step": 491
},
{
"epoch": 2.2371721778791334,
"grad_norm": 8.479742050170898,
"learning_rate": 6.231182629399901e-05,
"loss": 0.052,
"step": 492
},
{
"epoch": 2.241733181299886,
"grad_norm": 7.191579341888428,
"learning_rate": 6.21690898032687e-05,
"loss": 0.0738,
"step": 493
},
{
"epoch": 2.2462941847206386,
"grad_norm": 6.246610641479492,
"learning_rate": 6.202624781831268e-05,
"loss": 0.0577,
"step": 494
},
{
"epoch": 2.250855188141391,
"grad_norm": 4.082911968231201,
"learning_rate": 6.188330157743267e-05,
"loss": 0.0404,
"step": 495
},
{
"epoch": 2.255416191562144,
"grad_norm": 5.735588550567627,
"learning_rate": 6.174025231983416e-05,
"loss": 0.0529,
"step": 496
},
{
"epoch": 2.259977194982896,
"grad_norm": 9.69885540008545,
"learning_rate": 6.159710128561575e-05,
"loss": 0.0574,
"step": 497
},
{
"epoch": 2.264538198403649,
"grad_norm": 7.707938194274902,
"learning_rate": 6.145384971575823e-05,
"loss": 0.0704,
"step": 498
},
{
"epoch": 2.2690992018244014,
"grad_norm": 5.515017032623291,
"learning_rate": 6.131049885211404e-05,
"loss": 0.0501,
"step": 499
},
{
"epoch": 2.2736602052451538,
"grad_norm": 7.536128997802734,
"learning_rate": 6.116704993739635e-05,
"loss": 0.0669,
"step": 500
},
{
"epoch": 2.2782212086659066,
"grad_norm": 7.009504795074463,
"learning_rate": 6.102350421516837e-05,
"loss": 0.0609,
"step": 501
},
{
"epoch": 2.282782212086659,
"grad_norm": 6.848779678344727,
"learning_rate": 6.087986292983252e-05,
"loss": 0.0742,
"step": 502
},
{
"epoch": 2.287343215507412,
"grad_norm": 6.385640621185303,
"learning_rate": 6.073612732661966e-05,
"loss": 0.0537,
"step": 503
},
{
"epoch": 2.291904218928164,
"grad_norm": 6.395091533660889,
"learning_rate": 6.059229865157829e-05,
"loss": 0.042,
"step": 504
},
{
"epoch": 2.2964652223489166,
"grad_norm": 9.047046661376953,
"learning_rate": 6.044837815156377e-05,
"loss": 0.0676,
"step": 505
},
{
"epoch": 2.3010262257696694,
"grad_norm": 4.578718662261963,
"learning_rate": 6.030436707422745e-05,
"loss": 0.0509,
"step": 506
},
{
"epoch": 2.305587229190422,
"grad_norm": 5.892753601074219,
"learning_rate": 6.016026666800597e-05,
"loss": 0.0484,
"step": 507
},
{
"epoch": 2.3101482326111746,
"grad_norm": 5.961977958679199,
"learning_rate": 6.001607818211031e-05,
"loss": 0.0653,
"step": 508
},
{
"epoch": 2.314709236031927,
"grad_norm": 5.6413397789001465,
"learning_rate": 5.987180286651503e-05,
"loss": 0.0468,
"step": 509
},
{
"epoch": 2.3192702394526794,
"grad_norm": 5.839052677154541,
"learning_rate": 5.9727441971947395e-05,
"loss": 0.0458,
"step": 510
},
{
"epoch": 2.3238312428734322,
"grad_norm": 3.717437982559204,
"learning_rate": 5.958299674987663e-05,
"loss": 0.0322,
"step": 511
},
{
"epoch": 2.3283922462941846,
"grad_norm": 5.86605978012085,
"learning_rate": 5.943846845250291e-05,
"loss": 0.0425,
"step": 512
},
{
"epoch": 2.3329532497149374,
"grad_norm": 3.296215534210205,
"learning_rate": 5.9293858332746644e-05,
"loss": 0.034,
"step": 513
},
{
"epoch": 2.33751425313569,
"grad_norm": 4.8171186447143555,
"learning_rate": 5.9149167644237555e-05,
"loss": 0.0427,
"step": 514
},
{
"epoch": 2.342075256556442,
"grad_norm": 6.483091354370117,
"learning_rate": 5.90043976413038e-05,
"loss": 0.0545,
"step": 515
},
{
"epoch": 2.346636259977195,
"grad_norm": 4.027348041534424,
"learning_rate": 5.885954957896115e-05,
"loss": 0.0376,
"step": 516
},
{
"epoch": 2.3511972633979474,
"grad_norm": 3.8197691440582275,
"learning_rate": 5.871462471290202e-05,
"loss": 0.0287,
"step": 517
},
{
"epoch": 2.3557582668187003,
"grad_norm": 15.101433753967285,
"learning_rate": 5.8569624299484716e-05,
"loss": 0.0699,
"step": 518
},
{
"epoch": 2.3603192702394526,
"grad_norm": 6.565672397613525,
"learning_rate": 5.842454959572239e-05,
"loss": 0.0715,
"step": 519
},
{
"epoch": 2.364880273660205,
"grad_norm": 5.648789405822754,
"learning_rate": 5.827940185927227e-05,
"loss": 0.066,
"step": 520
},
{
"epoch": 2.369441277080958,
"grad_norm": 6.39064359664917,
"learning_rate": 5.813418234842467e-05,
"loss": 0.0425,
"step": 521
},
{
"epoch": 2.3740022805017102,
"grad_norm": 6.164553165435791,
"learning_rate": 5.798889232209217e-05,
"loss": 0.0491,
"step": 522
},
{
"epoch": 2.378563283922463,
"grad_norm": 6.937675476074219,
"learning_rate": 5.78435330397986e-05,
"loss": 0.0354,
"step": 523
},
{
"epoch": 2.3831242873432155,
"grad_norm": 5.974575996398926,
"learning_rate": 5.769810576166818e-05,
"loss": 0.0504,
"step": 524
},
{
"epoch": 2.387685290763968,
"grad_norm": 6.108855247497559,
"learning_rate": 5.755261174841461e-05,
"loss": 0.0597,
"step": 525
},
{
"epoch": 2.3922462941847207,
"grad_norm": 5.981025695800781,
"learning_rate": 5.740705226133013e-05,
"loss": 0.062,
"step": 526
},
{
"epoch": 2.396807297605473,
"grad_norm": 6.320438861846924,
"learning_rate": 5.726142856227452e-05,
"loss": 0.0499,
"step": 527
},
{
"epoch": 2.401368301026226,
"grad_norm": 4.965454578399658,
"learning_rate": 5.7115741913664264e-05,
"loss": 0.0432,
"step": 528
},
{
"epoch": 2.4059293044469783,
"grad_norm": 7.857591152191162,
"learning_rate": 5.696999357846153e-05,
"loss": 0.0564,
"step": 529
},
{
"epoch": 2.4104903078677307,
"grad_norm": 5.360653877258301,
"learning_rate": 5.682418482016329e-05,
"loss": 0.037,
"step": 530
},
{
"epoch": 2.4150513112884835,
"grad_norm": 5.187353610992432,
"learning_rate": 5.6678316902790266e-05,
"loss": 0.0434,
"step": 531
},
{
"epoch": 2.419612314709236,
"grad_norm": 7.093838691711426,
"learning_rate": 5.653239109087608e-05,
"loss": 0.0816,
"step": 532
},
{
"epoch": 2.4241733181299887,
"grad_norm": 8.862817764282227,
"learning_rate": 5.6386408649456205e-05,
"loss": 0.1088,
"step": 533
},
{
"epoch": 2.428734321550741,
"grad_norm": 5.698467254638672,
"learning_rate": 5.624037084405708e-05,
"loss": 0.0764,
"step": 534
},
{
"epoch": 2.433295324971494,
"grad_norm": 7.893596649169922,
"learning_rate": 5.609427894068507e-05,
"loss": 0.0743,
"step": 535
},
{
"epoch": 2.4378563283922463,
"grad_norm": 3.882078170776367,
"learning_rate": 5.594813420581554e-05,
"loss": 0.0395,
"step": 536
},
{
"epoch": 2.4424173318129987,
"grad_norm": 5.990970611572266,
"learning_rate": 5.580193790638181e-05,
"loss": 0.0434,
"step": 537
},
{
"epoch": 2.4469783352337515,
"grad_norm": 6.1614789962768555,
"learning_rate": 5.565569130976422e-05,
"loss": 0.043,
"step": 538
},
{
"epoch": 2.451539338654504,
"grad_norm": 4.174839973449707,
"learning_rate": 5.5509395683779185e-05,
"loss": 0.0583,
"step": 539
},
{
"epoch": 2.4561003420752567,
"grad_norm": 3.416801929473877,
"learning_rate": 5.536305229666815e-05,
"loss": 0.034,
"step": 540
},
{
"epoch": 2.460661345496009,
"grad_norm": 5.814635276794434,
"learning_rate": 5.521666241708655e-05,
"loss": 0.0409,
"step": 541
},
{
"epoch": 2.4652223489167615,
"grad_norm": 4.838456153869629,
"learning_rate": 5.5070227314092896e-05,
"loss": 0.0428,
"step": 542
},
{
"epoch": 2.4697833523375143,
"grad_norm": 7.684220790863037,
"learning_rate": 5.492374825713775e-05,
"loss": 0.0663,
"step": 543
},
{
"epoch": 2.4743443557582667,
"grad_norm": 3.3523683547973633,
"learning_rate": 5.47772265160527e-05,
"loss": 0.0315,
"step": 544
},
{
"epoch": 2.4789053591790196,
"grad_norm": 5.440591812133789,
"learning_rate": 5.46306633610394e-05,
"loss": 0.053,
"step": 545
},
{
"epoch": 2.483466362599772,
"grad_norm": 4.606085300445557,
"learning_rate": 5.448406006265846e-05,
"loss": 0.0345,
"step": 546
},
{
"epoch": 2.4880273660205243,
"grad_norm": 6.1201887130737305,
"learning_rate": 5.433741789181853e-05,
"loss": 0.0673,
"step": 547
},
{
"epoch": 2.492588369441277,
"grad_norm": 7.997361660003662,
"learning_rate": 5.419073811976525e-05,
"loss": 0.0764,
"step": 548
},
{
"epoch": 2.4971493728620295,
"grad_norm": 4.388640880584717,
"learning_rate": 5.4044022018070214e-05,
"loss": 0.0414,
"step": 549
},
{
"epoch": 2.5017103762827824,
"grad_norm": 4.9629645347595215,
"learning_rate": 5.3897270858619966e-05,
"loss": 0.0424,
"step": 550
},
{
"epoch": 2.5062713797035348,
"grad_norm": 7.596857070922852,
"learning_rate": 5.3750485913604965e-05,
"loss": 0.0453,
"step": 551
},
{
"epoch": 2.5108323831242876,
"grad_norm": 5.5651068687438965,
"learning_rate": 5.360366845550856e-05,
"loss": 0.0339,
"step": 552
},
{
"epoch": 2.51539338654504,
"grad_norm": 3.2136380672454834,
"learning_rate": 5.345681975709594e-05,
"loss": 0.0224,
"step": 553
},
{
"epoch": 2.5199543899657924,
"grad_norm": 4.0387864112854,
"learning_rate": 5.330994109140315e-05,
"loss": 0.0296,
"step": 554
},
{
"epoch": 2.524515393386545,
"grad_norm": 5.669864654541016,
"learning_rate": 5.316303373172601e-05,
"loss": 0.0543,
"step": 555
},
{
"epoch": 2.5290763968072976,
"grad_norm": 3.9306421279907227,
"learning_rate": 5.301609895160906e-05,
"loss": 0.0374,
"step": 556
},
{
"epoch": 2.5336374002280504,
"grad_norm": 3.963334321975708,
"learning_rate": 5.286913802483459e-05,
"loss": 0.0304,
"step": 557
},
{
"epoch": 2.538198403648803,
"grad_norm": 4.443750858306885,
"learning_rate": 5.2722152225411503e-05,
"loss": 0.0397,
"step": 558
},
{
"epoch": 2.542759407069555,
"grad_norm": 5.408681869506836,
"learning_rate": 5.25751428275644e-05,
"loss": 0.0408,
"step": 559
},
{
"epoch": 2.547320410490308,
"grad_norm": 8.279979705810547,
"learning_rate": 5.242811110572242e-05,
"loss": 0.0392,
"step": 560
},
{
"epoch": 2.5518814139110604,
"grad_norm": 4.709146022796631,
"learning_rate": 5.228105833450819e-05,
"loss": 0.0377,
"step": 561
},
{
"epoch": 2.556442417331813,
"grad_norm": 7.52549409866333,
"learning_rate": 5.213398578872688e-05,
"loss": 0.0353,
"step": 562
},
{
"epoch": 2.5610034207525656,
"grad_norm": 5.7986602783203125,
"learning_rate": 5.198689474335503e-05,
"loss": 0.0564,
"step": 563
},
{
"epoch": 2.565564424173318,
"grad_norm": 6.1219611167907715,
"learning_rate": 5.183978647352961e-05,
"loss": 0.0441,
"step": 564
},
{
"epoch": 2.570125427594071,
"grad_norm": 4.516667366027832,
"learning_rate": 5.169266225453686e-05,
"loss": 0.0316,
"step": 565
},
{
"epoch": 2.574686431014823,
"grad_norm": 5.2199625968933105,
"learning_rate": 5.154552336180132e-05,
"loss": 0.0369,
"step": 566
},
{
"epoch": 2.579247434435576,
"grad_norm": 4.977226257324219,
"learning_rate": 5.139837107087468e-05,
"loss": 0.0383,
"step": 567
},
{
"epoch": 2.5838084378563284,
"grad_norm": 3.4498722553253174,
"learning_rate": 5.1251206657424864e-05,
"loss": 0.0215,
"step": 568
},
{
"epoch": 2.588369441277081,
"grad_norm": 4.04592227935791,
"learning_rate": 5.110403139722484e-05,
"loss": 0.0249,
"step": 569
},
{
"epoch": 2.5929304446978336,
"grad_norm": 6.6897969245910645,
"learning_rate": 5.0956846566141595e-05,
"loss": 0.0463,
"step": 570
},
{
"epoch": 2.597491448118586,
"grad_norm": 5.076176166534424,
"learning_rate": 5.080965344012508e-05,
"loss": 0.0426,
"step": 571
},
{
"epoch": 2.602052451539339,
"grad_norm": 11.544487953186035,
"learning_rate": 5.066245329519721e-05,
"loss": 0.0356,
"step": 572
},
{
"epoch": 2.6066134549600912,
"grad_norm": 6.120387077331543,
"learning_rate": 5.0515247407440705e-05,
"loss": 0.0451,
"step": 573
},
{
"epoch": 2.6111744583808436,
"grad_norm": 5.812496185302734,
"learning_rate": 5.036803705298808e-05,
"loss": 0.0293,
"step": 574
},
{
"epoch": 2.6157354618015964,
"grad_norm": 4.080401420593262,
"learning_rate": 5.022082350801055e-05,
"loss": 0.032,
"step": 575
},
{
"epoch": 2.620296465222349,
"grad_norm": 4.283697128295898,
"learning_rate": 5.007360804870702e-05,
"loss": 0.0161,
"step": 576
},
{
"epoch": 2.6248574686431017,
"grad_norm": 5.630773544311523,
"learning_rate": 4.9926391951292985e-05,
"loss": 0.0428,
"step": 577
},
{
"epoch": 2.629418472063854,
"grad_norm": 5.993396759033203,
"learning_rate": 4.977917649198945e-05,
"loss": 0.038,
"step": 578
},
{
"epoch": 2.6339794754846064,
"grad_norm": 5.899278163909912,
"learning_rate": 4.963196294701194e-05,
"loss": 0.048,
"step": 579
},
{
"epoch": 2.6385404789053593,
"grad_norm": 5.6876091957092285,
"learning_rate": 4.9484752592559306e-05,
"loss": 0.0358,
"step": 580
},
{
"epoch": 2.6431014823261116,
"grad_norm": 8.28043270111084,
"learning_rate": 4.9337546704802806e-05,
"loss": 0.0446,
"step": 581
},
{
"epoch": 2.6476624857468645,
"grad_norm": 3.0159778594970703,
"learning_rate": 4.919034655987493e-05,
"loss": 0.0202,
"step": 582
},
{
"epoch": 2.652223489167617,
"grad_norm": 3.556821823120117,
"learning_rate": 4.904315343385844e-05,
"loss": 0.0359,
"step": 583
},
{
"epoch": 2.6567844925883692,
"grad_norm": 9.480207443237305,
"learning_rate": 4.889596860277519e-05,
"loss": 0.0292,
"step": 584
},
{
"epoch": 2.661345496009122,
"grad_norm": 4.381405830383301,
"learning_rate": 4.8748793342575134e-05,
"loss": 0.0432,
"step": 585
},
{
"epoch": 2.6659064994298745,
"grad_norm": 3.772207260131836,
"learning_rate": 4.860162892912532e-05,
"loss": 0.0172,
"step": 586
},
{
"epoch": 2.6704675028506273,
"grad_norm": 4.178829193115234,
"learning_rate": 4.84544766381987e-05,
"loss": 0.0344,
"step": 587
},
{
"epoch": 2.6750285062713797,
"grad_norm": 12.805524826049805,
"learning_rate": 4.830733774546315e-05,
"loss": 0.0377,
"step": 588
},
{
"epoch": 2.679589509692132,
"grad_norm": 5.325319290161133,
"learning_rate": 4.8160213526470403e-05,
"loss": 0.0533,
"step": 589
},
{
"epoch": 2.684150513112885,
"grad_norm": 6.815293788909912,
"learning_rate": 4.801310525664498e-05,
"loss": 0.0256,
"step": 590
},
{
"epoch": 2.6887115165336373,
"grad_norm": 5.803644180297852,
"learning_rate": 4.7866014211273135e-05,
"loss": 0.0179,
"step": 591
},
{
"epoch": 2.69327251995439,
"grad_norm": 4.223587512969971,
"learning_rate": 4.7718941665491825e-05,
"loss": 0.0337,
"step": 592
},
{
"epoch": 2.6978335233751425,
"grad_norm": 3.5225670337677,
"learning_rate": 4.7571888894277604e-05,
"loss": 0.0251,
"step": 593
},
{
"epoch": 2.702394526795895,
"grad_norm": 4.653651714324951,
"learning_rate": 4.7424857172435596e-05,
"loss": 0.0309,
"step": 594
},
{
"epoch": 2.7069555302166477,
"grad_norm": 4.962986469268799,
"learning_rate": 4.72778477745885e-05,
"loss": 0.0375,
"step": 595
},
{
"epoch": 2.7115165336374,
"grad_norm": 7.884991645812988,
"learning_rate": 4.713086197516542e-05,
"loss": 0.0619,
"step": 596
},
{
"epoch": 2.716077537058153,
"grad_norm": 6.011470317840576,
"learning_rate": 4.698390104839096e-05,
"loss": 0.0304,
"step": 597
},
{
"epoch": 2.7206385404789053,
"grad_norm": 4.3046159744262695,
"learning_rate": 4.683696626827401e-05,
"loss": 0.0251,
"step": 598
},
{
"epoch": 2.7251995438996577,
"grad_norm": 4.003452301025391,
"learning_rate": 4.669005890859686e-05,
"loss": 0.0231,
"step": 599
},
{
"epoch": 2.7297605473204105,
"grad_norm": 4.7901530265808105,
"learning_rate": 4.654318024290407e-05,
"loss": 0.0425,
"step": 600
},
{
"epoch": 2.734321550741163,
"grad_norm": 4.104437351226807,
"learning_rate": 4.639633154449146e-05,
"loss": 0.0284,
"step": 601
},
{
"epoch": 2.7388825541619157,
"grad_norm": 3.93487811088562,
"learning_rate": 4.624951408639503e-05,
"loss": 0.0294,
"step": 602
},
{
"epoch": 2.743443557582668,
"grad_norm": 6.138600826263428,
"learning_rate": 4.610272914138004e-05,
"loss": 0.0315,
"step": 603
},
{
"epoch": 2.7480045610034205,
"grad_norm": 6.667906761169434,
"learning_rate": 4.59559779819298e-05,
"loss": 0.0395,
"step": 604
},
{
"epoch": 2.7525655644241733,
"grad_norm": 4.121731281280518,
"learning_rate": 4.5809261880234764e-05,
"loss": 0.0319,
"step": 605
},
{
"epoch": 2.757126567844926,
"grad_norm": 3.8120853900909424,
"learning_rate": 4.566258210818148e-05,
"loss": 0.029,
"step": 606
},
{
"epoch": 2.7616875712656785,
"grad_norm": 3.287109851837158,
"learning_rate": 4.5515939937341556e-05,
"loss": 0.0224,
"step": 607
},
{
"epoch": 2.766248574686431,
"grad_norm": 3.322906970977783,
"learning_rate": 4.5369336638960616e-05,
"loss": 0.0233,
"step": 608
},
{
"epoch": 2.7708095781071833,
"grad_norm": 1.5358067750930786,
"learning_rate": 4.522277348394731e-05,
"loss": 0.0088,
"step": 609
},
{
"epoch": 2.775370581527936,
"grad_norm": 4.789572715759277,
"learning_rate": 4.507625174286226e-05,
"loss": 0.0357,
"step": 610
},
{
"epoch": 2.779931584948689,
"grad_norm": 3.2839534282684326,
"learning_rate": 4.492977268590711e-05,
"loss": 0.0237,
"step": 611
},
{
"epoch": 2.7844925883694414,
"grad_norm": 4.322288513183594,
"learning_rate": 4.478333758291347e-05,
"loss": 0.0387,
"step": 612
},
{
"epoch": 2.7890535917901937,
"grad_norm": 6.475346088409424,
"learning_rate": 4.4636947703331864e-05,
"loss": 0.0426,
"step": 613
},
{
"epoch": 2.7936145952109466,
"grad_norm": 4.861753940582275,
"learning_rate": 4.449060431622082e-05,
"loss": 0.0286,
"step": 614
},
{
"epoch": 2.798175598631699,
"grad_norm": 2.6957809925079346,
"learning_rate": 4.434430869023579e-05,
"loss": 0.0222,
"step": 615
},
{
"epoch": 2.802736602052452,
"grad_norm": 6.314004898071289,
"learning_rate": 4.419806209361822e-05,
"loss": 0.033,
"step": 616
},
{
"epoch": 2.807297605473204,
"grad_norm": 5.230919361114502,
"learning_rate": 4.405186579418448e-05,
"loss": 0.0195,
"step": 617
},
{
"epoch": 2.8118586088939566,
"grad_norm": 4.417494773864746,
"learning_rate": 4.390572105931492e-05,
"loss": 0.0372,
"step": 618
},
{
"epoch": 2.8164196123147094,
"grad_norm": 6.1748552322387695,
"learning_rate": 4.375962915594292e-05,
"loss": 0.0284,
"step": 619
},
{
"epoch": 2.8209806157354618,
"grad_norm": 8.319523811340332,
"learning_rate": 4.36135913505438e-05,
"loss": 0.0392,
"step": 620
},
{
"epoch": 2.8255416191562146,
"grad_norm": 6.181365489959717,
"learning_rate": 4.346760890912394e-05,
"loss": 0.0521,
"step": 621
},
{
"epoch": 2.830102622576967,
"grad_norm": 4.259495735168457,
"learning_rate": 4.3321683097209745e-05,
"loss": 0.0181,
"step": 622
},
{
"epoch": 2.8346636259977194,
"grad_norm": 6.773824214935303,
"learning_rate": 4.317581517983673e-05,
"loss": 0.0229,
"step": 623
},
{
"epoch": 2.839224629418472,
"grad_norm": 7.4220356941223145,
"learning_rate": 4.303000642153847e-05,
"loss": 0.0316,
"step": 624
},
{
"epoch": 2.8437856328392246,
"grad_norm": 6.611977577209473,
"learning_rate": 4.288425808633575e-05,
"loss": 0.0333,
"step": 625
},
{
"epoch": 2.8483466362599774,
"grad_norm": 4.086733818054199,
"learning_rate": 4.27385714377255e-05,
"loss": 0.022,
"step": 626
},
{
"epoch": 2.85290763968073,
"grad_norm": 3.454923629760742,
"learning_rate": 4.259294773866987e-05,
"loss": 0.0273,
"step": 627
},
{
"epoch": 2.857468643101482,
"grad_norm": 2.6385574340820312,
"learning_rate": 4.2447388251585384e-05,
"loss": 0.0167,
"step": 628
},
{
"epoch": 2.862029646522235,
"grad_norm": 3.3853583335876465,
"learning_rate": 4.230189423833183e-05,
"loss": 0.0261,
"step": 629
},
{
"epoch": 2.8665906499429874,
"grad_norm": 2.6701831817626953,
"learning_rate": 4.215646696020141e-05,
"loss": 0.0251,
"step": 630
},
{
"epoch": 2.8711516533637402,
"grad_norm": 2.350428342819214,
"learning_rate": 4.201110767790784e-05,
"loss": 0.013,
"step": 631
},
{
"epoch": 2.8757126567844926,
"grad_norm": 4.5163373947143555,
"learning_rate": 4.186581765157534e-05,
"loss": 0.0333,
"step": 632
},
{
"epoch": 2.880273660205245,
"grad_norm": 4.357926368713379,
"learning_rate": 4.172059814072776e-05,
"loss": 0.0204,
"step": 633
},
{
"epoch": 2.884834663625998,
"grad_norm": 5.049153804779053,
"learning_rate": 4.157545040427763e-05,
"loss": 0.0508,
"step": 634
},
{
"epoch": 2.88939566704675,
"grad_norm": 3.518669366836548,
"learning_rate": 4.143037570051529e-05,
"loss": 0.0267,
"step": 635
},
{
"epoch": 2.893956670467503,
"grad_norm": 2.3187649250030518,
"learning_rate": 4.1285375287097976e-05,
"loss": 0.0144,
"step": 636
},
{
"epoch": 2.8985176738882554,
"grad_norm": 5.340627193450928,
"learning_rate": 4.114045042103887e-05,
"loss": 0.0221,
"step": 637
},
{
"epoch": 2.903078677309008,
"grad_norm": 5.597445487976074,
"learning_rate": 4.099560235869621e-05,
"loss": 0.0217,
"step": 638
},
{
"epoch": 2.9076396807297606,
"grad_norm": 10.440146446228027,
"learning_rate": 4.085083235576246e-05,
"loss": 0.0383,
"step": 639
},
{
"epoch": 2.912200684150513,
"grad_norm": 6.332849502563477,
"learning_rate": 4.070614166725337e-05,
"loss": 0.0341,
"step": 640
},
{
"epoch": 2.916761687571266,
"grad_norm": 3.814603090286255,
"learning_rate": 4.056153154749711e-05,
"loss": 0.0336,
"step": 641
},
{
"epoch": 2.9213226909920182,
"grad_norm": 6.715717315673828,
"learning_rate": 4.04170032501234e-05,
"loss": 0.0396,
"step": 642
},
{
"epoch": 2.9258836944127706,
"grad_norm": 5.1199140548706055,
"learning_rate": 4.02725580280526e-05,
"loss": 0.0311,
"step": 643
},
{
"epoch": 2.9304446978335235,
"grad_norm": 2.676060438156128,
"learning_rate": 4.012819713348499e-05,
"loss": 0.0188,
"step": 644
},
{
"epoch": 2.935005701254276,
"grad_norm": 7.360265254974365,
"learning_rate": 3.9983921817889694e-05,
"loss": 0.0318,
"step": 645
},
{
"epoch": 2.9395667046750287,
"grad_norm": 2.801821708679199,
"learning_rate": 3.9839733331994036e-05,
"loss": 0.0176,
"step": 646
},
{
"epoch": 2.944127708095781,
"grad_norm": 6.362011432647705,
"learning_rate": 3.9695632925772555e-05,
"loss": 0.0353,
"step": 647
},
{
"epoch": 2.9486887115165334,
"grad_norm": 3.604642152786255,
"learning_rate": 3.955162184843625e-05,
"loss": 0.0298,
"step": 648
},
{
"epoch": 2.9532497149372863,
"grad_norm": 4.035106182098389,
"learning_rate": 3.940770134842172e-05,
"loss": 0.0312,
"step": 649
},
{
"epoch": 2.9578107183580387,
"grad_norm": 3.8275413513183594,
"learning_rate": 3.9263872673380356e-05,
"loss": 0.0242,
"step": 650
},
{
"epoch": 2.9623717217787915,
"grad_norm": 6.57460355758667,
"learning_rate": 3.912013707016748e-05,
"loss": 0.0362,
"step": 651
},
{
"epoch": 2.966932725199544,
"grad_norm": 2.4332528114318848,
"learning_rate": 3.897649578483163e-05,
"loss": 0.0124,
"step": 652
},
{
"epoch": 2.9714937286202963,
"grad_norm": 2.740542411804199,
"learning_rate": 3.883295006260366e-05,
"loss": 0.0141,
"step": 653
},
{
"epoch": 2.976054732041049,
"grad_norm": 4.145965576171875,
"learning_rate": 3.868950114788597e-05,
"loss": 0.0227,
"step": 654
},
{
"epoch": 2.9806157354618015,
"grad_norm": 3.8210062980651855,
"learning_rate": 3.8546150284241784e-05,
"loss": 0.0256,
"step": 655
},
{
"epoch": 2.9851767388825543,
"grad_norm": 2.960423469543457,
"learning_rate": 3.840289871438427e-05,
"loss": 0.0234,
"step": 656
},
{
"epoch": 2.9897377423033067,
"grad_norm": 2.305687189102173,
"learning_rate": 3.8259747680165835e-05,
"loss": 0.0167,
"step": 657
},
{
"epoch": 2.994298745724059,
"grad_norm": 3.4394383430480957,
"learning_rate": 3.811669842256733e-05,
"loss": 0.0188,
"step": 658
},
{
"epoch": 2.998859749144812,
"grad_norm": 4.973351955413818,
"learning_rate": 3.7973752181687335e-05,
"loss": 0.0154,
"step": 659
},
{
"epoch": 3.0,
"grad_norm": 8.8117036819458,
"learning_rate": 3.78309101967313e-05,
"loss": 0.0315,
"step": 660
},
{
"epoch": 3.0045610034207524,
"grad_norm": 2.305928945541382,
"learning_rate": 3.768817370600098e-05,
"loss": 0.0068,
"step": 661
},
{
"epoch": 3.009122006841505,
"grad_norm": 3.3252930641174316,
"learning_rate": 3.754554394688353e-05,
"loss": 0.0249,
"step": 662
},
{
"epoch": 3.0136830102622576,
"grad_norm": 2.8702948093414307,
"learning_rate": 3.740302215584083e-05,
"loss": 0.0141,
"step": 663
},
{
"epoch": 3.0182440136830104,
"grad_norm": 1.2875800132751465,
"learning_rate": 3.726060956839884e-05,
"loss": 0.0088,
"step": 664
},
{
"epoch": 3.022805017103763,
"grad_norm": 2.241077184677124,
"learning_rate": 3.7118307419136784e-05,
"loss": 0.0158,
"step": 665
},
{
"epoch": 3.027366020524515,
"grad_norm": 3.223511219024658,
"learning_rate": 3.697611694167652e-05,
"loss": 0.0172,
"step": 666
},
{
"epoch": 3.031927023945268,
"grad_norm": 1.6573007106781006,
"learning_rate": 3.683403936867179e-05,
"loss": 0.0061,
"step": 667
},
{
"epoch": 3.0364880273660204,
"grad_norm": 2.708397388458252,
"learning_rate": 3.6692075931797586e-05,
"loss": 0.0189,
"step": 668
},
{
"epoch": 3.0410490307867732,
"grad_norm": 1.0548919439315796,
"learning_rate": 3.6550227861739474e-05,
"loss": 0.0057,
"step": 669
},
{
"epoch": 3.0456100342075256,
"grad_norm": 2.2885780334472656,
"learning_rate": 3.640849638818286e-05,
"loss": 0.0114,
"step": 670
},
{
"epoch": 3.050171037628278,
"grad_norm": 2.5884759426116943,
"learning_rate": 3.6266882739802385e-05,
"loss": 0.0135,
"step": 671
},
{
"epoch": 3.054732041049031,
"grad_norm": 1.9603959321975708,
"learning_rate": 3.612538814425127e-05,
"loss": 0.0078,
"step": 672
},
{
"epoch": 3.0592930444697832,
"grad_norm": 1.8305693864822388,
"learning_rate": 3.598401382815062e-05,
"loss": 0.0191,
"step": 673
},
{
"epoch": 3.063854047890536,
"grad_norm": 6.010728359222412,
"learning_rate": 3.584276101707892e-05,
"loss": 0.0228,
"step": 674
},
{
"epoch": 3.0684150513112884,
"grad_norm": 0.7543220520019531,
"learning_rate": 3.570163093556123e-05,
"loss": 0.0049,
"step": 675
},
{
"epoch": 3.072976054732041,
"grad_norm": 3.3827199935913086,
"learning_rate": 3.556062480705871e-05,
"loss": 0.0205,
"step": 676
},
{
"epoch": 3.0775370581527937,
"grad_norm": 3.6950876712799072,
"learning_rate": 3.541974385395799e-05,
"loss": 0.013,
"step": 677
},
{
"epoch": 3.082098061573546,
"grad_norm": 2.4483461380004883,
"learning_rate": 3.527898929756049e-05,
"loss": 0.0112,
"step": 678
},
{
"epoch": 3.086659064994299,
"grad_norm": 1.083269715309143,
"learning_rate": 3.5138362358071955e-05,
"loss": 0.0051,
"step": 679
},
{
"epoch": 3.0912200684150513,
"grad_norm": 3.507185459136963,
"learning_rate": 3.4997864254591786e-05,
"loss": 0.0184,
"step": 680
},
{
"epoch": 3.095781071835804,
"grad_norm": 1.244012713432312,
"learning_rate": 3.4857496205102474e-05,
"loss": 0.0058,
"step": 681
},
{
"epoch": 3.1003420752565565,
"grad_norm": 1.7855976819992065,
"learning_rate": 3.47172594264591e-05,
"loss": 0.0081,
"step": 682
},
{
"epoch": 3.104903078677309,
"grad_norm": 3.3493175506591797,
"learning_rate": 3.457715513437878e-05,
"loss": 0.0099,
"step": 683
},
{
"epoch": 3.1094640820980617,
"grad_norm": 2.2510809898376465,
"learning_rate": 3.443718454343003e-05,
"loss": 0.0103,
"step": 684
},
{
"epoch": 3.114025085518814,
"grad_norm": 2.8858158588409424,
"learning_rate": 3.429734886702235e-05,
"loss": 0.0201,
"step": 685
},
{
"epoch": 3.118586088939567,
"grad_norm": 2.9078762531280518,
"learning_rate": 3.415764931739569e-05,
"loss": 0.0126,
"step": 686
},
{
"epoch": 3.1231470923603193,
"grad_norm": 11.870075225830078,
"learning_rate": 3.401808710560984e-05,
"loss": 0.0367,
"step": 687
},
{
"epoch": 3.1277080957810717,
"grad_norm": 2.278918504714966,
"learning_rate": 3.3878663441534074e-05,
"loss": 0.0131,
"step": 688
},
{
"epoch": 3.1322690992018245,
"grad_norm": 2.479530096054077,
"learning_rate": 3.3739379533836545e-05,
"loss": 0.0157,
"step": 689
},
{
"epoch": 3.136830102622577,
"grad_norm": 5.716989517211914,
"learning_rate": 3.360023658997387e-05,
"loss": 0.0121,
"step": 690
},
{
"epoch": 3.1413911060433297,
"grad_norm": 3.942920446395874,
"learning_rate": 3.346123581618064e-05,
"loss": 0.0119,
"step": 691
},
{
"epoch": 3.145952109464082,
"grad_norm": 3.193537712097168,
"learning_rate": 3.332237841745898e-05,
"loss": 0.0214,
"step": 692
},
{
"epoch": 3.1505131128848345,
"grad_norm": 6.671420574188232,
"learning_rate": 3.318366559756807e-05,
"loss": 0.0141,
"step": 693
},
{
"epoch": 3.1550741163055873,
"grad_norm": 1.345292329788208,
"learning_rate": 3.304509855901379e-05,
"loss": 0.0052,
"step": 694
},
{
"epoch": 3.1596351197263397,
"grad_norm": 1.9885728359222412,
"learning_rate": 3.290667850303816e-05,
"loss": 0.0139,
"step": 695
},
{
"epoch": 3.1641961231470925,
"grad_norm": 2.2747621536254883,
"learning_rate": 3.276840662960904e-05,
"loss": 0.0111,
"step": 696
},
{
"epoch": 3.168757126567845,
"grad_norm": 1.8984365463256836,
"learning_rate": 3.26302841374097e-05,
"loss": 0.0094,
"step": 697
},
{
"epoch": 3.1733181299885973,
"grad_norm": 1.0688873529434204,
"learning_rate": 3.2492312223828395e-05,
"loss": 0.0066,
"step": 698
},
{
"epoch": 3.17787913340935,
"grad_norm": 3.808332681655884,
"learning_rate": 3.235449208494804e-05,
"loss": 0.0267,
"step": 699
},
{
"epoch": 3.1824401368301025,
"grad_norm": 2.2355008125305176,
"learning_rate": 3.221682491553578e-05,
"loss": 0.0098,
"step": 700
},
{
"epoch": 3.1870011402508553,
"grad_norm": 2.873753547668457,
"learning_rate": 3.207931190903267e-05,
"loss": 0.0211,
"step": 701
},
{
"epoch": 3.1915621436716077,
"grad_norm": 1.0170818567276,
"learning_rate": 3.194195425754333e-05,
"loss": 0.0042,
"step": 702
},
{
"epoch": 3.19612314709236,
"grad_norm": 1.7715719938278198,
"learning_rate": 3.180475315182563e-05,
"loss": 0.0042,
"step": 703
},
{
"epoch": 3.200684150513113,
"grad_norm": 3.1615145206451416,
"learning_rate": 3.166770978128027e-05,
"loss": 0.0096,
"step": 704
},
{
"epoch": 3.2052451539338653,
"grad_norm": 1.4706147909164429,
"learning_rate": 3.1530825333940606e-05,
"loss": 0.0069,
"step": 705
},
{
"epoch": 3.209806157354618,
"grad_norm": 1.4655320644378662,
"learning_rate": 3.139410099646223e-05,
"loss": 0.007,
"step": 706
},
{
"epoch": 3.2143671607753705,
"grad_norm": 2.010169506072998,
"learning_rate": 3.1257537954112784e-05,
"loss": 0.0061,
"step": 707
},
{
"epoch": 3.2189281641961234,
"grad_norm": 2.4863297939300537,
"learning_rate": 3.112113739076161e-05,
"loss": 0.0135,
"step": 708
},
{
"epoch": 3.2234891676168758,
"grad_norm": 0.9651201963424683,
"learning_rate": 3.09849004888695e-05,
"loss": 0.0059,
"step": 709
},
{
"epoch": 3.228050171037628,
"grad_norm": 1.4260286092758179,
"learning_rate": 3.084882842947851e-05,
"loss": 0.0058,
"step": 710
},
{
"epoch": 3.232611174458381,
"grad_norm": 2.2575788497924805,
"learning_rate": 3.071292239220164e-05,
"loss": 0.0072,
"step": 711
},
{
"epoch": 3.2371721778791334,
"grad_norm": 2.315737247467041,
"learning_rate": 3.057718355521262e-05,
"loss": 0.0072,
"step": 712
},
{
"epoch": 3.241733181299886,
"grad_norm": 1.3366587162017822,
"learning_rate": 3.0441613095235755e-05,
"loss": 0.0079,
"step": 713
},
{
"epoch": 3.2462941847206386,
"grad_norm": 1.586226224899292,
"learning_rate": 3.0306212187535653e-05,
"loss": 0.0087,
"step": 714
},
{
"epoch": 3.250855188141391,
"grad_norm": 1.7931245565414429,
"learning_rate": 3.0170982005907066e-05,
"loss": 0.0077,
"step": 715
},
{
"epoch": 3.255416191562144,
"grad_norm": 3.0318455696105957,
"learning_rate": 3.003592372266476e-05,
"loss": 0.0055,
"step": 716
},
{
"epoch": 3.259977194982896,
"grad_norm": 1.366331934928894,
"learning_rate": 2.990103850863327e-05,
"loss": 0.0047,
"step": 717
},
{
"epoch": 3.264538198403649,
"grad_norm": 0.7723425030708313,
"learning_rate": 2.9766327533136774e-05,
"loss": 0.0049,
"step": 718
},
{
"epoch": 3.2690992018244014,
"grad_norm": 3.0485169887542725,
"learning_rate": 2.963179196398902e-05,
"loss": 0.007,
"step": 719
},
{
"epoch": 3.2736602052451538,
"grad_norm": 1.4431509971618652,
"learning_rate": 2.9497432967483124e-05,
"loss": 0.0051,
"step": 720
},
{
"epoch": 3.2782212086659066,
"grad_norm": 1.8810582160949707,
"learning_rate": 2.9363251708381477e-05,
"loss": 0.0069,
"step": 721
},
{
"epoch": 3.282782212086659,
"grad_norm": 0.9738725423812866,
"learning_rate": 2.9229249349905684e-05,
"loss": 0.0049,
"step": 722
},
{
"epoch": 3.287343215507412,
"grad_norm": 2.8499152660369873,
"learning_rate": 2.9095427053726442e-05,
"loss": 0.0093,
"step": 723
},
{
"epoch": 3.291904218928164,
"grad_norm": 2.7052316665649414,
"learning_rate": 2.896178597995347e-05,
"loss": 0.0109,
"step": 724
},
{
"epoch": 3.2964652223489166,
"grad_norm": 0.7764227986335754,
"learning_rate": 2.882832728712551e-05,
"loss": 0.0046,
"step": 725
},
{
"epoch": 3.3010262257696694,
"grad_norm": 2.247260570526123,
"learning_rate": 2.869505213220014e-05,
"loss": 0.0078,
"step": 726
},
{
"epoch": 3.305587229190422,
"grad_norm": 0.8493082523345947,
"learning_rate": 2.8561961670543995e-05,
"loss": 0.0049,
"step": 727
},
{
"epoch": 3.3101482326111746,
"grad_norm": 0.709747850894928,
"learning_rate": 2.8429057055922448e-05,
"loss": 0.0039,
"step": 728
},
{
"epoch": 3.314709236031927,
"grad_norm": 1.864812970161438,
"learning_rate": 2.8296339440489837e-05,
"loss": 0.0094,
"step": 729
},
{
"epoch": 3.3192702394526794,
"grad_norm": 2.6070640087127686,
"learning_rate": 2.8163809974779405e-05,
"loss": 0.0127,
"step": 730
},
{
"epoch": 3.3238312428734322,
"grad_norm": 0.6544182896614075,
"learning_rate": 2.8031469807693257e-05,
"loss": 0.0029,
"step": 731
},
{
"epoch": 3.3283922462941846,
"grad_norm": 2.191878318786621,
"learning_rate": 2.789932008649252e-05,
"loss": 0.0075,
"step": 732
},
{
"epoch": 3.3329532497149374,
"grad_norm": 1.73975670337677,
"learning_rate": 2.776736195678734e-05,
"loss": 0.0083,
"step": 733
},
{
"epoch": 3.33751425313569,
"grad_norm": 1.4151902198791504,
"learning_rate": 2.7635596562526865e-05,
"loss": 0.0071,
"step": 734
},
{
"epoch": 3.342075256556442,
"grad_norm": 2.084052562713623,
"learning_rate": 2.7504025045989577e-05,
"loss": 0.0098,
"step": 735
},
{
"epoch": 3.346636259977195,
"grad_norm": 1.386006474494934,
"learning_rate": 2.737264854777306e-05,
"loss": 0.0083,
"step": 736
},
{
"epoch": 3.3511972633979474,
"grad_norm": 2.138157606124878,
"learning_rate": 2.724146820678436e-05,
"loss": 0.0061,
"step": 737
},
{
"epoch": 3.3557582668187003,
"grad_norm": 0.871894896030426,
"learning_rate": 2.7110485160230037e-05,
"loss": 0.0047,
"step": 738
},
{
"epoch": 3.3603192702394526,
"grad_norm": 2.6974539756774902,
"learning_rate": 2.6979700543606245e-05,
"loss": 0.0063,
"step": 739
},
{
"epoch": 3.364880273660205,
"grad_norm": 1.140648603439331,
"learning_rate": 2.6849115490689013e-05,
"loss": 0.0046,
"step": 740
},
{
"epoch": 3.369441277080958,
"grad_norm": 5.994167804718018,
"learning_rate": 2.6718731133524265e-05,
"loss": 0.0116,
"step": 741
},
{
"epoch": 3.3740022805017102,
"grad_norm": 2.2735085487365723,
"learning_rate": 2.6588548602418156e-05,
"loss": 0.0092,
"step": 742
},
{
"epoch": 3.378563283922463,
"grad_norm": 1.1353788375854492,
"learning_rate": 2.6458569025927183e-05,
"loss": 0.0038,
"step": 743
},
{
"epoch": 3.3831242873432155,
"grad_norm": 1.2895104885101318,
"learning_rate": 2.6328793530848405e-05,
"loss": 0.0053,
"step": 744
},
{
"epoch": 3.387685290763968,
"grad_norm": 1.2267273664474487,
"learning_rate": 2.6199223242209747e-05,
"loss": 0.0054,
"step": 745
},
{
"epoch": 3.3922462941847207,
"grad_norm": 1.287307858467102,
"learning_rate": 2.6069859283260097e-05,
"loss": 0.005,
"step": 746
},
{
"epoch": 3.396807297605473,
"grad_norm": 0.8949470520019531,
"learning_rate": 2.5940702775459747e-05,
"loss": 0.0046,
"step": 747
},
{
"epoch": 3.401368301026226,
"grad_norm": 3.3357739448547363,
"learning_rate": 2.5811754838470583e-05,
"loss": 0.0076,
"step": 748
},
{
"epoch": 3.4059293044469783,
"grad_norm": 7.779567718505859,
"learning_rate": 2.5683016590146318e-05,
"loss": 0.0072,
"step": 749
},
{
"epoch": 3.4104903078677307,
"grad_norm": 2.3621115684509277,
"learning_rate": 2.5554489146522958e-05,
"loss": 0.0091,
"step": 750
},
{
"epoch": 3.4150513112884835,
"grad_norm": 1.8572039604187012,
"learning_rate": 2.542617362180893e-05,
"loss": 0.0074,
"step": 751
},
{
"epoch": 3.419612314709236,
"grad_norm": 2.4851315021514893,
"learning_rate": 2.5298071128375644e-05,
"loss": 0.0045,
"step": 752
},
{
"epoch": 3.4241733181299887,
"grad_norm": 2.945042371749878,
"learning_rate": 2.5170182776747687e-05,
"loss": 0.009,
"step": 753
},
{
"epoch": 3.428734321550741,
"grad_norm": 2.0146474838256836,
"learning_rate": 2.5042509675593195e-05,
"loss": 0.0115,
"step": 754
},
{
"epoch": 3.433295324971494,
"grad_norm": 0.28727975487709045,
"learning_rate": 2.491505293171438e-05,
"loss": 0.0022,
"step": 755
},
{
"epoch": 3.4378563283922463,
"grad_norm": 4.880654335021973,
"learning_rate": 2.478781365003775e-05,
"loss": 0.0218,
"step": 756
},
{
"epoch": 3.4424173318129987,
"grad_norm": 0.8212169408798218,
"learning_rate": 2.46607929336047e-05,
"loss": 0.0037,
"step": 757
},
{
"epoch": 3.4469783352337515,
"grad_norm": 2.413165330886841,
"learning_rate": 2.4533991883561868e-05,
"loss": 0.0115,
"step": 758
},
{
"epoch": 3.451539338654504,
"grad_norm": 1.3072090148925781,
"learning_rate": 2.440741159915153e-05,
"loss": 0.0057,
"step": 759
},
{
"epoch": 3.4561003420752567,
"grad_norm": 2.672323703765869,
"learning_rate": 2.4281053177702256e-05,
"loss": 0.0105,
"step": 760
},
{
"epoch": 3.460661345496009,
"grad_norm": 1.8092498779296875,
"learning_rate": 2.4154917714619164e-05,
"loss": 0.0055,
"step": 761
},
{
"epoch": 3.4652223489167615,
"grad_norm": 2.4887306690216064,
"learning_rate": 2.40290063033746e-05,
"loss": 0.0068,
"step": 762
},
{
"epoch": 3.4697833523375143,
"grad_norm": 1.1760387420654297,
"learning_rate": 2.3903320035498605e-05,
"loss": 0.0049,
"step": 763
},
{
"epoch": 3.4743443557582667,
"grad_norm": 1.3489493131637573,
"learning_rate": 2.3777860000569384e-05,
"loss": 0.004,
"step": 764
},
{
"epoch": 3.4789053591790196,
"grad_norm": 4.36347770690918,
"learning_rate": 2.365262728620398e-05,
"loss": 0.0042,
"step": 765
},
{
"epoch": 3.483466362599772,
"grad_norm": 0.5808774828910828,
"learning_rate": 2.352762297804879e-05,
"loss": 0.0029,
"step": 766
},
{
"epoch": 3.4880273660205243,
"grad_norm": 1.9665743112564087,
"learning_rate": 2.340284815977007e-05,
"loss": 0.0088,
"step": 767
},
{
"epoch": 3.492588369441277,
"grad_norm": 5.437419891357422,
"learning_rate": 2.327830391304475e-05,
"loss": 0.0414,
"step": 768
},
{
"epoch": 3.4971493728620295,
"grad_norm": 0.9246713519096375,
"learning_rate": 2.315399131755081e-05,
"loss": 0.0055,
"step": 769
},
{
"epoch": 3.5017103762827824,
"grad_norm": 1.88973069190979,
"learning_rate": 2.3029911450958113e-05,
"loss": 0.007,
"step": 770
},
{
"epoch": 3.5062713797035348,
"grad_norm": 5.172039985656738,
"learning_rate": 2.2906065388918934e-05,
"loss": 0.0104,
"step": 771
},
{
"epoch": 3.5108323831242876,
"grad_norm": 2.3819446563720703,
"learning_rate": 2.278245420505873e-05,
"loss": 0.0125,
"step": 772
},
{
"epoch": 3.51539338654504,
"grad_norm": 0.6151133179664612,
"learning_rate": 2.2659078970966784e-05,
"loss": 0.0036,
"step": 773
},
{
"epoch": 3.5199543899657924,
"grad_norm": 1.6557809114456177,
"learning_rate": 2.2535940756186897e-05,
"loss": 0.0095,
"step": 774
},
{
"epoch": 3.524515393386545,
"grad_norm": 1.1430513858795166,
"learning_rate": 2.2413040628208165e-05,
"loss": 0.0049,
"step": 775
},
{
"epoch": 3.5290763968072976,
"grad_norm": 1.5872365236282349,
"learning_rate": 2.22903796524557e-05,
"loss": 0.0045,
"step": 776
},
{
"epoch": 3.5336374002280504,
"grad_norm": 1.7567164897918701,
"learning_rate": 2.2167958892281404e-05,
"loss": 0.0072,
"step": 777
},
{
"epoch": 3.538198403648803,
"grad_norm": 3.414562702178955,
"learning_rate": 2.2045779408954738e-05,
"loss": 0.015,
"step": 778
},
{
"epoch": 3.542759407069555,
"grad_norm": 1.1695743799209595,
"learning_rate": 2.192384226165349e-05,
"loss": 0.0063,
"step": 779
},
{
"epoch": 3.547320410490308,
"grad_norm": 1.2887126207351685,
"learning_rate": 2.180214850745467e-05,
"loss": 0.0042,
"step": 780
},
{
"epoch": 3.5518814139110604,
"grad_norm": 6.278741359710693,
"learning_rate": 2.1680699201325326e-05,
"loss": 0.0096,
"step": 781
},
{
"epoch": 3.556442417331813,
"grad_norm": 1.6705055236816406,
"learning_rate": 2.1559495396113307e-05,
"loss": 0.0064,
"step": 782
},
{
"epoch": 3.5610034207525656,
"grad_norm": 1.6865471601486206,
"learning_rate": 2.1438538142538273e-05,
"loss": 0.0066,
"step": 783
},
{
"epoch": 3.565564424173318,
"grad_norm": 1.117870569229126,
"learning_rate": 2.131782848918245e-05,
"loss": 0.0039,
"step": 784
},
{
"epoch": 3.570125427594071,
"grad_norm": 0.6459174752235413,
"learning_rate": 2.119736748248172e-05,
"loss": 0.0031,
"step": 785
},
{
"epoch": 3.574686431014823,
"grad_norm": 2.9996261596679688,
"learning_rate": 2.1077156166716323e-05,
"loss": 0.007,
"step": 786
},
{
"epoch": 3.579247434435576,
"grad_norm": 2.0634710788726807,
"learning_rate": 2.0957195584001986e-05,
"loss": 0.0076,
"step": 787
},
{
"epoch": 3.5838084378563284,
"grad_norm": 3.3125040531158447,
"learning_rate": 2.083748677428083e-05,
"loss": 0.0137,
"step": 788
},
{
"epoch": 3.588369441277081,
"grad_norm": 4.13361120223999,
"learning_rate": 2.0718030775312285e-05,
"loss": 0.0193,
"step": 789
},
{
"epoch": 3.5929304446978336,
"grad_norm": 1.7809717655181885,
"learning_rate": 2.0598828622664213e-05,
"loss": 0.006,
"step": 790
},
{
"epoch": 3.597491448118586,
"grad_norm": 12.555645942687988,
"learning_rate": 2.0479881349703883e-05,
"loss": 0.0169,
"step": 791
},
{
"epoch": 3.602052451539339,
"grad_norm": 2.071539878845215,
"learning_rate": 2.0361189987588918e-05,
"loss": 0.0072,
"step": 792
},
{
"epoch": 3.6066134549600912,
"grad_norm": 1.3284027576446533,
"learning_rate": 2.024275556525858e-05,
"loss": 0.006,
"step": 793
},
{
"epoch": 3.6111744583808436,
"grad_norm": 0.8479968905448914,
"learning_rate": 2.012457910942458e-05,
"loss": 0.0039,
"step": 794
},
{
"epoch": 3.6157354618015964,
"grad_norm": 2.979787826538086,
"learning_rate": 2.0006661644562375e-05,
"loss": 0.0092,
"step": 795
},
{
"epoch": 3.620296465222349,
"grad_norm": 0.8467869758605957,
"learning_rate": 1.988900419290224e-05,
"loss": 0.0033,
"step": 796
},
{
"epoch": 3.6248574686431017,
"grad_norm": 0.7672457098960876,
"learning_rate": 1.9771607774420307e-05,
"loss": 0.0038,
"step": 797
},
{
"epoch": 3.629418472063854,
"grad_norm": 1.3988791704177856,
"learning_rate": 1.9654473406829903e-05,
"loss": 0.0059,
"step": 798
},
{
"epoch": 3.6339794754846064,
"grad_norm": 1.741504430770874,
"learning_rate": 1.953760210557254e-05,
"loss": 0.0096,
"step": 799
},
{
"epoch": 3.6385404789053593,
"grad_norm": 4.00645637512207,
"learning_rate": 1.942099488380923e-05,
"loss": 0.0098,
"step": 800
},
{
"epoch": 3.6431014823261116,
"grad_norm": 0.9082571864128113,
"learning_rate": 1.9304652752411734e-05,
"loss": 0.0038,
"step": 801
},
{
"epoch": 3.6476624857468645,
"grad_norm": 0.9427306652069092,
"learning_rate": 1.9188576719953633e-05,
"loss": 0.0051,
"step": 802
},
{
"epoch": 3.652223489167617,
"grad_norm": 0.5667453408241272,
"learning_rate": 1.9072767792701768e-05,
"loss": 0.0029,
"step": 803
},
{
"epoch": 3.6567844925883692,
"grad_norm": 1.584375262260437,
"learning_rate": 1.895722697460737e-05,
"loss": 0.0042,
"step": 804
},
{
"epoch": 3.661345496009122,
"grad_norm": 4.02076530456543,
"learning_rate": 1.884195526729748e-05,
"loss": 0.0092,
"step": 805
},
{
"epoch": 3.6659064994298745,
"grad_norm": 2.488560438156128,
"learning_rate": 1.8726953670066193e-05,
"loss": 0.0041,
"step": 806
},
{
"epoch": 3.6704675028506273,
"grad_norm": 1.546533226966858,
"learning_rate": 1.861222317986598e-05,
"loss": 0.0074,
"step": 807
},
{
"epoch": 3.6750285062713797,
"grad_norm": 2.984379529953003,
"learning_rate": 1.8497764791299117e-05,
"loss": 0.0091,
"step": 808
},
{
"epoch": 3.679589509692132,
"grad_norm": 1.3167816400527954,
"learning_rate": 1.8383579496609004e-05,
"loss": 0.0063,
"step": 809
},
{
"epoch": 3.684150513112885,
"grad_norm": 2.830770492553711,
"learning_rate": 1.8269668285671587e-05,
"loss": 0.0059,
"step": 810
},
{
"epoch": 3.6887115165336373,
"grad_norm": 1.9150404930114746,
"learning_rate": 1.8156032145986784e-05,
"loss": 0.003,
"step": 811
},
{
"epoch": 3.69327251995439,
"grad_norm": 0.5943277478218079,
"learning_rate": 1.8042672062669863e-05,
"loss": 0.0028,
"step": 812
},
{
"epoch": 3.6978335233751425,
"grad_norm": 0.7251271605491638,
"learning_rate": 1.7929589018443016e-05,
"loss": 0.004,
"step": 813
},
{
"epoch": 3.702394526795895,
"grad_norm": 0.5687354207038879,
"learning_rate": 1.7816783993626712e-05,
"loss": 0.0032,
"step": 814
},
{
"epoch": 3.7069555302166477,
"grad_norm": 0.7953961491584778,
"learning_rate": 1.7704257966131304e-05,
"loss": 0.0041,
"step": 815
},
{
"epoch": 3.7115165336374,
"grad_norm": 1.1048918962478638,
"learning_rate": 1.759201191144852e-05,
"loss": 0.0035,
"step": 816
},
{
"epoch": 3.716077537058153,
"grad_norm": 1.2720906734466553,
"learning_rate": 1.7480046802642906e-05,
"loss": 0.0043,
"step": 817
},
{
"epoch": 3.7206385404789053,
"grad_norm": 4.515064239501953,
"learning_rate": 1.7368363610343617e-05,
"loss": 0.009,
"step": 818
},
{
"epoch": 3.7251995438996577,
"grad_norm": 2.149350166320801,
"learning_rate": 1.725696330273575e-05,
"loss": 0.007,
"step": 819
},
{
"epoch": 3.7297605473204105,
"grad_norm": 0.4950422942638397,
"learning_rate": 1.714584684555211e-05,
"loss": 0.0028,
"step": 820
},
{
"epoch": 3.734321550741163,
"grad_norm": 1.4185466766357422,
"learning_rate": 1.703501520206482e-05,
"loss": 0.005,
"step": 821
},
{
"epoch": 3.7388825541619157,
"grad_norm": 3.5513916015625,
"learning_rate": 1.692446933307687e-05,
"loss": 0.0053,
"step": 822
},
{
"epoch": 3.743443557582668,
"grad_norm": 2.450361728668213,
"learning_rate": 1.6814210196913927e-05,
"loss": 0.0083,
"step": 823
},
{
"epoch": 3.7480045610034205,
"grad_norm": 0.6304205656051636,
"learning_rate": 1.6704238749415957e-05,
"loss": 0.0042,
"step": 824
},
{
"epoch": 3.7525655644241733,
"grad_norm": 2.1942145824432373,
"learning_rate": 1.6594555943928887e-05,
"loss": 0.0046,
"step": 825
},
{
"epoch": 3.757126567844926,
"grad_norm": 0.7358046770095825,
"learning_rate": 1.6485162731296495e-05,
"loss": 0.0027,
"step": 826
},
{
"epoch": 3.7616875712656785,
"grad_norm": 3.021874189376831,
"learning_rate": 1.6376060059851963e-05,
"loss": 0.0092,
"step": 827
},
{
"epoch": 3.766248574686431,
"grad_norm": 0.618632435798645,
"learning_rate": 1.6267248875409835e-05,
"loss": 0.0033,
"step": 828
},
{
"epoch": 3.7708095781071833,
"grad_norm": 1.160000205039978,
"learning_rate": 1.6158730121257737e-05,
"loss": 0.0047,
"step": 829
},
{
"epoch": 3.775370581527936,
"grad_norm": 1.2812681198120117,
"learning_rate": 1.6050504738148152e-05,
"loss": 0.005,
"step": 830
},
{
"epoch": 3.779931584948689,
"grad_norm": 1.2367734909057617,
"learning_rate": 1.5942573664290412e-05,
"loss": 0.0058,
"step": 831
},
{
"epoch": 3.7844925883694414,
"grad_norm": 1.0198251008987427,
"learning_rate": 1.5834937835342366e-05,
"loss": 0.0039,
"step": 832
},
{
"epoch": 3.7890535917901937,
"grad_norm": 0.9254816174507141,
"learning_rate": 1.5727598184402464e-05,
"loss": 0.003,
"step": 833
},
{
"epoch": 3.7936145952109466,
"grad_norm": 1.487305998802185,
"learning_rate": 1.562055564200154e-05,
"loss": 0.0049,
"step": 834
},
{
"epoch": 3.798175598631699,
"grad_norm": 0.8291229009628296,
"learning_rate": 1.5513811136094787e-05,
"loss": 0.0042,
"step": 835
},
{
"epoch": 3.802736602052452,
"grad_norm": 0.4022439122200012,
"learning_rate": 1.5407365592053735e-05,
"loss": 0.0023,
"step": 836
},
{
"epoch": 3.807297605473204,
"grad_norm": 1.640064001083374,
"learning_rate": 1.5301219932658156e-05,
"loss": 0.0057,
"step": 837
},
{
"epoch": 3.8118586088939566,
"grad_norm": 1.2258837223052979,
"learning_rate": 1.5195375078088147e-05,
"loss": 0.0034,
"step": 838
},
{
"epoch": 3.8164196123147094,
"grad_norm": 1.264310359954834,
"learning_rate": 1.5089831945916133e-05,
"loss": 0.0057,
"step": 839
},
{
"epoch": 3.8209806157354618,
"grad_norm": 2.701324939727783,
"learning_rate": 1.4984591451098845e-05,
"loss": 0.0086,
"step": 840
},
{
"epoch": 3.8255416191562146,
"grad_norm": 3.1908340454101562,
"learning_rate": 1.4879654505969498e-05,
"loss": 0.0085,
"step": 841
},
{
"epoch": 3.830102622576967,
"grad_norm": 0.7210344672203064,
"learning_rate": 1.4775022020229756e-05,
"loss": 0.0028,
"step": 842
},
{
"epoch": 3.8346636259977194,
"grad_norm": 2.616572380065918,
"learning_rate": 1.4670694900942005e-05,
"loss": 0.0069,
"step": 843
},
{
"epoch": 3.839224629418472,
"grad_norm": 1.0061376094818115,
"learning_rate": 1.4566674052521357e-05,
"loss": 0.0035,
"step": 844
},
{
"epoch": 3.8437856328392246,
"grad_norm": 0.5519426465034485,
"learning_rate": 1.4462960376727813e-05,
"loss": 0.0025,
"step": 845
},
{
"epoch": 3.8483466362599774,
"grad_norm": 1.0319393873214722,
"learning_rate": 1.4359554772658552e-05,
"loss": 0.0037,
"step": 846
},
{
"epoch": 3.85290763968073,
"grad_norm": 5.387468338012695,
"learning_rate": 1.4256458136739998e-05,
"loss": 0.0131,
"step": 847
},
{
"epoch": 3.857468643101482,
"grad_norm": 1.1421221494674683,
"learning_rate": 1.415367136272019e-05,
"loss": 0.0038,
"step": 848
},
{
"epoch": 3.862029646522235,
"grad_norm": 1.2745294570922852,
"learning_rate": 1.4051195341660939e-05,
"loss": 0.0063,
"step": 849
},
{
"epoch": 3.8665906499429874,
"grad_norm": 3.796590805053711,
"learning_rate": 1.3949030961930077e-05,
"loss": 0.0102,
"step": 850
},
{
"epoch": 3.8711516533637402,
"grad_norm": 1.4126759767532349,
"learning_rate": 1.3847179109193925e-05,
"loss": 0.0043,
"step": 851
},
{
"epoch": 3.8757126567844926,
"grad_norm": 0.6939073801040649,
"learning_rate": 1.374564066640937e-05,
"loss": 0.0038,
"step": 852
},
{
"epoch": 3.880273660205245,
"grad_norm": 0.8315445780754089,
"learning_rate": 1.3644416513816416e-05,
"loss": 0.0051,
"step": 853
},
{
"epoch": 3.884834663625998,
"grad_norm": 0.7290100455284119,
"learning_rate": 1.3543507528930472e-05,
"loss": 0.0035,
"step": 854
},
{
"epoch": 3.88939566704675,
"grad_norm": 3.279763698577881,
"learning_rate": 1.3442914586534688e-05,
"loss": 0.0112,
"step": 855
},
{
"epoch": 3.893956670467503,
"grad_norm": 0.7284213900566101,
"learning_rate": 1.3342638558672504e-05,
"loss": 0.0032,
"step": 856
},
{
"epoch": 3.8985176738882554,
"grad_norm": 1.461885690689087,
"learning_rate": 1.3242680314639993e-05,
"loss": 0.0046,
"step": 857
},
{
"epoch": 3.903078677309008,
"grad_norm": 0.36676260828971863,
"learning_rate": 1.31430407209783e-05,
"loss": 0.0023,
"step": 858
},
{
"epoch": 3.9076396807297606,
"grad_norm": 0.5633653402328491,
"learning_rate": 1.3043720641466289e-05,
"loss": 0.0025,
"step": 859
},
{
"epoch": 3.912200684150513,
"grad_norm": 0.9612581133842468,
"learning_rate": 1.2944720937112836e-05,
"loss": 0.0023,
"step": 860
},
{
"epoch": 3.916761687571266,
"grad_norm": 2.462862491607666,
"learning_rate": 1.284604246614955e-05,
"loss": 0.0048,
"step": 861
},
{
"epoch": 3.9213226909920182,
"grad_norm": 0.6819082498550415,
"learning_rate": 1.2747686084023192e-05,
"loss": 0.003,
"step": 862
},
{
"epoch": 3.9258836944127706,
"grad_norm": 0.4367881417274475,
"learning_rate": 1.2649652643388382e-05,
"loss": 0.0024,
"step": 863
},
{
"epoch": 3.9304446978335235,
"grad_norm": 0.6573525667190552,
"learning_rate": 1.2551942994100136e-05,
"loss": 0.0029,
"step": 864
},
{
"epoch": 3.935005701254276,
"grad_norm": 1.2858328819274902,
"learning_rate": 1.2454557983206477e-05,
"loss": 0.0039,
"step": 865
},
{
"epoch": 3.9395667046750287,
"grad_norm": 0.46625810861587524,
"learning_rate": 1.2357498454941175e-05,
"loss": 0.0029,
"step": 866
},
{
"epoch": 3.944127708095781,
"grad_norm": 0.39558151364326477,
"learning_rate": 1.2260765250716356e-05,
"loss": 0.0021,
"step": 867
},
{
"epoch": 3.9486887115165334,
"grad_norm": 0.9131205677986145,
"learning_rate": 1.2164359209115234e-05,
"loss": 0.0032,
"step": 868
},
{
"epoch": 3.9532497149372863,
"grad_norm": 0.8359415531158447,
"learning_rate": 1.2068281165884864e-05,
"loss": 0.0038,
"step": 869
},
{
"epoch": 3.9578107183580387,
"grad_norm": 0.5636485815048218,
"learning_rate": 1.1972531953928823e-05,
"loss": 0.0025,
"step": 870
},
{
"epoch": 3.9623717217787915,
"grad_norm": 1.0502080917358398,
"learning_rate": 1.1877112403300079e-05,
"loss": 0.0032,
"step": 871
},
{
"epoch": 3.966932725199544,
"grad_norm": 1.4225945472717285,
"learning_rate": 1.1782023341193754e-05,
"loss": 0.005,
"step": 872
},
{
"epoch": 3.9714937286202963,
"grad_norm": 0.8865280747413635,
"learning_rate": 1.1687265591939927e-05,
"loss": 0.0036,
"step": 873
},
{
"epoch": 3.976054732041049,
"grad_norm": 0.7002689242362976,
"learning_rate": 1.1592839976996555e-05,
"loss": 0.0034,
"step": 874
},
{
"epoch": 3.9806157354618015,
"grad_norm": 1.3850862979888916,
"learning_rate": 1.1498747314942255e-05,
"loss": 0.0052,
"step": 875
},
{
"epoch": 3.9851767388825543,
"grad_norm": 1.3568379878997803,
"learning_rate": 1.1404988421469348e-05,
"loss": 0.0037,
"step": 876
},
{
"epoch": 3.9897377423033067,
"grad_norm": 0.815382719039917,
"learning_rate": 1.1311564109376621e-05,
"loss": 0.0036,
"step": 877
},
{
"epoch": 3.994298745724059,
"grad_norm": 2.1018550395965576,
"learning_rate": 1.121847518856241e-05,
"loss": 0.0048,
"step": 878
},
{
"epoch": 3.998859749144812,
"grad_norm": 0.6212396025657654,
"learning_rate": 1.1125722466017547e-05,
"loss": 0.0025,
"step": 879
},
{
"epoch": 4.0,
"grad_norm": 0.9469783306121826,
"learning_rate": 1.1033306745818283e-05,
"loss": 0.0028,
"step": 880
},
{
"epoch": 4.004561003420752,
"grad_norm": 0.6264204382896423,
"learning_rate": 1.0941228829119453e-05,
"loss": 0.0033,
"step": 881
},
{
"epoch": 4.009122006841505,
"grad_norm": 0.27060914039611816,
"learning_rate": 1.0849489514147459e-05,
"loss": 0.0016,
"step": 882
},
{
"epoch": 4.013683010262258,
"grad_norm": 0.47667694091796875,
"learning_rate": 1.0758089596193282e-05,
"loss": 0.0028,
"step": 883
},
{
"epoch": 4.01824401368301,
"grad_norm": 0.9456324577331543,
"learning_rate": 1.066702986760577e-05,
"loss": 0.0029,
"step": 884
},
{
"epoch": 4.022805017103763,
"grad_norm": 1.1689949035644531,
"learning_rate": 1.057631111778456e-05,
"loss": 0.0034,
"step": 885
},
{
"epoch": 4.027366020524515,
"grad_norm": 0.25507575273513794,
"learning_rate": 1.0485934133173387e-05,
"loss": 0.002,
"step": 886
},
{
"epoch": 4.031927023945268,
"grad_norm": 0.37119409441947937,
"learning_rate": 1.0395899697253208e-05,
"loss": 0.0022,
"step": 887
},
{
"epoch": 4.036488027366021,
"grad_norm": 0.3143307864665985,
"learning_rate": 1.0306208590535382e-05,
"loss": 0.0021,
"step": 888
},
{
"epoch": 4.041049030786773,
"grad_norm": 0.3163100481033325,
"learning_rate": 1.0216861590554983e-05,
"loss": 0.0021,
"step": 889
},
{
"epoch": 4.045610034207526,
"grad_norm": 0.4894810914993286,
"learning_rate": 1.012785947186397e-05,
"loss": 0.0024,
"step": 890
},
{
"epoch": 4.050171037628278,
"grad_norm": 0.3152639865875244,
"learning_rate": 1.0039203006024527e-05,
"loss": 0.0025,
"step": 891
},
{
"epoch": 4.05473204104903,
"grad_norm": 1.619958519935608,
"learning_rate": 9.95089296160241e-06,
"loss": 0.0054,
"step": 892
},
{
"epoch": 4.059293044469784,
"grad_norm": 0.5192055106163025,
"learning_rate": 9.862930104160162e-06,
"loss": 0.0027,
"step": 893
},
{
"epoch": 4.063854047890536,
"grad_norm": 2.080965518951416,
"learning_rate": 9.775315196250612e-06,
"loss": 0.0042,
"step": 894
},
{
"epoch": 4.068415051311288,
"grad_norm": 0.3620293140411377,
"learning_rate": 9.688048997410143e-06,
"loss": 0.0022,
"step": 895
},
{
"epoch": 4.072976054732041,
"grad_norm": 0.37922972440719604,
"learning_rate": 9.601132264152223e-06,
"loss": 0.002,
"step": 896
},
{
"epoch": 4.077537058152793,
"grad_norm": 0.3833453059196472,
"learning_rate": 9.51456574996078e-06,
"loss": 0.0022,
"step": 897
},
{
"epoch": 4.0820980615735465,
"grad_norm": 0.41054201126098633,
"learning_rate": 9.428350205283648e-06,
"loss": 0.0024,
"step": 898
},
{
"epoch": 4.086659064994299,
"grad_norm": 0.3207608461380005,
"learning_rate": 9.342486377526133e-06,
"loss": 0.002,
"step": 899
},
{
"epoch": 4.091220068415051,
"grad_norm": 0.4284456968307495,
"learning_rate": 9.256975011044483e-06,
"loss": 0.0022,
"step": 900
}
],
"logging_steps": 1.0,
"max_steps": 1100,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.659402853631918e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}