{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 816,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0024509803921568627,
"grad_norm": 8.812210083007812,
"learning_rate": 1.6260162601626018e-07,
"loss": 2.2606,
"step": 1
},
{
"epoch": 0.004901960784313725,
"grad_norm": 8.948471069335938,
"learning_rate": 3.2520325203252037e-07,
"loss": 2.2717,
"step": 2
},
{
"epoch": 0.007352941176470588,
"grad_norm": 9.580260276794434,
"learning_rate": 4.878048780487805e-07,
"loss": 2.189,
"step": 3
},
{
"epoch": 0.00980392156862745,
"grad_norm": 8.850502967834473,
"learning_rate": 6.504065040650407e-07,
"loss": 2.3295,
"step": 4
},
{
"epoch": 0.012254901960784314,
"grad_norm": 9.253467559814453,
"learning_rate": 8.130081300813009e-07,
"loss": 2.2149,
"step": 5
},
{
"epoch": 0.014705882352941176,
"grad_norm": 8.920105934143066,
"learning_rate": 9.75609756097561e-07,
"loss": 2.3054,
"step": 6
},
{
"epoch": 0.01715686274509804,
"grad_norm": 8.572409629821777,
"learning_rate": 1.1382113821138213e-06,
"loss": 2.2113,
"step": 7
},
{
"epoch": 0.0196078431372549,
"grad_norm": 9.019370079040527,
"learning_rate": 1.3008130081300815e-06,
"loss": 2.2433,
"step": 8
},
{
"epoch": 0.022058823529411766,
"grad_norm": 7.8791937828063965,
"learning_rate": 1.4634146341463414e-06,
"loss": 2.1095,
"step": 9
},
{
"epoch": 0.024509803921568627,
"grad_norm": 7.617267608642578,
"learning_rate": 1.6260162601626018e-06,
"loss": 2.1689,
"step": 10
},
{
"epoch": 0.02696078431372549,
"grad_norm": 7.418929576873779,
"learning_rate": 1.788617886178862e-06,
"loss": 2.1149,
"step": 11
},
{
"epoch": 0.029411764705882353,
"grad_norm": 6.207332611083984,
"learning_rate": 1.951219512195122e-06,
"loss": 1.9281,
"step": 12
},
{
"epoch": 0.031862745098039214,
"grad_norm": 5.373957633972168,
"learning_rate": 2.1138211382113824e-06,
"loss": 1.9441,
"step": 13
},
{
"epoch": 0.03431372549019608,
"grad_norm": 5.035505771636963,
"learning_rate": 2.2764227642276426e-06,
"loss": 1.9082,
"step": 14
},
{
"epoch": 0.03676470588235294,
"grad_norm": 5.093581676483154,
"learning_rate": 2.4390243902439027e-06,
"loss": 1.8668,
"step": 15
},
{
"epoch": 0.0392156862745098,
"grad_norm": 4.3776469230651855,
"learning_rate": 2.601626016260163e-06,
"loss": 1.603,
"step": 16
},
{
"epoch": 0.041666666666666664,
"grad_norm": 4.985185623168945,
"learning_rate": 2.764227642276423e-06,
"loss": 1.6327,
"step": 17
},
{
"epoch": 0.04411764705882353,
"grad_norm": 4.654208183288574,
"learning_rate": 2.926829268292683e-06,
"loss": 1.5797,
"step": 18
},
{
"epoch": 0.04656862745098039,
"grad_norm": 3.7216811180114746,
"learning_rate": 3.0894308943089435e-06,
"loss": 1.5041,
"step": 19
},
{
"epoch": 0.049019607843137254,
"grad_norm": 3.5889596939086914,
"learning_rate": 3.2520325203252037e-06,
"loss": 1.4872,
"step": 20
},
{
"epoch": 0.051470588235294115,
"grad_norm": 3.119215488433838,
"learning_rate": 3.414634146341464e-06,
"loss": 1.3953,
"step": 21
},
{
"epoch": 0.05392156862745098,
"grad_norm": 4.156096935272217,
"learning_rate": 3.577235772357724e-06,
"loss": 1.2343,
"step": 22
},
{
"epoch": 0.056372549019607844,
"grad_norm": 3.7410824298858643,
"learning_rate": 3.7398373983739838e-06,
"loss": 1.242,
"step": 23
},
{
"epoch": 0.058823529411764705,
"grad_norm": 3.6227011680603027,
"learning_rate": 3.902439024390244e-06,
"loss": 1.1223,
"step": 24
},
{
"epoch": 0.061274509803921566,
"grad_norm": 3.0835347175598145,
"learning_rate": 4.0650406504065046e-06,
"loss": 1.0941,
"step": 25
},
{
"epoch": 0.06372549019607843,
"grad_norm": 2.975940465927124,
"learning_rate": 4.227642276422765e-06,
"loss": 1.0424,
"step": 26
},
{
"epoch": 0.0661764705882353,
"grad_norm": 2.3432960510253906,
"learning_rate": 4.390243902439025e-06,
"loss": 1.0583,
"step": 27
},
{
"epoch": 0.06862745098039216,
"grad_norm": 3.1780247688293457,
"learning_rate": 4.552845528455285e-06,
"loss": 0.9739,
"step": 28
},
{
"epoch": 0.07107843137254902,
"grad_norm": 2.5403900146484375,
"learning_rate": 4.715447154471545e-06,
"loss": 0.9267,
"step": 29
},
{
"epoch": 0.07352941176470588,
"grad_norm": 3.0422229766845703,
"learning_rate": 4.8780487804878055e-06,
"loss": 0.9596,
"step": 30
},
{
"epoch": 0.07598039215686274,
"grad_norm": 2.9660439491271973,
"learning_rate": 5.040650406504065e-06,
"loss": 0.8717,
"step": 31
},
{
"epoch": 0.0784313725490196,
"grad_norm": 2.1265180110931396,
"learning_rate": 5.203252032520326e-06,
"loss": 0.8358,
"step": 32
},
{
"epoch": 0.08088235294117647,
"grad_norm": 2.330235719680786,
"learning_rate": 5.365853658536586e-06,
"loss": 0.8244,
"step": 33
},
{
"epoch": 0.08333333333333333,
"grad_norm": 2.4875876903533936,
"learning_rate": 5.528455284552846e-06,
"loss": 0.7916,
"step": 34
},
{
"epoch": 0.0857843137254902,
"grad_norm": 1.7746851444244385,
"learning_rate": 5.691056910569106e-06,
"loss": 0.7284,
"step": 35
},
{
"epoch": 0.08823529411764706,
"grad_norm": 2.7690117359161377,
"learning_rate": 5.853658536585366e-06,
"loss": 0.8039,
"step": 36
},
{
"epoch": 0.09068627450980392,
"grad_norm": 2.3365156650543213,
"learning_rate": 6.016260162601627e-06,
"loss": 0.7148,
"step": 37
},
{
"epoch": 0.09313725490196079,
"grad_norm": 1.7329320907592773,
"learning_rate": 6.178861788617887e-06,
"loss": 0.7221,
"step": 38
},
{
"epoch": 0.09558823529411764,
"grad_norm": 1.6151103973388672,
"learning_rate": 6.341463414634147e-06,
"loss": 0.6837,
"step": 39
},
{
"epoch": 0.09803921568627451,
"grad_norm": 1.8260527849197388,
"learning_rate": 6.504065040650407e-06,
"loss": 0.7271,
"step": 40
},
{
"epoch": 0.10049019607843138,
"grad_norm": 1.6191492080688477,
"learning_rate": 6.666666666666667e-06,
"loss": 0.7238,
"step": 41
},
{
"epoch": 0.10294117647058823,
"grad_norm": 1.6571029424667358,
"learning_rate": 6.829268292682928e-06,
"loss": 0.6568,
"step": 42
},
{
"epoch": 0.1053921568627451,
"grad_norm": 1.7302550077438354,
"learning_rate": 6.991869918699188e-06,
"loss": 0.6404,
"step": 43
},
{
"epoch": 0.10784313725490197,
"grad_norm": 1.4269192218780518,
"learning_rate": 7.154471544715448e-06,
"loss": 0.7192,
"step": 44
},
{
"epoch": 0.11029411764705882,
"grad_norm": 1.587689757347107,
"learning_rate": 7.317073170731707e-06,
"loss": 0.6527,
"step": 45
},
{
"epoch": 0.11274509803921569,
"grad_norm": 1.831628680229187,
"learning_rate": 7.4796747967479676e-06,
"loss": 0.6424,
"step": 46
},
{
"epoch": 0.11519607843137254,
"grad_norm": 1.6706323623657227,
"learning_rate": 7.64227642276423e-06,
"loss": 0.6599,
"step": 47
},
{
"epoch": 0.11764705882352941,
"grad_norm": 1.8432836532592773,
"learning_rate": 7.804878048780489e-06,
"loss": 0.6388,
"step": 48
},
{
"epoch": 0.12009803921568628,
"grad_norm": 1.3941445350646973,
"learning_rate": 7.967479674796748e-06,
"loss": 0.6246,
"step": 49
},
{
"epoch": 0.12254901960784313,
"grad_norm": 1.857008934020996,
"learning_rate": 8.130081300813009e-06,
"loss": 0.6075,
"step": 50
},
{
"epoch": 0.125,
"grad_norm": 1.2007290124893188,
"learning_rate": 8.292682926829268e-06,
"loss": 0.5524,
"step": 51
},
{
"epoch": 0.12745098039215685,
"grad_norm": 1.3946441411972046,
"learning_rate": 8.45528455284553e-06,
"loss": 0.6135,
"step": 52
},
{
"epoch": 0.12990196078431374,
"grad_norm": 1.2084949016571045,
"learning_rate": 8.617886178861789e-06,
"loss": 0.6117,
"step": 53
},
{
"epoch": 0.1323529411764706,
"grad_norm": 1.7342880964279175,
"learning_rate": 8.78048780487805e-06,
"loss": 0.5969,
"step": 54
},
{
"epoch": 0.13480392156862744,
"grad_norm": 1.6315040588378906,
"learning_rate": 8.94308943089431e-06,
"loss": 0.5975,
"step": 55
},
{
"epoch": 0.13725490196078433,
"grad_norm": 1.5566022396087646,
"learning_rate": 9.10569105691057e-06,
"loss": 0.5801,
"step": 56
},
{
"epoch": 0.13970588235294118,
"grad_norm": 1.1815757751464844,
"learning_rate": 9.268292682926831e-06,
"loss": 0.5566,
"step": 57
},
{
"epoch": 0.14215686274509803,
"grad_norm": 1.8545993566513062,
"learning_rate": 9.43089430894309e-06,
"loss": 0.5655,
"step": 58
},
{
"epoch": 0.14460784313725492,
"grad_norm": 1.3835339546203613,
"learning_rate": 9.59349593495935e-06,
"loss": 0.5761,
"step": 59
},
{
"epoch": 0.14705882352941177,
"grad_norm": 1.2994441986083984,
"learning_rate": 9.756097560975611e-06,
"loss": 0.5714,
"step": 60
},
{
"epoch": 0.14950980392156862,
"grad_norm": 1.9451756477355957,
"learning_rate": 9.91869918699187e-06,
"loss": 0.6185,
"step": 61
},
{
"epoch": 0.15196078431372548,
"grad_norm": 1.5378062725067139,
"learning_rate": 1.008130081300813e-05,
"loss": 0.5713,
"step": 62
},
{
"epoch": 0.15441176470588236,
"grad_norm": 1.2995511293411255,
"learning_rate": 1.024390243902439e-05,
"loss": 0.5165,
"step": 63
},
{
"epoch": 0.1568627450980392,
"grad_norm": 1.5652039051055908,
"learning_rate": 1.0406504065040652e-05,
"loss": 0.5818,
"step": 64
},
{
"epoch": 0.15931372549019607,
"grad_norm": 1.6445112228393555,
"learning_rate": 1.0569105691056911e-05,
"loss": 0.511,
"step": 65
},
{
"epoch": 0.16176470588235295,
"grad_norm": 1.1132397651672363,
"learning_rate": 1.0731707317073172e-05,
"loss": 0.5649,
"step": 66
},
{
"epoch": 0.1642156862745098,
"grad_norm": 1.4722031354904175,
"learning_rate": 1.0894308943089431e-05,
"loss": 0.507,
"step": 67
},
{
"epoch": 0.16666666666666666,
"grad_norm": 1.7021046876907349,
"learning_rate": 1.1056910569105692e-05,
"loss": 0.5301,
"step": 68
},
{
"epoch": 0.16911764705882354,
"grad_norm": 1.284529685974121,
"learning_rate": 1.1219512195121953e-05,
"loss": 0.5333,
"step": 69
},
{
"epoch": 0.1715686274509804,
"grad_norm": 1.3712999820709229,
"learning_rate": 1.1382113821138213e-05,
"loss": 0.5234,
"step": 70
},
{
"epoch": 0.17401960784313725,
"grad_norm": 1.1805524826049805,
"learning_rate": 1.1544715447154474e-05,
"loss": 0.5128,
"step": 71
},
{
"epoch": 0.17647058823529413,
"grad_norm": 1.3389873504638672,
"learning_rate": 1.1707317073170731e-05,
"loss": 0.4838,
"step": 72
},
{
"epoch": 0.17892156862745098,
"grad_norm": 1.4768121242523193,
"learning_rate": 1.1869918699186992e-05,
"loss": 0.4878,
"step": 73
},
{
"epoch": 0.18137254901960784,
"grad_norm": 2.1929194927215576,
"learning_rate": 1.2032520325203254e-05,
"loss": 0.5284,
"step": 74
},
{
"epoch": 0.18382352941176472,
"grad_norm": 2.1756324768066406,
"learning_rate": 1.2195121951219513e-05,
"loss": 0.5186,
"step": 75
},
{
"epoch": 0.18627450980392157,
"grad_norm": 1.4016366004943848,
"learning_rate": 1.2357723577235774e-05,
"loss": 0.5108,
"step": 76
},
{
"epoch": 0.18872549019607843,
"grad_norm": 1.7765308618545532,
"learning_rate": 1.2520325203252033e-05,
"loss": 0.5096,
"step": 77
},
{
"epoch": 0.19117647058823528,
"grad_norm": 1.1144344806671143,
"learning_rate": 1.2682926829268294e-05,
"loss": 0.5113,
"step": 78
},
{
"epoch": 0.19362745098039216,
"grad_norm": 1.3912361860275269,
"learning_rate": 1.2845528455284555e-05,
"loss": 0.5188,
"step": 79
},
{
"epoch": 0.19607843137254902,
"grad_norm": 1.6264848709106445,
"learning_rate": 1.3008130081300815e-05,
"loss": 0.4925,
"step": 80
},
{
"epoch": 0.19852941176470587,
"grad_norm": 1.100017786026001,
"learning_rate": 1.3170731707317076e-05,
"loss": 0.4863,
"step": 81
},
{
"epoch": 0.20098039215686275,
"grad_norm": 1.1988650560379028,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.4959,
"step": 82
},
{
"epoch": 0.2034313725490196,
"grad_norm": 1.7220489978790283,
"learning_rate": 1.3495934959349594e-05,
"loss": 0.4807,
"step": 83
},
{
"epoch": 0.20588235294117646,
"grad_norm": 1.1386171579360962,
"learning_rate": 1.3658536585365855e-05,
"loss": 0.4547,
"step": 84
},
{
"epoch": 0.20833333333333334,
"grad_norm": 1.3636847734451294,
"learning_rate": 1.3821138211382115e-05,
"loss": 0.4414,
"step": 85
},
{
"epoch": 0.2107843137254902,
"grad_norm": 2.023822546005249,
"learning_rate": 1.3983739837398376e-05,
"loss": 0.5203,
"step": 86
},
{
"epoch": 0.21323529411764705,
"grad_norm": 0.996777355670929,
"learning_rate": 1.4146341463414635e-05,
"loss": 0.4957,
"step": 87
},
{
"epoch": 0.21568627450980393,
"grad_norm": 0.9686525464057922,
"learning_rate": 1.4308943089430896e-05,
"loss": 0.4806,
"step": 88
},
{
"epoch": 0.2181372549019608,
"grad_norm": 0.9195975661277771,
"learning_rate": 1.4471544715447157e-05,
"loss": 0.5033,
"step": 89
},
{
"epoch": 0.22058823529411764,
"grad_norm": 1.043886423110962,
"learning_rate": 1.4634146341463415e-05,
"loss": 0.4426,
"step": 90
},
{
"epoch": 0.22303921568627452,
"grad_norm": 1.3796557188034058,
"learning_rate": 1.4796747967479676e-05,
"loss": 0.4708,
"step": 91
},
{
"epoch": 0.22549019607843138,
"grad_norm": 0.9486726522445679,
"learning_rate": 1.4959349593495935e-05,
"loss": 0.489,
"step": 92
},
{
"epoch": 0.22794117647058823,
"grad_norm": 1.1943005323410034,
"learning_rate": 1.5121951219512196e-05,
"loss": 0.5021,
"step": 93
},
{
"epoch": 0.23039215686274508,
"grad_norm": 0.9456331729888916,
"learning_rate": 1.528455284552846e-05,
"loss": 0.4934,
"step": 94
},
{
"epoch": 0.23284313725490197,
"grad_norm": 1.033549189567566,
"learning_rate": 1.5447154471544717e-05,
"loss": 0.4828,
"step": 95
},
{
"epoch": 0.23529411764705882,
"grad_norm": 1.384258508682251,
"learning_rate": 1.5609756097560978e-05,
"loss": 0.4426,
"step": 96
},
{
"epoch": 0.23774509803921567,
"grad_norm": 1.076845645904541,
"learning_rate": 1.5772357723577235e-05,
"loss": 0.4833,
"step": 97
},
{
"epoch": 0.24019607843137256,
"grad_norm": 0.9619578719139099,
"learning_rate": 1.5934959349593496e-05,
"loss": 0.4451,
"step": 98
},
{
"epoch": 0.2426470588235294,
"grad_norm": 0.9403388500213623,
"learning_rate": 1.6097560975609757e-05,
"loss": 0.4749,
"step": 99
},
{
"epoch": 0.24509803921568626,
"grad_norm": 1.1576982736587524,
"learning_rate": 1.6260162601626018e-05,
"loss": 0.4588,
"step": 100
},
{
"epoch": 0.24754901960784315,
"grad_norm": 1.0392265319824219,
"learning_rate": 1.642276422764228e-05,
"loss": 0.4319,
"step": 101
},
{
"epoch": 0.25,
"grad_norm": 1.1822607517242432,
"learning_rate": 1.6585365853658537e-05,
"loss": 0.4737,
"step": 102
},
{
"epoch": 0.25245098039215685,
"grad_norm": 1.0691688060760498,
"learning_rate": 1.6747967479674798e-05,
"loss": 0.4947,
"step": 103
},
{
"epoch": 0.2549019607843137,
"grad_norm": 1.3543163537979126,
"learning_rate": 1.691056910569106e-05,
"loss": 0.4284,
"step": 104
},
{
"epoch": 0.25735294117647056,
"grad_norm": 1.4430104494094849,
"learning_rate": 1.7073170731707317e-05,
"loss": 0.4764,
"step": 105
},
{
"epoch": 0.25980392156862747,
"grad_norm": 1.0300085544586182,
"learning_rate": 1.7235772357723578e-05,
"loss": 0.4622,
"step": 106
},
{
"epoch": 0.2622549019607843,
"grad_norm": 1.13246750831604,
"learning_rate": 1.739837398373984e-05,
"loss": 0.4598,
"step": 107
},
{
"epoch": 0.2647058823529412,
"grad_norm": 1.5132665634155273,
"learning_rate": 1.75609756097561e-05,
"loss": 0.4409,
"step": 108
},
{
"epoch": 0.26715686274509803,
"grad_norm": 0.9958670139312744,
"learning_rate": 1.772357723577236e-05,
"loss": 0.4636,
"step": 109
},
{
"epoch": 0.2696078431372549,
"grad_norm": 0.9780481457710266,
"learning_rate": 1.788617886178862e-05,
"loss": 0.4464,
"step": 110
},
{
"epoch": 0.27205882352941174,
"grad_norm": 1.0843981504440308,
"learning_rate": 1.804878048780488e-05,
"loss": 0.4683,
"step": 111
},
{
"epoch": 0.27450980392156865,
"grad_norm": 1.0033676624298096,
"learning_rate": 1.821138211382114e-05,
"loss": 0.4595,
"step": 112
},
{
"epoch": 0.2769607843137255,
"grad_norm": 1.2065788507461548,
"learning_rate": 1.83739837398374e-05,
"loss": 0.4776,
"step": 113
},
{
"epoch": 0.27941176470588236,
"grad_norm": 1.060770034790039,
"learning_rate": 1.8536585365853663e-05,
"loss": 0.4294,
"step": 114
},
{
"epoch": 0.2818627450980392,
"grad_norm": 1.170822262763977,
"learning_rate": 1.869918699186992e-05,
"loss": 0.4647,
"step": 115
},
{
"epoch": 0.28431372549019607,
"grad_norm": 2.872832775115967,
"learning_rate": 1.886178861788618e-05,
"loss": 0.4723,
"step": 116
},
{
"epoch": 0.2867647058823529,
"grad_norm": 1.2136062383651733,
"learning_rate": 1.902439024390244e-05,
"loss": 0.4763,
"step": 117
},
{
"epoch": 0.28921568627450983,
"grad_norm": 1.0381885766983032,
"learning_rate": 1.91869918699187e-05,
"loss": 0.4596,
"step": 118
},
{
"epoch": 0.2916666666666667,
"grad_norm": 0.9407505393028259,
"learning_rate": 1.934959349593496e-05,
"loss": 0.4691,
"step": 119
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.9868777990341187,
"learning_rate": 1.9512195121951222e-05,
"loss": 0.4609,
"step": 120
},
{
"epoch": 0.2965686274509804,
"grad_norm": 0.880571722984314,
"learning_rate": 1.9674796747967483e-05,
"loss": 0.4565,
"step": 121
},
{
"epoch": 0.29901960784313725,
"grad_norm": 0.9205477833747864,
"learning_rate": 1.983739837398374e-05,
"loss": 0.4461,
"step": 122
},
{
"epoch": 0.3014705882352941,
"grad_norm": 0.9311808347702026,
"learning_rate": 2e-05,
"loss": 0.4674,
"step": 123
},
{
"epoch": 0.30392156862745096,
"grad_norm": 1.1109329462051392,
"learning_rate": 1.999999684835258e-05,
"loss": 0.4332,
"step": 124
},
{
"epoch": 0.30637254901960786,
"grad_norm": 0.8790717124938965,
"learning_rate": 1.9999987393412305e-05,
"loss": 0.4399,
"step": 125
},
{
"epoch": 0.3088235294117647,
"grad_norm": 1.1366180181503296,
"learning_rate": 1.9999971635185137e-05,
"loss": 0.4355,
"step": 126
},
{
"epoch": 0.3112745098039216,
"grad_norm": 1.0026062726974487,
"learning_rate": 1.9999949573681004e-05,
"loss": 0.4388,
"step": 127
},
{
"epoch": 0.3137254901960784,
"grad_norm": 0.9259772300720215,
"learning_rate": 1.9999921208913814e-05,
"loss": 0.4388,
"step": 128
},
{
"epoch": 0.3161764705882353,
"grad_norm": 0.9517544507980347,
"learning_rate": 1.999988654090145e-05,
"loss": 0.4649,
"step": 129
},
{
"epoch": 0.31862745098039214,
"grad_norm": 0.8791248798370361,
"learning_rate": 1.9999845569665762e-05,
"loss": 0.4325,
"step": 130
},
{
"epoch": 0.32107843137254904,
"grad_norm": 1.01931631565094,
"learning_rate": 1.999979829523257e-05,
"loss": 0.4442,
"step": 131
},
{
"epoch": 0.3235294117647059,
"grad_norm": 1.187097430229187,
"learning_rate": 1.999974471763168e-05,
"loss": 0.4701,
"step": 132
},
{
"epoch": 0.32598039215686275,
"grad_norm": 1.64523184299469,
"learning_rate": 1.9999684836896864e-05,
"loss": 0.4652,
"step": 133
},
{
"epoch": 0.3284313725490196,
"grad_norm": 1.2207841873168945,
"learning_rate": 1.9999618653065858e-05,
"loss": 0.4396,
"step": 134
},
{
"epoch": 0.33088235294117646,
"grad_norm": 1.1219441890716553,
"learning_rate": 1.9999546166180386e-05,
"loss": 0.4413,
"step": 135
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.8931692242622375,
"learning_rate": 1.999946737628614e-05,
"loss": 0.4293,
"step": 136
},
{
"epoch": 0.33578431372549017,
"grad_norm": 1.0054982900619507,
"learning_rate": 1.999938228343278e-05,
"loss": 0.4705,
"step": 137
},
{
"epoch": 0.3382352941176471,
"grad_norm": 0.9084284901618958,
"learning_rate": 1.9999290887673943e-05,
"loss": 0.4383,
"step": 138
},
{
"epoch": 0.34068627450980393,
"grad_norm": 1.043857216835022,
"learning_rate": 1.999919318906724e-05,
"loss": 0.4193,
"step": 139
},
{
"epoch": 0.3431372549019608,
"grad_norm": 1.084401249885559,
"learning_rate": 1.9999089187674254e-05,
"loss": 0.4259,
"step": 140
},
{
"epoch": 0.34558823529411764,
"grad_norm": 2.9998996257781982,
"learning_rate": 1.9998978883560534e-05,
"loss": 0.4152,
"step": 141
},
{
"epoch": 0.3480392156862745,
"grad_norm": 0.9272080659866333,
"learning_rate": 1.9998862276795617e-05,
"loss": 0.411,
"step": 142
},
{
"epoch": 0.35049019607843135,
"grad_norm": 0.9321224093437195,
"learning_rate": 1.9998739367452998e-05,
"loss": 0.424,
"step": 143
},
{
"epoch": 0.35294117647058826,
"grad_norm": 0.9029969573020935,
"learning_rate": 1.9998610155610153e-05,
"loss": 0.4271,
"step": 144
},
{
"epoch": 0.3553921568627451,
"grad_norm": 0.8797354698181152,
"learning_rate": 1.9998474641348523e-05,
"loss": 0.3952,
"step": 145
},
{
"epoch": 0.35784313725490197,
"grad_norm": 1.0403674840927124,
"learning_rate": 1.9998332824753533e-05,
"loss": 0.4488,
"step": 146
},
{
"epoch": 0.3602941176470588,
"grad_norm": 2.9800989627838135,
"learning_rate": 1.999818470591457e-05,
"loss": 0.4339,
"step": 147
},
{
"epoch": 0.3627450980392157,
"grad_norm": 1.4683583974838257,
"learning_rate": 1.9998030284925e-05,
"loss": 0.4336,
"step": 148
},
{
"epoch": 0.36519607843137253,
"grad_norm": 0.9492887854576111,
"learning_rate": 1.999786956188216e-05,
"loss": 0.4242,
"step": 149
},
{
"epoch": 0.36764705882352944,
"grad_norm": 0.9128084182739258,
"learning_rate": 1.9997702536887353e-05,
"loss": 0.4574,
"step": 150
},
{
"epoch": 0.3700980392156863,
"grad_norm": 0.8592134118080139,
"learning_rate": 1.9997529210045864e-05,
"loss": 0.4445,
"step": 151
},
{
"epoch": 0.37254901960784315,
"grad_norm": 1.0512664318084717,
"learning_rate": 1.999734958146695e-05,
"loss": 0.4384,
"step": 152
},
{
"epoch": 0.375,
"grad_norm": 1.1068940162658691,
"learning_rate": 1.999716365126383e-05,
"loss": 0.436,
"step": 153
},
{
"epoch": 0.37745098039215685,
"grad_norm": 0.9419934749603271,
"learning_rate": 1.99969714195537e-05,
"loss": 0.4461,
"step": 154
},
{
"epoch": 0.3799019607843137,
"grad_norm": 0.8165314793586731,
"learning_rate": 1.9996772886457735e-05,
"loss": 0.4081,
"step": 155
},
{
"epoch": 0.38235294117647056,
"grad_norm": 1.0197334289550781,
"learning_rate": 1.999656805210107e-05,
"loss": 0.4249,
"step": 156
},
{
"epoch": 0.38480392156862747,
"grad_norm": 1.0298762321472168,
"learning_rate": 1.9996356916612825e-05,
"loss": 0.4528,
"step": 157
},
{
"epoch": 0.3872549019607843,
"grad_norm": 0.8545371294021606,
"learning_rate": 1.999613948012608e-05,
"loss": 0.4546,
"step": 158
},
{
"epoch": 0.3897058823529412,
"grad_norm": 0.8098302483558655,
"learning_rate": 1.9995915742777896e-05,
"loss": 0.4329,
"step": 159
},
{
"epoch": 0.39215686274509803,
"grad_norm": 0.9707794189453125,
"learning_rate": 1.9995685704709296e-05,
"loss": 0.4082,
"step": 160
},
{
"epoch": 0.3946078431372549,
"grad_norm": 0.9233653545379639,
"learning_rate": 1.9995449366065284e-05,
"loss": 0.4506,
"step": 161
},
{
"epoch": 0.39705882352941174,
"grad_norm": 0.9012821316719055,
"learning_rate": 1.9995206726994827e-05,
"loss": 0.4291,
"step": 162
},
{
"epoch": 0.39950980392156865,
"grad_norm": 0.9143744111061096,
"learning_rate": 1.9994957787650872e-05,
"loss": 0.444,
"step": 163
},
{
"epoch": 0.4019607843137255,
"grad_norm": 1.4861371517181396,
"learning_rate": 1.999470254819033e-05,
"loss": 0.4356,
"step": 164
},
{
"epoch": 0.40441176470588236,
"grad_norm": 0.8851897120475769,
"learning_rate": 1.9994441008774087e-05,
"loss": 0.4364,
"step": 165
},
{
"epoch": 0.4068627450980392,
"grad_norm": 0.8898401856422424,
"learning_rate": 1.9994173169567e-05,
"loss": 0.4328,
"step": 166
},
{
"epoch": 0.40931372549019607,
"grad_norm": 1.0515555143356323,
"learning_rate": 1.999389903073789e-05,
"loss": 0.4394,
"step": 167
},
{
"epoch": 0.4117647058823529,
"grad_norm": 1.07290780544281,
"learning_rate": 1.9993618592459567e-05,
"loss": 0.4264,
"step": 168
},
{
"epoch": 0.41421568627450983,
"grad_norm": 1.7121777534484863,
"learning_rate": 1.9993331854908787e-05,
"loss": 0.3942,
"step": 169
},
{
"epoch": 0.4166666666666667,
"grad_norm": 0.8365022540092468,
"learning_rate": 1.9993038818266296e-05,
"loss": 0.4357,
"step": 170
},
{
"epoch": 0.41911764705882354,
"grad_norm": 0.9449752569198608,
"learning_rate": 1.99927394827168e-05,
"loss": 0.4529,
"step": 171
},
{
"epoch": 0.4215686274509804,
"grad_norm": 0.8139907121658325,
"learning_rate": 1.9992433848448982e-05,
"loss": 0.4358,
"step": 172
},
{
"epoch": 0.42401960784313725,
"grad_norm": 0.9037125110626221,
"learning_rate": 1.9992121915655493e-05,
"loss": 0.4476,
"step": 173
},
{
"epoch": 0.4264705882352941,
"grad_norm": 2.4106829166412354,
"learning_rate": 1.999180368453295e-05,
"loss": 0.4357,
"step": 174
},
{
"epoch": 0.42892156862745096,
"grad_norm": 0.9304994940757751,
"learning_rate": 1.9991479155281945e-05,
"loss": 0.4341,
"step": 175
},
{
"epoch": 0.43137254901960786,
"grad_norm": 0.8943951725959778,
"learning_rate": 1.999114832810704e-05,
"loss": 0.4322,
"step": 176
},
{
"epoch": 0.4338235294117647,
"grad_norm": 0.8317229151725769,
"learning_rate": 1.9990811203216762e-05,
"loss": 0.4303,
"step": 177
},
{
"epoch": 0.4362745098039216,
"grad_norm": 1.007298469543457,
"learning_rate": 1.999046778082361e-05,
"loss": 0.4246,
"step": 178
},
{
"epoch": 0.4387254901960784,
"grad_norm": 0.9868156909942627,
"learning_rate": 1.999011806114406e-05,
"loss": 0.458,
"step": 179
},
{
"epoch": 0.4411764705882353,
"grad_norm": 0.9554439187049866,
"learning_rate": 1.9989762044398545e-05,
"loss": 0.4403,
"step": 180
},
{
"epoch": 0.44362745098039214,
"grad_norm": 1.015590786933899,
"learning_rate": 1.998939973081147e-05,
"loss": 0.4405,
"step": 181
},
{
"epoch": 0.44607843137254904,
"grad_norm": 1.051297664642334,
"learning_rate": 1.9989031120611223e-05,
"loss": 0.4412,
"step": 182
},
{
"epoch": 0.4485294117647059,
"grad_norm": 0.8119022846221924,
"learning_rate": 1.9988656214030135e-05,
"loss": 0.4457,
"step": 183
},
{
"epoch": 0.45098039215686275,
"grad_norm": 0.8970580101013184,
"learning_rate": 1.9988275011304534e-05,
"loss": 0.4356,
"step": 184
},
{
"epoch": 0.4534313725490196,
"grad_norm": 0.9368757009506226,
"learning_rate": 1.9987887512674694e-05,
"loss": 0.4312,
"step": 185
},
{
"epoch": 0.45588235294117646,
"grad_norm": 0.8388264179229736,
"learning_rate": 1.9987493718384877e-05,
"loss": 0.4429,
"step": 186
},
{
"epoch": 0.4583333333333333,
"grad_norm": 0.9641779661178589,
"learning_rate": 1.998709362868329e-05,
"loss": 0.4863,
"step": 187
},
{
"epoch": 0.46078431372549017,
"grad_norm": 0.7595623731613159,
"learning_rate": 1.9986687243822134e-05,
"loss": 0.418,
"step": 188
},
{
"epoch": 0.4632352941176471,
"grad_norm": 0.9058425426483154,
"learning_rate": 1.9986274564057553e-05,
"loss": 0.4068,
"step": 189
},
{
"epoch": 0.46568627450980393,
"grad_norm": 0.8350051045417786,
"learning_rate": 1.9985855589649682e-05,
"loss": 0.4255,
"step": 190
},
{
"epoch": 0.4681372549019608,
"grad_norm": 0.8191215395927429,
"learning_rate": 1.9985430320862608e-05,
"loss": 0.3908,
"step": 191
},
{
"epoch": 0.47058823529411764,
"grad_norm": 1.0077133178710938,
"learning_rate": 1.998499875796439e-05,
"loss": 0.4476,
"step": 192
},
{
"epoch": 0.4730392156862745,
"grad_norm": 0.9197707772254944,
"learning_rate": 1.998456090122706e-05,
"loss": 0.4004,
"step": 193
},
{
"epoch": 0.47549019607843135,
"grad_norm": 0.718039870262146,
"learning_rate": 1.9984116750926605e-05,
"loss": 0.4649,
"step": 194
},
{
"epoch": 0.47794117647058826,
"grad_norm": 0.8088057637214661,
"learning_rate": 1.9983666307342984e-05,
"loss": 0.4413,
"step": 195
},
{
"epoch": 0.4803921568627451,
"grad_norm": 0.8088821172714233,
"learning_rate": 1.9983209570760138e-05,
"loss": 0.4283,
"step": 196
},
{
"epoch": 0.48284313725490197,
"grad_norm": 1.4009703397750854,
"learning_rate": 1.998274654146595e-05,
"loss": 0.4274,
"step": 197
},
{
"epoch": 0.4852941176470588,
"grad_norm": 0.7745358943939209,
"learning_rate": 1.998227721975228e-05,
"loss": 0.4104,
"step": 198
},
{
"epoch": 0.4877450980392157,
"grad_norm": 0.7973339557647705,
"learning_rate": 1.9981801605914968e-05,
"loss": 0.409,
"step": 199
},
{
"epoch": 0.49019607843137253,
"grad_norm": 0.7767025232315063,
"learning_rate": 1.9981319700253793e-05,
"loss": 0.4328,
"step": 200
},
{
"epoch": 0.49264705882352944,
"grad_norm": 0.8516882658004761,
"learning_rate": 1.9980831503072524e-05,
"loss": 0.4154,
"step": 201
},
{
"epoch": 0.4950980392156863,
"grad_norm": 0.7395961284637451,
"learning_rate": 1.9980337014678878e-05,
"loss": 0.3956,
"step": 202
},
{
"epoch": 0.49754901960784315,
"grad_norm": 1.2353953123092651,
"learning_rate": 1.9979836235384554e-05,
"loss": 0.4582,
"step": 203
},
{
"epoch": 0.5,
"grad_norm": 0.7706623077392578,
"learning_rate": 1.9979329165505206e-05,
"loss": 0.4457,
"step": 204
},
{
"epoch": 0.5024509803921569,
"grad_norm": 0.717491865158081,
"learning_rate": 1.997881580536045e-05,
"loss": 0.3907,
"step": 205
},
{
"epoch": 0.5049019607843137,
"grad_norm": 0.8740329742431641,
"learning_rate": 1.9978296155273876e-05,
"loss": 0.4175,
"step": 206
},
{
"epoch": 0.5073529411764706,
"grad_norm": 0.7538326978683472,
"learning_rate": 1.9977770215573034e-05,
"loss": 0.394,
"step": 207
},
{
"epoch": 0.5098039215686274,
"grad_norm": 0.8653766512870789,
"learning_rate": 1.997723798658944e-05,
"loss": 0.4547,
"step": 208
},
{
"epoch": 0.5122549019607843,
"grad_norm": 0.7366246581077576,
"learning_rate": 1.9976699468658568e-05,
"loss": 0.4339,
"step": 209
},
{
"epoch": 0.5147058823529411,
"grad_norm": 0.7467411160469055,
"learning_rate": 1.9976154662119872e-05,
"loss": 0.4176,
"step": 210
},
{
"epoch": 0.5171568627450981,
"grad_norm": 0.7021523714065552,
"learning_rate": 1.9975603567316756e-05,
"loss": 0.4236,
"step": 211
},
{
"epoch": 0.5196078431372549,
"grad_norm": 0.9027709364891052,
"learning_rate": 1.9975046184596584e-05,
"loss": 0.4285,
"step": 212
},
{
"epoch": 0.5220588235294118,
"grad_norm": 0.8142911791801453,
"learning_rate": 1.9974482514310698e-05,
"loss": 0.4426,
"step": 213
},
{
"epoch": 0.5245098039215687,
"grad_norm": 0.7636563181877136,
"learning_rate": 1.9973912556814396e-05,
"loss": 0.3966,
"step": 214
},
{
"epoch": 0.5269607843137255,
"grad_norm": 0.8787136077880859,
"learning_rate": 1.9973336312466937e-05,
"loss": 0.4153,
"step": 215
},
{
"epoch": 0.5294117647058824,
"grad_norm": 0.767151951789856,
"learning_rate": 1.9972753781631546e-05,
"loss": 0.4318,
"step": 216
},
{
"epoch": 0.5318627450980392,
"grad_norm": 0.8044896125793457,
"learning_rate": 1.9972164964675405e-05,
"loss": 0.4146,
"step": 217
},
{
"epoch": 0.5343137254901961,
"grad_norm": 0.7757907509803772,
"learning_rate": 1.9971569861969667e-05,
"loss": 0.4303,
"step": 218
},
{
"epoch": 0.5367647058823529,
"grad_norm": 0.8263378739356995,
"learning_rate": 1.9970968473889447e-05,
"loss": 0.4312,
"step": 219
},
{
"epoch": 0.5392156862745098,
"grad_norm": 0.8943036198616028,
"learning_rate": 1.9970360800813807e-05,
"loss": 0.4476,
"step": 220
},
{
"epoch": 0.5416666666666666,
"grad_norm": 1.0750970840454102,
"learning_rate": 1.996974684312579e-05,
"loss": 0.4251,
"step": 221
},
{
"epoch": 0.5441176470588235,
"grad_norm": 1.2690297365188599,
"learning_rate": 1.9969126601212388e-05,
"loss": 0.441,
"step": 222
},
{
"epoch": 0.5465686274509803,
"grad_norm": 0.7188127040863037,
"learning_rate": 1.9968500075464558e-05,
"loss": 0.3844,
"step": 223
},
{
"epoch": 0.5490196078431373,
"grad_norm": 0.8277931809425354,
"learning_rate": 1.9967867266277213e-05,
"loss": 0.4108,
"step": 224
},
{
"epoch": 0.5514705882352942,
"grad_norm": 0.7500224709510803,
"learning_rate": 1.9967228174049246e-05,
"loss": 0.4395,
"step": 225
},
{
"epoch": 0.553921568627451,
"grad_norm": 1.1601848602294922,
"learning_rate": 1.9966582799183477e-05,
"loss": 0.4051,
"step": 226
},
{
"epoch": 0.5563725490196079,
"grad_norm": 0.7330986261367798,
"learning_rate": 1.9965931142086717e-05,
"loss": 0.4269,
"step": 227
},
{
"epoch": 0.5588235294117647,
"grad_norm": 0.8486798405647278,
"learning_rate": 1.9965273203169725e-05,
"loss": 0.42,
"step": 228
},
{
"epoch": 0.5612745098039216,
"grad_norm": 0.6744402050971985,
"learning_rate": 1.9964608982847213e-05,
"loss": 0.4072,
"step": 229
},
{
"epoch": 0.5637254901960784,
"grad_norm": 0.8949413299560547,
"learning_rate": 1.996393848153786e-05,
"loss": 0.4481,
"step": 230
},
{
"epoch": 0.5661764705882353,
"grad_norm": 0.8391293287277222,
"learning_rate": 1.9963261699664304e-05,
"loss": 0.4146,
"step": 231
},
{
"epoch": 0.5686274509803921,
"grad_norm": 0.7431681752204895,
"learning_rate": 1.9962578637653142e-05,
"loss": 0.4032,
"step": 232
},
{
"epoch": 0.571078431372549,
"grad_norm": 0.6789780855178833,
"learning_rate": 1.9961889295934927e-05,
"loss": 0.3989,
"step": 233
},
{
"epoch": 0.5735294117647058,
"grad_norm": 0.7854921221733093,
"learning_rate": 1.996119367494417e-05,
"loss": 0.4114,
"step": 234
},
{
"epoch": 0.5759803921568627,
"grad_norm": 0.6768577098846436,
"learning_rate": 1.9960491775119344e-05,
"loss": 0.4149,
"step": 235
},
{
"epoch": 0.5784313725490197,
"grad_norm": 0.7455466985702515,
"learning_rate": 1.9959783596902876e-05,
"loss": 0.4192,
"step": 236
},
{
"epoch": 0.5808823529411765,
"grad_norm": 1.4012922048568726,
"learning_rate": 1.995906914074115e-05,
"loss": 0.4127,
"step": 237
},
{
"epoch": 0.5833333333333334,
"grad_norm": 0.7336264252662659,
"learning_rate": 1.995834840708451e-05,
"loss": 0.428,
"step": 238
},
{
"epoch": 0.5857843137254902,
"grad_norm": 0.7108700275421143,
"learning_rate": 1.9957621396387256e-05,
"loss": 0.4359,
"step": 239
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.7039199471473694,
"learning_rate": 1.9956888109107645e-05,
"loss": 0.4166,
"step": 240
},
{
"epoch": 0.5906862745098039,
"grad_norm": 0.8049781918525696,
"learning_rate": 1.9956148545707886e-05,
"loss": 0.4021,
"step": 241
},
{
"epoch": 0.5931372549019608,
"grad_norm": 0.67364901304245,
"learning_rate": 1.9955402706654154e-05,
"loss": 0.4058,
"step": 242
},
{
"epoch": 0.5955882352941176,
"grad_norm": 0.7115527987480164,
"learning_rate": 1.995465059241657e-05,
"loss": 0.4267,
"step": 243
},
{
"epoch": 0.5980392156862745,
"grad_norm": 0.6766150593757629,
"learning_rate": 1.9953892203469213e-05,
"loss": 0.4281,
"step": 244
},
{
"epoch": 0.6004901960784313,
"grad_norm": 0.6857250928878784,
"learning_rate": 1.9953127540290117e-05,
"loss": 0.3835,
"step": 245
},
{
"epoch": 0.6029411764705882,
"grad_norm": 0.7766714692115784,
"learning_rate": 1.9952356603361272e-05,
"loss": 0.4161,
"step": 246
},
{
"epoch": 0.6053921568627451,
"grad_norm": 0.780937910079956,
"learning_rate": 1.9951579393168625e-05,
"loss": 0.3901,
"step": 247
},
{
"epoch": 0.6078431372549019,
"grad_norm": 0.7650203704833984,
"learning_rate": 1.995079591020207e-05,
"loss": 0.4153,
"step": 248
},
{
"epoch": 0.6102941176470589,
"grad_norm": 0.6831273436546326,
"learning_rate": 1.9950006154955466e-05,
"loss": 0.3936,
"step": 249
},
{
"epoch": 0.6127450980392157,
"grad_norm": 0.6588292121887207,
"learning_rate": 1.9949210127926616e-05,
"loss": 0.4156,
"step": 250
},
{
"epoch": 0.6151960784313726,
"grad_norm": 0.9314211010932922,
"learning_rate": 1.9948407829617275e-05,
"loss": 0.4509,
"step": 251
},
{
"epoch": 0.6176470588235294,
"grad_norm": 0.7328870296478271,
"learning_rate": 1.994759926053316e-05,
"loss": 0.4531,
"step": 252
},
{
"epoch": 0.6200980392156863,
"grad_norm": 0.7139073610305786,
"learning_rate": 1.9946784421183934e-05,
"loss": 0.3963,
"step": 253
},
{
"epoch": 0.6225490196078431,
"grad_norm": 0.8202491998672485,
"learning_rate": 1.994596331208322e-05,
"loss": 0.4173,
"step": 254
},
{
"epoch": 0.625,
"grad_norm": 0.7805032730102539,
"learning_rate": 1.9945135933748578e-05,
"loss": 0.3976,
"step": 255
},
{
"epoch": 0.6274509803921569,
"grad_norm": 0.7403963804244995,
"learning_rate": 1.9944302286701534e-05,
"loss": 0.4075,
"step": 256
},
{
"epoch": 0.6299019607843137,
"grad_norm": 0.7291815280914307,
"learning_rate": 1.994346237146756e-05,
"loss": 0.4171,
"step": 257
},
{
"epoch": 0.6323529411764706,
"grad_norm": 0.8638690114021301,
"learning_rate": 1.9942616188576078e-05,
"loss": 0.4206,
"step": 258
},
{
"epoch": 0.6348039215686274,
"grad_norm": 0.8649875521659851,
"learning_rate": 1.9941763738560463e-05,
"loss": 0.4462,
"step": 259
},
{
"epoch": 0.6372549019607843,
"grad_norm": 0.676437497138977,
"learning_rate": 1.9940905021958043e-05,
"loss": 0.3943,
"step": 260
},
{
"epoch": 0.6397058823529411,
"grad_norm": 0.7884188890457153,
"learning_rate": 1.9940040039310086e-05,
"loss": 0.4385,
"step": 261
},
{
"epoch": 0.6421568627450981,
"grad_norm": 0.8990888595581055,
"learning_rate": 1.9939168791161817e-05,
"loss": 0.4427,
"step": 262
},
{
"epoch": 0.6446078431372549,
"grad_norm": 0.7080796360969543,
"learning_rate": 1.993829127806241e-05,
"loss": 0.4381,
"step": 263
},
{
"epoch": 0.6470588235294118,
"grad_norm": 0.8058847188949585,
"learning_rate": 1.9937407500564995e-05,
"loss": 0.4331,
"step": 264
},
{
"epoch": 0.6495098039215687,
"grad_norm": 1.3826943635940552,
"learning_rate": 1.993651745922663e-05,
"loss": 0.41,
"step": 265
},
{
"epoch": 0.6519607843137255,
"grad_norm": 0.9948577284812927,
"learning_rate": 1.9935621154608348e-05,
"loss": 0.4336,
"step": 266
},
{
"epoch": 0.6544117647058824,
"grad_norm": 0.8192009925842285,
"learning_rate": 1.9934718587275105e-05,
"loss": 0.4235,
"step": 267
},
{
"epoch": 0.6568627450980392,
"grad_norm": 1.1083520650863647,
"learning_rate": 1.9933809757795817e-05,
"loss": 0.4252,
"step": 268
},
{
"epoch": 0.6593137254901961,
"grad_norm": 0.7908585667610168,
"learning_rate": 1.9932894666743357e-05,
"loss": 0.4126,
"step": 269
},
{
"epoch": 0.6617647058823529,
"grad_norm": 0.7882091999053955,
"learning_rate": 1.993197331469452e-05,
"loss": 0.4348,
"step": 270
},
{
"epoch": 0.6642156862745098,
"grad_norm": 0.7854613065719604,
"learning_rate": 1.993104570223007e-05,
"loss": 0.4178,
"step": 271
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.816422164440155,
"learning_rate": 1.99301118299347e-05,
"loss": 0.431,
"step": 272
},
{
"epoch": 0.6691176470588235,
"grad_norm": 0.8504275679588318,
"learning_rate": 1.992917169839707e-05,
"loss": 0.3891,
"step": 273
},
{
"epoch": 0.6715686274509803,
"grad_norm": 0.8654723763465881,
"learning_rate": 1.9928225308209762e-05,
"loss": 0.4024,
"step": 274
},
{
"epoch": 0.6740196078431373,
"grad_norm": 0.688203752040863,
"learning_rate": 1.9927272659969317e-05,
"loss": 0.3961,
"step": 275
},
{
"epoch": 0.6764705882352942,
"grad_norm": 0.7712222933769226,
"learning_rate": 1.9926313754276217e-05,
"loss": 0.4179,
"step": 276
},
{
"epoch": 0.678921568627451,
"grad_norm": 0.8645032048225403,
"learning_rate": 1.9925348591734894e-05,
"loss": 0.4012,
"step": 277
},
{
"epoch": 0.6813725490196079,
"grad_norm": 0.7662283182144165,
"learning_rate": 1.9924377172953708e-05,
"loss": 0.4117,
"step": 278
},
{
"epoch": 0.6838235294117647,
"grad_norm": 0.7253487706184387,
"learning_rate": 1.9923399498544982e-05,
"loss": 0.4166,
"step": 279
},
{
"epoch": 0.6862745098039216,
"grad_norm": 1.120529055595398,
"learning_rate": 1.9922415569124966e-05,
"loss": 0.4333,
"step": 280
},
{
"epoch": 0.6887254901960784,
"grad_norm": 0.7117664813995361,
"learning_rate": 1.9921425385313865e-05,
"loss": 0.424,
"step": 281
},
{
"epoch": 0.6911764705882353,
"grad_norm": 0.7550606727600098,
"learning_rate": 1.9920428947735817e-05,
"loss": 0.43,
"step": 282
},
{
"epoch": 0.6936274509803921,
"grad_norm": 0.7705708146095276,
"learning_rate": 1.9919426257018913e-05,
"loss": 0.3933,
"step": 283
},
{
"epoch": 0.696078431372549,
"grad_norm": 0.6843774914741516,
"learning_rate": 1.991841731379517e-05,
"loss": 0.4144,
"step": 284
},
{
"epoch": 0.6985294117647058,
"grad_norm": 0.7802500128746033,
"learning_rate": 1.9917402118700557e-05,
"loss": 0.4228,
"step": 285
},
{
"epoch": 0.7009803921568627,
"grad_norm": 0.7861772775650024,
"learning_rate": 1.9916380672374984e-05,
"loss": 0.4321,
"step": 286
},
{
"epoch": 0.7034313725490197,
"grad_norm": 0.6811192631721497,
"learning_rate": 1.99153529754623e-05,
"loss": 0.3964,
"step": 287
},
{
"epoch": 0.7058823529411765,
"grad_norm": 0.731429934501648,
"learning_rate": 1.9914319028610285e-05,
"loss": 0.3991,
"step": 288
},
{
"epoch": 0.7083333333333334,
"grad_norm": 0.7709717750549316,
"learning_rate": 1.9913278832470673e-05,
"loss": 0.4371,
"step": 289
},
{
"epoch": 0.7107843137254902,
"grad_norm": 0.7618672251701355,
"learning_rate": 1.991223238769913e-05,
"loss": 0.4095,
"step": 290
},
{
"epoch": 0.7132352941176471,
"grad_norm": 0.8476813435554504,
"learning_rate": 1.991117969495526e-05,
"loss": 0.4338,
"step": 291
},
{
"epoch": 0.7156862745098039,
"grad_norm": 0.8538393974304199,
"learning_rate": 1.9910120754902606e-05,
"loss": 0.4528,
"step": 292
},
{
"epoch": 0.7181372549019608,
"grad_norm": 0.6772223114967346,
"learning_rate": 1.9909055568208647e-05,
"loss": 0.4016,
"step": 293
},
{
"epoch": 0.7205882352941176,
"grad_norm": 0.8460076451301575,
"learning_rate": 1.9907984135544804e-05,
"loss": 0.3989,
"step": 294
},
{
"epoch": 0.7230392156862745,
"grad_norm": 0.8783762454986572,
"learning_rate": 1.9906906457586433e-05,
"loss": 0.422,
"step": 295
},
{
"epoch": 0.7254901960784313,
"grad_norm": 0.8669313788414001,
"learning_rate": 1.9905822535012825e-05,
"loss": 0.4003,
"step": 296
},
{
"epoch": 0.7279411764705882,
"grad_norm": 0.77335125207901,
"learning_rate": 1.9904732368507207e-05,
"loss": 0.4087,
"step": 297
},
{
"epoch": 0.7303921568627451,
"grad_norm": 0.7851046919822693,
"learning_rate": 1.9903635958756745e-05,
"loss": 0.4303,
"step": 298
},
{
"epoch": 0.7328431372549019,
"grad_norm": 0.7382727265357971,
"learning_rate": 1.990253330645254e-05,
"loss": 0.442,
"step": 299
},
{
"epoch": 0.7352941176470589,
"grad_norm": 0.8672908544540405,
"learning_rate": 1.9901424412289626e-05,
"loss": 0.4395,
"step": 300
},
{
"epoch": 0.7377450980392157,
"grad_norm": 0.8030222058296204,
"learning_rate": 1.9900309276966968e-05,
"loss": 0.4261,
"step": 301
},
{
"epoch": 0.7401960784313726,
"grad_norm": 1.077311396598816,
"learning_rate": 1.989918790118747e-05,
"loss": 0.4115,
"step": 302
},
{
"epoch": 0.7426470588235294,
"grad_norm": 0.7743614315986633,
"learning_rate": 1.989806028565797e-05,
"loss": 0.4145,
"step": 303
},
{
"epoch": 0.7450980392156863,
"grad_norm": 0.9641470313072205,
"learning_rate": 1.9896926431089233e-05,
"loss": 0.4138,
"step": 304
},
{
"epoch": 0.7475490196078431,
"grad_norm": 1.1215864419937134,
"learning_rate": 1.9895786338195968e-05,
"loss": 0.4146,
"step": 305
},
{
"epoch": 0.75,
"grad_norm": 0.8197227716445923,
"learning_rate": 1.9894640007696806e-05,
"loss": 0.4365,
"step": 306
},
{
"epoch": 0.7524509803921569,
"grad_norm": 0.7501351237297058,
"learning_rate": 1.9893487440314312e-05,
"loss": 0.3838,
"step": 307
},
{
"epoch": 0.7549019607843137,
"grad_norm": 0.7394661903381348,
"learning_rate": 1.989232863677498e-05,
"loss": 0.4063,
"step": 308
},
{
"epoch": 0.7573529411764706,
"grad_norm": 0.8234416842460632,
"learning_rate": 1.9891163597809246e-05,
"loss": 0.4218,
"step": 309
},
{
"epoch": 0.7598039215686274,
"grad_norm": 0.7823823094367981,
"learning_rate": 1.988999232415146e-05,
"loss": 0.4166,
"step": 310
},
{
"epoch": 0.7622549019607843,
"grad_norm": 0.6892342567443848,
"learning_rate": 1.988881481653992e-05,
"loss": 0.3912,
"step": 311
},
{
"epoch": 0.7647058823529411,
"grad_norm": 0.913314938545227,
"learning_rate": 1.9887631075716835e-05,
"loss": 0.4309,
"step": 312
},
{
"epoch": 0.7671568627450981,
"grad_norm": 0.9958233833312988,
"learning_rate": 1.9886441102428352e-05,
"loss": 0.4191,
"step": 313
},
{
"epoch": 0.7696078431372549,
"grad_norm": 1.1082661151885986,
"learning_rate": 1.988524489742455e-05,
"loss": 0.4231,
"step": 314
},
{
"epoch": 0.7720588235294118,
"grad_norm": 0.7491757869720459,
"learning_rate": 1.9884042461459436e-05,
"loss": 0.4409,
"step": 315
},
{
"epoch": 0.7745098039215687,
"grad_norm": 0.7593130469322205,
"learning_rate": 1.9882833795290932e-05,
"loss": 0.3981,
"step": 316
},
{
"epoch": 0.7769607843137255,
"grad_norm": 0.7963762283325195,
"learning_rate": 1.9881618899680902e-05,
"loss": 0.4117,
"step": 317
},
{
"epoch": 0.7794117647058824,
"grad_norm": 0.7004902362823486,
"learning_rate": 1.9880397775395128e-05,
"loss": 0.408,
"step": 318
},
{
"epoch": 0.7818627450980392,
"grad_norm": 0.826677143573761,
"learning_rate": 1.987917042320332e-05,
"loss": 0.3915,
"step": 319
},
{
"epoch": 0.7843137254901961,
"grad_norm": 1.082105278968811,
"learning_rate": 1.987793684387912e-05,
"loss": 0.3848,
"step": 320
},
{
"epoch": 0.7867647058823529,
"grad_norm": 0.8547869324684143,
"learning_rate": 1.9876697038200082e-05,
"loss": 0.417,
"step": 321
},
{
"epoch": 0.7892156862745098,
"grad_norm": 0.8947910070419312,
"learning_rate": 1.9875451006947695e-05,
"loss": 0.4322,
"step": 322
},
{
"epoch": 0.7916666666666666,
"grad_norm": 1.049882411956787,
"learning_rate": 1.9874198750907367e-05,
"loss": 0.4079,
"step": 323
},
{
"epoch": 0.7941176470588235,
"grad_norm": 1.4788544178009033,
"learning_rate": 1.9872940270868438e-05,
"loss": 0.4224,
"step": 324
},
{
"epoch": 0.7965686274509803,
"grad_norm": 1.8144521713256836,
"learning_rate": 1.9871675567624157e-05,
"loss": 0.3958,
"step": 325
},
{
"epoch": 0.7990196078431373,
"grad_norm": 0.9387969374656677,
"learning_rate": 1.9870404641971712e-05,
"loss": 0.4217,
"step": 326
},
{
"epoch": 0.8014705882352942,
"grad_norm": 1.2158770561218262,
"learning_rate": 1.9869127494712198e-05,
"loss": 0.4621,
"step": 327
},
{
"epoch": 0.803921568627451,
"grad_norm": 1.6395939588546753,
"learning_rate": 1.9867844126650647e-05,
"loss": 0.3871,
"step": 328
},
{
"epoch": 0.8063725490196079,
"grad_norm": 1.300582766532898,
"learning_rate": 1.986655453859599e-05,
"loss": 0.4067,
"step": 329
},
{
"epoch": 0.8088235294117647,
"grad_norm": 1.3686848878860474,
"learning_rate": 1.9865258731361104e-05,
"loss": 0.4013,
"step": 330
},
{
"epoch": 0.8112745098039216,
"grad_norm": 2.8070366382598877,
"learning_rate": 1.9863956705762774e-05,
"loss": 0.3948,
"step": 331
},
{
"epoch": 0.8137254901960784,
"grad_norm": 3.0911552906036377,
"learning_rate": 1.98626484626217e-05,
"loss": 0.4242,
"step": 332
},
{
"epoch": 0.8161764705882353,
"grad_norm": 8.936417579650879,
"learning_rate": 1.9861334002762512e-05,
"loss": 0.4248,
"step": 333
},
{
"epoch": 0.8186274509803921,
"grad_norm": 2.287069320678711,
"learning_rate": 1.9860013327013744e-05,
"loss": 0.4181,
"step": 334
},
{
"epoch": 0.821078431372549,
"grad_norm": 0.9176841378211975,
"learning_rate": 1.9858686436207865e-05,
"loss": 0.4171,
"step": 335
},
{
"epoch": 0.8235294117647058,
"grad_norm": 1.0563653707504272,
"learning_rate": 1.9857353331181248e-05,
"loss": 0.4204,
"step": 336
},
{
"epoch": 0.8259803921568627,
"grad_norm": 1.0640106201171875,
"learning_rate": 1.9856014012774195e-05,
"loss": 0.402,
"step": 337
},
{
"epoch": 0.8284313725490197,
"grad_norm": 1.111237645149231,
"learning_rate": 1.9854668481830908e-05,
"loss": 0.3974,
"step": 338
},
{
"epoch": 0.8308823529411765,
"grad_norm": 1.239695429801941,
"learning_rate": 1.9853316739199523e-05,
"loss": 0.428,
"step": 339
},
{
"epoch": 0.8333333333333334,
"grad_norm": 1.3592584133148193,
"learning_rate": 1.9851958785732083e-05,
"loss": 0.4431,
"step": 340
},
{
"epoch": 0.8357843137254902,
"grad_norm": 1.030774474143982,
"learning_rate": 1.9850594622284545e-05,
"loss": 0.4138,
"step": 341
},
{
"epoch": 0.8382352941176471,
"grad_norm": 1.0790998935699463,
"learning_rate": 1.9849224249716777e-05,
"loss": 0.403,
"step": 342
},
{
"epoch": 0.8406862745098039,
"grad_norm": 1.4409196376800537,
"learning_rate": 1.984784766889257e-05,
"loss": 0.3981,
"step": 343
},
{
"epoch": 0.8431372549019608,
"grad_norm": 1.6880717277526855,
"learning_rate": 1.9846464880679622e-05,
"loss": 0.4312,
"step": 344
},
{
"epoch": 0.8455882352941176,
"grad_norm": 1.4403940439224243,
"learning_rate": 1.9845075885949546e-05,
"loss": 0.395,
"step": 345
},
{
"epoch": 0.8480392156862745,
"grad_norm": 1.2085556983947754,
"learning_rate": 1.9843680685577866e-05,
"loss": 0.4127,
"step": 346
},
{
"epoch": 0.8504901960784313,
"grad_norm": 1.230765700340271,
"learning_rate": 1.9842279280444017e-05,
"loss": 0.4038,
"step": 347
},
{
"epoch": 0.8529411764705882,
"grad_norm": 1.0861726999282837,
"learning_rate": 1.9840871671431344e-05,
"loss": 0.3849,
"step": 348
},
{
"epoch": 0.8553921568627451,
"grad_norm": 12.917766571044922,
"learning_rate": 1.983945785942711e-05,
"loss": 0.4152,
"step": 349
},
{
"epoch": 0.8578431372549019,
"grad_norm": 1.1176540851593018,
"learning_rate": 1.9838037845322476e-05,
"loss": 0.4275,
"step": 350
},
{
"epoch": 0.8602941176470589,
"grad_norm": 1.4891622066497803,
"learning_rate": 1.9836611630012528e-05,
"loss": 0.4082,
"step": 351
},
{
"epoch": 0.8627450980392157,
"grad_norm": 1.3843212127685547,
"learning_rate": 1.983517921439624e-05,
"loss": 0.3959,
"step": 352
},
{
"epoch": 0.8651960784313726,
"grad_norm": 1.0070065259933472,
"learning_rate": 1.9833740599376515e-05,
"loss": 0.4139,
"step": 353
},
{
"epoch": 0.8676470588235294,
"grad_norm": 1.7093080282211304,
"learning_rate": 1.983229578586015e-05,
"loss": 0.406,
"step": 354
},
{
"epoch": 0.8700980392156863,
"grad_norm": 7.11337947845459,
"learning_rate": 1.9830844774757856e-05,
"loss": 0.4204,
"step": 355
},
{
"epoch": 0.8725490196078431,
"grad_norm": 1.7389130592346191,
"learning_rate": 1.9829387566984242e-05,
"loss": 0.4128,
"step": 356
},
{
"epoch": 0.875,
"grad_norm": 1.3052678108215332,
"learning_rate": 1.9827924163457836e-05,
"loss": 0.4234,
"step": 357
},
{
"epoch": 0.8774509803921569,
"grad_norm": 1.2700797319412231,
"learning_rate": 1.9826454565101065e-05,
"loss": 0.4108,
"step": 358
},
{
"epoch": 0.8799019607843137,
"grad_norm": 1.049463152885437,
"learning_rate": 1.9824978772840255e-05,
"loss": 0.4029,
"step": 359
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.9537893533706665,
"learning_rate": 1.9823496787605644e-05,
"loss": 0.3775,
"step": 360
},
{
"epoch": 0.8848039215686274,
"grad_norm": 1.9085501432418823,
"learning_rate": 1.9822008610331372e-05,
"loss": 0.429,
"step": 361
},
{
"epoch": 0.8872549019607843,
"grad_norm": 1.2662749290466309,
"learning_rate": 1.9820514241955476e-05,
"loss": 0.3985,
"step": 362
},
{
"epoch": 0.8897058823529411,
"grad_norm": 0.9020135402679443,
"learning_rate": 1.9819013683419904e-05,
"loss": 0.38,
"step": 363
},
{
"epoch": 0.8921568627450981,
"grad_norm": 1.0456725358963013,
"learning_rate": 1.9817506935670504e-05,
"loss": 0.4245,
"step": 364
},
{
"epoch": 0.8946078431372549,
"grad_norm": 1.0952237844467163,
"learning_rate": 1.9815993999657024e-05,
"loss": 0.4027,
"step": 365
},
{
"epoch": 0.8970588235294118,
"grad_norm": 0.8261983394622803,
"learning_rate": 1.9814474876333108e-05,
"loss": 0.3888,
"step": 366
},
{
"epoch": 0.8995098039215687,
"grad_norm": 0.9443764090538025,
"learning_rate": 1.9812949566656306e-05,
"loss": 0.4102,
"step": 367
},
{
"epoch": 0.9019607843137255,
"grad_norm": 0.7021450400352478,
"learning_rate": 1.9811418071588068e-05,
"loss": 0.4031,
"step": 368
},
{
"epoch": 0.9044117647058824,
"grad_norm": 0.7270957231521606,
"learning_rate": 1.9809880392093733e-05,
"loss": 0.394,
"step": 369
},
{
"epoch": 0.9068627450980392,
"grad_norm": 0.6759046912193298,
"learning_rate": 1.9808336529142556e-05,
"loss": 0.3854,
"step": 370
},
{
"epoch": 0.9093137254901961,
"grad_norm": 0.7309487462043762,
"learning_rate": 1.980678648370767e-05,
"loss": 0.412,
"step": 371
},
{
"epoch": 0.9117647058823529,
"grad_norm": 0.7586361169815063,
"learning_rate": 1.9805230256766122e-05,
"loss": 0.4346,
"step": 372
},
{
"epoch": 0.9142156862745098,
"grad_norm": 0.8274661898612976,
"learning_rate": 1.980366784929885e-05,
"loss": 0.3982,
"step": 373
},
{
"epoch": 0.9166666666666666,
"grad_norm": 0.6976910829544067,
"learning_rate": 1.9802099262290673e-05,
"loss": 0.4145,
"step": 374
},
{
"epoch": 0.9191176470588235,
"grad_norm": 1.2365355491638184,
"learning_rate": 1.9800524496730326e-05,
"loss": 0.3819,
"step": 375
},
{
"epoch": 0.9215686274509803,
"grad_norm": 1.0249954462051392,
"learning_rate": 1.9798943553610426e-05,
"loss": 0.4132,
"step": 376
},
{
"epoch": 0.9240196078431373,
"grad_norm": 0.7227798700332642,
"learning_rate": 1.97973564339275e-05,
"loss": 0.4173,
"step": 377
},
{
"epoch": 0.9264705882352942,
"grad_norm": 0.6626468896865845,
"learning_rate": 1.9795763138681936e-05,
"loss": 0.4135,
"step": 378
},
{
"epoch": 0.928921568627451,
"grad_norm": 0.7319209575653076,
"learning_rate": 1.9794163668878052e-05,
"loss": 0.398,
"step": 379
},
{
"epoch": 0.9313725490196079,
"grad_norm": 0.735980212688446,
"learning_rate": 1.9792558025524036e-05,
"loss": 0.3822,
"step": 380
},
{
"epoch": 0.9338235294117647,
"grad_norm": 0.7025852799415588,
"learning_rate": 1.979094620963197e-05,
"loss": 0.4005,
"step": 381
},
{
"epoch": 0.9362745098039216,
"grad_norm": 0.72138911485672,
"learning_rate": 1.978932822221783e-05,
"loss": 0.3795,
"step": 382
},
{
"epoch": 0.9387254901960784,
"grad_norm": 0.6665112972259521,
"learning_rate": 1.9787704064301484e-05,
"loss": 0.4004,
"step": 383
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.8278647661209106,
"learning_rate": 1.978607373690668e-05,
"loss": 0.3953,
"step": 384
},
{
"epoch": 0.9436274509803921,
"grad_norm": 0.8091961741447449,
"learning_rate": 1.9784437241061068e-05,
"loss": 0.4299,
"step": 385
},
{
"epoch": 0.946078431372549,
"grad_norm": 0.6562437415122986,
"learning_rate": 1.978279457779618e-05,
"loss": 0.4386,
"step": 386
},
{
"epoch": 0.9485294117647058,
"grad_norm": 0.63033127784729,
"learning_rate": 1.978114574814743e-05,
"loss": 0.3851,
"step": 387
},
{
"epoch": 0.9509803921568627,
"grad_norm": 0.6354605555534363,
"learning_rate": 1.9779490753154126e-05,
"loss": 0.3781,
"step": 388
},
{
"epoch": 0.9534313725490197,
"grad_norm": 0.6709758639335632,
"learning_rate": 1.9777829593859463e-05,
"loss": 0.3988,
"step": 389
},
{
"epoch": 0.9558823529411765,
"grad_norm": 0.7227322459220886,
"learning_rate": 1.9776162271310513e-05,
"loss": 0.4103,
"step": 390
},
{
"epoch": 0.9583333333333334,
"grad_norm": 0.6730179190635681,
"learning_rate": 1.9774488786558246e-05,
"loss": 0.3968,
"step": 391
},
{
"epoch": 0.9607843137254902,
"grad_norm": 0.8349875807762146,
"learning_rate": 1.97728091406575e-05,
"loss": 0.4157,
"step": 392
},
{
"epoch": 0.9632352941176471,
"grad_norm": 1.065743088722229,
"learning_rate": 1.9771123334667014e-05,
"loss": 0.3993,
"step": 393
},
{
"epoch": 0.9656862745098039,
"grad_norm": 0.6808967590332031,
"learning_rate": 1.9769431369649395e-05,
"loss": 0.4001,
"step": 394
},
{
"epoch": 0.9681372549019608,
"grad_norm": 0.6151614785194397,
"learning_rate": 1.976773324667114e-05,
"loss": 0.3791,
"step": 395
},
{
"epoch": 0.9705882352941176,
"grad_norm": 0.6762698888778687,
"learning_rate": 1.9766028966802627e-05,
"loss": 0.3911,
"step": 396
},
{
"epoch": 0.9730392156862745,
"grad_norm": 1.0377041101455688,
"learning_rate": 1.9764318531118114e-05,
"loss": 0.3814,
"step": 397
},
{
"epoch": 0.9754901960784313,
"grad_norm": 0.7692320346832275,
"learning_rate": 1.976260194069574e-05,
"loss": 0.3747,
"step": 398
},
{
"epoch": 0.9779411764705882,
"grad_norm": 0.727139413356781,
"learning_rate": 1.9760879196617518e-05,
"loss": 0.3921,
"step": 399
},
{
"epoch": 0.9803921568627451,
"grad_norm": 0.8963577151298523,
"learning_rate": 1.9759150299969348e-05,
"loss": 0.4025,
"step": 400
},
{
"epoch": 0.9828431372549019,
"grad_norm": 0.7158836126327515,
"learning_rate": 1.9757415251841004e-05,
"loss": 0.3836,
"step": 401
},
{
"epoch": 0.9852941176470589,
"grad_norm": 0.670198380947113,
"learning_rate": 1.975567405332614e-05,
"loss": 0.4255,
"step": 402
},
{
"epoch": 0.9877450980392157,
"grad_norm": 0.7386361956596375,
"learning_rate": 1.975392670552228e-05,
"loss": 0.4143,
"step": 403
},
{
"epoch": 0.9901960784313726,
"grad_norm": 0.6616537570953369,
"learning_rate": 1.975217320953083e-05,
"loss": 0.3971,
"step": 404
},
{
"epoch": 0.9926470588235294,
"grad_norm": 0.7673658132553101,
"learning_rate": 1.9750413566457077e-05,
"loss": 0.3874,
"step": 405
},
{
"epoch": 0.9950980392156863,
"grad_norm": 0.6570972204208374,
"learning_rate": 1.9748647777410167e-05,
"loss": 0.3853,
"step": 406
},
{
"epoch": 0.9975490196078431,
"grad_norm": 0.9980835318565369,
"learning_rate": 1.9746875843503135e-05,
"loss": 0.401,
"step": 407
},
{
"epoch": 1.0,
"grad_norm": 0.640150785446167,
"learning_rate": 1.974509776585288e-05,
"loss": 0.3939,
"step": 408
},
{
"epoch": 1.0,
"eval_loss": 0.39860227704048157,
"eval_runtime": 54.9582,
"eval_samples_per_second": 90.978,
"eval_steps_per_second": 0.364,
"step": 408
},
{
"epoch": 1.0024509803921569,
"grad_norm": 0.7501364946365356,
"learning_rate": 1.9743313545580176e-05,
"loss": 0.3697,
"step": 409
},
{
"epoch": 1.0049019607843137,
"grad_norm": 0.8544023633003235,
"learning_rate": 1.9741523183809673e-05,
"loss": 0.3782,
"step": 410
},
{
"epoch": 1.0073529411764706,
"grad_norm": 0.7510189414024353,
"learning_rate": 1.9739726681669884e-05,
"loss": 0.3524,
"step": 411
},
{
"epoch": 1.0098039215686274,
"grad_norm": 0.6872191429138184,
"learning_rate": 1.9737924040293203e-05,
"loss": 0.3684,
"step": 412
},
{
"epoch": 1.0122549019607843,
"grad_norm": 0.6787446737289429,
"learning_rate": 1.9736115260815885e-05,
"loss": 0.3562,
"step": 413
},
{
"epoch": 1.0147058823529411,
"grad_norm": 0.8372893929481506,
"learning_rate": 1.973430034437806e-05,
"loss": 0.357,
"step": 414
},
{
"epoch": 1.017156862745098,
"grad_norm": 1.7599529027938843,
"learning_rate": 1.9732479292123714e-05,
"loss": 0.3827,
"step": 415
},
{
"epoch": 1.0196078431372548,
"grad_norm": 0.7997543215751648,
"learning_rate": 1.973065210520072e-05,
"loss": 0.3895,
"step": 416
},
{
"epoch": 1.0220588235294117,
"grad_norm": 0.720363974571228,
"learning_rate": 1.9728818784760805e-05,
"loss": 0.353,
"step": 417
},
{
"epoch": 1.0245098039215685,
"grad_norm": 7.207123279571533,
"learning_rate": 1.9726979331959563e-05,
"loss": 0.3604,
"step": 418
},
{
"epoch": 1.0269607843137254,
"grad_norm": 1.5167217254638672,
"learning_rate": 1.9725133747956454e-05,
"loss": 0.3613,
"step": 419
},
{
"epoch": 1.0294117647058822,
"grad_norm": 0.7159681916236877,
"learning_rate": 1.9723282033914812e-05,
"loss": 0.367,
"step": 420
},
{
"epoch": 1.031862745098039,
"grad_norm": 0.72771155834198,
"learning_rate": 1.972142419100182e-05,
"loss": 0.3422,
"step": 421
},
{
"epoch": 1.0343137254901962,
"grad_norm": 1.6255884170532227,
"learning_rate": 1.971956022038853e-05,
"loss": 0.3765,
"step": 422
},
{
"epoch": 1.036764705882353,
"grad_norm": 0.651329517364502,
"learning_rate": 1.9717690123249865e-05,
"loss": 0.3617,
"step": 423
},
{
"epoch": 1.0392156862745099,
"grad_norm": 0.7460439801216125,
"learning_rate": 1.9715813900764595e-05,
"loss": 0.3673,
"step": 424
},
{
"epoch": 1.0416666666666667,
"grad_norm": 0.8420533537864685,
"learning_rate": 1.971393155411536e-05,
"loss": 0.3957,
"step": 425
},
{
"epoch": 1.0441176470588236,
"grad_norm": 0.8797054886817932,
"learning_rate": 1.971204308448866e-05,
"loss": 0.4145,
"step": 426
},
{
"epoch": 1.0465686274509804,
"grad_norm": 0.7792790532112122,
"learning_rate": 1.9710148493074854e-05,
"loss": 0.3841,
"step": 427
},
{
"epoch": 1.0490196078431373,
"grad_norm": 0.7829272747039795,
"learning_rate": 1.970824778106816e-05,
"loss": 0.3742,
"step": 428
},
{
"epoch": 1.0514705882352942,
"grad_norm": 1.5860029458999634,
"learning_rate": 1.9706340949666646e-05,
"loss": 0.3669,
"step": 429
},
{
"epoch": 1.053921568627451,
"grad_norm": 0.8086922764778137,
"learning_rate": 1.970442800007225e-05,
"loss": 0.3701,
"step": 430
},
{
"epoch": 1.0563725490196079,
"grad_norm": 0.7580492496490479,
"learning_rate": 1.9702508933490757e-05,
"loss": 0.3826,
"step": 431
},
{
"epoch": 1.0588235294117647,
"grad_norm": 0.7586913704872131,
"learning_rate": 1.970058375113182e-05,
"loss": 0.354,
"step": 432
},
{
"epoch": 1.0612745098039216,
"grad_norm": 0.8073590993881226,
"learning_rate": 1.9698652454208923e-05,
"loss": 0.3408,
"step": 433
},
{
"epoch": 1.0637254901960784,
"grad_norm": 1.076324462890625,
"learning_rate": 1.9696715043939433e-05,
"loss": 0.3746,
"step": 434
},
{
"epoch": 1.0661764705882353,
"grad_norm": 0.671450138092041,
"learning_rate": 1.969477152154455e-05,
"loss": 0.3562,
"step": 435
},
{
"epoch": 1.0686274509803921,
"grad_norm": 0.6423475742340088,
"learning_rate": 1.9692821888249333e-05,
"loss": 0.3647,
"step": 436
},
{
"epoch": 1.071078431372549,
"grad_norm": 0.6885897517204285,
"learning_rate": 1.96908661452827e-05,
"loss": 0.3576,
"step": 437
},
{
"epoch": 1.0735294117647058,
"grad_norm": 0.9739678502082825,
"learning_rate": 1.9688904293877408e-05,
"loss": 0.3906,
"step": 438
},
{
"epoch": 1.0759803921568627,
"grad_norm": 0.7215884327888489,
"learning_rate": 1.968693633527007e-05,
"loss": 0.3571,
"step": 439
},
{
"epoch": 1.0784313725490196,
"grad_norm": 0.9208282232284546,
"learning_rate": 1.9684962270701147e-05,
"loss": 0.3717,
"step": 440
},
{
"epoch": 1.0808823529411764,
"grad_norm": 0.8803519010543823,
"learning_rate": 1.9682982101414955e-05,
"loss": 0.3869,
"step": 441
},
{
"epoch": 1.0833333333333333,
"grad_norm": 0.9058979749679565,
"learning_rate": 1.9680995828659647e-05,
"loss": 0.377,
"step": 442
},
{
"epoch": 1.0857843137254901,
"grad_norm": 0.852573812007904,
"learning_rate": 1.9679003453687236e-05,
"loss": 0.3724,
"step": 443
},
{
"epoch": 1.088235294117647,
"grad_norm": 0.7396073937416077,
"learning_rate": 1.967700497775357e-05,
"loss": 0.3356,
"step": 444
},
{
"epoch": 1.0906862745098038,
"grad_norm": 0.7687863111495972,
"learning_rate": 1.967500040211835e-05,
"loss": 0.3608,
"step": 445
},
{
"epoch": 1.093137254901961,
"grad_norm": 0.7369428873062134,
"learning_rate": 1.9672989728045112e-05,
"loss": 0.3652,
"step": 446
},
{
"epoch": 1.0955882352941178,
"grad_norm": 1.0725834369659424,
"learning_rate": 1.9670972956801253e-05,
"loss": 0.3638,
"step": 447
},
{
"epoch": 1.0980392156862746,
"grad_norm": 0.8268279433250427,
"learning_rate": 1.9668950089657998e-05,
"loss": 0.3919,
"step": 448
},
{
"epoch": 1.1004901960784315,
"grad_norm": 0.8067097663879395,
"learning_rate": 1.9666921127890425e-05,
"loss": 0.3564,
"step": 449
},
{
"epoch": 1.1029411764705883,
"grad_norm": 0.683048665523529,
"learning_rate": 1.9664886072777437e-05,
"loss": 0.3793,
"step": 450
},
{
"epoch": 1.1053921568627452,
"grad_norm": 1.884798526763916,
"learning_rate": 1.9662844925601804e-05,
"loss": 0.3815,
"step": 451
},
{
"epoch": 1.107843137254902,
"grad_norm": 0.6016831398010254,
"learning_rate": 1.966079768765011e-05,
"loss": 0.3665,
"step": 452
},
{
"epoch": 1.1102941176470589,
"grad_norm": 0.6554032564163208,
"learning_rate": 1.9658744360212794e-05,
"loss": 0.3392,
"step": 453
},
{
"epoch": 1.1127450980392157,
"grad_norm": 0.7478920221328735,
"learning_rate": 1.9656684944584126e-05,
"loss": 0.3753,
"step": 454
},
{
"epoch": 1.1151960784313726,
"grad_norm": 0.6055667996406555,
"learning_rate": 1.965461944206222e-05,
"loss": 0.3586,
"step": 455
},
{
"epoch": 1.1176470588235294,
"grad_norm": 0.591434895992279,
"learning_rate": 1.9652547853949026e-05,
"loss": 0.3317,
"step": 456
},
{
"epoch": 1.1200980392156863,
"grad_norm": 1.2468827962875366,
"learning_rate": 1.9650470181550317e-05,
"loss": 0.3569,
"step": 457
},
{
"epoch": 1.1225490196078431,
"grad_norm": 0.6211933493614197,
"learning_rate": 1.964838642617572e-05,
"loss": 0.3697,
"step": 458
},
{
"epoch": 1.125,
"grad_norm": 0.6601367592811584,
"learning_rate": 1.9646296589138683e-05,
"loss": 0.3867,
"step": 459
},
{
"epoch": 1.1274509803921569,
"grad_norm": 0.6897466778755188,
"learning_rate": 1.964420067175649e-05,
"loss": 0.3868,
"step": 460
},
{
"epoch": 1.1299019607843137,
"grad_norm": 0.6055756211280823,
"learning_rate": 1.964209867535027e-05,
"loss": 0.3486,
"step": 461
},
{
"epoch": 1.1323529411764706,
"grad_norm": 5.347257614135742,
"learning_rate": 1.963999060124496e-05,
"loss": 0.36,
"step": 462
},
{
"epoch": 1.1348039215686274,
"grad_norm": 0.6725960373878479,
"learning_rate": 1.963787645076935e-05,
"loss": 0.3864,
"step": 463
},
{
"epoch": 1.1372549019607843,
"grad_norm": 0.6154267191886902,
"learning_rate": 1.963575622525605e-05,
"loss": 0.3807,
"step": 464
},
{
"epoch": 1.1397058823529411,
"grad_norm": 0.6601632237434387,
"learning_rate": 1.9633629926041498e-05,
"loss": 0.3667,
"step": 465
},
{
"epoch": 1.142156862745098,
"grad_norm": 0.6736763119697571,
"learning_rate": 1.9631497554465963e-05,
"loss": 0.3662,
"step": 466
},
{
"epoch": 1.1446078431372548,
"grad_norm": 0.6230549812316895,
"learning_rate": 1.9629359111873543e-05,
"loss": 0.3521,
"step": 467
},
{
"epoch": 1.1470588235294117,
"grad_norm": 0.6212219595909119,
"learning_rate": 1.9627214599612166e-05,
"loss": 0.3534,
"step": 468
},
{
"epoch": 1.1495098039215685,
"grad_norm": 0.6918830275535583,
"learning_rate": 1.9625064019033573e-05,
"loss": 0.3827,
"step": 469
},
{
"epoch": 1.1519607843137254,
"grad_norm": 0.6371306777000427,
"learning_rate": 1.9622907371493344e-05,
"loss": 0.3631,
"step": 470
},
{
"epoch": 1.1544117647058822,
"grad_norm": 0.6398683190345764,
"learning_rate": 1.9620744658350873e-05,
"loss": 0.3711,
"step": 471
},
{
"epoch": 1.156862745098039,
"grad_norm": 1.005239725112915,
"learning_rate": 1.9618575880969387e-05,
"loss": 0.3588,
"step": 472
},
{
"epoch": 1.159313725490196,
"grad_norm": 0.7092298865318298,
"learning_rate": 1.961640104071593e-05,
"loss": 0.3484,
"step": 473
},
{
"epoch": 1.161764705882353,
"grad_norm": 0.6344369649887085,
"learning_rate": 1.9614220138961363e-05,
"loss": 0.3671,
"step": 474
},
{
"epoch": 1.1642156862745099,
"grad_norm": 0.6361258029937744,
"learning_rate": 1.9612033177080377e-05,
"loss": 0.3681,
"step": 475
},
{
"epoch": 1.1666666666666667,
"grad_norm": 0.7222378253936768,
"learning_rate": 1.9609840156451474e-05,
"loss": 0.3609,
"step": 476
},
{
"epoch": 1.1691176470588236,
"grad_norm": 0.762977659702301,
"learning_rate": 1.9607641078456986e-05,
"loss": 0.3519,
"step": 477
},
{
"epoch": 1.1715686274509804,
"grad_norm": 0.8761401176452637,
"learning_rate": 1.960543594448305e-05,
"loss": 0.3854,
"step": 478
},
{
"epoch": 1.1740196078431373,
"grad_norm": 0.7317111492156982,
"learning_rate": 1.9603224755919634e-05,
"loss": 0.381,
"step": 479
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.8281358480453491,
"learning_rate": 1.960100751416051e-05,
"loss": 0.4156,
"step": 480
},
{
"epoch": 1.178921568627451,
"grad_norm": 0.7146068811416626,
"learning_rate": 1.9598784220603272e-05,
"loss": 0.3806,
"step": 481
},
{
"epoch": 1.1813725490196079,
"grad_norm": 0.6956613063812256,
"learning_rate": 1.9596554876649328e-05,
"loss": 0.3571,
"step": 482
},
{
"epoch": 1.1838235294117647,
"grad_norm": 0.6944482922554016,
"learning_rate": 1.95943194837039e-05,
"loss": 0.3593,
"step": 483
},
{
"epoch": 1.1862745098039216,
"grad_norm": 0.6722803711891174,
"learning_rate": 1.9592078043176022e-05,
"loss": 0.363,
"step": 484
},
{
"epoch": 1.1887254901960784,
"grad_norm": 0.6937478184700012,
"learning_rate": 1.9589830556478538e-05,
"loss": 0.3734,
"step": 485
},
{
"epoch": 1.1911764705882353,
"grad_norm": 0.7974143624305725,
"learning_rate": 1.9587577025028105e-05,
"loss": 0.3535,
"step": 486
},
{
"epoch": 1.1936274509803921,
"grad_norm": 2.08024001121521,
"learning_rate": 1.9585317450245195e-05,
"loss": 0.3757,
"step": 487
},
{
"epoch": 1.196078431372549,
"grad_norm": 0.8869052529335022,
"learning_rate": 1.958305183355408e-05,
"loss": 0.3702,
"step": 488
},
{
"epoch": 1.1985294117647058,
"grad_norm": 0.7479692697525024,
"learning_rate": 1.9580780176382847e-05,
"loss": 0.3577,
"step": 489
},
{
"epoch": 1.2009803921568627,
"grad_norm": 0.7658264636993408,
"learning_rate": 1.957850248016339e-05,
"loss": 0.3789,
"step": 490
},
{
"epoch": 1.2034313725490196,
"grad_norm": 1.0089616775512695,
"learning_rate": 1.9576218746331402e-05,
"loss": 0.4052,
"step": 491
},
{
"epoch": 1.2058823529411764,
"grad_norm": 0.743039071559906,
"learning_rate": 1.957392897632639e-05,
"loss": 0.3555,
"step": 492
},
{
"epoch": 1.2083333333333333,
"grad_norm": 0.9136559367179871,
"learning_rate": 1.957163317159167e-05,
"loss": 0.3632,
"step": 493
},
{
"epoch": 1.2107843137254901,
"grad_norm": 0.7348246574401855,
"learning_rate": 1.9569331333574352e-05,
"loss": 0.3687,
"step": 494
},
{
"epoch": 1.213235294117647,
"grad_norm": 0.7725732326507568,
"learning_rate": 1.956702346372535e-05,
"loss": 0.3602,
"step": 495
},
{
"epoch": 1.215686274509804,
"grad_norm": 0.668029248714447,
"learning_rate": 1.956470956349938e-05,
"loss": 0.3603,
"step": 496
},
{
"epoch": 1.218137254901961,
"grad_norm": 0.7130250334739685,
"learning_rate": 1.9562389634354966e-05,
"loss": 0.3731,
"step": 497
},
{
"epoch": 1.2205882352941178,
"grad_norm": 0.7816908955574036,
"learning_rate": 1.9560063677754427e-05,
"loss": 0.3739,
"step": 498
},
{
"epoch": 1.2230392156862746,
"grad_norm": 0.7725933790206909,
"learning_rate": 1.9557731695163883e-05,
"loss": 0.3846,
"step": 499
},
{
"epoch": 1.2254901960784315,
"grad_norm": 0.7621762752532959,
"learning_rate": 1.955539368805325e-05,
"loss": 0.3522,
"step": 500
},
{
"epoch": 1.2279411764705883,
"grad_norm": 0.8003554344177246,
"learning_rate": 1.9553049657896245e-05,
"loss": 0.3678,
"step": 501
},
{
"epoch": 1.2303921568627452,
"grad_norm": 0.6918476819992065,
"learning_rate": 1.955069960617037e-05,
"loss": 0.3572,
"step": 502
},
{
"epoch": 1.232843137254902,
"grad_norm": 0.7378032803535461,
"learning_rate": 1.9548343534356947e-05,
"loss": 0.3687,
"step": 503
},
{
"epoch": 1.2352941176470589,
"grad_norm": 0.7186984419822693,
"learning_rate": 1.9545981443941067e-05,
"loss": 0.3773,
"step": 504
},
{
"epoch": 1.2377450980392157,
"grad_norm": 0.8525075316429138,
"learning_rate": 1.954361333641163e-05,
"loss": 0.3905,
"step": 505
},
{
"epoch": 1.2401960784313726,
"grad_norm": 0.6297598481178284,
"learning_rate": 1.954123921326132e-05,
"loss": 0.3637,
"step": 506
},
{
"epoch": 1.2426470588235294,
"grad_norm": 0.7166191339492798,
"learning_rate": 1.953885907598662e-05,
"loss": 0.3606,
"step": 507
},
{
"epoch": 1.2450980392156863,
"grad_norm": 0.6492571234703064,
"learning_rate": 1.9536472926087794e-05,
"loss": 0.3727,
"step": 508
},
{
"epoch": 1.2475490196078431,
"grad_norm": 0.7176105380058289,
"learning_rate": 1.953408076506891e-05,
"loss": 0.3634,
"step": 509
},
{
"epoch": 1.25,
"grad_norm": 0.690896213054657,
"learning_rate": 1.953168259443782e-05,
"loss": 0.3484,
"step": 510
},
{
"epoch": 1.2524509803921569,
"grad_norm": 0.8115174770355225,
"learning_rate": 1.9529278415706155e-05,
"loss": 0.3848,
"step": 511
},
{
"epoch": 1.2549019607843137,
"grad_norm": 0.7020537257194519,
"learning_rate": 1.9526868230389343e-05,
"loss": 0.349,
"step": 512
},
{
"epoch": 1.2573529411764706,
"grad_norm": 0.93234783411026,
"learning_rate": 1.9524452040006597e-05,
"loss": 0.3728,
"step": 513
},
{
"epoch": 1.2598039215686274,
"grad_norm": 1.0823783874511719,
"learning_rate": 1.9522029846080906e-05,
"loss": 0.4153,
"step": 514
},
{
"epoch": 1.2622549019607843,
"grad_norm": 0.6064553260803223,
"learning_rate": 1.9519601650139056e-05,
"loss": 0.3636,
"step": 515
},
{
"epoch": 1.2647058823529411,
"grad_norm": 0.7104873657226562,
"learning_rate": 1.951716745371161e-05,
"loss": 0.3665,
"step": 516
},
{
"epoch": 1.267156862745098,
"grad_norm": 0.6917532086372375,
"learning_rate": 1.9514727258332908e-05,
"loss": 0.3486,
"step": 517
},
{
"epoch": 1.2696078431372548,
"grad_norm": 0.6799424290657043,
"learning_rate": 1.9512281065541083e-05,
"loss": 0.3712,
"step": 518
},
{
"epoch": 1.2720588235294117,
"grad_norm": 0.6953539848327637,
"learning_rate": 1.9509828876878042e-05,
"loss": 0.3548,
"step": 519
},
{
"epoch": 1.2745098039215685,
"grad_norm": 0.797035813331604,
"learning_rate": 1.9507370693889472e-05,
"loss": 0.3815,
"step": 520
},
{
"epoch": 1.2769607843137254,
"grad_norm": 0.6895994544029236,
"learning_rate": 1.9504906518124836e-05,
"loss": 0.3574,
"step": 521
},
{
"epoch": 1.2794117647058822,
"grad_norm": 0.7277301549911499,
"learning_rate": 1.9502436351137376e-05,
"loss": 0.3758,
"step": 522
},
{
"epoch": 1.281862745098039,
"grad_norm": 8.913773536682129,
"learning_rate": 1.9499960194484118e-05,
"loss": 0.3953,
"step": 523
},
{
"epoch": 1.284313725490196,
"grad_norm": 0.9411600232124329,
"learning_rate": 1.9497478049725846e-05,
"loss": 0.3915,
"step": 524
},
{
"epoch": 1.2867647058823528,
"grad_norm": 0.9357278347015381,
"learning_rate": 1.9494989918427137e-05,
"loss": 0.3531,
"step": 525
},
{
"epoch": 1.2892156862745099,
"grad_norm": 0.6859796047210693,
"learning_rate": 1.9492495802156332e-05,
"loss": 0.3814,
"step": 526
},
{
"epoch": 1.2916666666666667,
"grad_norm": 0.8207082748413086,
"learning_rate": 1.9489995702485546e-05,
"loss": 0.3814,
"step": 527
},
{
"epoch": 1.2941176470588236,
"grad_norm": 0.9978480935096741,
"learning_rate": 1.948748962099066e-05,
"loss": 0.3834,
"step": 528
},
{
"epoch": 1.2965686274509804,
"grad_norm": 1.0504316091537476,
"learning_rate": 1.948497755925134e-05,
"loss": 0.3722,
"step": 529
},
{
"epoch": 1.2990196078431373,
"grad_norm": 0.7427694797515869,
"learning_rate": 1.9482459518851004e-05,
"loss": 0.3787,
"step": 530
},
{
"epoch": 1.3014705882352942,
"grad_norm": 0.7518629431724548,
"learning_rate": 1.9479935501376856e-05,
"loss": 0.3704,
"step": 531
},
{
"epoch": 1.303921568627451,
"grad_norm": 0.7836286425590515,
"learning_rate": 1.947740550841985e-05,
"loss": 0.334,
"step": 532
},
{
"epoch": 1.3063725490196079,
"grad_norm": 0.6694628596305847,
"learning_rate": 1.947486954157472e-05,
"loss": 0.3558,
"step": 533
},
{
"epoch": 1.3088235294117647,
"grad_norm": 0.7005615830421448,
"learning_rate": 1.947232760243996e-05,
"loss": 0.3526,
"step": 534
},
{
"epoch": 1.3112745098039216,
"grad_norm": 0.8005860447883606,
"learning_rate": 1.946977969261783e-05,
"loss": 0.3861,
"step": 535
},
{
"epoch": 1.3137254901960784,
"grad_norm": 0.7872335314750671,
"learning_rate": 1.946722581371435e-05,
"loss": 0.4075,
"step": 536
},
{
"epoch": 1.3161764705882353,
"grad_norm": 5.456146240234375,
"learning_rate": 1.9464665967339305e-05,
"loss": 0.3505,
"step": 537
},
{
"epoch": 1.3186274509803921,
"grad_norm": 0.903198778629303,
"learning_rate": 1.946210015510625e-05,
"loss": 0.3751,
"step": 538
},
{
"epoch": 1.321078431372549,
"grad_norm": 0.7047187089920044,
"learning_rate": 1.9459528378632478e-05,
"loss": 0.3503,
"step": 539
},
{
"epoch": 1.3235294117647058,
"grad_norm": 0.7212843298912048,
"learning_rate": 1.9456950639539064e-05,
"loss": 0.3602,
"step": 540
},
{
"epoch": 1.3259803921568627,
"grad_norm": 0.6304447054862976,
"learning_rate": 1.9454366939450837e-05,
"loss": 0.3248,
"step": 541
},
{
"epoch": 1.3284313725490196,
"grad_norm": 0.6566408276557922,
"learning_rate": 1.945177727999637e-05,
"loss": 0.3861,
"step": 542
},
{
"epoch": 1.3308823529411764,
"grad_norm": 0.712893009185791,
"learning_rate": 1.944918166280801e-05,
"loss": 0.3476,
"step": 543
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.6446470618247986,
"learning_rate": 1.944658008952184e-05,
"loss": 0.3799,
"step": 544
},
{
"epoch": 1.3357843137254901,
"grad_norm": 0.5955755710601807,
"learning_rate": 1.944397256177772e-05,
"loss": 0.3792,
"step": 545
},
{
"epoch": 1.3382352941176472,
"grad_norm": 0.6672739386558533,
"learning_rate": 1.944135908121925e-05,
"loss": 0.3598,
"step": 546
},
{
"epoch": 1.340686274509804,
"grad_norm": 0.6049591898918152,
"learning_rate": 1.943873964949378e-05,
"loss": 0.3682,
"step": 547
},
{
"epoch": 1.343137254901961,
"grad_norm": 0.8622561097145081,
"learning_rate": 1.943611426825241e-05,
"loss": 0.3948,
"step": 548
},
{
"epoch": 1.3455882352941178,
"grad_norm": 0.5897242426872253,
"learning_rate": 1.9433482939150007e-05,
"loss": 0.3699,
"step": 549
},
{
"epoch": 1.3480392156862746,
"grad_norm": 0.5615806579589844,
"learning_rate": 1.9430845663845166e-05,
"loss": 0.3554,
"step": 550
},
{
"epoch": 1.3504901960784315,
"grad_norm": 0.5839536190032959,
"learning_rate": 1.9428202444000245e-05,
"loss": 0.3619,
"step": 551
},
{
"epoch": 1.3529411764705883,
"grad_norm": 0.6188960075378418,
"learning_rate": 1.9425553281281343e-05,
"loss": 0.346,
"step": 552
},
{
"epoch": 1.3553921568627452,
"grad_norm": 0.6070338487625122,
"learning_rate": 1.94228981773583e-05,
"loss": 0.3757,
"step": 553
},
{
"epoch": 1.357843137254902,
"grad_norm": 0.805676281452179,
"learning_rate": 1.942023713390471e-05,
"loss": 0.4162,
"step": 554
},
{
"epoch": 1.3602941176470589,
"grad_norm": 0.5722381472587585,
"learning_rate": 1.9417570152597908e-05,
"loss": 0.3575,
"step": 555
},
{
"epoch": 1.3627450980392157,
"grad_norm": 0.5843807458877563,
"learning_rate": 1.941489723511897e-05,
"loss": 0.3665,
"step": 556
},
{
"epoch": 1.3651960784313726,
"grad_norm": 0.5685890913009644,
"learning_rate": 1.9412218383152717e-05,
"loss": 0.346,
"step": 557
},
{
"epoch": 1.3676470588235294,
"grad_norm": 0.6130423545837402,
"learning_rate": 1.9409533598387704e-05,
"loss": 0.3595,
"step": 558
},
{
"epoch": 1.3700980392156863,
"grad_norm": 0.658596932888031,
"learning_rate": 1.940684288251623e-05,
"loss": 0.3757,
"step": 559
},
{
"epoch": 1.3725490196078431,
"grad_norm": 0.6186444759368896,
"learning_rate": 1.9404146237234337e-05,
"loss": 0.3676,
"step": 560
},
{
"epoch": 1.375,
"grad_norm": 0.6111751198768616,
"learning_rate": 1.94014436642418e-05,
"loss": 0.3789,
"step": 561
},
{
"epoch": 1.3774509803921569,
"grad_norm": 0.5736342668533325,
"learning_rate": 1.9398735165242125e-05,
"loss": 0.3572,
"step": 562
},
{
"epoch": 1.3799019607843137,
"grad_norm": 0.5537214875221252,
"learning_rate": 1.9396020741942562e-05,
"loss": 0.3287,
"step": 563
},
{
"epoch": 1.3823529411764706,
"grad_norm": 0.6673843264579773,
"learning_rate": 1.939330039605409e-05,
"loss": 0.4142,
"step": 564
},
{
"epoch": 1.3848039215686274,
"grad_norm": 0.6837509870529175,
"learning_rate": 1.939057412929143e-05,
"loss": 0.3806,
"step": 565
},
{
"epoch": 1.3872549019607843,
"grad_norm": 0.6290351152420044,
"learning_rate": 1.938784194337302e-05,
"loss": 0.3382,
"step": 566
},
{
"epoch": 1.3897058823529411,
"grad_norm": 0.7367657423019409,
"learning_rate": 1.938510384002104e-05,
"loss": 0.3845,
"step": 567
},
{
"epoch": 1.392156862745098,
"grad_norm": 0.5813812613487244,
"learning_rate": 1.93823598209614e-05,
"loss": 0.346,
"step": 568
},
{
"epoch": 1.3946078431372548,
"grad_norm": 0.7961930632591248,
"learning_rate": 1.9379609887923735e-05,
"loss": 0.4031,
"step": 569
},
{
"epoch": 1.3970588235294117,
"grad_norm": 0.6278417706489563,
"learning_rate": 1.9376854042641405e-05,
"loss": 0.3846,
"step": 570
},
{
"epoch": 1.3995098039215685,
"grad_norm": 0.5420176982879639,
"learning_rate": 1.937409228685151e-05,
"loss": 0.3476,
"step": 571
},
{
"epoch": 1.4019607843137254,
"grad_norm": 0.7068719863891602,
"learning_rate": 1.9371324622294853e-05,
"loss": 0.3534,
"step": 572
},
{
"epoch": 1.4044117647058822,
"grad_norm": 0.6442379951477051,
"learning_rate": 1.936855105071598e-05,
"loss": 0.3693,
"step": 573
},
{
"epoch": 1.406862745098039,
"grad_norm": 0.6163464188575745,
"learning_rate": 1.9365771573863162e-05,
"loss": 0.3806,
"step": 574
},
{
"epoch": 1.409313725490196,
"grad_norm": 0.5886669158935547,
"learning_rate": 1.936298619348838e-05,
"loss": 0.3858,
"step": 575
},
{
"epoch": 1.4117647058823528,
"grad_norm": 0.6063313484191895,
"learning_rate": 1.9360194911347337e-05,
"loss": 0.3827,
"step": 576
},
{
"epoch": 1.4142156862745099,
"grad_norm": 0.5955997109413147,
"learning_rate": 1.9357397729199464e-05,
"loss": 0.3997,
"step": 577
},
{
"epoch": 1.4166666666666667,
"grad_norm": 0.6143521070480347,
"learning_rate": 1.9354594648807908e-05,
"loss": 0.3666,
"step": 578
},
{
"epoch": 1.4191176470588236,
"grad_norm": 0.5655225515365601,
"learning_rate": 1.9351785671939533e-05,
"loss": 0.3667,
"step": 579
},
{
"epoch": 1.4215686274509804,
"grad_norm": 0.5782202482223511,
"learning_rate": 1.934897080036492e-05,
"loss": 0.366,
"step": 580
},
{
"epoch": 1.4240196078431373,
"grad_norm": 0.5994782447814941,
"learning_rate": 1.9346150035858365e-05,
"loss": 0.3539,
"step": 581
},
{
"epoch": 1.4264705882352942,
"grad_norm": 0.6522775292396545,
"learning_rate": 1.9343323380197882e-05,
"loss": 0.3939,
"step": 582
},
{
"epoch": 1.428921568627451,
"grad_norm": 0.5779765248298645,
"learning_rate": 1.934049083516519e-05,
"loss": 0.3469,
"step": 583
},
{
"epoch": 1.4313725490196079,
"grad_norm": 0.6300088167190552,
"learning_rate": 1.9337652402545732e-05,
"loss": 0.3748,
"step": 584
},
{
"epoch": 1.4338235294117647,
"grad_norm": 0.5482218861579895,
"learning_rate": 1.9334808084128646e-05,
"loss": 0.3444,
"step": 585
},
{
"epoch": 1.4362745098039216,
"grad_norm": 0.6258654594421387,
"learning_rate": 1.93319578817068e-05,
"loss": 0.3663,
"step": 586
},
{
"epoch": 1.4387254901960784,
"grad_norm": 0.5975489020347595,
"learning_rate": 1.9329101797076757e-05,
"loss": 0.3567,
"step": 587
},
{
"epoch": 1.4411764705882353,
"grad_norm": 0.5405751466751099,
"learning_rate": 1.932623983203879e-05,
"loss": 0.3505,
"step": 588
},
{
"epoch": 1.4436274509803921,
"grad_norm": 0.6650300621986389,
"learning_rate": 1.932337198839688e-05,
"loss": 0.3689,
"step": 589
},
{
"epoch": 1.446078431372549,
"grad_norm": 0.6745548844337463,
"learning_rate": 1.9320498267958717e-05,
"loss": 0.356,
"step": 590
},
{
"epoch": 1.4485294117647058,
"grad_norm": 0.6748042106628418,
"learning_rate": 1.9317618672535685e-05,
"loss": 0.3931,
"step": 591
},
{
"epoch": 1.4509803921568627,
"grad_norm": 0.585654079914093,
"learning_rate": 1.9314733203942884e-05,
"loss": 0.3696,
"step": 592
},
{
"epoch": 1.4534313725490196,
"grad_norm": 0.6388471722602844,
"learning_rate": 1.9311841863999108e-05,
"loss": 0.3228,
"step": 593
},
{
"epoch": 1.4558823529411764,
"grad_norm": 0.6433977484703064,
"learning_rate": 1.9308944654526852e-05,
"loss": 0.363,
"step": 594
},
{
"epoch": 1.4583333333333333,
"grad_norm": 0.6569803357124329,
"learning_rate": 1.9306041577352314e-05,
"loss": 0.3843,
"step": 595
},
{
"epoch": 1.4607843137254901,
"grad_norm": 0.6765384674072266,
"learning_rate": 1.9303132634305392e-05,
"loss": 0.3582,
"step": 596
},
{
"epoch": 1.4632352941176472,
"grad_norm": 0.597810685634613,
"learning_rate": 1.9300217827219676e-05,
"loss": 0.3902,
"step": 597
},
{
"epoch": 1.465686274509804,
"grad_norm": 0.5685702562332153,
"learning_rate": 1.929729715793245e-05,
"loss": 0.3771,
"step": 598
},
{
"epoch": 1.468137254901961,
"grad_norm": 0.5223026275634766,
"learning_rate": 1.9294370628284707e-05,
"loss": 0.3532,
"step": 599
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.7328498363494873,
"learning_rate": 1.9291438240121122e-05,
"loss": 0.355,
"step": 600
},
{
"epoch": 1.4730392156862746,
"grad_norm": 0.594637930393219,
"learning_rate": 1.928849999529006e-05,
"loss": 0.3601,
"step": 601
},
{
"epoch": 1.4754901960784315,
"grad_norm": 0.550162136554718,
"learning_rate": 1.928555589564359e-05,
"loss": 0.3548,
"step": 602
},
{
"epoch": 1.4779411764705883,
"grad_norm": 0.5517258048057556,
"learning_rate": 1.928260594303746e-05,
"loss": 0.3655,
"step": 603
},
{
"epoch": 1.4803921568627452,
"grad_norm": 0.5857623815536499,
"learning_rate": 1.927965013933112e-05,
"loss": 0.3761,
"step": 604
},
{
"epoch": 1.482843137254902,
"grad_norm": 0.6127851009368896,
"learning_rate": 1.9276688486387692e-05,
"loss": 0.378,
"step": 605
},
{
"epoch": 1.4852941176470589,
"grad_norm": 0.5628093481063843,
"learning_rate": 1.9273720986073995e-05,
"loss": 0.3433,
"step": 606
},
{
"epoch": 1.4877450980392157,
"grad_norm": 0.5430567264556885,
"learning_rate": 1.9270747640260533e-05,
"loss": 0.3432,
"step": 607
},
{
"epoch": 1.4901960784313726,
"grad_norm": 0.5946584939956665,
"learning_rate": 1.9267768450821493e-05,
"loss": 0.3542,
"step": 608
},
{
"epoch": 1.4926470588235294,
"grad_norm": 0.5365927815437317,
"learning_rate": 1.9264783419634748e-05,
"loss": 0.3346,
"step": 609
},
{
"epoch": 1.4950980392156863,
"grad_norm": 0.5608130693435669,
"learning_rate": 1.9261792548581848e-05,
"loss": 0.3503,
"step": 610
},
{
"epoch": 1.4975490196078431,
"grad_norm": 0.5951604843139648,
"learning_rate": 1.925879583954803e-05,
"loss": 0.3894,
"step": 611
},
{
"epoch": 1.5,
"grad_norm": 0.6033993363380432,
"learning_rate": 1.925579329442221e-05,
"loss": 0.3865,
"step": 612
},
{
"epoch": 1.5024509803921569,
"grad_norm": 0.5504310131072998,
"learning_rate": 1.925278491509697e-05,
"loss": 0.3646,
"step": 613
},
{
"epoch": 1.5049019607843137,
"grad_norm": 0.7966532111167908,
"learning_rate": 1.9249770703468592e-05,
"loss": 0.3734,
"step": 614
},
{
"epoch": 1.5073529411764706,
"grad_norm": 0.6212445497512817,
"learning_rate": 1.924675066143702e-05,
"loss": 0.3653,
"step": 615
},
{
"epoch": 1.5098039215686274,
"grad_norm": 0.4999699592590332,
"learning_rate": 1.924372479090587e-05,
"loss": 0.3057,
"step": 616
},
{
"epoch": 1.5122549019607843,
"grad_norm": 0.5370170474052429,
"learning_rate": 1.9240693093782442e-05,
"loss": 0.3496,
"step": 617
},
{
"epoch": 1.5147058823529411,
"grad_norm": 0.6104731559753418,
"learning_rate": 1.9237655571977708e-05,
"loss": 0.3615,
"step": 618
},
{
"epoch": 1.517156862745098,
"grad_norm": 0.5741218328475952,
"learning_rate": 1.9234612227406296e-05,
"loss": 0.3753,
"step": 619
},
{
"epoch": 1.5196078431372548,
"grad_norm": 0.5282995700836182,
"learning_rate": 1.9231563061986528e-05,
"loss": 0.3507,
"step": 620
},
{
"epoch": 1.5220588235294117,
"grad_norm": 0.5602993369102478,
"learning_rate": 1.9228508077640374e-05,
"loss": 0.3759,
"step": 621
},
{
"epoch": 1.5245098039215685,
"grad_norm": 0.5278483033180237,
"learning_rate": 1.9225447276293486e-05,
"loss": 0.3674,
"step": 622
},
{
"epoch": 1.5269607843137254,
"grad_norm": 0.5521953105926514,
"learning_rate": 1.9222380659875176e-05,
"loss": 0.376,
"step": 623
},
{
"epoch": 1.5294117647058822,
"grad_norm": 0.587298572063446,
"learning_rate": 1.921930823031842e-05,
"loss": 0.3748,
"step": 624
},
{
"epoch": 1.531862745098039,
"grad_norm": 0.609389066696167,
"learning_rate": 1.921622998955986e-05,
"loss": 0.3689,
"step": 625
},
{
"epoch": 1.534313725490196,
"grad_norm": 0.5267477035522461,
"learning_rate": 1.921314593953981e-05,
"loss": 0.3614,
"step": 626
},
{
"epoch": 1.5367647058823528,
"grad_norm": 0.6144201755523682,
"learning_rate": 1.9210056082202234e-05,
"loss": 0.366,
"step": 627
},
{
"epoch": 1.5392156862745097,
"grad_norm": 0.5221917629241943,
"learning_rate": 1.920696041949476e-05,
"loss": 0.3554,
"step": 628
},
{
"epoch": 1.5416666666666665,
"grad_norm": 0.5637856125831604,
"learning_rate": 1.920385895336867e-05,
"loss": 0.3812,
"step": 629
},
{
"epoch": 1.5441176470588234,
"grad_norm": 0.556053102016449,
"learning_rate": 1.9200751685778915e-05,
"loss": 0.3499,
"step": 630
},
{
"epoch": 1.5465686274509802,
"grad_norm": 0.5361911058425903,
"learning_rate": 1.9197638618684098e-05,
"loss": 0.3597,
"step": 631
},
{
"epoch": 1.5490196078431373,
"grad_norm": 0.6157797574996948,
"learning_rate": 1.9194519754046476e-05,
"loss": 0.3614,
"step": 632
},
{
"epoch": 1.5514705882352942,
"grad_norm": 0.6103066205978394,
"learning_rate": 1.9191395093831955e-05,
"loss": 0.3719,
"step": 633
},
{
"epoch": 1.553921568627451,
"grad_norm": 0.5483396053314209,
"learning_rate": 1.918826464001011e-05,
"loss": 0.3564,
"step": 634
},
{
"epoch": 1.5563725490196079,
"grad_norm": 0.5710693597793579,
"learning_rate": 1.9185128394554154e-05,
"loss": 0.3686,
"step": 635
},
{
"epoch": 1.5588235294117647,
"grad_norm": 0.5463498830795288,
"learning_rate": 1.9181986359440957e-05,
"loss": 0.36,
"step": 636
},
{
"epoch": 1.5612745098039216,
"grad_norm": 0.6029631495475769,
"learning_rate": 1.9178838536651033e-05,
"loss": 0.3666,
"step": 637
},
{
"epoch": 1.5637254901960784,
"grad_norm": 0.5595495700836182,
"learning_rate": 1.9175684928168552e-05,
"loss": 0.3738,
"step": 638
},
{
"epoch": 1.5661764705882353,
"grad_norm": 0.5660778880119324,
"learning_rate": 1.917252553598132e-05,
"loss": 0.3685,
"step": 639
},
{
"epoch": 1.5686274509803921,
"grad_norm": 0.8197193741798401,
"learning_rate": 1.91693603620808e-05,
"loss": 0.3671,
"step": 640
},
{
"epoch": 1.571078431372549,
"grad_norm": 0.5595434904098511,
"learning_rate": 1.9166189408462096e-05,
"loss": 0.366,
"step": 641
},
{
"epoch": 1.5735294117647058,
"grad_norm": 0.5566771626472473,
"learning_rate": 1.9163012677123947e-05,
"loss": 0.348,
"step": 642
},
{
"epoch": 1.5759803921568627,
"grad_norm": 0.5822195410728455,
"learning_rate": 1.9159830170068743e-05,
"loss": 0.3826,
"step": 643
},
{
"epoch": 1.5784313725490198,
"grad_norm": 0.7989565134048462,
"learning_rate": 1.915664188930252e-05,
"loss": 0.3695,
"step": 644
},
{
"epoch": 1.5808823529411766,
"grad_norm": 0.5835515260696411,
"learning_rate": 1.9153447836834935e-05,
"loss": 0.363,
"step": 645
},
{
"epoch": 1.5833333333333335,
"grad_norm": 0.6216723918914795,
"learning_rate": 1.9150248014679294e-05,
"loss": 0.3678,
"step": 646
},
{
"epoch": 1.5857843137254903,
"grad_norm": 0.5697610974311829,
"learning_rate": 1.9147042424852546e-05,
"loss": 0.3532,
"step": 647
},
{
"epoch": 1.5882352941176472,
"grad_norm": 0.5751510858535767,
"learning_rate": 1.9143831069375264e-05,
"loss": 0.3592,
"step": 648
},
{
"epoch": 1.590686274509804,
"grad_norm": 0.9562742710113525,
"learning_rate": 1.914061395027166e-05,
"loss": 0.3721,
"step": 649
},
{
"epoch": 1.593137254901961,
"grad_norm": 0.5464246273040771,
"learning_rate": 1.9137391069569584e-05,
"loss": 0.3804,
"step": 650
},
{
"epoch": 1.5955882352941178,
"grad_norm": 0.5823296308517456,
"learning_rate": 1.9134162429300506e-05,
"loss": 0.3747,
"step": 651
},
{
"epoch": 1.5980392156862746,
"grad_norm": 0.5940808653831482,
"learning_rate": 1.9130928031499536e-05,
"loss": 0.3381,
"step": 652
},
{
"epoch": 1.6004901960784315,
"grad_norm": 0.5946800112724304,
"learning_rate": 1.912768787820541e-05,
"loss": 0.3782,
"step": 653
},
{
"epoch": 1.6029411764705883,
"grad_norm": 0.6195725798606873,
"learning_rate": 1.9124441971460493e-05,
"loss": 0.3853,
"step": 654
},
{
"epoch": 1.6053921568627452,
"grad_norm": 0.5833792686462402,
"learning_rate": 1.9121190313310777e-05,
"loss": 0.3596,
"step": 655
},
{
"epoch": 1.607843137254902,
"grad_norm": 0.6532778143882751,
"learning_rate": 1.9117932905805873e-05,
"loss": 0.3551,
"step": 656
},
{
"epoch": 1.6102941176470589,
"grad_norm": 0.6050318479537964,
"learning_rate": 1.9114669750999025e-05,
"loss": 0.3727,
"step": 657
},
{
"epoch": 1.6127450980392157,
"grad_norm": 0.6293352246284485,
"learning_rate": 1.9111400850947098e-05,
"loss": 0.3582,
"step": 658
},
{
"epoch": 1.6151960784313726,
"grad_norm": 0.8406941890716553,
"learning_rate": 1.910812620771057e-05,
"loss": 0.3456,
"step": 659
},
{
"epoch": 1.6176470588235294,
"grad_norm": 0.5856779217720032,
"learning_rate": 1.9104845823353545e-05,
"loss": 0.375,
"step": 660
},
{
"epoch": 1.6200980392156863,
"grad_norm": 0.5750583410263062,
"learning_rate": 1.9101559699943756e-05,
"loss": 0.3821,
"step": 661
},
{
"epoch": 1.6225490196078431,
"grad_norm": 0.566253125667572,
"learning_rate": 1.909826783955253e-05,
"loss": 0.3645,
"step": 662
},
{
"epoch": 1.625,
"grad_norm": 0.6087077260017395,
"learning_rate": 1.9094970244254836e-05,
"loss": 0.37,
"step": 663
},
{
"epoch": 1.6274509803921569,
"grad_norm": 0.5437163710594177,
"learning_rate": 1.909166691612924e-05,
"loss": 0.3742,
"step": 664
},
{
"epoch": 1.6299019607843137,
"grad_norm": 0.5609057545661926,
"learning_rate": 1.9088357857257923e-05,
"loss": 0.367,
"step": 665
},
{
"epoch": 1.6323529411764706,
"grad_norm": 0.6074764728546143,
"learning_rate": 1.9085043069726684e-05,
"loss": 0.3539,
"step": 666
},
{
"epoch": 1.6348039215686274,
"grad_norm": 0.5492837429046631,
"learning_rate": 1.908172255562494e-05,
"loss": 0.3449,
"step": 667
},
{
"epoch": 1.6372549019607843,
"grad_norm": 0.5852351784706116,
"learning_rate": 1.90783963170457e-05,
"loss": 0.3867,
"step": 668
},
{
"epoch": 1.6397058823529411,
"grad_norm": 0.713809072971344,
"learning_rate": 1.907506435608559e-05,
"loss": 0.3766,
"step": 669
},
{
"epoch": 1.642156862745098,
"grad_norm": 0.5536720156669617,
"learning_rate": 1.9071726674844854e-05,
"loss": 0.3771,
"step": 670
},
{
"epoch": 1.6446078431372548,
"grad_norm": 0.6789511442184448,
"learning_rate": 1.9068383275427314e-05,
"loss": 0.3565,
"step": 671
},
{
"epoch": 1.6470588235294117,
"grad_norm": 0.5678662061691284,
"learning_rate": 1.9065034159940428e-05,
"loss": 0.3606,
"step": 672
},
{
"epoch": 1.6495098039215685,
"grad_norm": 0.5874570608139038,
"learning_rate": 1.9061679330495234e-05,
"loss": 0.399,
"step": 673
},
{
"epoch": 1.6519607843137254,
"grad_norm": 0.6090113520622253,
"learning_rate": 1.9058318789206383e-05,
"loss": 0.3683,
"step": 674
},
{
"epoch": 1.6544117647058822,
"grad_norm": 0.5444072484970093,
"learning_rate": 1.9054952538192122e-05,
"loss": 0.3506,
"step": 675
},
{
"epoch": 1.656862745098039,
"grad_norm": 0.5434143543243408,
"learning_rate": 1.90515805795743e-05,
"loss": 0.3918,
"step": 676
},
{
"epoch": 1.659313725490196,
"grad_norm": 0.5712372064590454,
"learning_rate": 1.9048202915478356e-05,
"loss": 0.3693,
"step": 677
},
{
"epoch": 1.6617647058823528,
"grad_norm": 0.54757159948349,
"learning_rate": 1.904481954803334e-05,
"loss": 0.3495,
"step": 678
},
{
"epoch": 1.6642156862745097,
"grad_norm": 0.5374836921691895,
"learning_rate": 1.9041430479371885e-05,
"loss": 0.3622,
"step": 679
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.8362724781036377,
"learning_rate": 1.903803571163022e-05,
"loss": 0.384,
"step": 680
},
{
"epoch": 1.6691176470588234,
"grad_norm": 0.5432379841804504,
"learning_rate": 1.9034635246948164e-05,
"loss": 0.3666,
"step": 681
},
{
"epoch": 1.6715686274509802,
"grad_norm": 0.5300214290618896,
"learning_rate": 1.903122908746914e-05,
"loss": 0.3223,
"step": 682
},
{
"epoch": 1.6740196078431373,
"grad_norm": 0.5180357694625854,
"learning_rate": 1.9027817235340136e-05,
"loss": 0.3508,
"step": 683
},
{
"epoch": 1.6764705882352942,
"grad_norm": 0.6264671683311462,
"learning_rate": 1.9024399692711758e-05,
"loss": 0.3756,
"step": 684
},
{
"epoch": 1.678921568627451,
"grad_norm": 0.5713222622871399,
"learning_rate": 1.9020976461738175e-05,
"loss": 0.3509,
"step": 685
},
{
"epoch": 1.6813725490196079,
"grad_norm": 0.5600945949554443,
"learning_rate": 1.901754754457715e-05,
"loss": 0.3565,
"step": 686
},
{
"epoch": 1.6838235294117647,
"grad_norm": 0.5659016966819763,
"learning_rate": 1.901411294339004e-05,
"loss": 0.383,
"step": 687
},
{
"epoch": 1.6862745098039216,
"grad_norm": 1.0765410661697388,
"learning_rate": 1.9010672660341764e-05,
"loss": 0.3548,
"step": 688
},
{
"epoch": 1.6887254901960784,
"grad_norm": 0.6318280696868896,
"learning_rate": 1.9007226697600836e-05,
"loss": 0.3852,
"step": 689
},
{
"epoch": 1.6911764705882353,
"grad_norm": 0.5683992505073547,
"learning_rate": 1.9003775057339354e-05,
"loss": 0.3582,
"step": 690
},
{
"epoch": 1.6936274509803921,
"grad_norm": 0.5141113996505737,
"learning_rate": 1.9000317741732984e-05,
"loss": 0.3406,
"step": 691
},
{
"epoch": 1.696078431372549,
"grad_norm": 0.5886033773422241,
"learning_rate": 1.8996854752960975e-05,
"loss": 0.373,
"step": 692
},
{
"epoch": 1.6985294117647058,
"grad_norm": 0.5503585934638977,
"learning_rate": 1.899338609320615e-05,
"loss": 0.3724,
"step": 693
},
{
"epoch": 1.7009803921568627,
"grad_norm": 0.542449414730072,
"learning_rate": 1.898991176465491e-05,
"loss": 0.3549,
"step": 694
},
{
"epoch": 1.7034313725490198,
"grad_norm": 0.6106210350990295,
"learning_rate": 1.898643176949722e-05,
"loss": 0.3981,
"step": 695
},
{
"epoch": 1.7058823529411766,
"grad_norm": 0.5234098434448242,
"learning_rate": 1.8982946109926638e-05,
"loss": 0.3813,
"step": 696
},
{
"epoch": 1.7083333333333335,
"grad_norm": 0.6064922213554382,
"learning_rate": 1.8979454788140262e-05,
"loss": 0.3578,
"step": 697
},
{
"epoch": 1.7107843137254903,
"grad_norm": 0.5252379179000854,
"learning_rate": 1.897595780633878e-05,
"loss": 0.3576,
"step": 698
},
{
"epoch": 1.7132352941176472,
"grad_norm": 0.5773696899414062,
"learning_rate": 1.897245516672645e-05,
"loss": 0.3591,
"step": 699
},
{
"epoch": 1.715686274509804,
"grad_norm": 0.5730868577957153,
"learning_rate": 1.8968946871511078e-05,
"loss": 0.3651,
"step": 700
},
{
"epoch": 1.718137254901961,
"grad_norm": 1.1624000072479248,
"learning_rate": 1.8965432922904057e-05,
"loss": 0.3836,
"step": 701
},
{
"epoch": 1.7205882352941178,
"grad_norm": 0.5790229439735413,
"learning_rate": 1.8961913323120323e-05,
"loss": 0.3667,
"step": 702
},
{
"epoch": 1.7230392156862746,
"grad_norm": 0.5452889204025269,
"learning_rate": 1.895838807437839e-05,
"loss": 0.3526,
"step": 703
},
{
"epoch": 1.7254901960784315,
"grad_norm": 0.6192535758018494,
"learning_rate": 1.895485717890032e-05,
"loss": 0.395,
"step": 704
},
{
"epoch": 1.7279411764705883,
"grad_norm": 0.591917872428894,
"learning_rate": 1.8951320638911745e-05,
"loss": 0.3891,
"step": 705
},
{
"epoch": 1.7303921568627452,
"grad_norm": 0.6142959594726562,
"learning_rate": 1.894777845664185e-05,
"loss": 0.3665,
"step": 706
},
{
"epoch": 1.732843137254902,
"grad_norm": 0.572198212146759,
"learning_rate": 1.8944230634323377e-05,
"loss": 0.3599,
"step": 707
},
{
"epoch": 1.7352941176470589,
"grad_norm": 0.5875250697135925,
"learning_rate": 1.894067717419262e-05,
"loss": 0.3701,
"step": 708
},
{
"epoch": 1.7377450980392157,
"grad_norm": 0.5331512093544006,
"learning_rate": 1.8937118078489435e-05,
"loss": 0.3582,
"step": 709
},
{
"epoch": 1.7401960784313726,
"grad_norm": 0.5872368812561035,
"learning_rate": 1.893355334945722e-05,
"loss": 0.3366,
"step": 710
},
{
"epoch": 1.7426470588235294,
"grad_norm": 0.573293149471283,
"learning_rate": 1.8929982989342933e-05,
"loss": 0.3657,
"step": 711
},
{
"epoch": 1.7450980392156863,
"grad_norm": 0.7099412083625793,
"learning_rate": 1.8926407000397074e-05,
"loss": 0.34,
"step": 712
},
{
"epoch": 1.7475490196078431,
"grad_norm": 0.6928579807281494,
"learning_rate": 1.8922825384873692e-05,
"loss": 0.3721,
"step": 713
},
{
"epoch": 1.75,
"grad_norm": 0.5628637671470642,
"learning_rate": 1.891923814503039e-05,
"loss": 0.3329,
"step": 714
},
{
"epoch": 1.7524509803921569,
"grad_norm": 0.5792011618614197,
"learning_rate": 1.8915645283128313e-05,
"loss": 0.3665,
"step": 715
},
{
"epoch": 1.7549019607843137,
"grad_norm": 0.5537716746330261,
"learning_rate": 1.8912046801432143e-05,
"loss": 0.3686,
"step": 716
},
{
"epoch": 1.7573529411764706,
"grad_norm": 0.5735242366790771,
"learning_rate": 1.8908442702210112e-05,
"loss": 0.3559,
"step": 717
},
{
"epoch": 1.7598039215686274,
"grad_norm": 0.6453959345817566,
"learning_rate": 1.8904832987733983e-05,
"loss": 0.3731,
"step": 718
},
{
"epoch": 1.7622549019607843,
"grad_norm": 0.5662568211555481,
"learning_rate": 1.8901217660279073e-05,
"loss": 0.3559,
"step": 719
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.5591050982475281,
"learning_rate": 1.8897596722124227e-05,
"loss": 0.3377,
"step": 720
},
{
"epoch": 1.767156862745098,
"grad_norm": 0.5754038691520691,
"learning_rate": 1.8893970175551827e-05,
"loss": 0.3536,
"step": 721
},
{
"epoch": 1.7696078431372548,
"grad_norm": 0.6125769019126892,
"learning_rate": 1.8890338022847797e-05,
"loss": 0.3546,
"step": 722
},
{
"epoch": 1.7720588235294117,
"grad_norm": 0.6129846572875977,
"learning_rate": 1.8886700266301585e-05,
"loss": 0.3756,
"step": 723
},
{
"epoch": 1.7745098039215685,
"grad_norm": 0.5618334412574768,
"learning_rate": 1.888305690820618e-05,
"loss": 0.385,
"step": 724
},
{
"epoch": 1.7769607843137254,
"grad_norm": 0.8084755539894104,
"learning_rate": 1.88794079508581e-05,
"loss": 0.3549,
"step": 725
},
{
"epoch": 1.7794117647058822,
"grad_norm": 0.49203312397003174,
"learning_rate": 1.8875753396557385e-05,
"loss": 0.3266,
"step": 726
},
{
"epoch": 1.781862745098039,
"grad_norm": 0.5416219830513,
"learning_rate": 1.8872093247607605e-05,
"loss": 0.3552,
"step": 727
},
{
"epoch": 1.784313725490196,
"grad_norm": 0.6262273192405701,
"learning_rate": 1.886842750631587e-05,
"loss": 0.3592,
"step": 728
},
{
"epoch": 1.7867647058823528,
"grad_norm": 0.5882182717323303,
"learning_rate": 1.88647561749928e-05,
"loss": 0.3628,
"step": 729
},
{
"epoch": 1.7892156862745097,
"grad_norm": 0.5331644415855408,
"learning_rate": 1.8861079255952544e-05,
"loss": 0.3576,
"step": 730
},
{
"epoch": 1.7916666666666665,
"grad_norm": 0.6353577971458435,
"learning_rate": 1.885739675151277e-05,
"loss": 0.3608,
"step": 731
},
{
"epoch": 1.7941176470588234,
"grad_norm": 0.5356265306472778,
"learning_rate": 1.885370866399467e-05,
"loss": 0.3776,
"step": 732
},
{
"epoch": 1.7965686274509802,
"grad_norm": 0.6093118786811829,
"learning_rate": 1.885001499572296e-05,
"loss": 0.373,
"step": 733
},
{
"epoch": 1.7990196078431373,
"grad_norm": 0.5321884155273438,
"learning_rate": 1.884631574902586e-05,
"loss": 0.3432,
"step": 734
},
{
"epoch": 1.8014705882352942,
"grad_norm": 0.5564878582954407,
"learning_rate": 1.8842610926235118e-05,
"loss": 0.3704,
"step": 735
},
{
"epoch": 1.803921568627451,
"grad_norm": 0.5606857538223267,
"learning_rate": 1.883890052968599e-05,
"loss": 0.3798,
"step": 736
},
{
"epoch": 1.8063725490196079,
"grad_norm": 0.5832607746124268,
"learning_rate": 1.8835184561717253e-05,
"loss": 0.3604,
"step": 737
},
{
"epoch": 1.8088235294117647,
"grad_norm": 0.5438320636749268,
"learning_rate": 1.883146302467119e-05,
"loss": 0.3499,
"step": 738
},
{
"epoch": 1.8112745098039216,
"grad_norm": 0.5336952209472656,
"learning_rate": 1.8827735920893592e-05,
"loss": 0.357,
"step": 739
},
{
"epoch": 1.8137254901960784,
"grad_norm": 0.636478841304779,
"learning_rate": 1.8824003252733767e-05,
"loss": 0.3493,
"step": 740
},
{
"epoch": 1.8161764705882353,
"grad_norm": 0.5961681604385376,
"learning_rate": 1.882026502254452e-05,
"loss": 0.3381,
"step": 741
},
{
"epoch": 1.8186274509803921,
"grad_norm": 0.5795152187347412,
"learning_rate": 1.881652123268218e-05,
"loss": 0.3718,
"step": 742
},
{
"epoch": 1.821078431372549,
"grad_norm": 0.6357633471488953,
"learning_rate": 1.881277188550655e-05,
"loss": 0.372,
"step": 743
},
{
"epoch": 1.8235294117647058,
"grad_norm": 0.7582595944404602,
"learning_rate": 1.880901698338097e-05,
"loss": 0.3662,
"step": 744
},
{
"epoch": 1.8259803921568627,
"grad_norm": 0.6817951798439026,
"learning_rate": 1.8805256528672257e-05,
"loss": 0.3403,
"step": 745
},
{
"epoch": 1.8284313725490198,
"grad_norm": 0.538033664226532,
"learning_rate": 1.880149052375074e-05,
"loss": 0.3518,
"step": 746
},
{
"epoch": 1.8308823529411766,
"grad_norm": 0.5820069313049316,
"learning_rate": 1.879771897099024e-05,
"loss": 0.3467,
"step": 747
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.6112432479858398,
"learning_rate": 1.879394187276808e-05,
"loss": 0.3564,
"step": 748
},
{
"epoch": 1.8357843137254903,
"grad_norm": 0.5884008407592773,
"learning_rate": 1.8790159231465078e-05,
"loss": 0.3801,
"step": 749
},
{
"epoch": 1.8382352941176472,
"grad_norm": 0.5615097880363464,
"learning_rate": 1.8786371049465543e-05,
"loss": 0.3692,
"step": 750
},
{
"epoch": 1.840686274509804,
"grad_norm": 0.6377096176147461,
"learning_rate": 1.8782577329157273e-05,
"loss": 0.3817,
"step": 751
},
{
"epoch": 1.843137254901961,
"grad_norm": 0.5488484501838684,
"learning_rate": 1.8778778072931567e-05,
"loss": 0.3564,
"step": 752
},
{
"epoch": 1.8455882352941178,
"grad_norm": 0.7228192090988159,
"learning_rate": 1.8774973283183208e-05,
"loss": 0.3433,
"step": 753
},
{
"epoch": 1.8480392156862746,
"grad_norm": 0.7717460989952087,
"learning_rate": 1.8771162962310466e-05,
"loss": 0.3613,
"step": 754
},
{
"epoch": 1.8504901960784315,
"grad_norm": 0.5413428544998169,
"learning_rate": 1.87673471127151e-05,
"loss": 0.3423,
"step": 755
},
{
"epoch": 1.8529411764705883,
"grad_norm": 0.5849916338920593,
"learning_rate": 1.876352573680235e-05,
"loss": 0.3785,
"step": 756
},
{
"epoch": 1.8553921568627452,
"grad_norm": 0.5505291819572449,
"learning_rate": 1.875969883698094e-05,
"loss": 0.3516,
"step": 757
},
{
"epoch": 1.857843137254902,
"grad_norm": 0.5991479754447937,
"learning_rate": 1.8755866415663084e-05,
"loss": 0.3487,
"step": 758
},
{
"epoch": 1.8602941176470589,
"grad_norm": 0.8881350159645081,
"learning_rate": 1.8752028475264467e-05,
"loss": 0.3489,
"step": 759
},
{
"epoch": 1.8627450980392157,
"grad_norm": 0.596832811832428,
"learning_rate": 1.8748185018204256e-05,
"loss": 0.3654,
"step": 760
},
{
"epoch": 1.8651960784313726,
"grad_norm": 0.9907362461090088,
"learning_rate": 1.8744336046905094e-05,
"loss": 0.3516,
"step": 761
},
{
"epoch": 1.8676470588235294,
"grad_norm": 0.6258214116096497,
"learning_rate": 1.8740481563793108e-05,
"loss": 0.3815,
"step": 762
},
{
"epoch": 1.8700980392156863,
"grad_norm": 0.6085917353630066,
"learning_rate": 1.873662157129788e-05,
"loss": 0.3564,
"step": 763
},
{
"epoch": 1.8725490196078431,
"grad_norm": 0.6735434532165527,
"learning_rate": 1.8732756071852488e-05,
"loss": 0.3946,
"step": 764
},
{
"epoch": 1.875,
"grad_norm": 0.6309966444969177,
"learning_rate": 1.8728885067893467e-05,
"loss": 0.3858,
"step": 765
},
{
"epoch": 1.8774509803921569,
"grad_norm": 0.6150327920913696,
"learning_rate": 1.8725008561860823e-05,
"loss": 0.3848,
"step": 766
},
{
"epoch": 1.8799019607843137,
"grad_norm": 0.6120826601982117,
"learning_rate": 1.872112655619803e-05,
"loss": 0.3622,
"step": 767
},
{
"epoch": 1.8823529411764706,
"grad_norm": 0.5882266759872437,
"learning_rate": 1.8717239053352038e-05,
"loss": 0.3653,
"step": 768
},
{
"epoch": 1.8848039215686274,
"grad_norm": 1.1109150648117065,
"learning_rate": 1.871334605577325e-05,
"loss": 0.3978,
"step": 769
},
{
"epoch": 1.8872549019607843,
"grad_norm": 0.6655982732772827,
"learning_rate": 1.870944756591554e-05,
"loss": 0.3778,
"step": 770
},
{
"epoch": 1.8897058823529411,
"grad_norm": 0.6111178994178772,
"learning_rate": 1.8705543586236233e-05,
"loss": 0.3748,
"step": 771
},
{
"epoch": 1.892156862745098,
"grad_norm": 0.5709286332130432,
"learning_rate": 1.870163411919613e-05,
"loss": 0.3633,
"step": 772
},
{
"epoch": 1.8946078431372548,
"grad_norm": 0.5684765577316284,
"learning_rate": 1.8697719167259484e-05,
"loss": 0.3675,
"step": 773
},
{
"epoch": 1.8970588235294117,
"grad_norm": 0.581131637096405,
"learning_rate": 1.8693798732894e-05,
"loss": 0.3744,
"step": 774
},
{
"epoch": 1.8995098039215685,
"grad_norm": 0.6335357427597046,
"learning_rate": 1.8689872818570843e-05,
"loss": 0.3651,
"step": 775
},
{
"epoch": 1.9019607843137254,
"grad_norm": 0.5542675256729126,
"learning_rate": 1.868594142676464e-05,
"loss": 0.3131,
"step": 776
},
{
"epoch": 1.9044117647058822,
"grad_norm": 0.6895968317985535,
"learning_rate": 1.8682004559953457e-05,
"loss": 0.3884,
"step": 777
},
{
"epoch": 1.906862745098039,
"grad_norm": 0.6183537244796753,
"learning_rate": 1.8678062220618817e-05,
"loss": 0.3446,
"step": 778
},
{
"epoch": 1.909313725490196,
"grad_norm": 0.6690419316291809,
"learning_rate": 1.8674114411245697e-05,
"loss": 0.3611,
"step": 779
},
{
"epoch": 1.9117647058823528,
"grad_norm": 0.5871723890304565,
"learning_rate": 1.867016113432251e-05,
"loss": 0.3328,
"step": 780
},
{
"epoch": 1.9142156862745097,
"grad_norm": 0.61688232421875,
"learning_rate": 1.8666202392341134e-05,
"loss": 0.3648,
"step": 781
},
{
"epoch": 1.9166666666666665,
"grad_norm": 0.575491189956665,
"learning_rate": 1.8662238187796874e-05,
"loss": 0.363,
"step": 782
},
{
"epoch": 1.9191176470588234,
"grad_norm": 0.6746342778205872,
"learning_rate": 1.8658268523188483e-05,
"loss": 0.3786,
"step": 783
},
{
"epoch": 1.9215686274509802,
"grad_norm": 1.0063936710357666,
"learning_rate": 1.8654293401018163e-05,
"loss": 0.3564,
"step": 784
},
{
"epoch": 1.9240196078431373,
"grad_norm": 0.7424333691596985,
"learning_rate": 1.8650312823791545e-05,
"loss": 0.3574,
"step": 785
},
{
"epoch": 1.9264705882352942,
"grad_norm": 0.689834713935852,
"learning_rate": 1.864632679401771e-05,
"loss": 0.393,
"step": 786
},
{
"epoch": 1.928921568627451,
"grad_norm": 0.5924261212348938,
"learning_rate": 1.8642335314209164e-05,
"loss": 0.3698,
"step": 787
},
{
"epoch": 1.9313725490196079,
"grad_norm": 1.25295090675354,
"learning_rate": 1.8638338386881863e-05,
"loss": 0.3357,
"step": 788
},
{
"epoch": 1.9338235294117647,
"grad_norm": 0.724563717842102,
"learning_rate": 1.8634336014555178e-05,
"loss": 0.3652,
"step": 789
},
{
"epoch": 1.9362745098039216,
"grad_norm": 0.7059599161148071,
"learning_rate": 1.8630328199751928e-05,
"loss": 0.3686,
"step": 790
},
{
"epoch": 1.9387254901960784,
"grad_norm": 0.6653025150299072,
"learning_rate": 1.862631494499836e-05,
"loss": 0.3976,
"step": 791
},
{
"epoch": 1.9411764705882353,
"grad_norm": 0.6979086995124817,
"learning_rate": 1.8622296252824138e-05,
"loss": 0.3512,
"step": 792
},
{
"epoch": 1.9436274509803921,
"grad_norm": 0.5850315093994141,
"learning_rate": 1.8618272125762367e-05,
"loss": 0.3551,
"step": 793
},
{
"epoch": 1.946078431372549,
"grad_norm": 0.5560693144798279,
"learning_rate": 1.8614242566349576e-05,
"loss": 0.3364,
"step": 794
},
{
"epoch": 1.9485294117647058,
"grad_norm": 0.6023897528648376,
"learning_rate": 1.8610207577125708e-05,
"loss": 0.3524,
"step": 795
},
{
"epoch": 1.9509803921568627,
"grad_norm": 0.6868537068367004,
"learning_rate": 1.8606167160634145e-05,
"loss": 0.3481,
"step": 796
},
{
"epoch": 1.9534313725490198,
"grad_norm": 0.6240056157112122,
"learning_rate": 1.8602121319421674e-05,
"loss": 0.3535,
"step": 797
},
{
"epoch": 1.9558823529411766,
"grad_norm": 0.5703122615814209,
"learning_rate": 1.859807005603851e-05,
"loss": 0.3782,
"step": 798
},
{
"epoch": 1.9583333333333335,
"grad_norm": 0.6061028242111206,
"learning_rate": 1.8594013373038278e-05,
"loss": 0.34,
"step": 799
},
{
"epoch": 1.9607843137254903,
"grad_norm": 0.6340641379356384,
"learning_rate": 1.8589951272978034e-05,
"loss": 0.3665,
"step": 800
},
{
"epoch": 1.9632352941176472,
"grad_norm": 0.5348547697067261,
"learning_rate": 1.8585883758418232e-05,
"loss": 0.3573,
"step": 801
},
{
"epoch": 1.965686274509804,
"grad_norm": 0.5754359364509583,
"learning_rate": 1.8581810831922757e-05,
"loss": 0.3609,
"step": 802
},
{
"epoch": 1.968137254901961,
"grad_norm": 0.6445683836936951,
"learning_rate": 1.857773249605888e-05,
"loss": 0.3682,
"step": 803
},
{
"epoch": 1.9705882352941178,
"grad_norm": 0.5709390640258789,
"learning_rate": 1.8573648753397306e-05,
"loss": 0.3489,
"step": 804
},
{
"epoch": 1.9730392156862746,
"grad_norm": 0.7119299173355103,
"learning_rate": 1.856955960651214e-05,
"loss": 0.3579,
"step": 805
},
{
"epoch": 1.9754901960784315,
"grad_norm": 0.521127462387085,
"learning_rate": 1.8565465057980884e-05,
"loss": 0.3456,
"step": 806
},
{
"epoch": 1.9779411764705883,
"grad_norm": 0.6117939352989197,
"learning_rate": 1.856136511038446e-05,
"loss": 0.3708,
"step": 807
},
{
"epoch": 1.9803921568627452,
"grad_norm": 0.5785604119300842,
"learning_rate": 1.8557259766307177e-05,
"loss": 0.3403,
"step": 808
},
{
"epoch": 1.982843137254902,
"grad_norm": 0.5162942409515381,
"learning_rate": 1.8553149028336763e-05,
"loss": 0.3273,
"step": 809
},
{
"epoch": 1.9852941176470589,
"grad_norm": 0.5114243030548096,
"learning_rate": 1.8549032899064334e-05,
"loss": 0.3261,
"step": 810
},
{
"epoch": 1.9877450980392157,
"grad_norm": 0.5065172910690308,
"learning_rate": 1.854491138108441e-05,
"loss": 0.3491,
"step": 811
},
{
"epoch": 1.9901960784313726,
"grad_norm": 0.610893964767456,
"learning_rate": 1.8540784476994905e-05,
"loss": 0.3957,
"step": 812
},
{
"epoch": 1.9926470588235294,
"grad_norm": 0.5487807989120483,
"learning_rate": 1.8536652189397123e-05,
"loss": 0.3569,
"step": 813
},
{
"epoch": 1.9950980392156863,
"grad_norm": 0.5499549508094788,
"learning_rate": 1.8532514520895766e-05,
"loss": 0.3604,
"step": 814
},
{
"epoch": 1.9975490196078431,
"grad_norm": 0.5409987568855286,
"learning_rate": 1.852837147409894e-05,
"loss": 0.3568,
"step": 815
},
{
"epoch": 2.0,
"grad_norm": 0.6135896444320679,
"learning_rate": 1.852422305161812e-05,
"loss": 0.3665,
"step": 816
},
{
"epoch": 2.0,
"eval_loss": 0.38614732027053833,
"eval_runtime": 54.5029,
"eval_samples_per_second": 91.738,
"eval_steps_per_second": 0.367,
"step": 816
}
],
"logging_steps": 1.0,
"max_steps": 4080,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.896285469533274e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}