{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 175,
"global_step": 1398,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001430615164520744,
"grad_norm": 7.826082229614258,
"learning_rate": 0.0,
"loss": 0.898,
"step": 1
},
{
"epoch": 0.001430615164520744,
"eval_loss": 1.065092921257019,
"eval_runtime": 65.026,
"eval_samples_per_second": 6.382,
"eval_steps_per_second": 0.4,
"step": 1
},
{
"epoch": 0.002861230329041488,
"grad_norm": 10.205927848815918,
"learning_rate": 3.7500000000000005e-08,
"loss": 0.9261,
"step": 2
},
{
"epoch": 0.004291845493562232,
"grad_norm": 3.9774727821350098,
"learning_rate": 7.500000000000001e-08,
"loss": 0.9309,
"step": 3
},
{
"epoch": 0.005722460658082976,
"grad_norm": 5.370663642883301,
"learning_rate": 1.125e-07,
"loss": 0.846,
"step": 4
},
{
"epoch": 0.00715307582260372,
"grad_norm": 6.776569843292236,
"learning_rate": 1.5000000000000002e-07,
"loss": 0.8782,
"step": 5
},
{
"epoch": 0.008583690987124463,
"grad_norm": 8.526254653930664,
"learning_rate": 1.875e-07,
"loss": 0.9247,
"step": 6
},
{
"epoch": 0.010014306151645207,
"grad_norm": 2.043957471847534,
"learning_rate": 2.25e-07,
"loss": 0.9349,
"step": 7
},
{
"epoch": 0.011444921316165951,
"grad_norm": 2.4873178005218506,
"learning_rate": 2.625e-07,
"loss": 0.8981,
"step": 8
},
{
"epoch": 0.012875536480686695,
"grad_norm": 4.598736763000488,
"learning_rate": 3.0000000000000004e-07,
"loss": 0.8809,
"step": 9
},
{
"epoch": 0.01430615164520744,
"grad_norm": 6.595153331756592,
"learning_rate": 3.375e-07,
"loss": 0.9229,
"step": 10
},
{
"epoch": 0.015736766809728183,
"grad_norm": 5.382663249969482,
"learning_rate": 3.75e-07,
"loss": 0.9396,
"step": 11
},
{
"epoch": 0.017167381974248927,
"grad_norm": 10.02416706085205,
"learning_rate": 4.125e-07,
"loss": 0.8546,
"step": 12
},
{
"epoch": 0.01859799713876967,
"grad_norm": 4.947641849517822,
"learning_rate": 4.5e-07,
"loss": 0.9159,
"step": 13
},
{
"epoch": 0.020028612303290415,
"grad_norm": 3.2930426597595215,
"learning_rate": 4.875e-07,
"loss": 0.9403,
"step": 14
},
{
"epoch": 0.02145922746781116,
"grad_norm": 24.454675674438477,
"learning_rate": 5.25e-07,
"loss": 0.8754,
"step": 15
},
{
"epoch": 0.022889842632331903,
"grad_norm": 7.453534126281738,
"learning_rate": 5.625e-07,
"loss": 0.9897,
"step": 16
},
{
"epoch": 0.024320457796852647,
"grad_norm": 8.125139236450195,
"learning_rate": 6.000000000000001e-07,
"loss": 0.9593,
"step": 17
},
{
"epoch": 0.02575107296137339,
"grad_norm": 8.038130760192871,
"learning_rate": 6.375e-07,
"loss": 0.9863,
"step": 18
},
{
"epoch": 0.027181688125894134,
"grad_norm": 10.386178016662598,
"learning_rate": 6.75e-07,
"loss": 0.9412,
"step": 19
},
{
"epoch": 0.02861230329041488,
"grad_norm": 9.146885871887207,
"learning_rate": 7.125e-07,
"loss": 0.8988,
"step": 20
},
{
"epoch": 0.030042918454935622,
"grad_norm": 6.290739059448242,
"learning_rate": 7.5e-07,
"loss": 0.8968,
"step": 21
},
{
"epoch": 0.031473533619456366,
"grad_norm": 2.8495869636535645,
"learning_rate": 7.875000000000001e-07,
"loss": 0.9229,
"step": 22
},
{
"epoch": 0.032904148783977114,
"grad_norm": 4.456954002380371,
"learning_rate": 8.25e-07,
"loss": 0.8605,
"step": 23
},
{
"epoch": 0.034334763948497854,
"grad_norm": 12.40089225769043,
"learning_rate": 8.625e-07,
"loss": 0.8897,
"step": 24
},
{
"epoch": 0.0357653791130186,
"grad_norm": 3.42988920211792,
"learning_rate": 9e-07,
"loss": 0.8653,
"step": 25
},
{
"epoch": 0.03719599427753934,
"grad_norm": 2.2468039989471436,
"learning_rate": 9.375e-07,
"loss": 0.9229,
"step": 26
},
{
"epoch": 0.03862660944206009,
"grad_norm": 4.040201663970947,
"learning_rate": 9.75e-07,
"loss": 0.8753,
"step": 27
},
{
"epoch": 0.04005722460658083,
"grad_norm": 4.08870792388916,
"learning_rate": 1.0125e-06,
"loss": 0.9356,
"step": 28
},
{
"epoch": 0.04148783977110158,
"grad_norm": 5.570353984832764,
"learning_rate": 1.05e-06,
"loss": 0.8388,
"step": 29
},
{
"epoch": 0.04291845493562232,
"grad_norm": 4.162603378295898,
"learning_rate": 1.0875e-06,
"loss": 0.865,
"step": 30
},
{
"epoch": 0.044349070100143065,
"grad_norm": 9.821990013122559,
"learning_rate": 1.125e-06,
"loss": 0.9317,
"step": 31
},
{
"epoch": 0.045779685264663805,
"grad_norm": 17.85947036743164,
"learning_rate": 1.1625e-06,
"loss": 0.9546,
"step": 32
},
{
"epoch": 0.04721030042918455,
"grad_norm": 4.307530879974365,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.7619,
"step": 33
},
{
"epoch": 0.04864091559370529,
"grad_norm": 3.723987579345703,
"learning_rate": 1.2375e-06,
"loss": 0.8835,
"step": 34
},
{
"epoch": 0.05007153075822604,
"grad_norm": 6.962404251098633,
"learning_rate": 1.275e-06,
"loss": 0.7924,
"step": 35
},
{
"epoch": 0.05150214592274678,
"grad_norm": 4.578495025634766,
"learning_rate": 1.3125000000000001e-06,
"loss": 0.8527,
"step": 36
},
{
"epoch": 0.05293276108726753,
"grad_norm": 9.046110153198242,
"learning_rate": 1.35e-06,
"loss": 0.9319,
"step": 37
},
{
"epoch": 0.05436337625178827,
"grad_norm": 2.2053868770599365,
"learning_rate": 1.3875e-06,
"loss": 0.9608,
"step": 38
},
{
"epoch": 0.055793991416309016,
"grad_norm": 2.3856260776519775,
"learning_rate": 1.425e-06,
"loss": 0.8641,
"step": 39
},
{
"epoch": 0.05722460658082976,
"grad_norm": 1.8333237171173096,
"learning_rate": 1.4625e-06,
"loss": 0.9357,
"step": 40
},
{
"epoch": 0.058655221745350504,
"grad_norm": 2.9304890632629395,
"learning_rate": 1.5e-06,
"loss": 0.8986,
"step": 41
},
{
"epoch": 0.060085836909871244,
"grad_norm": 3.4019198417663574,
"learning_rate": 1.5374999999999999e-06,
"loss": 0.9427,
"step": 42
},
{
"epoch": 0.06151645207439199,
"grad_norm": 7.195025444030762,
"learning_rate": 1.5750000000000002e-06,
"loss": 0.851,
"step": 43
},
{
"epoch": 0.06294706723891273,
"grad_norm": 7.58285665512085,
"learning_rate": 1.6125e-06,
"loss": 0.9239,
"step": 44
},
{
"epoch": 0.06437768240343347,
"grad_norm": 7.752026081085205,
"learning_rate": 1.65e-06,
"loss": 0.863,
"step": 45
},
{
"epoch": 0.06580829756795423,
"grad_norm": 13.529495239257812,
"learning_rate": 1.6875e-06,
"loss": 0.8844,
"step": 46
},
{
"epoch": 0.06723891273247497,
"grad_norm": 4.444079399108887,
"learning_rate": 1.725e-06,
"loss": 0.9185,
"step": 47
},
{
"epoch": 0.06866952789699571,
"grad_norm": 8.650182723999023,
"learning_rate": 1.7625e-06,
"loss": 0.8735,
"step": 48
},
{
"epoch": 0.07010014306151645,
"grad_norm": 3.767944097518921,
"learning_rate": 1.8e-06,
"loss": 0.7559,
"step": 49
},
{
"epoch": 0.0715307582260372,
"grad_norm": 7.349740982055664,
"learning_rate": 1.8375000000000002e-06,
"loss": 0.8848,
"step": 50
},
{
"epoch": 0.07296137339055794,
"grad_norm": 6.42757511138916,
"learning_rate": 1.875e-06,
"loss": 0.8778,
"step": 51
},
{
"epoch": 0.07439198855507868,
"grad_norm": 4.057242393493652,
"learning_rate": 1.9125e-06,
"loss": 0.8433,
"step": 52
},
{
"epoch": 0.07582260371959942,
"grad_norm": 2.327789306640625,
"learning_rate": 1.95e-06,
"loss": 0.8083,
"step": 53
},
{
"epoch": 0.07725321888412018,
"grad_norm": 8.588128089904785,
"learning_rate": 1.9875e-06,
"loss": 0.8587,
"step": 54
},
{
"epoch": 0.07868383404864092,
"grad_norm": 9.92045783996582,
"learning_rate": 2.025e-06,
"loss": 0.8613,
"step": 55
},
{
"epoch": 0.08011444921316166,
"grad_norm": 5.001506805419922,
"learning_rate": 2.0625e-06,
"loss": 0.8549,
"step": 56
},
{
"epoch": 0.0815450643776824,
"grad_norm": 4.943772315979004,
"learning_rate": 2.1e-06,
"loss": 0.9253,
"step": 57
},
{
"epoch": 0.08297567954220315,
"grad_norm": 2.5432281494140625,
"learning_rate": 2.1375000000000003e-06,
"loss": 0.8148,
"step": 58
},
{
"epoch": 0.0844062947067239,
"grad_norm": 3.7364847660064697,
"learning_rate": 2.175e-06,
"loss": 0.8329,
"step": 59
},
{
"epoch": 0.08583690987124463,
"grad_norm": 2.2858948707580566,
"learning_rate": 2.2125e-06,
"loss": 0.8131,
"step": 60
},
{
"epoch": 0.08726752503576538,
"grad_norm": 3.740797281265259,
"learning_rate": 2.25e-06,
"loss": 0.8974,
"step": 61
},
{
"epoch": 0.08869814020028613,
"grad_norm": 7.974575042724609,
"learning_rate": 2.2875e-06,
"loss": 0.8773,
"step": 62
},
{
"epoch": 0.09012875536480687,
"grad_norm": 3.5054333209991455,
"learning_rate": 2.325e-06,
"loss": 0.8011,
"step": 63
},
{
"epoch": 0.09155937052932761,
"grad_norm": 1.7374111413955688,
"learning_rate": 2.3625e-06,
"loss": 0.8732,
"step": 64
},
{
"epoch": 0.09298998569384835,
"grad_norm": 1.5484044551849365,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.816,
"step": 65
},
{
"epoch": 0.0944206008583691,
"grad_norm": 7.499728679656982,
"learning_rate": 2.4375e-06,
"loss": 0.8951,
"step": 66
},
{
"epoch": 0.09585121602288985,
"grad_norm": 2.1170144081115723,
"learning_rate": 2.475e-06,
"loss": 0.8616,
"step": 67
},
{
"epoch": 0.09728183118741059,
"grad_norm": 4.520656108856201,
"learning_rate": 2.5125e-06,
"loss": 0.823,
"step": 68
},
{
"epoch": 0.09871244635193133,
"grad_norm": 2.0560104846954346,
"learning_rate": 2.55e-06,
"loss": 0.8335,
"step": 69
},
{
"epoch": 0.10014306151645208,
"grad_norm": 1.0364820957183838,
"learning_rate": 2.5875000000000002e-06,
"loss": 0.8691,
"step": 70
},
{
"epoch": 0.10157367668097282,
"grad_norm": 13.255958557128906,
"learning_rate": 2.6250000000000003e-06,
"loss": 0.9005,
"step": 71
},
{
"epoch": 0.10300429184549356,
"grad_norm": 1.2062978744506836,
"learning_rate": 2.6625e-06,
"loss": 0.9268,
"step": 72
},
{
"epoch": 0.1044349070100143,
"grad_norm": 5.754052639007568,
"learning_rate": 2.7e-06,
"loss": 0.8912,
"step": 73
},
{
"epoch": 0.10586552217453506,
"grad_norm": 6.267002105712891,
"learning_rate": 2.7375e-06,
"loss": 0.8835,
"step": 74
},
{
"epoch": 0.1072961373390558,
"grad_norm": 5.339660167694092,
"learning_rate": 2.775e-06,
"loss": 0.8765,
"step": 75
},
{
"epoch": 0.10872675250357654,
"grad_norm": 3.0998125076293945,
"learning_rate": 2.8125e-06,
"loss": 0.7866,
"step": 76
},
{
"epoch": 0.11015736766809728,
"grad_norm": 5.969987392425537,
"learning_rate": 2.85e-06,
"loss": 0.8956,
"step": 77
},
{
"epoch": 0.11158798283261803,
"grad_norm": 3.4417006969451904,
"learning_rate": 2.8875000000000003e-06,
"loss": 0.7929,
"step": 78
},
{
"epoch": 0.11301859799713877,
"grad_norm": 6.582152366638184,
"learning_rate": 2.925e-06,
"loss": 0.9373,
"step": 79
},
{
"epoch": 0.11444921316165951,
"grad_norm": 1.3354519605636597,
"learning_rate": 2.9625e-06,
"loss": 0.8581,
"step": 80
},
{
"epoch": 0.11587982832618025,
"grad_norm": 13.804448127746582,
"learning_rate": 3e-06,
"loss": 0.8789,
"step": 81
},
{
"epoch": 0.11731044349070101,
"grad_norm": 3.086815357208252,
"learning_rate": 2.999995738818993e-06,
"loss": 0.8516,
"step": 82
},
{
"epoch": 0.11874105865522175,
"grad_norm": 14.031466484069824,
"learning_rate": 2.999982955300181e-06,
"loss": 0.9504,
"step": 83
},
{
"epoch": 0.12017167381974249,
"grad_norm": 7.03550910949707,
"learning_rate": 2.9999616495161956e-06,
"loss": 0.841,
"step": 84
},
{
"epoch": 0.12160228898426323,
"grad_norm": 1.4175535440444946,
"learning_rate": 2.9999318215880865e-06,
"loss": 0.8488,
"step": 85
},
{
"epoch": 0.12303290414878398,
"grad_norm": 3.6041760444641113,
"learning_rate": 2.9998934716853238e-06,
"loss": 0.865,
"step": 86
},
{
"epoch": 0.12446351931330472,
"grad_norm": 2.260624408721924,
"learning_rate": 2.9998466000257944e-06,
"loss": 0.9309,
"step": 87
},
{
"epoch": 0.12589413447782546,
"grad_norm": 29.91061782836914,
"learning_rate": 2.9997912068758043e-06,
"loss": 0.8052,
"step": 88
},
{
"epoch": 0.12732474964234622,
"grad_norm": 15.271418571472168,
"learning_rate": 2.9997272925500735e-06,
"loss": 0.8355,
"step": 89
},
{
"epoch": 0.12875536480686695,
"grad_norm": 9.124563217163086,
"learning_rate": 2.9996548574117354e-06,
"loss": 0.847,
"step": 90
},
{
"epoch": 0.1301859799713877,
"grad_norm": 2.8253238201141357,
"learning_rate": 2.9995739018723365e-06,
"loss": 0.85,
"step": 91
},
{
"epoch": 0.13161659513590845,
"grad_norm": 12.32058048248291,
"learning_rate": 2.999484426391831e-06,
"loss": 0.907,
"step": 92
},
{
"epoch": 0.13304721030042918,
"grad_norm": 1.6532840728759766,
"learning_rate": 2.999386431478581e-06,
"loss": 0.8617,
"step": 93
},
{
"epoch": 0.13447782546494993,
"grad_norm": 2.9194514751434326,
"learning_rate": 2.9992799176893515e-06,
"loss": 0.8747,
"step": 94
},
{
"epoch": 0.13590844062947066,
"grad_norm": 5.123015880584717,
"learning_rate": 2.999164885629309e-06,
"loss": 0.8485,
"step": 95
},
{
"epoch": 0.13733905579399142,
"grad_norm": 2.5870351791381836,
"learning_rate": 2.9990413359520165e-06,
"loss": 0.8487,
"step": 96
},
{
"epoch": 0.13876967095851217,
"grad_norm": 99.052490234375,
"learning_rate": 2.998909269359431e-06,
"loss": 0.8617,
"step": 97
},
{
"epoch": 0.1402002861230329,
"grad_norm": 2.046233892440796,
"learning_rate": 2.998768686601898e-06,
"loss": 0.8317,
"step": 98
},
{
"epoch": 0.14163090128755365,
"grad_norm": 2.7805709838867188,
"learning_rate": 2.99861958847815e-06,
"loss": 0.8411,
"step": 99
},
{
"epoch": 0.1430615164520744,
"grad_norm": 4.889650821685791,
"learning_rate": 2.998461975835298e-06,
"loss": 0.9486,
"step": 100
},
{
"epoch": 0.14449213161659513,
"grad_norm": 4.531475067138672,
"learning_rate": 2.9982958495688307e-06,
"loss": 0.7729,
"step": 101
},
{
"epoch": 0.1459227467811159,
"grad_norm": 25.037559509277344,
"learning_rate": 2.9981212106226067e-06,
"loss": 0.8791,
"step": 102
},
{
"epoch": 0.1473533619456366,
"grad_norm": 5.148260116577148,
"learning_rate": 2.9979380599888506e-06,
"loss": 0.8771,
"step": 103
},
{
"epoch": 0.14878397711015737,
"grad_norm": 12.409090042114258,
"learning_rate": 2.997746398708146e-06,
"loss": 0.8488,
"step": 104
},
{
"epoch": 0.15021459227467812,
"grad_norm": 9.207916259765625,
"learning_rate": 2.99754622786943e-06,
"loss": 0.9463,
"step": 105
},
{
"epoch": 0.15164520743919885,
"grad_norm": 9.328911781311035,
"learning_rate": 2.99733754860999e-06,
"loss": 0.8123,
"step": 106
},
{
"epoch": 0.1530758226037196,
"grad_norm": 3.671304941177368,
"learning_rate": 2.997120362115451e-06,
"loss": 0.7729,
"step": 107
},
{
"epoch": 0.15450643776824036,
"grad_norm": 3.156205654144287,
"learning_rate": 2.9968946696197754e-06,
"loss": 0.8232,
"step": 108
},
{
"epoch": 0.15593705293276108,
"grad_norm": 5.058759689331055,
"learning_rate": 2.9966604724052517e-06,
"loss": 0.7645,
"step": 109
},
{
"epoch": 0.15736766809728184,
"grad_norm": 6.537101745605469,
"learning_rate": 2.9964177718024888e-06,
"loss": 0.8669,
"step": 110
},
{
"epoch": 0.15879828326180256,
"grad_norm": 13.55328369140625,
"learning_rate": 2.9961665691904087e-06,
"loss": 0.7843,
"step": 111
},
{
"epoch": 0.16022889842632332,
"grad_norm": 1.259379267692566,
"learning_rate": 2.9959068659962367e-06,
"loss": 0.7038,
"step": 112
},
{
"epoch": 0.16165951359084407,
"grad_norm": 2.055260181427002,
"learning_rate": 2.995638663695497e-06,
"loss": 0.7348,
"step": 113
},
{
"epoch": 0.1630901287553648,
"grad_norm": 3.9688467979431152,
"learning_rate": 2.9953619638120004e-06,
"loss": 0.8377,
"step": 114
},
{
"epoch": 0.16452074391988555,
"grad_norm": 1.2332788705825806,
"learning_rate": 2.9950767679178377e-06,
"loss": 0.8324,
"step": 115
},
{
"epoch": 0.1659513590844063,
"grad_norm": 4.329883098602295,
"learning_rate": 2.994783077633372e-06,
"loss": 0.8407,
"step": 116
},
{
"epoch": 0.16738197424892703,
"grad_norm": 30.238853454589844,
"learning_rate": 2.994480894627225e-06,
"loss": 0.7921,
"step": 117
},
{
"epoch": 0.1688125894134478,
"grad_norm": 3.2549190521240234,
"learning_rate": 2.9941702206162733e-06,
"loss": 0.9115,
"step": 118
},
{
"epoch": 0.17024320457796852,
"grad_norm": 2.118321180343628,
"learning_rate": 2.9938510573656333e-06,
"loss": 0.8938,
"step": 119
},
{
"epoch": 0.17167381974248927,
"grad_norm": 9.38047981262207,
"learning_rate": 2.993523406688656e-06,
"loss": 0.9441,
"step": 120
},
{
"epoch": 0.17310443490701002,
"grad_norm": 2.0981671810150146,
"learning_rate": 2.9931872704469126e-06,
"loss": 0.8982,
"step": 121
},
{
"epoch": 0.17453505007153075,
"grad_norm": 2.4683213233947754,
"learning_rate": 2.992842650550186e-06,
"loss": 0.9112,
"step": 122
},
{
"epoch": 0.1759656652360515,
"grad_norm": 6.200721740722656,
"learning_rate": 2.9924895489564602e-06,
"loss": 0.9541,
"step": 123
},
{
"epoch": 0.17739628040057226,
"grad_norm": 13.966652870178223,
"learning_rate": 2.9921279676719085e-06,
"loss": 0.8528,
"step": 124
},
{
"epoch": 0.17882689556509299,
"grad_norm": 4.730607032775879,
"learning_rate": 2.9917579087508817e-06,
"loss": 0.7931,
"step": 125
},
{
"epoch": 0.18025751072961374,
"grad_norm": 14.206419944763184,
"learning_rate": 2.9913793742958968e-06,
"loss": 0.9154,
"step": 126
},
{
"epoch": 0.18168812589413447,
"grad_norm": 6.954555988311768,
"learning_rate": 2.9909923664576264e-06,
"loss": 0.7906,
"step": 127
},
{
"epoch": 0.18311874105865522,
"grad_norm": 5.227564811706543,
"learning_rate": 2.9905968874348833e-06,
"loss": 0.8771,
"step": 128
},
{
"epoch": 0.18454935622317598,
"grad_norm": 9.619230270385742,
"learning_rate": 2.9901929394746126e-06,
"loss": 0.9761,
"step": 129
},
{
"epoch": 0.1859799713876967,
"grad_norm": 22.448091506958008,
"learning_rate": 2.9897805248718737e-06,
"loss": 0.83,
"step": 130
},
{
"epoch": 0.18741058655221746,
"grad_norm": 4.735238075256348,
"learning_rate": 2.9893596459698313e-06,
"loss": 0.841,
"step": 131
},
{
"epoch": 0.1888412017167382,
"grad_norm": 3.6265194416046143,
"learning_rate": 2.9889303051597403e-06,
"loss": 0.8511,
"step": 132
},
{
"epoch": 0.19027181688125894,
"grad_norm": 4.24748420715332,
"learning_rate": 2.9884925048809327e-06,
"loss": 0.8496,
"step": 133
},
{
"epoch": 0.1917024320457797,
"grad_norm": 5.5269856452941895,
"learning_rate": 2.9880462476208033e-06,
"loss": 0.8475,
"step": 134
},
{
"epoch": 0.19313304721030042,
"grad_norm": 3.4209656715393066,
"learning_rate": 2.987591535914796e-06,
"loss": 0.77,
"step": 135
},
{
"epoch": 0.19456366237482117,
"grad_norm": 1.303078055381775,
"learning_rate": 2.9871283723463896e-06,
"loss": 0.8877,
"step": 136
},
{
"epoch": 0.19599427753934193,
"grad_norm": 4.734808444976807,
"learning_rate": 2.986656759547082e-06,
"loss": 0.8509,
"step": 137
},
{
"epoch": 0.19742489270386265,
"grad_norm": 22.503400802612305,
"learning_rate": 2.986176700196377e-06,
"loss": 0.859,
"step": 138
},
{
"epoch": 0.1988555078683834,
"grad_norm": 11.41430950164795,
"learning_rate": 2.9856881970217674e-06,
"loss": 0.8071,
"step": 139
},
{
"epoch": 0.20028612303290416,
"grad_norm": 1.9186583757400513,
"learning_rate": 2.985191252798721e-06,
"loss": 0.7943,
"step": 140
},
{
"epoch": 0.2017167381974249,
"grad_norm": 3.3689098358154297,
"learning_rate": 2.9846858703506625e-06,
"loss": 0.8457,
"step": 141
},
{
"epoch": 0.20314735336194564,
"grad_norm": 23.29608726501465,
"learning_rate": 2.984172052548961e-06,
"loss": 0.8721,
"step": 142
},
{
"epoch": 0.20457796852646637,
"grad_norm": 3.0760669708251953,
"learning_rate": 2.98364980231291e-06,
"loss": 0.9147,
"step": 143
},
{
"epoch": 0.20600858369098712,
"grad_norm": 6.489161491394043,
"learning_rate": 2.9831191226097138e-06,
"loss": 0.7935,
"step": 144
},
{
"epoch": 0.20743919885550788,
"grad_norm": 23.182281494140625,
"learning_rate": 2.9825800164544683e-06,
"loss": 0.8989,
"step": 145
},
{
"epoch": 0.2088698140200286,
"grad_norm": 3.310340404510498,
"learning_rate": 2.9820324869101457e-06,
"loss": 0.9176,
"step": 146
},
{
"epoch": 0.21030042918454936,
"grad_norm": 37.046268463134766,
"learning_rate": 2.9814765370875757e-06,
"loss": 0.8695,
"step": 147
},
{
"epoch": 0.2117310443490701,
"grad_norm": 7.289843559265137,
"learning_rate": 2.980912170145429e-06,
"loss": 0.7522,
"step": 148
},
{
"epoch": 0.21316165951359084,
"grad_norm": 2.9157369136810303,
"learning_rate": 2.9803393892901983e-06,
"loss": 0.8782,
"step": 149
},
{
"epoch": 0.2145922746781116,
"grad_norm": 5.29908561706543,
"learning_rate": 2.9797581977761813e-06,
"loss": 0.9556,
"step": 150
},
{
"epoch": 0.21602288984263232,
"grad_norm": 15.43282413482666,
"learning_rate": 2.97916859890546e-06,
"loss": 0.794,
"step": 151
},
{
"epoch": 0.21745350500715308,
"grad_norm": 1.031524419784546,
"learning_rate": 2.9785705960278854e-06,
"loss": 0.7869,
"step": 152
},
{
"epoch": 0.21888412017167383,
"grad_norm": 1.8489532470703125,
"learning_rate": 2.9779641925410552e-06,
"loss": 0.8462,
"step": 153
},
{
"epoch": 0.22031473533619456,
"grad_norm": 3.093093156814575,
"learning_rate": 2.9773493918902956e-06,
"loss": 0.8689,
"step": 154
},
{
"epoch": 0.2217453505007153,
"grad_norm": 12.075631141662598,
"learning_rate": 2.9767261975686436e-06,
"loss": 0.835,
"step": 155
},
{
"epoch": 0.22317596566523606,
"grad_norm": 5.174819469451904,
"learning_rate": 2.976094613116823e-06,
"loss": 0.7994,
"step": 156
},
{
"epoch": 0.2246065808297568,
"grad_norm": 14.805009841918945,
"learning_rate": 2.975454642123228e-06,
"loss": 0.7749,
"step": 157
},
{
"epoch": 0.22603719599427755,
"grad_norm": 6.730155944824219,
"learning_rate": 2.9748062882239032e-06,
"loss": 0.7781,
"step": 158
},
{
"epoch": 0.22746781115879827,
"grad_norm": 1.8753336668014526,
"learning_rate": 2.9741495551025176e-06,
"loss": 0.8107,
"step": 159
},
{
"epoch": 0.22889842632331903,
"grad_norm": 7.615732192993164,
"learning_rate": 2.9734844464903513e-06,
"loss": 0.8196,
"step": 160
},
{
"epoch": 0.23032904148783978,
"grad_norm": 11.586601257324219,
"learning_rate": 2.9728109661662674e-06,
"loss": 0.7974,
"step": 161
},
{
"epoch": 0.2317596566523605,
"grad_norm": 12.352217674255371,
"learning_rate": 2.972129117956695e-06,
"loss": 0.7608,
"step": 162
},
{
"epoch": 0.23319027181688126,
"grad_norm": 23.733856201171875,
"learning_rate": 2.971438905735606e-06,
"loss": 0.8376,
"step": 163
},
{
"epoch": 0.23462088698140202,
"grad_norm": 11.216538429260254,
"learning_rate": 2.9707403334244917e-06,
"loss": 0.8035,
"step": 164
},
{
"epoch": 0.23605150214592274,
"grad_norm": 14.699457168579102,
"learning_rate": 2.9700334049923436e-06,
"loss": 0.7992,
"step": 165
},
{
"epoch": 0.2374821173104435,
"grad_norm": 11.340972900390625,
"learning_rate": 2.9693181244556285e-06,
"loss": 0.8836,
"step": 166
},
{
"epoch": 0.23891273247496422,
"grad_norm": 25.200716018676758,
"learning_rate": 2.968594495878266e-06,
"loss": 0.9051,
"step": 167
},
{
"epoch": 0.24034334763948498,
"grad_norm": 9.116874694824219,
"learning_rate": 2.967862523371605e-06,
"loss": 0.8595,
"step": 168
},
{
"epoch": 0.24177396280400573,
"grad_norm": 8.399476051330566,
"learning_rate": 2.9671222110944032e-06,
"loss": 0.8618,
"step": 169
},
{
"epoch": 0.24320457796852646,
"grad_norm": 3.272933006286621,
"learning_rate": 2.9663735632527995e-06,
"loss": 0.7056,
"step": 170
},
{
"epoch": 0.2446351931330472,
"grad_norm": 6.614375591278076,
"learning_rate": 2.9656165841002934e-06,
"loss": 0.7985,
"step": 171
},
{
"epoch": 0.24606580829756797,
"grad_norm": 32.3626594543457,
"learning_rate": 2.964851277937717e-06,
"loss": 0.7313,
"step": 172
},
{
"epoch": 0.2474964234620887,
"grad_norm": 8.703509330749512,
"learning_rate": 2.9640776491132155e-06,
"loss": 0.859,
"step": 173
},
{
"epoch": 0.24892703862660945,
"grad_norm": 4.464837551116943,
"learning_rate": 2.9632957020222185e-06,
"loss": 0.841,
"step": 174
},
{
"epoch": 0.2503576537911302,
"grad_norm": 3.000603199005127,
"learning_rate": 2.9625054411074166e-06,
"loss": 0.804,
"step": 175
},
{
"epoch": 0.2503576537911302,
"eval_loss": 0.987718939781189,
"eval_runtime": 63.9493,
"eval_samples_per_second": 6.49,
"eval_steps_per_second": 0.407,
"step": 175
},
{
"epoch": 0.25178826895565093,
"grad_norm": 7.612940788269043,
"learning_rate": 2.9617068708587365e-06,
"loss": 0.7769,
"step": 176
},
{
"epoch": 0.2532188841201717,
"grad_norm": 8.721627235412598,
"learning_rate": 2.9608999958133147e-06,
"loss": 0.8665,
"step": 177
},
{
"epoch": 0.25464949928469244,
"grad_norm": 16.083751678466797,
"learning_rate": 2.9600848205554717e-06,
"loss": 0.782,
"step": 178
},
{
"epoch": 0.25608011444921314,
"grad_norm": 6.257425308227539,
"learning_rate": 2.959261349716687e-06,
"loss": 0.9526,
"step": 179
},
{
"epoch": 0.2575107296137339,
"grad_norm": 1.2820326089859009,
"learning_rate": 2.9584295879755717e-06,
"loss": 0.7956,
"step": 180
},
{
"epoch": 0.25894134477825465,
"grad_norm": 6.126409530639648,
"learning_rate": 2.957589540057842e-06,
"loss": 0.7572,
"step": 181
},
{
"epoch": 0.2603719599427754,
"grad_norm": 6.914258003234863,
"learning_rate": 2.9567412107362925e-06,
"loss": 0.8475,
"step": 182
},
{
"epoch": 0.26180257510729615,
"grad_norm": 10.246359825134277,
"learning_rate": 2.9558846048307703e-06,
"loss": 0.865,
"step": 183
},
{
"epoch": 0.2632331902718169,
"grad_norm": 7.619375705718994,
"learning_rate": 2.955019727208145e-06,
"loss": 0.8139,
"step": 184
},
{
"epoch": 0.2646638054363376,
"grad_norm": 5.338575839996338,
"learning_rate": 2.9541465827822845e-06,
"loss": 0.8606,
"step": 185
},
{
"epoch": 0.26609442060085836,
"grad_norm": 7.702805042266846,
"learning_rate": 2.9532651765140233e-06,
"loss": 0.893,
"step": 186
},
{
"epoch": 0.2675250357653791,
"grad_norm": 19.783300399780273,
"learning_rate": 2.952375513411137e-06,
"loss": 0.9462,
"step": 187
},
{
"epoch": 0.26895565092989987,
"grad_norm": 7.961276054382324,
"learning_rate": 2.951477598528313e-06,
"loss": 0.8445,
"step": 188
},
{
"epoch": 0.2703862660944206,
"grad_norm": 2.4168357849121094,
"learning_rate": 2.9505714369671222e-06,
"loss": 0.7095,
"step": 189
},
{
"epoch": 0.2718168812589413,
"grad_norm": 3.021878957748413,
"learning_rate": 2.949657033875989e-06,
"loss": 0.8208,
"step": 190
},
{
"epoch": 0.2732474964234621,
"grad_norm": 3.7104063034057617,
"learning_rate": 2.948734394450162e-06,
"loss": 0.8333,
"step": 191
},
{
"epoch": 0.27467811158798283,
"grad_norm": 8.396327018737793,
"learning_rate": 2.947803523931687e-06,
"loss": 0.8052,
"step": 192
},
{
"epoch": 0.2761087267525036,
"grad_norm": 4.769831657409668,
"learning_rate": 2.9468644276093736e-06,
"loss": 0.7715,
"step": 193
},
{
"epoch": 0.27753934191702434,
"grad_norm": 5.608636379241943,
"learning_rate": 2.9459171108187688e-06,
"loss": 0.8781,
"step": 194
},
{
"epoch": 0.27896995708154504,
"grad_norm": 2.4100029468536377,
"learning_rate": 2.9449615789421225e-06,
"loss": 0.8128,
"step": 195
},
{
"epoch": 0.2804005722460658,
"grad_norm": 12.692727088928223,
"learning_rate": 2.943997837408361e-06,
"loss": 0.8316,
"step": 196
},
{
"epoch": 0.28183118741058655,
"grad_norm": 2.0479393005371094,
"learning_rate": 2.943025891693054e-06,
"loss": 0.7717,
"step": 197
},
{
"epoch": 0.2832618025751073,
"grad_norm": 5.383994102478027,
"learning_rate": 2.9420457473183827e-06,
"loss": 0.8796,
"step": 198
},
{
"epoch": 0.28469241773962806,
"grad_norm": 45.458194732666016,
"learning_rate": 2.941057409853112e-06,
"loss": 0.9014,
"step": 199
},
{
"epoch": 0.2861230329041488,
"grad_norm": 5.420682907104492,
"learning_rate": 2.9400608849125535e-06,
"loss": 0.8651,
"step": 200
},
{
"epoch": 0.2875536480686695,
"grad_norm": 3.0556061267852783,
"learning_rate": 2.939056178158539e-06,
"loss": 0.7834,
"step": 201
},
{
"epoch": 0.28898426323319026,
"grad_norm": 7.664134979248047,
"learning_rate": 2.938043295299385e-06,
"loss": 0.7516,
"step": 202
},
{
"epoch": 0.290414878397711,
"grad_norm": 10.764482498168945,
"learning_rate": 2.937022242089861e-06,
"loss": 0.8377,
"step": 203
},
{
"epoch": 0.2918454935622318,
"grad_norm": 17.695600509643555,
"learning_rate": 2.9359930243311565e-06,
"loss": 0.9017,
"step": 204
},
{
"epoch": 0.2932761087267525,
"grad_norm": 6.34639835357666,
"learning_rate": 2.9349556478708494e-06,
"loss": 0.8308,
"step": 205
},
{
"epoch": 0.2947067238912732,
"grad_norm": 62.43265914916992,
"learning_rate": 2.933910118602872e-06,
"loss": 0.8773,
"step": 206
},
{
"epoch": 0.296137339055794,
"grad_norm": 5.159948348999023,
"learning_rate": 2.932856442467476e-06,
"loss": 0.6787,
"step": 207
},
{
"epoch": 0.29756795422031473,
"grad_norm": 3.1082255840301514,
"learning_rate": 2.931794625451202e-06,
"loss": 0.8965,
"step": 208
},
{
"epoch": 0.2989985693848355,
"grad_norm": 2.5208675861358643,
"learning_rate": 2.930724673586842e-06,
"loss": 0.8792,
"step": 209
},
{
"epoch": 0.30042918454935624,
"grad_norm": 12.822099685668945,
"learning_rate": 2.929646592953408e-06,
"loss": 0.8534,
"step": 210
},
{
"epoch": 0.30185979971387694,
"grad_norm": 21.156463623046875,
"learning_rate": 2.928560389676095e-06,
"loss": 0.7975,
"step": 211
},
{
"epoch": 0.3032904148783977,
"grad_norm": 1.6340172290802002,
"learning_rate": 2.9274660699262483e-06,
"loss": 0.7555,
"step": 212
},
{
"epoch": 0.30472103004291845,
"grad_norm": 4.860538005828857,
"learning_rate": 2.926363639921327e-06,
"loss": 0.8352,
"step": 213
},
{
"epoch": 0.3061516452074392,
"grad_norm": 1.8420562744140625,
"learning_rate": 2.92525310592487e-06,
"loss": 0.8709,
"step": 214
},
{
"epoch": 0.30758226037195996,
"grad_norm": 3.420260429382324,
"learning_rate": 2.9241344742464586e-06,
"loss": 0.8462,
"step": 215
},
{
"epoch": 0.3090128755364807,
"grad_norm": 4.125131130218506,
"learning_rate": 2.923007751241683e-06,
"loss": 0.8501,
"step": 216
},
{
"epoch": 0.3104434907010014,
"grad_norm": 4.323355674743652,
"learning_rate": 2.9218729433121034e-06,
"loss": 0.8146,
"step": 217
},
{
"epoch": 0.31187410586552217,
"grad_norm": 3.767756938934326,
"learning_rate": 2.920730056905216e-06,
"loss": 0.8045,
"step": 218
},
{
"epoch": 0.3133047210300429,
"grad_norm": 104.36705017089844,
"learning_rate": 2.919579098514415e-06,
"loss": 0.7723,
"step": 219
},
{
"epoch": 0.3147353361945637,
"grad_norm": 5.676706790924072,
"learning_rate": 2.9184200746789575e-06,
"loss": 0.8171,
"step": 220
},
{
"epoch": 0.31616595135908443,
"grad_norm": 17.918733596801758,
"learning_rate": 2.9172529919839226e-06,
"loss": 0.8766,
"step": 221
},
{
"epoch": 0.31759656652360513,
"grad_norm": 6.99448823928833,
"learning_rate": 2.9160778570601787e-06,
"loss": 0.7374,
"step": 222
},
{
"epoch": 0.3190271816881259,
"grad_norm": 2.60298490524292,
"learning_rate": 2.9148946765843418e-06,
"loss": 0.7419,
"step": 223
},
{
"epoch": 0.32045779685264664,
"grad_norm": 11.042309761047363,
"learning_rate": 2.913703457278741e-06,
"loss": 0.8656,
"step": 224
},
{
"epoch": 0.3218884120171674,
"grad_norm": 7.073480129241943,
"learning_rate": 2.9125042059113773e-06,
"loss": 0.7972,
"step": 225
},
{
"epoch": 0.32331902718168815,
"grad_norm": 3.2704951763153076,
"learning_rate": 2.9112969292958874e-06,
"loss": 0.8576,
"step": 226
},
{
"epoch": 0.32474964234620884,
"grad_norm": 5.423962116241455,
"learning_rate": 2.9100816342915025e-06,
"loss": 0.8155,
"step": 227
},
{
"epoch": 0.3261802575107296,
"grad_norm": 10.780782699584961,
"learning_rate": 2.908858327803013e-06,
"loss": 0.833,
"step": 228
},
{
"epoch": 0.32761087267525035,
"grad_norm": 5.944501876831055,
"learning_rate": 2.907627016780725e-06,
"loss": 0.8205,
"step": 229
},
{
"epoch": 0.3290414878397711,
"grad_norm": 14.172869682312012,
"learning_rate": 2.906387708220425e-06,
"loss": 0.8103,
"step": 230
},
{
"epoch": 0.33047210300429186,
"grad_norm": 5.723515033721924,
"learning_rate": 2.905140409163337e-06,
"loss": 0.8297,
"step": 231
},
{
"epoch": 0.3319027181688126,
"grad_norm": 10.871804237365723,
"learning_rate": 2.903885126696083e-06,
"loss": 0.8411,
"step": 232
},
{
"epoch": 0.3333333333333333,
"grad_norm": 13.487065315246582,
"learning_rate": 2.902621867950645e-06,
"loss": 0.873,
"step": 233
},
{
"epoch": 0.33476394849785407,
"grad_norm": 6.601111888885498,
"learning_rate": 2.9013506401043214e-06,
"loss": 0.8536,
"step": 234
},
{
"epoch": 0.3361945636623748,
"grad_norm": 6.6699724197387695,
"learning_rate": 2.900071450379688e-06,
"loss": 0.827,
"step": 235
},
{
"epoch": 0.3376251788268956,
"grad_norm": 3.9626381397247314,
"learning_rate": 2.8987843060445575e-06,
"loss": 0.8954,
"step": 236
},
{
"epoch": 0.33905579399141633,
"grad_norm": 1.9564266204833984,
"learning_rate": 2.8974892144119353e-06,
"loss": 0.7551,
"step": 237
},
{
"epoch": 0.34048640915593703,
"grad_norm": 7.052184581756592,
"learning_rate": 2.896186182839982e-06,
"loss": 0.8094,
"step": 238
},
{
"epoch": 0.3419170243204578,
"grad_norm": 28.235042572021484,
"learning_rate": 2.8948752187319696e-06,
"loss": 0.7715,
"step": 239
},
{
"epoch": 0.34334763948497854,
"grad_norm": 7.05892276763916,
"learning_rate": 2.8935563295362367e-06,
"loss": 0.7823,
"step": 240
},
{
"epoch": 0.3447782546494993,
"grad_norm": 4.274432182312012,
"learning_rate": 2.8922295227461523e-06,
"loss": 0.8163,
"step": 241
},
{
"epoch": 0.34620886981402005,
"grad_norm": 2.481339693069458,
"learning_rate": 2.8908948059000676e-06,
"loss": 0.812,
"step": 242
},
{
"epoch": 0.34763948497854075,
"grad_norm": 3.1200881004333496,
"learning_rate": 2.8895521865812758e-06,
"loss": 0.7542,
"step": 243
},
{
"epoch": 0.3490701001430615,
"grad_norm": 9.71296501159668,
"learning_rate": 2.88820167241797e-06,
"loss": 0.8787,
"step": 244
},
{
"epoch": 0.35050071530758226,
"grad_norm": 1.676202654838562,
"learning_rate": 2.886843271083196e-06,
"loss": 0.7536,
"step": 245
},
{
"epoch": 0.351931330472103,
"grad_norm": 1.9348456859588623,
"learning_rate": 2.8854769902948127e-06,
"loss": 0.7707,
"step": 246
},
{
"epoch": 0.35336194563662376,
"grad_norm": 21.045368194580078,
"learning_rate": 2.8841028378154463e-06,
"loss": 0.8119,
"step": 247
},
{
"epoch": 0.3547925608011445,
"grad_norm": 6.235752582550049,
"learning_rate": 2.8827208214524477e-06,
"loss": 0.7814,
"step": 248
},
{
"epoch": 0.3562231759656652,
"grad_norm": 9.359082221984863,
"learning_rate": 2.881330949057845e-06,
"loss": 0.8157,
"step": 249
},
{
"epoch": 0.35765379113018597,
"grad_norm": 5.472043037414551,
"learning_rate": 2.8799332285283025e-06,
"loss": 0.8594,
"step": 250
},
{
"epoch": 0.3590844062947067,
"grad_norm": 11.745348930358887,
"learning_rate": 2.8785276678050736e-06,
"loss": 0.8394,
"step": 251
},
{
"epoch": 0.3605150214592275,
"grad_norm": 16.824607849121094,
"learning_rate": 2.877114274873957e-06,
"loss": 0.7987,
"step": 252
},
{
"epoch": 0.36194563662374823,
"grad_norm": 30.396041870117188,
"learning_rate": 2.8756930577652493e-06,
"loss": 0.7705,
"step": 253
},
{
"epoch": 0.36337625178826893,
"grad_norm": 5.095501899719238,
"learning_rate": 2.874264024553702e-06,
"loss": 0.8093,
"step": 254
},
{
"epoch": 0.3648068669527897,
"grad_norm": 5.913100242614746,
"learning_rate": 2.8728271833584744e-06,
"loss": 0.8863,
"step": 255
},
{
"epoch": 0.36623748211731044,
"grad_norm": 3.5775020122528076,
"learning_rate": 2.871382542343087e-06,
"loss": 0.8394,
"step": 256
},
{
"epoch": 0.3676680972818312,
"grad_norm": 4.6704816818237305,
"learning_rate": 2.869930109715375e-06,
"loss": 0.9023,
"step": 257
},
{
"epoch": 0.36909871244635195,
"grad_norm": 52.114646911621094,
"learning_rate": 2.868469893727443e-06,
"loss": 0.713,
"step": 258
},
{
"epoch": 0.37052932761087265,
"grad_norm": 5.689326763153076,
"learning_rate": 2.8670019026756174e-06,
"loss": 0.9299,
"step": 259
},
{
"epoch": 0.3719599427753934,
"grad_norm": 1.1218035221099854,
"learning_rate": 2.8655261449003993e-06,
"loss": 0.8403,
"step": 260
},
{
"epoch": 0.37339055793991416,
"grad_norm": 2.4807209968566895,
"learning_rate": 2.864042628786416e-06,
"loss": 0.8961,
"step": 261
},
{
"epoch": 0.3748211731044349,
"grad_norm": 6.620181560516357,
"learning_rate": 2.8625513627623757e-06,
"loss": 0.839,
"step": 262
},
{
"epoch": 0.37625178826895567,
"grad_norm": 7.724957466125488,
"learning_rate": 2.8610523553010174e-06,
"loss": 0.8033,
"step": 263
},
{
"epoch": 0.3776824034334764,
"grad_norm": 3.1110544204711914,
"learning_rate": 2.8595456149190633e-06,
"loss": 0.8175,
"step": 264
},
{
"epoch": 0.3791130185979971,
"grad_norm": 5.656611919403076,
"learning_rate": 2.858031150177173e-06,
"loss": 0.823,
"step": 265
},
{
"epoch": 0.3805436337625179,
"grad_norm": 5.221110820770264,
"learning_rate": 2.85650896967989e-06,
"loss": 0.8279,
"step": 266
},
{
"epoch": 0.38197424892703863,
"grad_norm": 3.36710786819458,
"learning_rate": 2.854979082075596e-06,
"loss": 0.7052,
"step": 267
},
{
"epoch": 0.3834048640915594,
"grad_norm": 5.043059349060059,
"learning_rate": 2.8534414960564626e-06,
"loss": 0.815,
"step": 268
},
{
"epoch": 0.38483547925608014,
"grad_norm": 2.3259692192077637,
"learning_rate": 2.8518962203583996e-06,
"loss": 0.8315,
"step": 269
},
{
"epoch": 0.38626609442060084,
"grad_norm": 2.116469621658325,
"learning_rate": 2.850343263761005e-06,
"loss": 0.8151,
"step": 270
},
{
"epoch": 0.3876967095851216,
"grad_norm": 5.095742225646973,
"learning_rate": 2.8487826350875188e-06,
"loss": 0.8809,
"step": 271
},
{
"epoch": 0.38912732474964234,
"grad_norm": 21.042909622192383,
"learning_rate": 2.8472143432047694e-06,
"loss": 0.8604,
"step": 272
},
{
"epoch": 0.3905579399141631,
"grad_norm": 4.103556156158447,
"learning_rate": 2.8456383970231238e-06,
"loss": 0.8797,
"step": 273
},
{
"epoch": 0.39198855507868385,
"grad_norm": 8.809136390686035,
"learning_rate": 2.8440548054964382e-06,
"loss": 0.9017,
"step": 274
},
{
"epoch": 0.39341917024320455,
"grad_norm": 4.425339221954346,
"learning_rate": 2.8424635776220057e-06,
"loss": 0.9289,
"step": 275
},
{
"epoch": 0.3948497854077253,
"grad_norm": 4.326204776763916,
"learning_rate": 2.8408647224405066e-06,
"loss": 0.768,
"step": 276
},
{
"epoch": 0.39628040057224606,
"grad_norm": 14.46237564086914,
"learning_rate": 2.8392582490359563e-06,
"loss": 0.8116,
"step": 277
},
{
"epoch": 0.3977110157367668,
"grad_norm": 7.003748416900635,
"learning_rate": 2.8376441665356527e-06,
"loss": 0.7712,
"step": 278
},
{
"epoch": 0.39914163090128757,
"grad_norm": 6.612820625305176,
"learning_rate": 2.8360224841101273e-06,
"loss": 0.874,
"step": 279
},
{
"epoch": 0.4005722460658083,
"grad_norm": 1.8514535427093506,
"learning_rate": 2.8343932109730885e-06,
"loss": 0.8416,
"step": 280
},
{
"epoch": 0.402002861230329,
"grad_norm": 3.661787271499634,
"learning_rate": 2.8327563563813735e-06,
"loss": 0.8026,
"step": 281
},
{
"epoch": 0.4034334763948498,
"grad_norm": 4.149445056915283,
"learning_rate": 2.8311119296348947e-06,
"loss": 0.8505,
"step": 282
},
{
"epoch": 0.40486409155937053,
"grad_norm": 1.8762818574905396,
"learning_rate": 2.829459940076585e-06,
"loss": 0.91,
"step": 283
},
{
"epoch": 0.4062947067238913,
"grad_norm": 3.605158805847168,
"learning_rate": 2.8278003970923464e-06,
"loss": 0.786,
"step": 284
},
{
"epoch": 0.40772532188841204,
"grad_norm": 5.466380596160889,
"learning_rate": 2.826133310110996e-06,
"loss": 0.7949,
"step": 285
},
{
"epoch": 0.40915593705293274,
"grad_norm": 2.403118133544922,
"learning_rate": 2.824458688604214e-06,
"loss": 0.8175,
"step": 286
},
{
"epoch": 0.4105865522174535,
"grad_norm": 8.62721061706543,
"learning_rate": 2.8227765420864864e-06,
"loss": 0.7938,
"step": 287
},
{
"epoch": 0.41201716738197425,
"grad_norm": 12.812850952148438,
"learning_rate": 2.821086880115055e-06,
"loss": 0.8682,
"step": 288
},
{
"epoch": 0.413447782546495,
"grad_norm": 10.280946731567383,
"learning_rate": 2.81938971228986e-06,
"loss": 0.7679,
"step": 289
},
{
"epoch": 0.41487839771101576,
"grad_norm": 5.222766399383545,
"learning_rate": 2.8176850482534874e-06,
"loss": 0.8453,
"step": 290
},
{
"epoch": 0.41630901287553645,
"grad_norm": 16.025169372558594,
"learning_rate": 2.8159728976911133e-06,
"loss": 0.7303,
"step": 291
},
{
"epoch": 0.4177396280400572,
"grad_norm": 2.5485048294067383,
"learning_rate": 2.8142532703304487e-06,
"loss": 0.8233,
"step": 292
},
{
"epoch": 0.41917024320457796,
"grad_norm": 2.9927330017089844,
"learning_rate": 2.8125261759416854e-06,
"loss": 0.8752,
"step": 293
},
{
"epoch": 0.4206008583690987,
"grad_norm": 20.316953659057617,
"learning_rate": 2.810791624337438e-06,
"loss": 0.7761,
"step": 294
},
{
"epoch": 0.4220314735336195,
"grad_norm": 5.816092014312744,
"learning_rate": 2.8090496253726924e-06,
"loss": 0.8886,
"step": 295
},
{
"epoch": 0.4234620886981402,
"grad_norm": 2.1833443641662598,
"learning_rate": 2.8073001889447446e-06,
"loss": 0.8559,
"step": 296
},
{
"epoch": 0.4248927038626609,
"grad_norm": 1.9403437376022339,
"learning_rate": 2.805543324993149e-06,
"loss": 0.7898,
"step": 297
},
{
"epoch": 0.4263233190271817,
"grad_norm": 18.38999366760254,
"learning_rate": 2.8037790434996593e-06,
"loss": 0.8416,
"step": 298
},
{
"epoch": 0.42775393419170243,
"grad_norm": 4.05740213394165,
"learning_rate": 2.8020073544881724e-06,
"loss": 0.8204,
"step": 299
},
{
"epoch": 0.4291845493562232,
"grad_norm": 1.8824354410171509,
"learning_rate": 2.800228268024672e-06,
"loss": 0.78,
"step": 300
},
{
"epoch": 0.43061516452074394,
"grad_norm": 2.7645819187164307,
"learning_rate": 2.79844179421717e-06,
"loss": 0.8157,
"step": 301
},
{
"epoch": 0.43204577968526464,
"grad_norm": 3.2076547145843506,
"learning_rate": 2.796647943215651e-06,
"loss": 0.8537,
"step": 302
},
{
"epoch": 0.4334763948497854,
"grad_norm": 3.7037010192871094,
"learning_rate": 2.7948467252120144e-06,
"loss": 0.8262,
"step": 303
},
{
"epoch": 0.43490701001430615,
"grad_norm": 6.0140557289123535,
"learning_rate": 2.793038150440013e-06,
"loss": 0.9137,
"step": 304
},
{
"epoch": 0.4363376251788269,
"grad_norm": 3.6040737628936768,
"learning_rate": 2.7912222291752013e-06,
"loss": 0.8043,
"step": 305
},
{
"epoch": 0.43776824034334766,
"grad_norm": 2.64436674118042,
"learning_rate": 2.7893989717348702e-06,
"loss": 0.8577,
"step": 306
},
{
"epoch": 0.43919885550786836,
"grad_norm": 3.0492098331451416,
"learning_rate": 2.7875683884779937e-06,
"loss": 0.8455,
"step": 307
},
{
"epoch": 0.4406294706723891,
"grad_norm": 3.0012905597686768,
"learning_rate": 2.785730489805167e-06,
"loss": 0.787,
"step": 308
},
{
"epoch": 0.44206008583690987,
"grad_norm": 2.695319652557373,
"learning_rate": 2.783885286158549e-06,
"loss": 0.8001,
"step": 309
},
{
"epoch": 0.4434907010014306,
"grad_norm": 4.0424909591674805,
"learning_rate": 2.782032788021802e-06,
"loss": 0.78,
"step": 310
},
{
"epoch": 0.4449213161659514,
"grad_norm": 2.0582504272460938,
"learning_rate": 2.7801730059200314e-06,
"loss": 0.8018,
"step": 311
},
{
"epoch": 0.44635193133047213,
"grad_norm": 1.0271695852279663,
"learning_rate": 2.7783059504197293e-06,
"loss": 0.8059,
"step": 312
},
{
"epoch": 0.44778254649499283,
"grad_norm": 12.270268440246582,
"learning_rate": 2.7764316321287102e-06,
"loss": 0.7964,
"step": 313
},
{
"epoch": 0.4492131616595136,
"grad_norm": 4.83074951171875,
"learning_rate": 2.774550061696055e-06,
"loss": 0.8015,
"step": 314
},
{
"epoch": 0.45064377682403434,
"grad_norm": 4.174887180328369,
"learning_rate": 2.7726612498120442e-06,
"loss": 0.8314,
"step": 315
},
{
"epoch": 0.4520743919885551,
"grad_norm": 2.5617687702178955,
"learning_rate": 2.7707652072081057e-06,
"loss": 0.7849,
"step": 316
},
{
"epoch": 0.45350500715307585,
"grad_norm": 23.52600860595703,
"learning_rate": 2.7688619446567456e-06,
"loss": 0.8122,
"step": 317
},
{
"epoch": 0.45493562231759654,
"grad_norm": 1.7928926944732666,
"learning_rate": 2.7669514729714935e-06,
"loss": 0.882,
"step": 318
},
{
"epoch": 0.4563662374821173,
"grad_norm": 8.705628395080566,
"learning_rate": 2.765033803006836e-06,
"loss": 0.788,
"step": 319
},
{
"epoch": 0.45779685264663805,
"grad_norm": 118.05711364746094,
"learning_rate": 2.7631089456581586e-06,
"loss": 0.8104,
"step": 320
},
{
"epoch": 0.4592274678111588,
"grad_norm": 3.2315642833709717,
"learning_rate": 2.7611769118616817e-06,
"loss": 0.8708,
"step": 321
},
{
"epoch": 0.46065808297567956,
"grad_norm": 3.948796033859253,
"learning_rate": 2.7592377125944e-06,
"loss": 0.7526,
"step": 322
},
{
"epoch": 0.46208869814020026,
"grad_norm": 4.273873329162598,
"learning_rate": 2.7572913588740195e-06,
"loss": 0.8011,
"step": 323
},
{
"epoch": 0.463519313304721,
"grad_norm": 2.294113874435425,
"learning_rate": 2.755337861758893e-06,
"loss": 0.795,
"step": 324
},
{
"epoch": 0.46494992846924177,
"grad_norm": 72.31570434570312,
"learning_rate": 2.7533772323479605e-06,
"loss": 0.8524,
"step": 325
},
{
"epoch": 0.4663805436337625,
"grad_norm": 2.326502799987793,
"learning_rate": 2.7514094817806853e-06,
"loss": 0.7838,
"step": 326
},
{
"epoch": 0.4678111587982833,
"grad_norm": 7.991358280181885,
"learning_rate": 2.7494346212369884e-06,
"loss": 0.7923,
"step": 327
},
{
"epoch": 0.46924177396280403,
"grad_norm": 5.2567596435546875,
"learning_rate": 2.7474526619371874e-06,
"loss": 0.8094,
"step": 328
},
{
"epoch": 0.47067238912732473,
"grad_norm": 29.58182144165039,
"learning_rate": 2.7454636151419323e-06,
"loss": 0.8041,
"step": 329
},
{
"epoch": 0.4721030042918455,
"grad_norm": 4.548513412475586,
"learning_rate": 2.7434674921521414e-06,
"loss": 0.8016,
"step": 330
},
{
"epoch": 0.47353361945636624,
"grad_norm": 16.49585723876953,
"learning_rate": 2.7414643043089362e-06,
"loss": 0.7666,
"step": 331
},
{
"epoch": 0.474964234620887,
"grad_norm": 4.154926300048828,
"learning_rate": 2.739454062993578e-06,
"loss": 0.7745,
"step": 332
},
{
"epoch": 0.47639484978540775,
"grad_norm": 6.365798473358154,
"learning_rate": 2.7374367796274023e-06,
"loss": 0.8022,
"step": 333
},
{
"epoch": 0.47782546494992845,
"grad_norm": 125.90961456298828,
"learning_rate": 2.735412465671756e-06,
"loss": 0.8109,
"step": 334
},
{
"epoch": 0.4792560801144492,
"grad_norm": 32.61653518676758,
"learning_rate": 2.73338113262793e-06,
"loss": 0.8748,
"step": 335
},
{
"epoch": 0.48068669527896996,
"grad_norm": 3.2467617988586426,
"learning_rate": 2.7313427920370948e-06,
"loss": 0.8134,
"step": 336
},
{
"epoch": 0.4821173104434907,
"grad_norm": 9.577071189880371,
"learning_rate": 2.7292974554802343e-06,
"loss": 0.8149,
"step": 337
},
{
"epoch": 0.48354792560801146,
"grad_norm": 9.44502067565918,
"learning_rate": 2.7272451345780804e-06,
"loss": 0.825,
"step": 338
},
{
"epoch": 0.48497854077253216,
"grad_norm": 3.725696325302124,
"learning_rate": 2.725185840991049e-06,
"loss": 0.8543,
"step": 339
},
{
"epoch": 0.4864091559370529,
"grad_norm": 9.806964874267578,
"learning_rate": 2.723119586419169e-06,
"loss": 0.7656,
"step": 340
},
{
"epoch": 0.48783977110157367,
"grad_norm": 6.31876802444458,
"learning_rate": 2.721046382602021e-06,
"loss": 0.8145,
"step": 341
},
{
"epoch": 0.4892703862660944,
"grad_norm": 5.0293073654174805,
"learning_rate": 2.718966241318666e-06,
"loss": 0.8477,
"step": 342
},
{
"epoch": 0.4907010014306152,
"grad_norm": 1.991077184677124,
"learning_rate": 2.7168791743875835e-06,
"loss": 0.7861,
"step": 343
},
{
"epoch": 0.49213161659513593,
"grad_norm": 7.8108344078063965,
"learning_rate": 2.7147851936665995e-06,
"loss": 0.8532,
"step": 344
},
{
"epoch": 0.49356223175965663,
"grad_norm": 2.972151041030884,
"learning_rate": 2.712684311052822e-06,
"loss": 0.8825,
"step": 345
},
{
"epoch": 0.4949928469241774,
"grad_norm": 3.060875177383423,
"learning_rate": 2.710576538482572e-06,
"loss": 0.8001,
"step": 346
},
{
"epoch": 0.49642346208869814,
"grad_norm": 10.620682716369629,
"learning_rate": 2.7084618879313177e-06,
"loss": 0.8303,
"step": 347
},
{
"epoch": 0.4978540772532189,
"grad_norm": 21.889728546142578,
"learning_rate": 2.706340371413603e-06,
"loss": 0.8979,
"step": 348
},
{
"epoch": 0.49928469241773965,
"grad_norm": 9.274587631225586,
"learning_rate": 2.7042120009829832e-06,
"loss": 0.8525,
"step": 349
},
{
"epoch": 0.5007153075822603,
"grad_norm": 16.314605712890625,
"learning_rate": 2.7020767887319534e-06,
"loss": 0.8911,
"step": 350
},
{
"epoch": 0.5007153075822603,
"eval_loss": 0.9631034731864929,
"eval_runtime": 64.0772,
"eval_samples_per_second": 6.477,
"eval_steps_per_second": 0.406,
"step": 350
},
{
"epoch": 0.5021459227467812,
"grad_norm": 15.189119338989258,
"learning_rate": 2.6999347467918816e-06,
"loss": 0.7916,
"step": 351
},
{
"epoch": 0.5035765379113019,
"grad_norm": 6.363760948181152,
"learning_rate": 2.6977858873329394e-06,
"loss": 0.863,
"step": 352
},
{
"epoch": 0.5050071530758226,
"grad_norm": 18.08306121826172,
"learning_rate": 2.695630222564032e-06,
"loss": 0.8125,
"step": 353
},
{
"epoch": 0.5064377682403434,
"grad_norm": 5.672774791717529,
"learning_rate": 2.6934677647327293e-06,
"loss": 0.8818,
"step": 354
},
{
"epoch": 0.5078683834048641,
"grad_norm": 61.24919509887695,
"learning_rate": 2.6912985261251977e-06,
"loss": 0.8885,
"step": 355
},
{
"epoch": 0.5092989985693849,
"grad_norm": 7.921273708343506,
"learning_rate": 2.689122519066128e-06,
"loss": 0.7384,
"step": 356
},
{
"epoch": 0.5107296137339056,
"grad_norm": 2.321747064590454,
"learning_rate": 2.686939755918667e-06,
"loss": 0.7979,
"step": 357
},
{
"epoch": 0.5121602288984263,
"grad_norm": 6.9070587158203125,
"learning_rate": 2.684750249084346e-06,
"loss": 0.8531,
"step": 358
},
{
"epoch": 0.5135908440629471,
"grad_norm": 2.6162514686584473,
"learning_rate": 2.6825540110030117e-06,
"loss": 0.8871,
"step": 359
},
{
"epoch": 0.5150214592274678,
"grad_norm": 8.098695755004883,
"learning_rate": 2.6803510541527555e-06,
"loss": 0.8527,
"step": 360
},
{
"epoch": 0.5164520743919886,
"grad_norm": 1.7876381874084473,
"learning_rate": 2.678141391049841e-06,
"loss": 0.8607,
"step": 361
},
{
"epoch": 0.5178826895565093,
"grad_norm": 83.18020629882812,
"learning_rate": 2.675925034248633e-06,
"loss": 0.8275,
"step": 362
},
{
"epoch": 0.51931330472103,
"grad_norm": 2.7980153560638428,
"learning_rate": 2.67370199634153e-06,
"loss": 0.8568,
"step": 363
},
{
"epoch": 0.5207439198855508,
"grad_norm": 2.3697915077209473,
"learning_rate": 2.671472289958886e-06,
"loss": 0.8863,
"step": 364
},
{
"epoch": 0.5221745350500715,
"grad_norm": 8.928977012634277,
"learning_rate": 2.669235927768946e-06,
"loss": 0.714,
"step": 365
},
{
"epoch": 0.5236051502145923,
"grad_norm": 17.770780563354492,
"learning_rate": 2.6669929224777677e-06,
"loss": 0.7601,
"step": 366
},
{
"epoch": 0.525035765379113,
"grad_norm": 2.65303635597229,
"learning_rate": 2.664743286829154e-06,
"loss": 0.8077,
"step": 367
},
{
"epoch": 0.5264663805436338,
"grad_norm": 2.1842598915100098,
"learning_rate": 2.6624870336045768e-06,
"loss": 0.791,
"step": 368
},
{
"epoch": 0.5278969957081545,
"grad_norm": 3.5350661277770996,
"learning_rate": 2.660224175623108e-06,
"loss": 0.8359,
"step": 369
},
{
"epoch": 0.5293276108726752,
"grad_norm": 6.636647701263428,
"learning_rate": 2.6579547257413438e-06,
"loss": 0.7339,
"step": 370
},
{
"epoch": 0.530758226037196,
"grad_norm": 2.953014612197876,
"learning_rate": 2.6556786968533337e-06,
"loss": 0.7684,
"step": 371
},
{
"epoch": 0.5321888412017167,
"grad_norm": 16.38330841064453,
"learning_rate": 2.6533961018905052e-06,
"loss": 0.7963,
"step": 372
},
{
"epoch": 0.5336194563662375,
"grad_norm": 3.730391502380371,
"learning_rate": 2.6511069538215928e-06,
"loss": 0.8331,
"step": 373
},
{
"epoch": 0.5350500715307582,
"grad_norm": 2.098069906234741,
"learning_rate": 2.6488112656525614e-06,
"loss": 0.7582,
"step": 374
},
{
"epoch": 0.5364806866952789,
"grad_norm": 10.553278923034668,
"learning_rate": 2.6465090504265353e-06,
"loss": 0.7405,
"step": 375
},
{
"epoch": 0.5379113018597997,
"grad_norm": 8.935467720031738,
"learning_rate": 2.6442003212237215e-06,
"loss": 0.8012,
"step": 376
},
{
"epoch": 0.5393419170243204,
"grad_norm": 5.658432483673096,
"learning_rate": 2.6418850911613385e-06,
"loss": 0.8527,
"step": 377
},
{
"epoch": 0.5407725321888412,
"grad_norm": 7.131669521331787,
"learning_rate": 2.6395633733935376e-06,
"loss": 0.7484,
"step": 378
},
{
"epoch": 0.542203147353362,
"grad_norm": 7.413619518280029,
"learning_rate": 2.6372351811113327e-06,
"loss": 0.8055,
"step": 379
},
{
"epoch": 0.5436337625178826,
"grad_norm": 3.693314790725708,
"learning_rate": 2.634900527542522e-06,
"loss": 0.8518,
"step": 380
},
{
"epoch": 0.5450643776824035,
"grad_norm": 19.805158615112305,
"learning_rate": 2.632559425951613e-06,
"loss": 0.7986,
"step": 381
},
{
"epoch": 0.5464949928469242,
"grad_norm": 4.035129070281982,
"learning_rate": 2.63021188963975e-06,
"loss": 0.7836,
"step": 382
},
{
"epoch": 0.547925608011445,
"grad_norm": 5.204458236694336,
"learning_rate": 2.6278579319446364e-06,
"loss": 0.8931,
"step": 383
},
{
"epoch": 0.5493562231759657,
"grad_norm": 2.124077320098877,
"learning_rate": 2.625497566240458e-06,
"loss": 0.7553,
"step": 384
},
{
"epoch": 0.5507868383404864,
"grad_norm": 23.981964111328125,
"learning_rate": 2.623130805937809e-06,
"loss": 0.8436,
"step": 385
},
{
"epoch": 0.5522174535050072,
"grad_norm": 3.7908241748809814,
"learning_rate": 2.6207576644836144e-06,
"loss": 0.7655,
"step": 386
},
{
"epoch": 0.5536480686695279,
"grad_norm": 2.662917375564575,
"learning_rate": 2.6183781553610553e-06,
"loss": 0.8928,
"step": 387
},
{
"epoch": 0.5550786838340487,
"grad_norm": 12.019503593444824,
"learning_rate": 2.615992292089489e-06,
"loss": 0.7619,
"step": 388
},
{
"epoch": 0.5565092989985694,
"grad_norm": 2.186976194381714,
"learning_rate": 2.613600088224378e-06,
"loss": 0.8131,
"step": 389
},
{
"epoch": 0.5579399141630901,
"grad_norm": 4.182912349700928,
"learning_rate": 2.6112015573572054e-06,
"loss": 0.7677,
"step": 390
},
{
"epoch": 0.5593705293276109,
"grad_norm": 4.425599575042725,
"learning_rate": 2.6087967131154046e-06,
"loss": 0.7237,
"step": 391
},
{
"epoch": 0.5608011444921316,
"grad_norm": 3.038487672805786,
"learning_rate": 2.6063855691622773e-06,
"loss": 0.8731,
"step": 392
},
{
"epoch": 0.5622317596566524,
"grad_norm": 8.466862678527832,
"learning_rate": 2.6039681391969175e-06,
"loss": 0.851,
"step": 393
},
{
"epoch": 0.5636623748211731,
"grad_norm": 1.744046688079834,
"learning_rate": 2.6015444369541346e-06,
"loss": 0.7861,
"step": 394
},
{
"epoch": 0.5650929899856938,
"grad_norm": 4.3912835121154785,
"learning_rate": 2.5991144762043736e-06,
"loss": 0.7755,
"step": 395
},
{
"epoch": 0.5665236051502146,
"grad_norm": 2.832746744155884,
"learning_rate": 2.5966782707536385e-06,
"loss": 0.8042,
"step": 396
},
{
"epoch": 0.5679542203147353,
"grad_norm": 12.723127365112305,
"learning_rate": 2.5942358344434123e-06,
"loss": 0.8115,
"step": 397
},
{
"epoch": 0.5693848354792561,
"grad_norm": 12.688072204589844,
"learning_rate": 2.5917871811505786e-06,
"loss": 0.7963,
"step": 398
},
{
"epoch": 0.5708154506437768,
"grad_norm": 2.819028377532959,
"learning_rate": 2.589332324787345e-06,
"loss": 0.7876,
"step": 399
},
{
"epoch": 0.5722460658082976,
"grad_norm": 6.72185754776001,
"learning_rate": 2.58687127930116e-06,
"loss": 0.7474,
"step": 400
},
{
"epoch": 0.5736766809728183,
"grad_norm": 5.983644008636475,
"learning_rate": 2.5844040586746383e-06,
"loss": 0.7863,
"step": 401
},
{
"epoch": 0.575107296137339,
"grad_norm": 6.598376274108887,
"learning_rate": 2.581930676925478e-06,
"loss": 0.8686,
"step": 402
},
{
"epoch": 0.5765379113018598,
"grad_norm": 15.069884300231934,
"learning_rate": 2.579451148106382e-06,
"loss": 0.8143,
"step": 403
},
{
"epoch": 0.5779685264663805,
"grad_norm": 6.5639119148254395,
"learning_rate": 2.576965486304978e-06,
"loss": 0.712,
"step": 404
},
{
"epoch": 0.5793991416309013,
"grad_norm": 3.1110270023345947,
"learning_rate": 2.5744737056437407e-06,
"loss": 0.8277,
"step": 405
},
{
"epoch": 0.580829756795422,
"grad_norm": 3.178307294845581,
"learning_rate": 2.571975820279906e-06,
"loss": 0.7377,
"step": 406
},
{
"epoch": 0.5822603719599427,
"grad_norm": 1.4912009239196777,
"learning_rate": 2.5694718444053977e-06,
"loss": 0.8098,
"step": 407
},
{
"epoch": 0.5836909871244635,
"grad_norm": 1.6244900226593018,
"learning_rate": 2.5669617922467407e-06,
"loss": 0.8304,
"step": 408
},
{
"epoch": 0.5851216022889842,
"grad_norm": 5.3474016189575195,
"learning_rate": 2.5644456780649842e-06,
"loss": 0.8797,
"step": 409
},
{
"epoch": 0.586552217453505,
"grad_norm": 6.614544868469238,
"learning_rate": 2.561923516155619e-06,
"loss": 0.7439,
"step": 410
},
{
"epoch": 0.5879828326180258,
"grad_norm": 8.531089782714844,
"learning_rate": 2.5593953208484957e-06,
"loss": 0.7857,
"step": 411
},
{
"epoch": 0.5894134477825465,
"grad_norm": 3.9704976081848145,
"learning_rate": 2.556861106507745e-06,
"loss": 0.7818,
"step": 412
},
{
"epoch": 0.5908440629470673,
"grad_norm": 19.362394332885742,
"learning_rate": 2.554320887531696e-06,
"loss": 0.7372,
"step": 413
},
{
"epoch": 0.592274678111588,
"grad_norm": 4.459641933441162,
"learning_rate": 2.551774678352791e-06,
"loss": 0.7558,
"step": 414
},
{
"epoch": 0.5937052932761088,
"grad_norm": 3.2259392738342285,
"learning_rate": 2.549222493437509e-06,
"loss": 0.8202,
"step": 415
},
{
"epoch": 0.5951359084406295,
"grad_norm": 8.601910591125488,
"learning_rate": 2.5466643472862773e-06,
"loss": 0.8521,
"step": 416
},
{
"epoch": 0.5965665236051502,
"grad_norm": 36.73543167114258,
"learning_rate": 2.544100254433396e-06,
"loss": 0.884,
"step": 417
},
{
"epoch": 0.597997138769671,
"grad_norm": 4.288121223449707,
"learning_rate": 2.541530229446949e-06,
"loss": 0.8053,
"step": 418
},
{
"epoch": 0.5994277539341917,
"grad_norm": 1.6672669649124146,
"learning_rate": 2.538954286928726e-06,
"loss": 0.7844,
"step": 419
},
{
"epoch": 0.6008583690987125,
"grad_norm": 6.948642253875732,
"learning_rate": 2.5363724415141366e-06,
"loss": 0.8092,
"step": 420
},
{
"epoch": 0.6022889842632332,
"grad_norm": 5.498805999755859,
"learning_rate": 2.5337847078721275e-06,
"loss": 0.8096,
"step": 421
},
{
"epoch": 0.6037195994277539,
"grad_norm": 2.4374125003814697,
"learning_rate": 2.531191100705102e-06,
"loss": 0.8779,
"step": 422
},
{
"epoch": 0.6051502145922747,
"grad_norm": 7.563881874084473,
"learning_rate": 2.5285916347488315e-06,
"loss": 0.8159,
"step": 423
},
{
"epoch": 0.6065808297567954,
"grad_norm": 1.8715702295303345,
"learning_rate": 2.525986324772377e-06,
"loss": 0.7818,
"step": 424
},
{
"epoch": 0.6080114449213162,
"grad_norm": 6.312496185302734,
"learning_rate": 2.5233751855780012e-06,
"loss": 0.7421,
"step": 425
},
{
"epoch": 0.6094420600858369,
"grad_norm": 9.23725414276123,
"learning_rate": 2.5207582320010873e-06,
"loss": 0.8207,
"step": 426
},
{
"epoch": 0.6108726752503576,
"grad_norm": 8.034300804138184,
"learning_rate": 2.518135478910051e-06,
"loss": 0.8379,
"step": 427
},
{
"epoch": 0.6123032904148784,
"grad_norm": 18.190195083618164,
"learning_rate": 2.5155069412062605e-06,
"loss": 0.8071,
"step": 428
},
{
"epoch": 0.6137339055793991,
"grad_norm": 1.6293121576309204,
"learning_rate": 2.51287263382395e-06,
"loss": 0.8994,
"step": 429
},
{
"epoch": 0.6151645207439199,
"grad_norm": 12.373995780944824,
"learning_rate": 2.5102325717301316e-06,
"loss": 0.7766,
"step": 430
},
{
"epoch": 0.6165951359084406,
"grad_norm": 3.5726394653320312,
"learning_rate": 2.507586769924517e-06,
"loss": 0.8163,
"step": 431
},
{
"epoch": 0.6180257510729614,
"grad_norm": 21.729354858398438,
"learning_rate": 2.5049352434394263e-06,
"loss": 0.8227,
"step": 432
},
{
"epoch": 0.6194563662374821,
"grad_norm": 9.771985054016113,
"learning_rate": 2.502278007339705e-06,
"loss": 0.7762,
"step": 433
},
{
"epoch": 0.6208869814020028,
"grad_norm": 4.523687362670898,
"learning_rate": 2.4996150767226375e-06,
"loss": 0.7464,
"step": 434
},
{
"epoch": 0.6223175965665236,
"grad_norm": 5.442951202392578,
"learning_rate": 2.496946466717865e-06,
"loss": 0.7712,
"step": 435
},
{
"epoch": 0.6237482117310443,
"grad_norm": 4.3297600746154785,
"learning_rate": 2.494272192487293e-06,
"loss": 0.7618,
"step": 436
},
{
"epoch": 0.6251788268955651,
"grad_norm": 9.589028358459473,
"learning_rate": 2.4915922692250107e-06,
"loss": 0.8449,
"step": 437
},
{
"epoch": 0.6266094420600858,
"grad_norm": 1.7547855377197266,
"learning_rate": 2.4889067121572023e-06,
"loss": 0.8368,
"step": 438
},
{
"epoch": 0.6280400572246065,
"grad_norm": 32.727203369140625,
"learning_rate": 2.486215536542061e-06,
"loss": 0.7986,
"step": 439
},
{
"epoch": 0.6294706723891274,
"grad_norm": 25.33466911315918,
"learning_rate": 2.4835187576697013e-06,
"loss": 0.8372,
"step": 440
},
{
"epoch": 0.630901287553648,
"grad_norm": 4.323023319244385,
"learning_rate": 2.480816390862075e-06,
"loss": 0.7125,
"step": 441
},
{
"epoch": 0.6323319027181689,
"grad_norm": 1.7013431787490845,
"learning_rate": 2.4781084514728797e-06,
"loss": 0.8322,
"step": 442
},
{
"epoch": 0.6337625178826896,
"grad_norm": 3.388103485107422,
"learning_rate": 2.475394954887476e-06,
"loss": 0.8479,
"step": 443
},
{
"epoch": 0.6351931330472103,
"grad_norm": 1.854880452156067,
"learning_rate": 2.4726759165227963e-06,
"loss": 0.8113,
"step": 444
},
{
"epoch": 0.6366237482117311,
"grad_norm": 17.75172996520996,
"learning_rate": 2.469951351827262e-06,
"loss": 0.913,
"step": 445
},
{
"epoch": 0.6380543633762518,
"grad_norm": 57.98564910888672,
"learning_rate": 2.467221276280689e-06,
"loss": 0.8532,
"step": 446
},
{
"epoch": 0.6394849785407726,
"grad_norm": 16.528905868530273,
"learning_rate": 2.4644857053942066e-06,
"loss": 0.7039,
"step": 447
},
{
"epoch": 0.6409155937052933,
"grad_norm": 2.7571375370025635,
"learning_rate": 2.4617446547101648e-06,
"loss": 0.7315,
"step": 448
},
{
"epoch": 0.642346208869814,
"grad_norm": 1.71315336227417,
"learning_rate": 2.4589981398020472e-06,
"loss": 0.8122,
"step": 449
},
{
"epoch": 0.6437768240343348,
"grad_norm": 10.909632682800293,
"learning_rate": 2.456246176274384e-06,
"loss": 0.8142,
"step": 450
},
{
"epoch": 0.6452074391988555,
"grad_norm": 2.0243256092071533,
"learning_rate": 2.4534887797626616e-06,
"loss": 0.7944,
"step": 451
},
{
"epoch": 0.6466380543633763,
"grad_norm": 3.194434404373169,
"learning_rate": 2.4507259659332335e-06,
"loss": 0.7259,
"step": 452
},
{
"epoch": 0.648068669527897,
"grad_norm": 6.121618270874023,
"learning_rate": 2.447957750483233e-06,
"loss": 0.7809,
"step": 453
},
{
"epoch": 0.6494992846924177,
"grad_norm": 5.905685901641846,
"learning_rate": 2.4451841491404837e-06,
"loss": 0.7678,
"step": 454
},
{
"epoch": 0.6509298998569385,
"grad_norm": 2.6199986934661865,
"learning_rate": 2.4424051776634074e-06,
"loss": 0.858,
"step": 455
},
{
"epoch": 0.6523605150214592,
"grad_norm": 3.9428181648254395,
"learning_rate": 2.4396208518409392e-06,
"loss": 0.8447,
"step": 456
},
{
"epoch": 0.65379113018598,
"grad_norm": 10.59566593170166,
"learning_rate": 2.4368311874924335e-06,
"loss": 0.7262,
"step": 457
},
{
"epoch": 0.6552217453505007,
"grad_norm": 2.320530652999878,
"learning_rate": 2.434036200467577e-06,
"loss": 0.7948,
"step": 458
},
{
"epoch": 0.6566523605150214,
"grad_norm": 1.7014998197555542,
"learning_rate": 2.431235906646297e-06,
"loss": 0.795,
"step": 459
},
{
"epoch": 0.6580829756795422,
"grad_norm": 3.835496187210083,
"learning_rate": 2.4284303219386723e-06,
"loss": 0.791,
"step": 460
},
{
"epoch": 0.6595135908440629,
"grad_norm": 2.4570517539978027,
"learning_rate": 2.4256194622848413e-06,
"loss": 0.7939,
"step": 461
},
{
"epoch": 0.6609442060085837,
"grad_norm": 2.4925966262817383,
"learning_rate": 2.4228033436549135e-06,
"loss": 0.7902,
"step": 462
},
{
"epoch": 0.6623748211731044,
"grad_norm": 4.428621292114258,
"learning_rate": 2.4199819820488774e-06,
"loss": 0.7936,
"step": 463
},
{
"epoch": 0.6638054363376252,
"grad_norm": 2.6506454944610596,
"learning_rate": 2.417155393496509e-06,
"loss": 0.7503,
"step": 464
},
{
"epoch": 0.6652360515021459,
"grad_norm": 5.980336666107178,
"learning_rate": 2.4143235940572825e-06,
"loss": 0.7956,
"step": 465
},
{
"epoch": 0.6666666666666666,
"grad_norm": 5.393733978271484,
"learning_rate": 2.4114865998202785e-06,
"loss": 0.8161,
"step": 466
},
{
"epoch": 0.6680972818311874,
"grad_norm": 7.725034713745117,
"learning_rate": 2.4086444269040905e-06,
"loss": 0.835,
"step": 467
},
{
"epoch": 0.6695278969957081,
"grad_norm": 10.333250999450684,
"learning_rate": 2.4057970914567367e-06,
"loss": 0.8684,
"step": 468
},
{
"epoch": 0.670958512160229,
"grad_norm": 7.202269554138184,
"learning_rate": 2.4029446096555665e-06,
"loss": 0.7689,
"step": 469
},
{
"epoch": 0.6723891273247496,
"grad_norm": 3.6261048316955566,
"learning_rate": 2.4000869977071677e-06,
"loss": 0.846,
"step": 470
},
{
"epoch": 0.6738197424892703,
"grad_norm": 6.523050785064697,
"learning_rate": 2.3972242718472758e-06,
"loss": 0.854,
"step": 471
},
{
"epoch": 0.6752503576537912,
"grad_norm": 6.202769756317139,
"learning_rate": 2.3943564483406825e-06,
"loss": 0.7847,
"step": 472
},
{
"epoch": 0.6766809728183119,
"grad_norm": 2.8411245346069336,
"learning_rate": 2.391483543481141e-06,
"loss": 0.7264,
"step": 473
},
{
"epoch": 0.6781115879828327,
"grad_norm": 4.087616920471191,
"learning_rate": 2.388605573591273e-06,
"loss": 0.832,
"step": 474
},
{
"epoch": 0.6795422031473534,
"grad_norm": 3.4109015464782715,
"learning_rate": 2.385722555022482e-06,
"loss": 0.7944,
"step": 475
},
{
"epoch": 0.6809728183118741,
"grad_norm": 2.6394965648651123,
"learning_rate": 2.382834504154852e-06,
"loss": 0.7663,
"step": 476
},
{
"epoch": 0.6824034334763949,
"grad_norm": 1.812250018119812,
"learning_rate": 2.3799414373970595e-06,
"loss": 0.7917,
"step": 477
},
{
"epoch": 0.6838340486409156,
"grad_norm": 3.3612866401672363,
"learning_rate": 2.3770433711862792e-06,
"loss": 0.8315,
"step": 478
},
{
"epoch": 0.6852646638054364,
"grad_norm": 1.5638633966445923,
"learning_rate": 2.3741403219880914e-06,
"loss": 0.8377,
"step": 479
},
{
"epoch": 0.6866952789699571,
"grad_norm": 2.297668695449829,
"learning_rate": 2.3712323062963865e-06,
"loss": 0.7572,
"step": 480
},
{
"epoch": 0.6881258941344778,
"grad_norm": 6.158240795135498,
"learning_rate": 2.3683193406332724e-06,
"loss": 0.8389,
"step": 481
},
{
"epoch": 0.6895565092989986,
"grad_norm": 2.0638744831085205,
"learning_rate": 2.3654014415489823e-06,
"loss": 0.7253,
"step": 482
},
{
"epoch": 0.6909871244635193,
"grad_norm": 2.175001382827759,
"learning_rate": 2.362478625621777e-06,
"loss": 0.8104,
"step": 483
},
{
"epoch": 0.6924177396280401,
"grad_norm": 1.4559204578399658,
"learning_rate": 2.3595509094578526e-06,
"loss": 0.7884,
"step": 484
},
{
"epoch": 0.6938483547925608,
"grad_norm": 5.243185520172119,
"learning_rate": 2.3566183096912486e-06,
"loss": 0.7642,
"step": 485
},
{
"epoch": 0.6952789699570815,
"grad_norm": 2.665585994720459,
"learning_rate": 2.353680842983749e-06,
"loss": 0.7022,
"step": 486
},
{
"epoch": 0.6967095851216023,
"grad_norm": 2.043215274810791,
"learning_rate": 2.35073852602479e-06,
"loss": 0.8458,
"step": 487
},
{
"epoch": 0.698140200286123,
"grad_norm": 22.086647033691406,
"learning_rate": 2.347791375531365e-06,
"loss": 0.7665,
"step": 488
},
{
"epoch": 0.6995708154506438,
"grad_norm": 7.558642387390137,
"learning_rate": 2.34483940824793e-06,
"loss": 0.844,
"step": 489
},
{
"epoch": 0.7010014306151645,
"grad_norm": 7.840277671813965,
"learning_rate": 2.341882640946308e-06,
"loss": 0.8423,
"step": 490
},
{
"epoch": 0.7024320457796852,
"grad_norm": 5.021843433380127,
"learning_rate": 2.3389210904255924e-06,
"loss": 0.8149,
"step": 491
},
{
"epoch": 0.703862660944206,
"grad_norm": 3.7846174240112305,
"learning_rate": 2.3359547735120533e-06,
"loss": 0.8246,
"step": 492
},
{
"epoch": 0.7052932761087267,
"grad_norm": 2.6051504611968994,
"learning_rate": 2.332983707059043e-06,
"loss": 0.7554,
"step": 493
},
{
"epoch": 0.7067238912732475,
"grad_norm": 3.5369930267333984,
"learning_rate": 2.3300079079468966e-06,
"loss": 0.8198,
"step": 494
},
{
"epoch": 0.7081545064377682,
"grad_norm": 3.8711929321289062,
"learning_rate": 2.3270273930828395e-06,
"loss": 0.8471,
"step": 495
},
{
"epoch": 0.709585121602289,
"grad_norm": 7.332760334014893,
"learning_rate": 2.3240421794008887e-06,
"loss": 0.8014,
"step": 496
},
{
"epoch": 0.7110157367668097,
"grad_norm": 10.128408432006836,
"learning_rate": 2.32105228386176e-06,
"loss": 0.8255,
"step": 497
},
{
"epoch": 0.7124463519313304,
"grad_norm": 12.858423233032227,
"learning_rate": 2.318057723452766e-06,
"loss": 0.7532,
"step": 498
},
{
"epoch": 0.7138769670958512,
"grad_norm": 14.100822448730469,
"learning_rate": 2.3150585151877275e-06,
"loss": 0.8493,
"step": 499
},
{
"epoch": 0.7153075822603719,
"grad_norm": 6.1690802574157715,
"learning_rate": 2.312054676106869e-06,
"loss": 0.8536,
"step": 500
},
{
"epoch": 0.7167381974248928,
"grad_norm": 4.373723983764648,
"learning_rate": 2.3090462232767273e-06,
"loss": 0.6945,
"step": 501
},
{
"epoch": 0.7181688125894135,
"grad_norm": 3.3550474643707275,
"learning_rate": 2.306033173790051e-06,
"loss": 0.8152,
"step": 502
},
{
"epoch": 0.7195994277539342,
"grad_norm": 3.153048515319824,
"learning_rate": 2.303015544765706e-06,
"loss": 0.7717,
"step": 503
},
{
"epoch": 0.721030042918455,
"grad_norm": 1.9680156707763672,
"learning_rate": 2.2999933533485773e-06,
"loss": 0.8112,
"step": 504
},
{
"epoch": 0.7224606580829757,
"grad_norm": 1.8092211484909058,
"learning_rate": 2.296966616709471e-06,
"loss": 0.7915,
"step": 505
},
{
"epoch": 0.7238912732474965,
"grad_norm": 2.4597418308258057,
"learning_rate": 2.2939353520450174e-06,
"loss": 0.8475,
"step": 506
},
{
"epoch": 0.7253218884120172,
"grad_norm": 2.957054853439331,
"learning_rate": 2.2908995765775724e-06,
"loss": 0.7414,
"step": 507
},
{
"epoch": 0.7267525035765379,
"grad_norm": 6.677426338195801,
"learning_rate": 2.287859307555122e-06,
"loss": 0.8409,
"step": 508
},
{
"epoch": 0.7281831187410587,
"grad_norm": 1.2464028596878052,
"learning_rate": 2.284814562251181e-06,
"loss": 0.743,
"step": 509
},
{
"epoch": 0.7296137339055794,
"grad_norm": 2.3922863006591797,
"learning_rate": 2.2817653579646976e-06,
"loss": 0.8122,
"step": 510
},
{
"epoch": 0.7310443490701002,
"grad_norm": 2.2561073303222656,
"learning_rate": 2.2787117120199536e-06,
"loss": 0.8087,
"step": 511
},
{
"epoch": 0.7324749642346209,
"grad_norm": 2.277667284011841,
"learning_rate": 2.275653641766466e-06,
"loss": 0.7543,
"step": 512
},
{
"epoch": 0.7339055793991416,
"grad_norm": 4.844744682312012,
"learning_rate": 2.2725911645788896e-06,
"loss": 0.7403,
"step": 513
},
{
"epoch": 0.7353361945636624,
"grad_norm": 2.275442123413086,
"learning_rate": 2.269524297856918e-06,
"loss": 0.8568,
"step": 514
},
{
"epoch": 0.7367668097281831,
"grad_norm": 1.839421272277832,
"learning_rate": 2.266453059025182e-06,
"loss": 0.8456,
"step": 515
},
{
"epoch": 0.7381974248927039,
"grad_norm": 5.043838977813721,
"learning_rate": 2.2633774655331557e-06,
"loss": 0.8047,
"step": 516
},
{
"epoch": 0.7396280400572246,
"grad_norm": 28.97209930419922,
"learning_rate": 2.2602975348550526e-06,
"loss": 0.7526,
"step": 517
},
{
"epoch": 0.7410586552217453,
"grad_norm": 1.293421983718872,
"learning_rate": 2.2572132844897287e-06,
"loss": 0.7508,
"step": 518
},
{
"epoch": 0.7424892703862661,
"grad_norm": 3.2048988342285156,
"learning_rate": 2.2541247319605834e-06,
"loss": 0.8266,
"step": 519
},
{
"epoch": 0.7439198855507868,
"grad_norm": 2.890925884246826,
"learning_rate": 2.251031894815458e-06,
"loss": 0.8708,
"step": 520
},
{
"epoch": 0.7453505007153076,
"grad_norm": 11.851993560791016,
"learning_rate": 2.2479347906265375e-06,
"loss": 0.8088,
"step": 521
},
{
"epoch": 0.7467811158798283,
"grad_norm": 9.119662284851074,
"learning_rate": 2.2448334369902512e-06,
"loss": 0.7403,
"step": 522
},
{
"epoch": 0.748211731044349,
"grad_norm": 2.4148502349853516,
"learning_rate": 2.2417278515271717e-06,
"loss": 0.8282,
"step": 523
},
{
"epoch": 0.7496423462088698,
"grad_norm": 4.801558494567871,
"learning_rate": 2.2386180518819133e-06,
"loss": 0.8236,
"step": 524
},
{
"epoch": 0.7510729613733905,
"grad_norm": 6.738923072814941,
"learning_rate": 2.2355040557230362e-06,
"loss": 0.8058,
"step": 525
},
{
"epoch": 0.7510729613733905,
"eval_loss": 0.9472324252128601,
"eval_runtime": 64.2532,
"eval_samples_per_second": 6.459,
"eval_steps_per_second": 0.405,
"step": 525
},
{
"epoch": 0.7525035765379113,
"grad_norm": 2.7826318740844727,
"learning_rate": 2.232385880742942e-06,
"loss": 0.8036,
"step": 526
},
{
"epoch": 0.753934191702432,
"grad_norm": 2.154601573944092,
"learning_rate": 2.229263544657774e-06,
"loss": 0.7827,
"step": 527
},
{
"epoch": 0.7553648068669528,
"grad_norm": 5.298775672912598,
"learning_rate": 2.226137065207318e-06,
"loss": 0.8632,
"step": 528
},
{
"epoch": 0.7567954220314735,
"grad_norm": 4.424346446990967,
"learning_rate": 2.223006460154901e-06,
"loss": 0.84,
"step": 529
},
{
"epoch": 0.7582260371959942,
"grad_norm": 1.773184061050415,
"learning_rate": 2.219871747287289e-06,
"loss": 0.7129,
"step": 530
},
{
"epoch": 0.759656652360515,
"grad_norm": 2.4234611988067627,
"learning_rate": 2.216732944414588e-06,
"loss": 0.8844,
"step": 531
},
{
"epoch": 0.7610872675250357,
"grad_norm": 8.778568267822266,
"learning_rate": 2.2135900693701396e-06,
"loss": 0.7412,
"step": 532
},
{
"epoch": 0.7625178826895566,
"grad_norm": 2.856816530227661,
"learning_rate": 2.210443140010424e-06,
"loss": 0.8266,
"step": 533
},
{
"epoch": 0.7639484978540773,
"grad_norm": 1.9950830936431885,
"learning_rate": 2.2072921742149547e-06,
"loss": 0.7138,
"step": 534
},
{
"epoch": 0.765379113018598,
"grad_norm": 4.11127233505249,
"learning_rate": 2.2041371898861797e-06,
"loss": 0.7274,
"step": 535
},
{
"epoch": 0.7668097281831188,
"grad_norm": 6.386568069458008,
"learning_rate": 2.2009782049493786e-06,
"loss": 0.7266,
"step": 536
},
{
"epoch": 0.7682403433476395,
"grad_norm": 52.481101989746094,
"learning_rate": 2.197815237352559e-06,
"loss": 0.7578,
"step": 537
},
{
"epoch": 0.7696709585121603,
"grad_norm": 1.5614707469940186,
"learning_rate": 2.1946483050663577e-06,
"loss": 0.7825,
"step": 538
},
{
"epoch": 0.771101573676681,
"grad_norm": 1.0652791261672974,
"learning_rate": 2.191477426083938e-06,
"loss": 0.7794,
"step": 539
},
{
"epoch": 0.7725321888412017,
"grad_norm": 3.390814781188965,
"learning_rate": 2.188302618420884e-06,
"loss": 0.7919,
"step": 540
},
{
"epoch": 0.7739628040057225,
"grad_norm": 1.7528146505355835,
"learning_rate": 2.1851239001151045e-06,
"loss": 0.8441,
"step": 541
},
{
"epoch": 0.7753934191702432,
"grad_norm": 2.2587413787841797,
"learning_rate": 2.181941289226724e-06,
"loss": 0.7683,
"step": 542
},
{
"epoch": 0.776824034334764,
"grad_norm": 1.5743305683135986,
"learning_rate": 2.178754803837983e-06,
"loss": 0.7909,
"step": 543
},
{
"epoch": 0.7782546494992847,
"grad_norm": 1.3611418008804321,
"learning_rate": 2.1755644620531374e-06,
"loss": 0.7889,
"step": 544
},
{
"epoch": 0.7796852646638054,
"grad_norm": 4.035607814788818,
"learning_rate": 2.172370281998352e-06,
"loss": 0.8698,
"step": 545
},
{
"epoch": 0.7811158798283262,
"grad_norm": 2.1121273040771484,
"learning_rate": 2.169172281821599e-06,
"loss": 0.8374,
"step": 546
},
{
"epoch": 0.7825464949928469,
"grad_norm": 2.133861541748047,
"learning_rate": 2.1659704796925556e-06,
"loss": 0.7605,
"step": 547
},
{
"epoch": 0.7839771101573677,
"grad_norm": 3.578118085861206,
"learning_rate": 2.1627648938024992e-06,
"loss": 0.7709,
"step": 548
},
{
"epoch": 0.7854077253218884,
"grad_norm": 2.089550018310547,
"learning_rate": 2.1595555423642063e-06,
"loss": 0.8255,
"step": 549
},
{
"epoch": 0.7868383404864091,
"grad_norm": 11.813118934631348,
"learning_rate": 2.1563424436118457e-06,
"loss": 0.7723,
"step": 550
},
{
"epoch": 0.7882689556509299,
"grad_norm": 3.343435764312744,
"learning_rate": 2.153125615800879e-06,
"loss": 0.7733,
"step": 551
},
{
"epoch": 0.7896995708154506,
"grad_norm": 2.129638433456421,
"learning_rate": 2.149905077207953e-06,
"loss": 0.8172,
"step": 552
},
{
"epoch": 0.7911301859799714,
"grad_norm": 4.0213518142700195,
"learning_rate": 2.146680846130798e-06,
"loss": 0.7916,
"step": 553
},
{
"epoch": 0.7925608011444921,
"grad_norm": 2.1677591800689697,
"learning_rate": 2.1434529408881236e-06,
"loss": 0.7638,
"step": 554
},
{
"epoch": 0.7939914163090128,
"grad_norm": 16.7386417388916,
"learning_rate": 2.1402213798195154e-06,
"loss": 0.8264,
"step": 555
},
{
"epoch": 0.7954220314735336,
"grad_norm": 10.93736457824707,
"learning_rate": 2.136986181285328e-06,
"loss": 0.7442,
"step": 556
},
{
"epoch": 0.7968526466380543,
"grad_norm": 1.9464385509490967,
"learning_rate": 2.133747363666584e-06,
"loss": 0.7404,
"step": 557
},
{
"epoch": 0.7982832618025751,
"grad_norm": 4.377130031585693,
"learning_rate": 2.130504945364867e-06,
"loss": 0.8033,
"step": 558
},
{
"epoch": 0.7997138769670958,
"grad_norm": 1.937048077583313,
"learning_rate": 2.127258944802219e-06,
"loss": 0.6928,
"step": 559
},
{
"epoch": 0.8011444921316166,
"grad_norm": 14.530089378356934,
"learning_rate": 2.124009380421035e-06,
"loss": 0.7674,
"step": 560
},
{
"epoch": 0.8025751072961373,
"grad_norm": 2.9048678874969482,
"learning_rate": 2.1207562706839576e-06,
"loss": 0.8203,
"step": 561
},
{
"epoch": 0.804005722460658,
"grad_norm": 1.4509227275848389,
"learning_rate": 2.117499634073772e-06,
"loss": 0.8966,
"step": 562
},
{
"epoch": 0.8054363376251789,
"grad_norm": 3.235529661178589,
"learning_rate": 2.114239489093303e-06,
"loss": 0.873,
"step": 563
},
{
"epoch": 0.8068669527896996,
"grad_norm": 2.0383121967315674,
"learning_rate": 2.110975854265307e-06,
"loss": 0.7683,
"step": 564
},
{
"epoch": 0.8082975679542204,
"grad_norm": 7.3494086265563965,
"learning_rate": 2.10770874813237e-06,
"loss": 0.8416,
"step": 565
},
{
"epoch": 0.8097281831187411,
"grad_norm": 5.675449848175049,
"learning_rate": 2.104438189256799e-06,
"loss": 0.7911,
"step": 566
},
{
"epoch": 0.8111587982832618,
"grad_norm": 4.777390480041504,
"learning_rate": 2.1011641962205187e-06,
"loss": 0.8528,
"step": 567
},
{
"epoch": 0.8125894134477826,
"grad_norm": 2.8145751953125,
"learning_rate": 2.0978867876249645e-06,
"loss": 0.7943,
"step": 568
},
{
"epoch": 0.8140200286123033,
"grad_norm": 5.517285346984863,
"learning_rate": 2.0946059820909782e-06,
"loss": 0.8388,
"step": 569
},
{
"epoch": 0.8154506437768241,
"grad_norm": 1.663217306137085,
"learning_rate": 2.0913217982587015e-06,
"loss": 0.8075,
"step": 570
},
{
"epoch": 0.8168812589413448,
"grad_norm": 2.412997245788574,
"learning_rate": 2.088034254787471e-06,
"loss": 0.8201,
"step": 571
},
{
"epoch": 0.8183118741058655,
"grad_norm": 2.8591926097869873,
"learning_rate": 2.0847433703557086e-06,
"loss": 0.7948,
"step": 572
},
{
"epoch": 0.8197424892703863,
"grad_norm": 1.8389852046966553,
"learning_rate": 2.0814491636608215e-06,
"loss": 0.8375,
"step": 573
},
{
"epoch": 0.821173104434907,
"grad_norm": 1.8744090795516968,
"learning_rate": 2.0781516534190904e-06,
"loss": 0.8258,
"step": 574
},
{
"epoch": 0.8226037195994278,
"grad_norm": 1.6970771551132202,
"learning_rate": 2.0748508583655664e-06,
"loss": 0.7844,
"step": 575
},
{
"epoch": 0.8240343347639485,
"grad_norm": 2.463869333267212,
"learning_rate": 2.0715467972539623e-06,
"loss": 0.7811,
"step": 576
},
{
"epoch": 0.8254649499284692,
"grad_norm": 5.908227443695068,
"learning_rate": 2.068239488856549e-06,
"loss": 0.7585,
"step": 577
},
{
"epoch": 0.82689556509299,
"grad_norm": 2.3920488357543945,
"learning_rate": 2.0649289519640455e-06,
"loss": 0.7492,
"step": 578
},
{
"epoch": 0.8283261802575107,
"grad_norm": 3.978250503540039,
"learning_rate": 2.0616152053855146e-06,
"loss": 0.7396,
"step": 579
},
{
"epoch": 0.8297567954220315,
"grad_norm": 7.113371849060059,
"learning_rate": 2.0582982679482547e-06,
"loss": 0.8467,
"step": 580
},
{
"epoch": 0.8311874105865522,
"grad_norm": 2.1145403385162354,
"learning_rate": 2.0549781584976937e-06,
"loss": 0.8825,
"step": 581
},
{
"epoch": 0.8326180257510729,
"grad_norm": 1.185595989227295,
"learning_rate": 2.0516548958972816e-06,
"loss": 0.769,
"step": 582
},
{
"epoch": 0.8340486409155937,
"grad_norm": 2.6032955646514893,
"learning_rate": 2.0483284990283833e-06,
"loss": 0.791,
"step": 583
},
{
"epoch": 0.8354792560801144,
"grad_norm": 2.5867621898651123,
"learning_rate": 2.0449989867901698e-06,
"loss": 0.8191,
"step": 584
},
{
"epoch": 0.8369098712446352,
"grad_norm": 5.271536827087402,
"learning_rate": 2.041666378099515e-06,
"loss": 0.777,
"step": 585
},
{
"epoch": 0.8383404864091559,
"grad_norm": 1.7477766275405884,
"learning_rate": 2.0383306918908827e-06,
"loss": 0.7011,
"step": 586
},
{
"epoch": 0.8397711015736766,
"grad_norm": 3.4318323135375977,
"learning_rate": 2.0349919471162245e-06,
"loss": 0.867,
"step": 587
},
{
"epoch": 0.8412017167381974,
"grad_norm": 8.079400062561035,
"learning_rate": 2.031650162744867e-06,
"loss": 0.8089,
"step": 588
},
{
"epoch": 0.8426323319027181,
"grad_norm": 2.9076359272003174,
"learning_rate": 2.028305357763408e-06,
"loss": 0.8009,
"step": 589
},
{
"epoch": 0.844062947067239,
"grad_norm": 1.9895862340927124,
"learning_rate": 2.024957551175607e-06,
"loss": 0.9391,
"step": 590
},
{
"epoch": 0.8454935622317596,
"grad_norm": 1.4979381561279297,
"learning_rate": 2.0216067620022773e-06,
"loss": 0.6863,
"step": 591
},
{
"epoch": 0.8469241773962805,
"grad_norm": 1.5766220092773438,
"learning_rate": 2.0182530092811776e-06,
"loss": 0.8043,
"step": 592
},
{
"epoch": 0.8483547925608012,
"grad_norm": 62.486412048339844,
"learning_rate": 2.0148963120669043e-06,
"loss": 0.7341,
"step": 593
},
{
"epoch": 0.8497854077253219,
"grad_norm": 6.319479942321777,
"learning_rate": 2.0115366894307833e-06,
"loss": 0.8319,
"step": 594
},
{
"epoch": 0.8512160228898427,
"grad_norm": 4.788107395172119,
"learning_rate": 2.0081741604607617e-06,
"loss": 0.8415,
"step": 595
},
{
"epoch": 0.8526466380543634,
"grad_norm": 5.281350135803223,
"learning_rate": 2.004808744261299e-06,
"loss": 0.8006,
"step": 596
},
{
"epoch": 0.8540772532188842,
"grad_norm": 8.550089836120605,
"learning_rate": 2.001440459953258e-06,
"loss": 0.8473,
"step": 597
},
{
"epoch": 0.8555078683834049,
"grad_norm": 2.373152732849121,
"learning_rate": 1.998069326673798e-06,
"loss": 0.7599,
"step": 598
},
{
"epoch": 0.8569384835479256,
"grad_norm": 1.6767165660858154,
"learning_rate": 1.994695363576265e-06,
"loss": 0.7986,
"step": 599
},
{
"epoch": 0.8583690987124464,
"grad_norm": 2.729363441467285,
"learning_rate": 1.991318589830081e-06,
"loss": 0.8142,
"step": 600
},
{
"epoch": 0.8597997138769671,
"grad_norm": 9.5516357421875,
"learning_rate": 1.9879390246206394e-06,
"loss": 0.7423,
"step": 601
},
{
"epoch": 0.8612303290414879,
"grad_norm": 1.492181420326233,
"learning_rate": 1.9845566871491923e-06,
"loss": 0.8123,
"step": 602
},
{
"epoch": 0.8626609442060086,
"grad_norm": 1.3610990047454834,
"learning_rate": 1.9811715966327413e-06,
"loss": 0.7944,
"step": 603
},
{
"epoch": 0.8640915593705293,
"grad_norm": 2.7227566242218018,
"learning_rate": 1.9777837723039323e-06,
"loss": 0.8195,
"step": 604
},
{
"epoch": 0.8655221745350501,
"grad_norm": 6.30021858215332,
"learning_rate": 1.9743932334109423e-06,
"loss": 0.774,
"step": 605
},
{
"epoch": 0.8669527896995708,
"grad_norm": 2.1827492713928223,
"learning_rate": 1.97099999921737e-06,
"loss": 0.7981,
"step": 606
},
{
"epoch": 0.8683834048640916,
"grad_norm": 1.8844138383865356,
"learning_rate": 1.96760408900213e-06,
"loss": 0.7882,
"step": 607
},
{
"epoch": 0.8698140200286123,
"grad_norm": 3.7884268760681152,
"learning_rate": 1.9642055220593394e-06,
"loss": 0.7905,
"step": 608
},
{
"epoch": 0.871244635193133,
"grad_norm": 1.2026420831680298,
"learning_rate": 1.9608043176982095e-06,
"loss": 0.8302,
"step": 609
},
{
"epoch": 0.8726752503576538,
"grad_norm": 3.9259285926818848,
"learning_rate": 1.957400495242938e-06,
"loss": 0.775,
"step": 610
},
{
"epoch": 0.8741058655221745,
"grad_norm": 2.2979843616485596,
"learning_rate": 1.9539940740325953e-06,
"loss": 0.8282,
"step": 611
},
{
"epoch": 0.8755364806866953,
"grad_norm": 25.16666603088379,
"learning_rate": 1.950585073421018e-06,
"loss": 0.7903,
"step": 612
},
{
"epoch": 0.876967095851216,
"grad_norm": 2.016211748123169,
"learning_rate": 1.947173512776699e-06,
"loss": 0.7878,
"step": 613
},
{
"epoch": 0.8783977110157367,
"grad_norm": 3.2067463397979736,
"learning_rate": 1.9437594114826734e-06,
"loss": 0.7854,
"step": 614
},
{
"epoch": 0.8798283261802575,
"grad_norm": 4.444864273071289,
"learning_rate": 1.940342788936413e-06,
"loss": 0.844,
"step": 615
},
{
"epoch": 0.8812589413447782,
"grad_norm": 3.628343105316162,
"learning_rate": 1.9369236645497137e-06,
"loss": 0.7698,
"step": 616
},
{
"epoch": 0.882689556509299,
"grad_norm": 1.8619632720947266,
"learning_rate": 1.933502057748587e-06,
"loss": 0.7731,
"step": 617
},
{
"epoch": 0.8841201716738197,
"grad_norm": 4.017360210418701,
"learning_rate": 1.9300779879731462e-06,
"loss": 0.8335,
"step": 618
},
{
"epoch": 0.8855507868383404,
"grad_norm": 4.365695953369141,
"learning_rate": 1.9266514746775006e-06,
"loss": 0.7448,
"step": 619
},
{
"epoch": 0.8869814020028612,
"grad_norm": 3.6699016094207764,
"learning_rate": 1.9232225373296406e-06,
"loss": 0.8343,
"step": 620
},
{
"epoch": 0.8884120171673819,
"grad_norm": 0.9214816093444824,
"learning_rate": 1.9197911954113295e-06,
"loss": 0.7744,
"step": 621
},
{
"epoch": 0.8898426323319027,
"grad_norm": 9.310022354125977,
"learning_rate": 1.916357468417994e-06,
"loss": 0.8854,
"step": 622
},
{
"epoch": 0.8912732474964234,
"grad_norm": 1.421976923942566,
"learning_rate": 1.9129213758586094e-06,
"loss": 0.8246,
"step": 623
},
{
"epoch": 0.8927038626609443,
"grad_norm": 1.6473592519760132,
"learning_rate": 1.909482937255592e-06,
"loss": 0.8423,
"step": 624
},
{
"epoch": 0.894134477825465,
"grad_norm": 3.704306125640869,
"learning_rate": 1.9060421721446884e-06,
"loss": 0.8118,
"step": 625
},
{
"epoch": 0.8955650929899857,
"grad_norm": 22.0517635345459,
"learning_rate": 1.9025991000748615e-06,
"loss": 0.8045,
"step": 626
},
{
"epoch": 0.8969957081545065,
"grad_norm": 3.9099020957946777,
"learning_rate": 1.8991537406081833e-06,
"loss": 0.8319,
"step": 627
},
{
"epoch": 0.8984263233190272,
"grad_norm": 1.8165937662124634,
"learning_rate": 1.8957061133197202e-06,
"loss": 0.7867,
"step": 628
},
{
"epoch": 0.899856938483548,
"grad_norm": 1.5057600736618042,
"learning_rate": 1.8922562377974244e-06,
"loss": 0.8217,
"step": 629
},
{
"epoch": 0.9012875536480687,
"grad_norm": 3.3929216861724854,
"learning_rate": 1.8888041336420212e-06,
"loss": 0.7126,
"step": 630
},
{
"epoch": 0.9027181688125894,
"grad_norm": 1.0596497058868408,
"learning_rate": 1.8853498204668986e-06,
"loss": 0.7926,
"step": 631
},
{
"epoch": 0.9041487839771102,
"grad_norm": 5.535174369812012,
"learning_rate": 1.881893317897994e-06,
"loss": 0.749,
"step": 632
},
{
"epoch": 0.9055793991416309,
"grad_norm": 5.7785964012146,
"learning_rate": 1.8784346455736855e-06,
"loss": 0.8318,
"step": 633
},
{
"epoch": 0.9070100143061517,
"grad_norm": 1.2321951389312744,
"learning_rate": 1.8749738231446784e-06,
"loss": 0.8232,
"step": 634
},
{
"epoch": 0.9084406294706724,
"grad_norm": 3.309943199157715,
"learning_rate": 1.8715108702738928e-06,
"loss": 0.8027,
"step": 635
},
{
"epoch": 0.9098712446351931,
"grad_norm": 2.805023193359375,
"learning_rate": 1.8680458066363548e-06,
"loss": 0.7425,
"step": 636
},
{
"epoch": 0.9113018597997139,
"grad_norm": 1.852483868598938,
"learning_rate": 1.8645786519190823e-06,
"loss": 0.7809,
"step": 637
},
{
"epoch": 0.9127324749642346,
"grad_norm": 1.6780593395233154,
"learning_rate": 1.8611094258209734e-06,
"loss": 0.7843,
"step": 638
},
{
"epoch": 0.9141630901287554,
"grad_norm": 1.102247953414917,
"learning_rate": 1.857638148052695e-06,
"loss": 0.7515,
"step": 639
},
{
"epoch": 0.9155937052932761,
"grad_norm": 9.121733665466309,
"learning_rate": 1.8541648383365718e-06,
"loss": 0.7945,
"step": 640
},
{
"epoch": 0.9170243204577968,
"grad_norm": 19.972715377807617,
"learning_rate": 1.8506895164064718e-06,
"loss": 0.8476,
"step": 641
},
{
"epoch": 0.9184549356223176,
"grad_norm": 3.2186429500579834,
"learning_rate": 1.8472122020076958e-06,
"loss": 0.6715,
"step": 642
},
{
"epoch": 0.9198855507868383,
"grad_norm": 6.097784042358398,
"learning_rate": 1.8437329148968656e-06,
"loss": 0.7966,
"step": 643
},
{
"epoch": 0.9213161659513591,
"grad_norm": 2.0366463661193848,
"learning_rate": 1.8402516748418104e-06,
"loss": 0.8192,
"step": 644
},
{
"epoch": 0.9227467811158798,
"grad_norm": 2.5847008228302,
"learning_rate": 1.8367685016214566e-06,
"loss": 0.7565,
"step": 645
},
{
"epoch": 0.9241773962804005,
"grad_norm": 9.477577209472656,
"learning_rate": 1.8332834150257114e-06,
"loss": 0.8442,
"step": 646
},
{
"epoch": 0.9256080114449213,
"grad_norm": 7.726278781890869,
"learning_rate": 1.8297964348553555e-06,
"loss": 0.6881,
"step": 647
},
{
"epoch": 0.927038626609442,
"grad_norm": 3.332657814025879,
"learning_rate": 1.8263075809219276e-06,
"loss": 0.8475,
"step": 648
},
{
"epoch": 0.9284692417739628,
"grad_norm": 3.3939545154571533,
"learning_rate": 1.8228168730476105e-06,
"loss": 0.7308,
"step": 649
},
{
"epoch": 0.9298998569384835,
"grad_norm": 1.3222719430923462,
"learning_rate": 1.8193243310651228e-06,
"loss": 0.7714,
"step": 650
},
{
"epoch": 0.9313304721030042,
"grad_norm": 5.846932888031006,
"learning_rate": 1.8158299748176019e-06,
"loss": 0.7393,
"step": 651
},
{
"epoch": 0.932761087267525,
"grad_norm": 1.6963729858398438,
"learning_rate": 1.812333824158494e-06,
"loss": 0.756,
"step": 652
},
{
"epoch": 0.9341917024320457,
"grad_norm": 1.2512105703353882,
"learning_rate": 1.8088358989514405e-06,
"loss": 0.8292,
"step": 653
},
{
"epoch": 0.9356223175965666,
"grad_norm": 4.08266544342041,
"learning_rate": 1.805336219070164e-06,
"loss": 0.7543,
"step": 654
},
{
"epoch": 0.9370529327610873,
"grad_norm": 2.852705955505371,
"learning_rate": 1.8018348043983574e-06,
"loss": 0.7735,
"step": 655
},
{
"epoch": 0.9384835479256081,
"grad_norm": 1.9104331731796265,
"learning_rate": 1.79833167482957e-06,
"loss": 0.7555,
"step": 656
},
{
"epoch": 0.9399141630901288,
"grad_norm": 2.230699300765991,
"learning_rate": 1.7948268502670936e-06,
"loss": 0.8005,
"step": 657
},
{
"epoch": 0.9413447782546495,
"grad_norm": 1.6662317514419556,
"learning_rate": 1.7913203506238506e-06,
"loss": 0.922,
"step": 658
},
{
"epoch": 0.9427753934191703,
"grad_norm": 6.263296604156494,
"learning_rate": 1.787812195822281e-06,
"loss": 0.8096,
"step": 659
},
{
"epoch": 0.944206008583691,
"grad_norm": 3.0145373344421387,
"learning_rate": 1.7843024057942278e-06,
"loss": 0.7369,
"step": 660
},
{
"epoch": 0.9456366237482118,
"grad_norm": 2.3436765670776367,
"learning_rate": 1.7807910004808256e-06,
"loss": 0.761,
"step": 661
},
{
"epoch": 0.9470672389127325,
"grad_norm": 1.2780580520629883,
"learning_rate": 1.7772779998323859e-06,
"loss": 0.8346,
"step": 662
},
{
"epoch": 0.9484978540772532,
"grad_norm": 3.3852529525756836,
"learning_rate": 1.7737634238082838e-06,
"loss": 0.7956,
"step": 663
},
{
"epoch": 0.949928469241774,
"grad_norm": 4.87917947769165,
"learning_rate": 1.7702472923768456e-06,
"loss": 0.8228,
"step": 664
},
{
"epoch": 0.9513590844062947,
"grad_norm": 18.506868362426758,
"learning_rate": 1.766729625515235e-06,
"loss": 0.7943,
"step": 665
},
{
"epoch": 0.9527896995708155,
"grad_norm": 4.777498245239258,
"learning_rate": 1.7632104432093383e-06,
"loss": 0.7994,
"step": 666
},
{
"epoch": 0.9542203147353362,
"grad_norm": 1.219874382019043,
"learning_rate": 1.7596897654536527e-06,
"loss": 0.8897,
"step": 667
},
{
"epoch": 0.9556509298998569,
"grad_norm": 1.1841962337493896,
"learning_rate": 1.7561676122511722e-06,
"loss": 0.8273,
"step": 668
},
{
"epoch": 0.9570815450643777,
"grad_norm": 3.4952194690704346,
"learning_rate": 1.7526440036132735e-06,
"loss": 0.766,
"step": 669
},
{
"epoch": 0.9585121602288984,
"grad_norm": 1.1049143075942993,
"learning_rate": 1.749118959559601e-06,
"loss": 0.7345,
"step": 670
},
{
"epoch": 0.9599427753934192,
"grad_norm": 1.2833698987960815,
"learning_rate": 1.745592500117957e-06,
"loss": 0.806,
"step": 671
},
{
"epoch": 0.9613733905579399,
"grad_norm": 4.3774518966674805,
"learning_rate": 1.742064645324183e-06,
"loss": 0.7199,
"step": 672
},
{
"epoch": 0.9628040057224606,
"grad_norm": 4.67322301864624,
"learning_rate": 1.7385354152220507e-06,
"loss": 0.8035,
"step": 673
},
{
"epoch": 0.9642346208869814,
"grad_norm": 5.434276580810547,
"learning_rate": 1.7350048298631435e-06,
"loss": 0.8651,
"step": 674
},
{
"epoch": 0.9656652360515021,
"grad_norm": 2.621474027633667,
"learning_rate": 1.731472909306746e-06,
"loss": 0.772,
"step": 675
},
{
"epoch": 0.9670958512160229,
"grad_norm": 2.7498602867126465,
"learning_rate": 1.7279396736197291e-06,
"loss": 0.7756,
"step": 676
},
{
"epoch": 0.9685264663805436,
"grad_norm": 3.2077572345733643,
"learning_rate": 1.7244051428764343e-06,
"loss": 0.7203,
"step": 677
},
{
"epoch": 0.9699570815450643,
"grad_norm": 3.252988338470459,
"learning_rate": 1.7208693371585628e-06,
"loss": 0.8783,
"step": 678
},
{
"epoch": 0.9713876967095851,
"grad_norm": 2.9252920150756836,
"learning_rate": 1.7173322765550588e-06,
"loss": 0.7418,
"step": 679
},
{
"epoch": 0.9728183118741058,
"grad_norm": 3.183591842651367,
"learning_rate": 1.7137939811619956e-06,
"loss": 0.7614,
"step": 680
},
{
"epoch": 0.9742489270386266,
"grad_norm": 3.029395341873169,
"learning_rate": 1.7102544710824628e-06,
"loss": 0.8751,
"step": 681
},
{
"epoch": 0.9756795422031473,
"grad_norm": 5.665907382965088,
"learning_rate": 1.7067137664264521e-06,
"loss": 0.8122,
"step": 682
},
{
"epoch": 0.977110157367668,
"grad_norm": 10.361516952514648,
"learning_rate": 1.7031718873107404e-06,
"loss": 0.8093,
"step": 683
},
{
"epoch": 0.9785407725321889,
"grad_norm": 10.015640258789062,
"learning_rate": 1.699628853858779e-06,
"loss": 0.8042,
"step": 684
},
{
"epoch": 0.9799713876967096,
"grad_norm": 2.4526281356811523,
"learning_rate": 1.6960846862005769e-06,
"loss": 0.6861,
"step": 685
},
{
"epoch": 0.9814020028612304,
"grad_norm": 4.162567138671875,
"learning_rate": 1.692539404472587e-06,
"loss": 0.7906,
"step": 686
},
{
"epoch": 0.9828326180257511,
"grad_norm": 1.5269864797592163,
"learning_rate": 1.6889930288175922e-06,
"loss": 0.8598,
"step": 687
},
{
"epoch": 0.9842632331902719,
"grad_norm": 4.929915428161621,
"learning_rate": 1.6854455793845915e-06,
"loss": 0.785,
"step": 688
},
{
"epoch": 0.9856938483547926,
"grad_norm": 3.590336322784424,
"learning_rate": 1.6818970763286826e-06,
"loss": 0.774,
"step": 689
},
{
"epoch": 0.9871244635193133,
"grad_norm": 8.861334800720215,
"learning_rate": 1.6783475398109513e-06,
"loss": 0.7606,
"step": 690
},
{
"epoch": 0.9885550786838341,
"grad_norm": 1.1489014625549316,
"learning_rate": 1.6747969899983546e-06,
"loss": 0.8077,
"step": 691
},
{
"epoch": 0.9899856938483548,
"grad_norm": 2.970811367034912,
"learning_rate": 1.6712454470636052e-06,
"loss": 0.6827,
"step": 692
},
{
"epoch": 0.9914163090128756,
"grad_norm": 2.4784224033355713,
"learning_rate": 1.6676929311850608e-06,
"loss": 0.7306,
"step": 693
},
{
"epoch": 0.9928469241773963,
"grad_norm": 1.8776549100875854,
"learning_rate": 1.6641394625466055e-06,
"loss": 0.7379,
"step": 694
},
{
"epoch": 0.994277539341917,
"grad_norm": 1.7985637187957764,
"learning_rate": 1.6605850613375356e-06,
"loss": 0.7949,
"step": 695
},
{
"epoch": 0.9957081545064378,
"grad_norm": 3.027981996536255,
"learning_rate": 1.6570297477524488e-06,
"loss": 0.8686,
"step": 696
},
{
"epoch": 0.9971387696709585,
"grad_norm": 1.519041657447815,
"learning_rate": 1.6534735419911228e-06,
"loss": 0.7968,
"step": 697
},
{
"epoch": 0.9985693848354793,
"grad_norm": 3.942765712738037,
"learning_rate": 1.6499164642584074e-06,
"loss": 0.7562,
"step": 698
},
{
"epoch": 1.0,
"grad_norm": 1.1095448732376099,
"learning_rate": 1.6463585347641054e-06,
"loss": 0.8442,
"step": 699
},
{
"epoch": 1.0014306151645207,
"grad_norm": 1.9735312461853027,
"learning_rate": 1.6427997737228582e-06,
"loss": 0.7842,
"step": 700
},
{
"epoch": 1.0014306151645207,
"eval_loss": 0.9359034895896912,
"eval_runtime": 64.0219,
"eval_samples_per_second": 6.482,
"eval_steps_per_second": 0.406,
"step": 700
},
{
"epoch": 1.0028612303290414,
"grad_norm": 4.721752643585205,
"learning_rate": 1.6392402013540328e-06,
"loss": 0.8099,
"step": 701
},
{
"epoch": 1.0042918454935623,
"grad_norm": 2.144127130508423,
"learning_rate": 1.635679837881606e-06,
"loss": 0.8072,
"step": 702
},
{
"epoch": 1.005722460658083,
"grad_norm": 1.4669064283370972,
"learning_rate": 1.6321187035340477e-06,
"loss": 0.7411,
"step": 703
},
{
"epoch": 1.0071530758226037,
"grad_norm": 3.2362279891967773,
"learning_rate": 1.6285568185442092e-06,
"loss": 0.7697,
"step": 704
},
{
"epoch": 1.0085836909871244,
"grad_norm": 3.9374539852142334,
"learning_rate": 1.6249942031492063e-06,
"loss": 0.8036,
"step": 705
},
{
"epoch": 1.0100143061516451,
"grad_norm": 4.1698126792907715,
"learning_rate": 1.6214308775903035e-06,
"loss": 0.8324,
"step": 706
},
{
"epoch": 1.011444921316166,
"grad_norm": 2.475919246673584,
"learning_rate": 1.6178668621128018e-06,
"loss": 0.7851,
"step": 707
},
{
"epoch": 1.0128755364806867,
"grad_norm": 9.091358184814453,
"learning_rate": 1.6143021769659212e-06,
"loss": 0.7688,
"step": 708
},
{
"epoch": 1.0143061516452074,
"grad_norm": 1.0087482929229736,
"learning_rate": 1.6107368424026866e-06,
"loss": 0.8104,
"step": 709
},
{
"epoch": 1.0157367668097281,
"grad_norm": 4.268504619598389,
"learning_rate": 1.6071708786798126e-06,
"loss": 0.8231,
"step": 710
},
{
"epoch": 1.0171673819742488,
"grad_norm": 3.690303087234497,
"learning_rate": 1.6036043060575882e-06,
"loss": 0.7511,
"step": 711
},
{
"epoch": 1.0185979971387698,
"grad_norm": 3.737053871154785,
"learning_rate": 1.6000371447997617e-06,
"loss": 0.8103,
"step": 712
},
{
"epoch": 1.0200286123032904,
"grad_norm": 2.4901950359344482,
"learning_rate": 1.596469415173427e-06,
"loss": 0.8233,
"step": 713
},
{
"epoch": 1.0214592274678111,
"grad_norm": 108.90562438964844,
"learning_rate": 1.5929011374489059e-06,
"loss": 0.7623,
"step": 714
},
{
"epoch": 1.0228898426323318,
"grad_norm": 3.225177049636841,
"learning_rate": 1.5893323318996348e-06,
"loss": 0.8646,
"step": 715
},
{
"epoch": 1.0243204577968525,
"grad_norm": 7.861708164215088,
"learning_rate": 1.5857630188020494e-06,
"loss": 0.8483,
"step": 716
},
{
"epoch": 1.0257510729613735,
"grad_norm": 2.513399600982666,
"learning_rate": 1.5821932184354677e-06,
"loss": 0.8675,
"step": 717
},
{
"epoch": 1.0271816881258942,
"grad_norm": 3.3864715099334717,
"learning_rate": 1.5786229510819777e-06,
"loss": 0.8231,
"step": 718
},
{
"epoch": 1.0286123032904149,
"grad_norm": 8.62854290008545,
"learning_rate": 1.5750522370263203e-06,
"loss": 0.7884,
"step": 719
},
{
"epoch": 1.0300429184549356,
"grad_norm": 4.026301383972168,
"learning_rate": 1.5714810965557728e-06,
"loss": 0.7832,
"step": 720
},
{
"epoch": 1.0314735336194563,
"grad_norm": 5.8504438400268555,
"learning_rate": 1.5679095499600376e-06,
"loss": 0.8102,
"step": 721
},
{
"epoch": 1.0329041487839772,
"grad_norm": 3.6803553104400635,
"learning_rate": 1.5643376175311233e-06,
"loss": 0.7454,
"step": 722
},
{
"epoch": 1.0343347639484979,
"grad_norm": 5.682314395904541,
"learning_rate": 1.5607653195632304e-06,
"loss": 0.7855,
"step": 723
},
{
"epoch": 1.0357653791130186,
"grad_norm": 8.800222396850586,
"learning_rate": 1.5571926763526365e-06,
"loss": 0.7561,
"step": 724
},
{
"epoch": 1.0371959942775393,
"grad_norm": 2.693606376647949,
"learning_rate": 1.5536197081975814e-06,
"loss": 0.8077,
"step": 725
},
{
"epoch": 1.0386266094420602,
"grad_norm": 7.366818428039551,
"learning_rate": 1.5500464353981495e-06,
"loss": 0.758,
"step": 726
},
{
"epoch": 1.040057224606581,
"grad_norm": 2.4745495319366455,
"learning_rate": 1.5464728782561578e-06,
"loss": 0.8134,
"step": 727
},
{
"epoch": 1.0414878397711016,
"grad_norm": 4.274849891662598,
"learning_rate": 1.542899057075038e-06,
"loss": 0.7351,
"step": 728
},
{
"epoch": 1.0429184549356223,
"grad_norm": 2.3312735557556152,
"learning_rate": 1.5393249921597215e-06,
"loss": 0.7486,
"step": 729
},
{
"epoch": 1.044349070100143,
"grad_norm": 2.961493492126465,
"learning_rate": 1.5357507038165258e-06,
"loss": 0.8082,
"step": 730
},
{
"epoch": 1.0457796852646637,
"grad_norm": 3.3071088790893555,
"learning_rate": 1.5321762123530366e-06,
"loss": 0.8408,
"step": 731
},
{
"epoch": 1.0472103004291846,
"grad_norm": 3.7048757076263428,
"learning_rate": 1.5286015380779939e-06,
"loss": 0.6624,
"step": 732
},
{
"epoch": 1.0486409155937053,
"grad_norm": 2.3817408084869385,
"learning_rate": 1.525026701301177e-06,
"loss": 0.7843,
"step": 733
},
{
"epoch": 1.050071530758226,
"grad_norm": 1.2996212244033813,
"learning_rate": 1.5214517223332873e-06,
"loss": 0.6905,
"step": 734
},
{
"epoch": 1.0515021459227467,
"grad_norm": 2.558300018310547,
"learning_rate": 1.5178766214858356e-06,
"loss": 0.7479,
"step": 735
},
{
"epoch": 1.0529327610872676,
"grad_norm": 3.06276273727417,
"learning_rate": 1.5143014190710241e-06,
"loss": 0.826,
"step": 736
},
{
"epoch": 1.0543633762517883,
"grad_norm": 9.476898193359375,
"learning_rate": 1.5107261354016317e-06,
"loss": 0.8496,
"step": 737
},
{
"epoch": 1.055793991416309,
"grad_norm": 1.7778562307357788,
"learning_rate": 1.5071507907909004e-06,
"loss": 0.7557,
"step": 738
},
{
"epoch": 1.0572246065808297,
"grad_norm": 1.8848568201065063,
"learning_rate": 1.503575405552417e-06,
"loss": 0.8162,
"step": 739
},
{
"epoch": 1.0586552217453504,
"grad_norm": 0.8061392307281494,
"learning_rate": 1.5e-06,
"loss": 0.7872,
"step": 740
},
{
"epoch": 1.0600858369098713,
"grad_norm": 5.786372661590576,
"learning_rate": 1.496424594447583e-06,
"loss": 0.8272,
"step": 741
},
{
"epoch": 1.061516452074392,
"grad_norm": 1.484350323677063,
"learning_rate": 1.4928492092091e-06,
"loss": 0.7515,
"step": 742
},
{
"epoch": 1.0629470672389127,
"grad_norm": 3.867645502090454,
"learning_rate": 1.4892738645983686e-06,
"loss": 0.8213,
"step": 743
},
{
"epoch": 1.0643776824034334,
"grad_norm": 2.6978371143341064,
"learning_rate": 1.4856985809289764e-06,
"loss": 0.7573,
"step": 744
},
{
"epoch": 1.0658082975679541,
"grad_norm": 5.597418785095215,
"learning_rate": 1.4821233785141647e-06,
"loss": 0.7814,
"step": 745
},
{
"epoch": 1.067238912732475,
"grad_norm": 2.4046719074249268,
"learning_rate": 1.4785482776667128e-06,
"loss": 0.8052,
"step": 746
},
{
"epoch": 1.0686695278969958,
"grad_norm": 2.482250452041626,
"learning_rate": 1.4749732986988233e-06,
"loss": 0.7652,
"step": 747
},
{
"epoch": 1.0701001430615165,
"grad_norm": 5.594193935394287,
"learning_rate": 1.4713984619220064e-06,
"loss": 0.6645,
"step": 748
},
{
"epoch": 1.0715307582260372,
"grad_norm": 3.4292051792144775,
"learning_rate": 1.4678237876469637e-06,
"loss": 0.7883,
"step": 749
},
{
"epoch": 1.0729613733905579,
"grad_norm": 0.7807212471961975,
"learning_rate": 1.4642492961834743e-06,
"loss": 0.78,
"step": 750
},
{
"epoch": 1.0743919885550788,
"grad_norm": 5.383660316467285,
"learning_rate": 1.4606750078402786e-06,
"loss": 0.7539,
"step": 751
},
{
"epoch": 1.0758226037195995,
"grad_norm": 4.694250583648682,
"learning_rate": 1.4571009429249621e-06,
"loss": 0.7208,
"step": 752
},
{
"epoch": 1.0772532188841202,
"grad_norm": 1.829797387123108,
"learning_rate": 1.4535271217438427e-06,
"loss": 0.763,
"step": 753
},
{
"epoch": 1.0786838340486409,
"grad_norm": 20.187421798706055,
"learning_rate": 1.4499535646018508e-06,
"loss": 0.7726,
"step": 754
},
{
"epoch": 1.0801144492131616,
"grad_norm": 3.4587745666503906,
"learning_rate": 1.446380291802419e-06,
"loss": 0.7618,
"step": 755
},
{
"epoch": 1.0815450643776825,
"grad_norm": 2.4537343978881836,
"learning_rate": 1.4428073236473637e-06,
"loss": 0.8274,
"step": 756
},
{
"epoch": 1.0829756795422032,
"grad_norm": 4.690003395080566,
"learning_rate": 1.4392346804367697e-06,
"loss": 0.7229,
"step": 757
},
{
"epoch": 1.084406294706724,
"grad_norm": 2.620816946029663,
"learning_rate": 1.4356623824688768e-06,
"loss": 0.7523,
"step": 758
},
{
"epoch": 1.0858369098712446,
"grad_norm": 2.812201499938965,
"learning_rate": 1.4320904500399625e-06,
"loss": 0.7251,
"step": 759
},
{
"epoch": 1.0872675250357653,
"grad_norm": 1.717846393585205,
"learning_rate": 1.4285189034442273e-06,
"loss": 0.81,
"step": 760
},
{
"epoch": 1.0886981402002862,
"grad_norm": 3.324570655822754,
"learning_rate": 1.4249477629736802e-06,
"loss": 0.7907,
"step": 761
},
{
"epoch": 1.090128755364807,
"grad_norm": 3.35800838470459,
"learning_rate": 1.4213770489180224e-06,
"loss": 0.7245,
"step": 762
},
{
"epoch": 1.0915593705293276,
"grad_norm": 3.3062188625335693,
"learning_rate": 1.4178067815645326e-06,
"loss": 0.7933,
"step": 763
},
{
"epoch": 1.0929899856938483,
"grad_norm": 16.04672622680664,
"learning_rate": 1.414236981197951e-06,
"loss": 0.7359,
"step": 764
},
{
"epoch": 1.094420600858369,
"grad_norm": 1.6228106021881104,
"learning_rate": 1.4106676681003653e-06,
"loss": 0.806,
"step": 765
},
{
"epoch": 1.09585121602289,
"grad_norm": 5.070892333984375,
"learning_rate": 1.4070988625510942e-06,
"loss": 0.784,
"step": 766
},
{
"epoch": 1.0972818311874106,
"grad_norm": 9.049479484558105,
"learning_rate": 1.403530584826573e-06,
"loss": 0.7501,
"step": 767
},
{
"epoch": 1.0987124463519313,
"grad_norm": 2.303457260131836,
"learning_rate": 1.3999628552002386e-06,
"loss": 0.7539,
"step": 768
},
{
"epoch": 1.100143061516452,
"grad_norm": 4.238282680511475,
"learning_rate": 1.3963956939424123e-06,
"loss": 0.7909,
"step": 769
},
{
"epoch": 1.1015736766809727,
"grad_norm": 5.631208419799805,
"learning_rate": 1.3928291213201877e-06,
"loss": 0.8202,
"step": 770
},
{
"epoch": 1.1030042918454936,
"grad_norm": 1.7331924438476562,
"learning_rate": 1.3892631575973137e-06,
"loss": 0.849,
"step": 771
},
{
"epoch": 1.1044349070100143,
"grad_norm": 4.782192707061768,
"learning_rate": 1.3856978230340789e-06,
"loss": 0.819,
"step": 772
},
{
"epoch": 1.105865522174535,
"grad_norm": 2.9614789485931396,
"learning_rate": 1.3821331378871983e-06,
"loss": 0.8061,
"step": 773
},
{
"epoch": 1.1072961373390557,
"grad_norm": 3.1825926303863525,
"learning_rate": 1.3785691224096972e-06,
"loss": 0.8027,
"step": 774
},
{
"epoch": 1.1087267525035764,
"grad_norm": 1.6604760885238647,
"learning_rate": 1.3750057968507944e-06,
"loss": 0.7238,
"step": 775
},
{
"epoch": 1.1101573676680974,
"grad_norm": 1.8294752836227417,
"learning_rate": 1.3714431814557916e-06,
"loss": 0.8283,
"step": 776
},
{
"epoch": 1.111587982832618,
"grad_norm": 6.401926517486572,
"learning_rate": 1.3678812964659528e-06,
"loss": 0.7288,
"step": 777
},
{
"epoch": 1.1130185979971388,
"grad_norm": 3.224818468093872,
"learning_rate": 1.3643201621183948e-06,
"loss": 0.8541,
"step": 778
},
{
"epoch": 1.1144492131616595,
"grad_norm": 1.71248459815979,
"learning_rate": 1.3607597986459677e-06,
"loss": 0.7835,
"step": 779
},
{
"epoch": 1.1158798283261802,
"grad_norm": 3.652742624282837,
"learning_rate": 1.3572002262771425e-06,
"loss": 0.8003,
"step": 780
},
{
"epoch": 1.117310443490701,
"grad_norm": 0.81279057264328,
"learning_rate": 1.3536414652358953e-06,
"loss": 0.7865,
"step": 781
},
{
"epoch": 1.1187410586552218,
"grad_norm": 6.66923713684082,
"learning_rate": 1.3500835357415933e-06,
"loss": 0.8885,
"step": 782
},
{
"epoch": 1.1201716738197425,
"grad_norm": 3.3524577617645264,
"learning_rate": 1.3465264580088777e-06,
"loss": 0.7786,
"step": 783
},
{
"epoch": 1.1216022889842632,
"grad_norm": 4.309314727783203,
"learning_rate": 1.342970252247552e-06,
"loss": 0.784,
"step": 784
},
{
"epoch": 1.123032904148784,
"grad_norm": 10.110477447509766,
"learning_rate": 1.3394149386624647e-06,
"loss": 0.7979,
"step": 785
},
{
"epoch": 1.1244635193133048,
"grad_norm": 1.8501849174499512,
"learning_rate": 1.3358605374533952e-06,
"loss": 0.8531,
"step": 786
},
{
"epoch": 1.1258941344778255,
"grad_norm": 2.0311262607574463,
"learning_rate": 1.3323070688149395e-06,
"loss": 0.7445,
"step": 787
},
{
"epoch": 1.1273247496423462,
"grad_norm": 1.5606796741485596,
"learning_rate": 1.3287545529363951e-06,
"loss": 0.7768,
"step": 788
},
{
"epoch": 1.128755364806867,
"grad_norm": 4.589614391326904,
"learning_rate": 1.3252030100016462e-06,
"loss": 0.7829,
"step": 789
},
{
"epoch": 1.1301859799713876,
"grad_norm": 1.5389312505722046,
"learning_rate": 1.321652460189049e-06,
"loss": 0.787,
"step": 790
},
{
"epoch": 1.1316165951359085,
"grad_norm": 2.4592175483703613,
"learning_rate": 1.318102923671318e-06,
"loss": 0.8379,
"step": 791
},
{
"epoch": 1.1330472103004292,
"grad_norm": 1.0238618850708008,
"learning_rate": 1.314554420615409e-06,
"loss": 0.7934,
"step": 792
},
{
"epoch": 1.13447782546495,
"grad_norm": 3.073195695877075,
"learning_rate": 1.3110069711824081e-06,
"loss": 0.8114,
"step": 793
},
{
"epoch": 1.1359084406294706,
"grad_norm": 1.4695512056350708,
"learning_rate": 1.3074605955274136e-06,
"loss": 0.7787,
"step": 794
},
{
"epoch": 1.1373390557939915,
"grad_norm": 2.683389663696289,
"learning_rate": 1.3039153137994239e-06,
"loss": 0.7827,
"step": 795
},
{
"epoch": 1.1387696709585122,
"grad_norm": 1.7253704071044922,
"learning_rate": 1.3003711461412214e-06,
"loss": 0.798,
"step": 796
},
{
"epoch": 1.140200286123033,
"grad_norm": 16.745397567749023,
"learning_rate": 1.2968281126892603e-06,
"loss": 0.7709,
"step": 797
},
{
"epoch": 1.1416309012875536,
"grad_norm": 2.683840751647949,
"learning_rate": 1.2932862335735486e-06,
"loss": 0.7775,
"step": 798
},
{
"epoch": 1.1430615164520743,
"grad_norm": 7.146876811981201,
"learning_rate": 1.2897455289175373e-06,
"loss": 0.8856,
"step": 799
},
{
"epoch": 1.144492131616595,
"grad_norm": 1.972984790802002,
"learning_rate": 1.2862060188380051e-06,
"loss": 0.7153,
"step": 800
},
{
"epoch": 1.145922746781116,
"grad_norm": 2.476194143295288,
"learning_rate": 1.2826677234449419e-06,
"loss": 0.8171,
"step": 801
},
{
"epoch": 1.1473533619456366,
"grad_norm": 2.416992425918579,
"learning_rate": 1.2791306628414377e-06,
"loss": 0.814,
"step": 802
},
{
"epoch": 1.1487839771101573,
"grad_norm": 10.751389503479004,
"learning_rate": 1.275594857123566e-06,
"loss": 0.7874,
"step": 803
},
{
"epoch": 1.150214592274678,
"grad_norm": 1.4024333953857422,
"learning_rate": 1.2720603263802716e-06,
"loss": 0.8824,
"step": 804
},
{
"epoch": 1.151645207439199,
"grad_norm": 1.4597464799880981,
"learning_rate": 1.2685270906932546e-06,
"loss": 0.7573,
"step": 805
},
{
"epoch": 1.1530758226037197,
"grad_norm": 2.488672971725464,
"learning_rate": 1.2649951701368566e-06,
"loss": 0.717,
"step": 806
},
{
"epoch": 1.1545064377682404,
"grad_norm": 10.042638778686523,
"learning_rate": 1.2614645847779498e-06,
"loss": 0.7655,
"step": 807
},
{
"epoch": 1.155937052932761,
"grad_norm": 5.5453901290893555,
"learning_rate": 1.2579353546758169e-06,
"loss": 0.707,
"step": 808
},
{
"epoch": 1.1573676680972818,
"grad_norm": 9.400655746459961,
"learning_rate": 1.2544074998820431e-06,
"loss": 0.8075,
"step": 809
},
{
"epoch": 1.1587982832618025,
"grad_norm": 1.1171351671218872,
"learning_rate": 1.2508810404403991e-06,
"loss": 0.7257,
"step": 810
},
{
"epoch": 1.1602288984263234,
"grad_norm": 1.9322105646133423,
"learning_rate": 1.2473559963867266e-06,
"loss": 0.6525,
"step": 811
},
{
"epoch": 1.161659513590844,
"grad_norm": 2.5018885135650635,
"learning_rate": 1.2438323877488274e-06,
"loss": 0.6813,
"step": 812
},
{
"epoch": 1.1630901287553648,
"grad_norm": 4.477802276611328,
"learning_rate": 1.2403102345463473e-06,
"loss": 0.7791,
"step": 813
},
{
"epoch": 1.1645207439198855,
"grad_norm": 1.7652959823608398,
"learning_rate": 1.2367895567906618e-06,
"loss": 0.7778,
"step": 814
},
{
"epoch": 1.1659513590844064,
"grad_norm": 1.8609610795974731,
"learning_rate": 1.233270374484765e-06,
"loss": 0.7831,
"step": 815
},
{
"epoch": 1.167381974248927,
"grad_norm": 5.632737636566162,
"learning_rate": 1.2297527076231542e-06,
"loss": 0.7406,
"step": 816
},
{
"epoch": 1.1688125894134478,
"grad_norm": 4.156643867492676,
"learning_rate": 1.2262365761917163e-06,
"loss": 0.8467,
"step": 817
},
{
"epoch": 1.1702432045779685,
"grad_norm": 6.219330310821533,
"learning_rate": 1.2227220001676142e-06,
"loss": 0.8302,
"step": 818
},
{
"epoch": 1.1716738197424892,
"grad_norm": 3.3409154415130615,
"learning_rate": 1.2192089995191743e-06,
"loss": 0.8674,
"step": 819
},
{
"epoch": 1.17310443490701,
"grad_norm": 3.474548101425171,
"learning_rate": 1.2156975942057719e-06,
"loss": 0.8351,
"step": 820
},
{
"epoch": 1.1745350500715308,
"grad_norm": 3.2273216247558594,
"learning_rate": 1.212187804177719e-06,
"loss": 0.857,
"step": 821
},
{
"epoch": 1.1759656652360515,
"grad_norm": 1.604404091835022,
"learning_rate": 1.2086796493761495e-06,
"loss": 0.8938,
"step": 822
},
{
"epoch": 1.1773962804005722,
"grad_norm": 1.4558448791503906,
"learning_rate": 1.2051731497329063e-06,
"loss": 0.7917,
"step": 823
},
{
"epoch": 1.178826895565093,
"grad_norm": 2.538985013961792,
"learning_rate": 1.2016683251704303e-06,
"loss": 0.7406,
"step": 824
},
{
"epoch": 1.1802575107296138,
"grad_norm": 1.2528947591781616,
"learning_rate": 1.1981651956016425e-06,
"loss": 0.8545,
"step": 825
},
{
"epoch": 1.1816881258941345,
"grad_norm": 1.4131247997283936,
"learning_rate": 1.194663780929836e-06,
"loss": 0.7394,
"step": 826
},
{
"epoch": 1.1831187410586552,
"grad_norm": 16.873014450073242,
"learning_rate": 1.1911641010485598e-06,
"loss": 0.8212,
"step": 827
},
{
"epoch": 1.184549356223176,
"grad_norm": 6.866806507110596,
"learning_rate": 1.187666175841506e-06,
"loss": 0.9203,
"step": 828
},
{
"epoch": 1.1859799713876966,
"grad_norm": 1.7047280073165894,
"learning_rate": 1.184170025182398e-06,
"loss": 0.7769,
"step": 829
},
{
"epoch": 1.1874105865522175,
"grad_norm": 5.180852890014648,
"learning_rate": 1.1806756689348775e-06,
"loss": 0.791,
"step": 830
},
{
"epoch": 1.1888412017167382,
"grad_norm": 3.131958484649658,
"learning_rate": 1.1771831269523896e-06,
"loss": 0.7949,
"step": 831
},
{
"epoch": 1.190271816881259,
"grad_norm": 1.1318491697311401,
"learning_rate": 1.1736924190780725e-06,
"loss": 0.7955,
"step": 832
},
{
"epoch": 1.1917024320457796,
"grad_norm": 6.675893306732178,
"learning_rate": 1.1702035651446442e-06,
"loss": 0.7918,
"step": 833
},
{
"epoch": 1.1931330472103003,
"grad_norm": 8.012784004211426,
"learning_rate": 1.1667165849742884e-06,
"loss": 0.7151,
"step": 834
},
{
"epoch": 1.1945636623748213,
"grad_norm": 1.9865070581436157,
"learning_rate": 1.1632314983785435e-06,
"loss": 0.8307,
"step": 835
},
{
"epoch": 1.195994277539342,
"grad_norm": 6.1861677169799805,
"learning_rate": 1.1597483251581895e-06,
"loss": 0.7981,
"step": 836
},
{
"epoch": 1.1974248927038627,
"grad_norm": 2.7006752490997314,
"learning_rate": 1.1562670851031345e-06,
"loss": 0.8067,
"step": 837
},
{
"epoch": 1.1988555078683834,
"grad_norm": 1.065775752067566,
"learning_rate": 1.1527877979923043e-06,
"loss": 0.759,
"step": 838
},
{
"epoch": 1.200286123032904,
"grad_norm": 1.9265739917755127,
"learning_rate": 1.1493104835935287e-06,
"loss": 0.7376,
"step": 839
},
{
"epoch": 1.201716738197425,
"grad_norm": 1.4268121719360352,
"learning_rate": 1.1458351616634283e-06,
"loss": 0.7874,
"step": 840
},
{
"epoch": 1.2031473533619457,
"grad_norm": 2.659268856048584,
"learning_rate": 1.1423618519473052e-06,
"loss": 0.8201,
"step": 841
},
{
"epoch": 1.2045779685264664,
"grad_norm": 3.1713037490844727,
"learning_rate": 1.1388905741790269e-06,
"loss": 0.8612,
"step": 842
},
{
"epoch": 1.206008583690987,
"grad_norm": 10.63504695892334,
"learning_rate": 1.1354213480809178e-06,
"loss": 0.7408,
"step": 843
},
{
"epoch": 1.207439198855508,
"grad_norm": 5.266157627105713,
"learning_rate": 1.1319541933636455e-06,
"loss": 0.8414,
"step": 844
},
{
"epoch": 1.2088698140200287,
"grad_norm": 2.5737879276275635,
"learning_rate": 1.1284891297261075e-06,
"loss": 0.8581,
"step": 845
},
{
"epoch": 1.2103004291845494,
"grad_norm": 4.128069877624512,
"learning_rate": 1.1250261768553221e-06,
"loss": 0.8162,
"step": 846
},
{
"epoch": 1.21173104434907,
"grad_norm": 2.4845378398895264,
"learning_rate": 1.1215653544263147e-06,
"loss": 0.7017,
"step": 847
},
{
"epoch": 1.2131616595135908,
"grad_norm": 2.9242730140686035,
"learning_rate": 1.118106682102006e-06,
"loss": 0.8214,
"step": 848
},
{
"epoch": 1.2145922746781115,
"grad_norm": 3.1195361614227295,
"learning_rate": 1.1146501795331017e-06,
"loss": 0.8892,
"step": 849
},
{
"epoch": 1.2160228898426324,
"grad_norm": 1.8963371515274048,
"learning_rate": 1.111195866357979e-06,
"loss": 0.7455,
"step": 850
},
{
"epoch": 1.217453505007153,
"grad_norm": 4.30813455581665,
"learning_rate": 1.107743762202576e-06,
"loss": 0.7363,
"step": 851
},
{
"epoch": 1.2188841201716738,
"grad_norm": 1.2631362676620483,
"learning_rate": 1.10429388668028e-06,
"loss": 0.7979,
"step": 852
},
{
"epoch": 1.2203147353361945,
"grad_norm": 1.1063506603240967,
"learning_rate": 1.1008462593918172e-06,
"loss": 0.8217,
"step": 853
},
{
"epoch": 1.2217453505007154,
"grad_norm": 5.987213611602783,
"learning_rate": 1.0974008999251385e-06,
"loss": 0.7839,
"step": 854
},
{
"epoch": 1.2231759656652361,
"grad_norm": 1.2211673259735107,
"learning_rate": 1.0939578278553117e-06,
"loss": 0.7484,
"step": 855
},
{
"epoch": 1.2246065808297568,
"grad_norm": 29.422378540039062,
"learning_rate": 1.0905170627444082e-06,
"loss": 0.7305,
"step": 856
},
{
"epoch": 1.2260371959942775,
"grad_norm": 0.9755321741104126,
"learning_rate": 1.0870786241413909e-06,
"loss": 0.728,
"step": 857
},
{
"epoch": 1.2274678111587982,
"grad_norm": 2.794478178024292,
"learning_rate": 1.083642531582006e-06,
"loss": 0.763,
"step": 858
},
{
"epoch": 1.228898426323319,
"grad_norm": 9.834367752075195,
"learning_rate": 1.0802088045886703e-06,
"loss": 0.7693,
"step": 859
},
{
"epoch": 1.2303290414878398,
"grad_norm": 1.6742088794708252,
"learning_rate": 1.0767774626703599e-06,
"loss": 0.7502,
"step": 860
},
{
"epoch": 1.2317596566523605,
"grad_norm": 1.3184466361999512,
"learning_rate": 1.0733485253224997e-06,
"loss": 0.7145,
"step": 861
},
{
"epoch": 1.2331902718168812,
"grad_norm": 2.6459200382232666,
"learning_rate": 1.069922012026854e-06,
"loss": 0.7881,
"step": 862
},
{
"epoch": 1.234620886981402,
"grad_norm": 2.640869379043579,
"learning_rate": 1.0664979422514134e-06,
"loss": 0.7546,
"step": 863
},
{
"epoch": 1.2360515021459229,
"grad_norm": 3.070185899734497,
"learning_rate": 1.0630763354502864e-06,
"loss": 0.7508,
"step": 864
},
{
"epoch": 1.2374821173104436,
"grad_norm": 1.690168857574463,
"learning_rate": 1.0596572110635875e-06,
"loss": 0.8324,
"step": 865
},
{
"epoch": 1.2389127324749643,
"grad_norm": 2.343522071838379,
"learning_rate": 1.056240588517327e-06,
"loss": 0.8546,
"step": 866
},
{
"epoch": 1.240343347639485,
"grad_norm": 2.629617691040039,
"learning_rate": 1.0528264872233018e-06,
"loss": 0.8052,
"step": 867
},
{
"epoch": 1.2417739628040056,
"grad_norm": 5.790884971618652,
"learning_rate": 1.049414926578982e-06,
"loss": 0.8059,
"step": 868
},
{
"epoch": 1.2432045779685263,
"grad_norm": 3.549689292907715,
"learning_rate": 1.0460059259674048e-06,
"loss": 0.6624,
"step": 869
},
{
"epoch": 1.2446351931330473,
"grad_norm": 6.8801493644714355,
"learning_rate": 1.0425995047570625e-06,
"loss": 0.751,
"step": 870
},
{
"epoch": 1.246065808297568,
"grad_norm": 3.7252180576324463,
"learning_rate": 1.0391956823017906e-06,
"loss": 0.6847,
"step": 871
},
{
"epoch": 1.2474964234620887,
"grad_norm": 3.222304582595825,
"learning_rate": 1.0357944779406609e-06,
"loss": 0.8095,
"step": 872
},
{
"epoch": 1.2489270386266094,
"grad_norm": 1.8582981824874878,
"learning_rate": 1.0323959109978703e-06,
"loss": 0.7937,
"step": 873
},
{
"epoch": 1.2503576537911303,
"grad_norm": 2.6876213550567627,
"learning_rate": 1.0290000007826299e-06,
"loss": 0.7574,
"step": 874
},
{
"epoch": 1.251788268955651,
"grad_norm": 1.8542571067810059,
"learning_rate": 1.0256067665890578e-06,
"loss": 0.7267,
"step": 875
},
{
"epoch": 1.251788268955651,
"eval_loss": 0.9304266571998596,
"eval_runtime": 66.8532,
"eval_samples_per_second": 6.208,
"eval_steps_per_second": 0.389,
"step": 875
},
{
"epoch": 1.2532188841201717,
"grad_norm": 3.313300609588623,
"learning_rate": 1.0222162276960676e-06,
"loss": 0.8148,
"step": 876
},
{
"epoch": 1.2546494992846924,
"grad_norm": 2.9693450927734375,
"learning_rate": 1.0188284033672586e-06,
"loss": 0.737,
"step": 877
},
{
"epoch": 1.256080114449213,
"grad_norm": 1.4272849559783936,
"learning_rate": 1.015443312850808e-06,
"loss": 0.9017,
"step": 878
},
{
"epoch": 1.2575107296137338,
"grad_norm": 1.6904128789901733,
"learning_rate": 1.0120609753793609e-06,
"loss": 0.75,
"step": 879
},
{
"epoch": 1.2589413447782547,
"grad_norm": 4.684359550476074,
"learning_rate": 1.0086814101699191e-06,
"loss": 0.711,
"step": 880
},
{
"epoch": 1.2603719599427754,
"grad_norm": 1.3708410263061523,
"learning_rate": 1.0053046364237354e-06,
"loss": 0.8005,
"step": 881
},
{
"epoch": 1.261802575107296,
"grad_norm": 1.4521434307098389,
"learning_rate": 1.0019306733262022e-06,
"loss": 0.818,
"step": 882
},
{
"epoch": 1.263233190271817,
"grad_norm": 0.9280107617378235,
"learning_rate": 9.985595400467423e-07,
"loss": 0.7696,
"step": 883
},
{
"epoch": 1.2646638054363377,
"grad_norm": 2.259516477584839,
"learning_rate": 9.951912557387014e-07,
"loss": 0.8095,
"step": 884
},
{
"epoch": 1.2660944206008584,
"grad_norm": 1.7881183624267578,
"learning_rate": 9.918258395392388e-07,
"loss": 0.837,
"step": 885
},
{
"epoch": 1.2675250357653791,
"grad_norm": 3.3883907794952393,
"learning_rate": 9.88463310569217e-07,
"loss": 0.8968,
"step": 886
},
{
"epoch": 1.2689556509298998,
"grad_norm": 1.247185230255127,
"learning_rate": 9.851036879330958e-07,
"loss": 0.7996,
"step": 887
},
{
"epoch": 1.2703862660944205,
"grad_norm": 2.4265060424804688,
"learning_rate": 9.817469907188227e-07,
"loss": 0.6631,
"step": 888
},
{
"epoch": 1.2718168812589412,
"grad_norm": 4.242371082305908,
"learning_rate": 9.783932379977228e-07,
"loss": 0.7746,
"step": 889
},
{
"epoch": 1.2732474964234621,
"grad_norm": 4.2158660888671875,
"learning_rate": 9.75042448824393e-07,
"loss": 0.7862,
"step": 890
},
{
"epoch": 1.2746781115879828,
"grad_norm": 2.9039363861083984,
"learning_rate": 9.716946422365922e-07,
"loss": 0.7609,
"step": 891
},
{
"epoch": 1.2761087267525035,
"grad_norm": 4.17219877243042,
"learning_rate": 9.683498372551335e-07,
"loss": 0.7278,
"step": 892
},
{
"epoch": 1.2775393419170245,
"grad_norm": 3.1430556774139404,
"learning_rate": 9.650080528837762e-07,
"loss": 0.8266,
"step": 893
},
{
"epoch": 1.2789699570815452,
"grad_norm": 8.886442184448242,
"learning_rate": 9.616693081091172e-07,
"loss": 0.7685,
"step": 894
},
{
"epoch": 1.2804005722460658,
"grad_norm": 1.9755185842514038,
"learning_rate": 9.58333621900485e-07,
"loss": 0.7883,
"step": 895
},
{
"epoch": 1.2818311874105865,
"grad_norm": 2.893641710281372,
"learning_rate": 9.550010132098303e-07,
"loss": 0.7261,
"step": 896
},
{
"epoch": 1.2832618025751072,
"grad_norm": 1.6755917072296143,
"learning_rate": 9.51671500971617e-07,
"loss": 0.8368,
"step": 897
},
{
"epoch": 1.284692417739628,
"grad_norm": 3.195072889328003,
"learning_rate": 9.483451041027182e-07,
"loss": 0.855,
"step": 898
},
{
"epoch": 1.2861230329041489,
"grad_norm": 1.5989915132522583,
"learning_rate": 9.450218415023063e-07,
"loss": 0.8193,
"step": 899
},
{
"epoch": 1.2875536480686696,
"grad_norm": 4.059481620788574,
"learning_rate": 9.417017320517456e-07,
"loss": 0.7388,
"step": 900
},
{
"epoch": 1.2889842632331903,
"grad_norm": 4.821532249450684,
"learning_rate": 9.383847946144855e-07,
"loss": 0.7063,
"step": 901
},
{
"epoch": 1.290414878397711,
"grad_norm": 16.60176658630371,
"learning_rate": 9.350710480359549e-07,
"loss": 0.7916,
"step": 902
},
{
"epoch": 1.2918454935622319,
"grad_norm": 5.774556636810303,
"learning_rate": 9.317605111434513e-07,
"loss": 0.8476,
"step": 903
},
{
"epoch": 1.2932761087267526,
"grad_norm": 1.998368263244629,
"learning_rate": 9.284532027460378e-07,
"loss": 0.7909,
"step": 904
},
{
"epoch": 1.2947067238912733,
"grad_norm": 2.349731206893921,
"learning_rate": 9.251491416344341e-07,
"loss": 0.8264,
"step": 905
},
{
"epoch": 1.296137339055794,
"grad_norm": 2.667130947113037,
"learning_rate": 9.2184834658091e-07,
"loss": 0.6402,
"step": 906
},
{
"epoch": 1.2975679542203147,
"grad_norm": 1.8666576147079468,
"learning_rate": 9.185508363391787e-07,
"loss": 0.8442,
"step": 907
},
{
"epoch": 1.2989985693848354,
"grad_norm": 2.20011043548584,
"learning_rate": 9.152566296442919e-07,
"loss": 0.8345,
"step": 908
},
{
"epoch": 1.3004291845493563,
"grad_norm": 1.1894949674606323,
"learning_rate": 9.119657452125299e-07,
"loss": 0.8069,
"step": 909
},
{
"epoch": 1.301859799713877,
"grad_norm": 2.52988862991333,
"learning_rate": 9.086782017412988e-07,
"loss": 0.7534,
"step": 910
},
{
"epoch": 1.3032904148783977,
"grad_norm": 3.5195047855377197,
"learning_rate": 9.053940179090225e-07,
"loss": 0.7125,
"step": 911
},
{
"epoch": 1.3047210300429184,
"grad_norm": 3.777909994125366,
"learning_rate": 9.021132123750361e-07,
"loss": 0.7886,
"step": 912
},
{
"epoch": 1.3061516452074393,
"grad_norm": 3.459988832473755,
"learning_rate": 8.988358037794821e-07,
"loss": 0.8223,
"step": 913
},
{
"epoch": 1.30758226037196,
"grad_norm": 1.278838038444519,
"learning_rate": 8.955618107432014e-07,
"loss": 0.8042,
"step": 914
},
{
"epoch": 1.3090128755364807,
"grad_norm": 2.881751775741577,
"learning_rate": 8.922912518676302e-07,
"loss": 0.8053,
"step": 915
},
{
"epoch": 1.3104434907010014,
"grad_norm": 3.7574193477630615,
"learning_rate": 8.890241457346934e-07,
"loss": 0.7679,
"step": 916
},
{
"epoch": 1.311874105865522,
"grad_norm": 6.835153102874756,
"learning_rate": 8.857605109066977e-07,
"loss": 0.757,
"step": 917
},
{
"epoch": 1.3133047210300428,
"grad_norm": 2.608959913253784,
"learning_rate": 8.825003659262284e-07,
"loss": 0.7314,
"step": 918
},
{
"epoch": 1.3147353361945637,
"grad_norm": 2.777501344680786,
"learning_rate": 8.792437293160431e-07,
"loss": 0.7734,
"step": 919
},
{
"epoch": 1.3161659513590844,
"grad_norm": 1.376322627067566,
"learning_rate": 8.759906195789654e-07,
"loss": 0.8299,
"step": 920
},
{
"epoch": 1.3175965665236051,
"grad_norm": 5.208398342132568,
"learning_rate": 8.727410551977812e-07,
"loss": 0.6947,
"step": 921
},
{
"epoch": 1.3190271816881258,
"grad_norm": 1.6894828081130981,
"learning_rate": 8.694950546351335e-07,
"loss": 0.7012,
"step": 922
},
{
"epoch": 1.3204577968526467,
"grad_norm": 4.928938388824463,
"learning_rate": 8.662526363334164e-07,
"loss": 0.818,
"step": 923
},
{
"epoch": 1.3218884120171674,
"grad_norm": 1.4015679359436035,
"learning_rate": 8.630138187146725e-07,
"loss": 0.7557,
"step": 924
},
{
"epoch": 1.3233190271816881,
"grad_norm": 1.5027586221694946,
"learning_rate": 8.597786201804853e-07,
"loss": 0.8091,
"step": 925
},
{
"epoch": 1.3247496423462088,
"grad_norm": 1.433759331703186,
"learning_rate": 8.56547059111877e-07,
"loss": 0.7719,
"step": 926
},
{
"epoch": 1.3261802575107295,
"grad_norm": 1.4195560216903687,
"learning_rate": 8.533191538692026e-07,
"loss": 0.7916,
"step": 927
},
{
"epoch": 1.3276108726752502,
"grad_norm": 2.4685306549072266,
"learning_rate": 8.500949227920477e-07,
"loss": 0.7753,
"step": 928
},
{
"epoch": 1.3290414878397712,
"grad_norm": 1.3594095706939697,
"learning_rate": 8.468743841991219e-07,
"loss": 0.7694,
"step": 929
},
{
"epoch": 1.3304721030042919,
"grad_norm": 2.4916977882385254,
"learning_rate": 8.436575563881544e-07,
"loss": 0.7889,
"step": 930
},
{
"epoch": 1.3319027181688126,
"grad_norm": 5.942515850067139,
"learning_rate": 8.404444576357943e-07,
"loss": 0.7976,
"step": 931
},
{
"epoch": 1.3333333333333333,
"grad_norm": 1.2734025716781616,
"learning_rate": 8.372351061975014e-07,
"loss": 0.8291,
"step": 932
},
{
"epoch": 1.3347639484978542,
"grad_norm": 4.3545732498168945,
"learning_rate": 8.340295203074449e-07,
"loss": 0.8092,
"step": 933
},
{
"epoch": 1.3361945636623749,
"grad_norm": 2.437654733657837,
"learning_rate": 8.308277181784017e-07,
"loss": 0.7858,
"step": 934
},
{
"epoch": 1.3376251788268956,
"grad_norm": 2.960955858230591,
"learning_rate": 8.27629718001649e-07,
"loss": 0.8502,
"step": 935
},
{
"epoch": 1.3390557939914163,
"grad_norm": 3.9844677448272705,
"learning_rate": 8.244355379468631e-07,
"loss": 0.7174,
"step": 936
},
{
"epoch": 1.340486409155937,
"grad_norm": 3.1742899417877197,
"learning_rate": 8.212451961620176e-07,
"loss": 0.7704,
"step": 937
},
{
"epoch": 1.3419170243204577,
"grad_norm": 2.129551410675049,
"learning_rate": 8.180587107732766e-07,
"loss": 0.7319,
"step": 938
},
{
"epoch": 1.3433476394849786,
"grad_norm": 1.7495276927947998,
"learning_rate": 8.148760998848951e-07,
"loss": 0.7423,
"step": 939
},
{
"epoch": 1.3447782546494993,
"grad_norm": 2.0302577018737793,
"learning_rate": 8.116973815791154e-07,
"loss": 0.7748,
"step": 940
},
{
"epoch": 1.34620886981402,
"grad_norm": 1.8777068853378296,
"learning_rate": 8.085225739160623e-07,
"loss": 0.7707,
"step": 941
},
{
"epoch": 1.3476394849785407,
"grad_norm": 2.8703246116638184,
"learning_rate": 8.053516949336425e-07,
"loss": 0.7156,
"step": 942
},
{
"epoch": 1.3490701001430616,
"grad_norm": 2.731548309326172,
"learning_rate": 8.021847626474412e-07,
"loss": 0.8371,
"step": 943
},
{
"epoch": 1.3505007153075823,
"grad_norm": 4.414968490600586,
"learning_rate": 7.990217950506219e-07,
"loss": 0.7124,
"step": 944
},
{
"epoch": 1.351931330472103,
"grad_norm": 1.4502582550048828,
"learning_rate": 7.958628101138203e-07,
"loss": 0.7313,
"step": 945
},
{
"epoch": 1.3533619456366237,
"grad_norm": 3.5596978664398193,
"learning_rate": 7.927078257850451e-07,
"loss": 0.7698,
"step": 946
},
{
"epoch": 1.3547925608011444,
"grad_norm": 1.3692398071289062,
"learning_rate": 7.895568599895763e-07,
"loss": 0.7405,
"step": 947
},
{
"epoch": 1.356223175965665,
"grad_norm": 2.794085741043091,
"learning_rate": 7.864099306298608e-07,
"loss": 0.775,
"step": 948
},
{
"epoch": 1.357653791130186,
"grad_norm": 5.740682601928711,
"learning_rate": 7.832670555854122e-07,
"loss": 0.8187,
"step": 949
},
{
"epoch": 1.3590844062947067,
"grad_norm": 3.9949023723602295,
"learning_rate": 7.801282527127108e-07,
"loss": 0.797,
"step": 950
},
{
"epoch": 1.3605150214592274,
"grad_norm": 1.2518641948699951,
"learning_rate": 7.769935398450992e-07,
"loss": 0.7613,
"step": 951
},
{
"epoch": 1.3619456366237483,
"grad_norm": 1.4318602085113525,
"learning_rate": 7.738629347926818e-07,
"loss": 0.7331,
"step": 952
},
{
"epoch": 1.363376251788269,
"grad_norm": 2.8898508548736572,
"learning_rate": 7.707364553422264e-07,
"loss": 0.7671,
"step": 953
},
{
"epoch": 1.3648068669527897,
"grad_norm": 4.7733473777771,
"learning_rate": 7.676141192570586e-07,
"loss": 0.8436,
"step": 954
},
{
"epoch": 1.3662374821173104,
"grad_norm": 2.112035036087036,
"learning_rate": 7.644959442769636e-07,
"loss": 0.7985,
"step": 955
},
{
"epoch": 1.3676680972818311,
"grad_norm": 4.145442485809326,
"learning_rate": 7.613819481180869e-07,
"loss": 0.8581,
"step": 956
},
{
"epoch": 1.3690987124463518,
"grad_norm": 1.2203041315078735,
"learning_rate": 7.582721484728289e-07,
"loss": 0.6751,
"step": 957
},
{
"epoch": 1.3705293276108725,
"grad_norm": 2.124601364135742,
"learning_rate": 7.551665630097485e-07,
"loss": 0.8874,
"step": 958
},
{
"epoch": 1.3719599427753935,
"grad_norm": 2.922088623046875,
"learning_rate": 7.520652093734624e-07,
"loss": 0.7966,
"step": 959
},
{
"epoch": 1.3733905579399142,
"grad_norm": 23.925447463989258,
"learning_rate": 7.489681051845424e-07,
"loss": 0.8503,
"step": 960
},
{
"epoch": 1.3748211731044349,
"grad_norm": 1.904219150543213,
"learning_rate": 7.458752680394165e-07,
"loss": 0.7959,
"step": 961
},
{
"epoch": 1.3762517882689558,
"grad_norm": 1.1502995491027832,
"learning_rate": 7.427867155102712e-07,
"loss": 0.7655,
"step": 962
},
{
"epoch": 1.3776824034334765,
"grad_norm": 7.088009357452393,
"learning_rate": 7.397024651449477e-07,
"loss": 0.7752,
"step": 963
},
{
"epoch": 1.3791130185979972,
"grad_norm": 1.1466937065124512,
"learning_rate": 7.366225344668442e-07,
"loss": 0.7847,
"step": 964
},
{
"epoch": 1.3805436337625179,
"grad_norm": 6.192886829376221,
"learning_rate": 7.335469409748178e-07,
"loss": 0.7846,
"step": 965
},
{
"epoch": 1.3819742489270386,
"grad_norm": 4.334934711456299,
"learning_rate": 7.304757021430825e-07,
"loss": 0.6667,
"step": 966
},
{
"epoch": 1.3834048640915593,
"grad_norm": 2.754920482635498,
"learning_rate": 7.2740883542111e-07,
"loss": 0.7744,
"step": 967
},
{
"epoch": 1.3848354792560802,
"grad_norm": 11.55642032623291,
"learning_rate": 7.243463582335341e-07,
"loss": 0.7909,
"step": 968
},
{
"epoch": 1.386266094420601,
"grad_norm": 11.184896469116211,
"learning_rate": 7.212882879800468e-07,
"loss": 0.7766,
"step": 969
},
{
"epoch": 1.3876967095851216,
"grad_norm": 3.4106950759887695,
"learning_rate": 7.182346420353022e-07,
"loss": 0.8393,
"step": 970
},
{
"epoch": 1.3891273247496423,
"grad_norm": 9.028657913208008,
"learning_rate": 7.151854377488189e-07,
"loss": 0.819,
"step": 971
},
{
"epoch": 1.3905579399141632,
"grad_norm": 2.576897144317627,
"learning_rate": 7.121406924448783e-07,
"loss": 0.8373,
"step": 972
},
{
"epoch": 1.391988555078684,
"grad_norm": 1.7887141704559326,
"learning_rate": 7.091004234224274e-07,
"loss": 0.8596,
"step": 973
},
{
"epoch": 1.3934191702432046,
"grad_norm": 2.7552521228790283,
"learning_rate": 7.060646479549828e-07,
"loss": 0.8854,
"step": 974
},
{
"epoch": 1.3948497854077253,
"grad_norm": 1.784921407699585,
"learning_rate": 7.030333832905291e-07,
"loss": 0.731,
"step": 975
},
{
"epoch": 1.396280400572246,
"grad_norm": 4.638574123382568,
"learning_rate": 7.000066466514225e-07,
"loss": 0.7751,
"step": 976
},
{
"epoch": 1.3977110157367667,
"grad_norm": 2.5338118076324463,
"learning_rate": 6.969844552342939e-07,
"loss": 0.7342,
"step": 977
},
{
"epoch": 1.3991416309012876,
"grad_norm": 3.9059221744537354,
"learning_rate": 6.939668262099494e-07,
"loss": 0.8343,
"step": 978
},
{
"epoch": 1.4005722460658083,
"grad_norm": 2.2744622230529785,
"learning_rate": 6.909537767232728e-07,
"loss": 0.8063,
"step": 979
},
{
"epoch": 1.402002861230329,
"grad_norm": 8.209087371826172,
"learning_rate": 6.87945323893131e-07,
"loss": 0.7646,
"step": 980
},
{
"epoch": 1.4034334763948497,
"grad_norm": 1.3790867328643799,
"learning_rate": 6.849414848122728e-07,
"loss": 0.8081,
"step": 981
},
{
"epoch": 1.4048640915593706,
"grad_norm": 3.088000535964966,
"learning_rate": 6.819422765472337e-07,
"loss": 0.867,
"step": 982
},
{
"epoch": 1.4062947067238913,
"grad_norm": 1.6987063884735107,
"learning_rate": 6.789477161382405e-07,
"loss": 0.7473,
"step": 983
},
{
"epoch": 1.407725321888412,
"grad_norm": 1.63704514503479,
"learning_rate": 6.759578205991113e-07,
"loss": 0.7635,
"step": 984
},
{
"epoch": 1.4091559370529327,
"grad_norm": 2.0188796520233154,
"learning_rate": 6.729726069171605e-07,
"loss": 0.7787,
"step": 985
},
{
"epoch": 1.4105865522174534,
"grad_norm": 1.2314823865890503,
"learning_rate": 6.699920920531034e-07,
"loss": 0.7567,
"step": 986
},
{
"epoch": 1.4120171673819741,
"grad_norm": 6.099488258361816,
"learning_rate": 6.670162929409572e-07,
"loss": 0.8228,
"step": 987
},
{
"epoch": 1.413447782546495,
"grad_norm": 2.057798385620117,
"learning_rate": 6.640452264879465e-07,
"loss": 0.7335,
"step": 988
},
{
"epoch": 1.4148783977110158,
"grad_norm": 1.429482340812683,
"learning_rate": 6.61078909574408e-07,
"loss": 0.8056,
"step": 989
},
{
"epoch": 1.4163090128755365,
"grad_norm": 1.7895710468292236,
"learning_rate": 6.581173590536924e-07,
"loss": 0.6972,
"step": 990
},
{
"epoch": 1.4177396280400572,
"grad_norm": 1.1860177516937256,
"learning_rate": 6.551605917520704e-07,
"loss": 0.7852,
"step": 991
},
{
"epoch": 1.419170243204578,
"grad_norm": 2.0887012481689453,
"learning_rate": 6.522086244686351e-07,
"loss": 0.8344,
"step": 992
},
{
"epoch": 1.4206008583690988,
"grad_norm": 1.4745193719863892,
"learning_rate": 6.492614739752104e-07,
"loss": 0.7405,
"step": 993
},
{
"epoch": 1.4220314735336195,
"grad_norm": 17.525732040405273,
"learning_rate": 6.463191570162516e-07,
"loss": 0.8515,
"step": 994
},
{
"epoch": 1.4234620886981402,
"grad_norm": 3.040510892868042,
"learning_rate": 6.433816903087513e-07,
"loss": 0.8162,
"step": 995
},
{
"epoch": 1.4248927038626609,
"grad_norm": 41.955718994140625,
"learning_rate": 6.404490905421474e-07,
"loss": 0.7542,
"step": 996
},
{
"epoch": 1.4263233190271816,
"grad_norm": 2.4796228408813477,
"learning_rate": 6.375213743782236e-07,
"loss": 0.8064,
"step": 997
},
{
"epoch": 1.4277539341917025,
"grad_norm": 3.1929125785827637,
"learning_rate": 6.345985584510177e-07,
"loss": 0.7785,
"step": 998
},
{
"epoch": 1.4291845493562232,
"grad_norm": 1.4972928762435913,
"learning_rate": 6.316806593667274e-07,
"loss": 0.7456,
"step": 999
},
{
"epoch": 1.4306151645207439,
"grad_norm": 1.4708969593048096,
"learning_rate": 6.28767693703614e-07,
"loss": 0.7775,
"step": 1000
},
{
"epoch": 1.4320457796852646,
"grad_norm": 5.087403774261475,
"learning_rate": 6.258596780119087e-07,
"loss": 0.8118,
"step": 1001
},
{
"epoch": 1.4334763948497855,
"grad_norm": 1.518306016921997,
"learning_rate": 6.229566288137212e-07,
"loss": 0.7894,
"step": 1002
},
{
"epoch": 1.4349070100143062,
"grad_norm": 6.86577033996582,
"learning_rate": 6.200585626029412e-07,
"loss": 0.8725,
"step": 1003
},
{
"epoch": 1.436337625178827,
"grad_norm": 1.6665362119674683,
"learning_rate": 6.171654958451484e-07,
"loss": 0.7696,
"step": 1004
},
{
"epoch": 1.4377682403433476,
"grad_norm": 9.65654468536377,
"learning_rate": 6.142774449775181e-07,
"loss": 0.8192,
"step": 1005
},
{
"epoch": 1.4391988555078683,
"grad_norm": 3.128150701522827,
"learning_rate": 6.113944264087269e-07,
"loss": 0.8093,
"step": 1006
},
{
"epoch": 1.440629470672389,
"grad_norm": 3.613922357559204,
"learning_rate": 6.085164565188594e-07,
"loss": 0.7531,
"step": 1007
},
{
"epoch": 1.44206008583691,
"grad_norm": 3.4265799522399902,
"learning_rate": 6.056435516593175e-07,
"loss": 0.7629,
"step": 1008
},
{
"epoch": 1.4434907010014306,
"grad_norm": 3.6590576171875,
"learning_rate": 6.027757281527242e-07,
"loss": 0.747,
"step": 1009
},
{
"epoch": 1.4449213161659513,
"grad_norm": 7.065302848815918,
"learning_rate": 5.999130022928323e-07,
"loss": 0.7662,
"step": 1010
},
{
"epoch": 1.4463519313304722,
"grad_norm": 2.217602491378784,
"learning_rate": 5.970553903444338e-07,
"loss": 0.7692,
"step": 1011
},
{
"epoch": 1.447782546494993,
"grad_norm": 2.588672399520874,
"learning_rate": 5.942029085432636e-07,
"loss": 0.7657,
"step": 1012
},
{
"epoch": 1.4492131616595136,
"grad_norm": 1.4080536365509033,
"learning_rate": 5.913555730959096e-07,
"loss": 0.7697,
"step": 1013
},
{
"epoch": 1.4506437768240343,
"grad_norm": 1.4369243383407593,
"learning_rate": 5.88513400179722e-07,
"loss": 0.7933,
"step": 1014
},
{
"epoch": 1.452074391988555,
"grad_norm": 3.819899320602417,
"learning_rate": 5.856764059427178e-07,
"loss": 0.7487,
"step": 1015
},
{
"epoch": 1.4535050071530757,
"grad_norm": 1.862250804901123,
"learning_rate": 5.828446065034912e-07,
"loss": 0.7765,
"step": 1016
},
{
"epoch": 1.4549356223175964,
"grad_norm": 1.8112690448760986,
"learning_rate": 5.80018017951123e-07,
"loss": 0.8474,
"step": 1017
},
{
"epoch": 1.4563662374821174,
"grad_norm": 2.6052050590515137,
"learning_rate": 5.771966563450868e-07,
"loss": 0.7542,
"step": 1018
},
{
"epoch": 1.457796852646638,
"grad_norm": 8.262088775634766,
"learning_rate": 5.743805377151587e-07,
"loss": 0.7811,
"step": 1019
},
{
"epoch": 1.4592274678111588,
"grad_norm": 2.0252511501312256,
"learning_rate": 5.715696780613279e-07,
"loss": 0.8363,
"step": 1020
},
{
"epoch": 1.4606580829756797,
"grad_norm": 3.229971408843994,
"learning_rate": 5.687640933537032e-07,
"loss": 0.722,
"step": 1021
},
{
"epoch": 1.4620886981402004,
"grad_norm": 2.0548818111419678,
"learning_rate": 5.659637995324229e-07,
"loss": 0.7691,
"step": 1022
},
{
"epoch": 1.463519313304721,
"grad_norm": 2.4392716884613037,
"learning_rate": 5.631688125075667e-07,
"loss": 0.7619,
"step": 1023
},
{
"epoch": 1.4649499284692418,
"grad_norm": 7.450191497802734,
"learning_rate": 5.603791481590612e-07,
"loss": 0.8198,
"step": 1024
},
{
"epoch": 1.4663805436337625,
"grad_norm": 3.3907644748687744,
"learning_rate": 5.575948223365925e-07,
"loss": 0.7469,
"step": 1025
},
{
"epoch": 1.4678111587982832,
"grad_norm": 1.4037667512893677,
"learning_rate": 5.548158508595166e-07,
"loss": 0.7584,
"step": 1026
},
{
"epoch": 1.469241773962804,
"grad_norm": 2.314279556274414,
"learning_rate": 5.520422495167671e-07,
"loss": 0.7725,
"step": 1027
},
{
"epoch": 1.4706723891273248,
"grad_norm": 1.8584074974060059,
"learning_rate": 5.492740340667664e-07,
"loss": 0.7752,
"step": 1028
},
{
"epoch": 1.4721030042918455,
"grad_norm": 1.3589491844177246,
"learning_rate": 5.465112202373385e-07,
"loss": 0.769,
"step": 1029
},
{
"epoch": 1.4735336194563662,
"grad_norm": 0.9419125914573669,
"learning_rate": 5.43753823725616e-07,
"loss": 0.7325,
"step": 1030
},
{
"epoch": 1.474964234620887,
"grad_norm": 1.696077823638916,
"learning_rate": 5.410018601979525e-07,
"loss": 0.7432,
"step": 1031
},
{
"epoch": 1.4763948497854078,
"grad_norm": 3.0386385917663574,
"learning_rate": 5.382553452898354e-07,
"loss": 0.7708,
"step": 1032
},
{
"epoch": 1.4778254649499285,
"grad_norm": 1.6025636196136475,
"learning_rate": 5.355142946057936e-07,
"loss": 0.7812,
"step": 1033
},
{
"epoch": 1.4792560801144492,
"grad_norm": 1.280683994293213,
"learning_rate": 5.327787237193109e-07,
"loss": 0.8416,
"step": 1034
},
{
"epoch": 1.48068669527897,
"grad_norm": 1.0730654001235962,
"learning_rate": 5.300486481727383e-07,
"loss": 0.7834,
"step": 1035
},
{
"epoch": 1.4821173104434906,
"grad_norm": 3.347235679626465,
"learning_rate": 5.273240834772038e-07,
"loss": 0.7814,
"step": 1036
},
{
"epoch": 1.4835479256080115,
"grad_norm": 2.4247682094573975,
"learning_rate": 5.246050451125244e-07,
"loss": 0.795,
"step": 1037
},
{
"epoch": 1.4849785407725322,
"grad_norm": 2.305119752883911,
"learning_rate": 5.218915485271206e-07,
"loss": 0.8216,
"step": 1038
},
{
"epoch": 1.486409155937053,
"grad_norm": 4.8329901695251465,
"learning_rate": 5.191836091379255e-07,
"loss": 0.7352,
"step": 1039
},
{
"epoch": 1.4878397711015736,
"grad_norm": 2.9933252334594727,
"learning_rate": 5.164812423302991e-07,
"loss": 0.7846,
"step": 1040
},
{
"epoch": 1.4892703862660945,
"grad_norm": 2.515021324157715,
"learning_rate": 5.137844634579393e-07,
"loss": 0.8154,
"step": 1041
},
{
"epoch": 1.4907010014306152,
"grad_norm": 1.579648494720459,
"learning_rate": 5.110932878427982e-07,
"loss": 0.7556,
"step": 1042
},
{
"epoch": 1.492131616595136,
"grad_norm": 1.0330595970153809,
"learning_rate": 5.0840773077499e-07,
"loss": 0.8217,
"step": 1043
},
{
"epoch": 1.4935622317596566,
"grad_norm": 4.242507457733154,
"learning_rate": 5.057278075127074e-07,
"loss": 0.8441,
"step": 1044
},
{
"epoch": 1.4949928469241773,
"grad_norm": 3.4716875553131104,
"learning_rate": 5.030535332821356e-07,
"loss": 0.7702,
"step": 1045
},
{
"epoch": 1.496423462088698,
"grad_norm": 14.659123420715332,
"learning_rate": 5.00384923277363e-07,
"loss": 0.7983,
"step": 1046
},
{
"epoch": 1.497854077253219,
"grad_norm": 2.3961856365203857,
"learning_rate": 4.977219926602959e-07,
"loss": 0.8693,
"step": 1047
},
{
"epoch": 1.4992846924177397,
"grad_norm": 3.505021333694458,
"learning_rate": 4.950647565605744e-07,
"loss": 0.8205,
"step": 1048
},
{
"epoch": 1.5007153075822603,
"grad_norm": 1.913122296333313,
"learning_rate": 4.924132300754835e-07,
"loss": 0.8566,
"step": 1049
},
{
"epoch": 1.5021459227467813,
"grad_norm": 1.38058340549469,
"learning_rate": 4.897674282698685e-07,
"loss": 0.7602,
"step": 1050
},
{
"epoch": 1.5021459227467813,
"eval_loss": 0.927307665348053,
"eval_runtime": 63.9053,
"eval_samples_per_second": 6.494,
"eval_steps_per_second": 0.407,
"step": 1050
},
{
"epoch": 1.503576537911302,
"grad_norm": 1.8451398611068726,
"learning_rate": 4.871273661760507e-07,
"loss": 0.8307,
"step": 1051
},
{
"epoch": 1.5050071530758227,
"grad_norm": 1.7605483531951904,
"learning_rate": 4.844930587937399e-07,
"loss": 0.7784,
"step": 1052
},
{
"epoch": 1.5064377682403434,
"grad_norm": 2.6857316493988037,
"learning_rate": 4.818645210899492e-07,
"loss": 0.8508,
"step": 1053
},
{
"epoch": 1.507868383404864,
"grad_norm": 4.418957710266113,
"learning_rate": 4.792417679989133e-07,
"loss": 0.8581,
"step": 1054
},
{
"epoch": 1.5092989985693848,
"grad_norm": 1.829487919807434,
"learning_rate": 4.76624814421999e-07,
"loss": 0.7126,
"step": 1055
},
{
"epoch": 1.5107296137339055,
"grad_norm": 5.856306076049805,
"learning_rate": 4.7401367522762304e-07,
"loss": 0.7673,
"step": 1056
},
{
"epoch": 1.5121602288984262,
"grad_norm": 18.10394859313965,
"learning_rate": 4.714083652511686e-07,
"loss": 0.8228,
"step": 1057
},
{
"epoch": 1.513590844062947,
"grad_norm": 1.5220907926559448,
"learning_rate": 4.6880889929489865e-07,
"loss": 0.8537,
"step": 1058
},
{
"epoch": 1.5150214592274678,
"grad_norm": 8.02856731414795,
"learning_rate": 4.662152921278726e-07,
"loss": 0.8248,
"step": 1059
},
{
"epoch": 1.5164520743919887,
"grad_norm": 2.229973554611206,
"learning_rate": 4.636275584858641e-07,
"loss": 0.8259,
"step": 1060
},
{
"epoch": 1.5178826895565094,
"grad_norm": 7.8895416259765625,
"learning_rate": 4.610457130712745e-07,
"loss": 0.7989,
"step": 1061
},
{
"epoch": 1.51931330472103,
"grad_norm": 4.06100606918335,
"learning_rate": 4.5846977055305117e-07,
"loss": 0.8214,
"step": 1062
},
{
"epoch": 1.5207439198855508,
"grad_norm": 7.079894065856934,
"learning_rate": 4.5589974556660456e-07,
"loss": 0.8546,
"step": 1063
},
{
"epoch": 1.5221745350500715,
"grad_norm": 1.5825847387313843,
"learning_rate": 4.5333565271372316e-07,
"loss": 0.6878,
"step": 1064
},
{
"epoch": 1.5236051502145922,
"grad_norm": 2.358546257019043,
"learning_rate": 4.507775065624916e-07,
"loss": 0.7321,
"step": 1065
},
{
"epoch": 1.525035765379113,
"grad_norm": 3.348055839538574,
"learning_rate": 4.48225321647209e-07,
"loss": 0.7788,
"step": 1066
},
{
"epoch": 1.5264663805436338,
"grad_norm": 4.638083457946777,
"learning_rate": 4.456791124683043e-07,
"loss": 0.7619,
"step": 1067
},
{
"epoch": 1.5278969957081545,
"grad_norm": 2.1720707416534424,
"learning_rate": 4.431388934922545e-07,
"loss": 0.8027,
"step": 1068
},
{
"epoch": 1.5293276108726752,
"grad_norm": 3.190173625946045,
"learning_rate": 4.4060467915150454e-07,
"loss": 0.7065,
"step": 1069
},
{
"epoch": 1.5307582260371961,
"grad_norm": 3.1954455375671387,
"learning_rate": 4.380764838443813e-07,
"loss": 0.7435,
"step": 1070
},
{
"epoch": 1.5321888412017168,
"grad_norm": 2.271794557571411,
"learning_rate": 4.35554321935016e-07,
"loss": 0.7707,
"step": 1071
},
{
"epoch": 1.5336194563662375,
"grad_norm": 1.6862138509750366,
"learning_rate": 4.330382077532594e-07,
"loss": 0.7988,
"step": 1072
},
{
"epoch": 1.5350500715307582,
"grad_norm": 2.501862049102783,
"learning_rate": 4.305281555946025e-07,
"loss": 0.7269,
"step": 1073
},
{
"epoch": 1.536480686695279,
"grad_norm": 6.872259140014648,
"learning_rate": 4.2802417972009416e-07,
"loss": 0.7131,
"step": 1074
},
{
"epoch": 1.5379113018597996,
"grad_norm": 4.220912933349609,
"learning_rate": 4.2552629435625944e-07,
"loss": 0.772,
"step": 1075
},
{
"epoch": 1.5393419170243203,
"grad_norm": 14.172344207763672,
"learning_rate": 4.2303451369502167e-07,
"loss": 0.8208,
"step": 1076
},
{
"epoch": 1.5407725321888412,
"grad_norm": 3.8137381076812744,
"learning_rate": 4.2054885189361833e-07,
"loss": 0.7236,
"step": 1077
},
{
"epoch": 1.542203147353362,
"grad_norm": 1.7150403261184692,
"learning_rate": 4.1806932307452187e-07,
"loss": 0.7771,
"step": 1078
},
{
"epoch": 1.5436337625178826,
"grad_norm": 2.4068055152893066,
"learning_rate": 4.1559594132536164e-07,
"loss": 0.8226,
"step": 1079
},
{
"epoch": 1.5450643776824036,
"grad_norm": 6.004452228546143,
"learning_rate": 4.1312872069884015e-07,
"loss": 0.7727,
"step": 1080
},
{
"epoch": 1.5464949928469243,
"grad_norm": 1.6341569423675537,
"learning_rate": 4.1066767521265524e-07,
"loss": 0.7553,
"step": 1081
},
{
"epoch": 1.547925608011445,
"grad_norm": 2.0458455085754395,
"learning_rate": 4.0821281884942145e-07,
"loss": 0.8625,
"step": 1082
},
{
"epoch": 1.5493562231759657,
"grad_norm": 1.1154307126998901,
"learning_rate": 4.05764165556588e-07,
"loss": 0.725,
"step": 1083
},
{
"epoch": 1.5507868383404864,
"grad_norm": 4.723168849945068,
"learning_rate": 4.033217292463613e-07,
"loss": 0.8132,
"step": 1084
},
{
"epoch": 1.552217453505007,
"grad_norm": 2.8204376697540283,
"learning_rate": 4.008855237956261e-07,
"loss": 0.7391,
"step": 1085
},
{
"epoch": 1.5536480686695278,
"grad_norm": 5.638607501983643,
"learning_rate": 3.9845556304586554e-07,
"loss": 0.862,
"step": 1086
},
{
"epoch": 1.5550786838340487,
"grad_norm": 2.7092974185943604,
"learning_rate": 3.9603186080308253e-07,
"loss": 0.7355,
"step": 1087
},
{
"epoch": 1.5565092989985694,
"grad_norm": 2.155038356781006,
"learning_rate": 3.936144308377229e-07,
"loss": 0.7857,
"step": 1088
},
{
"epoch": 1.55793991416309,
"grad_norm": 1.2008074522018433,
"learning_rate": 3.9120328688459554e-07,
"loss": 0.7398,
"step": 1089
},
{
"epoch": 1.559370529327611,
"grad_norm": 2.4443776607513428,
"learning_rate": 3.887984426427943e-07,
"loss": 0.6986,
"step": 1090
},
{
"epoch": 1.5608011444921317,
"grad_norm": 1.3269106149673462,
"learning_rate": 3.863999117756221e-07,
"loss": 0.8451,
"step": 1091
},
{
"epoch": 1.5622317596566524,
"grad_norm": 2.9758455753326416,
"learning_rate": 3.8400770791051087e-07,
"loss": 0.8204,
"step": 1092
},
{
"epoch": 1.563662374821173,
"grad_norm": 1.9098341464996338,
"learning_rate": 3.8162184463894503e-07,
"loss": 0.7557,
"step": 1093
},
{
"epoch": 1.5650929899856938,
"grad_norm": 1.6109716892242432,
"learning_rate": 3.7924233551638575e-07,
"loss": 0.7489,
"step": 1094
},
{
"epoch": 1.5665236051502145,
"grad_norm": 1.1516788005828857,
"learning_rate": 3.768691940621913e-07,
"loss": 0.7758,
"step": 1095
},
{
"epoch": 1.5679542203147352,
"grad_norm": 0.825720489025116,
"learning_rate": 3.745024337595418e-07,
"loss": 0.7843,
"step": 1096
},
{
"epoch": 1.5693848354792561,
"grad_norm": 1.0096527338027954,
"learning_rate": 3.721420680553634e-07,
"loss": 0.7708,
"step": 1097
},
{
"epoch": 1.5708154506437768,
"grad_norm": 1.44577956199646,
"learning_rate": 3.697881103602497e-07,
"loss": 0.7596,
"step": 1098
},
{
"epoch": 1.5722460658082977,
"grad_norm": 2.351330518722534,
"learning_rate": 3.674405740483868e-07,
"loss": 0.7222,
"step": 1099
},
{
"epoch": 1.5736766809728184,
"grad_norm": 3.4750962257385254,
"learning_rate": 3.6509947245747826e-07,
"loss": 0.7588,
"step": 1100
},
{
"epoch": 1.5751072961373391,
"grad_norm": 2.525667667388916,
"learning_rate": 3.627648188886674e-07,
"loss": 0.841,
"step": 1101
},
{
"epoch": 1.5765379113018598,
"grad_norm": 6.404210567474365,
"learning_rate": 3.604366266064625e-07,
"loss": 0.7888,
"step": 1102
},
{
"epoch": 1.5779685264663805,
"grad_norm": 1.4588615894317627,
"learning_rate": 3.5811490883866165e-07,
"loss": 0.6871,
"step": 1103
},
{
"epoch": 1.5793991416309012,
"grad_norm": 2.6901755332946777,
"learning_rate": 3.557996787762785e-07,
"loss": 0.8005,
"step": 1104
},
{
"epoch": 1.580829756795422,
"grad_norm": 1.7762101888656616,
"learning_rate": 3.534909495734653e-07,
"loss": 0.7128,
"step": 1105
},
{
"epoch": 1.5822603719599426,
"grad_norm": 2.374758243560791,
"learning_rate": 3.511887343474388e-07,
"loss": 0.784,
"step": 1106
},
{
"epoch": 1.5836909871244635,
"grad_norm": 1.9807848930358887,
"learning_rate": 3.488930461784075e-07,
"loss": 0.7985,
"step": 1107
},
{
"epoch": 1.5851216022889842,
"grad_norm": 5.555301189422607,
"learning_rate": 3.46603898109495e-07,
"loss": 0.8539,
"step": 1108
},
{
"epoch": 1.5865522174535052,
"grad_norm": 1.7881734371185303,
"learning_rate": 3.443213031466664e-07,
"loss": 0.7204,
"step": 1109
},
{
"epoch": 1.5879828326180259,
"grad_norm": 2.2678470611572266,
"learning_rate": 3.420452742586562e-07,
"loss": 0.7618,
"step": 1110
},
{
"epoch": 1.5894134477825466,
"grad_norm": 1.7865197658538818,
"learning_rate": 3.397758243768925e-07,
"loss": 0.753,
"step": 1111
},
{
"epoch": 1.5908440629470673,
"grad_norm": 3.1574225425720215,
"learning_rate": 3.375129663954233e-07,
"loss": 0.7138,
"step": 1112
},
{
"epoch": 1.592274678111588,
"grad_norm": 2.2759108543395996,
"learning_rate": 3.3525671317084643e-07,
"loss": 0.7308,
"step": 1113
},
{
"epoch": 1.5937052932761087,
"grad_norm": 1.156466007232666,
"learning_rate": 3.330070775222324e-07,
"loss": 0.7906,
"step": 1114
},
{
"epoch": 1.5951359084406294,
"grad_norm": 40.72386169433594,
"learning_rate": 3.30764072231054e-07,
"loss": 0.8223,
"step": 1115
},
{
"epoch": 1.59656652360515,
"grad_norm": 2.194326639175415,
"learning_rate": 3.285277100411138e-07,
"loss": 0.8578,
"step": 1116
},
{
"epoch": 1.597997138769671,
"grad_norm": 3.6407439708709717,
"learning_rate": 3.2629800365847046e-07,
"loss": 0.78,
"step": 1117
},
{
"epoch": 1.5994277539341917,
"grad_norm": 1.0178086757659912,
"learning_rate": 3.240749657513667e-07,
"loss": 0.7566,
"step": 1118
},
{
"epoch": 1.6008583690987126,
"grad_norm": 1.544061303138733,
"learning_rate": 3.2185860895015945e-07,
"loss": 0.7867,
"step": 1119
},
{
"epoch": 1.6022889842632333,
"grad_norm": 1.3069376945495605,
"learning_rate": 3.1964894584724467e-07,
"loss": 0.7854,
"step": 1120
},
{
"epoch": 1.603719599427754,
"grad_norm": 2.242849111557007,
"learning_rate": 3.1744598899698815e-07,
"loss": 0.849,
"step": 1121
},
{
"epoch": 1.6051502145922747,
"grad_norm": 4.343907356262207,
"learning_rate": 3.152497509156543e-07,
"loss": 0.7896,
"step": 1122
},
{
"epoch": 1.6065808297567954,
"grad_norm": 3.1958019733428955,
"learning_rate": 3.1306024408133354e-07,
"loss": 0.7529,
"step": 1123
},
{
"epoch": 1.608011444921316,
"grad_norm": 4.552048206329346,
"learning_rate": 3.108774809338721e-07,
"loss": 0.7182,
"step": 1124
},
{
"epoch": 1.6094420600858368,
"grad_norm": 5.193328857421875,
"learning_rate": 3.087014738748025e-07,
"loss": 0.7959,
"step": 1125
},
{
"epoch": 1.6108726752503575,
"grad_norm": 2.845714569091797,
"learning_rate": 3.0653223526727086e-07,
"loss": 0.8154,
"step": 1126
},
{
"epoch": 1.6123032904148784,
"grad_norm": 1.2329682111740112,
"learning_rate": 3.0436977743596823e-07,
"loss": 0.7836,
"step": 1127
},
{
"epoch": 1.613733905579399,
"grad_norm": 1.965511679649353,
"learning_rate": 3.0221411266706067e-07,
"loss": 0.865,
"step": 1128
},
{
"epoch": 1.61516452074392,
"grad_norm": 1.1454925537109375,
"learning_rate": 3.000652532081185e-07,
"loss": 0.7543,
"step": 1129
},
{
"epoch": 1.6165951359084407,
"grad_norm": 2.9890522956848145,
"learning_rate": 2.979232112680466e-07,
"loss": 0.7906,
"step": 1130
},
{
"epoch": 1.6180257510729614,
"grad_norm": 3.1727652549743652,
"learning_rate": 2.95787999017017e-07,
"loss": 0.7959,
"step": 1131
},
{
"epoch": 1.6194563662374821,
"grad_norm": 3.714076042175293,
"learning_rate": 2.9365962858639733e-07,
"loss": 0.7517,
"step": 1132
},
{
"epoch": 1.6208869814020028,
"grad_norm": 2.584320545196533,
"learning_rate": 2.915381120686825e-07,
"loss": 0.7209,
"step": 1133
},
{
"epoch": 1.6223175965665235,
"grad_norm": 2.656510829925537,
"learning_rate": 2.8942346151742793e-07,
"loss": 0.7495,
"step": 1134
},
{
"epoch": 1.6237482117310442,
"grad_norm": 2.237746238708496,
"learning_rate": 2.8731568894717843e-07,
"loss": 0.7395,
"step": 1135
},
{
"epoch": 1.6251788268955651,
"grad_norm": 3.3685977458953857,
"learning_rate": 2.852148063334006e-07,
"loss": 0.8202,
"step": 1136
},
{
"epoch": 1.6266094420600858,
"grad_norm": 2.4544692039489746,
"learning_rate": 2.831208256124167e-07,
"loss": 0.8121,
"step": 1137
},
{
"epoch": 1.6280400572246065,
"grad_norm": 3.5506999492645264,
"learning_rate": 2.8103375868133424e-07,
"loss": 0.7756,
"step": 1138
},
{
"epoch": 1.6294706723891275,
"grad_norm": 3.116994619369507,
"learning_rate": 2.789536173979794e-07,
"loss": 0.8122,
"step": 1139
},
{
"epoch": 1.6309012875536482,
"grad_norm": 8.765533447265625,
"learning_rate": 2.768804135808313e-07,
"loss": 0.6921,
"step": 1140
},
{
"epoch": 1.6323319027181689,
"grad_norm": 2.5757832527160645,
"learning_rate": 2.748141590089515e-07,
"loss": 0.8041,
"step": 1141
},
{
"epoch": 1.6337625178826896,
"grad_norm": 3.618260622024536,
"learning_rate": 2.727548654219193e-07,
"loss": 0.823,
"step": 1142
},
{
"epoch": 1.6351931330472103,
"grad_norm": 1.3914964199066162,
"learning_rate": 2.707025445197659e-07,
"loss": 0.7844,
"step": 1143
},
{
"epoch": 1.636623748211731,
"grad_norm": 5.06028413772583,
"learning_rate": 2.686572079629054e-07,
"loss": 0.8875,
"step": 1144
},
{
"epoch": 1.6380543633762517,
"grad_norm": 4.079840183258057,
"learning_rate": 2.6661886737206966e-07,
"loss": 0.8285,
"step": 1145
},
{
"epoch": 1.6394849785407726,
"grad_norm": 1.7283517122268677,
"learning_rate": 2.6458753432824387e-07,
"loss": 0.6827,
"step": 1146
},
{
"epoch": 1.6409155937052933,
"grad_norm": 3.4194791316986084,
"learning_rate": 2.625632203725979e-07,
"loss": 0.7079,
"step": 1147
},
{
"epoch": 1.642346208869814,
"grad_norm": 4.089590549468994,
"learning_rate": 2.605459370064224e-07,
"loss": 0.7858,
"step": 1148
},
{
"epoch": 1.643776824034335,
"grad_norm": 1.229331135749817,
"learning_rate": 2.58535695691064e-07,
"loss": 0.791,
"step": 1149
},
{
"epoch": 1.6452074391988556,
"grad_norm": 1.524109959602356,
"learning_rate": 2.5653250784785883e-07,
"loss": 0.7691,
"step": 1150
},
{
"epoch": 1.6466380543633763,
"grad_norm": 2.404613494873047,
"learning_rate": 2.545363848580679e-07,
"loss": 0.703,
"step": 1151
},
{
"epoch": 1.648068669527897,
"grad_norm": 1.462568759918213,
"learning_rate": 2.525473380628127e-07,
"loss": 0.7592,
"step": 1152
},
{
"epoch": 1.6494992846924177,
"grad_norm": 2.3987367153167725,
"learning_rate": 2.505653787630121e-07,
"loss": 0.7462,
"step": 1153
},
{
"epoch": 1.6509298998569384,
"grad_norm": 2.1042797565460205,
"learning_rate": 2.4859051821931515e-07,
"loss": 0.8334,
"step": 1154
},
{
"epoch": 1.652360515021459,
"grad_norm": 2.755420446395874,
"learning_rate": 2.466227676520395e-07,
"loss": 0.8181,
"step": 1155
},
{
"epoch": 1.65379113018598,
"grad_norm": 11.293910026550293,
"learning_rate": 2.4466213824110745e-07,
"loss": 0.7035,
"step": 1156
},
{
"epoch": 1.6552217453505007,
"grad_norm": 1.2045400142669678,
"learning_rate": 2.427086411259812e-07,
"loss": 0.7634,
"step": 1157
},
{
"epoch": 1.6566523605150214,
"grad_norm": 1.108940839767456,
"learning_rate": 2.4076228740559996e-07,
"loss": 0.7702,
"step": 1158
},
{
"epoch": 1.6580829756795423,
"grad_norm": 7.84178352355957,
"learning_rate": 2.3882308813831857e-07,
"loss": 0.771,
"step": 1159
},
{
"epoch": 1.659513590844063,
"grad_norm": 7.632559299468994,
"learning_rate": 2.36891054341842e-07,
"loss": 0.77,
"step": 1160
},
{
"epoch": 1.6609442060085837,
"grad_norm": 2.9848833084106445,
"learning_rate": 2.349661969931643e-07,
"loss": 0.7671,
"step": 1161
},
{
"epoch": 1.6623748211731044,
"grad_norm": 7.07189416885376,
"learning_rate": 2.3304852702850688e-07,
"loss": 0.772,
"step": 1162
},
{
"epoch": 1.6638054363376251,
"grad_norm": 2.8970439434051514,
"learning_rate": 2.3113805534325465e-07,
"loss": 0.7272,
"step": 1163
},
{
"epoch": 1.6652360515021458,
"grad_norm": 2.363818883895874,
"learning_rate": 2.2923479279189464e-07,
"loss": 0.7735,
"step": 1164
},
{
"epoch": 1.6666666666666665,
"grad_norm": 6.8089599609375,
"learning_rate": 2.2733875018795586e-07,
"loss": 0.7952,
"step": 1165
},
{
"epoch": 1.6680972818311874,
"grad_norm": 5.405824661254883,
"learning_rate": 2.2544993830394571e-07,
"loss": 0.8125,
"step": 1166
},
{
"epoch": 1.6695278969957081,
"grad_norm": 1.5793416500091553,
"learning_rate": 2.2356836787128947e-07,
"loss": 0.8465,
"step": 1167
},
{
"epoch": 1.670958512160229,
"grad_norm": 1.910159945487976,
"learning_rate": 2.2169404958027095e-07,
"loss": 0.7499,
"step": 1168
},
{
"epoch": 1.6723891273247498,
"grad_norm": 10.456279754638672,
"learning_rate": 2.198269940799691e-07,
"loss": 0.8234,
"step": 1169
},
{
"epoch": 1.6738197424892705,
"grad_norm": 4.157973766326904,
"learning_rate": 2.1796721197819868e-07,
"loss": 0.8318,
"step": 1170
},
{
"epoch": 1.6752503576537912,
"grad_norm": 0.991513729095459,
"learning_rate": 2.1611471384145126e-07,
"loss": 0.7611,
"step": 1171
},
{
"epoch": 1.6766809728183119,
"grad_norm": 0.8860572576522827,
"learning_rate": 2.1426951019483327e-07,
"loss": 0.7057,
"step": 1172
},
{
"epoch": 1.6781115879828326,
"grad_norm": 2.586033344268799,
"learning_rate": 2.1243161152200629e-07,
"loss": 0.8086,
"step": 1173
},
{
"epoch": 1.6795422031473533,
"grad_norm": 2.3332133293151855,
"learning_rate": 2.1060102826512983e-07,
"loss": 0.7717,
"step": 1174
},
{
"epoch": 1.680972818311874,
"grad_norm": 2.051971197128296,
"learning_rate": 2.087777708247991e-07,
"loss": 0.7448,
"step": 1175
},
{
"epoch": 1.6824034334763949,
"grad_norm": 5.876535892486572,
"learning_rate": 2.0696184955998675e-07,
"loss": 0.7681,
"step": 1176
},
{
"epoch": 1.6838340486409156,
"grad_norm": 2.5695269107818604,
"learning_rate": 2.0515327478798601e-07,
"loss": 0.8074,
"step": 1177
},
{
"epoch": 1.6852646638054365,
"grad_norm": 8.719694137573242,
"learning_rate": 2.033520567843491e-07,
"loss": 0.8109,
"step": 1178
},
{
"epoch": 1.6866952789699572,
"grad_norm": 2.353991985321045,
"learning_rate": 2.015582057828302e-07,
"loss": 0.7361,
"step": 1179
},
{
"epoch": 1.688125894134478,
"grad_norm": 5.169013023376465,
"learning_rate": 1.9977173197532845e-07,
"loss": 0.8165,
"step": 1180
},
{
"epoch": 1.6895565092989986,
"grad_norm": 1.9157449007034302,
"learning_rate": 1.979926455118279e-07,
"loss": 0.7044,
"step": 1181
},
{
"epoch": 1.6909871244635193,
"grad_norm": 4.792452812194824,
"learning_rate": 1.9622095650034077e-07,
"loss": 0.7902,
"step": 1182
},
{
"epoch": 1.69241773962804,
"grad_norm": 1.4491595029830933,
"learning_rate": 1.94456675006851e-07,
"loss": 0.7668,
"step": 1183
},
{
"epoch": 1.6938483547925607,
"grad_norm": 2.2091567516326904,
"learning_rate": 1.9269981105525559e-07,
"loss": 0.7461,
"step": 1184
},
{
"epoch": 1.6952789699570814,
"grad_norm": 1.9733930826187134,
"learning_rate": 1.909503746273078e-07,
"loss": 0.6816,
"step": 1185
},
{
"epoch": 1.6967095851216023,
"grad_norm": 3.1535139083862305,
"learning_rate": 1.89208375662562e-07,
"loss": 0.8196,
"step": 1186
},
{
"epoch": 1.698140200286123,
"grad_norm": 2.412435531616211,
"learning_rate": 1.8747382405831515e-07,
"loss": 0.7442,
"step": 1187
},
{
"epoch": 1.699570815450644,
"grad_norm": 3.027723550796509,
"learning_rate": 1.8574672966955125e-07,
"loss": 0.823,
"step": 1188
},
{
"epoch": 1.7010014306151646,
"grad_norm": 1.3484646081924438,
"learning_rate": 1.8402710230888685e-07,
"loss": 0.8225,
"step": 1189
},
{
"epoch": 1.7024320457796853,
"grad_norm": 4.724902629852295,
"learning_rate": 1.823149517465128e-07,
"loss": 0.7957,
"step": 1190
},
{
"epoch": 1.703862660944206,
"grad_norm": 3.8161709308624268,
"learning_rate": 1.8061028771014004e-07,
"loss": 0.8052,
"step": 1191
},
{
"epoch": 1.7052932761087267,
"grad_norm": 2.081833839416504,
"learning_rate": 1.7891311988494523e-07,
"loss": 0.7378,
"step": 1192
},
{
"epoch": 1.7067238912732474,
"grad_norm": 2.2098007202148438,
"learning_rate": 1.772234579135138e-07,
"loss": 0.7968,
"step": 1193
},
{
"epoch": 1.7081545064377681,
"grad_norm": 1.3511004447937012,
"learning_rate": 1.7554131139578622e-07,
"loss": 0.8255,
"step": 1194
},
{
"epoch": 1.709585121602289,
"grad_norm": 11.756885528564453,
"learning_rate": 1.73866689889004e-07,
"loss": 0.78,
"step": 1195
},
{
"epoch": 1.7110157367668097,
"grad_norm": 1.7030614614486694,
"learning_rate": 1.7219960290765402e-07,
"loss": 0.8037,
"step": 1196
},
{
"epoch": 1.7124463519313304,
"grad_norm": 3.0442252159118652,
"learning_rate": 1.705400599234152e-07,
"loss": 0.7357,
"step": 1197
},
{
"epoch": 1.7138769670958514,
"grad_norm": 3.3682615756988525,
"learning_rate": 1.6888807036510562e-07,
"loss": 0.8288,
"step": 1198
},
{
"epoch": 1.715307582260372,
"grad_norm": 1.4772732257843018,
"learning_rate": 1.6724364361862682e-07,
"loss": 0.8346,
"step": 1199
},
{
"epoch": 1.7167381974248928,
"grad_norm": 1.5449028015136719,
"learning_rate": 1.6560678902691223e-07,
"loss": 0.6765,
"step": 1200
},
{
"epoch": 1.7181688125894135,
"grad_norm": 1.7480943202972412,
"learning_rate": 1.639775158898732e-07,
"loss": 0.796,
"step": 1201
},
{
"epoch": 1.7195994277539342,
"grad_norm": 4.165433406829834,
"learning_rate": 1.62355833464347e-07,
"loss": 0.752,
"step": 1202
},
{
"epoch": 1.7210300429184548,
"grad_norm": 1.7983890771865845,
"learning_rate": 1.6074175096404382e-07,
"loss": 0.7895,
"step": 1203
},
{
"epoch": 1.7224606580829755,
"grad_norm": 1.4561206102371216,
"learning_rate": 1.5913527755949308e-07,
"loss": 0.7682,
"step": 1204
},
{
"epoch": 1.7238912732474965,
"grad_norm": 1.1166143417358398,
"learning_rate": 1.5753642237799426e-07,
"loss": 0.825,
"step": 1205
},
{
"epoch": 1.7253218884120172,
"grad_norm": 1.4510133266448975,
"learning_rate": 1.5594519450356204e-07,
"loss": 0.7234,
"step": 1206
},
{
"epoch": 1.7267525035765379,
"grad_norm": 3.046424627304077,
"learning_rate": 1.5436160297687614e-07,
"loss": 0.8216,
"step": 1207
},
{
"epoch": 1.7281831187410588,
"grad_norm": 5.349708080291748,
"learning_rate": 1.527856567952306e-07,
"loss": 0.7233,
"step": 1208
},
{
"epoch": 1.7296137339055795,
"grad_norm": 2.7202823162078857,
"learning_rate": 1.5121736491248127e-07,
"loss": 0.7901,
"step": 1209
},
{
"epoch": 1.7310443490701002,
"grad_norm": 2.2550981044769287,
"learning_rate": 1.4965673623899495e-07,
"loss": 0.7899,
"step": 1210
},
{
"epoch": 1.7324749642346209,
"grad_norm": 2.9862146377563477,
"learning_rate": 1.481037796416009e-07,
"loss": 0.7367,
"step": 1211
},
{
"epoch": 1.7339055793991416,
"grad_norm": 0.9522223472595215,
"learning_rate": 1.4655850394353738e-07,
"loss": 0.7218,
"step": 1212
},
{
"epoch": 1.7353361945636623,
"grad_norm": 2.175283670425415,
"learning_rate": 1.450209179244038e-07,
"loss": 0.8367,
"step": 1213
},
{
"epoch": 1.736766809728183,
"grad_norm": 7.380995750427246,
"learning_rate": 1.434910303201102e-07,
"loss": 0.8238,
"step": 1214
},
{
"epoch": 1.738197424892704,
"grad_norm": 1.5405120849609375,
"learning_rate": 1.41968849822827e-07,
"loss": 0.787,
"step": 1215
},
{
"epoch": 1.7396280400572246,
"grad_norm": 3.323050022125244,
"learning_rate": 1.404543850809364e-07,
"loss": 0.7354,
"step": 1216
},
{
"epoch": 1.7410586552217453,
"grad_norm": 16.810117721557617,
"learning_rate": 1.389476446989828e-07,
"loss": 0.7283,
"step": 1217
},
{
"epoch": 1.7424892703862662,
"grad_norm": 1.5554410219192505,
"learning_rate": 1.3744863723762457e-07,
"loss": 0.8043,
"step": 1218
},
{
"epoch": 1.743919885550787,
"grad_norm": 1.5318583250045776,
"learning_rate": 1.359573712135842e-07,
"loss": 0.8493,
"step": 1219
},
{
"epoch": 1.7453505007153076,
"grad_norm": 1.6202287673950195,
"learning_rate": 1.3447385509960085e-07,
"loss": 0.7898,
"step": 1220
},
{
"epoch": 1.7467811158798283,
"grad_norm": 2.8205626010894775,
"learning_rate": 1.3299809732438277e-07,
"loss": 0.7225,
"step": 1221
},
{
"epoch": 1.748211731044349,
"grad_norm": 5.016057014465332,
"learning_rate": 1.3153010627255728e-07,
"loss": 0.8083,
"step": 1222
},
{
"epoch": 1.7496423462088697,
"grad_norm": 3.2081761360168457,
"learning_rate": 1.3006989028462536e-07,
"loss": 0.806,
"step": 1223
},
{
"epoch": 1.7510729613733904,
"grad_norm": 4.852132797241211,
"learning_rate": 1.286174576569134e-07,
"loss": 0.7865,
"step": 1224
},
{
"epoch": 1.7525035765379113,
"grad_norm": 4.22651481628418,
"learning_rate": 1.271728166415258e-07,
"loss": 0.7865,
"step": 1225
},
{
"epoch": 1.7525035765379113,
"eval_loss": 0.9261357188224792,
"eval_runtime": 64.6017,
"eval_samples_per_second": 6.424,
"eval_steps_per_second": 0.402,
"step": 1225
},
{
"epoch": 1.753934191702432,
"grad_norm": 1.1874042749404907,
"learning_rate": 1.2573597544629795e-07,
"loss": 0.7648,
"step": 1226
},
{
"epoch": 1.755364806866953,
"grad_norm": 3.088524341583252,
"learning_rate": 1.2430694223475087e-07,
"loss": 0.8424,
"step": 1227
},
{
"epoch": 1.7567954220314737,
"grad_norm": 2.089639902114868,
"learning_rate": 1.2288572512604341e-07,
"loss": 0.8197,
"step": 1228
},
{
"epoch": 1.7582260371959944,
"grad_norm": 1.833664059638977,
"learning_rate": 1.2147233219492627e-07,
"loss": 0.6933,
"step": 1229
},
{
"epoch": 1.759656652360515,
"grad_norm": 4.207241535186768,
"learning_rate": 1.2006677147169754e-07,
"loss": 0.8613,
"step": 1230
},
{
"epoch": 1.7610872675250357,
"grad_norm": 5.451657772064209,
"learning_rate": 1.1866905094215508e-07,
"loss": 0.7253,
"step": 1231
},
{
"epoch": 1.7625178826895564,
"grad_norm": 2.7151124477386475,
"learning_rate": 1.1727917854755238e-07,
"loss": 0.8098,
"step": 1232
},
{
"epoch": 1.7639484978540771,
"grad_norm": 1.2078485488891602,
"learning_rate": 1.1589716218455359e-07,
"loss": 0.6965,
"step": 1233
},
{
"epoch": 1.7653791130185978,
"grad_norm": 1.4734965562820435,
"learning_rate": 1.1452300970518758e-07,
"loss": 0.7128,
"step": 1234
},
{
"epoch": 1.7668097281831188,
"grad_norm": 3.2850356101989746,
"learning_rate": 1.1315672891680429e-07,
"loss": 0.7104,
"step": 1235
},
{
"epoch": 1.7682403433476395,
"grad_norm": 1.9388680458068848,
"learning_rate": 1.117983275820304e-07,
"loss": 0.7422,
"step": 1236
},
{
"epoch": 1.7696709585121604,
"grad_norm": 82.46575164794922,
"learning_rate": 1.1044781341872411e-07,
"loss": 0.7632,
"step": 1237
},
{
"epoch": 1.771101573676681,
"grad_norm": 3.337305784225464,
"learning_rate": 1.0910519409993247e-07,
"loss": 0.76,
"step": 1238
},
{
"epoch": 1.7725321888412018,
"grad_norm": 2.8676528930664062,
"learning_rate": 1.0777047725384786e-07,
"loss": 0.7758,
"step": 1239
},
{
"epoch": 1.7739628040057225,
"grad_norm": 21.342599868774414,
"learning_rate": 1.064436704637633e-07,
"loss": 0.8218,
"step": 1240
},
{
"epoch": 1.7753934191702432,
"grad_norm": 1.6680625677108765,
"learning_rate": 1.0512478126803071e-07,
"loss": 0.7485,
"step": 1241
},
{
"epoch": 1.7768240343347639,
"grad_norm": 5.325804710388184,
"learning_rate": 1.038138171600177e-07,
"loss": 0.7723,
"step": 1242
},
{
"epoch": 1.7782546494992846,
"grad_norm": 3.2667267322540283,
"learning_rate": 1.0251078558806486e-07,
"loss": 0.77,
"step": 1243
},
{
"epoch": 1.7796852646638053,
"grad_norm": 3.370208501815796,
"learning_rate": 1.0121569395544272e-07,
"loss": 0.8516,
"step": 1244
},
{
"epoch": 1.7811158798283262,
"grad_norm": 4.472996711730957,
"learning_rate": 9.9928549620312e-08,
"loss": 0.8197,
"step": 1245
},
{
"epoch": 1.782546494992847,
"grad_norm": 2.5200583934783936,
"learning_rate": 9.864935989567874e-08,
"loss": 0.7444,
"step": 1246
},
{
"epoch": 1.7839771101573678,
"grad_norm": 2.0389504432678223,
"learning_rate": 9.737813204935497e-08,
"loss": 0.7552,
"step": 1247
},
{
"epoch": 1.7854077253218885,
"grad_norm": 3.2909703254699707,
"learning_rate": 9.611487330391688e-08,
"loss": 0.8065,
"step": 1248
},
{
"epoch": 1.7868383404864092,
"grad_norm": 3.057483434677124,
"learning_rate": 9.485959083666324e-08,
"loss": 0.7563,
"step": 1249
},
{
"epoch": 1.78826895565093,
"grad_norm": 8.188149452209473,
"learning_rate": 9.361229177957486e-08,
"loss": 0.757,
"step": 1250
},
{
"epoch": 1.7896995708154506,
"grad_norm": 2.4237565994262695,
"learning_rate": 9.23729832192749e-08,
"loss": 0.7992,
"step": 1251
},
{
"epoch": 1.7911301859799713,
"grad_norm": 1.6685830354690552,
"learning_rate": 9.114167219698744e-08,
"loss": 0.7748,
"step": 1252
},
{
"epoch": 1.792560801144492,
"grad_norm": 4.239346981048584,
"learning_rate": 8.991836570849743e-08,
"loss": 0.7456,
"step": 1253
},
{
"epoch": 1.7939914163090127,
"grad_norm": 2.0644781589508057,
"learning_rate": 8.870307070411288e-08,
"loss": 0.8112,
"step": 1254
},
{
"epoch": 1.7954220314735336,
"grad_norm": 1.9066531658172607,
"learning_rate": 8.749579408862269e-08,
"loss": 0.7299,
"step": 1255
},
{
"epoch": 1.7968526466380543,
"grad_norm": 5.838130950927734,
"learning_rate": 8.629654272125887e-08,
"loss": 0.7255,
"step": 1256
},
{
"epoch": 1.7982832618025753,
"grad_norm": 2.705153226852417,
"learning_rate": 8.510532341565807e-08,
"loss": 0.7872,
"step": 1257
},
{
"epoch": 1.799713876967096,
"grad_norm": 2.5030932426452637,
"learning_rate": 8.392214293982165e-08,
"loss": 0.6766,
"step": 1258
},
{
"epoch": 1.8011444921316166,
"grad_norm": 2.564344882965088,
"learning_rate": 8.274700801607744e-08,
"loss": 0.7533,
"step": 1259
},
{
"epoch": 1.8025751072961373,
"grad_norm": 9.728470802307129,
"learning_rate": 8.157992532104269e-08,
"loss": 0.8039,
"step": 1260
},
{
"epoch": 1.804005722460658,
"grad_norm": 3.0228323936462402,
"learning_rate": 8.042090148558479e-08,
"loss": 0.8776,
"step": 1261
},
{
"epoch": 1.8054363376251787,
"grad_norm": 1.9111461639404297,
"learning_rate": 7.926994309478403e-08,
"loss": 0.8547,
"step": 1262
},
{
"epoch": 1.8068669527896994,
"grad_norm": 2.468275547027588,
"learning_rate": 7.812705668789671e-08,
"loss": 0.7513,
"step": 1263
},
{
"epoch": 1.8082975679542204,
"grad_norm": 2.3994827270507812,
"learning_rate": 7.699224875831717e-08,
"loss": 0.8268,
"step": 1264
},
{
"epoch": 1.809728183118741,
"grad_norm": 1.6180237531661987,
"learning_rate": 7.586552575354144e-08,
"loss": 0.7764,
"step": 1265
},
{
"epoch": 1.8111587982832618,
"grad_norm": 2.5159072875976562,
"learning_rate": 7.47468940751303e-08,
"loss": 0.8373,
"step": 1266
},
{
"epoch": 1.8125894134477827,
"grad_norm": 1.236160159111023,
"learning_rate": 7.36363600786733e-08,
"loss": 0.7767,
"step": 1267
},
{
"epoch": 1.8140200286123034,
"grad_norm": 3.060023307800293,
"learning_rate": 7.253393007375231e-08,
"loss": 0.8235,
"step": 1268
},
{
"epoch": 1.815450643776824,
"grad_norm": 4.7666120529174805,
"learning_rate": 7.143961032390533e-08,
"loss": 0.7897,
"step": 1269
},
{
"epoch": 1.8168812589413448,
"grad_norm": 1.959795594215393,
"learning_rate": 7.035340704659244e-08,
"loss": 0.8028,
"step": 1270
},
{
"epoch": 1.8183118741058655,
"grad_norm": 1.297690510749817,
"learning_rate": 6.927532641315821e-08,
"loss": 0.776,
"step": 1271
},
{
"epoch": 1.8197424892703862,
"grad_norm": 3.889566421508789,
"learning_rate": 6.8205374548798e-08,
"loss": 0.822,
"step": 1272
},
{
"epoch": 1.8211731044349069,
"grad_norm": 2.258944272994995,
"learning_rate": 6.714355753252394e-08,
"loss": 0.8079,
"step": 1273
},
{
"epoch": 1.8226037195994278,
"grad_norm": 3.4968879222869873,
"learning_rate": 6.60898813971283e-08,
"loss": 0.7688,
"step": 1274
},
{
"epoch": 1.8240343347639485,
"grad_norm": 2.931837797164917,
"learning_rate": 6.504435212915049e-08,
"loss": 0.7655,
"step": 1275
},
{
"epoch": 1.8254649499284692,
"grad_norm": 1.8585553169250488,
"learning_rate": 6.400697566884367e-08,
"loss": 0.7458,
"step": 1276
},
{
"epoch": 1.8268955650929901,
"grad_norm": 1.4828190803527832,
"learning_rate": 6.297775791013933e-08,
"loss": 0.7337,
"step": 1277
},
{
"epoch": 1.8283261802575108,
"grad_norm": 3.6852729320526123,
"learning_rate": 6.195670470061505e-08,
"loss": 0.7259,
"step": 1278
},
{
"epoch": 1.8297567954220315,
"grad_norm": 2.430832624435425,
"learning_rate": 6.094382184146085e-08,
"loss": 0.8294,
"step": 1279
},
{
"epoch": 1.8311874105865522,
"grad_norm": 1.5084558725357056,
"learning_rate": 5.99391150874466e-08,
"loss": 0.8652,
"step": 1280
},
{
"epoch": 1.832618025751073,
"grad_norm": 3.6525607109069824,
"learning_rate": 5.894259014688824e-08,
"loss": 0.7514,
"step": 1281
},
{
"epoch": 1.8340486409155936,
"grad_norm": 1.948525309562683,
"learning_rate": 5.7954252681617304e-08,
"loss": 0.7769,
"step": 1282
},
{
"epoch": 1.8354792560801143,
"grad_norm": 1.8478093147277832,
"learning_rate": 5.697410830694633e-08,
"loss": 0.8044,
"step": 1283
},
{
"epoch": 1.8369098712446352,
"grad_norm": 1.2266122102737427,
"learning_rate": 5.600216259163893e-08,
"loss": 0.7641,
"step": 1284
},
{
"epoch": 1.838340486409156,
"grad_norm": 0.9987815618515015,
"learning_rate": 5.5038421057877654e-08,
"loss": 0.6867,
"step": 1285
},
{
"epoch": 1.8397711015736766,
"grad_norm": 2.728739023208618,
"learning_rate": 5.4082889181231497e-08,
"loss": 0.8508,
"step": 1286
},
{
"epoch": 1.8412017167381975,
"grad_norm": 1.628726840019226,
"learning_rate": 5.313557239062627e-08,
"loss": 0.7974,
"step": 1287
},
{
"epoch": 1.8426323319027182,
"grad_norm": 2.028298854827881,
"learning_rate": 5.219647606831329e-08,
"loss": 0.7859,
"step": 1288
},
{
"epoch": 1.844062947067239,
"grad_norm": 2.2269015312194824,
"learning_rate": 5.126560554983822e-08,
"loss": 0.9191,
"step": 1289
},
{
"epoch": 1.8454935622317596,
"grad_norm": 5.080014705657959,
"learning_rate": 5.034296612401129e-08,
"loss": 0.6733,
"step": 1290
},
{
"epoch": 1.8469241773962803,
"grad_norm": 3.053027629852295,
"learning_rate": 4.942856303287779e-08,
"loss": 0.7883,
"step": 1291
},
{
"epoch": 1.848354792560801,
"grad_norm": 1.7156245708465576,
"learning_rate": 4.852240147168696e-08,
"loss": 0.7215,
"step": 1292
},
{
"epoch": 1.8497854077253217,
"grad_norm": 1.3909878730773926,
"learning_rate": 4.762448658886298e-08,
"loss": 0.8188,
"step": 1293
},
{
"epoch": 1.8512160228898427,
"grad_norm": 5.936245441436768,
"learning_rate": 4.673482348597685e-08,
"loss": 0.8267,
"step": 1294
},
{
"epoch": 1.8526466380543634,
"grad_norm": 18.523326873779297,
"learning_rate": 4.585341721771574e-08,
"loss": 0.7863,
"step": 1295
},
{
"epoch": 1.8540772532188843,
"grad_norm": 2.28387713432312,
"learning_rate": 4.4980272791855015e-08,
"loss": 0.8343,
"step": 1296
},
{
"epoch": 1.855507868383405,
"grad_norm": 1.3548191785812378,
"learning_rate": 4.4115395169230074e-08,
"loss": 0.7428,
"step": 1297
},
{
"epoch": 1.8569384835479257,
"grad_norm": 3.7556676864624023,
"learning_rate": 4.325878926370791e-08,
"loss": 0.7839,
"step": 1298
},
{
"epoch": 1.8583690987124464,
"grad_norm": 3.6090095043182373,
"learning_rate": 4.241045994215842e-08,
"loss": 0.8006,
"step": 1299
},
{
"epoch": 1.859799713876967,
"grad_norm": 1.8558502197265625,
"learning_rate": 4.157041202442863e-08,
"loss": 0.7306,
"step": 1300
},
{
"epoch": 1.8612303290414878,
"grad_norm": 1.3088663816452026,
"learning_rate": 4.0738650283313025e-08,
"loss": 0.7975,
"step": 1301
},
{
"epoch": 1.8626609442060085,
"grad_norm": 1.1639654636383057,
"learning_rate": 3.991517944452827e-08,
"loss": 0.7781,
"step": 1302
},
{
"epoch": 1.8640915593705292,
"grad_norm": 2.453809976577759,
"learning_rate": 3.9100004186685354e-08,
"loss": 0.8048,
"step": 1303
},
{
"epoch": 1.86552217453505,
"grad_norm": 1.307875394821167,
"learning_rate": 3.8293129141263485e-08,
"loss": 0.7623,
"step": 1304
},
{
"epoch": 1.8669527896995708,
"grad_norm": 2.7597270011901855,
"learning_rate": 3.7494558892583405e-08,
"loss": 0.7839,
"step": 1305
},
{
"epoch": 1.8683834048640917,
"grad_norm": 3.5831847190856934,
"learning_rate": 3.670429797778163e-08,
"loss": 0.7739,
"step": 1306
},
{
"epoch": 1.8698140200286124,
"grad_norm": 2.250288724899292,
"learning_rate": 3.592235088678458e-08,
"loss": 0.7752,
"step": 1307
},
{
"epoch": 1.871244635193133,
"grad_norm": 2.3639230728149414,
"learning_rate": 3.514872206228298e-08,
"loss": 0.8142,
"step": 1308
},
{
"epoch": 1.8726752503576538,
"grad_norm": 10.222983360290527,
"learning_rate": 3.438341589970684e-08,
"loss": 0.7631,
"step": 1309
},
{
"epoch": 1.8741058655221745,
"grad_norm": 1.4523752927780151,
"learning_rate": 3.3626436747200175e-08,
"loss": 0.8136,
"step": 1310
},
{
"epoch": 1.8755364806866952,
"grad_norm": 1.7456223964691162,
"learning_rate": 3.287778890559684e-08,
"loss": 0.7797,
"step": 1311
},
{
"epoch": 1.876967095851216,
"grad_norm": 1.4522265195846558,
"learning_rate": 3.2137476628395054e-08,
"loss": 0.7736,
"step": 1312
},
{
"epoch": 1.8783977110157366,
"grad_norm": 6.369755744934082,
"learning_rate": 3.1405504121734593e-08,
"loss": 0.7719,
"step": 1313
},
{
"epoch": 1.8798283261802575,
"grad_norm": 2.3526201248168945,
"learning_rate": 3.0681875544371796e-08,
"loss": 0.8312,
"step": 1314
},
{
"epoch": 1.8812589413447782,
"grad_norm": 3.876243829727173,
"learning_rate": 2.9966595007656416e-08,
"loss": 0.7576,
"step": 1315
},
{
"epoch": 1.8826895565092991,
"grad_norm": 2.7545125484466553,
"learning_rate": 2.9259666575508494e-08,
"loss": 0.7619,
"step": 1316
},
{
"epoch": 1.8841201716738198,
"grad_norm": 9.593175888061523,
"learning_rate": 2.856109426439435e-08,
"loss": 0.8205,
"step": 1317
},
{
"epoch": 1.8855507868383405,
"grad_norm": 5.5158257484436035,
"learning_rate": 2.7870882043304957e-08,
"loss": 0.7339,
"step": 1318
},
{
"epoch": 1.8869814020028612,
"grad_norm": 1.1985629796981812,
"learning_rate": 2.7189033833732614e-08,
"loss": 0.8216,
"step": 1319
},
{
"epoch": 1.888412017167382,
"grad_norm": 2.041839838027954,
"learning_rate": 2.6515553509648793e-08,
"loss": 0.7589,
"step": 1320
},
{
"epoch": 1.8898426323319026,
"grad_norm": 2.407585859298706,
"learning_rate": 2.5850444897482172e-08,
"loss": 0.8723,
"step": 1321
},
{
"epoch": 1.8912732474964233,
"grad_norm": 1.7742396593093872,
"learning_rate": 2.519371177609714e-08,
"loss": 0.8111,
"step": 1322
},
{
"epoch": 1.8927038626609443,
"grad_norm": 1.1010509729385376,
"learning_rate": 2.454535787677181e-08,
"loss": 0.8269,
"step": 1323
},
{
"epoch": 1.894134477825465,
"grad_norm": 2.547274351119995,
"learning_rate": 2.3905386883177228e-08,
"loss": 0.7992,
"step": 1324
},
{
"epoch": 1.8955650929899857,
"grad_norm": 1.671331763267517,
"learning_rate": 2.3273802431356684e-08,
"loss": 0.793,
"step": 1325
},
{
"epoch": 1.8969957081545066,
"grad_norm": 3.759086847305298,
"learning_rate": 2.2650608109704263e-08,
"loss": 0.8215,
"step": 1326
},
{
"epoch": 1.8984263233190273,
"grad_norm": 2.3819046020507812,
"learning_rate": 2.2035807458944845e-08,
"loss": 0.7701,
"step": 1327
},
{
"epoch": 1.899856938483548,
"grad_norm": 1.7277506589889526,
"learning_rate": 2.1429403972114626e-08,
"loss": 0.8075,
"step": 1328
},
{
"epoch": 1.9012875536480687,
"grad_norm": 2.645439863204956,
"learning_rate": 2.083140109453996e-08,
"loss": 0.7018,
"step": 1329
},
{
"epoch": 1.9027181688125894,
"grad_norm": 3.964482545852661,
"learning_rate": 2.0241802223818884e-08,
"loss": 0.7789,
"step": 1330
},
{
"epoch": 1.90414878397711,
"grad_norm": 5.473621845245361,
"learning_rate": 1.966061070980163e-08,
"loss": 0.7389,
"step": 1331
},
{
"epoch": 1.9055793991416308,
"grad_norm": 3.40977144241333,
"learning_rate": 1.9087829854571137e-08,
"loss": 0.82,
"step": 1332
},
{
"epoch": 1.9070100143061517,
"grad_norm": 2.368593692779541,
"learning_rate": 1.8523462912424405e-08,
"loss": 0.8084,
"step": 1333
},
{
"epoch": 1.9084406294706724,
"grad_norm": 1.9491324424743652,
"learning_rate": 1.7967513089854336e-08,
"loss": 0.791,
"step": 1334
},
{
"epoch": 1.909871244635193,
"grad_norm": 2.3171393871307373,
"learning_rate": 1.741998354553176e-08,
"loss": 0.7305,
"step": 1335
},
{
"epoch": 1.911301859799714,
"grad_norm": 1.2715893983840942,
"learning_rate": 1.6880877390286264e-08,
"loss": 0.7664,
"step": 1336
},
{
"epoch": 1.9127324749642347,
"grad_norm": 2.1280972957611084,
"learning_rate": 1.6350197687089897e-08,
"loss": 0.7713,
"step": 1337
},
{
"epoch": 1.9141630901287554,
"grad_norm": 1.0025123357772827,
"learning_rate": 1.582794745103916e-08,
"loss": 0.7392,
"step": 1338
},
{
"epoch": 1.915593705293276,
"grad_norm": 2.628035545349121,
"learning_rate": 1.5314129649337537e-08,
"loss": 0.7828,
"step": 1339
},
{
"epoch": 1.9170243204577968,
"grad_norm": 2.137150764465332,
"learning_rate": 1.4808747201279171e-08,
"loss": 0.8359,
"step": 1340
},
{
"epoch": 1.9184549356223175,
"grad_norm": 27.349082946777344,
"learning_rate": 1.4311802978232535e-08,
"loss": 0.6619,
"step": 1341
},
{
"epoch": 1.9198855507868382,
"grad_norm": 1.4242587089538574,
"learning_rate": 1.3823299803622957e-08,
"loss": 0.7845,
"step": 1342
},
{
"epoch": 1.9213161659513591,
"grad_norm": 2.508871555328369,
"learning_rate": 1.334324045291796e-08,
"loss": 0.8064,
"step": 1343
},
{
"epoch": 1.9227467811158798,
"grad_norm": 12.176376342773438,
"learning_rate": 1.2871627653610608e-08,
"loss": 0.7454,
"step": 1344
},
{
"epoch": 1.9241773962804005,
"grad_norm": 1.3289920091629028,
"learning_rate": 1.2408464085204019e-08,
"loss": 0.8334,
"step": 1345
},
{
"epoch": 1.9256080114449214,
"grad_norm": 2.553537368774414,
"learning_rate": 1.1953752379196715e-08,
"loss": 0.6796,
"step": 1346
},
{
"epoch": 1.9270386266094421,
"grad_norm": 4.845339775085449,
"learning_rate": 1.150749511906729e-08,
"loss": 0.8347,
"step": 1347
},
{
"epoch": 1.9284692417739628,
"grad_norm": 1.5274536609649658,
"learning_rate": 1.106969484025977e-08,
"loss": 0.7211,
"step": 1348
},
{
"epoch": 1.9298998569384835,
"grad_norm": 6.8586883544921875,
"learning_rate": 1.0640354030168776e-08,
"loss": 0.7573,
"step": 1349
},
{
"epoch": 1.9313304721030042,
"grad_norm": 17.5329647064209,
"learning_rate": 1.0219475128126377e-08,
"loss": 0.7283,
"step": 1350
},
{
"epoch": 1.932761087267525,
"grad_norm": 1.1714322566986084,
"learning_rate": 9.807060525387602e-09,
"loss": 0.7442,
"step": 1351
},
{
"epoch": 1.9341917024320456,
"grad_norm": 6.689727306365967,
"learning_rate": 9.403112565116612e-09,
"loss": 0.817,
"step": 1352
},
{
"epoch": 1.9356223175965666,
"grad_norm": 4.254633903503418,
"learning_rate": 9.00763354237405e-09,
"loss": 0.7439,
"step": 1353
},
{
"epoch": 1.9370529327610873,
"grad_norm": 0.9951338171958923,
"learning_rate": 8.62062570410338e-09,
"loss": 0.7618,
"step": 1354
},
{
"epoch": 1.9384835479256082,
"grad_norm": 1.1761829853057861,
"learning_rate": 8.242091249118732e-09,
"loss": 0.744,
"step": 1355
},
{
"epoch": 1.9399141630901289,
"grad_norm": 16.564828872680664,
"learning_rate": 7.87203232809175e-09,
"loss": 0.7898,
"step": 1356
},
{
"epoch": 1.9413447782546496,
"grad_norm": 2.6274590492248535,
"learning_rate": 7.510451043539923e-09,
"loss": 0.9064,
"step": 1357
},
{
"epoch": 1.9427753934191703,
"grad_norm": 3.510563373565674,
"learning_rate": 7.15734944981411e-09,
"loss": 0.7994,
"step": 1358
},
{
"epoch": 1.944206008583691,
"grad_norm": 1.1502721309661865,
"learning_rate": 6.812729553087704e-09,
"loss": 0.7258,
"step": 1359
},
{
"epoch": 1.9456366237482117,
"grad_norm": 2.471219539642334,
"learning_rate": 6.4765933113439815e-09,
"loss": 0.7513,
"step": 1360
},
{
"epoch": 1.9470672389127324,
"grad_norm": 2.596886157989502,
"learning_rate": 6.148942634366439e-09,
"loss": 0.8226,
"step": 1361
},
{
"epoch": 1.948497854077253,
"grad_norm": 1.2276873588562012,
"learning_rate": 5.829779383726808e-09,
"loss": 0.7847,
"step": 1362
},
{
"epoch": 1.949928469241774,
"grad_norm": 10.255902290344238,
"learning_rate": 5.5191053727748905e-09,
"loss": 0.8118,
"step": 1363
},
{
"epoch": 1.9513590844062947,
"grad_norm": 1.835872769355774,
"learning_rate": 5.216922366628074e-09,
"loss": 0.7836,
"step": 1364
},
{
"epoch": 1.9527896995708156,
"grad_norm": 3.529498338699341,
"learning_rate": 4.923232082161999e-09,
"loss": 0.7899,
"step": 1365
},
{
"epoch": 1.9542203147353363,
"grad_norm": 3.05070424079895,
"learning_rate": 4.638036187999739e-09,
"loss": 0.8756,
"step": 1366
},
{
"epoch": 1.955650929899857,
"grad_norm": 3.6936333179473877,
"learning_rate": 4.361336304503305e-09,
"loss": 0.8157,
"step": 1367
},
{
"epoch": 1.9570815450643777,
"grad_norm": 52.86602020263672,
"learning_rate": 4.0931340037633214e-09,
"loss": 0.7565,
"step": 1368
},
{
"epoch": 1.9585121602288984,
"grad_norm": 1.9639256000518799,
"learning_rate": 3.833430809591698e-09,
"loss": 0.7229,
"step": 1369
},
{
"epoch": 1.959942775393419,
"grad_norm": 1.0925458669662476,
"learning_rate": 3.5822281975111395e-09,
"loss": 0.7935,
"step": 1370
},
{
"epoch": 1.9613733905579398,
"grad_norm": 7.261877059936523,
"learning_rate": 3.3395275947481484e-09,
"loss": 0.7111,
"step": 1371
},
{
"epoch": 1.9628040057224605,
"grad_norm": 1.3601925373077393,
"learning_rate": 3.105330380224536e-09,
"loss": 0.7941,
"step": 1372
},
{
"epoch": 1.9642346208869814,
"grad_norm": 5.424409866333008,
"learning_rate": 2.8796378845489245e-09,
"loss": 0.8544,
"step": 1373
},
{
"epoch": 1.9656652360515021,
"grad_norm": 2.5825531482696533,
"learning_rate": 2.6624513900102565e-09,
"loss": 0.763,
"step": 1374
},
{
"epoch": 1.967095851216023,
"grad_norm": 1.6688388586044312,
"learning_rate": 2.453772130569798e-09,
"loss": 0.7661,
"step": 1375
},
{
"epoch": 1.9685264663805437,
"grad_norm": 2.8896663188934326,
"learning_rate": 2.253601291854479e-09,
"loss": 0.7118,
"step": 1376
},
{
"epoch": 1.9699570815450644,
"grad_norm": 3.865675210952759,
"learning_rate": 2.061940011149566e-09,
"loss": 0.8666,
"step": 1377
},
{
"epoch": 1.9713876967095851,
"grad_norm": 1.1079707145690918,
"learning_rate": 1.8787893773931643e-09,
"loss": 0.732,
"step": 1378
},
{
"epoch": 1.9728183118741058,
"grad_norm": 1.7514995336532593,
"learning_rate": 1.7041504311692268e-09,
"loss": 0.7525,
"step": 1379
},
{
"epoch": 1.9742489270386265,
"grad_norm": 2.8372395038604736,
"learning_rate": 1.5380241647020564e-09,
"loss": 0.8642,
"step": 1380
},
{
"epoch": 1.9756795422031472,
"grad_norm": 2.040043592453003,
"learning_rate": 1.3804115218503112e-09,
"loss": 0.8039,
"step": 1381
},
{
"epoch": 1.977110157367668,
"grad_norm": 1.864327311515808,
"learning_rate": 1.2313133981020074e-09,
"loss": 0.8012,
"step": 1382
},
{
"epoch": 1.9785407725321889,
"grad_norm": 3.190329074859619,
"learning_rate": 1.090730640569193e-09,
"loss": 0.7958,
"step": 1383
},
{
"epoch": 1.9799713876967096,
"grad_norm": 2.387249708175659,
"learning_rate": 9.58664047983615e-10,
"loss": 0.678,
"step": 1384
},
{
"epoch": 1.9814020028612305,
"grad_norm": 8.965396881103516,
"learning_rate": 8.351143706910591e-10,
"loss": 0.7806,
"step": 1385
},
{
"epoch": 1.9828326180257512,
"grad_norm": 1.7268003225326538,
"learning_rate": 7.200823106485177e-10,
"loss": 0.8479,
"step": 1386
},
{
"epoch": 1.9842632331902719,
"grad_norm": 2.4086053371429443,
"learning_rate": 6.13568521419361e-10,
"loss": 0.7753,
"step": 1387
},
{
"epoch": 1.9856938483547926,
"grad_norm": 2.2736830711364746,
"learning_rate": 5.155736081691731e-10,
"loss": 0.7656,
"step": 1388
},
{
"epoch": 1.9871244635193133,
"grad_norm": 14.675554275512695,
"learning_rate": 4.2609812766375435e-10,
"loss": 0.7532,
"step": 1389
},
{
"epoch": 1.988555078683834,
"grad_norm": 1.2827365398406982,
"learning_rate": 3.451425882646242e-10,
"loss": 0.7951,
"step": 1390
},
{
"epoch": 1.9899856938483547,
"grad_norm": 3.003237009048462,
"learning_rate": 2.727074499266902e-10,
"loss": 0.6748,
"step": 1391
},
{
"epoch": 1.9914163090128756,
"grad_norm": 3.035632610321045,
"learning_rate": 2.0879312419574969e-10,
"loss": 0.7217,
"step": 1392
},
{
"epoch": 1.9928469241773963,
"grad_norm": 1.3384627103805542,
"learning_rate": 1.5339997420549256e-10,
"loss": 0.7284,
"step": 1393
},
{
"epoch": 1.994277539341917,
"grad_norm": 2.3145205974578857,
"learning_rate": 1.065283146765017e-10,
"loss": 0.7841,
"step": 1394
},
{
"epoch": 1.995708154506438,
"grad_norm": 3.6275460720062256,
"learning_rate": 6.817841191358865e-11,
"loss": 0.858,
"step": 1395
},
{
"epoch": 1.9971387696709586,
"grad_norm": 3.475882053375244,
"learning_rate": 3.83504838046278e-11,
"loss": 0.7867,
"step": 1396
},
{
"epoch": 1.9985693848354793,
"grad_norm": 4.988249778747559,
"learning_rate": 1.7044699819057652e-11,
"loss": 0.748,
"step": 1397
},
{
"epoch": 2.0,
"grad_norm": 2.542473554611206,
"learning_rate": 4.261181007381154e-12,
"loss": 0.8327,
"step": 1398
}
],
"logging_steps": 1,
"max_steps": 1398,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 350,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3983264234067198e+20,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}