NLPQwen2-7b / trainer_state.json
zeng981's picture
Upload 15 files
9bdf5b0 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.998799519807923,
"eval_steps": 500,
"global_step": 3747,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004001600640256103,
"grad_norm": 9.379745483398438,
"learning_rate": 5.999973638932638e-05,
"loss": 2.3329,
"num_input_tokens_seen": 78976,
"step": 5
},
{
"epoch": 0.008003201280512205,
"grad_norm": 1.9017620086669922,
"learning_rate": 5.999894556193823e-05,
"loss": 0.874,
"num_input_tokens_seen": 161792,
"step": 10
},
{
"epoch": 0.012004801920768308,
"grad_norm": 1.1301895380020142,
"learning_rate": 5.999762753173357e-05,
"loss": 0.4549,
"num_input_tokens_seen": 245504,
"step": 15
},
{
"epoch": 0.01600640256102441,
"grad_norm": 2.4106760025024414,
"learning_rate": 5.9995782321875545e-05,
"loss": 0.2059,
"num_input_tokens_seen": 324224,
"step": 20
},
{
"epoch": 0.020008003201280513,
"grad_norm": 1.2345476150512695,
"learning_rate": 5.999340996479194e-05,
"loss": 0.156,
"num_input_tokens_seen": 403072,
"step": 25
},
{
"epoch": 0.024009603841536616,
"grad_norm": 0.4067784547805786,
"learning_rate": 5.999051050217466e-05,
"loss": 0.0778,
"num_input_tokens_seen": 480256,
"step": 30
},
{
"epoch": 0.028011204481792718,
"grad_norm": 0.713360071182251,
"learning_rate": 5.9987083984979006e-05,
"loss": 0.0947,
"num_input_tokens_seen": 559616,
"step": 35
},
{
"epoch": 0.03201280512204882,
"grad_norm": 0.6779927611351013,
"learning_rate": 5.998313047342274e-05,
"loss": 0.0899,
"num_input_tokens_seen": 642176,
"step": 40
},
{
"epoch": 0.03601440576230492,
"grad_norm": 0.5412369966506958,
"learning_rate": 5.997865003698505e-05,
"loss": 0.0819,
"num_input_tokens_seen": 717440,
"step": 45
},
{
"epoch": 0.040016006402561026,
"grad_norm": 0.8061174154281616,
"learning_rate": 5.997364275440533e-05,
"loss": 0.0674,
"num_input_tokens_seen": 799616,
"step": 50
},
{
"epoch": 0.044017607042817125,
"grad_norm": 0.7229400277137756,
"learning_rate": 5.996810871368178e-05,
"loss": 0.0784,
"num_input_tokens_seen": 884480,
"step": 55
},
{
"epoch": 0.04801920768307323,
"grad_norm": 0.5562865734100342,
"learning_rate": 5.99620480120699e-05,
"loss": 0.0728,
"num_input_tokens_seen": 964864,
"step": 60
},
{
"epoch": 0.05202080832332933,
"grad_norm": 0.665501594543457,
"learning_rate": 5.995546075608071e-05,
"loss": 0.064,
"num_input_tokens_seen": 1042816,
"step": 65
},
{
"epoch": 0.056022408963585436,
"grad_norm": 0.5037021040916443,
"learning_rate": 5.994834706147895e-05,
"loss": 0.0794,
"num_input_tokens_seen": 1123968,
"step": 70
},
{
"epoch": 0.060024009603841535,
"grad_norm": 1.168228030204773,
"learning_rate": 5.994070705328102e-05,
"loss": 0.0775,
"num_input_tokens_seen": 1208064,
"step": 75
},
{
"epoch": 0.06402561024409764,
"grad_norm": 0.48576512932777405,
"learning_rate": 5.9932540865752753e-05,
"loss": 0.0796,
"num_input_tokens_seen": 1288704,
"step": 80
},
{
"epoch": 0.06802721088435375,
"grad_norm": 0.9761712551116943,
"learning_rate": 5.9923848642407096e-05,
"loss": 0.0848,
"num_input_tokens_seen": 1371136,
"step": 85
},
{
"epoch": 0.07202881152460984,
"grad_norm": 0.6563013195991516,
"learning_rate": 5.991463053600158e-05,
"loss": 0.069,
"num_input_tokens_seen": 1457408,
"step": 90
},
{
"epoch": 0.07603041216486595,
"grad_norm": 0.8250776529312134,
"learning_rate": 5.990488670853562e-05,
"loss": 0.0802,
"num_input_tokens_seen": 1541248,
"step": 95
},
{
"epoch": 0.08003201280512205,
"grad_norm": 0.9820340275764465,
"learning_rate": 5.9894617331247664e-05,
"loss": 0.078,
"num_input_tokens_seen": 1623296,
"step": 100
},
{
"epoch": 0.08403361344537816,
"grad_norm": 0.5198534727096558,
"learning_rate": 5.988382258461223e-05,
"loss": 0.0707,
"num_input_tokens_seen": 1705728,
"step": 105
},
{
"epoch": 0.08803521408563425,
"grad_norm": 0.6419425010681152,
"learning_rate": 5.987250265833667e-05,
"loss": 0.0614,
"num_input_tokens_seen": 1785216,
"step": 110
},
{
"epoch": 0.09203681472589036,
"grad_norm": 0.7757347226142883,
"learning_rate": 5.9860657751357876e-05,
"loss": 0.0591,
"num_input_tokens_seen": 1863424,
"step": 115
},
{
"epoch": 0.09603841536614646,
"grad_norm": 0.5099271535873413,
"learning_rate": 5.9848288071838777e-05,
"loss": 0.0605,
"num_input_tokens_seen": 1944448,
"step": 120
},
{
"epoch": 0.10004001600640255,
"grad_norm": 1.1185756921768188,
"learning_rate": 5.9835393837164675e-05,
"loss": 0.087,
"num_input_tokens_seen": 2025088,
"step": 125
},
{
"epoch": 0.10404161664665866,
"grad_norm": 0.46462324261665344,
"learning_rate": 5.982197527393943e-05,
"loss": 0.0661,
"num_input_tokens_seen": 2109056,
"step": 130
},
{
"epoch": 0.10804321728691477,
"grad_norm": 0.7254449129104614,
"learning_rate": 5.980803261798147e-05,
"loss": 0.0734,
"num_input_tokens_seen": 2185728,
"step": 135
},
{
"epoch": 0.11204481792717087,
"grad_norm": 0.6924275755882263,
"learning_rate": 5.979356611431967e-05,
"loss": 0.0545,
"num_input_tokens_seen": 2263424,
"step": 140
},
{
"epoch": 0.11604641856742696,
"grad_norm": 0.649114191532135,
"learning_rate": 5.9778576017189e-05,
"loss": 0.0572,
"num_input_tokens_seen": 2345856,
"step": 145
},
{
"epoch": 0.12004801920768307,
"grad_norm": 0.732025146484375,
"learning_rate": 5.9763062590026115e-05,
"loss": 0.0747,
"num_input_tokens_seen": 2420736,
"step": 150
},
{
"epoch": 0.12404961984793918,
"grad_norm": 0.8022703528404236,
"learning_rate": 5.974702610546467e-05,
"loss": 0.0669,
"num_input_tokens_seen": 2491648,
"step": 155
},
{
"epoch": 0.12805122048819528,
"grad_norm": 0.3996295928955078,
"learning_rate": 5.973046684533056e-05,
"loss": 0.0681,
"num_input_tokens_seen": 2566400,
"step": 160
},
{
"epoch": 0.13205282112845138,
"grad_norm": 0.5743430852890015,
"learning_rate": 5.9713385100636976e-05,
"loss": 0.0541,
"num_input_tokens_seen": 2648448,
"step": 165
},
{
"epoch": 0.1360544217687075,
"grad_norm": 0.5441402792930603,
"learning_rate": 5.969578117157926e-05,
"loss": 0.0659,
"num_input_tokens_seen": 2732416,
"step": 170
},
{
"epoch": 0.1400560224089636,
"grad_norm": 0.673743724822998,
"learning_rate": 5.9677655367529666e-05,
"loss": 0.068,
"num_input_tokens_seen": 2814720,
"step": 175
},
{
"epoch": 0.14405762304921968,
"grad_norm": 0.4986773729324341,
"learning_rate": 5.965900800703187e-05,
"loss": 0.0452,
"num_input_tokens_seen": 2894336,
"step": 180
},
{
"epoch": 0.1480592236894758,
"grad_norm": 0.7553830146789551,
"learning_rate": 5.963983941779544e-05,
"loss": 0.0446,
"num_input_tokens_seen": 2971904,
"step": 185
},
{
"epoch": 0.1520608243297319,
"grad_norm": 0.6513676047325134,
"learning_rate": 5.962014993669001e-05,
"loss": 0.0484,
"num_input_tokens_seen": 3054336,
"step": 190
},
{
"epoch": 0.15606242496998798,
"grad_norm": 0.45821425318717957,
"learning_rate": 5.959993990973941e-05,
"loss": 0.0494,
"num_input_tokens_seen": 3132800,
"step": 195
},
{
"epoch": 0.1600640256102441,
"grad_norm": 0.5005064606666565,
"learning_rate": 5.957920969211556e-05,
"loss": 0.0683,
"num_input_tokens_seen": 3211776,
"step": 200
},
{
"epoch": 0.1640656262505002,
"grad_norm": 0.44280120730400085,
"learning_rate": 5.955795964813224e-05,
"loss": 0.0557,
"num_input_tokens_seen": 3289216,
"step": 205
},
{
"epoch": 0.16806722689075632,
"grad_norm": 0.4648596942424774,
"learning_rate": 5.9536190151238675e-05,
"loss": 0.061,
"num_input_tokens_seen": 3370368,
"step": 210
},
{
"epoch": 0.1720688275310124,
"grad_norm": 0.4277510643005371,
"learning_rate": 5.951390158401298e-05,
"loss": 0.058,
"num_input_tokens_seen": 3453312,
"step": 215
},
{
"epoch": 0.1760704281712685,
"grad_norm": 0.5151507258415222,
"learning_rate": 5.949109433815543e-05,
"loss": 0.066,
"num_input_tokens_seen": 3532160,
"step": 220
},
{
"epoch": 0.18007202881152462,
"grad_norm": 0.6021590232849121,
"learning_rate": 5.946776881448159e-05,
"loss": 0.0366,
"num_input_tokens_seen": 3610112,
"step": 225
},
{
"epoch": 0.1840736294517807,
"grad_norm": 0.4898953437805176,
"learning_rate": 5.9443925422915274e-05,
"loss": 0.0554,
"num_input_tokens_seen": 3694720,
"step": 230
},
{
"epoch": 0.1880752300920368,
"grad_norm": 0.41998904943466187,
"learning_rate": 5.9419564582481306e-05,
"loss": 0.0596,
"num_input_tokens_seen": 3779328,
"step": 235
},
{
"epoch": 0.19207683073229292,
"grad_norm": 0.43542566895484924,
"learning_rate": 5.939468672129819e-05,
"loss": 0.0507,
"num_input_tokens_seen": 3857408,
"step": 240
},
{
"epoch": 0.19607843137254902,
"grad_norm": 0.45856836438179016,
"learning_rate": 5.936929227657058e-05,
"loss": 0.0575,
"num_input_tokens_seen": 3939968,
"step": 245
},
{
"epoch": 0.2000800320128051,
"grad_norm": 0.5000536441802979,
"learning_rate": 5.9343381694581585e-05,
"loss": 0.0442,
"num_input_tokens_seen": 4025856,
"step": 250
},
{
"epoch": 0.20408163265306123,
"grad_norm": 0.5097799897193909,
"learning_rate": 5.9316955430684925e-05,
"loss": 0.0443,
"num_input_tokens_seen": 4107776,
"step": 255
},
{
"epoch": 0.20808323329331732,
"grad_norm": 0.5326385498046875,
"learning_rate": 5.929001394929697e-05,
"loss": 0.0481,
"num_input_tokens_seen": 4188160,
"step": 260
},
{
"epoch": 0.21208483393357344,
"grad_norm": 0.40312302112579346,
"learning_rate": 5.926255772388851e-05,
"loss": 0.0464,
"num_input_tokens_seen": 4277248,
"step": 265
},
{
"epoch": 0.21608643457382953,
"grad_norm": 0.5460879802703857,
"learning_rate": 5.923458723697649e-05,
"loss": 0.0484,
"num_input_tokens_seen": 4361472,
"step": 270
},
{
"epoch": 0.22008803521408563,
"grad_norm": 0.6051120758056641,
"learning_rate": 5.92061029801155e-05,
"loss": 0.0521,
"num_input_tokens_seen": 4445056,
"step": 275
},
{
"epoch": 0.22408963585434175,
"grad_norm": 0.6960827708244324,
"learning_rate": 5.9177105453889144e-05,
"loss": 0.0615,
"num_input_tokens_seen": 4525568,
"step": 280
},
{
"epoch": 0.22809123649459784,
"grad_norm": 0.5051229000091553,
"learning_rate": 5.914759516790126e-05,
"loss": 0.0647,
"num_input_tokens_seen": 4609792,
"step": 285
},
{
"epoch": 0.23209283713485393,
"grad_norm": 0.6656054258346558,
"learning_rate": 5.911757264076692e-05,
"loss": 0.05,
"num_input_tokens_seen": 4690432,
"step": 290
},
{
"epoch": 0.23609443777511005,
"grad_norm": 0.6766570806503296,
"learning_rate": 5.90870384001034e-05,
"loss": 0.0523,
"num_input_tokens_seen": 4771200,
"step": 295
},
{
"epoch": 0.24009603841536614,
"grad_norm": 0.5046851634979248,
"learning_rate": 5.905599298252079e-05,
"loss": 0.0633,
"num_input_tokens_seen": 4856064,
"step": 300
},
{
"epoch": 0.24409763905562226,
"grad_norm": 0.6210601329803467,
"learning_rate": 5.9024436933612646e-05,
"loss": 0.0629,
"num_input_tokens_seen": 4938112,
"step": 305
},
{
"epoch": 0.24809923969587835,
"grad_norm": 0.5175065994262695,
"learning_rate": 5.899237080794641e-05,
"loss": 0.0476,
"num_input_tokens_seen": 5017984,
"step": 310
},
{
"epoch": 0.25210084033613445,
"grad_norm": 0.7169041037559509,
"learning_rate": 5.89597951690536e-05,
"loss": 0.0499,
"num_input_tokens_seen": 5101696,
"step": 315
},
{
"epoch": 0.25610244097639057,
"grad_norm": 0.5738083124160767,
"learning_rate": 5.8926710589419965e-05,
"loss": 0.0484,
"num_input_tokens_seen": 5182848,
"step": 320
},
{
"epoch": 0.2601040416166467,
"grad_norm": 0.5108537077903748,
"learning_rate": 5.889311765047539e-05,
"loss": 0.0617,
"num_input_tokens_seen": 5264256,
"step": 325
},
{
"epoch": 0.26410564225690275,
"grad_norm": 0.5295582413673401,
"learning_rate": 5.885901694258369e-05,
"loss": 0.0476,
"num_input_tokens_seen": 5338624,
"step": 330
},
{
"epoch": 0.26810724289715887,
"grad_norm": 0.6223270297050476,
"learning_rate": 5.8824409065032245e-05,
"loss": 0.0502,
"num_input_tokens_seen": 5420672,
"step": 335
},
{
"epoch": 0.272108843537415,
"grad_norm": 0.408581405878067,
"learning_rate": 5.8789294626021445e-05,
"loss": 0.0454,
"num_input_tokens_seen": 5497216,
"step": 340
},
{
"epoch": 0.27611044417767105,
"grad_norm": 0.5283952951431274,
"learning_rate": 5.8753674242654e-05,
"loss": 0.0528,
"num_input_tokens_seen": 5581056,
"step": 345
},
{
"epoch": 0.2801120448179272,
"grad_norm": 0.6091347932815552,
"learning_rate": 5.871754854092416e-05,
"loss": 0.0597,
"num_input_tokens_seen": 5661440,
"step": 350
},
{
"epoch": 0.2841136454581833,
"grad_norm": 0.4604595899581909,
"learning_rate": 5.868091815570661e-05,
"loss": 0.0563,
"num_input_tokens_seen": 5737344,
"step": 355
},
{
"epoch": 0.28811524609843936,
"grad_norm": 0.5345984697341919,
"learning_rate": 5.864378373074539e-05,
"loss": 0.0469,
"num_input_tokens_seen": 5817472,
"step": 360
},
{
"epoch": 0.2921168467386955,
"grad_norm": 0.359485000371933,
"learning_rate": 5.860614591864255e-05,
"loss": 0.0525,
"num_input_tokens_seen": 5896704,
"step": 365
},
{
"epoch": 0.2961184473789516,
"grad_norm": 0.5373000502586365,
"learning_rate": 5.856800538084668e-05,
"loss": 0.0667,
"num_input_tokens_seen": 5981952,
"step": 370
},
{
"epoch": 0.30012004801920766,
"grad_norm": 0.5203957557678223,
"learning_rate": 5.8529362787641326e-05,
"loss": 0.0527,
"num_input_tokens_seen": 6067456,
"step": 375
},
{
"epoch": 0.3041216486594638,
"grad_norm": 0.49910980463027954,
"learning_rate": 5.849021881813314e-05,
"loss": 0.0528,
"num_input_tokens_seen": 6150656,
"step": 380
},
{
"epoch": 0.3081232492997199,
"grad_norm": 0.7812584042549133,
"learning_rate": 5.845057416024001e-05,
"loss": 0.0554,
"num_input_tokens_seen": 6231808,
"step": 385
},
{
"epoch": 0.31212484993997597,
"grad_norm": 0.6113467812538147,
"learning_rate": 5.841042951067892e-05,
"loss": 0.054,
"num_input_tokens_seen": 6318208,
"step": 390
},
{
"epoch": 0.3161264505802321,
"grad_norm": 0.5749547481536865,
"learning_rate": 5.836978557495376e-05,
"loss": 0.0541,
"num_input_tokens_seen": 6401152,
"step": 395
},
{
"epoch": 0.3201280512204882,
"grad_norm": 0.5039237141609192,
"learning_rate": 5.832864306734287e-05,
"loss": 0.0364,
"num_input_tokens_seen": 6485888,
"step": 400
},
{
"epoch": 0.3241296518607443,
"grad_norm": 0.662135899066925,
"learning_rate": 5.828700271088653e-05,
"loss": 0.051,
"num_input_tokens_seen": 6568320,
"step": 405
},
{
"epoch": 0.3281312525010004,
"grad_norm": 0.5175401568412781,
"learning_rate": 5.8244865237374234e-05,
"loss": 0.0416,
"num_input_tokens_seen": 6646016,
"step": 410
},
{
"epoch": 0.3321328531412565,
"grad_norm": 0.4517100155353546,
"learning_rate": 5.8202231387331844e-05,
"loss": 0.0433,
"num_input_tokens_seen": 6727680,
"step": 415
},
{
"epoch": 0.33613445378151263,
"grad_norm": 0.40900635719299316,
"learning_rate": 5.815910191000854e-05,
"loss": 0.0504,
"num_input_tokens_seen": 6809344,
"step": 420
},
{
"epoch": 0.3401360544217687,
"grad_norm": 0.5824533104896545,
"learning_rate": 5.811547756336371e-05,
"loss": 0.0472,
"num_input_tokens_seen": 6889216,
"step": 425
},
{
"epoch": 0.3441376550620248,
"grad_norm": 0.49456438422203064,
"learning_rate": 5.807135911405356e-05,
"loss": 0.0359,
"num_input_tokens_seen": 6970368,
"step": 430
},
{
"epoch": 0.34813925570228094,
"grad_norm": 0.41590744256973267,
"learning_rate": 5.80267473374177e-05,
"loss": 0.0484,
"num_input_tokens_seen": 7049984,
"step": 435
},
{
"epoch": 0.352140856342537,
"grad_norm": 0.5233580470085144,
"learning_rate": 5.798164301746553e-05,
"loss": 0.0507,
"num_input_tokens_seen": 7130624,
"step": 440
},
{
"epoch": 0.3561424569827931,
"grad_norm": 0.336330771446228,
"learning_rate": 5.793604694686236e-05,
"loss": 0.0613,
"num_input_tokens_seen": 7216256,
"step": 445
},
{
"epoch": 0.36014405762304924,
"grad_norm": 0.4404323995113373,
"learning_rate": 5.7889959926915585e-05,
"loss": 0.053,
"num_input_tokens_seen": 7298432,
"step": 450
},
{
"epoch": 0.3641456582633053,
"grad_norm": 0.533418595790863,
"learning_rate": 5.784338276756059e-05,
"loss": 0.0334,
"num_input_tokens_seen": 7380352,
"step": 455
},
{
"epoch": 0.3681472589035614,
"grad_norm": 0.27110928297042847,
"learning_rate": 5.7796316287346425e-05,
"loss": 0.0357,
"num_input_tokens_seen": 7459968,
"step": 460
},
{
"epoch": 0.37214885954381755,
"grad_norm": 0.45250481367111206,
"learning_rate": 5.774876131342156e-05,
"loss": 0.0487,
"num_input_tokens_seen": 7542528,
"step": 465
},
{
"epoch": 0.3761504601840736,
"grad_norm": 0.6789034605026245,
"learning_rate": 5.770071868151923e-05,
"loss": 0.0479,
"num_input_tokens_seen": 7618688,
"step": 470
},
{
"epoch": 0.38015206082432973,
"grad_norm": 0.5371197462081909,
"learning_rate": 5.765218923594281e-05,
"loss": 0.0451,
"num_input_tokens_seen": 7699200,
"step": 475
},
{
"epoch": 0.38415366146458585,
"grad_norm": 0.5418415665626526,
"learning_rate": 5.760317382955094e-05,
"loss": 0.0682,
"num_input_tokens_seen": 7777152,
"step": 480
},
{
"epoch": 0.3881552621048419,
"grad_norm": 0.34202393889427185,
"learning_rate": 5.7553673323742596e-05,
"loss": 0.0531,
"num_input_tokens_seen": 7854720,
"step": 485
},
{
"epoch": 0.39215686274509803,
"grad_norm": 0.48411786556243896,
"learning_rate": 5.750368858844188e-05,
"loss": 0.0389,
"num_input_tokens_seen": 7937408,
"step": 490
},
{
"epoch": 0.39615846338535415,
"grad_norm": 0.48644712567329407,
"learning_rate": 5.745322050208277e-05,
"loss": 0.0565,
"num_input_tokens_seen": 8022144,
"step": 495
},
{
"epoch": 0.4001600640256102,
"grad_norm": 0.39028334617614746,
"learning_rate": 5.740226995159369e-05,
"loss": 0.0408,
"num_input_tokens_seen": 8104192,
"step": 500
},
{
"epoch": 0.40416166466586634,
"grad_norm": 0.25846999883651733,
"learning_rate": 5.73508378323819e-05,
"loss": 0.036,
"num_input_tokens_seen": 8182528,
"step": 505
},
{
"epoch": 0.40816326530612246,
"grad_norm": 0.5509349703788757,
"learning_rate": 5.7298925048317764e-05,
"loss": 0.0558,
"num_input_tokens_seen": 8260736,
"step": 510
},
{
"epoch": 0.4121648659463786,
"grad_norm": 0.3060499429702759,
"learning_rate": 5.724653251171889e-05,
"loss": 0.039,
"num_input_tokens_seen": 8339328,
"step": 515
},
{
"epoch": 0.41616646658663464,
"grad_norm": 0.4803677797317505,
"learning_rate": 5.7193661143334076e-05,
"loss": 0.04,
"num_input_tokens_seen": 8416512,
"step": 520
},
{
"epoch": 0.42016806722689076,
"grad_norm": 0.3582713007926941,
"learning_rate": 5.714031187232711e-05,
"loss": 0.0462,
"num_input_tokens_seen": 8501888,
"step": 525
},
{
"epoch": 0.4241696678671469,
"grad_norm": 0.46847158670425415,
"learning_rate": 5.7086485636260476e-05,
"loss": 0.0559,
"num_input_tokens_seen": 8583552,
"step": 530
},
{
"epoch": 0.42817126850740295,
"grad_norm": 0.6372350454330444,
"learning_rate": 5.7032183381078876e-05,
"loss": 0.0445,
"num_input_tokens_seen": 8664448,
"step": 535
},
{
"epoch": 0.43217286914765907,
"grad_norm": 0.47640711069107056,
"learning_rate": 5.6977406061092574e-05,
"loss": 0.0401,
"num_input_tokens_seen": 8746496,
"step": 540
},
{
"epoch": 0.4361744697879152,
"grad_norm": 0.5731444954872131,
"learning_rate": 5.692215463896065e-05,
"loss": 0.0439,
"num_input_tokens_seen": 8820224,
"step": 545
},
{
"epoch": 0.44017607042817125,
"grad_norm": 0.41100820899009705,
"learning_rate": 5.6866430085674086e-05,
"loss": 0.0438,
"num_input_tokens_seen": 8899328,
"step": 550
},
{
"epoch": 0.44417767106842737,
"grad_norm": 0.40955179929733276,
"learning_rate": 5.6810233380538676e-05,
"loss": 0.04,
"num_input_tokens_seen": 8979328,
"step": 555
},
{
"epoch": 0.4481792717086835,
"grad_norm": 0.4302396774291992,
"learning_rate": 5.675356551115784e-05,
"loss": 0.0415,
"num_input_tokens_seen": 9066624,
"step": 560
},
{
"epoch": 0.45218087234893956,
"grad_norm": 0.3928489089012146,
"learning_rate": 5.6696427473415254e-05,
"loss": 0.0393,
"num_input_tokens_seen": 9147136,
"step": 565
},
{
"epoch": 0.4561824729891957,
"grad_norm": 0.5838471055030823,
"learning_rate": 5.6638820271457375e-05,
"loss": 0.0495,
"num_input_tokens_seen": 9228672,
"step": 570
},
{
"epoch": 0.4601840736294518,
"grad_norm": 0.40209439396858215,
"learning_rate": 5.658074491767575e-05,
"loss": 0.0467,
"num_input_tokens_seen": 9315200,
"step": 575
},
{
"epoch": 0.46418567426970786,
"grad_norm": 0.3072943091392517,
"learning_rate": 5.652220243268925e-05,
"loss": 0.0363,
"num_input_tokens_seen": 9401344,
"step": 580
},
{
"epoch": 0.468187274909964,
"grad_norm": 0.5420807600021362,
"learning_rate": 5.6463193845326134e-05,
"loss": 0.0469,
"num_input_tokens_seen": 9480192,
"step": 585
},
{
"epoch": 0.4721888755502201,
"grad_norm": 0.4435482919216156,
"learning_rate": 5.640372019260597e-05,
"loss": 0.0503,
"num_input_tokens_seen": 9561856,
"step": 590
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.2784574031829834,
"learning_rate": 5.63437825197214e-05,
"loss": 0.03,
"num_input_tokens_seen": 9643776,
"step": 595
},
{
"epoch": 0.4801920768307323,
"grad_norm": 0.34654027223587036,
"learning_rate": 5.62833818800198e-05,
"loss": 0.0354,
"num_input_tokens_seen": 9725952,
"step": 600
},
{
"epoch": 0.4841936774709884,
"grad_norm": 0.6582991480827332,
"learning_rate": 5.622251933498469e-05,
"loss": 0.0447,
"num_input_tokens_seen": 9803008,
"step": 605
},
{
"epoch": 0.4881952781112445,
"grad_norm": 0.7253966331481934,
"learning_rate": 5.616119595421719e-05,
"loss": 0.0457,
"num_input_tokens_seen": 9881216,
"step": 610
},
{
"epoch": 0.4921968787515006,
"grad_norm": 0.41951698064804077,
"learning_rate": 5.6099412815417144e-05,
"loss": 0.0494,
"num_input_tokens_seen": 9961344,
"step": 615
},
{
"epoch": 0.4961984793917567,
"grad_norm": 0.4950437843799591,
"learning_rate": 5.603717100436419e-05,
"loss": 0.0486,
"num_input_tokens_seen": 10040960,
"step": 620
},
{
"epoch": 0.5002000800320128,
"grad_norm": 0.5678931474685669,
"learning_rate": 5.5974471614898755e-05,
"loss": 0.0419,
"num_input_tokens_seen": 10119936,
"step": 625
},
{
"epoch": 0.5042016806722689,
"grad_norm": 0.22742347419261932,
"learning_rate": 5.5911315748902685e-05,
"loss": 0.0434,
"num_input_tokens_seen": 10207360,
"step": 630
},
{
"epoch": 0.508203281312525,
"grad_norm": 0.40390047430992126,
"learning_rate": 5.584770451628001e-05,
"loss": 0.0405,
"num_input_tokens_seen": 10286336,
"step": 635
},
{
"epoch": 0.5122048819527811,
"grad_norm": 0.6086333990097046,
"learning_rate": 5.57836390349374e-05,
"loss": 0.0477,
"num_input_tokens_seen": 10376704,
"step": 640
},
{
"epoch": 0.5162064825930373,
"grad_norm": 0.4355417490005493,
"learning_rate": 5.571912043076451e-05,
"loss": 0.0424,
"num_input_tokens_seen": 10456960,
"step": 645
},
{
"epoch": 0.5202080832332934,
"grad_norm": 0.3973543643951416,
"learning_rate": 5.565414983761416e-05,
"loss": 0.045,
"num_input_tokens_seen": 10534912,
"step": 650
},
{
"epoch": 0.5242096838735494,
"grad_norm": 0.46887820959091187,
"learning_rate": 5.558872839728249e-05,
"loss": 0.0464,
"num_input_tokens_seen": 10613888,
"step": 655
},
{
"epoch": 0.5282112845138055,
"grad_norm": 0.46044033765792847,
"learning_rate": 5.5522857259488834e-05,
"loss": 0.0335,
"num_input_tokens_seen": 10693504,
"step": 660
},
{
"epoch": 0.5322128851540616,
"grad_norm": 0.37409427762031555,
"learning_rate": 5.545653758185551e-05,
"loss": 0.0406,
"num_input_tokens_seen": 10775680,
"step": 665
},
{
"epoch": 0.5362144857943177,
"grad_norm": 0.5527156591415405,
"learning_rate": 5.5389770529887516e-05,
"loss": 0.0493,
"num_input_tokens_seen": 10853632,
"step": 670
},
{
"epoch": 0.5402160864345739,
"grad_norm": 0.41998937726020813,
"learning_rate": 5.532255727695203e-05,
"loss": 0.0427,
"num_input_tokens_seen": 10933376,
"step": 675
},
{
"epoch": 0.54421768707483,
"grad_norm": 0.5678915977478027,
"learning_rate": 5.5254899004257786e-05,
"loss": 0.0429,
"num_input_tokens_seen": 11013248,
"step": 680
},
{
"epoch": 0.548219287715086,
"grad_norm": 0.3396507799625397,
"learning_rate": 5.518679690083428e-05,
"loss": 0.044,
"num_input_tokens_seen": 11095424,
"step": 685
},
{
"epoch": 0.5522208883553421,
"grad_norm": 0.21728582680225372,
"learning_rate": 5.5118252163510955e-05,
"loss": 0.0415,
"num_input_tokens_seen": 11179520,
"step": 690
},
{
"epoch": 0.5562224889955982,
"grad_norm": 0.4980517029762268,
"learning_rate": 5.504926599689609e-05,
"loss": 0.0503,
"num_input_tokens_seen": 11256320,
"step": 695
},
{
"epoch": 0.5602240896358543,
"grad_norm": 0.40451306104660034,
"learning_rate": 5.4979839613355685e-05,
"loss": 0.0497,
"num_input_tokens_seen": 11337088,
"step": 700
},
{
"epoch": 0.5642256902761105,
"grad_norm": 0.33498549461364746,
"learning_rate": 5.490997423299212e-05,
"loss": 0.0351,
"num_input_tokens_seen": 11417216,
"step": 705
},
{
"epoch": 0.5682272909163666,
"grad_norm": 0.37525609135627747,
"learning_rate": 5.483967108362273e-05,
"loss": 0.0351,
"num_input_tokens_seen": 11494528,
"step": 710
},
{
"epoch": 0.5722288915566226,
"grad_norm": 0.46845880150794983,
"learning_rate": 5.476893140075822e-05,
"loss": 0.0455,
"num_input_tokens_seen": 11573632,
"step": 715
},
{
"epoch": 0.5762304921968787,
"grad_norm": 0.4156310558319092,
"learning_rate": 5.469775642758094e-05,
"loss": 0.0389,
"num_input_tokens_seen": 11651712,
"step": 720
},
{
"epoch": 0.5802320928371348,
"grad_norm": 0.5103508234024048,
"learning_rate": 5.462614741492308e-05,
"loss": 0.0358,
"num_input_tokens_seen": 11736704,
"step": 725
},
{
"epoch": 0.584233693477391,
"grad_norm": 0.8223247528076172,
"learning_rate": 5.455410562124463e-05,
"loss": 0.0517,
"num_input_tokens_seen": 11816448,
"step": 730
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.3280523717403412,
"learning_rate": 5.448163231261132e-05,
"loss": 0.0348,
"num_input_tokens_seen": 11899520,
"step": 735
},
{
"epoch": 0.5922368947579032,
"grad_norm": 0.335967093706131,
"learning_rate": 5.440872876267233e-05,
"loss": 0.0424,
"num_input_tokens_seen": 11978112,
"step": 740
},
{
"epoch": 0.5962384953981593,
"grad_norm": 0.46126681566238403,
"learning_rate": 5.433539625263791e-05,
"loss": 0.0385,
"num_input_tokens_seen": 12057472,
"step": 745
},
{
"epoch": 0.6002400960384153,
"grad_norm": 0.42888614535331726,
"learning_rate": 5.42616360712569e-05,
"loss": 0.038,
"num_input_tokens_seen": 12140672,
"step": 750
},
{
"epoch": 0.6042416966786714,
"grad_norm": 0.3738919794559479,
"learning_rate": 5.418744951479402e-05,
"loss": 0.0423,
"num_input_tokens_seen": 12222592,
"step": 755
},
{
"epoch": 0.6082432973189276,
"grad_norm": 0.25246375799179077,
"learning_rate": 5.411283788700717e-05,
"loss": 0.0447,
"num_input_tokens_seen": 12302592,
"step": 760
},
{
"epoch": 0.6122448979591837,
"grad_norm": 0.4251025915145874,
"learning_rate": 5.403780249912443e-05,
"loss": 0.0415,
"num_input_tokens_seen": 12390784,
"step": 765
},
{
"epoch": 0.6162464985994398,
"grad_norm": 0.4907916188240051,
"learning_rate": 5.3962344669821075e-05,
"loss": 0.0347,
"num_input_tokens_seen": 12470400,
"step": 770
},
{
"epoch": 0.6202480992396959,
"grad_norm": 0.456727534532547,
"learning_rate": 5.3886465725196396e-05,
"loss": 0.0389,
"num_input_tokens_seen": 12551296,
"step": 775
},
{
"epoch": 0.6242496998799519,
"grad_norm": 0.4173440933227539,
"learning_rate": 5.381016699875037e-05,
"loss": 0.0374,
"num_input_tokens_seen": 12632192,
"step": 780
},
{
"epoch": 0.6282513005202081,
"grad_norm": 0.4084208011627197,
"learning_rate": 5.373344983136023e-05,
"loss": 0.0416,
"num_input_tokens_seen": 12709120,
"step": 785
},
{
"epoch": 0.6322529011604642,
"grad_norm": 0.43168023228645325,
"learning_rate": 5.365631557125694e-05,
"loss": 0.0334,
"num_input_tokens_seen": 12785408,
"step": 790
},
{
"epoch": 0.6362545018007203,
"grad_norm": 0.5255063772201538,
"learning_rate": 5.357876557400144e-05,
"loss": 0.0395,
"num_input_tokens_seen": 12867072,
"step": 795
},
{
"epoch": 0.6402561024409764,
"grad_norm": 0.5458687543869019,
"learning_rate": 5.350080120246087e-05,
"loss": 0.0421,
"num_input_tokens_seen": 12948864,
"step": 800
},
{
"epoch": 0.6442577030812325,
"grad_norm": 0.4166968762874603,
"learning_rate": 5.342242382678458e-05,
"loss": 0.0302,
"num_input_tokens_seen": 13030272,
"step": 805
},
{
"epoch": 0.6482593037214885,
"grad_norm": 0.31682589650154114,
"learning_rate": 5.334363482438012e-05,
"loss": 0.0339,
"num_input_tokens_seen": 13111680,
"step": 810
},
{
"epoch": 0.6522609043617447,
"grad_norm": 0.8049325346946716,
"learning_rate": 5.326443557988893e-05,
"loss": 0.0466,
"num_input_tokens_seen": 13192704,
"step": 815
},
{
"epoch": 0.6562625050020008,
"grad_norm": 0.49820753931999207,
"learning_rate": 5.31848274851621e-05,
"loss": 0.0304,
"num_input_tokens_seen": 13271680,
"step": 820
},
{
"epoch": 0.6602641056422569,
"grad_norm": 0.4695858061313629,
"learning_rate": 5.310481193923587e-05,
"loss": 0.0318,
"num_input_tokens_seen": 13352320,
"step": 825
},
{
"epoch": 0.664265706282513,
"grad_norm": 0.3038730025291443,
"learning_rate": 5.302439034830702e-05,
"loss": 0.0445,
"num_input_tokens_seen": 13428736,
"step": 830
},
{
"epoch": 0.6682673069227691,
"grad_norm": 0.3783183991909027,
"learning_rate": 5.2943564125708215e-05,
"loss": 0.0381,
"num_input_tokens_seen": 13511936,
"step": 835
},
{
"epoch": 0.6722689075630253,
"grad_norm": 0.40264102816581726,
"learning_rate": 5.2862334691883105e-05,
"loss": 0.0416,
"num_input_tokens_seen": 13596672,
"step": 840
},
{
"epoch": 0.6762705082032813,
"grad_norm": 0.3763561248779297,
"learning_rate": 5.2780703474361425e-05,
"loss": 0.0366,
"num_input_tokens_seen": 13680768,
"step": 845
},
{
"epoch": 0.6802721088435374,
"grad_norm": 0.5162457823753357,
"learning_rate": 5.269867190773385e-05,
"loss": 0.0329,
"num_input_tokens_seen": 13761152,
"step": 850
},
{
"epoch": 0.6842737094837935,
"grad_norm": 0.3188195824623108,
"learning_rate": 5.261624143362681e-05,
"loss": 0.0452,
"num_input_tokens_seen": 13839488,
"step": 855
},
{
"epoch": 0.6882753101240496,
"grad_norm": 0.48506471514701843,
"learning_rate": 5.253341350067717e-05,
"loss": 0.046,
"num_input_tokens_seen": 13918336,
"step": 860
},
{
"epoch": 0.6922769107643058,
"grad_norm": 0.25305283069610596,
"learning_rate": 5.245018956450674e-05,
"loss": 0.0392,
"num_input_tokens_seen": 14000384,
"step": 865
},
{
"epoch": 0.6962785114045619,
"grad_norm": 0.4232555031776428,
"learning_rate": 5.23665710876967e-05,
"loss": 0.0467,
"num_input_tokens_seen": 14084224,
"step": 870
},
{
"epoch": 0.7002801120448179,
"grad_norm": 0.5432577133178711,
"learning_rate": 5.2282559539761935e-05,
"loss": 0.0394,
"num_input_tokens_seen": 14168704,
"step": 875
},
{
"epoch": 0.704281712685074,
"grad_norm": 0.4924187660217285,
"learning_rate": 5.219815639712515e-05,
"loss": 0.0395,
"num_input_tokens_seen": 14248064,
"step": 880
},
{
"epoch": 0.7082833133253301,
"grad_norm": 0.43043017387390137,
"learning_rate": 5.211336314309096e-05,
"loss": 0.0319,
"num_input_tokens_seen": 14331136,
"step": 885
},
{
"epoch": 0.7122849139655862,
"grad_norm": 0.3098335564136505,
"learning_rate": 5.2028181267819837e-05,
"loss": 0.0411,
"num_input_tokens_seen": 14409088,
"step": 890
},
{
"epoch": 0.7162865146058424,
"grad_norm": 0.293866902589798,
"learning_rate": 5.194261226830186e-05,
"loss": 0.0394,
"num_input_tokens_seen": 14486784,
"step": 895
},
{
"epoch": 0.7202881152460985,
"grad_norm": 0.5189549922943115,
"learning_rate": 5.185665764833049e-05,
"loss": 0.0414,
"num_input_tokens_seen": 14572800,
"step": 900
},
{
"epoch": 0.7242897158863545,
"grad_norm": 0.47962677478790283,
"learning_rate": 5.177031891847606e-05,
"loss": 0.0378,
"num_input_tokens_seen": 14654848,
"step": 905
},
{
"epoch": 0.7282913165266106,
"grad_norm": 0.5880881547927856,
"learning_rate": 5.16835975960593e-05,
"loss": 0.0388,
"num_input_tokens_seen": 14738176,
"step": 910
},
{
"epoch": 0.7322929171668667,
"grad_norm": 0.4656384289264679,
"learning_rate": 5.159649520512462e-05,
"loss": 0.0364,
"num_input_tokens_seen": 14811776,
"step": 915
},
{
"epoch": 0.7362945178071229,
"grad_norm": 0.43349361419677734,
"learning_rate": 5.150901327641335e-05,
"loss": 0.0484,
"num_input_tokens_seen": 14897152,
"step": 920
},
{
"epoch": 0.740296118447379,
"grad_norm": 0.3207763731479645,
"learning_rate": 5.142115334733684e-05,
"loss": 0.0298,
"num_input_tokens_seen": 14976512,
"step": 925
},
{
"epoch": 0.7442977190876351,
"grad_norm": 0.3222317397594452,
"learning_rate": 5.133291696194941e-05,
"loss": 0.0349,
"num_input_tokens_seen": 15054720,
"step": 930
},
{
"epoch": 0.7482993197278912,
"grad_norm": 0.3936599791049957,
"learning_rate": 5.124430567092127e-05,
"loss": 0.0429,
"num_input_tokens_seen": 15133824,
"step": 935
},
{
"epoch": 0.7523009203681472,
"grad_norm": 0.41498124599456787,
"learning_rate": 5.115532103151124e-05,
"loss": 0.0326,
"num_input_tokens_seen": 15215616,
"step": 940
},
{
"epoch": 0.7563025210084033,
"grad_norm": 0.24888166785240173,
"learning_rate": 5.1065964607539345e-05,
"loss": 0.0369,
"num_input_tokens_seen": 15300224,
"step": 945
},
{
"epoch": 0.7603041216486595,
"grad_norm": 0.45428845286369324,
"learning_rate": 5.0976237969359415e-05,
"loss": 0.0359,
"num_input_tokens_seen": 15384576,
"step": 950
},
{
"epoch": 0.7643057222889156,
"grad_norm": 0.5684335827827454,
"learning_rate": 5.088614269383141e-05,
"loss": 0.0461,
"num_input_tokens_seen": 15464832,
"step": 955
},
{
"epoch": 0.7683073229291717,
"grad_norm": 0.2668842077255249,
"learning_rate": 5.079568036429375e-05,
"loss": 0.0402,
"num_input_tokens_seen": 15544064,
"step": 960
},
{
"epoch": 0.7723089235694278,
"grad_norm": 0.4288977086544037,
"learning_rate": 5.070485257053547e-05,
"loss": 0.0385,
"num_input_tokens_seen": 15625984,
"step": 965
},
{
"epoch": 0.7763105242096838,
"grad_norm": 0.3563046157360077,
"learning_rate": 5.0613660908768303e-05,
"loss": 0.0404,
"num_input_tokens_seen": 15707776,
"step": 970
},
{
"epoch": 0.78031212484994,
"grad_norm": 0.7557060718536377,
"learning_rate": 5.0522106981598603e-05,
"loss": 0.0369,
"num_input_tokens_seen": 15790464,
"step": 975
},
{
"epoch": 0.7843137254901961,
"grad_norm": 0.5181427597999573,
"learning_rate": 5.043019239799921e-05,
"loss": 0.0325,
"num_input_tokens_seen": 15870208,
"step": 980
},
{
"epoch": 0.7883153261304522,
"grad_norm": 0.21826592087745667,
"learning_rate": 5.033791877328113e-05,
"loss": 0.0425,
"num_input_tokens_seen": 15952256,
"step": 985
},
{
"epoch": 0.7923169267707083,
"grad_norm": 0.37385791540145874,
"learning_rate": 5.024528772906519e-05,
"loss": 0.0327,
"num_input_tokens_seen": 16036480,
"step": 990
},
{
"epoch": 0.7963185274109644,
"grad_norm": 0.35321009159088135,
"learning_rate": 5.0152300893253534e-05,
"loss": 0.0304,
"num_input_tokens_seen": 16118272,
"step": 995
},
{
"epoch": 0.8003201280512204,
"grad_norm": 0.5509520769119263,
"learning_rate": 5.0058959900000964e-05,
"loss": 0.0446,
"num_input_tokens_seen": 16194944,
"step": 1000
},
{
"epoch": 0.8043217286914766,
"grad_norm": 0.47588232159614563,
"learning_rate": 4.996526638968631e-05,
"loss": 0.0392,
"num_input_tokens_seen": 16278784,
"step": 1005
},
{
"epoch": 0.8083233293317327,
"grad_norm": 0.3435021638870239,
"learning_rate": 4.9871222008883524e-05,
"loss": 0.0347,
"num_input_tokens_seen": 16362752,
"step": 1010
},
{
"epoch": 0.8123249299719888,
"grad_norm": 0.3519127666950226,
"learning_rate": 4.977682841033278e-05,
"loss": 0.0443,
"num_input_tokens_seen": 16440192,
"step": 1015
},
{
"epoch": 0.8163265306122449,
"grad_norm": 0.2507665753364563,
"learning_rate": 4.968208725291141e-05,
"loss": 0.0377,
"num_input_tokens_seen": 16519168,
"step": 1020
},
{
"epoch": 0.820328131252501,
"grad_norm": 0.23654384911060333,
"learning_rate": 4.9587000201604776e-05,
"loss": 0.038,
"num_input_tokens_seen": 16600832,
"step": 1025
},
{
"epoch": 0.8243297318927572,
"grad_norm": 0.48912209272384644,
"learning_rate": 4.949156892747698e-05,
"loss": 0.0397,
"num_input_tokens_seen": 16680576,
"step": 1030
},
{
"epoch": 0.8283313325330132,
"grad_norm": 0.42017558217048645,
"learning_rate": 4.939579510764153e-05,
"loss": 0.0372,
"num_input_tokens_seen": 16763136,
"step": 1035
},
{
"epoch": 0.8323329331732693,
"grad_norm": 0.30439770221710205,
"learning_rate": 4.929968042523183e-05,
"loss": 0.0333,
"num_input_tokens_seen": 16840320,
"step": 1040
},
{
"epoch": 0.8363345338135254,
"grad_norm": 0.30717232823371887,
"learning_rate": 4.920322656937163e-05,
"loss": 0.0407,
"num_input_tokens_seen": 16918912,
"step": 1045
},
{
"epoch": 0.8403361344537815,
"grad_norm": 0.47993338108062744,
"learning_rate": 4.9106435235145315e-05,
"loss": 0.0526,
"num_input_tokens_seen": 17004160,
"step": 1050
},
{
"epoch": 0.8443377350940376,
"grad_norm": 0.4285404086112976,
"learning_rate": 4.900930812356815e-05,
"loss": 0.0331,
"num_input_tokens_seen": 17087360,
"step": 1055
},
{
"epoch": 0.8483393357342938,
"grad_norm": 0.45389819145202637,
"learning_rate": 4.891184694155634e-05,
"loss": 0.0465,
"num_input_tokens_seen": 17168512,
"step": 1060
},
{
"epoch": 0.8523409363745498,
"grad_norm": 0.30383792519569397,
"learning_rate": 4.88140534018971e-05,
"loss": 0.0313,
"num_input_tokens_seen": 17254272,
"step": 1065
},
{
"epoch": 0.8563425370148059,
"grad_norm": 0.364979088306427,
"learning_rate": 4.871592922321846e-05,
"loss": 0.0298,
"num_input_tokens_seen": 17335168,
"step": 1070
},
{
"epoch": 0.860344137655062,
"grad_norm": 0.5087103843688965,
"learning_rate": 4.861747612995917e-05,
"loss": 0.0476,
"num_input_tokens_seen": 17422848,
"step": 1075
},
{
"epoch": 0.8643457382953181,
"grad_norm": 0.45347684621810913,
"learning_rate": 4.851869585233829e-05,
"loss": 0.0293,
"num_input_tokens_seen": 17510656,
"step": 1080
},
{
"epoch": 0.8683473389355743,
"grad_norm": 0.4432317614555359,
"learning_rate": 4.8419590126324866e-05,
"loss": 0.0351,
"num_input_tokens_seen": 17600128,
"step": 1085
},
{
"epoch": 0.8723489395758304,
"grad_norm": 0.5003061294555664,
"learning_rate": 4.8320160693607365e-05,
"loss": 0.0424,
"num_input_tokens_seen": 17677952,
"step": 1090
},
{
"epoch": 0.8763505402160864,
"grad_norm": 0.3677990436553955,
"learning_rate": 4.822040930156312e-05,
"loss": 0.0276,
"num_input_tokens_seen": 17757056,
"step": 1095
},
{
"epoch": 0.8803521408563425,
"grad_norm": 0.308998703956604,
"learning_rate": 4.8120337703227565e-05,
"loss": 0.0378,
"num_input_tokens_seen": 17840512,
"step": 1100
},
{
"epoch": 0.8843537414965986,
"grad_norm": 0.22052662074565887,
"learning_rate": 4.801994765726347e-05,
"loss": 0.026,
"num_input_tokens_seen": 17921024,
"step": 1105
},
{
"epoch": 0.8883553421368547,
"grad_norm": 0.4048158824443817,
"learning_rate": 4.791924092793e-05,
"loss": 0.0462,
"num_input_tokens_seen": 18005248,
"step": 1110
},
{
"epoch": 0.8923569427771109,
"grad_norm": 0.30677714943885803,
"learning_rate": 4.781821928505175e-05,
"loss": 0.0388,
"num_input_tokens_seen": 18087296,
"step": 1115
},
{
"epoch": 0.896358543417367,
"grad_norm": 0.4117492139339447,
"learning_rate": 4.771688450398759e-05,
"loss": 0.0426,
"num_input_tokens_seen": 18171520,
"step": 1120
},
{
"epoch": 0.9003601440576231,
"grad_norm": 0.40267014503479004,
"learning_rate": 4.761523836559954e-05,
"loss": 0.0358,
"num_input_tokens_seen": 18251008,
"step": 1125
},
{
"epoch": 0.9043617446978791,
"grad_norm": 0.3577946126461029,
"learning_rate": 4.751328265622138e-05,
"loss": 0.035,
"num_input_tokens_seen": 18328960,
"step": 1130
},
{
"epoch": 0.9083633453381352,
"grad_norm": 0.30285748839378357,
"learning_rate": 4.741101916762735e-05,
"loss": 0.0338,
"num_input_tokens_seen": 18410624,
"step": 1135
},
{
"epoch": 0.9123649459783914,
"grad_norm": 0.2725851833820343,
"learning_rate": 4.730844969700056e-05,
"loss": 0.0432,
"num_input_tokens_seen": 18489472,
"step": 1140
},
{
"epoch": 0.9163665466186475,
"grad_norm": 0.3936793804168701,
"learning_rate": 4.7205576046901504e-05,
"loss": 0.0356,
"num_input_tokens_seen": 18570880,
"step": 1145
},
{
"epoch": 0.9203681472589036,
"grad_norm": 0.3990076780319214,
"learning_rate": 4.7102400025236335e-05,
"loss": 0.0388,
"num_input_tokens_seen": 18655488,
"step": 1150
},
{
"epoch": 0.9243697478991597,
"grad_norm": 0.7346508502960205,
"learning_rate": 4.699892344522508e-05,
"loss": 0.0444,
"num_input_tokens_seen": 18738816,
"step": 1155
},
{
"epoch": 0.9283713485394157,
"grad_norm": 0.4821522831916809,
"learning_rate": 4.689514812536982e-05,
"loss": 0.0396,
"num_input_tokens_seen": 18823040,
"step": 1160
},
{
"epoch": 0.9323729491796718,
"grad_norm": 0.41954606771469116,
"learning_rate": 4.6791075889422675e-05,
"loss": 0.0367,
"num_input_tokens_seen": 18904064,
"step": 1165
},
{
"epoch": 0.936374549819928,
"grad_norm": 0.24100787937641144,
"learning_rate": 4.668670856635379e-05,
"loss": 0.0316,
"num_input_tokens_seen": 18983040,
"step": 1170
},
{
"epoch": 0.9403761504601841,
"grad_norm": 0.3916667103767395,
"learning_rate": 4.65820479903192e-05,
"loss": 0.0511,
"num_input_tokens_seen": 19060736,
"step": 1175
},
{
"epoch": 0.9443777511004402,
"grad_norm": 0.3159593641757965,
"learning_rate": 4.647709600062856e-05,
"loss": 0.0243,
"num_input_tokens_seen": 19143168,
"step": 1180
},
{
"epoch": 0.9483793517406963,
"grad_norm": 0.3934178650379181,
"learning_rate": 4.637185444171284e-05,
"loss": 0.0458,
"num_input_tokens_seen": 19226624,
"step": 1185
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.22096635401248932,
"learning_rate": 4.626632516309194e-05,
"loss": 0.037,
"num_input_tokens_seen": 19307136,
"step": 1190
},
{
"epoch": 0.9563825530212084,
"grad_norm": 0.3833428919315338,
"learning_rate": 4.616051001934214e-05,
"loss": 0.0362,
"num_input_tokens_seen": 19387264,
"step": 1195
},
{
"epoch": 0.9603841536614646,
"grad_norm": 0.2484057992696762,
"learning_rate": 4.605441087006353e-05,
"loss": 0.0426,
"num_input_tokens_seen": 19469056,
"step": 1200
},
{
"epoch": 0.9643857543017207,
"grad_norm": 0.4272732734680176,
"learning_rate": 4.594802957984731e-05,
"loss": 0.0361,
"num_input_tokens_seen": 19551232,
"step": 1205
},
{
"epoch": 0.9683873549419768,
"grad_norm": 0.5605232119560242,
"learning_rate": 4.584136801824305e-05,
"loss": 0.0428,
"num_input_tokens_seen": 19628928,
"step": 1210
},
{
"epoch": 0.9723889555822329,
"grad_norm": 0.4456954300403595,
"learning_rate": 4.573442805972584e-05,
"loss": 0.0347,
"num_input_tokens_seen": 19710208,
"step": 1215
},
{
"epoch": 0.976390556222489,
"grad_norm": 0.5143166780471802,
"learning_rate": 4.562721158366332e-05,
"loss": 0.044,
"num_input_tokens_seen": 19792640,
"step": 1220
},
{
"epoch": 0.9803921568627451,
"grad_norm": 0.3441978394985199,
"learning_rate": 4.5519720474282626e-05,
"loss": 0.0419,
"num_input_tokens_seen": 19867904,
"step": 1225
},
{
"epoch": 0.9843937575030012,
"grad_norm": 0.3597588837146759,
"learning_rate": 4.541195662063735e-05,
"loss": 0.0543,
"num_input_tokens_seen": 19952384,
"step": 1230
},
{
"epoch": 0.9883953581432573,
"grad_norm": 0.2995705306529999,
"learning_rate": 4.530392191657432e-05,
"loss": 0.0344,
"num_input_tokens_seen": 20033408,
"step": 1235
},
{
"epoch": 0.9923969587835134,
"grad_norm": 0.24429191648960114,
"learning_rate": 4.519561826070025e-05,
"loss": 0.0287,
"num_input_tokens_seen": 20113664,
"step": 1240
},
{
"epoch": 0.9963985594237695,
"grad_norm": 0.3314052224159241,
"learning_rate": 4.508704755634846e-05,
"loss": 0.0358,
"num_input_tokens_seen": 20198016,
"step": 1245
},
{
"epoch": 1.0004001600640255,
"grad_norm": 0.2849452495574951,
"learning_rate": 4.4978211711545385e-05,
"loss": 0.0283,
"num_input_tokens_seen": 20277440,
"step": 1250
},
{
"epoch": 1.0044017607042817,
"grad_norm": 0.226671501994133,
"learning_rate": 4.486911263897706e-05,
"loss": 0.0276,
"num_input_tokens_seen": 20353472,
"step": 1255
},
{
"epoch": 1.0084033613445378,
"grad_norm": 0.42410531640052795,
"learning_rate": 4.475975225595546e-05,
"loss": 0.036,
"num_input_tokens_seen": 20440896,
"step": 1260
},
{
"epoch": 1.012404961984794,
"grad_norm": 0.5222881436347961,
"learning_rate": 4.4650132484384894e-05,
"loss": 0.0332,
"num_input_tokens_seen": 20526272,
"step": 1265
},
{
"epoch": 1.01640656262505,
"grad_norm": 0.3549119234085083,
"learning_rate": 4.454025525072813e-05,
"loss": 0.0349,
"num_input_tokens_seen": 20607936,
"step": 1270
},
{
"epoch": 1.0204081632653061,
"grad_norm": 0.3374067544937134,
"learning_rate": 4.4430122485972624e-05,
"loss": 0.0364,
"num_input_tokens_seen": 20690496,
"step": 1275
},
{
"epoch": 1.0244097639055623,
"grad_norm": 0.3464716970920563,
"learning_rate": 4.431973612559651e-05,
"loss": 0.0264,
"num_input_tokens_seen": 20766016,
"step": 1280
},
{
"epoch": 1.0284113645458184,
"grad_norm": 0.37270525097846985,
"learning_rate": 4.4209098109534666e-05,
"loss": 0.0283,
"num_input_tokens_seen": 20849728,
"step": 1285
},
{
"epoch": 1.0324129651860745,
"grad_norm": 0.3691583573818207,
"learning_rate": 4.4098210382144536e-05,
"loss": 0.0294,
"num_input_tokens_seen": 20926016,
"step": 1290
},
{
"epoch": 1.0364145658263306,
"grad_norm": 0.37765875458717346,
"learning_rate": 4.398707489217204e-05,
"loss": 0.0272,
"num_input_tokens_seen": 21000640,
"step": 1295
},
{
"epoch": 1.0404161664665867,
"grad_norm": 0.3407980501651764,
"learning_rate": 4.387569359271724e-05,
"loss": 0.04,
"num_input_tokens_seen": 21084352,
"step": 1300
},
{
"epoch": 1.0444177671068426,
"grad_norm": 0.4263424873352051,
"learning_rate": 4.376406844120011e-05,
"loss": 0.0392,
"num_input_tokens_seen": 21164480,
"step": 1305
},
{
"epoch": 1.0484193677470988,
"grad_norm": 0.4751840829849243,
"learning_rate": 4.3652201399326085e-05,
"loss": 0.0337,
"num_input_tokens_seen": 21249984,
"step": 1310
},
{
"epoch": 1.0524209683873549,
"grad_norm": 0.3867509663105011,
"learning_rate": 4.3540094433051575e-05,
"loss": 0.0395,
"num_input_tokens_seen": 21333440,
"step": 1315
},
{
"epoch": 1.056422569027611,
"grad_norm": 0.3750215768814087,
"learning_rate": 4.342774951254944e-05,
"loss": 0.029,
"num_input_tokens_seen": 21416896,
"step": 1320
},
{
"epoch": 1.0604241696678671,
"grad_norm": 0.4671512246131897,
"learning_rate": 4.3315168612174354e-05,
"loss": 0.0294,
"num_input_tokens_seen": 21496384,
"step": 1325
},
{
"epoch": 1.0644257703081232,
"grad_norm": 0.2680222988128662,
"learning_rate": 4.3202353710428125e-05,
"loss": 0.0333,
"num_input_tokens_seen": 21581248,
"step": 1330
},
{
"epoch": 1.0684273709483794,
"grad_norm": 0.45219457149505615,
"learning_rate": 4.308930678992489e-05,
"loss": 0.0265,
"num_input_tokens_seen": 21660480,
"step": 1335
},
{
"epoch": 1.0724289715886355,
"grad_norm": 0.4224177598953247,
"learning_rate": 4.2976029837356323e-05,
"loss": 0.029,
"num_input_tokens_seen": 21738048,
"step": 1340
},
{
"epoch": 1.0764305722288916,
"grad_norm": 0.2950039505958557,
"learning_rate": 4.2862524843456656e-05,
"loss": 0.0362,
"num_input_tokens_seen": 21818688,
"step": 1345
},
{
"epoch": 1.0804321728691477,
"grad_norm": 0.5623427033424377,
"learning_rate": 4.274879380296777e-05,
"loss": 0.0303,
"num_input_tokens_seen": 21899968,
"step": 1350
},
{
"epoch": 1.0844337735094038,
"grad_norm": 0.6680567860603333,
"learning_rate": 4.263483871460406e-05,
"loss": 0.0313,
"num_input_tokens_seen": 21976000,
"step": 1355
},
{
"epoch": 1.08843537414966,
"grad_norm": 0.4237207770347595,
"learning_rate": 4.2520661581017386e-05,
"loss": 0.0377,
"num_input_tokens_seen": 22059840,
"step": 1360
},
{
"epoch": 1.092436974789916,
"grad_norm": 0.31445175409317017,
"learning_rate": 4.2406264408761786e-05,
"loss": 0.0381,
"num_input_tokens_seen": 22144448,
"step": 1365
},
{
"epoch": 1.096438575430172,
"grad_norm": 0.47118425369262695,
"learning_rate": 4.2291649208258345e-05,
"loss": 0.0319,
"num_input_tokens_seen": 22217792,
"step": 1370
},
{
"epoch": 1.100440176070428,
"grad_norm": 0.41819679737091064,
"learning_rate": 4.217681799375972e-05,
"loss": 0.0359,
"num_input_tokens_seen": 22294976,
"step": 1375
},
{
"epoch": 1.1044417767106842,
"grad_norm": 0.28464624285697937,
"learning_rate": 4.206177278331484e-05,
"loss": 0.0201,
"num_input_tokens_seen": 22373696,
"step": 1380
},
{
"epoch": 1.1084433773509403,
"grad_norm": 0.3291350305080414,
"learning_rate": 4.194651559873339e-05,
"loss": 0.0326,
"num_input_tokens_seen": 22454080,
"step": 1385
},
{
"epoch": 1.1124449779911965,
"grad_norm": 0.43631651997566223,
"learning_rate": 4.1831048465550305e-05,
"loss": 0.0327,
"num_input_tokens_seen": 22536000,
"step": 1390
},
{
"epoch": 1.1164465786314526,
"grad_norm": 0.4160660207271576,
"learning_rate": 4.1715373412990195e-05,
"loss": 0.0305,
"num_input_tokens_seen": 22612416,
"step": 1395
},
{
"epoch": 1.1204481792717087,
"grad_norm": 0.3205015957355499,
"learning_rate": 4.1599492473931595e-05,
"loss": 0.0274,
"num_input_tokens_seen": 22692032,
"step": 1400
},
{
"epoch": 1.1244497799119648,
"grad_norm": 0.4284726083278656,
"learning_rate": 4.148340768487135e-05,
"loss": 0.0301,
"num_input_tokens_seen": 22773056,
"step": 1405
},
{
"epoch": 1.128451380552221,
"grad_norm": 0.48712974786758423,
"learning_rate": 4.1367121085888765e-05,
"loss": 0.0238,
"num_input_tokens_seen": 22855872,
"step": 1410
},
{
"epoch": 1.132452981192477,
"grad_norm": 0.3432983160018921,
"learning_rate": 4.125063472060974e-05,
"loss": 0.0294,
"num_input_tokens_seen": 22942528,
"step": 1415
},
{
"epoch": 1.1364545818327332,
"grad_norm": 0.3390027582645416,
"learning_rate": 4.1133950636170884e-05,
"loss": 0.0344,
"num_input_tokens_seen": 23021248,
"step": 1420
},
{
"epoch": 1.140456182472989,
"grad_norm": 0.33409103751182556,
"learning_rate": 4.101707088318354e-05,
"loss": 0.0282,
"num_input_tokens_seen": 23100736,
"step": 1425
},
{
"epoch": 1.1444577831132454,
"grad_norm": 0.40322190523147583,
"learning_rate": 4.0899997515697744e-05,
"loss": 0.0287,
"num_input_tokens_seen": 23185344,
"step": 1430
},
{
"epoch": 1.1484593837535013,
"grad_norm": 0.18723510205745697,
"learning_rate": 4.078273259116612e-05,
"loss": 0.0256,
"num_input_tokens_seen": 23268544,
"step": 1435
},
{
"epoch": 1.1524609843937574,
"grad_norm": 0.22620059549808502,
"learning_rate": 4.066527817040769e-05,
"loss": 0.0262,
"num_input_tokens_seen": 23342400,
"step": 1440
},
{
"epoch": 1.1564625850340136,
"grad_norm": 0.3559609055519104,
"learning_rate": 4.054763631757176e-05,
"loss": 0.0314,
"num_input_tokens_seen": 23422784,
"step": 1445
},
{
"epoch": 1.1604641856742697,
"grad_norm": 0.41867971420288086,
"learning_rate": 4.042980910010149e-05,
"loss": 0.0278,
"num_input_tokens_seen": 23499456,
"step": 1450
},
{
"epoch": 1.1644657863145258,
"grad_norm": 0.31694212555885315,
"learning_rate": 4.031179858869773e-05,
"loss": 0.0302,
"num_input_tokens_seen": 23581376,
"step": 1455
},
{
"epoch": 1.168467386954782,
"grad_norm": 0.5166463255882263,
"learning_rate": 4.019360685728247e-05,
"loss": 0.0281,
"num_input_tokens_seen": 23663040,
"step": 1460
},
{
"epoch": 1.172468987595038,
"grad_norm": 0.193945050239563,
"learning_rate": 4.007523598296253e-05,
"loss": 0.0241,
"num_input_tokens_seen": 23750720,
"step": 1465
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.4700028598308563,
"learning_rate": 3.995668804599298e-05,
"loss": 0.0377,
"num_input_tokens_seen": 23831488,
"step": 1470
},
{
"epoch": 1.1804721888755503,
"grad_norm": 0.39450663328170776,
"learning_rate": 3.983796512974057e-05,
"loss": 0.0384,
"num_input_tokens_seen": 23916608,
"step": 1475
},
{
"epoch": 1.1844737895158064,
"grad_norm": 0.39255639910697937,
"learning_rate": 3.971906932064716e-05,
"loss": 0.0329,
"num_input_tokens_seen": 24000448,
"step": 1480
},
{
"epoch": 1.1884753901560625,
"grad_norm": 0.3853246569633484,
"learning_rate": 3.9600002708193045e-05,
"loss": 0.0325,
"num_input_tokens_seen": 24081216,
"step": 1485
},
{
"epoch": 1.1924769907963184,
"grad_norm": 0.36767104268074036,
"learning_rate": 3.948076738486022e-05,
"loss": 0.0337,
"num_input_tokens_seen": 24161856,
"step": 1490
},
{
"epoch": 1.1964785914365745,
"grad_norm": 0.3445225954055786,
"learning_rate": 3.936136544609562e-05,
"loss": 0.0308,
"num_input_tokens_seen": 24242112,
"step": 1495
},
{
"epoch": 1.2004801920768307,
"grad_norm": 0.45754241943359375,
"learning_rate": 3.924179899027426e-05,
"loss": 0.0258,
"num_input_tokens_seen": 24317376,
"step": 1500
},
{
"epoch": 1.2044817927170868,
"grad_norm": 0.2786293625831604,
"learning_rate": 3.912207011866241e-05,
"loss": 0.0319,
"num_input_tokens_seen": 24396224,
"step": 1505
},
{
"epoch": 1.208483393357343,
"grad_norm": 0.5298479795455933,
"learning_rate": 3.9002180935380655e-05,
"loss": 0.0294,
"num_input_tokens_seen": 24477504,
"step": 1510
},
{
"epoch": 1.212484993997599,
"grad_norm": 0.3082476556301117,
"learning_rate": 3.888213354736686e-05,
"loss": 0.0309,
"num_input_tokens_seen": 24552768,
"step": 1515
},
{
"epoch": 1.2164865946378551,
"grad_norm": 0.3240519165992737,
"learning_rate": 3.876193006433923e-05,
"loss": 0.0244,
"num_input_tokens_seen": 24638400,
"step": 1520
},
{
"epoch": 1.2204881952781113,
"grad_norm": 0.5316782593727112,
"learning_rate": 3.864157259875916e-05,
"loss": 0.0432,
"num_input_tokens_seen": 24719936,
"step": 1525
},
{
"epoch": 1.2244897959183674,
"grad_norm": 0.21728059649467468,
"learning_rate": 3.8521063265794173e-05,
"loss": 0.0291,
"num_input_tokens_seen": 24796352,
"step": 1530
},
{
"epoch": 1.2284913965586235,
"grad_norm": 0.27219390869140625,
"learning_rate": 3.840040418328068e-05,
"loss": 0.0308,
"num_input_tokens_seen": 24878016,
"step": 1535
},
{
"epoch": 1.2324929971988796,
"grad_norm": 0.3836307227611542,
"learning_rate": 3.8279597471686835e-05,
"loss": 0.0254,
"num_input_tokens_seen": 24955840,
"step": 1540
},
{
"epoch": 1.2364945978391357,
"grad_norm": 0.21776366233825684,
"learning_rate": 3.815864525407519e-05,
"loss": 0.026,
"num_input_tokens_seen": 25032384,
"step": 1545
},
{
"epoch": 1.2404961984793919,
"grad_norm": 0.36937835812568665,
"learning_rate": 3.803754965606547e-05,
"loss": 0.0277,
"num_input_tokens_seen": 25114432,
"step": 1550
},
{
"epoch": 1.2444977991196478,
"grad_norm": 0.42242029309272766,
"learning_rate": 3.791631280579714e-05,
"loss": 0.029,
"num_input_tokens_seen": 25191360,
"step": 1555
},
{
"epoch": 1.2484993997599039,
"grad_norm": 0.46975576877593994,
"learning_rate": 3.779493683389206e-05,
"loss": 0.0302,
"num_input_tokens_seen": 25271488,
"step": 1560
},
{
"epoch": 1.25250100040016,
"grad_norm": 0.3445466458797455,
"learning_rate": 3.767342387341701e-05,
"loss": 0.0346,
"num_input_tokens_seen": 25359040,
"step": 1565
},
{
"epoch": 1.2565026010404161,
"grad_norm": 0.2749403417110443,
"learning_rate": 3.75517760598462e-05,
"loss": 0.0374,
"num_input_tokens_seen": 25439424,
"step": 1570
},
{
"epoch": 1.2605042016806722,
"grad_norm": 0.29519298672676086,
"learning_rate": 3.742999553102378e-05,
"loss": 0.0341,
"num_input_tokens_seen": 25519040,
"step": 1575
},
{
"epoch": 1.2645058023209284,
"grad_norm": 0.4132818281650543,
"learning_rate": 3.730808442712623e-05,
"loss": 0.0265,
"num_input_tokens_seen": 25597504,
"step": 1580
},
{
"epoch": 1.2685074029611845,
"grad_norm": 0.36612850427627563,
"learning_rate": 3.718604489062477e-05,
"loss": 0.04,
"num_input_tokens_seen": 25678528,
"step": 1585
},
{
"epoch": 1.2725090036014406,
"grad_norm": 0.3779323399066925,
"learning_rate": 3.70638790662477e-05,
"loss": 0.0243,
"num_input_tokens_seen": 25761984,
"step": 1590
},
{
"epoch": 1.2765106042416967,
"grad_norm": 0.32407495379447937,
"learning_rate": 3.6941589100942673e-05,
"loss": 0.0306,
"num_input_tokens_seen": 25839936,
"step": 1595
},
{
"epoch": 1.2805122048819528,
"grad_norm": 0.43514150381088257,
"learning_rate": 3.681917714383907e-05,
"loss": 0.0297,
"num_input_tokens_seen": 25922496,
"step": 1600
},
{
"epoch": 1.284513805522209,
"grad_norm": 0.38025614619255066,
"learning_rate": 3.669664534621011e-05,
"loss": 0.0281,
"num_input_tokens_seen": 25999424,
"step": 1605
},
{
"epoch": 1.2885154061624648,
"grad_norm": 0.28356775641441345,
"learning_rate": 3.657399586143508e-05,
"loss": 0.0394,
"num_input_tokens_seen": 26079296,
"step": 1610
},
{
"epoch": 1.2925170068027212,
"grad_norm": 0.4588053226470947,
"learning_rate": 3.645123084496157e-05,
"loss": 0.0241,
"num_input_tokens_seen": 26159936,
"step": 1615
},
{
"epoch": 1.296518607442977,
"grad_norm": 0.40065518021583557,
"learning_rate": 3.6328352454267474e-05,
"loss": 0.0377,
"num_input_tokens_seen": 26238656,
"step": 1620
},
{
"epoch": 1.3005202080832334,
"grad_norm": 0.18540312349796295,
"learning_rate": 3.620536284882316e-05,
"loss": 0.0215,
"num_input_tokens_seen": 26319296,
"step": 1625
},
{
"epoch": 1.3045218087234893,
"grad_norm": 0.2893196642398834,
"learning_rate": 3.608226419005347e-05,
"loss": 0.0352,
"num_input_tokens_seen": 26401216,
"step": 1630
},
{
"epoch": 1.3085234093637454,
"grad_norm": 0.2363426834344864,
"learning_rate": 3.595905864129976e-05,
"loss": 0.0175,
"num_input_tokens_seen": 26486080,
"step": 1635
},
{
"epoch": 1.3125250100040016,
"grad_norm": 0.37354713678359985,
"learning_rate": 3.583574836778187e-05,
"loss": 0.0235,
"num_input_tokens_seen": 26561600,
"step": 1640
},
{
"epoch": 1.3165266106442577,
"grad_norm": 0.32323962450027466,
"learning_rate": 3.5712335536560104e-05,
"loss": 0.0294,
"num_input_tokens_seen": 26648512,
"step": 1645
},
{
"epoch": 1.3205282112845138,
"grad_norm": 0.612480103969574,
"learning_rate": 3.558882231649708e-05,
"loss": 0.0269,
"num_input_tokens_seen": 26729792,
"step": 1650
},
{
"epoch": 1.32452981192477,
"grad_norm": 0.2292277216911316,
"learning_rate": 3.546521087821969e-05,
"loss": 0.0273,
"num_input_tokens_seen": 26816704,
"step": 1655
},
{
"epoch": 1.328531412565026,
"grad_norm": 0.3623555302619934,
"learning_rate": 3.5341503394080895e-05,
"loss": 0.0208,
"num_input_tokens_seen": 26900416,
"step": 1660
},
{
"epoch": 1.3325330132052822,
"grad_norm": 0.27913469076156616,
"learning_rate": 3.521770203812158e-05,
"loss": 0.0341,
"num_input_tokens_seen": 26979136,
"step": 1665
},
{
"epoch": 1.3365346138455383,
"grad_norm": 0.5891963243484497,
"learning_rate": 3.5093808986032316e-05,
"loss": 0.0381,
"num_input_tokens_seen": 27062848,
"step": 1670
},
{
"epoch": 1.3405362144857942,
"grad_norm": 0.3284565508365631,
"learning_rate": 3.496982641511518e-05,
"loss": 0.0254,
"num_input_tokens_seen": 27142848,
"step": 1675
},
{
"epoch": 1.3445378151260505,
"grad_norm": 0.4769067168235779,
"learning_rate": 3.4845756504245446e-05,
"loss": 0.0409,
"num_input_tokens_seen": 27218624,
"step": 1680
},
{
"epoch": 1.3485394157663064,
"grad_norm": 0.3174830377101898,
"learning_rate": 3.472160143383329e-05,
"loss": 0.0234,
"num_input_tokens_seen": 27301056,
"step": 1685
},
{
"epoch": 1.3525410164065625,
"grad_norm": 0.27017271518707275,
"learning_rate": 3.45973633857855e-05,
"loss": 0.0286,
"num_input_tokens_seen": 27377344,
"step": 1690
},
{
"epoch": 1.3565426170468187,
"grad_norm": 0.3403165936470032,
"learning_rate": 3.447304454346711e-05,
"loss": 0.0294,
"num_input_tokens_seen": 27456192,
"step": 1695
},
{
"epoch": 1.3605442176870748,
"grad_norm": 0.3293294310569763,
"learning_rate": 3.434864709166304e-05,
"loss": 0.0275,
"num_input_tokens_seen": 27536704,
"step": 1700
},
{
"epoch": 1.364545818327331,
"grad_norm": 0.2769692540168762,
"learning_rate": 3.422417321653968e-05,
"loss": 0.0307,
"num_input_tokens_seen": 27613504,
"step": 1705
},
{
"epoch": 1.368547418967587,
"grad_norm": 0.2703746259212494,
"learning_rate": 3.4099625105606526e-05,
"loss": 0.0259,
"num_input_tokens_seen": 27695424,
"step": 1710
},
{
"epoch": 1.3725490196078431,
"grad_norm": 0.3199769854545593,
"learning_rate": 3.3975004947677656e-05,
"loss": 0.0279,
"num_input_tokens_seen": 27776960,
"step": 1715
},
{
"epoch": 1.3765506202480993,
"grad_norm": 0.42263397574424744,
"learning_rate": 3.3850314932833334e-05,
"loss": 0.0326,
"num_input_tokens_seen": 27857728,
"step": 1720
},
{
"epoch": 1.3805522208883554,
"grad_norm": 0.2087048888206482,
"learning_rate": 3.372555725238146e-05,
"loss": 0.0325,
"num_input_tokens_seen": 27940672,
"step": 1725
},
{
"epoch": 1.3845538215286115,
"grad_norm": 0.44893744587898254,
"learning_rate": 3.360073409881914e-05,
"loss": 0.0278,
"num_input_tokens_seen": 28023616,
"step": 1730
},
{
"epoch": 1.3885554221688676,
"grad_norm": 0.3230104446411133,
"learning_rate": 3.3475847665794044e-05,
"loss": 0.0329,
"num_input_tokens_seen": 28110400,
"step": 1735
},
{
"epoch": 1.3925570228091235,
"grad_norm": 0.3153367042541504,
"learning_rate": 3.3350900148065994e-05,
"loss": 0.0235,
"num_input_tokens_seen": 28194752,
"step": 1740
},
{
"epoch": 1.3965586234493799,
"grad_norm": 0.4666508436203003,
"learning_rate": 3.3225893741468245e-05,
"loss": 0.0309,
"num_input_tokens_seen": 28276800,
"step": 1745
},
{
"epoch": 1.4005602240896358,
"grad_norm": 0.22953271865844727,
"learning_rate": 3.310083064286903e-05,
"loss": 0.0295,
"num_input_tokens_seen": 28354880,
"step": 1750
},
{
"epoch": 1.4045618247298919,
"grad_norm": 0.37972113490104675,
"learning_rate": 3.297571305013283e-05,
"loss": 0.0319,
"num_input_tokens_seen": 28438208,
"step": 1755
},
{
"epoch": 1.408563425370148,
"grad_norm": 0.3072042167186737,
"learning_rate": 3.2850543162081866e-05,
"loss": 0.0361,
"num_input_tokens_seen": 28520768,
"step": 1760
},
{
"epoch": 1.4125650260104041,
"grad_norm": 0.42325839400291443,
"learning_rate": 3.2725323178457346e-05,
"loss": 0.0221,
"num_input_tokens_seen": 28598592,
"step": 1765
},
{
"epoch": 1.4165666266506602,
"grad_norm": 0.2999728322029114,
"learning_rate": 3.260005529988091e-05,
"loss": 0.0262,
"num_input_tokens_seen": 28680000,
"step": 1770
},
{
"epoch": 1.4205682272909164,
"grad_norm": 0.3948429226875305,
"learning_rate": 3.247474172781587e-05,
"loss": 0.026,
"num_input_tokens_seen": 28763712,
"step": 1775
},
{
"epoch": 1.4245698279311725,
"grad_norm": 0.2500371038913727,
"learning_rate": 3.234938466452857e-05,
"loss": 0.0291,
"num_input_tokens_seen": 28847552,
"step": 1780
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.5287266969680786,
"learning_rate": 3.222398631304967e-05,
"loss": 0.0397,
"num_input_tokens_seen": 28927168,
"step": 1785
},
{
"epoch": 1.4325730292116847,
"grad_norm": 0.3386266529560089,
"learning_rate": 3.2098548877135416e-05,
"loss": 0.0284,
"num_input_tokens_seen": 29007936,
"step": 1790
},
{
"epoch": 1.4365746298519408,
"grad_norm": 0.28368446230888367,
"learning_rate": 3.197307456122897e-05,
"loss": 0.0251,
"num_input_tokens_seen": 29087424,
"step": 1795
},
{
"epoch": 1.440576230492197,
"grad_norm": 0.36752742528915405,
"learning_rate": 3.1847565570421566e-05,
"loss": 0.0313,
"num_input_tokens_seen": 29163840,
"step": 1800
},
{
"epoch": 1.4445778311324529,
"grad_norm": 0.3912261426448822,
"learning_rate": 3.172202411041387e-05,
"loss": 0.0193,
"num_input_tokens_seen": 29245888,
"step": 1805
},
{
"epoch": 1.4485794317727092,
"grad_norm": 0.43721485137939453,
"learning_rate": 3.1596452387477116e-05,
"loss": 0.039,
"num_input_tokens_seen": 29324224,
"step": 1810
},
{
"epoch": 1.452581032412965,
"grad_norm": 0.18612438440322876,
"learning_rate": 3.1470852608414414e-05,
"loss": 0.0335,
"num_input_tokens_seen": 29401280,
"step": 1815
},
{
"epoch": 1.4565826330532212,
"grad_norm": 0.3837645351886749,
"learning_rate": 3.1345226980521915e-05,
"loss": 0.0282,
"num_input_tokens_seen": 29478336,
"step": 1820
},
{
"epoch": 1.4605842336934773,
"grad_norm": 0.5790018439292908,
"learning_rate": 3.121957771155005e-05,
"loss": 0.0296,
"num_input_tokens_seen": 29558464,
"step": 1825
},
{
"epoch": 1.4645858343337335,
"grad_norm": 0.3951474130153656,
"learning_rate": 3.109390700966472e-05,
"loss": 0.0228,
"num_input_tokens_seen": 29640128,
"step": 1830
},
{
"epoch": 1.4685874349739896,
"grad_norm": 0.4949131906032562,
"learning_rate": 3.096821708340847e-05,
"loss": 0.0325,
"num_input_tokens_seen": 29720384,
"step": 1835
},
{
"epoch": 1.4725890356142457,
"grad_norm": 0.3182584047317505,
"learning_rate": 3.0842510141661716e-05,
"loss": 0.0291,
"num_input_tokens_seen": 29805760,
"step": 1840
},
{
"epoch": 1.4765906362545018,
"grad_norm": 0.49185431003570557,
"learning_rate": 3.07167883936039e-05,
"loss": 0.0282,
"num_input_tokens_seen": 29891520,
"step": 1845
},
{
"epoch": 1.480592236894758,
"grad_norm": 0.28761714696884155,
"learning_rate": 3.059105404867467e-05,
"loss": 0.0245,
"num_input_tokens_seen": 29973824,
"step": 1850
},
{
"epoch": 1.484593837535014,
"grad_norm": 0.4598804712295532,
"learning_rate": 3.046530931653503e-05,
"loss": 0.043,
"num_input_tokens_seen": 30056128,
"step": 1855
},
{
"epoch": 1.4885954381752702,
"grad_norm": 0.20214757323265076,
"learning_rate": 3.0339556407028567e-05,
"loss": 0.0246,
"num_input_tokens_seen": 30139328,
"step": 1860
},
{
"epoch": 1.4925970388155263,
"grad_norm": 0.22631464898586273,
"learning_rate": 3.021379753014257e-05,
"loss": 0.0313,
"num_input_tokens_seen": 30222400,
"step": 1865
},
{
"epoch": 1.4965986394557822,
"grad_norm": 0.2742927670478821,
"learning_rate": 3.008803489596917e-05,
"loss": 0.0295,
"num_input_tokens_seen": 30304192,
"step": 1870
},
{
"epoch": 1.5006002400960385,
"grad_norm": 0.41969189047813416,
"learning_rate": 2.9962270714666557e-05,
"loss": 0.0268,
"num_input_tokens_seen": 30392640,
"step": 1875
},
{
"epoch": 1.5046018407362944,
"grad_norm": 0.3135251998901367,
"learning_rate": 2.9836507196420097e-05,
"loss": 0.03,
"num_input_tokens_seen": 30478272,
"step": 1880
},
{
"epoch": 1.5086034413765508,
"grad_norm": 0.4145873785018921,
"learning_rate": 2.9710746551403516e-05,
"loss": 0.0354,
"num_input_tokens_seen": 30559424,
"step": 1885
},
{
"epoch": 1.5126050420168067,
"grad_norm": 0.40321585536003113,
"learning_rate": 2.9584990989740026e-05,
"loss": 0.0319,
"num_input_tokens_seen": 30639168,
"step": 1890
},
{
"epoch": 1.5166066426570628,
"grad_norm": 0.27597400546073914,
"learning_rate": 2.945924272146352e-05,
"loss": 0.0247,
"num_input_tokens_seen": 30717248,
"step": 1895
},
{
"epoch": 1.520608243297319,
"grad_norm": 0.357263445854187,
"learning_rate": 2.933350395647971e-05,
"loss": 0.0255,
"num_input_tokens_seen": 30799808,
"step": 1900
},
{
"epoch": 1.524609843937575,
"grad_norm": 0.48072096705436707,
"learning_rate": 2.920777690452729e-05,
"loss": 0.0232,
"num_input_tokens_seen": 30882752,
"step": 1905
},
{
"epoch": 1.5286114445778312,
"grad_norm": 0.4655681550502777,
"learning_rate": 2.9082063775139148e-05,
"loss": 0.0418,
"num_input_tokens_seen": 30958016,
"step": 1910
},
{
"epoch": 1.5326130452180873,
"grad_norm": 0.5985519289970398,
"learning_rate": 2.8956366777603425e-05,
"loss": 0.0274,
"num_input_tokens_seen": 31043008,
"step": 1915
},
{
"epoch": 1.5366146458583434,
"grad_norm": 0.4796825647354126,
"learning_rate": 2.883068812092484e-05,
"loss": 0.0264,
"num_input_tokens_seen": 31125440,
"step": 1920
},
{
"epoch": 1.5406162464985993,
"grad_norm": 0.297181636095047,
"learning_rate": 2.8705030013785708e-05,
"loss": 0.0288,
"num_input_tokens_seen": 31205696,
"step": 1925
},
{
"epoch": 1.5446178471388556,
"grad_norm": 0.41795098781585693,
"learning_rate": 2.857939466450728e-05,
"loss": 0.024,
"num_input_tokens_seen": 31288000,
"step": 1930
},
{
"epoch": 1.5486194477791115,
"grad_norm": 0.2637263536453247,
"learning_rate": 2.8453784281010812e-05,
"loss": 0.0239,
"num_input_tokens_seen": 31365312,
"step": 1935
},
{
"epoch": 1.5526210484193679,
"grad_norm": 0.6169966459274292,
"learning_rate": 2.8328201070778826e-05,
"loss": 0.0322,
"num_input_tokens_seen": 31451200,
"step": 1940
},
{
"epoch": 1.5566226490596238,
"grad_norm": 0.49008283019065857,
"learning_rate": 2.8202647240816304e-05,
"loss": 0.0301,
"num_input_tokens_seen": 31523648,
"step": 1945
},
{
"epoch": 1.5606242496998801,
"grad_norm": 0.5162390470504761,
"learning_rate": 2.8077124997611883e-05,
"loss": 0.0331,
"num_input_tokens_seen": 31608128,
"step": 1950
},
{
"epoch": 1.564625850340136,
"grad_norm": 0.34995701909065247,
"learning_rate": 2.7951636547099113e-05,
"loss": 0.025,
"num_input_tokens_seen": 31681088,
"step": 1955
},
{
"epoch": 1.5686274509803921,
"grad_norm": 0.2745002210140228,
"learning_rate": 2.7826184094617647e-05,
"loss": 0.024,
"num_input_tokens_seen": 31759040,
"step": 1960
},
{
"epoch": 1.5726290516206483,
"grad_norm": 0.33980268239974976,
"learning_rate": 2.7700769844874514e-05,
"loss": 0.0334,
"num_input_tokens_seen": 31835840,
"step": 1965
},
{
"epoch": 1.5766306522609044,
"grad_norm": 0.47615453600883484,
"learning_rate": 2.7575396001905397e-05,
"loss": 0.0299,
"num_input_tokens_seen": 31921856,
"step": 1970
},
{
"epoch": 1.5806322529011605,
"grad_norm": 0.43815162777900696,
"learning_rate": 2.7450064769035817e-05,
"loss": 0.0282,
"num_input_tokens_seen": 32005568,
"step": 1975
},
{
"epoch": 1.5846338535414166,
"grad_norm": 0.3011634051799774,
"learning_rate": 2.7324778348842506e-05,
"loss": 0.0198,
"num_input_tokens_seen": 32083136,
"step": 1980
},
{
"epoch": 1.5886354541816727,
"grad_norm": 0.4009479880332947,
"learning_rate": 2.7199538943114625e-05,
"loss": 0.0249,
"num_input_tokens_seen": 32165952,
"step": 1985
},
{
"epoch": 1.5926370548219286,
"grad_norm": 0.36803168058395386,
"learning_rate": 2.707434875281513e-05,
"loss": 0.0381,
"num_input_tokens_seen": 32244928,
"step": 1990
},
{
"epoch": 1.596638655462185,
"grad_norm": 0.41040199995040894,
"learning_rate": 2.694920997804203e-05,
"loss": 0.0183,
"num_input_tokens_seen": 32323136,
"step": 1995
},
{
"epoch": 1.6006402561024409,
"grad_norm": 0.5134280920028687,
"learning_rate": 2.6824124817989775e-05,
"loss": 0.0329,
"num_input_tokens_seen": 32405696,
"step": 2000
},
{
"epoch": 1.6046418567426972,
"grad_norm": 0.44672590494155884,
"learning_rate": 2.669909547091061e-05,
"loss": 0.0193,
"num_input_tokens_seen": 32482624,
"step": 2005
},
{
"epoch": 1.6086434573829531,
"grad_norm": 0.33581846952438354,
"learning_rate": 2.6574124134075852e-05,
"loss": 0.038,
"num_input_tokens_seen": 32566080,
"step": 2010
},
{
"epoch": 1.6126450580232092,
"grad_norm": 0.28790298104286194,
"learning_rate": 2.6449213003737438e-05,
"loss": 0.0337,
"num_input_tokens_seen": 32648640,
"step": 2015
},
{
"epoch": 1.6166466586634654,
"grad_norm": 0.28296735882759094,
"learning_rate": 2.632436427508913e-05,
"loss": 0.0256,
"num_input_tokens_seen": 32732736,
"step": 2020
},
{
"epoch": 1.6206482593037215,
"grad_norm": 0.24952416121959686,
"learning_rate": 2.619958014222813e-05,
"loss": 0.0324,
"num_input_tokens_seen": 32809024,
"step": 2025
},
{
"epoch": 1.6246498599439776,
"grad_norm": 0.3753882944583893,
"learning_rate": 2.607486279811638e-05,
"loss": 0.0254,
"num_input_tokens_seen": 32893504,
"step": 2030
},
{
"epoch": 1.6286514605842337,
"grad_norm": 0.34843510389328003,
"learning_rate": 2.5950214434542084e-05,
"loss": 0.0242,
"num_input_tokens_seen": 32973376,
"step": 2035
},
{
"epoch": 1.6326530612244898,
"grad_norm": 0.383384644985199,
"learning_rate": 2.5825637242081186e-05,
"loss": 0.0272,
"num_input_tokens_seen": 33052352,
"step": 2040
},
{
"epoch": 1.636654661864746,
"grad_norm": 0.4276237189769745,
"learning_rate": 2.5701133410058855e-05,
"loss": 0.0167,
"num_input_tokens_seen": 33136448,
"step": 2045
},
{
"epoch": 1.640656262505002,
"grad_norm": 0.4827091693878174,
"learning_rate": 2.5576705126511034e-05,
"loss": 0.0245,
"num_input_tokens_seen": 33212992,
"step": 2050
},
{
"epoch": 1.644657863145258,
"grad_norm": 0.45727095007896423,
"learning_rate": 2.5452354578145948e-05,
"loss": 0.0242,
"num_input_tokens_seen": 33295040,
"step": 2055
},
{
"epoch": 1.6486594637855143,
"grad_norm": 0.3339729607105255,
"learning_rate": 2.5328083950305738e-05,
"loss": 0.0209,
"num_input_tokens_seen": 33374272,
"step": 2060
},
{
"epoch": 1.6526610644257702,
"grad_norm": 0.3127999007701874,
"learning_rate": 2.5203895426927998e-05,
"loss": 0.0451,
"num_input_tokens_seen": 33454400,
"step": 2065
},
{
"epoch": 1.6566626650660266,
"grad_norm": 0.44086745381355286,
"learning_rate": 2.5079791190507402e-05,
"loss": 0.0319,
"num_input_tokens_seen": 33535936,
"step": 2070
},
{
"epoch": 1.6606642657062824,
"grad_norm": 0.4107028841972351,
"learning_rate": 2.495577342205739e-05,
"loss": 0.03,
"num_input_tokens_seen": 33616704,
"step": 2075
},
{
"epoch": 1.6646658663465386,
"grad_norm": 0.31537938117980957,
"learning_rate": 2.4831844301071778e-05,
"loss": 0.0256,
"num_input_tokens_seen": 33692992,
"step": 2080
},
{
"epoch": 1.6686674669867947,
"grad_norm": 0.4656033217906952,
"learning_rate": 2.4708006005486515e-05,
"loss": 0.0289,
"num_input_tokens_seen": 33774912,
"step": 2085
},
{
"epoch": 1.6726690676270508,
"grad_norm": 0.4950501024723053,
"learning_rate": 2.458426071164136e-05,
"loss": 0.0262,
"num_input_tokens_seen": 33862464,
"step": 2090
},
{
"epoch": 1.676670668267307,
"grad_norm": 0.29400134086608887,
"learning_rate": 2.4460610594241658e-05,
"loss": 0.0248,
"num_input_tokens_seen": 33939392,
"step": 2095
},
{
"epoch": 1.680672268907563,
"grad_norm": 0.33914294838905334,
"learning_rate": 2.433705782632016e-05,
"loss": 0.0268,
"num_input_tokens_seen": 34017472,
"step": 2100
},
{
"epoch": 1.6846738695478192,
"grad_norm": 0.29552993178367615,
"learning_rate": 2.4213604579198713e-05,
"loss": 0.0269,
"num_input_tokens_seen": 34105408,
"step": 2105
},
{
"epoch": 1.688675470188075,
"grad_norm": 0.3073042631149292,
"learning_rate": 2.4090253022450266e-05,
"loss": 0.0249,
"num_input_tokens_seen": 34188224,
"step": 2110
},
{
"epoch": 1.6926770708283314,
"grad_norm": 0.42417111992836,
"learning_rate": 2.3967005323860577e-05,
"loss": 0.0253,
"num_input_tokens_seen": 34267840,
"step": 2115
},
{
"epoch": 1.6966786714685873,
"grad_norm": 0.23042023181915283,
"learning_rate": 2.3843863649390266e-05,
"loss": 0.0261,
"num_input_tokens_seen": 34343872,
"step": 2120
},
{
"epoch": 1.7006802721088436,
"grad_norm": 0.29375725984573364,
"learning_rate": 2.3720830163136645e-05,
"loss": 0.0315,
"num_input_tokens_seen": 34431680,
"step": 2125
},
{
"epoch": 1.7046818727490995,
"grad_norm": 0.27085232734680176,
"learning_rate": 2.3597907027295717e-05,
"loss": 0.0342,
"num_input_tokens_seen": 34517184,
"step": 2130
},
{
"epoch": 1.708683473389356,
"grad_norm": 0.45456305146217346,
"learning_rate": 2.34750964021242e-05,
"loss": 0.0215,
"num_input_tokens_seen": 34600384,
"step": 2135
},
{
"epoch": 1.7126850740296118,
"grad_norm": 0.5528407692909241,
"learning_rate": 2.335240044590153e-05,
"loss": 0.0351,
"num_input_tokens_seen": 34681792,
"step": 2140
},
{
"epoch": 1.716686674669868,
"grad_norm": 0.33706408739089966,
"learning_rate": 2.3229821314891955e-05,
"loss": 0.0267,
"num_input_tokens_seen": 34762304,
"step": 2145
},
{
"epoch": 1.720688275310124,
"grad_norm": 0.3062940835952759,
"learning_rate": 2.3107361163306622e-05,
"loss": 0.028,
"num_input_tokens_seen": 34843456,
"step": 2150
},
{
"epoch": 1.7246898759503801,
"grad_norm": 0.5358928442001343,
"learning_rate": 2.298502214326574e-05,
"loss": 0.0278,
"num_input_tokens_seen": 34923072,
"step": 2155
},
{
"epoch": 1.7286914765906363,
"grad_norm": 0.23751559853553772,
"learning_rate": 2.2862806404760752e-05,
"loss": 0.0253,
"num_input_tokens_seen": 35002304,
"step": 2160
},
{
"epoch": 1.7326930772308924,
"grad_norm": 0.30729296803474426,
"learning_rate": 2.2740716095616516e-05,
"loss": 0.0281,
"num_input_tokens_seen": 35079744,
"step": 2165
},
{
"epoch": 1.7366946778711485,
"grad_norm": 0.31603410840034485,
"learning_rate": 2.261875336145362e-05,
"loss": 0.0285,
"num_input_tokens_seen": 35157312,
"step": 2170
},
{
"epoch": 1.7406962785114044,
"grad_norm": 0.3044101297855377,
"learning_rate": 2.2496920345650625e-05,
"loss": 0.0304,
"num_input_tokens_seen": 35238208,
"step": 2175
},
{
"epoch": 1.7446978791516607,
"grad_norm": 0.4649178683757782,
"learning_rate": 2.2375219189306413e-05,
"loss": 0.0285,
"num_input_tokens_seen": 35312704,
"step": 2180
},
{
"epoch": 1.7486994797919166,
"grad_norm": 0.3896861970424652,
"learning_rate": 2.2253652031202605e-05,
"loss": 0.0294,
"num_input_tokens_seen": 35392704,
"step": 2185
},
{
"epoch": 1.752701080432173,
"grad_norm": 0.24077565968036652,
"learning_rate": 2.2132221007765854e-05,
"loss": 0.0207,
"num_input_tokens_seen": 35471552,
"step": 2190
},
{
"epoch": 1.7567026810724289,
"grad_norm": 0.4740449786186218,
"learning_rate": 2.2010928253030455e-05,
"loss": 0.0287,
"num_input_tokens_seen": 35558208,
"step": 2195
},
{
"epoch": 1.7607042817126852,
"grad_norm": 0.3645721673965454,
"learning_rate": 2.1889775898600696e-05,
"loss": 0.035,
"num_input_tokens_seen": 35631168,
"step": 2200
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.4311535358428955,
"learning_rate": 2.176876607361352e-05,
"loss": 0.0263,
"num_input_tokens_seen": 35708224,
"step": 2205
},
{
"epoch": 1.7687074829931972,
"grad_norm": 0.35250356793403625,
"learning_rate": 2.1647900904701007e-05,
"loss": 0.0279,
"num_input_tokens_seen": 35792064,
"step": 2210
},
{
"epoch": 1.7727090836334534,
"grad_norm": 0.47433730959892273,
"learning_rate": 2.152718251595307e-05,
"loss": 0.035,
"num_input_tokens_seen": 35874752,
"step": 2215
},
{
"epoch": 1.7767106842737095,
"grad_norm": 0.3649555444717407,
"learning_rate": 2.1406613028880105e-05,
"loss": 0.0266,
"num_input_tokens_seen": 35962304,
"step": 2220
},
{
"epoch": 1.7807122849139656,
"grad_norm": 0.48135465383529663,
"learning_rate": 2.1286194562375677e-05,
"loss": 0.0366,
"num_input_tokens_seen": 36042432,
"step": 2225
},
{
"epoch": 1.7847138855542217,
"grad_norm": 0.5308105945587158,
"learning_rate": 2.116592923267933e-05,
"loss": 0.037,
"num_input_tokens_seen": 36125504,
"step": 2230
},
{
"epoch": 1.7887154861944778,
"grad_norm": 0.28020626306533813,
"learning_rate": 2.1045819153339367e-05,
"loss": 0.0235,
"num_input_tokens_seen": 36206272,
"step": 2235
},
{
"epoch": 1.7927170868347337,
"grad_norm": 0.42110276222229004,
"learning_rate": 2.0925866435175712e-05,
"loss": 0.0313,
"num_input_tokens_seen": 36287680,
"step": 2240
},
{
"epoch": 1.79671868747499,
"grad_norm": 0.34481382369995117,
"learning_rate": 2.080607318624284e-05,
"loss": 0.0241,
"num_input_tokens_seen": 36378048,
"step": 2245
},
{
"epoch": 1.800720288115246,
"grad_norm": 0.2291119247674942,
"learning_rate": 2.0686441511792663e-05,
"loss": 0.0238,
"num_input_tokens_seen": 36459584,
"step": 2250
},
{
"epoch": 1.8047218887555023,
"grad_norm": 0.11932362616062164,
"learning_rate": 2.056697351423762e-05,
"loss": 0.0329,
"num_input_tokens_seen": 36544832,
"step": 2255
},
{
"epoch": 1.8087234893957582,
"grad_norm": 0.40667611360549927,
"learning_rate": 2.044767129311365e-05,
"loss": 0.0284,
"num_input_tokens_seen": 36627136,
"step": 2260
},
{
"epoch": 1.8127250900360146,
"grad_norm": 0.29387739300727844,
"learning_rate": 2.0328536945043362e-05,
"loss": 0.0179,
"num_input_tokens_seen": 36709056,
"step": 2265
},
{
"epoch": 1.8167266906762705,
"grad_norm": 0.5413135886192322,
"learning_rate": 2.0209572563699112e-05,
"loss": 0.0281,
"num_input_tokens_seen": 36795456,
"step": 2270
},
{
"epoch": 1.8207282913165266,
"grad_norm": 0.20994551479816437,
"learning_rate": 2.00907802397663e-05,
"loss": 0.0135,
"num_input_tokens_seen": 36878912,
"step": 2275
},
{
"epoch": 1.8247298919567827,
"grad_norm": 0.3548937737941742,
"learning_rate": 1.997216206090657e-05,
"loss": 0.0278,
"num_input_tokens_seen": 36963136,
"step": 2280
},
{
"epoch": 1.8287314925970388,
"grad_norm": 0.2864935100078583,
"learning_rate": 1.9853720111721095e-05,
"loss": 0.0255,
"num_input_tokens_seen": 37050048,
"step": 2285
},
{
"epoch": 1.832733093237295,
"grad_norm": 0.41523048281669617,
"learning_rate": 1.9735456473714046e-05,
"loss": 0.0222,
"num_input_tokens_seen": 37136448,
"step": 2290
},
{
"epoch": 1.836734693877551,
"grad_norm": 0.47388768196105957,
"learning_rate": 1.961737322525587e-05,
"loss": 0.0243,
"num_input_tokens_seen": 37215808,
"step": 2295
},
{
"epoch": 1.8407362945178072,
"grad_norm": 0.48578011989593506,
"learning_rate": 1.94994724415469e-05,
"loss": 0.0325,
"num_input_tokens_seen": 37295296,
"step": 2300
},
{
"epoch": 1.844737895158063,
"grad_norm": 0.4239029288291931,
"learning_rate": 1.938175619458081e-05,
"loss": 0.0346,
"num_input_tokens_seen": 37376704,
"step": 2305
},
{
"epoch": 1.8487394957983194,
"grad_norm": 0.4094125032424927,
"learning_rate": 1.926422655310819e-05,
"loss": 0.026,
"num_input_tokens_seen": 37456832,
"step": 2310
},
{
"epoch": 1.8527410964385753,
"grad_norm": 0.38107553124427795,
"learning_rate": 1.914688558260026e-05,
"loss": 0.0314,
"num_input_tokens_seen": 37538112,
"step": 2315
},
{
"epoch": 1.8567426970788317,
"grad_norm": 0.35755014419555664,
"learning_rate": 1.9029735345212483e-05,
"loss": 0.0208,
"num_input_tokens_seen": 37619392,
"step": 2320
},
{
"epoch": 1.8607442977190876,
"grad_norm": 0.3875485062599182,
"learning_rate": 1.891277789974841e-05,
"loss": 0.0361,
"num_input_tokens_seen": 37699648,
"step": 2325
},
{
"epoch": 1.864745898359344,
"grad_norm": 0.3359077274799347,
"learning_rate": 1.8796015301623423e-05,
"loss": 0.0274,
"num_input_tokens_seen": 37781312,
"step": 2330
},
{
"epoch": 1.8687474989995998,
"grad_norm": 0.33710983395576477,
"learning_rate": 1.8679449602828673e-05,
"loss": 0.0247,
"num_input_tokens_seen": 37862848,
"step": 2335
},
{
"epoch": 1.872749099639856,
"grad_norm": 0.3907299339771271,
"learning_rate": 1.8563082851894997e-05,
"loss": 0.0291,
"num_input_tokens_seen": 37939776,
"step": 2340
},
{
"epoch": 1.876750700280112,
"grad_norm": 0.4992141127586365,
"learning_rate": 1.8446917093856883e-05,
"loss": 0.025,
"num_input_tokens_seen": 38024000,
"step": 2345
},
{
"epoch": 1.8807523009203682,
"grad_norm": 0.21406111121177673,
"learning_rate": 1.8330954370216595e-05,
"loss": 0.0198,
"num_input_tokens_seen": 38106944,
"step": 2350
},
{
"epoch": 1.8847539015606243,
"grad_norm": 0.32606270909309387,
"learning_rate": 1.8215196718908233e-05,
"loss": 0.0315,
"num_input_tokens_seen": 38190912,
"step": 2355
},
{
"epoch": 1.8887555022008804,
"grad_norm": 0.20130057632923126,
"learning_rate": 1.809964617426197e-05,
"loss": 0.0229,
"num_input_tokens_seen": 38277312,
"step": 2360
},
{
"epoch": 1.8927571028411365,
"grad_norm": 0.532683789730072,
"learning_rate": 1.7984304766968257e-05,
"loss": 0.0318,
"num_input_tokens_seen": 38365632,
"step": 2365
},
{
"epoch": 1.8967587034813924,
"grad_norm": 0.3682226240634918,
"learning_rate": 1.786917452404216e-05,
"loss": 0.021,
"num_input_tokens_seen": 38453952,
"step": 2370
},
{
"epoch": 1.9007603041216488,
"grad_norm": 0.4553619921207428,
"learning_rate": 1.7754257468787772e-05,
"loss": 0.0335,
"num_input_tokens_seen": 38535488,
"step": 2375
},
{
"epoch": 1.9047619047619047,
"grad_norm": 0.2423078417778015,
"learning_rate": 1.7639555620762546e-05,
"loss": 0.0264,
"num_input_tokens_seen": 38618432,
"step": 2380
},
{
"epoch": 1.908763505402161,
"grad_norm": 0.21501043438911438,
"learning_rate": 1.7525070995741935e-05,
"loss": 0.0166,
"num_input_tokens_seen": 38695872,
"step": 2385
},
{
"epoch": 1.912765106042417,
"grad_norm": 0.4341451823711395,
"learning_rate": 1.7410805605683855e-05,
"loss": 0.0322,
"num_input_tokens_seen": 38775232,
"step": 2390
},
{
"epoch": 1.916766706682673,
"grad_norm": 0.3028091490268707,
"learning_rate": 1.729676145869342e-05,
"loss": 0.0235,
"num_input_tokens_seen": 38850240,
"step": 2395
},
{
"epoch": 1.9207683073229291,
"grad_norm": 0.34223926067352295,
"learning_rate": 1.71829405589876e-05,
"loss": 0.0316,
"num_input_tokens_seen": 38927808,
"step": 2400
},
{
"epoch": 1.9247699079631853,
"grad_norm": 0.2994190752506256,
"learning_rate": 1.7069344906859958e-05,
"loss": 0.0235,
"num_input_tokens_seen": 39005760,
"step": 2405
},
{
"epoch": 1.9287715086034414,
"grad_norm": 0.3380463421344757,
"learning_rate": 1.6955976498645642e-05,
"loss": 0.0218,
"num_input_tokens_seen": 39085376,
"step": 2410
},
{
"epoch": 1.9327731092436975,
"grad_norm": 0.3565221130847931,
"learning_rate": 1.6842837326686105e-05,
"loss": 0.0311,
"num_input_tokens_seen": 39172416,
"step": 2415
},
{
"epoch": 1.9367747098839536,
"grad_norm": 0.2793833315372467,
"learning_rate": 1.6729929379294252e-05,
"loss": 0.025,
"num_input_tokens_seen": 39250624,
"step": 2420
},
{
"epoch": 1.9407763105242097,
"grad_norm": 0.3410326838493347,
"learning_rate": 1.6617254640719423e-05,
"loss": 0.0231,
"num_input_tokens_seen": 39332544,
"step": 2425
},
{
"epoch": 1.9447779111644659,
"grad_norm": 0.29940786957740784,
"learning_rate": 1.6504815091112525e-05,
"loss": 0.0248,
"num_input_tokens_seen": 39414592,
"step": 2430
},
{
"epoch": 1.9487795118047218,
"grad_norm": 0.38060298562049866,
"learning_rate": 1.6392612706491278e-05,
"loss": 0.0291,
"num_input_tokens_seen": 39494336,
"step": 2435
},
{
"epoch": 1.952781112444978,
"grad_norm": 0.38698962330818176,
"learning_rate": 1.628064945870539e-05,
"loss": 0.024,
"num_input_tokens_seen": 39573312,
"step": 2440
},
{
"epoch": 1.956782713085234,
"grad_norm": 0.5067830085754395,
"learning_rate": 1.6168927315402026e-05,
"loss": 0.0275,
"num_input_tokens_seen": 39654208,
"step": 2445
},
{
"epoch": 1.9607843137254903,
"grad_norm": 0.2045571208000183,
"learning_rate": 1.605744823999114e-05,
"loss": 0.02,
"num_input_tokens_seen": 39738048,
"step": 2450
},
{
"epoch": 1.9647859143657462,
"grad_norm": 0.5349982976913452,
"learning_rate": 1.5946214191611024e-05,
"loss": 0.0262,
"num_input_tokens_seen": 39815488,
"step": 2455
},
{
"epoch": 1.9687875150060024,
"grad_norm": 0.3816860616207123,
"learning_rate": 1.5835227125093835e-05,
"loss": 0.0382,
"num_input_tokens_seen": 39899200,
"step": 2460
},
{
"epoch": 1.9727891156462585,
"grad_norm": 0.33717644214630127,
"learning_rate": 1.5724488990931253e-05,
"loss": 0.0266,
"num_input_tokens_seen": 39981120,
"step": 2465
},
{
"epoch": 1.9767907162865146,
"grad_norm": 0.3681086599826813,
"learning_rate": 1.5614001735240247e-05,
"loss": 0.0269,
"num_input_tokens_seen": 40068032,
"step": 2470
},
{
"epoch": 1.9807923169267707,
"grad_norm": 0.33696845173835754,
"learning_rate": 1.550376729972878e-05,
"loss": 0.0264,
"num_input_tokens_seen": 40145088,
"step": 2475
},
{
"epoch": 1.9847939175670268,
"grad_norm": 0.30568578839302063,
"learning_rate": 1.539378762166179e-05,
"loss": 0.0293,
"num_input_tokens_seen": 40224832,
"step": 2480
},
{
"epoch": 1.988795518207283,
"grad_norm": 0.6636140942573547,
"learning_rate": 1.5284064633827063e-05,
"loss": 0.0256,
"num_input_tokens_seen": 40311616,
"step": 2485
},
{
"epoch": 1.9927971188475389,
"grad_norm": 0.33070600032806396,
"learning_rate": 1.5174600264501329e-05,
"loss": 0.0314,
"num_input_tokens_seen": 40389824,
"step": 2490
},
{
"epoch": 1.9967987194877952,
"grad_norm": 0.21985433995723724,
"learning_rate": 1.506539643741634e-05,
"loss": 0.028,
"num_input_tokens_seen": 40470336,
"step": 2495
},
{
"epoch": 2.000800320128051,
"grad_norm": 0.22473222017288208,
"learning_rate": 1.4956455071725019e-05,
"loss": 0.023,
"num_input_tokens_seen": 40552512,
"step": 2500
},
{
"epoch": 2.0048019207683074,
"grad_norm": 0.3045428395271301,
"learning_rate": 1.4847778081967866e-05,
"loss": 0.0208,
"num_input_tokens_seen": 40632512,
"step": 2505
},
{
"epoch": 2.0088035214085633,
"grad_norm": 0.3845164477825165,
"learning_rate": 1.4739367378039146e-05,
"loss": 0.0227,
"num_input_tokens_seen": 40716608,
"step": 2510
},
{
"epoch": 2.0128051220488197,
"grad_norm": 0.3960568308830261,
"learning_rate": 1.4631224865153449e-05,
"loss": 0.0203,
"num_input_tokens_seen": 40795584,
"step": 2515
},
{
"epoch": 2.0168067226890756,
"grad_norm": 0.19612348079681396,
"learning_rate": 1.4523352443812151e-05,
"loss": 0.0195,
"num_input_tokens_seen": 40868288,
"step": 2520
},
{
"epoch": 2.020808323329332,
"grad_norm": 0.3838767111301422,
"learning_rate": 1.4415752009770034e-05,
"loss": 0.0176,
"num_input_tokens_seen": 40947136,
"step": 2525
},
{
"epoch": 2.024809923969588,
"grad_norm": 0.29781728982925415,
"learning_rate": 1.4308425454001965e-05,
"loss": 0.03,
"num_input_tokens_seen": 41029056,
"step": 2530
},
{
"epoch": 2.028811524609844,
"grad_norm": 0.4778103232383728,
"learning_rate": 1.4201374662669621e-05,
"loss": 0.0236,
"num_input_tokens_seen": 41111360,
"step": 2535
},
{
"epoch": 2.0328131252501,
"grad_norm": 0.115419901907444,
"learning_rate": 1.4094601517088466e-05,
"loss": 0.018,
"num_input_tokens_seen": 41189952,
"step": 2540
},
{
"epoch": 2.036814725890356,
"grad_norm": 0.32996487617492676,
"learning_rate": 1.3988107893694517e-05,
"loss": 0.0136,
"num_input_tokens_seen": 41270080,
"step": 2545
},
{
"epoch": 2.0408163265306123,
"grad_norm": 0.37645474076271057,
"learning_rate": 1.3881895664011507e-05,
"loss": 0.0195,
"num_input_tokens_seen": 41346112,
"step": 2550
},
{
"epoch": 2.044817927170868,
"grad_norm": 0.31592825055122375,
"learning_rate": 1.377596669461793e-05,
"loss": 0.0193,
"num_input_tokens_seen": 41429568,
"step": 2555
},
{
"epoch": 2.0488195278111245,
"grad_norm": 0.33183401823043823,
"learning_rate": 1.367032284711425e-05,
"loss": 0.0203,
"num_input_tokens_seen": 41508288,
"step": 2560
},
{
"epoch": 2.0528211284513804,
"grad_norm": 0.36240455508232117,
"learning_rate": 1.3564965978090202e-05,
"loss": 0.0184,
"num_input_tokens_seen": 41587008,
"step": 2565
},
{
"epoch": 2.0568227290916368,
"grad_norm": 0.3186376690864563,
"learning_rate": 1.3459897939092108e-05,
"loss": 0.0253,
"num_input_tokens_seen": 41674048,
"step": 2570
},
{
"epoch": 2.0608243297318927,
"grad_norm": 0.3811952471733093,
"learning_rate": 1.3355120576590415e-05,
"loss": 0.0196,
"num_input_tokens_seen": 41755584,
"step": 2575
},
{
"epoch": 2.064825930372149,
"grad_norm": 0.3392605185508728,
"learning_rate": 1.3250635731947198e-05,
"loss": 0.0199,
"num_input_tokens_seen": 41834560,
"step": 2580
},
{
"epoch": 2.068827531012405,
"grad_norm": 0.37324753403663635,
"learning_rate": 1.3146445241383807e-05,
"loss": 0.0153,
"num_input_tokens_seen": 41918400,
"step": 2585
},
{
"epoch": 2.0728291316526612,
"grad_norm": 0.35555389523506165,
"learning_rate": 1.304255093594862e-05,
"loss": 0.0182,
"num_input_tokens_seen": 42001216,
"step": 2590
},
{
"epoch": 2.076830732292917,
"grad_norm": 0.3356153666973114,
"learning_rate": 1.293895464148478e-05,
"loss": 0.0202,
"num_input_tokens_seen": 42090816,
"step": 2595
},
{
"epoch": 2.0808323329331735,
"grad_norm": 0.36150428652763367,
"learning_rate": 1.2835658178598276e-05,
"loss": 0.0242,
"num_input_tokens_seen": 42177216,
"step": 2600
},
{
"epoch": 2.0848339335734294,
"grad_norm": 0.5609300136566162,
"learning_rate": 1.2732663362625746e-05,
"loss": 0.0196,
"num_input_tokens_seen": 42255552,
"step": 2605
},
{
"epoch": 2.0888355342136853,
"grad_norm": 0.5928618311882019,
"learning_rate": 1.2629972003602724e-05,
"loss": 0.0182,
"num_input_tokens_seen": 42336192,
"step": 2610
},
{
"epoch": 2.0928371348539416,
"grad_norm": 0.4556543529033661,
"learning_rate": 1.2527585906231764e-05,
"loss": 0.0307,
"num_input_tokens_seen": 42418112,
"step": 2615
},
{
"epoch": 2.0968387354941975,
"grad_norm": 0.379961222410202,
"learning_rate": 1.2425506869850739e-05,
"loss": 0.0215,
"num_input_tokens_seen": 42500544,
"step": 2620
},
{
"epoch": 2.100840336134454,
"grad_norm": 0.3861774504184723,
"learning_rate": 1.232373668840123e-05,
"loss": 0.0249,
"num_input_tokens_seen": 42583232,
"step": 2625
},
{
"epoch": 2.1048419367747098,
"grad_norm": 0.3762750029563904,
"learning_rate": 1.2222277150396943e-05,
"loss": 0.0182,
"num_input_tokens_seen": 42661056,
"step": 2630
},
{
"epoch": 2.108843537414966,
"grad_norm": 0.36462509632110596,
"learning_rate": 1.2121130038892399e-05,
"loss": 0.0147,
"num_input_tokens_seen": 42739136,
"step": 2635
},
{
"epoch": 2.112845138055222,
"grad_norm": 0.5274704694747925,
"learning_rate": 1.2020297131451445e-05,
"loss": 0.0249,
"num_input_tokens_seen": 42819008,
"step": 2640
},
{
"epoch": 2.1168467386954783,
"grad_norm": 0.4046748876571655,
"learning_rate": 1.191978020011614e-05,
"loss": 0.0229,
"num_input_tokens_seen": 42901696,
"step": 2645
},
{
"epoch": 2.1208483393357342,
"grad_norm": 0.5925213694572449,
"learning_rate": 1.1819581011375542e-05,
"loss": 0.0197,
"num_input_tokens_seen": 42980416,
"step": 2650
},
{
"epoch": 2.1248499399759906,
"grad_norm": 0.4185037612915039,
"learning_rate": 1.1719701326134695e-05,
"loss": 0.0172,
"num_input_tokens_seen": 43058752,
"step": 2655
},
{
"epoch": 2.1288515406162465,
"grad_norm": 0.2945035994052887,
"learning_rate": 1.1620142899683686e-05,
"loss": 0.017,
"num_input_tokens_seen": 43142336,
"step": 2660
},
{
"epoch": 2.1328531412565024,
"grad_norm": 0.44143450260162354,
"learning_rate": 1.1520907481666752e-05,
"loss": 0.0197,
"num_input_tokens_seen": 43223744,
"step": 2665
},
{
"epoch": 2.1368547418967587,
"grad_norm": 0.2169390767812729,
"learning_rate": 1.1421996816051586e-05,
"loss": 0.02,
"num_input_tokens_seen": 43310912,
"step": 2670
},
{
"epoch": 2.1408563425370146,
"grad_norm": 0.4547870457172394,
"learning_rate": 1.1323412641098692e-05,
"loss": 0.0161,
"num_input_tokens_seen": 43394112,
"step": 2675
},
{
"epoch": 2.144857943177271,
"grad_norm": 0.13960134983062744,
"learning_rate": 1.1225156689330766e-05,
"loss": 0.0249,
"num_input_tokens_seen": 43475264,
"step": 2680
},
{
"epoch": 2.148859543817527,
"grad_norm": 0.4410214424133301,
"learning_rate": 1.1127230687502321e-05,
"loss": 0.0193,
"num_input_tokens_seen": 43559104,
"step": 2685
},
{
"epoch": 2.152861144457783,
"grad_norm": 0.36454614996910095,
"learning_rate": 1.1029636356569314e-05,
"loss": 0.0198,
"num_input_tokens_seen": 43639616,
"step": 2690
},
{
"epoch": 2.156862745098039,
"grad_norm": 0.5577832460403442,
"learning_rate": 1.0932375411658907e-05,
"loss": 0.0238,
"num_input_tokens_seen": 43719616,
"step": 2695
},
{
"epoch": 2.1608643457382954,
"grad_norm": 0.27591243386268616,
"learning_rate": 1.0835449562039295e-05,
"loss": 0.0105,
"num_input_tokens_seen": 43801664,
"step": 2700
},
{
"epoch": 2.1648659463785513,
"grad_norm": 0.38860106468200684,
"learning_rate": 1.0738860511089725e-05,
"loss": 0.0155,
"num_input_tokens_seen": 43878720,
"step": 2705
},
{
"epoch": 2.1688675470188077,
"grad_norm": 0.3722364604473114,
"learning_rate": 1.0642609956270509e-05,
"loss": 0.0165,
"num_input_tokens_seen": 43961408,
"step": 2710
},
{
"epoch": 2.1728691476590636,
"grad_norm": 0.37319135665893555,
"learning_rate": 1.0546699589093223e-05,
"loss": 0.0207,
"num_input_tokens_seen": 44040000,
"step": 2715
},
{
"epoch": 2.17687074829932,
"grad_norm": 0.4046717882156372,
"learning_rate": 1.045113109509098e-05,
"loss": 0.0201,
"num_input_tokens_seen": 44119232,
"step": 2720
},
{
"epoch": 2.180872348939576,
"grad_norm": 0.25555476546287537,
"learning_rate": 1.0355906153788754e-05,
"loss": 0.0201,
"num_input_tokens_seen": 44199232,
"step": 2725
},
{
"epoch": 2.184873949579832,
"grad_norm": 0.29047173261642456,
"learning_rate": 1.0261026438673966e-05,
"loss": 0.0152,
"num_input_tokens_seen": 44281536,
"step": 2730
},
{
"epoch": 2.188875550220088,
"grad_norm": 0.2393018901348114,
"learning_rate": 1.0166493617166993e-05,
"loss": 0.0135,
"num_input_tokens_seen": 44358208,
"step": 2735
},
{
"epoch": 2.192877150860344,
"grad_norm": 0.42202460765838623,
"learning_rate": 1.007230935059187e-05,
"loss": 0.0185,
"num_input_tokens_seen": 44439104,
"step": 2740
},
{
"epoch": 2.1968787515006003,
"grad_norm": 0.49886980652809143,
"learning_rate": 9.97847529414713e-06,
"loss": 0.0222,
"num_input_tokens_seen": 44517568,
"step": 2745
},
{
"epoch": 2.200880352140856,
"grad_norm": 0.4164292812347412,
"learning_rate": 9.884993096876698e-06,
"loss": 0.0215,
"num_input_tokens_seen": 44594496,
"step": 2750
},
{
"epoch": 2.2048819527811125,
"grad_norm": 0.3212761878967285,
"learning_rate": 9.791864401640916e-06,
"loss": 0.0158,
"num_input_tokens_seen": 44672704,
"step": 2755
},
{
"epoch": 2.2088835534213684,
"grad_norm": 0.34444934129714966,
"learning_rate": 9.699090845087637e-06,
"loss": 0.0195,
"num_input_tokens_seen": 44747456,
"step": 2760
},
{
"epoch": 2.212885154061625,
"grad_norm": 0.3931024670600891,
"learning_rate": 9.606674057623509e-06,
"loss": 0.0184,
"num_input_tokens_seen": 44833984,
"step": 2765
},
{
"epoch": 2.2168867547018807,
"grad_norm": 0.23794767260551453,
"learning_rate": 9.514615663385338e-06,
"loss": 0.0164,
"num_input_tokens_seen": 44912576,
"step": 2770
},
{
"epoch": 2.220888355342137,
"grad_norm": 0.334563672542572,
"learning_rate": 9.422917280211449e-06,
"loss": 0.0128,
"num_input_tokens_seen": 44993984,
"step": 2775
},
{
"epoch": 2.224889955982393,
"grad_norm": 0.4635678231716156,
"learning_rate": 9.331580519613352e-06,
"loss": 0.0158,
"num_input_tokens_seen": 45075392,
"step": 2780
},
{
"epoch": 2.2288915566226493,
"grad_norm": 0.600250780582428,
"learning_rate": 9.24060698674738e-06,
"loss": 0.0186,
"num_input_tokens_seen": 45158848,
"step": 2785
},
{
"epoch": 2.232893157262905,
"grad_norm": 0.4503975510597229,
"learning_rate": 9.149998280386496e-06,
"loss": 0.0194,
"num_input_tokens_seen": 45241536,
"step": 2790
},
{
"epoch": 2.236894757903161,
"grad_norm": 0.5378267168998718,
"learning_rate": 9.059755992892156e-06,
"loss": 0.0186,
"num_input_tokens_seen": 45322432,
"step": 2795
},
{
"epoch": 2.2408963585434174,
"grad_norm": 0.501382052898407,
"learning_rate": 8.969881710186384e-06,
"loss": 0.0144,
"num_input_tokens_seen": 45405760,
"step": 2800
},
{
"epoch": 2.2448979591836733,
"grad_norm": 0.4382809102535248,
"learning_rate": 8.880377011723855e-06,
"loss": 0.0153,
"num_input_tokens_seen": 45485760,
"step": 2805
},
{
"epoch": 2.2488995598239296,
"grad_norm": 0.4418613612651825,
"learning_rate": 8.791243470464165e-06,
"loss": 0.0199,
"num_input_tokens_seen": 45564736,
"step": 2810
},
{
"epoch": 2.2529011604641855,
"grad_norm": 0.31149056553840637,
"learning_rate": 8.702482652844175e-06,
"loss": 0.0235,
"num_input_tokens_seen": 45647296,
"step": 2815
},
{
"epoch": 2.256902761104442,
"grad_norm": 0.4306128919124603,
"learning_rate": 8.61409611875046e-06,
"loss": 0.0173,
"num_input_tokens_seen": 45728320,
"step": 2820
},
{
"epoch": 2.2609043617446978,
"grad_norm": 0.3324277102947235,
"learning_rate": 8.526085421491957e-06,
"loss": 0.0195,
"num_input_tokens_seen": 45808448,
"step": 2825
},
{
"epoch": 2.264905962384954,
"grad_norm": 0.27782583236694336,
"learning_rate": 8.43845210777262e-06,
"loss": 0.0217,
"num_input_tokens_seen": 45889216,
"step": 2830
},
{
"epoch": 2.26890756302521,
"grad_norm": 0.37792855501174927,
"learning_rate": 8.351197717664213e-06,
"loss": 0.0123,
"num_input_tokens_seen": 45969856,
"step": 2835
},
{
"epoch": 2.2729091636654664,
"grad_norm": 0.642516553401947,
"learning_rate": 8.264323784579327e-06,
"loss": 0.0244,
"num_input_tokens_seen": 46046784,
"step": 2840
},
{
"epoch": 2.2769107643057223,
"grad_norm": 0.6350634098052979,
"learning_rate": 8.177831835244354e-06,
"loss": 0.022,
"num_input_tokens_seen": 46129088,
"step": 2845
},
{
"epoch": 2.280912364945978,
"grad_norm": 0.37874847650527954,
"learning_rate": 8.091723389672712e-06,
"loss": 0.017,
"num_input_tokens_seen": 46203584,
"step": 2850
},
{
"epoch": 2.2849139655862345,
"grad_norm": 0.3346487283706665,
"learning_rate": 8.005999961138065e-06,
"loss": 0.0199,
"num_input_tokens_seen": 46284864,
"step": 2855
},
{
"epoch": 2.288915566226491,
"grad_norm": 0.33976414799690247,
"learning_rate": 7.920663056147797e-06,
"loss": 0.0214,
"num_input_tokens_seen": 46370496,
"step": 2860
},
{
"epoch": 2.2929171668667467,
"grad_norm": 0.20369592308998108,
"learning_rate": 7.835714174416542e-06,
"loss": 0.0145,
"num_input_tokens_seen": 46451264,
"step": 2865
},
{
"epoch": 2.2969187675070026,
"grad_norm": 0.3600223660469055,
"learning_rate": 7.75115480883973e-06,
"loss": 0.0181,
"num_input_tokens_seen": 46529344,
"step": 2870
},
{
"epoch": 2.300920368147259,
"grad_norm": 0.46353578567504883,
"learning_rate": 7.66698644546746e-06,
"loss": 0.0319,
"num_input_tokens_seen": 46609216,
"step": 2875
},
{
"epoch": 2.304921968787515,
"grad_norm": 0.35430440306663513,
"learning_rate": 7.5832105634783246e-06,
"loss": 0.0225,
"num_input_tokens_seen": 46694080,
"step": 2880
},
{
"epoch": 2.308923569427771,
"grad_norm": 0.38934943079948425,
"learning_rate": 7.499828635153444e-06,
"loss": 0.0212,
"num_input_tokens_seen": 46784192,
"step": 2885
},
{
"epoch": 2.312925170068027,
"grad_norm": 0.22117069363594055,
"learning_rate": 7.416842125850576e-06,
"loss": 0.0196,
"num_input_tokens_seen": 46865600,
"step": 2890
},
{
"epoch": 2.3169267707082835,
"grad_norm": 0.3990350365638733,
"learning_rate": 7.334252493978344e-06,
"loss": 0.0169,
"num_input_tokens_seen": 46945856,
"step": 2895
},
{
"epoch": 2.3209283713485394,
"grad_norm": 0.38565793633461,
"learning_rate": 7.252061190970658e-06,
"loss": 0.0246,
"num_input_tokens_seen": 47026496,
"step": 2900
},
{
"epoch": 2.3249299719887957,
"grad_norm": 0.19642944633960724,
"learning_rate": 7.170269661261164e-06,
"loss": 0.0211,
"num_input_tokens_seen": 47101760,
"step": 2905
},
{
"epoch": 2.3289315726290516,
"grad_norm": 0.4160960614681244,
"learning_rate": 7.088879342257894e-06,
"loss": 0.0182,
"num_input_tokens_seen": 47179968,
"step": 2910
},
{
"epoch": 2.332933173269308,
"grad_norm": 0.2597695291042328,
"learning_rate": 7.007891664317936e-06,
"loss": 0.0227,
"num_input_tokens_seen": 47252672,
"step": 2915
},
{
"epoch": 2.336934773909564,
"grad_norm": 0.12619255483150482,
"learning_rate": 6.927308050722411e-06,
"loss": 0.0118,
"num_input_tokens_seen": 47333056,
"step": 2920
},
{
"epoch": 2.3409363745498197,
"grad_norm": 0.45799922943115234,
"learning_rate": 6.847129917651356e-06,
"loss": 0.0131,
"num_input_tokens_seen": 47421376,
"step": 2925
},
{
"epoch": 2.344937975190076,
"grad_norm": 0.33925876021385193,
"learning_rate": 6.767358674158871e-06,
"loss": 0.0201,
"num_input_tokens_seen": 47501376,
"step": 2930
},
{
"epoch": 2.348939575830332,
"grad_norm": 0.25561290979385376,
"learning_rate": 6.68799572214838e-06,
"loss": 0.0111,
"num_input_tokens_seen": 47577152,
"step": 2935
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.4312439560890198,
"learning_rate": 6.609042456347962e-06,
"loss": 0.0174,
"num_input_tokens_seen": 47659200,
"step": 2940
},
{
"epoch": 2.356942777110844,
"grad_norm": 0.2186250239610672,
"learning_rate": 6.530500264285861e-06,
"loss": 0.0158,
"num_input_tokens_seen": 47743424,
"step": 2945
},
{
"epoch": 2.3609443777511006,
"grad_norm": 0.4922243654727936,
"learning_rate": 6.4523705262660914e-06,
"loss": 0.0191,
"num_input_tokens_seen": 47824192,
"step": 2950
},
{
"epoch": 2.3649459783913565,
"grad_norm": 0.43223145604133606,
"learning_rate": 6.374654615344152e-06,
"loss": 0.0182,
"num_input_tokens_seen": 47903552,
"step": 2955
},
{
"epoch": 2.368947579031613,
"grad_norm": 0.8656986355781555,
"learning_rate": 6.297353897302989e-06,
"loss": 0.0179,
"num_input_tokens_seen": 47987264,
"step": 2960
},
{
"epoch": 2.3729491796718687,
"grad_norm": 0.35136735439300537,
"learning_rate": 6.220469730628865e-06,
"loss": 0.0084,
"num_input_tokens_seen": 48065088,
"step": 2965
},
{
"epoch": 2.376950780312125,
"grad_norm": 0.4305860996246338,
"learning_rate": 6.1440034664875865e-06,
"loss": 0.0141,
"num_input_tokens_seen": 48148288,
"step": 2970
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.508858859539032,
"learning_rate": 6.067956448700711e-06,
"loss": 0.0267,
"num_input_tokens_seen": 48228800,
"step": 2975
},
{
"epoch": 2.384953981592637,
"grad_norm": 0.4427487850189209,
"learning_rate": 5.992330013721953e-06,
"loss": 0.0234,
"num_input_tokens_seen": 48312256,
"step": 2980
},
{
"epoch": 2.388955582232893,
"grad_norm": 0.617440402507782,
"learning_rate": 5.917125490613675e-06,
"loss": 0.0199,
"num_input_tokens_seen": 48393920,
"step": 2985
},
{
"epoch": 2.392957182873149,
"grad_norm": 0.43196341395378113,
"learning_rate": 5.842344201023529e-06,
"loss": 0.0177,
"num_input_tokens_seen": 48471360,
"step": 2990
},
{
"epoch": 2.3969587835134054,
"grad_norm": 0.45469948649406433,
"learning_rate": 5.76798745916127e-06,
"loss": 0.0182,
"num_input_tokens_seen": 48555072,
"step": 2995
},
{
"epoch": 2.4009603841536613,
"grad_norm": 0.3805118501186371,
"learning_rate": 5.694056571775617e-06,
"loss": 0.0217,
"num_input_tokens_seen": 48634048,
"step": 3000
},
{
"epoch": 2.4049619847939177,
"grad_norm": 0.5378417372703552,
"learning_rate": 5.6205528381313005e-06,
"loss": 0.0171,
"num_input_tokens_seen": 48713920,
"step": 3005
},
{
"epoch": 2.4089635854341735,
"grad_norm": 0.7792515754699707,
"learning_rate": 5.547477549986244e-06,
"loss": 0.0206,
"num_input_tokens_seen": 48796608,
"step": 3010
},
{
"epoch": 2.41296518607443,
"grad_norm": 0.3867614269256592,
"learning_rate": 5.474831991568833e-06,
"loss": 0.0211,
"num_input_tokens_seen": 48877632,
"step": 3015
},
{
"epoch": 2.416966786714686,
"grad_norm": 0.4291365444660187,
"learning_rate": 5.402617439555392e-06,
"loss": 0.0216,
"num_input_tokens_seen": 48954816,
"step": 3020
},
{
"epoch": 2.420968387354942,
"grad_norm": 0.6565413475036621,
"learning_rate": 5.330835163047678e-06,
"loss": 0.021,
"num_input_tokens_seen": 49040064,
"step": 3025
},
{
"epoch": 2.424969987995198,
"grad_norm": 0.26465168595314026,
"learning_rate": 5.259486423550649e-06,
"loss": 0.0095,
"num_input_tokens_seen": 49115840,
"step": 3030
},
{
"epoch": 2.4289715886354544,
"grad_norm": 0.31466591358184814,
"learning_rate": 5.1885724749502664e-06,
"loss": 0.0206,
"num_input_tokens_seen": 49195328,
"step": 3035
},
{
"epoch": 2.4329731892757103,
"grad_norm": 0.49722471833229065,
"learning_rate": 5.118094563491437e-06,
"loss": 0.0174,
"num_input_tokens_seen": 49277376,
"step": 3040
},
{
"epoch": 2.4369747899159666,
"grad_norm": 0.48088884353637695,
"learning_rate": 5.048053927756154e-06,
"loss": 0.0247,
"num_input_tokens_seen": 49364672,
"step": 3045
},
{
"epoch": 2.4409763905562225,
"grad_norm": 0.41930773854255676,
"learning_rate": 4.978451798641674e-06,
"loss": 0.0169,
"num_input_tokens_seen": 49441856,
"step": 3050
},
{
"epoch": 2.4449779911964784,
"grad_norm": 0.49265021085739136,
"learning_rate": 4.9092893993389656e-06,
"loss": 0.0169,
"num_input_tokens_seen": 49519680,
"step": 3055
},
{
"epoch": 2.4489795918367347,
"grad_norm": 0.374855637550354,
"learning_rate": 4.840567945311121e-06,
"loss": 0.0168,
"num_input_tokens_seen": 49602240,
"step": 3060
},
{
"epoch": 2.4529811924769906,
"grad_norm": 0.3391536474227905,
"learning_rate": 4.772288644272068e-06,
"loss": 0.0143,
"num_input_tokens_seen": 49684544,
"step": 3065
},
{
"epoch": 2.456982793117247,
"grad_norm": 0.4037143588066101,
"learning_rate": 4.704452696165305e-06,
"loss": 0.0194,
"num_input_tokens_seen": 49763520,
"step": 3070
},
{
"epoch": 2.460984393757503,
"grad_norm": 0.6887222528457642,
"learning_rate": 4.637061293142834e-06,
"loss": 0.0248,
"num_input_tokens_seen": 49842112,
"step": 3075
},
{
"epoch": 2.4649859943977592,
"grad_norm": 0.2990429699420929,
"learning_rate": 4.570115619544201e-06,
"loss": 0.0217,
"num_input_tokens_seen": 49928128,
"step": 3080
},
{
"epoch": 2.468987595038015,
"grad_norm": 0.5057087540626526,
"learning_rate": 4.503616851875673e-06,
"loss": 0.0143,
"num_input_tokens_seen": 50012224,
"step": 3085
},
{
"epoch": 2.4729891956782715,
"grad_norm": 0.28268003463745117,
"learning_rate": 4.437566158789581e-06,
"loss": 0.0147,
"num_input_tokens_seen": 50091968,
"step": 3090
},
{
"epoch": 2.4769907963185274,
"grad_norm": 0.3694641888141632,
"learning_rate": 4.371964701063771e-06,
"loss": 0.0187,
"num_input_tokens_seen": 50172992,
"step": 3095
},
{
"epoch": 2.4809923969587837,
"grad_norm": 0.3379111886024475,
"learning_rate": 4.306813631581211e-06,
"loss": 0.0149,
"num_input_tokens_seen": 50253760,
"step": 3100
},
{
"epoch": 2.4849939975990396,
"grad_norm": 0.38825467228889465,
"learning_rate": 4.242114095309719e-06,
"loss": 0.0133,
"num_input_tokens_seen": 50332352,
"step": 3105
},
{
"epoch": 2.4889955982392955,
"grad_norm": 0.4392836391925812,
"learning_rate": 4.1778672292818535e-06,
"loss": 0.0173,
"num_input_tokens_seen": 50413888,
"step": 3110
},
{
"epoch": 2.492997198879552,
"grad_norm": 0.13757553696632385,
"learning_rate": 4.114074162574928e-06,
"loss": 0.0147,
"num_input_tokens_seen": 50495168,
"step": 3115
},
{
"epoch": 2.4969987995198077,
"grad_norm": 0.29631200432777405,
"learning_rate": 4.0507360162911475e-06,
"loss": 0.019,
"num_input_tokens_seen": 50573632,
"step": 3120
},
{
"epoch": 2.501000400160064,
"grad_norm": 0.5353264212608337,
"learning_rate": 3.987853903537946e-06,
"loss": 0.0197,
"num_input_tokens_seen": 50656704,
"step": 3125
},
{
"epoch": 2.50500200080032,
"grad_norm": 0.6070181727409363,
"learning_rate": 3.925428929408402e-06,
"loss": 0.0174,
"num_input_tokens_seen": 50739520,
"step": 3130
},
{
"epoch": 2.5090036014405763,
"grad_norm": 0.231636181473732,
"learning_rate": 3.863462190961807e-06,
"loss": 0.0236,
"num_input_tokens_seen": 50820544,
"step": 3135
},
{
"epoch": 2.5130052020808322,
"grad_norm": 0.3611864745616913,
"learning_rate": 3.8019547772044127e-06,
"loss": 0.026,
"num_input_tokens_seen": 50903232,
"step": 3140
},
{
"epoch": 2.5170068027210886,
"grad_norm": 0.5140330791473389,
"learning_rate": 3.7409077690702577e-06,
"loss": 0.0212,
"num_input_tokens_seen": 50988352,
"step": 3145
},
{
"epoch": 2.5210084033613445,
"grad_norm": 0.15091240406036377,
"learning_rate": 3.680322239402223e-06,
"loss": 0.0171,
"num_input_tokens_seen": 51069888,
"step": 3150
},
{
"epoch": 2.525010004001601,
"grad_norm": 0.2955949902534485,
"learning_rate": 3.620199252933114e-06,
"loss": 0.0257,
"num_input_tokens_seen": 51155008,
"step": 3155
},
{
"epoch": 2.5290116046418567,
"grad_norm": 0.5346180200576782,
"learning_rate": 3.5605398662669954e-06,
"loss": 0.0238,
"num_input_tokens_seen": 51236032,
"step": 3160
},
{
"epoch": 2.5330132052821126,
"grad_norm": 0.5316638946533203,
"learning_rate": 3.5013451278606144e-06,
"loss": 0.024,
"num_input_tokens_seen": 51314240,
"step": 3165
},
{
"epoch": 2.537014805922369,
"grad_norm": 0.37888795137405396,
"learning_rate": 3.4426160780049555e-06,
"loss": 0.0148,
"num_input_tokens_seen": 51392832,
"step": 3170
},
{
"epoch": 2.5410164065626253,
"grad_norm": 0.5194000601768494,
"learning_rate": 3.384353748806991e-06,
"loss": 0.02,
"num_input_tokens_seen": 51470144,
"step": 3175
},
{
"epoch": 2.545018007202881,
"grad_norm": 0.7122679352760315,
"learning_rate": 3.326559164171492e-06,
"loss": 0.022,
"num_input_tokens_seen": 51545408,
"step": 3180
},
{
"epoch": 2.549019607843137,
"grad_norm": 0.4563440680503845,
"learning_rate": 3.2692333397830954e-06,
"loss": 0.0223,
"num_input_tokens_seen": 51627072,
"step": 3185
},
{
"epoch": 2.5530212084833934,
"grad_norm": 0.49692097306251526,
"learning_rate": 3.21237728308841e-06,
"loss": 0.0204,
"num_input_tokens_seen": 51712704,
"step": 3190
},
{
"epoch": 2.5570228091236493,
"grad_norm": 0.3346436321735382,
"learning_rate": 3.1559919932783333e-06,
"loss": 0.0144,
"num_input_tokens_seen": 51797056,
"step": 3195
},
{
"epoch": 2.5610244097639057,
"grad_norm": 0.7675926089286804,
"learning_rate": 3.1000784612704757e-06,
"loss": 0.0249,
"num_input_tokens_seen": 51879872,
"step": 3200
},
{
"epoch": 2.5650260104041616,
"grad_norm": 0.5489852428436279,
"learning_rate": 3.0446376696917644e-06,
"loss": 0.0196,
"num_input_tokens_seen": 51967296,
"step": 3205
},
{
"epoch": 2.569027611044418,
"grad_norm": 0.43161171674728394,
"learning_rate": 2.989670592861161e-06,
"loss": 0.0153,
"num_input_tokens_seen": 52045888,
"step": 3210
},
{
"epoch": 2.573029211684674,
"grad_norm": 0.4794932007789612,
"learning_rate": 2.9351781967725343e-06,
"loss": 0.0158,
"num_input_tokens_seen": 52127040,
"step": 3215
},
{
"epoch": 2.5770308123249297,
"grad_norm": 0.5003116726875305,
"learning_rate": 2.8811614390777018e-06,
"loss": 0.0145,
"num_input_tokens_seen": 52206016,
"step": 3220
},
{
"epoch": 2.581032412965186,
"grad_norm": 0.42540910840034485,
"learning_rate": 2.8276212690696013e-06,
"loss": 0.0286,
"num_input_tokens_seen": 52286912,
"step": 3225
},
{
"epoch": 2.5850340136054424,
"grad_norm": 0.3958487808704376,
"learning_rate": 2.774558627665573e-06,
"loss": 0.0157,
"num_input_tokens_seen": 52372160,
"step": 3230
},
{
"epoch": 2.5890356142456983,
"grad_norm": 0.3834131956100464,
"learning_rate": 2.721974447390868e-06,
"loss": 0.0179,
"num_input_tokens_seen": 52452672,
"step": 3235
},
{
"epoch": 2.593037214885954,
"grad_norm": 0.6398972272872925,
"learning_rate": 2.6698696523622125e-06,
"loss": 0.016,
"num_input_tokens_seen": 52539840,
"step": 3240
},
{
"epoch": 2.5970388155262105,
"grad_norm": 0.4243876338005066,
"learning_rate": 2.6182451582716417e-06,
"loss": 0.0195,
"num_input_tokens_seen": 52620352,
"step": 3245
},
{
"epoch": 2.601040416166467,
"grad_norm": 0.54941725730896,
"learning_rate": 2.5671018723703164e-06,
"loss": 0.0226,
"num_input_tokens_seen": 52699712,
"step": 3250
},
{
"epoch": 2.6050420168067228,
"grad_norm": 0.3402462303638458,
"learning_rate": 2.5164406934526395e-06,
"loss": 0.0185,
"num_input_tokens_seen": 52780736,
"step": 3255
},
{
"epoch": 2.6090436174469787,
"grad_norm": 0.32467207312583923,
"learning_rate": 2.4662625118404503e-06,
"loss": 0.0131,
"num_input_tokens_seen": 52858816,
"step": 3260
},
{
"epoch": 2.613045218087235,
"grad_norm": 0.39890602231025696,
"learning_rate": 2.4165682093673646e-06,
"loss": 0.0179,
"num_input_tokens_seen": 52936000,
"step": 3265
},
{
"epoch": 2.617046818727491,
"grad_norm": 0.32913973927497864,
"learning_rate": 2.367358659363291e-06,
"loss": 0.0118,
"num_input_tokens_seen": 53018432,
"step": 3270
},
{
"epoch": 2.6210484193677472,
"grad_norm": 0.33710378408432007,
"learning_rate": 2.318634726639053e-06,
"loss": 0.014,
"num_input_tokens_seen": 53097664,
"step": 3275
},
{
"epoch": 2.625050020008003,
"grad_norm": 0.33488237857818604,
"learning_rate": 2.270397267471256e-06,
"loss": 0.0214,
"num_input_tokens_seen": 53175872,
"step": 3280
},
{
"epoch": 2.6290516206482595,
"grad_norm": 0.36654725670814514,
"learning_rate": 2.2226471295871555e-06,
"loss": 0.014,
"num_input_tokens_seen": 53264320,
"step": 3285
},
{
"epoch": 2.6330532212885154,
"grad_norm": 0.2989281117916107,
"learning_rate": 2.175385152149827e-06,
"loss": 0.0193,
"num_input_tokens_seen": 53349312,
"step": 3290
},
{
"epoch": 2.6370548219287713,
"grad_norm": 0.367129385471344,
"learning_rate": 2.128612165743382e-06,
"loss": 0.0106,
"num_input_tokens_seen": 53432768,
"step": 3295
},
{
"epoch": 2.6410564225690276,
"grad_norm": 0.6069823503494263,
"learning_rate": 2.0823289923583865e-06,
"loss": 0.0159,
"num_input_tokens_seen": 53512256,
"step": 3300
},
{
"epoch": 2.645058023209284,
"grad_norm": 0.45984363555908203,
"learning_rate": 2.0365364453774115e-06,
"loss": 0.0152,
"num_input_tokens_seen": 53598528,
"step": 3305
},
{
"epoch": 2.64905962384954,
"grad_norm": 0.33544713258743286,
"learning_rate": 1.9912353295607255e-06,
"loss": 0.01,
"num_input_tokens_seen": 53679680,
"step": 3310
},
{
"epoch": 2.6530612244897958,
"grad_norm": 0.2931489646434784,
"learning_rate": 1.9464264410321684e-06,
"loss": 0.0142,
"num_input_tokens_seen": 53760832,
"step": 3315
},
{
"epoch": 2.657062825130052,
"grad_norm": 0.20795246958732605,
"learning_rate": 1.9021105672651807e-06,
"loss": 0.0167,
"num_input_tokens_seen": 53836480,
"step": 3320
},
{
"epoch": 2.661064425770308,
"grad_norm": 0.49103161692619324,
"learning_rate": 1.8582884870688955e-06,
"loss": 0.0156,
"num_input_tokens_seen": 53916608,
"step": 3325
},
{
"epoch": 2.6650660264105643,
"grad_norm": 0.3196892738342285,
"learning_rate": 1.8149609705745351e-06,
"loss": 0.024,
"num_input_tokens_seen": 53998272,
"step": 3330
},
{
"epoch": 2.6690676270508202,
"grad_norm": 0.3033794164657593,
"learning_rate": 1.7721287792218011e-06,
"loss": 0.0122,
"num_input_tokens_seen": 54087872,
"step": 3335
},
{
"epoch": 2.6730692276910766,
"grad_norm": 0.21723465621471405,
"learning_rate": 1.729792665745571e-06,
"loss": 0.0169,
"num_input_tokens_seen": 54164800,
"step": 3340
},
{
"epoch": 2.6770708283313325,
"grad_norm": 0.353202223777771,
"learning_rate": 1.6879533741625863e-06,
"loss": 0.016,
"num_input_tokens_seen": 54245696,
"step": 3345
},
{
"epoch": 2.6810724289715884,
"grad_norm": 0.5556342601776123,
"learning_rate": 1.6466116397584397e-06,
"loss": 0.0184,
"num_input_tokens_seen": 54328768,
"step": 3350
},
{
"epoch": 2.6850740296118447,
"grad_norm": 0.2998672127723694,
"learning_rate": 1.6057681890746345e-06,
"loss": 0.0223,
"num_input_tokens_seen": 54413376,
"step": 3355
},
{
"epoch": 2.689075630252101,
"grad_norm": 0.4005143344402313,
"learning_rate": 1.5654237398958027e-06,
"loss": 0.0196,
"num_input_tokens_seen": 54495680,
"step": 3360
},
{
"epoch": 2.693077230892357,
"grad_norm": 0.5521175861358643,
"learning_rate": 1.5255790012371074e-06,
"loss": 0.0206,
"num_input_tokens_seen": 54580288,
"step": 3365
},
{
"epoch": 2.697078831532613,
"grad_norm": 0.35052913427352905,
"learning_rate": 1.48623467333177e-06,
"loss": 0.0269,
"num_input_tokens_seen": 54663616,
"step": 3370
},
{
"epoch": 2.701080432172869,
"grad_norm": 0.606769859790802,
"learning_rate": 1.4473914476187833e-06,
"loss": 0.0188,
"num_input_tokens_seen": 54743488,
"step": 3375
},
{
"epoch": 2.705082032813125,
"grad_norm": 0.5268804430961609,
"learning_rate": 1.409050006730741e-06,
"loss": 0.0159,
"num_input_tokens_seen": 54822592,
"step": 3380
},
{
"epoch": 2.7090836334533814,
"grad_norm": 0.49813786149024963,
"learning_rate": 1.371211024481841e-06,
"loss": 0.0171,
"num_input_tokens_seen": 54903360,
"step": 3385
},
{
"epoch": 2.7130852340936373,
"grad_norm": 0.4483683705329895,
"learning_rate": 1.3338751658560577e-06,
"loss": 0.0147,
"num_input_tokens_seen": 54979520,
"step": 3390
},
{
"epoch": 2.7170868347338937,
"grad_norm": 0.4520145356655121,
"learning_rate": 1.297043086995452e-06,
"loss": 0.0245,
"num_input_tokens_seen": 55059008,
"step": 3395
},
{
"epoch": 2.7210884353741496,
"grad_norm": 0.517173707485199,
"learning_rate": 1.2607154351886296e-06,
"loss": 0.0185,
"num_input_tokens_seen": 55144768,
"step": 3400
},
{
"epoch": 2.725090036014406,
"grad_norm": 0.43764081597328186,
"learning_rate": 1.224892848859368e-06,
"loss": 0.0239,
"num_input_tokens_seen": 55221440,
"step": 3405
},
{
"epoch": 2.729091636654662,
"grad_norm": 0.3168484568595886,
"learning_rate": 1.1895759575554145e-06,
"loss": 0.0148,
"num_input_tokens_seen": 55306048,
"step": 3410
},
{
"epoch": 2.733093237294918,
"grad_norm": 0.4901845157146454,
"learning_rate": 1.1547653819374048e-06,
"loss": 0.0187,
"num_input_tokens_seen": 55385024,
"step": 3415
},
{
"epoch": 2.737094837935174,
"grad_norm": 0.6804729700088501,
"learning_rate": 1.1204617337679568e-06,
"loss": 0.0212,
"num_input_tokens_seen": 55459776,
"step": 3420
},
{
"epoch": 2.74109643857543,
"grad_norm": 0.2512257993221283,
"learning_rate": 1.0866656159009203e-06,
"loss": 0.0184,
"num_input_tokens_seen": 55543232,
"step": 3425
},
{
"epoch": 2.7450980392156863,
"grad_norm": 0.522000253200531,
"learning_rate": 1.0533776222707902e-06,
"loss": 0.0201,
"num_input_tokens_seen": 55621184,
"step": 3430
},
{
"epoch": 2.7490996398559426,
"grad_norm": 0.5399855375289917,
"learning_rate": 1.0205983378822615e-06,
"loss": 0.0143,
"num_input_tokens_seen": 55703104,
"step": 3435
},
{
"epoch": 2.7531012404961985,
"grad_norm": 0.27883198857307434,
"learning_rate": 9.883283387999564e-07,
"loss": 0.0174,
"num_input_tokens_seen": 55782976,
"step": 3440
},
{
"epoch": 2.7571028411364544,
"grad_norm": 0.4380817115306854,
"learning_rate": 9.565681921382774e-07,
"loss": 0.0154,
"num_input_tokens_seen": 55861824,
"step": 3445
},
{
"epoch": 2.7611044417767108,
"grad_norm": 0.3870999217033386,
"learning_rate": 9.253184560514738e-07,
"loss": 0.0132,
"num_input_tokens_seen": 55942336,
"step": 3450
},
{
"epoch": 2.7651060424169667,
"grad_norm": 0.4920821487903595,
"learning_rate": 8.945796797238071e-07,
"loss": 0.0187,
"num_input_tokens_seen": 56022464,
"step": 3455
},
{
"epoch": 2.769107643057223,
"grad_norm": 0.5308319330215454,
"learning_rate": 8.643524033599215e-07,
"loss": 0.0172,
"num_input_tokens_seen": 56104512,
"step": 3460
},
{
"epoch": 2.773109243697479,
"grad_norm": 0.5186963081359863,
"learning_rate": 8.346371581753187e-07,
"loss": 0.0129,
"num_input_tokens_seen": 56185024,
"step": 3465
},
{
"epoch": 2.7771108443377353,
"grad_norm": 0.6223376989364624,
"learning_rate": 8.054344663870583e-07,
"loss": 0.0164,
"num_input_tokens_seen": 56268480,
"step": 3470
},
{
"epoch": 2.781112444977991,
"grad_norm": 0.13742071390151978,
"learning_rate": 7.767448412045586e-07,
"loss": 0.0166,
"num_input_tokens_seen": 56353728,
"step": 3475
},
{
"epoch": 2.785114045618247,
"grad_norm": 0.48692306876182556,
"learning_rate": 7.48568786820577e-07,
"loss": 0.0201,
"num_input_tokens_seen": 56432448,
"step": 3480
},
{
"epoch": 2.7891156462585034,
"grad_norm": 0.45050036907196045,
"learning_rate": 7.209067984023609e-07,
"loss": 0.0159,
"num_input_tokens_seen": 56513088,
"step": 3485
},
{
"epoch": 2.7931172468987597,
"grad_norm": 0.6945263743400574,
"learning_rate": 6.937593620829342e-07,
"loss": 0.0254,
"num_input_tokens_seen": 56593472,
"step": 3490
},
{
"epoch": 2.7971188475390156,
"grad_norm": 0.33591872453689575,
"learning_rate": 6.671269549525638e-07,
"loss": 0.0176,
"num_input_tokens_seen": 56672832,
"step": 3495
},
{
"epoch": 2.8011204481792715,
"grad_norm": 0.29116731882095337,
"learning_rate": 6.410100450503708e-07,
"loss": 0.016,
"num_input_tokens_seen": 56753472,
"step": 3500
},
{
"epoch": 2.805122048819528,
"grad_norm": 0.5255082249641418,
"learning_rate": 6.154090913560928e-07,
"loss": 0.0146,
"num_input_tokens_seen": 56833600,
"step": 3505
},
{
"epoch": 2.8091236494597838,
"grad_norm": 0.30706822872161865,
"learning_rate": 5.90324543782057e-07,
"loss": 0.0193,
"num_input_tokens_seen": 56917440,
"step": 3510
},
{
"epoch": 2.81312525010004,
"grad_norm": 0.37937021255493164,
"learning_rate": 5.657568431652138e-07,
"loss": 0.025,
"num_input_tokens_seen": 56996288,
"step": 3515
},
{
"epoch": 2.817126850740296,
"grad_norm": 0.35128986835479736,
"learning_rate": 5.417064212594425e-07,
"loss": 0.0192,
"num_input_tokens_seen": 57080512,
"step": 3520
},
{
"epoch": 2.8211284513805523,
"grad_norm": 0.21868328750133514,
"learning_rate": 5.181737007279408e-07,
"loss": 0.0105,
"num_input_tokens_seen": 57163456,
"step": 3525
},
{
"epoch": 2.8251300520208082,
"grad_norm": 0.36027055978775024,
"learning_rate": 4.951590951357909e-07,
"loss": 0.0144,
"num_input_tokens_seen": 57240128,
"step": 3530
},
{
"epoch": 2.8291316526610646,
"grad_norm": 0.4171808958053589,
"learning_rate": 4.7266300894270866e-07,
"loss": 0.0212,
"num_input_tokens_seen": 57322816,
"step": 3535
},
{
"epoch": 2.8331332533013205,
"grad_norm": 0.6335403919219971,
"learning_rate": 4.506858374959222e-07,
"loss": 0.0177,
"num_input_tokens_seen": 57407808,
"step": 3540
},
{
"epoch": 2.837134853941577,
"grad_norm": 0.3277064859867096,
"learning_rate": 4.29227967023228e-07,
"loss": 0.0162,
"num_input_tokens_seen": 57485760,
"step": 3545
},
{
"epoch": 2.8411364545818327,
"grad_norm": 0.43462586402893066,
"learning_rate": 4.08289774626206e-07,
"loss": 0.0224,
"num_input_tokens_seen": 57572160,
"step": 3550
},
{
"epoch": 2.8451380552220886,
"grad_norm": 0.4823042154312134,
"learning_rate": 3.8787162827359166e-07,
"loss": 0.0196,
"num_input_tokens_seen": 57648320,
"step": 3555
},
{
"epoch": 2.849139655862345,
"grad_norm": 0.4763263761997223,
"learning_rate": 3.6797388679480124e-07,
"loss": 0.0163,
"num_input_tokens_seen": 57725248,
"step": 3560
},
{
"epoch": 2.8531412565026013,
"grad_norm": 0.5753194689750671,
"learning_rate": 3.4859689987363996e-07,
"loss": 0.0154,
"num_input_tokens_seen": 57802176,
"step": 3565
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.4690397381782532,
"learning_rate": 3.2974100804215036e-07,
"loss": 0.0186,
"num_input_tokens_seen": 57882944,
"step": 3570
},
{
"epoch": 2.861144457783113,
"grad_norm": 0.4747138023376465,
"learning_rate": 3.114065426746138e-07,
"loss": 0.0145,
"num_input_tokens_seen": 57960768,
"step": 3575
},
{
"epoch": 2.8651460584233694,
"grad_norm": 0.5063496232032776,
"learning_rate": 2.93593825981755e-07,
"loss": 0.0203,
"num_input_tokens_seen": 58035520,
"step": 3580
},
{
"epoch": 2.8691476590636253,
"grad_norm": 0.2565556764602661,
"learning_rate": 2.763031710050534e-07,
"loss": 0.0186,
"num_input_tokens_seen": 58119232,
"step": 3585
},
{
"epoch": 2.8731492597038817,
"grad_norm": 0.5019761323928833,
"learning_rate": 2.595348816112575e-07,
"loss": 0.0184,
"num_input_tokens_seen": 58202944,
"step": 3590
},
{
"epoch": 2.8771508603441376,
"grad_norm": 0.52565598487854,
"learning_rate": 2.432892524870389e-07,
"loss": 0.0278,
"num_input_tokens_seen": 58280384,
"step": 3595
},
{
"epoch": 2.881152460984394,
"grad_norm": 0.5217379927635193,
"learning_rate": 2.2756656913381026e-07,
"loss": 0.0183,
"num_input_tokens_seen": 58358848,
"step": 3600
},
{
"epoch": 2.88515406162465,
"grad_norm": 0.4081059992313385,
"learning_rate": 2.1236710786271873e-07,
"loss": 0.0173,
"num_input_tokens_seen": 58440768,
"step": 3605
},
{
"epoch": 2.8891556622649057,
"grad_norm": 0.5043428540229797,
"learning_rate": 1.9769113578977705e-07,
"loss": 0.0198,
"num_input_tokens_seen": 58530368,
"step": 3610
},
{
"epoch": 2.893157262905162,
"grad_norm": 0.4933716058731079,
"learning_rate": 1.8353891083117692e-07,
"loss": 0.0227,
"num_input_tokens_seen": 58614976,
"step": 3615
},
{
"epoch": 2.8971588635454184,
"grad_norm": 0.518379807472229,
"learning_rate": 1.6991068169875946e-07,
"loss": 0.0205,
"num_input_tokens_seen": 58695872,
"step": 3620
},
{
"epoch": 2.9011604641856743,
"grad_norm": 0.4509144723415375,
"learning_rate": 1.568066878956287e-07,
"loss": 0.017,
"num_input_tokens_seen": 58773056,
"step": 3625
},
{
"epoch": 2.90516206482593,
"grad_norm": 0.3559584319591522,
"learning_rate": 1.4422715971196487e-07,
"loss": 0.0177,
"num_input_tokens_seen": 58861504,
"step": 3630
},
{
"epoch": 2.9091636654661865,
"grad_norm": 0.3940853178501129,
"learning_rate": 1.321723182209611e-07,
"loss": 0.0198,
"num_input_tokens_seen": 58941504,
"step": 3635
},
{
"epoch": 2.9131652661064424,
"grad_norm": 0.4085898697376251,
"learning_rate": 1.206423752749397e-07,
"loss": 0.0126,
"num_input_tokens_seen": 59024064,
"step": 3640
},
{
"epoch": 2.917166866746699,
"grad_norm": 0.15462715923786163,
"learning_rate": 1.0963753350164197e-07,
"loss": 0.0173,
"num_input_tokens_seen": 59107520,
"step": 3645
},
{
"epoch": 2.9211684673869547,
"grad_norm": 0.44263237714767456,
"learning_rate": 9.915798630064422e-08,
"loss": 0.0202,
"num_input_tokens_seen": 59185216,
"step": 3650
},
{
"epoch": 2.925170068027211,
"grad_norm": 0.7302068471908569,
"learning_rate": 8.920391783998394e-08,
"loss": 0.0194,
"num_input_tokens_seen": 59267520,
"step": 3655
},
{
"epoch": 2.929171668667467,
"grad_norm": 0.5005968809127808,
"learning_rate": 7.977550305290571e-08,
"loss": 0.021,
"num_input_tokens_seen": 59350208,
"step": 3660
},
{
"epoch": 2.933173269307723,
"grad_norm": 0.5027980804443359,
"learning_rate": 7.087290763479693e-08,
"loss": 0.02,
"num_input_tokens_seen": 59430080,
"step": 3665
},
{
"epoch": 2.937174869947979,
"grad_norm": 0.6888028979301453,
"learning_rate": 6.249628804026685e-08,
"loss": 0.0219,
"num_input_tokens_seen": 59510976,
"step": 3670
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.5013588070869446,
"learning_rate": 5.464579148040549e-08,
"loss": 0.0153,
"num_input_tokens_seen": 59588288,
"step": 3675
},
{
"epoch": 2.9451780712284914,
"grad_norm": 0.2671397030353546,
"learning_rate": 4.732155592018894e-08,
"loss": 0.0107,
"num_input_tokens_seen": 59663680,
"step": 3680
},
{
"epoch": 2.9491796718687473,
"grad_norm": 0.5593500137329102,
"learning_rate": 4.052371007606803e-08,
"loss": 0.0184,
"num_input_tokens_seen": 59747648,
"step": 3685
},
{
"epoch": 2.9531812725090036,
"grad_norm": 0.5388041138648987,
"learning_rate": 3.425237341368348e-08,
"loss": 0.0295,
"num_input_tokens_seen": 59829952,
"step": 3690
},
{
"epoch": 2.9571828731492595,
"grad_norm": 0.4415709972381592,
"learning_rate": 2.8507656145794202e-08,
"loss": 0.0167,
"num_input_tokens_seen": 59911616,
"step": 3695
},
{
"epoch": 2.961184473789516,
"grad_norm": 0.5787676572799683,
"learning_rate": 2.3289659230315563e-08,
"loss": 0.0133,
"num_input_tokens_seen": 59997376,
"step": 3700
},
{
"epoch": 2.965186074429772,
"grad_norm": 0.5261440873146057,
"learning_rate": 1.859847436855744e-08,
"loss": 0.0242,
"num_input_tokens_seen": 60082112,
"step": 3705
},
{
"epoch": 2.969187675070028,
"grad_norm": 0.4745880663394928,
"learning_rate": 1.4434184003618845e-08,
"loss": 0.018,
"num_input_tokens_seen": 60166336,
"step": 3710
},
{
"epoch": 2.973189275710284,
"grad_norm": 0.3323848247528076,
"learning_rate": 1.0796861318922436e-08,
"loss": 0.0221,
"num_input_tokens_seen": 60245184,
"step": 3715
},
{
"epoch": 2.9771908763505404,
"grad_norm": 0.5333008766174316,
"learning_rate": 7.686570236942192e-09,
"loss": 0.0193,
"num_input_tokens_seen": 60324928,
"step": 3720
},
{
"epoch": 2.9811924769907963,
"grad_norm": 0.42949220538139343,
"learning_rate": 5.103365418074324e-09,
"loss": 0.0179,
"num_input_tokens_seen": 60408256,
"step": 3725
},
{
"epoch": 2.9851940776310526,
"grad_norm": 0.4650789201259613,
"learning_rate": 3.0472922596713747e-09,
"loss": 0.023,
"num_input_tokens_seen": 60489408,
"step": 3730
},
{
"epoch": 2.9891956782713085,
"grad_norm": 0.4750503599643707,
"learning_rate": 1.5183868952595158e-09,
"loss": 0.0161,
"num_input_tokens_seen": 60567616,
"step": 3735
},
{
"epoch": 2.9931972789115644,
"grad_norm": 0.6158129572868347,
"learning_rate": 5.166761938857345e-10,
"loss": 0.0233,
"num_input_tokens_seen": 60640960,
"step": 3740
},
{
"epoch": 2.9971988795518207,
"grad_norm": 0.4666935205459595,
"learning_rate": 4.2177759664863144e-11,
"loss": 0.0156,
"num_input_tokens_seen": 60723904,
"step": 3745
},
{
"epoch": 2.998799519807923,
"num_input_tokens_seen": 60756032,
"step": 3747,
"total_flos": 2.584854770034475e+18,
"train_loss": 0.036583439188740685,
"train_runtime": 712562.4979,
"train_samples_per_second": 0.673,
"train_steps_per_second": 0.005
}
],
"logging_steps": 5,
"max_steps": 3747,
"num_input_tokens_seen": 60756032,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.584854770034475e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}