e2b-r64-sft / trainer_state.json
beezza's picture
Upload folder using huggingface_hub
b4d8cfa verified
Raw
History Blame Contribute Delete
121 kB
Invalid JSON:Unexpected token 'N', ..."ad_norm": NaN, "... is not valid JSON
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.1451612903225805,
"eval_steps": 30,
"global_step": 390,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008064516129032258,
"grad_norm": NaN,
"learning_rate": 0.0,
"loss": 3.1612095832824707,
"num_input_tokens_seen": 3376,
"step": 1,
"train_runtime": 54.6226,
"train_tokens_per_second": 61.806
},
{
"epoch": 0.016129032258064516,
"grad_norm": 19.30766487121582,
"learning_rate": 0.0,
"loss": 3.09491229057312,
"num_input_tokens_seen": 6750,
"step": 2,
"train_runtime": 59.7624,
"train_tokens_per_second": 112.947
},
{
"epoch": 0.024193548387096774,
"grad_norm": 20.06780242919922,
"learning_rate": 4e-05,
"loss": 3.0951414108276367,
"num_input_tokens_seen": 10132,
"step": 3,
"train_runtime": 63.3739,
"train_tokens_per_second": 159.877
},
{
"epoch": 0.03225806451612903,
"grad_norm": 9.574868202209473,
"learning_rate": 8e-05,
"loss": 2.2402310371398926,
"num_input_tokens_seen": 13496,
"step": 4,
"train_runtime": 67.0053,
"train_tokens_per_second": 201.417
},
{
"epoch": 0.04032258064516129,
"grad_norm": 4.498556613922119,
"learning_rate": 0.00012,
"loss": 1.8392776250839233,
"num_input_tokens_seen": 16978,
"step": 5,
"train_runtime": 70.7762,
"train_tokens_per_second": 239.883
},
{
"epoch": 0.04838709677419355,
"grad_norm": 2.6655759811401367,
"learning_rate": 0.00016,
"loss": 1.4742008447647095,
"num_input_tokens_seen": 20170,
"step": 6,
"train_runtime": 74.2843,
"train_tokens_per_second": 271.524
},
{
"epoch": 0.056451612903225805,
"grad_norm": 2.0664868354797363,
"learning_rate": 0.0002,
"loss": 1.2891130447387695,
"num_input_tokens_seen": 23566,
"step": 7,
"train_runtime": 77.9869,
"train_tokens_per_second": 302.179
},
{
"epoch": 0.06451612903225806,
"grad_norm": 1.9103718996047974,
"learning_rate": 0.00019999967645432384,
"loss": 1.1510200500488281,
"num_input_tokens_seen": 26930,
"step": 8,
"train_runtime": 81.6857,
"train_tokens_per_second": 329.678
},
{
"epoch": 0.07258064516129033,
"grad_norm": 2.13531494140625,
"learning_rate": 0.00019999870581938894,
"loss": 1.1280962228775024,
"num_input_tokens_seen": 30108,
"step": 9,
"train_runtime": 85.212,
"train_tokens_per_second": 353.33
},
{
"epoch": 0.08064516129032258,
"grad_norm": 1.1911511421203613,
"learning_rate": 0.0001999970881014762,
"loss": 1.0238806009292603,
"num_input_tokens_seen": 33254,
"step": 10,
"train_runtime": 88.7519,
"train_tokens_per_second": 374.685
},
{
"epoch": 0.08870967741935484,
"grad_norm": 1.7620989084243774,
"learning_rate": 0.00019999482331105377,
"loss": 1.0168827772140503,
"num_input_tokens_seen": 36458,
"step": 11,
"train_runtime": 92.3516,
"train_tokens_per_second": 394.774
},
{
"epoch": 0.0967741935483871,
"grad_norm": 1.1830689907073975,
"learning_rate": 0.0001999919114627769,
"loss": 1.0140981674194336,
"num_input_tokens_seen": 39848,
"step": 12,
"train_runtime": 96.1612,
"train_tokens_per_second": 414.387
},
{
"epoch": 0.10483870967741936,
"grad_norm": 1.0954996347427368,
"learning_rate": 0.00019998835257548786,
"loss": 0.9716602563858032,
"num_input_tokens_seen": 42964,
"step": 13,
"train_runtime": 99.7433,
"train_tokens_per_second": 430.746
},
{
"epoch": 0.11290322580645161,
"grad_norm": 0.932753324508667,
"learning_rate": 0.00019998414667221596,
"loss": 0.9001954793930054,
"num_input_tokens_seen": 46336,
"step": 14,
"train_runtime": 103.6049,
"train_tokens_per_second": 447.237
},
{
"epoch": 0.12096774193548387,
"grad_norm": 0.8278332352638245,
"learning_rate": 0.00019997929378017725,
"loss": 0.9225928783416748,
"num_input_tokens_seen": 49524,
"step": 15,
"train_runtime": 107.2795,
"train_tokens_per_second": 461.635
},
{
"epoch": 0.12903225806451613,
"grad_norm": 0.7354539036750793,
"learning_rate": 0.00019997379393077428,
"loss": 0.8711258172988892,
"num_input_tokens_seen": 52878,
"step": 16,
"train_runtime": 111.1447,
"train_tokens_per_second": 475.758
},
{
"epoch": 0.13709677419354838,
"grad_norm": 0.7318344712257385,
"learning_rate": 0.00019996764715959618,
"loss": 0.799241840839386,
"num_input_tokens_seen": 56342,
"step": 17,
"train_runtime": 115.127,
"train_tokens_per_second": 489.39
},
{
"epoch": 0.14516129032258066,
"grad_norm": 0.774062991142273,
"learning_rate": 0.0001999608535064182,
"loss": 0.829187273979187,
"num_input_tokens_seen": 59734,
"step": 18,
"train_runtime": 119.1867,
"train_tokens_per_second": 501.18
},
{
"epoch": 0.1532258064516129,
"grad_norm": 0.8591275215148926,
"learning_rate": 0.0001999534130152014,
"loss": 0.8018933534622192,
"num_input_tokens_seen": 63022,
"step": 19,
"train_runtime": 123.0444,
"train_tokens_per_second": 512.189
},
{
"epoch": 0.16129032258064516,
"grad_norm": 0.9827930331230164,
"learning_rate": 0.00019994532573409262,
"loss": 0.8291414976119995,
"num_input_tokens_seen": 66380,
"step": 20,
"train_runtime": 126.9621,
"train_tokens_per_second": 522.833
},
{
"epoch": 0.1693548387096774,
"grad_norm": 0.9169955849647522,
"learning_rate": 0.0001999365917154239,
"loss": 0.8104863166809082,
"num_input_tokens_seen": 69412,
"step": 21,
"train_runtime": 130.5539,
"train_tokens_per_second": 531.673
},
{
"epoch": 0.1774193548387097,
"grad_norm": 0.8792040944099426,
"learning_rate": 0.00019992721101571236,
"loss": 0.8366917967796326,
"num_input_tokens_seen": 72682,
"step": 22,
"train_runtime": 134.3459,
"train_tokens_per_second": 541.007
},
{
"epoch": 0.18548387096774194,
"grad_norm": 0.8037822246551514,
"learning_rate": 0.0001999171836956597,
"loss": 0.8375218510627747,
"num_input_tokens_seen": 75880,
"step": 23,
"train_runtime": 138.1243,
"train_tokens_per_second": 549.36
},
{
"epoch": 0.1935483870967742,
"grad_norm": 0.6663984060287476,
"learning_rate": 0.0001999065098201518,
"loss": 0.7726024389266968,
"num_input_tokens_seen": 79364,
"step": 24,
"train_runtime": 142.1732,
"train_tokens_per_second": 558.22
},
{
"epoch": 0.20161290322580644,
"grad_norm": 0.6427092552185059,
"learning_rate": 0.00019989518945825844,
"loss": 0.7135709524154663,
"num_input_tokens_seen": 82600,
"step": 25,
"train_runtime": 146.1185,
"train_tokens_per_second": 565.294
},
{
"epoch": 0.20967741935483872,
"grad_norm": 0.7197744846343994,
"learning_rate": 0.00019988322268323268,
"loss": 0.7193007469177246,
"num_input_tokens_seen": 85954,
"step": 26,
"train_runtime": 150.1399,
"train_tokens_per_second": 572.493
},
{
"epoch": 0.21774193548387097,
"grad_norm": 0.6793457865715027,
"learning_rate": 0.00019987060957251047,
"loss": 0.7056538462638855,
"num_input_tokens_seen": 89314,
"step": 27,
"train_runtime": 154.1923,
"train_tokens_per_second": 579.238
},
{
"epoch": 0.22580645161290322,
"grad_norm": 0.7098727226257324,
"learning_rate": 0.00019985735020771017,
"loss": 0.7221028208732605,
"num_input_tokens_seen": 92550,
"step": 28,
"train_runtime": 158.0575,
"train_tokens_per_second": 585.546
},
{
"epoch": 0.23387096774193547,
"grad_norm": 0.656500518321991,
"learning_rate": 0.00019984344467463197,
"loss": 0.6559625864028931,
"num_input_tokens_seen": 95904,
"step": 29,
"train_runtime": 162.0987,
"train_tokens_per_second": 591.639
},
{
"epoch": 0.24193548387096775,
"grad_norm": 0.7039774060249329,
"learning_rate": 0.0001998288930632574,
"loss": 0.7168108224868774,
"num_input_tokens_seen": 99314,
"step": 30,
"train_runtime": 166.2762,
"train_tokens_per_second": 597.283
},
{
"epoch": 0.24193548387096775,
"eval_loss": 3.0846056938171387,
"eval_runtime": 12.7508,
"eval_samples_per_second": 4.078,
"eval_steps_per_second": 2.039,
"num_input_tokens_seen": 99314,
"step": 30
},
{
"epoch": 0.25,
"grad_norm": 0.7343088984489441,
"learning_rate": 0.00019981369546774865,
"loss": 0.7140476107597351,
"num_input_tokens_seen": 102632,
"step": 31,
"train_runtime": 188.002,
"train_tokens_per_second": 545.909
},
{
"epoch": 0.25806451612903225,
"grad_norm": 0.8019713759422302,
"learning_rate": 0.00019979785198644806,
"loss": 0.7408339977264404,
"num_input_tokens_seen": 106032,
"step": 32,
"train_runtime": 192.0827,
"train_tokens_per_second": 552.012
},
{
"epoch": 0.2661290322580645,
"grad_norm": 0.728600025177002,
"learning_rate": 0.00019978136272187747,
"loss": 0.6582087278366089,
"num_input_tokens_seen": 109374,
"step": 33,
"train_runtime": 196.1526,
"train_tokens_per_second": 557.596
},
{
"epoch": 0.27419354838709675,
"grad_norm": 0.6822960376739502,
"learning_rate": 0.0001997642277807374,
"loss": 0.7177855372428894,
"num_input_tokens_seen": 112780,
"step": 34,
"train_runtime": 200.2851,
"train_tokens_per_second": 563.097
},
{
"epoch": 0.28225806451612906,
"grad_norm": 0.7499037384986877,
"learning_rate": 0.00019974644727390665,
"loss": 0.6647191047668457,
"num_input_tokens_seen": 116154,
"step": 35,
"train_runtime": 204.4174,
"train_tokens_per_second": 568.22
},
{
"epoch": 0.2903225806451613,
"grad_norm": 0.7178347110748291,
"learning_rate": 0.00019972802131644127,
"loss": 0.6842952370643616,
"num_input_tokens_seen": 119350,
"step": 36,
"train_runtime": 208.3311,
"train_tokens_per_second": 572.886
},
{
"epoch": 0.29838709677419356,
"grad_norm": 0.7969527244567871,
"learning_rate": 0.00019970895002757413,
"loss": 0.7260242700576782,
"num_input_tokens_seen": 122566,
"step": 37,
"train_runtime": 212.2522,
"train_tokens_per_second": 577.455
},
{
"epoch": 0.3064516129032258,
"grad_norm": 0.7407550811767578,
"learning_rate": 0.00019968923353071377,
"loss": 0.6808462738990784,
"num_input_tokens_seen": 125906,
"step": 38,
"train_runtime": 216.3593,
"train_tokens_per_second": 581.93
},
{
"epoch": 0.31451612903225806,
"grad_norm": 0.7005120515823364,
"learning_rate": 0.00019966887195344403,
"loss": 0.6249939799308777,
"num_input_tokens_seen": 129178,
"step": 39,
"train_runtime": 220.3595,
"train_tokens_per_second": 586.215
},
{
"epoch": 0.3225806451612903,
"grad_norm": 0.8344091773033142,
"learning_rate": 0.0001996478654275229,
"loss": 0.6812502145767212,
"num_input_tokens_seen": 132242,
"step": 40,
"train_runtime": 224.1058,
"train_tokens_per_second": 590.087
},
{
"epoch": 0.33064516129032256,
"grad_norm": 0.6059567332267761,
"learning_rate": 0.00019962621408888177,
"loss": 0.6382313966751099,
"num_input_tokens_seen": 135598,
"step": 41,
"train_runtime": 228.1603,
"train_tokens_per_second": 594.31
},
{
"epoch": 0.3387096774193548,
"grad_norm": 0.6195608377456665,
"learning_rate": 0.00019960391807762463,
"loss": 0.6314352750778198,
"num_input_tokens_seen": 139036,
"step": 42,
"train_runtime": 232.2635,
"train_tokens_per_second": 598.613
},
{
"epoch": 0.3467741935483871,
"grad_norm": 0.6370927095413208,
"learning_rate": 0.00019958097753802693,
"loss": 0.6387574672698975,
"num_input_tokens_seen": 142314,
"step": 43,
"train_runtime": 236.252,
"train_tokens_per_second": 602.382
},
{
"epoch": 0.3548387096774194,
"grad_norm": 0.6034757494926453,
"learning_rate": 0.00019955739261853504,
"loss": 0.6299672722816467,
"num_input_tokens_seen": 145730,
"step": 44,
"train_runtime": 240.3213,
"train_tokens_per_second": 606.396
},
{
"epoch": 0.3629032258064516,
"grad_norm": 0.6737009882926941,
"learning_rate": 0.00019953316347176488,
"loss": 0.5970463156700134,
"num_input_tokens_seen": 148950,
"step": 45,
"train_runtime": 244.1671,
"train_tokens_per_second": 610.033
},
{
"epoch": 0.3709677419354839,
"grad_norm": 0.6426769495010376,
"learning_rate": 0.00019950829025450114,
"loss": 0.647551417350769,
"num_input_tokens_seen": 152208,
"step": 46,
"train_runtime": 248.1891,
"train_tokens_per_second": 613.274
},
{
"epoch": 0.3790322580645161,
"grad_norm": 0.6058390736579895,
"learning_rate": 0.0001994827731276963,
"loss": 0.5269473791122437,
"num_input_tokens_seen": 155494,
"step": 47,
"train_runtime": 252.1835,
"train_tokens_per_second": 616.591
},
{
"epoch": 0.3870967741935484,
"grad_norm": 0.6453372836112976,
"learning_rate": 0.00019945661225646946,
"loss": 0.5612773895263672,
"num_input_tokens_seen": 158822,
"step": 48,
"train_runtime": 256.2339,
"train_tokens_per_second": 619.832
},
{
"epoch": 0.3951612903225806,
"grad_norm": 0.5935277342796326,
"learning_rate": 0.0001994298078101054,
"loss": 0.5867321491241455,
"num_input_tokens_seen": 162234,
"step": 49,
"train_runtime": 260.2753,
"train_tokens_per_second": 623.317
},
{
"epoch": 0.4032258064516129,
"grad_norm": 0.7631756663322449,
"learning_rate": 0.00019940235996205333,
"loss": 0.6878418922424316,
"num_input_tokens_seen": 165674,
"step": 50,
"train_runtime": 264.4525,
"train_tokens_per_second": 626.479
},
{
"epoch": 0.4112903225806452,
"grad_norm": 0.695699155330658,
"learning_rate": 0.0001993742688899259,
"loss": 0.5979565978050232,
"num_input_tokens_seen": 169012,
"step": 51,
"train_runtime": 268.4933,
"train_tokens_per_second": 629.483
},
{
"epoch": 0.41935483870967744,
"grad_norm": 0.7715777158737183,
"learning_rate": 0.00019934553477549794,
"loss": 0.596588134765625,
"num_input_tokens_seen": 172396,
"step": 52,
"train_runtime": 272.5489,
"train_tokens_per_second": 632.532
},
{
"epoch": 0.4274193548387097,
"grad_norm": 0.5751280784606934,
"learning_rate": 0.00019931615780470558,
"loss": 0.6130199432373047,
"num_input_tokens_seen": 175766,
"step": 53,
"train_runtime": 276.6214,
"train_tokens_per_second": 635.403
},
{
"epoch": 0.43548387096774194,
"grad_norm": 0.7077270150184631,
"learning_rate": 0.00019928613816764458,
"loss": 0.5735506415367126,
"num_input_tokens_seen": 179034,
"step": 54,
"train_runtime": 280.6184,
"train_tokens_per_second": 637.998
},
{
"epoch": 0.4435483870967742,
"grad_norm": 0.6344470381736755,
"learning_rate": 0.00019925547605856934,
"loss": 0.6677184700965881,
"num_input_tokens_seen": 182380,
"step": 55,
"train_runtime": 284.6764,
"train_tokens_per_second": 640.657
},
{
"epoch": 0.45161290322580644,
"grad_norm": 0.7267054915428162,
"learning_rate": 0.00019922417167589183,
"loss": 0.6288032531738281,
"num_input_tokens_seen": 185574,
"step": 56,
"train_runtime": 288.5655,
"train_tokens_per_second": 643.091
},
{
"epoch": 0.4596774193548387,
"grad_norm": 0.5684869885444641,
"learning_rate": 0.00019919222522217996,
"loss": 0.6335676908493042,
"num_input_tokens_seen": 188970,
"step": 57,
"train_runtime": 292.6622,
"train_tokens_per_second": 645.693
},
{
"epoch": 0.46774193548387094,
"grad_norm": 0.6018741726875305,
"learning_rate": 0.00019915963690415647,
"loss": 0.5417088270187378,
"num_input_tokens_seen": 192282,
"step": 58,
"train_runtime": 296.681,
"train_tokens_per_second": 648.11
},
{
"epoch": 0.47580645161290325,
"grad_norm": 0.6719103455543518,
"learning_rate": 0.00019912640693269752,
"loss": 0.5992182493209839,
"num_input_tokens_seen": 195644,
"step": 59,
"train_runtime": 300.7533,
"train_tokens_per_second": 650.513
},
{
"epoch": 0.4838709677419355,
"grad_norm": 0.657250165939331,
"learning_rate": 0.00019909253552283143,
"loss": 0.6172187328338623,
"num_input_tokens_seen": 198956,
"step": 60,
"train_runtime": 304.8185,
"train_tokens_per_second": 652.703
},
{
"epoch": 0.4838709677419355,
"eval_loss": 2.6003706455230713,
"eval_runtime": 10.4131,
"eval_samples_per_second": 4.994,
"eval_steps_per_second": 2.497,
"num_input_tokens_seen": 198956,
"step": 60
},
{
"epoch": 0.49193548387096775,
"grad_norm": 0.6318468451499939,
"learning_rate": 0.00019905802289373715,
"loss": 0.5561873316764832,
"num_input_tokens_seen": 202314,
"step": 61,
"train_runtime": 323.4182,
"train_tokens_per_second": 625.549
},
{
"epoch": 0.5,
"grad_norm": 0.6109554171562195,
"learning_rate": 0.0001990228692687429,
"loss": 0.5585595369338989,
"num_input_tokens_seen": 205722,
"step": 62,
"train_runtime": 327.4723,
"train_tokens_per_second": 628.212
},
{
"epoch": 0.5080645161290323,
"grad_norm": 0.621971607208252,
"learning_rate": 0.00019898707487532474,
"loss": 0.6442801356315613,
"num_input_tokens_seen": 208882,
"step": 63,
"train_runtime": 331.3395,
"train_tokens_per_second": 630.417
},
{
"epoch": 0.5161290322580645,
"grad_norm": 0.5826683044433594,
"learning_rate": 0.0001989506399451051,
"loss": 0.575531542301178,
"num_input_tokens_seen": 212198,
"step": 64,
"train_runtime": 335.3754,
"train_tokens_per_second": 632.718
},
{
"epoch": 0.5241935483870968,
"grad_norm": 0.6495046615600586,
"learning_rate": 0.0001989135647138513,
"loss": 0.5925536751747131,
"num_input_tokens_seen": 215586,
"step": 65,
"train_runtime": 339.5014,
"train_tokens_per_second": 635.008
},
{
"epoch": 0.532258064516129,
"grad_norm": 0.7227226495742798,
"learning_rate": 0.00019887584942147394,
"loss": 0.6755293607711792,
"num_input_tokens_seen": 218732,
"step": 66,
"train_runtime": 343.3709,
"train_tokens_per_second": 637.014
},
{
"epoch": 0.5403225806451613,
"grad_norm": 0.6853323578834534,
"learning_rate": 0.0001988374943120254,
"loss": 0.6272655129432678,
"num_input_tokens_seen": 221938,
"step": 67,
"train_runtime": 347.2586,
"train_tokens_per_second": 639.114
},
{
"epoch": 0.5483870967741935,
"grad_norm": 0.5388026237487793,
"learning_rate": 0.00019879849963369827,
"loss": 0.6052734851837158,
"num_input_tokens_seen": 225362,
"step": 68,
"train_runtime": 351.4187,
"train_tokens_per_second": 641.292
},
{
"epoch": 0.5564516129032258,
"grad_norm": 0.5527206659317017,
"learning_rate": 0.00019875886563882375,
"loss": 0.5706397294998169,
"num_input_tokens_seen": 228656,
"step": 69,
"train_runtime": 355.6332,
"train_tokens_per_second": 642.955
},
{
"epoch": 0.5645161290322581,
"grad_norm": 0.5685147047042847,
"learning_rate": 0.00019871859258387,
"loss": 0.5797425508499146,
"num_input_tokens_seen": 232052,
"step": 70,
"train_runtime": 359.7474,
"train_tokens_per_second": 645.041
},
{
"epoch": 0.5725806451612904,
"grad_norm": 0.558627188205719,
"learning_rate": 0.00019867768072944045,
"loss": 0.5348191857337952,
"num_input_tokens_seen": 235430,
"step": 71,
"train_runtime": 363.8094,
"train_tokens_per_second": 647.125
},
{
"epoch": 0.5806451612903226,
"grad_norm": 0.6002105474472046,
"learning_rate": 0.00019863613034027224,
"loss": 0.5826228260993958,
"num_input_tokens_seen": 238792,
"step": 72,
"train_runtime": 367.887,
"train_tokens_per_second": 649.091
},
{
"epoch": 0.5887096774193549,
"grad_norm": 0.6451808214187622,
"learning_rate": 0.0001985939416852343,
"loss": 0.6138958930969238,
"num_input_tokens_seen": 242136,
"step": 73,
"train_runtime": 371.9411,
"train_tokens_per_second": 651.006
},
{
"epoch": 0.5967741935483871,
"grad_norm": 0.6479431986808777,
"learning_rate": 0.00019855111503732574,
"loss": 0.5971667170524597,
"num_input_tokens_seen": 245502,
"step": 74,
"train_runtime": 375.9997,
"train_tokens_per_second": 652.931
},
{
"epoch": 0.6048387096774194,
"grad_norm": 0.6030436754226685,
"learning_rate": 0.00019850765067367412,
"loss": 0.5875449776649475,
"num_input_tokens_seen": 248724,
"step": 75,
"train_runtime": 379.8628,
"train_tokens_per_second": 654.773
},
{
"epoch": 0.6129032258064516,
"grad_norm": 0.5927602648735046,
"learning_rate": 0.00019846354887553358,
"loss": 0.5731448531150818,
"num_input_tokens_seen": 252068,
"step": 76,
"train_runtime": 383.878,
"train_tokens_per_second": 656.636
},
{
"epoch": 0.6209677419354839,
"grad_norm": 0.6371795535087585,
"learning_rate": 0.00019841880992828306,
"loss": 0.5935379266738892,
"num_input_tokens_seen": 255328,
"step": 77,
"train_runtime": 387.7359,
"train_tokens_per_second": 658.51
},
{
"epoch": 0.6290322580645161,
"grad_norm": 0.5905424356460571,
"learning_rate": 0.0001983734341214244,
"loss": 0.5362335443496704,
"num_input_tokens_seen": 258440,
"step": 78,
"train_runtime": 391.431,
"train_tokens_per_second": 660.244
},
{
"epoch": 0.6370967741935484,
"grad_norm": 0.6883693337440491,
"learning_rate": 0.00019832742174858052,
"loss": 0.48908287286758423,
"num_input_tokens_seen": 261670,
"step": 79,
"train_runtime": 395.3134,
"train_tokens_per_second": 661.931
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.6175087094306946,
"learning_rate": 0.0001982807731074935,
"loss": 0.5709075331687927,
"num_input_tokens_seen": 265042,
"step": 80,
"train_runtime": 399.3599,
"train_tokens_per_second": 663.667
},
{
"epoch": 0.6532258064516129,
"grad_norm": 0.6919131875038147,
"learning_rate": 0.00019823348850002268,
"loss": 0.5524786710739136,
"num_input_tokens_seen": 268396,
"step": 81,
"train_runtime": 403.3765,
"train_tokens_per_second": 665.373
},
{
"epoch": 0.6612903225806451,
"grad_norm": 0.656579315662384,
"learning_rate": 0.00019818556823214268,
"loss": 0.5900183916091919,
"num_input_tokens_seen": 271554,
"step": 82,
"train_runtime": 407.3032,
"train_tokens_per_second": 666.712
},
{
"epoch": 0.6693548387096774,
"grad_norm": 0.7231733798980713,
"learning_rate": 0.00019813701261394136,
"loss": 0.6131920218467712,
"num_input_tokens_seen": 274730,
"step": 83,
"train_runtime": 411.1381,
"train_tokens_per_second": 668.218
},
{
"epoch": 0.6774193548387096,
"grad_norm": 0.5668199062347412,
"learning_rate": 0.00019808782195961797,
"loss": 0.5624396204948425,
"num_input_tokens_seen": 278100,
"step": 84,
"train_runtime": 415.2265,
"train_tokens_per_second": 669.755
},
{
"epoch": 0.6854838709677419,
"grad_norm": 0.5321037769317627,
"learning_rate": 0.00019803799658748094,
"loss": 0.512347936630249,
"num_input_tokens_seen": 281468,
"step": 85,
"train_runtime": 419.2932,
"train_tokens_per_second": 671.292
},
{
"epoch": 0.6935483870967742,
"grad_norm": 0.569511353969574,
"learning_rate": 0.000197987536819946,
"loss": 0.630191445350647,
"num_input_tokens_seen": 284944,
"step": 86,
"train_runtime": 423.4433,
"train_tokens_per_second": 672.921
},
{
"epoch": 0.7016129032258065,
"grad_norm": 0.5449895858764648,
"learning_rate": 0.0001979364429835339,
"loss": 0.511203944683075,
"num_input_tokens_seen": 288252,
"step": 87,
"train_runtime": 427.4775,
"train_tokens_per_second": 674.309
},
{
"epoch": 0.7096774193548387,
"grad_norm": 0.5464794635772705,
"learning_rate": 0.00019788471540886844,
"loss": 0.5040556192398071,
"num_input_tokens_seen": 291674,
"step": 88,
"train_runtime": 431.587,
"train_tokens_per_second": 675.817
},
{
"epoch": 0.717741935483871,
"grad_norm": 0.5279393792152405,
"learning_rate": 0.0001978323544306743,
"loss": 0.5020922422409058,
"num_input_tokens_seen": 295054,
"step": 89,
"train_runtime": 435.6908,
"train_tokens_per_second": 677.21
},
{
"epoch": 0.7258064516129032,
"grad_norm": 0.8044468760490417,
"learning_rate": 0.00019777936038777483,
"loss": 0.6041148900985718,
"num_input_tokens_seen": 298162,
"step": 90,
"train_runtime": 439.5049,
"train_tokens_per_second": 678.404
},
{
"epoch": 0.7258064516129032,
"eval_loss": 2.553819417953491,
"eval_runtime": 10.3809,
"eval_samples_per_second": 5.009,
"eval_steps_per_second": 2.505,
"num_input_tokens_seen": 298162,
"step": 90
},
{
"epoch": 0.7338709677419355,
"grad_norm": 0.5703001618385315,
"learning_rate": 0.0001977257336230899,
"loss": 0.49885284900665283,
"num_input_tokens_seen": 301500,
"step": 91,
"train_runtime": 458.0467,
"train_tokens_per_second": 658.23
},
{
"epoch": 0.7419354838709677,
"grad_norm": 0.7216835021972656,
"learning_rate": 0.00019767147448363366,
"loss": 0.5518635511398315,
"num_input_tokens_seen": 304624,
"step": 92,
"train_runtime": 461.8691,
"train_tokens_per_second": 659.546
},
{
"epoch": 0.75,
"grad_norm": 0.6921418905258179,
"learning_rate": 0.00019761658332051235,
"loss": 0.575799822807312,
"num_input_tokens_seen": 307956,
"step": 93,
"train_runtime": 465.9047,
"train_tokens_per_second": 660.985
},
{
"epoch": 0.7580645161290323,
"grad_norm": 0.5977121591567993,
"learning_rate": 0.00019756106048892186,
"loss": 0.5771365165710449,
"num_input_tokens_seen": 311354,
"step": 94,
"train_runtime": 470.0334,
"train_tokens_per_second": 662.408
},
{
"epoch": 0.7661290322580645,
"grad_norm": 0.598276674747467,
"learning_rate": 0.00019750490634814572,
"loss": 0.5629148483276367,
"num_input_tokens_seen": 314592,
"step": 95,
"train_runtime": 473.9733,
"train_tokens_per_second": 663.734
},
{
"epoch": 0.7741935483870968,
"grad_norm": 0.5777994394302368,
"learning_rate": 0.00019744812126155245,
"loss": 0.5368872284889221,
"num_input_tokens_seen": 317960,
"step": 96,
"train_runtime": 478.0702,
"train_tokens_per_second": 665.091
},
{
"epoch": 0.782258064516129,
"grad_norm": 0.5279212594032288,
"learning_rate": 0.00019739070559659347,
"loss": 0.5620218515396118,
"num_input_tokens_seen": 321274,
"step": 97,
"train_runtime": 482.1331,
"train_tokens_per_second": 666.359
},
{
"epoch": 0.7903225806451613,
"grad_norm": 0.5498350858688354,
"learning_rate": 0.0001973326597248006,
"loss": 0.5708056092262268,
"num_input_tokens_seen": 324480,
"step": 98,
"train_runtime": 486.061,
"train_tokens_per_second": 667.571
},
{
"epoch": 0.7983870967741935,
"grad_norm": 0.5690199136734009,
"learning_rate": 0.0001972739840217836,
"loss": 0.49778610467910767,
"num_input_tokens_seen": 327678,
"step": 99,
"train_runtime": 489.9181,
"train_tokens_per_second": 668.842
},
{
"epoch": 0.8064516129032258,
"grad_norm": 0.5800296664237976,
"learning_rate": 0.00019721467886722792,
"loss": 0.5485215783119202,
"num_input_tokens_seen": 330914,
"step": 100,
"train_runtime": 493.8199,
"train_tokens_per_second": 670.111
},
{
"epoch": 0.8145161290322581,
"grad_norm": 0.6225796341896057,
"learning_rate": 0.00019715474464489208,
"loss": 0.5637421607971191,
"num_input_tokens_seen": 334256,
"step": 101,
"train_runtime": 497.9641,
"train_tokens_per_second": 671.245
},
{
"epoch": 0.8225806451612904,
"grad_norm": 0.5687134265899658,
"learning_rate": 0.0001970941817426052,
"loss": 0.5229646563529968,
"num_input_tokens_seen": 337722,
"step": 102,
"train_runtime": 502.0618,
"train_tokens_per_second": 672.67
},
{
"epoch": 0.8306451612903226,
"grad_norm": 0.5665640234947205,
"learning_rate": 0.00019703299055226468,
"loss": 0.5211078524589539,
"num_input_tokens_seen": 341038,
"step": 103,
"train_runtime": 506.0768,
"train_tokens_per_second": 673.886
},
{
"epoch": 0.8387096774193549,
"grad_norm": 0.6949163675308228,
"learning_rate": 0.00019697117146983334,
"loss": 0.5490686893463135,
"num_input_tokens_seen": 344066,
"step": 104,
"train_runtime": 509.7166,
"train_tokens_per_second": 675.014
},
{
"epoch": 0.8467741935483871,
"grad_norm": 0.5839992761611938,
"learning_rate": 0.0001969087248953371,
"loss": 0.592645525932312,
"num_input_tokens_seen": 347352,
"step": 105,
"train_runtime": 513.6992,
"train_tokens_per_second": 676.178
},
{
"epoch": 0.8548387096774194,
"grad_norm": 0.5174548029899597,
"learning_rate": 0.00019684565123286244,
"loss": 0.5339330434799194,
"num_input_tokens_seen": 350654,
"step": 106,
"train_runtime": 517.6849,
"train_tokens_per_second": 677.35
},
{
"epoch": 0.8629032258064516,
"grad_norm": 0.5480656027793884,
"learning_rate": 0.00019678195089055346,
"loss": 0.5675982236862183,
"num_input_tokens_seen": 353954,
"step": 107,
"train_runtime": 521.5558,
"train_tokens_per_second": 678.65
},
{
"epoch": 0.8709677419354839,
"grad_norm": 0.526905357837677,
"learning_rate": 0.00019671762428060966,
"loss": 0.5410143136978149,
"num_input_tokens_seen": 357348,
"step": 108,
"train_runtime": 525.6225,
"train_tokens_per_second": 679.857
},
{
"epoch": 0.8790322580645161,
"grad_norm": 0.5543181896209717,
"learning_rate": 0.00019665267181928292,
"loss": 0.5199689865112305,
"num_input_tokens_seen": 360628,
"step": 109,
"train_runtime": 529.6075,
"train_tokens_per_second": 680.934
},
{
"epoch": 0.8870967741935484,
"grad_norm": 0.4881467819213867,
"learning_rate": 0.00019658709392687506,
"loss": 0.5179097056388855,
"num_input_tokens_seen": 364030,
"step": 110,
"train_runtime": 533.621,
"train_tokens_per_second": 682.188
},
{
"epoch": 0.8951612903225806,
"grad_norm": 0.5893236398696899,
"learning_rate": 0.00019652089102773488,
"loss": 0.6043106317520142,
"num_input_tokens_seen": 367360,
"step": 111,
"train_runtime": 537.6326,
"train_tokens_per_second": 683.292
},
{
"epoch": 0.9032258064516129,
"grad_norm": 0.7594029903411865,
"learning_rate": 0.00019645406355025565,
"loss": 0.501505434513092,
"num_input_tokens_seen": 370518,
"step": 112,
"train_runtime": 541.4395,
"train_tokens_per_second": 684.32
},
{
"epoch": 0.9112903225806451,
"grad_norm": 0.5261175632476807,
"learning_rate": 0.00019638661192687216,
"loss": 0.503300666809082,
"num_input_tokens_seen": 373828,
"step": 113,
"train_runtime": 545.4508,
"train_tokens_per_second": 685.356
},
{
"epoch": 0.9193548387096774,
"grad_norm": 0.5733628869056702,
"learning_rate": 0.00019631853659405807,
"loss": 0.5247334241867065,
"num_input_tokens_seen": 377226,
"step": 114,
"train_runtime": 549.523,
"train_tokens_per_second": 686.461
},
{
"epoch": 0.9274193548387096,
"grad_norm": 0.5294317007064819,
"learning_rate": 0.000196249837992323,
"loss": 0.503804087638855,
"num_input_tokens_seen": 380624,
"step": 115,
"train_runtime": 553.596,
"train_tokens_per_second": 687.548
},
{
"epoch": 0.9354838709677419,
"grad_norm": 0.5543656349182129,
"learning_rate": 0.0001961805165662096,
"loss": 0.4710533916950226,
"num_input_tokens_seen": 384012,
"step": 116,
"train_runtime": 557.6711,
"train_tokens_per_second": 688.599
},
{
"epoch": 0.9435483870967742,
"grad_norm": 0.6034952402114868,
"learning_rate": 0.00019611057276429085,
"loss": 0.5092794299125671,
"num_input_tokens_seen": 387364,
"step": 117,
"train_runtime": 561.7092,
"train_tokens_per_second": 689.617
},
{
"epoch": 0.9516129032258065,
"grad_norm": 0.5937938690185547,
"learning_rate": 0.00019604000703916705,
"loss": 0.5077915191650391,
"num_input_tokens_seen": 390740,
"step": 118,
"train_runtime": 565.7447,
"train_tokens_per_second": 690.665
},
{
"epoch": 0.9596774193548387,
"grad_norm": 0.5722475051879883,
"learning_rate": 0.00019596881984746287,
"loss": 0.5252417325973511,
"num_input_tokens_seen": 394002,
"step": 119,
"train_runtime": 569.7398,
"train_tokens_per_second": 691.547
},
{
"epoch": 0.967741935483871,
"grad_norm": 0.6592346429824829,
"learning_rate": 0.00019589701164982452,
"loss": 0.5243955850601196,
"num_input_tokens_seen": 397036,
"step": 120,
"train_runtime": 573.388,
"train_tokens_per_second": 692.439
},
{
"epoch": 0.967741935483871,
"eval_loss": 2.329965114593506,
"eval_runtime": 10.4196,
"eval_samples_per_second": 4.991,
"eval_steps_per_second": 2.495,
"num_input_tokens_seen": 397036,
"step": 120
},
{
"epoch": 0.9758064516129032,
"grad_norm": 0.6564823985099792,
"learning_rate": 0.00019582458291091663,
"loss": 0.605979859828949,
"num_input_tokens_seen": 400216,
"step": 121,
"train_runtime": 591.5965,
"train_tokens_per_second": 676.502
},
{
"epoch": 0.9838709677419355,
"grad_norm": 0.5364957451820374,
"learning_rate": 0.0001957515340994193,
"loss": 0.5199841260910034,
"num_input_tokens_seen": 403522,
"step": 122,
"train_runtime": 595.5873,
"train_tokens_per_second": 677.519
},
{
"epoch": 0.9919354838709677,
"grad_norm": 0.5787367820739746,
"learning_rate": 0.000195677865688025,
"loss": 0.5768128633499146,
"num_input_tokens_seen": 406874,
"step": 123,
"train_runtime": 599.6315,
"train_tokens_per_second": 678.54
},
{
"epoch": 1.0,
"grad_norm": 0.6326342821121216,
"learning_rate": 0.00019560357815343577,
"loss": 0.5327367782592773,
"num_input_tokens_seen": 410114,
"step": 124,
"train_runtime": 603.6134,
"train_tokens_per_second": 679.432
},
{
"epoch": 1.0080645161290323,
"grad_norm": 0.5442702174186707,
"learning_rate": 0.00019552867197635974,
"loss": 0.3973138928413391,
"num_input_tokens_seen": 413264,
"step": 125,
"train_runtime": 607.4187,
"train_tokens_per_second": 680.361
},
{
"epoch": 1.0161290322580645,
"grad_norm": 0.4810558259487152,
"learning_rate": 0.00019545314764150837,
"loss": 0.3840460777282715,
"num_input_tokens_seen": 416432,
"step": 126,
"train_runtime": 611.2764,
"train_tokens_per_second": 681.25
},
{
"epoch": 1.0241935483870968,
"grad_norm": 0.5415441989898682,
"learning_rate": 0.00019537700563759304,
"loss": 0.4207889437675476,
"num_input_tokens_seen": 419814,
"step": 127,
"train_runtime": 615.3399,
"train_tokens_per_second": 682.247
},
{
"epoch": 1.032258064516129,
"grad_norm": 0.5859441757202148,
"learning_rate": 0.00019530024645732206,
"loss": 0.4236603379249573,
"num_input_tokens_seen": 423168,
"step": 128,
"train_runtime": 619.4394,
"train_tokens_per_second": 683.147
},
{
"epoch": 1.0403225806451613,
"grad_norm": 0.6688714623451233,
"learning_rate": 0.00019522287059739753,
"loss": 0.446853905916214,
"num_input_tokens_seen": 426248,
"step": 129,
"train_runtime": 623.1252,
"train_tokens_per_second": 684.049
},
{
"epoch": 1.0483870967741935,
"grad_norm": 0.6689234972000122,
"learning_rate": 0.00019514487855851184,
"loss": 0.44393691420555115,
"num_input_tokens_seen": 429726,
"step": 130,
"train_runtime": 627.2816,
"train_tokens_per_second": 685.061
},
{
"epoch": 1.0564516129032258,
"grad_norm": 0.623140275478363,
"learning_rate": 0.00019506627084534483,
"loss": 0.44002050161361694,
"num_input_tokens_seen": 432912,
"step": 131,
"train_runtime": 631.1917,
"train_tokens_per_second": 685.865
},
{
"epoch": 1.064516129032258,
"grad_norm": 0.5681182146072388,
"learning_rate": 0.00019498704796656018,
"loss": 0.44711700081825256,
"num_input_tokens_seen": 436246,
"step": 132,
"train_runtime": 635.2752,
"train_tokens_per_second": 686.704
},
{
"epoch": 1.0725806451612903,
"grad_norm": 0.5604093670845032,
"learning_rate": 0.00019490721043480226,
"loss": 0.44671785831451416,
"num_input_tokens_seen": 439446,
"step": 133,
"train_runtime": 639.1181,
"train_tokens_per_second": 687.582
},
{
"epoch": 1.0806451612903225,
"grad_norm": 0.550987720489502,
"learning_rate": 0.00019482675876669286,
"loss": 0.41002413630485535,
"num_input_tokens_seen": 442716,
"step": 134,
"train_runtime": 643.1023,
"train_tokens_per_second": 688.407
},
{
"epoch": 1.0887096774193548,
"grad_norm": 0.48927298188209534,
"learning_rate": 0.00019474569348282774,
"loss": 0.4202808737754822,
"num_input_tokens_seen": 445948,
"step": 135,
"train_runtime": 646.9886,
"train_tokens_per_second": 689.267
},
{
"epoch": 1.096774193548387,
"grad_norm": 0.6476680040359497,
"learning_rate": 0.0001946640151077734,
"loss": 0.45923081040382385,
"num_input_tokens_seen": 449172,
"step": 136,
"train_runtime": 650.8352,
"train_tokens_per_second": 690.147
},
{
"epoch": 1.1048387096774193,
"grad_norm": 0.6251439452171326,
"learning_rate": 0.00019458172417006347,
"loss": 0.5086702108383179,
"num_input_tokens_seen": 452512,
"step": 137,
"train_runtime": 654.8721,
"train_tokens_per_second": 690.993
},
{
"epoch": 1.1129032258064515,
"grad_norm": 0.5364375114440918,
"learning_rate": 0.00019449882120219555,
"loss": 0.4321169853210449,
"num_input_tokens_seen": 455922,
"step": 138,
"train_runtime": 658.9413,
"train_tokens_per_second": 691.901
},
{
"epoch": 1.120967741935484,
"grad_norm": 0.5967617630958557,
"learning_rate": 0.00019441530674062753,
"loss": 0.4385676681995392,
"num_input_tokens_seen": 459246,
"step": 139,
"train_runtime": 662.9535,
"train_tokens_per_second": 692.727
},
{
"epoch": 1.129032258064516,
"grad_norm": 0.6668894290924072,
"learning_rate": 0.0001943311813257743,
"loss": 0.38102924823760986,
"num_input_tokens_seen": 462644,
"step": 140,
"train_runtime": 667.0197,
"train_tokens_per_second": 693.599
},
{
"epoch": 1.1370967741935485,
"grad_norm": 0.6523346900939941,
"learning_rate": 0.00019424644550200415,
"loss": 0.45807939767837524,
"num_input_tokens_seen": 466004,
"step": 141,
"train_runtime": 671.0685,
"train_tokens_per_second": 694.421
},
{
"epoch": 1.1451612903225807,
"grad_norm": 0.607889711856842,
"learning_rate": 0.00019416109981763526,
"loss": 0.4723294675350189,
"num_input_tokens_seen": 469474,
"step": 142,
"train_runtime": 675.1673,
"train_tokens_per_second": 695.345
},
{
"epoch": 1.153225806451613,
"grad_norm": 0.5858147740364075,
"learning_rate": 0.00019407514482493214,
"loss": 0.39813458919525146,
"num_input_tokens_seen": 472830,
"step": 143,
"train_runtime": 679.2026,
"train_tokens_per_second": 696.155
},
{
"epoch": 1.1612903225806452,
"grad_norm": 0.5359724164009094,
"learning_rate": 0.00019398858108010217,
"loss": 0.46676114201545715,
"num_input_tokens_seen": 476146,
"step": 144,
"train_runtime": 683.2422,
"train_tokens_per_second": 696.892
},
{
"epoch": 1.1693548387096775,
"grad_norm": 0.9037399888038635,
"learning_rate": 0.0001939014091432918,
"loss": 0.4816886782646179,
"num_input_tokens_seen": 479388,
"step": 145,
"train_runtime": 687.1531,
"train_tokens_per_second": 697.644
},
{
"epoch": 1.1774193548387097,
"grad_norm": 0.5351356267929077,
"learning_rate": 0.00019381362957858312,
"loss": 0.4616449475288391,
"num_input_tokens_seen": 482820,
"step": 146,
"train_runtime": 691.2213,
"train_tokens_per_second": 698.503
},
{
"epoch": 1.185483870967742,
"grad_norm": 0.5950395464897156,
"learning_rate": 0.00019372524295399013,
"loss": 0.4098345935344696,
"num_input_tokens_seen": 485956,
"step": 147,
"train_runtime": 694.9967,
"train_tokens_per_second": 699.221
},
{
"epoch": 1.1935483870967742,
"grad_norm": 0.496609091758728,
"learning_rate": 0.00019363624984145502,
"loss": 0.4433990716934204,
"num_input_tokens_seen": 489266,
"step": 148,
"train_runtime": 698.9825,
"train_tokens_per_second": 699.969
},
{
"epoch": 1.2016129032258065,
"grad_norm": 0.5592359304428101,
"learning_rate": 0.00019354665081684446,
"loss": 0.39385470747947693,
"num_input_tokens_seen": 492712,
"step": 149,
"train_runtime": 703.0554,
"train_tokens_per_second": 700.815
},
{
"epoch": 1.2096774193548387,
"grad_norm": 0.5236433744430542,
"learning_rate": 0.0001934564464599461,
"loss": 0.366371750831604,
"num_input_tokens_seen": 496182,
"step": 150,
"train_runtime": 707.1645,
"train_tokens_per_second": 701.65
},
{
"epoch": 1.2096774193548387,
"eval_loss": 2.3865628242492676,
"eval_runtime": 10.3898,
"eval_samples_per_second": 5.005,
"eval_steps_per_second": 2.502,
"num_input_tokens_seen": 496182,
"step": 150
},
{
"epoch": 1.217741935483871,
"grad_norm": 0.6336263418197632,
"learning_rate": 0.00019336563735446446,
"loss": 0.46447595953941345,
"num_input_tokens_seen": 499482,
"step": 151,
"train_runtime": 725.6025,
"train_tokens_per_second": 688.369
},
{
"epoch": 1.2258064516129032,
"grad_norm": 0.6058643460273743,
"learning_rate": 0.00019327422408801744,
"loss": 0.4268086552619934,
"num_input_tokens_seen": 502832,
"step": 152,
"train_runtime": 729.6116,
"train_tokens_per_second": 689.178
},
{
"epoch": 1.2338709677419355,
"grad_norm": 0.5647494792938232,
"learning_rate": 0.0001931822072521323,
"loss": 0.4002935588359833,
"num_input_tokens_seen": 506196,
"step": 153,
"train_runtime": 733.6971,
"train_tokens_per_second": 689.925
},
{
"epoch": 1.2419354838709677,
"grad_norm": 0.6857813000679016,
"learning_rate": 0.00019308958744224217,
"loss": 0.41752880811691284,
"num_input_tokens_seen": 509348,
"step": 154,
"train_runtime": 737.5425,
"train_tokens_per_second": 690.602
},
{
"epoch": 1.25,
"grad_norm": 0.5805030465126038,
"learning_rate": 0.00019299636525768173,
"loss": 0.4200356602668762,
"num_input_tokens_seen": 512728,
"step": 155,
"train_runtime": 741.6553,
"train_tokens_per_second": 691.329
},
{
"epoch": 1.2580645161290323,
"grad_norm": 1.3187825679779053,
"learning_rate": 0.00019290254130168374,
"loss": 0.413343608379364,
"num_input_tokens_seen": 516078,
"step": 156,
"train_runtime": 745.6977,
"train_tokens_per_second": 692.074
},
{
"epoch": 1.2661290322580645,
"grad_norm": 0.7968809604644775,
"learning_rate": 0.00019280811618137484,
"loss": 0.4406548738479614,
"num_input_tokens_seen": 519482,
"step": 157,
"train_runtime": 749.8044,
"train_tokens_per_second": 692.823
},
{
"epoch": 1.2741935483870968,
"grad_norm": 0.5864307880401611,
"learning_rate": 0.00019271309050777183,
"loss": 0.44008421897888184,
"num_input_tokens_seen": 522758,
"step": 158,
"train_runtime": 753.7142,
"train_tokens_per_second": 693.576
},
{
"epoch": 1.282258064516129,
"grad_norm": 0.5749614238739014,
"learning_rate": 0.00019261746489577765,
"loss": 0.3923473656177521,
"num_input_tokens_seen": 525784,
"step": 159,
"train_runtime": 757.3998,
"train_tokens_per_second": 694.196
},
{
"epoch": 1.2903225806451613,
"grad_norm": 0.5030391216278076,
"learning_rate": 0.00019252123996417738,
"loss": 0.37407517433166504,
"num_input_tokens_seen": 529130,
"step": 160,
"train_runtime": 761.4335,
"train_tokens_per_second": 694.913
},
{
"epoch": 1.2983870967741935,
"grad_norm": 0.6000431180000305,
"learning_rate": 0.00019242441633563417,
"loss": 0.43879765272140503,
"num_input_tokens_seen": 532308,
"step": 161,
"train_runtime": 765.2898,
"train_tokens_per_second": 695.564
},
{
"epoch": 1.3064516129032258,
"grad_norm": 0.5326035022735596,
"learning_rate": 0.00019232699463668542,
"loss": 0.38204699754714966,
"num_input_tokens_seen": 535662,
"step": 162,
"train_runtime": 769.3512,
"train_tokens_per_second": 696.252
},
{
"epoch": 1.314516129032258,
"grad_norm": 0.6022024154663086,
"learning_rate": 0.00019222897549773848,
"loss": 0.4019436240196228,
"num_input_tokens_seen": 539056,
"step": 163,
"train_runtime": 773.4493,
"train_tokens_per_second": 696.951
},
{
"epoch": 1.3225806451612903,
"grad_norm": 0.5885249972343445,
"learning_rate": 0.0001921303595530667,
"loss": 0.39464306831359863,
"num_input_tokens_seen": 542450,
"step": 164,
"train_runtime": 777.4896,
"train_tokens_per_second": 697.694
},
{
"epoch": 1.3306451612903225,
"grad_norm": 0.7338830828666687,
"learning_rate": 0.00019203114744080542,
"loss": 0.47657448053359985,
"num_input_tokens_seen": 545796,
"step": 165,
"train_runtime": 781.5136,
"train_tokens_per_second": 698.383
},
{
"epoch": 1.3387096774193548,
"grad_norm": 0.6826189160346985,
"learning_rate": 0.0001919313398029475,
"loss": 0.4395520091056824,
"num_input_tokens_seen": 548960,
"step": 166,
"train_runtime": 785.3294,
"train_tokens_per_second": 699.019
},
{
"epoch": 1.346774193548387,
"grad_norm": 0.5975714325904846,
"learning_rate": 0.00019183093728533966,
"loss": 0.42339080572128296,
"num_input_tokens_seen": 552310,
"step": 167,
"train_runtime": 789.3433,
"train_tokens_per_second": 699.708
},
{
"epoch": 1.3548387096774195,
"grad_norm": 0.5949293375015259,
"learning_rate": 0.00019172994053767784,
"loss": 0.45283418893814087,
"num_input_tokens_seen": 555660,
"step": 168,
"train_runtime": 793.3402,
"train_tokens_per_second": 700.406
},
{
"epoch": 1.3629032258064515,
"grad_norm": 0.5816603302955627,
"learning_rate": 0.0001916283502135033,
"loss": 0.43116986751556396,
"num_input_tokens_seen": 558942,
"step": 169,
"train_runtime": 797.3188,
"train_tokens_per_second": 701.027
},
{
"epoch": 1.370967741935484,
"grad_norm": 0.6768733859062195,
"learning_rate": 0.00019152616697019822,
"loss": 0.465903103351593,
"num_input_tokens_seen": 561914,
"step": 170,
"train_runtime": 800.9084,
"train_tokens_per_second": 701.596
},
{
"epoch": 1.379032258064516,
"grad_norm": 0.5320206880569458,
"learning_rate": 0.0001914233914689815,
"loss": 0.4548630714416504,
"num_input_tokens_seen": 565128,
"step": 171,
"train_runtime": 804.723,
"train_tokens_per_second": 702.264
},
{
"epoch": 1.3870967741935485,
"grad_norm": 0.5698583126068115,
"learning_rate": 0.00019132002437490458,
"loss": 0.41211193799972534,
"num_input_tokens_seen": 568492,
"step": 172,
"train_runtime": 808.7513,
"train_tokens_per_second": 702.926
},
{
"epoch": 1.3951612903225805,
"grad_norm": 0.6660341620445251,
"learning_rate": 0.00019121606635684696,
"loss": 0.4983174800872803,
"num_input_tokens_seen": 571838,
"step": 173,
"train_runtime": 812.784,
"train_tokens_per_second": 703.555
},
{
"epoch": 1.403225806451613,
"grad_norm": 0.5148375630378723,
"learning_rate": 0.00019111151808751196,
"loss": 0.3650531768798828,
"num_input_tokens_seen": 575172,
"step": 174,
"train_runtime": 816.8205,
"train_tokens_per_second": 704.16
},
{
"epoch": 1.4112903225806452,
"grad_norm": 0.680073082447052,
"learning_rate": 0.00019100638024342244,
"loss": 0.4350316524505615,
"num_input_tokens_seen": 578472,
"step": 175,
"train_runtime": 820.7995,
"train_tokens_per_second": 704.767
},
{
"epoch": 1.4193548387096775,
"grad_norm": 0.5539473295211792,
"learning_rate": 0.00019090065350491626,
"loss": 0.4079291522502899,
"num_input_tokens_seen": 581854,
"step": 176,
"train_runtime": 824.8486,
"train_tokens_per_second": 705.407
},
{
"epoch": 1.4274193548387097,
"grad_norm": 0.6199740171432495,
"learning_rate": 0.00019079433855614201,
"loss": 0.4282647371292114,
"num_input_tokens_seen": 585240,
"step": 177,
"train_runtime": 828.8684,
"train_tokens_per_second": 706.071
},
{
"epoch": 1.435483870967742,
"grad_norm": 0.7518092393875122,
"learning_rate": 0.00019068743608505455,
"loss": 0.4538434147834778,
"num_input_tokens_seen": 588466,
"step": 178,
"train_runtime": 832.705,
"train_tokens_per_second": 706.692
},
{
"epoch": 1.4435483870967742,
"grad_norm": 0.6203386187553406,
"learning_rate": 0.0001905799467834105,
"loss": 0.4033154547214508,
"num_input_tokens_seen": 591716,
"step": 179,
"train_runtime": 836.5606,
"train_tokens_per_second": 707.32
},
{
"epoch": 1.4516129032258065,
"grad_norm": 0.5324673056602478,
"learning_rate": 0.00019047187134676387,
"loss": 0.40287455916404724,
"num_input_tokens_seen": 595124,
"step": 180,
"train_runtime": 840.6426,
"train_tokens_per_second": 707.939
},
{
"epoch": 1.4516129032258065,
"eval_loss": 2.244081735610962,
"eval_runtime": 10.2876,
"eval_samples_per_second": 5.055,
"eval_steps_per_second": 2.527,
"num_input_tokens_seen": 595124,
"step": 180
},
{
"epoch": 1.4596774193548387,
"grad_norm": 0.5803776979446411,
"learning_rate": 0.0001903632104744614,
"loss": 0.40973353385925293,
"num_input_tokens_seen": 598560,
"step": 181,
"train_runtime": 859.0393,
"train_tokens_per_second": 696.778
},
{
"epoch": 1.467741935483871,
"grad_norm": 0.6523128747940063,
"learning_rate": 0.00019025396486963827,
"loss": 0.46758562326431274,
"num_input_tokens_seen": 601832,
"step": 182,
"train_runtime": 862.9801,
"train_tokens_per_second": 697.388
},
{
"epoch": 1.4758064516129032,
"grad_norm": 0.5893530249595642,
"learning_rate": 0.0001901441352392133,
"loss": 0.4586262106895447,
"num_input_tokens_seen": 605124,
"step": 183,
"train_runtime": 866.9954,
"train_tokens_per_second": 697.955
},
{
"epoch": 1.4838709677419355,
"grad_norm": 0.5453356504440308,
"learning_rate": 0.00019003372229388452,
"loss": 0.44551703333854675,
"num_input_tokens_seen": 608550,
"step": 184,
"train_runtime": 871.1073,
"train_tokens_per_second": 698.594
},
{
"epoch": 1.4919354838709677,
"grad_norm": 0.5781370997428894,
"learning_rate": 0.0001899227267481246,
"loss": 0.47110262513160706,
"num_input_tokens_seen": 611912,
"step": 185,
"train_runtime": 875.213,
"train_tokens_per_second": 699.158
},
{
"epoch": 1.5,
"grad_norm": 0.591170608997345,
"learning_rate": 0.00018981114932017609,
"loss": 0.4550918936729431,
"num_input_tokens_seen": 615114,
"step": 186,
"train_runtime": 879.1905,
"train_tokens_per_second": 699.637
},
{
"epoch": 1.5080645161290323,
"grad_norm": 0.5168197751045227,
"learning_rate": 0.00018969899073204686,
"loss": 0.39351943135261536,
"num_input_tokens_seen": 618532,
"step": 187,
"train_runtime": 883.2978,
"train_tokens_per_second": 700.253
},
{
"epoch": 1.5161290322580645,
"grad_norm": 0.6083711385726929,
"learning_rate": 0.00018958625170950545,
"loss": 0.43661871552467346,
"num_input_tokens_seen": 621584,
"step": 188,
"train_runtime": 887.0132,
"train_tokens_per_second": 700.761
},
{
"epoch": 1.5241935483870968,
"grad_norm": 0.5687974095344543,
"learning_rate": 0.00018947293298207635,
"loss": 0.45791560411453247,
"num_input_tokens_seen": 624890,
"step": 189,
"train_runtime": 891.0431,
"train_tokens_per_second": 701.302
},
{
"epoch": 1.532258064516129,
"grad_norm": 0.747931957244873,
"learning_rate": 0.00018935903528303523,
"loss": 0.491569459438324,
"num_input_tokens_seen": 627898,
"step": 190,
"train_runtime": 894.7166,
"train_tokens_per_second": 701.784
},
{
"epoch": 1.5403225806451613,
"grad_norm": 0.49939557909965515,
"learning_rate": 0.0001892445593494042,
"loss": 0.3926829695701599,
"num_input_tokens_seen": 631260,
"step": 191,
"train_runtime": 898.7539,
"train_tokens_per_second": 702.372
},
{
"epoch": 1.5483870967741935,
"grad_norm": 0.5808703303337097,
"learning_rate": 0.0001891295059219472,
"loss": 0.4172174334526062,
"num_input_tokens_seen": 634598,
"step": 192,
"train_runtime": 902.7756,
"train_tokens_per_second": 702.941
},
{
"epoch": 1.5564516129032258,
"grad_norm": 0.573276937007904,
"learning_rate": 0.00018901387574516497,
"loss": 0.4499248266220093,
"num_input_tokens_seen": 637776,
"step": 193,
"train_runtime": 906.6301,
"train_tokens_per_second": 703.458
},
{
"epoch": 1.564516129032258,
"grad_norm": 2.77301287651062,
"learning_rate": 0.00018889766956729044,
"loss": 0.43817731738090515,
"num_input_tokens_seen": 641100,
"step": 194,
"train_runtime": 910.6441,
"train_tokens_per_second": 704.007
},
{
"epoch": 1.5725806451612905,
"grad_norm": 0.5687347054481506,
"learning_rate": 0.00018878088814028364,
"loss": 0.41259127855300903,
"num_input_tokens_seen": 644472,
"step": 195,
"train_runtime": 914.6988,
"train_tokens_per_second": 704.573
},
{
"epoch": 1.5806451612903225,
"grad_norm": 0.5743884444236755,
"learning_rate": 0.00018866353221982718,
"loss": 0.371239572763443,
"num_input_tokens_seen": 647738,
"step": 196,
"train_runtime": 918.7012,
"train_tokens_per_second": 705.058
},
{
"epoch": 1.588709677419355,
"grad_norm": 0.5498597621917725,
"learning_rate": 0.000188545602565321,
"loss": 0.453549861907959,
"num_input_tokens_seen": 651030,
"step": 197,
"train_runtime": 922.5884,
"train_tokens_per_second": 705.656
},
{
"epoch": 1.596774193548387,
"grad_norm": 0.6513309478759766,
"learning_rate": 0.00018842709993987776,
"loss": 0.4566512107849121,
"num_input_tokens_seen": 654362,
"step": 198,
"train_runtime": 926.5867,
"train_tokens_per_second": 706.207
},
{
"epoch": 1.6048387096774195,
"grad_norm": 0.6124453544616699,
"learning_rate": 0.00018830802511031762,
"loss": 0.406619131565094,
"num_input_tokens_seen": 657668,
"step": 199,
"train_runtime": 930.5442,
"train_tokens_per_second": 706.756
},
{
"epoch": 1.6129032258064515,
"grad_norm": 0.5730791091918945,
"learning_rate": 0.0001881883788471636,
"loss": 0.4061928987503052,
"num_input_tokens_seen": 661006,
"step": 200,
"train_runtime": 934.5634,
"train_tokens_per_second": 707.289
},
{
"epoch": 1.620967741935484,
"grad_norm": 0.5406919121742249,
"learning_rate": 0.00018806816192463625,
"loss": 0.4146749973297119,
"num_input_tokens_seen": 664466,
"step": 201,
"train_runtime": 938.6171,
"train_tokens_per_second": 707.92
},
{
"epoch": 1.629032258064516,
"grad_norm": 0.6002207398414612,
"learning_rate": 0.0001879473751206489,
"loss": 0.4497522711753845,
"num_input_tokens_seen": 667634,
"step": 202,
"train_runtime": 942.4299,
"train_tokens_per_second": 708.418
},
{
"epoch": 1.6370967741935485,
"grad_norm": 0.5770663619041443,
"learning_rate": 0.00018782601921680256,
"loss": 0.4500475823879242,
"num_input_tokens_seen": 671080,
"step": 203,
"train_runtime": 946.5233,
"train_tokens_per_second": 708.995
},
{
"epoch": 1.6451612903225805,
"grad_norm": 0.6245407462120056,
"learning_rate": 0.00018770409499838073,
"loss": 0.49075624346733093,
"num_input_tokens_seen": 674508,
"step": 204,
"train_runtime": 950.6315,
"train_tokens_per_second": 709.537
},
{
"epoch": 1.653225806451613,
"grad_norm": 0.589747965335846,
"learning_rate": 0.0001875816032543445,
"loss": 0.4112730324268341,
"num_input_tokens_seen": 677700,
"step": 205,
"train_runtime": 954.474,
"train_tokens_per_second": 710.025
},
{
"epoch": 1.661290322580645,
"grad_norm": 0.5470958352088928,
"learning_rate": 0.00018745854477732733,
"loss": 0.42240893840789795,
"num_input_tokens_seen": 680978,
"step": 206,
"train_runtime": 958.4736,
"train_tokens_per_second": 710.482
},
{
"epoch": 1.6693548387096775,
"grad_norm": 0.5093806385993958,
"learning_rate": 0.00018733492036363005,
"loss": 0.40931662917137146,
"num_input_tokens_seen": 684328,
"step": 207,
"train_runtime": 962.4879,
"train_tokens_per_second": 710.999
},
{
"epoch": 1.6774193548387095,
"grad_norm": 0.6141051650047302,
"learning_rate": 0.0001872107308132155,
"loss": 0.4374140501022339,
"num_input_tokens_seen": 687676,
"step": 208,
"train_runtime": 966.4877,
"train_tokens_per_second": 711.521
},
{
"epoch": 1.685483870967742,
"grad_norm": 0.5096253156661987,
"learning_rate": 0.00018708597692970353,
"loss": 0.3743232488632202,
"num_input_tokens_seen": 691000,
"step": 209,
"train_runtime": 970.5016,
"train_tokens_per_second": 712.003
},
{
"epoch": 1.6935483870967742,
"grad_norm": 0.6781167984008789,
"learning_rate": 0.00018696065952036571,
"loss": 0.45937836170196533,
"num_input_tokens_seen": 694156,
"step": 210,
"train_runtime": 974.3365,
"train_tokens_per_second": 712.44
},
{
"epoch": 1.6935483870967742,
"eval_loss": 2.2753806114196777,
"eval_runtime": 10.368,
"eval_samples_per_second": 5.015,
"eval_steps_per_second": 2.508,
"num_input_tokens_seen": 694156,
"step": 210
},
{
"epoch": 1.7016129032258065,
"grad_norm": 0.5530876517295837,
"learning_rate": 0.00018683477939612021,
"loss": 0.39176639914512634,
"num_input_tokens_seen": 697510,
"step": 211,
"train_runtime": 992.7122,
"train_tokens_per_second": 702.631
},
{
"epoch": 1.7096774193548387,
"grad_norm": 0.5276106595993042,
"learning_rate": 0.0001867083373715264,
"loss": 0.3990570604801178,
"num_input_tokens_seen": 700884,
"step": 212,
"train_runtime": 996.7415,
"train_tokens_per_second": 703.175
},
{
"epoch": 1.717741935483871,
"grad_norm": 0.6544950008392334,
"learning_rate": 0.00018658133426477965,
"loss": 0.477338969707489,
"num_input_tokens_seen": 704288,
"step": 213,
"train_runtime": 1000.8249,
"train_tokens_per_second": 703.708
},
{
"epoch": 1.7258064516129032,
"grad_norm": 0.6669942736625671,
"learning_rate": 0.00018645377089770616,
"loss": 0.41006964445114136,
"num_input_tokens_seen": 707404,
"step": 214,
"train_runtime": 1004.6485,
"train_tokens_per_second": 704.131
},
{
"epoch": 1.7338709677419355,
"grad_norm": 0.6127757430076599,
"learning_rate": 0.00018632564809575742,
"loss": 0.4169791340827942,
"num_input_tokens_seen": 710738,
"step": 215,
"train_runtime": 1008.6929,
"train_tokens_per_second": 704.613
},
{
"epoch": 1.7419354838709677,
"grad_norm": 0.6781405210494995,
"learning_rate": 0.00018619696668800492,
"loss": 0.49052947759628296,
"num_input_tokens_seen": 714052,
"step": 216,
"train_runtime": 1012.7035,
"train_tokens_per_second": 705.095
},
{
"epoch": 1.75,
"grad_norm": 0.5324527621269226,
"learning_rate": 0.00018606772750713504,
"loss": 0.4169653654098511,
"num_input_tokens_seen": 717406,
"step": 217,
"train_runtime": 1016.803,
"train_tokens_per_second": 705.551
},
{
"epoch": 1.7580645161290323,
"grad_norm": 0.5233943462371826,
"learning_rate": 0.00018593793138944328,
"loss": 0.4026474058628082,
"num_input_tokens_seen": 720724,
"step": 218,
"train_runtime": 1020.857,
"train_tokens_per_second": 705.999
},
{
"epoch": 1.7661290322580645,
"grad_norm": 0.5787035226821899,
"learning_rate": 0.0001858075791748291,
"loss": 0.501507043838501,
"num_input_tokens_seen": 724156,
"step": 219,
"train_runtime": 1025.0039,
"train_tokens_per_second": 706.491
},
{
"epoch": 1.7741935483870968,
"grad_norm": 0.5081259608268738,
"learning_rate": 0.0001856766717067904,
"loss": 0.41434332728385925,
"num_input_tokens_seen": 727386,
"step": 220,
"train_runtime": 1029.0266,
"train_tokens_per_second": 706.868
},
{
"epoch": 1.782258064516129,
"grad_norm": 0.4951300024986267,
"learning_rate": 0.00018554520983241814,
"loss": 0.4030165374279022,
"num_input_tokens_seen": 730576,
"step": 221,
"train_runtime": 1032.9949,
"train_tokens_per_second": 707.241
},
{
"epoch": 1.7903225806451613,
"grad_norm": 0.47789162397384644,
"learning_rate": 0.00018541319440239066,
"loss": 0.37602391839027405,
"num_input_tokens_seen": 734042,
"step": 222,
"train_runtime": 1037.1175,
"train_tokens_per_second": 707.771
},
{
"epoch": 1.7983870967741935,
"grad_norm": 0.5510605573654175,
"learning_rate": 0.00018528062627096845,
"loss": 0.4142032861709595,
"num_input_tokens_seen": 737388,
"step": 223,
"train_runtime": 1041.1341,
"train_tokens_per_second": 708.255
},
{
"epoch": 1.8064516129032258,
"grad_norm": 0.5054709315299988,
"learning_rate": 0.0001851475062959884,
"loss": 0.4083865284919739,
"num_input_tokens_seen": 740714,
"step": 224,
"train_runtime": 1045.1999,
"train_tokens_per_second": 708.682
},
{
"epoch": 1.814516129032258,
"grad_norm": 0.5532689690589905,
"learning_rate": 0.00018501383533885837,
"loss": 0.3750511407852173,
"num_input_tokens_seen": 743952,
"step": 225,
"train_runtime": 1049.1088,
"train_tokens_per_second": 709.128
},
{
"epoch": 1.8225806451612905,
"grad_norm": 0.5842207074165344,
"learning_rate": 0.00018487961426455157,
"loss": 0.4582732319831848,
"num_input_tokens_seen": 747324,
"step": 226,
"train_runtime": 1053.1583,
"train_tokens_per_second": 709.603
},
{
"epoch": 1.8306451612903225,
"grad_norm": 0.5713872313499451,
"learning_rate": 0.0001847448439416009,
"loss": 0.4612322449684143,
"num_input_tokens_seen": 750660,
"step": 227,
"train_runtime": 1057.1876,
"train_tokens_per_second": 710.054
},
{
"epoch": 1.838709677419355,
"grad_norm": 0.7017268538475037,
"learning_rate": 0.00018460952524209355,
"loss": 0.4732731580734253,
"num_input_tokens_seen": 753890,
"step": 228,
"train_runtime": 1061.0082,
"train_tokens_per_second": 710.541
},
{
"epoch": 1.846774193548387,
"grad_norm": 0.5345413684844971,
"learning_rate": 0.0001844736590416651,
"loss": 0.40334969758987427,
"num_input_tokens_seen": 757196,
"step": 229,
"train_runtime": 1065.0157,
"train_tokens_per_second": 710.972
},
{
"epoch": 1.8548387096774195,
"grad_norm": 0.5843684673309326,
"learning_rate": 0.00018433724621949392,
"loss": 0.4093519449234009,
"num_input_tokens_seen": 760606,
"step": 230,
"train_runtime": 1069.0746,
"train_tokens_per_second": 711.462
},
{
"epoch": 1.8629032258064515,
"grad_norm": 0.5555073022842407,
"learning_rate": 0.00018420028765829568,
"loss": 0.40939491987228394,
"num_input_tokens_seen": 763910,
"step": 231,
"train_runtime": 1073.0585,
"train_tokens_per_second": 711.9
},
{
"epoch": 1.870967741935484,
"grad_norm": 0.5539902448654175,
"learning_rate": 0.00018406278424431736,
"loss": 0.43555861711502075,
"num_input_tokens_seen": 767166,
"step": 232,
"train_runtime": 1076.9274,
"train_tokens_per_second": 712.366
},
{
"epoch": 1.879032258064516,
"grad_norm": 0.5120404958724976,
"learning_rate": 0.00018392473686733163,
"loss": 0.43676942586898804,
"num_input_tokens_seen": 770592,
"step": 233,
"train_runtime": 1080.9922,
"train_tokens_per_second": 712.856
},
{
"epoch": 1.8870967741935485,
"grad_norm": 0.6968815922737122,
"learning_rate": 0.00018378614642063115,
"loss": 0.4551546275615692,
"num_input_tokens_seen": 773594,
"step": 234,
"train_runtime": 1084.6349,
"train_tokens_per_second": 713.23
},
{
"epoch": 1.8951612903225805,
"grad_norm": 0.6711899638175964,
"learning_rate": 0.00018364701380102266,
"loss": 0.47881585359573364,
"num_input_tokens_seen": 776592,
"step": 235,
"train_runtime": 1088.2536,
"train_tokens_per_second": 713.613
},
{
"epoch": 1.903225806451613,
"grad_norm": 0.5762051939964294,
"learning_rate": 0.0001835073399088214,
"loss": 0.48081904649734497,
"num_input_tokens_seen": 779942,
"step": 236,
"train_runtime": 1092.3842,
"train_tokens_per_second": 713.981
},
{
"epoch": 1.911290322580645,
"grad_norm": 0.47804713249206543,
"learning_rate": 0.00018336712564784503,
"loss": 0.39908868074417114,
"num_input_tokens_seen": 783252,
"step": 237,
"train_runtime": 1096.4305,
"train_tokens_per_second": 714.365
},
{
"epoch": 1.9193548387096775,
"grad_norm": 0.6200618147850037,
"learning_rate": 0.00018322637192540785,
"loss": 0.4580332636833191,
"num_input_tokens_seen": 786556,
"step": 238,
"train_runtime": 1100.4502,
"train_tokens_per_second": 714.758
},
{
"epoch": 1.9274193548387095,
"grad_norm": 0.6390965580940247,
"learning_rate": 0.00018308507965231508,
"loss": 0.4557971954345703,
"num_input_tokens_seen": 789882,
"step": 239,
"train_runtime": 1104.4776,
"train_tokens_per_second": 715.163
},
{
"epoch": 1.935483870967742,
"grad_norm": 0.5260592699050903,
"learning_rate": 0.00018294324974285677,
"loss": 0.382771372795105,
"num_input_tokens_seen": 793254,
"step": 240,
"train_runtime": 1108.5259,
"train_tokens_per_second": 715.594
},
{
"epoch": 1.935483870967742,
"eval_loss": 2.1555724143981934,
"eval_runtime": 10.4094,
"eval_samples_per_second": 4.996,
"eval_steps_per_second": 2.498,
"num_input_tokens_seen": 793254,
"step": 240
},
{
"epoch": 1.9435483870967742,
"grad_norm": 0.6225792169570923,
"learning_rate": 0.00018280088311480201,
"loss": 0.45386967062950134,
"num_input_tokens_seen": 796604,
"step": 241,
"train_runtime": 1127.0971,
"train_tokens_per_second": 706.775
},
{
"epoch": 1.9516129032258065,
"grad_norm": 0.5846142172813416,
"learning_rate": 0.00018265798068939294,
"loss": 0.4212104082107544,
"num_input_tokens_seen": 799812,
"step": 242,
"train_runtime": 1131.0228,
"train_tokens_per_second": 707.158
},
{
"epoch": 1.9596774193548387,
"grad_norm": 0.6712385416030884,
"learning_rate": 0.0001825145433913388,
"loss": 0.48295828700065613,
"num_input_tokens_seen": 802998,
"step": 243,
"train_runtime": 1134.8725,
"train_tokens_per_second": 707.567
},
{
"epoch": 1.967741935483871,
"grad_norm": 0.5474020838737488,
"learning_rate": 0.00018237057214880994,
"loss": 0.41326338052749634,
"num_input_tokens_seen": 806314,
"step": 244,
"train_runtime": 1138.9273,
"train_tokens_per_second": 707.959
},
{
"epoch": 1.9758064516129032,
"grad_norm": 0.5561951994895935,
"learning_rate": 0.00018222606789343183,
"loss": 0.4193962514400482,
"num_input_tokens_seen": 809616,
"step": 245,
"train_runtime": 1142.9876,
"train_tokens_per_second": 708.333
},
{
"epoch": 1.9838709677419355,
"grad_norm": 0.7057868242263794,
"learning_rate": 0.00018208103156027897,
"loss": 0.4625564515590668,
"num_input_tokens_seen": 812656,
"step": 246,
"train_runtime": 1146.6931,
"train_tokens_per_second": 708.695
},
{
"epoch": 1.9919354838709677,
"grad_norm": 0.5412559509277344,
"learning_rate": 0.00018193546408786898,
"loss": 0.3980866074562073,
"num_input_tokens_seen": 815812,
"step": 247,
"train_runtime": 1150.55,
"train_tokens_per_second": 709.063
},
{
"epoch": 2.0,
"grad_norm": 0.6223896145820618,
"learning_rate": 0.00018178936641815636,
"loss": 0.508427083492279,
"num_input_tokens_seen": 818984,
"step": 248,
"train_runtime": 1154.4438,
"train_tokens_per_second": 709.419
},
{
"epoch": 2.0080645161290325,
"grad_norm": 0.47228068113327026,
"learning_rate": 0.0001816427394965265,
"loss": 0.30645275115966797,
"num_input_tokens_seen": 822146,
"step": 249,
"train_runtime": 1158.3569,
"train_tokens_per_second": 709.752
},
{
"epoch": 2.0161290322580645,
"grad_norm": 0.45574134588241577,
"learning_rate": 0.00018149558427178956,
"loss": 0.2756215035915375,
"num_input_tokens_seen": 825434,
"step": 250,
"train_runtime": 1162.3738,
"train_tokens_per_second": 710.128
},
{
"epoch": 2.024193548387097,
"grad_norm": 0.5037111639976501,
"learning_rate": 0.00018134790169617419,
"loss": 0.26711219549179077,
"num_input_tokens_seen": 828594,
"step": 251,
"train_runtime": 1166.2381,
"train_tokens_per_second": 710.484
},
{
"epoch": 2.032258064516129,
"grad_norm": 0.468986839056015,
"learning_rate": 0.00018119969272532166,
"loss": 0.25423315167427063,
"num_input_tokens_seen": 831870,
"step": 252,
"train_runtime": 1170.2506,
"train_tokens_per_second": 710.848
},
{
"epoch": 2.0403225806451615,
"grad_norm": 0.554565966129303,
"learning_rate": 0.00018105095831827934,
"loss": 0.27069148421287537,
"num_input_tokens_seen": 835230,
"step": 253,
"train_runtime": 1174.3065,
"train_tokens_per_second": 711.254
},
{
"epoch": 2.0483870967741935,
"grad_norm": 0.7102289795875549,
"learning_rate": 0.00018090169943749476,
"loss": 0.30719172954559326,
"num_input_tokens_seen": 838606,
"step": 254,
"train_runtime": 1178.3482,
"train_tokens_per_second": 711.679
},
{
"epoch": 2.056451612903226,
"grad_norm": 0.9548736810684204,
"learning_rate": 0.0001807519170488092,
"loss": 0.2515634298324585,
"num_input_tokens_seen": 841730,
"step": 255,
"train_runtime": 1182.2287,
"train_tokens_per_second": 711.986
},
{
"epoch": 2.064516129032258,
"grad_norm": 0.6869332194328308,
"learning_rate": 0.00018060161212145155,
"loss": 0.2824576497077942,
"num_input_tokens_seen": 844882,
"step": 256,
"train_runtime": 1186.0526,
"train_tokens_per_second": 712.348
},
{
"epoch": 2.0725806451612905,
"grad_norm": 0.7332234382629395,
"learning_rate": 0.00018045078562803203,
"loss": 0.32276466488838196,
"num_input_tokens_seen": 848268,
"step": 257,
"train_runtime": 1190.1082,
"train_tokens_per_second": 712.765
},
{
"epoch": 2.0806451612903225,
"grad_norm": 0.6955188512802124,
"learning_rate": 0.00018029943854453576,
"loss": 0.28808388113975525,
"num_input_tokens_seen": 851664,
"step": 258,
"train_runtime": 1194.279,
"train_tokens_per_second": 713.12
},
{
"epoch": 2.088709677419355,
"grad_norm": 0.675642192363739,
"learning_rate": 0.00018014757185031671,
"loss": 0.302121639251709,
"num_input_tokens_seen": 854890,
"step": 259,
"train_runtime": 1198.113,
"train_tokens_per_second": 713.53
},
{
"epoch": 2.096774193548387,
"grad_norm": 0.6408395171165466,
"learning_rate": 0.0001799951865280911,
"loss": 0.2811819016933441,
"num_input_tokens_seen": 858116,
"step": 260,
"train_runtime": 1202.0858,
"train_tokens_per_second": 713.856
},
{
"epoch": 2.1048387096774195,
"grad_norm": 0.5794196128845215,
"learning_rate": 0.00017984228356393117,
"loss": 0.3123016059398651,
"num_input_tokens_seen": 861472,
"step": 261,
"train_runtime": 1206.098,
"train_tokens_per_second": 714.264
},
{
"epoch": 2.1129032258064515,
"grad_norm": 0.5573109984397888,
"learning_rate": 0.00017968886394725874,
"loss": 0.28222909569740295,
"num_input_tokens_seen": 864828,
"step": 262,
"train_runtime": 1210.1334,
"train_tokens_per_second": 714.655
},
{
"epoch": 2.120967741935484,
"grad_norm": 0.5452139377593994,
"learning_rate": 0.00017953492867083895,
"loss": 0.340986967086792,
"num_input_tokens_seen": 868250,
"step": 263,
"train_runtime": 1214.227,
"train_tokens_per_second": 715.064
},
{
"epoch": 2.129032258064516,
"grad_norm": 0.5423465967178345,
"learning_rate": 0.00017938047873077362,
"loss": 0.2829318940639496,
"num_input_tokens_seen": 871588,
"step": 264,
"train_runtime": 1218.2485,
"train_tokens_per_second": 715.444
},
{
"epoch": 2.1370967741935485,
"grad_norm": 0.5712020397186279,
"learning_rate": 0.00017922551512649496,
"loss": 0.29809385538101196,
"num_input_tokens_seen": 874904,
"step": 265,
"train_runtime": 1222.2601,
"train_tokens_per_second": 715.808
},
{
"epoch": 2.1451612903225805,
"grad_norm": 0.6137015223503113,
"learning_rate": 0.00017907003886075904,
"loss": 0.28243711590766907,
"num_input_tokens_seen": 878270,
"step": 266,
"train_runtime": 1226.4186,
"train_tokens_per_second": 716.126
},
{
"epoch": 2.153225806451613,
"grad_norm": 0.5994285345077515,
"learning_rate": 0.00017891405093963938,
"loss": 0.2684274911880493,
"num_input_tokens_seen": 881702,
"step": 267,
"train_runtime": 1230.4755,
"train_tokens_per_second": 716.554
},
{
"epoch": 2.161290322580645,
"grad_norm": 0.5856040120124817,
"learning_rate": 0.00017875755237252027,
"loss": 0.24499014019966125,
"num_input_tokens_seen": 884996,
"step": 268,
"train_runtime": 1234.4668,
"train_tokens_per_second": 716.905
},
{
"epoch": 2.1693548387096775,
"grad_norm": 0.6271276473999023,
"learning_rate": 0.00017860054417209042,
"loss": 0.2747422754764557,
"num_input_tokens_seen": 888382,
"step": 269,
"train_runtime": 1238.4919,
"train_tokens_per_second": 717.31
},
{
"epoch": 2.1774193548387095,
"grad_norm": 0.6944452524185181,
"learning_rate": 0.00017844302735433635,
"loss": 0.28950047492980957,
"num_input_tokens_seen": 891756,
"step": 270,
"train_runtime": 1242.6132,
"train_tokens_per_second": 717.646
},
{
"epoch": 2.1774193548387095,
"eval_loss": 2.4587552547454834,
"eval_runtime": 10.3684,
"eval_samples_per_second": 5.015,
"eval_steps_per_second": 2.508,
"num_input_tokens_seen": 891756,
"step": 270
},
{
"epoch": 2.185483870967742,
"grad_norm": 0.7375330924987793,
"learning_rate": 0.00017828500293853576,
"loss": 0.2858275771141052,
"num_input_tokens_seen": 895138,
"step": 271,
"train_runtime": 1261.0596,
"train_tokens_per_second": 709.83
},
{
"epoch": 2.193548387096774,
"grad_norm": 0.7790223360061646,
"learning_rate": 0.00017812647194725094,
"loss": 0.26770544052124023,
"num_input_tokens_seen": 898256,
"step": 272,
"train_runtime": 1264.8855,
"train_tokens_per_second": 710.148
},
{
"epoch": 2.2016129032258065,
"grad_norm": 0.7440584301948547,
"learning_rate": 0.00017796743540632223,
"loss": 0.2846216857433319,
"num_input_tokens_seen": 901604,
"step": 273,
"train_runtime": 1268.9522,
"train_tokens_per_second": 710.511
},
{
"epoch": 2.2096774193548385,
"grad_norm": 0.6914224624633789,
"learning_rate": 0.0001778078943448614,
"loss": 0.3188799321651459,
"num_input_tokens_seen": 905004,
"step": 274,
"train_runtime": 1273.0703,
"train_tokens_per_second": 710.883
},
{
"epoch": 2.217741935483871,
"grad_norm": 0.6362150311470032,
"learning_rate": 0.00017764784979524477,
"loss": 0.28756844997406006,
"num_input_tokens_seen": 908206,
"step": 275,
"train_runtime": 1277.0485,
"train_tokens_per_second": 711.176
},
{
"epoch": 2.225806451612903,
"grad_norm": 0.5490068793296814,
"learning_rate": 0.00017748730279310685,
"loss": 0.25740480422973633,
"num_input_tokens_seen": 911584,
"step": 276,
"train_runtime": 1281.1418,
"train_tokens_per_second": 711.54
},
{
"epoch": 2.2338709677419355,
"grad_norm": 0.6280438303947449,
"learning_rate": 0.00017732625437733335,
"loss": 0.25927111506462097,
"num_input_tokens_seen": 914828,
"step": 277,
"train_runtime": 1285.0944,
"train_tokens_per_second": 711.876
},
{
"epoch": 2.241935483870968,
"grad_norm": 0.5633710622787476,
"learning_rate": 0.00017716470559005473,
"loss": 0.2778595983982086,
"num_input_tokens_seen": 918148,
"step": 278,
"train_runtime": 1289.1238,
"train_tokens_per_second": 712.226
},
{
"epoch": 2.25,
"grad_norm": 0.5184245109558105,
"learning_rate": 0.0001770026574766391,
"loss": 0.25256186723709106,
"num_input_tokens_seen": 921458,
"step": 279,
"train_runtime": 1293.1464,
"train_tokens_per_second": 712.571
},
{
"epoch": 2.258064516129032,
"grad_norm": 0.66209477186203,
"learning_rate": 0.00017684011108568592,
"loss": 0.31928277015686035,
"num_input_tokens_seen": 924876,
"step": 280,
"train_runtime": 1297.2519,
"train_tokens_per_second": 712.95
},
{
"epoch": 2.2661290322580645,
"grad_norm": 0.730171263217926,
"learning_rate": 0.0001766770674690187,
"loss": 0.3316803276538849,
"num_input_tokens_seen": 928128,
"step": 281,
"train_runtime": 1301.1326,
"train_tokens_per_second": 713.323
},
{
"epoch": 2.274193548387097,
"grad_norm": 0.5517908930778503,
"learning_rate": 0.0001765135276816787,
"loss": 0.2707135081291199,
"num_input_tokens_seen": 931526,
"step": 282,
"train_runtime": 1305.2072,
"train_tokens_per_second": 713.7
},
{
"epoch": 2.282258064516129,
"grad_norm": 0.6096500158309937,
"learning_rate": 0.0001763494927819177,
"loss": 0.2731682062149048,
"num_input_tokens_seen": 934904,
"step": 283,
"train_runtime": 1309.2622,
"train_tokens_per_second": 714.069
},
{
"epoch": 2.2903225806451615,
"grad_norm": 1.5883084535598755,
"learning_rate": 0.00017618496383119128,
"loss": 0.31949979066848755,
"num_input_tokens_seen": 938330,
"step": 284,
"train_runtime": 1313.3041,
"train_tokens_per_second": 714.48
},
{
"epoch": 2.2983870967741935,
"grad_norm": 0.6766570210456848,
"learning_rate": 0.0001760199418941521,
"loss": 0.34931379556655884,
"num_input_tokens_seen": 941664,
"step": 285,
"train_runtime": 1317.3288,
"train_tokens_per_second": 714.828
},
{
"epoch": 2.306451612903226,
"grad_norm": 0.5871296525001526,
"learning_rate": 0.00017585442803864294,
"loss": 0.28577709197998047,
"num_input_tokens_seen": 945058,
"step": 286,
"train_runtime": 1321.3934,
"train_tokens_per_second": 715.198
},
{
"epoch": 2.314516129032258,
"grad_norm": 0.6456281542778015,
"learning_rate": 0.00017568842333568952,
"loss": 0.29731327295303345,
"num_input_tokens_seen": 948198,
"step": 287,
"train_runtime": 1325.1856,
"train_tokens_per_second": 715.521
},
{
"epoch": 2.3225806451612905,
"grad_norm": 0.5836092829704285,
"learning_rate": 0.00017552192885949395,
"loss": 0.27603939175605774,
"num_input_tokens_seen": 951480,
"step": 288,
"train_runtime": 1329.191,
"train_tokens_per_second": 715.834
},
{
"epoch": 2.3306451612903225,
"grad_norm": 0.6601560711860657,
"learning_rate": 0.0001753549456874276,
"loss": 0.28083255887031555,
"num_input_tokens_seen": 954742,
"step": 289,
"train_runtime": 1333.0323,
"train_tokens_per_second": 716.218
},
{
"epoch": 2.338709677419355,
"grad_norm": 0.5849836468696594,
"learning_rate": 0.00017518747490002413,
"loss": 0.30605417490005493,
"num_input_tokens_seen": 958120,
"step": 290,
"train_runtime": 1337.0733,
"train_tokens_per_second": 716.58
},
{
"epoch": 2.346774193548387,
"grad_norm": 0.7856936454772949,
"learning_rate": 0.00017501951758097257,
"loss": 0.39218807220458984,
"num_input_tokens_seen": 961532,
"step": 291,
"train_runtime": 1341.1305,
"train_tokens_per_second": 716.956
},
{
"epoch": 2.3548387096774195,
"grad_norm": 0.6249871253967285,
"learning_rate": 0.00017485107481711012,
"loss": 0.24985340237617493,
"num_input_tokens_seen": 964916,
"step": 292,
"train_runtime": 1345.2083,
"train_tokens_per_second": 717.299
},
{
"epoch": 2.3629032258064515,
"grad_norm": 0.5427806377410889,
"learning_rate": 0.0001746821476984154,
"loss": 0.2381906509399414,
"num_input_tokens_seen": 968250,
"step": 293,
"train_runtime": 1349.288,
"train_tokens_per_second": 717.601
},
{
"epoch": 2.370967741935484,
"grad_norm": 0.5750318765640259,
"learning_rate": 0.00017451273731800115,
"loss": 0.2934698462486267,
"num_input_tokens_seen": 971434,
"step": 294,
"train_runtime": 1353.1185,
"train_tokens_per_second": 717.922
},
{
"epoch": 2.379032258064516,
"grad_norm": 0.6818315386772156,
"learning_rate": 0.00017434284477210735,
"loss": 0.3370276689529419,
"num_input_tokens_seen": 974754,
"step": 295,
"train_runtime": 1357.1478,
"train_tokens_per_second": 718.237
},
{
"epoch": 2.3870967741935485,
"grad_norm": 0.5582734942436218,
"learning_rate": 0.00017417247116009388,
"loss": 0.2613486349582672,
"num_input_tokens_seen": 978032,
"step": 296,
"train_runtime": 1361.0424,
"train_tokens_per_second": 718.59
},
{
"epoch": 2.3951612903225805,
"grad_norm": 0.6195712089538574,
"learning_rate": 0.00017400161758443375,
"loss": 0.29393696784973145,
"num_input_tokens_seen": 981442,
"step": 297,
"train_runtime": 1365.1272,
"train_tokens_per_second": 718.938
},
{
"epoch": 2.403225806451613,
"grad_norm": 0.716315507888794,
"learning_rate": 0.0001738302851507056,
"loss": 0.3132310211658478,
"num_input_tokens_seen": 984794,
"step": 298,
"train_runtime": 1369.1701,
"train_tokens_per_second": 719.263
},
{
"epoch": 2.411290322580645,
"grad_norm": 0.7157399654388428,
"learning_rate": 0.00017365847496758684,
"loss": 0.30963802337646484,
"num_input_tokens_seen": 988146,
"step": 299,
"train_runtime": 1373.2211,
"train_tokens_per_second": 719.583
},
{
"epoch": 2.4193548387096775,
"grad_norm": 0.5889755487442017,
"learning_rate": 0.0001734861881468463,
"loss": 0.25868260860443115,
"num_input_tokens_seen": 991456,
"step": 300,
"train_runtime": 1377.248,
"train_tokens_per_second": 719.882
},
{
"epoch": 2.4193548387096775,
"eval_loss": 2.238002061843872,
"eval_runtime": 10.3632,
"eval_samples_per_second": 5.018,
"eval_steps_per_second": 2.509,
"num_input_tokens_seen": 991456,
"step": 300
},
{
"epoch": 2.4274193548387095,
"grad_norm": 0.5959520936012268,
"learning_rate": 0.00017331342580333706,
"loss": 0.2244856059551239,
"num_input_tokens_seen": 994790,
"step": 301,
"train_runtime": 1395.7066,
"train_tokens_per_second": 712.75
},
{
"epoch": 2.435483870967742,
"grad_norm": 0.7241173982620239,
"learning_rate": 0.00017314018905498931,
"loss": 0.304779589176178,
"num_input_tokens_seen": 998138,
"step": 302,
"train_runtime": 1399.7667,
"train_tokens_per_second": 713.075
},
{
"epoch": 2.443548387096774,
"grad_norm": 0.6994946002960205,
"learning_rate": 0.00017296647902280312,
"loss": 0.3222126364707947,
"num_input_tokens_seen": 1001530,
"step": 303,
"train_runtime": 1403.8708,
"train_tokens_per_second": 713.406
},
{
"epoch": 2.4516129032258065,
"grad_norm": 0.6702247858047485,
"learning_rate": 0.00017279229683084103,
"loss": 0.3149133324623108,
"num_input_tokens_seen": 1004834,
"step": 304,
"train_runtime": 1407.8875,
"train_tokens_per_second": 713.718
},
{
"epoch": 2.4596774193548385,
"grad_norm": 0.6507818698883057,
"learning_rate": 0.00017261764360622102,
"loss": 0.2576565742492676,
"num_input_tokens_seen": 1007956,
"step": 305,
"train_runtime": 1411.7075,
"train_tokens_per_second": 713.998
},
{
"epoch": 2.467741935483871,
"grad_norm": 0.630657970905304,
"learning_rate": 0.00017244252047910892,
"loss": 0.2836134731769562,
"num_input_tokens_seen": 1011316,
"step": 306,
"train_runtime": 1415.8037,
"train_tokens_per_second": 714.305
},
{
"epoch": 2.475806451612903,
"grad_norm": 0.5873962640762329,
"learning_rate": 0.00017226692858271134,
"loss": 0.27408260107040405,
"num_input_tokens_seen": 1014636,
"step": 307,
"train_runtime": 1419.8461,
"train_tokens_per_second": 714.61
},
{
"epoch": 2.4838709677419355,
"grad_norm": 0.6181919574737549,
"learning_rate": 0.00017209086905326833,
"loss": 0.2932998836040497,
"num_input_tokens_seen": 1018032,
"step": 308,
"train_runtime": 1423.9313,
"train_tokens_per_second": 714.945
},
{
"epoch": 2.491935483870968,
"grad_norm": 0.7245927453041077,
"learning_rate": 0.0001719143430300458,
"loss": 0.28670936822891235,
"num_input_tokens_seen": 1021416,
"step": 309,
"train_runtime": 1428.0038,
"train_tokens_per_second": 715.275
},
{
"epoch": 2.5,
"grad_norm": 0.562571108341217,
"learning_rate": 0.00017173735165532846,
"loss": 0.29280805587768555,
"num_input_tokens_seen": 1024810,
"step": 310,
"train_runtime": 1432.1081,
"train_tokens_per_second": 715.595
},
{
"epoch": 2.508064516129032,
"grad_norm": 0.6529526114463806,
"learning_rate": 0.00017155989607441213,
"loss": 0.3507098853588104,
"num_input_tokens_seen": 1028170,
"step": 311,
"train_runtime": 1436.2611,
"train_tokens_per_second": 715.866
},
{
"epoch": 2.5161290322580645,
"grad_norm": 0.6109700202941895,
"learning_rate": 0.00017138197743559654,
"loss": 0.3289310932159424,
"num_input_tokens_seen": 1031370,
"step": 312,
"train_runtime": 1440.118,
"train_tokens_per_second": 716.17
},
{
"epoch": 2.524193548387097,
"grad_norm": 0.612596333026886,
"learning_rate": 0.0001712035968901778,
"loss": 0.30889034271240234,
"num_input_tokens_seen": 1034628,
"step": 313,
"train_runtime": 1444.035,
"train_tokens_per_second": 716.484
},
{
"epoch": 2.532258064516129,
"grad_norm": 0.5865165591239929,
"learning_rate": 0.00017102475559244105,
"loss": 0.24781779944896698,
"num_input_tokens_seen": 1037850,
"step": 314,
"train_runtime": 1447.9152,
"train_tokens_per_second": 716.789
},
{
"epoch": 2.540322580645161,
"grad_norm": 0.6537047028541565,
"learning_rate": 0.00017084545469965283,
"loss": 0.3204243779182434,
"num_input_tokens_seen": 1041268,
"step": 315,
"train_runtime": 1452.0132,
"train_tokens_per_second": 717.12
},
{
"epoch": 2.5483870967741935,
"grad_norm": 0.6006045341491699,
"learning_rate": 0.00017066569537205371,
"loss": 0.2686625123023987,
"num_input_tokens_seen": 1044620,
"step": 316,
"train_runtime": 1456.0387,
"train_tokens_per_second": 717.44
},
{
"epoch": 2.556451612903226,
"grad_norm": 0.5934163928031921,
"learning_rate": 0.00017048547877285077,
"loss": 0.28129106760025024,
"num_input_tokens_seen": 1047986,
"step": 317,
"train_runtime": 1460.0652,
"train_tokens_per_second": 717.767
},
{
"epoch": 2.564516129032258,
"grad_norm": 0.7280172109603882,
"learning_rate": 0.00017030480606821,
"loss": 0.27025726437568665,
"num_input_tokens_seen": 1051334,
"step": 318,
"train_runtime": 1464.0809,
"train_tokens_per_second": 718.085
},
{
"epoch": 2.5725806451612905,
"grad_norm": 0.6704394817352295,
"learning_rate": 0.00017012367842724887,
"loss": 0.30058684945106506,
"num_input_tokens_seen": 1054698,
"step": 319,
"train_runtime": 1468.1115,
"train_tokens_per_second": 718.405
},
{
"epoch": 2.5806451612903225,
"grad_norm": 0.6334468126296997,
"learning_rate": 0.00016994209702202867,
"loss": 0.31510958075523376,
"num_input_tokens_seen": 1058022,
"step": 320,
"train_runtime": 1472.1413,
"train_tokens_per_second": 718.696
},
{
"epoch": 2.588709677419355,
"grad_norm": 0.7871665954589844,
"learning_rate": 0.00016976006302754702,
"loss": 0.3096451163291931,
"num_input_tokens_seen": 1061014,
"step": 321,
"train_runtime": 1475.7415,
"train_tokens_per_second": 718.97
},
{
"epoch": 2.596774193548387,
"grad_norm": 0.636999249458313,
"learning_rate": 0.0001695775776217301,
"loss": 0.2508867383003235,
"num_input_tokens_seen": 1064402,
"step": 322,
"train_runtime": 1479.7786,
"train_tokens_per_second": 719.298
},
{
"epoch": 2.6048387096774195,
"grad_norm": 0.616060733795166,
"learning_rate": 0.00016939464198542523,
"loss": 0.2926797866821289,
"num_input_tokens_seen": 1067606,
"step": 323,
"train_runtime": 1483.6203,
"train_tokens_per_second": 719.595
},
{
"epoch": 2.6129032258064515,
"grad_norm": 0.6996451616287231,
"learning_rate": 0.00016921125730239307,
"loss": 0.33650434017181396,
"num_input_tokens_seen": 1070900,
"step": 324,
"train_runtime": 1487.5405,
"train_tokens_per_second": 719.913
},
{
"epoch": 2.620967741935484,
"grad_norm": 0.5791569352149963,
"learning_rate": 0.00016902742475930006,
"loss": 0.25794506072998047,
"num_input_tokens_seen": 1074256,
"step": 325,
"train_runtime": 1491.5897,
"train_tokens_per_second": 720.209
},
{
"epoch": 2.629032258064516,
"grad_norm": 0.6315230131149292,
"learning_rate": 0.00016884314554571064,
"loss": 0.2916939854621887,
"num_input_tokens_seen": 1077650,
"step": 326,
"train_runtime": 1495.6384,
"train_tokens_per_second": 720.528
},
{
"epoch": 2.6370967741935485,
"grad_norm": 0.702532172203064,
"learning_rate": 0.0001686584208540797,
"loss": 0.31128430366516113,
"num_input_tokens_seen": 1080768,
"step": 327,
"train_runtime": 1499.4368,
"train_tokens_per_second": 720.783
},
{
"epoch": 2.6451612903225805,
"grad_norm": 0.680218517780304,
"learning_rate": 0.00016847325187974477,
"loss": 0.2789115309715271,
"num_input_tokens_seen": 1083952,
"step": 328,
"train_runtime": 1503.275,
"train_tokens_per_second": 721.06
},
{
"epoch": 2.653225806451613,
"grad_norm": 0.7413024306297302,
"learning_rate": 0.00016828763982091826,
"loss": 0.3262504041194916,
"num_input_tokens_seen": 1087238,
"step": 329,
"train_runtime": 1507.2815,
"train_tokens_per_second": 721.324
},
{
"epoch": 2.661290322580645,
"grad_norm": 0.6193671822547913,
"learning_rate": 0.00016810158587867973,
"loss": 0.2687387764453888,
"num_input_tokens_seen": 1090512,
"step": 330,
"train_runtime": 1511.2968,
"train_tokens_per_second": 721.574
},
{
"epoch": 2.661290322580645,
"eval_loss": 2.222731590270996,
"eval_runtime": 10.3677,
"eval_samples_per_second": 5.016,
"eval_steps_per_second": 2.508,
"num_input_tokens_seen": 1090512,
"step": 330
},
{
"epoch": 2.6693548387096775,
"grad_norm": 0.6186102032661438,
"learning_rate": 0.00016791509125696816,
"loss": 0.3041277527809143,
"num_input_tokens_seen": 1093854,
"step": 331,
"train_runtime": 1529.7169,
"train_tokens_per_second": 715.07
},
{
"epoch": 2.6774193548387095,
"grad_norm": 0.6018481850624084,
"learning_rate": 0.00016772815716257412,
"loss": 0.2938111126422882,
"num_input_tokens_seen": 1097220,
"step": 332,
"train_runtime": 1533.7479,
"train_tokens_per_second": 715.385
},
{
"epoch": 2.685483870967742,
"grad_norm": 0.6231778860092163,
"learning_rate": 0.00016754078480513197,
"loss": 0.31392157077789307,
"num_input_tokens_seen": 1100558,
"step": 333,
"train_runtime": 1537.8319,
"train_tokens_per_second": 715.656
},
{
"epoch": 2.693548387096774,
"grad_norm": 0.6355130076408386,
"learning_rate": 0.00016735297539711204,
"loss": 0.2564769983291626,
"num_input_tokens_seen": 1103750,
"step": 334,
"train_runtime": 1541.6907,
"train_tokens_per_second": 715.935
},
{
"epoch": 2.7016129032258065,
"grad_norm": 0.7109224796295166,
"learning_rate": 0.00016716473015381276,
"loss": 0.3542025685310364,
"num_input_tokens_seen": 1107152,
"step": 335,
"train_runtime": 1545.8031,
"train_tokens_per_second": 716.231
},
{
"epoch": 2.709677419354839,
"grad_norm": 0.6006855964660645,
"learning_rate": 0.0001669760502933528,
"loss": 0.3422238528728485,
"num_input_tokens_seen": 1110494,
"step": 336,
"train_runtime": 1549.8692,
"train_tokens_per_second": 716.508
},
{
"epoch": 2.717741935483871,
"grad_norm": 0.6025466322898865,
"learning_rate": 0.00016678693703666325,
"loss": 0.2790500223636627,
"num_input_tokens_seen": 1113802,
"step": 337,
"train_runtime": 1553.9962,
"train_tokens_per_second": 716.734
},
{
"epoch": 2.725806451612903,
"grad_norm": 0.6767451167106628,
"learning_rate": 0.00016659739160747967,
"loss": 0.3348698019981384,
"num_input_tokens_seen": 1117078,
"step": 338,
"train_runtime": 1558.0054,
"train_tokens_per_second": 716.992
},
{
"epoch": 2.7338709677419355,
"grad_norm": 0.7819514274597168,
"learning_rate": 0.00016640741523233407,
"loss": 0.33901745080947876,
"num_input_tokens_seen": 1120186,
"step": 339,
"train_runtime": 1561.7798,
"train_tokens_per_second": 717.25
},
{
"epoch": 2.741935483870968,
"grad_norm": 0.6177900433540344,
"learning_rate": 0.00016621700914054718,
"loss": 0.3110594153404236,
"num_input_tokens_seen": 1123568,
"step": 340,
"train_runtime": 1565.8546,
"train_tokens_per_second": 717.543
},
{
"epoch": 2.75,
"grad_norm": 0.6533312797546387,
"learning_rate": 0.00016602617456422034,
"loss": 0.2856423258781433,
"num_input_tokens_seen": 1126926,
"step": 341,
"train_runtime": 1569.9333,
"train_tokens_per_second": 717.818
},
{
"epoch": 2.758064516129032,
"grad_norm": 0.6791844964027405,
"learning_rate": 0.00016583491273822765,
"loss": 0.32431840896606445,
"num_input_tokens_seen": 1130262,
"step": 342,
"train_runtime": 1573.9778,
"train_tokens_per_second": 718.093
},
{
"epoch": 2.7661290322580645,
"grad_norm": 0.7280325889587402,
"learning_rate": 0.00016564322490020776,
"loss": 0.3399568498134613,
"num_input_tokens_seen": 1133612,
"step": 343,
"train_runtime": 1578.0114,
"train_tokens_per_second": 718.38
},
{
"epoch": 2.774193548387097,
"grad_norm": 1.0895047187805176,
"learning_rate": 0.00016545111229055614,
"loss": 0.336514949798584,
"num_input_tokens_seen": 1136944,
"step": 344,
"train_runtime": 1582.0241,
"train_tokens_per_second": 718.664
},
{
"epoch": 2.782258064516129,
"grad_norm": 0.7008950710296631,
"learning_rate": 0.00016525857615241687,
"loss": 0.35175198316574097,
"num_input_tokens_seen": 1140396,
"step": 345,
"train_runtime": 1586.0972,
"train_tokens_per_second": 718.995
},
{
"epoch": 2.790322580645161,
"grad_norm": 0.579539954662323,
"learning_rate": 0.00016506561773167464,
"loss": 0.29808709025382996,
"num_input_tokens_seen": 1143862,
"step": 346,
"train_runtime": 1590.3586,
"train_tokens_per_second": 719.248
},
{
"epoch": 2.7983870967741935,
"grad_norm": 0.6034091711044312,
"learning_rate": 0.00016487223827694672,
"loss": 0.3193822503089905,
"num_input_tokens_seen": 1147244,
"step": 347,
"train_runtime": 1594.4136,
"train_tokens_per_second": 719.54
},
{
"epoch": 2.806451612903226,
"grad_norm": 0.62319415807724,
"learning_rate": 0.00016467843903957485,
"loss": 0.28525224328041077,
"num_input_tokens_seen": 1150452,
"step": 348,
"train_runtime": 1598.2467,
"train_tokens_per_second": 719.821
},
{
"epoch": 2.814516129032258,
"grad_norm": 0.6607264280319214,
"learning_rate": 0.00016448422127361706,
"loss": 0.2740340828895569,
"num_input_tokens_seen": 1153642,
"step": 349,
"train_runtime": 1602.068,
"train_tokens_per_second": 720.096
},
{
"epoch": 2.8225806451612905,
"grad_norm": 0.7429677844047546,
"learning_rate": 0.00016428958623583982,
"loss": 0.28102120757102966,
"num_input_tokens_seen": 1156888,
"step": 350,
"train_runtime": 1606.0395,
"train_tokens_per_second": 720.336
},
{
"epoch": 2.8306451612903225,
"grad_norm": 0.6027553677558899,
"learning_rate": 0.0001640945351857096,
"loss": 0.2649904489517212,
"num_input_tokens_seen": 1160198,
"step": 351,
"train_runtime": 1610.0476,
"train_tokens_per_second": 720.599
},
{
"epoch": 2.838709677419355,
"grad_norm": 0.6074771285057068,
"learning_rate": 0.0001638990693853848,
"loss": 0.2550981938838959,
"num_input_tokens_seen": 1163444,
"step": 352,
"train_runtime": 1614.0033,
"train_tokens_per_second": 720.844
},
{
"epoch": 2.846774193548387,
"grad_norm": 0.7516021132469177,
"learning_rate": 0.00016370319009970777,
"loss": 0.27090102434158325,
"num_input_tokens_seen": 1166582,
"step": 353,
"train_runtime": 1617.7897,
"train_tokens_per_second": 721.096
},
{
"epoch": 2.8548387096774195,
"grad_norm": 0.608556866645813,
"learning_rate": 0.0001635068985961965,
"loss": 0.2796551585197449,
"num_input_tokens_seen": 1169882,
"step": 354,
"train_runtime": 1621.7832,
"train_tokens_per_second": 721.355
},
{
"epoch": 2.8629032258064515,
"grad_norm": 0.659274697303772,
"learning_rate": 0.00016331019614503623,
"loss": 0.34103769063949585,
"num_input_tokens_seen": 1173312,
"step": 355,
"train_runtime": 1625.8724,
"train_tokens_per_second": 721.651
},
{
"epoch": 2.870967741935484,
"grad_norm": 0.6731792688369751,
"learning_rate": 0.00016311308401907153,
"loss": 0.32820090651512146,
"num_input_tokens_seen": 1176774,
"step": 356,
"train_runtime": 1630.0017,
"train_tokens_per_second": 721.947
},
{
"epoch": 2.879032258064516,
"grad_norm": 0.6033483743667603,
"learning_rate": 0.00016291556349379795,
"loss": 0.28098687529563904,
"num_input_tokens_seen": 1180230,
"step": 357,
"train_runtime": 1634.1041,
"train_tokens_per_second": 722.249
},
{
"epoch": 2.8870967741935485,
"grad_norm": 0.6251856088638306,
"learning_rate": 0.0001627176358473537,
"loss": 0.3627236485481262,
"num_input_tokens_seen": 1183606,
"step": 358,
"train_runtime": 1638.1672,
"train_tokens_per_second": 722.518
},
{
"epoch": 2.8951612903225805,
"grad_norm": 0.64890456199646,
"learning_rate": 0.0001625193023605115,
"loss": 0.2701757252216339,
"num_input_tokens_seen": 1186796,
"step": 359,
"train_runtime": 1642.0043,
"train_tokens_per_second": 722.773
},
{
"epoch": 2.903225806451613,
"grad_norm": 0.6447115540504456,
"learning_rate": 0.00016232056431667017,
"loss": 0.3284357786178589,
"num_input_tokens_seen": 1190008,
"step": 360,
"train_runtime": 1645.8682,
"train_tokens_per_second": 723.028
},
{
"epoch": 2.903225806451613,
"eval_loss": 2.13553524017334,
"eval_runtime": 10.4034,
"eval_samples_per_second": 4.998,
"eval_steps_per_second": 2.499,
"num_input_tokens_seen": 1190008,
"step": 360
},
{
"epoch": 2.911290322580645,
"grad_norm": 0.5953078866004944,
"learning_rate": 0.0001621214230018464,
"loss": 0.307175874710083,
"num_input_tokens_seen": 1193388,
"step": 361,
"train_runtime": 1664.3092,
"train_tokens_per_second": 717.047
},
{
"epoch": 2.9193548387096775,
"grad_norm": 0.6266926527023315,
"learning_rate": 0.00016192187970466644,
"loss": 0.27196913957595825,
"num_input_tokens_seen": 1196530,
"step": 362,
"train_runtime": 1668.1264,
"train_tokens_per_second": 717.29
},
{
"epoch": 2.9274193548387095,
"grad_norm": 0.5678778886795044,
"learning_rate": 0.00016172193571635767,
"loss": 0.2596748471260071,
"num_input_tokens_seen": 1199850,
"step": 363,
"train_runtime": 1672.1956,
"train_tokens_per_second": 717.53
},
{
"epoch": 2.935483870967742,
"grad_norm": 0.7649815082550049,
"learning_rate": 0.00016152159233074037,
"loss": 0.3512059450149536,
"num_input_tokens_seen": 1203098,
"step": 364,
"train_runtime": 1676.1191,
"train_tokens_per_second": 717.788
},
{
"epoch": 2.943548387096774,
"grad_norm": 0.6907058954238892,
"learning_rate": 0.0001613208508442193,
"loss": 0.3612962067127228,
"num_input_tokens_seen": 1206496,
"step": 365,
"train_runtime": 1680.2229,
"train_tokens_per_second": 718.057
},
{
"epoch": 2.9516129032258065,
"grad_norm": 0.5369240045547485,
"learning_rate": 0.0001611197125557752,
"loss": 0.2572643756866455,
"num_input_tokens_seen": 1209652,
"step": 366,
"train_runtime": 1684.0732,
"train_tokens_per_second": 718.289
},
{
"epoch": 2.959677419354839,
"grad_norm": 0.5834548473358154,
"learning_rate": 0.00016091817876695655,
"loss": 0.27215975522994995,
"num_input_tokens_seen": 1212982,
"step": 367,
"train_runtime": 1688.117,
"train_tokens_per_second": 718.541
},
{
"epoch": 2.967741935483871,
"grad_norm": 0.6674952507019043,
"learning_rate": 0.00016071625078187114,
"loss": 0.2797396779060364,
"num_input_tokens_seen": 1216172,
"step": 368,
"train_runtime": 1691.981,
"train_tokens_per_second": 718.786
},
{
"epoch": 2.975806451612903,
"grad_norm": 0.6768476366996765,
"learning_rate": 0.0001605139299071774,
"loss": 0.3542667329311371,
"num_input_tokens_seen": 1219592,
"step": 369,
"train_runtime": 1696.1081,
"train_tokens_per_second": 719.053
},
{
"epoch": 2.9838709677419355,
"grad_norm": 0.7091079354286194,
"learning_rate": 0.00016031121745207626,
"loss": 0.2957330048084259,
"num_input_tokens_seen": 1222706,
"step": 370,
"train_runtime": 1699.921,
"train_tokens_per_second": 719.272
},
{
"epoch": 2.991935483870968,
"grad_norm": 0.7082553505897522,
"learning_rate": 0.00016010811472830252,
"loss": 0.2873592972755432,
"num_input_tokens_seen": 1225804,
"step": 371,
"train_runtime": 1703.6255,
"train_tokens_per_second": 719.527
},
{
"epoch": 3.0,
"grad_norm": 0.5840099453926086,
"learning_rate": 0.0001599046230501163,
"loss": 0.26493021845817566,
"num_input_tokens_seen": 1229126,
"step": 372,
"train_runtime": 1707.6656,
"train_tokens_per_second": 719.77
},
{
"epoch": 3.0080645161290325,
"grad_norm": 0.5550098419189453,
"learning_rate": 0.00015970074373429464,
"loss": 0.1822848916053772,
"num_input_tokens_seen": 1232466,
"step": 373,
"train_runtime": 1711.6892,
"train_tokens_per_second": 720.029
},
{
"epoch": 3.0161290322580645,
"grad_norm": 0.5182167291641235,
"learning_rate": 0.00015949647810012301,
"loss": 0.18206968903541565,
"num_input_tokens_seen": 1235888,
"step": 374,
"train_runtime": 1715.7992,
"train_tokens_per_second": 720.299
},
{
"epoch": 3.024193548387097,
"grad_norm": 0.48984354734420776,
"learning_rate": 0.0001592918274693866,
"loss": 0.16470718383789062,
"num_input_tokens_seen": 1239272,
"step": 375,
"train_runtime": 1719.8357,
"train_tokens_per_second": 720.576
},
{
"epoch": 3.032258064516129,
"grad_norm": 0.528260350227356,
"learning_rate": 0.000159086793166362,
"loss": 0.1563296616077423,
"num_input_tokens_seen": 1242570,
"step": 376,
"train_runtime": 1723.8445,
"train_tokens_per_second": 720.813
},
{
"epoch": 3.0403225806451615,
"grad_norm": 0.5711667537689209,
"learning_rate": 0.00015888137651780845,
"loss": 0.1582183986902237,
"num_input_tokens_seen": 1245794,
"step": 377,
"train_runtime": 1727.6991,
"train_tokens_per_second": 721.071
},
{
"epoch": 3.0483870967741935,
"grad_norm": 0.558810293674469,
"learning_rate": 0.0001586755788529593,
"loss": 0.14666521549224854,
"num_input_tokens_seen": 1249196,
"step": 378,
"train_runtime": 1731.7597,
"train_tokens_per_second": 721.345
},
{
"epoch": 3.056451612903226,
"grad_norm": 0.8101882338523865,
"learning_rate": 0.00015846940150351344,
"loss": 0.18561005592346191,
"num_input_tokens_seen": 1252432,
"step": 379,
"train_runtime": 1735.6067,
"train_tokens_per_second": 721.61
},
{
"epoch": 3.064516129032258,
"grad_norm": 0.7118780612945557,
"learning_rate": 0.00015826284580362668,
"loss": 0.1576988697052002,
"num_input_tokens_seen": 1255716,
"step": 380,
"train_runtime": 1739.5733,
"train_tokens_per_second": 721.853
},
{
"epoch": 3.0725806451612905,
"grad_norm": 0.8138672113418579,
"learning_rate": 0.00015805591308990308,
"loss": 0.18212170898914337,
"num_input_tokens_seen": 1259090,
"step": 381,
"train_runtime": 1743.649,
"train_tokens_per_second": 722.101
},
{
"epoch": 3.0806451612903225,
"grad_norm": 0.7878643274307251,
"learning_rate": 0.00015784860470138633,
"loss": 0.18663600087165833,
"num_input_tokens_seen": 1262366,
"step": 382,
"train_runtime": 1747.5729,
"train_tokens_per_second": 722.354
},
{
"epoch": 3.088709677419355,
"grad_norm": 0.7864635586738586,
"learning_rate": 0.00015764092197955112,
"loss": 0.18056976795196533,
"num_input_tokens_seen": 1265824,
"step": 383,
"train_runtime": 1751.6686,
"train_tokens_per_second": 722.639
},
{
"epoch": 3.096774193548387,
"grad_norm": 0.7926681637763977,
"learning_rate": 0.00015743286626829437,
"loss": 0.12878452241420746,
"num_input_tokens_seen": 1269094,
"step": 384,
"train_runtime": 1755.6443,
"train_tokens_per_second": 722.865
},
{
"epoch": 3.1048387096774195,
"grad_norm": 0.7887319922447205,
"learning_rate": 0.00015722443891392658,
"loss": 0.18509285151958466,
"num_input_tokens_seen": 1272194,
"step": 385,
"train_runtime": 1759.477,
"train_tokens_per_second": 723.052
},
{
"epoch": 3.1129032258064515,
"grad_norm": 0.6997226476669312,
"learning_rate": 0.00015701564126516314,
"loss": 0.16511765122413635,
"num_input_tokens_seen": 1275400,
"step": 386,
"train_runtime": 1763.3141,
"train_tokens_per_second": 723.297
},
{
"epoch": 3.120967741935484,
"grad_norm": 0.8127301931381226,
"learning_rate": 0.00015680647467311557,
"loss": 0.1577507108449936,
"num_input_tokens_seen": 1278726,
"step": 387,
"train_runtime": 1767.3065,
"train_tokens_per_second": 723.545
},
{
"epoch": 3.129032258064516,
"grad_norm": 0.7188698053359985,
"learning_rate": 0.00015659694049128286,
"loss": 0.19617129862308502,
"num_input_tokens_seen": 1281930,
"step": 388,
"train_runtime": 1771.1468,
"train_tokens_per_second": 723.785
},
{
"epoch": 3.1370967741935485,
"grad_norm": 0.4875478744506836,
"learning_rate": 0.0001563870400755425,
"loss": 0.11804057657718658,
"num_input_tokens_seen": 1285208,
"step": 389,
"train_runtime": 1775.1314,
"train_tokens_per_second": 724.007
},
{
"epoch": 3.1451612903225805,
"grad_norm": 0.7870502471923828,
"learning_rate": 0.00015617677478414196,
"loss": 0.17701417207717896,
"num_input_tokens_seen": 1288420,
"step": 390,
"train_runtime": 1778.9788,
"train_tokens_per_second": 724.247
},
{
"epoch": 3.1451612903225805,
"eval_loss": 2.6576552391052246,
"eval_runtime": 10.2889,
"eval_samples_per_second": 5.054,
"eval_steps_per_second": 2.527,
"num_input_tokens_seen": 1288420,
"step": 390
}
],
"logging_steps": 1,
"max_steps": 1240,
"num_input_tokens_seen": 1288420,
"num_train_epochs": 10,
"save_steps": 30,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.929439261884288e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}