Qwen3-4B-Base-SFT-tr5 / trainer_state.json
edbeeching's picture
edbeeching HF Staff
Model save
55054cc verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.6268656716417915,
"eval_steps": 500,
"global_step": 620,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007462686567164179,
"grad_norm": 1.8659797972008254,
"learning_rate": 0.0,
"loss": 0.7986637353897095,
"num_tokens": 940199.0,
"step": 1
},
{
"epoch": 0.014925373134328358,
"grad_norm": 2.067470583174153,
"learning_rate": 5.2631578947368416e-08,
"loss": 0.8278242945671082,
"num_tokens": 1940958.0,
"step": 2
},
{
"epoch": 0.022388059701492536,
"grad_norm": 1.8137442955270493,
"learning_rate": 1.0526315789473683e-07,
"loss": 0.7961194515228271,
"num_tokens": 2857380.0,
"step": 3
},
{
"epoch": 0.029850746268656716,
"grad_norm": 1.8752542747158565,
"learning_rate": 1.5789473684210525e-07,
"loss": 0.7988173961639404,
"num_tokens": 3696403.0,
"step": 4
},
{
"epoch": 0.03731343283582089,
"grad_norm": 1.9383021105435863,
"learning_rate": 2.1052631578947366e-07,
"loss": 0.8283753395080566,
"num_tokens": 4528235.0,
"step": 5
},
{
"epoch": 0.04477611940298507,
"grad_norm": 1.9512483400491862,
"learning_rate": 2.631578947368421e-07,
"loss": 0.8254790902137756,
"num_tokens": 5554672.0,
"step": 6
},
{
"epoch": 0.05223880597014925,
"grad_norm": 1.8813572711890862,
"learning_rate": 3.157894736842105e-07,
"loss": 0.8230706453323364,
"num_tokens": 6423132.0,
"step": 7
},
{
"epoch": 0.05970149253731343,
"grad_norm": 1.9448895375733437,
"learning_rate": 3.684210526315789e-07,
"loss": 0.8051227331161499,
"num_tokens": 7201644.0,
"step": 8
},
{
"epoch": 0.06716417910447761,
"grad_norm": 1.8922387607703381,
"learning_rate": 4.2105263157894733e-07,
"loss": 0.7542356252670288,
"num_tokens": 8128715.0,
"step": 9
},
{
"epoch": 0.07462686567164178,
"grad_norm": 1.9842025446380493,
"learning_rate": 4.7368421052631574e-07,
"loss": 0.8522481918334961,
"num_tokens": 9074027.0,
"step": 10
},
{
"epoch": 0.08208955223880597,
"grad_norm": 1.7964968435378388,
"learning_rate": 5.263157894736842e-07,
"loss": 0.7835813760757446,
"num_tokens": 9950641.0,
"step": 11
},
{
"epoch": 0.08955223880597014,
"grad_norm": 1.8759196623765004,
"learning_rate": 5.789473684210526e-07,
"loss": 0.8266638517379761,
"num_tokens": 10885057.0,
"step": 12
},
{
"epoch": 0.09701492537313433,
"grad_norm": 1.739475404373344,
"learning_rate": 6.31578947368421e-07,
"loss": 0.8051838874816895,
"num_tokens": 11697963.0,
"step": 13
},
{
"epoch": 0.1044776119402985,
"grad_norm": 1.6356821751063044,
"learning_rate": 6.842105263157895e-07,
"loss": 0.7847919464111328,
"num_tokens": 12632602.0,
"step": 14
},
{
"epoch": 0.11194029850746269,
"grad_norm": 1.6314037268606378,
"learning_rate": 7.368421052631578e-07,
"loss": 0.775245189666748,
"num_tokens": 13568889.0,
"step": 15
},
{
"epoch": 0.11940298507462686,
"grad_norm": 1.6105524213156879,
"learning_rate": 7.894736842105263e-07,
"loss": 0.8013657331466675,
"num_tokens": 14534242.0,
"step": 16
},
{
"epoch": 0.12686567164179105,
"grad_norm": 1.5779316833603265,
"learning_rate": 8.421052631578947e-07,
"loss": 0.80766761302948,
"num_tokens": 15435946.0,
"step": 17
},
{
"epoch": 0.13432835820895522,
"grad_norm": 1.5079351322768313,
"learning_rate": 8.947368421052631e-07,
"loss": 0.7658109664916992,
"num_tokens": 16352267.0,
"step": 18
},
{
"epoch": 0.1417910447761194,
"grad_norm": 1.320308981062678,
"learning_rate": 9.473684210526315e-07,
"loss": 0.7778770327568054,
"num_tokens": 17277422.0,
"step": 19
},
{
"epoch": 0.14925373134328357,
"grad_norm": 1.4101803404332138,
"learning_rate": 1e-06,
"loss": 0.8158027529716492,
"num_tokens": 18270697.0,
"step": 20
},
{
"epoch": 0.15671641791044777,
"grad_norm": 1.3401873553364563,
"learning_rate": 9.999938520216342e-07,
"loss": 0.7766833305358887,
"num_tokens": 19308555.0,
"step": 21
},
{
"epoch": 0.16417910447761194,
"grad_norm": 1.27445266639497,
"learning_rate": 9.999754082545259e-07,
"loss": 0.7421952486038208,
"num_tokens": 20162797.0,
"step": 22
},
{
"epoch": 0.17164179104477612,
"grad_norm": 1.3912932427121052,
"learning_rate": 9.999446692026396e-07,
"loss": 0.7800503969192505,
"num_tokens": 20981106.0,
"step": 23
},
{
"epoch": 0.1791044776119403,
"grad_norm": 1.2963982894236503,
"learning_rate": 9.999016357058995e-07,
"loss": 0.766775369644165,
"num_tokens": 21858000.0,
"step": 24
},
{
"epoch": 0.1865671641791045,
"grad_norm": 1.140051976839368,
"learning_rate": 9.998463089401678e-07,
"loss": 0.7179380059242249,
"num_tokens": 22793285.0,
"step": 25
},
{
"epoch": 0.19402985074626866,
"grad_norm": 1.1651471102131281,
"learning_rate": 9.997786904172126e-07,
"loss": 0.810413122177124,
"num_tokens": 23723801.0,
"step": 26
},
{
"epoch": 0.20149253731343283,
"grad_norm": 1.0407108116745094,
"learning_rate": 9.996987819846655e-07,
"loss": 0.7446407079696655,
"num_tokens": 24725740.0,
"step": 27
},
{
"epoch": 0.208955223880597,
"grad_norm": 0.9942554431409824,
"learning_rate": 9.996065858259727e-07,
"loss": 0.7915131449699402,
"num_tokens": 25730725.0,
"step": 28
},
{
"epoch": 0.21641791044776118,
"grad_norm": 1.0234340889029334,
"learning_rate": 9.995021044603342e-07,
"loss": 0.7581333518028259,
"num_tokens": 26557776.0,
"step": 29
},
{
"epoch": 0.22388059701492538,
"grad_norm": 1.0332669286266374,
"learning_rate": 9.993853407426352e-07,
"loss": 0.7365682125091553,
"num_tokens": 27504251.0,
"step": 30
},
{
"epoch": 0.23134328358208955,
"grad_norm": 1.0634534407808287,
"learning_rate": 9.99256297863368e-07,
"loss": 0.7191506624221802,
"num_tokens": 28534541.0,
"step": 31
},
{
"epoch": 0.23880597014925373,
"grad_norm": 1.1818384387181422,
"learning_rate": 9.99114979348545e-07,
"loss": 0.7689279317855835,
"num_tokens": 29341502.0,
"step": 32
},
{
"epoch": 0.2462686567164179,
"grad_norm": 1.0143867857533606,
"learning_rate": 9.989613890596033e-07,
"loss": 0.7768257856369019,
"num_tokens": 30211822.0,
"step": 33
},
{
"epoch": 0.2537313432835821,
"grad_norm": 0.9912608655279589,
"learning_rate": 9.987955311932968e-07,
"loss": 0.7552160024642944,
"num_tokens": 31102775.0,
"step": 34
},
{
"epoch": 0.26119402985074625,
"grad_norm": 0.859671875219598,
"learning_rate": 9.986174102815837e-07,
"loss": 0.7417880892753601,
"num_tokens": 31898227.0,
"step": 35
},
{
"epoch": 0.26865671641791045,
"grad_norm": 0.8467136154890645,
"learning_rate": 9.984270311915018e-07,
"loss": 0.7220484614372253,
"num_tokens": 32541892.0,
"step": 36
},
{
"epoch": 0.27611940298507465,
"grad_norm": 0.968520072067917,
"learning_rate": 9.982243991250357e-07,
"loss": 0.7436271905899048,
"num_tokens": 33543040.0,
"step": 37
},
{
"epoch": 0.2835820895522388,
"grad_norm": 0.7117412176092366,
"learning_rate": 9.980095196189748e-07,
"loss": 0.7281963229179382,
"num_tokens": 34505224.0,
"step": 38
},
{
"epoch": 0.291044776119403,
"grad_norm": 0.7282443922942368,
"learning_rate": 9.977823985447613e-07,
"loss": 0.7709681987762451,
"num_tokens": 35411826.0,
"step": 39
},
{
"epoch": 0.29850746268656714,
"grad_norm": 0.8081380042496161,
"learning_rate": 9.975430421083305e-07,
"loss": 0.761425256729126,
"num_tokens": 36307345.0,
"step": 40
},
{
"epoch": 0.30597014925373134,
"grad_norm": 0.8423198034792783,
"learning_rate": 9.972914568499411e-07,
"loss": 0.7312315106391907,
"num_tokens": 37196875.0,
"step": 41
},
{
"epoch": 0.31343283582089554,
"grad_norm": 0.8739486151254683,
"learning_rate": 9.970276496439966e-07,
"loss": 0.7070371508598328,
"num_tokens": 38112193.0,
"step": 42
},
{
"epoch": 0.3208955223880597,
"grad_norm": 0.8222714239323922,
"learning_rate": 9.967516276988567e-07,
"loss": 0.7004337310791016,
"num_tokens": 38855918.0,
"step": 43
},
{
"epoch": 0.3283582089552239,
"grad_norm": 0.6714287856764727,
"learning_rate": 9.964633985566412e-07,
"loss": 0.7193351984024048,
"num_tokens": 39833215.0,
"step": 44
},
{
"epoch": 0.3358208955223881,
"grad_norm": 0.6336781599517638,
"learning_rate": 9.961629700930235e-07,
"loss": 0.7344927787780762,
"num_tokens": 40760145.0,
"step": 45
},
{
"epoch": 0.34328358208955223,
"grad_norm": 0.669269067988794,
"learning_rate": 9.958503505170155e-07,
"loss": 0.7277801632881165,
"num_tokens": 41745749.0,
"step": 46
},
{
"epoch": 0.35074626865671643,
"grad_norm": 0.7317041279195701,
"learning_rate": 9.95525548370744e-07,
"loss": 0.7005234956741333,
"num_tokens": 42686630.0,
"step": 47
},
{
"epoch": 0.3582089552238806,
"grad_norm": 0.7721535484599115,
"learning_rate": 9.95188572529215e-07,
"loss": 0.7193202376365662,
"num_tokens": 43510586.0,
"step": 48
},
{
"epoch": 0.3656716417910448,
"grad_norm": 0.7041253506550509,
"learning_rate": 9.948394322000746e-07,
"loss": 0.6881219744682312,
"num_tokens": 44362248.0,
"step": 49
},
{
"epoch": 0.373134328358209,
"grad_norm": 0.636240363667009,
"learning_rate": 9.944781369233543e-07,
"loss": 0.6522014141082764,
"num_tokens": 45216722.0,
"step": 50
},
{
"epoch": 0.3805970149253731,
"grad_norm": 0.6272094853443818,
"learning_rate": 9.941046965712122e-07,
"loss": 0.6842180490493774,
"num_tokens": 46010142.0,
"step": 51
},
{
"epoch": 0.3880597014925373,
"grad_norm": 0.5645071953625206,
"learning_rate": 9.937191213476625e-07,
"loss": 0.6692793369293213,
"num_tokens": 46858670.0,
"step": 52
},
{
"epoch": 0.39552238805970147,
"grad_norm": 0.585837329746578,
"learning_rate": 9.933214217882971e-07,
"loss": 0.7204340696334839,
"num_tokens": 47836905.0,
"step": 53
},
{
"epoch": 0.40298507462686567,
"grad_norm": 0.5938197808350745,
"learning_rate": 9.929116087599972e-07,
"loss": 0.7186766266822815,
"num_tokens": 48836237.0,
"step": 54
},
{
"epoch": 0.41044776119402987,
"grad_norm": 0.5264192209085824,
"learning_rate": 9.924896934606364e-07,
"loss": 0.7225839495658875,
"num_tokens": 49860153.0,
"step": 55
},
{
"epoch": 0.417910447761194,
"grad_norm": 0.5025994710035754,
"learning_rate": 9.920556874187757e-07,
"loss": 0.6711542010307312,
"num_tokens": 50786110.0,
"step": 56
},
{
"epoch": 0.4253731343283582,
"grad_norm": 0.43381197403497257,
"learning_rate": 9.91609602493347e-07,
"loss": 0.6544876098632812,
"num_tokens": 51790390.0,
"step": 57
},
{
"epoch": 0.43283582089552236,
"grad_norm": 0.43233023140164006,
"learning_rate": 9.911514508733306e-07,
"loss": 0.7029759883880615,
"num_tokens": 52742397.0,
"step": 58
},
{
"epoch": 0.44029850746268656,
"grad_norm": 0.43910093495037306,
"learning_rate": 9.906812450774207e-07,
"loss": 0.7200834155082703,
"num_tokens": 53673114.0,
"step": 59
},
{
"epoch": 0.44776119402985076,
"grad_norm": 0.44127209489849284,
"learning_rate": 9.90198997953684e-07,
"loss": 0.6370296478271484,
"num_tokens": 54566889.0,
"step": 60
},
{
"epoch": 0.4552238805970149,
"grad_norm": 0.45096698239872907,
"learning_rate": 9.89704722679209e-07,
"loss": 0.6861921548843384,
"num_tokens": 55460491.0,
"step": 61
},
{
"epoch": 0.4626865671641791,
"grad_norm": 0.4231809219141582,
"learning_rate": 9.89198432759746e-07,
"loss": 0.6846483945846558,
"num_tokens": 56520990.0,
"step": 62
},
{
"epoch": 0.4701492537313433,
"grad_norm": 0.4911103394390683,
"learning_rate": 9.886801420293363e-07,
"loss": 0.6806150674819946,
"num_tokens": 57422206.0,
"step": 63
},
{
"epoch": 0.47761194029850745,
"grad_norm": 0.4523240789905204,
"learning_rate": 9.881498646499368e-07,
"loss": 0.7077186107635498,
"num_tokens": 58260720.0,
"step": 64
},
{
"epoch": 0.48507462686567165,
"grad_norm": 0.41521834156751597,
"learning_rate": 9.876076151110313e-07,
"loss": 0.696556806564331,
"num_tokens": 59123617.0,
"step": 65
},
{
"epoch": 0.4925373134328358,
"grad_norm": 0.44090049904740325,
"learning_rate": 9.870534082292349e-07,
"loss": 0.6695712804794312,
"num_tokens": 60033505.0,
"step": 66
},
{
"epoch": 0.5,
"grad_norm": 0.3986636896953578,
"learning_rate": 9.864872591478893e-07,
"loss": 0.6385202407836914,
"num_tokens": 60974452.0,
"step": 67
},
{
"epoch": 0.5074626865671642,
"grad_norm": 0.36309454275310177,
"learning_rate": 9.859091833366496e-07,
"loss": 0.6627390384674072,
"num_tokens": 61913977.0,
"step": 68
},
{
"epoch": 0.5149253731343284,
"grad_norm": 0.39014943112878603,
"learning_rate": 9.853191965910605e-07,
"loss": 0.6990819573402405,
"num_tokens": 62800879.0,
"step": 69
},
{
"epoch": 0.5223880597014925,
"grad_norm": 0.365989388161065,
"learning_rate": 9.84717315032125e-07,
"loss": 0.6741257309913635,
"num_tokens": 63823183.0,
"step": 70
},
{
"epoch": 0.5298507462686567,
"grad_norm": 0.367640955997548,
"learning_rate": 9.841035551058648e-07,
"loss": 0.657660722732544,
"num_tokens": 64700087.0,
"step": 71
},
{
"epoch": 0.5373134328358209,
"grad_norm": 0.38259984541818554,
"learning_rate": 9.834779335828697e-07,
"loss": 0.6268120408058167,
"num_tokens": 65533415.0,
"step": 72
},
{
"epoch": 0.5447761194029851,
"grad_norm": 0.4459381628745124,
"learning_rate": 9.828404675578403e-07,
"loss": 0.6166980266571045,
"num_tokens": 66411589.0,
"step": 73
},
{
"epoch": 0.5522388059701493,
"grad_norm": 0.3964961096159446,
"learning_rate": 9.821911744491202e-07,
"loss": 0.6667238473892212,
"num_tokens": 67203675.0,
"step": 74
},
{
"epoch": 0.5597014925373134,
"grad_norm": 0.3942585728115673,
"learning_rate": 9.815300719982202e-07,
"loss": 0.6620233058929443,
"num_tokens": 68056574.0,
"step": 75
},
{
"epoch": 0.5671641791044776,
"grad_norm": 0.35812083783608406,
"learning_rate": 9.808571782693343e-07,
"loss": 0.6339540481567383,
"num_tokens": 68907426.0,
"step": 76
},
{
"epoch": 0.5746268656716418,
"grad_norm": 0.3369082279659881,
"learning_rate": 9.801725116488449e-07,
"loss": 0.6345670819282532,
"num_tokens": 69817179.0,
"step": 77
},
{
"epoch": 0.582089552238806,
"grad_norm": 0.4132547384225159,
"learning_rate": 9.794760908448213e-07,
"loss": 0.6722534894943237,
"num_tokens": 70803003.0,
"step": 78
},
{
"epoch": 0.5895522388059702,
"grad_norm": 0.3627266662239884,
"learning_rate": 9.78767934886508e-07,
"loss": 0.6158405542373657,
"num_tokens": 71708353.0,
"step": 79
},
{
"epoch": 0.5970149253731343,
"grad_norm": 0.3555505789377954,
"learning_rate": 9.78048063123805e-07,
"loss": 0.6479436159133911,
"num_tokens": 72587706.0,
"step": 80
},
{
"epoch": 0.6044776119402985,
"grad_norm": 0.36801010623436975,
"learning_rate": 9.773164952267392e-07,
"loss": 0.6247404217720032,
"num_tokens": 73605832.0,
"step": 81
},
{
"epoch": 0.6119402985074627,
"grad_norm": 0.4986298126613671,
"learning_rate": 9.765732511849267e-07,
"loss": 0.6385573744773865,
"num_tokens": 74512496.0,
"step": 82
},
{
"epoch": 0.6194029850746269,
"grad_norm": 0.3500792677565209,
"learning_rate": 9.758183513070266e-07,
"loss": 0.6781474351882935,
"num_tokens": 75428481.0,
"step": 83
},
{
"epoch": 0.6268656716417911,
"grad_norm": 0.39093330569733553,
"learning_rate": 9.750518162201857e-07,
"loss": 0.6386494636535645,
"num_tokens": 76292706.0,
"step": 84
},
{
"epoch": 0.6343283582089553,
"grad_norm": 0.4070936194536431,
"learning_rate": 9.742736668694758e-07,
"loss": 0.6180363893508911,
"num_tokens": 77209633.0,
"step": 85
},
{
"epoch": 0.6417910447761194,
"grad_norm": 0.3354352334817167,
"learning_rate": 9.734839245173211e-07,
"loss": 0.6163570880889893,
"num_tokens": 78063420.0,
"step": 86
},
{
"epoch": 0.6492537313432836,
"grad_norm": 0.45211158839386173,
"learning_rate": 9.726826107429168e-07,
"loss": 0.6268313527107239,
"num_tokens": 78870397.0,
"step": 87
},
{
"epoch": 0.6567164179104478,
"grad_norm": 0.31941448751910695,
"learning_rate": 9.718697474416388e-07,
"loss": 0.6327146291732788,
"num_tokens": 79871666.0,
"step": 88
},
{
"epoch": 0.664179104477612,
"grad_norm": 0.3722172382477918,
"learning_rate": 9.71045356824448e-07,
"loss": 0.6110676527023315,
"num_tokens": 80712206.0,
"step": 89
},
{
"epoch": 0.6716417910447762,
"grad_norm": 0.3465587426296506,
"learning_rate": 9.7020946141728e-07,
"loss": 0.5954413414001465,
"num_tokens": 81538216.0,
"step": 90
},
{
"epoch": 0.6791044776119403,
"grad_norm": 0.30862586857277025,
"learning_rate": 9.693620840604325e-07,
"loss": 0.6106799840927124,
"num_tokens": 82585839.0,
"step": 91
},
{
"epoch": 0.6865671641791045,
"grad_norm": 0.34576661664754443,
"learning_rate": 9.685032479079392e-07,
"loss": 0.6184056997299194,
"num_tokens": 83427449.0,
"step": 92
},
{
"epoch": 0.6940298507462687,
"grad_norm": 0.30968028932263886,
"learning_rate": 9.676329764269383e-07,
"loss": 0.6404486894607544,
"num_tokens": 84449388.0,
"step": 93
},
{
"epoch": 0.7014925373134329,
"grad_norm": 0.3132280475142172,
"learning_rate": 9.667512933970313e-07,
"loss": 0.6027534008026123,
"num_tokens": 85256514.0,
"step": 94
},
{
"epoch": 0.7089552238805971,
"grad_norm": 0.37723622679137114,
"learning_rate": 9.658582229096319e-07,
"loss": 0.636467695236206,
"num_tokens": 86165960.0,
"step": 95
},
{
"epoch": 0.7164179104477612,
"grad_norm": 0.3415033431478265,
"learning_rate": 9.649537893673095e-07,
"loss": 0.6198180317878723,
"num_tokens": 86982659.0,
"step": 96
},
{
"epoch": 0.7238805970149254,
"grad_norm": 0.38663387244532404,
"learning_rate": 9.640380174831209e-07,
"loss": 0.6216307878494263,
"num_tokens": 87931000.0,
"step": 97
},
{
"epoch": 0.7313432835820896,
"grad_norm": 0.32790162197293926,
"learning_rate": 9.631109322799361e-07,
"loss": 0.6376453638076782,
"num_tokens": 88689701.0,
"step": 98
},
{
"epoch": 0.7388059701492538,
"grad_norm": 0.30426811432679324,
"learning_rate": 9.621725590897543e-07,
"loss": 0.6182718276977539,
"num_tokens": 89547645.0,
"step": 99
},
{
"epoch": 0.746268656716418,
"grad_norm": 0.3319556928745246,
"learning_rate": 9.61222923553011e-07,
"loss": 0.6192991733551025,
"num_tokens": 90297517.0,
"step": 100
},
{
"epoch": 0.753731343283582,
"grad_norm": 0.3063539981854288,
"learning_rate": 9.602620516178788e-07,
"loss": 0.6192951202392578,
"num_tokens": 91186856.0,
"step": 101
},
{
"epoch": 0.7611940298507462,
"grad_norm": 0.3353221957958766,
"learning_rate": 9.592899695395568e-07,
"loss": 0.6191784739494324,
"num_tokens": 91987232.0,
"step": 102
},
{
"epoch": 0.7686567164179104,
"grad_norm": 2.4726215379945016,
"learning_rate": 9.583067038795544e-07,
"loss": 0.6689252257347107,
"num_tokens": 92898696.0,
"step": 103
},
{
"epoch": 0.7761194029850746,
"grad_norm": 0.3683317561657326,
"learning_rate": 9.57312281504965e-07,
"loss": 0.6164358854293823,
"num_tokens": 93791120.0,
"step": 104
},
{
"epoch": 0.7835820895522388,
"grad_norm": 0.31142782603302166,
"learning_rate": 9.563067295877318e-07,
"loss": 0.5814804434776306,
"num_tokens": 94639289.0,
"step": 105
},
{
"epoch": 0.7910447761194029,
"grad_norm": 0.5777785271285245,
"learning_rate": 9.552900756039056e-07,
"loss": 0.6628624200820923,
"num_tokens": 95400207.0,
"step": 106
},
{
"epoch": 0.7985074626865671,
"grad_norm": 0.31405569146967405,
"learning_rate": 9.54262347332894e-07,
"loss": 0.647003710269928,
"num_tokens": 96227104.0,
"step": 107
},
{
"epoch": 0.8059701492537313,
"grad_norm": 0.3000528898105912,
"learning_rate": 9.532235728567022e-07,
"loss": 0.6015387177467346,
"num_tokens": 97056588.0,
"step": 108
},
{
"epoch": 0.8134328358208955,
"grad_norm": 0.3033934043892588,
"learning_rate": 9.521737805591661e-07,
"loss": 0.629927396774292,
"num_tokens": 97944111.0,
"step": 109
},
{
"epoch": 0.8208955223880597,
"grad_norm": 0.29345271513653554,
"learning_rate": 9.511129991251755e-07,
"loss": 0.5817909836769104,
"num_tokens": 98816920.0,
"step": 110
},
{
"epoch": 0.8283582089552238,
"grad_norm": 0.3148352988531031,
"learning_rate": 9.500412575398922e-07,
"loss": 0.6288615465164185,
"num_tokens": 99773832.0,
"step": 111
},
{
"epoch": 0.835820895522388,
"grad_norm": 0.3225302890726275,
"learning_rate": 9.489585850879564e-07,
"loss": 0.6282119750976562,
"num_tokens": 100805832.0,
"step": 112
},
{
"epoch": 0.8432835820895522,
"grad_norm": 0.3125344123235252,
"learning_rate": 9.478650113526874e-07,
"loss": 0.6161372661590576,
"num_tokens": 101747939.0,
"step": 113
},
{
"epoch": 0.8507462686567164,
"grad_norm": 0.3022490350310229,
"learning_rate": 9.467605662152745e-07,
"loss": 0.6462452411651611,
"num_tokens": 102733715.0,
"step": 114
},
{
"epoch": 0.8582089552238806,
"grad_norm": 0.29284676964407896,
"learning_rate": 9.456452798539616e-07,
"loss": 0.5786178112030029,
"num_tokens": 103577969.0,
"step": 115
},
{
"epoch": 0.8656716417910447,
"grad_norm": 0.31081858681339425,
"learning_rate": 9.445191827432215e-07,
"loss": 0.6079792380332947,
"num_tokens": 104507837.0,
"step": 116
},
{
"epoch": 0.8731343283582089,
"grad_norm": 0.305140531974358,
"learning_rate": 9.433823056529241e-07,
"loss": 0.6422327160835266,
"num_tokens": 105482901.0,
"step": 117
},
{
"epoch": 0.8805970149253731,
"grad_norm": 0.8470132754236691,
"learning_rate": 9.422346796474949e-07,
"loss": 0.6116156578063965,
"num_tokens": 106441176.0,
"step": 118
},
{
"epoch": 0.8880597014925373,
"grad_norm": 0.3062865057157625,
"learning_rate": 9.410763360850665e-07,
"loss": 0.6365537643432617,
"num_tokens": 107265870.0,
"step": 119
},
{
"epoch": 0.8955223880597015,
"grad_norm": 0.3112284745186021,
"learning_rate": 9.399073066166217e-07,
"loss": 0.6294253468513489,
"num_tokens": 108146690.0,
"step": 120
},
{
"epoch": 0.9029850746268657,
"grad_norm": 0.3489293940518192,
"learning_rate": 9.38727623185129e-07,
"loss": 0.6304266452789307,
"num_tokens": 109034402.0,
"step": 121
},
{
"epoch": 0.9104477611940298,
"grad_norm": 0.30735356689099647,
"learning_rate": 9.375373180246696e-07,
"loss": 0.6445657014846802,
"num_tokens": 109989572.0,
"step": 122
},
{
"epoch": 0.917910447761194,
"grad_norm": 0.30036163756207646,
"learning_rate": 9.36336423659556e-07,
"loss": 0.6575721502304077,
"num_tokens": 110969334.0,
"step": 123
},
{
"epoch": 0.9253731343283582,
"grad_norm": 0.3150563557302255,
"learning_rate": 9.351249729034441e-07,
"loss": 0.6350916624069214,
"num_tokens": 111844990.0,
"step": 124
},
{
"epoch": 0.9328358208955224,
"grad_norm": 0.42959935293984475,
"learning_rate": 9.339029988584364e-07,
"loss": 0.6106451153755188,
"num_tokens": 112800888.0,
"step": 125
},
{
"epoch": 0.9402985074626866,
"grad_norm": 0.295731574777322,
"learning_rate": 9.326705349141772e-07,
"loss": 0.6363998651504517,
"num_tokens": 113857610.0,
"step": 126
},
{
"epoch": 0.9477611940298507,
"grad_norm": 0.2886185648925166,
"learning_rate": 9.314276147469408e-07,
"loss": 0.6078730225563049,
"num_tokens": 114800904.0,
"step": 127
},
{
"epoch": 0.9552238805970149,
"grad_norm": 0.2920263616232179,
"learning_rate": 9.301742723187104e-07,
"loss": 0.6083230972290039,
"num_tokens": 115759913.0,
"step": 128
},
{
"epoch": 0.9626865671641791,
"grad_norm": 0.44738555848421335,
"learning_rate": 9.289105418762512e-07,
"loss": 0.6401327848434448,
"num_tokens": 116624191.0,
"step": 129
},
{
"epoch": 0.9701492537313433,
"grad_norm": 0.31658939252581325,
"learning_rate": 9.276364579501741e-07,
"loss": 0.6194320321083069,
"num_tokens": 117499418.0,
"step": 130
},
{
"epoch": 0.9776119402985075,
"grad_norm": 0.3550447231230164,
"learning_rate": 9.263520553539919e-07,
"loss": 0.5973168611526489,
"num_tokens": 118329517.0,
"step": 131
},
{
"epoch": 0.9850746268656716,
"grad_norm": 0.3461631737220481,
"learning_rate": 9.250573691831686e-07,
"loss": 0.6246321201324463,
"num_tokens": 119221343.0,
"step": 132
},
{
"epoch": 0.9925373134328358,
"grad_norm": 0.29903733473091415,
"learning_rate": 9.237524348141599e-07,
"loss": 0.6080079078674316,
"num_tokens": 120123659.0,
"step": 133
},
{
"epoch": 1.0,
"grad_norm": 0.30082599081337075,
"learning_rate": 9.224372879034471e-07,
"loss": 0.610882043838501,
"num_tokens": 121054976.0,
"step": 134
},
{
"epoch": 1.007462686567164,
"grad_norm": 0.3306994076014225,
"learning_rate": 9.211119643865625e-07,
"loss": 0.617473840713501,
"num_tokens": 121995409.0,
"step": 135
},
{
"epoch": 1.0149253731343284,
"grad_norm": 0.4118286344591086,
"learning_rate": 9.197765004771074e-07,
"loss": 0.6029432415962219,
"num_tokens": 122823226.0,
"step": 136
},
{
"epoch": 1.0223880597014925,
"grad_norm": 0.33422438069467975,
"learning_rate": 9.184309326657625e-07,
"loss": 0.5911135077476501,
"num_tokens": 123660597.0,
"step": 137
},
{
"epoch": 1.0298507462686568,
"grad_norm": 0.3051145695551814,
"learning_rate": 9.17075297719292e-07,
"loss": 0.5806124806404114,
"num_tokens": 124554146.0,
"step": 138
},
{
"epoch": 1.037313432835821,
"grad_norm": 0.32826641207537816,
"learning_rate": 9.157096326795367e-07,
"loss": 0.6078518629074097,
"num_tokens": 125332236.0,
"step": 139
},
{
"epoch": 1.044776119402985,
"grad_norm": 0.2711859183967077,
"learning_rate": 9.143339748624042e-07,
"loss": 0.5872972011566162,
"num_tokens": 126310236.0,
"step": 140
},
{
"epoch": 1.0522388059701493,
"grad_norm": 0.29219827505795287,
"learning_rate": 9.129483618568477e-07,
"loss": 0.5903403759002686,
"num_tokens": 127218706.0,
"step": 141
},
{
"epoch": 1.0597014925373134,
"grad_norm": 0.31682373789781537,
"learning_rate": 9.115528315238396e-07,
"loss": 0.6067441701889038,
"num_tokens": 128057825.0,
"step": 142
},
{
"epoch": 1.0671641791044777,
"grad_norm": 0.290526593989316,
"learning_rate": 9.101474219953366e-07,
"loss": 0.5909883975982666,
"num_tokens": 128955736.0,
"step": 143
},
{
"epoch": 1.0746268656716418,
"grad_norm": 0.33126907763025953,
"learning_rate": 9.087321716732382e-07,
"loss": 0.6024787425994873,
"num_tokens": 129777788.0,
"step": 144
},
{
"epoch": 1.0820895522388059,
"grad_norm": 0.2848827551783459,
"learning_rate": 9.073071192283374e-07,
"loss": 0.6014402508735657,
"num_tokens": 130659960.0,
"step": 145
},
{
"epoch": 1.0895522388059702,
"grad_norm": 0.32145826962059754,
"learning_rate": 9.058723035992631e-07,
"loss": 0.5986078977584839,
"num_tokens": 131550221.0,
"step": 146
},
{
"epoch": 1.0970149253731343,
"grad_norm": 0.3517188747706852,
"learning_rate": 9.044277639914176e-07,
"loss": 0.6086349487304688,
"num_tokens": 132486469.0,
"step": 147
},
{
"epoch": 1.1044776119402986,
"grad_norm": 0.2956143436262153,
"learning_rate": 9.029735398759043e-07,
"loss": 0.5634535551071167,
"num_tokens": 133366950.0,
"step": 148
},
{
"epoch": 1.1119402985074627,
"grad_norm": 0.3478882615131154,
"learning_rate": 9.015096709884492e-07,
"loss": 0.5996171236038208,
"num_tokens": 134285043.0,
"step": 149
},
{
"epoch": 1.1194029850746268,
"grad_norm": 0.2966927318472822,
"learning_rate": 9.000361973283158e-07,
"loss": 0.5650948882102966,
"num_tokens": 135136228.0,
"step": 150
},
{
"epoch": 1.126865671641791,
"grad_norm": 0.29045922223177134,
"learning_rate": 8.985531591572115e-07,
"loss": 0.583465576171875,
"num_tokens": 136013129.0,
"step": 151
},
{
"epoch": 1.1343283582089552,
"grad_norm": 0.29420507308387045,
"learning_rate": 8.970605969981879e-07,
"loss": 0.6276301741600037,
"num_tokens": 136978716.0,
"step": 152
},
{
"epoch": 1.1417910447761195,
"grad_norm": 0.28206954373320775,
"learning_rate": 8.955585516345332e-07,
"loss": 0.5884029865264893,
"num_tokens": 137957110.0,
"step": 153
},
{
"epoch": 1.1492537313432836,
"grad_norm": 0.2893761158835339,
"learning_rate": 8.940470641086581e-07,
"loss": 0.58906090259552,
"num_tokens": 138894209.0,
"step": 154
},
{
"epoch": 1.1567164179104479,
"grad_norm": 0.3060099897084387,
"learning_rate": 8.925261757209743e-07,
"loss": 0.6283571720123291,
"num_tokens": 139925878.0,
"step": 155
},
{
"epoch": 1.164179104477612,
"grad_norm": 0.3245985255415534,
"learning_rate": 8.909959280287655e-07,
"loss": 0.5938559770584106,
"num_tokens": 140844266.0,
"step": 156
},
{
"epoch": 1.171641791044776,
"grad_norm": 0.30881864003106135,
"learning_rate": 8.894563628450532e-07,
"loss": 0.5916883945465088,
"num_tokens": 141685264.0,
"step": 157
},
{
"epoch": 1.1791044776119404,
"grad_norm": 0.3596602053150199,
"learning_rate": 8.879075222374521e-07,
"loss": 0.563378095626831,
"num_tokens": 142607439.0,
"step": 158
},
{
"epoch": 1.1865671641791045,
"grad_norm": 0.29872513574720144,
"learning_rate": 8.863494485270226e-07,
"loss": 0.5588960647583008,
"num_tokens": 143442522.0,
"step": 159
},
{
"epoch": 1.1940298507462686,
"grad_norm": 0.35553422720166944,
"learning_rate": 8.847821842871136e-07,
"loss": 0.6027117967605591,
"num_tokens": 144356683.0,
"step": 160
},
{
"epoch": 1.2014925373134329,
"grad_norm": 0.35962639938666846,
"learning_rate": 8.832057723421988e-07,
"loss": 0.5953375101089478,
"num_tokens": 145164747.0,
"step": 161
},
{
"epoch": 1.208955223880597,
"grad_norm": 0.3320369471002194,
"learning_rate": 8.816202557667075e-07,
"loss": 0.5746063590049744,
"num_tokens": 145974438.0,
"step": 162
},
{
"epoch": 1.2164179104477613,
"grad_norm": 0.28069224857904806,
"learning_rate": 8.800256778838467e-07,
"loss": 0.5617422461509705,
"num_tokens": 146897553.0,
"step": 163
},
{
"epoch": 1.2238805970149254,
"grad_norm": 0.3248983801962701,
"learning_rate": 8.784220822644178e-07,
"loss": 0.6217033267021179,
"num_tokens": 147706235.0,
"step": 164
},
{
"epoch": 1.2313432835820897,
"grad_norm": 0.2887208674047822,
"learning_rate": 8.768095127256261e-07,
"loss": 0.5414159297943115,
"num_tokens": 148638477.0,
"step": 165
},
{
"epoch": 1.2388059701492538,
"grad_norm": 0.32438535835021254,
"learning_rate": 8.751880133298834e-07,
"loss": 0.5938442945480347,
"num_tokens": 149598765.0,
"step": 166
},
{
"epoch": 1.2462686567164178,
"grad_norm": 0.2894650573081879,
"learning_rate": 8.735576283836037e-07,
"loss": 0.6117956638336182,
"num_tokens": 150499813.0,
"step": 167
},
{
"epoch": 1.2537313432835822,
"grad_norm": 0.3397567950477019,
"learning_rate": 8.719184024359934e-07,
"loss": 0.5914928317070007,
"num_tokens": 151406909.0,
"step": 168
},
{
"epoch": 1.2611940298507462,
"grad_norm": 0.290993077313297,
"learning_rate": 8.70270380277833e-07,
"loss": 0.6332953572273254,
"num_tokens": 152358615.0,
"step": 169
},
{
"epoch": 1.2686567164179103,
"grad_norm": 0.30309197520313697,
"learning_rate": 8.686136069402541e-07,
"loss": 0.5448141694068909,
"num_tokens": 153140245.0,
"step": 170
},
{
"epoch": 1.2761194029850746,
"grad_norm": 0.2861452466873225,
"learning_rate": 8.669481276935083e-07,
"loss": 0.6137048602104187,
"num_tokens": 154065403.0,
"step": 171
},
{
"epoch": 1.2835820895522387,
"grad_norm": 0.2818788428038251,
"learning_rate": 8.652739880457308e-07,
"loss": 0.606778621673584,
"num_tokens": 155004060.0,
"step": 172
},
{
"epoch": 1.291044776119403,
"grad_norm": 0.28295852710705355,
"learning_rate": 8.635912337416962e-07,
"loss": 0.599794864654541,
"num_tokens": 155894044.0,
"step": 173
},
{
"epoch": 1.2985074626865671,
"grad_norm": 0.2842356600778382,
"learning_rate": 8.618999107615693e-07,
"loss": 0.5917081832885742,
"num_tokens": 156891753.0,
"step": 174
},
{
"epoch": 1.3059701492537314,
"grad_norm": 0.6073059901583661,
"learning_rate": 8.602000653196483e-07,
"loss": 0.5762104988098145,
"num_tokens": 157781264.0,
"step": 175
},
{
"epoch": 1.3134328358208955,
"grad_norm": 0.38500166455619816,
"learning_rate": 8.58491743863102e-07,
"loss": 0.5871777534484863,
"num_tokens": 158729371.0,
"step": 176
},
{
"epoch": 1.3208955223880596,
"grad_norm": 0.27823289078417784,
"learning_rate": 8.567749930707011e-07,
"loss": 0.605941653251648,
"num_tokens": 159723929.0,
"step": 177
},
{
"epoch": 1.328358208955224,
"grad_norm": 0.35213890439138806,
"learning_rate": 8.55049859851542e-07,
"loss": 0.6232315301895142,
"num_tokens": 160655042.0,
"step": 178
},
{
"epoch": 1.335820895522388,
"grad_norm": 0.3835692532749333,
"learning_rate": 8.533163913437657e-07,
"loss": 0.5706331729888916,
"num_tokens": 161689806.0,
"step": 179
},
{
"epoch": 1.3432835820895521,
"grad_norm": 0.30950866157021506,
"learning_rate": 8.515746349132691e-07,
"loss": 0.5692603588104248,
"num_tokens": 162672971.0,
"step": 180
},
{
"epoch": 1.3507462686567164,
"grad_norm": 0.2670593807735062,
"learning_rate": 8.498246381524123e-07,
"loss": 0.55814528465271,
"num_tokens": 163606727.0,
"step": 181
},
{
"epoch": 1.3582089552238805,
"grad_norm": 0.30949695351670486,
"learning_rate": 8.480664488787156e-07,
"loss": 0.5762124061584473,
"num_tokens": 164379724.0,
"step": 182
},
{
"epoch": 1.3656716417910448,
"grad_norm": 0.5924544887792298,
"learning_rate": 8.463001151335554e-07,
"loss": 0.588869035243988,
"num_tokens": 165282114.0,
"step": 183
},
{
"epoch": 1.373134328358209,
"grad_norm": 0.28762285578208174,
"learning_rate": 8.445256851808503e-07,
"loss": 0.5752467513084412,
"num_tokens": 166184652.0,
"step": 184
},
{
"epoch": 1.3805970149253732,
"grad_norm": 0.3298383363971737,
"learning_rate": 8.427432075057421e-07,
"loss": 0.5592284798622131,
"num_tokens": 167131883.0,
"step": 185
},
{
"epoch": 1.3880597014925373,
"grad_norm": 0.29907045507102953,
"learning_rate": 8.409527308132717e-07,
"loss": 0.6292506456375122,
"num_tokens": 168105786.0,
"step": 186
},
{
"epoch": 1.3955223880597014,
"grad_norm": 0.5372519710802263,
"learning_rate": 8.391543040270477e-07,
"loss": 0.5994750261306763,
"num_tokens": 168981965.0,
"step": 187
},
{
"epoch": 1.4029850746268657,
"grad_norm": 0.3108497777882688,
"learning_rate": 8.373479762879102e-07,
"loss": 0.5894546508789062,
"num_tokens": 169813930.0,
"step": 188
},
{
"epoch": 1.4104477611940298,
"grad_norm": 0.263215877705637,
"learning_rate": 8.355337969525874e-07,
"loss": 0.5457190871238708,
"num_tokens": 170803921.0,
"step": 189
},
{
"epoch": 1.417910447761194,
"grad_norm": 0.3065822581882927,
"learning_rate": 8.337118155923472e-07,
"loss": 0.5782487988471985,
"num_tokens": 171568584.0,
"step": 190
},
{
"epoch": 1.4253731343283582,
"grad_norm": 0.33076776216010273,
"learning_rate": 8.318820819916432e-07,
"loss": 0.5753518342971802,
"num_tokens": 172302686.0,
"step": 191
},
{
"epoch": 1.4328358208955223,
"grad_norm": 0.28171276781641486,
"learning_rate": 8.300446461467532e-07,
"loss": 0.6102815270423889,
"num_tokens": 173251435.0,
"step": 192
},
{
"epoch": 1.4402985074626866,
"grad_norm": 0.2941656823177544,
"learning_rate": 8.281995582644144e-07,
"loss": 0.5915931463241577,
"num_tokens": 174154926.0,
"step": 193
},
{
"epoch": 1.4477611940298507,
"grad_norm": 0.3065460235572143,
"learning_rate": 8.263468687604508e-07,
"loss": 0.6099899411201477,
"num_tokens": 174968736.0,
"step": 194
},
{
"epoch": 1.455223880597015,
"grad_norm": 0.27914137530301547,
"learning_rate": 8.244866282583955e-07,
"loss": 0.6181570291519165,
"num_tokens": 175993671.0,
"step": 195
},
{
"epoch": 1.462686567164179,
"grad_norm": 0.3083419332308097,
"learning_rate": 8.226188875881081e-07,
"loss": 0.5710784196853638,
"num_tokens": 176965410.0,
"step": 196
},
{
"epoch": 1.4701492537313432,
"grad_norm": 0.5452138442343578,
"learning_rate": 8.20743697784385e-07,
"loss": 0.6409458518028259,
"num_tokens": 177894815.0,
"step": 197
},
{
"epoch": 1.4776119402985075,
"grad_norm": 0.3312904222346608,
"learning_rate": 8.188611100855654e-07,
"loss": 0.5432331562042236,
"num_tokens": 178840660.0,
"step": 198
},
{
"epoch": 1.4850746268656716,
"grad_norm": 0.2774128783983067,
"learning_rate": 8.169711759321317e-07,
"loss": 0.5913591384887695,
"num_tokens": 179786009.0,
"step": 199
},
{
"epoch": 1.4925373134328357,
"grad_norm": 0.27538698518702026,
"learning_rate": 8.150739469653026e-07,
"loss": 0.5327359437942505,
"num_tokens": 180680467.0,
"step": 200
},
{
"epoch": 1.5,
"grad_norm": 0.2839849649378776,
"learning_rate": 8.131694750256233e-07,
"loss": 0.59873366355896,
"num_tokens": 181598316.0,
"step": 201
},
{
"epoch": 1.5074626865671643,
"grad_norm": 0.3083583005906764,
"learning_rate": 8.112578121515484e-07,
"loss": 0.5907875299453735,
"num_tokens": 182458909.0,
"step": 202
},
{
"epoch": 1.5149253731343284,
"grad_norm": 0.29048547641188055,
"learning_rate": 8.0933901057802e-07,
"loss": 0.581605076789856,
"num_tokens": 183258199.0,
"step": 203
},
{
"epoch": 1.5223880597014925,
"grad_norm": 0.33631459084645754,
"learning_rate": 8.074131227350408e-07,
"loss": 0.5947036743164062,
"num_tokens": 184223376.0,
"step": 204
},
{
"epoch": 1.5298507462686568,
"grad_norm": 0.9803280537963448,
"learning_rate": 8.054802012462409e-07,
"loss": 0.6088910102844238,
"num_tokens": 185129043.0,
"step": 205
},
{
"epoch": 1.537313432835821,
"grad_norm": 0.35614179487581604,
"learning_rate": 8.035402989274402e-07,
"loss": 0.595119833946228,
"num_tokens": 186025421.0,
"step": 206
},
{
"epoch": 1.544776119402985,
"grad_norm": 0.28518439705384824,
"learning_rate": 8.015934687852052e-07,
"loss": 0.5574674606323242,
"num_tokens": 186963319.0,
"step": 207
},
{
"epoch": 1.5522388059701493,
"grad_norm": 0.29270174201680466,
"learning_rate": 7.99639764015401e-07,
"loss": 0.6108373999595642,
"num_tokens": 187973354.0,
"step": 208
},
{
"epoch": 1.5597014925373134,
"grad_norm": 0.3323311285244367,
"learning_rate": 7.976792380017372e-07,
"loss": 0.5401036143302917,
"num_tokens": 188705328.0,
"step": 209
},
{
"epoch": 1.5671641791044775,
"grad_norm": 0.3456603920554097,
"learning_rate": 7.957119443143093e-07,
"loss": 0.6063162088394165,
"num_tokens": 189538934.0,
"step": 210
},
{
"epoch": 1.5746268656716418,
"grad_norm": 0.3082330500125609,
"learning_rate": 7.937379367081354e-07,
"loss": 0.5718963146209717,
"num_tokens": 190336903.0,
"step": 211
},
{
"epoch": 1.582089552238806,
"grad_norm": 0.30744552461466196,
"learning_rate": 7.917572691216866e-07,
"loss": 0.62088942527771,
"num_tokens": 191168843.0,
"step": 212
},
{
"epoch": 1.5895522388059702,
"grad_norm": 0.4103702199848832,
"learning_rate": 7.897699956754142e-07,
"loss": 0.5833892822265625,
"num_tokens": 192111363.0,
"step": 213
},
{
"epoch": 1.5970149253731343,
"grad_norm": 0.2908559364406994,
"learning_rate": 7.877761706702697e-07,
"loss": 0.5971975922584534,
"num_tokens": 193103746.0,
"step": 214
},
{
"epoch": 1.6044776119402986,
"grad_norm": 0.3328614806641497,
"learning_rate": 7.857758485862219e-07,
"loss": 0.5907278656959534,
"num_tokens": 193981563.0,
"step": 215
},
{
"epoch": 1.6119402985074627,
"grad_norm": 0.37832890065438735,
"learning_rate": 7.837690840807686e-07,
"loss": 0.6060609817504883,
"num_tokens": 194834592.0,
"step": 216
},
{
"epoch": 1.6194029850746268,
"grad_norm": 0.2677852995292723,
"learning_rate": 7.817559319874417e-07,
"loss": 0.5535368323326111,
"num_tokens": 195762991.0,
"step": 217
},
{
"epoch": 1.626865671641791,
"grad_norm": 0.2741102900904772,
"learning_rate": 7.797364473143103e-07,
"loss": 0.5808444619178772,
"num_tokens": 196736861.0,
"step": 218
},
{
"epoch": 1.6343283582089554,
"grad_norm": 0.2912625129099285,
"learning_rate": 7.777106852424768e-07,
"loss": 0.5897442102432251,
"num_tokens": 197626724.0,
"step": 219
},
{
"epoch": 1.6417910447761193,
"grad_norm": 0.591147632193599,
"learning_rate": 7.756787011245699e-07,
"loss": 0.6097444295883179,
"num_tokens": 198637589.0,
"step": 220
},
{
"epoch": 1.6492537313432836,
"grad_norm": 0.3296433282919186,
"learning_rate": 7.736405504832313e-07,
"loss": 0.6026604175567627,
"num_tokens": 199563255.0,
"step": 221
},
{
"epoch": 1.6567164179104479,
"grad_norm": 0.27997593841499196,
"learning_rate": 7.715962890095987e-07,
"loss": 0.5822043418884277,
"num_tokens": 200461303.0,
"step": 222
},
{
"epoch": 1.664179104477612,
"grad_norm": 0.3067564437403442,
"learning_rate": 7.69545972561785e-07,
"loss": 0.6166250705718994,
"num_tokens": 201369977.0,
"step": 223
},
{
"epoch": 1.671641791044776,
"grad_norm": 0.2688906931144112,
"learning_rate": 7.674896571633506e-07,
"loss": 0.5445988178253174,
"num_tokens": 202278503.0,
"step": 224
},
{
"epoch": 1.6791044776119404,
"grad_norm": 0.2691568834958204,
"learning_rate": 7.65427399001774e-07,
"loss": 0.5422626733779907,
"num_tokens": 203242720.0,
"step": 225
},
{
"epoch": 1.6865671641791045,
"grad_norm": 0.2906038986233365,
"learning_rate": 7.633592544269152e-07,
"loss": 0.5783904790878296,
"num_tokens": 204150301.0,
"step": 226
},
{
"epoch": 1.6940298507462686,
"grad_norm": 0.2887554770352252,
"learning_rate": 7.612852799494769e-07,
"loss": 0.588298499584198,
"num_tokens": 205093558.0,
"step": 227
},
{
"epoch": 1.7014925373134329,
"grad_norm": 0.2993654753244292,
"learning_rate": 7.592055322394602e-07,
"loss": 0.6025734543800354,
"num_tokens": 205970210.0,
"step": 228
},
{
"epoch": 1.7089552238805972,
"grad_norm": 0.284435640176723,
"learning_rate": 7.571200681246158e-07,
"loss": 0.6054296493530273,
"num_tokens": 206859291.0,
"step": 229
},
{
"epoch": 1.716417910447761,
"grad_norm": 0.3195285881506451,
"learning_rate": 7.550289445888914e-07,
"loss": 0.5874844789505005,
"num_tokens": 207786446.0,
"step": 230
},
{
"epoch": 1.7238805970149254,
"grad_norm": 0.28553073165686976,
"learning_rate": 7.529322187708751e-07,
"loss": 0.6177946925163269,
"num_tokens": 208698287.0,
"step": 231
},
{
"epoch": 1.7313432835820897,
"grad_norm": 0.49617871796694113,
"learning_rate": 7.508299479622334e-07,
"loss": 0.5590040683746338,
"num_tokens": 209548343.0,
"step": 232
},
{
"epoch": 1.7388059701492538,
"grad_norm": 0.28377509079902785,
"learning_rate": 7.487221896061457e-07,
"loss": 0.5984382629394531,
"num_tokens": 210468969.0,
"step": 233
},
{
"epoch": 1.7462686567164178,
"grad_norm": 0.27598753591931857,
"learning_rate": 7.46609001295736e-07,
"loss": 0.614782452583313,
"num_tokens": 211457470.0,
"step": 234
},
{
"epoch": 1.7537313432835822,
"grad_norm": 0.3112684553028677,
"learning_rate": 7.444904407724972e-07,
"loss": 0.5674484372138977,
"num_tokens": 212347451.0,
"step": 235
},
{
"epoch": 1.7611940298507462,
"grad_norm": 0.6461909553210673,
"learning_rate": 7.423665659247152e-07,
"loss": 0.5716361999511719,
"num_tokens": 213190706.0,
"step": 236
},
{
"epoch": 1.7686567164179103,
"grad_norm": 0.28012296588796226,
"learning_rate": 7.40237434785886e-07,
"loss": 0.5874301195144653,
"num_tokens": 214169074.0,
"step": 237
},
{
"epoch": 1.7761194029850746,
"grad_norm": 0.279647753002793,
"learning_rate": 7.381031055331305e-07,
"loss": 0.6019556522369385,
"num_tokens": 215188427.0,
"step": 238
},
{
"epoch": 1.783582089552239,
"grad_norm": 0.39177638463203673,
"learning_rate": 7.359636364856043e-07,
"loss": 0.6031475067138672,
"num_tokens": 216080767.0,
"step": 239
},
{
"epoch": 1.7910447761194028,
"grad_norm": 0.3360890439886117,
"learning_rate": 7.338190861029051e-07,
"loss": 0.5989038944244385,
"num_tokens": 216995394.0,
"step": 240
},
{
"epoch": 1.7985074626865671,
"grad_norm": 0.2739266991645573,
"learning_rate": 7.316695129834744e-07,
"loss": 0.5613197684288025,
"num_tokens": 217866020.0,
"step": 241
},
{
"epoch": 1.8059701492537314,
"grad_norm": 0.5120657338819569,
"learning_rate": 7.295149758629966e-07,
"loss": 0.5808136463165283,
"num_tokens": 218680341.0,
"step": 242
},
{
"epoch": 1.8134328358208955,
"grad_norm": 0.294200195693698,
"learning_rate": 7.273555336127946e-07,
"loss": 0.5945237874984741,
"num_tokens": 219550948.0,
"step": 243
},
{
"epoch": 1.8208955223880596,
"grad_norm": 0.2909609394380647,
"learning_rate": 7.251912452382205e-07,
"loss": 0.5686002373695374,
"num_tokens": 220517125.0,
"step": 244
},
{
"epoch": 1.828358208955224,
"grad_norm": 0.29553389453752155,
"learning_rate": 7.230221698770439e-07,
"loss": 0.5637418031692505,
"num_tokens": 221365026.0,
"step": 245
},
{
"epoch": 1.835820895522388,
"grad_norm": 0.331158945273446,
"learning_rate": 7.20848366797835e-07,
"loss": 0.5235867500305176,
"num_tokens": 222233736.0,
"step": 246
},
{
"epoch": 1.8432835820895521,
"grad_norm": 0.29651149137672,
"learning_rate": 7.186698953983465e-07,
"loss": 0.6124955415725708,
"num_tokens": 223222809.0,
"step": 247
},
{
"epoch": 1.8507462686567164,
"grad_norm": 0.2967229855382439,
"learning_rate": 7.164868152038898e-07,
"loss": 0.5900925993919373,
"num_tokens": 224116326.0,
"step": 248
},
{
"epoch": 1.8582089552238807,
"grad_norm": 0.2770420932471773,
"learning_rate": 7.14299185865708e-07,
"loss": 0.5970636606216431,
"num_tokens": 991360.0,
"step": 249
},
{
"epoch": 1.8656716417910446,
"grad_norm": 0.2858538902789167,
"learning_rate": 7.121070671593477e-07,
"loss": 0.5819560289382935,
"num_tokens": 2010167.0,
"step": 250
},
{
"epoch": 1.873134328358209,
"grad_norm": 0.2972326089151943,
"learning_rate": 7.099105189830235e-07,
"loss": 0.5888773202896118,
"num_tokens": 2885939.0,
"step": 251
},
{
"epoch": 1.8805970149253732,
"grad_norm": 0.3057535952426567,
"learning_rate": 7.07709601355983e-07,
"loss": 0.5811155438423157,
"num_tokens": 3762868.0,
"step": 252
},
{
"epoch": 1.8880597014925373,
"grad_norm": 0.31258132271786665,
"learning_rate": 7.055043744168657e-07,
"loss": 0.6176049709320068,
"num_tokens": 4733514.0,
"step": 253
},
{
"epoch": 1.8955223880597014,
"grad_norm": 0.3030852287619505,
"learning_rate": 7.03294898422061e-07,
"loss": 0.5565370917320251,
"num_tokens": 5639515.0,
"step": 254
},
{
"epoch": 1.9029850746268657,
"grad_norm": 0.3012477759016326,
"learning_rate": 7.010812337440604e-07,
"loss": 0.568949818611145,
"num_tokens": 6415052.0,
"step": 255
},
{
"epoch": 1.9104477611940298,
"grad_norm": 0.27565638559926237,
"learning_rate": 6.988634408698082e-07,
"loss": 0.5407424569129944,
"num_tokens": 7346190.0,
"step": 256
},
{
"epoch": 1.917910447761194,
"grad_norm": 0.32156589204612385,
"learning_rate": 6.9664158039905e-07,
"loss": 0.609969973564148,
"num_tokens": 8239599.0,
"step": 257
},
{
"epoch": 1.9253731343283582,
"grad_norm": 0.3188048041018452,
"learning_rate": 6.944157130426745e-07,
"loss": 0.5987858176231384,
"num_tokens": 9077707.0,
"step": 258
},
{
"epoch": 1.9328358208955225,
"grad_norm": 0.2961821437519157,
"learning_rate": 6.921858996210568e-07,
"loss": 0.568209171295166,
"num_tokens": 9982372.0,
"step": 259
},
{
"epoch": 1.9402985074626866,
"grad_norm": 0.27392736616109464,
"learning_rate": 6.899522010623958e-07,
"loss": 0.5922641158103943,
"num_tokens": 11023445.0,
"step": 260
},
{
"epoch": 1.9477611940298507,
"grad_norm": 0.2933002440532853,
"learning_rate": 6.877146784010486e-07,
"loss": 0.5736743211746216,
"num_tokens": 11857709.0,
"step": 261
},
{
"epoch": 1.955223880597015,
"grad_norm": 0.27846912836784765,
"learning_rate": 6.854733927758636e-07,
"loss": 0.5894352197647095,
"num_tokens": 12766496.0,
"step": 262
},
{
"epoch": 1.962686567164179,
"grad_norm": 0.27777120339996386,
"learning_rate": 6.8322840542851e-07,
"loss": 0.601696789264679,
"num_tokens": 13767472.0,
"step": 263
},
{
"epoch": 1.9701492537313432,
"grad_norm": 0.46660244180149263,
"learning_rate": 6.80979777701804e-07,
"loss": 0.5974367260932922,
"num_tokens": 14594712.0,
"step": 264
},
{
"epoch": 1.9776119402985075,
"grad_norm": 0.2956712799688728,
"learning_rate": 6.787275710380329e-07,
"loss": 0.5965464115142822,
"num_tokens": 15486445.0,
"step": 265
},
{
"epoch": 1.9850746268656716,
"grad_norm": 0.31107227320954434,
"learning_rate": 6.764718469772757e-07,
"loss": 0.576676607131958,
"num_tokens": 16227990.0,
"step": 266
},
{
"epoch": 1.9925373134328357,
"grad_norm": 0.28037710557720436,
"learning_rate": 6.742126671557227e-07,
"loss": 0.556594729423523,
"num_tokens": 17105978.0,
"step": 267
},
{
"epoch": 2.0,
"grad_norm": 0.31658535211949906,
"learning_rate": 6.719500933039897e-07,
"loss": 0.5741510391235352,
"num_tokens": 18011768.0,
"step": 268
},
{
"epoch": 2.0074626865671643,
"grad_norm": 0.2959704877731208,
"learning_rate": 6.69684187245433e-07,
"loss": 0.596227765083313,
"num_tokens": 18916003.0,
"step": 269
},
{
"epoch": 2.014925373134328,
"grad_norm": 0.29107376493477805,
"learning_rate": 6.674150108944592e-07,
"loss": 0.5445001125335693,
"num_tokens": 19684628.0,
"step": 270
},
{
"epoch": 2.0223880597014925,
"grad_norm": 0.2777883261006439,
"learning_rate": 6.651426262548325e-07,
"loss": 0.5889461636543274,
"num_tokens": 20690086.0,
"step": 271
},
{
"epoch": 2.029850746268657,
"grad_norm": 0.2701961134075662,
"learning_rate": 6.628670954179829e-07,
"loss": 0.5695216655731201,
"num_tokens": 21686072.0,
"step": 272
},
{
"epoch": 2.0373134328358207,
"grad_norm": 0.26760015658938013,
"learning_rate": 6.605884805613072e-07,
"loss": 0.5295987129211426,
"num_tokens": 22622971.0,
"step": 273
},
{
"epoch": 2.044776119402985,
"grad_norm": 0.323703653894069,
"learning_rate": 6.583068439464715e-07,
"loss": 0.5844870209693909,
"num_tokens": 23496905.0,
"step": 274
},
{
"epoch": 2.0522388059701493,
"grad_norm": 0.30828913191565205,
"learning_rate": 6.560222479177094e-07,
"loss": 0.5690542459487915,
"num_tokens": 24365149.0,
"step": 275
},
{
"epoch": 2.0597014925373136,
"grad_norm": 0.3084417281876953,
"learning_rate": 6.537347549001184e-07,
"loss": 0.5742576122283936,
"num_tokens": 25184612.0,
"step": 276
},
{
"epoch": 2.0671641791044775,
"grad_norm": 0.3080393843745802,
"learning_rate": 6.514444273979543e-07,
"loss": 0.5722700357437134,
"num_tokens": 26054937.0,
"step": 277
},
{
"epoch": 2.074626865671642,
"grad_norm": 0.26775870499252885,
"learning_rate": 6.491513279929237e-07,
"loss": 0.5365396738052368,
"num_tokens": 26954789.0,
"step": 278
},
{
"epoch": 2.082089552238806,
"grad_norm": 0.35802085892606694,
"learning_rate": 6.468555193424735e-07,
"loss": 0.5596331357955933,
"num_tokens": 27845072.0,
"step": 279
},
{
"epoch": 2.08955223880597,
"grad_norm": 0.2719625157722236,
"learning_rate": 6.445570641780786e-07,
"loss": 0.5419675707817078,
"num_tokens": 28658754.0,
"step": 280
},
{
"epoch": 2.0970149253731343,
"grad_norm": 0.27709779905677195,
"learning_rate": 6.422560253035287e-07,
"loss": 0.5775716304779053,
"num_tokens": 29562576.0,
"step": 281
},
{
"epoch": 2.1044776119402986,
"grad_norm": 0.3348867650726701,
"learning_rate": 6.39952465593211e-07,
"loss": 0.585283637046814,
"num_tokens": 30431256.0,
"step": 282
},
{
"epoch": 2.111940298507463,
"grad_norm": 0.29694286893571703,
"learning_rate": 6.376464479903937e-07,
"loss": 0.5197538733482361,
"num_tokens": 31183415.0,
"step": 283
},
{
"epoch": 2.1194029850746268,
"grad_norm": 0.29840728825899304,
"learning_rate": 6.35338035505505e-07,
"loss": 0.5599273443222046,
"num_tokens": 32067616.0,
"step": 284
},
{
"epoch": 2.126865671641791,
"grad_norm": 0.33678060078261246,
"learning_rate": 6.330272912144116e-07,
"loss": 0.6192691326141357,
"num_tokens": 32981757.0,
"step": 285
},
{
"epoch": 2.1343283582089554,
"grad_norm": 0.2824243012069984,
"learning_rate": 6.307142782566951e-07,
"loss": 0.5863723754882812,
"num_tokens": 34022251.0,
"step": 286
},
{
"epoch": 2.1417910447761193,
"grad_norm": 0.299007428336042,
"learning_rate": 6.283990598339274e-07,
"loss": 0.5666537284851074,
"num_tokens": 34771710.0,
"step": 287
},
{
"epoch": 2.1492537313432836,
"grad_norm": 0.32004542775898487,
"learning_rate": 6.260816992079431e-07,
"loss": 0.5231757760047913,
"num_tokens": 35650183.0,
"step": 288
},
{
"epoch": 2.156716417910448,
"grad_norm": 0.3045721000640588,
"learning_rate": 6.237622596991106e-07,
"loss": 0.5760424137115479,
"num_tokens": 36493771.0,
"step": 289
},
{
"epoch": 2.1641791044776117,
"grad_norm": 0.2937692676912212,
"learning_rate": 6.214408046846034e-07,
"loss": 0.568109393119812,
"num_tokens": 37330886.0,
"step": 290
},
{
"epoch": 2.171641791044776,
"grad_norm": 0.27011730349579827,
"learning_rate": 6.191173975966668e-07,
"loss": 0.5667808055877686,
"num_tokens": 38365287.0,
"step": 291
},
{
"epoch": 2.1791044776119404,
"grad_norm": 0.28692265505240294,
"learning_rate": 6.16792101920885e-07,
"loss": 0.6112924814224243,
"num_tokens": 39420111.0,
"step": 292
},
{
"epoch": 2.1865671641791047,
"grad_norm": 0.28526825922243654,
"learning_rate": 6.144649811944473e-07,
"loss": 0.5639245510101318,
"num_tokens": 40263636.0,
"step": 293
},
{
"epoch": 2.1940298507462686,
"grad_norm": 0.2908552938644807,
"learning_rate": 6.121360990044106e-07,
"loss": 0.5848294496536255,
"num_tokens": 41080304.0,
"step": 294
},
{
"epoch": 2.201492537313433,
"grad_norm": 0.2821005110679085,
"learning_rate": 6.098055189859634e-07,
"loss": 0.5666854381561279,
"num_tokens": 42076069.0,
"step": 295
},
{
"epoch": 2.208955223880597,
"grad_norm": 0.2609076970175634,
"learning_rate": 6.074733048206852e-07,
"loss": 0.5690361857414246,
"num_tokens": 43082365.0,
"step": 296
},
{
"epoch": 2.216417910447761,
"grad_norm": 0.2836464990705104,
"learning_rate": 6.051395202348089e-07,
"loss": 0.5679644346237183,
"num_tokens": 44012700.0,
"step": 297
},
{
"epoch": 2.2238805970149254,
"grad_norm": 0.25600506754304947,
"learning_rate": 6.028042289974768e-07,
"loss": 0.5512281656265259,
"num_tokens": 44917778.0,
"step": 298
},
{
"epoch": 2.2313432835820897,
"grad_norm": 0.3257772292478769,
"learning_rate": 6.004674949190003e-07,
"loss": 0.5415934324264526,
"num_tokens": 45740145.0,
"step": 299
},
{
"epoch": 2.2388059701492535,
"grad_norm": 0.3286166291890802,
"learning_rate": 5.981293818491152e-07,
"loss": 0.5995659828186035,
"num_tokens": 46620715.0,
"step": 300
},
{
"epoch": 2.246268656716418,
"grad_norm": 0.2894099742683797,
"learning_rate": 5.957899536752373e-07,
"loss": 0.608267605304718,
"num_tokens": 47539124.0,
"step": 301
},
{
"epoch": 2.253731343283582,
"grad_norm": 0.2895508079853879,
"learning_rate": 5.934492743207168e-07,
"loss": 0.5291934013366699,
"num_tokens": 48336408.0,
"step": 302
},
{
"epoch": 2.2611940298507465,
"grad_norm": 0.3174861796364666,
"learning_rate": 5.911074077430916e-07,
"loss": 0.5688158273696899,
"num_tokens": 49205406.0,
"step": 303
},
{
"epoch": 2.2686567164179103,
"grad_norm": 0.3232273733079749,
"learning_rate": 5.887644179323403e-07,
"loss": 0.5540226697921753,
"num_tokens": 50043421.0,
"step": 304
},
{
"epoch": 2.2761194029850746,
"grad_norm": 0.28558608567310023,
"learning_rate": 5.864203689091315e-07,
"loss": 0.5832343697547913,
"num_tokens": 50915233.0,
"step": 305
},
{
"epoch": 2.283582089552239,
"grad_norm": 0.3069099454369789,
"learning_rate": 5.84075324723078e-07,
"loss": 0.5831292867660522,
"num_tokens": 51814606.0,
"step": 306
},
{
"epoch": 2.291044776119403,
"grad_norm": 0.2697329326673818,
"learning_rate": 5.817293494509836e-07,
"loss": 0.5265708565711975,
"num_tokens": 52624758.0,
"step": 307
},
{
"epoch": 2.298507462686567,
"grad_norm": 0.27527418320187297,
"learning_rate": 5.793825071950935e-07,
"loss": 0.5518659353256226,
"num_tokens": 53591262.0,
"step": 308
},
{
"epoch": 2.3059701492537314,
"grad_norm": 0.31199334247822685,
"learning_rate": 5.770348620813432e-07,
"loss": 0.5563576221466064,
"num_tokens": 54586868.0,
"step": 309
},
{
"epoch": 2.3134328358208958,
"grad_norm": 0.33111603191568395,
"learning_rate": 5.746864782576053e-07,
"loss": 0.5557553768157959,
"num_tokens": 55375213.0,
"step": 310
},
{
"epoch": 2.3208955223880596,
"grad_norm": 0.2955172821022238,
"learning_rate": 5.723374198919376e-07,
"loss": 0.5784043073654175,
"num_tokens": 56208304.0,
"step": 311
},
{
"epoch": 2.328358208955224,
"grad_norm": 0.27282274560683967,
"learning_rate": 5.699877511708284e-07,
"loss": 0.5383070111274719,
"num_tokens": 57191922.0,
"step": 312
},
{
"epoch": 2.3358208955223883,
"grad_norm": 0.258279345220606,
"learning_rate": 5.676375362974449e-07,
"loss": 0.5381882786750793,
"num_tokens": 58105389.0,
"step": 313
},
{
"epoch": 2.343283582089552,
"grad_norm": 0.33949214023855984,
"learning_rate": 5.652868394898766e-07,
"loss": 0.5437734723091125,
"num_tokens": 58961497.0,
"step": 314
},
{
"epoch": 2.3507462686567164,
"grad_norm": 0.2833579384746108,
"learning_rate": 5.629357249793816e-07,
"loss": 0.592788815498352,
"num_tokens": 59947795.0,
"step": 315
},
{
"epoch": 2.3582089552238807,
"grad_norm": 0.30062364724661333,
"learning_rate": 5.605842570086319e-07,
"loss": 0.5617958307266235,
"num_tokens": 60964098.0,
"step": 316
},
{
"epoch": 2.3656716417910446,
"grad_norm": 0.2800076043555411,
"learning_rate": 5.582324998299572e-07,
"loss": 0.5720120072364807,
"num_tokens": 61889873.0,
"step": 317
},
{
"epoch": 2.373134328358209,
"grad_norm": 0.27101523720971354,
"learning_rate": 5.558805177035901e-07,
"loss": 0.5741963386535645,
"num_tokens": 62849188.0,
"step": 318
},
{
"epoch": 2.3805970149253732,
"grad_norm": 0.285471903870457,
"learning_rate": 5.53528374895909e-07,
"loss": 0.5414842963218689,
"num_tokens": 63726113.0,
"step": 319
},
{
"epoch": 2.388059701492537,
"grad_norm": 0.27878281549210715,
"learning_rate": 5.511761356776833e-07,
"loss": 0.5728551745414734,
"num_tokens": 64647597.0,
"step": 320
},
{
"epoch": 2.3955223880597014,
"grad_norm": 0.28622078113655536,
"learning_rate": 5.488238643223167e-07,
"loss": 0.5815838575363159,
"num_tokens": 65508928.0,
"step": 321
},
{
"epoch": 2.4029850746268657,
"grad_norm": 0.27899596544094324,
"learning_rate": 5.464716251040911e-07,
"loss": 0.5664654970169067,
"num_tokens": 66358099.0,
"step": 322
},
{
"epoch": 2.41044776119403,
"grad_norm": 0.2825063898870399,
"learning_rate": 5.441194822964099e-07,
"loss": 0.5764633417129517,
"num_tokens": 67219625.0,
"step": 323
},
{
"epoch": 2.417910447761194,
"grad_norm": 0.2889902676953764,
"learning_rate": 5.417675001700427e-07,
"loss": 0.5656483173370361,
"num_tokens": 68141332.0,
"step": 324
},
{
"epoch": 2.425373134328358,
"grad_norm": 0.27885504864685967,
"learning_rate": 5.39415742991368e-07,
"loss": 0.6192145943641663,
"num_tokens": 69046407.0,
"step": 325
},
{
"epoch": 2.4328358208955225,
"grad_norm": 0.286670961202952,
"learning_rate": 5.370642750206184e-07,
"loss": 0.6090319156646729,
"num_tokens": 70083093.0,
"step": 326
},
{
"epoch": 2.4402985074626864,
"grad_norm": 0.275072660826794,
"learning_rate": 5.347131605101236e-07,
"loss": 0.6045145988464355,
"num_tokens": 71047395.0,
"step": 327
},
{
"epoch": 2.4477611940298507,
"grad_norm": 0.2864324709863456,
"learning_rate": 5.323624637025551e-07,
"loss": 0.5572278499603271,
"num_tokens": 71932159.0,
"step": 328
},
{
"epoch": 2.455223880597015,
"grad_norm": 0.2973709054279998,
"learning_rate": 5.300122488291716e-07,
"loss": 0.5611422061920166,
"num_tokens": 72789371.0,
"step": 329
},
{
"epoch": 2.4626865671641793,
"grad_norm": 0.3318220155418688,
"learning_rate": 5.276625801080625e-07,
"loss": 0.5865360498428345,
"num_tokens": 73721478.0,
"step": 330
},
{
"epoch": 2.470149253731343,
"grad_norm": 0.3436341281789925,
"learning_rate": 5.253135217423947e-07,
"loss": 0.5705252885818481,
"num_tokens": 74706274.0,
"step": 331
},
{
"epoch": 2.4776119402985075,
"grad_norm": 0.37986006551326945,
"learning_rate": 5.229651379186569e-07,
"loss": 0.5907820463180542,
"num_tokens": 75647716.0,
"step": 332
},
{
"epoch": 2.485074626865672,
"grad_norm": 0.3122329879913656,
"learning_rate": 5.206174928049065e-07,
"loss": 0.5766445994377136,
"num_tokens": 76637809.0,
"step": 333
},
{
"epoch": 2.4925373134328357,
"grad_norm": 0.2998854461296799,
"learning_rate": 5.182706505490165e-07,
"loss": 0.5649234652519226,
"num_tokens": 77528162.0,
"step": 334
},
{
"epoch": 2.5,
"grad_norm": 0.25745394101844005,
"learning_rate": 5.15924675276922e-07,
"loss": 0.5354350805282593,
"num_tokens": 78421720.0,
"step": 335
},
{
"epoch": 2.5074626865671643,
"grad_norm": 0.2829213028380422,
"learning_rate": 5.135796310908685e-07,
"loss": 0.5751874446868896,
"num_tokens": 79217572.0,
"step": 336
},
{
"epoch": 2.5149253731343286,
"grad_norm": 0.3139108030566433,
"learning_rate": 5.112355820676599e-07,
"loss": 0.5624819993972778,
"num_tokens": 80184157.0,
"step": 337
},
{
"epoch": 2.5223880597014925,
"grad_norm": 0.28913655087516726,
"learning_rate": 5.088925922569083e-07,
"loss": 0.568986713886261,
"num_tokens": 81059812.0,
"step": 338
},
{
"epoch": 2.529850746268657,
"grad_norm": 0.2960772237491209,
"learning_rate": 5.065507256792833e-07,
"loss": 0.5797086954116821,
"num_tokens": 81975922.0,
"step": 339
},
{
"epoch": 2.5373134328358207,
"grad_norm": 0.2882326553054164,
"learning_rate": 5.042100463247629e-07,
"loss": 0.5706868171691895,
"num_tokens": 82823460.0,
"step": 340
},
{
"epoch": 2.544776119402985,
"grad_norm": 0.307599479684034,
"learning_rate": 5.018706181508851e-07,
"loss": 0.5756710767745972,
"num_tokens": 83559785.0,
"step": 341
},
{
"epoch": 2.5522388059701493,
"grad_norm": 0.308017074889723,
"learning_rate": 4.995325050809999e-07,
"loss": 0.6031478643417358,
"num_tokens": 84441431.0,
"step": 342
},
{
"epoch": 2.5597014925373136,
"grad_norm": 0.29529571185477965,
"learning_rate": 4.971957710025234e-07,
"loss": 0.5946158170700073,
"num_tokens": 85349485.0,
"step": 343
},
{
"epoch": 2.5671641791044775,
"grad_norm": 0.2819610880293821,
"learning_rate": 4.948604797651913e-07,
"loss": 0.5992064476013184,
"num_tokens": 86267065.0,
"step": 344
},
{
"epoch": 2.574626865671642,
"grad_norm": 0.2972660099307388,
"learning_rate": 4.925266951793149e-07,
"loss": 0.573174774646759,
"num_tokens": 87077996.0,
"step": 345
},
{
"epoch": 2.582089552238806,
"grad_norm": 0.301461452019035,
"learning_rate": 4.901944810140369e-07,
"loss": 0.589251697063446,
"num_tokens": 88180031.0,
"step": 346
},
{
"epoch": 2.58955223880597,
"grad_norm": 0.2904479295025236,
"learning_rate": 4.878639009955895e-07,
"loss": 0.54721999168396,
"num_tokens": 89096524.0,
"step": 347
},
{
"epoch": 2.5970149253731343,
"grad_norm": 0.2633320073378902,
"learning_rate": 4.855350188055528e-07,
"loss": 0.5418224334716797,
"num_tokens": 90020467.0,
"step": 348
},
{
"epoch": 2.6044776119402986,
"grad_norm": 0.27176928239419323,
"learning_rate": 4.83207898079115e-07,
"loss": 0.565537691116333,
"num_tokens": 90987416.0,
"step": 349
},
{
"epoch": 2.611940298507463,
"grad_norm": 0.32160604849841345,
"learning_rate": 4.808826024033334e-07,
"loss": 0.5598034262657166,
"num_tokens": 91795663.0,
"step": 350
},
{
"epoch": 2.6194029850746268,
"grad_norm": 0.30348956227704144,
"learning_rate": 4.785591953153966e-07,
"loss": 0.5576733946800232,
"num_tokens": 92702072.0,
"step": 351
},
{
"epoch": 2.626865671641791,
"grad_norm": 0.2744155218003863,
"learning_rate": 4.762377403008895e-07,
"loss": 0.5912754535675049,
"num_tokens": 93699627.0,
"step": 352
},
{
"epoch": 2.6343283582089554,
"grad_norm": 0.2666509179401252,
"learning_rate": 4.739183007920571e-07,
"loss": 0.5752925276756287,
"num_tokens": 94666002.0,
"step": 353
},
{
"epoch": 2.6417910447761193,
"grad_norm": 0.261062380795103,
"learning_rate": 4.7160094016607276e-07,
"loss": 0.5275688767433167,
"num_tokens": 95605433.0,
"step": 354
},
{
"epoch": 2.6492537313432836,
"grad_norm": 0.2811617365131969,
"learning_rate": 4.6928572174330495e-07,
"loss": 0.5722550749778748,
"num_tokens": 96516441.0,
"step": 355
},
{
"epoch": 2.656716417910448,
"grad_norm": 0.2735603112854696,
"learning_rate": 4.669727087855886e-07,
"loss": 0.5699527859687805,
"num_tokens": 97450967.0,
"step": 356
},
{
"epoch": 2.664179104477612,
"grad_norm": 0.3395844478312138,
"learning_rate": 4.6466196449449504e-07,
"loss": 0.5282535552978516,
"num_tokens": 98350106.0,
"step": 357
},
{
"epoch": 2.671641791044776,
"grad_norm": 0.2830556116676336,
"learning_rate": 4.6235355200960623e-07,
"loss": 0.5501178503036499,
"num_tokens": 99299833.0,
"step": 358
},
{
"epoch": 2.6791044776119404,
"grad_norm": 0.2720175319739195,
"learning_rate": 4.600475344067889e-07,
"loss": 0.5554410219192505,
"num_tokens": 100163789.0,
"step": 359
},
{
"epoch": 2.6865671641791042,
"grad_norm": 0.39231713406243224,
"learning_rate": 4.577439746964714e-07,
"loss": 0.5649659633636475,
"num_tokens": 101065769.0,
"step": 360
},
{
"epoch": 2.6940298507462686,
"grad_norm": 0.3840423060468296,
"learning_rate": 4.554429358219213e-07,
"loss": 0.5463579893112183,
"num_tokens": 102054742.0,
"step": 361
},
{
"epoch": 2.701492537313433,
"grad_norm": 0.2823797561004669,
"learning_rate": 4.531444806575265e-07,
"loss": 0.5806522369384766,
"num_tokens": 102999309.0,
"step": 362
},
{
"epoch": 2.708955223880597,
"grad_norm": 0.2847114226753591,
"learning_rate": 4.508486720070761e-07,
"loss": 0.5655279159545898,
"num_tokens": 103943807.0,
"step": 363
},
{
"epoch": 2.716417910447761,
"grad_norm": 0.28205090767545954,
"learning_rate": 4.4855557260204547e-07,
"loss": 0.5465511083602905,
"num_tokens": 104752259.0,
"step": 364
},
{
"epoch": 2.7238805970149254,
"grad_norm": 0.31707271181231406,
"learning_rate": 4.462652450998815e-07,
"loss": 0.56863933801651,
"num_tokens": 105560152.0,
"step": 365
},
{
"epoch": 2.7313432835820897,
"grad_norm": 0.28484578348583783,
"learning_rate": 4.439777520822905e-07,
"loss": 0.5578351020812988,
"num_tokens": 106370949.0,
"step": 366
},
{
"epoch": 2.7388059701492535,
"grad_norm": 0.2911877277567285,
"learning_rate": 4.416931560535284e-07,
"loss": 0.560371994972229,
"num_tokens": 107223604.0,
"step": 367
},
{
"epoch": 2.746268656716418,
"grad_norm": 0.27157022459261115,
"learning_rate": 4.394115194386927e-07,
"loss": 0.5460314750671387,
"num_tokens": 108240912.0,
"step": 368
},
{
"epoch": 2.753731343283582,
"grad_norm": 0.2866648017188484,
"learning_rate": 4.3713290458201714e-07,
"loss": 0.567893922328949,
"num_tokens": 109178166.0,
"step": 369
},
{
"epoch": 2.7611940298507465,
"grad_norm": 0.29527473129759935,
"learning_rate": 4.348573737451674e-07,
"loss": 0.6049559116363525,
"num_tokens": 109991427.0,
"step": 370
},
{
"epoch": 2.7686567164179103,
"grad_norm": 0.31366814462249815,
"learning_rate": 4.3258498910554085e-07,
"loss": 0.5512971878051758,
"num_tokens": 110872756.0,
"step": 371
},
{
"epoch": 2.7761194029850746,
"grad_norm": 0.3023387372899213,
"learning_rate": 4.3031581275456687e-07,
"loss": 0.5625054836273193,
"num_tokens": 111784756.0,
"step": 372
},
{
"epoch": 2.783582089552239,
"grad_norm": 0.3327888958164682,
"learning_rate": 4.2804990669601015e-07,
"loss": 0.559493899345398,
"num_tokens": 112842525.0,
"step": 373
},
{
"epoch": 2.791044776119403,
"grad_norm": 0.2864205118032827,
"learning_rate": 4.2578733284427735e-07,
"loss": 0.541454553604126,
"num_tokens": 113670083.0,
"step": 374
},
{
"epoch": 2.798507462686567,
"grad_norm": 0.2787810223726183,
"learning_rate": 4.2352815302272415e-07,
"loss": 0.5479576587677002,
"num_tokens": 114578655.0,
"step": 375
},
{
"epoch": 2.8059701492537314,
"grad_norm": 0.2870728358422289,
"learning_rate": 4.2127242896196715e-07,
"loss": 0.5296257138252258,
"num_tokens": 115385601.0,
"step": 376
},
{
"epoch": 2.8134328358208958,
"grad_norm": 0.2787997170860601,
"learning_rate": 4.190202222981959e-07,
"loss": 0.5955355763435364,
"num_tokens": 116307550.0,
"step": 377
},
{
"epoch": 2.8208955223880596,
"grad_norm": 0.26436536089418516,
"learning_rate": 4.1677159457149e-07,
"loss": 0.5246421098709106,
"num_tokens": 117169272.0,
"step": 378
},
{
"epoch": 2.828358208955224,
"grad_norm": 0.29264065144374635,
"learning_rate": 4.145266072241365e-07,
"loss": 0.5522100329399109,
"num_tokens": 118096842.0,
"step": 379
},
{
"epoch": 2.835820895522388,
"grad_norm": 0.28815451535722664,
"learning_rate": 4.1228532159895146e-07,
"loss": 0.5797725915908813,
"num_tokens": 119100115.0,
"step": 380
},
{
"epoch": 2.843283582089552,
"grad_norm": 0.3003290343487832,
"learning_rate": 4.100477989376042e-07,
"loss": 0.5710124969482422,
"num_tokens": 120047947.0,
"step": 381
},
{
"epoch": 2.8507462686567164,
"grad_norm": 0.3124248817525316,
"learning_rate": 4.0781410037894305e-07,
"loss": 0.5675666332244873,
"num_tokens": 120918603.0,
"step": 382
},
{
"epoch": 2.8582089552238807,
"grad_norm": 0.29192127163210346,
"learning_rate": 4.0558428695732563e-07,
"loss": 0.5678380727767944,
"num_tokens": 121705889.0,
"step": 383
},
{
"epoch": 2.8656716417910446,
"grad_norm": 0.45590175900628427,
"learning_rate": 4.033584196009502e-07,
"loss": 0.5677410960197449,
"num_tokens": 122662818.0,
"step": 384
},
{
"epoch": 2.873134328358209,
"grad_norm": 0.2630633298189877,
"learning_rate": 4.0113655913019173e-07,
"loss": 0.5765926837921143,
"num_tokens": 123634255.0,
"step": 385
},
{
"epoch": 2.8805970149253732,
"grad_norm": 0.4068807334874914,
"learning_rate": 3.989187662559397e-07,
"loss": 0.5568211078643799,
"num_tokens": 124693287.0,
"step": 386
},
{
"epoch": 2.888059701492537,
"grad_norm": 0.2899519882101068,
"learning_rate": 3.967051015779389e-07,
"loss": 0.5638155937194824,
"num_tokens": 125727015.0,
"step": 387
},
{
"epoch": 2.8955223880597014,
"grad_norm": 0.27969326214227774,
"learning_rate": 3.944956255831342e-07,
"loss": 0.5610464215278625,
"num_tokens": 126569685.0,
"step": 388
},
{
"epoch": 2.9029850746268657,
"grad_norm": 0.2771191212457944,
"learning_rate": 3.9229039864401703e-07,
"loss": 0.5670617818832397,
"num_tokens": 127486971.0,
"step": 389
},
{
"epoch": 2.91044776119403,
"grad_norm": 0.28740458029106764,
"learning_rate": 3.900894810169766e-07,
"loss": 0.573495626449585,
"num_tokens": 128449869.0,
"step": 390
},
{
"epoch": 2.917910447761194,
"grad_norm": 0.3229624542793302,
"learning_rate": 3.8789293284065236e-07,
"loss": 0.5427689552307129,
"num_tokens": 129068910.0,
"step": 391
},
{
"epoch": 2.925373134328358,
"grad_norm": 0.28158062590946553,
"learning_rate": 3.85700814134292e-07,
"loss": 0.5718903541564941,
"num_tokens": 129934302.0,
"step": 392
},
{
"epoch": 2.9328358208955225,
"grad_norm": 0.29563596535877035,
"learning_rate": 3.8351318479611037e-07,
"loss": 0.5753850340843201,
"num_tokens": 130851893.0,
"step": 393
},
{
"epoch": 2.9402985074626864,
"grad_norm": 0.2766878486514577,
"learning_rate": 3.813301046016536e-07,
"loss": 0.5622212886810303,
"num_tokens": 131790942.0,
"step": 394
},
{
"epoch": 2.9477611940298507,
"grad_norm": 0.31331114946966404,
"learning_rate": 3.7915163320216506e-07,
"loss": 0.5439543724060059,
"num_tokens": 132669917.0,
"step": 395
},
{
"epoch": 2.955223880597015,
"grad_norm": 0.3302390373570611,
"learning_rate": 3.7697783012295614e-07,
"loss": 0.560044527053833,
"num_tokens": 133626565.0,
"step": 396
},
{
"epoch": 2.9626865671641793,
"grad_norm": 0.2829029075854862,
"learning_rate": 3.7480875476177944e-07,
"loss": 0.5379583835601807,
"num_tokens": 134404690.0,
"step": 397
},
{
"epoch": 2.970149253731343,
"grad_norm": 0.27234325967681716,
"learning_rate": 3.7264446638720537e-07,
"loss": 0.5365550518035889,
"num_tokens": 135338794.0,
"step": 398
},
{
"epoch": 2.9776119402985075,
"grad_norm": 0.2767187314816525,
"learning_rate": 3.7048502413700343e-07,
"loss": 0.5605146288871765,
"num_tokens": 136245478.0,
"step": 399
},
{
"epoch": 2.9850746268656714,
"grad_norm": 0.2660191705819811,
"learning_rate": 3.683304870165257e-07,
"loss": 0.5613399744033813,
"num_tokens": 137308357.0,
"step": 400
},
{
"epoch": 2.9925373134328357,
"grad_norm": 0.33641507483104477,
"learning_rate": 3.66180913897095e-07,
"loss": 0.5563279390335083,
"num_tokens": 138207663.0,
"step": 401
},
{
"epoch": 3.0,
"grad_norm": 0.2972265795339739,
"learning_rate": 3.640363635143957e-07,
"loss": 0.5664753913879395,
"num_tokens": 139009002.0,
"step": 402
},
{
"epoch": 3.0074626865671643,
"grad_norm": 0.2870907018242013,
"learning_rate": 3.6189689446686957e-07,
"loss": 0.5246941447257996,
"num_tokens": 139777882.0,
"step": 403
},
{
"epoch": 3.014925373134328,
"grad_norm": 0.2916504470337103,
"learning_rate": 3.5976256521411397e-07,
"loss": 0.5544458627700806,
"num_tokens": 140621165.0,
"step": 404
},
{
"epoch": 3.0223880597014925,
"grad_norm": 0.6265255165946803,
"learning_rate": 3.576334340752847e-07,
"loss": 0.5519254207611084,
"num_tokens": 141606435.0,
"step": 405
},
{
"epoch": 3.029850746268657,
"grad_norm": 0.3145949002923175,
"learning_rate": 3.555095592275027e-07,
"loss": 0.5923848152160645,
"num_tokens": 142396631.0,
"step": 406
},
{
"epoch": 3.0373134328358207,
"grad_norm": 0.2812921044098366,
"learning_rate": 3.5339099870426415e-07,
"loss": 0.586621105670929,
"num_tokens": 143299300.0,
"step": 407
},
{
"epoch": 3.044776119402985,
"grad_norm": 0.3253513362223779,
"learning_rate": 3.512778103938542e-07,
"loss": 0.5898826122283936,
"num_tokens": 144081283.0,
"step": 408
},
{
"epoch": 3.0522388059701493,
"grad_norm": 0.2829701335583593,
"learning_rate": 3.491700520377667e-07,
"loss": 0.5714683532714844,
"num_tokens": 144948721.0,
"step": 409
},
{
"epoch": 3.0597014925373136,
"grad_norm": 0.28049591422033593,
"learning_rate": 3.470677812291248e-07,
"loss": 0.5455187559127808,
"num_tokens": 145915502.0,
"step": 410
},
{
"epoch": 3.0671641791044775,
"grad_norm": 0.3105939716549867,
"learning_rate": 3.4497105541110846e-07,
"loss": 0.5836495161056519,
"num_tokens": 146848524.0,
"step": 411
},
{
"epoch": 3.074626865671642,
"grad_norm": 0.32302139932598506,
"learning_rate": 3.428799318753844e-07,
"loss": 0.5365943908691406,
"num_tokens": 147673557.0,
"step": 412
},
{
"epoch": 3.082089552238806,
"grad_norm": 0.2868589436031599,
"learning_rate": 3.407944677605399e-07,
"loss": 0.6071346998214722,
"num_tokens": 148546880.0,
"step": 413
},
{
"epoch": 3.08955223880597,
"grad_norm": 1.1120530008183112,
"learning_rate": 3.3871472005052315e-07,
"loss": 0.5794011354446411,
"num_tokens": 149372372.0,
"step": 414
},
{
"epoch": 3.0970149253731343,
"grad_norm": 0.29942808195408777,
"learning_rate": 3.3664074557308484e-07,
"loss": 0.609196662902832,
"num_tokens": 150192682.0,
"step": 415
},
{
"epoch": 3.1044776119402986,
"grad_norm": 0.26999532517075925,
"learning_rate": 3.345726009982262e-07,
"loss": 0.5523053407669067,
"num_tokens": 151127243.0,
"step": 416
},
{
"epoch": 3.111940298507463,
"grad_norm": 0.27391131798248525,
"learning_rate": 3.325103428366494e-07,
"loss": 0.5864978432655334,
"num_tokens": 152087089.0,
"step": 417
},
{
"epoch": 3.1194029850746268,
"grad_norm": 0.4729472521720116,
"learning_rate": 3.3045402743821503e-07,
"loss": 0.5435307025909424,
"num_tokens": 153037095.0,
"step": 418
},
{
"epoch": 3.126865671641791,
"grad_norm": 0.2613169380583225,
"learning_rate": 3.284037109904013e-07,
"loss": 0.5703420042991638,
"num_tokens": 153970950.0,
"step": 419
},
{
"epoch": 3.1343283582089554,
"grad_norm": 0.26928897911274874,
"learning_rate": 3.2635944951676874e-07,
"loss": 0.5258716344833374,
"num_tokens": 154860955.0,
"step": 420
},
{
"epoch": 3.1417910447761193,
"grad_norm": 0.2763903661377402,
"learning_rate": 3.243212988754302e-07,
"loss": 0.5877372026443481,
"num_tokens": 155783554.0,
"step": 421
},
{
"epoch": 3.1492537313432836,
"grad_norm": 0.29336310605505883,
"learning_rate": 3.2228931475752317e-07,
"loss": 0.5202987790107727,
"num_tokens": 156633643.0,
"step": 422
},
{
"epoch": 3.156716417910448,
"grad_norm": 0.25416164853887924,
"learning_rate": 3.2026355268568985e-07,
"loss": 0.5262839794158936,
"num_tokens": 157509233.0,
"step": 423
},
{
"epoch": 3.1641791044776117,
"grad_norm": 0.2833930281713182,
"learning_rate": 3.1824406801255833e-07,
"loss": 0.541146993637085,
"num_tokens": 158408189.0,
"step": 424
},
{
"epoch": 3.171641791044776,
"grad_norm": 0.27826344635794753,
"learning_rate": 3.1623091591923155e-07,
"loss": 0.5324054956436157,
"num_tokens": 159344619.0,
"step": 425
},
{
"epoch": 3.1791044776119404,
"grad_norm": 0.2641432062082384,
"learning_rate": 3.142241514137781e-07,
"loss": 0.512749969959259,
"num_tokens": 160147804.0,
"step": 426
},
{
"epoch": 3.1865671641791047,
"grad_norm": 0.29499294962840417,
"learning_rate": 3.1222382932973044e-07,
"loss": 0.5644066333770752,
"num_tokens": 161152253.0,
"step": 427
},
{
"epoch": 3.1940298507462686,
"grad_norm": 0.40120876254377613,
"learning_rate": 3.1023000432458594e-07,
"loss": 0.5188844203948975,
"num_tokens": 161912590.0,
"step": 428
},
{
"epoch": 3.201492537313433,
"grad_norm": 0.2760447831486433,
"learning_rate": 3.082427308783133e-07,
"loss": 0.581289529800415,
"num_tokens": 162873772.0,
"step": 429
},
{
"epoch": 3.208955223880597,
"grad_norm": 0.38218690237559466,
"learning_rate": 3.0626206329186475e-07,
"loss": 0.5367913246154785,
"num_tokens": 163747353.0,
"step": 430
},
{
"epoch": 3.216417910447761,
"grad_norm": 0.2964101267001395,
"learning_rate": 3.042880556856907e-07,
"loss": 0.5629439353942871,
"num_tokens": 164553836.0,
"step": 431
},
{
"epoch": 3.2238805970149254,
"grad_norm": 0.2759084567994263,
"learning_rate": 3.023207619982628e-07,
"loss": 0.5370494723320007,
"num_tokens": 165403798.0,
"step": 432
},
{
"epoch": 3.2313432835820897,
"grad_norm": 0.5359782382978606,
"learning_rate": 3.003602359845989e-07,
"loss": 0.5838747620582581,
"num_tokens": 166345805.0,
"step": 433
},
{
"epoch": 3.2388059701492535,
"grad_norm": 0.3148655670627395,
"learning_rate": 2.9840653121479474e-07,
"loss": 0.5563722848892212,
"num_tokens": 167178202.0,
"step": 434
},
{
"epoch": 3.246268656716418,
"grad_norm": 0.25689828139696275,
"learning_rate": 2.964597010725599e-07,
"loss": 0.5305824875831604,
"num_tokens": 168180314.0,
"step": 435
},
{
"epoch": 3.253731343283582,
"grad_norm": 0.2613039059579068,
"learning_rate": 2.945197987537591e-07,
"loss": 0.5461628437042236,
"num_tokens": 169040092.0,
"step": 436
},
{
"epoch": 3.2611940298507465,
"grad_norm": 0.2974430760623621,
"learning_rate": 2.9258687726495905e-07,
"loss": 0.5644657611846924,
"num_tokens": 169917341.0,
"step": 437
},
{
"epoch": 3.2686567164179103,
"grad_norm": 0.2609767450471703,
"learning_rate": 2.9066098942197993e-07,
"loss": 0.5402700901031494,
"num_tokens": 170787209.0,
"step": 438
},
{
"epoch": 3.2761194029850746,
"grad_norm": 0.3010762400531778,
"learning_rate": 2.8874218784845154e-07,
"loss": 0.560728907585144,
"num_tokens": 171730223.0,
"step": 439
},
{
"epoch": 3.283582089552239,
"grad_norm": 0.2799906205843846,
"learning_rate": 2.868305249743766e-07,
"loss": 0.5792785882949829,
"num_tokens": 172620879.0,
"step": 440
},
{
"epoch": 3.291044776119403,
"grad_norm": 0.27973702443764925,
"learning_rate": 2.849260530346973e-07,
"loss": 0.5594302415847778,
"num_tokens": 173513731.0,
"step": 441
},
{
"epoch": 3.298507462686567,
"grad_norm": 0.2604858147066468,
"learning_rate": 2.830288240678682e-07,
"loss": 0.5618335008621216,
"num_tokens": 174466652.0,
"step": 442
},
{
"epoch": 3.3059701492537314,
"grad_norm": 0.28140102117529975,
"learning_rate": 2.8113888991443446e-07,
"loss": 0.5599676370620728,
"num_tokens": 175305008.0,
"step": 443
},
{
"epoch": 3.3134328358208958,
"grad_norm": 0.26261993494203545,
"learning_rate": 2.7925630221561505e-07,
"loss": 0.5589733719825745,
"num_tokens": 176287960.0,
"step": 444
},
{
"epoch": 3.3208955223880596,
"grad_norm": 0.2691459794281276,
"learning_rate": 2.773811124118918e-07,
"loss": 0.5410393476486206,
"num_tokens": 177240918.0,
"step": 445
},
{
"epoch": 3.328358208955224,
"grad_norm": 0.27073917117802515,
"learning_rate": 2.7551337174160425e-07,
"loss": 0.550033688545227,
"num_tokens": 178155824.0,
"step": 446
},
{
"epoch": 3.3358208955223883,
"grad_norm": 0.27889915727953724,
"learning_rate": 2.736531312395491e-07,
"loss": 0.5926166772842407,
"num_tokens": 179172034.0,
"step": 447
},
{
"epoch": 3.343283582089552,
"grad_norm": 0.28321328409431284,
"learning_rate": 2.718004417355855e-07,
"loss": 0.5419124960899353,
"num_tokens": 180085508.0,
"step": 448
},
{
"epoch": 3.3507462686567164,
"grad_norm": 0.25769815974818394,
"learning_rate": 2.6995535385324667e-07,
"loss": 0.5644470453262329,
"num_tokens": 181111200.0,
"step": 449
},
{
"epoch": 3.3582089552238807,
"grad_norm": 0.2645694828416433,
"learning_rate": 2.6811791800835684e-07,
"loss": 0.5500813722610474,
"num_tokens": 182084448.0,
"step": 450
},
{
"epoch": 3.3656716417910446,
"grad_norm": 0.3398596794365398,
"learning_rate": 2.6628818440765267e-07,
"loss": 0.5711795687675476,
"num_tokens": 183008409.0,
"step": 451
},
{
"epoch": 3.373134328358209,
"grad_norm": 0.28312078619309694,
"learning_rate": 2.6446620304741265e-07,
"loss": 0.49891045689582825,
"num_tokens": 183851194.0,
"step": 452
},
{
"epoch": 3.3805970149253732,
"grad_norm": 0.30224336146290764,
"learning_rate": 2.626520237120898e-07,
"loss": 0.5533944368362427,
"num_tokens": 184757031.0,
"step": 453
},
{
"epoch": 3.388059701492537,
"grad_norm": 0.2635231520858661,
"learning_rate": 2.6084569597295224e-07,
"loss": 0.5472126007080078,
"num_tokens": 185664557.0,
"step": 454
},
{
"epoch": 3.3955223880597014,
"grad_norm": 0.38599423556926155,
"learning_rate": 2.590472691867284e-07,
"loss": 0.5409133434295654,
"num_tokens": 186629979.0,
"step": 455
},
{
"epoch": 3.4029850746268657,
"grad_norm": 0.2607919939858689,
"learning_rate": 2.57256792494258e-07,
"loss": 0.5315978527069092,
"num_tokens": 187564334.0,
"step": 456
},
{
"epoch": 3.41044776119403,
"grad_norm": 0.2805243275573726,
"learning_rate": 2.554743148191497e-07,
"loss": 0.5706053376197815,
"num_tokens": 188489207.0,
"step": 457
},
{
"epoch": 3.417910447761194,
"grad_norm": 0.27087210896579755,
"learning_rate": 2.5369988486644446e-07,
"loss": 0.5453130006790161,
"num_tokens": 189462195.0,
"step": 458
},
{
"epoch": 3.425373134328358,
"grad_norm": 0.2701409088054617,
"learning_rate": 2.5193355112128434e-07,
"loss": 0.5617469549179077,
"num_tokens": 190385848.0,
"step": 459
},
{
"epoch": 3.4328358208955225,
"grad_norm": 0.27610398300578565,
"learning_rate": 2.501753618475877e-07,
"loss": 0.5490225553512573,
"num_tokens": 191288170.0,
"step": 460
},
{
"epoch": 3.4402985074626864,
"grad_norm": 0.2791596724267742,
"learning_rate": 2.4842536508673086e-07,
"loss": 0.5552560091018677,
"num_tokens": 192212892.0,
"step": 461
},
{
"epoch": 3.4477611940298507,
"grad_norm": 0.2818729998392585,
"learning_rate": 2.4668360865623443e-07,
"loss": 0.5438352227210999,
"num_tokens": 193052710.0,
"step": 462
},
{
"epoch": 3.455223880597015,
"grad_norm": 0.29076648116800047,
"learning_rate": 2.4495014014845805e-07,
"loss": 0.5421488285064697,
"num_tokens": 193971934.0,
"step": 463
},
{
"epoch": 3.4626865671641793,
"grad_norm": 0.35528872999469885,
"learning_rate": 2.432250069292989e-07,
"loss": 0.5626663565635681,
"num_tokens": 194797236.0,
"step": 464
},
{
"epoch": 3.470149253731343,
"grad_norm": 0.2931920860035253,
"learning_rate": 2.4150825613689786e-07,
"loss": 0.575283944606781,
"num_tokens": 195700091.0,
"step": 465
},
{
"epoch": 3.4776119402985075,
"grad_norm": 0.2696233856000557,
"learning_rate": 2.397999346803518e-07,
"loss": 0.5804455280303955,
"num_tokens": 196607890.0,
"step": 466
},
{
"epoch": 3.485074626865672,
"grad_norm": 0.2612143539512627,
"learning_rate": 2.3810008923843075e-07,
"loss": 0.5534828901290894,
"num_tokens": 197595641.0,
"step": 467
},
{
"epoch": 3.4925373134328357,
"grad_norm": 0.3003385075218394,
"learning_rate": 2.3640876625830382e-07,
"loss": 0.5445773601531982,
"num_tokens": 198539047.0,
"step": 468
},
{
"epoch": 3.5,
"grad_norm": 0.27091005264565227,
"learning_rate": 2.347260119542692e-07,
"loss": 0.5666298866271973,
"num_tokens": 199529062.0,
"step": 469
},
{
"epoch": 3.5074626865671643,
"grad_norm": 0.2856115330105266,
"learning_rate": 2.3305187230649173e-07,
"loss": 0.5649522542953491,
"num_tokens": 200452743.0,
"step": 470
},
{
"epoch": 3.5149253731343286,
"grad_norm": 0.26678768606458664,
"learning_rate": 2.3138639305974592e-07,
"loss": 0.5633753538131714,
"num_tokens": 201375863.0,
"step": 471
},
{
"epoch": 3.5223880597014925,
"grad_norm": 0.24743082299382807,
"learning_rate": 2.29729619722167e-07,
"loss": 0.5463535785675049,
"num_tokens": 202342633.0,
"step": 472
},
{
"epoch": 3.529850746268657,
"grad_norm": 0.2849265779918233,
"learning_rate": 2.2808159756400664e-07,
"loss": 0.5450330376625061,
"num_tokens": 203206779.0,
"step": 473
},
{
"epoch": 3.5373134328358207,
"grad_norm": 0.2959172618289051,
"learning_rate": 2.264423716163962e-07,
"loss": 0.5645024180412292,
"num_tokens": 204166390.0,
"step": 474
},
{
"epoch": 3.544776119402985,
"grad_norm": 0.30652775818807565,
"learning_rate": 2.248119866701167e-07,
"loss": 0.6000641584396362,
"num_tokens": 205136083.0,
"step": 475
},
{
"epoch": 3.5522388059701493,
"grad_norm": 0.2649068122185818,
"learning_rate": 2.231904872743739e-07,
"loss": 0.563923180103302,
"num_tokens": 206063663.0,
"step": 476
},
{
"epoch": 3.5597014925373136,
"grad_norm": 0.30105804677037196,
"learning_rate": 2.2157791773558222e-07,
"loss": 0.5499534606933594,
"num_tokens": 207014752.0,
"step": 477
},
{
"epoch": 3.5671641791044775,
"grad_norm": 0.2719779878868544,
"learning_rate": 2.1997432211615324e-07,
"loss": 0.5947707891464233,
"num_tokens": 208002031.0,
"step": 478
},
{
"epoch": 3.574626865671642,
"grad_norm": 0.27217392575188515,
"learning_rate": 2.1837974423329254e-07,
"loss": 0.5516700744628906,
"num_tokens": 208865142.0,
"step": 479
},
{
"epoch": 3.582089552238806,
"grad_norm": 0.37099057189831747,
"learning_rate": 2.1679422765780113e-07,
"loss": 0.557658851146698,
"num_tokens": 209775786.0,
"step": 480
},
{
"epoch": 3.58955223880597,
"grad_norm": 0.27880227675709274,
"learning_rate": 2.1521781571288644e-07,
"loss": 0.5569248199462891,
"num_tokens": 210690185.0,
"step": 481
},
{
"epoch": 3.5970149253731343,
"grad_norm": 0.27705864225434784,
"learning_rate": 2.136505514729774e-07,
"loss": 0.5474062561988831,
"num_tokens": 211593514.0,
"step": 482
},
{
"epoch": 3.6044776119402986,
"grad_norm": 0.5224112991843156,
"learning_rate": 2.120924777625479e-07,
"loss": 0.5869604349136353,
"num_tokens": 212523789.0,
"step": 483
},
{
"epoch": 3.611940298507463,
"grad_norm": 0.2577684602730275,
"learning_rate": 2.1054363715494693e-07,
"loss": 0.5051690340042114,
"num_tokens": 213378219.0,
"step": 484
},
{
"epoch": 3.6194029850746268,
"grad_norm": 0.28941698042641906,
"learning_rate": 2.090040719712344e-07,
"loss": 0.5571575164794922,
"num_tokens": 214172580.0,
"step": 485
},
{
"epoch": 3.626865671641791,
"grad_norm": 0.4260866776138794,
"learning_rate": 2.0747382427902572e-07,
"loss": 0.5927166938781738,
"num_tokens": 214985813.0,
"step": 486
},
{
"epoch": 3.6343283582089554,
"grad_norm": 0.3024307973363642,
"learning_rate": 2.0595293589134176e-07,
"loss": 0.5418879985809326,
"num_tokens": 215879921.0,
"step": 487
},
{
"epoch": 3.6417910447761193,
"grad_norm": 0.2780348633827344,
"learning_rate": 2.044414483654668e-07,
"loss": 0.5637257695198059,
"num_tokens": 216746710.0,
"step": 488
},
{
"epoch": 3.6492537313432836,
"grad_norm": 0.36018093543345575,
"learning_rate": 2.0293940300181212e-07,
"loss": 0.5574115514755249,
"num_tokens": 217502005.0,
"step": 489
},
{
"epoch": 3.656716417910448,
"grad_norm": 0.36563238506727497,
"learning_rate": 2.0144684084278846e-07,
"loss": 0.5422406792640686,
"num_tokens": 218245391.0,
"step": 490
},
{
"epoch": 3.664179104477612,
"grad_norm": 0.26076893929904454,
"learning_rate": 1.9996380267168416e-07,
"loss": 0.5316330194473267,
"num_tokens": 219197443.0,
"step": 491
},
{
"epoch": 3.671641791044776,
"grad_norm": 0.2756117047272477,
"learning_rate": 1.9849032901155073e-07,
"loss": 0.576492965221405,
"num_tokens": 220198270.0,
"step": 492
},
{
"epoch": 3.6791044776119404,
"grad_norm": 0.3104844021800582,
"learning_rate": 1.9702646012409576e-07,
"loss": 0.5465894937515259,
"num_tokens": 221120937.0,
"step": 493
},
{
"epoch": 3.6865671641791042,
"grad_norm": 0.28862702731917284,
"learning_rate": 1.9557223600858236e-07,
"loss": 0.562412679195404,
"num_tokens": 222035264.0,
"step": 494
},
{
"epoch": 3.6940298507462686,
"grad_norm": 0.2990711536721463,
"learning_rate": 1.9412769640073686e-07,
"loss": 0.6177443265914917,
"num_tokens": 222924164.0,
"step": 495
},
{
"epoch": 3.701492537313433,
"grad_norm": 0.29123869800526553,
"learning_rate": 1.9269288077166264e-07,
"loss": 0.6057195067405701,
"num_tokens": 223814612.0,
"step": 496
},
{
"epoch": 3.708955223880597,
"grad_norm": 0.30962841555286785,
"learning_rate": 1.9126782832676173e-07,
"loss": 0.5551049709320068,
"num_tokens": 224678747.0,
"step": 497
},
{
"epoch": 3.716417910447761,
"grad_norm": 0.31789120255947023,
"learning_rate": 1.8985257800466348e-07,
"loss": 0.5476455092430115,
"num_tokens": 225631946.0,
"step": 498
},
{
"epoch": 3.7238805970149254,
"grad_norm": 0.2781242088040202,
"learning_rate": 1.8844716847616052e-07,
"loss": 0.5808273553848267,
"num_tokens": 226600195.0,
"step": 499
},
{
"epoch": 3.7313432835820897,
"grad_norm": 0.2954115885416602,
"learning_rate": 1.8705163814315228e-07,
"loss": 0.5603234767913818,
"num_tokens": 227420424.0,
"step": 500
},
{
"epoch": 3.7388059701492535,
"grad_norm": 0.26394812594414746,
"learning_rate": 1.856660251375957e-07,
"loss": 0.5475826263427734,
"num_tokens": 228393641.0,
"step": 501
},
{
"epoch": 3.746268656716418,
"grad_norm": 0.25638710494334194,
"learning_rate": 1.8429036732046327e-07,
"loss": 0.5315807461738586,
"num_tokens": 229383627.0,
"step": 502
},
{
"epoch": 3.753731343283582,
"grad_norm": 0.2820486917822845,
"learning_rate": 1.8292470228070805e-07,
"loss": 0.555698037147522,
"num_tokens": 230312293.0,
"step": 503
},
{
"epoch": 3.7611940298507465,
"grad_norm": 0.26963402638209555,
"learning_rate": 1.8156906733423738e-07,
"loss": 0.5559597611427307,
"num_tokens": 231227207.0,
"step": 504
},
{
"epoch": 3.7686567164179103,
"grad_norm": 0.2635960871590189,
"learning_rate": 1.8022349952289273e-07,
"loss": 0.5315006971359253,
"num_tokens": 232129690.0,
"step": 505
},
{
"epoch": 3.7761194029850746,
"grad_norm": 0.28835042727096805,
"learning_rate": 1.7888803561343751e-07,
"loss": 0.5724339485168457,
"num_tokens": 232988180.0,
"step": 506
},
{
"epoch": 3.783582089552239,
"grad_norm": 0.2880295787825169,
"learning_rate": 1.7756271209655294e-07,
"loss": 0.6089663505554199,
"num_tokens": 233887459.0,
"step": 507
},
{
"epoch": 3.791044776119403,
"grad_norm": 0.2923917838459817,
"learning_rate": 1.7624756518584013e-07,
"loss": 0.5508089065551758,
"num_tokens": 234724288.0,
"step": 508
},
{
"epoch": 3.798507462686567,
"grad_norm": 0.2663262112656659,
"learning_rate": 1.7494263081683131e-07,
"loss": 0.5383226871490479,
"num_tokens": 235591261.0,
"step": 509
},
{
"epoch": 3.8059701492537314,
"grad_norm": 0.2810986277113732,
"learning_rate": 1.7364794464600808e-07,
"loss": 0.5323266983032227,
"num_tokens": 236513360.0,
"step": 510
},
{
"epoch": 3.8134328358208958,
"grad_norm": 0.3206761651223088,
"learning_rate": 1.7236354204982587e-07,
"loss": 0.5368841290473938,
"num_tokens": 237427821.0,
"step": 511
},
{
"epoch": 3.8208955223880596,
"grad_norm": 0.2831284138508691,
"learning_rate": 1.7108945812374873e-07,
"loss": 0.5697877407073975,
"num_tokens": 238361524.0,
"step": 512
},
{
"epoch": 3.828358208955224,
"grad_norm": 0.4678969236120579,
"learning_rate": 1.698257276812896e-07,
"loss": 0.567964494228363,
"num_tokens": 239295734.0,
"step": 513
},
{
"epoch": 3.835820895522388,
"grad_norm": 0.3454163016526369,
"learning_rate": 1.6857238525305922e-07,
"loss": 0.5614358186721802,
"num_tokens": 240192414.0,
"step": 514
},
{
"epoch": 3.843283582089552,
"grad_norm": 0.26229554588086373,
"learning_rate": 1.6732946508582286e-07,
"loss": 0.5396016836166382,
"num_tokens": 241149058.0,
"step": 515
},
{
"epoch": 3.8507462686567164,
"grad_norm": 0.25830144071410005,
"learning_rate": 1.6609700114156368e-07,
"loss": 0.548250675201416,
"num_tokens": 242110168.0,
"step": 516
},
{
"epoch": 3.8582089552238807,
"grad_norm": 0.2577863607708949,
"learning_rate": 1.648750270965559e-07,
"loss": 0.5675839185714722,
"num_tokens": 243142913.0,
"step": 517
},
{
"epoch": 3.8656716417910446,
"grad_norm": 1.2576352530776749,
"learning_rate": 1.6366357634044403e-07,
"loss": 0.5479030609130859,
"num_tokens": 244026260.0,
"step": 518
},
{
"epoch": 3.873134328358209,
"grad_norm": 0.43195789720816985,
"learning_rate": 1.6246268197533046e-07,
"loss": 0.5657459497451782,
"num_tokens": 244835255.0,
"step": 519
},
{
"epoch": 3.8805970149253732,
"grad_norm": 0.2850328915996735,
"learning_rate": 1.6127237681487092e-07,
"loss": 0.5788131952285767,
"num_tokens": 245744839.0,
"step": 520
},
{
"epoch": 3.888059701492537,
"grad_norm": 0.2734777728630755,
"learning_rate": 1.600926933833783e-07,
"loss": 0.5809911489486694,
"num_tokens": 246688392.0,
"step": 521
},
{
"epoch": 3.8955223880597014,
"grad_norm": 0.2933774394372255,
"learning_rate": 1.5892366391493362e-07,
"loss": 0.5803858637809753,
"num_tokens": 247632902.0,
"step": 522
},
{
"epoch": 3.9029850746268657,
"grad_norm": 0.2903603425087314,
"learning_rate": 1.5776532035250513e-07,
"loss": 0.5569208860397339,
"num_tokens": 248582604.0,
"step": 523
},
{
"epoch": 3.91044776119403,
"grad_norm": 0.2633779070798848,
"learning_rate": 1.5661769434707583e-07,
"loss": 0.5375438928604126,
"num_tokens": 249449227.0,
"step": 524
},
{
"epoch": 3.917910447761194,
"grad_norm": 0.31705662168623416,
"learning_rate": 1.5548081725677842e-07,
"loss": 0.5713478326797485,
"num_tokens": 250309108.0,
"step": 525
},
{
"epoch": 3.925373134328358,
"grad_norm": 0.2743174912786303,
"learning_rate": 1.5435472014603838e-07,
"loss": 0.5781571865081787,
"num_tokens": 251284224.0,
"step": 526
},
{
"epoch": 3.9328358208955225,
"grad_norm": 0.2781053851258556,
"learning_rate": 1.5323943378472546e-07,
"loss": 0.5639330148696899,
"num_tokens": 252205748.0,
"step": 527
},
{
"epoch": 3.9402985074626864,
"grad_norm": 0.27546182816312653,
"learning_rate": 1.5213498864731265e-07,
"loss": 0.5076487064361572,
"num_tokens": 253011360.0,
"step": 528
},
{
"epoch": 3.9477611940298507,
"grad_norm": 0.2724349176732762,
"learning_rate": 1.5104141491204357e-07,
"loss": 0.5303751230239868,
"num_tokens": 253876442.0,
"step": 529
},
{
"epoch": 3.955223880597015,
"grad_norm": 0.29045934843508436,
"learning_rate": 1.4995874246010776e-07,
"loss": 0.5791366100311279,
"num_tokens": 254702285.0,
"step": 530
},
{
"epoch": 3.9626865671641793,
"grad_norm": 0.25781806828726467,
"learning_rate": 1.4888700087482444e-07,
"loss": 0.5378929376602173,
"num_tokens": 255641666.0,
"step": 531
},
{
"epoch": 3.970149253731343,
"grad_norm": 0.2657141262373794,
"learning_rate": 1.4782621944083392e-07,
"loss": 0.5480854511260986,
"num_tokens": 256570444.0,
"step": 532
},
{
"epoch": 3.9776119402985075,
"grad_norm": 0.27332662946272,
"learning_rate": 1.467764271432977e-07,
"loss": 0.5349365472793579,
"num_tokens": 257440148.0,
"step": 533
},
{
"epoch": 3.9850746268656714,
"grad_norm": 0.27927002916412313,
"learning_rate": 1.4573765266710598e-07,
"loss": 0.5557724237442017,
"num_tokens": 258286072.0,
"step": 534
},
{
"epoch": 3.9925373134328357,
"grad_norm": 0.30540619594506574,
"learning_rate": 1.4470992439609444e-07,
"loss": 0.5325461626052856,
"num_tokens": 259027900.0,
"step": 535
},
{
"epoch": 4.0,
"grad_norm": 0.2554852841852584,
"learning_rate": 1.4369327041226831e-07,
"loss": 0.5564035177230835,
"num_tokens": 260054853.0,
"step": 536
},
{
"epoch": 4.007462686567164,
"grad_norm": 0.3774222258568171,
"learning_rate": 1.4268771849503506e-07,
"loss": 0.5198606252670288,
"num_tokens": 260848096.0,
"step": 537
},
{
"epoch": 4.014925373134329,
"grad_norm": 0.42926748854255425,
"learning_rate": 1.4169329612044566e-07,
"loss": 0.5263375043869019,
"num_tokens": 261795072.0,
"step": 538
},
{
"epoch": 4.022388059701493,
"grad_norm": 0.28442769544889995,
"learning_rate": 1.4071003046044322e-07,
"loss": 0.5481403470039368,
"num_tokens": 262649190.0,
"step": 539
},
{
"epoch": 4.029850746268656,
"grad_norm": 0.33957785588251854,
"learning_rate": 1.397379483821212e-07,
"loss": 0.5446444749832153,
"num_tokens": 263639124.0,
"step": 540
},
{
"epoch": 4.037313432835821,
"grad_norm": 0.31285290824877576,
"learning_rate": 1.3877707644698893e-07,
"loss": 0.5856173038482666,
"num_tokens": 264480175.0,
"step": 541
},
{
"epoch": 4.044776119402985,
"grad_norm": 0.3146654769143484,
"learning_rate": 1.3782744091024584e-07,
"loss": 0.5640919208526611,
"num_tokens": 265217661.0,
"step": 542
},
{
"epoch": 4.052238805970149,
"grad_norm": 0.2543877235448555,
"learning_rate": 1.3688906772006393e-07,
"loss": 0.545689582824707,
"num_tokens": 266155540.0,
"step": 543
},
{
"epoch": 4.059701492537314,
"grad_norm": 0.2831883473552513,
"learning_rate": 1.3596198251687917e-07,
"loss": 0.5562140941619873,
"num_tokens": 267051346.0,
"step": 544
},
{
"epoch": 4.067164179104478,
"grad_norm": 0.2634821254048522,
"learning_rate": 1.3504621063269057e-07,
"loss": 0.5558310747146606,
"num_tokens": 268001048.0,
"step": 545
},
{
"epoch": 4.074626865671641,
"grad_norm": 0.2740473313761773,
"learning_rate": 1.34141777090368e-07,
"loss": 0.5498157739639282,
"num_tokens": 268948251.0,
"step": 546
},
{
"epoch": 4.082089552238806,
"grad_norm": 0.3034010911009059,
"learning_rate": 1.3324870660296866e-07,
"loss": 0.5079299211502075,
"num_tokens": 269891870.0,
"step": 547
},
{
"epoch": 4.08955223880597,
"grad_norm": 0.25296398139696874,
"learning_rate": 1.3236702357306156e-07,
"loss": 0.557180643081665,
"num_tokens": 270893706.0,
"step": 548
},
{
"epoch": 4.097014925373134,
"grad_norm": 0.5617876126540791,
"learning_rate": 1.3149675209206084e-07,
"loss": 0.5518041253089905,
"num_tokens": 271655159.0,
"step": 549
},
{
"epoch": 4.104477611940299,
"grad_norm": 0.26496316816661675,
"learning_rate": 1.3063791593956756e-07,
"loss": 0.5603747367858887,
"num_tokens": 272587675.0,
"step": 550
},
{
"epoch": 4.111940298507463,
"grad_norm": 0.2601339382276699,
"learning_rate": 1.2979053858271993e-07,
"loss": 0.5405164957046509,
"num_tokens": 273463891.0,
"step": 551
},
{
"epoch": 4.119402985074627,
"grad_norm": 0.2795698909044753,
"learning_rate": 1.2895464317555206e-07,
"loss": 0.5839468240737915,
"num_tokens": 274283621.0,
"step": 552
},
{
"epoch": 4.126865671641791,
"grad_norm": 0.24753013274997315,
"learning_rate": 1.28130252558361e-07,
"loss": 0.5279031991958618,
"num_tokens": 275221184.0,
"step": 553
},
{
"epoch": 4.134328358208955,
"grad_norm": 0.28266540188156414,
"learning_rate": 1.2731738925708327e-07,
"loss": 0.5553559064865112,
"num_tokens": 276094732.0,
"step": 554
},
{
"epoch": 4.141791044776119,
"grad_norm": 0.2690824174144065,
"learning_rate": 1.265160754826787e-07,
"loss": 0.572119951248169,
"num_tokens": 277122845.0,
"step": 555
},
{
"epoch": 4.149253731343284,
"grad_norm": 0.2472737773051283,
"learning_rate": 1.2572633313052409e-07,
"loss": 0.569811999797821,
"num_tokens": 278203814.0,
"step": 556
},
{
"epoch": 4.156716417910448,
"grad_norm": 0.2742488775651172,
"learning_rate": 1.249481837798144e-07,
"loss": 0.5402873754501343,
"num_tokens": 279084275.0,
"step": 557
},
{
"epoch": 4.164179104477612,
"grad_norm": 0.45576858034969514,
"learning_rate": 1.2418164869297352e-07,
"loss": 0.5487810373306274,
"num_tokens": 279995589.0,
"step": 558
},
{
"epoch": 4.1716417910447765,
"grad_norm": 0.33886009051736005,
"learning_rate": 1.2342674881507325e-07,
"loss": 0.5493899583816528,
"num_tokens": 280947164.0,
"step": 559
},
{
"epoch": 4.17910447761194,
"grad_norm": 0.2672882959311712,
"learning_rate": 1.226835047732607e-07,
"loss": 0.5846470594406128,
"num_tokens": 281865957.0,
"step": 560
},
{
"epoch": 4.186567164179104,
"grad_norm": 0.26682211241453646,
"learning_rate": 1.2195193687619503e-07,
"loss": 0.5684331655502319,
"num_tokens": 282822635.0,
"step": 561
},
{
"epoch": 4.1940298507462686,
"grad_norm": 0.5844175934203363,
"learning_rate": 1.212320651134921e-07,
"loss": 0.5448155403137207,
"num_tokens": 283735562.0,
"step": 562
},
{
"epoch": 4.201492537313433,
"grad_norm": 0.28667238801807454,
"learning_rate": 1.2052390915517878e-07,
"loss": 0.552519679069519,
"num_tokens": 284514293.0,
"step": 563
},
{
"epoch": 4.208955223880597,
"grad_norm": 0.27209099234946366,
"learning_rate": 1.198274883511551e-07,
"loss": 0.5868443250656128,
"num_tokens": 285496842.0,
"step": 564
},
{
"epoch": 4.2164179104477615,
"grad_norm": 0.282926588733111,
"learning_rate": 1.1914282173066572e-07,
"loss": 0.5723504424095154,
"num_tokens": 286397891.0,
"step": 565
},
{
"epoch": 4.223880597014926,
"grad_norm": 0.2770197201382834,
"learning_rate": 1.1846992800177977e-07,
"loss": 0.5528011918067932,
"num_tokens": 287291637.0,
"step": 566
},
{
"epoch": 4.231343283582089,
"grad_norm": 0.28361549703489075,
"learning_rate": 1.1780882555087987e-07,
"loss": 0.5853151082992554,
"num_tokens": 288241806.0,
"step": 567
},
{
"epoch": 4.2388059701492535,
"grad_norm": 0.2939822179720846,
"learning_rate": 1.1715953244215962e-07,
"loss": 0.5159034729003906,
"num_tokens": 289026082.0,
"step": 568
},
{
"epoch": 4.246268656716418,
"grad_norm": 0.27406466943349356,
"learning_rate": 1.1652206641713017e-07,
"loss": 0.5613383054733276,
"num_tokens": 289932433.0,
"step": 569
},
{
"epoch": 4.253731343283582,
"grad_norm": 0.26361825748402795,
"learning_rate": 1.1589644489413516e-07,
"loss": 0.5283357501029968,
"num_tokens": 290832565.0,
"step": 570
},
{
"epoch": 4.2611940298507465,
"grad_norm": 0.2760993964502923,
"learning_rate": 1.1528268496787496e-07,
"loss": 0.5750157833099365,
"num_tokens": 291807812.0,
"step": 571
},
{
"epoch": 4.268656716417911,
"grad_norm": 0.28258368565429337,
"learning_rate": 1.1468080340893957e-07,
"loss": 0.5445358157157898,
"num_tokens": 292611203.0,
"step": 572
},
{
"epoch": 4.276119402985074,
"grad_norm": 0.2730636420985558,
"learning_rate": 1.1409081666335033e-07,
"loss": 0.6081241369247437,
"num_tokens": 293563137.0,
"step": 573
},
{
"epoch": 4.2835820895522385,
"grad_norm": 0.2887060017580731,
"learning_rate": 1.1351274085211066e-07,
"loss": 0.5485525131225586,
"num_tokens": 294390720.0,
"step": 574
},
{
"epoch": 4.291044776119403,
"grad_norm": 0.3012747873135312,
"learning_rate": 1.1294659177076522e-07,
"loss": 0.5155702829360962,
"num_tokens": 295331187.0,
"step": 575
},
{
"epoch": 4.298507462686567,
"grad_norm": 0.2920426716987418,
"learning_rate": 1.1239238488896874e-07,
"loss": 0.5878146886825562,
"num_tokens": 296174930.0,
"step": 576
},
{
"epoch": 4.3059701492537314,
"grad_norm": 0.2717219604372653,
"learning_rate": 1.118501353500631e-07,
"loss": 0.5488337278366089,
"num_tokens": 296984260.0,
"step": 577
},
{
"epoch": 4.313432835820896,
"grad_norm": 0.2881268506141246,
"learning_rate": 1.1131985797066362e-07,
"loss": 0.5962164402008057,
"num_tokens": 297814492.0,
"step": 578
},
{
"epoch": 4.32089552238806,
"grad_norm": 0.3429404437909788,
"learning_rate": 1.1080156724025409e-07,
"loss": 0.5432817935943604,
"num_tokens": 298682103.0,
"step": 579
},
{
"epoch": 4.3283582089552235,
"grad_norm": 0.26831503316886746,
"learning_rate": 1.1029527732079083e-07,
"loss": 0.5613952875137329,
"num_tokens": 299706050.0,
"step": 580
},
{
"epoch": 4.335820895522388,
"grad_norm": 0.30116453227315987,
"learning_rate": 1.0980100204631603e-07,
"loss": 0.5974493026733398,
"num_tokens": 300500751.0,
"step": 581
},
{
"epoch": 4.343283582089552,
"grad_norm": 0.2574290282127498,
"learning_rate": 1.0931875492257944e-07,
"loss": 0.5080505609512329,
"num_tokens": 301436049.0,
"step": 582
},
{
"epoch": 4.350746268656716,
"grad_norm": 0.2768297952700454,
"learning_rate": 1.088485491266694e-07,
"loss": 0.5769013166427612,
"num_tokens": 302245987.0,
"step": 583
},
{
"epoch": 4.358208955223881,
"grad_norm": 0.2614099752067767,
"learning_rate": 1.0839039750665291e-07,
"loss": 0.5327722430229187,
"num_tokens": 303180329.0,
"step": 584
},
{
"epoch": 4.365671641791045,
"grad_norm": 0.2545576461755387,
"learning_rate": 1.0794431258122429e-07,
"loss": 0.5465179085731506,
"num_tokens": 304106987.0,
"step": 585
},
{
"epoch": 4.373134328358209,
"grad_norm": 0.3028807540174107,
"learning_rate": 1.0751030653936354e-07,
"loss": 0.5673878192901611,
"num_tokens": 304931031.0,
"step": 586
},
{
"epoch": 4.380597014925373,
"grad_norm": 0.2623967772914622,
"learning_rate": 1.0708839124000287e-07,
"loss": 0.5716835260391235,
"num_tokens": 305846255.0,
"step": 587
},
{
"epoch": 4.388059701492537,
"grad_norm": 0.2805523792578899,
"learning_rate": 1.066785782117028e-07,
"loss": 0.5245805978775024,
"num_tokens": 306627892.0,
"step": 588
},
{
"epoch": 4.395522388059701,
"grad_norm": 0.3121504154269003,
"learning_rate": 1.0628087865233737e-07,
"loss": 0.5411394238471985,
"num_tokens": 307519113.0,
"step": 589
},
{
"epoch": 4.402985074626866,
"grad_norm": 0.2913376914541736,
"learning_rate": 1.0589530342878769e-07,
"loss": 0.5592665672302246,
"num_tokens": 308359627.0,
"step": 590
},
{
"epoch": 4.41044776119403,
"grad_norm": 0.26729488095826903,
"learning_rate": 1.0552186307664565e-07,
"loss": 0.5448157787322998,
"num_tokens": 309250463.0,
"step": 591
},
{
"epoch": 4.417910447761194,
"grad_norm": 0.30314603500542503,
"learning_rate": 1.0516056779992541e-07,
"loss": 0.5698049664497375,
"num_tokens": 310094707.0,
"step": 592
},
{
"epoch": 4.425373134328359,
"grad_norm": 0.2786233963992514,
"learning_rate": 1.0481142747078492e-07,
"loss": 0.5542622804641724,
"num_tokens": 310932669.0,
"step": 593
},
{
"epoch": 4.432835820895522,
"grad_norm": 0.28934032270495663,
"learning_rate": 1.0447445162925613e-07,
"loss": 0.5697283744812012,
"num_tokens": 311864048.0,
"step": 594
},
{
"epoch": 4.440298507462686,
"grad_norm": 0.2500391659266081,
"learning_rate": 1.0414964948298435e-07,
"loss": 0.5528576374053955,
"num_tokens": 312840365.0,
"step": 595
},
{
"epoch": 4.447761194029851,
"grad_norm": 0.2916622377572014,
"learning_rate": 1.0383702990697656e-07,
"loss": 0.5366314649581909,
"num_tokens": 313795804.0,
"step": 596
},
{
"epoch": 4.455223880597015,
"grad_norm": 0.271646171110688,
"learning_rate": 1.035366014433589e-07,
"loss": 0.5479257106781006,
"num_tokens": 314648164.0,
"step": 597
},
{
"epoch": 4.462686567164179,
"grad_norm": 0.28220408400221697,
"learning_rate": 1.032483723011433e-07,
"loss": 0.5544242858886719,
"num_tokens": 315521665.0,
"step": 598
},
{
"epoch": 4.470149253731344,
"grad_norm": 0.32415992407130506,
"learning_rate": 1.0297235035600334e-07,
"loss": 0.5346230268478394,
"num_tokens": 316460972.0,
"step": 599
},
{
"epoch": 4.477611940298507,
"grad_norm": 0.29902595518304415,
"learning_rate": 1.0270854315005874e-07,
"loss": 0.5251238346099854,
"num_tokens": 317398198.0,
"step": 600
},
{
"epoch": 4.485074626865671,
"grad_norm": 0.27529822550906163,
"learning_rate": 1.0245695789166948e-07,
"loss": 0.550391674041748,
"num_tokens": 318368138.0,
"step": 601
},
{
"epoch": 4.492537313432836,
"grad_norm": 0.25678842717501,
"learning_rate": 1.0221760145523875e-07,
"loss": 0.5486523509025574,
"num_tokens": 319254359.0,
"step": 602
},
{
"epoch": 4.5,
"grad_norm": 0.30422330388419494,
"learning_rate": 1.0199048038102526e-07,
"loss": 0.5667173266410828,
"num_tokens": 320163581.0,
"step": 603
},
{
"epoch": 4.507462686567164,
"grad_norm": 0.2590403748368139,
"learning_rate": 1.0177560087496423e-07,
"loss": 0.5528400540351868,
"num_tokens": 321099182.0,
"step": 604
},
{
"epoch": 4.514925373134329,
"grad_norm": 0.2779131559748047,
"learning_rate": 1.0157296880849824e-07,
"loss": 0.5901874303817749,
"num_tokens": 321960509.0,
"step": 605
},
{
"epoch": 4.522388059701493,
"grad_norm": 0.29485819884103376,
"learning_rate": 1.0138258971841641e-07,
"loss": 0.5388875007629395,
"num_tokens": 322828287.0,
"step": 606
},
{
"epoch": 4.529850746268656,
"grad_norm": 0.2672600011322682,
"learning_rate": 1.0120446880670325e-07,
"loss": 0.5676090121269226,
"num_tokens": 323747430.0,
"step": 607
},
{
"epoch": 4.537313432835821,
"grad_norm": 0.2612827675126176,
"learning_rate": 1.0103861094039667e-07,
"loss": 0.5471125245094299,
"num_tokens": 324666159.0,
"step": 608
},
{
"epoch": 4.544776119402985,
"grad_norm": 0.27778633049164236,
"learning_rate": 1.008850206514547e-07,
"loss": 0.5418146848678589,
"num_tokens": 325560938.0,
"step": 609
},
{
"epoch": 4.552238805970149,
"grad_norm": 0.2969440332344535,
"learning_rate": 1.0074370213663201e-07,
"loss": 0.5470881462097168,
"num_tokens": 326330466.0,
"step": 610
},
{
"epoch": 4.559701492537314,
"grad_norm": 0.31944728844902104,
"learning_rate": 1.0061465925736478e-07,
"loss": 0.5502467155456543,
"num_tokens": 327193522.0,
"step": 611
},
{
"epoch": 4.567164179104478,
"grad_norm": 0.2519446230589244,
"learning_rate": 1.0049789553966569e-07,
"loss": 0.5561034679412842,
"num_tokens": 328181555.0,
"step": 612
},
{
"epoch": 4.574626865671641,
"grad_norm": 0.36293634063352515,
"learning_rate": 1.0039341417402715e-07,
"loss": 0.5579421520233154,
"num_tokens": 329048630.0,
"step": 613
},
{
"epoch": 4.582089552238806,
"grad_norm": 0.26255387893449383,
"learning_rate": 1.0030121801533441e-07,
"loss": 0.5714669823646545,
"num_tokens": 329968258.0,
"step": 614
},
{
"epoch": 4.58955223880597,
"grad_norm": 0.28393932823044754,
"learning_rate": 1.002213095827875e-07,
"loss": 0.542944610118866,
"num_tokens": 330754847.0,
"step": 615
},
{
"epoch": 4.597014925373134,
"grad_norm": 0.25741865731733854,
"learning_rate": 1.0015369105983216e-07,
"loss": 0.5193674564361572,
"num_tokens": 331683385.0,
"step": 616
},
{
"epoch": 4.604477611940299,
"grad_norm": 0.28009977442278555,
"learning_rate": 1.0009836429410053e-07,
"loss": 0.5400401949882507,
"num_tokens": 332655820.0,
"step": 617
},
{
"epoch": 4.611940298507463,
"grad_norm": 0.2840147519658574,
"learning_rate": 1.0005533079736037e-07,
"loss": 0.5117232203483582,
"num_tokens": 333524783.0,
"step": 618
},
{
"epoch": 4.619402985074627,
"grad_norm": 0.25572802620429147,
"learning_rate": 1.0002459174547398e-07,
"loss": 0.5419676303863525,
"num_tokens": 334451276.0,
"step": 619
},
{
"epoch": 4.6268656716417915,
"grad_norm": 0.2837927983840193,
"learning_rate": 1.0000614797836585e-07,
"loss": 0.5926541090011597,
"num_tokens": 335471551.0,
"step": 620
},
{
"epoch": 4.6268656716417915,
"step": 620,
"total_flos": 829202911068160.0,
"train_loss": 0.33651591361530364,
"train_runtime": 10914.7384,
"train_samples_per_second": 1.818,
"train_steps_per_second": 0.057
}
],
"logging_steps": 1,
"max_steps": 620,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 62,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 829202911068160.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}