CSP_0.6B_Thinking / trainer_state.json
VivianKeith's picture
Upload folder using huggingface_hub
781e6d1 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 222,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0045045045045045045,
"grad_norm": 0.8704756498336792,
"learning_rate": 7.142857142857143e-07,
"loss": 0.31640625,
"step": 1,
"token_acc": 0.901016281855159
},
{
"epoch": 0.009009009009009009,
"grad_norm": 0.867336094379425,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.34814453125,
"step": 2,
"token_acc": 0.8902718604179347
},
{
"epoch": 0.013513513513513514,
"grad_norm": 0.8713582158088684,
"learning_rate": 2.1428571428571427e-06,
"loss": 0.404052734375,
"step": 3,
"token_acc": 0.8748633439589399
},
{
"epoch": 0.018018018018018018,
"grad_norm": 1.0100644826889038,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.3514404296875,
"step": 4,
"token_acc": 0.8913569321533923
},
{
"epoch": 0.02252252252252252,
"grad_norm": 0.7727848887443542,
"learning_rate": 3.5714285714285718e-06,
"loss": 0.337158203125,
"step": 5,
"token_acc": 0.8953797468354431
},
{
"epoch": 0.02702702702702703,
"grad_norm": 0.8298861980438232,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.376220703125,
"step": 6,
"token_acc": 0.8836344783462395
},
{
"epoch": 0.03153153153153153,
"grad_norm": 0.8222936391830444,
"learning_rate": 5e-06,
"loss": 0.40087890625,
"step": 7,
"token_acc": 0.8779121330766402
},
{
"epoch": 0.036036036036036036,
"grad_norm": 0.762649655342102,
"learning_rate": 4.999733114418726e-06,
"loss": 0.385498046875,
"step": 8,
"token_acc": 0.8799979877251233
},
{
"epoch": 0.04054054054054054,
"grad_norm": 0.8142457604408264,
"learning_rate": 4.998932514657232e-06,
"loss": 0.334716796875,
"step": 9,
"token_acc": 0.8947475570032574
},
{
"epoch": 0.04504504504504504,
"grad_norm": 1.020268201828003,
"learning_rate": 4.997598371650346e-06,
"loss": 0.395263671875,
"step": 10,
"token_acc": 0.8798358928868437
},
{
"epoch": 0.04954954954954955,
"grad_norm": 1.1546354293823242,
"learning_rate": 4.995730970248893e-06,
"loss": 0.3499755859375,
"step": 11,
"token_acc": 0.8920448295669816
},
{
"epoch": 0.05405405405405406,
"grad_norm": 0.9397904872894287,
"learning_rate": 4.993330709158879e-06,
"loss": 0.3543701171875,
"step": 12,
"token_acc": 0.8902679534039811
},
{
"epoch": 0.05855855855855856,
"grad_norm": 0.957818329334259,
"learning_rate": 4.990398100856367e-06,
"loss": 0.33349609375,
"step": 13,
"token_acc": 0.897339547629675
},
{
"epoch": 0.06306306306306306,
"grad_norm": 1.097841501235962,
"learning_rate": 4.986933771478052e-06,
"loss": 0.3480224609375,
"step": 14,
"token_acc": 0.8923554684891254
},
{
"epoch": 0.06756756756756757,
"grad_norm": 1.1507660150527954,
"learning_rate": 4.982938460687583e-06,
"loss": 0.3604736328125,
"step": 15,
"token_acc": 0.8908350807943752
},
{
"epoch": 0.07207207207207207,
"grad_norm": 0.8487851023674011,
"learning_rate": 4.978413021517634e-06,
"loss": 0.34765625,
"step": 16,
"token_acc": 0.8925047582349576
},
{
"epoch": 0.07657657657657657,
"grad_norm": 0.7968904376029968,
"learning_rate": 4.973358420187776e-06,
"loss": 0.318359375,
"step": 17,
"token_acc": 0.8994016642783518
},
{
"epoch": 0.08108108108108109,
"grad_norm": 0.8453375101089478,
"learning_rate": 4.967775735898179e-06,
"loss": 0.3338623046875,
"step": 18,
"token_acc": 0.8963307821629692
},
{
"epoch": 0.08558558558558559,
"grad_norm": 0.8234410285949707,
"learning_rate": 4.961666160599198e-06,
"loss": 0.344482421875,
"step": 19,
"token_acc": 0.8938563663904482
},
{
"epoch": 0.09009009009009009,
"grad_norm": 0.9092895984649658,
"learning_rate": 4.955030998736876e-06,
"loss": 0.323974609375,
"step": 20,
"token_acc": 0.8994619407758588
},
{
"epoch": 0.0945945945945946,
"grad_norm": 0.9384039044380188,
"learning_rate": 4.947871666974438e-06,
"loss": 0.40478515625,
"step": 21,
"token_acc": 0.8766958028536647
},
{
"epoch": 0.0990990990990991,
"grad_norm": 0.9205092787742615,
"learning_rate": 4.940189693889819e-06,
"loss": 0.32330322265625,
"step": 22,
"token_acc": 0.8981442098900853
},
{
"epoch": 0.1036036036036036,
"grad_norm": 0.9223892092704773,
"learning_rate": 4.931986719649298e-06,
"loss": 0.3792724609375,
"step": 23,
"token_acc": 0.8823543361236927
},
{
"epoch": 0.10810810810810811,
"grad_norm": 0.6995769739151001,
"learning_rate": 4.923264495657319e-06,
"loss": 0.3433837890625,
"step": 24,
"token_acc": 0.8946257568082592
},
{
"epoch": 0.11261261261261261,
"grad_norm": 1.3750187158584595,
"learning_rate": 4.914024884182535e-06,
"loss": 0.40557861328125,
"step": 25,
"token_acc": 0.878056546947765
},
{
"epoch": 0.11711711711711711,
"grad_norm": 1.1581752300262451,
"learning_rate": 4.904269857960208e-06,
"loss": 0.3367919921875,
"step": 26,
"token_acc": 0.8954026071562684
},
{
"epoch": 0.12162162162162163,
"grad_norm": 1.0632562637329102,
"learning_rate": 4.894001499771015e-06,
"loss": 0.358154296875,
"step": 27,
"token_acc": 0.8903309816311065
},
{
"epoch": 0.12612612612612611,
"grad_norm": 0.9011189341545105,
"learning_rate": 4.883222001996352e-06,
"loss": 0.3438720703125,
"step": 28,
"token_acc": 0.8913533999298984
},
{
"epoch": 0.13063063063063063,
"grad_norm": 1.0672448873519897,
"learning_rate": 4.871933666150239e-06,
"loss": 0.3509521484375,
"step": 29,
"token_acc": 0.8918955084077967
},
{
"epoch": 0.13513513513513514,
"grad_norm": 0.8111361861228943,
"learning_rate": 4.8601389023879395e-06,
"loss": 0.3671875,
"step": 30,
"token_acc": 0.8869003430910295
},
{
"epoch": 0.13963963963963963,
"grad_norm": 0.9260621666908264,
"learning_rate": 4.8478402289913566e-06,
"loss": 0.3306884765625,
"step": 31,
"token_acc": 0.8972916061574209
},
{
"epoch": 0.14414414414414414,
"grad_norm": 0.9479268193244934,
"learning_rate": 4.835040271831371e-06,
"loss": 0.35076904296875,
"step": 32,
"token_acc": 0.89304353165101
},
{
"epoch": 0.14864864864864866,
"grad_norm": 0.9695205688476562,
"learning_rate": 4.821741763807186e-06,
"loss": 0.360107421875,
"step": 33,
"token_acc": 0.8890506207294774
},
{
"epoch": 0.15315315315315314,
"grad_norm": 0.8806270956993103,
"learning_rate": 4.807947544262838e-06,
"loss": 0.37542724609375,
"step": 34,
"token_acc": 0.883788106662847
},
{
"epoch": 0.15765765765765766,
"grad_norm": 0.8483520150184631,
"learning_rate": 4.793660558380969e-06,
"loss": 0.359619140625,
"step": 35,
"token_acc": 0.8883863056108398
},
{
"epoch": 0.16216216216216217,
"grad_norm": 0.9061219692230225,
"learning_rate": 4.7788838565540044e-06,
"loss": 0.33489990234375,
"step": 36,
"token_acc": 0.8967343771172348
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.7820679545402527,
"learning_rate": 4.763620593732867e-06,
"loss": 0.350830078125,
"step": 37,
"token_acc": 0.890262264871789
},
{
"epoch": 0.17117117117117117,
"grad_norm": 0.9904912114143372,
"learning_rate": 4.747874028753375e-06,
"loss": 0.4389495849609375,
"step": 38,
"token_acc": 0.8688650397289224
},
{
"epoch": 0.17567567567567569,
"grad_norm": 0.871274471282959,
"learning_rate": 4.731647523640446e-06,
"loss": 0.3544921875,
"step": 39,
"token_acc": 0.8903172772836772
},
{
"epoch": 0.18018018018018017,
"grad_norm": 0.7513757348060608,
"learning_rate": 4.7149445428902794e-06,
"loss": 0.3541259765625,
"step": 40,
"token_acc": 0.8904149382414799
},
{
"epoch": 0.18468468468468469,
"grad_norm": 1.159681797027588,
"learning_rate": 4.697768652730656e-06,
"loss": 0.3160400390625,
"step": 41,
"token_acc": 0.9033266799733866
},
{
"epoch": 0.1891891891891892,
"grad_norm": 0.8280705213546753,
"learning_rate": 4.68012352035952e-06,
"loss": 0.3328857421875,
"step": 42,
"token_acc": 0.8980802792321116
},
{
"epoch": 0.19369369369369369,
"grad_norm": 0.8860488533973694,
"learning_rate": 4.662012913161998e-06,
"loss": 0.3543701171875,
"step": 43,
"token_acc": 0.890051324589144
},
{
"epoch": 0.1981981981981982,
"grad_norm": 0.8635104298591614,
"learning_rate": 4.643440697906033e-06,
"loss": 0.34234619140625,
"step": 44,
"token_acc": 0.8932650377240949
},
{
"epoch": 0.20270270270270271,
"grad_norm": 0.846026599407196,
"learning_rate": 4.624410839916798e-06,
"loss": 0.3629150390625,
"step": 45,
"token_acc": 0.8890571013466493
},
{
"epoch": 0.2072072072072072,
"grad_norm": 0.8589869737625122,
"learning_rate": 4.604927402230061e-06,
"loss": 0.359130859375,
"step": 46,
"token_acc": 0.8904908189625078
},
{
"epoch": 0.21171171171171171,
"grad_norm": 0.8538748025894165,
"learning_rate": 4.584994544724695e-06,
"loss": 0.373016357421875,
"step": 47,
"token_acc": 0.8841772321548235
},
{
"epoch": 0.21621621621621623,
"grad_norm": 0.8112604022026062,
"learning_rate": 4.564616523234511e-06,
"loss": 0.3341064453125,
"step": 48,
"token_acc": 0.8964978147002773
},
{
"epoch": 0.22072072072072071,
"grad_norm": 0.9030938148498535,
"learning_rate": 4.543797688639596e-06,
"loss": 0.407958984375,
"step": 49,
"token_acc": 0.8771987646635173
},
{
"epoch": 0.22522522522522523,
"grad_norm": 0.8748745918273926,
"learning_rate": 4.522542485937369e-06,
"loss": 0.37548828125,
"step": 50,
"token_acc": 0.8819748719843742
},
{
"epoch": 0.22972972972972974,
"grad_norm": 0.7620592713356018,
"learning_rate": 4.500855453293532e-06,
"loss": 0.31634521484375,
"step": 51,
"token_acc": 0.8998812351543943
},
{
"epoch": 0.23423423423423423,
"grad_norm": 0.9173205494880676,
"learning_rate": 4.478741221073136e-06,
"loss": 0.337432861328125,
"step": 52,
"token_acc": 0.8988951978520181
},
{
"epoch": 0.23873873873873874,
"grad_norm": 1.211668610572815,
"learning_rate": 4.456204510851957e-06,
"loss": 0.361572265625,
"step": 53,
"token_acc": 0.8882260347359092
},
{
"epoch": 0.24324324324324326,
"grad_norm": 1.1069424152374268,
"learning_rate": 4.433250134408401e-06,
"loss": 0.3758544921875,
"step": 54,
"token_acc": 0.8838936369355408
},
{
"epoch": 0.24774774774774774,
"grad_norm": 1.0697734355926514,
"learning_rate": 4.4098829926961485e-06,
"loss": 0.3668212890625,
"step": 55,
"token_acc": 0.8864855281237534
},
{
"epoch": 0.25225225225225223,
"grad_norm": 0.9282281994819641,
"learning_rate": 4.386108074797757e-06,
"loss": 0.3770751953125,
"step": 56,
"token_acc": 0.8851603202168664
},
{
"epoch": 0.25675675675675674,
"grad_norm": 0.7715709209442139,
"learning_rate": 4.361930456859455e-06,
"loss": 0.391845703125,
"step": 57,
"token_acc": 0.8783604400170836
},
{
"epoch": 0.26126126126126126,
"grad_norm": 0.9285658001899719,
"learning_rate": 4.337355301007336e-06,
"loss": 0.3463134765625,
"step": 58,
"token_acc": 0.8939376088180235
},
{
"epoch": 0.26576576576576577,
"grad_norm": 1.034256935119629,
"learning_rate": 4.312387854245201e-06,
"loss": 0.3419189453125,
"step": 59,
"token_acc": 0.8928964642593341
},
{
"epoch": 0.2702702702702703,
"grad_norm": 1.0528221130371094,
"learning_rate": 4.287033447334286e-06,
"loss": 0.317138671875,
"step": 60,
"token_acc": 0.901003937801551
},
{
"epoch": 0.2747747747747748,
"grad_norm": 0.8839389681816101,
"learning_rate": 4.261297493655092e-06,
"loss": 0.3155517578125,
"step": 61,
"token_acc": 0.9004427280397325
},
{
"epoch": 0.27927927927927926,
"grad_norm": 0.8743260502815247,
"learning_rate": 4.2351854880515856e-06,
"loss": 0.34619140625,
"step": 62,
"token_acc": 0.8932852856301954
},
{
"epoch": 0.28378378378378377,
"grad_norm": 0.7228661179542542,
"learning_rate": 4.208703005658e-06,
"loss": 0.348663330078125,
"step": 63,
"token_acc": 0.8947115994978206
},
{
"epoch": 0.2882882882882883,
"grad_norm": 0.8091106414794922,
"learning_rate": 4.1818557007085e-06,
"loss": 0.3548583984375,
"step": 64,
"token_acc": 0.8893367675466839
},
{
"epoch": 0.2927927927927928,
"grad_norm": 0.9842603802680969,
"learning_rate": 4.154649305329959e-06,
"loss": 0.34814453125,
"step": 65,
"token_acc": 0.8905122962458113
},
{
"epoch": 0.2972972972972973,
"grad_norm": 0.6639820337295532,
"learning_rate": 4.12708962831809e-06,
"loss": 0.33953857421875,
"step": 66,
"token_acc": 0.8963516432198938
},
{
"epoch": 0.30180180180180183,
"grad_norm": 0.9688128232955933,
"learning_rate": 4.099182553897228e-06,
"loss": 0.34039306640625,
"step": 67,
"token_acc": 0.893565994838982
},
{
"epoch": 0.3063063063063063,
"grad_norm": 0.7545742392539978,
"learning_rate": 4.070934040463999e-06,
"loss": 0.341400146484375,
"step": 68,
"token_acc": 0.894034692141854
},
{
"epoch": 0.3108108108108108,
"grad_norm": 0.834723711013794,
"learning_rate": 4.042350119315142e-06,
"loss": 0.39501953125,
"step": 69,
"token_acc": 0.8793881644934805
},
{
"epoch": 0.3153153153153153,
"grad_norm": 0.7959160804748535,
"learning_rate": 4.013436893359787e-06,
"loss": 0.37982177734375,
"step": 70,
"token_acc": 0.8827430920972124
},
{
"epoch": 0.31981981981981983,
"grad_norm": 0.7727994322776794,
"learning_rate": 3.984200535816427e-06,
"loss": 0.3292236328125,
"step": 71,
"token_acc": 0.8962622549019608
},
{
"epoch": 0.32432432432432434,
"grad_norm": 1.1143063306808472,
"learning_rate": 3.9546472888948825e-06,
"loss": 0.3765869140625,
"step": 72,
"token_acc": 0.8862365334289776
},
{
"epoch": 0.32882882882882886,
"grad_norm": 0.802827775478363,
"learning_rate": 3.924783462463541e-06,
"loss": 0.3514404296875,
"step": 73,
"token_acc": 0.8915428441438926
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.7230685353279114,
"learning_rate": 3.894615432702144e-06,
"loss": 0.326416015625,
"step": 74,
"token_acc": 0.8988927052463204
},
{
"epoch": 0.33783783783783783,
"grad_norm": 0.6452018618583679,
"learning_rate": 3.8641496407404165e-06,
"loss": 0.344970703125,
"step": 75,
"token_acc": 0.8912708883025505
},
{
"epoch": 0.34234234234234234,
"grad_norm": 0.8161414265632629,
"learning_rate": 3.833392591282838e-06,
"loss": 0.352294921875,
"step": 76,
"token_acc": 0.8910519555745838
},
{
"epoch": 0.34684684684684686,
"grad_norm": 0.9082568287849426,
"learning_rate": 3.802350851219826e-06,
"loss": 0.31591796875,
"step": 77,
"token_acc": 0.903437424998
},
{
"epoch": 0.35135135135135137,
"grad_norm": 0.8428945541381836,
"learning_rate": 3.771031048225653e-06,
"loss": 0.3326416015625,
"step": 78,
"token_acc": 0.8960439385497094
},
{
"epoch": 0.35585585585585583,
"grad_norm": 0.7270176410675049,
"learning_rate": 3.7394398693433798e-06,
"loss": 0.34326171875,
"step": 79,
"token_acc": 0.8919539041909978
},
{
"epoch": 0.36036036036036034,
"grad_norm": 0.9635636210441589,
"learning_rate": 3.70758405955712e-06,
"loss": 0.324951171875,
"step": 80,
"token_acc": 0.8983449519816702
},
{
"epoch": 0.36486486486486486,
"grad_norm": 0.7542657256126404,
"learning_rate": 3.675470420351921e-06,
"loss": 0.363525390625,
"step": 81,
"token_acc": 0.8872009164518733
},
{
"epoch": 0.36936936936936937,
"grad_norm": 0.8671934008598328,
"learning_rate": 3.6431058082615966e-06,
"loss": 0.31597900390625,
"step": 82,
"token_acc": 0.9009948542024013
},
{
"epoch": 0.3738738738738739,
"grad_norm": 1.0081602334976196,
"learning_rate": 3.6104971334047954e-06,
"loss": 0.3199462890625,
"step": 83,
"token_acc": 0.8997704785831705
},
{
"epoch": 0.3783783783783784,
"grad_norm": 1.0129483938217163,
"learning_rate": 3.5776513580096316e-06,
"loss": 0.3701171875,
"step": 84,
"token_acc": 0.88471046201057
},
{
"epoch": 0.38288288288288286,
"grad_norm": 0.8052798509597778,
"learning_rate": 3.5445754949271925e-06,
"loss": 0.3421630859375,
"step": 85,
"token_acc": 0.8927845528455285
},
{
"epoch": 0.38738738738738737,
"grad_norm": 0.8934619426727295,
"learning_rate": 3.5112766061342346e-06,
"loss": 0.3060302734375,
"step": 86,
"token_acc": 0.9026350477994344
},
{
"epoch": 0.3918918918918919,
"grad_norm": 0.6580748558044434,
"learning_rate": 3.47776180122539e-06,
"loss": 0.359619140625,
"step": 87,
"token_acc": 0.8894399733815792
},
{
"epoch": 0.3963963963963964,
"grad_norm": 0.8089192509651184,
"learning_rate": 3.4440382358952116e-06,
"loss": 0.3321533203125,
"step": 88,
"token_acc": 0.8961488812392426
},
{
"epoch": 0.4009009009009009,
"grad_norm": 0.8350925445556641,
"learning_rate": 3.4101131104103664e-06,
"loss": 0.3330078125,
"step": 89,
"token_acc": 0.8948240976437569
},
{
"epoch": 0.40540540540540543,
"grad_norm": 0.8865199089050293,
"learning_rate": 3.3759936680723238e-06,
"loss": 0.3404541015625,
"step": 90,
"token_acc": 0.8960275592341626
},
{
"epoch": 0.4099099099099099,
"grad_norm": 0.9881960153579712,
"learning_rate": 3.341687193670844e-06,
"loss": 0.4093017578125,
"step": 91,
"token_acc": 0.8754743340444411
},
{
"epoch": 0.4144144144144144,
"grad_norm": 0.6525192856788635,
"learning_rate": 3.3072010119286156e-06,
"loss": 0.343994140625,
"step": 92,
"token_acc": 0.8952931153825675
},
{
"epoch": 0.4189189189189189,
"grad_norm": 0.8146042823791504,
"learning_rate": 3.272542485937369e-06,
"loss": 0.32464599609375,
"step": 93,
"token_acc": 0.8996927651139742
},
{
"epoch": 0.42342342342342343,
"grad_norm": 0.7997490763664246,
"learning_rate": 3.237719015585787e-06,
"loss": 0.3258056640625,
"step": 94,
"token_acc": 0.8986706407255705
},
{
"epoch": 0.42792792792792794,
"grad_norm": 0.7047693133354187,
"learning_rate": 3.202738035979571e-06,
"loss": 0.3316650390625,
"step": 95,
"token_acc": 0.8959871462508255
},
{
"epoch": 0.43243243243243246,
"grad_norm": 0.6673498749732971,
"learning_rate": 3.167607015853983e-06,
"loss": 0.35711669921875,
"step": 96,
"token_acc": 0.8914680520593433
},
{
"epoch": 0.4369369369369369,
"grad_norm": 0.9044120907783508,
"learning_rate": 3.132333455979202e-06,
"loss": 0.29888916015625,
"step": 97,
"token_acc": 0.9081531160305777
},
{
"epoch": 0.44144144144144143,
"grad_norm": 0.6456412672996521,
"learning_rate": 3.0969248875588547e-06,
"loss": 0.3343048095703125,
"step": 98,
"token_acc": 0.8963899986716667
},
{
"epoch": 0.44594594594594594,
"grad_norm": 0.8044923543930054,
"learning_rate": 3.0613888706220336e-06,
"loss": 0.3668212890625,
"step": 99,
"token_acc": 0.8872929258325789
},
{
"epoch": 0.45045045045045046,
"grad_norm": 0.8912515044212341,
"learning_rate": 3.025732992409166e-06,
"loss": 0.3749237060546875,
"step": 100,
"token_acc": 0.8863157390554651
},
{
"epoch": 0.45495495495495497,
"grad_norm": 0.9178464412689209,
"learning_rate": 2.989964865752079e-06,
"loss": 0.31494140625,
"step": 101,
"token_acc": 0.9031758326878389
},
{
"epoch": 0.4594594594594595,
"grad_norm": 0.7799769639968872,
"learning_rate": 2.9540921274485913e-06,
"loss": 0.316070556640625,
"step": 102,
"token_acc": 0.9010862029119482
},
{
"epoch": 0.46396396396396394,
"grad_norm": 0.9977706670761108,
"learning_rate": 2.9181224366319947e-06,
"loss": 0.335693359375,
"step": 103,
"token_acc": 0.8947431320117716
},
{
"epoch": 0.46846846846846846,
"grad_norm": 1.2078821659088135,
"learning_rate": 2.882063473135763e-06,
"loss": 0.4012451171875,
"step": 104,
"token_acc": 0.8755390944456183
},
{
"epoch": 0.47297297297297297,
"grad_norm": 0.9138604998588562,
"learning_rate": 2.845922935853841e-06,
"loss": 0.31298828125,
"step": 105,
"token_acc": 0.9045985448046975
},
{
"epoch": 0.4774774774774775,
"grad_norm": 0.9735301733016968,
"learning_rate": 2.80970854109687e-06,
"loss": 0.312744140625,
"step": 106,
"token_acc": 0.9002951864614082
},
{
"epoch": 0.481981981981982,
"grad_norm": 0.8821303844451904,
"learning_rate": 2.773428020944687e-06,
"loss": 0.316650390625,
"step": 107,
"token_acc": 0.9018892572372231
},
{
"epoch": 0.4864864864864865,
"grad_norm": 0.7590733766555786,
"learning_rate": 2.7370891215954572e-06,
"loss": 0.362060546875,
"step": 108,
"token_acc": 0.8878953544431608
},
{
"epoch": 0.49099099099099097,
"grad_norm": 1.1650851964950562,
"learning_rate": 2.7006996017118033e-06,
"loss": 0.349365234375,
"step": 109,
"token_acc": 0.8941975079040357
},
{
"epoch": 0.4954954954954955,
"grad_norm": 0.8107997179031372,
"learning_rate": 2.6642672307642575e-06,
"loss": 0.3140869140625,
"step": 110,
"token_acc": 0.9023156522104909
},
{
"epoch": 0.5,
"grad_norm": 0.9272815585136414,
"learning_rate": 2.627799787372418e-06,
"loss": 0.33642578125,
"step": 111,
"token_acc": 0.8949056603773585
},
{
"epoch": 0.5045045045045045,
"grad_norm": 0.8578754663467407,
"learning_rate": 2.591305057644148e-06,
"loss": 0.34564208984375,
"step": 112,
"token_acc": 0.8929581334646075
},
{
"epoch": 0.509009009009009,
"grad_norm": 0.8179190754890442,
"learning_rate": 2.5547908335131704e-06,
"loss": 0.3253173828125,
"step": 113,
"token_acc": 0.8980514754137483
},
{
"epoch": 0.5135135135135135,
"grad_norm": 0.937614917755127,
"learning_rate": 2.5182649110754325e-06,
"loss": 0.30389404296875,
"step": 114,
"token_acc": 0.9068263329281652
},
{
"epoch": 0.5180180180180181,
"grad_norm": 0.9342759251594543,
"learning_rate": 2.4817350889245675e-06,
"loss": 0.362548828125,
"step": 115,
"token_acc": 0.8870322400184855
},
{
"epoch": 0.5225225225225225,
"grad_norm": 0.7444611191749573,
"learning_rate": 2.44520916648683e-06,
"loss": 0.36907958984375,
"step": 116,
"token_acc": 0.8857218663533336
},
{
"epoch": 0.527027027027027,
"grad_norm": 0.9588095545768738,
"learning_rate": 2.408694942355853e-06,
"loss": 0.38671875,
"step": 117,
"token_acc": 0.8824941943771926
},
{
"epoch": 0.5315315315315315,
"grad_norm": 0.9178715348243713,
"learning_rate": 2.3722002126275826e-06,
"loss": 0.34375,
"step": 118,
"token_acc": 0.8941754417885328
},
{
"epoch": 0.536036036036036,
"grad_norm": 0.8871239423751831,
"learning_rate": 2.3357327692357434e-06,
"loss": 0.3515625,
"step": 119,
"token_acc": 0.8914658528885615
},
{
"epoch": 0.5405405405405406,
"grad_norm": 0.7818154096603394,
"learning_rate": 2.2993003982881976e-06,
"loss": 0.30548095703125,
"step": 120,
"token_acc": 0.9041648931367617
},
{
"epoch": 0.545045045045045,
"grad_norm": 0.7676111459732056,
"learning_rate": 2.262910878404544e-06,
"loss": 0.2769775390625,
"step": 121,
"token_acc": 0.9139403881271473
},
{
"epoch": 0.5495495495495496,
"grad_norm": 0.9711487889289856,
"learning_rate": 2.2265719790553147e-06,
"loss": 0.3883056640625,
"step": 122,
"token_acc": 0.8804856085838586
},
{
"epoch": 0.5540540540540541,
"grad_norm": 0.8610119223594666,
"learning_rate": 2.19029145890313e-06,
"loss": 0.3262939453125,
"step": 123,
"token_acc": 0.8978232618583496
},
{
"epoch": 0.5585585585585585,
"grad_norm": 0.8020825386047363,
"learning_rate": 2.154077064146159e-06,
"loss": 0.3035888671875,
"step": 124,
"token_acc": 0.9039610929800985
},
{
"epoch": 0.5630630630630631,
"grad_norm": 0.9404456615447998,
"learning_rate": 2.1179365268642377e-06,
"loss": 0.32861328125,
"step": 125,
"token_acc": 0.8969406998672906
},
{
"epoch": 0.5675675675675675,
"grad_norm": 0.9896916747093201,
"learning_rate": 2.0818775633680057e-06,
"loss": 0.36248779296875,
"step": 126,
"token_acc": 0.8876241534257928
},
{
"epoch": 0.5720720720720721,
"grad_norm": 0.738699734210968,
"learning_rate": 2.045907872551409e-06,
"loss": 0.3084716796875,
"step": 127,
"token_acc": 0.9047011482971833
},
{
"epoch": 0.5765765765765766,
"grad_norm": 0.987114429473877,
"learning_rate": 2.010035134247922e-06,
"loss": 0.3638916015625,
"step": 128,
"token_acc": 0.8856382245027449
},
{
"epoch": 0.581081081081081,
"grad_norm": 0.8161911368370056,
"learning_rate": 1.9742670075908353e-06,
"loss": 0.3726806640625,
"step": 129,
"token_acc": 0.8862522268667838
},
{
"epoch": 0.5855855855855856,
"grad_norm": 1.0528193712234497,
"learning_rate": 1.9386111293779673e-06,
"loss": 0.3369140625,
"step": 130,
"token_acc": 0.8956112595402446
},
{
"epoch": 0.5900900900900901,
"grad_norm": 0.9027992486953735,
"learning_rate": 1.903075112441145e-06,
"loss": 0.3206787109375,
"step": 131,
"token_acc": 0.9009896596103493
},
{
"epoch": 0.5945945945945946,
"grad_norm": 0.7341856360435486,
"learning_rate": 1.8676665440207982e-06,
"loss": 0.2967529296875,
"step": 132,
"token_acc": 0.9068318389675675
},
{
"epoch": 0.5990990990990991,
"grad_norm": 0.735024094581604,
"learning_rate": 1.832392984146018e-06,
"loss": 0.31787109375,
"step": 133,
"token_acc": 0.8996451126455548
},
{
"epoch": 0.6036036036036037,
"grad_norm": 0.9426040053367615,
"learning_rate": 1.7972619640204298e-06,
"loss": 0.387451171875,
"step": 134,
"token_acc": 0.8815383862602143
},
{
"epoch": 0.6081081081081081,
"grad_norm": 0.7768560647964478,
"learning_rate": 1.7622809844142138e-06,
"loss": 0.35296630859375,
"step": 135,
"token_acc": 0.8923252350367681
},
{
"epoch": 0.6126126126126126,
"grad_norm": 0.8278442025184631,
"learning_rate": 1.7274575140626318e-06,
"loss": 0.3173828125,
"step": 136,
"token_acc": 0.8998283663598066
},
{
"epoch": 0.6171171171171171,
"grad_norm": 0.8853439092636108,
"learning_rate": 1.6927989880713852e-06,
"loss": 0.3544158935546875,
"step": 137,
"token_acc": 0.8905075533139094
},
{
"epoch": 0.6216216216216216,
"grad_norm": 0.6611644625663757,
"learning_rate": 1.6583128063291576e-06,
"loss": 0.33984375,
"step": 138,
"token_acc": 0.8959825350762759
},
{
"epoch": 0.6261261261261262,
"grad_norm": 0.7384758591651917,
"learning_rate": 1.6240063319276767e-06,
"loss": 0.32476806640625,
"step": 139,
"token_acc": 0.8985073953315207
},
{
"epoch": 0.6306306306306306,
"grad_norm": 0.773961067199707,
"learning_rate": 1.5898868895896336e-06,
"loss": 0.4095458984375,
"step": 140,
"token_acc": 0.8739141708803319
},
{
"epoch": 0.6351351351351351,
"grad_norm": 0.8257731199264526,
"learning_rate": 1.5559617641047886e-06,
"loss": 0.3592529296875,
"step": 141,
"token_acc": 0.8883896425588295
},
{
"epoch": 0.6396396396396397,
"grad_norm": 0.787449061870575,
"learning_rate": 1.5222381987746104e-06,
"loss": 0.357177734375,
"step": 142,
"token_acc": 0.888363484849953
},
{
"epoch": 0.6441441441441441,
"grad_norm": 0.6886521577835083,
"learning_rate": 1.488723393865766e-06,
"loss": 0.360107421875,
"step": 143,
"token_acc": 0.8890432232736252
},
{
"epoch": 0.6486486486486487,
"grad_norm": 0.7612305283546448,
"learning_rate": 1.4554245050728085e-06,
"loss": 0.3238525390625,
"step": 144,
"token_acc": 0.8999899789558072
},
{
"epoch": 0.6531531531531531,
"grad_norm": 0.8425935506820679,
"learning_rate": 1.4223486419903692e-06,
"loss": 0.327606201171875,
"step": 145,
"token_acc": 0.8979553119730186
},
{
"epoch": 0.6576576576576577,
"grad_norm": 1.0973107814788818,
"learning_rate": 1.389502866595206e-06,
"loss": 0.319122314453125,
"step": 146,
"token_acc": 0.9017525376916615
},
{
"epoch": 0.6621621621621622,
"grad_norm": 0.7503589391708374,
"learning_rate": 1.3568941917384038e-06,
"loss": 0.31683349609375,
"step": 147,
"token_acc": 0.9008651974055598
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.7911334037780762,
"learning_rate": 1.324529579648079e-06,
"loss": 0.28924560546875,
"step": 148,
"token_acc": 0.9068919780898478
},
{
"epoch": 0.6711711711711712,
"grad_norm": 0.771323025226593,
"learning_rate": 1.2924159404428804e-06,
"loss": 0.373291015625,
"step": 149,
"token_acc": 0.884785335262904
},
{
"epoch": 0.6756756756756757,
"grad_norm": 0.9364131689071655,
"learning_rate": 1.2605601306566206e-06,
"loss": 0.3597412109375,
"step": 150,
"token_acc": 0.888817750535287
},
{
"epoch": 0.6801801801801802,
"grad_norm": 0.7087746262550354,
"learning_rate": 1.2289689517743475e-06,
"loss": 0.3809814453125,
"step": 151,
"token_acc": 0.8821097824929922
},
{
"epoch": 0.6846846846846847,
"grad_norm": 0.6490341424942017,
"learning_rate": 1.1976491487801747e-06,
"loss": 0.32562255859375,
"step": 152,
"token_acc": 0.8976476697136251
},
{
"epoch": 0.6891891891891891,
"grad_norm": 0.7317127585411072,
"learning_rate": 1.1666074087171628e-06,
"loss": 0.330810546875,
"step": 153,
"token_acc": 0.9007194922339586
},
{
"epoch": 0.6936936936936937,
"grad_norm": 0.7992354035377502,
"learning_rate": 1.1358503592595837e-06,
"loss": 0.33154296875,
"step": 154,
"token_acc": 0.8951166965888689
},
{
"epoch": 0.6981981981981982,
"grad_norm": 0.8374021053314209,
"learning_rate": 1.1053845672978567e-06,
"loss": 0.3441162109375,
"step": 155,
"token_acc": 0.8938746596418214
},
{
"epoch": 0.7027027027027027,
"grad_norm": 0.747456431388855,
"learning_rate": 1.0752165375364593e-06,
"loss": 0.3369140625,
"step": 156,
"token_acc": 0.8960199252801992
},
{
"epoch": 0.7072072072072072,
"grad_norm": 0.6646521091461182,
"learning_rate": 1.0453527111051183e-06,
"loss": 0.302581787109375,
"step": 157,
"token_acc": 0.9050437347721778
},
{
"epoch": 0.7117117117117117,
"grad_norm": 0.7055298686027527,
"learning_rate": 1.0157994641835737e-06,
"loss": 0.301239013671875,
"step": 158,
"token_acc": 0.9069595645412131
},
{
"epoch": 0.7162162162162162,
"grad_norm": 0.9243741631507874,
"learning_rate": 9.865631066402138e-07,
"loss": 0.35394287109375,
"step": 159,
"token_acc": 0.8925179495143776
},
{
"epoch": 0.7207207207207207,
"grad_norm": 0.8311925530433655,
"learning_rate": 9.576498806848592e-07,
"loss": 0.340087890625,
"step": 160,
"token_acc": 0.894881932249615
},
{
"epoch": 0.7252252252252253,
"grad_norm": 0.8588569164276123,
"learning_rate": 9.290659595360019e-07,
"loss": 0.3267822265625,
"step": 161,
"token_acc": 0.8974585635359116
},
{
"epoch": 0.7297297297297297,
"grad_norm": 0.7929273247718811,
"learning_rate": 9.008174461027724e-07,
"loss": 0.340576171875,
"step": 162,
"token_acc": 0.8939879994458003
},
{
"epoch": 0.7342342342342343,
"grad_norm": 1.2013729810714722,
"learning_rate": 8.729103716819113e-07,
"loss": 0.370819091796875,
"step": 163,
"token_acc": 0.8868560581265955
},
{
"epoch": 0.7387387387387387,
"grad_norm": 0.7468942403793335,
"learning_rate": 8.453506946700419e-07,
"loss": 0.2955322265625,
"step": 164,
"token_acc": 0.9082374491957181
},
{
"epoch": 0.7432432432432432,
"grad_norm": 1.1810017824172974,
"learning_rate": 8.181442992915001e-07,
"loss": 0.378662109375,
"step": 165,
"token_acc": 0.8825104623355918
},
{
"epoch": 0.7477477477477478,
"grad_norm": 0.7701326012611389,
"learning_rate": 7.912969943420018e-07,
"loss": 0.3447265625,
"step": 166,
"token_acc": 0.8957051550086613
},
{
"epoch": 0.7522522522522522,
"grad_norm": 0.8558163046836853,
"learning_rate": 7.648145119484152e-07,
"loss": 0.3280029296875,
"step": 167,
"token_acc": 0.89935299183094
},
{
"epoch": 0.7567567567567568,
"grad_norm": 0.6991153955459595,
"learning_rate": 7.387025063449082e-07,
"loss": 0.3521728515625,
"step": 168,
"token_acc": 0.8900763667218877
},
{
"epoch": 0.7612612612612613,
"grad_norm": 1.034488558769226,
"learning_rate": 7.129665526657145e-07,
"loss": 0.3106231689453125,
"step": 169,
"token_acc": 0.9047021491417857
},
{
"epoch": 0.7657657657657657,
"grad_norm": 0.733146071434021,
"learning_rate": 6.876121457547996e-07,
"loss": 0.310302734375,
"step": 170,
"token_acc": 0.9026569890363783
},
{
"epoch": 0.7702702702702703,
"grad_norm": 0.8405783176422119,
"learning_rate": 6.626446989926652e-07,
"loss": 0.3514404296875,
"step": 171,
"token_acc": 0.890449830708873
},
{
"epoch": 0.7747747747747747,
"grad_norm": 0.8425772190093994,
"learning_rate": 6.380695431405453e-07,
"loss": 0.361083984375,
"step": 172,
"token_acc": 0.8888163230910221
},
{
"epoch": 0.7792792792792793,
"grad_norm": 0.9122714996337891,
"learning_rate": 6.138919252022435e-07,
"loss": 0.3192138671875,
"step": 173,
"token_acc": 0.899907008200186
},
{
"epoch": 0.7837837837837838,
"grad_norm": 0.9160909652709961,
"learning_rate": 5.901170073038523e-07,
"loss": 0.3465576171875,
"step": 174,
"token_acc": 0.8923485456678811
},
{
"epoch": 0.7882882882882883,
"grad_norm": 0.7696585655212402,
"learning_rate": 5.667498655916002e-07,
"loss": 0.3604736328125,
"step": 175,
"token_acc": 0.8889116998746597
},
{
"epoch": 0.7927927927927928,
"grad_norm": 0.7901983261108398,
"learning_rate": 5.437954891480443e-07,
"loss": 0.3160400390625,
"step": 176,
"token_acc": 0.9012986436881038
},
{
"epoch": 0.7972972972972973,
"grad_norm": 0.7334163188934326,
"learning_rate": 5.21258778926865e-07,
"loss": 0.34423828125,
"step": 177,
"token_acc": 0.8941695594530863
},
{
"epoch": 0.8018018018018018,
"grad_norm": 0.7236452698707581,
"learning_rate": 4.99144546706469e-07,
"loss": 0.34686279296875,
"step": 178,
"token_acc": 0.8923213125253115
},
{
"epoch": 0.8063063063063063,
"grad_norm": 0.7739962935447693,
"learning_rate": 4.774575140626317e-07,
"loss": 0.349609375,
"step": 179,
"token_acc": 0.8923176451560731
},
{
"epoch": 0.8108108108108109,
"grad_norm": 0.8533901572227478,
"learning_rate": 4.5620231136040414e-07,
"loss": 0.332275390625,
"step": 180,
"token_acc": 0.8971639258853548
},
{
"epoch": 0.8153153153153153,
"grad_norm": 1.0738646984100342,
"learning_rate": 4.3538347676548965e-07,
"loss": 0.3839111328125,
"step": 181,
"token_acc": 0.8830640641762606
},
{
"epoch": 0.8198198198198198,
"grad_norm": 0.7544999122619629,
"learning_rate": 4.150054552753055e-07,
"loss": 0.349273681640625,
"step": 182,
"token_acc": 0.8917029877374043
},
{
"epoch": 0.8243243243243243,
"grad_norm": 0.6967670917510986,
"learning_rate": 3.950725977699396e-07,
"loss": 0.308349609375,
"step": 183,
"token_acc": 0.9040394422226454
},
{
"epoch": 0.8288288288288288,
"grad_norm": 1.0199168920516968,
"learning_rate": 3.7558916008320263e-07,
"loss": 0.3571624755859375,
"step": 184,
"token_acc": 0.8911331997805311
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.776905357837677,
"learning_rate": 3.5655930209396784e-07,
"loss": 0.34765625,
"step": 185,
"token_acc": 0.8915000262425865
},
{
"epoch": 0.8378378378378378,
"grad_norm": 0.9011301398277283,
"learning_rate": 3.379870868380031e-07,
"loss": 0.37677001953125,
"step": 186,
"token_acc": 0.8837367137532784
},
{
"epoch": 0.8423423423423423,
"grad_norm": 0.7519142627716064,
"learning_rate": 3.1987647964048075e-07,
"loss": 0.3240966796875,
"step": 187,
"token_acc": 0.8982510044906642
},
{
"epoch": 0.8468468468468469,
"grad_norm": 0.8182622194290161,
"learning_rate": 3.022313472693447e-07,
"loss": 0.31689453125,
"step": 188,
"token_acc": 0.8999964010221098
},
{
"epoch": 0.8513513513513513,
"grad_norm": 0.7286121845245361,
"learning_rate": 2.850554571097211e-07,
"loss": 0.3243408203125,
"step": 189,
"token_acc": 0.900392670157068
},
{
"epoch": 0.8558558558558559,
"grad_norm": 0.8014657497406006,
"learning_rate": 2.6835247635955466e-07,
"loss": 0.3505859375,
"step": 190,
"token_acc": 0.8923208057153752
},
{
"epoch": 0.8603603603603603,
"grad_norm": 0.8122464418411255,
"learning_rate": 2.521259712466256e-07,
"loss": 0.3648681640625,
"step": 191,
"token_acc": 0.8863166841775962
},
{
"epoch": 0.8648648648648649,
"grad_norm": 1.0286812782287598,
"learning_rate": 2.3637940626713346e-07,
"loss": 0.3612060546875,
"step": 192,
"token_acc": 0.8910263284894633
},
{
"epoch": 0.8693693693693694,
"grad_norm": 0.8132110238075256,
"learning_rate": 2.2111614344599686e-07,
"loss": 0.3125152587890625,
"step": 193,
"token_acc": 0.9036560888233949
},
{
"epoch": 0.8738738738738738,
"grad_norm": 0.7871835231781006,
"learning_rate": 2.0633944161903147e-07,
"loss": 0.3336181640625,
"step": 194,
"token_acc": 0.8960117156590649
},
{
"epoch": 0.8783783783783784,
"grad_norm": 0.8867942690849304,
"learning_rate": 1.9205245573716196e-07,
"loss": 0.29541015625,
"step": 195,
"token_acc": 0.9067175216003737
},
{
"epoch": 0.8828828828828829,
"grad_norm": 0.8739636540412903,
"learning_rate": 1.7825823619281452e-07,
"loss": 0.3477783203125,
"step": 196,
"token_acc": 0.8919099564915821
},
{
"epoch": 0.8873873873873874,
"grad_norm": 0.7999213933944702,
"learning_rate": 1.649597281686302e-07,
"loss": 0.32855224609375,
"step": 197,
"token_acc": 0.8981510210179152
},
{
"epoch": 0.8918918918918919,
"grad_norm": 0.8212850093841553,
"learning_rate": 1.5215977100864394e-07,
"loss": 0.392333984375,
"step": 198,
"token_acc": 0.8798142365281447
},
{
"epoch": 0.8963963963963963,
"grad_norm": 0.8761787414550781,
"learning_rate": 1.3986109761206097e-07,
"loss": 0.3126220703125,
"step": 199,
"token_acc": 0.9025795889471472
},
{
"epoch": 0.9009009009009009,
"grad_norm": 0.7608596086502075,
"learning_rate": 1.2806633384976092e-07,
"loss": 0.332550048828125,
"step": 200,
"token_acc": 0.8978598516872388
},
{
"epoch": 0.9054054054054054,
"grad_norm": 0.807299017906189,
"learning_rate": 1.1677799800364958e-07,
"loss": 0.3438720703125,
"step": 201,
"token_acc": 0.8938019253593564
},
{
"epoch": 0.9099099099099099,
"grad_norm": 0.906044065952301,
"learning_rate": 1.0599850022898539e-07,
"loss": 0.3448486328125,
"step": 202,
"token_acc": 0.8919713642686692
},
{
"epoch": 0.9144144144144144,
"grad_norm": 0.7889218330383301,
"learning_rate": 9.573014203979241e-08,
"loss": 0.28619384765625,
"step": 203,
"token_acc": 0.9101828456205424
},
{
"epoch": 0.918918918918919,
"grad_norm": 0.7564137578010559,
"learning_rate": 8.597511581746626e-08,
"loss": 0.3443603515625,
"step": 204,
"token_acc": 0.891580368127471
},
{
"epoch": 0.9234234234234234,
"grad_norm": 0.9052029252052307,
"learning_rate": 7.673550434268123e-08,
"loss": 0.30670166015625,
"step": 205,
"token_acc": 0.9040779490944689
},
{
"epoch": 0.9279279279279279,
"grad_norm": 0.8498448729515076,
"learning_rate": 6.801328035070138e-08,
"loss": 0.2917938232421875,
"step": 206,
"token_acc": 0.9076634951423842
},
{
"epoch": 0.9324324324324325,
"grad_norm": 0.940307080745697,
"learning_rate": 5.981030611018235e-08,
"loss": 0.3585205078125,
"step": 207,
"token_acc": 0.8883763592374977
},
{
"epoch": 0.9369369369369369,
"grad_norm": 0.9120367765426636,
"learning_rate": 5.212833302556258e-08,
"loss": 0.325927734375,
"step": 208,
"token_acc": 0.8974414543043994
},
{
"epoch": 0.9414414414414415,
"grad_norm": 1.141761064529419,
"learning_rate": 4.4969001263124314e-08,
"loss": 0.3285980224609375,
"step": 209,
"token_acc": 0.8957855034437521
},
{
"epoch": 0.9459459459459459,
"grad_norm": 0.8518276810646057,
"learning_rate": 3.833383940080232e-08,
"loss": 0.3089599609375,
"step": 210,
"token_acc": 0.9050746193405264
},
{
"epoch": 0.9504504504504504,
"grad_norm": 0.8082460761070251,
"learning_rate": 3.222426410182111e-08,
"loss": 0.363555908203125,
"step": 211,
"token_acc": 0.8873174557723192
},
{
"epoch": 0.954954954954955,
"grad_norm": 1.4015358686447144,
"learning_rate": 2.6641579812224373e-08,
"loss": 0.41107177734375,
"step": 212,
"token_acc": 0.8763219103737013
},
{
"epoch": 0.9594594594594594,
"grad_norm": 0.7203386425971985,
"learning_rate": 2.1586978482366072e-08,
"loss": 0.3438720703125,
"step": 213,
"token_acc": 0.8924183661803375
},
{
"epoch": 0.963963963963964,
"grad_norm": 0.8376468420028687,
"learning_rate": 1.7061539312417107e-08,
"loss": 0.3333740234375,
"step": 214,
"token_acc": 0.8958556698301764
},
{
"epoch": 0.9684684684684685,
"grad_norm": 0.8015531897544861,
"learning_rate": 1.3066228521948221e-08,
"loss": 0.35986328125,
"step": 215,
"token_acc": 0.8891843543006334
},
{
"epoch": 0.972972972972973,
"grad_norm": 0.7691966891288757,
"learning_rate": 9.60189914363363e-09,
"loss": 0.35791015625,
"step": 216,
"token_acc": 0.8902418854661981
},
{
"epoch": 0.9774774774774775,
"grad_norm": 0.9731364250183105,
"learning_rate": 6.66929084112089e-09,
"loss": 0.352081298828125,
"step": 217,
"token_acc": 0.8928305631356983
},
{
"epoch": 0.9819819819819819,
"grad_norm": 0.7616921067237854,
"learning_rate": 4.269029751107489e-09,
"loss": 0.3609619140625,
"step": 218,
"token_acc": 0.8907411504424779
},
{
"epoch": 0.9864864864864865,
"grad_norm": 1.2653754949569702,
"learning_rate": 2.4016283496544614e-09,
"loss": 0.370086669921875,
"step": 219,
"token_acc": 0.8855589798345022
},
{
"epoch": 0.990990990990991,
"grad_norm": 0.9321889281272888,
"learning_rate": 1.0674853427683484e-09,
"loss": 0.31292724609375,
"step": 220,
"token_acc": 0.9013229718149482
},
{
"epoch": 0.9954954954954955,
"grad_norm": 0.8537278771400452,
"learning_rate": 2.668855812748561e-10,
"loss": 0.325958251953125,
"step": 221,
"token_acc": 0.89793930894784
},
{
"epoch": 1.0,
"grad_norm": 0.9416584372520447,
"learning_rate": 0.0,
"loss": 0.33929443359375,
"step": 222,
"token_acc": 0.8940414967192765
}
],
"logging_steps": 1,
"max_steps": 222,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1423772948666778e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}