{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 222, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0045045045045045045, "grad_norm": 0.8704756498336792, "learning_rate": 7.142857142857143e-07, "loss": 0.31640625, "step": 1, "token_acc": 0.901016281855159 }, { "epoch": 0.009009009009009009, "grad_norm": 0.867336094379425, "learning_rate": 1.4285714285714286e-06, "loss": 0.34814453125, "step": 2, "token_acc": 0.8902718604179347 }, { "epoch": 0.013513513513513514, "grad_norm": 0.8713582158088684, "learning_rate": 2.1428571428571427e-06, "loss": 0.404052734375, "step": 3, "token_acc": 0.8748633439589399 }, { "epoch": 0.018018018018018018, "grad_norm": 1.0100644826889038, "learning_rate": 2.8571428571428573e-06, "loss": 0.3514404296875, "step": 4, "token_acc": 0.8913569321533923 }, { "epoch": 0.02252252252252252, "grad_norm": 0.7727848887443542, "learning_rate": 3.5714285714285718e-06, "loss": 0.337158203125, "step": 5, "token_acc": 0.8953797468354431 }, { "epoch": 0.02702702702702703, "grad_norm": 0.8298861980438232, "learning_rate": 4.2857142857142855e-06, "loss": 0.376220703125, "step": 6, "token_acc": 0.8836344783462395 }, { "epoch": 0.03153153153153153, "grad_norm": 0.8222936391830444, "learning_rate": 5e-06, "loss": 0.40087890625, "step": 7, "token_acc": 0.8779121330766402 }, { "epoch": 0.036036036036036036, "grad_norm": 0.762649655342102, "learning_rate": 4.999733114418726e-06, "loss": 0.385498046875, "step": 8, "token_acc": 0.8799979877251233 }, { "epoch": 0.04054054054054054, "grad_norm": 0.8142457604408264, "learning_rate": 4.998932514657232e-06, "loss": 0.334716796875, "step": 9, "token_acc": 0.8947475570032574 }, { "epoch": 0.04504504504504504, "grad_norm": 1.020268201828003, "learning_rate": 4.997598371650346e-06, "loss": 0.395263671875, "step": 10, "token_acc": 0.8798358928868437 }, { "epoch": 0.04954954954954955, "grad_norm": 1.1546354293823242, "learning_rate": 4.995730970248893e-06, "loss": 0.3499755859375, "step": 11, "token_acc": 0.8920448295669816 }, { "epoch": 0.05405405405405406, "grad_norm": 0.9397904872894287, "learning_rate": 4.993330709158879e-06, "loss": 0.3543701171875, "step": 12, "token_acc": 0.8902679534039811 }, { "epoch": 0.05855855855855856, "grad_norm": 0.957818329334259, "learning_rate": 4.990398100856367e-06, "loss": 0.33349609375, "step": 13, "token_acc": 0.897339547629675 }, { "epoch": 0.06306306306306306, "grad_norm": 1.097841501235962, "learning_rate": 4.986933771478052e-06, "loss": 0.3480224609375, "step": 14, "token_acc": 0.8923554684891254 }, { "epoch": 0.06756756756756757, "grad_norm": 1.1507660150527954, "learning_rate": 4.982938460687583e-06, "loss": 0.3604736328125, "step": 15, "token_acc": 0.8908350807943752 }, { "epoch": 0.07207207207207207, "grad_norm": 0.8487851023674011, "learning_rate": 4.978413021517634e-06, "loss": 0.34765625, "step": 16, "token_acc": 0.8925047582349576 }, { "epoch": 0.07657657657657657, "grad_norm": 0.7968904376029968, "learning_rate": 4.973358420187776e-06, "loss": 0.318359375, "step": 17, "token_acc": 0.8994016642783518 }, { "epoch": 0.08108108108108109, "grad_norm": 0.8453375101089478, "learning_rate": 4.967775735898179e-06, "loss": 0.3338623046875, "step": 18, "token_acc": 0.8963307821629692 }, { "epoch": 0.08558558558558559, "grad_norm": 0.8234410285949707, "learning_rate": 4.961666160599198e-06, "loss": 0.344482421875, "step": 19, "token_acc": 0.8938563663904482 }, { "epoch": 0.09009009009009009, "grad_norm": 0.9092895984649658, "learning_rate": 4.955030998736876e-06, "loss": 0.323974609375, "step": 20, "token_acc": 0.8994619407758588 }, { "epoch": 0.0945945945945946, "grad_norm": 0.9384039044380188, "learning_rate": 4.947871666974438e-06, "loss": 0.40478515625, "step": 21, "token_acc": 0.8766958028536647 }, { "epoch": 0.0990990990990991, "grad_norm": 0.9205092787742615, "learning_rate": 4.940189693889819e-06, "loss": 0.32330322265625, "step": 22, "token_acc": 0.8981442098900853 }, { "epoch": 0.1036036036036036, "grad_norm": 0.9223892092704773, "learning_rate": 4.931986719649298e-06, "loss": 0.3792724609375, "step": 23, "token_acc": 0.8823543361236927 }, { "epoch": 0.10810810810810811, "grad_norm": 0.6995769739151001, "learning_rate": 4.923264495657319e-06, "loss": 0.3433837890625, "step": 24, "token_acc": 0.8946257568082592 }, { "epoch": 0.11261261261261261, "grad_norm": 1.3750187158584595, "learning_rate": 4.914024884182535e-06, "loss": 0.40557861328125, "step": 25, "token_acc": 0.878056546947765 }, { "epoch": 0.11711711711711711, "grad_norm": 1.1581752300262451, "learning_rate": 4.904269857960208e-06, "loss": 0.3367919921875, "step": 26, "token_acc": 0.8954026071562684 }, { "epoch": 0.12162162162162163, "grad_norm": 1.0632562637329102, "learning_rate": 4.894001499771015e-06, "loss": 0.358154296875, "step": 27, "token_acc": 0.8903309816311065 }, { "epoch": 0.12612612612612611, "grad_norm": 0.9011189341545105, "learning_rate": 4.883222001996352e-06, "loss": 0.3438720703125, "step": 28, "token_acc": 0.8913533999298984 }, { "epoch": 0.13063063063063063, "grad_norm": 1.0672448873519897, "learning_rate": 4.871933666150239e-06, "loss": 0.3509521484375, "step": 29, "token_acc": 0.8918955084077967 }, { "epoch": 0.13513513513513514, "grad_norm": 0.8111361861228943, "learning_rate": 4.8601389023879395e-06, "loss": 0.3671875, "step": 30, "token_acc": 0.8869003430910295 }, { "epoch": 0.13963963963963963, "grad_norm": 0.9260621666908264, "learning_rate": 4.8478402289913566e-06, "loss": 0.3306884765625, "step": 31, "token_acc": 0.8972916061574209 }, { "epoch": 0.14414414414414414, "grad_norm": 0.9479268193244934, "learning_rate": 4.835040271831371e-06, "loss": 0.35076904296875, "step": 32, "token_acc": 0.89304353165101 }, { "epoch": 0.14864864864864866, "grad_norm": 0.9695205688476562, "learning_rate": 4.821741763807186e-06, "loss": 0.360107421875, "step": 33, "token_acc": 0.8890506207294774 }, { "epoch": 0.15315315315315314, "grad_norm": 0.8806270956993103, "learning_rate": 4.807947544262838e-06, "loss": 0.37542724609375, "step": 34, "token_acc": 0.883788106662847 }, { "epoch": 0.15765765765765766, "grad_norm": 0.8483520150184631, "learning_rate": 4.793660558380969e-06, "loss": 0.359619140625, "step": 35, "token_acc": 0.8883863056108398 }, { "epoch": 0.16216216216216217, "grad_norm": 0.9061219692230225, "learning_rate": 4.7788838565540044e-06, "loss": 0.33489990234375, "step": 36, "token_acc": 0.8967343771172348 }, { "epoch": 0.16666666666666666, "grad_norm": 0.7820679545402527, "learning_rate": 4.763620593732867e-06, "loss": 0.350830078125, "step": 37, "token_acc": 0.890262264871789 }, { "epoch": 0.17117117117117117, "grad_norm": 0.9904912114143372, "learning_rate": 4.747874028753375e-06, "loss": 0.4389495849609375, "step": 38, "token_acc": 0.8688650397289224 }, { "epoch": 0.17567567567567569, "grad_norm": 0.871274471282959, "learning_rate": 4.731647523640446e-06, "loss": 0.3544921875, "step": 39, "token_acc": 0.8903172772836772 }, { "epoch": 0.18018018018018017, "grad_norm": 0.7513757348060608, "learning_rate": 4.7149445428902794e-06, "loss": 0.3541259765625, "step": 40, "token_acc": 0.8904149382414799 }, { "epoch": 0.18468468468468469, "grad_norm": 1.159681797027588, "learning_rate": 4.697768652730656e-06, "loss": 0.3160400390625, "step": 41, "token_acc": 0.9033266799733866 }, { "epoch": 0.1891891891891892, "grad_norm": 0.8280705213546753, "learning_rate": 4.68012352035952e-06, "loss": 0.3328857421875, "step": 42, "token_acc": 0.8980802792321116 }, { "epoch": 0.19369369369369369, "grad_norm": 0.8860488533973694, "learning_rate": 4.662012913161998e-06, "loss": 0.3543701171875, "step": 43, "token_acc": 0.890051324589144 }, { "epoch": 0.1981981981981982, "grad_norm": 0.8635104298591614, "learning_rate": 4.643440697906033e-06, "loss": 0.34234619140625, "step": 44, "token_acc": 0.8932650377240949 }, { "epoch": 0.20270270270270271, "grad_norm": 0.846026599407196, "learning_rate": 4.624410839916798e-06, "loss": 0.3629150390625, "step": 45, "token_acc": 0.8890571013466493 }, { "epoch": 0.2072072072072072, "grad_norm": 0.8589869737625122, "learning_rate": 4.604927402230061e-06, "loss": 0.359130859375, "step": 46, "token_acc": 0.8904908189625078 }, { "epoch": 0.21171171171171171, "grad_norm": 0.8538748025894165, "learning_rate": 4.584994544724695e-06, "loss": 0.373016357421875, "step": 47, "token_acc": 0.8841772321548235 }, { "epoch": 0.21621621621621623, "grad_norm": 0.8112604022026062, "learning_rate": 4.564616523234511e-06, "loss": 0.3341064453125, "step": 48, "token_acc": 0.8964978147002773 }, { "epoch": 0.22072072072072071, "grad_norm": 0.9030938148498535, "learning_rate": 4.543797688639596e-06, "loss": 0.407958984375, "step": 49, "token_acc": 0.8771987646635173 }, { "epoch": 0.22522522522522523, "grad_norm": 0.8748745918273926, "learning_rate": 4.522542485937369e-06, "loss": 0.37548828125, "step": 50, "token_acc": 0.8819748719843742 }, { "epoch": 0.22972972972972974, "grad_norm": 0.7620592713356018, "learning_rate": 4.500855453293532e-06, "loss": 0.31634521484375, "step": 51, "token_acc": 0.8998812351543943 }, { "epoch": 0.23423423423423423, "grad_norm": 0.9173205494880676, "learning_rate": 4.478741221073136e-06, "loss": 0.337432861328125, "step": 52, "token_acc": 0.8988951978520181 }, { "epoch": 0.23873873873873874, "grad_norm": 1.211668610572815, "learning_rate": 4.456204510851957e-06, "loss": 0.361572265625, "step": 53, "token_acc": 0.8882260347359092 }, { "epoch": 0.24324324324324326, "grad_norm": 1.1069424152374268, "learning_rate": 4.433250134408401e-06, "loss": 0.3758544921875, "step": 54, "token_acc": 0.8838936369355408 }, { "epoch": 0.24774774774774774, "grad_norm": 1.0697734355926514, "learning_rate": 4.4098829926961485e-06, "loss": 0.3668212890625, "step": 55, "token_acc": 0.8864855281237534 }, { "epoch": 0.25225225225225223, "grad_norm": 0.9282281994819641, "learning_rate": 4.386108074797757e-06, "loss": 0.3770751953125, "step": 56, "token_acc": 0.8851603202168664 }, { "epoch": 0.25675675675675674, "grad_norm": 0.7715709209442139, "learning_rate": 4.361930456859455e-06, "loss": 0.391845703125, "step": 57, "token_acc": 0.8783604400170836 }, { "epoch": 0.26126126126126126, "grad_norm": 0.9285658001899719, "learning_rate": 4.337355301007336e-06, "loss": 0.3463134765625, "step": 58, "token_acc": 0.8939376088180235 }, { "epoch": 0.26576576576576577, "grad_norm": 1.034256935119629, "learning_rate": 4.312387854245201e-06, "loss": 0.3419189453125, "step": 59, "token_acc": 0.8928964642593341 }, { "epoch": 0.2702702702702703, "grad_norm": 1.0528221130371094, "learning_rate": 4.287033447334286e-06, "loss": 0.317138671875, "step": 60, "token_acc": 0.901003937801551 }, { "epoch": 0.2747747747747748, "grad_norm": 0.8839389681816101, "learning_rate": 4.261297493655092e-06, "loss": 0.3155517578125, "step": 61, "token_acc": 0.9004427280397325 }, { "epoch": 0.27927927927927926, "grad_norm": 0.8743260502815247, "learning_rate": 4.2351854880515856e-06, "loss": 0.34619140625, "step": 62, "token_acc": 0.8932852856301954 }, { "epoch": 0.28378378378378377, "grad_norm": 0.7228661179542542, "learning_rate": 4.208703005658e-06, "loss": 0.348663330078125, "step": 63, "token_acc": 0.8947115994978206 }, { "epoch": 0.2882882882882883, "grad_norm": 0.8091106414794922, "learning_rate": 4.1818557007085e-06, "loss": 0.3548583984375, "step": 64, "token_acc": 0.8893367675466839 }, { "epoch": 0.2927927927927928, "grad_norm": 0.9842603802680969, "learning_rate": 4.154649305329959e-06, "loss": 0.34814453125, "step": 65, "token_acc": 0.8905122962458113 }, { "epoch": 0.2972972972972973, "grad_norm": 0.6639820337295532, "learning_rate": 4.12708962831809e-06, "loss": 0.33953857421875, "step": 66, "token_acc": 0.8963516432198938 }, { "epoch": 0.30180180180180183, "grad_norm": 0.9688128232955933, "learning_rate": 4.099182553897228e-06, "loss": 0.34039306640625, "step": 67, "token_acc": 0.893565994838982 }, { "epoch": 0.3063063063063063, "grad_norm": 0.7545742392539978, "learning_rate": 4.070934040463999e-06, "loss": 0.341400146484375, "step": 68, "token_acc": 0.894034692141854 }, { "epoch": 0.3108108108108108, "grad_norm": 0.834723711013794, "learning_rate": 4.042350119315142e-06, "loss": 0.39501953125, "step": 69, "token_acc": 0.8793881644934805 }, { "epoch": 0.3153153153153153, "grad_norm": 0.7959160804748535, "learning_rate": 4.013436893359787e-06, "loss": 0.37982177734375, "step": 70, "token_acc": 0.8827430920972124 }, { "epoch": 0.31981981981981983, "grad_norm": 0.7727994322776794, "learning_rate": 3.984200535816427e-06, "loss": 0.3292236328125, "step": 71, "token_acc": 0.8962622549019608 }, { "epoch": 0.32432432432432434, "grad_norm": 1.1143063306808472, "learning_rate": 3.9546472888948825e-06, "loss": 0.3765869140625, "step": 72, "token_acc": 0.8862365334289776 }, { "epoch": 0.32882882882882886, "grad_norm": 0.802827775478363, "learning_rate": 3.924783462463541e-06, "loss": 0.3514404296875, "step": 73, "token_acc": 0.8915428441438926 }, { "epoch": 0.3333333333333333, "grad_norm": 0.7230685353279114, "learning_rate": 3.894615432702144e-06, "loss": 0.326416015625, "step": 74, "token_acc": 0.8988927052463204 }, { "epoch": 0.33783783783783783, "grad_norm": 0.6452018618583679, "learning_rate": 3.8641496407404165e-06, "loss": 0.344970703125, "step": 75, "token_acc": 0.8912708883025505 }, { "epoch": 0.34234234234234234, "grad_norm": 0.8161414265632629, "learning_rate": 3.833392591282838e-06, "loss": 0.352294921875, "step": 76, "token_acc": 0.8910519555745838 }, { "epoch": 0.34684684684684686, "grad_norm": 0.9082568287849426, "learning_rate": 3.802350851219826e-06, "loss": 0.31591796875, "step": 77, "token_acc": 0.903437424998 }, { "epoch": 0.35135135135135137, "grad_norm": 0.8428945541381836, "learning_rate": 3.771031048225653e-06, "loss": 0.3326416015625, "step": 78, "token_acc": 0.8960439385497094 }, { "epoch": 0.35585585585585583, "grad_norm": 0.7270176410675049, "learning_rate": 3.7394398693433798e-06, "loss": 0.34326171875, "step": 79, "token_acc": 0.8919539041909978 }, { "epoch": 0.36036036036036034, "grad_norm": 0.9635636210441589, "learning_rate": 3.70758405955712e-06, "loss": 0.324951171875, "step": 80, "token_acc": 0.8983449519816702 }, { "epoch": 0.36486486486486486, "grad_norm": 0.7542657256126404, "learning_rate": 3.675470420351921e-06, "loss": 0.363525390625, "step": 81, "token_acc": 0.8872009164518733 }, { "epoch": 0.36936936936936937, "grad_norm": 0.8671934008598328, "learning_rate": 3.6431058082615966e-06, "loss": 0.31597900390625, "step": 82, "token_acc": 0.9009948542024013 }, { "epoch": 0.3738738738738739, "grad_norm": 1.0081602334976196, "learning_rate": 3.6104971334047954e-06, "loss": 0.3199462890625, "step": 83, "token_acc": 0.8997704785831705 }, { "epoch": 0.3783783783783784, "grad_norm": 1.0129483938217163, "learning_rate": 3.5776513580096316e-06, "loss": 0.3701171875, "step": 84, "token_acc": 0.88471046201057 }, { "epoch": 0.38288288288288286, "grad_norm": 0.8052798509597778, "learning_rate": 3.5445754949271925e-06, "loss": 0.3421630859375, "step": 85, "token_acc": 0.8927845528455285 }, { "epoch": 0.38738738738738737, "grad_norm": 0.8934619426727295, "learning_rate": 3.5112766061342346e-06, "loss": 0.3060302734375, "step": 86, "token_acc": 0.9026350477994344 }, { "epoch": 0.3918918918918919, "grad_norm": 0.6580748558044434, "learning_rate": 3.47776180122539e-06, "loss": 0.359619140625, "step": 87, "token_acc": 0.8894399733815792 }, { "epoch": 0.3963963963963964, "grad_norm": 0.8089192509651184, "learning_rate": 3.4440382358952116e-06, "loss": 0.3321533203125, "step": 88, "token_acc": 0.8961488812392426 }, { "epoch": 0.4009009009009009, "grad_norm": 0.8350925445556641, "learning_rate": 3.4101131104103664e-06, "loss": 0.3330078125, "step": 89, "token_acc": 0.8948240976437569 }, { "epoch": 0.40540540540540543, "grad_norm": 0.8865199089050293, "learning_rate": 3.3759936680723238e-06, "loss": 0.3404541015625, "step": 90, "token_acc": 0.8960275592341626 }, { "epoch": 0.4099099099099099, "grad_norm": 0.9881960153579712, "learning_rate": 3.341687193670844e-06, "loss": 0.4093017578125, "step": 91, "token_acc": 0.8754743340444411 }, { "epoch": 0.4144144144144144, "grad_norm": 0.6525192856788635, "learning_rate": 3.3072010119286156e-06, "loss": 0.343994140625, "step": 92, "token_acc": 0.8952931153825675 }, { "epoch": 0.4189189189189189, "grad_norm": 0.8146042823791504, "learning_rate": 3.272542485937369e-06, "loss": 0.32464599609375, "step": 93, "token_acc": 0.8996927651139742 }, { "epoch": 0.42342342342342343, "grad_norm": 0.7997490763664246, "learning_rate": 3.237719015585787e-06, "loss": 0.3258056640625, "step": 94, "token_acc": 0.8986706407255705 }, { "epoch": 0.42792792792792794, "grad_norm": 0.7047693133354187, "learning_rate": 3.202738035979571e-06, "loss": 0.3316650390625, "step": 95, "token_acc": 0.8959871462508255 }, { "epoch": 0.43243243243243246, "grad_norm": 0.6673498749732971, "learning_rate": 3.167607015853983e-06, "loss": 0.35711669921875, "step": 96, "token_acc": 0.8914680520593433 }, { "epoch": 0.4369369369369369, "grad_norm": 0.9044120907783508, "learning_rate": 3.132333455979202e-06, "loss": 0.29888916015625, "step": 97, "token_acc": 0.9081531160305777 }, { "epoch": 0.44144144144144143, "grad_norm": 0.6456412672996521, "learning_rate": 3.0969248875588547e-06, "loss": 0.3343048095703125, "step": 98, "token_acc": 0.8963899986716667 }, { "epoch": 0.44594594594594594, "grad_norm": 0.8044923543930054, "learning_rate": 3.0613888706220336e-06, "loss": 0.3668212890625, "step": 99, "token_acc": 0.8872929258325789 }, { "epoch": 0.45045045045045046, "grad_norm": 0.8912515044212341, "learning_rate": 3.025732992409166e-06, "loss": 0.3749237060546875, "step": 100, "token_acc": 0.8863157390554651 }, { "epoch": 0.45495495495495497, "grad_norm": 0.9178464412689209, "learning_rate": 2.989964865752079e-06, "loss": 0.31494140625, "step": 101, "token_acc": 0.9031758326878389 }, { "epoch": 0.4594594594594595, "grad_norm": 0.7799769639968872, "learning_rate": 2.9540921274485913e-06, "loss": 0.316070556640625, "step": 102, "token_acc": 0.9010862029119482 }, { "epoch": 0.46396396396396394, "grad_norm": 0.9977706670761108, "learning_rate": 2.9181224366319947e-06, "loss": 0.335693359375, "step": 103, "token_acc": 0.8947431320117716 }, { "epoch": 0.46846846846846846, "grad_norm": 1.2078821659088135, "learning_rate": 2.882063473135763e-06, "loss": 0.4012451171875, "step": 104, "token_acc": 0.8755390944456183 }, { "epoch": 0.47297297297297297, "grad_norm": 0.9138604998588562, "learning_rate": 2.845922935853841e-06, "loss": 0.31298828125, "step": 105, "token_acc": 0.9045985448046975 }, { "epoch": 0.4774774774774775, "grad_norm": 0.9735301733016968, "learning_rate": 2.80970854109687e-06, "loss": 0.312744140625, "step": 106, "token_acc": 0.9002951864614082 }, { "epoch": 0.481981981981982, "grad_norm": 0.8821303844451904, "learning_rate": 2.773428020944687e-06, "loss": 0.316650390625, "step": 107, "token_acc": 0.9018892572372231 }, { "epoch": 0.4864864864864865, "grad_norm": 0.7590733766555786, "learning_rate": 2.7370891215954572e-06, "loss": 0.362060546875, "step": 108, "token_acc": 0.8878953544431608 }, { "epoch": 0.49099099099099097, "grad_norm": 1.1650851964950562, "learning_rate": 2.7006996017118033e-06, "loss": 0.349365234375, "step": 109, "token_acc": 0.8941975079040357 }, { "epoch": 0.4954954954954955, "grad_norm": 0.8107997179031372, "learning_rate": 2.6642672307642575e-06, "loss": 0.3140869140625, "step": 110, "token_acc": 0.9023156522104909 }, { "epoch": 0.5, "grad_norm": 0.9272815585136414, "learning_rate": 2.627799787372418e-06, "loss": 0.33642578125, "step": 111, "token_acc": 0.8949056603773585 }, { "epoch": 0.5045045045045045, "grad_norm": 0.8578754663467407, "learning_rate": 2.591305057644148e-06, "loss": 0.34564208984375, "step": 112, "token_acc": 0.8929581334646075 }, { "epoch": 0.509009009009009, "grad_norm": 0.8179190754890442, "learning_rate": 2.5547908335131704e-06, "loss": 0.3253173828125, "step": 113, "token_acc": 0.8980514754137483 }, { "epoch": 0.5135135135135135, "grad_norm": 0.937614917755127, "learning_rate": 2.5182649110754325e-06, "loss": 0.30389404296875, "step": 114, "token_acc": 0.9068263329281652 }, { "epoch": 0.5180180180180181, "grad_norm": 0.9342759251594543, "learning_rate": 2.4817350889245675e-06, "loss": 0.362548828125, "step": 115, "token_acc": 0.8870322400184855 }, { "epoch": 0.5225225225225225, "grad_norm": 0.7444611191749573, "learning_rate": 2.44520916648683e-06, "loss": 0.36907958984375, "step": 116, "token_acc": 0.8857218663533336 }, { "epoch": 0.527027027027027, "grad_norm": 0.9588095545768738, "learning_rate": 2.408694942355853e-06, "loss": 0.38671875, "step": 117, "token_acc": 0.8824941943771926 }, { "epoch": 0.5315315315315315, "grad_norm": 0.9178715348243713, "learning_rate": 2.3722002126275826e-06, "loss": 0.34375, "step": 118, "token_acc": 0.8941754417885328 }, { "epoch": 0.536036036036036, "grad_norm": 0.8871239423751831, "learning_rate": 2.3357327692357434e-06, "loss": 0.3515625, "step": 119, "token_acc": 0.8914658528885615 }, { "epoch": 0.5405405405405406, "grad_norm": 0.7818154096603394, "learning_rate": 2.2993003982881976e-06, "loss": 0.30548095703125, "step": 120, "token_acc": 0.9041648931367617 }, { "epoch": 0.545045045045045, "grad_norm": 0.7676111459732056, "learning_rate": 2.262910878404544e-06, "loss": 0.2769775390625, "step": 121, "token_acc": 0.9139403881271473 }, { "epoch": 0.5495495495495496, "grad_norm": 0.9711487889289856, "learning_rate": 2.2265719790553147e-06, "loss": 0.3883056640625, "step": 122, "token_acc": 0.8804856085838586 }, { "epoch": 0.5540540540540541, "grad_norm": 0.8610119223594666, "learning_rate": 2.19029145890313e-06, "loss": 0.3262939453125, "step": 123, "token_acc": 0.8978232618583496 }, { "epoch": 0.5585585585585585, "grad_norm": 0.8020825386047363, "learning_rate": 2.154077064146159e-06, "loss": 0.3035888671875, "step": 124, "token_acc": 0.9039610929800985 }, { "epoch": 0.5630630630630631, "grad_norm": 0.9404456615447998, "learning_rate": 2.1179365268642377e-06, "loss": 0.32861328125, "step": 125, "token_acc": 0.8969406998672906 }, { "epoch": 0.5675675675675675, "grad_norm": 0.9896916747093201, "learning_rate": 2.0818775633680057e-06, "loss": 0.36248779296875, "step": 126, "token_acc": 0.8876241534257928 }, { "epoch": 0.5720720720720721, "grad_norm": 0.738699734210968, "learning_rate": 2.045907872551409e-06, "loss": 0.3084716796875, "step": 127, "token_acc": 0.9047011482971833 }, { "epoch": 0.5765765765765766, "grad_norm": 0.987114429473877, "learning_rate": 2.010035134247922e-06, "loss": 0.3638916015625, "step": 128, "token_acc": 0.8856382245027449 }, { "epoch": 0.581081081081081, "grad_norm": 0.8161911368370056, "learning_rate": 1.9742670075908353e-06, "loss": 0.3726806640625, "step": 129, "token_acc": 0.8862522268667838 }, { "epoch": 0.5855855855855856, "grad_norm": 1.0528193712234497, "learning_rate": 1.9386111293779673e-06, "loss": 0.3369140625, "step": 130, "token_acc": 0.8956112595402446 }, { "epoch": 0.5900900900900901, "grad_norm": 0.9027992486953735, "learning_rate": 1.903075112441145e-06, "loss": 0.3206787109375, "step": 131, "token_acc": 0.9009896596103493 }, { "epoch": 0.5945945945945946, "grad_norm": 0.7341856360435486, "learning_rate": 1.8676665440207982e-06, "loss": 0.2967529296875, "step": 132, "token_acc": 0.9068318389675675 }, { "epoch": 0.5990990990990991, "grad_norm": 0.735024094581604, "learning_rate": 1.832392984146018e-06, "loss": 0.31787109375, "step": 133, "token_acc": 0.8996451126455548 }, { "epoch": 0.6036036036036037, "grad_norm": 0.9426040053367615, "learning_rate": 1.7972619640204298e-06, "loss": 0.387451171875, "step": 134, "token_acc": 0.8815383862602143 }, { "epoch": 0.6081081081081081, "grad_norm": 0.7768560647964478, "learning_rate": 1.7622809844142138e-06, "loss": 0.35296630859375, "step": 135, "token_acc": 0.8923252350367681 }, { "epoch": 0.6126126126126126, "grad_norm": 0.8278442025184631, "learning_rate": 1.7274575140626318e-06, "loss": 0.3173828125, "step": 136, "token_acc": 0.8998283663598066 }, { "epoch": 0.6171171171171171, "grad_norm": 0.8853439092636108, "learning_rate": 1.6927989880713852e-06, "loss": 0.3544158935546875, "step": 137, "token_acc": 0.8905075533139094 }, { "epoch": 0.6216216216216216, "grad_norm": 0.6611644625663757, "learning_rate": 1.6583128063291576e-06, "loss": 0.33984375, "step": 138, "token_acc": 0.8959825350762759 }, { "epoch": 0.6261261261261262, "grad_norm": 0.7384758591651917, "learning_rate": 1.6240063319276767e-06, "loss": 0.32476806640625, "step": 139, "token_acc": 0.8985073953315207 }, { "epoch": 0.6306306306306306, "grad_norm": 0.773961067199707, "learning_rate": 1.5898868895896336e-06, "loss": 0.4095458984375, "step": 140, "token_acc": 0.8739141708803319 }, { "epoch": 0.6351351351351351, "grad_norm": 0.8257731199264526, "learning_rate": 1.5559617641047886e-06, "loss": 0.3592529296875, "step": 141, "token_acc": 0.8883896425588295 }, { "epoch": 0.6396396396396397, "grad_norm": 0.787449061870575, "learning_rate": 1.5222381987746104e-06, "loss": 0.357177734375, "step": 142, "token_acc": 0.888363484849953 }, { "epoch": 0.6441441441441441, "grad_norm": 0.6886521577835083, "learning_rate": 1.488723393865766e-06, "loss": 0.360107421875, "step": 143, "token_acc": 0.8890432232736252 }, { "epoch": 0.6486486486486487, "grad_norm": 0.7612305283546448, "learning_rate": 1.4554245050728085e-06, "loss": 0.3238525390625, "step": 144, "token_acc": 0.8999899789558072 }, { "epoch": 0.6531531531531531, "grad_norm": 0.8425935506820679, "learning_rate": 1.4223486419903692e-06, "loss": 0.327606201171875, "step": 145, "token_acc": 0.8979553119730186 }, { "epoch": 0.6576576576576577, "grad_norm": 1.0973107814788818, "learning_rate": 1.389502866595206e-06, "loss": 0.319122314453125, "step": 146, "token_acc": 0.9017525376916615 }, { "epoch": 0.6621621621621622, "grad_norm": 0.7503589391708374, "learning_rate": 1.3568941917384038e-06, "loss": 0.31683349609375, "step": 147, "token_acc": 0.9008651974055598 }, { "epoch": 0.6666666666666666, "grad_norm": 0.7911334037780762, "learning_rate": 1.324529579648079e-06, "loss": 0.28924560546875, "step": 148, "token_acc": 0.9068919780898478 }, { "epoch": 0.6711711711711712, "grad_norm": 0.771323025226593, "learning_rate": 1.2924159404428804e-06, "loss": 0.373291015625, "step": 149, "token_acc": 0.884785335262904 }, { "epoch": 0.6756756756756757, "grad_norm": 0.9364131689071655, "learning_rate": 1.2605601306566206e-06, "loss": 0.3597412109375, "step": 150, "token_acc": 0.888817750535287 }, { "epoch": 0.6801801801801802, "grad_norm": 0.7087746262550354, "learning_rate": 1.2289689517743475e-06, "loss": 0.3809814453125, "step": 151, "token_acc": 0.8821097824929922 }, { "epoch": 0.6846846846846847, "grad_norm": 0.6490341424942017, "learning_rate": 1.1976491487801747e-06, "loss": 0.32562255859375, "step": 152, "token_acc": 0.8976476697136251 }, { "epoch": 0.6891891891891891, "grad_norm": 0.7317127585411072, "learning_rate": 1.1666074087171628e-06, "loss": 0.330810546875, "step": 153, "token_acc": 0.9007194922339586 }, { "epoch": 0.6936936936936937, "grad_norm": 0.7992354035377502, "learning_rate": 1.1358503592595837e-06, "loss": 0.33154296875, "step": 154, "token_acc": 0.8951166965888689 }, { "epoch": 0.6981981981981982, "grad_norm": 0.8374021053314209, "learning_rate": 1.1053845672978567e-06, "loss": 0.3441162109375, "step": 155, "token_acc": 0.8938746596418214 }, { "epoch": 0.7027027027027027, "grad_norm": 0.747456431388855, "learning_rate": 1.0752165375364593e-06, "loss": 0.3369140625, "step": 156, "token_acc": 0.8960199252801992 }, { "epoch": 0.7072072072072072, "grad_norm": 0.6646521091461182, "learning_rate": 1.0453527111051183e-06, "loss": 0.302581787109375, "step": 157, "token_acc": 0.9050437347721778 }, { "epoch": 0.7117117117117117, "grad_norm": 0.7055298686027527, "learning_rate": 1.0157994641835737e-06, "loss": 0.301239013671875, "step": 158, "token_acc": 0.9069595645412131 }, { "epoch": 0.7162162162162162, "grad_norm": 0.9243741631507874, "learning_rate": 9.865631066402138e-07, "loss": 0.35394287109375, "step": 159, "token_acc": 0.8925179495143776 }, { "epoch": 0.7207207207207207, "grad_norm": 0.8311925530433655, "learning_rate": 9.576498806848592e-07, "loss": 0.340087890625, "step": 160, "token_acc": 0.894881932249615 }, { "epoch": 0.7252252252252253, "grad_norm": 0.8588569164276123, "learning_rate": 9.290659595360019e-07, "loss": 0.3267822265625, "step": 161, "token_acc": 0.8974585635359116 }, { "epoch": 0.7297297297297297, "grad_norm": 0.7929273247718811, "learning_rate": 9.008174461027724e-07, "loss": 0.340576171875, "step": 162, "token_acc": 0.8939879994458003 }, { "epoch": 0.7342342342342343, "grad_norm": 1.2013729810714722, "learning_rate": 8.729103716819113e-07, "loss": 0.370819091796875, "step": 163, "token_acc": 0.8868560581265955 }, { "epoch": 0.7387387387387387, "grad_norm": 0.7468942403793335, "learning_rate": 8.453506946700419e-07, "loss": 0.2955322265625, "step": 164, "token_acc": 0.9082374491957181 }, { "epoch": 0.7432432432432432, "grad_norm": 1.1810017824172974, "learning_rate": 8.181442992915001e-07, "loss": 0.378662109375, "step": 165, "token_acc": 0.8825104623355918 }, { "epoch": 0.7477477477477478, "grad_norm": 0.7701326012611389, "learning_rate": 7.912969943420018e-07, "loss": 0.3447265625, "step": 166, "token_acc": 0.8957051550086613 }, { "epoch": 0.7522522522522522, "grad_norm": 0.8558163046836853, "learning_rate": 7.648145119484152e-07, "loss": 0.3280029296875, "step": 167, "token_acc": 0.89935299183094 }, { "epoch": 0.7567567567567568, "grad_norm": 0.6991153955459595, "learning_rate": 7.387025063449082e-07, "loss": 0.3521728515625, "step": 168, "token_acc": 0.8900763667218877 }, { "epoch": 0.7612612612612613, "grad_norm": 1.034488558769226, "learning_rate": 7.129665526657145e-07, "loss": 0.3106231689453125, "step": 169, "token_acc": 0.9047021491417857 }, { "epoch": 0.7657657657657657, "grad_norm": 0.733146071434021, "learning_rate": 6.876121457547996e-07, "loss": 0.310302734375, "step": 170, "token_acc": 0.9026569890363783 }, { "epoch": 0.7702702702702703, "grad_norm": 0.8405783176422119, "learning_rate": 6.626446989926652e-07, "loss": 0.3514404296875, "step": 171, "token_acc": 0.890449830708873 }, { "epoch": 0.7747747747747747, "grad_norm": 0.8425772190093994, "learning_rate": 6.380695431405453e-07, "loss": 0.361083984375, "step": 172, "token_acc": 0.8888163230910221 }, { "epoch": 0.7792792792792793, "grad_norm": 0.9122714996337891, "learning_rate": 6.138919252022435e-07, "loss": 0.3192138671875, "step": 173, "token_acc": 0.899907008200186 }, { "epoch": 0.7837837837837838, "grad_norm": 0.9160909652709961, "learning_rate": 5.901170073038523e-07, "loss": 0.3465576171875, "step": 174, "token_acc": 0.8923485456678811 }, { "epoch": 0.7882882882882883, "grad_norm": 0.7696585655212402, "learning_rate": 5.667498655916002e-07, "loss": 0.3604736328125, "step": 175, "token_acc": 0.8889116998746597 }, { "epoch": 0.7927927927927928, "grad_norm": 0.7901983261108398, "learning_rate": 5.437954891480443e-07, "loss": 0.3160400390625, "step": 176, "token_acc": 0.9012986436881038 }, { "epoch": 0.7972972972972973, "grad_norm": 0.7334163188934326, "learning_rate": 5.21258778926865e-07, "loss": 0.34423828125, "step": 177, "token_acc": 0.8941695594530863 }, { "epoch": 0.8018018018018018, "grad_norm": 0.7236452698707581, "learning_rate": 4.99144546706469e-07, "loss": 0.34686279296875, "step": 178, "token_acc": 0.8923213125253115 }, { "epoch": 0.8063063063063063, "grad_norm": 0.7739962935447693, "learning_rate": 4.774575140626317e-07, "loss": 0.349609375, "step": 179, "token_acc": 0.8923176451560731 }, { "epoch": 0.8108108108108109, "grad_norm": 0.8533901572227478, "learning_rate": 4.5620231136040414e-07, "loss": 0.332275390625, "step": 180, "token_acc": 0.8971639258853548 }, { "epoch": 0.8153153153153153, "grad_norm": 1.0738646984100342, "learning_rate": 4.3538347676548965e-07, "loss": 0.3839111328125, "step": 181, "token_acc": 0.8830640641762606 }, { "epoch": 0.8198198198198198, "grad_norm": 0.7544999122619629, "learning_rate": 4.150054552753055e-07, "loss": 0.349273681640625, "step": 182, "token_acc": 0.8917029877374043 }, { "epoch": 0.8243243243243243, "grad_norm": 0.6967670917510986, "learning_rate": 3.950725977699396e-07, "loss": 0.308349609375, "step": 183, "token_acc": 0.9040394422226454 }, { "epoch": 0.8288288288288288, "grad_norm": 1.0199168920516968, "learning_rate": 3.7558916008320263e-07, "loss": 0.3571624755859375, "step": 184, "token_acc": 0.8911331997805311 }, { "epoch": 0.8333333333333334, "grad_norm": 0.776905357837677, "learning_rate": 3.5655930209396784e-07, "loss": 0.34765625, "step": 185, "token_acc": 0.8915000262425865 }, { "epoch": 0.8378378378378378, "grad_norm": 0.9011301398277283, "learning_rate": 3.379870868380031e-07, "loss": 0.37677001953125, "step": 186, "token_acc": 0.8837367137532784 }, { "epoch": 0.8423423423423423, "grad_norm": 0.7519142627716064, "learning_rate": 3.1987647964048075e-07, "loss": 0.3240966796875, "step": 187, "token_acc": 0.8982510044906642 }, { "epoch": 0.8468468468468469, "grad_norm": 0.8182622194290161, "learning_rate": 3.022313472693447e-07, "loss": 0.31689453125, "step": 188, "token_acc": 0.8999964010221098 }, { "epoch": 0.8513513513513513, "grad_norm": 0.7286121845245361, "learning_rate": 2.850554571097211e-07, "loss": 0.3243408203125, "step": 189, "token_acc": 0.900392670157068 }, { "epoch": 0.8558558558558559, "grad_norm": 0.8014657497406006, "learning_rate": 2.6835247635955466e-07, "loss": 0.3505859375, "step": 190, "token_acc": 0.8923208057153752 }, { "epoch": 0.8603603603603603, "grad_norm": 0.8122464418411255, "learning_rate": 2.521259712466256e-07, "loss": 0.3648681640625, "step": 191, "token_acc": 0.8863166841775962 }, { "epoch": 0.8648648648648649, "grad_norm": 1.0286812782287598, "learning_rate": 2.3637940626713346e-07, "loss": 0.3612060546875, "step": 192, "token_acc": 0.8910263284894633 }, { "epoch": 0.8693693693693694, "grad_norm": 0.8132110238075256, "learning_rate": 2.2111614344599686e-07, "loss": 0.3125152587890625, "step": 193, "token_acc": 0.9036560888233949 }, { "epoch": 0.8738738738738738, "grad_norm": 0.7871835231781006, "learning_rate": 2.0633944161903147e-07, "loss": 0.3336181640625, "step": 194, "token_acc": 0.8960117156590649 }, { "epoch": 0.8783783783783784, "grad_norm": 0.8867942690849304, "learning_rate": 1.9205245573716196e-07, "loss": 0.29541015625, "step": 195, "token_acc": 0.9067175216003737 }, { "epoch": 0.8828828828828829, "grad_norm": 0.8739636540412903, "learning_rate": 1.7825823619281452e-07, "loss": 0.3477783203125, "step": 196, "token_acc": 0.8919099564915821 }, { "epoch": 0.8873873873873874, "grad_norm": 0.7999213933944702, "learning_rate": 1.649597281686302e-07, "loss": 0.32855224609375, "step": 197, "token_acc": 0.8981510210179152 }, { "epoch": 0.8918918918918919, "grad_norm": 0.8212850093841553, "learning_rate": 1.5215977100864394e-07, "loss": 0.392333984375, "step": 198, "token_acc": 0.8798142365281447 }, { "epoch": 0.8963963963963963, "grad_norm": 0.8761787414550781, "learning_rate": 1.3986109761206097e-07, "loss": 0.3126220703125, "step": 199, "token_acc": 0.9025795889471472 }, { "epoch": 0.9009009009009009, "grad_norm": 0.7608596086502075, "learning_rate": 1.2806633384976092e-07, "loss": 0.332550048828125, "step": 200, "token_acc": 0.8978598516872388 }, { "epoch": 0.9054054054054054, "grad_norm": 0.807299017906189, "learning_rate": 1.1677799800364958e-07, "loss": 0.3438720703125, "step": 201, "token_acc": 0.8938019253593564 }, { "epoch": 0.9099099099099099, "grad_norm": 0.906044065952301, "learning_rate": 1.0599850022898539e-07, "loss": 0.3448486328125, "step": 202, "token_acc": 0.8919713642686692 }, { "epoch": 0.9144144144144144, "grad_norm": 0.7889218330383301, "learning_rate": 9.573014203979241e-08, "loss": 0.28619384765625, "step": 203, "token_acc": 0.9101828456205424 }, { "epoch": 0.918918918918919, "grad_norm": 0.7564137578010559, "learning_rate": 8.597511581746626e-08, "loss": 0.3443603515625, "step": 204, "token_acc": 0.891580368127471 }, { "epoch": 0.9234234234234234, "grad_norm": 0.9052029252052307, "learning_rate": 7.673550434268123e-08, "loss": 0.30670166015625, "step": 205, "token_acc": 0.9040779490944689 }, { "epoch": 0.9279279279279279, "grad_norm": 0.8498448729515076, "learning_rate": 6.801328035070138e-08, "loss": 0.2917938232421875, "step": 206, "token_acc": 0.9076634951423842 }, { "epoch": 0.9324324324324325, "grad_norm": 0.940307080745697, "learning_rate": 5.981030611018235e-08, "loss": 0.3585205078125, "step": 207, "token_acc": 0.8883763592374977 }, { "epoch": 0.9369369369369369, "grad_norm": 0.9120367765426636, "learning_rate": 5.212833302556258e-08, "loss": 0.325927734375, "step": 208, "token_acc": 0.8974414543043994 }, { "epoch": 0.9414414414414415, "grad_norm": 1.141761064529419, "learning_rate": 4.4969001263124314e-08, "loss": 0.3285980224609375, "step": 209, "token_acc": 0.8957855034437521 }, { "epoch": 0.9459459459459459, "grad_norm": 0.8518276810646057, "learning_rate": 3.833383940080232e-08, "loss": 0.3089599609375, "step": 210, "token_acc": 0.9050746193405264 }, { "epoch": 0.9504504504504504, "grad_norm": 0.8082460761070251, "learning_rate": 3.222426410182111e-08, "loss": 0.363555908203125, "step": 211, "token_acc": 0.8873174557723192 }, { "epoch": 0.954954954954955, "grad_norm": 1.4015358686447144, "learning_rate": 2.6641579812224373e-08, "loss": 0.41107177734375, "step": 212, "token_acc": 0.8763219103737013 }, { "epoch": 0.9594594594594594, "grad_norm": 0.7203386425971985, "learning_rate": 2.1586978482366072e-08, "loss": 0.3438720703125, "step": 213, "token_acc": 0.8924183661803375 }, { "epoch": 0.963963963963964, "grad_norm": 0.8376468420028687, "learning_rate": 1.7061539312417107e-08, "loss": 0.3333740234375, "step": 214, "token_acc": 0.8958556698301764 }, { "epoch": 0.9684684684684685, "grad_norm": 0.8015531897544861, "learning_rate": 1.3066228521948221e-08, "loss": 0.35986328125, "step": 215, "token_acc": 0.8891843543006334 }, { "epoch": 0.972972972972973, "grad_norm": 0.7691966891288757, "learning_rate": 9.60189914363363e-09, "loss": 0.35791015625, "step": 216, "token_acc": 0.8902418854661981 }, { "epoch": 0.9774774774774775, "grad_norm": 0.9731364250183105, "learning_rate": 6.66929084112089e-09, "loss": 0.352081298828125, "step": 217, "token_acc": 0.8928305631356983 }, { "epoch": 0.9819819819819819, "grad_norm": 0.7616921067237854, "learning_rate": 4.269029751107489e-09, "loss": 0.3609619140625, "step": 218, "token_acc": 0.8907411504424779 }, { "epoch": 0.9864864864864865, "grad_norm": 1.2653754949569702, "learning_rate": 2.4016283496544614e-09, "loss": 0.370086669921875, "step": 219, "token_acc": 0.8855589798345022 }, { "epoch": 0.990990990990991, "grad_norm": 0.9321889281272888, "learning_rate": 1.0674853427683484e-09, "loss": 0.31292724609375, "step": 220, "token_acc": 0.9013229718149482 }, { "epoch": 0.9954954954954955, "grad_norm": 0.8537278771400452, "learning_rate": 2.668855812748561e-10, "loss": 0.325958251953125, "step": 221, "token_acc": 0.89793930894784 }, { "epoch": 1.0, "grad_norm": 0.9416584372520447, "learning_rate": 0.0, "loss": 0.33929443359375, "step": 222, "token_acc": 0.8940414967192765 } ], "logging_steps": 1, "max_steps": 222, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1423772948666778e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }