{ "best_global_step": 666, "best_metric": 3.76118374, "best_model_checkpoint": "/mnt/bn/wdq-base1/data/VLMs/vsa_rl/checkpoint/reasoning_sft_1009/v5-20251009-231445/checkpoint-666", "epoch": 1.0, "eval_steps": 100, "global_step": 666, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015015015015015015, "grad_norm": 18.43874168395996, "learning_rate": 2.9411764705882356e-07, "loss": 1.8555517196655273, "memory(GiB)": 45.75, "step": 1, "token_acc": 0.5909588042289464, "train_speed(iter/s)": 0.01347 }, { "epoch": 0.0075075075075075074, "grad_norm": 20.15400505065918, "learning_rate": 1.4705882352941177e-06, "loss": 1.8491613864898682, "memory(GiB)": 53.59, "step": 5, "token_acc": 0.6025925925925926, "train_speed(iter/s)": 0.036205 }, { "epoch": 0.015015015015015015, "grad_norm": 10.93825912475586, "learning_rate": 2.9411764705882355e-06, "loss": 1.559732437133789, "memory(GiB)": 53.59, "step": 10, "token_acc": 0.6266354798625832, "train_speed(iter/s)": 0.046254 }, { "epoch": 0.02252252252252252, "grad_norm": 5.14630651473999, "learning_rate": 4.411764705882353e-06, "loss": 1.0728832244873048, "memory(GiB)": 53.59, "step": 15, "token_acc": 0.7059398496240602, "train_speed(iter/s)": 0.051056 }, { "epoch": 0.03003003003003003, "grad_norm": 10.39709186553955, "learning_rate": 5.882352941176471e-06, "loss": 0.8455103874206543, "memory(GiB)": 53.59, "step": 20, "token_acc": 0.7537844383893431, "train_speed(iter/s)": 0.05419 }, { "epoch": 0.03753753753753754, "grad_norm": 4.8650970458984375, "learning_rate": 7.352941176470589e-06, "loss": 0.7454294204711914, "memory(GiB)": 53.59, "step": 25, "token_acc": 0.7660878447395302, "train_speed(iter/s)": 0.055846 }, { "epoch": 0.04504504504504504, "grad_norm": 4.418037414550781, "learning_rate": 8.823529411764707e-06, "loss": 0.6922950744628906, "memory(GiB)": 61.49, "step": 30, "token_acc": 0.7846728698073119, "train_speed(iter/s)": 0.057253 }, { "epoch": 0.052552552552552555, "grad_norm": 7.153648376464844, "learning_rate": 9.99993822614516e-06, "loss": 0.6542837142944335, "memory(GiB)": 61.49, "step": 35, "token_acc": 0.7906924460431655, "train_speed(iter/s)": 0.058147 }, { "epoch": 0.06006006006006006, "grad_norm": 3.7986629009246826, "learning_rate": 9.997776301493914e-06, "loss": 0.6283172607421875, "memory(GiB)": 61.49, "step": 40, "token_acc": 0.7965453877251011, "train_speed(iter/s)": 0.058772 }, { "epoch": 0.06756756756756757, "grad_norm": 4.944998264312744, "learning_rate": 9.992527210334781e-06, "loss": 0.6150907516479492, "memory(GiB)": 61.49, "step": 45, "token_acc": 0.7986261479877548, "train_speed(iter/s)": 0.059369 }, { "epoch": 0.07507507507507508, "grad_norm": 12.669698715209961, "learning_rate": 9.98419419507348e-06, "loss": 0.5961947441101074, "memory(GiB)": 69.42, "step": 50, "token_acc": 0.8023150932050511, "train_speed(iter/s)": 0.059729 }, { "epoch": 0.08258258258258258, "grad_norm": 3.911559820175171, "learning_rate": 9.972782403080372e-06, "loss": 0.5979935169219971, "memory(GiB)": 77.39, "step": 55, "token_acc": 0.8086269296107541, "train_speed(iter/s)": 0.060077 }, { "epoch": 0.09009009009009009, "grad_norm": 7.1113362312316895, "learning_rate": 9.958298883510904e-06, "loss": 0.5927044868469238, "memory(GiB)": 77.39, "step": 60, "token_acc": 0.8021331371827878, "train_speed(iter/s)": 0.060454 }, { "epoch": 0.09759759759759759, "grad_norm": 3.9366421699523926, "learning_rate": 9.940752582951283e-06, "loss": 0.5776666164398193, "memory(GiB)": 77.39, "step": 65, "token_acc": 0.8102722213866747, "train_speed(iter/s)": 0.060781 }, { "epoch": 0.10510510510510511, "grad_norm": 3.8400216102600098, "learning_rate": 9.920154339892104e-06, "loss": 0.5775270462036133, "memory(GiB)": 77.4, "step": 70, "token_acc": 0.8103343924950824, "train_speed(iter/s)": 0.061034 }, { "epoch": 0.11261261261261261, "grad_norm": 3.48134183883667, "learning_rate": 9.896516878033318e-06, "loss": 0.5721072196960449, "memory(GiB)": 77.4, "step": 75, "token_acc": 0.8125897215044502, "train_speed(iter/s)": 0.061315 }, { "epoch": 0.12012012012012012, "grad_norm": 4.326797962188721, "learning_rate": 9.869854798424709e-06, "loss": 0.5744771957397461, "memory(GiB)": 77.4, "step": 80, "token_acc": 0.8050841065097685, "train_speed(iter/s)": 0.06143 }, { "epoch": 0.12762762762762764, "grad_norm": 4.147410869598389, "learning_rate": 9.840184570446702e-06, "loss": 0.5583375930786133, "memory(GiB)": 77.4, "step": 85, "token_acc": 0.8205675923100396, "train_speed(iter/s)": 0.061622 }, { "epoch": 0.13513513513513514, "grad_norm": 9.281377792358398, "learning_rate": 9.807524521637103e-06, "loss": 0.5598219871520996, "memory(GiB)": 77.4, "step": 90, "token_acc": 0.815196639938908, "train_speed(iter/s)": 0.061807 }, { "epoch": 0.14264264264264265, "grad_norm": 4.255290508270264, "learning_rate": 9.771894826370021e-06, "loss": 0.5530188083648682, "memory(GiB)": 77.4, "step": 95, "token_acc": 0.8160852001468968, "train_speed(iter/s)": 0.06204 }, { "epoch": 0.15015015015015015, "grad_norm": 3.306349515914917, "learning_rate": 9.733317493394004e-06, "loss": 0.5611482620239258, "memory(GiB)": 77.4, "step": 100, "token_acc": 0.809205586424388, "train_speed(iter/s)": 0.062203 }, { "epoch": 0.15015015015015015, "eval_loss": 4.395487308502197, "eval_runtime": 49.3031, "eval_samples_per_second": 17.443, "eval_steps_per_second": 2.191, "eval_token_acc": 0.8156251073624902, "step": 100 }, { "epoch": 0.15765765765765766, "grad_norm": 3.02348256111145, "learning_rate": 9.691816352237052e-06, "loss": 0.5578757286071777, "memory(GiB)": 77.4, "step": 105, "token_acc": 0.8126542517388378, "train_speed(iter/s)": 0.060579 }, { "epoch": 0.16516516516516516, "grad_norm": 3.450737237930298, "learning_rate": 9.647417038486936e-06, "loss": 0.5540652751922608, "memory(GiB)": 77.4, "step": 110, "token_acc": 0.815136660724896, "train_speed(iter/s)": 0.060695 }, { "epoch": 0.17267267267267267, "grad_norm": 3.2497100830078125, "learning_rate": 9.60014697795588e-06, "loss": 0.5622821807861328, "memory(GiB)": 77.4, "step": 115, "token_acc": 0.8124953734547339, "train_speed(iter/s)": 0.06081 }, { "epoch": 0.18018018018018017, "grad_norm": 3.766254425048828, "learning_rate": 9.550035369739416e-06, "loss": 0.5542448043823243, "memory(GiB)": 77.4, "step": 120, "token_acc": 0.8170215979459297, "train_speed(iter/s)": 0.060974 }, { "epoch": 0.18768768768768768, "grad_norm": 6.819382190704346, "learning_rate": 9.49711316817988e-06, "loss": 0.5530866622924805, "memory(GiB)": 77.4, "step": 125, "token_acc": 0.8172684695573349, "train_speed(iter/s)": 0.061077 }, { "epoch": 0.19519519519519518, "grad_norm": 4.784273147583008, "learning_rate": 9.44141306374566e-06, "loss": 0.5516416072845459, "memory(GiB)": 77.4, "step": 130, "token_acc": 0.8142543693727775, "train_speed(iter/s)": 0.0612 }, { "epoch": 0.20270270270270271, "grad_norm": 4.253176689147949, "learning_rate": 9.382969462838023e-06, "loss": 0.5411262512207031, "memory(GiB)": 77.4, "step": 135, "token_acc": 0.814960036193636, "train_speed(iter/s)": 0.061361 }, { "epoch": 0.21021021021021022, "grad_norm": 7.392826557159424, "learning_rate": 9.32181846653802e-06, "loss": 0.5480520725250244, "memory(GiB)": 77.4, "step": 140, "token_acc": 0.8117261471230881, "train_speed(iter/s)": 0.06147 }, { "epoch": 0.21771771771771772, "grad_norm": 5.3913750648498535, "learning_rate": 9.257997848306548e-06, "loss": 0.5492410659790039, "memory(GiB)": 77.4, "step": 145, "token_acc": 0.8235338860576558, "train_speed(iter/s)": 0.061595 }, { "epoch": 0.22522522522522523, "grad_norm": 5.326717853546143, "learning_rate": 9.191547030651383e-06, "loss": 0.537621259689331, "memory(GiB)": 77.4, "step": 150, "token_acc": 0.8174485718514134, "train_speed(iter/s)": 0.061668 }, { "epoch": 0.23273273273273273, "grad_norm": 5.77912712097168, "learning_rate": 9.122507060775587e-06, "loss": 0.5331393241882324, "memory(GiB)": 77.4, "step": 155, "token_acc": 0.8221793062467504, "train_speed(iter/s)": 0.061777 }, { "epoch": 0.24024024024024024, "grad_norm": 3.0749833583831787, "learning_rate": 9.050920585222309e-06, "loss": 0.5423390388488769, "memory(GiB)": 77.4, "step": 160, "token_acc": 0.8245859403754141, "train_speed(iter/s)": 0.061826 }, { "epoch": 0.24774774774774774, "grad_norm": 3.131141185760498, "learning_rate": 8.976831823531683e-06, "loss": 0.5409300804138184, "memory(GiB)": 77.4, "step": 165, "token_acc": 0.8230075410320864, "train_speed(iter/s)": 0.061937 }, { "epoch": 0.2552552552552553, "grad_norm": 8.984918594360352, "learning_rate": 8.900286540926062e-06, "loss": 0.5327372550964355, "memory(GiB)": 77.4, "step": 170, "token_acc": 0.8271838775664823, "train_speed(iter/s)": 0.062016 }, { "epoch": 0.2627627627627628, "grad_norm": 2.6073122024536133, "learning_rate": 8.82133202004047e-06, "loss": 0.5299601078033447, "memory(GiB)": 77.4, "step": 175, "token_acc": 0.8204494382022471, "train_speed(iter/s)": 0.062103 }, { "epoch": 0.2702702702702703, "grad_norm": 4.3983154296875, "learning_rate": 8.74001703171574e-06, "loss": 0.5317525863647461, "memory(GiB)": 77.4, "step": 180, "token_acc": 0.8145350154868928, "train_speed(iter/s)": 0.062171 }, { "epoch": 0.2777777777777778, "grad_norm": 2.6061856746673584, "learning_rate": 8.656391804872376e-06, "loss": 0.5338263511657715, "memory(GiB)": 77.4, "step": 185, "token_acc": 0.8210867324399941, "train_speed(iter/s)": 0.062229 }, { "epoch": 0.2852852852852853, "grad_norm": 3.8644051551818848, "learning_rate": 8.57050799548375e-06, "loss": 0.5335628509521484, "memory(GiB)": 77.4, "step": 190, "token_acc": 0.8225940161278167, "train_speed(iter/s)": 0.062293 }, { "epoch": 0.2927927927927928, "grad_norm": 3.1008355617523193, "learning_rate": 8.482418654667777e-06, "loss": 0.5237902641296387, "memory(GiB)": 77.4, "step": 195, "token_acc": 0.8233494964565461, "train_speed(iter/s)": 0.062328 }, { "epoch": 0.3003003003003003, "grad_norm": 4.561453342437744, "learning_rate": 8.392178195916832e-06, "loss": 0.5280370712280273, "memory(GiB)": 77.4, "step": 200, "token_acc": 0.828030888030888, "train_speed(iter/s)": 0.062359 }, { "epoch": 0.3003003003003003, "eval_loss": 4.170934200286865, "eval_runtime": 49.0517, "eval_samples_per_second": 17.533, "eval_steps_per_second": 2.202, "eval_token_acc": 0.8229910330848249, "step": 200 }, { "epoch": 0.3078078078078078, "grad_norm": 4.3819122314453125, "learning_rate": 8.299842361486094e-06, "loss": 0.520693588256836, "memory(GiB)": 77.4, "step": 205, "token_acc": 0.8251148084017165, "train_speed(iter/s)": 0.061487 }, { "epoch": 0.3153153153153153, "grad_norm": 3.4289472103118896, "learning_rate": 8.2054681879611e-06, "loss": 0.524658489227295, "memory(GiB)": 77.4, "step": 210, "token_acc": 0.8237686398553999, "train_speed(iter/s)": 0.061563 }, { "epoch": 0.3228228228228228, "grad_norm": 2.9290993213653564, "learning_rate": 8.109113971025803e-06, "loss": 0.5151615142822266, "memory(GiB)": 77.4, "step": 215, "token_acc": 0.8236711338653817, "train_speed(iter/s)": 0.061657 }, { "epoch": 0.3303303303303303, "grad_norm": 4.519934177398682, "learning_rate": 8.010839229452843e-06, "loss": 0.5212090492248536, "memory(GiB)": 77.4, "step": 220, "token_acc": 0.8203009737385659, "train_speed(iter/s)": 0.061706 }, { "epoch": 0.33783783783783783, "grad_norm": 2.7104790210723877, "learning_rate": 7.910704668338338e-06, "loss": 0.5250448226928711, "memory(GiB)": 77.4, "step": 225, "token_acc": 0.823299804834109, "train_speed(iter/s)": 0.061773 }, { "epoch": 0.34534534534534533, "grad_norm": 2.8757286071777344, "learning_rate": 7.808772141603855e-06, "loss": 0.5199666976928711, "memory(GiB)": 77.4, "step": 230, "token_acc": 0.8213438148917235, "train_speed(iter/s)": 0.061801 }, { "epoch": 0.35285285285285284, "grad_norm": 4.525195598602295, "learning_rate": 7.705104613788743e-06, "loss": 0.5214581489562988, "memory(GiB)": 77.4, "step": 235, "token_acc": 0.8286637121610456, "train_speed(iter/s)": 0.061859 }, { "epoch": 0.36036036036036034, "grad_norm": 22.122285842895508, "learning_rate": 7.599766121156436e-06, "loss": 0.5185123443603515, "memory(GiB)": 77.4, "step": 240, "token_acc": 0.8281073764944733, "train_speed(iter/s)": 0.061884 }, { "epoch": 0.36786786786786785, "grad_norm": 3.6401257514953613, "learning_rate": 7.492821732138737e-06, "loss": 0.5193865776062012, "memory(GiB)": 77.4, "step": 245, "token_acc": 0.8321651683681244, "train_speed(iter/s)": 0.061892 }, { "epoch": 0.37537537537537535, "grad_norm": 3.1468453407287598, "learning_rate": 7.3843375071425315e-06, "loss": 0.5244226455688477, "memory(GiB)": 77.4, "step": 250, "token_acc": 0.8159338649498866, "train_speed(iter/s)": 0.061943 }, { "epoch": 0.38288288288288286, "grad_norm": 2.953416585922241, "learning_rate": 7.274380457743731e-06, "loss": 0.5164532661437988, "memory(GiB)": 77.4, "step": 255, "token_acc": 0.8161441656117358, "train_speed(iter/s)": 0.062 }, { "epoch": 0.39039039039039036, "grad_norm": 3.3566670417785645, "learning_rate": 7.163018505293703e-06, "loss": 0.5199567317962647, "memory(GiB)": 77.4, "step": 260, "token_acc": 0.8266276517922458, "train_speed(iter/s)": 0.062058 }, { "epoch": 0.3978978978978979, "grad_norm": 5.95110559463501, "learning_rate": 7.050320438963691e-06, "loss": 0.5201972961425781, "memory(GiB)": 77.4, "step": 265, "token_acc": 0.8189985272459499, "train_speed(iter/s)": 0.0621 }, { "epoch": 0.40540540540540543, "grad_norm": 18.081615447998047, "learning_rate": 6.936355873253207e-06, "loss": 0.5159478187561035, "memory(GiB)": 77.4, "step": 270, "token_acc": 0.8244876108901805, "train_speed(iter/s)": 0.062133 }, { "epoch": 0.41291291291291293, "grad_norm": 6.3731184005737305, "learning_rate": 6.821195204988578e-06, "loss": 0.5061209201812744, "memory(GiB)": 77.4, "step": 275, "token_acc": 0.8274302361238883, "train_speed(iter/s)": 0.062173 }, { "epoch": 0.42042042042042044, "grad_norm": 5.5361008644104, "learning_rate": 6.704909569838281e-06, "loss": 0.5148390769958496, "memory(GiB)": 77.4, "step": 280, "token_acc": 0.8375707280271596, "train_speed(iter/s)": 0.062201 }, { "epoch": 0.42792792792792794, "grad_norm": 8.514371871948242, "learning_rate": 6.58757079837186e-06, "loss": 0.501787519454956, "memory(GiB)": 77.4, "step": 285, "token_acc": 0.8299818566676747, "train_speed(iter/s)": 0.062228 }, { "epoch": 0.43543543543543545, "grad_norm": 2.4984076023101807, "learning_rate": 6.469251371689606e-06, "loss": 0.5198217868804932, "memory(GiB)": 77.4, "step": 290, "token_acc": 0.8295916829893024, "train_speed(iter/s)": 0.062245 }, { "epoch": 0.44294294294294295, "grad_norm": 4.352683067321777, "learning_rate": 6.350024376650403e-06, "loss": 0.503413200378418, "memory(GiB)": 77.4, "step": 295, "token_acc": 0.8299345323199638, "train_speed(iter/s)": 0.062291 }, { "epoch": 0.45045045045045046, "grad_norm": 4.458861351013184, "learning_rate": 6.22996346072539e-06, "loss": 0.5108905792236328, "memory(GiB)": 77.4, "step": 300, "token_acc": 0.8341271022473582, "train_speed(iter/s)": 0.062322 }, { "epoch": 0.45045045045045046, "eval_loss": 4.029459476470947, "eval_runtime": 48.9329, "eval_samples_per_second": 17.575, "eval_steps_per_second": 2.207, "eval_token_acc": 0.8278627134366303, "step": 300 }, { "epoch": 0.45795795795795796, "grad_norm": 2.8650991916656494, "learning_rate": 6.109142786505327e-06, "loss": 0.5027182102203369, "memory(GiB)": 77.4, "step": 305, "token_acc": 0.8356470769705383, "train_speed(iter/s)": 0.061736 }, { "epoch": 0.46546546546546547, "grad_norm": 4.33251953125, "learning_rate": 5.987636985889764e-06, "loss": 0.5072842121124268, "memory(GiB)": 77.4, "step": 310, "token_acc": 0.8324474058546081, "train_speed(iter/s)": 0.061753 }, { "epoch": 0.47297297297297297, "grad_norm": 3.968172073364258, "learning_rate": 5.865521113986322e-06, "loss": 0.506615161895752, "memory(GiB)": 77.4, "step": 315, "token_acc": 0.8249737197777444, "train_speed(iter/s)": 0.061785 }, { "epoch": 0.4804804804804805, "grad_norm": 3.18407940864563, "learning_rate": 5.742870602748547e-06, "loss": 0.5017033576965332, "memory(GiB)": 77.4, "step": 320, "token_acc": 0.8319403659362999, "train_speed(iter/s)": 0.061808 }, { "epoch": 0.487987987987988, "grad_norm": 3.2644057273864746, "learning_rate": 5.619761214380998e-06, "loss": 0.4994755744934082, "memory(GiB)": 77.4, "step": 325, "token_acc": 0.8307278031266363, "train_speed(iter/s)": 0.061844 }, { "epoch": 0.4954954954954955, "grad_norm": 3.396632671356201, "learning_rate": 5.496268994540309e-06, "loss": 0.5043362617492676, "memory(GiB)": 77.4, "step": 330, "token_acc": 0.8266277066007343, "train_speed(iter/s)": 0.061862 }, { "epoch": 0.503003003003003, "grad_norm": 2.9528186321258545, "learning_rate": 5.372470225361189e-06, "loss": 0.5022759437561035, "memory(GiB)": 77.4, "step": 335, "token_acc": 0.828901303538175, "train_speed(iter/s)": 0.061871 }, { "epoch": 0.5105105105105106, "grad_norm": 3.661381959915161, "learning_rate": 5.2484413783363335e-06, "loss": 0.49889430999755857, "memory(GiB)": 77.4, "step": 340, "token_acc": 0.835191142365527, "train_speed(iter/s)": 0.061903 }, { "epoch": 0.5180180180180181, "grad_norm": 2.4464967250823975, "learning_rate": 5.124259067079365e-06, "loss": 0.5070960044860839, "memory(GiB)": 77.4, "step": 345, "token_acc": 0.8305346884666372, "train_speed(iter/s)": 0.061921 }, { "epoch": 0.5255255255255256, "grad_norm": 5.827206611633301, "learning_rate": 5e-06, "loss": 0.49094176292419434, "memory(GiB)": 77.4, "step": 350, "token_acc": 0.8313569498649054, "train_speed(iter/s)": 0.061945 }, { "epoch": 0.5330330330330331, "grad_norm": 2.506747245788574, "learning_rate": 4.875740932920635e-06, "loss": 0.4921010971069336, "memory(GiB)": 77.4, "step": 355, "token_acc": 0.833807882511614, "train_speed(iter/s)": 0.061968 }, { "epoch": 0.5405405405405406, "grad_norm": 2.513331651687622, "learning_rate": 4.751558621663668e-06, "loss": 0.49839167594909667, "memory(GiB)": 77.4, "step": 360, "token_acc": 0.8323643410852714, "train_speed(iter/s)": 0.061988 }, { "epoch": 0.5480480480480481, "grad_norm": 2.4384679794311523, "learning_rate": 4.627529774638812e-06, "loss": 0.4972184181213379, "memory(GiB)": 77.4, "step": 365, "token_acc": 0.8275731679649097, "train_speed(iter/s)": 0.062021 }, { "epoch": 0.5555555555555556, "grad_norm": 4.762032508850098, "learning_rate": 4.5037310054596936e-06, "loss": 0.496537971496582, "memory(GiB)": 77.4, "step": 370, "token_acc": 0.8353353726777587, "train_speed(iter/s)": 0.062035 }, { "epoch": 0.5630630630630631, "grad_norm": 7.371038913726807, "learning_rate": 4.380238785619003e-06, "loss": 0.4977581024169922, "memory(GiB)": 77.4, "step": 375, "token_acc": 0.8365855496119943, "train_speed(iter/s)": 0.062052 }, { "epoch": 0.5705705705705706, "grad_norm": 3.048470973968506, "learning_rate": 4.257129397251453e-06, "loss": 0.4944156646728516, "memory(GiB)": 77.4, "step": 380, "token_acc": 0.8276846772375441, "train_speed(iter/s)": 0.062064 }, { "epoch": 0.5780780780780781, "grad_norm": 3.883787155151367, "learning_rate": 4.13447888601368e-06, "loss": 0.49016704559326174, "memory(GiB)": 77.4, "step": 385, "token_acc": 0.8302655401327701, "train_speed(iter/s)": 0.062079 }, { "epoch": 0.5855855855855856, "grad_norm": 6.71173095703125, "learning_rate": 4.012363014110237e-06, "loss": 0.49213333129882814, "memory(GiB)": 77.4, "step": 390, "token_acc": 0.8332108743570904, "train_speed(iter/s)": 0.062097 }, { "epoch": 0.5930930930930931, "grad_norm": 6.448680400848389, "learning_rate": 3.890857213494673e-06, "loss": 0.493864631652832, "memory(GiB)": 77.4, "step": 395, "token_acc": 0.829066045970323, "train_speed(iter/s)": 0.062126 }, { "epoch": 0.6006006006006006, "grad_norm": 4.552060604095459, "learning_rate": 3.7700365392746106e-06, "loss": 0.48853540420532227, "memory(GiB)": 77.4, "step": 400, "token_acc": 0.8339560603796145, "train_speed(iter/s)": 0.062161 }, { "epoch": 0.6006006006006006, "eval_loss": 3.8978097438812256, "eval_runtime": 48.9784, "eval_samples_per_second": 17.559, "eval_steps_per_second": 2.205, "eval_token_acc": 0.8327343937884357, "step": 400 }, { "epoch": 0.6081081081081081, "grad_norm": 2.544924736022949, "learning_rate": 3.649975623349599e-06, "loss": 0.49090261459350587, "memory(GiB)": 77.56, "step": 405, "token_acc": 0.8355947535052013, "train_speed(iter/s)": 0.061731 }, { "epoch": 0.6156156156156156, "grad_norm": 6.126231670379639, "learning_rate": 3.5307486283103966e-06, "loss": 0.49183125495910646, "memory(GiB)": 77.56, "step": 410, "token_acc": 0.831595529217613, "train_speed(iter/s)": 0.06175 }, { "epoch": 0.6231231231231231, "grad_norm": 2.7342617511749268, "learning_rate": 3.412429201628142e-06, "loss": 0.4908740520477295, "memory(GiB)": 77.56, "step": 415, "token_acc": 0.8315980081484835, "train_speed(iter/s)": 0.061767 }, { "epoch": 0.6306306306306306, "grad_norm": 2.9041008949279785, "learning_rate": 3.29509043016172e-06, "loss": 0.4888955593109131, "memory(GiB)": 77.56, "step": 420, "token_acc": 0.8382599420996496, "train_speed(iter/s)": 0.061791 }, { "epoch": 0.6381381381381381, "grad_norm": 2.52190899848938, "learning_rate": 3.1788047950114244e-06, "loss": 0.48609514236450196, "memory(GiB)": 77.56, "step": 425, "token_acc": 0.8281978055012776, "train_speed(iter/s)": 0.061822 }, { "epoch": 0.6456456456456456, "grad_norm": 2.1861705780029297, "learning_rate": 3.0636441267467955e-06, "loss": 0.48923444747924805, "memory(GiB)": 77.56, "step": 430, "token_acc": 0.8383622335956, "train_speed(iter/s)": 0.061843 }, { "epoch": 0.6531531531531531, "grad_norm": 4.289586544036865, "learning_rate": 2.9496795610363087e-06, "loss": 0.4919395923614502, "memory(GiB)": 77.56, "step": 435, "token_acc": 0.8347007903650734, "train_speed(iter/s)": 0.061873 }, { "epoch": 0.6606606606606606, "grad_norm": 2.4423766136169434, "learning_rate": 2.8369814947062994e-06, "loss": 0.48633642196655275, "memory(GiB)": 77.56, "step": 440, "token_acc": 0.8311563010241948, "train_speed(iter/s)": 0.06188 }, { "epoch": 0.6681681681681682, "grad_norm": 2.4229958057403564, "learning_rate": 2.7256195422562687e-06, "loss": 0.48715896606445314, "memory(GiB)": 77.56, "step": 445, "token_acc": 0.8328196647372773, "train_speed(iter/s)": 0.06191 }, { "epoch": 0.6756756756756757, "grad_norm": 2.8222622871398926, "learning_rate": 2.615662492857471e-06, "loss": 0.4761828422546387, "memory(GiB)": 77.56, "step": 450, "token_acc": 0.8371917082765925, "train_speed(iter/s)": 0.061929 }, { "epoch": 0.6831831831831832, "grad_norm": 2.2745773792266846, "learning_rate": 2.5071782678612635e-06, "loss": 0.47379336357116697, "memory(GiB)": 77.56, "step": 455, "token_acc": 0.8373471378197914, "train_speed(iter/s)": 0.061944 }, { "epoch": 0.6906906906906907, "grad_norm": 3.0483222007751465, "learning_rate": 2.4002338788435654e-06, "loss": 0.4889671325683594, "memory(GiB)": 77.56, "step": 460, "token_acc": 0.8362183754993342, "train_speed(iter/s)": 0.061958 }, { "epoch": 0.6981981981981982, "grad_norm": 4.672157287597656, "learning_rate": 2.2948953862112596e-06, "loss": 0.4797488212585449, "memory(GiB)": 77.56, "step": 465, "token_acc": 0.8255822661574178, "train_speed(iter/s)": 0.061982 }, { "epoch": 0.7057057057057057, "grad_norm": 3.9066431522369385, "learning_rate": 2.1912278583961454e-06, "loss": 0.47504510879516604, "memory(GiB)": 77.56, "step": 470, "token_acc": 0.8383151135234309, "train_speed(iter/s)": 0.062009 }, { "epoch": 0.7132132132132132, "grad_norm": 3.3316314220428467, "learning_rate": 2.0892953316616616e-06, "loss": 0.4852120399475098, "memory(GiB)": 77.56, "step": 475, "token_acc": 0.8355624907118443, "train_speed(iter/s)": 0.062037 }, { "epoch": 0.7207207207207207, "grad_norm": 3.6110689640045166, "learning_rate": 1.989160770547159e-06, "loss": 0.48483953475952146, "memory(GiB)": 77.56, "step": 480, "token_acc": 0.8303629882883539, "train_speed(iter/s)": 0.06206 }, { "epoch": 0.7282282282282282, "grad_norm": 3.1614766120910645, "learning_rate": 1.8908860289741981e-06, "loss": 0.48744707107543944, "memory(GiB)": 77.56, "step": 485, "token_acc": 0.8345915240301965, "train_speed(iter/s)": 0.062073 }, { "epoch": 0.7357357357357357, "grad_norm": 2.3462319374084473, "learning_rate": 1.794531812038901e-06, "loss": 0.4857791423797607, "memory(GiB)": 77.56, "step": 490, "token_acc": 0.8333702146492586, "train_speed(iter/s)": 0.062097 }, { "epoch": 0.7432432432432432, "grad_norm": 3.077697992324829, "learning_rate": 1.7001576385139062e-06, "loss": 0.48043317794799806, "memory(GiB)": 77.56, "step": 495, "token_acc": 0.8398653702318624, "train_speed(iter/s)": 0.062109 }, { "epoch": 0.7507507507507507, "grad_norm": 4.029548168182373, "learning_rate": 1.6078218040831678e-06, "loss": 0.48291807174682616, "memory(GiB)": 77.56, "step": 500, "token_acc": 0.830759284534157, "train_speed(iter/s)": 0.062136 }, { "epoch": 0.7507507507507507, "eval_loss": 3.8144214153289795, "eval_runtime": 49.1053, "eval_samples_per_second": 17.513, "eval_steps_per_second": 2.199, "eval_token_acc": 0.835421032741265, "step": 500 }, { "epoch": 0.7582582582582582, "grad_norm": 3.489283561706543, "learning_rate": 1.5175813453322252e-06, "loss": 0.48162593841552737, "memory(GiB)": 77.56, "step": 505, "token_acc": 0.8355083683550837, "train_speed(iter/s)": 0.061761 }, { "epoch": 0.7657657657657657, "grad_norm": 2.7831027507781982, "learning_rate": 1.4294920045162514e-06, "loss": 0.47315006256103515, "memory(GiB)": 77.56, "step": 510, "token_acc": 0.8345864661654135, "train_speed(iter/s)": 0.061779 }, { "epoch": 0.7732732732732732, "grad_norm": 3.9989585876464844, "learning_rate": 1.3436081951276247e-06, "loss": 0.47411699295043946, "memory(GiB)": 77.56, "step": 515, "token_acc": 0.8337380482622553, "train_speed(iter/s)": 0.061792 }, { "epoch": 0.7807807807807807, "grad_norm": 2.496121883392334, "learning_rate": 1.2599829682842618e-06, "loss": 0.47501659393310547, "memory(GiB)": 77.56, "step": 520, "token_acc": 0.8372924268374413, "train_speed(iter/s)": 0.0618 }, { "epoch": 0.7882882882882883, "grad_norm": 2.376746416091919, "learning_rate": 1.1786679799595308e-06, "loss": 0.47827887535095215, "memory(GiB)": 77.56, "step": 525, "token_acc": 0.8420151808304808, "train_speed(iter/s)": 0.061823 }, { "epoch": 0.7957957957957958, "grad_norm": 6.058043956756592, "learning_rate": 1.09971345907394e-06, "loss": 0.48307170867919924, "memory(GiB)": 77.56, "step": 530, "token_acc": 0.8403267731154845, "train_speed(iter/s)": 0.06183 }, { "epoch": 0.8033033033033034, "grad_norm": 3.4789669513702393, "learning_rate": 1.0231681764683188e-06, "loss": 0.47766728401184083, "memory(GiB)": 77.56, "step": 535, "token_acc": 0.8329650905877154, "train_speed(iter/s)": 0.061854 }, { "epoch": 0.8108108108108109, "grad_norm": 2.2162930965423584, "learning_rate": 9.490794147776927e-07, "loss": 0.4749112606048584, "memory(GiB)": 77.56, "step": 540, "token_acc": 0.8364345738295318, "train_speed(iter/s)": 0.061873 }, { "epoch": 0.8183183183183184, "grad_norm": 2.1486546993255615, "learning_rate": 8.774929392244158e-07, "loss": 0.48050594329833984, "memory(GiB)": 77.56, "step": 545, "token_acc": 0.8377268314539617, "train_speed(iter/s)": 0.061873 }, { "epoch": 0.8258258258258259, "grad_norm": 20.101654052734375, "learning_rate": 8.084529693486171e-07, "loss": 0.4786341667175293, "memory(GiB)": 77.56, "step": 550, "token_acc": 0.8336489659874252, "train_speed(iter/s)": 0.061888 }, { "epoch": 0.8333333333333334, "grad_norm": 2.2926764488220215, "learning_rate": 7.420021516934539e-07, "loss": 0.4800395488739014, "memory(GiB)": 77.56, "step": 555, "token_acc": 0.8331179198661902, "train_speed(iter/s)": 0.0619 }, { "epoch": 0.8408408408408409, "grad_norm": 3.684734582901001, "learning_rate": 6.781815334619812e-07, "loss": 0.471435022354126, "memory(GiB)": 77.56, "step": 560, "token_acc": 0.8375401560348784, "train_speed(iter/s)": 0.061915 }, { "epoch": 0.8483483483483484, "grad_norm": 1.881805419921875, "learning_rate": 6.170305371619773e-07, "loss": 0.4786642074584961, "memory(GiB)": 77.56, "step": 565, "token_acc": 0.8357564743298501, "train_speed(iter/s)": 0.061924 }, { "epoch": 0.8558558558558559, "grad_norm": 2.26941180229187, "learning_rate": 5.585869362543416e-07, "loss": 0.47544078826904296, "memory(GiB)": 77.56, "step": 570, "token_acc": 0.8454873092554251, "train_speed(iter/s)": 0.061943 }, { "epoch": 0.8633633633633634, "grad_norm": 2.576007604598999, "learning_rate": 5.028868318201191e-07, "loss": 0.47312221527099607, "memory(GiB)": 77.56, "step": 575, "token_acc": 0.8362756052141527, "train_speed(iter/s)": 0.061967 }, { "epoch": 0.8708708708708709, "grad_norm": 2.3246395587921143, "learning_rate": 4.4996463026058476e-07, "loss": 0.479257869720459, "memory(GiB)": 77.56, "step": 580, "token_acc": 0.8453853838469223, "train_speed(iter/s)": 0.061969 }, { "epoch": 0.8783783783783784, "grad_norm": 2.4527480602264404, "learning_rate": 3.9985302204412266e-07, "loss": 0.47733464241027834, "memory(GiB)": 77.56, "step": 585, "token_acc": 0.8370748040483982, "train_speed(iter/s)": 0.061978 }, { "epoch": 0.8858858858858859, "grad_norm": 3.797691583633423, "learning_rate": 3.5258296151306495e-07, "loss": 0.4716297149658203, "memory(GiB)": 77.56, "step": 590, "token_acc": 0.8305998651988318, "train_speed(iter/s)": 0.061995 }, { "epoch": 0.8933933933933934, "grad_norm": 2.739664316177368, "learning_rate": 3.081836477629491e-07, "loss": 0.480192232131958, "memory(GiB)": 77.56, "step": 595, "token_acc": 0.8329091181314568, "train_speed(iter/s)": 0.062011 }, { "epoch": 0.9009009009009009, "grad_norm": 3.405724048614502, "learning_rate": 2.666825066059986e-07, "loss": 0.474263858795166, "memory(GiB)": 77.56, "step": 600, "token_acc": 0.8360385967536839, "train_speed(iter/s)": 0.062028 }, { "epoch": 0.9009009009009009, "eval_loss": 3.765347480773926, "eval_runtime": 49.0338, "eval_samples_per_second": 17.539, "eval_steps_per_second": 2.203, "eval_token_acc": 0.837152574981963, "step": 600 }, { "epoch": 0.9084084084084084, "grad_norm": 2.454803943634033, "learning_rate": 2.2810517362997997e-07, "loss": 0.47376174926757814, "memory(GiB)": 77.56, "step": 605, "token_acc": 0.8413309189678587, "train_speed(iter/s)": 0.061734 }, { "epoch": 0.9159159159159159, "grad_norm": 2.480395793914795, "learning_rate": 1.9247547836289792e-07, "loss": 0.4828979015350342, "memory(GiB)": 77.56, "step": 610, "token_acc": 0.8343275692818848, "train_speed(iter/s)": 0.061749 }, { "epoch": 0.9234234234234234, "grad_norm": 2.6262176036834717, "learning_rate": 1.598154295532983e-07, "loss": 0.48357315063476564, "memory(GiB)": 77.56, "step": 615, "token_acc": 0.8393310437509333, "train_speed(iter/s)": 0.061749 }, { "epoch": 0.9309309309309309, "grad_norm": 3.147251844406128, "learning_rate": 1.3014520157529244e-07, "loss": 0.47496805191040037, "memory(GiB)": 77.56, "step": 620, "token_acc": 0.8347965493549125, "train_speed(iter/s)": 0.061763 }, { "epoch": 0.9384384384384384, "grad_norm": 2.492722511291504, "learning_rate": 1.034831219666832e-07, "loss": 0.48235254287719725, "memory(GiB)": 77.56, "step": 625, "token_acc": 0.8455045594995855, "train_speed(iter/s)": 0.061787 }, { "epoch": 0.9459459459459459, "grad_norm": 2.683011054992676, "learning_rate": 7.984566010789673e-08, "loss": 0.4709568977355957, "memory(GiB)": 77.56, "step": 630, "token_acc": 0.8387436920991188, "train_speed(iter/s)": 0.061803 }, { "epoch": 0.9534534534534534, "grad_norm": 2.4714860916137695, "learning_rate": 5.9247417048717284e-08, "loss": 0.4673150062561035, "memory(GiB)": 77.56, "step": 635, "token_acc": 0.8387750506642648, "train_speed(iter/s)": 0.061826 }, { "epoch": 0.960960960960961, "grad_norm": 2.5630085468292236, "learning_rate": 4.170111648909736e-08, "loss": 0.48201580047607423, "memory(GiB)": 77.56, "step": 640, "token_acc": 0.8353464246017044, "train_speed(iter/s)": 0.061839 }, { "epoch": 0.9684684684684685, "grad_norm": 2.9935920238494873, "learning_rate": 2.721759691962922e-08, "loss": 0.47791686058044436, "memory(GiB)": 77.56, "step": 645, "token_acc": 0.836272040302267, "train_speed(iter/s)": 0.061862 }, { "epoch": 0.975975975975976, "grad_norm": 2.290571451187134, "learning_rate": 1.580580492652084e-08, "loss": 0.47302780151367185, "memory(GiB)": 77.56, "step": 650, "token_acc": 0.8399675060926076, "train_speed(iter/s)": 0.061876 }, { "epoch": 0.9834834834834835, "grad_norm": 7.702187538146973, "learning_rate": 7.472789665218805e-09, "loss": 0.4697415351867676, "memory(GiB)": 77.56, "step": 655, "token_acc": 0.8315651906519065, "train_speed(iter/s)": 0.0619 }, { "epoch": 0.990990990990991, "grad_norm": 3.5078024864196777, "learning_rate": 2.223698506088612e-09, "loss": 0.46914873123168943, "memory(GiB)": 77.56, "step": 660, "token_acc": 0.8398177880666119, "train_speed(iter/s)": 0.06191 }, { "epoch": 0.9984984984984985, "grad_norm": 2.397590160369873, "learning_rate": 6.177385484029685e-11, "loss": 0.4747368812561035, "memory(GiB)": 77.56, "step": 665, "token_acc": 0.8291888691533452, "train_speed(iter/s)": 0.061924 }, { "epoch": 1.0, "eval_loss": 3.761183738708496, "eval_runtime": 50.3959, "eval_samples_per_second": 17.065, "eval_steps_per_second": 2.143, "eval_token_acc": 0.8374686501528842, "step": 666 } ], "logging_steps": 5, "max_steps": 666, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.285336319720161e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }