| { |
| "best_global_step": 666, |
| "best_metric": 3.76118374, |
| "best_model_checkpoint": "/mnt/bn/wdq-base1/data/VLMs/vsa_rl/checkpoint/reasoning_sft_1009/v5-20251009-231445/checkpoint-666", |
| "epoch": 1.0, |
| "eval_steps": 100, |
| "global_step": 666, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0015015015015015015, |
| "grad_norm": 18.43874168395996, |
| "learning_rate": 2.9411764705882356e-07, |
| "loss": 1.8555517196655273, |
| "memory(GiB)": 45.75, |
| "step": 1, |
| "token_acc": 0.5909588042289464, |
| "train_speed(iter/s)": 0.01347 |
| }, |
| { |
| "epoch": 0.0075075075075075074, |
| "grad_norm": 20.15400505065918, |
| "learning_rate": 1.4705882352941177e-06, |
| "loss": 1.8491613864898682, |
| "memory(GiB)": 53.59, |
| "step": 5, |
| "token_acc": 0.6025925925925926, |
| "train_speed(iter/s)": 0.036205 |
| }, |
| { |
| "epoch": 0.015015015015015015, |
| "grad_norm": 10.93825912475586, |
| "learning_rate": 2.9411764705882355e-06, |
| "loss": 1.559732437133789, |
| "memory(GiB)": 53.59, |
| "step": 10, |
| "token_acc": 0.6266354798625832, |
| "train_speed(iter/s)": 0.046254 |
| }, |
| { |
| "epoch": 0.02252252252252252, |
| "grad_norm": 5.14630651473999, |
| "learning_rate": 4.411764705882353e-06, |
| "loss": 1.0728832244873048, |
| "memory(GiB)": 53.59, |
| "step": 15, |
| "token_acc": 0.7059398496240602, |
| "train_speed(iter/s)": 0.051056 |
| }, |
| { |
| "epoch": 0.03003003003003003, |
| "grad_norm": 10.39709186553955, |
| "learning_rate": 5.882352941176471e-06, |
| "loss": 0.8455103874206543, |
| "memory(GiB)": 53.59, |
| "step": 20, |
| "token_acc": 0.7537844383893431, |
| "train_speed(iter/s)": 0.05419 |
| }, |
| { |
| "epoch": 0.03753753753753754, |
| "grad_norm": 4.8650970458984375, |
| "learning_rate": 7.352941176470589e-06, |
| "loss": 0.7454294204711914, |
| "memory(GiB)": 53.59, |
| "step": 25, |
| "token_acc": 0.7660878447395302, |
| "train_speed(iter/s)": 0.055846 |
| }, |
| { |
| "epoch": 0.04504504504504504, |
| "grad_norm": 4.418037414550781, |
| "learning_rate": 8.823529411764707e-06, |
| "loss": 0.6922950744628906, |
| "memory(GiB)": 61.49, |
| "step": 30, |
| "token_acc": 0.7846728698073119, |
| "train_speed(iter/s)": 0.057253 |
| }, |
| { |
| "epoch": 0.052552552552552555, |
| "grad_norm": 7.153648376464844, |
| "learning_rate": 9.99993822614516e-06, |
| "loss": 0.6542837142944335, |
| "memory(GiB)": 61.49, |
| "step": 35, |
| "token_acc": 0.7906924460431655, |
| "train_speed(iter/s)": 0.058147 |
| }, |
| { |
| "epoch": 0.06006006006006006, |
| "grad_norm": 3.7986629009246826, |
| "learning_rate": 9.997776301493914e-06, |
| "loss": 0.6283172607421875, |
| "memory(GiB)": 61.49, |
| "step": 40, |
| "token_acc": 0.7965453877251011, |
| "train_speed(iter/s)": 0.058772 |
| }, |
| { |
| "epoch": 0.06756756756756757, |
| "grad_norm": 4.944998264312744, |
| "learning_rate": 9.992527210334781e-06, |
| "loss": 0.6150907516479492, |
| "memory(GiB)": 61.49, |
| "step": 45, |
| "token_acc": 0.7986261479877548, |
| "train_speed(iter/s)": 0.059369 |
| }, |
| { |
| "epoch": 0.07507507507507508, |
| "grad_norm": 12.669698715209961, |
| "learning_rate": 9.98419419507348e-06, |
| "loss": 0.5961947441101074, |
| "memory(GiB)": 69.42, |
| "step": 50, |
| "token_acc": 0.8023150932050511, |
| "train_speed(iter/s)": 0.059729 |
| }, |
| { |
| "epoch": 0.08258258258258258, |
| "grad_norm": 3.911559820175171, |
| "learning_rate": 9.972782403080372e-06, |
| "loss": 0.5979935169219971, |
| "memory(GiB)": 77.39, |
| "step": 55, |
| "token_acc": 0.8086269296107541, |
| "train_speed(iter/s)": 0.060077 |
| }, |
| { |
| "epoch": 0.09009009009009009, |
| "grad_norm": 7.1113362312316895, |
| "learning_rate": 9.958298883510904e-06, |
| "loss": 0.5927044868469238, |
| "memory(GiB)": 77.39, |
| "step": 60, |
| "token_acc": 0.8021331371827878, |
| "train_speed(iter/s)": 0.060454 |
| }, |
| { |
| "epoch": 0.09759759759759759, |
| "grad_norm": 3.9366421699523926, |
| "learning_rate": 9.940752582951283e-06, |
| "loss": 0.5776666164398193, |
| "memory(GiB)": 77.39, |
| "step": 65, |
| "token_acc": 0.8102722213866747, |
| "train_speed(iter/s)": 0.060781 |
| }, |
| { |
| "epoch": 0.10510510510510511, |
| "grad_norm": 3.8400216102600098, |
| "learning_rate": 9.920154339892104e-06, |
| "loss": 0.5775270462036133, |
| "memory(GiB)": 77.4, |
| "step": 70, |
| "token_acc": 0.8103343924950824, |
| "train_speed(iter/s)": 0.061034 |
| }, |
| { |
| "epoch": 0.11261261261261261, |
| "grad_norm": 3.48134183883667, |
| "learning_rate": 9.896516878033318e-06, |
| "loss": 0.5721072196960449, |
| "memory(GiB)": 77.4, |
| "step": 75, |
| "token_acc": 0.8125897215044502, |
| "train_speed(iter/s)": 0.061315 |
| }, |
| { |
| "epoch": 0.12012012012012012, |
| "grad_norm": 4.326797962188721, |
| "learning_rate": 9.869854798424709e-06, |
| "loss": 0.5744771957397461, |
| "memory(GiB)": 77.4, |
| "step": 80, |
| "token_acc": 0.8050841065097685, |
| "train_speed(iter/s)": 0.06143 |
| }, |
| { |
| "epoch": 0.12762762762762764, |
| "grad_norm": 4.147410869598389, |
| "learning_rate": 9.840184570446702e-06, |
| "loss": 0.5583375930786133, |
| "memory(GiB)": 77.4, |
| "step": 85, |
| "token_acc": 0.8205675923100396, |
| "train_speed(iter/s)": 0.061622 |
| }, |
| { |
| "epoch": 0.13513513513513514, |
| "grad_norm": 9.281377792358398, |
| "learning_rate": 9.807524521637103e-06, |
| "loss": 0.5598219871520996, |
| "memory(GiB)": 77.4, |
| "step": 90, |
| "token_acc": 0.815196639938908, |
| "train_speed(iter/s)": 0.061807 |
| }, |
| { |
| "epoch": 0.14264264264264265, |
| "grad_norm": 4.255290508270264, |
| "learning_rate": 9.771894826370021e-06, |
| "loss": 0.5530188083648682, |
| "memory(GiB)": 77.4, |
| "step": 95, |
| "token_acc": 0.8160852001468968, |
| "train_speed(iter/s)": 0.06204 |
| }, |
| { |
| "epoch": 0.15015015015015015, |
| "grad_norm": 3.306349515914917, |
| "learning_rate": 9.733317493394004e-06, |
| "loss": 0.5611482620239258, |
| "memory(GiB)": 77.4, |
| "step": 100, |
| "token_acc": 0.809205586424388, |
| "train_speed(iter/s)": 0.062203 |
| }, |
| { |
| "epoch": 0.15015015015015015, |
| "eval_loss": 4.395487308502197, |
| "eval_runtime": 49.3031, |
| "eval_samples_per_second": 17.443, |
| "eval_steps_per_second": 2.191, |
| "eval_token_acc": 0.8156251073624902, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.15765765765765766, |
| "grad_norm": 3.02348256111145, |
| "learning_rate": 9.691816352237052e-06, |
| "loss": 0.5578757286071777, |
| "memory(GiB)": 77.4, |
| "step": 105, |
| "token_acc": 0.8126542517388378, |
| "train_speed(iter/s)": 0.060579 |
| }, |
| { |
| "epoch": 0.16516516516516516, |
| "grad_norm": 3.450737237930298, |
| "learning_rate": 9.647417038486936e-06, |
| "loss": 0.5540652751922608, |
| "memory(GiB)": 77.4, |
| "step": 110, |
| "token_acc": 0.815136660724896, |
| "train_speed(iter/s)": 0.060695 |
| }, |
| { |
| "epoch": 0.17267267267267267, |
| "grad_norm": 3.2497100830078125, |
| "learning_rate": 9.60014697795588e-06, |
| "loss": 0.5622821807861328, |
| "memory(GiB)": 77.4, |
| "step": 115, |
| "token_acc": 0.8124953734547339, |
| "train_speed(iter/s)": 0.06081 |
| }, |
| { |
| "epoch": 0.18018018018018017, |
| "grad_norm": 3.766254425048828, |
| "learning_rate": 9.550035369739416e-06, |
| "loss": 0.5542448043823243, |
| "memory(GiB)": 77.4, |
| "step": 120, |
| "token_acc": 0.8170215979459297, |
| "train_speed(iter/s)": 0.060974 |
| }, |
| { |
| "epoch": 0.18768768768768768, |
| "grad_norm": 6.819382190704346, |
| "learning_rate": 9.49711316817988e-06, |
| "loss": 0.5530866622924805, |
| "memory(GiB)": 77.4, |
| "step": 125, |
| "token_acc": 0.8172684695573349, |
| "train_speed(iter/s)": 0.061077 |
| }, |
| { |
| "epoch": 0.19519519519519518, |
| "grad_norm": 4.784273147583008, |
| "learning_rate": 9.44141306374566e-06, |
| "loss": 0.5516416072845459, |
| "memory(GiB)": 77.4, |
| "step": 130, |
| "token_acc": 0.8142543693727775, |
| "train_speed(iter/s)": 0.0612 |
| }, |
| { |
| "epoch": 0.20270270270270271, |
| "grad_norm": 4.253176689147949, |
| "learning_rate": 9.382969462838023e-06, |
| "loss": 0.5411262512207031, |
| "memory(GiB)": 77.4, |
| "step": 135, |
| "token_acc": 0.814960036193636, |
| "train_speed(iter/s)": 0.061361 |
| }, |
| { |
| "epoch": 0.21021021021021022, |
| "grad_norm": 7.392826557159424, |
| "learning_rate": 9.32181846653802e-06, |
| "loss": 0.5480520725250244, |
| "memory(GiB)": 77.4, |
| "step": 140, |
| "token_acc": 0.8117261471230881, |
| "train_speed(iter/s)": 0.06147 |
| }, |
| { |
| "epoch": 0.21771771771771772, |
| "grad_norm": 5.3913750648498535, |
| "learning_rate": 9.257997848306548e-06, |
| "loss": 0.5492410659790039, |
| "memory(GiB)": 77.4, |
| "step": 145, |
| "token_acc": 0.8235338860576558, |
| "train_speed(iter/s)": 0.061595 |
| }, |
| { |
| "epoch": 0.22522522522522523, |
| "grad_norm": 5.326717853546143, |
| "learning_rate": 9.191547030651383e-06, |
| "loss": 0.537621259689331, |
| "memory(GiB)": 77.4, |
| "step": 150, |
| "token_acc": 0.8174485718514134, |
| "train_speed(iter/s)": 0.061668 |
| }, |
| { |
| "epoch": 0.23273273273273273, |
| "grad_norm": 5.77912712097168, |
| "learning_rate": 9.122507060775587e-06, |
| "loss": 0.5331393241882324, |
| "memory(GiB)": 77.4, |
| "step": 155, |
| "token_acc": 0.8221793062467504, |
| "train_speed(iter/s)": 0.061777 |
| }, |
| { |
| "epoch": 0.24024024024024024, |
| "grad_norm": 3.0749833583831787, |
| "learning_rate": 9.050920585222309e-06, |
| "loss": 0.5423390388488769, |
| "memory(GiB)": 77.4, |
| "step": 160, |
| "token_acc": 0.8245859403754141, |
| "train_speed(iter/s)": 0.061826 |
| }, |
| { |
| "epoch": 0.24774774774774774, |
| "grad_norm": 3.131141185760498, |
| "learning_rate": 8.976831823531683e-06, |
| "loss": 0.5409300804138184, |
| "memory(GiB)": 77.4, |
| "step": 165, |
| "token_acc": 0.8230075410320864, |
| "train_speed(iter/s)": 0.061937 |
| }, |
| { |
| "epoch": 0.2552552552552553, |
| "grad_norm": 8.984918594360352, |
| "learning_rate": 8.900286540926062e-06, |
| "loss": 0.5327372550964355, |
| "memory(GiB)": 77.4, |
| "step": 170, |
| "token_acc": 0.8271838775664823, |
| "train_speed(iter/s)": 0.062016 |
| }, |
| { |
| "epoch": 0.2627627627627628, |
| "grad_norm": 2.6073122024536133, |
| "learning_rate": 8.82133202004047e-06, |
| "loss": 0.5299601078033447, |
| "memory(GiB)": 77.4, |
| "step": 175, |
| "token_acc": 0.8204494382022471, |
| "train_speed(iter/s)": 0.062103 |
| }, |
| { |
| "epoch": 0.2702702702702703, |
| "grad_norm": 4.3983154296875, |
| "learning_rate": 8.74001703171574e-06, |
| "loss": 0.5317525863647461, |
| "memory(GiB)": 77.4, |
| "step": 180, |
| "token_acc": 0.8145350154868928, |
| "train_speed(iter/s)": 0.062171 |
| }, |
| { |
| "epoch": 0.2777777777777778, |
| "grad_norm": 2.6061856746673584, |
| "learning_rate": 8.656391804872376e-06, |
| "loss": 0.5338263511657715, |
| "memory(GiB)": 77.4, |
| "step": 185, |
| "token_acc": 0.8210867324399941, |
| "train_speed(iter/s)": 0.062229 |
| }, |
| { |
| "epoch": 0.2852852852852853, |
| "grad_norm": 3.8644051551818848, |
| "learning_rate": 8.57050799548375e-06, |
| "loss": 0.5335628509521484, |
| "memory(GiB)": 77.4, |
| "step": 190, |
| "token_acc": 0.8225940161278167, |
| "train_speed(iter/s)": 0.062293 |
| }, |
| { |
| "epoch": 0.2927927927927928, |
| "grad_norm": 3.1008355617523193, |
| "learning_rate": 8.482418654667777e-06, |
| "loss": 0.5237902641296387, |
| "memory(GiB)": 77.4, |
| "step": 195, |
| "token_acc": 0.8233494964565461, |
| "train_speed(iter/s)": 0.062328 |
| }, |
| { |
| "epoch": 0.3003003003003003, |
| "grad_norm": 4.561453342437744, |
| "learning_rate": 8.392178195916832e-06, |
| "loss": 0.5280370712280273, |
| "memory(GiB)": 77.4, |
| "step": 200, |
| "token_acc": 0.828030888030888, |
| "train_speed(iter/s)": 0.062359 |
| }, |
| { |
| "epoch": 0.3003003003003003, |
| "eval_loss": 4.170934200286865, |
| "eval_runtime": 49.0517, |
| "eval_samples_per_second": 17.533, |
| "eval_steps_per_second": 2.202, |
| "eval_token_acc": 0.8229910330848249, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.3078078078078078, |
| "grad_norm": 4.3819122314453125, |
| "learning_rate": 8.299842361486094e-06, |
| "loss": 0.520693588256836, |
| "memory(GiB)": 77.4, |
| "step": 205, |
| "token_acc": 0.8251148084017165, |
| "train_speed(iter/s)": 0.061487 |
| }, |
| { |
| "epoch": 0.3153153153153153, |
| "grad_norm": 3.4289472103118896, |
| "learning_rate": 8.2054681879611e-06, |
| "loss": 0.524658489227295, |
| "memory(GiB)": 77.4, |
| "step": 210, |
| "token_acc": 0.8237686398553999, |
| "train_speed(iter/s)": 0.061563 |
| }, |
| { |
| "epoch": 0.3228228228228228, |
| "grad_norm": 2.9290993213653564, |
| "learning_rate": 8.109113971025803e-06, |
| "loss": 0.5151615142822266, |
| "memory(GiB)": 77.4, |
| "step": 215, |
| "token_acc": 0.8236711338653817, |
| "train_speed(iter/s)": 0.061657 |
| }, |
| { |
| "epoch": 0.3303303303303303, |
| "grad_norm": 4.519934177398682, |
| "learning_rate": 8.010839229452843e-06, |
| "loss": 0.5212090492248536, |
| "memory(GiB)": 77.4, |
| "step": 220, |
| "token_acc": 0.8203009737385659, |
| "train_speed(iter/s)": 0.061706 |
| }, |
| { |
| "epoch": 0.33783783783783783, |
| "grad_norm": 2.7104790210723877, |
| "learning_rate": 7.910704668338338e-06, |
| "loss": 0.5250448226928711, |
| "memory(GiB)": 77.4, |
| "step": 225, |
| "token_acc": 0.823299804834109, |
| "train_speed(iter/s)": 0.061773 |
| }, |
| { |
| "epoch": 0.34534534534534533, |
| "grad_norm": 2.8757286071777344, |
| "learning_rate": 7.808772141603855e-06, |
| "loss": 0.5199666976928711, |
| "memory(GiB)": 77.4, |
| "step": 230, |
| "token_acc": 0.8213438148917235, |
| "train_speed(iter/s)": 0.061801 |
| }, |
| { |
| "epoch": 0.35285285285285284, |
| "grad_norm": 4.525195598602295, |
| "learning_rate": 7.705104613788743e-06, |
| "loss": 0.5214581489562988, |
| "memory(GiB)": 77.4, |
| "step": 235, |
| "token_acc": 0.8286637121610456, |
| "train_speed(iter/s)": 0.061859 |
| }, |
| { |
| "epoch": 0.36036036036036034, |
| "grad_norm": 22.122285842895508, |
| "learning_rate": 7.599766121156436e-06, |
| "loss": 0.5185123443603515, |
| "memory(GiB)": 77.4, |
| "step": 240, |
| "token_acc": 0.8281073764944733, |
| "train_speed(iter/s)": 0.061884 |
| }, |
| { |
| "epoch": 0.36786786786786785, |
| "grad_norm": 3.6401257514953613, |
| "learning_rate": 7.492821732138737e-06, |
| "loss": 0.5193865776062012, |
| "memory(GiB)": 77.4, |
| "step": 245, |
| "token_acc": 0.8321651683681244, |
| "train_speed(iter/s)": 0.061892 |
| }, |
| { |
| "epoch": 0.37537537537537535, |
| "grad_norm": 3.1468453407287598, |
| "learning_rate": 7.3843375071425315e-06, |
| "loss": 0.5244226455688477, |
| "memory(GiB)": 77.4, |
| "step": 250, |
| "token_acc": 0.8159338649498866, |
| "train_speed(iter/s)": 0.061943 |
| }, |
| { |
| "epoch": 0.38288288288288286, |
| "grad_norm": 2.953416585922241, |
| "learning_rate": 7.274380457743731e-06, |
| "loss": 0.5164532661437988, |
| "memory(GiB)": 77.4, |
| "step": 255, |
| "token_acc": 0.8161441656117358, |
| "train_speed(iter/s)": 0.062 |
| }, |
| { |
| "epoch": 0.39039039039039036, |
| "grad_norm": 3.3566670417785645, |
| "learning_rate": 7.163018505293703e-06, |
| "loss": 0.5199567317962647, |
| "memory(GiB)": 77.4, |
| "step": 260, |
| "token_acc": 0.8266276517922458, |
| "train_speed(iter/s)": 0.062058 |
| }, |
| { |
| "epoch": 0.3978978978978979, |
| "grad_norm": 5.95110559463501, |
| "learning_rate": 7.050320438963691e-06, |
| "loss": 0.5201972961425781, |
| "memory(GiB)": 77.4, |
| "step": 265, |
| "token_acc": 0.8189985272459499, |
| "train_speed(iter/s)": 0.0621 |
| }, |
| { |
| "epoch": 0.40540540540540543, |
| "grad_norm": 18.081615447998047, |
| "learning_rate": 6.936355873253207e-06, |
| "loss": 0.5159478187561035, |
| "memory(GiB)": 77.4, |
| "step": 270, |
| "token_acc": 0.8244876108901805, |
| "train_speed(iter/s)": 0.062133 |
| }, |
| { |
| "epoch": 0.41291291291291293, |
| "grad_norm": 6.3731184005737305, |
| "learning_rate": 6.821195204988578e-06, |
| "loss": 0.5061209201812744, |
| "memory(GiB)": 77.4, |
| "step": 275, |
| "token_acc": 0.8274302361238883, |
| "train_speed(iter/s)": 0.062173 |
| }, |
| { |
| "epoch": 0.42042042042042044, |
| "grad_norm": 5.5361008644104, |
| "learning_rate": 6.704909569838281e-06, |
| "loss": 0.5148390769958496, |
| "memory(GiB)": 77.4, |
| "step": 280, |
| "token_acc": 0.8375707280271596, |
| "train_speed(iter/s)": 0.062201 |
| }, |
| { |
| "epoch": 0.42792792792792794, |
| "grad_norm": 8.514371871948242, |
| "learning_rate": 6.58757079837186e-06, |
| "loss": 0.501787519454956, |
| "memory(GiB)": 77.4, |
| "step": 285, |
| "token_acc": 0.8299818566676747, |
| "train_speed(iter/s)": 0.062228 |
| }, |
| { |
| "epoch": 0.43543543543543545, |
| "grad_norm": 2.4984076023101807, |
| "learning_rate": 6.469251371689606e-06, |
| "loss": 0.5198217868804932, |
| "memory(GiB)": 77.4, |
| "step": 290, |
| "token_acc": 0.8295916829893024, |
| "train_speed(iter/s)": 0.062245 |
| }, |
| { |
| "epoch": 0.44294294294294295, |
| "grad_norm": 4.352683067321777, |
| "learning_rate": 6.350024376650403e-06, |
| "loss": 0.503413200378418, |
| "memory(GiB)": 77.4, |
| "step": 295, |
| "token_acc": 0.8299345323199638, |
| "train_speed(iter/s)": 0.062291 |
| }, |
| { |
| "epoch": 0.45045045045045046, |
| "grad_norm": 4.458861351013184, |
| "learning_rate": 6.22996346072539e-06, |
| "loss": 0.5108905792236328, |
| "memory(GiB)": 77.4, |
| "step": 300, |
| "token_acc": 0.8341271022473582, |
| "train_speed(iter/s)": 0.062322 |
| }, |
| { |
| "epoch": 0.45045045045045046, |
| "eval_loss": 4.029459476470947, |
| "eval_runtime": 48.9329, |
| "eval_samples_per_second": 17.575, |
| "eval_steps_per_second": 2.207, |
| "eval_token_acc": 0.8278627134366303, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.45795795795795796, |
| "grad_norm": 2.8650991916656494, |
| "learning_rate": 6.109142786505327e-06, |
| "loss": 0.5027182102203369, |
| "memory(GiB)": 77.4, |
| "step": 305, |
| "token_acc": 0.8356470769705383, |
| "train_speed(iter/s)": 0.061736 |
| }, |
| { |
| "epoch": 0.46546546546546547, |
| "grad_norm": 4.33251953125, |
| "learning_rate": 5.987636985889764e-06, |
| "loss": 0.5072842121124268, |
| "memory(GiB)": 77.4, |
| "step": 310, |
| "token_acc": 0.8324474058546081, |
| "train_speed(iter/s)": 0.061753 |
| }, |
| { |
| "epoch": 0.47297297297297297, |
| "grad_norm": 3.968172073364258, |
| "learning_rate": 5.865521113986322e-06, |
| "loss": 0.506615161895752, |
| "memory(GiB)": 77.4, |
| "step": 315, |
| "token_acc": 0.8249737197777444, |
| "train_speed(iter/s)": 0.061785 |
| }, |
| { |
| "epoch": 0.4804804804804805, |
| "grad_norm": 3.18407940864563, |
| "learning_rate": 5.742870602748547e-06, |
| "loss": 0.5017033576965332, |
| "memory(GiB)": 77.4, |
| "step": 320, |
| "token_acc": 0.8319403659362999, |
| "train_speed(iter/s)": 0.061808 |
| }, |
| { |
| "epoch": 0.487987987987988, |
| "grad_norm": 3.2644057273864746, |
| "learning_rate": 5.619761214380998e-06, |
| "loss": 0.4994755744934082, |
| "memory(GiB)": 77.4, |
| "step": 325, |
| "token_acc": 0.8307278031266363, |
| "train_speed(iter/s)": 0.061844 |
| }, |
| { |
| "epoch": 0.4954954954954955, |
| "grad_norm": 3.396632671356201, |
| "learning_rate": 5.496268994540309e-06, |
| "loss": 0.5043362617492676, |
| "memory(GiB)": 77.4, |
| "step": 330, |
| "token_acc": 0.8266277066007343, |
| "train_speed(iter/s)": 0.061862 |
| }, |
| { |
| "epoch": 0.503003003003003, |
| "grad_norm": 2.9528186321258545, |
| "learning_rate": 5.372470225361189e-06, |
| "loss": 0.5022759437561035, |
| "memory(GiB)": 77.4, |
| "step": 335, |
| "token_acc": 0.828901303538175, |
| "train_speed(iter/s)": 0.061871 |
| }, |
| { |
| "epoch": 0.5105105105105106, |
| "grad_norm": 3.661381959915161, |
| "learning_rate": 5.2484413783363335e-06, |
| "loss": 0.49889430999755857, |
| "memory(GiB)": 77.4, |
| "step": 340, |
| "token_acc": 0.835191142365527, |
| "train_speed(iter/s)": 0.061903 |
| }, |
| { |
| "epoch": 0.5180180180180181, |
| "grad_norm": 2.4464967250823975, |
| "learning_rate": 5.124259067079365e-06, |
| "loss": 0.5070960044860839, |
| "memory(GiB)": 77.4, |
| "step": 345, |
| "token_acc": 0.8305346884666372, |
| "train_speed(iter/s)": 0.061921 |
| }, |
| { |
| "epoch": 0.5255255255255256, |
| "grad_norm": 5.827206611633301, |
| "learning_rate": 5e-06, |
| "loss": 0.49094176292419434, |
| "memory(GiB)": 77.4, |
| "step": 350, |
| "token_acc": 0.8313569498649054, |
| "train_speed(iter/s)": 0.061945 |
| }, |
| { |
| "epoch": 0.5330330330330331, |
| "grad_norm": 2.506747245788574, |
| "learning_rate": 4.875740932920635e-06, |
| "loss": 0.4921010971069336, |
| "memory(GiB)": 77.4, |
| "step": 355, |
| "token_acc": 0.833807882511614, |
| "train_speed(iter/s)": 0.061968 |
| }, |
| { |
| "epoch": 0.5405405405405406, |
| "grad_norm": 2.513331651687622, |
| "learning_rate": 4.751558621663668e-06, |
| "loss": 0.49839167594909667, |
| "memory(GiB)": 77.4, |
| "step": 360, |
| "token_acc": 0.8323643410852714, |
| "train_speed(iter/s)": 0.061988 |
| }, |
| { |
| "epoch": 0.5480480480480481, |
| "grad_norm": 2.4384679794311523, |
| "learning_rate": 4.627529774638812e-06, |
| "loss": 0.4972184181213379, |
| "memory(GiB)": 77.4, |
| "step": 365, |
| "token_acc": 0.8275731679649097, |
| "train_speed(iter/s)": 0.062021 |
| }, |
| { |
| "epoch": 0.5555555555555556, |
| "grad_norm": 4.762032508850098, |
| "learning_rate": 4.5037310054596936e-06, |
| "loss": 0.496537971496582, |
| "memory(GiB)": 77.4, |
| "step": 370, |
| "token_acc": 0.8353353726777587, |
| "train_speed(iter/s)": 0.062035 |
| }, |
| { |
| "epoch": 0.5630630630630631, |
| "grad_norm": 7.371038913726807, |
| "learning_rate": 4.380238785619003e-06, |
| "loss": 0.4977581024169922, |
| "memory(GiB)": 77.4, |
| "step": 375, |
| "token_acc": 0.8365855496119943, |
| "train_speed(iter/s)": 0.062052 |
| }, |
| { |
| "epoch": 0.5705705705705706, |
| "grad_norm": 3.048470973968506, |
| "learning_rate": 4.257129397251453e-06, |
| "loss": 0.4944156646728516, |
| "memory(GiB)": 77.4, |
| "step": 380, |
| "token_acc": 0.8276846772375441, |
| "train_speed(iter/s)": 0.062064 |
| }, |
| { |
| "epoch": 0.5780780780780781, |
| "grad_norm": 3.883787155151367, |
| "learning_rate": 4.13447888601368e-06, |
| "loss": 0.49016704559326174, |
| "memory(GiB)": 77.4, |
| "step": 385, |
| "token_acc": 0.8302655401327701, |
| "train_speed(iter/s)": 0.062079 |
| }, |
| { |
| "epoch": 0.5855855855855856, |
| "grad_norm": 6.71173095703125, |
| "learning_rate": 4.012363014110237e-06, |
| "loss": 0.49213333129882814, |
| "memory(GiB)": 77.4, |
| "step": 390, |
| "token_acc": 0.8332108743570904, |
| "train_speed(iter/s)": 0.062097 |
| }, |
| { |
| "epoch": 0.5930930930930931, |
| "grad_norm": 6.448680400848389, |
| "learning_rate": 3.890857213494673e-06, |
| "loss": 0.493864631652832, |
| "memory(GiB)": 77.4, |
| "step": 395, |
| "token_acc": 0.829066045970323, |
| "train_speed(iter/s)": 0.062126 |
| }, |
| { |
| "epoch": 0.6006006006006006, |
| "grad_norm": 4.552060604095459, |
| "learning_rate": 3.7700365392746106e-06, |
| "loss": 0.48853540420532227, |
| "memory(GiB)": 77.4, |
| "step": 400, |
| "token_acc": 0.8339560603796145, |
| "train_speed(iter/s)": 0.062161 |
| }, |
| { |
| "epoch": 0.6006006006006006, |
| "eval_loss": 3.8978097438812256, |
| "eval_runtime": 48.9784, |
| "eval_samples_per_second": 17.559, |
| "eval_steps_per_second": 2.205, |
| "eval_token_acc": 0.8327343937884357, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.6081081081081081, |
| "grad_norm": 2.544924736022949, |
| "learning_rate": 3.649975623349599e-06, |
| "loss": 0.49090261459350587, |
| "memory(GiB)": 77.56, |
| "step": 405, |
| "token_acc": 0.8355947535052013, |
| "train_speed(iter/s)": 0.061731 |
| }, |
| { |
| "epoch": 0.6156156156156156, |
| "grad_norm": 6.126231670379639, |
| "learning_rate": 3.5307486283103966e-06, |
| "loss": 0.49183125495910646, |
| "memory(GiB)": 77.56, |
| "step": 410, |
| "token_acc": 0.831595529217613, |
| "train_speed(iter/s)": 0.06175 |
| }, |
| { |
| "epoch": 0.6231231231231231, |
| "grad_norm": 2.7342617511749268, |
| "learning_rate": 3.412429201628142e-06, |
| "loss": 0.4908740520477295, |
| "memory(GiB)": 77.56, |
| "step": 415, |
| "token_acc": 0.8315980081484835, |
| "train_speed(iter/s)": 0.061767 |
| }, |
| { |
| "epoch": 0.6306306306306306, |
| "grad_norm": 2.9041008949279785, |
| "learning_rate": 3.29509043016172e-06, |
| "loss": 0.4888955593109131, |
| "memory(GiB)": 77.56, |
| "step": 420, |
| "token_acc": 0.8382599420996496, |
| "train_speed(iter/s)": 0.061791 |
| }, |
| { |
| "epoch": 0.6381381381381381, |
| "grad_norm": 2.52190899848938, |
| "learning_rate": 3.1788047950114244e-06, |
| "loss": 0.48609514236450196, |
| "memory(GiB)": 77.56, |
| "step": 425, |
| "token_acc": 0.8281978055012776, |
| "train_speed(iter/s)": 0.061822 |
| }, |
| { |
| "epoch": 0.6456456456456456, |
| "grad_norm": 2.1861705780029297, |
| "learning_rate": 3.0636441267467955e-06, |
| "loss": 0.48923444747924805, |
| "memory(GiB)": 77.56, |
| "step": 430, |
| "token_acc": 0.8383622335956, |
| "train_speed(iter/s)": 0.061843 |
| }, |
| { |
| "epoch": 0.6531531531531531, |
| "grad_norm": 4.289586544036865, |
| "learning_rate": 2.9496795610363087e-06, |
| "loss": 0.4919395923614502, |
| "memory(GiB)": 77.56, |
| "step": 435, |
| "token_acc": 0.8347007903650734, |
| "train_speed(iter/s)": 0.061873 |
| }, |
| { |
| "epoch": 0.6606606606606606, |
| "grad_norm": 2.4423766136169434, |
| "learning_rate": 2.8369814947062994e-06, |
| "loss": 0.48633642196655275, |
| "memory(GiB)": 77.56, |
| "step": 440, |
| "token_acc": 0.8311563010241948, |
| "train_speed(iter/s)": 0.06188 |
| }, |
| { |
| "epoch": 0.6681681681681682, |
| "grad_norm": 2.4229958057403564, |
| "learning_rate": 2.7256195422562687e-06, |
| "loss": 0.48715896606445314, |
| "memory(GiB)": 77.56, |
| "step": 445, |
| "token_acc": 0.8328196647372773, |
| "train_speed(iter/s)": 0.06191 |
| }, |
| { |
| "epoch": 0.6756756756756757, |
| "grad_norm": 2.8222622871398926, |
| "learning_rate": 2.615662492857471e-06, |
| "loss": 0.4761828422546387, |
| "memory(GiB)": 77.56, |
| "step": 450, |
| "token_acc": 0.8371917082765925, |
| "train_speed(iter/s)": 0.061929 |
| }, |
| { |
| "epoch": 0.6831831831831832, |
| "grad_norm": 2.2745773792266846, |
| "learning_rate": 2.5071782678612635e-06, |
| "loss": 0.47379336357116697, |
| "memory(GiB)": 77.56, |
| "step": 455, |
| "token_acc": 0.8373471378197914, |
| "train_speed(iter/s)": 0.061944 |
| }, |
| { |
| "epoch": 0.6906906906906907, |
| "grad_norm": 3.0483222007751465, |
| "learning_rate": 2.4002338788435654e-06, |
| "loss": 0.4889671325683594, |
| "memory(GiB)": 77.56, |
| "step": 460, |
| "token_acc": 0.8362183754993342, |
| "train_speed(iter/s)": 0.061958 |
| }, |
| { |
| "epoch": 0.6981981981981982, |
| "grad_norm": 4.672157287597656, |
| "learning_rate": 2.2948953862112596e-06, |
| "loss": 0.4797488212585449, |
| "memory(GiB)": 77.56, |
| "step": 465, |
| "token_acc": 0.8255822661574178, |
| "train_speed(iter/s)": 0.061982 |
| }, |
| { |
| "epoch": 0.7057057057057057, |
| "grad_norm": 3.9066431522369385, |
| "learning_rate": 2.1912278583961454e-06, |
| "loss": 0.47504510879516604, |
| "memory(GiB)": 77.56, |
| "step": 470, |
| "token_acc": 0.8383151135234309, |
| "train_speed(iter/s)": 0.062009 |
| }, |
| { |
| "epoch": 0.7132132132132132, |
| "grad_norm": 3.3316314220428467, |
| "learning_rate": 2.0892953316616616e-06, |
| "loss": 0.4852120399475098, |
| "memory(GiB)": 77.56, |
| "step": 475, |
| "token_acc": 0.8355624907118443, |
| "train_speed(iter/s)": 0.062037 |
| }, |
| { |
| "epoch": 0.7207207207207207, |
| "grad_norm": 3.6110689640045166, |
| "learning_rate": 1.989160770547159e-06, |
| "loss": 0.48483953475952146, |
| "memory(GiB)": 77.56, |
| "step": 480, |
| "token_acc": 0.8303629882883539, |
| "train_speed(iter/s)": 0.06206 |
| }, |
| { |
| "epoch": 0.7282282282282282, |
| "grad_norm": 3.1614766120910645, |
| "learning_rate": 1.8908860289741981e-06, |
| "loss": 0.48744707107543944, |
| "memory(GiB)": 77.56, |
| "step": 485, |
| "token_acc": 0.8345915240301965, |
| "train_speed(iter/s)": 0.062073 |
| }, |
| { |
| "epoch": 0.7357357357357357, |
| "grad_norm": 2.3462319374084473, |
| "learning_rate": 1.794531812038901e-06, |
| "loss": 0.4857791423797607, |
| "memory(GiB)": 77.56, |
| "step": 490, |
| "token_acc": 0.8333702146492586, |
| "train_speed(iter/s)": 0.062097 |
| }, |
| { |
| "epoch": 0.7432432432432432, |
| "grad_norm": 3.077697992324829, |
| "learning_rate": 1.7001576385139062e-06, |
| "loss": 0.48043317794799806, |
| "memory(GiB)": 77.56, |
| "step": 495, |
| "token_acc": 0.8398653702318624, |
| "train_speed(iter/s)": 0.062109 |
| }, |
| { |
| "epoch": 0.7507507507507507, |
| "grad_norm": 4.029548168182373, |
| "learning_rate": 1.6078218040831678e-06, |
| "loss": 0.48291807174682616, |
| "memory(GiB)": 77.56, |
| "step": 500, |
| "token_acc": 0.830759284534157, |
| "train_speed(iter/s)": 0.062136 |
| }, |
| { |
| "epoch": 0.7507507507507507, |
| "eval_loss": 3.8144214153289795, |
| "eval_runtime": 49.1053, |
| "eval_samples_per_second": 17.513, |
| "eval_steps_per_second": 2.199, |
| "eval_token_acc": 0.835421032741265, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7582582582582582, |
| "grad_norm": 3.489283561706543, |
| "learning_rate": 1.5175813453322252e-06, |
| "loss": 0.48162593841552737, |
| "memory(GiB)": 77.56, |
| "step": 505, |
| "token_acc": 0.8355083683550837, |
| "train_speed(iter/s)": 0.061761 |
| }, |
| { |
| "epoch": 0.7657657657657657, |
| "grad_norm": 2.7831027507781982, |
| "learning_rate": 1.4294920045162514e-06, |
| "loss": 0.47315006256103515, |
| "memory(GiB)": 77.56, |
| "step": 510, |
| "token_acc": 0.8345864661654135, |
| "train_speed(iter/s)": 0.061779 |
| }, |
| { |
| "epoch": 0.7732732732732732, |
| "grad_norm": 3.9989585876464844, |
| "learning_rate": 1.3436081951276247e-06, |
| "loss": 0.47411699295043946, |
| "memory(GiB)": 77.56, |
| "step": 515, |
| "token_acc": 0.8337380482622553, |
| "train_speed(iter/s)": 0.061792 |
| }, |
| { |
| "epoch": 0.7807807807807807, |
| "grad_norm": 2.496121883392334, |
| "learning_rate": 1.2599829682842618e-06, |
| "loss": 0.47501659393310547, |
| "memory(GiB)": 77.56, |
| "step": 520, |
| "token_acc": 0.8372924268374413, |
| "train_speed(iter/s)": 0.0618 |
| }, |
| { |
| "epoch": 0.7882882882882883, |
| "grad_norm": 2.376746416091919, |
| "learning_rate": 1.1786679799595308e-06, |
| "loss": 0.47827887535095215, |
| "memory(GiB)": 77.56, |
| "step": 525, |
| "token_acc": 0.8420151808304808, |
| "train_speed(iter/s)": 0.061823 |
| }, |
| { |
| "epoch": 0.7957957957957958, |
| "grad_norm": 6.058043956756592, |
| "learning_rate": 1.09971345907394e-06, |
| "loss": 0.48307170867919924, |
| "memory(GiB)": 77.56, |
| "step": 530, |
| "token_acc": 0.8403267731154845, |
| "train_speed(iter/s)": 0.06183 |
| }, |
| { |
| "epoch": 0.8033033033033034, |
| "grad_norm": 3.4789669513702393, |
| "learning_rate": 1.0231681764683188e-06, |
| "loss": 0.47766728401184083, |
| "memory(GiB)": 77.56, |
| "step": 535, |
| "token_acc": 0.8329650905877154, |
| "train_speed(iter/s)": 0.061854 |
| }, |
| { |
| "epoch": 0.8108108108108109, |
| "grad_norm": 2.2162930965423584, |
| "learning_rate": 9.490794147776927e-07, |
| "loss": 0.4749112606048584, |
| "memory(GiB)": 77.56, |
| "step": 540, |
| "token_acc": 0.8364345738295318, |
| "train_speed(iter/s)": 0.061873 |
| }, |
| { |
| "epoch": 0.8183183183183184, |
| "grad_norm": 2.1486546993255615, |
| "learning_rate": 8.774929392244158e-07, |
| "loss": 0.48050594329833984, |
| "memory(GiB)": 77.56, |
| "step": 545, |
| "token_acc": 0.8377268314539617, |
| "train_speed(iter/s)": 0.061873 |
| }, |
| { |
| "epoch": 0.8258258258258259, |
| "grad_norm": 20.101654052734375, |
| "learning_rate": 8.084529693486171e-07, |
| "loss": 0.4786341667175293, |
| "memory(GiB)": 77.56, |
| "step": 550, |
| "token_acc": 0.8336489659874252, |
| "train_speed(iter/s)": 0.061888 |
| }, |
| { |
| "epoch": 0.8333333333333334, |
| "grad_norm": 2.2926764488220215, |
| "learning_rate": 7.420021516934539e-07, |
| "loss": 0.4800395488739014, |
| "memory(GiB)": 77.56, |
| "step": 555, |
| "token_acc": 0.8331179198661902, |
| "train_speed(iter/s)": 0.0619 |
| }, |
| { |
| "epoch": 0.8408408408408409, |
| "grad_norm": 3.684734582901001, |
| "learning_rate": 6.781815334619812e-07, |
| "loss": 0.471435022354126, |
| "memory(GiB)": 77.56, |
| "step": 560, |
| "token_acc": 0.8375401560348784, |
| "train_speed(iter/s)": 0.061915 |
| }, |
| { |
| "epoch": 0.8483483483483484, |
| "grad_norm": 1.881805419921875, |
| "learning_rate": 6.170305371619773e-07, |
| "loss": 0.4786642074584961, |
| "memory(GiB)": 77.56, |
| "step": 565, |
| "token_acc": 0.8357564743298501, |
| "train_speed(iter/s)": 0.061924 |
| }, |
| { |
| "epoch": 0.8558558558558559, |
| "grad_norm": 2.26941180229187, |
| "learning_rate": 5.585869362543416e-07, |
| "loss": 0.47544078826904296, |
| "memory(GiB)": 77.56, |
| "step": 570, |
| "token_acc": 0.8454873092554251, |
| "train_speed(iter/s)": 0.061943 |
| }, |
| { |
| "epoch": 0.8633633633633634, |
| "grad_norm": 2.576007604598999, |
| "learning_rate": 5.028868318201191e-07, |
| "loss": 0.47312221527099607, |
| "memory(GiB)": 77.56, |
| "step": 575, |
| "token_acc": 0.8362756052141527, |
| "train_speed(iter/s)": 0.061967 |
| }, |
| { |
| "epoch": 0.8708708708708709, |
| "grad_norm": 2.3246395587921143, |
| "learning_rate": 4.4996463026058476e-07, |
| "loss": 0.479257869720459, |
| "memory(GiB)": 77.56, |
| "step": 580, |
| "token_acc": 0.8453853838469223, |
| "train_speed(iter/s)": 0.061969 |
| }, |
| { |
| "epoch": 0.8783783783783784, |
| "grad_norm": 2.4527480602264404, |
| "learning_rate": 3.9985302204412266e-07, |
| "loss": 0.47733464241027834, |
| "memory(GiB)": 77.56, |
| "step": 585, |
| "token_acc": 0.8370748040483982, |
| "train_speed(iter/s)": 0.061978 |
| }, |
| { |
| "epoch": 0.8858858858858859, |
| "grad_norm": 3.797691583633423, |
| "learning_rate": 3.5258296151306495e-07, |
| "loss": 0.4716297149658203, |
| "memory(GiB)": 77.56, |
| "step": 590, |
| "token_acc": 0.8305998651988318, |
| "train_speed(iter/s)": 0.061995 |
| }, |
| { |
| "epoch": 0.8933933933933934, |
| "grad_norm": 2.739664316177368, |
| "learning_rate": 3.081836477629491e-07, |
| "loss": 0.480192232131958, |
| "memory(GiB)": 77.56, |
| "step": 595, |
| "token_acc": 0.8329091181314568, |
| "train_speed(iter/s)": 0.062011 |
| }, |
| { |
| "epoch": 0.9009009009009009, |
| "grad_norm": 3.405724048614502, |
| "learning_rate": 2.666825066059986e-07, |
| "loss": 0.474263858795166, |
| "memory(GiB)": 77.56, |
| "step": 600, |
| "token_acc": 0.8360385967536839, |
| "train_speed(iter/s)": 0.062028 |
| }, |
| { |
| "epoch": 0.9009009009009009, |
| "eval_loss": 3.765347480773926, |
| "eval_runtime": 49.0338, |
| "eval_samples_per_second": 17.539, |
| "eval_steps_per_second": 2.203, |
| "eval_token_acc": 0.837152574981963, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.9084084084084084, |
| "grad_norm": 2.454803943634033, |
| "learning_rate": 2.2810517362997997e-07, |
| "loss": 0.47376174926757814, |
| "memory(GiB)": 77.56, |
| "step": 605, |
| "token_acc": 0.8413309189678587, |
| "train_speed(iter/s)": 0.061734 |
| }, |
| { |
| "epoch": 0.9159159159159159, |
| "grad_norm": 2.480395793914795, |
| "learning_rate": 1.9247547836289792e-07, |
| "loss": 0.4828979015350342, |
| "memory(GiB)": 77.56, |
| "step": 610, |
| "token_acc": 0.8343275692818848, |
| "train_speed(iter/s)": 0.061749 |
| }, |
| { |
| "epoch": 0.9234234234234234, |
| "grad_norm": 2.6262176036834717, |
| "learning_rate": 1.598154295532983e-07, |
| "loss": 0.48357315063476564, |
| "memory(GiB)": 77.56, |
| "step": 615, |
| "token_acc": 0.8393310437509333, |
| "train_speed(iter/s)": 0.061749 |
| }, |
| { |
| "epoch": 0.9309309309309309, |
| "grad_norm": 3.147251844406128, |
| "learning_rate": 1.3014520157529244e-07, |
| "loss": 0.47496805191040037, |
| "memory(GiB)": 77.56, |
| "step": 620, |
| "token_acc": 0.8347965493549125, |
| "train_speed(iter/s)": 0.061763 |
| }, |
| { |
| "epoch": 0.9384384384384384, |
| "grad_norm": 2.492722511291504, |
| "learning_rate": 1.034831219666832e-07, |
| "loss": 0.48235254287719725, |
| "memory(GiB)": 77.56, |
| "step": 625, |
| "token_acc": 0.8455045594995855, |
| "train_speed(iter/s)": 0.061787 |
| }, |
| { |
| "epoch": 0.9459459459459459, |
| "grad_norm": 2.683011054992676, |
| "learning_rate": 7.984566010789673e-08, |
| "loss": 0.4709568977355957, |
| "memory(GiB)": 77.56, |
| "step": 630, |
| "token_acc": 0.8387436920991188, |
| "train_speed(iter/s)": 0.061803 |
| }, |
| { |
| "epoch": 0.9534534534534534, |
| "grad_norm": 2.4714860916137695, |
| "learning_rate": 5.9247417048717284e-08, |
| "loss": 0.4673150062561035, |
| "memory(GiB)": 77.56, |
| "step": 635, |
| "token_acc": 0.8387750506642648, |
| "train_speed(iter/s)": 0.061826 |
| }, |
| { |
| "epoch": 0.960960960960961, |
| "grad_norm": 2.5630085468292236, |
| "learning_rate": 4.170111648909736e-08, |
| "loss": 0.48201580047607423, |
| "memory(GiB)": 77.56, |
| "step": 640, |
| "token_acc": 0.8353464246017044, |
| "train_speed(iter/s)": 0.061839 |
| }, |
| { |
| "epoch": 0.9684684684684685, |
| "grad_norm": 2.9935920238494873, |
| "learning_rate": 2.721759691962922e-08, |
| "loss": 0.47791686058044436, |
| "memory(GiB)": 77.56, |
| "step": 645, |
| "token_acc": 0.836272040302267, |
| "train_speed(iter/s)": 0.061862 |
| }, |
| { |
| "epoch": 0.975975975975976, |
| "grad_norm": 2.290571451187134, |
| "learning_rate": 1.580580492652084e-08, |
| "loss": 0.47302780151367185, |
| "memory(GiB)": 77.56, |
| "step": 650, |
| "token_acc": 0.8399675060926076, |
| "train_speed(iter/s)": 0.061876 |
| }, |
| { |
| "epoch": 0.9834834834834835, |
| "grad_norm": 7.702187538146973, |
| "learning_rate": 7.472789665218805e-09, |
| "loss": 0.4697415351867676, |
| "memory(GiB)": 77.56, |
| "step": 655, |
| "token_acc": 0.8315651906519065, |
| "train_speed(iter/s)": 0.0619 |
| }, |
| { |
| "epoch": 0.990990990990991, |
| "grad_norm": 3.5078024864196777, |
| "learning_rate": 2.223698506088612e-09, |
| "loss": 0.46914873123168943, |
| "memory(GiB)": 77.56, |
| "step": 660, |
| "token_acc": 0.8398177880666119, |
| "train_speed(iter/s)": 0.06191 |
| }, |
| { |
| "epoch": 0.9984984984984985, |
| "grad_norm": 2.397590160369873, |
| "learning_rate": 6.177385484029685e-11, |
| "loss": 0.4747368812561035, |
| "memory(GiB)": 77.56, |
| "step": 665, |
| "token_acc": 0.8291888691533452, |
| "train_speed(iter/s)": 0.061924 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 3.761183738708496, |
| "eval_runtime": 50.3959, |
| "eval_samples_per_second": 17.065, |
| "eval_steps_per_second": 2.143, |
| "eval_token_acc": 0.8374686501528842, |
| "step": 666 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 666, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.285336319720161e+17, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|