{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "batch_size": 1, "epoch": 0, "step": 0, "tokens_per_device": 5375 }, { "epoch": 0, "loss_ce": 11.613831520080566, "loss_lvr": 22.167646408081055, "loss_mode_switch": 0.0, "loss_total": 13.830595970153809, "step": 0 }, { "batch_size": 4, "epoch": 0, "step": 0, "tokens_per_device": 4904 }, { "epoch": 0, "loss_ce": 11.974207878112793, "loss_lvr": 19.99180030822754, "loss_mode_switch": 0.0, "loss_total": 13.973387718200684, "step": 0 }, { "batch_size": 4, "epoch": 0, "step": 0, "tokens_per_device": 4612 }, { "epoch": 0, "loss_ce": 13.917898178100586, "loss_lvr": 21.924062728881836, "loss_mode_switch": 0.0, "loss_total": 16.11030387878418, "step": 0 }, { "batch_size": 1, "epoch": 0, "step": 0, "tokens_per_device": 5139 }, { "epoch": 0, "loss_ce": 7.808012008666992, "loss_lvr": 21.5530948638916, "loss_mode_switch": 0.0, "loss_total": 9.963321685791016, "step": 0 }, { "batch_size": 1, "epoch": 0, "step": 0, "tokens_per_device": 4878 }, { "epoch": 0, "loss_ce": 10.119104385375977, "loss_lvr": 20.71912384033203, "loss_mode_switch": 0.0, "loss_total": 12.191017150878906, "step": 0 }, { "batch_size": 1, "epoch": 0, "step": 0, "tokens_per_device": 4874 }, { "epoch": 0, "loss_ce": 13.323358535766602, "loss_lvr": 22.250572204589844, "loss_mode_switch": 0.0, "loss_total": 15.548416137695312, "step": 0 }, { "batch_size": 4, "epoch": 0, "step": 0, "tokens_per_device": 6308 }, { "epoch": 0, "loss_ce": 12.34442138671875, "loss_lvr": 21.550031661987305, "loss_mode_switch": 0.0, "loss_total": 14.499424934387207, "step": 0 }, { "batch_size": 1, "epoch": 0, "step": 0, "tokens_per_device": 5205 }, { "epoch": 0, "loss_ce": 12.56396484375, "loss_lvr": 23.21318244934082, "loss_mode_switch": 0.0, "loss_total": 14.885283470153809, "step": 0 }, { "epoch": 0.0004, "grad_norm": 259.600341796875, "learning_rate": 1.3333333333333336e-07, "loss": 12.7922, "step": 1 }, { "batch_size": 4, "epoch": 0.0004, "step": 1, "tokens_per_device": 5816 }, { "epoch": 0.0004, "loss_ce": 12.735658645629883, "loss_lvr": 23.829442977905273, "loss_mode_switch": 0.0, "loss_total": 15.118602752685547, "step": 1 }, { "batch_size": 4, "epoch": 0.0004, "step": 1, "tokens_per_device": 3828 }, { "epoch": 0.0004, "loss_ce": 13.610407829284668, "loss_lvr": 23.858901977539062, "loss_mode_switch": 0.0, "loss_total": 15.996297836303711, "step": 1 }, { "batch_size": 4, "epoch": 0.0004, "step": 1, "tokens_per_device": 4348 }, { "epoch": 0.0004, "loss_ce": 10.429281234741211, "loss_lvr": 24.611684799194336, "loss_mode_switch": 0.0, "loss_total": 12.890449523925781, "step": 1 }, { "batch_size": 4, "epoch": 0.0004, "step": 1, "tokens_per_device": 4748 }, { "epoch": 0.0004, "loss_ce": 9.968450546264648, "loss_lvr": 20.9213809967041, "loss_mode_switch": 0.0, "loss_total": 12.060588836669922, "step": 1 }, { "batch_size": 4, "epoch": 0.0004, "step": 1, "tokens_per_device": 4244 }, { "epoch": 0.0004, "loss_ce": 10.611931800842285, "loss_lvr": 22.904006958007812, "loss_mode_switch": 0.0, "loss_total": 12.902332305908203, "step": 1 }, { "batch_size": 1, "epoch": 0.0004, "step": 1, "tokens_per_device": 5120 }, { "epoch": 0.0004, "loss_ce": 6.66909122467041, "loss_lvr": 22.409042358398438, "loss_mode_switch": 0.0, "loss_total": 8.909996032714844, "step": 1 }, { "batch_size": 4, "epoch": 0.0004, "step": 1, "tokens_per_device": 4296 }, { "epoch": 0.0004, "loss_ce": 9.722626686096191, "loss_lvr": 23.454103469848633, "loss_mode_switch": 0.0, "loss_total": 12.068037033081055, "step": 1 }, { "batch_size": 4, "epoch": 0.0004, "step": 1, "tokens_per_device": 3496 }, { "epoch": 0.0004, "loss_ce": 12.876893997192383, "loss_lvr": 23.532987594604492, "loss_mode_switch": 0.0, "loss_total": 15.230193138122559, "step": 1 }, { "epoch": 0.0008, "grad_norm": 240.00645446777344, "learning_rate": 2.666666666666667e-07, "loss": 12.8888, "step": 2 }, { "batch_size": 4, "epoch": 0.0008, "step": 2, "tokens_per_device": 15612 }, { "epoch": 0.0008, "loss_ce": 11.827984809875488, "loss_lvr": 23.2265567779541, "loss_mode_switch": 0.0, "loss_total": 14.150640487670898, "step": 2 }, { "batch_size": 1, "epoch": 0.0008, "step": 2, "tokens_per_device": 4866 }, { "epoch": 0.0008, "loss_ce": 10.798813819885254, "loss_lvr": 22.285728454589844, "loss_mode_switch": 0.0, "loss_total": 13.027386665344238, "step": 2 }, { "batch_size": 1, "epoch": 0.0008, "step": 2, "tokens_per_device": 5118 }, { "epoch": 0.0008, "loss_ce": 11.540398597717285, "loss_lvr": 20.441118240356445, "loss_mode_switch": 0.0, "loss_total": 13.584510803222656, "step": 2 }, { "batch_size": 1, "epoch": 0.0008, "step": 2, "tokens_per_device": 5023 }, { "epoch": 0.0008, "loss_ce": 8.605645179748535, "loss_lvr": 15.53588581085205, "loss_mode_switch": 0.0, "loss_total": 10.159234046936035, "step": 2 }, { "batch_size": 1, "epoch": 0.0008, "step": 2, "tokens_per_device": 4899 }, { "epoch": 0.0008, "loss_ce": 12.66787338256836, "loss_lvr": 20.68310546875, "loss_mode_switch": 0.0, "loss_total": 14.736184120178223, "step": 2 }, { "batch_size": 4, "epoch": 0.0008, "step": 2, "tokens_per_device": 7016 }, { "epoch": 0.0008, "loss_ce": 10.798523902893066, "loss_lvr": 23.819059371948242, "loss_mode_switch": 0.0, "loss_total": 13.180429458618164, "step": 2 }, { "batch_size": 4, "epoch": 0.0008, "step": 2, "tokens_per_device": 4284 }, { "epoch": 0.0008, "loss_ce": 9.146866798400879, "loss_lvr": 20.980539321899414, "loss_mode_switch": 0.0, "loss_total": 11.24492073059082, "step": 2 }, { "batch_size": 1, "epoch": 0.0008, "step": 2, "tokens_per_device": 5560 }, { "epoch": 0.0008, "loss_ce": 14.922237396240234, "loss_lvr": 17.350341796875, "loss_mode_switch": 0.0, "loss_total": 16.657272338867188, "step": 2 }, { "epoch": 0.0012, "grad_norm": 241.27294921875, "learning_rate": 4.0000000000000003e-07, "loss": 12.6047, "step": 3 }, { "batch_size": 4, "epoch": 0.0012, "step": 3, "tokens_per_device": 3876 }, { "epoch": 0.0012, "loss_ce": 8.259891510009766, "loss_lvr": 24.722944259643555, "loss_mode_switch": 0.0, "loss_total": 10.732186317443848, "step": 3 }, { "batch_size": 4, "epoch": 0.0012, "step": 3, "tokens_per_device": 3940 }, { "epoch": 0.0012, "loss_ce": 9.822291374206543, "loss_lvr": 22.61306381225586, "loss_mode_switch": 0.0, "loss_total": 12.083598136901855, "step": 3 }, { "batch_size": 1, "epoch": 0.0012, "step": 3, "tokens_per_device": 5212 }, { "epoch": 0.0012, "loss_ce": 7.394972324371338, "loss_lvr": 19.273500442504883, "loss_mode_switch": 0.0, "loss_total": 9.322322845458984, "step": 3 }, { "batch_size": 1, "epoch": 0.0012, "step": 3, "tokens_per_device": 5110 }, { "epoch": 0.0012, "loss_ce": 7.049258232116699, "loss_lvr": 17.992944717407227, "loss_mode_switch": 0.0, "loss_total": 8.848552703857422, "step": 3 }, { "batch_size": 1, "epoch": 0.0012, "step": 3, "tokens_per_device": 5126 }, { "epoch": 0.0012, "loss_ce": 10.220044136047363, "loss_lvr": 26.083189010620117, "loss_mode_switch": 0.0, "loss_total": 12.828363418579102, "step": 3 }, { "batch_size": 1, "epoch": 0.0012, "step": 3, "tokens_per_device": 5151 }, { "epoch": 0.0012, "loss_ce": 11.67619514465332, "loss_lvr": 20.236454010009766, "loss_mode_switch": 0.0, "loss_total": 13.699840545654297, "step": 3 }, { "batch_size": 1, "epoch": 0.0012, "step": 3, "tokens_per_device": 5013 }, { "epoch": 0.0012, "loss_ce": 12.624910354614258, "loss_lvr": 22.91669273376465, "loss_mode_switch": 0.0, "loss_total": 14.916580200195312, "step": 3 }, { "batch_size": 4, "epoch": 0.0012, "step": 3, "tokens_per_device": 4936 }, { "epoch": 0.0012, "loss_ce": 9.287961959838867, "loss_lvr": 24.01114273071289, "loss_mode_switch": 0.0, "loss_total": 11.68907642364502, "step": 3 }, { "epoch": 0.0016, "grad_norm": 240.7385711669922, "learning_rate": 5.333333333333335e-07, "loss": 12.6419, "step": 4 }, { "batch_size": 1, "epoch": 0.0016, "step": 4, "tokens_per_device": 4760 }, { "epoch": 0.0016, "loss_ce": 7.057017803192139, "loss_lvr": 24.50615692138672, "loss_mode_switch": 0.0, "loss_total": 9.507633209228516, "step": 4 }, { "batch_size": 4, "epoch": 0.0016, "step": 4, "tokens_per_device": 1280 }, { "epoch": 0.0016, "loss_ce": 11.107820510864258, "loss_lvr": 24.397764205932617, "loss_mode_switch": 0.0, "loss_total": 13.54759693145752, "step": 4 }, { "batch_size": 4, "epoch": 0.0016, "step": 4, "tokens_per_device": 1936 }, { "epoch": 0.0016, "loss_ce": 10.769822120666504, "loss_lvr": 23.32453727722168, "loss_mode_switch": 0.0, "loss_total": 13.102275848388672, "step": 4 }, { "batch_size": 4, "epoch": 0.0016, "step": 4, "tokens_per_device": 6584 }, { "epoch": 0.0016, "loss_ce": 10.045831680297852, "loss_lvr": 19.225391387939453, "loss_mode_switch": 0.0, "loss_total": 11.96837043762207, "step": 4 }, { "batch_size": 1, "epoch": 0.0016, "step": 4, "tokens_per_device": 4964 }, { "epoch": 0.0016, "loss_ce": 6.570861339569092, "loss_lvr": 21.486713409423828, "loss_mode_switch": 0.0, "loss_total": 8.71953296661377, "step": 4 }, { "batch_size": 1, "epoch": 0.0016, "step": 4, "tokens_per_device": 5176 }, { "epoch": 0.0016, "loss_ce": 10.91823959350586, "loss_lvr": 22.94397735595703, "loss_mode_switch": 0.0, "loss_total": 13.212636947631836, "step": 4 }, { "batch_size": 4, "epoch": 0.0016, "step": 4, "tokens_per_device": 2612 }, { "epoch": 0.0016, "loss_ce": 9.46656322479248, "loss_lvr": 22.771656036376953, "loss_mode_switch": 0.0, "loss_total": 11.743728637695312, "step": 4 }, { "batch_size": 1, "epoch": 0.0016, "step": 4, "tokens_per_device": 5457 }, { "epoch": 0.0016, "loss_ce": 13.831037521362305, "loss_lvr": 21.058382034301758, "loss_mode_switch": 0.0, "loss_total": 15.93687629699707, "step": 4 }, { "epoch": 0.002, "grad_norm": 228.35903930664062, "learning_rate": 6.666666666666667e-07, "loss": 12.3564, "step": 5 }, { "batch_size": 4, "epoch": 0.002, "step": 5, "tokens_per_device": 4180 }, { "epoch": 0.002, "loss_ce": 11.037369728088379, "loss_lvr": 24.099777221679688, "loss_mode_switch": 0.0, "loss_total": 13.447347640991211, "step": 5 }, { "batch_size": 4, "epoch": 0.002, "step": 5, "tokens_per_device": 4220 }, { "epoch": 0.002, "loss_ce": 11.410539627075195, "loss_lvr": 22.541975021362305, "loss_mode_switch": 0.0, "loss_total": 13.664737701416016, "step": 5 }, { "batch_size": 4, "epoch": 0.002, "step": 5, "tokens_per_device": 5732 }, { "epoch": 0.002, "loss_ce": 10.999322891235352, "loss_lvr": 21.728614807128906, "loss_mode_switch": 0.0, "loss_total": 13.172183990478516, "step": 5 }, { "batch_size": 4, "epoch": 0.002, "step": 5, "tokens_per_device": 3796 }, { "epoch": 0.002, "loss_ce": 9.960591316223145, "loss_lvr": 23.942880630493164, "loss_mode_switch": 0.0, "loss_total": 12.354879379272461, "step": 5 }, { "batch_size": 4, "epoch": 0.002, "step": 5, "tokens_per_device": 3772 }, { "epoch": 0.002, "loss_ce": 7.456742286682129, "loss_lvr": 23.390548706054688, "loss_mode_switch": 0.0, "loss_total": 9.795797348022461, "step": 5 }, { "batch_size": 4, "epoch": 0.002, "step": 5, "tokens_per_device": 3776 }, { "epoch": 0.002, "loss_ce": 10.504068374633789, "loss_lvr": 25.982023239135742, "loss_mode_switch": 0.0, "loss_total": 13.10227108001709, "step": 5 }, { "batch_size": 4, "epoch": 0.002, "step": 5, "tokens_per_device": 4152 }, { "epoch": 0.002, "loss_ce": 11.772835731506348, "loss_lvr": 23.907445907592773, "loss_mode_switch": 0.0, "loss_total": 14.163579940795898, "step": 5 }, { "batch_size": 4, "epoch": 0.002, "step": 5, "tokens_per_device": 4448 }, { "epoch": 0.002, "loss_ce": 7.289435863494873, "loss_lvr": 21.176908493041992, "loss_mode_switch": 0.0, "loss_total": 9.407126426696777, "step": 5 }, { "epoch": 0.0024, "grad_norm": 318.30255126953125, "learning_rate": 8.000000000000001e-07, "loss": 12.4666, "step": 6 }, { "batch_size": 4, "epoch": 0.0024, "step": 6, "tokens_per_device": 1284 }, { "epoch": 0.0024, "loss_ce": 6.90122652053833, "loss_lvr": 24.097766876220703, "loss_mode_switch": 0.0, "loss_total": 9.311002731323242, "step": 6 }, { "batch_size": 1, "epoch": 0.0024, "step": 6, "tokens_per_device": 5468 }, { "epoch": 0.0024, "loss_ce": 13.513354301452637, "loss_lvr": 20.634666442871094, "loss_mode_switch": 0.0, "loss_total": 15.576821327209473, "step": 6 }, { "batch_size": 4, "epoch": 0.0024, "step": 6, "tokens_per_device": 5852 }, { "epoch": 0.0024, "loss_ce": 10.705231666564941, "loss_lvr": 22.20113754272461, "loss_mode_switch": 0.0, "loss_total": 12.925345420837402, "step": 6 }, { "batch_size": 4, "epoch": 0.0024, "step": 6, "tokens_per_device": 4704 }, { "epoch": 0.0024, "loss_ce": 12.573841094970703, "loss_lvr": 22.98272705078125, "loss_mode_switch": 0.0, "loss_total": 14.872114181518555, "step": 6 }, { "batch_size": 4, "epoch": 0.0024, "step": 6, "tokens_per_device": 4828 }, { "epoch": 0.0024, "loss_ce": 11.254337310791016, "loss_lvr": 21.3597354888916, "loss_mode_switch": 0.0, "loss_total": 13.390311241149902, "step": 6 }, { "batch_size": 1, "epoch": 0.0024, "step": 6, "tokens_per_device": 4852 }, { "epoch": 0.0024, "loss_ce": 5.518298625946045, "loss_lvr": 25.195714950561523, "loss_mode_switch": 0.0, "loss_total": 8.037870407104492, "step": 6 }, { "batch_size": 1, "epoch": 0.0024, "step": 6, "tokens_per_device": 5175 }, { "epoch": 0.0024, "loss_ce": 5.889595985412598, "loss_lvr": 19.030588150024414, "loss_mode_switch": 0.0, "loss_total": 7.792654991149902, "step": 6 }, { "batch_size": 4, "epoch": 0.0024, "step": 6, "tokens_per_device": 2584 }, { "epoch": 0.0024, "loss_ce": 8.05411148071289, "loss_lvr": 23.809635162353516, "loss_mode_switch": 0.0, "loss_total": 10.435074806213379, "step": 6 }, { "epoch": 0.0028, "grad_norm": 221.06509399414062, "learning_rate": 9.333333333333334e-07, "loss": 12.0049, "step": 7 }, { "batch_size": 4, "epoch": 0.0028, "step": 7, "tokens_per_device": 4208 }, { "epoch": 0.0028, "loss_ce": 8.926361083984375, "loss_lvr": 22.812347412109375, "loss_mode_switch": 0.0, "loss_total": 11.207595825195312, "step": 7 }, { "batch_size": 1, "epoch": 0.0028, "step": 7, "tokens_per_device": 5190 }, { "epoch": 0.0028, "loss_ce": 7.103700160980225, "loss_lvr": 20.9300594329834, "loss_mode_switch": 0.0, "loss_total": 9.19670581817627, "step": 7 }, { "batch_size": 1, "epoch": 0.0028, "step": 7, "tokens_per_device": 5319 }, { "epoch": 0.0028, "loss_ce": 13.785123825073242, "loss_lvr": 21.529834747314453, "loss_mode_switch": 0.0, "loss_total": 15.93810749053955, "step": 7 }, { "batch_size": 4, "epoch": 0.0028, "step": 7, "tokens_per_device": 3768 }, { "epoch": 0.0028, "loss_ce": 10.374007225036621, "loss_lvr": 25.248069763183594, "loss_mode_switch": 0.0, "loss_total": 12.89881420135498, "step": 7 }, { "batch_size": 4, "epoch": 0.0028, "step": 7, "tokens_per_device": 2768 }, { "epoch": 0.0028, "loss_ce": 9.633280754089355, "loss_lvr": 23.286386489868164, "loss_mode_switch": 0.0, "loss_total": 11.961919784545898, "step": 7 }, { "batch_size": 4, "epoch": 0.0028, "step": 7, "tokens_per_device": 4908 }, { "epoch": 0.0028, "loss_ce": 11.943330764770508, "loss_lvr": 23.735027313232422, "loss_mode_switch": 0.0, "loss_total": 14.31683349609375, "step": 7 }, { "batch_size": 4, "epoch": 0.0028, "step": 7, "tokens_per_device": 5092 }, { "epoch": 0.0028, "loss_ce": 11.006250381469727, "loss_lvr": 19.4356632232666, "loss_mode_switch": 0.0, "loss_total": 12.949816703796387, "step": 7 }, { "batch_size": 4, "epoch": 0.0028, "step": 7, "tokens_per_device": 4488 }, { "epoch": 0.0028, "loss_ce": 8.903258323669434, "loss_lvr": 22.5830020904541, "loss_mode_switch": 0.0, "loss_total": 11.161558151245117, "step": 7 }, { "epoch": 0.0032, "grad_norm": 213.75209045410156, "learning_rate": 1.066666666666667e-06, "loss": 11.7084, "step": 8 }, { "batch_size": 4, "epoch": 0.0032, "step": 8, "tokens_per_device": 1200 }, { "epoch": 0.0032, "loss_ce": 9.683794975280762, "loss_lvr": 22.713008880615234, "loss_mode_switch": 0.0, "loss_total": 11.955096244812012, "step": 8 }, { "batch_size": 4, "epoch": 0.0032, "step": 8, "tokens_per_device": 4388 }, { "epoch": 0.0032, "loss_ce": 12.287866592407227, "loss_lvr": 22.883544921875, "loss_mode_switch": 0.0, "loss_total": 14.576221466064453, "step": 8 }, { "batch_size": 4, "epoch": 0.0032, "step": 8, "tokens_per_device": 4200 }, { "epoch": 0.0032, "loss_ce": 12.343949317932129, "loss_lvr": 20.87564468383789, "loss_mode_switch": 0.0, "loss_total": 14.431513786315918, "step": 8 }, { "batch_size": 1, "epoch": 0.0032, "step": 8, "tokens_per_device": 4892 }, { "epoch": 0.0032, "loss_ce": 8.986250877380371, "loss_lvr": 20.676834106445312, "loss_mode_switch": 0.0, "loss_total": 11.053934097290039, "step": 8 }, { "batch_size": 4, "epoch": 0.0032, "step": 8, "tokens_per_device": 2900 }, { "epoch": 0.0032, "loss_ce": 12.533193588256836, "loss_lvr": 21.788545608520508, "loss_mode_switch": 0.0, "loss_total": 14.712048530578613, "step": 8 }, { "batch_size": 4, "epoch": 0.0032, "step": 8, "tokens_per_device": 13316 }, { "epoch": 0.0032, "loss_ce": 10.264874458312988, "loss_lvr": 20.76777458190918, "loss_mode_switch": 0.0, "loss_total": 12.341651916503906, "step": 8 }, { "batch_size": 4, "epoch": 0.0032, "step": 8, "tokens_per_device": 1620 }, { "epoch": 0.0032, "loss_ce": 7.873702049255371, "loss_lvr": 23.220510482788086, "loss_mode_switch": 0.0, "loss_total": 10.19575309753418, "step": 8 }, { "batch_size": 4, "epoch": 0.0032, "step": 8, "tokens_per_device": 4544 }, { "epoch": 0.0032, "loss_ce": 8.080208778381348, "loss_lvr": 23.484966278076172, "loss_mode_switch": 0.0, "loss_total": 10.428705215454102, "step": 8 }, { "epoch": 0.0036, "grad_norm": 229.5780487060547, "learning_rate": 1.2000000000000002e-06, "loss": 12.1844, "step": 9 }, { "batch_size": 4, "epoch": 0.0036, "step": 9, "tokens_per_device": 6072 }, { "epoch": 0.0036, "loss_ce": 11.880016326904297, "loss_lvr": 21.96132469177246, "loss_mode_switch": 0.0, "loss_total": 14.076148986816406, "step": 9 }, { "batch_size": 4, "epoch": 0.0036, "step": 9, "tokens_per_device": 7304 }, { "epoch": 0.0036, "loss_ce": 9.9308443069458, "loss_lvr": 19.789915084838867, "loss_mode_switch": 0.0, "loss_total": 11.909835815429688, "step": 9 }, { "batch_size": 4, "epoch": 0.0036, "step": 9, "tokens_per_device": 1996 }, { "epoch": 0.0036, "loss_ce": 9.697738647460938, "loss_lvr": 21.48575210571289, "loss_mode_switch": 0.0, "loss_total": 11.8463134765625, "step": 9 }, { "batch_size": 1, "epoch": 0.0036, "step": 9, "tokens_per_device": 6283 }, { "epoch": 0.0036, "loss_ce": 12.17724609375, "loss_lvr": 21.253957748413086, "loss_mode_switch": 0.0, "loss_total": 14.302641868591309, "step": 9 }, { "batch_size": 4, "epoch": 0.0036, "step": 9, "tokens_per_device": 6068 }, { "epoch": 0.0036, "loss_ce": 9.311718940734863, "loss_lvr": 20.20867347717285, "loss_mode_switch": 0.0, "loss_total": 11.332586288452148, "step": 9 }, { "batch_size": 4, "epoch": 0.0036, "step": 9, "tokens_per_device": 1324 }, { "epoch": 0.0036, "loss_ce": 9.023956298828125, "loss_lvr": 25.062786102294922, "loss_mode_switch": 0.0, "loss_total": 11.530235290527344, "step": 9 }, { "batch_size": 1, "epoch": 0.0036, "step": 9, "tokens_per_device": 5128 }, { "epoch": 0.0036, "loss_ce": 8.880266189575195, "loss_lvr": 27.630775451660156, "loss_mode_switch": 0.0, "loss_total": 11.643343925476074, "step": 9 }, { "batch_size": 4, "epoch": 0.0036, "step": 9, "tokens_per_device": 15724 }, { "epoch": 0.0036, "loss_ce": 7.285747528076172, "loss_lvr": 22.96712875366211, "loss_mode_switch": 0.0, "loss_total": 9.582460403442383, "step": 9 }, { "epoch": 0.004, "grad_norm": 186.4457550048828, "learning_rate": 1.3333333333333334e-06, "loss": 10.923, "step": 10 }, { "batch_size": 4, "epoch": 0.004, "step": 10, "tokens_per_device": 1640 }, { "epoch": 0.004, "loss_ce": 7.419248580932617, "loss_lvr": 22.738903045654297, "loss_mode_switch": 0.0, "loss_total": 9.69313907623291, "step": 10 }, { "batch_size": 1, "epoch": 0.004, "step": 10, "tokens_per_device": 5155 }, { "epoch": 0.004, "loss_ce": 8.594239234924316, "loss_lvr": 19.523189544677734, "loss_mode_switch": 0.0, "loss_total": 10.546558380126953, "step": 10 }, { "batch_size": 4, "epoch": 0.004, "step": 10, "tokens_per_device": 4564 }, { "epoch": 0.004, "loss_ce": 8.494871139526367, "loss_lvr": 23.823467254638672, "loss_mode_switch": 0.0, "loss_total": 10.877218246459961, "step": 10 }, { "batch_size": 4, "epoch": 0.004, "step": 10, "tokens_per_device": 4212 }, { "epoch": 0.004, "loss_ce": 10.598348617553711, "loss_lvr": 24.01956558227539, "loss_mode_switch": 0.0, "loss_total": 13.00030517578125, "step": 10 }, { "batch_size": 1, "epoch": 0.004, "step": 10, "tokens_per_device": 4878 }, { "epoch": 0.004, "loss_ce": 5.965798854827881, "loss_lvr": 22.810638427734375, "loss_mode_switch": 0.0, "loss_total": 8.246862411499023, "step": 10 }, { "batch_size": 1, "epoch": 0.004, "step": 10, "tokens_per_device": 5098 }, { "epoch": 0.004, "loss_ce": 10.679305076599121, "loss_lvr": 21.80345916748047, "loss_mode_switch": 0.0, "loss_total": 12.859651565551758, "step": 10 }, { "batch_size": 4, "epoch": 0.004, "step": 10, "tokens_per_device": 4196 }, { "epoch": 0.004, "loss_ce": 12.251391410827637, "loss_lvr": 23.460092544555664, "loss_mode_switch": 0.0, "loss_total": 14.597400665283203, "step": 10 }, { "batch_size": 4, "epoch": 0.004, "step": 10, "tokens_per_device": 1544 }, { "epoch": 0.004, "loss_ce": 6.967952251434326, "loss_lvr": 22.34494400024414, "loss_mode_switch": 0.0, "loss_total": 9.202446937561035, "step": 10 }, { "epoch": 0.0044, "grad_norm": 177.7866973876953, "learning_rate": 1.4666666666666669e-06, "loss": 10.5758, "step": 11 }, { "batch_size": 1, "epoch": 0.0044, "step": 11, "tokens_per_device": 5899 }, { "epoch": 0.0044, "loss_ce": 7.421351432800293, "loss_lvr": 21.913583755493164, "loss_mode_switch": 0.0, "loss_total": 9.612709999084473, "step": 11 }, { "batch_size": 1, "epoch": 0.0044, "step": 11, "tokens_per_device": 6510 }, { "epoch": 0.0044, "loss_ce": 9.249216079711914, "loss_lvr": 20.669137954711914, "loss_mode_switch": 0.0, "loss_total": 11.316129684448242, "step": 11 }, { "batch_size": 1, "epoch": 0.0044, "step": 11, "tokens_per_device": 4858 }, { "epoch": 0.0044, "loss_ce": 7.338983058929443, "loss_lvr": 22.93932342529297, "loss_mode_switch": 0.0, "loss_total": 9.632915496826172, "step": 11 }, { "batch_size": 4, "epoch": 0.0044, "step": 11, "tokens_per_device": 1540 }, { "epoch": 0.0044, "loss_ce": 7.9031171798706055, "loss_lvr": 24.29012107849121, "loss_mode_switch": 0.0, "loss_total": 10.33212947845459, "step": 11 }, { "batch_size": 4, "epoch": 0.0044, "step": 11, "tokens_per_device": 4320 }, { "epoch": 0.0044, "loss_ce": 6.752721309661865, "loss_lvr": 21.097396850585938, "loss_mode_switch": 0.0, "loss_total": 8.86246109008789, "step": 11 }, { "batch_size": 4, "epoch": 0.0044, "step": 11, "tokens_per_device": 1232 }, { "epoch": 0.0044, "loss_ce": 8.600020408630371, "loss_lvr": 24.174537658691406, "loss_mode_switch": 0.0, "loss_total": 11.017474174499512, "step": 11 }, { "batch_size": 4, "epoch": 0.0044, "step": 11, "tokens_per_device": 14636 }, { "epoch": 0.0044, "loss_ce": 7.4713134765625, "loss_lvr": 21.06314468383789, "loss_mode_switch": 0.0, "loss_total": 9.577628135681152, "step": 11 }, { "batch_size": 1, "epoch": 0.0044, "step": 11, "tokens_per_device": 6283 }, { "epoch": 0.0044, "loss_ce": 10.38068675994873, "loss_lvr": 20.076570510864258, "loss_mode_switch": 0.0, "loss_total": 12.388343811035156, "step": 11 }, { "epoch": 0.0048, "grad_norm": 153.61859130859375, "learning_rate": 1.6000000000000001e-06, "loss": 10.3565, "step": 12 }, { "batch_size": 4, "epoch": 0.0048, "step": 12, "tokens_per_device": 4256 }, { "epoch": 0.0048, "loss_ce": 7.516041278839111, "loss_lvr": 22.070772171020508, "loss_mode_switch": 0.0, "loss_total": 9.723118782043457, "step": 12 }, { "batch_size": 1, "epoch": 0.0048, "step": 12, "tokens_per_device": 4882 }, { "epoch": 0.0048, "loss_ce": 7.770145416259766, "loss_lvr": 22.846599578857422, "loss_mode_switch": 0.0, "loss_total": 10.054805755615234, "step": 12 }, { "batch_size": 1, "epoch": 0.0048, "step": 12, "tokens_per_device": 5226 }, { "epoch": 0.0048, "loss_ce": 5.138789176940918, "loss_lvr": 22.334171295166016, "loss_mode_switch": 0.0, "loss_total": 7.372206687927246, "step": 12 }, { "batch_size": 4, "epoch": 0.0048, "step": 12, "tokens_per_device": 5220 }, { "epoch": 0.0048, "loss_ce": 6.064359664916992, "loss_lvr": 19.578210830688477, "loss_mode_switch": 0.0, "loss_total": 8.022180557250977, "step": 12 }, { "batch_size": 4, "epoch": 0.0048, "step": 12, "tokens_per_device": 4376 }, { "epoch": 0.0048, "loss_ce": 7.9387030601501465, "loss_lvr": 21.20704460144043, "loss_mode_switch": 0.0, "loss_total": 10.059407234191895, "step": 12 }, { "batch_size": 4, "epoch": 0.0048, "step": 12, "tokens_per_device": 1340 }, { "epoch": 0.0048, "loss_ce": 4.8679118156433105, "loss_lvr": 23.220001220703125, "loss_mode_switch": 0.0, "loss_total": 7.189911842346191, "step": 12 }, { "batch_size": 4, "epoch": 0.0048, "step": 12, "tokens_per_device": 1248 }, { "epoch": 0.0048, "loss_ce": 7.024510383605957, "loss_lvr": 23.491626739501953, "loss_mode_switch": 0.0, "loss_total": 9.373673439025879, "step": 12 }, { "batch_size": 4, "epoch": 0.0048, "step": 12, "tokens_per_device": 5200 }, { "epoch": 0.0048, "loss_ce": 6.29213285446167, "loss_lvr": 20.77289581298828, "loss_mode_switch": 0.0, "loss_total": 8.369422912597656, "step": 12 }, { "epoch": 0.0052, "grad_norm": 116.7611312866211, "learning_rate": 1.7333333333333336e-06, "loss": 8.574, "step": 13 }, { "batch_size": 4, "epoch": 0.0052, "step": 13, "tokens_per_device": 7076 }, { "epoch": 0.0052, "loss_ce": 3.7626349925994873, "loss_lvr": 24.347332000732422, "loss_mode_switch": 0.0, "loss_total": 6.197368144989014, "step": 13 }, { "batch_size": 4, "epoch": 0.0052, "step": 13, "tokens_per_device": 4320 }, { "epoch": 0.0052, "loss_ce": 5.903668403625488, "loss_lvr": 20.07671356201172, "loss_mode_switch": 0.0, "loss_total": 7.91133975982666, "step": 13 }, { "batch_size": 1, "epoch": 0.0052, "step": 13, "tokens_per_device": 4897 }, { "epoch": 0.0052, "loss_ce": 5.011138439178467, "loss_lvr": 20.24105453491211, "loss_mode_switch": 0.0, "loss_total": 7.035243988037109, "step": 13 }, { "batch_size": 4, "epoch": 0.0052, "step": 13, "tokens_per_device": 3892 }, { "epoch": 0.0052, "loss_ce": 5.926401138305664, "loss_lvr": 22.024871826171875, "loss_mode_switch": 0.0, "loss_total": 8.128888130187988, "step": 13 }, { "batch_size": 4, "epoch": 0.0052, "step": 13, "tokens_per_device": 5144 }, { "epoch": 0.0052, "loss_ce": 7.0778937339782715, "loss_lvr": 18.826736450195312, "loss_mode_switch": 0.0, "loss_total": 8.960567474365234, "step": 13 }, { "batch_size": 1, "epoch": 0.0052, "step": 13, "tokens_per_device": 4987 }, { "epoch": 0.0052, "loss_ce": 3.442261219024658, "loss_lvr": 20.181135177612305, "loss_mode_switch": 0.0, "loss_total": 5.46037483215332, "step": 13 }, { "batch_size": 1, "epoch": 0.0052, "step": 13, "tokens_per_device": 4926 }, { "epoch": 0.0052, "loss_ce": 5.493370532989502, "loss_lvr": 22.9511775970459, "loss_mode_switch": 0.0, "loss_total": 7.788488388061523, "step": 13 }, { "batch_size": 4, "epoch": 0.0052, "step": 13, "tokens_per_device": 5000 }, { "epoch": 0.0052, "loss_ce": 4.996932506561279, "loss_lvr": 22.068262100219727, "loss_mode_switch": 0.0, "loss_total": 7.203758716583252, "step": 13 }, { "epoch": 0.0056, "grad_norm": 104.36156463623047, "learning_rate": 1.8666666666666669e-06, "loss": 7.808, "step": 14 }, { "batch_size": 1, "epoch": 0.0056, "step": 14, "tokens_per_device": 5177 }, { "epoch": 0.0056, "loss_ce": 5.748447895050049, "loss_lvr": 23.079294204711914, "loss_mode_switch": 0.0, "loss_total": 8.056377410888672, "step": 14 }, { "batch_size": 4, "epoch": 0.0056, "step": 14, "tokens_per_device": 1500 }, { "epoch": 0.0056, "loss_ce": 4.778672218322754, "loss_lvr": 20.602092742919922, "loss_mode_switch": 0.0, "loss_total": 6.838881492614746, "step": 14 }, { "batch_size": 4, "epoch": 0.0056, "step": 14, "tokens_per_device": 1372 }, { "epoch": 0.0056, "loss_ce": 5.280740261077881, "loss_lvr": 24.900619506835938, "loss_mode_switch": 0.0, "loss_total": 7.7708024978637695, "step": 14 }, { "batch_size": 4, "epoch": 0.0056, "step": 14, "tokens_per_device": 5788 }, { "epoch": 0.0056, "loss_ce": 5.512189865112305, "loss_lvr": 23.470233917236328, "loss_mode_switch": 0.0, "loss_total": 7.859213352203369, "step": 14 }, { "batch_size": 4, "epoch": 0.0056, "step": 14, "tokens_per_device": 5916 }, { "epoch": 0.0056, "loss_ce": 6.507986068725586, "loss_lvr": 23.121639251708984, "loss_mode_switch": 0.0, "loss_total": 8.820150375366211, "step": 14 }, { "batch_size": 4, "epoch": 0.0056, "step": 14, "tokens_per_device": 2168 }, { "epoch": 0.0056, "loss_ce": 5.655284881591797, "loss_lvr": 22.113500595092773, "loss_mode_switch": 0.0, "loss_total": 7.866635322570801, "step": 14 }, { "batch_size": 4, "epoch": 0.0056, "step": 14, "tokens_per_device": 4236 }, { "epoch": 0.0056, "loss_ce": 6.364377975463867, "loss_lvr": 22.969099044799805, "loss_mode_switch": 0.0, "loss_total": 8.661288261413574, "step": 14 }, { "batch_size": 4, "epoch": 0.0056, "step": 14, "tokens_per_device": 1472 }, { "epoch": 0.0056, "loss_ce": 4.634829998016357, "loss_lvr": 23.539072036743164, "loss_mode_switch": 0.0, "loss_total": 6.988737106323242, "step": 14 }, { "epoch": 0.006, "grad_norm": 94.64962768554688, "learning_rate": 2.0000000000000003e-06, "loss": 7.6229, "step": 15 }, { "batch_size": 4, "epoch": 0.006, "step": 15, "tokens_per_device": 4080 }, { "epoch": 0.006, "loss_ce": 5.38911247253418, "loss_lvr": 22.624467849731445, "loss_mode_switch": 0.0, "loss_total": 7.651559352874756, "step": 15 }, { "batch_size": 4, "epoch": 0.006, "step": 15, "tokens_per_device": 2644 }, { "epoch": 0.006, "loss_ce": 5.94003438949585, "loss_lvr": 22.76136016845703, "loss_mode_switch": 0.0, "loss_total": 8.216170310974121, "step": 15 }, { "batch_size": 4, "epoch": 0.006, "step": 15, "tokens_per_device": 4052 }, { "epoch": 0.006, "loss_ce": 5.025619029998779, "loss_lvr": 20.432140350341797, "loss_mode_switch": 0.0, "loss_total": 7.068833351135254, "step": 15 }, { "batch_size": 4, "epoch": 0.006, "step": 15, "tokens_per_device": 1324 }, { "epoch": 0.006, "loss_ce": 4.819749355316162, "loss_lvr": 21.30438995361328, "loss_mode_switch": 0.0, "loss_total": 6.950188636779785, "step": 15 }, { "batch_size": 4, "epoch": 0.006, "step": 15, "tokens_per_device": 6400 }, { "epoch": 0.006, "loss_ce": 5.664323329925537, "loss_lvr": 21.470029830932617, "loss_mode_switch": 0.0, "loss_total": 7.811326026916504, "step": 15 }, { "batch_size": 4, "epoch": 0.006, "step": 15, "tokens_per_device": 3360 }, { "epoch": 0.006, "loss_ce": 5.474852085113525, "loss_lvr": 23.615022659301758, "loss_mode_switch": 0.0, "loss_total": 7.8363542556762695, "step": 15 }, { "batch_size": 4, "epoch": 0.006, "step": 15, "tokens_per_device": 1592 }, { "epoch": 0.006, "loss_ce": 5.197647571563721, "loss_lvr": 22.457778930664062, "loss_mode_switch": 0.0, "loss_total": 7.443425178527832, "step": 15 }, { "batch_size": 4, "epoch": 0.006, "step": 15, "tokens_per_device": 11168 }, { "epoch": 0.006, "loss_ce": 4.810426235198975, "loss_lvr": 17.44853973388672, "loss_mode_switch": 0.0, "loss_total": 6.5552802085876465, "step": 15 }, { "epoch": 0.0064, "grad_norm": 94.33814239501953, "learning_rate": 2.133333333333334e-06, "loss": 7.4295, "step": 16 }, { "batch_size": 1, "epoch": 0.0064, "step": 16, "tokens_per_device": 4940 }, { "epoch": 0.0064, "loss_ce": 5.229210376739502, "loss_lvr": 16.638084411621094, "loss_mode_switch": 0.0, "loss_total": 6.89301872253418, "step": 16 }, { "batch_size": 1, "epoch": 0.0064, "step": 16, "tokens_per_device": 5790 }, { "epoch": 0.0064, "loss_ce": 5.00925350189209, "loss_lvr": 17.82498550415039, "loss_mode_switch": 0.0, "loss_total": 6.791751861572266, "step": 16 }, { "batch_size": 4, "epoch": 0.0064, "step": 16, "tokens_per_device": 1500 }, { "epoch": 0.0064, "loss_ce": 3.4235575199127197, "loss_lvr": 19.87352752685547, "loss_mode_switch": 0.0, "loss_total": 5.410910129547119, "step": 16 }, { "batch_size": 4, "epoch": 0.0064, "step": 16, "tokens_per_device": 4208 }, { "epoch": 0.0064, "loss_ce": 4.653353691101074, "loss_lvr": 19.00723648071289, "loss_mode_switch": 0.0, "loss_total": 6.5540771484375, "step": 16 }, { "batch_size": 1, "epoch": 0.0064, "step": 16, "tokens_per_device": 4720 }, { "epoch": 0.0064, "loss_ce": 4.876465320587158, "loss_lvr": 14.097358703613281, "loss_mode_switch": 0.0, "loss_total": 6.286201477050781, "step": 16 }, { "batch_size": 4, "epoch": 0.0064, "step": 16, "tokens_per_device": 1652 }, { "epoch": 0.0064, "loss_ce": 3.884666919708252, "loss_lvr": 18.544588088989258, "loss_mode_switch": 0.0, "loss_total": 5.739125728607178, "step": 16 }, { "batch_size": 4, "epoch": 0.0064, "step": 16, "tokens_per_device": 4192 }, { "epoch": 0.0064, "loss_ce": 4.3385009765625, "loss_lvr": 20.049402236938477, "loss_mode_switch": 0.0, "loss_total": 6.343441009521484, "step": 16 }, { "batch_size": 4, "epoch": 0.0064, "step": 16, "tokens_per_device": 4728 }, { "epoch": 0.0064, "loss_ce": 3.696815013885498, "loss_lvr": 18.65981101989746, "loss_mode_switch": 0.0, "loss_total": 5.562796115875244, "step": 16 }, { "epoch": 0.0068, "grad_norm": 63.983665466308594, "learning_rate": 2.266666666666667e-06, "loss": 6.1624, "step": 17 }, { "batch_size": 1, "epoch": 0.0068, "step": 17, "tokens_per_device": 6496 }, { "epoch": 0.0068, "loss_ce": 4.638586044311523, "loss_lvr": 17.824140548706055, "loss_mode_switch": 0.0, "loss_total": 6.421000003814697, "step": 17 }, { "batch_size": 4, "epoch": 0.0068, "step": 17, "tokens_per_device": 4532 }, { "epoch": 0.0068, "loss_ce": 4.161534309387207, "loss_lvr": 18.476598739624023, "loss_mode_switch": 0.0, "loss_total": 6.009194374084473, "step": 17 }, { "batch_size": 4, "epoch": 0.0068, "step": 17, "tokens_per_device": 10200 }, { "epoch": 0.0068, "loss_ce": 3.5715792179107666, "loss_lvr": 16.095760345458984, "loss_mode_switch": 0.0, "loss_total": 5.181155204772949, "step": 17 }, { "batch_size": 4, "epoch": 0.0068, "step": 17, "tokens_per_device": 1424 }, { "epoch": 0.0068, "loss_ce": 3.8048183917999268, "loss_lvr": 19.336212158203125, "loss_mode_switch": 0.0, "loss_total": 5.738439559936523, "step": 17 }, { "batch_size": 1, "epoch": 0.0068, "step": 17, "tokens_per_device": 5015 }, { "epoch": 0.0068, "loss_ce": 2.8361690044403076, "loss_lvr": 20.43886947631836, "loss_mode_switch": 0.0, "loss_total": 4.880055904388428, "step": 17 }, { "batch_size": 4, "epoch": 0.0068, "step": 17, "tokens_per_device": 9824 }, { "epoch": 0.0068, "loss_ce": 3.9718291759490967, "loss_lvr": 20.18190574645996, "loss_mode_switch": 0.0, "loss_total": 5.990019798278809, "step": 17 }, { "batch_size": 1, "epoch": 0.0068, "step": 17, "tokens_per_device": 5120 }, { "epoch": 0.0068, "loss_ce": 5.0193257331848145, "loss_lvr": 16.589221954345703, "loss_mode_switch": 0.0, "loss_total": 6.678247928619385, "step": 17 }, { "batch_size": 4, "epoch": 0.0068, "step": 17, "tokens_per_device": 3012 }, { "epoch": 0.0068, "loss_ce": 4.144783020019531, "loss_lvr": 15.79773235321045, "loss_mode_switch": 0.0, "loss_total": 5.7245564460754395, "step": 17 }, { "epoch": 0.0072, "grad_norm": 69.14779663085938, "learning_rate": 2.4000000000000003e-06, "loss": 6.0051, "step": 18 }, { "batch_size": 1, "epoch": 0.0072, "step": 18, "tokens_per_device": 5038 }, { "epoch": 0.0072, "loss_ce": 4.843437194824219, "loss_lvr": 14.377240180969238, "loss_mode_switch": 0.0, "loss_total": 6.281161308288574, "step": 18 }, { "batch_size": 4, "epoch": 0.0072, "step": 18, "tokens_per_device": 3888 }, { "epoch": 0.0072, "loss_ce": 3.9416661262512207, "loss_lvr": 17.46573829650879, "loss_mode_switch": 0.0, "loss_total": 5.688240051269531, "step": 18 }, { "batch_size": 4, "epoch": 0.0072, "step": 18, "tokens_per_device": 2528 }, { "epoch": 0.0072, "loss_ce": 3.869021415710449, "loss_lvr": 18.162086486816406, "loss_mode_switch": 0.0, "loss_total": 5.685230255126953, "step": 18 }, { "batch_size": 1, "epoch": 0.0072, "step": 18, "tokens_per_device": 4926 }, { "epoch": 0.0072, "loss_ce": 3.7575154304504395, "loss_lvr": 18.718250274658203, "loss_mode_switch": 0.0, "loss_total": 5.629340648651123, "step": 18 }, { "batch_size": 1, "epoch": 0.0072, "step": 18, "tokens_per_device": 4902 }, { "epoch": 0.0072, "loss_ce": 4.456331253051758, "loss_lvr": 14.905547142028809, "loss_mode_switch": 0.0, "loss_total": 5.94688606262207, "step": 18 }, { "batch_size": 4, "epoch": 0.0072, "step": 18, "tokens_per_device": 2724 }, { "epoch": 0.0072, "loss_ce": 3.8503849506378174, "loss_lvr": 17.109699249267578, "loss_mode_switch": 0.0, "loss_total": 5.561354637145996, "step": 18 }, { "batch_size": 4, "epoch": 0.0072, "step": 18, "tokens_per_device": 1576 }, { "epoch": 0.0072, "loss_ce": 3.9777050018310547, "loss_lvr": 16.940473556518555, "loss_mode_switch": 0.0, "loss_total": 5.671752452850342, "step": 18 }, { "batch_size": 1, "epoch": 0.0072, "step": 18, "tokens_per_device": 5855 }, { "epoch": 0.0072, "loss_ce": 4.9004340171813965, "loss_lvr": 16.299304962158203, "loss_mode_switch": 0.0, "loss_total": 6.530364513397217, "step": 18 }, { "epoch": 0.0076, "grad_norm": 63.1754264831543, "learning_rate": 2.5333333333333338e-06, "loss": 5.7464, "step": 19 }, { "batch_size": 4, "epoch": 0.0076, "step": 19, "tokens_per_device": 9556 }, { "epoch": 0.0076, "loss_ce": 3.710336923599243, "loss_lvr": 15.693078994750977, "loss_mode_switch": 0.0, "loss_total": 5.279644966125488, "step": 19 }, { "batch_size": 4, "epoch": 0.0076, "step": 19, "tokens_per_device": 11084 }, { "epoch": 0.0076, "loss_ce": 3.123806953430176, "loss_lvr": 17.552082061767578, "loss_mode_switch": 0.0, "loss_total": 4.87901496887207, "step": 19 }, { "batch_size": 4, "epoch": 0.0076, "step": 19, "tokens_per_device": 4300 }, { "epoch": 0.0076, "loss_ce": 3.404784917831421, "loss_lvr": 16.43052101135254, "loss_mode_switch": 0.0, "loss_total": 5.047837257385254, "step": 19 }, { "batch_size": 4, "epoch": 0.0076, "step": 19, "tokens_per_device": 6548 }, { "epoch": 0.0076, "loss_ce": 3.821160316467285, "loss_lvr": 19.190135955810547, "loss_mode_switch": 0.0, "loss_total": 5.740173816680908, "step": 19 }, { "batch_size": 4, "epoch": 0.0076, "step": 19, "tokens_per_device": 1784 }, { "epoch": 0.0076, "loss_ce": 3.3449034690856934, "loss_lvr": 15.569193840026855, "loss_mode_switch": 0.0, "loss_total": 4.901823043823242, "step": 19 }, { "batch_size": 4, "epoch": 0.0076, "step": 19, "tokens_per_device": 2676 }, { "epoch": 0.0076, "loss_ce": 2.799102544784546, "loss_lvr": 17.166976928710938, "loss_mode_switch": 0.0, "loss_total": 4.515800476074219, "step": 19 }, { "batch_size": 1, "epoch": 0.0076, "step": 19, "tokens_per_device": 4896 }, { "epoch": 0.0076, "loss_ce": 2.926323175430298, "loss_lvr": 25.54304313659668, "loss_mode_switch": 0.0, "loss_total": 5.480627536773682, "step": 19 }, { "batch_size": 4, "epoch": 0.0076, "step": 19, "tokens_per_device": 6356 }, { "epoch": 0.0076, "loss_ce": 4.2440876960754395, "loss_lvr": 17.132596969604492, "loss_mode_switch": 0.0, "loss_total": 5.957347393035889, "step": 19 }, { "epoch": 0.008, "grad_norm": 60.19755172729492, "learning_rate": 2.666666666666667e-06, "loss": 5.6776, "step": 20 }, { "batch_size": 4, "epoch": 0.008, "step": 20, "tokens_per_device": 3468 }, { "epoch": 0.008, "loss_ce": 3.4761500358581543, "loss_lvr": 16.34528923034668, "loss_mode_switch": 0.0, "loss_total": 5.1106791496276855, "step": 20 }, { "batch_size": 4, "epoch": 0.008, "step": 20, "tokens_per_device": 1248 }, { "epoch": 0.008, "loss_ce": 3.1073367595672607, "loss_lvr": 15.57971477508545, "loss_mode_switch": 0.0, "loss_total": 4.665307998657227, "step": 20 }, { "batch_size": 4, "epoch": 0.008, "step": 20, "tokens_per_device": 1708 }, { "epoch": 0.008, "loss_ce": 4.531092166900635, "loss_lvr": 15.079303741455078, "loss_mode_switch": 0.0, "loss_total": 6.039022445678711, "step": 20 }, { "batch_size": 4, "epoch": 0.008, "step": 20, "tokens_per_device": 11120 }, { "epoch": 0.008, "loss_ce": 3.4468536376953125, "loss_lvr": 16.075237274169922, "loss_mode_switch": 0.0, "loss_total": 5.054377555847168, "step": 20 }, { "batch_size": 1, "epoch": 0.008, "step": 20, "tokens_per_device": 4900 }, { "epoch": 0.008, "loss_ce": 3.201990842819214, "loss_lvr": 18.1978759765625, "loss_mode_switch": 0.0, "loss_total": 5.021778583526611, "step": 20 }, { "batch_size": 4, "epoch": 0.008, "step": 20, "tokens_per_device": 4324 }, { "epoch": 0.008, "loss_ce": 3.290093421936035, "loss_lvr": 15.480781555175781, "loss_mode_switch": 0.0, "loss_total": 4.838171482086182, "step": 20 }, { "batch_size": 4, "epoch": 0.008, "step": 20, "tokens_per_device": 3904 }, { "epoch": 0.008, "loss_ce": 4.3743486404418945, "loss_lvr": 15.237995147705078, "loss_mode_switch": 0.0, "loss_total": 5.898148059844971, "step": 20 }, { "batch_size": 4, "epoch": 0.008, "step": 20, "tokens_per_device": 4248 }, { "epoch": 0.008, "loss_ce": 4.392786026000977, "loss_lvr": 16.020784378051758, "loss_mode_switch": 0.0, "loss_total": 5.994864463806152, "step": 20 }, { "epoch": 0.0084, "grad_norm": 48.34164047241211, "learning_rate": 2.8000000000000003e-06, "loss": 5.3213, "step": 21 }, { "batch_size": 4, "epoch": 0.0084, "step": 21, "tokens_per_device": 6496 }, { "epoch": 0.0084, "loss_ce": 3.118898630142212, "loss_lvr": 18.938518524169922, "loss_mode_switch": 0.0, "loss_total": 5.012750625610352, "step": 21 }, { "batch_size": 1, "epoch": 0.0084, "step": 21, "tokens_per_device": 5159 }, { "epoch": 0.0084, "loss_ce": 1.919736385345459, "loss_lvr": 16.46965980529785, "loss_mode_switch": 0.0, "loss_total": 3.566702365875244, "step": 21 }, { "batch_size": 1, "epoch": 0.0084, "step": 21, "tokens_per_device": 5075 }, { "epoch": 0.0084, "loss_ce": 4.031325817108154, "loss_lvr": 13.50245475769043, "loss_mode_switch": 0.0, "loss_total": 5.381571292877197, "step": 21 }, { "batch_size": 1, "epoch": 0.0084, "step": 21, "tokens_per_device": 5098 }, { "epoch": 0.0084, "loss_ce": 4.352339267730713, "loss_lvr": 18.302122116088867, "loss_mode_switch": 0.0, "loss_total": 6.182551383972168, "step": 21 }, { "batch_size": 4, "epoch": 0.0084, "step": 21, "tokens_per_device": 4364 }, { "epoch": 0.0084, "loss_ce": 3.5632731914520264, "loss_lvr": 16.012908935546875, "loss_mode_switch": 0.0, "loss_total": 5.16456413269043, "step": 21 }, { "batch_size": 4, "epoch": 0.0084, "step": 21, "tokens_per_device": 4508 }, { "epoch": 0.0084, "loss_ce": 3.646364688873291, "loss_lvr": 15.8263521194458, "loss_mode_switch": 0.0, "loss_total": 5.229000091552734, "step": 21 }, { "batch_size": 1, "epoch": 0.0084, "step": 21, "tokens_per_device": 4899 }, { "epoch": 0.0084, "loss_ce": 2.2203142642974854, "loss_lvr": 17.58533477783203, "loss_mode_switch": 0.0, "loss_total": 3.9788477420806885, "step": 21 }, { "batch_size": 4, "epoch": 0.0084, "step": 21, "tokens_per_device": 4208 }, { "epoch": 0.0084, "loss_ce": 3.6108458042144775, "loss_lvr": 14.786484718322754, "loss_mode_switch": 0.0, "loss_total": 5.089494228363037, "step": 21 }, { "epoch": 0.0088, "grad_norm": 41.52472686767578, "learning_rate": 2.9333333333333338e-06, "loss": 4.9882, "step": 22 }, { "batch_size": 4, "epoch": 0.0088, "step": 22, "tokens_per_device": 4700 }, { "epoch": 0.0088, "loss_ce": 3.084892511367798, "loss_lvr": 12.223342895507812, "loss_mode_switch": 0.0, "loss_total": 4.307226657867432, "step": 22 }, { "batch_size": 4, "epoch": 0.0088, "step": 22, "tokens_per_device": 1548 }, { "epoch": 0.0088, "loss_ce": 3.0580379962921143, "loss_lvr": 11.507311820983887, "loss_mode_switch": 0.0, "loss_total": 4.20876932144165, "step": 22 }, { "batch_size": 4, "epoch": 0.0088, "step": 22, "tokens_per_device": 1412 }, { "epoch": 0.0088, "loss_ce": 2.5511980056762695, "loss_lvr": 11.675350189208984, "loss_mode_switch": 0.0, "loss_total": 3.718733072280884, "step": 22 }, { "batch_size": 4, "epoch": 0.0088, "step": 22, "tokens_per_device": 2656 }, { "epoch": 0.0088, "loss_ce": 3.5494632720947266, "loss_lvr": 11.415525436401367, "loss_mode_switch": 0.0, "loss_total": 4.691015720367432, "step": 22 }, { "batch_size": 1, "epoch": 0.0088, "step": 22, "tokens_per_device": 5018 }, { "epoch": 0.0088, "loss_ce": 3.512296438217163, "loss_lvr": 9.236138343811035, "loss_mode_switch": 0.0, "loss_total": 4.435910224914551, "step": 22 }, { "batch_size": 1, "epoch": 0.0088, "step": 22, "tokens_per_device": 6862 }, { "epoch": 0.0088, "loss_ce": 3.481968641281128, "loss_lvr": 12.203254699707031, "loss_mode_switch": 0.0, "loss_total": 4.70229434967041, "step": 22 }, { "batch_size": 4, "epoch": 0.0088, "step": 22, "tokens_per_device": 4156 }, { "epoch": 0.0088, "loss_ce": 3.2860069274902344, "loss_lvr": 10.037994384765625, "loss_mode_switch": 0.0, "loss_total": 4.289806365966797, "step": 22 }, { "batch_size": 4, "epoch": 0.0088, "step": 22, "tokens_per_device": 2560 }, { "epoch": 0.0088, "loss_ce": 2.991959571838379, "loss_lvr": 11.088516235351562, "loss_mode_switch": 0.0, "loss_total": 4.100811004638672, "step": 22 }, { "epoch": 0.0092, "grad_norm": 36.20014190673828, "learning_rate": 3.066666666666667e-06, "loss": 4.31, "step": 23 }, { "batch_size": 4, "epoch": 0.0092, "step": 23, "tokens_per_device": 4068 }, { "epoch": 0.0092, "loss_ce": 2.8102028369903564, "loss_lvr": 8.863232612609863, "loss_mode_switch": 0.0, "loss_total": 3.696526050567627, "step": 23 }, { "batch_size": 4, "epoch": 0.0092, "step": 23, "tokens_per_device": 4576 }, { "epoch": 0.0092, "loss_ce": 2.930541515350342, "loss_lvr": 13.893136024475098, "loss_mode_switch": 0.0, "loss_total": 4.319855213165283, "step": 23 }, { "batch_size": 4, "epoch": 0.0092, "step": 23, "tokens_per_device": 4096 }, { "epoch": 0.0092, "loss_ce": 2.731340169906616, "loss_lvr": 8.314614295959473, "loss_mode_switch": 0.0, "loss_total": 3.5628015995025635, "step": 23 }, { "batch_size": 4, "epoch": 0.0092, "step": 23, "tokens_per_device": 3760 }, { "epoch": 0.0092, "loss_ce": 2.8982009887695312, "loss_lvr": 10.365588188171387, "loss_mode_switch": 0.0, "loss_total": 3.9347598552703857, "step": 23 }, { "batch_size": 1, "epoch": 0.0092, "step": 23, "tokens_per_device": 4885 }, { "epoch": 0.0092, "loss_ce": 3.175828456878662, "loss_lvr": 11.701010704040527, "loss_mode_switch": 0.0, "loss_total": 4.3459296226501465, "step": 23 }, { "batch_size": 4, "epoch": 0.0092, "step": 23, "tokens_per_device": 2060 }, { "epoch": 0.0092, "loss_ce": 3.0303456783294678, "loss_lvr": 8.3988618850708, "loss_mode_switch": 0.0, "loss_total": 3.870231866836548, "step": 23 }, { "batch_size": 1, "epoch": 0.0092, "step": 23, "tokens_per_device": 4588 }, { "epoch": 0.0092, "loss_ce": 3.5676586627960205, "loss_lvr": 9.354952812194824, "loss_mode_switch": 0.0, "loss_total": 4.5031538009643555, "step": 23 }, { "batch_size": 1, "epoch": 0.0092, "step": 23, "tokens_per_device": 4914 }, { "epoch": 0.0092, "loss_ce": 2.69370436668396, "loss_lvr": 12.632869720458984, "loss_mode_switch": 0.0, "loss_total": 3.956991195678711, "step": 23 }, { "epoch": 0.0096, "grad_norm": 34.18856430053711, "learning_rate": 3.2000000000000003e-06, "loss": 3.9302, "step": 24 }, { "batch_size": 4, "epoch": 0.0096, "step": 24, "tokens_per_device": 2788 }, { "epoch": 0.0096, "loss_ce": 2.8380987644195557, "loss_lvr": 7.141580581665039, "loss_mode_switch": 0.0, "loss_total": 3.5522568225860596, "step": 24 }, { "batch_size": 1, "epoch": 0.0096, "step": 24, "tokens_per_device": 4888 }, { "epoch": 0.0096, "loss_ce": 1.9589118957519531, "loss_lvr": 11.177289962768555, "loss_mode_switch": 0.0, "loss_total": 3.076641082763672, "step": 24 }, { "batch_size": 1, "epoch": 0.0096, "step": 24, "tokens_per_device": 7475 }, { "epoch": 0.0096, "loss_ce": 3.1266934871673584, "loss_lvr": 10.746654510498047, "loss_mode_switch": 0.0, "loss_total": 4.201358795166016, "step": 24 }, { "batch_size": 4, "epoch": 0.0096, "step": 24, "tokens_per_device": 1628 }, { "epoch": 0.0096, "loss_ce": 2.7329554557800293, "loss_lvr": 7.813350200653076, "loss_mode_switch": 0.0, "loss_total": 3.5142905712127686, "step": 24 }, { "batch_size": 4, "epoch": 0.0096, "step": 24, "tokens_per_device": 10364 }, { "epoch": 0.0096, "loss_ce": 3.300260305404663, "loss_lvr": 8.724717140197754, "loss_mode_switch": 0.0, "loss_total": 4.172731876373291, "step": 24 }, { "batch_size": 4, "epoch": 0.0096, "step": 24, "tokens_per_device": 2644 }, { "epoch": 0.0096, "loss_ce": 2.7593562602996826, "loss_lvr": 8.028122901916504, "loss_mode_switch": 0.0, "loss_total": 3.562168598175049, "step": 24 }, { "batch_size": 4, "epoch": 0.0096, "step": 24, "tokens_per_device": 11168 }, { "epoch": 0.0096, "loss_ce": 3.0250070095062256, "loss_lvr": 7.428744316101074, "loss_mode_switch": 0.0, "loss_total": 3.767881393432617, "step": 24 }, { "batch_size": 4, "epoch": 0.0096, "step": 24, "tokens_per_device": 4884 }, { "epoch": 0.0096, "loss_ce": 2.835153341293335, "loss_lvr": 7.478569984436035, "loss_mode_switch": 0.0, "loss_total": 3.58301043510437, "step": 24 }, { "epoch": 0.01, "grad_norm": 29.70092010498047, "learning_rate": 3.3333333333333333e-06, "loss": 3.6449, "step": 25 }, { "batch_size": 4, "epoch": 0.01, "step": 25, "tokens_per_device": 1256 }, { "epoch": 0.01, "loss_ce": 2.1664555072784424, "loss_lvr": 7.257518291473389, "loss_mode_switch": 0.0, "loss_total": 2.892207384109497, "step": 25 }, { "batch_size": 4, "epoch": 0.01, "step": 25, "tokens_per_device": 4188 }, { "epoch": 0.01, "loss_ce": 2.3104307651519775, "loss_lvr": 7.179269313812256, "loss_mode_switch": 0.0, "loss_total": 3.028357744216919, "step": 25 }, { "batch_size": 4, "epoch": 0.01, "step": 25, "tokens_per_device": 5664 }, { "epoch": 0.01, "loss_ce": 2.966902017593384, "loss_lvr": 7.004391193389893, "loss_mode_switch": 0.0, "loss_total": 3.6673412322998047, "step": 25 }, { "batch_size": 4, "epoch": 0.01, "step": 25, "tokens_per_device": 3880 }, { "epoch": 0.01, "loss_ce": 2.8692445755004883, "loss_lvr": 7.129571437835693, "loss_mode_switch": 0.0, "loss_total": 3.5822017192840576, "step": 25 }, { "batch_size": 1, "epoch": 0.01, "step": 25, "tokens_per_device": 4925 }, { "epoch": 0.01, "loss_ce": 2.0404796600341797, "loss_lvr": 12.323954582214355, "loss_mode_switch": 0.0, "loss_total": 3.2728753089904785, "step": 25 }, { "batch_size": 1, "epoch": 0.01, "step": 25, "tokens_per_device": 4887 }, { "epoch": 0.01, "loss_ce": 2.3568832874298096, "loss_lvr": 9.522704124450684, "loss_mode_switch": 0.0, "loss_total": 3.3091537952423096, "step": 25 }, { "batch_size": 1, "epoch": 0.01, "step": 25, "tokens_per_device": 4857 }, { "epoch": 0.01, "loss_ce": 1.6232949495315552, "loss_lvr": 7.565703392028809, "loss_mode_switch": 0.0, "loss_total": 2.3798651695251465, "step": 25 }, { "batch_size": 1, "epoch": 0.01, "step": 25, "tokens_per_device": 7937 }, { "epoch": 0.01, "loss_ce": 3.196444272994995, "loss_lvr": 7.27596378326416, "loss_mode_switch": 0.0, "loss_total": 3.9240407943725586, "step": 25 }, { "epoch": 0.0104, "grad_norm": 26.13296127319336, "learning_rate": 3.4666666666666672e-06, "loss": 3.3614, "step": 26 }, { "batch_size": 4, "epoch": 0.0104, "step": 26, "tokens_per_device": 4364 }, { "epoch": 0.0104, "loss_ce": 2.344364643096924, "loss_lvr": 5.978583335876465, "loss_mode_switch": 0.0, "loss_total": 2.942223072052002, "step": 26 }, { "batch_size": 1, "epoch": 0.0104, "step": 26, "tokens_per_device": 4825 }, { "epoch": 0.0104, "loss_ce": 2.40194034576416, "loss_lvr": 9.833507537841797, "loss_mode_switch": 0.0, "loss_total": 3.38529109954834, "step": 26 }, { "batch_size": 1, "epoch": 0.0104, "step": 26, "tokens_per_device": 6309 }, { "epoch": 0.0104, "loss_ce": 2.643767833709717, "loss_lvr": 7.149838924407959, "loss_mode_switch": 0.0, "loss_total": 3.3587517738342285, "step": 26 }, { "batch_size": 1, "epoch": 0.0104, "step": 26, "tokens_per_device": 4803 }, { "epoch": 0.0104, "loss_ce": 2.2095165252685547, "loss_lvr": 7.896086692810059, "loss_mode_switch": 0.0, "loss_total": 2.9991252422332764, "step": 26 }, { "batch_size": 4, "epoch": 0.0104, "step": 26, "tokens_per_device": 4392 }, { "epoch": 0.0104, "loss_ce": 2.291182041168213, "loss_lvr": 6.4083051681518555, "loss_mode_switch": 0.0, "loss_total": 2.9320125579833984, "step": 26 }, { "batch_size": 4, "epoch": 0.0104, "step": 26, "tokens_per_device": 3320 }, { "epoch": 0.0104, "loss_ce": 2.2927989959716797, "loss_lvr": 6.285375595092773, "loss_mode_switch": 0.0, "loss_total": 2.9213366508483887, "step": 26 }, { "batch_size": 4, "epoch": 0.0104, "step": 26, "tokens_per_device": 4332 }, { "epoch": 0.0104, "loss_ce": 2.4623289108276367, "loss_lvr": 6.557909965515137, "loss_mode_switch": 0.0, "loss_total": 3.118119955062866, "step": 26 }, { "batch_size": 4, "epoch": 0.0104, "step": 26, "tokens_per_device": 4548 }, { "epoch": 0.0104, "loss_ce": 2.7299587726593018, "loss_lvr": 6.254184246063232, "loss_mode_switch": 0.0, "loss_total": 3.355377197265625, "step": 26 }, { "epoch": 0.0108, "grad_norm": 23.515398025512695, "learning_rate": 3.6000000000000003e-06, "loss": 3.1789, "step": 27 }, { "batch_size": 4, "epoch": 0.0108, "step": 27, "tokens_per_device": 2864 }, { "epoch": 0.0108, "loss_ce": 2.3815572261810303, "loss_lvr": 5.3581671714782715, "loss_mode_switch": 0.0, "loss_total": 2.9173738956451416, "step": 27 }, { "batch_size": 4, "epoch": 0.0108, "step": 27, "tokens_per_device": 5892 }, { "epoch": 0.0108, "loss_ce": 2.6890017986297607, "loss_lvr": 5.253181457519531, "loss_mode_switch": 0.0, "loss_total": 3.214319944381714, "step": 27 }, { "batch_size": 4, "epoch": 0.0108, "step": 27, "tokens_per_device": 4240 }, { "epoch": 0.0108, "loss_ce": 2.209120988845825, "loss_lvr": 5.599902629852295, "loss_mode_switch": 0.0, "loss_total": 2.769111156463623, "step": 27 }, { "batch_size": 4, "epoch": 0.0108, "step": 27, "tokens_per_device": 4120 }, { "epoch": 0.0108, "loss_ce": 1.950025200843811, "loss_lvr": 6.453608989715576, "loss_mode_switch": 0.0, "loss_total": 2.595386028289795, "step": 27 }, { "batch_size": 1, "epoch": 0.0108, "step": 27, "tokens_per_device": 4897 }, { "epoch": 0.0108, "loss_ce": 2.2030534744262695, "loss_lvr": 7.905365467071533, "loss_mode_switch": 0.0, "loss_total": 2.9935901165008545, "step": 27 }, { "batch_size": 4, "epoch": 0.0108, "step": 27, "tokens_per_device": 1772 }, { "epoch": 0.0108, "loss_ce": 1.7376585006713867, "loss_lvr": 6.097833156585693, "loss_mode_switch": 0.0, "loss_total": 2.3474419116973877, "step": 27 }, { "batch_size": 4, "epoch": 0.0108, "step": 27, "tokens_per_device": 4192 }, { "epoch": 0.0108, "loss_ce": 2.2145514488220215, "loss_lvr": 5.3429436683654785, "loss_mode_switch": 0.0, "loss_total": 2.7488458156585693, "step": 27 }, { "batch_size": 4, "epoch": 0.0108, "step": 27, "tokens_per_device": 4184 }, { "epoch": 0.0108, "loss_ce": 2.3709511756896973, "loss_lvr": 5.713209629058838, "loss_mode_switch": 0.0, "loss_total": 2.942272186279297, "step": 27 }, { "epoch": 0.0112, "grad_norm": 19.41569709777832, "learning_rate": 3.7333333333333337e-06, "loss": 2.8786, "step": 28 }, { "batch_size": 1, "epoch": 0.0112, "step": 28, "tokens_per_device": 4895 }, { "epoch": 0.0112, "loss_ce": 1.6380600929260254, "loss_lvr": 6.576527118682861, "loss_mode_switch": 0.0, "loss_total": 2.295712947845459, "step": 28 }, { "batch_size": 1, "epoch": 0.0112, "step": 28, "tokens_per_device": 6787 }, { "epoch": 0.0112, "loss_ce": 2.368134021759033, "loss_lvr": 5.471785068511963, "loss_mode_switch": 0.0, "loss_total": 2.9153125286102295, "step": 28 }, { "batch_size": 4, "epoch": 0.0112, "step": 28, "tokens_per_device": 1600 }, { "epoch": 0.0112, "loss_ce": 2.001267910003662, "loss_lvr": 5.176748275756836, "loss_mode_switch": 0.0, "loss_total": 2.5189428329467773, "step": 28 }, { "batch_size": 1, "epoch": 0.0112, "step": 28, "tokens_per_device": 4897 }, { "epoch": 0.0112, "loss_ce": 1.280647873878479, "loss_lvr": 7.128549575805664, "loss_mode_switch": 0.0, "loss_total": 1.9935028553009033, "step": 28 }, { "batch_size": 4, "epoch": 0.0112, "step": 28, "tokens_per_device": 5168 }, { "epoch": 0.0112, "loss_ce": 2.169060468673706, "loss_lvr": 6.250667095184326, "loss_mode_switch": 0.0, "loss_total": 2.7941272258758545, "step": 28 }, { "batch_size": 4, "epoch": 0.0112, "step": 28, "tokens_per_device": 4832 }, { "epoch": 0.0112, "loss_ce": 2.274313449859619, "loss_lvr": 5.659326076507568, "loss_mode_switch": 0.0, "loss_total": 2.8402462005615234, "step": 28 }, { "batch_size": 4, "epoch": 0.0112, "step": 28, "tokens_per_device": 4172 }, { "epoch": 0.0112, "loss_ce": 2.0599472522735596, "loss_lvr": 6.364450454711914, "loss_mode_switch": 0.0, "loss_total": 2.696392297744751, "step": 28 }, { "batch_size": 4, "epoch": 0.0112, "step": 28, "tokens_per_device": 1396 }, { "epoch": 0.0112, "loss_ce": 2.231175184249878, "loss_lvr": 5.125461101531982, "loss_mode_switch": 0.0, "loss_total": 2.7437212467193604, "step": 28 }, { "epoch": 0.0116, "grad_norm": 15.494439125061035, "learning_rate": 3.866666666666667e-06, "loss": 2.6109, "step": 29 }, { "batch_size": 4, "epoch": 0.0116, "step": 29, "tokens_per_device": 2676 }, { "epoch": 0.0116, "loss_ce": 1.8603672981262207, "loss_lvr": 4.060234546661377, "loss_mode_switch": 0.0, "loss_total": 2.266390800476074, "step": 29 }, { "batch_size": 1, "epoch": 0.0116, "step": 29, "tokens_per_device": 5161 }, { "epoch": 0.0116, "loss_ce": 1.7391865253448486, "loss_lvr": 5.250158786773682, "loss_mode_switch": 0.0, "loss_total": 2.264202356338501, "step": 29 }, { "batch_size": 4, "epoch": 0.0116, "step": 29, "tokens_per_device": 2584 }, { "epoch": 0.0116, "loss_ce": 2.1419897079467773, "loss_lvr": 4.459306240081787, "loss_mode_switch": 0.0, "loss_total": 2.5879204273223877, "step": 29 }, { "batch_size": 4, "epoch": 0.0116, "step": 29, "tokens_per_device": 5904 }, { "epoch": 0.0116, "loss_ce": 2.4664392471313477, "loss_lvr": 4.5486578941345215, "loss_mode_switch": 0.0, "loss_total": 2.921304941177368, "step": 29 }, { "batch_size": 4, "epoch": 0.0116, "step": 29, "tokens_per_device": 4716 }, { "epoch": 0.0116, "loss_ce": 1.9122684001922607, "loss_lvr": 4.057705402374268, "loss_mode_switch": 0.0, "loss_total": 2.3180389404296875, "step": 29 }, { "batch_size": 4, "epoch": 0.0116, "step": 29, "tokens_per_device": 3796 }, { "epoch": 0.0116, "loss_ce": 1.9761898517608643, "loss_lvr": 3.8106987476348877, "loss_mode_switch": 0.0, "loss_total": 2.357259750366211, "step": 29 }, { "batch_size": 4, "epoch": 0.0116, "step": 29, "tokens_per_device": 2940 }, { "epoch": 0.0116, "loss_ce": 2.2222096920013428, "loss_lvr": 3.7024993896484375, "loss_mode_switch": 0.0, "loss_total": 2.5924596786499023, "step": 29 }, { "batch_size": 4, "epoch": 0.0116, "step": 29, "tokens_per_device": 3056 }, { "epoch": 0.0116, "loss_ce": 2.228783130645752, "loss_lvr": 4.937905311584473, "loss_mode_switch": 0.0, "loss_total": 2.722573757171631, "step": 29 }, { "epoch": 0.012, "grad_norm": 14.328296661376953, "learning_rate": 4.000000000000001e-06, "loss": 2.5823, "step": 30 }, { "batch_size": 1, "epoch": 0.012, "step": 30, "tokens_per_device": 6599 }, { "epoch": 0.012, "loss_ce": 1.5969276428222656, "loss_lvr": 5.090417385101318, "loss_mode_switch": 0.0, "loss_total": 2.1059694290161133, "step": 30 }, { "batch_size": 4, "epoch": 0.012, "step": 30, "tokens_per_device": 3828 }, { "epoch": 0.012, "loss_ce": 1.7286759614944458, "loss_lvr": 3.4625344276428223, "loss_mode_switch": 0.0, "loss_total": 2.0749294757843018, "step": 30 }, { "batch_size": 4, "epoch": 0.012, "step": 30, "tokens_per_device": 6328 }, { "epoch": 0.012, "loss_ce": 1.9716953039169312, "loss_lvr": 3.6963250637054443, "loss_mode_switch": 0.0, "loss_total": 2.3413279056549072, "step": 30 }, { "batch_size": 4, "epoch": 0.012, "step": 30, "tokens_per_device": 15980 }, { "epoch": 0.012, "loss_ce": 2.2558507919311523, "loss_lvr": 3.6535708904266357, "loss_mode_switch": 0.0, "loss_total": 2.6212079524993896, "step": 30 }, { "batch_size": 4, "epoch": 0.012, "step": 30, "tokens_per_device": 3760 }, { "epoch": 0.012, "loss_ce": 2.1775639057159424, "loss_lvr": 3.118378162384033, "loss_mode_switch": 0.0, "loss_total": 2.4894018173217773, "step": 30 }, { "batch_size": 4, "epoch": 0.012, "step": 30, "tokens_per_device": 4368 }, { "epoch": 0.012, "loss_ce": 2.0758235454559326, "loss_lvr": 2.821793794631958, "loss_mode_switch": 0.0, "loss_total": 2.3580029010772705, "step": 30 }, { "batch_size": 1, "epoch": 0.012, "step": 30, "tokens_per_device": 7073 }, { "epoch": 0.012, "loss_ce": 2.0164124965667725, "loss_lvr": 3.746462821960449, "loss_mode_switch": 0.0, "loss_total": 2.3910586833953857, "step": 30 }, { "batch_size": 4, "epoch": 0.012, "step": 30, "tokens_per_device": 2600 }, { "epoch": 0.012, "loss_ce": 2.139119863510132, "loss_lvr": 3.7454423904418945, "loss_mode_switch": 0.0, "loss_total": 2.5136640071868896, "step": 30 }, { "epoch": 0.0124, "grad_norm": 13.581076622009277, "learning_rate": 4.133333333333333e-06, "loss": 2.3581, "step": 31 }, { "batch_size": 4, "epoch": 0.0124, "step": 31, "tokens_per_device": 4672 }, { "epoch": 0.0124, "loss_ce": 1.890592336654663, "loss_lvr": 2.6450963020324707, "loss_mode_switch": 0.0, "loss_total": 2.155102014541626, "step": 31 }, { "batch_size": 4, "epoch": 0.0124, "step": 31, "tokens_per_device": 1308 }, { "epoch": 0.0124, "loss_ce": 1.844870924949646, "loss_lvr": 2.691143274307251, "loss_mode_switch": 0.0, "loss_total": 2.113985300064087, "step": 31 }, { "batch_size": 4, "epoch": 0.0124, "step": 31, "tokens_per_device": 1504 }, { "epoch": 0.0124, "loss_ce": 2.2699005603790283, "loss_lvr": 2.9137778282165527, "loss_mode_switch": 0.0, "loss_total": 2.5612783432006836, "step": 31 }, { "batch_size": 4, "epoch": 0.0124, "step": 31, "tokens_per_device": 1344 }, { "epoch": 0.0124, "loss_ce": 1.8794740438461304, "loss_lvr": 2.7679126262664795, "loss_mode_switch": 0.0, "loss_total": 2.1562652587890625, "step": 31 }, { "batch_size": 1, "epoch": 0.0124, "step": 31, "tokens_per_device": 5160 }, { "epoch": 0.0124, "loss_ce": 2.4422194957733154, "loss_lvr": 2.0723886489868164, "loss_mode_switch": 0.0, "loss_total": 2.649458408355713, "step": 31 }, { "batch_size": 4, "epoch": 0.0124, "step": 31, "tokens_per_device": 1176 }, { "epoch": 0.0124, "loss_ce": 1.9717167615890503, "loss_lvr": 3.220336437225342, "loss_mode_switch": 0.0, "loss_total": 2.293750286102295, "step": 31 }, { "batch_size": 4, "epoch": 0.0124, "step": 31, "tokens_per_device": 3828 }, { "epoch": 0.0124, "loss_ce": 2.136063814163208, "loss_lvr": 2.789041757583618, "loss_mode_switch": 0.0, "loss_total": 2.4149680137634277, "step": 31 }, { "batch_size": 4, "epoch": 0.0124, "step": 31, "tokens_per_device": 1496 }, { "epoch": 0.0124, "loss_ce": 1.8794937133789062, "loss_lvr": 3.1657140254974365, "loss_mode_switch": 0.0, "loss_total": 2.1960651874542236, "step": 31 }, { "epoch": 0.0128, "grad_norm": 12.328706741333008, "learning_rate": 4.266666666666668e-06, "loss": 2.2265, "step": 32 }, { "batch_size": 4, "epoch": 0.0128, "step": 32, "tokens_per_device": 4312 }, { "epoch": 0.0128, "loss_ce": 1.705545425415039, "loss_lvr": 2.476416826248169, "loss_mode_switch": 0.0, "loss_total": 1.953187108039856, "step": 32 }, { "batch_size": 4, "epoch": 0.0128, "step": 32, "tokens_per_device": 5112 }, { "epoch": 0.0128, "loss_ce": 1.8726141452789307, "loss_lvr": 2.427065134048462, "loss_mode_switch": 0.0, "loss_total": 2.1153206825256348, "step": 32 }, { "batch_size": 1, "epoch": 0.0128, "step": 32, "tokens_per_device": 4941 }, { "epoch": 0.0128, "loss_ce": 1.269751787185669, "loss_lvr": 2.056349754333496, "loss_mode_switch": 0.0, "loss_total": 1.4753867387771606, "step": 32 }, { "batch_size": 4, "epoch": 0.0128, "step": 32, "tokens_per_device": 5416 }, { "epoch": 0.0128, "loss_ce": 1.9015028476715088, "loss_lvr": 2.407632350921631, "loss_mode_switch": 0.0, "loss_total": 2.142266035079956, "step": 32 }, { "batch_size": 4, "epoch": 0.0128, "step": 32, "tokens_per_device": 1404 }, { "epoch": 0.0128, "loss_ce": 1.8029252290725708, "loss_lvr": 2.7881999015808105, "loss_mode_switch": 0.0, "loss_total": 2.081745147705078, "step": 32 }, { "batch_size": 4, "epoch": 0.0128, "step": 32, "tokens_per_device": 4280 }, { "epoch": 0.0128, "loss_ce": 2.0075221061706543, "loss_lvr": 2.586927652359009, "loss_mode_switch": 0.0, "loss_total": 2.2662148475646973, "step": 32 }, { "batch_size": 4, "epoch": 0.0128, "step": 32, "tokens_per_device": 4700 }, { "epoch": 0.0128, "loss_ce": 2.1700406074523926, "loss_lvr": 2.437828779220581, "loss_mode_switch": 0.0, "loss_total": 2.4138236045837402, "step": 32 }, { "batch_size": 1, "epoch": 0.0128, "step": 32, "tokens_per_device": 5110 }, { "epoch": 0.0128, "loss_ce": 1.8048057556152344, "loss_lvr": 1.8319464921951294, "loss_mode_switch": 0.0, "loss_total": 1.9880003929138184, "step": 32 }, { "epoch": 0.0132, "grad_norm": 11.84946346282959, "learning_rate": 4.4e-06, "loss": 2.1677, "step": 33 }, { "batch_size": 1, "epoch": 0.0132, "step": 33, "tokens_per_device": 5118 }, { "epoch": 0.0132, "loss_ce": 1.964670181274414, "loss_lvr": 1.7469042539596558, "loss_mode_switch": 0.0, "loss_total": 2.1393606662750244, "step": 33 }, { "batch_size": 1, "epoch": 0.0132, "step": 33, "tokens_per_device": 5115 }, { "epoch": 0.0132, "loss_ce": 1.9090509414672852, "loss_lvr": 1.8882094621658325, "loss_mode_switch": 0.0, "loss_total": 2.097871780395508, "step": 33 }, { "batch_size": 4, "epoch": 0.0132, "step": 33, "tokens_per_device": 12108 }, { "epoch": 0.0132, "loss_ce": 1.9048479795455933, "loss_lvr": 2.5019936561584473, "loss_mode_switch": 0.0, "loss_total": 2.1550474166870117, "step": 33 }, { "batch_size": 4, "epoch": 0.0132, "step": 33, "tokens_per_device": 4600 }, { "epoch": 0.0132, "loss_ce": 1.8199406862258911, "loss_lvr": 2.376858711242676, "loss_mode_switch": 0.0, "loss_total": 2.057626485824585, "step": 33 }, { "batch_size": 1, "epoch": 0.0132, "step": 33, "tokens_per_device": 4871 }, { "epoch": 0.0132, "loss_ce": 1.388524055480957, "loss_lvr": 1.8232632875442505, "loss_mode_switch": 0.0, "loss_total": 1.5708503723144531, "step": 33 }, { "batch_size": 4, "epoch": 0.0132, "step": 33, "tokens_per_device": 4376 }, { "epoch": 0.0132, "loss_ce": 2.0417847633361816, "loss_lvr": 2.5018997192382812, "loss_mode_switch": 0.0, "loss_total": 2.2919747829437256, "step": 33 }, { "batch_size": 4, "epoch": 0.0132, "step": 33, "tokens_per_device": 1412 }, { "epoch": 0.0132, "loss_ce": 2.0993785858154297, "loss_lvr": 3.346431016921997, "loss_mode_switch": 0.0, "loss_total": 2.4340217113494873, "step": 33 }, { "batch_size": 1, "epoch": 0.0132, "step": 33, "tokens_per_device": 4881 }, { "epoch": 0.0132, "loss_ce": 2.1711716651916504, "loss_lvr": 1.7260901927947998, "loss_mode_switch": 0.0, "loss_total": 2.343780755996704, "step": 33 }, { "epoch": 0.0136, "grad_norm": 12.778327941894531, "learning_rate": 4.533333333333334e-06, "loss": 2.0743, "step": 34 }, { "batch_size": 4, "epoch": 0.0136, "step": 34, "tokens_per_device": 7640 }, { "epoch": 0.0136, "loss_ce": 1.9698874950408936, "loss_lvr": 1.7578094005584717, "loss_mode_switch": 0.0, "loss_total": 2.1456685066223145, "step": 34 }, { "batch_size": 4, "epoch": 0.0136, "step": 34, "tokens_per_device": 6052 }, { "epoch": 0.0136, "loss_ce": 1.856223702430725, "loss_lvr": 2.0639493465423584, "loss_mode_switch": 0.0, "loss_total": 2.0626187324523926, "step": 34 }, { "batch_size": 4, "epoch": 0.0136, "step": 34, "tokens_per_device": 1348 }, { "epoch": 0.0136, "loss_ce": 1.8234679698944092, "loss_lvr": 2.4013803005218506, "loss_mode_switch": 0.0, "loss_total": 2.063606023788452, "step": 34 }, { "batch_size": 4, "epoch": 0.0136, "step": 34, "tokens_per_device": 15164 }, { "epoch": 0.0136, "loss_ce": 1.9315599203109741, "loss_lvr": 2.0071399211883545, "loss_mode_switch": 0.0, "loss_total": 2.1322739124298096, "step": 34 }, { "batch_size": 4, "epoch": 0.0136, "step": 34, "tokens_per_device": 3728 }, { "epoch": 0.0136, "loss_ce": 1.5334500074386597, "loss_lvr": 2.0277481079101562, "loss_mode_switch": 0.0, "loss_total": 1.7362247705459595, "step": 34 }, { "batch_size": 1, "epoch": 0.0136, "step": 34, "tokens_per_device": 5177 }, { "epoch": 0.0136, "loss_ce": 2.148355007171631, "loss_lvr": 1.6783610582351685, "loss_mode_switch": 0.0, "loss_total": 2.3161911964416504, "step": 34 }, { "batch_size": 1, "epoch": 0.0136, "step": 34, "tokens_per_device": 4779 }, { "epoch": 0.0136, "loss_ce": 1.2526301145553589, "loss_lvr": 1.3799258470535278, "loss_mode_switch": 0.0, "loss_total": 1.3906227350234985, "step": 34 }, { "batch_size": 4, "epoch": 0.0136, "step": 34, "tokens_per_device": 4992 }, { "epoch": 0.0136, "loss_ce": 1.9034407138824463, "loss_lvr": 1.9758350849151611, "loss_mode_switch": 0.0, "loss_total": 2.1010241508483887, "step": 34 }, { "epoch": 0.014, "grad_norm": 14.807323455810547, "learning_rate": 4.666666666666667e-06, "loss": 1.9923, "step": 35 }, { "batch_size": 4, "epoch": 0.014, "step": 35, "tokens_per_device": 1520 }, { "epoch": 0.014, "loss_ce": 1.7733924388885498, "loss_lvr": 2.372469663619995, "loss_mode_switch": 0.0, "loss_total": 2.0106394290924072, "step": 35 }, { "batch_size": 4, "epoch": 0.014, "step": 35, "tokens_per_device": 3768 }, { "epoch": 0.014, "loss_ce": 1.8461997509002686, "loss_lvr": 2.615565538406372, "loss_mode_switch": 0.0, "loss_total": 2.1077563762664795, "step": 35 }, { "batch_size": 1, "epoch": 0.014, "step": 35, "tokens_per_device": 5095 }, { "epoch": 0.014, "loss_ce": 1.9891517162322998, "loss_lvr": 1.832069754600525, "loss_mode_switch": 0.0, "loss_total": 2.172358751296997, "step": 35 }, { "batch_size": 1, "epoch": 0.014, "step": 35, "tokens_per_device": 5155 }, { "epoch": 0.014, "loss_ce": 1.6921826601028442, "loss_lvr": 1.4933887720108032, "loss_mode_switch": 0.0, "loss_total": 1.8415215015411377, "step": 35 }, { "batch_size": 4, "epoch": 0.014, "step": 35, "tokens_per_device": 3868 }, { "epoch": 0.014, "loss_ce": 2.1919591426849365, "loss_lvr": 2.1981446743011475, "loss_mode_switch": 0.0, "loss_total": 2.411773681640625, "step": 35 }, { "batch_size": 4, "epoch": 0.014, "step": 35, "tokens_per_device": 14468 }, { "epoch": 0.014, "loss_ce": 1.8173450231552124, "loss_lvr": 2.2479183673858643, "loss_mode_switch": 0.0, "loss_total": 2.0421369075775146, "step": 35 }, { "batch_size": 4, "epoch": 0.014, "step": 35, "tokens_per_device": 4456 }, { "epoch": 0.014, "loss_ce": 1.8824448585510254, "loss_lvr": 2.1797385215759277, "loss_mode_switch": 0.0, "loss_total": 2.10041880607605, "step": 35 }, { "batch_size": 4, "epoch": 0.014, "step": 35, "tokens_per_device": 3828 }, { "epoch": 0.014, "loss_ce": 2.1131370067596436, "loss_lvr": 2.2538695335388184, "loss_mode_switch": 0.0, "loss_total": 2.3385238647460938, "step": 35 }, { "epoch": 0.0144, "grad_norm": 123.57169342041016, "learning_rate": 4.800000000000001e-06, "loss": 2.0055, "step": 36 }, { "batch_size": 4, "epoch": 0.0144, "step": 36, "tokens_per_device": 4444 }, { "epoch": 0.0144, "loss_ce": 1.754348635673523, "loss_lvr": 1.740110993385315, "loss_mode_switch": 0.0, "loss_total": 1.9283597469329834, "step": 36 }, { "batch_size": 1, "epoch": 0.0144, "step": 36, "tokens_per_device": 4928 }, { "epoch": 0.0144, "loss_ce": 1.889966368675232, "loss_lvr": 1.2061519622802734, "loss_mode_switch": 0.0, "loss_total": 2.0105814933776855, "step": 36 }, { "batch_size": 4, "epoch": 0.0144, "step": 36, "tokens_per_device": 5292 }, { "epoch": 0.0144, "loss_ce": 1.5894302129745483, "loss_lvr": 1.60646390914917, "loss_mode_switch": 0.0, "loss_total": 1.7500766515731812, "step": 36 }, { "batch_size": 4, "epoch": 0.0144, "step": 36, "tokens_per_device": 4440 }, { "epoch": 0.0144, "loss_ce": 2.1102776527404785, "loss_lvr": 2.040635585784912, "loss_mode_switch": 0.0, "loss_total": 2.3143413066864014, "step": 36 }, { "batch_size": 4, "epoch": 0.0144, "step": 36, "tokens_per_device": 4188 }, { "epoch": 0.0144, "loss_ce": 1.8906304836273193, "loss_lvr": 2.013580799102783, "loss_mode_switch": 0.0, "loss_total": 2.0919885635375977, "step": 36 }, { "batch_size": 4, "epoch": 0.0144, "step": 36, "tokens_per_device": 4640 }, { "epoch": 0.0144, "loss_ce": 1.6586925983428955, "loss_lvr": 1.8390562534332275, "loss_mode_switch": 0.0, "loss_total": 1.8425981998443604, "step": 36 }, { "batch_size": 4, "epoch": 0.0144, "step": 36, "tokens_per_device": 5716 }, { "epoch": 0.0144, "loss_ce": 1.8854504823684692, "loss_lvr": 1.4039465188980103, "loss_mode_switch": 0.0, "loss_total": 2.0258450508117676, "step": 36 }, { "batch_size": 1, "epoch": 0.0144, "step": 36, "tokens_per_device": 4899 }, { "epoch": 0.0144, "loss_ce": 1.42508864402771, "loss_lvr": 1.3788394927978516, "loss_mode_switch": 0.0, "loss_total": 1.5629725456237793, "step": 36 }, { "epoch": 0.0148, "grad_norm": 17.61133575439453, "learning_rate": 4.933333333333334e-06, "loss": 1.9369, "step": 37 }, { "batch_size": 1, "epoch": 0.0148, "step": 37, "tokens_per_device": 5016 }, { "epoch": 0.0148, "loss_ce": 2.0095713138580322, "loss_lvr": 1.2015132904052734, "loss_mode_switch": 0.0, "loss_total": 2.1297225952148438, "step": 37 }, { "batch_size": 4, "epoch": 0.0148, "step": 37, "tokens_per_device": 3744 }, { "epoch": 0.0148, "loss_ce": 1.9542378187179565, "loss_lvr": 1.7807438373565674, "loss_mode_switch": 0.0, "loss_total": 2.132312297821045, "step": 37 }, { "batch_size": 4, "epoch": 0.0148, "step": 37, "tokens_per_device": 5068 }, { "epoch": 0.0148, "loss_ce": 1.913145899772644, "loss_lvr": 1.5327050685882568, "loss_mode_switch": 0.0, "loss_total": 2.0664165019989014, "step": 37 }, { "batch_size": 1, "epoch": 0.0148, "step": 37, "tokens_per_device": 5019 }, { "epoch": 0.0148, "loss_ce": 1.5318269729614258, "loss_lvr": 0.9575895071029663, "loss_mode_switch": 0.0, "loss_total": 1.6275858879089355, "step": 37 }, { "batch_size": 4, "epoch": 0.0148, "step": 37, "tokens_per_device": 3804 }, { "epoch": 0.0148, "loss_ce": 1.756020188331604, "loss_lvr": 1.5849148035049438, "loss_mode_switch": 0.0, "loss_total": 1.9145116806030273, "step": 37 }, { "batch_size": 1, "epoch": 0.0148, "step": 37, "tokens_per_device": 5199 }, { "epoch": 0.0148, "loss_ce": 1.9002660512924194, "loss_lvr": 1.2959505319595337, "loss_mode_switch": 0.0, "loss_total": 2.0298612117767334, "step": 37 }, { "batch_size": 4, "epoch": 0.0148, "step": 37, "tokens_per_device": 3912 }, { "epoch": 0.0148, "loss_ce": 1.9195842742919922, "loss_lvr": 1.9786043167114258, "loss_mode_switch": 0.0, "loss_total": 2.1174447536468506, "step": 37 }, { "batch_size": 4, "epoch": 0.0148, "step": 37, "tokens_per_device": 5164 }, { "epoch": 0.0148, "loss_ce": 2.0120222568511963, "loss_lvr": 1.8591872453689575, "loss_mode_switch": 0.0, "loss_total": 2.1979410648345947, "step": 37 }, { "epoch": 0.0152, "grad_norm": 18.862031936645508, "learning_rate": 5.0666666666666676e-06, "loss": 1.8975, "step": 38 }, { "batch_size": 4, "epoch": 0.0152, "step": 38, "tokens_per_device": 2616 }, { "epoch": 0.0152, "loss_ce": 1.829668402671814, "loss_lvr": 1.6848790645599365, "loss_mode_switch": 0.0, "loss_total": 1.9981563091278076, "step": 38 }, { "batch_size": 4, "epoch": 0.0152, "step": 38, "tokens_per_device": 6496 }, { "epoch": 0.0152, "loss_ce": 1.8551051616668701, "loss_lvr": 1.8233308792114258, "loss_mode_switch": 0.0, "loss_total": 2.037438154220581, "step": 38 }, { "batch_size": 4, "epoch": 0.0152, "step": 38, "tokens_per_device": 1276 }, { "epoch": 0.0152, "loss_ce": 1.7171951532363892, "loss_lvr": 2.110614538192749, "loss_mode_switch": 0.0, "loss_total": 1.928256630897522, "step": 38 }, { "batch_size": 4, "epoch": 0.0152, "step": 38, "tokens_per_device": 15100 }, { "epoch": 0.0152, "loss_ce": 1.593686819076538, "loss_lvr": 1.3119760751724243, "loss_mode_switch": 0.0, "loss_total": 1.7248843908309937, "step": 38 }, { "batch_size": 4, "epoch": 0.0152, "step": 38, "tokens_per_device": 4536 }, { "epoch": 0.0152, "loss_ce": 1.6006208658218384, "loss_lvr": 1.8785300254821777, "loss_mode_switch": 0.0, "loss_total": 1.7884738445281982, "step": 38 }, { "batch_size": 1, "epoch": 0.0152, "step": 38, "tokens_per_device": 4675 }, { "epoch": 0.0152, "loss_ce": 1.591208815574646, "loss_lvr": 0.9662439227104187, "loss_mode_switch": 0.0, "loss_total": 1.6878331899642944, "step": 38 }, { "batch_size": 4, "epoch": 0.0152, "step": 38, "tokens_per_device": 5792 }, { "epoch": 0.0152, "loss_ce": 1.6985667943954468, "loss_lvr": 2.0679092407226562, "loss_mode_switch": 0.0, "loss_total": 1.9053577184677124, "step": 38 }, { "batch_size": 4, "epoch": 0.0152, "step": 38, "tokens_per_device": 4196 }, { "epoch": 0.0152, "loss_ce": 1.8609907627105713, "loss_lvr": 1.8683178424835205, "loss_mode_switch": 0.0, "loss_total": 2.0478224754333496, "step": 38 }, { "epoch": 0.0156, "grad_norm": 20.4416446685791, "learning_rate": 5.2e-06, "loss": 1.8791, "step": 39 }, { "batch_size": 4, "epoch": 0.0156, "step": 39, "tokens_per_device": 4384 }, { "epoch": 0.0156, "loss_ce": 1.7981268167495728, "loss_lvr": 2.317943811416626, "loss_mode_switch": 0.0, "loss_total": 2.029921293258667, "step": 39 }, { "batch_size": 4, "epoch": 0.0156, "step": 39, "tokens_per_device": 3840 }, { "epoch": 0.0156, "loss_ce": 1.6016286611557007, "loss_lvr": 1.9248133897781372, "loss_mode_switch": 0.0, "loss_total": 1.7941100597381592, "step": 39 }, { "batch_size": 4, "epoch": 0.0156, "step": 39, "tokens_per_device": 4508 }, { "epoch": 0.0156, "loss_ce": 1.633431077003479, "loss_lvr": 1.627648949623108, "loss_mode_switch": 0.0, "loss_total": 1.7961959838867188, "step": 39 }, { "batch_size": 4, "epoch": 0.0156, "step": 39, "tokens_per_device": 1456 }, { "epoch": 0.0156, "loss_ce": 1.7670081853866577, "loss_lvr": 1.976040005683899, "loss_mode_switch": 0.0, "loss_total": 1.9646122455596924, "step": 39 }, { "batch_size": 1, "epoch": 0.0156, "step": 39, "tokens_per_device": 4875 }, { "epoch": 0.0156, "loss_ce": 1.4511007070541382, "loss_lvr": 1.2633260488510132, "loss_mode_switch": 0.0, "loss_total": 1.5774333477020264, "step": 39 }, { "batch_size": 4, "epoch": 0.0156, "step": 39, "tokens_per_device": 1916 }, { "epoch": 0.0156, "loss_ce": 2.014538526535034, "loss_lvr": 1.637880563735962, "loss_mode_switch": 0.0, "loss_total": 2.1783266067504883, "step": 39 }, { "batch_size": 4, "epoch": 0.0156, "step": 39, "tokens_per_device": 2092 }, { "epoch": 0.0156, "loss_ce": 1.7940428256988525, "loss_lvr": 1.655232548713684, "loss_mode_switch": 0.0, "loss_total": 1.9595661163330078, "step": 39 }, { "batch_size": 4, "epoch": 0.0156, "step": 39, "tokens_per_device": 3960 }, { "epoch": 0.0156, "loss_ce": 1.5498582124710083, "loss_lvr": 1.7873250246047974, "loss_mode_switch": 0.0, "loss_total": 1.728590726852417, "step": 39 }, { "epoch": 0.016, "grad_norm": 44.21305465698242, "learning_rate": 5.333333333333334e-06, "loss": 1.9102, "step": 40 }, { "batch_size": 1, "epoch": 0.016, "step": 40, "tokens_per_device": 4991 }, { "epoch": 0.016, "loss_ce": 1.7416597604751587, "loss_lvr": 0.9479509592056274, "loss_mode_switch": 0.0, "loss_total": 1.8364548683166504, "step": 40 }, { "batch_size": 4, "epoch": 0.016, "step": 40, "tokens_per_device": 1812 }, { "epoch": 0.016, "loss_ce": 1.8920433521270752, "loss_lvr": 1.6536015272140503, "loss_mode_switch": 0.0, "loss_total": 2.057403564453125, "step": 40 }, { "batch_size": 4, "epoch": 0.016, "step": 40, "tokens_per_device": 10608 }, { "epoch": 0.016, "loss_ce": 1.6887472867965698, "loss_lvr": 1.2133047580718994, "loss_mode_switch": 0.0, "loss_total": 1.8100777864456177, "step": 40 }, { "batch_size": 4, "epoch": 0.016, "step": 40, "tokens_per_device": 9672 }, { "epoch": 0.016, "loss_ce": 1.4830496311187744, "loss_lvr": 1.4412920475006104, "loss_mode_switch": 0.0, "loss_total": 1.6271787881851196, "step": 40 }, { "batch_size": 4, "epoch": 0.016, "step": 40, "tokens_per_device": 5760 }, { "epoch": 0.016, "loss_ce": 1.472599983215332, "loss_lvr": 1.6524769067764282, "loss_mode_switch": 0.0, "loss_total": 1.637847661972046, "step": 40 }, { "batch_size": 1, "epoch": 0.016, "step": 40, "tokens_per_device": 6696 }, { "epoch": 0.016, "loss_ce": 1.731242060661316, "loss_lvr": 1.266853928565979, "loss_mode_switch": 0.0, "loss_total": 1.8579274415969849, "step": 40 }, { "batch_size": 1, "epoch": 0.016, "step": 40, "tokens_per_device": 4915 }, { "epoch": 0.016, "loss_ce": 0.981853187084198, "loss_lvr": 0.944830596446991, "loss_mode_switch": 0.0, "loss_total": 1.0763362646102905, "step": 40 }, { "batch_size": 4, "epoch": 0.016, "step": 40, "tokens_per_device": 2760 }, { "epoch": 0.016, "loss_ce": 1.8815972805023193, "loss_lvr": 2.346022605895996, "loss_mode_switch": 0.0, "loss_total": 2.116199493408203, "step": 40 }, { "epoch": 0.0164, "grad_norm": 25.275236129760742, "learning_rate": 5.466666666666667e-06, "loss": 1.8402, "step": 41 }, { "batch_size": 4, "epoch": 0.0164, "step": 41, "tokens_per_device": 1316 }, { "epoch": 0.0164, "loss_ce": 1.7634605169296265, "loss_lvr": 1.877894401550293, "loss_mode_switch": 0.0, "loss_total": 1.9512499570846558, "step": 41 }, { "batch_size": 1, "epoch": 0.0164, "step": 41, "tokens_per_device": 4861 }, { "epoch": 0.0164, "loss_ce": 1.4859201908111572, "loss_lvr": 1.1116993427276611, "loss_mode_switch": 0.0, "loss_total": 1.5970901250839233, "step": 41 }, { "batch_size": 4, "epoch": 0.0164, "step": 41, "tokens_per_device": 10704 }, { "epoch": 0.0164, "loss_ce": 1.8879729509353638, "loss_lvr": 1.6752508878707886, "loss_mode_switch": 0.0, "loss_total": 2.0554981231689453, "step": 41 }, { "batch_size": 4, "epoch": 0.0164, "step": 41, "tokens_per_device": 4000 }, { "epoch": 0.0164, "loss_ce": 1.628060221672058, "loss_lvr": 1.9861325025558472, "loss_mode_switch": 0.0, "loss_total": 1.8266735076904297, "step": 41 }, { "batch_size": 1, "epoch": 0.0164, "step": 41, "tokens_per_device": 5413 }, { "epoch": 0.0164, "loss_ce": 1.5774195194244385, "loss_lvr": 0.9388350248336792, "loss_mode_switch": 0.0, "loss_total": 1.6713030338287354, "step": 41 }, { "batch_size": 1, "epoch": 0.0164, "step": 41, "tokens_per_device": 5152 }, { "epoch": 0.0164, "loss_ce": 1.8724817037582397, "loss_lvr": 0.9910036325454712, "loss_mode_switch": 0.0, "loss_total": 1.971582055091858, "step": 41 }, { "batch_size": 4, "epoch": 0.0164, "step": 41, "tokens_per_device": 4244 }, { "epoch": 0.0164, "loss_ce": 1.4702916145324707, "loss_lvr": 1.7352566719055176, "loss_mode_switch": 0.0, "loss_total": 1.6438173055648804, "step": 41 }, { "batch_size": 4, "epoch": 0.0164, "step": 41, "tokens_per_device": 2880 }, { "epoch": 0.0164, "loss_ce": 1.6906522512435913, "loss_lvr": 1.2502527236938477, "loss_mode_switch": 0.0, "loss_total": 1.815677523612976, "step": 41 }, { "epoch": 0.0168, "grad_norm": 97.787841796875, "learning_rate": 5.600000000000001e-06, "loss": 1.751, "step": 42 }, { "batch_size": 1, "epoch": 0.0168, "step": 42, "tokens_per_device": 4884 }, { "epoch": 0.0168, "loss_ce": 1.1882988214492798, "loss_lvr": 0.8734080791473389, "loss_mode_switch": 0.0, "loss_total": 1.2756396532058716, "step": 42 }, { "batch_size": 4, "epoch": 0.0168, "step": 42, "tokens_per_device": 1468 }, { "epoch": 0.0168, "loss_ce": 1.7333711385726929, "loss_lvr": 2.094773769378662, "loss_mode_switch": 0.0, "loss_total": 1.942848563194275, "step": 42 }, { "batch_size": 4, "epoch": 0.0168, "step": 42, "tokens_per_device": 3780 }, { "epoch": 0.0168, "loss_ce": 1.7227730751037598, "loss_lvr": 1.9996598958969116, "loss_mode_switch": 0.0, "loss_total": 1.922739028930664, "step": 42 }, { "batch_size": 4, "epoch": 0.0168, "step": 42, "tokens_per_device": 11512 }, { "epoch": 0.0168, "loss_ce": 1.525164246559143, "loss_lvr": 1.51967453956604, "loss_mode_switch": 0.0, "loss_total": 1.6771316528320312, "step": 42 }, { "batch_size": 4, "epoch": 0.0168, "step": 42, "tokens_per_device": 9504 }, { "epoch": 0.0168, "loss_ce": 1.383021593093872, "loss_lvr": 1.4356378316879272, "loss_mode_switch": 0.0, "loss_total": 1.526585340499878, "step": 42 }, { "batch_size": 4, "epoch": 0.0168, "step": 42, "tokens_per_device": 4340 }, { "epoch": 0.0168, "loss_ce": 1.7867099046707153, "loss_lvr": 1.8278720378875732, "loss_mode_switch": 0.0, "loss_total": 1.9694970846176147, "step": 42 }, { "batch_size": 4, "epoch": 0.0168, "step": 42, "tokens_per_device": 2628 }, { "epoch": 0.0168, "loss_ce": 1.6890063285827637, "loss_lvr": 2.053804874420166, "loss_mode_switch": 0.0, "loss_total": 1.8943867683410645, "step": 42 }, { "batch_size": 1, "epoch": 0.0168, "step": 42, "tokens_per_device": 4926 }, { "epoch": 0.0168, "loss_ce": 1.8180493116378784, "loss_lvr": 0.9816591739654541, "loss_mode_switch": 0.0, "loss_total": 1.916215181350708, "step": 42 }, { "epoch": 0.0172, "grad_norm": 23.391321182250977, "learning_rate": 5.733333333333334e-06, "loss": 1.7801, "step": 43 }, { "batch_size": 1, "epoch": 0.0172, "step": 43, "tokens_per_device": 4869 }, { "epoch": 0.0172, "loss_ce": 1.4867981672286987, "loss_lvr": 0.8959618210792542, "loss_mode_switch": 0.0, "loss_total": 1.5763943195343018, "step": 43 }, { "batch_size": 4, "epoch": 0.0172, "step": 43, "tokens_per_device": 1416 }, { "epoch": 0.0172, "loss_ce": 1.2158454656600952, "loss_lvr": 1.8831111192703247, "loss_mode_switch": 0.0, "loss_total": 1.4041565656661987, "step": 43 }, { "batch_size": 1, "epoch": 0.0172, "step": 43, "tokens_per_device": 6378 }, { "epoch": 0.0172, "loss_ce": 1.4691216945648193, "loss_lvr": 1.1719508171081543, "loss_mode_switch": 0.0, "loss_total": 1.5863168239593506, "step": 43 }, { "batch_size": 4, "epoch": 0.0172, "step": 43, "tokens_per_device": 11068 }, { "epoch": 0.0172, "loss_ce": 1.625340223312378, "loss_lvr": 1.7226955890655518, "loss_mode_switch": 0.0, "loss_total": 1.797609806060791, "step": 43 }, { "batch_size": 4, "epoch": 0.0172, "step": 43, "tokens_per_device": 4244 }, { "epoch": 0.0172, "loss_ce": 1.6678632497787476, "loss_lvr": 1.8180721998214722, "loss_mode_switch": 0.0, "loss_total": 1.84967041015625, "step": 43 }, { "batch_size": 4, "epoch": 0.0172, "step": 43, "tokens_per_device": 4264 }, { "epoch": 0.0172, "loss_ce": 1.8329213857650757, "loss_lvr": 1.7874305248260498, "loss_mode_switch": 0.0, "loss_total": 2.011664390563965, "step": 43 }, { "batch_size": 1, "epoch": 0.0172, "step": 43, "tokens_per_device": 5124 }, { "epoch": 0.0172, "loss_ce": 2.039055109024048, "loss_lvr": 1.1150926351547241, "loss_mode_switch": 0.0, "loss_total": 2.150564432144165, "step": 43 }, { "batch_size": 1, "epoch": 0.0172, "step": 43, "tokens_per_device": 5117 }, { "epoch": 0.0172, "loss_ce": 1.1723281145095825, "loss_lvr": 0.8053577542304993, "loss_mode_switch": 0.0, "loss_total": 1.252863883972168, "step": 43 }, { "epoch": 0.0176, "grad_norm": 23.766645431518555, "learning_rate": 5.8666666666666675e-06, "loss": 1.7115, "step": 44 }, { "batch_size": 4, "epoch": 0.0176, "step": 44, "tokens_per_device": 4332 }, { "epoch": 0.0176, "loss_ce": 1.520509958267212, "loss_lvr": 1.7996944189071655, "loss_mode_switch": 0.0, "loss_total": 1.7004793882369995, "step": 44 }, { "batch_size": 1, "epoch": 0.0176, "step": 44, "tokens_per_device": 5688 }, { "epoch": 0.0176, "loss_ce": 1.5251872539520264, "loss_lvr": 1.1150611639022827, "loss_mode_switch": 0.0, "loss_total": 1.6366933584213257, "step": 44 }, { "batch_size": 1, "epoch": 0.0176, "step": 44, "tokens_per_device": 4881 }, { "epoch": 0.0176, "loss_ce": 1.742209553718567, "loss_lvr": 1.3597922325134277, "loss_mode_switch": 0.0, "loss_total": 1.8781887292861938, "step": 44 }, { "batch_size": 1, "epoch": 0.0176, "step": 44, "tokens_per_device": 5160 }, { "epoch": 0.0176, "loss_ce": 1.417630672454834, "loss_lvr": 1.0300902128219604, "loss_mode_switch": 0.0, "loss_total": 1.5206396579742432, "step": 44 }, { "batch_size": 1, "epoch": 0.0176, "step": 44, "tokens_per_device": 5176 }, { "epoch": 0.0176, "loss_ce": 1.3434820175170898, "loss_lvr": 1.1533150672912598, "loss_mode_switch": 0.0, "loss_total": 1.4588135480880737, "step": 44 }, { "batch_size": 1, "epoch": 0.0176, "step": 44, "tokens_per_device": 5074 }, { "epoch": 0.0176, "loss_ce": 1.4411731958389282, "loss_lvr": 1.563925862312317, "loss_mode_switch": 0.0, "loss_total": 1.597565770149231, "step": 44 }, { "batch_size": 4, "epoch": 0.0176, "step": 44, "tokens_per_device": 4264 }, { "epoch": 0.0176, "loss_ce": 1.496000051498413, "loss_lvr": 1.8437763452529907, "loss_mode_switch": 0.0, "loss_total": 1.680377721786499, "step": 44 }, { "batch_size": 4, "epoch": 0.0176, "step": 44, "tokens_per_device": 4748 }, { "epoch": 0.0176, "loss_ce": 1.5996848344802856, "loss_lvr": 2.421462059020996, "loss_mode_switch": 0.0, "loss_total": 1.841831088066101, "step": 44 }, { "epoch": 0.018, "grad_norm": 24.64431381225586, "learning_rate": 6e-06, "loss": 1.7104, "step": 45 }, { "batch_size": 4, "epoch": 0.018, "step": 45, "tokens_per_device": 4020 }, { "epoch": 0.018, "loss_ce": 1.776738166809082, "loss_lvr": 1.6285587549209595, "loss_mode_switch": 0.0, "loss_total": 1.939594030380249, "step": 45 }, { "batch_size": 1, "epoch": 0.018, "step": 45, "tokens_per_device": 4866 }, { "epoch": 0.018, "loss_ce": 1.381868839263916, "loss_lvr": 0.9337281584739685, "loss_mode_switch": 0.0, "loss_total": 1.4752416610717773, "step": 45 }, { "batch_size": 1, "epoch": 0.018, "step": 45, "tokens_per_device": 5027 }, { "epoch": 0.018, "loss_ce": 1.0089325904846191, "loss_lvr": 1.1882340908050537, "loss_mode_switch": 0.0, "loss_total": 1.1277559995651245, "step": 45 }, { "batch_size": 4, "epoch": 0.018, "step": 45, "tokens_per_device": 2660 }, { "epoch": 0.018, "loss_ce": 1.690889596939087, "loss_lvr": 1.6213164329528809, "loss_mode_switch": 0.0, "loss_total": 1.853021264076233, "step": 45 }, { "batch_size": 4, "epoch": 0.018, "step": 45, "tokens_per_device": 6688 }, { "epoch": 0.018, "loss_ce": 1.5181537866592407, "loss_lvr": 1.336254596710205, "loss_mode_switch": 0.0, "loss_total": 1.651779294013977, "step": 45 }, { "batch_size": 4, "epoch": 0.018, "step": 45, "tokens_per_device": 6640 }, { "epoch": 0.018, "loss_ce": 1.630133032798767, "loss_lvr": 1.7778196334838867, "loss_mode_switch": 0.0, "loss_total": 1.8079149723052979, "step": 45 }, { "batch_size": 4, "epoch": 0.018, "step": 45, "tokens_per_device": 7256 }, { "epoch": 0.018, "loss_ce": 1.086815357208252, "loss_lvr": 1.3341307640075684, "loss_mode_switch": 0.0, "loss_total": 1.2202284336090088, "step": 45 }, { "batch_size": 1, "epoch": 0.018, "step": 45, "tokens_per_device": 5115 }, { "epoch": 0.018, "loss_ce": 1.4518460035324097, "loss_lvr": 1.1576861143112183, "loss_mode_switch": 0.0, "loss_total": 1.5676145553588867, "step": 45 }, { "epoch": 0.0184, "grad_norm": 23.460420608520508, "learning_rate": 6.133333333333334e-06, "loss": 1.6893, "step": 46 }, { "batch_size": 1, "epoch": 0.0184, "step": 46, "tokens_per_device": 5084 }, { "epoch": 0.0184, "loss_ce": 1.2713223695755005, "loss_lvr": 2.4812827110290527, "loss_mode_switch": 0.0, "loss_total": 1.5194506645202637, "step": 46 }, { "batch_size": 4, "epoch": 0.0184, "step": 46, "tokens_per_device": 2800 }, { "epoch": 0.0184, "loss_ce": 1.8477671146392822, "loss_lvr": 1.225366473197937, "loss_mode_switch": 0.0, "loss_total": 1.9703037738800049, "step": 46 }, { "batch_size": 4, "epoch": 0.0184, "step": 46, "tokens_per_device": 10096 }, { "epoch": 0.0184, "loss_ce": 1.3332208395004272, "loss_lvr": 1.761406660079956, "loss_mode_switch": 0.0, "loss_total": 1.5093615055084229, "step": 46 }, { "batch_size": 1, "epoch": 0.0184, "step": 46, "tokens_per_device": 5330 }, { "epoch": 0.0184, "loss_ce": 1.6350266933441162, "loss_lvr": 1.160811185836792, "loss_mode_switch": 0.0, "loss_total": 1.7511078119277954, "step": 46 }, { "batch_size": 1, "epoch": 0.0184, "step": 46, "tokens_per_device": 5160 }, { "epoch": 0.0184, "loss_ce": 3.367352247238159, "loss_lvr": 1.3862828016281128, "loss_mode_switch": 0.0, "loss_total": 3.5059804916381836, "step": 46 }, { "batch_size": 1, "epoch": 0.0184, "step": 46, "tokens_per_device": 5115 }, { "epoch": 0.0184, "loss_ce": 1.7108038663864136, "loss_lvr": 0.7506132125854492, "loss_mode_switch": 0.0, "loss_total": 1.7858651876449585, "step": 46 }, { "batch_size": 1, "epoch": 0.0184, "step": 46, "tokens_per_device": 5310 }, { "epoch": 0.0184, "loss_ce": 1.726067066192627, "loss_lvr": 1.1261136531829834, "loss_mode_switch": 0.0, "loss_total": 1.8386784791946411, "step": 46 }, { "batch_size": 4, "epoch": 0.0184, "step": 46, "tokens_per_device": 3856 }, { "epoch": 0.0184, "loss_ce": 1.4719504117965698, "loss_lvr": 1.6864951848983765, "loss_mode_switch": 0.0, "loss_total": 1.6405999660491943, "step": 46 }, { "epoch": 0.0188, "grad_norm": 22.852405548095703, "learning_rate": 6.266666666666668e-06, "loss": 1.6512, "step": 47 }, { "batch_size": 4, "epoch": 0.0188, "step": 47, "tokens_per_device": 6052 }, { "epoch": 0.0188, "loss_ce": 1.207477331161499, "loss_lvr": 1.8922438621520996, "loss_mode_switch": 0.0, "loss_total": 1.396701693534851, "step": 47 }, { "batch_size": 4, "epoch": 0.0188, "step": 47, "tokens_per_device": 9644 }, { "epoch": 0.0188, "loss_ce": 1.3348418474197388, "loss_lvr": 1.7573816776275635, "loss_mode_switch": 0.0, "loss_total": 1.510580062866211, "step": 47 }, { "batch_size": 4, "epoch": 0.0188, "step": 47, "tokens_per_device": 5988 }, { "epoch": 0.0188, "loss_ce": 1.4422461986541748, "loss_lvr": 1.6342048645019531, "loss_mode_switch": 0.0, "loss_total": 1.6056666374206543, "step": 47 }, { "batch_size": 4, "epoch": 0.0188, "step": 47, "tokens_per_device": 7648 }, { "epoch": 0.0188, "loss_ce": 1.4674451351165771, "loss_lvr": 1.6610783338546753, "loss_mode_switch": 0.0, "loss_total": 1.6335530281066895, "step": 47 }, { "batch_size": 4, "epoch": 0.0188, "step": 47, "tokens_per_device": 6528 }, { "epoch": 0.0188, "loss_ce": 1.6710546016693115, "loss_lvr": 1.6293812990188599, "loss_mode_switch": 0.0, "loss_total": 1.8339927196502686, "step": 47 }, { "batch_size": 4, "epoch": 0.0188, "step": 47, "tokens_per_device": 6168 }, { "epoch": 0.0188, "loss_ce": 1.5903133153915405, "loss_lvr": 1.636278510093689, "loss_mode_switch": 0.0, "loss_total": 1.7539411783218384, "step": 47 }, { "batch_size": 4, "epoch": 0.0188, "step": 47, "tokens_per_device": 4904 }, { "epoch": 0.0188, "loss_ce": 1.4744631052017212, "loss_lvr": 1.7795932292938232, "loss_mode_switch": 0.0, "loss_total": 1.6524224281311035, "step": 47 }, { "batch_size": 4, "epoch": 0.0188, "step": 47, "tokens_per_device": 4060 }, { "epoch": 0.0188, "loss_ce": 1.1899619102478027, "loss_lvr": 1.4378446340560913, "loss_mode_switch": 0.0, "loss_total": 1.3337464332580566, "step": 47 }, { "epoch": 0.0192, "grad_norm": 21.87618637084961, "learning_rate": 6.4000000000000006e-06, "loss": 1.6089, "step": 48 }, { "batch_size": 4, "epoch": 0.0192, "step": 48, "tokens_per_device": 4156 }, { "epoch": 0.0192, "loss_ce": 1.730036735534668, "loss_lvr": 1.616005539894104, "loss_mode_switch": 0.0, "loss_total": 1.8916373252868652, "step": 48 }, { "batch_size": 1, "epoch": 0.0192, "step": 48, "tokens_per_device": 5212 }, { "epoch": 0.0192, "loss_ce": 1.252445936203003, "loss_lvr": 1.0922210216522217, "loss_mode_switch": 0.0, "loss_total": 1.3616679906845093, "step": 48 }, { "batch_size": 4, "epoch": 0.0192, "step": 48, "tokens_per_device": 4516 }, { "epoch": 0.0192, "loss_ce": 1.4742847681045532, "loss_lvr": 1.5593297481536865, "loss_mode_switch": 0.0, "loss_total": 1.6302177906036377, "step": 48 }, { "batch_size": 1, "epoch": 0.0192, "step": 48, "tokens_per_device": 5101 }, { "epoch": 0.0192, "loss_ce": 1.4272867441177368, "loss_lvr": 0.9074074029922485, "loss_mode_switch": 0.0, "loss_total": 1.5180275440216064, "step": 48 }, { "batch_size": 4, "epoch": 0.0192, "step": 48, "tokens_per_device": 4384 }, { "epoch": 0.0192, "loss_ce": 1.435335636138916, "loss_lvr": 1.8272453546524048, "loss_mode_switch": 0.0, "loss_total": 1.6180601119995117, "step": 48 }, { "batch_size": 4, "epoch": 0.0192, "step": 48, "tokens_per_device": 10448 }, { "epoch": 0.0192, "loss_ce": 1.5446555614471436, "loss_lvr": 1.6626691818237305, "loss_mode_switch": 0.0, "loss_total": 1.7109224796295166, "step": 48 }, { "batch_size": 1, "epoch": 0.0192, "step": 48, "tokens_per_device": 5131 }, { "epoch": 0.0192, "loss_ce": 1.6658551692962646, "loss_lvr": 1.0251140594482422, "loss_mode_switch": 0.0, "loss_total": 1.7683665752410889, "step": 48 }, { "batch_size": 4, "epoch": 0.0192, "step": 48, "tokens_per_device": 3444 }, { "epoch": 0.0192, "loss_ce": 1.5056082010269165, "loss_lvr": 1.732709527015686, "loss_mode_switch": 0.0, "loss_total": 1.6788791418075562, "step": 48 }, { "epoch": 0.0196, "grad_norm": 21.98090171813965, "learning_rate": 6.533333333333334e-06, "loss": 1.6056, "step": 49 }, { "batch_size": 4, "epoch": 0.0196, "step": 49, "tokens_per_device": 2292 }, { "epoch": 0.0196, "loss_ce": 1.5927302837371826, "loss_lvr": 2.3554844856262207, "loss_mode_switch": 0.0, "loss_total": 1.8282787799835205, "step": 49 }, { "batch_size": 4, "epoch": 0.0196, "step": 49, "tokens_per_device": 1536 }, { "epoch": 0.0196, "loss_ce": 1.5420496463775635, "loss_lvr": 1.9533424377441406, "loss_mode_switch": 0.0, "loss_total": 1.7373838424682617, "step": 49 }, { "batch_size": 4, "epoch": 0.0196, "step": 49, "tokens_per_device": 4508 }, { "epoch": 0.0196, "loss_ce": 1.2434009313583374, "loss_lvr": 1.6575363874435425, "loss_mode_switch": 0.0, "loss_total": 1.4091545343399048, "step": 49 }, { "batch_size": 4, "epoch": 0.0196, "step": 49, "tokens_per_device": 8340 }, { "epoch": 0.0196, "loss_ce": 1.4586536884307861, "loss_lvr": 1.873171329498291, "loss_mode_switch": 0.0, "loss_total": 1.6459708213806152, "step": 49 }, { "batch_size": 4, "epoch": 0.0196, "step": 49, "tokens_per_device": 4240 }, { "epoch": 0.0196, "loss_ce": 1.3994030952453613, "loss_lvr": 1.2998971939086914, "loss_mode_switch": 0.0, "loss_total": 1.5293928384780884, "step": 49 }, { "batch_size": 4, "epoch": 0.0196, "step": 49, "tokens_per_device": 1476 }, { "epoch": 0.0196, "loss_ce": 1.4075312614440918, "loss_lvr": 1.852484107017517, "loss_mode_switch": 0.0, "loss_total": 1.5927796363830566, "step": 49 }, { "batch_size": 4, "epoch": 0.0196, "step": 49, "tokens_per_device": 8392 }, { "epoch": 0.0196, "loss_ce": 1.375338077545166, "loss_lvr": 1.8656866550445557, "loss_mode_switch": 0.0, "loss_total": 1.5619066953659058, "step": 49 }, { "batch_size": 1, "epoch": 0.0196, "step": 49, "tokens_per_device": 4891 }, { "epoch": 0.0196, "loss_ce": 1.469961404800415, "loss_lvr": 0.9754764437675476, "loss_mode_switch": 0.0, "loss_total": 1.5675090551376343, "step": 49 }, { "epoch": 0.02, "grad_norm": 21.72597312927246, "learning_rate": 6.666666666666667e-06, "loss": 1.5457, "step": 50 }, { "batch_size": 4, "epoch": 0.02, "step": 50, "tokens_per_device": 5032 }, { "epoch": 0.02, "loss_ce": 1.4108299016952515, "loss_lvr": 1.392176866531372, "loss_mode_switch": 0.0, "loss_total": 1.5500476360321045, "step": 50 }, { "batch_size": 1, "epoch": 0.02, "step": 50, "tokens_per_device": 4382 }, { "epoch": 0.02, "loss_ce": 1.0059243440628052, "loss_lvr": 1.2276992797851562, "loss_mode_switch": 0.0, "loss_total": 1.1286942958831787, "step": 50 }, { "batch_size": 1, "epoch": 0.02, "step": 50, "tokens_per_device": 4569 }, { "epoch": 0.02, "loss_ce": 1.6148537397384644, "loss_lvr": 1.36441969871521, "loss_mode_switch": 0.0, "loss_total": 1.7512956857681274, "step": 50 }, { "batch_size": 4, "epoch": 0.02, "step": 50, "tokens_per_device": 11828 }, { "epoch": 0.02, "loss_ce": 1.4244776964187622, "loss_lvr": 1.8155912160873413, "loss_mode_switch": 0.0, "loss_total": 1.6060367822647095, "step": 50 }, { "batch_size": 4, "epoch": 0.02, "step": 50, "tokens_per_device": 2764 }, { "epoch": 0.02, "loss_ce": 1.3191657066345215, "loss_lvr": 1.7778538465499878, "loss_mode_switch": 0.0, "loss_total": 1.4969511032104492, "step": 50 }, { "batch_size": 4, "epoch": 0.02, "step": 50, "tokens_per_device": 3412 }, { "epoch": 0.02, "loss_ce": 1.5900267362594604, "loss_lvr": 1.7389148473739624, "loss_mode_switch": 0.0, "loss_total": 1.763918161392212, "step": 50 }, { "batch_size": 1, "epoch": 0.02, "step": 50, "tokens_per_device": 5080 }, { "epoch": 0.02, "loss_ce": 1.7947838306427002, "loss_lvr": 1.0642352104187012, "loss_mode_switch": 0.0, "loss_total": 1.9012073278427124, "step": 50 }, { "batch_size": 1, "epoch": 0.02, "step": 50, "tokens_per_device": 4874 }, { "epoch": 0.02, "loss_ce": 1.3146851062774658, "loss_lvr": 1.0116503238677979, "loss_mode_switch": 0.0, "loss_total": 1.4158501625061035, "step": 50 }, { "epoch": 0.0204, "grad_norm": 22.700136184692383, "learning_rate": 6.800000000000001e-06, "loss": 1.544, "step": 51 }, { "batch_size": 1, "epoch": 0.0204, "step": 51, "tokens_per_device": 4829 }, { "epoch": 0.0204, "loss_ce": 1.3123952150344849, "loss_lvr": 1.2619532346725464, "loss_mode_switch": 0.0, "loss_total": 1.4385905265808105, "step": 51 }, { "batch_size": 4, "epoch": 0.0204, "step": 51, "tokens_per_device": 4416 }, { "epoch": 0.0204, "loss_ce": 1.2837039232254028, "loss_lvr": 2.1631367206573486, "loss_mode_switch": 0.0, "loss_total": 1.5000176429748535, "step": 51 }, { "batch_size": 4, "epoch": 0.0204, "step": 51, "tokens_per_device": 4284 }, { "epoch": 0.0204, "loss_ce": 1.6175729036331177, "loss_lvr": 1.7004666328430176, "loss_mode_switch": 0.0, "loss_total": 1.7876195907592773, "step": 51 }, { "batch_size": 1, "epoch": 0.0204, "step": 51, "tokens_per_device": 5543 }, { "epoch": 0.0204, "loss_ce": 1.279239535331726, "loss_lvr": 0.9865476489067078, "loss_mode_switch": 0.0, "loss_total": 1.3778942823410034, "step": 51 }, { "batch_size": 4, "epoch": 0.0204, "step": 51, "tokens_per_device": 5736 }, { "epoch": 0.0204, "loss_ce": 1.5331017971038818, "loss_lvr": 1.8649039268493652, "loss_mode_switch": 0.0, "loss_total": 1.7195922136306763, "step": 51 }, { "batch_size": 1, "epoch": 0.0204, "step": 51, "tokens_per_device": 5004 }, { "epoch": 0.0204, "loss_ce": 1.3750423192977905, "loss_lvr": 0.7731375098228455, "loss_mode_switch": 0.0, "loss_total": 1.4523561000823975, "step": 51 }, { "batch_size": 4, "epoch": 0.0204, "step": 51, "tokens_per_device": 4552 }, { "epoch": 0.0204, "loss_ce": 1.4572540521621704, "loss_lvr": 2.0350873470306396, "loss_mode_switch": 0.0, "loss_total": 1.6607627868652344, "step": 51 }, { "batch_size": 4, "epoch": 0.0204, "step": 51, "tokens_per_device": 4732 }, { "epoch": 0.0204, "loss_ce": 1.2840029001235962, "loss_lvr": 1.4819456338882446, "loss_mode_switch": 0.0, "loss_total": 1.4321974515914917, "step": 51 }, { "epoch": 0.0208, "grad_norm": 43.19789505004883, "learning_rate": 6.9333333333333344e-06, "loss": 1.5016, "step": 52 }, { "batch_size": 4, "epoch": 0.0208, "step": 52, "tokens_per_device": 4320 }, { "epoch": 0.0208, "loss_ce": 1.0119218826293945, "loss_lvr": 1.8913112878799438, "loss_mode_switch": 0.0, "loss_total": 1.2010530233383179, "step": 52 }, { "batch_size": 4, "epoch": 0.0208, "step": 52, "tokens_per_device": 4220 }, { "epoch": 0.0208, "loss_ce": 1.3537952899932861, "loss_lvr": 1.6785354614257812, "loss_mode_switch": 0.0, "loss_total": 1.52164888381958, "step": 52 }, { "batch_size": 4, "epoch": 0.0208, "step": 52, "tokens_per_device": 4096 }, { "epoch": 0.0208, "loss_ce": 1.3961695432662964, "loss_lvr": 1.563174843788147, "loss_mode_switch": 0.0, "loss_total": 1.5524870157241821, "step": 52 }, { "batch_size": 4, "epoch": 0.0208, "step": 52, "tokens_per_device": 5696 }, { "epoch": 0.0208, "loss_ce": 1.149811863899231, "loss_lvr": 1.5012609958648682, "loss_mode_switch": 0.0, "loss_total": 1.2999379634857178, "step": 52 }, { "batch_size": 4, "epoch": 0.0208, "step": 52, "tokens_per_device": 7088 }, { "epoch": 0.0208, "loss_ce": 1.2859256267547607, "loss_lvr": 1.5549356937408447, "loss_mode_switch": 0.0, "loss_total": 1.441419243812561, "step": 52 }, { "batch_size": 4, "epoch": 0.0208, "step": 52, "tokens_per_device": 5384 }, { "epoch": 0.0208, "loss_ce": 1.4493038654327393, "loss_lvr": 1.4565365314483643, "loss_mode_switch": 0.0, "loss_total": 1.5949574708938599, "step": 52 }, { "batch_size": 4, "epoch": 0.0208, "step": 52, "tokens_per_device": 2668 }, { "epoch": 0.0208, "loss_ce": 1.2314274311065674, "loss_lvr": 1.62380051612854, "loss_mode_switch": 0.0, "loss_total": 1.3938075304031372, "step": 52 }, { "batch_size": 1, "epoch": 0.0208, "step": 52, "tokens_per_device": 4893 }, { "epoch": 0.0208, "loss_ce": 0.8386419415473938, "loss_lvr": 1.101454257965088, "loss_mode_switch": 0.0, "loss_total": 0.9487873911857605, "step": 52 }, { "epoch": 0.0212, "grad_norm": 27.28670310974121, "learning_rate": 7.066666666666667e-06, "loss": 1.4945, "step": 53 }, { "batch_size": 4, "epoch": 0.0212, "step": 53, "tokens_per_device": 4064 }, { "epoch": 0.0212, "loss_ce": 1.343461513519287, "loss_lvr": 1.7477868795394897, "loss_mode_switch": 0.0, "loss_total": 1.518240213394165, "step": 53 }, { "batch_size": 4, "epoch": 0.0212, "step": 53, "tokens_per_device": 4664 }, { "epoch": 0.0212, "loss_ce": 1.3131792545318604, "loss_lvr": 1.6440761089324951, "loss_mode_switch": 0.0, "loss_total": 1.4775868654251099, "step": 53 }, { "batch_size": 4, "epoch": 0.0212, "step": 53, "tokens_per_device": 4664 }, { "epoch": 0.0212, "loss_ce": 1.3015556335449219, "loss_lvr": 3.088730812072754, "loss_mode_switch": 0.0, "loss_total": 1.6104286909103394, "step": 53 }, { "batch_size": 4, "epoch": 0.0212, "step": 53, "tokens_per_device": 4336 }, { "epoch": 0.0212, "loss_ce": 1.0941110849380493, "loss_lvr": 2.0563814640045166, "loss_mode_switch": 0.0, "loss_total": 1.2997492551803589, "step": 53 }, { "batch_size": 4, "epoch": 0.0212, "step": 53, "tokens_per_device": 2676 }, { "epoch": 0.0212, "loss_ce": 1.5230258703231812, "loss_lvr": 1.6376994848251343, "loss_mode_switch": 0.0, "loss_total": 1.6867958307266235, "step": 53 }, { "batch_size": 4, "epoch": 0.0212, "step": 53, "tokens_per_device": 1404 }, { "epoch": 0.0212, "loss_ce": 1.4545468091964722, "loss_lvr": 2.1391382217407227, "loss_mode_switch": 0.0, "loss_total": 1.6684606075286865, "step": 53 }, { "batch_size": 1, "epoch": 0.0212, "step": 53, "tokens_per_device": 6450 }, { "epoch": 0.0212, "loss_ce": 1.486077070236206, "loss_lvr": 1.2343802452087402, "loss_mode_switch": 0.0, "loss_total": 1.6095150709152222, "step": 53 }, { "batch_size": 4, "epoch": 0.0212, "step": 53, "tokens_per_device": 1468 }, { "epoch": 0.0212, "loss_ce": 1.3076210021972656, "loss_lvr": 1.6567614078521729, "loss_mode_switch": 0.0, "loss_total": 1.473297119140625, "step": 53 }, { "epoch": 0.0216, "grad_norm": 22.346843719482422, "learning_rate": 7.2000000000000005e-06, "loss": 1.4585, "step": 54 }, { "batch_size": 4, "epoch": 0.0216, "step": 54, "tokens_per_device": 2708 }, { "epoch": 0.0216, "loss_ce": 1.1393028497695923, "loss_lvr": 1.9483431577682495, "loss_mode_switch": 0.0, "loss_total": 1.334137201309204, "step": 54 }, { "batch_size": 4, "epoch": 0.0216, "step": 54, "tokens_per_device": 4240 }, { "epoch": 0.0216, "loss_ce": 1.3206136226654053, "loss_lvr": 2.0625171661376953, "loss_mode_switch": 0.0, "loss_total": 1.5268653631210327, "step": 54 }, { "batch_size": 4, "epoch": 0.0216, "step": 54, "tokens_per_device": 2620 }, { "epoch": 0.0216, "loss_ce": 1.3250144720077515, "loss_lvr": 1.4688963890075684, "loss_mode_switch": 0.0, "loss_total": 1.4719041585922241, "step": 54 }, { "batch_size": 1, "epoch": 0.0216, "step": 54, "tokens_per_device": 5392 }, { "epoch": 0.0216, "loss_ce": 1.314423680305481, "loss_lvr": 0.8666700720787048, "loss_mode_switch": 0.0, "loss_total": 1.4010907411575317, "step": 54 }, { "batch_size": 4, "epoch": 0.0216, "step": 54, "tokens_per_device": 4252 }, { "epoch": 0.0216, "loss_ce": 1.3104662895202637, "loss_lvr": 2.3465843200683594, "loss_mode_switch": 0.0, "loss_total": 1.5451247692108154, "step": 54 }, { "batch_size": 4, "epoch": 0.0216, "step": 54, "tokens_per_device": 4200 }, { "epoch": 0.0216, "loss_ce": 1.3218189477920532, "loss_lvr": 2.0774753093719482, "loss_mode_switch": 0.0, "loss_total": 1.5295665264129639, "step": 54 }, { "batch_size": 4, "epoch": 0.0216, "step": 54, "tokens_per_device": 4440 }, { "epoch": 0.0216, "loss_ce": 1.1091747283935547, "loss_lvr": 1.6646661758422852, "loss_mode_switch": 0.0, "loss_total": 1.2756413221359253, "step": 54 }, { "batch_size": 4, "epoch": 0.0216, "step": 54, "tokens_per_device": 4248 }, { "epoch": 0.0216, "loss_ce": 1.7826865911483765, "loss_lvr": 1.4026844501495361, "loss_mode_switch": 0.0, "loss_total": 1.92295503616333, "step": 54 }, { "epoch": 0.022, "grad_norm": 21.533422470092773, "learning_rate": 7.333333333333333e-06, "loss": 1.4494, "step": 55 }, { "batch_size": 4, "epoch": 0.022, "step": 55, "tokens_per_device": 4156 }, { "epoch": 0.022, "loss_ce": 1.450365662574768, "loss_lvr": 2.0358076095581055, "loss_mode_switch": 0.0, "loss_total": 1.6539463996887207, "step": 55 }, { "batch_size": 4, "epoch": 0.022, "step": 55, "tokens_per_device": 5148 }, { "epoch": 0.022, "loss_ce": 1.3787635564804077, "loss_lvr": 1.7812044620513916, "loss_mode_switch": 0.0, "loss_total": 1.5568840503692627, "step": 55 }, { "batch_size": 4, "epoch": 0.022, "step": 55, "tokens_per_device": 3600 }, { "epoch": 0.022, "loss_ce": 1.369543194770813, "loss_lvr": 1.9089651107788086, "loss_mode_switch": 0.0, "loss_total": 1.5604397058486938, "step": 55 }, { "batch_size": 1, "epoch": 0.022, "step": 55, "tokens_per_device": 5170 }, { "epoch": 0.022, "loss_ce": 1.0160560607910156, "loss_lvr": 0.7716128826141357, "loss_mode_switch": 0.0, "loss_total": 1.093217372894287, "step": 55 }, { "batch_size": 4, "epoch": 0.022, "step": 55, "tokens_per_device": 3732 }, { "epoch": 0.022, "loss_ce": 1.1019750833511353, "loss_lvr": 1.8920882940292358, "loss_mode_switch": 0.0, "loss_total": 1.2911839485168457, "step": 55 }, { "batch_size": 4, "epoch": 0.022, "step": 55, "tokens_per_device": 5012 }, { "epoch": 0.022, "loss_ce": 1.5173214673995972, "loss_lvr": 1.4969905614852905, "loss_mode_switch": 0.0, "loss_total": 1.667020559310913, "step": 55 }, { "batch_size": 1, "epoch": 0.022, "step": 55, "tokens_per_device": 4886 }, { "epoch": 0.022, "loss_ce": 1.0393867492675781, "loss_lvr": 2.674179792404175, "loss_mode_switch": 0.0, "loss_total": 1.3068047761917114, "step": 55 }, { "batch_size": 4, "epoch": 0.022, "step": 55, "tokens_per_device": 4288 }, { "epoch": 0.022, "loss_ce": 1.1370640993118286, "loss_lvr": 1.8291243314743042, "loss_mode_switch": 0.0, "loss_total": 1.319976568222046, "step": 55 }, { "epoch": 0.0224, "grad_norm": 22.210201263427734, "learning_rate": 7.4666666666666675e-06, "loss": 1.3638, "step": 56 }, { "batch_size": 4, "epoch": 0.0224, "step": 56, "tokens_per_device": 6508 }, { "epoch": 0.0224, "loss_ce": 1.0826373100280762, "loss_lvr": 1.649060845375061, "loss_mode_switch": 0.0, "loss_total": 1.2475433349609375, "step": 56 }, { "batch_size": 1, "epoch": 0.0224, "step": 56, "tokens_per_device": 4971 }, { "epoch": 0.0224, "loss_ce": 0.8542469143867493, "loss_lvr": 1.0613982677459717, "loss_mode_switch": 0.0, "loss_total": 0.9603867530822754, "step": 56 }, { "batch_size": 4, "epoch": 0.0224, "step": 56, "tokens_per_device": 6912 }, { "epoch": 0.0224, "loss_ce": 1.2105382680892944, "loss_lvr": 1.283538818359375, "loss_mode_switch": 0.0, "loss_total": 1.3388921022415161, "step": 56 }, { "batch_size": 4, "epoch": 0.0224, "step": 56, "tokens_per_device": 5008 }, { "epoch": 0.0224, "loss_ce": 1.2939834594726562, "loss_lvr": 1.6547614336013794, "loss_mode_switch": 0.0, "loss_total": 1.4594595432281494, "step": 56 }, { "batch_size": 4, "epoch": 0.0224, "step": 56, "tokens_per_device": 3952 }, { "epoch": 0.0224, "loss_ce": 1.0580430030822754, "loss_lvr": 1.854548454284668, "loss_mode_switch": 0.0, "loss_total": 1.2434978485107422, "step": 56 }, { "batch_size": 1, "epoch": 0.0224, "step": 56, "tokens_per_device": 4877 }, { "epoch": 0.0224, "loss_ce": 0.9493213891983032, "loss_lvr": 1.180424451828003, "loss_mode_switch": 0.0, "loss_total": 1.0673638582229614, "step": 56 }, { "batch_size": 4, "epoch": 0.0224, "step": 56, "tokens_per_device": 4208 }, { "epoch": 0.0224, "loss_ce": 1.2461076974868774, "loss_lvr": 1.8207982778549194, "loss_mode_switch": 0.0, "loss_total": 1.4281874895095825, "step": 56 }, { "batch_size": 4, "epoch": 0.0224, "step": 56, "tokens_per_device": 4232 }, { "epoch": 0.0224, "loss_ce": 1.2170299291610718, "loss_lvr": 2.3982248306274414, "loss_mode_switch": 0.0, "loss_total": 1.4568524360656738, "step": 56 }, { "epoch": 0.0228, "grad_norm": 21.07147979736328, "learning_rate": 7.600000000000001e-06, "loss": 1.3342, "step": 57 }, { "batch_size": 4, "epoch": 0.0228, "step": 57, "tokens_per_device": 6132 }, { "epoch": 0.0228, "loss_ce": 1.2564347982406616, "loss_lvr": 1.621825933456421, "loss_mode_switch": 0.0, "loss_total": 1.4186173677444458, "step": 57 }, { "batch_size": 4, "epoch": 0.0228, "step": 57, "tokens_per_device": 2956 }, { "epoch": 0.0228, "loss_ce": 1.0618722438812256, "loss_lvr": 1.3146767616271973, "loss_mode_switch": 0.0, "loss_total": 1.1933399438858032, "step": 57 }, { "batch_size": 1, "epoch": 0.0228, "step": 57, "tokens_per_device": 4886 }, { "epoch": 0.0228, "loss_ce": 0.7735234498977661, "loss_lvr": 1.2404143810272217, "loss_mode_switch": 0.0, "loss_total": 0.8975648880004883, "step": 57 }, { "batch_size": 4, "epoch": 0.0228, "step": 57, "tokens_per_device": 4412 }, { "epoch": 0.0228, "loss_ce": 1.0153963565826416, "loss_lvr": 1.8476709127426147, "loss_mode_switch": 0.0, "loss_total": 1.20016348361969, "step": 57 }, { "batch_size": 4, "epoch": 0.0228, "step": 57, "tokens_per_device": 4248 }, { "epoch": 0.0228, "loss_ce": 1.1308794021606445, "loss_lvr": 1.399410605430603, "loss_mode_switch": 0.0, "loss_total": 1.2708204984664917, "step": 57 }, { "batch_size": 4, "epoch": 0.0228, "step": 57, "tokens_per_device": 2696 }, { "epoch": 0.0228, "loss_ce": 1.391629934310913, "loss_lvr": 2.5668771266937256, "loss_mode_switch": 0.0, "loss_total": 1.6483176946640015, "step": 57 }, { "batch_size": 1, "epoch": 0.0228, "step": 57, "tokens_per_device": 5078 }, { "epoch": 0.0228, "loss_ce": 0.9941625595092773, "loss_lvr": 2.681762933731079, "loss_mode_switch": 0.0, "loss_total": 1.2623388767242432, "step": 57 }, { "batch_size": 1, "epoch": 0.0228, "step": 57, "tokens_per_device": 4880 }, { "epoch": 0.0228, "loss_ce": 1.0100387334823608, "loss_lvr": 0.8732684254646301, "loss_mode_switch": 0.0, "loss_total": 1.0973656177520752, "step": 57 }, { "epoch": 0.0232, "grad_norm": 21.72284507751465, "learning_rate": 7.733333333333334e-06, "loss": 1.325, "step": 58 }, { "batch_size": 1, "epoch": 0.0232, "step": 58, "tokens_per_device": 5307 }, { "epoch": 0.0232, "loss_ce": 0.83024662733078, "loss_lvr": 1.592086672782898, "loss_mode_switch": 0.0, "loss_total": 0.9894552826881409, "step": 58 }, { "batch_size": 4, "epoch": 0.0232, "step": 58, "tokens_per_device": 9568 }, { "epoch": 0.0232, "loss_ce": 0.9405083060264587, "loss_lvr": 1.1321712732315063, "loss_mode_switch": 0.0, "loss_total": 1.0537254810333252, "step": 58 }, { "batch_size": 4, "epoch": 0.0232, "step": 58, "tokens_per_device": 2772 }, { "epoch": 0.0232, "loss_ce": 0.9482774138450623, "loss_lvr": 1.212404489517212, "loss_mode_switch": 0.0, "loss_total": 1.0695178508758545, "step": 58 }, { "batch_size": 4, "epoch": 0.0232, "step": 58, "tokens_per_device": 4776 }, { "epoch": 0.0232, "loss_ce": 1.157220482826233, "loss_lvr": 1.7101662158966064, "loss_mode_switch": 0.0, "loss_total": 1.3282370567321777, "step": 58 }, { "batch_size": 4, "epoch": 0.0232, "step": 58, "tokens_per_device": 2628 }, { "epoch": 0.0232, "loss_ce": 1.1845389604568481, "loss_lvr": 1.7432444095611572, "loss_mode_switch": 0.0, "loss_total": 1.358863353729248, "step": 58 }, { "batch_size": 4, "epoch": 0.0232, "step": 58, "tokens_per_device": 3660 }, { "epoch": 0.0232, "loss_ce": 1.0639774799346924, "loss_lvr": 1.7076964378356934, "loss_mode_switch": 0.0, "loss_total": 1.2347471714019775, "step": 58 }, { "batch_size": 4, "epoch": 0.0232, "step": 58, "tokens_per_device": 4596 }, { "epoch": 0.0232, "loss_ce": 1.1563746929168701, "loss_lvr": 1.9057422876358032, "loss_mode_switch": 0.0, "loss_total": 1.3469488620758057, "step": 58 }, { "batch_size": 1, "epoch": 0.0232, "step": 58, "tokens_per_device": 4619 }, { "epoch": 0.0232, "loss_ce": 1.065476655960083, "loss_lvr": 1.0880900621414185, "loss_mode_switch": 0.0, "loss_total": 1.174285650253296, "step": 58 }, { "epoch": 0.0236, "grad_norm": 20.414928436279297, "learning_rate": 7.866666666666667e-06, "loss": 1.2273, "step": 59 }, { "batch_size": 4, "epoch": 0.0236, "step": 59, "tokens_per_device": 4552 }, { "epoch": 0.0236, "loss_ce": 1.213344931602478, "loss_lvr": 1.8362431526184082, "loss_mode_switch": 0.0, "loss_total": 1.396969199180603, "step": 59 }, { "batch_size": 4, "epoch": 0.0236, "step": 59, "tokens_per_device": 5724 }, { "epoch": 0.0236, "loss_ce": 0.9024385213851929, "loss_lvr": 1.5141962766647339, "loss_mode_switch": 0.0, "loss_total": 1.0538581609725952, "step": 59 }, { "batch_size": 4, "epoch": 0.0236, "step": 59, "tokens_per_device": 7320 }, { "epoch": 0.0236, "loss_ce": 0.8120731711387634, "loss_lvr": 0.9566336870193481, "loss_mode_switch": 0.0, "loss_total": 0.9077365398406982, "step": 59 }, { "batch_size": 1, "epoch": 0.0236, "step": 59, "tokens_per_device": 5035 }, { "epoch": 0.0236, "loss_ce": 1.095819354057312, "loss_lvr": 0.7260469198226929, "loss_mode_switch": 0.0, "loss_total": 1.1684240102767944, "step": 59 }, { "batch_size": 4, "epoch": 0.0236, "step": 59, "tokens_per_device": 2708 }, { "epoch": 0.0236, "loss_ce": 1.1967183351516724, "loss_lvr": 1.7064555883407593, "loss_mode_switch": 0.0, "loss_total": 1.3673639297485352, "step": 59 }, { "batch_size": 1, "epoch": 0.0236, "step": 59, "tokens_per_device": 4865 }, { "epoch": 0.0236, "loss_ce": 0.9672233462333679, "loss_lvr": 0.5166878700256348, "loss_mode_switch": 0.0, "loss_total": 1.0188921689987183, "step": 59 }, { "batch_size": 4, "epoch": 0.0236, "step": 59, "tokens_per_device": 4340 }, { "epoch": 0.0236, "loss_ce": 1.0042611360549927, "loss_lvr": 2.0970075130462646, "loss_mode_switch": 0.0, "loss_total": 1.2139618396759033, "step": 59 }, { "batch_size": 4, "epoch": 0.0236, "step": 59, "tokens_per_device": 3724 }, { "epoch": 0.0236, "loss_ce": 1.0801711082458496, "loss_lvr": 2.026407241821289, "loss_mode_switch": 0.0, "loss_total": 1.2828118801116943, "step": 59 }, { "epoch": 0.024, "grad_norm": 20.655323028564453, "learning_rate": 8.000000000000001e-06, "loss": 1.2111, "step": 60 }, { "batch_size": 4, "epoch": 0.024, "step": 60, "tokens_per_device": 10296 }, { "epoch": 0.024, "loss_ce": 1.0499979257583618, "loss_lvr": 1.516037106513977, "loss_mode_switch": 0.0, "loss_total": 1.2016016244888306, "step": 60 }, { "batch_size": 1, "epoch": 0.024, "step": 60, "tokens_per_device": 5019 }, { "epoch": 0.024, "loss_ce": 1.1388580799102783, "loss_lvr": 1.3768764734268188, "loss_mode_switch": 0.0, "loss_total": 1.276545763015747, "step": 60 }, { "batch_size": 1, "epoch": 0.024, "step": 60, "tokens_per_device": 5149 }, { "epoch": 0.024, "loss_ce": 0.7615029215812683, "loss_lvr": 1.0921212434768677, "loss_mode_switch": 0.0, "loss_total": 0.8707150220870972, "step": 60 }, { "batch_size": 4, "epoch": 0.024, "step": 60, "tokens_per_device": 4252 }, { "epoch": 0.024, "loss_ce": 1.1756778955459595, "loss_lvr": 1.6711105108261108, "loss_mode_switch": 0.0, "loss_total": 1.3427889347076416, "step": 60 }, { "batch_size": 4, "epoch": 0.024, "step": 60, "tokens_per_device": 5588 }, { "epoch": 0.024, "loss_ce": 1.0711334943771362, "loss_lvr": 1.6292294263839722, "loss_mode_switch": 0.0, "loss_total": 1.2340564727783203, "step": 60 }, { "batch_size": 4, "epoch": 0.024, "step": 60, "tokens_per_device": 14540 }, { "epoch": 0.024, "loss_ce": 1.0344290733337402, "loss_lvr": 1.6717318296432495, "loss_mode_switch": 0.0, "loss_total": 1.2016022205352783, "step": 60 }, { "batch_size": 1, "epoch": 0.024, "step": 60, "tokens_per_device": 5028 }, { "epoch": 0.024, "loss_ce": 0.7297084927558899, "loss_lvr": 0.8773434162139893, "loss_mode_switch": 0.0, "loss_total": 0.8174428343772888, "step": 60 }, { "batch_size": 1, "epoch": 0.024, "step": 60, "tokens_per_device": 4943 }, { "epoch": 0.024, "loss_ce": 0.9857078790664673, "loss_lvr": 1.4005110263824463, "loss_mode_switch": 0.0, "loss_total": 1.1257590055465698, "step": 60 }, { "epoch": 0.0244, "grad_norm": 20.192827224731445, "learning_rate": 8.133333333333334e-06, "loss": 1.2066, "step": 61 }, { "batch_size": 4, "epoch": 0.0244, "step": 61, "tokens_per_device": 3876 }, { "epoch": 0.0244, "loss_ce": 0.8690129518508911, "loss_lvr": 1.9637176990509033, "loss_mode_switch": 0.0, "loss_total": 1.0653847455978394, "step": 61 }, { "batch_size": 4, "epoch": 0.0244, "step": 61, "tokens_per_device": 5988 }, { "epoch": 0.0244, "loss_ce": 0.8895124197006226, "loss_lvr": 1.570312738418579, "loss_mode_switch": 0.0, "loss_total": 1.0465437173843384, "step": 61 }, { "batch_size": 1, "epoch": 0.0244, "step": 61, "tokens_per_device": 5157 }, { "epoch": 0.0244, "loss_ce": 0.7114503383636475, "loss_lvr": 1.339120864868164, "loss_mode_switch": 0.0, "loss_total": 0.8453624248504639, "step": 61 }, { "batch_size": 1, "epoch": 0.0244, "step": 61, "tokens_per_device": 5123 }, { "epoch": 0.0244, "loss_ce": 1.041176676750183, "loss_lvr": 0.9514181017875671, "loss_mode_switch": 0.0, "loss_total": 1.1363184452056885, "step": 61 }, { "batch_size": 4, "epoch": 0.0244, "step": 61, "tokens_per_device": 3392 }, { "epoch": 0.0244, "loss_ce": 1.1837654113769531, "loss_lvr": 2.1123733520507812, "loss_mode_switch": 0.0, "loss_total": 1.3950027227401733, "step": 61 }, { "batch_size": 1, "epoch": 0.0244, "step": 61, "tokens_per_device": 4758 }, { "epoch": 0.0244, "loss_ce": 0.6882122755050659, "loss_lvr": 1.643426537513733, "loss_mode_switch": 0.0, "loss_total": 0.8525549173355103, "step": 61 }, { "batch_size": 4, "epoch": 0.0244, "step": 61, "tokens_per_device": 4248 }, { "epoch": 0.0244, "loss_ce": 1.1723051071166992, "loss_lvr": 1.8268734216690063, "loss_mode_switch": 0.0, "loss_total": 1.354992389678955, "step": 61 }, { "batch_size": 4, "epoch": 0.0244, "step": 61, "tokens_per_device": 4516 }, { "epoch": 0.0244, "loss_ce": 0.8725106716156006, "loss_lvr": 1.9019005298614502, "loss_mode_switch": 0.0, "loss_total": 1.0627007484436035, "step": 61 }, { "epoch": 0.0248, "grad_norm": 18.778369903564453, "learning_rate": 8.266666666666667e-06, "loss": 1.1789, "step": 62 }, { "batch_size": 4, "epoch": 0.0248, "step": 62, "tokens_per_device": 1716 }, { "epoch": 0.0248, "loss_ce": 0.9276713728904724, "loss_lvr": 1.8059648275375366, "loss_mode_switch": 0.0, "loss_total": 1.108267903327942, "step": 62 }, { "batch_size": 4, "epoch": 0.0248, "step": 62, "tokens_per_device": 6228 }, { "epoch": 0.0248, "loss_ce": 1.0363272428512573, "loss_lvr": 1.438913106918335, "loss_mode_switch": 0.0, "loss_total": 1.1802185773849487, "step": 62 }, { "batch_size": 4, "epoch": 0.0248, "step": 62, "tokens_per_device": 4248 }, { "epoch": 0.0248, "loss_ce": 0.9081774950027466, "loss_lvr": 2.023895025253296, "loss_mode_switch": 0.0, "loss_total": 1.1105669736862183, "step": 62 }, { "batch_size": 4, "epoch": 0.0248, "step": 62, "tokens_per_device": 3936 }, { "epoch": 0.0248, "loss_ce": 0.9675883054733276, "loss_lvr": 2.673586845397949, "loss_mode_switch": 0.0, "loss_total": 1.2349469661712646, "step": 62 }, { "batch_size": 1, "epoch": 0.0248, "step": 62, "tokens_per_device": 4917 }, { "epoch": 0.0248, "loss_ce": 1.164612054824829, "loss_lvr": 1.6966792345046997, "loss_mode_switch": 0.0, "loss_total": 1.334280014038086, "step": 62 }, { "batch_size": 4, "epoch": 0.0248, "step": 62, "tokens_per_device": 4980 }, { "epoch": 0.0248, "loss_ce": 0.8845756649971008, "loss_lvr": 1.8853036165237427, "loss_mode_switch": 0.0, "loss_total": 1.073106050491333, "step": 62 }, { "batch_size": 4, "epoch": 0.0248, "step": 62, "tokens_per_device": 10328 }, { "epoch": 0.0248, "loss_ce": 1.1297686100006104, "loss_lvr": 1.3762006759643555, "loss_mode_switch": 0.0, "loss_total": 1.2673887014389038, "step": 62 }, { "batch_size": 4, "epoch": 0.0248, "step": 62, "tokens_per_device": 3804 }, { "epoch": 0.0248, "loss_ce": 1.0322222709655762, "loss_lvr": 1.9257291555404663, "loss_mode_switch": 0.0, "loss_total": 1.2247952222824097, "step": 62 }, { "epoch": 0.0252, "grad_norm": 18.891918182373047, "learning_rate": 8.400000000000001e-06, "loss": 1.1464, "step": 63 }, { "batch_size": 1, "epoch": 0.0252, "step": 63, "tokens_per_device": 4764 }, { "epoch": 0.0252, "loss_ce": 0.4112054705619812, "loss_lvr": 1.1849348545074463, "loss_mode_switch": 0.0, "loss_total": 0.5296989679336548, "step": 63 }, { "batch_size": 4, "epoch": 0.0252, "step": 63, "tokens_per_device": 2472 }, { "epoch": 0.0252, "loss_ce": 0.9209081530570984, "loss_lvr": 1.569061040878296, "loss_mode_switch": 0.0, "loss_total": 1.0778142213821411, "step": 63 }, { "batch_size": 1, "epoch": 0.0252, "step": 63, "tokens_per_device": 4889 }, { "epoch": 0.0252, "loss_ce": 0.8251134157180786, "loss_lvr": 1.0559158325195312, "loss_mode_switch": 0.0, "loss_total": 0.9307050108909607, "step": 63 }, { "batch_size": 4, "epoch": 0.0252, "step": 63, "tokens_per_device": 1548 }, { "epoch": 0.0252, "loss_ce": 0.747048020362854, "loss_lvr": 1.9538631439208984, "loss_mode_switch": 0.0, "loss_total": 0.9424343109130859, "step": 63 }, { "batch_size": 4, "epoch": 0.0252, "step": 63, "tokens_per_device": 4132 }, { "epoch": 0.0252, "loss_ce": 1.3041024208068848, "loss_lvr": 1.617425799369812, "loss_mode_switch": 0.0, "loss_total": 1.465844988822937, "step": 63 }, { "batch_size": 4, "epoch": 0.0252, "step": 63, "tokens_per_device": 4196 }, { "epoch": 0.0252, "loss_ce": 1.325853705406189, "loss_lvr": 1.485686182975769, "loss_mode_switch": 0.0, "loss_total": 1.4744223356246948, "step": 63 }, { "batch_size": 4, "epoch": 0.0252, "step": 63, "tokens_per_device": 4204 }, { "epoch": 0.0252, "loss_ce": 0.9931097030639648, "loss_lvr": 1.8501797914505005, "loss_mode_switch": 0.0, "loss_total": 1.178127646446228, "step": 63 }, { "batch_size": 4, "epoch": 0.0252, "step": 63, "tokens_per_device": 4340 }, { "epoch": 0.0252, "loss_ce": 1.006190538406372, "loss_lvr": 1.469705581665039, "loss_mode_switch": 0.0, "loss_total": 1.1531610488891602, "step": 63 }, { "epoch": 0.0256, "grad_norm": 16.829559326171875, "learning_rate": 8.533333333333335e-06, "loss": 1.0846, "step": 64 }, { "batch_size": 4, "epoch": 0.0256, "step": 64, "tokens_per_device": 3924 }, { "epoch": 0.0256, "loss_ce": 0.9396167397499084, "loss_lvr": 1.872664451599121, "loss_mode_switch": 0.0, "loss_total": 1.1268831491470337, "step": 64 }, { "batch_size": 4, "epoch": 0.0256, "step": 64, "tokens_per_device": 1464 }, { "epoch": 0.0256, "loss_ce": 0.8505905270576477, "loss_lvr": 1.8672804832458496, "loss_mode_switch": 0.0, "loss_total": 1.0373185873031616, "step": 64 }, { "batch_size": 1, "epoch": 0.0256, "step": 64, "tokens_per_device": 5617 }, { "epoch": 0.0256, "loss_ce": 0.4582458436489105, "loss_lvr": 1.7200223207473755, "loss_mode_switch": 0.0, "loss_total": 0.6302480697631836, "step": 64 }, { "batch_size": 1, "epoch": 0.0256, "step": 64, "tokens_per_device": 5380 }, { "epoch": 0.0256, "loss_ce": 0.6492959856987, "loss_lvr": 1.230745553970337, "loss_mode_switch": 0.0, "loss_total": 0.7723705172538757, "step": 64 }, { "batch_size": 4, "epoch": 0.0256, "step": 64, "tokens_per_device": 4816 }, { "epoch": 0.0256, "loss_ce": 0.950670063495636, "loss_lvr": 1.762485146522522, "loss_mode_switch": 0.0, "loss_total": 1.1269185543060303, "step": 64 }, { "batch_size": 4, "epoch": 0.0256, "step": 64, "tokens_per_device": 5312 }, { "epoch": 0.0256, "loss_ce": 1.1078970432281494, "loss_lvr": 2.0417542457580566, "loss_mode_switch": 0.0, "loss_total": 1.312072515487671, "step": 64 }, { "batch_size": 4, "epoch": 0.0256, "step": 64, "tokens_per_device": 10976 }, { "epoch": 0.0256, "loss_ce": 0.9229910373687744, "loss_lvr": 2.436825752258301, "loss_mode_switch": 0.0, "loss_total": 1.1666736602783203, "step": 64 }, { "batch_size": 4, "epoch": 0.0256, "step": 64, "tokens_per_device": 4288 }, { "epoch": 0.0256, "loss_ce": 1.0187227725982666, "loss_lvr": 1.8686952590942383, "loss_mode_switch": 0.0, "loss_total": 1.2055922746658325, "step": 64 }, { "epoch": 0.026, "grad_norm": 16.400127410888672, "learning_rate": 8.666666666666668e-06, "loss": 1.0729, "step": 65 }, { "batch_size": 4, "epoch": 0.026, "step": 65, "tokens_per_device": 4224 }, { "epoch": 0.026, "loss_ce": 0.9075149893760681, "loss_lvr": 1.3971092700958252, "loss_mode_switch": 0.0, "loss_total": 1.0472259521484375, "step": 65 }, { "batch_size": 4, "epoch": 0.026, "step": 65, "tokens_per_device": 4012 }, { "epoch": 0.026, "loss_ce": 0.8514159917831421, "loss_lvr": 1.8817251920700073, "loss_mode_switch": 0.0, "loss_total": 1.039588451385498, "step": 65 }, { "batch_size": 4, "epoch": 0.026, "step": 65, "tokens_per_device": 6588 }, { "epoch": 0.026, "loss_ce": 0.9250572919845581, "loss_lvr": 1.4371471405029297, "loss_mode_switch": 0.0, "loss_total": 1.0687719583511353, "step": 65 }, { "batch_size": 1, "epoch": 0.026, "step": 65, "tokens_per_device": 4856 }, { "epoch": 0.026, "loss_ce": 0.7531739473342896, "loss_lvr": 1.0116031169891357, "loss_mode_switch": 0.0, "loss_total": 0.8543342351913452, "step": 65 }, { "batch_size": 4, "epoch": 0.026, "step": 65, "tokens_per_device": 4924 }, { "epoch": 0.026, "loss_ce": 0.9684096574783325, "loss_lvr": 1.550153374671936, "loss_mode_switch": 0.0, "loss_total": 1.123425006866455, "step": 65 }, { "batch_size": 4, "epoch": 0.026, "step": 65, "tokens_per_device": 2708 }, { "epoch": 0.026, "loss_ce": 0.9348480105400085, "loss_lvr": 1.5201354026794434, "loss_mode_switch": 0.0, "loss_total": 1.0868616104125977, "step": 65 }, { "batch_size": 4, "epoch": 0.026, "step": 65, "tokens_per_device": 15424 }, { "epoch": 0.026, "loss_ce": 0.9006461501121521, "loss_lvr": 0.9629846811294556, "loss_mode_switch": 0.0, "loss_total": 0.9969446063041687, "step": 65 }, { "batch_size": 4, "epoch": 0.026, "step": 65, "tokens_per_device": 3872 }, { "epoch": 0.026, "loss_ce": 0.9437369704246521, "loss_lvr": 1.75492525100708, "loss_mode_switch": 0.0, "loss_total": 1.1192295551300049, "step": 65 }, { "epoch": 0.0264, "grad_norm": 13.947864532470703, "learning_rate": 8.8e-06, "loss": 1.0083, "step": 66 }, { "batch_size": 1, "epoch": 0.0264, "step": 66, "tokens_per_device": 5127 }, { "epoch": 0.0264, "loss_ce": 0.7036693096160889, "loss_lvr": 1.3532606363296509, "loss_mode_switch": 0.0, "loss_total": 0.8389953970909119, "step": 66 }, { "batch_size": 4, "epoch": 0.0264, "step": 66, "tokens_per_device": 4888 }, { "epoch": 0.0264, "loss_ce": 0.7810543775558472, "loss_lvr": 1.7528084516525269, "loss_mode_switch": 0.0, "loss_total": 0.9563352465629578, "step": 66 }, { "batch_size": 4, "epoch": 0.0264, "step": 66, "tokens_per_device": 12316 }, { "epoch": 0.0264, "loss_ce": 0.783883273601532, "loss_lvr": 1.5354655981063843, "loss_mode_switch": 0.0, "loss_total": 0.9374298453330994, "step": 66 }, { "batch_size": 4, "epoch": 0.0264, "step": 66, "tokens_per_device": 2456 }, { "epoch": 0.0264, "loss_ce": 0.8836303353309631, "loss_lvr": 1.867322325706482, "loss_mode_switch": 0.0, "loss_total": 1.0703625679016113, "step": 66 }, { "batch_size": 1, "epoch": 0.0264, "step": 66, "tokens_per_device": 4886 }, { "epoch": 0.0264, "loss_ce": 0.5439304113388062, "loss_lvr": 1.1345467567443848, "loss_mode_switch": 0.0, "loss_total": 0.6573851108551025, "step": 66 }, { "batch_size": 4, "epoch": 0.0264, "step": 66, "tokens_per_device": 4220 }, { "epoch": 0.0264, "loss_ce": 0.7872119545936584, "loss_lvr": 1.5451768636703491, "loss_mode_switch": 0.0, "loss_total": 0.9417296648025513, "step": 66 }, { "batch_size": 4, "epoch": 0.0264, "step": 66, "tokens_per_device": 5832 }, { "epoch": 0.0264, "loss_ce": 0.9541775584220886, "loss_lvr": 1.6325751543045044, "loss_mode_switch": 0.0, "loss_total": 1.117435097694397, "step": 66 }, { "batch_size": 4, "epoch": 0.0264, "step": 66, "tokens_per_device": 6036 }, { "epoch": 0.0264, "loss_ce": 0.9392501711845398, "loss_lvr": 1.5427484512329102, "loss_mode_switch": 0.0, "loss_total": 1.0935250520706177, "step": 66 }, { "epoch": 0.0268, "grad_norm": 13.166130065917969, "learning_rate": 8.933333333333333e-06, "loss": 0.9791, "step": 67 }, { "batch_size": 4, "epoch": 0.0268, "step": 67, "tokens_per_device": 1464 }, { "epoch": 0.0268, "loss_ce": 0.6317563652992249, "loss_lvr": 2.819585084915161, "loss_mode_switch": 0.0, "loss_total": 0.9137148857116699, "step": 67 }, { "batch_size": 4, "epoch": 0.0268, "step": 67, "tokens_per_device": 4968 }, { "epoch": 0.0268, "loss_ce": 0.6229236721992493, "loss_lvr": 2.957627058029175, "loss_mode_switch": 0.0, "loss_total": 0.9186863899230957, "step": 67 }, { "batch_size": 4, "epoch": 0.0268, "step": 67, "tokens_per_device": 4200 }, { "epoch": 0.0268, "loss_ce": 0.5964041352272034, "loss_lvr": 2.9076077938079834, "loss_mode_switch": 0.0, "loss_total": 0.8871649503707886, "step": 67 }, { "batch_size": 4, "epoch": 0.0268, "step": 67, "tokens_per_device": 1212 }, { "epoch": 0.0268, "loss_ce": 0.7940017580986023, "loss_lvr": 2.87727952003479, "loss_mode_switch": 0.0, "loss_total": 1.0817296504974365, "step": 67 }, { "batch_size": 4, "epoch": 0.0268, "step": 67, "tokens_per_device": 8132 }, { "epoch": 0.0268, "loss_ce": 0.8082689046859741, "loss_lvr": 1.6821298599243164, "loss_mode_switch": 0.0, "loss_total": 0.9764819145202637, "step": 67 }, { "batch_size": 1, "epoch": 0.0268, "step": 67, "tokens_per_device": 4900 }, { "epoch": 0.0268, "loss_ce": 0.7988808751106262, "loss_lvr": 2.739151954650879, "loss_mode_switch": 0.0, "loss_total": 1.072796106338501, "step": 67 }, { "batch_size": 4, "epoch": 0.0268, "step": 67, "tokens_per_device": 5420 }, { "epoch": 0.0268, "loss_ce": 0.8055320382118225, "loss_lvr": 2.4534709453582764, "loss_mode_switch": 0.0, "loss_total": 1.0508791208267212, "step": 67 }, { "batch_size": 4, "epoch": 0.0268, "step": 67, "tokens_per_device": 4424 }, { "epoch": 0.0268, "loss_ce": 0.6482744216918945, "loss_lvr": 2.269615888595581, "loss_mode_switch": 0.0, "loss_total": 0.8752360343933105, "step": 67 }, { "epoch": 0.0272, "grad_norm": 12.824060440063477, "learning_rate": 9.066666666666667e-06, "loss": 0.9449, "step": 68 }, { "batch_size": 1, "epoch": 0.0272, "step": 68, "tokens_per_device": 5024 }, { "epoch": 0.0272, "loss_ce": 1.0149017572402954, "loss_lvr": 0.9158696532249451, "loss_mode_switch": 0.0, "loss_total": 1.1064887046813965, "step": 68 }, { "batch_size": 4, "epoch": 0.0272, "step": 68, "tokens_per_device": 2652 }, { "epoch": 0.0272, "loss_ce": 0.9914905428886414, "loss_lvr": 1.4661859273910522, "loss_mode_switch": 0.0, "loss_total": 1.1381090879440308, "step": 68 }, { "batch_size": 4, "epoch": 0.0272, "step": 68, "tokens_per_device": 4168 }, { "epoch": 0.0272, "loss_ce": 0.8183131814002991, "loss_lvr": 1.6689826250076294, "loss_mode_switch": 0.0, "loss_total": 0.9852114319801331, "step": 68 }, { "batch_size": 1, "epoch": 0.0272, "step": 68, "tokens_per_device": 4901 }, { "epoch": 0.0272, "loss_ce": 0.6429346203804016, "loss_lvr": 0.6569076180458069, "loss_mode_switch": 0.0, "loss_total": 0.7086253762245178, "step": 68 }, { "batch_size": 4, "epoch": 0.0272, "step": 68, "tokens_per_device": 2712 }, { "epoch": 0.0272, "loss_ce": 0.8168038725852966, "loss_lvr": 1.7437546253204346, "loss_mode_switch": 0.0, "loss_total": 0.991179347038269, "step": 68 }, { "batch_size": 4, "epoch": 0.0272, "step": 68, "tokens_per_device": 2604 }, { "epoch": 0.0272, "loss_ce": 0.6468024849891663, "loss_lvr": 1.750339150428772, "loss_mode_switch": 0.0, "loss_total": 0.8218364119529724, "step": 68 }, { "batch_size": 4, "epoch": 0.0272, "step": 68, "tokens_per_device": 2660 }, { "epoch": 0.0272, "loss_ce": 0.8708071112632751, "loss_lvr": 1.405153751373291, "loss_mode_switch": 0.0, "loss_total": 1.0113224983215332, "step": 68 }, { "batch_size": 1, "epoch": 0.0272, "step": 68, "tokens_per_device": 4882 }, { "epoch": 0.0272, "loss_ce": 0.8258808851242065, "loss_lvr": 1.190049171447754, "loss_mode_switch": 0.0, "loss_total": 0.944885790348053, "step": 68 }, { "epoch": 0.0276, "grad_norm": 10.652225494384766, "learning_rate": 9.200000000000002e-06, "loss": 0.9187, "step": 69 }, { "batch_size": 4, "epoch": 0.0276, "step": 69, "tokens_per_device": 4408 }, { "epoch": 0.0276, "loss_ce": 0.7082288265228271, "loss_lvr": 1.4633253812789917, "loss_mode_switch": 0.0, "loss_total": 0.8545613884925842, "step": 69 }, { "batch_size": 4, "epoch": 0.0276, "step": 69, "tokens_per_device": 1336 }, { "epoch": 0.0276, "loss_ce": 0.6941829919815063, "loss_lvr": 1.644493818283081, "loss_mode_switch": 0.0, "loss_total": 0.8586323857307434, "step": 69 }, { "batch_size": 1, "epoch": 0.0276, "step": 69, "tokens_per_device": 6053 }, { "epoch": 0.0276, "loss_ce": 0.7079319357872009, "loss_lvr": 1.117165446281433, "loss_mode_switch": 0.0, "loss_total": 0.8196485042572021, "step": 69 }, { "batch_size": 4, "epoch": 0.0276, "step": 69, "tokens_per_device": 4192 }, { "epoch": 0.0276, "loss_ce": 0.894482433795929, "loss_lvr": 1.8732173442840576, "loss_mode_switch": 0.0, "loss_total": 1.0818041563034058, "step": 69 }, { "batch_size": 4, "epoch": 0.0276, "step": 69, "tokens_per_device": 4088 }, { "epoch": 0.0276, "loss_ce": 0.9674355983734131, "loss_lvr": 2.4381513595581055, "loss_mode_switch": 0.0, "loss_total": 1.2112507820129395, "step": 69 }, { "batch_size": 4, "epoch": 0.0276, "step": 69, "tokens_per_device": 2704 }, { "epoch": 0.0276, "loss_ce": 0.7127658128738403, "loss_lvr": 1.3676140308380127, "loss_mode_switch": 0.0, "loss_total": 0.8495272397994995, "step": 69 }, { "batch_size": 4, "epoch": 0.0276, "step": 69, "tokens_per_device": 4304 }, { "epoch": 0.0276, "loss_ce": 0.680328369140625, "loss_lvr": 2.3908846378326416, "loss_mode_switch": 0.0, "loss_total": 0.9194168448448181, "step": 69 }, { "batch_size": 1, "epoch": 0.0276, "step": 69, "tokens_per_device": 5036 }, { "epoch": 0.0276, "loss_ce": 0.41423293948173523, "loss_lvr": 0.9005661606788635, "loss_mode_switch": 0.0, "loss_total": 0.5042895674705505, "step": 69 }, { "epoch": 0.028, "grad_norm": 9.33300495147705, "learning_rate": 9.333333333333334e-06, "loss": 0.911, "step": 70 }, { "batch_size": 1, "epoch": 0.028, "step": 70, "tokens_per_device": 4881 }, { "epoch": 0.028, "loss_ce": 0.744406521320343, "loss_lvr": 0.9834572076797485, "loss_mode_switch": 0.0, "loss_total": 0.84275221824646, "step": 70 }, { "batch_size": 4, "epoch": 0.028, "step": 70, "tokens_per_device": 3324 }, { "epoch": 0.028, "loss_ce": 0.5544340014457703, "loss_lvr": 2.5257132053375244, "loss_mode_switch": 0.0, "loss_total": 0.8070052862167358, "step": 70 }, { "batch_size": 4, "epoch": 0.028, "step": 70, "tokens_per_device": 5424 }, { "epoch": 0.028, "loss_ce": 1.250048041343689, "loss_lvr": 1.843143105506897, "loss_mode_switch": 0.0, "loss_total": 1.4343624114990234, "step": 70 }, { "batch_size": 1, "epoch": 0.028, "step": 70, "tokens_per_device": 5117 }, { "epoch": 0.028, "loss_ce": 0.3870488405227661, "loss_lvr": 1.009551763534546, "loss_mode_switch": 0.0, "loss_total": 0.48800402879714966, "step": 70 }, { "batch_size": 4, "epoch": 0.028, "step": 70, "tokens_per_device": 5844 }, { "epoch": 0.028, "loss_ce": 0.6164553761482239, "loss_lvr": 3.484656572341919, "loss_mode_switch": 0.0, "loss_total": 0.9649209976196289, "step": 70 }, { "batch_size": 1, "epoch": 0.028, "step": 70, "tokens_per_device": 5196 }, { "epoch": 0.028, "loss_ce": 0.5114035606384277, "loss_lvr": 1.376957654953003, "loss_mode_switch": 0.0, "loss_total": 0.6490993499755859, "step": 70 }, { "batch_size": 4, "epoch": 0.028, "step": 70, "tokens_per_device": 4624 }, { "epoch": 0.028, "loss_ce": 0.7626699209213257, "loss_lvr": 1.6358028650283813, "loss_mode_switch": 0.0, "loss_total": 0.9262502193450928, "step": 70 }, { "batch_size": 4, "epoch": 0.028, "step": 70, "tokens_per_device": 3940 }, { "epoch": 0.028, "loss_ce": 0.6085715293884277, "loss_lvr": 2.442777395248413, "loss_mode_switch": 0.0, "loss_total": 0.8528492450714111, "step": 70 }, { "epoch": 0.0284, "grad_norm": 8.884767532348633, "learning_rate": 9.466666666666667e-06, "loss": 0.8817, "step": 71 }, { "batch_size": 4, "epoch": 0.0284, "step": 71, "tokens_per_device": 3756 }, { "epoch": 0.0284, "loss_ce": 0.7017057538032532, "loss_lvr": 2.18902587890625, "loss_mode_switch": 0.0, "loss_total": 0.9206083416938782, "step": 71 }, { "batch_size": 1, "epoch": 0.0284, "step": 71, "tokens_per_device": 4894 }, { "epoch": 0.0284, "loss_ce": 0.4511725902557373, "loss_lvr": 2.9884755611419678, "loss_mode_switch": 0.0, "loss_total": 0.7500201463699341, "step": 71 }, { "batch_size": 4, "epoch": 0.0284, "step": 71, "tokens_per_device": 2692 }, { "epoch": 0.0284, "loss_ce": 0.7078050971031189, "loss_lvr": 1.7551214694976807, "loss_mode_switch": 0.0, "loss_total": 0.883317232131958, "step": 71 }, { "batch_size": 1, "epoch": 0.0284, "step": 71, "tokens_per_device": 4889 }, { "epoch": 0.0284, "loss_ce": 0.48902180790901184, "loss_lvr": 1.2530337572097778, "loss_mode_switch": 0.0, "loss_total": 0.6143251657485962, "step": 71 }, { "batch_size": 1, "epoch": 0.0284, "step": 71, "tokens_per_device": 5156 }, { "epoch": 0.0284, "loss_ce": 0.6281497478485107, "loss_lvr": 1.0298126935958862, "loss_mode_switch": 0.0, "loss_total": 0.7311310172080994, "step": 71 }, { "batch_size": 4, "epoch": 0.0284, "step": 71, "tokens_per_device": 4200 }, { "epoch": 0.0284, "loss_ce": 0.8033914566040039, "loss_lvr": 2.1314215660095215, "loss_mode_switch": 0.0, "loss_total": 1.016533613204956, "step": 71 }, { "batch_size": 4, "epoch": 0.0284, "step": 71, "tokens_per_device": 13592 }, { "epoch": 0.0284, "loss_ce": 0.6353290677070618, "loss_lvr": 1.7393288612365723, "loss_mode_switch": 0.0, "loss_total": 0.8092619776725769, "step": 71 }, { "batch_size": 1, "epoch": 0.0284, "step": 71, "tokens_per_device": 5111 }, { "epoch": 0.0284, "loss_ce": 0.23644261062145233, "loss_lvr": 1.8197898864746094, "loss_mode_switch": 0.0, "loss_total": 0.41842159628868103, "step": 71 }, { "epoch": 0.0288, "grad_norm": 7.696626663208008, "learning_rate": 9.600000000000001e-06, "loss": 0.8346, "step": 72 }, { "batch_size": 4, "epoch": 0.0288, "step": 72, "tokens_per_device": 4224 }, { "epoch": 0.0288, "loss_ce": 0.731002151966095, "loss_lvr": 2.112558126449585, "loss_mode_switch": 0.0, "loss_total": 0.9422580003738403, "step": 72 }, { "batch_size": 1, "epoch": 0.0288, "step": 72, "tokens_per_device": 4940 }, { "epoch": 0.0288, "loss_ce": 0.5787251591682434, "loss_lvr": 1.4553306102752686, "loss_mode_switch": 0.0, "loss_total": 0.7242582440376282, "step": 72 }, { "batch_size": 4, "epoch": 0.0288, "step": 72, "tokens_per_device": 1492 }, { "epoch": 0.0288, "loss_ce": 0.6831899881362915, "loss_lvr": 1.6248489618301392, "loss_mode_switch": 0.0, "loss_total": 0.8456748723983765, "step": 72 }, { "batch_size": 4, "epoch": 0.0288, "step": 72, "tokens_per_device": 4316 }, { "epoch": 0.0288, "loss_ce": 0.5884525775909424, "loss_lvr": 1.475226640701294, "loss_mode_switch": 0.0, "loss_total": 0.7359752655029297, "step": 72 }, { "batch_size": 4, "epoch": 0.0288, "step": 72, "tokens_per_device": 4392 }, { "epoch": 0.0288, "loss_ce": 0.7539697885513306, "loss_lvr": 1.6166471242904663, "loss_mode_switch": 0.0, "loss_total": 0.9156345129013062, "step": 72 }, { "batch_size": 4, "epoch": 0.0288, "step": 72, "tokens_per_device": 1664 }, { "epoch": 0.0288, "loss_ce": 0.8255460262298584, "loss_lvr": 2.143564462661743, "loss_mode_switch": 0.0, "loss_total": 1.0399024486541748, "step": 72 }, { "batch_size": 1, "epoch": 0.0288, "step": 72, "tokens_per_device": 5216 }, { "epoch": 0.0288, "loss_ce": 0.6742200255393982, "loss_lvr": 1.2491133213043213, "loss_mode_switch": 0.0, "loss_total": 0.7991313338279724, "step": 72 }, { "batch_size": 4, "epoch": 0.0288, "step": 72, "tokens_per_device": 3804 }, { "epoch": 0.0288, "loss_ce": 0.7682181000709534, "loss_lvr": 1.6952064037322998, "loss_mode_switch": 0.0, "loss_total": 0.9377387762069702, "step": 72 }, { "epoch": 0.0292, "grad_norm": 7.061488151550293, "learning_rate": 9.733333333333334e-06, "loss": 0.8711, "step": 73 }, { "batch_size": 4, "epoch": 0.0292, "step": 73, "tokens_per_device": 4624 }, { "epoch": 0.0292, "loss_ce": 0.42956265807151794, "loss_lvr": 2.3495981693267822, "loss_mode_switch": 0.0, "loss_total": 0.6645224690437317, "step": 73 }, { "batch_size": 4, "epoch": 0.0292, "step": 73, "tokens_per_device": 5240 }, { "epoch": 0.0292, "loss_ce": 0.5770570635795593, "loss_lvr": 1.7468347549438477, "loss_mode_switch": 0.0, "loss_total": 0.751740574836731, "step": 73 }, { "batch_size": 4, "epoch": 0.0292, "step": 73, "tokens_per_device": 4140 }, { "epoch": 0.0292, "loss_ce": 0.6063104271888733, "loss_lvr": 3.152923583984375, "loss_mode_switch": 0.0, "loss_total": 0.9216027855873108, "step": 73 }, { "batch_size": 1, "epoch": 0.0292, "step": 73, "tokens_per_device": 4206 }, { "epoch": 0.0292, "loss_ce": 0.22169169783592224, "loss_lvr": 2.7890119552612305, "loss_mode_switch": 0.0, "loss_total": 0.5005928874015808, "step": 73 }, { "batch_size": 1, "epoch": 0.0292, "step": 73, "tokens_per_device": 4608 }, { "epoch": 0.0292, "loss_ce": 0.5268898606300354, "loss_lvr": 0.8793036937713623, "loss_mode_switch": 0.0, "loss_total": 0.6148202419281006, "step": 73 }, { "batch_size": 4, "epoch": 0.0292, "step": 73, "tokens_per_device": 4192 }, { "epoch": 0.0292, "loss_ce": 0.7138200998306274, "loss_lvr": 1.7653127908706665, "loss_mode_switch": 0.0, "loss_total": 0.890351414680481, "step": 73 }, { "batch_size": 4, "epoch": 0.0292, "step": 73, "tokens_per_device": 5116 }, { "epoch": 0.0292, "loss_ce": 0.642427384853363, "loss_lvr": 1.8411985635757446, "loss_mode_switch": 0.0, "loss_total": 0.8265472650527954, "step": 73 }, { "batch_size": 4, "epoch": 0.0292, "step": 73, "tokens_per_device": 4620 }, { "epoch": 0.0292, "loss_ce": 0.7352773547172546, "loss_lvr": 1.755401611328125, "loss_mode_switch": 0.0, "loss_total": 0.9108175039291382, "step": 73 }, { "epoch": 0.0296, "grad_norm": 7.399606227874756, "learning_rate": 9.866666666666668e-06, "loss": 0.8445, "step": 74 }, { "batch_size": 4, "epoch": 0.0296, "step": 74, "tokens_per_device": 6024 }, { "epoch": 0.0296, "loss_ce": 0.5574091672897339, "loss_lvr": 1.6339186429977417, "loss_mode_switch": 0.0, "loss_total": 0.720801055431366, "step": 74 }, { "batch_size": 4, "epoch": 0.0296, "step": 74, "tokens_per_device": 2648 }, { "epoch": 0.0296, "loss_ce": 0.5591445565223694, "loss_lvr": 1.7957544326782227, "loss_mode_switch": 0.0, "loss_total": 0.7387199997901917, "step": 74 }, { "batch_size": 4, "epoch": 0.0296, "step": 74, "tokens_per_device": 4508 }, { "epoch": 0.0296, "loss_ce": 0.5932141542434692, "loss_lvr": 1.9211771488189697, "loss_mode_switch": 0.0, "loss_total": 0.7853318452835083, "step": 74 }, { "batch_size": 1, "epoch": 0.0296, "step": 74, "tokens_per_device": 5145 }, { "epoch": 0.0296, "loss_ce": 0.30272582173347473, "loss_lvr": 2.5522351264953613, "loss_mode_switch": 0.0, "loss_total": 0.5579493045806885, "step": 74 }, { "batch_size": 4, "epoch": 0.0296, "step": 74, "tokens_per_device": 2696 }, { "epoch": 0.0296, "loss_ce": 0.6150831580162048, "loss_lvr": 1.4995986223220825, "loss_mode_switch": 0.0, "loss_total": 0.7650430202484131, "step": 74 }, { "batch_size": 1, "epoch": 0.0296, "step": 74, "tokens_per_device": 4743 }, { "epoch": 0.0296, "loss_ce": 0.4438636600971222, "loss_lvr": 0.6520296335220337, "loss_mode_switch": 0.0, "loss_total": 0.509066641330719, "step": 74 }, { "batch_size": 4, "epoch": 0.0296, "step": 74, "tokens_per_device": 4716 }, { "epoch": 0.0296, "loss_ce": 1.0638267993927002, "loss_lvr": 1.1154321432113647, "loss_mode_switch": 0.0, "loss_total": 1.1753699779510498, "step": 74 }, { "batch_size": 1, "epoch": 0.0296, "step": 74, "tokens_per_device": 5100 }, { "epoch": 0.0296, "loss_ce": 0.415216326713562, "loss_lvr": 0.6047829985618591, "loss_mode_switch": 0.0, "loss_total": 0.4756946265697479, "step": 74 }, { "epoch": 0.03, "grad_norm": 6.7716383934021, "learning_rate": 1e-05, "loss": 0.8118, "step": 75 }, { "batch_size": 1, "epoch": 0.03, "step": 75, "tokens_per_device": 5386 }, { "epoch": 0.03, "loss_ce": 0.45915547013282776, "loss_lvr": 1.6681256294250488, "loss_mode_switch": 0.0, "loss_total": 0.6259680390357971, "step": 75 }, { "batch_size": 4, "epoch": 0.03, "step": 75, "tokens_per_device": 4208 }, { "epoch": 0.03, "loss_ce": 0.9603639245033264, "loss_lvr": 1.5600268840789795, "loss_mode_switch": 0.0, "loss_total": 1.1163666248321533, "step": 75 }, { "batch_size": 4, "epoch": 0.03, "step": 75, "tokens_per_device": 6884 }, { "epoch": 0.03, "loss_ce": 0.586793065071106, "loss_lvr": 2.579874038696289, "loss_mode_switch": 0.0, "loss_total": 0.844780445098877, "step": 75 }, { "batch_size": 4, "epoch": 0.03, "step": 75, "tokens_per_device": 1848 }, { "epoch": 0.03, "loss_ce": 0.4782622456550598, "loss_lvr": 4.2075042724609375, "loss_mode_switch": 0.0, "loss_total": 0.8990126848220825, "step": 75 }, { "batch_size": 4, "epoch": 0.03, "step": 75, "tokens_per_device": 3888 }, { "epoch": 0.03, "loss_ce": 0.7705168724060059, "loss_lvr": 1.937334656715393, "loss_mode_switch": 0.0, "loss_total": 0.9642503261566162, "step": 75 }, { "batch_size": 4, "epoch": 0.03, "step": 75, "tokens_per_device": 3744 }, { "epoch": 0.03, "loss_ce": 0.6754806041717529, "loss_lvr": 3.15075945854187, "loss_mode_switch": 0.0, "loss_total": 0.9905565977096558, "step": 75 }, { "batch_size": 1, "epoch": 0.03, "step": 75, "tokens_per_device": 4890 }, { "epoch": 0.03, "loss_ce": 0.5337501168251038, "loss_lvr": 0.8702822327613831, "loss_mode_switch": 0.0, "loss_total": 0.6207783222198486, "step": 75 }, { "batch_size": 1, "epoch": 0.03, "step": 75, "tokens_per_device": 4894 }, { "epoch": 0.03, "loss_ce": 0.16072840988636017, "loss_lvr": 2.458223819732666, "loss_mode_switch": 0.0, "loss_total": 0.406550794839859, "step": 75 }, { "epoch": 0.0304, "grad_norm": 6.841969013214111, "learning_rate": 9.999995804186196e-06, "loss": 0.8081, "step": 76 }, { "batch_size": 4, "epoch": 0.0304, "step": 76, "tokens_per_device": 4808 }, { "epoch": 0.0304, "loss_ce": 0.6355700492858887, "loss_lvr": 1.4080345630645752, "loss_mode_switch": 0.0, "loss_total": 0.7763735055923462, "step": 76 }, { "batch_size": 1, "epoch": 0.0304, "step": 76, "tokens_per_device": 5149 }, { "epoch": 0.0304, "loss_ce": 0.5863285660743713, "loss_lvr": 0.7762887477874756, "loss_mode_switch": 0.0, "loss_total": 0.663957417011261, "step": 76 }, { "batch_size": 4, "epoch": 0.0304, "step": 76, "tokens_per_device": 2616 }, { "epoch": 0.0304, "loss_ce": 0.8360564112663269, "loss_lvr": 1.378832221031189, "loss_mode_switch": 0.0, "loss_total": 0.9739396572113037, "step": 76 }, { "batch_size": 4, "epoch": 0.0304, "step": 76, "tokens_per_device": 4212 }, { "epoch": 0.0304, "loss_ce": 0.6357665061950684, "loss_lvr": 2.1565210819244385, "loss_mode_switch": 0.0, "loss_total": 0.8514186143875122, "step": 76 }, { "batch_size": 1, "epoch": 0.0304, "step": 76, "tokens_per_device": 4905 }, { "epoch": 0.0304, "loss_ce": 0.4210057258605957, "loss_lvr": 1.4183437824249268, "loss_mode_switch": 0.0, "loss_total": 0.5628401041030884, "step": 76 }, { "batch_size": 1, "epoch": 0.0304, "step": 76, "tokens_per_device": 5127 }, { "epoch": 0.0304, "loss_ce": 0.3774911165237427, "loss_lvr": 0.9072815179824829, "loss_mode_switch": 0.0, "loss_total": 0.4682192802429199, "step": 76 }, { "batch_size": 4, "epoch": 0.0304, "step": 76, "tokens_per_device": 6756 }, { "epoch": 0.0304, "loss_ce": 0.4312227666378021, "loss_lvr": 1.6403294801712036, "loss_mode_switch": 0.0, "loss_total": 0.5952557325363159, "step": 76 }, { "batch_size": 4, "epoch": 0.0304, "step": 76, "tokens_per_device": 4436 }, { "epoch": 0.0304, "loss_ce": 0.5612396001815796, "loss_lvr": 1.447453498840332, "loss_mode_switch": 0.0, "loss_total": 0.7059849500656128, "step": 76 }, { "epoch": 0.0308, "grad_norm": 6.663931846618652, "learning_rate": 9.999983216751826e-06, "loss": 0.7797, "step": 77 }, { "batch_size": 1, "epoch": 0.0308, "step": 77, "tokens_per_device": 5182 }, { "epoch": 0.0308, "loss_ce": 0.35237789154052734, "loss_lvr": 1.508320689201355, "loss_mode_switch": 0.0, "loss_total": 0.5032099485397339, "step": 77 }, { "batch_size": 4, "epoch": 0.0308, "step": 77, "tokens_per_device": 4228 }, { "epoch": 0.0308, "loss_ce": 0.7686225175857544, "loss_lvr": 1.5850460529327393, "loss_mode_switch": 0.0, "loss_total": 0.9271271228790283, "step": 77 }, { "batch_size": 4, "epoch": 0.0308, "step": 77, "tokens_per_device": 2776 }, { "epoch": 0.0308, "loss_ce": 0.92481529712677, "loss_lvr": 1.9568361043930054, "loss_mode_switch": 0.0, "loss_total": 1.1204988956451416, "step": 77 }, { "batch_size": 4, "epoch": 0.0308, "step": 77, "tokens_per_device": 5920 }, { "epoch": 0.0308, "loss_ce": 0.7636944651603699, "loss_lvr": 1.705396294593811, "loss_mode_switch": 0.0, "loss_total": 0.934234082698822, "step": 77 }, { "batch_size": 1, "epoch": 0.0308, "step": 77, "tokens_per_device": 6381 }, { "epoch": 0.0308, "loss_ce": 0.3685443699359894, "loss_lvr": 1.625568151473999, "loss_mode_switch": 0.0, "loss_total": 0.5311011672019958, "step": 77 }, { "batch_size": 1, "epoch": 0.0308, "step": 77, "tokens_per_device": 6284 }, { "epoch": 0.0308, "loss_ce": 0.3239614963531494, "loss_lvr": 2.0919647216796875, "loss_mode_switch": 0.0, "loss_total": 0.5331579446792603, "step": 77 }, { "batch_size": 1, "epoch": 0.0308, "step": 77, "tokens_per_device": 5046 }, { "epoch": 0.0308, "loss_ce": 0.3095570206642151, "loss_lvr": 5.253859043121338, "loss_mode_switch": 0.0, "loss_total": 0.8349429368972778, "step": 77 }, { "batch_size": 4, "epoch": 0.0308, "step": 77, "tokens_per_device": 2548 }, { "epoch": 0.0308, "loss_ce": 0.5669950246810913, "loss_lvr": 3.567349433898926, "loss_mode_switch": 0.0, "loss_total": 0.9237300157546997, "step": 77 }, { "epoch": 0.0312, "grad_norm": 6.966574192047119, "learning_rate": 9.999962237718015e-06, "loss": 0.733, "step": 78 }, { "batch_size": 1, "epoch": 0.0312, "step": 78, "tokens_per_device": 5104 }, { "epoch": 0.0312, "loss_ce": 0.33204248547554016, "loss_lvr": 1.2542543411254883, "loss_mode_switch": 0.0, "loss_total": 0.4574679136276245, "step": 78 }, { "batch_size": 4, "epoch": 0.0312, "step": 78, "tokens_per_device": 3880 }, { "epoch": 0.0312, "loss_ce": 0.6837755441665649, "loss_lvr": 1.8689466714859009, "loss_mode_switch": 0.0, "loss_total": 0.8706701993942261, "step": 78 }, { "batch_size": 4, "epoch": 0.0312, "step": 78, "tokens_per_device": 4252 }, { "epoch": 0.0312, "loss_ce": 0.4231167137622833, "loss_lvr": 2.3235690593719482, "loss_mode_switch": 0.0, "loss_total": 0.6554735898971558, "step": 78 }, { "batch_size": 1, "epoch": 0.0312, "step": 78, "tokens_per_device": 4867 }, { "epoch": 0.0312, "loss_ce": 0.6386697292327881, "loss_lvr": 0.8181575536727905, "loss_mode_switch": 0.0, "loss_total": 0.720485508441925, "step": 78 }, { "batch_size": 1, "epoch": 0.0312, "step": 78, "tokens_per_device": 6946 }, { "epoch": 0.0312, "loss_ce": 0.26542550325393677, "loss_lvr": 2.0715527534484863, "loss_mode_switch": 0.0, "loss_total": 0.47258079051971436, "step": 78 }, { "batch_size": 1, "epoch": 0.0312, "step": 78, "tokens_per_device": 4879 }, { "epoch": 0.0312, "loss_ce": 0.1620083749294281, "loss_lvr": 2.684385061264038, "loss_mode_switch": 0.0, "loss_total": 0.43044689297676086, "step": 78 }, { "batch_size": 4, "epoch": 0.0312, "step": 78, "tokens_per_device": 2552 }, { "epoch": 0.0312, "loss_ce": 0.8867163062095642, "loss_lvr": 3.0807645320892334, "loss_mode_switch": 0.0, "loss_total": 1.1947927474975586, "step": 78 }, { "batch_size": 4, "epoch": 0.0312, "step": 78, "tokens_per_device": 5344 }, { "epoch": 0.0312, "loss_ce": 0.4574572443962097, "loss_lvr": 2.6030728816986084, "loss_mode_switch": 0.0, "loss_total": 0.7177644968032837, "step": 78 }, { "epoch": 0.0316, "grad_norm": 7.291773796081543, "learning_rate": 9.999932867119974e-06, "loss": 0.7449, "step": 79 }, { "batch_size": 4, "epoch": 0.0316, "step": 79, "tokens_per_device": 4096 }, { "epoch": 0.0316, "loss_ce": 0.48413196206092834, "loss_lvr": 1.8195246458053589, "loss_mode_switch": 0.0, "loss_total": 0.6660844087600708, "step": 79 }, { "batch_size": 4, "epoch": 0.0316, "step": 79, "tokens_per_device": 1220 }, { "epoch": 0.0316, "loss_ce": 0.8976937532424927, "loss_lvr": 1.35975980758667, "loss_mode_switch": 0.0, "loss_total": 1.0336697101593018, "step": 79 }, { "batch_size": 4, "epoch": 0.0316, "step": 79, "tokens_per_device": 4056 }, { "epoch": 0.0316, "loss_ce": 0.48172780871391296, "loss_lvr": 1.6568886041641235, "loss_mode_switch": 0.0, "loss_total": 0.6474166512489319, "step": 79 }, { "batch_size": 4, "epoch": 0.0316, "step": 79, "tokens_per_device": 3932 }, { "epoch": 0.0316, "loss_ce": 0.6498116254806519, "loss_lvr": 2.3599302768707275, "loss_mode_switch": 0.0, "loss_total": 0.8858046531677246, "step": 79 }, { "batch_size": 4, "epoch": 0.0316, "step": 79, "tokens_per_device": 2720 }, { "epoch": 0.0316, "loss_ce": 1.0407203435897827, "loss_lvr": 1.3323702812194824, "loss_mode_switch": 0.0, "loss_total": 1.173957347869873, "step": 79 }, { "batch_size": 1, "epoch": 0.0316, "step": 79, "tokens_per_device": 5099 }, { "epoch": 0.0316, "loss_ce": 0.6647959351539612, "loss_lvr": 0.4390821158885956, "loss_mode_switch": 0.0, "loss_total": 0.7087041735649109, "step": 79 }, { "batch_size": 4, "epoch": 0.0316, "step": 79, "tokens_per_device": 4228 }, { "epoch": 0.0316, "loss_ce": 0.6416015028953552, "loss_lvr": 1.6915448904037476, "loss_mode_switch": 0.0, "loss_total": 0.8107559680938721, "step": 79 }, { "batch_size": 1, "epoch": 0.0316, "step": 79, "tokens_per_device": 4891 }, { "epoch": 0.0316, "loss_ce": 0.41095277667045593, "loss_lvr": 0.5937973260879517, "loss_mode_switch": 0.0, "loss_total": 0.4703325033187866, "step": 79 }, { "epoch": 0.032, "grad_norm": 6.593417167663574, "learning_rate": 9.999895105006995e-06, "loss": 0.8088, "step": 80 }, { "batch_size": 4, "epoch": 0.032, "step": 80, "tokens_per_device": 4216 }, { "epoch": 0.032, "loss_ce": 0.9074881672859192, "loss_lvr": 2.224039316177368, "loss_mode_switch": 0.0, "loss_total": 1.129892110824585, "step": 80 }, { "batch_size": 4, "epoch": 0.032, "step": 80, "tokens_per_device": 4276 }, { "epoch": 0.032, "loss_ce": 0.35065600275993347, "loss_lvr": 2.959343910217285, "loss_mode_switch": 0.0, "loss_total": 0.6465904116630554, "step": 80 }, { "batch_size": 4, "epoch": 0.032, "step": 80, "tokens_per_device": 2544 }, { "epoch": 0.032, "loss_ce": 0.7581744194030762, "loss_lvr": 1.8810992240905762, "loss_mode_switch": 0.0, "loss_total": 0.9462843537330627, "step": 80 }, { "batch_size": 1, "epoch": 0.032, "step": 80, "tokens_per_device": 5093 }, { "epoch": 0.032, "loss_ce": 0.21987193822860718, "loss_lvr": 1.8526161909103394, "loss_mode_switch": 0.0, "loss_total": 0.40513354539871216, "step": 80 }, { "batch_size": 1, "epoch": 0.032, "step": 80, "tokens_per_device": 4819 }, { "epoch": 0.032, "loss_ce": 0.43742886185646057, "loss_lvr": 0.9298247694969177, "loss_mode_switch": 0.0, "loss_total": 0.5304113626480103, "step": 80 }, { "batch_size": 4, "epoch": 0.032, "step": 80, "tokens_per_device": 3204 }, { "epoch": 0.032, "loss_ce": 0.5961324572563171, "loss_lvr": 2.8985276222229004, "loss_mode_switch": 0.0, "loss_total": 0.885985255241394, "step": 80 }, { "batch_size": 1, "epoch": 0.032, "step": 80, "tokens_per_device": 6073 }, { "epoch": 0.032, "loss_ce": 0.5203187465667725, "loss_lvr": 0.9842537641525269, "loss_mode_switch": 0.0, "loss_total": 0.6187441349029541, "step": 80 }, { "batch_size": 4, "epoch": 0.032, "step": 80, "tokens_per_device": 3344 }, { "epoch": 0.032, "loss_ce": 1.0042356252670288, "loss_lvr": 1.6597621440887451, "loss_mode_switch": 0.0, "loss_total": 1.1702117919921875, "step": 80 }, { "epoch": 0.0324, "grad_norm": 5.759615421295166, "learning_rate": 9.999848951442455e-06, "loss": 0.7037, "step": 81 }, { "batch_size": 4, "epoch": 0.0324, "step": 81, "tokens_per_device": 4220 }, { "epoch": 0.0324, "loss_ce": 0.28920164704322815, "loss_lvr": 3.0068891048431396, "loss_mode_switch": 0.0, "loss_total": 0.5898905992507935, "step": 81 }, { "batch_size": 1, "epoch": 0.0324, "step": 81, "tokens_per_device": 5012 }, { "epoch": 0.0324, "loss_ce": 0.378768652677536, "loss_lvr": 4.5435638427734375, "loss_mode_switch": 0.0, "loss_total": 0.8331250548362732, "step": 81 }, { "batch_size": 4, "epoch": 0.0324, "step": 81, "tokens_per_device": 1468 }, { "epoch": 0.0324, "loss_ce": 0.2861030101776123, "loss_lvr": 3.2040650844573975, "loss_mode_switch": 0.0, "loss_total": 0.6065095663070679, "step": 81 }, { "batch_size": 4, "epoch": 0.0324, "step": 81, "tokens_per_device": 3800 }, { "epoch": 0.0324, "loss_ce": 0.2948809862136841, "loss_lvr": 5.976624488830566, "loss_mode_switch": 0.0, "loss_total": 0.8925434350967407, "step": 81 }, { "batch_size": 4, "epoch": 0.0324, "step": 81, "tokens_per_device": 4228 }, { "epoch": 0.0324, "loss_ce": 0.38360244035720825, "loss_lvr": 3.3448586463928223, "loss_mode_switch": 0.0, "loss_total": 0.7180882692337036, "step": 81 }, { "batch_size": 1, "epoch": 0.0324, "step": 81, "tokens_per_device": 5043 }, { "epoch": 0.0324, "loss_ce": 0.5582140684127808, "loss_lvr": 3.2730371952056885, "loss_mode_switch": 0.0, "loss_total": 0.8855178356170654, "step": 81 }, { "batch_size": 4, "epoch": 0.0324, "step": 81, "tokens_per_device": 2732 }, { "epoch": 0.0324, "loss_ce": 0.2975939214229584, "loss_lvr": 3.9375102519989014, "loss_mode_switch": 0.0, "loss_total": 0.6913449764251709, "step": 81 }, { "batch_size": 1, "epoch": 0.0324, "step": 81, "tokens_per_device": 5117 }, { "epoch": 0.0324, "loss_ce": 0.3308752477169037, "loss_lvr": 1.53437340259552, "loss_mode_switch": 0.0, "loss_total": 0.48431259393692017, "step": 81 }, { "epoch": 0.0328, "grad_norm": 10.231213569641113, "learning_rate": 9.999794406503816e-06, "loss": 0.7799, "step": 82 }, { "batch_size": 4, "epoch": 0.0328, "step": 82, "tokens_per_device": 5612 }, { "epoch": 0.0328, "loss_ce": 0.7523844242095947, "loss_lvr": 1.2667200565338135, "loss_mode_switch": 0.0, "loss_total": 0.879056453704834, "step": 82 }, { "batch_size": 4, "epoch": 0.0328, "step": 82, "tokens_per_device": 4232 }, { "epoch": 0.0328, "loss_ce": 0.4042372703552246, "loss_lvr": 2.1058011054992676, "loss_mode_switch": 0.0, "loss_total": 0.6148173809051514, "step": 82 }, { "batch_size": 4, "epoch": 0.0328, "step": 82, "tokens_per_device": 6224 }, { "epoch": 0.0328, "loss_ce": 0.3518332839012146, "loss_lvr": 1.4329402446746826, "loss_mode_switch": 0.0, "loss_total": 0.4951273202896118, "step": 82 }, { "batch_size": 4, "epoch": 0.0328, "step": 82, "tokens_per_device": 5076 }, { "epoch": 0.0328, "loss_ce": 0.6346496939659119, "loss_lvr": 1.678019642829895, "loss_mode_switch": 0.0, "loss_total": 0.8024516701698303, "step": 82 }, { "batch_size": 1, "epoch": 0.0328, "step": 82, "tokens_per_device": 5921 }, { "epoch": 0.0328, "loss_ce": 0.8985313177108765, "loss_lvr": 0.8603888154029846, "loss_mode_switch": 0.0, "loss_total": 0.9845702052116394, "step": 82 }, { "batch_size": 4, "epoch": 0.0328, "step": 82, "tokens_per_device": 4712 }, { "epoch": 0.0328, "loss_ce": 0.8808544278144836, "loss_lvr": 1.6079177856445312, "loss_mode_switch": 0.0, "loss_total": 1.0416462421417236, "step": 82 }, { "batch_size": 4, "epoch": 0.0328, "step": 82, "tokens_per_device": 3036 }, { "epoch": 0.0328, "loss_ce": 0.5926546454429626, "loss_lvr": 2.550222158432007, "loss_mode_switch": 0.0, "loss_total": 0.8476768732070923, "step": 82 }, { "batch_size": 4, "epoch": 0.0328, "step": 82, "tokens_per_device": 3376 }, { "epoch": 0.0328, "loss_ce": 0.4255739450454712, "loss_lvr": 2.1106436252593994, "loss_mode_switch": 0.0, "loss_total": 0.6366382837295532, "step": 82 }, { "epoch": 0.0332, "grad_norm": 5.844801425933838, "learning_rate": 9.999731470282621e-06, "loss": 0.7449, "step": 83 }, { "batch_size": 1, "epoch": 0.0332, "step": 83, "tokens_per_device": 5171 }, { "epoch": 0.0332, "loss_ce": 0.252173513174057, "loss_lvr": 1.6055546998977661, "loss_mode_switch": 0.0, "loss_total": 0.4127289652824402, "step": 83 }, { "batch_size": 1, "epoch": 0.0332, "step": 83, "tokens_per_device": 4104 }, { "epoch": 0.0332, "loss_ce": 0.11793775856494904, "loss_lvr": 0.9955319762229919, "loss_mode_switch": 0.0, "loss_total": 0.21749095618724823, "step": 83 }, { "batch_size": 4, "epoch": 0.0332, "step": 83, "tokens_per_device": 6112 }, { "epoch": 0.0332, "loss_ce": 0.6049079298973083, "loss_lvr": 1.6532460451126099, "loss_mode_switch": 0.0, "loss_total": 0.7702325582504272, "step": 83 }, { "batch_size": 1, "epoch": 0.0332, "step": 83, "tokens_per_device": 4863 }, { "epoch": 0.0332, "loss_ce": 0.5941524505615234, "loss_lvr": 0.6066868305206299, "loss_mode_switch": 0.0, "loss_total": 0.6548211574554443, "step": 83 }, { "batch_size": 4, "epoch": 0.0332, "step": 83, "tokens_per_device": 10024 }, { "epoch": 0.0332, "loss_ce": 0.4804646670818329, "loss_lvr": 1.5871524810791016, "loss_mode_switch": 0.0, "loss_total": 0.6391799449920654, "step": 83 }, { "batch_size": 4, "epoch": 0.0332, "step": 83, "tokens_per_device": 4508 }, { "epoch": 0.0332, "loss_ce": 0.4901958405971527, "loss_lvr": 1.5516047477722168, "loss_mode_switch": 0.0, "loss_total": 0.645356297492981, "step": 83 }, { "batch_size": 4, "epoch": 0.0332, "step": 83, "tokens_per_device": 4048 }, { "epoch": 0.0332, "loss_ce": 0.4192371368408203, "loss_lvr": 3.519437789916992, "loss_mode_switch": 0.0, "loss_total": 0.7711809277534485, "step": 83 }, { "batch_size": 4, "epoch": 0.0332, "step": 83, "tokens_per_device": 4480 }, { "epoch": 0.0332, "loss_ce": 0.5241775512695312, "loss_lvr": 2.2072486877441406, "loss_mode_switch": 0.0, "loss_total": 0.7449024319648743, "step": 83 }, { "epoch": 0.0336, "grad_norm": 5.385040283203125, "learning_rate": 9.9996601428845e-06, "loss": 0.6879, "step": 84 }, { "batch_size": 4, "epoch": 0.0336, "step": 84, "tokens_per_device": 4048 }, { "epoch": 0.0336, "loss_ce": 0.5884101986885071, "loss_lvr": 2.815314769744873, "loss_mode_switch": 0.0, "loss_total": 0.8699417114257812, "step": 84 }, { "batch_size": 4, "epoch": 0.0336, "step": 84, "tokens_per_device": 2100 }, { "epoch": 0.0336, "loss_ce": 0.4936203062534332, "loss_lvr": 2.7409377098083496, "loss_mode_switch": 0.0, "loss_total": 0.7677140831947327, "step": 84 }, { "batch_size": 4, "epoch": 0.0336, "step": 84, "tokens_per_device": 3220 }, { "epoch": 0.0336, "loss_ce": 0.3243347406387329, "loss_lvr": 2.6272530555725098, "loss_mode_switch": 0.0, "loss_total": 0.5870600938796997, "step": 84 }, { "batch_size": 1, "epoch": 0.0336, "step": 84, "tokens_per_device": 5856 }, { "epoch": 0.0336, "loss_ce": 0.4528890550136566, "loss_lvr": 3.437004804611206, "loss_mode_switch": 0.0, "loss_total": 0.7965895533561707, "step": 84 }, { "batch_size": 1, "epoch": 0.0336, "step": 84, "tokens_per_device": 4890 }, { "epoch": 0.0336, "loss_ce": 0.6680103540420532, "loss_lvr": 1.201791763305664, "loss_mode_switch": 0.0, "loss_total": 0.7881895303726196, "step": 84 }, { "batch_size": 1, "epoch": 0.0336, "step": 84, "tokens_per_device": 4893 }, { "epoch": 0.0336, "loss_ce": 0.3258354365825653, "loss_lvr": 1.4232995510101318, "loss_mode_switch": 0.0, "loss_total": 0.46816539764404297, "step": 84 }, { "batch_size": 4, "epoch": 0.0336, "step": 84, "tokens_per_device": 3344 }, { "epoch": 0.0336, "loss_ce": 0.4595847427845001, "loss_lvr": 2.3107452392578125, "loss_mode_switch": 0.0, "loss_total": 0.6906592845916748, "step": 84 }, { "batch_size": 1, "epoch": 0.0336, "step": 84, "tokens_per_device": 5153 }, { "epoch": 0.0336, "loss_ce": 0.12106938660144806, "loss_lvr": 1.4701186418533325, "loss_mode_switch": 0.0, "loss_total": 0.2680812478065491, "step": 84 }, { "epoch": 0.034, "grad_norm": 6.753615856170654, "learning_rate": 9.99958042442916e-06, "loss": 0.638, "step": 85 }, { "batch_size": 4, "epoch": 0.034, "step": 85, "tokens_per_device": 4176 }, { "epoch": 0.034, "loss_ce": 0.5043949484825134, "loss_lvr": 3.116124391555786, "loss_mode_switch": 0.0, "loss_total": 0.8160073757171631, "step": 85 }, { "batch_size": 4, "epoch": 0.034, "step": 85, "tokens_per_device": 5352 }, { "epoch": 0.034, "loss_ce": 0.6801652312278748, "loss_lvr": 1.7805451154708862, "loss_mode_switch": 0.0, "loss_total": 0.8582197427749634, "step": 85 }, { "batch_size": 1, "epoch": 0.034, "step": 85, "tokens_per_device": 6162 }, { "epoch": 0.034, "loss_ce": 0.28232359886169434, "loss_lvr": 1.2960352897644043, "loss_mode_switch": 0.0, "loss_total": 0.41192713379859924, "step": 85 }, { "batch_size": 4, "epoch": 0.034, "step": 85, "tokens_per_device": 3872 }, { "epoch": 0.034, "loss_ce": 0.186622753739357, "loss_lvr": 2.388551950454712, "loss_mode_switch": 0.0, "loss_total": 0.4254779517650604, "step": 85 }, { "batch_size": 4, "epoch": 0.034, "step": 85, "tokens_per_device": 4592 }, { "epoch": 0.034, "loss_ce": 0.6066240072250366, "loss_lvr": 2.5237510204315186, "loss_mode_switch": 0.0, "loss_total": 0.8589991331100464, "step": 85 }, { "batch_size": 1, "epoch": 0.034, "step": 85, "tokens_per_device": 4940 }, { "epoch": 0.034, "loss_ce": 0.2411375641822815, "loss_lvr": 1.9798784255981445, "loss_mode_switch": 0.0, "loss_total": 0.4391254186630249, "step": 85 }, { "batch_size": 1, "epoch": 0.034, "step": 85, "tokens_per_device": 4674 }, { "epoch": 0.034, "loss_ce": 0.21516254544258118, "loss_lvr": 2.853593587875366, "loss_mode_switch": 0.0, "loss_total": 0.5005218982696533, "step": 85 }, { "batch_size": 4, "epoch": 0.034, "step": 85, "tokens_per_device": 1244 }, { "epoch": 0.034, "loss_ce": 0.2722875773906708, "loss_lvr": 4.1852521896362305, "loss_mode_switch": 0.0, "loss_total": 0.6908128261566162, "step": 85 }, { "epoch": 0.0344, "grad_norm": 6.898719787597656, "learning_rate": 9.999492315050396e-06, "loss": 0.6782, "step": 86 }, { "batch_size": 4, "epoch": 0.0344, "step": 86, "tokens_per_device": 4244 }, { "epoch": 0.0344, "loss_ce": 0.9673451781272888, "loss_lvr": 1.511861801147461, "loss_mode_switch": 0.0, "loss_total": 1.118531346321106, "step": 86 }, { "batch_size": 1, "epoch": 0.0344, "step": 86, "tokens_per_device": 5179 }, { "epoch": 0.0344, "loss_ce": 0.1787770390510559, "loss_lvr": 1.8156108856201172, "loss_mode_switch": 0.0, "loss_total": 0.36033815145492554, "step": 86 }, { "batch_size": 4, "epoch": 0.0344, "step": 86, "tokens_per_device": 12436 }, { "epoch": 0.0344, "loss_ce": 0.327952116727829, "loss_lvr": 1.9199409484863281, "loss_mode_switch": 0.0, "loss_total": 0.5199462175369263, "step": 86 }, { "batch_size": 4, "epoch": 0.0344, "step": 86, "tokens_per_device": 8320 }, { "epoch": 0.0344, "loss_ce": 0.4958612024784088, "loss_lvr": 1.1792640686035156, "loss_mode_switch": 0.0, "loss_total": 0.6137875914573669, "step": 86 }, { "batch_size": 4, "epoch": 0.0344, "step": 86, "tokens_per_device": 2824 }, { "epoch": 0.0344, "loss_ce": 0.4953419268131256, "loss_lvr": 1.780022144317627, "loss_mode_switch": 0.0, "loss_total": 0.6733441352844238, "step": 86 }, { "batch_size": 4, "epoch": 0.0344, "step": 86, "tokens_per_device": 2528 }, { "epoch": 0.0344, "loss_ce": 0.6057656407356262, "loss_lvr": 2.6970510482788086, "loss_mode_switch": 0.0, "loss_total": 0.875470757484436, "step": 86 }, { "batch_size": 4, "epoch": 0.0344, "step": 86, "tokens_per_device": 5652 }, { "epoch": 0.0344, "loss_ce": 0.6160231828689575, "loss_lvr": 1.195749282836914, "loss_mode_switch": 0.0, "loss_total": 0.735598087310791, "step": 86 }, { "batch_size": 1, "epoch": 0.0344, "step": 86, "tokens_per_device": 4931 }, { "epoch": 0.0344, "loss_ce": 0.8695595860481262, "loss_lvr": 2.288452625274658, "loss_mode_switch": 0.0, "loss_total": 1.098404884338379, "step": 86 }, { "epoch": 0.0348, "grad_norm": 5.220694541931152, "learning_rate": 9.999395814896086e-06, "loss": 0.6982, "step": 87 }, { "batch_size": 4, "epoch": 0.0348, "step": 87, "tokens_per_device": 6308 }, { "epoch": 0.0348, "loss_ce": 0.5254894495010376, "loss_lvr": 1.3360319137573242, "loss_mode_switch": 0.0, "loss_total": 0.6590926647186279, "step": 87 }, { "batch_size": 4, "epoch": 0.0348, "step": 87, "tokens_per_device": 4216 }, { "epoch": 0.0348, "loss_ce": 0.4651297330856323, "loss_lvr": 1.2349294424057007, "loss_mode_switch": 0.0, "loss_total": 0.5886226892471313, "step": 87 }, { "batch_size": 1, "epoch": 0.0348, "step": 87, "tokens_per_device": 5248 }, { "epoch": 0.0348, "loss_ce": 0.34411540627479553, "loss_lvr": 1.3077391386032104, "loss_mode_switch": 0.0, "loss_total": 0.47488933801651, "step": 87 }, { "batch_size": 4, "epoch": 0.0348, "step": 87, "tokens_per_device": 4192 }, { "epoch": 0.0348, "loss_ce": 0.44199490547180176, "loss_lvr": 0.8201443552970886, "loss_mode_switch": 0.0, "loss_total": 0.5240093469619751, "step": 87 }, { "batch_size": 1, "epoch": 0.0348, "step": 87, "tokens_per_device": 4887 }, { "epoch": 0.0348, "loss_ce": 0.31798577308654785, "loss_lvr": 0.8257886171340942, "loss_mode_switch": 0.0, "loss_total": 0.40056464076042175, "step": 87 }, { "batch_size": 4, "epoch": 0.0348, "step": 87, "tokens_per_device": 3900 }, { "epoch": 0.0348, "loss_ce": 0.8095340728759766, "loss_lvr": 1.6434881687164307, "loss_mode_switch": 0.0, "loss_total": 0.9738829135894775, "step": 87 }, { "batch_size": 1, "epoch": 0.0348, "step": 87, "tokens_per_device": 5123 }, { "epoch": 0.0348, "loss_ce": 0.1439502090215683, "loss_lvr": 1.184296727180481, "loss_mode_switch": 0.0, "loss_total": 0.26237988471984863, "step": 87 }, { "batch_size": 4, "epoch": 0.0348, "step": 87, "tokens_per_device": 4776 }, { "epoch": 0.0348, "loss_ce": 0.6241000294685364, "loss_lvr": 1.5131514072418213, "loss_mode_switch": 0.0, "loss_total": 0.7754151821136475, "step": 87 }, { "epoch": 0.0352, "grad_norm": 5.543851852416992, "learning_rate": 9.999290924128186e-06, "loss": 0.6864, "step": 88 }, { "batch_size": 4, "epoch": 0.0352, "step": 88, "tokens_per_device": 2728 }, { "epoch": 0.0352, "loss_ce": 0.6560866832733154, "loss_lvr": 1.400059461593628, "loss_mode_switch": 0.0, "loss_total": 0.7960926294326782, "step": 88 }, { "batch_size": 4, "epoch": 0.0352, "step": 88, "tokens_per_device": 4356 }, { "epoch": 0.0352, "loss_ce": 0.5047427415847778, "loss_lvr": 2.4823646545410156, "loss_mode_switch": 0.0, "loss_total": 0.7529792189598083, "step": 88 }, { "batch_size": 4, "epoch": 0.0352, "step": 88, "tokens_per_device": 4248 }, { "epoch": 0.0352, "loss_ce": 0.461342453956604, "loss_lvr": 1.518783450126648, "loss_mode_switch": 0.0, "loss_total": 0.6132208108901978, "step": 88 }, { "batch_size": 4, "epoch": 0.0352, "step": 88, "tokens_per_device": 1396 }, { "epoch": 0.0352, "loss_ce": 0.6131582856178284, "loss_lvr": 1.8118400573730469, "loss_mode_switch": 0.0, "loss_total": 0.7943422794342041, "step": 88 }, { "batch_size": 4, "epoch": 0.0352, "step": 88, "tokens_per_device": 2548 }, { "epoch": 0.0352, "loss_ce": 0.4445997476577759, "loss_lvr": 2.308558702468872, "loss_mode_switch": 0.0, "loss_total": 0.675455629825592, "step": 88 }, { "batch_size": 4, "epoch": 0.0352, "step": 88, "tokens_per_device": 3896 }, { "epoch": 0.0352, "loss_ce": 0.5563471913337708, "loss_lvr": 2.6146719455718994, "loss_mode_switch": 0.0, "loss_total": 0.8178143501281738, "step": 88 }, { "batch_size": 4, "epoch": 0.0352, "step": 88, "tokens_per_device": 9380 }, { "epoch": 0.0352, "loss_ce": 0.3896200656890869, "loss_lvr": 1.2780908346176147, "loss_mode_switch": 0.0, "loss_total": 0.5174291729927063, "step": 88 }, { "batch_size": 1, "epoch": 0.0352, "step": 88, "tokens_per_device": 4864 }, { "epoch": 0.0352, "loss_ce": 0.13625867664813995, "loss_lvr": 2.658766508102417, "loss_mode_switch": 0.0, "loss_total": 0.40213531255722046, "step": 88 }, { "epoch": 0.0356, "grad_norm": 4.742551326751709, "learning_rate": 9.999177642922736e-06, "loss": 0.6229, "step": 89 }, { "batch_size": 1, "epoch": 0.0356, "step": 89, "tokens_per_device": 5159 }, { "epoch": 0.0356, "loss_ce": 0.6918108463287354, "loss_lvr": 2.937284469604492, "loss_mode_switch": 0.0, "loss_total": 0.9855393171310425, "step": 89 }, { "batch_size": 1, "epoch": 0.0356, "step": 89, "tokens_per_device": 5244 }, { "epoch": 0.0356, "loss_ce": 0.5512765645980835, "loss_lvr": 2.164271831512451, "loss_mode_switch": 0.0, "loss_total": 0.7677037715911865, "step": 89 }, { "batch_size": 4, "epoch": 0.0356, "step": 89, "tokens_per_device": 5728 }, { "epoch": 0.0356, "loss_ce": 0.08105569332838058, "loss_lvr": 2.7466554641723633, "loss_mode_switch": 0.0, "loss_total": 0.35572126507759094, "step": 89 }, { "batch_size": 4, "epoch": 0.0356, "step": 89, "tokens_per_device": 1168 }, { "epoch": 0.0356, "loss_ce": 0.6020904183387756, "loss_lvr": 3.497494697570801, "loss_mode_switch": 0.0, "loss_total": 0.9518399238586426, "step": 89 }, { "batch_size": 1, "epoch": 0.0356, "step": 89, "tokens_per_device": 4891 }, { "epoch": 0.0356, "loss_ce": 0.36759650707244873, "loss_lvr": 0.9128772020339966, "loss_mode_switch": 0.0, "loss_total": 0.45888423919677734, "step": 89 }, { "batch_size": 4, "epoch": 0.0356, "step": 89, "tokens_per_device": 4264 }, { "epoch": 0.0356, "loss_ce": 0.2878592312335968, "loss_lvr": 2.9321703910827637, "loss_mode_switch": 0.0, "loss_total": 0.5810762643814087, "step": 89 }, { "batch_size": 1, "epoch": 0.0356, "step": 89, "tokens_per_device": 5114 }, { "epoch": 0.0356, "loss_ce": 0.07110336422920227, "loss_lvr": 1.539130449295044, "loss_mode_switch": 0.0, "loss_total": 0.22501641511917114, "step": 89 }, { "batch_size": 4, "epoch": 0.0356, "step": 89, "tokens_per_device": 3772 }, { "epoch": 0.0356, "loss_ce": 0.48003485798835754, "loss_lvr": 2.1013917922973633, "loss_mode_switch": 0.0, "loss_total": 0.6901740431785583, "step": 89 }, { "epoch": 0.036, "grad_norm": 5.121306896209717, "learning_rate": 9.999055971469864e-06, "loss": 0.6192, "step": 90 }, { "batch_size": 4, "epoch": 0.036, "step": 90, "tokens_per_device": 4408 }, { "epoch": 0.036, "loss_ce": 0.39068999886512756, "loss_lvr": 1.781471610069275, "loss_mode_switch": 0.0, "loss_total": 0.5688371658325195, "step": 90 }, { "batch_size": 4, "epoch": 0.036, "step": 90, "tokens_per_device": 4400 }, { "epoch": 0.036, "loss_ce": 0.39614981412887573, "loss_lvr": 1.5378570556640625, "loss_mode_switch": 0.0, "loss_total": 0.549935519695282, "step": 90 }, { "batch_size": 4, "epoch": 0.036, "step": 90, "tokens_per_device": 5796 }, { "epoch": 0.036, "loss_ce": 0.6898168921470642, "loss_lvr": 1.4849283695220947, "loss_mode_switch": 0.0, "loss_total": 0.8383097648620605, "step": 90 }, { "batch_size": 1, "epoch": 0.036, "step": 90, "tokens_per_device": 4971 }, { "epoch": 0.036, "loss_ce": 0.2001674324274063, "loss_lvr": 1.6861575841903687, "loss_mode_switch": 0.0, "loss_total": 0.368783175945282, "step": 90 }, { "batch_size": 1, "epoch": 0.036, "step": 90, "tokens_per_device": 4797 }, { "epoch": 0.036, "loss_ce": 0.13145291805267334, "loss_lvr": 2.6690285205841064, "loss_mode_switch": 0.0, "loss_total": 0.39835578203201294, "step": 90 }, { "batch_size": 4, "epoch": 0.036, "step": 90, "tokens_per_device": 4104 }, { "epoch": 0.036, "loss_ce": 0.41416820883750916, "loss_lvr": 2.01497220993042, "loss_mode_switch": 0.0, "loss_total": 0.6156654357910156, "step": 90 }, { "batch_size": 4, "epoch": 0.036, "step": 90, "tokens_per_device": 5052 }, { "epoch": 0.036, "loss_ce": 0.46724075078964233, "loss_lvr": 1.9628478288650513, "loss_mode_switch": 0.0, "loss_total": 0.6635255217552185, "step": 90 }, { "batch_size": 4, "epoch": 0.036, "step": 90, "tokens_per_device": 4316 }, { "epoch": 0.036, "loss_ce": 0.4948398172855377, "loss_lvr": 2.7936465740203857, "loss_mode_switch": 0.0, "loss_total": 0.7742044925689697, "step": 90 }, { "epoch": 0.0364, "grad_norm": 4.622314929962158, "learning_rate": 9.998925909973769e-06, "loss": 0.6297, "step": 91 }, { "batch_size": 4, "epoch": 0.0364, "step": 91, "tokens_per_device": 4608 }, { "epoch": 0.0364, "loss_ce": 0.4848199486732483, "loss_lvr": 1.4160135984420776, "loss_mode_switch": 0.0, "loss_total": 0.626421332359314, "step": 91 }, { "batch_size": 4, "epoch": 0.0364, "step": 91, "tokens_per_device": 2072 }, { "epoch": 0.0364, "loss_ce": 0.34599217772483826, "loss_lvr": 1.6025333404541016, "loss_mode_switch": 0.0, "loss_total": 0.506245493888855, "step": 91 }, { "batch_size": 4, "epoch": 0.0364, "step": 91, "tokens_per_device": 13136 }, { "epoch": 0.0364, "loss_ce": 0.1977187693119049, "loss_lvr": 1.4067397117614746, "loss_mode_switch": 0.0, "loss_total": 0.3383927345275879, "step": 91 }, { "batch_size": 1, "epoch": 0.0364, "step": 91, "tokens_per_device": 4903 }, { "epoch": 0.0364, "loss_ce": 0.1187080666422844, "loss_lvr": 1.0388749837875366, "loss_mode_switch": 0.0, "loss_total": 0.22259557247161865, "step": 91 }, { "batch_size": 4, "epoch": 0.0364, "step": 91, "tokens_per_device": 4660 }, { "epoch": 0.0364, "loss_ce": 0.3489121198654175, "loss_lvr": 1.718220591545105, "loss_mode_switch": 0.0, "loss_total": 0.5207341909408569, "step": 91 }, { "batch_size": 4, "epoch": 0.0364, "step": 91, "tokens_per_device": 4296 }, { "epoch": 0.0364, "loss_ce": 0.24394114315509796, "loss_lvr": 2.8087809085845947, "loss_mode_switch": 0.0, "loss_total": 0.5248192548751831, "step": 91 }, { "batch_size": 4, "epoch": 0.0364, "step": 91, "tokens_per_device": 3896 }, { "epoch": 0.0364, "loss_ce": 0.42097821831703186, "loss_lvr": 1.9779998064041138, "loss_mode_switch": 0.0, "loss_total": 0.6187782287597656, "step": 91 }, { "batch_size": 1, "epoch": 0.0364, "step": 91, "tokens_per_device": 4877 }, { "epoch": 0.0364, "loss_ce": 0.12256131321191788, "loss_lvr": 1.5048675537109375, "loss_mode_switch": 0.0, "loss_total": 0.27304807305336, "step": 91 }, { "epoch": 0.0368, "grad_norm": 4.694216251373291, "learning_rate": 9.99878745865274e-06, "loss": 0.6107, "step": 92 }, { "batch_size": 4, "epoch": 0.0368, "step": 92, "tokens_per_device": 4240 }, { "epoch": 0.0368, "loss_ce": 0.23839320242404938, "loss_lvr": 2.087740182876587, "loss_mode_switch": 0.0, "loss_total": 0.44716721773147583, "step": 92 }, { "batch_size": 1, "epoch": 0.0368, "step": 92, "tokens_per_device": 5096 }, { "epoch": 0.0368, "loss_ce": 0.2672909200191498, "loss_lvr": 1.3253254890441895, "loss_mode_switch": 0.0, "loss_total": 0.39982348680496216, "step": 92 }, { "batch_size": 4, "epoch": 0.0368, "step": 92, "tokens_per_device": 4828 }, { "epoch": 0.0368, "loss_ce": 0.3465619683265686, "loss_lvr": 1.3784232139587402, "loss_mode_switch": 0.0, "loss_total": 0.4844042956829071, "step": 92 }, { "batch_size": 1, "epoch": 0.0368, "step": 92, "tokens_per_device": 7398 }, { "epoch": 0.0368, "loss_ce": 0.10060806572437286, "loss_lvr": 1.208229422569275, "loss_mode_switch": 0.0, "loss_total": 0.22143101692199707, "step": 92 }, { "batch_size": 4, "epoch": 0.0368, "step": 92, "tokens_per_device": 2132 }, { "epoch": 0.0368, "loss_ce": 0.21435388922691345, "loss_lvr": 1.3429640531539917, "loss_mode_switch": 0.0, "loss_total": 0.3486502766609192, "step": 92 }, { "batch_size": 4, "epoch": 0.0368, "step": 92, "tokens_per_device": 8988 }, { "epoch": 0.0368, "loss_ce": 0.21975873410701752, "loss_lvr": 1.555350422859192, "loss_mode_switch": 0.0, "loss_total": 0.3752937912940979, "step": 92 }, { "batch_size": 4, "epoch": 0.0368, "step": 92, "tokens_per_device": 4352 }, { "epoch": 0.0368, "loss_ce": 0.7529687881469727, "loss_lvr": 2.3045756816864014, "loss_mode_switch": 0.0, "loss_total": 0.9834263324737549, "step": 92 }, { "batch_size": 4, "epoch": 0.0368, "step": 92, "tokens_per_device": 1356 }, { "epoch": 0.0368, "loss_ce": 0.413298636674881, "loss_lvr": 1.8758515119552612, "loss_mode_switch": 0.0, "loss_total": 0.6008837819099426, "step": 92 }, { "epoch": 0.0372, "grad_norm": 4.464240074157715, "learning_rate": 9.99864061773914e-06, "loss": 0.5607, "step": 93 }, { "batch_size": 1, "epoch": 0.0372, "step": 93, "tokens_per_device": 4779 }, { "epoch": 0.0372, "loss_ce": 0.33807241916656494, "loss_lvr": 1.4444818496704102, "loss_mode_switch": 0.0, "loss_total": 0.48252061009407043, "step": 93 }, { "batch_size": 4, "epoch": 0.0372, "step": 93, "tokens_per_device": 4424 }, { "epoch": 0.0372, "loss_ce": 0.8694052696228027, "loss_lvr": 2.7190473079681396, "loss_mode_switch": 0.0, "loss_total": 1.1413099765777588, "step": 93 }, { "batch_size": 1, "epoch": 0.0372, "step": 93, "tokens_per_device": 4904 }, { "epoch": 0.0372, "loss_ce": 0.025318793952465057, "loss_lvr": 2.055051565170288, "loss_mode_switch": 0.0, "loss_total": 0.23082396388053894, "step": 93 }, { "batch_size": 4, "epoch": 0.0372, "step": 93, "tokens_per_device": 2592 }, { "epoch": 0.0372, "loss_ce": 0.41503995656967163, "loss_lvr": 2.089578866958618, "loss_mode_switch": 0.0, "loss_total": 0.6239978671073914, "step": 93 }, { "batch_size": 4, "epoch": 0.0372, "step": 93, "tokens_per_device": 1500 }, { "epoch": 0.0372, "loss_ce": 0.47076839208602905, "loss_lvr": 2.750232696533203, "loss_mode_switch": 0.0, "loss_total": 0.7457916736602783, "step": 93 }, { "batch_size": 1, "epoch": 0.0372, "step": 93, "tokens_per_device": 4887 }, { "epoch": 0.0372, "loss_ce": 0.027519775554537773, "loss_lvr": 2.3958349227905273, "loss_mode_switch": 0.0, "loss_total": 0.26710325479507446, "step": 93 }, { "batch_size": 4, "epoch": 0.0372, "step": 93, "tokens_per_device": 10772 }, { "epoch": 0.0372, "loss_ce": 0.12819981575012207, "loss_lvr": 1.5879864692687988, "loss_mode_switch": 0.0, "loss_total": 0.286998450756073, "step": 93 }, { "batch_size": 1, "epoch": 0.0372, "step": 93, "tokens_per_device": 4884 }, { "epoch": 0.0372, "loss_ce": 0.04928487911820412, "loss_lvr": 1.0562971830368042, "loss_mode_switch": 0.0, "loss_total": 0.15491460263729095, "step": 93 }, { "epoch": 0.0376, "grad_norm": 4.249560832977295, "learning_rate": 9.998485387479418e-06, "loss": 0.5254, "step": 94 }, { "batch_size": 4, "epoch": 0.0376, "step": 94, "tokens_per_device": 4224 }, { "epoch": 0.0376, "loss_ce": 0.21894703805446625, "loss_lvr": 2.4182865619659424, "loss_mode_switch": 0.0, "loss_total": 0.4607757031917572, "step": 94 }, { "batch_size": 4, "epoch": 0.0376, "step": 94, "tokens_per_device": 5156 }, { "epoch": 0.0376, "loss_ce": 0.08547698706388474, "loss_lvr": 1.6199021339416504, "loss_mode_switch": 0.0, "loss_total": 0.24746719002723694, "step": 94 }, { "batch_size": 4, "epoch": 0.0376, "step": 94, "tokens_per_device": 10580 }, { "epoch": 0.0376, "loss_ce": 0.27183520793914795, "loss_lvr": 1.5679301023483276, "loss_mode_switch": 0.0, "loss_total": 0.42862820625305176, "step": 94 }, { "batch_size": 4, "epoch": 0.0376, "step": 94, "tokens_per_device": 9040 }, { "epoch": 0.0376, "loss_ce": 0.27866601943969727, "loss_lvr": 1.9703689813613892, "loss_mode_switch": 0.0, "loss_total": 0.4757029414176941, "step": 94 }, { "batch_size": 4, "epoch": 0.0376, "step": 94, "tokens_per_device": 2088 }, { "epoch": 0.0376, "loss_ce": 0.3459267318248749, "loss_lvr": 2.421949863433838, "loss_mode_switch": 0.0, "loss_total": 0.5881217122077942, "step": 94 }, { "batch_size": 1, "epoch": 0.0376, "step": 94, "tokens_per_device": 4845 }, { "epoch": 0.0376, "loss_ce": 0.0511719286441803, "loss_lvr": 0.8009278774261475, "loss_mode_switch": 0.0, "loss_total": 0.13126471638679504, "step": 94 }, { "batch_size": 4, "epoch": 0.0376, "step": 94, "tokens_per_device": 1420 }, { "epoch": 0.0376, "loss_ce": 0.6401651501655579, "loss_lvr": 2.2630438804626465, "loss_mode_switch": 0.0, "loss_total": 0.8664695620536804, "step": 94 }, { "batch_size": 4, "epoch": 0.0376, "step": 94, "tokens_per_device": 4744 }, { "epoch": 0.0376, "loss_ce": 0.19188731908798218, "loss_lvr": 1.870476484298706, "loss_mode_switch": 0.0, "loss_total": 0.37893497943878174, "step": 94 }, { "epoch": 0.038, "grad_norm": 4.01188850402832, "learning_rate": 9.998321768134101e-06, "loss": 0.5308, "step": 95 }, { "batch_size": 1, "epoch": 0.038, "step": 95, "tokens_per_device": 4875 }, { "epoch": 0.038, "loss_ce": 0.3562162518501282, "loss_lvr": 0.4262171685695648, "loss_mode_switch": 0.0, "loss_total": 0.39883798360824585, "step": 95 }, { "batch_size": 4, "epoch": 0.038, "step": 95, "tokens_per_device": 4256 }, { "epoch": 0.038, "loss_ce": 0.2946993410587311, "loss_lvr": 1.0379362106323242, "loss_mode_switch": 0.0, "loss_total": 0.3984929621219635, "step": 95 }, { "batch_size": 4, "epoch": 0.038, "step": 95, "tokens_per_device": 4372 }, { "epoch": 0.038, "loss_ce": 0.717513918876648, "loss_lvr": 1.4191946983337402, "loss_mode_switch": 0.0, "loss_total": 0.8594334125518799, "step": 95 }, { "batch_size": 4, "epoch": 0.038, "step": 95, "tokens_per_device": 1336 }, { "epoch": 0.038, "loss_ce": 0.8754918575286865, "loss_lvr": 1.7127747535705566, "loss_mode_switch": 0.0, "loss_total": 1.046769380569458, "step": 95 }, { "batch_size": 1, "epoch": 0.038, "step": 95, "tokens_per_device": 5215 }, { "epoch": 0.038, "loss_ce": 0.541347324848175, "loss_lvr": 0.8730495572090149, "loss_mode_switch": 0.0, "loss_total": 0.6286522746086121, "step": 95 }, { "batch_size": 4, "epoch": 0.038, "step": 95, "tokens_per_device": 10684 }, { "epoch": 0.038, "loss_ce": 0.32640963792800903, "loss_lvr": 1.2851996421813965, "loss_mode_switch": 0.0, "loss_total": 0.4549295902252197, "step": 95 }, { "batch_size": 4, "epoch": 0.038, "step": 95, "tokens_per_device": 3608 }, { "epoch": 0.038, "loss_ce": 1.08487868309021, "loss_lvr": 1.632753849029541, "loss_mode_switch": 0.0, "loss_total": 1.2481540441513062, "step": 95 }, { "batch_size": 1, "epoch": 0.038, "step": 95, "tokens_per_device": 4893 }, { "epoch": 0.038, "loss_ce": 0.3489987850189209, "loss_lvr": 0.40905627608299255, "loss_mode_switch": 0.0, "loss_total": 0.3899044096469879, "step": 95 }, { "epoch": 0.0384, "grad_norm": 5.557373523712158, "learning_rate": 9.998149759977795e-06, "loss": 0.5862, "step": 96 }, { "batch_size": 1, "epoch": 0.0384, "step": 96, "tokens_per_device": 4888 }, { "epoch": 0.0384, "loss_ce": 0.35132789611816406, "loss_lvr": 0.7404043078422546, "loss_mode_switch": 0.0, "loss_total": 0.4253683388233185, "step": 96 }, { "batch_size": 1, "epoch": 0.0384, "step": 96, "tokens_per_device": 4859 }, { "epoch": 0.0384, "loss_ce": 0.3557022213935852, "loss_lvr": 0.49583107233047485, "loss_mode_switch": 0.0, "loss_total": 0.4052853286266327, "step": 96 }, { "batch_size": 4, "epoch": 0.0384, "step": 96, "tokens_per_device": 3032 }, { "epoch": 0.0384, "loss_ce": 0.47171980142593384, "loss_lvr": 1.6083312034606934, "loss_mode_switch": 0.0, "loss_total": 0.6325529217720032, "step": 96 }, { "batch_size": 1, "epoch": 0.0384, "step": 96, "tokens_per_device": 4961 }, { "epoch": 0.0384, "loss_ce": 0.06896407902240753, "loss_lvr": 0.8287085890769958, "loss_mode_switch": 0.0, "loss_total": 0.15183493494987488, "step": 96 }, { "batch_size": 4, "epoch": 0.0384, "step": 96, "tokens_per_device": 6152 }, { "epoch": 0.0384, "loss_ce": 0.344868540763855, "loss_lvr": 1.502948522567749, "loss_mode_switch": 0.0, "loss_total": 0.4951633810997009, "step": 96 }, { "batch_size": 1, "epoch": 0.0384, "step": 96, "tokens_per_device": 5197 }, { "epoch": 0.0384, "loss_ce": 0.18244045972824097, "loss_lvr": 0.8527345657348633, "loss_mode_switch": 0.0, "loss_total": 0.26771390438079834, "step": 96 }, { "batch_size": 4, "epoch": 0.0384, "step": 96, "tokens_per_device": 1292 }, { "epoch": 0.0384, "loss_ce": 0.5271947979927063, "loss_lvr": 1.8298513889312744, "loss_mode_switch": 0.0, "loss_total": 0.7101799249649048, "step": 96 }, { "batch_size": 1, "epoch": 0.0384, "step": 96, "tokens_per_device": 5001 }, { "epoch": 0.0384, "loss_ce": 1.3151721954345703, "loss_lvr": 1.1411616802215576, "loss_mode_switch": 0.0, "loss_total": 1.429288387298584, "step": 96 }, { "epoch": 0.0388, "grad_norm": 4.851128101348877, "learning_rate": 9.997969363299187e-06, "loss": 0.5355, "step": 97 }, { "batch_size": 4, "epoch": 0.0388, "step": 97, "tokens_per_device": 14412 }, { "epoch": 0.0388, "loss_ce": 0.11164004355669022, "loss_lvr": 4.229428291320801, "loss_mode_switch": 0.0, "loss_total": 0.5345829129219055, "step": 97 }, { "batch_size": 4, "epoch": 0.0388, "step": 97, "tokens_per_device": 3764 }, { "epoch": 0.0388, "loss_ce": 0.7481555938720703, "loss_lvr": 2.6016666889190674, "loss_mode_switch": 0.0, "loss_total": 1.0083222389221191, "step": 97 }, { "batch_size": 1, "epoch": 0.0388, "step": 97, "tokens_per_device": 4872 }, { "epoch": 0.0388, "loss_ce": 0.025043781846761703, "loss_lvr": 2.7170019149780273, "loss_mode_switch": 0.0, "loss_total": 0.2967439889907837, "step": 97 }, { "batch_size": 4, "epoch": 0.0388, "step": 97, "tokens_per_device": 5440 }, { "epoch": 0.0388, "loss_ce": 0.5803824663162231, "loss_lvr": 1.7253104448318481, "loss_mode_switch": 0.0, "loss_total": 0.7529135346412659, "step": 97 }, { "batch_size": 4, "epoch": 0.0388, "step": 97, "tokens_per_device": 2636 }, { "epoch": 0.0388, "loss_ce": 0.082912877202034, "loss_lvr": 2.082977533340454, "loss_mode_switch": 0.0, "loss_total": 0.2912106513977051, "step": 97 }, { "batch_size": 4, "epoch": 0.0388, "step": 97, "tokens_per_device": 3820 }, { "epoch": 0.0388, "loss_ce": 0.3947584629058838, "loss_lvr": 2.5159170627593994, "loss_mode_switch": 0.0, "loss_total": 0.6463501453399658, "step": 97 }, { "batch_size": 4, "epoch": 0.0388, "step": 97, "tokens_per_device": 3784 }, { "epoch": 0.0388, "loss_ce": 1.1341623067855835, "loss_lvr": 1.660814642906189, "loss_mode_switch": 0.0, "loss_total": 1.3002437353134155, "step": 97 }, { "batch_size": 4, "epoch": 0.0388, "step": 97, "tokens_per_device": 4328 }, { "epoch": 0.0388, "loss_ce": 0.27667102217674255, "loss_lvr": 2.148927688598633, "loss_mode_switch": 0.0, "loss_total": 0.4915637969970703, "step": 97 }, { "epoch": 0.0392, "grad_norm": 4.078587055206299, "learning_rate": 9.99778057840104e-06, "loss": 0.5035, "step": 98 }, { "batch_size": 4, "epoch": 0.0392, "step": 98, "tokens_per_device": 3820 }, { "epoch": 0.0392, "loss_ce": 0.03942769020795822, "loss_lvr": 1.5889360904693604, "loss_mode_switch": 0.0, "loss_total": 0.19832131266593933, "step": 98 }, { "batch_size": 1, "epoch": 0.0392, "step": 98, "tokens_per_device": 4956 }, { "epoch": 0.0392, "loss_ce": 0.02411864511668682, "loss_lvr": 2.6311228275299072, "loss_mode_switch": 0.0, "loss_total": 0.287230908870697, "step": 98 }, { "batch_size": 4, "epoch": 0.0392, "step": 98, "tokens_per_device": 6360 }, { "epoch": 0.0392, "loss_ce": 0.823469877243042, "loss_lvr": 2.0844714641571045, "loss_mode_switch": 0.0, "loss_total": 1.0319169759750366, "step": 98 }, { "batch_size": 4, "epoch": 0.0392, "step": 98, "tokens_per_device": 5764 }, { "epoch": 0.0392, "loss_ce": 0.34588363766670227, "loss_lvr": 1.7383860349655151, "loss_mode_switch": 0.0, "loss_total": 0.5197222232818604, "step": 98 }, { "batch_size": 4, "epoch": 0.0392, "step": 98, "tokens_per_device": 3724 }, { "epoch": 0.0392, "loss_ce": 0.3556024134159088, "loss_lvr": 3.12190580368042, "loss_mode_switch": 0.0, "loss_total": 0.6677930355072021, "step": 98 }, { "batch_size": 4, "epoch": 0.0392, "step": 98, "tokens_per_device": 4628 }, { "epoch": 0.0392, "loss_ce": 0.23427338898181915, "loss_lvr": 2.120354413986206, "loss_mode_switch": 0.0, "loss_total": 0.44630885124206543, "step": 98 }, { "batch_size": 4, "epoch": 0.0392, "step": 98, "tokens_per_device": 4544 }, { "epoch": 0.0392, "loss_ce": 0.6853450536727905, "loss_lvr": 2.7183985710144043, "loss_mode_switch": 0.0, "loss_total": 0.957184910774231, "step": 98 }, { "batch_size": 1, "epoch": 0.0392, "step": 98, "tokens_per_device": 4867 }, { "epoch": 0.0392, "loss_ce": 0.07864557206630707, "loss_lvr": 1.1465470790863037, "loss_mode_switch": 0.0, "loss_total": 0.1933002769947052, "step": 98 }, { "epoch": 0.0396, "grad_norm": 3.7821993827819824, "learning_rate": 9.997583405600194e-06, "loss": 0.538, "step": 99 }, { "batch_size": 4, "epoch": 0.0396, "step": 99, "tokens_per_device": 1368 }, { "epoch": 0.0396, "loss_ce": 0.4556497037410736, "loss_lvr": 1.6129746437072754, "loss_mode_switch": 0.0, "loss_total": 0.6169471740722656, "step": 99 }, { "batch_size": 4, "epoch": 0.0396, "step": 99, "tokens_per_device": 2680 }, { "epoch": 0.0396, "loss_ce": 0.23096902668476105, "loss_lvr": 1.6871992349624634, "loss_mode_switch": 0.0, "loss_total": 0.3996889591217041, "step": 99 }, { "batch_size": 4, "epoch": 0.0396, "step": 99, "tokens_per_device": 2056 }, { "epoch": 0.0396, "loss_ce": 0.6120786070823669, "loss_lvr": 1.7114437818527222, "loss_mode_switch": 0.0, "loss_total": 0.7832229733467102, "step": 99 }, { "batch_size": 1, "epoch": 0.0396, "step": 99, "tokens_per_device": 4874 }, { "epoch": 0.0396, "loss_ce": 0.2793131470680237, "loss_lvr": 0.8283008337020874, "loss_mode_switch": 0.0, "loss_total": 0.36214321851730347, "step": 99 }, { "batch_size": 1, "epoch": 0.0396, "step": 99, "tokens_per_device": 5151 }, { "epoch": 0.0396, "loss_ce": 0.055601660162210464, "loss_lvr": 1.1731517314910889, "loss_mode_switch": 0.0, "loss_total": 0.17291682958602905, "step": 99 }, { "batch_size": 1, "epoch": 0.0396, "step": 99, "tokens_per_device": 5196 }, { "epoch": 0.0396, "loss_ce": 0.8256678581237793, "loss_lvr": 1.0338282585144043, "loss_mode_switch": 0.0, "loss_total": 0.9290506839752197, "step": 99 }, { "batch_size": 4, "epoch": 0.0396, "step": 99, "tokens_per_device": 1392 }, { "epoch": 0.0396, "loss_ce": 0.44789090752601624, "loss_lvr": 1.9480476379394531, "loss_mode_switch": 0.0, "loss_total": 0.6426956653594971, "step": 99 }, { "batch_size": 4, "epoch": 0.0396, "step": 99, "tokens_per_device": 4188 }, { "epoch": 0.0396, "loss_ce": 0.5987087488174438, "loss_lvr": 1.338510513305664, "loss_mode_switch": 0.0, "loss_total": 0.7325598001480103, "step": 99 }, { "epoch": 0.04, "grad_norm": 3.917539119720459, "learning_rate": 9.997377845227577e-06, "loss": 0.4776, "step": 100 }, { "batch_size": 4, "epoch": 0.04, "step": 100, "tokens_per_device": 2596 }, { "epoch": 0.04, "loss_ce": 0.3529979884624481, "loss_lvr": 1.7738924026489258, "loss_mode_switch": 0.0, "loss_total": 0.5303872227668762, "step": 100 }, { "batch_size": 4, "epoch": 0.04, "step": 100, "tokens_per_device": 3780 }, { "epoch": 0.04, "loss_ce": 0.39144468307495117, "loss_lvr": 1.4180071353912354, "loss_mode_switch": 0.0, "loss_total": 0.5332453846931458, "step": 100 }, { "batch_size": 1, "epoch": 0.04, "step": 100, "tokens_per_device": 5115 }, { "epoch": 0.04, "loss_ce": 0.42844152450561523, "loss_lvr": 0.7851086258888245, "loss_mode_switch": 0.0, "loss_total": 0.5069524049758911, "step": 100 }, { "batch_size": 4, "epoch": 0.04, "step": 100, "tokens_per_device": 3808 }, { "epoch": 0.04, "loss_ce": 0.26858222484588623, "loss_lvr": 1.2546520233154297, "loss_mode_switch": 0.0, "loss_total": 0.39404743909835815, "step": 100 }, { "batch_size": 1, "epoch": 0.04, "step": 100, "tokens_per_device": 4890 }, { "epoch": 0.04, "loss_ce": 0.07762859016656876, "loss_lvr": 1.6347336769104004, "loss_mode_switch": 0.0, "loss_total": 0.2411019504070282, "step": 100 }, { "batch_size": 4, "epoch": 0.04, "step": 100, "tokens_per_device": 4480 }, { "epoch": 0.04, "loss_ce": 0.3454809784889221, "loss_lvr": 1.0665751695632935, "loss_mode_switch": 0.0, "loss_total": 0.4521384835243225, "step": 100 }, { "batch_size": 1, "epoch": 0.04, "step": 100, "tokens_per_device": 4953 }, { "epoch": 0.04, "loss_ce": 0.05466777831315994, "loss_lvr": 2.006805419921875, "loss_mode_switch": 0.0, "loss_total": 0.2553483247756958, "step": 100 }, { "batch_size": 1, "epoch": 0.04, "step": 100, "tokens_per_device": 5100 }, { "epoch": 0.04, "loss_ce": 0.3740198612213135, "loss_lvr": 0.531382143497467, "loss_mode_switch": 0.0, "loss_total": 0.42715808749198914, "step": 100 }, { "epoch": 0.0404, "grad_norm": 4.194811820983887, "learning_rate": 9.997163897628175e-06, "loss": 0.5091, "step": 101 }, { "batch_size": 4, "epoch": 0.0404, "step": 101, "tokens_per_device": 4584 }, { "epoch": 0.0404, "loss_ce": 0.49895936250686646, "loss_lvr": 1.4041436910629272, "loss_mode_switch": 0.0, "loss_total": 0.6393737196922302, "step": 101 }, { "batch_size": 4, "epoch": 0.0404, "step": 101, "tokens_per_device": 5320 }, { "epoch": 0.0404, "loss_ce": 0.5092613101005554, "loss_lvr": 1.5459924936294556, "loss_mode_switch": 0.0, "loss_total": 0.663860559463501, "step": 101 }, { "batch_size": 1, "epoch": 0.0404, "step": 101, "tokens_per_device": 5113 }, { "epoch": 0.0404, "loss_ce": 0.13699476420879364, "loss_lvr": 0.4888308048248291, "loss_mode_switch": 0.0, "loss_total": 0.18587784469127655, "step": 101 }, { "batch_size": 4, "epoch": 0.0404, "step": 101, "tokens_per_device": 1292 }, { "epoch": 0.0404, "loss_ce": 0.45660892128944397, "loss_lvr": 2.7178151607513428, "loss_mode_switch": 0.0, "loss_total": 0.7283904552459717, "step": 101 }, { "batch_size": 4, "epoch": 0.0404, "step": 101, "tokens_per_device": 4444 }, { "epoch": 0.0404, "loss_ce": 0.3598967492580414, "loss_lvr": 2.257791519165039, "loss_mode_switch": 0.0, "loss_total": 0.5856758952140808, "step": 101 }, { "batch_size": 4, "epoch": 0.0404, "step": 101, "tokens_per_device": 2700 }, { "epoch": 0.0404, "loss_ce": 0.1597040444612503, "loss_lvr": 1.7104846239089966, "loss_mode_switch": 0.0, "loss_total": 0.33075249195098877, "step": 101 }, { "batch_size": 1, "epoch": 0.0404, "step": 101, "tokens_per_device": 4892 }, { "epoch": 0.0404, "loss_ce": 0.015676656737923622, "loss_lvr": 1.4482789039611816, "loss_mode_switch": 0.0, "loss_total": 0.160504549741745, "step": 101 }, { "batch_size": 1, "epoch": 0.0404, "step": 101, "tokens_per_device": 7156 }, { "epoch": 0.0404, "loss_ce": 0.05314094200730324, "loss_lvr": 0.769808292388916, "loss_mode_switch": 0.0, "loss_total": 0.13012176752090454, "step": 101 }, { "epoch": 0.0408, "grad_norm": 2.820042610168457, "learning_rate": 9.996941563161071e-06, "loss": 0.5025, "step": 102 }, { "batch_size": 1, "epoch": 0.0408, "step": 102, "tokens_per_device": 4950 }, { "epoch": 0.0408, "loss_ce": 0.15689051151275635, "loss_lvr": 1.342178225517273, "loss_mode_switch": 0.0, "loss_total": 0.2911083400249481, "step": 102 }, { "batch_size": 1, "epoch": 0.0408, "step": 102, "tokens_per_device": 4859 }, { "epoch": 0.0408, "loss_ce": 0.12120674550533295, "loss_lvr": 0.7788323163986206, "loss_mode_switch": 0.0, "loss_total": 0.19908997416496277, "step": 102 }, { "batch_size": 4, "epoch": 0.0408, "step": 102, "tokens_per_device": 1764 }, { "epoch": 0.0408, "loss_ce": 0.1748773753643036, "loss_lvr": 2.4852731227874756, "loss_mode_switch": 0.0, "loss_total": 0.4234046936035156, "step": 102 }, { "batch_size": 1, "epoch": 0.0408, "step": 102, "tokens_per_device": 5176 }, { "epoch": 0.0408, "loss_ce": 0.05310903862118721, "loss_lvr": 0.6557397842407227, "loss_mode_switch": 0.0, "loss_total": 0.11868302524089813, "step": 102 }, { "batch_size": 4, "epoch": 0.0408, "step": 102, "tokens_per_device": 8272 }, { "epoch": 0.0408, "loss_ce": 0.5429234504699707, "loss_lvr": 2.116586685180664, "loss_mode_switch": 0.0, "loss_total": 0.7545821070671082, "step": 102 }, { "batch_size": 4, "epoch": 0.0408, "step": 102, "tokens_per_device": 6268 }, { "epoch": 0.0408, "loss_ce": 0.383159875869751, "loss_lvr": 1.502702236175537, "loss_mode_switch": 0.0, "loss_total": 0.5334300994873047, "step": 102 }, { "batch_size": 4, "epoch": 0.0408, "step": 102, "tokens_per_device": 3788 }, { "epoch": 0.0408, "loss_ce": 0.47592222690582275, "loss_lvr": 1.918426275253296, "loss_mode_switch": 0.0, "loss_total": 0.6677648425102234, "step": 102 }, { "batch_size": 4, "epoch": 0.0408, "step": 102, "tokens_per_device": 4296 }, { "epoch": 0.0408, "loss_ce": 0.4692455232143402, "loss_lvr": 2.125506639480591, "loss_mode_switch": 0.0, "loss_total": 0.6817961931228638, "step": 102 }, { "epoch": 0.0412, "grad_norm": 2.812095880508423, "learning_rate": 9.996710842199412e-06, "loss": 0.467, "step": 103 }, { "batch_size": 4, "epoch": 0.0412, "step": 103, "tokens_per_device": 4384 }, { "epoch": 0.0412, "loss_ce": 0.09620939195156097, "loss_lvr": 2.4374022483825684, "loss_mode_switch": 0.0, "loss_total": 0.3399496078491211, "step": 103 }, { "batch_size": 4, "epoch": 0.0412, "step": 103, "tokens_per_device": 4568 }, { "epoch": 0.0412, "loss_ce": 0.7417523860931396, "loss_lvr": 1.7944658994674683, "loss_mode_switch": 0.0, "loss_total": 0.9211989641189575, "step": 103 }, { "batch_size": 4, "epoch": 0.0412, "step": 103, "tokens_per_device": 4200 }, { "epoch": 0.0412, "loss_ce": 0.08339358121156693, "loss_lvr": 1.440179467201233, "loss_mode_switch": 0.0, "loss_total": 0.22741153836250305, "step": 103 }, { "batch_size": 4, "epoch": 0.0412, "step": 103, "tokens_per_device": 2652 }, { "epoch": 0.0412, "loss_ce": 0.19864912331104279, "loss_lvr": 2.9656167030334473, "loss_mode_switch": 0.0, "loss_total": 0.49521082639694214, "step": 103 }, { "batch_size": 4, "epoch": 0.0412, "step": 103, "tokens_per_device": 3800 }, { "epoch": 0.0412, "loss_ce": 0.2959972620010376, "loss_lvr": 1.7215975522994995, "loss_mode_switch": 0.0, "loss_total": 0.468157023191452, "step": 103 }, { "batch_size": 4, "epoch": 0.0412, "step": 103, "tokens_per_device": 3760 }, { "epoch": 0.0412, "loss_ce": 0.4161427617073059, "loss_lvr": 1.6249446868896484, "loss_mode_switch": 0.0, "loss_total": 0.5786372423171997, "step": 103 }, { "batch_size": 1, "epoch": 0.0412, "step": 103, "tokens_per_device": 4868 }, { "epoch": 0.0412, "loss_ce": 0.2467196136713028, "loss_lvr": 0.7017802000045776, "loss_mode_switch": 0.0, "loss_total": 0.3168976306915283, "step": 103 }, { "batch_size": 4, "epoch": 0.0412, "step": 103, "tokens_per_device": 4900 }, { "epoch": 0.0412, "loss_ce": 0.6428055167198181, "loss_lvr": 1.886091947555542, "loss_mode_switch": 0.0, "loss_total": 0.8314146995544434, "step": 103 }, { "epoch": 0.0416, "grad_norm": 3.106889486312866, "learning_rate": 9.996471735130422e-06, "loss": 0.4731, "step": 104 }, { "batch_size": 4, "epoch": 0.0416, "step": 104, "tokens_per_device": 4228 }, { "epoch": 0.0416, "loss_ce": 0.17644260823726654, "loss_lvr": 1.243455171585083, "loss_mode_switch": 0.0, "loss_total": 0.30078813433647156, "step": 104 }, { "batch_size": 1, "epoch": 0.0416, "step": 104, "tokens_per_device": 4864 }, { "epoch": 0.0416, "loss_ce": 0.17049461603164673, "loss_lvr": 1.448588490486145, "loss_mode_switch": 0.0, "loss_total": 0.3153534531593323, "step": 104 }, { "batch_size": 1, "epoch": 0.0416, "step": 104, "tokens_per_device": 5187 }, { "epoch": 0.0416, "loss_ce": 0.019075972959399223, "loss_lvr": 2.0202248096466064, "loss_mode_switch": 0.0, "loss_total": 0.22109845280647278, "step": 104 }, { "batch_size": 1, "epoch": 0.0416, "step": 104, "tokens_per_device": 5043 }, { "epoch": 0.0416, "loss_ce": 0.09950102120637894, "loss_lvr": 1.4079036712646484, "loss_mode_switch": 0.0, "loss_total": 0.24029138684272766, "step": 104 }, { "batch_size": 1, "epoch": 0.0416, "step": 104, "tokens_per_device": 5552 }, { "epoch": 0.0416, "loss_ce": 0.01691422611474991, "loss_lvr": 0.8391308784484863, "loss_mode_switch": 0.0, "loss_total": 0.10082731395959854, "step": 104 }, { "batch_size": 1, "epoch": 0.0416, "step": 104, "tokens_per_device": 4753 }, { "epoch": 0.0416, "loss_ce": 0.17262032628059387, "loss_lvr": 0.44558438658714294, "loss_mode_switch": 0.0, "loss_total": 0.21717876195907593, "step": 104 }, { "batch_size": 4, "epoch": 0.0416, "step": 104, "tokens_per_device": 8648 }, { "epoch": 0.0416, "loss_ce": 0.18562225997447968, "loss_lvr": 1.6762361526489258, "loss_mode_switch": 0.0, "loss_total": 0.35324588418006897, "step": 104 }, { "batch_size": 1, "epoch": 0.0416, "step": 104, "tokens_per_device": 5120 }, { "epoch": 0.0416, "loss_ce": 0.05604476109147072, "loss_lvr": 0.45047253370285034, "loss_mode_switch": 0.0, "loss_total": 0.10109201073646545, "step": 104 }, { "epoch": 0.042, "grad_norm": 3.520487070083618, "learning_rate": 9.9962242423554e-06, "loss": 0.4407, "step": 105 }, { "batch_size": 4, "epoch": 0.042, "step": 105, "tokens_per_device": 2660 }, { "epoch": 0.042, "loss_ce": 0.3827957808971405, "loss_lvr": 1.680092215538025, "loss_mode_switch": 0.0, "loss_total": 0.5508049726486206, "step": 105 }, { "batch_size": 4, "epoch": 0.042, "step": 105, "tokens_per_device": 2548 }, { "epoch": 0.042, "loss_ce": 0.4156423807144165, "loss_lvr": 1.5967351198196411, "loss_mode_switch": 0.0, "loss_total": 0.5753158926963806, "step": 105 }, { "batch_size": 4, "epoch": 0.042, "step": 105, "tokens_per_device": 4212 }, { "epoch": 0.042, "loss_ce": 0.07193020731210709, "loss_lvr": 1.5822309255599976, "loss_mode_switch": 0.0, "loss_total": 0.23015329241752625, "step": 105 }, { "batch_size": 1, "epoch": 0.042, "step": 105, "tokens_per_device": 5180 }, { "epoch": 0.042, "loss_ce": 0.4069865942001343, "loss_lvr": 1.8903429508209229, "loss_mode_switch": 0.0, "loss_total": 0.5960208773612976, "step": 105 }, { "batch_size": 4, "epoch": 0.042, "step": 105, "tokens_per_device": 4464 }, { "epoch": 0.042, "loss_ce": 0.12875983119010925, "loss_lvr": 1.1658190488815308, "loss_mode_switch": 0.0, "loss_total": 0.24534174799919128, "step": 105 }, { "batch_size": 4, "epoch": 0.042, "step": 105, "tokens_per_device": 5844 }, { "epoch": 0.042, "loss_ce": 0.6053599119186401, "loss_lvr": 1.617501139640808, "loss_mode_switch": 0.0, "loss_total": 0.7671100497245789, "step": 105 }, { "batch_size": 4, "epoch": 0.042, "step": 105, "tokens_per_device": 4404 }, { "epoch": 0.042, "loss_ce": 0.52131187915802, "loss_lvr": 1.4472265243530273, "loss_mode_switch": 0.0, "loss_total": 0.6660345196723938, "step": 105 }, { "batch_size": 4, "epoch": 0.042, "step": 105, "tokens_per_device": 4216 }, { "epoch": 0.042, "loss_ce": 0.08423566818237305, "loss_lvr": 1.6245867013931274, "loss_mode_switch": 0.0, "loss_total": 0.24669434130191803, "step": 105 }, { "epoch": 0.0424, "grad_norm": 3.292807102203369, "learning_rate": 9.995968364289719e-06, "loss": 0.4741, "step": 106 }, { "batch_size": 1, "epoch": 0.0424, "step": 106, "tokens_per_device": 5127 }, { "epoch": 0.0424, "loss_ce": 0.06483837962150574, "loss_lvr": 0.5868976712226868, "loss_mode_switch": 0.0, "loss_total": 0.12352815270423889, "step": 106 }, { "batch_size": 4, "epoch": 0.0424, "step": 106, "tokens_per_device": 4980 }, { "epoch": 0.0424, "loss_ce": 0.3445177972316742, "loss_lvr": 1.6131484508514404, "loss_mode_switch": 0.0, "loss_total": 0.5058326721191406, "step": 106 }, { "batch_size": 4, "epoch": 0.0424, "step": 106, "tokens_per_device": 1504 }, { "epoch": 0.0424, "loss_ce": 0.30248117446899414, "loss_lvr": 1.9599772691726685, "loss_mode_switch": 0.0, "loss_total": 0.49847888946533203, "step": 106 }, { "batch_size": 4, "epoch": 0.0424, "step": 106, "tokens_per_device": 2564 }, { "epoch": 0.0424, "loss_ce": 0.5574073195457458, "loss_lvr": 2.31778621673584, "loss_mode_switch": 0.0, "loss_total": 0.7891859412193298, "step": 106 }, { "batch_size": 1, "epoch": 0.0424, "step": 106, "tokens_per_device": 4959 }, { "epoch": 0.0424, "loss_ce": 0.049103476107120514, "loss_lvr": 1.4283263683319092, "loss_mode_switch": 0.0, "loss_total": 0.19193610548973083, "step": 106 }, { "batch_size": 4, "epoch": 0.0424, "step": 106, "tokens_per_device": 5440 }, { "epoch": 0.0424, "loss_ce": 0.04811590537428856, "loss_lvr": 1.390234351158142, "loss_mode_switch": 0.0, "loss_total": 0.1871393471956253, "step": 106 }, { "batch_size": 4, "epoch": 0.0424, "step": 106, "tokens_per_device": 4836 }, { "epoch": 0.0424, "loss_ce": 0.5619649887084961, "loss_lvr": 1.4558826684951782, "loss_mode_switch": 0.0, "loss_total": 0.7075532674789429, "step": 106 }, { "batch_size": 1, "epoch": 0.0424, "step": 106, "tokens_per_device": 4923 }, { "epoch": 0.0424, "loss_ce": 0.15719246864318848, "loss_lvr": 2.705883741378784, "loss_mode_switch": 0.0, "loss_total": 0.4277808368206024, "step": 106 }, { "epoch": 0.0428, "grad_norm": 2.736745595932007, "learning_rate": 9.99570410136283e-06, "loss": 0.445, "step": 107 }, { "batch_size": 4, "epoch": 0.0428, "step": 107, "tokens_per_device": 3580 }, { "epoch": 0.0428, "loss_ce": 0.6871395111083984, "loss_lvr": 2.6891939640045166, "loss_mode_switch": 0.0, "loss_total": 0.956058919429779, "step": 107 }, { "batch_size": 4, "epoch": 0.0428, "step": 107, "tokens_per_device": 4356 }, { "epoch": 0.0428, "loss_ce": 0.2950502336025238, "loss_lvr": 1.688704490661621, "loss_mode_switch": 0.0, "loss_total": 0.4639206826686859, "step": 107 }, { "batch_size": 4, "epoch": 0.0428, "step": 107, "tokens_per_device": 1880 }, { "epoch": 0.0428, "loss_ce": 0.44152384996414185, "loss_lvr": 1.7891157865524292, "loss_mode_switch": 0.0, "loss_total": 0.6204354166984558, "step": 107 }, { "batch_size": 4, "epoch": 0.0428, "step": 107, "tokens_per_device": 4660 }, { "epoch": 0.0428, "loss_ce": 0.08698076754808426, "loss_lvr": 1.150468111038208, "loss_mode_switch": 0.0, "loss_total": 0.2020275890827179, "step": 107 }, { "batch_size": 4, "epoch": 0.0428, "step": 107, "tokens_per_device": 7904 }, { "epoch": 0.0428, "loss_ce": 0.5055174827575684, "loss_lvr": 1.938621997833252, "loss_mode_switch": 0.0, "loss_total": 0.6993796825408936, "step": 107 }, { "batch_size": 1, "epoch": 0.0428, "step": 107, "tokens_per_device": 7496 }, { "epoch": 0.0428, "loss_ce": 0.011179640889167786, "loss_lvr": 2.4314682483673096, "loss_mode_switch": 0.0, "loss_total": 0.2543264627456665, "step": 107 }, { "batch_size": 1, "epoch": 0.0428, "step": 107, "tokens_per_device": 4915 }, { "epoch": 0.0428, "loss_ce": 0.01412292942404747, "loss_lvr": 1.394339680671692, "loss_mode_switch": 0.0, "loss_total": 0.15355689823627472, "step": 107 }, { "batch_size": 4, "epoch": 0.0428, "step": 107, "tokens_per_device": 4280 }, { "epoch": 0.0428, "loss_ce": 0.750789999961853, "loss_lvr": 3.058595895767212, "loss_mode_switch": 0.0, "loss_total": 1.0566495656967163, "step": 107 }, { "epoch": 0.0432, "grad_norm": 2.6146962642669678, "learning_rate": 9.995431454018246e-06, "loss": 0.5186, "step": 108 }, { "batch_size": 1, "epoch": 0.0432, "step": 108, "tokens_per_device": 4546 }, { "epoch": 0.0432, "loss_ce": 0.08614516258239746, "loss_lvr": 1.4456593990325928, "loss_mode_switch": 0.0, "loss_total": 0.23071110248565674, "step": 108 }, { "batch_size": 4, "epoch": 0.0432, "step": 108, "tokens_per_device": 4172 }, { "epoch": 0.0432, "loss_ce": 0.7660657167434692, "loss_lvr": 1.6736918687820435, "loss_mode_switch": 0.0, "loss_total": 0.9334349036216736, "step": 108 }, { "batch_size": 4, "epoch": 0.0432, "step": 108, "tokens_per_device": 8812 }, { "epoch": 0.0432, "loss_ce": 0.17967796325683594, "loss_lvr": 2.4048562049865723, "loss_mode_switch": 0.0, "loss_total": 0.4201635718345642, "step": 108 }, { "batch_size": 1, "epoch": 0.0432, "step": 108, "tokens_per_device": 5094 }, { "epoch": 0.0432, "loss_ce": 0.03613753616809845, "loss_lvr": 1.2412981986999512, "loss_mode_switch": 0.0, "loss_total": 0.16026735305786133, "step": 108 }, { "batch_size": 4, "epoch": 0.0432, "step": 108, "tokens_per_device": 4224 }, { "epoch": 0.0432, "loss_ce": 0.13347770273685455, "loss_lvr": 1.549471378326416, "loss_mode_switch": 0.0, "loss_total": 0.28842484951019287, "step": 108 }, { "batch_size": 4, "epoch": 0.0432, "step": 108, "tokens_per_device": 3860 }, { "epoch": 0.0432, "loss_ce": 0.49716198444366455, "loss_lvr": 2.4884653091430664, "loss_mode_switch": 0.0, "loss_total": 0.7460085153579712, "step": 108 }, { "batch_size": 4, "epoch": 0.0432, "step": 108, "tokens_per_device": 4372 }, { "epoch": 0.0432, "loss_ce": 0.06028199940919876, "loss_lvr": 1.3498104810714722, "loss_mode_switch": 0.0, "loss_total": 0.1952630579471588, "step": 108 }, { "batch_size": 4, "epoch": 0.0432, "step": 108, "tokens_per_device": 5736 }, { "epoch": 0.0432, "loss_ce": 0.22269965708255768, "loss_lvr": 1.625531792640686, "loss_mode_switch": 0.0, "loss_total": 0.38525283336639404, "step": 108 }, { "epoch": 0.0436, "grad_norm": 2.8621742725372314, "learning_rate": 9.995150422713561e-06, "loss": 0.465, "step": 109 }, { "batch_size": 4, "epoch": 0.0436, "step": 109, "tokens_per_device": 4756 }, { "epoch": 0.0436, "loss_ce": 0.4194068908691406, "loss_lvr": 1.0936026573181152, "loss_mode_switch": 0.0, "loss_total": 0.5287671685218811, "step": 109 }, { "batch_size": 4, "epoch": 0.0436, "step": 109, "tokens_per_device": 4460 }, { "epoch": 0.0436, "loss_ce": 0.39267754554748535, "loss_lvr": 1.3844140768051147, "loss_mode_switch": 0.0, "loss_total": 0.5311189889907837, "step": 109 }, { "batch_size": 4, "epoch": 0.0436, "step": 109, "tokens_per_device": 3760 }, { "epoch": 0.0436, "loss_ce": 0.3316650986671448, "loss_lvr": 1.815619707107544, "loss_mode_switch": 0.0, "loss_total": 0.513227105140686, "step": 109 }, { "batch_size": 4, "epoch": 0.0436, "step": 109, "tokens_per_device": 5420 }, { "epoch": 0.0436, "loss_ce": 0.16854798793792725, "loss_lvr": 1.0971909761428833, "loss_mode_switch": 0.0, "loss_total": 0.2782670855522156, "step": 109 }, { "batch_size": 4, "epoch": 0.0436, "step": 109, "tokens_per_device": 3756 }, { "epoch": 0.0436, "loss_ce": 0.16404621303081512, "loss_lvr": 1.4746679067611694, "loss_mode_switch": 0.0, "loss_total": 0.3115130066871643, "step": 109 }, { "batch_size": 4, "epoch": 0.0436, "step": 109, "tokens_per_device": 5844 }, { "epoch": 0.0436, "loss_ce": 0.07006329298019409, "loss_lvr": 1.130711555480957, "loss_mode_switch": 0.0, "loss_total": 0.18313445150852203, "step": 109 }, { "batch_size": 4, "epoch": 0.0436, "step": 109, "tokens_per_device": 4208 }, { "epoch": 0.0436, "loss_ce": 0.28980404138565063, "loss_lvr": 1.9044610261917114, "loss_mode_switch": 0.0, "loss_total": 0.48025014996528625, "step": 109 }, { "batch_size": 4, "epoch": 0.0436, "step": 109, "tokens_per_device": 2868 }, { "epoch": 0.0436, "loss_ce": 0.4060116410255432, "loss_lvr": 1.114916443824768, "loss_mode_switch": 0.0, "loss_total": 0.5175032615661621, "step": 109 }, { "epoch": 0.044, "grad_norm": 3.393094778060913, "learning_rate": 9.99486100792044e-06, "loss": 0.4538, "step": 110 }, { "batch_size": 4, "epoch": 0.044, "step": 110, "tokens_per_device": 2116 }, { "epoch": 0.044, "loss_ce": 0.19684936106204987, "loss_lvr": 1.216522216796875, "loss_mode_switch": 0.0, "loss_total": 0.3185015916824341, "step": 110 }, { "batch_size": 4, "epoch": 0.044, "step": 110, "tokens_per_device": 13760 }, { "epoch": 0.044, "loss_ce": 0.4961012899875641, "loss_lvr": 1.9956326484680176, "loss_mode_switch": 0.0, "loss_total": 0.6956645250320435, "step": 110 }, { "batch_size": 4, "epoch": 0.044, "step": 110, "tokens_per_device": 5764 }, { "epoch": 0.044, "loss_ce": 0.12168542295694351, "loss_lvr": 2.096081256866455, "loss_mode_switch": 0.0, "loss_total": 0.3312935531139374, "step": 110 }, { "batch_size": 1, "epoch": 0.044, "step": 110, "tokens_per_device": 5021 }, { "epoch": 0.044, "loss_ce": 0.12008409202098846, "loss_lvr": 1.1926262378692627, "loss_mode_switch": 0.0, "loss_total": 0.2393467128276825, "step": 110 }, { "batch_size": 1, "epoch": 0.044, "step": 110, "tokens_per_device": 4882 }, { "epoch": 0.044, "loss_ce": 0.2225961983203888, "loss_lvr": 1.1345747709274292, "loss_mode_switch": 0.0, "loss_total": 0.33605366945266724, "step": 110 }, { "batch_size": 4, "epoch": 0.044, "step": 110, "tokens_per_device": 4228 }, { "epoch": 0.044, "loss_ce": 0.15436409413814545, "loss_lvr": 1.8922336101531982, "loss_mode_switch": 0.0, "loss_total": 0.3435874581336975, "step": 110 }, { "batch_size": 1, "epoch": 0.044, "step": 110, "tokens_per_device": 4878 }, { "epoch": 0.044, "loss_ce": 0.04139111936092377, "loss_lvr": 0.9894306659698486, "loss_mode_switch": 0.0, "loss_total": 0.14033418893814087, "step": 110 }, { "batch_size": 4, "epoch": 0.044, "step": 110, "tokens_per_device": 2656 }, { "epoch": 0.044, "loss_ce": 0.3953862190246582, "loss_lvr": 1.65202796459198, "loss_mode_switch": 0.0, "loss_total": 0.5605890154838562, "step": 110 }, { "epoch": 0.0444, "grad_norm": 2.8564364910125732, "learning_rate": 9.99456321012461e-06, "loss": 0.4344, "step": 111 }, { "batch_size": 4, "epoch": 0.0444, "step": 111, "tokens_per_device": 5568 }, { "epoch": 0.0444, "loss_ce": 0.09352413564920425, "loss_lvr": 1.5928267240524292, "loss_mode_switch": 0.0, "loss_total": 0.25280681252479553, "step": 111 }, { "batch_size": 4, "epoch": 0.0444, "step": 111, "tokens_per_device": 9072 }, { "epoch": 0.0444, "loss_ce": 0.16715645790100098, "loss_lvr": 1.3109525442123413, "loss_mode_switch": 0.0, "loss_total": 0.2982517182826996, "step": 111 }, { "batch_size": 1, "epoch": 0.0444, "step": 111, "tokens_per_device": 7334 }, { "epoch": 0.0444, "loss_ce": 0.3901207447052002, "loss_lvr": 1.180797815322876, "loss_mode_switch": 0.0, "loss_total": 0.5082005262374878, "step": 111 }, { "batch_size": 4, "epoch": 0.0444, "step": 111, "tokens_per_device": 5120 }, { "epoch": 0.0444, "loss_ce": 0.37284788489341736, "loss_lvr": 1.3881020545959473, "loss_mode_switch": 0.0, "loss_total": 0.5116580724716187, "step": 111 }, { "batch_size": 4, "epoch": 0.0444, "step": 111, "tokens_per_device": 5720 }, { "epoch": 0.0444, "loss_ce": 0.28442275524139404, "loss_lvr": 2.1104798316955566, "loss_mode_switch": 0.0, "loss_total": 0.4954707622528076, "step": 111 }, { "batch_size": 1, "epoch": 0.0444, "step": 111, "tokens_per_device": 5178 }, { "epoch": 0.0444, "loss_ce": 0.031105969101190567, "loss_lvr": 0.9280388355255127, "loss_mode_switch": 0.0, "loss_total": 0.1239098608493805, "step": 111 }, { "batch_size": 4, "epoch": 0.0444, "step": 111, "tokens_per_device": 5808 }, { "epoch": 0.0444, "loss_ce": 0.21268723905086517, "loss_lvr": 1.2686498165130615, "loss_mode_switch": 0.0, "loss_total": 0.33955222368240356, "step": 111 }, { "batch_size": 1, "epoch": 0.0444, "step": 111, "tokens_per_device": 4882 }, { "epoch": 0.0444, "loss_ce": 0.049027424305677414, "loss_lvr": 1.0371307134628296, "loss_mode_switch": 0.0, "loss_total": 0.1527404934167862, "step": 111 }, { "epoch": 0.0448, "grad_norm": 2.736705780029297, "learning_rate": 9.994257029825876e-06, "loss": 0.4607, "step": 112 }, { "batch_size": 4, "epoch": 0.0448, "step": 112, "tokens_per_device": 4656 }, { "epoch": 0.0448, "loss_ce": 0.6271071434020996, "loss_lvr": 2.677086353302002, "loss_mode_switch": 0.0, "loss_total": 0.8948158025741577, "step": 112 }, { "batch_size": 1, "epoch": 0.0448, "step": 112, "tokens_per_device": 4969 }, { "epoch": 0.0448, "loss_ce": 0.5348389148712158, "loss_lvr": 1.1065605878829956, "loss_mode_switch": 0.0, "loss_total": 0.6454949975013733, "step": 112 }, { "batch_size": 4, "epoch": 0.0448, "step": 112, "tokens_per_device": 1388 }, { "epoch": 0.0448, "loss_ce": 0.42652633786201477, "loss_lvr": 1.6626299619674683, "loss_mode_switch": 0.0, "loss_total": 0.592789351940155, "step": 112 }, { "batch_size": 4, "epoch": 0.0448, "step": 112, "tokens_per_device": 1276 }, { "epoch": 0.0448, "loss_ce": 0.7559519410133362, "loss_lvr": 2.9072422981262207, "loss_mode_switch": 0.0, "loss_total": 1.0466761589050293, "step": 112 }, { "batch_size": 1, "epoch": 0.0448, "step": 112, "tokens_per_device": 5047 }, { "epoch": 0.0448, "loss_ce": 0.5882383584976196, "loss_lvr": 1.2809644937515259, "loss_mode_switch": 0.0, "loss_total": 0.7163348197937012, "step": 112 }, { "batch_size": 4, "epoch": 0.0448, "step": 112, "tokens_per_device": 1220 }, { "epoch": 0.0448, "loss_ce": 0.17994479835033417, "loss_lvr": 2.7220137119293213, "loss_mode_switch": 0.0, "loss_total": 0.45214617252349854, "step": 112 }, { "batch_size": 4, "epoch": 0.0448, "step": 112, "tokens_per_device": 11156 }, { "epoch": 0.0448, "loss_ce": 0.04591422528028488, "loss_lvr": 1.3483468294143677, "loss_mode_switch": 0.0, "loss_total": 0.18074890971183777, "step": 112 }, { "batch_size": 1, "epoch": 0.0448, "step": 112, "tokens_per_device": 6083 }, { "epoch": 0.0448, "loss_ce": 0.007375861052423716, "loss_lvr": 0.7689422965049744, "loss_mode_switch": 0.0, "loss_total": 0.08427008986473083, "step": 112 }, { "epoch": 0.0452, "grad_norm": 2.787130117416382, "learning_rate": 9.993942467538107e-06, "loss": 0.5001, "step": 113 }, { "batch_size": 1, "epoch": 0.0452, "step": 113, "tokens_per_device": 5036 }, { "epoch": 0.0452, "loss_ce": 0.2207394689321518, "loss_lvr": 0.45800212025642395, "loss_mode_switch": 0.0, "loss_total": 0.26653969287872314, "step": 113 }, { "batch_size": 4, "epoch": 0.0452, "step": 113, "tokens_per_device": 4604 }, { "epoch": 0.0452, "loss_ce": 0.3471676707267761, "loss_lvr": 1.1697890758514404, "loss_mode_switch": 0.0, "loss_total": 0.46414658427238464, "step": 113 }, { "batch_size": 4, "epoch": 0.0452, "step": 113, "tokens_per_device": 8792 }, { "epoch": 0.0452, "loss_ce": 0.3522074520587921, "loss_lvr": 1.025162696838379, "loss_mode_switch": 0.0, "loss_total": 0.4547237157821655, "step": 113 }, { "batch_size": 1, "epoch": 0.0452, "step": 113, "tokens_per_device": 5055 }, { "epoch": 0.0452, "loss_ce": 0.2019861787557602, "loss_lvr": 1.201120138168335, "loss_mode_switch": 0.0, "loss_total": 0.3220981955528259, "step": 113 }, { "batch_size": 4, "epoch": 0.0452, "step": 113, "tokens_per_device": 4572 }, { "epoch": 0.0452, "loss_ce": 0.5207141041755676, "loss_lvr": 1.7469933032989502, "loss_mode_switch": 0.0, "loss_total": 0.6954134702682495, "step": 113 }, { "batch_size": 1, "epoch": 0.0452, "step": 113, "tokens_per_device": 4761 }, { "epoch": 0.0452, "loss_ce": 0.09045635908842087, "loss_lvr": 1.077820897102356, "loss_mode_switch": 0.0, "loss_total": 0.19823844730854034, "step": 113 }, { "batch_size": 4, "epoch": 0.0452, "step": 113, "tokens_per_device": 1596 }, { "epoch": 0.0452, "loss_ce": 0.3304935097694397, "loss_lvr": 1.8804545402526855, "loss_mode_switch": 0.0, "loss_total": 0.5185389518737793, "step": 113 }, { "batch_size": 1, "epoch": 0.0452, "step": 113, "tokens_per_device": 4846 }, { "epoch": 0.0452, "loss_ce": 0.1423211395740509, "loss_lvr": 1.5913437604904175, "loss_mode_switch": 0.0, "loss_total": 0.3014554977416992, "step": 113 }, { "epoch": 0.0456, "grad_norm": 2.754756450653076, "learning_rate": 9.993619523789241e-06, "loss": 0.5155, "step": 114 }, { "batch_size": 1, "epoch": 0.0456, "step": 114, "tokens_per_device": 5026 }, { "epoch": 0.0456, "loss_ce": 0.18740183115005493, "loss_lvr": 1.5442267656326294, "loss_mode_switch": 0.0, "loss_total": 0.3418245315551758, "step": 114 }, { "batch_size": 1, "epoch": 0.0456, "step": 114, "tokens_per_device": 4863 }, { "epoch": 0.0456, "loss_ce": 0.01795249804854393, "loss_lvr": 1.1240276098251343, "loss_mode_switch": 0.0, "loss_total": 0.13035525381565094, "step": 114 }, { "batch_size": 1, "epoch": 0.0456, "step": 114, "tokens_per_device": 5200 }, { "epoch": 0.0456, "loss_ce": 0.41193559765815735, "loss_lvr": 1.0721505880355835, "loss_mode_switch": 0.0, "loss_total": 0.5191506743431091, "step": 114 }, { "batch_size": 1, "epoch": 0.0456, "step": 114, "tokens_per_device": 5259 }, { "epoch": 0.0456, "loss_ce": 0.20696213841438293, "loss_lvr": 0.7237643003463745, "loss_mode_switch": 0.0, "loss_total": 0.2793385684490204, "step": 114 }, { "batch_size": 4, "epoch": 0.0456, "step": 114, "tokens_per_device": 3836 }, { "epoch": 0.0456, "loss_ce": 0.4309406876564026, "loss_lvr": 1.9547584056854248, "loss_mode_switch": 0.0, "loss_total": 0.6264165639877319, "step": 114 }, { "batch_size": 4, "epoch": 0.0456, "step": 114, "tokens_per_device": 4708 }, { "epoch": 0.0456, "loss_ce": 0.2683582901954651, "loss_lvr": 0.9381709694862366, "loss_mode_switch": 0.0, "loss_total": 0.3621754050254822, "step": 114 }, { "batch_size": 4, "epoch": 0.0456, "step": 114, "tokens_per_device": 4444 }, { "epoch": 0.0456, "loss_ce": 0.12098932266235352, "loss_lvr": 1.081467628479004, "loss_mode_switch": 0.0, "loss_total": 0.22913607954978943, "step": 114 }, { "batch_size": 1, "epoch": 0.0456, "step": 114, "tokens_per_device": 4614 }, { "epoch": 0.0456, "loss_ce": 0.10463117063045502, "loss_lvr": 1.1615736484527588, "loss_mode_switch": 0.0, "loss_total": 0.22078853845596313, "step": 114 }, { "epoch": 0.046, "grad_norm": 3.073338270187378, "learning_rate": 9.993288199121283e-06, "loss": 0.4498, "step": 115 }, { "batch_size": 4, "epoch": 0.046, "step": 115, "tokens_per_device": 4560 }, { "epoch": 0.046, "loss_ce": 0.616450846195221, "loss_lvr": 1.8124892711639404, "loss_mode_switch": 0.0, "loss_total": 0.7976998090744019, "step": 115 }, { "batch_size": 1, "epoch": 0.046, "step": 115, "tokens_per_device": 5114 }, { "epoch": 0.046, "loss_ce": 0.16894976794719696, "loss_lvr": 1.0143787860870361, "loss_mode_switch": 0.0, "loss_total": 0.2703876495361328, "step": 115 }, { "batch_size": 1, "epoch": 0.046, "step": 115, "tokens_per_device": 4888 }, { "epoch": 0.046, "loss_ce": 0.14111635088920593, "loss_lvr": 1.7885215282440186, "loss_mode_switch": 0.0, "loss_total": 0.3199685215950012, "step": 115 }, { "batch_size": 4, "epoch": 0.046, "step": 115, "tokens_per_device": 1632 }, { "epoch": 0.046, "loss_ce": 0.33416643738746643, "loss_lvr": 2.5463337898254395, "loss_mode_switch": 0.0, "loss_total": 0.5887998342514038, "step": 115 }, { "batch_size": 1, "epoch": 0.046, "step": 115, "tokens_per_device": 5114 }, { "epoch": 0.046, "loss_ce": 0.01635904796421528, "loss_lvr": 1.916911244392395, "loss_mode_switch": 0.0, "loss_total": 0.2080501765012741, "step": 115 }, { "batch_size": 1, "epoch": 0.046, "step": 115, "tokens_per_device": 4428 }, { "epoch": 0.046, "loss_ce": 0.0407436341047287, "loss_lvr": 1.1623224020004272, "loss_mode_switch": 0.0, "loss_total": 0.1569758653640747, "step": 115 }, { "batch_size": 4, "epoch": 0.046, "step": 115, "tokens_per_device": 4672 }, { "epoch": 0.046, "loss_ce": 0.4139876961708069, "loss_lvr": 1.14969003200531, "loss_mode_switch": 0.0, "loss_total": 0.5289567112922668, "step": 115 }, { "batch_size": 4, "epoch": 0.046, "step": 115, "tokens_per_device": 1404 }, { "epoch": 0.046, "loss_ce": 0.4586864709854126, "loss_lvr": 2.0140163898468018, "loss_mode_switch": 0.0, "loss_total": 0.6600881218910217, "step": 115 }, { "epoch": 0.0464, "grad_norm": 2.2090773582458496, "learning_rate": 9.992948494090303e-06, "loss": 0.4299, "step": 116 }, { "batch_size": 4, "epoch": 0.0464, "step": 116, "tokens_per_device": 4200 }, { "epoch": 0.0464, "loss_ce": 0.3182782530784607, "loss_lvr": 2.778311252593994, "loss_mode_switch": 0.0, "loss_total": 0.5961093902587891, "step": 116 }, { "batch_size": 4, "epoch": 0.0464, "step": 116, "tokens_per_device": 4344 }, { "epoch": 0.0464, "loss_ce": 0.4928809702396393, "loss_lvr": 2.2130749225616455, "loss_mode_switch": 0.0, "loss_total": 0.7141884565353394, "step": 116 }, { "batch_size": 1, "epoch": 0.0464, "step": 116, "tokens_per_device": 5113 }, { "epoch": 0.0464, "loss_ce": 0.07932958751916885, "loss_lvr": 0.873741626739502, "loss_mode_switch": 0.0, "loss_total": 0.16670376062393188, "step": 116 }, { "batch_size": 1, "epoch": 0.0464, "step": 116, "tokens_per_device": 4708 }, { "epoch": 0.0464, "loss_ce": 0.2123226523399353, "loss_lvr": 1.707533836364746, "loss_mode_switch": 0.0, "loss_total": 0.3830760419368744, "step": 116 }, { "batch_size": 1, "epoch": 0.0464, "step": 116, "tokens_per_device": 4856 }, { "epoch": 0.0464, "loss_ce": 0.013630000874400139, "loss_lvr": 1.2599161863327026, "loss_mode_switch": 0.0, "loss_total": 0.13962163031101227, "step": 116 }, { "batch_size": 4, "epoch": 0.0464, "step": 116, "tokens_per_device": 8116 }, { "epoch": 0.0464, "loss_ce": 0.3122629225254059, "loss_lvr": 2.0466039180755615, "loss_mode_switch": 0.0, "loss_total": 0.5169233083724976, "step": 116 }, { "batch_size": 4, "epoch": 0.0464, "step": 116, "tokens_per_device": 2588 }, { "epoch": 0.0464, "loss_ce": 0.5188100337982178, "loss_lvr": 2.0286190509796143, "loss_mode_switch": 0.0, "loss_total": 0.7216719388961792, "step": 116 }, { "batch_size": 1, "epoch": 0.0464, "step": 116, "tokens_per_device": 5119 }, { "epoch": 0.0464, "loss_ce": 0.02355734445154667, "loss_lvr": 1.4002797603607178, "loss_mode_switch": 0.0, "loss_total": 0.1635853350162506, "step": 116 }, { "epoch": 0.0468, "grad_norm": 2.18652081489563, "learning_rate": 9.992600409266437e-06, "loss": 0.4154, "step": 117 }, { "batch_size": 1, "epoch": 0.0468, "step": 117, "tokens_per_device": 5106 }, { "epoch": 0.0468, "loss_ce": 0.09254463016986847, "loss_lvr": 1.7103239297866821, "loss_mode_switch": 0.0, "loss_total": 0.26357704401016235, "step": 117 }, { "batch_size": 1, "epoch": 0.0468, "step": 117, "tokens_per_device": 4732 }, { "epoch": 0.0468, "loss_ce": 0.03625251352787018, "loss_lvr": 1.4419608116149902, "loss_mode_switch": 0.0, "loss_total": 0.18044859170913696, "step": 117 }, { "batch_size": 4, "epoch": 0.0468, "step": 117, "tokens_per_device": 4220 }, { "epoch": 0.0468, "loss_ce": 0.16718779504299164, "loss_lvr": 1.1424154043197632, "loss_mode_switch": 0.0, "loss_total": 0.28142935037612915, "step": 117 }, { "batch_size": 1, "epoch": 0.0468, "step": 117, "tokens_per_device": 5205 }, { "epoch": 0.0468, "loss_ce": 0.00988003984093666, "loss_lvr": 1.2139207124710083, "loss_mode_switch": 0.0, "loss_total": 0.1312721073627472, "step": 117 }, { "batch_size": 4, "epoch": 0.0468, "step": 117, "tokens_per_device": 1340 }, { "epoch": 0.0468, "loss_ce": 0.7868101596832275, "loss_lvr": 1.666064739227295, "loss_mode_switch": 0.0, "loss_total": 0.953416645526886, "step": 117 }, { "batch_size": 4, "epoch": 0.0468, "step": 117, "tokens_per_device": 11004 }, { "epoch": 0.0468, "loss_ce": 0.291801393032074, "loss_lvr": 0.7296292781829834, "loss_mode_switch": 0.0, "loss_total": 0.36476433277130127, "step": 117 }, { "batch_size": 1, "epoch": 0.0468, "step": 117, "tokens_per_device": 4877 }, { "epoch": 0.0468, "loss_ce": 0.11070756614208221, "loss_lvr": 0.8323987126350403, "loss_mode_switch": 0.0, "loss_total": 0.193947434425354, "step": 117 }, { "batch_size": 4, "epoch": 0.0468, "step": 117, "tokens_per_device": 6084 }, { "epoch": 0.0468, "loss_ce": 0.4742780029773712, "loss_lvr": 1.460585355758667, "loss_mode_switch": 0.0, "loss_total": 0.6203365325927734, "step": 117 }, { "epoch": 0.0472, "grad_norm": 2.2368276119232178, "learning_rate": 9.992243945233886e-06, "loss": 0.4249, "step": 118 }, { "batch_size": 4, "epoch": 0.0472, "step": 118, "tokens_per_device": 4348 }, { "epoch": 0.0472, "loss_ce": 0.26093021035194397, "loss_lvr": 1.8540102243423462, "loss_mode_switch": 0.0, "loss_total": 0.4463312327861786, "step": 118 }, { "batch_size": 1, "epoch": 0.0472, "step": 118, "tokens_per_device": 4831 }, { "epoch": 0.0472, "loss_ce": 0.010105976834893227, "loss_lvr": 0.5507143139839172, "loss_mode_switch": 0.0, "loss_total": 0.06517741084098816, "step": 118 }, { "batch_size": 4, "epoch": 0.0472, "step": 118, "tokens_per_device": 5492 }, { "epoch": 0.0472, "loss_ce": 0.1534527838230133, "loss_lvr": 1.6769626140594482, "loss_mode_switch": 0.0, "loss_total": 0.3211490511894226, "step": 118 }, { "batch_size": 4, "epoch": 0.0472, "step": 118, "tokens_per_device": 4440 }, { "epoch": 0.0472, "loss_ce": 0.46027112007141113, "loss_lvr": 1.8485785722732544, "loss_mode_switch": 0.0, "loss_total": 0.6451289653778076, "step": 118 }, { "batch_size": 4, "epoch": 0.0472, "step": 118, "tokens_per_device": 4916 }, { "epoch": 0.0472, "loss_ce": 0.6932467222213745, "loss_lvr": 1.3240714073181152, "loss_mode_switch": 0.0, "loss_total": 0.8256538510322571, "step": 118 }, { "batch_size": 1, "epoch": 0.0472, "step": 118, "tokens_per_device": 5283 }, { "epoch": 0.0472, "loss_ce": 0.01826964132487774, "loss_lvr": 1.9624325037002563, "loss_mode_switch": 0.0, "loss_total": 0.214512899518013, "step": 118 }, { "batch_size": 4, "epoch": 0.0472, "step": 118, "tokens_per_device": 1860 }, { "epoch": 0.0472, "loss_ce": 0.32963135838508606, "loss_lvr": 1.8630518913269043, "loss_mode_switch": 0.0, "loss_total": 0.515936553478241, "step": 118 }, { "batch_size": 4, "epoch": 0.0472, "step": 118, "tokens_per_device": 2840 }, { "epoch": 0.0472, "loss_ce": 0.10503043979406357, "loss_lvr": 1.0679552555084229, "loss_mode_switch": 0.0, "loss_total": 0.21182596683502197, "step": 118 }, { "epoch": 0.0476, "grad_norm": 2.7816848754882812, "learning_rate": 9.991879102590912e-06, "loss": 0.4449, "step": 119 }, { "batch_size": 4, "epoch": 0.0476, "step": 119, "tokens_per_device": 5740 }, { "epoch": 0.0476, "loss_ce": 0.2919747233390808, "loss_lvr": 1.6851365566253662, "loss_mode_switch": 0.0, "loss_total": 0.46048837900161743, "step": 119 }, { "batch_size": 4, "epoch": 0.0476, "step": 119, "tokens_per_device": 3940 }, { "epoch": 0.0476, "loss_ce": 0.08097843825817108, "loss_lvr": 1.0831995010375977, "loss_mode_switch": 0.0, "loss_total": 0.18929839134216309, "step": 119 }, { "batch_size": 4, "epoch": 0.0476, "step": 119, "tokens_per_device": 5972 }, { "epoch": 0.0476, "loss_ce": 0.3594176471233368, "loss_lvr": 1.593505859375, "loss_mode_switch": 0.0, "loss_total": 0.5187682509422302, "step": 119 }, { "batch_size": 4, "epoch": 0.0476, "step": 119, "tokens_per_device": 4316 }, { "epoch": 0.0476, "loss_ce": 0.8744819164276123, "loss_lvr": 1.8390175104141235, "loss_mode_switch": 0.0, "loss_total": 1.0583837032318115, "step": 119 }, { "batch_size": 4, "epoch": 0.0476, "step": 119, "tokens_per_device": 5580 }, { "epoch": 0.0476, "loss_ce": 0.36754921078681946, "loss_lvr": 1.0633512735366821, "loss_mode_switch": 0.0, "loss_total": 0.47388434410095215, "step": 119 }, { "batch_size": 4, "epoch": 0.0476, "step": 119, "tokens_per_device": 5424 }, { "epoch": 0.0476, "loss_ce": 0.12586171925067902, "loss_lvr": 2.076697826385498, "loss_mode_switch": 0.0, "loss_total": 0.3335314989089966, "step": 119 }, { "batch_size": 1, "epoch": 0.0476, "step": 119, "tokens_per_device": 4879 }, { "epoch": 0.0476, "loss_ce": 0.032748591154813766, "loss_lvr": 0.8274378180503845, "loss_mode_switch": 0.0, "loss_total": 0.11549237370491028, "step": 119 }, { "batch_size": 1, "epoch": 0.0476, "step": 119, "tokens_per_device": 4978 }, { "epoch": 0.0476, "loss_ce": 0.6947762370109558, "loss_lvr": 1.5027635097503662, "loss_mode_switch": 0.0, "loss_total": 0.8450525999069214, "step": 119 }, { "epoch": 0.048, "grad_norm": 2.4666051864624023, "learning_rate": 9.991505881949837e-06, "loss": 0.4533, "step": 120 }, { "batch_size": 1, "epoch": 0.048, "step": 120, "tokens_per_device": 5098 }, { "epoch": 0.048, "loss_ce": 0.04926560819149017, "loss_lvr": 1.1849348545074463, "loss_mode_switch": 0.0, "loss_total": 0.16775909066200256, "step": 120 }, { "batch_size": 1, "epoch": 0.048, "step": 120, "tokens_per_device": 4602 }, { "epoch": 0.048, "loss_ce": 0.11632374674081802, "loss_lvr": 0.983654260635376, "loss_mode_switch": 0.0, "loss_total": 0.21468916535377502, "step": 120 }, { "batch_size": 1, "epoch": 0.048, "step": 120, "tokens_per_device": 5003 }, { "epoch": 0.048, "loss_ce": 0.026088261976838112, "loss_lvr": 0.9462538361549377, "loss_mode_switch": 0.0, "loss_total": 0.12071364372968674, "step": 120 }, { "batch_size": 4, "epoch": 0.048, "step": 120, "tokens_per_device": 2544 }, { "epoch": 0.048, "loss_ce": 0.19379153847694397, "loss_lvr": 1.9487227201461792, "loss_mode_switch": 0.0, "loss_total": 0.3886638283729553, "step": 120 }, { "batch_size": 1, "epoch": 0.048, "step": 120, "tokens_per_device": 4935 }, { "epoch": 0.048, "loss_ce": 0.032583218067884445, "loss_lvr": 1.9996447563171387, "loss_mode_switch": 0.0, "loss_total": 0.23254770040512085, "step": 120 }, { "batch_size": 4, "epoch": 0.048, "step": 120, "tokens_per_device": 4216 }, { "epoch": 0.048, "loss_ce": 0.120203398168087, "loss_lvr": 2.5418596267700195, "loss_mode_switch": 0.0, "loss_total": 0.3743893802165985, "step": 120 }, { "batch_size": 4, "epoch": 0.048, "step": 120, "tokens_per_device": 5020 }, { "epoch": 0.048, "loss_ce": 0.23427942395210266, "loss_lvr": 1.458993911743164, "loss_mode_switch": 0.0, "loss_total": 0.3801788091659546, "step": 120 }, { "batch_size": 4, "epoch": 0.048, "step": 120, "tokens_per_device": 1420 }, { "epoch": 0.048, "loss_ce": 0.09024412930011749, "loss_lvr": 1.8429473638534546, "loss_mode_switch": 0.0, "loss_total": 0.27453887462615967, "step": 120 }, { "epoch": 0.0484, "grad_norm": 2.0809521675109863, "learning_rate": 9.991124283937049e-06, "loss": 0.4068, "step": 121 }, { "batch_size": 4, "epoch": 0.0484, "step": 121, "tokens_per_device": 4240 }, { "epoch": 0.0484, "loss_ce": 0.05260938033461571, "loss_lvr": 1.7097816467285156, "loss_mode_switch": 0.0, "loss_total": 0.22358755767345428, "step": 121 }, { "batch_size": 4, "epoch": 0.0484, "step": 121, "tokens_per_device": 5584 }, { "epoch": 0.0484, "loss_ce": 0.2757672667503357, "loss_lvr": 1.67328679561615, "loss_mode_switch": 0.0, "loss_total": 0.44309595227241516, "step": 121 }, { "batch_size": 4, "epoch": 0.0484, "step": 121, "tokens_per_device": 7160 }, { "epoch": 0.0484, "loss_ce": 0.16734980046749115, "loss_lvr": 0.8164153099060059, "loss_mode_switch": 0.0, "loss_total": 0.24899134039878845, "step": 121 }, { "batch_size": 4, "epoch": 0.0484, "step": 121, "tokens_per_device": 12584 }, { "epoch": 0.0484, "loss_ce": 0.4895065426826477, "loss_lvr": 1.7335258722305298, "loss_mode_switch": 0.0, "loss_total": 0.6628591418266296, "step": 121 }, { "batch_size": 1, "epoch": 0.0484, "step": 121, "tokens_per_device": 4900 }, { "epoch": 0.0484, "loss_ce": 0.025370856747031212, "loss_lvr": 1.352758765220642, "loss_mode_switch": 0.0, "loss_total": 0.1606467366218567, "step": 121 }, { "batch_size": 1, "epoch": 0.0484, "step": 121, "tokens_per_device": 4735 }, { "epoch": 0.0484, "loss_ce": 0.024449653923511505, "loss_lvr": 0.6112969517707825, "loss_mode_switch": 0.0, "loss_total": 0.08557935059070587, "step": 121 }, { "batch_size": 4, "epoch": 0.0484, "step": 121, "tokens_per_device": 1400 }, { "epoch": 0.0484, "loss_ce": 0.5626954436302185, "loss_lvr": 1.495656132698059, "loss_mode_switch": 0.0, "loss_total": 0.7122610807418823, "step": 121 }, { "batch_size": 4, "epoch": 0.0484, "step": 121, "tokens_per_device": 4256 }, { "epoch": 0.0484, "loss_ce": 0.36264482140541077, "loss_lvr": 1.5707937479019165, "loss_mode_switch": 0.0, "loss_total": 0.5197241902351379, "step": 121 }, { "epoch": 0.0488, "grad_norm": 2.034290075302124, "learning_rate": 9.990734309192995e-06, "loss": 0.4281, "step": 122 }, { "batch_size": 4, "epoch": 0.0488, "step": 122, "tokens_per_device": 3772 }, { "epoch": 0.0488, "loss_ce": 0.1944800466299057, "loss_lvr": 1.1555805206298828, "loss_mode_switch": 0.0, "loss_total": 0.31003808975219727, "step": 122 }, { "batch_size": 4, "epoch": 0.0488, "step": 122, "tokens_per_device": 1312 }, { "epoch": 0.0488, "loss_ce": 0.8219991326332092, "loss_lvr": 1.8933706283569336, "loss_mode_switch": 0.0, "loss_total": 1.0113362073898315, "step": 122 }, { "batch_size": 1, "epoch": 0.0488, "step": 122, "tokens_per_device": 4882 }, { "epoch": 0.0488, "loss_ce": 0.12331706285476685, "loss_lvr": 0.48216578364372253, "loss_mode_switch": 0.0, "loss_total": 0.17153364419937134, "step": 122 }, { "batch_size": 1, "epoch": 0.0488, "step": 122, "tokens_per_device": 4840 }, { "epoch": 0.0488, "loss_ce": 0.24615666270256042, "loss_lvr": 0.9699592590332031, "loss_mode_switch": 0.0, "loss_total": 0.34315258264541626, "step": 122 }, { "batch_size": 4, "epoch": 0.0488, "step": 122, "tokens_per_device": 1264 }, { "epoch": 0.0488, "loss_ce": 0.2604304552078247, "loss_lvr": 1.8834314346313477, "loss_mode_switch": 0.0, "loss_total": 0.4487736225128174, "step": 122 }, { "batch_size": 4, "epoch": 0.0488, "step": 122, "tokens_per_device": 9056 }, { "epoch": 0.0488, "loss_ce": 0.1294044852256775, "loss_lvr": 1.3862395286560059, "loss_mode_switch": 0.0, "loss_total": 0.2680284380912781, "step": 122 }, { "batch_size": 4, "epoch": 0.0488, "step": 122, "tokens_per_device": 1496 }, { "epoch": 0.0488, "loss_ce": 0.24785971641540527, "loss_lvr": 1.352838397026062, "loss_mode_switch": 0.0, "loss_total": 0.3831435441970825, "step": 122 }, { "batch_size": 4, "epoch": 0.0488, "step": 122, "tokens_per_device": 1592 }, { "epoch": 0.0488, "loss_ce": 0.3604171872138977, "loss_lvr": 1.258279800415039, "loss_mode_switch": 0.0, "loss_total": 0.48624515533447266, "step": 122 }, { "epoch": 0.0492, "grad_norm": 2.4882490634918213, "learning_rate": 9.990335958372178e-06, "loss": 0.4665, "step": 123 }, { "batch_size": 1, "epoch": 0.0492, "step": 123, "tokens_per_device": 6934 }, { "epoch": 0.0492, "loss_ce": 0.04698236286640167, "loss_lvr": 0.9773520827293396, "loss_mode_switch": 0.0, "loss_total": 0.14471757411956787, "step": 123 }, { "batch_size": 4, "epoch": 0.0492, "step": 123, "tokens_per_device": 3820 }, { "epoch": 0.0492, "loss_ce": 0.19971130788326263, "loss_lvr": 1.5906199216842651, "loss_mode_switch": 0.0, "loss_total": 0.35877329111099243, "step": 123 }, { "batch_size": 4, "epoch": 0.0492, "step": 123, "tokens_per_device": 3860 }, { "epoch": 0.0492, "loss_ce": 0.13313448429107666, "loss_lvr": 1.7687758207321167, "loss_mode_switch": 0.0, "loss_total": 0.3100120723247528, "step": 123 }, { "batch_size": 4, "epoch": 0.0492, "step": 123, "tokens_per_device": 3760 }, { "epoch": 0.0492, "loss_ce": 0.24725967645645142, "loss_lvr": 1.9917445182800293, "loss_mode_switch": 0.0, "loss_total": 0.4464341402053833, "step": 123 }, { "batch_size": 1, "epoch": 0.0492, "step": 123, "tokens_per_device": 4901 }, { "epoch": 0.0492, "loss_ce": 0.0257726963609457, "loss_lvr": 1.3571569919586182, "loss_mode_switch": 0.0, "loss_total": 0.1614883989095688, "step": 123 }, { "batch_size": 1, "epoch": 0.0492, "step": 123, "tokens_per_device": 4625 }, { "epoch": 0.0492, "loss_ce": 0.3346524238586426, "loss_lvr": 0.9458147883415222, "loss_mode_switch": 0.0, "loss_total": 0.4292339086532593, "step": 123 }, { "batch_size": 1, "epoch": 0.0492, "step": 123, "tokens_per_device": 5106 }, { "epoch": 0.0492, "loss_ce": 0.034523166716098785, "loss_lvr": 0.6234354972839355, "loss_mode_switch": 0.0, "loss_total": 0.09686671197414398, "step": 123 }, { "batch_size": 1, "epoch": 0.0492, "step": 123, "tokens_per_device": 6361 }, { "epoch": 0.0492, "loss_ce": 0.05251995846629143, "loss_lvr": 1.272948980331421, "loss_mode_switch": 0.0, "loss_total": 0.17981486022472382, "step": 123 }, { "epoch": 0.0496, "grad_norm": 2.8935933113098145, "learning_rate": 9.989929232143159e-06, "loss": 0.4283, "step": 124 }, { "batch_size": 4, "epoch": 0.0496, "step": 124, "tokens_per_device": 8212 }, { "epoch": 0.0496, "loss_ce": 0.1611914187669754, "loss_lvr": 1.8592443466186523, "loss_mode_switch": 0.0, "loss_total": 0.3471158742904663, "step": 124 }, { "batch_size": 4, "epoch": 0.0496, "step": 124, "tokens_per_device": 2692 }, { "epoch": 0.0496, "loss_ce": 0.31143784523010254, "loss_lvr": 1.5285543203353882, "loss_mode_switch": 0.0, "loss_total": 0.46429330110549927, "step": 124 }, { "batch_size": 1, "epoch": 0.0496, "step": 124, "tokens_per_device": 5074 }, { "epoch": 0.0496, "loss_ce": 0.10034111887216568, "loss_lvr": 0.9821887016296387, "loss_mode_switch": 0.0, "loss_total": 0.19855999946594238, "step": 124 }, { "batch_size": 4, "epoch": 0.0496, "step": 124, "tokens_per_device": 2720 }, { "epoch": 0.0496, "loss_ce": 0.2327175885438919, "loss_lvr": 1.4121509790420532, "loss_mode_switch": 0.0, "loss_total": 0.37393268942832947, "step": 124 }, { "batch_size": 4, "epoch": 0.0496, "step": 124, "tokens_per_device": 5204 }, { "epoch": 0.0496, "loss_ce": 0.08820316940546036, "loss_lvr": 1.0614902973175049, "loss_mode_switch": 0.0, "loss_total": 0.19435220956802368, "step": 124 }, { "batch_size": 1, "epoch": 0.0496, "step": 124, "tokens_per_device": 5166 }, { "epoch": 0.0496, "loss_ce": 0.0485604926943779, "loss_lvr": 0.7638542652130127, "loss_mode_switch": 0.0, "loss_total": 0.12494592368602753, "step": 124 }, { "batch_size": 4, "epoch": 0.0496, "step": 124, "tokens_per_device": 3844 }, { "epoch": 0.0496, "loss_ce": 0.3776942193508148, "loss_lvr": 1.5834522247314453, "loss_mode_switch": 0.0, "loss_total": 0.5360394716262817, "step": 124 }, { "batch_size": 4, "epoch": 0.0496, "step": 124, "tokens_per_device": 3804 }, { "epoch": 0.0496, "loss_ce": 0.4422597587108612, "loss_lvr": 2.2607173919677734, "loss_mode_switch": 0.0, "loss_total": 0.668331503868103, "step": 124 }, { "epoch": 0.05, "grad_norm": 2.0950005054473877, "learning_rate": 9.98951413118856e-06, "loss": 0.4288, "step": 125 }, { "batch_size": 4, "epoch": 0.05, "step": 125, "tokens_per_device": 5448 }, { "epoch": 0.05, "loss_ce": 0.0877755880355835, "loss_lvr": 1.168696641921997, "loss_mode_switch": 0.0, "loss_total": 0.20464524626731873, "step": 125 }, { "batch_size": 4, "epoch": 0.05, "step": 125, "tokens_per_device": 5204 }, { "epoch": 0.05, "loss_ce": 0.2248038798570633, "loss_lvr": 1.1163238286972046, "loss_mode_switch": 0.0, "loss_total": 0.33643627166748047, "step": 125 }, { "batch_size": 4, "epoch": 0.05, "step": 125, "tokens_per_device": 2904 }, { "epoch": 0.05, "loss_ce": 0.48739153146743774, "loss_lvr": 1.1454665660858154, "loss_mode_switch": 0.0, "loss_total": 0.6019381880760193, "step": 125 }, { "batch_size": 1, "epoch": 0.05, "step": 125, "tokens_per_device": 4897 }, { "epoch": 0.05, "loss_ce": 0.457139790058136, "loss_lvr": 0.6741693615913391, "loss_mode_switch": 0.0, "loss_total": 0.5245567560195923, "step": 125 }, { "batch_size": 4, "epoch": 0.05, "step": 125, "tokens_per_device": 5760 }, { "epoch": 0.05, "loss_ce": 0.0455969013273716, "loss_lvr": 2.0826869010925293, "loss_mode_switch": 0.0, "loss_total": 0.2538655996322632, "step": 125 }, { "batch_size": 4, "epoch": 0.05, "step": 125, "tokens_per_device": 4772 }, { "epoch": 0.05, "loss_ce": 0.1810486763715744, "loss_lvr": 1.8867390155792236, "loss_mode_switch": 0.0, "loss_total": 0.3697225749492645, "step": 125 }, { "batch_size": 1, "epoch": 0.05, "step": 125, "tokens_per_device": 5200 }, { "epoch": 0.05, "loss_ce": 0.4736197292804718, "loss_lvr": 1.2201898097991943, "loss_mode_switch": 0.0, "loss_total": 0.5956386923789978, "step": 125 }, { "batch_size": 1, "epoch": 0.05, "step": 125, "tokens_per_device": 4892 }, { "epoch": 0.05, "loss_ce": 0.0518714040517807, "loss_lvr": 0.33469587564468384, "loss_mode_switch": 0.0, "loss_total": 0.08534099161624908, "step": 125 }, { "epoch": 0.0504, "grad_norm": 3.0065903663635254, "learning_rate": 9.989090656205052e-06, "loss": 0.4087, "step": 126 }, { "batch_size": 1, "epoch": 0.0504, "step": 126, "tokens_per_device": 4624 }, { "epoch": 0.0504, "loss_ce": 0.1530170440673828, "loss_lvr": 1.9408413171768188, "loss_mode_switch": 0.0, "loss_total": 0.3471011817455292, "step": 126 }, { "batch_size": 4, "epoch": 0.0504, "step": 126, "tokens_per_device": 1476 }, { "epoch": 0.0504, "loss_ce": 0.19311384856700897, "loss_lvr": 1.5187650918960571, "loss_mode_switch": 0.0, "loss_total": 0.3449903726577759, "step": 126 }, { "batch_size": 4, "epoch": 0.0504, "step": 126, "tokens_per_device": 2608 }, { "epoch": 0.0504, "loss_ce": 0.1983112096786499, "loss_lvr": 1.9282338619232178, "loss_mode_switch": 0.0, "loss_total": 0.3911346197128296, "step": 126 }, { "batch_size": 1, "epoch": 0.0504, "step": 126, "tokens_per_device": 7337 }, { "epoch": 0.0504, "loss_ce": 0.005007992498576641, "loss_lvr": 1.188744306564331, "loss_mode_switch": 0.0, "loss_total": 0.12388242036104202, "step": 126 }, { "batch_size": 1, "epoch": 0.0504, "step": 126, "tokens_per_device": 4825 }, { "epoch": 0.0504, "loss_ce": 0.41169407963752747, "loss_lvr": 0.7926440238952637, "loss_mode_switch": 0.0, "loss_total": 0.49095848202705383, "step": 126 }, { "batch_size": 4, "epoch": 0.0504, "step": 126, "tokens_per_device": 12784 }, { "epoch": 0.0504, "loss_ce": 0.2619410753250122, "loss_lvr": 1.552922248840332, "loss_mode_switch": 0.0, "loss_total": 0.41723328828811646, "step": 126 }, { "batch_size": 4, "epoch": 0.0504, "step": 126, "tokens_per_device": 4432 }, { "epoch": 0.0504, "loss_ce": 0.3486347198486328, "loss_lvr": 1.7665290832519531, "loss_mode_switch": 0.0, "loss_total": 0.5252876281738281, "step": 126 }, { "batch_size": 4, "epoch": 0.0504, "step": 126, "tokens_per_device": 5500 }, { "epoch": 0.0504, "loss_ce": 0.31571847200393677, "loss_lvr": 1.370977520942688, "loss_mode_switch": 0.0, "loss_total": 0.4528162479400635, "step": 126 }, { "epoch": 0.0508, "grad_norm": 2.0256764888763428, "learning_rate": 9.988658807903369e-06, "loss": 0.4155, "step": 127 }, { "batch_size": 1, "epoch": 0.0508, "step": 127, "tokens_per_device": 6890 }, { "epoch": 0.0508, "loss_ce": 0.011418354697525501, "loss_lvr": 1.0764347314834595, "loss_mode_switch": 0.0, "loss_total": 0.11906183511018753, "step": 127 }, { "batch_size": 4, "epoch": 0.0508, "step": 127, "tokens_per_device": 3440 }, { "epoch": 0.0508, "loss_ce": 0.020301464945077896, "loss_lvr": 1.5181020498275757, "loss_mode_switch": 0.0, "loss_total": 0.17211167514324188, "step": 127 }, { "batch_size": 4, "epoch": 0.0508, "step": 127, "tokens_per_device": 6168 }, { "epoch": 0.0508, "loss_ce": 0.057226888835430145, "loss_lvr": 1.0519659519195557, "loss_mode_switch": 0.0, "loss_total": 0.1624234914779663, "step": 127 }, { "batch_size": 4, "epoch": 0.0508, "step": 127, "tokens_per_device": 4756 }, { "epoch": 0.0508, "loss_ce": 0.6219727993011475, "loss_lvr": 1.1741975545883179, "loss_mode_switch": 0.0, "loss_total": 0.7393925786018372, "step": 127 }, { "batch_size": 4, "epoch": 0.0508, "step": 127, "tokens_per_device": 8464 }, { "epoch": 0.0508, "loss_ce": 0.13657107949256897, "loss_lvr": 1.117559552192688, "loss_mode_switch": 0.0, "loss_total": 0.24832704663276672, "step": 127 }, { "batch_size": 1, "epoch": 0.0508, "step": 127, "tokens_per_device": 4890 }, { "epoch": 0.0508, "loss_ce": 0.28236880898475647, "loss_lvr": 1.763505458831787, "loss_mode_switch": 0.0, "loss_total": 0.4587193727493286, "step": 127 }, { "batch_size": 4, "epoch": 0.0508, "step": 127, "tokens_per_device": 1404 }, { "epoch": 0.0508, "loss_ce": 0.3663274049758911, "loss_lvr": 1.816421627998352, "loss_mode_switch": 0.0, "loss_total": 0.5479695796966553, "step": 127 }, { "batch_size": 4, "epoch": 0.0508, "step": 127, "tokens_per_device": 2604 }, { "epoch": 0.0508, "loss_ce": 0.6165924072265625, "loss_lvr": 1.7760108709335327, "loss_mode_switch": 0.0, "loss_total": 0.7941935062408447, "step": 127 }, { "epoch": 0.0512, "grad_norm": 2.625791072845459, "learning_rate": 9.988218587008287e-06, "loss": 0.3652, "step": 128 }, { "batch_size": 4, "epoch": 0.0512, "step": 128, "tokens_per_device": 12804 }, { "epoch": 0.0512, "loss_ce": 0.6983411312103271, "loss_lvr": 1.249895691871643, "loss_mode_switch": 0.0, "loss_total": 0.8233307003974915, "step": 128 }, { "batch_size": 4, "epoch": 0.0512, "step": 128, "tokens_per_device": 4616 }, { "epoch": 0.0512, "loss_ce": 0.876309871673584, "loss_lvr": 1.4142614603042603, "loss_mode_switch": 0.0, "loss_total": 1.0177359580993652, "step": 128 }, { "batch_size": 4, "epoch": 0.0512, "step": 128, "tokens_per_device": 3948 }, { "epoch": 0.0512, "loss_ce": 0.39764404296875, "loss_lvr": 1.0664533376693726, "loss_mode_switch": 0.0, "loss_total": 0.5042893886566162, "step": 128 }, { "batch_size": 1, "epoch": 0.0512, "step": 128, "tokens_per_device": 4874 }, { "epoch": 0.0512, "loss_ce": 0.030977940186858177, "loss_lvr": 1.1269890069961548, "loss_mode_switch": 0.0, "loss_total": 0.14367684721946716, "step": 128 }, { "batch_size": 1, "epoch": 0.0512, "step": 128, "tokens_per_device": 4882 }, { "epoch": 0.0512, "loss_ce": 0.03772452846169472, "loss_lvr": 1.1698434352874756, "loss_mode_switch": 0.0, "loss_total": 0.1547088772058487, "step": 128 }, { "batch_size": 1, "epoch": 0.0512, "step": 128, "tokens_per_device": 5160 }, { "epoch": 0.0512, "loss_ce": 0.018022626638412476, "loss_lvr": 0.4860120415687561, "loss_mode_switch": 0.0, "loss_total": 0.06662383675575256, "step": 128 }, { "batch_size": 4, "epoch": 0.0512, "step": 128, "tokens_per_device": 3040 }, { "epoch": 0.0512, "loss_ce": 0.33798515796661377, "loss_lvr": 1.0190820693969727, "loss_mode_switch": 0.0, "loss_total": 0.43989336490631104, "step": 128 }, { "batch_size": 4, "epoch": 0.0512, "step": 128, "tokens_per_device": 3764 }, { "epoch": 0.0512, "loss_ce": 0.2240121215581894, "loss_lvr": 1.5450361967086792, "loss_mode_switch": 0.0, "loss_total": 0.37851575016975403, "step": 128 }, { "epoch": 0.0516, "grad_norm": 2.443293809890747, "learning_rate": 9.987769994258645e-06, "loss": 0.4377, "step": 129 }, { "batch_size": 4, "epoch": 0.0516, "step": 129, "tokens_per_device": 4232 }, { "epoch": 0.0516, "loss_ce": 0.22795477509498596, "loss_lvr": 1.803833246231079, "loss_mode_switch": 0.0, "loss_total": 0.40833809971809387, "step": 129 }, { "batch_size": 4, "epoch": 0.0516, "step": 129, "tokens_per_device": 2680 }, { "epoch": 0.0516, "loss_ce": 0.2151404619216919, "loss_lvr": 1.5015891790390015, "loss_mode_switch": 0.0, "loss_total": 0.36529940366744995, "step": 129 }, { "batch_size": 1, "epoch": 0.0516, "step": 129, "tokens_per_device": 5191 }, { "epoch": 0.0516, "loss_ce": 0.3417969346046448, "loss_lvr": 0.7974496483802795, "loss_mode_switch": 0.0, "loss_total": 0.42154189944267273, "step": 129 }, { "batch_size": 1, "epoch": 0.0516, "step": 129, "tokens_per_device": 5112 }, { "epoch": 0.0516, "loss_ce": 0.045660004019737244, "loss_lvr": 1.4814478158950806, "loss_mode_switch": 0.0, "loss_total": 0.1938047856092453, "step": 129 }, { "batch_size": 4, "epoch": 0.0516, "step": 129, "tokens_per_device": 4268 }, { "epoch": 0.0516, "loss_ce": 0.11748175323009491, "loss_lvr": 1.2587921619415283, "loss_mode_switch": 0.0, "loss_total": 0.2433609664440155, "step": 129 }, { "batch_size": 4, "epoch": 0.0516, "step": 129, "tokens_per_device": 6052 }, { "epoch": 0.0516, "loss_ce": 0.07914220541715622, "loss_lvr": 1.9493522644042969, "loss_mode_switch": 0.0, "loss_total": 0.274077445268631, "step": 129 }, { "batch_size": 4, "epoch": 0.0516, "step": 129, "tokens_per_device": 6020 }, { "epoch": 0.0516, "loss_ce": 0.03565142676234245, "loss_lvr": 1.541384220123291, "loss_mode_switch": 0.0, "loss_total": 0.18978986144065857, "step": 129 }, { "batch_size": 1, "epoch": 0.0516, "step": 129, "tokens_per_device": 4898 }, { "epoch": 0.0516, "loss_ce": 0.08395282924175262, "loss_lvr": 0.5342795848846436, "loss_mode_switch": 0.0, "loss_total": 0.13738079369068146, "step": 129 }, { "epoch": 0.052, "grad_norm": 2.107081651687622, "learning_rate": 9.987313030407325e-06, "loss": 0.3976, "step": 130 }, { "batch_size": 4, "epoch": 0.052, "step": 130, "tokens_per_device": 2884 }, { "epoch": 0.052, "loss_ce": 0.4098038971424103, "loss_lvr": 1.5717175006866455, "loss_mode_switch": 0.0, "loss_total": 0.5669756531715393, "step": 130 }, { "batch_size": 4, "epoch": 0.052, "step": 130, "tokens_per_device": 6268 }, { "epoch": 0.052, "loss_ce": 0.28324246406555176, "loss_lvr": 1.0085179805755615, "loss_mode_switch": 0.0, "loss_total": 0.3840942680835724, "step": 130 }, { "batch_size": 4, "epoch": 0.052, "step": 130, "tokens_per_device": 4880 }, { "epoch": 0.052, "loss_ce": 0.16509835422039032, "loss_lvr": 1.6079050302505493, "loss_mode_switch": 0.0, "loss_total": 0.32588887214660645, "step": 130 }, { "batch_size": 4, "epoch": 0.052, "step": 130, "tokens_per_device": 1504 }, { "epoch": 0.052, "loss_ce": 0.20509588718414307, "loss_lvr": 2.1121106147766113, "loss_mode_switch": 0.0, "loss_total": 0.4163069725036621, "step": 130 }, { "batch_size": 4, "epoch": 0.052, "step": 130, "tokens_per_device": 6372 }, { "epoch": 0.052, "loss_ce": 0.2598266005516052, "loss_lvr": 1.291492223739624, "loss_mode_switch": 0.0, "loss_total": 0.3889758288860321, "step": 130 }, { "batch_size": 4, "epoch": 0.052, "step": 130, "tokens_per_device": 4100 }, { "epoch": 0.052, "loss_ce": 0.468131959438324, "loss_lvr": 1.2428189516067505, "loss_mode_switch": 0.0, "loss_total": 0.5924138426780701, "step": 130 }, { "batch_size": 4, "epoch": 0.052, "step": 130, "tokens_per_device": 2564 }, { "epoch": 0.052, "loss_ce": 0.5312218070030212, "loss_lvr": 1.2135611772537231, "loss_mode_switch": 0.0, "loss_total": 0.6525779366493225, "step": 130 }, { "batch_size": 4, "epoch": 0.052, "step": 130, "tokens_per_device": 3912 }, { "epoch": 0.052, "loss_ce": 0.18678712844848633, "loss_lvr": 1.4055238962173462, "loss_mode_switch": 0.0, "loss_total": 0.3273395299911499, "step": 130 }, { "epoch": 0.0524, "grad_norm": 2.0686655044555664, "learning_rate": 9.98684769622126e-06, "loss": 0.4418, "step": 131 }, { "batch_size": 4, "epoch": 0.0524, "step": 131, "tokens_per_device": 2728 }, { "epoch": 0.0524, "loss_ce": 0.645869255065918, "loss_lvr": 1.2133867740631104, "loss_mode_switch": 0.0, "loss_total": 0.7672079205513, "step": 131 }, { "batch_size": 4, "epoch": 0.0524, "step": 131, "tokens_per_device": 9388 }, { "epoch": 0.0524, "loss_ce": 0.07451565563678741, "loss_lvr": 1.1098926067352295, "loss_mode_switch": 0.0, "loss_total": 0.18550491333007812, "step": 131 }, { "batch_size": 1, "epoch": 0.0524, "step": 131, "tokens_per_device": 4831 }, { "epoch": 0.0524, "loss_ce": 0.0604926198720932, "loss_lvr": 1.859546184539795, "loss_mode_switch": 0.0, "loss_total": 0.24644723534584045, "step": 131 }, { "batch_size": 4, "epoch": 0.0524, "step": 131, "tokens_per_device": 4348 }, { "epoch": 0.0524, "loss_ce": 0.1578437089920044, "loss_lvr": 1.0644288063049316, "loss_mode_switch": 0.0, "loss_total": 0.2642865777015686, "step": 131 }, { "batch_size": 1, "epoch": 0.0524, "step": 131, "tokens_per_device": 4902 }, { "epoch": 0.0524, "loss_ce": 0.11548945307731628, "loss_lvr": 0.913924515247345, "loss_mode_switch": 0.0, "loss_total": 0.20688191056251526, "step": 131 }, { "batch_size": 4, "epoch": 0.0524, "step": 131, "tokens_per_device": 6892 }, { "epoch": 0.0524, "loss_ce": 0.03243179991841316, "loss_lvr": 1.101315975189209, "loss_mode_switch": 0.0, "loss_total": 0.14256340265274048, "step": 131 }, { "batch_size": 4, "epoch": 0.0524, "step": 131, "tokens_per_device": 14224 }, { "epoch": 0.0524, "loss_ce": 0.17658300697803497, "loss_lvr": 1.429726243019104, "loss_mode_switch": 0.0, "loss_total": 0.3195556402206421, "step": 131 }, { "batch_size": 1, "epoch": 0.0524, "step": 131, "tokens_per_device": 5068 }, { "epoch": 0.0524, "loss_ce": 0.029273325577378273, "loss_lvr": 0.6936618089675903, "loss_mode_switch": 0.0, "loss_total": 0.09863950312137604, "step": 131 }, { "epoch": 0.0528, "grad_norm": 1.8730380535125732, "learning_rate": 9.986373992481434e-06, "loss": 0.3494, "step": 132 }, { "batch_size": 4, "epoch": 0.0528, "step": 132, "tokens_per_device": 4236 }, { "epoch": 0.0528, "loss_ce": 0.08365114778280258, "loss_lvr": 1.2575337886810303, "loss_mode_switch": 0.0, "loss_total": 0.20940452814102173, "step": 132 }, { "batch_size": 4, "epoch": 0.0528, "step": 132, "tokens_per_device": 4400 }, { "epoch": 0.0528, "loss_ce": 0.18886709213256836, "loss_lvr": 1.1006994247436523, "loss_mode_switch": 0.0, "loss_total": 0.29893702268600464, "step": 132 }, { "batch_size": 4, "epoch": 0.0528, "step": 132, "tokens_per_device": 1328 }, { "epoch": 0.0528, "loss_ce": 0.5053303241729736, "loss_lvr": 1.699493169784546, "loss_mode_switch": 0.0, "loss_total": 0.6752796173095703, "step": 132 }, { "batch_size": 4, "epoch": 0.0528, "step": 132, "tokens_per_device": 2760 }, { "epoch": 0.0528, "loss_ce": 0.22603653371334076, "loss_lvr": 1.4166080951690674, "loss_mode_switch": 0.0, "loss_total": 0.3676973581314087, "step": 132 }, { "batch_size": 4, "epoch": 0.0528, "step": 132, "tokens_per_device": 4280 }, { "epoch": 0.0528, "loss_ce": 0.42357441782951355, "loss_lvr": 1.5135327577590942, "loss_mode_switch": 0.0, "loss_total": 0.5749276876449585, "step": 132 }, { "batch_size": 4, "epoch": 0.0528, "step": 132, "tokens_per_device": 5736 }, { "epoch": 0.0528, "loss_ce": 0.3856407403945923, "loss_lvr": 1.591901183128357, "loss_mode_switch": 0.0, "loss_total": 0.544830858707428, "step": 132 }, { "batch_size": 4, "epoch": 0.0528, "step": 132, "tokens_per_device": 4248 }, { "epoch": 0.0528, "loss_ce": 0.22304153442382812, "loss_lvr": 1.5610225200653076, "loss_mode_switch": 0.0, "loss_total": 0.37914377450942993, "step": 132 }, { "batch_size": 4, "epoch": 0.0528, "step": 132, "tokens_per_device": 1428 }, { "epoch": 0.0528, "loss_ce": 0.06467657536268234, "loss_lvr": 1.486539363861084, "loss_mode_switch": 0.0, "loss_total": 0.21333050727844238, "step": 132 }, { "epoch": 0.0532, "grad_norm": 2.4792420864105225, "learning_rate": 9.985891919982878e-06, "loss": 0.4375, "step": 133 }, { "batch_size": 4, "epoch": 0.0532, "step": 133, "tokens_per_device": 1324 }, { "epoch": 0.0532, "loss_ce": 0.6548352241516113, "loss_lvr": 1.9247206449508667, "loss_mode_switch": 0.0, "loss_total": 0.8473073244094849, "step": 133 }, { "batch_size": 4, "epoch": 0.0532, "step": 133, "tokens_per_device": 4512 }, { "epoch": 0.0532, "loss_ce": 0.047661636024713516, "loss_lvr": 1.3810157775878906, "loss_mode_switch": 0.0, "loss_total": 0.18576321005821228, "step": 133 }, { "batch_size": 1, "epoch": 0.0532, "step": 133, "tokens_per_device": 4852 }, { "epoch": 0.0532, "loss_ce": 0.025234289467334747, "loss_lvr": 0.5732625722885132, "loss_mode_switch": 0.0, "loss_total": 0.08256054669618607, "step": 133 }, { "batch_size": 4, "epoch": 0.0532, "step": 133, "tokens_per_device": 5912 }, { "epoch": 0.0532, "loss_ce": 0.09479893743991852, "loss_lvr": 0.9956187605857849, "loss_mode_switch": 0.0, "loss_total": 0.19436082243919373, "step": 133 }, { "batch_size": 4, "epoch": 0.0532, "step": 133, "tokens_per_device": 1252 }, { "epoch": 0.0532, "loss_ce": 0.5564996600151062, "loss_lvr": 1.7941049337387085, "loss_mode_switch": 0.0, "loss_total": 0.735910177230835, "step": 133 }, { "batch_size": 4, "epoch": 0.0532, "step": 133, "tokens_per_device": 4596 }, { "epoch": 0.0532, "loss_ce": 0.2629513740539551, "loss_lvr": 1.6436858177185059, "loss_mode_switch": 0.0, "loss_total": 0.4273199439048767, "step": 133 }, { "batch_size": 4, "epoch": 0.0532, "step": 133, "tokens_per_device": 4060 }, { "epoch": 0.0532, "loss_ce": 0.6124618053436279, "loss_lvr": 1.464316487312317, "loss_mode_switch": 0.0, "loss_total": 0.7588934898376465, "step": 133 }, { "batch_size": 4, "epoch": 0.0532, "step": 133, "tokens_per_device": 11608 }, { "epoch": 0.0532, "loss_ce": 0.15451842546463013, "loss_lvr": 1.442372441291809, "loss_mode_switch": 0.0, "loss_total": 0.2987556755542755, "step": 133 }, { "epoch": 0.0536, "grad_norm": 2.13997220993042, "learning_rate": 9.985401479534664e-06, "loss": 0.414, "step": 134 }, { "batch_size": 4, "epoch": 0.0536, "step": 134, "tokens_per_device": 6300 }, { "epoch": 0.0536, "loss_ce": 0.16500496864318848, "loss_lvr": 1.0452382564544678, "loss_mode_switch": 0.0, "loss_total": 0.2695288062095642, "step": 134 }, { "batch_size": 4, "epoch": 0.0536, "step": 134, "tokens_per_device": 1356 }, { "epoch": 0.0536, "loss_ce": 0.2824869453907013, "loss_lvr": 1.9160155057907104, "loss_mode_switch": 0.0, "loss_total": 0.47408849000930786, "step": 134 }, { "batch_size": 4, "epoch": 0.0536, "step": 134, "tokens_per_device": 2660 }, { "epoch": 0.0536, "loss_ce": 0.13496574759483337, "loss_lvr": 2.3388102054595947, "loss_mode_switch": 0.0, "loss_total": 0.3688467741012573, "step": 134 }, { "batch_size": 4, "epoch": 0.0536, "step": 134, "tokens_per_device": 7636 }, { "epoch": 0.0536, "loss_ce": 0.08882825821638107, "loss_lvr": 1.0218985080718994, "loss_mode_switch": 0.0, "loss_total": 0.19101810455322266, "step": 134 }, { "batch_size": 4, "epoch": 0.0536, "step": 134, "tokens_per_device": 3788 }, { "epoch": 0.0536, "loss_ce": 0.36789968609809875, "loss_lvr": 1.684459924697876, "loss_mode_switch": 0.0, "loss_total": 0.5363456606864929, "step": 134 }, { "batch_size": 4, "epoch": 0.0536, "step": 134, "tokens_per_device": 4932 }, { "epoch": 0.0536, "loss_ce": 0.2641284763813019, "loss_lvr": 1.2433475255966187, "loss_mode_switch": 0.0, "loss_total": 0.38846322894096375, "step": 134 }, { "batch_size": 4, "epoch": 0.0536, "step": 134, "tokens_per_device": 5064 }, { "epoch": 0.0536, "loss_ce": 0.3568717837333679, "loss_lvr": 1.416849136352539, "loss_mode_switch": 0.0, "loss_total": 0.4985567033290863, "step": 134 }, { "batch_size": 4, "epoch": 0.0536, "step": 134, "tokens_per_device": 8804 }, { "epoch": 0.0536, "loss_ce": 0.08520054072141647, "loss_lvr": 1.2552520036697388, "loss_mode_switch": 0.0, "loss_total": 0.21072575449943542, "step": 134 }, { "epoch": 0.054, "grad_norm": 2.098233699798584, "learning_rate": 9.984902671959911e-06, "loss": 0.4378, "step": 135 }, { "batch_size": 4, "epoch": 0.054, "step": 135, "tokens_per_device": 4132 }, { "epoch": 0.054, "loss_ce": 0.5210406184196472, "loss_lvr": 1.6057960987091064, "loss_mode_switch": 0.0, "loss_total": 0.6816202402114868, "step": 135 }, { "batch_size": 4, "epoch": 0.054, "step": 135, "tokens_per_device": 5320 }, { "epoch": 0.054, "loss_ce": 0.49484848976135254, "loss_lvr": 1.4941068887710571, "loss_mode_switch": 0.0, "loss_total": 0.6442592144012451, "step": 135 }, { "batch_size": 4, "epoch": 0.054, "step": 135, "tokens_per_device": 1444 }, { "epoch": 0.054, "loss_ce": 0.5762026906013489, "loss_lvr": 1.4908579587936401, "loss_mode_switch": 0.0, "loss_total": 0.7252885103225708, "step": 135 }, { "batch_size": 1, "epoch": 0.054, "step": 135, "tokens_per_device": 4990 }, { "epoch": 0.054, "loss_ce": 0.4378622770309448, "loss_lvr": 0.6888064742088318, "loss_mode_switch": 0.0, "loss_total": 0.5067429542541504, "step": 135 }, { "batch_size": 1, "epoch": 0.054, "step": 135, "tokens_per_device": 5125 }, { "epoch": 0.054, "loss_ce": 0.01667960360646248, "loss_lvr": 0.9342235326766968, "loss_mode_switch": 0.0, "loss_total": 0.11010195314884186, "step": 135 }, { "batch_size": 1, "epoch": 0.054, "step": 135, "tokens_per_device": 4918 }, { "epoch": 0.054, "loss_ce": 0.25507357716560364, "loss_lvr": 0.9879732728004456, "loss_mode_switch": 0.0, "loss_total": 0.3538708984851837, "step": 135 }, { "batch_size": 1, "epoch": 0.054, "step": 135, "tokens_per_device": 5036 }, { "epoch": 0.054, "loss_ce": 0.12311598658561707, "loss_lvr": 1.6218611001968384, "loss_mode_switch": 0.0, "loss_total": 0.2853021025657654, "step": 135 }, { "batch_size": 1, "epoch": 0.054, "step": 135, "tokens_per_device": 4884 }, { "epoch": 0.054, "loss_ce": 0.236099973320961, "loss_lvr": 2.1700096130371094, "loss_mode_switch": 0.0, "loss_total": 0.45310091972351074, "step": 135 }, { "epoch": 0.0544, "grad_norm": 2.542367696762085, "learning_rate": 9.98439549809578e-06, "loss": 0.4015, "step": 136 }, { "batch_size": 4, "epoch": 0.0544, "step": 136, "tokens_per_device": 6004 }, { "epoch": 0.0544, "loss_ce": 0.28029680252075195, "loss_lvr": 1.4812318086624146, "loss_mode_switch": 0.0, "loss_total": 0.4284200072288513, "step": 136 }, { "batch_size": 1, "epoch": 0.0544, "step": 136, "tokens_per_device": 4874 }, { "epoch": 0.0544, "loss_ce": 0.003458732971921563, "loss_lvr": 0.8739123940467834, "loss_mode_switch": 0.0, "loss_total": 0.09084997326135635, "step": 136 }, { "batch_size": 4, "epoch": 0.0544, "step": 136, "tokens_per_device": 3768 }, { "epoch": 0.0544, "loss_ce": 0.2532425820827484, "loss_lvr": 1.9064034223556519, "loss_mode_switch": 0.0, "loss_total": 0.44388294219970703, "step": 136 }, { "batch_size": 4, "epoch": 0.0544, "step": 136, "tokens_per_device": 1768 }, { "epoch": 0.0544, "loss_ce": 0.7245515584945679, "loss_lvr": 1.5812870264053345, "loss_mode_switch": 0.0, "loss_total": 0.8826802968978882, "step": 136 }, { "batch_size": 4, "epoch": 0.0544, "step": 136, "tokens_per_device": 5164 }, { "epoch": 0.0544, "loss_ce": 0.6173005700111389, "loss_lvr": 1.1832916736602783, "loss_mode_switch": 0.0, "loss_total": 0.7356297373771667, "step": 136 }, { "batch_size": 4, "epoch": 0.0544, "step": 136, "tokens_per_device": 5960 }, { "epoch": 0.0544, "loss_ce": 0.023754414170980453, "loss_lvr": 1.1649242639541626, "loss_mode_switch": 0.0, "loss_total": 0.14024683833122253, "step": 136 }, { "batch_size": 4, "epoch": 0.0544, "step": 136, "tokens_per_device": 11172 }, { "epoch": 0.0544, "loss_ce": 0.4004463255405426, "loss_lvr": 1.0810099840164185, "loss_mode_switch": 0.0, "loss_total": 0.508547306060791, "step": 136 }, { "batch_size": 4, "epoch": 0.0544, "step": 136, "tokens_per_device": 3828 }, { "epoch": 0.0544, "loss_ce": 0.34750860929489136, "loss_lvr": 1.410730242729187, "loss_mode_switch": 0.0, "loss_total": 0.48858165740966797, "step": 136 }, { "epoch": 0.0548, "grad_norm": 2.448685884475708, "learning_rate": 9.983879958793476e-06, "loss": 0.4931, "step": 137 }, { "batch_size": 4, "epoch": 0.0548, "step": 137, "tokens_per_device": 4284 }, { "epoch": 0.0548, "loss_ce": 0.1881648153066635, "loss_lvr": 1.5623499155044556, "loss_mode_switch": 0.0, "loss_total": 0.3443998098373413, "step": 137 }, { "batch_size": 1, "epoch": 0.0548, "step": 137, "tokens_per_device": 5135 }, { "epoch": 0.0548, "loss_ce": 0.123477041721344, "loss_lvr": 1.6313402652740479, "loss_mode_switch": 0.0, "loss_total": 0.28661108016967773, "step": 137 }, { "batch_size": 1, "epoch": 0.0548, "step": 137, "tokens_per_device": 6205 }, { "epoch": 0.0548, "loss_ce": 0.046199239790439606, "loss_lvr": 0.5753955841064453, "loss_mode_switch": 0.0, "loss_total": 0.10373879969120026, "step": 137 }, { "batch_size": 1, "epoch": 0.0548, "step": 137, "tokens_per_device": 4887 }, { "epoch": 0.0548, "loss_ce": 0.013702773489058018, "loss_lvr": 0.8474346399307251, "loss_mode_switch": 0.0, "loss_total": 0.09844623506069183, "step": 137 }, { "batch_size": 4, "epoch": 0.0548, "step": 137, "tokens_per_device": 4200 }, { "epoch": 0.0548, "loss_ce": 0.07308675348758698, "loss_lvr": 1.316239595413208, "loss_mode_switch": 0.0, "loss_total": 0.2047107219696045, "step": 137 }, { "batch_size": 4, "epoch": 0.0548, "step": 137, "tokens_per_device": 1332 }, { "epoch": 0.0548, "loss_ce": 0.49250927567481995, "loss_lvr": 2.936150550842285, "loss_mode_switch": 0.0, "loss_total": 0.7861243486404419, "step": 137 }, { "batch_size": 1, "epoch": 0.0548, "step": 137, "tokens_per_device": 4883 }, { "epoch": 0.0548, "loss_ce": 0.01204822026193142, "loss_lvr": 0.42218512296676636, "loss_mode_switch": 0.0, "loss_total": 0.054266735911369324, "step": 137 }, { "batch_size": 4, "epoch": 0.0548, "step": 137, "tokens_per_device": 4432 }, { "epoch": 0.0548, "loss_ce": 0.6726014018058777, "loss_lvr": 1.7268699407577515, "loss_mode_switch": 0.0, "loss_total": 0.8452883958816528, "step": 137 }, { "epoch": 0.0552, "grad_norm": 1.9148975610733032, "learning_rate": 9.983356054918238e-06, "loss": 0.4051, "step": 138 }, { "batch_size": 4, "epoch": 0.0552, "step": 138, "tokens_per_device": 4744 }, { "epoch": 0.0552, "loss_ce": 0.12724405527114868, "loss_lvr": 1.1016782522201538, "loss_mode_switch": 0.0, "loss_total": 0.23741188645362854, "step": 138 }, { "batch_size": 4, "epoch": 0.0552, "step": 138, "tokens_per_device": 6080 }, { "epoch": 0.0552, "loss_ce": 0.5804117918014526, "loss_lvr": 1.645028829574585, "loss_mode_switch": 0.0, "loss_total": 0.7449146509170532, "step": 138 }, { "batch_size": 1, "epoch": 0.0552, "step": 138, "tokens_per_device": 5074 }, { "epoch": 0.0552, "loss_ce": 0.009216181933879852, "loss_lvr": 1.6276837587356567, "loss_mode_switch": 0.0, "loss_total": 0.17198455333709717, "step": 138 }, { "batch_size": 1, "epoch": 0.0552, "step": 138, "tokens_per_device": 5131 }, { "epoch": 0.0552, "loss_ce": 0.10748383402824402, "loss_lvr": 0.5812793374061584, "loss_mode_switch": 0.0, "loss_total": 0.16561177372932434, "step": 138 }, { "batch_size": 1, "epoch": 0.0552, "step": 138, "tokens_per_device": 7743 }, { "epoch": 0.0552, "loss_ce": 0.05658644437789917, "loss_lvr": 1.0546119213104248, "loss_mode_switch": 0.0, "loss_total": 0.1620476394891739, "step": 138 }, { "batch_size": 4, "epoch": 0.0552, "step": 138, "tokens_per_device": 4220 }, { "epoch": 0.0552, "loss_ce": 0.2699403464794159, "loss_lvr": 1.256496548652649, "loss_mode_switch": 0.0, "loss_total": 0.39559000730514526, "step": 138 }, { "batch_size": 4, "epoch": 0.0552, "step": 138, "tokens_per_device": 1896 }, { "epoch": 0.0552, "loss_ce": 0.4559989273548126, "loss_lvr": 1.3541043996810913, "loss_mode_switch": 0.0, "loss_total": 0.5914093852043152, "step": 138 }, { "batch_size": 4, "epoch": 0.0552, "step": 138, "tokens_per_device": 6320 }, { "epoch": 0.0552, "loss_ce": 0.3039255738258362, "loss_lvr": 0.9341000914573669, "loss_mode_switch": 0.0, "loss_total": 0.39733558893203735, "step": 138 }, { "epoch": 0.0556, "grad_norm": 2.0581424236297607, "learning_rate": 9.982823787349352e-06, "loss": 0.4052, "step": 139 }, { "batch_size": 1, "epoch": 0.0556, "step": 139, "tokens_per_device": 4909 }, { "epoch": 0.0556, "loss_ce": 0.08540231734514236, "loss_lvr": 0.5350373387336731, "loss_mode_switch": 0.0, "loss_total": 0.13890604674816132, "step": 139 }, { "batch_size": 4, "epoch": 0.0556, "step": 139, "tokens_per_device": 9596 }, { "epoch": 0.0556, "loss_ce": 0.4914422035217285, "loss_lvr": 1.3705025911331177, "loss_mode_switch": 0.0, "loss_total": 0.6284924745559692, "step": 139 }, { "batch_size": 1, "epoch": 0.0556, "step": 139, "tokens_per_device": 5158 }, { "epoch": 0.0556, "loss_ce": 0.015076635405421257, "loss_lvr": 0.9571419358253479, "loss_mode_switch": 0.0, "loss_total": 0.11079083383083344, "step": 139 }, { "batch_size": 4, "epoch": 0.0556, "step": 139, "tokens_per_device": 4956 }, { "epoch": 0.0556, "loss_ce": 0.08609683811664581, "loss_lvr": 1.1285173892974854, "loss_mode_switch": 0.0, "loss_total": 0.19894857704639435, "step": 139 }, { "batch_size": 4, "epoch": 0.0556, "step": 139, "tokens_per_device": 2744 }, { "epoch": 0.0556, "loss_ce": 0.5713508725166321, "loss_lvr": 1.0489003658294678, "loss_mode_switch": 0.0, "loss_total": 0.6762409210205078, "step": 139 }, { "batch_size": 4, "epoch": 0.0556, "step": 139, "tokens_per_device": 2560 }, { "epoch": 0.0556, "loss_ce": 1.0266246795654297, "loss_lvr": 1.6789453029632568, "loss_mode_switch": 0.0, "loss_total": 1.1945191621780396, "step": 139 }, { "batch_size": 4, "epoch": 0.0556, "step": 139, "tokens_per_device": 3912 }, { "epoch": 0.0556, "loss_ce": 0.15119153261184692, "loss_lvr": 1.12912118434906, "loss_mode_switch": 0.0, "loss_total": 0.26410365104675293, "step": 139 }, { "batch_size": 4, "epoch": 0.0556, "step": 139, "tokens_per_device": 4452 }, { "epoch": 0.0556, "loss_ce": 0.044592853635549545, "loss_lvr": 0.952584445476532, "loss_mode_switch": 0.0, "loss_total": 0.13985130190849304, "step": 139 }, { "epoch": 0.056, "grad_norm": 2.029520034790039, "learning_rate": 9.982283156980133e-06, "loss": 0.3878, "step": 140 }, { "batch_size": 4, "epoch": 0.056, "step": 140, "tokens_per_device": 7132 }, { "epoch": 0.056, "loss_ce": 0.14873559772968292, "loss_lvr": 1.1103671789169312, "loss_mode_switch": 0.0, "loss_total": 0.25977230072021484, "step": 140 }, { "batch_size": 1, "epoch": 0.056, "step": 140, "tokens_per_device": 5175 }, { "epoch": 0.056, "loss_ce": 0.17034652829170227, "loss_lvr": 0.6912046670913696, "loss_mode_switch": 0.0, "loss_total": 0.23946699500083923, "step": 140 }, { "batch_size": 4, "epoch": 0.056, "step": 140, "tokens_per_device": 2376 }, { "epoch": 0.056, "loss_ce": 0.31061336398124695, "loss_lvr": 1.3815699815750122, "loss_mode_switch": 0.0, "loss_total": 0.44877034425735474, "step": 140 }, { "batch_size": 4, "epoch": 0.056, "step": 140, "tokens_per_device": 1292 }, { "epoch": 0.056, "loss_ce": 0.7959015369415283, "loss_lvr": 1.7937514781951904, "loss_mode_switch": 0.0, "loss_total": 0.9752767086029053, "step": 140 }, { "batch_size": 4, "epoch": 0.056, "step": 140, "tokens_per_device": 3780 }, { "epoch": 0.056, "loss_ce": 0.4308205544948578, "loss_lvr": 1.6248195171356201, "loss_mode_switch": 0.0, "loss_total": 0.5933024883270264, "step": 140 }, { "batch_size": 4, "epoch": 0.056, "step": 140, "tokens_per_device": 4232 }, { "epoch": 0.056, "loss_ce": 0.4623130261898041, "loss_lvr": 1.4704784154891968, "loss_mode_switch": 0.0, "loss_total": 0.6093608736991882, "step": 140 }, { "batch_size": 4, "epoch": 0.056, "step": 140, "tokens_per_device": 3804 }, { "epoch": 0.056, "loss_ce": 0.23995935916900635, "loss_lvr": 1.134380578994751, "loss_mode_switch": 0.0, "loss_total": 0.3533974289894104, "step": 140 }, { "batch_size": 4, "epoch": 0.056, "step": 140, "tokens_per_device": 1340 }, { "epoch": 0.056, "loss_ce": 0.902995228767395, "loss_lvr": 1.7414476871490479, "loss_mode_switch": 0.0, "loss_total": 1.077139973640442, "step": 140 }, { "epoch": 0.0564, "grad_norm": 1.747153401374817, "learning_rate": 9.981734164717936e-06, "loss": 0.3673, "step": 141 }, { "batch_size": 4, "epoch": 0.0564, "step": 141, "tokens_per_device": 5868 }, { "epoch": 0.0564, "loss_ce": 0.43732601404190063, "loss_lvr": 1.3488738536834717, "loss_mode_switch": 0.0, "loss_total": 0.5722134113311768, "step": 141 }, { "batch_size": 1, "epoch": 0.0564, "step": 141, "tokens_per_device": 5244 }, { "epoch": 0.0564, "loss_ce": 0.05033837631344795, "loss_lvr": 1.062566876411438, "loss_mode_switch": 0.0, "loss_total": 0.15659506618976593, "step": 141 }, { "batch_size": 4, "epoch": 0.0564, "step": 141, "tokens_per_device": 1596 }, { "epoch": 0.0564, "loss_ce": 0.46150434017181396, "loss_lvr": 1.3228938579559326, "loss_mode_switch": 0.0, "loss_total": 0.5937937498092651, "step": 141 }, { "batch_size": 1, "epoch": 0.0564, "step": 141, "tokens_per_device": 4900 }, { "epoch": 0.0564, "loss_ce": 0.46433672308921814, "loss_lvr": 1.3346692323684692, "loss_mode_switch": 0.0, "loss_total": 0.5978036522865295, "step": 141 }, { "batch_size": 1, "epoch": 0.0564, "step": 141, "tokens_per_device": 5180 }, { "epoch": 0.0564, "loss_ce": 0.017621036618947983, "loss_lvr": 0.6214351654052734, "loss_mode_switch": 0.0, "loss_total": 0.07976455241441727, "step": 141 }, { "batch_size": 1, "epoch": 0.0564, "step": 141, "tokens_per_device": 4923 }, { "epoch": 0.0564, "loss_ce": 0.28775477409362793, "loss_lvr": 0.7216150164604187, "loss_mode_switch": 0.0, "loss_total": 0.3599162697792053, "step": 141 }, { "batch_size": 4, "epoch": 0.0564, "step": 141, "tokens_per_device": 5080 }, { "epoch": 0.0564, "loss_ce": 0.46252182126045227, "loss_lvr": 1.094547152519226, "loss_mode_switch": 0.0, "loss_total": 0.5719765424728394, "step": 141 }, { "batch_size": 4, "epoch": 0.0564, "step": 141, "tokens_per_device": 4404 }, { "epoch": 0.0564, "loss_ce": 0.37019795179367065, "loss_lvr": 1.6559852361679077, "loss_mode_switch": 0.0, "loss_total": 0.5357964634895325, "step": 141 }, { "epoch": 0.0568, "grad_norm": 2.0812602043151855, "learning_rate": 9.981176811484148e-06, "loss": 0.3953, "step": 142 }, { "batch_size": 4, "epoch": 0.0568, "step": 142, "tokens_per_device": 3760 }, { "epoch": 0.0568, "loss_ce": 0.21927398443222046, "loss_lvr": 1.3506343364715576, "loss_mode_switch": 0.0, "loss_total": 0.3543374240398407, "step": 142 }, { "batch_size": 1, "epoch": 0.0568, "step": 142, "tokens_per_device": 4897 }, { "epoch": 0.0568, "loss_ce": 0.0220043882727623, "loss_lvr": 0.8043311834335327, "loss_mode_switch": 0.0, "loss_total": 0.10243751108646393, "step": 142 }, { "batch_size": 4, "epoch": 0.0568, "step": 142, "tokens_per_device": 1544 }, { "epoch": 0.0568, "loss_ce": 0.496987909078598, "loss_lvr": 2.0453076362609863, "loss_mode_switch": 0.0, "loss_total": 0.7015186548233032, "step": 142 }, { "batch_size": 4, "epoch": 0.0568, "step": 142, "tokens_per_device": 10800 }, { "epoch": 0.0568, "loss_ce": 0.1172817125916481, "loss_lvr": 1.1547883749008179, "loss_mode_switch": 0.0, "loss_total": 0.23276054859161377, "step": 142 }, { "batch_size": 4, "epoch": 0.0568, "step": 142, "tokens_per_device": 1336 }, { "epoch": 0.0568, "loss_ce": 0.32081228494644165, "loss_lvr": 1.5762852430343628, "loss_mode_switch": 0.0, "loss_total": 0.4784408211708069, "step": 142 }, { "batch_size": 4, "epoch": 0.0568, "step": 142, "tokens_per_device": 4700 }, { "epoch": 0.0568, "loss_ce": 0.1970815658569336, "loss_lvr": 1.2462801933288574, "loss_mode_switch": 0.0, "loss_total": 0.3217095732688904, "step": 142 }, { "batch_size": 4, "epoch": 0.0568, "step": 142, "tokens_per_device": 3408 }, { "epoch": 0.0568, "loss_ce": 0.4980103373527527, "loss_lvr": 1.3598796129226685, "loss_mode_switch": 0.0, "loss_total": 0.6339982748031616, "step": 142 }, { "batch_size": 4, "epoch": 0.0568, "step": 142, "tokens_per_device": 5788 }, { "epoch": 0.0568, "loss_ce": 0.09005673974752426, "loss_lvr": 1.393244743347168, "loss_mode_switch": 0.0, "loss_total": 0.22938120365142822, "step": 142 }, { "epoch": 0.0572, "grad_norm": 1.9018319845199585, "learning_rate": 9.98061109821419e-06, "loss": 0.4163, "step": 143 }, { "batch_size": 4, "epoch": 0.0572, "step": 143, "tokens_per_device": 1564 }, { "epoch": 0.0572, "loss_ce": 0.6808732151985168, "loss_lvr": 1.5471423864364624, "loss_mode_switch": 0.0, "loss_total": 0.8355874419212341, "step": 143 }, { "batch_size": 4, "epoch": 0.0572, "step": 143, "tokens_per_device": 3820 }, { "epoch": 0.0572, "loss_ce": 0.3243377208709717, "loss_lvr": 1.481635570526123, "loss_mode_switch": 0.0, "loss_total": 0.472501277923584, "step": 143 }, { "batch_size": 4, "epoch": 0.0572, "step": 143, "tokens_per_device": 4740 }, { "epoch": 0.0572, "loss_ce": 0.007937176153063774, "loss_lvr": 1.0819178819656372, "loss_mode_switch": 0.0, "loss_total": 0.11612896621227264, "step": 143 }, { "batch_size": 4, "epoch": 0.0572, "step": 143, "tokens_per_device": 5548 }, { "epoch": 0.0572, "loss_ce": 0.39236342906951904, "loss_lvr": 1.5329912900924683, "loss_mode_switch": 0.0, "loss_total": 0.5456625819206238, "step": 143 }, { "batch_size": 4, "epoch": 0.0572, "step": 143, "tokens_per_device": 6160 }, { "epoch": 0.0572, "loss_ce": 0.531390905380249, "loss_lvr": 1.272118330001831, "loss_mode_switch": 0.0, "loss_total": 0.6586027145385742, "step": 143 }, { "batch_size": 1, "epoch": 0.0572, "step": 143, "tokens_per_device": 4890 }, { "epoch": 0.0572, "loss_ce": 0.039028290659189224, "loss_lvr": 0.44363802671432495, "loss_mode_switch": 0.0, "loss_total": 0.08339209854602814, "step": 143 }, { "batch_size": 4, "epoch": 0.0572, "step": 143, "tokens_per_device": 2564 }, { "epoch": 0.0572, "loss_ce": 0.4518345892429352, "loss_lvr": 1.5606106519699097, "loss_mode_switch": 0.0, "loss_total": 0.6078956723213196, "step": 143 }, { "batch_size": 1, "epoch": 0.0572, "step": 143, "tokens_per_device": 4862 }, { "epoch": 0.0572, "loss_ce": 0.02661529928445816, "loss_lvr": 0.626000165939331, "loss_mode_switch": 0.0, "loss_total": 0.08921531587839127, "step": 143 }, { "epoch": 0.0576, "grad_norm": 2.0749928951263428, "learning_rate": 9.980037025857511e-06, "loss": 0.3895, "step": 144 }, { "batch_size": 4, "epoch": 0.0576, "step": 144, "tokens_per_device": 4420 }, { "epoch": 0.0576, "loss_ce": 0.2025032937526703, "loss_lvr": 1.245261788368225, "loss_mode_switch": 0.0, "loss_total": 0.3270294666290283, "step": 144 }, { "batch_size": 1, "epoch": 0.0576, "step": 144, "tokens_per_device": 4901 }, { "epoch": 0.0576, "loss_ce": 0.007970974780619144, "loss_lvr": 1.2753024101257324, "loss_mode_switch": 0.0, "loss_total": 0.1355012208223343, "step": 144 }, { "batch_size": 1, "epoch": 0.0576, "step": 144, "tokens_per_device": 5095 }, { "epoch": 0.0576, "loss_ce": 0.15033058822155, "loss_lvr": 1.8389568328857422, "loss_mode_switch": 0.0, "loss_total": 0.3342262804508209, "step": 144 }, { "batch_size": 4, "epoch": 0.0576, "step": 144, "tokens_per_device": 11284 }, { "epoch": 0.0576, "loss_ce": 0.2790912985801697, "loss_lvr": 1.4303977489471436, "loss_mode_switch": 0.0, "loss_total": 0.4221310615539551, "step": 144 }, { "batch_size": 4, "epoch": 0.0576, "step": 144, "tokens_per_device": 4404 }, { "epoch": 0.0576, "loss_ce": 0.19223196804523468, "loss_lvr": 1.5741928815841675, "loss_mode_switch": 0.0, "loss_total": 0.3496512770652771, "step": 144 }, { "batch_size": 4, "epoch": 0.0576, "step": 144, "tokens_per_device": 4256 }, { "epoch": 0.0576, "loss_ce": 0.5735092759132385, "loss_lvr": 1.523951530456543, "loss_mode_switch": 0.0, "loss_total": 0.7259044647216797, "step": 144 }, { "batch_size": 4, "epoch": 0.0576, "step": 144, "tokens_per_device": 3796 }, { "epoch": 0.0576, "loss_ce": 0.049279242753982544, "loss_lvr": 1.6484051942825317, "loss_mode_switch": 0.0, "loss_total": 0.21411976218223572, "step": 144 }, { "batch_size": 1, "epoch": 0.0576, "step": 144, "tokens_per_device": 5161 }, { "epoch": 0.0576, "loss_ce": 0.00373110082000494, "loss_lvr": 2.2725908756256104, "loss_mode_switch": 0.0, "loss_total": 0.2309901863336563, "step": 144 }, { "epoch": 0.058, "grad_norm": 2.343165636062622, "learning_rate": 9.979454595377594e-06, "loss": 0.437, "step": 145 }, { "batch_size": 1, "epoch": 0.058, "step": 145, "tokens_per_device": 5022 }, { "epoch": 0.058, "loss_ce": 0.23467428982257843, "loss_lvr": 2.886674165725708, "loss_mode_switch": 0.0, "loss_total": 0.523341715335846, "step": 145 }, { "batch_size": 4, "epoch": 0.058, "step": 145, "tokens_per_device": 6676 }, { "epoch": 0.058, "loss_ce": 0.3011380136013031, "loss_lvr": 0.978300929069519, "loss_mode_switch": 0.0, "loss_total": 0.3989681005477905, "step": 145 }, { "batch_size": 4, "epoch": 0.058, "step": 145, "tokens_per_device": 7048 }, { "epoch": 0.058, "loss_ce": 0.3428351879119873, "loss_lvr": 1.0101611614227295, "loss_mode_switch": 0.0, "loss_total": 0.4438512921333313, "step": 145 }, { "batch_size": 4, "epoch": 0.058, "step": 145, "tokens_per_device": 3484 }, { "epoch": 0.058, "loss_ce": 0.37439942359924316, "loss_lvr": 1.3123928308486938, "loss_mode_switch": 0.0, "loss_total": 0.5056387186050415, "step": 145 }, { "batch_size": 1, "epoch": 0.058, "step": 145, "tokens_per_device": 5027 }, { "epoch": 0.058, "loss_ce": 0.28332874178886414, "loss_lvr": 1.010327935218811, "loss_mode_switch": 0.0, "loss_total": 0.38436153531074524, "step": 145 }, { "batch_size": 4, "epoch": 0.058, "step": 145, "tokens_per_device": 4240 }, { "epoch": 0.058, "loss_ce": 0.13602682948112488, "loss_lvr": 1.4463448524475098, "loss_mode_switch": 0.0, "loss_total": 0.28066131472587585, "step": 145 }, { "batch_size": 4, "epoch": 0.058, "step": 145, "tokens_per_device": 5068 }, { "epoch": 0.058, "loss_ce": 0.06207644194364548, "loss_lvr": 1.1445417404174805, "loss_mode_switch": 0.0, "loss_total": 0.1765306144952774, "step": 145 }, { "batch_size": 1, "epoch": 0.058, "step": 145, "tokens_per_device": 5192 }, { "epoch": 0.058, "loss_ce": 0.028187736868858337, "loss_lvr": 0.8663797378540039, "loss_mode_switch": 0.0, "loss_total": 0.11482571065425873, "step": 145 }, { "epoch": 0.0584, "grad_norm": 2.1756322383880615, "learning_rate": 9.978863807751944e-06, "loss": 0.4198, "step": 146 }, { "batch_size": 4, "epoch": 0.0584, "step": 146, "tokens_per_device": 5876 }, { "epoch": 0.0584, "loss_ce": 0.7095658779144287, "loss_lvr": 1.1563743352890015, "loss_mode_switch": 0.0, "loss_total": 0.8252032995223999, "step": 146 }, { "batch_size": 1, "epoch": 0.0584, "step": 146, "tokens_per_device": 6064 }, { "epoch": 0.0584, "loss_ce": 0.04345323517918587, "loss_lvr": 0.7683835625648499, "loss_mode_switch": 0.0, "loss_total": 0.12029159069061279, "step": 146 }, { "batch_size": 1, "epoch": 0.0584, "step": 146, "tokens_per_device": 5151 }, { "epoch": 0.0584, "loss_ce": 0.2765965163707733, "loss_lvr": 0.7679578065872192, "loss_mode_switch": 0.0, "loss_total": 0.3533923029899597, "step": 146 }, { "batch_size": 1, "epoch": 0.0584, "step": 146, "tokens_per_device": 4884 }, { "epoch": 0.0584, "loss_ce": 0.21785999834537506, "loss_lvr": 1.2890599966049194, "loss_mode_switch": 0.0, "loss_total": 0.34676599502563477, "step": 146 }, { "batch_size": 4, "epoch": 0.0584, "step": 146, "tokens_per_device": 5128 }, { "epoch": 0.0584, "loss_ce": 0.5271969437599182, "loss_lvr": 1.4065302610397339, "loss_mode_switch": 0.0, "loss_total": 0.6678499579429626, "step": 146 }, { "batch_size": 4, "epoch": 0.0584, "step": 146, "tokens_per_device": 1616 }, { "epoch": 0.0584, "loss_ce": 0.7914814949035645, "loss_lvr": 1.1928118467330933, "loss_mode_switch": 0.0, "loss_total": 0.9107626676559448, "step": 146 }, { "batch_size": 4, "epoch": 0.0584, "step": 146, "tokens_per_device": 4540 }, { "epoch": 0.0584, "loss_ce": 0.41971057653427124, "loss_lvr": 1.448544979095459, "loss_mode_switch": 0.0, "loss_total": 0.5645650625228882, "step": 146 }, { "batch_size": 4, "epoch": 0.0584, "step": 146, "tokens_per_device": 3996 }, { "epoch": 0.0584, "loss_ce": 0.41570624709129333, "loss_lvr": 1.1064541339874268, "loss_mode_switch": 0.0, "loss_total": 0.5263516902923584, "step": 146 }, { "epoch": 0.0588, "grad_norm": 2.0167622566223145, "learning_rate": 9.978264663972099e-06, "loss": 0.3956, "step": 147 }, { "batch_size": 4, "epoch": 0.0588, "step": 147, "tokens_per_device": 4300 }, { "epoch": 0.0588, "loss_ce": 0.6397393345832825, "loss_lvr": 1.2043476104736328, "loss_mode_switch": 0.0, "loss_total": 0.7601740956306458, "step": 147 }, { "batch_size": 1, "epoch": 0.0588, "step": 147, "tokens_per_device": 5179 }, { "epoch": 0.0588, "loss_ce": 0.07454031705856323, "loss_lvr": 1.092811942100525, "loss_mode_switch": 0.0, "loss_total": 0.18382151424884796, "step": 147 }, { "batch_size": 4, "epoch": 0.0588, "step": 147, "tokens_per_device": 3752 }, { "epoch": 0.0588, "loss_ce": 0.11651186645030975, "loss_lvr": 1.2189571857452393, "loss_mode_switch": 0.0, "loss_total": 0.23840758204460144, "step": 147 }, { "batch_size": 4, "epoch": 0.0588, "step": 147, "tokens_per_device": 4232 }, { "epoch": 0.0588, "loss_ce": 0.1189379096031189, "loss_lvr": 1.3696016073226929, "loss_mode_switch": 0.0, "loss_total": 0.25589805841445923, "step": 147 }, { "batch_size": 1, "epoch": 0.0588, "step": 147, "tokens_per_device": 5205 }, { "epoch": 0.0588, "loss_ce": 0.01618059165775776, "loss_lvr": 1.6535710096359253, "loss_mode_switch": 0.0, "loss_total": 0.1815376877784729, "step": 147 }, { "batch_size": 4, "epoch": 0.0588, "step": 147, "tokens_per_device": 2724 }, { "epoch": 0.0588, "loss_ce": 0.8423011898994446, "loss_lvr": 1.1641327142715454, "loss_mode_switch": 0.0, "loss_total": 0.958714485168457, "step": 147 }, { "batch_size": 1, "epoch": 0.0588, "step": 147, "tokens_per_device": 4915 }, { "epoch": 0.0588, "loss_ce": 0.11674454063177109, "loss_lvr": 1.5120807886123657, "loss_mode_switch": 0.0, "loss_total": 0.2679526209831238, "step": 147 }, { "batch_size": 4, "epoch": 0.0588, "step": 147, "tokens_per_device": 3768 }, { "epoch": 0.0588, "loss_ce": 0.38946864008903503, "loss_lvr": 1.6719286441802979, "loss_mode_switch": 0.0, "loss_total": 0.5566614866256714, "step": 147 }, { "epoch": 0.0592, "grad_norm": 1.9179340600967407, "learning_rate": 9.977657165043613e-06, "loss": 0.4153, "step": 148 }, { "batch_size": 4, "epoch": 0.0592, "step": 148, "tokens_per_device": 1188 }, { "epoch": 0.0592, "loss_ce": 0.22968131303787231, "loss_lvr": 1.5405633449554443, "loss_mode_switch": 0.0, "loss_total": 0.3837376534938812, "step": 148 }, { "batch_size": 1, "epoch": 0.0592, "step": 148, "tokens_per_device": 4962 }, { "epoch": 0.0592, "loss_ce": 0.20537887513637543, "loss_lvr": 0.8533684015274048, "loss_mode_switch": 0.0, "loss_total": 0.2907157242298126, "step": 148 }, { "batch_size": 4, "epoch": 0.0592, "step": 148, "tokens_per_device": 4304 }, { "epoch": 0.0592, "loss_ce": 0.28025633096694946, "loss_lvr": 1.551392912864685, "loss_mode_switch": 0.0, "loss_total": 0.43539562821388245, "step": 148 }, { "batch_size": 4, "epoch": 0.0592, "step": 148, "tokens_per_device": 5036 }, { "epoch": 0.0592, "loss_ce": 0.13570018112659454, "loss_lvr": 1.0517147779464722, "loss_mode_switch": 0.0, "loss_total": 0.24087166786193848, "step": 148 }, { "batch_size": 4, "epoch": 0.0592, "step": 148, "tokens_per_device": 2720 }, { "epoch": 0.0592, "loss_ce": 0.10535603761672974, "loss_lvr": 1.234752893447876, "loss_mode_switch": 0.0, "loss_total": 0.22883132100105286, "step": 148 }, { "batch_size": 4, "epoch": 0.0592, "step": 148, "tokens_per_device": 4220 }, { "epoch": 0.0592, "loss_ce": 0.10000475496053696, "loss_lvr": 2.221252679824829, "loss_mode_switch": 0.0, "loss_total": 0.322130024433136, "step": 148 }, { "batch_size": 4, "epoch": 0.0592, "step": 148, "tokens_per_device": 6504 }, { "epoch": 0.0592, "loss_ce": 0.24799032509326935, "loss_lvr": 0.9167938232421875, "loss_mode_switch": 0.0, "loss_total": 0.33966970443725586, "step": 148 }, { "batch_size": 4, "epoch": 0.0592, "step": 148, "tokens_per_device": 4884 }, { "epoch": 0.0592, "loss_ce": 0.4487357437610626, "loss_lvr": 1.064730167388916, "loss_mode_switch": 0.0, "loss_total": 0.5552087426185608, "step": 148 }, { "epoch": 0.0596, "grad_norm": 3.021573066711426, "learning_rate": 9.977041311986072e-06, "loss": 0.3607, "step": 149 }, { "batch_size": 4, "epoch": 0.0596, "step": 149, "tokens_per_device": 2752 }, { "epoch": 0.0596, "loss_ce": 0.21329040825366974, "loss_lvr": 1.191954493522644, "loss_mode_switch": 0.0, "loss_total": 0.3324858546257019, "step": 149 }, { "batch_size": 1, "epoch": 0.0596, "step": 149, "tokens_per_device": 4833 }, { "epoch": 0.0596, "loss_ce": 0.035669174045324326, "loss_lvr": 1.0895402431488037, "loss_mode_switch": 0.0, "loss_total": 0.14462320506572723, "step": 149 }, { "batch_size": 4, "epoch": 0.0596, "step": 149, "tokens_per_device": 4216 }, { "epoch": 0.0596, "loss_ce": 0.047278184443712234, "loss_lvr": 1.4138507843017578, "loss_mode_switch": 0.0, "loss_total": 0.18866325914859772, "step": 149 }, { "batch_size": 4, "epoch": 0.0596, "step": 149, "tokens_per_device": 4208 }, { "epoch": 0.0596, "loss_ce": 0.23700760304927826, "loss_lvr": 1.7203255891799927, "loss_mode_switch": 0.0, "loss_total": 0.4090401530265808, "step": 149 }, { "batch_size": 1, "epoch": 0.0596, "step": 149, "tokens_per_device": 5145 }, { "epoch": 0.0596, "loss_ce": 0.010124682448804379, "loss_lvr": 0.8330416679382324, "loss_mode_switch": 0.0, "loss_total": 0.0934288501739502, "step": 149 }, { "batch_size": 1, "epoch": 0.0596, "step": 149, "tokens_per_device": 5201 }, { "epoch": 0.0596, "loss_ce": 0.05213412269949913, "loss_lvr": 0.6692129373550415, "loss_mode_switch": 0.0, "loss_total": 0.11905542016029358, "step": 149 }, { "batch_size": 4, "epoch": 0.0596, "step": 149, "tokens_per_device": 2756 }, { "epoch": 0.0596, "loss_ce": 0.36361655592918396, "loss_lvr": 0.7094200849533081, "loss_mode_switch": 0.0, "loss_total": 0.43455857038497925, "step": 149 }, { "batch_size": 4, "epoch": 0.0596, "step": 149, "tokens_per_device": 1528 }, { "epoch": 0.0596, "loss_ce": 0.38154253363609314, "loss_lvr": 1.1028797626495361, "loss_mode_switch": 0.0, "loss_total": 0.4918305277824402, "step": 149 }, { "epoch": 0.06, "grad_norm": 2.2504491806030273, "learning_rate": 9.97641710583307e-06, "loss": 0.363, "step": 150 }, { "batch_size": 4, "epoch": 0.06, "step": 150, "tokens_per_device": 4244 }, { "epoch": 0.06, "loss_ce": 0.09690036624670029, "loss_lvr": 1.5659799575805664, "loss_mode_switch": 0.0, "loss_total": 0.253498375415802, "step": 150 }, { "batch_size": 1, "epoch": 0.06, "step": 150, "tokens_per_device": 4861 }, { "epoch": 0.06, "loss_ce": 0.004584793001413345, "loss_lvr": 1.0215214490890503, "loss_mode_switch": 0.0, "loss_total": 0.10673694312572479, "step": 150 }, { "batch_size": 1, "epoch": 0.06, "step": 150, "tokens_per_device": 4894 }, { "epoch": 0.06, "loss_ce": 0.2512437701225281, "loss_lvr": 1.9864846467971802, "loss_mode_switch": 0.0, "loss_total": 0.44989222288131714, "step": 150 }, { "batch_size": 4, "epoch": 0.06, "step": 150, "tokens_per_device": 4296 }, { "epoch": 0.06, "loss_ce": 0.25689998269081116, "loss_lvr": 1.253569483757019, "loss_mode_switch": 0.0, "loss_total": 0.3822569251060486, "step": 150 }, { "batch_size": 1, "epoch": 0.06, "step": 150, "tokens_per_device": 5129 }, { "epoch": 0.06, "loss_ce": 0.05702018737792969, "loss_lvr": 1.2689003944396973, "loss_mode_switch": 0.0, "loss_total": 0.18391023576259613, "step": 150 }, { "batch_size": 4, "epoch": 0.06, "step": 150, "tokens_per_device": 3892 }, { "epoch": 0.06, "loss_ce": 0.11041209846735, "loss_lvr": 1.1652131080627441, "loss_mode_switch": 0.0, "loss_total": 0.22693341970443726, "step": 150 }, { "batch_size": 4, "epoch": 0.06, "step": 150, "tokens_per_device": 1240 }, { "epoch": 0.06, "loss_ce": 0.45139065384864807, "loss_lvr": 1.4272854328155518, "loss_mode_switch": 0.0, "loss_total": 0.5941191911697388, "step": 150 }, { "batch_size": 4, "epoch": 0.06, "step": 150, "tokens_per_device": 1408 }, { "epoch": 0.06, "loss_ce": 0.8864336013793945, "loss_lvr": 1.363013744354248, "loss_mode_switch": 0.0, "loss_total": 1.0227349996566772, "step": 150 }, { "epoch": 0.0604, "grad_norm": 1.929113507270813, "learning_rate": 9.975784547632237e-06, "loss": 0.4192, "step": 151 }, { "batch_size": 4, "epoch": 0.0604, "step": 151, "tokens_per_device": 3792 }, { "epoch": 0.0604, "loss_ce": 0.1739172637462616, "loss_lvr": 1.6795713901519775, "loss_mode_switch": 0.0, "loss_total": 0.3418744206428528, "step": 151 }, { "batch_size": 4, "epoch": 0.0604, "step": 151, "tokens_per_device": 2752 }, { "epoch": 0.0604, "loss_ce": 0.21975483000278473, "loss_lvr": 1.1910996437072754, "loss_mode_switch": 0.0, "loss_total": 0.338864803314209, "step": 151 }, { "batch_size": 1, "epoch": 0.0604, "step": 151, "tokens_per_device": 4879 }, { "epoch": 0.0604, "loss_ce": 0.006384075153619051, "loss_lvr": 0.9689759016036987, "loss_mode_switch": 0.0, "loss_total": 0.103281669318676, "step": 151 }, { "batch_size": 1, "epoch": 0.0604, "step": 151, "tokens_per_device": 4899 }, { "epoch": 0.0604, "loss_ce": 0.020270537585020065, "loss_lvr": 0.40282443165779114, "loss_mode_switch": 0.0, "loss_total": 0.06055298075079918, "step": 151 }, { "batch_size": 4, "epoch": 0.0604, "step": 151, "tokens_per_device": 4000 }, { "epoch": 0.0604, "loss_ce": 0.2051761895418167, "loss_lvr": 1.4027456045150757, "loss_mode_switch": 0.0, "loss_total": 0.345450758934021, "step": 151 }, { "batch_size": 4, "epoch": 0.0604, "step": 151, "tokens_per_device": 3768 }, { "epoch": 0.0604, "loss_ce": 0.3941832482814789, "loss_lvr": 1.4364699125289917, "loss_mode_switch": 0.0, "loss_total": 0.5378302335739136, "step": 151 }, { "batch_size": 4, "epoch": 0.0604, "step": 151, "tokens_per_device": 3928 }, { "epoch": 0.0604, "loss_ce": 0.47837239503860474, "loss_lvr": 1.477449893951416, "loss_mode_switch": 0.0, "loss_total": 0.6261174082756042, "step": 151 }, { "batch_size": 1, "epoch": 0.0604, "step": 151, "tokens_per_device": 5000 }, { "epoch": 0.0604, "loss_ce": 0.007363816257566214, "loss_lvr": 1.0137176513671875, "loss_mode_switch": 0.0, "loss_total": 0.1087355837225914, "step": 151 }, { "epoch": 0.0608, "grad_norm": 2.041898250579834, "learning_rate": 9.975143638445205e-06, "loss": 0.3893, "step": 152 }, { "batch_size": 4, "epoch": 0.0608, "step": 152, "tokens_per_device": 3816 }, { "epoch": 0.0608, "loss_ce": 0.483254611492157, "loss_lvr": 1.4415305852890015, "loss_mode_switch": 0.0, "loss_total": 0.6274076700210571, "step": 152 }, { "batch_size": 4, "epoch": 0.0608, "step": 152, "tokens_per_device": 4412 }, { "epoch": 0.0608, "loss_ce": 0.04501571133732796, "loss_lvr": 1.113602876663208, "loss_mode_switch": 0.0, "loss_total": 0.15637600421905518, "step": 152 }, { "batch_size": 4, "epoch": 0.0608, "step": 152, "tokens_per_device": 4312 }, { "epoch": 0.0608, "loss_ce": 0.427614688873291, "loss_lvr": 1.27984619140625, "loss_mode_switch": 0.0, "loss_total": 0.5555993318557739, "step": 152 }, { "batch_size": 4, "epoch": 0.0608, "step": 152, "tokens_per_device": 4256 }, { "epoch": 0.0608, "loss_ce": 0.345944881439209, "loss_lvr": 1.8724833726882935, "loss_mode_switch": 0.0, "loss_total": 0.5331932306289673, "step": 152 }, { "batch_size": 4, "epoch": 0.0608, "step": 152, "tokens_per_device": 3864 }, { "epoch": 0.0608, "loss_ce": 0.19986078143119812, "loss_lvr": 1.8643501996994019, "loss_mode_switch": 0.0, "loss_total": 0.38629579544067383, "step": 152 }, { "batch_size": 1, "epoch": 0.0608, "step": 152, "tokens_per_device": 5152 }, { "epoch": 0.0608, "loss_ce": 0.005935174413025379, "loss_lvr": 1.5415704250335693, "loss_mode_switch": 0.0, "loss_total": 0.16009221971035004, "step": 152 }, { "batch_size": 4, "epoch": 0.0608, "step": 152, "tokens_per_device": 5088 }, { "epoch": 0.0608, "loss_ce": 0.23051868379116058, "loss_lvr": 1.527633547782898, "loss_mode_switch": 0.0, "loss_total": 0.38328203558921814, "step": 152 }, { "batch_size": 4, "epoch": 0.0608, "step": 152, "tokens_per_device": 4300 }, { "epoch": 0.0608, "loss_ce": 0.5557377934455872, "loss_lvr": 1.196524739265442, "loss_mode_switch": 0.0, "loss_total": 0.6753902435302734, "step": 152 }, { "epoch": 0.0612, "grad_norm": 1.795299768447876, "learning_rate": 9.974494379347632e-06, "loss": 0.3558, "step": 153 }, { "batch_size": 4, "epoch": 0.0612, "step": 153, "tokens_per_device": 1316 }, { "epoch": 0.0612, "loss_ce": 0.2715758979320526, "loss_lvr": 1.6375656127929688, "loss_mode_switch": 0.0, "loss_total": 0.4353324770927429, "step": 153 }, { "batch_size": 4, "epoch": 0.0612, "step": 153, "tokens_per_device": 5384 }, { "epoch": 0.0612, "loss_ce": 0.11922909319400787, "loss_lvr": 0.9176660180091858, "loss_mode_switch": 0.0, "loss_total": 0.21099570393562317, "step": 153 }, { "batch_size": 4, "epoch": 0.0612, "step": 153, "tokens_per_device": 4224 }, { "epoch": 0.0612, "loss_ce": 0.2740902304649353, "loss_lvr": 1.4959076642990112, "loss_mode_switch": 0.0, "loss_total": 0.42368102073669434, "step": 153 }, { "batch_size": 1, "epoch": 0.0612, "step": 153, "tokens_per_device": 5167 }, { "epoch": 0.0612, "loss_ce": 0.20808453857898712, "loss_lvr": 0.6759269833564758, "loss_mode_switch": 0.0, "loss_total": 0.27567723393440247, "step": 153 }, { "batch_size": 1, "epoch": 0.0612, "step": 153, "tokens_per_device": 7589 }, { "epoch": 0.0612, "loss_ce": 0.0056172506883740425, "loss_lvr": 0.851485550403595, "loss_mode_switch": 0.0, "loss_total": 0.0907658115029335, "step": 153 }, { "batch_size": 4, "epoch": 0.0612, "step": 153, "tokens_per_device": 4428 }, { "epoch": 0.0612, "loss_ce": 0.1468149870634079, "loss_lvr": 1.1394453048706055, "loss_mode_switch": 0.0, "loss_total": 0.26075953245162964, "step": 153 }, { "batch_size": 4, "epoch": 0.0612, "step": 153, "tokens_per_device": 2716 }, { "epoch": 0.0612, "loss_ce": 0.36211514472961426, "loss_lvr": 1.3707655668258667, "loss_mode_switch": 0.0, "loss_total": 0.4991917014122009, "step": 153 }, { "batch_size": 4, "epoch": 0.0612, "step": 153, "tokens_per_device": 5392 }, { "epoch": 0.0612, "loss_ce": 0.11448992043733597, "loss_lvr": 0.9935472011566162, "loss_mode_switch": 0.0, "loss_total": 0.2138446420431137, "step": 153 }, { "epoch": 0.0616, "grad_norm": 1.9912351369857788, "learning_rate": 9.973836771429185e-06, "loss": 0.3819, "step": 154 }, { "batch_size": 4, "epoch": 0.0616, "step": 154, "tokens_per_device": 4812 }, { "epoch": 0.0616, "loss_ce": 0.18953494727611542, "loss_lvr": 1.3768550157546997, "loss_mode_switch": 0.0, "loss_total": 0.32722043991088867, "step": 154 }, { "batch_size": 4, "epoch": 0.0616, "step": 154, "tokens_per_device": 4804 }, { "epoch": 0.0616, "loss_ce": 0.2930968701839447, "loss_lvr": 1.0939582586288452, "loss_mode_switch": 0.0, "loss_total": 0.4024927020072937, "step": 154 }, { "batch_size": 4, "epoch": 0.0616, "step": 154, "tokens_per_device": 6300 }, { "epoch": 0.0616, "loss_ce": 0.062154147773981094, "loss_lvr": 0.8242756128311157, "loss_mode_switch": 0.0, "loss_total": 0.14458170533180237, "step": 154 }, { "batch_size": 4, "epoch": 0.0616, "step": 154, "tokens_per_device": 9792 }, { "epoch": 0.0616, "loss_ce": 0.44607746601104736, "loss_lvr": 0.995459794998169, "loss_mode_switch": 0.0, "loss_total": 0.5456234216690063, "step": 154 }, { "batch_size": 4, "epoch": 0.0616, "step": 154, "tokens_per_device": 4276 }, { "epoch": 0.0616, "loss_ce": 0.18096011877059937, "loss_lvr": 1.1849559545516968, "loss_mode_switch": 0.0, "loss_total": 0.2994557023048401, "step": 154 }, { "batch_size": 4, "epoch": 0.0616, "step": 154, "tokens_per_device": 4056 }, { "epoch": 0.0616, "loss_ce": 0.7856903672218323, "loss_lvr": 1.1882328987121582, "loss_mode_switch": 0.0, "loss_total": 0.9045136570930481, "step": 154 }, { "batch_size": 4, "epoch": 0.0616, "step": 154, "tokens_per_device": 5096 }, { "epoch": 0.0616, "loss_ce": 0.27430081367492676, "loss_lvr": 1.430043339729309, "loss_mode_switch": 0.0, "loss_total": 0.4173051714897156, "step": 154 }, { "batch_size": 4, "epoch": 0.0616, "step": 154, "tokens_per_device": 3768 }, { "epoch": 0.0616, "loss_ce": 0.4698282480239868, "loss_lvr": 1.3484759330749512, "loss_mode_switch": 0.0, "loss_total": 0.604675829410553, "step": 154 }, { "epoch": 0.062, "grad_norm": 2.028796911239624, "learning_rate": 9.973170815793543e-06, "loss": 0.4192, "step": 155 }, { "batch_size": 1, "epoch": 0.062, "step": 155, "tokens_per_device": 5229 }, { "epoch": 0.062, "loss_ce": 0.2467564344406128, "loss_lvr": 0.8440656661987305, "loss_mode_switch": 0.0, "loss_total": 0.3311629891395569, "step": 155 }, { "batch_size": 4, "epoch": 0.062, "step": 155, "tokens_per_device": 11008 }, { "epoch": 0.062, "loss_ce": 0.2689407467842102, "loss_lvr": 1.187328577041626, "loss_mode_switch": 0.0, "loss_total": 0.38767361640930176, "step": 155 }, { "batch_size": 1, "epoch": 0.062, "step": 155, "tokens_per_device": 5106 }, { "epoch": 0.062, "loss_ce": 0.11970506608486176, "loss_lvr": 0.9471607208251953, "loss_mode_switch": 0.0, "loss_total": 0.2144211381673813, "step": 155 }, { "batch_size": 4, "epoch": 0.062, "step": 155, "tokens_per_device": 2892 }, { "epoch": 0.062, "loss_ce": 0.08361010253429413, "loss_lvr": 1.1587255001068115, "loss_mode_switch": 0.0, "loss_total": 0.19948264956474304, "step": 155 }, { "batch_size": 1, "epoch": 0.062, "step": 155, "tokens_per_device": 4908 }, { "epoch": 0.062, "loss_ce": 0.11632724106311798, "loss_lvr": 0.3872550129890442, "loss_mode_switch": 0.0, "loss_total": 0.15505275130271912, "step": 155 }, { "batch_size": 4, "epoch": 0.062, "step": 155, "tokens_per_device": 2668 }, { "epoch": 0.062, "loss_ce": 0.6412668228149414, "loss_lvr": 1.0385006666183472, "loss_mode_switch": 0.0, "loss_total": 0.7451168894767761, "step": 155 }, { "batch_size": 1, "epoch": 0.062, "step": 155, "tokens_per_device": 5178 }, { "epoch": 0.062, "loss_ce": 0.08544652163982391, "loss_lvr": 1.367297649383545, "loss_mode_switch": 0.0, "loss_total": 0.22217628359794617, "step": 155 }, { "batch_size": 1, "epoch": 0.062, "step": 155, "tokens_per_device": 4861 }, { "epoch": 0.062, "loss_ce": 0.011254982091486454, "loss_lvr": 0.6176271438598633, "loss_mode_switch": 0.0, "loss_total": 0.07301770150661469, "step": 155 }, { "epoch": 0.0624, "grad_norm": 2.063441276550293, "learning_rate": 9.972496513558399e-06, "loss": 0.4093, "step": 156 }, { "batch_size": 4, "epoch": 0.0624, "step": 156, "tokens_per_device": 4276 }, { "epoch": 0.0624, "loss_ce": 0.5441805720329285, "loss_lvr": 1.242883324623108, "loss_mode_switch": 0.0, "loss_total": 0.6684688925743103, "step": 156 }, { "batch_size": 1, "epoch": 0.0624, "step": 156, "tokens_per_device": 4881 }, { "epoch": 0.0624, "loss_ce": 0.14899942278862, "loss_lvr": 2.4071576595306396, "loss_mode_switch": 0.0, "loss_total": 0.38971519470214844, "step": 156 }, { "batch_size": 1, "epoch": 0.0624, "step": 156, "tokens_per_device": 4856 }, { "epoch": 0.0624, "loss_ce": 0.07619971036911011, "loss_lvr": 1.6066794395446777, "loss_mode_switch": 0.0, "loss_total": 0.23686765134334564, "step": 156 }, { "batch_size": 4, "epoch": 0.0624, "step": 156, "tokens_per_device": 4264 }, { "epoch": 0.0624, "loss_ce": 0.3802434504032135, "loss_lvr": 1.5792430639266968, "loss_mode_switch": 0.0, "loss_total": 0.5381677746772766, "step": 156 }, { "batch_size": 1, "epoch": 0.0624, "step": 156, "tokens_per_device": 4807 }, { "epoch": 0.0624, "loss_ce": 0.02878999151289463, "loss_lvr": 1.8336392641067505, "loss_mode_switch": 0.0, "loss_total": 0.21215392649173737, "step": 156 }, { "batch_size": 1, "epoch": 0.0624, "step": 156, "tokens_per_device": 5112 }, { "epoch": 0.0624, "loss_ce": 0.008911071345210075, "loss_lvr": 1.1373530626296997, "loss_mode_switch": 0.0, "loss_total": 0.12264638394117355, "step": 156 }, { "batch_size": 4, "epoch": 0.0624, "step": 156, "tokens_per_device": 4480 }, { "epoch": 0.0624, "loss_ce": 0.1712946742773056, "loss_lvr": 1.293694257736206, "loss_mode_switch": 0.0, "loss_total": 0.30066409707069397, "step": 156 }, { "batch_size": 4, "epoch": 0.0624, "step": 156, "tokens_per_device": 4320 }, { "epoch": 0.0624, "loss_ce": 0.0422615148127079, "loss_lvr": 1.5113170146942139, "loss_mode_switch": 0.0, "loss_total": 0.19339321553707123, "step": 156 }, { "epoch": 0.0628, "grad_norm": 1.5590258836746216, "learning_rate": 9.971813865855448e-06, "loss": 0.3262, "step": 157 }, { "batch_size": 1, "epoch": 0.0628, "step": 157, "tokens_per_device": 7548 }, { "epoch": 0.0628, "loss_ce": 0.003064097138121724, "loss_lvr": 1.0646820068359375, "loss_mode_switch": 0.0, "loss_total": 0.10953229665756226, "step": 157 }, { "batch_size": 1, "epoch": 0.0628, "step": 157, "tokens_per_device": 4854 }, { "epoch": 0.0628, "loss_ce": 0.002665192587301135, "loss_lvr": 0.31970104575157166, "loss_mode_switch": 0.0, "loss_total": 0.03463529795408249, "step": 157 }, { "batch_size": 4, "epoch": 0.0628, "step": 157, "tokens_per_device": 4228 }, { "epoch": 0.0628, "loss_ce": 0.5329515933990479, "loss_lvr": 1.4913073778152466, "loss_mode_switch": 0.0, "loss_total": 0.6820823550224304, "step": 157 }, { "batch_size": 4, "epoch": 0.0628, "step": 157, "tokens_per_device": 4372 }, { "epoch": 0.0628, "loss_ce": 0.18710723519325256, "loss_lvr": 1.3235225677490234, "loss_mode_switch": 0.0, "loss_total": 0.3194594979286194, "step": 157 }, { "batch_size": 4, "epoch": 0.0628, "step": 157, "tokens_per_device": 9120 }, { "epoch": 0.0628, "loss_ce": 0.025127649307250977, "loss_lvr": 1.0643421411514282, "loss_mode_switch": 0.0, "loss_total": 0.13156187534332275, "step": 157 }, { "batch_size": 1, "epoch": 0.0628, "step": 157, "tokens_per_device": 5269 }, { "epoch": 0.0628, "loss_ce": 0.11757612973451614, "loss_lvr": 1.1655677556991577, "loss_mode_switch": 0.0, "loss_total": 0.23413291573524475, "step": 157 }, { "batch_size": 4, "epoch": 0.0628, "step": 157, "tokens_per_device": 4244 }, { "epoch": 0.0628, "loss_ce": 0.23152124881744385, "loss_lvr": 1.0281391143798828, "loss_mode_switch": 0.0, "loss_total": 0.3343351483345032, "step": 157 }, { "batch_size": 4, "epoch": 0.0628, "step": 157, "tokens_per_device": 4228 }, { "epoch": 0.0628, "loss_ce": 0.2573978006839752, "loss_lvr": 1.3179314136505127, "loss_mode_switch": 0.0, "loss_total": 0.3891909420490265, "step": 157 }, { "epoch": 0.0632, "grad_norm": 2.3493895530700684, "learning_rate": 9.971122873830398e-06, "loss": 0.3375, "step": 158 }, { "batch_size": 4, "epoch": 0.0632, "step": 158, "tokens_per_device": 3680 }, { "epoch": 0.0632, "loss_ce": 0.1669941246509552, "loss_lvr": 1.3545631170272827, "loss_mode_switch": 0.0, "loss_total": 0.30245041847229004, "step": 158 }, { "batch_size": 4, "epoch": 0.0632, "step": 158, "tokens_per_device": 8396 }, { "epoch": 0.0632, "loss_ce": 0.0817255824804306, "loss_lvr": 1.2565466165542603, "loss_mode_switch": 0.0, "loss_total": 0.2073802500963211, "step": 158 }, { "batch_size": 1, "epoch": 0.0632, "step": 158, "tokens_per_device": 4647 }, { "epoch": 0.0632, "loss_ce": 0.0017902822000905871, "loss_lvr": 0.8679962754249573, "loss_mode_switch": 0.0, "loss_total": 0.08858991414308548, "step": 158 }, { "batch_size": 4, "epoch": 0.0632, "step": 158, "tokens_per_device": 5788 }, { "epoch": 0.0632, "loss_ce": 0.19626396894454956, "loss_lvr": 1.0969570875167847, "loss_mode_switch": 0.0, "loss_total": 0.30595967173576355, "step": 158 }, { "batch_size": 4, "epoch": 0.0632, "step": 158, "tokens_per_device": 4772 }, { "epoch": 0.0632, "loss_ce": 0.06155218183994293, "loss_lvr": 0.9719089269638062, "loss_mode_switch": 0.0, "loss_total": 0.15874308347702026, "step": 158 }, { "batch_size": 1, "epoch": 0.0632, "step": 158, "tokens_per_device": 4902 }, { "epoch": 0.0632, "loss_ce": 0.005413917358964682, "loss_lvr": 1.7726095914840698, "loss_mode_switch": 0.0, "loss_total": 0.18267488479614258, "step": 158 }, { "batch_size": 4, "epoch": 0.0632, "step": 158, "tokens_per_device": 3884 }, { "epoch": 0.0632, "loss_ce": 0.40648940205574036, "loss_lvr": 1.3770780563354492, "loss_mode_switch": 0.0, "loss_total": 0.5441972017288208, "step": 158 }, { "batch_size": 4, "epoch": 0.0632, "step": 158, "tokens_per_device": 4292 }, { "epoch": 0.0632, "loss_ce": 0.3166966736316681, "loss_lvr": 1.0120960474014282, "loss_mode_switch": 0.0, "loss_total": 0.4179062843322754, "step": 158 }, { "epoch": 0.0636, "grad_norm": 1.9923526048660278, "learning_rate": 9.970423538642959e-06, "loss": 0.3695, "step": 159 }, { "batch_size": 1, "epoch": 0.0636, "step": 159, "tokens_per_device": 5042 }, { "epoch": 0.0636, "loss_ce": 1.4291877746582031, "loss_lvr": 1.5950909852981567, "loss_mode_switch": 0.0, "loss_total": 1.588696837425232, "step": 159 }, { "batch_size": 4, "epoch": 0.0636, "step": 159, "tokens_per_device": 3760 }, { "epoch": 0.0636, "loss_ce": 0.291459858417511, "loss_lvr": 1.2112245559692383, "loss_mode_switch": 0.0, "loss_total": 0.41258230805397034, "step": 159 }, { "batch_size": 4, "epoch": 0.0636, "step": 159, "tokens_per_device": 2568 }, { "epoch": 0.0636, "loss_ce": 0.39133796095848083, "loss_lvr": 1.4986902475357056, "loss_mode_switch": 0.0, "loss_total": 0.541206955909729, "step": 159 }, { "batch_size": 4, "epoch": 0.0636, "step": 159, "tokens_per_device": 1344 }, { "epoch": 0.0636, "loss_ce": 0.07237493246793747, "loss_lvr": 1.3384592533111572, "loss_mode_switch": 0.0, "loss_total": 0.2062208652496338, "step": 159 }, { "batch_size": 4, "epoch": 0.0636, "step": 159, "tokens_per_device": 4280 }, { "epoch": 0.0636, "loss_ce": 0.2272324562072754, "loss_lvr": 1.6632041931152344, "loss_mode_switch": 0.0, "loss_total": 0.39355289936065674, "step": 159 }, { "batch_size": 4, "epoch": 0.0636, "step": 159, "tokens_per_device": 4488 }, { "epoch": 0.0636, "loss_ce": 0.3237645924091339, "loss_lvr": 1.0818248987197876, "loss_mode_switch": 0.0, "loss_total": 0.43194708228111267, "step": 159 }, { "batch_size": 4, "epoch": 0.0636, "step": 159, "tokens_per_device": 2248 }, { "epoch": 0.0636, "loss_ce": 0.12350834161043167, "loss_lvr": 1.2432667016983032, "loss_mode_switch": 0.0, "loss_total": 0.24783501029014587, "step": 159 }, { "batch_size": 4, "epoch": 0.0636, "step": 159, "tokens_per_device": 6608 }, { "epoch": 0.0636, "loss_ce": 0.29083964228630066, "loss_lvr": 0.9204475283622742, "loss_mode_switch": 0.0, "loss_total": 0.3828843832015991, "step": 159 }, { "epoch": 0.064, "grad_norm": 2.088494300842285, "learning_rate": 9.969715861466839e-06, "loss": 0.4282, "step": 160 }, { "batch_size": 4, "epoch": 0.064, "step": 160, "tokens_per_device": 2648 }, { "epoch": 0.064, "loss_ce": 0.594296932220459, "loss_lvr": 1.143945336341858, "loss_mode_switch": 0.0, "loss_total": 0.7086914777755737, "step": 160 }, { "batch_size": 4, "epoch": 0.064, "step": 160, "tokens_per_device": 1400 }, { "epoch": 0.064, "loss_ce": 0.12872864305973053, "loss_lvr": 1.1931366920471191, "loss_mode_switch": 0.0, "loss_total": 0.24804231524467468, "step": 160 }, { "batch_size": 4, "epoch": 0.064, "step": 160, "tokens_per_device": 8888 }, { "epoch": 0.064, "loss_ce": 0.19300706684589386, "loss_lvr": 1.0234898328781128, "loss_mode_switch": 0.0, "loss_total": 0.29535603523254395, "step": 160 }, { "batch_size": 4, "epoch": 0.064, "step": 160, "tokens_per_device": 4412 }, { "epoch": 0.064, "loss_ce": 0.10055074095726013, "loss_lvr": 1.2324756383895874, "loss_mode_switch": 0.0, "loss_total": 0.22379830479621887, "step": 160 }, { "batch_size": 1, "epoch": 0.064, "step": 160, "tokens_per_device": 5189 }, { "epoch": 0.064, "loss_ce": 0.287154883146286, "loss_lvr": 0.7210370898246765, "loss_mode_switch": 0.0, "loss_total": 0.35925859212875366, "step": 160 }, { "batch_size": 1, "epoch": 0.064, "step": 160, "tokens_per_device": 5103 }, { "epoch": 0.064, "loss_ce": 0.006336837541311979, "loss_lvr": 1.2197598218917847, "loss_mode_switch": 0.0, "loss_total": 0.12831281125545502, "step": 160 }, { "batch_size": 4, "epoch": 0.064, "step": 160, "tokens_per_device": 12152 }, { "epoch": 0.064, "loss_ce": 0.024499008432030678, "loss_lvr": 0.9709334969520569, "loss_mode_switch": 0.0, "loss_total": 0.12159235775470734, "step": 160 }, { "batch_size": 4, "epoch": 0.064, "step": 160, "tokens_per_device": 8904 }, { "epoch": 0.064, "loss_ce": 0.06857957690954208, "loss_lvr": 1.659401774406433, "loss_mode_switch": 0.0, "loss_total": 0.23451974987983704, "step": 160 }, { "epoch": 0.0644, "grad_norm": 3.901031732559204, "learning_rate": 9.968999843489755e-06, "loss": 0.3743, "step": 161 }, { "batch_size": 4, "epoch": 0.0644, "step": 161, "tokens_per_device": 2980 }, { "epoch": 0.0644, "loss_ce": 0.16404365003108978, "loss_lvr": 1.6764651536941528, "loss_mode_switch": 0.0, "loss_total": 0.3316901624202728, "step": 161 }, { "batch_size": 4, "epoch": 0.0644, "step": 161, "tokens_per_device": 4260 }, { "epoch": 0.0644, "loss_ce": 1.3748730421066284, "loss_lvr": 1.2911182641983032, "loss_mode_switch": 0.0, "loss_total": 1.5039849281311035, "step": 161 }, { "batch_size": 4, "epoch": 0.0644, "step": 161, "tokens_per_device": 7028 }, { "epoch": 0.0644, "loss_ce": 0.15706580877304077, "loss_lvr": 1.2893264293670654, "loss_mode_switch": 0.0, "loss_total": 0.28599846363067627, "step": 161 }, { "batch_size": 1, "epoch": 0.0644, "step": 161, "tokens_per_device": 4887 }, { "epoch": 0.0644, "loss_ce": 0.09110496193170547, "loss_lvr": 1.5360990762710571, "loss_mode_switch": 0.0, "loss_total": 0.2447148859500885, "step": 161 }, { "batch_size": 4, "epoch": 0.0644, "step": 161, "tokens_per_device": 4148 }, { "epoch": 0.0644, "loss_ce": 0.22400909662246704, "loss_lvr": 1.4993407726287842, "loss_mode_switch": 0.0, "loss_total": 0.37394317984580994, "step": 161 }, { "batch_size": 4, "epoch": 0.0644, "step": 161, "tokens_per_device": 1604 }, { "epoch": 0.0644, "loss_ce": 0.3315536379814148, "loss_lvr": 1.271121859550476, "loss_mode_switch": 0.0, "loss_total": 0.4586658477783203, "step": 161 }, { "batch_size": 1, "epoch": 0.0644, "step": 161, "tokens_per_device": 7513 }, { "epoch": 0.0644, "loss_ce": 0.0055247461423277855, "loss_lvr": 0.5604286193847656, "loss_mode_switch": 0.0, "loss_total": 0.06156760826706886, "step": 161 }, { "batch_size": 4, "epoch": 0.0644, "step": 161, "tokens_per_device": 4240 }, { "epoch": 0.0644, "loss_ce": 0.06816206127405167, "loss_lvr": 1.0373903512954712, "loss_mode_switch": 0.0, "loss_total": 0.17190110683441162, "step": 161 }, { "epoch": 0.0648, "grad_norm": 1.9124925136566162, "learning_rate": 9.968275485913417e-06, "loss": 0.4245, "step": 162 }, { "batch_size": 1, "epoch": 0.0648, "step": 162, "tokens_per_device": 7579 }, { "epoch": 0.0648, "loss_ce": 0.011710253544151783, "loss_lvr": 0.5450487732887268, "loss_mode_switch": 0.0, "loss_total": 0.0662151351571083, "step": 162 }, { "batch_size": 4, "epoch": 0.0648, "step": 162, "tokens_per_device": 2876 }, { "epoch": 0.0648, "loss_ce": 0.8072288036346436, "loss_lvr": 0.8241002559661865, "loss_mode_switch": 0.0, "loss_total": 0.8896388411521912, "step": 162 }, { "batch_size": 4, "epoch": 0.0648, "step": 162, "tokens_per_device": 4500 }, { "epoch": 0.0648, "loss_ce": 0.6433542966842651, "loss_lvr": 1.0193716287612915, "loss_mode_switch": 0.0, "loss_total": 0.7452914714813232, "step": 162 }, { "batch_size": 1, "epoch": 0.0648, "step": 162, "tokens_per_device": 5115 }, { "epoch": 0.0648, "loss_ce": 0.08218032866716385, "loss_lvr": 0.5007010698318481, "loss_mode_switch": 0.0, "loss_total": 0.13225042819976807, "step": 162 }, { "batch_size": 4, "epoch": 0.0648, "step": 162, "tokens_per_device": 5408 }, { "epoch": 0.0648, "loss_ce": 0.5556102395057678, "loss_lvr": 0.9917834401130676, "loss_mode_switch": 0.0, "loss_total": 0.654788613319397, "step": 162 }, { "batch_size": 1, "epoch": 0.0648, "step": 162, "tokens_per_device": 5105 }, { "epoch": 0.0648, "loss_ce": 0.03859006613492966, "loss_lvr": 0.8034548759460449, "loss_mode_switch": 0.0, "loss_total": 0.11893555521965027, "step": 162 }, { "batch_size": 4, "epoch": 0.0648, "step": 162, "tokens_per_device": 1496 }, { "epoch": 0.0648, "loss_ce": 0.6915521025657654, "loss_lvr": 1.3640369176864624, "loss_mode_switch": 0.0, "loss_total": 0.8279557824134827, "step": 162 }, { "batch_size": 1, "epoch": 0.0648, "step": 162, "tokens_per_device": 5174 }, { "epoch": 0.0648, "loss_ce": 0.22193142771720886, "loss_lvr": 1.2389451265335083, "loss_mode_switch": 0.0, "loss_total": 0.3458259403705597, "step": 162 }, { "epoch": 0.0652, "grad_norm": 2.104536294937134, "learning_rate": 9.967542789953532e-06, "loss": 0.4075, "step": 163 }, { "batch_size": 4, "epoch": 0.0652, "step": 163, "tokens_per_device": 4408 }, { "epoch": 0.0652, "loss_ce": 0.1369413435459137, "loss_lvr": 0.8550971746444702, "loss_mode_switch": 0.0, "loss_total": 0.22245106101036072, "step": 163 }, { "batch_size": 1, "epoch": 0.0652, "step": 163, "tokens_per_device": 4748 }, { "epoch": 0.0652, "loss_ce": 0.06068500503897667, "loss_lvr": 0.7752584218978882, "loss_mode_switch": 0.0, "loss_total": 0.13821084797382355, "step": 163 }, { "batch_size": 4, "epoch": 0.0652, "step": 163, "tokens_per_device": 11176 }, { "epoch": 0.0652, "loss_ce": 0.19641195237636566, "loss_lvr": 1.3477604389190674, "loss_mode_switch": 0.0, "loss_total": 0.33118799328804016, "step": 163 }, { "batch_size": 4, "epoch": 0.0652, "step": 163, "tokens_per_device": 1624 }, { "epoch": 0.0652, "loss_ce": 0.10766342282295227, "loss_lvr": 1.0332797765731812, "loss_mode_switch": 0.0, "loss_total": 0.21099141240119934, "step": 163 }, { "batch_size": 4, "epoch": 0.0652, "step": 163, "tokens_per_device": 2348 }, { "epoch": 0.0652, "loss_ce": 0.2341156005859375, "loss_lvr": 1.1845786571502686, "loss_mode_switch": 0.0, "loss_total": 0.3525734543800354, "step": 163 }, { "batch_size": 4, "epoch": 0.0652, "step": 163, "tokens_per_device": 8188 }, { "epoch": 0.0652, "loss_ce": 0.2718007266521454, "loss_lvr": 1.3593733310699463, "loss_mode_switch": 0.0, "loss_total": 0.40773805975914, "step": 163 }, { "batch_size": 4, "epoch": 0.0652, "step": 163, "tokens_per_device": 4444 }, { "epoch": 0.0652, "loss_ce": 0.30128058791160583, "loss_lvr": 1.4049725532531738, "loss_mode_switch": 0.0, "loss_total": 0.4417778253555298, "step": 163 }, { "batch_size": 4, "epoch": 0.0652, "step": 163, "tokens_per_device": 4360 }, { "epoch": 0.0652, "loss_ce": 0.21775732934474945, "loss_lvr": 1.2926461696624756, "loss_mode_switch": 0.0, "loss_total": 0.3470219373703003, "step": 163 }, { "epoch": 0.0656, "grad_norm": 1.7971333265304565, "learning_rate": 9.966801756839802e-06, "loss": 0.3582, "step": 164 }, { "batch_size": 4, "epoch": 0.0656, "step": 164, "tokens_per_device": 4144 }, { "epoch": 0.0656, "loss_ce": 0.16307705640792847, "loss_lvr": 1.1201599836349487, "loss_mode_switch": 0.0, "loss_total": 0.27509304881095886, "step": 164 }, { "batch_size": 1, "epoch": 0.0656, "step": 164, "tokens_per_device": 4973 }, { "epoch": 0.0656, "loss_ce": 0.2721453905105591, "loss_lvr": 0.7640988826751709, "loss_mode_switch": 0.0, "loss_total": 0.3485552668571472, "step": 164 }, { "batch_size": 1, "epoch": 0.0656, "step": 164, "tokens_per_device": 5019 }, { "epoch": 0.0656, "loss_ce": 0.058306850492954254, "loss_lvr": 1.5248744487762451, "loss_mode_switch": 0.0, "loss_total": 0.21079429984092712, "step": 164 }, { "batch_size": 1, "epoch": 0.0656, "step": 164, "tokens_per_device": 4685 }, { "epoch": 0.0656, "loss_ce": 0.005025491584092379, "loss_lvr": 0.9661575555801392, "loss_mode_switch": 0.0, "loss_total": 0.10164124518632889, "step": 164 }, { "batch_size": 4, "epoch": 0.0656, "step": 164, "tokens_per_device": 4240 }, { "epoch": 0.0656, "loss_ce": 0.6435019373893738, "loss_lvr": 1.4472613334655762, "loss_mode_switch": 0.0, "loss_total": 0.7882280945777893, "step": 164 }, { "batch_size": 4, "epoch": 0.0656, "step": 164, "tokens_per_device": 5612 }, { "epoch": 0.0656, "loss_ce": 0.14701466262340546, "loss_lvr": 1.8401762247085571, "loss_mode_switch": 0.0, "loss_total": 0.33103227615356445, "step": 164 }, { "batch_size": 4, "epoch": 0.0656, "step": 164, "tokens_per_device": 4328 }, { "epoch": 0.0656, "loss_ce": 0.20237046480178833, "loss_lvr": 1.2998698949813843, "loss_mode_switch": 0.0, "loss_total": 0.3323574662208557, "step": 164 }, { "batch_size": 4, "epoch": 0.0656, "step": 164, "tokens_per_device": 4120 }, { "epoch": 0.0656, "loss_ce": 0.04660767316818237, "loss_lvr": 2.1961557865142822, "loss_mode_switch": 0.0, "loss_total": 0.2662232518196106, "step": 164 }, { "epoch": 0.066, "grad_norm": 1.6817907094955444, "learning_rate": 9.966052387815923e-06, "loss": 0.3847, "step": 165 }, { "batch_size": 1, "epoch": 0.066, "step": 165, "tokens_per_device": 5190 }, { "epoch": 0.066, "loss_ce": 0.21103930473327637, "loss_lvr": 0.976656436920166, "loss_mode_switch": 0.0, "loss_total": 0.3087049424648285, "step": 165 }, { "batch_size": 4, "epoch": 0.066, "step": 165, "tokens_per_device": 8424 }, { "epoch": 0.066, "loss_ce": 0.030340351164340973, "loss_lvr": 1.4067386388778687, "loss_mode_switch": 0.0, "loss_total": 0.1710142195224762, "step": 165 }, { "batch_size": 1, "epoch": 0.066, "step": 165, "tokens_per_device": 4705 }, { "epoch": 0.066, "loss_ce": 0.0055634984746575356, "loss_lvr": 0.9228426218032837, "loss_mode_switch": 0.0, "loss_total": 0.09784775972366333, "step": 165 }, { "batch_size": 4, "epoch": 0.066, "step": 165, "tokens_per_device": 5500 }, { "epoch": 0.066, "loss_ce": 0.036377765238285065, "loss_lvr": 1.0334194898605347, "loss_mode_switch": 0.0, "loss_total": 0.13971972465515137, "step": 165 }, { "batch_size": 4, "epoch": 0.066, "step": 165, "tokens_per_device": 5316 }, { "epoch": 0.066, "loss_ce": 0.057592157274484634, "loss_lvr": 1.089077115058899, "loss_mode_switch": 0.0, "loss_total": 0.16649986803531647, "step": 165 }, { "batch_size": 1, "epoch": 0.066, "step": 165, "tokens_per_device": 5456 }, { "epoch": 0.066, "loss_ce": 0.14520888030529022, "loss_lvr": 0.7245922684669495, "loss_mode_switch": 0.0, "loss_total": 0.21766811609268188, "step": 165 }, { "batch_size": 4, "epoch": 0.066, "step": 165, "tokens_per_device": 5504 }, { "epoch": 0.066, "loss_ce": 0.3373914659023285, "loss_lvr": 1.0007259845733643, "loss_mode_switch": 0.0, "loss_total": 0.43746405839920044, "step": 165 }, { "batch_size": 4, "epoch": 0.066, "step": 165, "tokens_per_device": 4368 }, { "epoch": 0.066, "loss_ce": 0.26195743680000305, "loss_lvr": 1.0589860677719116, "loss_mode_switch": 0.0, "loss_total": 0.36785605549812317, "step": 165 }, { "epoch": 0.0664, "grad_norm": 1.6551883220672607, "learning_rate": 9.96529468413958e-06, "loss": 0.392, "step": 166 }, { "batch_size": 4, "epoch": 0.0664, "step": 166, "tokens_per_device": 1676 }, { "epoch": 0.0664, "loss_ce": 0.18868665397167206, "loss_lvr": 1.5962200164794922, "loss_mode_switch": 0.0, "loss_total": 0.34830865263938904, "step": 166 }, { "batch_size": 4, "epoch": 0.0664, "step": 166, "tokens_per_device": 4224 }, { "epoch": 0.0664, "loss_ce": 0.3553275763988495, "loss_lvr": 1.4627656936645508, "loss_mode_switch": 0.0, "loss_total": 0.5016041398048401, "step": 166 }, { "batch_size": 1, "epoch": 0.0664, "step": 166, "tokens_per_device": 5120 }, { "epoch": 0.0664, "loss_ce": 0.1271168738603592, "loss_lvr": 1.847425103187561, "loss_mode_switch": 0.0, "loss_total": 0.3118593692779541, "step": 166 }, { "batch_size": 4, "epoch": 0.0664, "step": 166, "tokens_per_device": 2696 }, { "epoch": 0.0664, "loss_ce": 0.292704701423645, "loss_lvr": 1.3978300094604492, "loss_mode_switch": 0.0, "loss_total": 0.43248772621154785, "step": 166 }, { "batch_size": 1, "epoch": 0.0664, "step": 166, "tokens_per_device": 4392 }, { "epoch": 0.0664, "loss_ce": 0.011620063334703445, "loss_lvr": 0.7008765935897827, "loss_mode_switch": 0.0, "loss_total": 0.08170773088932037, "step": 166 }, { "batch_size": 4, "epoch": 0.0664, "step": 166, "tokens_per_device": 1528 }, { "epoch": 0.0664, "loss_ce": 0.8221355676651001, "loss_lvr": 1.3134992122650146, "loss_mode_switch": 0.0, "loss_total": 0.9534854888916016, "step": 166 }, { "batch_size": 4, "epoch": 0.0664, "step": 166, "tokens_per_device": 6620 }, { "epoch": 0.0664, "loss_ce": 0.1529349684715271, "loss_lvr": 1.0891472101211548, "loss_mode_switch": 0.0, "loss_total": 0.26184970140457153, "step": 166 }, { "batch_size": 4, "epoch": 0.0664, "step": 166, "tokens_per_device": 1640 }, { "epoch": 0.0664, "loss_ce": 0.5657286643981934, "loss_lvr": 1.2003623247146606, "loss_mode_switch": 0.0, "loss_total": 0.6857649087905884, "step": 166 }, { "epoch": 0.0668, "grad_norm": 1.69370698928833, "learning_rate": 9.964528647082447e-06, "loss": 0.3979, "step": 167 }, { "batch_size": 1, "epoch": 0.0668, "step": 167, "tokens_per_device": 5165 }, { "epoch": 0.0668, "loss_ce": 0.023435717448592186, "loss_lvr": 0.8533623814582825, "loss_mode_switch": 0.0, "loss_total": 0.10877195745706558, "step": 167 }, { "batch_size": 4, "epoch": 0.0668, "step": 167, "tokens_per_device": 4832 }, { "epoch": 0.0668, "loss_ce": 0.6687399744987488, "loss_lvr": 1.0551420450210571, "loss_mode_switch": 0.0, "loss_total": 0.7742542028427124, "step": 167 }, { "batch_size": 4, "epoch": 0.0668, "step": 167, "tokens_per_device": 8572 }, { "epoch": 0.0668, "loss_ce": 0.6430099606513977, "loss_lvr": 0.8429785966873169, "loss_mode_switch": 0.0, "loss_total": 0.7273077964782715, "step": 167 }, { "batch_size": 1, "epoch": 0.0668, "step": 167, "tokens_per_device": 4747 }, { "epoch": 0.0668, "loss_ce": 0.0063357665203511715, "loss_lvr": 1.471962571144104, "loss_mode_switch": 0.0, "loss_total": 0.1535320281982422, "step": 167 }, { "batch_size": 4, "epoch": 0.0668, "step": 167, "tokens_per_device": 2644 }, { "epoch": 0.0668, "loss_ce": 0.3957849442958832, "loss_lvr": 1.497280478477478, "loss_mode_switch": 0.0, "loss_total": 0.5455129742622375, "step": 167 }, { "batch_size": 1, "epoch": 0.0668, "step": 167, "tokens_per_device": 5105 }, { "epoch": 0.0668, "loss_ce": 0.02210131101310253, "loss_lvr": 0.41232043504714966, "loss_mode_switch": 0.0, "loss_total": 0.06333335489034653, "step": 167 }, { "batch_size": 1, "epoch": 0.0668, "step": 167, "tokens_per_device": 4948 }, { "epoch": 0.0668, "loss_ce": 0.01868462562561035, "loss_lvr": 0.3731095790863037, "loss_mode_switch": 0.0, "loss_total": 0.05599558353424072, "step": 167 }, { "batch_size": 4, "epoch": 0.0668, "step": 167, "tokens_per_device": 1700 }, { "epoch": 0.0668, "loss_ce": 0.9234286546707153, "loss_lvr": 1.1105971336364746, "loss_mode_switch": 0.0, "loss_total": 1.034488320350647, "step": 167 }, { "epoch": 0.0672, "grad_norm": 1.8341293334960938, "learning_rate": 9.96375427793018e-06, "loss": 0.358, "step": 168 }, { "batch_size": 1, "epoch": 0.0672, "step": 168, "tokens_per_device": 5732 }, { "epoch": 0.0672, "loss_ce": 0.011114037595689297, "loss_lvr": 0.8102843761444092, "loss_mode_switch": 0.0, "loss_total": 0.09214247763156891, "step": 168 }, { "batch_size": 4, "epoch": 0.0672, "step": 168, "tokens_per_device": 4272 }, { "epoch": 0.0672, "loss_ce": 0.21062561869621277, "loss_lvr": 2.0814101696014404, "loss_mode_switch": 0.0, "loss_total": 0.4187666177749634, "step": 168 }, { "batch_size": 4, "epoch": 0.0672, "step": 168, "tokens_per_device": 1648 }, { "epoch": 0.0672, "loss_ce": 0.34337177872657776, "loss_lvr": 1.4021344184875488, "loss_mode_switch": 0.0, "loss_total": 0.4835852384567261, "step": 168 }, { "batch_size": 4, "epoch": 0.0672, "step": 168, "tokens_per_device": 3236 }, { "epoch": 0.0672, "loss_ce": 0.45782434940338135, "loss_lvr": 1.104310154914856, "loss_mode_switch": 0.0, "loss_total": 0.5682553648948669, "step": 168 }, { "batch_size": 1, "epoch": 0.0672, "step": 168, "tokens_per_device": 5110 }, { "epoch": 0.0672, "loss_ce": 0.05846058204770088, "loss_lvr": 1.807538390159607, "loss_mode_switch": 0.0, "loss_total": 0.23921442031860352, "step": 168 }, { "batch_size": 1, "epoch": 0.0672, "step": 168, "tokens_per_device": 4742 }, { "epoch": 0.0672, "loss_ce": 0.010949213989078999, "loss_lvr": 1.1792593002319336, "loss_mode_switch": 0.0, "loss_total": 0.12887515127658844, "step": 168 }, { "batch_size": 4, "epoch": 0.0672, "step": 168, "tokens_per_device": 4984 }, { "epoch": 0.0672, "loss_ce": 0.08046060800552368, "loss_lvr": 0.9221054911613464, "loss_mode_switch": 0.0, "loss_total": 0.17267116904258728, "step": 168 }, { "batch_size": 4, "epoch": 0.0672, "step": 168, "tokens_per_device": 5308 }, { "epoch": 0.0672, "loss_ce": 0.28031423687934875, "loss_lvr": 0.9661112427711487, "loss_mode_switch": 0.0, "loss_total": 0.37692534923553467, "step": 168 }, { "epoch": 0.0676, "grad_norm": 1.5480536222457886, "learning_rate": 9.962971577982428e-06, "loss": 0.3152, "step": 169 }, { "batch_size": 4, "epoch": 0.0676, "step": 169, "tokens_per_device": 2680 }, { "epoch": 0.0676, "loss_ce": 0.1882074475288391, "loss_lvr": 0.9962616562843323, "loss_mode_switch": 0.0, "loss_total": 0.28783363103866577, "step": 169 }, { "batch_size": 4, "epoch": 0.0676, "step": 169, "tokens_per_device": 12412 }, { "epoch": 0.0676, "loss_ce": 0.28561869263648987, "loss_lvr": 1.6173133850097656, "loss_mode_switch": 0.0, "loss_total": 0.44735002517700195, "step": 169 }, { "batch_size": 4, "epoch": 0.0676, "step": 169, "tokens_per_device": 4228 }, { "epoch": 0.0676, "loss_ce": 0.1474572718143463, "loss_lvr": 1.949873685836792, "loss_mode_switch": 0.0, "loss_total": 0.34244465827941895, "step": 169 }, { "batch_size": 1, "epoch": 0.0676, "step": 169, "tokens_per_device": 5022 }, { "epoch": 0.0676, "loss_ce": 0.12991294264793396, "loss_lvr": 1.0693628787994385, "loss_mode_switch": 0.0, "loss_total": 0.23684923350811005, "step": 169 }, { "batch_size": 4, "epoch": 0.0676, "step": 169, "tokens_per_device": 8868 }, { "epoch": 0.0676, "loss_ce": 0.5289425253868103, "loss_lvr": 1.1964917182922363, "loss_mode_switch": 0.0, "loss_total": 0.6485916972160339, "step": 169 }, { "batch_size": 1, "epoch": 0.0676, "step": 169, "tokens_per_device": 4852 }, { "epoch": 0.0676, "loss_ce": 0.11319476366043091, "loss_lvr": 0.4613303244113922, "loss_mode_switch": 0.0, "loss_total": 0.15932780504226685, "step": 169 }, { "batch_size": 4, "epoch": 0.0676, "step": 169, "tokens_per_device": 2756 }, { "epoch": 0.0676, "loss_ce": 0.3327353894710541, "loss_lvr": 0.9665481448173523, "loss_mode_switch": 0.0, "loss_total": 0.42939019203186035, "step": 169 }, { "batch_size": 1, "epoch": 0.0676, "step": 169, "tokens_per_device": 5169 }, { "epoch": 0.0676, "loss_ce": 0.05495114251971245, "loss_lvr": 1.2825545072555542, "loss_mode_switch": 0.0, "loss_total": 0.18320660293102264, "step": 169 }, { "epoch": 0.068, "grad_norm": 1.6161566972732544, "learning_rate": 9.962180548552812e-06, "loss": 0.3734, "step": 170 }, { "batch_size": 4, "epoch": 0.068, "step": 170, "tokens_per_device": 15164 }, { "epoch": 0.068, "loss_ce": 0.753241240978241, "loss_lvr": 1.2879185676574707, "loss_mode_switch": 0.0, "loss_total": 0.882033109664917, "step": 170 }, { "batch_size": 4, "epoch": 0.068, "step": 170, "tokens_per_device": 5828 }, { "epoch": 0.068, "loss_ce": 0.22646117210388184, "loss_lvr": 1.031978726387024, "loss_mode_switch": 0.0, "loss_total": 0.32965904474258423, "step": 170 }, { "batch_size": 4, "epoch": 0.068, "step": 170, "tokens_per_device": 4008 }, { "epoch": 0.068, "loss_ce": 0.1676587015390396, "loss_lvr": 1.576751708984375, "loss_mode_switch": 0.0, "loss_total": 0.3253338932991028, "step": 170 }, { "batch_size": 4, "epoch": 0.068, "step": 170, "tokens_per_device": 4248 }, { "epoch": 0.068, "loss_ce": 0.05533526837825775, "loss_lvr": 1.1409794092178345, "loss_mode_switch": 0.0, "loss_total": 0.16943320631980896, "step": 170 }, { "batch_size": 4, "epoch": 0.068, "step": 170, "tokens_per_device": 1668 }, { "epoch": 0.068, "loss_ce": 0.6292902827262878, "loss_lvr": 1.260876178741455, "loss_mode_switch": 0.0, "loss_total": 0.7553778886795044, "step": 170 }, { "batch_size": 4, "epoch": 0.068, "step": 170, "tokens_per_device": 1380 }, { "epoch": 0.068, "loss_ce": 0.3762865364551544, "loss_lvr": 1.4640201330184937, "loss_mode_switch": 0.0, "loss_total": 0.5226885676383972, "step": 170 }, { "batch_size": 4, "epoch": 0.068, "step": 170, "tokens_per_device": 9792 }, { "epoch": 0.068, "loss_ce": 0.5438784956932068, "loss_lvr": 1.3197224140167236, "loss_mode_switch": 0.0, "loss_total": 0.6758507490158081, "step": 170 }, { "batch_size": 4, "epoch": 0.068, "step": 170, "tokens_per_device": 1676 }, { "epoch": 0.068, "loss_ce": 0.5362406969070435, "loss_lvr": 1.3875759840011597, "loss_mode_switch": 0.0, "loss_total": 0.6749982833862305, "step": 170 }, { "epoch": 0.0684, "grad_norm": 1.6310739517211914, "learning_rate": 9.96138119096894e-06, "loss": 0.4079, "step": 171 }, { "batch_size": 4, "epoch": 0.0684, "step": 171, "tokens_per_device": 3800 }, { "epoch": 0.0684, "loss_ce": 0.053261902183294296, "loss_lvr": 1.530387043952942, "loss_mode_switch": 0.0, "loss_total": 0.20630061626434326, "step": 171 }, { "batch_size": 4, "epoch": 0.0684, "step": 171, "tokens_per_device": 4212 }, { "epoch": 0.0684, "loss_ce": 0.8380310535430908, "loss_lvr": 1.3940244913101196, "loss_mode_switch": 0.0, "loss_total": 0.9774335026741028, "step": 171 }, { "batch_size": 1, "epoch": 0.0684, "step": 171, "tokens_per_device": 4889 }, { "epoch": 0.0684, "loss_ce": 0.04904548078775406, "loss_lvr": 0.7669634819030762, "loss_mode_switch": 0.0, "loss_total": 0.1257418394088745, "step": 171 }, { "batch_size": 4, "epoch": 0.0684, "step": 171, "tokens_per_device": 1684 }, { "epoch": 0.0684, "loss_ce": 0.5436164140701294, "loss_lvr": 1.1550122499465942, "loss_mode_switch": 0.0, "loss_total": 0.6591176390647888, "step": 171 }, { "batch_size": 4, "epoch": 0.0684, "step": 171, "tokens_per_device": 4264 }, { "epoch": 0.0684, "loss_ce": 0.22840766608715057, "loss_lvr": 1.3981194496154785, "loss_mode_switch": 0.0, "loss_total": 0.36821961402893066, "step": 171 }, { "batch_size": 1, "epoch": 0.0684, "step": 171, "tokens_per_device": 5193 }, { "epoch": 0.0684, "loss_ce": 0.047197192907333374, "loss_lvr": 1.3366059064865112, "loss_mode_switch": 0.0, "loss_total": 0.1808577924966812, "step": 171 }, { "batch_size": 4, "epoch": 0.0684, "step": 171, "tokens_per_device": 4160 }, { "epoch": 0.0684, "loss_ce": 0.4072568714618683, "loss_lvr": 1.1952396631240845, "loss_mode_switch": 0.0, "loss_total": 0.5267808437347412, "step": 171 }, { "batch_size": 4, "epoch": 0.0684, "step": 171, "tokens_per_device": 1296 }, { "epoch": 0.0684, "loss_ce": 0.6904906630516052, "loss_lvr": 1.993910312652588, "loss_mode_switch": 0.0, "loss_total": 0.8898817300796509, "step": 171 }, { "epoch": 0.0688, "grad_norm": 2.3187620639801025, "learning_rate": 9.960573506572391e-06, "loss": 0.3624, "step": 172 }, { "batch_size": 4, "epoch": 0.0688, "step": 172, "tokens_per_device": 1824 }, { "epoch": 0.0688, "loss_ce": 0.24457944929599762, "loss_lvr": 1.4204628467559814, "loss_mode_switch": 0.0, "loss_total": 0.386625736951828, "step": 172 }, { "batch_size": 4, "epoch": 0.0688, "step": 172, "tokens_per_device": 4248 }, { "epoch": 0.0688, "loss_ce": 0.49811655282974243, "loss_lvr": 1.3054304122924805, "loss_mode_switch": 0.0, "loss_total": 0.6286596059799194, "step": 172 }, { "batch_size": 1, "epoch": 0.0688, "step": 172, "tokens_per_device": 4142 }, { "epoch": 0.0688, "loss_ce": 0.010937772691249847, "loss_lvr": 1.394428014755249, "loss_mode_switch": 0.0, "loss_total": 0.15038058161735535, "step": 172 }, { "batch_size": 1, "epoch": 0.0688, "step": 172, "tokens_per_device": 5194 }, { "epoch": 0.0688, "loss_ce": 0.16072262823581696, "loss_lvr": 0.3804105222225189, "loss_mode_switch": 0.0, "loss_total": 0.1987636834383011, "step": 172 }, { "batch_size": 1, "epoch": 0.0688, "step": 172, "tokens_per_device": 5135 }, { "epoch": 0.0688, "loss_ce": 0.005018997471779585, "loss_lvr": 0.3459712862968445, "loss_mode_switch": 0.0, "loss_total": 0.03961612656712532, "step": 172 }, { "batch_size": 4, "epoch": 0.0688, "step": 172, "tokens_per_device": 2548 }, { "epoch": 0.0688, "loss_ce": 0.5307923555374146, "loss_lvr": 1.5033624172210693, "loss_mode_switch": 0.0, "loss_total": 0.6811286211013794, "step": 172 }, { "batch_size": 4, "epoch": 0.0688, "step": 172, "tokens_per_device": 3856 }, { "epoch": 0.0688, "loss_ce": 0.22549666464328766, "loss_lvr": 1.5353336334228516, "loss_mode_switch": 0.0, "loss_total": 0.3790300488471985, "step": 172 }, { "batch_size": 4, "epoch": 0.0688, "step": 172, "tokens_per_device": 5684 }, { "epoch": 0.0688, "loss_ce": 0.29164963960647583, "loss_lvr": 1.444493055343628, "loss_mode_switch": 0.0, "loss_total": 0.43609893321990967, "step": 172 }, { "epoch": 0.0692, "grad_norm": 1.7123408317565918, "learning_rate": 9.959757496718723e-06, "loss": 0.3659, "step": 173 }, { "batch_size": 1, "epoch": 0.0692, "step": 173, "tokens_per_device": 4892 }, { "epoch": 0.0692, "loss_ce": 0.01333101186901331, "loss_lvr": 1.0194082260131836, "loss_mode_switch": 0.0, "loss_total": 0.11527183651924133, "step": 173 }, { "batch_size": 4, "epoch": 0.0692, "step": 173, "tokens_per_device": 6120 }, { "epoch": 0.0692, "loss_ce": 0.0483500137925148, "loss_lvr": 1.261391043663025, "loss_mode_switch": 0.0, "loss_total": 0.1744891107082367, "step": 173 }, { "batch_size": 4, "epoch": 0.0692, "step": 173, "tokens_per_device": 2772 }, { "epoch": 0.0692, "loss_ce": 0.49234113097190857, "loss_lvr": 0.9954314827919006, "loss_mode_switch": 0.0, "loss_total": 0.5918842554092407, "step": 173 }, { "batch_size": 1, "epoch": 0.0692, "step": 173, "tokens_per_device": 5152 }, { "epoch": 0.0692, "loss_ce": 0.02063143439590931, "loss_lvr": 0.902269721031189, "loss_mode_switch": 0.0, "loss_total": 0.11085840314626694, "step": 173 }, { "batch_size": 4, "epoch": 0.0692, "step": 173, "tokens_per_device": 1496 }, { "epoch": 0.0692, "loss_ce": 0.10244429111480713, "loss_lvr": 1.341766357421875, "loss_mode_switch": 0.0, "loss_total": 0.2366209328174591, "step": 173 }, { "batch_size": 4, "epoch": 0.0692, "step": 173, "tokens_per_device": 11128 }, { "epoch": 0.0692, "loss_ce": 0.6641092896461487, "loss_lvr": 0.8567756414413452, "loss_mode_switch": 0.0, "loss_total": 0.7497868537902832, "step": 173 }, { "batch_size": 4, "epoch": 0.0692, "step": 173, "tokens_per_device": 5372 }, { "epoch": 0.0692, "loss_ce": 0.32435694336891174, "loss_lvr": 1.0446114540100098, "loss_mode_switch": 0.0, "loss_total": 0.42881810665130615, "step": 173 }, { "batch_size": 4, "epoch": 0.0692, "step": 173, "tokens_per_device": 3816 }, { "epoch": 0.0692, "loss_ce": 0.28613725304603577, "loss_lvr": 1.4448753595352173, "loss_mode_switch": 0.0, "loss_total": 0.430624783039093, "step": 173 }, { "epoch": 0.0696, "grad_norm": 1.5524438619613647, "learning_rate": 9.958933162777468e-06, "loss": 0.3718, "step": 174 }, { "batch_size": 1, "epoch": 0.0696, "step": 174, "tokens_per_device": 5100 }, { "epoch": 0.0696, "loss_ce": 0.01564769074320793, "loss_lvr": 1.0314661264419556, "loss_mode_switch": 0.0, "loss_total": 0.11879430711269379, "step": 174 }, { "batch_size": 4, "epoch": 0.0696, "step": 174, "tokens_per_device": 4596 }, { "epoch": 0.0696, "loss_ce": 0.11094263941049576, "loss_lvr": 0.8541259169578552, "loss_mode_switch": 0.0, "loss_total": 0.19635522365570068, "step": 174 }, { "batch_size": 4, "epoch": 0.0696, "step": 174, "tokens_per_device": 7912 }, { "epoch": 0.0696, "loss_ce": 0.3486528992652893, "loss_lvr": 1.211955189704895, "loss_mode_switch": 0.0, "loss_total": 0.4698484241962433, "step": 174 }, { "batch_size": 4, "epoch": 0.0696, "step": 174, "tokens_per_device": 4652 }, { "epoch": 0.0696, "loss_ce": 0.19444981217384338, "loss_lvr": 1.2198538780212402, "loss_mode_switch": 0.0, "loss_total": 0.31643521785736084, "step": 174 }, { "batch_size": 4, "epoch": 0.0696, "step": 174, "tokens_per_device": 5664 }, { "epoch": 0.0696, "loss_ce": 0.3371671438217163, "loss_lvr": 0.8568983674049377, "loss_mode_switch": 0.0, "loss_total": 0.42285698652267456, "step": 174 }, { "batch_size": 4, "epoch": 0.0696, "step": 174, "tokens_per_device": 2572 }, { "epoch": 0.0696, "loss_ce": 0.6511062383651733, "loss_lvr": 1.655781626701355, "loss_mode_switch": 0.0, "loss_total": 0.8166844248771667, "step": 174 }, { "batch_size": 4, "epoch": 0.0696, "step": 174, "tokens_per_device": 1400 }, { "epoch": 0.0696, "loss_ce": 0.4834129214286804, "loss_lvr": 1.252236008644104, "loss_mode_switch": 0.0, "loss_total": 0.6086364984512329, "step": 174 }, { "batch_size": 4, "epoch": 0.0696, "step": 174, "tokens_per_device": 4368 }, { "epoch": 0.0696, "loss_ce": 0.2285010665655136, "loss_lvr": 1.2465457916259766, "loss_mode_switch": 0.0, "loss_total": 0.35315564274787903, "step": 174 }, { "epoch": 0.07, "grad_norm": 1.8904733657836914, "learning_rate": 9.958100506132127e-06, "loss": 0.3468, "step": 175 }, { "batch_size": 1, "epoch": 0.07, "step": 175, "tokens_per_device": 4986 }, { "epoch": 0.07, "loss_ce": 0.093579962849617, "loss_lvr": 0.8489606380462646, "loss_mode_switch": 0.0, "loss_total": 0.17847603559494019, "step": 175 }, { "batch_size": 1, "epoch": 0.07, "step": 175, "tokens_per_device": 4732 }, { "epoch": 0.07, "loss_ce": 0.019448451697826385, "loss_lvr": 1.2050049304962158, "loss_mode_switch": 0.0, "loss_total": 0.13994894921779633, "step": 175 }, { "batch_size": 4, "epoch": 0.07, "step": 175, "tokens_per_device": 5736 }, { "epoch": 0.07, "loss_ce": 0.6991085410118103, "loss_lvr": 0.7997556328773499, "loss_mode_switch": 0.0, "loss_total": 0.7790840864181519, "step": 175 }, { "batch_size": 4, "epoch": 0.07, "step": 175, "tokens_per_device": 4384 }, { "epoch": 0.07, "loss_ce": 0.711153507232666, "loss_lvr": 0.6472577452659607, "loss_mode_switch": 0.0, "loss_total": 0.7758792638778687, "step": 175 }, { "batch_size": 4, "epoch": 0.07, "step": 175, "tokens_per_device": 4440 }, { "epoch": 0.07, "loss_ce": 0.44275614619255066, "loss_lvr": 1.3070802688598633, "loss_mode_switch": 0.0, "loss_total": 0.5734641551971436, "step": 175 }, { "batch_size": 4, "epoch": 0.07, "step": 175, "tokens_per_device": 3760 }, { "epoch": 0.07, "loss_ce": 0.36429283022880554, "loss_lvr": 1.0217093229293823, "loss_mode_switch": 0.0, "loss_total": 0.46646377444267273, "step": 175 }, { "batch_size": 1, "epoch": 0.07, "step": 175, "tokens_per_device": 5109 }, { "epoch": 0.07, "loss_ce": 0.02441580593585968, "loss_lvr": 0.6707730293273926, "loss_mode_switch": 0.0, "loss_total": 0.09149310737848282, "step": 175 }, { "batch_size": 4, "epoch": 0.07, "step": 175, "tokens_per_device": 1940 }, { "epoch": 0.07, "loss_ce": 0.5576189160346985, "loss_lvr": 1.3836889266967773, "loss_mode_switch": 0.0, "loss_total": 0.6959878206253052, "step": 175 }, { "epoch": 0.0704, "grad_norm": 2.1008472442626953, "learning_rate": 9.957259528180166e-06, "loss": 0.3845, "step": 176 }, { "batch_size": 4, "epoch": 0.0704, "step": 176, "tokens_per_device": 4920 }, { "epoch": 0.0704, "loss_ce": 0.04945860058069229, "loss_lvr": 1.1502553224563599, "loss_mode_switch": 0.0, "loss_total": 0.1644841432571411, "step": 176 }, { "batch_size": 4, "epoch": 0.0704, "step": 176, "tokens_per_device": 5812 }, { "epoch": 0.0704, "loss_ce": 0.05571186915040016, "loss_lvr": 0.9521036744117737, "loss_mode_switch": 0.0, "loss_total": 0.1509222388267517, "step": 176 }, { "batch_size": 4, "epoch": 0.0704, "step": 176, "tokens_per_device": 4544 }, { "epoch": 0.0704, "loss_ce": 0.28835079073905945, "loss_lvr": 0.9510524272918701, "loss_mode_switch": 0.0, "loss_total": 0.3834560513496399, "step": 176 }, { "batch_size": 4, "epoch": 0.0704, "step": 176, "tokens_per_device": 1732 }, { "epoch": 0.0704, "loss_ce": 0.7268524169921875, "loss_lvr": 1.2925503253936768, "loss_mode_switch": 0.0, "loss_total": 0.8561074733734131, "step": 176 }, { "batch_size": 4, "epoch": 0.0704, "step": 176, "tokens_per_device": 6076 }, { "epoch": 0.0704, "loss_ce": 0.2432427853345871, "loss_lvr": 0.9663426876068115, "loss_mode_switch": 0.0, "loss_total": 0.33987706899642944, "step": 176 }, { "batch_size": 4, "epoch": 0.0704, "step": 176, "tokens_per_device": 7184 }, { "epoch": 0.0704, "loss_ce": 0.6224603056907654, "loss_lvr": 1.0748847723007202, "loss_mode_switch": 0.0, "loss_total": 0.7299487590789795, "step": 176 }, { "batch_size": 1, "epoch": 0.0704, "step": 176, "tokens_per_device": 5183 }, { "epoch": 0.0704, "loss_ce": 0.047293856739997864, "loss_lvr": 1.3497036695480347, "loss_mode_switch": 0.0, "loss_total": 0.18226422369480133, "step": 176 }, { "batch_size": 4, "epoch": 0.0704, "step": 176, "tokens_per_device": 4252 }, { "epoch": 0.0704, "loss_ce": 0.8244686126708984, "loss_lvr": 1.6631947755813599, "loss_mode_switch": 0.0, "loss_total": 0.9907881021499634, "step": 176 }, { "epoch": 0.0708, "grad_norm": 1.7855749130249023, "learning_rate": 9.956410230333023e-06, "loss": 0.3403, "step": 177 }, { "batch_size": 4, "epoch": 0.0708, "step": 177, "tokens_per_device": 4700 }, { "epoch": 0.0708, "loss_ce": 0.21252253651618958, "loss_lvr": 0.9706394076347351, "loss_mode_switch": 0.0, "loss_total": 0.30958646535873413, "step": 177 }, { "batch_size": 4, "epoch": 0.0708, "step": 177, "tokens_per_device": 3596 }, { "epoch": 0.0708, "loss_ce": 0.0384550616145134, "loss_lvr": 1.4708807468414307, "loss_mode_switch": 0.0, "loss_total": 0.18554314970970154, "step": 177 }, { "batch_size": 4, "epoch": 0.0708, "step": 177, "tokens_per_device": 4224 }, { "epoch": 0.0708, "loss_ce": 0.24555404484272003, "loss_lvr": 1.6874431371688843, "loss_mode_switch": 0.0, "loss_total": 0.4142983555793762, "step": 177 }, { "batch_size": 4, "epoch": 0.0708, "step": 177, "tokens_per_device": 9616 }, { "epoch": 0.0708, "loss_ce": 0.37176620960235596, "loss_lvr": 1.1479346752166748, "loss_mode_switch": 0.0, "loss_total": 0.4865596890449524, "step": 177 }, { "batch_size": 4, "epoch": 0.0708, "step": 177, "tokens_per_device": 2712 }, { "epoch": 0.0708, "loss_ce": 0.12703882157802582, "loss_lvr": 0.9599010944366455, "loss_mode_switch": 0.0, "loss_total": 0.22302892804145813, "step": 177 }, { "batch_size": 4, "epoch": 0.0708, "step": 177, "tokens_per_device": 3996 }, { "epoch": 0.0708, "loss_ce": 0.33734023571014404, "loss_lvr": 1.2922101020812988, "loss_mode_switch": 0.0, "loss_total": 0.4665612578392029, "step": 177 }, { "batch_size": 4, "epoch": 0.0708, "step": 177, "tokens_per_device": 3892 }, { "epoch": 0.0708, "loss_ce": 0.33596888184547424, "loss_lvr": 1.158991813659668, "loss_mode_switch": 0.0, "loss_total": 0.45186805725097656, "step": 177 }, { "batch_size": 4, "epoch": 0.0708, "step": 177, "tokens_per_device": 4304 }, { "epoch": 0.0708, "loss_ce": 0.003097220091149211, "loss_lvr": 1.045160174369812, "loss_mode_switch": 0.0, "loss_total": 0.10761324316263199, "step": 177 }, { "epoch": 0.0712, "grad_norm": 2.0437052249908447, "learning_rate": 9.955552614016093e-06, "loss": 0.3144, "step": 178 }, { "batch_size": 4, "epoch": 0.0712, "step": 178, "tokens_per_device": 12152 }, { "epoch": 0.0712, "loss_ce": 0.3692132532596588, "loss_lvr": 1.4046707153320312, "loss_mode_switch": 0.0, "loss_total": 0.5096803307533264, "step": 178 }, { "batch_size": 1, "epoch": 0.0712, "step": 178, "tokens_per_device": 4901 }, { "epoch": 0.0712, "loss_ce": 0.07434353977441788, "loss_lvr": 0.6826249957084656, "loss_mode_switch": 0.0, "loss_total": 0.14260604977607727, "step": 178 }, { "batch_size": 4, "epoch": 0.0712, "step": 178, "tokens_per_device": 4224 }, { "epoch": 0.0712, "loss_ce": 0.3854885697364807, "loss_lvr": 1.3474076986312866, "loss_mode_switch": 0.0, "loss_total": 0.5202293395996094, "step": 178 }, { "batch_size": 4, "epoch": 0.0712, "step": 178, "tokens_per_device": 5904 }, { "epoch": 0.0712, "loss_ce": 0.04752723500132561, "loss_lvr": 1.25173819065094, "loss_mode_switch": 0.0, "loss_total": 0.17270106077194214, "step": 178 }, { "batch_size": 4, "epoch": 0.0712, "step": 178, "tokens_per_device": 3744 }, { "epoch": 0.0712, "loss_ce": 0.19846849143505096, "loss_lvr": 1.5618642568588257, "loss_mode_switch": 0.0, "loss_total": 0.3546549081802368, "step": 178 }, { "batch_size": 4, "epoch": 0.0712, "step": 178, "tokens_per_device": 4272 }, { "epoch": 0.0712, "loss_ce": 0.25992199778556824, "loss_lvr": 1.6738712787628174, "loss_mode_switch": 0.0, "loss_total": 0.42730912566185, "step": 178 }, { "batch_size": 4, "epoch": 0.0712, "step": 178, "tokens_per_device": 9804 }, { "epoch": 0.0712, "loss_ce": 0.024075817316770554, "loss_lvr": 1.3064384460449219, "loss_mode_switch": 0.0, "loss_total": 0.15471966564655304, "step": 178 }, { "batch_size": 4, "epoch": 0.0712, "step": 178, "tokens_per_device": 4280 }, { "epoch": 0.0712, "loss_ce": 0.31540051102638245, "loss_lvr": 1.9179123640060425, "loss_mode_switch": 0.0, "loss_total": 0.5071917772293091, "step": 178 }, { "epoch": 0.0716, "grad_norm": 2.122288465499878, "learning_rate": 9.954686680668737e-06, "loss": 0.4123, "step": 179 }, { "batch_size": 4, "epoch": 0.0716, "step": 179, "tokens_per_device": 8368 }, { "epoch": 0.0716, "loss_ce": 0.2915196120738983, "loss_lvr": 1.0908002853393555, "loss_mode_switch": 0.0, "loss_total": 0.4005996584892273, "step": 179 }, { "batch_size": 4, "epoch": 0.0716, "step": 179, "tokens_per_device": 1332 }, { "epoch": 0.0716, "loss_ce": 0.28854718804359436, "loss_lvr": 1.6479716300964355, "loss_mode_switch": 0.0, "loss_total": 0.45334434509277344, "step": 179 }, { "batch_size": 4, "epoch": 0.0716, "step": 179, "tokens_per_device": 6440 }, { "epoch": 0.0716, "loss_ce": 0.24845004081726074, "loss_lvr": 0.8738582134246826, "loss_mode_switch": 0.0, "loss_total": 0.33583587408065796, "step": 179 }, { "batch_size": 1, "epoch": 0.0716, "step": 179, "tokens_per_device": 5158 }, { "epoch": 0.0716, "loss_ce": 0.03793817386031151, "loss_lvr": 0.7887166738510132, "loss_mode_switch": 0.0, "loss_total": 0.11680984497070312, "step": 179 }, { "batch_size": 4, "epoch": 0.0716, "step": 179, "tokens_per_device": 4224 }, { "epoch": 0.0716, "loss_ce": 0.4479076862335205, "loss_lvr": 1.405505657196045, "loss_mode_switch": 0.0, "loss_total": 0.588458240032196, "step": 179 }, { "batch_size": 4, "epoch": 0.0716, "step": 179, "tokens_per_device": 4220 }, { "epoch": 0.0716, "loss_ce": 0.03793375566601753, "loss_lvr": 1.4686895608901978, "loss_mode_switch": 0.0, "loss_total": 0.18480271100997925, "step": 179 }, { "batch_size": 4, "epoch": 0.0716, "step": 179, "tokens_per_device": 1340 }, { "epoch": 0.0716, "loss_ce": 0.5270034074783325, "loss_lvr": 1.3434159755706787, "loss_mode_switch": 0.0, "loss_total": 0.6613450050354004, "step": 179 }, { "batch_size": 1, "epoch": 0.0716, "step": 179, "tokens_per_device": 4738 }, { "epoch": 0.0716, "loss_ce": 0.015640638768672943, "loss_lvr": 1.2574633359909058, "loss_mode_switch": 0.0, "loss_total": 0.1413869857788086, "step": 179 }, { "epoch": 0.072, "grad_norm": 1.5619505643844604, "learning_rate": 9.953812431744274e-06, "loss": 0.3358, "step": 180 }, { "batch_size": 4, "epoch": 0.072, "step": 180, "tokens_per_device": 4188 }, { "epoch": 0.072, "loss_ce": 0.4917488098144531, "loss_lvr": 1.504562497138977, "loss_mode_switch": 0.0, "loss_total": 0.6422050595283508, "step": 180 }, { "batch_size": 1, "epoch": 0.072, "step": 180, "tokens_per_device": 5068 }, { "epoch": 0.072, "loss_ce": 0.0018171067349612713, "loss_lvr": 1.3387001752853394, "loss_mode_switch": 0.0, "loss_total": 0.13568712770938873, "step": 180 }, { "batch_size": 1, "epoch": 0.072, "step": 180, "tokens_per_device": 5495 }, { "epoch": 0.072, "loss_ce": 0.003746054135262966, "loss_lvr": 0.8091279864311218, "loss_mode_switch": 0.0, "loss_total": 0.08465885370969772, "step": 180 }, { "batch_size": 4, "epoch": 0.072, "step": 180, "tokens_per_device": 4852 }, { "epoch": 0.072, "loss_ce": 0.13259783387184143, "loss_lvr": 1.2133091688156128, "loss_mode_switch": 0.0, "loss_total": 0.2539287507534027, "step": 180 }, { "batch_size": 4, "epoch": 0.072, "step": 180, "tokens_per_device": 5288 }, { "epoch": 0.072, "loss_ce": 0.05394743010401726, "loss_lvr": 1.1755344867706299, "loss_mode_switch": 0.0, "loss_total": 0.17150087654590607, "step": 180 }, { "batch_size": 1, "epoch": 0.072, "step": 180, "tokens_per_device": 5023 }, { "epoch": 0.072, "loss_ce": 0.003125046845525503, "loss_lvr": 0.5175632238388062, "loss_mode_switch": 0.0, "loss_total": 0.054881367832422256, "step": 180 }, { "batch_size": 1, "epoch": 0.072, "step": 180, "tokens_per_device": 4867 }, { "epoch": 0.072, "loss_ce": 0.006001166068017483, "loss_lvr": 0.5932903289794922, "loss_mode_switch": 0.0, "loss_total": 0.06533019989728928, "step": 180 }, { "batch_size": 4, "epoch": 0.072, "step": 180, "tokens_per_device": 4700 }, { "epoch": 0.072, "loss_ce": 0.016829758882522583, "loss_lvr": 1.440525770187378, "loss_mode_switch": 0.0, "loss_total": 0.16088233888149261, "step": 180 }, { "epoch": 0.0724, "grad_norm": 1.5447955131530762, "learning_rate": 9.95292986870998e-06, "loss": 0.3682, "step": 181 }, { "batch_size": 1, "epoch": 0.0724, "step": 181, "tokens_per_device": 4893 }, { "epoch": 0.0724, "loss_ce": 0.12970440089702606, "loss_lvr": 0.41615769267082214, "loss_mode_switch": 0.0, "loss_total": 0.17132017016410828, "step": 181 }, { "batch_size": 4, "epoch": 0.0724, "step": 181, "tokens_per_device": 4500 }, { "epoch": 0.0724, "loss_ce": 0.24845927953720093, "loss_lvr": 1.089044213294983, "loss_mode_switch": 0.0, "loss_total": 0.3573637008666992, "step": 181 }, { "batch_size": 1, "epoch": 0.0724, "step": 181, "tokens_per_device": 5063 }, { "epoch": 0.0724, "loss_ce": 0.01142908539623022, "loss_lvr": 1.1145061254501343, "loss_mode_switch": 0.0, "loss_total": 0.12287969887256622, "step": 181 }, { "batch_size": 4, "epoch": 0.0724, "step": 181, "tokens_per_device": 2672 }, { "epoch": 0.0724, "loss_ce": 0.3675679862499237, "loss_lvr": 1.3574128150939941, "loss_mode_switch": 0.0, "loss_total": 0.5033092498779297, "step": 181 }, { "batch_size": 1, "epoch": 0.0724, "step": 181, "tokens_per_device": 5118 }, { "epoch": 0.0724, "loss_ce": 0.08760011941194534, "loss_lvr": 0.3819504678249359, "loss_mode_switch": 0.0, "loss_total": 0.1257951706647873, "step": 181 }, { "batch_size": 4, "epoch": 0.0724, "step": 181, "tokens_per_device": 3796 }, { "epoch": 0.0724, "loss_ce": 0.6672126650810242, "loss_lvr": 1.1873024702072144, "loss_mode_switch": 0.0, "loss_total": 0.7859429121017456, "step": 181 }, { "batch_size": 4, "epoch": 0.0724, "step": 181, "tokens_per_device": 2600 }, { "epoch": 0.0724, "loss_ce": 0.12543272972106934, "loss_lvr": 1.0693068504333496, "loss_mode_switch": 0.0, "loss_total": 0.23236341774463654, "step": 181 }, { "batch_size": 4, "epoch": 0.0724, "step": 181, "tokens_per_device": 2632 }, { "epoch": 0.0724, "loss_ce": 0.7415429353713989, "loss_lvr": 1.147292137145996, "loss_mode_switch": 0.0, "loss_total": 0.8562721610069275, "step": 181 }, { "epoch": 0.0728, "grad_norm": 1.8399771451950073, "learning_rate": 9.952038993047076e-06, "loss": 0.3747, "step": 182 }, { "batch_size": 1, "epoch": 0.0728, "step": 182, "tokens_per_device": 5132 }, { "epoch": 0.0728, "loss_ce": 0.008003776893019676, "loss_lvr": 0.8529272079467773, "loss_mode_switch": 0.0, "loss_total": 0.09329649806022644, "step": 182 }, { "batch_size": 4, "epoch": 0.0728, "step": 182, "tokens_per_device": 4360 }, { "epoch": 0.0728, "loss_ce": 0.160574808716774, "loss_lvr": 1.091832160949707, "loss_mode_switch": 0.0, "loss_total": 0.269758015871048, "step": 182 }, { "batch_size": 4, "epoch": 0.0728, "step": 182, "tokens_per_device": 4948 }, { "epoch": 0.0728, "loss_ce": 0.2818664610385895, "loss_lvr": 0.8334930539131165, "loss_mode_switch": 0.0, "loss_total": 0.3652157783508301, "step": 182 }, { "batch_size": 4, "epoch": 0.0728, "step": 182, "tokens_per_device": 4840 }, { "epoch": 0.0728, "loss_ce": 0.6702714562416077, "loss_lvr": 1.2847334146499634, "loss_mode_switch": 0.0, "loss_total": 0.798744797706604, "step": 182 }, { "batch_size": 4, "epoch": 0.0728, "step": 182, "tokens_per_device": 5488 }, { "epoch": 0.0728, "loss_ce": 0.4403918981552124, "loss_lvr": 0.980819046497345, "loss_mode_switch": 0.0, "loss_total": 0.5384737849235535, "step": 182 }, { "batch_size": 4, "epoch": 0.0728, "step": 182, "tokens_per_device": 5584 }, { "epoch": 0.0728, "loss_ce": 0.14215661585330963, "loss_lvr": 0.8369706869125366, "loss_mode_switch": 0.0, "loss_total": 0.22585368156433105, "step": 182 }, { "batch_size": 1, "epoch": 0.0728, "step": 182, "tokens_per_device": 4842 }, { "epoch": 0.0728, "loss_ce": 1.3145695924758911, "loss_lvr": 1.0730608701705933, "loss_mode_switch": 0.0, "loss_total": 1.4218757152557373, "step": 182 }, { "batch_size": 1, "epoch": 0.0728, "step": 182, "tokens_per_device": 5111 }, { "epoch": 0.0728, "loss_ce": 0.19935134053230286, "loss_lvr": 0.3889882266521454, "loss_mode_switch": 0.0, "loss_total": 0.23825016617774963, "step": 182 }, { "epoch": 0.0732, "grad_norm": 2.3459866046905518, "learning_rate": 9.951139806250747e-06, "loss": 0.3984, "step": 183 }, { "batch_size": 4, "epoch": 0.0732, "step": 183, "tokens_per_device": 3192 }, { "epoch": 0.0732, "loss_ce": 0.5048479437828064, "loss_lvr": 1.1557023525238037, "loss_mode_switch": 0.0, "loss_total": 0.6204181909561157, "step": 183 }, { "batch_size": 4, "epoch": 0.0732, "step": 183, "tokens_per_device": 5928 }, { "epoch": 0.0732, "loss_ce": 0.23671932518482208, "loss_lvr": 0.9325987100601196, "loss_mode_switch": 0.0, "loss_total": 0.32997918128967285, "step": 183 }, { "batch_size": 1, "epoch": 0.0732, "step": 183, "tokens_per_device": 5624 }, { "epoch": 0.0732, "loss_ce": 0.007189193740487099, "loss_lvr": 1.0317158699035645, "loss_mode_switch": 0.0, "loss_total": 0.1103607788681984, "step": 183 }, { "batch_size": 4, "epoch": 0.0732, "step": 183, "tokens_per_device": 5680 }, { "epoch": 0.0732, "loss_ce": 0.14250189065933228, "loss_lvr": 0.9144505262374878, "loss_mode_switch": 0.0, "loss_total": 0.23394694924354553, "step": 183 }, { "batch_size": 4, "epoch": 0.0732, "step": 183, "tokens_per_device": 5744 }, { "epoch": 0.0732, "loss_ce": 0.1181616559624672, "loss_lvr": 1.0684099197387695, "loss_mode_switch": 0.0, "loss_total": 0.22500264644622803, "step": 183 }, { "batch_size": 4, "epoch": 0.0732, "step": 183, "tokens_per_device": 4288 }, { "epoch": 0.0732, "loss_ce": 0.2132125347852707, "loss_lvr": 1.2945724725723267, "loss_mode_switch": 0.0, "loss_total": 0.3426697850227356, "step": 183 }, { "batch_size": 4, "epoch": 0.0732, "step": 183, "tokens_per_device": 5800 }, { "epoch": 0.0732, "loss_ce": 0.3718167841434479, "loss_lvr": 0.9401639699935913, "loss_mode_switch": 0.0, "loss_total": 0.4658331871032715, "step": 183 }, { "batch_size": 4, "epoch": 0.0732, "step": 183, "tokens_per_device": 4316 }, { "epoch": 0.0732, "loss_ce": 0.17883682250976562, "loss_lvr": 1.087151050567627, "loss_mode_switch": 0.0, "loss_total": 0.2875519394874573, "step": 183 }, { "epoch": 0.0736, "grad_norm": 1.434382438659668, "learning_rate": 9.950232309830121e-06, "loss": 0.3178, "step": 184 }, { "batch_size": 4, "epoch": 0.0736, "step": 184, "tokens_per_device": 5740 }, { "epoch": 0.0736, "loss_ce": 0.3119359314441681, "loss_lvr": 1.1445934772491455, "loss_mode_switch": 0.0, "loss_total": 0.4263952970504761, "step": 184 }, { "batch_size": 4, "epoch": 0.0736, "step": 184, "tokens_per_device": 6976 }, { "epoch": 0.0736, "loss_ce": 0.049933936446905136, "loss_lvr": 0.7623764276504517, "loss_mode_switch": 0.0, "loss_total": 0.12617157399654388, "step": 184 }, { "batch_size": 4, "epoch": 0.0736, "step": 184, "tokens_per_device": 6792 }, { "epoch": 0.0736, "loss_ce": 0.6343764066696167, "loss_lvr": 1.0101006031036377, "loss_mode_switch": 0.0, "loss_total": 0.7353864908218384, "step": 184 }, { "batch_size": 4, "epoch": 0.0736, "step": 184, "tokens_per_device": 4296 }, { "epoch": 0.0736, "loss_ce": 0.07205506414175034, "loss_lvr": 1.3128900527954102, "loss_mode_switch": 0.0, "loss_total": 0.20334407687187195, "step": 184 }, { "batch_size": 4, "epoch": 0.0736, "step": 184, "tokens_per_device": 4212 }, { "epoch": 0.0736, "loss_ce": 0.30100366473197937, "loss_lvr": 1.2087957859039307, "loss_mode_switch": 0.0, "loss_total": 0.4218832552433014, "step": 184 }, { "batch_size": 4, "epoch": 0.0736, "step": 184, "tokens_per_device": 1424 }, { "epoch": 0.0736, "loss_ce": 0.769260585308075, "loss_lvr": 1.3391988277435303, "loss_mode_switch": 0.0, "loss_total": 0.9031804800033569, "step": 184 }, { "batch_size": 4, "epoch": 0.0736, "step": 184, "tokens_per_device": 5068 }, { "epoch": 0.0736, "loss_ce": 0.5997377634048462, "loss_lvr": 1.1062034368515015, "loss_mode_switch": 0.0, "loss_total": 0.7103580832481384, "step": 184 }, { "batch_size": 4, "epoch": 0.0736, "step": 184, "tokens_per_device": 3980 }, { "epoch": 0.0736, "loss_ce": 0.3052200675010681, "loss_lvr": 1.2726414203643799, "loss_mode_switch": 0.0, "loss_total": 0.4324842095375061, "step": 184 }, { "epoch": 0.074, "grad_norm": 1.5645650625228882, "learning_rate": 9.94931650530827e-06, "loss": 0.3678, "step": 185 }, { "batch_size": 1, "epoch": 0.074, "step": 185, "tokens_per_device": 5016 }, { "epoch": 0.074, "loss_ce": 0.016596350818872452, "loss_lvr": 0.4479300379753113, "loss_mode_switch": 0.0, "loss_total": 0.06138935685157776, "step": 185 }, { "batch_size": 1, "epoch": 0.074, "step": 185, "tokens_per_device": 4869 }, { "epoch": 0.074, "loss_ce": 0.1015300378203392, "loss_lvr": 0.48706817626953125, "loss_mode_switch": 0.0, "loss_total": 0.15023685991764069, "step": 185 }, { "batch_size": 4, "epoch": 0.074, "step": 185, "tokens_per_device": 4060 }, { "epoch": 0.074, "loss_ce": 0.2985939681529999, "loss_lvr": 1.0859562158584595, "loss_mode_switch": 0.0, "loss_total": 0.40718960762023926, "step": 185 }, { "batch_size": 4, "epoch": 0.074, "step": 185, "tokens_per_device": 5732 }, { "epoch": 0.074, "loss_ce": 0.5364417433738708, "loss_lvr": 1.298486351966858, "loss_mode_switch": 0.0, "loss_total": 0.6662904024124146, "step": 185 }, { "batch_size": 1, "epoch": 0.074, "step": 185, "tokens_per_device": 5118 }, { "epoch": 0.074, "loss_ce": 0.5988598465919495, "loss_lvr": 0.6245971322059631, "loss_mode_switch": 0.0, "loss_total": 0.6613195538520813, "step": 185 }, { "batch_size": 4, "epoch": 0.074, "step": 185, "tokens_per_device": 2576 }, { "epoch": 0.074, "loss_ce": 0.47002914547920227, "loss_lvr": 1.5671360492706299, "loss_mode_switch": 0.0, "loss_total": 0.6267427206039429, "step": 185 }, { "batch_size": 4, "epoch": 0.074, "step": 185, "tokens_per_device": 5284 }, { "epoch": 0.074, "loss_ce": 0.29324397444725037, "loss_lvr": 0.8911540508270264, "loss_mode_switch": 0.0, "loss_total": 0.3823593854904175, "step": 185 }, { "batch_size": 4, "epoch": 0.074, "step": 185, "tokens_per_device": 1896 }, { "epoch": 0.074, "loss_ce": 0.41121503710746765, "loss_lvr": 1.1472067832946777, "loss_mode_switch": 0.0, "loss_total": 0.525935709476471, "step": 185 }, { "epoch": 0.0744, "grad_norm": 1.7530053853988647, "learning_rate": 9.948392394222214e-06, "loss": 0.342, "step": 186 }, { "batch_size": 4, "epoch": 0.0744, "step": 186, "tokens_per_device": 4252 }, { "epoch": 0.0744, "loss_ce": 0.1339702010154724, "loss_lvr": 1.3959568738937378, "loss_mode_switch": 0.0, "loss_total": 0.2735658884048462, "step": 186 }, { "batch_size": 4, "epoch": 0.0744, "step": 186, "tokens_per_device": 5260 }, { "epoch": 0.0744, "loss_ce": 0.3919907510280609, "loss_lvr": 1.216344952583313, "loss_mode_switch": 0.0, "loss_total": 0.5136252641677856, "step": 186 }, { "batch_size": 4, "epoch": 0.0744, "step": 186, "tokens_per_device": 4320 }, { "epoch": 0.0744, "loss_ce": 0.10626707226037979, "loss_lvr": 1.1407032012939453, "loss_mode_switch": 0.0, "loss_total": 0.2203373908996582, "step": 186 }, { "batch_size": 4, "epoch": 0.0744, "step": 186, "tokens_per_device": 4428 }, { "epoch": 0.0744, "loss_ce": 0.015712015330791473, "loss_lvr": 0.9954922795295715, "loss_mode_switch": 0.0, "loss_total": 0.11526124179363251, "step": 186 }, { "batch_size": 4, "epoch": 0.0744, "step": 186, "tokens_per_device": 4836 }, { "epoch": 0.0744, "loss_ce": 0.31681543588638306, "loss_lvr": 0.9219174385070801, "loss_mode_switch": 0.0, "loss_total": 0.40900719165802, "step": 186 }, { "batch_size": 1, "epoch": 0.0744, "step": 186, "tokens_per_device": 4422 }, { "epoch": 0.0744, "loss_ce": 0.14811590313911438, "loss_lvr": 0.8807464241981506, "loss_mode_switch": 0.0, "loss_total": 0.2361905574798584, "step": 186 }, { "batch_size": 4, "epoch": 0.0744, "step": 186, "tokens_per_device": 4252 }, { "epoch": 0.0744, "loss_ce": 0.021632157266139984, "loss_lvr": 1.3688362836837769, "loss_mode_switch": 0.0, "loss_total": 0.1585157811641693, "step": 186 }, { "batch_size": 1, "epoch": 0.0744, "step": 186, "tokens_per_device": 4977 }, { "epoch": 0.0744, "loss_ce": 0.3311573266983032, "loss_lvr": 1.0015252828598022, "loss_mode_switch": 0.0, "loss_total": 0.43130984902381897, "step": 186 }, { "epoch": 0.0748, "grad_norm": 2.213036060333252, "learning_rate": 9.947459978122912e-06, "loss": 0.3811, "step": 187 }, { "batch_size": 1, "epoch": 0.0748, "step": 187, "tokens_per_device": 4694 }, { "epoch": 0.0748, "loss_ce": 0.0038172851782292128, "loss_lvr": 0.32946938276290894, "loss_mode_switch": 0.0, "loss_total": 0.036764226853847504, "step": 187 }, { "batch_size": 4, "epoch": 0.0748, "step": 187, "tokens_per_device": 3796 }, { "epoch": 0.0748, "loss_ce": 0.1457853615283966, "loss_lvr": 1.7483893632888794, "loss_mode_switch": 0.0, "loss_total": 0.32062429189682007, "step": 187 }, { "batch_size": 4, "epoch": 0.0748, "step": 187, "tokens_per_device": 1212 }, { "epoch": 0.0748, "loss_ce": 0.045147351920604706, "loss_lvr": 1.9876559972763062, "loss_mode_switch": 0.0, "loss_total": 0.2439129650592804, "step": 187 }, { "batch_size": 4, "epoch": 0.0748, "step": 187, "tokens_per_device": 4448 }, { "epoch": 0.0748, "loss_ce": 0.17306704819202423, "loss_lvr": 1.3996297121047974, "loss_mode_switch": 0.0, "loss_total": 0.3130300045013428, "step": 187 }, { "batch_size": 4, "epoch": 0.0748, "step": 187, "tokens_per_device": 3840 }, { "epoch": 0.0748, "loss_ce": 0.8277130126953125, "loss_lvr": 1.226387619972229, "loss_mode_switch": 0.0, "loss_total": 0.9503517746925354, "step": 187 }, { "batch_size": 4, "epoch": 0.0748, "step": 187, "tokens_per_device": 9180 }, { "epoch": 0.0748, "loss_ce": 0.3586946129798889, "loss_lvr": 0.8977645039558411, "loss_mode_switch": 0.0, "loss_total": 0.4484710693359375, "step": 187 }, { "batch_size": 4, "epoch": 0.0748, "step": 187, "tokens_per_device": 1560 }, { "epoch": 0.0748, "loss_ce": 0.4286516010761261, "loss_lvr": 1.367844820022583, "loss_mode_switch": 0.0, "loss_total": 0.565436065196991, "step": 187 }, { "batch_size": 4, "epoch": 0.0748, "step": 187, "tokens_per_device": 3748 }, { "epoch": 0.0748, "loss_ce": 0.25019192695617676, "loss_lvr": 1.1937755346298218, "loss_mode_switch": 0.0, "loss_total": 0.36956948041915894, "step": 187 }, { "epoch": 0.0752, "grad_norm": 1.7912431955337524, "learning_rate": 9.946519258575263e-06, "loss": 0.3673, "step": 188 }, { "batch_size": 4, "epoch": 0.0752, "step": 188, "tokens_per_device": 3992 }, { "epoch": 0.0752, "loss_ce": 0.4260684847831726, "loss_lvr": 1.0766924619674683, "loss_mode_switch": 0.0, "loss_total": 0.5337377190589905, "step": 188 }, { "batch_size": 4, "epoch": 0.0752, "step": 188, "tokens_per_device": 9496 }, { "epoch": 0.0752, "loss_ce": 0.02416941337287426, "loss_lvr": 1.662966251373291, "loss_mode_switch": 0.0, "loss_total": 0.190466046333313, "step": 188 }, { "batch_size": 1, "epoch": 0.0752, "step": 188, "tokens_per_device": 4896 }, { "epoch": 0.0752, "loss_ce": 0.8098468780517578, "loss_lvr": 0.9312619566917419, "loss_mode_switch": 0.0, "loss_total": 0.9029730558395386, "step": 188 }, { "batch_size": 1, "epoch": 0.0752, "step": 188, "tokens_per_device": 5131 }, { "epoch": 0.0752, "loss_ce": 0.019617918878793716, "loss_lvr": 0.728558361530304, "loss_mode_switch": 0.0, "loss_total": 0.09247376024723053, "step": 188 }, { "batch_size": 1, "epoch": 0.0752, "step": 188, "tokens_per_device": 5049 }, { "epoch": 0.0752, "loss_ce": 0.03643540292978287, "loss_lvr": 0.44591024518013, "loss_mode_switch": 0.0, "loss_total": 0.08102642744779587, "step": 188 }, { "batch_size": 1, "epoch": 0.0752, "step": 188, "tokens_per_device": 5155 }, { "epoch": 0.0752, "loss_ce": 0.0012424386804923415, "loss_lvr": 0.5059510469436646, "loss_mode_switch": 0.0, "loss_total": 0.05183754488825798, "step": 188 }, { "batch_size": 4, "epoch": 0.0752, "step": 188, "tokens_per_device": 2520 }, { "epoch": 0.0752, "loss_ce": 0.48524922132492065, "loss_lvr": 1.6258562803268433, "loss_mode_switch": 0.0, "loss_total": 0.647834837436676, "step": 188 }, { "batch_size": 4, "epoch": 0.0752, "step": 188, "tokens_per_device": 1260 }, { "epoch": 0.0752, "loss_ce": 0.30922096967697144, "loss_lvr": 1.3581576347351074, "loss_mode_switch": 0.0, "loss_total": 0.44503673911094666, "step": 188 }, { "epoch": 0.0756, "grad_norm": 1.6425715684890747, "learning_rate": 9.945570237158098e-06, "loss": 0.3282, "step": 189 }, { "batch_size": 1, "epoch": 0.0756, "step": 189, "tokens_per_device": 4936 }, { "epoch": 0.0756, "loss_ce": 0.036081258207559586, "loss_lvr": 1.0588332414627075, "loss_mode_switch": 0.0, "loss_total": 0.14196458458900452, "step": 189 }, { "batch_size": 4, "epoch": 0.0756, "step": 189, "tokens_per_device": 4256 }, { "epoch": 0.0756, "loss_ce": 0.11344312876462936, "loss_lvr": 0.8268983960151672, "loss_mode_switch": 0.0, "loss_total": 0.19613297283649445, "step": 189 }, { "batch_size": 1, "epoch": 0.0756, "step": 189, "tokens_per_device": 4935 }, { "epoch": 0.0756, "loss_ce": 0.14605507254600525, "loss_lvr": 0.9416537284851074, "loss_mode_switch": 0.0, "loss_total": 0.24022045731544495, "step": 189 }, { "batch_size": 4, "epoch": 0.0756, "step": 189, "tokens_per_device": 1824 }, { "epoch": 0.0756, "loss_ce": 0.66252201795578, "loss_lvr": 1.1326655149459839, "loss_mode_switch": 0.0, "loss_total": 0.7757885456085205, "step": 189 }, { "batch_size": 4, "epoch": 0.0756, "step": 189, "tokens_per_device": 8852 }, { "epoch": 0.0756, "loss_ce": 0.3580925166606903, "loss_lvr": 1.0242279767990112, "loss_mode_switch": 0.0, "loss_total": 0.4605153203010559, "step": 189 }, { "batch_size": 4, "epoch": 0.0756, "step": 189, "tokens_per_device": 4208 }, { "epoch": 0.0756, "loss_ce": 0.6179673075675964, "loss_lvr": 1.9536867141723633, "loss_mode_switch": 0.0, "loss_total": 0.8133360147476196, "step": 189 }, { "batch_size": 4, "epoch": 0.0756, "step": 189, "tokens_per_device": 4304 }, { "epoch": 0.0756, "loss_ce": 0.12133686244487762, "loss_lvr": 1.6894139051437378, "loss_mode_switch": 0.0, "loss_total": 0.29027825593948364, "step": 189 }, { "batch_size": 4, "epoch": 0.0756, "step": 189, "tokens_per_device": 4384 }, { "epoch": 0.0756, "loss_ce": 0.1152929812669754, "loss_lvr": 0.94951331615448, "loss_mode_switch": 0.0, "loss_total": 0.2102443128824234, "step": 189 }, { "epoch": 0.076, "grad_norm": 1.5640015602111816, "learning_rate": 9.944612915464183e-06, "loss": 0.332, "step": 190 }, { "batch_size": 4, "epoch": 0.076, "step": 190, "tokens_per_device": 16076 }, { "epoch": 0.076, "loss_ce": 0.362855464220047, "loss_lvr": 1.2618861198425293, "loss_mode_switch": 0.0, "loss_total": 0.48904407024383545, "step": 190 }, { "batch_size": 1, "epoch": 0.076, "step": 190, "tokens_per_device": 4984 }, { "epoch": 0.076, "loss_ce": 2.0838003158569336, "loss_lvr": 1.0717524290084839, "loss_mode_switch": 0.0, "loss_total": 2.1909756660461426, "step": 190 }, { "batch_size": 4, "epoch": 0.076, "step": 190, "tokens_per_device": 4280 }, { "epoch": 0.076, "loss_ce": 0.12146861106157303, "loss_lvr": 1.1370985507965088, "loss_mode_switch": 0.0, "loss_total": 0.23517847061157227, "step": 190 }, { "batch_size": 1, "epoch": 0.076, "step": 190, "tokens_per_device": 5936 }, { "epoch": 0.076, "loss_ce": 0.005774649791419506, "loss_lvr": 0.5181335806846619, "loss_mode_switch": 0.0, "loss_total": 0.057588011026382446, "step": 190 }, { "batch_size": 1, "epoch": 0.076, "step": 190, "tokens_per_device": 4880 }, { "epoch": 0.076, "loss_ce": 0.006493429187685251, "loss_lvr": 0.5709078311920166, "loss_mode_switch": 0.0, "loss_total": 0.06358421593904495, "step": 190 }, { "batch_size": 4, "epoch": 0.076, "step": 190, "tokens_per_device": 3752 }, { "epoch": 0.076, "loss_ce": 0.2911719083786011, "loss_lvr": 0.7200093865394592, "loss_mode_switch": 0.0, "loss_total": 0.36317285895347595, "step": 190 }, { "batch_size": 4, "epoch": 0.076, "step": 190, "tokens_per_device": 4028 }, { "epoch": 0.076, "loss_ce": 0.3444352447986603, "loss_lvr": 1.3190784454345703, "loss_mode_switch": 0.0, "loss_total": 0.4763430953025818, "step": 190 }, { "batch_size": 1, "epoch": 0.076, "step": 190, "tokens_per_device": 4782 }, { "epoch": 0.076, "loss_ce": 0.05322486162185669, "loss_lvr": 0.7481428384780884, "loss_mode_switch": 0.0, "loss_total": 0.12803915143013, "step": 190 }, { "epoch": 0.0764, "grad_norm": 2.042156934738159, "learning_rate": 9.943647295100219e-06, "loss": 0.3958, "step": 191 }, { "batch_size": 1, "epoch": 0.0764, "step": 191, "tokens_per_device": 5099 }, { "epoch": 0.0764, "loss_ce": 0.009796193800866604, "loss_lvr": 0.28689417243003845, "loss_mode_switch": 0.0, "loss_total": 0.03848561272025108, "step": 191 }, { "batch_size": 4, "epoch": 0.0764, "step": 191, "tokens_per_device": 6964 }, { "epoch": 0.0764, "loss_ce": 0.25193557143211365, "loss_lvr": 1.0402686595916748, "loss_mode_switch": 0.0, "loss_total": 0.35596245527267456, "step": 191 }, { "batch_size": 4, "epoch": 0.0764, "step": 191, "tokens_per_device": 2692 }, { "epoch": 0.0764, "loss_ce": 0.5719875693321228, "loss_lvr": 1.1879827976226807, "loss_mode_switch": 0.0, "loss_total": 0.690785825252533, "step": 191 }, { "batch_size": 1, "epoch": 0.0764, "step": 191, "tokens_per_device": 5120 }, { "epoch": 0.0764, "loss_ce": 0.09920823574066162, "loss_lvr": 0.45400470495224, "loss_mode_switch": 0.0, "loss_total": 0.14460870623588562, "step": 191 }, { "batch_size": 1, "epoch": 0.0764, "step": 191, "tokens_per_device": 4877 }, { "epoch": 0.0764, "loss_ce": 0.071688212454319, "loss_lvr": 0.5825610756874084, "loss_mode_switch": 0.0, "loss_total": 0.1299443244934082, "step": 191 }, { "batch_size": 4, "epoch": 0.0764, "step": 191, "tokens_per_device": 12664 }, { "epoch": 0.0764, "loss_ce": 0.13310246169567108, "loss_lvr": 1.031590461730957, "loss_mode_switch": 0.0, "loss_total": 0.2362615168094635, "step": 191 }, { "batch_size": 1, "epoch": 0.0764, "step": 191, "tokens_per_device": 4912 }, { "epoch": 0.0764, "loss_ce": 0.6349266171455383, "loss_lvr": 1.1150096654891968, "loss_mode_switch": 0.0, "loss_total": 0.746427595615387, "step": 191 }, { "batch_size": 4, "epoch": 0.0764, "step": 191, "tokens_per_device": 2700 }, { "epoch": 0.0764, "loss_ce": 0.4390880763530731, "loss_lvr": 1.4389312267303467, "loss_mode_switch": 0.0, "loss_total": 0.5829812288284302, "step": 191 }, { "epoch": 0.0768, "grad_norm": 5.601438045501709, "learning_rate": 9.94267337768683e-06, "loss": 0.3567, "step": 192 }, { "batch_size": 4, "epoch": 0.0768, "step": 192, "tokens_per_device": 1260 }, { "epoch": 0.0768, "loss_ce": 0.526812732219696, "loss_lvr": 1.2029415369033813, "loss_mode_switch": 0.0, "loss_total": 0.6471068859100342, "step": 192 }, { "batch_size": 4, "epoch": 0.0768, "step": 192, "tokens_per_device": 1776 }, { "epoch": 0.0768, "loss_ce": 0.32242757081985474, "loss_lvr": 1.1371945142745972, "loss_mode_switch": 0.0, "loss_total": 0.4361470341682434, "step": 192 }, { "batch_size": 4, "epoch": 0.0768, "step": 192, "tokens_per_device": 3956 }, { "epoch": 0.0768, "loss_ce": 0.35047638416290283, "loss_lvr": 1.0332492589950562, "loss_mode_switch": 0.0, "loss_total": 0.45380130410194397, "step": 192 }, { "batch_size": 4, "epoch": 0.0768, "step": 192, "tokens_per_device": 1332 }, { "epoch": 0.0768, "loss_ce": 0.3483191430568695, "loss_lvr": 1.4335620403289795, "loss_mode_switch": 0.0, "loss_total": 0.49167534708976746, "step": 192 }, { "batch_size": 4, "epoch": 0.0768, "step": 192, "tokens_per_device": 2536 }, { "epoch": 0.0768, "loss_ce": 0.3482106924057007, "loss_lvr": 1.2883796691894531, "loss_mode_switch": 0.0, "loss_total": 0.4770486652851105, "step": 192 }, { "batch_size": 4, "epoch": 0.0768, "step": 192, "tokens_per_device": 15760 }, { "epoch": 0.0768, "loss_ce": 0.47915613651275635, "loss_lvr": 0.9506628513336182, "loss_mode_switch": 0.0, "loss_total": 0.5742224454879761, "step": 192 }, { "batch_size": 4, "epoch": 0.0768, "step": 192, "tokens_per_device": 3784 }, { "epoch": 0.0768, "loss_ce": 0.46708202362060547, "loss_lvr": 1.216368317604065, "loss_mode_switch": 0.0, "loss_total": 0.588718831539154, "step": 192 }, { "batch_size": 4, "epoch": 0.0768, "step": 192, "tokens_per_device": 1264 }, { "epoch": 0.0768, "loss_ce": 0.5336666703224182, "loss_lvr": 1.2758841514587402, "loss_mode_switch": 0.0, "loss_total": 0.6612551212310791, "step": 192 }, { "epoch": 0.0772, "grad_norm": 1.9414339065551758, "learning_rate": 9.941691164858565e-06, "loss": 0.3947, "step": 193 }, { "batch_size": 4, "epoch": 0.0772, "step": 193, "tokens_per_device": 1420 }, { "epoch": 0.0772, "loss_ce": 0.5381477475166321, "loss_lvr": 1.233787178993225, "loss_mode_switch": 0.0, "loss_total": 0.6615264415740967, "step": 193 }, { "batch_size": 4, "epoch": 0.0772, "step": 193, "tokens_per_device": 3856 }, { "epoch": 0.0772, "loss_ce": 0.1116240918636322, "loss_lvr": 1.5239934921264648, "loss_mode_switch": 0.0, "loss_total": 0.26402342319488525, "step": 193 }, { "batch_size": 4, "epoch": 0.0772, "step": 193, "tokens_per_device": 4340 }, { "epoch": 0.0772, "loss_ce": 0.1630510687828064, "loss_lvr": 0.7978898286819458, "loss_mode_switch": 0.0, "loss_total": 0.24284005165100098, "step": 193 }, { "batch_size": 4, "epoch": 0.0772, "step": 193, "tokens_per_device": 8820 }, { "epoch": 0.0772, "loss_ce": 0.302018404006958, "loss_lvr": 0.8794794678688049, "loss_mode_switch": 0.0, "loss_total": 0.38996636867523193, "step": 193 }, { "batch_size": 4, "epoch": 0.0772, "step": 193, "tokens_per_device": 5072 }, { "epoch": 0.0772, "loss_ce": 0.38437503576278687, "loss_lvr": 0.9671051502227783, "loss_mode_switch": 0.0, "loss_total": 0.48108553886413574, "step": 193 }, { "batch_size": 4, "epoch": 0.0772, "step": 193, "tokens_per_device": 1372 }, { "epoch": 0.0772, "loss_ce": 0.12285825610160828, "loss_lvr": 1.7347835302352905, "loss_mode_switch": 0.0, "loss_total": 0.2963365912437439, "step": 193 }, { "batch_size": 4, "epoch": 0.0772, "step": 193, "tokens_per_device": 1452 }, { "epoch": 0.0772, "loss_ce": 0.2684256434440613, "loss_lvr": 1.0598505735397339, "loss_mode_switch": 0.0, "loss_total": 0.3744106888771057, "step": 193 }, { "batch_size": 4, "epoch": 0.0772, "step": 193, "tokens_per_device": 2568 }, { "epoch": 0.0772, "loss_ce": 0.14522938430309296, "loss_lvr": 1.7339893579483032, "loss_mode_switch": 0.0, "loss_total": 0.31862831115722656, "step": 193 }, { "epoch": 0.0776, "grad_norm": 1.8689020872116089, "learning_rate": 9.940700658263897e-06, "loss": 0.4158, "step": 194 }, { "batch_size": 1, "epoch": 0.0776, "step": 194, "tokens_per_device": 6780 }, { "epoch": 0.0776, "loss_ce": 0.0021991024259477854, "loss_lvr": 0.756605327129364, "loss_mode_switch": 0.0, "loss_total": 0.07785964012145996, "step": 194 }, { "batch_size": 4, "epoch": 0.0776, "step": 194, "tokens_per_device": 4340 }, { "epoch": 0.0776, "loss_ce": 0.3236059546470642, "loss_lvr": 1.2767375707626343, "loss_mode_switch": 0.0, "loss_total": 0.4512796998023987, "step": 194 }, { "batch_size": 1, "epoch": 0.0776, "step": 194, "tokens_per_device": 5158 }, { "epoch": 0.0776, "loss_ce": 0.00869436003267765, "loss_lvr": 1.1603574752807617, "loss_mode_switch": 0.0, "loss_total": 0.12473011016845703, "step": 194 }, { "batch_size": 1, "epoch": 0.0776, "step": 194, "tokens_per_device": 4846 }, { "epoch": 0.0776, "loss_ce": 0.025760425254702568, "loss_lvr": 1.3289343118667603, "loss_mode_switch": 0.0, "loss_total": 0.1586538553237915, "step": 194 }, { "batch_size": 4, "epoch": 0.0776, "step": 194, "tokens_per_device": 4224 }, { "epoch": 0.0776, "loss_ce": 0.7329540252685547, "loss_lvr": 1.2833162546157837, "loss_mode_switch": 0.0, "loss_total": 0.8612856864929199, "step": 194 }, { "batch_size": 4, "epoch": 0.0776, "step": 194, "tokens_per_device": 4260 }, { "epoch": 0.0776, "loss_ce": 0.4920371472835541, "loss_lvr": 1.8756401538848877, "loss_mode_switch": 0.0, "loss_total": 0.6796011924743652, "step": 194 }, { "batch_size": 4, "epoch": 0.0776, "step": 194, "tokens_per_device": 3896 }, { "epoch": 0.0776, "loss_ce": 0.1626744419336319, "loss_lvr": 1.2768851518630981, "loss_mode_switch": 0.0, "loss_total": 0.2903629541397095, "step": 194 }, { "batch_size": 4, "epoch": 0.0776, "step": 194, "tokens_per_device": 5820 }, { "epoch": 0.0776, "loss_ce": 0.3668394684791565, "loss_lvr": 1.0766818523406982, "loss_mode_switch": 0.0, "loss_total": 0.4745076596736908, "step": 194 }, { "epoch": 0.078, "grad_norm": 1.6297558546066284, "learning_rate": 9.93970185956522e-06, "loss": 0.4028, "step": 195 }, { "batch_size": 1, "epoch": 0.078, "step": 195, "tokens_per_device": 7336 }, { "epoch": 0.078, "loss_ce": 0.01510176807641983, "loss_lvr": 0.7122564315795898, "loss_mode_switch": 0.0, "loss_total": 0.08632741123437881, "step": 195 }, { "batch_size": 4, "epoch": 0.078, "step": 195, "tokens_per_device": 2736 }, { "epoch": 0.078, "loss_ce": 0.5890716910362244, "loss_lvr": 1.1416962146759033, "loss_mode_switch": 0.0, "loss_total": 0.7032412886619568, "step": 195 }, { "batch_size": 4, "epoch": 0.078, "step": 195, "tokens_per_device": 3532 }, { "epoch": 0.078, "loss_ce": 0.26277610659599304, "loss_lvr": 1.6360833644866943, "loss_mode_switch": 0.0, "loss_total": 0.42638444900512695, "step": 195 }, { "batch_size": 4, "epoch": 0.078, "step": 195, "tokens_per_device": 4552 }, { "epoch": 0.078, "loss_ce": 0.26847735047340393, "loss_lvr": 1.220775842666626, "loss_mode_switch": 0.0, "loss_total": 0.39055493474006653, "step": 195 }, { "batch_size": 1, "epoch": 0.078, "step": 195, "tokens_per_device": 5204 }, { "epoch": 0.078, "loss_ce": 0.0060316165909171104, "loss_lvr": 0.6388357877731323, "loss_mode_switch": 0.0, "loss_total": 0.06991519778966904, "step": 195 }, { "batch_size": 4, "epoch": 0.078, "step": 195, "tokens_per_device": 3912 }, { "epoch": 0.078, "loss_ce": 0.6326485276222229, "loss_lvr": 1.0922719240188599, "loss_mode_switch": 0.0, "loss_total": 0.7418757081031799, "step": 195 }, { "batch_size": 4, "epoch": 0.078, "step": 195, "tokens_per_device": 3196 }, { "epoch": 0.078, "loss_ce": 0.116374172270298, "loss_lvr": 1.2325977087020874, "loss_mode_switch": 0.0, "loss_total": 0.2396339476108551, "step": 195 }, { "batch_size": 4, "epoch": 0.078, "step": 195, "tokens_per_device": 5012 }, { "epoch": 0.078, "loss_ce": 0.24665965139865875, "loss_lvr": 1.046836256980896, "loss_mode_switch": 0.0, "loss_total": 0.3513432741165161, "step": 195 }, { "epoch": 0.0784, "grad_norm": 1.6435871124267578, "learning_rate": 9.938694770438843e-06, "loss": 0.3689, "step": 196 }, { "batch_size": 4, "epoch": 0.0784, "step": 196, "tokens_per_device": 3120 }, { "epoch": 0.0784, "loss_ce": 0.9954763054847717, "loss_lvr": 1.259183645248413, "loss_mode_switch": 0.0, "loss_total": 1.1213946342468262, "step": 196 }, { "batch_size": 1, "epoch": 0.0784, "step": 196, "tokens_per_device": 5252 }, { "epoch": 0.0784, "loss_ce": 0.016874277964234352, "loss_lvr": 0.8113812208175659, "loss_mode_switch": 0.0, "loss_total": 0.09801240265369415, "step": 196 }, { "batch_size": 4, "epoch": 0.0784, "step": 196, "tokens_per_device": 2716 }, { "epoch": 0.0784, "loss_ce": 0.004748254083096981, "loss_lvr": 0.9206435084342957, "loss_mode_switch": 0.0, "loss_total": 0.09681260585784912, "step": 196 }, { "batch_size": 4, "epoch": 0.0784, "step": 196, "tokens_per_device": 2576 }, { "epoch": 0.0784, "loss_ce": 0.4128378927707672, "loss_lvr": 1.1155035495758057, "loss_mode_switch": 0.0, "loss_total": 0.5243882536888123, "step": 196 }, { "batch_size": 4, "epoch": 0.0784, "step": 196, "tokens_per_device": 4036 }, { "epoch": 0.0784, "loss_ce": 0.3983010947704315, "loss_lvr": 0.8399330377578735, "loss_mode_switch": 0.0, "loss_total": 0.4822944104671478, "step": 196 }, { "batch_size": 4, "epoch": 0.0784, "step": 196, "tokens_per_device": 4616 }, { "epoch": 0.0784, "loss_ce": 0.3075214624404907, "loss_lvr": 1.0587689876556396, "loss_mode_switch": 0.0, "loss_total": 0.4133983552455902, "step": 196 }, { "batch_size": 4, "epoch": 0.0784, "step": 196, "tokens_per_device": 4324 }, { "epoch": 0.0784, "loss_ce": 0.17311520874500275, "loss_lvr": 1.066796064376831, "loss_mode_switch": 0.0, "loss_total": 0.2797948122024536, "step": 196 }, { "batch_size": 4, "epoch": 0.0784, "step": 196, "tokens_per_device": 1528 }, { "epoch": 0.0784, "loss_ce": 0.7159650921821594, "loss_lvr": 1.226035475730896, "loss_mode_switch": 0.0, "loss_total": 0.8385686278343201, "step": 196 }, { "epoch": 0.0788, "grad_norm": 4.339474678039551, "learning_rate": 9.937679392574991e-06, "loss": 0.3994, "step": 197 }, { "batch_size": 1, "epoch": 0.0788, "step": 197, "tokens_per_device": 4220 }, { "epoch": 0.0788, "loss_ce": 0.01961742900311947, "loss_lvr": 1.2847723960876465, "loss_mode_switch": 0.0, "loss_total": 0.14809466898441315, "step": 197 }, { "batch_size": 1, "epoch": 0.0788, "step": 197, "tokens_per_device": 4129 }, { "epoch": 0.0788, "loss_ce": 0.1693616807460785, "loss_lvr": 1.1899049282073975, "loss_mode_switch": 0.0, "loss_total": 0.28835219144821167, "step": 197 }, { "batch_size": 1, "epoch": 0.0788, "step": 197, "tokens_per_device": 4899 }, { "epoch": 0.0788, "loss_ce": 0.02575794607400894, "loss_lvr": 0.6026341319084167, "loss_mode_switch": 0.0, "loss_total": 0.08602136373519897, "step": 197 }, { "batch_size": 1, "epoch": 0.0788, "step": 197, "tokens_per_device": 5160 }, { "epoch": 0.0788, "loss_ce": 0.02635369822382927, "loss_lvr": 0.8331298828125, "loss_mode_switch": 0.0, "loss_total": 0.10966669023036957, "step": 197 }, { "batch_size": 4, "epoch": 0.0788, "step": 197, "tokens_per_device": 2624 }, { "epoch": 0.0788, "loss_ce": 0.2389044612646103, "loss_lvr": 1.4389179944992065, "loss_mode_switch": 0.0, "loss_total": 0.3827962577342987, "step": 197 }, { "batch_size": 1, "epoch": 0.0788, "step": 197, "tokens_per_device": 5006 }, { "epoch": 0.0788, "loss_ce": 0.017136037349700928, "loss_lvr": 1.1219077110290527, "loss_mode_switch": 0.0, "loss_total": 0.12932682037353516, "step": 197 }, { "batch_size": 1, "epoch": 0.0788, "step": 197, "tokens_per_device": 5140 }, { "epoch": 0.0788, "loss_ce": 0.07551813870668411, "loss_lvr": 0.5480425953865051, "loss_mode_switch": 0.0, "loss_total": 0.1303223967552185, "step": 197 }, { "batch_size": 1, "epoch": 0.0788, "step": 197, "tokens_per_device": 4939 }, { "epoch": 0.0788, "loss_ce": 0.42494457960128784, "loss_lvr": 0.9713442921638489, "loss_mode_switch": 0.0, "loss_total": 0.5220789909362793, "step": 197 }, { "epoch": 0.0792, "grad_norm": 1.6094427108764648, "learning_rate": 9.936655727677795e-06, "loss": 0.3415, "step": 198 }, { "batch_size": 4, "epoch": 0.0792, "step": 198, "tokens_per_device": 5128 }, { "epoch": 0.0792, "loss_ce": 0.9020960330963135, "loss_lvr": 1.0443251132965088, "loss_mode_switch": 0.0, "loss_total": 1.0065284967422485, "step": 198 }, { "batch_size": 4, "epoch": 0.0792, "step": 198, "tokens_per_device": 2728 }, { "epoch": 0.0792, "loss_ce": 0.7561904788017273, "loss_lvr": 1.2771508693695068, "loss_mode_switch": 0.0, "loss_total": 0.8839055895805359, "step": 198 }, { "batch_size": 4, "epoch": 0.0792, "step": 198, "tokens_per_device": 4712 }, { "epoch": 0.0792, "loss_ce": 0.14044159650802612, "loss_lvr": 1.4004919528961182, "loss_mode_switch": 0.0, "loss_total": 0.28049081563949585, "step": 198 }, { "batch_size": 4, "epoch": 0.0792, "step": 198, "tokens_per_device": 3880 }, { "epoch": 0.0792, "loss_ce": 0.08681110292673111, "loss_lvr": 1.4626877307891846, "loss_mode_switch": 0.0, "loss_total": 0.23307988047599792, "step": 198 }, { "batch_size": 1, "epoch": 0.0792, "step": 198, "tokens_per_device": 5619 }, { "epoch": 0.0792, "loss_ce": 0.014837682247161865, "loss_lvr": 0.5568454265594482, "loss_mode_switch": 0.0, "loss_total": 0.07052222639322281, "step": 198 }, { "batch_size": 4, "epoch": 0.0792, "step": 198, "tokens_per_device": 5808 }, { "epoch": 0.0792, "loss_ce": 0.19540783762931824, "loss_lvr": 0.9941173195838928, "loss_mode_switch": 0.0, "loss_total": 0.29481956362724304, "step": 198 }, { "batch_size": 1, "epoch": 0.0792, "step": 198, "tokens_per_device": 4890 }, { "epoch": 0.0792, "loss_ce": 0.05529572442173958, "loss_lvr": 0.849716067314148, "loss_mode_switch": 0.0, "loss_total": 0.14026732742786407, "step": 198 }, { "batch_size": 1, "epoch": 0.0792, "step": 198, "tokens_per_device": 5264 }, { "epoch": 0.0792, "loss_ce": 0.03679700195789337, "loss_lvr": 1.0094841718673706, "loss_mode_switch": 0.0, "loss_total": 0.13774541020393372, "step": 198 }, { "epoch": 0.0796, "grad_norm": 2.583977222442627, "learning_rate": 9.9356237774653e-06, "loss": 0.4117, "step": 199 }, { "batch_size": 4, "epoch": 0.0796, "step": 199, "tokens_per_device": 5468 }, { "epoch": 0.0796, "loss_ce": 0.027577580884099007, "loss_lvr": 0.9671685099601746, "loss_mode_switch": 0.0, "loss_total": 0.12429443001747131, "step": 199 }, { "batch_size": 4, "epoch": 0.0796, "step": 199, "tokens_per_device": 4236 }, { "epoch": 0.0796, "loss_ce": 0.02895108237862587, "loss_lvr": 0.8605523705482483, "loss_mode_switch": 0.0, "loss_total": 0.11500632762908936, "step": 199 }, { "batch_size": 4, "epoch": 0.0796, "step": 199, "tokens_per_device": 2600 }, { "epoch": 0.0796, "loss_ce": 0.40238434076309204, "loss_lvr": 1.470905065536499, "loss_mode_switch": 0.0, "loss_total": 0.549474835395813, "step": 199 }, { "batch_size": 4, "epoch": 0.0796, "step": 199, "tokens_per_device": 4044 }, { "epoch": 0.0796, "loss_ce": 0.26582062244415283, "loss_lvr": 1.1521779298782349, "loss_mode_switch": 0.0, "loss_total": 0.3810384273529053, "step": 199 }, { "batch_size": 4, "epoch": 0.0796, "step": 199, "tokens_per_device": 13820 }, { "epoch": 0.0796, "loss_ce": 0.2743188142776489, "loss_lvr": 1.261264443397522, "loss_mode_switch": 0.0, "loss_total": 0.40044528245925903, "step": 199 }, { "batch_size": 4, "epoch": 0.0796, "step": 199, "tokens_per_device": 4672 }, { "epoch": 0.0796, "loss_ce": 0.259480357170105, "loss_lvr": 0.8276845812797546, "loss_mode_switch": 0.0, "loss_total": 0.3422488272190094, "step": 199 }, { "batch_size": 4, "epoch": 0.0796, "step": 199, "tokens_per_device": 10760 }, { "epoch": 0.0796, "loss_ce": 0.25804367661476135, "loss_lvr": 0.9001509547233582, "loss_mode_switch": 0.0, "loss_total": 0.3480587601661682, "step": 199 }, { "batch_size": 1, "epoch": 0.0796, "step": 199, "tokens_per_device": 4891 }, { "epoch": 0.0796, "loss_ce": 0.003862563520669937, "loss_lvr": 1.159140706062317, "loss_mode_switch": 0.0, "loss_total": 0.1197766363620758, "step": 199 }, { "epoch": 0.08, "grad_norm": 1.4574180841445923, "learning_rate": 9.934583543669454e-06, "loss": 0.3221, "step": 200 }, { "batch_size": 4, "epoch": 0.08, "step": 200, "tokens_per_device": 3764 }, { "epoch": 0.08, "loss_ce": 0.33455097675323486, "loss_lvr": 1.32953679561615, "loss_mode_switch": 0.0, "loss_total": 0.46750468015670776, "step": 200 }, { "batch_size": 4, "epoch": 0.08, "step": 200, "tokens_per_device": 4312 }, { "epoch": 0.08, "loss_ce": 0.353293240070343, "loss_lvr": 1.108255386352539, "loss_mode_switch": 0.0, "loss_total": 0.4641187787055969, "step": 200 }, { "batch_size": 1, "epoch": 0.08, "step": 200, "tokens_per_device": 4940 }, { "epoch": 0.08, "loss_ce": 0.037959933280944824, "loss_lvr": 0.34566208720207214, "loss_mode_switch": 0.0, "loss_total": 0.07252614200115204, "step": 200 }, { "batch_size": 1, "epoch": 0.08, "step": 200, "tokens_per_device": 5030 }, { "epoch": 0.08, "loss_ce": 0.0020771108102053404, "loss_lvr": 0.7955619692802429, "loss_mode_switch": 0.0, "loss_total": 0.0816333070397377, "step": 200 }, { "batch_size": 4, "epoch": 0.08, "step": 200, "tokens_per_device": 1584 }, { "epoch": 0.08, "loss_ce": 0.3861979842185974, "loss_lvr": 1.2002108097076416, "loss_mode_switch": 0.0, "loss_total": 0.5062190890312195, "step": 200 }, { "batch_size": 4, "epoch": 0.08, "step": 200, "tokens_per_device": 1456 }, { "epoch": 0.08, "loss_ce": 0.5577961802482605, "loss_lvr": 1.6036787033081055, "loss_mode_switch": 0.0, "loss_total": 0.7181640863418579, "step": 200 }, { "batch_size": 4, "epoch": 0.08, "step": 200, "tokens_per_device": 3844 }, { "epoch": 0.08, "loss_ce": 0.21627546846866608, "loss_lvr": 1.0660836696624756, "loss_mode_switch": 0.0, "loss_total": 0.32288384437561035, "step": 200 }, { "batch_size": 4, "epoch": 0.08, "step": 200, "tokens_per_device": 2652 }, { "epoch": 0.08, "loss_ce": 0.44132503867149353, "loss_lvr": 1.1333857774734497, "loss_mode_switch": 0.0, "loss_total": 0.5546635985374451, "step": 200 }, { "epoch": 0.0804, "grad_norm": 1.5391169786453247, "learning_rate": 9.933535028036108e-06, "loss": 0.3267, "step": 201 }, { "batch_size": 4, "epoch": 0.0804, "step": 201, "tokens_per_device": 1604 }, { "epoch": 0.0804, "loss_ce": 0.6529770493507385, "loss_lvr": 1.1715182065963745, "loss_mode_switch": 0.0, "loss_total": 0.7701288461685181, "step": 201 }, { "batch_size": 4, "epoch": 0.0804, "step": 201, "tokens_per_device": 4016 }, { "epoch": 0.0804, "loss_ce": 0.3600042462348938, "loss_lvr": 1.2256076335906982, "loss_mode_switch": 0.0, "loss_total": 0.4825650155544281, "step": 201 }, { "batch_size": 4, "epoch": 0.0804, "step": 201, "tokens_per_device": 3800 }, { "epoch": 0.0804, "loss_ce": 0.44648516178131104, "loss_lvr": 1.2087875604629517, "loss_mode_switch": 0.0, "loss_total": 0.5673639178276062, "step": 201 }, { "batch_size": 1, "epoch": 0.0804, "step": 201, "tokens_per_device": 5120 }, { "epoch": 0.0804, "loss_ce": 0.02662104368209839, "loss_lvr": 1.3885849714279175, "loss_mode_switch": 0.0, "loss_total": 0.16547954082489014, "step": 201 }, { "batch_size": 4, "epoch": 0.0804, "step": 201, "tokens_per_device": 5076 }, { "epoch": 0.0804, "loss_ce": 0.22154998779296875, "loss_lvr": 1.3557230234146118, "loss_mode_switch": 0.0, "loss_total": 0.3571223020553589, "step": 201 }, { "batch_size": 4, "epoch": 0.0804, "step": 201, "tokens_per_device": 1336 }, { "epoch": 0.0804, "loss_ce": 0.2649015486240387, "loss_lvr": 1.3379640579223633, "loss_mode_switch": 0.0, "loss_total": 0.39869797229766846, "step": 201 }, { "batch_size": 1, "epoch": 0.0804, "step": 201, "tokens_per_device": 4630 }, { "epoch": 0.0804, "loss_ce": 0.004735300317406654, "loss_lvr": 0.5616241097450256, "loss_mode_switch": 0.0, "loss_total": 0.060897715389728546, "step": 201 }, { "batch_size": 4, "epoch": 0.0804, "step": 201, "tokens_per_device": 2748 }, { "epoch": 0.0804, "loss_ce": 0.34717857837677, "loss_lvr": 1.3113276958465576, "loss_mode_switch": 0.0, "loss_total": 0.47831135988235474, "step": 201 }, { "epoch": 0.0808, "grad_norm": 1.6837514638900757, "learning_rate": 9.932478232325013e-06, "loss": 0.3743, "step": 202 }, { "batch_size": 4, "epoch": 0.0808, "step": 202, "tokens_per_device": 10404 }, { "epoch": 0.0808, "loss_ce": 0.03424200415611267, "loss_lvr": 1.2877434492111206, "loss_mode_switch": 0.0, "loss_total": 0.16301634907722473, "step": 202 }, { "batch_size": 1, "epoch": 0.0808, "step": 202, "tokens_per_device": 5040 }, { "epoch": 0.0808, "loss_ce": 0.017723502591252327, "loss_lvr": 1.1608517169952393, "loss_mode_switch": 0.0, "loss_total": 0.1338086724281311, "step": 202 }, { "batch_size": 1, "epoch": 0.0808, "step": 202, "tokens_per_device": 4905 }, { "epoch": 0.0808, "loss_ce": 0.60575270652771, "loss_lvr": 1.01341712474823, "loss_mode_switch": 0.0, "loss_total": 0.7070944309234619, "step": 202 }, { "batch_size": 4, "epoch": 0.0808, "step": 202, "tokens_per_device": 4204 }, { "epoch": 0.0808, "loss_ce": 0.49768775701522827, "loss_lvr": 1.8032492399215698, "loss_mode_switch": 0.0, "loss_total": 0.6780126690864563, "step": 202 }, { "batch_size": 4, "epoch": 0.0808, "step": 202, "tokens_per_device": 5072 }, { "epoch": 0.0808, "loss_ce": 0.20386017858982086, "loss_lvr": 0.8288212418556213, "loss_mode_switch": 0.0, "loss_total": 0.28674229979515076, "step": 202 }, { "batch_size": 1, "epoch": 0.0808, "step": 202, "tokens_per_device": 5129 }, { "epoch": 0.0808, "loss_ce": 0.12421046942472458, "loss_lvr": 0.5434845685958862, "loss_mode_switch": 0.0, "loss_total": 0.17855893075466156, "step": 202 }, { "batch_size": 4, "epoch": 0.0808, "step": 202, "tokens_per_device": 3752 }, { "epoch": 0.0808, "loss_ce": 0.4386971890926361, "loss_lvr": 1.5997648239135742, "loss_mode_switch": 0.0, "loss_total": 0.5986737012863159, "step": 202 }, { "batch_size": 4, "epoch": 0.0808, "step": 202, "tokens_per_device": 2656 }, { "epoch": 0.0808, "loss_ce": 0.4784902036190033, "loss_lvr": 0.9785391688346863, "loss_mode_switch": 0.0, "loss_total": 0.5763441324234009, "step": 202 }, { "epoch": 0.0812, "grad_norm": 1.8747081756591797, "learning_rate": 9.931413158309816e-06, "loss": 0.3697, "step": 203 }, { "batch_size": 1, "epoch": 0.0812, "step": 203, "tokens_per_device": 4425 }, { "epoch": 0.0812, "loss_ce": 0.019788077101111412, "loss_lvr": 1.1499173641204834, "loss_mode_switch": 0.0, "loss_total": 0.13477981090545654, "step": 203 }, { "batch_size": 4, "epoch": 0.0812, "step": 203, "tokens_per_device": 1428 }, { "epoch": 0.0812, "loss_ce": 0.1585494875907898, "loss_lvr": 1.228284239768982, "loss_mode_switch": 0.0, "loss_total": 0.281377911567688, "step": 203 }, { "batch_size": 1, "epoch": 0.0812, "step": 203, "tokens_per_device": 4880 }, { "epoch": 0.0812, "loss_ce": 0.005506839137524366, "loss_lvr": 1.2653342485427856, "loss_mode_switch": 0.0, "loss_total": 0.13204027712345123, "step": 203 }, { "batch_size": 4, "epoch": 0.0812, "step": 203, "tokens_per_device": 11376 }, { "epoch": 0.0812, "loss_ce": 0.13411282002925873, "loss_lvr": 1.1472396850585938, "loss_mode_switch": 0.0, "loss_total": 0.24883678555488586, "step": 203 }, { "batch_size": 1, "epoch": 0.0812, "step": 203, "tokens_per_device": 5005 }, { "epoch": 0.0812, "loss_ce": 0.5566927790641785, "loss_lvr": 0.23590712249279022, "loss_mode_switch": 0.0, "loss_total": 0.5802834630012512, "step": 203 }, { "batch_size": 4, "epoch": 0.0812, "step": 203, "tokens_per_device": 2684 }, { "epoch": 0.0812, "loss_ce": 0.29329001903533936, "loss_lvr": 1.148751974105835, "loss_mode_switch": 0.0, "loss_total": 0.40816521644592285, "step": 203 }, { "batch_size": 1, "epoch": 0.0812, "step": 203, "tokens_per_device": 5114 }, { "epoch": 0.0812, "loss_ce": 0.013118943199515343, "loss_lvr": 0.6923459768295288, "loss_mode_switch": 0.0, "loss_total": 0.08235354721546173, "step": 203 }, { "batch_size": 4, "epoch": 0.0812, "step": 203, "tokens_per_device": 3740 }, { "epoch": 0.0812, "loss_ce": 0.4631847143173218, "loss_lvr": 2.5439767837524414, "loss_mode_switch": 0.0, "loss_total": 0.7175824046134949, "step": 203 }, { "epoch": 0.0816, "grad_norm": 1.6074234247207642, "learning_rate": 9.930339807778056e-06, "loss": 0.373, "step": 204 }, { "batch_size": 4, "epoch": 0.0816, "step": 204, "tokens_per_device": 4672 }, { "epoch": 0.0816, "loss_ce": 0.12452758103609085, "loss_lvr": 1.047580599784851, "loss_mode_switch": 0.0, "loss_total": 0.22928564250469208, "step": 204 }, { "batch_size": 1, "epoch": 0.0816, "step": 204, "tokens_per_device": 5268 }, { "epoch": 0.0816, "loss_ce": 0.055253252387046814, "loss_lvr": 0.7888174057006836, "loss_mode_switch": 0.0, "loss_total": 0.13413499295711517, "step": 204 }, { "batch_size": 4, "epoch": 0.0816, "step": 204, "tokens_per_device": 5140 }, { "epoch": 0.0816, "loss_ce": 0.07694365084171295, "loss_lvr": 1.179076910018921, "loss_mode_switch": 0.0, "loss_total": 0.1948513388633728, "step": 204 }, { "batch_size": 4, "epoch": 0.0816, "step": 204, "tokens_per_device": 4472 }, { "epoch": 0.0816, "loss_ce": 0.2397225946187973, "loss_lvr": 1.1137444972991943, "loss_mode_switch": 0.0, "loss_total": 0.351097047328949, "step": 204 }, { "batch_size": 4, "epoch": 0.0816, "step": 204, "tokens_per_device": 4244 }, { "epoch": 0.0816, "loss_ce": 0.289753258228302, "loss_lvr": 1.1803613901138306, "loss_mode_switch": 0.0, "loss_total": 0.407789409160614, "step": 204 }, { "batch_size": 4, "epoch": 0.0816, "step": 204, "tokens_per_device": 2816 }, { "epoch": 0.0816, "loss_ce": 0.18536116182804108, "loss_lvr": 0.9021024107933044, "loss_mode_switch": 0.0, "loss_total": 0.27557140588760376, "step": 204 }, { "batch_size": 4, "epoch": 0.0816, "step": 204, "tokens_per_device": 2736 }, { "epoch": 0.0816, "loss_ce": 0.47434404492378235, "loss_lvr": 1.5229696035385132, "loss_mode_switch": 0.0, "loss_total": 0.626641035079956, "step": 204 }, { "batch_size": 4, "epoch": 0.0816, "step": 204, "tokens_per_device": 4216 }, { "epoch": 0.0816, "loss_ce": 0.3690223693847656, "loss_lvr": 1.3232144117355347, "loss_mode_switch": 0.0, "loss_total": 0.501343846321106, "step": 204 }, { "epoch": 0.082, "grad_norm": 1.4596859216690063, "learning_rate": 9.929258182531167e-06, "loss": 0.3598, "step": 205 }, { "batch_size": 4, "epoch": 0.082, "step": 205, "tokens_per_device": 1536 }, { "epoch": 0.082, "loss_ce": 0.6744449734687805, "loss_lvr": 1.3253157138824463, "loss_mode_switch": 0.0, "loss_total": 0.8069765567779541, "step": 205 }, { "batch_size": 4, "epoch": 0.082, "step": 205, "tokens_per_device": 3460 }, { "epoch": 0.082, "loss_ce": 0.3316735625267029, "loss_lvr": 1.2372745275497437, "loss_mode_switch": 0.0, "loss_total": 0.4554010033607483, "step": 205 }, { "batch_size": 4, "epoch": 0.082, "step": 205, "tokens_per_device": 2640 }, { "epoch": 0.082, "loss_ce": 0.34923750162124634, "loss_lvr": 1.1881937980651855, "loss_mode_switch": 0.0, "loss_total": 0.46805688738822937, "step": 205 }, { "batch_size": 4, "epoch": 0.082, "step": 205, "tokens_per_device": 4464 }, { "epoch": 0.082, "loss_ce": 0.2852433919906616, "loss_lvr": 1.1202597618103027, "loss_mode_switch": 0.0, "loss_total": 0.3972693681716919, "step": 205 }, { "batch_size": 4, "epoch": 0.082, "step": 205, "tokens_per_device": 2068 }, { "epoch": 0.082, "loss_ce": 0.08806449174880981, "loss_lvr": 0.9021880626678467, "loss_mode_switch": 0.0, "loss_total": 0.17828330397605896, "step": 205 }, { "batch_size": 4, "epoch": 0.082, "step": 205, "tokens_per_device": 7628 }, { "epoch": 0.082, "loss_ce": 0.3862113654613495, "loss_lvr": 1.2365003824234009, "loss_mode_switch": 0.0, "loss_total": 0.509861409664154, "step": 205 }, { "batch_size": 1, "epoch": 0.082, "step": 205, "tokens_per_device": 5263 }, { "epoch": 0.082, "loss_ce": 0.11751580238342285, "loss_lvr": 0.5001399517059326, "loss_mode_switch": 0.0, "loss_total": 0.16752979159355164, "step": 205 }, { "batch_size": 4, "epoch": 0.082, "step": 205, "tokens_per_device": 4704 }, { "epoch": 0.082, "loss_ce": 0.052948709577322006, "loss_lvr": 1.3105459213256836, "loss_mode_switch": 0.0, "loss_total": 0.1840033084154129, "step": 205 }, { "epoch": 0.0824, "grad_norm": 1.8976213932037354, "learning_rate": 9.928168284384468e-06, "loss": 0.4303, "step": 206 }, { "batch_size": 4, "epoch": 0.0824, "step": 206, "tokens_per_device": 1904 }, { "epoch": 0.0824, "loss_ce": 0.10023875534534454, "loss_lvr": 2.7608892917633057, "loss_mode_switch": 0.0, "loss_total": 0.3763276934623718, "step": 206 }, { "batch_size": 1, "epoch": 0.0824, "step": 206, "tokens_per_device": 4888 }, { "epoch": 0.0824, "loss_ce": 0.029729651287198067, "loss_lvr": 0.2856364846229553, "loss_mode_switch": 0.0, "loss_total": 0.05829329788684845, "step": 206 }, { "batch_size": 4, "epoch": 0.0824, "step": 206, "tokens_per_device": 5196 }, { "epoch": 0.0824, "loss_ce": 0.053661592304706573, "loss_lvr": 1.1009987592697144, "loss_mode_switch": 0.0, "loss_total": 0.1637614667415619, "step": 206 }, { "batch_size": 4, "epoch": 0.0824, "step": 206, "tokens_per_device": 12248 }, { "epoch": 0.0824, "loss_ce": 0.2645532488822937, "loss_lvr": 0.9892955422401428, "loss_mode_switch": 0.0, "loss_total": 0.363482803106308, "step": 206 }, { "batch_size": 1, "epoch": 0.0824, "step": 206, "tokens_per_device": 4861 }, { "epoch": 0.0824, "loss_ce": 0.0020170132629573345, "loss_lvr": 0.671528160572052, "loss_mode_switch": 0.0, "loss_total": 0.06916983425617218, "step": 206 }, { "batch_size": 4, "epoch": 0.0824, "step": 206, "tokens_per_device": 9028 }, { "epoch": 0.0824, "loss_ce": 0.5297679305076599, "loss_lvr": 1.2908425331115723, "loss_mode_switch": 0.0, "loss_total": 0.658852219581604, "step": 206 }, { "batch_size": 4, "epoch": 0.0824, "step": 206, "tokens_per_device": 4076 }, { "epoch": 0.0824, "loss_ce": 0.26652517914772034, "loss_lvr": 1.1671218872070312, "loss_mode_switch": 0.0, "loss_total": 0.383237361907959, "step": 206 }, { "batch_size": 1, "epoch": 0.0824, "step": 206, "tokens_per_device": 5107 }, { "epoch": 0.0824, "loss_ce": 0.011662326753139496, "loss_lvr": 0.5252934098243713, "loss_mode_switch": 0.0, "loss_total": 0.06419166922569275, "step": 206 }, { "epoch": 0.0828, "grad_norm": 1.4925020933151245, "learning_rate": 9.927070115167161e-06, "loss": 0.3022, "step": 207 }, { "batch_size": 1, "epoch": 0.0828, "step": 207, "tokens_per_device": 4903 }, { "epoch": 0.0828, "loss_ce": 0.06624461710453033, "loss_lvr": 0.41503334045410156, "loss_mode_switch": 0.0, "loss_total": 0.10774795711040497, "step": 207 }, { "batch_size": 4, "epoch": 0.0828, "step": 207, "tokens_per_device": 4272 }, { "epoch": 0.0828, "loss_ce": 0.3054761588573456, "loss_lvr": 1.4448750019073486, "loss_mode_switch": 0.0, "loss_total": 0.44996365904808044, "step": 207 }, { "batch_size": 4, "epoch": 0.0828, "step": 207, "tokens_per_device": 4304 }, { "epoch": 0.0828, "loss_ce": 0.4412640631198883, "loss_lvr": 1.248855471611023, "loss_mode_switch": 0.0, "loss_total": 0.5661495923995972, "step": 207 }, { "batch_size": 4, "epoch": 0.0828, "step": 207, "tokens_per_device": 3972 }, { "epoch": 0.0828, "loss_ce": 0.7919908165931702, "loss_lvr": 1.3816101551055908, "loss_mode_switch": 0.0, "loss_total": 0.9301518201828003, "step": 207 }, { "batch_size": 4, "epoch": 0.0828, "step": 207, "tokens_per_device": 4112 }, { "epoch": 0.0828, "loss_ce": 0.07375417649745941, "loss_lvr": 1.0815907716751099, "loss_mode_switch": 0.0, "loss_total": 0.18191325664520264, "step": 207 }, { "batch_size": 4, "epoch": 0.0828, "step": 207, "tokens_per_device": 6348 }, { "epoch": 0.0828, "loss_ce": 0.6536849737167358, "loss_lvr": 0.8952624201774597, "loss_mode_switch": 0.0, "loss_total": 0.7432112097740173, "step": 207 }, { "batch_size": 4, "epoch": 0.0828, "step": 207, "tokens_per_device": 5552 }, { "epoch": 0.0828, "loss_ce": 0.11108669638633728, "loss_lvr": 0.9282557964324951, "loss_mode_switch": 0.0, "loss_total": 0.20391228795051575, "step": 207 }, { "batch_size": 4, "epoch": 0.0828, "step": 207, "tokens_per_device": 1356 }, { "epoch": 0.0828, "loss_ce": 0.900364875793457, "loss_lvr": 1.2763770818710327, "loss_mode_switch": 0.0, "loss_total": 1.0280026197433472, "step": 207 }, { "epoch": 0.0832, "grad_norm": 1.6927080154418945, "learning_rate": 9.925963676722335e-06, "loss": 0.391, "step": 208 }, { "batch_size": 4, "epoch": 0.0832, "step": 208, "tokens_per_device": 1624 }, { "epoch": 0.0832, "loss_ce": 0.14120608568191528, "loss_lvr": 1.2469375133514404, "loss_mode_switch": 0.0, "loss_total": 0.2658998370170593, "step": 208 }, { "batch_size": 4, "epoch": 0.0832, "step": 208, "tokens_per_device": 4312 }, { "epoch": 0.0832, "loss_ce": 0.32709503173828125, "loss_lvr": 0.6746863722801208, "loss_mode_switch": 0.0, "loss_total": 0.3945636749267578, "step": 208 }, { "batch_size": 1, "epoch": 0.0832, "step": 208, "tokens_per_device": 7150 }, { "epoch": 0.0832, "loss_ce": 0.003745161695405841, "loss_lvr": 0.5102066397666931, "loss_mode_switch": 0.0, "loss_total": 0.05476582422852516, "step": 208 }, { "batch_size": 4, "epoch": 0.0832, "step": 208, "tokens_per_device": 5444 }, { "epoch": 0.0832, "loss_ce": 0.17775456607341766, "loss_lvr": 0.8570479154586792, "loss_mode_switch": 0.0, "loss_total": 0.26345935463905334, "step": 208 }, { "batch_size": 4, "epoch": 0.0832, "step": 208, "tokens_per_device": 4272 }, { "epoch": 0.0832, "loss_ce": 0.16264019906520844, "loss_lvr": 1.253982424736023, "loss_mode_switch": 0.0, "loss_total": 0.288038432598114, "step": 208 }, { "batch_size": 4, "epoch": 0.0832, "step": 208, "tokens_per_device": 9036 }, { "epoch": 0.0832, "loss_ce": 0.15363237261772156, "loss_lvr": 0.9923054575920105, "loss_mode_switch": 0.0, "loss_total": 0.25286293029785156, "step": 208 }, { "batch_size": 1, "epoch": 0.0832, "step": 208, "tokens_per_device": 5241 }, { "epoch": 0.0832, "loss_ce": 0.13630346953868866, "loss_lvr": 0.41361141204833984, "loss_mode_switch": 0.0, "loss_total": 0.1776646077632904, "step": 208 }, { "batch_size": 4, "epoch": 0.0832, "step": 208, "tokens_per_device": 5788 }, { "epoch": 0.0832, "loss_ce": 0.42681026458740234, "loss_lvr": 1.0049169063568115, "loss_mode_switch": 0.0, "loss_total": 0.5273019671440125, "step": 208 }, { "epoch": 0.0836, "grad_norm": 1.5750523805618286, "learning_rate": 9.92484897090695e-06, "loss": 0.3535, "step": 209 }, { "batch_size": 1, "epoch": 0.0836, "step": 209, "tokens_per_device": 4966 }, { "epoch": 0.0836, "loss_ce": 0.07912008464336395, "loss_lvr": 0.45147082209587097, "loss_mode_switch": 0.0, "loss_total": 0.12426716834306717, "step": 209 }, { "batch_size": 4, "epoch": 0.0836, "step": 209, "tokens_per_device": 4292 }, { "epoch": 0.0836, "loss_ce": 0.19846859574317932, "loss_lvr": 1.1480754613876343, "loss_mode_switch": 0.0, "loss_total": 0.31327614188194275, "step": 209 }, { "batch_size": 1, "epoch": 0.0836, "step": 209, "tokens_per_device": 4922 }, { "epoch": 0.0836, "loss_ce": 0.16399943828582764, "loss_lvr": 0.4253652095794678, "loss_mode_switch": 0.0, "loss_total": 0.2065359652042389, "step": 209 }, { "batch_size": 4, "epoch": 0.0836, "step": 209, "tokens_per_device": 3220 }, { "epoch": 0.0836, "loss_ce": 0.5437396168708801, "loss_lvr": 1.2878183126449585, "loss_mode_switch": 0.0, "loss_total": 0.6725214719772339, "step": 209 }, { "batch_size": 4, "epoch": 0.0836, "step": 209, "tokens_per_device": 1424 }, { "epoch": 0.0836, "loss_ce": 0.5310571193695068, "loss_lvr": 1.2875183820724487, "loss_mode_switch": 0.0, "loss_total": 0.6598089933395386, "step": 209 }, { "batch_size": 4, "epoch": 0.0836, "step": 209, "tokens_per_device": 4228 }, { "epoch": 0.0836, "loss_ce": 0.1742313802242279, "loss_lvr": 1.0927656888961792, "loss_mode_switch": 0.0, "loss_total": 0.28350794315338135, "step": 209 }, { "batch_size": 4, "epoch": 0.0836, "step": 209, "tokens_per_device": 4028 }, { "epoch": 0.0836, "loss_ce": 0.17016510665416718, "loss_lvr": 0.9633872509002686, "loss_mode_switch": 0.0, "loss_total": 0.26650384068489075, "step": 209 }, { "batch_size": 4, "epoch": 0.0836, "step": 209, "tokens_per_device": 4288 }, { "epoch": 0.0836, "loss_ce": 0.3777178227901459, "loss_lvr": 1.1500718593597412, "loss_mode_switch": 0.0, "loss_total": 0.4927250146865845, "step": 209 }, { "epoch": 0.084, "grad_norm": 2.0665910243988037, "learning_rate": 9.923725999591846e-06, "loss": 0.3485, "step": 210 }, { "batch_size": 4, "epoch": 0.084, "step": 210, "tokens_per_device": 3804 }, { "epoch": 0.084, "loss_ce": 0.40996357798576355, "loss_lvr": 1.385721206665039, "loss_mode_switch": 0.0, "loss_total": 0.5485357046127319, "step": 210 }, { "batch_size": 4, "epoch": 0.084, "step": 210, "tokens_per_device": 6172 }, { "epoch": 0.084, "loss_ce": 0.10426993668079376, "loss_lvr": 1.0071161985397339, "loss_mode_switch": 0.0, "loss_total": 0.20498156547546387, "step": 210 }, { "batch_size": 4, "epoch": 0.084, "step": 210, "tokens_per_device": 15676 }, { "epoch": 0.084, "loss_ce": 0.13900893926620483, "loss_lvr": 0.6784693598747253, "loss_mode_switch": 0.0, "loss_total": 0.2068558782339096, "step": 210 }, { "batch_size": 1, "epoch": 0.084, "step": 210, "tokens_per_device": 4873 }, { "epoch": 0.084, "loss_ce": 0.025675233453512192, "loss_lvr": 0.3016678988933563, "loss_mode_switch": 0.0, "loss_total": 0.055842023342847824, "step": 210 }, { "batch_size": 1, "epoch": 0.084, "step": 210, "tokens_per_device": 4731 }, { "epoch": 0.084, "loss_ce": 0.0010724444873631, "loss_lvr": 1.0817484855651855, "loss_mode_switch": 0.0, "loss_total": 0.10924729704856873, "step": 210 }, { "batch_size": 4, "epoch": 0.084, "step": 210, "tokens_per_device": 4324 }, { "epoch": 0.084, "loss_ce": 0.1971380114555359, "loss_lvr": 1.313373327255249, "loss_mode_switch": 0.0, "loss_total": 0.32847535610198975, "step": 210 }, { "batch_size": 4, "epoch": 0.084, "step": 210, "tokens_per_device": 1752 }, { "epoch": 0.084, "loss_ce": 0.06959051638841629, "loss_lvr": 1.8788058757781982, "loss_mode_switch": 0.0, "loss_total": 0.25747111439704895, "step": 210 }, { "batch_size": 4, "epoch": 0.084, "step": 210, "tokens_per_device": 2540 }, { "epoch": 0.084, "loss_ce": 0.21186143159866333, "loss_lvr": 1.4408373832702637, "loss_mode_switch": 0.0, "loss_total": 0.3559451699256897, "step": 210 }, { "epoch": 0.0844, "grad_norm": 1.8425109386444092, "learning_rate": 9.922594764661737e-06, "loss": 0.3841, "step": 211 }, { "batch_size": 4, "epoch": 0.0844, "step": 211, "tokens_per_device": 3796 }, { "epoch": 0.0844, "loss_ce": 0.4106234610080719, "loss_lvr": 1.3628960847854614, "loss_mode_switch": 0.0, "loss_total": 0.5469130873680115, "step": 211 }, { "batch_size": 4, "epoch": 0.0844, "step": 211, "tokens_per_device": 6224 }, { "epoch": 0.0844, "loss_ce": 0.4569838047027588, "loss_lvr": 0.9009724855422974, "loss_mode_switch": 0.0, "loss_total": 0.5470810532569885, "step": 211 }, { "batch_size": 4, "epoch": 0.0844, "step": 211, "tokens_per_device": 2796 }, { "epoch": 0.0844, "loss_ce": 0.24773482978343964, "loss_lvr": 1.0852559804916382, "loss_mode_switch": 0.0, "loss_total": 0.35626041889190674, "step": 211 }, { "batch_size": 4, "epoch": 0.0844, "step": 211, "tokens_per_device": 1228 }, { "epoch": 0.0844, "loss_ce": 0.26337260007858276, "loss_lvr": 1.7939282655715942, "loss_mode_switch": 0.0, "loss_total": 0.44276541471481323, "step": 211 }, { "batch_size": 1, "epoch": 0.0844, "step": 211, "tokens_per_device": 5356 }, { "epoch": 0.0844, "loss_ce": 0.012923100031912327, "loss_lvr": 0.8886191844940186, "loss_mode_switch": 0.0, "loss_total": 0.10178501904010773, "step": 211 }, { "batch_size": 4, "epoch": 0.0844, "step": 211, "tokens_per_device": 4292 }, { "epoch": 0.0844, "loss_ce": 0.1433079093694687, "loss_lvr": 1.2092384099960327, "loss_mode_switch": 0.0, "loss_total": 0.26423174142837524, "step": 211 }, { "batch_size": 4, "epoch": 0.0844, "step": 211, "tokens_per_device": 2636 }, { "epoch": 0.0844, "loss_ce": 0.21251125633716583, "loss_lvr": 1.2613246440887451, "loss_mode_switch": 0.0, "loss_total": 0.33864372968673706, "step": 211 }, { "batch_size": 4, "epoch": 0.0844, "step": 211, "tokens_per_device": 1508 }, { "epoch": 0.0844, "loss_ce": 0.4964248538017273, "loss_lvr": 1.4191819429397583, "loss_mode_switch": 0.0, "loss_total": 0.6383430361747742, "step": 211 }, { "epoch": 0.0848, "grad_norm": 1.5640825033187866, "learning_rate": 9.9214552680152e-06, "loss": 0.3636, "step": 212 }, { "batch_size": 4, "epoch": 0.0848, "step": 212, "tokens_per_device": 3844 }, { "epoch": 0.0848, "loss_ce": 0.6588345170021057, "loss_lvr": 1.4933826923370361, "loss_mode_switch": 0.0, "loss_total": 0.8081728219985962, "step": 212 }, { "batch_size": 4, "epoch": 0.0848, "step": 212, "tokens_per_device": 7300 }, { "epoch": 0.0848, "loss_ce": 0.7864121794700623, "loss_lvr": 0.9980648756027222, "loss_mode_switch": 0.0, "loss_total": 0.8862186670303345, "step": 212 }, { "batch_size": 1, "epoch": 0.0848, "step": 212, "tokens_per_device": 6796 }, { "epoch": 0.0848, "loss_ce": 0.0009484532056376338, "loss_lvr": 0.6205418705940247, "loss_mode_switch": 0.0, "loss_total": 0.06300263851881027, "step": 212 }, { "batch_size": 4, "epoch": 0.0848, "step": 212, "tokens_per_device": 1384 }, { "epoch": 0.0848, "loss_ce": 0.3840845227241516, "loss_lvr": 1.1953296661376953, "loss_mode_switch": 0.0, "loss_total": 0.5036174654960632, "step": 212 }, { "batch_size": 1, "epoch": 0.0848, "step": 212, "tokens_per_device": 5169 }, { "epoch": 0.0848, "loss_ce": 0.02121851034462452, "loss_lvr": 0.7009015083312988, "loss_mode_switch": 0.0, "loss_total": 0.09130866080522537, "step": 212 }, { "batch_size": 4, "epoch": 0.0848, "step": 212, "tokens_per_device": 4220 }, { "epoch": 0.0848, "loss_ce": 0.5173783302307129, "loss_lvr": 1.395712971687317, "loss_mode_switch": 0.0, "loss_total": 0.6569496393203735, "step": 212 }, { "batch_size": 1, "epoch": 0.0848, "step": 212, "tokens_per_device": 4887 }, { "epoch": 0.0848, "loss_ce": 0.03887615352869034, "loss_lvr": 0.9689493775367737, "loss_mode_switch": 0.0, "loss_total": 0.13577109575271606, "step": 212 }, { "batch_size": 4, "epoch": 0.0848, "step": 212, "tokens_per_device": 2596 }, { "epoch": 0.0848, "loss_ce": 0.9124469757080078, "loss_lvr": 1.2504149675369263, "loss_mode_switch": 0.0, "loss_total": 1.0374884605407715, "step": 212 }, { "epoch": 0.0852, "grad_norm": 1.8329917192459106, "learning_rate": 9.920307511564686e-06, "loss": 0.409, "step": 213 }, { "batch_size": 1, "epoch": 0.0852, "step": 213, "tokens_per_device": 5098 }, { "epoch": 0.0852, "loss_ce": 0.11423327028751373, "loss_lvr": 0.8490477204322815, "loss_mode_switch": 0.0, "loss_total": 0.19913804531097412, "step": 213 }, { "batch_size": 1, "epoch": 0.0852, "step": 213, "tokens_per_device": 4855 }, { "epoch": 0.0852, "loss_ce": 0.05179578810930252, "loss_lvr": 0.6459853053092957, "loss_mode_switch": 0.0, "loss_total": 0.11639431864023209, "step": 213 }, { "batch_size": 4, "epoch": 0.0852, "step": 213, "tokens_per_device": 13652 }, { "epoch": 0.0852, "loss_ce": 0.11848682910203934, "loss_lvr": 0.9705790281295776, "loss_mode_switch": 0.0, "loss_total": 0.21554473042488098, "step": 213 }, { "batch_size": 1, "epoch": 0.0852, "step": 213, "tokens_per_device": 4879 }, { "epoch": 0.0852, "loss_ce": 0.0738757774233818, "loss_lvr": 0.349199503660202, "loss_mode_switch": 0.0, "loss_total": 0.10879573225975037, "step": 213 }, { "batch_size": 4, "epoch": 0.0852, "step": 213, "tokens_per_device": 4216 }, { "epoch": 0.0852, "loss_ce": 0.6105290651321411, "loss_lvr": 1.0174587965011597, "loss_mode_switch": 0.0, "loss_total": 0.712274968624115, "step": 213 }, { "batch_size": 4, "epoch": 0.0852, "step": 213, "tokens_per_device": 4144 }, { "epoch": 0.0852, "loss_ce": 0.5732305645942688, "loss_lvr": 1.160121202468872, "loss_mode_switch": 0.0, "loss_total": 0.6892426609992981, "step": 213 }, { "batch_size": 4, "epoch": 0.0852, "step": 213, "tokens_per_device": 7300 }, { "epoch": 0.0852, "loss_ce": 0.0037789831403642893, "loss_lvr": 0.8383293151855469, "loss_mode_switch": 0.0, "loss_total": 0.08761192113161087, "step": 213 }, { "batch_size": 4, "epoch": 0.0852, "step": 213, "tokens_per_device": 6512 }, { "epoch": 0.0852, "loss_ce": 0.3234783411026001, "loss_lvr": 1.417816400527954, "loss_mode_switch": 0.0, "loss_total": 0.46525996923446655, "step": 213 }, { "epoch": 0.0856, "grad_norm": 1.4890333414077759, "learning_rate": 9.9191514972365e-06, "loss": 0.3279, "step": 214 }, { "batch_size": 1, "epoch": 0.0856, "step": 214, "tokens_per_device": 5924 }, { "epoch": 0.0856, "loss_ce": 0.3562510311603546, "loss_lvr": 0.6440705060958862, "loss_mode_switch": 0.0, "loss_total": 0.42065808176994324, "step": 214 }, { "batch_size": 4, "epoch": 0.0856, "step": 214, "tokens_per_device": 3852 }, { "epoch": 0.0856, "loss_ce": 0.3894543945789337, "loss_lvr": 0.987453281879425, "loss_mode_switch": 0.0, "loss_total": 0.48819971084594727, "step": 214 }, { "batch_size": 4, "epoch": 0.0856, "step": 214, "tokens_per_device": 6264 }, { "epoch": 0.0856, "loss_ce": 0.4000999629497528, "loss_lvr": 0.776584804058075, "loss_mode_switch": 0.0, "loss_total": 0.4777584373950958, "step": 214 }, { "batch_size": 1, "epoch": 0.0856, "step": 214, "tokens_per_device": 5021 }, { "epoch": 0.0856, "loss_ce": 0.019254187121987343, "loss_lvr": 1.5518956184387207, "loss_mode_switch": 0.0, "loss_total": 0.17444375157356262, "step": 214 }, { "batch_size": 4, "epoch": 0.0856, "step": 214, "tokens_per_device": 1476 }, { "epoch": 0.0856, "loss_ce": 0.4327966272830963, "loss_lvr": 2.2270500659942627, "loss_mode_switch": 0.0, "loss_total": 0.6555016040802002, "step": 214 }, { "batch_size": 1, "epoch": 0.0856, "step": 214, "tokens_per_device": 5202 }, { "epoch": 0.0856, "loss_ce": 0.21181128919124603, "loss_lvr": 0.40933704376220703, "loss_mode_switch": 0.0, "loss_total": 0.25274500250816345, "step": 214 }, { "batch_size": 4, "epoch": 0.0856, "step": 214, "tokens_per_device": 4324 }, { "epoch": 0.0856, "loss_ce": 0.16173309087753296, "loss_lvr": 1.1733587980270386, "loss_mode_switch": 0.0, "loss_total": 0.2790689766407013, "step": 214 }, { "batch_size": 4, "epoch": 0.0856, "step": 214, "tokens_per_device": 2688 }, { "epoch": 0.0856, "loss_ce": 0.47113901376724243, "loss_lvr": 0.9924865365028381, "loss_mode_switch": 0.0, "loss_total": 0.5703876614570618, "step": 214 }, { "epoch": 0.086, "grad_norm": 2.1016147136688232, "learning_rate": 9.917987226970811e-06, "loss": 0.3394, "step": 215 }, { "batch_size": 4, "epoch": 0.086, "step": 215, "tokens_per_device": 3764 }, { "epoch": 0.086, "loss_ce": 0.08119538426399231, "loss_lvr": 1.148318886756897, "loss_mode_switch": 0.0, "loss_total": 0.19602727890014648, "step": 215 }, { "batch_size": 4, "epoch": 0.086, "step": 215, "tokens_per_device": 2572 }, { "epoch": 0.086, "loss_ce": 0.3771112859249115, "loss_lvr": 0.9140545725822449, "loss_mode_switch": 0.0, "loss_total": 0.4685167372226715, "step": 215 }, { "batch_size": 4, "epoch": 0.086, "step": 215, "tokens_per_device": 9448 }, { "epoch": 0.086, "loss_ce": 0.34363463521003723, "loss_lvr": 1.2558503150939941, "loss_mode_switch": 0.0, "loss_total": 0.4692196846008301, "step": 215 }, { "batch_size": 4, "epoch": 0.086, "step": 215, "tokens_per_device": 3356 }, { "epoch": 0.086, "loss_ce": 0.14513494074344635, "loss_lvr": 2.38638973236084, "loss_mode_switch": 0.0, "loss_total": 0.38377392292022705, "step": 215 }, { "batch_size": 4, "epoch": 0.086, "step": 215, "tokens_per_device": 4664 }, { "epoch": 0.086, "loss_ce": 0.3001471161842346, "loss_lvr": 1.0022958517074585, "loss_mode_switch": 0.0, "loss_total": 0.40037670731544495, "step": 215 }, { "batch_size": 4, "epoch": 0.086, "step": 215, "tokens_per_device": 2560 }, { "epoch": 0.086, "loss_ce": 0.4462168216705322, "loss_lvr": 1.1115168333053589, "loss_mode_switch": 0.0, "loss_total": 0.5573685169219971, "step": 215 }, { "batch_size": 4, "epoch": 0.086, "step": 215, "tokens_per_device": 4452 }, { "epoch": 0.086, "loss_ce": 0.20606718957424164, "loss_lvr": 1.0845741033554077, "loss_mode_switch": 0.0, "loss_total": 0.3145245909690857, "step": 215 }, { "batch_size": 1, "epoch": 0.086, "step": 215, "tokens_per_device": 5187 }, { "epoch": 0.086, "loss_ce": 0.2195458859205246, "loss_lvr": 0.7312021851539612, "loss_mode_switch": 0.0, "loss_total": 0.29266610741615295, "step": 215 }, { "epoch": 0.0864, "grad_norm": 1.9040290117263794, "learning_rate": 9.916814702721641e-06, "loss": 0.3567, "step": 216 }, { "batch_size": 4, "epoch": 0.0864, "step": 216, "tokens_per_device": 3176 }, { "epoch": 0.0864, "loss_ce": 0.29146724939346313, "loss_lvr": 0.5813402533531189, "loss_mode_switch": 0.0, "loss_total": 0.34960126876831055, "step": 216 }, { "batch_size": 4, "epoch": 0.0864, "step": 216, "tokens_per_device": 4008 }, { "epoch": 0.0864, "loss_ce": 0.04148249700665474, "loss_lvr": 1.0340944528579712, "loss_mode_switch": 0.0, "loss_total": 0.14489194750785828, "step": 216 }, { "batch_size": 4, "epoch": 0.0864, "step": 216, "tokens_per_device": 1288 }, { "epoch": 0.0864, "loss_ce": 0.4368525445461273, "loss_lvr": 1.4178513288497925, "loss_mode_switch": 0.0, "loss_total": 0.5786376595497131, "step": 216 }, { "batch_size": 4, "epoch": 0.0864, "step": 216, "tokens_per_device": 4412 }, { "epoch": 0.0864, "loss_ce": 0.14166511595249176, "loss_lvr": 0.9930344223976135, "loss_mode_switch": 0.0, "loss_total": 0.24096855521202087, "step": 216 }, { "batch_size": 4, "epoch": 0.0864, "step": 216, "tokens_per_device": 4788 }, { "epoch": 0.0864, "loss_ce": 0.24883082509040833, "loss_lvr": 1.0103696584701538, "loss_mode_switch": 0.0, "loss_total": 0.3498677909374237, "step": 216 }, { "batch_size": 4, "epoch": 0.0864, "step": 216, "tokens_per_device": 4388 }, { "epoch": 0.0864, "loss_ce": 0.5929213166236877, "loss_lvr": 1.1493949890136719, "loss_mode_switch": 0.0, "loss_total": 0.7078608274459839, "step": 216 }, { "batch_size": 1, "epoch": 0.0864, "step": 216, "tokens_per_device": 5108 }, { "epoch": 0.0864, "loss_ce": 0.02862345613539219, "loss_lvr": 1.3813862800598145, "loss_mode_switch": 0.0, "loss_total": 0.1667620986700058, "step": 216 }, { "batch_size": 4, "epoch": 0.0864, "step": 216, "tokens_per_device": 3492 }, { "epoch": 0.0864, "loss_ce": 0.43719425797462463, "loss_lvr": 1.2430781126022339, "loss_mode_switch": 0.0, "loss_total": 0.5615020990371704, "step": 216 }, { "epoch": 0.0868, "grad_norm": 1.7866981029510498, "learning_rate": 9.915633926456874e-06, "loss": 0.3653, "step": 217 }, { "batch_size": 1, "epoch": 0.0868, "step": 217, "tokens_per_device": 5185 }, { "epoch": 0.0868, "loss_ce": 0.025815216824412346, "loss_lvr": 0.37584906816482544, "loss_mode_switch": 0.0, "loss_total": 0.06340012699365616, "step": 217 }, { "batch_size": 1, "epoch": 0.0868, "step": 217, "tokens_per_device": 4976 }, { "epoch": 0.0868, "loss_ce": 0.009619445540010929, "loss_lvr": 0.4833643138408661, "loss_mode_switch": 0.0, "loss_total": 0.05795587599277496, "step": 217 }, { "batch_size": 4, "epoch": 0.0868, "step": 217, "tokens_per_device": 5008 }, { "epoch": 0.0868, "loss_ce": 0.24873724579811096, "loss_lvr": 1.2972115278244019, "loss_mode_switch": 0.0, "loss_total": 0.3784583806991577, "step": 217 }, { "batch_size": 1, "epoch": 0.0868, "step": 217, "tokens_per_device": 5142 }, { "epoch": 0.0868, "loss_ce": 0.1374993920326233, "loss_lvr": 0.47200503945350647, "loss_mode_switch": 0.0, "loss_total": 0.1846998929977417, "step": 217 }, { "batch_size": 1, "epoch": 0.0868, "step": 217, "tokens_per_device": 5097 }, { "epoch": 0.0868, "loss_ce": 0.0036585903726518154, "loss_lvr": 0.8747028112411499, "loss_mode_switch": 0.0, "loss_total": 0.0911288782954216, "step": 217 }, { "batch_size": 1, "epoch": 0.0868, "step": 217, "tokens_per_device": 4900 }, { "epoch": 0.0868, "loss_ce": 1.0700575113296509, "loss_lvr": 1.7763936519622803, "loss_mode_switch": 0.0, "loss_total": 1.247696876525879, "step": 217 }, { "batch_size": 4, "epoch": 0.0868, "step": 217, "tokens_per_device": 5080 }, { "epoch": 0.0868, "loss_ce": 0.5211555361747742, "loss_lvr": 1.1744444370269775, "loss_mode_switch": 0.0, "loss_total": 0.6385999917984009, "step": 217 }, { "batch_size": 1, "epoch": 0.0868, "step": 217, "tokens_per_device": 5172 }, { "epoch": 0.0868, "loss_ce": 0.06655330210924149, "loss_lvr": 0.6075640320777893, "loss_mode_switch": 0.0, "loss_total": 0.12730970978736877, "step": 217 }, { "epoch": 0.0872, "grad_norm": 1.7879794836044312, "learning_rate": 9.914444900158234e-06, "loss": 0.3481, "step": 218 }, { "batch_size": 4, "epoch": 0.0872, "step": 218, "tokens_per_device": 6448 }, { "epoch": 0.0872, "loss_ce": 0.3901232182979584, "loss_lvr": 1.0001583099365234, "loss_mode_switch": 0.0, "loss_total": 0.49013906717300415, "step": 218 }, { "batch_size": 4, "epoch": 0.0872, "step": 218, "tokens_per_device": 9548 }, { "epoch": 0.0872, "loss_ce": 0.2875156104564667, "loss_lvr": 1.0064510107040405, "loss_mode_switch": 0.0, "loss_total": 0.38816070556640625, "step": 218 }, { "batch_size": 4, "epoch": 0.0872, "step": 218, "tokens_per_device": 4856 }, { "epoch": 0.0872, "loss_ce": 0.47101038694381714, "loss_lvr": 1.124111533164978, "loss_mode_switch": 0.0, "loss_total": 0.583421528339386, "step": 218 }, { "batch_size": 4, "epoch": 0.0872, "step": 218, "tokens_per_device": 5188 }, { "epoch": 0.0872, "loss_ce": 0.2921479344367981, "loss_lvr": 1.2801737785339355, "loss_mode_switch": 0.0, "loss_total": 0.4201653003692627, "step": 218 }, { "batch_size": 1, "epoch": 0.0872, "step": 218, "tokens_per_device": 4756 }, { "epoch": 0.0872, "loss_ce": 0.07232264429330826, "loss_lvr": 0.5327354669570923, "loss_mode_switch": 0.0, "loss_total": 0.12559619545936584, "step": 218 }, { "batch_size": 4, "epoch": 0.0872, "step": 218, "tokens_per_device": 5352 }, { "epoch": 0.0872, "loss_ce": 0.49125444889068604, "loss_lvr": 1.1152020692825317, "loss_mode_switch": 0.0, "loss_total": 0.6027746796607971, "step": 218 }, { "batch_size": 4, "epoch": 0.0872, "step": 218, "tokens_per_device": 8600 }, { "epoch": 0.0872, "loss_ce": 0.22009801864624023, "loss_lvr": 1.0456258058547974, "loss_mode_switch": 0.0, "loss_total": 0.32466059923171997, "step": 218 }, { "batch_size": 1, "epoch": 0.0872, "step": 218, "tokens_per_device": 5496 }, { "epoch": 0.0872, "loss_ce": 0.016327913850545883, "loss_lvr": 1.041060209274292, "loss_mode_switch": 0.0, "loss_total": 0.12043394148349762, "step": 218 }, { "epoch": 0.0876, "grad_norm": 2.9172403812408447, "learning_rate": 9.91324762582129e-06, "loss": 0.3598, "step": 219 }, { "batch_size": 4, "epoch": 0.0876, "step": 219, "tokens_per_device": 7080 }, { "epoch": 0.0876, "loss_ce": 0.20504426956176758, "loss_lvr": 1.3579522371292114, "loss_mode_switch": 0.0, "loss_total": 0.3408395051956177, "step": 219 }, { "batch_size": 4, "epoch": 0.0876, "step": 219, "tokens_per_device": 5484 }, { "epoch": 0.0876, "loss_ce": 0.31860417127609253, "loss_lvr": 0.9892298579216003, "loss_mode_switch": 0.0, "loss_total": 0.4175271689891815, "step": 219 }, { "batch_size": 4, "epoch": 0.0876, "step": 219, "tokens_per_device": 4256 }, { "epoch": 0.0876, "loss_ce": 0.3147673010826111, "loss_lvr": 1.3610633611679077, "loss_mode_switch": 0.0, "loss_total": 0.45087364315986633, "step": 219 }, { "batch_size": 4, "epoch": 0.0876, "step": 219, "tokens_per_device": 7944 }, { "epoch": 0.0876, "loss_ce": 0.11957511305809021, "loss_lvr": 1.0825071334838867, "loss_mode_switch": 0.0, "loss_total": 0.2278258204460144, "step": 219 }, { "batch_size": 4, "epoch": 0.0876, "step": 219, "tokens_per_device": 3756 }, { "epoch": 0.0876, "loss_ce": 0.5613085627555847, "loss_lvr": 1.3276658058166504, "loss_mode_switch": 0.0, "loss_total": 0.6940751671791077, "step": 219 }, { "batch_size": 1, "epoch": 0.0876, "step": 219, "tokens_per_device": 4865 }, { "epoch": 0.0876, "loss_ce": 0.00238524260930717, "loss_lvr": 0.8098856210708618, "loss_mode_switch": 0.0, "loss_total": 0.08337380737066269, "step": 219 }, { "batch_size": 4, "epoch": 0.0876, "step": 219, "tokens_per_device": 4456 }, { "epoch": 0.0876, "loss_ce": 0.2964145541191101, "loss_lvr": 1.3504084348678589, "loss_mode_switch": 0.0, "loss_total": 0.4314554035663605, "step": 219 }, { "batch_size": 4, "epoch": 0.0876, "step": 219, "tokens_per_device": 3792 }, { "epoch": 0.0876, "loss_ce": 0.3337528109550476, "loss_lvr": 1.2908682823181152, "loss_mode_switch": 0.0, "loss_total": 0.46283966302871704, "step": 219 }, { "epoch": 0.088, "grad_norm": 1.749126672744751, "learning_rate": 9.912042105455462e-06, "loss": 0.3867, "step": 220 }, { "batch_size": 1, "epoch": 0.088, "step": 220, "tokens_per_device": 5082 }, { "epoch": 0.088, "loss_ce": 0.020812587812542915, "loss_lvr": 1.1022506952285767, "loss_mode_switch": 0.0, "loss_total": 0.1310376673936844, "step": 220 }, { "batch_size": 1, "epoch": 0.088, "step": 220, "tokens_per_device": 5083 }, { "epoch": 0.088, "loss_ce": 0.07724825292825699, "loss_lvr": 1.0146188735961914, "loss_mode_switch": 0.0, "loss_total": 0.17871013283729553, "step": 220 }, { "batch_size": 1, "epoch": 0.088, "step": 220, "tokens_per_device": 5706 }, { "epoch": 0.088, "loss_ce": 0.9416065216064453, "loss_lvr": 0.48770636320114136, "loss_mode_switch": 0.0, "loss_total": 0.9903771877288818, "step": 220 }, { "batch_size": 1, "epoch": 0.088, "step": 220, "tokens_per_device": 4899 }, { "epoch": 0.088, "loss_ce": 0.2642005980014801, "loss_lvr": 1.8108172416687012, "loss_mode_switch": 0.0, "loss_total": 0.44528234004974365, "step": 220 }, { "batch_size": 4, "epoch": 0.088, "step": 220, "tokens_per_device": 6884 }, { "epoch": 0.088, "loss_ce": 0.2082357257604599, "loss_lvr": 0.6711025238037109, "loss_mode_switch": 0.0, "loss_total": 0.27534598112106323, "step": 220 }, { "batch_size": 4, "epoch": 0.088, "step": 220, "tokens_per_device": 3832 }, { "epoch": 0.088, "loss_ce": 0.6426720023155212, "loss_lvr": 1.646086573600769, "loss_mode_switch": 0.0, "loss_total": 0.8072806596755981, "step": 220 }, { "batch_size": 1, "epoch": 0.088, "step": 220, "tokens_per_device": 5195 }, { "epoch": 0.088, "loss_ce": 0.7126015424728394, "loss_lvr": 0.6407134532928467, "loss_mode_switch": 0.0, "loss_total": 0.776672899723053, "step": 220 }, { "batch_size": 4, "epoch": 0.088, "step": 220, "tokens_per_device": 13556 }, { "epoch": 0.088, "loss_ce": 0.5003125667572021, "loss_lvr": 1.390478491783142, "loss_mode_switch": 0.0, "loss_total": 0.6393604278564453, "step": 220 }, { "epoch": 0.0884, "grad_norm": 2.4649498462677, "learning_rate": 9.910828341084006e-06, "loss": 0.3514, "step": 221 }, { "batch_size": 4, "epoch": 0.0884, "step": 221, "tokens_per_device": 4244 }, { "epoch": 0.0884, "loss_ce": 0.6789183616638184, "loss_lvr": 1.6994895935058594, "loss_mode_switch": 0.0, "loss_total": 0.8488672971725464, "step": 221 }, { "batch_size": 4, "epoch": 0.0884, "step": 221, "tokens_per_device": 4820 }, { "epoch": 0.0884, "loss_ce": 0.10117311030626297, "loss_lvr": 1.2604453563690186, "loss_mode_switch": 0.0, "loss_total": 0.2272176444530487, "step": 221 }, { "batch_size": 4, "epoch": 0.0884, "step": 221, "tokens_per_device": 5096 }, { "epoch": 0.0884, "loss_ce": 0.2711102366447449, "loss_lvr": 0.9092897772789001, "loss_mode_switch": 0.0, "loss_total": 0.3620392084121704, "step": 221 }, { "batch_size": 4, "epoch": 0.0884, "step": 221, "tokens_per_device": 3940 }, { "epoch": 0.0884, "loss_ce": 0.7802286148071289, "loss_lvr": 1.1308197975158691, "loss_mode_switch": 0.0, "loss_total": 0.8933106064796448, "step": 221 }, { "batch_size": 1, "epoch": 0.0884, "step": 221, "tokens_per_device": 5124 }, { "epoch": 0.0884, "loss_ce": 0.15749837458133698, "loss_lvr": 0.511736273765564, "loss_mode_switch": 0.0, "loss_total": 0.20867200195789337, "step": 221 }, { "batch_size": 1, "epoch": 0.0884, "step": 221, "tokens_per_device": 5471 }, { "epoch": 0.0884, "loss_ce": 0.03414539992809296, "loss_lvr": 0.9139918088912964, "loss_mode_switch": 0.0, "loss_total": 0.12554457783699036, "step": 221 }, { "batch_size": 1, "epoch": 0.0884, "step": 221, "tokens_per_device": 5631 }, { "epoch": 0.0884, "loss_ce": 0.1433350294828415, "loss_lvr": 0.5352687835693359, "loss_mode_switch": 0.0, "loss_total": 0.19686190783977509, "step": 221 }, { "batch_size": 1, "epoch": 0.0884, "step": 221, "tokens_per_device": 5175 }, { "epoch": 0.0884, "loss_ce": 0.005840725265443325, "loss_lvr": 1.3021504878997803, "loss_mode_switch": 0.0, "loss_total": 0.13605576753616333, "step": 221 }, { "epoch": 0.0888, "grad_norm": 1.432273030281067, "learning_rate": 9.909606334744013e-06, "loss": 0.2989, "step": 222 }, { "batch_size": 4, "epoch": 0.0888, "step": 222, "tokens_per_device": 13648 }, { "epoch": 0.0888, "loss_ce": 0.036064643412828445, "loss_lvr": 0.9172207713127136, "loss_mode_switch": 0.0, "loss_total": 0.12778672575950623, "step": 222 }, { "batch_size": 4, "epoch": 0.0888, "step": 222, "tokens_per_device": 8752 }, { "epoch": 0.0888, "loss_ce": 0.08687803149223328, "loss_lvr": 0.8807967901229858, "loss_mode_switch": 0.0, "loss_total": 0.17495772242546082, "step": 222 }, { "batch_size": 1, "epoch": 0.0888, "step": 222, "tokens_per_device": 5051 }, { "epoch": 0.0888, "loss_ce": 0.0735989362001419, "loss_lvr": 0.6046361327171326, "loss_mode_switch": 0.0, "loss_total": 0.13406255841255188, "step": 222 }, { "batch_size": 4, "epoch": 0.0888, "step": 222, "tokens_per_device": 3896 }, { "epoch": 0.0888, "loss_ce": 0.584925651550293, "loss_lvr": 1.1025022268295288, "loss_mode_switch": 0.0, "loss_total": 0.6951758861541748, "step": 222 }, { "batch_size": 1, "epoch": 0.0888, "step": 222, "tokens_per_device": 5121 }, { "epoch": 0.0888, "loss_ce": 0.040528688579797745, "loss_lvr": 0.9774559736251831, "loss_mode_switch": 0.0, "loss_total": 0.13827428221702576, "step": 222 }, { "batch_size": 1, "epoch": 0.0888, "step": 222, "tokens_per_device": 5086 }, { "epoch": 0.0888, "loss_ce": 0.5649641156196594, "loss_lvr": 0.8398590683937073, "loss_mode_switch": 0.0, "loss_total": 0.6489500403404236, "step": 222 }, { "batch_size": 1, "epoch": 0.0888, "step": 222, "tokens_per_device": 5140 }, { "epoch": 0.0888, "loss_ce": 0.05629502981901169, "loss_lvr": 0.6842113733291626, "loss_mode_switch": 0.0, "loss_total": 0.12471617013216019, "step": 222 }, { "batch_size": 1, "epoch": 0.0888, "step": 222, "tokens_per_device": 5300 }, { "epoch": 0.0888, "loss_ce": 0.09831935912370682, "loss_lvr": 1.4752198457717896, "loss_mode_switch": 0.0, "loss_total": 0.2458413541316986, "step": 222 }, { "epoch": 0.0892, "grad_norm": 1.6746013164520264, "learning_rate": 9.908376088486407e-06, "loss": 0.3438, "step": 223 }, { "batch_size": 4, "epoch": 0.0892, "step": 223, "tokens_per_device": 3624 }, { "epoch": 0.0892, "loss_ce": 0.3086310625076294, "loss_lvr": 1.5553146600723267, "loss_mode_switch": 0.0, "loss_total": 0.46416252851486206, "step": 223 }, { "batch_size": 4, "epoch": 0.0892, "step": 223, "tokens_per_device": 5376 }, { "epoch": 0.0892, "loss_ce": 0.1255103498697281, "loss_lvr": 1.19588303565979, "loss_mode_switch": 0.0, "loss_total": 0.24509865045547485, "step": 223 }, { "batch_size": 4, "epoch": 0.0892, "step": 223, "tokens_per_device": 4448 }, { "epoch": 0.0892, "loss_ce": 0.02781764045357704, "loss_lvr": 0.900205671787262, "loss_mode_switch": 0.0, "loss_total": 0.11783820390701294, "step": 223 }, { "batch_size": 4, "epoch": 0.0892, "step": 223, "tokens_per_device": 1556 }, { "epoch": 0.0892, "loss_ce": 0.6054670214653015, "loss_lvr": 1.0160267353057861, "loss_mode_switch": 0.0, "loss_total": 0.7070696949958801, "step": 223 }, { "batch_size": 4, "epoch": 0.0892, "step": 223, "tokens_per_device": 4220 }, { "epoch": 0.0892, "loss_ce": 0.13261768221855164, "loss_lvr": 0.9816181063652039, "loss_mode_switch": 0.0, "loss_total": 0.2307794988155365, "step": 223 }, { "batch_size": 4, "epoch": 0.0892, "step": 223, "tokens_per_device": 4392 }, { "epoch": 0.0892, "loss_ce": 0.02812536060810089, "loss_lvr": 1.035845160484314, "loss_mode_switch": 0.0, "loss_total": 0.13170987367630005, "step": 223 }, { "batch_size": 4, "epoch": 0.0892, "step": 223, "tokens_per_device": 5732 }, { "epoch": 0.0892, "loss_ce": 0.327115923166275, "loss_lvr": 0.9347677826881409, "loss_mode_switch": 0.0, "loss_total": 0.42059269547462463, "step": 223 }, { "batch_size": 1, "epoch": 0.0892, "step": 223, "tokens_per_device": 4884 }, { "epoch": 0.0892, "loss_ce": 0.2967677414417267, "loss_lvr": 0.5098332762718201, "loss_mode_switch": 0.0, "loss_total": 0.34775108098983765, "step": 223 }, { "epoch": 0.0896, "grad_norm": 1.8961578607559204, "learning_rate": 9.907137604375941e-06, "loss": 0.2889, "step": 224 }, { "batch_size": 1, "epoch": 0.0896, "step": 224, "tokens_per_device": 5134 }, { "epoch": 0.0896, "loss_ce": 0.0608040913939476, "loss_lvr": 0.47577977180480957, "loss_mode_switch": 0.0, "loss_total": 0.10838206857442856, "step": 224 }, { "batch_size": 4, "epoch": 0.0896, "step": 224, "tokens_per_device": 1664 }, { "epoch": 0.0896, "loss_ce": 0.6434913873672485, "loss_lvr": 1.3306646347045898, "loss_mode_switch": 0.0, "loss_total": 0.7765578627586365, "step": 224 }, { "batch_size": 4, "epoch": 0.0896, "step": 224, "tokens_per_device": 1164 }, { "epoch": 0.0896, "loss_ce": 0.09533513337373734, "loss_lvr": 1.4162840843200684, "loss_mode_switch": 0.0, "loss_total": 0.23696354031562805, "step": 224 }, { "batch_size": 4, "epoch": 0.0896, "step": 224, "tokens_per_device": 4512 }, { "epoch": 0.0896, "loss_ce": 0.3387511074542999, "loss_lvr": 1.0743788480758667, "loss_mode_switch": 0.0, "loss_total": 0.4461889863014221, "step": 224 }, { "batch_size": 1, "epoch": 0.0896, "step": 224, "tokens_per_device": 5096 }, { "epoch": 0.0896, "loss_ce": 0.006709063891321421, "loss_lvr": 0.6777951717376709, "loss_mode_switch": 0.0, "loss_total": 0.0744885802268982, "step": 224 }, { "batch_size": 4, "epoch": 0.0896, "step": 224, "tokens_per_device": 3808 }, { "epoch": 0.0896, "loss_ce": 0.3764456808567047, "loss_lvr": 1.4938751459121704, "loss_mode_switch": 0.0, "loss_total": 0.5258331894874573, "step": 224 }, { "batch_size": 4, "epoch": 0.0896, "step": 224, "tokens_per_device": 3824 }, { "epoch": 0.0896, "loss_ce": 0.15796256065368652, "loss_lvr": 0.9999056458473206, "loss_mode_switch": 0.0, "loss_total": 0.25795313715934753, "step": 224 }, { "batch_size": 4, "epoch": 0.0896, "step": 224, "tokens_per_device": 2652 }, { "epoch": 0.0896, "loss_ce": 0.4428936839103699, "loss_lvr": 1.1604790687561035, "loss_mode_switch": 0.0, "loss_total": 0.5589416027069092, "step": 224 }, { "epoch": 0.09, "grad_norm": 1.5533148050308228, "learning_rate": 9.905890884491196e-06, "loss": 0.4095, "step": 225 }, { "batch_size": 4, "epoch": 0.09, "step": 225, "tokens_per_device": 3432 }, { "epoch": 0.09, "loss_ce": 0.04273222014307976, "loss_lvr": 1.0822157859802246, "loss_mode_switch": 0.0, "loss_total": 0.15095379948616028, "step": 225 }, { "batch_size": 4, "epoch": 0.09, "step": 225, "tokens_per_device": 2780 }, { "epoch": 0.09, "loss_ce": 0.018978307023644447, "loss_lvr": 0.838760495185852, "loss_mode_switch": 0.0, "loss_total": 0.10285435616970062, "step": 225 }, { "batch_size": 1, "epoch": 0.09, "step": 225, "tokens_per_device": 4861 }, { "epoch": 0.09, "loss_ce": 0.01423260010778904, "loss_lvr": 0.6823895573616028, "loss_mode_switch": 0.0, "loss_total": 0.08247155696153641, "step": 225 }, { "batch_size": 4, "epoch": 0.09, "step": 225, "tokens_per_device": 5828 }, { "epoch": 0.09, "loss_ce": 0.4484163820743561, "loss_lvr": 0.9316967725753784, "loss_mode_switch": 0.0, "loss_total": 0.5415860414505005, "step": 225 }, { "batch_size": 4, "epoch": 0.09, "step": 225, "tokens_per_device": 3856 }, { "epoch": 0.09, "loss_ce": 0.26318755745887756, "loss_lvr": 0.8839026093482971, "loss_mode_switch": 0.0, "loss_total": 0.3515778183937073, "step": 225 }, { "batch_size": 4, "epoch": 0.09, "step": 225, "tokens_per_device": 2620 }, { "epoch": 0.09, "loss_ce": 0.2740113139152527, "loss_lvr": 1.1644880771636963, "loss_mode_switch": 0.0, "loss_total": 0.39046013355255127, "step": 225 }, { "batch_size": 1, "epoch": 0.09, "step": 225, "tokens_per_device": 4916 }, { "epoch": 0.09, "loss_ce": 0.4226696193218231, "loss_lvr": 0.6292735934257507, "loss_mode_switch": 0.0, "loss_total": 0.48559698462486267, "step": 225 }, { "batch_size": 1, "epoch": 0.09, "step": 225, "tokens_per_device": 4880 }, { "epoch": 0.09, "loss_ce": 0.5268383622169495, "loss_lvr": 0.23008470237255096, "loss_mode_switch": 0.0, "loss_total": 0.5498468279838562, "step": 225 }, { "epoch": 0.0904, "grad_norm": 1.619218111038208, "learning_rate": 9.904635930924573e-06, "loss": 0.3436, "step": 226 }, { "batch_size": 4, "epoch": 0.0904, "step": 226, "tokens_per_device": 5500 }, { "epoch": 0.0904, "loss_ce": 0.4320319890975952, "loss_lvr": 1.0007930994033813, "loss_mode_switch": 0.0, "loss_total": 0.5321112871170044, "step": 226 }, { "batch_size": 4, "epoch": 0.0904, "step": 226, "tokens_per_device": 4452 }, { "epoch": 0.0904, "loss_ce": 0.13871875405311584, "loss_lvr": 1.1697221994400024, "loss_mode_switch": 0.0, "loss_total": 0.2556909918785095, "step": 226 }, { "batch_size": 4, "epoch": 0.0904, "step": 226, "tokens_per_device": 4232 }, { "epoch": 0.0904, "loss_ce": 0.19624286890029907, "loss_lvr": 1.3537639379501343, "loss_mode_switch": 0.0, "loss_total": 0.3316192626953125, "step": 226 }, { "batch_size": 4, "epoch": 0.0904, "step": 226, "tokens_per_device": 1684 }, { "epoch": 0.0904, "loss_ce": 0.49281826615333557, "loss_lvr": 1.200466275215149, "loss_mode_switch": 0.0, "loss_total": 0.6128649115562439, "step": 226 }, { "batch_size": 4, "epoch": 0.0904, "step": 226, "tokens_per_device": 16108 }, { "epoch": 0.0904, "loss_ce": 0.10511178523302078, "loss_lvr": 1.0903699398040771, "loss_mode_switch": 0.0, "loss_total": 0.21414878964424133, "step": 226 }, { "batch_size": 1, "epoch": 0.0904, "step": 226, "tokens_per_device": 5102 }, { "epoch": 0.0904, "loss_ce": 0.1879817694425583, "loss_lvr": 0.778695821762085, "loss_mode_switch": 0.0, "loss_total": 0.26585134863853455, "step": 226 }, { "batch_size": 4, "epoch": 0.0904, "step": 226, "tokens_per_device": 5696 }, { "epoch": 0.0904, "loss_ce": 0.1368907243013382, "loss_lvr": 1.143711805343628, "loss_mode_switch": 0.0, "loss_total": 0.2512618899345398, "step": 226 }, { "batch_size": 4, "epoch": 0.0904, "step": 226, "tokens_per_device": 5792 }, { "epoch": 0.0904, "loss_ce": 0.23696701228618622, "loss_lvr": 1.5180490016937256, "loss_mode_switch": 0.0, "loss_total": 0.3887719213962555, "step": 226 }, { "epoch": 0.0908, "grad_norm": 1.5580899715423584, "learning_rate": 9.903372745782294e-06, "loss": 0.3158, "step": 227 }, { "batch_size": 1, "epoch": 0.0908, "step": 227, "tokens_per_device": 4864 }, { "epoch": 0.0908, "loss_ce": 0.00763336569070816, "loss_lvr": 1.390049934387207, "loss_mode_switch": 0.0, "loss_total": 0.14663836359977722, "step": 227 }, { "batch_size": 4, "epoch": 0.0908, "step": 227, "tokens_per_device": 3984 }, { "epoch": 0.0908, "loss_ce": 0.3339785039424896, "loss_lvr": 0.7718725204467773, "loss_mode_switch": 0.0, "loss_total": 0.4111657738685608, "step": 227 }, { "batch_size": 4, "epoch": 0.0908, "step": 227, "tokens_per_device": 4780 }, { "epoch": 0.0908, "loss_ce": 0.25546619296073914, "loss_lvr": 1.0107311010360718, "loss_mode_switch": 0.0, "loss_total": 0.3565393090248108, "step": 227 }, { "batch_size": 4, "epoch": 0.0908, "step": 227, "tokens_per_device": 1744 }, { "epoch": 0.0908, "loss_ce": 0.5535762906074524, "loss_lvr": 1.2377673387527466, "loss_mode_switch": 0.0, "loss_total": 0.677353024482727, "step": 227 }, { "batch_size": 4, "epoch": 0.0908, "step": 227, "tokens_per_device": 4728 }, { "epoch": 0.0908, "loss_ce": 0.011580045334994793, "loss_lvr": 0.8722790479660034, "loss_mode_switch": 0.0, "loss_total": 0.09880794584751129, "step": 227 }, { "batch_size": 1, "epoch": 0.0908, "step": 227, "tokens_per_device": 5071 }, { "epoch": 0.0908, "loss_ce": 0.10555073618888855, "loss_lvr": 0.8270956873893738, "loss_mode_switch": 0.0, "loss_total": 0.18826031684875488, "step": 227 }, { "batch_size": 1, "epoch": 0.0908, "step": 227, "tokens_per_device": 5122 }, { "epoch": 0.0908, "loss_ce": 0.011010853573679924, "loss_lvr": 0.7305775880813599, "loss_mode_switch": 0.0, "loss_total": 0.08406861871480942, "step": 227 }, { "batch_size": 4, "epoch": 0.0908, "step": 227, "tokens_per_device": 4664 }, { "epoch": 0.0908, "loss_ce": 0.16820141673088074, "loss_lvr": 0.9859029054641724, "loss_mode_switch": 0.0, "loss_total": 0.2667917013168335, "step": 227 }, { "epoch": 0.0912, "grad_norm": 1.6264969110488892, "learning_rate": 9.902101331184391e-06, "loss": 0.3011, "step": 228 }, { "batch_size": 1, "epoch": 0.0912, "step": 228, "tokens_per_device": 5821 }, { "epoch": 0.0912, "loss_ce": 0.01865442469716072, "loss_lvr": 0.9911717772483826, "loss_mode_switch": 0.0, "loss_total": 0.11777161061763763, "step": 228 }, { "batch_size": 1, "epoch": 0.0912, "step": 228, "tokens_per_device": 5029 }, { "epoch": 0.0912, "loss_ce": 0.012688291259109974, "loss_lvr": 0.7741040587425232, "loss_mode_switch": 0.0, "loss_total": 0.09009870141744614, "step": 228 }, { "batch_size": 4, "epoch": 0.0912, "step": 228, "tokens_per_device": 1200 }, { "epoch": 0.0912, "loss_ce": 0.28990471363067627, "loss_lvr": 1.9383677244186401, "loss_mode_switch": 0.0, "loss_total": 0.48374149203300476, "step": 228 }, { "batch_size": 1, "epoch": 0.0912, "step": 228, "tokens_per_device": 4861 }, { "epoch": 0.0912, "loss_ce": 0.017018364742398262, "loss_lvr": 0.92447429895401, "loss_mode_switch": 0.0, "loss_total": 0.10946579277515411, "step": 228 }, { "batch_size": 1, "epoch": 0.0912, "step": 228, "tokens_per_device": 4748 }, { "epoch": 0.0912, "loss_ce": 0.0033123798202723265, "loss_lvr": 0.3492860496044159, "loss_mode_switch": 0.0, "loss_total": 0.038240984082221985, "step": 228 }, { "batch_size": 4, "epoch": 0.0912, "step": 228, "tokens_per_device": 6612 }, { "epoch": 0.0912, "loss_ce": 0.49038252234458923, "loss_lvr": 1.0420751571655273, "loss_mode_switch": 0.0, "loss_total": 0.5945900678634644, "step": 228 }, { "batch_size": 4, "epoch": 0.0912, "step": 228, "tokens_per_device": 5160 }, { "epoch": 0.0912, "loss_ce": 0.7685553431510925, "loss_lvr": 0.836392343044281, "loss_mode_switch": 0.0, "loss_total": 0.8521945476531982, "step": 228 }, { "batch_size": 1, "epoch": 0.0912, "step": 228, "tokens_per_device": 5121 }, { "epoch": 0.0912, "loss_ce": 0.31789517402648926, "loss_lvr": 0.17065778374671936, "loss_mode_switch": 0.0, "loss_total": 0.3349609375, "step": 228 }, { "epoch": 0.0916, "grad_norm": 1.5728724002838135, "learning_rate": 9.900821689264715e-06, "loss": 0.3132, "step": 229 }, { "batch_size": 4, "epoch": 0.0916, "step": 229, "tokens_per_device": 3800 }, { "epoch": 0.0916, "loss_ce": 0.13708636164665222, "loss_lvr": 1.3117271661758423, "loss_mode_switch": 0.0, "loss_total": 0.26825907826423645, "step": 229 }, { "batch_size": 4, "epoch": 0.0916, "step": 229, "tokens_per_device": 4624 }, { "epoch": 0.0916, "loss_ce": 0.5531988143920898, "loss_lvr": 0.9648269414901733, "loss_mode_switch": 0.0, "loss_total": 0.6496815085411072, "step": 229 }, { "batch_size": 4, "epoch": 0.0916, "step": 229, "tokens_per_device": 2080 }, { "epoch": 0.0916, "loss_ce": 0.796996533870697, "loss_lvr": 1.0093663930892944, "loss_mode_switch": 0.0, "loss_total": 0.8979331851005554, "step": 229 }, { "batch_size": 4, "epoch": 0.0916, "step": 229, "tokens_per_device": 5108 }, { "epoch": 0.0916, "loss_ce": 0.09274987876415253, "loss_lvr": 1.12515127658844, "loss_mode_switch": 0.0, "loss_total": 0.20526501536369324, "step": 229 }, { "batch_size": 4, "epoch": 0.0916, "step": 229, "tokens_per_device": 2660 }, { "epoch": 0.0916, "loss_ce": 0.608974039554596, "loss_lvr": 1.1506379842758179, "loss_mode_switch": 0.0, "loss_total": 0.7240378260612488, "step": 229 }, { "batch_size": 4, "epoch": 0.0916, "step": 229, "tokens_per_device": 5176 }, { "epoch": 0.0916, "loss_ce": 0.3148471713066101, "loss_lvr": 0.8684385418891907, "loss_mode_switch": 0.0, "loss_total": 0.4016910195350647, "step": 229 }, { "batch_size": 1, "epoch": 0.0916, "step": 229, "tokens_per_device": 4563 }, { "epoch": 0.0916, "loss_ce": 0.004163578152656555, "loss_lvr": 0.980391263961792, "loss_mode_switch": 0.0, "loss_total": 0.10220270603895187, "step": 229 }, { "batch_size": 4, "epoch": 0.0916, "step": 229, "tokens_per_device": 1908 }, { "epoch": 0.0916, "loss_ce": 0.30168089270591736, "loss_lvr": 1.191331386566162, "loss_mode_switch": 0.0, "loss_total": 0.42081403732299805, "step": 229 }, { "epoch": 0.092, "grad_norm": 1.4485509395599365, "learning_rate": 9.899533822170922e-06, "loss": 0.344, "step": 230 }, { "batch_size": 4, "epoch": 0.092, "step": 230, "tokens_per_device": 2232 }, { "epoch": 0.092, "loss_ce": 0.2861785888671875, "loss_lvr": 0.9575022459030151, "loss_mode_switch": 0.0, "loss_total": 0.38192880153656006, "step": 230 }, { "batch_size": 1, "epoch": 0.092, "step": 230, "tokens_per_device": 5040 }, { "epoch": 0.092, "loss_ce": 0.022843999788165092, "loss_lvr": 1.1389319896697998, "loss_mode_switch": 0.0, "loss_total": 0.13673719763755798, "step": 230 }, { "batch_size": 4, "epoch": 0.092, "step": 230, "tokens_per_device": 1576 }, { "epoch": 0.092, "loss_ce": 0.6015589833259583, "loss_lvr": 1.0668865442276, "loss_mode_switch": 0.0, "loss_total": 0.7082476615905762, "step": 230 }, { "batch_size": 4, "epoch": 0.092, "step": 230, "tokens_per_device": 2732 }, { "epoch": 0.092, "loss_ce": 0.6912307143211365, "loss_lvr": 0.7834580540657043, "loss_mode_switch": 0.0, "loss_total": 0.7695765495300293, "step": 230 }, { "batch_size": 1, "epoch": 0.092, "step": 230, "tokens_per_device": 5109 }, { "epoch": 0.092, "loss_ce": 0.18144509196281433, "loss_lvr": 0.7917483448982239, "loss_mode_switch": 0.0, "loss_total": 0.2606199383735657, "step": 230 }, { "batch_size": 4, "epoch": 0.092, "step": 230, "tokens_per_device": 2652 }, { "epoch": 0.092, "loss_ce": 0.23884838819503784, "loss_lvr": 1.4445013999938965, "loss_mode_switch": 0.0, "loss_total": 0.38329851627349854, "step": 230 }, { "batch_size": 4, "epoch": 0.092, "step": 230, "tokens_per_device": 5784 }, { "epoch": 0.092, "loss_ce": 0.013376050628721714, "loss_lvr": 0.7934259176254272, "loss_mode_switch": 0.0, "loss_total": 0.09271864593029022, "step": 230 }, { "batch_size": 1, "epoch": 0.092, "step": 230, "tokens_per_device": 5098 }, { "epoch": 0.092, "loss_ce": 0.021692434325814247, "loss_lvr": 1.759217619895935, "loss_mode_switch": 0.0, "loss_total": 0.19761420786380768, "step": 230 }, { "epoch": 0.0924, "grad_norm": 1.8709032535552979, "learning_rate": 9.898237732064472e-06, "loss": 0.3815, "step": 231 }, { "batch_size": 4, "epoch": 0.0924, "step": 231, "tokens_per_device": 4524 }, { "epoch": 0.0924, "loss_ce": 0.2059347927570343, "loss_lvr": 0.844758927822113, "loss_mode_switch": 0.0, "loss_total": 0.29041069746017456, "step": 231 }, { "batch_size": 1, "epoch": 0.0924, "step": 231, "tokens_per_device": 4419 }, { "epoch": 0.0924, "loss_ce": 0.31838154792785645, "loss_lvr": 0.8191235065460205, "loss_mode_switch": 0.0, "loss_total": 0.40029388666152954, "step": 231 }, { "batch_size": 4, "epoch": 0.0924, "step": 231, "tokens_per_device": 15392 }, { "epoch": 0.0924, "loss_ce": 0.8495973944664001, "loss_lvr": 1.154293417930603, "loss_mode_switch": 0.0, "loss_total": 0.9650267362594604, "step": 231 }, { "batch_size": 1, "epoch": 0.0924, "step": 231, "tokens_per_device": 5139 }, { "epoch": 0.0924, "loss_ce": 0.11601213365793228, "loss_lvr": 0.5114790797233582, "loss_mode_switch": 0.0, "loss_total": 0.1671600341796875, "step": 231 }, { "batch_size": 4, "epoch": 0.0924, "step": 231, "tokens_per_device": 10984 }, { "epoch": 0.0924, "loss_ce": 0.27120456099510193, "loss_lvr": 1.0518271923065186, "loss_mode_switch": 0.0, "loss_total": 0.3763872981071472, "step": 231 }, { "batch_size": 4, "epoch": 0.0924, "step": 231, "tokens_per_device": 5768 }, { "epoch": 0.0924, "loss_ce": 0.19242824614048004, "loss_lvr": 1.3384302854537964, "loss_mode_switch": 0.0, "loss_total": 0.32627129554748535, "step": 231 }, { "batch_size": 4, "epoch": 0.0924, "step": 231, "tokens_per_device": 11716 }, { "epoch": 0.0924, "loss_ce": 0.6543384790420532, "loss_lvr": 1.0890557765960693, "loss_mode_switch": 0.0, "loss_total": 0.7632440328598022, "step": 231 }, { "batch_size": 4, "epoch": 0.0924, "step": 231, "tokens_per_device": 4324 }, { "epoch": 0.0924, "loss_ce": 0.5930390954017639, "loss_lvr": 1.0842082500457764, "loss_mode_switch": 0.0, "loss_total": 0.7014599442481995, "step": 231 }, { "epoch": 0.0928, "grad_norm": 1.9617948532104492, "learning_rate": 9.896933421120623e-06, "loss": 0.4208, "step": 232 }, { "batch_size": 1, "epoch": 0.0928, "step": 232, "tokens_per_device": 4895 }, { "epoch": 0.0928, "loss_ce": 0.584847629070282, "loss_lvr": 0.35167989134788513, "loss_mode_switch": 0.0, "loss_total": 0.6200156211853027, "step": 232 }, { "batch_size": 4, "epoch": 0.0928, "step": 232, "tokens_per_device": 1408 }, { "epoch": 0.0928, "loss_ce": 0.48663002252578735, "loss_lvr": 1.2215352058410645, "loss_mode_switch": 0.0, "loss_total": 0.6087835431098938, "step": 232 }, { "batch_size": 4, "epoch": 0.0928, "step": 232, "tokens_per_device": 6628 }, { "epoch": 0.0928, "loss_ce": 0.18181297183036804, "loss_lvr": 0.8544291853904724, "loss_mode_switch": 0.0, "loss_total": 0.26725590229034424, "step": 232 }, { "batch_size": 4, "epoch": 0.0928, "step": 232, "tokens_per_device": 3512 }, { "epoch": 0.0928, "loss_ce": 0.2975587844848633, "loss_lvr": 0.688092827796936, "loss_mode_switch": 0.0, "loss_total": 0.36636805534362793, "step": 232 }, { "batch_size": 1, "epoch": 0.0928, "step": 232, "tokens_per_device": 5095 }, { "epoch": 0.0928, "loss_ce": 0.09485778212547302, "loss_lvr": 1.0709514617919922, "loss_mode_switch": 0.0, "loss_total": 0.20195293426513672, "step": 232 }, { "batch_size": 4, "epoch": 0.0928, "step": 232, "tokens_per_device": 14972 }, { "epoch": 0.0928, "loss_ce": 0.1342865526676178, "loss_lvr": 1.2604049444198608, "loss_mode_switch": 0.0, "loss_total": 0.2603270411491394, "step": 232 }, { "batch_size": 1, "epoch": 0.0928, "step": 232, "tokens_per_device": 5191 }, { "epoch": 0.0928, "loss_ce": 0.4604862630367279, "loss_lvr": 0.9976256489753723, "loss_mode_switch": 0.0, "loss_total": 0.560248851776123, "step": 232 }, { "batch_size": 4, "epoch": 0.0928, "step": 232, "tokens_per_device": 5752 }, { "epoch": 0.0928, "loss_ce": 0.3081114590167999, "loss_lvr": 1.0710334777832031, "loss_mode_switch": 0.0, "loss_total": 0.41521480679512024, "step": 232 }, { "epoch": 0.0932, "grad_norm": 2.183842182159424, "learning_rate": 9.895620891528437e-06, "loss": 0.3467, "step": 233 }, { "batch_size": 4, "epoch": 0.0932, "step": 233, "tokens_per_device": 4268 }, { "epoch": 0.0932, "loss_ce": 0.22434450685977936, "loss_lvr": 1.1793488264083862, "loss_mode_switch": 0.0, "loss_total": 0.3422793745994568, "step": 233 }, { "batch_size": 1, "epoch": 0.0932, "step": 233, "tokens_per_device": 4879 }, { "epoch": 0.0932, "loss_ce": 0.13770513236522675, "loss_lvr": 0.6742678284645081, "loss_mode_switch": 0.0, "loss_total": 0.2051319181919098, "step": 233 }, { "batch_size": 4, "epoch": 0.0932, "step": 233, "tokens_per_device": 3832 }, { "epoch": 0.0932, "loss_ce": 0.10755672305822372, "loss_lvr": 0.9096929430961609, "loss_mode_switch": 0.0, "loss_total": 0.1985260248184204, "step": 233 }, { "batch_size": 4, "epoch": 0.0932, "step": 233, "tokens_per_device": 5136 }, { "epoch": 0.0932, "loss_ce": 0.22369317710399628, "loss_lvr": 1.1663923263549805, "loss_mode_switch": 0.0, "loss_total": 0.34033241868019104, "step": 233 }, { "batch_size": 4, "epoch": 0.0932, "step": 233, "tokens_per_device": 3852 }, { "epoch": 0.0932, "loss_ce": 0.3773188889026642, "loss_lvr": 1.3743408918380737, "loss_mode_switch": 0.0, "loss_total": 0.514752984046936, "step": 233 }, { "batch_size": 4, "epoch": 0.0932, "step": 233, "tokens_per_device": 6136 }, { "epoch": 0.0932, "loss_ce": 0.40424802899360657, "loss_lvr": 0.9254868030548096, "loss_mode_switch": 0.0, "loss_total": 0.49679672718048096, "step": 233 }, { "batch_size": 4, "epoch": 0.0932, "step": 233, "tokens_per_device": 3796 }, { "epoch": 0.0932, "loss_ce": 0.08166394382715225, "loss_lvr": 1.3719395399093628, "loss_mode_switch": 0.0, "loss_total": 0.21885791420936584, "step": 233 }, { "batch_size": 4, "epoch": 0.0932, "step": 233, "tokens_per_device": 10280 }, { "epoch": 0.0932, "loss_ce": 0.0566178523004055, "loss_lvr": 0.8212128281593323, "loss_mode_switch": 0.0, "loss_total": 0.13873913884162903, "step": 233 }, { "epoch": 0.0936, "grad_norm": 2.4695074558258057, "learning_rate": 9.894300145490763e-06, "loss": 0.3515, "step": 234 }, { "batch_size": 1, "epoch": 0.0936, "step": 234, "tokens_per_device": 4693 }, { "epoch": 0.0936, "loss_ce": 0.19007471203804016, "loss_lvr": 0.6176976561546326, "loss_mode_switch": 0.0, "loss_total": 0.25184446573257446, "step": 234 }, { "batch_size": 4, "epoch": 0.0936, "step": 234, "tokens_per_device": 3556 }, { "epoch": 0.0936, "loss_ce": 0.26821309328079224, "loss_lvr": 1.5505869388580322, "loss_mode_switch": 0.0, "loss_total": 0.4232717752456665, "step": 234 }, { "batch_size": 4, "epoch": 0.0936, "step": 234, "tokens_per_device": 4416 }, { "epoch": 0.0936, "loss_ce": 0.12986139953136444, "loss_lvr": 1.369930386543274, "loss_mode_switch": 0.0, "loss_total": 0.2668544352054596, "step": 234 }, { "batch_size": 1, "epoch": 0.0936, "step": 234, "tokens_per_device": 6476 }, { "epoch": 0.0936, "loss_ce": 0.056309428066015244, "loss_lvr": 0.6557325720787048, "loss_mode_switch": 0.0, "loss_total": 0.12188269197940826, "step": 234 }, { "batch_size": 1, "epoch": 0.0936, "step": 234, "tokens_per_device": 4882 }, { "epoch": 0.0936, "loss_ce": 0.010476280935108662, "loss_lvr": 0.673949122428894, "loss_mode_switch": 0.0, "loss_total": 0.07787119597196579, "step": 234 }, { "batch_size": 4, "epoch": 0.0936, "step": 234, "tokens_per_device": 3788 }, { "epoch": 0.0936, "loss_ce": 0.4021091163158417, "loss_lvr": 1.3675729036331177, "loss_mode_switch": 0.0, "loss_total": 0.538866400718689, "step": 234 }, { "batch_size": 4, "epoch": 0.0936, "step": 234, "tokens_per_device": 4164 }, { "epoch": 0.0936, "loss_ce": 0.19378599524497986, "loss_lvr": 1.272355079650879, "loss_mode_switch": 0.0, "loss_total": 0.32102149724960327, "step": 234 }, { "batch_size": 1, "epoch": 0.0936, "step": 234, "tokens_per_device": 4858 }, { "epoch": 0.0936, "loss_ce": 0.00809573382139206, "loss_lvr": 1.276192307472229, "loss_mode_switch": 0.0, "loss_total": 0.13571497797966003, "step": 234 }, { "epoch": 0.094, "grad_norm": 1.40639328956604, "learning_rate": 9.892971185224244e-06, "loss": 0.3222, "step": 235 }, { "batch_size": 4, "epoch": 0.094, "step": 235, "tokens_per_device": 5980 }, { "epoch": 0.094, "loss_ce": 0.43147918581962585, "loss_lvr": 1.2337367534637451, "loss_mode_switch": 0.0, "loss_total": 0.5548528432846069, "step": 235 }, { "batch_size": 1, "epoch": 0.094, "step": 235, "tokens_per_device": 5116 }, { "epoch": 0.094, "loss_ce": 0.16510814428329468, "loss_lvr": 0.6135387420654297, "loss_mode_switch": 0.0, "loss_total": 0.22646202147006989, "step": 235 }, { "batch_size": 4, "epoch": 0.094, "step": 235, "tokens_per_device": 6504 }, { "epoch": 0.094, "loss_ce": 0.09697767347097397, "loss_lvr": 0.8682519197463989, "loss_mode_switch": 0.0, "loss_total": 0.18380287289619446, "step": 235 }, { "batch_size": 4, "epoch": 0.094, "step": 235, "tokens_per_device": 4032 }, { "epoch": 0.094, "loss_ce": 0.18006616830825806, "loss_lvr": 1.1329811811447144, "loss_mode_switch": 0.0, "loss_total": 0.2933642864227295, "step": 235 }, { "batch_size": 4, "epoch": 0.094, "step": 235, "tokens_per_device": 4388 }, { "epoch": 0.094, "loss_ce": 0.0920565128326416, "loss_lvr": 0.9203843474388123, "loss_mode_switch": 0.0, "loss_total": 0.18409495055675507, "step": 235 }, { "batch_size": 4, "epoch": 0.094, "step": 235, "tokens_per_device": 1652 }, { "epoch": 0.094, "loss_ce": 0.5308932662010193, "loss_lvr": 1.1097017526626587, "loss_mode_switch": 0.0, "loss_total": 0.6418634653091431, "step": 235 }, { "batch_size": 4, "epoch": 0.094, "step": 235, "tokens_per_device": 5220 }, { "epoch": 0.094, "loss_ce": 0.27925172448158264, "loss_lvr": 1.0302296876907349, "loss_mode_switch": 0.0, "loss_total": 0.38227468729019165, "step": 235 }, { "batch_size": 1, "epoch": 0.094, "step": 235, "tokens_per_device": 8004 }, { "epoch": 0.094, "loss_ce": 0.016487564891576767, "loss_lvr": 0.6744285225868225, "loss_mode_switch": 0.0, "loss_total": 0.08393041789531708, "step": 235 }, { "epoch": 0.0944, "grad_norm": 1.456020474433899, "learning_rate": 9.891634012959311e-06, "loss": 0.3191, "step": 236 }, { "batch_size": 4, "epoch": 0.0944, "step": 236, "tokens_per_device": 1540 }, { "epoch": 0.0944, "loss_ce": 0.16636686027050018, "loss_lvr": 1.2160868644714355, "loss_mode_switch": 0.0, "loss_total": 0.287975549697876, "step": 236 }, { "batch_size": 4, "epoch": 0.0944, "step": 236, "tokens_per_device": 4196 }, { "epoch": 0.0944, "loss_ce": 0.0996120423078537, "loss_lvr": 1.421000361442566, "loss_mode_switch": 0.0, "loss_total": 0.2417120784521103, "step": 236 }, { "batch_size": 1, "epoch": 0.0944, "step": 236, "tokens_per_device": 4898 }, { "epoch": 0.0944, "loss_ce": 0.009703943505883217, "loss_lvr": 0.7947067618370056, "loss_mode_switch": 0.0, "loss_total": 0.08917462080717087, "step": 236 }, { "batch_size": 1, "epoch": 0.0944, "step": 236, "tokens_per_device": 5174 }, { "epoch": 0.0944, "loss_ce": 0.06943361461162567, "loss_lvr": 1.1964143514633179, "loss_mode_switch": 0.0, "loss_total": 0.1890750527381897, "step": 236 }, { "batch_size": 4, "epoch": 0.0944, "step": 236, "tokens_per_device": 3888 }, { "epoch": 0.0944, "loss_ce": 0.3364807069301605, "loss_lvr": 1.1801748275756836, "loss_mode_switch": 0.0, "loss_total": 0.45449820160865784, "step": 236 }, { "batch_size": 4, "epoch": 0.0944, "step": 236, "tokens_per_device": 13956 }, { "epoch": 0.0944, "loss_ce": 0.25277331471443176, "loss_lvr": 1.0146489143371582, "loss_mode_switch": 0.0, "loss_total": 0.35423821210861206, "step": 236 }, { "batch_size": 1, "epoch": 0.0944, "step": 236, "tokens_per_device": 5092 }, { "epoch": 0.0944, "loss_ce": 0.6200579404830933, "loss_lvr": 1.0323975086212158, "loss_mode_switch": 0.0, "loss_total": 0.7232977151870728, "step": 236 }, { "batch_size": 4, "epoch": 0.0944, "step": 236, "tokens_per_device": 3808 }, { "epoch": 0.0944, "loss_ce": 0.4105086922645569, "loss_lvr": 1.0535471439361572, "loss_mode_switch": 0.0, "loss_total": 0.5158634185791016, "step": 236 }, { "epoch": 0.0948, "grad_norm": 1.7360237836837769, "learning_rate": 9.890288630940168e-06, "loss": 0.3399, "step": 237 }, { "batch_size": 4, "epoch": 0.0948, "step": 237, "tokens_per_device": 14620 }, { "epoch": 0.0948, "loss_ce": 0.08033385127782822, "loss_lvr": 0.8845310211181641, "loss_mode_switch": 0.0, "loss_total": 0.16878695785999298, "step": 237 }, { "batch_size": 1, "epoch": 0.0948, "step": 237, "tokens_per_device": 4517 }, { "epoch": 0.0948, "loss_ce": 0.12320013344287872, "loss_lvr": 0.7815015316009521, "loss_mode_switch": 0.0, "loss_total": 0.20135028660297394, "step": 237 }, { "batch_size": 1, "epoch": 0.0948, "step": 237, "tokens_per_device": 5153 }, { "epoch": 0.0948, "loss_ce": 0.13439951837062836, "loss_lvr": 0.3459164798259735, "loss_mode_switch": 0.0, "loss_total": 0.16899116337299347, "step": 237 }, { "batch_size": 4, "epoch": 0.0948, "step": 237, "tokens_per_device": 1584 }, { "epoch": 0.0948, "loss_ce": 0.26368242502212524, "loss_lvr": 2.4494121074676514, "loss_mode_switch": 0.0, "loss_total": 0.5086236596107483, "step": 237 }, { "batch_size": 1, "epoch": 0.0948, "step": 237, "tokens_per_device": 5189 }, { "epoch": 0.0948, "loss_ce": 0.0363735631108284, "loss_lvr": 1.0466644763946533, "loss_mode_switch": 0.0, "loss_total": 0.14104001224040985, "step": 237 }, { "batch_size": 1, "epoch": 0.0948, "step": 237, "tokens_per_device": 4903 }, { "epoch": 0.0948, "loss_ce": 0.1294657588005066, "loss_lvr": 0.3563291132450104, "loss_mode_switch": 0.0, "loss_total": 0.1650986671447754, "step": 237 }, { "batch_size": 4, "epoch": 0.0948, "step": 237, "tokens_per_device": 4396 }, { "epoch": 0.0948, "loss_ce": 0.15901969373226166, "loss_lvr": 0.8890385627746582, "loss_mode_switch": 0.0, "loss_total": 0.24792355298995972, "step": 237 }, { "batch_size": 4, "epoch": 0.0948, "step": 237, "tokens_per_device": 1376 }, { "epoch": 0.0948, "loss_ce": 0.5474459528923035, "loss_lvr": 1.405591607093811, "loss_mode_switch": 0.0, "loss_total": 0.6880050897598267, "step": 237 }, { "epoch": 0.0952, "grad_norm": 1.557005524635315, "learning_rate": 9.88893504142481e-06, "loss": 0.3386, "step": 238 }, { "batch_size": 4, "epoch": 0.0952, "step": 238, "tokens_per_device": 4208 }, { "epoch": 0.0952, "loss_ce": 0.07465054839849472, "loss_lvr": 0.7821645736694336, "loss_mode_switch": 0.0, "loss_total": 0.15286700427532196, "step": 238 }, { "batch_size": 4, "epoch": 0.0952, "step": 238, "tokens_per_device": 4828 }, { "epoch": 0.0952, "loss_ce": 0.19545042514801025, "loss_lvr": 1.12752103805542, "loss_mode_switch": 0.0, "loss_total": 0.3082025349140167, "step": 238 }, { "batch_size": 4, "epoch": 0.0952, "step": 238, "tokens_per_device": 2676 }, { "epoch": 0.0952, "loss_ce": 0.14934201538562775, "loss_lvr": 1.327879548072815, "loss_mode_switch": 0.0, "loss_total": 0.2821299731731415, "step": 238 }, { "batch_size": 4, "epoch": 0.0952, "step": 238, "tokens_per_device": 1528 }, { "epoch": 0.0952, "loss_ce": 0.4654003083705902, "loss_lvr": 1.35101318359375, "loss_mode_switch": 0.0, "loss_total": 0.6005016565322876, "step": 238 }, { "batch_size": 4, "epoch": 0.0952, "step": 238, "tokens_per_device": 4268 }, { "epoch": 0.0952, "loss_ce": 0.24131619930267334, "loss_lvr": 1.3193928003311157, "loss_mode_switch": 0.0, "loss_total": 0.37325549125671387, "step": 238 }, { "batch_size": 1, "epoch": 0.0952, "step": 238, "tokens_per_device": 6748 }, { "epoch": 0.0952, "loss_ce": 0.10719835013151169, "loss_lvr": 0.5307987332344055, "loss_mode_switch": 0.0, "loss_total": 0.16027823090553284, "step": 238 }, { "batch_size": 1, "epoch": 0.0952, "step": 238, "tokens_per_device": 4880 }, { "epoch": 0.0952, "loss_ce": 0.040248073637485504, "loss_lvr": 0.4101303219795227, "loss_mode_switch": 0.0, "loss_total": 0.08126110583543777, "step": 238 }, { "batch_size": 4, "epoch": 0.0952, "step": 238, "tokens_per_device": 8884 }, { "epoch": 0.0952, "loss_ce": 0.013537362217903137, "loss_lvr": 1.15168035030365, "loss_mode_switch": 0.0, "loss_total": 0.12870539724826813, "step": 238 }, { "epoch": 0.0956, "grad_norm": 1.5463796854019165, "learning_rate": 9.887573246684998e-06, "loss": 0.3619, "step": 239 }, { "batch_size": 1, "epoch": 0.0956, "step": 239, "tokens_per_device": 4869 }, { "epoch": 0.0956, "loss_ce": 0.09498019516468048, "loss_lvr": 0.44953250885009766, "loss_mode_switch": 0.0, "loss_total": 0.13993345201015472, "step": 239 }, { "batch_size": 1, "epoch": 0.0956, "step": 239, "tokens_per_device": 5150 }, { "epoch": 0.0956, "loss_ce": 0.004024895373731852, "loss_lvr": 0.5166605710983276, "loss_mode_switch": 0.0, "loss_total": 0.0556909553706646, "step": 239 }, { "batch_size": 1, "epoch": 0.0956, "step": 239, "tokens_per_device": 4802 }, { "epoch": 0.0956, "loss_ce": 0.046980421990156174, "loss_lvr": 0.9465421438217163, "loss_mode_switch": 0.0, "loss_total": 0.14163464307785034, "step": 239 }, { "batch_size": 4, "epoch": 0.0956, "step": 239, "tokens_per_device": 2928 }, { "epoch": 0.0956, "loss_ce": 0.01859426125884056, "loss_lvr": 0.8774659633636475, "loss_mode_switch": 0.0, "loss_total": 0.10634085536003113, "step": 239 }, { "batch_size": 4, "epoch": 0.0956, "step": 239, "tokens_per_device": 4616 }, { "epoch": 0.0956, "loss_ce": 0.034998293966054916, "loss_lvr": 1.0126241445541382, "loss_mode_switch": 0.0, "loss_total": 0.13626070320606232, "step": 239 }, { "batch_size": 4, "epoch": 0.0956, "step": 239, "tokens_per_device": 3688 }, { "epoch": 0.0956, "loss_ce": 0.33445489406585693, "loss_lvr": 1.411395788192749, "loss_mode_switch": 0.0, "loss_total": 0.4755944609642029, "step": 239 }, { "batch_size": 4, "epoch": 0.0956, "step": 239, "tokens_per_device": 3300 }, { "epoch": 0.0956, "loss_ce": 0.09944818913936615, "loss_lvr": 1.0219523906707764, "loss_mode_switch": 0.0, "loss_total": 0.2016434371471405, "step": 239 }, { "batch_size": 4, "epoch": 0.0956, "step": 239, "tokens_per_device": 3604 }, { "epoch": 0.0956, "loss_ce": 0.11303798109292984, "loss_lvr": 0.7198531627655029, "loss_mode_switch": 0.0, "loss_total": 0.18502330780029297, "step": 239 }, { "epoch": 0.096, "grad_norm": 1.6130980253219604, "learning_rate": 9.886203249006265e-06, "loss": 0.2785, "step": 240 }, { "batch_size": 4, "epoch": 0.096, "step": 240, "tokens_per_device": 4992 }, { "epoch": 0.096, "loss_ce": 0.04044497385621071, "loss_lvr": 0.9030840396881104, "loss_mode_switch": 0.0, "loss_total": 0.13075338304042816, "step": 240 }, { "batch_size": 1, "epoch": 0.096, "step": 240, "tokens_per_device": 4316 }, { "epoch": 0.096, "loss_ce": 0.07028406113386154, "loss_lvr": 1.068235993385315, "loss_mode_switch": 0.0, "loss_total": 0.17710766196250916, "step": 240 }, { "batch_size": 1, "epoch": 0.096, "step": 240, "tokens_per_device": 6716 }, { "epoch": 0.096, "loss_ce": 0.1989373415708542, "loss_lvr": 0.8059281706809998, "loss_mode_switch": 0.0, "loss_total": 0.2795301675796509, "step": 240 }, { "batch_size": 4, "epoch": 0.096, "step": 240, "tokens_per_device": 3876 }, { "epoch": 0.096, "loss_ce": 0.9049807190895081, "loss_lvr": 2.7318592071533203, "loss_mode_switch": 0.0, "loss_total": 1.1781666278839111, "step": 240 }, { "batch_size": 4, "epoch": 0.096, "step": 240, "tokens_per_device": 3468 }, { "epoch": 0.096, "loss_ce": 0.3614738881587982, "loss_lvr": 0.5581212639808655, "loss_mode_switch": 0.0, "loss_total": 0.4172860085964203, "step": 240 }, { "batch_size": 4, "epoch": 0.096, "step": 240, "tokens_per_device": 3724 }, { "epoch": 0.096, "loss_ce": 0.49546775221824646, "loss_lvr": 0.8792060613632202, "loss_mode_switch": 0.0, "loss_total": 0.5833883285522461, "step": 240 }, { "batch_size": 1, "epoch": 0.096, "step": 240, "tokens_per_device": 4917 }, { "epoch": 0.096, "loss_ce": 0.7181178331375122, "loss_lvr": 0.7384029626846313, "loss_mode_switch": 0.0, "loss_total": 0.7919581532478333, "step": 240 }, { "batch_size": 4, "epoch": 0.096, "step": 240, "tokens_per_device": 5880 }, { "epoch": 0.096, "loss_ce": 0.7230575680732727, "loss_lvr": 0.9267601370811462, "loss_mode_switch": 0.0, "loss_total": 0.8157335519790649, "step": 240 }, { "epoch": 0.0964, "grad_norm": 1.823134183883667, "learning_rate": 9.884825050687918e-06, "loss": 0.408, "step": 241 }, { "batch_size": 4, "epoch": 0.0964, "step": 241, "tokens_per_device": 4256 }, { "epoch": 0.0964, "loss_ce": 0.2847212255001068, "loss_lvr": 1.1327742338180542, "loss_mode_switch": 0.0, "loss_total": 0.3979986608028412, "step": 241 }, { "batch_size": 1, "epoch": 0.0964, "step": 241, "tokens_per_device": 5142 }, { "epoch": 0.0964, "loss_ce": 0.18938420712947845, "loss_lvr": 0.7477266192436218, "loss_mode_switch": 0.0, "loss_total": 0.26415687799453735, "step": 241 }, { "batch_size": 4, "epoch": 0.0964, "step": 241, "tokens_per_device": 1820 }, { "epoch": 0.0964, "loss_ce": 0.367565393447876, "loss_lvr": 1.1579474210739136, "loss_mode_switch": 0.0, "loss_total": 0.4833601415157318, "step": 241 }, { "batch_size": 4, "epoch": 0.0964, "step": 241, "tokens_per_device": 5884 }, { "epoch": 0.0964, "loss_ce": 0.21392059326171875, "loss_lvr": 0.8493403196334839, "loss_mode_switch": 0.0, "loss_total": 0.29885461926460266, "step": 241 }, { "batch_size": 1, "epoch": 0.0964, "step": 241, "tokens_per_device": 5106 }, { "epoch": 0.0964, "loss_ce": 0.0545608289539814, "loss_lvr": 0.5037497282028198, "loss_mode_switch": 0.0, "loss_total": 0.10493580251932144, "step": 241 }, { "batch_size": 4, "epoch": 0.0964, "step": 241, "tokens_per_device": 4516 }, { "epoch": 0.0964, "loss_ce": 0.1774756759405136, "loss_lvr": 1.236660122871399, "loss_mode_switch": 0.0, "loss_total": 0.3011416792869568, "step": 241 }, { "batch_size": 4, "epoch": 0.0964, "step": 241, "tokens_per_device": 1572 }, { "epoch": 0.0964, "loss_ce": 0.47818440198898315, "loss_lvr": 1.0719410181045532, "loss_mode_switch": 0.0, "loss_total": 0.5853785276412964, "step": 241 }, { "batch_size": 4, "epoch": 0.0964, "step": 241, "tokens_per_device": 2664 }, { "epoch": 0.0964, "loss_ce": 0.6730211973190308, "loss_lvr": 1.305716633796692, "loss_mode_switch": 0.0, "loss_total": 0.8035928606987, "step": 241 }, { "epoch": 0.0968, "grad_norm": 1.4564988613128662, "learning_rate": 9.883438654043019e-06, "loss": 0.3105, "step": 242 }, { "batch_size": 4, "epoch": 0.0968, "step": 242, "tokens_per_device": 9484 }, { "epoch": 0.0968, "loss_ce": 0.17260490357875824, "loss_lvr": 1.1602929830551147, "loss_mode_switch": 0.0, "loss_total": 0.28863421082496643, "step": 242 }, { "batch_size": 4, "epoch": 0.0968, "step": 242, "tokens_per_device": 4300 }, { "epoch": 0.0968, "loss_ce": 0.09054797142744064, "loss_lvr": 1.5362343788146973, "loss_mode_switch": 0.0, "loss_total": 0.2441714107990265, "step": 242 }, { "batch_size": 1, "epoch": 0.0968, "step": 242, "tokens_per_device": 5155 }, { "epoch": 0.0968, "loss_ce": 0.02907971292734146, "loss_lvr": 0.5065604448318481, "loss_mode_switch": 0.0, "loss_total": 0.07973575592041016, "step": 242 }, { "batch_size": 4, "epoch": 0.0968, "step": 242, "tokens_per_device": 4400 }, { "epoch": 0.0968, "loss_ce": 0.5484516024589539, "loss_lvr": 1.08059561252594, "loss_mode_switch": 0.0, "loss_total": 0.6565111875534058, "step": 242 }, { "batch_size": 1, "epoch": 0.0968, "step": 242, "tokens_per_device": 6739 }, { "epoch": 0.0968, "loss_ce": 0.04530397802591324, "loss_lvr": 1.1409823894500732, "loss_mode_switch": 0.0, "loss_total": 0.15940222144126892, "step": 242 }, { "batch_size": 1, "epoch": 0.0968, "step": 242, "tokens_per_device": 4752 }, { "epoch": 0.0968, "loss_ce": 0.01719886250793934, "loss_lvr": 0.9535273909568787, "loss_mode_switch": 0.0, "loss_total": 0.11255159974098206, "step": 242 }, { "batch_size": 4, "epoch": 0.0968, "step": 242, "tokens_per_device": 3500 }, { "epoch": 0.0968, "loss_ce": 0.5126753449440002, "loss_lvr": 1.1421984434127808, "loss_mode_switch": 0.0, "loss_total": 0.6268951892852783, "step": 242 }, { "batch_size": 1, "epoch": 0.0968, "step": 242, "tokens_per_device": 5660 }, { "epoch": 0.0968, "loss_ce": 0.15103428065776825, "loss_lvr": 0.7662175297737122, "loss_mode_switch": 0.0, "loss_total": 0.2276560366153717, "step": 242 }, { "epoch": 0.0972, "grad_norm": 1.4119266271591187, "learning_rate": 9.882044061398393e-06, "loss": 0.3047, "step": 243 }, { "batch_size": 4, "epoch": 0.0972, "step": 243, "tokens_per_device": 5696 }, { "epoch": 0.0972, "loss_ce": 0.31333667039871216, "loss_lvr": 1.1070070266723633, "loss_mode_switch": 0.0, "loss_total": 0.424037367105484, "step": 243 }, { "batch_size": 1, "epoch": 0.0972, "step": 243, "tokens_per_device": 5057 }, { "epoch": 0.0972, "loss_ce": 0.0996432900428772, "loss_lvr": 0.7017344236373901, "loss_mode_switch": 0.0, "loss_total": 0.1698167324066162, "step": 243 }, { "batch_size": 4, "epoch": 0.0972, "step": 243, "tokens_per_device": 12176 }, { "epoch": 0.0972, "loss_ce": 0.0675646960735321, "loss_lvr": 1.1094348430633545, "loss_mode_switch": 0.0, "loss_total": 0.1785081923007965, "step": 243 }, { "batch_size": 4, "epoch": 0.0972, "step": 243, "tokens_per_device": 5016 }, { "epoch": 0.0972, "loss_ce": 0.14028891921043396, "loss_lvr": 1.1122952699661255, "loss_mode_switch": 0.0, "loss_total": 0.25151845812797546, "step": 243 }, { "batch_size": 1, "epoch": 0.0972, "step": 243, "tokens_per_device": 4741 }, { "epoch": 0.0972, "loss_ce": 0.01755361258983612, "loss_lvr": 0.5653206706047058, "loss_mode_switch": 0.0, "loss_total": 0.07408568263053894, "step": 243 }, { "batch_size": 4, "epoch": 0.0972, "step": 243, "tokens_per_device": 3824 }, { "epoch": 0.0972, "loss_ce": 0.13356873393058777, "loss_lvr": 1.2410861253738403, "loss_mode_switch": 0.0, "loss_total": 0.2576773464679718, "step": 243 }, { "batch_size": 1, "epoch": 0.0972, "step": 243, "tokens_per_device": 5107 }, { "epoch": 0.0972, "loss_ce": 0.015731101855635643, "loss_lvr": 0.5287289023399353, "loss_mode_switch": 0.0, "loss_total": 0.0686039924621582, "step": 243 }, { "batch_size": 4, "epoch": 0.0972, "step": 243, "tokens_per_device": 4992 }, { "epoch": 0.0972, "loss_ce": 0.22619512677192688, "loss_lvr": 1.1122257709503174, "loss_mode_switch": 0.0, "loss_total": 0.33741772174835205, "step": 243 }, { "epoch": 0.0976, "grad_norm": 1.839645266532898, "learning_rate": 9.88064127509462e-06, "loss": 0.3528, "step": 244 }, { "batch_size": 4, "epoch": 0.0976, "step": 244, "tokens_per_device": 5028 }, { "epoch": 0.0976, "loss_ce": 0.7510149478912354, "loss_lvr": 0.8969478607177734, "loss_mode_switch": 0.0, "loss_total": 0.8407097458839417, "step": 244 }, { "batch_size": 4, "epoch": 0.0976, "step": 244, "tokens_per_device": 12284 }, { "epoch": 0.0976, "loss_ce": 0.19704140722751617, "loss_lvr": 0.9483231902122498, "loss_mode_switch": 0.0, "loss_total": 0.2918737232685089, "step": 244 }, { "batch_size": 4, "epoch": 0.0976, "step": 244, "tokens_per_device": 10836 }, { "epoch": 0.0976, "loss_ce": 0.10430034250020981, "loss_lvr": 1.0627466440200806, "loss_mode_switch": 0.0, "loss_total": 0.21057501435279846, "step": 244 }, { "batch_size": 4, "epoch": 0.0976, "step": 244, "tokens_per_device": 4460 }, { "epoch": 0.0976, "loss_ce": 0.6344356536865234, "loss_lvr": 1.0343999862670898, "loss_mode_switch": 0.0, "loss_total": 0.7378756403923035, "step": 244 }, { "batch_size": 4, "epoch": 0.0976, "step": 244, "tokens_per_device": 4680 }, { "epoch": 0.0976, "loss_ce": 0.16351260244846344, "loss_lvr": 0.5580013394355774, "loss_mode_switch": 0.0, "loss_total": 0.21931274235248566, "step": 244 }, { "batch_size": 1, "epoch": 0.0976, "step": 244, "tokens_per_device": 5218 }, { "epoch": 0.0976, "loss_ce": 0.15942861139774323, "loss_lvr": 0.7377835512161255, "loss_mode_switch": 0.0, "loss_total": 0.23320695757865906, "step": 244 }, { "batch_size": 4, "epoch": 0.0976, "step": 244, "tokens_per_device": 2472 }, { "epoch": 0.0976, "loss_ce": 0.5060690641403198, "loss_lvr": 1.0208930969238281, "loss_mode_switch": 0.0, "loss_total": 0.6081583499908447, "step": 244 }, { "batch_size": 1, "epoch": 0.0976, "step": 244, "tokens_per_device": 4749 }, { "epoch": 0.0976, "loss_ce": 0.004496368113905191, "loss_lvr": 0.6850696802139282, "loss_mode_switch": 0.0, "loss_total": 0.07300333678722382, "step": 244 }, { "epoch": 0.098, "grad_norm": 2.070295572280884, "learning_rate": 9.879230297486034e-06, "loss": 0.4276, "step": 245 }, { "batch_size": 1, "epoch": 0.098, "step": 245, "tokens_per_device": 5482 }, { "epoch": 0.098, "loss_ce": 1.357852816581726, "loss_lvr": 0.7340582609176636, "loss_mode_switch": 0.0, "loss_total": 1.4312586784362793, "step": 245 }, { "batch_size": 1, "epoch": 0.098, "step": 245, "tokens_per_device": 5095 }, { "epoch": 0.098, "loss_ce": 0.030123773962259293, "loss_lvr": 0.489874005317688, "loss_mode_switch": 0.0, "loss_total": 0.07911117374897003, "step": 245 }, { "batch_size": 4, "epoch": 0.098, "step": 245, "tokens_per_device": 5352 }, { "epoch": 0.098, "loss_ce": 0.07257174700498581, "loss_lvr": 0.8475334644317627, "loss_mode_switch": 0.0, "loss_total": 0.15732508897781372, "step": 245 }, { "batch_size": 4, "epoch": 0.098, "step": 245, "tokens_per_device": 11108 }, { "epoch": 0.098, "loss_ce": 0.11940966546535492, "loss_lvr": 0.8578580021858215, "loss_mode_switch": 0.0, "loss_total": 0.20519545674324036, "step": 245 }, { "batch_size": 4, "epoch": 0.098, "step": 245, "tokens_per_device": 1680 }, { "epoch": 0.098, "loss_ce": 0.2536197304725647, "loss_lvr": 1.0078977346420288, "loss_mode_switch": 0.0, "loss_total": 0.35440951585769653, "step": 245 }, { "batch_size": 1, "epoch": 0.098, "step": 245, "tokens_per_device": 4501 }, { "epoch": 0.098, "loss_ce": 0.14517441391944885, "loss_lvr": 0.5931087136268616, "loss_mode_switch": 0.0, "loss_total": 0.20448528230190277, "step": 245 }, { "batch_size": 4, "epoch": 0.098, "step": 245, "tokens_per_device": 4760 }, { "epoch": 0.098, "loss_ce": 0.7424693703651428, "loss_lvr": 1.1465624570846558, "loss_mode_switch": 0.0, "loss_total": 0.8571256399154663, "step": 245 }, { "batch_size": 4, "epoch": 0.098, "step": 245, "tokens_per_device": 10476 }, { "epoch": 0.098, "loss_ce": 0.21040941774845123, "loss_lvr": 1.4941909313201904, "loss_mode_switch": 0.0, "loss_total": 0.35982853174209595, "step": 245 }, { "epoch": 0.0984, "grad_norm": 1.6698640584945679, "learning_rate": 9.877811130940715e-06, "loss": 0.3886, "step": 246 }, { "batch_size": 1, "epoch": 0.0984, "step": 246, "tokens_per_device": 5150 }, { "epoch": 0.0984, "loss_ce": 0.009105149656534195, "loss_lvr": 0.5140849351882935, "loss_mode_switch": 0.0, "loss_total": 0.06051364541053772, "step": 246 }, { "batch_size": 4, "epoch": 0.0984, "step": 246, "tokens_per_device": 3796 }, { "epoch": 0.0984, "loss_ce": 0.03674639016389847, "loss_lvr": 1.1807805299758911, "loss_mode_switch": 0.0, "loss_total": 0.15482443571090698, "step": 246 }, { "batch_size": 4, "epoch": 0.0984, "step": 246, "tokens_per_device": 7296 }, { "epoch": 0.0984, "loss_ce": 0.2358182668685913, "loss_lvr": 0.9014924764633179, "loss_mode_switch": 0.0, "loss_total": 0.3259675204753876, "step": 246 }, { "batch_size": 4, "epoch": 0.0984, "step": 246, "tokens_per_device": 4580 }, { "epoch": 0.0984, "loss_ce": 0.026179064065217972, "loss_lvr": 0.897336483001709, "loss_mode_switch": 0.0, "loss_total": 0.11591272056102753, "step": 246 }, { "batch_size": 4, "epoch": 0.0984, "step": 246, "tokens_per_device": 1680 }, { "epoch": 0.0984, "loss_ce": 0.18124566972255707, "loss_lvr": 1.0355809926986694, "loss_mode_switch": 0.0, "loss_total": 0.2848037779331207, "step": 246 }, { "batch_size": 4, "epoch": 0.0984, "step": 246, "tokens_per_device": 1632 }, { "epoch": 0.0984, "loss_ce": 0.4740220606327057, "loss_lvr": 1.0371618270874023, "loss_mode_switch": 0.0, "loss_total": 0.5777382254600525, "step": 246 }, { "batch_size": 4, "epoch": 0.0984, "step": 246, "tokens_per_device": 3924 }, { "epoch": 0.0984, "loss_ce": 0.3781163692474365, "loss_lvr": 1.264804720878601, "loss_mode_switch": 0.0, "loss_total": 0.5045968294143677, "step": 246 }, { "batch_size": 4, "epoch": 0.0984, "step": 246, "tokens_per_device": 1216 }, { "epoch": 0.0984, "loss_ce": 0.4185786545276642, "loss_lvr": 1.293737530708313, "loss_mode_switch": 0.0, "loss_total": 0.54795241355896, "step": 246 }, { "epoch": 0.0988, "grad_norm": 1.5453389883041382, "learning_rate": 9.876383777840484e-06, "loss": 0.3392, "step": 247 }, { "batch_size": 4, "epoch": 0.0988, "step": 247, "tokens_per_device": 2732 }, { "epoch": 0.0988, "loss_ce": 0.09200112521648407, "loss_lvr": 1.0778896808624268, "loss_mode_switch": 0.0, "loss_total": 0.1997900903224945, "step": 247 }, { "batch_size": 4, "epoch": 0.0988, "step": 247, "tokens_per_device": 3188 }, { "epoch": 0.0988, "loss_ce": 0.1470472812652588, "loss_lvr": 0.8389691710472107, "loss_mode_switch": 0.0, "loss_total": 0.2309442013502121, "step": 247 }, { "batch_size": 4, "epoch": 0.0988, "step": 247, "tokens_per_device": 5300 }, { "epoch": 0.0988, "loss_ce": 0.12888310849666595, "loss_lvr": 0.7634463906288147, "loss_mode_switch": 0.0, "loss_total": 0.20522774755954742, "step": 247 }, { "batch_size": 4, "epoch": 0.0988, "step": 247, "tokens_per_device": 4824 }, { "epoch": 0.0988, "loss_ce": 0.2974688708782196, "loss_lvr": 0.997083842754364, "loss_mode_switch": 0.0, "loss_total": 0.39717724919319153, "step": 247 }, { "batch_size": 4, "epoch": 0.0988, "step": 247, "tokens_per_device": 4264 }, { "epoch": 0.0988, "loss_ce": 0.29138821363449097, "loss_lvr": 1.1958551406860352, "loss_mode_switch": 0.0, "loss_total": 0.4109737277030945, "step": 247 }, { "batch_size": 4, "epoch": 0.0988, "step": 247, "tokens_per_device": 8856 }, { "epoch": 0.0988, "loss_ce": 0.1780100017786026, "loss_lvr": 1.3907766342163086, "loss_mode_switch": 0.0, "loss_total": 0.31708765029907227, "step": 247 }, { "batch_size": 1, "epoch": 0.0988, "step": 247, "tokens_per_device": 5273 }, { "epoch": 0.0988, "loss_ce": 0.0515102855861187, "loss_lvr": 0.8211578130722046, "loss_mode_switch": 0.0, "loss_total": 0.1336260735988617, "step": 247 }, { "batch_size": 1, "epoch": 0.0988, "step": 247, "tokens_per_device": 5019 }, { "epoch": 0.0988, "loss_ce": 0.010053171776235104, "loss_lvr": 0.379119873046875, "loss_mode_switch": 0.0, "loss_total": 0.0479651615023613, "step": 247 }, { "epoch": 0.0992, "grad_norm": 1.5669984817504883, "learning_rate": 9.874948240580903e-06, "loss": 0.3107, "step": 248 }, { "batch_size": 1, "epoch": 0.0992, "step": 248, "tokens_per_device": 5316 }, { "epoch": 0.0992, "loss_ce": 0.786037802696228, "loss_lvr": 0.8915767073631287, "loss_mode_switch": 0.0, "loss_total": 0.8751955032348633, "step": 248 }, { "batch_size": 1, "epoch": 0.0992, "step": 248, "tokens_per_device": 4870 }, { "epoch": 0.0992, "loss_ce": 0.14335812628269196, "loss_lvr": 0.5777748227119446, "loss_mode_switch": 0.0, "loss_total": 0.20113560557365417, "step": 248 }, { "batch_size": 4, "epoch": 0.0992, "step": 248, "tokens_per_device": 4380 }, { "epoch": 0.0992, "loss_ce": 0.1341608464717865, "loss_lvr": 1.0778627395629883, "loss_mode_switch": 0.0, "loss_total": 0.24194711446762085, "step": 248 }, { "batch_size": 4, "epoch": 0.0992, "step": 248, "tokens_per_device": 1300 }, { "epoch": 0.0992, "loss_ce": 0.3678376376628876, "loss_lvr": 1.4748274087905884, "loss_mode_switch": 0.0, "loss_total": 0.515320360660553, "step": 248 }, { "batch_size": 4, "epoch": 0.0992, "step": 248, "tokens_per_device": 4704 }, { "epoch": 0.0992, "loss_ce": 0.09143286943435669, "loss_lvr": 1.207289457321167, "loss_mode_switch": 0.0, "loss_total": 0.2121618092060089, "step": 248 }, { "batch_size": 4, "epoch": 0.0992, "step": 248, "tokens_per_device": 2724 }, { "epoch": 0.0992, "loss_ce": 0.21351364254951477, "loss_lvr": 1.0954232215881348, "loss_mode_switch": 0.0, "loss_total": 0.3230559825897217, "step": 248 }, { "batch_size": 1, "epoch": 0.0992, "step": 248, "tokens_per_device": 4900 }, { "epoch": 0.0992, "loss_ce": 0.030794154852628708, "loss_lvr": 0.4691072106361389, "loss_mode_switch": 0.0, "loss_total": 0.07770487666130066, "step": 248 }, { "batch_size": 4, "epoch": 0.0992, "step": 248, "tokens_per_device": 12980 }, { "epoch": 0.0992, "loss_ce": 0.06501533836126328, "loss_lvr": 1.0208841562271118, "loss_mode_switch": 0.0, "loss_total": 0.16710375249385834, "step": 248 }, { "epoch": 0.0996, "grad_norm": 1.5860506296157837, "learning_rate": 9.873504521571278e-06, "loss": 0.366, "step": 249 }, { "batch_size": 4, "epoch": 0.0996, "step": 249, "tokens_per_device": 1268 }, { "epoch": 0.0996, "loss_ce": 0.4649094343185425, "loss_lvr": 1.5181764364242554, "loss_mode_switch": 0.0, "loss_total": 0.6167271137237549, "step": 249 }, { "batch_size": 4, "epoch": 0.0996, "step": 249, "tokens_per_device": 3908 }, { "epoch": 0.0996, "loss_ce": 0.044431816786527634, "loss_lvr": 1.2431379556655884, "loss_mode_switch": 0.0, "loss_total": 0.16874560713768005, "step": 249 }, { "batch_size": 4, "epoch": 0.0996, "step": 249, "tokens_per_device": 3780 }, { "epoch": 0.0996, "loss_ce": 0.2746509611606598, "loss_lvr": 1.1402596235275269, "loss_mode_switch": 0.0, "loss_total": 0.3886769413948059, "step": 249 }, { "batch_size": 4, "epoch": 0.0996, "step": 249, "tokens_per_device": 4784 }, { "epoch": 0.0996, "loss_ce": 0.3693148195743561, "loss_lvr": 1.2767903804779053, "loss_mode_switch": 0.0, "loss_total": 0.4969938397407532, "step": 249 }, { "batch_size": 1, "epoch": 0.0996, "step": 249, "tokens_per_device": 4883 }, { "epoch": 0.0996, "loss_ce": 0.0067391968332231045, "loss_lvr": 1.2576398849487305, "loss_mode_switch": 0.0, "loss_total": 0.1325031965970993, "step": 249 }, { "batch_size": 4, "epoch": 0.0996, "step": 249, "tokens_per_device": 2792 }, { "epoch": 0.0996, "loss_ce": 0.31860899925231934, "loss_lvr": 0.857498049736023, "loss_mode_switch": 0.0, "loss_total": 0.40435880422592163, "step": 249 }, { "batch_size": 4, "epoch": 0.0996, "step": 249, "tokens_per_device": 6672 }, { "epoch": 0.0996, "loss_ce": 0.23493023216724396, "loss_lvr": 0.9965989589691162, "loss_mode_switch": 0.0, "loss_total": 0.3345901370048523, "step": 249 }, { "batch_size": 4, "epoch": 0.0996, "step": 249, "tokens_per_device": 3832 }, { "epoch": 0.0996, "loss_ce": 0.11448823660612106, "loss_lvr": 1.2359589338302612, "loss_mode_switch": 0.0, "loss_total": 0.23808413743972778, "step": 249 }, { "epoch": 0.1, "grad_norm": 1.6915944814682007, "learning_rate": 9.872052623234632e-06, "loss": 0.3728, "step": 250 }, { "batch_size": 1, "epoch": 0.1, "step": 250, "tokens_per_device": 5170 }, { "epoch": 0.1, "loss_ce": 0.09355662763118744, "loss_lvr": 0.40192848443984985, "loss_mode_switch": 0.0, "loss_total": 0.13374948501586914, "step": 250 }, { "batch_size": 4, "epoch": 0.1, "step": 250, "tokens_per_device": 3800 }, { "epoch": 0.1, "loss_ce": 0.30103063583374023, "loss_lvr": 1.818063497543335, "loss_mode_switch": 0.0, "loss_total": 0.4828369915485382, "step": 250 }, { "batch_size": 1, "epoch": 0.1, "step": 250, "tokens_per_device": 6415 }, { "epoch": 0.1, "loss_ce": 0.01069965586066246, "loss_lvr": 0.5568680763244629, "loss_mode_switch": 0.0, "loss_total": 0.06638646125793457, "step": 250 }, { "batch_size": 1, "epoch": 0.1, "step": 250, "tokens_per_device": 4733 }, { "epoch": 0.1, "loss_ce": 0.010787568986415863, "loss_lvr": 0.5754861235618591, "loss_mode_switch": 0.0, "loss_total": 0.06833618134260178, "step": 250 }, { "batch_size": 4, "epoch": 0.1, "step": 250, "tokens_per_device": 1576 }, { "epoch": 0.1, "loss_ce": 0.4428831934928894, "loss_lvr": 1.0584098100662231, "loss_mode_switch": 0.0, "loss_total": 0.5487241744995117, "step": 250 }, { "batch_size": 1, "epoch": 0.1, "step": 250, "tokens_per_device": 4337 }, { "epoch": 0.1, "loss_ce": 0.006384397856891155, "loss_lvr": 0.6928460597991943, "loss_mode_switch": 0.0, "loss_total": 0.07566900551319122, "step": 250 }, { "batch_size": 4, "epoch": 0.1, "step": 250, "tokens_per_device": 3824 }, { "epoch": 0.1, "loss_ce": 0.22565041482448578, "loss_lvr": 1.6272965669631958, "loss_mode_switch": 0.0, "loss_total": 0.3883800804615021, "step": 250 }, { "batch_size": 1, "epoch": 0.1, "step": 250, "tokens_per_device": 5058 }, { "epoch": 0.1, "loss_ce": 0.048768360167741776, "loss_lvr": 0.821216344833374, "loss_mode_switch": 0.0, "loss_total": 0.13088999688625336, "step": 250 }, { "epoch": 0.1004, "grad_norm": 1.3498443365097046, "learning_rate": 9.870592548007725e-06, "loss": 0.3638, "step": 251 }, { "batch_size": 4, "epoch": 0.1004, "step": 251, "tokens_per_device": 1304 }, { "epoch": 0.1004, "loss_ce": 0.9798239469528198, "loss_lvr": 1.480831265449524, "loss_mode_switch": 0.0, "loss_total": 1.1279070377349854, "step": 251 }, { "batch_size": 4, "epoch": 0.1004, "step": 251, "tokens_per_device": 8892 }, { "epoch": 0.1004, "loss_ce": 0.18984395265579224, "loss_lvr": 0.8662999868392944, "loss_mode_switch": 0.0, "loss_total": 0.2764739394187927, "step": 251 }, { "batch_size": 4, "epoch": 0.1004, "step": 251, "tokens_per_device": 1356 }, { "epoch": 0.1004, "loss_ce": 0.2927248775959015, "loss_lvr": 1.3152475357055664, "loss_mode_switch": 0.0, "loss_total": 0.42424964904785156, "step": 251 }, { "batch_size": 4, "epoch": 0.1004, "step": 251, "tokens_per_device": 4220 }, { "epoch": 0.1004, "loss_ce": 0.26227879524230957, "loss_lvr": 1.843501091003418, "loss_mode_switch": 0.0, "loss_total": 0.4466289281845093, "step": 251 }, { "batch_size": 4, "epoch": 0.1004, "step": 251, "tokens_per_device": 4860 }, { "epoch": 0.1004, "loss_ce": 0.6230578422546387, "loss_lvr": 1.7010103464126587, "loss_mode_switch": 0.0, "loss_total": 0.7931588888168335, "step": 251 }, { "batch_size": 4, "epoch": 0.1004, "step": 251, "tokens_per_device": 4536 }, { "epoch": 0.1004, "loss_ce": 0.42478883266448975, "loss_lvr": 0.9580349922180176, "loss_mode_switch": 0.0, "loss_total": 0.5205923318862915, "step": 251 }, { "batch_size": 4, "epoch": 0.1004, "step": 251, "tokens_per_device": 7412 }, { "epoch": 0.1004, "loss_ce": 0.18296927213668823, "loss_lvr": 1.0547269582748413, "loss_mode_switch": 0.0, "loss_total": 0.2884419560432434, "step": 251 }, { "batch_size": 4, "epoch": 0.1004, "step": 251, "tokens_per_device": 5516 }, { "epoch": 0.1004, "loss_ce": 0.2763819396495819, "loss_lvr": 0.8957651853561401, "loss_mode_switch": 0.0, "loss_total": 0.36595845222473145, "step": 251 }, { "epoch": 0.1008, "grad_norm": 1.4863203763961792, "learning_rate": 9.869124298341039e-06, "loss": 0.3563, "step": 252 }, { "batch_size": 1, "epoch": 0.1008, "step": 252, "tokens_per_device": 5061 }, { "epoch": 0.1008, "loss_ce": 0.01759614609181881, "loss_lvr": 0.4429977536201477, "loss_mode_switch": 0.0, "loss_total": 0.06189592182636261, "step": 252 }, { "batch_size": 1, "epoch": 0.1008, "step": 252, "tokens_per_device": 5228 }, { "epoch": 0.1008, "loss_ce": 0.15642903745174408, "loss_lvr": 0.5479271411895752, "loss_mode_switch": 0.0, "loss_total": 0.21122175455093384, "step": 252 }, { "batch_size": 1, "epoch": 0.1008, "step": 252, "tokens_per_device": 5523 }, { "epoch": 0.1008, "loss_ce": 0.03566514328122139, "loss_lvr": 0.563288152217865, "loss_mode_switch": 0.0, "loss_total": 0.09199395775794983, "step": 252 }, { "batch_size": 4, "epoch": 0.1008, "step": 252, "tokens_per_device": 8068 }, { "epoch": 0.1008, "loss_ce": 0.02100849151611328, "loss_lvr": 0.7407406568527222, "loss_mode_switch": 0.0, "loss_total": 0.09508255869150162, "step": 252 }, { "batch_size": 1, "epoch": 0.1008, "step": 252, "tokens_per_device": 4891 }, { "epoch": 0.1008, "loss_ce": 0.09499170631170273, "loss_lvr": 0.9192763566970825, "loss_mode_switch": 0.0, "loss_total": 0.18691934645175934, "step": 252 }, { "batch_size": 4, "epoch": 0.1008, "step": 252, "tokens_per_device": 7300 }, { "epoch": 0.1008, "loss_ce": 0.18011796474456787, "loss_lvr": 0.9202672839164734, "loss_mode_switch": 0.0, "loss_total": 0.27214470505714417, "step": 252 }, { "batch_size": 4, "epoch": 0.1008, "step": 252, "tokens_per_device": 2644 }, { "epoch": 0.1008, "loss_ce": 0.2902880311012268, "loss_lvr": 1.326167106628418, "loss_mode_switch": 0.0, "loss_total": 0.42290472984313965, "step": 252 }, { "batch_size": 1, "epoch": 0.1008, "step": 252, "tokens_per_device": 4905 }, { "epoch": 0.1008, "loss_ce": 0.1000891700387001, "loss_lvr": 0.5964165925979614, "loss_mode_switch": 0.0, "loss_total": 0.15973082184791565, "step": 252 }, { "epoch": 0.1012, "grad_norm": 1.6404603719711304, "learning_rate": 9.867647876698776e-06, "loss": 0.3193, "step": 253 }, { "batch_size": 4, "epoch": 0.1012, "step": 253, "tokens_per_device": 1780 }, { "epoch": 0.1012, "loss_ce": 0.40813034772872925, "loss_lvr": 1.0824472904205322, "loss_mode_switch": 0.0, "loss_total": 0.5163750648498535, "step": 253 }, { "batch_size": 1, "epoch": 0.1012, "step": 253, "tokens_per_device": 4878 }, { "epoch": 0.1012, "loss_ce": 0.030903294682502747, "loss_lvr": 0.584105372428894, "loss_mode_switch": 0.0, "loss_total": 0.08931383490562439, "step": 253 }, { "batch_size": 1, "epoch": 0.1012, "step": 253, "tokens_per_device": 5119 }, { "epoch": 0.1012, "loss_ce": 0.2101513147354126, "loss_lvr": 0.4224468767642975, "loss_mode_switch": 0.0, "loss_total": 0.25239598751068115, "step": 253 }, { "batch_size": 4, "epoch": 0.1012, "step": 253, "tokens_per_device": 15356 }, { "epoch": 0.1012, "loss_ce": 0.22605298459529877, "loss_lvr": 0.9539151787757874, "loss_mode_switch": 0.0, "loss_total": 0.3214445114135742, "step": 253 }, { "batch_size": 4, "epoch": 0.1012, "step": 253, "tokens_per_device": 6316 }, { "epoch": 0.1012, "loss_ce": 0.4135596454143524, "loss_lvr": 1.005794644355774, "loss_mode_switch": 0.0, "loss_total": 0.5141391158103943, "step": 253 }, { "batch_size": 1, "epoch": 0.1012, "step": 253, "tokens_per_device": 4985 }, { "epoch": 0.1012, "loss_ce": 0.02707635425031185, "loss_lvr": 0.6770985722541809, "loss_mode_switch": 0.0, "loss_total": 0.09478621184825897, "step": 253 }, { "batch_size": 1, "epoch": 0.1012, "step": 253, "tokens_per_device": 4951 }, { "epoch": 0.1012, "loss_ce": 0.11594130098819733, "loss_lvr": 0.23544055223464966, "loss_mode_switch": 0.0, "loss_total": 0.13948535919189453, "step": 253 }, { "batch_size": 4, "epoch": 0.1012, "step": 253, "tokens_per_device": 5772 }, { "epoch": 0.1012, "loss_ce": 0.2482321560382843, "loss_lvr": 1.4558420181274414, "loss_mode_switch": 0.0, "loss_total": 0.39381635189056396, "step": 253 }, { "epoch": 0.1016, "grad_norm": 1.6507863998413086, "learning_rate": 9.866163285558851e-06, "loss": 0.355, "step": 254 }, { "batch_size": 4, "epoch": 0.1016, "step": 254, "tokens_per_device": 3816 }, { "epoch": 0.1016, "loss_ce": 0.5804677605628967, "loss_lvr": 1.1569342613220215, "loss_mode_switch": 0.0, "loss_total": 0.6961612105369568, "step": 254 }, { "batch_size": 4, "epoch": 0.1016, "step": 254, "tokens_per_device": 2664 }, { "epoch": 0.1016, "loss_ce": 0.1449350267648697, "loss_lvr": 1.4151700735092163, "loss_mode_switch": 0.0, "loss_total": 0.286452054977417, "step": 254 }, { "batch_size": 1, "epoch": 0.1016, "step": 254, "tokens_per_device": 4899 }, { "epoch": 0.1016, "loss_ce": 0.012622995302081108, "loss_lvr": 0.5968537330627441, "loss_mode_switch": 0.0, "loss_total": 0.07230836898088455, "step": 254 }, { "batch_size": 4, "epoch": 0.1016, "step": 254, "tokens_per_device": 11024 }, { "epoch": 0.1016, "loss_ce": 0.3661501407623291, "loss_lvr": 1.071303367614746, "loss_mode_switch": 0.0, "loss_total": 0.47328048944473267, "step": 254 }, { "batch_size": 4, "epoch": 0.1016, "step": 254, "tokens_per_device": 4216 }, { "epoch": 0.1016, "loss_ce": 0.2085040807723999, "loss_lvr": 1.2642556428909302, "loss_mode_switch": 0.0, "loss_total": 0.3349296450614929, "step": 254 }, { "batch_size": 4, "epoch": 0.1016, "step": 254, "tokens_per_device": 5028 }, { "epoch": 0.1016, "loss_ce": 0.46045058965682983, "loss_lvr": 0.9023472666740417, "loss_mode_switch": 0.0, "loss_total": 0.5506852865219116, "step": 254 }, { "batch_size": 4, "epoch": 0.1016, "step": 254, "tokens_per_device": 2860 }, { "epoch": 0.1016, "loss_ce": 0.4190910756587982, "loss_lvr": 1.1158623695373535, "loss_mode_switch": 0.0, "loss_total": 0.530677318572998, "step": 254 }, { "batch_size": 4, "epoch": 0.1016, "step": 254, "tokens_per_device": 3328 }, { "epoch": 0.1016, "loss_ce": 0.41866713762283325, "loss_lvr": 0.8920291066169739, "loss_mode_switch": 0.0, "loss_total": 0.507870078086853, "step": 254 }, { "epoch": 0.102, "grad_norm": 1.5969691276550293, "learning_rate": 9.864670527412891e-06, "loss": 0.3626, "step": 255 }, { "batch_size": 4, "epoch": 0.102, "step": 255, "tokens_per_device": 3692 }, { "epoch": 0.102, "loss_ce": 0.6038756370544434, "loss_lvr": 0.9126940369606018, "loss_mode_switch": 0.0, "loss_total": 0.6951450109481812, "step": 255 }, { "batch_size": 4, "epoch": 0.102, "step": 255, "tokens_per_device": 3752 }, { "epoch": 0.102, "loss_ce": 0.3649982511997223, "loss_lvr": 1.3963823318481445, "loss_mode_switch": 0.0, "loss_total": 0.5046364665031433, "step": 255 }, { "batch_size": 4, "epoch": 0.102, "step": 255, "tokens_per_device": 2644 }, { "epoch": 0.102, "loss_ce": 0.2170562744140625, "loss_lvr": 1.0361292362213135, "loss_mode_switch": 0.0, "loss_total": 0.3206692039966583, "step": 255 }, { "batch_size": 4, "epoch": 0.102, "step": 255, "tokens_per_device": 1916 }, { "epoch": 0.102, "loss_ce": 0.26580673456192017, "loss_lvr": 1.0647735595703125, "loss_mode_switch": 0.0, "loss_total": 0.37228408455848694, "step": 255 }, { "batch_size": 1, "epoch": 0.102, "step": 255, "tokens_per_device": 5133 }, { "epoch": 0.102, "loss_ce": 0.0021723415702581406, "loss_lvr": 0.9819591045379639, "loss_mode_switch": 0.0, "loss_total": 0.10036825388669968, "step": 255 }, { "batch_size": 4, "epoch": 0.102, "step": 255, "tokens_per_device": 13240 }, { "epoch": 0.102, "loss_ce": 0.054419782012701035, "loss_lvr": 1.0101114511489868, "loss_mode_switch": 0.0, "loss_total": 0.15543092787265778, "step": 255 }, { "batch_size": 1, "epoch": 0.102, "step": 255, "tokens_per_device": 5133 }, { "epoch": 0.102, "loss_ce": 0.40378689765930176, "loss_lvr": 0.7276837825775146, "loss_mode_switch": 0.0, "loss_total": 0.4765552878379822, "step": 255 }, { "batch_size": 4, "epoch": 0.102, "step": 255, "tokens_per_device": 4244 }, { "epoch": 0.102, "loss_ce": 0.12708213925361633, "loss_lvr": 0.7742549777030945, "loss_mode_switch": 0.0, "loss_total": 0.20450764894485474, "step": 255 }, { "epoch": 0.1024, "grad_norm": 1.512823462486267, "learning_rate": 9.863169604766231e-06, "loss": 0.3371, "step": 256 }, { "batch_size": 4, "epoch": 0.1024, "step": 256, "tokens_per_device": 3876 }, { "epoch": 0.1024, "loss_ce": 0.10101871192455292, "loss_lvr": 1.1766849756240845, "loss_mode_switch": 0.0, "loss_total": 0.21868720650672913, "step": 256 }, { "batch_size": 4, "epoch": 0.1024, "step": 256, "tokens_per_device": 10736 }, { "epoch": 0.1024, "loss_ce": 0.13922907412052155, "loss_lvr": 1.1960253715515137, "loss_mode_switch": 0.0, "loss_total": 0.25883162021636963, "step": 256 }, { "batch_size": 4, "epoch": 0.1024, "step": 256, "tokens_per_device": 7024 }, { "epoch": 0.1024, "loss_ce": 0.0769016370177269, "loss_lvr": 0.9184483885765076, "loss_mode_switch": 0.0, "loss_total": 0.1687464714050293, "step": 256 }, { "batch_size": 1, "epoch": 0.1024, "step": 256, "tokens_per_device": 6103 }, { "epoch": 0.1024, "loss_ce": 0.012042809277772903, "loss_lvr": 0.49462416768074036, "loss_mode_switch": 0.0, "loss_total": 0.06150522828102112, "step": 256 }, { "batch_size": 4, "epoch": 0.1024, "step": 256, "tokens_per_device": 4384 }, { "epoch": 0.1024, "loss_ce": 0.5814447999000549, "loss_lvr": 1.159658432006836, "loss_mode_switch": 0.0, "loss_total": 0.6974106431007385, "step": 256 }, { "batch_size": 4, "epoch": 0.1024, "step": 256, "tokens_per_device": 4428 }, { "epoch": 0.1024, "loss_ce": 0.14173933863639832, "loss_lvr": 0.8913851380348206, "loss_mode_switch": 0.0, "loss_total": 0.2308778464794159, "step": 256 }, { "batch_size": 4, "epoch": 0.1024, "step": 256, "tokens_per_device": 7080 }, { "epoch": 0.1024, "loss_ce": 0.08689489960670471, "loss_lvr": 0.9745370745658875, "loss_mode_switch": 0.0, "loss_total": 0.18434861302375793, "step": 256 }, { "batch_size": 4, "epoch": 0.1024, "step": 256, "tokens_per_device": 11020 }, { "epoch": 0.1024, "loss_ce": 0.3554941415786743, "loss_lvr": 0.8935892581939697, "loss_mode_switch": 0.0, "loss_total": 0.4448530673980713, "step": 256 }, { "epoch": 0.1028, "grad_norm": 1.5554378032684326, "learning_rate": 9.861660520137908e-06, "loss": 0.3298, "step": 257 }, { "batch_size": 1, "epoch": 0.1028, "step": 257, "tokens_per_device": 4857 }, { "epoch": 0.1028, "loss_ce": 0.10330881178379059, "loss_lvr": 0.5401949882507324, "loss_mode_switch": 0.0, "loss_total": 0.1573283076286316, "step": 257 }, { "batch_size": 4, "epoch": 0.1028, "step": 257, "tokens_per_device": 6184 }, { "epoch": 0.1028, "loss_ce": 0.03359326720237732, "loss_lvr": 0.7565231323242188, "loss_mode_switch": 0.0, "loss_total": 0.10924558341503143, "step": 257 }, { "batch_size": 4, "epoch": 0.1028, "step": 257, "tokens_per_device": 4396 }, { "epoch": 0.1028, "loss_ce": 0.47551658749580383, "loss_lvr": 1.1090800762176514, "loss_mode_switch": 0.0, "loss_total": 0.5864245891571045, "step": 257 }, { "batch_size": 4, "epoch": 0.1028, "step": 257, "tokens_per_device": 5728 }, { "epoch": 0.1028, "loss_ce": 0.003962589893490076, "loss_lvr": 0.9493273496627808, "loss_mode_switch": 0.0, "loss_total": 0.09889532625675201, "step": 257 }, { "batch_size": 4, "epoch": 0.1028, "step": 257, "tokens_per_device": 4720 }, { "epoch": 0.1028, "loss_ce": 0.5686489939689636, "loss_lvr": 0.9981175661087036, "loss_mode_switch": 0.0, "loss_total": 0.6684607267379761, "step": 257 }, { "batch_size": 4, "epoch": 0.1028, "step": 257, "tokens_per_device": 2548 }, { "epoch": 0.1028, "loss_ce": 0.23014040291309357, "loss_lvr": 1.3467464447021484, "loss_mode_switch": 0.0, "loss_total": 0.3648150563240051, "step": 257 }, { "batch_size": 1, "epoch": 0.1028, "step": 257, "tokens_per_device": 5156 }, { "epoch": 0.1028, "loss_ce": 0.0016748453490436077, "loss_lvr": 0.6548755168914795, "loss_mode_switch": 0.0, "loss_total": 0.0671624019742012, "step": 257 }, { "batch_size": 4, "epoch": 0.1028, "step": 257, "tokens_per_device": 4172 }, { "epoch": 0.1028, "loss_ce": 0.12844720482826233, "loss_lvr": 0.7502744197845459, "loss_mode_switch": 0.0, "loss_total": 0.20347464084625244, "step": 257 }, { "epoch": 0.1032, "grad_norm": 1.4840940237045288, "learning_rate": 9.860143276060655e-06, "loss": 0.3365, "step": 258 }, { "batch_size": 4, "epoch": 0.1032, "step": 258, "tokens_per_device": 4792 }, { "epoch": 0.1032, "loss_ce": 0.14645738899707794, "loss_lvr": 0.7514412999153137, "loss_mode_switch": 0.0, "loss_total": 0.22160151600837708, "step": 258 }, { "batch_size": 1, "epoch": 0.1032, "step": 258, "tokens_per_device": 4431 }, { "epoch": 0.1032, "loss_ce": 0.02838602289557457, "loss_lvr": 1.9079811573028564, "loss_mode_switch": 0.0, "loss_total": 0.21918414533138275, "step": 258 }, { "batch_size": 4, "epoch": 0.1032, "step": 258, "tokens_per_device": 3828 }, { "epoch": 0.1032, "loss_ce": 0.4036196768283844, "loss_lvr": 1.2808541059494019, "loss_mode_switch": 0.0, "loss_total": 0.5317050814628601, "step": 258 }, { "batch_size": 4, "epoch": 0.1032, "step": 258, "tokens_per_device": 3924 }, { "epoch": 0.1032, "loss_ce": 0.11765041947364807, "loss_lvr": 0.9272823929786682, "loss_mode_switch": 0.0, "loss_total": 0.21037866175174713, "step": 258 }, { "batch_size": 4, "epoch": 0.1032, "step": 258, "tokens_per_device": 4388 }, { "epoch": 0.1032, "loss_ce": 0.4987329840660095, "loss_lvr": 0.9640094637870789, "loss_mode_switch": 0.0, "loss_total": 0.595133900642395, "step": 258 }, { "batch_size": 4, "epoch": 0.1032, "step": 258, "tokens_per_device": 2684 }, { "epoch": 0.1032, "loss_ce": 0.7273106575012207, "loss_lvr": 0.9865350723266602, "loss_mode_switch": 0.0, "loss_total": 0.8259641528129578, "step": 258 }, { "batch_size": 4, "epoch": 0.1032, "step": 258, "tokens_per_device": 4632 }, { "epoch": 0.1032, "loss_ce": 0.2955564260482788, "loss_lvr": 0.975534200668335, "loss_mode_switch": 0.0, "loss_total": 0.39310985803604126, "step": 258 }, { "batch_size": 1, "epoch": 0.1032, "step": 258, "tokens_per_device": 4494 }, { "epoch": 0.1032, "loss_ce": 0.7714948654174805, "loss_lvr": 0.6108344197273254, "loss_mode_switch": 0.0, "loss_total": 0.8325783014297485, "step": 258 }, { "epoch": 0.1036, "grad_norm": 1.5175901651382446, "learning_rate": 9.858617875080904e-06, "loss": 0.3292, "step": 259 }, { "batch_size": 1, "epoch": 0.1036, "step": 259, "tokens_per_device": 4961 }, { "epoch": 0.1036, "loss_ce": 0.06296214461326599, "loss_lvr": 0.5901980400085449, "loss_mode_switch": 0.0, "loss_total": 0.12198194861412048, "step": 259 }, { "batch_size": 1, "epoch": 0.1036, "step": 259, "tokens_per_device": 5087 }, { "epoch": 0.1036, "loss_ce": 0.052837174385786057, "loss_lvr": 0.6354374885559082, "loss_mode_switch": 0.0, "loss_total": 0.11638092994689941, "step": 259 }, { "batch_size": 4, "epoch": 0.1036, "step": 259, "tokens_per_device": 3776 }, { "epoch": 0.1036, "loss_ce": 0.15837955474853516, "loss_lvr": 1.369619607925415, "loss_mode_switch": 0.0, "loss_total": 0.29534152150154114, "step": 259 }, { "batch_size": 4, "epoch": 0.1036, "step": 259, "tokens_per_device": 5572 }, { "epoch": 0.1036, "loss_ce": 0.8678373694419861, "loss_lvr": 1.0123018026351929, "loss_mode_switch": 0.0, "loss_total": 0.9690675735473633, "step": 259 }, { "batch_size": 4, "epoch": 0.1036, "step": 259, "tokens_per_device": 1636 }, { "epoch": 0.1036, "loss_ce": 0.40424904227256775, "loss_lvr": 1.2998437881469727, "loss_mode_switch": 0.0, "loss_total": 0.5342334508895874, "step": 259 }, { "batch_size": 4, "epoch": 0.1036, "step": 259, "tokens_per_device": 6092 }, { "epoch": 0.1036, "loss_ce": 0.11287965625524521, "loss_lvr": 0.7389079332351685, "loss_mode_switch": 0.0, "loss_total": 0.1867704540491104, "step": 259 }, { "batch_size": 4, "epoch": 0.1036, "step": 259, "tokens_per_device": 10296 }, { "epoch": 0.1036, "loss_ce": 0.011175679974257946, "loss_lvr": 0.9277862310409546, "loss_mode_switch": 0.0, "loss_total": 0.10395430028438568, "step": 259 }, { "batch_size": 1, "epoch": 0.1036, "step": 259, "tokens_per_device": 5142 }, { "epoch": 0.1036, "loss_ce": 0.003423908492550254, "loss_lvr": 1.2135502099990845, "loss_mode_switch": 0.0, "loss_total": 0.12477892637252808, "step": 259 }, { "epoch": 0.104, "grad_norm": 2.056281328201294, "learning_rate": 9.857084319758772e-06, "loss": 0.305, "step": 260 }, { "batch_size": 4, "epoch": 0.104, "step": 260, "tokens_per_device": 3820 }, { "epoch": 0.104, "loss_ce": 0.3658905625343323, "loss_lvr": 1.061171531677246, "loss_mode_switch": 0.0, "loss_total": 0.47200772166252136, "step": 260 }, { "batch_size": 4, "epoch": 0.104, "step": 260, "tokens_per_device": 3360 }, { "epoch": 0.104, "loss_ce": 0.23415681719779968, "loss_lvr": 0.9687069058418274, "loss_mode_switch": 0.0, "loss_total": 0.3310275077819824, "step": 260 }, { "batch_size": 1, "epoch": 0.104, "step": 260, "tokens_per_device": 5224 }, { "epoch": 0.104, "loss_ce": 0.09043119847774506, "loss_lvr": 0.9250232577323914, "loss_mode_switch": 0.0, "loss_total": 0.1829335242509842, "step": 260 }, { "batch_size": 4, "epoch": 0.104, "step": 260, "tokens_per_device": 1444 }, { "epoch": 0.104, "loss_ce": 0.396990567445755, "loss_lvr": 1.046890139579773, "loss_mode_switch": 0.0, "loss_total": 0.5016795992851257, "step": 260 }, { "batch_size": 4, "epoch": 0.104, "step": 260, "tokens_per_device": 3784 }, { "epoch": 0.104, "loss_ce": 0.2427443563938141, "loss_lvr": 1.3890199661254883, "loss_mode_switch": 0.0, "loss_total": 0.3816463351249695, "step": 260 }, { "batch_size": 1, "epoch": 0.104, "step": 260, "tokens_per_device": 5161 }, { "epoch": 0.104, "loss_ce": 0.11141554266214371, "loss_lvr": 0.5873242616653442, "loss_mode_switch": 0.0, "loss_total": 0.17014797031879425, "step": 260 }, { "batch_size": 1, "epoch": 0.104, "step": 260, "tokens_per_device": 4761 }, { "epoch": 0.104, "loss_ce": 0.010207435116171837, "loss_lvr": 0.2746000289916992, "loss_mode_switch": 0.0, "loss_total": 0.03766743838787079, "step": 260 }, { "batch_size": 4, "epoch": 0.104, "step": 260, "tokens_per_device": 5868 }, { "epoch": 0.104, "loss_ce": 0.16105176508426666, "loss_lvr": 1.206786870956421, "loss_mode_switch": 0.0, "loss_total": 0.28173044323921204, "step": 260 }, { "epoch": 0.1044, "grad_norm": 1.5788146257400513, "learning_rate": 9.855542612668066e-06, "loss": 0.332, "step": 261 }, { "batch_size": 4, "epoch": 0.1044, "step": 261, "tokens_per_device": 1500 }, { "epoch": 0.1044, "loss_ce": 0.24064089357852936, "loss_lvr": 1.4136959314346313, "loss_mode_switch": 0.0, "loss_total": 0.38201048970222473, "step": 261 }, { "batch_size": 4, "epoch": 0.1044, "step": 261, "tokens_per_device": 2760 }, { "epoch": 0.1044, "loss_ce": 0.16153068840503693, "loss_lvr": 0.9731077551841736, "loss_mode_switch": 0.0, "loss_total": 0.25884145498275757, "step": 261 }, { "batch_size": 4, "epoch": 0.1044, "step": 261, "tokens_per_device": 1196 }, { "epoch": 0.1044, "loss_ce": 0.34135425090789795, "loss_lvr": 1.6590290069580078, "loss_mode_switch": 0.0, "loss_total": 0.5072571635246277, "step": 261 }, { "batch_size": 1, "epoch": 0.1044, "step": 261, "tokens_per_device": 4884 }, { "epoch": 0.1044, "loss_ce": 0.026061659678816795, "loss_lvr": 0.38734713196754456, "loss_mode_switch": 0.0, "loss_total": 0.06479637324810028, "step": 261 }, { "batch_size": 4, "epoch": 0.1044, "step": 261, "tokens_per_device": 4420 }, { "epoch": 0.1044, "loss_ce": 0.5150874853134155, "loss_lvr": 1.1002120971679688, "loss_mode_switch": 0.0, "loss_total": 0.6251087188720703, "step": 261 }, { "batch_size": 4, "epoch": 0.1044, "step": 261, "tokens_per_device": 4280 }, { "epoch": 0.1044, "loss_ce": 0.4421333968639374, "loss_lvr": 0.8292714357376099, "loss_mode_switch": 0.0, "loss_total": 0.5250605344772339, "step": 261 }, { "batch_size": 4, "epoch": 0.1044, "step": 261, "tokens_per_device": 1748 }, { "epoch": 0.1044, "loss_ce": 0.15476065874099731, "loss_lvr": 1.1363799571990967, "loss_mode_switch": 0.0, "loss_total": 0.268398642539978, "step": 261 }, { "batch_size": 4, "epoch": 0.1044, "step": 261, "tokens_per_device": 3496 }, { "epoch": 0.1044, "loss_ce": 0.46180394291877747, "loss_lvr": 0.9179450869560242, "loss_mode_switch": 0.0, "loss_total": 0.5535984635353088, "step": 261 }, { "epoch": 0.1048, "grad_norm": 1.9728288650512695, "learning_rate": 9.853992756396272e-06, "loss": 0.3849, "step": 262 }, { "batch_size": 4, "epoch": 0.1048, "step": 262, "tokens_per_device": 4340 }, { "epoch": 0.1048, "loss_ce": 0.5720913410186768, "loss_lvr": 1.1265712976455688, "loss_mode_switch": 0.0, "loss_total": 0.6847484707832336, "step": 262 }, { "batch_size": 4, "epoch": 0.1048, "step": 262, "tokens_per_device": 4112 }, { "epoch": 0.1048, "loss_ce": 0.4891481101512909, "loss_lvr": 1.028696060180664, "loss_mode_switch": 0.0, "loss_total": 0.5920177102088928, "step": 262 }, { "batch_size": 4, "epoch": 0.1048, "step": 262, "tokens_per_device": 4496 }, { "epoch": 0.1048, "loss_ce": 0.35396382212638855, "loss_lvr": 1.0692896842956543, "loss_mode_switch": 0.0, "loss_total": 0.46089279651641846, "step": 262 }, { "batch_size": 4, "epoch": 0.1048, "step": 262, "tokens_per_device": 6232 }, { "epoch": 0.1048, "loss_ce": 0.380458801984787, "loss_lvr": 1.2292112112045288, "loss_mode_switch": 0.0, "loss_total": 0.5033799409866333, "step": 262 }, { "batch_size": 1, "epoch": 0.1048, "step": 262, "tokens_per_device": 5018 }, { "epoch": 0.1048, "loss_ce": 0.017366722226142883, "loss_lvr": 0.36788293719291687, "loss_mode_switch": 0.0, "loss_total": 0.05415501818060875, "step": 262 }, { "batch_size": 4, "epoch": 0.1048, "step": 262, "tokens_per_device": 3920 }, { "epoch": 0.1048, "loss_ce": 0.26037144660949707, "loss_lvr": 1.0221350193023682, "loss_mode_switch": 0.0, "loss_total": 0.3625849485397339, "step": 262 }, { "batch_size": 4, "epoch": 0.1048, "step": 262, "tokens_per_device": 2776 }, { "epoch": 0.1048, "loss_ce": 0.2389417141675949, "loss_lvr": 0.8194600939750671, "loss_mode_switch": 0.0, "loss_total": 0.3208877146244049, "step": 262 }, { "batch_size": 1, "epoch": 0.1048, "step": 262, "tokens_per_device": 4114 }, { "epoch": 0.1048, "loss_ce": 0.0061529651284217834, "loss_lvr": 0.6111856698989868, "loss_mode_switch": 0.0, "loss_total": 0.06727153062820435, "step": 262 }, { "epoch": 0.1052, "grad_norm": 1.3809314966201782, "learning_rate": 9.852434753544552e-06, "loss": 0.2965, "step": 263 }, { "batch_size": 1, "epoch": 0.1052, "step": 263, "tokens_per_device": 4855 }, { "epoch": 0.1052, "loss_ce": 0.09260134398937225, "loss_lvr": 0.7538998126983643, "loss_mode_switch": 0.0, "loss_total": 0.16799132525920868, "step": 263 }, { "batch_size": 1, "epoch": 0.1052, "step": 263, "tokens_per_device": 5210 }, { "epoch": 0.1052, "loss_ce": 0.03752429783344269, "loss_lvr": 0.44280797243118286, "loss_mode_switch": 0.0, "loss_total": 0.08180509507656097, "step": 263 }, { "batch_size": 4, "epoch": 0.1052, "step": 263, "tokens_per_device": 2656 }, { "epoch": 0.1052, "loss_ce": 0.2160359025001526, "loss_lvr": 1.2156661748886108, "loss_mode_switch": 0.0, "loss_total": 0.33760252594947815, "step": 263 }, { "batch_size": 4, "epoch": 0.1052, "step": 263, "tokens_per_device": 1312 }, { "epoch": 0.1052, "loss_ce": 0.14242421090602875, "loss_lvr": 1.3985137939453125, "loss_mode_switch": 0.0, "loss_total": 0.28227558732032776, "step": 263 }, { "batch_size": 4, "epoch": 0.1052, "step": 263, "tokens_per_device": 4432 }, { "epoch": 0.1052, "loss_ce": 0.07433363795280457, "loss_lvr": 1.2153452634811401, "loss_mode_switch": 0.0, "loss_total": 0.19586816430091858, "step": 263 }, { "batch_size": 1, "epoch": 0.1052, "step": 263, "tokens_per_device": 5105 }, { "epoch": 0.1052, "loss_ce": 0.24469952285289764, "loss_lvr": 0.6817392110824585, "loss_mode_switch": 0.0, "loss_total": 0.3128734529018402, "step": 263 }, { "batch_size": 1, "epoch": 0.1052, "step": 263, "tokens_per_device": 4908 }, { "epoch": 0.1052, "loss_ce": 0.17189741134643555, "loss_lvr": 0.5570888519287109, "loss_mode_switch": 0.0, "loss_total": 0.22760629653930664, "step": 263 }, { "batch_size": 1, "epoch": 0.1052, "step": 263, "tokens_per_device": 4973 }, { "epoch": 0.1052, "loss_ce": 0.021359572187066078, "loss_lvr": 0.9209030866622925, "loss_mode_switch": 0.0, "loss_total": 0.11344987899065018, "step": 263 }, { "epoch": 0.1056, "grad_norm": 1.6294230222702026, "learning_rate": 9.850868606727745e-06, "loss": 0.3088, "step": 264 }, { "batch_size": 1, "epoch": 0.1056, "step": 264, "tokens_per_device": 5169 }, { "epoch": 0.1056, "loss_ce": 0.010790517553687096, "loss_lvr": 0.912134051322937, "loss_mode_switch": 0.0, "loss_total": 0.10200392454862595, "step": 264 }, { "batch_size": 4, "epoch": 0.1056, "step": 264, "tokens_per_device": 2668 }, { "epoch": 0.1056, "loss_ce": 0.20305562019348145, "loss_lvr": 0.744167685508728, "loss_mode_switch": 0.0, "loss_total": 0.2774723768234253, "step": 264 }, { "batch_size": 1, "epoch": 0.1056, "step": 264, "tokens_per_device": 4948 }, { "epoch": 0.1056, "loss_ce": 0.0014024539850652218, "loss_lvr": 0.4051525294780731, "loss_mode_switch": 0.0, "loss_total": 0.04191770777106285, "step": 264 }, { "batch_size": 4, "epoch": 0.1056, "step": 264, "tokens_per_device": 4484 }, { "epoch": 0.1056, "loss_ce": 0.6460081338882446, "loss_lvr": 1.0841971635818481, "loss_mode_switch": 0.0, "loss_total": 0.7544278502464294, "step": 264 }, { "batch_size": 4, "epoch": 0.1056, "step": 264, "tokens_per_device": 4052 }, { "epoch": 0.1056, "loss_ce": 0.40923815965652466, "loss_lvr": 1.202964425086975, "loss_mode_switch": 0.0, "loss_total": 0.5295345783233643, "step": 264 }, { "batch_size": 1, "epoch": 0.1056, "step": 264, "tokens_per_device": 5016 }, { "epoch": 0.1056, "loss_ce": 0.12877503037452698, "loss_lvr": 0.6433250308036804, "loss_mode_switch": 0.0, "loss_total": 0.19310754537582397, "step": 264 }, { "batch_size": 1, "epoch": 0.1056, "step": 264, "tokens_per_device": 4883 }, { "epoch": 0.1056, "loss_ce": 0.007075416389852762, "loss_lvr": 0.38794660568237305, "loss_mode_switch": 0.0, "loss_total": 0.04587008059024811, "step": 264 }, { "batch_size": 4, "epoch": 0.1056, "step": 264, "tokens_per_device": 6472 }, { "epoch": 0.1056, "loss_ce": 0.04474261775612831, "loss_lvr": 1.060189962387085, "loss_mode_switch": 0.0, "loss_total": 0.15076161921024323, "step": 264 }, { "epoch": 0.106, "grad_norm": 1.4561430215835571, "learning_rate": 9.849294318574353e-06, "loss": 0.3096, "step": 265 }, { "batch_size": 4, "epoch": 0.106, "step": 265, "tokens_per_device": 4204 }, { "epoch": 0.106, "loss_ce": 0.5665826797485352, "loss_lvr": 1.0202279090881348, "loss_mode_switch": 0.0, "loss_total": 0.6686054468154907, "step": 265 }, { "batch_size": 1, "epoch": 0.106, "step": 265, "tokens_per_device": 5016 }, { "epoch": 0.106, "loss_ce": 0.16812703013420105, "loss_lvr": 0.921527624130249, "loss_mode_switch": 0.0, "loss_total": 0.2602798044681549, "step": 265 }, { "batch_size": 1, "epoch": 0.106, "step": 265, "tokens_per_device": 5153 }, { "epoch": 0.106, "loss_ce": 0.059662505984306335, "loss_lvr": 1.1434156894683838, "loss_mode_switch": 0.0, "loss_total": 0.17400407791137695, "step": 265 }, { "batch_size": 1, "epoch": 0.106, "step": 265, "tokens_per_device": 6474 }, { "epoch": 0.106, "loss_ce": 0.0011318009346723557, "loss_lvr": 0.7197110056877136, "loss_mode_switch": 0.0, "loss_total": 0.0731029063463211, "step": 265 }, { "batch_size": 1, "epoch": 0.106, "step": 265, "tokens_per_device": 6569 }, { "epoch": 0.106, "loss_ce": 1.4110137224197388, "loss_lvr": 0.6088274121284485, "loss_mode_switch": 0.0, "loss_total": 1.4718964099884033, "step": 265 }, { "batch_size": 4, "epoch": 0.106, "step": 265, "tokens_per_device": 16248 }, { "epoch": 0.106, "loss_ce": 0.09388168901205063, "loss_lvr": 0.4962504506111145, "loss_mode_switch": 0.0, "loss_total": 0.1435067355632782, "step": 265 }, { "batch_size": 4, "epoch": 0.106, "step": 265, "tokens_per_device": 3884 }, { "epoch": 0.106, "loss_ce": 0.19133511185646057, "loss_lvr": 1.2489922046661377, "loss_mode_switch": 0.0, "loss_total": 0.3162343502044678, "step": 265 }, { "batch_size": 4, "epoch": 0.106, "step": 265, "tokens_per_device": 3808 }, { "epoch": 0.106, "loss_ce": 0.11537496000528336, "loss_lvr": 0.5746616125106812, "loss_mode_switch": 0.0, "loss_total": 0.1728411167860031, "step": 265 }, { "epoch": 0.1064, "grad_norm": 1.5374813079833984, "learning_rate": 9.847711891726543e-06, "loss": 0.3353, "step": 266 }, { "batch_size": 4, "epoch": 0.1064, "step": 266, "tokens_per_device": 4308 }, { "epoch": 0.1064, "loss_ce": 0.28897008299827576, "loss_lvr": 1.1575987339019775, "loss_mode_switch": 0.0, "loss_total": 0.404729962348938, "step": 266 }, { "batch_size": 4, "epoch": 0.1064, "step": 266, "tokens_per_device": 4024 }, { "epoch": 0.1064, "loss_ce": 0.35889002680778503, "loss_lvr": 1.1263314485549927, "loss_mode_switch": 0.0, "loss_total": 0.4715231657028198, "step": 266 }, { "batch_size": 1, "epoch": 0.1064, "step": 266, "tokens_per_device": 4864 }, { "epoch": 0.1064, "loss_ce": 0.0014488525921478868, "loss_lvr": 0.5305236577987671, "loss_mode_switch": 0.0, "loss_total": 0.05450121685862541, "step": 266 }, { "batch_size": 4, "epoch": 0.1064, "step": 266, "tokens_per_device": 1288 }, { "epoch": 0.1064, "loss_ce": 0.21374143660068512, "loss_lvr": 2.3967597484588623, "loss_mode_switch": 0.0, "loss_total": 0.45341742038726807, "step": 266 }, { "batch_size": 4, "epoch": 0.1064, "step": 266, "tokens_per_device": 3624 }, { "epoch": 0.1064, "loss_ce": 0.3879414498806, "loss_lvr": 0.7568089962005615, "loss_mode_switch": 0.0, "loss_total": 0.4636223614215851, "step": 266 }, { "batch_size": 4, "epoch": 0.1064, "step": 266, "tokens_per_device": 6108 }, { "epoch": 0.1064, "loss_ce": 0.47494783997535706, "loss_lvr": 0.9906542301177979, "loss_mode_switch": 0.0, "loss_total": 0.5740132331848145, "step": 266 }, { "batch_size": 4, "epoch": 0.1064, "step": 266, "tokens_per_device": 4260 }, { "epoch": 0.1064, "loss_ce": 0.17290356755256653, "loss_lvr": 1.1700446605682373, "loss_mode_switch": 0.0, "loss_total": 0.2899080514907837, "step": 266 }, { "batch_size": 4, "epoch": 0.1064, "step": 266, "tokens_per_device": 4348 }, { "epoch": 0.1064, "loss_ce": 0.13531938195228577, "loss_lvr": 0.8480215668678284, "loss_mode_switch": 0.0, "loss_total": 0.22012153267860413, "step": 266 }, { "epoch": 0.1068, "grad_norm": 1.537113904953003, "learning_rate": 9.846121328840143e-06, "loss": 0.369, "step": 267 }, { "batch_size": 1, "epoch": 0.1068, "step": 267, "tokens_per_device": 5184 }, { "epoch": 0.1068, "loss_ce": 0.004143067169934511, "loss_lvr": 0.6380758881568909, "loss_mode_switch": 0.0, "loss_total": 0.06795065850019455, "step": 267 }, { "batch_size": 4, "epoch": 0.1068, "step": 267, "tokens_per_device": 3604 }, { "epoch": 0.1068, "loss_ce": 0.4908975064754486, "loss_lvr": 1.8791905641555786, "loss_mode_switch": 0.0, "loss_total": 0.678816556930542, "step": 267 }, { "batch_size": 4, "epoch": 0.1068, "step": 267, "tokens_per_device": 4612 }, { "epoch": 0.1068, "loss_ce": 0.05292193219065666, "loss_lvr": 0.7030839323997498, "loss_mode_switch": 0.0, "loss_total": 0.12323032319545746, "step": 267 }, { "batch_size": 4, "epoch": 0.1068, "step": 267, "tokens_per_device": 4004 }, { "epoch": 0.1068, "loss_ce": 0.6840646862983704, "loss_lvr": 0.9830480217933655, "loss_mode_switch": 0.0, "loss_total": 0.7823694944381714, "step": 267 }, { "batch_size": 4, "epoch": 0.1068, "step": 267, "tokens_per_device": 2808 }, { "epoch": 0.1068, "loss_ce": 0.5714784860610962, "loss_lvr": 0.9164808988571167, "loss_mode_switch": 0.0, "loss_total": 0.6631265878677368, "step": 267 }, { "batch_size": 1, "epoch": 0.1068, "step": 267, "tokens_per_device": 4948 }, { "epoch": 0.1068, "loss_ce": 0.11553873866796494, "loss_lvr": 1.058141827583313, "loss_mode_switch": 0.0, "loss_total": 0.22135291993618011, "step": 267 }, { "batch_size": 4, "epoch": 0.1068, "step": 267, "tokens_per_device": 3808 }, { "epoch": 0.1068, "loss_ce": 0.6064403653144836, "loss_lvr": 1.362453579902649, "loss_mode_switch": 0.0, "loss_total": 0.7426857352256775, "step": 267 }, { "batch_size": 4, "epoch": 0.1068, "step": 267, "tokens_per_device": 6160 }, { "epoch": 0.1068, "loss_ce": 0.19834142923355103, "loss_lvr": 1.012537956237793, "loss_mode_switch": 0.0, "loss_total": 0.2995952367782593, "step": 267 }, { "epoch": 0.1072, "grad_norm": 1.63005530834198, "learning_rate": 9.844522632584636e-06, "loss": 0.3822, "step": 268 }, { "batch_size": 4, "epoch": 0.1072, "step": 268, "tokens_per_device": 4428 }, { "epoch": 0.1072, "loss_ce": 0.5308658480644226, "loss_lvr": 1.0687925815582275, "loss_mode_switch": 0.0, "loss_total": 0.6377450823783875, "step": 268 }, { "batch_size": 1, "epoch": 0.1072, "step": 268, "tokens_per_device": 4861 }, { "epoch": 0.1072, "loss_ce": 0.20202751457691193, "loss_lvr": 0.5614241361618042, "loss_mode_switch": 0.0, "loss_total": 0.25816991925239563, "step": 268 }, { "batch_size": 4, "epoch": 0.1072, "step": 268, "tokens_per_device": 8980 }, { "epoch": 0.1072, "loss_ce": 0.10305362939834595, "loss_lvr": 1.012906789779663, "loss_mode_switch": 0.0, "loss_total": 0.20434430241584778, "step": 268 }, { "batch_size": 4, "epoch": 0.1072, "step": 268, "tokens_per_device": 4396 }, { "epoch": 0.1072, "loss_ce": 0.49706918001174927, "loss_lvr": 1.2267450094223022, "loss_mode_switch": 0.0, "loss_total": 0.6197437047958374, "step": 268 }, { "batch_size": 4, "epoch": 0.1072, "step": 268, "tokens_per_device": 1380 }, { "epoch": 0.1072, "loss_ce": 0.07744733989238739, "loss_lvr": 1.2435226440429688, "loss_mode_switch": 0.0, "loss_total": 0.20179960131645203, "step": 268 }, { "batch_size": 4, "epoch": 0.1072, "step": 268, "tokens_per_device": 5696 }, { "epoch": 0.1072, "loss_ce": 0.2713152766227722, "loss_lvr": 0.8783820867538452, "loss_mode_switch": 0.0, "loss_total": 0.35915347933769226, "step": 268 }, { "batch_size": 4, "epoch": 0.1072, "step": 268, "tokens_per_device": 5756 }, { "epoch": 0.1072, "loss_ce": 0.08917371183633804, "loss_lvr": 1.227419137954712, "loss_mode_switch": 0.0, "loss_total": 0.21191562712192535, "step": 268 }, { "batch_size": 1, "epoch": 0.1072, "step": 268, "tokens_per_device": 4872 }, { "epoch": 0.1072, "loss_ce": 0.015021036379039288, "loss_lvr": 0.7781435251235962, "loss_mode_switch": 0.0, "loss_total": 0.09283538907766342, "step": 268 }, { "epoch": 0.1076, "grad_norm": 1.6545977592468262, "learning_rate": 9.842915805643156e-06, "loss": 0.3495, "step": 269 }, { "batch_size": 1, "epoch": 0.1076, "step": 269, "tokens_per_device": 4133 }, { "epoch": 0.1076, "loss_ce": 0.005456696264445782, "loss_lvr": 0.5087257027626038, "loss_mode_switch": 0.0, "loss_total": 0.05632926896214485, "step": 269 }, { "batch_size": 4, "epoch": 0.1076, "step": 269, "tokens_per_device": 6740 }, { "epoch": 0.1076, "loss_ce": 0.4258846938610077, "loss_lvr": 0.5477356910705566, "loss_mode_switch": 0.0, "loss_total": 0.48065826296806335, "step": 269 }, { "batch_size": 4, "epoch": 0.1076, "step": 269, "tokens_per_device": 3856 }, { "epoch": 0.1076, "loss_ce": 0.08221052587032318, "loss_lvr": 1.346430778503418, "loss_mode_switch": 0.0, "loss_total": 0.21685360372066498, "step": 269 }, { "batch_size": 4, "epoch": 0.1076, "step": 269, "tokens_per_device": 6496 }, { "epoch": 0.1076, "loss_ce": 0.643836498260498, "loss_lvr": 0.8182451128959656, "loss_mode_switch": 0.0, "loss_total": 0.725661039352417, "step": 269 }, { "batch_size": 4, "epoch": 0.1076, "step": 269, "tokens_per_device": 4548 }, { "epoch": 0.1076, "loss_ce": 0.3080122172832489, "loss_lvr": 0.7997761368751526, "loss_mode_switch": 0.0, "loss_total": 0.3879898190498352, "step": 269 }, { "batch_size": 4, "epoch": 0.1076, "step": 269, "tokens_per_device": 4948 }, { "epoch": 0.1076, "loss_ce": 0.09200487285852432, "loss_lvr": 1.4624626636505127, "loss_mode_switch": 0.0, "loss_total": 0.23825114965438843, "step": 269 }, { "batch_size": 4, "epoch": 0.1076, "step": 269, "tokens_per_device": 4928 }, { "epoch": 0.1076, "loss_ce": 0.43199679255485535, "loss_lvr": 1.0409643650054932, "loss_mode_switch": 0.0, "loss_total": 0.5360932350158691, "step": 269 }, { "batch_size": 4, "epoch": 0.1076, "step": 269, "tokens_per_device": 1624 }, { "epoch": 0.1076, "loss_ce": 0.43877243995666504, "loss_lvr": 0.9897989630699158, "loss_mode_switch": 0.0, "loss_total": 0.5377523303031921, "step": 269 }, { "epoch": 0.108, "grad_norm": 1.976271152496338, "learning_rate": 9.841300850712479e-06, "loss": 0.381, "step": 270 }, { "batch_size": 1, "epoch": 0.108, "step": 270, "tokens_per_device": 4837 }, { "epoch": 0.108, "loss_ce": 0.027968663722276688, "loss_lvr": 0.8110148906707764, "loss_mode_switch": 0.0, "loss_total": 0.10907015204429626, "step": 270 }, { "batch_size": 4, "epoch": 0.108, "step": 270, "tokens_per_device": 1204 }, { "epoch": 0.108, "loss_ce": 0.46779608726501465, "loss_lvr": 1.2145036458969116, "loss_mode_switch": 0.0, "loss_total": 0.5892464518547058, "step": 270 }, { "batch_size": 4, "epoch": 0.108, "step": 270, "tokens_per_device": 4204 }, { "epoch": 0.108, "loss_ce": 0.0453174002468586, "loss_lvr": 0.826191782951355, "loss_mode_switch": 0.0, "loss_total": 0.12793658673763275, "step": 270 }, { "batch_size": 4, "epoch": 0.108, "step": 270, "tokens_per_device": 1540 }, { "epoch": 0.108, "loss_ce": 0.5672143697738647, "loss_lvr": 1.2192555665969849, "loss_mode_switch": 0.0, "loss_total": 0.6891399025917053, "step": 270 }, { "batch_size": 1, "epoch": 0.108, "step": 270, "tokens_per_device": 6119 }, { "epoch": 0.108, "loss_ce": 0.3401259183883667, "loss_lvr": 0.5822291970252991, "loss_mode_switch": 0.0, "loss_total": 0.3983488380908966, "step": 270 }, { "batch_size": 4, "epoch": 0.108, "step": 270, "tokens_per_device": 3992 }, { "epoch": 0.108, "loss_ce": 0.23430053889751434, "loss_lvr": 1.156503438949585, "loss_mode_switch": 0.0, "loss_total": 0.3499508798122406, "step": 270 }, { "batch_size": 1, "epoch": 0.108, "step": 270, "tokens_per_device": 4248 }, { "epoch": 0.108, "loss_ce": 0.021802019327878952, "loss_lvr": 0.997331976890564, "loss_mode_switch": 0.0, "loss_total": 0.12153521180152893, "step": 270 }, { "batch_size": 4, "epoch": 0.108, "step": 270, "tokens_per_device": 1296 }, { "epoch": 0.108, "loss_ce": 0.1901153177022934, "loss_lvr": 1.3899744749069214, "loss_mode_switch": 0.0, "loss_total": 0.3291127681732178, "step": 270 }, { "epoch": 0.1084, "grad_norm": 37.2047119140625, "learning_rate": 9.839677770503028e-06, "loss": 0.4364, "step": 271 }, { "batch_size": 4, "epoch": 0.1084, "step": 271, "tokens_per_device": 3008 }, { "epoch": 0.1084, "loss_ce": 0.022510521113872528, "loss_lvr": 0.8950150012969971, "loss_mode_switch": 0.0, "loss_total": 0.11201202124357224, "step": 271 }, { "batch_size": 4, "epoch": 0.1084, "step": 271, "tokens_per_device": 4132 }, { "epoch": 0.1084, "loss_ce": 0.4047462046146393, "loss_lvr": 1.3242591619491577, "loss_mode_switch": 0.0, "loss_total": 0.5371721386909485, "step": 271 }, { "batch_size": 4, "epoch": 0.1084, "step": 271, "tokens_per_device": 2720 }, { "epoch": 0.1084, "loss_ce": 0.31762975454330444, "loss_lvr": 1.0147265195846558, "loss_mode_switch": 0.0, "loss_total": 0.41910240054130554, "step": 271 }, { "batch_size": 4, "epoch": 0.1084, "step": 271, "tokens_per_device": 4216 }, { "epoch": 0.1084, "loss_ce": 0.6307262182235718, "loss_lvr": 1.2328802347183228, "loss_mode_switch": 0.0, "loss_total": 0.754014253616333, "step": 271 }, { "batch_size": 4, "epoch": 0.1084, "step": 271, "tokens_per_device": 5496 }, { "epoch": 0.1084, "loss_ce": 0.6456297039985657, "loss_lvr": 1.1638225317001343, "loss_mode_switch": 0.0, "loss_total": 0.7620119452476501, "step": 271 }, { "batch_size": 1, "epoch": 0.1084, "step": 271, "tokens_per_device": 4939 }, { "epoch": 0.1084, "loss_ce": 0.005676912609487772, "loss_lvr": 0.4702606201171875, "loss_mode_switch": 0.0, "loss_total": 0.05270297825336456, "step": 271 }, { "batch_size": 4, "epoch": 0.1084, "step": 271, "tokens_per_device": 1384 }, { "epoch": 0.1084, "loss_ce": 0.607210636138916, "loss_lvr": 1.1441177129745483, "loss_mode_switch": 0.0, "loss_total": 0.7216224074363708, "step": 271 }, { "batch_size": 1, "epoch": 0.1084, "step": 271, "tokens_per_device": 4698 }, { "epoch": 0.1084, "loss_ce": 0.14897172152996063, "loss_lvr": 0.5954666137695312, "loss_mode_switch": 0.0, "loss_total": 0.208518385887146, "step": 271 }, { "epoch": 0.1088, "grad_norm": 2.1127514839172363, "learning_rate": 9.838046567738856e-06, "loss": 0.3359, "step": 272 }, { "batch_size": 4, "epoch": 0.1088, "step": 272, "tokens_per_device": 4688 }, { "epoch": 0.1088, "loss_ce": 0.2587270140647888, "loss_lvr": 1.2740968465805054, "loss_mode_switch": 0.0, "loss_total": 0.3861367106437683, "step": 272 }, { "batch_size": 4, "epoch": 0.1088, "step": 272, "tokens_per_device": 4236 }, { "epoch": 0.1088, "loss_ce": 0.4126317799091339, "loss_lvr": 1.4630368947982788, "loss_mode_switch": 0.0, "loss_total": 0.5589354634284973, "step": 272 }, { "batch_size": 1, "epoch": 0.1088, "step": 272, "tokens_per_device": 4837 }, { "epoch": 0.1088, "loss_ce": 0.020979193970561028, "loss_lvr": 0.5895028710365295, "loss_mode_switch": 0.0, "loss_total": 0.07992947846651077, "step": 272 }, { "batch_size": 4, "epoch": 0.1088, "step": 272, "tokens_per_device": 3772 }, { "epoch": 0.1088, "loss_ce": 0.6385202407836914, "loss_lvr": 1.0086442232131958, "loss_mode_switch": 0.0, "loss_total": 0.739384651184082, "step": 272 }, { "batch_size": 1, "epoch": 0.1088, "step": 272, "tokens_per_device": 4868 }, { "epoch": 0.1088, "loss_ce": 0.5889862179756165, "loss_lvr": 0.4380559027194977, "loss_mode_switch": 0.0, "loss_total": 0.6327918171882629, "step": 272 }, { "batch_size": 4, "epoch": 0.1088, "step": 272, "tokens_per_device": 1532 }, { "epoch": 0.1088, "loss_ce": 0.4531812071800232, "loss_lvr": 1.1091653108596802, "loss_mode_switch": 0.0, "loss_total": 0.5640977621078491, "step": 272 }, { "batch_size": 1, "epoch": 0.1088, "step": 272, "tokens_per_device": 5060 }, { "epoch": 0.1088, "loss_ce": 0.009354227222502232, "loss_lvr": 1.165390968322754, "loss_mode_switch": 0.0, "loss_total": 0.12589332461357117, "step": 272 }, { "batch_size": 1, "epoch": 0.1088, "step": 272, "tokens_per_device": 4902 }, { "epoch": 0.1088, "loss_ce": 0.14060358703136444, "loss_lvr": 1.1727863550186157, "loss_mode_switch": 0.0, "loss_total": 0.2578822374343872, "step": 272 }, { "epoch": 0.1092, "grad_norm": 1.8899184465408325, "learning_rate": 9.836407245157656e-06, "loss": 0.3441, "step": 273 }, { "batch_size": 4, "epoch": 0.1092, "step": 273, "tokens_per_device": 7392 }, { "epoch": 0.1092, "loss_ce": 0.16054651141166687, "loss_lvr": 1.3728039264678955, "loss_mode_switch": 0.0, "loss_total": 0.297826886177063, "step": 273 }, { "batch_size": 1, "epoch": 0.1092, "step": 273, "tokens_per_device": 6316 }, { "epoch": 0.1092, "loss_ce": 0.036390434950590134, "loss_lvr": 0.6800625920295715, "loss_mode_switch": 0.0, "loss_total": 0.10439670085906982, "step": 273 }, { "batch_size": 4, "epoch": 0.1092, "step": 273, "tokens_per_device": 3996 }, { "epoch": 0.1092, "loss_ce": 0.05342085659503937, "loss_lvr": 1.3785004615783691, "loss_mode_switch": 0.0, "loss_total": 0.19127090275287628, "step": 273 }, { "batch_size": 4, "epoch": 0.1092, "step": 273, "tokens_per_device": 5768 }, { "epoch": 0.1092, "loss_ce": 0.5772992372512817, "loss_lvr": 0.8242921233177185, "loss_mode_switch": 0.0, "loss_total": 0.659728467464447, "step": 273 }, { "batch_size": 4, "epoch": 0.1092, "step": 273, "tokens_per_device": 2568 }, { "epoch": 0.1092, "loss_ce": 0.32203611731529236, "loss_lvr": 1.0999122858047485, "loss_mode_switch": 0.0, "loss_total": 0.43202733993530273, "step": 273 }, { "batch_size": 4, "epoch": 0.1092, "step": 273, "tokens_per_device": 2632 }, { "epoch": 0.1092, "loss_ce": 0.10861105471849442, "loss_lvr": 1.7415990829467773, "loss_mode_switch": 0.0, "loss_total": 0.28277096152305603, "step": 273 }, { "batch_size": 4, "epoch": 0.1092, "step": 273, "tokens_per_device": 10752 }, { "epoch": 0.1092, "loss_ce": 0.1673920601606369, "loss_lvr": 0.9467620849609375, "loss_mode_switch": 0.0, "loss_total": 0.2620682716369629, "step": 273 }, { "batch_size": 4, "epoch": 0.1092, "step": 273, "tokens_per_device": 5428 }, { "epoch": 0.1092, "loss_ce": 0.626875638961792, "loss_lvr": 1.069183588027954, "loss_mode_switch": 0.0, "loss_total": 0.7337939739227295, "step": 273 }, { "epoch": 0.1096, "grad_norm": 2.187711000442505, "learning_rate": 9.834759805510742e-06, "loss": 0.3464, "step": 274 }, { "batch_size": 4, "epoch": 0.1096, "step": 274, "tokens_per_device": 1372 }, { "epoch": 0.1096, "loss_ce": 0.25563085079193115, "loss_lvr": 1.1995941400527954, "loss_mode_switch": 0.0, "loss_total": 0.3755902647972107, "step": 274 }, { "batch_size": 4, "epoch": 0.1096, "step": 274, "tokens_per_device": 4356 }, { "epoch": 0.1096, "loss_ce": 0.2530252933502197, "loss_lvr": 1.064863920211792, "loss_mode_switch": 0.0, "loss_total": 0.35951167345046997, "step": 274 }, { "batch_size": 1, "epoch": 0.1096, "step": 274, "tokens_per_device": 5039 }, { "epoch": 0.1096, "loss_ce": 0.029388362541794777, "loss_lvr": 1.2004990577697754, "loss_mode_switch": 0.0, "loss_total": 0.14943827688694, "step": 274 }, { "batch_size": 4, "epoch": 0.1096, "step": 274, "tokens_per_device": 4264 }, { "epoch": 0.1096, "loss_ce": 0.3279736340045929, "loss_lvr": 1.3223869800567627, "loss_mode_switch": 0.0, "loss_total": 0.4602123498916626, "step": 274 }, { "batch_size": 4, "epoch": 0.1096, "step": 274, "tokens_per_device": 3800 }, { "epoch": 0.1096, "loss_ce": 0.003629673272371292, "loss_lvr": 0.8228468894958496, "loss_mode_switch": 0.0, "loss_total": 0.08591435849666595, "step": 274 }, { "batch_size": 4, "epoch": 0.1096, "step": 274, "tokens_per_device": 5176 }, { "epoch": 0.1096, "loss_ce": 0.2118605226278305, "loss_lvr": 0.8090083599090576, "loss_mode_switch": 0.0, "loss_total": 0.29276135563850403, "step": 274 }, { "batch_size": 4, "epoch": 0.1096, "step": 274, "tokens_per_device": 2736 }, { "epoch": 0.1096, "loss_ce": 0.27139607071876526, "loss_lvr": 0.8971340656280518, "loss_mode_switch": 0.0, "loss_total": 0.36110949516296387, "step": 274 }, { "batch_size": 4, "epoch": 0.1096, "step": 274, "tokens_per_device": 1764 }, { "epoch": 0.1096, "loss_ce": 0.7056156396865845, "loss_lvr": 1.1053017377853394, "loss_mode_switch": 0.0, "loss_total": 0.8161458373069763, "step": 274 }, { "epoch": 0.11, "grad_norm": 1.722551941871643, "learning_rate": 9.833104251563058e-06, "loss": 0.3659, "step": 275 }, { "batch_size": 4, "epoch": 0.11, "step": 275, "tokens_per_device": 13052 }, { "epoch": 0.11, "loss_ce": 0.4342091977596283, "loss_lvr": 1.3896565437316895, "loss_mode_switch": 0.0, "loss_total": 0.5731748342514038, "step": 275 }, { "batch_size": 4, "epoch": 0.11, "step": 275, "tokens_per_device": 4776 }, { "epoch": 0.11, "loss_ce": 0.013040340505540371, "loss_lvr": 0.9027764797210693, "loss_mode_switch": 0.0, "loss_total": 0.103317990899086, "step": 275 }, { "batch_size": 4, "epoch": 0.11, "step": 275, "tokens_per_device": 4492 }, { "epoch": 0.11, "loss_ce": 0.3237776458263397, "loss_lvr": 1.1450433731079102, "loss_mode_switch": 0.0, "loss_total": 0.43828198313713074, "step": 275 }, { "batch_size": 1, "epoch": 0.11, "step": 275, "tokens_per_device": 5157 }, { "epoch": 0.11, "loss_ce": 0.11436530202627182, "loss_lvr": 1.0552361011505127, "loss_mode_switch": 0.0, "loss_total": 0.21988891065120697, "step": 275 }, { "batch_size": 4, "epoch": 0.11, "step": 275, "tokens_per_device": 2812 }, { "epoch": 0.11, "loss_ce": 0.2702334523200989, "loss_lvr": 1.339247703552246, "loss_mode_switch": 0.0, "loss_total": 0.40415823459625244, "step": 275 }, { "batch_size": 1, "epoch": 0.11, "step": 275, "tokens_per_device": 5080 }, { "epoch": 0.11, "loss_ce": 0.002481649164110422, "loss_lvr": 1.145979404449463, "loss_mode_switch": 0.0, "loss_total": 0.11707958579063416, "step": 275 }, { "batch_size": 4, "epoch": 0.11, "step": 275, "tokens_per_device": 3860 }, { "epoch": 0.11, "loss_ce": 0.28835031390190125, "loss_lvr": 1.2781059741973877, "loss_mode_switch": 0.0, "loss_total": 0.41616091132164, "step": 275 }, { "batch_size": 4, "epoch": 0.11, "step": 275, "tokens_per_device": 5684 }, { "epoch": 0.11, "loss_ce": 0.11351042240858078, "loss_lvr": 0.8043926358222961, "loss_mode_switch": 0.0, "loss_total": 0.19394968450069427, "step": 275 }, { "epoch": 0.1104, "grad_norm": 1.6023801565170288, "learning_rate": 9.831440586093157e-06, "loss": 0.3199, "step": 276 }, { "batch_size": 4, "epoch": 0.1104, "step": 276, "tokens_per_device": 4704 }, { "epoch": 0.1104, "loss_ce": 0.09888340532779694, "loss_lvr": 0.8645613193511963, "loss_mode_switch": 0.0, "loss_total": 0.1853395402431488, "step": 276 }, { "batch_size": 1, "epoch": 0.1104, "step": 276, "tokens_per_device": 5056 }, { "epoch": 0.1104, "loss_ce": 0.0025390328373759985, "loss_lvr": 0.4343225061893463, "loss_mode_switch": 0.0, "loss_total": 0.045971281826496124, "step": 276 }, { "batch_size": 4, "epoch": 0.1104, "step": 276, "tokens_per_device": 8500 }, { "epoch": 0.1104, "loss_ce": 0.39465510845184326, "loss_lvr": 1.2060307264328003, "loss_mode_switch": 0.0, "loss_total": 0.5152581930160522, "step": 276 }, { "batch_size": 4, "epoch": 0.1104, "step": 276, "tokens_per_device": 4948 }, { "epoch": 0.1104, "loss_ce": 0.2837730348110199, "loss_lvr": 0.8129852414131165, "loss_mode_switch": 0.0, "loss_total": 0.365071564912796, "step": 276 }, { "batch_size": 4, "epoch": 0.1104, "step": 276, "tokens_per_device": 1300 }, { "epoch": 0.1104, "loss_ce": 0.7110844850540161, "loss_lvr": 1.2410815954208374, "loss_mode_switch": 0.0, "loss_total": 0.8351926207542419, "step": 276 }, { "batch_size": 1, "epoch": 0.1104, "step": 276, "tokens_per_device": 5140 }, { "epoch": 0.1104, "loss_ce": 0.03568520024418831, "loss_lvr": 0.3767875134944916, "loss_mode_switch": 0.0, "loss_total": 0.07336395233869553, "step": 276 }, { "batch_size": 1, "epoch": 0.1104, "step": 276, "tokens_per_device": 4879 }, { "epoch": 0.1104, "loss_ce": 1.0020599365234375, "loss_lvr": 0.4476354122161865, "loss_mode_switch": 0.0, "loss_total": 1.046823501586914, "step": 276 }, { "batch_size": 4, "epoch": 0.1104, "step": 276, "tokens_per_device": 5776 }, { "epoch": 0.1104, "loss_ce": 0.19781842827796936, "loss_lvr": 0.9496331214904785, "loss_mode_switch": 0.0, "loss_total": 0.2927817404270172, "step": 276 }, { "epoch": 0.1108, "grad_norm": 1.625160813331604, "learning_rate": 9.829768811893214e-06, "loss": 0.343, "step": 277 }, { "batch_size": 4, "epoch": 0.1108, "step": 277, "tokens_per_device": 4260 }, { "epoch": 0.1108, "loss_ce": 0.15384788811206818, "loss_lvr": 1.1930487155914307, "loss_mode_switch": 0.0, "loss_total": 0.27315276861190796, "step": 277 }, { "batch_size": 4, "epoch": 0.1108, "step": 277, "tokens_per_device": 3728 }, { "epoch": 0.1108, "loss_ce": 0.20633874833583832, "loss_lvr": 1.469946265220642, "loss_mode_switch": 0.0, "loss_total": 0.35333338379859924, "step": 277 }, { "batch_size": 1, "epoch": 0.1108, "step": 277, "tokens_per_device": 8129 }, { "epoch": 0.1108, "loss_ce": 0.18547005951404572, "loss_lvr": 0.5960408449172974, "loss_mode_switch": 0.0, "loss_total": 0.24507415294647217, "step": 277 }, { "batch_size": 4, "epoch": 0.1108, "step": 277, "tokens_per_device": 6024 }, { "epoch": 0.1108, "loss_ce": 0.4562193751335144, "loss_lvr": 1.0000027418136597, "loss_mode_switch": 0.0, "loss_total": 0.5562196373939514, "step": 277 }, { "batch_size": 1, "epoch": 0.1108, "step": 277, "tokens_per_device": 4747 }, { "epoch": 0.1108, "loss_ce": 0.019589325413107872, "loss_lvr": 0.6588231325149536, "loss_mode_switch": 0.0, "loss_total": 0.08547164499759674, "step": 277 }, { "batch_size": 4, "epoch": 0.1108, "step": 277, "tokens_per_device": 4232 }, { "epoch": 0.1108, "loss_ce": 0.3060576021671295, "loss_lvr": 1.1880106925964355, "loss_mode_switch": 0.0, "loss_total": 0.4248586893081665, "step": 277 }, { "batch_size": 1, "epoch": 0.1108, "step": 277, "tokens_per_device": 4881 }, { "epoch": 0.1108, "loss_ce": 0.006957057863473892, "loss_lvr": 0.6973864436149597, "loss_mode_switch": 0.0, "loss_total": 0.07669571042060852, "step": 277 }, { "batch_size": 1, "epoch": 0.1108, "step": 277, "tokens_per_device": 5024 }, { "epoch": 0.1108, "loss_ce": 0.024947114288806915, "loss_lvr": 0.9587437510490417, "loss_mode_switch": 0.0, "loss_total": 0.12082149088382721, "step": 277 }, { "epoch": 0.1112, "grad_norm": 1.670998215675354, "learning_rate": 9.828088931769012e-06, "loss": 0.3831, "step": 278 }, { "batch_size": 4, "epoch": 0.1112, "step": 278, "tokens_per_device": 1360 }, { "epoch": 0.1112, "loss_ce": 0.2821813225746155, "loss_lvr": 1.071028709411621, "loss_mode_switch": 0.0, "loss_total": 0.3892841935157776, "step": 278 }, { "batch_size": 1, "epoch": 0.1112, "step": 278, "tokens_per_device": 5176 }, { "epoch": 0.1112, "loss_ce": 0.03699317201972008, "loss_lvr": 0.5656681060791016, "loss_mode_switch": 0.0, "loss_total": 0.09355998039245605, "step": 278 }, { "batch_size": 1, "epoch": 0.1112, "step": 278, "tokens_per_device": 4932 }, { "epoch": 0.1112, "loss_ce": 0.03624271973967552, "loss_lvr": 0.8521162867546082, "loss_mode_switch": 0.0, "loss_total": 0.12145434319972992, "step": 278 }, { "batch_size": 4, "epoch": 0.1112, "step": 278, "tokens_per_device": 9676 }, { "epoch": 0.1112, "loss_ce": 0.08559983223676682, "loss_lvr": 0.8595852851867676, "loss_mode_switch": 0.0, "loss_total": 0.17155836522579193, "step": 278 }, { "batch_size": 4, "epoch": 0.1112, "step": 278, "tokens_per_device": 1300 }, { "epoch": 0.1112, "loss_ce": 0.6294123530387878, "loss_lvr": 1.08673095703125, "loss_mode_switch": 0.0, "loss_total": 0.7380854487419128, "step": 278 }, { "batch_size": 4, "epoch": 0.1112, "step": 278, "tokens_per_device": 5536 }, { "epoch": 0.1112, "loss_ce": 0.07378343492746353, "loss_lvr": 1.0088139772415161, "loss_mode_switch": 0.0, "loss_total": 0.17466482520103455, "step": 278 }, { "batch_size": 4, "epoch": 0.1112, "step": 278, "tokens_per_device": 5372 }, { "epoch": 0.1112, "loss_ce": 0.13610917329788208, "loss_lvr": 0.8481507301330566, "loss_mode_switch": 0.0, "loss_total": 0.2209242582321167, "step": 278 }, { "batch_size": 1, "epoch": 0.1112, "step": 278, "tokens_per_device": 4857 }, { "epoch": 0.1112, "loss_ce": 0.0034778581466525793, "loss_lvr": 0.4663795828819275, "loss_mode_switch": 0.0, "loss_total": 0.05011581629514694, "step": 278 }, { "epoch": 0.1116, "grad_norm": 1.5864628553390503, "learning_rate": 9.826400948539935e-06, "loss": 0.3358, "step": 279 }, { "batch_size": 4, "epoch": 0.1116, "step": 279, "tokens_per_device": 4348 }, { "epoch": 0.1116, "loss_ce": 0.7516375184059143, "loss_lvr": 1.0682244300842285, "loss_mode_switch": 0.0, "loss_total": 0.8584599494934082, "step": 279 }, { "batch_size": 4, "epoch": 0.1116, "step": 279, "tokens_per_device": 4232 }, { "epoch": 0.1116, "loss_ce": 0.40138402581214905, "loss_lvr": 1.175858497619629, "loss_mode_switch": 0.0, "loss_total": 0.5189698934555054, "step": 279 }, { "batch_size": 4, "epoch": 0.1116, "step": 279, "tokens_per_device": 5552 }, { "epoch": 0.1116, "loss_ce": 0.17510183155536652, "loss_lvr": 0.8122454881668091, "loss_mode_switch": 0.0, "loss_total": 0.2563263773918152, "step": 279 }, { "batch_size": 1, "epoch": 0.1116, "step": 279, "tokens_per_device": 4889 }, { "epoch": 0.1116, "loss_ce": 0.022244112566113472, "loss_lvr": 0.5402004718780518, "loss_mode_switch": 0.0, "loss_total": 0.0762641578912735, "step": 279 }, { "batch_size": 4, "epoch": 0.1116, "step": 279, "tokens_per_device": 5920 }, { "epoch": 0.1116, "loss_ce": 0.09147398918867111, "loss_lvr": 0.892024040222168, "loss_mode_switch": 0.0, "loss_total": 0.1806764006614685, "step": 279 }, { "batch_size": 1, "epoch": 0.1116, "step": 279, "tokens_per_device": 5229 }, { "epoch": 0.1116, "loss_ce": 0.08536235988140106, "loss_lvr": 0.9679043889045715, "loss_mode_switch": 0.0, "loss_total": 0.18215280771255493, "step": 279 }, { "batch_size": 4, "epoch": 0.1116, "step": 279, "tokens_per_device": 4904 }, { "epoch": 0.1116, "loss_ce": 0.36374977231025696, "loss_lvr": 1.2592896223068237, "loss_mode_switch": 0.0, "loss_total": 0.4896787405014038, "step": 279 }, { "batch_size": 4, "epoch": 0.1116, "step": 279, "tokens_per_device": 6064 }, { "epoch": 0.1116, "loss_ce": 0.011579765938222408, "loss_lvr": 0.9255951046943665, "loss_mode_switch": 0.0, "loss_total": 0.10413927584886551, "step": 279 }, { "epoch": 0.112, "grad_norm": 1.5947065353393555, "learning_rate": 9.824704865038967e-06, "loss": 0.3909, "step": 280 }, { "batch_size": 4, "epoch": 0.112, "step": 280, "tokens_per_device": 7648 }, { "epoch": 0.112, "loss_ce": 0.4736321270465851, "loss_lvr": 1.0962163209915161, "loss_mode_switch": 0.0, "loss_total": 0.5832537412643433, "step": 280 }, { "batch_size": 1, "epoch": 0.112, "step": 280, "tokens_per_device": 5109 }, { "epoch": 0.112, "loss_ce": 0.21177171170711517, "loss_lvr": 0.8227402567863464, "loss_mode_switch": 0.0, "loss_total": 0.29404574632644653, "step": 280 }, { "batch_size": 4, "epoch": 0.112, "step": 280, "tokens_per_device": 2680 }, { "epoch": 0.112, "loss_ce": 0.4471353590488434, "loss_lvr": 1.2718437910079956, "loss_mode_switch": 0.0, "loss_total": 0.5743197202682495, "step": 280 }, { "batch_size": 4, "epoch": 0.112, "step": 280, "tokens_per_device": 4312 }, { "epoch": 0.112, "loss_ce": 0.3588147461414337, "loss_lvr": 1.2615259885787964, "loss_mode_switch": 0.0, "loss_total": 0.48496735095977783, "step": 280 }, { "batch_size": 4, "epoch": 0.112, "step": 280, "tokens_per_device": 4644 }, { "epoch": 0.112, "loss_ce": 0.130350723862648, "loss_lvr": 1.1989415884017944, "loss_mode_switch": 0.0, "loss_total": 0.2502448856830597, "step": 280 }, { "batch_size": 4, "epoch": 0.112, "step": 280, "tokens_per_device": 3744 }, { "epoch": 0.112, "loss_ce": 0.20393089950084686, "loss_lvr": 1.267136573791504, "loss_mode_switch": 0.0, "loss_total": 0.33064454793930054, "step": 280 }, { "batch_size": 4, "epoch": 0.112, "step": 280, "tokens_per_device": 2068 }, { "epoch": 0.112, "loss_ce": 0.25913146138191223, "loss_lvr": 1.0148802995681763, "loss_mode_switch": 0.0, "loss_total": 0.3606194853782654, "step": 280 }, { "batch_size": 4, "epoch": 0.112, "step": 280, "tokens_per_device": 6184 }, { "epoch": 0.112, "loss_ce": 0.12260651588439941, "loss_lvr": 0.8408101201057434, "loss_mode_switch": 0.0, "loss_total": 0.2066875398159027, "step": 280 }, { "epoch": 0.1124, "grad_norm": 1.5937871932983398, "learning_rate": 9.823000684112691e-06, "loss": 0.3789, "step": 281 }, { "batch_size": 4, "epoch": 0.1124, "step": 281, "tokens_per_device": 4632 }, { "epoch": 0.1124, "loss_ce": 0.07629426568746567, "loss_lvr": 1.0349076986312866, "loss_mode_switch": 0.0, "loss_total": 0.17978504300117493, "step": 281 }, { "batch_size": 4, "epoch": 0.1124, "step": 281, "tokens_per_device": 2240 }, { "epoch": 0.1124, "loss_ce": 0.815000057220459, "loss_lvr": 0.9458028674125671, "loss_mode_switch": 0.0, "loss_total": 0.9095803499221802, "step": 281 }, { "batch_size": 4, "epoch": 0.1124, "step": 281, "tokens_per_device": 3356 }, { "epoch": 0.1124, "loss_ce": 0.07730554789304733, "loss_lvr": 0.9415662288665771, "loss_mode_switch": 0.0, "loss_total": 0.17146217823028564, "step": 281 }, { "batch_size": 4, "epoch": 0.1124, "step": 281, "tokens_per_device": 4268 }, { "epoch": 0.1124, "loss_ce": 0.47953158617019653, "loss_lvr": 1.212353229522705, "loss_mode_switch": 0.0, "loss_total": 0.6007668972015381, "step": 281 }, { "batch_size": 4, "epoch": 0.1124, "step": 281, "tokens_per_device": 2780 }, { "epoch": 0.1124, "loss_ce": 0.08234027028083801, "loss_lvr": 0.9085915684700012, "loss_mode_switch": 0.0, "loss_total": 0.17319943010807037, "step": 281 }, { "batch_size": 1, "epoch": 0.1124, "step": 281, "tokens_per_device": 4918 }, { "epoch": 0.1124, "loss_ce": 0.017209354788064957, "loss_lvr": 0.3636068105697632, "loss_mode_switch": 0.0, "loss_total": 0.053570035845041275, "step": 281 }, { "batch_size": 1, "epoch": 0.1124, "step": 281, "tokens_per_device": 4741 }, { "epoch": 0.1124, "loss_ce": 0.7100738883018494, "loss_lvr": 0.34363120794296265, "loss_mode_switch": 0.0, "loss_total": 0.7444369792938232, "step": 281 }, { "batch_size": 1, "epoch": 0.1124, "step": 281, "tokens_per_device": 5193 }, { "epoch": 0.1124, "loss_ce": 0.17080746591091156, "loss_lvr": 0.5957831144332886, "loss_mode_switch": 0.0, "loss_total": 0.23038578033447266, "step": 281 }, { "epoch": 0.1128, "grad_norm": 2.6293745040893555, "learning_rate": 9.821288408621276e-06, "loss": 0.3437, "step": 282 }, { "batch_size": 1, "epoch": 0.1128, "step": 282, "tokens_per_device": 4879 }, { "epoch": 0.1128, "loss_ce": 0.2008756846189499, "loss_lvr": 1.2966997623443604, "loss_mode_switch": 0.0, "loss_total": 0.33054566383361816, "step": 282 }, { "batch_size": 1, "epoch": 0.1128, "step": 282, "tokens_per_device": 5088 }, { "epoch": 0.1128, "loss_ce": 0.011830826289951801, "loss_lvr": 1.0061464309692383, "loss_mode_switch": 0.0, "loss_total": 0.11244547367095947, "step": 282 }, { "batch_size": 1, "epoch": 0.1128, "step": 282, "tokens_per_device": 5125 }, { "epoch": 0.1128, "loss_ce": 0.1471255123615265, "loss_lvr": 1.1804795265197754, "loss_mode_switch": 0.0, "loss_total": 0.26517346501350403, "step": 282 }, { "batch_size": 1, "epoch": 0.1128, "step": 282, "tokens_per_device": 4867 }, { "epoch": 0.1128, "loss_ce": 0.0012554824352264404, "loss_lvr": 0.5721071362495422, "loss_mode_switch": 0.0, "loss_total": 0.058466196060180664, "step": 282 }, { "batch_size": 4, "epoch": 0.1128, "step": 282, "tokens_per_device": 5264 }, { "epoch": 0.1128, "loss_ce": 0.04785776138305664, "loss_lvr": 1.0508086681365967, "loss_mode_switch": 0.0, "loss_total": 0.1529386341571808, "step": 282 }, { "batch_size": 1, "epoch": 0.1128, "step": 282, "tokens_per_device": 4887 }, { "epoch": 0.1128, "loss_ce": 0.7267695665359497, "loss_lvr": 0.5692073702812195, "loss_mode_switch": 0.0, "loss_total": 0.783690333366394, "step": 282 }, { "batch_size": 4, "epoch": 0.1128, "step": 282, "tokens_per_device": 5820 }, { "epoch": 0.1128, "loss_ce": 0.14399082958698273, "loss_lvr": 0.743524968624115, "loss_mode_switch": 0.0, "loss_total": 0.2183433175086975, "step": 282 }, { "batch_size": 4, "epoch": 0.1128, "step": 282, "tokens_per_device": 4300 }, { "epoch": 0.1128, "loss_ce": 0.5971096158027649, "loss_lvr": 0.8221993446350098, "loss_mode_switch": 0.0, "loss_total": 0.6793295741081238, "step": 282 }, { "epoch": 0.1132, "grad_norm": 1.9110742807388306, "learning_rate": 9.819568041438477e-06, "loss": 0.4155, "step": 283 }, { "batch_size": 4, "epoch": 0.1132, "step": 283, "tokens_per_device": 1296 }, { "epoch": 0.1132, "loss_ce": 0.7294158339500427, "loss_lvr": 1.4930270910263062, "loss_mode_switch": 0.0, "loss_total": 0.8787185549736023, "step": 283 }, { "batch_size": 4, "epoch": 0.1132, "step": 283, "tokens_per_device": 4200 }, { "epoch": 0.1132, "loss_ce": 0.49835342168807983, "loss_lvr": 1.2753257751464844, "loss_mode_switch": 0.0, "loss_total": 0.6258860230445862, "step": 283 }, { "batch_size": 4, "epoch": 0.1132, "step": 283, "tokens_per_device": 2244 }, { "epoch": 0.1132, "loss_ce": 0.3377636671066284, "loss_lvr": 0.9947813749313354, "loss_mode_switch": 0.0, "loss_total": 0.437241792678833, "step": 283 }, { "batch_size": 4, "epoch": 0.1132, "step": 283, "tokens_per_device": 3516 }, { "epoch": 0.1132, "loss_ce": 0.5067334771156311, "loss_lvr": 0.9914278388023376, "loss_mode_switch": 0.0, "loss_total": 0.6058762669563293, "step": 283 }, { "batch_size": 4, "epoch": 0.1132, "step": 283, "tokens_per_device": 4240 }, { "epoch": 0.1132, "loss_ce": 0.24051553010940552, "loss_lvr": 1.5023369789123535, "loss_mode_switch": 0.0, "loss_total": 0.3907492160797119, "step": 283 }, { "batch_size": 4, "epoch": 0.1132, "step": 283, "tokens_per_device": 1712 }, { "epoch": 0.1132, "loss_ce": 0.6892839670181274, "loss_lvr": 2.1479341983795166, "loss_mode_switch": 0.0, "loss_total": 0.904077410697937, "step": 283 }, { "batch_size": 4, "epoch": 0.1132, "step": 283, "tokens_per_device": 4496 }, { "epoch": 0.1132, "loss_ce": 0.3335837721824646, "loss_lvr": 1.0495511293411255, "loss_mode_switch": 0.0, "loss_total": 0.43853887915611267, "step": 283 }, { "batch_size": 1, "epoch": 0.1132, "step": 283, "tokens_per_device": 4876 }, { "epoch": 0.1132, "loss_ce": 0.16508787870407104, "loss_lvr": 0.8760632276535034, "loss_mode_switch": 0.0, "loss_total": 0.25269418954849243, "step": 283 }, { "epoch": 0.1136, "grad_norm": 1.7114776372909546, "learning_rate": 9.817839585451629e-06, "loss": 0.365, "step": 284 }, { "batch_size": 4, "epoch": 0.1136, "step": 284, "tokens_per_device": 4512 }, { "epoch": 0.1136, "loss_ce": 0.25143083930015564, "loss_lvr": 1.1633310317993164, "loss_mode_switch": 0.0, "loss_total": 0.3677639365196228, "step": 284 }, { "batch_size": 4, "epoch": 0.1136, "step": 284, "tokens_per_device": 1480 }, { "epoch": 0.1136, "loss_ce": 0.23855070769786835, "loss_lvr": 1.5314407348632812, "loss_mode_switch": 0.0, "loss_total": 0.3916947841644287, "step": 284 }, { "batch_size": 4, "epoch": 0.1136, "step": 284, "tokens_per_device": 3496 }, { "epoch": 0.1136, "loss_ce": 0.23672620952129364, "loss_lvr": 0.9848626255989075, "loss_mode_switch": 0.0, "loss_total": 0.33521246910095215, "step": 284 }, { "batch_size": 4, "epoch": 0.1136, "step": 284, "tokens_per_device": 2660 }, { "epoch": 0.1136, "loss_ce": 0.36299580335617065, "loss_lvr": 0.9585716724395752, "loss_mode_switch": 0.0, "loss_total": 0.45885297656059265, "step": 284 }, { "batch_size": 4, "epoch": 0.1136, "step": 284, "tokens_per_device": 4244 }, { "epoch": 0.1136, "loss_ce": 0.055192168802022934, "loss_lvr": 1.0360187292099, "loss_mode_switch": 0.0, "loss_total": 0.15879404544830322, "step": 284 }, { "batch_size": 4, "epoch": 0.1136, "step": 284, "tokens_per_device": 5712 }, { "epoch": 0.1136, "loss_ce": 0.1530938595533371, "loss_lvr": 0.9449344277381897, "loss_mode_switch": 0.0, "loss_total": 0.24758729338645935, "step": 284 }, { "batch_size": 4, "epoch": 0.1136, "step": 284, "tokens_per_device": 4736 }, { "epoch": 0.1136, "loss_ce": 0.39714574813842773, "loss_lvr": 1.3174793720245361, "loss_mode_switch": 0.0, "loss_total": 0.5288937091827393, "step": 284 }, { "batch_size": 4, "epoch": 0.1136, "step": 284, "tokens_per_device": 3300 }, { "epoch": 0.1136, "loss_ce": 0.045070137828588486, "loss_lvr": 1.13728666305542, "loss_mode_switch": 0.0, "loss_total": 0.15879879891872406, "step": 284 }, { "epoch": 0.114, "grad_norm": 1.493011474609375, "learning_rate": 9.816103043561648e-06, "loss": 0.3246, "step": 285 }, { "batch_size": 1, "epoch": 0.114, "step": 285, "tokens_per_device": 4888 }, { "epoch": 0.114, "loss_ce": 0.09877649694681168, "loss_lvr": 0.6392080187797546, "loss_mode_switch": 0.0, "loss_total": 0.16269730031490326, "step": 285 }, { "batch_size": 4, "epoch": 0.114, "step": 285, "tokens_per_device": 2448 }, { "epoch": 0.114, "loss_ce": 0.11499065905809402, "loss_lvr": 0.924277126789093, "loss_mode_switch": 0.0, "loss_total": 0.20741838216781616, "step": 285 }, { "batch_size": 4, "epoch": 0.114, "step": 285, "tokens_per_device": 2784 }, { "epoch": 0.114, "loss_ce": 0.3283724784851074, "loss_lvr": 1.1753218173980713, "loss_mode_switch": 0.0, "loss_total": 0.4459046721458435, "step": 285 }, { "batch_size": 1, "epoch": 0.114, "step": 285, "tokens_per_device": 5178 }, { "epoch": 0.114, "loss_ce": 0.01074658241122961, "loss_lvr": 0.846560001373291, "loss_mode_switch": 0.0, "loss_total": 0.09540258347988129, "step": 285 }, { "batch_size": 1, "epoch": 0.114, "step": 285, "tokens_per_device": 4816 }, { "epoch": 0.114, "loss_ce": 0.014540252275764942, "loss_lvr": 0.8044564723968506, "loss_mode_switch": 0.0, "loss_total": 0.09498590230941772, "step": 285 }, { "batch_size": 4, "epoch": 0.114, "step": 285, "tokens_per_device": 5252 }, { "epoch": 0.114, "loss_ce": 0.25229713320732117, "loss_lvr": 1.0480482578277588, "loss_mode_switch": 0.0, "loss_total": 0.3571019768714905, "step": 285 }, { "batch_size": 4, "epoch": 0.114, "step": 285, "tokens_per_device": 2868 }, { "epoch": 0.114, "loss_ce": 0.07330566644668579, "loss_lvr": 0.4645007252693176, "loss_mode_switch": 0.0, "loss_total": 0.11975574493408203, "step": 285 }, { "batch_size": 4, "epoch": 0.114, "step": 285, "tokens_per_device": 4124 }, { "epoch": 0.114, "loss_ce": 0.37860098481178284, "loss_lvr": 1.1400395631790161, "loss_mode_switch": 0.0, "loss_total": 0.49260494112968445, "step": 285 }, { "epoch": 0.1144, "grad_norm": 1.4249303340911865, "learning_rate": 9.814358418683014e-06, "loss": 0.2977, "step": 286 }, { "batch_size": 4, "epoch": 0.1144, "step": 286, "tokens_per_device": 4388 }, { "epoch": 0.1144, "loss_ce": 0.12207252532243729, "loss_lvr": 0.7839583158493042, "loss_mode_switch": 0.0, "loss_total": 0.20046836137771606, "step": 286 }, { "batch_size": 1, "epoch": 0.1144, "step": 286, "tokens_per_device": 4893 }, { "epoch": 0.1144, "loss_ce": 0.014353048987686634, "loss_lvr": 0.6600807905197144, "loss_mode_switch": 0.0, "loss_total": 0.08036113530397415, "step": 286 }, { "batch_size": 1, "epoch": 0.1144, "step": 286, "tokens_per_device": 5092 }, { "epoch": 0.1144, "loss_ce": 0.042409274727106094, "loss_lvr": 0.8225287199020386, "loss_mode_switch": 0.0, "loss_total": 0.12466214597225189, "step": 286 }, { "batch_size": 4, "epoch": 0.1144, "step": 286, "tokens_per_device": 6100 }, { "epoch": 0.1144, "loss_ce": 0.024262744933366776, "loss_lvr": 1.275120735168457, "loss_mode_switch": 0.0, "loss_total": 0.1517748236656189, "step": 286 }, { "batch_size": 1, "epoch": 0.1144, "step": 286, "tokens_per_device": 4863 }, { "epoch": 0.1144, "loss_ce": 0.38611483573913574, "loss_lvr": 0.3819667398929596, "loss_mode_switch": 0.0, "loss_total": 0.4243115186691284, "step": 286 }, { "batch_size": 4, "epoch": 0.1144, "step": 286, "tokens_per_device": 5884 }, { "epoch": 0.1144, "loss_ce": 0.035238366574048996, "loss_lvr": 0.8239883780479431, "loss_mode_switch": 0.0, "loss_total": 0.11763720214366913, "step": 286 }, { "batch_size": 4, "epoch": 0.1144, "step": 286, "tokens_per_device": 2640 }, { "epoch": 0.1144, "loss_ce": 0.21274380385875702, "loss_lvr": 1.8550083637237549, "loss_mode_switch": 0.0, "loss_total": 0.3982446491718292, "step": 286 }, { "batch_size": 1, "epoch": 0.1144, "step": 286, "tokens_per_device": 5046 }, { "epoch": 0.1144, "loss_ce": 0.2543751299381256, "loss_lvr": 1.0715662240982056, "loss_mode_switch": 0.0, "loss_total": 0.3615317642688751, "step": 286 }, { "epoch": 0.1148, "grad_norm": 1.8500438928604126, "learning_rate": 9.812605713743775e-06, "loss": 0.3809, "step": 287 }, { "batch_size": 4, "epoch": 0.1148, "step": 287, "tokens_per_device": 4096 }, { "epoch": 0.1148, "loss_ce": 0.1754765808582306, "loss_lvr": 0.8986841440200806, "loss_mode_switch": 0.0, "loss_total": 0.2653450071811676, "step": 287 }, { "batch_size": 4, "epoch": 0.1148, "step": 287, "tokens_per_device": 4736 }, { "epoch": 0.1148, "loss_ce": 0.5655613541603088, "loss_lvr": 0.6944869756698608, "loss_mode_switch": 0.0, "loss_total": 0.6350100636482239, "step": 287 }, { "batch_size": 1, "epoch": 0.1148, "step": 287, "tokens_per_device": 5166 }, { "epoch": 0.1148, "loss_ce": 0.04505518451333046, "loss_lvr": 0.5602215528488159, "loss_mode_switch": 0.0, "loss_total": 0.10107734054327011, "step": 287 }, { "batch_size": 1, "epoch": 0.1148, "step": 287, "tokens_per_device": 5099 }, { "epoch": 0.1148, "loss_ce": 0.06479819864034653, "loss_lvr": 0.738519549369812, "loss_mode_switch": 0.0, "loss_total": 0.13865014910697937, "step": 287 }, { "batch_size": 4, "epoch": 0.1148, "step": 287, "tokens_per_device": 2692 }, { "epoch": 0.1148, "loss_ce": 0.18130525946617126, "loss_lvr": 1.6687887907028198, "loss_mode_switch": 0.0, "loss_total": 0.34818413853645325, "step": 287 }, { "batch_size": 4, "epoch": 0.1148, "step": 287, "tokens_per_device": 4528 }, { "epoch": 0.1148, "loss_ce": 0.05564381927251816, "loss_lvr": 1.1890102624893188, "loss_mode_switch": 0.0, "loss_total": 0.17454484105110168, "step": 287 }, { "batch_size": 4, "epoch": 0.1148, "step": 287, "tokens_per_device": 3744 }, { "epoch": 0.1148, "loss_ce": 0.14513735473155975, "loss_lvr": 1.712420105934143, "loss_mode_switch": 0.0, "loss_total": 0.3163793683052063, "step": 287 }, { "batch_size": 1, "epoch": 0.1148, "step": 287, "tokens_per_device": 4899 }, { "epoch": 0.1148, "loss_ce": 0.07686907798051834, "loss_lvr": 0.6007996797561646, "loss_mode_switch": 0.0, "loss_total": 0.13694904744625092, "step": 287 }, { "epoch": 0.1152, "grad_norm": 1.4373600482940674, "learning_rate": 9.810844931685542e-06, "loss": 0.3102, "step": 288 }, { "batch_size": 4, "epoch": 0.1152, "step": 288, "tokens_per_device": 3752 }, { "epoch": 0.1152, "loss_ce": 0.17753778398036957, "loss_lvr": 1.9039994478225708, "loss_mode_switch": 0.0, "loss_total": 0.36793774366378784, "step": 288 }, { "batch_size": 1, "epoch": 0.1152, "step": 288, "tokens_per_device": 5374 }, { "epoch": 0.1152, "loss_ce": 0.010671788826584816, "loss_lvr": 0.5212012529373169, "loss_mode_switch": 0.0, "loss_total": 0.06279191374778748, "step": 288 }, { "batch_size": 4, "epoch": 0.1152, "step": 288, "tokens_per_device": 4268 }, { "epoch": 0.1152, "loss_ce": 0.26889121532440186, "loss_lvr": 1.3541879653930664, "loss_mode_switch": 0.0, "loss_total": 0.404310017824173, "step": 288 }, { "batch_size": 4, "epoch": 0.1152, "step": 288, "tokens_per_device": 5816 }, { "epoch": 0.1152, "loss_ce": 0.5171265602111816, "loss_lvr": 1.1414949893951416, "loss_mode_switch": 0.0, "loss_total": 0.6312760710716248, "step": 288 }, { "batch_size": 4, "epoch": 0.1152, "step": 288, "tokens_per_device": 4432 }, { "epoch": 0.1152, "loss_ce": 0.17752055823802948, "loss_lvr": 0.7788605093955994, "loss_mode_switch": 0.0, "loss_total": 0.25540661811828613, "step": 288 }, { "batch_size": 1, "epoch": 0.1152, "step": 288, "tokens_per_device": 4875 }, { "epoch": 0.1152, "loss_ce": 0.042433008551597595, "loss_lvr": 0.688551664352417, "loss_mode_switch": 0.0, "loss_total": 0.1112881749868393, "step": 288 }, { "batch_size": 4, "epoch": 0.1152, "step": 288, "tokens_per_device": 5180 }, { "epoch": 0.1152, "loss_ce": 0.14915217459201813, "loss_lvr": 0.6542049050331116, "loss_mode_switch": 0.0, "loss_total": 0.21457266807556152, "step": 288 }, { "batch_size": 4, "epoch": 0.1152, "step": 288, "tokens_per_device": 5056 }, { "epoch": 0.1152, "loss_ce": 0.0989796444773674, "loss_lvr": 0.9132177233695984, "loss_mode_switch": 0.0, "loss_total": 0.19030141830444336, "step": 288 }, { "epoch": 0.1156, "grad_norm": 1.5039023160934448, "learning_rate": 9.809076075463476e-06, "loss": 0.3368, "step": 289 }, { "batch_size": 4, "epoch": 0.1156, "step": 289, "tokens_per_device": 3372 }, { "epoch": 0.1156, "loss_ce": 0.06755542010068893, "loss_lvr": 0.9497397541999817, "loss_mode_switch": 0.0, "loss_total": 0.16252939403057098, "step": 289 }, { "batch_size": 1, "epoch": 0.1156, "step": 289, "tokens_per_device": 5169 }, { "epoch": 0.1156, "loss_ce": 0.054725587368011475, "loss_lvr": 0.6668209433555603, "loss_mode_switch": 0.0, "loss_total": 0.12140768021345139, "step": 289 }, { "batch_size": 4, "epoch": 0.1156, "step": 289, "tokens_per_device": 4752 }, { "epoch": 0.1156, "loss_ce": 0.14431586861610413, "loss_lvr": 1.0212230682373047, "loss_mode_switch": 0.0, "loss_total": 0.2464381754398346, "step": 289 }, { "batch_size": 4, "epoch": 0.1156, "step": 289, "tokens_per_device": 1196 }, { "epoch": 0.1156, "loss_ce": 0.05415080115199089, "loss_lvr": 1.4204961061477661, "loss_mode_switch": 0.0, "loss_total": 0.1962004154920578, "step": 289 }, { "batch_size": 4, "epoch": 0.1156, "step": 289, "tokens_per_device": 6268 }, { "epoch": 0.1156, "loss_ce": 0.6333903670310974, "loss_lvr": 0.8250405788421631, "loss_mode_switch": 0.0, "loss_total": 0.7158944010734558, "step": 289 }, { "batch_size": 4, "epoch": 0.1156, "step": 289, "tokens_per_device": 4704 }, { "epoch": 0.1156, "loss_ce": 0.03530845418572426, "loss_lvr": 0.994219183921814, "loss_mode_switch": 0.0, "loss_total": 0.13473036885261536, "step": 289 }, { "batch_size": 4, "epoch": 0.1156, "step": 289, "tokens_per_device": 11228 }, { "epoch": 0.1156, "loss_ce": 0.01373500656336546, "loss_lvr": 0.7714115977287292, "loss_mode_switch": 0.0, "loss_total": 0.09087616205215454, "step": 289 }, { "batch_size": 1, "epoch": 0.1156, "step": 289, "tokens_per_device": 4995 }, { "epoch": 0.1156, "loss_ce": 1.1546467542648315, "loss_lvr": 0.8487573862075806, "loss_mode_switch": 0.0, "loss_total": 1.2395224571228027, "step": 289 }, { "epoch": 0.116, "grad_norm": 1.6218321323394775, "learning_rate": 9.807299148046301e-06, "loss": 0.3474, "step": 290 }, { "batch_size": 4, "epoch": 0.116, "step": 290, "tokens_per_device": 2632 }, { "epoch": 0.116, "loss_ce": 0.6984436511993408, "loss_lvr": 1.2083518505096436, "loss_mode_switch": 0.0, "loss_total": 0.8192788362503052, "step": 290 }, { "batch_size": 4, "epoch": 0.116, "step": 290, "tokens_per_device": 3396 }, { "epoch": 0.116, "loss_ce": 0.5133683085441589, "loss_lvr": 1.3321164846420288, "loss_mode_switch": 0.0, "loss_total": 0.6465799808502197, "step": 290 }, { "batch_size": 1, "epoch": 0.116, "step": 290, "tokens_per_device": 4874 }, { "epoch": 0.116, "loss_ce": 0.016915883868932724, "loss_lvr": 0.4054248034954071, "loss_mode_switch": 0.0, "loss_total": 0.057458363473415375, "step": 290 }, { "batch_size": 4, "epoch": 0.116, "step": 290, "tokens_per_device": 4332 }, { "epoch": 0.116, "loss_ce": 0.4758467376232147, "loss_lvr": 1.06118643283844, "loss_mode_switch": 0.0, "loss_total": 0.5819653868675232, "step": 290 }, { "batch_size": 1, "epoch": 0.116, "step": 290, "tokens_per_device": 4758 }, { "epoch": 0.116, "loss_ce": 0.1279621720314026, "loss_lvr": 0.7938695549964905, "loss_mode_switch": 0.0, "loss_total": 0.20734912157058716, "step": 290 }, { "batch_size": 4, "epoch": 0.116, "step": 290, "tokens_per_device": 6380 }, { "epoch": 0.116, "loss_ce": 0.3657746911048889, "loss_lvr": 0.9317734837532043, "loss_mode_switch": 0.0, "loss_total": 0.45895203948020935, "step": 290 }, { "batch_size": 1, "epoch": 0.116, "step": 290, "tokens_per_device": 4886 }, { "epoch": 0.116, "loss_ce": 0.01777394860982895, "loss_lvr": 0.4893169403076172, "loss_mode_switch": 0.0, "loss_total": 0.06670564413070679, "step": 290 }, { "batch_size": 4, "epoch": 0.116, "step": 290, "tokens_per_device": 4196 }, { "epoch": 0.116, "loss_ce": 0.42158573865890503, "loss_lvr": 1.1441049575805664, "loss_mode_switch": 0.0, "loss_total": 0.5359962582588196, "step": 290 }, { "epoch": 0.1164, "grad_norm": 1.5012151002883911, "learning_rate": 9.805514152416274e-06, "loss": 0.3304, "step": 291 }, { "batch_size": 1, "epoch": 0.1164, "step": 291, "tokens_per_device": 4572 }, { "epoch": 0.1164, "loss_ce": 0.3078809082508087, "loss_lvr": 0.6645296812057495, "loss_mode_switch": 0.0, "loss_total": 0.3743338882923126, "step": 291 }, { "batch_size": 4, "epoch": 0.1164, "step": 291, "tokens_per_device": 3792 }, { "epoch": 0.1164, "loss_ce": 0.21801839768886566, "loss_lvr": 1.272250771522522, "loss_mode_switch": 0.0, "loss_total": 0.3452434837818146, "step": 291 }, { "batch_size": 4, "epoch": 0.1164, "step": 291, "tokens_per_device": 4340 }, { "epoch": 0.1164, "loss_ce": 0.8775266408920288, "loss_lvr": 1.1195850372314453, "loss_mode_switch": 0.0, "loss_total": 0.9894851446151733, "step": 291 }, { "batch_size": 4, "epoch": 0.1164, "step": 291, "tokens_per_device": 2768 }, { "epoch": 0.1164, "loss_ce": 0.11788136512041092, "loss_lvr": 0.7065582275390625, "loss_mode_switch": 0.0, "loss_total": 0.18853718042373657, "step": 291 }, { "batch_size": 4, "epoch": 0.1164, "step": 291, "tokens_per_device": 2504 }, { "epoch": 0.1164, "loss_ce": 0.2095642387866974, "loss_lvr": 1.047950267791748, "loss_mode_switch": 0.0, "loss_total": 0.31435927748680115, "step": 291 }, { "batch_size": 4, "epoch": 0.1164, "step": 291, "tokens_per_device": 3972 }, { "epoch": 0.1164, "loss_ce": 0.3800867199897766, "loss_lvr": 1.3221131563186646, "loss_mode_switch": 0.0, "loss_total": 0.512298047542572, "step": 291 }, { "batch_size": 1, "epoch": 0.1164, "step": 291, "tokens_per_device": 4892 }, { "epoch": 0.1164, "loss_ce": 0.12279447168111801, "loss_lvr": 0.8817243576049805, "loss_mode_switch": 0.0, "loss_total": 0.21096691489219666, "step": 291 }, { "batch_size": 4, "epoch": 0.1164, "step": 291, "tokens_per_device": 1396 }, { "epoch": 0.1164, "loss_ce": 0.34994184970855713, "loss_lvr": 1.5803594589233398, "loss_mode_switch": 0.0, "loss_total": 0.5079777836799622, "step": 291 }, { "epoch": 0.1168, "grad_norm": 1.4695698022842407, "learning_rate": 9.803721091569201e-06, "loss": 0.337, "step": 292 }, { "batch_size": 4, "epoch": 0.1168, "step": 292, "tokens_per_device": 15520 }, { "epoch": 0.1168, "loss_ce": 0.20046667754650116, "loss_lvr": 1.021676778793335, "loss_mode_switch": 0.0, "loss_total": 0.3026343584060669, "step": 292 }, { "batch_size": 4, "epoch": 0.1168, "step": 292, "tokens_per_device": 5656 }, { "epoch": 0.1168, "loss_ce": 0.049360115081071854, "loss_lvr": 0.8523555994033813, "loss_mode_switch": 0.0, "loss_total": 0.13459567725658417, "step": 292 }, { "batch_size": 4, "epoch": 0.1168, "step": 292, "tokens_per_device": 4768 }, { "epoch": 0.1168, "loss_ce": 0.006155823357403278, "loss_lvr": 1.286112666130066, "loss_mode_switch": 0.0, "loss_total": 0.134767085313797, "step": 292 }, { "batch_size": 1, "epoch": 0.1168, "step": 292, "tokens_per_device": 4888 }, { "epoch": 0.1168, "loss_ce": 0.007898524403572083, "loss_lvr": 0.7915511727333069, "loss_mode_switch": 0.0, "loss_total": 0.08705364167690277, "step": 292 }, { "batch_size": 1, "epoch": 0.1168, "step": 292, "tokens_per_device": 4870 }, { "epoch": 0.1168, "loss_ce": 0.007656468078494072, "loss_lvr": 0.17329387366771698, "loss_mode_switch": 0.0, "loss_total": 0.02498585544526577, "step": 292 }, { "batch_size": 1, "epoch": 0.1168, "step": 292, "tokens_per_device": 4961 }, { "epoch": 0.1168, "loss_ce": 0.448601096868515, "loss_lvr": 0.9874148964881897, "loss_mode_switch": 0.0, "loss_total": 0.5473425984382629, "step": 292 }, { "batch_size": 4, "epoch": 0.1168, "step": 292, "tokens_per_device": 3764 }, { "epoch": 0.1168, "loss_ce": 0.08267070353031158, "loss_lvr": 1.4454264640808105, "loss_mode_switch": 0.0, "loss_total": 0.22721335291862488, "step": 292 }, { "batch_size": 4, "epoch": 0.1168, "step": 292, "tokens_per_device": 5212 }, { "epoch": 0.1168, "loss_ce": 0.27860698103904724, "loss_lvr": 1.021070957183838, "loss_mode_switch": 0.0, "loss_total": 0.38071408867836, "step": 292 }, { "epoch": 0.1172, "grad_norm": 1.6944210529327393, "learning_rate": 9.80191996851442e-06, "loss": 0.3131, "step": 293 }, { "batch_size": 4, "epoch": 0.1172, "step": 293, "tokens_per_device": 4816 }, { "epoch": 0.1172, "loss_ce": 0.43751731514930725, "loss_lvr": 0.9319411516189575, "loss_mode_switch": 0.0, "loss_total": 0.5307114124298096, "step": 293 }, { "batch_size": 1, "epoch": 0.1172, "step": 293, "tokens_per_device": 5173 }, { "epoch": 0.1172, "loss_ce": 0.024513455107808113, "loss_lvr": 0.7430086731910706, "loss_mode_switch": 0.0, "loss_total": 0.09881432354450226, "step": 293 }, { "batch_size": 4, "epoch": 0.1172, "step": 293, "tokens_per_device": 6740 }, { "epoch": 0.1172, "loss_ce": 0.3332120478153229, "loss_lvr": 0.8225151300430298, "loss_mode_switch": 0.0, "loss_total": 0.41546356678009033, "step": 293 }, { "batch_size": 4, "epoch": 0.1172, "step": 293, "tokens_per_device": 1664 }, { "epoch": 0.1172, "loss_ce": 0.4866608679294586, "loss_lvr": 1.110303521156311, "loss_mode_switch": 0.0, "loss_total": 0.5976912379264832, "step": 293 }, { "batch_size": 4, "epoch": 0.1172, "step": 293, "tokens_per_device": 5948 }, { "epoch": 0.1172, "loss_ce": 0.1109265461564064, "loss_lvr": 1.050840139389038, "loss_mode_switch": 0.0, "loss_total": 0.21601057052612305, "step": 293 }, { "batch_size": 4, "epoch": 0.1172, "step": 293, "tokens_per_device": 4292 }, { "epoch": 0.1172, "loss_ce": 0.33615294098854065, "loss_lvr": 1.0160161256790161, "loss_mode_switch": 0.0, "loss_total": 0.4377545714378357, "step": 293 }, { "batch_size": 1, "epoch": 0.1172, "step": 293, "tokens_per_device": 4893 }, { "epoch": 0.1172, "loss_ce": 0.29866111278533936, "loss_lvr": 1.1785868406295776, "loss_mode_switch": 0.0, "loss_total": 0.41651979088783264, "step": 293 }, { "batch_size": 1, "epoch": 0.1172, "step": 293, "tokens_per_device": 5832 }, { "epoch": 0.1172, "loss_ce": 0.002546792384237051, "loss_lvr": 0.6789219975471497, "loss_mode_switch": 0.0, "loss_total": 0.07043899595737457, "step": 293 }, { "epoch": 0.1176, "grad_norm": 1.5207029581069946, "learning_rate": 9.800110786274803e-06, "loss": 0.3657, "step": 294 }, { "batch_size": 1, "epoch": 0.1176, "step": 294, "tokens_per_device": 4893 }, { "epoch": 0.1176, "loss_ce": 0.6104176640510559, "loss_lvr": 0.5381092429161072, "loss_mode_switch": 0.0, "loss_total": 0.6642285585403442, "step": 294 }, { "batch_size": 4, "epoch": 0.1176, "step": 294, "tokens_per_device": 6228 }, { "epoch": 0.1176, "loss_ce": 0.3222629725933075, "loss_lvr": 1.1244831085205078, "loss_mode_switch": 0.0, "loss_total": 0.4347112774848938, "step": 294 }, { "batch_size": 4, "epoch": 0.1176, "step": 294, "tokens_per_device": 1924 }, { "epoch": 0.1176, "loss_ce": 0.19234497845172882, "loss_lvr": 1.1474332809448242, "loss_mode_switch": 0.0, "loss_total": 0.30708831548690796, "step": 294 }, { "batch_size": 1, "epoch": 0.1176, "step": 294, "tokens_per_device": 5119 }, { "epoch": 0.1176, "loss_ce": 0.05661393329501152, "loss_lvr": 0.38564640283584595, "loss_mode_switch": 0.0, "loss_total": 0.09517857432365417, "step": 294 }, { "batch_size": 4, "epoch": 0.1176, "step": 294, "tokens_per_device": 4236 }, { "epoch": 0.1176, "loss_ce": 0.0946815237402916, "loss_lvr": 1.3744016885757446, "loss_mode_switch": 0.0, "loss_total": 0.23212170600891113, "step": 294 }, { "batch_size": 1, "epoch": 0.1176, "step": 294, "tokens_per_device": 4902 }, { "epoch": 0.1176, "loss_ce": 0.08598487079143524, "loss_lvr": 0.5923107862472534, "loss_mode_switch": 0.0, "loss_total": 0.1452159583568573, "step": 294 }, { "batch_size": 1, "epoch": 0.1176, "step": 294, "tokens_per_device": 5853 }, { "epoch": 0.1176, "loss_ce": 0.012136331759393215, "loss_lvr": 0.5749626159667969, "loss_mode_switch": 0.0, "loss_total": 0.06963258981704712, "step": 294 }, { "batch_size": 4, "epoch": 0.1176, "step": 294, "tokens_per_device": 10304 }, { "epoch": 0.1176, "loss_ce": 0.18753810226917267, "loss_lvr": 1.0714999437332153, "loss_mode_switch": 0.0, "loss_total": 0.2946881055831909, "step": 294 }, { "epoch": 0.118, "grad_norm": 1.6010466814041138, "learning_rate": 9.798293547886748e-06, "loss": 0.2948, "step": 295 }, { "batch_size": 4, "epoch": 0.118, "step": 295, "tokens_per_device": 7380 }, { "epoch": 0.118, "loss_ce": 0.019362201914191246, "loss_lvr": 0.8407734036445618, "loss_mode_switch": 0.0, "loss_total": 0.10343954712152481, "step": 295 }, { "batch_size": 4, "epoch": 0.118, "step": 295, "tokens_per_device": 4392 }, { "epoch": 0.118, "loss_ce": 0.17156296968460083, "loss_lvr": 0.9357666969299316, "loss_mode_switch": 0.0, "loss_total": 0.265139639377594, "step": 295 }, { "batch_size": 1, "epoch": 0.118, "step": 295, "tokens_per_device": 4921 }, { "epoch": 0.118, "loss_ce": 0.5686545968055725, "loss_lvr": 0.8807204961776733, "loss_mode_switch": 0.0, "loss_total": 0.6567266583442688, "step": 295 }, { "batch_size": 4, "epoch": 0.118, "step": 295, "tokens_per_device": 4128 }, { "epoch": 0.118, "loss_ce": 0.42087796330451965, "loss_lvr": 1.0754464864730835, "loss_mode_switch": 0.0, "loss_total": 0.5284225940704346, "step": 295 }, { "batch_size": 4, "epoch": 0.118, "step": 295, "tokens_per_device": 5992 }, { "epoch": 0.118, "loss_ce": 0.613669216632843, "loss_lvr": 0.9893975853919983, "loss_mode_switch": 0.0, "loss_total": 0.7126089930534363, "step": 295 }, { "batch_size": 4, "epoch": 0.118, "step": 295, "tokens_per_device": 5500 }, { "epoch": 0.118, "loss_ce": 0.027104629203677177, "loss_lvr": 0.8481507301330566, "loss_mode_switch": 0.0, "loss_total": 0.11191970854997635, "step": 295 }, { "batch_size": 4, "epoch": 0.118, "step": 295, "tokens_per_device": 3812 }, { "epoch": 0.118, "loss_ce": 0.05698671564459801, "loss_lvr": 0.8533307909965515, "loss_mode_switch": 0.0, "loss_total": 0.14231979846954346, "step": 295 }, { "batch_size": 1, "epoch": 0.118, "step": 295, "tokens_per_device": 5129 }, { "epoch": 0.118, "loss_ce": 0.10993178933858871, "loss_lvr": 0.5771634578704834, "loss_mode_switch": 0.0, "loss_total": 0.16764813661575317, "step": 295 }, { "epoch": 0.1184, "grad_norm": 1.4454209804534912, "learning_rate": 9.79646825640017e-06, "loss": 0.3206, "step": 296 }, { "batch_size": 4, "epoch": 0.1184, "step": 296, "tokens_per_device": 5228 }, { "epoch": 0.1184, "loss_ce": 0.4471619129180908, "loss_lvr": 0.8463453054428101, "loss_mode_switch": 0.0, "loss_total": 0.5317964553833008, "step": 296 }, { "batch_size": 4, "epoch": 0.1184, "step": 296, "tokens_per_device": 6252 }, { "epoch": 0.1184, "loss_ce": 0.2840774953365326, "loss_lvr": 0.9226998686790466, "loss_mode_switch": 0.0, "loss_total": 0.37634748220443726, "step": 296 }, { "batch_size": 4, "epoch": 0.1184, "step": 296, "tokens_per_device": 4236 }, { "epoch": 0.1184, "loss_ce": 0.13434302806854248, "loss_lvr": 0.9487857818603516, "loss_mode_switch": 0.0, "loss_total": 0.22922161221504211, "step": 296 }, { "batch_size": 1, "epoch": 0.1184, "step": 296, "tokens_per_device": 4428 }, { "epoch": 0.1184, "loss_ce": 0.0018161212792620063, "loss_lvr": 0.5274758338928223, "loss_mode_switch": 0.0, "loss_total": 0.05456370487809181, "step": 296 }, { "batch_size": 1, "epoch": 0.1184, "step": 296, "tokens_per_device": 4829 }, { "epoch": 0.1184, "loss_ce": 0.055061958730220795, "loss_lvr": 0.6585803627967834, "loss_mode_switch": 0.0, "loss_total": 0.12091999500989914, "step": 296 }, { "batch_size": 4, "epoch": 0.1184, "step": 296, "tokens_per_device": 2608 }, { "epoch": 0.1184, "loss_ce": 0.17648380994796753, "loss_lvr": 0.7655403017997742, "loss_mode_switch": 0.0, "loss_total": 0.25303784012794495, "step": 296 }, { "batch_size": 1, "epoch": 0.1184, "step": 296, "tokens_per_device": 4956 }, { "epoch": 0.1184, "loss_ce": 0.018706733360886574, "loss_lvr": 0.2819872498512268, "loss_mode_switch": 0.0, "loss_total": 0.046905457973480225, "step": 296 }, { "batch_size": 1, "epoch": 0.1184, "step": 296, "tokens_per_device": 5163 }, { "epoch": 0.1184, "loss_ce": 0.028141168877482414, "loss_lvr": 0.652112603187561, "loss_mode_switch": 0.0, "loss_total": 0.09335242956876755, "step": 296 }, { "epoch": 0.1188, "grad_norm": 1.7100518941879272, "learning_rate": 9.794634914878505e-06, "loss": 0.3731, "step": 297 }, { "batch_size": 4, "epoch": 0.1188, "step": 297, "tokens_per_device": 5532 }, { "epoch": 0.1188, "loss_ce": 0.16287538409233093, "loss_lvr": 0.9975219368934631, "loss_mode_switch": 0.0, "loss_total": 0.26262757182121277, "step": 297 }, { "batch_size": 4, "epoch": 0.1188, "step": 297, "tokens_per_device": 2712 }, { "epoch": 0.1188, "loss_ce": 0.2018427848815918, "loss_lvr": 1.1796939373016357, "loss_mode_switch": 0.0, "loss_total": 0.31981217861175537, "step": 297 }, { "batch_size": 4, "epoch": 0.1188, "step": 297, "tokens_per_device": 4244 }, { "epoch": 0.1188, "loss_ce": 0.42206910252571106, "loss_lvr": 1.0587453842163086, "loss_mode_switch": 0.0, "loss_total": 0.5279436111450195, "step": 297 }, { "batch_size": 4, "epoch": 0.1188, "step": 297, "tokens_per_device": 4220 }, { "epoch": 0.1188, "loss_ce": 0.2646600306034088, "loss_lvr": 0.9630793333053589, "loss_mode_switch": 0.0, "loss_total": 0.3609679639339447, "step": 297 }, { "batch_size": 4, "epoch": 0.1188, "step": 297, "tokens_per_device": 6168 }, { "epoch": 0.1188, "loss_ce": 0.36157822608947754, "loss_lvr": 0.9446465373039246, "loss_mode_switch": 0.0, "loss_total": 0.4560428857803345, "step": 297 }, { "batch_size": 1, "epoch": 0.1188, "step": 297, "tokens_per_device": 4884 }, { "epoch": 0.1188, "loss_ce": 0.003942704293876886, "loss_lvr": 0.3937186300754547, "loss_mode_switch": 0.0, "loss_total": 0.04331456869840622, "step": 297 }, { "batch_size": 4, "epoch": 0.1188, "step": 297, "tokens_per_device": 4260 }, { "epoch": 0.1188, "loss_ce": 0.6027906537055969, "loss_lvr": 1.3470426797866821, "loss_mode_switch": 0.0, "loss_total": 0.737494945526123, "step": 297 }, { "batch_size": 4, "epoch": 0.1188, "step": 297, "tokens_per_device": 1468 }, { "epoch": 0.1188, "loss_ce": 0.4106127619743347, "loss_lvr": 1.2378883361816406, "loss_mode_switch": 0.0, "loss_total": 0.5344015955924988, "step": 297 }, { "epoch": 0.1192, "grad_norm": 1.4297456741333008, "learning_rate": 9.792793526398694e-06, "loss": 0.3568, "step": 298 }, { "batch_size": 4, "epoch": 0.1192, "step": 298, "tokens_per_device": 4036 }, { "epoch": 0.1192, "loss_ce": 0.7247449159622192, "loss_lvr": 1.07331383228302, "loss_mode_switch": 0.0, "loss_total": 0.8320763111114502, "step": 298 }, { "batch_size": 4, "epoch": 0.1192, "step": 298, "tokens_per_device": 10904 }, { "epoch": 0.1192, "loss_ce": 0.17382380366325378, "loss_lvr": 0.8850475549697876, "loss_mode_switch": 0.0, "loss_total": 0.262328565120697, "step": 298 }, { "batch_size": 1, "epoch": 0.1192, "step": 298, "tokens_per_device": 5142 }, { "epoch": 0.1192, "loss_ce": 0.17501826584339142, "loss_lvr": 0.3280276656150818, "loss_mode_switch": 0.0, "loss_total": 0.2078210413455963, "step": 298 }, { "batch_size": 4, "epoch": 0.1192, "step": 298, "tokens_per_device": 4208 }, { "epoch": 0.1192, "loss_ce": 0.12027371674776077, "loss_lvr": 1.0408580303192139, "loss_mode_switch": 0.0, "loss_total": 0.22435951232910156, "step": 298 }, { "batch_size": 4, "epoch": 0.1192, "step": 298, "tokens_per_device": 4092 }, { "epoch": 0.1192, "loss_ce": 0.1566714644432068, "loss_lvr": 1.1902257204055786, "loss_mode_switch": 0.0, "loss_total": 0.2756940424442291, "step": 298 }, { "batch_size": 4, "epoch": 0.1192, "step": 298, "tokens_per_device": 6012 }, { "epoch": 0.1192, "loss_ce": 0.24624651670455933, "loss_lvr": 0.8297960162162781, "loss_mode_switch": 0.0, "loss_total": 0.32922613620758057, "step": 298 }, { "batch_size": 4, "epoch": 0.1192, "step": 298, "tokens_per_device": 5696 }, { "epoch": 0.1192, "loss_ce": 0.4520339071750641, "loss_lvr": 1.0287326574325562, "loss_mode_switch": 0.0, "loss_total": 0.5549072027206421, "step": 298 }, { "batch_size": 4, "epoch": 0.1192, "step": 298, "tokens_per_device": 3868 }, { "epoch": 0.1192, "loss_ce": 0.045645106583833694, "loss_lvr": 0.7765117287635803, "loss_mode_switch": 0.0, "loss_total": 0.12329627573490143, "step": 298 }, { "epoch": 0.1196, "grad_norm": 1.3376975059509277, "learning_rate": 9.790944094051188e-06, "loss": 0.3301, "step": 299 }, { "batch_size": 1, "epoch": 0.1196, "step": 299, "tokens_per_device": 4918 }, { "epoch": 0.1196, "loss_ce": 0.05814819037914276, "loss_lvr": 0.9153436422348022, "loss_mode_switch": 0.0, "loss_total": 0.14968255162239075, "step": 299 }, { "batch_size": 4, "epoch": 0.1196, "step": 299, "tokens_per_device": 1716 }, { "epoch": 0.1196, "loss_ce": 0.6930623650550842, "loss_lvr": 1.2559723854064941, "loss_mode_switch": 0.0, "loss_total": 0.8186596035957336, "step": 299 }, { "batch_size": 4, "epoch": 0.1196, "step": 299, "tokens_per_device": 3672 }, { "epoch": 0.1196, "loss_ce": 0.32696089148521423, "loss_lvr": 0.9466135501861572, "loss_mode_switch": 0.0, "loss_total": 0.42162224650382996, "step": 299 }, { "batch_size": 4, "epoch": 0.1196, "step": 299, "tokens_per_device": 4924 }, { "epoch": 0.1196, "loss_ce": 0.1507575660943985, "loss_lvr": 0.9876301288604736, "loss_mode_switch": 0.0, "loss_total": 0.24952057003974915, "step": 299 }, { "batch_size": 4, "epoch": 0.1196, "step": 299, "tokens_per_device": 14436 }, { "epoch": 0.1196, "loss_ce": 0.16576246917247772, "loss_lvr": 1.2369524240493774, "loss_mode_switch": 0.0, "loss_total": 0.2894577085971832, "step": 299 }, { "batch_size": 4, "epoch": 0.1196, "step": 299, "tokens_per_device": 3816 }, { "epoch": 0.1196, "loss_ce": 0.23651304841041565, "loss_lvr": 1.0742202997207642, "loss_mode_switch": 0.0, "loss_total": 0.3439350724220276, "step": 299 }, { "batch_size": 4, "epoch": 0.1196, "step": 299, "tokens_per_device": 4572 }, { "epoch": 0.1196, "loss_ce": 0.22867277264595032, "loss_lvr": 0.9472741484642029, "loss_mode_switch": 0.0, "loss_total": 0.32340019941329956, "step": 299 }, { "batch_size": 1, "epoch": 0.1196, "step": 299, "tokens_per_device": 5081 }, { "epoch": 0.1196, "loss_ce": 0.03275914490222931, "loss_lvr": 0.7267709970474243, "loss_mode_switch": 0.0, "loss_total": 0.10543624311685562, "step": 299 }, { "epoch": 0.12, "grad_norm": 1.5146819353103638, "learning_rate": 9.789086620939936e-06, "loss": 0.3426, "step": 300 }, { "batch_size": 4, "epoch": 0.12, "step": 300, "tokens_per_device": 4792 }, { "epoch": 0.12, "loss_ce": 0.009646154008805752, "loss_lvr": 1.1400446891784668, "loss_mode_switch": 0.0, "loss_total": 0.12365062534809113, "step": 300 }, { "batch_size": 4, "epoch": 0.12, "step": 300, "tokens_per_device": 1176 }, { "epoch": 0.12, "loss_ce": 0.10015241801738739, "loss_lvr": 1.2461961507797241, "loss_mode_switch": 0.0, "loss_total": 0.22477203607559204, "step": 300 }, { "batch_size": 1, "epoch": 0.12, "step": 300, "tokens_per_device": 5145 }, { "epoch": 0.12, "loss_ce": 0.05766405537724495, "loss_lvr": 0.5653188824653625, "loss_mode_switch": 0.0, "loss_total": 0.11419594287872314, "step": 300 }, { "batch_size": 4, "epoch": 0.12, "step": 300, "tokens_per_device": 10512 }, { "epoch": 0.12, "loss_ce": 0.6119652390480042, "loss_lvr": 1.0601295232772827, "loss_mode_switch": 0.0, "loss_total": 0.7179781794548035, "step": 300 }, { "batch_size": 4, "epoch": 0.12, "step": 300, "tokens_per_device": 4248 }, { "epoch": 0.12, "loss_ce": 0.6754512786865234, "loss_lvr": 1.2308024168014526, "loss_mode_switch": 0.0, "loss_total": 0.7985315322875977, "step": 300 }, { "batch_size": 4, "epoch": 0.12, "step": 300, "tokens_per_device": 4312 }, { "epoch": 0.12, "loss_ce": 0.2271769791841507, "loss_lvr": 0.9675533771514893, "loss_mode_switch": 0.0, "loss_total": 0.32393231987953186, "step": 300 }, { "batch_size": 4, "epoch": 0.12, "step": 300, "tokens_per_device": 7800 }, { "epoch": 0.12, "loss_ce": 0.6279960870742798, "loss_lvr": 0.8954453468322754, "loss_mode_switch": 0.0, "loss_total": 0.7175406217575073, "step": 300 }, { "batch_size": 4, "epoch": 0.12, "step": 300, "tokens_per_device": 4152 }, { "epoch": 0.12, "loss_ce": 0.3771500587463379, "loss_lvr": 0.9451122283935547, "loss_mode_switch": 0.0, "loss_total": 0.4716612696647644, "step": 300 }, { "epoch": 0.1204, "grad_norm": 1.3852202892303467, "learning_rate": 9.787221110182384e-06, "loss": 0.3211, "step": 301 }, { "batch_size": 4, "epoch": 0.1204, "step": 301, "tokens_per_device": 4272 }, { "epoch": 0.1204, "loss_ce": 0.12002713978290558, "loss_lvr": 1.153953194618225, "loss_mode_switch": 0.0, "loss_total": 0.23542246222496033, "step": 301 }, { "batch_size": 1, "epoch": 0.1204, "step": 301, "tokens_per_device": 5181 }, { "epoch": 0.1204, "loss_ce": 0.16128337383270264, "loss_lvr": 0.358357697725296, "loss_mode_switch": 0.0, "loss_total": 0.19711914658546448, "step": 301 }, { "batch_size": 4, "epoch": 0.1204, "step": 301, "tokens_per_device": 4144 }, { "epoch": 0.1204, "loss_ce": 0.23068256676197052, "loss_lvr": 0.9173515439033508, "loss_mode_switch": 0.0, "loss_total": 0.3224177360534668, "step": 301 }, { "batch_size": 1, "epoch": 0.1204, "step": 301, "tokens_per_device": 4903 }, { "epoch": 0.1204, "loss_ce": 0.5042587518692017, "loss_lvr": 0.9587647318840027, "loss_mode_switch": 0.0, "loss_total": 0.6001352071762085, "step": 301 }, { "batch_size": 1, "epoch": 0.1204, "step": 301, "tokens_per_device": 4184 }, { "epoch": 0.1204, "loss_ce": 0.058487288653850555, "loss_lvr": 0.7603790163993835, "loss_mode_switch": 0.0, "loss_total": 0.13452519476413727, "step": 301 }, { "batch_size": 1, "epoch": 0.1204, "step": 301, "tokens_per_device": 4908 }, { "epoch": 0.1204, "loss_ce": 0.010230355896055698, "loss_lvr": 0.5716346502304077, "loss_mode_switch": 0.0, "loss_total": 0.06739382445812225, "step": 301 }, { "batch_size": 4, "epoch": 0.1204, "step": 301, "tokens_per_device": 4824 }, { "epoch": 0.1204, "loss_ce": 0.19273200631141663, "loss_lvr": 0.98765629529953, "loss_mode_switch": 0.0, "loss_total": 0.2914976477622986, "step": 301 }, { "batch_size": 4, "epoch": 0.1204, "step": 301, "tokens_per_device": 3844 }, { "epoch": 0.1204, "loss_ce": 0.39646437764167786, "loss_lvr": 1.132610559463501, "loss_mode_switch": 0.0, "loss_total": 0.5097254514694214, "step": 301 }, { "epoch": 0.1208, "grad_norm": 1.5738248825073242, "learning_rate": 9.785347564909464e-06, "loss": 0.3795, "step": 302 }, { "batch_size": 1, "epoch": 0.1208, "step": 302, "tokens_per_device": 5778 }, { "epoch": 0.1208, "loss_ce": 0.0037639862857759, "loss_lvr": 0.7084395885467529, "loss_mode_switch": 0.0, "loss_total": 0.07460794597864151, "step": 302 }, { "batch_size": 4, "epoch": 0.1208, "step": 302, "tokens_per_device": 1840 }, { "epoch": 0.1208, "loss_ce": 0.7749778032302856, "loss_lvr": 1.1466662883758545, "loss_mode_switch": 0.0, "loss_total": 0.8896444439888, "step": 302 }, { "batch_size": 4, "epoch": 0.1208, "step": 302, "tokens_per_device": 2712 }, { "epoch": 0.1208, "loss_ce": 0.2500609755516052, "loss_lvr": 0.8842479586601257, "loss_mode_switch": 0.0, "loss_total": 0.3384857773780823, "step": 302 }, { "batch_size": 4, "epoch": 0.1208, "step": 302, "tokens_per_device": 3260 }, { "epoch": 0.1208, "loss_ce": 0.2900388538837433, "loss_lvr": 1.0495320558547974, "loss_mode_switch": 0.0, "loss_total": 0.39499205350875854, "step": 302 }, { "batch_size": 1, "epoch": 0.1208, "step": 302, "tokens_per_device": 5163 }, { "epoch": 0.1208, "loss_ce": 0.0052726492285728455, "loss_lvr": 0.7261212468147278, "loss_mode_switch": 0.0, "loss_total": 0.07788477838039398, "step": 302 }, { "batch_size": 4, "epoch": 0.1208, "step": 302, "tokens_per_device": 5820 }, { "epoch": 0.1208, "loss_ce": 0.020029693841934204, "loss_lvr": 1.1759470701217651, "loss_mode_switch": 0.0, "loss_total": 0.13762441277503967, "step": 302 }, { "batch_size": 1, "epoch": 0.1208, "step": 302, "tokens_per_device": 5209 }, { "epoch": 0.1208, "loss_ce": 0.03115302138030529, "loss_lvr": 0.43816831707954407, "loss_mode_switch": 0.0, "loss_total": 0.07496985048055649, "step": 302 }, { "batch_size": 1, "epoch": 0.1208, "step": 302, "tokens_per_device": 5011 }, { "epoch": 0.1208, "loss_ce": 0.03606153652071953, "loss_lvr": 0.3976432681083679, "loss_mode_switch": 0.0, "loss_total": 0.07582586258649826, "step": 302 }, { "epoch": 0.1212, "grad_norm": 1.7585134506225586, "learning_rate": 9.783465988265594e-06, "loss": 0.3286, "step": 303 }, { "batch_size": 4, "epoch": 0.1212, "step": 303, "tokens_per_device": 4320 }, { "epoch": 0.1212, "loss_ce": 0.11378943175077438, "loss_lvr": 0.7702096700668335, "loss_mode_switch": 0.0, "loss_total": 0.1908103972673416, "step": 303 }, { "batch_size": 1, "epoch": 0.1212, "step": 303, "tokens_per_device": 4902 }, { "epoch": 0.1212, "loss_ce": 0.007917257957160473, "loss_lvr": 0.7629456520080566, "loss_mode_switch": 0.0, "loss_total": 0.0842118188738823, "step": 303 }, { "batch_size": 1, "epoch": 0.1212, "step": 303, "tokens_per_device": 4908 }, { "epoch": 0.1212, "loss_ce": 0.002182665280997753, "loss_lvr": 0.39936184883117676, "loss_mode_switch": 0.0, "loss_total": 0.042118851095438004, "step": 303 }, { "batch_size": 4, "epoch": 0.1212, "step": 303, "tokens_per_device": 2664 }, { "epoch": 0.1212, "loss_ce": 0.2668127417564392, "loss_lvr": 0.9275620579719543, "loss_mode_switch": 0.0, "loss_total": 0.3595689535140991, "step": 303 }, { "batch_size": 4, "epoch": 0.1212, "step": 303, "tokens_per_device": 7728 }, { "epoch": 0.1212, "loss_ce": 0.20457667112350464, "loss_lvr": 1.1558902263641357, "loss_mode_switch": 0.0, "loss_total": 0.3201656937599182, "step": 303 }, { "batch_size": 4, "epoch": 0.1212, "step": 303, "tokens_per_device": 4396 }, { "epoch": 0.1212, "loss_ce": 0.45686784386634827, "loss_lvr": 0.9852213859558105, "loss_mode_switch": 0.0, "loss_total": 0.5553900003433228, "step": 303 }, { "batch_size": 4, "epoch": 0.1212, "step": 303, "tokens_per_device": 3800 }, { "epoch": 0.1212, "loss_ce": 0.6478171944618225, "loss_lvr": 1.2406091690063477, "loss_mode_switch": 0.0, "loss_total": 0.7718781232833862, "step": 303 }, { "batch_size": 4, "epoch": 0.1212, "step": 303, "tokens_per_device": 4868 }, { "epoch": 0.1212, "loss_ce": 0.35890060663223267, "loss_lvr": 0.974953830242157, "loss_mode_switch": 0.0, "loss_total": 0.4563959836959839, "step": 303 }, { "epoch": 0.1216, "grad_norm": 1.8235294818878174, "learning_rate": 9.781576383408678e-06, "loss": 0.3592, "step": 304 }, { "batch_size": 4, "epoch": 0.1216, "step": 304, "tokens_per_device": 1336 }, { "epoch": 0.1216, "loss_ce": 0.5314844846725464, "loss_lvr": 1.433666706085205, "loss_mode_switch": 0.0, "loss_total": 0.6748511791229248, "step": 304 }, { "batch_size": 4, "epoch": 0.1216, "step": 304, "tokens_per_device": 4252 }, { "epoch": 0.1216, "loss_ce": 0.12635599076747894, "loss_lvr": 1.0280951261520386, "loss_mode_switch": 0.0, "loss_total": 0.22916549444198608, "step": 304 }, { "batch_size": 4, "epoch": 0.1216, "step": 304, "tokens_per_device": 1560 }, { "epoch": 0.1216, "loss_ce": 0.3197507858276367, "loss_lvr": 1.0488072633743286, "loss_mode_switch": 0.0, "loss_total": 0.4246315062046051, "step": 304 }, { "batch_size": 1, "epoch": 0.1216, "step": 304, "tokens_per_device": 4943 }, { "epoch": 0.1216, "loss_ce": 0.1009943038225174, "loss_lvr": 0.5679890513420105, "loss_mode_switch": 0.0, "loss_total": 0.15779320895671844, "step": 304 }, { "batch_size": 4, "epoch": 0.1216, "step": 304, "tokens_per_device": 1568 }, { "epoch": 0.1216, "loss_ce": 0.324099600315094, "loss_lvr": 1.655198097229004, "loss_mode_switch": 0.0, "loss_total": 0.4896194338798523, "step": 304 }, { "batch_size": 4, "epoch": 0.1216, "step": 304, "tokens_per_device": 11164 }, { "epoch": 0.1216, "loss_ce": 0.11591945588588715, "loss_lvr": 0.9854142069816589, "loss_mode_switch": 0.0, "loss_total": 0.21446087956428528, "step": 304 }, { "batch_size": 4, "epoch": 0.1216, "step": 304, "tokens_per_device": 7112 }, { "epoch": 0.1216, "loss_ce": 0.038604531437158585, "loss_lvr": 0.9229826331138611, "loss_mode_switch": 0.0, "loss_total": 0.13090279698371887, "step": 304 }, { "batch_size": 1, "epoch": 0.1216, "step": 304, "tokens_per_device": 5837 }, { "epoch": 0.1216, "loss_ce": 0.002793170278891921, "loss_lvr": 0.5280782580375671, "loss_mode_switch": 0.0, "loss_total": 0.05560099706053734, "step": 304 }, { "epoch": 0.122, "grad_norm": 1.5209914445877075, "learning_rate": 9.779678753510082e-06, "loss": 0.3081, "step": 305 }, { "batch_size": 1, "epoch": 0.122, "step": 305, "tokens_per_device": 5122 }, { "epoch": 0.122, "loss_ce": 0.004710891749709845, "loss_lvr": 0.48446378111839294, "loss_mode_switch": 0.0, "loss_total": 0.053157269954681396, "step": 305 }, { "batch_size": 4, "epoch": 0.122, "step": 305, "tokens_per_device": 3384 }, { "epoch": 0.122, "loss_ce": 0.12384460121393204, "loss_lvr": 1.2174346446990967, "loss_mode_switch": 0.0, "loss_total": 0.24558806419372559, "step": 305 }, { "batch_size": 4, "epoch": 0.122, "step": 305, "tokens_per_device": 4244 }, { "epoch": 0.122, "loss_ce": 0.20121614634990692, "loss_lvr": 1.4652968645095825, "loss_mode_switch": 0.0, "loss_total": 0.3477458357810974, "step": 305 }, { "batch_size": 4, "epoch": 0.122, "step": 305, "tokens_per_device": 3876 }, { "epoch": 0.122, "loss_ce": 0.3133319616317749, "loss_lvr": 0.957502007484436, "loss_mode_switch": 0.0, "loss_total": 0.40908217430114746, "step": 305 }, { "batch_size": 4, "epoch": 0.122, "step": 305, "tokens_per_device": 6768 }, { "epoch": 0.122, "loss_ce": 0.3123887777328491, "loss_lvr": 0.9921237230300903, "loss_mode_switch": 0.0, "loss_total": 0.41160115599632263, "step": 305 }, { "batch_size": 1, "epoch": 0.122, "step": 305, "tokens_per_device": 4916 }, { "epoch": 0.122, "loss_ce": 0.1355224847793579, "loss_lvr": 0.5482026934623718, "loss_mode_switch": 0.0, "loss_total": 0.1903427541255951, "step": 305 }, { "batch_size": 4, "epoch": 0.122, "step": 305, "tokens_per_device": 3860 }, { "epoch": 0.122, "loss_ce": 0.08440881967544556, "loss_lvr": 0.9735673666000366, "loss_mode_switch": 0.0, "loss_total": 0.18176555633544922, "step": 305 }, { "batch_size": 4, "epoch": 0.122, "step": 305, "tokens_per_device": 4228 }, { "epoch": 0.122, "loss_ce": 0.13987168669700623, "loss_lvr": 2.7455286979675293, "loss_mode_switch": 0.0, "loss_total": 0.4144245684146881, "step": 305 }, { "epoch": 0.1224, "grad_norm": 1.516728401184082, "learning_rate": 9.777773101754648e-06, "loss": 0.3159, "step": 306 }, { "batch_size": 4, "epoch": 0.1224, "step": 306, "tokens_per_device": 1576 }, { "epoch": 0.1224, "loss_ce": 0.13228626549243927, "loss_lvr": 2.5837433338165283, "loss_mode_switch": 0.0, "loss_total": 0.3906605839729309, "step": 306 }, { "batch_size": 4, "epoch": 0.1224, "step": 306, "tokens_per_device": 4540 }, { "epoch": 0.1224, "loss_ce": 0.15366441011428833, "loss_lvr": 0.7272021174430847, "loss_mode_switch": 0.0, "loss_total": 0.22638462483882904, "step": 306 }, { "batch_size": 4, "epoch": 0.1224, "step": 306, "tokens_per_device": 4420 }, { "epoch": 0.1224, "loss_ce": 0.40759801864624023, "loss_lvr": 0.9144858121871948, "loss_mode_switch": 0.0, "loss_total": 0.49904659390449524, "step": 306 }, { "batch_size": 4, "epoch": 0.1224, "step": 306, "tokens_per_device": 7152 }, { "epoch": 0.1224, "loss_ce": 0.027720289304852486, "loss_lvr": 1.4940836429595947, "loss_mode_switch": 0.0, "loss_total": 0.1771286576986313, "step": 306 }, { "batch_size": 4, "epoch": 0.1224, "step": 306, "tokens_per_device": 5516 }, { "epoch": 0.1224, "loss_ce": 0.38204225897789, "loss_lvr": 0.7810888290405273, "loss_mode_switch": 0.0, "loss_total": 0.46015113592147827, "step": 306 }, { "batch_size": 4, "epoch": 0.1224, "step": 306, "tokens_per_device": 2692 }, { "epoch": 0.1224, "loss_ce": 0.7133578658103943, "loss_lvr": 0.9964544773101807, "loss_mode_switch": 0.0, "loss_total": 0.8130033016204834, "step": 306 }, { "batch_size": 1, "epoch": 0.1224, "step": 306, "tokens_per_device": 4885 }, { "epoch": 0.1224, "loss_ce": 0.001795097254216671, "loss_lvr": 0.3871476352214813, "loss_mode_switch": 0.0, "loss_total": 0.04050986096262932, "step": 306 }, { "batch_size": 4, "epoch": 0.1224, "step": 306, "tokens_per_device": 3796 }, { "epoch": 0.1224, "loss_ce": 0.5077767372131348, "loss_lvr": 1.162766456604004, "loss_mode_switch": 0.0, "loss_total": 0.6240533590316772, "step": 306 }, { "epoch": 0.1228, "grad_norm": 1.541133165359497, "learning_rate": 9.775859431340681e-06, "loss": 0.3668, "step": 307 }, { "batch_size": 4, "epoch": 0.1228, "step": 307, "tokens_per_device": 2140 }, { "epoch": 0.1228, "loss_ce": 0.6091019511222839, "loss_lvr": 1.4378795623779297, "loss_mode_switch": 0.0, "loss_total": 0.7528899312019348, "step": 307 }, { "batch_size": 4, "epoch": 0.1228, "step": 307, "tokens_per_device": 3832 }, { "epoch": 0.1228, "loss_ce": 0.6956803202629089, "loss_lvr": 1.2332137823104858, "loss_mode_switch": 0.0, "loss_total": 0.8190016746520996, "step": 307 }, { "batch_size": 4, "epoch": 0.1228, "step": 307, "tokens_per_device": 5456 }, { "epoch": 0.1228, "loss_ce": 0.13135282695293427, "loss_lvr": 0.7888365983963013, "loss_mode_switch": 0.0, "loss_total": 0.21023648977279663, "step": 307 }, { "batch_size": 4, "epoch": 0.1228, "step": 307, "tokens_per_device": 4300 }, { "epoch": 0.1228, "loss_ce": 0.4715101420879364, "loss_lvr": 1.2338882684707642, "loss_mode_switch": 0.0, "loss_total": 0.5948989391326904, "step": 307 }, { "batch_size": 1, "epoch": 0.1228, "step": 307, "tokens_per_device": 5025 }, { "epoch": 0.1228, "loss_ce": 0.0240290779620409, "loss_lvr": 0.5938842296600342, "loss_mode_switch": 0.0, "loss_total": 0.08341750502586365, "step": 307 }, { "batch_size": 4, "epoch": 0.1228, "step": 307, "tokens_per_device": 2876 }, { "epoch": 0.1228, "loss_ce": 0.2723492980003357, "loss_lvr": 0.7952590584754944, "loss_mode_switch": 0.0, "loss_total": 0.3518752157688141, "step": 307 }, { "batch_size": 4, "epoch": 0.1228, "step": 307, "tokens_per_device": 1632 }, { "epoch": 0.1228, "loss_ce": 0.2872358560562134, "loss_lvr": 1.1300599575042725, "loss_mode_switch": 0.0, "loss_total": 0.4002418518066406, "step": 307 }, { "batch_size": 4, "epoch": 0.1228, "step": 307, "tokens_per_device": 13364 }, { "epoch": 0.1228, "loss_ce": 0.8611254692077637, "loss_lvr": 0.8226191997528076, "loss_mode_switch": 0.0, "loss_total": 0.9433873891830444, "step": 307 }, { "epoch": 0.1232, "grad_norm": 1.932875394821167, "learning_rate": 9.773937745479942e-06, "loss": 0.3526, "step": 308 }, { "batch_size": 4, "epoch": 0.1232, "step": 308, "tokens_per_device": 2880 }, { "epoch": 0.1232, "loss_ce": 0.10109212249517441, "loss_lvr": 0.7050848603248596, "loss_mode_switch": 0.0, "loss_total": 0.1716006100177765, "step": 308 }, { "batch_size": 4, "epoch": 0.1232, "step": 308, "tokens_per_device": 1492 }, { "epoch": 0.1232, "loss_ce": 0.20588096976280212, "loss_lvr": 0.9756062030792236, "loss_mode_switch": 0.0, "loss_total": 0.30344158411026, "step": 308 }, { "batch_size": 1, "epoch": 0.1232, "step": 308, "tokens_per_device": 5816 }, { "epoch": 0.1232, "loss_ce": 0.4396255910396576, "loss_lvr": 0.4876576066017151, "loss_mode_switch": 0.0, "loss_total": 0.48839133977890015, "step": 308 }, { "batch_size": 1, "epoch": 0.1232, "step": 308, "tokens_per_device": 4856 }, { "epoch": 0.1232, "loss_ce": 0.002377241151407361, "loss_lvr": 0.4766175448894501, "loss_mode_switch": 0.0, "loss_total": 0.05003899708390236, "step": 308 }, { "batch_size": 4, "epoch": 0.1232, "step": 308, "tokens_per_device": 4192 }, { "epoch": 0.1232, "loss_ce": 0.04922982305288315, "loss_lvr": 0.7784972190856934, "loss_mode_switch": 0.0, "loss_total": 0.1270795464515686, "step": 308 }, { "batch_size": 4, "epoch": 0.1232, "step": 308, "tokens_per_device": 4444 }, { "epoch": 0.1232, "loss_ce": 0.18478472530841827, "loss_lvr": 0.7599895596504211, "loss_mode_switch": 0.0, "loss_total": 0.26078367233276367, "step": 308 }, { "batch_size": 4, "epoch": 0.1232, "step": 308, "tokens_per_device": 1452 }, { "epoch": 0.1232, "loss_ce": 0.24439094960689545, "loss_lvr": 1.0383591651916504, "loss_mode_switch": 0.0, "loss_total": 0.3482268750667572, "step": 308 }, { "batch_size": 4, "epoch": 0.1232, "step": 308, "tokens_per_device": 6016 }, { "epoch": 0.1232, "loss_ce": 0.08183551579713821, "loss_lvr": 0.8421730995178223, "loss_mode_switch": 0.0, "loss_total": 0.16605281829833984, "step": 308 }, { "epoch": 0.1236, "grad_norm": 1.472779631614685, "learning_rate": 9.772008047397647e-06, "loss": 0.3308, "step": 309 }, { "batch_size": 4, "epoch": 0.1236, "step": 309, "tokens_per_device": 5924 }, { "epoch": 0.1236, "loss_ce": 0.4497041702270508, "loss_lvr": 0.8913658261299133, "loss_mode_switch": 0.0, "loss_total": 0.5388407707214355, "step": 309 }, { "batch_size": 4, "epoch": 0.1236, "step": 309, "tokens_per_device": 3116 }, { "epoch": 0.1236, "loss_ce": 0.5843241810798645, "loss_lvr": 0.5779348611831665, "loss_mode_switch": 0.0, "loss_total": 0.6421176791191101, "step": 309 }, { "batch_size": 4, "epoch": 0.1236, "step": 309, "tokens_per_device": 1984 }, { "epoch": 0.1236, "loss_ce": 0.0801069512963295, "loss_lvr": 1.2789318561553955, "loss_mode_switch": 0.0, "loss_total": 0.20800015330314636, "step": 309 }, { "batch_size": 4, "epoch": 0.1236, "step": 309, "tokens_per_device": 1292 }, { "epoch": 0.1236, "loss_ce": 0.5358964204788208, "loss_lvr": 1.2840561866760254, "loss_mode_switch": 0.0, "loss_total": 0.6643020510673523, "step": 309 }, { "batch_size": 4, "epoch": 0.1236, "step": 309, "tokens_per_device": 4640 }, { "epoch": 0.1236, "loss_ce": 0.05897923931479454, "loss_lvr": 0.8812344074249268, "loss_mode_switch": 0.0, "loss_total": 0.14710268378257751, "step": 309 }, { "batch_size": 4, "epoch": 0.1236, "step": 309, "tokens_per_device": 5728 }, { "epoch": 0.1236, "loss_ce": 0.10231486707925797, "loss_lvr": 0.9748605489730835, "loss_mode_switch": 0.0, "loss_total": 0.19980092346668243, "step": 309 }, { "batch_size": 1, "epoch": 0.1236, "step": 309, "tokens_per_device": 5010 }, { "epoch": 0.1236, "loss_ce": 0.6572733521461487, "loss_lvr": 0.6408131718635559, "loss_mode_switch": 0.0, "loss_total": 0.7213546633720398, "step": 309 }, { "batch_size": 4, "epoch": 0.1236, "step": 309, "tokens_per_device": 5788 }, { "epoch": 0.1236, "loss_ce": 0.07344832271337509, "loss_lvr": 0.8908396363258362, "loss_mode_switch": 0.0, "loss_total": 0.1625322848558426, "step": 309 }, { "epoch": 0.124, "grad_norm": 1.6491122245788574, "learning_rate": 9.770070340332457e-06, "loss": 0.3596, "step": 310 }, { "batch_size": 1, "epoch": 0.124, "step": 310, "tokens_per_device": 5037 }, { "epoch": 0.124, "loss_ce": 0.014344708994030952, "loss_lvr": 0.5290852785110474, "loss_mode_switch": 0.0, "loss_total": 0.0672532394528389, "step": 310 }, { "batch_size": 4, "epoch": 0.124, "step": 310, "tokens_per_device": 4320 }, { "epoch": 0.124, "loss_ce": 0.18664774298667908, "loss_lvr": 1.1994614601135254, "loss_mode_switch": 0.0, "loss_total": 0.3065938949584961, "step": 310 }, { "batch_size": 1, "epoch": 0.124, "step": 310, "tokens_per_device": 5078 }, { "epoch": 0.124, "loss_ce": 0.021903811022639275, "loss_lvr": 0.3626166880130768, "loss_mode_switch": 0.0, "loss_total": 0.05816548317670822, "step": 310 }, { "batch_size": 1, "epoch": 0.124, "step": 310, "tokens_per_device": 8155 }, { "epoch": 0.124, "loss_ce": 0.08658412098884583, "loss_lvr": 0.4600811004638672, "loss_mode_switch": 0.0, "loss_total": 0.13259223103523254, "step": 310 }, { "batch_size": 4, "epoch": 0.124, "step": 310, "tokens_per_device": 4712 }, { "epoch": 0.124, "loss_ce": 0.30587226152420044, "loss_lvr": 0.9939765334129333, "loss_mode_switch": 0.0, "loss_total": 0.40526992082595825, "step": 310 }, { "batch_size": 4, "epoch": 0.124, "step": 310, "tokens_per_device": 3748 }, { "epoch": 0.124, "loss_ce": 0.006655046716332436, "loss_lvr": 1.1972213983535767, "loss_mode_switch": 0.0, "loss_total": 0.1263771951198578, "step": 310 }, { "batch_size": 1, "epoch": 0.124, "step": 310, "tokens_per_device": 4901 }, { "epoch": 0.124, "loss_ce": 0.005420829635113478, "loss_lvr": 0.71489417552948, "loss_mode_switch": 0.0, "loss_total": 0.07691024243831635, "step": 310 }, { "batch_size": 1, "epoch": 0.124, "step": 310, "tokens_per_device": 5162 }, { "epoch": 0.124, "loss_ce": 0.013308960944414139, "loss_lvr": 0.6498157978057861, "loss_mode_switch": 0.0, "loss_total": 0.07829053699970245, "step": 310 }, { "epoch": 0.1244, "grad_norm": 1.3504188060760498, "learning_rate": 9.768124627536474e-06, "loss": 0.2942, "step": 311 }, { "batch_size": 4, "epoch": 0.1244, "step": 311, "tokens_per_device": 4200 }, { "epoch": 0.1244, "loss_ce": 0.9196925163269043, "loss_lvr": 1.0909569263458252, "loss_mode_switch": 0.0, "loss_total": 1.0287882089614868, "step": 311 }, { "batch_size": 1, "epoch": 0.1244, "step": 311, "tokens_per_device": 4776 }, { "epoch": 0.1244, "loss_ce": 0.013216803781688213, "loss_lvr": 0.5816164612770081, "loss_mode_switch": 0.0, "loss_total": 0.0713784471154213, "step": 311 }, { "batch_size": 4, "epoch": 0.1244, "step": 311, "tokens_per_device": 4668 }, { "epoch": 0.1244, "loss_ce": 0.18985623121261597, "loss_lvr": 1.214209794998169, "loss_mode_switch": 0.0, "loss_total": 0.31127721071243286, "step": 311 }, { "batch_size": 1, "epoch": 0.1244, "step": 311, "tokens_per_device": 4764 }, { "epoch": 0.1244, "loss_ce": 0.08194953203201294, "loss_lvr": 0.540938675403595, "loss_mode_switch": 0.0, "loss_total": 0.13604339957237244, "step": 311 }, { "batch_size": 1, "epoch": 0.1244, "step": 311, "tokens_per_device": 4942 }, { "epoch": 0.1244, "loss_ce": 0.04452965781092644, "loss_lvr": 0.48537227511405945, "loss_mode_switch": 0.0, "loss_total": 0.09306688606739044, "step": 311 }, { "batch_size": 4, "epoch": 0.1244, "step": 311, "tokens_per_device": 2640 }, { "epoch": 0.1244, "loss_ce": 0.2030736804008484, "loss_lvr": 0.9144192337989807, "loss_mode_switch": 0.0, "loss_total": 0.29451560974121094, "step": 311 }, { "batch_size": 4, "epoch": 0.1244, "step": 311, "tokens_per_device": 3732 }, { "epoch": 0.1244, "loss_ce": 0.7192474007606506, "loss_lvr": 1.0380922555923462, "loss_mode_switch": 0.0, "loss_total": 0.8230566382408142, "step": 311 }, { "batch_size": 4, "epoch": 0.1244, "step": 311, "tokens_per_device": 2788 }, { "epoch": 0.1244, "loss_ce": 0.11777999252080917, "loss_lvr": 0.8989800810813904, "loss_mode_switch": 0.0, "loss_total": 0.20767800509929657, "step": 311 }, { "epoch": 0.1248, "grad_norm": 1.4953253269195557, "learning_rate": 9.76617091227524e-06, "loss": 0.3336, "step": 312 }, { "batch_size": 4, "epoch": 0.1248, "step": 312, "tokens_per_device": 4772 }, { "epoch": 0.1248, "loss_ce": 0.1196012869477272, "loss_lvr": 1.1931688785552979, "loss_mode_switch": 0.0, "loss_total": 0.23891818523406982, "step": 312 }, { "batch_size": 4, "epoch": 0.1248, "step": 312, "tokens_per_device": 4060 }, { "epoch": 0.1248, "loss_ce": 0.2047988921403885, "loss_lvr": 1.0816149711608887, "loss_mode_switch": 0.0, "loss_total": 0.3129603862762451, "step": 312 }, { "batch_size": 4, "epoch": 0.1248, "step": 312, "tokens_per_device": 4304 }, { "epoch": 0.1248, "loss_ce": 0.330806702375412, "loss_lvr": 0.8276686668395996, "loss_mode_switch": 0.0, "loss_total": 0.41357356309890747, "step": 312 }, { "batch_size": 4, "epoch": 0.1248, "step": 312, "tokens_per_device": 6736 }, { "epoch": 0.1248, "loss_ce": 0.21007795631885529, "loss_lvr": 0.8466930389404297, "loss_mode_switch": 0.0, "loss_total": 0.2947472631931305, "step": 312 }, { "batch_size": 4, "epoch": 0.1248, "step": 312, "tokens_per_device": 2688 }, { "epoch": 0.1248, "loss_ce": 0.19980420172214508, "loss_lvr": 0.7306161522865295, "loss_mode_switch": 0.0, "loss_total": 0.27286583185195923, "step": 312 }, { "batch_size": 4, "epoch": 0.1248, "step": 312, "tokens_per_device": 4256 }, { "epoch": 0.1248, "loss_ce": 0.1471739113330841, "loss_lvr": 0.8074837923049927, "loss_mode_switch": 0.0, "loss_total": 0.22792229056358337, "step": 312 }, { "batch_size": 4, "epoch": 0.1248, "step": 312, "tokens_per_device": 4420 }, { "epoch": 0.1248, "loss_ce": 0.3977872133255005, "loss_lvr": 0.8549374938011169, "loss_mode_switch": 0.0, "loss_total": 0.4832809567451477, "step": 312 }, { "batch_size": 4, "epoch": 0.1248, "step": 312, "tokens_per_device": 4644 }, { "epoch": 0.1248, "loss_ce": 0.24599990248680115, "loss_lvr": 0.8654943704605103, "loss_mode_switch": 0.0, "loss_total": 0.3325493335723877, "step": 312 }, { "epoch": 0.1252, "grad_norm": 1.3889952898025513, "learning_rate": 9.764209197827721e-06, "loss": 0.3515, "step": 313 }, { "batch_size": 1, "epoch": 0.1252, "step": 313, "tokens_per_device": 4949 }, { "epoch": 0.1252, "loss_ce": 0.14199316501617432, "loss_lvr": 0.4300992488861084, "loss_mode_switch": 0.0, "loss_total": 0.18500308692455292, "step": 313 }, { "batch_size": 4, "epoch": 0.1252, "step": 313, "tokens_per_device": 4412 }, { "epoch": 0.1252, "loss_ce": 0.28512075543403625, "loss_lvr": 1.18724524974823, "loss_mode_switch": 0.0, "loss_total": 0.40384528040885925, "step": 313 }, { "batch_size": 4, "epoch": 0.1252, "step": 313, "tokens_per_device": 4568 }, { "epoch": 0.1252, "loss_ce": 0.3199423849582672, "loss_lvr": 1.155387282371521, "loss_mode_switch": 0.0, "loss_total": 0.43548113107681274, "step": 313 }, { "batch_size": 4, "epoch": 0.1252, "step": 313, "tokens_per_device": 2664 }, { "epoch": 0.1252, "loss_ce": 0.48064184188842773, "loss_lvr": 1.0746921300888062, "loss_mode_switch": 0.0, "loss_total": 0.5881110429763794, "step": 313 }, { "batch_size": 4, "epoch": 0.1252, "step": 313, "tokens_per_device": 4196 }, { "epoch": 0.1252, "loss_ce": 0.31244957447052, "loss_lvr": 1.1268621683120728, "loss_mode_switch": 0.0, "loss_total": 0.4251357913017273, "step": 313 }, { "batch_size": 4, "epoch": 0.1252, "step": 313, "tokens_per_device": 4736 }, { "epoch": 0.1252, "loss_ce": 0.18532396852970123, "loss_lvr": 0.9393036365509033, "loss_mode_switch": 0.0, "loss_total": 0.27925431728363037, "step": 313 }, { "batch_size": 4, "epoch": 0.1252, "step": 313, "tokens_per_device": 3936 }, { "epoch": 0.1252, "loss_ce": 0.09727108478546143, "loss_lvr": 1.0266838073730469, "loss_mode_switch": 0.0, "loss_total": 0.19993945956230164, "step": 313 }, { "batch_size": 4, "epoch": 0.1252, "step": 313, "tokens_per_device": 9068 }, { "epoch": 0.1252, "loss_ce": 0.27529338002204895, "loss_lvr": 1.0556000471115112, "loss_mode_switch": 0.0, "loss_total": 0.3808533847332001, "step": 313 }, { "epoch": 0.1256, "grad_norm": 1.5575939416885376, "learning_rate": 9.762239487486316e-06, "loss": 0.3754, "step": 314 }, { "batch_size": 4, "epoch": 0.1256, "step": 314, "tokens_per_device": 3776 }, { "epoch": 0.1256, "loss_ce": 0.2289021909236908, "loss_lvr": 0.8787098526954651, "loss_mode_switch": 0.0, "loss_total": 0.3167731761932373, "step": 314 }, { "batch_size": 1, "epoch": 0.1256, "step": 314, "tokens_per_device": 5149 }, { "epoch": 0.1256, "loss_ce": 0.1448550671339035, "loss_lvr": 0.588098406791687, "loss_mode_switch": 0.0, "loss_total": 0.20366491377353668, "step": 314 }, { "batch_size": 4, "epoch": 0.1256, "step": 314, "tokens_per_device": 4240 }, { "epoch": 0.1256, "loss_ce": 0.6026562452316284, "loss_lvr": 1.3246181011199951, "loss_mode_switch": 0.0, "loss_total": 0.73511803150177, "step": 314 }, { "batch_size": 4, "epoch": 0.1256, "step": 314, "tokens_per_device": 2732 }, { "epoch": 0.1256, "loss_ce": 0.6386830806732178, "loss_lvr": 1.2706037759780884, "loss_mode_switch": 0.0, "loss_total": 0.7657434940338135, "step": 314 }, { "batch_size": 1, "epoch": 0.1256, "step": 314, "tokens_per_device": 5128 }, { "epoch": 0.1256, "loss_ce": 0.03742478787899017, "loss_lvr": 0.6311582326889038, "loss_mode_switch": 0.0, "loss_total": 0.10054061561822891, "step": 314 }, { "batch_size": 1, "epoch": 0.1256, "step": 314, "tokens_per_device": 4842 }, { "epoch": 0.1256, "loss_ce": 0.028567107394337654, "loss_lvr": 0.6609544157981873, "loss_mode_switch": 0.0, "loss_total": 0.09466254711151123, "step": 314 }, { "batch_size": 4, "epoch": 0.1256, "step": 314, "tokens_per_device": 2660 }, { "epoch": 0.1256, "loss_ce": 0.6705185174942017, "loss_lvr": 0.9591317176818848, "loss_mode_switch": 0.0, "loss_total": 0.7664316892623901, "step": 314 }, { "batch_size": 1, "epoch": 0.1256, "step": 314, "tokens_per_device": 4836 }, { "epoch": 0.1256, "loss_ce": 0.059155385941267014, "loss_lvr": 0.7770878076553345, "loss_mode_switch": 0.0, "loss_total": 0.13686417043209076, "step": 314 }, { "epoch": 0.126, "grad_norm": 1.7742446660995483, "learning_rate": 9.76026178455684e-06, "loss": 0.3612, "step": 315 }, { "batch_size": 1, "epoch": 0.126, "step": 315, "tokens_per_device": 5112 }, { "epoch": 0.126, "loss_ce": 0.0017009804723784328, "loss_lvr": 0.25745338201522827, "loss_mode_switch": 0.0, "loss_total": 0.027446318417787552, "step": 315 }, { "batch_size": 4, "epoch": 0.126, "step": 315, "tokens_per_device": 4400 }, { "epoch": 0.126, "loss_ce": 0.16730621457099915, "loss_lvr": 0.7978530526161194, "loss_mode_switch": 0.0, "loss_total": 0.24709153175354004, "step": 315 }, { "batch_size": 4, "epoch": 0.126, "step": 315, "tokens_per_device": 8928 }, { "epoch": 0.126, "loss_ce": 0.33574026823043823, "loss_lvr": 0.5202577710151672, "loss_mode_switch": 0.0, "loss_total": 0.387766033411026, "step": 315 }, { "batch_size": 4, "epoch": 0.126, "step": 315, "tokens_per_device": 5944 }, { "epoch": 0.126, "loss_ce": 0.42880478501319885, "loss_lvr": 0.966513454914093, "loss_mode_switch": 0.0, "loss_total": 0.5254561305046082, "step": 315 }, { "batch_size": 4, "epoch": 0.126, "step": 315, "tokens_per_device": 6452 }, { "epoch": 0.126, "loss_ce": 0.5253722667694092, "loss_lvr": 0.8965876698493958, "loss_mode_switch": 0.0, "loss_total": 0.6150310039520264, "step": 315 }, { "batch_size": 4, "epoch": 0.126, "step": 315, "tokens_per_device": 2732 }, { "epoch": 0.126, "loss_ce": 0.10945001244544983, "loss_lvr": 0.7046592831611633, "loss_mode_switch": 0.0, "loss_total": 0.17991593480110168, "step": 315 }, { "batch_size": 1, "epoch": 0.126, "step": 315, "tokens_per_device": 4325 }, { "epoch": 0.126, "loss_ce": 0.15255622565746307, "loss_lvr": 0.5349271297454834, "loss_mode_switch": 0.0, "loss_total": 0.20604893565177917, "step": 315 }, { "batch_size": 1, "epoch": 0.126, "step": 315, "tokens_per_device": 5036 }, { "epoch": 0.126, "loss_ce": 0.039789021015167236, "loss_lvr": 0.7465275526046753, "loss_mode_switch": 0.0, "loss_total": 0.11444177478551865, "step": 315 }, { "epoch": 0.1264, "grad_norm": 1.7856323719024658, "learning_rate": 9.758276092358518e-06, "loss": 0.3004, "step": 316 }, { "batch_size": 1, "epoch": 0.1264, "step": 316, "tokens_per_device": 5189 }, { "epoch": 0.1264, "loss_ce": 1.5205706357955933, "loss_lvr": 0.662966787815094, "loss_mode_switch": 0.0, "loss_total": 1.586867332458496, "step": 316 }, { "batch_size": 4, "epoch": 0.1264, "step": 316, "tokens_per_device": 2684 }, { "epoch": 0.1264, "loss_ce": 0.6482028961181641, "loss_lvr": 1.044753909111023, "loss_mode_switch": 0.0, "loss_total": 0.7526782751083374, "step": 316 }, { "batch_size": 4, "epoch": 0.1264, "step": 316, "tokens_per_device": 2520 }, { "epoch": 0.1264, "loss_ce": 0.16580772399902344, "loss_lvr": 0.9374126195907593, "loss_mode_switch": 0.0, "loss_total": 0.25954899191856384, "step": 316 }, { "batch_size": 1, "epoch": 0.1264, "step": 316, "tokens_per_device": 5607 }, { "epoch": 0.1264, "loss_ce": 0.00642033526673913, "loss_lvr": 0.634932816028595, "loss_mode_switch": 0.0, "loss_total": 0.06991361826658249, "step": 316 }, { "batch_size": 4, "epoch": 0.1264, "step": 316, "tokens_per_device": 3800 }, { "epoch": 0.1264, "loss_ce": 0.022234681993722916, "loss_lvr": 0.9581170678138733, "loss_mode_switch": 0.0, "loss_total": 0.11804638803005219, "step": 316 }, { "batch_size": 1, "epoch": 0.1264, "step": 316, "tokens_per_device": 5163 }, { "epoch": 0.1264, "loss_ce": 0.018920375034213066, "loss_lvr": 0.7991976141929626, "loss_mode_switch": 0.0, "loss_total": 0.0988401398062706, "step": 316 }, { "batch_size": 4, "epoch": 0.1264, "step": 316, "tokens_per_device": 5576 }, { "epoch": 0.1264, "loss_ce": 0.26568377017974854, "loss_lvr": 0.900037944316864, "loss_mode_switch": 0.0, "loss_total": 0.35568755865097046, "step": 316 }, { "batch_size": 4, "epoch": 0.1264, "step": 316, "tokens_per_device": 4084 }, { "epoch": 0.1264, "loss_ce": 0.22331058979034424, "loss_lvr": 1.0429284572601318, "loss_mode_switch": 0.0, "loss_total": 0.32760342955589294, "step": 316 }, { "epoch": 0.1268, "grad_norm": 2.083437919616699, "learning_rate": 9.756282414223995e-06, "loss": 0.3895, "step": 317 }, { "batch_size": 1, "epoch": 0.1268, "step": 317, "tokens_per_device": 5008 }, { "epoch": 0.1268, "loss_ce": 0.002958902157843113, "loss_lvr": 0.41023746132850647, "loss_mode_switch": 0.0, "loss_total": 0.043982647359371185, "step": 317 }, { "batch_size": 1, "epoch": 0.1268, "step": 317, "tokens_per_device": 5041 }, { "epoch": 0.1268, "loss_ce": 1.4094566106796265, "loss_lvr": 0.566420316696167, "loss_mode_switch": 0.0, "loss_total": 1.466098666191101, "step": 317 }, { "batch_size": 1, "epoch": 0.1268, "step": 317, "tokens_per_device": 4896 }, { "epoch": 0.1268, "loss_ce": 0.16194896399974823, "loss_lvr": 0.3887976408004761, "loss_mode_switch": 0.0, "loss_total": 0.20082873106002808, "step": 317 }, { "batch_size": 4, "epoch": 0.1268, "step": 317, "tokens_per_device": 4236 }, { "epoch": 0.1268, "loss_ce": 0.15136390924453735, "loss_lvr": 1.3137943744659424, "loss_mode_switch": 0.0, "loss_total": 0.28274333477020264, "step": 317 }, { "batch_size": 4, "epoch": 0.1268, "step": 317, "tokens_per_device": 3800 }, { "epoch": 0.1268, "loss_ce": 0.09197675436735153, "loss_lvr": 1.1612372398376465, "loss_mode_switch": 0.0, "loss_total": 0.20810048282146454, "step": 317 }, { "batch_size": 4, "epoch": 0.1268, "step": 317, "tokens_per_device": 4416 }, { "epoch": 0.1268, "loss_ce": 0.44875192642211914, "loss_lvr": 0.9293187856674194, "loss_mode_switch": 0.0, "loss_total": 0.5416837930679321, "step": 317 }, { "batch_size": 1, "epoch": 0.1268, "step": 317, "tokens_per_device": 5098 }, { "epoch": 0.1268, "loss_ce": 0.07125157862901688, "loss_lvr": 0.41236239671707153, "loss_mode_switch": 0.0, "loss_total": 0.11248782277107239, "step": 317 }, { "batch_size": 1, "epoch": 0.1268, "step": 317, "tokens_per_device": 5604 }, { "epoch": 0.1268, "loss_ce": 0.04050503671169281, "loss_lvr": 0.7348803877830505, "loss_mode_switch": 0.0, "loss_total": 0.1139930784702301, "step": 317 }, { "epoch": 0.1272, "grad_norm": 1.567205786705017, "learning_rate": 9.754280753499306e-06, "loss": 0.3349, "step": 318 }, { "batch_size": 4, "epoch": 0.1272, "step": 318, "tokens_per_device": 5248 }, { "epoch": 0.1272, "loss_ce": 0.048727188259363174, "loss_lvr": 1.188793420791626, "loss_mode_switch": 0.0, "loss_total": 0.16760653257369995, "step": 318 }, { "batch_size": 4, "epoch": 0.1272, "step": 318, "tokens_per_device": 6564 }, { "epoch": 0.1272, "loss_ce": 0.2694076597690582, "loss_lvr": 1.192989468574524, "loss_mode_switch": 0.0, "loss_total": 0.38870662450790405, "step": 318 }, { "batch_size": 4, "epoch": 0.1272, "step": 318, "tokens_per_device": 4580 }, { "epoch": 0.1272, "loss_ce": 0.2951505184173584, "loss_lvr": 1.5566037893295288, "loss_mode_switch": 0.0, "loss_total": 0.45081090927124023, "step": 318 }, { "batch_size": 1, "epoch": 0.1272, "step": 318, "tokens_per_device": 4926 }, { "epoch": 0.1272, "loss_ce": 0.47985222935676575, "loss_lvr": 0.8665832877159119, "loss_mode_switch": 0.0, "loss_total": 0.5665105581283569, "step": 318 }, { "batch_size": 4, "epoch": 0.1272, "step": 318, "tokens_per_device": 5116 }, { "epoch": 0.1272, "loss_ce": 0.23576588928699493, "loss_lvr": 0.9679732322692871, "loss_mode_switch": 0.0, "loss_total": 0.33256322145462036, "step": 318 }, { "batch_size": 4, "epoch": 0.1272, "step": 318, "tokens_per_device": 1376 }, { "epoch": 0.1272, "loss_ce": 0.821223795413971, "loss_lvr": 1.290448546409607, "loss_mode_switch": 0.0, "loss_total": 0.9502686262130737, "step": 318 }, { "batch_size": 4, "epoch": 0.1272, "step": 318, "tokens_per_device": 4256 }, { "epoch": 0.1272, "loss_ce": 0.09479042887687683, "loss_lvr": 1.2434251308441162, "loss_mode_switch": 0.0, "loss_total": 0.2191329449415207, "step": 318 }, { "batch_size": 1, "epoch": 0.1272, "step": 318, "tokens_per_device": 4899 }, { "epoch": 0.1272, "loss_ce": 0.0026400580536574125, "loss_lvr": 0.4386700987815857, "loss_mode_switch": 0.0, "loss_total": 0.04650706797838211, "step": 318 }, { "epoch": 0.1276, "grad_norm": 1.6467088460922241, "learning_rate": 9.75227111354389e-06, "loss": 0.3851, "step": 319 }, { "batch_size": 1, "epoch": 0.1276, "step": 319, "tokens_per_device": 5131 }, { "epoch": 0.1276, "loss_ce": 0.0018105931812897325, "loss_lvr": 0.5120601654052734, "loss_mode_switch": 0.0, "loss_total": 0.05301661044359207, "step": 319 }, { "batch_size": 4, "epoch": 0.1276, "step": 319, "tokens_per_device": 2592 }, { "epoch": 0.1276, "loss_ce": 0.2174711525440216, "loss_lvr": 1.1008394956588745, "loss_mode_switch": 0.0, "loss_total": 0.3275551199913025, "step": 319 }, { "batch_size": 1, "epoch": 0.1276, "step": 319, "tokens_per_device": 4882 }, { "epoch": 0.1276, "loss_ce": 0.4939606487751007, "loss_lvr": 0.11117065697908401, "loss_mode_switch": 0.0, "loss_total": 0.5050777196884155, "step": 319 }, { "batch_size": 4, "epoch": 0.1276, "step": 319, "tokens_per_device": 4844 }, { "epoch": 0.1276, "loss_ce": 0.4497103989124298, "loss_lvr": 1.1742112636566162, "loss_mode_switch": 0.0, "loss_total": 0.567131519317627, "step": 319 }, { "batch_size": 4, "epoch": 0.1276, "step": 319, "tokens_per_device": 3780 }, { "epoch": 0.1276, "loss_ce": 0.23307816684246063, "loss_lvr": 1.2974402904510498, "loss_mode_switch": 0.0, "loss_total": 0.36282220482826233, "step": 319 }, { "batch_size": 1, "epoch": 0.1276, "step": 319, "tokens_per_device": 5238 }, { "epoch": 0.1276, "loss_ce": 0.046545546501874924, "loss_lvr": 1.0095041990280151, "loss_mode_switch": 0.0, "loss_total": 0.14749597012996674, "step": 319 }, { "batch_size": 4, "epoch": 0.1276, "step": 319, "tokens_per_device": 8532 }, { "epoch": 0.1276, "loss_ce": 0.1847742348909378, "loss_lvr": 0.7411960959434509, "loss_mode_switch": 0.0, "loss_total": 0.25889384746551514, "step": 319 }, { "batch_size": 4, "epoch": 0.1276, "step": 319, "tokens_per_device": 4236 }, { "epoch": 0.1276, "loss_ce": 0.49206680059432983, "loss_lvr": 0.8335537314414978, "loss_mode_switch": 0.0, "loss_total": 0.5754221677780151, "step": 319 }, { "epoch": 0.128, "grad_norm": 1.4903197288513184, "learning_rate": 9.75025349773058e-06, "loss": 0.3473, "step": 320 }, { "batch_size": 4, "epoch": 0.128, "step": 320, "tokens_per_device": 3860 }, { "epoch": 0.128, "loss_ce": 0.07155189663171768, "loss_lvr": 0.8961834907531738, "loss_mode_switch": 0.0, "loss_total": 0.16117024421691895, "step": 320 }, { "batch_size": 1, "epoch": 0.128, "step": 320, "tokens_per_device": 5224 }, { "epoch": 0.128, "loss_ce": 0.03066958673298359, "loss_lvr": 0.5450435876846313, "loss_mode_switch": 0.0, "loss_total": 0.08517394959926605, "step": 320 }, { "batch_size": 1, "epoch": 0.128, "step": 320, "tokens_per_device": 4870 }, { "epoch": 0.128, "loss_ce": 0.0292478259652853, "loss_lvr": 0.5100293755531311, "loss_mode_switch": 0.0, "loss_total": 0.08025076240301132, "step": 320 }, { "batch_size": 4, "epoch": 0.128, "step": 320, "tokens_per_device": 3760 }, { "epoch": 0.128, "loss_ce": 0.13407200574874878, "loss_lvr": 0.9603203535079956, "loss_mode_switch": 0.0, "loss_total": 0.23010404407978058, "step": 320 }, { "batch_size": 4, "epoch": 0.128, "step": 320, "tokens_per_device": 5368 }, { "epoch": 0.128, "loss_ce": 0.030072737485170364, "loss_lvr": 0.8764126896858215, "loss_mode_switch": 0.0, "loss_total": 0.11771400272846222, "step": 320 }, { "batch_size": 1, "epoch": 0.128, "step": 320, "tokens_per_device": 5042 }, { "epoch": 0.128, "loss_ce": 0.2839544117450714, "loss_lvr": 0.5118880271911621, "loss_mode_switch": 0.0, "loss_total": 0.33514320850372314, "step": 320 }, { "batch_size": 4, "epoch": 0.128, "step": 320, "tokens_per_device": 2656 }, { "epoch": 0.128, "loss_ce": 0.1126755103468895, "loss_lvr": 0.9419669508934021, "loss_mode_switch": 0.0, "loss_total": 0.20687220990657806, "step": 320 }, { "batch_size": 4, "epoch": 0.128, "step": 320, "tokens_per_device": 4728 }, { "epoch": 0.128, "loss_ce": 0.09618454426527023, "loss_lvr": 0.9873843789100647, "loss_mode_switch": 0.0, "loss_total": 0.19492298364639282, "step": 320 }, { "epoch": 0.1284, "grad_norm": 3.000905752182007, "learning_rate": 9.74822790944559e-06, "loss": 0.302, "step": 321 }, { "batch_size": 4, "epoch": 0.1284, "step": 321, "tokens_per_device": 1356 }, { "epoch": 0.1284, "loss_ce": 0.32127028703689575, "loss_lvr": 1.0531656742095947, "loss_mode_switch": 0.0, "loss_total": 0.4265868663787842, "step": 321 }, { "batch_size": 1, "epoch": 0.1284, "step": 321, "tokens_per_device": 5871 }, { "epoch": 0.1284, "loss_ce": 0.07671404629945755, "loss_lvr": 0.5554532408714294, "loss_mode_switch": 0.0, "loss_total": 0.13225936889648438, "step": 321 }, { "batch_size": 4, "epoch": 0.1284, "step": 321, "tokens_per_device": 4236 }, { "epoch": 0.1284, "loss_ce": 0.18612921237945557, "loss_lvr": 1.0146986246109009, "loss_mode_switch": 0.0, "loss_total": 0.2875990867614746, "step": 321 }, { "batch_size": 4, "epoch": 0.1284, "step": 321, "tokens_per_device": 1520 }, { "epoch": 0.1284, "loss_ce": 0.5471339225769043, "loss_lvr": 1.3074512481689453, "loss_mode_switch": 0.0, "loss_total": 0.6778790354728699, "step": 321 }, { "batch_size": 4, "epoch": 0.1284, "step": 321, "tokens_per_device": 3936 }, { "epoch": 0.1284, "loss_ce": 0.10770261287689209, "loss_lvr": 1.421636700630188, "loss_mode_switch": 0.0, "loss_total": 0.2498662918806076, "step": 321 }, { "batch_size": 1, "epoch": 0.1284, "step": 321, "tokens_per_device": 4774 }, { "epoch": 0.1284, "loss_ce": 0.0959818884730339, "loss_lvr": 0.6674546599388123, "loss_mode_switch": 0.0, "loss_total": 0.16272735595703125, "step": 321 }, { "batch_size": 4, "epoch": 0.1284, "step": 321, "tokens_per_device": 2852 }, { "epoch": 0.1284, "loss_ce": 0.36406266689300537, "loss_lvr": 0.6047658920288086, "loss_mode_switch": 0.0, "loss_total": 0.4245392680168152, "step": 321 }, { "batch_size": 1, "epoch": 0.1284, "step": 321, "tokens_per_device": 4896 }, { "epoch": 0.1284, "loss_ce": 0.39979371428489685, "loss_lvr": 1.1656465530395508, "loss_mode_switch": 0.0, "loss_total": 0.5163583755493164, "step": 321 }, { "epoch": 0.1288, "grad_norm": 1.5130215883255005, "learning_rate": 9.746194352088518e-06, "loss": 0.3075, "step": 322 }, { "batch_size": 4, "epoch": 0.1288, "step": 322, "tokens_per_device": 10548 }, { "epoch": 0.1288, "loss_ce": 0.1382931023836136, "loss_lvr": 0.8345386385917664, "loss_mode_switch": 0.0, "loss_total": 0.22174696624279022, "step": 322 }, { "batch_size": 4, "epoch": 0.1288, "step": 322, "tokens_per_device": 1416 }, { "epoch": 0.1288, "loss_ce": 0.296962171792984, "loss_lvr": 1.1821707487106323, "loss_mode_switch": 0.0, "loss_total": 0.4151792526245117, "step": 322 }, { "batch_size": 4, "epoch": 0.1288, "step": 322, "tokens_per_device": 6976 }, { "epoch": 0.1288, "loss_ce": 0.38399118185043335, "loss_lvr": 0.8599955439567566, "loss_mode_switch": 0.0, "loss_total": 0.46999073028564453, "step": 322 }, { "batch_size": 4, "epoch": 0.1288, "step": 322, "tokens_per_device": 5652 }, { "epoch": 0.1288, "loss_ce": 0.11302745342254639, "loss_lvr": 0.9327338337898254, "loss_mode_switch": 0.0, "loss_total": 0.20630083978176117, "step": 322 }, { "batch_size": 4, "epoch": 0.1288, "step": 322, "tokens_per_device": 8692 }, { "epoch": 0.1288, "loss_ce": 0.16603584587574005, "loss_lvr": 0.9011596441268921, "loss_mode_switch": 0.0, "loss_total": 0.25615179538726807, "step": 322 }, { "batch_size": 4, "epoch": 0.1288, "step": 322, "tokens_per_device": 5560 }, { "epoch": 0.1288, "loss_ce": 0.34067362546920776, "loss_lvr": 0.8468042016029358, "loss_mode_switch": 0.0, "loss_total": 0.4253540635108948, "step": 322 }, { "batch_size": 4, "epoch": 0.1288, "step": 322, "tokens_per_device": 5688 }, { "epoch": 0.1288, "loss_ce": 0.0670771449804306, "loss_lvr": 0.8610959053039551, "loss_mode_switch": 0.0, "loss_total": 0.15318673849105835, "step": 322 }, { "batch_size": 4, "epoch": 0.1288, "step": 322, "tokens_per_device": 2588 }, { "epoch": 0.1288, "loss_ce": 0.7653839588165283, "loss_lvr": 1.1266387701034546, "loss_mode_switch": 0.0, "loss_total": 0.8780478239059448, "step": 322 }, { "epoch": 0.1292, "grad_norm": 1.5499333143234253, "learning_rate": 9.744152829072333e-06, "loss": 0.3565, "step": 323 }, { "batch_size": 4, "epoch": 0.1292, "step": 323, "tokens_per_device": 4516 }, { "epoch": 0.1292, "loss_ce": 0.15459606051445007, "loss_lvr": 1.0754529237747192, "loss_mode_switch": 0.0, "loss_total": 0.2621413469314575, "step": 323 }, { "batch_size": 4, "epoch": 0.1292, "step": 323, "tokens_per_device": 4596 }, { "epoch": 0.1292, "loss_ce": 0.22019009292125702, "loss_lvr": 1.7404415607452393, "loss_mode_switch": 0.0, "loss_total": 0.39423424005508423, "step": 323 }, { "batch_size": 4, "epoch": 0.1292, "step": 323, "tokens_per_device": 6896 }, { "epoch": 0.1292, "loss_ce": 0.48660385608673096, "loss_lvr": 0.9255183339118958, "loss_mode_switch": 0.0, "loss_total": 0.579155683517456, "step": 323 }, { "batch_size": 4, "epoch": 0.1292, "step": 323, "tokens_per_device": 3756 }, { "epoch": 0.1292, "loss_ce": 0.40368324518203735, "loss_lvr": 0.930388331413269, "loss_mode_switch": 0.0, "loss_total": 0.4967220723628998, "step": 323 }, { "batch_size": 4, "epoch": 0.1292, "step": 323, "tokens_per_device": 10316 }, { "epoch": 0.1292, "loss_ce": 0.11148792505264282, "loss_lvr": 0.5847947001457214, "loss_mode_switch": 0.0, "loss_total": 0.1699673980474472, "step": 323 }, { "batch_size": 1, "epoch": 0.1292, "step": 323, "tokens_per_device": 5149 }, { "epoch": 0.1292, "loss_ce": 0.004762902390211821, "loss_lvr": 0.3330346345901489, "loss_mode_switch": 0.0, "loss_total": 0.03806636855006218, "step": 323 }, { "batch_size": 4, "epoch": 0.1292, "step": 323, "tokens_per_device": 6032 }, { "epoch": 0.1292, "loss_ce": 0.16692818701267242, "loss_lvr": 0.8465122580528259, "loss_mode_switch": 0.0, "loss_total": 0.2515794038772583, "step": 323 }, { "batch_size": 1, "epoch": 0.1292, "step": 323, "tokens_per_device": 5391 }, { "epoch": 0.1292, "loss_ce": 0.018896590918302536, "loss_lvr": 0.3603220582008362, "loss_mode_switch": 0.0, "loss_total": 0.054928798228502274, "step": 323 }, { "epoch": 0.1296, "grad_norm": 1.4284324645996094, "learning_rate": 9.742103343823376e-06, "loss": 0.3383, "step": 324 }, { "batch_size": 1, "epoch": 0.1296, "step": 324, "tokens_per_device": 4801 }, { "epoch": 0.1296, "loss_ce": 0.04817738011479378, "loss_lvr": 0.6369771957397461, "loss_mode_switch": 0.0, "loss_total": 0.11187510192394257, "step": 324 }, { "batch_size": 4, "epoch": 0.1296, "step": 324, "tokens_per_device": 5056 }, { "epoch": 0.1296, "loss_ce": 0.2990964651107788, "loss_lvr": 1.0664728879928589, "loss_mode_switch": 0.0, "loss_total": 0.4057437479496002, "step": 324 }, { "batch_size": 4, "epoch": 0.1296, "step": 324, "tokens_per_device": 6100 }, { "epoch": 0.1296, "loss_ce": 0.09841073304414749, "loss_lvr": 0.8499810099601746, "loss_mode_switch": 0.0, "loss_total": 0.18340882658958435, "step": 324 }, { "batch_size": 4, "epoch": 0.1296, "step": 324, "tokens_per_device": 14868 }, { "epoch": 0.1296, "loss_ce": 0.29175132513046265, "loss_lvr": 1.1517843008041382, "loss_mode_switch": 0.0, "loss_total": 0.40692976117134094, "step": 324 }, { "batch_size": 4, "epoch": 0.1296, "step": 324, "tokens_per_device": 4356 }, { "epoch": 0.1296, "loss_ce": 0.40776965022087097, "loss_lvr": 1.1211657524108887, "loss_mode_switch": 0.0, "loss_total": 0.5198862552642822, "step": 324 }, { "batch_size": 4, "epoch": 0.1296, "step": 324, "tokens_per_device": 5720 }, { "epoch": 0.1296, "loss_ce": 0.2744273245334625, "loss_lvr": 0.6833791136741638, "loss_mode_switch": 0.0, "loss_total": 0.3427652418613434, "step": 324 }, { "batch_size": 4, "epoch": 0.1296, "step": 324, "tokens_per_device": 4428 }, { "epoch": 0.1296, "loss_ce": 0.13474442064762115, "loss_lvr": 0.9489595293998718, "loss_mode_switch": 0.0, "loss_total": 0.22964036464691162, "step": 324 }, { "batch_size": 1, "epoch": 0.1296, "step": 324, "tokens_per_device": 5618 }, { "epoch": 0.1296, "loss_ce": 0.12823471426963806, "loss_lvr": 0.5683414340019226, "loss_mode_switch": 0.0, "loss_total": 0.18506886065006256, "step": 324 }, { "epoch": 0.13, "grad_norm": 1.3558365106582642, "learning_rate": 9.740045899781353e-06, "loss": 0.3269, "step": 325 }, { "batch_size": 4, "epoch": 0.13, "step": 325, "tokens_per_device": 3556 }, { "epoch": 0.13, "loss_ce": 0.7797709107398987, "loss_lvr": 1.2165865898132324, "loss_mode_switch": 0.0, "loss_total": 0.9014295935630798, "step": 325 }, { "batch_size": 4, "epoch": 0.13, "step": 325, "tokens_per_device": 4260 }, { "epoch": 0.13, "loss_ce": 0.31891825795173645, "loss_lvr": 1.0045816898345947, "loss_mode_switch": 0.0, "loss_total": 0.4193764328956604, "step": 325 }, { "batch_size": 4, "epoch": 0.13, "step": 325, "tokens_per_device": 5736 }, { "epoch": 0.13, "loss_ce": 0.04286898672580719, "loss_lvr": 0.8315196633338928, "loss_mode_switch": 0.0, "loss_total": 0.12602095305919647, "step": 325 }, { "batch_size": 4, "epoch": 0.13, "step": 325, "tokens_per_device": 1356 }, { "epoch": 0.13, "loss_ce": 0.7401612997055054, "loss_lvr": 1.1586295366287231, "loss_mode_switch": 0.0, "loss_total": 0.8560242652893066, "step": 325 }, { "batch_size": 4, "epoch": 0.13, "step": 325, "tokens_per_device": 3848 }, { "epoch": 0.13, "loss_ce": 0.20695124566555023, "loss_lvr": 1.0206196308135986, "loss_mode_switch": 0.0, "loss_total": 0.3090132176876068, "step": 325 }, { "batch_size": 1, "epoch": 0.13, "step": 325, "tokens_per_device": 4752 }, { "epoch": 0.13, "loss_ce": 0.1690736711025238, "loss_lvr": 0.30337658524513245, "loss_mode_switch": 0.0, "loss_total": 0.1994113326072693, "step": 325 }, { "batch_size": 4, "epoch": 0.13, "step": 325, "tokens_per_device": 4388 }, { "epoch": 0.13, "loss_ce": 0.22151829302310944, "loss_lvr": 1.0567245483398438, "loss_mode_switch": 0.0, "loss_total": 0.3271907567977905, "step": 325 }, { "batch_size": 4, "epoch": 0.13, "step": 325, "tokens_per_device": 8816 }, { "epoch": 0.13, "loss_ce": 0.4071999490261078, "loss_lvr": 0.7434004545211792, "loss_mode_switch": 0.0, "loss_total": 0.4815399944782257, "step": 325 }, { "epoch": 0.1304, "grad_norm": 1.4906572103500366, "learning_rate": 9.737980500399322e-06, "loss": 0.3879, "step": 326 }, { "batch_size": 1, "epoch": 0.1304, "step": 326, "tokens_per_device": 5187 }, { "epoch": 0.1304, "loss_ce": 0.04925558343529701, "loss_lvr": 0.889431357383728, "loss_mode_switch": 0.0, "loss_total": 0.13819871842861176, "step": 326 }, { "batch_size": 4, "epoch": 0.1304, "step": 326, "tokens_per_device": 2572 }, { "epoch": 0.1304, "loss_ce": 0.5271813869476318, "loss_lvr": 0.9680708050727844, "loss_mode_switch": 0.0, "loss_total": 0.6239884495735168, "step": 326 }, { "batch_size": 4, "epoch": 0.1304, "step": 326, "tokens_per_device": 2708 }, { "epoch": 0.1304, "loss_ce": 0.6136506199836731, "loss_lvr": 1.3745890855789185, "loss_mode_switch": 0.0, "loss_total": 0.7511095404624939, "step": 326 }, { "batch_size": 1, "epoch": 0.1304, "step": 326, "tokens_per_device": 4792 }, { "epoch": 0.1304, "loss_ce": 0.0023157892283052206, "loss_lvr": 0.41682136058807373, "loss_mode_switch": 0.0, "loss_total": 0.04399792477488518, "step": 326 }, { "batch_size": 1, "epoch": 0.1304, "step": 326, "tokens_per_device": 4876 }, { "epoch": 0.1304, "loss_ce": 0.0076663256622850895, "loss_lvr": 0.492989718914032, "loss_mode_switch": 0.0, "loss_total": 0.05696529895067215, "step": 326 }, { "batch_size": 4, "epoch": 0.1304, "step": 326, "tokens_per_device": 5120 }, { "epoch": 0.1304, "loss_ce": 0.20315031707286835, "loss_lvr": 0.9758588075637817, "loss_mode_switch": 0.0, "loss_total": 0.3007361888885498, "step": 326 }, { "batch_size": 4, "epoch": 0.1304, "step": 326, "tokens_per_device": 3908 }, { "epoch": 0.1304, "loss_ce": 0.41857263445854187, "loss_lvr": 1.2433944940567017, "loss_mode_switch": 0.0, "loss_total": 0.5429120659828186, "step": 326 }, { "batch_size": 4, "epoch": 0.1304, "step": 326, "tokens_per_device": 4860 }, { "epoch": 0.1304, "loss_ce": 0.2842179536819458, "loss_lvr": 0.9427943229675293, "loss_mode_switch": 0.0, "loss_total": 0.3784973919391632, "step": 326 }, { "epoch": 0.1308, "grad_norm": 1.1388132572174072, "learning_rate": 9.735907149143695e-06, "loss": 0.2854, "step": 327 }, { "batch_size": 4, "epoch": 0.1308, "step": 327, "tokens_per_device": 3788 }, { "epoch": 0.1308, "loss_ce": 0.27543357014656067, "loss_lvr": 0.9329615235328674, "loss_mode_switch": 0.0, "loss_total": 0.36872971057891846, "step": 327 }, { "batch_size": 4, "epoch": 0.1308, "step": 327, "tokens_per_device": 5248 }, { "epoch": 0.1308, "loss_ce": 0.26785990595817566, "loss_lvr": 0.8092315196990967, "loss_mode_switch": 0.0, "loss_total": 0.34878307580947876, "step": 327 }, { "batch_size": 4, "epoch": 0.1308, "step": 327, "tokens_per_device": 1404 }, { "epoch": 0.1308, "loss_ce": 0.7436977624893188, "loss_lvr": 1.1432043313980103, "loss_mode_switch": 0.0, "loss_total": 0.8580182194709778, "step": 327 }, { "batch_size": 4, "epoch": 0.1308, "step": 327, "tokens_per_device": 2632 }, { "epoch": 0.1308, "loss_ce": 0.5738093852996826, "loss_lvr": 2.5742886066436768, "loss_mode_switch": 0.0, "loss_total": 0.8312382698059082, "step": 327 }, { "batch_size": 4, "epoch": 0.1308, "step": 327, "tokens_per_device": 6596 }, { "epoch": 0.1308, "loss_ce": 0.04424336925148964, "loss_lvr": 0.7803539037704468, "loss_mode_switch": 0.0, "loss_total": 0.12227876484394073, "step": 327 }, { "batch_size": 4, "epoch": 0.1308, "step": 327, "tokens_per_device": 5236 }, { "epoch": 0.1308, "loss_ce": 0.00578112481161952, "loss_lvr": 0.8639171719551086, "loss_mode_switch": 0.0, "loss_total": 0.09217283874750137, "step": 327 }, { "batch_size": 4, "epoch": 0.1308, "step": 327, "tokens_per_device": 3864 }, { "epoch": 0.1308, "loss_ce": 0.13526692986488342, "loss_lvr": 0.8401155471801758, "loss_mode_switch": 0.0, "loss_total": 0.219278484582901, "step": 327 }, { "batch_size": 4, "epoch": 0.1308, "step": 327, "tokens_per_device": 3796 }, { "epoch": 0.1308, "loss_ce": 0.2493208646774292, "loss_lvr": 1.2431390285491943, "loss_mode_switch": 0.0, "loss_total": 0.3736347556114197, "step": 327 }, { "epoch": 0.1312, "grad_norm": 2.131697654724121, "learning_rate": 9.733825849494232e-06, "loss": 0.3215, "step": 328 }, { "batch_size": 4, "epoch": 0.1312, "step": 328, "tokens_per_device": 12508 }, { "epoch": 0.1312, "loss_ce": 0.15674243867397308, "loss_lvr": 0.853691577911377, "loss_mode_switch": 0.0, "loss_total": 0.24211159348487854, "step": 328 }, { "batch_size": 1, "epoch": 0.1312, "step": 328, "tokens_per_device": 7881 }, { "epoch": 0.1312, "loss_ce": 0.06782553344964981, "loss_lvr": 0.5639328956604004, "loss_mode_switch": 0.0, "loss_total": 0.12421882152557373, "step": 328 }, { "batch_size": 1, "epoch": 0.1312, "step": 328, "tokens_per_device": 5133 }, { "epoch": 0.1312, "loss_ce": 0.0056274086236953735, "loss_lvr": 0.6161238551139832, "loss_mode_switch": 0.0, "loss_total": 0.06723979115486145, "step": 328 }, { "batch_size": 4, "epoch": 0.1312, "step": 328, "tokens_per_device": 4144 }, { "epoch": 0.1312, "loss_ce": 0.04109010100364685, "loss_lvr": 0.8285272717475891, "loss_mode_switch": 0.0, "loss_total": 0.12394282966852188, "step": 328 }, { "batch_size": 4, "epoch": 0.1312, "step": 328, "tokens_per_device": 4200 }, { "epoch": 0.1312, "loss_ce": 0.25596314668655396, "loss_lvr": 1.0987054109573364, "loss_mode_switch": 0.0, "loss_total": 0.36583369970321655, "step": 328 }, { "batch_size": 4, "epoch": 0.1312, "step": 328, "tokens_per_device": 3784 }, { "epoch": 0.1312, "loss_ce": 0.8343908786773682, "loss_lvr": 1.0734459161758423, "loss_mode_switch": 0.0, "loss_total": 0.9417354464530945, "step": 328 }, { "batch_size": 1, "epoch": 0.1312, "step": 328, "tokens_per_device": 4906 }, { "epoch": 0.1312, "loss_ce": 0.0017273632111027837, "loss_lvr": 0.6086467504501343, "loss_mode_switch": 0.0, "loss_total": 0.0625920370221138, "step": 328 }, { "batch_size": 1, "epoch": 0.1312, "step": 328, "tokens_per_device": 5106 }, { "epoch": 0.1312, "loss_ce": 0.00903872586786747, "loss_lvr": 0.7360725402832031, "loss_mode_switch": 0.0, "loss_total": 0.08264598250389099, "step": 328 }, { "epoch": 0.1316, "grad_norm": 1.467160940170288, "learning_rate": 9.731736604944031e-06, "loss": 0.3243, "step": 329 }, { "batch_size": 4, "epoch": 0.1316, "step": 329, "tokens_per_device": 5296 }, { "epoch": 0.1316, "loss_ce": 0.39544034004211426, "loss_lvr": 0.8824259638786316, "loss_mode_switch": 0.0, "loss_total": 0.48368293046951294, "step": 329 }, { "batch_size": 4, "epoch": 0.1316, "step": 329, "tokens_per_device": 5156 }, { "epoch": 0.1316, "loss_ce": 0.6620728373527527, "loss_lvr": 0.9034022092819214, "loss_mode_switch": 0.0, "loss_total": 0.7524130344390869, "step": 329 }, { "batch_size": 4, "epoch": 0.1316, "step": 329, "tokens_per_device": 15064 }, { "epoch": 0.1316, "loss_ce": 0.2537912428379059, "loss_lvr": 0.9914076924324036, "loss_mode_switch": 0.0, "loss_total": 0.35293200612068176, "step": 329 }, { "batch_size": 1, "epoch": 0.1316, "step": 329, "tokens_per_device": 5209 }, { "epoch": 0.1316, "loss_ce": 0.0032001270446926355, "loss_lvr": 0.5959175229072571, "loss_mode_switch": 0.0, "loss_total": 0.06279187649488449, "step": 329 }, { "batch_size": 1, "epoch": 0.1316, "step": 329, "tokens_per_device": 4875 }, { "epoch": 0.1316, "loss_ce": 0.4030933976173401, "loss_lvr": 0.488461971282959, "loss_mode_switch": 0.0, "loss_total": 0.45193958282470703, "step": 329 }, { "batch_size": 1, "epoch": 0.1316, "step": 329, "tokens_per_device": 5462 }, { "epoch": 0.1316, "loss_ce": 0.01098768599331379, "loss_lvr": 0.5986294150352478, "loss_mode_switch": 0.0, "loss_total": 0.07085062563419342, "step": 329 }, { "batch_size": 4, "epoch": 0.1316, "step": 329, "tokens_per_device": 2536 }, { "epoch": 0.1316, "loss_ce": 0.07903661578893661, "loss_lvr": 0.974079430103302, "loss_mode_switch": 0.0, "loss_total": 0.17644456028938293, "step": 329 }, { "batch_size": 4, "epoch": 0.1316, "step": 329, "tokens_per_device": 1284 }, { "epoch": 0.1316, "loss_ce": 0.7680126428604126, "loss_lvr": 1.2956182956695557, "loss_mode_switch": 0.0, "loss_total": 0.8975744843482971, "step": 329 }, { "epoch": 0.132, "grad_norm": 1.4583721160888672, "learning_rate": 9.729639418999524e-06, "loss": 0.3382, "step": 330 }, { "batch_size": 1, "epoch": 0.132, "step": 330, "tokens_per_device": 5583 }, { "epoch": 0.132, "loss_ce": 0.09761541336774826, "loss_lvr": 0.7826054692268372, "loss_mode_switch": 0.0, "loss_total": 0.1758759617805481, "step": 330 }, { "batch_size": 4, "epoch": 0.132, "step": 330, "tokens_per_device": 5128 }, { "epoch": 0.132, "loss_ce": 0.029046660289168358, "loss_lvr": 0.6883039474487305, "loss_mode_switch": 0.0, "loss_total": 0.09787705540657043, "step": 330 }, { "batch_size": 4, "epoch": 0.132, "step": 330, "tokens_per_device": 1668 }, { "epoch": 0.132, "loss_ce": 0.44413572549819946, "loss_lvr": 0.926944375038147, "loss_mode_switch": 0.0, "loss_total": 0.5368301868438721, "step": 330 }, { "batch_size": 4, "epoch": 0.132, "step": 330, "tokens_per_device": 10824 }, { "epoch": 0.132, "loss_ce": 0.4324214458465576, "loss_lvr": 0.9847306609153748, "loss_mode_switch": 0.0, "loss_total": 0.5308945178985596, "step": 330 }, { "batch_size": 4, "epoch": 0.132, "step": 330, "tokens_per_device": 14848 }, { "epoch": 0.132, "loss_ce": 0.37162667512893677, "loss_lvr": 1.3049249649047852, "loss_mode_switch": 0.0, "loss_total": 0.5021191835403442, "step": 330 }, { "batch_size": 4, "epoch": 0.132, "step": 330, "tokens_per_device": 11276 }, { "epoch": 0.132, "loss_ce": 0.12572096288204193, "loss_lvr": 0.9102018475532532, "loss_mode_switch": 0.0, "loss_total": 0.216741144657135, "step": 330 }, { "batch_size": 1, "epoch": 0.132, "step": 330, "tokens_per_device": 5107 }, { "epoch": 0.132, "loss_ce": 0.005910488776862621, "loss_lvr": 1.0081382989883423, "loss_mode_switch": 0.0, "loss_total": 0.106724314391613, "step": 330 }, { "batch_size": 4, "epoch": 0.132, "step": 330, "tokens_per_device": 5404 }, { "epoch": 0.132, "loss_ce": 0.19119098782539368, "loss_lvr": 0.9464623928070068, "loss_mode_switch": 0.0, "loss_total": 0.28583723306655884, "step": 330 }, { "epoch": 0.1324, "grad_norm": 1.8874025344848633, "learning_rate": 9.727534295180471e-06, "loss": 0.3278, "step": 331 }, { "batch_size": 4, "epoch": 0.1324, "step": 331, "tokens_per_device": 4872 }, { "epoch": 0.1324, "loss_ce": 0.5252668857574463, "loss_lvr": 0.9787937998771667, "loss_mode_switch": 0.0, "loss_total": 0.6231462955474854, "step": 331 }, { "batch_size": 4, "epoch": 0.1324, "step": 331, "tokens_per_device": 2620 }, { "epoch": 0.1324, "loss_ce": 0.06029072776436806, "loss_lvr": 0.8697654008865356, "loss_mode_switch": 0.0, "loss_total": 0.14726726710796356, "step": 331 }, { "batch_size": 4, "epoch": 0.1324, "step": 331, "tokens_per_device": 4816 }, { "epoch": 0.1324, "loss_ce": 0.03889862447977066, "loss_lvr": 1.0312597751617432, "loss_mode_switch": 0.0, "loss_total": 0.14202460646629333, "step": 331 }, { "batch_size": 4, "epoch": 0.1324, "step": 331, "tokens_per_device": 5752 }, { "epoch": 0.1324, "loss_ce": 0.34262827038764954, "loss_lvr": 1.0153971910476685, "loss_mode_switch": 0.0, "loss_total": 0.44416800141334534, "step": 331 }, { "batch_size": 4, "epoch": 0.1324, "step": 331, "tokens_per_device": 1496 }, { "epoch": 0.1324, "loss_ce": 0.5086063742637634, "loss_lvr": 0.9065598845481873, "loss_mode_switch": 0.0, "loss_total": 0.5992623567581177, "step": 331 }, { "batch_size": 1, "epoch": 0.1324, "step": 331, "tokens_per_device": 5103 }, { "epoch": 0.1324, "loss_ce": 0.7272047400474548, "loss_lvr": 0.3409276306629181, "loss_mode_switch": 0.0, "loss_total": 0.7612975239753723, "step": 331 }, { "batch_size": 4, "epoch": 0.1324, "step": 331, "tokens_per_device": 2548 }, { "epoch": 0.1324, "loss_ce": 0.16875411570072174, "loss_lvr": 1.5854929685592651, "loss_mode_switch": 0.0, "loss_total": 0.327303409576416, "step": 331 }, { "batch_size": 4, "epoch": 0.1324, "step": 331, "tokens_per_device": 2780 }, { "epoch": 0.1324, "loss_ce": 0.8414136171340942, "loss_lvr": 0.8732973337173462, "loss_mode_switch": 0.0, "loss_total": 0.9287433624267578, "step": 331 }, { "epoch": 0.1328, "grad_norm": 1.5394196510314941, "learning_rate": 9.725421237019957e-06, "loss": 0.34, "step": 332 }, { "batch_size": 4, "epoch": 0.1328, "step": 332, "tokens_per_device": 4612 }, { "epoch": 0.1328, "loss_ce": 0.45951220393180847, "loss_lvr": 1.0132043361663818, "loss_mode_switch": 0.0, "loss_total": 0.5608326196670532, "step": 332 }, { "batch_size": 4, "epoch": 0.1328, "step": 332, "tokens_per_device": 6028 }, { "epoch": 0.1328, "loss_ce": 0.05214443802833557, "loss_lvr": 0.913473904132843, "loss_mode_switch": 0.0, "loss_total": 0.14349183440208435, "step": 332 }, { "batch_size": 4, "epoch": 0.1328, "step": 332, "tokens_per_device": 4128 }, { "epoch": 0.1328, "loss_ce": 0.36085134744644165, "loss_lvr": 0.9246127009391785, "loss_mode_switch": 0.0, "loss_total": 0.45331263542175293, "step": 332 }, { "batch_size": 4, "epoch": 0.1328, "step": 332, "tokens_per_device": 6076 }, { "epoch": 0.1328, "loss_ce": 0.2730870246887207, "loss_lvr": 1.0198100805282593, "loss_mode_switch": 0.0, "loss_total": 0.3750680387020111, "step": 332 }, { "batch_size": 4, "epoch": 0.1328, "step": 332, "tokens_per_device": 1272 }, { "epoch": 0.1328, "loss_ce": 0.32105448842048645, "loss_lvr": 1.0979284048080444, "loss_mode_switch": 0.0, "loss_total": 0.4308473467826843, "step": 332 }, { "batch_size": 1, "epoch": 0.1328, "step": 332, "tokens_per_device": 6575 }, { "epoch": 0.1328, "loss_ce": 0.013534681871533394, "loss_lvr": 0.6057842969894409, "loss_mode_switch": 0.0, "loss_total": 0.07411311566829681, "step": 332 }, { "batch_size": 4, "epoch": 0.1328, "step": 332, "tokens_per_device": 4956 }, { "epoch": 0.1328, "loss_ce": 0.3405509293079376, "loss_lvr": 1.1814063787460327, "loss_mode_switch": 0.0, "loss_total": 0.4586915671825409, "step": 332 }, { "batch_size": 1, "epoch": 0.1328, "step": 332, "tokens_per_device": 4940 }, { "epoch": 0.1328, "loss_ce": 0.06822194904088974, "loss_lvr": 0.6877081394195557, "loss_mode_switch": 0.0, "loss_total": 0.13699276745319366, "step": 332 }, { "epoch": 0.1332, "grad_norm": 1.6273363828659058, "learning_rate": 9.72330024806438e-06, "loss": 0.3109, "step": 333 }, { "batch_size": 4, "epoch": 0.1332, "step": 333, "tokens_per_device": 4380 }, { "epoch": 0.1332, "loss_ce": 0.20075775682926178, "loss_lvr": 1.1691457033157349, "loss_mode_switch": 0.0, "loss_total": 0.3176723122596741, "step": 333 }, { "batch_size": 4, "epoch": 0.1332, "step": 333, "tokens_per_device": 7268 }, { "epoch": 0.1332, "loss_ce": 0.10345038026571274, "loss_lvr": 0.8340774774551392, "loss_mode_switch": 0.0, "loss_total": 0.186858132481575, "step": 333 }, { "batch_size": 4, "epoch": 0.1332, "step": 333, "tokens_per_device": 1796 }, { "epoch": 0.1332, "loss_ce": 0.4448580741882324, "loss_lvr": 0.923183262348175, "loss_mode_switch": 0.0, "loss_total": 0.5371763706207275, "step": 333 }, { "batch_size": 4, "epoch": 0.1332, "step": 333, "tokens_per_device": 4248 }, { "epoch": 0.1332, "loss_ce": 0.6995844841003418, "loss_lvr": 1.299023985862732, "loss_mode_switch": 0.0, "loss_total": 0.8294869065284729, "step": 333 }, { "batch_size": 4, "epoch": 0.1332, "step": 333, "tokens_per_device": 3788 }, { "epoch": 0.1332, "loss_ce": 0.07485619932413101, "loss_lvr": 1.234702467918396, "loss_mode_switch": 0.0, "loss_total": 0.19832643866539001, "step": 333 }, { "batch_size": 4, "epoch": 0.1332, "step": 333, "tokens_per_device": 5412 }, { "epoch": 0.1332, "loss_ce": 0.0733010470867157, "loss_lvr": 0.8877348303794861, "loss_mode_switch": 0.0, "loss_total": 0.16207453608512878, "step": 333 }, { "batch_size": 4, "epoch": 0.1332, "step": 333, "tokens_per_device": 5016 }, { "epoch": 0.1332, "loss_ce": 0.5131427645683289, "loss_lvr": 0.8940083384513855, "loss_mode_switch": 0.0, "loss_total": 0.6025435924530029, "step": 333 }, { "batch_size": 4, "epoch": 0.1332, "step": 333, "tokens_per_device": 6068 }, { "epoch": 0.1332, "loss_ce": 0.3843931257724762, "loss_lvr": 1.1898865699768066, "loss_mode_switch": 0.0, "loss_total": 0.5033817887306213, "step": 333 }, { "epoch": 0.1336, "grad_norm": 1.5089943408966064, "learning_rate": 9.721171331873452e-06, "loss": 0.3741, "step": 334 }, { "batch_size": 4, "epoch": 0.1336, "step": 334, "tokens_per_device": 3772 }, { "epoch": 0.1336, "loss_ce": 0.5435532927513123, "loss_lvr": 1.348662257194519, "loss_mode_switch": 0.0, "loss_total": 0.6784195303916931, "step": 334 }, { "batch_size": 4, "epoch": 0.1336, "step": 334, "tokens_per_device": 5116 }, { "epoch": 0.1336, "loss_ce": 0.8258609175682068, "loss_lvr": 0.9744411706924438, "loss_mode_switch": 0.0, "loss_total": 0.9233050346374512, "step": 334 }, { "batch_size": 4, "epoch": 0.1336, "step": 334, "tokens_per_device": 2112 }, { "epoch": 0.1336, "loss_ce": 0.26621320843696594, "loss_lvr": 1.3111852407455444, "loss_mode_switch": 0.0, "loss_total": 0.39733171463012695, "step": 334 }, { "batch_size": 4, "epoch": 0.1336, "step": 334, "tokens_per_device": 4308 }, { "epoch": 0.1336, "loss_ce": 0.37815597653388977, "loss_lvr": 0.9454870820045471, "loss_mode_switch": 0.0, "loss_total": 0.47270467877388, "step": 334 }, { "batch_size": 4, "epoch": 0.1336, "step": 334, "tokens_per_device": 4260 }, { "epoch": 0.1336, "loss_ce": 0.7155743837356567, "loss_lvr": 1.0413076877593994, "loss_mode_switch": 0.0, "loss_total": 0.8197051286697388, "step": 334 }, { "batch_size": 1, "epoch": 0.1336, "step": 334, "tokens_per_device": 4884 }, { "epoch": 0.1336, "loss_ce": 0.020540159195661545, "loss_lvr": 1.6364573240280151, "loss_mode_switch": 0.0, "loss_total": 0.18418589234352112, "step": 334 }, { "batch_size": 1, "epoch": 0.1336, "step": 334, "tokens_per_device": 4828 }, { "epoch": 0.1336, "loss_ce": 0.001603669486939907, "loss_lvr": 0.4052271246910095, "loss_mode_switch": 0.0, "loss_total": 0.042126383632421494, "step": 334 }, { "batch_size": 4, "epoch": 0.1336, "step": 334, "tokens_per_device": 12548 }, { "epoch": 0.1336, "loss_ce": 1.1135941743850708, "loss_lvr": 0.9637138843536377, "loss_mode_switch": 0.0, "loss_total": 1.2099655866622925, "step": 334 }, { "epoch": 0.134, "grad_norm": 1.3431892395019531, "learning_rate": 9.719034492020183e-06, "loss": 0.3528, "step": 335 }, { "batch_size": 1, "epoch": 0.134, "step": 335, "tokens_per_device": 5175 }, { "epoch": 0.134, "loss_ce": 0.4624539613723755, "loss_lvr": 0.5804670453071594, "loss_mode_switch": 0.0, "loss_total": 0.520500659942627, "step": 335 }, { "batch_size": 1, "epoch": 0.134, "step": 335, "tokens_per_device": 4777 }, { "epoch": 0.134, "loss_ce": 0.2239900529384613, "loss_lvr": 0.3782729506492615, "loss_mode_switch": 0.0, "loss_total": 0.2618173360824585, "step": 335 }, { "batch_size": 4, "epoch": 0.134, "step": 335, "tokens_per_device": 4232 }, { "epoch": 0.134, "loss_ce": 0.008596522733569145, "loss_lvr": 1.279449224472046, "loss_mode_switch": 0.0, "loss_total": 0.1365414559841156, "step": 335 }, { "batch_size": 4, "epoch": 0.134, "step": 335, "tokens_per_device": 3752 }, { "epoch": 0.134, "loss_ce": 0.19182196259498596, "loss_lvr": 1.1360222101211548, "loss_mode_switch": 0.0, "loss_total": 0.30542418360710144, "step": 335 }, { "batch_size": 1, "epoch": 0.134, "step": 335, "tokens_per_device": 5125 }, { "epoch": 0.134, "loss_ce": 0.002632105490192771, "loss_lvr": 0.809436023235321, "loss_mode_switch": 0.0, "loss_total": 0.08357571065425873, "step": 335 }, { "batch_size": 4, "epoch": 0.134, "step": 335, "tokens_per_device": 3804 }, { "epoch": 0.134, "loss_ce": 0.15802140533924103, "loss_lvr": 1.1148738861083984, "loss_mode_switch": 0.0, "loss_total": 0.2695087790489197, "step": 335 }, { "batch_size": 4, "epoch": 0.134, "step": 335, "tokens_per_device": 2648 }, { "epoch": 0.134, "loss_ce": 0.5624975562095642, "loss_lvr": 0.9874665141105652, "loss_mode_switch": 0.0, "loss_total": 0.6612442135810852, "step": 335 }, { "batch_size": 4, "epoch": 0.134, "step": 335, "tokens_per_device": 4360 }, { "epoch": 0.134, "loss_ce": 0.24395079910755157, "loss_lvr": 0.9201207160949707, "loss_mode_switch": 0.0, "loss_total": 0.33596286177635193, "step": 335 }, { "epoch": 0.1344, "grad_norm": 1.6348261833190918, "learning_rate": 9.71688973209089e-06, "loss": 0.3501, "step": 336 }, { "batch_size": 4, "epoch": 0.1344, "step": 336, "tokens_per_device": 3820 }, { "epoch": 0.1344, "loss_ce": 0.16413307189941406, "loss_lvr": 1.7204864025115967, "loss_mode_switch": 0.0, "loss_total": 0.3361817002296448, "step": 336 }, { "batch_size": 4, "epoch": 0.1344, "step": 336, "tokens_per_device": 14488 }, { "epoch": 0.1344, "loss_ce": 0.10723934322595596, "loss_lvr": 0.6692519187927246, "loss_mode_switch": 0.0, "loss_total": 0.1741645336151123, "step": 336 }, { "batch_size": 4, "epoch": 0.1344, "step": 336, "tokens_per_device": 5940 }, { "epoch": 0.1344, "loss_ce": 0.11493273824453354, "loss_lvr": 1.2099082469940186, "loss_mode_switch": 0.0, "loss_total": 0.23592355847358704, "step": 336 }, { "batch_size": 4, "epoch": 0.1344, "step": 336, "tokens_per_device": 3960 }, { "epoch": 0.1344, "loss_ce": 0.04089543968439102, "loss_lvr": 1.3102717399597168, "loss_mode_switch": 0.0, "loss_total": 0.17192262411117554, "step": 336 }, { "batch_size": 4, "epoch": 0.1344, "step": 336, "tokens_per_device": 3328 }, { "epoch": 0.1344, "loss_ce": 0.17735743522644043, "loss_lvr": 1.237207293510437, "loss_mode_switch": 0.0, "loss_total": 0.3010781705379486, "step": 336 }, { "batch_size": 4, "epoch": 0.1344, "step": 336, "tokens_per_device": 6944 }, { "epoch": 0.1344, "loss_ce": 0.04474356770515442, "loss_lvr": 0.8140453100204468, "loss_mode_switch": 0.0, "loss_total": 0.12614810466766357, "step": 336 }, { "batch_size": 4, "epoch": 0.1344, "step": 336, "tokens_per_device": 4460 }, { "epoch": 0.1344, "loss_ce": 0.15975558757781982, "loss_lvr": 1.1981408596038818, "loss_mode_switch": 0.0, "loss_total": 0.27956968545913696, "step": 336 }, { "batch_size": 4, "epoch": 0.1344, "step": 336, "tokens_per_device": 6176 }, { "epoch": 0.1344, "loss_ce": 0.24070486426353455, "loss_lvr": 0.9967554807662964, "loss_mode_switch": 0.0, "loss_total": 0.3403804302215576, "step": 336 }, { "epoch": 0.1348, "grad_norm": 1.5359421968460083, "learning_rate": 9.714737055685176e-06, "loss": 0.3616, "step": 337 }, { "batch_size": 1, "epoch": 0.1348, "step": 337, "tokens_per_device": 5133 }, { "epoch": 0.1348, "loss_ce": 0.025291364639997482, "loss_lvr": 0.9541181325912476, "loss_mode_switch": 0.0, "loss_total": 0.12070317566394806, "step": 337 }, { "batch_size": 1, "epoch": 0.1348, "step": 337, "tokens_per_device": 5064 }, { "epoch": 0.1348, "loss_ce": 0.0064024971798062325, "loss_lvr": 0.2867855131626129, "loss_mode_switch": 0.0, "loss_total": 0.03508104756474495, "step": 337 }, { "batch_size": 4, "epoch": 0.1348, "step": 337, "tokens_per_device": 4004 }, { "epoch": 0.1348, "loss_ce": 0.1096259132027626, "loss_lvr": 0.8529234528541565, "loss_mode_switch": 0.0, "loss_total": 0.19491825997829437, "step": 337 }, { "batch_size": 4, "epoch": 0.1348, "step": 337, "tokens_per_device": 4496 }, { "epoch": 0.1348, "loss_ce": 0.27167174220085144, "loss_lvr": 1.160231113433838, "loss_mode_switch": 0.0, "loss_total": 0.3876948654651642, "step": 337 }, { "batch_size": 4, "epoch": 0.1348, "step": 337, "tokens_per_device": 3484 }, { "epoch": 0.1348, "loss_ce": 0.4312739968299866, "loss_lvr": 1.3285009860992432, "loss_mode_switch": 0.0, "loss_total": 0.5641241073608398, "step": 337 }, { "batch_size": 4, "epoch": 0.1348, "step": 337, "tokens_per_device": 3760 }, { "epoch": 0.1348, "loss_ce": 0.29786157608032227, "loss_lvr": 1.2197951078414917, "loss_mode_switch": 0.0, "loss_total": 0.41984108090400696, "step": 337 }, { "batch_size": 4, "epoch": 0.1348, "step": 337, "tokens_per_device": 9588 }, { "epoch": 0.1348, "loss_ce": 0.3497493267059326, "loss_lvr": 1.3017903566360474, "loss_mode_switch": 0.0, "loss_total": 0.4799283742904663, "step": 337 }, { "batch_size": 4, "epoch": 0.1348, "step": 337, "tokens_per_device": 4300 }, { "epoch": 0.1348, "loss_ce": 0.6366587281227112, "loss_lvr": 1.0764217376708984, "loss_mode_switch": 0.0, "loss_total": 0.744300901889801, "step": 337 }, { "epoch": 0.1352, "grad_norm": 1.621329665184021, "learning_rate": 9.712576466415935e-06, "loss": 0.3668, "step": 338 }, { "batch_size": 4, "epoch": 0.1352, "step": 338, "tokens_per_device": 4448 }, { "epoch": 0.1352, "loss_ce": 0.10659125447273254, "loss_lvr": 1.0706934928894043, "loss_mode_switch": 0.0, "loss_total": 0.2136605978012085, "step": 338 }, { "batch_size": 1, "epoch": 0.1352, "step": 338, "tokens_per_device": 5118 }, { "epoch": 0.1352, "loss_ce": 0.0032545928843319416, "loss_lvr": 0.7170109152793884, "loss_mode_switch": 0.0, "loss_total": 0.07495568692684174, "step": 338 }, { "batch_size": 4, "epoch": 0.1352, "step": 338, "tokens_per_device": 3928 }, { "epoch": 0.1352, "loss_ce": 0.7499417066574097, "loss_lvr": 2.2939658164978027, "loss_mode_switch": 0.0, "loss_total": 0.9793382883071899, "step": 338 }, { "batch_size": 4, "epoch": 0.1352, "step": 338, "tokens_per_device": 3908 }, { "epoch": 0.1352, "loss_ce": 0.06927407532930374, "loss_lvr": 0.965730607509613, "loss_mode_switch": 0.0, "loss_total": 0.16584713757038116, "step": 338 }, { "batch_size": 1, "epoch": 0.1352, "step": 338, "tokens_per_device": 5234 }, { "epoch": 0.1352, "loss_ce": 0.09852343052625656, "loss_lvr": 0.49727776646614075, "loss_mode_switch": 0.0, "loss_total": 0.14825120568275452, "step": 338 }, { "batch_size": 4, "epoch": 0.1352, "step": 338, "tokens_per_device": 8424 }, { "epoch": 0.1352, "loss_ce": 0.3430904150009155, "loss_lvr": 0.8388459086418152, "loss_mode_switch": 0.0, "loss_total": 0.4269750118255615, "step": 338 }, { "batch_size": 4, "epoch": 0.1352, "step": 338, "tokens_per_device": 1424 }, { "epoch": 0.1352, "loss_ce": 0.2759658992290497, "loss_lvr": 1.252942442893982, "loss_mode_switch": 0.0, "loss_total": 0.4012601375579834, "step": 338 }, { "batch_size": 4, "epoch": 0.1352, "step": 338, "tokens_per_device": 7660 }, { "epoch": 0.1352, "loss_ce": 0.5471052527427673, "loss_lvr": 0.8788697123527527, "loss_mode_switch": 0.0, "loss_total": 0.634992241859436, "step": 338 }, { "epoch": 0.1356, "grad_norm": 1.668126106262207, "learning_rate": 9.710407967909336e-06, "loss": 0.3201, "step": 339 }, { "batch_size": 4, "epoch": 0.1356, "step": 339, "tokens_per_device": 1732 }, { "epoch": 0.1356, "loss_ce": 0.04459073022007942, "loss_lvr": 1.0793532133102417, "loss_mode_switch": 0.0, "loss_total": 0.15252605080604553, "step": 339 }, { "batch_size": 1, "epoch": 0.1356, "step": 339, "tokens_per_device": 4868 }, { "epoch": 0.1356, "loss_ce": 0.11165235191583633, "loss_lvr": 0.3531650900840759, "loss_mode_switch": 0.0, "loss_total": 0.14696885645389557, "step": 339 }, { "batch_size": 1, "epoch": 0.1356, "step": 339, "tokens_per_device": 4859 }, { "epoch": 0.1356, "loss_ce": 0.011444887146353722, "loss_lvr": 0.427213191986084, "loss_mode_switch": 0.0, "loss_total": 0.05416620522737503, "step": 339 }, { "batch_size": 1, "epoch": 0.1356, "step": 339, "tokens_per_device": 5075 }, { "epoch": 0.1356, "loss_ce": 0.07398883253335953, "loss_lvr": 0.925358772277832, "loss_mode_switch": 0.0, "loss_total": 0.1665247082710266, "step": 339 }, { "batch_size": 1, "epoch": 0.1356, "step": 339, "tokens_per_device": 5150 }, { "epoch": 0.1356, "loss_ce": 0.22650550305843353, "loss_lvr": 0.5415854454040527, "loss_mode_switch": 0.0, "loss_total": 0.2806640565395355, "step": 339 }, { "batch_size": 4, "epoch": 0.1356, "step": 339, "tokens_per_device": 4216 }, { "epoch": 0.1356, "loss_ce": 0.12813447415828705, "loss_lvr": 1.0641902685165405, "loss_mode_switch": 0.0, "loss_total": 0.2345535010099411, "step": 339 }, { "batch_size": 4, "epoch": 0.1356, "step": 339, "tokens_per_device": 4228 }, { "epoch": 0.1356, "loss_ce": 0.1299801766872406, "loss_lvr": 0.5266320109367371, "loss_mode_switch": 0.0, "loss_total": 0.18264338374137878, "step": 339 }, { "batch_size": 1, "epoch": 0.1356, "step": 339, "tokens_per_device": 5149 }, { "epoch": 0.1356, "loss_ce": 0.007658602669835091, "loss_lvr": 0.5727499723434448, "loss_mode_switch": 0.0, "loss_total": 0.06493359804153442, "step": 339 }, { "epoch": 0.136, "grad_norm": 1.4661600589752197, "learning_rate": 9.708231563804828e-06, "loss": 0.3233, "step": 340 }, { "batch_size": 4, "epoch": 0.136, "step": 340, "tokens_per_device": 10052 }, { "epoch": 0.136, "loss_ce": 0.039576247334480286, "loss_lvr": 0.8073220252990723, "loss_mode_switch": 0.0, "loss_total": 0.12030845135450363, "step": 340 }, { "batch_size": 4, "epoch": 0.136, "step": 340, "tokens_per_device": 2704 }, { "epoch": 0.136, "loss_ce": 0.49236956238746643, "loss_lvr": 0.7392624020576477, "loss_mode_switch": 0.0, "loss_total": 0.5662958025932312, "step": 340 }, { "batch_size": 4, "epoch": 0.136, "step": 340, "tokens_per_device": 4316 }, { "epoch": 0.136, "loss_ce": 0.16517698764801025, "loss_lvr": 0.9565781950950623, "loss_mode_switch": 0.0, "loss_total": 0.26083481311798096, "step": 340 }, { "batch_size": 4, "epoch": 0.136, "step": 340, "tokens_per_device": 4264 }, { "epoch": 0.136, "loss_ce": 0.5732089877128601, "loss_lvr": 1.1376577615737915, "loss_mode_switch": 0.0, "loss_total": 0.6869747638702393, "step": 340 }, { "batch_size": 4, "epoch": 0.136, "step": 340, "tokens_per_device": 4728 }, { "epoch": 0.136, "loss_ce": 0.19409900903701782, "loss_lvr": 0.7947728633880615, "loss_mode_switch": 0.0, "loss_total": 0.2735762894153595, "step": 340 }, { "batch_size": 4, "epoch": 0.136, "step": 340, "tokens_per_device": 4284 }, { "epoch": 0.136, "loss_ce": 0.21053674817085266, "loss_lvr": 0.8390436768531799, "loss_mode_switch": 0.0, "loss_total": 0.2944411039352417, "step": 340 }, { "batch_size": 4, "epoch": 0.136, "step": 340, "tokens_per_device": 1544 }, { "epoch": 0.136, "loss_ce": 0.5166521072387695, "loss_lvr": 1.097258448600769, "loss_mode_switch": 0.0, "loss_total": 0.6263779401779175, "step": 340 }, { "batch_size": 1, "epoch": 0.136, "step": 340, "tokens_per_device": 4886 }, { "epoch": 0.136, "loss_ce": 0.10175967961549759, "loss_lvr": 1.2701495885849, "loss_mode_switch": 0.0, "loss_total": 0.22877463698387146, "step": 340 }, { "epoch": 0.1364, "grad_norm": 1.6859822273254395, "learning_rate": 9.706047257755124e-06, "loss": 0.3688, "step": 341 }, { "batch_size": 4, "epoch": 0.1364, "step": 341, "tokens_per_device": 15520 }, { "epoch": 0.1364, "loss_ce": 0.252411425113678, "loss_lvr": 0.8587133288383484, "loss_mode_switch": 0.0, "loss_total": 0.3382827639579773, "step": 341 }, { "batch_size": 1, "epoch": 0.1364, "step": 341, "tokens_per_device": 4887 }, { "epoch": 0.1364, "loss_ce": 0.035377077758312225, "loss_lvr": 0.9242126941680908, "loss_mode_switch": 0.0, "loss_total": 0.12779834866523743, "step": 341 }, { "batch_size": 4, "epoch": 0.1364, "step": 341, "tokens_per_device": 2720 }, { "epoch": 0.1364, "loss_ce": 0.19345465302467346, "loss_lvr": 0.8919506072998047, "loss_mode_switch": 0.0, "loss_total": 0.2826497256755829, "step": 341 }, { "batch_size": 4, "epoch": 0.1364, "step": 341, "tokens_per_device": 11944 }, { "epoch": 0.1364, "loss_ce": 0.31394022703170776, "loss_lvr": 0.8466386795043945, "loss_mode_switch": 0.0, "loss_total": 0.3986040949821472, "step": 341 }, { "batch_size": 4, "epoch": 0.1364, "step": 341, "tokens_per_device": 1424 }, { "epoch": 0.1364, "loss_ce": 0.4522591829299927, "loss_lvr": 1.0364820957183838, "loss_mode_switch": 0.0, "loss_total": 0.5559073686599731, "step": 341 }, { "batch_size": 1, "epoch": 0.1364, "step": 341, "tokens_per_device": 4973 }, { "epoch": 0.1364, "loss_ce": 0.3683275282382965, "loss_lvr": 0.45626404881477356, "loss_mode_switch": 0.0, "loss_total": 0.4139539301395416, "step": 341 }, { "batch_size": 4, "epoch": 0.1364, "step": 341, "tokens_per_device": 3792 }, { "epoch": 0.1364, "loss_ce": 0.1291356086730957, "loss_lvr": 0.7203459739685059, "loss_mode_switch": 0.0, "loss_total": 0.2011702060699463, "step": 341 }, { "batch_size": 1, "epoch": 0.1364, "step": 341, "tokens_per_device": 4978 }, { "epoch": 0.1364, "loss_ce": 0.3280041515827179, "loss_lvr": 0.4753386974334717, "loss_mode_switch": 0.0, "loss_total": 0.37553802132606506, "step": 341 }, { "epoch": 0.1368, "grad_norm": 1.376504898071289, "learning_rate": 9.703855053426202e-06, "loss": 0.3147, "step": 342 }, { "batch_size": 1, "epoch": 0.1368, "step": 342, "tokens_per_device": 4864 }, { "epoch": 0.1368, "loss_ce": 0.17596422135829926, "loss_lvr": 0.8568013310432434, "loss_mode_switch": 0.0, "loss_total": 0.2616443634033203, "step": 342 }, { "batch_size": 4, "epoch": 0.1368, "step": 342, "tokens_per_device": 4148 }, { "epoch": 0.1368, "loss_ce": 0.4561172127723694, "loss_lvr": 0.8971259593963623, "loss_mode_switch": 0.0, "loss_total": 0.5458298325538635, "step": 342 }, { "batch_size": 4, "epoch": 0.1368, "step": 342, "tokens_per_device": 4544 }, { "epoch": 0.1368, "loss_ce": 0.3231925964355469, "loss_lvr": 0.9811719059944153, "loss_mode_switch": 0.0, "loss_total": 0.42130979895591736, "step": 342 }, { "batch_size": 4, "epoch": 0.1368, "step": 342, "tokens_per_device": 3740 }, { "epoch": 0.1368, "loss_ce": 0.5279861688613892, "loss_lvr": 1.1355050802230835, "loss_mode_switch": 0.0, "loss_total": 0.6415366530418396, "step": 342 }, { "batch_size": 1, "epoch": 0.1368, "step": 342, "tokens_per_device": 5110 }, { "epoch": 0.1368, "loss_ce": 0.003032782580703497, "loss_lvr": 0.3748120963573456, "loss_mode_switch": 0.0, "loss_total": 0.04051399230957031, "step": 342 }, { "batch_size": 4, "epoch": 0.1368, "step": 342, "tokens_per_device": 1336 }, { "epoch": 0.1368, "loss_ce": 0.3351143002510071, "loss_lvr": 1.0898923873901367, "loss_mode_switch": 0.0, "loss_total": 0.44410353899002075, "step": 342 }, { "batch_size": 4, "epoch": 0.1368, "step": 342, "tokens_per_device": 5668 }, { "epoch": 0.1368, "loss_ce": 0.20399217307567596, "loss_lvr": 0.8554058074951172, "loss_mode_switch": 0.0, "loss_total": 0.28953275084495544, "step": 342 }, { "batch_size": 4, "epoch": 0.1368, "step": 342, "tokens_per_device": 5720 }, { "epoch": 0.1368, "loss_ce": 0.20201121270656586, "loss_lvr": 1.1567308902740479, "loss_mode_switch": 0.0, "loss_total": 0.3176842927932739, "step": 342 }, { "epoch": 0.1372, "grad_norm": 1.4028770923614502, "learning_rate": 9.701654954497294e-06, "loss": 0.3277, "step": 343 }, { "batch_size": 4, "epoch": 0.1372, "step": 343, "tokens_per_device": 1524 }, { "epoch": 0.1372, "loss_ce": 0.34001368284225464, "loss_lvr": 1.0866018533706665, "loss_mode_switch": 0.0, "loss_total": 0.44867387413978577, "step": 343 }, { "batch_size": 4, "epoch": 0.1372, "step": 343, "tokens_per_device": 3904 }, { "epoch": 0.1372, "loss_ce": 0.07970749586820602, "loss_lvr": 1.1290217638015747, "loss_mode_switch": 0.0, "loss_total": 0.19260966777801514, "step": 343 }, { "batch_size": 4, "epoch": 0.1372, "step": 343, "tokens_per_device": 2908 }, { "epoch": 0.1372, "loss_ce": 0.3528155982494354, "loss_lvr": 0.9642813205718994, "loss_mode_switch": 0.0, "loss_total": 0.4492437243461609, "step": 343 }, { "batch_size": 4, "epoch": 0.1372, "step": 343, "tokens_per_device": 4948 }, { "epoch": 0.1372, "loss_ce": 0.6028176546096802, "loss_lvr": 0.9681093096733093, "loss_mode_switch": 0.0, "loss_total": 0.6996285915374756, "step": 343 }, { "batch_size": 1, "epoch": 0.1372, "step": 343, "tokens_per_device": 5109 }, { "epoch": 0.1372, "loss_ce": 0.014079366810619831, "loss_lvr": 0.818157970905304, "loss_mode_switch": 0.0, "loss_total": 0.09589517116546631, "step": 343 }, { "batch_size": 4, "epoch": 0.1372, "step": 343, "tokens_per_device": 5840 }, { "epoch": 0.1372, "loss_ce": 0.27306076884269714, "loss_lvr": 1.4604140520095825, "loss_mode_switch": 0.0, "loss_total": 0.41910219192504883, "step": 343 }, { "batch_size": 4, "epoch": 0.1372, "step": 343, "tokens_per_device": 2684 }, { "epoch": 0.1372, "loss_ce": 0.3694804906845093, "loss_lvr": 0.8076186180114746, "loss_mode_switch": 0.0, "loss_total": 0.4502423405647278, "step": 343 }, { "batch_size": 4, "epoch": 0.1372, "step": 343, "tokens_per_device": 2728 }, { "epoch": 0.1372, "loss_ce": 0.4610940217971802, "loss_lvr": 1.0983155965805054, "loss_mode_switch": 0.0, "loss_total": 0.5709255933761597, "step": 343 }, { "epoch": 0.1376, "grad_norm": 1.4096225500106812, "learning_rate": 9.699446964660882e-06, "loss": 0.3527, "step": 344 }, { "batch_size": 1, "epoch": 0.1376, "step": 344, "tokens_per_device": 4894 }, { "epoch": 0.1376, "loss_ce": 0.24175462126731873, "loss_lvr": 0.21682943403720856, "loss_mode_switch": 0.0, "loss_total": 0.26343756914138794, "step": 344 }, { "batch_size": 4, "epoch": 0.1376, "step": 344, "tokens_per_device": 1348 }, { "epoch": 0.1376, "loss_ce": 0.4975872337818146, "loss_lvr": 1.2553220987319946, "loss_mode_switch": 0.0, "loss_total": 0.6231194734573364, "step": 344 }, { "batch_size": 4, "epoch": 0.1376, "step": 344, "tokens_per_device": 3464 }, { "epoch": 0.1376, "loss_ce": 0.6429182887077332, "loss_lvr": 0.6783955097198486, "loss_mode_switch": 0.0, "loss_total": 0.710757851600647, "step": 344 }, { "batch_size": 4, "epoch": 0.1376, "step": 344, "tokens_per_device": 9024 }, { "epoch": 0.1376, "loss_ce": 0.028689850121736526, "loss_lvr": 1.0756539106369019, "loss_mode_switch": 0.0, "loss_total": 0.13625524938106537, "step": 344 }, { "batch_size": 4, "epoch": 0.1376, "step": 344, "tokens_per_device": 4356 }, { "epoch": 0.1376, "loss_ce": 0.01807568036019802, "loss_lvr": 1.371671438217163, "loss_mode_switch": 0.0, "loss_total": 0.15524281561374664, "step": 344 }, { "batch_size": 1, "epoch": 0.1376, "step": 344, "tokens_per_device": 4872 }, { "epoch": 0.1376, "loss_ce": 0.0424630343914032, "loss_lvr": 0.4163399636745453, "loss_mode_switch": 0.0, "loss_total": 0.08409702777862549, "step": 344 }, { "batch_size": 4, "epoch": 0.1376, "step": 344, "tokens_per_device": 1448 }, { "epoch": 0.1376, "loss_ce": 0.3090309500694275, "loss_lvr": 1.1200459003448486, "loss_mode_switch": 0.0, "loss_total": 0.4210355281829834, "step": 344 }, { "batch_size": 4, "epoch": 0.1376, "step": 344, "tokens_per_device": 5852 }, { "epoch": 0.1376, "loss_ce": 0.2594207227230072, "loss_lvr": 1.0100460052490234, "loss_mode_switch": 0.0, "loss_total": 0.36042532324790955, "step": 344 }, { "epoch": 0.138, "grad_norm": 1.7007917165756226, "learning_rate": 9.697231087622691e-06, "loss": 0.3663, "step": 345 }, { "batch_size": 4, "epoch": 0.138, "step": 345, "tokens_per_device": 4276 }, { "epoch": 0.138, "loss_ce": 0.227227121591568, "loss_lvr": 0.7651155591011047, "loss_mode_switch": 0.0, "loss_total": 0.30373868346214294, "step": 345 }, { "batch_size": 4, "epoch": 0.138, "step": 345, "tokens_per_device": 5216 }, { "epoch": 0.138, "loss_ce": 0.27900487184524536, "loss_lvr": 0.9590309858322144, "loss_mode_switch": 0.0, "loss_total": 0.3749079704284668, "step": 345 }, { "batch_size": 4, "epoch": 0.138, "step": 345, "tokens_per_device": 7272 }, { "epoch": 0.138, "loss_ce": 0.30325499176979065, "loss_lvr": 0.7912468910217285, "loss_mode_switch": 0.0, "loss_total": 0.3823796808719635, "step": 345 }, { "batch_size": 4, "epoch": 0.138, "step": 345, "tokens_per_device": 2664 }, { "epoch": 0.138, "loss_ce": 0.14397844672203064, "loss_lvr": 1.044672966003418, "loss_mode_switch": 0.0, "loss_total": 0.24844574928283691, "step": 345 }, { "batch_size": 4, "epoch": 0.138, "step": 345, "tokens_per_device": 5852 }, { "epoch": 0.138, "loss_ce": 0.13615922629833221, "loss_lvr": 1.062852382659912, "loss_mode_switch": 0.0, "loss_total": 0.2424444556236267, "step": 345 }, { "batch_size": 4, "epoch": 0.138, "step": 345, "tokens_per_device": 2656 }, { "epoch": 0.138, "loss_ce": 0.27722370624542236, "loss_lvr": 0.9593098759651184, "loss_mode_switch": 0.0, "loss_total": 0.3731546998023987, "step": 345 }, { "batch_size": 1, "epoch": 0.138, "step": 345, "tokens_per_device": 4665 }, { "epoch": 0.138, "loss_ce": 0.0532379150390625, "loss_lvr": 0.5690824389457703, "loss_mode_switch": 0.0, "loss_total": 0.110146164894104, "step": 345 }, { "batch_size": 4, "epoch": 0.138, "step": 345, "tokens_per_device": 4832 }, { "epoch": 0.138, "loss_ce": 0.17373895645141602, "loss_lvr": 0.9995779395103455, "loss_mode_switch": 0.0, "loss_total": 0.27369675040245056, "step": 345 }, { "epoch": 0.1384, "grad_norm": 1.5374761819839478, "learning_rate": 9.695007327101685e-06, "loss": 0.3663, "step": 346 }, { "batch_size": 4, "epoch": 0.1384, "step": 346, "tokens_per_device": 14424 }, { "epoch": 0.1384, "loss_ce": 0.06315107643604279, "loss_lvr": 0.9791688323020935, "loss_mode_switch": 0.0, "loss_total": 0.16106796264648438, "step": 346 }, { "batch_size": 4, "epoch": 0.1384, "step": 346, "tokens_per_device": 4240 }, { "epoch": 0.1384, "loss_ce": 0.27831149101257324, "loss_lvr": 0.9898369908332825, "loss_mode_switch": 0.0, "loss_total": 0.37729519605636597, "step": 346 }, { "batch_size": 1, "epoch": 0.1384, "step": 346, "tokens_per_device": 5918 }, { "epoch": 0.1384, "loss_ce": 0.0020129489712417126, "loss_lvr": 0.45818328857421875, "loss_mode_switch": 0.0, "loss_total": 0.047831278294324875, "step": 346 }, { "batch_size": 4, "epoch": 0.1384, "step": 346, "tokens_per_device": 4312 }, { "epoch": 0.1384, "loss_ce": 0.044318120926618576, "loss_lvr": 0.8449575304985046, "loss_mode_switch": 0.0, "loss_total": 0.12881387770175934, "step": 346 }, { "batch_size": 4, "epoch": 0.1384, "step": 346, "tokens_per_device": 1856 }, { "epoch": 0.1384, "loss_ce": 0.015186781994998455, "loss_lvr": 1.0167286396026611, "loss_mode_switch": 0.0, "loss_total": 0.11685964465141296, "step": 346 }, { "batch_size": 4, "epoch": 0.1384, "step": 346, "tokens_per_device": 3888 }, { "epoch": 0.1384, "loss_ce": 0.6411693096160889, "loss_lvr": 0.9400085806846619, "loss_mode_switch": 0.0, "loss_total": 0.7351701855659485, "step": 346 }, { "batch_size": 4, "epoch": 0.1384, "step": 346, "tokens_per_device": 2664 }, { "epoch": 0.1384, "loss_ce": 0.6418445110321045, "loss_lvr": 1.1646751165390015, "loss_mode_switch": 0.0, "loss_total": 0.7583120465278625, "step": 346 }, { "batch_size": 4, "epoch": 0.1384, "step": 346, "tokens_per_device": 3784 }, { "epoch": 0.1384, "loss_ce": 0.3634702265262604, "loss_lvr": 1.0154772996902466, "loss_mode_switch": 0.0, "loss_total": 0.46501797437667847, "step": 346 }, { "epoch": 0.1388, "grad_norm": 1.3784370422363281, "learning_rate": 9.692775686830057e-06, "loss": 0.3123, "step": 347 }, { "batch_size": 1, "epoch": 0.1388, "step": 347, "tokens_per_device": 5196 }, { "epoch": 0.1388, "loss_ce": 0.05625062435865402, "loss_lvr": 0.5927525758743286, "loss_mode_switch": 0.0, "loss_total": 0.11552588641643524, "step": 347 }, { "batch_size": 1, "epoch": 0.1388, "step": 347, "tokens_per_device": 4885 }, { "epoch": 0.1388, "loss_ce": 0.05116888880729675, "loss_lvr": 0.9020362496376038, "loss_mode_switch": 0.0, "loss_total": 0.14137251675128937, "step": 347 }, { "batch_size": 4, "epoch": 0.1388, "step": 347, "tokens_per_device": 2768 }, { "epoch": 0.1388, "loss_ce": 0.037520624697208405, "loss_lvr": 0.5622093081474304, "loss_mode_switch": 0.0, "loss_total": 0.09374155104160309, "step": 347 }, { "batch_size": 4, "epoch": 0.1388, "step": 347, "tokens_per_device": 3952 }, { "epoch": 0.1388, "loss_ce": 1.3804274797439575, "loss_lvr": 1.3199187517166138, "loss_mode_switch": 0.0, "loss_total": 1.51241934299469, "step": 347 }, { "batch_size": 4, "epoch": 0.1388, "step": 347, "tokens_per_device": 4044 }, { "epoch": 0.1388, "loss_ce": 0.1783190667629242, "loss_lvr": 0.9344997406005859, "loss_mode_switch": 0.0, "loss_total": 0.27176904678344727, "step": 347 }, { "batch_size": 4, "epoch": 0.1388, "step": 347, "tokens_per_device": 4808 }, { "epoch": 0.1388, "loss_ce": 0.14458155632019043, "loss_lvr": 0.8279451727867126, "loss_mode_switch": 0.0, "loss_total": 0.2273760735988617, "step": 347 }, { "batch_size": 1, "epoch": 0.1388, "step": 347, "tokens_per_device": 5159 }, { "epoch": 0.1388, "loss_ce": 0.0017681324388831854, "loss_lvr": 0.4959860146045685, "loss_mode_switch": 0.0, "loss_total": 0.05136673152446747, "step": 347 }, { "batch_size": 4, "epoch": 0.1388, "step": 347, "tokens_per_device": 4376 }, { "epoch": 0.1388, "loss_ce": 0.3785942494869232, "loss_lvr": 1.0695363283157349, "loss_mode_switch": 0.0, "loss_total": 0.48554790019989014, "step": 347 }, { "epoch": 0.1392, "grad_norm": 1.4335486888885498, "learning_rate": 9.690536170553226e-06, "loss": 0.3362, "step": 348 }, { "batch_size": 1, "epoch": 0.1392, "step": 348, "tokens_per_device": 4877 }, { "epoch": 0.1392, "loss_ce": 0.1354801505804062, "loss_lvr": 0.8280584216117859, "loss_mode_switch": 0.0, "loss_total": 0.21828599274158478, "step": 348 }, { "batch_size": 4, "epoch": 0.1392, "step": 348, "tokens_per_device": 1512 }, { "epoch": 0.1392, "loss_ce": 0.38628119230270386, "loss_lvr": 1.0681860446929932, "loss_mode_switch": 0.0, "loss_total": 0.49309980869293213, "step": 348 }, { "batch_size": 4, "epoch": 0.1392, "step": 348, "tokens_per_device": 3280 }, { "epoch": 0.1392, "loss_ce": 0.15723897516727448, "loss_lvr": 0.6699647903442383, "loss_mode_switch": 0.0, "loss_total": 0.2242354452610016, "step": 348 }, { "batch_size": 1, "epoch": 0.1392, "step": 348, "tokens_per_device": 5001 }, { "epoch": 0.1392, "loss_ce": 0.12224576622247696, "loss_lvr": 0.8908125758171082, "loss_mode_switch": 0.0, "loss_total": 0.21132701635360718, "step": 348 }, { "batch_size": 4, "epoch": 0.1392, "step": 348, "tokens_per_device": 5472 }, { "epoch": 0.1392, "loss_ce": 0.3200225532054901, "loss_lvr": 0.83122718334198, "loss_mode_switch": 0.0, "loss_total": 0.40314528346061707, "step": 348 }, { "batch_size": 4, "epoch": 0.1392, "step": 348, "tokens_per_device": 5932 }, { "epoch": 0.1392, "loss_ce": 0.4536101520061493, "loss_lvr": 0.7546654343605042, "loss_mode_switch": 0.0, "loss_total": 0.5290766954421997, "step": 348 }, { "batch_size": 1, "epoch": 0.1392, "step": 348, "tokens_per_device": 4911 }, { "epoch": 0.1392, "loss_ce": 0.15840086340904236, "loss_lvr": 1.7904165983200073, "loss_mode_switch": 0.0, "loss_total": 0.3374425172805786, "step": 348 }, { "batch_size": 1, "epoch": 0.1392, "step": 348, "tokens_per_device": 4902 }, { "epoch": 0.1392, "loss_ce": 0.02592247724533081, "loss_lvr": 0.7515151500701904, "loss_mode_switch": 0.0, "loss_total": 0.10107399523258209, "step": 348 }, { "epoch": 0.1396, "grad_norm": 1.352169394493103, "learning_rate": 9.68828878202983e-06, "loss": 0.3199, "step": 349 }, { "batch_size": 4, "epoch": 0.1396, "step": 349, "tokens_per_device": 4572 }, { "epoch": 0.1396, "loss_ce": 0.0755946934223175, "loss_lvr": 0.9765185713768005, "loss_mode_switch": 0.0, "loss_total": 0.1732465624809265, "step": 349 }, { "batch_size": 4, "epoch": 0.1396, "step": 349, "tokens_per_device": 4328 }, { "epoch": 0.1396, "loss_ce": 0.31173089146614075, "loss_lvr": 1.1012710332870483, "loss_mode_switch": 0.0, "loss_total": 0.421858012676239, "step": 349 }, { "batch_size": 4, "epoch": 0.1396, "step": 349, "tokens_per_device": 3832 }, { "epoch": 0.1396, "loss_ce": 0.1607203185558319, "loss_lvr": 1.0908998250961304, "loss_mode_switch": 0.0, "loss_total": 0.2698103189468384, "step": 349 }, { "batch_size": 1, "epoch": 0.1396, "step": 349, "tokens_per_device": 5012 }, { "epoch": 0.1396, "loss_ce": 0.9461134672164917, "loss_lvr": 1.0169976949691772, "loss_mode_switch": 0.0, "loss_total": 1.0478131771087646, "step": 349 }, { "batch_size": 4, "epoch": 0.1396, "step": 349, "tokens_per_device": 5716 }, { "epoch": 0.1396, "loss_ce": 0.07858610898256302, "loss_lvr": 0.7955946326255798, "loss_mode_switch": 0.0, "loss_total": 0.15814557671546936, "step": 349 }, { "batch_size": 4, "epoch": 0.1396, "step": 349, "tokens_per_device": 7516 }, { "epoch": 0.1396, "loss_ce": 0.4639666676521301, "loss_lvr": 0.6046556830406189, "loss_mode_switch": 0.0, "loss_total": 0.5244322419166565, "step": 349 }, { "batch_size": 4, "epoch": 0.1396, "step": 349, "tokens_per_device": 2712 }, { "epoch": 0.1396, "loss_ce": 0.35158878564834595, "loss_lvr": 0.5165221095085144, "loss_mode_switch": 0.0, "loss_total": 0.40324100852012634, "step": 349 }, { "batch_size": 1, "epoch": 0.1396, "step": 349, "tokens_per_device": 5257 }, { "epoch": 0.1396, "loss_ce": 0.2645326852798462, "loss_lvr": 0.7122482061386108, "loss_mode_switch": 0.0, "loss_total": 0.3357574939727783, "step": 349 }, { "epoch": 0.14, "grad_norm": 1.5420318841934204, "learning_rate": 9.68603352503172e-06, "loss": 0.3648, "step": 350 }, { "batch_size": 4, "epoch": 0.14, "step": 350, "tokens_per_device": 5232 }, { "epoch": 0.14, "loss_ce": 0.5318362712860107, "loss_lvr": 0.938467264175415, "loss_mode_switch": 0.0, "loss_total": 0.6256830096244812, "step": 350 }, { "batch_size": 4, "epoch": 0.14, "step": 350, "tokens_per_device": 4048 }, { "epoch": 0.14, "loss_ce": 0.1353052407503128, "loss_lvr": 1.0355852842330933, "loss_mode_switch": 0.0, "loss_total": 0.2388637661933899, "step": 350 }, { "batch_size": 4, "epoch": 0.14, "step": 350, "tokens_per_device": 4296 }, { "epoch": 0.14, "loss_ce": 0.18781395256519318, "loss_lvr": 0.6698618531227112, "loss_mode_switch": 0.0, "loss_total": 0.25480014085769653, "step": 350 }, { "batch_size": 4, "epoch": 0.14, "step": 350, "tokens_per_device": 10268 }, { "epoch": 0.14, "loss_ce": 0.2241048961877823, "loss_lvr": 0.8682483434677124, "loss_mode_switch": 0.0, "loss_total": 0.31092971563339233, "step": 350 }, { "batch_size": 1, "epoch": 0.14, "step": 350, "tokens_per_device": 5016 }, { "epoch": 0.14, "loss_ce": 0.21762947738170624, "loss_lvr": 0.4963386058807373, "loss_mode_switch": 0.0, "loss_total": 0.26726335287094116, "step": 350 }, { "batch_size": 4, "epoch": 0.14, "step": 350, "tokens_per_device": 11376 }, { "epoch": 0.14, "loss_ce": 0.12421678751707077, "loss_lvr": 0.631462037563324, "loss_mode_switch": 0.0, "loss_total": 0.18736299872398376, "step": 350 }, { "batch_size": 4, "epoch": 0.14, "step": 350, "tokens_per_device": 3976 }, { "epoch": 0.14, "loss_ce": 0.10774076730012894, "loss_lvr": 2.0496833324432373, "loss_mode_switch": 0.0, "loss_total": 0.31270909309387207, "step": 350 }, { "batch_size": 4, "epoch": 0.14, "step": 350, "tokens_per_device": 4236 }, { "epoch": 0.14, "loss_ce": 0.4779713749885559, "loss_lvr": 1.0034818649291992, "loss_mode_switch": 0.0, "loss_total": 0.5783195495605469, "step": 350 }, { "epoch": 0.1404, "grad_norm": 1.5673078298568726, "learning_rate": 9.683770403343947e-06, "loss": 0.3406, "step": 351 }, { "batch_size": 1, "epoch": 0.1404, "step": 351, "tokens_per_device": 5124 }, { "epoch": 0.1404, "loss_ce": 0.0010577745269984007, "loss_lvr": 0.47723865509033203, "loss_mode_switch": 0.0, "loss_total": 0.04878164082765579, "step": 351 }, { "batch_size": 4, "epoch": 0.1404, "step": 351, "tokens_per_device": 1668 }, { "epoch": 0.1404, "loss_ce": 0.7312259078025818, "loss_lvr": 1.0286694765090942, "loss_mode_switch": 0.0, "loss_total": 0.8340928554534912, "step": 351 }, { "batch_size": 4, "epoch": 0.1404, "step": 351, "tokens_per_device": 6016 }, { "epoch": 0.1404, "loss_ce": 0.1564597487449646, "loss_lvr": 0.981818437576294, "loss_mode_switch": 0.0, "loss_total": 0.254641592502594, "step": 351 }, { "batch_size": 1, "epoch": 0.1404, "step": 351, "tokens_per_device": 5336 }, { "epoch": 0.1404, "loss_ce": 0.36991050839424133, "loss_lvr": 0.6754032373428345, "loss_mode_switch": 0.0, "loss_total": 0.4374508261680603, "step": 351 }, { "batch_size": 4, "epoch": 0.1404, "step": 351, "tokens_per_device": 3844 }, { "epoch": 0.1404, "loss_ce": 0.005896930117160082, "loss_lvr": 1.0561907291412354, "loss_mode_switch": 0.0, "loss_total": 0.11151600629091263, "step": 351 }, { "batch_size": 1, "epoch": 0.1404, "step": 351, "tokens_per_device": 4868 }, { "epoch": 0.1404, "loss_ce": 0.03264663740992546, "loss_lvr": 0.5006377100944519, "loss_mode_switch": 0.0, "loss_total": 0.08271040767431259, "step": 351 }, { "batch_size": 4, "epoch": 0.1404, "step": 351, "tokens_per_device": 1468 }, { "epoch": 0.1404, "loss_ce": 0.3560316264629364, "loss_lvr": 1.1765071153640747, "loss_mode_switch": 0.0, "loss_total": 0.47368234395980835, "step": 351 }, { "batch_size": 4, "epoch": 0.1404, "step": 351, "tokens_per_device": 2672 }, { "epoch": 0.1404, "loss_ce": 0.1225009635090828, "loss_lvr": 1.0306476354599, "loss_mode_switch": 0.0, "loss_total": 0.22556573152542114, "step": 351 }, { "epoch": 0.1408, "grad_norm": 1.7940980195999146, "learning_rate": 9.681499420764771e-06, "loss": 0.3746, "step": 352 }, { "batch_size": 1, "epoch": 0.1408, "step": 352, "tokens_per_device": 4729 }, { "epoch": 0.1408, "loss_ce": 0.015098625794053078, "loss_lvr": 0.450899213552475, "loss_mode_switch": 0.0, "loss_total": 0.060188546776771545, "step": 352 }, { "batch_size": 4, "epoch": 0.1408, "step": 352, "tokens_per_device": 1364 }, { "epoch": 0.1408, "loss_ce": 0.7410678267478943, "loss_lvr": 1.2068613767623901, "loss_mode_switch": 0.0, "loss_total": 0.8617539405822754, "step": 352 }, { "batch_size": 4, "epoch": 0.1408, "step": 352, "tokens_per_device": 1160 }, { "epoch": 0.1408, "loss_ce": 0.2424314320087433, "loss_lvr": 1.551737904548645, "loss_mode_switch": 0.0, "loss_total": 0.3976052403450012, "step": 352 }, { "batch_size": 4, "epoch": 0.1408, "step": 352, "tokens_per_device": 11644 }, { "epoch": 0.1408, "loss_ce": 0.36135342717170715, "loss_lvr": 0.5557752251625061, "loss_mode_switch": 0.0, "loss_total": 0.4169309437274933, "step": 352 }, { "batch_size": 1, "epoch": 0.1408, "step": 352, "tokens_per_device": 4862 }, { "epoch": 0.1408, "loss_ce": 0.09103256464004517, "loss_lvr": 0.4978111982345581, "loss_mode_switch": 0.0, "loss_total": 0.1408136785030365, "step": 352 }, { "batch_size": 4, "epoch": 0.1408, "step": 352, "tokens_per_device": 3376 }, { "epoch": 0.1408, "loss_ce": 0.21405261754989624, "loss_lvr": 1.1789114475250244, "loss_mode_switch": 0.0, "loss_total": 0.3319437503814697, "step": 352 }, { "batch_size": 4, "epoch": 0.1408, "step": 352, "tokens_per_device": 4644 }, { "epoch": 0.1408, "loss_ce": 0.20407384634017944, "loss_lvr": 0.8849543929100037, "loss_mode_switch": 0.0, "loss_total": 0.29256927967071533, "step": 352 }, { "batch_size": 4, "epoch": 0.1408, "step": 352, "tokens_per_device": 4348 }, { "epoch": 0.1408, "loss_ce": 0.2823186218738556, "loss_lvr": 1.2237775325775146, "loss_mode_switch": 0.0, "loss_total": 0.40469637513160706, "step": 352 }, { "epoch": 0.1412, "grad_norm": 1.7997419834136963, "learning_rate": 9.679220581105636e-06, "loss": 0.3401, "step": 353 }, { "batch_size": 4, "epoch": 0.1412, "step": 353, "tokens_per_device": 6292 }, { "epoch": 0.1412, "loss_ce": 0.16690364480018616, "loss_lvr": 1.4193739891052246, "loss_mode_switch": 0.0, "loss_total": 0.3088410496711731, "step": 353 }, { "batch_size": 4, "epoch": 0.1412, "step": 353, "tokens_per_device": 1224 }, { "epoch": 0.1412, "loss_ce": 0.42488762736320496, "loss_lvr": 1.102664828300476, "loss_mode_switch": 0.0, "loss_total": 0.5351541042327881, "step": 353 }, { "batch_size": 1, "epoch": 0.1412, "step": 353, "tokens_per_device": 5151 }, { "epoch": 0.1412, "loss_ce": 0.009158351458609104, "loss_lvr": 0.8923566937446594, "loss_mode_switch": 0.0, "loss_total": 0.09839402139186859, "step": 353 }, { "batch_size": 4, "epoch": 0.1412, "step": 353, "tokens_per_device": 4152 }, { "epoch": 0.1412, "loss_ce": 0.028024958446621895, "loss_lvr": 1.0827810764312744, "loss_mode_switch": 0.0, "loss_total": 0.13630306720733643, "step": 353 }, { "batch_size": 4, "epoch": 0.1412, "step": 353, "tokens_per_device": 4316 }, { "epoch": 0.1412, "loss_ce": 0.36413371562957764, "loss_lvr": 0.8991329073905945, "loss_mode_switch": 0.0, "loss_total": 0.4540470242500305, "step": 353 }, { "batch_size": 4, "epoch": 0.1412, "step": 353, "tokens_per_device": 2940 }, { "epoch": 0.1412, "loss_ce": 0.271310955286026, "loss_lvr": 1.0562905073165894, "loss_mode_switch": 0.0, "loss_total": 0.3769400119781494, "step": 353 }, { "batch_size": 1, "epoch": 0.1412, "step": 353, "tokens_per_device": 4789 }, { "epoch": 0.1412, "loss_ce": 0.020249467343091965, "loss_lvr": 0.6360130310058594, "loss_mode_switch": 0.0, "loss_total": 0.08385077118873596, "step": 353 }, { "batch_size": 4, "epoch": 0.1412, "step": 353, "tokens_per_device": 3932 }, { "epoch": 0.1412, "loss_ce": 0.5280002951622009, "loss_lvr": 1.157961368560791, "loss_mode_switch": 0.0, "loss_total": 0.643796443939209, "step": 353 }, { "epoch": 0.1416, "grad_norm": 1.306069016456604, "learning_rate": 9.676933888191178e-06, "loss": 0.313, "step": 354 }, { "batch_size": 1, "epoch": 0.1416, "step": 354, "tokens_per_device": 5170 }, { "epoch": 0.1416, "loss_ce": 0.00838567316532135, "loss_lvr": 1.2790714502334595, "loss_mode_switch": 0.0, "loss_total": 0.13629281520843506, "step": 354 }, { "batch_size": 1, "epoch": 0.1416, "step": 354, "tokens_per_device": 5101 }, { "epoch": 0.1416, "loss_ce": 0.0032973014749586582, "loss_lvr": 0.6307107210159302, "loss_mode_switch": 0.0, "loss_total": 0.06636837124824524, "step": 354 }, { "batch_size": 1, "epoch": 0.1416, "step": 354, "tokens_per_device": 4879 }, { "epoch": 0.1416, "loss_ce": 0.040942054241895676, "loss_lvr": 1.7612773180007935, "loss_mode_switch": 0.0, "loss_total": 0.21706978976726532, "step": 354 }, { "batch_size": 1, "epoch": 0.1416, "step": 354, "tokens_per_device": 4758 }, { "epoch": 0.1416, "loss_ce": 0.1795484721660614, "loss_lvr": 0.3359339237213135, "loss_mode_switch": 0.0, "loss_total": 0.21314185857772827, "step": 354 }, { "batch_size": 4, "epoch": 0.1416, "step": 354, "tokens_per_device": 7808 }, { "epoch": 0.1416, "loss_ce": 0.04769313707947731, "loss_lvr": 0.7368614673614502, "loss_mode_switch": 0.0, "loss_total": 0.12137928605079651, "step": 354 }, { "batch_size": 1, "epoch": 0.1416, "step": 354, "tokens_per_device": 4949 }, { "epoch": 0.1416, "loss_ce": 0.042822226881980896, "loss_lvr": 0.6963954567909241, "loss_mode_switch": 0.0, "loss_total": 0.11246177554130554, "step": 354 }, { "batch_size": 4, "epoch": 0.1416, "step": 354, "tokens_per_device": 6440 }, { "epoch": 0.1416, "loss_ce": 0.04658113420009613, "loss_lvr": 0.7706034779548645, "loss_mode_switch": 0.0, "loss_total": 0.1236414834856987, "step": 354 }, { "batch_size": 4, "epoch": 0.1416, "step": 354, "tokens_per_device": 6080 }, { "epoch": 0.1416, "loss_ce": 0.08017939329147339, "loss_lvr": 0.9018521308898926, "loss_mode_switch": 0.0, "loss_total": 0.1703646183013916, "step": 354 }, { "epoch": 0.142, "grad_norm": 1.4731733798980713, "learning_rate": 9.674639345859213e-06, "loss": 0.2851, "step": 355 }, { "batch_size": 4, "epoch": 0.142, "step": 355, "tokens_per_device": 4092 }, { "epoch": 0.142, "loss_ce": 0.3377808630466461, "loss_lvr": 0.7297369241714478, "loss_mode_switch": 0.0, "loss_total": 0.41075456142425537, "step": 355 }, { "batch_size": 4, "epoch": 0.142, "step": 355, "tokens_per_device": 5232 }, { "epoch": 0.142, "loss_ce": 0.9570870399475098, "loss_lvr": 1.0063962936401367, "loss_mode_switch": 0.0, "loss_total": 1.0577266216278076, "step": 355 }, { "batch_size": 4, "epoch": 0.142, "step": 355, "tokens_per_device": 4196 }, { "epoch": 0.142, "loss_ce": 0.5968063473701477, "loss_lvr": 1.1582597494125366, "loss_mode_switch": 0.0, "loss_total": 0.7126322984695435, "step": 355 }, { "batch_size": 4, "epoch": 0.142, "step": 355, "tokens_per_device": 5760 }, { "epoch": 0.142, "loss_ce": 0.00406266376376152, "loss_lvr": 2.363830804824829, "loss_mode_switch": 0.0, "loss_total": 0.24044574797153473, "step": 355 }, { "batch_size": 4, "epoch": 0.142, "step": 355, "tokens_per_device": 4620 }, { "epoch": 0.142, "loss_ce": 0.19633454084396362, "loss_lvr": 0.8536315560340881, "loss_mode_switch": 0.0, "loss_total": 0.28169769048690796, "step": 355 }, { "batch_size": 4, "epoch": 0.142, "step": 355, "tokens_per_device": 2568 }, { "epoch": 0.142, "loss_ce": 0.41666558384895325, "loss_lvr": 1.7029414176940918, "loss_mode_switch": 0.0, "loss_total": 0.586959719657898, "step": 355 }, { "batch_size": 1, "epoch": 0.142, "step": 355, "tokens_per_device": 6552 }, { "epoch": 0.142, "loss_ce": 0.00259156571701169, "loss_lvr": 0.5737541317939758, "loss_mode_switch": 0.0, "loss_total": 0.059966977685689926, "step": 355 }, { "batch_size": 1, "epoch": 0.142, "step": 355, "tokens_per_device": 4884 }, { "epoch": 0.142, "loss_ce": 0.452217161655426, "loss_lvr": 1.3147447109222412, "loss_mode_switch": 0.0, "loss_total": 0.5836916565895081, "step": 355 }, { "epoch": 0.1424, "grad_norm": 1.5225876569747925, "learning_rate": 9.67233695796073e-06, "loss": 0.3556, "step": 356 }, { "batch_size": 4, "epoch": 0.1424, "step": 356, "tokens_per_device": 4192 }, { "epoch": 0.1424, "loss_ce": 0.29131028056144714, "loss_lvr": 1.1893490552902222, "loss_mode_switch": 0.0, "loss_total": 0.4102451801300049, "step": 356 }, { "batch_size": 4, "epoch": 0.1424, "step": 356, "tokens_per_device": 5288 }, { "epoch": 0.1424, "loss_ce": 0.13238923251628876, "loss_lvr": 0.9726778864860535, "loss_mode_switch": 0.0, "loss_total": 0.22965702414512634, "step": 356 }, { "batch_size": 4, "epoch": 0.1424, "step": 356, "tokens_per_device": 4212 }, { "epoch": 0.1424, "loss_ce": 0.05408906564116478, "loss_lvr": 0.999667227268219, "loss_mode_switch": 0.0, "loss_total": 0.15405578911304474, "step": 356 }, { "batch_size": 4, "epoch": 0.1424, "step": 356, "tokens_per_device": 4936 }, { "epoch": 0.1424, "loss_ce": 0.24056033790111542, "loss_lvr": 1.0082060098648071, "loss_mode_switch": 0.0, "loss_total": 0.3413809537887573, "step": 356 }, { "batch_size": 4, "epoch": 0.1424, "step": 356, "tokens_per_device": 5784 }, { "epoch": 0.1424, "loss_ce": 0.09186825156211853, "loss_lvr": 0.7598844766616821, "loss_mode_switch": 0.0, "loss_total": 0.16785669326782227, "step": 356 }, { "batch_size": 4, "epoch": 0.1424, "step": 356, "tokens_per_device": 1344 }, { "epoch": 0.1424, "loss_ce": 0.6338379979133606, "loss_lvr": 1.2241647243499756, "loss_mode_switch": 0.0, "loss_total": 0.7562544941902161, "step": 356 }, { "batch_size": 1, "epoch": 0.1424, "step": 356, "tokens_per_device": 7473 }, { "epoch": 0.1424, "loss_ce": 0.03664606064558029, "loss_lvr": 0.5706009864807129, "loss_mode_switch": 0.0, "loss_total": 0.0937061607837677, "step": 356 }, { "batch_size": 4, "epoch": 0.1424, "step": 356, "tokens_per_device": 4460 }, { "epoch": 0.1424, "loss_ce": 0.26654142141342163, "loss_lvr": 0.7997124791145325, "loss_mode_switch": 0.0, "loss_total": 0.34651267528533936, "step": 356 }, { "epoch": 0.1428, "grad_norm": 1.622273564338684, "learning_rate": 9.670026728359884e-06, "loss": 0.3282, "step": 357 }, { "batch_size": 4, "epoch": 0.1428, "step": 357, "tokens_per_device": 3872 }, { "epoch": 0.1428, "loss_ce": 0.6987056732177734, "loss_lvr": 1.1663739681243896, "loss_mode_switch": 0.0, "loss_total": 0.8153430819511414, "step": 357 }, { "batch_size": 4, "epoch": 0.1428, "step": 357, "tokens_per_device": 4544 }, { "epoch": 0.1428, "loss_ce": 0.2549059987068176, "loss_lvr": 1.011715054512024, "loss_mode_switch": 0.0, "loss_total": 0.35607749223709106, "step": 357 }, { "batch_size": 4, "epoch": 0.1428, "step": 357, "tokens_per_device": 2680 }, { "epoch": 0.1428, "loss_ce": 0.4229711890220642, "loss_lvr": 1.2564654350280762, "loss_mode_switch": 0.0, "loss_total": 0.5486177206039429, "step": 357 }, { "batch_size": 4, "epoch": 0.1428, "step": 357, "tokens_per_device": 5188 }, { "epoch": 0.1428, "loss_ce": 0.1387084722518921, "loss_lvr": 0.7609565258026123, "loss_mode_switch": 0.0, "loss_total": 0.21480412781238556, "step": 357 }, { "batch_size": 4, "epoch": 0.1428, "step": 357, "tokens_per_device": 5208 }, { "epoch": 0.1428, "loss_ce": 0.35960137844085693, "loss_lvr": 0.9752299189567566, "loss_mode_switch": 0.0, "loss_total": 0.45712438225746155, "step": 357 }, { "batch_size": 1, "epoch": 0.1428, "step": 357, "tokens_per_device": 4877 }, { "epoch": 0.1428, "loss_ce": 0.018388614058494568, "loss_lvr": 0.4400865137577057, "loss_mode_switch": 0.0, "loss_total": 0.062397267669439316, "step": 357 }, { "batch_size": 4, "epoch": 0.1428, "step": 357, "tokens_per_device": 3764 }, { "epoch": 0.1428, "loss_ce": 0.516835629940033, "loss_lvr": 2.030363082885742, "loss_mode_switch": 0.0, "loss_total": 0.7198719382286072, "step": 357 }, { "batch_size": 1, "epoch": 0.1428, "step": 357, "tokens_per_device": 4888 }, { "epoch": 0.1428, "loss_ce": 0.0011442814720794559, "loss_lvr": 0.32128000259399414, "loss_mode_switch": 0.0, "loss_total": 0.033272285014390945, "step": 357 }, { "epoch": 0.1432, "grad_norm": 1.4802919626235962, "learning_rate": 9.667708660933994e-06, "loss": 0.4028, "step": 358 }, { "batch_size": 4, "epoch": 0.1432, "step": 358, "tokens_per_device": 4592 }, { "epoch": 0.1432, "loss_ce": 0.030711153522133827, "loss_lvr": 0.952667772769928, "loss_mode_switch": 0.0, "loss_total": 0.12597793340682983, "step": 358 }, { "batch_size": 4, "epoch": 0.1432, "step": 358, "tokens_per_device": 4364 }, { "epoch": 0.1432, "loss_ce": 0.5763834714889526, "loss_lvr": 0.8526589870452881, "loss_mode_switch": 0.0, "loss_total": 0.6616493463516235, "step": 358 }, { "batch_size": 1, "epoch": 0.1432, "step": 358, "tokens_per_device": 4894 }, { "epoch": 0.1432, "loss_ce": 0.2509785592556, "loss_lvr": 0.4313165545463562, "loss_mode_switch": 0.0, "loss_total": 0.2941102087497711, "step": 358 }, { "batch_size": 1, "epoch": 0.1432, "step": 358, "tokens_per_device": 4941 }, { "epoch": 0.1432, "loss_ce": 0.07217901200056076, "loss_lvr": 0.8021750450134277, "loss_mode_switch": 0.0, "loss_total": 0.15239651501178741, "step": 358 }, { "batch_size": 1, "epoch": 0.1432, "step": 358, "tokens_per_device": 5092 }, { "epoch": 0.1432, "loss_ce": 0.05433151498436928, "loss_lvr": 0.6787862181663513, "loss_mode_switch": 0.0, "loss_total": 0.12221014499664307, "step": 358 }, { "batch_size": 4, "epoch": 0.1432, "step": 358, "tokens_per_device": 9148 }, { "epoch": 0.1432, "loss_ce": 0.9157136678695679, "loss_lvr": 1.2350801229476929, "loss_mode_switch": 0.0, "loss_total": 1.0392216444015503, "step": 358 }, { "batch_size": 4, "epoch": 0.1432, "step": 358, "tokens_per_device": 4992 }, { "epoch": 0.1432, "loss_ce": 0.5354769825935364, "loss_lvr": 0.8260498642921448, "loss_mode_switch": 0.0, "loss_total": 0.6180819869041443, "step": 358 }, { "batch_size": 1, "epoch": 0.1432, "step": 358, "tokens_per_device": 5123 }, { "epoch": 0.1432, "loss_ce": 0.04389112815260887, "loss_lvr": 0.2511238157749176, "loss_mode_switch": 0.0, "loss_total": 0.06900350749492645, "step": 358 }, { "epoch": 0.1436, "grad_norm": 1.5734699964523315, "learning_rate": 9.665382759573529e-06, "loss": 0.3514, "step": 359 }, { "batch_size": 4, "epoch": 0.1436, "step": 359, "tokens_per_device": 4256 }, { "epoch": 0.1436, "loss_ce": 0.0409972183406353, "loss_lvr": 1.0532722473144531, "loss_mode_switch": 0.0, "loss_total": 0.14632444083690643, "step": 359 }, { "batch_size": 4, "epoch": 0.1436, "step": 359, "tokens_per_device": 5140 }, { "epoch": 0.1436, "loss_ce": 0.8221216201782227, "loss_lvr": 1.2015340328216553, "loss_mode_switch": 0.0, "loss_total": 0.9422750473022461, "step": 359 }, { "batch_size": 4, "epoch": 0.1436, "step": 359, "tokens_per_device": 9452 }, { "epoch": 0.1436, "loss_ce": 0.10924810916185379, "loss_lvr": 0.6770128607749939, "loss_mode_switch": 0.0, "loss_total": 0.1769493967294693, "step": 359 }, { "batch_size": 1, "epoch": 0.1436, "step": 359, "tokens_per_device": 5162 }, { "epoch": 0.1436, "loss_ce": 0.9681580662727356, "loss_lvr": 0.7352285981178284, "loss_mode_switch": 0.0, "loss_total": 1.041680932044983, "step": 359 }, { "batch_size": 1, "epoch": 0.1436, "step": 359, "tokens_per_device": 4892 }, { "epoch": 0.1436, "loss_ce": 0.005951653700321913, "loss_lvr": 0.6752356886863708, "loss_mode_switch": 0.0, "loss_total": 0.07347521930932999, "step": 359 }, { "batch_size": 1, "epoch": 0.1436, "step": 359, "tokens_per_device": 4743 }, { "epoch": 0.1436, "loss_ce": 0.009873525239527225, "loss_lvr": 0.46819961071014404, "loss_mode_switch": 0.0, "loss_total": 0.056693486869335175, "step": 359 }, { "batch_size": 1, "epoch": 0.1436, "step": 359, "tokens_per_device": 4840 }, { "epoch": 0.1436, "loss_ce": 0.004180672578513622, "loss_lvr": 0.5233402252197266, "loss_mode_switch": 0.0, "loss_total": 0.05651469528675079, "step": 359 }, { "batch_size": 4, "epoch": 0.1436, "step": 359, "tokens_per_device": 2644 }, { "epoch": 0.1436, "loss_ce": 0.26801130175590515, "loss_lvr": 1.0431190729141235, "loss_mode_switch": 0.0, "loss_total": 0.372323215007782, "step": 359 }, { "epoch": 0.144, "grad_norm": 1.3845373392105103, "learning_rate": 9.663049028182112e-06, "loss": 0.3571, "step": 360 }, { "batch_size": 1, "epoch": 0.144, "step": 360, "tokens_per_device": 4866 }, { "epoch": 0.144, "loss_ce": 0.000811329809948802, "loss_lvr": 0.7740408182144165, "loss_mode_switch": 0.0, "loss_total": 0.07821541279554367, "step": 360 }, { "batch_size": 4, "epoch": 0.144, "step": 360, "tokens_per_device": 4260 }, { "epoch": 0.144, "loss_ce": 0.41711145639419556, "loss_lvr": 0.7251610159873962, "loss_mode_switch": 0.0, "loss_total": 0.48962756991386414, "step": 360 }, { "batch_size": 4, "epoch": 0.144, "step": 360, "tokens_per_device": 3820 }, { "epoch": 0.144, "loss_ce": 0.14920899271965027, "loss_lvr": 1.067552089691162, "loss_mode_switch": 0.0, "loss_total": 0.2559642195701599, "step": 360 }, { "batch_size": 4, "epoch": 0.144, "step": 360, "tokens_per_device": 5528 }, { "epoch": 0.144, "loss_ce": 0.3262408375740051, "loss_lvr": 1.2058426141738892, "loss_mode_switch": 0.0, "loss_total": 0.4468250870704651, "step": 360 }, { "batch_size": 4, "epoch": 0.144, "step": 360, "tokens_per_device": 2060 }, { "epoch": 0.144, "loss_ce": 0.026604093611240387, "loss_lvr": 1.38882577419281, "loss_mode_switch": 0.0, "loss_total": 0.1654866635799408, "step": 360 }, { "batch_size": 4, "epoch": 0.144, "step": 360, "tokens_per_device": 4272 }, { "epoch": 0.144, "loss_ce": 0.2741375267505646, "loss_lvr": 1.822059154510498, "loss_mode_switch": 0.0, "loss_total": 0.4563434422016144, "step": 360 }, { "batch_size": 4, "epoch": 0.144, "step": 360, "tokens_per_device": 5972 }, { "epoch": 0.144, "loss_ce": 0.22393885254859924, "loss_lvr": 0.8009903430938721, "loss_mode_switch": 0.0, "loss_total": 0.3040378987789154, "step": 360 }, { "batch_size": 1, "epoch": 0.144, "step": 360, "tokens_per_device": 4859 }, { "epoch": 0.144, "loss_ce": 0.24859681725502014, "loss_lvr": 0.380564421415329, "loss_mode_switch": 0.0, "loss_total": 0.2866532504558563, "step": 360 }, { "epoch": 0.1444, "grad_norm": 1.5455999374389648, "learning_rate": 9.660707470676503e-06, "loss": 0.3541, "step": 361 }, { "batch_size": 4, "epoch": 0.1444, "step": 361, "tokens_per_device": 5244 }, { "epoch": 0.1444, "loss_ce": 0.4457862973213196, "loss_lvr": 0.9871312379837036, "loss_mode_switch": 0.0, "loss_total": 0.544499397277832, "step": 361 }, { "batch_size": 1, "epoch": 0.1444, "step": 361, "tokens_per_device": 5085 }, { "epoch": 0.1444, "loss_ce": 0.0352453738451004, "loss_lvr": 1.0828251838684082, "loss_mode_switch": 0.0, "loss_total": 0.14352789521217346, "step": 361 }, { "batch_size": 4, "epoch": 0.1444, "step": 361, "tokens_per_device": 5724 }, { "epoch": 0.1444, "loss_ce": 0.8887240290641785, "loss_lvr": 0.9040492177009583, "loss_mode_switch": 0.0, "loss_total": 0.9791289567947388, "step": 361 }, { "batch_size": 4, "epoch": 0.1444, "step": 361, "tokens_per_device": 6848 }, { "epoch": 0.1444, "loss_ce": 0.08993338793516159, "loss_lvr": 1.056644320487976, "loss_mode_switch": 0.0, "loss_total": 0.1955978274345398, "step": 361 }, { "batch_size": 4, "epoch": 0.1444, "step": 361, "tokens_per_device": 13416 }, { "epoch": 0.1444, "loss_ce": 0.15074221789836884, "loss_lvr": 1.4841740131378174, "loss_mode_switch": 0.0, "loss_total": 0.29915961623191833, "step": 361 }, { "batch_size": 4, "epoch": 0.1444, "step": 361, "tokens_per_device": 2668 }, { "epoch": 0.1444, "loss_ce": 0.374584823846817, "loss_lvr": 0.9972219467163086, "loss_mode_switch": 0.0, "loss_total": 0.47430703043937683, "step": 361 }, { "batch_size": 4, "epoch": 0.1444, "step": 361, "tokens_per_device": 3812 }, { "epoch": 0.1444, "loss_ce": 0.35914814472198486, "loss_lvr": 1.1379791498184204, "loss_mode_switch": 0.0, "loss_total": 0.47294604778289795, "step": 361 }, { "batch_size": 4, "epoch": 0.1444, "step": 361, "tokens_per_device": 5820 }, { "epoch": 0.1444, "loss_ce": 0.12702414393424988, "loss_lvr": 1.0256315469741821, "loss_mode_switch": 0.0, "loss_total": 0.22958730161190033, "step": 361 }, { "epoch": 0.1448, "grad_norm": 1.6434940099716187, "learning_rate": 9.658358090986594e-06, "loss": 0.3865, "step": 362 }, { "batch_size": 4, "epoch": 0.1448, "step": 362, "tokens_per_device": 4220 }, { "epoch": 0.1448, "loss_ce": 0.07985692471265793, "loss_lvr": 1.045131802558899, "loss_mode_switch": 0.0, "loss_total": 0.18437010049819946, "step": 362 }, { "batch_size": 4, "epoch": 0.1448, "step": 362, "tokens_per_device": 7316 }, { "epoch": 0.1448, "loss_ce": 0.5122965574264526, "loss_lvr": 0.9094242453575134, "loss_mode_switch": 0.0, "loss_total": 0.6032389998435974, "step": 362 }, { "batch_size": 4, "epoch": 0.1448, "step": 362, "tokens_per_device": 5264 }, { "epoch": 0.1448, "loss_ce": 0.38093650341033936, "loss_lvr": 0.9052156805992126, "loss_mode_switch": 0.0, "loss_total": 0.4714580774307251, "step": 362 }, { "batch_size": 4, "epoch": 0.1448, "step": 362, "tokens_per_device": 2736 }, { "epoch": 0.1448, "loss_ce": 0.41888943314552307, "loss_lvr": 0.894662082195282, "loss_mode_switch": 0.0, "loss_total": 0.5083556175231934, "step": 362 }, { "batch_size": 4, "epoch": 0.1448, "step": 362, "tokens_per_device": 4508 }, { "epoch": 0.1448, "loss_ce": 0.04106978699564934, "loss_lvr": 0.9008779525756836, "loss_mode_switch": 0.0, "loss_total": 0.13115757703781128, "step": 362 }, { "batch_size": 4, "epoch": 0.1448, "step": 362, "tokens_per_device": 5776 }, { "epoch": 0.1448, "loss_ce": 0.11499043554067612, "loss_lvr": 0.9489587545394897, "loss_mode_switch": 0.0, "loss_total": 0.2098863124847412, "step": 362 }, { "batch_size": 4, "epoch": 0.1448, "step": 362, "tokens_per_device": 1528 }, { "epoch": 0.1448, "loss_ce": 0.6848022937774658, "loss_lvr": 1.1297144889831543, "loss_mode_switch": 0.0, "loss_total": 0.7977737188339233, "step": 362 }, { "batch_size": 1, "epoch": 0.1448, "step": 362, "tokens_per_device": 7298 }, { "epoch": 0.1448, "loss_ce": 0.32724592089653015, "loss_lvr": 0.3692450225353241, "loss_mode_switch": 0.0, "loss_total": 0.3641704320907593, "step": 362 }, { "epoch": 0.1452, "grad_norm": 1.4847500324249268, "learning_rate": 9.656000893055416e-06, "loss": 0.3584, "step": 363 }, { "batch_size": 4, "epoch": 0.1452, "step": 363, "tokens_per_device": 5928 }, { "epoch": 0.1452, "loss_ce": 0.1323283314704895, "loss_lvr": 0.8861400485038757, "loss_mode_switch": 0.0, "loss_total": 0.22094234824180603, "step": 363 }, { "batch_size": 4, "epoch": 0.1452, "step": 363, "tokens_per_device": 2600 }, { "epoch": 0.1452, "loss_ce": 0.09798391908407211, "loss_lvr": 0.8575484156608582, "loss_mode_switch": 0.0, "loss_total": 0.18373876810073853, "step": 363 }, { "batch_size": 1, "epoch": 0.1452, "step": 363, "tokens_per_device": 4950 }, { "epoch": 0.1452, "loss_ce": 0.0075504956766963005, "loss_lvr": 0.42505964636802673, "loss_mode_switch": 0.0, "loss_total": 0.05005646124482155, "step": 363 }, { "batch_size": 1, "epoch": 0.1452, "step": 363, "tokens_per_device": 4877 }, { "epoch": 0.1452, "loss_ce": 0.012343293987214565, "loss_lvr": 0.6132550835609436, "loss_mode_switch": 0.0, "loss_total": 0.07366880029439926, "step": 363 }, { "batch_size": 4, "epoch": 0.1452, "step": 363, "tokens_per_device": 2548 }, { "epoch": 0.1452, "loss_ce": 0.48013147711753845, "loss_lvr": 1.2954212427139282, "loss_mode_switch": 0.0, "loss_total": 0.6096736192703247, "step": 363 }, { "batch_size": 4, "epoch": 0.1452, "step": 363, "tokens_per_device": 4916 }, { "epoch": 0.1452, "loss_ce": 0.34254834055900574, "loss_lvr": 0.9307364821434021, "loss_mode_switch": 0.0, "loss_total": 0.435621976852417, "step": 363 }, { "batch_size": 1, "epoch": 0.1452, "step": 363, "tokens_per_device": 4877 }, { "epoch": 0.1452, "loss_ce": 0.030196908861398697, "loss_lvr": 0.5257921814918518, "loss_mode_switch": 0.0, "loss_total": 0.08277612924575806, "step": 363 }, { "batch_size": 4, "epoch": 0.1452, "step": 363, "tokens_per_device": 4212 }, { "epoch": 0.1452, "loss_ce": 0.7381271123886108, "loss_lvr": 0.8527506589889526, "loss_mode_switch": 0.0, "loss_total": 0.8234021663665771, "step": 363 }, { "epoch": 0.1456, "grad_norm": 1.4178179502487183, "learning_rate": 9.653635880839107e-06, "loss": 0.3193, "step": 364 }, { "batch_size": 1, "epoch": 0.1456, "step": 364, "tokens_per_device": 5135 }, { "epoch": 0.1456, "loss_ce": 0.08732070028781891, "loss_lvr": 0.517906665802002, "loss_mode_switch": 0.0, "loss_total": 0.13911136984825134, "step": 364 }, { "batch_size": 4, "epoch": 0.1456, "step": 364, "tokens_per_device": 1556 }, { "epoch": 0.1456, "loss_ce": 0.7041684985160828, "loss_lvr": 1.1711909770965576, "loss_mode_switch": 0.0, "loss_total": 0.8212875723838806, "step": 364 }, { "batch_size": 4, "epoch": 0.1456, "step": 364, "tokens_per_device": 4236 }, { "epoch": 0.1456, "loss_ce": 0.47748079895973206, "loss_lvr": 1.3735620975494385, "loss_mode_switch": 0.0, "loss_total": 0.6148369908332825, "step": 364 }, { "batch_size": 4, "epoch": 0.1456, "step": 364, "tokens_per_device": 4252 }, { "epoch": 0.1456, "loss_ce": 0.10601279884576797, "loss_lvr": 1.3095182180404663, "loss_mode_switch": 0.0, "loss_total": 0.236964613199234, "step": 364 }, { "batch_size": 1, "epoch": 0.1456, "step": 364, "tokens_per_device": 4904 }, { "epoch": 0.1456, "loss_ce": 0.04458385333418846, "loss_lvr": 0.5606998801231384, "loss_mode_switch": 0.0, "loss_total": 0.10065384209156036, "step": 364 }, { "batch_size": 4, "epoch": 0.1456, "step": 364, "tokens_per_device": 4848 }, { "epoch": 0.1456, "loss_ce": 0.8233581781387329, "loss_lvr": 0.9570496678352356, "loss_mode_switch": 0.0, "loss_total": 0.919063150882721, "step": 364 }, { "batch_size": 1, "epoch": 0.1456, "step": 364, "tokens_per_device": 4921 }, { "epoch": 0.1456, "loss_ce": 0.04935338720679283, "loss_lvr": 0.3340773284435272, "loss_mode_switch": 0.0, "loss_total": 0.08276112377643585, "step": 364 }, { "batch_size": 4, "epoch": 0.1456, "step": 364, "tokens_per_device": 2736 }, { "epoch": 0.1456, "loss_ce": 0.43296846747398376, "loss_lvr": 1.0308096408843994, "loss_mode_switch": 0.0, "loss_total": 0.5360494256019592, "step": 364 }, { "epoch": 0.146, "grad_norm": 2.1112704277038574, "learning_rate": 9.651263058306932e-06, "loss": 0.3669, "step": 365 }, { "batch_size": 4, "epoch": 0.146, "step": 365, "tokens_per_device": 1436 }, { "epoch": 0.146, "loss_ce": 0.36552727222442627, "loss_lvr": 2.5795018672943115, "loss_mode_switch": 0.0, "loss_total": 0.6234774589538574, "step": 365 }, { "batch_size": 4, "epoch": 0.146, "step": 365, "tokens_per_device": 3512 }, { "epoch": 0.146, "loss_ce": 0.5082136988639832, "loss_lvr": 0.8330106735229492, "loss_mode_switch": 0.0, "loss_total": 0.5915147662162781, "step": 365 }, { "batch_size": 4, "epoch": 0.146, "step": 365, "tokens_per_device": 4592 }, { "epoch": 0.146, "loss_ce": 0.6244508624076843, "loss_lvr": 1.0324293375015259, "loss_mode_switch": 0.0, "loss_total": 0.7276937961578369, "step": 365 }, { "batch_size": 4, "epoch": 0.146, "step": 365, "tokens_per_device": 4644 }, { "epoch": 0.146, "loss_ce": 0.6496102809906006, "loss_lvr": 0.7873305082321167, "loss_mode_switch": 0.0, "loss_total": 0.7283433079719543, "step": 365 }, { "batch_size": 4, "epoch": 0.146, "step": 365, "tokens_per_device": 2636 }, { "epoch": 0.146, "loss_ce": 0.3899706304073334, "loss_lvr": 1.045495867729187, "loss_mode_switch": 0.0, "loss_total": 0.4945202171802521, "step": 365 }, { "batch_size": 4, "epoch": 0.146, "step": 365, "tokens_per_device": 1260 }, { "epoch": 0.146, "loss_ce": 0.5248160362243652, "loss_lvr": 1.3379967212677002, "loss_mode_switch": 0.0, "loss_total": 0.6586157083511353, "step": 365 }, { "batch_size": 1, "epoch": 0.146, "step": 365, "tokens_per_device": 4882 }, { "epoch": 0.146, "loss_ce": 0.01858881302177906, "loss_lvr": 0.33628490567207336, "loss_mode_switch": 0.0, "loss_total": 0.052217304706573486, "step": 365 }, { "batch_size": 4, "epoch": 0.146, "step": 365, "tokens_per_device": 1232 }, { "epoch": 0.146, "loss_ce": 0.12913139164447784, "loss_lvr": 1.1078481674194336, "loss_mode_switch": 0.0, "loss_total": 0.23991620540618896, "step": 365 }, { "epoch": 0.1464, "grad_norm": 1.5757408142089844, "learning_rate": 9.648882429441258e-06, "loss": 0.344, "step": 366 }, { "batch_size": 4, "epoch": 0.1464, "step": 366, "tokens_per_device": 3940 }, { "epoch": 0.1464, "loss_ce": 0.5391146540641785, "loss_lvr": 0.950849175453186, "loss_mode_switch": 0.0, "loss_total": 0.6341995596885681, "step": 366 }, { "batch_size": 4, "epoch": 0.1464, "step": 366, "tokens_per_device": 1540 }, { "epoch": 0.1464, "loss_ce": 0.26234233379364014, "loss_lvr": 1.0240064859390259, "loss_mode_switch": 0.0, "loss_total": 0.3647429943084717, "step": 366 }, { "batch_size": 1, "epoch": 0.1464, "step": 366, "tokens_per_device": 5102 }, { "epoch": 0.1464, "loss_ce": 0.03227098658680916, "loss_lvr": 0.9111745357513428, "loss_mode_switch": 0.0, "loss_total": 0.12338843941688538, "step": 366 }, { "batch_size": 4, "epoch": 0.1464, "step": 366, "tokens_per_device": 6288 }, { "epoch": 0.1464, "loss_ce": 0.45405685901641846, "loss_lvr": 0.8358083963394165, "loss_mode_switch": 0.0, "loss_total": 0.5376377105712891, "step": 366 }, { "batch_size": 4, "epoch": 0.1464, "step": 366, "tokens_per_device": 7076 }, { "epoch": 0.1464, "loss_ce": 0.17973119020462036, "loss_lvr": 1.0918867588043213, "loss_mode_switch": 0.0, "loss_total": 0.2889198660850525, "step": 366 }, { "batch_size": 1, "epoch": 0.1464, "step": 366, "tokens_per_device": 5161 }, { "epoch": 0.1464, "loss_ce": 0.6648918390274048, "loss_lvr": 0.575850248336792, "loss_mode_switch": 0.0, "loss_total": 0.7224768400192261, "step": 366 }, { "batch_size": 4, "epoch": 0.1464, "step": 366, "tokens_per_device": 5728 }, { "epoch": 0.1464, "loss_ce": 0.3925059139728546, "loss_lvr": 1.1998528242111206, "loss_mode_switch": 0.0, "loss_total": 0.5124912261962891, "step": 366 }, { "batch_size": 1, "epoch": 0.1464, "step": 366, "tokens_per_device": 4932 }, { "epoch": 0.1464, "loss_ce": 0.025568408891558647, "loss_lvr": 0.7757513523101807, "loss_mode_switch": 0.0, "loss_total": 0.10314355045557022, "step": 366 }, { "epoch": 0.1468, "grad_norm": 1.4419556856155396, "learning_rate": 9.646493998237557e-06, "loss": 0.3036, "step": 367 }, { "batch_size": 1, "epoch": 0.1468, "step": 367, "tokens_per_device": 5213 }, { "epoch": 0.1468, "loss_ce": 0.12799160182476044, "loss_lvr": 0.744158148765564, "loss_mode_switch": 0.0, "loss_total": 0.20240741968154907, "step": 367 }, { "batch_size": 4, "epoch": 0.1468, "step": 367, "tokens_per_device": 5132 }, { "epoch": 0.1468, "loss_ce": 0.4626947045326233, "loss_lvr": 0.9637705683708191, "loss_mode_switch": 0.0, "loss_total": 0.5590717792510986, "step": 367 }, { "batch_size": 4, "epoch": 0.1468, "step": 367, "tokens_per_device": 2728 }, { "epoch": 0.1468, "loss_ce": 0.23932193219661713, "loss_lvr": 0.9837303161621094, "loss_mode_switch": 0.0, "loss_total": 0.3376949727535248, "step": 367 }, { "batch_size": 4, "epoch": 0.1468, "step": 367, "tokens_per_device": 2684 }, { "epoch": 0.1468, "loss_ce": 0.3601026237010956, "loss_lvr": 0.9201724529266357, "loss_mode_switch": 0.0, "loss_total": 0.4521198868751526, "step": 367 }, { "batch_size": 4, "epoch": 0.1468, "step": 367, "tokens_per_device": 5896 }, { "epoch": 0.1468, "loss_ce": 0.6648178100585938, "loss_lvr": 0.9397600889205933, "loss_mode_switch": 0.0, "loss_total": 0.758793830871582, "step": 367 }, { "batch_size": 4, "epoch": 0.1468, "step": 367, "tokens_per_device": 4348 }, { "epoch": 0.1468, "loss_ce": 0.4077456295490265, "loss_lvr": 1.0643415451049805, "loss_mode_switch": 0.0, "loss_total": 0.5141797661781311, "step": 367 }, { "batch_size": 1, "epoch": 0.1468, "step": 367, "tokens_per_device": 5514 }, { "epoch": 0.1468, "loss_ce": 0.12325117737054825, "loss_lvr": 1.1386080980300903, "loss_mode_switch": 0.0, "loss_total": 0.23711198568344116, "step": 367 }, { "batch_size": 4, "epoch": 0.1468, "step": 367, "tokens_per_device": 5740 }, { "epoch": 0.1468, "loss_ce": 0.14975525438785553, "loss_lvr": 1.3441588878631592, "loss_mode_switch": 0.0, "loss_total": 0.2841711640357971, "step": 367 }, { "epoch": 0.1472, "grad_norm": 1.716734766960144, "learning_rate": 9.64409776870439e-06, "loss": 0.3291, "step": 368 }, { "batch_size": 4, "epoch": 0.1472, "step": 368, "tokens_per_device": 3724 }, { "epoch": 0.1472, "loss_ce": 0.6970065832138062, "loss_lvr": 0.9156883358955383, "loss_mode_switch": 0.0, "loss_total": 0.7885754108428955, "step": 368 }, { "batch_size": 1, "epoch": 0.1472, "step": 368, "tokens_per_device": 4760 }, { "epoch": 0.1472, "loss_ce": 0.006494453642517328, "loss_lvr": 0.6258871555328369, "loss_mode_switch": 0.0, "loss_total": 0.06908316910266876, "step": 368 }, { "batch_size": 4, "epoch": 0.1472, "step": 368, "tokens_per_device": 1360 }, { "epoch": 0.1472, "loss_ce": 0.8151519894599915, "loss_lvr": 1.3928591012954712, "loss_mode_switch": 0.0, "loss_total": 0.9544379115104675, "step": 368 }, { "batch_size": 4, "epoch": 0.1472, "step": 368, "tokens_per_device": 4516 }, { "epoch": 0.1472, "loss_ce": 0.5867264270782471, "loss_lvr": 1.1824212074279785, "loss_mode_switch": 0.0, "loss_total": 0.7049685716629028, "step": 368 }, { "batch_size": 4, "epoch": 0.1472, "step": 368, "tokens_per_device": 1564 }, { "epoch": 0.1472, "loss_ce": 0.32637494802474976, "loss_lvr": 1.1803185939788818, "loss_mode_switch": 0.0, "loss_total": 0.44440680742263794, "step": 368 }, { "batch_size": 4, "epoch": 0.1472, "step": 368, "tokens_per_device": 3092 }, { "epoch": 0.1472, "loss_ce": 0.2607302963733673, "loss_lvr": 0.8470399975776672, "loss_mode_switch": 0.0, "loss_total": 0.345434308052063, "step": 368 }, { "batch_size": 1, "epoch": 0.1472, "step": 368, "tokens_per_device": 5109 }, { "epoch": 0.1472, "loss_ce": 0.002927228808403015, "loss_lvr": 0.22831368446350098, "loss_mode_switch": 0.0, "loss_total": 0.025758597999811172, "step": 368 }, { "batch_size": 4, "epoch": 0.1472, "step": 368, "tokens_per_device": 4540 }, { "epoch": 0.1472, "loss_ce": 0.254084050655365, "loss_lvr": 1.3146997690200806, "loss_mode_switch": 0.0, "loss_total": 0.3855540156364441, "step": 368 }, { "epoch": 0.1476, "grad_norm": 1.5685456991195679, "learning_rate": 9.641693744863413e-06, "loss": 0.4099, "step": 369 }, { "batch_size": 4, "epoch": 0.1476, "step": 369, "tokens_per_device": 4332 }, { "epoch": 0.1476, "loss_ce": 0.5922935605049133, "loss_lvr": 0.9746197462081909, "loss_mode_switch": 0.0, "loss_total": 0.6897555589675903, "step": 369 }, { "batch_size": 4, "epoch": 0.1476, "step": 369, "tokens_per_device": 4452 }, { "epoch": 0.1476, "loss_ce": 0.09497774392366409, "loss_lvr": 0.6495993733406067, "loss_mode_switch": 0.0, "loss_total": 0.15993767976760864, "step": 369 }, { "batch_size": 4, "epoch": 0.1476, "step": 369, "tokens_per_device": 1204 }, { "epoch": 0.1476, "loss_ce": 0.19819067418575287, "loss_lvr": 1.4818962812423706, "loss_mode_switch": 0.0, "loss_total": 0.3463802933692932, "step": 369 }, { "batch_size": 4, "epoch": 0.1476, "step": 369, "tokens_per_device": 6508 }, { "epoch": 0.1476, "loss_ce": 0.3149040937423706, "loss_lvr": 0.7984309792518616, "loss_mode_switch": 0.0, "loss_total": 0.39474719762802124, "step": 369 }, { "batch_size": 4, "epoch": 0.1476, "step": 369, "tokens_per_device": 3836 }, { "epoch": 0.1476, "loss_ce": 0.4498835504055023, "loss_lvr": 1.0885239839553833, "loss_mode_switch": 0.0, "loss_total": 0.5587359666824341, "step": 369 }, { "batch_size": 1, "epoch": 0.1476, "step": 369, "tokens_per_device": 4882 }, { "epoch": 0.1476, "loss_ce": 0.0029559105169028044, "loss_lvr": 0.4169105887413025, "loss_mode_switch": 0.0, "loss_total": 0.0446469709277153, "step": 369 }, { "batch_size": 4, "epoch": 0.1476, "step": 369, "tokens_per_device": 1472 }, { "epoch": 0.1476, "loss_ce": 0.4504493474960327, "loss_lvr": 1.1469179391860962, "loss_mode_switch": 0.0, "loss_total": 0.5651411414146423, "step": 369 }, { "batch_size": 4, "epoch": 0.1476, "step": 369, "tokens_per_device": 4272 }, { "epoch": 0.1476, "loss_ce": 0.07663267850875854, "loss_lvr": 0.8984450101852417, "loss_mode_switch": 0.0, "loss_total": 0.16647717356681824, "step": 369 }, { "epoch": 0.148, "grad_norm": 1.5112056732177734, "learning_rate": 9.639281930749363e-06, "loss": 0.3606, "step": 370 }, { "batch_size": 4, "epoch": 0.148, "step": 370, "tokens_per_device": 4252 }, { "epoch": 0.148, "loss_ce": 0.22837689518928528, "loss_lvr": 0.9024043679237366, "loss_mode_switch": 0.0, "loss_total": 0.3186173439025879, "step": 370 }, { "batch_size": 4, "epoch": 0.148, "step": 370, "tokens_per_device": 3776 }, { "epoch": 0.148, "loss_ce": 0.32611480355262756, "loss_lvr": 1.4019286632537842, "loss_mode_switch": 0.0, "loss_total": 0.466307669878006, "step": 370 }, { "batch_size": 4, "epoch": 0.148, "step": 370, "tokens_per_device": 2996 }, { "epoch": 0.148, "loss_ce": 0.27957528829574585, "loss_lvr": 0.7206088900566101, "loss_mode_switch": 0.0, "loss_total": 0.3516361713409424, "step": 370 }, { "batch_size": 4, "epoch": 0.148, "step": 370, "tokens_per_device": 4588 }, { "epoch": 0.148, "loss_ce": 0.3701227605342865, "loss_lvr": 1.1265218257904053, "loss_mode_switch": 0.0, "loss_total": 0.482774943113327, "step": 370 }, { "batch_size": 4, "epoch": 0.148, "step": 370, "tokens_per_device": 4244 }, { "epoch": 0.148, "loss_ce": 0.11493279039859772, "loss_lvr": 1.0714821815490723, "loss_mode_switch": 0.0, "loss_total": 0.2220810055732727, "step": 370 }, { "batch_size": 1, "epoch": 0.148, "step": 370, "tokens_per_device": 5676 }, { "epoch": 0.148, "loss_ce": 0.03861350193619728, "loss_lvr": 0.6534059643745422, "loss_mode_switch": 0.0, "loss_total": 0.10395410656929016, "step": 370 }, { "batch_size": 1, "epoch": 0.148, "step": 370, "tokens_per_device": 4925 }, { "epoch": 0.148, "loss_ce": 0.2943513095378876, "loss_lvr": 0.5012581944465637, "loss_mode_switch": 0.0, "loss_total": 0.344477117061615, "step": 370 }, { "batch_size": 4, "epoch": 0.148, "step": 370, "tokens_per_device": 4972 }, { "epoch": 0.148, "loss_ce": 0.15074004232883453, "loss_lvr": 1.1772665977478027, "loss_mode_switch": 0.0, "loss_total": 0.2684667110443115, "step": 370 }, { "epoch": 0.1484, "grad_norm": 2.0138885974884033, "learning_rate": 9.636862330410043e-06, "loss": 0.3476, "step": 371 }, { "batch_size": 1, "epoch": 0.1484, "step": 371, "tokens_per_device": 5860 }, { "epoch": 0.1484, "loss_ce": 0.0026108697056770325, "loss_lvr": 0.450773149728775, "loss_mode_switch": 0.0, "loss_total": 0.047688186168670654, "step": 371 }, { "batch_size": 4, "epoch": 0.1484, "step": 371, "tokens_per_device": 3352 }, { "epoch": 0.1484, "loss_ce": 0.2448974996805191, "loss_lvr": 1.1482832431793213, "loss_mode_switch": 0.0, "loss_total": 0.35972583293914795, "step": 371 }, { "batch_size": 4, "epoch": 0.1484, "step": 371, "tokens_per_device": 4508 }, { "epoch": 0.1484, "loss_ce": 0.11583782732486725, "loss_lvr": 0.9760226011276245, "loss_mode_switch": 0.0, "loss_total": 0.21344009041786194, "step": 371 }, { "batch_size": 4, "epoch": 0.1484, "step": 371, "tokens_per_device": 7852 }, { "epoch": 0.1484, "loss_ce": 0.2240859866142273, "loss_lvr": 0.982197105884552, "loss_mode_switch": 0.0, "loss_total": 0.32230570912361145, "step": 371 }, { "batch_size": 4, "epoch": 0.1484, "step": 371, "tokens_per_device": 10652 }, { "epoch": 0.1484, "loss_ce": 0.4768536686897278, "loss_lvr": 1.2453523874282837, "loss_mode_switch": 0.0, "loss_total": 0.6013889312744141, "step": 371 }, { "batch_size": 4, "epoch": 0.1484, "step": 371, "tokens_per_device": 1564 }, { "epoch": 0.1484, "loss_ce": 0.37027135491371155, "loss_lvr": 1.344544768333435, "loss_mode_switch": 0.0, "loss_total": 0.5047258138656616, "step": 371 }, { "batch_size": 4, "epoch": 0.1484, "step": 371, "tokens_per_device": 4236 }, { "epoch": 0.1484, "loss_ce": 0.22568738460540771, "loss_lvr": 1.1288552284240723, "loss_mode_switch": 0.0, "loss_total": 0.3385729193687439, "step": 371 }, { "batch_size": 4, "epoch": 0.1484, "step": 371, "tokens_per_device": 1316 }, { "epoch": 0.1484, "loss_ce": 0.37272170186042786, "loss_lvr": 1.1390016078948975, "loss_mode_switch": 0.0, "loss_total": 0.4866218566894531, "step": 371 }, { "epoch": 0.1488, "grad_norm": 1.621241807937622, "learning_rate": 9.634434947906337e-06, "loss": 0.3226, "step": 372 }, { "batch_size": 1, "epoch": 0.1488, "step": 372, "tokens_per_device": 5164 }, { "epoch": 0.1488, "loss_ce": 0.042192958295345306, "loss_lvr": 0.7574476003646851, "loss_mode_switch": 0.0, "loss_total": 0.11793772131204605, "step": 372 }, { "batch_size": 4, "epoch": 0.1488, "step": 372, "tokens_per_device": 8212 }, { "epoch": 0.1488, "loss_ce": 0.23807619512081146, "loss_lvr": 0.8309065699577332, "loss_mode_switch": 0.0, "loss_total": 0.32116684317588806, "step": 372 }, { "batch_size": 1, "epoch": 0.1488, "step": 372, "tokens_per_device": 5122 }, { "epoch": 0.1488, "loss_ce": 0.005979675333946943, "loss_lvr": 0.37563908100128174, "loss_mode_switch": 0.0, "loss_total": 0.043543584644794464, "step": 372 }, { "batch_size": 1, "epoch": 0.1488, "step": 372, "tokens_per_device": 5102 }, { "epoch": 0.1488, "loss_ce": 0.005472690798342228, "loss_lvr": 0.6229945421218872, "loss_mode_switch": 0.0, "loss_total": 0.06777215003967285, "step": 372 }, { "batch_size": 4, "epoch": 0.1488, "step": 372, "tokens_per_device": 4172 }, { "epoch": 0.1488, "loss_ce": 0.31419265270233154, "loss_lvr": 0.6830998659133911, "loss_mode_switch": 0.0, "loss_total": 0.38250264525413513, "step": 372 }, { "batch_size": 4, "epoch": 0.1488, "step": 372, "tokens_per_device": 1456 }, { "epoch": 0.1488, "loss_ce": 0.5116283893585205, "loss_lvr": 1.1950712203979492, "loss_mode_switch": 0.0, "loss_total": 0.6311355233192444, "step": 372 }, { "batch_size": 1, "epoch": 0.1488, "step": 372, "tokens_per_device": 5156 }, { "epoch": 0.1488, "loss_ce": 0.22072365880012512, "loss_lvr": 0.23520483076572418, "loss_mode_switch": 0.0, "loss_total": 0.24424414336681366, "step": 372 }, { "batch_size": 1, "epoch": 0.1488, "step": 372, "tokens_per_device": 4823 }, { "epoch": 0.1488, "loss_ce": 0.00867887120693922, "loss_lvr": 0.7477582097053528, "loss_mode_switch": 0.0, "loss_total": 0.0834546908736229, "step": 372 }, { "epoch": 0.1492, "grad_norm": 1.353614330291748, "learning_rate": 9.631999787312179e-06, "loss": 0.3128, "step": 373 }, { "batch_size": 4, "epoch": 0.1492, "step": 373, "tokens_per_device": 4280 }, { "epoch": 0.1492, "loss_ce": 0.21049530804157257, "loss_lvr": 1.0876708030700684, "loss_mode_switch": 0.0, "loss_total": 0.31926238536834717, "step": 373 }, { "batch_size": 4, "epoch": 0.1492, "step": 373, "tokens_per_device": 6496 }, { "epoch": 0.1492, "loss_ce": 0.44781872630119324, "loss_lvr": 0.8306199908256531, "loss_mode_switch": 0.0, "loss_total": 0.5308807492256165, "step": 373 }, { "batch_size": 4, "epoch": 0.1492, "step": 373, "tokens_per_device": 15468 }, { "epoch": 0.1492, "loss_ce": 0.037363432347774506, "loss_lvr": 0.5112289786338806, "loss_mode_switch": 0.0, "loss_total": 0.08848632872104645, "step": 373 }, { "batch_size": 4, "epoch": 0.1492, "step": 373, "tokens_per_device": 4756 }, { "epoch": 0.1492, "loss_ce": 0.33421528339385986, "loss_lvr": 0.9780808687210083, "loss_mode_switch": 0.0, "loss_total": 0.43202337622642517, "step": 373 }, { "batch_size": 1, "epoch": 0.1492, "step": 373, "tokens_per_device": 4722 }, { "epoch": 0.1492, "loss_ce": 0.04099464789032936, "loss_lvr": 0.6682965159416199, "loss_mode_switch": 0.0, "loss_total": 0.10782429575920105, "step": 373 }, { "batch_size": 1, "epoch": 0.1492, "step": 373, "tokens_per_device": 6755 }, { "epoch": 0.1492, "loss_ce": 0.09874312579631805, "loss_lvr": 0.44667932391166687, "loss_mode_switch": 0.0, "loss_total": 0.1434110552072525, "step": 373 }, { "batch_size": 4, "epoch": 0.1492, "step": 373, "tokens_per_device": 7196 }, { "epoch": 0.1492, "loss_ce": 0.10705961287021637, "loss_lvr": 1.017805576324463, "loss_mode_switch": 0.0, "loss_total": 0.20884016156196594, "step": 373 }, { "batch_size": 1, "epoch": 0.1492, "step": 373, "tokens_per_device": 5178 }, { "epoch": 0.1492, "loss_ce": 0.01356741413474083, "loss_lvr": 0.47013866901397705, "loss_mode_switch": 0.0, "loss_total": 0.060581281781196594, "step": 373 }, { "epoch": 0.1496, "grad_norm": 1.6124049425125122, "learning_rate": 9.62955685271456e-06, "loss": 0.3018, "step": 374 }, { "batch_size": 1, "epoch": 0.1496, "step": 374, "tokens_per_device": 4678 }, { "epoch": 0.1496, "loss_ce": 0.018238119781017303, "loss_lvr": 0.4121537506580353, "loss_mode_switch": 0.0, "loss_total": 0.05945349484682083, "step": 374 }, { "batch_size": 1, "epoch": 0.1496, "step": 374, "tokens_per_device": 5182 }, { "epoch": 0.1496, "loss_ce": 0.0015721070813015103, "loss_lvr": 0.5311710834503174, "loss_mode_switch": 0.0, "loss_total": 0.05468921363353729, "step": 374 }, { "batch_size": 1, "epoch": 0.1496, "step": 374, "tokens_per_device": 5094 }, { "epoch": 0.1496, "loss_ce": 0.03147033974528313, "loss_lvr": 0.6895280480384827, "loss_mode_switch": 0.0, "loss_total": 0.10042314231395721, "step": 374 }, { "batch_size": 4, "epoch": 0.1496, "step": 374, "tokens_per_device": 1616 }, { "epoch": 0.1496, "loss_ce": 0.3067949712276459, "loss_lvr": 1.2430330514907837, "loss_mode_switch": 0.0, "loss_total": 0.4310982823371887, "step": 374 }, { "batch_size": 1, "epoch": 0.1496, "step": 374, "tokens_per_device": 4882 }, { "epoch": 0.1496, "loss_ce": 0.026652852073311806, "loss_lvr": 0.5332642793655396, "loss_mode_switch": 0.0, "loss_total": 0.07997927814722061, "step": 374 }, { "batch_size": 1, "epoch": 0.1496, "step": 374, "tokens_per_device": 4896 }, { "epoch": 0.1496, "loss_ce": 0.2533213198184967, "loss_lvr": 0.4296569228172302, "loss_mode_switch": 0.0, "loss_total": 0.29628700017929077, "step": 374 }, { "batch_size": 4, "epoch": 0.1496, "step": 374, "tokens_per_device": 3824 }, { "epoch": 0.1496, "loss_ce": 0.17326392233371735, "loss_lvr": 0.9247183799743652, "loss_mode_switch": 0.0, "loss_total": 0.2657357454299927, "step": 374 }, { "batch_size": 4, "epoch": 0.1496, "step": 374, "tokens_per_device": 1580 }, { "epoch": 0.1496, "loss_ce": 0.37289541959762573, "loss_lvr": 1.0367944240570068, "loss_mode_switch": 0.0, "loss_total": 0.4765748679637909, "step": 374 }, { "epoch": 0.15, "grad_norm": 1.362815499305725, "learning_rate": 9.627106148213521e-06, "loss": 0.3301, "step": 375 }, { "batch_size": 4, "epoch": 0.15, "step": 375, "tokens_per_device": 3988 }, { "epoch": 0.15, "loss_ce": 0.09813577681779861, "loss_lvr": 1.096860647201538, "loss_mode_switch": 0.0, "loss_total": 0.20782184600830078, "step": 375 }, { "batch_size": 4, "epoch": 0.15, "step": 375, "tokens_per_device": 3788 }, { "epoch": 0.15, "loss_ce": 0.061048321425914764, "loss_lvr": 0.8118615746498108, "loss_mode_switch": 0.0, "loss_total": 0.14223447442054749, "step": 375 }, { "batch_size": 4, "epoch": 0.15, "step": 375, "tokens_per_device": 5924 }, { "epoch": 0.15, "loss_ce": 0.19897252321243286, "loss_lvr": 0.9284152984619141, "loss_mode_switch": 0.0, "loss_total": 0.29181405901908875, "step": 375 }, { "batch_size": 4, "epoch": 0.15, "step": 375, "tokens_per_device": 4636 }, { "epoch": 0.15, "loss_ce": 0.23008476197719574, "loss_lvr": 0.7544113397598267, "loss_mode_switch": 0.0, "loss_total": 0.30552589893341064, "step": 375 }, { "batch_size": 4, "epoch": 0.15, "step": 375, "tokens_per_device": 4248 }, { "epoch": 0.15, "loss_ce": 0.4016074240207672, "loss_lvr": 1.0723568201065063, "loss_mode_switch": 0.0, "loss_total": 0.5088431239128113, "step": 375 }, { "batch_size": 4, "epoch": 0.15, "step": 375, "tokens_per_device": 2692 }, { "epoch": 0.15, "loss_ce": 0.5875147581100464, "loss_lvr": 0.9407359957695007, "loss_mode_switch": 0.0, "loss_total": 0.681588351726532, "step": 375 }, { "batch_size": 4, "epoch": 0.15, "step": 375, "tokens_per_device": 5260 }, { "epoch": 0.15, "loss_ce": 0.4223131537437439, "loss_lvr": 0.675005316734314, "loss_mode_switch": 0.0, "loss_total": 0.4898136854171753, "step": 375 }, { "batch_size": 4, "epoch": 0.15, "step": 375, "tokens_per_device": 2588 }, { "epoch": 0.15, "loss_ce": 0.21427451074123383, "loss_lvr": 1.0230209827423096, "loss_mode_switch": 0.0, "loss_total": 0.31657660007476807, "step": 375 }, { "epoch": 0.1504, "grad_norm": 1.4282768964767456, "learning_rate": 9.624647677922143e-06, "loss": 0.2979, "step": 376 }, { "batch_size": 1, "epoch": 0.1504, "step": 376, "tokens_per_device": 5175 }, { "epoch": 0.1504, "loss_ce": 0.007071895524859428, "loss_lvr": 1.0379465818405151, "loss_mode_switch": 0.0, "loss_total": 0.11086655408143997, "step": 376 }, { "batch_size": 4, "epoch": 0.1504, "step": 376, "tokens_per_device": 1400 }, { "epoch": 0.1504, "loss_ce": 0.6094762086868286, "loss_lvr": 1.1167280673980713, "loss_mode_switch": 0.0, "loss_total": 0.7211490273475647, "step": 376 }, { "batch_size": 4, "epoch": 0.1504, "step": 376, "tokens_per_device": 3788 }, { "epoch": 0.1504, "loss_ce": 0.13145260512828827, "loss_lvr": 0.9781451225280762, "loss_mode_switch": 0.0, "loss_total": 0.22926712036132812, "step": 376 }, { "batch_size": 4, "epoch": 0.1504, "step": 376, "tokens_per_device": 9232 }, { "epoch": 0.1504, "loss_ce": 0.3360655903816223, "loss_lvr": 0.936113715171814, "loss_mode_switch": 0.0, "loss_total": 0.42967694997787476, "step": 376 }, { "batch_size": 1, "epoch": 0.1504, "step": 376, "tokens_per_device": 5134 }, { "epoch": 0.1504, "loss_ce": 0.36539947986602783, "loss_lvr": 0.8596649765968323, "loss_mode_switch": 0.0, "loss_total": 0.45136597752571106, "step": 376 }, { "batch_size": 1, "epoch": 0.1504, "step": 376, "tokens_per_device": 4737 }, { "epoch": 0.1504, "loss_ce": 0.3737892210483551, "loss_lvr": 1.1328859329223633, "loss_mode_switch": 0.0, "loss_total": 0.48707783222198486, "step": 376 }, { "batch_size": 1, "epoch": 0.1504, "step": 376, "tokens_per_device": 5111 }, { "epoch": 0.1504, "loss_ce": 0.12153702974319458, "loss_lvr": 0.7539200782775879, "loss_mode_switch": 0.0, "loss_total": 0.19692903757095337, "step": 376 }, { "batch_size": 1, "epoch": 0.1504, "step": 376, "tokens_per_device": 5166 }, { "epoch": 0.1504, "loss_ce": 0.015783723443746567, "loss_lvr": 0.5118377804756165, "loss_mode_switch": 0.0, "loss_total": 0.06696750223636627, "step": 376 }, { "epoch": 0.1508, "grad_norm": 1.6517524719238281, "learning_rate": 9.622181445966539e-06, "loss": 0.3833, "step": 377 }, { "batch_size": 4, "epoch": 0.1508, "step": 377, "tokens_per_device": 4440 }, { "epoch": 0.1508, "loss_ce": 0.3065183758735657, "loss_lvr": 1.116408348083496, "loss_mode_switch": 0.0, "loss_total": 0.41815921664237976, "step": 377 }, { "batch_size": 1, "epoch": 0.1508, "step": 377, "tokens_per_device": 4993 }, { "epoch": 0.1508, "loss_ce": 0.002549265744164586, "loss_lvr": 0.5137568116188049, "loss_mode_switch": 0.0, "loss_total": 0.05392494797706604, "step": 377 }, { "batch_size": 1, "epoch": 0.1508, "step": 377, "tokens_per_device": 4935 }, { "epoch": 0.1508, "loss_ce": 0.14791540801525116, "loss_lvr": 0.5367274284362793, "loss_mode_switch": 0.0, "loss_total": 0.20158815383911133, "step": 377 }, { "batch_size": 1, "epoch": 0.1508, "step": 377, "tokens_per_device": 4750 }, { "epoch": 0.1508, "loss_ce": 0.010112136602401733, "loss_lvr": 0.6195278167724609, "loss_mode_switch": 0.0, "loss_total": 0.07206492125988007, "step": 377 }, { "batch_size": 4, "epoch": 0.1508, "step": 377, "tokens_per_device": 2796 }, { "epoch": 0.1508, "loss_ce": 0.23343141376972198, "loss_lvr": 0.8862071633338928, "loss_mode_switch": 0.0, "loss_total": 0.32205212116241455, "step": 377 }, { "batch_size": 1, "epoch": 0.1508, "step": 377, "tokens_per_device": 4912 }, { "epoch": 0.1508, "loss_ce": 0.18257158994674683, "loss_lvr": 0.5073240399360657, "loss_mode_switch": 0.0, "loss_total": 0.2333039939403534, "step": 377 }, { "batch_size": 4, "epoch": 0.1508, "step": 377, "tokens_per_device": 4244 }, { "epoch": 0.1508, "loss_ce": 0.11960916966199875, "loss_lvr": 0.9778361916542053, "loss_mode_switch": 0.0, "loss_total": 0.21739278733730316, "step": 377 }, { "batch_size": 4, "epoch": 0.1508, "step": 377, "tokens_per_device": 4868 }, { "epoch": 0.1508, "loss_ce": 0.12150024622678757, "loss_lvr": 0.7943480014801025, "loss_mode_switch": 0.0, "loss_total": 0.20093505084514618, "step": 377 }, { "epoch": 0.1512, "grad_norm": 1.2428373098373413, "learning_rate": 9.619707456485848e-06, "loss": 0.3064, "step": 378 }, { "batch_size": 4, "epoch": 0.1512, "step": 378, "tokens_per_device": 15540 }, { "epoch": 0.1512, "loss_ce": 0.13886022567749023, "loss_lvr": 1.2001737356185913, "loss_mode_switch": 0.0, "loss_total": 0.25887760519981384, "step": 378 }, { "batch_size": 1, "epoch": 0.1512, "step": 378, "tokens_per_device": 5179 }, { "epoch": 0.1512, "loss_ce": 0.0496356226503849, "loss_lvr": 0.38477426767349243, "loss_mode_switch": 0.0, "loss_total": 0.08811305463314056, "step": 378 }, { "batch_size": 1, "epoch": 0.1512, "step": 378, "tokens_per_device": 5106 }, { "epoch": 0.1512, "loss_ce": 0.0007487069815397263, "loss_lvr": 0.5882166624069214, "loss_mode_switch": 0.0, "loss_total": 0.059570372104644775, "step": 378 }, { "batch_size": 4, "epoch": 0.1512, "step": 378, "tokens_per_device": 2592 }, { "epoch": 0.1512, "loss_ce": 0.456285297870636, "loss_lvr": 1.1152974367141724, "loss_mode_switch": 0.0, "loss_total": 0.5678150653839111, "step": 378 }, { "batch_size": 4, "epoch": 0.1512, "step": 378, "tokens_per_device": 5568 }, { "epoch": 0.1512, "loss_ce": 0.203728586435318, "loss_lvr": 1.206106424331665, "loss_mode_switch": 0.0, "loss_total": 0.32433924078941345, "step": 378 }, { "batch_size": 4, "epoch": 0.1512, "step": 378, "tokens_per_device": 2672 }, { "epoch": 0.1512, "loss_ce": 0.09004175662994385, "loss_lvr": 0.6977189183235168, "loss_mode_switch": 0.0, "loss_total": 0.15981364250183105, "step": 378 }, { "batch_size": 4, "epoch": 0.1512, "step": 378, "tokens_per_device": 1460 }, { "epoch": 0.1512, "loss_ce": 0.22990593314170837, "loss_lvr": 1.0702104568481445, "loss_mode_switch": 0.0, "loss_total": 0.33692699670791626, "step": 378 }, { "batch_size": 4, "epoch": 0.1512, "step": 378, "tokens_per_device": 1380 }, { "epoch": 0.1512, "loss_ce": 0.32683587074279785, "loss_lvr": 1.819747805595398, "loss_mode_switch": 0.0, "loss_total": 0.5088106393814087, "step": 378 }, { "epoch": 0.1516, "grad_norm": 1.5547850131988525, "learning_rate": 9.61722571363223e-06, "loss": 0.3653, "step": 379 }, { "batch_size": 1, "epoch": 0.1516, "step": 379, "tokens_per_device": 4960 }, { "epoch": 0.1516, "loss_ce": 0.004776874557137489, "loss_lvr": 0.38684016466140747, "loss_mode_switch": 0.0, "loss_total": 0.04346089065074921, "step": 379 }, { "batch_size": 4, "epoch": 0.1516, "step": 379, "tokens_per_device": 2700 }, { "epoch": 0.1516, "loss_ce": 0.6905608773231506, "loss_lvr": 0.9271300435066223, "loss_mode_switch": 0.0, "loss_total": 0.7832738757133484, "step": 379 }, { "batch_size": 1, "epoch": 0.1516, "step": 379, "tokens_per_device": 4862 }, { "epoch": 0.1516, "loss_ce": 0.0006260563968680799, "loss_lvr": 0.9026930928230286, "loss_mode_switch": 0.0, "loss_total": 0.09089536964893341, "step": 379 }, { "batch_size": 4, "epoch": 0.1516, "step": 379, "tokens_per_device": 2584 }, { "epoch": 0.1516, "loss_ce": 0.2027026116847992, "loss_lvr": 0.9659826159477234, "loss_mode_switch": 0.0, "loss_total": 0.299300879240036, "step": 379 }, { "batch_size": 4, "epoch": 0.1516, "step": 379, "tokens_per_device": 8432 }, { "epoch": 0.1516, "loss_ce": 0.1974620521068573, "loss_lvr": 0.6316545605659485, "loss_mode_switch": 0.0, "loss_total": 0.26062750816345215, "step": 379 }, { "batch_size": 4, "epoch": 0.1516, "step": 379, "tokens_per_device": 7108 }, { "epoch": 0.1516, "loss_ce": 0.021160397678613663, "loss_lvr": 0.9101309180259705, "loss_mode_switch": 0.0, "loss_total": 0.11217349767684937, "step": 379 }, { "batch_size": 4, "epoch": 0.1516, "step": 379, "tokens_per_device": 4224 }, { "epoch": 0.1516, "loss_ce": 0.5570511817932129, "loss_lvr": 0.8437231779098511, "loss_mode_switch": 0.0, "loss_total": 0.6414235234260559, "step": 379 }, { "batch_size": 4, "epoch": 0.1516, "step": 379, "tokens_per_device": 3768 }, { "epoch": 0.1516, "loss_ce": 0.7994076013565063, "loss_lvr": 1.1578885316848755, "loss_mode_switch": 0.0, "loss_total": 0.9151964783668518, "step": 379 }, { "epoch": 0.152, "grad_norm": 1.4839072227478027, "learning_rate": 9.61473622157086e-06, "loss": 0.3575, "step": 380 }, { "batch_size": 4, "epoch": 0.152, "step": 380, "tokens_per_device": 10008 }, { "epoch": 0.152, "loss_ce": 0.11303387582302094, "loss_lvr": 0.5867375135421753, "loss_mode_switch": 0.0, "loss_total": 0.1717076301574707, "step": 380 }, { "batch_size": 1, "epoch": 0.152, "step": 380, "tokens_per_device": 4656 }, { "epoch": 0.152, "loss_ce": 0.0038114795461297035, "loss_lvr": 0.7506209015846252, "loss_mode_switch": 0.0, "loss_total": 0.07887356728315353, "step": 380 }, { "batch_size": 4, "epoch": 0.152, "step": 380, "tokens_per_device": 4388 }, { "epoch": 0.152, "loss_ce": 0.11318518966436386, "loss_lvr": 1.4608439207077026, "loss_mode_switch": 0.0, "loss_total": 0.2592695951461792, "step": 380 }, { "batch_size": 4, "epoch": 0.152, "step": 380, "tokens_per_device": 2712 }, { "epoch": 0.152, "loss_ce": 0.1255417913198471, "loss_lvr": 1.0004515647888184, "loss_mode_switch": 0.0, "loss_total": 0.22558695077896118, "step": 380 }, { "batch_size": 4, "epoch": 0.152, "step": 380, "tokens_per_device": 8856 }, { "epoch": 0.152, "loss_ce": 0.04914809763431549, "loss_lvr": 0.6655860543251038, "loss_mode_switch": 0.0, "loss_total": 0.11570670455694199, "step": 380 }, { "batch_size": 1, "epoch": 0.152, "step": 380, "tokens_per_device": 5200 }, { "epoch": 0.152, "loss_ce": 0.010263788513839245, "loss_lvr": 0.3762179911136627, "loss_mode_switch": 0.0, "loss_total": 0.04788558930158615, "step": 380 }, { "batch_size": 1, "epoch": 0.152, "step": 380, "tokens_per_device": 5113 }, { "epoch": 0.152, "loss_ce": 0.017210859805345535, "loss_lvr": 0.6449829339981079, "loss_mode_switch": 0.0, "loss_total": 0.08170916140079498, "step": 380 }, { "batch_size": 4, "epoch": 0.152, "step": 380, "tokens_per_device": 8256 }, { "epoch": 0.152, "loss_ce": 0.11480610817670822, "loss_lvr": 1.0094369649887085, "loss_mode_switch": 0.0, "loss_total": 0.2157498002052307, "step": 380 }, { "epoch": 0.1524, "grad_norm": 1.3540719747543335, "learning_rate": 9.61223898447991e-06, "loss": 0.3092, "step": 381 }, { "batch_size": 1, "epoch": 0.1524, "step": 381, "tokens_per_device": 5095 }, { "epoch": 0.1524, "loss_ce": 0.00313164503313601, "loss_lvr": 0.4902135729789734, "loss_mode_switch": 0.0, "loss_total": 0.052153006196022034, "step": 381 }, { "batch_size": 1, "epoch": 0.1524, "step": 381, "tokens_per_device": 4892 }, { "epoch": 0.1524, "loss_ce": 0.0065734535455703735, "loss_lvr": 1.0669716596603394, "loss_mode_switch": 0.0, "loss_total": 0.11327061802148819, "step": 381 }, { "batch_size": 4, "epoch": 0.1524, "step": 381, "tokens_per_device": 1320 }, { "epoch": 0.1524, "loss_ce": 0.25312158465385437, "loss_lvr": 1.073249101638794, "loss_mode_switch": 0.0, "loss_total": 0.3604465126991272, "step": 381 }, { "batch_size": 4, "epoch": 0.1524, "step": 381, "tokens_per_device": 4304 }, { "epoch": 0.1524, "loss_ce": 0.439175546169281, "loss_lvr": 1.0793766975402832, "loss_mode_switch": 0.0, "loss_total": 0.5471132397651672, "step": 381 }, { "batch_size": 4, "epoch": 0.1524, "step": 381, "tokens_per_device": 4292 }, { "epoch": 0.1524, "loss_ce": 0.021295469254255295, "loss_lvr": 1.0256474018096924, "loss_mode_switch": 0.0, "loss_total": 0.12386021018028259, "step": 381 }, { "batch_size": 4, "epoch": 0.1524, "step": 381, "tokens_per_device": 7020 }, { "epoch": 0.1524, "loss_ce": 0.15062184631824493, "loss_lvr": 0.8505524396896362, "loss_mode_switch": 0.0, "loss_total": 0.2356770932674408, "step": 381 }, { "batch_size": 4, "epoch": 0.1524, "step": 381, "tokens_per_device": 1464 }, { "epoch": 0.1524, "loss_ce": 0.3859783113002777, "loss_lvr": 1.1424330472946167, "loss_mode_switch": 0.0, "loss_total": 0.5002216100692749, "step": 381 }, { "batch_size": 4, "epoch": 0.1524, "step": 381, "tokens_per_device": 3248 }, { "epoch": 0.1524, "loss_ce": 0.34358519315719604, "loss_lvr": 1.0695464611053467, "loss_mode_switch": 0.0, "loss_total": 0.45053982734680176, "step": 381 }, { "epoch": 0.1528, "grad_norm": 1.8080116510391235, "learning_rate": 9.609734006550562e-06, "loss": 0.3114, "step": 382 }, { "batch_size": 4, "epoch": 0.1528, "step": 382, "tokens_per_device": 4184 }, { "epoch": 0.1528, "loss_ce": 0.28707411885261536, "loss_lvr": 0.7699311375617981, "loss_mode_switch": 0.0, "loss_total": 0.3640672266483307, "step": 382 }, { "batch_size": 4, "epoch": 0.1528, "step": 382, "tokens_per_device": 2728 }, { "epoch": 0.1528, "loss_ce": 0.20855417847633362, "loss_lvr": 0.8040369749069214, "loss_mode_switch": 0.0, "loss_total": 0.2889578938484192, "step": 382 }, { "batch_size": 4, "epoch": 0.1528, "step": 382, "tokens_per_device": 1196 }, { "epoch": 0.1528, "loss_ce": 0.22526764869689941, "loss_lvr": 1.2839248180389404, "loss_mode_switch": 0.0, "loss_total": 0.35366013646125793, "step": 382 }, { "batch_size": 4, "epoch": 0.1528, "step": 382, "tokens_per_device": 5832 }, { "epoch": 0.1528, "loss_ce": 0.270204097032547, "loss_lvr": 0.570810854434967, "loss_mode_switch": 0.0, "loss_total": 0.32728517055511475, "step": 382 }, { "batch_size": 4, "epoch": 0.1528, "step": 382, "tokens_per_device": 3008 }, { "epoch": 0.1528, "loss_ce": 0.44057559967041016, "loss_lvr": 0.9847935438156128, "loss_mode_switch": 0.0, "loss_total": 0.5390549302101135, "step": 382 }, { "batch_size": 1, "epoch": 0.1528, "step": 382, "tokens_per_device": 5121 }, { "epoch": 0.1528, "loss_ce": 0.016178051009774208, "loss_lvr": 0.3060544729232788, "loss_mode_switch": 0.0, "loss_total": 0.04678349941968918, "step": 382 }, { "batch_size": 4, "epoch": 0.1528, "step": 382, "tokens_per_device": 3880 }, { "epoch": 0.1528, "loss_ce": 0.027881359681487083, "loss_lvr": 1.012843370437622, "loss_mode_switch": 0.0, "loss_total": 0.12916569411754608, "step": 382 }, { "batch_size": 4, "epoch": 0.1528, "step": 382, "tokens_per_device": 14192 }, { "epoch": 0.1528, "loss_ce": 0.37863588333129883, "loss_lvr": 0.9453637599945068, "loss_mode_switch": 0.0, "loss_total": 0.47317224740982056, "step": 382 }, { "epoch": 0.1532, "grad_norm": 1.7540521621704102, "learning_rate": 9.607221291986983e-06, "loss": 0.2952, "step": 383 }, { "batch_size": 1, "epoch": 0.1532, "step": 383, "tokens_per_device": 4903 }, { "epoch": 0.1532, "loss_ce": 0.20552517473697662, "loss_lvr": 0.48902541399002075, "loss_mode_switch": 0.0, "loss_total": 0.2544277310371399, "step": 383 }, { "batch_size": 4, "epoch": 0.1532, "step": 383, "tokens_per_device": 7012 }, { "epoch": 0.1532, "loss_ce": 0.11948857456445694, "loss_lvr": 0.401028037071228, "loss_mode_switch": 0.0, "loss_total": 0.15959137678146362, "step": 383 }, { "batch_size": 4, "epoch": 0.1532, "step": 383, "tokens_per_device": 4228 }, { "epoch": 0.1532, "loss_ce": 0.061633892357349396, "loss_lvr": 1.2722439765930176, "loss_mode_switch": 0.0, "loss_total": 0.188858300447464, "step": 383 }, { "batch_size": 1, "epoch": 0.1532, "step": 383, "tokens_per_device": 5233 }, { "epoch": 0.1532, "loss_ce": 0.09359592199325562, "loss_lvr": 0.5330105423927307, "loss_mode_switch": 0.0, "loss_total": 0.14689697325229645, "step": 383 }, { "batch_size": 4, "epoch": 0.1532, "step": 383, "tokens_per_device": 10408 }, { "epoch": 0.1532, "loss_ce": 0.0277717188000679, "loss_lvr": 0.7356839179992676, "loss_mode_switch": 0.0, "loss_total": 0.10134011507034302, "step": 383 }, { "batch_size": 4, "epoch": 0.1532, "step": 383, "tokens_per_device": 13932 }, { "epoch": 0.1532, "loss_ce": 0.04064028710126877, "loss_lvr": 0.8693975210189819, "loss_mode_switch": 0.0, "loss_total": 0.12758004665374756, "step": 383 }, { "batch_size": 4, "epoch": 0.1532, "step": 383, "tokens_per_device": 4520 }, { "epoch": 0.1532, "loss_ce": 0.35381391644477844, "loss_lvr": 1.1427783966064453, "loss_mode_switch": 0.0, "loss_total": 0.468091756105423, "step": 383 }, { "batch_size": 4, "epoch": 0.1532, "step": 383, "tokens_per_device": 5400 }, { "epoch": 0.1532, "loss_ce": 0.067206472158432, "loss_lvr": 1.0448696613311768, "loss_mode_switch": 0.0, "loss_total": 0.17169344425201416, "step": 383 }, { "epoch": 0.1536, "grad_norm": 1.4569895267486572, "learning_rate": 9.604700845006326e-06, "loss": 0.2922, "step": 384 }, { "batch_size": 1, "epoch": 0.1536, "step": 384, "tokens_per_device": 5164 }, { "epoch": 0.1536, "loss_ce": 0.0010618247324600816, "loss_lvr": 0.3536994159221649, "loss_mode_switch": 0.0, "loss_total": 0.03643176704645157, "step": 384 }, { "batch_size": 1, "epoch": 0.1536, "step": 384, "tokens_per_device": 5022 }, { "epoch": 0.1536, "loss_ce": 0.0207492895424366, "loss_lvr": 0.16472755372524261, "loss_mode_switch": 0.0, "loss_total": 0.03722204267978668, "step": 384 }, { "batch_size": 4, "epoch": 0.1536, "step": 384, "tokens_per_device": 1312 }, { "epoch": 0.1536, "loss_ce": 0.5854791402816772, "loss_lvr": 1.2051093578338623, "loss_mode_switch": 0.0, "loss_total": 0.7059900760650635, "step": 384 }, { "batch_size": 1, "epoch": 0.1536, "step": 384, "tokens_per_device": 5506 }, { "epoch": 0.1536, "loss_ce": 0.019228067249059677, "loss_lvr": 0.3868545591831207, "loss_mode_switch": 0.0, "loss_total": 0.05791352316737175, "step": 384 }, { "batch_size": 4, "epoch": 0.1536, "step": 384, "tokens_per_device": 4144 }, { "epoch": 0.1536, "loss_ce": 0.1877896934747696, "loss_lvr": 0.9962294697761536, "loss_mode_switch": 0.0, "loss_total": 0.2874126434326172, "step": 384 }, { "batch_size": 4, "epoch": 0.1536, "step": 384, "tokens_per_device": 5572 }, { "epoch": 0.1536, "loss_ce": 0.44018447399139404, "loss_lvr": 1.1904791593551636, "loss_mode_switch": 0.0, "loss_total": 0.5592324137687683, "step": 384 }, { "batch_size": 4, "epoch": 0.1536, "step": 384, "tokens_per_device": 2788 }, { "epoch": 0.1536, "loss_ce": 0.5988858342170715, "loss_lvr": 1.109115719795227, "loss_mode_switch": 0.0, "loss_total": 0.7097973823547363, "step": 384 }, { "batch_size": 4, "epoch": 0.1536, "step": 384, "tokens_per_device": 3760 }, { "epoch": 0.1536, "loss_ce": 0.26396825909614563, "loss_lvr": 1.205643892288208, "loss_mode_switch": 0.0, "loss_total": 0.3845326602458954, "step": 384 }, { "epoch": 0.154, "grad_norm": 1.4108011722564697, "learning_rate": 9.602172669838721e-06, "loss": 0.343, "step": 385 }, { "batch_size": 4, "epoch": 0.154, "step": 385, "tokens_per_device": 4464 }, { "epoch": 0.154, "loss_ce": 0.2959626615047455, "loss_lvr": 2.3101840019226074, "loss_mode_switch": 0.0, "loss_total": 0.5269810557365417, "step": 385 }, { "batch_size": 4, "epoch": 0.154, "step": 385, "tokens_per_device": 6120 }, { "epoch": 0.154, "loss_ce": 0.557555079460144, "loss_lvr": 0.8658533096313477, "loss_mode_switch": 0.0, "loss_total": 0.6441404223442078, "step": 385 }, { "batch_size": 4, "epoch": 0.154, "step": 385, "tokens_per_device": 9736 }, { "epoch": 0.154, "loss_ce": 0.33992278575897217, "loss_lvr": 1.0530571937561035, "loss_mode_switch": 0.0, "loss_total": 0.4452285170555115, "step": 385 }, { "batch_size": 1, "epoch": 0.154, "step": 385, "tokens_per_device": 5089 }, { "epoch": 0.154, "loss_ce": 0.7911924719810486, "loss_lvr": 1.1019643545150757, "loss_mode_switch": 0.0, "loss_total": 0.9013888835906982, "step": 385 }, { "batch_size": 4, "epoch": 0.154, "step": 385, "tokens_per_device": 3824 }, { "epoch": 0.154, "loss_ce": 0.20692025125026703, "loss_lvr": 1.0141180753707886, "loss_mode_switch": 0.0, "loss_total": 0.30833205580711365, "step": 385 }, { "batch_size": 4, "epoch": 0.154, "step": 385, "tokens_per_device": 1492 }, { "epoch": 0.154, "loss_ce": 0.11829187721014023, "loss_lvr": 1.2665095329284668, "loss_mode_switch": 0.0, "loss_total": 0.24494284391403198, "step": 385 }, { "batch_size": 4, "epoch": 0.154, "step": 385, "tokens_per_device": 4528 }, { "epoch": 0.154, "loss_ce": 0.09184082597494125, "loss_lvr": 0.6763695478439331, "loss_mode_switch": 0.0, "loss_total": 0.15947778522968292, "step": 385 }, { "batch_size": 4, "epoch": 0.154, "step": 385, "tokens_per_device": 5864 }, { "epoch": 0.154, "loss_ce": 0.357024610042572, "loss_lvr": 0.866159200668335, "loss_mode_switch": 0.0, "loss_total": 0.4436405301094055, "step": 385 }, { "epoch": 0.1544, "grad_norm": 1.544589877128601, "learning_rate": 9.59963677072727e-06, "loss": 0.3135, "step": 386 }, { "batch_size": 4, "epoch": 0.1544, "step": 386, "tokens_per_device": 4248 }, { "epoch": 0.1544, "loss_ce": 0.44314759969711304, "loss_lvr": 1.9068710803985596, "loss_mode_switch": 0.0, "loss_total": 0.633834719657898, "step": 386 }, { "batch_size": 4, "epoch": 0.1544, "step": 386, "tokens_per_device": 7324 }, { "epoch": 0.1544, "loss_ce": 0.15600791573524475, "loss_lvr": 0.9414993524551392, "loss_mode_switch": 0.0, "loss_total": 0.2501578629016876, "step": 386 }, { "batch_size": 4, "epoch": 0.1544, "step": 386, "tokens_per_device": 2568 }, { "epoch": 0.1544, "loss_ce": 0.28324654698371887, "loss_lvr": 1.4436757564544678, "loss_mode_switch": 0.0, "loss_total": 0.42761412262916565, "step": 386 }, { "batch_size": 4, "epoch": 0.1544, "step": 386, "tokens_per_device": 6744 }, { "epoch": 0.1544, "loss_ce": 0.0803990364074707, "loss_lvr": 0.7060446739196777, "loss_mode_switch": 0.0, "loss_total": 0.15100350975990295, "step": 386 }, { "batch_size": 1, "epoch": 0.1544, "step": 386, "tokens_per_device": 5131 }, { "epoch": 0.1544, "loss_ce": 0.11208062618970871, "loss_lvr": 0.850426435470581, "loss_mode_switch": 0.0, "loss_total": 0.19712327420711517, "step": 386 }, { "batch_size": 1, "epoch": 0.1544, "step": 386, "tokens_per_device": 4900 }, { "epoch": 0.1544, "loss_ce": 0.12480760365724564, "loss_lvr": 1.1409718990325928, "loss_mode_switch": 0.0, "loss_total": 0.23890480399131775, "step": 386 }, { "batch_size": 4, "epoch": 0.1544, "step": 386, "tokens_per_device": 4364 }, { "epoch": 0.1544, "loss_ce": 0.0938669815659523, "loss_lvr": 0.8715136647224426, "loss_mode_switch": 0.0, "loss_total": 0.18101835250854492, "step": 386 }, { "batch_size": 4, "epoch": 0.1544, "step": 386, "tokens_per_device": 3796 }, { "epoch": 0.1544, "loss_ce": 0.055170007050037384, "loss_lvr": 0.9596611261367798, "loss_mode_switch": 0.0, "loss_total": 0.1511361300945282, "step": 386 }, { "epoch": 0.1548, "grad_norm": 1.3348270654678345, "learning_rate": 9.597093151928035e-06, "loss": 0.3205, "step": 387 }, { "batch_size": 1, "epoch": 0.1548, "step": 387, "tokens_per_device": 5062 }, { "epoch": 0.1548, "loss_ce": 0.0210344847291708, "loss_lvr": 0.6608169078826904, "loss_mode_switch": 0.0, "loss_total": 0.08711618185043335, "step": 387 }, { "batch_size": 4, "epoch": 0.1548, "step": 387, "tokens_per_device": 6100 }, { "epoch": 0.1548, "loss_ce": 0.41319331526756287, "loss_lvr": 0.9706212878227234, "loss_mode_switch": 0.0, "loss_total": 0.5102554559707642, "step": 387 }, { "batch_size": 4, "epoch": 0.1548, "step": 387, "tokens_per_device": 4512 }, { "epoch": 0.1548, "loss_ce": 0.17561063170433044, "loss_lvr": 0.7776727676391602, "loss_mode_switch": 0.0, "loss_total": 0.25337791442871094, "step": 387 }, { "batch_size": 4, "epoch": 0.1548, "step": 387, "tokens_per_device": 7648 }, { "epoch": 0.1548, "loss_ce": 0.20547759532928467, "loss_lvr": 1.14409339427948, "loss_mode_switch": 0.0, "loss_total": 0.3198869228363037, "step": 387 }, { "batch_size": 4, "epoch": 0.1548, "step": 387, "tokens_per_device": 6528 }, { "epoch": 0.1548, "loss_ce": 0.009448698721826077, "loss_lvr": 0.8403939604759216, "loss_mode_switch": 0.0, "loss_total": 0.09348809719085693, "step": 387 }, { "batch_size": 4, "epoch": 0.1548, "step": 387, "tokens_per_device": 4332 }, { "epoch": 0.1548, "loss_ce": 0.25089144706726074, "loss_lvr": 1.087294578552246, "loss_mode_switch": 0.0, "loss_total": 0.3596208989620209, "step": 387 }, { "batch_size": 1, "epoch": 0.1548, "step": 387, "tokens_per_device": 5059 }, { "epoch": 0.1548, "loss_ce": 0.6451095342636108, "loss_lvr": 0.30963194370269775, "loss_mode_switch": 0.0, "loss_total": 0.6760727167129517, "step": 387 }, { "batch_size": 4, "epoch": 0.1548, "step": 387, "tokens_per_device": 4396 }, { "epoch": 0.1548, "loss_ce": 0.5336665511131287, "loss_lvr": 1.076248049736023, "loss_mode_switch": 0.0, "loss_total": 0.6412913799285889, "step": 387 }, { "epoch": 0.1552, "grad_norm": 1.7115631103515625, "learning_rate": 9.594541817710039e-06, "loss": 0.3335, "step": 388 }, { "batch_size": 4, "epoch": 0.1552, "step": 388, "tokens_per_device": 5672 }, { "epoch": 0.1552, "loss_ce": 0.029398811981081963, "loss_lvr": 0.762498676776886, "loss_mode_switch": 0.0, "loss_total": 0.10564868152141571, "step": 388 }, { "batch_size": 4, "epoch": 0.1552, "step": 388, "tokens_per_device": 2600 }, { "epoch": 0.1552, "loss_ce": 0.4646508991718292, "loss_lvr": 0.9797332286834717, "loss_mode_switch": 0.0, "loss_total": 0.5626242160797119, "step": 388 }, { "batch_size": 4, "epoch": 0.1552, "step": 388, "tokens_per_device": 1504 }, { "epoch": 0.1552, "loss_ce": 0.35137274861335754, "loss_lvr": 0.9297395348548889, "loss_mode_switch": 0.0, "loss_total": 0.44434669613838196, "step": 388 }, { "batch_size": 4, "epoch": 0.1552, "step": 388, "tokens_per_device": 2696 }, { "epoch": 0.1552, "loss_ce": 0.11128485202789307, "loss_lvr": 0.887860894203186, "loss_mode_switch": 0.0, "loss_total": 0.20007094740867615, "step": 388 }, { "batch_size": 1, "epoch": 0.1552, "step": 388, "tokens_per_device": 5025 }, { "epoch": 0.1552, "loss_ce": 0.2737913131713867, "loss_lvr": 0.2381182163953781, "loss_mode_switch": 0.0, "loss_total": 0.29760313034057617, "step": 388 }, { "batch_size": 4, "epoch": 0.1552, "step": 388, "tokens_per_device": 2664 }, { "epoch": 0.1552, "loss_ce": 0.03374204412102699, "loss_lvr": 0.953533411026001, "loss_mode_switch": 0.0, "loss_total": 0.1290953904390335, "step": 388 }, { "batch_size": 4, "epoch": 0.1552, "step": 388, "tokens_per_device": 3992 }, { "epoch": 0.1552, "loss_ce": 0.30111566185951233, "loss_lvr": 0.8828447461128235, "loss_mode_switch": 0.0, "loss_total": 0.3894001245498657, "step": 388 }, { "batch_size": 4, "epoch": 0.1552, "step": 388, "tokens_per_device": 10724 }, { "epoch": 0.1552, "loss_ce": 0.3839055895805359, "loss_lvr": 0.710784375667572, "loss_mode_switch": 0.0, "loss_total": 0.45498403906822205, "step": 388 }, { "epoch": 0.1556, "grad_norm": 1.589958667755127, "learning_rate": 9.591982772355248e-06, "loss": 0.3561, "step": 389 }, { "batch_size": 4, "epoch": 0.1556, "step": 389, "tokens_per_device": 4596 }, { "epoch": 0.1556, "loss_ce": 0.21356481313705444, "loss_lvr": 0.9272635579109192, "loss_mode_switch": 0.0, "loss_total": 0.3062911629676819, "step": 389 }, { "batch_size": 1, "epoch": 0.1556, "step": 389, "tokens_per_device": 4866 }, { "epoch": 0.1556, "loss_ce": 0.002045656321570277, "loss_lvr": 0.18649853765964508, "loss_mode_switch": 0.0, "loss_total": 0.020695509389042854, "step": 389 }, { "batch_size": 1, "epoch": 0.1556, "step": 389, "tokens_per_device": 5911 }, { "epoch": 0.1556, "loss_ce": 0.09057078510522842, "loss_lvr": 0.6398302912712097, "loss_mode_switch": 0.0, "loss_total": 0.15455381572246552, "step": 389 }, { "batch_size": 4, "epoch": 0.1556, "step": 389, "tokens_per_device": 4752 }, { "epoch": 0.1556, "loss_ce": 0.4892805814743042, "loss_lvr": 1.2667676210403442, "loss_mode_switch": 0.0, "loss_total": 0.6159573793411255, "step": 389 }, { "batch_size": 1, "epoch": 0.1556, "step": 389, "tokens_per_device": 5008 }, { "epoch": 0.1556, "loss_ce": 0.07212672382593155, "loss_lvr": 0.1768343150615692, "loss_mode_switch": 0.0, "loss_total": 0.08981015533208847, "step": 389 }, { "batch_size": 1, "epoch": 0.1556, "step": 389, "tokens_per_device": 5040 }, { "epoch": 0.1556, "loss_ce": 0.564271867275238, "loss_lvr": 0.682481050491333, "loss_mode_switch": 0.0, "loss_total": 0.6325199604034424, "step": 389 }, { "batch_size": 1, "epoch": 0.1556, "step": 389, "tokens_per_device": 5115 }, { "epoch": 0.1556, "loss_ce": 0.00645079743117094, "loss_lvr": 0.5245147347450256, "loss_mode_switch": 0.0, "loss_total": 0.05890227109193802, "step": 389 }, { "batch_size": 1, "epoch": 0.1556, "step": 389, "tokens_per_device": 4935 }, { "epoch": 0.1556, "loss_ce": 0.16725081205368042, "loss_lvr": 0.6597110033035278, "loss_mode_switch": 0.0, "loss_total": 0.23322191834449768, "step": 389 }, { "epoch": 0.156, "grad_norm": 1.4782075881958008, "learning_rate": 9.589416020158577e-06, "loss": 0.3643, "step": 390 }, { "batch_size": 4, "epoch": 0.156, "step": 390, "tokens_per_device": 1380 }, { "epoch": 0.156, "loss_ce": 0.3526991605758667, "loss_lvr": 1.7721154689788818, "loss_mode_switch": 0.0, "loss_total": 0.529910683631897, "step": 390 }, { "batch_size": 4, "epoch": 0.156, "step": 390, "tokens_per_device": 4300 }, { "epoch": 0.156, "loss_ce": 0.23997220396995544, "loss_lvr": 0.896256148815155, "loss_mode_switch": 0.0, "loss_total": 0.3295978307723999, "step": 390 }, { "batch_size": 4, "epoch": 0.156, "step": 390, "tokens_per_device": 2648 }, { "epoch": 0.156, "loss_ce": 0.2712554931640625, "loss_lvr": 0.8783801198005676, "loss_mode_switch": 0.0, "loss_total": 0.3590935170650482, "step": 390 }, { "batch_size": 4, "epoch": 0.156, "step": 390, "tokens_per_device": 3532 }, { "epoch": 0.156, "loss_ce": 0.19859600067138672, "loss_lvr": 0.7652801871299744, "loss_mode_switch": 0.0, "loss_total": 0.2751240134239197, "step": 390 }, { "batch_size": 4, "epoch": 0.156, "step": 390, "tokens_per_device": 3808 }, { "epoch": 0.156, "loss_ce": 0.14707116782665253, "loss_lvr": 1.0316603183746338, "loss_mode_switch": 0.0, "loss_total": 0.25023719668388367, "step": 390 }, { "batch_size": 4, "epoch": 0.156, "step": 390, "tokens_per_device": 2556 }, { "epoch": 0.156, "loss_ce": 0.33689194917678833, "loss_lvr": 1.1790416240692139, "loss_mode_switch": 0.0, "loss_total": 0.45479610562324524, "step": 390 }, { "batch_size": 4, "epoch": 0.156, "step": 390, "tokens_per_device": 4720 }, { "epoch": 0.156, "loss_ce": 0.2260436862707138, "loss_lvr": 0.861229658126831, "loss_mode_switch": 0.0, "loss_total": 0.31216666102409363, "step": 390 }, { "batch_size": 1, "epoch": 0.156, "step": 390, "tokens_per_device": 5342 }, { "epoch": 0.156, "loss_ce": 0.02016947977244854, "loss_lvr": 0.48763447999954224, "loss_mode_switch": 0.0, "loss_total": 0.0689329281449318, "step": 390 }, { "epoch": 0.1564, "grad_norm": 2.880974292755127, "learning_rate": 9.586841565427869e-06, "loss": 0.3622, "step": 391 }, { "batch_size": 4, "epoch": 0.1564, "step": 391, "tokens_per_device": 5156 }, { "epoch": 0.1564, "loss_ce": 0.007719903718680143, "loss_lvr": 0.8572932481765747, "loss_mode_switch": 0.0, "loss_total": 0.09344922751188278, "step": 391 }, { "batch_size": 4, "epoch": 0.1564, "step": 391, "tokens_per_device": 4268 }, { "epoch": 0.1564, "loss_ce": 0.20794175565242767, "loss_lvr": 0.6963508129119873, "loss_mode_switch": 0.0, "loss_total": 0.27757683396339417, "step": 391 }, { "batch_size": 4, "epoch": 0.1564, "step": 391, "tokens_per_device": 4384 }, { "epoch": 0.1564, "loss_ce": 0.32369616627693176, "loss_lvr": 1.120828628540039, "loss_mode_switch": 0.0, "loss_total": 0.43577903509140015, "step": 391 }, { "batch_size": 4, "epoch": 0.1564, "step": 391, "tokens_per_device": 3472 }, { "epoch": 0.1564, "loss_ce": 0.5802494883537292, "loss_lvr": 1.1379380226135254, "loss_mode_switch": 0.0, "loss_total": 0.6940432786941528, "step": 391 }, { "batch_size": 4, "epoch": 0.1564, "step": 391, "tokens_per_device": 1500 }, { "epoch": 0.1564, "loss_ce": 0.5394177436828613, "loss_lvr": 1.1070507764816284, "loss_mode_switch": 0.0, "loss_total": 0.6501228213310242, "step": 391 }, { "batch_size": 4, "epoch": 0.1564, "step": 391, "tokens_per_device": 5576 }, { "epoch": 0.1564, "loss_ce": 0.6947021484375, "loss_lvr": 1.0923432111740112, "loss_mode_switch": 0.0, "loss_total": 0.8039364814758301, "step": 391 }, { "batch_size": 4, "epoch": 0.1564, "step": 391, "tokens_per_device": 6296 }, { "epoch": 0.1564, "loss_ce": 0.38466084003448486, "loss_lvr": 0.5793682932853699, "loss_mode_switch": 0.0, "loss_total": 0.4425976574420929, "step": 391 }, { "batch_size": 1, "epoch": 0.1564, "step": 391, "tokens_per_device": 5070 }, { "epoch": 0.1564, "loss_ce": 0.055685125291347504, "loss_lvr": 0.3980659246444702, "loss_mode_switch": 0.0, "loss_total": 0.09549172222614288, "step": 391 }, { "epoch": 0.1568, "grad_norm": 1.3698124885559082, "learning_rate": 9.584259412483899e-06, "loss": 0.3343, "step": 392 }, { "batch_size": 4, "epoch": 0.1568, "step": 392, "tokens_per_device": 4276 }, { "epoch": 0.1568, "loss_ce": 0.21234172582626343, "loss_lvr": 1.0861799716949463, "loss_mode_switch": 0.0, "loss_total": 0.3209597170352936, "step": 392 }, { "batch_size": 1, "epoch": 0.1568, "step": 392, "tokens_per_device": 5013 }, { "epoch": 0.1568, "loss_ce": 0.029502861201763153, "loss_lvr": 0.5659250020980835, "loss_mode_switch": 0.0, "loss_total": 0.08609536290168762, "step": 392 }, { "batch_size": 4, "epoch": 0.1568, "step": 392, "tokens_per_device": 4888 }, { "epoch": 0.1568, "loss_ce": 0.05654241144657135, "loss_lvr": 1.013005256652832, "loss_mode_switch": 0.0, "loss_total": 0.15784293413162231, "step": 392 }, { "batch_size": 4, "epoch": 0.1568, "step": 392, "tokens_per_device": 4884 }, { "epoch": 0.1568, "loss_ce": 0.9534056782722473, "loss_lvr": 0.8509038686752319, "loss_mode_switch": 0.0, "loss_total": 1.0384960174560547, "step": 392 }, { "batch_size": 1, "epoch": 0.1568, "step": 392, "tokens_per_device": 4980 }, { "epoch": 0.1568, "loss_ce": 0.3377240002155304, "loss_lvr": 0.6350473761558533, "loss_mode_switch": 0.0, "loss_total": 0.40122872591018677, "step": 392 }, { "batch_size": 1, "epoch": 0.1568, "step": 392, "tokens_per_device": 5124 }, { "epoch": 0.1568, "loss_ce": 0.018563680350780487, "loss_lvr": 0.5173007845878601, "loss_mode_switch": 0.0, "loss_total": 0.07029375433921814, "step": 392 }, { "batch_size": 1, "epoch": 0.1568, "step": 392, "tokens_per_device": 5143 }, { "epoch": 0.1568, "loss_ce": 0.29826101660728455, "loss_lvr": 0.5403832197189331, "loss_mode_switch": 0.0, "loss_total": 0.3522993326187134, "step": 392 }, { "batch_size": 4, "epoch": 0.1568, "step": 392, "tokens_per_device": 5552 }, { "epoch": 0.1568, "loss_ce": 0.24893248081207275, "loss_lvr": 0.9714875221252441, "loss_mode_switch": 0.0, "loss_total": 0.3460812270641327, "step": 392 }, { "epoch": 0.1572, "grad_norm": 1.6760573387145996, "learning_rate": 9.58166956566036e-06, "loss": 0.3439, "step": 393 }, { "batch_size": 4, "epoch": 0.1572, "step": 393, "tokens_per_device": 3936 }, { "epoch": 0.1572, "loss_ce": 0.34311407804489136, "loss_lvr": 1.1136748790740967, "loss_mode_switch": 0.0, "loss_total": 0.4544815719127655, "step": 393 }, { "batch_size": 4, "epoch": 0.1572, "step": 393, "tokens_per_device": 5876 }, { "epoch": 0.1572, "loss_ce": 0.09891834855079651, "loss_lvr": 0.8394769430160522, "loss_mode_switch": 0.0, "loss_total": 0.18286603689193726, "step": 393 }, { "batch_size": 4, "epoch": 0.1572, "step": 393, "tokens_per_device": 4164 }, { "epoch": 0.1572, "loss_ce": 0.25128811597824097, "loss_lvr": 1.2561159133911133, "loss_mode_switch": 0.0, "loss_total": 0.37689971923828125, "step": 393 }, { "batch_size": 4, "epoch": 0.1572, "step": 393, "tokens_per_device": 1696 }, { "epoch": 0.1572, "loss_ce": 0.518787145614624, "loss_lvr": 1.0444495677947998, "loss_mode_switch": 0.0, "loss_total": 0.6232321262359619, "step": 393 }, { "batch_size": 4, "epoch": 0.1572, "step": 393, "tokens_per_device": 5816 }, { "epoch": 0.1572, "loss_ce": 0.05342191457748413, "loss_lvr": 0.9781569242477417, "loss_mode_switch": 0.0, "loss_total": 0.1512376070022583, "step": 393 }, { "batch_size": 4, "epoch": 0.1572, "step": 393, "tokens_per_device": 2700 }, { "epoch": 0.1572, "loss_ce": 0.40737971663475037, "loss_lvr": 0.972169816493988, "loss_mode_switch": 0.0, "loss_total": 0.5045967102050781, "step": 393 }, { "batch_size": 4, "epoch": 0.1572, "step": 393, "tokens_per_device": 3828 }, { "epoch": 0.1572, "loss_ce": 0.37698835134506226, "loss_lvr": 0.8642206788063049, "loss_mode_switch": 0.0, "loss_total": 0.4634104371070862, "step": 393 }, { "batch_size": 4, "epoch": 0.1572, "step": 393, "tokens_per_device": 4224 }, { "epoch": 0.1572, "loss_ce": 0.3212089240550995, "loss_lvr": 1.040114402770996, "loss_mode_switch": 0.0, "loss_total": 0.4252203702926636, "step": 393 }, { "epoch": 0.1576, "grad_norm": 1.2944636344909668, "learning_rate": 9.579072029303855e-06, "loss": 0.3202, "step": 394 }, { "batch_size": 4, "epoch": 0.1576, "step": 394, "tokens_per_device": 4004 }, { "epoch": 0.1576, "loss_ce": 0.09002553671598434, "loss_lvr": 0.9379895925521851, "loss_mode_switch": 0.0, "loss_total": 0.18382449448108673, "step": 394 }, { "batch_size": 4, "epoch": 0.1576, "step": 394, "tokens_per_device": 9152 }, { "epoch": 0.1576, "loss_ce": 0.318137526512146, "loss_lvr": 0.8820662498474121, "loss_mode_switch": 0.0, "loss_total": 0.40634414553642273, "step": 394 }, { "batch_size": 1, "epoch": 0.1576, "step": 394, "tokens_per_device": 5231 }, { "epoch": 0.1576, "loss_ce": 0.04493265971541405, "loss_lvr": 0.8747121095657349, "loss_mode_switch": 0.0, "loss_total": 0.13240386545658112, "step": 394 }, { "batch_size": 4, "epoch": 0.1576, "step": 394, "tokens_per_device": 4800 }, { "epoch": 0.1576, "loss_ce": 0.5262687802314758, "loss_lvr": 0.8954148888587952, "loss_mode_switch": 0.0, "loss_total": 0.6158102750778198, "step": 394 }, { "batch_size": 1, "epoch": 0.1576, "step": 394, "tokens_per_device": 4888 }, { "epoch": 0.1576, "loss_ce": 0.010583173483610153, "loss_lvr": 0.3240295946598053, "loss_mode_switch": 0.0, "loss_total": 0.04298613220453262, "step": 394 }, { "batch_size": 4, "epoch": 0.1576, "step": 394, "tokens_per_device": 3812 }, { "epoch": 0.1576, "loss_ce": 0.16252657771110535, "loss_lvr": 1.125172734260559, "loss_mode_switch": 0.0, "loss_total": 0.2750438451766968, "step": 394 }, { "batch_size": 4, "epoch": 0.1576, "step": 394, "tokens_per_device": 4384 }, { "epoch": 0.1576, "loss_ce": 0.5230069756507874, "loss_lvr": 1.2454472780227661, "loss_mode_switch": 0.0, "loss_total": 0.6475517153739929, "step": 394 }, { "batch_size": 1, "epoch": 0.1576, "step": 394, "tokens_per_device": 5130 }, { "epoch": 0.1576, "loss_ce": 0.14122210443019867, "loss_lvr": 0.8166133165359497, "loss_mode_switch": 0.0, "loss_total": 0.2228834331035614, "step": 394 }, { "epoch": 0.158, "grad_norm": 1.3134692907333374, "learning_rate": 9.5764668077739e-06, "loss": 0.2871, "step": 395 }, { "batch_size": 4, "epoch": 0.158, "step": 395, "tokens_per_device": 1528 }, { "epoch": 0.158, "loss_ce": 0.6522037982940674, "loss_lvr": 1.0774706602096558, "loss_mode_switch": 0.0, "loss_total": 0.7599508762359619, "step": 395 }, { "batch_size": 4, "epoch": 0.158, "step": 395, "tokens_per_device": 1492 }, { "epoch": 0.158, "loss_ce": 0.12968683242797852, "loss_lvr": 1.1868523359298706, "loss_mode_switch": 0.0, "loss_total": 0.24837207794189453, "step": 395 }, { "batch_size": 4, "epoch": 0.158, "step": 395, "tokens_per_device": 3948 }, { "epoch": 0.158, "loss_ce": 0.12998072803020477, "loss_lvr": 1.8347396850585938, "loss_mode_switch": 0.0, "loss_total": 0.31345468759536743, "step": 395 }, { "batch_size": 4, "epoch": 0.158, "step": 395, "tokens_per_device": 12460 }, { "epoch": 0.158, "loss_ce": 0.5473818182945251, "loss_lvr": 1.7612916231155396, "loss_mode_switch": 0.0, "loss_total": 0.7235109806060791, "step": 395 }, { "batch_size": 4, "epoch": 0.158, "step": 395, "tokens_per_device": 4296 }, { "epoch": 0.158, "loss_ce": 0.14061543345451355, "loss_lvr": 0.7983748316764832, "loss_mode_switch": 0.0, "loss_total": 0.2204529196023941, "step": 395 }, { "batch_size": 1, "epoch": 0.158, "step": 395, "tokens_per_device": 4863 }, { "epoch": 0.158, "loss_ce": 0.05375145375728607, "loss_lvr": 0.5234741568565369, "loss_mode_switch": 0.0, "loss_total": 0.10609887540340424, "step": 395 }, { "batch_size": 1, "epoch": 0.158, "step": 395, "tokens_per_device": 5063 }, { "epoch": 0.158, "loss_ce": 0.005019719712436199, "loss_lvr": 0.8890277743339539, "loss_mode_switch": 0.0, "loss_total": 0.09392249584197998, "step": 395 }, { "batch_size": 4, "epoch": 0.158, "step": 395, "tokens_per_device": 4680 }, { "epoch": 0.158, "loss_ce": 0.41829049587249756, "loss_lvr": 0.6231762170791626, "loss_mode_switch": 0.0, "loss_total": 0.48060810565948486, "step": 395 }, { "epoch": 0.1584, "grad_norm": 1.3265801668167114, "learning_rate": 9.573853905442899e-06, "loss": 0.3424, "step": 396 }, { "batch_size": 4, "epoch": 0.1584, "step": 396, "tokens_per_device": 3956 }, { "epoch": 0.1584, "loss_ce": 0.2945154905319214, "loss_lvr": 0.8996855020523071, "loss_mode_switch": 0.0, "loss_total": 0.38448405265808105, "step": 396 }, { "batch_size": 1, "epoch": 0.1584, "step": 396, "tokens_per_device": 5164 }, { "epoch": 0.1584, "loss_ce": 0.017182644456624985, "loss_lvr": 0.5170037746429443, "loss_mode_switch": 0.0, "loss_total": 0.0688830241560936, "step": 396 }, { "batch_size": 1, "epoch": 0.1584, "step": 396, "tokens_per_device": 5224 }, { "epoch": 0.1584, "loss_ce": 0.23974774777889252, "loss_lvr": 0.7279816269874573, "loss_mode_switch": 0.0, "loss_total": 0.31254589557647705, "step": 396 }, { "batch_size": 4, "epoch": 0.1584, "step": 396, "tokens_per_device": 4448 }, { "epoch": 0.1584, "loss_ce": 0.13374283909797668, "loss_lvr": 1.1672096252441406, "loss_mode_switch": 0.0, "loss_total": 0.2504638135433197, "step": 396 }, { "batch_size": 4, "epoch": 0.1584, "step": 396, "tokens_per_device": 4240 }, { "epoch": 0.1584, "loss_ce": 0.2522546947002411, "loss_lvr": 1.3820054531097412, "loss_mode_switch": 0.0, "loss_total": 0.3904552459716797, "step": 396 }, { "batch_size": 4, "epoch": 0.1584, "step": 396, "tokens_per_device": 3896 }, { "epoch": 0.1584, "loss_ce": 0.5607417821884155, "loss_lvr": 1.0613408088684082, "loss_mode_switch": 0.0, "loss_total": 0.6668758392333984, "step": 396 }, { "batch_size": 4, "epoch": 0.1584, "step": 396, "tokens_per_device": 4420 }, { "epoch": 0.1584, "loss_ce": 0.06635251641273499, "loss_lvr": 0.9054068922996521, "loss_mode_switch": 0.0, "loss_total": 0.15689320862293243, "step": 396 }, { "batch_size": 1, "epoch": 0.1584, "step": 396, "tokens_per_device": 7094 }, { "epoch": 0.1584, "loss_ce": 0.001136129372753203, "loss_lvr": 0.3789653778076172, "loss_mode_switch": 0.0, "loss_total": 0.03903266787528992, "step": 396 }, { "epoch": 0.1588, "grad_norm": 1.251804232597351, "learning_rate": 9.571233326696159e-06, "loss": 0.272, "step": 397 }, { "batch_size": 4, "epoch": 0.1588, "step": 397, "tokens_per_device": 5720 }, { "epoch": 0.1588, "loss_ce": 0.30053216218948364, "loss_lvr": 0.98442542552948, "loss_mode_switch": 0.0, "loss_total": 0.3989747166633606, "step": 397 }, { "batch_size": 4, "epoch": 0.1588, "step": 397, "tokens_per_device": 2880 }, { "epoch": 0.1588, "loss_ce": 0.1774044781923294, "loss_lvr": 0.8426772952079773, "loss_mode_switch": 0.0, "loss_total": 0.2616721987724304, "step": 397 }, { "batch_size": 4, "epoch": 0.1588, "step": 397, "tokens_per_device": 5496 }, { "epoch": 0.1588, "loss_ce": 0.20098678767681122, "loss_lvr": 0.9274544715881348, "loss_mode_switch": 0.0, "loss_total": 0.293732225894928, "step": 397 }, { "batch_size": 1, "epoch": 0.1588, "step": 397, "tokens_per_device": 5115 }, { "epoch": 0.1588, "loss_ce": 0.04076864942908287, "loss_lvr": 0.34883925318717957, "loss_mode_switch": 0.0, "loss_total": 0.075652576982975, "step": 397 }, { "batch_size": 4, "epoch": 0.1588, "step": 397, "tokens_per_device": 2500 }, { "epoch": 0.1588, "loss_ce": 0.8490585684776306, "loss_lvr": 0.9224013090133667, "loss_mode_switch": 0.0, "loss_total": 0.9412987232208252, "step": 397 }, { "batch_size": 1, "epoch": 0.1588, "step": 397, "tokens_per_device": 5126 }, { "epoch": 0.1588, "loss_ce": 0.07009230554103851, "loss_lvr": 0.5804960131645203, "loss_mode_switch": 0.0, "loss_total": 0.12814190983772278, "step": 397 }, { "batch_size": 1, "epoch": 0.1588, "step": 397, "tokens_per_device": 4826 }, { "epoch": 0.1588, "loss_ce": 0.007073391694575548, "loss_lvr": 0.6039460897445679, "loss_mode_switch": 0.0, "loss_total": 0.06746800243854523, "step": 397 }, { "batch_size": 1, "epoch": 0.1588, "step": 397, "tokens_per_device": 5048 }, { "epoch": 0.1588, "loss_ce": 1.33112370967865, "loss_lvr": 0.37969180941581726, "loss_mode_switch": 0.0, "loss_total": 1.3690929412841797, "step": 397 }, { "epoch": 0.1592, "grad_norm": 1.807140588760376, "learning_rate": 9.56860507593186e-06, "loss": 0.3861, "step": 398 }, { "batch_size": 4, "epoch": 0.1592, "step": 398, "tokens_per_device": 6632 }, { "epoch": 0.1592, "loss_ce": 0.7437963485717773, "loss_lvr": 1.0936942100524902, "loss_mode_switch": 0.0, "loss_total": 0.8531657457351685, "step": 398 }, { "batch_size": 1, "epoch": 0.1592, "step": 398, "tokens_per_device": 5023 }, { "epoch": 0.1592, "loss_ce": 0.5537698864936829, "loss_lvr": 0.552357017993927, "loss_mode_switch": 0.0, "loss_total": 0.6090055704116821, "step": 398 }, { "batch_size": 4, "epoch": 0.1592, "step": 398, "tokens_per_device": 5908 }, { "epoch": 0.1592, "loss_ce": 0.27399447560310364, "loss_lvr": 1.1421490907669067, "loss_mode_switch": 0.0, "loss_total": 0.38820940256118774, "step": 398 }, { "batch_size": 1, "epoch": 0.1592, "step": 398, "tokens_per_device": 5124 }, { "epoch": 0.1592, "loss_ce": 0.031251341104507446, "loss_lvr": 0.3604010045528412, "loss_mode_switch": 0.0, "loss_total": 0.06729143857955933, "step": 398 }, { "batch_size": 4, "epoch": 0.1592, "step": 398, "tokens_per_device": 3788 }, { "epoch": 0.1592, "loss_ce": 0.30174773931503296, "loss_lvr": 1.8253188133239746, "loss_mode_switch": 0.0, "loss_total": 0.4842796325683594, "step": 398 }, { "batch_size": 1, "epoch": 0.1592, "step": 398, "tokens_per_device": 6358 }, { "epoch": 0.1592, "loss_ce": 0.04399547725915909, "loss_lvr": 0.420579195022583, "loss_mode_switch": 0.0, "loss_total": 0.08605340123176575, "step": 398 }, { "batch_size": 4, "epoch": 0.1592, "step": 398, "tokens_per_device": 7296 }, { "epoch": 0.1592, "loss_ce": 0.06605058908462524, "loss_lvr": 0.8115290403366089, "loss_mode_switch": 0.0, "loss_total": 0.1472035050392151, "step": 398 }, { "batch_size": 1, "epoch": 0.1592, "step": 398, "tokens_per_device": 4882 }, { "epoch": 0.1592, "loss_ce": 0.03575756773352623, "loss_lvr": 0.8159806728363037, "loss_mode_switch": 0.0, "loss_total": 0.11735562980175018, "step": 398 }, { "epoch": 0.1596, "grad_norm": 1.3871980905532837, "learning_rate": 9.565969157561066e-06, "loss": 0.3313, "step": 399 }, { "batch_size": 4, "epoch": 0.1596, "step": 399, "tokens_per_device": 4496 }, { "epoch": 0.1596, "loss_ce": 0.38500064611434937, "loss_lvr": 0.9306434392929077, "loss_mode_switch": 0.0, "loss_total": 0.47806498408317566, "step": 399 }, { "batch_size": 4, "epoch": 0.1596, "step": 399, "tokens_per_device": 3748 }, { "epoch": 0.1596, "loss_ce": 0.5334450006484985, "loss_lvr": 0.9434258341789246, "loss_mode_switch": 0.0, "loss_total": 0.6277875900268555, "step": 399 }, { "batch_size": 4, "epoch": 0.1596, "step": 399, "tokens_per_device": 3816 }, { "epoch": 0.1596, "loss_ce": 0.19079145789146423, "loss_lvr": 1.0569275617599487, "loss_mode_switch": 0.0, "loss_total": 0.29648423194885254, "step": 399 }, { "batch_size": 4, "epoch": 0.1596, "step": 399, "tokens_per_device": 1744 }, { "epoch": 0.1596, "loss_ce": 0.4197733700275421, "loss_lvr": 1.2050048112869263, "loss_mode_switch": 0.0, "loss_total": 0.5402738451957703, "step": 399 }, { "batch_size": 4, "epoch": 0.1596, "step": 399, "tokens_per_device": 2672 }, { "epoch": 0.1596, "loss_ce": 0.06939028948545456, "loss_lvr": 1.6791729927062988, "loss_mode_switch": 0.0, "loss_total": 0.2373075783252716, "step": 399 }, { "batch_size": 1, "epoch": 0.1596, "step": 399, "tokens_per_device": 5132 }, { "epoch": 0.1596, "loss_ce": 0.1803709864616394, "loss_lvr": 0.48943114280700684, "loss_mode_switch": 0.0, "loss_total": 0.22931410372257233, "step": 399 }, { "batch_size": 1, "epoch": 0.1596, "step": 399, "tokens_per_device": 4897 }, { "epoch": 0.1596, "loss_ce": 0.003299092408269644, "loss_lvr": 0.7458878755569458, "loss_mode_switch": 0.0, "loss_total": 0.0778878852725029, "step": 399 }, { "batch_size": 1, "epoch": 0.1596, "step": 399, "tokens_per_device": 5910 }, { "epoch": 0.1596, "loss_ce": 0.007282196544110775, "loss_lvr": 0.34133845567703247, "loss_mode_switch": 0.0, "loss_total": 0.041416045278310776, "step": 399 }, { "epoch": 0.16, "grad_norm": 1.481837511062622, "learning_rate": 9.563325576007702e-06, "loss": 0.317, "step": 400 }, { "batch_size": 1, "epoch": 0.16, "step": 400, "tokens_per_device": 5094 }, { "epoch": 0.16, "loss_ce": 0.04906280338764191, "loss_lvr": 0.38868096470832825, "loss_mode_switch": 0.0, "loss_total": 0.08793090283870697, "step": 400 }, { "batch_size": 1, "epoch": 0.16, "step": 400, "tokens_per_device": 4873 }, { "epoch": 0.16, "loss_ce": 0.38049864768981934, "loss_lvr": 0.3739181458950043, "loss_mode_switch": 0.0, "loss_total": 0.4178904592990875, "step": 400 }, { "batch_size": 4, "epoch": 0.16, "step": 400, "tokens_per_device": 2688 }, { "epoch": 0.16, "loss_ce": 0.5352905988693237, "loss_lvr": 1.183380126953125, "loss_mode_switch": 0.0, "loss_total": 0.6536285877227783, "step": 400 }, { "batch_size": 4, "epoch": 0.16, "step": 400, "tokens_per_device": 2632 }, { "epoch": 0.16, "loss_ce": 0.3809882402420044, "loss_lvr": 0.9061920642852783, "loss_mode_switch": 0.0, "loss_total": 0.4716074466705322, "step": 400 }, { "batch_size": 4, "epoch": 0.16, "step": 400, "tokens_per_device": 6564 }, { "epoch": 0.16, "loss_ce": 0.02707543596625328, "loss_lvr": 0.6510552167892456, "loss_mode_switch": 0.0, "loss_total": 0.09218095242977142, "step": 400 }, { "batch_size": 4, "epoch": 0.16, "step": 400, "tokens_per_device": 1712 }, { "epoch": 0.16, "loss_ce": 0.5006504058837891, "loss_lvr": 1.265122413635254, "loss_mode_switch": 0.0, "loss_total": 0.6271626353263855, "step": 400 }, { "batch_size": 4, "epoch": 0.16, "step": 400, "tokens_per_device": 15112 }, { "epoch": 0.16, "loss_ce": 0.05233035236597061, "loss_lvr": 0.8844942450523376, "loss_mode_switch": 0.0, "loss_total": 0.1407797783613205, "step": 400 }, { "batch_size": 4, "epoch": 0.16, "step": 400, "tokens_per_device": 14196 }, { "epoch": 0.16, "loss_ce": 0.054195694625377655, "loss_lvr": 0.7833970189094543, "loss_mode_switch": 0.0, "loss_total": 0.1325353980064392, "step": 400 }, { "epoch": 0.1604, "grad_norm": 1.6010594367980957, "learning_rate": 9.56067433570856e-06, "loss": 0.3255, "step": 401 }, { "batch_size": 4, "epoch": 0.1604, "step": 401, "tokens_per_device": 4348 }, { "epoch": 0.1604, "loss_ce": 0.1359138935804367, "loss_lvr": 0.8244260549545288, "loss_mode_switch": 0.0, "loss_total": 0.21835649013519287, "step": 401 }, { "batch_size": 1, "epoch": 0.1604, "step": 401, "tokens_per_device": 4962 }, { "epoch": 0.1604, "loss_ce": 0.014290097169578075, "loss_lvr": 0.284865140914917, "loss_mode_switch": 0.0, "loss_total": 0.04277661070227623, "step": 401 }, { "batch_size": 4, "epoch": 0.1604, "step": 401, "tokens_per_device": 3792 }, { "epoch": 0.1604, "loss_ce": 0.19762775301933289, "loss_lvr": 0.9977861642837524, "loss_mode_switch": 0.0, "loss_total": 0.2974063754081726, "step": 401 }, { "batch_size": 4, "epoch": 0.1604, "step": 401, "tokens_per_device": 2556 }, { "epoch": 0.1604, "loss_ce": 0.3569316864013672, "loss_lvr": 1.2041653394699097, "loss_mode_switch": 0.0, "loss_total": 0.4773482084274292, "step": 401 }, { "batch_size": 4, "epoch": 0.1604, "step": 401, "tokens_per_device": 2624 }, { "epoch": 0.1604, "loss_ce": 0.6068558692932129, "loss_lvr": 1.0172584056854248, "loss_mode_switch": 0.0, "loss_total": 0.7085816860198975, "step": 401 }, { "batch_size": 4, "epoch": 0.1604, "step": 401, "tokens_per_device": 4324 }, { "epoch": 0.1604, "loss_ce": 0.20561222732067108, "loss_lvr": 0.8714879155158997, "loss_mode_switch": 0.0, "loss_total": 0.29276102781295776, "step": 401 }, { "batch_size": 4, "epoch": 0.1604, "step": 401, "tokens_per_device": 10276 }, { "epoch": 0.1604, "loss_ce": 0.25647640228271484, "loss_lvr": 0.9839109182357788, "loss_mode_switch": 0.0, "loss_total": 0.35486748814582825, "step": 401 }, { "batch_size": 4, "epoch": 0.1604, "step": 401, "tokens_per_device": 13760 }, { "epoch": 0.1604, "loss_ce": 0.0632563903927803, "loss_lvr": 0.6491389274597168, "loss_mode_switch": 0.0, "loss_total": 0.12817028164863586, "step": 401 }, { "epoch": 0.1608, "grad_norm": 1.382583737373352, "learning_rate": 9.558015441113285e-06, "loss": 0.3289, "step": 402 }, { "batch_size": 1, "epoch": 0.1608, "step": 402, "tokens_per_device": 4789 }, { "epoch": 0.1608, "loss_ce": 0.017910869792103767, "loss_lvr": 0.3982013761997223, "loss_mode_switch": 0.0, "loss_total": 0.057731010019779205, "step": 402 }, { "batch_size": 1, "epoch": 0.1608, "step": 402, "tokens_per_device": 7969 }, { "epoch": 0.1608, "loss_ce": 0.012166809290647507, "loss_lvr": 0.35468775033950806, "loss_mode_switch": 0.0, "loss_total": 0.04763558506965637, "step": 402 }, { "batch_size": 1, "epoch": 0.1608, "step": 402, "tokens_per_device": 5473 }, { "epoch": 0.1608, "loss_ce": 0.15536166727542877, "loss_lvr": 0.5466648936271667, "loss_mode_switch": 0.0, "loss_total": 0.21002815663814545, "step": 402 }, { "batch_size": 1, "epoch": 0.1608, "step": 402, "tokens_per_device": 5089 }, { "epoch": 0.1608, "loss_ce": 0.27301010489463806, "loss_lvr": 0.6478354930877686, "loss_mode_switch": 0.0, "loss_total": 0.33779364824295044, "step": 402 }, { "batch_size": 1, "epoch": 0.1608, "step": 402, "tokens_per_device": 4941 }, { "epoch": 0.1608, "loss_ce": 0.008789285086095333, "loss_lvr": 0.4932137727737427, "loss_mode_switch": 0.0, "loss_total": 0.058110665529966354, "step": 402 }, { "batch_size": 4, "epoch": 0.1608, "step": 402, "tokens_per_device": 5096 }, { "epoch": 0.1608, "loss_ce": 0.08645009994506836, "loss_lvr": 0.8264709711074829, "loss_mode_switch": 0.0, "loss_total": 0.1690972000360489, "step": 402 }, { "batch_size": 1, "epoch": 0.1608, "step": 402, "tokens_per_device": 5121 }, { "epoch": 0.1608, "loss_ce": 0.05939503014087677, "loss_lvr": 0.23515011370182037, "loss_mode_switch": 0.0, "loss_total": 0.08291004598140717, "step": 402 }, { "batch_size": 1, "epoch": 0.1608, "step": 402, "tokens_per_device": 5072 }, { "epoch": 0.1608, "loss_ce": 0.05463028326630592, "loss_lvr": 0.2783810496330261, "loss_mode_switch": 0.0, "loss_total": 0.08246839046478271, "step": 402 }, { "epoch": 0.1612, "grad_norm": 1.4838277101516724, "learning_rate": 9.555348896684366e-06, "loss": 0.2977, "step": 403 }, { "batch_size": 1, "epoch": 0.1612, "step": 403, "tokens_per_device": 5041 }, { "epoch": 0.1612, "loss_ce": 0.005122170317918062, "loss_lvr": 0.27012166380882263, "loss_mode_switch": 0.0, "loss_total": 0.03213433921337128, "step": 403 }, { "batch_size": 4, "epoch": 0.1612, "step": 403, "tokens_per_device": 5404 }, { "epoch": 0.1612, "loss_ce": 0.02859921008348465, "loss_lvr": 0.5522517561912537, "loss_mode_switch": 0.0, "loss_total": 0.08382438123226166, "step": 403 }, { "batch_size": 4, "epoch": 0.1612, "step": 403, "tokens_per_device": 6508 }, { "epoch": 0.1612, "loss_ce": 0.7275035381317139, "loss_lvr": 0.757614254951477, "loss_mode_switch": 0.0, "loss_total": 0.8032649755477905, "step": 403 }, { "batch_size": 1, "epoch": 0.1612, "step": 403, "tokens_per_device": 5198 }, { "epoch": 0.1612, "loss_ce": 0.0010007427772507071, "loss_lvr": 0.5150637626647949, "loss_mode_switch": 0.0, "loss_total": 0.05250712111592293, "step": 403 }, { "batch_size": 4, "epoch": 0.1612, "step": 403, "tokens_per_device": 4444 }, { "epoch": 0.1612, "loss_ce": 0.25235432386398315, "loss_lvr": 0.9936897158622742, "loss_mode_switch": 0.0, "loss_total": 0.351723313331604, "step": 403 }, { "batch_size": 1, "epoch": 0.1612, "step": 403, "tokens_per_device": 5169 }, { "epoch": 0.1612, "loss_ce": 0.01066562533378601, "loss_lvr": 0.28269162774086, "loss_mode_switch": 0.0, "loss_total": 0.03893478959798813, "step": 403 }, { "batch_size": 4, "epoch": 0.1612, "step": 403, "tokens_per_device": 9200 }, { "epoch": 0.1612, "loss_ce": 0.5467479825019836, "loss_lvr": 0.8297562003135681, "loss_mode_switch": 0.0, "loss_total": 0.6297236084938049, "step": 403 }, { "batch_size": 4, "epoch": 0.1612, "step": 403, "tokens_per_device": 3768 }, { "epoch": 0.1612, "loss_ce": 0.22725431621074677, "loss_lvr": 0.7224984765052795, "loss_mode_switch": 0.0, "loss_total": 0.2995041608810425, "step": 403 }, { "epoch": 0.1616, "grad_norm": 1.4227772951126099, "learning_rate": 9.552674706897136e-06, "loss": 0.3398, "step": 404 }, { "batch_size": 4, "epoch": 0.1616, "step": 404, "tokens_per_device": 4644 }, { "epoch": 0.1616, "loss_ce": 0.21861733496189117, "loss_lvr": 0.9133596420288086, "loss_mode_switch": 0.0, "loss_total": 0.3099533021450043, "step": 404 }, { "batch_size": 4, "epoch": 0.1616, "step": 404, "tokens_per_device": 4200 }, { "epoch": 0.1616, "loss_ce": 0.09334786981344223, "loss_lvr": 1.0096222162246704, "loss_mode_switch": 0.0, "loss_total": 0.19431009888648987, "step": 404 }, { "batch_size": 4, "epoch": 0.1616, "step": 404, "tokens_per_device": 8184 }, { "epoch": 0.1616, "loss_ce": 0.18730628490447998, "loss_lvr": 0.7804608941078186, "loss_mode_switch": 0.0, "loss_total": 0.26535236835479736, "step": 404 }, { "batch_size": 4, "epoch": 0.1616, "step": 404, "tokens_per_device": 1612 }, { "epoch": 0.1616, "loss_ce": 0.40268999338150024, "loss_lvr": 1.075695276260376, "loss_mode_switch": 0.0, "loss_total": 0.5102595090866089, "step": 404 }, { "batch_size": 4, "epoch": 0.1616, "step": 404, "tokens_per_device": 5688 }, { "epoch": 0.1616, "loss_ce": 0.1921893060207367, "loss_lvr": 0.9481350183486938, "loss_mode_switch": 0.0, "loss_total": 0.2870028018951416, "step": 404 }, { "batch_size": 1, "epoch": 0.1616, "step": 404, "tokens_per_device": 4903 }, { "epoch": 0.1616, "loss_ce": 0.1269921213388443, "loss_lvr": 0.2450217604637146, "loss_mode_switch": 0.0, "loss_total": 0.15149429440498352, "step": 404 }, { "batch_size": 4, "epoch": 0.1616, "step": 404, "tokens_per_device": 3884 }, { "epoch": 0.1616, "loss_ce": 0.43812599778175354, "loss_lvr": 1.0493897199630737, "loss_mode_switch": 0.0, "loss_total": 0.5430649518966675, "step": 404 }, { "batch_size": 4, "epoch": 0.1616, "step": 404, "tokens_per_device": 4084 }, { "epoch": 0.1616, "loss_ce": 0.28583353757858276, "loss_lvr": 0.8502645492553711, "loss_mode_switch": 0.0, "loss_total": 0.3708599805831909, "step": 404 }, { "epoch": 0.162, "grad_norm": 1.3965212106704712, "learning_rate": 9.549992876239753e-06, "loss": 0.3507, "step": 405 }, { "batch_size": 1, "epoch": 0.162, "step": 405, "tokens_per_device": 5363 }, { "epoch": 0.162, "loss_ce": 0.033018115907907486, "loss_lvr": 0.2894965410232544, "loss_mode_switch": 0.0, "loss_total": 0.061967767775058746, "step": 405 }, { "batch_size": 4, "epoch": 0.162, "step": 405, "tokens_per_device": 14728 }, { "epoch": 0.162, "loss_ce": 0.6071714758872986, "loss_lvr": 0.6476444005966187, "loss_mode_switch": 0.0, "loss_total": 0.6719359159469604, "step": 405 }, { "batch_size": 4, "epoch": 0.162, "step": 405, "tokens_per_device": 1504 }, { "epoch": 0.162, "loss_ce": 0.2935587763786316, "loss_lvr": 0.8446741700172424, "loss_mode_switch": 0.0, "loss_total": 0.37802618741989136, "step": 405 }, { "batch_size": 4, "epoch": 0.162, "step": 405, "tokens_per_device": 4444 }, { "epoch": 0.162, "loss_ce": 0.10919136554002762, "loss_lvr": 0.8810364603996277, "loss_mode_switch": 0.0, "loss_total": 0.19729501008987427, "step": 405 }, { "batch_size": 4, "epoch": 0.162, "step": 405, "tokens_per_device": 8132 }, { "epoch": 0.162, "loss_ce": 0.09127775579690933, "loss_lvr": 0.7658355832099915, "loss_mode_switch": 0.0, "loss_total": 0.16786131262779236, "step": 405 }, { "batch_size": 4, "epoch": 0.162, "step": 405, "tokens_per_device": 2588 }, { "epoch": 0.162, "loss_ce": 0.5664815902709961, "loss_lvr": 1.1425316333770752, "loss_mode_switch": 0.0, "loss_total": 0.6807347536087036, "step": 405 }, { "batch_size": 4, "epoch": 0.162, "step": 405, "tokens_per_device": 3756 }, { "epoch": 0.162, "loss_ce": 0.4614749848842621, "loss_lvr": 0.7681071758270264, "loss_mode_switch": 0.0, "loss_total": 0.5382857322692871, "step": 405 }, { "batch_size": 4, "epoch": 0.162, "step": 405, "tokens_per_device": 4264 }, { "epoch": 0.162, "loss_ce": 0.21856553852558136, "loss_lvr": 1.1715937852859497, "loss_mode_switch": 0.0, "loss_total": 0.33572492003440857, "step": 405 }, { "epoch": 0.1624, "grad_norm": 1.6321901082992554, "learning_rate": 9.547303409213202e-06, "loss": 0.3203, "step": 406 }, { "batch_size": 4, "epoch": 0.1624, "step": 406, "tokens_per_device": 4432 }, { "epoch": 0.1624, "loss_ce": 0.16750843822956085, "loss_lvr": 0.8392202258110046, "loss_mode_switch": 0.0, "loss_total": 0.2514304518699646, "step": 406 }, { "batch_size": 4, "epoch": 0.1624, "step": 406, "tokens_per_device": 3876 }, { "epoch": 0.1624, "loss_ce": 0.20778818428516388, "loss_lvr": 1.162014365196228, "loss_mode_switch": 0.0, "loss_total": 0.3239896297454834, "step": 406 }, { "batch_size": 1, "epoch": 0.1624, "step": 406, "tokens_per_device": 4941 }, { "epoch": 0.1624, "loss_ce": 0.32436811923980713, "loss_lvr": 0.5117272138595581, "loss_mode_switch": 0.0, "loss_total": 0.3755408525466919, "step": 406 }, { "batch_size": 4, "epoch": 0.1624, "step": 406, "tokens_per_device": 1364 }, { "epoch": 0.1624, "loss_ce": 0.3035913407802582, "loss_lvr": 1.18644118309021, "loss_mode_switch": 0.0, "loss_total": 0.4222354590892792, "step": 406 }, { "batch_size": 4, "epoch": 0.1624, "step": 406, "tokens_per_device": 1932 }, { "epoch": 0.1624, "loss_ce": 0.07205183804035187, "loss_lvr": 0.9678540825843811, "loss_mode_switch": 0.0, "loss_total": 0.16883724927902222, "step": 406 }, { "batch_size": 1, "epoch": 0.1624, "step": 406, "tokens_per_device": 4913 }, { "epoch": 0.1624, "loss_ce": 0.4602031111717224, "loss_lvr": 0.6853004693984985, "loss_mode_switch": 0.0, "loss_total": 0.5287331342697144, "step": 406 }, { "batch_size": 4, "epoch": 0.1624, "step": 406, "tokens_per_device": 7312 }, { "epoch": 0.1624, "loss_ce": 0.006706594955176115, "loss_lvr": 0.880248486995697, "loss_mode_switch": 0.0, "loss_total": 0.09473144263029099, "step": 406 }, { "batch_size": 4, "epoch": 0.1624, "step": 406, "tokens_per_device": 5820 }, { "epoch": 0.1624, "loss_ce": 0.07368739694356918, "loss_lvr": 0.8121177554130554, "loss_mode_switch": 0.0, "loss_total": 0.15489917993545532, "step": 406 }, { "epoch": 0.1628, "grad_norm": 1.6556116342544556, "learning_rate": 9.544606310331284e-06, "loss": 0.3083, "step": 407 }, { "batch_size": 4, "epoch": 0.1628, "step": 407, "tokens_per_device": 3308 }, { "epoch": 0.1628, "loss_ce": 0.7120068669319153, "loss_lvr": 1.3893003463745117, "loss_mode_switch": 0.0, "loss_total": 0.8509368896484375, "step": 407 }, { "batch_size": 4, "epoch": 0.1628, "step": 407, "tokens_per_device": 2564 }, { "epoch": 0.1628, "loss_ce": 0.13296982645988464, "loss_lvr": 1.006327748298645, "loss_mode_switch": 0.0, "loss_total": 0.2336026132106781, "step": 407 }, { "batch_size": 4, "epoch": 0.1628, "step": 407, "tokens_per_device": 4924 }, { "epoch": 0.1628, "loss_ce": 0.5615507364273071, "loss_lvr": 0.8527231812477112, "loss_mode_switch": 0.0, "loss_total": 0.6468230485916138, "step": 407 }, { "batch_size": 1, "epoch": 0.1628, "step": 407, "tokens_per_device": 4825 }, { "epoch": 0.1628, "loss_ce": 0.18518976867198944, "loss_lvr": 0.2720125615596771, "loss_mode_switch": 0.0, "loss_total": 0.21239101886749268, "step": 407 }, { "batch_size": 1, "epoch": 0.1628, "step": 407, "tokens_per_device": 5802 }, { "epoch": 0.1628, "loss_ce": 0.04610108584165573, "loss_lvr": 0.5584286451339722, "loss_mode_switch": 0.0, "loss_total": 0.1019439548254013, "step": 407 }, { "batch_size": 1, "epoch": 0.1628, "step": 407, "tokens_per_device": 5191 }, { "epoch": 0.1628, "loss_ce": 0.021465495228767395, "loss_lvr": 0.53642737865448, "loss_mode_switch": 0.0, "loss_total": 0.07510823011398315, "step": 407 }, { "batch_size": 1, "epoch": 0.1628, "step": 407, "tokens_per_device": 4739 }, { "epoch": 0.1628, "loss_ce": 0.0008134886156767607, "loss_lvr": 0.8531961441040039, "loss_mode_switch": 0.0, "loss_total": 0.08613310754299164, "step": 407 }, { "batch_size": 1, "epoch": 0.1628, "step": 407, "tokens_per_device": 4226 }, { "epoch": 0.1628, "loss_ce": 0.07799769937992096, "loss_lvr": 1.1053279638290405, "loss_mode_switch": 0.0, "loss_total": 0.18853050470352173, "step": 407 }, { "epoch": 0.1632, "grad_norm": 1.4991888999938965, "learning_rate": 9.541901584120612e-06, "loss": 0.2989, "step": 408 }, { "batch_size": 4, "epoch": 0.1632, "step": 408, "tokens_per_device": 4356 }, { "epoch": 0.1632, "loss_ce": 0.2588634490966797, "loss_lvr": 0.9461568593978882, "loss_mode_switch": 0.0, "loss_total": 0.35347914695739746, "step": 408 }, { "batch_size": 1, "epoch": 0.1632, "step": 408, "tokens_per_device": 4914 }, { "epoch": 0.1632, "loss_ce": 0.002382092410698533, "loss_lvr": 0.3311302065849304, "loss_mode_switch": 0.0, "loss_total": 0.03549511358141899, "step": 408 }, { "batch_size": 4, "epoch": 0.1632, "step": 408, "tokens_per_device": 1988 }, { "epoch": 0.1632, "loss_ce": 0.6919951438903809, "loss_lvr": 0.9360923767089844, "loss_mode_switch": 0.0, "loss_total": 0.7856043577194214, "step": 408 }, { "batch_size": 4, "epoch": 0.1632, "step": 408, "tokens_per_device": 4460 }, { "epoch": 0.1632, "loss_ce": 0.6489347815513611, "loss_lvr": 0.9894739985466003, "loss_mode_switch": 0.0, "loss_total": 0.7478821873664856, "step": 408 }, { "batch_size": 4, "epoch": 0.1632, "step": 408, "tokens_per_device": 5728 }, { "epoch": 0.1632, "loss_ce": 0.5146382451057434, "loss_lvr": 0.8297988176345825, "loss_mode_switch": 0.0, "loss_total": 0.5976181030273438, "step": 408 }, { "batch_size": 1, "epoch": 0.1632, "step": 408, "tokens_per_device": 6263 }, { "epoch": 0.1632, "loss_ce": 0.4292168617248535, "loss_lvr": 0.4057174026966095, "loss_mode_switch": 0.0, "loss_total": 0.4697886109352112, "step": 408 }, { "batch_size": 4, "epoch": 0.1632, "step": 408, "tokens_per_device": 4496 }, { "epoch": 0.1632, "loss_ce": 0.23525869846343994, "loss_lvr": 1.2706151008605957, "loss_mode_switch": 0.0, "loss_total": 0.362320214509964, "step": 408 }, { "batch_size": 1, "epoch": 0.1632, "step": 408, "tokens_per_device": 4910 }, { "epoch": 0.1632, "loss_ce": 0.341345876455307, "loss_lvr": 1.1832057237625122, "loss_mode_switch": 0.0, "loss_total": 0.4596664607524872, "step": 408 }, { "epoch": 0.1636, "grad_norm": 1.6053898334503174, "learning_rate": 9.539189235120591e-06, "loss": 0.3399, "step": 409 }, { "batch_size": 4, "epoch": 0.1636, "step": 409, "tokens_per_device": 2788 }, { "epoch": 0.1636, "loss_ce": 0.7440776824951172, "loss_lvr": 0.8063163161277771, "loss_mode_switch": 0.0, "loss_total": 0.8247092962265015, "step": 409 }, { "batch_size": 4, "epoch": 0.1636, "step": 409, "tokens_per_device": 5640 }, { "epoch": 0.1636, "loss_ce": 0.04387989640235901, "loss_lvr": 1.0025407075881958, "loss_mode_switch": 0.0, "loss_total": 0.14413397014141083, "step": 409 }, { "batch_size": 1, "epoch": 0.1636, "step": 409, "tokens_per_device": 4775 }, { "epoch": 0.1636, "loss_ce": 0.014016353525221348, "loss_lvr": 0.3753628730773926, "loss_mode_switch": 0.0, "loss_total": 0.05155264213681221, "step": 409 }, { "batch_size": 4, "epoch": 0.1636, "step": 409, "tokens_per_device": 4644 }, { "epoch": 0.1636, "loss_ce": 0.0950498953461647, "loss_lvr": 0.7434568405151367, "loss_mode_switch": 0.0, "loss_total": 0.1693955808877945, "step": 409 }, { "batch_size": 1, "epoch": 0.1636, "step": 409, "tokens_per_device": 6437 }, { "epoch": 0.1636, "loss_ce": 0.022859662771224976, "loss_lvr": 0.4911165237426758, "loss_mode_switch": 0.0, "loss_total": 0.07197131216526031, "step": 409 }, { "batch_size": 4, "epoch": 0.1636, "step": 409, "tokens_per_device": 1340 }, { "epoch": 0.1636, "loss_ce": 0.8535870313644409, "loss_lvr": 1.1923483610153198, "loss_mode_switch": 0.0, "loss_total": 0.9728218913078308, "step": 409 }, { "batch_size": 1, "epoch": 0.1636, "step": 409, "tokens_per_device": 4116 }, { "epoch": 0.1636, "loss_ce": 0.00390548980794847, "loss_lvr": 0.7720147967338562, "loss_mode_switch": 0.0, "loss_total": 0.08110696822404861, "step": 409 }, { "batch_size": 4, "epoch": 0.1636, "step": 409, "tokens_per_device": 4216 }, { "epoch": 0.1636, "loss_ce": 0.513170063495636, "loss_lvr": 0.9734575152397156, "loss_mode_switch": 0.0, "loss_total": 0.610515832901001, "step": 409 }, { "epoch": 0.164, "grad_norm": 1.5192221403121948, "learning_rate": 9.536469267883432e-06, "loss": 0.3123, "step": 410 }, { "batch_size": 1, "epoch": 0.164, "step": 410, "tokens_per_device": 5217 }, { "epoch": 0.164, "loss_ce": 0.028201304376125336, "loss_lvr": 0.27454692125320435, "loss_mode_switch": 0.0, "loss_total": 0.05565599724650383, "step": 410 }, { "batch_size": 4, "epoch": 0.164, "step": 410, "tokens_per_device": 4816 }, { "epoch": 0.164, "loss_ce": 0.025231629610061646, "loss_lvr": 1.0770732164382935, "loss_mode_switch": 0.0, "loss_total": 0.132938951253891, "step": 410 }, { "batch_size": 4, "epoch": 0.164, "step": 410, "tokens_per_device": 1580 }, { "epoch": 0.164, "loss_ce": 0.34166139364242554, "loss_lvr": 1.0235388278961182, "loss_mode_switch": 0.0, "loss_total": 0.4440152645111084, "step": 410 }, { "batch_size": 4, "epoch": 0.164, "step": 410, "tokens_per_device": 7868 }, { "epoch": 0.164, "loss_ce": 0.07996527850627899, "loss_lvr": 0.8198755383491516, "loss_mode_switch": 0.0, "loss_total": 0.16195282340049744, "step": 410 }, { "batch_size": 4, "epoch": 0.164, "step": 410, "tokens_per_device": 2868 }, { "epoch": 0.164, "loss_ce": 0.3906269371509552, "loss_lvr": 1.0619239807128906, "loss_mode_switch": 0.0, "loss_total": 0.4968193471431732, "step": 410 }, { "batch_size": 4, "epoch": 0.164, "step": 410, "tokens_per_device": 1576 }, { "epoch": 0.164, "loss_ce": 0.5083950757980347, "loss_lvr": 1.2071577310562134, "loss_mode_switch": 0.0, "loss_total": 0.6291108727455139, "step": 410 }, { "batch_size": 1, "epoch": 0.164, "step": 410, "tokens_per_device": 4916 }, { "epoch": 0.164, "loss_ce": 0.008387754671275616, "loss_lvr": 0.5290001034736633, "loss_mode_switch": 0.0, "loss_total": 0.0612877681851387, "step": 410 }, { "batch_size": 1, "epoch": 0.164, "step": 410, "tokens_per_device": 5110 }, { "epoch": 0.164, "loss_ce": 0.006565776653587818, "loss_lvr": 0.929908037185669, "loss_mode_switch": 0.0, "loss_total": 0.0995565876364708, "step": 410 }, { "epoch": 0.1644, "grad_norm": 1.2848355770111084, "learning_rate": 9.533741686974122e-06, "loss": 0.3288, "step": 411 }, { "batch_size": 1, "epoch": 0.1644, "step": 411, "tokens_per_device": 5170 }, { "epoch": 0.1644, "loss_ce": 0.005236410070210695, "loss_lvr": 0.765428900718689, "loss_mode_switch": 0.0, "loss_total": 0.08177930116653442, "step": 411 }, { "batch_size": 4, "epoch": 0.1644, "step": 411, "tokens_per_device": 6780 }, { "epoch": 0.1644, "loss_ce": 0.23468178510665894, "loss_lvr": 0.4490799903869629, "loss_mode_switch": 0.0, "loss_total": 0.27958977222442627, "step": 411 }, { "batch_size": 4, "epoch": 0.1644, "step": 411, "tokens_per_device": 4604 }, { "epoch": 0.1644, "loss_ce": 0.11960307508707047, "loss_lvr": 0.8607006072998047, "loss_mode_switch": 0.0, "loss_total": 0.20567312836647034, "step": 411 }, { "batch_size": 4, "epoch": 0.1644, "step": 411, "tokens_per_device": 5808 }, { "epoch": 0.1644, "loss_ce": 0.42813795804977417, "loss_lvr": 1.031339168548584, "loss_mode_switch": 0.0, "loss_total": 0.5312718749046326, "step": 411 }, { "batch_size": 4, "epoch": 0.1644, "step": 411, "tokens_per_device": 6188 }, { "epoch": 0.1644, "loss_ce": 0.12102726101875305, "loss_lvr": 0.8008245229721069, "loss_mode_switch": 0.0, "loss_total": 0.20110970735549927, "step": 411 }, { "batch_size": 1, "epoch": 0.1644, "step": 411, "tokens_per_device": 5165 }, { "epoch": 0.1644, "loss_ce": 0.07924258708953857, "loss_lvr": 0.4333325922489166, "loss_mode_switch": 0.0, "loss_total": 0.12257584929466248, "step": 411 }, { "batch_size": 4, "epoch": 0.1644, "step": 411, "tokens_per_device": 4396 }, { "epoch": 0.1644, "loss_ce": 0.35418403148651123, "loss_lvr": 0.8571174740791321, "loss_mode_switch": 0.0, "loss_total": 0.43989577889442444, "step": 411 }, { "batch_size": 4, "epoch": 0.1644, "step": 411, "tokens_per_device": 1260 }, { "epoch": 0.1644, "loss_ce": 0.28964048624038696, "loss_lvr": 1.3893567323684692, "loss_mode_switch": 0.0, "loss_total": 0.42857617139816284, "step": 411 }, { "epoch": 0.1648, "grad_norm": 1.496854543685913, "learning_rate": 9.53100649697043e-06, "loss": 0.3467, "step": 412 }, { "batch_size": 4, "epoch": 0.1648, "step": 412, "tokens_per_device": 3912 }, { "epoch": 0.1648, "loss_ce": 0.222788006067276, "loss_lvr": 1.1693384647369385, "loss_mode_switch": 0.0, "loss_total": 0.3397218585014343, "step": 412 }, { "batch_size": 4, "epoch": 0.1648, "step": 412, "tokens_per_device": 5564 }, { "epoch": 0.1648, "loss_ce": 0.34617453813552856, "loss_lvr": 0.8816230297088623, "loss_mode_switch": 0.0, "loss_total": 0.4343368411064148, "step": 412 }, { "batch_size": 4, "epoch": 0.1648, "step": 412, "tokens_per_device": 1648 }, { "epoch": 0.1648, "loss_ce": 0.27832910418510437, "loss_lvr": 0.9881746768951416, "loss_mode_switch": 0.0, "loss_total": 0.37714657187461853, "step": 412 }, { "batch_size": 4, "epoch": 0.1648, "step": 412, "tokens_per_device": 1256 }, { "epoch": 0.1648, "loss_ce": 0.26991498470306396, "loss_lvr": 1.2126383781433105, "loss_mode_switch": 0.0, "loss_total": 0.39117881655693054, "step": 412 }, { "batch_size": 4, "epoch": 0.1648, "step": 412, "tokens_per_device": 3784 }, { "epoch": 0.1648, "loss_ce": 0.060670070350170135, "loss_lvr": 0.9654360413551331, "loss_mode_switch": 0.0, "loss_total": 0.15721367299556732, "step": 412 }, { "batch_size": 4, "epoch": 0.1648, "step": 412, "tokens_per_device": 4804 }, { "epoch": 0.1648, "loss_ce": 0.33025994896888733, "loss_lvr": 0.9477748870849609, "loss_mode_switch": 0.0, "loss_total": 0.4250374436378479, "step": 412 }, { "batch_size": 4, "epoch": 0.1648, "step": 412, "tokens_per_device": 4900 }, { "epoch": 0.1648, "loss_ce": 0.078327976167202, "loss_lvr": 0.9646488428115845, "loss_mode_switch": 0.0, "loss_total": 0.17479285597801208, "step": 412 }, { "batch_size": 4, "epoch": 0.1648, "step": 412, "tokens_per_device": 1256 }, { "epoch": 0.1648, "loss_ce": 0.6523814797401428, "loss_lvr": 1.0841646194458008, "loss_mode_switch": 0.0, "loss_total": 0.760797917842865, "step": 412 }, { "epoch": 0.1652, "grad_norm": 1.4120042324066162, "learning_rate": 9.528263702462894e-06, "loss": 0.3493, "step": 413 }, { "batch_size": 4, "epoch": 0.1652, "step": 413, "tokens_per_device": 5708 }, { "epoch": 0.1652, "loss_ce": 0.16512161493301392, "loss_lvr": 0.8941763043403625, "loss_mode_switch": 0.0, "loss_total": 0.25453925132751465, "step": 413 }, { "batch_size": 4, "epoch": 0.1652, "step": 413, "tokens_per_device": 1248 }, { "epoch": 0.1652, "loss_ce": 0.32198819518089294, "loss_lvr": 1.2907323837280273, "loss_mode_switch": 0.0, "loss_total": 0.4510614275932312, "step": 413 }, { "batch_size": 4, "epoch": 0.1652, "step": 413, "tokens_per_device": 10868 }, { "epoch": 0.1652, "loss_ce": 0.012396197766065598, "loss_lvr": 1.1702414751052856, "loss_mode_switch": 0.0, "loss_total": 0.12942034006118774, "step": 413 }, { "batch_size": 1, "epoch": 0.1652, "step": 413, "tokens_per_device": 4872 }, { "epoch": 0.1652, "loss_ce": 0.016714636236429214, "loss_lvr": 0.5469992160797119, "loss_mode_switch": 0.0, "loss_total": 0.07141456007957458, "step": 413 }, { "batch_size": 4, "epoch": 0.1652, "step": 413, "tokens_per_device": 4792 }, { "epoch": 0.1652, "loss_ce": 0.08417943120002747, "loss_lvr": 0.7581890821456909, "loss_mode_switch": 0.0, "loss_total": 0.1599983423948288, "step": 413 }, { "batch_size": 1, "epoch": 0.1652, "step": 413, "tokens_per_device": 5124 }, { "epoch": 0.1652, "loss_ce": 0.006885469425469637, "loss_lvr": 0.6129249930381775, "loss_mode_switch": 0.0, "loss_total": 0.0681779682636261, "step": 413 }, { "batch_size": 4, "epoch": 0.1652, "step": 413, "tokens_per_device": 3800 }, { "epoch": 0.1652, "loss_ce": 0.1552610695362091, "loss_lvr": 0.8033284544944763, "loss_mode_switch": 0.0, "loss_total": 0.23559391498565674, "step": 413 }, { "batch_size": 1, "epoch": 0.1652, "step": 413, "tokens_per_device": 5107 }, { "epoch": 0.1652, "loss_ce": 0.002011020202189684, "loss_lvr": 0.7521917819976807, "loss_mode_switch": 0.0, "loss_total": 0.07723020017147064, "step": 413 }, { "epoch": 0.1656, "grad_norm": 1.3058617115020752, "learning_rate": 9.525513308054818e-06, "loss": 0.2729, "step": 414 }, { "batch_size": 4, "epoch": 0.1656, "step": 414, "tokens_per_device": 1440 }, { "epoch": 0.1656, "loss_ce": 0.5106464624404907, "loss_lvr": 1.072776198387146, "loss_mode_switch": 0.0, "loss_total": 0.6179240942001343, "step": 414 }, { "batch_size": 1, "epoch": 0.1656, "step": 414, "tokens_per_device": 5218 }, { "epoch": 0.1656, "loss_ce": 0.6939618587493896, "loss_lvr": 0.5702018737792969, "loss_mode_switch": 0.0, "loss_total": 0.7509820461273193, "step": 414 }, { "batch_size": 4, "epoch": 0.1656, "step": 414, "tokens_per_device": 4400 }, { "epoch": 0.1656, "loss_ce": 0.37342214584350586, "loss_lvr": 1.2900640964508057, "loss_mode_switch": 0.0, "loss_total": 0.5024285316467285, "step": 414 }, { "batch_size": 4, "epoch": 0.1656, "step": 414, "tokens_per_device": 4636 }, { "epoch": 0.1656, "loss_ce": 0.28394654393196106, "loss_lvr": 0.9593040943145752, "loss_mode_switch": 0.0, "loss_total": 0.379876971244812, "step": 414 }, { "batch_size": 1, "epoch": 0.1656, "step": 414, "tokens_per_device": 4554 }, { "epoch": 0.1656, "loss_ce": 0.16833651065826416, "loss_lvr": 0.9306485056877136, "loss_mode_switch": 0.0, "loss_total": 0.26140135526657104, "step": 414 }, { "batch_size": 4, "epoch": 0.1656, "step": 414, "tokens_per_device": 4452 }, { "epoch": 0.1656, "loss_ce": 0.10653512924909592, "loss_lvr": 1.3477237224578857, "loss_mode_switch": 0.0, "loss_total": 0.24130749702453613, "step": 414 }, { "batch_size": 4, "epoch": 0.1656, "step": 414, "tokens_per_device": 5588 }, { "epoch": 0.1656, "loss_ce": 0.37886542081832886, "loss_lvr": 0.7564692497253418, "loss_mode_switch": 0.0, "loss_total": 0.454512357711792, "step": 414 }, { "batch_size": 4, "epoch": 0.1656, "step": 414, "tokens_per_device": 5220 }, { "epoch": 0.1656, "loss_ce": 0.11756224185228348, "loss_lvr": 1.145921230316162, "loss_mode_switch": 0.0, "loss_total": 0.23215436935424805, "step": 414 }, { "epoch": 0.166, "grad_norm": 1.3927701711654663, "learning_rate": 9.52275531836226e-06, "loss": 0.3283, "step": 415 }, { "batch_size": 1, "epoch": 0.166, "step": 415, "tokens_per_device": 6267 }, { "epoch": 0.166, "loss_ce": 0.004692117217928171, "loss_lvr": 0.3952011168003082, "loss_mode_switch": 0.0, "loss_total": 0.044212229549884796, "step": 415 }, { "batch_size": 4, "epoch": 0.166, "step": 415, "tokens_per_device": 3876 }, { "epoch": 0.166, "loss_ce": 0.11898795515298843, "loss_lvr": 0.9499903321266174, "loss_mode_switch": 0.0, "loss_total": 0.21398699283599854, "step": 415 }, { "batch_size": 1, "epoch": 0.166, "step": 415, "tokens_per_device": 5090 }, { "epoch": 0.166, "loss_ce": 0.23527230322360992, "loss_lvr": 0.4210389256477356, "loss_mode_switch": 0.0, "loss_total": 0.2773762047290802, "step": 415 }, { "batch_size": 4, "epoch": 0.166, "step": 415, "tokens_per_device": 4284 }, { "epoch": 0.166, "loss_ce": 0.40152299404144287, "loss_lvr": 0.922249436378479, "loss_mode_switch": 0.0, "loss_total": 0.4937479496002197, "step": 415 }, { "batch_size": 4, "epoch": 0.166, "step": 415, "tokens_per_device": 4692 }, { "epoch": 0.166, "loss_ce": 0.24405191838741302, "loss_lvr": 0.7859978079795837, "loss_mode_switch": 0.0, "loss_total": 0.3226516842842102, "step": 415 }, { "batch_size": 1, "epoch": 0.166, "step": 415, "tokens_per_device": 5048 }, { "epoch": 0.166, "loss_ce": 0.05993836000561714, "loss_lvr": 0.39651429653167725, "loss_mode_switch": 0.0, "loss_total": 0.09958979487419128, "step": 415 }, { "batch_size": 4, "epoch": 0.166, "step": 415, "tokens_per_device": 4272 }, { "epoch": 0.166, "loss_ce": 0.4908454120159149, "loss_lvr": 0.9100563526153564, "loss_mode_switch": 0.0, "loss_total": 0.581851065158844, "step": 415 }, { "batch_size": 1, "epoch": 0.166, "step": 415, "tokens_per_device": 6388 }, { "epoch": 0.166, "loss_ce": 0.06751513481140137, "loss_lvr": 0.48050931096076965, "loss_mode_switch": 0.0, "loss_total": 0.11556606739759445, "step": 415 }, { "epoch": 0.1664, "grad_norm": 1.4678051471710205, "learning_rate": 9.519989738014022e-06, "loss": 0.3481, "step": 416 }, { "batch_size": 1, "epoch": 0.1664, "step": 416, "tokens_per_device": 4876 }, { "epoch": 0.1664, "loss_ce": 0.0013419041642919183, "loss_lvr": 0.3738435208797455, "loss_mode_switch": 0.0, "loss_total": 0.038726259022951126, "step": 416 }, { "batch_size": 4, "epoch": 0.1664, "step": 416, "tokens_per_device": 4472 }, { "epoch": 0.1664, "loss_ce": 0.15221434831619263, "loss_lvr": 1.418540358543396, "loss_mode_switch": 0.0, "loss_total": 0.2940683960914612, "step": 416 }, { "batch_size": 4, "epoch": 0.1664, "step": 416, "tokens_per_device": 1308 }, { "epoch": 0.1664, "loss_ce": 0.35075417160987854, "loss_lvr": 1.1148782968521118, "loss_mode_switch": 0.0, "loss_total": 0.4622420072555542, "step": 416 }, { "batch_size": 4, "epoch": 0.1664, "step": 416, "tokens_per_device": 4228 }, { "epoch": 0.1664, "loss_ce": 0.30863991379737854, "loss_lvr": 1.7434667348861694, "loss_mode_switch": 0.0, "loss_total": 0.48298656940460205, "step": 416 }, { "batch_size": 1, "epoch": 0.1664, "step": 416, "tokens_per_device": 5175 }, { "epoch": 0.1664, "loss_ce": 0.07420165091753006, "loss_lvr": 0.4528484642505646, "loss_mode_switch": 0.0, "loss_total": 0.1194864958524704, "step": 416 }, { "batch_size": 4, "epoch": 0.1664, "step": 416, "tokens_per_device": 2552 }, { "epoch": 0.1664, "loss_ce": 0.3025294840335846, "loss_lvr": 1.1257251501083374, "loss_mode_switch": 0.0, "loss_total": 0.4151020050048828, "step": 416 }, { "batch_size": 1, "epoch": 0.1664, "step": 416, "tokens_per_device": 4894 }, { "epoch": 0.1664, "loss_ce": 0.020608950406312943, "loss_lvr": 0.480714350938797, "loss_mode_switch": 0.0, "loss_total": 0.06868039071559906, "step": 416 }, { "batch_size": 1, "epoch": 0.1664, "step": 416, "tokens_per_device": 4897 }, { "epoch": 0.1664, "loss_ce": 0.29857781529426575, "loss_lvr": 0.7930230498313904, "loss_mode_switch": 0.0, "loss_total": 0.37788012623786926, "step": 416 }, { "epoch": 0.1668, "grad_norm": 1.5387797355651855, "learning_rate": 9.51721657165165e-06, "loss": 0.3673, "step": 417 }, { "batch_size": 1, "epoch": 0.1668, "step": 417, "tokens_per_device": 5167 }, { "epoch": 0.1668, "loss_ce": 0.021969785913825035, "loss_lvr": 0.5688947439193726, "loss_mode_switch": 0.0, "loss_total": 0.07885926216840744, "step": 417 }, { "batch_size": 4, "epoch": 0.1668, "step": 417, "tokens_per_device": 5720 }, { "epoch": 0.1668, "loss_ce": 0.1595705896615982, "loss_lvr": 0.9908000826835632, "loss_mode_switch": 0.0, "loss_total": 0.25865060091018677, "step": 417 }, { "batch_size": 4, "epoch": 0.1668, "step": 417, "tokens_per_device": 2600 }, { "epoch": 0.1668, "loss_ce": 0.2720506191253662, "loss_lvr": 0.9622750282287598, "loss_mode_switch": 0.0, "loss_total": 0.3682781159877777, "step": 417 }, { "batch_size": 4, "epoch": 0.1668, "step": 417, "tokens_per_device": 5604 }, { "epoch": 0.1668, "loss_ce": 0.22242633998394012, "loss_lvr": 0.7611218094825745, "loss_mode_switch": 0.0, "loss_total": 0.2985385060310364, "step": 417 }, { "batch_size": 4, "epoch": 0.1668, "step": 417, "tokens_per_device": 3812 }, { "epoch": 0.1668, "loss_ce": 0.05324607715010643, "loss_lvr": 1.0607198476791382, "loss_mode_switch": 0.0, "loss_total": 0.15931805968284607, "step": 417 }, { "batch_size": 4, "epoch": 0.1668, "step": 417, "tokens_per_device": 3968 }, { "epoch": 0.1668, "loss_ce": 0.45992207527160645, "loss_lvr": 1.0510494709014893, "loss_mode_switch": 0.0, "loss_total": 0.5650269985198975, "step": 417 }, { "batch_size": 1, "epoch": 0.1668, "step": 417, "tokens_per_device": 4888 }, { "epoch": 0.1668, "loss_ce": 0.052249591797590256, "loss_lvr": 0.44791921973228455, "loss_mode_switch": 0.0, "loss_total": 0.09704151749610901, "step": 417 }, { "batch_size": 4, "epoch": 0.1668, "step": 417, "tokens_per_device": 2692 }, { "epoch": 0.1668, "loss_ce": 0.0424746572971344, "loss_lvr": 0.8441380858421326, "loss_mode_switch": 0.0, "loss_total": 0.1268884688615799, "step": 417 }, { "epoch": 0.1672, "grad_norm": 1.7188539505004883, "learning_rate": 9.514435823929418e-06, "loss": 0.3026, "step": 418 }, { "batch_size": 4, "epoch": 0.1672, "step": 418, "tokens_per_device": 5972 }, { "epoch": 0.1672, "loss_ce": 0.018907830119132996, "loss_lvr": 0.8884127736091614, "loss_mode_switch": 0.0, "loss_total": 0.10774911195039749, "step": 418 }, { "batch_size": 4, "epoch": 0.1672, "step": 418, "tokens_per_device": 1196 }, { "epoch": 0.1672, "loss_ce": 0.07967812567949295, "loss_lvr": 1.1590116024017334, "loss_mode_switch": 0.0, "loss_total": 0.19557929039001465, "step": 418 }, { "batch_size": 1, "epoch": 0.1672, "step": 418, "tokens_per_device": 4888 }, { "epoch": 0.1672, "loss_ce": 0.013989491388201714, "loss_lvr": 1.0908503532409668, "loss_mode_switch": 0.0, "loss_total": 0.12307453155517578, "step": 418 }, { "batch_size": 1, "epoch": 0.1672, "step": 418, "tokens_per_device": 5119 }, { "epoch": 0.1672, "loss_ce": 0.12183341383934021, "loss_lvr": 0.9273096919059753, "loss_mode_switch": 0.0, "loss_total": 0.21456438302993774, "step": 418 }, { "batch_size": 4, "epoch": 0.1672, "step": 418, "tokens_per_device": 2220 }, { "epoch": 0.1672, "loss_ce": 0.23696690797805786, "loss_lvr": 0.857377290725708, "loss_mode_switch": 0.0, "loss_total": 0.32270464301109314, "step": 418 }, { "batch_size": 4, "epoch": 0.1672, "step": 418, "tokens_per_device": 9704 }, { "epoch": 0.1672, "loss_ce": 0.4864450693130493, "loss_lvr": 0.8629292249679565, "loss_mode_switch": 0.0, "loss_total": 0.572737991809845, "step": 418 }, { "batch_size": 1, "epoch": 0.1672, "step": 418, "tokens_per_device": 5867 }, { "epoch": 0.1672, "loss_ce": 0.19969207048416138, "loss_lvr": 0.40328338742256165, "loss_mode_switch": 0.0, "loss_total": 0.24002040922641754, "step": 418 }, { "batch_size": 1, "epoch": 0.1672, "step": 418, "tokens_per_device": 4864 }, { "epoch": 0.1672, "loss_ce": 0.005082066636532545, "loss_lvr": 0.5017915368080139, "loss_mode_switch": 0.0, "loss_total": 0.055261220782995224, "step": 418 }, { "epoch": 0.1676, "grad_norm": 2.060317277908325, "learning_rate": 9.511647499514327e-06, "loss": 0.3113, "step": 419 }, { "batch_size": 4, "epoch": 0.1676, "step": 419, "tokens_per_device": 4580 }, { "epoch": 0.1676, "loss_ce": 0.04874557629227638, "loss_lvr": 1.0109760761260986, "loss_mode_switch": 0.0, "loss_total": 0.14984318614006042, "step": 419 }, { "batch_size": 4, "epoch": 0.1676, "step": 419, "tokens_per_device": 5236 }, { "epoch": 0.1676, "loss_ce": 0.07058992981910706, "loss_lvr": 0.8479647636413574, "loss_mode_switch": 0.0, "loss_total": 0.15538641810417175, "step": 419 }, { "batch_size": 4, "epoch": 0.1676, "step": 419, "tokens_per_device": 1444 }, { "epoch": 0.1676, "loss_ce": 0.3484480679035187, "loss_lvr": 1.096327781677246, "loss_mode_switch": 0.0, "loss_total": 0.45808085799217224, "step": 419 }, { "batch_size": 4, "epoch": 0.1676, "step": 419, "tokens_per_device": 3760 }, { "epoch": 0.1676, "loss_ce": 0.4364088773727417, "loss_lvr": 0.6395165324211121, "loss_mode_switch": 0.0, "loss_total": 0.5003605484962463, "step": 419 }, { "batch_size": 4, "epoch": 0.1676, "step": 419, "tokens_per_device": 4332 }, { "epoch": 0.1676, "loss_ce": 0.47272059321403503, "loss_lvr": 1.1249830722808838, "loss_mode_switch": 0.0, "loss_total": 0.5852189064025879, "step": 419 }, { "batch_size": 4, "epoch": 0.1676, "step": 419, "tokens_per_device": 1344 }, { "epoch": 0.1676, "loss_ce": 0.7456650733947754, "loss_lvr": 1.1749862432479858, "loss_mode_switch": 0.0, "loss_total": 0.8631637096405029, "step": 419 }, { "batch_size": 4, "epoch": 0.1676, "step": 419, "tokens_per_device": 4180 }, { "epoch": 0.1676, "loss_ce": 0.3056372404098511, "loss_lvr": 1.1583178043365479, "loss_mode_switch": 0.0, "loss_total": 0.4214690327644348, "step": 419 }, { "batch_size": 4, "epoch": 0.1676, "step": 419, "tokens_per_device": 3968 }, { "epoch": 0.1676, "loss_ce": 0.3842881917953491, "loss_lvr": 0.9280254244804382, "loss_mode_switch": 0.0, "loss_total": 0.4770907461643219, "step": 419 }, { "epoch": 0.168, "grad_norm": 1.486429214477539, "learning_rate": 9.508851603086094e-06, "loss": 0.3932, "step": 420 }, { "batch_size": 4, "epoch": 0.168, "step": 420, "tokens_per_device": 2556 }, { "epoch": 0.168, "loss_ce": 0.46625611186027527, "loss_lvr": 1.183329463005066, "loss_mode_switch": 0.0, "loss_total": 0.5845890641212463, "step": 420 }, { "batch_size": 4, "epoch": 0.168, "step": 420, "tokens_per_device": 4496 }, { "epoch": 0.168, "loss_ce": 0.02691742591559887, "loss_lvr": 0.8168038725852966, "loss_mode_switch": 0.0, "loss_total": 0.10859781503677368, "step": 420 }, { "batch_size": 4, "epoch": 0.168, "step": 420, "tokens_per_device": 3992 }, { "epoch": 0.168, "loss_ce": 0.04664117842912674, "loss_lvr": 0.7002164125442505, "loss_mode_switch": 0.0, "loss_total": 0.11666282266378403, "step": 420 }, { "batch_size": 4, "epoch": 0.168, "step": 420, "tokens_per_device": 4956 }, { "epoch": 0.168, "loss_ce": 0.1332291215658188, "loss_lvr": 0.9577393531799316, "loss_mode_switch": 0.0, "loss_total": 0.22900305688381195, "step": 420 }, { "batch_size": 4, "epoch": 0.168, "step": 420, "tokens_per_device": 6632 }, { "epoch": 0.168, "loss_ce": 0.03796028345823288, "loss_lvr": 0.7452540397644043, "loss_mode_switch": 0.0, "loss_total": 0.11248569190502167, "step": 420 }, { "batch_size": 4, "epoch": 0.168, "step": 420, "tokens_per_device": 3864 }, { "epoch": 0.168, "loss_ce": 0.09946328401565552, "loss_lvr": 1.0425935983657837, "loss_mode_switch": 0.0, "loss_total": 0.20372265577316284, "step": 420 }, { "batch_size": 4, "epoch": 0.168, "step": 420, "tokens_per_device": 1164 }, { "epoch": 0.168, "loss_ce": 0.17013771831989288, "loss_lvr": 1.5314269065856934, "loss_mode_switch": 0.0, "loss_total": 0.323280394077301, "step": 420 }, { "batch_size": 1, "epoch": 0.168, "step": 420, "tokens_per_device": 4876 }, { "epoch": 0.168, "loss_ce": 0.2849932312965393, "loss_lvr": 0.34913790225982666, "loss_mode_switch": 0.0, "loss_total": 0.319907009601593, "step": 420 }, { "epoch": 0.1684, "grad_norm": 1.4291956424713135, "learning_rate": 9.506048139337142e-06, "loss": 0.2949, "step": 421 }, { "batch_size": 4, "epoch": 0.1684, "step": 421, "tokens_per_device": 3820 }, { "epoch": 0.1684, "loss_ce": 0.34071415662765503, "loss_lvr": 1.1641135215759277, "loss_mode_switch": 0.0, "loss_total": 0.4571255147457123, "step": 421 }, { "batch_size": 1, "epoch": 0.1684, "step": 421, "tokens_per_device": 5069 }, { "epoch": 0.1684, "loss_ce": 0.5106040239334106, "loss_lvr": 0.6733838319778442, "loss_mode_switch": 0.0, "loss_total": 0.577942430973053, "step": 421 }, { "batch_size": 4, "epoch": 0.1684, "step": 421, "tokens_per_device": 4488 }, { "epoch": 0.1684, "loss_ce": 0.46684902906417847, "loss_lvr": 1.0318214893341064, "loss_mode_switch": 0.0, "loss_total": 0.5700311660766602, "step": 421 }, { "batch_size": 1, "epoch": 0.1684, "step": 421, "tokens_per_device": 4896 }, { "epoch": 0.1684, "loss_ce": 0.01642085239291191, "loss_lvr": 0.5432345867156982, "loss_mode_switch": 0.0, "loss_total": 0.07074431329965591, "step": 421 }, { "batch_size": 1, "epoch": 0.1684, "step": 421, "tokens_per_device": 4890 }, { "epoch": 0.1684, "loss_ce": 0.23273929953575134, "loss_lvr": 0.9899729490280151, "loss_mode_switch": 0.0, "loss_total": 0.33173659443855286, "step": 421 }, { "batch_size": 1, "epoch": 0.1684, "step": 421, "tokens_per_device": 5118 }, { "epoch": 0.1684, "loss_ce": 0.001676379470154643, "loss_lvr": 0.32715457677841187, "loss_mode_switch": 0.0, "loss_total": 0.03439183905720711, "step": 421 }, { "batch_size": 4, "epoch": 0.1684, "step": 421, "tokens_per_device": 1352 }, { "epoch": 0.1684, "loss_ce": 0.7137120366096497, "loss_lvr": 1.3414584398269653, "loss_mode_switch": 0.0, "loss_total": 0.8478578925132751, "step": 421 }, { "batch_size": 4, "epoch": 0.1684, "step": 421, "tokens_per_device": 2636 }, { "epoch": 0.1684, "loss_ce": 0.3978881537914276, "loss_lvr": 0.9184780120849609, "loss_mode_switch": 0.0, "loss_total": 0.4897359609603882, "step": 421 }, { "epoch": 0.1688, "grad_norm": 1.4236812591552734, "learning_rate": 9.503237112972594e-06, "loss": 0.3677, "step": 422 }, { "batch_size": 4, "epoch": 0.1688, "step": 422, "tokens_per_device": 3724 }, { "epoch": 0.1688, "loss_ce": 0.2076997309923172, "loss_lvr": 1.219246745109558, "loss_mode_switch": 0.0, "loss_total": 0.3296244144439697, "step": 422 }, { "batch_size": 4, "epoch": 0.1688, "step": 422, "tokens_per_device": 5468 }, { "epoch": 0.1688, "loss_ce": 0.09929057955741882, "loss_lvr": 0.924456775188446, "loss_mode_switch": 0.0, "loss_total": 0.19173625111579895, "step": 422 }, { "batch_size": 4, "epoch": 0.1688, "step": 422, "tokens_per_device": 4748 }, { "epoch": 0.1688, "loss_ce": 0.4720366895198822, "loss_lvr": 0.7940016388893127, "loss_mode_switch": 0.0, "loss_total": 0.5514368414878845, "step": 422 }, { "batch_size": 1, "epoch": 0.1688, "step": 422, "tokens_per_device": 4877 }, { "epoch": 0.1688, "loss_ce": 0.009113074280321598, "loss_lvr": 1.762122392654419, "loss_mode_switch": 0.0, "loss_total": 0.18532530963420868, "step": 422 }, { "batch_size": 4, "epoch": 0.1688, "step": 422, "tokens_per_device": 5088 }, { "epoch": 0.1688, "loss_ce": 0.16574303805828094, "loss_lvr": 0.8936299085617065, "loss_mode_switch": 0.0, "loss_total": 0.25510603189468384, "step": 422 }, { "batch_size": 4, "epoch": 0.1688, "step": 422, "tokens_per_device": 5144 }, { "epoch": 0.1688, "loss_ce": 0.22091706097126007, "loss_lvr": 1.048715591430664, "loss_mode_switch": 0.0, "loss_total": 0.32578861713409424, "step": 422 }, { "batch_size": 4, "epoch": 0.1688, "step": 422, "tokens_per_device": 4500 }, { "epoch": 0.1688, "loss_ce": 0.6199012994766235, "loss_lvr": 1.1304835081100464, "loss_mode_switch": 0.0, "loss_total": 0.7329496741294861, "step": 422 }, { "batch_size": 1, "epoch": 0.1688, "step": 422, "tokens_per_device": 4829 }, { "epoch": 0.1688, "loss_ce": 0.23411212861537933, "loss_lvr": 0.43405959010124207, "loss_mode_switch": 0.0, "loss_total": 0.277518093585968, "step": 422 }, { "epoch": 0.1692, "grad_norm": 1.6216269731521606, "learning_rate": 9.50041852871027e-06, "loss": 0.3373, "step": 423 }, { "batch_size": 4, "epoch": 0.1692, "step": 423, "tokens_per_device": 5456 }, { "epoch": 0.1692, "loss_ce": 0.44235190749168396, "loss_lvr": 0.7034224271774292, "loss_mode_switch": 0.0, "loss_total": 0.5126941204071045, "step": 423 }, { "batch_size": 4, "epoch": 0.1692, "step": 423, "tokens_per_device": 4964 }, { "epoch": 0.1692, "loss_ce": 0.4650503993034363, "loss_lvr": 0.6952860355377197, "loss_mode_switch": 0.0, "loss_total": 0.5345789790153503, "step": 423 }, { "batch_size": 4, "epoch": 0.1692, "step": 423, "tokens_per_device": 2732 }, { "epoch": 0.1692, "loss_ce": 0.6854075193405151, "loss_lvr": 0.958862841129303, "loss_mode_switch": 0.0, "loss_total": 0.7812938094139099, "step": 423 }, { "batch_size": 4, "epoch": 0.1692, "step": 423, "tokens_per_device": 4268 }, { "epoch": 0.1692, "loss_ce": 0.04461413994431496, "loss_lvr": 1.0579710006713867, "loss_mode_switch": 0.0, "loss_total": 0.15041124820709229, "step": 423 }, { "batch_size": 1, "epoch": 0.1692, "step": 423, "tokens_per_device": 4908 }, { "epoch": 0.1692, "loss_ce": 0.03550746291875839, "loss_lvr": 0.9290294647216797, "loss_mode_switch": 0.0, "loss_total": 0.12841041386127472, "step": 423 }, { "batch_size": 4, "epoch": 0.1692, "step": 423, "tokens_per_device": 3892 }, { "epoch": 0.1692, "loss_ce": 0.2810461223125458, "loss_lvr": 0.9161785244941711, "loss_mode_switch": 0.0, "loss_total": 0.3726639747619629, "step": 423 }, { "batch_size": 4, "epoch": 0.1692, "step": 423, "tokens_per_device": 5652 }, { "epoch": 0.1692, "loss_ce": 0.1637297421693802, "loss_lvr": 0.9804996848106384, "loss_mode_switch": 0.0, "loss_total": 0.2617797255516052, "step": 423 }, { "batch_size": 1, "epoch": 0.1692, "step": 423, "tokens_per_device": 5026 }, { "epoch": 0.1692, "loss_ce": 0.016073821112513542, "loss_lvr": 0.582827627658844, "loss_mode_switch": 0.0, "loss_total": 0.07435658574104309, "step": 423 }, { "epoch": 0.1696, "grad_norm": 1.3826186656951904, "learning_rate": 9.497592391280672e-06, "loss": 0.3316, "step": 424 }, { "batch_size": 1, "epoch": 0.1696, "step": 424, "tokens_per_device": 5165 }, { "epoch": 0.1696, "loss_ce": 0.07918305695056915, "loss_lvr": 0.22496607899665833, "loss_mode_switch": 0.0, "loss_total": 0.10167966783046722, "step": 424 }, { "batch_size": 4, "epoch": 0.1696, "step": 424, "tokens_per_device": 7680 }, { "epoch": 0.1696, "loss_ce": 0.6858342885971069, "loss_lvr": 0.880622148513794, "loss_mode_switch": 0.0, "loss_total": 0.7738965153694153, "step": 424 }, { "batch_size": 4, "epoch": 0.1696, "step": 424, "tokens_per_device": 2668 }, { "epoch": 0.1696, "loss_ce": 0.2870238423347473, "loss_lvr": 0.9484624862670898, "loss_mode_switch": 0.0, "loss_total": 0.3818700909614563, "step": 424 }, { "batch_size": 1, "epoch": 0.1696, "step": 424, "tokens_per_device": 5004 }, { "epoch": 0.1696, "loss_ce": 0.13129644095897675, "loss_lvr": 0.6158351302146912, "loss_mode_switch": 0.0, "loss_total": 0.19287995994091034, "step": 424 }, { "batch_size": 4, "epoch": 0.1696, "step": 424, "tokens_per_device": 5380 }, { "epoch": 0.1696, "loss_ce": 0.1274586170911789, "loss_lvr": 1.0575259923934937, "loss_mode_switch": 0.0, "loss_total": 0.2332112193107605, "step": 424 }, { "batch_size": 1, "epoch": 0.1696, "step": 424, "tokens_per_device": 4914 }, { "epoch": 0.1696, "loss_ce": 0.07977121323347092, "loss_lvr": 0.3873727023601532, "loss_mode_switch": 0.0, "loss_total": 0.1185084879398346, "step": 424 }, { "batch_size": 1, "epoch": 0.1696, "step": 424, "tokens_per_device": 4896 }, { "epoch": 0.1696, "loss_ce": 0.006053711287677288, "loss_lvr": 0.4920836091041565, "loss_mode_switch": 0.0, "loss_total": 0.05526207387447357, "step": 424 }, { "batch_size": 4, "epoch": 0.1696, "step": 424, "tokens_per_device": 5228 }, { "epoch": 0.1696, "loss_ce": 0.18727289140224457, "loss_lvr": 1.1271408796310425, "loss_mode_switch": 0.0, "loss_total": 0.29998698830604553, "step": 424 }, { "epoch": 0.17, "grad_norm": 1.4127470254898071, "learning_rate": 9.494758705426978e-06, "loss": 0.3145, "step": 425 }, { "batch_size": 4, "epoch": 0.17, "step": 425, "tokens_per_device": 6124 }, { "epoch": 0.17, "loss_ce": 0.05724842846393585, "loss_lvr": 0.7693079710006714, "loss_mode_switch": 0.0, "loss_total": 0.1341792345046997, "step": 425 }, { "batch_size": 1, "epoch": 0.17, "step": 425, "tokens_per_device": 4967 }, { "epoch": 0.17, "loss_ce": 0.3319650888442993, "loss_lvr": 1.0720661878585815, "loss_mode_switch": 0.0, "loss_total": 0.439171701669693, "step": 425 }, { "batch_size": 1, "epoch": 0.17, "step": 425, "tokens_per_device": 4892 }, { "epoch": 0.17, "loss_ce": 0.020608769729733467, "loss_lvr": 0.5616187453269958, "loss_mode_switch": 0.0, "loss_total": 0.07677064836025238, "step": 425 }, { "batch_size": 4, "epoch": 0.17, "step": 425, "tokens_per_device": 3924 }, { "epoch": 0.17, "loss_ce": 0.37059393525123596, "loss_lvr": 0.9632602334022522, "loss_mode_switch": 0.0, "loss_total": 0.4669199585914612, "step": 425 }, { "batch_size": 4, "epoch": 0.17, "step": 425, "tokens_per_device": 4228 }, { "epoch": 0.17, "loss_ce": 0.2744430899620056, "loss_lvr": 0.6214975118637085, "loss_mode_switch": 0.0, "loss_total": 0.3365928530693054, "step": 425 }, { "batch_size": 1, "epoch": 0.17, "step": 425, "tokens_per_device": 4883 }, { "epoch": 0.17, "loss_ce": 0.3696615695953369, "loss_lvr": 0.8510739207267761, "loss_mode_switch": 0.0, "loss_total": 0.45476895570755005, "step": 425 }, { "batch_size": 4, "epoch": 0.17, "step": 425, "tokens_per_device": 4392 }, { "epoch": 0.17, "loss_ce": 0.0293534304946661, "loss_lvr": 1.4795331954956055, "loss_mode_switch": 0.0, "loss_total": 0.17730674147605896, "step": 425 }, { "batch_size": 4, "epoch": 0.17, "step": 425, "tokens_per_device": 9100 }, { "epoch": 0.17, "loss_ce": 0.07344980537891388, "loss_lvr": 0.7025541067123413, "loss_mode_switch": 0.0, "loss_total": 0.14370521903038025, "step": 425 }, { "epoch": 0.1704, "grad_norm": 1.6708693504333496, "learning_rate": 9.491917475905034e-06, "loss": 0.3542, "step": 426 }, { "batch_size": 1, "epoch": 0.1704, "step": 426, "tokens_per_device": 5433 }, { "epoch": 0.1704, "loss_ce": 0.08655991405248642, "loss_lvr": 0.3915661573410034, "loss_mode_switch": 0.0, "loss_total": 0.12571653723716736, "step": 426 }, { "batch_size": 4, "epoch": 0.1704, "step": 426, "tokens_per_device": 5104 }, { "epoch": 0.1704, "loss_ce": 0.3305817246437073, "loss_lvr": 1.051413893699646, "loss_mode_switch": 0.0, "loss_total": 0.43572312593460083, "step": 426 }, { "batch_size": 1, "epoch": 0.1704, "step": 426, "tokens_per_device": 5143 }, { "epoch": 0.1704, "loss_ce": 1.2125941514968872, "loss_lvr": 0.7503674626350403, "loss_mode_switch": 0.0, "loss_total": 1.2876309156417847, "step": 426 }, { "batch_size": 4, "epoch": 0.1704, "step": 426, "tokens_per_device": 4724 }, { "epoch": 0.1704, "loss_ce": 0.3227728307247162, "loss_lvr": 0.8468278646469116, "loss_mode_switch": 0.0, "loss_total": 0.4074556231498718, "step": 426 }, { "batch_size": 1, "epoch": 0.1704, "step": 426, "tokens_per_device": 5170 }, { "epoch": 0.1704, "loss_ce": 0.014292735606431961, "loss_lvr": 0.4020848274230957, "loss_mode_switch": 0.0, "loss_total": 0.05450122058391571, "step": 426 }, { "batch_size": 1, "epoch": 0.1704, "step": 426, "tokens_per_device": 4886 }, { "epoch": 0.1704, "loss_ce": 0.010661560110747814, "loss_lvr": 0.672486424446106, "loss_mode_switch": 0.0, "loss_total": 0.07791019976139069, "step": 426 }, { "batch_size": 1, "epoch": 0.1704, "step": 426, "tokens_per_device": 5135 }, { "epoch": 0.1704, "loss_ce": 0.022584963589906693, "loss_lvr": 0.6587576866149902, "loss_mode_switch": 0.0, "loss_total": 0.08846072852611542, "step": 426 }, { "batch_size": 4, "epoch": 0.1704, "step": 426, "tokens_per_device": 5548 }, { "epoch": 0.1704, "loss_ce": 0.17510594427585602, "loss_lvr": 0.7798974514007568, "loss_mode_switch": 0.0, "loss_total": 0.25309568643569946, "step": 426 }, { "epoch": 0.1708, "grad_norm": 1.8255419731140137, "learning_rate": 9.48906870748335e-06, "loss": 0.3346, "step": 427 }, { "batch_size": 4, "epoch": 0.1708, "step": 427, "tokens_per_device": 15452 }, { "epoch": 0.1708, "loss_ce": 0.44813594222068787, "loss_lvr": 1.1102439165115356, "loss_mode_switch": 0.0, "loss_total": 0.5591603517532349, "step": 427 }, { "batch_size": 4, "epoch": 0.1708, "step": 427, "tokens_per_device": 4308 }, { "epoch": 0.1708, "loss_ce": 0.17210127413272858, "loss_lvr": 0.8282691240310669, "loss_mode_switch": 0.0, "loss_total": 0.2549281716346741, "step": 427 }, { "batch_size": 4, "epoch": 0.1708, "step": 427, "tokens_per_device": 5188 }, { "epoch": 0.1708, "loss_ce": 0.40887758135795593, "loss_lvr": 0.8404557704925537, "loss_mode_switch": 0.0, "loss_total": 0.49292317032814026, "step": 427 }, { "batch_size": 4, "epoch": 0.1708, "step": 427, "tokens_per_device": 4204 }, { "epoch": 0.1708, "loss_ce": 0.6383386254310608, "loss_lvr": 1.1935534477233887, "loss_mode_switch": 0.0, "loss_total": 0.7576939463615417, "step": 427 }, { "batch_size": 4, "epoch": 0.1708, "step": 427, "tokens_per_device": 4004 }, { "epoch": 0.1708, "loss_ce": 0.4251480996608734, "loss_lvr": 1.0894688367843628, "loss_mode_switch": 0.0, "loss_total": 0.5340949892997742, "step": 427 }, { "batch_size": 4, "epoch": 0.1708, "step": 427, "tokens_per_device": 4588 }, { "epoch": 0.1708, "loss_ce": 0.15317529439926147, "loss_lvr": 0.6818464398384094, "loss_mode_switch": 0.0, "loss_total": 0.22135993838310242, "step": 427 }, { "batch_size": 4, "epoch": 0.1708, "step": 427, "tokens_per_device": 3524 }, { "epoch": 0.1708, "loss_ce": 0.2995625436306, "loss_lvr": 0.9156388640403748, "loss_mode_switch": 0.0, "loss_total": 0.391126424074173, "step": 427 }, { "batch_size": 1, "epoch": 0.1708, "step": 427, "tokens_per_device": 5180 }, { "epoch": 0.1708, "loss_ce": 0.029325144365429878, "loss_lvr": 0.5764966011047363, "loss_mode_switch": 0.0, "loss_total": 0.08697480708360672, "step": 427 }, { "epoch": 0.1712, "grad_norm": 1.3596290349960327, "learning_rate": 9.486212404943084e-06, "loss": 0.3135, "step": 428 }, { "batch_size": 1, "epoch": 0.1712, "step": 428, "tokens_per_device": 5178 }, { "epoch": 0.1712, "loss_ce": 0.18751664459705353, "loss_lvr": 0.31981900334358215, "loss_mode_switch": 0.0, "loss_total": 0.21949854493141174, "step": 428 }, { "batch_size": 4, "epoch": 0.1712, "step": 428, "tokens_per_device": 4180 }, { "epoch": 0.1712, "loss_ce": 0.13152478635311127, "loss_lvr": 0.8926963806152344, "loss_mode_switch": 0.0, "loss_total": 0.2207944244146347, "step": 428 }, { "batch_size": 4, "epoch": 0.1712, "step": 428, "tokens_per_device": 3848 }, { "epoch": 0.1712, "loss_ce": 0.4431500732898712, "loss_lvr": 0.7962790131568909, "loss_mode_switch": 0.0, "loss_total": 0.5227779746055603, "step": 428 }, { "batch_size": 4, "epoch": 0.1712, "step": 428, "tokens_per_device": 4204 }, { "epoch": 0.1712, "loss_ce": 0.7976586222648621, "loss_lvr": 0.6211782693862915, "loss_mode_switch": 0.0, "loss_total": 0.8597764372825623, "step": 428 }, { "batch_size": 4, "epoch": 0.1712, "step": 428, "tokens_per_device": 2620 }, { "epoch": 0.1712, "loss_ce": 0.643700897693634, "loss_lvr": 1.0263588428497314, "loss_mode_switch": 0.0, "loss_total": 0.7463367581367493, "step": 428 }, { "batch_size": 4, "epoch": 0.1712, "step": 428, "tokens_per_device": 6600 }, { "epoch": 0.1712, "loss_ce": 0.36704587936401367, "loss_lvr": 0.939757764339447, "loss_mode_switch": 0.0, "loss_total": 0.46102166175842285, "step": 428 }, { "batch_size": 4, "epoch": 0.1712, "step": 428, "tokens_per_device": 1776 }, { "epoch": 0.1712, "loss_ce": 0.43665218353271484, "loss_lvr": 0.9528527855873108, "loss_mode_switch": 0.0, "loss_total": 0.5319374799728394, "step": 428 }, { "batch_size": 4, "epoch": 0.1712, "step": 428, "tokens_per_device": 1544 }, { "epoch": 0.1712, "loss_ce": 0.4750364124774933, "loss_lvr": 0.9727752208709717, "loss_mode_switch": 0.0, "loss_total": 0.5723139047622681, "step": 428 }, { "epoch": 0.1716, "grad_norm": 1.5192619562149048, "learning_rate": 9.483348573078046e-06, "loss": 0.3763, "step": 429 }, { "batch_size": 4, "epoch": 0.1716, "step": 429, "tokens_per_device": 3800 }, { "epoch": 0.1716, "loss_ce": 0.35730990767478943, "loss_lvr": 0.9722133278846741, "loss_mode_switch": 0.0, "loss_total": 0.4545312523841858, "step": 429 }, { "batch_size": 1, "epoch": 0.1716, "step": 429, "tokens_per_device": 5161 }, { "epoch": 0.1716, "loss_ce": 0.30450114607810974, "loss_lvr": 0.5516741275787354, "loss_mode_switch": 0.0, "loss_total": 0.3596685528755188, "step": 429 }, { "batch_size": 1, "epoch": 0.1716, "step": 429, "tokens_per_device": 4820 }, { "epoch": 0.1716, "loss_ce": 0.06026618182659149, "loss_lvr": 0.480490118265152, "loss_mode_switch": 0.0, "loss_total": 0.10831519961357117, "step": 429 }, { "batch_size": 4, "epoch": 0.1716, "step": 429, "tokens_per_device": 5884 }, { "epoch": 0.1716, "loss_ce": 0.6538441181182861, "loss_lvr": 0.8580135703086853, "loss_mode_switch": 0.0, "loss_total": 0.7396454811096191, "step": 429 }, { "batch_size": 1, "epoch": 0.1716, "step": 429, "tokens_per_device": 4893 }, { "epoch": 0.1716, "loss_ce": 0.0025099278427660465, "loss_lvr": 0.6780015826225281, "loss_mode_switch": 0.0, "loss_total": 0.0703100860118866, "step": 429 }, { "batch_size": 1, "epoch": 0.1716, "step": 429, "tokens_per_device": 4512 }, { "epoch": 0.1716, "loss_ce": 0.005316932685673237, "loss_lvr": 0.8881462812423706, "loss_mode_switch": 0.0, "loss_total": 0.09413156658411026, "step": 429 }, { "batch_size": 4, "epoch": 0.1716, "step": 429, "tokens_per_device": 4652 }, { "epoch": 0.1716, "loss_ce": 0.43156698346138, "loss_lvr": 1.2155568599700928, "loss_mode_switch": 0.0, "loss_total": 0.5531226396560669, "step": 429 }, { "batch_size": 4, "epoch": 0.1716, "step": 429, "tokens_per_device": 5056 }, { "epoch": 0.1716, "loss_ce": 0.17878073453903198, "loss_lvr": 0.8712886571884155, "loss_mode_switch": 0.0, "loss_total": 0.2659096121788025, "step": 429 }, { "epoch": 0.172, "grad_norm": 1.5047050714492798, "learning_rate": 9.480477216694674e-06, "loss": 0.3464, "step": 430 }, { "batch_size": 4, "epoch": 0.172, "step": 430, "tokens_per_device": 3664 }, { "epoch": 0.172, "loss_ce": 0.9344995021820068, "loss_lvr": 0.9562789797782898, "loss_mode_switch": 0.0, "loss_total": 1.0301274061203003, "step": 430 }, { "batch_size": 4, "epoch": 0.172, "step": 430, "tokens_per_device": 7072 }, { "epoch": 0.172, "loss_ce": 0.0848071500658989, "loss_lvr": 1.0054712295532227, "loss_mode_switch": 0.0, "loss_total": 0.18535427749156952, "step": 430 }, { "batch_size": 4, "epoch": 0.172, "step": 430, "tokens_per_device": 3908 }, { "epoch": 0.172, "loss_ce": 0.04163036867976189, "loss_lvr": 0.9025647640228271, "loss_mode_switch": 0.0, "loss_total": 0.13188683986663818, "step": 430 }, { "batch_size": 1, "epoch": 0.172, "step": 430, "tokens_per_device": 5207 }, { "epoch": 0.172, "loss_ce": 0.08995743095874786, "loss_lvr": 0.49213749170303345, "loss_mode_switch": 0.0, "loss_total": 0.13917118310928345, "step": 430 }, { "batch_size": 4, "epoch": 0.172, "step": 430, "tokens_per_device": 3148 }, { "epoch": 0.172, "loss_ce": 0.08137109130620956, "loss_lvr": 0.6580623984336853, "loss_mode_switch": 0.0, "loss_total": 0.1471773386001587, "step": 430 }, { "batch_size": 4, "epoch": 0.172, "step": 430, "tokens_per_device": 4288 }, { "epoch": 0.172, "loss_ce": 0.6337418556213379, "loss_lvr": 1.1990835666656494, "loss_mode_switch": 0.0, "loss_total": 0.7536501884460449, "step": 430 }, { "batch_size": 4, "epoch": 0.172, "step": 430, "tokens_per_device": 3776 }, { "epoch": 0.172, "loss_ce": 0.12269043922424316, "loss_lvr": 2.2452378273010254, "loss_mode_switch": 0.0, "loss_total": 0.3472142219543457, "step": 430 }, { "batch_size": 1, "epoch": 0.172, "step": 430, "tokens_per_device": 4900 }, { "epoch": 0.172, "loss_ce": 0.1786242425441742, "loss_lvr": 0.9661396145820618, "loss_mode_switch": 0.0, "loss_total": 0.2752382159233093, "step": 430 }, { "epoch": 0.1724, "grad_norm": 1.4737380743026733, "learning_rate": 9.47759834061204e-06, "loss": 0.3572, "step": 431 }, { "batch_size": 4, "epoch": 0.1724, "step": 431, "tokens_per_device": 9792 }, { "epoch": 0.1724, "loss_ce": 0.2508358359336853, "loss_lvr": 0.9937220215797424, "loss_mode_switch": 0.0, "loss_total": 0.350208044052124, "step": 431 }, { "batch_size": 4, "epoch": 0.1724, "step": 431, "tokens_per_device": 3764 }, { "epoch": 0.1724, "loss_ce": 0.3594847023487091, "loss_lvr": 1.1459481716156006, "loss_mode_switch": 0.0, "loss_total": 0.47407951951026917, "step": 431 }, { "batch_size": 4, "epoch": 0.1724, "step": 431, "tokens_per_device": 2568 }, { "epoch": 0.1724, "loss_ce": 0.026531629264354706, "loss_lvr": 1.295140027999878, "loss_mode_switch": 0.0, "loss_total": 0.15604564547538757, "step": 431 }, { "batch_size": 1, "epoch": 0.1724, "step": 431, "tokens_per_device": 4364 }, { "epoch": 0.1724, "loss_ce": 0.09475807100534439, "loss_lvr": 0.6940808892250061, "loss_mode_switch": 0.0, "loss_total": 0.1641661524772644, "step": 431 }, { "batch_size": 4, "epoch": 0.1724, "step": 431, "tokens_per_device": 4144 }, { "epoch": 0.1724, "loss_ce": 0.41985419392585754, "loss_lvr": 0.8676877617835999, "loss_mode_switch": 0.0, "loss_total": 0.5066229701042175, "step": 431 }, { "batch_size": 4, "epoch": 0.1724, "step": 431, "tokens_per_device": 2740 }, { "epoch": 0.1724, "loss_ce": 0.5987730622291565, "loss_lvr": 1.1197435855865479, "loss_mode_switch": 0.0, "loss_total": 0.7107474207878113, "step": 431 }, { "batch_size": 1, "epoch": 0.1724, "step": 431, "tokens_per_device": 5099 }, { "epoch": 0.1724, "loss_ce": 0.0032180093694478273, "loss_lvr": 0.6054481863975525, "loss_mode_switch": 0.0, "loss_total": 0.06376282870769501, "step": 431 }, { "batch_size": 1, "epoch": 0.1724, "step": 431, "tokens_per_device": 5170 }, { "epoch": 0.1724, "loss_ce": 0.051449570804834366, "loss_lvr": 0.4245683550834656, "loss_mode_switch": 0.0, "loss_total": 0.09390640258789062, "step": 431 }, { "epoch": 0.1728, "grad_norm": 1.480266809463501, "learning_rate": 9.474711949661835e-06, "loss": 0.3726, "step": 432 }, { "batch_size": 4, "epoch": 0.1728, "step": 432, "tokens_per_device": 4224 }, { "epoch": 0.1728, "loss_ce": 0.0634356141090393, "loss_lvr": 0.7573962807655334, "loss_mode_switch": 0.0, "loss_total": 0.13917523622512817, "step": 432 }, { "batch_size": 1, "epoch": 0.1728, "step": 432, "tokens_per_device": 5023 }, { "epoch": 0.1728, "loss_ce": 0.006539931986480951, "loss_lvr": 0.6216005682945251, "loss_mode_switch": 0.0, "loss_total": 0.06869998574256897, "step": 432 }, { "batch_size": 4, "epoch": 0.1728, "step": 432, "tokens_per_device": 1268 }, { "epoch": 0.1728, "loss_ce": 0.7560740113258362, "loss_lvr": 1.2947980165481567, "loss_mode_switch": 0.0, "loss_total": 0.8855538368225098, "step": 432 }, { "batch_size": 4, "epoch": 0.1728, "step": 432, "tokens_per_device": 5720 }, { "epoch": 0.1728, "loss_ce": 0.05644256994128227, "loss_lvr": 0.7240834832191467, "loss_mode_switch": 0.0, "loss_total": 0.12885092198848724, "step": 432 }, { "batch_size": 4, "epoch": 0.1728, "step": 432, "tokens_per_device": 4236 }, { "epoch": 0.1728, "loss_ce": 0.1309426873922348, "loss_lvr": 0.9501339197158813, "loss_mode_switch": 0.0, "loss_total": 0.22595608234405518, "step": 432 }, { "batch_size": 4, "epoch": 0.1728, "step": 432, "tokens_per_device": 3844 }, { "epoch": 0.1728, "loss_ce": 0.3184548020362854, "loss_lvr": 1.3319555521011353, "loss_mode_switch": 0.0, "loss_total": 0.45165038108825684, "step": 432 }, { "batch_size": 4, "epoch": 0.1728, "step": 432, "tokens_per_device": 5688 }, { "epoch": 0.1728, "loss_ce": 0.1896824985742569, "loss_lvr": 0.8982160687446594, "loss_mode_switch": 0.0, "loss_total": 0.27950412034988403, "step": 432 }, { "batch_size": 4, "epoch": 0.1728, "step": 432, "tokens_per_device": 3880 }, { "epoch": 0.1728, "loss_ce": 0.09282838553190231, "loss_lvr": 0.9032818078994751, "loss_mode_switch": 0.0, "loss_total": 0.1831565648317337, "step": 432 }, { "epoch": 0.1732, "grad_norm": 1.278266191482544, "learning_rate": 9.471818048688364e-06, "loss": 0.2774, "step": 433 }, { "batch_size": 4, "epoch": 0.1732, "step": 433, "tokens_per_device": 3612 }, { "epoch": 0.1732, "loss_ce": 0.05699530988931656, "loss_lvr": 1.0963776111602783, "loss_mode_switch": 0.0, "loss_total": 0.16663306951522827, "step": 433 }, { "batch_size": 4, "epoch": 0.1732, "step": 433, "tokens_per_device": 3780 }, { "epoch": 0.1732, "loss_ce": 0.4796260893344879, "loss_lvr": 0.9529805183410645, "loss_mode_switch": 0.0, "loss_total": 0.574924111366272, "step": 433 }, { "batch_size": 4, "epoch": 0.1732, "step": 433, "tokens_per_device": 4192 }, { "epoch": 0.1732, "loss_ce": 0.32251179218292236, "loss_lvr": 1.2698419094085693, "loss_mode_switch": 0.0, "loss_total": 0.44949597120285034, "step": 433 }, { "batch_size": 4, "epoch": 0.1732, "step": 433, "tokens_per_device": 2664 }, { "epoch": 0.1732, "loss_ce": 0.6108472347259521, "loss_lvr": 0.9806715250015259, "loss_mode_switch": 0.0, "loss_total": 0.7089143991470337, "step": 433 }, { "batch_size": 4, "epoch": 0.1732, "step": 433, "tokens_per_device": 1344 }, { "epoch": 0.1732, "loss_ce": 0.4339481592178345, "loss_lvr": 1.0557454824447632, "loss_mode_switch": 0.0, "loss_total": 0.5395227074623108, "step": 433 }, { "batch_size": 4, "epoch": 0.1732, "step": 433, "tokens_per_device": 5540 }, { "epoch": 0.1732, "loss_ce": 0.31477177143096924, "loss_lvr": 0.7728422284126282, "loss_mode_switch": 0.0, "loss_total": 0.3920559883117676, "step": 433 }, { "batch_size": 4, "epoch": 0.1732, "step": 433, "tokens_per_device": 5236 }, { "epoch": 0.1732, "loss_ce": 0.16701823472976685, "loss_lvr": 1.0898417234420776, "loss_mode_switch": 0.0, "loss_total": 0.2760024070739746, "step": 433 }, { "batch_size": 4, "epoch": 0.1732, "step": 433, "tokens_per_device": 4344 }, { "epoch": 0.1732, "loss_ce": 0.16946381330490112, "loss_lvr": 0.8729286789894104, "loss_mode_switch": 0.0, "loss_total": 0.2567566931247711, "step": 433 }, { "epoch": 0.1736, "grad_norm": 1.4159834384918213, "learning_rate": 9.468916642548534e-06, "loss": 0.3251, "step": 434 }, { "batch_size": 4, "epoch": 0.1736, "step": 434, "tokens_per_device": 2612 }, { "epoch": 0.1736, "loss_ce": 0.62427818775177, "loss_lvr": 1.0070631504058838, "loss_mode_switch": 0.0, "loss_total": 0.7249845266342163, "step": 434 }, { "batch_size": 4, "epoch": 0.1736, "step": 434, "tokens_per_device": 2572 }, { "epoch": 0.1736, "loss_ce": 0.3636416792869568, "loss_lvr": 1.0357438325881958, "loss_mode_switch": 0.0, "loss_total": 0.4672160744667053, "step": 434 }, { "batch_size": 4, "epoch": 0.1736, "step": 434, "tokens_per_device": 3796 }, { "epoch": 0.1736, "loss_ce": 0.5789130926132202, "loss_lvr": 1.50629723072052, "loss_mode_switch": 0.0, "loss_total": 0.7295428514480591, "step": 434 }, { "batch_size": 4, "epoch": 0.1736, "step": 434, "tokens_per_device": 1632 }, { "epoch": 0.1736, "loss_ce": 0.13647450506687164, "loss_lvr": 0.8732365369796753, "loss_mode_switch": 0.0, "loss_total": 0.22379815578460693, "step": 434 }, { "batch_size": 1, "epoch": 0.1736, "step": 434, "tokens_per_device": 5124 }, { "epoch": 0.1736, "loss_ce": 0.08844810724258423, "loss_lvr": 0.3834480047225952, "loss_mode_switch": 0.0, "loss_total": 0.12679290771484375, "step": 434 }, { "batch_size": 1, "epoch": 0.1736, "step": 434, "tokens_per_device": 4934 }, { "epoch": 0.1736, "loss_ce": 0.11736828833818436, "loss_lvr": 0.48180893063545227, "loss_mode_switch": 0.0, "loss_total": 0.16554918885231018, "step": 434 }, { "batch_size": 4, "epoch": 0.1736, "step": 434, "tokens_per_device": 1448 }, { "epoch": 0.1736, "loss_ce": 0.10209972411394119, "loss_lvr": 1.1788458824157715, "loss_mode_switch": 0.0, "loss_total": 0.21998432278633118, "step": 434 }, { "batch_size": 4, "epoch": 0.1736, "step": 434, "tokens_per_device": 4300 }, { "epoch": 0.1736, "loss_ce": 0.10335582494735718, "loss_lvr": 0.7258892059326172, "loss_mode_switch": 0.0, "loss_total": 0.1759447455406189, "step": 434 }, { "epoch": 0.174, "grad_norm": 1.380939245223999, "learning_rate": 9.466007736111846e-06, "loss": 0.3366, "step": 435 }, { "batch_size": 4, "epoch": 0.174, "step": 435, "tokens_per_device": 3644 }, { "epoch": 0.174, "loss_ce": 0.08205194026231766, "loss_lvr": 1.411841630935669, "loss_mode_switch": 0.0, "loss_total": 0.2232361137866974, "step": 435 }, { "batch_size": 1, "epoch": 0.174, "step": 435, "tokens_per_device": 5138 }, { "epoch": 0.174, "loss_ce": 0.02302134409546852, "loss_lvr": 0.305561900138855, "loss_mode_switch": 0.0, "loss_total": 0.05357753485441208, "step": 435 }, { "batch_size": 1, "epoch": 0.174, "step": 435, "tokens_per_device": 5106 }, { "epoch": 0.174, "loss_ce": 0.18690019845962524, "loss_lvr": 1.2656683921813965, "loss_mode_switch": 0.0, "loss_total": 0.31346702575683594, "step": 435 }, { "batch_size": 4, "epoch": 0.174, "step": 435, "tokens_per_device": 4668 }, { "epoch": 0.174, "loss_ce": 0.08628647774457932, "loss_lvr": 1.2111679315567017, "loss_mode_switch": 0.0, "loss_total": 0.2074032723903656, "step": 435 }, { "batch_size": 4, "epoch": 0.174, "step": 435, "tokens_per_device": 3784 }, { "epoch": 0.174, "loss_ce": 0.09768959879875183, "loss_lvr": 1.3067612648010254, "loss_mode_switch": 0.0, "loss_total": 0.2283657342195511, "step": 435 }, { "batch_size": 4, "epoch": 0.174, "step": 435, "tokens_per_device": 5008 }, { "epoch": 0.174, "loss_ce": 0.14117063581943512, "loss_lvr": 0.9053266048431396, "loss_mode_switch": 0.0, "loss_total": 0.23170329630374908, "step": 435 }, { "batch_size": 1, "epoch": 0.174, "step": 435, "tokens_per_device": 5008 }, { "epoch": 0.174, "loss_ce": 0.004099221434444189, "loss_lvr": 1.9091416597366333, "loss_mode_switch": 0.0, "loss_total": 0.1950133889913559, "step": 435 }, { "batch_size": 4, "epoch": 0.174, "step": 435, "tokens_per_device": 4328 }, { "epoch": 0.174, "loss_ce": 0.25914785265922546, "loss_lvr": 0.8960154056549072, "loss_mode_switch": 0.0, "loss_total": 0.34874939918518066, "step": 435 }, { "epoch": 0.1744, "grad_norm": 1.5013219118118286, "learning_rate": 9.463091334260397e-06, "loss": 0.3048, "step": 436 }, { "batch_size": 4, "epoch": 0.1744, "step": 436, "tokens_per_device": 3832 }, { "epoch": 0.1744, "loss_ce": 0.5564274191856384, "loss_lvr": 1.0636498928070068, "loss_mode_switch": 0.0, "loss_total": 0.6627923846244812, "step": 436 }, { "batch_size": 4, "epoch": 0.1744, "step": 436, "tokens_per_device": 6028 }, { "epoch": 0.1744, "loss_ce": 0.11777329444885254, "loss_lvr": 0.7402207851409912, "loss_mode_switch": 0.0, "loss_total": 0.19179537892341614, "step": 436 }, { "batch_size": 4, "epoch": 0.1744, "step": 436, "tokens_per_device": 2672 }, { "epoch": 0.1744, "loss_ce": 0.4230066239833832, "loss_lvr": 0.8656050562858582, "loss_mode_switch": 0.0, "loss_total": 0.509567141532898, "step": 436 }, { "batch_size": 4, "epoch": 0.1744, "step": 436, "tokens_per_device": 4532 }, { "epoch": 0.1744, "loss_ce": 0.10495715588331223, "loss_lvr": 1.0066373348236084, "loss_mode_switch": 0.0, "loss_total": 0.2056208848953247, "step": 436 }, { "batch_size": 4, "epoch": 0.1744, "step": 436, "tokens_per_device": 4592 }, { "epoch": 0.1744, "loss_ce": 0.1619233638048172, "loss_lvr": 1.1177842617034912, "loss_mode_switch": 0.0, "loss_total": 0.2737017869949341, "step": 436 }, { "batch_size": 1, "epoch": 0.1744, "step": 436, "tokens_per_device": 5104 }, { "epoch": 0.1744, "loss_ce": 0.07021141052246094, "loss_lvr": 0.2067858874797821, "loss_mode_switch": 0.0, "loss_total": 0.09088999778032303, "step": 436 }, { "batch_size": 4, "epoch": 0.1744, "step": 436, "tokens_per_device": 4552 }, { "epoch": 0.1744, "loss_ce": 0.3521365523338318, "loss_lvr": 0.9372149705886841, "loss_mode_switch": 0.0, "loss_total": 0.44585806131362915, "step": 436 }, { "batch_size": 1, "epoch": 0.1744, "step": 436, "tokens_per_device": 4902 }, { "epoch": 0.1744, "loss_ce": 0.07045329362154007, "loss_lvr": 0.231816366314888, "loss_mode_switch": 0.0, "loss_total": 0.09363493323326111, "step": 436 }, { "epoch": 0.1748, "grad_norm": 1.3595035076141357, "learning_rate": 9.460167441888855e-06, "loss": 0.3065, "step": 437 }, { "batch_size": 1, "epoch": 0.1748, "step": 437, "tokens_per_device": 4862 }, { "epoch": 0.1748, "loss_ce": 0.006098754238337278, "loss_lvr": 0.6063376069068909, "loss_mode_switch": 0.0, "loss_total": 0.0667325109243393, "step": 437 }, { "batch_size": 4, "epoch": 0.1748, "step": 437, "tokens_per_device": 2248 }, { "epoch": 0.1748, "loss_ce": 0.38221895694732666, "loss_lvr": 1.0768064260482788, "loss_mode_switch": 0.0, "loss_total": 0.489899605512619, "step": 437 }, { "batch_size": 1, "epoch": 0.1748, "step": 437, "tokens_per_device": 5204 }, { "epoch": 0.1748, "loss_ce": 0.021211378276348114, "loss_lvr": 0.3643362522125244, "loss_mode_switch": 0.0, "loss_total": 0.057645004242658615, "step": 437 }, { "batch_size": 4, "epoch": 0.1748, "step": 437, "tokens_per_device": 4260 }, { "epoch": 0.1748, "loss_ce": 0.12117903679609299, "loss_lvr": 1.0376588106155396, "loss_mode_switch": 0.0, "loss_total": 0.22494491934776306, "step": 437 }, { "batch_size": 1, "epoch": 0.1748, "step": 437, "tokens_per_device": 5173 }, { "epoch": 0.1748, "loss_ce": 0.002283187583088875, "loss_lvr": 0.5634946823120117, "loss_mode_switch": 0.0, "loss_total": 0.058632656931877136, "step": 437 }, { "batch_size": 4, "epoch": 0.1748, "step": 437, "tokens_per_device": 4748 }, { "epoch": 0.1748, "loss_ce": 0.07990404963493347, "loss_lvr": 0.860807478427887, "loss_mode_switch": 0.0, "loss_total": 0.16598480939865112, "step": 437 }, { "batch_size": 4, "epoch": 0.1748, "step": 437, "tokens_per_device": 3916 }, { "epoch": 0.1748, "loss_ce": 0.27451837062835693, "loss_lvr": 0.790820837020874, "loss_mode_switch": 0.0, "loss_total": 0.3536004424095154, "step": 437 }, { "batch_size": 1, "epoch": 0.1748, "step": 437, "tokens_per_device": 4823 }, { "epoch": 0.1748, "loss_ce": 0.004582913126796484, "loss_lvr": 1.0973953008651733, "loss_mode_switch": 0.0, "loss_total": 0.11432244628667831, "step": 437 }, { "epoch": 0.1752, "grad_norm": 1.4133710861206055, "learning_rate": 9.457236063904465e-06, "loss": 0.2957, "step": 438 }, { "batch_size": 4, "epoch": 0.1752, "step": 438, "tokens_per_device": 5252 }, { "epoch": 0.1752, "loss_ce": 0.43588680028915405, "loss_lvr": 0.9967089295387268, "loss_mode_switch": 0.0, "loss_total": 0.5355576872825623, "step": 438 }, { "batch_size": 1, "epoch": 0.1752, "step": 438, "tokens_per_device": 5161 }, { "epoch": 0.1752, "loss_ce": 0.0007872470887377858, "loss_lvr": 0.6268090605735779, "loss_mode_switch": 0.0, "loss_total": 0.06346815824508667, "step": 438 }, { "batch_size": 1, "epoch": 0.1752, "step": 438, "tokens_per_device": 4688 }, { "epoch": 0.1752, "loss_ce": 0.5566282272338867, "loss_lvr": 0.60896235704422, "loss_mode_switch": 0.0, "loss_total": 0.6175244450569153, "step": 438 }, { "batch_size": 1, "epoch": 0.1752, "step": 438, "tokens_per_device": 5269 }, { "epoch": 0.1752, "loss_ce": 0.002699578646570444, "loss_lvr": 0.41612881422042847, "loss_mode_switch": 0.0, "loss_total": 0.04431246221065521, "step": 438 }, { "batch_size": 4, "epoch": 0.1752, "step": 438, "tokens_per_device": 4180 }, { "epoch": 0.1752, "loss_ce": 0.5366469621658325, "loss_lvr": 1.390075445175171, "loss_mode_switch": 0.0, "loss_total": 0.6756545305252075, "step": 438 }, { "batch_size": 4, "epoch": 0.1752, "step": 438, "tokens_per_device": 3204 }, { "epoch": 0.1752, "loss_ce": 0.18648980557918549, "loss_lvr": 1.1641709804534912, "loss_mode_switch": 0.0, "loss_total": 0.30290690064430237, "step": 438 }, { "batch_size": 4, "epoch": 0.1752, "step": 438, "tokens_per_device": 5444 }, { "epoch": 0.1752, "loss_ce": 0.28457385301589966, "loss_lvr": 1.1858601570129395, "loss_mode_switch": 0.0, "loss_total": 0.40315985679626465, "step": 438 }, { "batch_size": 4, "epoch": 0.1752, "step": 438, "tokens_per_device": 5024 }, { "epoch": 0.1752, "loss_ce": 0.11760204285383224, "loss_lvr": 0.8585732579231262, "loss_mode_switch": 0.0, "loss_total": 0.20345936715602875, "step": 438 }, { "epoch": 0.1756, "grad_norm": 1.2776445150375366, "learning_rate": 9.454297205227034e-06, "loss": 0.2879, "step": 439 }, { "batch_size": 1, "epoch": 0.1756, "step": 439, "tokens_per_device": 5688 }, { "epoch": 0.1756, "loss_ce": 0.03341963142156601, "loss_lvr": 0.4772316813468933, "loss_mode_switch": 0.0, "loss_total": 0.08114279806613922, "step": 439 }, { "batch_size": 1, "epoch": 0.1756, "step": 439, "tokens_per_device": 4886 }, { "epoch": 0.1756, "loss_ce": 0.004390518181025982, "loss_lvr": 0.36470457911491394, "loss_mode_switch": 0.0, "loss_total": 0.04086097702383995, "step": 439 }, { "batch_size": 4, "epoch": 0.1756, "step": 439, "tokens_per_device": 5212 }, { "epoch": 0.1756, "loss_ce": 0.3006531298160553, "loss_lvr": 1.2910867929458618, "loss_mode_switch": 0.0, "loss_total": 0.4297618269920349, "step": 439 }, { "batch_size": 4, "epoch": 0.1756, "step": 439, "tokens_per_device": 4292 }, { "epoch": 0.1756, "loss_ce": 0.06468473374843597, "loss_lvr": 1.0798310041427612, "loss_mode_switch": 0.0, "loss_total": 0.17266783118247986, "step": 439 }, { "batch_size": 4, "epoch": 0.1756, "step": 439, "tokens_per_device": 4328 }, { "epoch": 0.1756, "loss_ce": 0.2816562056541443, "loss_lvr": 0.8820421099662781, "loss_mode_switch": 0.0, "loss_total": 0.3698604106903076, "step": 439 }, { "batch_size": 4, "epoch": 0.1756, "step": 439, "tokens_per_device": 12408 }, { "epoch": 0.1756, "loss_ce": 0.48091837763786316, "loss_lvr": 1.2788177728652954, "loss_mode_switch": 0.0, "loss_total": 0.6088001728057861, "step": 439 }, { "batch_size": 4, "epoch": 0.1756, "step": 439, "tokens_per_device": 5784 }, { "epoch": 0.1756, "loss_ce": 0.14690761268138885, "loss_lvr": 1.004791259765625, "loss_mode_switch": 0.0, "loss_total": 0.24738673865795135, "step": 439 }, { "batch_size": 1, "epoch": 0.1756, "step": 439, "tokens_per_device": 4868 }, { "epoch": 0.1756, "loss_ce": 0.0050866794772446156, "loss_lvr": 0.7582125067710876, "loss_mode_switch": 0.0, "loss_total": 0.08090793341398239, "step": 439 }, { "epoch": 0.176, "grad_norm": 1.2700412273406982, "learning_rate": 9.451350870788922e-06, "loss": 0.2992, "step": 440 }, { "batch_size": 4, "epoch": 0.176, "step": 440, "tokens_per_device": 1776 }, { "epoch": 0.176, "loss_ce": 0.7469650506973267, "loss_lvr": 1.0467885732650757, "loss_mode_switch": 0.0, "loss_total": 0.8516439199447632, "step": 440 }, { "batch_size": 1, "epoch": 0.176, "step": 440, "tokens_per_device": 5092 }, { "epoch": 0.176, "loss_ce": 0.018038811162114143, "loss_lvr": 0.4456801414489746, "loss_mode_switch": 0.0, "loss_total": 0.0626068264245987, "step": 440 }, { "batch_size": 1, "epoch": 0.176, "step": 440, "tokens_per_device": 4891 }, { "epoch": 0.176, "loss_ce": 0.0006260389927774668, "loss_lvr": 0.34217527508735657, "loss_mode_switch": 0.0, "loss_total": 0.0348435677587986, "step": 440 }, { "batch_size": 4, "epoch": 0.176, "step": 440, "tokens_per_device": 3796 }, { "epoch": 0.176, "loss_ce": 0.042568501085042953, "loss_lvr": 1.1249936819076538, "loss_mode_switch": 0.0, "loss_total": 0.15506787598133087, "step": 440 }, { "batch_size": 4, "epoch": 0.176, "step": 440, "tokens_per_device": 4380 }, { "epoch": 0.176, "loss_ce": 0.3217545747756958, "loss_lvr": 1.0254862308502197, "loss_mode_switch": 0.0, "loss_total": 0.42430320382118225, "step": 440 }, { "batch_size": 4, "epoch": 0.176, "step": 440, "tokens_per_device": 3828 }, { "epoch": 0.176, "loss_ce": 0.14134103059768677, "loss_lvr": 0.9345362186431885, "loss_mode_switch": 0.0, "loss_total": 0.23479464650154114, "step": 440 }, { "batch_size": 1, "epoch": 0.176, "step": 440, "tokens_per_device": 5046 }, { "epoch": 0.176, "loss_ce": 0.1459123194217682, "loss_lvr": 0.8809800148010254, "loss_mode_switch": 0.0, "loss_total": 0.23401032388210297, "step": 440 }, { "batch_size": 4, "epoch": 0.176, "step": 440, "tokens_per_device": 5756 }, { "epoch": 0.176, "loss_ce": 0.17317049205303192, "loss_lvr": 1.1113035678863525, "loss_mode_switch": 0.0, "loss_total": 0.28430086374282837, "step": 440 }, { "epoch": 0.1764, "grad_norm": 1.566057562828064, "learning_rate": 9.448397065535037e-06, "loss": 0.321, "step": 441 }, { "batch_size": 4, "epoch": 0.1764, "step": 441, "tokens_per_device": 6528 }, { "epoch": 0.1764, "loss_ce": 0.37093088030815125, "loss_lvr": 1.0434147119522095, "loss_mode_switch": 0.0, "loss_total": 0.47527235746383667, "step": 441 }, { "batch_size": 4, "epoch": 0.1764, "step": 441, "tokens_per_device": 3848 }, { "epoch": 0.1764, "loss_ce": 0.7302036285400391, "loss_lvr": 0.9687442779541016, "loss_mode_switch": 0.0, "loss_total": 0.8270780444145203, "step": 441 }, { "batch_size": 4, "epoch": 0.1764, "step": 441, "tokens_per_device": 3800 }, { "epoch": 0.1764, "loss_ce": 0.367063969373703, "loss_lvr": 1.408527135848999, "loss_mode_switch": 0.0, "loss_total": 0.5079166889190674, "step": 441 }, { "batch_size": 4, "epoch": 0.1764, "step": 441, "tokens_per_device": 4380 }, { "epoch": 0.1764, "loss_ce": 0.04050767421722412, "loss_lvr": 1.0072568655014038, "loss_mode_switch": 0.0, "loss_total": 0.14123335480690002, "step": 441 }, { "batch_size": 1, "epoch": 0.1764, "step": 441, "tokens_per_device": 4883 }, { "epoch": 0.1764, "loss_ce": 0.5682121515274048, "loss_lvr": 1.363891363143921, "loss_mode_switch": 0.0, "loss_total": 0.7046012878417969, "step": 441 }, { "batch_size": 4, "epoch": 0.1764, "step": 441, "tokens_per_device": 4560 }, { "epoch": 0.1764, "loss_ce": 0.3215317726135254, "loss_lvr": 0.9493221044540405, "loss_mode_switch": 0.0, "loss_total": 0.4164639711380005, "step": 441 }, { "batch_size": 4, "epoch": 0.1764, "step": 441, "tokens_per_device": 1712 }, { "epoch": 0.1764, "loss_ce": 0.5296652317047119, "loss_lvr": 1.0337748527526855, "loss_mode_switch": 0.0, "loss_total": 0.6330426931381226, "step": 441 }, { "batch_size": 4, "epoch": 0.1764, "step": 441, "tokens_per_device": 4188 }, { "epoch": 0.1764, "loss_ce": 0.08501314371824265, "loss_lvr": 1.1102241277694702, "loss_mode_switch": 0.0, "loss_total": 0.19603556394577026, "step": 441 }, { "epoch": 0.1768, "grad_norm": 1.4115245342254639, "learning_rate": 9.445435794422826e-06, "loss": 0.3377, "step": 442 }, { "batch_size": 4, "epoch": 0.1768, "step": 442, "tokens_per_device": 4284 }, { "epoch": 0.1768, "loss_ce": 0.0653022825717926, "loss_lvr": 1.0343073606491089, "loss_mode_switch": 0.0, "loss_total": 0.16873303055763245, "step": 442 }, { "batch_size": 1, "epoch": 0.1768, "step": 442, "tokens_per_device": 7076 }, { "epoch": 0.1768, "loss_ce": 0.0013821363681927323, "loss_lvr": 0.37688031792640686, "loss_mode_switch": 0.0, "loss_total": 0.039070166647434235, "step": 442 }, { "batch_size": 4, "epoch": 0.1768, "step": 442, "tokens_per_device": 4208 }, { "epoch": 0.1768, "loss_ce": 0.1864069700241089, "loss_lvr": 0.8969720602035522, "loss_mode_switch": 0.0, "loss_total": 0.2761041820049286, "step": 442 }, { "batch_size": 4, "epoch": 0.1768, "step": 442, "tokens_per_device": 4356 }, { "epoch": 0.1768, "loss_ce": 0.0017757327295839787, "loss_lvr": 0.6786747574806213, "loss_mode_switch": 0.0, "loss_total": 0.06964321434497833, "step": 442 }, { "batch_size": 4, "epoch": 0.1768, "step": 442, "tokens_per_device": 3760 }, { "epoch": 0.1768, "loss_ce": 0.13030476868152618, "loss_lvr": 1.1013699769973755, "loss_mode_switch": 0.0, "loss_total": 0.24044176936149597, "step": 442 }, { "batch_size": 4, "epoch": 0.1768, "step": 442, "tokens_per_device": 4672 }, { "epoch": 0.1768, "loss_ce": 0.02978864312171936, "loss_lvr": 1.3263893127441406, "loss_mode_switch": 0.0, "loss_total": 0.16242757439613342, "step": 442 }, { "batch_size": 4, "epoch": 0.1768, "step": 442, "tokens_per_device": 1352 }, { "epoch": 0.1768, "loss_ce": 0.3373146653175354, "loss_lvr": 1.040181040763855, "loss_mode_switch": 0.0, "loss_total": 0.44133275747299194, "step": 442 }, { "batch_size": 1, "epoch": 0.1768, "step": 442, "tokens_per_device": 4878 }, { "epoch": 0.1768, "loss_ce": 0.003142849775031209, "loss_lvr": 0.8750923275947571, "loss_mode_switch": 0.0, "loss_total": 0.09065208584070206, "step": 442 }, { "epoch": 0.1772, "grad_norm": 1.4875121116638184, "learning_rate": 9.442467062422267e-06, "loss": 0.3383, "step": 443 }, { "batch_size": 4, "epoch": 0.1772, "step": 443, "tokens_per_device": 5032 }, { "epoch": 0.1772, "loss_ce": 0.16605913639068604, "loss_lvr": 0.5017842650413513, "loss_mode_switch": 0.0, "loss_total": 0.21623755991458893, "step": 443 }, { "batch_size": 4, "epoch": 0.1772, "step": 443, "tokens_per_device": 5000 }, { "epoch": 0.1772, "loss_ce": 0.36421963572502136, "loss_lvr": 1.0592894554138184, "loss_mode_switch": 0.0, "loss_total": 0.47014859318733215, "step": 443 }, { "batch_size": 4, "epoch": 0.1772, "step": 443, "tokens_per_device": 4304 }, { "epoch": 0.1772, "loss_ce": 0.029981404542922974, "loss_lvr": 1.0742201805114746, "loss_mode_switch": 0.0, "loss_total": 0.1374034285545349, "step": 443 }, { "batch_size": 1, "epoch": 0.1772, "step": 443, "tokens_per_device": 5030 }, { "epoch": 0.1772, "loss_ce": 0.018615568056702614, "loss_lvr": 1.3188631534576416, "loss_mode_switch": 0.0, "loss_total": 0.15050189197063446, "step": 443 }, { "batch_size": 4, "epoch": 0.1772, "step": 443, "tokens_per_device": 13464 }, { "epoch": 0.1772, "loss_ce": 0.43005356192588806, "loss_lvr": 1.1423529386520386, "loss_mode_switch": 0.0, "loss_total": 0.5442888736724854, "step": 443 }, { "batch_size": 1, "epoch": 0.1772, "step": 443, "tokens_per_device": 7007 }, { "epoch": 0.1772, "loss_ce": 0.06611113995313644, "loss_lvr": 0.583477795124054, "loss_mode_switch": 0.0, "loss_total": 0.1244589239358902, "step": 443 }, { "batch_size": 1, "epoch": 0.1772, "step": 443, "tokens_per_device": 4926 }, { "epoch": 0.1772, "loss_ce": 0.029104553163051605, "loss_lvr": 0.9893462061882019, "loss_mode_switch": 0.0, "loss_total": 0.1280391812324524, "step": 443 }, { "batch_size": 4, "epoch": 0.1772, "step": 443, "tokens_per_device": 5752 }, { "epoch": 0.1772, "loss_ce": 0.1443246603012085, "loss_lvr": 0.830841600894928, "loss_mode_switch": 0.0, "loss_total": 0.22740882635116577, "step": 443 }, { "epoch": 0.1776, "grad_norm": 1.3809654712677002, "learning_rate": 9.439490874515859e-06, "loss": 0.3197, "step": 444 }, { "batch_size": 4, "epoch": 0.1776, "step": 444, "tokens_per_device": 2688 }, { "epoch": 0.1776, "loss_ce": 0.27803120017051697, "loss_lvr": 0.876541256904602, "loss_mode_switch": 0.0, "loss_total": 0.3656853437423706, "step": 444 }, { "batch_size": 4, "epoch": 0.1776, "step": 444, "tokens_per_device": 2748 }, { "epoch": 0.1776, "loss_ce": 0.18050517141819, "loss_lvr": 0.8380424976348877, "loss_mode_switch": 0.0, "loss_total": 0.2643094062805176, "step": 444 }, { "batch_size": 4, "epoch": 0.1776, "step": 444, "tokens_per_device": 3808 }, { "epoch": 0.1776, "loss_ce": 0.44156041741371155, "loss_lvr": 1.091120958328247, "loss_mode_switch": 0.0, "loss_total": 0.5506725311279297, "step": 444 }, { "batch_size": 4, "epoch": 0.1776, "step": 444, "tokens_per_device": 1716 }, { "epoch": 0.1776, "loss_ce": 0.5315293073654175, "loss_lvr": 0.9443637728691101, "loss_mode_switch": 0.0, "loss_total": 0.6259657144546509, "step": 444 }, { "batch_size": 4, "epoch": 0.1776, "step": 444, "tokens_per_device": 3820 }, { "epoch": 0.1776, "loss_ce": 0.08122558146715164, "loss_lvr": 1.325453519821167, "loss_mode_switch": 0.0, "loss_total": 0.21377092599868774, "step": 444 }, { "batch_size": 1, "epoch": 0.1776, "step": 444, "tokens_per_device": 4603 }, { "epoch": 0.1776, "loss_ce": 0.19800525903701782, "loss_lvr": 0.6663050055503845, "loss_mode_switch": 0.0, "loss_total": 0.26463577151298523, "step": 444 }, { "batch_size": 4, "epoch": 0.1776, "step": 444, "tokens_per_device": 5816 }, { "epoch": 0.1776, "loss_ce": 0.35681071877479553, "loss_lvr": 1.0585981607437134, "loss_mode_switch": 0.0, "loss_total": 0.46267053484916687, "step": 444 }, { "batch_size": 4, "epoch": 0.1776, "step": 444, "tokens_per_device": 4868 }, { "epoch": 0.1776, "loss_ce": 0.16049471497535706, "loss_lvr": 0.8375219106674194, "loss_mode_switch": 0.0, "loss_total": 0.24424690008163452, "step": 444 }, { "epoch": 0.178, "grad_norm": 1.5892329216003418, "learning_rate": 9.436507235698613e-06, "loss": 0.3189, "step": 445 }, { "batch_size": 4, "epoch": 0.178, "step": 445, "tokens_per_device": 4244 }, { "epoch": 0.178, "loss_ce": 0.13959376513957977, "loss_lvr": 0.8708603978157043, "loss_mode_switch": 0.0, "loss_total": 0.22667980194091797, "step": 445 }, { "batch_size": 4, "epoch": 0.178, "step": 445, "tokens_per_device": 4292 }, { "epoch": 0.178, "loss_ce": 0.039897553622722626, "loss_lvr": 0.9824628829956055, "loss_mode_switch": 0.0, "loss_total": 0.13814383745193481, "step": 445 }, { "batch_size": 4, "epoch": 0.178, "step": 445, "tokens_per_device": 5512 }, { "epoch": 0.178, "loss_ce": 0.8902694582939148, "loss_lvr": 0.8117871284484863, "loss_mode_switch": 0.0, "loss_total": 0.9714481830596924, "step": 445 }, { "batch_size": 4, "epoch": 0.178, "step": 445, "tokens_per_device": 4420 }, { "epoch": 0.178, "loss_ce": 0.4065418243408203, "loss_lvr": 1.176411747932434, "loss_mode_switch": 0.0, "loss_total": 0.5241829752922058, "step": 445 }, { "batch_size": 4, "epoch": 0.178, "step": 445, "tokens_per_device": 4568 }, { "epoch": 0.178, "loss_ce": 0.30799880623817444, "loss_lvr": 0.9433994889259338, "loss_mode_switch": 0.0, "loss_total": 0.40233874320983887, "step": 445 }, { "batch_size": 4, "epoch": 0.178, "step": 445, "tokens_per_device": 1416 }, { "epoch": 0.178, "loss_ce": 0.7802145481109619, "loss_lvr": 1.1420680284500122, "loss_mode_switch": 0.0, "loss_total": 0.8944213390350342, "step": 445 }, { "batch_size": 4, "epoch": 0.178, "step": 445, "tokens_per_device": 8516 }, { "epoch": 0.178, "loss_ce": 0.07230721414089203, "loss_lvr": 0.9625996351242065, "loss_mode_switch": 0.0, "loss_total": 0.16856718063354492, "step": 445 }, { "batch_size": 4, "epoch": 0.178, "step": 445, "tokens_per_device": 5588 }, { "epoch": 0.178, "loss_ce": 0.2797767221927643, "loss_lvr": 0.8171575665473938, "loss_mode_switch": 0.0, "loss_total": 0.36149248480796814, "step": 445 }, { "epoch": 0.1784, "grad_norm": 1.4023635387420654, "learning_rate": 9.433516150978045e-06, "loss": 0.3354, "step": 446 }, { "batch_size": 1, "epoch": 0.1784, "step": 446, "tokens_per_device": 4952 }, { "epoch": 0.1784, "loss_ce": 0.02282906137406826, "loss_lvr": 0.5933690667152405, "loss_mode_switch": 0.0, "loss_total": 0.08216597139835358, "step": 446 }, { "batch_size": 4, "epoch": 0.1784, "step": 446, "tokens_per_device": 4380 }, { "epoch": 0.1784, "loss_ce": 0.5422903299331665, "loss_lvr": 1.136701226234436, "loss_mode_switch": 0.0, "loss_total": 0.6559604406356812, "step": 446 }, { "batch_size": 1, "epoch": 0.1784, "step": 446, "tokens_per_device": 5146 }, { "epoch": 0.1784, "loss_ce": 0.11213628947734833, "loss_lvr": 0.44816794991493225, "loss_mode_switch": 0.0, "loss_total": 0.15695308148860931, "step": 446 }, { "batch_size": 1, "epoch": 0.1784, "step": 446, "tokens_per_device": 4914 }, { "epoch": 0.1784, "loss_ce": 0.012459390796720982, "loss_lvr": 0.29551440477371216, "loss_mode_switch": 0.0, "loss_total": 0.0420108325779438, "step": 446 }, { "batch_size": 4, "epoch": 0.1784, "step": 446, "tokens_per_device": 4300 }, { "epoch": 0.1784, "loss_ce": 0.31634268164634705, "loss_lvr": 1.0298454761505127, "loss_mode_switch": 0.0, "loss_total": 0.4193272292613983, "step": 446 }, { "batch_size": 4, "epoch": 0.1784, "step": 446, "tokens_per_device": 4460 }, { "epoch": 0.1784, "loss_ce": 0.12401232123374939, "loss_lvr": 0.9912388324737549, "loss_mode_switch": 0.0, "loss_total": 0.22313621640205383, "step": 446 }, { "batch_size": 1, "epoch": 0.1784, "step": 446, "tokens_per_device": 4873 }, { "epoch": 0.1784, "loss_ce": 0.07199827581644058, "loss_lvr": 0.8648828864097595, "loss_mode_switch": 0.0, "loss_total": 0.15848657488822937, "step": 446 }, { "batch_size": 1, "epoch": 0.1784, "step": 446, "tokens_per_device": 5252 }, { "epoch": 0.1784, "loss_ce": 0.0828198567032814, "loss_lvr": 0.3039749562740326, "loss_mode_switch": 0.0, "loss_total": 0.11321735382080078, "step": 446 }, { "epoch": 0.1788, "grad_norm": 1.3802645206451416, "learning_rate": 9.430517625374171e-06, "loss": 0.3327, "step": 447 }, { "batch_size": 4, "epoch": 0.1788, "step": 447, "tokens_per_device": 2684 }, { "epoch": 0.1788, "loss_ce": 0.4540433883666992, "loss_lvr": 1.0170338153839111, "loss_mode_switch": 0.0, "loss_total": 0.5557467937469482, "step": 447 }, { "batch_size": 4, "epoch": 0.1788, "step": 447, "tokens_per_device": 6704 }, { "epoch": 0.1788, "loss_ce": 0.20350728929042816, "loss_lvr": 0.8583353161811829, "loss_mode_switch": 0.0, "loss_total": 0.2893408238887787, "step": 447 }, { "batch_size": 1, "epoch": 0.1788, "step": 447, "tokens_per_device": 6347 }, { "epoch": 0.1788, "loss_ce": 0.3425061106681824, "loss_lvr": 0.3560786545276642, "loss_mode_switch": 0.0, "loss_total": 0.3781139850616455, "step": 447 }, { "batch_size": 1, "epoch": 0.1788, "step": 447, "tokens_per_device": 4905 }, { "epoch": 0.1788, "loss_ce": 0.0055020470172166824, "loss_lvr": 0.6314032077789307, "loss_mode_switch": 0.0, "loss_total": 0.06864237040281296, "step": 447 }, { "batch_size": 1, "epoch": 0.1788, "step": 447, "tokens_per_device": 4897 }, { "epoch": 0.1788, "loss_ce": 0.0010072438744828105, "loss_lvr": 0.34141215682029724, "loss_mode_switch": 0.0, "loss_total": 0.035148460417985916, "step": 447 }, { "batch_size": 4, "epoch": 0.1788, "step": 447, "tokens_per_device": 4228 }, { "epoch": 0.1788, "loss_ce": 0.34561389684677124, "loss_lvr": 1.0827088356018066, "loss_mode_switch": 0.0, "loss_total": 0.4538847804069519, "step": 447 }, { "batch_size": 4, "epoch": 0.1788, "step": 447, "tokens_per_device": 4876 }, { "epoch": 0.1788, "loss_ce": 0.0029325492214411497, "loss_lvr": 1.0314770936965942, "loss_mode_switch": 0.0, "loss_total": 0.10608025640249252, "step": 447 }, { "batch_size": 4, "epoch": 0.1788, "step": 447, "tokens_per_device": 4128 }, { "epoch": 0.1788, "loss_ce": 0.32752084732055664, "loss_lvr": 1.1265933513641357, "loss_mode_switch": 0.0, "loss_total": 0.4401801824569702, "step": 447 }, { "epoch": 0.1792, "grad_norm": 1.4213931560516357, "learning_rate": 9.427511663919492e-06, "loss": 0.3261, "step": 448 }, { "batch_size": 4, "epoch": 0.1792, "step": 448, "tokens_per_device": 5516 }, { "epoch": 0.1792, "loss_ce": 0.02339266426861286, "loss_lvr": 0.8627214431762695, "loss_mode_switch": 0.0, "loss_total": 0.10966480523347855, "step": 448 }, { "batch_size": 4, "epoch": 0.1792, "step": 448, "tokens_per_device": 4428 }, { "epoch": 0.1792, "loss_ce": 0.2653697729110718, "loss_lvr": 0.9689112305641174, "loss_mode_switch": 0.0, "loss_total": 0.3622609078884125, "step": 448 }, { "batch_size": 4, "epoch": 0.1792, "step": 448, "tokens_per_device": 1468 }, { "epoch": 0.1792, "loss_ce": 0.9140061736106873, "loss_lvr": 1.2994998693466187, "loss_mode_switch": 0.0, "loss_total": 1.0439561605453491, "step": 448 }, { "batch_size": 4, "epoch": 0.1792, "step": 448, "tokens_per_device": 2668 }, { "epoch": 0.1792, "loss_ce": 0.451552152633667, "loss_lvr": 0.8556099534034729, "loss_mode_switch": 0.0, "loss_total": 0.5371131300926208, "step": 448 }, { "batch_size": 4, "epoch": 0.1792, "step": 448, "tokens_per_device": 4236 }, { "epoch": 0.1792, "loss_ce": 0.4418979287147522, "loss_lvr": 0.9505205750465393, "loss_mode_switch": 0.0, "loss_total": 0.5369499921798706, "step": 448 }, { "batch_size": 1, "epoch": 0.1792, "step": 448, "tokens_per_device": 4787 }, { "epoch": 0.1792, "loss_ce": 0.014918947592377663, "loss_lvr": 0.4278249442577362, "loss_mode_switch": 0.0, "loss_total": 0.05770144611597061, "step": 448 }, { "batch_size": 1, "epoch": 0.1792, "step": 448, "tokens_per_device": 5157 }, { "epoch": 0.1792, "loss_ce": 0.005462100729346275, "loss_lvr": 0.5072712898254395, "loss_mode_switch": 0.0, "loss_total": 0.05618923157453537, "step": 448 }, { "batch_size": 4, "epoch": 0.1792, "step": 448, "tokens_per_device": 4236 }, { "epoch": 0.1792, "loss_ce": 0.05515993759036064, "loss_lvr": 0.9639000296592712, "loss_mode_switch": 0.0, "loss_total": 0.15154993534088135, "step": 448 }, { "epoch": 0.1796, "grad_norm": 1.2040915489196777, "learning_rate": 9.424498271658991e-06, "loss": 0.2745, "step": 449 }, { "batch_size": 4, "epoch": 0.1796, "step": 449, "tokens_per_device": 2932 }, { "epoch": 0.1796, "loss_ce": 0.29098281264305115, "loss_lvr": 0.6359381675720215, "loss_mode_switch": 0.0, "loss_total": 0.35457664728164673, "step": 449 }, { "batch_size": 1, "epoch": 0.1796, "step": 449, "tokens_per_device": 4842 }, { "epoch": 0.1796, "loss_ce": 0.011904329992830753, "loss_lvr": 0.395673543214798, "loss_mode_switch": 0.0, "loss_total": 0.051471684128046036, "step": 449 }, { "batch_size": 4, "epoch": 0.1796, "step": 449, "tokens_per_device": 3936 }, { "epoch": 0.1796, "loss_ce": 0.3137803077697754, "loss_lvr": 0.9770983457565308, "loss_mode_switch": 0.0, "loss_total": 0.41149014234542847, "step": 449 }, { "batch_size": 1, "epoch": 0.1796, "step": 449, "tokens_per_device": 5659 }, { "epoch": 0.1796, "loss_ce": 0.13770800828933716, "loss_lvr": 0.6112616658210754, "loss_mode_switch": 0.0, "loss_total": 0.19883418083190918, "step": 449 }, { "batch_size": 1, "epoch": 0.1796, "step": 449, "tokens_per_device": 5108 }, { "epoch": 0.1796, "loss_ce": 0.005850592628121376, "loss_lvr": 0.6097078919410706, "loss_mode_switch": 0.0, "loss_total": 0.0668213814496994, "step": 449 }, { "batch_size": 4, "epoch": 0.1796, "step": 449, "tokens_per_device": 3220 }, { "epoch": 0.1796, "loss_ce": 0.5149300694465637, "loss_lvr": 0.6546257734298706, "loss_mode_switch": 0.0, "loss_total": 0.5803926587104797, "step": 449 }, { "batch_size": 4, "epoch": 0.1796, "step": 449, "tokens_per_device": 4256 }, { "epoch": 0.1796, "loss_ce": 0.23158589005470276, "loss_lvr": 0.8130836486816406, "loss_mode_switch": 0.0, "loss_total": 0.3128942549228668, "step": 449 }, { "batch_size": 1, "epoch": 0.1796, "step": 449, "tokens_per_device": 5311 }, { "epoch": 0.1796, "loss_ce": 0.0012033452512696385, "loss_lvr": 0.5470185875892639, "loss_mode_switch": 0.0, "loss_total": 0.05590520799160004, "step": 449 }, { "epoch": 0.18, "grad_norm": 1.457327961921692, "learning_rate": 9.421477453650118e-06, "loss": 0.318, "step": 450 }, { "batch_size": 1, "epoch": 0.18, "step": 450, "tokens_per_device": 5149 }, { "epoch": 0.18, "loss_ce": 0.0284718070179224, "loss_lvr": 0.6251606941223145, "loss_mode_switch": 0.0, "loss_total": 0.09098787605762482, "step": 450 }, { "batch_size": 4, "epoch": 0.18, "step": 450, "tokens_per_device": 5100 }, { "epoch": 0.18, "loss_ce": 0.08515053987503052, "loss_lvr": 0.7863079309463501, "loss_mode_switch": 0.0, "loss_total": 0.16378134489059448, "step": 450 }, { "batch_size": 4, "epoch": 0.18, "step": 450, "tokens_per_device": 4836 }, { "epoch": 0.18, "loss_ce": 0.5741028785705566, "loss_lvr": 0.7380979061126709, "loss_mode_switch": 0.0, "loss_total": 0.6479126811027527, "step": 450 }, { "batch_size": 4, "epoch": 0.18, "step": 450, "tokens_per_device": 5708 }, { "epoch": 0.18, "loss_ce": 0.0694541484117508, "loss_lvr": 0.9075025916099548, "loss_mode_switch": 0.0, "loss_total": 0.16020441055297852, "step": 450 }, { "batch_size": 4, "epoch": 0.18, "step": 450, "tokens_per_device": 4440 }, { "epoch": 0.18, "loss_ce": 0.3701407313346863, "loss_lvr": 1.2940459251403809, "loss_mode_switch": 0.0, "loss_total": 0.4995453357696533, "step": 450 }, { "batch_size": 1, "epoch": 0.18, "step": 450, "tokens_per_device": 4628 }, { "epoch": 0.18, "loss_ce": 0.0005620449665002525, "loss_lvr": 0.5397401452064514, "loss_mode_switch": 0.0, "loss_total": 0.054536059498786926, "step": 450 }, { "batch_size": 4, "epoch": 0.18, "step": 450, "tokens_per_device": 2720 }, { "epoch": 0.18, "loss_ce": 0.12601661682128906, "loss_lvr": 0.8278180956840515, "loss_mode_switch": 0.0, "loss_total": 0.20879843831062317, "step": 450 }, { "batch_size": 1, "epoch": 0.18, "step": 450, "tokens_per_device": 5110 }, { "epoch": 0.18, "loss_ce": 0.00500165019184351, "loss_lvr": 0.7260500192642212, "loss_mode_switch": 0.0, "loss_total": 0.07760665565729141, "step": 450 }, { "epoch": 0.1804, "grad_norm": 1.4912667274475098, "learning_rate": 9.418449214962793e-06, "loss": 0.3393, "step": 451 }, { "batch_size": 4, "epoch": 0.1804, "step": 451, "tokens_per_device": 13884 }, { "epoch": 0.1804, "loss_ce": 0.06700745224952698, "loss_lvr": 0.7639800310134888, "loss_mode_switch": 0.0, "loss_total": 0.1434054672718048, "step": 451 }, { "batch_size": 1, "epoch": 0.1804, "step": 451, "tokens_per_device": 6354 }, { "epoch": 0.1804, "loss_ce": 0.08343616873025894, "loss_lvr": 0.4408881664276123, "loss_mode_switch": 0.0, "loss_total": 0.1275249868631363, "step": 451 }, { "batch_size": 1, "epoch": 0.1804, "step": 451, "tokens_per_device": 5404 }, { "epoch": 0.1804, "loss_ce": 0.0005122207221575081, "loss_lvr": 0.3858485221862793, "loss_mode_switch": 0.0, "loss_total": 0.039097074419260025, "step": 451 }, { "batch_size": 1, "epoch": 0.1804, "step": 451, "tokens_per_device": 4892 }, { "epoch": 0.1804, "loss_ce": 0.005356689915060997, "loss_lvr": 0.6048985123634338, "loss_mode_switch": 0.0, "loss_total": 0.06584654003381729, "step": 451 }, { "batch_size": 4, "epoch": 0.1804, "step": 451, "tokens_per_device": 8176 }, { "epoch": 0.1804, "loss_ce": 0.10454028099775314, "loss_lvr": 0.6052514910697937, "loss_mode_switch": 0.0, "loss_total": 0.1650654375553131, "step": 451 }, { "batch_size": 4, "epoch": 0.1804, "step": 451, "tokens_per_device": 1296 }, { "epoch": 0.1804, "loss_ce": 0.5016034841537476, "loss_lvr": 1.3860523700714111, "loss_mode_switch": 0.0, "loss_total": 0.6402087211608887, "step": 451 }, { "batch_size": 1, "epoch": 0.1804, "step": 451, "tokens_per_device": 5154 }, { "epoch": 0.1804, "loss_ce": 0.03479708731174469, "loss_lvr": 0.5489415526390076, "loss_mode_switch": 0.0, "loss_total": 0.08969124406576157, "step": 451 }, { "batch_size": 4, "epoch": 0.1804, "step": 451, "tokens_per_device": 4680 }, { "epoch": 0.1804, "loss_ce": 0.2924002707004547, "loss_lvr": 0.9048933386802673, "loss_mode_switch": 0.0, "loss_total": 0.38288959860801697, "step": 451 }, { "epoch": 0.1808, "grad_norm": 1.3802179098129272, "learning_rate": 9.415413560679385e-06, "loss": 0.3001, "step": 452 }, { "batch_size": 4, "epoch": 0.1808, "step": 452, "tokens_per_device": 4872 }, { "epoch": 0.1808, "loss_ce": 0.22338148951530457, "loss_lvr": 0.6557639241218567, "loss_mode_switch": 0.0, "loss_total": 0.2889578938484192, "step": 452 }, { "batch_size": 4, "epoch": 0.1808, "step": 452, "tokens_per_device": 4356 }, { "epoch": 0.1808, "loss_ce": 0.14676907658576965, "loss_lvr": 0.8963598608970642, "loss_mode_switch": 0.0, "loss_total": 0.23640507459640503, "step": 452 }, { "batch_size": 4, "epoch": 0.1808, "step": 452, "tokens_per_device": 7012 }, { "epoch": 0.1808, "loss_ce": 0.27747467160224915, "loss_lvr": 0.9861695170402527, "loss_mode_switch": 0.0, "loss_total": 0.3760916292667389, "step": 452 }, { "batch_size": 4, "epoch": 0.1808, "step": 452, "tokens_per_device": 3868 }, { "epoch": 0.1808, "loss_ce": 0.2823125720024109, "loss_lvr": 0.8890058994293213, "loss_mode_switch": 0.0, "loss_total": 0.3712131679058075, "step": 452 }, { "batch_size": 1, "epoch": 0.1808, "step": 452, "tokens_per_device": 5179 }, { "epoch": 0.1808, "loss_ce": 0.25088992714881897, "loss_lvr": 0.639096736907959, "loss_mode_switch": 0.0, "loss_total": 0.31479960680007935, "step": 452 }, { "batch_size": 4, "epoch": 0.1808, "step": 452, "tokens_per_device": 4264 }, { "epoch": 0.1808, "loss_ce": 0.26402199268341064, "loss_lvr": 1.2163357734680176, "loss_mode_switch": 0.0, "loss_total": 0.38565558195114136, "step": 452 }, { "batch_size": 4, "epoch": 0.1808, "step": 452, "tokens_per_device": 1408 }, { "epoch": 0.1808, "loss_ce": 0.2886123061180115, "loss_lvr": 1.1323143243789673, "loss_mode_switch": 0.0, "loss_total": 0.40184372663497925, "step": 452 }, { "batch_size": 1, "epoch": 0.1808, "step": 452, "tokens_per_device": 4893 }, { "epoch": 0.1808, "loss_ce": 0.07119850069284439, "loss_lvr": 0.4934936761856079, "loss_mode_switch": 0.0, "loss_total": 0.12054786831140518, "step": 452 }, { "epoch": 0.1812, "grad_norm": 1.5850721597671509, "learning_rate": 9.412370495894708e-06, "loss": 0.3259, "step": 453 }, { "batch_size": 1, "epoch": 0.1812, "step": 453, "tokens_per_device": 4911 }, { "epoch": 0.1812, "loss_ce": 0.013405428268015385, "loss_lvr": 1.10670006275177, "loss_mode_switch": 0.0, "loss_total": 0.12407543510198593, "step": 453 }, { "batch_size": 4, "epoch": 0.1812, "step": 453, "tokens_per_device": 4200 }, { "epoch": 0.1812, "loss_ce": 0.1473487913608551, "loss_lvr": 1.028315782546997, "loss_mode_switch": 0.0, "loss_total": 0.25018036365509033, "step": 453 }, { "batch_size": 1, "epoch": 0.1812, "step": 453, "tokens_per_device": 4869 }, { "epoch": 0.1812, "loss_ce": 0.05245806649327278, "loss_lvr": 0.19350853562355042, "loss_mode_switch": 0.0, "loss_total": 0.07180891931056976, "step": 453 }, { "batch_size": 4, "epoch": 0.1812, "step": 453, "tokens_per_device": 7088 }, { "epoch": 0.1812, "loss_ce": 0.19190232455730438, "loss_lvr": 0.8403032422065735, "loss_mode_switch": 0.0, "loss_total": 0.275932639837265, "step": 453 }, { "batch_size": 1, "epoch": 0.1812, "step": 453, "tokens_per_device": 4910 }, { "epoch": 0.1812, "loss_ce": 0.14648793637752533, "loss_lvr": 0.6054956912994385, "loss_mode_switch": 0.0, "loss_total": 0.20703750848770142, "step": 453 }, { "batch_size": 4, "epoch": 0.1812, "step": 453, "tokens_per_device": 6104 }, { "epoch": 0.1812, "loss_ce": 0.0663621798157692, "loss_lvr": 0.6353204846382141, "loss_mode_switch": 0.0, "loss_total": 0.1298942267894745, "step": 453 }, { "batch_size": 1, "epoch": 0.1812, "step": 453, "tokens_per_device": 4896 }, { "epoch": 0.1812, "loss_ce": 0.0994831770658493, "loss_lvr": 0.27120521664619446, "loss_mode_switch": 0.0, "loss_total": 0.12660369277000427, "step": 453 }, { "batch_size": 4, "epoch": 0.1812, "step": 453, "tokens_per_device": 3536 }, { "epoch": 0.1812, "loss_ce": 0.38283783197402954, "loss_lvr": 1.2824125289916992, "loss_mode_switch": 0.0, "loss_total": 0.5110790729522705, "step": 453 }, { "epoch": 0.1816, "grad_norm": 1.5183995962142944, "learning_rate": 9.409320025716018e-06, "loss": 0.3155, "step": 454 }, { "batch_size": 1, "epoch": 0.1816, "step": 454, "tokens_per_device": 4728 }, { "epoch": 0.1816, "loss_ce": 0.0017528892494738102, "loss_lvr": 0.26891928911209106, "loss_mode_switch": 0.0, "loss_total": 0.02864481881260872, "step": 454 }, { "batch_size": 4, "epoch": 0.1816, "step": 454, "tokens_per_device": 1228 }, { "epoch": 0.1816, "loss_ce": 0.2704913914203644, "loss_lvr": 1.092819333076477, "loss_mode_switch": 0.0, "loss_total": 0.3797733187675476, "step": 454 }, { "batch_size": 4, "epoch": 0.1816, "step": 454, "tokens_per_device": 5436 }, { "epoch": 0.1816, "loss_ce": 0.11916181445121765, "loss_lvr": 0.6951577663421631, "loss_mode_switch": 0.0, "loss_total": 0.1886775940656662, "step": 454 }, { "batch_size": 1, "epoch": 0.1816, "step": 454, "tokens_per_device": 5022 }, { "epoch": 0.1816, "loss_ce": 0.0013632553163915873, "loss_lvr": 0.7071807384490967, "loss_mode_switch": 0.0, "loss_total": 0.07208132743835449, "step": 454 }, { "batch_size": 1, "epoch": 0.1816, "step": 454, "tokens_per_device": 4977 }, { "epoch": 0.1816, "loss_ce": 0.0040099541656672955, "loss_lvr": 0.6801736950874329, "loss_mode_switch": 0.0, "loss_total": 0.07202732563018799, "step": 454 }, { "batch_size": 4, "epoch": 0.1816, "step": 454, "tokens_per_device": 10852 }, { "epoch": 0.1816, "loss_ce": 0.026884624734520912, "loss_lvr": 0.7587602138519287, "loss_mode_switch": 0.0, "loss_total": 0.10276064276695251, "step": 454 }, { "batch_size": 1, "epoch": 0.1816, "step": 454, "tokens_per_device": 5121 }, { "epoch": 0.1816, "loss_ce": 0.24106653034687042, "loss_lvr": 0.45397692918777466, "loss_mode_switch": 0.0, "loss_total": 0.28646421432495117, "step": 454 }, { "batch_size": 4, "epoch": 0.1816, "step": 454, "tokens_per_device": 5176 }, { "epoch": 0.1816, "loss_ce": 0.9499804973602295, "loss_lvr": 0.7789378762245178, "loss_mode_switch": 0.0, "loss_total": 1.027874231338501, "step": 454 }, { "epoch": 0.182, "grad_norm": 1.4284898042678833, "learning_rate": 9.406262155262995e-06, "loss": 0.2782, "step": 455 }, { "batch_size": 1, "epoch": 0.182, "step": 455, "tokens_per_device": 5000 }, { "epoch": 0.182, "loss_ce": 0.0344218946993351, "loss_lvr": 0.5844652652740479, "loss_mode_switch": 0.0, "loss_total": 0.09286841750144958, "step": 455 }, { "batch_size": 1, "epoch": 0.182, "step": 455, "tokens_per_device": 7617 }, { "epoch": 0.182, "loss_ce": 0.00094635970890522, "loss_lvr": 0.4168347418308258, "loss_mode_switch": 0.0, "loss_total": 0.04262983798980713, "step": 455 }, { "batch_size": 4, "epoch": 0.182, "step": 455, "tokens_per_device": 3492 }, { "epoch": 0.182, "loss_ce": 0.16505490243434906, "loss_lvr": 0.9375492334365845, "loss_mode_switch": 0.0, "loss_total": 0.2588098347187042, "step": 455 }, { "batch_size": 4, "epoch": 0.182, "step": 455, "tokens_per_device": 15600 }, { "epoch": 0.182, "loss_ce": 0.06169360876083374, "loss_lvr": 0.7338841557502747, "loss_mode_switch": 0.0, "loss_total": 0.13508203625679016, "step": 455 }, { "batch_size": 4, "epoch": 0.182, "step": 455, "tokens_per_device": 10588 }, { "epoch": 0.182, "loss_ce": 0.08948598057031631, "loss_lvr": 0.8770446181297302, "loss_mode_switch": 0.0, "loss_total": 0.17719045281410217, "step": 455 }, { "batch_size": 4, "epoch": 0.182, "step": 455, "tokens_per_device": 4220 }, { "epoch": 0.182, "loss_ce": 0.04716094955801964, "loss_lvr": 1.099841594696045, "loss_mode_switch": 0.0, "loss_total": 0.15714511275291443, "step": 455 }, { "batch_size": 1, "epoch": 0.182, "step": 455, "tokens_per_device": 7304 }, { "epoch": 0.182, "loss_ce": 0.0005537909455597401, "loss_lvr": 0.40595945715904236, "loss_mode_switch": 0.0, "loss_total": 0.04114973545074463, "step": 455 }, { "batch_size": 1, "epoch": 0.182, "step": 455, "tokens_per_device": 4955 }, { "epoch": 0.182, "loss_ce": 0.1718747466802597, "loss_lvr": 0.5416315197944641, "loss_mode_switch": 0.0, "loss_total": 0.2260379046201706, "step": 455 }, { "epoch": 0.1824, "grad_norm": 1.5928764343261719, "learning_rate": 9.403196889667742e-06, "loss": 0.3284, "step": 456 }, { "batch_size": 4, "epoch": 0.1824, "step": 456, "tokens_per_device": 4636 }, { "epoch": 0.1824, "loss_ce": 0.2208254188299179, "loss_lvr": 0.7871938943862915, "loss_mode_switch": 0.0, "loss_total": 0.2995448112487793, "step": 456 }, { "batch_size": 1, "epoch": 0.1824, "step": 456, "tokens_per_device": 7406 }, { "epoch": 0.1824, "loss_ce": 0.0005878515657968819, "loss_lvr": 0.6997911334037781, "loss_mode_switch": 0.0, "loss_total": 0.07056696712970734, "step": 456 }, { "batch_size": 4, "epoch": 0.1824, "step": 456, "tokens_per_device": 4256 }, { "epoch": 0.1824, "loss_ce": 0.0704173818230629, "loss_lvr": 0.7905781269073486, "loss_mode_switch": 0.0, "loss_total": 0.14947518706321716, "step": 456 }, { "batch_size": 4, "epoch": 0.1824, "step": 456, "tokens_per_device": 6512 }, { "epoch": 0.1824, "loss_ce": 0.06740424782037735, "loss_lvr": 0.8329446911811829, "loss_mode_switch": 0.0, "loss_total": 0.150698721408844, "step": 456 }, { "batch_size": 1, "epoch": 0.1824, "step": 456, "tokens_per_device": 4882 }, { "epoch": 0.1824, "loss_ce": 0.04215896874666214, "loss_lvr": 0.447820782661438, "loss_mode_switch": 0.0, "loss_total": 0.08694104850292206, "step": 456 }, { "batch_size": 1, "epoch": 0.1824, "step": 456, "tokens_per_device": 4876 }, { "epoch": 0.1824, "loss_ce": 0.0003166037204209715, "loss_lvr": 0.22898000478744507, "loss_mode_switch": 0.0, "loss_total": 0.02321460470557213, "step": 456 }, { "batch_size": 1, "epoch": 0.1824, "step": 456, "tokens_per_device": 5202 }, { "epoch": 0.1824, "loss_ce": 0.002588744042441249, "loss_lvr": 0.25340571999549866, "loss_mode_switch": 0.0, "loss_total": 0.027929315343499184, "step": 456 }, { "batch_size": 4, "epoch": 0.1824, "step": 456, "tokens_per_device": 5072 }, { "epoch": 0.1824, "loss_ce": 0.2339462786912918, "loss_lvr": 1.1399692296981812, "loss_mode_switch": 0.0, "loss_total": 0.34794318675994873, "step": 456 }, { "epoch": 0.1828, "grad_norm": 1.5147932767868042, "learning_rate": 9.400124234074772e-06, "loss": 0.3113, "step": 457 }, { "batch_size": 4, "epoch": 0.1828, "step": 457, "tokens_per_device": 3744 }, { "epoch": 0.1828, "loss_ce": 0.9288753867149353, "loss_lvr": 0.8846307992935181, "loss_mode_switch": 0.0, "loss_total": 1.017338514328003, "step": 457 }, { "batch_size": 4, "epoch": 0.1828, "step": 457, "tokens_per_device": 4648 }, { "epoch": 0.1828, "loss_ce": 0.13269947469234467, "loss_lvr": 0.9259656071662903, "loss_mode_switch": 0.0, "loss_total": 0.2252960354089737, "step": 457 }, { "batch_size": 1, "epoch": 0.1828, "step": 457, "tokens_per_device": 4812 }, { "epoch": 0.1828, "loss_ce": 0.030020827427506447, "loss_lvr": 0.5581739544868469, "loss_mode_switch": 0.0, "loss_total": 0.08583822101354599, "step": 457 }, { "batch_size": 4, "epoch": 0.1828, "step": 457, "tokens_per_device": 8516 }, { "epoch": 0.1828, "loss_ce": 0.11975246667861938, "loss_lvr": 0.8496768474578857, "loss_mode_switch": 0.0, "loss_total": 0.2047201544046402, "step": 457 }, { "batch_size": 1, "epoch": 0.1828, "step": 457, "tokens_per_device": 5333 }, { "epoch": 0.1828, "loss_ce": 0.0649629533290863, "loss_lvr": 0.39830031991004944, "loss_mode_switch": 0.0, "loss_total": 0.10479298233985901, "step": 457 }, { "batch_size": 4, "epoch": 0.1828, "step": 457, "tokens_per_device": 1404 }, { "epoch": 0.1828, "loss_ce": 0.6492281556129456, "loss_lvr": 1.0707859992980957, "loss_mode_switch": 0.0, "loss_total": 0.7563067674636841, "step": 457 }, { "batch_size": 4, "epoch": 0.1828, "step": 457, "tokens_per_device": 5600 }, { "epoch": 0.1828, "loss_ce": 0.36170485615730286, "loss_lvr": 0.9693340063095093, "loss_mode_switch": 0.0, "loss_total": 0.4586382508277893, "step": 457 }, { "batch_size": 4, "epoch": 0.1828, "step": 457, "tokens_per_device": 4112 }, { "epoch": 0.1828, "loss_ce": 0.04389166459441185, "loss_lvr": 0.7039967179298401, "loss_mode_switch": 0.0, "loss_total": 0.11429134011268616, "step": 457 }, { "epoch": 0.1832, "grad_norm": 1.409343957901001, "learning_rate": 9.397044193641e-06, "loss": 0.3559, "step": 458 }, { "batch_size": 4, "epoch": 0.1832, "step": 458, "tokens_per_device": 2572 }, { "epoch": 0.1832, "loss_ce": 0.4886213541030884, "loss_lvr": 1.0127891302108765, "loss_mode_switch": 0.0, "loss_total": 0.5899002552032471, "step": 458 }, { "batch_size": 1, "epoch": 0.1832, "step": 458, "tokens_per_device": 5694 }, { "epoch": 0.1832, "loss_ce": 0.0018530471716076136, "loss_lvr": 0.39429447054862976, "loss_mode_switch": 0.0, "loss_total": 0.041282497346401215, "step": 458 }, { "batch_size": 4, "epoch": 0.1832, "step": 458, "tokens_per_device": 1224 }, { "epoch": 0.1832, "loss_ce": 0.33700668811798096, "loss_lvr": 1.2233843803405762, "loss_mode_switch": 0.0, "loss_total": 0.45934513211250305, "step": 458 }, { "batch_size": 4, "epoch": 0.1832, "step": 458, "tokens_per_device": 3592 }, { "epoch": 0.1832, "loss_ce": 0.38117504119873047, "loss_lvr": 0.8163315653800964, "loss_mode_switch": 0.0, "loss_total": 0.46280819177627563, "step": 458 }, { "batch_size": 4, "epoch": 0.1832, "step": 458, "tokens_per_device": 1916 }, { "epoch": 0.1832, "loss_ce": 0.19188986718654633, "loss_lvr": 1.0177615880966187, "loss_mode_switch": 0.0, "loss_total": 0.2936660349369049, "step": 458 }, { "batch_size": 4, "epoch": 0.1832, "step": 458, "tokens_per_device": 12784 }, { "epoch": 0.1832, "loss_ce": 0.09700863808393478, "loss_lvr": 1.07011878490448, "loss_mode_switch": 0.0, "loss_total": 0.20402051508426666, "step": 458 }, { "batch_size": 4, "epoch": 0.1832, "step": 458, "tokens_per_device": 4916 }, { "epoch": 0.1832, "loss_ce": 0.15239913761615753, "loss_lvr": 0.8408600091934204, "loss_mode_switch": 0.0, "loss_total": 0.23648513853549957, "step": 458 }, { "batch_size": 4, "epoch": 0.1832, "step": 458, "tokens_per_device": 4608 }, { "epoch": 0.1832, "loss_ce": 0.5797974467277527, "loss_lvr": 1.0189940929412842, "loss_mode_switch": 0.0, "loss_total": 0.6816968321800232, "step": 458 }, { "epoch": 0.1836, "grad_norm": 1.461596131324768, "learning_rate": 9.393956773535742e-06, "loss": 0.3367, "step": 459 }, { "batch_size": 1, "epoch": 0.1836, "step": 459, "tokens_per_device": 4729 }, { "epoch": 0.1836, "loss_ce": 0.0007057505426928401, "loss_lvr": 0.48662200570106506, "loss_mode_switch": 0.0, "loss_total": 0.04936794936656952, "step": 459 }, { "batch_size": 4, "epoch": 0.1836, "step": 459, "tokens_per_device": 1832 }, { "epoch": 0.1836, "loss_ce": 0.0668162927031517, "loss_lvr": 1.0569369792938232, "loss_mode_switch": 0.0, "loss_total": 0.17250999808311462, "step": 459 }, { "batch_size": 1, "epoch": 0.1836, "step": 459, "tokens_per_device": 4868 }, { "epoch": 0.1836, "loss_ce": 0.000959879020228982, "loss_lvr": 0.49085819721221924, "loss_mode_switch": 0.0, "loss_total": 0.05004570260643959, "step": 459 }, { "batch_size": 1, "epoch": 0.1836, "step": 459, "tokens_per_device": 5162 }, { "epoch": 0.1836, "loss_ce": 0.003029598155990243, "loss_lvr": 0.522646427154541, "loss_mode_switch": 0.0, "loss_total": 0.05529424175620079, "step": 459 }, { "batch_size": 1, "epoch": 0.1836, "step": 459, "tokens_per_device": 5111 }, { "epoch": 0.1836, "loss_ce": 0.002393122762441635, "loss_lvr": 0.3524602949619293, "loss_mode_switch": 0.0, "loss_total": 0.03763915225863457, "step": 459 }, { "batch_size": 1, "epoch": 0.1836, "step": 459, "tokens_per_device": 4982 }, { "epoch": 0.1836, "loss_ce": 0.019088689237833023, "loss_lvr": 0.36445993185043335, "loss_mode_switch": 0.0, "loss_total": 0.05553468316793442, "step": 459 }, { "batch_size": 1, "epoch": 0.1836, "step": 459, "tokens_per_device": 4798 }, { "epoch": 0.1836, "loss_ce": 0.005516073666512966, "loss_lvr": 0.4758343994617462, "loss_mode_switch": 0.0, "loss_total": 0.05309951677918434, "step": 459 }, { "batch_size": 1, "epoch": 0.1836, "step": 459, "tokens_per_device": 5406 }, { "epoch": 0.1836, "loss_ce": 0.004793182481080294, "loss_lvr": 0.4959723651409149, "loss_mode_switch": 0.0, "loss_total": 0.05439041927456856, "step": 459 }, { "epoch": 0.184, "grad_norm": 1.361655354499817, "learning_rate": 9.390861978940687e-06, "loss": 0.2985, "step": 460 }, { "batch_size": 4, "epoch": 0.184, "step": 460, "tokens_per_device": 3788 }, { "epoch": 0.184, "loss_ce": 0.22684045135974884, "loss_lvr": 0.8516547679901123, "loss_mode_switch": 0.0, "loss_total": 0.3120059370994568, "step": 460 }, { "batch_size": 4, "epoch": 0.184, "step": 460, "tokens_per_device": 5744 }, { "epoch": 0.184, "loss_ce": 0.2627568542957306, "loss_lvr": 0.9988746643066406, "loss_mode_switch": 0.0, "loss_total": 0.3626443147659302, "step": 460 }, { "batch_size": 1, "epoch": 0.184, "step": 460, "tokens_per_device": 4767 }, { "epoch": 0.184, "loss_ce": 0.4121326208114624, "loss_lvr": 0.368008553981781, "loss_mode_switch": 0.0, "loss_total": 0.448933482170105, "step": 460 }, { "batch_size": 1, "epoch": 0.184, "step": 460, "tokens_per_device": 5078 }, { "epoch": 0.184, "loss_ce": 0.03290190547704697, "loss_lvr": 0.5401067733764648, "loss_mode_switch": 0.0, "loss_total": 0.08691258728504181, "step": 460 }, { "batch_size": 4, "epoch": 0.184, "step": 460, "tokens_per_device": 1896 }, { "epoch": 0.184, "loss_ce": 0.14582106471061707, "loss_lvr": 1.0989924669265747, "loss_mode_switch": 0.0, "loss_total": 0.255720317363739, "step": 460 }, { "batch_size": 4, "epoch": 0.184, "step": 460, "tokens_per_device": 1460 }, { "epoch": 0.184, "loss_ce": 0.35355672240257263, "loss_lvr": 1.1456475257873535, "loss_mode_switch": 0.0, "loss_total": 0.4681214690208435, "step": 460 }, { "batch_size": 4, "epoch": 0.184, "step": 460, "tokens_per_device": 2628 }, { "epoch": 0.184, "loss_ce": 0.04270076006650925, "loss_lvr": 0.8996354937553406, "loss_mode_switch": 0.0, "loss_total": 0.13266430795192719, "step": 460 }, { "batch_size": 1, "epoch": 0.184, "step": 460, "tokens_per_device": 5164 }, { "epoch": 0.184, "loss_ce": 0.0018188034882768989, "loss_lvr": 0.8064242005348206, "loss_mode_switch": 0.0, "loss_total": 0.08246123045682907, "step": 460 }, { "epoch": 0.1844, "grad_norm": 1.5197192430496216, "learning_rate": 9.387759815049911e-06, "loss": 0.2888, "step": 461 }, { "batch_size": 1, "epoch": 0.1844, "step": 461, "tokens_per_device": 4226 }, { "epoch": 0.1844, "loss_ce": 0.012449482455849648, "loss_lvr": 0.3120346963405609, "loss_mode_switch": 0.0, "loss_total": 0.04365295171737671, "step": 461 }, { "batch_size": 1, "epoch": 0.1844, "step": 461, "tokens_per_device": 4819 }, { "epoch": 0.1844, "loss_ce": 0.15989162027835846, "loss_lvr": 0.4120853543281555, "loss_mode_switch": 0.0, "loss_total": 0.201100155711174, "step": 461 }, { "batch_size": 4, "epoch": 0.1844, "step": 461, "tokens_per_device": 6404 }, { "epoch": 0.1844, "loss_ce": 0.14092722535133362, "loss_lvr": 0.8543359041213989, "loss_mode_switch": 0.0, "loss_total": 0.22636082768440247, "step": 461 }, { "batch_size": 4, "epoch": 0.1844, "step": 461, "tokens_per_device": 5024 }, { "epoch": 0.1844, "loss_ce": 0.40514296293258667, "loss_lvr": 0.6653281450271606, "loss_mode_switch": 0.0, "loss_total": 0.4716757833957672, "step": 461 }, { "batch_size": 4, "epoch": 0.1844, "step": 461, "tokens_per_device": 16148 }, { "epoch": 0.1844, "loss_ce": 0.4432271122932434, "loss_lvr": 0.4620649814605713, "loss_mode_switch": 0.0, "loss_total": 0.489433616399765, "step": 461 }, { "batch_size": 4, "epoch": 0.1844, "step": 461, "tokens_per_device": 8460 }, { "epoch": 0.1844, "loss_ce": 0.050082627683877945, "loss_lvr": 0.5614224672317505, "loss_mode_switch": 0.0, "loss_total": 0.10622487962245941, "step": 461 }, { "batch_size": 1, "epoch": 0.1844, "step": 461, "tokens_per_device": 5195 }, { "epoch": 0.1844, "loss_ce": 0.7792930006980896, "loss_lvr": 0.5011197924613953, "loss_mode_switch": 0.0, "loss_total": 0.8294049501419067, "step": 461 }, { "batch_size": 4, "epoch": 0.1844, "step": 461, "tokens_per_device": 1940 }, { "epoch": 0.1844, "loss_ce": 0.24759668111801147, "loss_lvr": 1.0630055665969849, "loss_mode_switch": 0.0, "loss_total": 0.35389724373817444, "step": 461 }, { "epoch": 0.1848, "grad_norm": 1.5164158344268799, "learning_rate": 9.384650287069856e-06, "loss": 0.329, "step": 462 }, { "batch_size": 4, "epoch": 0.1848, "step": 462, "tokens_per_device": 3860 }, { "epoch": 0.1848, "loss_ce": 0.1688864827156067, "loss_lvr": 0.8857361078262329, "loss_mode_switch": 0.0, "loss_total": 0.2574600875377655, "step": 462 }, { "batch_size": 4, "epoch": 0.1848, "step": 462, "tokens_per_device": 4668 }, { "epoch": 0.1848, "loss_ce": 0.13093873858451843, "loss_lvr": 0.7513055205345154, "loss_mode_switch": 0.0, "loss_total": 0.20606929063796997, "step": 462 }, { "batch_size": 4, "epoch": 0.1848, "step": 462, "tokens_per_device": 5444 }, { "epoch": 0.1848, "loss_ce": 0.07551290839910507, "loss_lvr": 0.8199750185012817, "loss_mode_switch": 0.0, "loss_total": 0.1575104147195816, "step": 462 }, { "batch_size": 4, "epoch": 0.1848, "step": 462, "tokens_per_device": 1688 }, { "epoch": 0.1848, "loss_ce": 0.34678566455841064, "loss_lvr": 1.068835973739624, "loss_mode_switch": 0.0, "loss_total": 0.4536692500114441, "step": 462 }, { "batch_size": 4, "epoch": 0.1848, "step": 462, "tokens_per_device": 3856 }, { "epoch": 0.1848, "loss_ce": 0.16588497161865234, "loss_lvr": 1.0105621814727783, "loss_mode_switch": 0.0, "loss_total": 0.2669411897659302, "step": 462 }, { "batch_size": 4, "epoch": 0.1848, "step": 462, "tokens_per_device": 4252 }, { "epoch": 0.1848, "loss_ce": 0.03471976891160011, "loss_lvr": 0.6926187872886658, "loss_mode_switch": 0.0, "loss_total": 0.10398164391517639, "step": 462 }, { "batch_size": 4, "epoch": 0.1848, "step": 462, "tokens_per_device": 5044 }, { "epoch": 0.1848, "loss_ce": 0.4977734684944153, "loss_lvr": 1.062679648399353, "loss_mode_switch": 0.0, "loss_total": 0.6040414571762085, "step": 462 }, { "batch_size": 4, "epoch": 0.1848, "step": 462, "tokens_per_device": 2600 }, { "epoch": 0.1848, "loss_ce": 0.2776137888431549, "loss_lvr": 1.1233090162277222, "loss_mode_switch": 0.0, "loss_total": 0.3899447023868561, "step": 462 }, { "epoch": 0.1852, "grad_norm": 1.3987805843353271, "learning_rate": 9.381533400219319e-06, "loss": 0.2948, "step": 463 }, { "batch_size": 1, "epoch": 0.1852, "step": 463, "tokens_per_device": 5030 }, { "epoch": 0.1852, "loss_ce": 2.4018044471740723, "loss_lvr": 0.5130808353424072, "loss_mode_switch": 0.0, "loss_total": 2.4531126022338867, "step": 463 }, { "batch_size": 4, "epoch": 0.1852, "step": 463, "tokens_per_device": 5168 }, { "epoch": 0.1852, "loss_ce": 0.2503136098384857, "loss_lvr": 1.0134447813034058, "loss_mode_switch": 0.0, "loss_total": 0.3516581058502197, "step": 463 }, { "batch_size": 4, "epoch": 0.1852, "step": 463, "tokens_per_device": 4252 }, { "epoch": 0.1852, "loss_ce": 0.11221218854188919, "loss_lvr": 0.9860910177230835, "loss_mode_switch": 0.0, "loss_total": 0.21082130074501038, "step": 463 }, { "batch_size": 4, "epoch": 0.1852, "step": 463, "tokens_per_device": 3756 }, { "epoch": 0.1852, "loss_ce": 0.584915816783905, "loss_lvr": 1.127556562423706, "loss_mode_switch": 0.0, "loss_total": 0.6976714730262756, "step": 463 }, { "batch_size": 1, "epoch": 0.1852, "step": 463, "tokens_per_device": 5106 }, { "epoch": 0.1852, "loss_ce": 0.3749248683452606, "loss_lvr": 0.5982552766799927, "loss_mode_switch": 0.0, "loss_total": 0.43475040793418884, "step": 463 }, { "batch_size": 4, "epoch": 0.1852, "step": 463, "tokens_per_device": 7648 }, { "epoch": 0.1852, "loss_ce": 0.0015634974697604775, "loss_lvr": 0.9009966850280762, "loss_mode_switch": 0.0, "loss_total": 0.0916631668806076, "step": 463 }, { "batch_size": 1, "epoch": 0.1852, "step": 463, "tokens_per_device": 4895 }, { "epoch": 0.1852, "loss_ce": 0.008254951797425747, "loss_lvr": 0.7195910215377808, "loss_mode_switch": 0.0, "loss_total": 0.08021405339241028, "step": 463 }, { "batch_size": 4, "epoch": 0.1852, "step": 463, "tokens_per_device": 4048 }, { "epoch": 0.1852, "loss_ce": 0.6614551544189453, "loss_lvr": 1.2343519926071167, "loss_mode_switch": 0.0, "loss_total": 0.784890353679657, "step": 463 }, { "epoch": 0.1856, "grad_norm": 1.6280202865600586, "learning_rate": 9.378409159729454e-06, "loss": 0.4085, "step": 464 }, { "batch_size": 4, "epoch": 0.1856, "step": 464, "tokens_per_device": 1360 }, { "epoch": 0.1856, "loss_ce": 0.44608399271965027, "loss_lvr": 1.1944270133972168, "loss_mode_switch": 0.0, "loss_total": 0.5655267238616943, "step": 464 }, { "batch_size": 1, "epoch": 0.1856, "step": 464, "tokens_per_device": 4683 }, { "epoch": 0.1856, "loss_ce": 0.05040512979030609, "loss_lvr": 0.3902996778488159, "loss_mode_switch": 0.0, "loss_total": 0.08943510055541992, "step": 464 }, { "batch_size": 4, "epoch": 0.1856, "step": 464, "tokens_per_device": 3912 }, { "epoch": 0.1856, "loss_ce": 0.318521648645401, "loss_lvr": 1.0920056104660034, "loss_mode_switch": 0.0, "loss_total": 0.4277222156524658, "step": 464 }, { "batch_size": 4, "epoch": 0.1856, "step": 464, "tokens_per_device": 4220 }, { "epoch": 0.1856, "loss_ce": 0.6115888357162476, "loss_lvr": 1.0036369562149048, "loss_mode_switch": 0.0, "loss_total": 0.7119525074958801, "step": 464 }, { "batch_size": 4, "epoch": 0.1856, "step": 464, "tokens_per_device": 2516 }, { "epoch": 0.1856, "loss_ce": 0.360679030418396, "loss_lvr": 1.042197585105896, "loss_mode_switch": 0.0, "loss_total": 0.4648987948894501, "step": 464 }, { "batch_size": 1, "epoch": 0.1856, "step": 464, "tokens_per_device": 5106 }, { "epoch": 0.1856, "loss_ce": 0.002027569804340601, "loss_lvr": 0.33669888973236084, "loss_mode_switch": 0.0, "loss_total": 0.03569746017456055, "step": 464 }, { "batch_size": 4, "epoch": 0.1856, "step": 464, "tokens_per_device": 8244 }, { "epoch": 0.1856, "loss_ce": 0.01553533785045147, "loss_lvr": 1.3492900133132935, "loss_mode_switch": 0.0, "loss_total": 0.15046434104442596, "step": 464 }, { "batch_size": 4, "epoch": 0.1856, "step": 464, "tokens_per_device": 4612 }, { "epoch": 0.1856, "loss_ce": 0.3790472745895386, "loss_lvr": 1.148783802986145, "loss_mode_switch": 0.0, "loss_total": 0.49392566084861755, "step": 464 }, { "epoch": 0.186, "grad_norm": 1.3568655252456665, "learning_rate": 9.37527757084375e-06, "loss": 0.3716, "step": 465 }, { "batch_size": 1, "epoch": 0.186, "step": 465, "tokens_per_device": 4888 }, { "epoch": 0.186, "loss_ce": 0.22409965097904205, "loss_lvr": 0.5462204217910767, "loss_mode_switch": 0.0, "loss_total": 0.2787216901779175, "step": 465 }, { "batch_size": 4, "epoch": 0.186, "step": 465, "tokens_per_device": 10276 }, { "epoch": 0.186, "loss_ce": 0.03947233781218529, "loss_lvr": 0.6696073412895203, "loss_mode_switch": 0.0, "loss_total": 0.10643307864665985, "step": 465 }, { "batch_size": 4, "epoch": 0.186, "step": 465, "tokens_per_device": 3744 }, { "epoch": 0.186, "loss_ce": 0.34106528759002686, "loss_lvr": 0.7564058303833008, "loss_mode_switch": 0.0, "loss_total": 0.4167058765888214, "step": 465 }, { "batch_size": 4, "epoch": 0.186, "step": 465, "tokens_per_device": 1716 }, { "epoch": 0.186, "loss_ce": 0.5205795764923096, "loss_lvr": 1.1885957717895508, "loss_mode_switch": 0.0, "loss_total": 0.6394391655921936, "step": 465 }, { "batch_size": 1, "epoch": 0.186, "step": 465, "tokens_per_device": 4637 }, { "epoch": 0.186, "loss_ce": 0.02581026591360569, "loss_lvr": 0.6712003946304321, "loss_mode_switch": 0.0, "loss_total": 0.09293030202388763, "step": 465 }, { "batch_size": 4, "epoch": 0.186, "step": 465, "tokens_per_device": 4320 }, { "epoch": 0.186, "loss_ce": 0.2107761800289154, "loss_lvr": 1.214264988899231, "loss_mode_switch": 0.0, "loss_total": 0.332202672958374, "step": 465 }, { "batch_size": 4, "epoch": 0.186, "step": 465, "tokens_per_device": 7116 }, { "epoch": 0.186, "loss_ce": 0.08762311935424805, "loss_lvr": 0.858238935470581, "loss_mode_switch": 0.0, "loss_total": 0.17344701290130615, "step": 465 }, { "batch_size": 4, "epoch": 0.186, "step": 465, "tokens_per_device": 3788 }, { "epoch": 0.186, "loss_ce": 0.4540629982948303, "loss_lvr": 1.3098406791687012, "loss_mode_switch": 0.0, "loss_total": 0.5850470662117004, "step": 465 }, { "epoch": 0.1864, "grad_norm": 1.66191828250885, "learning_rate": 9.372138638818036e-06, "loss": 0.3496, "step": 466 }, { "batch_size": 1, "epoch": 0.1864, "step": 466, "tokens_per_device": 5028 }, { "epoch": 0.1864, "loss_ce": 0.010651160031557083, "loss_lvr": 0.45607876777648926, "loss_mode_switch": 0.0, "loss_total": 0.05625903606414795, "step": 466 }, { "batch_size": 4, "epoch": 0.1864, "step": 466, "tokens_per_device": 1484 }, { "epoch": 0.1864, "loss_ce": 0.7930969595909119, "loss_lvr": 0.9767717123031616, "loss_mode_switch": 0.0, "loss_total": 0.890774130821228, "step": 466 }, { "batch_size": 1, "epoch": 0.1864, "step": 466, "tokens_per_device": 4676 }, { "epoch": 0.1864, "loss_ce": 0.5580964088439941, "loss_lvr": 0.911678671836853, "loss_mode_switch": 0.0, "loss_total": 0.6492642760276794, "step": 466 }, { "batch_size": 1, "epoch": 0.1864, "step": 466, "tokens_per_device": 5125 }, { "epoch": 0.1864, "loss_ce": 0.014115889556705952, "loss_lvr": 0.8940826058387756, "loss_mode_switch": 0.0, "loss_total": 0.10352415591478348, "step": 466 }, { "batch_size": 4, "epoch": 0.1864, "step": 466, "tokens_per_device": 5744 }, { "epoch": 0.1864, "loss_ce": 0.3848706781864166, "loss_lvr": 1.0362385511398315, "loss_mode_switch": 0.0, "loss_total": 0.48849454522132874, "step": 466 }, { "batch_size": 4, "epoch": 0.1864, "step": 466, "tokens_per_device": 8608 }, { "epoch": 0.1864, "loss_ce": 0.30585142970085144, "loss_lvr": 0.5695496797561646, "loss_mode_switch": 0.0, "loss_total": 0.36280640959739685, "step": 466 }, { "batch_size": 4, "epoch": 0.1864, "step": 466, "tokens_per_device": 4220 }, { "epoch": 0.1864, "loss_ce": 0.8781405091285706, "loss_lvr": 1.1184414625167847, "loss_mode_switch": 0.0, "loss_total": 0.9899846315383911, "step": 466 }, { "batch_size": 1, "epoch": 0.1864, "step": 466, "tokens_per_device": 4724 }, { "epoch": 0.1864, "loss_ce": 0.00579705648124218, "loss_lvr": 0.9655650854110718, "loss_mode_switch": 0.0, "loss_total": 0.10235356539487839, "step": 466 }, { "epoch": 0.1868, "grad_norm": 1.46889328956604, "learning_rate": 9.36899236892046e-06, "loss": 0.3331, "step": 467 }, { "batch_size": 4, "epoch": 0.1868, "step": 467, "tokens_per_device": 6552 }, { "epoch": 0.1868, "loss_ce": 0.0076337638311088085, "loss_lvr": 0.6931308507919312, "loss_mode_switch": 0.0, "loss_total": 0.07694684714078903, "step": 467 }, { "batch_size": 4, "epoch": 0.1868, "step": 467, "tokens_per_device": 4444 }, { "epoch": 0.1868, "loss_ce": 0.17611731588840485, "loss_lvr": 0.8146634101867676, "loss_mode_switch": 0.0, "loss_total": 0.2575836479663849, "step": 467 }, { "batch_size": 1, "epoch": 0.1868, "step": 467, "tokens_per_device": 4878 }, { "epoch": 0.1868, "loss_ce": 0.21489307284355164, "loss_lvr": 0.6464987397193909, "loss_mode_switch": 0.0, "loss_total": 0.2795429527759552, "step": 467 }, { "batch_size": 1, "epoch": 0.1868, "step": 467, "tokens_per_device": 5422 }, { "epoch": 0.1868, "loss_ce": 0.2317424714565277, "loss_lvr": 0.42393454909324646, "loss_mode_switch": 0.0, "loss_total": 0.27413591742515564, "step": 467 }, { "batch_size": 1, "epoch": 0.1868, "step": 467, "tokens_per_device": 4937 }, { "epoch": 0.1868, "loss_ce": 0.10115199536085129, "loss_lvr": 0.5163394212722778, "loss_mode_switch": 0.0, "loss_total": 0.15278594195842743, "step": 467 }, { "batch_size": 4, "epoch": 0.1868, "step": 467, "tokens_per_device": 5160 }, { "epoch": 0.1868, "loss_ce": 0.04851944372057915, "loss_lvr": 0.8788269758224487, "loss_mode_switch": 0.0, "loss_total": 0.13640214502811432, "step": 467 }, { "batch_size": 1, "epoch": 0.1868, "step": 467, "tokens_per_device": 4896 }, { "epoch": 0.1868, "loss_ce": 0.08789705485105515, "loss_lvr": 0.3516390323638916, "loss_mode_switch": 0.0, "loss_total": 0.12306095659732819, "step": 467 }, { "batch_size": 4, "epoch": 0.1868, "step": 467, "tokens_per_device": 4792 }, { "epoch": 0.1868, "loss_ce": 0.6148492097854614, "loss_lvr": 0.9405359625816345, "loss_mode_switch": 0.0, "loss_total": 0.7089028358459473, "step": 467 }, { "epoch": 0.1872, "grad_norm": 1.4964592456817627, "learning_rate": 9.365838766431487e-06, "loss": 0.3218, "step": 468 }, { "batch_size": 4, "epoch": 0.1872, "step": 468, "tokens_per_device": 3328 }, { "epoch": 0.1872, "loss_ce": 0.3907667398452759, "loss_lvr": 0.9935700297355652, "loss_mode_switch": 0.0, "loss_total": 0.4901237487792969, "step": 468 }, { "batch_size": 1, "epoch": 0.1872, "step": 468, "tokens_per_device": 4864 }, { "epoch": 0.1872, "loss_ce": 0.014963626861572266, "loss_lvr": 0.5341507792472839, "loss_mode_switch": 0.0, "loss_total": 0.06837870180606842, "step": 468 }, { "batch_size": 4, "epoch": 0.1872, "step": 468, "tokens_per_device": 1672 }, { "epoch": 0.1872, "loss_ce": 0.40569034218788147, "loss_lvr": 1.3642096519470215, "loss_mode_switch": 0.0, "loss_total": 0.5421112775802612, "step": 468 }, { "batch_size": 4, "epoch": 0.1872, "step": 468, "tokens_per_device": 1396 }, { "epoch": 0.1872, "loss_ce": 0.2736820578575134, "loss_lvr": 1.2485281229019165, "loss_mode_switch": 0.0, "loss_total": 0.3985348641872406, "step": 468 }, { "batch_size": 4, "epoch": 0.1872, "step": 468, "tokens_per_device": 3016 }, { "epoch": 0.1872, "loss_ce": 0.35741138458251953, "loss_lvr": 0.8800378441810608, "loss_mode_switch": 0.0, "loss_total": 0.4454151690006256, "step": 468 }, { "batch_size": 1, "epoch": 0.1872, "step": 468, "tokens_per_device": 5058 }, { "epoch": 0.1872, "loss_ce": 0.01871698722243309, "loss_lvr": 0.8766346573829651, "loss_mode_switch": 0.0, "loss_total": 0.10638044774532318, "step": 468 }, { "batch_size": 4, "epoch": 0.1872, "step": 468, "tokens_per_device": 1428 }, { "epoch": 0.1872, "loss_ce": 0.2866230905056, "loss_lvr": 1.1081732511520386, "loss_mode_switch": 0.0, "loss_total": 0.39744043350219727, "step": 468 }, { "batch_size": 1, "epoch": 0.1872, "step": 468, "tokens_per_device": 5165 }, { "epoch": 0.1872, "loss_ce": 0.022193286567926407, "loss_lvr": 0.5801011323928833, "loss_mode_switch": 0.0, "loss_total": 0.08020339906215668, "step": 468 }, { "epoch": 0.1876, "grad_norm": 1.841931700706482, "learning_rate": 9.36267783664389e-06, "loss": 0.3236, "step": 469 }, { "batch_size": 4, "epoch": 0.1876, "step": 469, "tokens_per_device": 1784 }, { "epoch": 0.1876, "loss_ce": 0.319509357213974, "loss_lvr": 0.9869359731674194, "loss_mode_switch": 0.0, "loss_total": 0.4182029664516449, "step": 469 }, { "batch_size": 4, "epoch": 0.1876, "step": 469, "tokens_per_device": 4356 }, { "epoch": 0.1876, "loss_ce": 0.7169682383537292, "loss_lvr": 0.9299700856208801, "loss_mode_switch": 0.0, "loss_total": 0.8099652528762817, "step": 469 }, { "batch_size": 4, "epoch": 0.1876, "step": 469, "tokens_per_device": 3788 }, { "epoch": 0.1876, "loss_ce": 0.3916868567466736, "loss_lvr": 1.0761187076568604, "loss_mode_switch": 0.0, "loss_total": 0.49929872155189514, "step": 469 }, { "batch_size": 1, "epoch": 0.1876, "step": 469, "tokens_per_device": 4859 }, { "epoch": 0.1876, "loss_ce": 0.12549719214439392, "loss_lvr": 0.7378957271575928, "loss_mode_switch": 0.0, "loss_total": 0.19928675889968872, "step": 469 }, { "batch_size": 1, "epoch": 0.1876, "step": 469, "tokens_per_device": 5115 }, { "epoch": 0.1876, "loss_ce": 0.0013666447484865785, "loss_lvr": 0.417510449886322, "loss_mode_switch": 0.0, "loss_total": 0.043117690831422806, "step": 469 }, { "batch_size": 4, "epoch": 0.1876, "step": 469, "tokens_per_device": 3788 }, { "epoch": 0.1876, "loss_ce": 0.18594852089881897, "loss_lvr": 1.0204472541809082, "loss_mode_switch": 0.0, "loss_total": 0.28799325227737427, "step": 469 }, { "batch_size": 1, "epoch": 0.1876, "step": 469, "tokens_per_device": 5681 }, { "epoch": 0.1876, "loss_ce": 0.21562135219573975, "loss_lvr": 0.47826072573661804, "loss_mode_switch": 0.0, "loss_total": 0.26344743371009827, "step": 469 }, { "batch_size": 4, "epoch": 0.1876, "step": 469, "tokens_per_device": 1572 }, { "epoch": 0.1876, "loss_ce": 0.5965110063552856, "loss_lvr": 1.338761806488037, "loss_mode_switch": 0.0, "loss_total": 0.7303872108459473, "step": 469 }, { "epoch": 0.188, "grad_norm": 1.556510329246521, "learning_rate": 9.359509584862735e-06, "loss": 0.3234, "step": 470 }, { "batch_size": 4, "epoch": 0.188, "step": 470, "tokens_per_device": 4272 }, { "epoch": 0.188, "loss_ce": 0.1870216578245163, "loss_lvr": 1.212239146232605, "loss_mode_switch": 0.0, "loss_total": 0.30824556946754456, "step": 470 }, { "batch_size": 1, "epoch": 0.188, "step": 470, "tokens_per_device": 4907 }, { "epoch": 0.188, "loss_ce": 0.00774972653016448, "loss_lvr": 0.9226264357566833, "loss_mode_switch": 0.0, "loss_total": 0.10001237690448761, "step": 470 }, { "batch_size": 1, "epoch": 0.188, "step": 470, "tokens_per_device": 5052 }, { "epoch": 0.188, "loss_ce": 0.20027688145637512, "loss_lvr": 0.5110443234443665, "loss_mode_switch": 0.0, "loss_total": 0.2513813078403473, "step": 470 }, { "batch_size": 1, "epoch": 0.188, "step": 470, "tokens_per_device": 4975 }, { "epoch": 0.188, "loss_ce": 0.5104996562004089, "loss_lvr": 0.6112982034683228, "loss_mode_switch": 0.0, "loss_total": 0.5716294646263123, "step": 470 }, { "batch_size": 4, "epoch": 0.188, "step": 470, "tokens_per_device": 8732 }, { "epoch": 0.188, "loss_ce": 0.060601916164159775, "loss_lvr": 0.8972070813179016, "loss_mode_switch": 0.0, "loss_total": 0.15032263100147247, "step": 470 }, { "batch_size": 4, "epoch": 0.188, "step": 470, "tokens_per_device": 4576 }, { "epoch": 0.188, "loss_ce": 0.03220300376415253, "loss_lvr": 1.0357658863067627, "loss_mode_switch": 0.0, "loss_total": 0.13577958941459656, "step": 470 }, { "batch_size": 4, "epoch": 0.188, "step": 470, "tokens_per_device": 4264 }, { "epoch": 0.188, "loss_ce": 0.2261219322681427, "loss_lvr": 1.2450846433639526, "loss_mode_switch": 0.0, "loss_total": 0.35063040256500244, "step": 470 }, { "batch_size": 4, "epoch": 0.188, "step": 470, "tokens_per_device": 2700 }, { "epoch": 0.188, "loss_ce": 0.24860011041164398, "loss_lvr": 0.8849994540214539, "loss_mode_switch": 0.0, "loss_total": 0.3371000587940216, "step": 470 }, { "epoch": 0.1884, "grad_norm": 1.7164087295532227, "learning_rate": 9.356334016405383e-06, "loss": 0.3071, "step": 471 }, { "batch_size": 4, "epoch": 0.1884, "step": 471, "tokens_per_device": 4268 }, { "epoch": 0.1884, "loss_ce": 0.3046410381793976, "loss_lvr": 1.077262043952942, "loss_mode_switch": 0.0, "loss_total": 0.4123672544956207, "step": 471 }, { "batch_size": 4, "epoch": 0.1884, "step": 471, "tokens_per_device": 3816 }, { "epoch": 0.1884, "loss_ce": 0.1958855241537094, "loss_lvr": 1.101920247077942, "loss_mode_switch": 0.0, "loss_total": 0.3060775399208069, "step": 471 }, { "batch_size": 4, "epoch": 0.1884, "step": 471, "tokens_per_device": 2372 }, { "epoch": 0.1884, "loss_ce": 0.541609525680542, "loss_lvr": 0.8349670171737671, "loss_mode_switch": 0.0, "loss_total": 0.6251062154769897, "step": 471 }, { "batch_size": 4, "epoch": 0.1884, "step": 471, "tokens_per_device": 7404 }, { "epoch": 0.1884, "loss_ce": 0.0723293200135231, "loss_lvr": 1.1803843975067139, "loss_mode_switch": 0.0, "loss_total": 0.19036775827407837, "step": 471 }, { "batch_size": 1, "epoch": 0.1884, "step": 471, "tokens_per_device": 4943 }, { "epoch": 0.1884, "loss_ce": 0.027659360319375992, "loss_lvr": 0.6005803942680359, "loss_mode_switch": 0.0, "loss_total": 0.08771739900112152, "step": 471 }, { "batch_size": 1, "epoch": 0.1884, "step": 471, "tokens_per_device": 5149 }, { "epoch": 0.1884, "loss_ce": 0.05238794535398483, "loss_lvr": 0.517634391784668, "loss_mode_switch": 0.0, "loss_total": 0.10415138304233551, "step": 471 }, { "batch_size": 4, "epoch": 0.1884, "step": 471, "tokens_per_device": 4716 }, { "epoch": 0.1884, "loss_ce": 0.23716668784618378, "loss_lvr": 1.0648860931396484, "loss_mode_switch": 0.0, "loss_total": 0.3436552882194519, "step": 471 }, { "batch_size": 1, "epoch": 0.1884, "step": 471, "tokens_per_device": 5400 }, { "epoch": 0.1884, "loss_ce": 0.445515513420105, "loss_lvr": 0.802017331123352, "loss_mode_switch": 0.0, "loss_total": 0.5257172584533691, "step": 471 }, { "epoch": 0.1888, "grad_norm": 1.4328755140304565, "learning_rate": 9.35315113660147e-06, "loss": 0.2911, "step": 472 }, { "batch_size": 4, "epoch": 0.1888, "step": 472, "tokens_per_device": 4240 }, { "epoch": 0.1888, "loss_ce": 0.6239727735519409, "loss_lvr": 1.0365378856658936, "loss_mode_switch": 0.0, "loss_total": 0.7276265621185303, "step": 472 }, { "batch_size": 1, "epoch": 0.1888, "step": 472, "tokens_per_device": 5501 }, { "epoch": 0.1888, "loss_ce": 0.025812242180109024, "loss_lvr": 0.6346033811569214, "loss_mode_switch": 0.0, "loss_total": 0.08927258849143982, "step": 472 }, { "batch_size": 1, "epoch": 0.1888, "step": 472, "tokens_per_device": 5030 }, { "epoch": 0.1888, "loss_ce": 0.020183062180876732, "loss_lvr": 0.7827094197273254, "loss_mode_switch": 0.0, "loss_total": 0.09845400601625443, "step": 472 }, { "batch_size": 1, "epoch": 0.1888, "step": 472, "tokens_per_device": 4745 }, { "epoch": 0.1888, "loss_ce": 0.010835446417331696, "loss_lvr": 0.5714994668960571, "loss_mode_switch": 0.0, "loss_total": 0.06798539310693741, "step": 472 }, { "batch_size": 4, "epoch": 0.1888, "step": 472, "tokens_per_device": 4188 }, { "epoch": 0.1888, "loss_ce": 0.3376808762550354, "loss_lvr": 1.037366271018982, "loss_mode_switch": 0.0, "loss_total": 0.44141751527786255, "step": 472 }, { "batch_size": 4, "epoch": 0.1888, "step": 472, "tokens_per_device": 5616 }, { "epoch": 0.1888, "loss_ce": 0.1235622763633728, "loss_lvr": 0.8731755614280701, "loss_mode_switch": 0.0, "loss_total": 0.2108798325061798, "step": 472 }, { "batch_size": 4, "epoch": 0.1888, "step": 472, "tokens_per_device": 5208 }, { "epoch": 0.1888, "loss_ce": 0.24269743263721466, "loss_lvr": 0.9916858077049255, "loss_mode_switch": 0.0, "loss_total": 0.34186601638793945, "step": 472 }, { "batch_size": 4, "epoch": 0.1888, "step": 472, "tokens_per_device": 4236 }, { "epoch": 0.1888, "loss_ce": 0.2484031766653061, "loss_lvr": 0.924422562122345, "loss_mode_switch": 0.0, "loss_total": 0.3408454358577728, "step": 472 }, { "epoch": 0.1892, "grad_norm": 1.192412257194519, "learning_rate": 9.349960950792907e-06, "loss": 0.2956, "step": 473 }, { "batch_size": 4, "epoch": 0.1892, "step": 473, "tokens_per_device": 5684 }, { "epoch": 0.1892, "loss_ce": 0.02354934997856617, "loss_lvr": 0.9795121550559998, "loss_mode_switch": 0.0, "loss_total": 0.12150056660175323, "step": 473 }, { "batch_size": 4, "epoch": 0.1892, "step": 473, "tokens_per_device": 1652 }, { "epoch": 0.1892, "loss_ce": 0.5864518284797668, "loss_lvr": 0.9034629464149475, "loss_mode_switch": 0.0, "loss_total": 0.6767981052398682, "step": 473 }, { "batch_size": 4, "epoch": 0.1892, "step": 473, "tokens_per_device": 4364 }, { "epoch": 0.1892, "loss_ce": 0.3512575328350067, "loss_lvr": 1.1588143110275269, "loss_mode_switch": 0.0, "loss_total": 0.46713897585868835, "step": 473 }, { "batch_size": 1, "epoch": 0.1892, "step": 473, "tokens_per_device": 6342 }, { "epoch": 0.1892, "loss_ce": 0.13848118484020233, "loss_lvr": 0.5856906175613403, "loss_mode_switch": 0.0, "loss_total": 0.19705024361610413, "step": 473 }, { "batch_size": 4, "epoch": 0.1892, "step": 473, "tokens_per_device": 2612 }, { "epoch": 0.1892, "loss_ce": 0.10691551864147186, "loss_lvr": 1.0368704795837402, "loss_mode_switch": 0.0, "loss_total": 0.2106025665998459, "step": 473 }, { "batch_size": 4, "epoch": 0.1892, "step": 473, "tokens_per_device": 5672 }, { "epoch": 0.1892, "loss_ce": 0.42984622716903687, "loss_lvr": 1.344429850578308, "loss_mode_switch": 0.0, "loss_total": 0.5642892122268677, "step": 473 }, { "batch_size": 4, "epoch": 0.1892, "step": 473, "tokens_per_device": 11728 }, { "epoch": 0.1892, "loss_ce": 0.056817442178726196, "loss_lvr": 1.1971086263656616, "loss_mode_switch": 0.0, "loss_total": 0.17652830481529236, "step": 473 }, { "batch_size": 4, "epoch": 0.1892, "step": 473, "tokens_per_device": 4236 }, { "epoch": 0.1892, "loss_ce": 0.30486202239990234, "loss_lvr": 1.0834782123565674, "loss_mode_switch": 0.0, "loss_total": 0.41320985555648804, "step": 473 }, { "epoch": 0.1896, "grad_norm": 1.4719058275222778, "learning_rate": 9.346763464333862e-06, "loss": 0.3689, "step": 474 }, { "batch_size": 4, "epoch": 0.1896, "step": 474, "tokens_per_device": 5948 }, { "epoch": 0.1896, "loss_ce": 0.12659472227096558, "loss_lvr": 0.9935351610183716, "loss_mode_switch": 0.0, "loss_total": 0.2259482443332672, "step": 474 }, { "batch_size": 4, "epoch": 0.1896, "step": 474, "tokens_per_device": 3880 }, { "epoch": 0.1896, "loss_ce": 0.8511039614677429, "loss_lvr": 1.4320775270462036, "loss_mode_switch": 0.0, "loss_total": 0.9943116903305054, "step": 474 }, { "batch_size": 4, "epoch": 0.1896, "step": 474, "tokens_per_device": 1384 }, { "epoch": 0.1896, "loss_ce": 0.627532422542572, "loss_lvr": 1.1466760635375977, "loss_mode_switch": 0.0, "loss_total": 0.7422000169754028, "step": 474 }, { "batch_size": 4, "epoch": 0.1896, "step": 474, "tokens_per_device": 3956 }, { "epoch": 0.1896, "loss_ce": 0.19070829451084137, "loss_lvr": 0.8426620364189148, "loss_mode_switch": 0.0, "loss_total": 0.2749744951725006, "step": 474 }, { "batch_size": 4, "epoch": 0.1896, "step": 474, "tokens_per_device": 5704 }, { "epoch": 0.1896, "loss_ce": 0.05029687657952309, "loss_lvr": 0.9848834276199341, "loss_mode_switch": 0.0, "loss_total": 0.14878521859645844, "step": 474 }, { "batch_size": 1, "epoch": 0.1896, "step": 474, "tokens_per_device": 4923 }, { "epoch": 0.1896, "loss_ce": 0.009630865417420864, "loss_lvr": 0.3535047769546509, "loss_mode_switch": 0.0, "loss_total": 0.044981345534324646, "step": 474 }, { "batch_size": 4, "epoch": 0.1896, "step": 474, "tokens_per_device": 4200 }, { "epoch": 0.1896, "loss_ce": 0.169197678565979, "loss_lvr": 1.1661865711212158, "loss_mode_switch": 0.0, "loss_total": 0.28581634163856506, "step": 474 }, { "batch_size": 4, "epoch": 0.1896, "step": 474, "tokens_per_device": 4304 }, { "epoch": 0.1896, "loss_ce": 0.5929499268531799, "loss_lvr": 1.0755596160888672, "loss_mode_switch": 0.0, "loss_total": 0.7005059123039246, "step": 474 }, { "epoch": 0.19, "grad_norm": 1.5703176259994507, "learning_rate": 9.343558682590757e-06, "loss": 0.3669, "step": 475 }, { "batch_size": 4, "epoch": 0.19, "step": 475, "tokens_per_device": 5356 }, { "epoch": 0.19, "loss_ce": 0.23207199573516846, "loss_lvr": 0.896365761756897, "loss_mode_switch": 0.0, "loss_total": 0.3217085599899292, "step": 475 }, { "batch_size": 4, "epoch": 0.19, "step": 475, "tokens_per_device": 3776 }, { "epoch": 0.19, "loss_ce": 0.2630302608013153, "loss_lvr": 0.6790158152580261, "loss_mode_switch": 0.0, "loss_total": 0.3309318423271179, "step": 475 }, { "batch_size": 4, "epoch": 0.19, "step": 475, "tokens_per_device": 7316 }, { "epoch": 0.19, "loss_ce": 0.09007083624601364, "loss_lvr": 0.7617708444595337, "loss_mode_switch": 0.0, "loss_total": 0.1662479192018509, "step": 475 }, { "batch_size": 4, "epoch": 0.19, "step": 475, "tokens_per_device": 1552 }, { "epoch": 0.19, "loss_ce": 0.4703512489795685, "loss_lvr": 1.4889769554138184, "loss_mode_switch": 0.0, "loss_total": 0.6192489266395569, "step": 475 }, { "batch_size": 4, "epoch": 0.19, "step": 475, "tokens_per_device": 4572 }, { "epoch": 0.19, "loss_ce": 0.3224993348121643, "loss_lvr": 0.896543562412262, "loss_mode_switch": 0.0, "loss_total": 0.4121536910533905, "step": 475 }, { "batch_size": 4, "epoch": 0.19, "step": 475, "tokens_per_device": 5784 }, { "epoch": 0.19, "loss_ce": 0.2999955117702484, "loss_lvr": 0.9102662801742554, "loss_mode_switch": 0.0, "loss_total": 0.3910221457481384, "step": 475 }, { "batch_size": 4, "epoch": 0.19, "step": 475, "tokens_per_device": 11304 }, { "epoch": 0.19, "loss_ce": 0.14042288064956665, "loss_lvr": 1.148107647895813, "loss_mode_switch": 0.0, "loss_total": 0.25523364543914795, "step": 475 }, { "batch_size": 4, "epoch": 0.19, "step": 475, "tokens_per_device": 4252 }, { "epoch": 0.19, "loss_ce": 0.1318122297525406, "loss_lvr": 1.2329938411712646, "loss_mode_switch": 0.0, "loss_total": 0.25511160492897034, "step": 475 }, { "epoch": 0.1904, "grad_norm": 1.2755167484283447, "learning_rate": 9.34034661094226e-06, "loss": 0.3179, "step": 476 }, { "batch_size": 4, "epoch": 0.1904, "step": 476, "tokens_per_device": 2568 }, { "epoch": 0.1904, "loss_ce": 0.6173824667930603, "loss_lvr": 1.1536344289779663, "loss_mode_switch": 0.0, "loss_total": 0.732745885848999, "step": 476 }, { "batch_size": 1, "epoch": 0.1904, "step": 476, "tokens_per_device": 4814 }, { "epoch": 0.1904, "loss_ce": 0.13962319493293762, "loss_lvr": 0.41277366876602173, "loss_mode_switch": 0.0, "loss_total": 0.18090055882930756, "step": 476 }, { "batch_size": 4, "epoch": 0.1904, "step": 476, "tokens_per_device": 1712 }, { "epoch": 0.1904, "loss_ce": 0.2917932868003845, "loss_lvr": 0.9062068462371826, "loss_mode_switch": 0.0, "loss_total": 0.38241398334503174, "step": 476 }, { "batch_size": 4, "epoch": 0.1904, "step": 476, "tokens_per_device": 3796 }, { "epoch": 0.1904, "loss_ce": 0.4770483672618866, "loss_lvr": 1.1528531312942505, "loss_mode_switch": 0.0, "loss_total": 0.5923336744308472, "step": 476 }, { "batch_size": 4, "epoch": 0.1904, "step": 476, "tokens_per_device": 3800 }, { "epoch": 0.1904, "loss_ce": 0.3679077923297882, "loss_lvr": 0.8505521416664124, "loss_mode_switch": 0.0, "loss_total": 0.4529629945755005, "step": 476 }, { "batch_size": 1, "epoch": 0.1904, "step": 476, "tokens_per_device": 5151 }, { "epoch": 0.1904, "loss_ce": 0.014792714267969131, "loss_lvr": 0.6112071871757507, "loss_mode_switch": 0.0, "loss_total": 0.0759134292602539, "step": 476 }, { "batch_size": 1, "epoch": 0.1904, "step": 476, "tokens_per_device": 5107 }, { "epoch": 0.1904, "loss_ce": 0.011219956912100315, "loss_lvr": 0.7527199983596802, "loss_mode_switch": 0.0, "loss_total": 0.08649195730686188, "step": 476 }, { "batch_size": 1, "epoch": 0.1904, "step": 476, "tokens_per_device": 4649 }, { "epoch": 0.1904, "loss_ce": 0.037297558039426804, "loss_lvr": 0.8805376291275024, "loss_mode_switch": 0.0, "loss_total": 0.12535132467746735, "step": 476 }, { "epoch": 0.1908, "grad_norm": 1.537852168083191, "learning_rate": 9.337127254779272e-06, "loss": 0.3695, "step": 477 }, { "batch_size": 4, "epoch": 0.1908, "step": 477, "tokens_per_device": 4384 }, { "epoch": 0.1908, "loss_ce": 0.0925072655081749, "loss_lvr": 0.9794943332672119, "loss_mode_switch": 0.0, "loss_total": 0.19045670330524445, "step": 477 }, { "batch_size": 4, "epoch": 0.1908, "step": 477, "tokens_per_device": 4652 }, { "epoch": 0.1908, "loss_ce": 0.2229308933019638, "loss_lvr": 1.1000365018844604, "loss_mode_switch": 0.0, "loss_total": 0.33293455839157104, "step": 477 }, { "batch_size": 4, "epoch": 0.1908, "step": 477, "tokens_per_device": 6536 }, { "epoch": 0.1908, "loss_ce": 0.07892882078886032, "loss_lvr": 0.8565843105316162, "loss_mode_switch": 0.0, "loss_total": 0.16458725929260254, "step": 477 }, { "batch_size": 4, "epoch": 0.1908, "step": 477, "tokens_per_device": 4080 }, { "epoch": 0.1908, "loss_ce": 0.156234011054039, "loss_lvr": 0.971684992313385, "loss_mode_switch": 0.0, "loss_total": 0.2534025013446808, "step": 477 }, { "batch_size": 4, "epoch": 0.1908, "step": 477, "tokens_per_device": 15728 }, { "epoch": 0.1908, "loss_ce": 0.3891116976737976, "loss_lvr": 1.1044697761535645, "loss_mode_switch": 0.0, "loss_total": 0.499558687210083, "step": 477 }, { "batch_size": 4, "epoch": 0.1908, "step": 477, "tokens_per_device": 3228 }, { "epoch": 0.1908, "loss_ce": 0.04307990148663521, "loss_lvr": 0.8246678709983826, "loss_mode_switch": 0.0, "loss_total": 0.12554669380187988, "step": 477 }, { "batch_size": 4, "epoch": 0.1908, "step": 477, "tokens_per_device": 3944 }, { "epoch": 0.1908, "loss_ce": 0.06103084608912468, "loss_lvr": 0.9614593386650085, "loss_mode_switch": 0.0, "loss_total": 0.15717677772045135, "step": 477 }, { "batch_size": 4, "epoch": 0.1908, "step": 477, "tokens_per_device": 4332 }, { "epoch": 0.1908, "loss_ce": 0.1729992926120758, "loss_lvr": 1.0296791791915894, "loss_mode_switch": 0.0, "loss_total": 0.27596721053123474, "step": 477 }, { "epoch": 0.1912, "grad_norm": 1.314965844154358, "learning_rate": 9.333900619504923e-06, "loss": 0.3219, "step": 478 }, { "batch_size": 1, "epoch": 0.1912, "step": 478, "tokens_per_device": 4878 }, { "epoch": 0.1912, "loss_ce": 0.0020742935594171286, "loss_lvr": 0.4493584930896759, "loss_mode_switch": 0.0, "loss_total": 0.0470101423561573, "step": 478 }, { "batch_size": 1, "epoch": 0.1912, "step": 478, "tokens_per_device": 5103 }, { "epoch": 0.1912, "loss_ce": 0.023730086162686348, "loss_lvr": 0.6946859359741211, "loss_mode_switch": 0.0, "loss_total": 0.09319867938756943, "step": 478 }, { "batch_size": 4, "epoch": 0.1912, "step": 478, "tokens_per_device": 7652 }, { "epoch": 0.1912, "loss_ce": 0.4409390687942505, "loss_lvr": 0.861089825630188, "loss_mode_switch": 0.0, "loss_total": 0.5270480513572693, "step": 478 }, { "batch_size": 4, "epoch": 0.1912, "step": 478, "tokens_per_device": 4224 }, { "epoch": 0.1912, "loss_ce": 0.4501076340675354, "loss_lvr": 0.967681884765625, "loss_mode_switch": 0.0, "loss_total": 0.5468758344650269, "step": 478 }, { "batch_size": 1, "epoch": 0.1912, "step": 478, "tokens_per_device": 4793 }, { "epoch": 0.1912, "loss_ce": 0.23784424364566803, "loss_lvr": 0.5326165556907654, "loss_mode_switch": 0.0, "loss_total": 0.29110589623451233, "step": 478 }, { "batch_size": 4, "epoch": 0.1912, "step": 478, "tokens_per_device": 6056 }, { "epoch": 0.1912, "loss_ce": 0.22255265712738037, "loss_lvr": 1.471301555633545, "loss_mode_switch": 0.0, "loss_total": 0.36968281865119934, "step": 478 }, { "batch_size": 4, "epoch": 0.1912, "step": 478, "tokens_per_device": 1156 }, { "epoch": 0.1912, "loss_ce": 0.5740344524383545, "loss_lvr": 1.0698182582855225, "loss_mode_switch": 0.0, "loss_total": 0.6810162663459778, "step": 478 }, { "batch_size": 4, "epoch": 0.1912, "step": 478, "tokens_per_device": 2980 }, { "epoch": 0.1912, "loss_ce": 0.20290620625019073, "loss_lvr": 0.664149820804596, "loss_mode_switch": 0.0, "loss_total": 0.2693212032318115, "step": 478 }, { "epoch": 0.1916, "grad_norm": 1.6037378311157227, "learning_rate": 9.330666710534556e-06, "loss": 0.3545, "step": 479 }, { "batch_size": 4, "epoch": 0.1916, "step": 479, "tokens_per_device": 4272 }, { "epoch": 0.1916, "loss_ce": 0.06413079053163528, "loss_lvr": 1.048990249633789, "loss_mode_switch": 0.0, "loss_total": 0.1690298169851303, "step": 479 }, { "batch_size": 4, "epoch": 0.1916, "step": 479, "tokens_per_device": 2588 }, { "epoch": 0.1916, "loss_ce": 0.403852254152298, "loss_lvr": 0.9739813208580017, "loss_mode_switch": 0.0, "loss_total": 0.5012503862380981, "step": 479 }, { "batch_size": 4, "epoch": 0.1916, "step": 479, "tokens_per_device": 3512 }, { "epoch": 0.1916, "loss_ce": 0.12357520312070847, "loss_lvr": 1.0712628364562988, "loss_mode_switch": 0.0, "loss_total": 0.2307014912366867, "step": 479 }, { "batch_size": 4, "epoch": 0.1916, "step": 479, "tokens_per_device": 1948 }, { "epoch": 0.1916, "loss_ce": 0.14344379305839539, "loss_lvr": 0.9638646841049194, "loss_mode_switch": 0.0, "loss_total": 0.23983025550842285, "step": 479 }, { "batch_size": 4, "epoch": 0.1916, "step": 479, "tokens_per_device": 4588 }, { "epoch": 0.1916, "loss_ce": 0.029847001656889915, "loss_lvr": 0.8590332865715027, "loss_mode_switch": 0.0, "loss_total": 0.11575033515691757, "step": 479 }, { "batch_size": 4, "epoch": 0.1916, "step": 479, "tokens_per_device": 1948 }, { "epoch": 0.1916, "loss_ce": 0.5964963436126709, "loss_lvr": 1.123897910118103, "loss_mode_switch": 0.0, "loss_total": 0.7088861465454102, "step": 479 }, { "batch_size": 1, "epoch": 0.1916, "step": 479, "tokens_per_device": 5094 }, { "epoch": 0.1916, "loss_ce": 0.0007253738003782928, "loss_lvr": 0.3414802551269531, "loss_mode_switch": 0.0, "loss_total": 0.03487339988350868, "step": 479 }, { "batch_size": 4, "epoch": 0.1916, "step": 479, "tokens_per_device": 3104 }, { "epoch": 0.1916, "loss_ce": 0.01634758524596691, "loss_lvr": 0.9260905385017395, "loss_mode_switch": 0.0, "loss_total": 0.10895664244890213, "step": 479 }, { "epoch": 0.192, "grad_norm": 1.4959179162979126, "learning_rate": 9.327425533295725e-06, "loss": 0.3307, "step": 480 }, { "batch_size": 4, "epoch": 0.192, "step": 480, "tokens_per_device": 4204 }, { "epoch": 0.192, "loss_ce": 0.02915813773870468, "loss_lvr": 1.4964090585708618, "loss_mode_switch": 0.0, "loss_total": 0.17879903316497803, "step": 480 }, { "batch_size": 4, "epoch": 0.192, "step": 480, "tokens_per_device": 2676 }, { "epoch": 0.192, "loss_ce": 0.07206740975379944, "loss_lvr": 1.4254562854766846, "loss_mode_switch": 0.0, "loss_total": 0.21461303532123566, "step": 480 }, { "batch_size": 4, "epoch": 0.192, "step": 480, "tokens_per_device": 2612 }, { "epoch": 0.192, "loss_ce": 0.2955671548843384, "loss_lvr": 0.9258042573928833, "loss_mode_switch": 0.0, "loss_total": 0.38814759254455566, "step": 480 }, { "batch_size": 4, "epoch": 0.192, "step": 480, "tokens_per_device": 4288 }, { "epoch": 0.192, "loss_ce": 0.8623311519622803, "loss_lvr": 1.2384870052337646, "loss_mode_switch": 0.0, "loss_total": 0.9861798286437988, "step": 480 }, { "batch_size": 4, "epoch": 0.192, "step": 480, "tokens_per_device": 4668 }, { "epoch": 0.192, "loss_ce": 0.06626172363758087, "loss_lvr": 0.961135983467102, "loss_mode_switch": 0.0, "loss_total": 0.1623753309249878, "step": 480 }, { "batch_size": 1, "epoch": 0.192, "step": 480, "tokens_per_device": 4914 }, { "epoch": 0.192, "loss_ce": 0.025248147547245026, "loss_lvr": 0.7617726922035217, "loss_mode_switch": 0.0, "loss_total": 0.1014254167675972, "step": 480 }, { "batch_size": 4, "epoch": 0.192, "step": 480, "tokens_per_device": 4768 }, { "epoch": 0.192, "loss_ce": 0.023144029080867767, "loss_lvr": 0.9364861249923706, "loss_mode_switch": 0.0, "loss_total": 0.11679264158010483, "step": 480 }, { "batch_size": 1, "epoch": 0.192, "step": 480, "tokens_per_device": 4894 }, { "epoch": 0.192, "loss_ce": 0.34779924154281616, "loss_lvr": 0.3592032790184021, "loss_mode_switch": 0.0, "loss_total": 0.3837195634841919, "step": 480 }, { "epoch": 0.1924, "grad_norm": 1.6283475160598755, "learning_rate": 9.324177093228175e-06, "loss": 0.3248, "step": 481 }, { "batch_size": 1, "epoch": 0.1924, "step": 481, "tokens_per_device": 4111 }, { "epoch": 0.1924, "loss_ce": 0.02528844028711319, "loss_lvr": 0.4255693852901459, "loss_mode_switch": 0.0, "loss_total": 0.06784537434577942, "step": 481 }, { "batch_size": 4, "epoch": 0.1924, "step": 481, "tokens_per_device": 9968 }, { "epoch": 0.1924, "loss_ce": 0.026890411972999573, "loss_lvr": 0.9989235401153564, "loss_mode_switch": 0.0, "loss_total": 0.12678277492523193, "step": 481 }, { "batch_size": 1, "epoch": 0.1924, "step": 481, "tokens_per_device": 5147 }, { "epoch": 0.1924, "loss_ce": 0.011947336606681347, "loss_lvr": 0.3449840843677521, "loss_mode_switch": 0.0, "loss_total": 0.04644574597477913, "step": 481 }, { "batch_size": 4, "epoch": 0.1924, "step": 481, "tokens_per_device": 4284 }, { "epoch": 0.1924, "loss_ce": 0.11029406636953354, "loss_lvr": 1.042938232421875, "loss_mode_switch": 0.0, "loss_total": 0.21458789706230164, "step": 481 }, { "batch_size": 1, "epoch": 0.1924, "step": 481, "tokens_per_device": 4875 }, { "epoch": 0.1924, "loss_ce": 0.010462055914103985, "loss_lvr": 0.3747619688510895, "loss_mode_switch": 0.0, "loss_total": 0.04793825373053551, "step": 481 }, { "batch_size": 4, "epoch": 0.1924, "step": 481, "tokens_per_device": 9692 }, { "epoch": 0.1924, "loss_ce": 0.13094469904899597, "loss_lvr": 0.8433628082275391, "loss_mode_switch": 0.0, "loss_total": 0.21528097987174988, "step": 481 }, { "batch_size": 4, "epoch": 0.1924, "step": 481, "tokens_per_device": 7628 }, { "epoch": 0.1924, "loss_ce": 0.44208186864852905, "loss_lvr": 1.1555551290512085, "loss_mode_switch": 0.0, "loss_total": 0.5576373934745789, "step": 481 }, { "batch_size": 4, "epoch": 0.1924, "step": 481, "tokens_per_device": 4208 }, { "epoch": 0.1924, "loss_ce": 0.5115291476249695, "loss_lvr": 1.1982553005218506, "loss_mode_switch": 0.0, "loss_total": 0.6313546895980835, "step": 481 }, { "epoch": 0.1928, "grad_norm": 1.3746753931045532, "learning_rate": 9.32092139578385e-06, "loss": 0.3297, "step": 482 }, { "batch_size": 4, "epoch": 0.1928, "step": 482, "tokens_per_device": 6372 }, { "epoch": 0.1928, "loss_ce": 0.03716600313782692, "loss_lvr": 0.7682949900627136, "loss_mode_switch": 0.0, "loss_total": 0.1139955073595047, "step": 482 }, { "batch_size": 4, "epoch": 0.1928, "step": 482, "tokens_per_device": 4228 }, { "epoch": 0.1928, "loss_ce": 0.3826853930950165, "loss_lvr": 1.782652497291565, "loss_mode_switch": 0.0, "loss_total": 0.5609506368637085, "step": 482 }, { "batch_size": 4, "epoch": 0.1928, "step": 482, "tokens_per_device": 3740 }, { "epoch": 0.1928, "loss_ce": 0.21000362932682037, "loss_lvr": 1.137613296508789, "loss_mode_switch": 0.0, "loss_total": 0.32376495003700256, "step": 482 }, { "batch_size": 1, "epoch": 0.1928, "step": 482, "tokens_per_device": 5131 }, { "epoch": 0.1928, "loss_ce": 0.15975745022296906, "loss_lvr": 0.2869243025779724, "loss_mode_switch": 0.0, "loss_total": 0.18844987452030182, "step": 482 }, { "batch_size": 4, "epoch": 0.1928, "step": 482, "tokens_per_device": 4224 }, { "epoch": 0.1928, "loss_ce": 0.47787797451019287, "loss_lvr": 1.074817419052124, "loss_mode_switch": 0.0, "loss_total": 0.5853596925735474, "step": 482 }, { "batch_size": 4, "epoch": 0.1928, "step": 482, "tokens_per_device": 4276 }, { "epoch": 0.1928, "loss_ce": 0.1439712792634964, "loss_lvr": 0.8746753931045532, "loss_mode_switch": 0.0, "loss_total": 0.23143881559371948, "step": 482 }, { "batch_size": 4, "epoch": 0.1928, "step": 482, "tokens_per_device": 4200 }, { "epoch": 0.1928, "loss_ce": 0.6300095319747925, "loss_lvr": 1.0579711198806763, "loss_mode_switch": 0.0, "loss_total": 0.7358066439628601, "step": 482 }, { "batch_size": 4, "epoch": 0.1928, "step": 482, "tokens_per_device": 2664 }, { "epoch": 0.1928, "loss_ce": 0.21131519973278046, "loss_lvr": 0.8939554691314697, "loss_mode_switch": 0.0, "loss_total": 0.3007107377052307, "step": 482 }, { "epoch": 0.1932, "grad_norm": 1.25123929977417, "learning_rate": 9.31765844642687e-06, "loss": 0.3029, "step": 483 }, { "batch_size": 4, "epoch": 0.1932, "step": 483, "tokens_per_device": 13408 }, { "epoch": 0.1932, "loss_ce": 0.025055045261979103, "loss_lvr": 0.6709128022193909, "loss_mode_switch": 0.0, "loss_total": 0.09214632213115692, "step": 483 }, { "batch_size": 1, "epoch": 0.1932, "step": 483, "tokens_per_device": 4906 }, { "epoch": 0.1932, "loss_ce": 0.08750810474157333, "loss_lvr": 0.4529029428958893, "loss_mode_switch": 0.0, "loss_total": 0.13279840350151062, "step": 483 }, { "batch_size": 1, "epoch": 0.1932, "step": 483, "tokens_per_device": 5121 }, { "epoch": 0.1932, "loss_ce": 0.02327534183859825, "loss_lvr": 0.24573780596256256, "loss_mode_switch": 0.0, "loss_total": 0.04784912243485451, "step": 483 }, { "batch_size": 4, "epoch": 0.1932, "step": 483, "tokens_per_device": 5800 }, { "epoch": 0.1932, "loss_ce": 0.10086006671190262, "loss_lvr": 0.8917819857597351, "loss_mode_switch": 0.0, "loss_total": 0.19003826379776, "step": 483 }, { "batch_size": 4, "epoch": 0.1932, "step": 483, "tokens_per_device": 3464 }, { "epoch": 0.1932, "loss_ce": 0.27588051557540894, "loss_lvr": 0.9717848896980286, "loss_mode_switch": 0.0, "loss_total": 0.3730590045452118, "step": 483 }, { "batch_size": 4, "epoch": 0.1932, "step": 483, "tokens_per_device": 4632 }, { "epoch": 0.1932, "loss_ce": 0.382944256067276, "loss_lvr": 1.062790870666504, "loss_mode_switch": 0.0, "loss_total": 0.4892233610153198, "step": 483 }, { "batch_size": 4, "epoch": 0.1932, "step": 483, "tokens_per_device": 4032 }, { "epoch": 0.1932, "loss_ce": 0.5871662497520447, "loss_lvr": 1.053101897239685, "loss_mode_switch": 0.0, "loss_total": 0.6924764513969421, "step": 483 }, { "batch_size": 4, "epoch": 0.1932, "step": 483, "tokens_per_device": 8864 }, { "epoch": 0.1932, "loss_ce": 0.14786683022975922, "loss_lvr": 0.724046528339386, "loss_mode_switch": 0.0, "loss_total": 0.22027148306369781, "step": 483 }, { "epoch": 0.1936, "grad_norm": 1.619901180267334, "learning_rate": 9.314388250633526e-06, "loss": 0.3569, "step": 484 }, { "batch_size": 4, "epoch": 0.1936, "step": 484, "tokens_per_device": 9924 }, { "epoch": 0.1936, "loss_ce": 0.17308282852172852, "loss_lvr": 0.7208570241928101, "loss_mode_switch": 0.0, "loss_total": 0.245168536901474, "step": 484 }, { "batch_size": 4, "epoch": 0.1936, "step": 484, "tokens_per_device": 4260 }, { "epoch": 0.1936, "loss_ce": 0.22121082246303558, "loss_lvr": 1.0511741638183594, "loss_mode_switch": 0.0, "loss_total": 0.32632824778556824, "step": 484 }, { "batch_size": 1, "epoch": 0.1936, "step": 484, "tokens_per_device": 4903 }, { "epoch": 0.1936, "loss_ce": 0.01972566917538643, "loss_lvr": 0.5131701231002808, "loss_mode_switch": 0.0, "loss_total": 0.07104268670082092, "step": 484 }, { "batch_size": 1, "epoch": 0.1936, "step": 484, "tokens_per_device": 5191 }, { "epoch": 0.1936, "loss_ce": 0.007516895420849323, "loss_lvr": 0.6121128797531128, "loss_mode_switch": 0.0, "loss_total": 0.06872818619012833, "step": 484 }, { "batch_size": 1, "epoch": 0.1936, "step": 484, "tokens_per_device": 4860 }, { "epoch": 0.1936, "loss_ce": 0.006517557427287102, "loss_lvr": 0.3493943512439728, "loss_mode_switch": 0.0, "loss_total": 0.04145698994398117, "step": 484 }, { "batch_size": 1, "epoch": 0.1936, "step": 484, "tokens_per_device": 7202 }, { "epoch": 0.1936, "loss_ce": 0.3359822630882263, "loss_lvr": 0.4408681392669678, "loss_mode_switch": 0.0, "loss_total": 0.3800690770149231, "step": 484 }, { "batch_size": 1, "epoch": 0.1936, "step": 484, "tokens_per_device": 5178 }, { "epoch": 0.1936, "loss_ce": 0.010717215947806835, "loss_lvr": 0.25093457102775574, "loss_mode_switch": 0.0, "loss_total": 0.035810671746730804, "step": 484 }, { "batch_size": 4, "epoch": 0.1936, "step": 484, "tokens_per_device": 5832 }, { "epoch": 0.1936, "loss_ce": 0.12041664123535156, "loss_lvr": 1.0280427932739258, "loss_mode_switch": 0.0, "loss_total": 0.22322091460227966, "step": 484 }, { "epoch": 0.194, "grad_norm": 2.0176024436950684, "learning_rate": 9.31111081389227e-06, "loss": 0.2579, "step": 485 }, { "batch_size": 4, "epoch": 0.194, "step": 485, "tokens_per_device": 4772 }, { "epoch": 0.194, "loss_ce": 0.06891144812107086, "loss_lvr": 0.8905200958251953, "loss_mode_switch": 0.0, "loss_total": 0.15796345472335815, "step": 485 }, { "batch_size": 1, "epoch": 0.194, "step": 485, "tokens_per_device": 4886 }, { "epoch": 0.194, "loss_ce": 0.018565630540251732, "loss_lvr": 0.45839303731918335, "loss_mode_switch": 0.0, "loss_total": 0.0644049346446991, "step": 485 }, { "batch_size": 4, "epoch": 0.194, "step": 485, "tokens_per_device": 2028 }, { "epoch": 0.194, "loss_ce": 0.4959740936756134, "loss_lvr": 1.1865284442901611, "loss_mode_switch": 0.0, "loss_total": 0.614626944065094, "step": 485 }, { "batch_size": 4, "epoch": 0.194, "step": 485, "tokens_per_device": 5536 }, { "epoch": 0.194, "loss_ce": 0.1852862536907196, "loss_lvr": 0.86396723985672, "loss_mode_switch": 0.0, "loss_total": 0.2716829776763916, "step": 485 }, { "batch_size": 4, "epoch": 0.194, "step": 485, "tokens_per_device": 10812 }, { "epoch": 0.194, "loss_ce": 0.03998686373233795, "loss_lvr": 0.8611835241317749, "loss_mode_switch": 0.0, "loss_total": 0.12610521912574768, "step": 485 }, { "batch_size": 4, "epoch": 0.194, "step": 485, "tokens_per_device": 1552 }, { "epoch": 0.194, "loss_ce": 0.042964767664670944, "loss_lvr": 2.1298556327819824, "loss_mode_switch": 0.0, "loss_total": 0.25595033168792725, "step": 485 }, { "batch_size": 4, "epoch": 0.194, "step": 485, "tokens_per_device": 11552 }, { "epoch": 0.194, "loss_ce": 0.059732742607593536, "loss_lvr": 0.5413542985916138, "loss_mode_switch": 0.0, "loss_total": 0.11386817693710327, "step": 485 }, { "batch_size": 4, "epoch": 0.194, "step": 485, "tokens_per_device": 2580 }, { "epoch": 0.194, "loss_ce": 0.38702157139778137, "loss_lvr": 1.1513450145721436, "loss_mode_switch": 0.0, "loss_total": 0.5021560788154602, "step": 485 }, { "epoch": 0.1944, "grad_norm": 1.3818460702896118, "learning_rate": 9.30782614170371e-06, "loss": 0.3149, "step": 486 }, { "batch_size": 4, "epoch": 0.1944, "step": 486, "tokens_per_device": 4240 }, { "epoch": 0.1944, "loss_ce": 0.511550784111023, "loss_lvr": 1.0549288988113403, "loss_mode_switch": 0.0, "loss_total": 0.617043673992157, "step": 486 }, { "batch_size": 4, "epoch": 0.1944, "step": 486, "tokens_per_device": 8152 }, { "epoch": 0.1944, "loss_ce": 0.09079118072986603, "loss_lvr": 0.518923819065094, "loss_mode_switch": 0.0, "loss_total": 0.14268356561660767, "step": 486 }, { "batch_size": 1, "epoch": 0.1944, "step": 486, "tokens_per_device": 5173 }, { "epoch": 0.1944, "loss_ce": 0.04714310169219971, "loss_lvr": 0.377364844083786, "loss_mode_switch": 0.0, "loss_total": 0.08487959206104279, "step": 486 }, { "batch_size": 4, "epoch": 0.1944, "step": 486, "tokens_per_device": 2708 }, { "epoch": 0.1944, "loss_ce": 0.15055429935455322, "loss_lvr": 1.4362521171569824, "loss_mode_switch": 0.0, "loss_total": 0.2941794991493225, "step": 486 }, { "batch_size": 4, "epoch": 0.1944, "step": 486, "tokens_per_device": 7152 }, { "epoch": 0.1944, "loss_ce": 0.06219882518053055, "loss_lvr": 0.8028247356414795, "loss_mode_switch": 0.0, "loss_total": 0.14248129725456238, "step": 486 }, { "batch_size": 1, "epoch": 0.1944, "step": 486, "tokens_per_device": 4965 }, { "epoch": 0.1944, "loss_ce": 0.10221337527036667, "loss_lvr": 0.4540206789970398, "loss_mode_switch": 0.0, "loss_total": 0.147615447640419, "step": 486 }, { "batch_size": 4, "epoch": 0.1944, "step": 486, "tokens_per_device": 3948 }, { "epoch": 0.1944, "loss_ce": 0.31690478324890137, "loss_lvr": 1.4642040729522705, "loss_mode_switch": 0.0, "loss_total": 0.4633252024650574, "step": 486 }, { "batch_size": 1, "epoch": 0.1944, "step": 486, "tokens_per_device": 4775 }, { "epoch": 0.1944, "loss_ce": 0.07962050288915634, "loss_lvr": 0.5175691843032837, "loss_mode_switch": 0.0, "loss_total": 0.1313774287700653, "step": 486 }, { "epoch": 0.1948, "grad_norm": 1.3850899934768677, "learning_rate": 9.304534239580591e-06, "loss": 0.2733, "step": 487 }, { "batch_size": 4, "epoch": 0.1948, "step": 487, "tokens_per_device": 1276 }, { "epoch": 0.1948, "loss_ce": 0.8025998473167419, "loss_lvr": 1.2037196159362793, "loss_mode_switch": 0.0, "loss_total": 0.922971785068512, "step": 487 }, { "batch_size": 1, "epoch": 0.1948, "step": 487, "tokens_per_device": 4836 }, { "epoch": 0.1948, "loss_ce": 0.010241391137242317, "loss_lvr": 0.6981645226478577, "loss_mode_switch": 0.0, "loss_total": 0.08005784451961517, "step": 487 }, { "batch_size": 1, "epoch": 0.1948, "step": 487, "tokens_per_device": 5098 }, { "epoch": 0.1948, "loss_ce": 0.07797598838806152, "loss_lvr": 0.4073292911052704, "loss_mode_switch": 0.0, "loss_total": 0.11870892345905304, "step": 487 }, { "batch_size": 4, "epoch": 0.1948, "step": 487, "tokens_per_device": 5624 }, { "epoch": 0.1948, "loss_ce": 0.35598689317703247, "loss_lvr": 0.8092897534370422, "loss_mode_switch": 0.0, "loss_total": 0.43691587448120117, "step": 487 }, { "batch_size": 4, "epoch": 0.1948, "step": 487, "tokens_per_device": 3920 }, { "epoch": 0.1948, "loss_ce": 0.7894357442855835, "loss_lvr": 0.733547031879425, "loss_mode_switch": 0.0, "loss_total": 0.8627904653549194, "step": 487 }, { "batch_size": 4, "epoch": 0.1948, "step": 487, "tokens_per_device": 4584 }, { "epoch": 0.1948, "loss_ce": 0.016469737514853477, "loss_lvr": 0.737497091293335, "loss_mode_switch": 0.0, "loss_total": 0.09021945297718048, "step": 487 }, { "batch_size": 4, "epoch": 0.1948, "step": 487, "tokens_per_device": 4264 }, { "epoch": 0.1948, "loss_ce": 0.03153840824961662, "loss_lvr": 0.8647188544273376, "loss_mode_switch": 0.0, "loss_total": 0.11801029741764069, "step": 487 }, { "batch_size": 4, "epoch": 0.1948, "step": 487, "tokens_per_device": 5396 }, { "epoch": 0.1948, "loss_ce": 0.012982486747205257, "loss_lvr": 0.8757312893867493, "loss_mode_switch": 0.0, "loss_total": 0.10055562108755112, "step": 487 }, { "epoch": 0.1952, "grad_norm": 1.393433690071106, "learning_rate": 9.301235113047801e-06, "loss": 0.3006, "step": 488 }, { "batch_size": 1, "epoch": 0.1952, "step": 488, "tokens_per_device": 4915 }, { "epoch": 0.1952, "loss_ce": 0.01539768185466528, "loss_lvr": 0.3807239532470703, "loss_mode_switch": 0.0, "loss_total": 0.053470078855752945, "step": 488 }, { "batch_size": 4, "epoch": 0.1952, "step": 488, "tokens_per_device": 4192 }, { "epoch": 0.1952, "loss_ce": 0.3280617296695709, "loss_lvr": 0.8886029124259949, "loss_mode_switch": 0.0, "loss_total": 0.41692203283309937, "step": 488 }, { "batch_size": 4, "epoch": 0.1952, "step": 488, "tokens_per_device": 4228 }, { "epoch": 0.1952, "loss_ce": 0.23876169323921204, "loss_lvr": 0.9819960594177246, "loss_mode_switch": 0.0, "loss_total": 0.3369612991809845, "step": 488 }, { "batch_size": 1, "epoch": 0.1952, "step": 488, "tokens_per_device": 6221 }, { "epoch": 0.1952, "loss_ce": 0.044981036335229874, "loss_lvr": 0.6697052121162415, "loss_mode_switch": 0.0, "loss_total": 0.1119515597820282, "step": 488 }, { "batch_size": 4, "epoch": 0.1952, "step": 488, "tokens_per_device": 5764 }, { "epoch": 0.1952, "loss_ce": 0.14522358775138855, "loss_lvr": 0.9496167898178101, "loss_mode_switch": 0.0, "loss_total": 0.24018526077270508, "step": 488 }, { "batch_size": 4, "epoch": 0.1952, "step": 488, "tokens_per_device": 4692 }, { "epoch": 0.1952, "loss_ce": 0.029036400839686394, "loss_lvr": 0.8001930117607117, "loss_mode_switch": 0.0, "loss_total": 0.10905570536851883, "step": 488 }, { "batch_size": 1, "epoch": 0.1952, "step": 488, "tokens_per_device": 5099 }, { "epoch": 0.1952, "loss_ce": 0.021083863452076912, "loss_lvr": 0.5158049464225769, "loss_mode_switch": 0.0, "loss_total": 0.07266435772180557, "step": 488 }, { "batch_size": 4, "epoch": 0.1952, "step": 488, "tokens_per_device": 6632 }, { "epoch": 0.1952, "loss_ce": 0.03092237003147602, "loss_lvr": 0.8514257073402405, "loss_mode_switch": 0.0, "loss_total": 0.11606494337320328, "step": 488 }, { "epoch": 0.1956, "grad_norm": 1.9921804666519165, "learning_rate": 9.297928767642346e-06, "loss": 0.2798, "step": 489 }, { "batch_size": 1, "epoch": 0.1956, "step": 489, "tokens_per_device": 4909 }, { "epoch": 0.1956, "loss_ce": 0.1812334954738617, "loss_lvr": 0.8200945258140564, "loss_mode_switch": 0.0, "loss_total": 0.2632429599761963, "step": 489 }, { "batch_size": 4, "epoch": 0.1956, "step": 489, "tokens_per_device": 4644 }, { "epoch": 0.1956, "loss_ce": 0.15483777225017548, "loss_lvr": 0.9424318671226501, "loss_mode_switch": 0.0, "loss_total": 0.24908095598220825, "step": 489 }, { "batch_size": 4, "epoch": 0.1956, "step": 489, "tokens_per_device": 4236 }, { "epoch": 0.1956, "loss_ce": 0.19564715027809143, "loss_lvr": 0.8184689283370972, "loss_mode_switch": 0.0, "loss_total": 0.27749404311180115, "step": 489 }, { "batch_size": 4, "epoch": 0.1956, "step": 489, "tokens_per_device": 4788 }, { "epoch": 0.1956, "loss_ce": 0.11933358758687973, "loss_lvr": 0.7571324110031128, "loss_mode_switch": 0.0, "loss_total": 0.1950468271970749, "step": 489 }, { "batch_size": 4, "epoch": 0.1956, "step": 489, "tokens_per_device": 2616 }, { "epoch": 0.1956, "loss_ce": 0.5264697074890137, "loss_lvr": 1.0112179517745972, "loss_mode_switch": 0.0, "loss_total": 0.6275914907455444, "step": 489 }, { "batch_size": 4, "epoch": 0.1956, "step": 489, "tokens_per_device": 4672 }, { "epoch": 0.1956, "loss_ce": 0.260408490896225, "loss_lvr": 0.8525667786598206, "loss_mode_switch": 0.0, "loss_total": 0.3456651568412781, "step": 489 }, { "batch_size": 1, "epoch": 0.1956, "step": 489, "tokens_per_device": 4864 }, { "epoch": 0.1956, "loss_ce": 0.001109940349124372, "loss_lvr": 0.3236156404018402, "loss_mode_switch": 0.0, "loss_total": 0.03347150236368179, "step": 489 }, { "batch_size": 4, "epoch": 0.1956, "step": 489, "tokens_per_device": 1312 }, { "epoch": 0.1956, "loss_ce": 0.1517515480518341, "loss_lvr": 1.1183315515518188, "loss_mode_switch": 0.0, "loss_total": 0.263584703207016, "step": 489 }, { "epoch": 0.196, "grad_norm": 1.3544315099716187, "learning_rate": 9.29461520891335e-06, "loss": 0.3501, "step": 490 }, { "batch_size": 1, "epoch": 0.196, "step": 490, "tokens_per_device": 4786 }, { "epoch": 0.196, "loss_ce": 0.006593172438442707, "loss_lvr": 0.454693078994751, "loss_mode_switch": 0.0, "loss_total": 0.05206248164176941, "step": 490 }, { "batch_size": 4, "epoch": 0.196, "step": 490, "tokens_per_device": 5124 }, { "epoch": 0.196, "loss_ce": 0.1750781089067459, "loss_lvr": 0.8686189651489258, "loss_mode_switch": 0.0, "loss_total": 0.26194000244140625, "step": 490 }, { "batch_size": 4, "epoch": 0.196, "step": 490, "tokens_per_device": 6332 }, { "epoch": 0.196, "loss_ce": 0.26691290736198425, "loss_lvr": 0.7022420167922974, "loss_mode_switch": 0.0, "loss_total": 0.3371371030807495, "step": 490 }, { "batch_size": 1, "epoch": 0.196, "step": 490, "tokens_per_device": 4935 }, { "epoch": 0.196, "loss_ce": 0.4705195426940918, "loss_lvr": 0.5539083480834961, "loss_mode_switch": 0.0, "loss_total": 0.5259103775024414, "step": 490 }, { "batch_size": 1, "epoch": 0.196, "step": 490, "tokens_per_device": 4866 }, { "epoch": 0.196, "loss_ce": 0.17529651522636414, "loss_lvr": 0.5387723445892334, "loss_mode_switch": 0.0, "loss_total": 0.22917374968528748, "step": 490 }, { "batch_size": 1, "epoch": 0.196, "step": 490, "tokens_per_device": 4881 }, { "epoch": 0.196, "loss_ce": 0.08278923481702805, "loss_lvr": 0.4145956039428711, "loss_mode_switch": 0.0, "loss_total": 0.12424879521131516, "step": 490 }, { "batch_size": 4, "epoch": 0.196, "step": 490, "tokens_per_device": 2660 }, { "epoch": 0.196, "loss_ce": 0.4157244861125946, "loss_lvr": 0.9773344397544861, "loss_mode_switch": 0.0, "loss_total": 0.5134579539299011, "step": 490 }, { "batch_size": 4, "epoch": 0.196, "step": 490, "tokens_per_device": 1592 }, { "epoch": 0.196, "loss_ce": 0.29810306429862976, "loss_lvr": 1.0929487943649292, "loss_mode_switch": 0.0, "loss_total": 0.40739795565605164, "step": 490 }, { "epoch": 0.1964, "grad_norm": 1.6763217449188232, "learning_rate": 9.291294442422043e-06, "loss": 0.3372, "step": 491 }, { "batch_size": 1, "epoch": 0.1964, "step": 491, "tokens_per_device": 4886 }, { "epoch": 0.1964, "loss_ce": 0.0075575606897473335, "loss_lvr": 0.8171536922454834, "loss_mode_switch": 0.0, "loss_total": 0.08927293121814728, "step": 491 }, { "batch_size": 4, "epoch": 0.1964, "step": 491, "tokens_per_device": 4296 }, { "epoch": 0.1964, "loss_ce": 0.3839224874973297, "loss_lvr": 0.8357469439506531, "loss_mode_switch": 0.0, "loss_total": 0.46749716997146606, "step": 491 }, { "batch_size": 1, "epoch": 0.1964, "step": 491, "tokens_per_device": 4779 }, { "epoch": 0.1964, "loss_ce": 0.0010444237850606441, "loss_lvr": 0.596616804599762, "loss_mode_switch": 0.0, "loss_total": 0.06070610508322716, "step": 491 }, { "batch_size": 4, "epoch": 0.1964, "step": 491, "tokens_per_device": 3864 }, { "epoch": 0.1964, "loss_ce": 0.4513949751853943, "loss_lvr": 1.089552402496338, "loss_mode_switch": 0.0, "loss_total": 0.560350239276886, "step": 491 }, { "batch_size": 1, "epoch": 0.1964, "step": 491, "tokens_per_device": 7052 }, { "epoch": 0.1964, "loss_ce": 0.004436022136360407, "loss_lvr": 0.5347175598144531, "loss_mode_switch": 0.0, "loss_total": 0.05790777876973152, "step": 491 }, { "batch_size": 4, "epoch": 0.1964, "step": 491, "tokens_per_device": 2908 }, { "epoch": 0.1964, "loss_ce": 0.8243066072463989, "loss_lvr": 0.6853289604187012, "loss_mode_switch": 0.0, "loss_total": 0.8928394913673401, "step": 491 }, { "batch_size": 4, "epoch": 0.1964, "step": 491, "tokens_per_device": 4796 }, { "epoch": 0.1964, "loss_ce": 0.4892178177833557, "loss_lvr": 1.0326160192489624, "loss_mode_switch": 0.0, "loss_total": 0.592479407787323, "step": 491 }, { "batch_size": 1, "epoch": 0.1964, "step": 491, "tokens_per_device": 5027 }, { "epoch": 0.1964, "loss_ce": 0.009920621290802956, "loss_lvr": 0.37323060631752014, "loss_mode_switch": 0.0, "loss_total": 0.04724368453025818, "step": 491 }, { "epoch": 0.1968, "grad_norm": 1.359840989112854, "learning_rate": 9.287966473741752e-06, "loss": 0.3314, "step": 492 }, { "batch_size": 4, "epoch": 0.1968, "step": 492, "tokens_per_device": 4660 }, { "epoch": 0.1968, "loss_ce": 0.7304947972297668, "loss_lvr": 0.832511842250824, "loss_mode_switch": 0.0, "loss_total": 0.8137459754943848, "step": 492 }, { "batch_size": 1, "epoch": 0.1968, "step": 492, "tokens_per_device": 4975 }, { "epoch": 0.1968, "loss_ce": 0.0173444002866745, "loss_lvr": 0.45743465423583984, "loss_mode_switch": 0.0, "loss_total": 0.06308786571025848, "step": 492 }, { "batch_size": 4, "epoch": 0.1968, "step": 492, "tokens_per_device": 5828 }, { "epoch": 0.1968, "loss_ce": 0.20002442598342896, "loss_lvr": 0.9902228116989136, "loss_mode_switch": 0.0, "loss_total": 0.29904669523239136, "step": 492 }, { "batch_size": 1, "epoch": 0.1968, "step": 492, "tokens_per_device": 5023 }, { "epoch": 0.1968, "loss_ce": 0.0022505971137434244, "loss_lvr": 0.2456888109445572, "loss_mode_switch": 0.0, "loss_total": 0.02681947872042656, "step": 492 }, { "batch_size": 1, "epoch": 0.1968, "step": 492, "tokens_per_device": 4912 }, { "epoch": 0.1968, "loss_ce": 0.8313472867012024, "loss_lvr": 0.529472291469574, "loss_mode_switch": 0.0, "loss_total": 0.8842945098876953, "step": 492 }, { "batch_size": 4, "epoch": 0.1968, "step": 492, "tokens_per_device": 3784 }, { "epoch": 0.1968, "loss_ce": 0.24194753170013428, "loss_lvr": 1.3041212558746338, "loss_mode_switch": 0.0, "loss_total": 0.37235966324806213, "step": 492 }, { "batch_size": 1, "epoch": 0.1968, "step": 492, "tokens_per_device": 5109 }, { "epoch": 0.1968, "loss_ce": 0.13202504813671112, "loss_lvr": 0.5893117189407349, "loss_mode_switch": 0.0, "loss_total": 0.1909562200307846, "step": 492 }, { "batch_size": 4, "epoch": 0.1968, "step": 492, "tokens_per_device": 4260 }, { "epoch": 0.1968, "loss_ce": 0.08750129491090775, "loss_lvr": 1.101837396621704, "loss_mode_switch": 0.0, "loss_total": 0.19768503308296204, "step": 492 }, { "epoch": 0.1972, "grad_norm": 1.6851187944412231, "learning_rate": 9.284631308457892e-06, "loss": 0.3611, "step": 493 }, { "batch_size": 4, "epoch": 0.1972, "step": 493, "tokens_per_device": 5784 }, { "epoch": 0.1972, "loss_ce": 0.03252248466014862, "loss_lvr": 0.85801762342453, "loss_mode_switch": 0.0, "loss_total": 0.11832424998283386, "step": 493 }, { "batch_size": 1, "epoch": 0.1972, "step": 493, "tokens_per_device": 4960 }, { "epoch": 0.1972, "loss_ce": 0.0332358255982399, "loss_lvr": 0.6973344087600708, "loss_mode_switch": 0.0, "loss_total": 0.10296926647424698, "step": 493 }, { "batch_size": 4, "epoch": 0.1972, "step": 493, "tokens_per_device": 1248 }, { "epoch": 0.1972, "loss_ce": 0.3455638289451599, "loss_lvr": 1.5877732038497925, "loss_mode_switch": 0.0, "loss_total": 0.5043411254882812, "step": 493 }, { "batch_size": 4, "epoch": 0.1972, "step": 493, "tokens_per_device": 3812 }, { "epoch": 0.1972, "loss_ce": 0.22826939821243286, "loss_lvr": 1.0298699140548706, "loss_mode_switch": 0.0, "loss_total": 0.3312563896179199, "step": 493 }, { "batch_size": 1, "epoch": 0.1972, "step": 493, "tokens_per_device": 4861 }, { "epoch": 0.1972, "loss_ce": 0.030694173648953438, "loss_lvr": 0.3253996670246124, "loss_mode_switch": 0.0, "loss_total": 0.06323414295911789, "step": 493 }, { "batch_size": 4, "epoch": 0.1972, "step": 493, "tokens_per_device": 4348 }, { "epoch": 0.1972, "loss_ce": 0.13177742063999176, "loss_lvr": 1.0316681861877441, "loss_mode_switch": 0.0, "loss_total": 0.23494423925876617, "step": 493 }, { "batch_size": 4, "epoch": 0.1972, "step": 493, "tokens_per_device": 7352 }, { "epoch": 0.1972, "loss_ce": 0.37782031297683716, "loss_lvr": 0.7405832409858704, "loss_mode_switch": 0.0, "loss_total": 0.4518786370754242, "step": 493 }, { "batch_size": 4, "epoch": 0.1972, "step": 493, "tokens_per_device": 4428 }, { "epoch": 0.1972, "loss_ce": 0.2169646918773651, "loss_lvr": 0.9662465453147888, "loss_mode_switch": 0.0, "loss_total": 0.31358933448791504, "step": 493 }, { "epoch": 0.1976, "grad_norm": 1.633129596710205, "learning_rate": 9.281288952167957e-06, "loss": 0.3242, "step": 494 }, { "batch_size": 4, "epoch": 0.1976, "step": 494, "tokens_per_device": 3752 }, { "epoch": 0.1976, "loss_ce": 0.13743725419044495, "loss_lvr": 1.240339994430542, "loss_mode_switch": 0.0, "loss_total": 0.2614712715148926, "step": 494 }, { "batch_size": 4, "epoch": 0.1976, "step": 494, "tokens_per_device": 6260 }, { "epoch": 0.1976, "loss_ce": 0.04767756164073944, "loss_lvr": 0.7012221217155457, "loss_mode_switch": 0.0, "loss_total": 0.117799773812294, "step": 494 }, { "batch_size": 4, "epoch": 0.1976, "step": 494, "tokens_per_device": 2520 }, { "epoch": 0.1976, "loss_ce": 0.3749728500843048, "loss_lvr": 0.96810382604599, "loss_mode_switch": 0.0, "loss_total": 0.47178322076797485, "step": 494 }, { "batch_size": 1, "epoch": 0.1976, "step": 494, "tokens_per_device": 5047 }, { "epoch": 0.1976, "loss_ce": 0.12277689576148987, "loss_lvr": 0.40403252840042114, "loss_mode_switch": 0.0, "loss_total": 0.1631801426410675, "step": 494 }, { "batch_size": 1, "epoch": 0.1976, "step": 494, "tokens_per_device": 4893 }, { "epoch": 0.1976, "loss_ce": 0.006380516104400158, "loss_lvr": 0.8648051619529724, "loss_mode_switch": 0.0, "loss_total": 0.09286103397607803, "step": 494 }, { "batch_size": 4, "epoch": 0.1976, "step": 494, "tokens_per_device": 3800 }, { "epoch": 0.1976, "loss_ce": 0.19650746881961823, "loss_lvr": 0.7311309576034546, "loss_mode_switch": 0.0, "loss_total": 0.2696205675601959, "step": 494 }, { "batch_size": 1, "epoch": 0.1976, "step": 494, "tokens_per_device": 5209 }, { "epoch": 0.1976, "loss_ce": 0.25956958532333374, "loss_lvr": 0.40095940232276917, "loss_mode_switch": 0.0, "loss_total": 0.29966551065444946, "step": 494 }, { "batch_size": 1, "epoch": 0.1976, "step": 494, "tokens_per_device": 4936 }, { "epoch": 0.1976, "loss_ce": 0.013302234001457691, "loss_lvr": 0.39447444677352905, "loss_mode_switch": 0.0, "loss_total": 0.05274967849254608, "step": 494 }, { "epoch": 0.198, "grad_norm": 1.4820479154586792, "learning_rate": 9.277939410481507e-06, "loss": 0.3542, "step": 495 }, { "batch_size": 4, "epoch": 0.198, "step": 495, "tokens_per_device": 4492 }, { "epoch": 0.198, "loss_ce": 0.18159788846969604, "loss_lvr": 0.8469097018241882, "loss_mode_switch": 0.0, "loss_total": 0.2662888765335083, "step": 495 }, { "batch_size": 4, "epoch": 0.198, "step": 495, "tokens_per_device": 14632 }, { "epoch": 0.198, "loss_ce": 0.3156110644340515, "loss_lvr": 0.9866691827774048, "loss_mode_switch": 0.0, "loss_total": 0.41427797079086304, "step": 495 }, { "batch_size": 1, "epoch": 0.198, "step": 495, "tokens_per_device": 4856 }, { "epoch": 0.198, "loss_ce": 0.0030986787751317024, "loss_lvr": 0.5243799090385437, "loss_mode_switch": 0.0, "loss_total": 0.0555366687476635, "step": 495 }, { "batch_size": 4, "epoch": 0.198, "step": 495, "tokens_per_device": 2564 }, { "epoch": 0.198, "loss_ce": 0.7872844934463501, "loss_lvr": 1.1147822141647339, "loss_mode_switch": 0.0, "loss_total": 0.8987627029418945, "step": 495 }, { "batch_size": 4, "epoch": 0.198, "step": 495, "tokens_per_device": 1560 }, { "epoch": 0.198, "loss_ce": 0.25268980860710144, "loss_lvr": 1.1001161336898804, "loss_mode_switch": 0.0, "loss_total": 0.362701416015625, "step": 495 }, { "batch_size": 4, "epoch": 0.198, "step": 495, "tokens_per_device": 4288 }, { "epoch": 0.198, "loss_ce": 0.37178659439086914, "loss_lvr": 1.0463882684707642, "loss_mode_switch": 0.0, "loss_total": 0.4764254093170166, "step": 495 }, { "batch_size": 4, "epoch": 0.198, "step": 495, "tokens_per_device": 4392 }, { "epoch": 0.198, "loss_ce": 0.10813979804515839, "loss_lvr": 0.9134658575057983, "loss_mode_switch": 0.0, "loss_total": 0.1994863748550415, "step": 495 }, { "batch_size": 4, "epoch": 0.198, "step": 495, "tokens_per_device": 3804 }, { "epoch": 0.198, "loss_ce": 0.3307279050350189, "loss_lvr": 1.0440399646759033, "loss_mode_switch": 0.0, "loss_total": 0.43513190746307373, "step": 495 }, { "epoch": 0.1984, "grad_norm": 1.4050648212432861, "learning_rate": 9.274582689020164e-06, "loss": 0.3176, "step": 496 }, { "batch_size": 4, "epoch": 0.1984, "step": 496, "tokens_per_device": 5000 }, { "epoch": 0.1984, "loss_ce": 0.0954151302576065, "loss_lvr": 1.0858895778656006, "loss_mode_switch": 0.0, "loss_total": 0.20400407910346985, "step": 496 }, { "batch_size": 4, "epoch": 0.1984, "step": 496, "tokens_per_device": 3760 }, { "epoch": 0.1984, "loss_ce": 0.1161339282989502, "loss_lvr": 0.6404039263725281, "loss_mode_switch": 0.0, "loss_total": 0.180174320936203, "step": 496 }, { "batch_size": 4, "epoch": 0.1984, "step": 496, "tokens_per_device": 1676 }, { "epoch": 0.1984, "loss_ce": 0.8209365606307983, "loss_lvr": 1.0128756761550903, "loss_mode_switch": 0.0, "loss_total": 0.9222241044044495, "step": 496 }, { "batch_size": 1, "epoch": 0.1984, "step": 496, "tokens_per_device": 4891 }, { "epoch": 0.1984, "loss_ce": 0.14763982594013214, "loss_lvr": 1.3957462310791016, "loss_mode_switch": 0.0, "loss_total": 0.287214457988739, "step": 496 }, { "batch_size": 1, "epoch": 0.1984, "step": 496, "tokens_per_device": 5369 }, { "epoch": 0.1984, "loss_ce": 0.05726782977581024, "loss_lvr": 0.670734167098999, "loss_mode_switch": 0.0, "loss_total": 0.12434124946594238, "step": 496 }, { "batch_size": 4, "epoch": 0.1984, "step": 496, "tokens_per_device": 4176 }, { "epoch": 0.1984, "loss_ce": 0.3641769289970398, "loss_lvr": 1.0219978094100952, "loss_mode_switch": 0.0, "loss_total": 0.46637672185897827, "step": 496 }, { "batch_size": 4, "epoch": 0.1984, "step": 496, "tokens_per_device": 1580 }, { "epoch": 0.1984, "loss_ce": 0.3982735574245453, "loss_lvr": 1.1394109725952148, "loss_mode_switch": 0.0, "loss_total": 0.5122146606445312, "step": 496 }, { "batch_size": 1, "epoch": 0.1984, "step": 496, "tokens_per_device": 5098 }, { "epoch": 0.1984, "loss_ce": 0.006840083282440901, "loss_lvr": 0.3518993556499481, "loss_mode_switch": 0.0, "loss_total": 0.04203002154827118, "step": 496 }, { "epoch": 0.1988, "grad_norm": 1.4715179204940796, "learning_rate": 9.271218793417601e-06, "loss": 0.3022, "step": 497 }, { "batch_size": 4, "epoch": 0.1988, "step": 497, "tokens_per_device": 4032 }, { "epoch": 0.1988, "loss_ce": 0.2429255247116089, "loss_lvr": 1.1906720399856567, "loss_mode_switch": 0.0, "loss_total": 0.3619927167892456, "step": 497 }, { "batch_size": 1, "epoch": 0.1988, "step": 497, "tokens_per_device": 4895 }, { "epoch": 0.1988, "loss_ce": 0.04164685681462288, "loss_lvr": 0.4536682963371277, "loss_mode_switch": 0.0, "loss_total": 0.08701369166374207, "step": 497 }, { "batch_size": 1, "epoch": 0.1988, "step": 497, "tokens_per_device": 5046 }, { "epoch": 0.1988, "loss_ce": 0.012594223022460938, "loss_lvr": 0.5552108287811279, "loss_mode_switch": 0.0, "loss_total": 0.06811530888080597, "step": 497 }, { "batch_size": 1, "epoch": 0.1988, "step": 497, "tokens_per_device": 5051 }, { "epoch": 0.1988, "loss_ce": 0.19044004380702972, "loss_lvr": 0.4219924509525299, "loss_mode_switch": 0.0, "loss_total": 0.23263928294181824, "step": 497 }, { "batch_size": 4, "epoch": 0.1988, "step": 497, "tokens_per_device": 5780 }, { "epoch": 0.1988, "loss_ce": 0.0645182803273201, "loss_lvr": 0.8507349491119385, "loss_mode_switch": 0.0, "loss_total": 0.14959177374839783, "step": 497 }, { "batch_size": 4, "epoch": 0.1988, "step": 497, "tokens_per_device": 3940 }, { "epoch": 0.1988, "loss_ce": 0.13177083432674408, "loss_lvr": 0.9735849499702454, "loss_mode_switch": 0.0, "loss_total": 0.22912932932376862, "step": 497 }, { "batch_size": 4, "epoch": 0.1988, "step": 497, "tokens_per_device": 3912 }, { "epoch": 0.1988, "loss_ce": 0.5665491223335266, "loss_lvr": 1.1226541996002197, "loss_mode_switch": 0.0, "loss_total": 0.6788145303726196, "step": 497 }, { "batch_size": 4, "epoch": 0.1988, "step": 497, "tokens_per_device": 4364 }, { "epoch": 0.1988, "loss_ce": 0.016817322000861168, "loss_lvr": 1.0066975355148315, "loss_mode_switch": 0.0, "loss_total": 0.11748708039522171, "step": 497 }, { "epoch": 0.1992, "grad_norm": 1.276397705078125, "learning_rate": 9.267847729319528e-06, "loss": 0.2901, "step": 498 }, { "batch_size": 1, "epoch": 0.1992, "step": 498, "tokens_per_device": 4899 }, { "epoch": 0.1992, "loss_ce": 0.3221707344055176, "loss_lvr": 0.8140865564346313, "loss_mode_switch": 0.0, "loss_total": 0.40357938408851624, "step": 498 }, { "batch_size": 1, "epoch": 0.1992, "step": 498, "tokens_per_device": 4596 }, { "epoch": 0.1992, "loss_ce": 0.007684112060815096, "loss_lvr": 0.9766733646392822, "loss_mode_switch": 0.0, "loss_total": 0.10535144805908203, "step": 498 }, { "batch_size": 4, "epoch": 0.1992, "step": 498, "tokens_per_device": 5008 }, { "epoch": 0.1992, "loss_ce": 0.2798505425453186, "loss_lvr": 1.0170716047286987, "loss_mode_switch": 0.0, "loss_total": 0.3815577030181885, "step": 498 }, { "batch_size": 4, "epoch": 0.1992, "step": 498, "tokens_per_device": 4364 }, { "epoch": 0.1992, "loss_ce": 0.48062556982040405, "loss_lvr": 1.0600661039352417, "loss_mode_switch": 0.0, "loss_total": 0.5866321921348572, "step": 498 }, { "batch_size": 1, "epoch": 0.1992, "step": 498, "tokens_per_device": 5048 }, { "epoch": 0.1992, "loss_ce": 0.035135842859745026, "loss_lvr": 0.22413703799247742, "loss_mode_switch": 0.0, "loss_total": 0.05754954740405083, "step": 498 }, { "batch_size": 4, "epoch": 0.1992, "step": 498, "tokens_per_device": 1204 }, { "epoch": 0.1992, "loss_ce": 0.25273755192756653, "loss_lvr": 1.123553991317749, "loss_mode_switch": 0.0, "loss_total": 0.3650929629802704, "step": 498 }, { "batch_size": 4, "epoch": 0.1992, "step": 498, "tokens_per_device": 15016 }, { "epoch": 0.1992, "loss_ce": 0.00793802086263895, "loss_lvr": 0.8711889386177063, "loss_mode_switch": 0.0, "loss_total": 0.095056913793087, "step": 498 }, { "batch_size": 4, "epoch": 0.1992, "step": 498, "tokens_per_device": 3792 }, { "epoch": 0.1992, "loss_ce": 0.5121566653251648, "loss_lvr": 1.2872298955917358, "loss_mode_switch": 0.0, "loss_total": 0.6408796310424805, "step": 498 }, { "epoch": 0.1996, "grad_norm": 1.3751871585845947, "learning_rate": 9.264469502383689e-06, "loss": 0.3049, "step": 499 }, { "batch_size": 1, "epoch": 0.1996, "step": 499, "tokens_per_device": 4898 }, { "epoch": 0.1996, "loss_ce": 0.6024063229560852, "loss_lvr": 1.0651028156280518, "loss_mode_switch": 0.0, "loss_total": 0.7089166045188904, "step": 499 }, { "batch_size": 4, "epoch": 0.1996, "step": 499, "tokens_per_device": 4224 }, { "epoch": 0.1996, "loss_ce": 0.18137015402317047, "loss_lvr": 1.1081180572509766, "loss_mode_switch": 0.0, "loss_total": 0.29218196868896484, "step": 499 }, { "batch_size": 4, "epoch": 0.1996, "step": 499, "tokens_per_device": 4392 }, { "epoch": 0.1996, "loss_ce": 0.4015980362892151, "loss_lvr": 1.08781099319458, "loss_mode_switch": 0.0, "loss_total": 0.5103791356086731, "step": 499 }, { "batch_size": 4, "epoch": 0.1996, "step": 499, "tokens_per_device": 4588 }, { "epoch": 0.1996, "loss_ce": 0.1446438431739807, "loss_lvr": 1.223587989807129, "loss_mode_switch": 0.0, "loss_total": 0.2670026421546936, "step": 499 }, { "batch_size": 4, "epoch": 0.1996, "step": 499, "tokens_per_device": 3872 }, { "epoch": 0.1996, "loss_ce": 0.1400289237499237, "loss_lvr": 1.0035302639007568, "loss_mode_switch": 0.0, "loss_total": 0.24038195610046387, "step": 499 }, { "batch_size": 4, "epoch": 0.1996, "step": 499, "tokens_per_device": 4404 }, { "epoch": 0.1996, "loss_ce": 0.13552908599376678, "loss_lvr": 1.0092827081680298, "loss_mode_switch": 0.0, "loss_total": 0.23645734786987305, "step": 499 }, { "batch_size": 1, "epoch": 0.1996, "step": 499, "tokens_per_device": 4950 }, { "epoch": 0.1996, "loss_ce": 0.21793942153453827, "loss_lvr": 0.7220105528831482, "loss_mode_switch": 0.0, "loss_total": 0.2901404798030853, "step": 499 }, { "batch_size": 1, "epoch": 0.1996, "step": 499, "tokens_per_device": 5021 }, { "epoch": 0.1996, "loss_ce": 0.016053486615419388, "loss_lvr": 0.5310226678848267, "loss_mode_switch": 0.0, "loss_total": 0.069155752658844, "step": 499 }, { "epoch": 0.2, "grad_norm": 1.3189820051193237, "learning_rate": 9.261084118279846e-06, "loss": 0.2645, "step": 500 }, { "batch_size": 4, "epoch": 0.2, "step": 500, "tokens_per_device": 6080 }, { "epoch": 0.2, "loss_ce": 0.6853750348091125, "loss_lvr": 0.9754453897476196, "loss_mode_switch": 0.0, "loss_total": 0.7829195857048035, "step": 500 }, { "batch_size": 1, "epoch": 0.2, "step": 500, "tokens_per_device": 5505 }, { "epoch": 0.2, "loss_ce": 0.00850648432970047, "loss_lvr": 0.41676512360572815, "loss_mode_switch": 0.0, "loss_total": 0.050182998180389404, "step": 500 }, { "batch_size": 1, "epoch": 0.2, "step": 500, "tokens_per_device": 5719 }, { "epoch": 0.2, "loss_ce": 0.22603964805603027, "loss_lvr": 0.4792030453681946, "loss_mode_switch": 0.0, "loss_total": 0.2739599645137787, "step": 500 }, { "batch_size": 4, "epoch": 0.2, "step": 500, "tokens_per_device": 4320 }, { "epoch": 0.2, "loss_ce": 0.16179883480072021, "loss_lvr": 0.908233106136322, "loss_mode_switch": 0.0, "loss_total": 0.25262215733528137, "step": 500 }, { "batch_size": 4, "epoch": 0.2, "step": 500, "tokens_per_device": 5772 }, { "epoch": 0.2, "loss_ce": 0.09335393458604813, "loss_lvr": 0.9356135725975037, "loss_mode_switch": 0.0, "loss_total": 0.1869152933359146, "step": 500 }, { "batch_size": 1, "epoch": 0.2, "step": 500, "tokens_per_device": 7192 }, { "epoch": 0.2, "loss_ce": 0.02567295730113983, "loss_lvr": 0.4908747971057892, "loss_mode_switch": 0.0, "loss_total": 0.07476043701171875, "step": 500 }, { "batch_size": 1, "epoch": 0.2, "step": 500, "tokens_per_device": 4759 }, { "epoch": 0.2, "loss_ce": 0.012315780855715275, "loss_lvr": 0.4773579239845276, "loss_mode_switch": 0.0, "loss_total": 0.0600515715777874, "step": 500 }, { "batch_size": 4, "epoch": 0.2, "step": 500, "tokens_per_device": 3756 }, { "epoch": 0.2, "loss_ce": 0.46271535754203796, "loss_lvr": 1.2000885009765625, "loss_mode_switch": 0.0, "loss_total": 0.5827242136001587, "step": 500 }, { "epoch": 0.2004, "grad_norm": 1.2651457786560059, "learning_rate": 9.25769158268978e-06, "loss": 0.3106, "step": 501 }, { "batch_size": 4, "epoch": 0.2004, "step": 501, "tokens_per_device": 11228 }, { "epoch": 0.2004, "loss_ce": 0.0620361752808094, "loss_lvr": 0.6310283541679382, "loss_mode_switch": 0.0, "loss_total": 0.1251390129327774, "step": 501 }, { "batch_size": 1, "epoch": 0.2004, "step": 501, "tokens_per_device": 4981 }, { "epoch": 0.2004, "loss_ce": 0.5639300346374512, "loss_lvr": 0.6063057780265808, "loss_mode_switch": 0.0, "loss_total": 0.6245605945587158, "step": 501 }, { "batch_size": 4, "epoch": 0.2004, "step": 501, "tokens_per_device": 1388 }, { "epoch": 0.2004, "loss_ce": 0.5739256143569946, "loss_lvr": 1.0140682458877563, "loss_mode_switch": 0.0, "loss_total": 0.6753324270248413, "step": 501 }, { "batch_size": 4, "epoch": 0.2004, "step": 501, "tokens_per_device": 4152 }, { "epoch": 0.2004, "loss_ce": 0.34632742404937744, "loss_lvr": 1.0182335376739502, "loss_mode_switch": 0.0, "loss_total": 0.44815078377723694, "step": 501 }, { "batch_size": 1, "epoch": 0.2004, "step": 501, "tokens_per_device": 4859 }, { "epoch": 0.2004, "loss_ce": 0.0005513874930329621, "loss_lvr": 0.1989593505859375, "loss_mode_switch": 0.0, "loss_total": 0.020447323098778725, "step": 501 }, { "batch_size": 4, "epoch": 0.2004, "step": 501, "tokens_per_device": 2768 }, { "epoch": 0.2004, "loss_ce": 0.37604740262031555, "loss_lvr": 0.7985544204711914, "loss_mode_switch": 0.0, "loss_total": 0.4559028446674347, "step": 501 }, { "batch_size": 4, "epoch": 0.2004, "step": 501, "tokens_per_device": 3820 }, { "epoch": 0.2004, "loss_ce": 0.29619693756103516, "loss_lvr": 1.2568782567977905, "loss_mode_switch": 0.0, "loss_total": 0.42188477516174316, "step": 501 }, { "batch_size": 4, "epoch": 0.2004, "step": 501, "tokens_per_device": 5436 }, { "epoch": 0.2004, "loss_ce": 0.40034398436546326, "loss_lvr": 1.1916505098342896, "loss_mode_switch": 0.0, "loss_total": 0.5195090174674988, "step": 501 }, { "epoch": 0.2008, "grad_norm": 1.3602310419082642, "learning_rate": 9.254291901307267e-06, "loss": 0.3388, "step": 502 }, { "batch_size": 4, "epoch": 0.2008, "step": 502, "tokens_per_device": 4256 }, { "epoch": 0.2008, "loss_ce": 0.15735842287540436, "loss_lvr": 0.7980904579162598, "loss_mode_switch": 0.0, "loss_total": 0.23716747760772705, "step": 502 }, { "batch_size": 4, "epoch": 0.2008, "step": 502, "tokens_per_device": 7656 }, { "epoch": 0.2008, "loss_ce": 0.08404799550771713, "loss_lvr": 0.7883628010749817, "loss_mode_switch": 0.0, "loss_total": 0.16288428008556366, "step": 502 }, { "batch_size": 4, "epoch": 0.2008, "step": 502, "tokens_per_device": 3828 }, { "epoch": 0.2008, "loss_ce": 0.21885500848293304, "loss_lvr": 0.9289957284927368, "loss_mode_switch": 0.0, "loss_total": 0.31175458431243896, "step": 502 }, { "batch_size": 1, "epoch": 0.2008, "step": 502, "tokens_per_device": 5113 }, { "epoch": 0.2008, "loss_ce": 0.009302349761128426, "loss_lvr": 0.4084192216396332, "loss_mode_switch": 0.0, "loss_total": 0.050144270062446594, "step": 502 }, { "batch_size": 4, "epoch": 0.2008, "step": 502, "tokens_per_device": 5952 }, { "epoch": 0.2008, "loss_ce": 0.09128585457801819, "loss_lvr": 0.79268479347229, "loss_mode_switch": 0.0, "loss_total": 0.17055433988571167, "step": 502 }, { "batch_size": 4, "epoch": 0.2008, "step": 502, "tokens_per_device": 4172 }, { "epoch": 0.2008, "loss_ce": 0.4749518036842346, "loss_lvr": 0.858110249042511, "loss_mode_switch": 0.0, "loss_total": 0.5607628226280212, "step": 502 }, { "batch_size": 1, "epoch": 0.2008, "step": 502, "tokens_per_device": 5171 }, { "epoch": 0.2008, "loss_ce": 0.1328050196170807, "loss_lvr": 0.8474560379981995, "loss_mode_switch": 0.0, "loss_total": 0.2175506353378296, "step": 502 }, { "batch_size": 4, "epoch": 0.2008, "step": 502, "tokens_per_device": 2580 }, { "epoch": 0.2008, "loss_ce": 0.036318663507699966, "loss_lvr": 1.3008285760879517, "loss_mode_switch": 0.0, "loss_total": 0.16640152037143707, "step": 502 }, { "epoch": 0.2012, "grad_norm": 1.5095014572143555, "learning_rate": 9.250885079838079e-06, "loss": 0.2713, "step": 503 }, { "batch_size": 4, "epoch": 0.2012, "step": 503, "tokens_per_device": 2292 }, { "epoch": 0.2012, "loss_ce": 0.5042087435722351, "loss_lvr": 0.9572306871414185, "loss_mode_switch": 0.0, "loss_total": 0.5999318361282349, "step": 503 }, { "batch_size": 4, "epoch": 0.2012, "step": 503, "tokens_per_device": 1144 }, { "epoch": 0.2012, "loss_ce": 0.46810734272003174, "loss_lvr": 1.3146618604660034, "loss_mode_switch": 0.0, "loss_total": 0.59957355260849, "step": 503 }, { "batch_size": 4, "epoch": 0.2012, "step": 503, "tokens_per_device": 5868 }, { "epoch": 0.2012, "loss_ce": 0.4642769396305084, "loss_lvr": 0.9622887372970581, "loss_mode_switch": 0.0, "loss_total": 0.5605058073997498, "step": 503 }, { "batch_size": 4, "epoch": 0.2012, "step": 503, "tokens_per_device": 5716 }, { "epoch": 0.2012, "loss_ce": 0.36148032546043396, "loss_lvr": 0.792488157749176, "loss_mode_switch": 0.0, "loss_total": 0.44072914123535156, "step": 503 }, { "batch_size": 4, "epoch": 0.2012, "step": 503, "tokens_per_device": 5188 }, { "epoch": 0.2012, "loss_ce": 0.10264737904071808, "loss_lvr": 0.8414735794067383, "loss_mode_switch": 0.0, "loss_total": 0.1867947280406952, "step": 503 }, { "batch_size": 4, "epoch": 0.2012, "step": 503, "tokens_per_device": 12196 }, { "epoch": 0.2012, "loss_ce": 0.057053957134485245, "loss_lvr": 0.96598881483078, "loss_mode_switch": 0.0, "loss_total": 0.1536528468132019, "step": 503 }, { "batch_size": 4, "epoch": 0.2012, "step": 503, "tokens_per_device": 3900 }, { "epoch": 0.2012, "loss_ce": 0.43823927640914917, "loss_lvr": 1.172183871269226, "loss_mode_switch": 0.0, "loss_total": 0.5554576516151428, "step": 503 }, { "batch_size": 1, "epoch": 0.2012, "step": 503, "tokens_per_device": 5141 }, { "epoch": 0.2012, "loss_ce": 0.2185087949037552, "loss_lvr": 0.42428040504455566, "loss_mode_switch": 0.0, "loss_total": 0.26093682646751404, "step": 503 }, { "epoch": 0.2016, "grad_norm": 1.3971651792526245, "learning_rate": 9.24747112399997e-06, "loss": 0.339, "step": 504 }, { "batch_size": 4, "epoch": 0.2016, "step": 504, "tokens_per_device": 13848 }, { "epoch": 0.2016, "loss_ce": 0.2903361916542053, "loss_lvr": 0.5010702610015869, "loss_mode_switch": 0.0, "loss_total": 0.3404432237148285, "step": 504 }, { "batch_size": 4, "epoch": 0.2016, "step": 504, "tokens_per_device": 1200 }, { "epoch": 0.2016, "loss_ce": 0.5660575032234192, "loss_lvr": 1.2983875274658203, "loss_mode_switch": 0.0, "loss_total": 0.6958962678909302, "step": 504 }, { "batch_size": 1, "epoch": 0.2016, "step": 504, "tokens_per_device": 5255 }, { "epoch": 0.2016, "loss_ce": 0.03505463898181915, "loss_lvr": 0.6032100915908813, "loss_mode_switch": 0.0, "loss_total": 0.09537564963102341, "step": 504 }, { "batch_size": 4, "epoch": 0.2016, "step": 504, "tokens_per_device": 4120 }, { "epoch": 0.2016, "loss_ce": 0.005616697482764721, "loss_lvr": 0.8662779331207275, "loss_mode_switch": 0.0, "loss_total": 0.09224449098110199, "step": 504 }, { "batch_size": 4, "epoch": 0.2016, "step": 504, "tokens_per_device": 1208 }, { "epoch": 0.2016, "loss_ce": 0.4713974893093109, "loss_lvr": 0.9413467049598694, "loss_mode_switch": 0.0, "loss_total": 0.5655321478843689, "step": 504 }, { "batch_size": 1, "epoch": 0.2016, "step": 504, "tokens_per_device": 6881 }, { "epoch": 0.2016, "loss_ce": 0.0023975467775017023, "loss_lvr": 0.5976090431213379, "loss_mode_switch": 0.0, "loss_total": 0.062158454209566116, "step": 504 }, { "batch_size": 4, "epoch": 0.2016, "step": 504, "tokens_per_device": 8004 }, { "epoch": 0.2016, "loss_ce": 0.005124374758452177, "loss_lvr": 0.7712207436561584, "loss_mode_switch": 0.0, "loss_total": 0.08224645256996155, "step": 504 }, { "batch_size": 4, "epoch": 0.2016, "step": 504, "tokens_per_device": 2712 }, { "epoch": 0.2016, "loss_ce": 0.7292059063911438, "loss_lvr": 0.9633512496948242, "loss_mode_switch": 0.0, "loss_total": 0.8255410194396973, "step": 504 }, { "epoch": 0.202, "grad_norm": 1.3889329433441162, "learning_rate": 9.244050039522673e-06, "loss": 0.2898, "step": 505 }, { "batch_size": 1, "epoch": 0.202, "step": 505, "tokens_per_device": 6762 }, { "epoch": 0.202, "loss_ce": 0.0029293242841959, "loss_lvr": 0.477508544921875, "loss_mode_switch": 0.0, "loss_total": 0.05068018287420273, "step": 505 }, { "batch_size": 4, "epoch": 0.202, "step": 505, "tokens_per_device": 1640 }, { "epoch": 0.202, "loss_ce": 0.4278751015663147, "loss_lvr": 1.0343573093414307, "loss_mode_switch": 0.0, "loss_total": 0.5313108563423157, "step": 505 }, { "batch_size": 4, "epoch": 0.202, "step": 505, "tokens_per_device": 4208 }, { "epoch": 0.202, "loss_ce": 0.24099062383174896, "loss_lvr": 0.7619981169700623, "loss_mode_switch": 0.0, "loss_total": 0.3171904385089874, "step": 505 }, { "batch_size": 1, "epoch": 0.202, "step": 505, "tokens_per_device": 6659 }, { "epoch": 0.202, "loss_ce": 0.03788067027926445, "loss_lvr": 0.47888314723968506, "loss_mode_switch": 0.0, "loss_total": 0.08576898276805878, "step": 505 }, { "batch_size": 4, "epoch": 0.202, "step": 505, "tokens_per_device": 1776 }, { "epoch": 0.202, "loss_ce": 0.2479180544614792, "loss_lvr": 1.0429978370666504, "loss_mode_switch": 0.0, "loss_total": 0.3522178530693054, "step": 505 }, { "batch_size": 4, "epoch": 0.202, "step": 505, "tokens_per_device": 4924 }, { "epoch": 0.202, "loss_ce": 0.47983404994010925, "loss_lvr": 1.0944082736968994, "loss_mode_switch": 0.0, "loss_total": 0.5892748832702637, "step": 505 }, { "batch_size": 1, "epoch": 0.202, "step": 505, "tokens_per_device": 4957 }, { "epoch": 0.202, "loss_ce": 0.07559613138437271, "loss_lvr": 0.28233858942985535, "loss_mode_switch": 0.0, "loss_total": 0.1038299947977066, "step": 505 }, { "batch_size": 1, "epoch": 0.202, "step": 505, "tokens_per_device": 4900 }, { "epoch": 0.202, "loss_ce": 0.022154461592435837, "loss_lvr": 0.8639942407608032, "loss_mode_switch": 0.0, "loss_total": 0.10855388641357422, "step": 505 }, { "epoch": 0.2024, "grad_norm": 2.4471304416656494, "learning_rate": 9.24062183214788e-06, "loss": 0.2443, "step": 506 }, { "batch_size": 4, "epoch": 0.2024, "step": 506, "tokens_per_device": 1324 }, { "epoch": 0.2024, "loss_ce": 0.1744810938835144, "loss_lvr": 1.2895960807800293, "loss_mode_switch": 0.0, "loss_total": 0.3034406900405884, "step": 506 }, { "batch_size": 4, "epoch": 0.2024, "step": 506, "tokens_per_device": 8220 }, { "epoch": 0.2024, "loss_ce": 0.13130976259708405, "loss_lvr": 0.6542181968688965, "loss_mode_switch": 0.0, "loss_total": 0.1967315822839737, "step": 506 }, { "batch_size": 4, "epoch": 0.2024, "step": 506, "tokens_per_device": 6576 }, { "epoch": 0.2024, "loss_ce": 0.03887295722961426, "loss_lvr": 1.3168636560440063, "loss_mode_switch": 0.0, "loss_total": 0.1705593317747116, "step": 506 }, { "batch_size": 4, "epoch": 0.2024, "step": 506, "tokens_per_device": 3776 }, { "epoch": 0.2024, "loss_ce": 0.17034383118152618, "loss_lvr": 1.2718164920806885, "loss_mode_switch": 0.0, "loss_total": 0.29752546548843384, "step": 506 }, { "batch_size": 4, "epoch": 0.2024, "step": 506, "tokens_per_device": 12900 }, { "epoch": 0.2024, "loss_ce": 0.36055582761764526, "loss_lvr": 0.8205028176307678, "loss_mode_switch": 0.0, "loss_total": 0.442606121301651, "step": 506 }, { "batch_size": 1, "epoch": 0.2024, "step": 506, "tokens_per_device": 4747 }, { "epoch": 0.2024, "loss_ce": 0.10684973001480103, "loss_lvr": 0.7866368293762207, "loss_mode_switch": 0.0, "loss_total": 0.18551340699195862, "step": 506 }, { "batch_size": 4, "epoch": 0.2024, "step": 506, "tokens_per_device": 4268 }, { "epoch": 0.2024, "loss_ce": 0.34761306643486023, "loss_lvr": 1.0587546825408936, "loss_mode_switch": 0.0, "loss_total": 0.4534885287284851, "step": 506 }, { "batch_size": 4, "epoch": 0.2024, "step": 506, "tokens_per_device": 1584 }, { "epoch": 0.2024, "loss_ce": 0.30389466881752014, "loss_lvr": 0.946862518787384, "loss_mode_switch": 0.0, "loss_total": 0.3985809087753296, "step": 506 }, { "epoch": 0.2028, "grad_norm": 1.3750110864639282, "learning_rate": 9.237186507629236e-06, "loss": 0.3035, "step": 507 }, { "batch_size": 1, "epoch": 0.2028, "step": 507, "tokens_per_device": 4853 }, { "epoch": 0.2028, "loss_ce": 0.010464459657669067, "loss_lvr": 0.6358463168144226, "loss_mode_switch": 0.0, "loss_total": 0.07404909282922745, "step": 507 }, { "batch_size": 4, "epoch": 0.2028, "step": 507, "tokens_per_device": 6656 }, { "epoch": 0.2028, "loss_ce": 0.16202157735824585, "loss_lvr": 1.1596741676330566, "loss_mode_switch": 0.0, "loss_total": 0.277989000082016, "step": 507 }, { "batch_size": 1, "epoch": 0.2028, "step": 507, "tokens_per_device": 4889 }, { "epoch": 0.2028, "loss_ce": 0.006109260953962803, "loss_lvr": 0.49566853046417236, "loss_mode_switch": 0.0, "loss_total": 0.055676113814115524, "step": 507 }, { "batch_size": 4, "epoch": 0.2028, "step": 507, "tokens_per_device": 5036 }, { "epoch": 0.2028, "loss_ce": 0.018557868897914886, "loss_lvr": 0.775416374206543, "loss_mode_switch": 0.0, "loss_total": 0.09609951078891754, "step": 507 }, { "batch_size": 1, "epoch": 0.2028, "step": 507, "tokens_per_device": 4981 }, { "epoch": 0.2028, "loss_ce": 0.002212580991908908, "loss_lvr": 0.253828227519989, "loss_mode_switch": 0.0, "loss_total": 0.02759540267288685, "step": 507 }, { "batch_size": 1, "epoch": 0.2028, "step": 507, "tokens_per_device": 6212 }, { "epoch": 0.2028, "loss_ce": 0.04444754123687744, "loss_lvr": 0.8005296587944031, "loss_mode_switch": 0.0, "loss_total": 0.12450050562620163, "step": 507 }, { "batch_size": 4, "epoch": 0.2028, "step": 507, "tokens_per_device": 4112 }, { "epoch": 0.2028, "loss_ce": 0.2469208538532257, "loss_lvr": 0.9042452573776245, "loss_mode_switch": 0.0, "loss_total": 0.3373453915119171, "step": 507 }, { "batch_size": 1, "epoch": 0.2028, "step": 507, "tokens_per_device": 4875 }, { "epoch": 0.2028, "loss_ce": 0.02921219915151596, "loss_lvr": 0.23924988508224487, "loss_mode_switch": 0.0, "loss_total": 0.05313719063997269, "step": 507 }, { "epoch": 0.2032, "grad_norm": 1.3716332912445068, "learning_rate": 9.23374407173234e-06, "loss": 0.2913, "step": 508 }, { "batch_size": 1, "epoch": 0.2032, "step": 508, "tokens_per_device": 4884 }, { "epoch": 0.2032, "loss_ce": 0.4371338188648224, "loss_lvr": 1.0951658487319946, "loss_mode_switch": 0.0, "loss_total": 0.5466504096984863, "step": 508 }, { "batch_size": 1, "epoch": 0.2032, "step": 508, "tokens_per_device": 4806 }, { "epoch": 0.2032, "loss_ce": 1.9266505241394043, "loss_lvr": 0.4019216001033783, "loss_mode_switch": 0.0, "loss_total": 1.9668426513671875, "step": 508 }, { "batch_size": 4, "epoch": 0.2032, "step": 508, "tokens_per_device": 5716 }, { "epoch": 0.2032, "loss_ce": 0.443663626909256, "loss_lvr": 1.0957063436508179, "loss_mode_switch": 0.0, "loss_total": 0.5532342791557312, "step": 508 }, { "batch_size": 1, "epoch": 0.2032, "step": 508, "tokens_per_device": 4904 }, { "epoch": 0.2032, "loss_ce": 0.03260349482297897, "loss_lvr": 0.4680478274822235, "loss_mode_switch": 0.0, "loss_total": 0.07940827310085297, "step": 508 }, { "batch_size": 4, "epoch": 0.2032, "step": 508, "tokens_per_device": 3804 }, { "epoch": 0.2032, "loss_ce": 0.2292773723602295, "loss_lvr": 0.8700257539749146, "loss_mode_switch": 0.0, "loss_total": 0.31627994775772095, "step": 508 }, { "batch_size": 4, "epoch": 0.2032, "step": 508, "tokens_per_device": 4236 }, { "epoch": 0.2032, "loss_ce": 0.5718015432357788, "loss_lvr": 1.109330177307129, "loss_mode_switch": 0.0, "loss_total": 0.6827345490455627, "step": 508 }, { "batch_size": 1, "epoch": 0.2032, "step": 508, "tokens_per_device": 4662 }, { "epoch": 0.2032, "loss_ce": 0.008283356204628944, "loss_lvr": 0.7875691652297974, "loss_mode_switch": 0.0, "loss_total": 0.08704027533531189, "step": 508 }, { "batch_size": 4, "epoch": 0.2032, "step": 508, "tokens_per_device": 2704 }, { "epoch": 0.2032, "loss_ce": 0.41017088294029236, "loss_lvr": 0.8464111685752869, "loss_mode_switch": 0.0, "loss_total": 0.49481201171875, "step": 508 }, { "epoch": 0.2036, "grad_norm": 1.5775911808013916, "learning_rate": 9.230294530234714e-06, "loss": 0.3605, "step": 509 }, { "batch_size": 1, "epoch": 0.2036, "step": 509, "tokens_per_device": 4907 }, { "epoch": 0.2036, "loss_ce": 1.0024936199188232, "loss_lvr": 1.0593798160552979, "loss_mode_switch": 0.0, "loss_total": 1.1084315776824951, "step": 509 }, { "batch_size": 4, "epoch": 0.2036, "step": 509, "tokens_per_device": 2588 }, { "epoch": 0.2036, "loss_ce": 0.24333685636520386, "loss_lvr": 1.1369116306304932, "loss_mode_switch": 0.0, "loss_total": 0.3570280075073242, "step": 509 }, { "batch_size": 1, "epoch": 0.2036, "step": 509, "tokens_per_device": 5171 }, { "epoch": 0.2036, "loss_ce": 0.13701532781124115, "loss_lvr": 0.7243859171867371, "loss_mode_switch": 0.0, "loss_total": 0.20945391058921814, "step": 509 }, { "batch_size": 4, "epoch": 0.2036, "step": 509, "tokens_per_device": 4256 }, { "epoch": 0.2036, "loss_ce": 0.24937991797924042, "loss_lvr": 0.8656644821166992, "loss_mode_switch": 0.0, "loss_total": 0.33594638109207153, "step": 509 }, { "batch_size": 1, "epoch": 0.2036, "step": 509, "tokens_per_device": 4835 }, { "epoch": 0.2036, "loss_ce": 0.012521863915026188, "loss_lvr": 0.5514098405838013, "loss_mode_switch": 0.0, "loss_total": 0.06766285002231598, "step": 509 }, { "batch_size": 1, "epoch": 0.2036, "step": 509, "tokens_per_device": 4972 }, { "epoch": 0.2036, "loss_ce": 0.09736835956573486, "loss_lvr": 0.6337135434150696, "loss_mode_switch": 0.0, "loss_total": 0.1607397198677063, "step": 509 }, { "batch_size": 4, "epoch": 0.2036, "step": 509, "tokens_per_device": 6272 }, { "epoch": 0.2036, "loss_ce": 0.43094921112060547, "loss_lvr": 0.884523868560791, "loss_mode_switch": 0.0, "loss_total": 0.5194016098976135, "step": 509 }, { "batch_size": 1, "epoch": 0.2036, "step": 509, "tokens_per_device": 5105 }, { "epoch": 0.2036, "loss_ce": 0.0010797043796628714, "loss_lvr": 0.3595178425312042, "loss_mode_switch": 0.0, "loss_total": 0.03703149035573006, "step": 509 }, { "epoch": 0.204, "grad_norm": 1.5360291004180908, "learning_rate": 9.226837888925813e-06, "loss": 0.3389, "step": 510 }, { "batch_size": 4, "epoch": 0.204, "step": 510, "tokens_per_device": 3756 }, { "epoch": 0.204, "loss_ce": 0.5520115494728088, "loss_lvr": 1.1877251863479614, "loss_mode_switch": 0.0, "loss_total": 0.670784056186676, "step": 510 }, { "batch_size": 4, "epoch": 0.204, "step": 510, "tokens_per_device": 2600 }, { "epoch": 0.204, "loss_ce": 0.41452205181121826, "loss_lvr": 0.9574128985404968, "loss_mode_switch": 0.0, "loss_total": 0.5102633237838745, "step": 510 }, { "batch_size": 4, "epoch": 0.204, "step": 510, "tokens_per_device": 4300 }, { "epoch": 0.204, "loss_ce": 0.25902390480041504, "loss_lvr": 0.6827309727668762, "loss_mode_switch": 0.0, "loss_total": 0.32729700207710266, "step": 510 }, { "batch_size": 4, "epoch": 0.204, "step": 510, "tokens_per_device": 1500 }, { "epoch": 0.204, "loss_ce": 0.3691423237323761, "loss_lvr": 1.1728686094284058, "loss_mode_switch": 0.0, "loss_total": 0.4864291846752167, "step": 510 }, { "batch_size": 1, "epoch": 0.204, "step": 510, "tokens_per_device": 5355 }, { "epoch": 0.204, "loss_ce": 0.0952305942773819, "loss_lvr": 0.5613808631896973, "loss_mode_switch": 0.0, "loss_total": 0.15136867761611938, "step": 510 }, { "batch_size": 4, "epoch": 0.204, "step": 510, "tokens_per_device": 10544 }, { "epoch": 0.204, "loss_ce": 0.8837463855743408, "loss_lvr": 0.8889150023460388, "loss_mode_switch": 0.0, "loss_total": 0.9726378917694092, "step": 510 }, { "batch_size": 1, "epoch": 0.204, "step": 510, "tokens_per_device": 5174 }, { "epoch": 0.204, "loss_ce": 0.008485090918838978, "loss_lvr": 0.4112209677696228, "loss_mode_switch": 0.0, "loss_total": 0.04960718750953674, "step": 510 }, { "batch_size": 4, "epoch": 0.204, "step": 510, "tokens_per_device": 4000 }, { "epoch": 0.204, "loss_ce": 0.1360071301460266, "loss_lvr": 0.8461942672729492, "loss_mode_switch": 0.0, "loss_total": 0.220626562833786, "step": 510 }, { "epoch": 0.2044, "grad_norm": 1.8519724607467651, "learning_rate": 9.22337415360701e-06, "loss": 0.354, "step": 511 }, { "batch_size": 4, "epoch": 0.2044, "step": 511, "tokens_per_device": 4368 }, { "epoch": 0.2044, "loss_ce": 0.22001904249191284, "loss_lvr": 0.9272263646125793, "loss_mode_switch": 0.0, "loss_total": 0.3127416968345642, "step": 511 }, { "batch_size": 1, "epoch": 0.2044, "step": 511, "tokens_per_device": 4267 }, { "epoch": 0.2044, "loss_ce": 0.26467129588127136, "loss_lvr": 0.5991716384887695, "loss_mode_switch": 0.0, "loss_total": 0.32458844780921936, "step": 511 }, { "batch_size": 4, "epoch": 0.2044, "step": 511, "tokens_per_device": 3872 }, { "epoch": 0.2044, "loss_ce": 0.07965672761201859, "loss_lvr": 1.2705060243606567, "loss_mode_switch": 0.0, "loss_total": 0.20670732855796814, "step": 511 }, { "batch_size": 4, "epoch": 0.2044, "step": 511, "tokens_per_device": 4416 }, { "epoch": 0.2044, "loss_ce": 0.07909298688173294, "loss_lvr": 1.038643479347229, "loss_mode_switch": 0.0, "loss_total": 0.18295733630657196, "step": 511 }, { "batch_size": 4, "epoch": 0.2044, "step": 511, "tokens_per_device": 5816 }, { "epoch": 0.2044, "loss_ce": 0.514521598815918, "loss_lvr": 0.8629325032234192, "loss_mode_switch": 0.0, "loss_total": 0.6008148193359375, "step": 511 }, { "batch_size": 1, "epoch": 0.2044, "step": 511, "tokens_per_device": 4621 }, { "epoch": 0.2044, "loss_ce": 0.008974405005574226, "loss_lvr": 0.40121927857398987, "loss_mode_switch": 0.0, "loss_total": 0.049096331000328064, "step": 511 }, { "batch_size": 4, "epoch": 0.2044, "step": 511, "tokens_per_device": 2332 }, { "epoch": 0.2044, "loss_ce": 0.07726338505744934, "loss_lvr": 1.0715161561965942, "loss_mode_switch": 0.0, "loss_total": 0.18441501259803772, "step": 511 }, { "batch_size": 1, "epoch": 0.2044, "step": 511, "tokens_per_device": 4947 }, { "epoch": 0.2044, "loss_ce": 0.0951143354177475, "loss_lvr": 0.40281713008880615, "loss_mode_switch": 0.0, "loss_total": 0.1353960484266281, "step": 511 }, { "epoch": 0.2048, "grad_norm": 1.3112318515777588, "learning_rate": 9.219903330091575e-06, "loss": 0.2797, "step": 512 }, { "batch_size": 1, "epoch": 0.2048, "step": 512, "tokens_per_device": 5119 }, { "epoch": 0.2048, "loss_ce": 0.05521770194172859, "loss_lvr": 1.1644320487976074, "loss_mode_switch": 0.0, "loss_total": 0.171660915017128, "step": 512 }, { "batch_size": 4, "epoch": 0.2048, "step": 512, "tokens_per_device": 4196 }, { "epoch": 0.2048, "loss_ce": 0.27144673466682434, "loss_lvr": 0.9152243137359619, "loss_mode_switch": 0.0, "loss_total": 0.36296916007995605, "step": 512 }, { "batch_size": 4, "epoch": 0.2048, "step": 512, "tokens_per_device": 4176 }, { "epoch": 0.2048, "loss_ce": 0.6903262138366699, "loss_lvr": 1.111145257949829, "loss_mode_switch": 0.0, "loss_total": 0.8014407157897949, "step": 512 }, { "batch_size": 1, "epoch": 0.2048, "step": 512, "tokens_per_device": 5126 }, { "epoch": 0.2048, "loss_ce": 0.0034685679711401463, "loss_lvr": 0.7072504758834839, "loss_mode_switch": 0.0, "loss_total": 0.07419361174106598, "step": 512 }, { "batch_size": 4, "epoch": 0.2048, "step": 512, "tokens_per_device": 1372 }, { "epoch": 0.2048, "loss_ce": 0.20564092695713043, "loss_lvr": 1.0262296199798584, "loss_mode_switch": 0.0, "loss_total": 0.308263897895813, "step": 512 }, { "batch_size": 4, "epoch": 0.2048, "step": 512, "tokens_per_device": 3772 }, { "epoch": 0.2048, "loss_ce": 0.08468055725097656, "loss_lvr": 0.9246222376823425, "loss_mode_switch": 0.0, "loss_total": 0.17714278399944305, "step": 512 }, { "batch_size": 1, "epoch": 0.2048, "step": 512, "tokens_per_device": 7115 }, { "epoch": 0.2048, "loss_ce": 1.1861109733581543, "loss_lvr": 0.6176573038101196, "loss_mode_switch": 0.0, "loss_total": 1.2478766441345215, "step": 512 }, { "batch_size": 4, "epoch": 0.2048, "step": 512, "tokens_per_device": 2660 }, { "epoch": 0.2048, "loss_ce": 0.16405479609966278, "loss_lvr": 0.8568686842918396, "loss_mode_switch": 0.0, "loss_total": 0.24974167346954346, "step": 512 }, { "epoch": 0.2052, "grad_norm": 1.6459358930587769, "learning_rate": 9.216425424204683e-06, "loss": 0.3388, "step": 513 }, { "batch_size": 1, "epoch": 0.2052, "step": 513, "tokens_per_device": 4872 }, { "epoch": 0.2052, "loss_ce": 0.008598372340202332, "loss_lvr": 0.47657671570777893, "loss_mode_switch": 0.0, "loss_total": 0.056256044656038284, "step": 513 }, { "batch_size": 1, "epoch": 0.2052, "step": 513, "tokens_per_device": 5110 }, { "epoch": 0.2052, "loss_ce": 0.012722262181341648, "loss_lvr": 0.6063455939292908, "loss_mode_switch": 0.0, "loss_total": 0.07335682213306427, "step": 513 }, { "batch_size": 4, "epoch": 0.2052, "step": 513, "tokens_per_device": 4840 }, { "epoch": 0.2052, "loss_ce": 0.15794725716114044, "loss_lvr": 1.686707854270935, "loss_mode_switch": 0.0, "loss_total": 0.3266180455684662, "step": 513 }, { "batch_size": 4, "epoch": 0.2052, "step": 513, "tokens_per_device": 5744 }, { "epoch": 0.2052, "loss_ce": 0.02441985346376896, "loss_lvr": 0.869867742061615, "loss_mode_switch": 0.0, "loss_total": 0.11140662431716919, "step": 513 }, { "batch_size": 4, "epoch": 0.2052, "step": 513, "tokens_per_device": 3980 }, { "epoch": 0.2052, "loss_ce": 0.008583401329815388, "loss_lvr": 0.8874467015266418, "loss_mode_switch": 0.0, "loss_total": 0.0973280742764473, "step": 513 }, { "batch_size": 1, "epoch": 0.2052, "step": 513, "tokens_per_device": 4736 }, { "epoch": 0.2052, "loss_ce": 0.009995691478252411, "loss_lvr": 0.42168933153152466, "loss_mode_switch": 0.0, "loss_total": 0.052164625376462936, "step": 513 }, { "batch_size": 4, "epoch": 0.2052, "step": 513, "tokens_per_device": 4484 }, { "epoch": 0.2052, "loss_ce": 0.15712398290634155, "loss_lvr": 0.9181967973709106, "loss_mode_switch": 0.0, "loss_total": 0.24894365668296814, "step": 513 }, { "batch_size": 4, "epoch": 0.2052, "step": 513, "tokens_per_device": 6544 }, { "epoch": 0.2052, "loss_ce": 0.28734785318374634, "loss_lvr": 0.8844327926635742, "loss_mode_switch": 0.0, "loss_total": 0.37579113245010376, "step": 513 }, { "epoch": 0.2056, "grad_norm": 1.5459253787994385, "learning_rate": 9.212940441783392e-06, "loss": 0.3448, "step": 514 }, { "batch_size": 4, "epoch": 0.2056, "step": 514, "tokens_per_device": 6336 }, { "epoch": 0.2056, "loss_ce": 0.1665116548538208, "loss_lvr": 0.7402763962745667, "loss_mode_switch": 0.0, "loss_total": 0.2405392974615097, "step": 514 }, { "batch_size": 4, "epoch": 0.2056, "step": 514, "tokens_per_device": 3424 }, { "epoch": 0.2056, "loss_ce": 0.3624576926231384, "loss_lvr": 0.7744241952896118, "loss_mode_switch": 0.0, "loss_total": 0.43990010023117065, "step": 514 }, { "batch_size": 4, "epoch": 0.2056, "step": 514, "tokens_per_device": 6480 }, { "epoch": 0.2056, "loss_ce": 0.30596086382865906, "loss_lvr": 1.036285638809204, "loss_mode_switch": 0.0, "loss_total": 0.4095894396305084, "step": 514 }, { "batch_size": 4, "epoch": 0.2056, "step": 514, "tokens_per_device": 2300 }, { "epoch": 0.2056, "loss_ce": 0.40510523319244385, "loss_lvr": 1.7329946756362915, "loss_mode_switch": 0.0, "loss_total": 0.5784047245979309, "step": 514 }, { "batch_size": 4, "epoch": 0.2056, "step": 514, "tokens_per_device": 5800 }, { "epoch": 0.2056, "loss_ce": 0.225547656416893, "loss_lvr": 1.558437705039978, "loss_mode_switch": 0.0, "loss_total": 0.3813914358615875, "step": 514 }, { "batch_size": 1, "epoch": 0.2056, "step": 514, "tokens_per_device": 5068 }, { "epoch": 0.2056, "loss_ce": 0.008609620854258537, "loss_lvr": 0.40494009852409363, "loss_mode_switch": 0.0, "loss_total": 0.04910363256931305, "step": 514 }, { "batch_size": 4, "epoch": 0.2056, "step": 514, "tokens_per_device": 4268 }, { "epoch": 0.2056, "loss_ce": 0.05522420257329941, "loss_lvr": 0.9317993521690369, "loss_mode_switch": 0.0, "loss_total": 0.14840413630008698, "step": 514 }, { "batch_size": 1, "epoch": 0.2056, "step": 514, "tokens_per_device": 5112 }, { "epoch": 0.2056, "loss_ce": 0.0038840577472001314, "loss_lvr": 0.4122767746448517, "loss_mode_switch": 0.0, "loss_total": 0.04511173814535141, "step": 514 }, { "epoch": 0.206, "grad_norm": 1.5013066530227661, "learning_rate": 9.209448388676636e-06, "loss": 0.3708, "step": 515 }, { "batch_size": 1, "epoch": 0.206, "step": 515, "tokens_per_device": 4772 }, { "epoch": 0.206, "loss_ce": 0.05065714567899704, "loss_lvr": 0.6560413837432861, "loss_mode_switch": 0.0, "loss_total": 0.11626128852367401, "step": 515 }, { "batch_size": 1, "epoch": 0.206, "step": 515, "tokens_per_device": 5119 }, { "epoch": 0.206, "loss_ce": 0.011554964818060398, "loss_lvr": 0.42065566778182983, "loss_mode_switch": 0.0, "loss_total": 0.053620532155036926, "step": 515 }, { "batch_size": 4, "epoch": 0.206, "step": 515, "tokens_per_device": 5368 }, { "epoch": 0.206, "loss_ce": 0.09315852075815201, "loss_lvr": 0.8958858251571655, "loss_mode_switch": 0.0, "loss_total": 0.18274709582328796, "step": 515 }, { "batch_size": 4, "epoch": 0.206, "step": 515, "tokens_per_device": 3836 }, { "epoch": 0.206, "loss_ce": 0.08175760507583618, "loss_lvr": 1.0738779306411743, "loss_mode_switch": 0.0, "loss_total": 0.18914540112018585, "step": 515 }, { "batch_size": 4, "epoch": 0.206, "step": 515, "tokens_per_device": 2920 }, { "epoch": 0.206, "loss_ce": 0.6609625220298767, "loss_lvr": 0.9966858625411987, "loss_mode_switch": 0.0, "loss_total": 0.7606310844421387, "step": 515 }, { "batch_size": 4, "epoch": 0.206, "step": 515, "tokens_per_device": 14732 }, { "epoch": 0.206, "loss_ce": 0.28281837701797485, "loss_lvr": 0.40704813599586487, "loss_mode_switch": 0.0, "loss_total": 0.3235231935977936, "step": 515 }, { "batch_size": 4, "epoch": 0.206, "step": 515, "tokens_per_device": 2372 }, { "epoch": 0.206, "loss_ce": 0.6910081505775452, "loss_lvr": 0.7146845459938049, "loss_mode_switch": 0.0, "loss_total": 0.7624766230583191, "step": 515 }, { "batch_size": 1, "epoch": 0.206, "step": 515, "tokens_per_device": 5139 }, { "epoch": 0.206, "loss_ce": 0.0005011982284486294, "loss_lvr": 0.3996478021144867, "loss_mode_switch": 0.0, "loss_total": 0.04046597704291344, "step": 515 }, { "epoch": 0.2064, "grad_norm": 1.3211957216262817, "learning_rate": 9.205949270745217e-06, "loss": 0.3063, "step": 516 }, { "batch_size": 4, "epoch": 0.2064, "step": 516, "tokens_per_device": 4392 }, { "epoch": 0.2064, "loss_ce": 0.02440945990383625, "loss_lvr": 1.065220832824707, "loss_mode_switch": 0.0, "loss_total": 0.1309315413236618, "step": 516 }, { "batch_size": 4, "epoch": 0.2064, "step": 516, "tokens_per_device": 4012 }, { "epoch": 0.2064, "loss_ce": 0.5203127264976501, "loss_lvr": 0.6697571277618408, "loss_mode_switch": 0.0, "loss_total": 0.5872884392738342, "step": 516 }, { "batch_size": 4, "epoch": 0.2064, "step": 516, "tokens_per_device": 4224 }, { "epoch": 0.2064, "loss_ce": 0.256807416677475, "loss_lvr": 1.0706098079681396, "loss_mode_switch": 0.0, "loss_total": 0.3638684153556824, "step": 516 }, { "batch_size": 1, "epoch": 0.2064, "step": 516, "tokens_per_device": 4913 }, { "epoch": 0.2064, "loss_ce": 0.05960044637322426, "loss_lvr": 0.403656542301178, "loss_mode_switch": 0.0, "loss_total": 0.09996610134840012, "step": 516 }, { "batch_size": 4, "epoch": 0.2064, "step": 516, "tokens_per_device": 7512 }, { "epoch": 0.2064, "loss_ce": 0.39322859048843384, "loss_lvr": 0.7207303643226624, "loss_mode_switch": 0.0, "loss_total": 0.46530163288116455, "step": 516 }, { "batch_size": 1, "epoch": 0.2064, "step": 516, "tokens_per_device": 5007 }, { "epoch": 0.2064, "loss_ce": 0.00673125684261322, "loss_lvr": 0.6590403318405151, "loss_mode_switch": 0.0, "loss_total": 0.07263529300689697, "step": 516 }, { "batch_size": 4, "epoch": 0.2064, "step": 516, "tokens_per_device": 3740 }, { "epoch": 0.2064, "loss_ce": 0.9454119205474854, "loss_lvr": 0.9522229433059692, "loss_mode_switch": 0.0, "loss_total": 1.0406341552734375, "step": 516 }, { "batch_size": 4, "epoch": 0.2064, "step": 516, "tokens_per_device": 1316 }, { "epoch": 0.2064, "loss_ce": 0.47999897599220276, "loss_lvr": 1.0906307697296143, "loss_mode_switch": 0.0, "loss_total": 0.5890620350837708, "step": 516 }, { "epoch": 0.2068, "grad_norm": 1.3545690774917603, "learning_rate": 9.202443093861796e-06, "loss": 0.3519, "step": 517 }, { "batch_size": 4, "epoch": 0.2068, "step": 517, "tokens_per_device": 4244 }, { "epoch": 0.2068, "loss_ce": 0.8104988932609558, "loss_lvr": 1.134488821029663, "loss_mode_switch": 0.0, "loss_total": 0.9239477515220642, "step": 517 }, { "batch_size": 4, "epoch": 0.2068, "step": 517, "tokens_per_device": 2264 }, { "epoch": 0.2068, "loss_ce": 0.28590822219848633, "loss_lvr": 1.105151653289795, "loss_mode_switch": 0.0, "loss_total": 0.3964233994483948, "step": 517 }, { "batch_size": 4, "epoch": 0.2068, "step": 517, "tokens_per_device": 1496 }, { "epoch": 0.2068, "loss_ce": 0.08841673284769058, "loss_lvr": 0.9730450510978699, "loss_mode_switch": 0.0, "loss_total": 0.1857212483882904, "step": 517 }, { "batch_size": 4, "epoch": 0.2068, "step": 517, "tokens_per_device": 4232 }, { "epoch": 0.2068, "loss_ce": 0.0893009677529335, "loss_lvr": 0.8388968110084534, "loss_mode_switch": 0.0, "loss_total": 0.1731906533241272, "step": 517 }, { "batch_size": 4, "epoch": 0.2068, "step": 517, "tokens_per_device": 4640 }, { "epoch": 0.2068, "loss_ce": 0.4646943211555481, "loss_lvr": 0.9109672904014587, "loss_mode_switch": 0.0, "loss_total": 0.5557910203933716, "step": 517 }, { "batch_size": 4, "epoch": 0.2068, "step": 517, "tokens_per_device": 4472 }, { "epoch": 0.2068, "loss_ce": 0.1905970275402069, "loss_lvr": 0.8829741477966309, "loss_mode_switch": 0.0, "loss_total": 0.27889445424079895, "step": 517 }, { "batch_size": 4, "epoch": 0.2068, "step": 517, "tokens_per_device": 5704 }, { "epoch": 0.2068, "loss_ce": 0.20463937520980835, "loss_lvr": 0.989646315574646, "loss_mode_switch": 0.0, "loss_total": 0.30360400676727295, "step": 517 }, { "batch_size": 1, "epoch": 0.2068, "step": 517, "tokens_per_device": 4456 }, { "epoch": 0.2068, "loss_ce": 0.26682955026626587, "loss_lvr": 0.5944725275039673, "loss_mode_switch": 0.0, "loss_total": 0.3262768089771271, "step": 517 }, { "epoch": 0.2072, "grad_norm": 1.2494502067565918, "learning_rate": 9.198929863910874e-06, "loss": 0.3113, "step": 518 }, { "batch_size": 4, "epoch": 0.2072, "step": 518, "tokens_per_device": 2504 }, { "epoch": 0.2072, "loss_ce": 0.22544291615486145, "loss_lvr": 0.9842544198036194, "loss_mode_switch": 0.0, "loss_total": 0.32386836409568787, "step": 518 }, { "batch_size": 4, "epoch": 0.2072, "step": 518, "tokens_per_device": 4332 }, { "epoch": 0.2072, "loss_ce": 0.571207582950592, "loss_lvr": 0.8611791133880615, "loss_mode_switch": 0.0, "loss_total": 0.6573255062103271, "step": 518 }, { "batch_size": 1, "epoch": 0.2072, "step": 518, "tokens_per_device": 4922 }, { "epoch": 0.2072, "loss_ce": 0.08375602215528488, "loss_lvr": 0.571824848651886, "loss_mode_switch": 0.0, "loss_total": 0.14093850553035736, "step": 518 }, { "batch_size": 4, "epoch": 0.2072, "step": 518, "tokens_per_device": 4832 }, { "epoch": 0.2072, "loss_ce": 0.11804850399494171, "loss_lvr": 0.7923516035079956, "loss_mode_switch": 0.0, "loss_total": 0.19728365540504456, "step": 518 }, { "batch_size": 1, "epoch": 0.2072, "step": 518, "tokens_per_device": 5071 }, { "epoch": 0.2072, "loss_ce": 0.007727321237325668, "loss_lvr": 0.4637988805770874, "loss_mode_switch": 0.0, "loss_total": 0.05410721153020859, "step": 518 }, { "batch_size": 4, "epoch": 0.2072, "step": 518, "tokens_per_device": 1824 }, { "epoch": 0.2072, "loss_ce": 0.17690768837928772, "loss_lvr": 0.7331905961036682, "loss_mode_switch": 0.0, "loss_total": 0.2502267360687256, "step": 518 }, { "batch_size": 1, "epoch": 0.2072, "step": 518, "tokens_per_device": 4902 }, { "epoch": 0.2072, "loss_ce": 0.0014011369785293937, "loss_lvr": 0.46797001361846924, "loss_mode_switch": 0.0, "loss_total": 0.048198141157627106, "step": 518 }, { "batch_size": 4, "epoch": 0.2072, "step": 518, "tokens_per_device": 4212 }, { "epoch": 0.2072, "loss_ce": 0.1253650039434433, "loss_lvr": 1.021071434020996, "loss_mode_switch": 0.0, "loss_total": 0.22747215628623962, "step": 518 }, { "epoch": 0.2076, "grad_norm": 1.4162758588790894, "learning_rate": 9.1954095867888e-06, "loss": 0.316, "step": 519 }, { "batch_size": 4, "epoch": 0.2076, "step": 519, "tokens_per_device": 11316 }, { "epoch": 0.2076, "loss_ce": 0.12297987192869186, "loss_lvr": 0.938609778881073, "loss_mode_switch": 0.0, "loss_total": 0.21684084832668304, "step": 519 }, { "batch_size": 4, "epoch": 0.2076, "step": 519, "tokens_per_device": 6536 }, { "epoch": 0.2076, "loss_ce": 0.4199436604976654, "loss_lvr": 0.7979769110679626, "loss_mode_switch": 0.0, "loss_total": 0.4997413456439972, "step": 519 }, { "batch_size": 1, "epoch": 0.2076, "step": 519, "tokens_per_device": 5017 }, { "epoch": 0.2076, "loss_ce": 0.024937519803643227, "loss_lvr": 0.7859372496604919, "loss_mode_switch": 0.0, "loss_total": 0.10353124141693115, "step": 519 }, { "batch_size": 1, "epoch": 0.2076, "step": 519, "tokens_per_device": 4771 }, { "epoch": 0.2076, "loss_ce": 0.0033009620383381844, "loss_lvr": 0.35960549116134644, "loss_mode_switch": 0.0, "loss_total": 0.03926151245832443, "step": 519 }, { "batch_size": 4, "epoch": 0.2076, "step": 519, "tokens_per_device": 4264 }, { "epoch": 0.2076, "loss_ce": 0.10623683780431747, "loss_lvr": 0.885568380355835, "loss_mode_switch": 0.0, "loss_total": 0.1947936713695526, "step": 519 }, { "batch_size": 1, "epoch": 0.2076, "step": 519, "tokens_per_device": 4886 }, { "epoch": 0.2076, "loss_ce": 0.007637615781277418, "loss_lvr": 0.7403693199157715, "loss_mode_switch": 0.0, "loss_total": 0.08167454600334167, "step": 519 }, { "batch_size": 4, "epoch": 0.2076, "step": 519, "tokens_per_device": 1384 }, { "epoch": 0.2076, "loss_ce": 0.4439295828342438, "loss_lvr": 1.5691277980804443, "loss_mode_switch": 0.0, "loss_total": 0.6008423566818237, "step": 519 }, { "batch_size": 4, "epoch": 0.2076, "step": 519, "tokens_per_device": 1376 }, { "epoch": 0.2076, "loss_ce": 0.5167747139930725, "loss_lvr": 1.1595338582992554, "loss_mode_switch": 0.0, "loss_total": 0.632728099822998, "step": 519 }, { "epoch": 0.208, "grad_norm": 1.3415123224258423, "learning_rate": 9.191882268403743e-06, "loss": 0.3187, "step": 520 }, { "batch_size": 4, "epoch": 0.208, "step": 520, "tokens_per_device": 2700 }, { "epoch": 0.208, "loss_ce": 0.3066801428794861, "loss_lvr": 0.9674352407455444, "loss_mode_switch": 0.0, "loss_total": 0.4034236669540405, "step": 520 }, { "batch_size": 4, "epoch": 0.208, "step": 520, "tokens_per_device": 1608 }, { "epoch": 0.208, "loss_ce": 0.1473962962627411, "loss_lvr": 1.1586439609527588, "loss_mode_switch": 0.0, "loss_total": 0.26326069235801697, "step": 520 }, { "batch_size": 4, "epoch": 0.208, "step": 520, "tokens_per_device": 5764 }, { "epoch": 0.208, "loss_ce": 0.24113331735134125, "loss_lvr": 1.184431552886963, "loss_mode_switch": 0.0, "loss_total": 0.3595764636993408, "step": 520 }, { "batch_size": 1, "epoch": 0.208, "step": 520, "tokens_per_device": 4878 }, { "epoch": 0.208, "loss_ce": 0.36031419038772583, "loss_lvr": 1.1802258491516113, "loss_mode_switch": 0.0, "loss_total": 0.47833678126335144, "step": 520 }, { "batch_size": 1, "epoch": 0.208, "step": 520, "tokens_per_device": 4885 }, { "epoch": 0.208, "loss_ce": 0.0018576213624328375, "loss_lvr": 0.3845606744289398, "loss_mode_switch": 0.0, "loss_total": 0.040313687175512314, "step": 520 }, { "batch_size": 4, "epoch": 0.208, "step": 520, "tokens_per_device": 5756 }, { "epoch": 0.208, "loss_ce": 0.09751984477043152, "loss_lvr": 0.8349987864494324, "loss_mode_switch": 0.0, "loss_total": 0.18101972341537476, "step": 520 }, { "batch_size": 4, "epoch": 0.208, "step": 520, "tokens_per_device": 1296 }, { "epoch": 0.208, "loss_ce": 0.09012822061777115, "loss_lvr": 1.058490514755249, "loss_mode_switch": 0.0, "loss_total": 0.19597727060317993, "step": 520 }, { "batch_size": 4, "epoch": 0.208, "step": 520, "tokens_per_device": 3792 }, { "epoch": 0.208, "loss_ce": 0.21012912690639496, "loss_lvr": 1.0444226264953613, "loss_mode_switch": 0.0, "loss_total": 0.3145713806152344, "step": 520 }, { "epoch": 0.2084, "grad_norm": 1.4972704648971558, "learning_rate": 9.188347914675689e-06, "loss": 0.3205, "step": 521 }, { "batch_size": 1, "epoch": 0.2084, "step": 521, "tokens_per_device": 5418 }, { "epoch": 0.2084, "loss_ce": 0.16750644147396088, "loss_lvr": 0.6896654963493347, "loss_mode_switch": 0.0, "loss_total": 0.2364729940891266, "step": 521 }, { "batch_size": 4, "epoch": 0.2084, "step": 521, "tokens_per_device": 6268 }, { "epoch": 0.2084, "loss_ce": 0.12241773307323456, "loss_lvr": 0.983803391456604, "loss_mode_switch": 0.0, "loss_total": 0.2207980751991272, "step": 521 }, { "batch_size": 4, "epoch": 0.2084, "step": 521, "tokens_per_device": 2852 }, { "epoch": 0.2084, "loss_ce": 0.3465192914009094, "loss_lvr": 1.1571661233901978, "loss_mode_switch": 0.0, "loss_total": 0.4622358977794647, "step": 521 }, { "batch_size": 4, "epoch": 0.2084, "step": 521, "tokens_per_device": 1424 }, { "epoch": 0.2084, "loss_ce": 0.48117056488990784, "loss_lvr": 1.1516109704971313, "loss_mode_switch": 0.0, "loss_total": 0.5963316559791565, "step": 521 }, { "batch_size": 4, "epoch": 0.2084, "step": 521, "tokens_per_device": 4388 }, { "epoch": 0.2084, "loss_ce": 0.1142842173576355, "loss_lvr": 0.8956285119056702, "loss_mode_switch": 0.0, "loss_total": 0.20384708046913147, "step": 521 }, { "batch_size": 4, "epoch": 0.2084, "step": 521, "tokens_per_device": 1376 }, { "epoch": 0.2084, "loss_ce": 0.3888963758945465, "loss_lvr": 0.9973532557487488, "loss_mode_switch": 0.0, "loss_total": 0.4886316955089569, "step": 521 }, { "batch_size": 1, "epoch": 0.2084, "step": 521, "tokens_per_device": 5188 }, { "epoch": 0.2084, "loss_ce": 0.06339164823293686, "loss_lvr": 0.6420254111289978, "loss_mode_switch": 0.0, "loss_total": 0.12759418785572052, "step": 521 }, { "batch_size": 4, "epoch": 0.2084, "step": 521, "tokens_per_device": 5568 }, { "epoch": 0.2084, "loss_ce": 0.7075682282447815, "loss_lvr": 0.7498903274536133, "loss_mode_switch": 0.0, "loss_total": 0.7825572490692139, "step": 521 }, { "epoch": 0.2088, "grad_norm": 1.6502830982208252, "learning_rate": 9.184806531536438e-06, "loss": 0.3444, "step": 522 }, { "batch_size": 4, "epoch": 0.2088, "step": 522, "tokens_per_device": 8400 }, { "epoch": 0.2088, "loss_ce": 0.6951605677604675, "loss_lvr": 1.0491479635238647, "loss_mode_switch": 0.0, "loss_total": 0.800075352191925, "step": 522 }, { "batch_size": 4, "epoch": 0.2088, "step": 522, "tokens_per_device": 4244 }, { "epoch": 0.2088, "loss_ce": 0.2788906693458557, "loss_lvr": 0.9928966164588928, "loss_mode_switch": 0.0, "loss_total": 0.3781803250312805, "step": 522 }, { "batch_size": 1, "epoch": 0.2088, "step": 522, "tokens_per_device": 5022 }, { "epoch": 0.2088, "loss_ce": 0.005756664089858532, "loss_lvr": 0.503150999546051, "loss_mode_switch": 0.0, "loss_total": 0.05607176572084427, "step": 522 }, { "batch_size": 4, "epoch": 0.2088, "step": 522, "tokens_per_device": 1384 }, { "epoch": 0.2088, "loss_ce": 0.7708868384361267, "loss_lvr": 1.246193289756775, "loss_mode_switch": 0.0, "loss_total": 0.8955061435699463, "step": 522 }, { "batch_size": 4, "epoch": 0.2088, "step": 522, "tokens_per_device": 9496 }, { "epoch": 0.2088, "loss_ce": 0.23435689508914948, "loss_lvr": 0.839201807975769, "loss_mode_switch": 0.0, "loss_total": 0.3182770609855652, "step": 522 }, { "batch_size": 1, "epoch": 0.2088, "step": 522, "tokens_per_device": 5147 }, { "epoch": 0.2088, "loss_ce": 0.02144407294690609, "loss_lvr": 0.6087408661842346, "loss_mode_switch": 0.0, "loss_total": 0.08231815695762634, "step": 522 }, { "batch_size": 4, "epoch": 0.2088, "step": 522, "tokens_per_device": 4240 }, { "epoch": 0.2088, "loss_ce": 0.009856930933892727, "loss_lvr": 1.0149391889572144, "loss_mode_switch": 0.0, "loss_total": 0.11135084927082062, "step": 522 }, { "batch_size": 4, "epoch": 0.2088, "step": 522, "tokens_per_device": 5036 }, { "epoch": 0.2088, "loss_ce": 0.07485032826662064, "loss_lvr": 0.767885684967041, "loss_mode_switch": 0.0, "loss_total": 0.15163889527320862, "step": 522 }, { "epoch": 0.2092, "grad_norm": 1.3063697814941406, "learning_rate": 9.181258124929582e-06, "loss": 0.3218, "step": 523 }, { "batch_size": 4, "epoch": 0.2092, "step": 523, "tokens_per_device": 4468 }, { "epoch": 0.2092, "loss_ce": 0.047064416110515594, "loss_lvr": 0.6754578351974487, "loss_mode_switch": 0.0, "loss_total": 0.1146102026104927, "step": 523 }, { "batch_size": 4, "epoch": 0.2092, "step": 523, "tokens_per_device": 1280 }, { "epoch": 0.2092, "loss_ce": 0.15792512893676758, "loss_lvr": 1.0462934970855713, "loss_mode_switch": 0.0, "loss_total": 0.26255446672439575, "step": 523 }, { "batch_size": 4, "epoch": 0.2092, "step": 523, "tokens_per_device": 5100 }, { "epoch": 0.2092, "loss_ce": 0.1022370457649231, "loss_lvr": 0.927055299282074, "loss_mode_switch": 0.0, "loss_total": 0.19494257867336273, "step": 523 }, { "batch_size": 4, "epoch": 0.2092, "step": 523, "tokens_per_device": 4248 }, { "epoch": 0.2092, "loss_ce": 0.08509515970945358, "loss_lvr": 0.8824592232704163, "loss_mode_switch": 0.0, "loss_total": 0.1733410805463791, "step": 523 }, { "batch_size": 4, "epoch": 0.2092, "step": 523, "tokens_per_device": 1304 }, { "epoch": 0.2092, "loss_ce": 0.5955176949501038, "loss_lvr": 1.1880990266799927, "loss_mode_switch": 0.0, "loss_total": 0.7143275737762451, "step": 523 }, { "batch_size": 4, "epoch": 0.2092, "step": 523, "tokens_per_device": 5716 }, { "epoch": 0.2092, "loss_ce": 0.17493240535259247, "loss_lvr": 0.8971646428108215, "loss_mode_switch": 0.0, "loss_total": 0.2646488547325134, "step": 523 }, { "batch_size": 4, "epoch": 0.2092, "step": 523, "tokens_per_device": 5524 }, { "epoch": 0.2092, "loss_ce": 0.6146918535232544, "loss_lvr": 0.9271485805511475, "loss_mode_switch": 0.0, "loss_total": 0.7074066996574402, "step": 523 }, { "batch_size": 1, "epoch": 0.2092, "step": 523, "tokens_per_device": 5052 }, { "epoch": 0.2092, "loss_ce": 0.01672552525997162, "loss_lvr": 0.49029475450515747, "loss_mode_switch": 0.0, "loss_total": 0.06575500220060349, "step": 523 }, { "epoch": 0.2096, "grad_norm": 1.337912917137146, "learning_rate": 9.1777027008105e-06, "loss": 0.2825, "step": 524 }, { "batch_size": 4, "epoch": 0.2096, "step": 524, "tokens_per_device": 1432 }, { "epoch": 0.2096, "loss_ce": 0.3873882591724396, "loss_lvr": 1.0525702238082886, "loss_mode_switch": 0.0, "loss_total": 0.4926452934741974, "step": 524 }, { "batch_size": 4, "epoch": 0.2096, "step": 524, "tokens_per_device": 5004 }, { "epoch": 0.2096, "loss_ce": 0.06039926037192345, "loss_lvr": 0.9147704243659973, "loss_mode_switch": 0.0, "loss_total": 0.151876300573349, "step": 524 }, { "batch_size": 1, "epoch": 0.2096, "step": 524, "tokens_per_device": 4754 }, { "epoch": 0.2096, "loss_ce": 0.015614558942615986, "loss_lvr": 0.5746909976005554, "loss_mode_switch": 0.0, "loss_total": 0.07308366149663925, "step": 524 }, { "batch_size": 4, "epoch": 0.2096, "step": 524, "tokens_per_device": 1408 }, { "epoch": 0.2096, "loss_ce": 0.686558187007904, "loss_lvr": 1.1214803457260132, "loss_mode_switch": 0.0, "loss_total": 0.7987062335014343, "step": 524 }, { "batch_size": 1, "epoch": 0.2096, "step": 524, "tokens_per_device": 4851 }, { "epoch": 0.2096, "loss_ce": 0.0003653892781585455, "loss_lvr": 0.3993474841117859, "loss_mode_switch": 0.0, "loss_total": 0.04030013829469681, "step": 524 }, { "batch_size": 4, "epoch": 0.2096, "step": 524, "tokens_per_device": 1956 }, { "epoch": 0.2096, "loss_ce": 0.5655516386032104, "loss_lvr": 0.954674243927002, "loss_mode_switch": 0.0, "loss_total": 0.6610190868377686, "step": 524 }, { "batch_size": 4, "epoch": 0.2096, "step": 524, "tokens_per_device": 4340 }, { "epoch": 0.2096, "loss_ce": 0.26781165599823, "loss_lvr": 1.2786818742752075, "loss_mode_switch": 0.0, "loss_total": 0.3956798315048218, "step": 524 }, { "batch_size": 4, "epoch": 0.2096, "step": 524, "tokens_per_device": 1596 }, { "epoch": 0.2096, "loss_ce": 0.5621351599693298, "loss_lvr": 1.0232504606246948, "loss_mode_switch": 0.0, "loss_total": 0.6644601821899414, "step": 524 }, { "epoch": 0.21, "grad_norm": 1.6712180376052856, "learning_rate": 9.174140265146356e-06, "loss": 0.3318, "step": 525 }, { "batch_size": 4, "epoch": 0.21, "step": 525, "tokens_per_device": 4240 }, { "epoch": 0.21, "loss_ce": 0.44157519936561584, "loss_lvr": 1.0789859294891357, "loss_mode_switch": 0.0, "loss_total": 0.549473762512207, "step": 525 }, { "batch_size": 4, "epoch": 0.21, "step": 525, "tokens_per_device": 9836 }, { "epoch": 0.21, "loss_ce": 0.052295442670583725, "loss_lvr": 0.9551504850387573, "loss_mode_switch": 0.0, "loss_total": 0.14781048893928528, "step": 525 }, { "batch_size": 4, "epoch": 0.21, "step": 525, "tokens_per_device": 5832 }, { "epoch": 0.21, "loss_ce": 0.5670886039733887, "loss_lvr": 0.8158836960792542, "loss_mode_switch": 0.0, "loss_total": 0.6486769914627075, "step": 525 }, { "batch_size": 4, "epoch": 0.21, "step": 525, "tokens_per_device": 3780 }, { "epoch": 0.21, "loss_ce": 0.16393618285655975, "loss_lvr": 1.616544485092163, "loss_mode_switch": 0.0, "loss_total": 0.3255906403064728, "step": 525 }, { "batch_size": 4, "epoch": 0.21, "step": 525, "tokens_per_device": 4040 }, { "epoch": 0.21, "loss_ce": 0.0810014009475708, "loss_lvr": 0.8310956954956055, "loss_mode_switch": 0.0, "loss_total": 0.1641109734773636, "step": 525 }, { "batch_size": 4, "epoch": 0.21, "step": 525, "tokens_per_device": 4976 }, { "epoch": 0.21, "loss_ce": 0.4666815996170044, "loss_lvr": 0.9197801947593689, "loss_mode_switch": 0.0, "loss_total": 0.5586596131324768, "step": 525 }, { "batch_size": 4, "epoch": 0.21, "step": 525, "tokens_per_device": 2664 }, { "epoch": 0.21, "loss_ce": 0.3280107080936432, "loss_lvr": 1.1028640270233154, "loss_mode_switch": 0.0, "loss_total": 0.4382971227169037, "step": 525 }, { "batch_size": 4, "epoch": 0.21, "step": 525, "tokens_per_device": 5744 }, { "epoch": 0.21, "loss_ce": 0.06686513125896454, "loss_lvr": 0.8492441773414612, "loss_mode_switch": 0.0, "loss_total": 0.15178954601287842, "step": 525 }, { "epoch": 0.2104, "grad_norm": 1.448541283607483, "learning_rate": 9.170570823916074e-06, "loss": 0.3009, "step": 526 }, { "batch_size": 4, "epoch": 0.2104, "step": 526, "tokens_per_device": 4720 }, { "epoch": 0.2104, "loss_ce": 0.261991024017334, "loss_lvr": 0.9760095477104187, "loss_mode_switch": 0.0, "loss_total": 0.3595919907093048, "step": 526 }, { "batch_size": 1, "epoch": 0.2104, "step": 526, "tokens_per_device": 5220 }, { "epoch": 0.2104, "loss_ce": 0.08332517743110657, "loss_lvr": 0.39215999841690063, "loss_mode_switch": 0.0, "loss_total": 0.12254117429256439, "step": 526 }, { "batch_size": 4, "epoch": 0.2104, "step": 526, "tokens_per_device": 3772 }, { "epoch": 0.2104, "loss_ce": 0.2638859152793884, "loss_lvr": 1.0238022804260254, "loss_mode_switch": 0.0, "loss_total": 0.366266131401062, "step": 526 }, { "batch_size": 4, "epoch": 0.2104, "step": 526, "tokens_per_device": 3832 }, { "epoch": 0.2104, "loss_ce": 0.26839679479599, "loss_lvr": 1.0753756761550903, "loss_mode_switch": 0.0, "loss_total": 0.375934362411499, "step": 526 }, { "batch_size": 1, "epoch": 0.2104, "step": 526, "tokens_per_device": 5006 }, { "epoch": 0.2104, "loss_ce": 0.005672765430063009, "loss_lvr": 0.47570905089378357, "loss_mode_switch": 0.0, "loss_total": 0.053243670612573624, "step": 526 }, { "batch_size": 4, "epoch": 0.2104, "step": 526, "tokens_per_device": 5360 }, { "epoch": 0.2104, "loss_ce": 0.34669384360313416, "loss_lvr": 1.1431541442871094, "loss_mode_switch": 0.0, "loss_total": 0.46100926399230957, "step": 526 }, { "batch_size": 4, "epoch": 0.2104, "step": 526, "tokens_per_device": 4756 }, { "epoch": 0.2104, "loss_ce": 0.39220768213272095, "loss_lvr": 1.0053147077560425, "loss_mode_switch": 0.0, "loss_total": 0.49273914098739624, "step": 526 }, { "batch_size": 1, "epoch": 0.2104, "step": 526, "tokens_per_device": 8125 }, { "epoch": 0.2104, "loss_ce": 0.0017892931355163455, "loss_lvr": 0.42562997341156006, "loss_mode_switch": 0.0, "loss_total": 0.04435229301452637, "step": 526 }, { "epoch": 0.2108, "grad_norm": 1.5058262348175049, "learning_rate": 9.166994383110338e-06, "loss": 0.3002, "step": 527 }, { "batch_size": 4, "epoch": 0.2108, "step": 527, "tokens_per_device": 4192 }, { "epoch": 0.2108, "loss_ce": 0.07980985939502716, "loss_lvr": 1.4579862356185913, "loss_mode_switch": 0.0, "loss_total": 0.2256084829568863, "step": 527 }, { "batch_size": 1, "epoch": 0.2108, "step": 527, "tokens_per_device": 4880 }, { "epoch": 0.2108, "loss_ce": 0.0007622059783898294, "loss_lvr": 0.2767105996608734, "loss_mode_switch": 0.0, "loss_total": 0.028433265164494514, "step": 527 }, { "batch_size": 1, "epoch": 0.2108, "step": 527, "tokens_per_device": 4884 }, { "epoch": 0.2108, "loss_ce": 0.2077505737543106, "loss_lvr": 0.30150094628334045, "loss_mode_switch": 0.0, "loss_total": 0.23790067434310913, "step": 527 }, { "batch_size": 4, "epoch": 0.2108, "step": 527, "tokens_per_device": 10056 }, { "epoch": 0.2108, "loss_ce": 0.1774516999721527, "loss_lvr": 0.7154772877693176, "loss_mode_switch": 0.0, "loss_total": 0.2489994317293167, "step": 527 }, { "batch_size": 4, "epoch": 0.2108, "step": 527, "tokens_per_device": 2652 }, { "epoch": 0.2108, "loss_ce": 0.15710708498954773, "loss_lvr": 1.0167949199676514, "loss_mode_switch": 0.0, "loss_total": 0.2587865889072418, "step": 527 }, { "batch_size": 1, "epoch": 0.2108, "step": 527, "tokens_per_device": 5135 }, { "epoch": 0.2108, "loss_ce": 0.609001874923706, "loss_lvr": 0.2233162820339203, "loss_mode_switch": 0.0, "loss_total": 0.6313335299491882, "step": 527 }, { "batch_size": 1, "epoch": 0.2108, "step": 527, "tokens_per_device": 5175 }, { "epoch": 0.2108, "loss_ce": 0.0033898665569722652, "loss_lvr": 0.2984011769294739, "loss_mode_switch": 0.0, "loss_total": 0.03322998434305191, "step": 527 }, { "batch_size": 4, "epoch": 0.2108, "step": 527, "tokens_per_device": 2884 }, { "epoch": 0.2108, "loss_ce": 0.0886535793542862, "loss_lvr": 0.702028214931488, "loss_mode_switch": 0.0, "loss_total": 0.15885639190673828, "step": 527 }, { "epoch": 0.2112, "grad_norm": 1.6216130256652832, "learning_rate": 9.16341094873158e-06, "loss": 0.3298, "step": 528 }, { "batch_size": 4, "epoch": 0.2112, "step": 528, "tokens_per_device": 4728 }, { "epoch": 0.2112, "loss_ce": 0.2800561487674713, "loss_lvr": 1.0425388813018799, "loss_mode_switch": 0.0, "loss_total": 0.3843100368976593, "step": 528 }, { "batch_size": 4, "epoch": 0.2112, "step": 528, "tokens_per_device": 9292 }, { "epoch": 0.2112, "loss_ce": 0.0582316517829895, "loss_lvr": 0.7722368240356445, "loss_mode_switch": 0.0, "loss_total": 0.13545534014701843, "step": 528 }, { "batch_size": 4, "epoch": 0.2112, "step": 528, "tokens_per_device": 4420 }, { "epoch": 0.2112, "loss_ce": 0.32565003633499146, "loss_lvr": 1.0378788709640503, "loss_mode_switch": 0.0, "loss_total": 0.42943793535232544, "step": 528 }, { "batch_size": 4, "epoch": 0.2112, "step": 528, "tokens_per_device": 4228 }, { "epoch": 0.2112, "loss_ce": 0.16840660572052002, "loss_lvr": 1.0194065570831299, "loss_mode_switch": 0.0, "loss_total": 0.2703472673892975, "step": 528 }, { "batch_size": 4, "epoch": 0.2112, "step": 528, "tokens_per_device": 3872 }, { "epoch": 0.2112, "loss_ce": 0.18427759408950806, "loss_lvr": 1.074588418006897, "loss_mode_switch": 0.0, "loss_total": 0.2917364239692688, "step": 528 }, { "batch_size": 4, "epoch": 0.2112, "step": 528, "tokens_per_device": 5968 }, { "epoch": 0.2112, "loss_ce": 0.16806235909461975, "loss_lvr": 0.9113882780075073, "loss_mode_switch": 0.0, "loss_total": 0.25920119881629944, "step": 528 }, { "batch_size": 1, "epoch": 0.2112, "step": 528, "tokens_per_device": 5163 }, { "epoch": 0.2112, "loss_ce": 0.01256517507135868, "loss_lvr": 0.6247388124465942, "loss_mode_switch": 0.0, "loss_total": 0.07503905892372131, "step": 528 }, { "batch_size": 1, "epoch": 0.2112, "step": 528, "tokens_per_device": 5647 }, { "epoch": 0.2112, "loss_ce": 0.006017948966473341, "loss_lvr": 0.6902487874031067, "loss_mode_switch": 0.0, "loss_total": 0.07504282891750336, "step": 528 }, { "epoch": 0.2116, "grad_norm": 1.2262769937515259, "learning_rate": 9.15982052679397e-06, "loss": 0.2892, "step": 529 }, { "batch_size": 4, "epoch": 0.2116, "step": 529, "tokens_per_device": 4660 }, { "epoch": 0.2116, "loss_ce": 0.3164481520652771, "loss_lvr": 0.9908116459846497, "loss_mode_switch": 0.0, "loss_total": 0.4155293107032776, "step": 529 }, { "batch_size": 4, "epoch": 0.2116, "step": 529, "tokens_per_device": 3784 }, { "epoch": 0.2116, "loss_ce": 0.09286010265350342, "loss_lvr": 3.0330586433410645, "loss_mode_switch": 0.0, "loss_total": 0.39616596698760986, "step": 529 }, { "batch_size": 1, "epoch": 0.2116, "step": 529, "tokens_per_device": 5185 }, { "epoch": 0.2116, "loss_ce": 0.005916418042033911, "loss_lvr": 0.4500901699066162, "loss_mode_switch": 0.0, "loss_total": 0.05092543363571167, "step": 529 }, { "batch_size": 1, "epoch": 0.2116, "step": 529, "tokens_per_device": 5155 }, { "epoch": 0.2116, "loss_ce": 0.004105785395950079, "loss_lvr": 0.6056641340255737, "loss_mode_switch": 0.0, "loss_total": 0.06467220187187195, "step": 529 }, { "batch_size": 4, "epoch": 0.2116, "step": 529, "tokens_per_device": 10768 }, { "epoch": 0.2116, "loss_ce": 0.3196311891078949, "loss_lvr": 1.02586030960083, "loss_mode_switch": 0.0, "loss_total": 0.4222172200679779, "step": 529 }, { "batch_size": 4, "epoch": 0.2116, "step": 529, "tokens_per_device": 6308 }, { "epoch": 0.2116, "loss_ce": 0.6758937239646912, "loss_lvr": 0.8970709443092346, "loss_mode_switch": 0.0, "loss_total": 0.7656008005142212, "step": 529 }, { "batch_size": 4, "epoch": 0.2116, "step": 529, "tokens_per_device": 3804 }, { "epoch": 0.2116, "loss_ce": 0.5351808071136475, "loss_lvr": 1.1649459600448608, "loss_mode_switch": 0.0, "loss_total": 0.6516754031181335, "step": 529 }, { "batch_size": 4, "epoch": 0.2116, "step": 529, "tokens_per_device": 3784 }, { "epoch": 0.2116, "loss_ce": 0.7010184526443481, "loss_lvr": 1.0386232137680054, "loss_mode_switch": 0.0, "loss_total": 0.8048807978630066, "step": 529 }, { "epoch": 0.212, "grad_norm": 1.6544302701950073, "learning_rate": 9.156223123323405e-06, "loss": 0.3263, "step": 530 }, { "batch_size": 1, "epoch": 0.212, "step": 530, "tokens_per_device": 4819 }, { "epoch": 0.212, "loss_ce": 0.020243383944034576, "loss_lvr": 0.4531395435333252, "loss_mode_switch": 0.0, "loss_total": 0.0655573382973671, "step": 530 }, { "batch_size": 4, "epoch": 0.212, "step": 530, "tokens_per_device": 2644 }, { "epoch": 0.212, "loss_ce": 0.07848381996154785, "loss_lvr": 0.9529128074645996, "loss_mode_switch": 0.0, "loss_total": 0.1737751066684723, "step": 530 }, { "batch_size": 4, "epoch": 0.212, "step": 530, "tokens_per_device": 5704 }, { "epoch": 0.212, "loss_ce": 0.004396185744553804, "loss_lvr": 0.6548223495483398, "loss_mode_switch": 0.0, "loss_total": 0.06987842172384262, "step": 530 }, { "batch_size": 1, "epoch": 0.212, "step": 530, "tokens_per_device": 5676 }, { "epoch": 0.212, "loss_ce": 0.03185155615210533, "loss_lvr": 1.1420637369155884, "loss_mode_switch": 0.0, "loss_total": 0.14605793356895447, "step": 530 }, { "batch_size": 4, "epoch": 0.212, "step": 530, "tokens_per_device": 2920 }, { "epoch": 0.212, "loss_ce": 0.1873042732477188, "loss_lvr": 0.8084220290184021, "loss_mode_switch": 0.0, "loss_total": 0.26814648509025574, "step": 530 }, { "batch_size": 4, "epoch": 0.212, "step": 530, "tokens_per_device": 8884 }, { "epoch": 0.212, "loss_ce": 0.2537461817264557, "loss_lvr": 1.1044058799743652, "loss_mode_switch": 0.0, "loss_total": 0.36418676376342773, "step": 530 }, { "batch_size": 4, "epoch": 0.212, "step": 530, "tokens_per_device": 3920 }, { "epoch": 0.212, "loss_ce": 0.5129700303077698, "loss_lvr": 1.1990753412246704, "loss_mode_switch": 0.0, "loss_total": 0.6328775882720947, "step": 530 }, { "batch_size": 4, "epoch": 0.212, "step": 530, "tokens_per_device": 1828 }, { "epoch": 0.212, "loss_ce": 0.5702210068702698, "loss_lvr": 1.0191919803619385, "loss_mode_switch": 0.0, "loss_total": 0.6721401810646057, "step": 530 }, { "epoch": 0.2124, "grad_norm": 1.382888913154602, "learning_rate": 9.152618744357498e-06, "loss": 0.3363, "step": 531 }, { "batch_size": 4, "epoch": 0.2124, "step": 531, "tokens_per_device": 5180 }, { "epoch": 0.2124, "loss_ce": 0.09241335093975067, "loss_lvr": 0.8754076957702637, "loss_mode_switch": 0.0, "loss_total": 0.17995411157608032, "step": 531 }, { "batch_size": 1, "epoch": 0.2124, "step": 531, "tokens_per_device": 4927 }, { "epoch": 0.2124, "loss_ce": 0.011371833272278309, "loss_lvr": 0.34728768467903137, "loss_mode_switch": 0.0, "loss_total": 0.04610060155391693, "step": 531 }, { "batch_size": 4, "epoch": 0.2124, "step": 531, "tokens_per_device": 4300 }, { "epoch": 0.2124, "loss_ce": 0.2561890780925751, "loss_lvr": 1.0960792303085327, "loss_mode_switch": 0.0, "loss_total": 0.3657970130443573, "step": 531 }, { "batch_size": 4, "epoch": 0.2124, "step": 531, "tokens_per_device": 4616 }, { "epoch": 0.2124, "loss_ce": 0.23954305052757263, "loss_lvr": 1.2071996927261353, "loss_mode_switch": 0.0, "loss_total": 0.36026301980018616, "step": 531 }, { "batch_size": 4, "epoch": 0.2124, "step": 531, "tokens_per_device": 7408 }, { "epoch": 0.2124, "loss_ce": 0.4400314688682556, "loss_lvr": 0.9126396775245667, "loss_mode_switch": 0.0, "loss_total": 0.5312954187393188, "step": 531 }, { "batch_size": 1, "epoch": 0.2124, "step": 531, "tokens_per_device": 4884 }, { "epoch": 0.2124, "loss_ce": 0.02359744906425476, "loss_lvr": 0.35732561349868774, "loss_mode_switch": 0.0, "loss_total": 0.059330012649297714, "step": 531 }, { "batch_size": 4, "epoch": 0.2124, "step": 531, "tokens_per_device": 3868 }, { "epoch": 0.2124, "loss_ce": 0.01299481000751257, "loss_lvr": 0.9669399857521057, "loss_mode_switch": 0.0, "loss_total": 0.10968881100416183, "step": 531 }, { "batch_size": 4, "epoch": 0.2124, "step": 531, "tokens_per_device": 9672 }, { "epoch": 0.2124, "loss_ce": 0.4150632321834564, "loss_lvr": 0.8048545122146606, "loss_mode_switch": 0.0, "loss_total": 0.49554869532585144, "step": 531 }, { "epoch": 0.2128, "grad_norm": 1.2667938470840454, "learning_rate": 9.149007395945569e-06, "loss": 0.2834, "step": 532 }, { "batch_size": 4, "epoch": 0.2128, "step": 532, "tokens_per_device": 1204 }, { "epoch": 0.2128, "loss_ce": 0.3220442235469818, "loss_lvr": 1.0162959098815918, "loss_mode_switch": 0.0, "loss_total": 0.4236738085746765, "step": 532 }, { "batch_size": 4, "epoch": 0.2128, "step": 532, "tokens_per_device": 5528 }, { "epoch": 0.2128, "loss_ce": 0.10471096634864807, "loss_lvr": 1.06007981300354, "loss_mode_switch": 0.0, "loss_total": 0.21071895956993103, "step": 532 }, { "batch_size": 4, "epoch": 0.2128, "step": 532, "tokens_per_device": 7652 }, { "epoch": 0.2128, "loss_ce": 0.1671120673418045, "loss_lvr": 0.7123638391494751, "loss_mode_switch": 0.0, "loss_total": 0.23834845423698425, "step": 532 }, { "batch_size": 4, "epoch": 0.2128, "step": 532, "tokens_per_device": 1412 }, { "epoch": 0.2128, "loss_ce": 0.5413988828659058, "loss_lvr": 1.1642545461654663, "loss_mode_switch": 0.0, "loss_total": 0.6578243374824524, "step": 532 }, { "batch_size": 1, "epoch": 0.2128, "step": 532, "tokens_per_device": 4912 }, { "epoch": 0.2128, "loss_ce": 0.4799268841743469, "loss_lvr": 0.7022611498832703, "loss_mode_switch": 0.0, "loss_total": 0.5501530170440674, "step": 532 }, { "batch_size": 4, "epoch": 0.2128, "step": 532, "tokens_per_device": 3380 }, { "epoch": 0.2128, "loss_ce": 0.2459668666124344, "loss_lvr": 0.9315454959869385, "loss_mode_switch": 0.0, "loss_total": 0.33912140130996704, "step": 532 }, { "batch_size": 1, "epoch": 0.2128, "step": 532, "tokens_per_device": 5107 }, { "epoch": 0.2128, "loss_ce": 0.008384938351809978, "loss_lvr": 0.553602397441864, "loss_mode_switch": 0.0, "loss_total": 0.0637451782822609, "step": 532 }, { "batch_size": 1, "epoch": 0.2128, "step": 532, "tokens_per_device": 5047 }, { "epoch": 0.2128, "loss_ce": 0.005417520180344582, "loss_lvr": 0.8279329538345337, "loss_mode_switch": 0.0, "loss_total": 0.0882108137011528, "step": 532 }, { "epoch": 0.2132, "grad_norm": 1.4737859964370728, "learning_rate": 9.14538908414864e-06, "loss": 0.2995, "step": 533 }, { "batch_size": 1, "epoch": 0.2132, "step": 533, "tokens_per_device": 4353 }, { "epoch": 0.2132, "loss_ce": 0.1649651974439621, "loss_lvr": 0.4775809943675995, "loss_mode_switch": 0.0, "loss_total": 0.21272329986095428, "step": 533 }, { "batch_size": 4, "epoch": 0.2132, "step": 533, "tokens_per_device": 4836 }, { "epoch": 0.2132, "loss_ce": 0.24154914915561676, "loss_lvr": 1.0828301906585693, "loss_mode_switch": 0.0, "loss_total": 0.3498321771621704, "step": 533 }, { "batch_size": 4, "epoch": 0.2132, "step": 533, "tokens_per_device": 3904 }, { "epoch": 0.2132, "loss_ce": 0.11604270339012146, "loss_lvr": 1.7838579416275024, "loss_mode_switch": 0.0, "loss_total": 0.2944284975528717, "step": 533 }, { "batch_size": 4, "epoch": 0.2132, "step": 533, "tokens_per_device": 4588 }, { "epoch": 0.2132, "loss_ce": 0.06130344048142433, "loss_lvr": 0.9906789660453796, "loss_mode_switch": 0.0, "loss_total": 0.160371333360672, "step": 533 }, { "batch_size": 1, "epoch": 0.2132, "step": 533, "tokens_per_device": 5114 }, { "epoch": 0.2132, "loss_ce": 0.07561792433261871, "loss_lvr": 0.4796014428138733, "loss_mode_switch": 0.0, "loss_total": 0.12357807159423828, "step": 533 }, { "batch_size": 1, "epoch": 0.2132, "step": 533, "tokens_per_device": 5098 }, { "epoch": 0.2132, "loss_ce": 0.24855327606201172, "loss_lvr": 0.6374943256378174, "loss_mode_switch": 0.0, "loss_total": 0.31230270862579346, "step": 533 }, { "batch_size": 4, "epoch": 0.2132, "step": 533, "tokens_per_device": 1556 }, { "epoch": 0.2132, "loss_ce": 0.848705530166626, "loss_lvr": 0.9952012300491333, "loss_mode_switch": 0.0, "loss_total": 0.9482256770133972, "step": 533 }, { "batch_size": 1, "epoch": 0.2132, "step": 533, "tokens_per_device": 4734 }, { "epoch": 0.2132, "loss_ce": 0.01044007670134306, "loss_lvr": 0.5249648094177246, "loss_mode_switch": 0.0, "loss_total": 0.06293655931949615, "step": 533 }, { "epoch": 0.2136, "grad_norm": 1.4375680685043335, "learning_rate": 9.141763815039413e-06, "loss": 0.276, "step": 534 }, { "batch_size": 4, "epoch": 0.2136, "step": 534, "tokens_per_device": 10392 }, { "epoch": 0.2136, "loss_ce": 0.14987702667713165, "loss_lvr": 0.6955607533454895, "loss_mode_switch": 0.0, "loss_total": 0.21943309903144836, "step": 534 }, { "batch_size": 4, "epoch": 0.2136, "step": 534, "tokens_per_device": 1816 }, { "epoch": 0.2136, "loss_ce": 0.5204967260360718, "loss_lvr": 1.031288504600525, "loss_mode_switch": 0.0, "loss_total": 0.6236255764961243, "step": 534 }, { "batch_size": 4, "epoch": 0.2136, "step": 534, "tokens_per_device": 4460 }, { "epoch": 0.2136, "loss_ce": 0.05213085189461708, "loss_lvr": 0.9731796979904175, "loss_mode_switch": 0.0, "loss_total": 0.14944882690906525, "step": 534 }, { "batch_size": 4, "epoch": 0.2136, "step": 534, "tokens_per_device": 4508 }, { "epoch": 0.2136, "loss_ce": 0.6924089193344116, "loss_lvr": 0.9764528274536133, "loss_mode_switch": 0.0, "loss_total": 0.790054202079773, "step": 534 }, { "batch_size": 4, "epoch": 0.2136, "step": 534, "tokens_per_device": 3488 }, { "epoch": 0.2136, "loss_ce": 0.5375983715057373, "loss_lvr": 0.976243793964386, "loss_mode_switch": 0.0, "loss_total": 0.6352227330207825, "step": 534 }, { "batch_size": 4, "epoch": 0.2136, "step": 534, "tokens_per_device": 7056 }, { "epoch": 0.2136, "loss_ce": 0.20676442980766296, "loss_lvr": 0.7084634304046631, "loss_mode_switch": 0.0, "loss_total": 0.27761077880859375, "step": 534 }, { "batch_size": 4, "epoch": 0.2136, "step": 534, "tokens_per_device": 6524 }, { "epoch": 0.2136, "loss_ce": 0.027828514575958252, "loss_lvr": 0.845072865486145, "loss_mode_switch": 0.0, "loss_total": 0.11233580112457275, "step": 534 }, { "batch_size": 4, "epoch": 0.2136, "step": 534, "tokens_per_device": 9964 }, { "epoch": 0.2136, "loss_ce": 0.28205516934394836, "loss_lvr": 0.5931901931762695, "loss_mode_switch": 0.0, "loss_total": 0.3413741886615753, "step": 534 }, { "epoch": 0.214, "grad_norm": 2.2195050716400146, "learning_rate": 9.13813159470227e-06, "loss": 0.312, "step": 535 }, { "batch_size": 1, "epoch": 0.214, "step": 535, "tokens_per_device": 5106 }, { "epoch": 0.214, "loss_ce": 0.0076621416956186295, "loss_lvr": 0.6054593324661255, "loss_mode_switch": 0.0, "loss_total": 0.06820807605981827, "step": 535 }, { "batch_size": 4, "epoch": 0.214, "step": 535, "tokens_per_device": 1376 }, { "epoch": 0.214, "loss_ce": 0.2116526961326599, "loss_lvr": 1.2473151683807373, "loss_mode_switch": 0.0, "loss_total": 0.33638420701026917, "step": 535 }, { "batch_size": 4, "epoch": 0.214, "step": 535, "tokens_per_device": 4288 }, { "epoch": 0.214, "loss_ce": 0.3149406313896179, "loss_lvr": 0.9287247657775879, "loss_mode_switch": 0.0, "loss_total": 0.40781310200691223, "step": 535 }, { "batch_size": 1, "epoch": 0.214, "step": 535, "tokens_per_device": 5101 }, { "epoch": 0.214, "loss_ce": 0.0005283429054543376, "loss_lvr": 0.4348282814025879, "loss_mode_switch": 0.0, "loss_total": 0.04401117190718651, "step": 535 }, { "batch_size": 4, "epoch": 0.214, "step": 535, "tokens_per_device": 1440 }, { "epoch": 0.214, "loss_ce": 0.6132773160934448, "loss_lvr": 1.0396312475204468, "loss_mode_switch": 0.0, "loss_total": 0.7172404527664185, "step": 535 }, { "batch_size": 1, "epoch": 0.214, "step": 535, "tokens_per_device": 5126 }, { "epoch": 0.214, "loss_ce": 0.0033373197074979544, "loss_lvr": 0.4102177321910858, "loss_mode_switch": 0.0, "loss_total": 0.04435909539461136, "step": 535 }, { "batch_size": 1, "epoch": 0.214, "step": 535, "tokens_per_device": 4907 }, { "epoch": 0.214, "loss_ce": 0.17699377238750458, "loss_lvr": 0.8466832637786865, "loss_mode_switch": 0.0, "loss_total": 0.261662095785141, "step": 535 }, { "batch_size": 4, "epoch": 0.214, "step": 535, "tokens_per_device": 1404 }, { "epoch": 0.214, "loss_ce": 0.5494763851165771, "loss_lvr": 1.7342922687530518, "loss_mode_switch": 0.0, "loss_total": 0.7229056358337402, "step": 535 }, { "epoch": 0.2144, "grad_norm": 1.5031079053878784, "learning_rate": 9.134492429233262e-06, "loss": 0.3218, "step": 536 }, { "batch_size": 1, "epoch": 0.2144, "step": 536, "tokens_per_device": 5142 }, { "epoch": 0.2144, "loss_ce": 0.0008194954134523869, "loss_lvr": 0.3251551687717438, "loss_mode_switch": 0.0, "loss_total": 0.033335015177726746, "step": 536 }, { "batch_size": 1, "epoch": 0.2144, "step": 536, "tokens_per_device": 4913 }, { "epoch": 0.2144, "loss_ce": 0.042746152728796005, "loss_lvr": 1.0085880756378174, "loss_mode_switch": 0.0, "loss_total": 0.14360496401786804, "step": 536 }, { "batch_size": 4, "epoch": 0.2144, "step": 536, "tokens_per_device": 3788 }, { "epoch": 0.2144, "loss_ce": 0.49721235036849976, "loss_lvr": 1.8847577571868896, "loss_mode_switch": 0.0, "loss_total": 0.6856881380081177, "step": 536 }, { "batch_size": 4, "epoch": 0.2144, "step": 536, "tokens_per_device": 1704 }, { "epoch": 0.2144, "loss_ce": 0.31657010316848755, "loss_lvr": 1.1245561838150024, "loss_mode_switch": 0.0, "loss_total": 0.42902570962905884, "step": 536 }, { "batch_size": 4, "epoch": 0.2144, "step": 536, "tokens_per_device": 4368 }, { "epoch": 0.2144, "loss_ce": 0.4472123682498932, "loss_lvr": 0.9326825737953186, "loss_mode_switch": 0.0, "loss_total": 0.5404806137084961, "step": 536 }, { "batch_size": 4, "epoch": 0.2144, "step": 536, "tokens_per_device": 4268 }, { "epoch": 0.2144, "loss_ce": 0.20522888004779816, "loss_lvr": 1.1562471389770508, "loss_mode_switch": 0.0, "loss_total": 0.320853590965271, "step": 536 }, { "batch_size": 1, "epoch": 0.2144, "step": 536, "tokens_per_device": 5123 }, { "epoch": 0.2144, "loss_ce": 0.6606279611587524, "loss_lvr": 0.3155444860458374, "loss_mode_switch": 0.0, "loss_total": 0.6921824216842651, "step": 536 }, { "batch_size": 4, "epoch": 0.2144, "step": 536, "tokens_per_device": 5180 }, { "epoch": 0.2144, "loss_ce": 0.04277594015002251, "loss_lvr": 0.7530605792999268, "loss_mode_switch": 0.0, "loss_total": 0.11808200180530548, "step": 536 }, { "epoch": 0.2148, "grad_norm": 1.382746934890747, "learning_rate": 9.130846324740087e-06, "loss": 0.3359, "step": 537 }, { "batch_size": 4, "epoch": 0.2148, "step": 537, "tokens_per_device": 2744 }, { "epoch": 0.2148, "loss_ce": 0.017224781215190887, "loss_lvr": 0.9612712860107422, "loss_mode_switch": 0.0, "loss_total": 0.11335191130638123, "step": 537 }, { "batch_size": 4, "epoch": 0.2148, "step": 537, "tokens_per_device": 4720 }, { "epoch": 0.2148, "loss_ce": 0.03397507593035698, "loss_lvr": 0.7562791705131531, "loss_mode_switch": 0.0, "loss_total": 0.10960298776626587, "step": 537 }, { "batch_size": 4, "epoch": 0.2148, "step": 537, "tokens_per_device": 6700 }, { "epoch": 0.2148, "loss_ce": 0.22058013081550598, "loss_lvr": 1.0764408111572266, "loss_mode_switch": 0.0, "loss_total": 0.32822421193122864, "step": 537 }, { "batch_size": 4, "epoch": 0.2148, "step": 537, "tokens_per_device": 3848 }, { "epoch": 0.2148, "loss_ce": 0.39287424087524414, "loss_lvr": 0.8602709770202637, "loss_mode_switch": 0.0, "loss_total": 0.47890132665634155, "step": 537 }, { "batch_size": 1, "epoch": 0.2148, "step": 537, "tokens_per_device": 4906 }, { "epoch": 0.2148, "loss_ce": 0.15867221355438232, "loss_lvr": 0.4819001853466034, "loss_mode_switch": 0.0, "loss_total": 0.20686224102973938, "step": 537 }, { "batch_size": 4, "epoch": 0.2148, "step": 537, "tokens_per_device": 4368 }, { "epoch": 0.2148, "loss_ce": 0.02064153552055359, "loss_lvr": 0.9204971194267273, "loss_mode_switch": 0.0, "loss_total": 0.1126912459731102, "step": 537 }, { "batch_size": 4, "epoch": 0.2148, "step": 537, "tokens_per_device": 6816 }, { "epoch": 0.2148, "loss_ce": 0.02797960676252842, "loss_lvr": 1.1152235269546509, "loss_mode_switch": 0.0, "loss_total": 0.13950195908546448, "step": 537 }, { "batch_size": 4, "epoch": 0.2148, "step": 537, "tokens_per_device": 3308 }, { "epoch": 0.2148, "loss_ce": 0.173609659075737, "loss_lvr": 1.1069484949111938, "loss_mode_switch": 0.0, "loss_total": 0.28430449962615967, "step": 537 }, { "epoch": 0.2152, "grad_norm": 1.3062652349472046, "learning_rate": 9.127193287342103e-06, "loss": 0.2952, "step": 538 }, { "batch_size": 1, "epoch": 0.2152, "step": 538, "tokens_per_device": 5110 }, { "epoch": 0.2152, "loss_ce": 0.06506890803575516, "loss_lvr": 0.7306159734725952, "loss_mode_switch": 0.0, "loss_total": 0.13813051581382751, "step": 538 }, { "batch_size": 1, "epoch": 0.2152, "step": 538, "tokens_per_device": 5051 }, { "epoch": 0.2152, "loss_ce": 0.22583027184009552, "loss_lvr": 0.7496198415756226, "loss_mode_switch": 0.0, "loss_total": 0.30079224705696106, "step": 538 }, { "batch_size": 4, "epoch": 0.2152, "step": 538, "tokens_per_device": 3920 }, { "epoch": 0.2152, "loss_ce": 0.3274073004722595, "loss_lvr": 1.1746035814285278, "loss_mode_switch": 0.0, "loss_total": 0.44486767053604126, "step": 538 }, { "batch_size": 1, "epoch": 0.2152, "step": 538, "tokens_per_device": 4889 }, { "epoch": 0.2152, "loss_ce": 0.0015048100613057613, "loss_lvr": 0.7320350408554077, "loss_mode_switch": 0.0, "loss_total": 0.07470831274986267, "step": 538 }, { "batch_size": 4, "epoch": 0.2152, "step": 538, "tokens_per_device": 1636 }, { "epoch": 0.2152, "loss_ce": 0.08183235675096512, "loss_lvr": 1.3359007835388184, "loss_mode_switch": 0.0, "loss_total": 0.21542245149612427, "step": 538 }, { "batch_size": 4, "epoch": 0.2152, "step": 538, "tokens_per_device": 1248 }, { "epoch": 0.2152, "loss_ce": 0.12409394979476929, "loss_lvr": 1.232424259185791, "loss_mode_switch": 0.0, "loss_total": 0.24733638763427734, "step": 538 }, { "batch_size": 1, "epoch": 0.2152, "step": 538, "tokens_per_device": 4902 }, { "epoch": 0.2152, "loss_ce": 0.09871324151754379, "loss_lvr": 0.7170125246047974, "loss_mode_switch": 0.0, "loss_total": 0.1704144924879074, "step": 538 }, { "batch_size": 4, "epoch": 0.2152, "step": 538, "tokens_per_device": 1280 }, { "epoch": 0.2152, "loss_ce": 0.6370663642883301, "loss_lvr": 1.374541163444519, "loss_mode_switch": 0.0, "loss_total": 0.7745205163955688, "step": 538 }, { "epoch": 0.2156, "grad_norm": 1.5713067054748535, "learning_rate": 9.12353332317029e-06, "loss": 0.2924, "step": 539 }, { "batch_size": 1, "epoch": 0.2156, "step": 539, "tokens_per_device": 4922 }, { "epoch": 0.2156, "loss_ce": 1.0390129089355469, "loss_lvr": 0.7577549815177917, "loss_mode_switch": 0.0, "loss_total": 1.1147884130477905, "step": 539 }, { "batch_size": 4, "epoch": 0.2156, "step": 539, "tokens_per_device": 4176 }, { "epoch": 0.2156, "loss_ce": 0.48733407258987427, "loss_lvr": 1.25014328956604, "loss_mode_switch": 0.0, "loss_total": 0.6123484373092651, "step": 539 }, { "batch_size": 1, "epoch": 0.2156, "step": 539, "tokens_per_device": 5565 }, { "epoch": 0.2156, "loss_ce": 0.0358903594315052, "loss_lvr": 0.5466145873069763, "loss_mode_switch": 0.0, "loss_total": 0.09055182337760925, "step": 539 }, { "batch_size": 4, "epoch": 0.2156, "step": 539, "tokens_per_device": 4432 }, { "epoch": 0.2156, "loss_ce": 0.05221077799797058, "loss_lvr": 0.9053795337677002, "loss_mode_switch": 0.0, "loss_total": 0.14274874329566956, "step": 539 }, { "batch_size": 4, "epoch": 0.2156, "step": 539, "tokens_per_device": 7056 }, { "epoch": 0.2156, "loss_ce": 0.35120752453804016, "loss_lvr": 0.7551922798156738, "loss_mode_switch": 0.0, "loss_total": 0.426726758480072, "step": 539 }, { "batch_size": 1, "epoch": 0.2156, "step": 539, "tokens_per_device": 5182 }, { "epoch": 0.2156, "loss_ce": 0.18767012655735016, "loss_lvr": 0.6412332057952881, "loss_mode_switch": 0.0, "loss_total": 0.25179344415664673, "step": 539 }, { "batch_size": 4, "epoch": 0.2156, "step": 539, "tokens_per_device": 4368 }, { "epoch": 0.2156, "loss_ce": 0.07691755890846252, "loss_lvr": 1.463025450706482, "loss_mode_switch": 0.0, "loss_total": 0.2232201099395752, "step": 539 }, { "batch_size": 4, "epoch": 0.2156, "step": 539, "tokens_per_device": 3812 }, { "epoch": 0.2156, "loss_ce": 0.3265557587146759, "loss_lvr": 0.9584568738937378, "loss_mode_switch": 0.0, "loss_total": 0.42240145802497864, "step": 539 }, { "epoch": 0.216, "grad_norm": 1.4952484369277954, "learning_rate": 9.119866438367263e-06, "loss": 0.3433, "step": 540 }, { "batch_size": 4, "epoch": 0.216, "step": 540, "tokens_per_device": 4488 }, { "epoch": 0.216, "loss_ce": 0.2396247237920761, "loss_lvr": 0.779149055480957, "loss_mode_switch": 0.0, "loss_total": 0.31753963232040405, "step": 540 }, { "batch_size": 1, "epoch": 0.216, "step": 540, "tokens_per_device": 4868 }, { "epoch": 0.216, "loss_ce": 0.0009267631685361266, "loss_lvr": 0.6423481106758118, "loss_mode_switch": 0.0, "loss_total": 0.0651615783572197, "step": 540 }, { "batch_size": 1, "epoch": 0.216, "step": 540, "tokens_per_device": 5032 }, { "epoch": 0.216, "loss_ce": 0.19087886810302734, "loss_lvr": 0.6036747097969055, "loss_mode_switch": 0.0, "loss_total": 0.2512463331222534, "step": 540 }, { "batch_size": 4, "epoch": 0.216, "step": 540, "tokens_per_device": 3600 }, { "epoch": 0.216, "loss_ce": 0.21831990778446198, "loss_lvr": 1.0264928340911865, "loss_mode_switch": 0.0, "loss_total": 0.32096919417381287, "step": 540 }, { "batch_size": 1, "epoch": 0.216, "step": 540, "tokens_per_device": 4881 }, { "epoch": 0.216, "loss_ce": 0.00772278755903244, "loss_lvr": 0.8099384307861328, "loss_mode_switch": 0.0, "loss_total": 0.08871663361787796, "step": 540 }, { "batch_size": 4, "epoch": 0.216, "step": 540, "tokens_per_device": 2548 }, { "epoch": 0.216, "loss_ce": 0.107619509100914, "loss_lvr": 0.981316864490509, "loss_mode_switch": 0.0, "loss_total": 0.2057511955499649, "step": 540 }, { "batch_size": 1, "epoch": 0.216, "step": 540, "tokens_per_device": 5014 }, { "epoch": 0.216, "loss_ce": 0.19919635355472565, "loss_lvr": 0.9778537750244141, "loss_mode_switch": 0.0, "loss_total": 0.29698172211647034, "step": 540 }, { "batch_size": 1, "epoch": 0.216, "step": 540, "tokens_per_device": 4880 }, { "epoch": 0.216, "loss_ce": 0.1850551962852478, "loss_lvr": 0.30441588163375854, "loss_mode_switch": 0.0, "loss_total": 0.21549677848815918, "step": 540 }, { "epoch": 0.2164, "grad_norm": 1.4033972024917603, "learning_rate": 9.116192639087245e-06, "loss": 0.3079, "step": 541 }, { "batch_size": 1, "epoch": 0.2164, "step": 541, "tokens_per_device": 4906 }, { "epoch": 0.2164, "loss_ce": 0.16296695172786713, "loss_lvr": 0.3344220519065857, "loss_mode_switch": 0.0, "loss_total": 0.1964091658592224, "step": 541 }, { "batch_size": 4, "epoch": 0.2164, "step": 541, "tokens_per_device": 3868 }, { "epoch": 0.2164, "loss_ce": 0.42148929834365845, "loss_lvr": 0.7763963341712952, "loss_mode_switch": 0.0, "loss_total": 0.49912893772125244, "step": 541 }, { "batch_size": 4, "epoch": 0.2164, "step": 541, "tokens_per_device": 7660 }, { "epoch": 0.2164, "loss_ce": 0.47037792205810547, "loss_lvr": 0.9296366572380066, "loss_mode_switch": 0.0, "loss_total": 0.5633416175842285, "step": 541 }, { "batch_size": 4, "epoch": 0.2164, "step": 541, "tokens_per_device": 1688 }, { "epoch": 0.2164, "loss_ce": 0.060356661677360535, "loss_lvr": 0.9724594950675964, "loss_mode_switch": 0.0, "loss_total": 0.15760260820388794, "step": 541 }, { "batch_size": 4, "epoch": 0.2164, "step": 541, "tokens_per_device": 1728 }, { "epoch": 0.2164, "loss_ce": 0.6196584701538086, "loss_lvr": 1.0731905698776245, "loss_mode_switch": 0.0, "loss_total": 0.726977527141571, "step": 541 }, { "batch_size": 1, "epoch": 0.2164, "step": 541, "tokens_per_device": 5120 }, { "epoch": 0.2164, "loss_ce": 0.0030464131850749254, "loss_lvr": 0.5334432125091553, "loss_mode_switch": 0.0, "loss_total": 0.056390732526779175, "step": 541 }, { "batch_size": 4, "epoch": 0.2164, "step": 541, "tokens_per_device": 5444 }, { "epoch": 0.2164, "loss_ce": 0.26708972454071045, "loss_lvr": 0.8443354964256287, "loss_mode_switch": 0.0, "loss_total": 0.3515232801437378, "step": 541 }, { "batch_size": 1, "epoch": 0.2164, "step": 541, "tokens_per_device": 5101 }, { "epoch": 0.2164, "loss_ce": 0.012260446324944496, "loss_lvr": 0.33740943670272827, "loss_mode_switch": 0.0, "loss_total": 0.04600138962268829, "step": 541 }, { "epoch": 0.2168, "grad_norm": 1.3992656469345093, "learning_rate": 9.11251193149607e-06, "loss": 0.295, "step": 542 }, { "batch_size": 4, "epoch": 0.2168, "step": 542, "tokens_per_device": 2032 }, { "epoch": 0.2168, "loss_ce": 0.2934471070766449, "loss_lvr": 1.0019760131835938, "loss_mode_switch": 0.0, "loss_total": 0.3936447203159332, "step": 542 }, { "batch_size": 4, "epoch": 0.2168, "step": 542, "tokens_per_device": 4872 }, { "epoch": 0.2168, "loss_ce": 0.45727819204330444, "loss_lvr": 0.9993897676467896, "loss_mode_switch": 0.0, "loss_total": 0.5572171807289124, "step": 542 }, { "batch_size": 4, "epoch": 0.2168, "step": 542, "tokens_per_device": 6908 }, { "epoch": 0.2168, "loss_ce": 0.6004970073699951, "loss_lvr": 0.8082262873649597, "loss_mode_switch": 0.0, "loss_total": 0.6813196539878845, "step": 542 }, { "batch_size": 1, "epoch": 0.2168, "step": 542, "tokens_per_device": 8291 }, { "epoch": 0.2168, "loss_ce": 0.0020523262210190296, "loss_lvr": 0.3716568052768707, "loss_mode_switch": 0.0, "loss_total": 0.039218008518218994, "step": 542 }, { "batch_size": 4, "epoch": 0.2168, "step": 542, "tokens_per_device": 6420 }, { "epoch": 0.2168, "loss_ce": 0.03354867175221443, "loss_lvr": 0.8315992951393127, "loss_mode_switch": 0.0, "loss_total": 0.11670860648155212, "step": 542 }, { "batch_size": 4, "epoch": 0.2168, "step": 542, "tokens_per_device": 1900 }, { "epoch": 0.2168, "loss_ce": 0.03652522712945938, "loss_lvr": 1.060074806213379, "loss_mode_switch": 0.0, "loss_total": 0.14253270626068115, "step": 542 }, { "batch_size": 4, "epoch": 0.2168, "step": 542, "tokens_per_device": 4288 }, { "epoch": 0.2168, "loss_ce": 0.026242926716804504, "loss_lvr": 0.7408556342124939, "loss_mode_switch": 0.0, "loss_total": 0.1003284901380539, "step": 542 }, { "batch_size": 4, "epoch": 0.2168, "step": 542, "tokens_per_device": 11636 }, { "epoch": 0.2168, "loss_ce": 0.7440751791000366, "loss_lvr": 0.6141095161437988, "loss_mode_switch": 0.0, "loss_total": 0.8054861426353455, "step": 542 }, { "epoch": 0.2172, "grad_norm": 1.3111774921417236, "learning_rate": 9.108824321771163e-06, "loss": 0.2962, "step": 543 }, { "batch_size": 1, "epoch": 0.2172, "step": 543, "tokens_per_device": 4866 }, { "epoch": 0.2172, "loss_ce": 0.0008738564793020487, "loss_lvr": 0.48223528265953064, "loss_mode_switch": 0.0, "loss_total": 0.04909738525748253, "step": 543 }, { "batch_size": 4, "epoch": 0.2172, "step": 543, "tokens_per_device": 4200 }, { "epoch": 0.2172, "loss_ce": 0.055656079202890396, "loss_lvr": 1.0945132970809937, "loss_mode_switch": 0.0, "loss_total": 0.16510741412639618, "step": 543 }, { "batch_size": 4, "epoch": 0.2172, "step": 543, "tokens_per_device": 6276 }, { "epoch": 0.2172, "loss_ce": 0.4550950229167938, "loss_lvr": 0.5495843887329102, "loss_mode_switch": 0.0, "loss_total": 0.5100534558296204, "step": 543 }, { "batch_size": 4, "epoch": 0.2172, "step": 543, "tokens_per_device": 2800 }, { "epoch": 0.2172, "loss_ce": 0.2865166962146759, "loss_lvr": 1.3739477396011353, "loss_mode_switch": 0.0, "loss_total": 0.423911452293396, "step": 543 }, { "batch_size": 1, "epoch": 0.2172, "step": 543, "tokens_per_device": 6089 }, { "epoch": 0.2172, "loss_ce": 0.021886834874749184, "loss_lvr": 0.2761375904083252, "loss_mode_switch": 0.0, "loss_total": 0.049500592052936554, "step": 543 }, { "batch_size": 1, "epoch": 0.2172, "step": 543, "tokens_per_device": 4953 }, { "epoch": 0.2172, "loss_ce": 0.32723522186279297, "loss_lvr": 0.3233242332935333, "loss_mode_switch": 0.0, "loss_total": 0.35956764221191406, "step": 543 }, { "batch_size": 4, "epoch": 0.2172, "step": 543, "tokens_per_device": 1648 }, { "epoch": 0.2172, "loss_ce": 0.2798464298248291, "loss_lvr": 0.9457574486732483, "loss_mode_switch": 0.0, "loss_total": 0.37442219257354736, "step": 543 }, { "batch_size": 1, "epoch": 0.2172, "step": 543, "tokens_per_device": 4956 }, { "epoch": 0.2172, "loss_ce": 0.33695605397224426, "loss_lvr": 0.5198503732681274, "loss_mode_switch": 0.0, "loss_total": 0.38894107937812805, "step": 543 }, { "epoch": 0.2176, "grad_norm": 1.4199334383010864, "learning_rate": 9.105129816101531e-06, "loss": 0.3181, "step": 544 }, { "batch_size": 4, "epoch": 0.2176, "step": 544, "tokens_per_device": 4920 }, { "epoch": 0.2176, "loss_ce": 0.43845289945602417, "loss_lvr": 0.7072041630744934, "loss_mode_switch": 0.0, "loss_total": 0.5091733336448669, "step": 544 }, { "batch_size": 4, "epoch": 0.2176, "step": 544, "tokens_per_device": 2716 }, { "epoch": 0.2176, "loss_ce": 0.12908980250358582, "loss_lvr": 0.8363677859306335, "loss_mode_switch": 0.0, "loss_total": 0.21272659301757812, "step": 544 }, { "batch_size": 1, "epoch": 0.2176, "step": 544, "tokens_per_device": 4130 }, { "epoch": 0.2176, "loss_ce": 0.16083483397960663, "loss_lvr": 0.593319296836853, "loss_mode_switch": 0.0, "loss_total": 0.22016677260398865, "step": 544 }, { "batch_size": 1, "epoch": 0.2176, "step": 544, "tokens_per_device": 5167 }, { "epoch": 0.2176, "loss_ce": 0.0009723232360556722, "loss_lvr": 0.5203130841255188, "loss_mode_switch": 0.0, "loss_total": 0.05300363153219223, "step": 544 }, { "batch_size": 1, "epoch": 0.2176, "step": 544, "tokens_per_device": 4318 }, { "epoch": 0.2176, "loss_ce": 0.012244259007275105, "loss_lvr": 0.42026403546333313, "loss_mode_switch": 0.0, "loss_total": 0.0542706623673439, "step": 544 }, { "batch_size": 4, "epoch": 0.2176, "step": 544, "tokens_per_device": 3768 }, { "epoch": 0.2176, "loss_ce": 0.11342364549636841, "loss_lvr": 1.1514959335327148, "loss_mode_switch": 0.0, "loss_total": 0.22857323288917542, "step": 544 }, { "batch_size": 1, "epoch": 0.2176, "step": 544, "tokens_per_device": 4899 }, { "epoch": 0.2176, "loss_ce": 0.030322669073939323, "loss_lvr": 0.5709477663040161, "loss_mode_switch": 0.0, "loss_total": 0.08741744607686996, "step": 544 }, { "batch_size": 1, "epoch": 0.2176, "step": 544, "tokens_per_device": 5201 }, { "epoch": 0.2176, "loss_ce": 0.06665942817926407, "loss_lvr": 0.8891755938529968, "loss_mode_switch": 0.0, "loss_total": 0.15557698905467987, "step": 544 }, { "epoch": 0.218, "grad_norm": 1.605170726776123, "learning_rate": 9.101428420687759e-06, "loss": 0.3163, "step": 545 }, { "batch_size": 4, "epoch": 0.218, "step": 545, "tokens_per_device": 5304 }, { "epoch": 0.218, "loss_ce": 0.12939761579036713, "loss_lvr": 0.9814149141311646, "loss_mode_switch": 0.0, "loss_total": 0.22753910720348358, "step": 545 }, { "batch_size": 4, "epoch": 0.218, "step": 545, "tokens_per_device": 1912 }, { "epoch": 0.218, "loss_ce": 0.9238083958625793, "loss_lvr": 0.9505600333213806, "loss_mode_switch": 0.0, "loss_total": 1.018864393234253, "step": 545 }, { "batch_size": 4, "epoch": 0.218, "step": 545, "tokens_per_device": 5868 }, { "epoch": 0.218, "loss_ce": 0.5166869163513184, "loss_lvr": 0.7787386775016785, "loss_mode_switch": 0.0, "loss_total": 0.5945608019828796, "step": 545 }, { "batch_size": 1, "epoch": 0.218, "step": 545, "tokens_per_device": 4744 }, { "epoch": 0.218, "loss_ce": 0.034281451255083084, "loss_lvr": 0.8134855031967163, "loss_mode_switch": 0.0, "loss_total": 0.11563000082969666, "step": 545 }, { "batch_size": 4, "epoch": 0.218, "step": 545, "tokens_per_device": 4956 }, { "epoch": 0.218, "loss_ce": 0.29471728205680847, "loss_lvr": 0.8745697140693665, "loss_mode_switch": 0.0, "loss_total": 0.3821742534637451, "step": 545 }, { "batch_size": 4, "epoch": 0.218, "step": 545, "tokens_per_device": 1456 }, { "epoch": 0.218, "loss_ce": 0.8212464451789856, "loss_lvr": 0.9774969816207886, "loss_mode_switch": 0.0, "loss_total": 0.9189961552619934, "step": 545 }, { "batch_size": 1, "epoch": 0.218, "step": 545, "tokens_per_device": 4922 }, { "epoch": 0.218, "loss_ce": 0.031527530401945114, "loss_lvr": 0.2503044605255127, "loss_mode_switch": 0.0, "loss_total": 0.056557975709438324, "step": 545 }, { "batch_size": 4, "epoch": 0.218, "step": 545, "tokens_per_device": 4232 }, { "epoch": 0.218, "loss_ce": 0.03256189450621605, "loss_lvr": 0.8895703554153442, "loss_mode_switch": 0.0, "loss_total": 0.12151892483234406, "step": 545 }, { "epoch": 0.2184, "grad_norm": 1.458807349205017, "learning_rate": 9.097720141741994e-06, "loss": 0.3055, "step": 546 }, { "batch_size": 4, "epoch": 0.2184, "step": 546, "tokens_per_device": 3816 }, { "epoch": 0.2184, "loss_ce": 0.03336025029420853, "loss_lvr": 1.2044594287872314, "loss_mode_switch": 0.0, "loss_total": 0.1538061946630478, "step": 546 }, { "batch_size": 4, "epoch": 0.2184, "step": 546, "tokens_per_device": 5732 }, { "epoch": 0.2184, "loss_ce": 0.04040128365159035, "loss_lvr": 1.4702404737472534, "loss_mode_switch": 0.0, "loss_total": 0.18742533028125763, "step": 546 }, { "batch_size": 4, "epoch": 0.2184, "step": 546, "tokens_per_device": 4796 }, { "epoch": 0.2184, "loss_ce": 0.043280720710754395, "loss_lvr": 1.4964123964309692, "loss_mode_switch": 0.0, "loss_total": 0.1929219663143158, "step": 546 }, { "batch_size": 1, "epoch": 0.2184, "step": 546, "tokens_per_device": 8007 }, { "epoch": 0.2184, "loss_ce": 0.012295554392039776, "loss_lvr": 0.3982507884502411, "loss_mode_switch": 0.0, "loss_total": 0.0521206334233284, "step": 546 }, { "batch_size": 1, "epoch": 0.2184, "step": 546, "tokens_per_device": 5165 }, { "epoch": 0.2184, "loss_ce": 0.4487968981266022, "loss_lvr": 1.0203332901000977, "loss_mode_switch": 0.0, "loss_total": 0.5508302450180054, "step": 546 }, { "batch_size": 4, "epoch": 0.2184, "step": 546, "tokens_per_device": 5716 }, { "epoch": 0.2184, "loss_ce": 0.03430208936333656, "loss_lvr": 0.816851019859314, "loss_mode_switch": 0.0, "loss_total": 0.11598719656467438, "step": 546 }, { "batch_size": 1, "epoch": 0.2184, "step": 546, "tokens_per_device": 4771 }, { "epoch": 0.2184, "loss_ce": 0.0024628671817481518, "loss_lvr": 0.38920605182647705, "loss_mode_switch": 0.0, "loss_total": 0.04138347506523132, "step": 546 }, { "batch_size": 4, "epoch": 0.2184, "step": 546, "tokens_per_device": 5872 }, { "epoch": 0.2184, "loss_ce": 0.22887441515922546, "loss_lvr": 0.8791703581809998, "loss_mode_switch": 0.0, "loss_total": 0.31679144501686096, "step": 546 }, { "epoch": 0.2188, "grad_norm": 1.5327249765396118, "learning_rate": 9.094004985487935e-06, "loss": 0.3245, "step": 547 }, { "batch_size": 4, "epoch": 0.2188, "step": 547, "tokens_per_device": 4212 }, { "epoch": 0.2188, "loss_ce": 0.5552846789360046, "loss_lvr": 1.0246126651763916, "loss_mode_switch": 0.0, "loss_total": 0.6577459573745728, "step": 547 }, { "batch_size": 4, "epoch": 0.2188, "step": 547, "tokens_per_device": 1360 }, { "epoch": 0.2188, "loss_ce": 0.257089227437973, "loss_lvr": 1.0565987825393677, "loss_mode_switch": 0.0, "loss_total": 0.3627490997314453, "step": 547 }, { "batch_size": 1, "epoch": 0.2188, "step": 547, "tokens_per_device": 4898 }, { "epoch": 0.2188, "loss_ce": 0.20063109695911407, "loss_lvr": 0.22115902602672577, "loss_mode_switch": 0.0, "loss_total": 0.22274699807167053, "step": 547 }, { "batch_size": 4, "epoch": 0.2188, "step": 547, "tokens_per_device": 5088 }, { "epoch": 0.2188, "loss_ce": 0.23153169453144073, "loss_lvr": 1.0566731691360474, "loss_mode_switch": 0.0, "loss_total": 0.33719900250434875, "step": 547 }, { "batch_size": 4, "epoch": 0.2188, "step": 547, "tokens_per_device": 1440 }, { "epoch": 0.2188, "loss_ce": 0.4667569696903229, "loss_lvr": 0.9555017948150635, "loss_mode_switch": 0.0, "loss_total": 0.5623071193695068, "step": 547 }, { "batch_size": 1, "epoch": 0.2188, "step": 547, "tokens_per_device": 4849 }, { "epoch": 0.2188, "loss_ce": 0.009420521557331085, "loss_lvr": 0.4465116858482361, "loss_mode_switch": 0.0, "loss_total": 0.05407169088721275, "step": 547 }, { "batch_size": 4, "epoch": 0.2188, "step": 547, "tokens_per_device": 4408 }, { "epoch": 0.2188, "loss_ce": 0.002692944137379527, "loss_lvr": 0.8163585066795349, "loss_mode_switch": 0.0, "loss_total": 0.0843288004398346, "step": 547 }, { "batch_size": 4, "epoch": 0.2188, "step": 547, "tokens_per_device": 1352 }, { "epoch": 0.2188, "loss_ce": 0.2839401960372925, "loss_lvr": 1.075875997543335, "loss_mode_switch": 0.0, "loss_total": 0.39152780175209045, "step": 547 }, { "epoch": 0.2192, "grad_norm": 1.4168124198913574, "learning_rate": 9.090282958160823e-06, "loss": 0.3042, "step": 548 }, { "batch_size": 4, "epoch": 0.2192, "step": 548, "tokens_per_device": 2636 }, { "epoch": 0.2192, "loss_ce": 0.36280906200408936, "loss_lvr": 0.9122559428215027, "loss_mode_switch": 0.0, "loss_total": 0.4540346562862396, "step": 548 }, { "batch_size": 1, "epoch": 0.2192, "step": 548, "tokens_per_device": 4962 }, { "epoch": 0.2192, "loss_ce": 0.35473671555519104, "loss_lvr": 0.4872928559780121, "loss_mode_switch": 0.0, "loss_total": 0.40346598625183105, "step": 548 }, { "batch_size": 1, "epoch": 0.2192, "step": 548, "tokens_per_device": 4912 }, { "epoch": 0.2192, "loss_ce": 0.3868730962276459, "loss_lvr": 0.7708649635314941, "loss_mode_switch": 0.0, "loss_total": 0.46395960450172424, "step": 548 }, { "batch_size": 4, "epoch": 0.2192, "step": 548, "tokens_per_device": 5932 }, { "epoch": 0.2192, "loss_ce": 0.37362140417099, "loss_lvr": 0.7878767848014832, "loss_mode_switch": 0.0, "loss_total": 0.4524090886116028, "step": 548 }, { "batch_size": 4, "epoch": 0.2192, "step": 548, "tokens_per_device": 2660 }, { "epoch": 0.2192, "loss_ce": 0.5960120558738708, "loss_lvr": 0.9589303135871887, "loss_mode_switch": 0.0, "loss_total": 0.6919050812721252, "step": 548 }, { "batch_size": 4, "epoch": 0.2192, "step": 548, "tokens_per_device": 4284 }, { "epoch": 0.2192, "loss_ce": 0.06885727494955063, "loss_lvr": 1.09459388256073, "loss_mode_switch": 0.0, "loss_total": 0.17831666767597198, "step": 548 }, { "batch_size": 1, "epoch": 0.2192, "step": 548, "tokens_per_device": 5156 }, { "epoch": 0.2192, "loss_ce": 0.02459069900214672, "loss_lvr": 0.32477208971977234, "loss_mode_switch": 0.0, "loss_total": 0.057067908346652985, "step": 548 }, { "batch_size": 1, "epoch": 0.2192, "step": 548, "tokens_per_device": 6481 }, { "epoch": 0.2192, "loss_ce": 0.01451791264116764, "loss_lvr": 0.3935326337814331, "loss_mode_switch": 0.0, "loss_total": 0.05387117713689804, "step": 548 }, { "epoch": 0.2196, "grad_norm": 1.7287148237228394, "learning_rate": 9.08655406600743e-06, "loss": 0.3652, "step": 549 }, { "batch_size": 4, "epoch": 0.2196, "step": 549, "tokens_per_device": 3748 }, { "epoch": 0.2196, "loss_ce": 0.22487576305866241, "loss_lvr": 1.3276588916778564, "loss_mode_switch": 0.0, "loss_total": 0.35764163732528687, "step": 549 }, { "batch_size": 1, "epoch": 0.2196, "step": 549, "tokens_per_device": 4875 }, { "epoch": 0.2196, "loss_ce": 0.000914429547265172, "loss_lvr": 0.5179421901702881, "loss_mode_switch": 0.0, "loss_total": 0.05270864814519882, "step": 549 }, { "batch_size": 4, "epoch": 0.2196, "step": 549, "tokens_per_device": 2548 }, { "epoch": 0.2196, "loss_ce": 0.2386963665485382, "loss_lvr": 1.0010418891906738, "loss_mode_switch": 0.0, "loss_total": 0.3388005495071411, "step": 549 }, { "batch_size": 1, "epoch": 0.2196, "step": 549, "tokens_per_device": 5177 }, { "epoch": 0.2196, "loss_ce": 0.10646115988492966, "loss_lvr": 0.3468172550201416, "loss_mode_switch": 0.0, "loss_total": 0.14114288985729218, "step": 549 }, { "batch_size": 4, "epoch": 0.2196, "step": 549, "tokens_per_device": 1252 }, { "epoch": 0.2196, "loss_ce": 0.3834933042526245, "loss_lvr": 1.0771223306655884, "loss_mode_switch": 0.0, "loss_total": 0.4912055432796478, "step": 549 }, { "batch_size": 4, "epoch": 0.2196, "step": 549, "tokens_per_device": 4196 }, { "epoch": 0.2196, "loss_ce": 0.15623073279857635, "loss_lvr": 1.0736548900604248, "loss_mode_switch": 0.0, "loss_total": 0.26359623670578003, "step": 549 }, { "batch_size": 1, "epoch": 0.2196, "step": 549, "tokens_per_device": 5059 }, { "epoch": 0.2196, "loss_ce": 0.1612151563167572, "loss_lvr": 0.5452201962471008, "loss_mode_switch": 0.0, "loss_total": 0.21573717892169952, "step": 549 }, { "batch_size": 1, "epoch": 0.2196, "step": 549, "tokens_per_device": 5110 }, { "epoch": 0.2196, "loss_ce": 0.09135926514863968, "loss_lvr": 0.3581729531288147, "loss_mode_switch": 0.0, "loss_total": 0.12717655301094055, "step": 549 }, { "epoch": 0.22, "grad_norm": 1.3992317914962769, "learning_rate": 9.082818315286054e-06, "loss": 0.3454, "step": 550 }, { "batch_size": 1, "epoch": 0.22, "step": 550, "tokens_per_device": 4896 }, { "epoch": 0.22, "loss_ce": 0.03730255737900734, "loss_lvr": 0.6477370858192444, "loss_mode_switch": 0.0, "loss_total": 0.10207626223564148, "step": 550 }, { "batch_size": 4, "epoch": 0.22, "step": 550, "tokens_per_device": 4116 }, { "epoch": 0.22, "loss_ce": 0.04611097648739815, "loss_lvr": 1.2135560512542725, "loss_mode_switch": 0.0, "loss_total": 0.16746658086776733, "step": 550 }, { "batch_size": 4, "epoch": 0.22, "step": 550, "tokens_per_device": 4680 }, { "epoch": 0.22, "loss_ce": 0.014967127703130245, "loss_lvr": 0.9945526123046875, "loss_mode_switch": 0.0, "loss_total": 0.11442238837480545, "step": 550 }, { "batch_size": 4, "epoch": 0.22, "step": 550, "tokens_per_device": 4196 }, { "epoch": 0.22, "loss_ce": 0.459063857793808, "loss_lvr": 0.9672229886054993, "loss_mode_switch": 0.0, "loss_total": 0.5557861328125, "step": 550 }, { "batch_size": 4, "epoch": 0.22, "step": 550, "tokens_per_device": 6628 }, { "epoch": 0.22, "loss_ce": 0.03162773698568344, "loss_lvr": 0.8382808566093445, "loss_mode_switch": 0.0, "loss_total": 0.11545582115650177, "step": 550 }, { "batch_size": 4, "epoch": 0.22, "step": 550, "tokens_per_device": 4292 }, { "epoch": 0.22, "loss_ce": 0.3281461000442505, "loss_lvr": 0.7842157483100891, "loss_mode_switch": 0.0, "loss_total": 0.40656769275665283, "step": 550 }, { "batch_size": 1, "epoch": 0.22, "step": 550, "tokens_per_device": 5174 }, { "epoch": 0.22, "loss_ce": 0.09394676238298416, "loss_lvr": 0.39877647161483765, "loss_mode_switch": 0.0, "loss_total": 0.1338244080543518, "step": 550 }, { "batch_size": 1, "epoch": 0.22, "step": 550, "tokens_per_device": 5049 }, { "epoch": 0.22, "loss_ce": 0.01955536939203739, "loss_lvr": 0.7276822328567505, "loss_mode_switch": 0.0, "loss_total": 0.09232359379529953, "step": 550 }, { "epoch": 0.2204, "grad_norm": 1.214693546295166, "learning_rate": 9.079075712266501e-06, "loss": 0.2911, "step": 551 }, { "batch_size": 4, "epoch": 0.2204, "step": 551, "tokens_per_device": 4204 }, { "epoch": 0.2204, "loss_ce": 0.26368486881256104, "loss_lvr": 1.1003880500793457, "loss_mode_switch": 0.0, "loss_total": 0.37372368574142456, "step": 551 }, { "batch_size": 4, "epoch": 0.2204, "step": 551, "tokens_per_device": 1904 }, { "epoch": 0.2204, "loss_ce": 0.33425378799438477, "loss_lvr": 1.0361099243164062, "loss_mode_switch": 0.0, "loss_total": 0.4378647804260254, "step": 551 }, { "batch_size": 4, "epoch": 0.2204, "step": 551, "tokens_per_device": 3908 }, { "epoch": 0.2204, "loss_ce": 0.17474551498889923, "loss_lvr": 0.9131606817245483, "loss_mode_switch": 0.0, "loss_total": 0.26606157422065735, "step": 551 }, { "batch_size": 4, "epoch": 0.2204, "step": 551, "tokens_per_device": 5740 }, { "epoch": 0.2204, "loss_ce": 0.07731734961271286, "loss_lvr": 0.9103326201438904, "loss_mode_switch": 0.0, "loss_total": 0.16835060715675354, "step": 551 }, { "batch_size": 1, "epoch": 0.2204, "step": 551, "tokens_per_device": 5122 }, { "epoch": 0.2204, "loss_ce": 0.03533508628606796, "loss_lvr": 0.8866409659385681, "loss_mode_switch": 0.0, "loss_total": 0.12399918586015701, "step": 551 }, { "batch_size": 4, "epoch": 0.2204, "step": 551, "tokens_per_device": 4268 }, { "epoch": 0.2204, "loss_ce": 0.39029181003570557, "loss_lvr": 0.961564302444458, "loss_mode_switch": 0.0, "loss_total": 0.4864482283592224, "step": 551 }, { "batch_size": 4, "epoch": 0.2204, "step": 551, "tokens_per_device": 2684 }, { "epoch": 0.2204, "loss_ce": 0.17757149040699005, "loss_lvr": 0.9081159234046936, "loss_mode_switch": 0.0, "loss_total": 0.26838308572769165, "step": 551 }, { "batch_size": 4, "epoch": 0.2204, "step": 551, "tokens_per_device": 4196 }, { "epoch": 0.2204, "loss_ce": 0.31547510623931885, "loss_lvr": 1.2247470617294312, "loss_mode_switch": 0.0, "loss_total": 0.4379498064517975, "step": 551 }, { "epoch": 0.2208, "grad_norm": 1.3201855421066284, "learning_rate": 9.075326263230073e-06, "loss": 0.2927, "step": 552 }, { "batch_size": 1, "epoch": 0.2208, "step": 552, "tokens_per_device": 4877 }, { "epoch": 0.2208, "loss_ce": 1.064315915107727, "loss_lvr": 0.9075669050216675, "loss_mode_switch": 0.0, "loss_total": 1.155072569847107, "step": 552 }, { "batch_size": 1, "epoch": 0.2208, "step": 552, "tokens_per_device": 5064 }, { "epoch": 0.2208, "loss_ce": 0.021872444078326225, "loss_lvr": 0.5329376459121704, "loss_mode_switch": 0.0, "loss_total": 0.07516621053218842, "step": 552 }, { "batch_size": 4, "epoch": 0.2208, "step": 552, "tokens_per_device": 3380 }, { "epoch": 0.2208, "loss_ce": 0.07163665443658829, "loss_lvr": 0.8504347205162048, "loss_mode_switch": 0.0, "loss_total": 0.1566801369190216, "step": 552 }, { "batch_size": 4, "epoch": 0.2208, "step": 552, "tokens_per_device": 3796 }, { "epoch": 0.2208, "loss_ce": 0.5602887868881226, "loss_lvr": 1.4781360626220703, "loss_mode_switch": 0.0, "loss_total": 0.7081024050712585, "step": 552 }, { "batch_size": 1, "epoch": 0.2208, "step": 552, "tokens_per_device": 5178 }, { "epoch": 0.2208, "loss_ce": 0.0038852104917168617, "loss_lvr": 0.48132821917533875, "loss_mode_switch": 0.0, "loss_total": 0.05201803147792816, "step": 552 }, { "batch_size": 4, "epoch": 0.2208, "step": 552, "tokens_per_device": 6432 }, { "epoch": 0.2208, "loss_ce": 0.43548068404197693, "loss_lvr": 0.9855943322181702, "loss_mode_switch": 0.0, "loss_total": 0.534040093421936, "step": 552 }, { "batch_size": 4, "epoch": 0.2208, "step": 552, "tokens_per_device": 3816 }, { "epoch": 0.2208, "loss_ce": 0.14739146828651428, "loss_lvr": 0.9341610074043274, "loss_mode_switch": 0.0, "loss_total": 0.24080756306648254, "step": 552 }, { "batch_size": 1, "epoch": 0.2208, "step": 552, "tokens_per_device": 4974 }, { "epoch": 0.2208, "loss_ce": 0.002189299091696739, "loss_lvr": 0.4700450897216797, "loss_mode_switch": 0.0, "loss_total": 0.04919380694627762, "step": 552 }, { "epoch": 0.2212, "grad_norm": 1.3722072839736938, "learning_rate": 9.071569974469569e-06, "loss": 0.312, "step": 553 }, { "batch_size": 4, "epoch": 0.2212, "step": 553, "tokens_per_device": 4644 }, { "epoch": 0.2212, "loss_ce": 0.40470457077026367, "loss_lvr": 0.9366785287857056, "loss_mode_switch": 0.0, "loss_total": 0.4983724355697632, "step": 553 }, { "batch_size": 4, "epoch": 0.2212, "step": 553, "tokens_per_device": 2728 }, { "epoch": 0.2212, "loss_ce": 0.13831566274166107, "loss_lvr": 0.6242692470550537, "loss_mode_switch": 0.0, "loss_total": 0.20074258744716644, "step": 553 }, { "batch_size": 4, "epoch": 0.2212, "step": 553, "tokens_per_device": 1644 }, { "epoch": 0.2212, "loss_ce": 0.4664033055305481, "loss_lvr": 0.8523979187011719, "loss_mode_switch": 0.0, "loss_total": 0.5516430735588074, "step": 553 }, { "batch_size": 1, "epoch": 0.2212, "step": 553, "tokens_per_device": 4236 }, { "epoch": 0.2212, "loss_ce": 0.11304287612438202, "loss_lvr": 0.34073588252067566, "loss_mode_switch": 0.0, "loss_total": 0.14711646735668182, "step": 553 }, { "batch_size": 4, "epoch": 0.2212, "step": 553, "tokens_per_device": 5220 }, { "epoch": 0.2212, "loss_ce": 0.20401515066623688, "loss_lvr": 0.7851863503456116, "loss_mode_switch": 0.0, "loss_total": 0.28253379464149475, "step": 553 }, { "batch_size": 1, "epoch": 0.2212, "step": 553, "tokens_per_device": 5323 }, { "epoch": 0.2212, "loss_ce": 0.03342679888010025, "loss_lvr": 0.7417510151863098, "loss_mode_switch": 0.0, "loss_total": 0.10760190337896347, "step": 553 }, { "batch_size": 4, "epoch": 0.2212, "step": 553, "tokens_per_device": 1312 }, { "epoch": 0.2212, "loss_ce": 0.24847626686096191, "loss_lvr": 1.0696961879730225, "loss_mode_switch": 0.0, "loss_total": 0.35544589161872864, "step": 553 }, { "batch_size": 4, "epoch": 0.2212, "step": 553, "tokens_per_device": 1368 }, { "epoch": 0.2212, "loss_ce": 0.3071233928203583, "loss_lvr": 1.09218168258667, "loss_mode_switch": 0.0, "loss_total": 0.4163415729999542, "step": 553 }, { "epoch": 0.2216, "grad_norm": 1.4250166416168213, "learning_rate": 9.067806852289262e-06, "loss": 0.3165, "step": 554 }, { "batch_size": 1, "epoch": 0.2216, "step": 554, "tokens_per_device": 5112 }, { "epoch": 0.2216, "loss_ce": 0.12149560451507568, "loss_lvr": 0.33710092306137085, "loss_mode_switch": 0.0, "loss_total": 0.15520569682121277, "step": 554 }, { "batch_size": 4, "epoch": 0.2216, "step": 554, "tokens_per_device": 4456 }, { "epoch": 0.2216, "loss_ce": 0.11515803635120392, "loss_lvr": 0.8966224193572998, "loss_mode_switch": 0.0, "loss_total": 0.20482027530670166, "step": 554 }, { "batch_size": 4, "epoch": 0.2216, "step": 554, "tokens_per_device": 1468 }, { "epoch": 0.2216, "loss_ce": 0.4191262125968933, "loss_lvr": 1.0363216400146484, "loss_mode_switch": 0.0, "loss_total": 0.5227583646774292, "step": 554 }, { "batch_size": 4, "epoch": 0.2216, "step": 554, "tokens_per_device": 5248 }, { "epoch": 0.2216, "loss_ce": 0.005141683854162693, "loss_lvr": 0.8947610259056091, "loss_mode_switch": 0.0, "loss_total": 0.09461778402328491, "step": 554 }, { "batch_size": 4, "epoch": 0.2216, "step": 554, "tokens_per_device": 4692 }, { "epoch": 0.2216, "loss_ce": 0.11920532584190369, "loss_lvr": 0.8399302363395691, "loss_mode_switch": 0.0, "loss_total": 0.20319834351539612, "step": 554 }, { "batch_size": 1, "epoch": 0.2216, "step": 554, "tokens_per_device": 5189 }, { "epoch": 0.2216, "loss_ce": 0.00950666144490242, "loss_lvr": 0.5304911732673645, "loss_mode_switch": 0.0, "loss_total": 0.06255577504634857, "step": 554 }, { "batch_size": 4, "epoch": 0.2216, "step": 554, "tokens_per_device": 5216 }, { "epoch": 0.2216, "loss_ce": 0.4471956789493561, "loss_lvr": 0.9467212557792664, "loss_mode_switch": 0.0, "loss_total": 0.5418677926063538, "step": 554 }, { "batch_size": 4, "epoch": 0.2216, "step": 554, "tokens_per_device": 4496 }, { "epoch": 0.2216, "loss_ce": 0.26450270414352417, "loss_lvr": 0.8702408075332642, "loss_mode_switch": 0.0, "loss_total": 0.35152679681777954, "step": 554 }, { "epoch": 0.222, "grad_norm": 1.2283577919006348, "learning_rate": 9.0640369030049e-06, "loss": 0.2778, "step": 555 }, { "batch_size": 4, "epoch": 0.222, "step": 555, "tokens_per_device": 3764 }, { "epoch": 0.222, "loss_ce": 0.20772406458854675, "loss_lvr": 1.0574896335601807, "loss_mode_switch": 0.0, "loss_total": 0.31347304582595825, "step": 555 }, { "batch_size": 4, "epoch": 0.222, "step": 555, "tokens_per_device": 3800 }, { "epoch": 0.222, "loss_ce": 0.0984344631433487, "loss_lvr": 1.156434416770935, "loss_mode_switch": 0.0, "loss_total": 0.2140779048204422, "step": 555 }, { "batch_size": 1, "epoch": 0.222, "step": 555, "tokens_per_device": 7515 }, { "epoch": 0.222, "loss_ce": 0.021528279408812523, "loss_lvr": 0.3521891236305237, "loss_mode_switch": 0.0, "loss_total": 0.0567471906542778, "step": 555 }, { "batch_size": 4, "epoch": 0.222, "step": 555, "tokens_per_device": 2532 }, { "epoch": 0.222, "loss_ce": 0.3365069627761841, "loss_lvr": 1.0538610219955444, "loss_mode_switch": 0.0, "loss_total": 0.441893070936203, "step": 555 }, { "batch_size": 1, "epoch": 0.222, "step": 555, "tokens_per_device": 4903 }, { "epoch": 0.222, "loss_ce": 0.18254292011260986, "loss_lvr": 0.5546469688415527, "loss_mode_switch": 0.0, "loss_total": 0.23800761997699738, "step": 555 }, { "batch_size": 4, "epoch": 0.222, "step": 555, "tokens_per_device": 4384 }, { "epoch": 0.222, "loss_ce": 0.595184862613678, "loss_lvr": 0.782838761806488, "loss_mode_switch": 0.0, "loss_total": 0.6734687089920044, "step": 555 }, { "batch_size": 4, "epoch": 0.222, "step": 555, "tokens_per_device": 1476 }, { "epoch": 0.222, "loss_ce": 0.43086591362953186, "loss_lvr": 1.1163907051086426, "loss_mode_switch": 0.0, "loss_total": 0.5425049662590027, "step": 555 }, { "batch_size": 1, "epoch": 0.222, "step": 555, "tokens_per_device": 4851 }, { "epoch": 0.222, "loss_ce": 0.0007652059430256486, "loss_lvr": 0.49446210265159607, "loss_mode_switch": 0.0, "loss_total": 0.05021141469478607, "step": 555 }, { "epoch": 0.2224, "grad_norm": 1.5752352476119995, "learning_rate": 9.060260132943682e-06, "loss": 0.3342, "step": 556 }, { "batch_size": 4, "epoch": 0.2224, "step": 556, "tokens_per_device": 4816 }, { "epoch": 0.2224, "loss_ce": 0.23865845799446106, "loss_lvr": 0.6990105509757996, "loss_mode_switch": 0.0, "loss_total": 0.30855950713157654, "step": 556 }, { "batch_size": 4, "epoch": 0.2224, "step": 556, "tokens_per_device": 4984 }, { "epoch": 0.2224, "loss_ce": 0.17030705511569977, "loss_lvr": 1.0681718587875366, "loss_mode_switch": 0.0, "loss_total": 0.27712422609329224, "step": 556 }, { "batch_size": 1, "epoch": 0.2224, "step": 556, "tokens_per_device": 4892 }, { "epoch": 0.2224, "loss_ce": 0.05799083784222603, "loss_lvr": 0.6424645185470581, "loss_mode_switch": 0.0, "loss_total": 0.12223729491233826, "step": 556 }, { "batch_size": 1, "epoch": 0.2224, "step": 556, "tokens_per_device": 5179 }, { "epoch": 0.2224, "loss_ce": 0.0605638213455677, "loss_lvr": 0.30058032274246216, "loss_mode_switch": 0.0, "loss_total": 0.09062185138463974, "step": 556 }, { "batch_size": 4, "epoch": 0.2224, "step": 556, "tokens_per_device": 4088 }, { "epoch": 0.2224, "loss_ce": 0.4790050983428955, "loss_lvr": 0.935703694820404, "loss_mode_switch": 0.0, "loss_total": 0.5725754499435425, "step": 556 }, { "batch_size": 4, "epoch": 0.2224, "step": 556, "tokens_per_device": 6244 }, { "epoch": 0.2224, "loss_ce": 0.019657880067825317, "loss_lvr": 1.1824469566345215, "loss_mode_switch": 0.0, "loss_total": 0.13790258765220642, "step": 556 }, { "batch_size": 4, "epoch": 0.2224, "step": 556, "tokens_per_device": 1456 }, { "epoch": 0.2224, "loss_ce": 0.41755202412605286, "loss_lvr": 1.1718287467956543, "loss_mode_switch": 0.0, "loss_total": 0.5347349047660828, "step": 556 }, { "batch_size": 4, "epoch": 0.2224, "step": 556, "tokens_per_device": 5464 }, { "epoch": 0.2224, "loss_ce": 0.03534260764718056, "loss_lvr": 0.9572725296020508, "loss_mode_switch": 0.0, "loss_total": 0.1310698688030243, "step": 556 }, { "epoch": 0.2228, "grad_norm": 1.560701608657837, "learning_rate": 9.056476548444258e-06, "loss": 0.3216, "step": 557 }, { "batch_size": 4, "epoch": 0.2228, "step": 557, "tokens_per_device": 5808 }, { "epoch": 0.2228, "loss_ce": 0.6821380257606506, "loss_lvr": 0.8257409930229187, "loss_mode_switch": 0.0, "loss_total": 0.7647120952606201, "step": 557 }, { "batch_size": 1, "epoch": 0.2228, "step": 557, "tokens_per_device": 4769 }, { "epoch": 0.2228, "loss_ce": 0.014375695027410984, "loss_lvr": 0.702316403388977, "loss_mode_switch": 0.0, "loss_total": 0.08460733294487, "step": 557 }, { "batch_size": 4, "epoch": 0.2228, "step": 557, "tokens_per_device": 4756 }, { "epoch": 0.2228, "loss_ce": 0.13159430027008057, "loss_lvr": 0.8063037395477295, "loss_mode_switch": 0.0, "loss_total": 0.21222467720508575, "step": 557 }, { "batch_size": 4, "epoch": 0.2228, "step": 557, "tokens_per_device": 1384 }, { "epoch": 0.2228, "loss_ce": 0.20109426975250244, "loss_lvr": 1.032952070236206, "loss_mode_switch": 0.0, "loss_total": 0.30438947677612305, "step": 557 }, { "batch_size": 1, "epoch": 0.2228, "step": 557, "tokens_per_device": 4891 }, { "epoch": 0.2228, "loss_ce": 0.009714005514979362, "loss_lvr": 0.2794884145259857, "loss_mode_switch": 0.0, "loss_total": 0.037662848830223083, "step": 557 }, { "batch_size": 4, "epoch": 0.2228, "step": 557, "tokens_per_device": 6016 }, { "epoch": 0.2228, "loss_ce": 0.11913515627384186, "loss_lvr": 0.9006925225257874, "loss_mode_switch": 0.0, "loss_total": 0.20920440554618835, "step": 557 }, { "batch_size": 4, "epoch": 0.2228, "step": 557, "tokens_per_device": 4392 }, { "epoch": 0.2228, "loss_ce": 0.25246700644493103, "loss_lvr": 0.9194543361663818, "loss_mode_switch": 0.0, "loss_total": 0.3444124460220337, "step": 557 }, { "batch_size": 1, "epoch": 0.2228, "step": 557, "tokens_per_device": 4885 }, { "epoch": 0.2228, "loss_ce": 0.13096678256988525, "loss_lvr": 0.6918911337852478, "loss_mode_switch": 0.0, "loss_total": 0.20015589892864227, "step": 557 }, { "epoch": 0.2232, "grad_norm": 1.5243338346481323, "learning_rate": 9.052686155856716e-06, "loss": 0.3645, "step": 558 }, { "batch_size": 1, "epoch": 0.2232, "step": 558, "tokens_per_device": 5149 }, { "epoch": 0.2232, "loss_ce": 0.28446319699287415, "loss_lvr": 0.4033674895763397, "loss_mode_switch": 0.0, "loss_total": 0.32479995489120483, "step": 558 }, { "batch_size": 4, "epoch": 0.2232, "step": 558, "tokens_per_device": 6472 }, { "epoch": 0.2232, "loss_ce": 0.027789654210209846, "loss_lvr": 0.7647474408149719, "loss_mode_switch": 0.0, "loss_total": 0.10426440089941025, "step": 558 }, { "batch_size": 1, "epoch": 0.2232, "step": 558, "tokens_per_device": 4965 }, { "epoch": 0.2232, "loss_ce": 0.09119370579719543, "loss_lvr": 0.2853034436702728, "loss_mode_switch": 0.0, "loss_total": 0.11972405016422272, "step": 558 }, { "batch_size": 1, "epoch": 0.2232, "step": 558, "tokens_per_device": 5117 }, { "epoch": 0.2232, "loss_ce": 0.0090761324390769, "loss_lvr": 1.2012884616851807, "loss_mode_switch": 0.0, "loss_total": 0.12920497357845306, "step": 558 }, { "batch_size": 1, "epoch": 0.2232, "step": 558, "tokens_per_device": 5135 }, { "epoch": 0.2232, "loss_ce": 0.10468325018882751, "loss_lvr": 0.45618128776550293, "loss_mode_switch": 0.0, "loss_total": 0.15030138194561005, "step": 558 }, { "batch_size": 1, "epoch": 0.2232, "step": 558, "tokens_per_device": 4876 }, { "epoch": 0.2232, "loss_ce": 0.027665821835398674, "loss_lvr": 0.6914520263671875, "loss_mode_switch": 0.0, "loss_total": 0.09681102633476257, "step": 558 }, { "batch_size": 1, "epoch": 0.2232, "step": 558, "tokens_per_device": 5182 }, { "epoch": 0.2232, "loss_ce": 0.3068957030773163, "loss_lvr": 0.5836157202720642, "loss_mode_switch": 0.0, "loss_total": 0.36525726318359375, "step": 558 }, { "batch_size": 4, "epoch": 0.2232, "step": 558, "tokens_per_device": 4956 }, { "epoch": 0.2232, "loss_ce": 0.03048047050833702, "loss_lvr": 0.8205105066299438, "loss_mode_switch": 0.0, "loss_total": 0.11253152787685394, "step": 558 }, { "epoch": 0.2236, "grad_norm": 1.6255826950073242, "learning_rate": 9.048888961542565e-06, "loss": 0.3213, "step": 559 }, { "batch_size": 4, "epoch": 0.2236, "step": 559, "tokens_per_device": 4980 }, { "epoch": 0.2236, "loss_ce": 0.19730515778064728, "loss_lvr": 0.8313707113265991, "loss_mode_switch": 0.0, "loss_total": 0.2804422378540039, "step": 559 }, { "batch_size": 4, "epoch": 0.2236, "step": 559, "tokens_per_device": 4212 }, { "epoch": 0.2236, "loss_ce": 0.11166519671678543, "loss_lvr": 1.0806916952133179, "loss_mode_switch": 0.0, "loss_total": 0.21973437070846558, "step": 559 }, { "batch_size": 4, "epoch": 0.2236, "step": 559, "tokens_per_device": 4004 }, { "epoch": 0.2236, "loss_ce": 0.12097005546092987, "loss_lvr": 0.8466465473175049, "loss_mode_switch": 0.0, "loss_total": 0.2056347131729126, "step": 559 }, { "batch_size": 4, "epoch": 0.2236, "step": 559, "tokens_per_device": 5920 }, { "epoch": 0.2236, "loss_ce": 0.2975163459777832, "loss_lvr": 0.8938024044036865, "loss_mode_switch": 0.0, "loss_total": 0.3868965804576874, "step": 559 }, { "batch_size": 4, "epoch": 0.2236, "step": 559, "tokens_per_device": 4280 }, { "epoch": 0.2236, "loss_ce": 0.301850825548172, "loss_lvr": 1.0701161623001099, "loss_mode_switch": 0.0, "loss_total": 0.408862441778183, "step": 559 }, { "batch_size": 4, "epoch": 0.2236, "step": 559, "tokens_per_device": 3188 }, { "epoch": 0.2236, "loss_ce": 0.4079414904117584, "loss_lvr": 0.9658601880073547, "loss_mode_switch": 0.0, "loss_total": 0.5045275092124939, "step": 559 }, { "batch_size": 1, "epoch": 0.2236, "step": 559, "tokens_per_device": 5088 }, { "epoch": 0.2236, "loss_ce": 0.0031119936611503363, "loss_lvr": 0.3138340711593628, "loss_mode_switch": 0.0, "loss_total": 0.03449539840221405, "step": 559 }, { "batch_size": 4, "epoch": 0.2236, "step": 559, "tokens_per_device": 4192 }, { "epoch": 0.2236, "loss_ce": 0.2548366189002991, "loss_lvr": 1.1441341638565063, "loss_mode_switch": 0.0, "loss_total": 0.36925002932548523, "step": 559 }, { "epoch": 0.224, "grad_norm": 1.2355042695999146, "learning_rate": 9.045084971874738e-06, "loss": 0.3232, "step": 560 }, { "batch_size": 1, "epoch": 0.224, "step": 560, "tokens_per_device": 7387 }, { "epoch": 0.224, "loss_ce": 0.0017712267581373453, "loss_lvr": 0.4058615267276764, "loss_mode_switch": 0.0, "loss_total": 0.04235738143324852, "step": 560 }, { "batch_size": 1, "epoch": 0.224, "step": 560, "tokens_per_device": 4865 }, { "epoch": 0.224, "loss_ce": 0.0079041738063097, "loss_lvr": 0.2697421610355377, "loss_mode_switch": 0.0, "loss_total": 0.03487838804721832, "step": 560 }, { "batch_size": 1, "epoch": 0.224, "step": 560, "tokens_per_device": 5182 }, { "epoch": 0.224, "loss_ce": 0.006450179498642683, "loss_lvr": 0.3202361762523651, "loss_mode_switch": 0.0, "loss_total": 0.03847379982471466, "step": 560 }, { "batch_size": 4, "epoch": 0.224, "step": 560, "tokens_per_device": 5912 }, { "epoch": 0.224, "loss_ce": 0.6068503856658936, "loss_lvr": 0.8157130479812622, "loss_mode_switch": 0.0, "loss_total": 0.6884216666221619, "step": 560 }, { "batch_size": 4, "epoch": 0.224, "step": 560, "tokens_per_device": 4432 }, { "epoch": 0.224, "loss_ce": 0.4854480028152466, "loss_lvr": 0.5731713175773621, "loss_mode_switch": 0.0, "loss_total": 0.5427651405334473, "step": 560 }, { "batch_size": 1, "epoch": 0.224, "step": 560, "tokens_per_device": 5162 }, { "epoch": 0.224, "loss_ce": 0.014225024729967117, "loss_lvr": 0.6388701796531677, "loss_mode_switch": 0.0, "loss_total": 0.07811205089092255, "step": 560 }, { "batch_size": 4, "epoch": 0.224, "step": 560, "tokens_per_device": 1364 }, { "epoch": 0.224, "loss_ce": 0.14190910756587982, "loss_lvr": 1.038162112236023, "loss_mode_switch": 0.0, "loss_total": 0.24572531878948212, "step": 560 }, { "batch_size": 1, "epoch": 0.224, "step": 560, "tokens_per_device": 4890 }, { "epoch": 0.224, "loss_ce": 0.012593785300850868, "loss_lvr": 0.41637885570526123, "loss_mode_switch": 0.0, "loss_total": 0.0542316734790802, "step": 560 }, { "epoch": 0.2244, "grad_norm": 1.391332745552063, "learning_rate": 9.041274193237565e-06, "loss": 0.3074, "step": 561 }, { "batch_size": 4, "epoch": 0.2244, "step": 561, "tokens_per_device": 4276 }, { "epoch": 0.2244, "loss_ce": 0.33161354064941406, "loss_lvr": 1.0475777387619019, "loss_mode_switch": 0.0, "loss_total": 0.4363713264465332, "step": 561 }, { "batch_size": 4, "epoch": 0.2244, "step": 561, "tokens_per_device": 5296 }, { "epoch": 0.2244, "loss_ce": 0.15630288422107697, "loss_lvr": 0.890988826751709, "loss_mode_switch": 0.0, "loss_total": 0.2454017698764801, "step": 561 }, { "batch_size": 4, "epoch": 0.2244, "step": 561, "tokens_per_device": 1452 }, { "epoch": 0.2244, "loss_ce": 0.47235992550849915, "loss_lvr": 1.1036368608474731, "loss_mode_switch": 0.0, "loss_total": 0.5827236175537109, "step": 561 }, { "batch_size": 1, "epoch": 0.2244, "step": 561, "tokens_per_device": 4882 }, { "epoch": 0.2244, "loss_ce": 0.004508242942392826, "loss_lvr": 0.32703182101249695, "loss_mode_switch": 0.0, "loss_total": 0.037211425602436066, "step": 561 }, { "batch_size": 1, "epoch": 0.2244, "step": 561, "tokens_per_device": 4900 }, { "epoch": 0.2244, "loss_ce": 0.014170778915286064, "loss_lvr": 0.47758975625038147, "loss_mode_switch": 0.0, "loss_total": 0.06192975491285324, "step": 561 }, { "batch_size": 4, "epoch": 0.2244, "step": 561, "tokens_per_device": 1304 }, { "epoch": 0.2244, "loss_ce": 0.3871956765651703, "loss_lvr": 1.3569324016571045, "loss_mode_switch": 0.0, "loss_total": 0.5228888988494873, "step": 561 }, { "batch_size": 1, "epoch": 0.2244, "step": 561, "tokens_per_device": 5335 }, { "epoch": 0.2244, "loss_ce": 0.002145326230674982, "loss_lvr": 0.5442814826965332, "loss_mode_switch": 0.0, "loss_total": 0.056573476642370224, "step": 561 }, { "batch_size": 1, "epoch": 0.2244, "step": 561, "tokens_per_device": 5088 }, { "epoch": 0.2244, "loss_ce": 0.03476257622241974, "loss_lvr": 1.3390339612960815, "loss_mode_switch": 0.0, "loss_total": 0.16866597533226013, "step": 561 }, { "epoch": 0.2248, "grad_norm": 1.4258689880371094, "learning_rate": 9.037456632026774e-06, "loss": 0.295, "step": 562 }, { "batch_size": 1, "epoch": 0.2248, "step": 562, "tokens_per_device": 4877 }, { "epoch": 0.2248, "loss_ce": 0.039426494389772415, "loss_lvr": 1.1687676906585693, "loss_mode_switch": 0.0, "loss_total": 0.156303271651268, "step": 562 }, { "batch_size": 4, "epoch": 0.2248, "step": 562, "tokens_per_device": 2612 }, { "epoch": 0.2248, "loss_ce": 0.6756799817085266, "loss_lvr": 0.8769570589065552, "loss_mode_switch": 0.0, "loss_total": 0.7633756995201111, "step": 562 }, { "batch_size": 4, "epoch": 0.2248, "step": 562, "tokens_per_device": 6076 }, { "epoch": 0.2248, "loss_ce": 0.2556096017360687, "loss_lvr": 0.7970169186592102, "loss_mode_switch": 0.0, "loss_total": 0.33531129360198975, "step": 562 }, { "batch_size": 4, "epoch": 0.2248, "step": 562, "tokens_per_device": 4508 }, { "epoch": 0.2248, "loss_ce": 0.477385938167572, "loss_lvr": 0.8760702013969421, "loss_mode_switch": 0.0, "loss_total": 0.5649929642677307, "step": 562 }, { "batch_size": 1, "epoch": 0.2248, "step": 562, "tokens_per_device": 5015 }, { "epoch": 0.2248, "loss_ce": 0.09598936885595322, "loss_lvr": 0.5942376255989075, "loss_mode_switch": 0.0, "loss_total": 0.15541313588619232, "step": 562 }, { "batch_size": 1, "epoch": 0.2248, "step": 562, "tokens_per_device": 4890 }, { "epoch": 0.2248, "loss_ce": 0.889984667301178, "loss_lvr": 0.5428001284599304, "loss_mode_switch": 0.0, "loss_total": 0.9442646503448486, "step": 562 }, { "batch_size": 4, "epoch": 0.2248, "step": 562, "tokens_per_device": 4368 }, { "epoch": 0.2248, "loss_ce": 0.4465098977088928, "loss_lvr": 1.201530933380127, "loss_mode_switch": 0.0, "loss_total": 0.5666629672050476, "step": 562 }, { "batch_size": 4, "epoch": 0.2248, "step": 562, "tokens_per_device": 2864 }, { "epoch": 0.2248, "loss_ce": 0.26564428210258484, "loss_lvr": 1.2143570184707642, "loss_mode_switch": 0.0, "loss_total": 0.38707998394966125, "step": 562 }, { "epoch": 0.2252, "grad_norm": 1.7523305416107178, "learning_rate": 9.033632294649473e-06, "loss": 0.3528, "step": 563 }, { "batch_size": 4, "epoch": 0.2252, "step": 563, "tokens_per_device": 2756 }, { "epoch": 0.2252, "loss_ce": 0.6557647585868835, "loss_lvr": 0.7570354342460632, "loss_mode_switch": 0.0, "loss_total": 0.7314683198928833, "step": 563 }, { "batch_size": 4, "epoch": 0.2252, "step": 563, "tokens_per_device": 2752 }, { "epoch": 0.2252, "loss_ce": 0.3463233411312103, "loss_lvr": 0.8035928606987, "loss_mode_switch": 0.0, "loss_total": 0.42668262124061584, "step": 563 }, { "batch_size": 1, "epoch": 0.2252, "step": 563, "tokens_per_device": 4888 }, { "epoch": 0.2252, "loss_ce": 0.10172998160123825, "loss_lvr": 0.32913637161254883, "loss_mode_switch": 0.0, "loss_total": 0.13464361429214478, "step": 563 }, { "batch_size": 4, "epoch": 0.2252, "step": 563, "tokens_per_device": 5104 }, { "epoch": 0.2252, "loss_ce": 0.11942192912101746, "loss_lvr": 0.7284114956855774, "loss_mode_switch": 0.0, "loss_total": 0.19226308166980743, "step": 563 }, { "batch_size": 4, "epoch": 0.2252, "step": 563, "tokens_per_device": 1516 }, { "epoch": 0.2252, "loss_ce": 0.10757529735565186, "loss_lvr": 0.9429633617401123, "loss_mode_switch": 0.0, "loss_total": 0.20187163352966309, "step": 563 }, { "batch_size": 4, "epoch": 0.2252, "step": 563, "tokens_per_device": 2596 }, { "epoch": 0.2252, "loss_ce": 0.660582423210144, "loss_lvr": 1.0133781433105469, "loss_mode_switch": 0.0, "loss_total": 0.7619202136993408, "step": 563 }, { "batch_size": 4, "epoch": 0.2252, "step": 563, "tokens_per_device": 2624 }, { "epoch": 0.2252, "loss_ce": 0.22277875244617462, "loss_lvr": 1.228399395942688, "loss_mode_switch": 0.0, "loss_total": 0.34561869502067566, "step": 563 }, { "batch_size": 1, "epoch": 0.2252, "step": 563, "tokens_per_device": 5104 }, { "epoch": 0.2252, "loss_ce": 0.026890184730291367, "loss_lvr": 0.6469277739524841, "loss_mode_switch": 0.0, "loss_total": 0.09158296883106232, "step": 563 }, { "epoch": 0.2256, "grad_norm": 1.5620683431625366, "learning_rate": 9.029801187524147e-06, "loss": 0.315, "step": 564 }, { "batch_size": 1, "epoch": 0.2256, "step": 564, "tokens_per_device": 5167 }, { "epoch": 0.2256, "loss_ce": 0.15329082310199738, "loss_lvr": 0.29165396094322205, "loss_mode_switch": 0.0, "loss_total": 0.18245622515678406, "step": 564 }, { "batch_size": 1, "epoch": 0.2256, "step": 564, "tokens_per_device": 4984 }, { "epoch": 0.2256, "loss_ce": 0.07272929698228836, "loss_lvr": 0.48775941133499146, "loss_mode_switch": 0.0, "loss_total": 0.1215052381157875, "step": 564 }, { "batch_size": 1, "epoch": 0.2256, "step": 564, "tokens_per_device": 5110 }, { "epoch": 0.2256, "loss_ce": 0.005763735622167587, "loss_lvr": 0.28923550248146057, "loss_mode_switch": 0.0, "loss_total": 0.034687288105487823, "step": 564 }, { "batch_size": 4, "epoch": 0.2256, "step": 564, "tokens_per_device": 2300 }, { "epoch": 0.2256, "loss_ce": 0.6240912079811096, "loss_lvr": 1.130682349205017, "loss_mode_switch": 0.0, "loss_total": 0.7371594309806824, "step": 564 }, { "batch_size": 4, "epoch": 0.2256, "step": 564, "tokens_per_device": 4276 }, { "epoch": 0.2256, "loss_ce": 0.629426121711731, "loss_lvr": 1.097609519958496, "loss_mode_switch": 0.0, "loss_total": 0.7391870617866516, "step": 564 }, { "batch_size": 1, "epoch": 0.2256, "step": 564, "tokens_per_device": 5567 }, { "epoch": 0.2256, "loss_ce": 0.0010176942450925708, "loss_lvr": 0.3656157851219177, "loss_mode_switch": 0.0, "loss_total": 0.03757927194237709, "step": 564 }, { "batch_size": 4, "epoch": 0.2256, "step": 564, "tokens_per_device": 4724 }, { "epoch": 0.2256, "loss_ce": 0.5250881314277649, "loss_lvr": 1.001857042312622, "loss_mode_switch": 0.0, "loss_total": 0.6252738237380981, "step": 564 }, { "batch_size": 1, "epoch": 0.2256, "step": 564, "tokens_per_device": 4870 }, { "epoch": 0.2256, "loss_ce": 0.22111843526363373, "loss_lvr": 0.3863767981529236, "loss_mode_switch": 0.0, "loss_total": 0.2597561180591583, "step": 564 }, { "epoch": 0.226, "grad_norm": 1.443268060684204, "learning_rate": 9.025963317080641e-06, "loss": 0.3118, "step": 565 }, { "batch_size": 1, "epoch": 0.226, "step": 565, "tokens_per_device": 5136 }, { "epoch": 0.226, "loss_ce": 0.07365667819976807, "loss_lvr": 0.42404013872146606, "loss_mode_switch": 0.0, "loss_total": 0.11606068909168243, "step": 565 }, { "batch_size": 1, "epoch": 0.226, "step": 565, "tokens_per_device": 4883 }, { "epoch": 0.226, "loss_ce": 0.010245620273053646, "loss_lvr": 0.38093098998069763, "loss_mode_switch": 0.0, "loss_total": 0.04833872243762016, "step": 565 }, { "batch_size": 4, "epoch": 0.226, "step": 565, "tokens_per_device": 4224 }, { "epoch": 0.226, "loss_ce": 0.5889759659767151, "loss_lvr": 1.2164331674575806, "loss_mode_switch": 0.0, "loss_total": 0.7106192708015442, "step": 565 }, { "batch_size": 1, "epoch": 0.226, "step": 565, "tokens_per_device": 5122 }, { "epoch": 0.226, "loss_ce": 0.04557505622506142, "loss_lvr": 0.561075747013092, "loss_mode_switch": 0.0, "loss_total": 0.1016826331615448, "step": 565 }, { "batch_size": 4, "epoch": 0.226, "step": 565, "tokens_per_device": 7320 }, { "epoch": 0.226, "loss_ce": 0.6416330337524414, "loss_lvr": 0.9830434918403625, "loss_mode_switch": 0.0, "loss_total": 0.7399373650550842, "step": 565 }, { "batch_size": 4, "epoch": 0.226, "step": 565, "tokens_per_device": 3736 }, { "epoch": 0.226, "loss_ce": 0.05555712431669235, "loss_lvr": 1.0712037086486816, "loss_mode_switch": 0.0, "loss_total": 0.16267749667167664, "step": 565 }, { "batch_size": 4, "epoch": 0.226, "step": 565, "tokens_per_device": 1392 }, { "epoch": 0.226, "loss_ce": 0.28152352571487427, "loss_lvr": 1.1420270204544067, "loss_mode_switch": 0.0, "loss_total": 0.3957262337207794, "step": 565 }, { "batch_size": 4, "epoch": 0.226, "step": 565, "tokens_per_device": 4476 }, { "epoch": 0.226, "loss_ce": 0.5845144391059875, "loss_lvr": 1.0648002624511719, "loss_mode_switch": 0.0, "loss_total": 0.6909944415092468, "step": 565 }, { "epoch": 0.2264, "grad_norm": 1.5455626249313354, "learning_rate": 9.022118689760153e-06, "loss": 0.3299, "step": 566 }, { "batch_size": 1, "epoch": 0.2264, "step": 566, "tokens_per_device": 7537 }, { "epoch": 0.2264, "loss_ce": 0.0014686573995277286, "loss_lvr": 0.5683281421661377, "loss_mode_switch": 0.0, "loss_total": 0.058301474899053574, "step": 566 }, { "batch_size": 4, "epoch": 0.2264, "step": 566, "tokens_per_device": 4908 }, { "epoch": 0.2264, "loss_ce": 0.4743315875530243, "loss_lvr": 0.9139931201934814, "loss_mode_switch": 0.0, "loss_total": 0.5657309293746948, "step": 566 }, { "batch_size": 1, "epoch": 0.2264, "step": 566, "tokens_per_device": 4829 }, { "epoch": 0.2264, "loss_ce": 0.010715640150010586, "loss_lvr": 0.752159595489502, "loss_mode_switch": 0.0, "loss_total": 0.08593159914016724, "step": 566 }, { "batch_size": 1, "epoch": 0.2264, "step": 566, "tokens_per_device": 4886 }, { "epoch": 0.2264, "loss_ce": 0.013681059703230858, "loss_lvr": 0.26276183128356934, "loss_mode_switch": 0.0, "loss_total": 0.03995724394917488, "step": 566 }, { "batch_size": 4, "epoch": 0.2264, "step": 566, "tokens_per_device": 4104 }, { "epoch": 0.2264, "loss_ce": 0.8280071020126343, "loss_lvr": 1.0295339822769165, "loss_mode_switch": 0.0, "loss_total": 0.930960476398468, "step": 566 }, { "batch_size": 4, "epoch": 0.2264, "step": 566, "tokens_per_device": 2716 }, { "epoch": 0.2264, "loss_ce": 0.08546476066112518, "loss_lvr": 0.8851373791694641, "loss_mode_switch": 0.0, "loss_total": 0.1739785075187683, "step": 566 }, { "batch_size": 1, "epoch": 0.2264, "step": 566, "tokens_per_device": 5058 }, { "epoch": 0.2264, "loss_ce": 0.010953851975500584, "loss_lvr": 0.39836204051971436, "loss_mode_switch": 0.0, "loss_total": 0.050790056586265564, "step": 566 }, { "batch_size": 1, "epoch": 0.2264, "step": 566, "tokens_per_device": 5119 }, { "epoch": 0.2264, "loss_ce": 0.06833215802907944, "loss_lvr": 0.5196557641029358, "loss_mode_switch": 0.0, "loss_total": 0.12029772996902466, "step": 566 }, { "epoch": 0.2268, "grad_norm": 1.3268028497695923, "learning_rate": 9.018267312015214e-06, "loss": 0.2621, "step": 567 }, { "batch_size": 4, "epoch": 0.2268, "step": 567, "tokens_per_device": 1556 }, { "epoch": 0.2268, "loss_ce": 0.056777264922857285, "loss_lvr": 1.8506609201431274, "loss_mode_switch": 0.0, "loss_total": 0.2418433576822281, "step": 567 }, { "batch_size": 4, "epoch": 0.2268, "step": 567, "tokens_per_device": 2688 }, { "epoch": 0.2268, "loss_ce": 0.3457849621772766, "loss_lvr": 1.5846244096755981, "loss_mode_switch": 0.0, "loss_total": 0.5042474269866943, "step": 567 }, { "batch_size": 4, "epoch": 0.2268, "step": 567, "tokens_per_device": 5868 }, { "epoch": 0.2268, "loss_ce": 0.31326958537101746, "loss_lvr": 0.8474998474121094, "loss_mode_switch": 0.0, "loss_total": 0.39801958203315735, "step": 567 }, { "batch_size": 4, "epoch": 0.2268, "step": 567, "tokens_per_device": 5480 }, { "epoch": 0.2268, "loss_ce": 0.30061981081962585, "loss_lvr": 0.900465190410614, "loss_mode_switch": 0.0, "loss_total": 0.39066633582115173, "step": 567 }, { "batch_size": 4, "epoch": 0.2268, "step": 567, "tokens_per_device": 4216 }, { "epoch": 0.2268, "loss_ce": 0.12849000096321106, "loss_lvr": 0.9731132984161377, "loss_mode_switch": 0.0, "loss_total": 0.22580133378505707, "step": 567 }, { "batch_size": 1, "epoch": 0.2268, "step": 567, "tokens_per_device": 5359 }, { "epoch": 0.2268, "loss_ce": 0.014961080625653267, "loss_lvr": 0.5291289687156677, "loss_mode_switch": 0.0, "loss_total": 0.06787397712469101, "step": 567 }, { "batch_size": 4, "epoch": 0.2268, "step": 567, "tokens_per_device": 2676 }, { "epoch": 0.2268, "loss_ce": 0.05878780409693718, "loss_lvr": 0.8754081130027771, "loss_mode_switch": 0.0, "loss_total": 0.1463286131620407, "step": 567 }, { "batch_size": 4, "epoch": 0.2268, "step": 567, "tokens_per_device": 1476 }, { "epoch": 0.2268, "loss_ce": 0.5050081014633179, "loss_lvr": 1.0565003156661987, "loss_mode_switch": 0.0, "loss_total": 0.6106581091880798, "step": 567 }, { "epoch": 0.2272, "grad_norm": 1.3949135541915894, "learning_rate": 9.014409190309695e-06, "loss": 0.3646, "step": 568 }, { "batch_size": 4, "epoch": 0.2272, "step": 568, "tokens_per_device": 4244 }, { "epoch": 0.2272, "loss_ce": 0.10982254147529602, "loss_lvr": 1.0690540075302124, "loss_mode_switch": 0.0, "loss_total": 0.21672794222831726, "step": 568 }, { "batch_size": 1, "epoch": 0.2272, "step": 568, "tokens_per_device": 4888 }, { "epoch": 0.2272, "loss_ce": 0.00044900542707182467, "loss_lvr": 0.32078176736831665, "loss_mode_switch": 0.0, "loss_total": 0.03252718225121498, "step": 568 }, { "batch_size": 4, "epoch": 0.2272, "step": 568, "tokens_per_device": 6588 }, { "epoch": 0.2272, "loss_ce": 0.06136060506105423, "loss_lvr": 0.7997786998748779, "loss_mode_switch": 0.0, "loss_total": 0.14133846759796143, "step": 568 }, { "batch_size": 1, "epoch": 0.2272, "step": 568, "tokens_per_device": 6621 }, { "epoch": 0.2272, "loss_ce": 0.010629738681018353, "loss_lvr": 0.40208232402801514, "loss_mode_switch": 0.0, "loss_total": 0.05083797127008438, "step": 568 }, { "batch_size": 4, "epoch": 0.2272, "step": 568, "tokens_per_device": 4812 }, { "epoch": 0.2272, "loss_ce": 0.4103526473045349, "loss_lvr": 0.8976351618766785, "loss_mode_switch": 0.0, "loss_total": 0.5001161694526672, "step": 568 }, { "batch_size": 4, "epoch": 0.2272, "step": 568, "tokens_per_device": 7160 }, { "epoch": 0.2272, "loss_ce": 0.5703300833702087, "loss_lvr": 0.9951385259628296, "loss_mode_switch": 0.0, "loss_total": 0.6698439121246338, "step": 568 }, { "batch_size": 4, "epoch": 0.2272, "step": 568, "tokens_per_device": 4548 }, { "epoch": 0.2272, "loss_ce": 0.07842376828193665, "loss_lvr": 0.8281299471855164, "loss_mode_switch": 0.0, "loss_total": 0.16123676300048828, "step": 568 }, { "batch_size": 1, "epoch": 0.2272, "step": 568, "tokens_per_device": 4852 }, { "epoch": 0.2272, "loss_ce": 0.3194814920425415, "loss_lvr": 0.3302730917930603, "loss_mode_switch": 0.0, "loss_total": 0.3525088131427765, "step": 568 }, { "epoch": 0.2276, "grad_norm": 1.443527102470398, "learning_rate": 9.010544331118776e-06, "loss": 0.2689, "step": 569 }, { "batch_size": 4, "epoch": 0.2276, "step": 569, "tokens_per_device": 4076 }, { "epoch": 0.2276, "loss_ce": 0.36603349447250366, "loss_lvr": 1.2922184467315674, "loss_mode_switch": 0.0, "loss_total": 0.49525535106658936, "step": 569 }, { "batch_size": 1, "epoch": 0.2276, "step": 569, "tokens_per_device": 4957 }, { "epoch": 0.2276, "loss_ce": 0.008245030418038368, "loss_lvr": 0.38617509603500366, "loss_mode_switch": 0.0, "loss_total": 0.04686254262924194, "step": 569 }, { "batch_size": 4, "epoch": 0.2276, "step": 569, "tokens_per_device": 2576 }, { "epoch": 0.2276, "loss_ce": 0.44258278608322144, "loss_lvr": 1.0252586603164673, "loss_mode_switch": 0.0, "loss_total": 0.5451086759567261, "step": 569 }, { "batch_size": 4, "epoch": 0.2276, "step": 569, "tokens_per_device": 2744 }, { "epoch": 0.2276, "loss_ce": 0.37436047196388245, "loss_lvr": 0.7518138289451599, "loss_mode_switch": 0.0, "loss_total": 0.4495418667793274, "step": 569 }, { "batch_size": 4, "epoch": 0.2276, "step": 569, "tokens_per_device": 1220 }, { "epoch": 0.2276, "loss_ce": 0.6321426630020142, "loss_lvr": 1.177194356918335, "loss_mode_switch": 0.0, "loss_total": 0.7498620748519897, "step": 569 }, { "batch_size": 4, "epoch": 0.2276, "step": 569, "tokens_per_device": 3808 }, { "epoch": 0.2276, "loss_ce": 0.4745507538318634, "loss_lvr": 0.9513328671455383, "loss_mode_switch": 0.0, "loss_total": 0.5696840286254883, "step": 569 }, { "batch_size": 4, "epoch": 0.2276, "step": 569, "tokens_per_device": 2668 }, { "epoch": 0.2276, "loss_ce": 0.09140294790267944, "loss_lvr": 0.8474665880203247, "loss_mode_switch": 0.0, "loss_total": 0.17614960670471191, "step": 569 }, { "batch_size": 4, "epoch": 0.2276, "step": 569, "tokens_per_device": 3792 }, { "epoch": 0.2276, "loss_ce": 0.25495627522468567, "loss_lvr": 1.2084888219833374, "loss_mode_switch": 0.0, "loss_total": 0.37580516934394836, "step": 569 }, { "epoch": 0.228, "grad_norm": 1.4310418367385864, "learning_rate": 9.006672740928952e-06, "loss": 0.3397, "step": 570 }, { "batch_size": 1, "epoch": 0.228, "step": 570, "tokens_per_device": 4908 }, { "epoch": 0.228, "loss_ce": 0.6263468861579895, "loss_lvr": 1.1311757564544678, "loss_mode_switch": 0.0, "loss_total": 0.7394644618034363, "step": 570 }, { "batch_size": 4, "epoch": 0.228, "step": 570, "tokens_per_device": 3316 }, { "epoch": 0.228, "loss_ce": 0.019466139376163483, "loss_lvr": 1.028794765472412, "loss_mode_switch": 0.0, "loss_total": 0.12234561890363693, "step": 570 }, { "batch_size": 4, "epoch": 0.228, "step": 570, "tokens_per_device": 4068 }, { "epoch": 0.228, "loss_ce": 0.042589422315359116, "loss_lvr": 0.9156234860420227, "loss_mode_switch": 0.0, "loss_total": 0.13415177166461945, "step": 570 }, { "batch_size": 1, "epoch": 0.228, "step": 570, "tokens_per_device": 4889 }, { "epoch": 0.228, "loss_ce": 0.10096048563718796, "loss_lvr": 0.7252086997032166, "loss_mode_switch": 0.0, "loss_total": 0.17348136007785797, "step": 570 }, { "batch_size": 4, "epoch": 0.228, "step": 570, "tokens_per_device": 3132 }, { "epoch": 0.228, "loss_ce": 0.05651135370135307, "loss_lvr": 0.8356807231903076, "loss_mode_switch": 0.0, "loss_total": 0.14007942378520966, "step": 570 }, { "batch_size": 1, "epoch": 0.228, "step": 570, "tokens_per_device": 5111 }, { "epoch": 0.228, "loss_ce": 0.007167622447013855, "loss_lvr": 0.9749070405960083, "loss_mode_switch": 0.0, "loss_total": 0.1046583279967308, "step": 570 }, { "batch_size": 1, "epoch": 0.228, "step": 570, "tokens_per_device": 4921 }, { "epoch": 0.228, "loss_ce": 0.062386833131313324, "loss_lvr": 0.4067724645137787, "loss_mode_switch": 0.0, "loss_total": 0.10306407511234283, "step": 570 }, { "batch_size": 1, "epoch": 0.228, "step": 570, "tokens_per_device": 5164 }, { "epoch": 0.228, "loss_ce": 0.00990221370011568, "loss_lvr": 0.21060246229171753, "loss_mode_switch": 0.0, "loss_total": 0.03096245974302292, "step": 570 }, { "epoch": 0.2284, "grad_norm": 1.3434282541275024, "learning_rate": 9.002794426238009e-06, "loss": 0.2914, "step": 571 }, { "batch_size": 4, "epoch": 0.2284, "step": 571, "tokens_per_device": 3604 }, { "epoch": 0.2284, "loss_ce": 0.9659446477890015, "loss_lvr": 1.1236181259155273, "loss_mode_switch": 0.0, "loss_total": 1.0783064365386963, "step": 571 }, { "batch_size": 4, "epoch": 0.2284, "step": 571, "tokens_per_device": 10864 }, { "epoch": 0.2284, "loss_ce": 0.3269425630569458, "loss_lvr": 0.7156771421432495, "loss_mode_switch": 0.0, "loss_total": 0.39851027727127075, "step": 571 }, { "batch_size": 4, "epoch": 0.2284, "step": 571, "tokens_per_device": 4252 }, { "epoch": 0.2284, "loss_ce": 0.1531878113746643, "loss_lvr": 0.6794182062149048, "loss_mode_switch": 0.0, "loss_total": 0.2211296260356903, "step": 571 }, { "batch_size": 1, "epoch": 0.2284, "step": 571, "tokens_per_device": 5850 }, { "epoch": 0.2284, "loss_ce": 0.04904896393418312, "loss_lvr": 0.5230234861373901, "loss_mode_switch": 0.0, "loss_total": 0.1013513132929802, "step": 571 }, { "batch_size": 4, "epoch": 0.2284, "step": 571, "tokens_per_device": 5772 }, { "epoch": 0.2284, "loss_ce": 0.16466863453388214, "loss_lvr": 0.7235532402992249, "loss_mode_switch": 0.0, "loss_total": 0.2370239496231079, "step": 571 }, { "batch_size": 4, "epoch": 0.2284, "step": 571, "tokens_per_device": 10996 }, { "epoch": 0.2284, "loss_ce": 0.3547273278236389, "loss_lvr": 0.5378335118293762, "loss_mode_switch": 0.0, "loss_total": 0.408510684967041, "step": 571 }, { "batch_size": 4, "epoch": 0.2284, "step": 571, "tokens_per_device": 4944 }, { "epoch": 0.2284, "loss_ce": 0.286957710981369, "loss_lvr": 0.986704409122467, "loss_mode_switch": 0.0, "loss_total": 0.3856281638145447, "step": 571 }, { "batch_size": 4, "epoch": 0.2284, "step": 571, "tokens_per_device": 4760 }, { "epoch": 0.2284, "loss_ce": 0.05234147608280182, "loss_lvr": 0.8245705366134644, "loss_mode_switch": 0.0, "loss_total": 0.13479852676391602, "step": 571 }, { "epoch": 0.2288, "grad_norm": 1.5011094808578491, "learning_rate": 8.998909393555022e-06, "loss": 0.3253, "step": 572 }, { "batch_size": 4, "epoch": 0.2288, "step": 572, "tokens_per_device": 4220 }, { "epoch": 0.2288, "loss_ce": 0.1563045233488083, "loss_lvr": 0.8820679783821106, "loss_mode_switch": 0.0, "loss_total": 0.24451132118701935, "step": 572 }, { "batch_size": 1, "epoch": 0.2288, "step": 572, "tokens_per_device": 4899 }, { "epoch": 0.2288, "loss_ce": 0.1014091968536377, "loss_lvr": 0.5796952247619629, "loss_mode_switch": 0.0, "loss_total": 0.15937872231006622, "step": 572 }, { "batch_size": 1, "epoch": 0.2288, "step": 572, "tokens_per_device": 5205 }, { "epoch": 0.2288, "loss_ce": 0.00805187039077282, "loss_lvr": 0.31203633546829224, "loss_mode_switch": 0.0, "loss_total": 0.03925550356507301, "step": 572 }, { "batch_size": 1, "epoch": 0.2288, "step": 572, "tokens_per_device": 4742 }, { "epoch": 0.2288, "loss_ce": 0.07965614646673203, "loss_lvr": 0.5810741186141968, "loss_mode_switch": 0.0, "loss_total": 0.13776355981826782, "step": 572 }, { "batch_size": 1, "epoch": 0.2288, "step": 572, "tokens_per_device": 5064 }, { "epoch": 0.2288, "loss_ce": 0.1553330421447754, "loss_lvr": 0.7390453219413757, "loss_mode_switch": 0.0, "loss_total": 0.22923758625984192, "step": 572 }, { "batch_size": 4, "epoch": 0.2288, "step": 572, "tokens_per_device": 11700 }, { "epoch": 0.2288, "loss_ce": 0.38326582312583923, "loss_lvr": 0.7547308206558228, "loss_mode_switch": 0.0, "loss_total": 0.45873892307281494, "step": 572 }, { "batch_size": 1, "epoch": 0.2288, "step": 572, "tokens_per_device": 4854 }, { "epoch": 0.2288, "loss_ce": 0.7993269562721252, "loss_lvr": 0.42998722195625305, "loss_mode_switch": 0.0, "loss_total": 0.8423256874084473, "step": 572 }, { "batch_size": 4, "epoch": 0.2288, "step": 572, "tokens_per_device": 4500 }, { "epoch": 0.2288, "loss_ce": 0.3607538342475891, "loss_lvr": 0.763573944568634, "loss_mode_switch": 0.0, "loss_total": 0.4371112287044525, "step": 572 }, { "epoch": 0.2292, "grad_norm": 1.6446058750152588, "learning_rate": 8.995017649400341e-06, "loss": 0.3358, "step": 573 }, { "batch_size": 1, "epoch": 0.2292, "step": 573, "tokens_per_device": 5125 }, { "epoch": 0.2292, "loss_ce": 0.4363909959793091, "loss_lvr": 0.32058560848236084, "loss_mode_switch": 0.0, "loss_total": 0.46844956278800964, "step": 573 }, { "batch_size": 4, "epoch": 0.2292, "step": 573, "tokens_per_device": 7676 }, { "epoch": 0.2292, "loss_ce": 0.20069889724254608, "loss_lvr": 1.0106669664382935, "loss_mode_switch": 0.0, "loss_total": 0.3017655909061432, "step": 573 }, { "batch_size": 4, "epoch": 0.2292, "step": 573, "tokens_per_device": 4692 }, { "epoch": 0.2292, "loss_ce": 0.1862010508775711, "loss_lvr": 0.9686952233314514, "loss_mode_switch": 0.0, "loss_total": 0.28307056427001953, "step": 573 }, { "batch_size": 1, "epoch": 0.2292, "step": 573, "tokens_per_device": 4922 }, { "epoch": 0.2292, "loss_ce": 0.044478919357061386, "loss_lvr": 0.5201048851013184, "loss_mode_switch": 0.0, "loss_total": 0.09648940712213516, "step": 573 }, { "batch_size": 4, "epoch": 0.2292, "step": 573, "tokens_per_device": 7148 }, { "epoch": 0.2292, "loss_ce": 0.18847604095935822, "loss_lvr": 0.9075520634651184, "loss_mode_switch": 0.0, "loss_total": 0.2792312502861023, "step": 573 }, { "batch_size": 4, "epoch": 0.2292, "step": 573, "tokens_per_device": 13564 }, { "epoch": 0.2292, "loss_ce": 0.09339119493961334, "loss_lvr": 0.6777989268302917, "loss_mode_switch": 0.0, "loss_total": 0.1611710786819458, "step": 573 }, { "batch_size": 4, "epoch": 0.2292, "step": 573, "tokens_per_device": 8220 }, { "epoch": 0.2292, "loss_ce": 0.07800881564617157, "loss_lvr": 0.49287906289100647, "loss_mode_switch": 0.0, "loss_total": 0.12729671597480774, "step": 573 }, { "batch_size": 4, "epoch": 0.2292, "step": 573, "tokens_per_device": 4932 }, { "epoch": 0.2292, "loss_ce": 0.012135816738009453, "loss_lvr": 0.6569364070892334, "loss_mode_switch": 0.0, "loss_total": 0.07782945781946182, "step": 573 }, { "epoch": 0.2296, "grad_norm": 1.2816921472549438, "learning_rate": 8.99111920030558e-06, "loss": 0.3187, "step": 574 }, { "batch_size": 4, "epoch": 0.2296, "step": 574, "tokens_per_device": 2620 }, { "epoch": 0.2296, "loss_ce": 0.3763367235660553, "loss_lvr": 1.188081979751587, "loss_mode_switch": 0.0, "loss_total": 0.49514493346214294, "step": 574 }, { "batch_size": 1, "epoch": 0.2296, "step": 574, "tokens_per_device": 5031 }, { "epoch": 0.2296, "loss_ce": 0.030326707288622856, "loss_lvr": 0.3714592456817627, "loss_mode_switch": 0.0, "loss_total": 0.06747262924909592, "step": 574 }, { "batch_size": 4, "epoch": 0.2296, "step": 574, "tokens_per_device": 4068 }, { "epoch": 0.2296, "loss_ce": 0.17898811399936676, "loss_lvr": 1.0745375156402588, "loss_mode_switch": 0.0, "loss_total": 0.2864418625831604, "step": 574 }, { "batch_size": 4, "epoch": 0.2296, "step": 574, "tokens_per_device": 4096 }, { "epoch": 0.2296, "loss_ce": 0.3059481978416443, "loss_lvr": 0.7831444144248962, "loss_mode_switch": 0.0, "loss_total": 0.38426265120506287, "step": 574 }, { "batch_size": 4, "epoch": 0.2296, "step": 574, "tokens_per_device": 2700 }, { "epoch": 0.2296, "loss_ce": 0.18963021039962769, "loss_lvr": 0.9727507829666138, "loss_mode_switch": 0.0, "loss_total": 0.28690528869628906, "step": 574 }, { "batch_size": 4, "epoch": 0.2296, "step": 574, "tokens_per_device": 5660 }, { "epoch": 0.2296, "loss_ce": 0.4703753590583801, "loss_lvr": 0.8302914500236511, "loss_mode_switch": 0.0, "loss_total": 0.5534045100212097, "step": 574 }, { "batch_size": 4, "epoch": 0.2296, "step": 574, "tokens_per_device": 3036 }, { "epoch": 0.2296, "loss_ce": 0.428957998752594, "loss_lvr": 1.2337394952774048, "loss_mode_switch": 0.0, "loss_total": 0.5523319244384766, "step": 574 }, { "batch_size": 1, "epoch": 0.2296, "step": 574, "tokens_per_device": 5859 }, { "epoch": 0.2296, "loss_ce": 0.010180382989346981, "loss_lvr": 0.3466527462005615, "loss_mode_switch": 0.0, "loss_total": 0.04484565928578377, "step": 574 }, { "epoch": 0.23, "grad_norm": 1.3228968381881714, "learning_rate": 8.987214052813605e-06, "loss": 0.2736, "step": 575 }, { "batch_size": 4, "epoch": 0.23, "step": 575, "tokens_per_device": 1356 }, { "epoch": 0.23, "loss_ce": 0.5180255174636841, "loss_lvr": 1.333573579788208, "loss_mode_switch": 0.0, "loss_total": 0.6513828635215759, "step": 575 }, { "batch_size": 4, "epoch": 0.23, "step": 575, "tokens_per_device": 1448 }, { "epoch": 0.23, "loss_ce": 0.1824416220188141, "loss_lvr": 1.143282413482666, "loss_mode_switch": 0.0, "loss_total": 0.2967698574066162, "step": 575 }, { "batch_size": 4, "epoch": 0.23, "step": 575, "tokens_per_device": 3768 }, { "epoch": 0.23, "loss_ce": 0.4387299120426178, "loss_lvr": 0.8881211876869202, "loss_mode_switch": 0.0, "loss_total": 0.5275420546531677, "step": 575 }, { "batch_size": 4, "epoch": 0.23, "step": 575, "tokens_per_device": 2528 }, { "epoch": 0.23, "loss_ce": 0.2750549912452698, "loss_lvr": 1.0290714502334595, "loss_mode_switch": 0.0, "loss_total": 0.3779621422290802, "step": 575 }, { "batch_size": 4, "epoch": 0.23, "step": 575, "tokens_per_device": 4312 }, { "epoch": 0.23, "loss_ce": 0.5070517659187317, "loss_lvr": 0.8498039841651917, "loss_mode_switch": 0.0, "loss_total": 0.5920321941375732, "step": 575 }, { "batch_size": 1, "epoch": 0.23, "step": 575, "tokens_per_device": 4640 }, { "epoch": 0.23, "loss_ce": 0.15940897166728973, "loss_lvr": 0.5914442539215088, "loss_mode_switch": 0.0, "loss_total": 0.21855339407920837, "step": 575 }, { "batch_size": 1, "epoch": 0.23, "step": 575, "tokens_per_device": 4744 }, { "epoch": 0.23, "loss_ce": 0.03901437297463417, "loss_lvr": 0.4094468057155609, "loss_mode_switch": 0.0, "loss_total": 0.07995904982089996, "step": 575 }, { "batch_size": 4, "epoch": 0.23, "step": 575, "tokens_per_device": 4712 }, { "epoch": 0.23, "loss_ce": 0.37781664729118347, "loss_lvr": 0.8870769143104553, "loss_mode_switch": 0.0, "loss_total": 0.4665243327617645, "step": 575 }, { "epoch": 0.2304, "grad_norm": 1.4319339990615845, "learning_rate": 8.983302213478525e-06, "loss": 0.3541, "step": 576 }, { "batch_size": 4, "epoch": 0.2304, "step": 576, "tokens_per_device": 4152 }, { "epoch": 0.2304, "loss_ce": 0.3917449414730072, "loss_lvr": 0.9223522543907166, "loss_mode_switch": 0.0, "loss_total": 0.4839801788330078, "step": 576 }, { "batch_size": 1, "epoch": 0.2304, "step": 576, "tokens_per_device": 4880 }, { "epoch": 0.2304, "loss_ce": 0.015750328078866005, "loss_lvr": 0.21372604370117188, "loss_mode_switch": 0.0, "loss_total": 0.0371229350566864, "step": 576 }, { "batch_size": 4, "epoch": 0.2304, "step": 576, "tokens_per_device": 12548 }, { "epoch": 0.2304, "loss_ce": 0.029147787019610405, "loss_lvr": 0.679320216178894, "loss_mode_switch": 0.0, "loss_total": 0.0970798134803772, "step": 576 }, { "batch_size": 4, "epoch": 0.2304, "step": 576, "tokens_per_device": 2720 }, { "epoch": 0.2304, "loss_ce": 0.49427077174186707, "loss_lvr": 0.9007265567779541, "loss_mode_switch": 0.0, "loss_total": 0.584343433380127, "step": 576 }, { "batch_size": 4, "epoch": 0.2304, "step": 576, "tokens_per_device": 1384 }, { "epoch": 0.2304, "loss_ce": 0.6974558234214783, "loss_lvr": 2.1492056846618652, "loss_mode_switch": 0.0, "loss_total": 0.9123764038085938, "step": 576 }, { "batch_size": 1, "epoch": 0.2304, "step": 576, "tokens_per_device": 4860 }, { "epoch": 0.2304, "loss_ce": 0.22543680667877197, "loss_lvr": 0.6234894394874573, "loss_mode_switch": 0.0, "loss_total": 0.28778573870658875, "step": 576 }, { "batch_size": 4, "epoch": 0.2304, "step": 576, "tokens_per_device": 2552 }, { "epoch": 0.2304, "loss_ce": 0.5761927962303162, "loss_lvr": 1.0706342458724976, "loss_mode_switch": 0.0, "loss_total": 0.683256208896637, "step": 576 }, { "batch_size": 4, "epoch": 0.2304, "step": 576, "tokens_per_device": 1224 }, { "epoch": 0.2304, "loss_ce": 0.5362778306007385, "loss_lvr": 1.4150279760360718, "loss_mode_switch": 0.0, "loss_total": 0.6777806282043457, "step": 576 }, { "epoch": 0.2308, "grad_norm": 1.5263826847076416, "learning_rate": 8.97938368886568e-06, "loss": 0.3329, "step": 577 }, { "batch_size": 4, "epoch": 0.2308, "step": 577, "tokens_per_device": 4404 }, { "epoch": 0.2308, "loss_ce": 0.18064256012439728, "loss_lvr": 0.8892738819122314, "loss_mode_switch": 0.0, "loss_total": 0.26956993341445923, "step": 577 }, { "batch_size": 1, "epoch": 0.2308, "step": 577, "tokens_per_device": 5081 }, { "epoch": 0.2308, "loss_ce": 0.3898153603076935, "loss_lvr": 0.7714951038360596, "loss_mode_switch": 0.0, "loss_total": 0.46696487069129944, "step": 577 }, { "batch_size": 4, "epoch": 0.2308, "step": 577, "tokens_per_device": 1248 }, { "epoch": 0.2308, "loss_ce": 0.6109781265258789, "loss_lvr": 1.1906133890151978, "loss_mode_switch": 0.0, "loss_total": 0.7300394773483276, "step": 577 }, { "batch_size": 1, "epoch": 0.2308, "step": 577, "tokens_per_device": 6050 }, { "epoch": 0.2308, "loss_ce": 0.0011081418488174677, "loss_lvr": 0.5706850290298462, "loss_mode_switch": 0.0, "loss_total": 0.05817664787173271, "step": 577 }, { "batch_size": 1, "epoch": 0.2308, "step": 577, "tokens_per_device": 5042 }, { "epoch": 0.2308, "loss_ce": 0.061668604612350464, "loss_lvr": 2.425708532333374, "loss_mode_switch": 0.0, "loss_total": 0.3042394518852234, "step": 577 }, { "batch_size": 4, "epoch": 0.2308, "step": 577, "tokens_per_device": 4752 }, { "epoch": 0.2308, "loss_ce": 0.009561145678162575, "loss_lvr": 1.0655924081802368, "loss_mode_switch": 0.0, "loss_total": 0.11612038314342499, "step": 577 }, { "batch_size": 4, "epoch": 0.2308, "step": 577, "tokens_per_device": 2700 }, { "epoch": 0.2308, "loss_ce": 0.2268206775188446, "loss_lvr": 0.9049139022827148, "loss_mode_switch": 0.0, "loss_total": 0.3173120617866516, "step": 577 }, { "batch_size": 4, "epoch": 0.2308, "step": 577, "tokens_per_device": 1320 }, { "epoch": 0.2308, "loss_ce": 0.2841550409793854, "loss_lvr": 1.1572061777114868, "loss_mode_switch": 0.0, "loss_total": 0.399875670671463, "step": 577 }, { "epoch": 0.2312, "grad_norm": 1.462526798248291, "learning_rate": 8.97545848555163e-06, "loss": 0.2747, "step": 578 }, { "batch_size": 1, "epoch": 0.2312, "step": 578, "tokens_per_device": 5478 }, { "epoch": 0.2312, "loss_ce": 0.007267565932124853, "loss_lvr": 0.378897100687027, "loss_mode_switch": 0.0, "loss_total": 0.04515727609395981, "step": 578 }, { "batch_size": 4, "epoch": 0.2312, "step": 578, "tokens_per_device": 5208 }, { "epoch": 0.2312, "loss_ce": 0.13491590321063995, "loss_lvr": 0.6722815036773682, "loss_mode_switch": 0.0, "loss_total": 0.202144056558609, "step": 578 }, { "batch_size": 4, "epoch": 0.2312, "step": 578, "tokens_per_device": 4844 }, { "epoch": 0.2312, "loss_ce": 0.030816521495580673, "loss_lvr": 0.9118322134017944, "loss_mode_switch": 0.0, "loss_total": 0.12199974060058594, "step": 578 }, { "batch_size": 1, "epoch": 0.2312, "step": 578, "tokens_per_device": 4881 }, { "epoch": 0.2312, "loss_ce": 0.02805936150252819, "loss_lvr": 0.21094632148742676, "loss_mode_switch": 0.0, "loss_total": 0.049153994768857956, "step": 578 }, { "batch_size": 4, "epoch": 0.2312, "step": 578, "tokens_per_device": 4632 }, { "epoch": 0.2312, "loss_ce": 0.19388118386268616, "loss_lvr": 0.8457408547401428, "loss_mode_switch": 0.0, "loss_total": 0.2784552574157715, "step": 578 }, { "batch_size": 4, "epoch": 0.2312, "step": 578, "tokens_per_device": 9380 }, { "epoch": 0.2312, "loss_ce": 0.3520090579986572, "loss_lvr": 1.3832937479019165, "loss_mode_switch": 0.0, "loss_total": 0.49033844470977783, "step": 578 }, { "batch_size": 1, "epoch": 0.2312, "step": 578, "tokens_per_device": 5114 }, { "epoch": 0.2312, "loss_ce": 0.008541745133697987, "loss_lvr": 0.570461630821228, "loss_mode_switch": 0.0, "loss_total": 0.06558790802955627, "step": 578 }, { "batch_size": 1, "epoch": 0.2312, "step": 578, "tokens_per_device": 4887 }, { "epoch": 0.2312, "loss_ce": 0.003441506763920188, "loss_lvr": 0.34387755393981934, "loss_mode_switch": 0.0, "loss_total": 0.037829261273145676, "step": 578 }, { "epoch": 0.2316, "grad_norm": 1.5256010293960571, "learning_rate": 8.971526610124142e-06, "loss": 0.3133, "step": 579 }, { "batch_size": 1, "epoch": 0.2316, "step": 579, "tokens_per_device": 5019 }, { "epoch": 0.2316, "loss_ce": 0.012267387472093105, "loss_lvr": 0.35744526982307434, "loss_mode_switch": 0.0, "loss_total": 0.04801191762089729, "step": 579 }, { "batch_size": 4, "epoch": 0.2316, "step": 579, "tokens_per_device": 3736 }, { "epoch": 0.2316, "loss_ce": 0.5656194686889648, "loss_lvr": 0.9648900032043457, "loss_mode_switch": 0.0, "loss_total": 0.6621084809303284, "step": 579 }, { "batch_size": 4, "epoch": 0.2316, "step": 579, "tokens_per_device": 5688 }, { "epoch": 0.2316, "loss_ce": 0.03390985354781151, "loss_lvr": 1.0112828016281128, "loss_mode_switch": 0.0, "loss_total": 0.13503813743591309, "step": 579 }, { "batch_size": 1, "epoch": 0.2316, "step": 579, "tokens_per_device": 4709 }, { "epoch": 0.2316, "loss_ce": 0.03619847446680069, "loss_lvr": 0.4336649775505066, "loss_mode_switch": 0.0, "loss_total": 0.07956497371196747, "step": 579 }, { "batch_size": 4, "epoch": 0.2316, "step": 579, "tokens_per_device": 3896 }, { "epoch": 0.2316, "loss_ce": 0.11856500804424286, "loss_lvr": 0.7016058564186096, "loss_mode_switch": 0.0, "loss_total": 0.18872559070587158, "step": 579 }, { "batch_size": 1, "epoch": 0.2316, "step": 579, "tokens_per_device": 4909 }, { "epoch": 0.2316, "loss_ce": 0.0047692712396383286, "loss_lvr": 0.43464750051498413, "loss_mode_switch": 0.0, "loss_total": 0.04823402315378189, "step": 579 }, { "batch_size": 4, "epoch": 0.2316, "step": 579, "tokens_per_device": 3900 }, { "epoch": 0.2316, "loss_ce": 0.5739376544952393, "loss_lvr": 1.0701508522033691, "loss_mode_switch": 0.0, "loss_total": 0.6809527277946472, "step": 579 }, { "batch_size": 4, "epoch": 0.2316, "step": 579, "tokens_per_device": 3808 }, { "epoch": 0.2316, "loss_ce": 0.024054812267422676, "loss_lvr": 1.6093688011169434, "loss_mode_switch": 0.0, "loss_total": 0.18499168753623962, "step": 579 }, { "epoch": 0.232, "grad_norm": 1.4702544212341309, "learning_rate": 8.967588069182184e-06, "loss": 0.3296, "step": 580 }, { "batch_size": 4, "epoch": 0.232, "step": 580, "tokens_per_device": 5048 }, { "epoch": 0.232, "loss_ce": 0.2224290519952774, "loss_lvr": 0.6930006742477417, "loss_mode_switch": 0.0, "loss_total": 0.2917291224002838, "step": 580 }, { "batch_size": 4, "epoch": 0.232, "step": 580, "tokens_per_device": 4996 }, { "epoch": 0.232, "loss_ce": 0.3602737486362457, "loss_lvr": 1.1309839487075806, "loss_mode_switch": 0.0, "loss_total": 0.4733721613883972, "step": 580 }, { "batch_size": 1, "epoch": 0.232, "step": 580, "tokens_per_device": 5272 }, { "epoch": 0.232, "loss_ce": 0.7988593578338623, "loss_lvr": 0.7430428862571716, "loss_mode_switch": 0.0, "loss_total": 0.873163640499115, "step": 580 }, { "batch_size": 4, "epoch": 0.232, "step": 580, "tokens_per_device": 1304 }, { "epoch": 0.232, "loss_ce": 0.03331413492560387, "loss_lvr": 1.0042414665222168, "loss_mode_switch": 0.0, "loss_total": 0.13373827934265137, "step": 580 }, { "batch_size": 4, "epoch": 0.232, "step": 580, "tokens_per_device": 3296 }, { "epoch": 0.232, "loss_ce": 0.3560716509819031, "loss_lvr": 0.6548117399215698, "loss_mode_switch": 0.0, "loss_total": 0.421552836894989, "step": 580 }, { "batch_size": 1, "epoch": 0.232, "step": 580, "tokens_per_device": 6656 }, { "epoch": 0.232, "loss_ce": 0.0028760586865246296, "loss_lvr": 0.3308461606502533, "loss_mode_switch": 0.0, "loss_total": 0.03596067428588867, "step": 580 }, { "batch_size": 4, "epoch": 0.232, "step": 580, "tokens_per_device": 5752 }, { "epoch": 0.232, "loss_ce": 0.015350214205682278, "loss_lvr": 0.909458577632904, "loss_mode_switch": 0.0, "loss_total": 0.10629607737064362, "step": 580 }, { "batch_size": 4, "epoch": 0.232, "step": 580, "tokens_per_device": 14380 }, { "epoch": 0.232, "loss_ce": 0.28121280670166016, "loss_lvr": 1.2702722549438477, "loss_mode_switch": 0.0, "loss_total": 0.40824002027511597, "step": 580 }, { "epoch": 0.2324, "grad_norm": 1.656965970993042, "learning_rate": 8.963642869335913e-06, "loss": 0.3118, "step": 581 }, { "batch_size": 4, "epoch": 0.2324, "step": 581, "tokens_per_device": 4472 }, { "epoch": 0.2324, "loss_ce": 0.24750985205173492, "loss_lvr": 1.098413348197937, "loss_mode_switch": 0.0, "loss_total": 0.3573511838912964, "step": 581 }, { "batch_size": 4, "epoch": 0.2324, "step": 581, "tokens_per_device": 3724 }, { "epoch": 0.2324, "loss_ce": 0.187484472990036, "loss_lvr": 1.0518196821212769, "loss_mode_switch": 0.0, "loss_total": 0.2926664352416992, "step": 581 }, { "batch_size": 4, "epoch": 0.2324, "step": 581, "tokens_per_device": 6100 }, { "epoch": 0.2324, "loss_ce": 0.0834629088640213, "loss_lvr": 0.5004618167877197, "loss_mode_switch": 0.0, "loss_total": 0.13350909948349, "step": 581 }, { "batch_size": 4, "epoch": 0.2324, "step": 581, "tokens_per_device": 4224 }, { "epoch": 0.2324, "loss_ce": 0.030001871287822723, "loss_lvr": 0.939030110836029, "loss_mode_switch": 0.0, "loss_total": 0.12390488386154175, "step": 581 }, { "batch_size": 4, "epoch": 0.2324, "step": 581, "tokens_per_device": 4292 }, { "epoch": 0.2324, "loss_ce": 0.584116518497467, "loss_lvr": 0.9290614128112793, "loss_mode_switch": 0.0, "loss_total": 0.6770226359367371, "step": 581 }, { "batch_size": 4, "epoch": 0.2324, "step": 581, "tokens_per_device": 4244 }, { "epoch": 0.2324, "loss_ce": 0.21550270915031433, "loss_lvr": 0.9852581024169922, "loss_mode_switch": 0.0, "loss_total": 0.3140285313129425, "step": 581 }, { "batch_size": 1, "epoch": 0.2324, "step": 581, "tokens_per_device": 5137 }, { "epoch": 0.2324, "loss_ce": 0.004363093990832567, "loss_lvr": 0.5168230533599854, "loss_mode_switch": 0.0, "loss_total": 0.056045398116111755, "step": 581 }, { "batch_size": 4, "epoch": 0.2324, "step": 581, "tokens_per_device": 3840 }, { "epoch": 0.2324, "loss_ce": 0.43691086769104004, "loss_lvr": 0.9408621191978455, "loss_mode_switch": 0.0, "loss_total": 0.530997097492218, "step": 581 }, { "epoch": 0.2328, "grad_norm": 1.349892497062683, "learning_rate": 8.959691017206653e-06, "loss": 0.3247, "step": 582 }, { "batch_size": 4, "epoch": 0.2328, "step": 582, "tokens_per_device": 5892 }, { "epoch": 0.2328, "loss_ce": 0.18465398252010345, "loss_lvr": 0.8570841550827026, "loss_mode_switch": 0.0, "loss_total": 0.27036240696907043, "step": 582 }, { "batch_size": 1, "epoch": 0.2328, "step": 582, "tokens_per_device": 4895 }, { "epoch": 0.2328, "loss_ce": 0.007382906042039394, "loss_lvr": 0.9357045888900757, "loss_mode_switch": 0.0, "loss_total": 0.1009533703327179, "step": 582 }, { "batch_size": 4, "epoch": 0.2328, "step": 582, "tokens_per_device": 4640 }, { "epoch": 0.2328, "loss_ce": 0.016593987122178078, "loss_lvr": 0.8639009594917297, "loss_mode_switch": 0.0, "loss_total": 0.10298408567905426, "step": 582 }, { "batch_size": 1, "epoch": 0.2328, "step": 582, "tokens_per_device": 5092 }, { "epoch": 0.2328, "loss_ce": 0.24526210129261017, "loss_lvr": 0.4331819415092468, "loss_mode_switch": 0.0, "loss_total": 0.2885802984237671, "step": 582 }, { "batch_size": 4, "epoch": 0.2328, "step": 582, "tokens_per_device": 4716 }, { "epoch": 0.2328, "loss_ce": 0.060601986944675446, "loss_lvr": 0.9243767261505127, "loss_mode_switch": 0.0, "loss_total": 0.15303966403007507, "step": 582 }, { "batch_size": 1, "epoch": 0.2328, "step": 582, "tokens_per_device": 4855 }, { "epoch": 0.2328, "loss_ce": 0.013133753091096878, "loss_lvr": 0.8383700847625732, "loss_mode_switch": 0.0, "loss_total": 0.09697076678276062, "step": 582 }, { "batch_size": 4, "epoch": 0.2328, "step": 582, "tokens_per_device": 1904 }, { "epoch": 0.2328, "loss_ce": 0.2890707850456238, "loss_lvr": 0.9226371049880981, "loss_mode_switch": 0.0, "loss_total": 0.38133448362350464, "step": 582 }, { "batch_size": 4, "epoch": 0.2328, "step": 582, "tokens_per_device": 2860 }, { "epoch": 0.2328, "loss_ce": 0.3594045042991638, "loss_lvr": 0.9956204295158386, "loss_mode_switch": 0.0, "loss_total": 0.45896655321121216, "step": 582 }, { "epoch": 0.2332, "grad_norm": 1.6768804788589478, "learning_rate": 8.955732519426902e-06, "loss": 0.3586, "step": 583 }, { "batch_size": 4, "epoch": 0.2332, "step": 583, "tokens_per_device": 4260 }, { "epoch": 0.2332, "loss_ce": 0.26488807797431946, "loss_lvr": 1.1294912099838257, "loss_mode_switch": 0.0, "loss_total": 0.377837210893631, "step": 583 }, { "batch_size": 4, "epoch": 0.2332, "step": 583, "tokens_per_device": 4248 }, { "epoch": 0.2332, "loss_ce": 0.3029190003871918, "loss_lvr": 1.1852799654006958, "loss_mode_switch": 0.0, "loss_total": 0.4214470088481903, "step": 583 }, { "batch_size": 1, "epoch": 0.2332, "step": 583, "tokens_per_device": 4876 }, { "epoch": 0.2332, "loss_ce": 0.4314083755016327, "loss_lvr": 0.45897993445396423, "loss_mode_switch": 0.0, "loss_total": 0.4773063659667969, "step": 583 }, { "batch_size": 4, "epoch": 0.2332, "step": 583, "tokens_per_device": 12864 }, { "epoch": 0.2332, "loss_ce": 0.7730231881141663, "loss_lvr": 0.6892603039741516, "loss_mode_switch": 0.0, "loss_total": 0.8419492244720459, "step": 583 }, { "batch_size": 4, "epoch": 0.2332, "step": 583, "tokens_per_device": 6912 }, { "epoch": 0.2332, "loss_ce": 0.035707633942365646, "loss_lvr": 0.9308732151985168, "loss_mode_switch": 0.0, "loss_total": 0.12879495322704315, "step": 583 }, { "batch_size": 4, "epoch": 0.2332, "step": 583, "tokens_per_device": 5116 }, { "epoch": 0.2332, "loss_ce": 0.16832607984542847, "loss_lvr": 0.8588955402374268, "loss_mode_switch": 0.0, "loss_total": 0.25421562790870667, "step": 583 }, { "batch_size": 4, "epoch": 0.2332, "step": 583, "tokens_per_device": 3884 }, { "epoch": 0.2332, "loss_ce": 0.37629127502441406, "loss_lvr": 0.9745602607727051, "loss_mode_switch": 0.0, "loss_total": 0.4737473130226135, "step": 583 }, { "batch_size": 4, "epoch": 0.2332, "step": 583, "tokens_per_device": 4304 }, { "epoch": 0.2332, "loss_ce": 0.1922404021024704, "loss_lvr": 0.5957506895065308, "loss_mode_switch": 0.0, "loss_total": 0.25181546807289124, "step": 583 }, { "epoch": 0.2336, "grad_norm": 2.0720393657684326, "learning_rate": 8.951767382640308e-06, "loss": 0.3184, "step": 584 }, { "batch_size": 1, "epoch": 0.2336, "step": 584, "tokens_per_device": 4968 }, { "epoch": 0.2336, "loss_ce": 0.001247929991222918, "loss_lvr": 0.18664386868476868, "loss_mode_switch": 0.0, "loss_total": 0.019912317395210266, "step": 584 }, { "batch_size": 4, "epoch": 0.2336, "step": 584, "tokens_per_device": 7280 }, { "epoch": 0.2336, "loss_ce": 0.048029422760009766, "loss_lvr": 0.9194840788841248, "loss_mode_switch": 0.0, "loss_total": 0.1399778425693512, "step": 584 }, { "batch_size": 4, "epoch": 0.2336, "step": 584, "tokens_per_device": 4212 }, { "epoch": 0.2336, "loss_ce": 0.09262435138225555, "loss_lvr": 0.9801672101020813, "loss_mode_switch": 0.0, "loss_total": 0.19064107537269592, "step": 584 }, { "batch_size": 1, "epoch": 0.2336, "step": 584, "tokens_per_device": 4975 }, { "epoch": 0.2336, "loss_ce": 0.21709096431732178, "loss_lvr": 0.6252760887145996, "loss_mode_switch": 0.0, "loss_total": 0.2796185612678528, "step": 584 }, { "batch_size": 1, "epoch": 0.2336, "step": 584, "tokens_per_device": 4964 }, { "epoch": 0.2336, "loss_ce": 0.36299875378608704, "loss_lvr": 0.2057039439678192, "loss_mode_switch": 0.0, "loss_total": 0.3835691511631012, "step": 584 }, { "batch_size": 4, "epoch": 0.2336, "step": 584, "tokens_per_device": 3748 }, { "epoch": 0.2336, "loss_ce": 0.8559873700141907, "loss_lvr": 1.132795810699463, "loss_mode_switch": 0.0, "loss_total": 0.969266951084137, "step": 584 }, { "batch_size": 1, "epoch": 0.2336, "step": 584, "tokens_per_device": 6375 }, { "epoch": 0.2336, "loss_ce": 0.005576220341026783, "loss_lvr": 0.5934984087944031, "loss_mode_switch": 0.0, "loss_total": 0.06492606550455093, "step": 584 }, { "batch_size": 4, "epoch": 0.2336, "step": 584, "tokens_per_device": 4376 }, { "epoch": 0.2336, "loss_ce": 0.2971685230731964, "loss_lvr": 0.7713801264762878, "loss_mode_switch": 0.0, "loss_total": 0.3743065297603607, "step": 584 }, { "epoch": 0.234, "grad_norm": 1.4576032161712646, "learning_rate": 8.947795613501658e-06, "loss": 0.3234, "step": 585 }, { "batch_size": 4, "epoch": 0.234, "step": 585, "tokens_per_device": 3764 }, { "epoch": 0.234, "loss_ce": 0.361467182636261, "loss_lvr": 0.9740546941757202, "loss_mode_switch": 0.0, "loss_total": 0.45887264609336853, "step": 585 }, { "batch_size": 1, "epoch": 0.234, "step": 585, "tokens_per_device": 4878 }, { "epoch": 0.234, "loss_ce": 0.007901455275714397, "loss_lvr": 0.5959833264350891, "loss_mode_switch": 0.0, "loss_total": 0.0674997866153717, "step": 585 }, { "batch_size": 4, "epoch": 0.234, "step": 585, "tokens_per_device": 3748 }, { "epoch": 0.234, "loss_ce": 0.1438254714012146, "loss_lvr": 0.9202763438224792, "loss_mode_switch": 0.0, "loss_total": 0.23585310578346252, "step": 585 }, { "batch_size": 4, "epoch": 0.234, "step": 585, "tokens_per_device": 2644 }, { "epoch": 0.234, "loss_ce": 0.4232688546180725, "loss_lvr": 1.1027345657348633, "loss_mode_switch": 0.0, "loss_total": 0.5335423350334167, "step": 585 }, { "batch_size": 4, "epoch": 0.234, "step": 585, "tokens_per_device": 4584 }, { "epoch": 0.234, "loss_ce": 0.44251319766044617, "loss_lvr": 0.8605279922485352, "loss_mode_switch": 0.0, "loss_total": 0.5285660028457642, "step": 585 }, { "batch_size": 1, "epoch": 0.234, "step": 585, "tokens_per_device": 5093 }, { "epoch": 0.234, "loss_ce": 0.0028649778105318546, "loss_lvr": 0.5035159587860107, "loss_mode_switch": 0.0, "loss_total": 0.05321657657623291, "step": 585 }, { "batch_size": 4, "epoch": 0.234, "step": 585, "tokens_per_device": 3920 }, { "epoch": 0.234, "loss_ce": 0.5751256942749023, "loss_lvr": 1.0337326526641846, "loss_mode_switch": 0.0, "loss_total": 0.6784989833831787, "step": 585 }, { "batch_size": 4, "epoch": 0.234, "step": 585, "tokens_per_device": 2748 }, { "epoch": 0.234, "loss_ce": 0.5698716044425964, "loss_lvr": 0.9872668385505676, "loss_mode_switch": 0.0, "loss_total": 0.6685982942581177, "step": 585 }, { "epoch": 0.2344, "grad_norm": 1.5773000717163086, "learning_rate": 8.943817218676877e-06, "loss": 0.3589, "step": 586 }, { "batch_size": 4, "epoch": 0.2344, "step": 586, "tokens_per_device": 4536 }, { "epoch": 0.2344, "loss_ce": 0.1573234349489212, "loss_lvr": 0.8153903484344482, "loss_mode_switch": 0.0, "loss_total": 0.23886246979236603, "step": 586 }, { "batch_size": 1, "epoch": 0.2344, "step": 586, "tokens_per_device": 4767 }, { "epoch": 0.2344, "loss_ce": 0.45444685220718384, "loss_lvr": 0.37530094385147095, "loss_mode_switch": 0.0, "loss_total": 0.49197694659233093, "step": 586 }, { "batch_size": 4, "epoch": 0.2344, "step": 586, "tokens_per_device": 3828 }, { "epoch": 0.2344, "loss_ce": 0.43706125020980835, "loss_lvr": 1.100472092628479, "loss_mode_switch": 0.0, "loss_total": 0.5471084713935852, "step": 586 }, { "batch_size": 1, "epoch": 0.2344, "step": 586, "tokens_per_device": 5105 }, { "epoch": 0.2344, "loss_ce": 0.013358604162931442, "loss_lvr": 0.6442851424217224, "loss_mode_switch": 0.0, "loss_total": 0.0777871161699295, "step": 586 }, { "batch_size": 4, "epoch": 0.2344, "step": 586, "tokens_per_device": 3508 }, { "epoch": 0.2344, "loss_ce": 0.13900157809257507, "loss_lvr": 1.1542809009552002, "loss_mode_switch": 0.0, "loss_total": 0.2544296681880951, "step": 586 }, { "batch_size": 4, "epoch": 0.2344, "step": 586, "tokens_per_device": 7400 }, { "epoch": 0.2344, "loss_ce": 0.38023391366004944, "loss_lvr": 1.0260463953018188, "loss_mode_switch": 0.0, "loss_total": 0.48283857107162476, "step": 586 }, { "batch_size": 4, "epoch": 0.2344, "step": 586, "tokens_per_device": 1632 }, { "epoch": 0.2344, "loss_ce": 0.3879401981830597, "loss_lvr": 1.0750895738601685, "loss_mode_switch": 0.0, "loss_total": 0.49544915556907654, "step": 586 }, { "batch_size": 1, "epoch": 0.2344, "step": 586, "tokens_per_device": 4878 }, { "epoch": 0.2344, "loss_ce": 0.009280764497816563, "loss_lvr": 0.44714200496673584, "loss_mode_switch": 0.0, "loss_total": 0.05399496480822563, "step": 586 }, { "epoch": 0.2348, "grad_norm": 1.5490448474884033, "learning_rate": 8.939832204843003e-06, "loss": 0.3176, "step": 587 }, { "batch_size": 4, "epoch": 0.2348, "step": 587, "tokens_per_device": 5472 }, { "epoch": 0.2348, "loss_ce": 0.5098745226860046, "loss_lvr": 0.7768874168395996, "loss_mode_switch": 0.0, "loss_total": 0.5875632762908936, "step": 587 }, { "batch_size": 4, "epoch": 0.2348, "step": 587, "tokens_per_device": 4212 }, { "epoch": 0.2348, "loss_ce": 0.03811895102262497, "loss_lvr": 1.2454214096069336, "loss_mode_switch": 0.0, "loss_total": 0.1626610904932022, "step": 587 }, { "batch_size": 4, "epoch": 0.2348, "step": 587, "tokens_per_device": 5904 }, { "epoch": 0.2348, "loss_ce": 0.045180805027484894, "loss_lvr": 1.0007860660552979, "loss_mode_switch": 0.0, "loss_total": 0.14525941014289856, "step": 587 }, { "batch_size": 4, "epoch": 0.2348, "step": 587, "tokens_per_device": 2644 }, { "epoch": 0.2348, "loss_ce": 0.155388742685318, "loss_lvr": 0.7905688285827637, "loss_mode_switch": 0.0, "loss_total": 0.23444563150405884, "step": 587 }, { "batch_size": 1, "epoch": 0.2348, "step": 587, "tokens_per_device": 5342 }, { "epoch": 0.2348, "loss_ce": 0.16106057167053223, "loss_lvr": 0.5184722542762756, "loss_mode_switch": 0.0, "loss_total": 0.2129077911376953, "step": 587 }, { "batch_size": 4, "epoch": 0.2348, "step": 587, "tokens_per_device": 6024 }, { "epoch": 0.2348, "loss_ce": 0.1592877060174942, "loss_lvr": 0.55788254737854, "loss_mode_switch": 0.0, "loss_total": 0.21507596969604492, "step": 587 }, { "batch_size": 4, "epoch": 0.2348, "step": 587, "tokens_per_device": 1432 }, { "epoch": 0.2348, "loss_ce": 0.39356517791748047, "loss_lvr": 0.9117134809494019, "loss_mode_switch": 0.0, "loss_total": 0.48473653197288513, "step": 587 }, { "batch_size": 1, "epoch": 0.2348, "step": 587, "tokens_per_device": 5155 }, { "epoch": 0.2348, "loss_ce": 0.0690881758928299, "loss_lvr": 0.2827478051185608, "loss_mode_switch": 0.0, "loss_total": 0.0973629578948021, "step": 587 }, { "epoch": 0.2352, "grad_norm": 1.405134677886963, "learning_rate": 8.935840578688191e-06, "loss": 0.3139, "step": 588 }, { "batch_size": 4, "epoch": 0.2352, "step": 588, "tokens_per_device": 4840 }, { "epoch": 0.2352, "loss_ce": 0.0703168511390686, "loss_lvr": 0.8230687379837036, "loss_mode_switch": 0.0, "loss_total": 0.1526237279176712, "step": 588 }, { "batch_size": 1, "epoch": 0.2352, "step": 588, "tokens_per_device": 5246 }, { "epoch": 0.2352, "loss_ce": 0.058385614305734634, "loss_lvr": 1.0890754461288452, "loss_mode_switch": 0.0, "loss_total": 0.16729316115379333, "step": 588 }, { "batch_size": 4, "epoch": 0.2352, "step": 588, "tokens_per_device": 5860 }, { "epoch": 0.2352, "loss_ce": 0.51032954454422, "loss_lvr": 0.9195811748504639, "loss_mode_switch": 0.0, "loss_total": 0.6022876501083374, "step": 588 }, { "batch_size": 4, "epoch": 0.2352, "step": 588, "tokens_per_device": 13548 }, { "epoch": 0.2352, "loss_ce": 0.3126414716243744, "loss_lvr": 0.8662745356559753, "loss_mode_switch": 0.0, "loss_total": 0.3992689251899719, "step": 588 }, { "batch_size": 4, "epoch": 0.2352, "step": 588, "tokens_per_device": 3956 }, { "epoch": 0.2352, "loss_ce": 0.3321983516216278, "loss_lvr": 0.7962905764579773, "loss_mode_switch": 0.0, "loss_total": 0.41182741522789, "step": 588 }, { "batch_size": 4, "epoch": 0.2352, "step": 588, "tokens_per_device": 4272 }, { "epoch": 0.2352, "loss_ce": 0.09876758605241776, "loss_lvr": 0.8026126027107239, "loss_mode_switch": 0.0, "loss_total": 0.17902883887290955, "step": 588 }, { "batch_size": 1, "epoch": 0.2352, "step": 588, "tokens_per_device": 4900 }, { "epoch": 0.2352, "loss_ce": 0.12712103128433228, "loss_lvr": 0.4594763219356537, "loss_mode_switch": 0.0, "loss_total": 0.17306867241859436, "step": 588 }, { "batch_size": 1, "epoch": 0.2352, "step": 588, "tokens_per_device": 4886 }, { "epoch": 0.2352, "loss_ce": 0.00588953634724021, "loss_lvr": 0.5025117993354797, "loss_mode_switch": 0.0, "loss_total": 0.0561407171189785, "step": 588 }, { "epoch": 0.2356, "grad_norm": 1.4600424766540527, "learning_rate": 8.931842346911688e-06, "loss": 0.3319, "step": 589 }, { "batch_size": 4, "epoch": 0.2356, "step": 589, "tokens_per_device": 4328 }, { "epoch": 0.2356, "loss_ce": 0.21215011179447174, "loss_lvr": 1.023352026939392, "loss_mode_switch": 0.0, "loss_total": 0.3144853115081787, "step": 589 }, { "batch_size": 1, "epoch": 0.2356, "step": 589, "tokens_per_device": 5166 }, { "epoch": 0.2356, "loss_ce": 0.3781673312187195, "loss_lvr": 0.3362841308116913, "loss_mode_switch": 0.0, "loss_total": 0.4117957353591919, "step": 589 }, { "batch_size": 4, "epoch": 0.2356, "step": 589, "tokens_per_device": 4216 }, { "epoch": 0.2356, "loss_ce": 0.044392094016075134, "loss_lvr": 1.0183279514312744, "loss_mode_switch": 0.0, "loss_total": 0.14622488617897034, "step": 589 }, { "batch_size": 4, "epoch": 0.2356, "step": 589, "tokens_per_device": 5740 }, { "epoch": 0.2356, "loss_ce": 0.058506984263658524, "loss_lvr": 0.7830981016159058, "loss_mode_switch": 0.0, "loss_total": 0.13681679964065552, "step": 589 }, { "batch_size": 1, "epoch": 0.2356, "step": 589, "tokens_per_device": 5098 }, { "epoch": 0.2356, "loss_ce": 0.0041820877231657505, "loss_lvr": 0.4643312692642212, "loss_mode_switch": 0.0, "loss_total": 0.05061521753668785, "step": 589 }, { "batch_size": 1, "epoch": 0.2356, "step": 589, "tokens_per_device": 5105 }, { "epoch": 0.2356, "loss_ce": 0.03947416692972183, "loss_lvr": 0.8735374212265015, "loss_mode_switch": 0.0, "loss_total": 0.1268279105424881, "step": 589 }, { "batch_size": 1, "epoch": 0.2356, "step": 589, "tokens_per_device": 4895 }, { "epoch": 0.2356, "loss_ce": 0.1907082498073578, "loss_lvr": 0.9436264038085938, "loss_mode_switch": 0.0, "loss_total": 0.28507089614868164, "step": 589 }, { "batch_size": 4, "epoch": 0.2356, "step": 589, "tokens_per_device": 3852 }, { "epoch": 0.2356, "loss_ce": 0.32172009348869324, "loss_lvr": 0.7214856743812561, "loss_mode_switch": 0.0, "loss_total": 0.39386865496635437, "step": 589 }, { "epoch": 0.236, "grad_norm": 1.4250799417495728, "learning_rate": 8.927837516223824e-06, "loss": 0.2516, "step": 590 }, { "batch_size": 4, "epoch": 0.236, "step": 590, "tokens_per_device": 3840 }, { "epoch": 0.236, "loss_ce": 0.4099940061569214, "loss_lvr": 1.242351770401001, "loss_mode_switch": 0.0, "loss_total": 0.5342291593551636, "step": 590 }, { "batch_size": 4, "epoch": 0.236, "step": 590, "tokens_per_device": 16144 }, { "epoch": 0.236, "loss_ce": 0.18956786394119263, "loss_lvr": 0.6568307280540466, "loss_mode_switch": 0.0, "loss_total": 0.2552509307861328, "step": 590 }, { "batch_size": 1, "epoch": 0.236, "step": 590, "tokens_per_device": 4977 }, { "epoch": 0.236, "loss_ce": 1.5489152669906616, "loss_lvr": 0.6404711008071899, "loss_mode_switch": 0.0, "loss_total": 1.6129623651504517, "step": 590 }, { "batch_size": 4, "epoch": 0.236, "step": 590, "tokens_per_device": 4236 }, { "epoch": 0.236, "loss_ce": 0.8696603178977966, "loss_lvr": 0.9583526849746704, "loss_mode_switch": 0.0, "loss_total": 0.9654955863952637, "step": 590 }, { "batch_size": 4, "epoch": 0.236, "step": 590, "tokens_per_device": 4116 }, { "epoch": 0.236, "loss_ce": 0.3013816177845001, "loss_lvr": 0.9088620543479919, "loss_mode_switch": 0.0, "loss_total": 0.3922678232192993, "step": 590 }, { "batch_size": 4, "epoch": 0.236, "step": 590, "tokens_per_device": 7076 }, { "epoch": 0.236, "loss_ce": 0.3140280246734619, "loss_lvr": 0.5743194222450256, "loss_mode_switch": 0.0, "loss_total": 0.3714599609375, "step": 590 }, { "batch_size": 1, "epoch": 0.236, "step": 590, "tokens_per_device": 4903 }, { "epoch": 0.236, "loss_ce": 0.1032143086194992, "loss_lvr": 0.24306797981262207, "loss_mode_switch": 0.0, "loss_total": 0.1275211125612259, "step": 590 }, { "batch_size": 4, "epoch": 0.236, "step": 590, "tokens_per_device": 1596 }, { "epoch": 0.236, "loss_ce": 0.7054354548454285, "loss_lvr": 1.0043350458145142, "loss_mode_switch": 0.0, "loss_total": 0.8058689832687378, "step": 590 }, { "epoch": 0.2364, "grad_norm": 1.7314008474349976, "learning_rate": 8.923826093346013e-06, "loss": 0.417, "step": 591 }, { "batch_size": 4, "epoch": 0.2364, "step": 591, "tokens_per_device": 6864 }, { "epoch": 0.2364, "loss_ce": 0.05639395862817764, "loss_lvr": 0.8210633993148804, "loss_mode_switch": 0.0, "loss_total": 0.13850030303001404, "step": 591 }, { "batch_size": 1, "epoch": 0.2364, "step": 591, "tokens_per_device": 5116 }, { "epoch": 0.2364, "loss_ce": 0.0020380113273859024, "loss_lvr": 0.6822317838668823, "loss_mode_switch": 0.0, "loss_total": 0.07026118785142899, "step": 591 }, { "batch_size": 4, "epoch": 0.2364, "step": 591, "tokens_per_device": 4276 }, { "epoch": 0.2364, "loss_ce": 0.40957674384117126, "loss_lvr": 1.060188889503479, "loss_mode_switch": 0.0, "loss_total": 0.5155956149101257, "step": 591 }, { "batch_size": 4, "epoch": 0.2364, "step": 591, "tokens_per_device": 4348 }, { "epoch": 0.2364, "loss_ce": 0.2805965542793274, "loss_lvr": 0.9887155294418335, "loss_mode_switch": 0.0, "loss_total": 0.3794681131839752, "step": 591 }, { "batch_size": 4, "epoch": 0.2364, "step": 591, "tokens_per_device": 2792 }, { "epoch": 0.2364, "loss_ce": 0.022327853366732597, "loss_lvr": 0.6397836208343506, "loss_mode_switch": 0.0, "loss_total": 0.08630622178316116, "step": 591 }, { "batch_size": 4, "epoch": 0.2364, "step": 591, "tokens_per_device": 2712 }, { "epoch": 0.2364, "loss_ce": 0.03045695647597313, "loss_lvr": 0.817535936832428, "loss_mode_switch": 0.0, "loss_total": 0.11221055686473846, "step": 591 }, { "batch_size": 1, "epoch": 0.2364, "step": 591, "tokens_per_device": 4906 }, { "epoch": 0.2364, "loss_ce": 0.0032750312238931656, "loss_lvr": 0.6199885606765747, "loss_mode_switch": 0.0, "loss_total": 0.06527388840913773, "step": 591 }, { "batch_size": 4, "epoch": 0.2364, "step": 591, "tokens_per_device": 5192 }, { "epoch": 0.2364, "loss_ce": 0.07795816659927368, "loss_lvr": 1.3808656930923462, "loss_mode_switch": 0.0, "loss_total": 0.21604473888874054, "step": 591 }, { "epoch": 0.2368, "grad_norm": 1.4019684791564941, "learning_rate": 8.919808085010726e-06, "loss": 0.3236, "step": 592 }, { "batch_size": 4, "epoch": 0.2368, "step": 592, "tokens_per_device": 6364 }, { "epoch": 0.2368, "loss_ce": 0.09632063657045364, "loss_lvr": 0.7875986099243164, "loss_mode_switch": 0.0, "loss_total": 0.17508050799369812, "step": 592 }, { "batch_size": 1, "epoch": 0.2368, "step": 592, "tokens_per_device": 5162 }, { "epoch": 0.2368, "loss_ce": 0.0005628790240734816, "loss_lvr": 0.32776930928230286, "loss_mode_switch": 0.0, "loss_total": 0.033339813351631165, "step": 592 }, { "batch_size": 1, "epoch": 0.2368, "step": 592, "tokens_per_device": 5127 }, { "epoch": 0.2368, "loss_ce": 0.006182082463055849, "loss_lvr": 0.6359790563583374, "loss_mode_switch": 0.0, "loss_total": 0.06977999210357666, "step": 592 }, { "batch_size": 4, "epoch": 0.2368, "step": 592, "tokens_per_device": 4076 }, { "epoch": 0.2368, "loss_ce": 0.17713108658790588, "loss_lvr": 1.1141685247421265, "loss_mode_switch": 0.0, "loss_total": 0.28854793310165405, "step": 592 }, { "batch_size": 4, "epoch": 0.2368, "step": 592, "tokens_per_device": 3928 }, { "epoch": 0.2368, "loss_ce": 0.4268098771572113, "loss_lvr": 0.9160218834877014, "loss_mode_switch": 0.0, "loss_total": 0.5184120535850525, "step": 592 }, { "batch_size": 1, "epoch": 0.2368, "step": 592, "tokens_per_device": 5019 }, { "epoch": 0.2368, "loss_ce": 0.07017786800861359, "loss_lvr": 0.6644966006278992, "loss_mode_switch": 0.0, "loss_total": 0.13662752509117126, "step": 592 }, { "batch_size": 4, "epoch": 0.2368, "step": 592, "tokens_per_device": 5260 }, { "epoch": 0.2368, "loss_ce": 0.04666287451982498, "loss_lvr": 0.7566777467727661, "loss_mode_switch": 0.0, "loss_total": 0.12233065068721771, "step": 592 }, { "batch_size": 1, "epoch": 0.2368, "step": 592, "tokens_per_device": 5066 }, { "epoch": 0.2368, "loss_ce": 0.022316837683320045, "loss_lvr": 0.3634309470653534, "loss_mode_switch": 0.0, "loss_total": 0.058659933507442474, "step": 592 }, { "epoch": 0.2372, "grad_norm": 1.4805976152420044, "learning_rate": 8.915783497961492e-06, "loss": 0.3149, "step": 593 }, { "batch_size": 1, "epoch": 0.2372, "step": 593, "tokens_per_device": 5204 }, { "epoch": 0.2372, "loss_ce": 0.03958804905414581, "loss_lvr": 0.6593126058578491, "loss_mode_switch": 0.0, "loss_total": 0.10551930963993073, "step": 593 }, { "batch_size": 4, "epoch": 0.2372, "step": 593, "tokens_per_device": 8248 }, { "epoch": 0.2372, "loss_ce": 0.09510726481676102, "loss_lvr": 0.8253600597381592, "loss_mode_switch": 0.0, "loss_total": 0.17764326930046082, "step": 593 }, { "batch_size": 1, "epoch": 0.2372, "step": 593, "tokens_per_device": 5138 }, { "epoch": 0.2372, "loss_ce": 0.009251262992620468, "loss_lvr": 0.44211071729660034, "loss_mode_switch": 0.0, "loss_total": 0.05346233397722244, "step": 593 }, { "batch_size": 4, "epoch": 0.2372, "step": 593, "tokens_per_device": 2564 }, { "epoch": 0.2372, "loss_ce": 0.11189037561416626, "loss_lvr": 1.1074438095092773, "loss_mode_switch": 0.0, "loss_total": 0.22263476252555847, "step": 593 }, { "batch_size": 4, "epoch": 0.2372, "step": 593, "tokens_per_device": 1472 }, { "epoch": 0.2372, "loss_ce": 0.6242066621780396, "loss_lvr": 0.9816731810569763, "loss_mode_switch": 0.0, "loss_total": 0.7223739624023438, "step": 593 }, { "batch_size": 4, "epoch": 0.2372, "step": 593, "tokens_per_device": 1240 }, { "epoch": 0.2372, "loss_ce": 0.27801722288131714, "loss_lvr": 1.4282230138778687, "loss_mode_switch": 0.0, "loss_total": 0.4208395481109619, "step": 593 }, { "batch_size": 4, "epoch": 0.2372, "step": 593, "tokens_per_device": 3824 }, { "epoch": 0.2372, "loss_ce": 0.4725828170776367, "loss_lvr": 0.9131382703781128, "loss_mode_switch": 0.0, "loss_total": 0.563896656036377, "step": 593 }, { "batch_size": 4, "epoch": 0.2372, "step": 593, "tokens_per_device": 3284 }, { "epoch": 0.2372, "loss_ce": 0.13681668043136597, "loss_lvr": 1.5539073944091797, "loss_mode_switch": 0.0, "loss_total": 0.29220741987228394, "step": 593 }, { "epoch": 0.2376, "grad_norm": 1.596863031387329, "learning_rate": 8.911752338952875e-06, "loss": 0.316, "step": 594 }, { "batch_size": 4, "epoch": 0.2376, "step": 594, "tokens_per_device": 2672 }, { "epoch": 0.2376, "loss_ce": 0.3557962477207184, "loss_lvr": 0.8487155437469482, "loss_mode_switch": 0.0, "loss_total": 0.4406678080558777, "step": 594 }, { "batch_size": 1, "epoch": 0.2376, "step": 594, "tokens_per_device": 4891 }, { "epoch": 0.2376, "loss_ce": 0.008602899499237537, "loss_lvr": 0.8077412247657776, "loss_mode_switch": 0.0, "loss_total": 0.0893770232796669, "step": 594 }, { "batch_size": 4, "epoch": 0.2376, "step": 594, "tokens_per_device": 4952 }, { "epoch": 0.2376, "loss_ce": 0.4778713583946228, "loss_lvr": 0.8594463467597961, "loss_mode_switch": 0.0, "loss_total": 0.5638160109519958, "step": 594 }, { "batch_size": 4, "epoch": 0.2376, "step": 594, "tokens_per_device": 3580 }, { "epoch": 0.2376, "loss_ce": 0.17132703959941864, "loss_lvr": 0.8741257786750793, "loss_mode_switch": 0.0, "loss_total": 0.2587396204471588, "step": 594 }, { "batch_size": 4, "epoch": 0.2376, "step": 594, "tokens_per_device": 4224 }, { "epoch": 0.2376, "loss_ce": 0.029853159561753273, "loss_lvr": 0.6725080013275146, "loss_mode_switch": 0.0, "loss_total": 0.09710396081209183, "step": 594 }, { "batch_size": 1, "epoch": 0.2376, "step": 594, "tokens_per_device": 4215 }, { "epoch": 0.2376, "loss_ce": 0.003801584243774414, "loss_lvr": 0.5549497008323669, "loss_mode_switch": 0.0, "loss_total": 0.05929655581712723, "step": 594 }, { "batch_size": 4, "epoch": 0.2376, "step": 594, "tokens_per_device": 4560 }, { "epoch": 0.2376, "loss_ce": 0.3974267244338989, "loss_lvr": 0.9480219483375549, "loss_mode_switch": 0.0, "loss_total": 0.4922289252281189, "step": 594 }, { "batch_size": 4, "epoch": 0.2376, "step": 594, "tokens_per_device": 4864 }, { "epoch": 0.2376, "loss_ce": 0.11945071816444397, "loss_lvr": 0.8045348525047302, "loss_mode_switch": 0.0, "loss_total": 0.199904203414917, "step": 594 }, { "epoch": 0.238, "grad_norm": 1.325903296470642, "learning_rate": 8.907714614750473e-06, "loss": 0.3389, "step": 595 }, { "batch_size": 1, "epoch": 0.238, "step": 595, "tokens_per_device": 5106 }, { "epoch": 0.238, "loss_ce": 0.6841204762458801, "loss_lvr": 0.3122345209121704, "loss_mode_switch": 0.0, "loss_total": 0.7153439521789551, "step": 595 }, { "batch_size": 4, "epoch": 0.238, "step": 595, "tokens_per_device": 1492 }, { "epoch": 0.238, "loss_ce": 0.14677785336971283, "loss_lvr": 0.9560324549674988, "loss_mode_switch": 0.0, "loss_total": 0.24238109588623047, "step": 595 }, { "batch_size": 4, "epoch": 0.238, "step": 595, "tokens_per_device": 3800 }, { "epoch": 0.238, "loss_ce": 0.41543152928352356, "loss_lvr": 1.113567590713501, "loss_mode_switch": 0.0, "loss_total": 0.5267882943153381, "step": 595 }, { "batch_size": 4, "epoch": 0.238, "step": 595, "tokens_per_device": 1476 }, { "epoch": 0.238, "loss_ce": 0.2795713245868683, "loss_lvr": 1.0741826295852661, "loss_mode_switch": 0.0, "loss_total": 0.3869895935058594, "step": 595 }, { "batch_size": 4, "epoch": 0.238, "step": 595, "tokens_per_device": 3896 }, { "epoch": 0.238, "loss_ce": 0.8129653930664062, "loss_lvr": 1.0292940139770508, "loss_mode_switch": 0.0, "loss_total": 0.9158948063850403, "step": 595 }, { "batch_size": 1, "epoch": 0.238, "step": 595, "tokens_per_device": 5154 }, { "epoch": 0.238, "loss_ce": 0.009876839816570282, "loss_lvr": 0.635929524898529, "loss_mode_switch": 0.0, "loss_total": 0.07346979528665543, "step": 595 }, { "batch_size": 4, "epoch": 0.238, "step": 595, "tokens_per_device": 2736 }, { "epoch": 0.238, "loss_ce": 0.03922749683260918, "loss_lvr": 0.5781698226928711, "loss_mode_switch": 0.0, "loss_total": 0.09704448282718658, "step": 595 }, { "batch_size": 4, "epoch": 0.238, "step": 595, "tokens_per_device": 4532 }, { "epoch": 0.238, "loss_ce": 0.4126719534397125, "loss_lvr": 1.3973404169082642, "loss_mode_switch": 0.0, "loss_total": 0.5524060130119324, "step": 595 }, { "epoch": 0.2384, "grad_norm": 1.6549614667892456, "learning_rate": 8.9036703321309e-06, "loss": 0.3381, "step": 596 }, { "batch_size": 1, "epoch": 0.2384, "step": 596, "tokens_per_device": 5106 }, { "epoch": 0.2384, "loss_ce": 0.17808307707309723, "loss_lvr": 0.32246655225753784, "loss_mode_switch": 0.0, "loss_total": 0.21032974123954773, "step": 596 }, { "batch_size": 4, "epoch": 0.2384, "step": 596, "tokens_per_device": 5280 }, { "epoch": 0.2384, "loss_ce": 0.5458148717880249, "loss_lvr": 0.874180018901825, "loss_mode_switch": 0.0, "loss_total": 0.6332328915596008, "step": 596 }, { "batch_size": 4, "epoch": 0.2384, "step": 596, "tokens_per_device": 5944 }, { "epoch": 0.2384, "loss_ce": 0.18359901010990143, "loss_lvr": 0.6409659385681152, "loss_mode_switch": 0.0, "loss_total": 0.24769559502601624, "step": 596 }, { "batch_size": 4, "epoch": 0.2384, "step": 596, "tokens_per_device": 4616 }, { "epoch": 0.2384, "loss_ce": 0.5839654803276062, "loss_lvr": 0.9660230278968811, "loss_mode_switch": 0.0, "loss_total": 0.6805678009986877, "step": 596 }, { "batch_size": 4, "epoch": 0.2384, "step": 596, "tokens_per_device": 1308 }, { "epoch": 0.2384, "loss_ce": 0.2181757092475891, "loss_lvr": 1.049630045890808, "loss_mode_switch": 0.0, "loss_total": 0.3231387138366699, "step": 596 }, { "batch_size": 4, "epoch": 0.2384, "step": 596, "tokens_per_device": 5756 }, { "epoch": 0.2384, "loss_ce": 0.04421532154083252, "loss_lvr": 1.4379204511642456, "loss_mode_switch": 0.0, "loss_total": 0.18800736963748932, "step": 596 }, { "batch_size": 4, "epoch": 0.2384, "step": 596, "tokens_per_device": 5008 }, { "epoch": 0.2384, "loss_ce": 0.3987007141113281, "loss_lvr": 0.9617482423782349, "loss_mode_switch": 0.0, "loss_total": 0.49487555027008057, "step": 596 }, { "batch_size": 1, "epoch": 0.2384, "step": 596, "tokens_per_device": 4948 }, { "epoch": 0.2384, "loss_ce": 0.01829586550593376, "loss_lvr": 0.5452553629875183, "loss_mode_switch": 0.0, "loss_total": 0.07282140105962753, "step": 596 }, { "epoch": 0.2388, "grad_norm": 1.7131282091140747, "learning_rate": 8.899619497881784e-06, "loss": 0.3077, "step": 597 }, { "batch_size": 4, "epoch": 0.2388, "step": 597, "tokens_per_device": 4392 }, { "epoch": 0.2388, "loss_ce": 0.06306792050600052, "loss_lvr": 0.9022321701049805, "loss_mode_switch": 0.0, "loss_total": 0.15329113602638245, "step": 597 }, { "batch_size": 4, "epoch": 0.2388, "step": 597, "tokens_per_device": 5788 }, { "epoch": 0.2388, "loss_ce": 0.03888006880879402, "loss_lvr": 0.7033426761627197, "loss_mode_switch": 0.0, "loss_total": 0.10921433568000793, "step": 597 }, { "batch_size": 4, "epoch": 0.2388, "step": 597, "tokens_per_device": 3148 }, { "epoch": 0.2388, "loss_ce": 0.19526419043540955, "loss_lvr": 1.180260419845581, "loss_mode_switch": 0.0, "loss_total": 0.31329023838043213, "step": 597 }, { "batch_size": 4, "epoch": 0.2388, "step": 597, "tokens_per_device": 2604 }, { "epoch": 0.2388, "loss_ce": 0.576662540435791, "loss_lvr": 0.7895030379295349, "loss_mode_switch": 0.0, "loss_total": 0.6556128263473511, "step": 597 }, { "batch_size": 4, "epoch": 0.2388, "step": 597, "tokens_per_device": 10364 }, { "epoch": 0.2388, "loss_ce": 0.19931000471115112, "loss_lvr": 0.6548351645469666, "loss_mode_switch": 0.0, "loss_total": 0.2647935152053833, "step": 597 }, { "batch_size": 1, "epoch": 0.2388, "step": 597, "tokens_per_device": 4860 }, { "epoch": 0.2388, "loss_ce": 0.001052746083587408, "loss_lvr": 0.8765811324119568, "loss_mode_switch": 0.0, "loss_total": 0.08871085941791534, "step": 597 }, { "batch_size": 4, "epoch": 0.2388, "step": 597, "tokens_per_device": 7948 }, { "epoch": 0.2388, "loss_ce": 0.2237095683813095, "loss_lvr": 1.0580683946609497, "loss_mode_switch": 0.0, "loss_total": 0.3295164108276367, "step": 597 }, { "batch_size": 4, "epoch": 0.2388, "step": 597, "tokens_per_device": 4048 }, { "epoch": 0.2388, "loss_ce": 0.2542285621166229, "loss_lvr": 0.7684464454650879, "loss_mode_switch": 0.0, "loss_total": 0.33107322454452515, "step": 597 }, { "epoch": 0.2392, "grad_norm": 1.3223947286605835, "learning_rate": 8.895562118801739e-06, "loss": 0.3343, "step": 598 }, { "batch_size": 4, "epoch": 0.2392, "step": 598, "tokens_per_device": 1528 }, { "epoch": 0.2392, "loss_ce": 0.7159562706947327, "loss_lvr": 0.9577344059944153, "loss_mode_switch": 0.0, "loss_total": 0.8117297291755676, "step": 598 }, { "batch_size": 4, "epoch": 0.2392, "step": 598, "tokens_per_device": 4712 }, { "epoch": 0.2392, "loss_ce": 0.27663886547088623, "loss_lvr": 1.3930506706237793, "loss_mode_switch": 0.0, "loss_total": 0.4159439206123352, "step": 598 }, { "batch_size": 4, "epoch": 0.2392, "step": 598, "tokens_per_device": 7304 }, { "epoch": 0.2392, "loss_ce": 0.12487246841192245, "loss_lvr": 0.868529200553894, "loss_mode_switch": 0.0, "loss_total": 0.2117253839969635, "step": 598 }, { "batch_size": 4, "epoch": 0.2392, "step": 598, "tokens_per_device": 1416 }, { "epoch": 0.2392, "loss_ce": 0.18409083783626556, "loss_lvr": 2.7514092922210693, "loss_mode_switch": 0.0, "loss_total": 0.45923179388046265, "step": 598 }, { "batch_size": 4, "epoch": 0.2392, "step": 598, "tokens_per_device": 7156 }, { "epoch": 0.2392, "loss_ce": 0.06567362695932388, "loss_lvr": 0.757051408290863, "loss_mode_switch": 0.0, "loss_total": 0.1413787603378296, "step": 598 }, { "batch_size": 4, "epoch": 0.2392, "step": 598, "tokens_per_device": 9056 }, { "epoch": 0.2392, "loss_ce": 0.20209096372127533, "loss_lvr": 0.8867215514183044, "loss_mode_switch": 0.0, "loss_total": 0.29076310992240906, "step": 598 }, { "batch_size": 1, "epoch": 0.2392, "step": 598, "tokens_per_device": 5107 }, { "epoch": 0.2392, "loss_ce": 0.003062676638364792, "loss_lvr": 0.8305073380470276, "loss_mode_switch": 0.0, "loss_total": 0.08611340820789337, "step": 598 }, { "batch_size": 1, "epoch": 0.2392, "step": 598, "tokens_per_device": 4875 }, { "epoch": 0.2392, "loss_ce": 0.0004640869447030127, "loss_lvr": 1.2616887092590332, "loss_mode_switch": 0.0, "loss_total": 0.126632958650589, "step": 598 }, { "epoch": 0.2396, "grad_norm": 1.407102108001709, "learning_rate": 8.891498201700368e-06, "loss": 0.36, "step": 599 }, { "batch_size": 4, "epoch": 0.2396, "step": 599, "tokens_per_device": 6152 }, { "epoch": 0.2396, "loss_ce": 0.16900190711021423, "loss_lvr": 1.208150863647461, "loss_mode_switch": 0.0, "loss_total": 0.2898170053958893, "step": 599 }, { "batch_size": 4, "epoch": 0.2396, "step": 599, "tokens_per_device": 4240 }, { "epoch": 0.2396, "loss_ce": 0.39681804180145264, "loss_lvr": 0.9145194888114929, "loss_mode_switch": 0.0, "loss_total": 0.48826998472213745, "step": 599 }, { "batch_size": 4, "epoch": 0.2396, "step": 599, "tokens_per_device": 2644 }, { "epoch": 0.2396, "loss_ce": 0.3081313669681549, "loss_lvr": 0.8563232421875, "loss_mode_switch": 0.0, "loss_total": 0.3937636911869049, "step": 599 }, { "batch_size": 1, "epoch": 0.2396, "step": 599, "tokens_per_device": 4892 }, { "epoch": 0.2396, "loss_ce": 0.30115383863449097, "loss_lvr": 0.3106655776500702, "loss_mode_switch": 0.0, "loss_total": 0.3322204053401947, "step": 599 }, { "batch_size": 4, "epoch": 0.2396, "step": 599, "tokens_per_device": 4372 }, { "epoch": 0.2396, "loss_ce": 0.3082903325557709, "loss_lvr": 0.977847695350647, "loss_mode_switch": 0.0, "loss_total": 0.406075119972229, "step": 599 }, { "batch_size": 1, "epoch": 0.2396, "step": 599, "tokens_per_device": 5105 }, { "epoch": 0.2396, "loss_ce": 0.08082520216703415, "loss_lvr": 0.5342844128608704, "loss_mode_switch": 0.0, "loss_total": 0.13425365090370178, "step": 599 }, { "batch_size": 4, "epoch": 0.2396, "step": 599, "tokens_per_device": 4620 }, { "epoch": 0.2396, "loss_ce": 0.28517618775367737, "loss_lvr": 0.8112674355506897, "loss_mode_switch": 0.0, "loss_total": 0.3663029372692108, "step": 599 }, { "batch_size": 4, "epoch": 0.2396, "step": 599, "tokens_per_device": 7012 }, { "epoch": 0.2396, "loss_ce": 0.05647207796573639, "loss_lvr": 0.4220481812953949, "loss_mode_switch": 0.0, "loss_total": 0.098676897585392, "step": 599 }, { "epoch": 0.24, "grad_norm": 1.5670580863952637, "learning_rate": 8.887427753398249e-06, "loss": 0.2561, "step": 600 }, { "batch_size": 4, "epoch": 0.24, "step": 600, "tokens_per_device": 7504 }, { "epoch": 0.24, "loss_ce": 0.13802096247673035, "loss_lvr": 0.49297088384628296, "loss_mode_switch": 0.0, "loss_total": 0.18731805682182312, "step": 600 }, { "batch_size": 4, "epoch": 0.24, "step": 600, "tokens_per_device": 3760 }, { "epoch": 0.24, "loss_ce": 0.2764824628829956, "loss_lvr": 0.6138027310371399, "loss_mode_switch": 0.0, "loss_total": 0.3378627300262451, "step": 600 }, { "batch_size": 4, "epoch": 0.24, "step": 600, "tokens_per_device": 4648 }, { "epoch": 0.24, "loss_ce": 0.20205432176589966, "loss_lvr": 0.9204785823822021, "loss_mode_switch": 0.0, "loss_total": 0.29410219192504883, "step": 600 }, { "batch_size": 4, "epoch": 0.24, "step": 600, "tokens_per_device": 6132 }, { "epoch": 0.24, "loss_ce": 0.4365577697753906, "loss_lvr": 0.8197058439254761, "loss_mode_switch": 0.0, "loss_total": 0.5185283422470093, "step": 600 }, { "batch_size": 4, "epoch": 0.24, "step": 600, "tokens_per_device": 1400 }, { "epoch": 0.24, "loss_ce": 0.6242194771766663, "loss_lvr": 1.031485915184021, "loss_mode_switch": 0.0, "loss_total": 0.7273680567741394, "step": 600 }, { "batch_size": 4, "epoch": 0.24, "step": 600, "tokens_per_device": 1824 }, { "epoch": 0.24, "loss_ce": 0.4089803099632263, "loss_lvr": 3.156777858734131, "loss_mode_switch": 0.0, "loss_total": 0.7246581315994263, "step": 600 }, { "batch_size": 4, "epoch": 0.24, "step": 600, "tokens_per_device": 10276 }, { "epoch": 0.24, "loss_ce": 0.17785818874835968, "loss_lvr": 0.9972863793373108, "loss_mode_switch": 0.0, "loss_total": 0.27758681774139404, "step": 600 }, { "batch_size": 4, "epoch": 0.24, "step": 600, "tokens_per_device": 4208 }, { "epoch": 0.24, "loss_ce": 0.3986012935638428, "loss_lvr": 0.9610106945037842, "loss_mode_switch": 0.0, "loss_total": 0.49470236897468567, "step": 600 }, { "epoch": 0.2404, "grad_norm": 1.2834582328796387, "learning_rate": 8.883350780726915e-06, "loss": 0.2973, "step": 601 }, { "batch_size": 4, "epoch": 0.2404, "step": 601, "tokens_per_device": 6064 }, { "epoch": 0.2404, "loss_ce": 0.11314882338047028, "loss_lvr": 0.8666664361953735, "loss_mode_switch": 0.0, "loss_total": 0.19981546700000763, "step": 601 }, { "batch_size": 4, "epoch": 0.2404, "step": 601, "tokens_per_device": 6732 }, { "epoch": 0.2404, "loss_ce": 0.5150108933448792, "loss_lvr": 1.4860965013504028, "loss_mode_switch": 0.0, "loss_total": 0.6636205315589905, "step": 601 }, { "batch_size": 4, "epoch": 0.2404, "step": 601, "tokens_per_device": 4312 }, { "epoch": 0.2404, "loss_ce": 0.14767418801784515, "loss_lvr": 1.1255857944488525, "loss_mode_switch": 0.0, "loss_total": 0.2602327764034271, "step": 601 }, { "batch_size": 4, "epoch": 0.2404, "step": 601, "tokens_per_device": 2244 }, { "epoch": 0.2404, "loss_ce": 0.3816930651664734, "loss_lvr": 1.1808446645736694, "loss_mode_switch": 0.0, "loss_total": 0.49977752566337585, "step": 601 }, { "batch_size": 4, "epoch": 0.2404, "step": 601, "tokens_per_device": 6128 }, { "epoch": 0.2404, "loss_ce": 0.30567866563796997, "loss_lvr": 0.6776309609413147, "loss_mode_switch": 0.0, "loss_total": 0.37344175577163696, "step": 601 }, { "batch_size": 4, "epoch": 0.2404, "step": 601, "tokens_per_device": 4104 }, { "epoch": 0.2404, "loss_ce": 0.47840002179145813, "loss_lvr": 1.0167425870895386, "loss_mode_switch": 0.0, "loss_total": 0.5800743103027344, "step": 601 }, { "batch_size": 4, "epoch": 0.2404, "step": 601, "tokens_per_device": 5888 }, { "epoch": 0.2404, "loss_ce": 0.6294957399368286, "loss_lvr": 0.7517462968826294, "loss_mode_switch": 0.0, "loss_total": 0.7046703696250916, "step": 601 }, { "batch_size": 4, "epoch": 0.2404, "step": 601, "tokens_per_device": 2644 }, { "epoch": 0.2404, "loss_ce": 0.326126366853714, "loss_lvr": 0.8932649493217468, "loss_mode_switch": 0.0, "loss_total": 0.41545286774635315, "step": 601 }, { "epoch": 0.2408, "grad_norm": 3.8570220470428467, "learning_rate": 8.87926729052886e-06, "loss": 0.3433, "step": 602 }, { "batch_size": 4, "epoch": 0.2408, "step": 602, "tokens_per_device": 4220 }, { "epoch": 0.2408, "loss_ce": 0.5090467929840088, "loss_lvr": 0.9033244848251343, "loss_mode_switch": 0.0, "loss_total": 0.5993792414665222, "step": 602 }, { "batch_size": 4, "epoch": 0.2408, "step": 602, "tokens_per_device": 1468 }, { "epoch": 0.2408, "loss_ce": 0.06473175436258316, "loss_lvr": 0.892866313457489, "loss_mode_switch": 0.0, "loss_total": 0.15401838719844818, "step": 602 }, { "batch_size": 4, "epoch": 0.2408, "step": 602, "tokens_per_device": 4316 }, { "epoch": 0.2408, "loss_ce": 0.21734893321990967, "loss_lvr": 1.0818674564361572, "loss_mode_switch": 0.0, "loss_total": 0.32553568482398987, "step": 602 }, { "batch_size": 1, "epoch": 0.2408, "step": 602, "tokens_per_device": 4908 }, { "epoch": 0.2408, "loss_ce": 0.0618855357170105, "loss_lvr": 0.6937915682792664, "loss_mode_switch": 0.0, "loss_total": 0.13126468658447266, "step": 602 }, { "batch_size": 1, "epoch": 0.2408, "step": 602, "tokens_per_device": 5063 }, { "epoch": 0.2408, "loss_ce": 0.007157525047659874, "loss_lvr": 0.5662516355514526, "loss_mode_switch": 0.0, "loss_total": 0.0637826919555664, "step": 602 }, { "batch_size": 4, "epoch": 0.2408, "step": 602, "tokens_per_device": 4236 }, { "epoch": 0.2408, "loss_ce": 0.26230648159980774, "loss_lvr": 0.7766278982162476, "loss_mode_switch": 0.0, "loss_total": 0.339969277381897, "step": 602 }, { "batch_size": 1, "epoch": 0.2408, "step": 602, "tokens_per_device": 5812 }, { "epoch": 0.2408, "loss_ce": 0.02277432195842266, "loss_lvr": 0.7178429961204529, "loss_mode_switch": 0.0, "loss_total": 0.09455862641334534, "step": 602 }, { "batch_size": 1, "epoch": 0.2408, "step": 602, "tokens_per_device": 5023 }, { "epoch": 0.2408, "loss_ce": 0.18730729818344116, "loss_lvr": 0.5056047439575195, "loss_mode_switch": 0.0, "loss_total": 0.23786777257919312, "step": 602 }, { "epoch": 0.2412, "grad_norm": 1.62863290309906, "learning_rate": 8.875177289657502e-06, "loss": 0.3571, "step": 603 }, { "batch_size": 4, "epoch": 0.2412, "step": 603, "tokens_per_device": 4264 }, { "epoch": 0.2412, "loss_ce": 0.33746322989463806, "loss_lvr": 1.08199942111969, "loss_mode_switch": 0.0, "loss_total": 0.445663183927536, "step": 603 }, { "batch_size": 1, "epoch": 0.2412, "step": 603, "tokens_per_device": 6934 }, { "epoch": 0.2412, "loss_ce": 0.0013132375897839665, "loss_lvr": 0.41415688395500183, "loss_mode_switch": 0.0, "loss_total": 0.04272892698645592, "step": 603 }, { "batch_size": 1, "epoch": 0.2412, "step": 603, "tokens_per_device": 7573 }, { "epoch": 0.2412, "loss_ce": 0.016632046550512314, "loss_lvr": 0.4022451937198639, "loss_mode_switch": 0.0, "loss_total": 0.05685656517744064, "step": 603 }, { "batch_size": 4, "epoch": 0.2412, "step": 603, "tokens_per_device": 1288 }, { "epoch": 0.2412, "loss_ce": 0.39341482520103455, "loss_lvr": 2.04569935798645, "loss_mode_switch": 0.0, "loss_total": 0.597984790802002, "step": 603 }, { "batch_size": 4, "epoch": 0.2412, "step": 603, "tokens_per_device": 1392 }, { "epoch": 0.2412, "loss_ce": 0.2525377869606018, "loss_lvr": 1.111914873123169, "loss_mode_switch": 0.0, "loss_total": 0.3637292683124542, "step": 603 }, { "batch_size": 4, "epoch": 0.2412, "step": 603, "tokens_per_device": 4220 }, { "epoch": 0.2412, "loss_ce": 0.4348123073577881, "loss_lvr": 1.1853163242340088, "loss_mode_switch": 0.0, "loss_total": 0.5533439517021179, "step": 603 }, { "batch_size": 4, "epoch": 0.2412, "step": 603, "tokens_per_device": 4208 }, { "epoch": 0.2412, "loss_ce": 0.08216838538646698, "loss_lvr": 1.5099502801895142, "loss_mode_switch": 0.0, "loss_total": 0.23316341638565063, "step": 603 }, { "batch_size": 1, "epoch": 0.2412, "step": 603, "tokens_per_device": 5095 }, { "epoch": 0.2412, "loss_ce": 0.00528921140357852, "loss_lvr": 0.5646508932113647, "loss_mode_switch": 0.0, "loss_total": 0.06175430119037628, "step": 603 }, { "epoch": 0.2416, "grad_norm": 1.5307384729385376, "learning_rate": 8.8710807849772e-06, "loss": 0.3392, "step": 604 }, { "batch_size": 1, "epoch": 0.2416, "step": 604, "tokens_per_device": 7752 }, { "epoch": 0.2416, "loss_ce": 0.006608373485505581, "loss_lvr": 0.4456607401371002, "loss_mode_switch": 0.0, "loss_total": 0.05117445066571236, "step": 604 }, { "batch_size": 4, "epoch": 0.2416, "step": 604, "tokens_per_device": 5264 }, { "epoch": 0.2416, "loss_ce": 0.26041439175605774, "loss_lvr": 0.795547366142273, "loss_mode_switch": 0.0, "loss_total": 0.33996912837028503, "step": 604 }, { "batch_size": 4, "epoch": 0.2416, "step": 604, "tokens_per_device": 12496 }, { "epoch": 0.2416, "loss_ce": 0.5124820470809937, "loss_lvr": 0.8509489893913269, "loss_mode_switch": 0.0, "loss_total": 0.5975769758224487, "step": 604 }, { "batch_size": 4, "epoch": 0.2416, "step": 604, "tokens_per_device": 5672 }, { "epoch": 0.2416, "loss_ce": 0.13041435182094574, "loss_lvr": 0.8168831467628479, "loss_mode_switch": 0.0, "loss_total": 0.21210266649723053, "step": 604 }, { "batch_size": 4, "epoch": 0.2416, "step": 604, "tokens_per_device": 4012 }, { "epoch": 0.2416, "loss_ce": 0.17669561505317688, "loss_lvr": 0.9188353419303894, "loss_mode_switch": 0.0, "loss_total": 0.2685791552066803, "step": 604 }, { "batch_size": 4, "epoch": 0.2416, "step": 604, "tokens_per_device": 5928 }, { "epoch": 0.2416, "loss_ce": 0.4327828884124756, "loss_lvr": 1.1196861267089844, "loss_mode_switch": 0.0, "loss_total": 0.5447515249252319, "step": 604 }, { "batch_size": 1, "epoch": 0.2416, "step": 604, "tokens_per_device": 4757 }, { "epoch": 0.2416, "loss_ce": 0.005460451822727919, "loss_lvr": 0.4809083938598633, "loss_mode_switch": 0.0, "loss_total": 0.05355129390954971, "step": 604 }, { "batch_size": 1, "epoch": 0.2416, "step": 604, "tokens_per_device": 5163 }, { "epoch": 0.2416, "loss_ce": 0.3318486213684082, "loss_lvr": 0.47431209683418274, "loss_mode_switch": 0.0, "loss_total": 0.37927982211112976, "step": 604 }, { "epoch": 0.242, "grad_norm": 2.2983686923980713, "learning_rate": 8.866977783363219e-06, "loss": 0.324, "step": 605 }, { "batch_size": 1, "epoch": 0.242, "step": 605, "tokens_per_device": 6318 }, { "epoch": 0.242, "loss_ce": 0.09631863981485367, "loss_lvr": 0.5490373373031616, "loss_mode_switch": 0.0, "loss_total": 0.1512223780155182, "step": 605 }, { "batch_size": 1, "epoch": 0.242, "step": 605, "tokens_per_device": 5341 }, { "epoch": 0.242, "loss_ce": 0.272800087928772, "loss_lvr": 0.6757899522781372, "loss_mode_switch": 0.0, "loss_total": 0.34037908911705017, "step": 605 }, { "batch_size": 4, "epoch": 0.242, "step": 605, "tokens_per_device": 2616 }, { "epoch": 0.242, "loss_ce": 0.38230177760124207, "loss_lvr": 0.9355814456939697, "loss_mode_switch": 0.0, "loss_total": 0.47585994005203247, "step": 605 }, { "batch_size": 4, "epoch": 0.242, "step": 605, "tokens_per_device": 4168 }, { "epoch": 0.242, "loss_ce": 0.038767214864492416, "loss_lvr": 1.6036136150360107, "loss_mode_switch": 0.0, "loss_total": 0.19912858307361603, "step": 605 }, { "batch_size": 4, "epoch": 0.242, "step": 605, "tokens_per_device": 4852 }, { "epoch": 0.242, "loss_ce": 0.5113093852996826, "loss_lvr": 0.9958881139755249, "loss_mode_switch": 0.0, "loss_total": 0.6108981966972351, "step": 605 }, { "batch_size": 1, "epoch": 0.242, "step": 605, "tokens_per_device": 6541 }, { "epoch": 0.242, "loss_ce": 0.030136270448565483, "loss_lvr": 0.35860103368759155, "loss_mode_switch": 0.0, "loss_total": 0.06599637120962143, "step": 605 }, { "batch_size": 4, "epoch": 0.242, "step": 605, "tokens_per_device": 5724 }, { "epoch": 0.242, "loss_ce": 0.10254792124032974, "loss_lvr": 0.9267277717590332, "loss_mode_switch": 0.0, "loss_total": 0.1952207088470459, "step": 605 }, { "batch_size": 4, "epoch": 0.242, "step": 605, "tokens_per_device": 2672 }, { "epoch": 0.242, "loss_ce": 0.8149619102478027, "loss_lvr": 0.9799462556838989, "loss_mode_switch": 0.0, "loss_total": 0.9129565358161926, "step": 605 }, { "epoch": 0.2424, "grad_norm": 1.6849589347839355, "learning_rate": 8.862868291701735e-06, "loss": 0.3056, "step": 606 }, { "batch_size": 4, "epoch": 0.2424, "step": 606, "tokens_per_device": 5408 }, { "epoch": 0.2424, "loss_ce": 0.514589250087738, "loss_lvr": 0.8151628375053406, "loss_mode_switch": 0.0, "loss_total": 0.5961055159568787, "step": 606 }, { "batch_size": 4, "epoch": 0.2424, "step": 606, "tokens_per_device": 5792 }, { "epoch": 0.2424, "loss_ce": 0.2481764703989029, "loss_lvr": 0.7068463563919067, "loss_mode_switch": 0.0, "loss_total": 0.31886109709739685, "step": 606 }, { "batch_size": 4, "epoch": 0.2424, "step": 606, "tokens_per_device": 3380 }, { "epoch": 0.2424, "loss_ce": 0.70457923412323, "loss_lvr": 1.0722808837890625, "loss_mode_switch": 0.0, "loss_total": 0.8118073344230652, "step": 606 }, { "batch_size": 1, "epoch": 0.2424, "step": 606, "tokens_per_device": 5149 }, { "epoch": 0.2424, "loss_ce": 0.051884621381759644, "loss_lvr": 0.49897584319114685, "loss_mode_switch": 0.0, "loss_total": 0.10178220272064209, "step": 606 }, { "batch_size": 4, "epoch": 0.2424, "step": 606, "tokens_per_device": 4324 }, { "epoch": 0.2424, "loss_ce": 0.20862047374248505, "loss_lvr": 1.2237248420715332, "loss_mode_switch": 0.0, "loss_total": 0.3309929668903351, "step": 606 }, { "batch_size": 1, "epoch": 0.2424, "step": 606, "tokens_per_device": 5162 }, { "epoch": 0.2424, "loss_ce": 0.014201159588992596, "loss_lvr": 0.4356522858142853, "loss_mode_switch": 0.0, "loss_total": 0.0577663891017437, "step": 606 }, { "batch_size": 4, "epoch": 0.2424, "step": 606, "tokens_per_device": 5120 }, { "epoch": 0.2424, "loss_ce": 0.6584903001785278, "loss_lvr": 0.7469045519828796, "loss_mode_switch": 0.0, "loss_total": 0.7331807613372803, "step": 606 }, { "batch_size": 4, "epoch": 0.2424, "step": 606, "tokens_per_device": 1404 }, { "epoch": 0.2424, "loss_ce": 0.5621956586837769, "loss_lvr": 1.0689479112625122, "loss_mode_switch": 0.0, "loss_total": 0.6690904498100281, "step": 606 }, { "epoch": 0.2428, "grad_norm": 1.3967705965042114, "learning_rate": 8.858752316889809e-06, "loss": 0.3009, "step": 607 }, { "batch_size": 4, "epoch": 0.2428, "step": 607, "tokens_per_device": 3844 }, { "epoch": 0.2428, "loss_ce": 0.442067414522171, "loss_lvr": 0.9598419666290283, "loss_mode_switch": 0.0, "loss_total": 0.5380516052246094, "step": 607 }, { "batch_size": 4, "epoch": 0.2428, "step": 607, "tokens_per_device": 1500 }, { "epoch": 0.2428, "loss_ce": 0.2257021814584732, "loss_lvr": 1.078614592552185, "loss_mode_switch": 0.0, "loss_total": 0.3335636258125305, "step": 607 }, { "batch_size": 4, "epoch": 0.2428, "step": 607, "tokens_per_device": 3492 }, { "epoch": 0.2428, "loss_ce": 0.2705812454223633, "loss_lvr": 0.7390007376670837, "loss_mode_switch": 0.0, "loss_total": 0.34448131918907166, "step": 607 }, { "batch_size": 4, "epoch": 0.2428, "step": 607, "tokens_per_device": 4060 }, { "epoch": 0.2428, "loss_ce": 0.06650329381227493, "loss_lvr": 0.853169858455658, "loss_mode_switch": 0.0, "loss_total": 0.15182027220726013, "step": 607 }, { "batch_size": 4, "epoch": 0.2428, "step": 607, "tokens_per_device": 3984 }, { "epoch": 0.2428, "loss_ce": 0.08503258973360062, "loss_lvr": 0.8021447658538818, "loss_mode_switch": 0.0, "loss_total": 0.16524706780910492, "step": 607 }, { "batch_size": 4, "epoch": 0.2428, "step": 607, "tokens_per_device": 2456 }, { "epoch": 0.2428, "loss_ce": 0.5477267503738403, "loss_lvr": 1.2757437229156494, "loss_mode_switch": 0.0, "loss_total": 0.6753011345863342, "step": 607 }, { "batch_size": 4, "epoch": 0.2428, "step": 607, "tokens_per_device": 8392 }, { "epoch": 0.2428, "loss_ce": 0.46102046966552734, "loss_lvr": 0.762560248374939, "loss_mode_switch": 0.0, "loss_total": 0.5372765064239502, "step": 607 }, { "batch_size": 1, "epoch": 0.2428, "step": 607, "tokens_per_device": 5088 }, { "epoch": 0.2428, "loss_ce": 0.004719567485153675, "loss_lvr": 0.5437228679656982, "loss_mode_switch": 0.0, "loss_total": 0.059091854840517044, "step": 607 }, { "epoch": 0.2432, "grad_norm": 1.461087942123413, "learning_rate": 8.854629865835387e-06, "loss": 0.3345, "step": 608 }, { "batch_size": 4, "epoch": 0.2432, "step": 608, "tokens_per_device": 2600 }, { "epoch": 0.2432, "loss_ce": 0.41528409719467163, "loss_lvr": 0.9375616312026978, "loss_mode_switch": 0.0, "loss_total": 0.5090402364730835, "step": 608 }, { "batch_size": 4, "epoch": 0.2432, "step": 608, "tokens_per_device": 4816 }, { "epoch": 0.2432, "loss_ce": 0.06753616780042648, "loss_lvr": 0.7945675253868103, "loss_mode_switch": 0.0, "loss_total": 0.14699292182922363, "step": 608 }, { "batch_size": 4, "epoch": 0.2432, "step": 608, "tokens_per_device": 13908 }, { "epoch": 0.2432, "loss_ce": 0.08600439876317978, "loss_lvr": 0.7941851019859314, "loss_mode_switch": 0.0, "loss_total": 0.16542291641235352, "step": 608 }, { "batch_size": 4, "epoch": 0.2432, "step": 608, "tokens_per_device": 4696 }, { "epoch": 0.2432, "loss_ce": 0.24577905237674713, "loss_lvr": 0.8290591239929199, "loss_mode_switch": 0.0, "loss_total": 0.3286849558353424, "step": 608 }, { "batch_size": 4, "epoch": 0.2432, "step": 608, "tokens_per_device": 3916 }, { "epoch": 0.2432, "loss_ce": 0.058618053793907166, "loss_lvr": 1.4547611474990845, "loss_mode_switch": 0.0, "loss_total": 0.20409417152404785, "step": 608 }, { "batch_size": 4, "epoch": 0.2432, "step": 608, "tokens_per_device": 1856 }, { "epoch": 0.2432, "loss_ce": 0.5290848612785339, "loss_lvr": 1.12605619430542, "loss_mode_switch": 0.0, "loss_total": 0.6416904926300049, "step": 608 }, { "batch_size": 1, "epoch": 0.2432, "step": 608, "tokens_per_device": 4906 }, { "epoch": 0.2432, "loss_ce": 0.015041396021842957, "loss_lvr": 0.354775607585907, "loss_mode_switch": 0.0, "loss_total": 0.050518956035375595, "step": 608 }, { "batch_size": 1, "epoch": 0.2432, "step": 608, "tokens_per_device": 5234 }, { "epoch": 0.2432, "loss_ce": 1.4468228816986084, "loss_lvr": 0.6026279926300049, "loss_mode_switch": 0.0, "loss_total": 1.5070856809616089, "step": 608 }, { "epoch": 0.2436, "grad_norm": 1.3693289756774902, "learning_rate": 8.850500945457286e-06, "loss": 0.3135, "step": 609 }, { "batch_size": 1, "epoch": 0.2436, "step": 609, "tokens_per_device": 5012 }, { "epoch": 0.2436, "loss_ce": 0.29811984300613403, "loss_lvr": 0.5777795314788818, "loss_mode_switch": 0.0, "loss_total": 0.35589778423309326, "step": 609 }, { "batch_size": 1, "epoch": 0.2436, "step": 609, "tokens_per_device": 4875 }, { "epoch": 0.2436, "loss_ce": 0.008223671466112137, "loss_lvr": 0.36291778087615967, "loss_mode_switch": 0.0, "loss_total": 0.044515449553728104, "step": 609 }, { "batch_size": 4, "epoch": 0.2436, "step": 609, "tokens_per_device": 6532 }, { "epoch": 0.2436, "loss_ce": 0.48259836435317993, "loss_lvr": 0.9182369112968445, "loss_mode_switch": 0.0, "loss_total": 0.5744220614433289, "step": 609 }, { "batch_size": 4, "epoch": 0.2436, "step": 609, "tokens_per_device": 5928 }, { "epoch": 0.2436, "loss_ce": 0.1743447631597519, "loss_lvr": 0.8290116190910339, "loss_mode_switch": 0.0, "loss_total": 0.2572459280490875, "step": 609 }, { "batch_size": 4, "epoch": 0.2436, "step": 609, "tokens_per_device": 1924 }, { "epoch": 0.2436, "loss_ce": 0.7900944352149963, "loss_lvr": 0.963472306728363, "loss_mode_switch": 0.0, "loss_total": 0.8864416480064392, "step": 609 }, { "batch_size": 1, "epoch": 0.2436, "step": 609, "tokens_per_device": 5933 }, { "epoch": 0.2436, "loss_ce": 0.05007113888859749, "loss_lvr": 0.4011092782020569, "loss_mode_switch": 0.0, "loss_total": 0.09018206596374512, "step": 609 }, { "batch_size": 4, "epoch": 0.2436, "step": 609, "tokens_per_device": 5396 }, { "epoch": 0.2436, "loss_ce": 0.3280249834060669, "loss_lvr": 0.8640009760856628, "loss_mode_switch": 0.0, "loss_total": 0.4144250750541687, "step": 609 }, { "batch_size": 4, "epoch": 0.2436, "step": 609, "tokens_per_device": 5944 }, { "epoch": 0.2436, "loss_ce": 0.6468248963356018, "loss_lvr": 0.6542974710464478, "loss_mode_switch": 0.0, "loss_total": 0.7122546434402466, "step": 609 }, { "epoch": 0.244, "grad_norm": 1.3472988605499268, "learning_rate": 8.846365562685178e-06, "loss": 0.3401, "step": 610 }, { "batch_size": 4, "epoch": 0.244, "step": 610, "tokens_per_device": 3928 }, { "epoch": 0.244, "loss_ce": 0.24141496419906616, "loss_lvr": 0.8429635763168335, "loss_mode_switch": 0.0, "loss_total": 0.32571130990982056, "step": 610 }, { "batch_size": 4, "epoch": 0.244, "step": 610, "tokens_per_device": 1780 }, { "epoch": 0.244, "loss_ce": 0.4974935054779053, "loss_lvr": 1.039893627166748, "loss_mode_switch": 0.0, "loss_total": 0.6014828681945801, "step": 610 }, { "batch_size": 4, "epoch": 0.244, "step": 610, "tokens_per_device": 8224 }, { "epoch": 0.244, "loss_ce": 0.14188778400421143, "loss_lvr": 1.2610241174697876, "loss_mode_switch": 0.0, "loss_total": 0.26799020171165466, "step": 610 }, { "batch_size": 1, "epoch": 0.244, "step": 610, "tokens_per_device": 4787 }, { "epoch": 0.244, "loss_ce": 0.0066175274550914764, "loss_lvr": 0.26568466424942017, "loss_mode_switch": 0.0, "loss_total": 0.03318599611520767, "step": 610 }, { "batch_size": 4, "epoch": 0.244, "step": 610, "tokens_per_device": 4772 }, { "epoch": 0.244, "loss_ce": 0.2715546190738678, "loss_lvr": 0.7482436299324036, "loss_mode_switch": 0.0, "loss_total": 0.34637898206710815, "step": 610 }, { "batch_size": 1, "epoch": 0.244, "step": 610, "tokens_per_device": 4771 }, { "epoch": 0.244, "loss_ce": 0.09031739085912704, "loss_lvr": 0.18143653869628906, "loss_mode_switch": 0.0, "loss_total": 0.10846104472875595, "step": 610 }, { "batch_size": 1, "epoch": 0.244, "step": 610, "tokens_per_device": 5192 }, { "epoch": 0.244, "loss_ce": 0.006568582728505135, "loss_lvr": 0.31417375802993774, "loss_mode_switch": 0.0, "loss_total": 0.03798595815896988, "step": 610 }, { "batch_size": 4, "epoch": 0.244, "step": 610, "tokens_per_device": 2132 }, { "epoch": 0.244, "loss_ce": 0.2981989085674286, "loss_lvr": 0.9643350839614868, "loss_mode_switch": 0.0, "loss_total": 0.3946324288845062, "step": 610 }, { "epoch": 0.2444, "grad_norm": 1.4334981441497803, "learning_rate": 8.842223724459578e-06, "loss": 0.2829, "step": 611 }, { "batch_size": 4, "epoch": 0.2444, "step": 611, "tokens_per_device": 1792 }, { "epoch": 0.2444, "loss_ce": 0.6179676055908203, "loss_lvr": 0.8650297522544861, "loss_mode_switch": 0.0, "loss_total": 0.7044705748558044, "step": 611 }, { "batch_size": 4, "epoch": 0.2444, "step": 611, "tokens_per_device": 3784 }, { "epoch": 0.2444, "loss_ce": 0.010106652043759823, "loss_lvr": 0.7930710315704346, "loss_mode_switch": 0.0, "loss_total": 0.08941375464200974, "step": 611 }, { "batch_size": 4, "epoch": 0.2444, "step": 611, "tokens_per_device": 3968 }, { "epoch": 0.2444, "loss_ce": 0.3178619146347046, "loss_lvr": 0.804044783115387, "loss_mode_switch": 0.0, "loss_total": 0.39826640486717224, "step": 611 }, { "batch_size": 4, "epoch": 0.2444, "step": 611, "tokens_per_device": 7428 }, { "epoch": 0.2444, "loss_ce": 0.0571325458586216, "loss_lvr": 0.686941385269165, "loss_mode_switch": 0.0, "loss_total": 0.12582668662071228, "step": 611 }, { "batch_size": 4, "epoch": 0.2444, "step": 611, "tokens_per_device": 3140 }, { "epoch": 0.2444, "loss_ce": 0.315419465303421, "loss_lvr": 0.8539136052131653, "loss_mode_switch": 0.0, "loss_total": 0.4008108377456665, "step": 611 }, { "batch_size": 4, "epoch": 0.2444, "step": 611, "tokens_per_device": 1512 }, { "epoch": 0.2444, "loss_ce": 0.3869679868221283, "loss_lvr": 0.9413769841194153, "loss_mode_switch": 0.0, "loss_total": 0.4811056852340698, "step": 611 }, { "batch_size": 4, "epoch": 0.2444, "step": 611, "tokens_per_device": 3816 }, { "epoch": 0.2444, "loss_ce": 0.17329829931259155, "loss_lvr": 0.9545562267303467, "loss_mode_switch": 0.0, "loss_total": 0.26875391602516174, "step": 611 }, { "batch_size": 4, "epoch": 0.2444, "step": 611, "tokens_per_device": 3928 }, { "epoch": 0.2444, "loss_ce": 0.39794886112213135, "loss_lvr": 0.884936511516571, "loss_mode_switch": 0.0, "loss_total": 0.486442506313324, "step": 611 }, { "epoch": 0.2448, "grad_norm": 1.4041736125946045, "learning_rate": 8.838075437731844e-06, "loss": 0.2954, "step": 612 }, { "batch_size": 4, "epoch": 0.2448, "step": 612, "tokens_per_device": 3948 }, { "epoch": 0.2448, "loss_ce": 0.05628138780593872, "loss_lvr": 0.9039642810821533, "loss_mode_switch": 0.0, "loss_total": 0.14667782187461853, "step": 612 }, { "batch_size": 4, "epoch": 0.2448, "step": 612, "tokens_per_device": 1268 }, { "epoch": 0.2448, "loss_ce": 0.47629570960998535, "loss_lvr": 1.0220094919204712, "loss_mode_switch": 0.0, "loss_total": 0.5784966349601746, "step": 612 }, { "batch_size": 4, "epoch": 0.2448, "step": 612, "tokens_per_device": 1484 }, { "epoch": 0.2448, "loss_ce": 0.30645686388015747, "loss_lvr": 1.2047816514968872, "loss_mode_switch": 0.0, "loss_total": 0.42693501710891724, "step": 612 }, { "batch_size": 4, "epoch": 0.2448, "step": 612, "tokens_per_device": 4564 }, { "epoch": 0.2448, "loss_ce": 0.0702584758400917, "loss_lvr": 0.9369350671768188, "loss_mode_switch": 0.0, "loss_total": 0.16395199298858643, "step": 612 }, { "batch_size": 1, "epoch": 0.2448, "step": 612, "tokens_per_device": 4887 }, { "epoch": 0.2448, "loss_ce": 0.003142036497592926, "loss_lvr": 0.4208517372608185, "loss_mode_switch": 0.0, "loss_total": 0.045227210968732834, "step": 612 }, { "batch_size": 1, "epoch": 0.2448, "step": 612, "tokens_per_device": 4881 }, { "epoch": 0.2448, "loss_ce": 0.035244446247816086, "loss_lvr": 0.18140597641468048, "loss_mode_switch": 0.0, "loss_total": 0.053385041654109955, "step": 612 }, { "batch_size": 4, "epoch": 0.2448, "step": 612, "tokens_per_device": 4204 }, { "epoch": 0.2448, "loss_ce": 0.2024359554052353, "loss_lvr": 0.7602952718734741, "loss_mode_switch": 0.0, "loss_total": 0.27846547961235046, "step": 612 }, { "batch_size": 4, "epoch": 0.2448, "step": 612, "tokens_per_device": 2628 }, { "epoch": 0.2448, "loss_ce": 0.14763392508029938, "loss_lvr": 0.7047014832496643, "loss_mode_switch": 0.0, "loss_total": 0.2181040644645691, "step": 612 }, { "epoch": 0.2452, "grad_norm": 1.464770793914795, "learning_rate": 8.833920709464146e-06, "loss": 0.2914, "step": 613 }, { "batch_size": 4, "epoch": 0.2452, "step": 613, "tokens_per_device": 11036 }, { "epoch": 0.2452, "loss_ce": 0.613227128982544, "loss_lvr": 1.0120590925216675, "loss_mode_switch": 0.0, "loss_total": 0.7144330143928528, "step": 613 }, { "batch_size": 4, "epoch": 0.2452, "step": 613, "tokens_per_device": 5696 }, { "epoch": 0.2452, "loss_ce": 0.15156665444374084, "loss_lvr": 0.8895769119262695, "loss_mode_switch": 0.0, "loss_total": 0.24052435159683228, "step": 613 }, { "batch_size": 1, "epoch": 0.2452, "step": 613, "tokens_per_device": 4883 }, { "epoch": 0.2452, "loss_ce": 0.22678861021995544, "loss_lvr": 0.3132525682449341, "loss_mode_switch": 0.0, "loss_total": 0.2581138610839844, "step": 613 }, { "batch_size": 4, "epoch": 0.2452, "step": 613, "tokens_per_device": 1252 }, { "epoch": 0.2452, "loss_ce": 0.5416854023933411, "loss_lvr": 0.8676506280899048, "loss_mode_switch": 0.0, "loss_total": 0.6284504532814026, "step": 613 }, { "batch_size": 4, "epoch": 0.2452, "step": 613, "tokens_per_device": 1944 }, { "epoch": 0.2452, "loss_ce": 0.13549284636974335, "loss_lvr": 1.074424386024475, "loss_mode_switch": 0.0, "loss_total": 0.24293528497219086, "step": 613 }, { "batch_size": 1, "epoch": 0.2452, "step": 613, "tokens_per_device": 4896 }, { "epoch": 0.2452, "loss_ce": 0.024333534762263298, "loss_lvr": 0.8798646330833435, "loss_mode_switch": 0.0, "loss_total": 0.11231999844312668, "step": 613 }, { "batch_size": 1, "epoch": 0.2452, "step": 613, "tokens_per_device": 5207 }, { "epoch": 0.2452, "loss_ce": 0.05186738818883896, "loss_lvr": 0.7498722672462463, "loss_mode_switch": 0.0, "loss_total": 0.12685461342334747, "step": 613 }, { "batch_size": 4, "epoch": 0.2452, "step": 613, "tokens_per_device": 5760 }, { "epoch": 0.2452, "loss_ce": 0.3286043107509613, "loss_lvr": 1.0978788137435913, "loss_mode_switch": 0.0, "loss_total": 0.43839219212532043, "step": 613 }, { "epoch": 0.2456, "grad_norm": 1.4231873750686646, "learning_rate": 8.829759546629474e-06, "loss": 0.3447, "step": 614 }, { "batch_size": 4, "epoch": 0.2456, "step": 614, "tokens_per_device": 4296 }, { "epoch": 0.2456, "loss_ce": 0.29243993759155273, "loss_lvr": 0.800916314125061, "loss_mode_switch": 0.0, "loss_total": 0.37253156304359436, "step": 614 }, { "batch_size": 4, "epoch": 0.2456, "step": 614, "tokens_per_device": 2772 }, { "epoch": 0.2456, "loss_ce": 0.49702152609825134, "loss_lvr": 0.6622428297996521, "loss_mode_switch": 0.0, "loss_total": 0.5632458329200745, "step": 614 }, { "batch_size": 4, "epoch": 0.2456, "step": 614, "tokens_per_device": 3776 }, { "epoch": 0.2456, "loss_ce": 0.9555911421775818, "loss_lvr": 1.729496717453003, "loss_mode_switch": 0.0, "loss_total": 1.1285407543182373, "step": 614 }, { "batch_size": 4, "epoch": 0.2456, "step": 614, "tokens_per_device": 3944 }, { "epoch": 0.2456, "loss_ce": 0.10946863144636154, "loss_lvr": 1.1895198822021484, "loss_mode_switch": 0.0, "loss_total": 0.22842061519622803, "step": 614 }, { "batch_size": 1, "epoch": 0.2456, "step": 614, "tokens_per_device": 5608 }, { "epoch": 0.2456, "loss_ce": 0.14018869400024414, "loss_lvr": 0.48093536496162415, "loss_mode_switch": 0.0, "loss_total": 0.18828223645687103, "step": 614 }, { "batch_size": 4, "epoch": 0.2456, "step": 614, "tokens_per_device": 1288 }, { "epoch": 0.2456, "loss_ce": 0.10838524252176285, "loss_lvr": 1.0321614742279053, "loss_mode_switch": 0.0, "loss_total": 0.2116013914346695, "step": 614 }, { "batch_size": 1, "epoch": 0.2456, "step": 614, "tokens_per_device": 4870 }, { "epoch": 0.2456, "loss_ce": 0.008711365982890129, "loss_lvr": 1.0807535648345947, "loss_mode_switch": 0.0, "loss_total": 0.11678672581911087, "step": 614 }, { "batch_size": 4, "epoch": 0.2456, "step": 614, "tokens_per_device": 2548 }, { "epoch": 0.2456, "loss_ce": 0.1405174434185028, "loss_lvr": 0.8884611129760742, "loss_mode_switch": 0.0, "loss_total": 0.2293635606765747, "step": 614 }, { "epoch": 0.246, "grad_norm": 1.4769763946533203, "learning_rate": 8.825591956211614e-06, "loss": 0.3558, "step": 615 }, { "batch_size": 4, "epoch": 0.246, "step": 615, "tokens_per_device": 7432 }, { "epoch": 0.246, "loss_ce": 0.08851838111877441, "loss_lvr": 1.0273946523666382, "loss_mode_switch": 0.0, "loss_total": 0.19125784933567047, "step": 615 }, { "batch_size": 4, "epoch": 0.246, "step": 615, "tokens_per_device": 4324 }, { "epoch": 0.246, "loss_ce": 0.014452806673943996, "loss_lvr": 0.49602267146110535, "loss_mode_switch": 0.0, "loss_total": 0.06405507028102875, "step": 615 }, { "batch_size": 4, "epoch": 0.246, "step": 615, "tokens_per_device": 5668 }, { "epoch": 0.246, "loss_ce": 0.03715745359659195, "loss_lvr": 0.8346701264381409, "loss_mode_switch": 0.0, "loss_total": 0.12062446773052216, "step": 615 }, { "batch_size": 4, "epoch": 0.246, "step": 615, "tokens_per_device": 3788 }, { "epoch": 0.246, "loss_ce": 1.180479645729065, "loss_lvr": 0.7059447765350342, "loss_mode_switch": 0.0, "loss_total": 1.2510740756988525, "step": 615 }, { "batch_size": 1, "epoch": 0.246, "step": 615, "tokens_per_device": 4944 }, { "epoch": 0.246, "loss_ce": 0.02136407420039177, "loss_lvr": 0.4541071057319641, "loss_mode_switch": 0.0, "loss_total": 0.06677478551864624, "step": 615 }, { "batch_size": 4, "epoch": 0.246, "step": 615, "tokens_per_device": 14880 }, { "epoch": 0.246, "loss_ce": 0.2167677879333496, "loss_lvr": 0.5628696084022522, "loss_mode_switch": 0.0, "loss_total": 0.27305474877357483, "step": 615 }, { "batch_size": 4, "epoch": 0.246, "step": 615, "tokens_per_device": 3208 }, { "epoch": 0.246, "loss_ce": 0.2339474856853485, "loss_lvr": 0.4457210600376129, "loss_mode_switch": 0.0, "loss_total": 0.2785196006298065, "step": 615 }, { "batch_size": 1, "epoch": 0.246, "step": 615, "tokens_per_device": 5081 }, { "epoch": 0.246, "loss_ce": 0.07211387902498245, "loss_lvr": 0.35531705617904663, "loss_mode_switch": 0.0, "loss_total": 0.10764558613300323, "step": 615 }, { "epoch": 0.2464, "grad_norm": 1.2621722221374512, "learning_rate": 8.82141794520514e-06, "loss": 0.3155, "step": 616 }, { "batch_size": 4, "epoch": 0.2464, "step": 616, "tokens_per_device": 1436 }, { "epoch": 0.2464, "loss_ce": 0.37802019715309143, "loss_lvr": 1.1628446578979492, "loss_mode_switch": 0.0, "loss_total": 0.4943046569824219, "step": 616 }, { "batch_size": 4, "epoch": 0.2464, "step": 616, "tokens_per_device": 1216 }, { "epoch": 0.2464, "loss_ce": 0.39435821771621704, "loss_lvr": 1.1334965229034424, "loss_mode_switch": 0.0, "loss_total": 0.5077078938484192, "step": 616 }, { "batch_size": 4, "epoch": 0.2464, "step": 616, "tokens_per_device": 3352 }, { "epoch": 0.2464, "loss_ce": 0.03902049735188484, "loss_lvr": 1.2509002685546875, "loss_mode_switch": 0.0, "loss_total": 0.16411052644252777, "step": 616 }, { "batch_size": 1, "epoch": 0.2464, "step": 616, "tokens_per_device": 4869 }, { "epoch": 0.2464, "loss_ce": 0.05072503909468651, "loss_lvr": 0.2683507204055786, "loss_mode_switch": 0.0, "loss_total": 0.07756011188030243, "step": 616 }, { "batch_size": 4, "epoch": 0.2464, "step": 616, "tokens_per_device": 5864 }, { "epoch": 0.2464, "loss_ce": 0.07605118304491043, "loss_lvr": 0.8833156228065491, "loss_mode_switch": 0.0, "loss_total": 0.16438275575637817, "step": 616 }, { "batch_size": 4, "epoch": 0.2464, "step": 616, "tokens_per_device": 5044 }, { "epoch": 0.2464, "loss_ce": 0.21067383885383606, "loss_lvr": 0.9554809331893921, "loss_mode_switch": 0.0, "loss_total": 0.30622193217277527, "step": 616 }, { "batch_size": 4, "epoch": 0.2464, "step": 616, "tokens_per_device": 4264 }, { "epoch": 0.2464, "loss_ce": 0.13166697323322296, "loss_lvr": 1.003706932067871, "loss_mode_switch": 0.0, "loss_total": 0.23203766345977783, "step": 616 }, { "batch_size": 4, "epoch": 0.2464, "step": 616, "tokens_per_device": 4872 }, { "epoch": 0.2464, "loss_ce": 0.030967380851507187, "loss_lvr": 0.8920193314552307, "loss_mode_switch": 0.0, "loss_total": 0.12016931176185608, "step": 616 }, { "epoch": 0.2468, "grad_norm": 1.2787699699401855, "learning_rate": 8.817237520615398e-06, "loss": 0.264, "step": 617 }, { "batch_size": 4, "epoch": 0.2468, "step": 617, "tokens_per_device": 3792 }, { "epoch": 0.2468, "loss_ce": 0.08887924998998642, "loss_lvr": 0.8872947096824646, "loss_mode_switch": 0.0, "loss_total": 0.17760872840881348, "step": 617 }, { "batch_size": 1, "epoch": 0.2468, "step": 617, "tokens_per_device": 5054 }, { "epoch": 0.2468, "loss_ce": 0.12502266466617584, "loss_lvr": 0.23154699802398682, "loss_mode_switch": 0.0, "loss_total": 0.148177370429039, "step": 617 }, { "batch_size": 1, "epoch": 0.2468, "step": 617, "tokens_per_device": 5147 }, { "epoch": 0.2468, "loss_ce": 0.011018619872629642, "loss_lvr": 0.38900354504585266, "loss_mode_switch": 0.0, "loss_total": 0.04991897568106651, "step": 617 }, { "batch_size": 4, "epoch": 0.2468, "step": 617, "tokens_per_device": 4516 }, { "epoch": 0.2468, "loss_ce": 0.2691083252429962, "loss_lvr": 0.8100869655609131, "loss_mode_switch": 0.0, "loss_total": 0.350117027759552, "step": 617 }, { "batch_size": 4, "epoch": 0.2468, "step": 617, "tokens_per_device": 4212 }, { "epoch": 0.2468, "loss_ce": 0.03568485751748085, "loss_lvr": 1.2784990072250366, "loss_mode_switch": 0.0, "loss_total": 0.1635347604751587, "step": 617 }, { "batch_size": 4, "epoch": 0.2468, "step": 617, "tokens_per_device": 6044 }, { "epoch": 0.2468, "loss_ce": 0.13893289864063263, "loss_lvr": 1.2315586805343628, "loss_mode_switch": 0.0, "loss_total": 0.2620887756347656, "step": 617 }, { "batch_size": 4, "epoch": 0.2468, "step": 617, "tokens_per_device": 1376 }, { "epoch": 0.2468, "loss_ce": 0.352316290140152, "loss_lvr": 1.1313896179199219, "loss_mode_switch": 0.0, "loss_total": 0.4654552638530731, "step": 617 }, { "batch_size": 1, "epoch": 0.2468, "step": 617, "tokens_per_device": 4677 }, { "epoch": 0.2468, "loss_ce": 0.09420636296272278, "loss_lvr": 0.6983316540718079, "loss_mode_switch": 0.0, "loss_total": 0.1640395224094391, "step": 617 }, { "epoch": 0.2472, "grad_norm": 1.6755908727645874, "learning_rate": 8.813050689458502e-06, "loss": 0.2945, "step": 618 }, { "batch_size": 4, "epoch": 0.2472, "step": 618, "tokens_per_device": 4696 }, { "epoch": 0.2472, "loss_ce": 0.6420396566390991, "loss_lvr": 0.9951736330986023, "loss_mode_switch": 0.0, "loss_total": 0.7415570020675659, "step": 618 }, { "batch_size": 4, "epoch": 0.2472, "step": 618, "tokens_per_device": 4824 }, { "epoch": 0.2472, "loss_ce": 0.05362807214260101, "loss_lvr": 0.8299703598022461, "loss_mode_switch": 0.0, "loss_total": 0.13662511110305786, "step": 618 }, { "batch_size": 4, "epoch": 0.2472, "step": 618, "tokens_per_device": 3964 }, { "epoch": 0.2472, "loss_ce": 0.04386402294039726, "loss_lvr": 1.2393102645874023, "loss_mode_switch": 0.0, "loss_total": 0.16779504716396332, "step": 618 }, { "batch_size": 1, "epoch": 0.2472, "step": 618, "tokens_per_device": 4788 }, { "epoch": 0.2472, "loss_ce": 0.0008872319012880325, "loss_lvr": 0.4247630536556244, "loss_mode_switch": 0.0, "loss_total": 0.0433635413646698, "step": 618 }, { "batch_size": 4, "epoch": 0.2472, "step": 618, "tokens_per_device": 3912 }, { "epoch": 0.2472, "loss_ce": 0.2788952887058258, "loss_lvr": 1.1128579378128052, "loss_mode_switch": 0.0, "loss_total": 0.3901810944080353, "step": 618 }, { "batch_size": 4, "epoch": 0.2472, "step": 618, "tokens_per_device": 3756 }, { "epoch": 0.2472, "loss_ce": 0.5519714951515198, "loss_lvr": 1.1745082139968872, "loss_mode_switch": 0.0, "loss_total": 0.6694223284721375, "step": 618 }, { "batch_size": 4, "epoch": 0.2472, "step": 618, "tokens_per_device": 4848 }, { "epoch": 0.2472, "loss_ce": 0.2510041296482086, "loss_lvr": 1.1340211629867554, "loss_mode_switch": 0.0, "loss_total": 0.3644062578678131, "step": 618 }, { "batch_size": 4, "epoch": 0.2472, "step": 618, "tokens_per_device": 7860 }, { "epoch": 0.2472, "loss_ce": 0.13997414708137512, "loss_lvr": 0.7259370684623718, "loss_mode_switch": 0.0, "loss_total": 0.21256786584854126, "step": 618 }, { "epoch": 0.2476, "grad_norm": 1.2662273645401, "learning_rate": 8.80885745876132e-06, "loss": 0.3188, "step": 619 }, { "batch_size": 4, "epoch": 0.2476, "step": 619, "tokens_per_device": 2556 }, { "epoch": 0.2476, "loss_ce": 0.3998764753341675, "loss_lvr": 1.1438102722167969, "loss_mode_switch": 0.0, "loss_total": 0.5142574906349182, "step": 619 }, { "batch_size": 4, "epoch": 0.2476, "step": 619, "tokens_per_device": 6556 }, { "epoch": 0.2476, "loss_ce": 0.40096816420555115, "loss_lvr": 1.3671091794967651, "loss_mode_switch": 0.0, "loss_total": 0.5376790761947632, "step": 619 }, { "batch_size": 4, "epoch": 0.2476, "step": 619, "tokens_per_device": 1328 }, { "epoch": 0.2476, "loss_ce": 0.23904626071453094, "loss_lvr": 1.0669598579406738, "loss_mode_switch": 0.0, "loss_total": 0.34574225544929504, "step": 619 }, { "batch_size": 4, "epoch": 0.2476, "step": 619, "tokens_per_device": 2692 }, { "epoch": 0.2476, "loss_ce": 0.29800599813461304, "loss_lvr": 1.0911359786987305, "loss_mode_switch": 0.0, "loss_total": 0.40711960196495056, "step": 619 }, { "batch_size": 1, "epoch": 0.2476, "step": 619, "tokens_per_device": 5143 }, { "epoch": 0.2476, "loss_ce": 0.0014463044935837388, "loss_lvr": 0.8513916730880737, "loss_mode_switch": 0.0, "loss_total": 0.08658547699451447, "step": 619 }, { "batch_size": 4, "epoch": 0.2476, "step": 619, "tokens_per_device": 4400 }, { "epoch": 0.2476, "loss_ce": 0.43600061535835266, "loss_lvr": 1.2064608335494995, "loss_mode_switch": 0.0, "loss_total": 0.5566467046737671, "step": 619 }, { "batch_size": 4, "epoch": 0.2476, "step": 619, "tokens_per_device": 1352 }, { "epoch": 0.2476, "loss_ce": 0.4964042007923126, "loss_lvr": 1.2511427402496338, "loss_mode_switch": 0.0, "loss_total": 0.6215184926986694, "step": 619 }, { "batch_size": 1, "epoch": 0.2476, "step": 619, "tokens_per_device": 5179 }, { "epoch": 0.2476, "loss_ce": 0.296394407749176, "loss_lvr": 0.949425995349884, "loss_mode_switch": 0.0, "loss_total": 0.39133700728416443, "step": 619 }, { "epoch": 0.248, "grad_norm": 1.3736571073532104, "learning_rate": 8.804657835561456e-06, "loss": 0.3165, "step": 620 }, { "batch_size": 4, "epoch": 0.248, "step": 620, "tokens_per_device": 4268 }, { "epoch": 0.248, "loss_ce": 0.3729245662689209, "loss_lvr": 0.8568162322044373, "loss_mode_switch": 0.0, "loss_total": 0.45860618352890015, "step": 620 }, { "batch_size": 1, "epoch": 0.248, "step": 620, "tokens_per_device": 5108 }, { "epoch": 0.248, "loss_ce": 0.005028360988944769, "loss_lvr": 0.46809202432632446, "loss_mode_switch": 0.0, "loss_total": 0.05183756351470947, "step": 620 }, { "batch_size": 4, "epoch": 0.248, "step": 620, "tokens_per_device": 4224 }, { "epoch": 0.248, "loss_ce": 0.05395194888114929, "loss_lvr": 0.8657503724098206, "loss_mode_switch": 0.0, "loss_total": 0.14052698016166687, "step": 620 }, { "batch_size": 1, "epoch": 0.248, "step": 620, "tokens_per_device": 4741 }, { "epoch": 0.248, "loss_ce": 0.09195704758167267, "loss_lvr": 0.2260129600763321, "loss_mode_switch": 0.0, "loss_total": 0.11455834656953812, "step": 620 }, { "batch_size": 1, "epoch": 0.248, "step": 620, "tokens_per_device": 5198 }, { "epoch": 0.248, "loss_ce": 0.05570778250694275, "loss_lvr": 0.44178396463394165, "loss_mode_switch": 0.0, "loss_total": 0.09988617897033691, "step": 620 }, { "batch_size": 4, "epoch": 0.248, "step": 620, "tokens_per_device": 6924 }, { "epoch": 0.248, "loss_ce": 0.11791771650314331, "loss_lvr": 0.7316188812255859, "loss_mode_switch": 0.0, "loss_total": 0.19107961654663086, "step": 620 }, { "batch_size": 4, "epoch": 0.248, "step": 620, "tokens_per_device": 1504 }, { "epoch": 0.248, "loss_ce": 0.6201726198196411, "loss_lvr": 0.9130269885063171, "loss_mode_switch": 0.0, "loss_total": 0.7114753127098083, "step": 620 }, { "batch_size": 4, "epoch": 0.248, "step": 620, "tokens_per_device": 5264 }, { "epoch": 0.248, "loss_ce": 0.474621444940567, "loss_lvr": 0.8982073664665222, "loss_mode_switch": 0.0, "loss_total": 0.5644421577453613, "step": 620 }, { "epoch": 0.2484, "grad_norm": 2.103450298309326, "learning_rate": 8.800451826907245e-06, "loss": 0.3216, "step": 621 }, { "batch_size": 4, "epoch": 0.2484, "step": 621, "tokens_per_device": 5268 }, { "epoch": 0.2484, "loss_ce": 0.16622574627399445, "loss_lvr": 0.7268642783164978, "loss_mode_switch": 0.0, "loss_total": 0.2389121651649475, "step": 621 }, { "batch_size": 1, "epoch": 0.2484, "step": 621, "tokens_per_device": 4903 }, { "epoch": 0.2484, "loss_ce": 0.346375048160553, "loss_lvr": 0.6290679574012756, "loss_mode_switch": 0.0, "loss_total": 0.409281849861145, "step": 621 }, { "batch_size": 4, "epoch": 0.2484, "step": 621, "tokens_per_device": 4436 }, { "epoch": 0.2484, "loss_ce": 0.03216628357768059, "loss_lvr": 0.9980306625366211, "loss_mode_switch": 0.0, "loss_total": 0.13196934759616852, "step": 621 }, { "batch_size": 4, "epoch": 0.2484, "step": 621, "tokens_per_device": 4040 }, { "epoch": 0.2484, "loss_ce": 0.43964192271232605, "loss_lvr": 0.7942652106285095, "loss_mode_switch": 0.0, "loss_total": 0.5190684199333191, "step": 621 }, { "batch_size": 4, "epoch": 0.2484, "step": 621, "tokens_per_device": 2812 }, { "epoch": 0.2484, "loss_ce": 0.06553811579942703, "loss_lvr": 0.6123279333114624, "loss_mode_switch": 0.0, "loss_total": 0.12677091360092163, "step": 621 }, { "batch_size": 1, "epoch": 0.2484, "step": 621, "tokens_per_device": 4774 }, { "epoch": 0.2484, "loss_ce": 0.11029491573572159, "loss_lvr": 0.26635223627090454, "loss_mode_switch": 0.0, "loss_total": 0.13693013787269592, "step": 621 }, { "batch_size": 4, "epoch": 0.2484, "step": 621, "tokens_per_device": 4756 }, { "epoch": 0.2484, "loss_ce": 0.3981372117996216, "loss_lvr": 0.9873345494270325, "loss_mode_switch": 0.0, "loss_total": 0.49687066674232483, "step": 621 }, { "batch_size": 4, "epoch": 0.2484, "step": 621, "tokens_per_device": 1632 }, { "epoch": 0.2484, "loss_ce": 0.3988870084285736, "loss_lvr": 0.9975666999816895, "loss_mode_switch": 0.0, "loss_total": 0.498643696308136, "step": 621 }, { "epoch": 0.2488, "grad_norm": 1.4228681325912476, "learning_rate": 8.79623943985774e-06, "loss": 0.3139, "step": 622 }, { "batch_size": 4, "epoch": 0.2488, "step": 622, "tokens_per_device": 5684 }, { "epoch": 0.2488, "loss_ce": 0.051559798419475555, "loss_lvr": 0.4292525053024292, "loss_mode_switch": 0.0, "loss_total": 0.09448504447937012, "step": 622 }, { "batch_size": 4, "epoch": 0.2488, "step": 622, "tokens_per_device": 4196 }, { "epoch": 0.2488, "loss_ce": 0.16032202541828156, "loss_lvr": 0.9984845519065857, "loss_mode_switch": 0.0, "loss_total": 0.26017048954963684, "step": 622 }, { "batch_size": 4, "epoch": 0.2488, "step": 622, "tokens_per_device": 10464 }, { "epoch": 0.2488, "loss_ce": 0.14097344875335693, "loss_lvr": 0.9094133973121643, "loss_mode_switch": 0.0, "loss_total": 0.23191478848457336, "step": 622 }, { "batch_size": 4, "epoch": 0.2488, "step": 622, "tokens_per_device": 3004 }, { "epoch": 0.2488, "loss_ce": 0.23481033742427826, "loss_lvr": 0.5540004968643188, "loss_mode_switch": 0.0, "loss_total": 0.29021039605140686, "step": 622 }, { "batch_size": 4, "epoch": 0.2488, "step": 622, "tokens_per_device": 11912 }, { "epoch": 0.2488, "loss_ce": 0.2643308937549591, "loss_lvr": 0.7821869254112244, "loss_mode_switch": 0.0, "loss_total": 0.342549592256546, "step": 622 }, { "batch_size": 4, "epoch": 0.2488, "step": 622, "tokens_per_device": 5748 }, { "epoch": 0.2488, "loss_ce": 0.2222628891468048, "loss_lvr": 0.9912785887718201, "loss_mode_switch": 0.0, "loss_total": 0.3213907480239868, "step": 622 }, { "batch_size": 4, "epoch": 0.2488, "step": 622, "tokens_per_device": 3792 }, { "epoch": 0.2488, "loss_ce": 0.7325997352600098, "loss_lvr": 0.9213348031044006, "loss_mode_switch": 0.0, "loss_total": 0.8247331976890564, "step": 622 }, { "batch_size": 4, "epoch": 0.2488, "step": 622, "tokens_per_device": 15164 }, { "epoch": 0.2488, "loss_ce": 0.07091391086578369, "loss_lvr": 0.5497263669967651, "loss_mode_switch": 0.0, "loss_total": 0.12588654458522797, "step": 622 }, { "epoch": 0.2492, "grad_norm": 1.3274593353271484, "learning_rate": 8.792020681482698e-06, "loss": 0.3117, "step": 623 }, { "batch_size": 4, "epoch": 0.2492, "step": 623, "tokens_per_device": 1752 }, { "epoch": 0.2492, "loss_ce": 0.32280582189559937, "loss_lvr": 0.8367169499397278, "loss_mode_switch": 0.0, "loss_total": 0.40647751092910767, "step": 623 }, { "batch_size": 4, "epoch": 0.2492, "step": 623, "tokens_per_device": 4276 }, { "epoch": 0.2492, "loss_ce": 0.10121989250183105, "loss_lvr": 1.0632808208465576, "loss_mode_switch": 0.0, "loss_total": 0.20754797756671906, "step": 623 }, { "batch_size": 1, "epoch": 0.2492, "step": 623, "tokens_per_device": 4759 }, { "epoch": 0.2492, "loss_ce": 0.3401170074939728, "loss_lvr": 0.22701214253902435, "loss_mode_switch": 0.0, "loss_total": 0.3628182113170624, "step": 623 }, { "batch_size": 4, "epoch": 0.2492, "step": 623, "tokens_per_device": 15260 }, { "epoch": 0.2492, "loss_ce": 0.20069755613803864, "loss_lvr": 0.733482837677002, "loss_mode_switch": 0.0, "loss_total": 0.27404582500457764, "step": 623 }, { "batch_size": 4, "epoch": 0.2492, "step": 623, "tokens_per_device": 1292 }, { "epoch": 0.2492, "loss_ce": 0.09600139409303665, "loss_lvr": 1.432090401649475, "loss_mode_switch": 0.0, "loss_total": 0.23921042680740356, "step": 623 }, { "batch_size": 4, "epoch": 0.2492, "step": 623, "tokens_per_device": 5016 }, { "epoch": 0.2492, "loss_ce": 0.2566077709197998, "loss_lvr": 1.2448503971099854, "loss_mode_switch": 0.0, "loss_total": 0.3810928165912628, "step": 623 }, { "batch_size": 4, "epoch": 0.2492, "step": 623, "tokens_per_device": 5696 }, { "epoch": 0.2492, "loss_ce": 0.25995051860809326, "loss_lvr": 1.0393075942993164, "loss_mode_switch": 0.0, "loss_total": 0.36388128995895386, "step": 623 }, { "batch_size": 4, "epoch": 0.2492, "step": 623, "tokens_per_device": 5484 }, { "epoch": 0.2492, "loss_ce": 0.12620675563812256, "loss_lvr": 0.5937494039535522, "loss_mode_switch": 0.0, "loss_total": 0.18558169901371002, "step": 623 }, { "epoch": 0.2496, "grad_norm": 1.5917538404464722, "learning_rate": 8.787795558862566e-06, "loss": 0.3344, "step": 624 }, { "batch_size": 1, "epoch": 0.2496, "step": 624, "tokens_per_device": 4355 }, { "epoch": 0.2496, "loss_ce": 0.0496758297085762, "loss_lvr": 0.2956829369068146, "loss_mode_switch": 0.0, "loss_total": 0.07924412190914154, "step": 624 }, { "batch_size": 4, "epoch": 0.2496, "step": 624, "tokens_per_device": 1224 }, { "epoch": 0.2496, "loss_ce": 0.02650594152510166, "loss_lvr": 1.373863697052002, "loss_mode_switch": 0.0, "loss_total": 0.16389231383800507, "step": 624 }, { "batch_size": 4, "epoch": 0.2496, "step": 624, "tokens_per_device": 1332 }, { "epoch": 0.2496, "loss_ce": 0.5982279777526855, "loss_lvr": 0.9327887892723083, "loss_mode_switch": 0.0, "loss_total": 0.6915068626403809, "step": 624 }, { "batch_size": 4, "epoch": 0.2496, "step": 624, "tokens_per_device": 10568 }, { "epoch": 0.2496, "loss_ce": 0.6090906262397766, "loss_lvr": 0.9533355832099915, "loss_mode_switch": 0.0, "loss_total": 0.7044242024421692, "step": 624 }, { "batch_size": 4, "epoch": 0.2496, "step": 624, "tokens_per_device": 2860 }, { "epoch": 0.2496, "loss_ce": 0.19218827784061432, "loss_lvr": 0.9625365734100342, "loss_mode_switch": 0.0, "loss_total": 0.288441926240921, "step": 624 }, { "batch_size": 4, "epoch": 0.2496, "step": 624, "tokens_per_device": 4576 }, { "epoch": 0.2496, "loss_ce": 0.2836476266384125, "loss_lvr": 1.042817234992981, "loss_mode_switch": 0.0, "loss_total": 0.38792935013771057, "step": 624 }, { "batch_size": 4, "epoch": 0.2496, "step": 624, "tokens_per_device": 5112 }, { "epoch": 0.2496, "loss_ce": 0.241743266582489, "loss_lvr": 0.8690841794013977, "loss_mode_switch": 0.0, "loss_total": 0.32865169644355774, "step": 624 }, { "batch_size": 4, "epoch": 0.2496, "step": 624, "tokens_per_device": 4200 }, { "epoch": 0.2496, "loss_ce": 0.439973920583725, "loss_lvr": 0.6453597545623779, "loss_mode_switch": 0.0, "loss_total": 0.5045099258422852, "step": 624 }, { "epoch": 0.25, "grad_norm": 1.4869598150253296, "learning_rate": 8.783564079088478e-06, "loss": 0.3412, "step": 625 }, { "batch_size": 4, "epoch": 0.25, "step": 625, "tokens_per_device": 6424 }, { "epoch": 0.25, "loss_ce": 0.02611801214516163, "loss_lvr": 0.7966024279594421, "loss_mode_switch": 0.0, "loss_total": 0.10577825456857681, "step": 625 }, { "batch_size": 4, "epoch": 0.25, "step": 625, "tokens_per_device": 8784 }, { "epoch": 0.25, "loss_ce": 0.3159588575363159, "loss_lvr": 0.5631386637687683, "loss_mode_switch": 0.0, "loss_total": 0.3722727298736572, "step": 625 }, { "batch_size": 1, "epoch": 0.25, "step": 625, "tokens_per_device": 6129 }, { "epoch": 0.25, "loss_ce": 0.004349254071712494, "loss_lvr": 0.40610596537590027, "loss_mode_switch": 0.0, "loss_total": 0.04495985060930252, "step": 625 }, { "batch_size": 1, "epoch": 0.25, "step": 625, "tokens_per_device": 4911 }, { "epoch": 0.25, "loss_ce": 0.009214580059051514, "loss_lvr": 0.26862645149230957, "loss_mode_switch": 0.0, "loss_total": 0.03607722371816635, "step": 625 }, { "batch_size": 1, "epoch": 0.25, "step": 625, "tokens_per_device": 4842 }, { "epoch": 0.25, "loss_ce": 0.00345807196572423, "loss_lvr": 0.6008438467979431, "loss_mode_switch": 0.0, "loss_total": 0.0635424554347992, "step": 625 }, { "batch_size": 4, "epoch": 0.25, "step": 625, "tokens_per_device": 3928 }, { "epoch": 0.25, "loss_ce": 0.2641451358795166, "loss_lvr": 0.8651684522628784, "loss_mode_switch": 0.0, "loss_total": 0.3506619930267334, "step": 625 }, { "batch_size": 4, "epoch": 0.25, "step": 625, "tokens_per_device": 3940 }, { "epoch": 0.25, "loss_ce": 0.02350768633186817, "loss_lvr": 0.9699378609657288, "loss_mode_switch": 0.0, "loss_total": 0.12050147354602814, "step": 625 }, { "batch_size": 4, "epoch": 0.25, "step": 625, "tokens_per_device": 4556 }, { "epoch": 0.25, "loss_ce": 0.3751550614833832, "loss_lvr": 0.7977858185768127, "loss_mode_switch": 0.0, "loss_total": 0.45493364334106445, "step": 625 }, { "epoch": 0.2504, "grad_norm": 1.4789621829986572, "learning_rate": 8.779326249262232e-06, "loss": 0.3052, "step": 626 }, { "batch_size": 4, "epoch": 0.2504, "step": 626, "tokens_per_device": 4236 }, { "epoch": 0.2504, "loss_ce": 0.2476540058851242, "loss_lvr": 1.4007996320724487, "loss_mode_switch": 0.0, "loss_total": 0.38773396611213684, "step": 626 }, { "batch_size": 4, "epoch": 0.2504, "step": 626, "tokens_per_device": 4224 }, { "epoch": 0.2504, "loss_ce": 0.4225974977016449, "loss_lvr": 1.0635199546813965, "loss_mode_switch": 0.0, "loss_total": 0.528949499130249, "step": 626 }, { "batch_size": 4, "epoch": 0.2504, "step": 626, "tokens_per_device": 2680 }, { "epoch": 0.2504, "loss_ce": 0.14419598877429962, "loss_lvr": 1.5439387559890747, "loss_mode_switch": 0.0, "loss_total": 0.29858988523483276, "step": 626 }, { "batch_size": 4, "epoch": 0.2504, "step": 626, "tokens_per_device": 1412 }, { "epoch": 0.2504, "loss_ce": 0.5094455480575562, "loss_lvr": 1.0476926565170288, "loss_mode_switch": 0.0, "loss_total": 0.6142148375511169, "step": 626 }, { "batch_size": 4, "epoch": 0.2504, "step": 626, "tokens_per_device": 9052 }, { "epoch": 0.2504, "loss_ce": 0.21851567924022675, "loss_lvr": 0.934094250202179, "loss_mode_switch": 0.0, "loss_total": 0.31192511320114136, "step": 626 }, { "batch_size": 4, "epoch": 0.2504, "step": 626, "tokens_per_device": 4700 }, { "epoch": 0.2504, "loss_ce": 0.4682951271533966, "loss_lvr": 0.835366427898407, "loss_mode_switch": 0.0, "loss_total": 0.5518317818641663, "step": 626 }, { "batch_size": 4, "epoch": 0.2504, "step": 626, "tokens_per_device": 7304 }, { "epoch": 0.2504, "loss_ce": 0.03667692095041275, "loss_lvr": 0.6557053923606873, "loss_mode_switch": 0.0, "loss_total": 0.1022474616765976, "step": 626 }, { "batch_size": 1, "epoch": 0.2504, "step": 626, "tokens_per_device": 4814 }, { "epoch": 0.2504, "loss_ce": 0.0015268126735463738, "loss_lvr": 0.36517462134361267, "loss_mode_switch": 0.0, "loss_total": 0.0380442775785923, "step": 626 }, { "epoch": 0.2508, "grad_norm": 1.268163800239563, "learning_rate": 8.775082076496287e-06, "loss": 0.3046, "step": 627 }, { "batch_size": 4, "epoch": 0.2508, "step": 627, "tokens_per_device": 2608 }, { "epoch": 0.2508, "loss_ce": 0.09678371250629425, "loss_lvr": 0.8083025217056274, "loss_mode_switch": 0.0, "loss_total": 0.1776139736175537, "step": 627 }, { "batch_size": 4, "epoch": 0.2508, "step": 627, "tokens_per_device": 5128 }, { "epoch": 0.2508, "loss_ce": 0.11571000516414642, "loss_lvr": 0.8878729343414307, "loss_mode_switch": 0.0, "loss_total": 0.2044973075389862, "step": 627 }, { "batch_size": 4, "epoch": 0.2508, "step": 627, "tokens_per_device": 6672 }, { "epoch": 0.2508, "loss_ce": 0.16189706325531006, "loss_lvr": 0.8369070887565613, "loss_mode_switch": 0.0, "loss_total": 0.2455877661705017, "step": 627 }, { "batch_size": 1, "epoch": 0.2508, "step": 627, "tokens_per_device": 4878 }, { "epoch": 0.2508, "loss_ce": 0.0009598793694749475, "loss_lvr": 0.52220219373703, "loss_mode_switch": 0.0, "loss_total": 0.05318010225892067, "step": 627 }, { "batch_size": 1, "epoch": 0.2508, "step": 627, "tokens_per_device": 4965 }, { "epoch": 0.2508, "loss_ce": 0.02785634621977806, "loss_lvr": 0.4897318184375763, "loss_mode_switch": 0.0, "loss_total": 0.07682953029870987, "step": 627 }, { "batch_size": 4, "epoch": 0.2508, "step": 627, "tokens_per_device": 6604 }, { "epoch": 0.2508, "loss_ce": 0.02366413176059723, "loss_lvr": 0.7281576991081238, "loss_mode_switch": 0.0, "loss_total": 0.09647990018129349, "step": 627 }, { "batch_size": 4, "epoch": 0.2508, "step": 627, "tokens_per_device": 7176 }, { "epoch": 0.2508, "loss_ce": 0.08335810154676437, "loss_lvr": 0.7398536801338196, "loss_mode_switch": 0.0, "loss_total": 0.15734347701072693, "step": 627 }, { "batch_size": 4, "epoch": 0.2508, "step": 627, "tokens_per_device": 4216 }, { "epoch": 0.2508, "loss_ce": 0.2066766619682312, "loss_lvr": 0.9768499135971069, "loss_mode_switch": 0.0, "loss_total": 0.30436164140701294, "step": 627 }, { "epoch": 0.2512, "grad_norm": 1.2220810651779175, "learning_rate": 8.770831567913747e-06, "loss": 0.2487, "step": 628 }, { "batch_size": 4, "epoch": 0.2512, "step": 628, "tokens_per_device": 4212 }, { "epoch": 0.2512, "loss_ce": 0.33748894929885864, "loss_lvr": 0.9024437069892883, "loss_mode_switch": 0.0, "loss_total": 0.42773333191871643, "step": 628 }, { "batch_size": 4, "epoch": 0.2512, "step": 628, "tokens_per_device": 5252 }, { "epoch": 0.2512, "loss_ce": 0.2210656851530075, "loss_lvr": 0.9231635332107544, "loss_mode_switch": 0.0, "loss_total": 0.31338202953338623, "step": 628 }, { "batch_size": 4, "epoch": 0.2512, "step": 628, "tokens_per_device": 5904 }, { "epoch": 0.2512, "loss_ce": 0.04257693514227867, "loss_lvr": 0.6770674586296082, "loss_mode_switch": 0.0, "loss_total": 0.11028368771076202, "step": 628 }, { "batch_size": 4, "epoch": 0.2512, "step": 628, "tokens_per_device": 3800 }, { "epoch": 0.2512, "loss_ce": 0.050498008728027344, "loss_lvr": 1.2668817043304443, "loss_mode_switch": 0.0, "loss_total": 0.17718617618083954, "step": 628 }, { "batch_size": 4, "epoch": 0.2512, "step": 628, "tokens_per_device": 4216 }, { "epoch": 0.2512, "loss_ce": 0.08505098521709442, "loss_lvr": 1.0890566110610962, "loss_mode_switch": 0.0, "loss_total": 0.1939566433429718, "step": 628 }, { "batch_size": 4, "epoch": 0.2512, "step": 628, "tokens_per_device": 4268 }, { "epoch": 0.2512, "loss_ce": 0.41889873147010803, "loss_lvr": 1.39206063747406, "loss_mode_switch": 0.0, "loss_total": 0.5581048130989075, "step": 628 }, { "batch_size": 1, "epoch": 0.2512, "step": 628, "tokens_per_device": 5164 }, { "epoch": 0.2512, "loss_ce": 0.21056760847568512, "loss_lvr": 0.5610278844833374, "loss_mode_switch": 0.0, "loss_total": 0.2666704058647156, "step": 628 }, { "batch_size": 1, "epoch": 0.2512, "step": 628, "tokens_per_device": 5090 }, { "epoch": 0.2512, "loss_ce": 0.09122581779956818, "loss_lvr": 0.6167259216308594, "loss_mode_switch": 0.0, "loss_total": 0.1528984159231186, "step": 628 }, { "epoch": 0.2516, "grad_norm": 1.315434217453003, "learning_rate": 8.76657473064835e-06, "loss": 0.3207, "step": 629 }, { "batch_size": 4, "epoch": 0.2516, "step": 629, "tokens_per_device": 14368 }, { "epoch": 0.2516, "loss_ce": 0.613167405128479, "loss_lvr": 0.8596543669700623, "loss_mode_switch": 0.0, "loss_total": 0.6991328597068787, "step": 629 }, { "batch_size": 4, "epoch": 0.2516, "step": 629, "tokens_per_device": 4244 }, { "epoch": 0.2516, "loss_ce": 0.2168503999710083, "loss_lvr": 0.7587940692901611, "loss_mode_switch": 0.0, "loss_total": 0.29272979497909546, "step": 629 }, { "batch_size": 4, "epoch": 0.2516, "step": 629, "tokens_per_device": 3920 }, { "epoch": 0.2516, "loss_ce": 0.23812450468540192, "loss_lvr": 0.6703425645828247, "loss_mode_switch": 0.0, "loss_total": 0.3051587641239166, "step": 629 }, { "batch_size": 1, "epoch": 0.2516, "step": 629, "tokens_per_device": 5398 }, { "epoch": 0.2516, "loss_ce": 0.0009799494873732328, "loss_lvr": 0.9617531299591064, "loss_mode_switch": 0.0, "loss_total": 0.09715526551008224, "step": 629 }, { "batch_size": 1, "epoch": 0.2516, "step": 629, "tokens_per_device": 4893 }, { "epoch": 0.2516, "loss_ce": 0.05561273917555809, "loss_lvr": 1.300374984741211, "loss_mode_switch": 0.0, "loss_total": 0.18565024435520172, "step": 629 }, { "batch_size": 4, "epoch": 0.2516, "step": 629, "tokens_per_device": 2684 }, { "epoch": 0.2516, "loss_ce": 0.33949747681617737, "loss_lvr": 0.8588805198669434, "loss_mode_switch": 0.0, "loss_total": 0.4253855347633362, "step": 629 }, { "batch_size": 1, "epoch": 0.2516, "step": 629, "tokens_per_device": 6739 }, { "epoch": 0.2516, "loss_ce": 0.14908403158187866, "loss_lvr": 0.32752642035484314, "loss_mode_switch": 0.0, "loss_total": 0.18183667957782745, "step": 629 }, { "batch_size": 4, "epoch": 0.2516, "step": 629, "tokens_per_device": 3788 }, { "epoch": 0.2516, "loss_ce": 0.1533825546503067, "loss_lvr": 1.0362221002578735, "loss_mode_switch": 0.0, "loss_total": 0.2570047676563263, "step": 629 }, { "epoch": 0.252, "grad_norm": 1.4152706861495972, "learning_rate": 8.762311571844453e-06, "loss": 0.3215, "step": 630 }, { "batch_size": 4, "epoch": 0.252, "step": 630, "tokens_per_device": 4220 }, { "epoch": 0.252, "loss_ce": 0.18144725263118744, "loss_lvr": 1.0729519128799438, "loss_mode_switch": 0.0, "loss_total": 0.28874245285987854, "step": 630 }, { "batch_size": 1, "epoch": 0.252, "step": 630, "tokens_per_device": 5171 }, { "epoch": 0.252, "loss_ce": 0.05168507620692253, "loss_lvr": 0.6364376544952393, "loss_mode_switch": 0.0, "loss_total": 0.115328848361969, "step": 630 }, { "batch_size": 4, "epoch": 0.252, "step": 630, "tokens_per_device": 1560 }, { "epoch": 0.252, "loss_ce": 0.34597542881965637, "loss_lvr": 1.2940481901168823, "loss_mode_switch": 0.0, "loss_total": 0.4753802418708801, "step": 630 }, { "batch_size": 4, "epoch": 0.252, "step": 630, "tokens_per_device": 5144 }, { "epoch": 0.252, "loss_ce": 0.28340280055999756, "loss_lvr": 0.6768434047698975, "loss_mode_switch": 0.0, "loss_total": 0.35108715295791626, "step": 630 }, { "batch_size": 4, "epoch": 0.252, "step": 630, "tokens_per_device": 4764 }, { "epoch": 0.252, "loss_ce": 0.2702946662902832, "loss_lvr": 0.8745616674423218, "loss_mode_switch": 0.0, "loss_total": 0.3577508330345154, "step": 630 }, { "batch_size": 1, "epoch": 0.252, "step": 630, "tokens_per_device": 4994 }, { "epoch": 0.252, "loss_ce": 0.030808672308921814, "loss_lvr": 0.7166459560394287, "loss_mode_switch": 0.0, "loss_total": 0.10247326642274857, "step": 630 }, { "batch_size": 4, "epoch": 0.252, "step": 630, "tokens_per_device": 5476 }, { "epoch": 0.252, "loss_ce": 0.8303167223930359, "loss_lvr": 0.9459925293922424, "loss_mode_switch": 0.0, "loss_total": 0.9249159693717957, "step": 630 }, { "batch_size": 4, "epoch": 0.252, "step": 630, "tokens_per_device": 3472 }, { "epoch": 0.252, "loss_ce": 0.3293459117412567, "loss_lvr": 1.4477174282073975, "loss_mode_switch": 0.0, "loss_total": 0.474117636680603, "step": 630 }, { "epoch": 0.2524, "grad_norm": 1.3059004545211792, "learning_rate": 8.758042098657022e-06, "loss": 0.3362, "step": 631 }, { "batch_size": 4, "epoch": 0.2524, "step": 631, "tokens_per_device": 4116 }, { "epoch": 0.2524, "loss_ce": 0.3143715560436249, "loss_lvr": 1.2077245712280273, "loss_mode_switch": 0.0, "loss_total": 0.43514400720596313, "step": 631 }, { "batch_size": 4, "epoch": 0.2524, "step": 631, "tokens_per_device": 1356 }, { "epoch": 0.2524, "loss_ce": 0.6219013929367065, "loss_lvr": 0.9369902610778809, "loss_mode_switch": 0.0, "loss_total": 0.7156004309654236, "step": 631 }, { "batch_size": 4, "epoch": 0.2524, "step": 631, "tokens_per_device": 5468 }, { "epoch": 0.2524, "loss_ce": 0.18848201632499695, "loss_lvr": 0.7621279954910278, "loss_mode_switch": 0.0, "loss_total": 0.26469480991363525, "step": 631 }, { "batch_size": 4, "epoch": 0.2524, "step": 631, "tokens_per_device": 4048 }, { "epoch": 0.2524, "loss_ce": 0.36289775371551514, "loss_lvr": 0.8638038635253906, "loss_mode_switch": 0.0, "loss_total": 0.4492781460285187, "step": 631 }, { "batch_size": 4, "epoch": 0.2524, "step": 631, "tokens_per_device": 12980 }, { "epoch": 0.2524, "loss_ce": 0.18855729699134827, "loss_lvr": 0.4369467794895172, "loss_mode_switch": 0.0, "loss_total": 0.23225197196006775, "step": 631 }, { "batch_size": 1, "epoch": 0.2524, "step": 631, "tokens_per_device": 5108 }, { "epoch": 0.2524, "loss_ce": 0.9676192402839661, "loss_lvr": 0.5651434063911438, "loss_mode_switch": 0.0, "loss_total": 1.024133563041687, "step": 631 }, { "batch_size": 4, "epoch": 0.2524, "step": 631, "tokens_per_device": 1320 }, { "epoch": 0.2524, "loss_ce": 0.220516636967659, "loss_lvr": 1.0716617107391357, "loss_mode_switch": 0.0, "loss_total": 0.3276827931404114, "step": 631 }, { "batch_size": 4, "epoch": 0.2524, "step": 631, "tokens_per_device": 2652 }, { "epoch": 0.2524, "loss_ce": 0.24489860236644745, "loss_lvr": 1.1100198030471802, "loss_mode_switch": 0.0, "loss_total": 0.3559005856513977, "step": 631 }, { "epoch": 0.2528, "grad_norm": 1.271156668663025, "learning_rate": 8.753766318251628e-06, "loss": 0.3217, "step": 632 }, { "batch_size": 4, "epoch": 0.2528, "step": 632, "tokens_per_device": 5272 }, { "epoch": 0.2528, "loss_ce": 0.3765212893486023, "loss_lvr": 0.7605605721473694, "loss_mode_switch": 0.0, "loss_total": 0.4525773525238037, "step": 632 }, { "batch_size": 4, "epoch": 0.2528, "step": 632, "tokens_per_device": 1352 }, { "epoch": 0.2528, "loss_ce": 0.6845629215240479, "loss_lvr": 1.1855378150939941, "loss_mode_switch": 0.0, "loss_total": 0.8031166791915894, "step": 632 }, { "batch_size": 4, "epoch": 0.2528, "step": 632, "tokens_per_device": 12624 }, { "epoch": 0.2528, "loss_ce": 0.0923001617193222, "loss_lvr": 0.9348388314247131, "loss_mode_switch": 0.0, "loss_total": 0.18578404188156128, "step": 632 }, { "batch_size": 4, "epoch": 0.2528, "step": 632, "tokens_per_device": 4252 }, { "epoch": 0.2528, "loss_ce": 0.844643235206604, "loss_lvr": 0.971281886100769, "loss_mode_switch": 0.0, "loss_total": 0.9417714476585388, "step": 632 }, { "batch_size": 1, "epoch": 0.2528, "step": 632, "tokens_per_device": 5419 }, { "epoch": 0.2528, "loss_ce": 0.010463343001902103, "loss_lvr": 0.741432785987854, "loss_mode_switch": 0.0, "loss_total": 0.08460662513971329, "step": 632 }, { "batch_size": 4, "epoch": 0.2528, "step": 632, "tokens_per_device": 4064 }, { "epoch": 0.2528, "loss_ce": 0.1751265823841095, "loss_lvr": 1.6017143726348877, "loss_mode_switch": 0.0, "loss_total": 0.33529800176620483, "step": 632 }, { "batch_size": 1, "epoch": 0.2528, "step": 632, "tokens_per_device": 5722 }, { "epoch": 0.2528, "loss_ce": 0.29671019315719604, "loss_lvr": 0.6964452862739563, "loss_mode_switch": 0.0, "loss_total": 0.36635473370552063, "step": 632 }, { "batch_size": 1, "epoch": 0.2528, "step": 632, "tokens_per_device": 5136 }, { "epoch": 0.2528, "loss_ce": 0.07698036730289459, "loss_lvr": 0.6814424991607666, "loss_mode_switch": 0.0, "loss_total": 0.145124614238739, "step": 632 }, { "epoch": 0.2532, "grad_norm": 1.3815391063690186, "learning_rate": 8.74948423780442e-06, "loss": 0.3013, "step": 633 }, { "batch_size": 4, "epoch": 0.2532, "step": 633, "tokens_per_device": 3992 }, { "epoch": 0.2532, "loss_ce": 0.31131941080093384, "loss_lvr": 1.059225082397461, "loss_mode_switch": 0.0, "loss_total": 0.4172419309616089, "step": 633 }, { "batch_size": 1, "epoch": 0.2532, "step": 633, "tokens_per_device": 5256 }, { "epoch": 0.2532, "loss_ce": 0.019515883177518845, "loss_lvr": 0.5119805932044983, "loss_mode_switch": 0.0, "loss_total": 0.07071394473314285, "step": 633 }, { "batch_size": 1, "epoch": 0.2532, "step": 633, "tokens_per_device": 4880 }, { "epoch": 0.2532, "loss_ce": 0.001228164997883141, "loss_lvr": 0.24025960266590118, "loss_mode_switch": 0.0, "loss_total": 0.025254124775528908, "step": 633 }, { "batch_size": 4, "epoch": 0.2532, "step": 633, "tokens_per_device": 2608 }, { "epoch": 0.2532, "loss_ce": 0.4526236653327942, "loss_lvr": 1.3725303411483765, "loss_mode_switch": 0.0, "loss_total": 0.5898767113685608, "step": 633 }, { "batch_size": 4, "epoch": 0.2532, "step": 633, "tokens_per_device": 4252 }, { "epoch": 0.2532, "loss_ce": 0.5782597661018372, "loss_lvr": 0.9661592245101929, "loss_mode_switch": 0.0, "loss_total": 0.6748756766319275, "step": 633 }, { "batch_size": 4, "epoch": 0.2532, "step": 633, "tokens_per_device": 1268 }, { "epoch": 0.2532, "loss_ce": 0.6236597895622253, "loss_lvr": 1.1352570056915283, "loss_mode_switch": 0.0, "loss_total": 0.7371854782104492, "step": 633 }, { "batch_size": 4, "epoch": 0.2532, "step": 633, "tokens_per_device": 1260 }, { "epoch": 0.2532, "loss_ce": 0.20039625465869904, "loss_lvr": 1.0972795486450195, "loss_mode_switch": 0.0, "loss_total": 0.3101242184638977, "step": 633 }, { "batch_size": 4, "epoch": 0.2532, "step": 633, "tokens_per_device": 1692 }, { "epoch": 0.2532, "loss_ce": 0.5469989776611328, "loss_lvr": 1.0032567977905273, "loss_mode_switch": 0.0, "loss_total": 0.6473246812820435, "step": 633 }, { "epoch": 0.2536, "grad_norm": 1.4069746732711792, "learning_rate": 8.745195864502121e-06, "loss": 0.3407, "step": 634 }, { "batch_size": 1, "epoch": 0.2536, "step": 634, "tokens_per_device": 4911 }, { "epoch": 0.2536, "loss_ce": 0.22299982607364655, "loss_lvr": 0.8417288064956665, "loss_mode_switch": 0.0, "loss_total": 0.3071727156639099, "step": 634 }, { "batch_size": 4, "epoch": 0.2536, "step": 634, "tokens_per_device": 4444 }, { "epoch": 0.2536, "loss_ce": 0.3601665496826172, "loss_lvr": 0.849881649017334, "loss_mode_switch": 0.0, "loss_total": 0.44515472650527954, "step": 634 }, { "batch_size": 4, "epoch": 0.2536, "step": 634, "tokens_per_device": 4612 }, { "epoch": 0.2536, "loss_ce": 0.38610294461250305, "loss_lvr": 0.8445429801940918, "loss_mode_switch": 0.0, "loss_total": 0.47055724263191223, "step": 634 }, { "batch_size": 4, "epoch": 0.2536, "step": 634, "tokens_per_device": 4468 }, { "epoch": 0.2536, "loss_ce": 0.3287135660648346, "loss_lvr": 0.9463315010070801, "loss_mode_switch": 0.0, "loss_total": 0.42334672808647156, "step": 634 }, { "batch_size": 4, "epoch": 0.2536, "step": 634, "tokens_per_device": 1508 }, { "epoch": 0.2536, "loss_ce": 0.23635515570640564, "loss_lvr": 1.1347496509552002, "loss_mode_switch": 0.0, "loss_total": 0.34983012080192566, "step": 634 }, { "batch_size": 4, "epoch": 0.2536, "step": 634, "tokens_per_device": 3796 }, { "epoch": 0.2536, "loss_ce": 0.2477831095457077, "loss_lvr": 0.934257984161377, "loss_mode_switch": 0.0, "loss_total": 0.34120890498161316, "step": 634 }, { "batch_size": 4, "epoch": 0.2536, "step": 634, "tokens_per_device": 5500 }, { "epoch": 0.2536, "loss_ce": 0.003076385473832488, "loss_lvr": 0.7204388976097107, "loss_mode_switch": 0.0, "loss_total": 0.07512027025222778, "step": 634 }, { "batch_size": 4, "epoch": 0.2536, "step": 634, "tokens_per_device": 3744 }, { "epoch": 0.2536, "loss_ce": 0.5239877700805664, "loss_lvr": 1.1111770868301392, "loss_mode_switch": 0.0, "loss_total": 0.6351054906845093, "step": 634 }, { "epoch": 0.254, "grad_norm": 1.394799828529358, "learning_rate": 8.74090120554202e-06, "loss": 0.3418, "step": 635 }, { "batch_size": 1, "epoch": 0.254, "step": 635, "tokens_per_device": 5185 }, { "epoch": 0.254, "loss_ce": 0.1512780487537384, "loss_lvr": 0.825697660446167, "loss_mode_switch": 0.0, "loss_total": 0.23384782671928406, "step": 635 }, { "batch_size": 1, "epoch": 0.254, "step": 635, "tokens_per_device": 4759 }, { "epoch": 0.254, "loss_ce": 0.03573322296142578, "loss_lvr": 0.4698515832424164, "loss_mode_switch": 0.0, "loss_total": 0.0827183872461319, "step": 635 }, { "batch_size": 1, "epoch": 0.254, "step": 635, "tokens_per_device": 4951 }, { "epoch": 0.254, "loss_ce": 0.14637133479118347, "loss_lvr": 0.39756372570991516, "loss_mode_switch": 0.0, "loss_total": 0.186127707362175, "step": 635 }, { "batch_size": 1, "epoch": 0.254, "step": 635, "tokens_per_device": 5163 }, { "epoch": 0.254, "loss_ce": 0.05764782056212425, "loss_lvr": 0.22054819762706757, "loss_mode_switch": 0.0, "loss_total": 0.07970263808965683, "step": 635 }, { "batch_size": 1, "epoch": 0.254, "step": 635, "tokens_per_device": 5164 }, { "epoch": 0.254, "loss_ce": 0.02829929068684578, "loss_lvr": 1.1447786092758179, "loss_mode_switch": 0.0, "loss_total": 0.14277715981006622, "step": 635 }, { "batch_size": 1, "epoch": 0.254, "step": 635, "tokens_per_device": 5129 }, { "epoch": 0.254, "loss_ce": 0.023103812709450722, "loss_lvr": 0.4722534120082855, "loss_mode_switch": 0.0, "loss_total": 0.07032915204763412, "step": 635 }, { "batch_size": 1, "epoch": 0.254, "step": 635, "tokens_per_device": 4847 }, { "epoch": 0.254, "loss_ce": 0.13593533635139465, "loss_lvr": 0.8809956312179565, "loss_mode_switch": 0.0, "loss_total": 0.22403490543365479, "step": 635 }, { "batch_size": 1, "epoch": 0.254, "step": 635, "tokens_per_device": 4832 }, { "epoch": 0.254, "loss_ce": 0.02620760165154934, "loss_lvr": 0.9785487651824951, "loss_mode_switch": 0.0, "loss_total": 0.12406247854232788, "step": 635 }, { "epoch": 0.2544, "grad_norm": 1.5128835439682007, "learning_rate": 8.736600268131953e-06, "loss": 0.338, "step": 636 }, { "batch_size": 4, "epoch": 0.2544, "step": 636, "tokens_per_device": 4228 }, { "epoch": 0.2544, "loss_ce": 0.16571307182312012, "loss_lvr": 0.9888274669647217, "loss_mode_switch": 0.0, "loss_total": 0.26459580659866333, "step": 636 }, { "batch_size": 4, "epoch": 0.2544, "step": 636, "tokens_per_device": 4276 }, { "epoch": 0.2544, "loss_ce": 0.1618632972240448, "loss_lvr": 0.9971689581871033, "loss_mode_switch": 0.0, "loss_total": 0.2615801990032196, "step": 636 }, { "batch_size": 4, "epoch": 0.2544, "step": 636, "tokens_per_device": 1340 }, { "epoch": 0.2544, "loss_ce": 0.2856670916080475, "loss_lvr": 1.081912636756897, "loss_mode_switch": 0.0, "loss_total": 0.3938583731651306, "step": 636 }, { "batch_size": 4, "epoch": 0.2544, "step": 636, "tokens_per_device": 4628 }, { "epoch": 0.2544, "loss_ce": 0.07211627811193466, "loss_lvr": 1.1771680116653442, "loss_mode_switch": 0.0, "loss_total": 0.18983307480812073, "step": 636 }, { "batch_size": 4, "epoch": 0.2544, "step": 636, "tokens_per_device": 15856 }, { "epoch": 0.2544, "loss_ce": 0.18312612175941467, "loss_lvr": 0.747205376625061, "loss_mode_switch": 0.0, "loss_total": 0.2578466534614563, "step": 636 }, { "batch_size": 4, "epoch": 0.2544, "step": 636, "tokens_per_device": 4432 }, { "epoch": 0.2544, "loss_ce": 0.06783446669578552, "loss_lvr": 0.989085853099823, "loss_mode_switch": 0.0, "loss_total": 0.16674305498600006, "step": 636 }, { "batch_size": 4, "epoch": 0.2544, "step": 636, "tokens_per_device": 6208 }, { "epoch": 0.2544, "loss_ce": 0.34383755922317505, "loss_lvr": 0.9177494645118713, "loss_mode_switch": 0.0, "loss_total": 0.4356124997138977, "step": 636 }, { "batch_size": 4, "epoch": 0.2544, "step": 636, "tokens_per_device": 4820 }, { "epoch": 0.2544, "loss_ce": 0.08600794523954391, "loss_lvr": 0.7804787755012512, "loss_mode_switch": 0.0, "loss_total": 0.16405582427978516, "step": 636 }, { "epoch": 0.2548, "grad_norm": 1.3552130460739136, "learning_rate": 8.73229305949029e-06, "loss": 0.2771, "step": 637 }, { "batch_size": 1, "epoch": 0.2548, "step": 637, "tokens_per_device": 4935 }, { "epoch": 0.2548, "loss_ce": 0.007954081520438194, "loss_lvr": 0.4544992744922638, "loss_mode_switch": 0.0, "loss_total": 0.05340401083230972, "step": 637 }, { "batch_size": 4, "epoch": 0.2548, "step": 637, "tokens_per_device": 2684 }, { "epoch": 0.2548, "loss_ce": 0.4893696904182434, "loss_lvr": 1.0144531726837158, "loss_mode_switch": 0.0, "loss_total": 0.590815007686615, "step": 637 }, { "batch_size": 4, "epoch": 0.2548, "step": 637, "tokens_per_device": 1908 }, { "epoch": 0.2548, "loss_ce": 0.14119651913642883, "loss_lvr": 1.074112892150879, "loss_mode_switch": 0.0, "loss_total": 0.2486078143119812, "step": 637 }, { "batch_size": 4, "epoch": 0.2548, "step": 637, "tokens_per_device": 4528 }, { "epoch": 0.2548, "loss_ce": 0.4308340847492218, "loss_lvr": 0.6218771934509277, "loss_mode_switch": 0.0, "loss_total": 0.49302181601524353, "step": 637 }, { "batch_size": 4, "epoch": 0.2548, "step": 637, "tokens_per_device": 2568 }, { "epoch": 0.2548, "loss_ce": 0.0770999863743782, "loss_lvr": 1.142421841621399, "loss_mode_switch": 0.0, "loss_total": 0.19134217500686646, "step": 637 }, { "batch_size": 1, "epoch": 0.2548, "step": 637, "tokens_per_device": 5142 }, { "epoch": 0.2548, "loss_ce": 0.06883314251899719, "loss_lvr": 0.7125144004821777, "loss_mode_switch": 0.0, "loss_total": 0.14008459448814392, "step": 637 }, { "batch_size": 4, "epoch": 0.2548, "step": 637, "tokens_per_device": 5716 }, { "epoch": 0.2548, "loss_ce": 0.4451674818992615, "loss_lvr": 0.9070542454719543, "loss_mode_switch": 0.0, "loss_total": 0.5358729362487793, "step": 637 }, { "batch_size": 1, "epoch": 0.2548, "step": 637, "tokens_per_device": 5605 }, { "epoch": 0.2548, "loss_ce": 0.002698227996006608, "loss_lvr": 0.3529064953327179, "loss_mode_switch": 0.0, "loss_total": 0.03798887878656387, "step": 637 }, { "epoch": 0.2552, "grad_norm": 1.39629328250885, "learning_rate": 8.727979586845931e-06, "loss": 0.3005, "step": 638 }, { "batch_size": 4, "epoch": 0.2552, "step": 638, "tokens_per_device": 7188 }, { "epoch": 0.2552, "loss_ce": 0.8637137413024902, "loss_lvr": 0.4876726567745209, "loss_mode_switch": 0.0, "loss_total": 0.9124810099601746, "step": 638 }, { "batch_size": 1, "epoch": 0.2552, "step": 638, "tokens_per_device": 5074 }, { "epoch": 0.2552, "loss_ce": 0.008796876296401024, "loss_lvr": 0.5119669437408447, "loss_mode_switch": 0.0, "loss_total": 0.059993572533130646, "step": 638 }, { "batch_size": 4, "epoch": 0.2552, "step": 638, "tokens_per_device": 3348 }, { "epoch": 0.2552, "loss_ce": 0.3526596426963806, "loss_lvr": 1.0248825550079346, "loss_mode_switch": 0.0, "loss_total": 0.4551478922367096, "step": 638 }, { "batch_size": 4, "epoch": 0.2552, "step": 638, "tokens_per_device": 2732 }, { "epoch": 0.2552, "loss_ce": 0.4873647689819336, "loss_lvr": 0.9921963810920715, "loss_mode_switch": 0.0, "loss_total": 0.5865843892097473, "step": 638 }, { "batch_size": 4, "epoch": 0.2552, "step": 638, "tokens_per_device": 4368 }, { "epoch": 0.2552, "loss_ce": 0.333934485912323, "loss_lvr": 0.9100133180618286, "loss_mode_switch": 0.0, "loss_total": 0.42493581771850586, "step": 638 }, { "batch_size": 4, "epoch": 0.2552, "step": 638, "tokens_per_device": 6280 }, { "epoch": 0.2552, "loss_ce": 0.5305992364883423, "loss_lvr": 0.8663706183433533, "loss_mode_switch": 0.0, "loss_total": 0.617236316204071, "step": 638 }, { "batch_size": 1, "epoch": 0.2552, "step": 638, "tokens_per_device": 4892 }, { "epoch": 0.2552, "loss_ce": 1.2002613544464111, "loss_lvr": 1.4452391862869263, "loss_mode_switch": 0.0, "loss_total": 1.344785213470459, "step": 638 }, { "batch_size": 4, "epoch": 0.2552, "step": 638, "tokens_per_device": 2760 }, { "epoch": 0.2552, "loss_ce": 0.35118916630744934, "loss_lvr": 1.4003828763961792, "loss_mode_switch": 0.0, "loss_total": 0.4912274479866028, "step": 638 }, { "epoch": 0.2556, "grad_norm": 1.2273988723754883, "learning_rate": 8.72365985743829e-06, "loss": 0.2924, "step": 639 }, { "batch_size": 4, "epoch": 0.2556, "step": 639, "tokens_per_device": 4296 }, { "epoch": 0.2556, "loss_ce": 0.3009605407714844, "loss_lvr": 1.1451297998428345, "loss_mode_switch": 0.0, "loss_total": 0.4154735207557678, "step": 639 }, { "batch_size": 1, "epoch": 0.2556, "step": 639, "tokens_per_device": 5432 }, { "epoch": 0.2556, "loss_ce": 0.039284639060497284, "loss_lvr": 0.4586188495159149, "loss_mode_switch": 0.0, "loss_total": 0.08514652401208878, "step": 639 }, { "batch_size": 4, "epoch": 0.2556, "step": 639, "tokens_per_device": 1292 }, { "epoch": 0.2556, "loss_ce": 0.4862701892852783, "loss_lvr": 1.3314738273620605, "loss_mode_switch": 0.0, "loss_total": 0.6194175481796265, "step": 639 }, { "batch_size": 1, "epoch": 0.2556, "step": 639, "tokens_per_device": 5103 }, { "epoch": 0.2556, "loss_ce": 0.26823005080223083, "loss_lvr": 0.5200244784355164, "loss_mode_switch": 0.0, "loss_total": 0.3202325105667114, "step": 639 }, { "batch_size": 4, "epoch": 0.2556, "step": 639, "tokens_per_device": 3796 }, { "epoch": 0.2556, "loss_ce": 0.33101886510849, "loss_lvr": 1.0201785564422607, "loss_mode_switch": 0.0, "loss_total": 0.4330367147922516, "step": 639 }, { "batch_size": 4, "epoch": 0.2556, "step": 639, "tokens_per_device": 4300 }, { "epoch": 0.2556, "loss_ce": 0.4371248483657837, "loss_lvr": 1.0386883020401, "loss_mode_switch": 0.0, "loss_total": 0.5409936904907227, "step": 639 }, { "batch_size": 4, "epoch": 0.2556, "step": 639, "tokens_per_device": 4148 }, { "epoch": 0.2556, "loss_ce": 0.37488776445388794, "loss_lvr": 0.9070961475372314, "loss_mode_switch": 0.0, "loss_total": 0.46559739112854004, "step": 639 }, { "batch_size": 1, "epoch": 0.2556, "step": 639, "tokens_per_device": 5028 }, { "epoch": 0.2556, "loss_ce": 0.051397550851106644, "loss_lvr": 1.1330331563949585, "loss_mode_switch": 0.0, "loss_total": 0.16470086574554443, "step": 639 }, { "epoch": 0.256, "grad_norm": 1.5620075464248657, "learning_rate": 8.719333878517274e-06, "loss": 0.3461, "step": 640 }, { "batch_size": 4, "epoch": 0.256, "step": 640, "tokens_per_device": 4344 }, { "epoch": 0.256, "loss_ce": 0.30993950366973877, "loss_lvr": 1.234677791595459, "loss_mode_switch": 0.0, "loss_total": 0.4334072768688202, "step": 640 }, { "batch_size": 4, "epoch": 0.256, "step": 640, "tokens_per_device": 1856 }, { "epoch": 0.256, "loss_ce": 0.2031501829624176, "loss_lvr": 1.0197854042053223, "loss_mode_switch": 0.0, "loss_total": 0.30512872338294983, "step": 640 }, { "batch_size": 4, "epoch": 0.256, "step": 640, "tokens_per_device": 3904 }, { "epoch": 0.256, "loss_ce": 0.5199184417724609, "loss_lvr": 0.8318714499473572, "loss_mode_switch": 0.0, "loss_total": 0.6031056046485901, "step": 640 }, { "batch_size": 1, "epoch": 0.256, "step": 640, "tokens_per_device": 5896 }, { "epoch": 0.256, "loss_ce": 0.0205920971930027, "loss_lvr": 0.4554750323295593, "loss_mode_switch": 0.0, "loss_total": 0.0661396011710167, "step": 640 }, { "batch_size": 4, "epoch": 0.256, "step": 640, "tokens_per_device": 1272 }, { "epoch": 0.256, "loss_ce": 0.27872294187545776, "loss_lvr": 1.3198438882827759, "loss_mode_switch": 0.0, "loss_total": 0.41070735454559326, "step": 640 }, { "batch_size": 4, "epoch": 0.256, "step": 640, "tokens_per_device": 3960 }, { "epoch": 0.256, "loss_ce": 0.6341944932937622, "loss_lvr": 0.9109839200973511, "loss_mode_switch": 0.0, "loss_total": 0.7252928614616394, "step": 640 }, { "batch_size": 4, "epoch": 0.256, "step": 640, "tokens_per_device": 3744 }, { "epoch": 0.256, "loss_ce": 0.4014021158218384, "loss_lvr": 1.1372628211975098, "loss_mode_switch": 0.0, "loss_total": 0.5151283740997314, "step": 640 }, { "batch_size": 1, "epoch": 0.256, "step": 640, "tokens_per_device": 5014 }, { "epoch": 0.256, "loss_ce": 0.01785304956138134, "loss_lvr": 0.3886711597442627, "loss_mode_switch": 0.0, "loss_total": 0.05672016739845276, "step": 640 }, { "epoch": 0.2564, "grad_norm": 1.6669154167175293, "learning_rate": 8.715001657343285e-06, "loss": 0.3521, "step": 641 }, { "batch_size": 1, "epoch": 0.2564, "step": 641, "tokens_per_device": 4866 }, { "epoch": 0.2564, "loss_ce": 0.06909621506929398, "loss_lvr": 0.28557586669921875, "loss_mode_switch": 0.0, "loss_total": 0.09765380620956421, "step": 641 }, { "batch_size": 4, "epoch": 0.2564, "step": 641, "tokens_per_device": 11052 }, { "epoch": 0.2564, "loss_ce": 0.05361059680581093, "loss_lvr": 0.7772498726844788, "loss_mode_switch": 0.0, "loss_total": 0.13133558630943298, "step": 641 }, { "batch_size": 4, "epoch": 0.2564, "step": 641, "tokens_per_device": 3808 }, { "epoch": 0.2564, "loss_ce": 0.39025479555130005, "loss_lvr": 2.6716887950897217, "loss_mode_switch": 0.0, "loss_total": 0.6574236750602722, "step": 641 }, { "batch_size": 4, "epoch": 0.2564, "step": 641, "tokens_per_device": 4252 }, { "epoch": 0.2564, "loss_ce": 0.5153487920761108, "loss_lvr": 0.9553263783454895, "loss_mode_switch": 0.0, "loss_total": 0.6108814477920532, "step": 641 }, { "batch_size": 4, "epoch": 0.2564, "step": 641, "tokens_per_device": 3892 }, { "epoch": 0.2564, "loss_ce": 0.5050137639045715, "loss_lvr": 1.1885590553283691, "loss_mode_switch": 0.0, "loss_total": 0.6238696575164795, "step": 641 }, { "batch_size": 1, "epoch": 0.2564, "step": 641, "tokens_per_device": 5179 }, { "epoch": 0.2564, "loss_ce": 0.014094068668782711, "loss_lvr": 0.8030762076377869, "loss_mode_switch": 0.0, "loss_total": 0.09440169483423233, "step": 641 }, { "batch_size": 4, "epoch": 0.2564, "step": 641, "tokens_per_device": 2608 }, { "epoch": 0.2564, "loss_ce": 0.07596477121114731, "loss_lvr": 1.1286838054656982, "loss_mode_switch": 0.0, "loss_total": 0.18883314728736877, "step": 641 }, { "batch_size": 4, "epoch": 0.2564, "step": 641, "tokens_per_device": 6484 }, { "epoch": 0.2564, "loss_ce": 0.6767822504043579, "loss_lvr": 0.8077648878097534, "loss_mode_switch": 0.0, "loss_total": 0.7575587630271912, "step": 641 }, { "epoch": 0.2568, "grad_norm": 1.51707124710083, "learning_rate": 8.710663201187203e-06, "loss": 0.3628, "step": 642 }, { "batch_size": 4, "epoch": 0.2568, "step": 642, "tokens_per_device": 1220 }, { "epoch": 0.2568, "loss_ce": 0.16566935181617737, "loss_lvr": 1.111140251159668, "loss_mode_switch": 0.0, "loss_total": 0.27678337693214417, "step": 642 }, { "batch_size": 4, "epoch": 0.2568, "step": 642, "tokens_per_device": 3808 }, { "epoch": 0.2568, "loss_ce": 0.12030817568302155, "loss_lvr": 1.014850378036499, "loss_mode_switch": 0.0, "loss_total": 0.22179320454597473, "step": 642 }, { "batch_size": 4, "epoch": 0.2568, "step": 642, "tokens_per_device": 2688 }, { "epoch": 0.2568, "loss_ce": 0.6514657139778137, "loss_lvr": 1.0110973119735718, "loss_mode_switch": 0.0, "loss_total": 0.7525754570960999, "step": 642 }, { "batch_size": 4, "epoch": 0.2568, "step": 642, "tokens_per_device": 5168 }, { "epoch": 0.2568, "loss_ce": 0.03486121445894241, "loss_lvr": 1.0357635021209717, "loss_mode_switch": 0.0, "loss_total": 0.13843756914138794, "step": 642 }, { "batch_size": 1, "epoch": 0.2568, "step": 642, "tokens_per_device": 5816 }, { "epoch": 0.2568, "loss_ce": 0.0013582013780251145, "loss_lvr": 0.5631324052810669, "loss_mode_switch": 0.0, "loss_total": 0.0576714426279068, "step": 642 }, { "batch_size": 1, "epoch": 0.2568, "step": 642, "tokens_per_device": 5820 }, { "epoch": 0.2568, "loss_ce": 0.00129132776055485, "loss_lvr": 0.487281858921051, "loss_mode_switch": 0.0, "loss_total": 0.0500195138156414, "step": 642 }, { "batch_size": 4, "epoch": 0.2568, "step": 642, "tokens_per_device": 5172 }, { "epoch": 0.2568, "loss_ce": 0.04178975895047188, "loss_lvr": 0.8574323058128357, "loss_mode_switch": 0.0, "loss_total": 0.1275329887866974, "step": 642 }, { "batch_size": 1, "epoch": 0.2568, "step": 642, "tokens_per_device": 4890 }, { "epoch": 0.2568, "loss_ce": 0.8948147296905518, "loss_lvr": 0.6398868560791016, "loss_mode_switch": 0.0, "loss_total": 0.9588034152984619, "step": 642 }, { "epoch": 0.2572, "grad_norm": 1.4572232961654663, "learning_rate": 8.706318517330368e-06, "loss": 0.3461, "step": 643 }, { "batch_size": 4, "epoch": 0.2572, "step": 643, "tokens_per_device": 2604 }, { "epoch": 0.2572, "loss_ce": 0.37371596693992615, "loss_lvr": 0.9089880585670471, "loss_mode_switch": 0.0, "loss_total": 0.46461477875709534, "step": 643 }, { "batch_size": 4, "epoch": 0.2572, "step": 643, "tokens_per_device": 3768 }, { "epoch": 0.2572, "loss_ce": 0.08886769413948059, "loss_lvr": 0.9407424926757812, "loss_mode_switch": 0.0, "loss_total": 0.18294194340705872, "step": 643 }, { "batch_size": 1, "epoch": 0.2572, "step": 643, "tokens_per_device": 5070 }, { "epoch": 0.2572, "loss_ce": 0.052976448088884354, "loss_lvr": 0.40276458859443665, "loss_mode_switch": 0.0, "loss_total": 0.09325291216373444, "step": 643 }, { "batch_size": 4, "epoch": 0.2572, "step": 643, "tokens_per_device": 1432 }, { "epoch": 0.2572, "loss_ce": 0.9461140036582947, "loss_lvr": 0.9695808291435242, "loss_mode_switch": 0.0, "loss_total": 1.0430721044540405, "step": 643 }, { "batch_size": 4, "epoch": 0.2572, "step": 643, "tokens_per_device": 3988 }, { "epoch": 0.2572, "loss_ce": 0.19367261230945587, "loss_lvr": 0.925525426864624, "loss_mode_switch": 0.0, "loss_total": 0.2862251400947571, "step": 643 }, { "batch_size": 4, "epoch": 0.2572, "step": 643, "tokens_per_device": 3352 }, { "epoch": 0.2572, "loss_ce": 0.6605960726737976, "loss_lvr": 1.1747584342956543, "loss_mode_switch": 0.0, "loss_total": 0.778071939945221, "step": 643 }, { "batch_size": 1, "epoch": 0.2572, "step": 643, "tokens_per_device": 5052 }, { "epoch": 0.2572, "loss_ce": 0.001791875227354467, "loss_lvr": 0.5403662323951721, "loss_mode_switch": 0.0, "loss_total": 0.05582850053906441, "step": 643 }, { "batch_size": 4, "epoch": 0.2572, "step": 643, "tokens_per_device": 1580 }, { "epoch": 0.2572, "loss_ce": 0.43108469247817993, "loss_lvr": 0.9997578859329224, "loss_mode_switch": 0.0, "loss_total": 0.5310604572296143, "step": 643 }, { "epoch": 0.2576, "grad_norm": 1.3437471389770508, "learning_rate": 8.701967613064575e-06, "loss": 0.3512, "step": 644 }, { "batch_size": 1, "epoch": 0.2576, "step": 644, "tokens_per_device": 4863 }, { "epoch": 0.2576, "loss_ce": 0.00910522136837244, "loss_lvr": 0.48057836294174194, "loss_mode_switch": 0.0, "loss_total": 0.057163055986166, "step": 644 }, { "batch_size": 1, "epoch": 0.2576, "step": 644, "tokens_per_device": 4855 }, { "epoch": 0.2576, "loss_ce": 0.00827179104089737, "loss_lvr": 0.3910319209098816, "loss_mode_switch": 0.0, "loss_total": 0.04737498238682747, "step": 644 }, { "batch_size": 4, "epoch": 0.2576, "step": 644, "tokens_per_device": 4092 }, { "epoch": 0.2576, "loss_ce": 0.8036607503890991, "loss_lvr": 1.008307933807373, "loss_mode_switch": 0.0, "loss_total": 0.9044915437698364, "step": 644 }, { "batch_size": 4, "epoch": 0.2576, "step": 644, "tokens_per_device": 1440 }, { "epoch": 0.2576, "loss_ce": 0.569735586643219, "loss_lvr": 0.8463127017021179, "loss_mode_switch": 0.0, "loss_total": 0.6543668508529663, "step": 644 }, { "batch_size": 1, "epoch": 0.2576, "step": 644, "tokens_per_device": 4882 }, { "epoch": 0.2576, "loss_ce": 0.22267283499240875, "loss_lvr": 1.0898922681808472, "loss_mode_switch": 0.0, "loss_total": 0.33166205883026123, "step": 644 }, { "batch_size": 4, "epoch": 0.2576, "step": 644, "tokens_per_device": 3748 }, { "epoch": 0.2576, "loss_ce": 0.019406616687774658, "loss_lvr": 1.2573670148849487, "loss_mode_switch": 0.0, "loss_total": 0.1451433151960373, "step": 644 }, { "batch_size": 4, "epoch": 0.2576, "step": 644, "tokens_per_device": 4012 }, { "epoch": 0.2576, "loss_ce": 0.27463874220848083, "loss_lvr": 1.1015878915786743, "loss_mode_switch": 0.0, "loss_total": 0.3847975432872772, "step": 644 }, { "batch_size": 1, "epoch": 0.2576, "step": 644, "tokens_per_device": 5033 }, { "epoch": 0.2576, "loss_ce": 0.12932780385017395, "loss_lvr": 0.46605610847473145, "loss_mode_switch": 0.0, "loss_total": 0.17593342065811157, "step": 644 }, { "epoch": 0.258, "grad_norm": 1.4528051614761353, "learning_rate": 8.697610495692055e-06, "loss": 0.3028, "step": 645 }, { "batch_size": 4, "epoch": 0.258, "step": 645, "tokens_per_device": 3768 }, { "epoch": 0.258, "loss_ce": 0.1813764125108719, "loss_lvr": 0.8586586713790894, "loss_mode_switch": 0.0, "loss_total": 0.26724228262901306, "step": 645 }, { "batch_size": 1, "epoch": 0.258, "step": 645, "tokens_per_device": 4900 }, { "epoch": 0.258, "loss_ce": 0.02961556427180767, "loss_lvr": 0.9971606135368347, "loss_mode_switch": 0.0, "loss_total": 0.12933161854743958, "step": 645 }, { "batch_size": 1, "epoch": 0.258, "step": 645, "tokens_per_device": 5069 }, { "epoch": 0.258, "loss_ce": 0.00048027560114860535, "loss_lvr": 0.44288432598114014, "loss_mode_switch": 0.0, "loss_total": 0.04476870968937874, "step": 645 }, { "batch_size": 4, "epoch": 0.258, "step": 645, "tokens_per_device": 5568 }, { "epoch": 0.258, "loss_ce": 0.33466964960098267, "loss_lvr": 1.1915485858917236, "loss_mode_switch": 0.0, "loss_total": 0.453824520111084, "step": 645 }, { "batch_size": 1, "epoch": 0.258, "step": 645, "tokens_per_device": 4834 }, { "epoch": 0.258, "loss_ce": 2.0311152935028076, "loss_lvr": 0.4572230279445648, "loss_mode_switch": 0.0, "loss_total": 2.0768375396728516, "step": 645 }, { "batch_size": 1, "epoch": 0.258, "step": 645, "tokens_per_device": 5463 }, { "epoch": 0.258, "loss_ce": 0.2614569664001465, "loss_lvr": 0.3125355839729309, "loss_mode_switch": 0.0, "loss_total": 0.2927105128765106, "step": 645 }, { "batch_size": 4, "epoch": 0.258, "step": 645, "tokens_per_device": 4536 }, { "epoch": 0.258, "loss_ce": 0.31145742535591125, "loss_lvr": 1.3438758850097656, "loss_mode_switch": 0.0, "loss_total": 0.44584500789642334, "step": 645 }, { "batch_size": 4, "epoch": 0.258, "step": 645, "tokens_per_device": 2544 }, { "epoch": 0.258, "loss_ce": 0.26879727840423584, "loss_lvr": 1.2525701522827148, "loss_mode_switch": 0.0, "loss_total": 0.3940542936325073, "step": 645 }, { "epoch": 0.2584, "grad_norm": 1.4626743793487549, "learning_rate": 8.693247172525472e-06, "loss": 0.3297, "step": 646 }, { "batch_size": 1, "epoch": 0.2584, "step": 646, "tokens_per_device": 4759 }, { "epoch": 0.2584, "loss_ce": 0.16325026750564575, "loss_lvr": 0.26172611117362976, "loss_mode_switch": 0.0, "loss_total": 0.1894228756427765, "step": 646 }, { "batch_size": 4, "epoch": 0.2584, "step": 646, "tokens_per_device": 3944 }, { "epoch": 0.2584, "loss_ce": 0.20453351736068726, "loss_lvr": 1.0410614013671875, "loss_mode_switch": 0.0, "loss_total": 0.30863964557647705, "step": 646 }, { "batch_size": 1, "epoch": 0.2584, "step": 646, "tokens_per_device": 5051 }, { "epoch": 0.2584, "loss_ce": 0.13237808644771576, "loss_lvr": 0.603076159954071, "loss_mode_switch": 0.0, "loss_total": 0.19268570840358734, "step": 646 }, { "batch_size": 1, "epoch": 0.2584, "step": 646, "tokens_per_device": 4963 }, { "epoch": 0.2584, "loss_ce": 0.16476568579673767, "loss_lvr": 0.5735282301902771, "loss_mode_switch": 0.0, "loss_total": 0.22211851179599762, "step": 646 }, { "batch_size": 4, "epoch": 0.2584, "step": 646, "tokens_per_device": 3028 }, { "epoch": 0.2584, "loss_ce": 0.2063654065132141, "loss_lvr": 0.8704984784126282, "loss_mode_switch": 0.0, "loss_total": 0.29341524839401245, "step": 646 }, { "batch_size": 4, "epoch": 0.2584, "step": 646, "tokens_per_device": 4212 }, { "epoch": 0.2584, "loss_ce": 0.024743005633354187, "loss_lvr": 0.56468665599823, "loss_mode_switch": 0.0, "loss_total": 0.08121167123317719, "step": 646 }, { "batch_size": 4, "epoch": 0.2584, "step": 646, "tokens_per_device": 8092 }, { "epoch": 0.2584, "loss_ce": 0.02395949885249138, "loss_lvr": 0.5283374190330505, "loss_mode_switch": 0.0, "loss_total": 0.07679323852062225, "step": 646 }, { "batch_size": 4, "epoch": 0.2584, "step": 646, "tokens_per_device": 4432 }, { "epoch": 0.2584, "loss_ce": 0.07341811805963516, "loss_lvr": 0.7500742077827454, "loss_mode_switch": 0.0, "loss_total": 0.14842554926872253, "step": 646 }, { "epoch": 0.2588, "grad_norm": 1.3911505937576294, "learning_rate": 8.6888776508879e-06, "loss": 0.3349, "step": 647 }, { "batch_size": 4, "epoch": 0.2588, "step": 647, "tokens_per_device": 2808 }, { "epoch": 0.2588, "loss_ce": 0.4276913106441498, "loss_lvr": 0.9768526554107666, "loss_mode_switch": 0.0, "loss_total": 0.525376558303833, "step": 647 }, { "batch_size": 1, "epoch": 0.2588, "step": 647, "tokens_per_device": 5108 }, { "epoch": 0.2588, "loss_ce": 0.021800056099891663, "loss_lvr": 0.3615778982639313, "loss_mode_switch": 0.0, "loss_total": 0.05795784667134285, "step": 647 }, { "batch_size": 4, "epoch": 0.2588, "step": 647, "tokens_per_device": 4400 }, { "epoch": 0.2588, "loss_ce": 0.18907776474952698, "loss_lvr": 1.085821509361267, "loss_mode_switch": 0.0, "loss_total": 0.2976599335670471, "step": 647 }, { "batch_size": 4, "epoch": 0.2588, "step": 647, "tokens_per_device": 1588 }, { "epoch": 0.2588, "loss_ce": 0.7657960057258606, "loss_lvr": 1.020229697227478, "loss_mode_switch": 0.0, "loss_total": 0.8678189516067505, "step": 647 }, { "batch_size": 4, "epoch": 0.2588, "step": 647, "tokens_per_device": 4328 }, { "epoch": 0.2588, "loss_ce": 0.46110859513282776, "loss_lvr": 1.0723907947540283, "loss_mode_switch": 0.0, "loss_total": 0.568347692489624, "step": 647 }, { "batch_size": 4, "epoch": 0.2588, "step": 647, "tokens_per_device": 4260 }, { "epoch": 0.2588, "loss_ce": 0.2337273806333542, "loss_lvr": 0.9072934985160828, "loss_mode_switch": 0.0, "loss_total": 0.32445672154426575, "step": 647 }, { "batch_size": 4, "epoch": 0.2588, "step": 647, "tokens_per_device": 4444 }, { "epoch": 0.2588, "loss_ce": 0.6263801455497742, "loss_lvr": 1.0264211893081665, "loss_mode_switch": 0.0, "loss_total": 0.7290222644805908, "step": 647 }, { "batch_size": 4, "epoch": 0.2588, "step": 647, "tokens_per_device": 4300 }, { "epoch": 0.2588, "loss_ce": 0.02459581010043621, "loss_lvr": 1.6892482042312622, "loss_mode_switch": 0.0, "loss_total": 0.19352063536643982, "step": 647 }, { "epoch": 0.2592, "grad_norm": 1.878273367881775, "learning_rate": 8.684501938112822e-06, "loss": 0.3528, "step": 648 }, { "batch_size": 1, "epoch": 0.2592, "step": 648, "tokens_per_device": 4881 }, { "epoch": 0.2592, "loss_ce": 0.06595741212368011, "loss_lvr": 0.5247538089752197, "loss_mode_switch": 0.0, "loss_total": 0.11843279004096985, "step": 648 }, { "batch_size": 4, "epoch": 0.2592, "step": 648, "tokens_per_device": 5420 }, { "epoch": 0.2592, "loss_ce": 0.6300186514854431, "loss_lvr": 0.8944836854934692, "loss_mode_switch": 0.0, "loss_total": 0.719467043876648, "step": 648 }, { "batch_size": 4, "epoch": 0.2592, "step": 648, "tokens_per_device": 3612 }, { "epoch": 0.2592, "loss_ce": 0.16584348678588867, "loss_lvr": 0.9284868836402893, "loss_mode_switch": 0.0, "loss_total": 0.2586921751499176, "step": 648 }, { "batch_size": 4, "epoch": 0.2592, "step": 648, "tokens_per_device": 3896 }, { "epoch": 0.2592, "loss_ce": 0.15125350654125214, "loss_lvr": 1.0168734788894653, "loss_mode_switch": 0.0, "loss_total": 0.2529408633708954, "step": 648 }, { "batch_size": 4, "epoch": 0.2592, "step": 648, "tokens_per_device": 2612 }, { "epoch": 0.2592, "loss_ce": 0.2515307664871216, "loss_lvr": 1.1733698844909668, "loss_mode_switch": 0.0, "loss_total": 0.36886775493621826, "step": 648 }, { "batch_size": 4, "epoch": 0.2592, "step": 648, "tokens_per_device": 10552 }, { "epoch": 0.2592, "loss_ce": 0.21086041629314423, "loss_lvr": 1.2027862071990967, "loss_mode_switch": 0.0, "loss_total": 0.3311390280723572, "step": 648 }, { "batch_size": 4, "epoch": 0.2592, "step": 648, "tokens_per_device": 3440 }, { "epoch": 0.2592, "loss_ce": 0.0958729162812233, "loss_lvr": 0.9881448149681091, "loss_mode_switch": 0.0, "loss_total": 0.1946873962879181, "step": 648 }, { "batch_size": 4, "epoch": 0.2592, "step": 648, "tokens_per_device": 1316 }, { "epoch": 0.2592, "loss_ce": 0.3641774356365204, "loss_lvr": 1.0420713424682617, "loss_mode_switch": 0.0, "loss_total": 0.4683845639228821, "step": 648 }, { "epoch": 0.2596, "grad_norm": 1.5515869855880737, "learning_rate": 8.680120041544106e-06, "loss": 0.3563, "step": 649 }, { "batch_size": 4, "epoch": 0.2596, "step": 649, "tokens_per_device": 9996 }, { "epoch": 0.2596, "loss_ce": 0.31865444779396057, "loss_lvr": 0.6685915589332581, "loss_mode_switch": 0.0, "loss_total": 0.3855136036872864, "step": 649 }, { "batch_size": 1, "epoch": 0.2596, "step": 649, "tokens_per_device": 5005 }, { "epoch": 0.2596, "loss_ce": 0.05857059732079506, "loss_lvr": 0.10187948495149612, "loss_mode_switch": 0.0, "loss_total": 0.06875854730606079, "step": 649 }, { "batch_size": 4, "epoch": 0.2596, "step": 649, "tokens_per_device": 9996 }, { "epoch": 0.2596, "loss_ce": 0.0624699667096138, "loss_lvr": 0.707707941532135, "loss_mode_switch": 0.0, "loss_total": 0.13324075937271118, "step": 649 }, { "batch_size": 4, "epoch": 0.2596, "step": 649, "tokens_per_device": 4224 }, { "epoch": 0.2596, "loss_ce": 0.26230788230895996, "loss_lvr": 0.871564507484436, "loss_mode_switch": 0.0, "loss_total": 0.3494643270969391, "step": 649 }, { "batch_size": 1, "epoch": 0.2596, "step": 649, "tokens_per_device": 6064 }, { "epoch": 0.2596, "loss_ce": 0.36706310510635376, "loss_lvr": 0.7165483236312866, "loss_mode_switch": 0.0, "loss_total": 0.43871793150901794, "step": 649 }, { "batch_size": 1, "epoch": 0.2596, "step": 649, "tokens_per_device": 4883 }, { "epoch": 0.2596, "loss_ce": 0.0004883524961769581, "loss_lvr": 0.5151474475860596, "loss_mode_switch": 0.0, "loss_total": 0.05200309678912163, "step": 649 }, { "batch_size": 4, "epoch": 0.2596, "step": 649, "tokens_per_device": 5560 }, { "epoch": 0.2596, "loss_ce": 0.2660026550292969, "loss_lvr": 0.9599370956420898, "loss_mode_switch": 0.0, "loss_total": 0.3619963526725769, "step": 649 }, { "batch_size": 1, "epoch": 0.2596, "step": 649, "tokens_per_device": 4887 }, { "epoch": 0.2596, "loss_ce": 0.005261681973934174, "loss_lvr": 0.4564438760280609, "loss_mode_switch": 0.0, "loss_total": 0.050906069576740265, "step": 649 }, { "epoch": 0.26, "grad_norm": 1.3419935703277588, "learning_rate": 8.675731968536004e-06, "loss": 0.3028, "step": 650 }, { "batch_size": 1, "epoch": 0.26, "step": 650, "tokens_per_device": 4907 }, { "epoch": 0.26, "loss_ce": 0.31796810030937195, "loss_lvr": 0.39894044399261475, "loss_mode_switch": 0.0, "loss_total": 0.3578621447086334, "step": 650 }, { "batch_size": 4, "epoch": 0.26, "step": 650, "tokens_per_device": 4316 }, { "epoch": 0.26, "loss_ce": 0.05101480334997177, "loss_lvr": 1.5516833066940308, "loss_mode_switch": 0.0, "loss_total": 0.20618313550949097, "step": 650 }, { "batch_size": 4, "epoch": 0.26, "step": 650, "tokens_per_device": 1724 }, { "epoch": 0.26, "loss_ce": 0.23393678665161133, "loss_lvr": 1.257378339767456, "loss_mode_switch": 0.0, "loss_total": 0.3596746325492859, "step": 650 }, { "batch_size": 4, "epoch": 0.26, "step": 650, "tokens_per_device": 11684 }, { "epoch": 0.26, "loss_ce": 0.003843392012640834, "loss_lvr": 0.5944463610649109, "loss_mode_switch": 0.0, "loss_total": 0.06328802555799484, "step": 650 }, { "batch_size": 4, "epoch": 0.26, "step": 650, "tokens_per_device": 5640 }, { "epoch": 0.26, "loss_ce": 0.5402956008911133, "loss_lvr": 0.8553517460823059, "loss_mode_switch": 0.0, "loss_total": 0.6258307695388794, "step": 650 }, { "batch_size": 1, "epoch": 0.26, "step": 650, "tokens_per_device": 4860 }, { "epoch": 0.26, "loss_ce": 0.0003580684424377978, "loss_lvr": 0.32090064883232117, "loss_mode_switch": 0.0, "loss_total": 0.03244813159108162, "step": 650 }, { "batch_size": 1, "epoch": 0.26, "step": 650, "tokens_per_device": 4859 }, { "epoch": 0.26, "loss_ce": 0.1439536064863205, "loss_lvr": 0.8000903725624084, "loss_mode_switch": 0.0, "loss_total": 0.22396263480186462, "step": 650 }, { "batch_size": 1, "epoch": 0.26, "step": 650, "tokens_per_device": 4870 }, { "epoch": 0.26, "loss_ce": 0.02410612255334854, "loss_lvr": 0.5786659717559814, "loss_mode_switch": 0.0, "loss_total": 0.08197271823883057, "step": 650 }, { "epoch": 0.2604, "grad_norm": 1.3377459049224854, "learning_rate": 8.671337726453126e-06, "loss": 0.2878, "step": 651 }, { "batch_size": 4, "epoch": 0.2604, "step": 651, "tokens_per_device": 6144 }, { "epoch": 0.2604, "loss_ce": 0.245545893907547, "loss_lvr": 0.8678510189056396, "loss_mode_switch": 0.0, "loss_total": 0.33233100175857544, "step": 651 }, { "batch_size": 4, "epoch": 0.2604, "step": 651, "tokens_per_device": 3536 }, { "epoch": 0.2604, "loss_ce": 0.23858126997947693, "loss_lvr": 1.0828150510787964, "loss_mode_switch": 0.0, "loss_total": 0.34686279296875, "step": 651 }, { "batch_size": 4, "epoch": 0.2604, "step": 651, "tokens_per_device": 1536 }, { "epoch": 0.2604, "loss_ce": 0.8668252825737, "loss_lvr": 1.259942650794983, "loss_mode_switch": 0.0, "loss_total": 0.9928195476531982, "step": 651 }, { "batch_size": 4, "epoch": 0.2604, "step": 651, "tokens_per_device": 4244 }, { "epoch": 0.2604, "loss_ce": 0.2583855986595154, "loss_lvr": 1.427232265472412, "loss_mode_switch": 0.0, "loss_total": 0.40110883116722107, "step": 651 }, { "batch_size": 1, "epoch": 0.2604, "step": 651, "tokens_per_device": 4853 }, { "epoch": 0.2604, "loss_ce": 0.14113974571228027, "loss_lvr": 0.7467222809791565, "loss_mode_switch": 0.0, "loss_total": 0.21581196784973145, "step": 651 }, { "batch_size": 4, "epoch": 0.2604, "step": 651, "tokens_per_device": 4496 }, { "epoch": 0.2604, "loss_ce": 0.16246291995048523, "loss_lvr": 0.8578051924705505, "loss_mode_switch": 0.0, "loss_total": 0.24824345111846924, "step": 651 }, { "batch_size": 4, "epoch": 0.2604, "step": 651, "tokens_per_device": 11344 }, { "epoch": 0.2604, "loss_ce": 0.026452740654349327, "loss_lvr": 0.642249345779419, "loss_mode_switch": 0.0, "loss_total": 0.09067767858505249, "step": 651 }, { "batch_size": 1, "epoch": 0.2604, "step": 651, "tokens_per_device": 5090 }, { "epoch": 0.2604, "loss_ce": 0.019693730399012566, "loss_lvr": 0.4876057207584381, "loss_mode_switch": 0.0, "loss_total": 0.0684543028473854, "step": 651 }, { "epoch": 0.2608, "grad_norm": 1.4691308736801147, "learning_rate": 8.666937322670443e-06, "loss": 0.3017, "step": 652 }, { "batch_size": 4, "epoch": 0.2608, "step": 652, "tokens_per_device": 5836 }, { "epoch": 0.2608, "loss_ce": 0.056574106216430664, "loss_lvr": 0.8976925015449524, "loss_mode_switch": 0.0, "loss_total": 0.14634335041046143, "step": 652 }, { "batch_size": 1, "epoch": 0.2608, "step": 652, "tokens_per_device": 5026 }, { "epoch": 0.2608, "loss_ce": 0.06065724045038223, "loss_lvr": 0.5786805748939514, "loss_mode_switch": 0.0, "loss_total": 0.11852529644966125, "step": 652 }, { "batch_size": 1, "epoch": 0.2608, "step": 652, "tokens_per_device": 5142 }, { "epoch": 0.2608, "loss_ce": 0.015638209879398346, "loss_lvr": 0.5669052600860596, "loss_mode_switch": 0.0, "loss_total": 0.07232873141765594, "step": 652 }, { "batch_size": 1, "epoch": 0.2608, "step": 652, "tokens_per_device": 4866 }, { "epoch": 0.2608, "loss_ce": 0.023113742470741272, "loss_lvr": 0.2794187366962433, "loss_mode_switch": 0.0, "loss_total": 0.05105561763048172, "step": 652 }, { "batch_size": 1, "epoch": 0.2608, "step": 652, "tokens_per_device": 4965 }, { "epoch": 0.2608, "loss_ce": 0.25850698351860046, "loss_lvr": 0.7839319705963135, "loss_mode_switch": 0.0, "loss_total": 0.33690017461776733, "step": 652 }, { "batch_size": 4, "epoch": 0.2608, "step": 652, "tokens_per_device": 3724 }, { "epoch": 0.2608, "loss_ce": 0.17851531505584717, "loss_lvr": 0.9600979685783386, "loss_mode_switch": 0.0, "loss_total": 0.27452510595321655, "step": 652 }, { "batch_size": 4, "epoch": 0.2608, "step": 652, "tokens_per_device": 11980 }, { "epoch": 0.2608, "loss_ce": 0.01607424020767212, "loss_lvr": 0.4343527853488922, "loss_mode_switch": 0.0, "loss_total": 0.0595095194876194, "step": 652 }, { "batch_size": 4, "epoch": 0.2608, "step": 652, "tokens_per_device": 6632 }, { "epoch": 0.2608, "loss_ce": 0.07040150463581085, "loss_lvr": 0.8053926825523376, "loss_mode_switch": 0.0, "loss_total": 0.15094077587127686, "step": 652 }, { "epoch": 0.2612, "grad_norm": 1.3394010066986084, "learning_rate": 8.662530764573264e-06, "loss": 0.3263, "step": 653 }, { "batch_size": 4, "epoch": 0.2612, "step": 653, "tokens_per_device": 10552 }, { "epoch": 0.2612, "loss_ce": 0.07474226504564285, "loss_lvr": 0.7384806275367737, "loss_mode_switch": 0.0, "loss_total": 0.1485903263092041, "step": 653 }, { "batch_size": 4, "epoch": 0.2612, "step": 653, "tokens_per_device": 3804 }, { "epoch": 0.2612, "loss_ce": 0.42897629737854004, "loss_lvr": 1.102157711982727, "loss_mode_switch": 0.0, "loss_total": 0.5391920804977417, "step": 653 }, { "batch_size": 4, "epoch": 0.2612, "step": 653, "tokens_per_device": 4452 }, { "epoch": 0.2612, "loss_ce": 0.3399715721607208, "loss_lvr": 0.8524060845375061, "loss_mode_switch": 0.0, "loss_total": 0.42521217465400696, "step": 653 }, { "batch_size": 1, "epoch": 0.2612, "step": 653, "tokens_per_device": 4787 }, { "epoch": 0.2612, "loss_ce": 1.737000823020935, "loss_lvr": 0.46576061844825745, "loss_mode_switch": 0.0, "loss_total": 1.7835768461227417, "step": 653 }, { "batch_size": 4, "epoch": 0.2612, "step": 653, "tokens_per_device": 4452 }, { "epoch": 0.2612, "loss_ce": 0.3692236840724945, "loss_lvr": 0.7311843633651733, "loss_mode_switch": 0.0, "loss_total": 0.4423421323299408, "step": 653 }, { "batch_size": 4, "epoch": 0.2612, "step": 653, "tokens_per_device": 2680 }, { "epoch": 0.2612, "loss_ce": 0.2876198887825012, "loss_lvr": 0.7744967341423035, "loss_mode_switch": 0.0, "loss_total": 0.36506956815719604, "step": 653 }, { "batch_size": 4, "epoch": 0.2612, "step": 653, "tokens_per_device": 6812 }, { "epoch": 0.2612, "loss_ce": 0.3825449049472809, "loss_lvr": 0.7298076152801514, "loss_mode_switch": 0.0, "loss_total": 0.455525666475296, "step": 653 }, { "batch_size": 1, "epoch": 0.2612, "step": 653, "tokens_per_device": 5201 }, { "epoch": 0.2612, "loss_ce": 0.04993105307221413, "loss_lvr": 0.7210831046104431, "loss_mode_switch": 0.0, "loss_total": 0.12203936278820038, "step": 653 }, { "epoch": 0.2616, "grad_norm": 1.5822778940200806, "learning_rate": 8.658118059557233e-06, "loss": 0.3253, "step": 654 }, { "batch_size": 1, "epoch": 0.2616, "step": 654, "tokens_per_device": 5312 }, { "epoch": 0.2616, "loss_ce": 0.5326700210571289, "loss_lvr": 0.7300371527671814, "loss_mode_switch": 0.0, "loss_total": 0.6056737303733826, "step": 654 }, { "batch_size": 4, "epoch": 0.2616, "step": 654, "tokens_per_device": 4380 }, { "epoch": 0.2616, "loss_ce": 0.1705002784729004, "loss_lvr": 1.0038777589797974, "loss_mode_switch": 0.0, "loss_total": 0.2708880603313446, "step": 654 }, { "batch_size": 4, "epoch": 0.2616, "step": 654, "tokens_per_device": 2480 }, { "epoch": 0.2616, "loss_ce": 0.5365329384803772, "loss_lvr": 1.051067590713501, "loss_mode_switch": 0.0, "loss_total": 0.6416397094726562, "step": 654 }, { "batch_size": 4, "epoch": 0.2616, "step": 654, "tokens_per_device": 1676 }, { "epoch": 0.2616, "loss_ce": 0.3920504152774811, "loss_lvr": 1.0807360410690308, "loss_mode_switch": 0.0, "loss_total": 0.5001240372657776, "step": 654 }, { "batch_size": 4, "epoch": 0.2616, "step": 654, "tokens_per_device": 1316 }, { "epoch": 0.2616, "loss_ce": 0.3592335283756256, "loss_lvr": 1.134606122970581, "loss_mode_switch": 0.0, "loss_total": 0.47269415855407715, "step": 654 }, { "batch_size": 4, "epoch": 0.2616, "step": 654, "tokens_per_device": 5008 }, { "epoch": 0.2616, "loss_ce": 0.006841795984655619, "loss_lvr": 0.862320601940155, "loss_mode_switch": 0.0, "loss_total": 0.09307385236024857, "step": 654 }, { "batch_size": 4, "epoch": 0.2616, "step": 654, "tokens_per_device": 4220 }, { "epoch": 0.2616, "loss_ce": 0.21355965733528137, "loss_lvr": 1.0094503164291382, "loss_mode_switch": 0.0, "loss_total": 0.3145046830177307, "step": 654 }, { "batch_size": 4, "epoch": 0.2616, "step": 654, "tokens_per_device": 1404 }, { "epoch": 0.2616, "loss_ce": 0.6032416820526123, "loss_lvr": 1.6700174808502197, "loss_mode_switch": 0.0, "loss_total": 0.7702434062957764, "step": 654 }, { "epoch": 0.262, "grad_norm": 1.4908506870269775, "learning_rate": 8.653699215028298e-06, "loss": 0.3028, "step": 655 }, { "batch_size": 1, "epoch": 0.262, "step": 655, "tokens_per_device": 4892 }, { "epoch": 0.262, "loss_ce": 0.17043110728263855, "loss_lvr": 0.8230219483375549, "loss_mode_switch": 0.0, "loss_total": 0.2527332901954651, "step": 655 }, { "batch_size": 1, "epoch": 0.262, "step": 655, "tokens_per_device": 4798 }, { "epoch": 0.262, "loss_ce": 0.19156765937805176, "loss_lvr": 0.4288663864135742, "loss_mode_switch": 0.0, "loss_total": 0.23445430397987366, "step": 655 }, { "batch_size": 1, "epoch": 0.262, "step": 655, "tokens_per_device": 4736 }, { "epoch": 0.262, "loss_ce": 0.016927555203437805, "loss_lvr": 0.278638631105423, "loss_mode_switch": 0.0, "loss_total": 0.04479141905903816, "step": 655 }, { "batch_size": 4, "epoch": 0.262, "step": 655, "tokens_per_device": 5756 }, { "epoch": 0.262, "loss_ce": 0.32971081137657166, "loss_lvr": 0.722014844417572, "loss_mode_switch": 0.0, "loss_total": 0.40191230177879333, "step": 655 }, { "batch_size": 4, "epoch": 0.262, "step": 655, "tokens_per_device": 4296 }, { "epoch": 0.262, "loss_ce": 0.3498460054397583, "loss_lvr": 0.9099009037017822, "loss_mode_switch": 0.0, "loss_total": 0.440836101770401, "step": 655 }, { "batch_size": 4, "epoch": 0.262, "step": 655, "tokens_per_device": 4344 }, { "epoch": 0.262, "loss_ce": 0.3751075565814972, "loss_lvr": 0.8961731791496277, "loss_mode_switch": 0.0, "loss_total": 0.4647248685359955, "step": 655 }, { "batch_size": 4, "epoch": 0.262, "step": 655, "tokens_per_device": 3764 }, { "epoch": 0.262, "loss_ce": 0.2544381618499756, "loss_lvr": 0.9659058451652527, "loss_mode_switch": 0.0, "loss_total": 0.3510287404060364, "step": 655 }, { "batch_size": 1, "epoch": 0.262, "step": 655, "tokens_per_device": 5023 }, { "epoch": 0.262, "loss_ce": 0.7701624035835266, "loss_lvr": 0.5874338150024414, "loss_mode_switch": 0.0, "loss_total": 0.8289057612419128, "step": 655 }, { "epoch": 0.2624, "grad_norm": 1.3018633127212524, "learning_rate": 8.649274238402723e-06, "loss": 0.2838, "step": 656 }, { "batch_size": 4, "epoch": 0.2624, "step": 656, "tokens_per_device": 3372 }, { "epoch": 0.2624, "loss_ce": 0.2884586751461029, "loss_lvr": 0.6925889849662781, "loss_mode_switch": 0.0, "loss_total": 0.3577175736427307, "step": 656 }, { "batch_size": 4, "epoch": 0.2624, "step": 656, "tokens_per_device": 4420 }, { "epoch": 0.2624, "loss_ce": 0.20738548040390015, "loss_lvr": 1.1247138977050781, "loss_mode_switch": 0.0, "loss_total": 0.3198568820953369, "step": 656 }, { "batch_size": 1, "epoch": 0.2624, "step": 656, "tokens_per_device": 5134 }, { "epoch": 0.2624, "loss_ce": 0.012924359180033207, "loss_lvr": 0.3626862168312073, "loss_mode_switch": 0.0, "loss_total": 0.04919297993183136, "step": 656 }, { "batch_size": 4, "epoch": 0.2624, "step": 656, "tokens_per_device": 1608 }, { "epoch": 0.2624, "loss_ce": 0.46366673707962036, "loss_lvr": 1.0464240312576294, "loss_mode_switch": 0.0, "loss_total": 0.5683091282844543, "step": 656 }, { "batch_size": 4, "epoch": 0.2624, "step": 656, "tokens_per_device": 3792 }, { "epoch": 0.2624, "loss_ce": 0.4548511505126953, "loss_lvr": 0.9314236044883728, "loss_mode_switch": 0.0, "loss_total": 0.547993540763855, "step": 656 }, { "batch_size": 4, "epoch": 0.2624, "step": 656, "tokens_per_device": 5884 }, { "epoch": 0.2624, "loss_ce": 0.39335891604423523, "loss_lvr": 0.8129990696907043, "loss_mode_switch": 0.0, "loss_total": 0.4746588170528412, "step": 656 }, { "batch_size": 4, "epoch": 0.2624, "step": 656, "tokens_per_device": 1180 }, { "epoch": 0.2624, "loss_ce": 0.051413752138614655, "loss_lvr": 1.190289855003357, "loss_mode_switch": 0.0, "loss_total": 0.17044273018836975, "step": 656 }, { "batch_size": 4, "epoch": 0.2624, "step": 656, "tokens_per_device": 4768 }, { "epoch": 0.2624, "loss_ce": 0.3773551881313324, "loss_lvr": 0.8990882635116577, "loss_mode_switch": 0.0, "loss_total": 0.4672640264034271, "step": 656 }, { "epoch": 0.2628, "grad_norm": 1.447235345840454, "learning_rate": 8.644843137107058e-06, "loss": 0.3039, "step": 657 }, { "batch_size": 1, "epoch": 0.2628, "step": 657, "tokens_per_device": 4931 }, { "epoch": 0.2628, "loss_ce": 0.07951882481575012, "loss_lvr": 0.528560996055603, "loss_mode_switch": 0.0, "loss_total": 0.13237492740154266, "step": 657 }, { "batch_size": 4, "epoch": 0.2628, "step": 657, "tokens_per_device": 3844 }, { "epoch": 0.2628, "loss_ce": 0.1164216473698616, "loss_lvr": 1.0720213651657104, "loss_mode_switch": 0.0, "loss_total": 0.22362378239631653, "step": 657 }, { "batch_size": 4, "epoch": 0.2628, "step": 657, "tokens_per_device": 6944 }, { "epoch": 0.2628, "loss_ce": 0.24460966885089874, "loss_lvr": 0.9798447489738464, "loss_mode_switch": 0.0, "loss_total": 0.3425941467285156, "step": 657 }, { "batch_size": 4, "epoch": 0.2628, "step": 657, "tokens_per_device": 12440 }, { "epoch": 0.2628, "loss_ce": 0.3757644593715668, "loss_lvr": 0.8708094358444214, "loss_mode_switch": 0.0, "loss_total": 0.46284541487693787, "step": 657 }, { "batch_size": 4, "epoch": 0.2628, "step": 657, "tokens_per_device": 4304 }, { "epoch": 0.2628, "loss_ce": 0.6494417786598206, "loss_lvr": 1.1826733350753784, "loss_mode_switch": 0.0, "loss_total": 0.7677091360092163, "step": 657 }, { "batch_size": 1, "epoch": 0.2628, "step": 657, "tokens_per_device": 4988 }, { "epoch": 0.2628, "loss_ce": 0.0005933195352554321, "loss_lvr": 0.4748222827911377, "loss_mode_switch": 0.0, "loss_total": 0.04807554930448532, "step": 657 }, { "batch_size": 1, "epoch": 0.2628, "step": 657, "tokens_per_device": 5172 }, { "epoch": 0.2628, "loss_ce": 0.014629114419221878, "loss_lvr": 0.5606065988540649, "loss_mode_switch": 0.0, "loss_total": 0.07068977504968643, "step": 657 }, { "batch_size": 1, "epoch": 0.2628, "step": 657, "tokens_per_device": 5178 }, { "epoch": 0.2628, "loss_ce": 0.010076817125082016, "loss_lvr": 0.4884850084781647, "loss_mode_switch": 0.0, "loss_total": 0.0589253194630146, "step": 657 }, { "epoch": 0.2632, "grad_norm": 1.2627736330032349, "learning_rate": 8.640405918578134e-06, "loss": 0.2674, "step": 658 }, { "batch_size": 4, "epoch": 0.2632, "step": 658, "tokens_per_device": 5436 }, { "epoch": 0.2632, "loss_ce": 0.3903093934059143, "loss_lvr": 0.8243808150291443, "loss_mode_switch": 0.0, "loss_total": 0.47274747490882874, "step": 658 }, { "batch_size": 4, "epoch": 0.2632, "step": 658, "tokens_per_device": 4956 }, { "epoch": 0.2632, "loss_ce": 0.6917099952697754, "loss_lvr": 1.0257203578948975, "loss_mode_switch": 0.0, "loss_total": 0.7942820191383362, "step": 658 }, { "batch_size": 1, "epoch": 0.2632, "step": 658, "tokens_per_device": 4856 }, { "epoch": 0.2632, "loss_ce": 0.0031341700814664364, "loss_lvr": 0.5126025676727295, "loss_mode_switch": 0.0, "loss_total": 0.0543944276869297, "step": 658 }, { "batch_size": 4, "epoch": 0.2632, "step": 658, "tokens_per_device": 3424 }, { "epoch": 0.2632, "loss_ce": 0.25799423456192017, "loss_lvr": 0.876502513885498, "loss_mode_switch": 0.0, "loss_total": 0.345644474029541, "step": 658 }, { "batch_size": 4, "epoch": 0.2632, "step": 658, "tokens_per_device": 5100 }, { "epoch": 0.2632, "loss_ce": 0.008819004520773888, "loss_lvr": 1.0925589799880981, "loss_mode_switch": 0.0, "loss_total": 0.11807490885257721, "step": 658 }, { "batch_size": 4, "epoch": 0.2632, "step": 658, "tokens_per_device": 1484 }, { "epoch": 0.2632, "loss_ce": 0.4930582642555237, "loss_lvr": 0.9605764746665955, "loss_mode_switch": 0.0, "loss_total": 0.5891159176826477, "step": 658 }, { "batch_size": 1, "epoch": 0.2632, "step": 658, "tokens_per_device": 4950 }, { "epoch": 0.2632, "loss_ce": 0.05762581154704094, "loss_lvr": 0.5311366319656372, "loss_mode_switch": 0.0, "loss_total": 0.11073947697877884, "step": 658 }, { "batch_size": 4, "epoch": 0.2632, "step": 658, "tokens_per_device": 1512 }, { "epoch": 0.2632, "loss_ce": 0.2031368762254715, "loss_lvr": 1.1010658740997314, "loss_mode_switch": 0.0, "loss_total": 0.31324344873428345, "step": 658 }, { "epoch": 0.2636, "grad_norm": 1.2610054016113281, "learning_rate": 8.635962590263047e-06, "loss": 0.3234, "step": 659 }, { "batch_size": 4, "epoch": 0.2636, "step": 659, "tokens_per_device": 1184 }, { "epoch": 0.2636, "loss_ce": 0.20147189497947693, "loss_lvr": 1.075447678565979, "loss_mode_switch": 0.0, "loss_total": 0.3090166747570038, "step": 659 }, { "batch_size": 1, "epoch": 0.2636, "step": 659, "tokens_per_device": 5156 }, { "epoch": 0.2636, "loss_ce": 0.0027026082389056683, "loss_lvr": 0.6430961489677429, "loss_mode_switch": 0.0, "loss_total": 0.0670122280716896, "step": 659 }, { "batch_size": 1, "epoch": 0.2636, "step": 659, "tokens_per_device": 4748 }, { "epoch": 0.2636, "loss_ce": 0.0033229582477360964, "loss_lvr": 0.3665647804737091, "loss_mode_switch": 0.0, "loss_total": 0.03997943922877312, "step": 659 }, { "batch_size": 4, "epoch": 0.2636, "step": 659, "tokens_per_device": 2636 }, { "epoch": 0.2636, "loss_ce": 0.2822408080101013, "loss_lvr": 0.8809348940849304, "loss_mode_switch": 0.0, "loss_total": 0.37033429741859436, "step": 659 }, { "batch_size": 1, "epoch": 0.2636, "step": 659, "tokens_per_device": 4636 }, { "epoch": 0.2636, "loss_ce": 0.03371625766158104, "loss_lvr": 0.41949263215065, "loss_mode_switch": 0.0, "loss_total": 0.07566551864147186, "step": 659 }, { "batch_size": 4, "epoch": 0.2636, "step": 659, "tokens_per_device": 4296 }, { "epoch": 0.2636, "loss_ce": 0.1379195898771286, "loss_lvr": 0.7873704433441162, "loss_mode_switch": 0.0, "loss_total": 0.2166566252708435, "step": 659 }, { "batch_size": 1, "epoch": 0.2636, "step": 659, "tokens_per_device": 4906 }, { "epoch": 0.2636, "loss_ce": 0.023525137454271317, "loss_lvr": 0.4898447096347809, "loss_mode_switch": 0.0, "loss_total": 0.07250960916280746, "step": 659 }, { "batch_size": 4, "epoch": 0.2636, "step": 659, "tokens_per_device": 3836 }, { "epoch": 0.2636, "loss_ce": 0.031386200338602066, "loss_lvr": 1.1155959367752075, "loss_mode_switch": 0.0, "loss_total": 0.142945796251297, "step": 659 }, { "epoch": 0.264, "grad_norm": 1.5284855365753174, "learning_rate": 8.63151315961915e-06, "loss": 0.3531, "step": 660 }, { "batch_size": 4, "epoch": 0.264, "step": 660, "tokens_per_device": 3828 }, { "epoch": 0.264, "loss_ce": 0.022032033652067184, "loss_lvr": 1.1505852937698364, "loss_mode_switch": 0.0, "loss_total": 0.1370905637741089, "step": 660 }, { "batch_size": 4, "epoch": 0.264, "step": 660, "tokens_per_device": 1716 }, { "epoch": 0.264, "loss_ce": 0.3236202895641327, "loss_lvr": 0.8980565667152405, "loss_mode_switch": 0.0, "loss_total": 0.4134259521961212, "step": 660 }, { "batch_size": 4, "epoch": 0.264, "step": 660, "tokens_per_device": 4204 }, { "epoch": 0.264, "loss_ce": 0.2899109125137329, "loss_lvr": 0.9978193044662476, "loss_mode_switch": 0.0, "loss_total": 0.38969284296035767, "step": 660 }, { "batch_size": 4, "epoch": 0.264, "step": 660, "tokens_per_device": 4388 }, { "epoch": 0.264, "loss_ce": 0.05371186137199402, "loss_lvr": 0.7171339392662048, "loss_mode_switch": 0.0, "loss_total": 0.12542524933815002, "step": 660 }, { "batch_size": 1, "epoch": 0.264, "step": 660, "tokens_per_device": 5318 }, { "epoch": 0.264, "loss_ce": 0.0008138183038681746, "loss_lvr": 0.45390385389328003, "loss_mode_switch": 0.0, "loss_total": 0.046204205602407455, "step": 660 }, { "batch_size": 4, "epoch": 0.264, "step": 660, "tokens_per_device": 4660 }, { "epoch": 0.264, "loss_ce": 0.3469410538673401, "loss_lvr": 0.8558708429336548, "loss_mode_switch": 0.0, "loss_total": 0.43252813816070557, "step": 660 }, { "batch_size": 4, "epoch": 0.264, "step": 660, "tokens_per_device": 1232 }, { "epoch": 0.264, "loss_ce": 0.3380487561225891, "loss_lvr": 1.0824735164642334, "loss_mode_switch": 0.0, "loss_total": 0.4462960958480835, "step": 660 }, { "batch_size": 4, "epoch": 0.264, "step": 660, "tokens_per_device": 12924 }, { "epoch": 0.264, "loss_ce": 0.20333147048950195, "loss_lvr": 0.8355860710144043, "loss_mode_switch": 0.0, "loss_total": 0.28689008951187134, "step": 660 }, { "epoch": 0.2644, "grad_norm": 1.2544678449630737, "learning_rate": 8.627057634114036e-06, "loss": 0.2728, "step": 661 }, { "batch_size": 1, "epoch": 0.2644, "step": 661, "tokens_per_device": 4887 }, { "epoch": 0.2644, "loss_ce": 0.008053230121731758, "loss_lvr": 0.39947929978370667, "loss_mode_switch": 0.0, "loss_total": 0.048001162707805634, "step": 661 }, { "batch_size": 1, "epoch": 0.2644, "step": 661, "tokens_per_device": 4931 }, { "epoch": 0.2644, "loss_ce": 0.019101373851299286, "loss_lvr": 0.770893394947052, "loss_mode_switch": 0.0, "loss_total": 0.09619071334600449, "step": 661 }, { "batch_size": 4, "epoch": 0.2644, "step": 661, "tokens_per_device": 9768 }, { "epoch": 0.2644, "loss_ce": 0.17333552241325378, "loss_lvr": 0.5161200761795044, "loss_mode_switch": 0.0, "loss_total": 0.22494752705097198, "step": 661 }, { "batch_size": 4, "epoch": 0.2644, "step": 661, "tokens_per_device": 5656 }, { "epoch": 0.2644, "loss_ce": 0.830742359161377, "loss_lvr": 0.9131558537483215, "loss_mode_switch": 0.0, "loss_total": 0.9220579266548157, "step": 661 }, { "batch_size": 1, "epoch": 0.2644, "step": 661, "tokens_per_device": 4887 }, { "epoch": 0.2644, "loss_ce": 0.018504273146390915, "loss_lvr": 0.21369832754135132, "loss_mode_switch": 0.0, "loss_total": 0.039874106645584106, "step": 661 }, { "batch_size": 4, "epoch": 0.2644, "step": 661, "tokens_per_device": 3520 }, { "epoch": 0.2644, "loss_ce": 0.6311362981796265, "loss_lvr": 0.8526965975761414, "loss_mode_switch": 0.0, "loss_total": 0.716405987739563, "step": 661 }, { "batch_size": 4, "epoch": 0.2644, "step": 661, "tokens_per_device": 4236 }, { "epoch": 0.2644, "loss_ce": 0.33619460463523865, "loss_lvr": 1.6906567811965942, "loss_mode_switch": 0.0, "loss_total": 0.5052602887153625, "step": 661 }, { "batch_size": 4, "epoch": 0.2644, "step": 661, "tokens_per_device": 4552 }, { "epoch": 0.2644, "loss_ce": 0.23986172676086426, "loss_lvr": 0.8853126764297485, "loss_mode_switch": 0.0, "loss_total": 0.32839298248291016, "step": 661 }, { "epoch": 0.2648, "grad_norm": 1.3871418237686157, "learning_rate": 8.622596021225524e-06, "loss": 0.3353, "step": 662 }, { "batch_size": 1, "epoch": 0.2648, "step": 662, "tokens_per_device": 5099 }, { "epoch": 0.2648, "loss_ce": 0.007441162131726742, "loss_lvr": 0.28411900997161865, "loss_mode_switch": 0.0, "loss_total": 0.035853061825037, "step": 662 }, { "batch_size": 1, "epoch": 0.2648, "step": 662, "tokens_per_device": 5297 }, { "epoch": 0.2648, "loss_ce": 0.3542742133140564, "loss_lvr": 0.49851059913635254, "loss_mode_switch": 0.0, "loss_total": 0.40412527322769165, "step": 662 }, { "batch_size": 1, "epoch": 0.2648, "step": 662, "tokens_per_device": 5115 }, { "epoch": 0.2648, "loss_ce": 0.026820562779903412, "loss_lvr": 0.3462998569011688, "loss_mode_switch": 0.0, "loss_total": 0.061450548470020294, "step": 662 }, { "batch_size": 4, "epoch": 0.2648, "step": 662, "tokens_per_device": 1248 }, { "epoch": 0.2648, "loss_ce": 0.24062637984752655, "loss_lvr": 1.0844558477401733, "loss_mode_switch": 0.0, "loss_total": 0.3490719795227051, "step": 662 }, { "batch_size": 4, "epoch": 0.2648, "step": 662, "tokens_per_device": 4940 }, { "epoch": 0.2648, "loss_ce": 0.20909462869167328, "loss_lvr": 0.9089977741241455, "loss_mode_switch": 0.0, "loss_total": 0.29999440908432007, "step": 662 }, { "batch_size": 4, "epoch": 0.2648, "step": 662, "tokens_per_device": 1512 }, { "epoch": 0.2648, "loss_ce": 0.046542584896087646, "loss_lvr": 1.2417097091674805, "loss_mode_switch": 0.0, "loss_total": 0.17071355879306793, "step": 662 }, { "batch_size": 4, "epoch": 0.2648, "step": 662, "tokens_per_device": 3956 }, { "epoch": 0.2648, "loss_ce": 0.17342275381088257, "loss_lvr": 0.8447925448417664, "loss_mode_switch": 0.0, "loss_total": 0.25790202617645264, "step": 662 }, { "batch_size": 4, "epoch": 0.2648, "step": 662, "tokens_per_device": 4164 }, { "epoch": 0.2648, "loss_ce": 0.16188554465770721, "loss_lvr": 1.0431824922561646, "loss_mode_switch": 0.0, "loss_total": 0.26620379090309143, "step": 662 }, { "epoch": 0.2652, "grad_norm": 1.2513740062713623, "learning_rate": 8.618128328441655e-06, "loss": 0.2938, "step": 663 }, { "batch_size": 1, "epoch": 0.2652, "step": 663, "tokens_per_device": 4827 }, { "epoch": 0.2652, "loss_ce": 0.00212624273262918, "loss_lvr": 0.3420798182487488, "loss_mode_switch": 0.0, "loss_total": 0.03633422404527664, "step": 663 }, { "batch_size": 4, "epoch": 0.2652, "step": 663, "tokens_per_device": 4216 }, { "epoch": 0.2652, "loss_ce": 0.07354764640331268, "loss_lvr": 1.14556086063385, "loss_mode_switch": 0.0, "loss_total": 0.18810373544692993, "step": 663 }, { "batch_size": 1, "epoch": 0.2652, "step": 663, "tokens_per_device": 5153 }, { "epoch": 0.2652, "loss_ce": 0.0008695517899468541, "loss_lvr": 0.5475144982337952, "loss_mode_switch": 0.0, "loss_total": 0.05562100559473038, "step": 663 }, { "batch_size": 1, "epoch": 0.2652, "step": 663, "tokens_per_device": 4876 }, { "epoch": 0.2652, "loss_ce": 0.004052230156958103, "loss_lvr": 0.23424983024597168, "loss_mode_switch": 0.0, "loss_total": 0.027477212250232697, "step": 663 }, { "batch_size": 4, "epoch": 0.2652, "step": 663, "tokens_per_device": 3360 }, { "epoch": 0.2652, "loss_ce": 0.3919314742088318, "loss_lvr": 0.8907777667045593, "loss_mode_switch": 0.0, "loss_total": 0.48100924491882324, "step": 663 }, { "batch_size": 1, "epoch": 0.2652, "step": 663, "tokens_per_device": 4369 }, { "epoch": 0.2652, "loss_ce": 0.07997176796197891, "loss_lvr": 0.7793034315109253, "loss_mode_switch": 0.0, "loss_total": 0.15790212154388428, "step": 663 }, { "batch_size": 4, "epoch": 0.2652, "step": 663, "tokens_per_device": 3860 }, { "epoch": 0.2652, "loss_ce": 0.7921644449234009, "loss_lvr": 1.0156437158584595, "loss_mode_switch": 0.0, "loss_total": 0.8937287926673889, "step": 663 }, { "batch_size": 4, "epoch": 0.2652, "step": 663, "tokens_per_device": 4220 }, { "epoch": 0.2652, "loss_ce": 0.010794859379529953, "loss_lvr": 0.8253479599952698, "loss_mode_switch": 0.0, "loss_total": 0.09332965314388275, "step": 663 }, { "epoch": 0.2656, "grad_norm": 1.312677025794983, "learning_rate": 8.613654563260673e-06, "loss": 0.2681, "step": 664 }, { "batch_size": 4, "epoch": 0.2656, "step": 664, "tokens_per_device": 1332 }, { "epoch": 0.2656, "loss_ce": 0.20233289897441864, "loss_lvr": 1.305483341217041, "loss_mode_switch": 0.0, "loss_total": 0.33288124203681946, "step": 664 }, { "batch_size": 4, "epoch": 0.2656, "step": 664, "tokens_per_device": 2720 }, { "epoch": 0.2656, "loss_ce": 0.488679438829422, "loss_lvr": 0.8969948887825012, "loss_mode_switch": 0.0, "loss_total": 0.5783789157867432, "step": 664 }, { "batch_size": 1, "epoch": 0.2656, "step": 664, "tokens_per_device": 5115 }, { "epoch": 0.2656, "loss_ce": 0.1508965790271759, "loss_lvr": 0.2252587527036667, "loss_mode_switch": 0.0, "loss_total": 0.1734224557876587, "step": 664 }, { "batch_size": 1, "epoch": 0.2656, "step": 664, "tokens_per_device": 5154 }, { "epoch": 0.2656, "loss_ce": 0.017996134236454964, "loss_lvr": 0.8312119245529175, "loss_mode_switch": 0.0, "loss_total": 0.1011173278093338, "step": 664 }, { "batch_size": 4, "epoch": 0.2656, "step": 664, "tokens_per_device": 2660 }, { "epoch": 0.2656, "loss_ce": 0.33023616671562195, "loss_lvr": 0.8614475727081299, "loss_mode_switch": 0.0, "loss_total": 0.41638094186782837, "step": 664 }, { "batch_size": 4, "epoch": 0.2656, "step": 664, "tokens_per_device": 11744 }, { "epoch": 0.2656, "loss_ce": 0.3857631981372833, "loss_lvr": 1.0033955574035645, "loss_mode_switch": 0.0, "loss_total": 0.48610275983810425, "step": 664 }, { "batch_size": 4, "epoch": 0.2656, "step": 664, "tokens_per_device": 1376 }, { "epoch": 0.2656, "loss_ce": 0.33911871910095215, "loss_lvr": 1.1295344829559326, "loss_mode_switch": 0.0, "loss_total": 0.4520721733570099, "step": 664 }, { "batch_size": 1, "epoch": 0.2656, "step": 664, "tokens_per_device": 5401 }, { "epoch": 0.2656, "loss_ce": 0.18010510504245758, "loss_lvr": 0.4959712624549866, "loss_mode_switch": 0.0, "loss_total": 0.22970223426818848, "step": 664 }, { "epoch": 0.266, "grad_norm": 1.323895812034607, "learning_rate": 8.609174733191012e-06, "loss": 0.3077, "step": 665 }, { "batch_size": 1, "epoch": 0.266, "step": 665, "tokens_per_device": 4696 }, { "epoch": 0.266, "loss_ce": 0.16776226460933685, "loss_lvr": 0.46990230679512024, "loss_mode_switch": 0.0, "loss_total": 0.21475249528884888, "step": 665 }, { "batch_size": 4, "epoch": 0.266, "step": 665, "tokens_per_device": 4328 }, { "epoch": 0.266, "loss_ce": 0.34936392307281494, "loss_lvr": 0.7630934119224548, "loss_mode_switch": 0.0, "loss_total": 0.4256732761859894, "step": 665 }, { "batch_size": 4, "epoch": 0.266, "step": 665, "tokens_per_device": 2524 }, { "epoch": 0.266, "loss_ce": 0.27812498807907104, "loss_lvr": 0.9833506345748901, "loss_mode_switch": 0.0, "loss_total": 0.3764600455760956, "step": 665 }, { "batch_size": 1, "epoch": 0.266, "step": 665, "tokens_per_device": 4895 }, { "epoch": 0.266, "loss_ce": 0.07098031789064407, "loss_lvr": 1.4651539325714111, "loss_mode_switch": 0.0, "loss_total": 0.21749570965766907, "step": 665 }, { "batch_size": 4, "epoch": 0.266, "step": 665, "tokens_per_device": 1484 }, { "epoch": 0.266, "loss_ce": 0.14149275422096252, "loss_lvr": 0.8597724437713623, "loss_mode_switch": 0.0, "loss_total": 0.2274700105190277, "step": 665 }, { "batch_size": 4, "epoch": 0.266, "step": 665, "tokens_per_device": 4240 }, { "epoch": 0.266, "loss_ce": 0.7444583773612976, "loss_lvr": 0.7666944861412048, "loss_mode_switch": 0.0, "loss_total": 0.8211278319358826, "step": 665 }, { "batch_size": 1, "epoch": 0.266, "step": 665, "tokens_per_device": 7552 }, { "epoch": 0.266, "loss_ce": 0.013179227709770203, "loss_lvr": 0.32303035259246826, "loss_mode_switch": 0.0, "loss_total": 0.04548226296901703, "step": 665 }, { "batch_size": 1, "epoch": 0.266, "step": 665, "tokens_per_device": 4900 }, { "epoch": 0.266, "loss_ce": 0.13710111379623413, "loss_lvr": 1.3260782957077026, "loss_mode_switch": 0.0, "loss_total": 0.26970893144607544, "step": 665 }, { "epoch": 0.2664, "grad_norm": 1.3150343894958496, "learning_rate": 8.604688845751283e-06, "loss": 0.3048, "step": 666 }, { "batch_size": 4, "epoch": 0.2664, "step": 666, "tokens_per_device": 4544 }, { "epoch": 0.2664, "loss_ce": 0.12820373475551605, "loss_lvr": 0.9440925717353821, "loss_mode_switch": 0.0, "loss_total": 0.22261299192905426, "step": 666 }, { "batch_size": 4, "epoch": 0.2664, "step": 666, "tokens_per_device": 3364 }, { "epoch": 0.2664, "loss_ce": 0.07392386347055435, "loss_lvr": 0.951583743095398, "loss_mode_switch": 0.0, "loss_total": 0.16908223927021027, "step": 666 }, { "batch_size": 4, "epoch": 0.2664, "step": 666, "tokens_per_device": 4276 }, { "epoch": 0.2664, "loss_ce": 0.06666294485330582, "loss_lvr": 1.1901187896728516, "loss_mode_switch": 0.0, "loss_total": 0.18567481637001038, "step": 666 }, { "batch_size": 4, "epoch": 0.2664, "step": 666, "tokens_per_device": 4028 }, { "epoch": 0.2664, "loss_ce": 0.14253175258636475, "loss_lvr": 0.7740097045898438, "loss_mode_switch": 0.0, "loss_total": 0.21993273496627808, "step": 666 }, { "batch_size": 4, "epoch": 0.2664, "step": 666, "tokens_per_device": 2748 }, { "epoch": 0.2664, "loss_ce": 0.05473046377301216, "loss_lvr": 1.0156196355819702, "loss_mode_switch": 0.0, "loss_total": 0.15629242360591888, "step": 666 }, { "batch_size": 4, "epoch": 0.2664, "step": 666, "tokens_per_device": 6976 }, { "epoch": 0.2664, "loss_ce": 0.21105855703353882, "loss_lvr": 0.8017768859863281, "loss_mode_switch": 0.0, "loss_total": 0.2912362515926361, "step": 666 }, { "batch_size": 4, "epoch": 0.2664, "step": 666, "tokens_per_device": 4744 }, { "epoch": 0.2664, "loss_ce": 0.44855985045433044, "loss_lvr": 0.7852330207824707, "loss_mode_switch": 0.0, "loss_total": 0.527083158493042, "step": 666 }, { "batch_size": 4, "epoch": 0.2664, "step": 666, "tokens_per_device": 5812 }, { "epoch": 0.2664, "loss_ce": 0.5012409687042236, "loss_lvr": 0.8928501009941101, "loss_mode_switch": 0.0, "loss_total": 0.5905259847640991, "step": 666 }, { "epoch": 0.2668, "grad_norm": 1.308983564376831, "learning_rate": 8.600196908470265e-06, "loss": 0.2968, "step": 667 }, { "batch_size": 4, "epoch": 0.2668, "step": 667, "tokens_per_device": 2536 }, { "epoch": 0.2668, "loss_ce": 0.4230392575263977, "loss_lvr": 0.890762448310852, "loss_mode_switch": 0.0, "loss_total": 0.512115478515625, "step": 667 }, { "batch_size": 4, "epoch": 0.2668, "step": 667, "tokens_per_device": 4244 }, { "epoch": 0.2668, "loss_ce": 1.1726369857788086, "loss_lvr": 0.9445186853408813, "loss_mode_switch": 0.0, "loss_total": 1.2670888900756836, "step": 667 }, { "batch_size": 4, "epoch": 0.2668, "step": 667, "tokens_per_device": 9640 }, { "epoch": 0.2668, "loss_ce": 0.08832648396492004, "loss_lvr": 0.5463234782218933, "loss_mode_switch": 0.0, "loss_total": 0.1429588347673416, "step": 667 }, { "batch_size": 4, "epoch": 0.2668, "step": 667, "tokens_per_device": 4288 }, { "epoch": 0.2668, "loss_ce": 0.5994176864624023, "loss_lvr": 1.0006016492843628, "loss_mode_switch": 0.0, "loss_total": 0.6994778513908386, "step": 667 }, { "batch_size": 4, "epoch": 0.2668, "step": 667, "tokens_per_device": 4620 }, { "epoch": 0.2668, "loss_ce": 0.18438389897346497, "loss_lvr": 0.847131073474884, "loss_mode_switch": 0.0, "loss_total": 0.2690970003604889, "step": 667 }, { "batch_size": 4, "epoch": 0.2668, "step": 667, "tokens_per_device": 6012 }, { "epoch": 0.2668, "loss_ce": 0.027071231976151466, "loss_lvr": 0.7048557996749878, "loss_mode_switch": 0.0, "loss_total": 0.09755681455135345, "step": 667 }, { "batch_size": 4, "epoch": 0.2668, "step": 667, "tokens_per_device": 2644 }, { "epoch": 0.2668, "loss_ce": 0.0838712826371193, "loss_lvr": 0.7139716744422913, "loss_mode_switch": 0.0, "loss_total": 0.15526846051216125, "step": 667 }, { "batch_size": 4, "epoch": 0.2668, "step": 667, "tokens_per_device": 3780 }, { "epoch": 0.2668, "loss_ce": 0.08989040553569794, "loss_lvr": 0.9525789618492126, "loss_mode_switch": 0.0, "loss_total": 0.18514829874038696, "step": 667 }, { "epoch": 0.2672, "grad_norm": 1.1978930234909058, "learning_rate": 8.595698928886894e-06, "loss": 0.2516, "step": 668 }, { "batch_size": 1, "epoch": 0.2672, "step": 668, "tokens_per_device": 4865 }, { "epoch": 0.2672, "loss_ce": 0.0011267329100519419, "loss_lvr": 0.40839681029319763, "loss_mode_switch": 0.0, "loss_total": 0.04196641594171524, "step": 668 }, { "batch_size": 4, "epoch": 0.2672, "step": 668, "tokens_per_device": 4872 }, { "epoch": 0.2672, "loss_ce": 0.07855106890201569, "loss_lvr": 0.937633752822876, "loss_mode_switch": 0.0, "loss_total": 0.17231443524360657, "step": 668 }, { "batch_size": 1, "epoch": 0.2672, "step": 668, "tokens_per_device": 5395 }, { "epoch": 0.2672, "loss_ce": 0.11328509449958801, "loss_lvr": 0.6963188052177429, "loss_mode_switch": 0.0, "loss_total": 0.18291696906089783, "step": 668 }, { "batch_size": 1, "epoch": 0.2672, "step": 668, "tokens_per_device": 5076 }, { "epoch": 0.2672, "loss_ce": 0.1044546440243721, "loss_lvr": 0.3113132417201996, "loss_mode_switch": 0.0, "loss_total": 0.1355859637260437, "step": 668 }, { "batch_size": 4, "epoch": 0.2672, "step": 668, "tokens_per_device": 4444 }, { "epoch": 0.2672, "loss_ce": 0.07179439067840576, "loss_lvr": 0.9685243368148804, "loss_mode_switch": 0.0, "loss_total": 0.16864682734012604, "step": 668 }, { "batch_size": 1, "epoch": 0.2672, "step": 668, "tokens_per_device": 6533 }, { "epoch": 0.2672, "loss_ce": 0.001176813617348671, "loss_lvr": 0.31713953614234924, "loss_mode_switch": 0.0, "loss_total": 0.032890766859054565, "step": 668 }, { "batch_size": 1, "epoch": 0.2672, "step": 668, "tokens_per_device": 4284 }, { "epoch": 0.2672, "loss_ce": 0.7325177192687988, "loss_lvr": 0.4851934611797333, "loss_mode_switch": 0.0, "loss_total": 0.7810370922088623, "step": 668 }, { "batch_size": 4, "epoch": 0.2672, "step": 668, "tokens_per_device": 4316 }, { "epoch": 0.2672, "loss_ce": 0.32529720664024353, "loss_lvr": 1.064655065536499, "loss_mode_switch": 0.0, "loss_total": 0.4317627251148224, "step": 668 }, { "epoch": 0.2676, "grad_norm": 1.2516273260116577, "learning_rate": 8.591194914550242e-06, "loss": 0.2708, "step": 669 }, { "batch_size": 4, "epoch": 0.2676, "step": 669, "tokens_per_device": 3768 }, { "epoch": 0.2676, "loss_ce": 0.11417997628450394, "loss_lvr": 0.881165623664856, "loss_mode_switch": 0.0, "loss_total": 0.20229654014110565, "step": 669 }, { "batch_size": 4, "epoch": 0.2676, "step": 669, "tokens_per_device": 3860 }, { "epoch": 0.2676, "loss_ce": 0.5311078429222107, "loss_lvr": 1.2246427536010742, "loss_mode_switch": 0.0, "loss_total": 0.653572142124176, "step": 669 }, { "batch_size": 4, "epoch": 0.2676, "step": 669, "tokens_per_device": 4260 }, { "epoch": 0.2676, "loss_ce": 0.2698252499103546, "loss_lvr": 0.9529609084129333, "loss_mode_switch": 0.0, "loss_total": 0.36512133479118347, "step": 669 }, { "batch_size": 4, "epoch": 0.2676, "step": 669, "tokens_per_device": 4516 }, { "epoch": 0.2676, "loss_ce": 0.09094321727752686, "loss_lvr": 1.024613380432129, "loss_mode_switch": 0.0, "loss_total": 0.19340455532073975, "step": 669 }, { "batch_size": 4, "epoch": 0.2676, "step": 669, "tokens_per_device": 2656 }, { "epoch": 0.2676, "loss_ce": 0.4470919668674469, "loss_lvr": 0.9262564182281494, "loss_mode_switch": 0.0, "loss_total": 0.5397176146507263, "step": 669 }, { "batch_size": 4, "epoch": 0.2676, "step": 669, "tokens_per_device": 13492 }, { "epoch": 0.2676, "loss_ce": 0.15072883665561676, "loss_lvr": 0.5436056852340698, "loss_mode_switch": 0.0, "loss_total": 0.20508940517902374, "step": 669 }, { "batch_size": 4, "epoch": 0.2676, "step": 669, "tokens_per_device": 4240 }, { "epoch": 0.2676, "loss_ce": 0.13636255264282227, "loss_lvr": 0.9518162608146667, "loss_mode_switch": 0.0, "loss_total": 0.23154418170452118, "step": 669 }, { "batch_size": 1, "epoch": 0.2676, "step": 669, "tokens_per_device": 5102 }, { "epoch": 0.2676, "loss_ce": 0.09039468318223953, "loss_lvr": 0.2872040867805481, "loss_mode_switch": 0.0, "loss_total": 0.11911509186029434, "step": 669 }, { "epoch": 0.268, "grad_norm": 1.3272631168365479, "learning_rate": 8.586684873019513e-06, "loss": 0.2691, "step": 670 }, { "batch_size": 4, "epoch": 0.268, "step": 670, "tokens_per_device": 5388 }, { "epoch": 0.268, "loss_ce": 0.13642923533916473, "loss_lvr": 0.8566305637359619, "loss_mode_switch": 0.0, "loss_total": 0.22209230065345764, "step": 670 }, { "batch_size": 4, "epoch": 0.268, "step": 670, "tokens_per_device": 4828 }, { "epoch": 0.268, "loss_ce": 0.6489839553833008, "loss_lvr": 1.1207356452941895, "loss_mode_switch": 0.0, "loss_total": 0.7610574960708618, "step": 670 }, { "batch_size": 4, "epoch": 0.268, "step": 670, "tokens_per_device": 4224 }, { "epoch": 0.268, "loss_ce": 0.12699683010578156, "loss_lvr": 0.903666079044342, "loss_mode_switch": 0.0, "loss_total": 0.21736344695091248, "step": 670 }, { "batch_size": 4, "epoch": 0.268, "step": 670, "tokens_per_device": 4208 }, { "epoch": 0.268, "loss_ce": 0.17830806970596313, "loss_lvr": 1.3316373825073242, "loss_mode_switch": 0.0, "loss_total": 0.3114718198776245, "step": 670 }, { "batch_size": 4, "epoch": 0.268, "step": 670, "tokens_per_device": 5256 }, { "epoch": 0.268, "loss_ce": 0.3275800943374634, "loss_lvr": 0.703192412853241, "loss_mode_switch": 0.0, "loss_total": 0.397899329662323, "step": 670 }, { "batch_size": 1, "epoch": 0.268, "step": 670, "tokens_per_device": 4989 }, { "epoch": 0.268, "loss_ce": 0.0035637142136693, "loss_lvr": 0.7012898921966553, "loss_mode_switch": 0.0, "loss_total": 0.07369270920753479, "step": 670 }, { "batch_size": 4, "epoch": 0.268, "step": 670, "tokens_per_device": 1932 }, { "epoch": 0.268, "loss_ce": 0.19936400651931763, "loss_lvr": 1.0028542280197144, "loss_mode_switch": 0.0, "loss_total": 0.2996494174003601, "step": 670 }, { "batch_size": 4, "epoch": 0.268, "step": 670, "tokens_per_device": 5012 }, { "epoch": 0.268, "loss_ce": 0.29195791482925415, "loss_lvr": 0.9657706618309021, "loss_mode_switch": 0.0, "loss_total": 0.3885349929332733, "step": 670 }, { "epoch": 0.2684, "grad_norm": 1.3448395729064941, "learning_rate": 8.582168811864022e-06, "loss": 0.3296, "step": 671 }, { "batch_size": 4, "epoch": 0.2684, "step": 671, "tokens_per_device": 3900 }, { "epoch": 0.2684, "loss_ce": 0.12085340917110443, "loss_lvr": 0.6803956031799316, "loss_mode_switch": 0.0, "loss_total": 0.18889296054840088, "step": 671 }, { "batch_size": 4, "epoch": 0.2684, "step": 671, "tokens_per_device": 1596 }, { "epoch": 0.2684, "loss_ce": 0.3557018041610718, "loss_lvr": 0.9678927063941956, "loss_mode_switch": 0.0, "loss_total": 0.45249107480049133, "step": 671 }, { "batch_size": 1, "epoch": 0.2684, "step": 671, "tokens_per_device": 5116 }, { "epoch": 0.2684, "loss_ce": 0.32886970043182373, "loss_lvr": 0.24987702071666718, "loss_mode_switch": 0.0, "loss_total": 0.3538573980331421, "step": 671 }, { "batch_size": 1, "epoch": 0.2684, "step": 671, "tokens_per_device": 4909 }, { "epoch": 0.2684, "loss_ce": 0.04523821175098419, "loss_lvr": 0.4061368405818939, "loss_mode_switch": 0.0, "loss_total": 0.08585189282894135, "step": 671 }, { "batch_size": 4, "epoch": 0.2684, "step": 671, "tokens_per_device": 5632 }, { "epoch": 0.2684, "loss_ce": 0.2676166296005249, "loss_lvr": 1.1069234609603882, "loss_mode_switch": 0.0, "loss_total": 0.3783089816570282, "step": 671 }, { "batch_size": 4, "epoch": 0.2684, "step": 671, "tokens_per_device": 4828 }, { "epoch": 0.2684, "loss_ce": 0.48037925362586975, "loss_lvr": 0.8583370447158813, "loss_mode_switch": 0.0, "loss_total": 0.5662129521369934, "step": 671 }, { "batch_size": 4, "epoch": 0.2684, "step": 671, "tokens_per_device": 9828 }, { "epoch": 0.2684, "loss_ce": 0.2747173607349396, "loss_lvr": 0.4996696710586548, "loss_mode_switch": 0.0, "loss_total": 0.3246843218803406, "step": 671 }, { "batch_size": 4, "epoch": 0.2684, "step": 671, "tokens_per_device": 2000 }, { "epoch": 0.2684, "loss_ce": 0.5504310131072998, "loss_lvr": 0.8841922879219055, "loss_mode_switch": 0.0, "loss_total": 0.638850212097168, "step": 671 }, { "epoch": 0.2688, "grad_norm": 1.3156663179397583, "learning_rate": 8.577646738663193e-06, "loss": 0.3389, "step": 672 }, { "batch_size": 1, "epoch": 0.2688, "step": 672, "tokens_per_device": 4924 }, { "epoch": 0.2688, "loss_ce": 0.21128109097480774, "loss_lvr": 0.6118307113647461, "loss_mode_switch": 0.0, "loss_total": 0.27246415615081787, "step": 672 }, { "batch_size": 4, "epoch": 0.2688, "step": 672, "tokens_per_device": 1540 }, { "epoch": 0.2688, "loss_ce": 0.16435600817203522, "loss_lvr": 1.0162501335144043, "loss_mode_switch": 0.0, "loss_total": 0.2659810185432434, "step": 672 }, { "batch_size": 4, "epoch": 0.2688, "step": 672, "tokens_per_device": 3784 }, { "epoch": 0.2688, "loss_ce": 0.10130409896373749, "loss_lvr": 0.8930081725120544, "loss_mode_switch": 0.0, "loss_total": 0.19060492515563965, "step": 672 }, { "batch_size": 4, "epoch": 0.2688, "step": 672, "tokens_per_device": 6280 }, { "epoch": 0.2688, "loss_ce": 0.29827767610549927, "loss_lvr": 1.009190559387207, "loss_mode_switch": 0.0, "loss_total": 0.3991967439651489, "step": 672 }, { "batch_size": 4, "epoch": 0.2688, "step": 672, "tokens_per_device": 10704 }, { "epoch": 0.2688, "loss_ce": 0.09721431136131287, "loss_lvr": 1.1748894453048706, "loss_mode_switch": 0.0, "loss_total": 0.2147032618522644, "step": 672 }, { "batch_size": 4, "epoch": 0.2688, "step": 672, "tokens_per_device": 3404 }, { "epoch": 0.2688, "loss_ce": 0.31692513823509216, "loss_lvr": 1.05794358253479, "loss_mode_switch": 0.0, "loss_total": 0.4227195084095001, "step": 672 }, { "batch_size": 4, "epoch": 0.2688, "step": 672, "tokens_per_device": 3776 }, { "epoch": 0.2688, "loss_ce": 0.3918178379535675, "loss_lvr": 0.9419456124305725, "loss_mode_switch": 0.0, "loss_total": 0.48601239919662476, "step": 672 }, { "batch_size": 4, "epoch": 0.2688, "step": 672, "tokens_per_device": 13648 }, { "epoch": 0.2688, "loss_ce": 0.5146532654762268, "loss_lvr": 1.2961140871047974, "loss_mode_switch": 0.0, "loss_total": 0.6442646980285645, "step": 672 }, { "epoch": 0.2692, "grad_norm": 1.444498896598816, "learning_rate": 8.573118661006535e-06, "loss": 0.297, "step": 673 }, { "batch_size": 4, "epoch": 0.2692, "step": 673, "tokens_per_device": 4100 }, { "epoch": 0.2692, "loss_ce": 0.8888260722160339, "loss_lvr": 2.197854995727539, "loss_mode_switch": 0.0, "loss_total": 1.1086115837097168, "step": 673 }, { "batch_size": 4, "epoch": 0.2692, "step": 673, "tokens_per_device": 4432 }, { "epoch": 0.2692, "loss_ce": 0.24646414816379547, "loss_lvr": 0.8252657651901245, "loss_mode_switch": 0.0, "loss_total": 0.32899072766304016, "step": 673 }, { "batch_size": 1, "epoch": 0.2692, "step": 673, "tokens_per_device": 4866 }, { "epoch": 0.2692, "loss_ce": 0.03442535921931267, "loss_lvr": 0.684384286403656, "loss_mode_switch": 0.0, "loss_total": 0.10286378860473633, "step": 673 }, { "batch_size": 4, "epoch": 0.2692, "step": 673, "tokens_per_device": 1904 }, { "epoch": 0.2692, "loss_ce": 0.3236459791660309, "loss_lvr": 1.0664420127868652, "loss_mode_switch": 0.0, "loss_total": 0.43029019236564636, "step": 673 }, { "batch_size": 4, "epoch": 0.2692, "step": 673, "tokens_per_device": 6888 }, { "epoch": 0.2692, "loss_ce": 0.29913437366485596, "loss_lvr": 0.8626341223716736, "loss_mode_switch": 0.0, "loss_total": 0.3853977918624878, "step": 673 }, { "batch_size": 1, "epoch": 0.2692, "step": 673, "tokens_per_device": 5147 }, { "epoch": 0.2692, "loss_ce": 0.020297130569815636, "loss_lvr": 0.5920602679252625, "loss_mode_switch": 0.0, "loss_total": 0.07950315624475479, "step": 673 }, { "batch_size": 4, "epoch": 0.2692, "step": 673, "tokens_per_device": 1452 }, { "epoch": 0.2692, "loss_ce": 0.04876627027988434, "loss_lvr": 1.0249886512756348, "loss_mode_switch": 0.0, "loss_total": 0.15126514434814453, "step": 673 }, { "batch_size": 4, "epoch": 0.2692, "step": 673, "tokens_per_device": 3856 }, { "epoch": 0.2692, "loss_ce": 0.31155890226364136, "loss_lvr": 0.9155598878860474, "loss_mode_switch": 0.0, "loss_total": 0.4031148850917816, "step": 673 }, { "epoch": 0.2696, "grad_norm": 2.0966947078704834, "learning_rate": 8.568584586493635e-06, "loss": 0.3225, "step": 674 }, { "batch_size": 4, "epoch": 0.2696, "step": 674, "tokens_per_device": 4312 }, { "epoch": 0.2696, "loss_ce": 0.12131240963935852, "loss_lvr": 0.8279209733009338, "loss_mode_switch": 0.0, "loss_total": 0.20410451292991638, "step": 674 }, { "batch_size": 4, "epoch": 0.2696, "step": 674, "tokens_per_device": 4176 }, { "epoch": 0.2696, "loss_ce": 0.2298990786075592, "loss_lvr": 0.9829831123352051, "loss_mode_switch": 0.0, "loss_total": 0.3281973898410797, "step": 674 }, { "batch_size": 4, "epoch": 0.2696, "step": 674, "tokens_per_device": 4916 }, { "epoch": 0.2696, "loss_ce": 0.33547794818878174, "loss_lvr": 0.8698993921279907, "loss_mode_switch": 0.0, "loss_total": 0.4224678874015808, "step": 674 }, { "batch_size": 4, "epoch": 0.2696, "step": 674, "tokens_per_device": 2040 }, { "epoch": 0.2696, "loss_ce": 0.9299414753913879, "loss_lvr": 0.8820973634719849, "loss_mode_switch": 0.0, "loss_total": 1.0181511640548706, "step": 674 }, { "batch_size": 1, "epoch": 0.2696, "step": 674, "tokens_per_device": 4156 }, { "epoch": 0.2696, "loss_ce": 0.28940948843955994, "loss_lvr": 0.6418771147727966, "loss_mode_switch": 0.0, "loss_total": 0.3535971939563751, "step": 674 }, { "batch_size": 1, "epoch": 0.2696, "step": 674, "tokens_per_device": 4745 }, { "epoch": 0.2696, "loss_ce": 0.5296595096588135, "loss_lvr": 0.5642126202583313, "loss_mode_switch": 0.0, "loss_total": 0.58608078956604, "step": 674 }, { "batch_size": 4, "epoch": 0.2696, "step": 674, "tokens_per_device": 6296 }, { "epoch": 0.2696, "loss_ce": 0.3774981200695038, "loss_lvr": 0.6663210988044739, "loss_mode_switch": 0.0, "loss_total": 0.4441302418708801, "step": 674 }, { "batch_size": 4, "epoch": 0.2696, "step": 674, "tokens_per_device": 1476 }, { "epoch": 0.2696, "loss_ce": 0.16991034150123596, "loss_lvr": 2.1921331882476807, "loss_mode_switch": 0.0, "loss_total": 0.38912367820739746, "step": 674 }, { "epoch": 0.27, "grad_norm": 1.3881213665008545, "learning_rate": 8.564044522734147e-06, "loss": 0.3554, "step": 675 }, { "batch_size": 4, "epoch": 0.27, "step": 675, "tokens_per_device": 4236 }, { "epoch": 0.27, "loss_ce": 0.4861123561859131, "loss_lvr": 0.831781268119812, "loss_mode_switch": 0.0, "loss_total": 0.5692904591560364, "step": 675 }, { "batch_size": 4, "epoch": 0.27, "step": 675, "tokens_per_device": 5688 }, { "epoch": 0.27, "loss_ce": 0.7852851152420044, "loss_lvr": 1.1138877868652344, "loss_mode_switch": 0.0, "loss_total": 0.8966739177703857, "step": 675 }, { "batch_size": 1, "epoch": 0.27, "step": 675, "tokens_per_device": 6698 }, { "epoch": 0.27, "loss_ce": 0.15644259750843048, "loss_lvr": 0.3227766454219818, "loss_mode_switch": 0.0, "loss_total": 0.18872025609016418, "step": 675 }, { "batch_size": 4, "epoch": 0.27, "step": 675, "tokens_per_device": 2680 }, { "epoch": 0.27, "loss_ce": 0.5178873538970947, "loss_lvr": 0.7882623672485352, "loss_mode_switch": 0.0, "loss_total": 0.5967136025428772, "step": 675 }, { "batch_size": 4, "epoch": 0.27, "step": 675, "tokens_per_device": 4524 }, { "epoch": 0.27, "loss_ce": 0.3050604462623596, "loss_lvr": 1.0199638605117798, "loss_mode_switch": 0.0, "loss_total": 0.4070568382740021, "step": 675 }, { "batch_size": 4, "epoch": 0.27, "step": 675, "tokens_per_device": 12428 }, { "epoch": 0.27, "loss_ce": 0.30952203273773193, "loss_lvr": 1.4887629747390747, "loss_mode_switch": 0.0, "loss_total": 0.45839834213256836, "step": 675 }, { "batch_size": 4, "epoch": 0.27, "step": 675, "tokens_per_device": 11384 }, { "epoch": 0.27, "loss_ce": 0.3428684175014496, "loss_lvr": 0.8603721261024475, "loss_mode_switch": 0.0, "loss_total": 0.4289056360721588, "step": 675 }, { "batch_size": 4, "epoch": 0.27, "step": 675, "tokens_per_device": 1388 }, { "epoch": 0.27, "loss_ce": 0.45717042684555054, "loss_lvr": 1.002997875213623, "loss_mode_switch": 0.0, "loss_total": 0.5574702024459839, "step": 675 }, { "epoch": 0.2704, "grad_norm": 1.69199538230896, "learning_rate": 8.559498477347777e-06, "loss": 0.3215, "step": 676 }, { "batch_size": 4, "epoch": 0.2704, "step": 676, "tokens_per_device": 5244 }, { "epoch": 0.2704, "loss_ce": 0.0028319654520601034, "loss_lvr": 0.7942527532577515, "loss_mode_switch": 0.0, "loss_total": 0.0822572410106659, "step": 676 }, { "batch_size": 1, "epoch": 0.2704, "step": 676, "tokens_per_device": 5790 }, { "epoch": 0.2704, "loss_ce": 0.0010624240385368466, "loss_lvr": 0.37571489810943604, "loss_mode_switch": 0.0, "loss_total": 0.038633912801742554, "step": 676 }, { "batch_size": 4, "epoch": 0.2704, "step": 676, "tokens_per_device": 1456 }, { "epoch": 0.2704, "loss_ce": 0.1634182333946228, "loss_lvr": 1.9414048194885254, "loss_mode_switch": 0.0, "loss_total": 0.3575587272644043, "step": 676 }, { "batch_size": 1, "epoch": 0.2704, "step": 676, "tokens_per_device": 4878 }, { "epoch": 0.2704, "loss_ce": 0.0948803648352623, "loss_lvr": 0.8005826473236084, "loss_mode_switch": 0.0, "loss_total": 0.1749386340379715, "step": 676 }, { "batch_size": 1, "epoch": 0.2704, "step": 676, "tokens_per_device": 5119 }, { "epoch": 0.2704, "loss_ce": 0.046974360942840576, "loss_lvr": 0.5276939868927002, "loss_mode_switch": 0.0, "loss_total": 0.09974376112222672, "step": 676 }, { "batch_size": 1, "epoch": 0.2704, "step": 676, "tokens_per_device": 4644 }, { "epoch": 0.2704, "loss_ce": 0.012066700495779514, "loss_lvr": 0.26179948449134827, "loss_mode_switch": 0.0, "loss_total": 0.038246650248765945, "step": 676 }, { "batch_size": 4, "epoch": 0.2704, "step": 676, "tokens_per_device": 5148 }, { "epoch": 0.2704, "loss_ce": 0.1424066424369812, "loss_lvr": 0.9539399743080139, "loss_mode_switch": 0.0, "loss_total": 0.23780064284801483, "step": 676 }, { "batch_size": 4, "epoch": 0.2704, "step": 676, "tokens_per_device": 3784 }, { "epoch": 0.2704, "loss_ce": 0.2245863676071167, "loss_lvr": 0.6884468197822571, "loss_mode_switch": 0.0, "loss_total": 0.29343104362487793, "step": 676 }, { "epoch": 0.2708, "grad_norm": 1.234402060508728, "learning_rate": 8.554946457964268e-06, "loss": 0.2588, "step": 677 }, { "batch_size": 1, "epoch": 0.2708, "step": 677, "tokens_per_device": 5190 }, { "epoch": 0.2708, "loss_ce": 0.002318883314728737, "loss_lvr": 0.27936381101608276, "loss_mode_switch": 0.0, "loss_total": 0.030255265533924103, "step": 677 }, { "batch_size": 4, "epoch": 0.2708, "step": 677, "tokens_per_device": 8644 }, { "epoch": 0.2708, "loss_ce": 0.7356569170951843, "loss_lvr": 1.0646065473556519, "loss_mode_switch": 0.0, "loss_total": 0.8421175479888916, "step": 677 }, { "batch_size": 4, "epoch": 0.2708, "step": 677, "tokens_per_device": 1400 }, { "epoch": 0.2708, "loss_ce": 0.509635329246521, "loss_lvr": 1.4998581409454346, "loss_mode_switch": 0.0, "loss_total": 0.6596211194992065, "step": 677 }, { "batch_size": 1, "epoch": 0.2708, "step": 677, "tokens_per_device": 5093 }, { "epoch": 0.2708, "loss_ce": 0.004159391857683659, "loss_lvr": 0.3240884840488434, "loss_mode_switch": 0.0, "loss_total": 0.03656823933124542, "step": 677 }, { "batch_size": 4, "epoch": 0.2708, "step": 677, "tokens_per_device": 4776 }, { "epoch": 0.2708, "loss_ce": 0.7061971426010132, "loss_lvr": 0.9290477633476257, "loss_mode_switch": 0.0, "loss_total": 0.7991019487380981, "step": 677 }, { "batch_size": 1, "epoch": 0.2708, "step": 677, "tokens_per_device": 4911 }, { "epoch": 0.2708, "loss_ce": 0.19415999948978424, "loss_lvr": 0.2676877975463867, "loss_mode_switch": 0.0, "loss_total": 0.22092877328395844, "step": 677 }, { "batch_size": 1, "epoch": 0.2708, "step": 677, "tokens_per_device": 4747 }, { "epoch": 0.2708, "loss_ce": 0.03080674074590206, "loss_lvr": 0.4428291916847229, "loss_mode_switch": 0.0, "loss_total": 0.07508966326713562, "step": 677 }, { "batch_size": 1, "epoch": 0.2708, "step": 677, "tokens_per_device": 5088 }, { "epoch": 0.2708, "loss_ce": 1.2462891340255737, "loss_lvr": 0.5308325886726379, "loss_mode_switch": 0.0, "loss_total": 1.2993724346160889, "step": 677 }, { "epoch": 0.2712, "grad_norm": 1.5363879203796387, "learning_rate": 8.550388472223391e-06, "loss": 0.3615, "step": 678 }, { "batch_size": 4, "epoch": 0.2712, "step": 678, "tokens_per_device": 4668 }, { "epoch": 0.2712, "loss_ce": 0.6779007315635681, "loss_lvr": 0.9090110063552856, "loss_mode_switch": 0.0, "loss_total": 0.7688018083572388, "step": 678 }, { "batch_size": 1, "epoch": 0.2712, "step": 678, "tokens_per_device": 4943 }, { "epoch": 0.2712, "loss_ce": 0.11790189892053604, "loss_lvr": 0.1995691955089569, "loss_mode_switch": 0.0, "loss_total": 0.1378588229417801, "step": 678 }, { "batch_size": 4, "epoch": 0.2712, "step": 678, "tokens_per_device": 4520 }, { "epoch": 0.2712, "loss_ce": 0.6268073320388794, "loss_lvr": 0.8203334808349609, "loss_mode_switch": 0.0, "loss_total": 0.7088406682014465, "step": 678 }, { "batch_size": 4, "epoch": 0.2712, "step": 678, "tokens_per_device": 1640 }, { "epoch": 0.2712, "loss_ce": 0.6221320629119873, "loss_lvr": 1.0219935178756714, "loss_mode_switch": 0.0, "loss_total": 0.7243314385414124, "step": 678 }, { "batch_size": 1, "epoch": 0.2712, "step": 678, "tokens_per_device": 5107 }, { "epoch": 0.2712, "loss_ce": 0.005244055297225714, "loss_lvr": 0.6840936541557312, "loss_mode_switch": 0.0, "loss_total": 0.07365342229604721, "step": 678 }, { "batch_size": 4, "epoch": 0.2712, "step": 678, "tokens_per_device": 11060 }, { "epoch": 0.2712, "loss_ce": 0.23074482381343842, "loss_lvr": 1.0089466571807861, "loss_mode_switch": 0.0, "loss_total": 0.33163949847221375, "step": 678 }, { "batch_size": 4, "epoch": 0.2712, "step": 678, "tokens_per_device": 6400 }, { "epoch": 0.2712, "loss_ce": 0.05665358901023865, "loss_lvr": 0.7930884957313538, "loss_mode_switch": 0.0, "loss_total": 0.13596244156360626, "step": 678 }, { "batch_size": 4, "epoch": 0.2712, "step": 678, "tokens_per_device": 5416 }, { "epoch": 0.2712, "loss_ce": 0.26793932914733887, "loss_lvr": 0.7403599619865417, "loss_mode_switch": 0.0, "loss_total": 0.3419753313064575, "step": 678 }, { "epoch": 0.2716, "grad_norm": 1.6685054302215576, "learning_rate": 8.54582452777493e-06, "loss": 0.3962, "step": 679 }, { "batch_size": 4, "epoch": 0.2716, "step": 679, "tokens_per_device": 3756 }, { "epoch": 0.2716, "loss_ce": 0.14510081708431244, "loss_lvr": 0.8872198462486267, "loss_mode_switch": 0.0, "loss_total": 0.2338227927684784, "step": 679 }, { "batch_size": 4, "epoch": 0.2716, "step": 679, "tokens_per_device": 11480 }, { "epoch": 0.2716, "loss_ce": 0.37904995679855347, "loss_lvr": 0.8103550672531128, "loss_mode_switch": 0.0, "loss_total": 0.4600854516029358, "step": 679 }, { "batch_size": 4, "epoch": 0.2716, "step": 679, "tokens_per_device": 3800 }, { "epoch": 0.2716, "loss_ce": 0.11504022032022476, "loss_lvr": 0.9844602346420288, "loss_mode_switch": 0.0, "loss_total": 0.21348625421524048, "step": 679 }, { "batch_size": 4, "epoch": 0.2716, "step": 679, "tokens_per_device": 4392 }, { "epoch": 0.2716, "loss_ce": 0.1276596635580063, "loss_lvr": 0.8783298134803772, "loss_mode_switch": 0.0, "loss_total": 0.2154926359653473, "step": 679 }, { "batch_size": 4, "epoch": 0.2716, "step": 679, "tokens_per_device": 9224 }, { "epoch": 0.2716, "loss_ce": 0.30387425422668457, "loss_lvr": 0.7174531817436218, "loss_mode_switch": 0.0, "loss_total": 0.3756195902824402, "step": 679 }, { "batch_size": 4, "epoch": 0.2716, "step": 679, "tokens_per_device": 3804 }, { "epoch": 0.2716, "loss_ce": 0.6825653314590454, "loss_lvr": 0.7494696378707886, "loss_mode_switch": 0.0, "loss_total": 0.7575122714042664, "step": 679 }, { "batch_size": 4, "epoch": 0.2716, "step": 679, "tokens_per_device": 6124 }, { "epoch": 0.2716, "loss_ce": 0.5006155967712402, "loss_lvr": 0.8420716524124146, "loss_mode_switch": 0.0, "loss_total": 0.5848227739334106, "step": 679 }, { "batch_size": 4, "epoch": 0.2716, "step": 679, "tokens_per_device": 4232 }, { "epoch": 0.2716, "loss_ce": 0.18657410144805908, "loss_lvr": 1.1840656995773315, "loss_mode_switch": 0.0, "loss_total": 0.30498066544532776, "step": 679 }, { "epoch": 0.272, "grad_norm": 1.4025737047195435, "learning_rate": 8.541254632278667e-06, "loss": 0.3544, "step": 680 }, { "batch_size": 4, "epoch": 0.272, "step": 680, "tokens_per_device": 1316 }, { "epoch": 0.272, "loss_ce": 0.3402683734893799, "loss_lvr": 1.0561100244522095, "loss_mode_switch": 0.0, "loss_total": 0.44587936997413635, "step": 680 }, { "batch_size": 4, "epoch": 0.272, "step": 680, "tokens_per_device": 2060 }, { "epoch": 0.272, "loss_ce": 0.3013201355934143, "loss_lvr": 1.0026392936706543, "loss_mode_switch": 0.0, "loss_total": 0.40158405900001526, "step": 680 }, { "batch_size": 1, "epoch": 0.272, "step": 680, "tokens_per_device": 4814 }, { "epoch": 0.272, "loss_ce": 0.0003943812334910035, "loss_lvr": 0.281005859375, "loss_mode_switch": 0.0, "loss_total": 0.02849496714770794, "step": 680 }, { "batch_size": 1, "epoch": 0.272, "step": 680, "tokens_per_device": 5573 }, { "epoch": 0.272, "loss_ce": 0.0013097504852339625, "loss_lvr": 0.4352967143058777, "loss_mode_switch": 0.0, "loss_total": 0.044839419424533844, "step": 680 }, { "batch_size": 1, "epoch": 0.272, "step": 680, "tokens_per_device": 4119 }, { "epoch": 0.272, "loss_ce": 0.13549742102622986, "loss_lvr": 0.5228860974311829, "loss_mode_switch": 0.0, "loss_total": 0.1877860277891159, "step": 680 }, { "batch_size": 4, "epoch": 0.272, "step": 680, "tokens_per_device": 1380 }, { "epoch": 0.272, "loss_ce": 0.8383550047874451, "loss_lvr": 1.0663387775421143, "loss_mode_switch": 0.0, "loss_total": 0.9449889063835144, "step": 680 }, { "batch_size": 4, "epoch": 0.272, "step": 680, "tokens_per_device": 4300 }, { "epoch": 0.272, "loss_ce": 0.12976007163524628, "loss_lvr": 0.8920919895172119, "loss_mode_switch": 0.0, "loss_total": 0.21896927058696747, "step": 680 }, { "batch_size": 1, "epoch": 0.272, "step": 680, "tokens_per_device": 4935 }, { "epoch": 0.272, "loss_ce": 0.0032558569218963385, "loss_lvr": 0.5290246605873108, "loss_mode_switch": 0.0, "loss_total": 0.05615832284092903, "step": 680 }, { "epoch": 0.2724, "grad_norm": 1.296458125114441, "learning_rate": 8.536678793404376e-06, "loss": 0.3094, "step": 681 }, { "batch_size": 1, "epoch": 0.2724, "step": 681, "tokens_per_device": 5120 }, { "epoch": 0.2724, "loss_ce": 0.002348338719457388, "loss_lvr": 0.745143473148346, "loss_mode_switch": 0.0, "loss_total": 0.07686269283294678, "step": 681 }, { "batch_size": 4, "epoch": 0.2724, "step": 681, "tokens_per_device": 1304 }, { "epoch": 0.2724, "loss_ce": 0.3339751958847046, "loss_lvr": 1.0793817043304443, "loss_mode_switch": 0.0, "loss_total": 0.441913366317749, "step": 681 }, { "batch_size": 4, "epoch": 0.2724, "step": 681, "tokens_per_device": 5108 }, { "epoch": 0.2724, "loss_ce": 0.42470604181289673, "loss_lvr": 1.1758630275726318, "loss_mode_switch": 0.0, "loss_total": 0.5422923564910889, "step": 681 }, { "batch_size": 1, "epoch": 0.2724, "step": 681, "tokens_per_device": 4863 }, { "epoch": 0.2724, "loss_ce": 0.045194584876298904, "loss_lvr": 0.7505512237548828, "loss_mode_switch": 0.0, "loss_total": 0.12024970352649689, "step": 681 }, { "batch_size": 4, "epoch": 0.2724, "step": 681, "tokens_per_device": 1492 }, { "epoch": 0.2724, "loss_ce": 0.41270360350608826, "loss_lvr": 2.803650140762329, "loss_mode_switch": 0.0, "loss_total": 0.6930686235427856, "step": 681 }, { "batch_size": 4, "epoch": 0.2724, "step": 681, "tokens_per_device": 11072 }, { "epoch": 0.2724, "loss_ce": 0.005237925797700882, "loss_lvr": 0.4254651367664337, "loss_mode_switch": 0.0, "loss_total": 0.04778444021940231, "step": 681 }, { "batch_size": 4, "epoch": 0.2724, "step": 681, "tokens_per_device": 4280 }, { "epoch": 0.2724, "loss_ce": 0.374423086643219, "loss_lvr": 1.0785839557647705, "loss_mode_switch": 0.0, "loss_total": 0.48228147625923157, "step": 681 }, { "batch_size": 1, "epoch": 0.2724, "step": 681, "tokens_per_device": 5076 }, { "epoch": 0.2724, "loss_ce": 0.008563956245779991, "loss_lvr": 0.7208784818649292, "loss_mode_switch": 0.0, "loss_total": 0.08065180480480194, "step": 681 }, { "epoch": 0.2728, "grad_norm": 1.5052516460418701, "learning_rate": 8.532097018831805e-06, "loss": 0.3508, "step": 682 }, { "batch_size": 1, "epoch": 0.2728, "step": 682, "tokens_per_device": 5222 }, { "epoch": 0.2728, "loss_ce": 0.020655889064073563, "loss_lvr": 0.2690862715244293, "loss_mode_switch": 0.0, "loss_total": 0.047564513981342316, "step": 682 }, { "batch_size": 4, "epoch": 0.2728, "step": 682, "tokens_per_device": 3820 }, { "epoch": 0.2728, "loss_ce": 0.2525704503059387, "loss_lvr": 1.052697777748108, "loss_mode_switch": 0.0, "loss_total": 0.35784024000167847, "step": 682 }, { "batch_size": 1, "epoch": 0.2728, "step": 682, "tokens_per_device": 5196 }, { "epoch": 0.2728, "loss_ce": 0.008697391487658024, "loss_lvr": 0.7130717039108276, "loss_mode_switch": 0.0, "loss_total": 0.08000456541776657, "step": 682 }, { "batch_size": 4, "epoch": 0.2728, "step": 682, "tokens_per_device": 4008 }, { "epoch": 0.2728, "loss_ce": 0.2424774169921875, "loss_lvr": 0.7762266397476196, "loss_mode_switch": 0.0, "loss_total": 0.3201000690460205, "step": 682 }, { "batch_size": 1, "epoch": 0.2728, "step": 682, "tokens_per_device": 4924 }, { "epoch": 0.2728, "loss_ce": 0.161307230591774, "loss_lvr": 0.39660167694091797, "loss_mode_switch": 0.0, "loss_total": 0.20096740126609802, "step": 682 }, { "batch_size": 1, "epoch": 0.2728, "step": 682, "tokens_per_device": 5132 }, { "epoch": 0.2728, "loss_ce": 0.0074109602719545364, "loss_lvr": 0.8129197359085083, "loss_mode_switch": 0.0, "loss_total": 0.08870293200016022, "step": 682 }, { "batch_size": 4, "epoch": 0.2728, "step": 682, "tokens_per_device": 5172 }, { "epoch": 0.2728, "loss_ce": 0.19661320745944977, "loss_lvr": 1.0227364301681519, "loss_mode_switch": 0.0, "loss_total": 0.29888683557510376, "step": 682 }, { "batch_size": 4, "epoch": 0.2728, "step": 682, "tokens_per_device": 3800 }, { "epoch": 0.2728, "loss_ce": 0.1963714063167572, "loss_lvr": 1.2319868803024292, "loss_mode_switch": 0.0, "loss_total": 0.3195700943470001, "step": 682 }, { "epoch": 0.2732, "grad_norm": 1.3047540187835693, "learning_rate": 8.527509316250663e-06, "loss": 0.2923, "step": 683 }, { "batch_size": 4, "epoch": 0.2732, "step": 683, "tokens_per_device": 5632 }, { "epoch": 0.2732, "loss_ce": 0.21528971195220947, "loss_lvr": 0.6709771156311035, "loss_mode_switch": 0.0, "loss_total": 0.2823874354362488, "step": 683 }, { "batch_size": 1, "epoch": 0.2732, "step": 683, "tokens_per_device": 4833 }, { "epoch": 0.2732, "loss_ce": 0.011958250775933266, "loss_lvr": 1.2025574445724487, "loss_mode_switch": 0.0, "loss_total": 0.1322139948606491, "step": 683 }, { "batch_size": 4, "epoch": 0.2732, "step": 683, "tokens_per_device": 1440 }, { "epoch": 0.2732, "loss_ce": 0.03903943672776222, "loss_lvr": 0.8662797808647156, "loss_mode_switch": 0.0, "loss_total": 0.12566742300987244, "step": 683 }, { "batch_size": 4, "epoch": 0.2732, "step": 683, "tokens_per_device": 3944 }, { "epoch": 0.2732, "loss_ce": 0.47438299655914307, "loss_lvr": 0.9551290273666382, "loss_mode_switch": 0.0, "loss_total": 0.5698959231376648, "step": 683 }, { "batch_size": 4, "epoch": 0.2732, "step": 683, "tokens_per_device": 4672 }, { "epoch": 0.2732, "loss_ce": 0.116355761885643, "loss_lvr": 0.815965473651886, "loss_mode_switch": 0.0, "loss_total": 0.1979523003101349, "step": 683 }, { "batch_size": 1, "epoch": 0.2732, "step": 683, "tokens_per_device": 5015 }, { "epoch": 0.2732, "loss_ce": 0.014411075972020626, "loss_lvr": 0.33587968349456787, "loss_mode_switch": 0.0, "loss_total": 0.04799904674291611, "step": 683 }, { "batch_size": 1, "epoch": 0.2732, "step": 683, "tokens_per_device": 4955 }, { "epoch": 0.2732, "loss_ce": 0.09351914376020432, "loss_lvr": 0.17566370964050293, "loss_mode_switch": 0.0, "loss_total": 0.11108551919460297, "step": 683 }, { "batch_size": 1, "epoch": 0.2732, "step": 683, "tokens_per_device": 5084 }, { "epoch": 0.2732, "loss_ce": 0.0444597490131855, "loss_lvr": 0.7447046041488647, "loss_mode_switch": 0.0, "loss_total": 0.11893020570278168, "step": 683 }, { "epoch": 0.2736, "grad_norm": 1.4137362241744995, "learning_rate": 8.522915693360607e-06, "loss": 0.3147, "step": 684 }, { "batch_size": 4, "epoch": 0.2736, "step": 684, "tokens_per_device": 3824 }, { "epoch": 0.2736, "loss_ce": 0.8458240032196045, "loss_lvr": 0.9808081984519958, "loss_mode_switch": 0.0, "loss_total": 0.9439048171043396, "step": 684 }, { "batch_size": 4, "epoch": 0.2736, "step": 684, "tokens_per_device": 4404 }, { "epoch": 0.2736, "loss_ce": 0.462374746799469, "loss_lvr": 0.7967216372489929, "loss_mode_switch": 0.0, "loss_total": 0.5420469045639038, "step": 684 }, { "batch_size": 1, "epoch": 0.2736, "step": 684, "tokens_per_device": 5062 }, { "epoch": 0.2736, "loss_ce": 0.019254550337791443, "loss_lvr": 0.5464732050895691, "loss_mode_switch": 0.0, "loss_total": 0.07390187680721283, "step": 684 }, { "batch_size": 1, "epoch": 0.2736, "step": 684, "tokens_per_device": 5249 }, { "epoch": 0.2736, "loss_ce": 0.023766744881868362, "loss_lvr": 0.4487704336643219, "loss_mode_switch": 0.0, "loss_total": 0.06864379346370697, "step": 684 }, { "batch_size": 1, "epoch": 0.2736, "step": 684, "tokens_per_device": 4173 }, { "epoch": 0.2736, "loss_ce": 0.3366711139678955, "loss_lvr": 0.7269874811172485, "loss_mode_switch": 0.0, "loss_total": 0.4093698561191559, "step": 684 }, { "batch_size": 4, "epoch": 0.2736, "step": 684, "tokens_per_device": 5504 }, { "epoch": 0.2736, "loss_ce": 0.03537248447537422, "loss_lvr": 0.9489871263504028, "loss_mode_switch": 0.0, "loss_total": 0.13027119636535645, "step": 684 }, { "batch_size": 4, "epoch": 0.2736, "step": 684, "tokens_per_device": 1264 }, { "epoch": 0.2736, "loss_ce": 0.2529255151748657, "loss_lvr": 1.7151015996932983, "loss_mode_switch": 0.0, "loss_total": 0.42443567514419556, "step": 684 }, { "batch_size": 1, "epoch": 0.2736, "step": 684, "tokens_per_device": 4888 }, { "epoch": 0.2736, "loss_ce": 0.032189104706048965, "loss_lvr": 0.419827938079834, "loss_mode_switch": 0.0, "loss_total": 0.07417190074920654, "step": 684 }, { "epoch": 0.274, "grad_norm": 1.3155940771102905, "learning_rate": 8.518316157871232e-06, "loss": 0.3377, "step": 685 }, { "batch_size": 1, "epoch": 0.274, "step": 685, "tokens_per_device": 6488 }, { "epoch": 0.274, "loss_ce": 0.10711988806724548, "loss_lvr": 0.46298524737358093, "loss_mode_switch": 0.0, "loss_total": 0.1534184217453003, "step": 685 }, { "batch_size": 4, "epoch": 0.274, "step": 685, "tokens_per_device": 4296 }, { "epoch": 0.274, "loss_ce": 0.10927143692970276, "loss_lvr": 0.891411542892456, "loss_mode_switch": 0.0, "loss_total": 0.19841259717941284, "step": 685 }, { "batch_size": 4, "epoch": 0.274, "step": 685, "tokens_per_device": 1284 }, { "epoch": 0.274, "loss_ce": 0.6855195760726929, "loss_lvr": 1.5645185708999634, "loss_mode_switch": 0.0, "loss_total": 0.8419714570045471, "step": 685 }, { "batch_size": 1, "epoch": 0.274, "step": 685, "tokens_per_device": 5112 }, { "epoch": 0.274, "loss_ce": 0.00033484637970104814, "loss_lvr": 0.20888826251029968, "loss_mode_switch": 0.0, "loss_total": 0.02122367173433304, "step": 685 }, { "batch_size": 4, "epoch": 0.274, "step": 685, "tokens_per_device": 4964 }, { "epoch": 0.274, "loss_ce": 0.13626769185066223, "loss_lvr": 0.6911308765411377, "loss_mode_switch": 0.0, "loss_total": 0.20538078248500824, "step": 685 }, { "batch_size": 4, "epoch": 0.274, "step": 685, "tokens_per_device": 4052 }, { "epoch": 0.274, "loss_ce": 0.36001497507095337, "loss_lvr": 0.9418631196022034, "loss_mode_switch": 0.0, "loss_total": 0.45420128107070923, "step": 685 }, { "batch_size": 4, "epoch": 0.274, "step": 685, "tokens_per_device": 3868 }, { "epoch": 0.274, "loss_ce": 0.3183249533176422, "loss_lvr": 0.9785124063491821, "loss_mode_switch": 0.0, "loss_total": 0.4161761999130249, "step": 685 }, { "batch_size": 4, "epoch": 0.274, "step": 685, "tokens_per_device": 4092 }, { "epoch": 0.274, "loss_ce": 0.19843530654907227, "loss_lvr": 0.9391624927520752, "loss_mode_switch": 0.0, "loss_total": 0.29235154390335083, "step": 685 }, { "epoch": 0.2744, "grad_norm": 1.2367385625839233, "learning_rate": 8.513710717502057e-06, "loss": 0.2678, "step": 686 }, { "batch_size": 4, "epoch": 0.2744, "step": 686, "tokens_per_device": 4196 }, { "epoch": 0.2744, "loss_ce": 0.5247668027877808, "loss_lvr": 0.996839165687561, "loss_mode_switch": 0.0, "loss_total": 0.6244507431983948, "step": 686 }, { "batch_size": 1, "epoch": 0.2744, "step": 686, "tokens_per_device": 5567 }, { "epoch": 0.2744, "loss_ce": 0.006769652012735605, "loss_lvr": 0.47806429862976074, "loss_mode_switch": 0.0, "loss_total": 0.0545760840177536, "step": 686 }, { "batch_size": 4, "epoch": 0.2744, "step": 686, "tokens_per_device": 4324 }, { "epoch": 0.2744, "loss_ce": 0.10093235224485397, "loss_lvr": 0.921920120716095, "loss_mode_switch": 0.0, "loss_total": 0.19312436878681183, "step": 686 }, { "batch_size": 4, "epoch": 0.2744, "step": 686, "tokens_per_device": 6220 }, { "epoch": 0.2744, "loss_ce": 0.10482459515333176, "loss_lvr": 0.7312508225440979, "loss_mode_switch": 0.0, "loss_total": 0.1779496818780899, "step": 686 }, { "batch_size": 4, "epoch": 0.2744, "step": 686, "tokens_per_device": 4612 }, { "epoch": 0.2744, "loss_ce": 0.5730615258216858, "loss_lvr": 0.9472624659538269, "loss_mode_switch": 0.0, "loss_total": 0.6677877902984619, "step": 686 }, { "batch_size": 1, "epoch": 0.2744, "step": 686, "tokens_per_device": 5131 }, { "epoch": 0.2744, "loss_ce": 0.016183681786060333, "loss_lvr": 0.22009408473968506, "loss_mode_switch": 0.0, "loss_total": 0.03819309175014496, "step": 686 }, { "batch_size": 1, "epoch": 0.2744, "step": 686, "tokens_per_device": 4879 }, { "epoch": 0.2744, "loss_ce": 0.0003194641030859202, "loss_lvr": 0.2406315803527832, "loss_mode_switch": 0.0, "loss_total": 0.02438262291252613, "step": 686 }, { "batch_size": 4, "epoch": 0.2744, "step": 686, "tokens_per_device": 3828 }, { "epoch": 0.2744, "loss_ce": 0.055460553616285324, "loss_lvr": 0.9438380002975464, "loss_mode_switch": 0.0, "loss_total": 0.14984434843063354, "step": 686 }, { "epoch": 0.2748, "grad_norm": 1.3970832824707031, "learning_rate": 8.509099379982509e-06, "loss": 0.3021, "step": 687 }, { "batch_size": 4, "epoch": 0.2748, "step": 687, "tokens_per_device": 4292 }, { "epoch": 0.2748, "loss_ce": 0.20678552985191345, "loss_lvr": 0.6486177444458008, "loss_mode_switch": 0.0, "loss_total": 0.27164730429649353, "step": 687 }, { "batch_size": 4, "epoch": 0.2748, "step": 687, "tokens_per_device": 6288 }, { "epoch": 0.2748, "loss_ce": 0.014103410765528679, "loss_lvr": 0.8999781012535095, "loss_mode_switch": 0.0, "loss_total": 0.10410122573375702, "step": 687 }, { "batch_size": 4, "epoch": 0.2748, "step": 687, "tokens_per_device": 4212 }, { "epoch": 0.2748, "loss_ce": 0.10861050337553024, "loss_lvr": 0.8778277039527893, "loss_mode_switch": 0.0, "loss_total": 0.19639328122138977, "step": 687 }, { "batch_size": 4, "epoch": 0.2748, "step": 687, "tokens_per_device": 2636 }, { "epoch": 0.2748, "loss_ce": 0.6805899739265442, "loss_lvr": 0.8469359278678894, "loss_mode_switch": 0.0, "loss_total": 0.7652835845947266, "step": 687 }, { "batch_size": 4, "epoch": 0.2748, "step": 687, "tokens_per_device": 2608 }, { "epoch": 0.2748, "loss_ce": 0.38847634196281433, "loss_lvr": 0.9824977517127991, "loss_mode_switch": 0.0, "loss_total": 0.4867261052131653, "step": 687 }, { "batch_size": 4, "epoch": 0.2748, "step": 687, "tokens_per_device": 11432 }, { "epoch": 0.2748, "loss_ce": 0.12626507878303528, "loss_lvr": 1.1165528297424316, "loss_mode_switch": 0.0, "loss_total": 0.2379203736782074, "step": 687 }, { "batch_size": 1, "epoch": 0.2748, "step": 687, "tokens_per_device": 4938 }, { "epoch": 0.2748, "loss_ce": 0.034231722354888916, "loss_lvr": 0.1897398978471756, "loss_mode_switch": 0.0, "loss_total": 0.053205713629722595, "step": 687 }, { "batch_size": 4, "epoch": 0.2748, "step": 687, "tokens_per_device": 4528 }, { "epoch": 0.2748, "loss_ce": 0.3487400710582733, "loss_lvr": 0.9462435841560364, "loss_mode_switch": 0.0, "loss_total": 0.4433644413948059, "step": 687 }, { "epoch": 0.2752, "grad_norm": 1.2948553562164307, "learning_rate": 8.504482153051912e-06, "loss": 0.3295, "step": 688 }, { "batch_size": 4, "epoch": 0.2752, "step": 688, "tokens_per_device": 4860 }, { "epoch": 0.2752, "loss_ce": 0.17504483461380005, "loss_lvr": 0.8097882270812988, "loss_mode_switch": 0.0, "loss_total": 0.256023645401001, "step": 688 }, { "batch_size": 1, "epoch": 0.2752, "step": 688, "tokens_per_device": 4863 }, { "epoch": 0.2752, "loss_ce": 0.0011976560344919562, "loss_lvr": 0.37604185938835144, "loss_mode_switch": 0.0, "loss_total": 0.03880184143781662, "step": 688 }, { "batch_size": 4, "epoch": 0.2752, "step": 688, "tokens_per_device": 6804 }, { "epoch": 0.2752, "loss_ce": 0.08320711553096771, "loss_lvr": 0.4316728115081787, "loss_mode_switch": 0.0, "loss_total": 0.12637439370155334, "step": 688 }, { "batch_size": 4, "epoch": 0.2752, "step": 688, "tokens_per_device": 4432 }, { "epoch": 0.2752, "loss_ce": 0.036055490374565125, "loss_lvr": 0.9779440760612488, "loss_mode_switch": 0.0, "loss_total": 0.1338498890399933, "step": 688 }, { "batch_size": 4, "epoch": 0.2752, "step": 688, "tokens_per_device": 4348 }, { "epoch": 0.2752, "loss_ce": 0.06378263235092163, "loss_lvr": 0.8825609683990479, "loss_mode_switch": 0.0, "loss_total": 0.15203872323036194, "step": 688 }, { "batch_size": 4, "epoch": 0.2752, "step": 688, "tokens_per_device": 4188 }, { "epoch": 0.2752, "loss_ce": 0.4168263375759125, "loss_lvr": 0.6806212663650513, "loss_mode_switch": 0.0, "loss_total": 0.4848884642124176, "step": 688 }, { "batch_size": 4, "epoch": 0.2752, "step": 688, "tokens_per_device": 4240 }, { "epoch": 0.2752, "loss_ce": 0.5565468668937683, "loss_lvr": 0.9371207356452942, "loss_mode_switch": 0.0, "loss_total": 0.6502589583396912, "step": 688 }, { "batch_size": 1, "epoch": 0.2752, "step": 688, "tokens_per_device": 5062 }, { "epoch": 0.2752, "loss_ce": 0.15904740989208221, "loss_lvr": 0.3478885889053345, "loss_mode_switch": 0.0, "loss_total": 0.1938362717628479, "step": 688 }, { "epoch": 0.2756, "grad_norm": 1.7119686603546143, "learning_rate": 8.499859044459478e-06, "loss": 0.3814, "step": 689 }, { "batch_size": 4, "epoch": 0.2756, "step": 689, "tokens_per_device": 2660 }, { "epoch": 0.2756, "loss_ce": 0.7899270057678223, "loss_lvr": 1.0158075094223022, "loss_mode_switch": 0.0, "loss_total": 0.8915077447891235, "step": 689 }, { "batch_size": 1, "epoch": 0.2756, "step": 689, "tokens_per_device": 5098 }, { "epoch": 0.2756, "loss_ce": 0.06709514558315277, "loss_lvr": 0.13975080847740173, "loss_mode_switch": 0.0, "loss_total": 0.08107022941112518, "step": 689 }, { "batch_size": 4, "epoch": 0.2756, "step": 689, "tokens_per_device": 6164 }, { "epoch": 0.2756, "loss_ce": 0.5805909633636475, "loss_lvr": 0.8586840629577637, "loss_mode_switch": 0.0, "loss_total": 0.6664593815803528, "step": 689 }, { "batch_size": 1, "epoch": 0.2756, "step": 689, "tokens_per_device": 5018 }, { "epoch": 0.2756, "loss_ce": 0.009058231487870216, "loss_lvr": 1.2173962593078613, "loss_mode_switch": 0.0, "loss_total": 0.1307978630065918, "step": 689 }, { "batch_size": 4, "epoch": 0.2756, "step": 689, "tokens_per_device": 4432 }, { "epoch": 0.2756, "loss_ce": 0.1142532080411911, "loss_lvr": 1.0602978467941284, "loss_mode_switch": 0.0, "loss_total": 0.22028300166130066, "step": 689 }, { "batch_size": 4, "epoch": 0.2756, "step": 689, "tokens_per_device": 3784 }, { "epoch": 0.2756, "loss_ce": 0.4580603241920471, "loss_lvr": 1.0322682857513428, "loss_mode_switch": 0.0, "loss_total": 0.5612871646881104, "step": 689 }, { "batch_size": 4, "epoch": 0.2756, "step": 689, "tokens_per_device": 7136 }, { "epoch": 0.2756, "loss_ce": 0.555520236492157, "loss_lvr": 1.118027925491333, "loss_mode_switch": 0.0, "loss_total": 0.6673230528831482, "step": 689 }, { "batch_size": 4, "epoch": 0.2756, "step": 689, "tokens_per_device": 3512 }, { "epoch": 0.2756, "loss_ce": 0.03349219262599945, "loss_lvr": 1.2640235424041748, "loss_mode_switch": 0.0, "loss_total": 0.15989455580711365, "step": 689 }, { "epoch": 0.276, "grad_norm": 1.5711617469787598, "learning_rate": 8.495230061964289e-06, "loss": 0.3532, "step": 690 }, { "batch_size": 1, "epoch": 0.276, "step": 690, "tokens_per_device": 4773 }, { "epoch": 0.276, "loss_ce": 0.0009399122791364789, "loss_lvr": 0.5031118988990784, "loss_mode_switch": 0.0, "loss_total": 0.05125110223889351, "step": 690 }, { "batch_size": 4, "epoch": 0.276, "step": 690, "tokens_per_device": 7996 }, { "epoch": 0.276, "loss_ce": 0.07483170926570892, "loss_lvr": 0.7288603782653809, "loss_mode_switch": 0.0, "loss_total": 0.14771774411201477, "step": 690 }, { "batch_size": 4, "epoch": 0.276, "step": 690, "tokens_per_device": 2556 }, { "epoch": 0.276, "loss_ce": 0.6279179453849792, "loss_lvr": 1.0761905908584595, "loss_mode_switch": 0.0, "loss_total": 0.7355369925498962, "step": 690 }, { "batch_size": 4, "epoch": 0.276, "step": 690, "tokens_per_device": 3808 }, { "epoch": 0.276, "loss_ce": 0.27571237087249756, "loss_lvr": 1.0030580759048462, "loss_mode_switch": 0.0, "loss_total": 0.3760181665420532, "step": 690 }, { "batch_size": 4, "epoch": 0.276, "step": 690, "tokens_per_device": 1516 }, { "epoch": 0.276, "loss_ce": 0.09059765934944153, "loss_lvr": 0.8383883833885193, "loss_mode_switch": 0.0, "loss_total": 0.1744365096092224, "step": 690 }, { "batch_size": 4, "epoch": 0.276, "step": 690, "tokens_per_device": 1244 }, { "epoch": 0.276, "loss_ce": 0.2927107810974121, "loss_lvr": 1.1730451583862305, "loss_mode_switch": 0.0, "loss_total": 0.4100152850151062, "step": 690 }, { "batch_size": 4, "epoch": 0.276, "step": 690, "tokens_per_device": 6516 }, { "epoch": 0.276, "loss_ce": 0.011666649021208286, "loss_lvr": 0.7591490745544434, "loss_mode_switch": 0.0, "loss_total": 0.0875815600156784, "step": 690 }, { "batch_size": 4, "epoch": 0.276, "step": 690, "tokens_per_device": 4332 }, { "epoch": 0.276, "loss_ce": 0.31479141116142273, "loss_lvr": 0.8640553951263428, "loss_mode_switch": 0.0, "loss_total": 0.4011969566345215, "step": 690 }, { "epoch": 0.2764, "grad_norm": 1.3831654787063599, "learning_rate": 8.49059521333528e-06, "loss": 0.3321, "step": 691 }, { "batch_size": 4, "epoch": 0.2764, "step": 691, "tokens_per_device": 4324 }, { "epoch": 0.2764, "loss_ce": 0.17112654447555542, "loss_lvr": 1.1450982093811035, "loss_mode_switch": 0.0, "loss_total": 0.28563636541366577, "step": 691 }, { "batch_size": 4, "epoch": 0.2764, "step": 691, "tokens_per_device": 2004 }, { "epoch": 0.2764, "loss_ce": 0.09885108470916748, "loss_lvr": 0.9692549705505371, "loss_mode_switch": 0.0, "loss_total": 0.1957765817642212, "step": 691 }, { "batch_size": 4, "epoch": 0.2764, "step": 691, "tokens_per_device": 2632 }, { "epoch": 0.2764, "loss_ce": 0.10208212584257126, "loss_lvr": 0.9292696118354797, "loss_mode_switch": 0.0, "loss_total": 0.19500908255577087, "step": 691 }, { "batch_size": 1, "epoch": 0.2764, "step": 691, "tokens_per_device": 4875 }, { "epoch": 0.2764, "loss_ce": 0.0006142858183011413, "loss_lvr": 1.188271164894104, "loss_mode_switch": 0.0, "loss_total": 0.11944140493869781, "step": 691 }, { "batch_size": 4, "epoch": 0.2764, "step": 691, "tokens_per_device": 4312 }, { "epoch": 0.2764, "loss_ce": 0.15916013717651367, "loss_lvr": 0.6544256806373596, "loss_mode_switch": 0.0, "loss_total": 0.22460269927978516, "step": 691 }, { "batch_size": 1, "epoch": 0.2764, "step": 691, "tokens_per_device": 5175 }, { "epoch": 0.2764, "loss_ce": 0.029497774317860603, "loss_lvr": 0.4482579231262207, "loss_mode_switch": 0.0, "loss_total": 0.07432356476783752, "step": 691 }, { "batch_size": 4, "epoch": 0.2764, "step": 691, "tokens_per_device": 3848 }, { "epoch": 0.2764, "loss_ce": 0.16665107011795044, "loss_lvr": 1.108072280883789, "loss_mode_switch": 0.0, "loss_total": 0.2774583101272583, "step": 691 }, { "batch_size": 4, "epoch": 0.2764, "step": 691, "tokens_per_device": 4796 }, { "epoch": 0.2764, "loss_ce": 0.12980011105537415, "loss_lvr": 0.7637221217155457, "loss_mode_switch": 0.0, "loss_total": 0.20617231726646423, "step": 691 }, { "epoch": 0.2768, "grad_norm": 1.4618834257125854, "learning_rate": 8.485954506351241e-06, "loss": 0.3225, "step": 692 }, { "batch_size": 4, "epoch": 0.2768, "step": 692, "tokens_per_device": 4708 }, { "epoch": 0.2768, "loss_ce": 0.9606863260269165, "loss_lvr": 1.043190836906433, "loss_mode_switch": 0.0, "loss_total": 1.0650054216384888, "step": 692 }, { "batch_size": 4, "epoch": 0.2768, "step": 692, "tokens_per_device": 5464 }, { "epoch": 0.2768, "loss_ce": 0.1390388160943985, "loss_lvr": 0.7594485878944397, "loss_mode_switch": 0.0, "loss_total": 0.21498367190361023, "step": 692 }, { "batch_size": 4, "epoch": 0.2768, "step": 692, "tokens_per_device": 2724 }, { "epoch": 0.2768, "loss_ce": 0.13242436945438385, "loss_lvr": 0.6871271729469299, "loss_mode_switch": 0.0, "loss_total": 0.20113709568977356, "step": 692 }, { "batch_size": 4, "epoch": 0.2768, "step": 692, "tokens_per_device": 5952 }, { "epoch": 0.2768, "loss_ce": 0.1314694881439209, "loss_lvr": 1.0813615322113037, "loss_mode_switch": 0.0, "loss_total": 0.2396056354045868, "step": 692 }, { "batch_size": 1, "epoch": 0.2768, "step": 692, "tokens_per_device": 4242 }, { "epoch": 0.2768, "loss_ce": 0.018915003165602684, "loss_lvr": 0.7091767191886902, "loss_mode_switch": 0.0, "loss_total": 0.08983267843723297, "step": 692 }, { "batch_size": 4, "epoch": 0.2768, "step": 692, "tokens_per_device": 4280 }, { "epoch": 0.2768, "loss_ce": 0.015240891836583614, "loss_lvr": 0.9750903248786926, "loss_mode_switch": 0.0, "loss_total": 0.11274992674589157, "step": 692 }, { "batch_size": 4, "epoch": 0.2768, "step": 692, "tokens_per_device": 12188 }, { "epoch": 0.2768, "loss_ce": 0.7800886034965515, "loss_lvr": 0.8777444958686829, "loss_mode_switch": 0.0, "loss_total": 0.8678630590438843, "step": 692 }, { "batch_size": 4, "epoch": 0.2768, "step": 692, "tokens_per_device": 7372 }, { "epoch": 0.2768, "loss_ce": 0.03348081558942795, "loss_lvr": 0.8110862970352173, "loss_mode_switch": 0.0, "loss_total": 0.11458944529294968, "step": 692 }, { "epoch": 0.2772, "grad_norm": 1.3171454668045044, "learning_rate": 8.481307948800787e-06, "loss": 0.3478, "step": 693 }, { "batch_size": 1, "epoch": 0.2772, "step": 693, "tokens_per_device": 5334 }, { "epoch": 0.2772, "loss_ce": 0.07794986665248871, "loss_lvr": 0.7584951519966125, "loss_mode_switch": 0.0, "loss_total": 0.1537993848323822, "step": 693 }, { "batch_size": 4, "epoch": 0.2772, "step": 693, "tokens_per_device": 3852 }, { "epoch": 0.2772, "loss_ce": 0.08058563619852066, "loss_lvr": 0.5611488223075867, "loss_mode_switch": 0.0, "loss_total": 0.13670051097869873, "step": 693 }, { "batch_size": 1, "epoch": 0.2772, "step": 693, "tokens_per_device": 4873 }, { "epoch": 0.2772, "loss_ce": 0.006249399855732918, "loss_lvr": 0.6057267785072327, "loss_mode_switch": 0.0, "loss_total": 0.06682208180427551, "step": 693 }, { "batch_size": 4, "epoch": 0.2772, "step": 693, "tokens_per_device": 13864 }, { "epoch": 0.2772, "loss_ce": 0.1826227903366089, "loss_lvr": 0.7413793206214905, "loss_mode_switch": 0.0, "loss_total": 0.25676071643829346, "step": 693 }, { "batch_size": 4, "epoch": 0.2772, "step": 693, "tokens_per_device": 2712 }, { "epoch": 0.2772, "loss_ce": 0.027242807671427727, "loss_lvr": 0.5468629598617554, "loss_mode_switch": 0.0, "loss_total": 0.08192910254001617, "step": 693 }, { "batch_size": 4, "epoch": 0.2772, "step": 693, "tokens_per_device": 4424 }, { "epoch": 0.2772, "loss_ce": 0.3642462491989136, "loss_lvr": 0.7305670380592346, "loss_mode_switch": 0.0, "loss_total": 0.43730294704437256, "step": 693 }, { "batch_size": 4, "epoch": 0.2772, "step": 693, "tokens_per_device": 2556 }, { "epoch": 0.2772, "loss_ce": 0.46421560645103455, "loss_lvr": 0.9080520868301392, "loss_mode_switch": 0.0, "loss_total": 0.555020809173584, "step": 693 }, { "batch_size": 1, "epoch": 0.2772, "step": 693, "tokens_per_device": 4688 }, { "epoch": 0.2772, "loss_ce": 0.017557982355356216, "loss_lvr": 0.5233907103538513, "loss_mode_switch": 0.0, "loss_total": 0.06989705562591553, "step": 693 }, { "epoch": 0.2776, "grad_norm": 1.3502613306045532, "learning_rate": 8.476655548482353e-06, "loss": 0.2861, "step": 694 }, { "batch_size": 4, "epoch": 0.2776, "step": 694, "tokens_per_device": 4876 }, { "epoch": 0.2776, "loss_ce": 0.025302812457084656, "loss_lvr": 0.9385969042778015, "loss_mode_switch": 0.0, "loss_total": 0.11916250735521317, "step": 694 }, { "batch_size": 1, "epoch": 0.2776, "step": 694, "tokens_per_device": 4871 }, { "epoch": 0.2776, "loss_ce": 0.018152089789509773, "loss_lvr": 0.7922133207321167, "loss_mode_switch": 0.0, "loss_total": 0.09737341850996017, "step": 694 }, { "batch_size": 4, "epoch": 0.2776, "step": 694, "tokens_per_device": 4820 }, { "epoch": 0.2776, "loss_ce": 0.18982917070388794, "loss_lvr": 0.8126530647277832, "loss_mode_switch": 0.0, "loss_total": 0.2710944712162018, "step": 694 }, { "batch_size": 4, "epoch": 0.2776, "step": 694, "tokens_per_device": 3920 }, { "epoch": 0.2776, "loss_ce": 0.42248135805130005, "loss_lvr": 0.898313581943512, "loss_mode_switch": 0.0, "loss_total": 0.5123127102851868, "step": 694 }, { "batch_size": 4, "epoch": 0.2776, "step": 694, "tokens_per_device": 1608 }, { "epoch": 0.2776, "loss_ce": 0.5278117060661316, "loss_lvr": 1.0697250366210938, "loss_mode_switch": 0.0, "loss_total": 0.6347842216491699, "step": 694 }, { "batch_size": 4, "epoch": 0.2776, "step": 694, "tokens_per_device": 2696 }, { "epoch": 0.2776, "loss_ce": 0.8158460259437561, "loss_lvr": 1.1327279806137085, "loss_mode_switch": 0.0, "loss_total": 0.929118812084198, "step": 694 }, { "batch_size": 4, "epoch": 0.2776, "step": 694, "tokens_per_device": 1504 }, { "epoch": 0.2776, "loss_ce": 0.32696041464805603, "loss_lvr": 1.1763734817504883, "loss_mode_switch": 0.0, "loss_total": 0.4445977807044983, "step": 694 }, { "batch_size": 1, "epoch": 0.2776, "step": 694, "tokens_per_device": 4886 }, { "epoch": 0.2776, "loss_ce": 0.016375230625271797, "loss_lvr": 0.6564738154411316, "loss_mode_switch": 0.0, "loss_total": 0.08202261477708817, "step": 694 }, { "epoch": 0.278, "grad_norm": 1.2875912189483643, "learning_rate": 8.471997313204183e-06, "loss": 0.3061, "step": 695 }, { "batch_size": 4, "epoch": 0.278, "step": 695, "tokens_per_device": 2688 }, { "epoch": 0.278, "loss_ce": 0.2583276331424713, "loss_lvr": 1.048454761505127, "loss_mode_switch": 0.0, "loss_total": 0.36317312717437744, "step": 695 }, { "batch_size": 4, "epoch": 0.278, "step": 695, "tokens_per_device": 1380 }, { "epoch": 0.278, "loss_ce": 0.2571168839931488, "loss_lvr": 0.9716700911521912, "loss_mode_switch": 0.0, "loss_total": 0.3542838990688324, "step": 695 }, { "batch_size": 4, "epoch": 0.278, "step": 695, "tokens_per_device": 13064 }, { "epoch": 0.278, "loss_ce": 0.04919126257300377, "loss_lvr": 0.5271804928779602, "loss_mode_switch": 0.0, "loss_total": 0.10190930962562561, "step": 695 }, { "batch_size": 4, "epoch": 0.278, "step": 695, "tokens_per_device": 3996 }, { "epoch": 0.278, "loss_ce": 0.6267423033714294, "loss_lvr": 0.9056622982025146, "loss_mode_switch": 0.0, "loss_total": 0.717308521270752, "step": 695 }, { "batch_size": 4, "epoch": 0.278, "step": 695, "tokens_per_device": 2620 }, { "epoch": 0.278, "loss_ce": 0.31961044669151306, "loss_lvr": 1.0247859954833984, "loss_mode_switch": 0.0, "loss_total": 0.4220890402793884, "step": 695 }, { "batch_size": 4, "epoch": 0.278, "step": 695, "tokens_per_device": 1200 }, { "epoch": 0.278, "loss_ce": 0.4843078553676605, "loss_lvr": 1.0722910165786743, "loss_mode_switch": 0.0, "loss_total": 0.5915369391441345, "step": 695 }, { "batch_size": 4, "epoch": 0.278, "step": 695, "tokens_per_device": 4040 }, { "epoch": 0.278, "loss_ce": 0.476683109998703, "loss_lvr": 1.0425050258636475, "loss_mode_switch": 0.0, "loss_total": 0.5809336304664612, "step": 695 }, { "batch_size": 4, "epoch": 0.278, "step": 695, "tokens_per_device": 1948 }, { "epoch": 0.278, "loss_ce": 0.16189348697662354, "loss_lvr": 1.7572214603424072, "loss_mode_switch": 0.0, "loss_total": 0.33761563897132874, "step": 695 }, { "epoch": 0.2784, "grad_norm": 1.3383952379226685, "learning_rate": 8.467333250784309e-06, "loss": 0.3519, "step": 696 }, { "batch_size": 1, "epoch": 0.2784, "step": 696, "tokens_per_device": 4962 }, { "epoch": 0.2784, "loss_ce": 0.09674906730651855, "loss_lvr": 0.41907474398612976, "loss_mode_switch": 0.0, "loss_total": 0.13865654170513153, "step": 696 }, { "batch_size": 4, "epoch": 0.2784, "step": 696, "tokens_per_device": 15648 }, { "epoch": 0.2784, "loss_ce": 0.09335021674633026, "loss_lvr": 1.7998720407485962, "loss_mode_switch": 0.0, "loss_total": 0.2733374238014221, "step": 696 }, { "batch_size": 1, "epoch": 0.2784, "step": 696, "tokens_per_device": 5296 }, { "epoch": 0.2784, "loss_ce": 0.0070833624340593815, "loss_lvr": 0.42757275700569153, "loss_mode_switch": 0.0, "loss_total": 0.049840640276670456, "step": 696 }, { "batch_size": 4, "epoch": 0.2784, "step": 696, "tokens_per_device": 4256 }, { "epoch": 0.2784, "loss_ce": 0.22198352217674255, "loss_lvr": 0.8154141902923584, "loss_mode_switch": 0.0, "loss_total": 0.3035249412059784, "step": 696 }, { "batch_size": 4, "epoch": 0.2784, "step": 696, "tokens_per_device": 1924 }, { "epoch": 0.2784, "loss_ce": 0.21403636038303375, "loss_lvr": 0.9072778224945068, "loss_mode_switch": 0.0, "loss_total": 0.30476415157318115, "step": 696 }, { "batch_size": 4, "epoch": 0.2784, "step": 696, "tokens_per_device": 7792 }, { "epoch": 0.2784, "loss_ce": 0.46104657649993896, "loss_lvr": 0.9391165375709534, "loss_mode_switch": 0.0, "loss_total": 0.5549582242965698, "step": 696 }, { "batch_size": 4, "epoch": 0.2784, "step": 696, "tokens_per_device": 5152 }, { "epoch": 0.2784, "loss_ce": 0.08199959248304367, "loss_lvr": 0.7615139484405518, "loss_mode_switch": 0.0, "loss_total": 0.15815098583698273, "step": 696 }, { "batch_size": 1, "epoch": 0.2784, "step": 696, "tokens_per_device": 6506 }, { "epoch": 0.2784, "loss_ce": 0.08497491478919983, "loss_lvr": 0.40810635685920715, "loss_mode_switch": 0.0, "loss_total": 0.12578555941581726, "step": 696 }, { "epoch": 0.2788, "grad_norm": 1.4748990535736084, "learning_rate": 8.46266336905055e-06, "loss": 0.3136, "step": 697 }, { "batch_size": 4, "epoch": 0.2788, "step": 697, "tokens_per_device": 7688 }, { "epoch": 0.2788, "loss_ce": 0.4126795828342438, "loss_lvr": 0.7603738903999329, "loss_mode_switch": 0.0, "loss_total": 0.4887169599533081, "step": 697 }, { "batch_size": 1, "epoch": 0.2788, "step": 697, "tokens_per_device": 4930 }, { "epoch": 0.2788, "loss_ce": 0.22682182490825653, "loss_lvr": 0.4739520251750946, "loss_mode_switch": 0.0, "loss_total": 0.27421703934669495, "step": 697 }, { "batch_size": 4, "epoch": 0.2788, "step": 697, "tokens_per_device": 6384 }, { "epoch": 0.2788, "loss_ce": 0.7460720539093018, "loss_lvr": 0.9164907336235046, "loss_mode_switch": 0.0, "loss_total": 0.8377211093902588, "step": 697 }, { "batch_size": 1, "epoch": 0.2788, "step": 697, "tokens_per_device": 5017 }, { "epoch": 0.2788, "loss_ce": 0.010870284400880337, "loss_lvr": 0.17329132556915283, "loss_mode_switch": 0.0, "loss_total": 0.028199415653944016, "step": 697 }, { "batch_size": 4, "epoch": 0.2788, "step": 697, "tokens_per_device": 15856 }, { "epoch": 0.2788, "loss_ce": 0.577210009098053, "loss_lvr": 0.9798722267150879, "loss_mode_switch": 0.0, "loss_total": 0.6751972436904907, "step": 697 }, { "batch_size": 1, "epoch": 0.2788, "step": 697, "tokens_per_device": 5120 }, { "epoch": 0.2788, "loss_ce": 0.006911025382578373, "loss_lvr": 2.017911911010742, "loss_mode_switch": 0.0, "loss_total": 0.2087022215127945, "step": 697 }, { "batch_size": 4, "epoch": 0.2788, "step": 697, "tokens_per_device": 1484 }, { "epoch": 0.2788, "loss_ce": 0.5589082837104797, "loss_lvr": 1.0625152587890625, "loss_mode_switch": 0.0, "loss_total": 0.6651598215103149, "step": 697 }, { "batch_size": 4, "epoch": 0.2788, "step": 697, "tokens_per_device": 8804 }, { "epoch": 0.2788, "loss_ce": 0.4930140972137451, "loss_lvr": 0.6894352436065674, "loss_mode_switch": 0.0, "loss_total": 0.561957597732544, "step": 697 }, { "epoch": 0.2792, "grad_norm": 1.4256974458694458, "learning_rate": 8.457987675840484e-06, "loss": 0.35, "step": 698 }, { "batch_size": 4, "epoch": 0.2792, "step": 698, "tokens_per_device": 2788 }, { "epoch": 0.2792, "loss_ce": 0.48819124698638916, "loss_lvr": 0.8264760375022888, "loss_mode_switch": 0.0, "loss_total": 0.5708388686180115, "step": 698 }, { "batch_size": 4, "epoch": 0.2792, "step": 698, "tokens_per_device": 1684 }, { "epoch": 0.2792, "loss_ce": 0.13127197325229645, "loss_lvr": 0.9344606399536133, "loss_mode_switch": 0.0, "loss_total": 0.22471803426742554, "step": 698 }, { "batch_size": 4, "epoch": 0.2792, "step": 698, "tokens_per_device": 4048 }, { "epoch": 0.2792, "loss_ce": 0.1768285185098648, "loss_lvr": 2.0081825256347656, "loss_mode_switch": 0.0, "loss_total": 0.3776467740535736, "step": 698 }, { "batch_size": 4, "epoch": 0.2792, "step": 698, "tokens_per_device": 1168 }, { "epoch": 0.2792, "loss_ce": 0.4094536602497101, "loss_lvr": 1.4988012313842773, "loss_mode_switch": 0.0, "loss_total": 0.5593338012695312, "step": 698 }, { "batch_size": 1, "epoch": 0.2792, "step": 698, "tokens_per_device": 5382 }, { "epoch": 0.2792, "loss_ce": 0.10172295570373535, "loss_lvr": 0.6704787015914917, "loss_mode_switch": 0.0, "loss_total": 0.16877081990242004, "step": 698 }, { "batch_size": 4, "epoch": 0.2792, "step": 698, "tokens_per_device": 6184 }, { "epoch": 0.2792, "loss_ce": 0.062018200755119324, "loss_lvr": 0.8020475506782532, "loss_mode_switch": 0.0, "loss_total": 0.14222295582294464, "step": 698 }, { "batch_size": 4, "epoch": 0.2792, "step": 698, "tokens_per_device": 4936 }, { "epoch": 0.2792, "loss_ce": 0.15189793705940247, "loss_lvr": 1.2702882289886475, "loss_mode_switch": 0.0, "loss_total": 0.2789267599582672, "step": 698 }, { "batch_size": 1, "epoch": 0.2792, "step": 698, "tokens_per_device": 5313 }, { "epoch": 0.2792, "loss_ce": 0.009246132336556911, "loss_lvr": 0.35249030590057373, "loss_mode_switch": 0.0, "loss_total": 0.04449516534805298, "step": 698 }, { "epoch": 0.2796, "grad_norm": 1.4440473318099976, "learning_rate": 8.45330617900145e-06, "loss": 0.3289, "step": 699 }, { "batch_size": 4, "epoch": 0.2796, "step": 699, "tokens_per_device": 11928 }, { "epoch": 0.2796, "loss_ce": 0.5510358810424805, "loss_lvr": 1.0851805210113525, "loss_mode_switch": 0.0, "loss_total": 0.6595539450645447, "step": 699 }, { "batch_size": 4, "epoch": 0.2796, "step": 699, "tokens_per_device": 1376 }, { "epoch": 0.2796, "loss_ce": 0.24757720530033112, "loss_lvr": 0.9797724485397339, "loss_mode_switch": 0.0, "loss_total": 0.3455544412136078, "step": 699 }, { "batch_size": 4, "epoch": 0.2796, "step": 699, "tokens_per_device": 3644 }, { "epoch": 0.2796, "loss_ce": 0.6456726789474487, "loss_lvr": 0.7786950469017029, "loss_mode_switch": 0.0, "loss_total": 0.7235422134399414, "step": 699 }, { "batch_size": 4, "epoch": 0.2796, "step": 699, "tokens_per_device": 2784 }, { "epoch": 0.2796, "loss_ce": 0.09216964244842529, "loss_lvr": 1.082290530204773, "loss_mode_switch": 0.0, "loss_total": 0.20039869844913483, "step": 699 }, { "batch_size": 4, "epoch": 0.2796, "step": 699, "tokens_per_device": 2644 }, { "epoch": 0.2796, "loss_ce": 0.3278994560241699, "loss_lvr": 0.8565382361412048, "loss_mode_switch": 0.0, "loss_total": 0.41355329751968384, "step": 699 }, { "batch_size": 4, "epoch": 0.2796, "step": 699, "tokens_per_device": 5188 }, { "epoch": 0.2796, "loss_ce": 0.34998613595962524, "loss_lvr": 0.8469787240028381, "loss_mode_switch": 0.0, "loss_total": 0.43468400835990906, "step": 699 }, { "batch_size": 4, "epoch": 0.2796, "step": 699, "tokens_per_device": 12388 }, { "epoch": 0.2796, "loss_ce": 0.3393498957157135, "loss_lvr": 0.8247997760772705, "loss_mode_switch": 0.0, "loss_total": 0.42182987928390503, "step": 699 }, { "batch_size": 4, "epoch": 0.2796, "step": 699, "tokens_per_device": 5648 }, { "epoch": 0.2796, "loss_ce": 0.2603362798690796, "loss_lvr": 1.0110077857971191, "loss_mode_switch": 0.0, "loss_total": 0.361437052488327, "step": 699 }, { "epoch": 0.28, "grad_norm": 1.3762187957763672, "learning_rate": 8.448618886390523e-06, "loss": 0.3625, "step": 700 }, { "batch_size": 4, "epoch": 0.28, "step": 700, "tokens_per_device": 4244 }, { "epoch": 0.28, "loss_ce": 0.2507613003253937, "loss_lvr": 1.0325261354446411, "loss_mode_switch": 0.0, "loss_total": 0.35401391983032227, "step": 700 }, { "batch_size": 4, "epoch": 0.28, "step": 700, "tokens_per_device": 1160 }, { "epoch": 0.28, "loss_ce": 0.3196127712726593, "loss_lvr": 1.0794367790222168, "loss_mode_switch": 0.0, "loss_total": 0.42755645513534546, "step": 700 }, { "batch_size": 1, "epoch": 0.28, "step": 700, "tokens_per_device": 5186 }, { "epoch": 0.28, "loss_ce": 0.10919245332479477, "loss_lvr": 0.5480767488479614, "loss_mode_switch": 0.0, "loss_total": 0.16400012373924255, "step": 700 }, { "batch_size": 4, "epoch": 0.28, "step": 700, "tokens_per_device": 4268 }, { "epoch": 0.28, "loss_ce": 0.7724229097366333, "loss_lvr": 1.2109031677246094, "loss_mode_switch": 0.0, "loss_total": 0.8935132026672363, "step": 700 }, { "batch_size": 1, "epoch": 0.28, "step": 700, "tokens_per_device": 4695 }, { "epoch": 0.28, "loss_ce": 0.006686581298708916, "loss_lvr": 0.7574411034584045, "loss_mode_switch": 0.0, "loss_total": 0.08243069797754288, "step": 700 }, { "batch_size": 4, "epoch": 0.28, "step": 700, "tokens_per_device": 3808 }, { "epoch": 0.28, "loss_ce": 0.3009645938873291, "loss_lvr": 1.0680242776870728, "loss_mode_switch": 0.0, "loss_total": 0.40776702761650085, "step": 700 }, { "batch_size": 1, "epoch": 0.28, "step": 700, "tokens_per_device": 5138 }, { "epoch": 0.28, "loss_ce": 0.07399427145719528, "loss_lvr": 0.4566929340362549, "loss_mode_switch": 0.0, "loss_total": 0.11966356635093689, "step": 700 }, { "batch_size": 4, "epoch": 0.28, "step": 700, "tokens_per_device": 6612 }, { "epoch": 0.28, "loss_ce": 0.2084183245897293, "loss_lvr": 0.771093487739563, "loss_mode_switch": 0.0, "loss_total": 0.28552767634391785, "step": 700 }, { "epoch": 0.2804, "grad_norm": 1.317683219909668, "learning_rate": 8.443925805874502e-06, "loss": 0.3103, "step": 701 }, { "batch_size": 1, "epoch": 0.2804, "step": 701, "tokens_per_device": 4885 }, { "epoch": 0.2804, "loss_ce": 0.01678130216896534, "loss_lvr": 0.399101585149765, "loss_mode_switch": 0.0, "loss_total": 0.05669146031141281, "step": 701 }, { "batch_size": 4, "epoch": 0.2804, "step": 701, "tokens_per_device": 2700 }, { "epoch": 0.2804, "loss_ce": 0.15511804819107056, "loss_lvr": 0.8634624481201172, "loss_mode_switch": 0.0, "loss_total": 0.2414642870426178, "step": 701 }, { "batch_size": 4, "epoch": 0.2804, "step": 701, "tokens_per_device": 4260 }, { "epoch": 0.2804, "loss_ce": 0.5382702946662903, "loss_lvr": 0.8698548078536987, "loss_mode_switch": 0.0, "loss_total": 0.6252557635307312, "step": 701 }, { "batch_size": 4, "epoch": 0.2804, "step": 701, "tokens_per_device": 4244 }, { "epoch": 0.2804, "loss_ce": 0.56487637758255, "loss_lvr": 0.9576225876808167, "loss_mode_switch": 0.0, "loss_total": 0.6606386303901672, "step": 701 }, { "batch_size": 4, "epoch": 0.2804, "step": 701, "tokens_per_device": 2444 }, { "epoch": 0.2804, "loss_ce": 0.36908963322639465, "loss_lvr": 0.9123968482017517, "loss_mode_switch": 0.0, "loss_total": 0.4603293240070343, "step": 701 }, { "batch_size": 4, "epoch": 0.2804, "step": 701, "tokens_per_device": 5204 }, { "epoch": 0.2804, "loss_ce": 0.40604856610298157, "loss_lvr": 0.9108015298843384, "loss_mode_switch": 0.0, "loss_total": 0.4971287250518799, "step": 701 }, { "batch_size": 1, "epoch": 0.2804, "step": 701, "tokens_per_device": 4743 }, { "epoch": 0.2804, "loss_ce": 0.035465728491544724, "loss_lvr": 0.5447633266448975, "loss_mode_switch": 0.0, "loss_total": 0.08994206041097641, "step": 701 }, { "batch_size": 4, "epoch": 0.2804, "step": 701, "tokens_per_device": 2600 }, { "epoch": 0.2804, "loss_ce": 0.35657474398612976, "loss_lvr": 0.8223934769630432, "loss_mode_switch": 0.0, "loss_total": 0.43881410360336304, "step": 701 }, { "epoch": 0.2808, "grad_norm": 1.3481837511062622, "learning_rate": 8.439226945329908e-06, "loss": 0.3561, "step": 702 }, { "batch_size": 1, "epoch": 0.2808, "step": 702, "tokens_per_device": 5004 }, { "epoch": 0.2808, "loss_ce": 0.05675528198480606, "loss_lvr": 0.2944498360157013, "loss_mode_switch": 0.0, "loss_total": 0.08620026707649231, "step": 702 }, { "batch_size": 4, "epoch": 0.2808, "step": 702, "tokens_per_device": 4248 }, { "epoch": 0.2808, "loss_ce": 0.16746169328689575, "loss_lvr": 1.7093827724456787, "loss_mode_switch": 0.0, "loss_total": 0.3383999764919281, "step": 702 }, { "batch_size": 4, "epoch": 0.2808, "step": 702, "tokens_per_device": 1456 }, { "epoch": 0.2808, "loss_ce": 0.30585381388664246, "loss_lvr": 1.176788568496704, "loss_mode_switch": 0.0, "loss_total": 0.4235326647758484, "step": 702 }, { "batch_size": 4, "epoch": 0.2808, "step": 702, "tokens_per_device": 12400 }, { "epoch": 0.2808, "loss_ce": 0.0789405032992363, "loss_lvr": 0.7496812343597412, "loss_mode_switch": 0.0, "loss_total": 0.1539086252450943, "step": 702 }, { "batch_size": 4, "epoch": 0.2808, "step": 702, "tokens_per_device": 1516 }, { "epoch": 0.2808, "loss_ce": 0.21536928415298462, "loss_lvr": 1.0239789485931396, "loss_mode_switch": 0.0, "loss_total": 0.3177671730518341, "step": 702 }, { "batch_size": 1, "epoch": 0.2808, "step": 702, "tokens_per_device": 7689 }, { "epoch": 0.2808, "loss_ce": 0.0013648413587361574, "loss_lvr": 0.2980664074420929, "loss_mode_switch": 0.0, "loss_total": 0.031171483919024467, "step": 702 }, { "batch_size": 4, "epoch": 0.2808, "step": 702, "tokens_per_device": 8948 }, { "epoch": 0.2808, "loss_ce": 0.14127495884895325, "loss_lvr": 0.5499933958053589, "loss_mode_switch": 0.0, "loss_total": 0.1962742954492569, "step": 702 }, { "batch_size": 4, "epoch": 0.2808, "step": 702, "tokens_per_device": 6380 }, { "epoch": 0.2808, "loss_ce": 0.008506479673087597, "loss_lvr": 0.747131884098053, "loss_mode_switch": 0.0, "loss_total": 0.08321966975927353, "step": 702 }, { "epoch": 0.2812, "grad_norm": 1.3286787271499634, "learning_rate": 8.434522312642955e-06, "loss": 0.3074, "step": 703 }, { "batch_size": 4, "epoch": 0.2812, "step": 703, "tokens_per_device": 4236 }, { "epoch": 0.2812, "loss_ce": 0.055021513253450394, "loss_lvr": 1.1132187843322754, "loss_mode_switch": 0.0, "loss_total": 0.16634339094161987, "step": 703 }, { "batch_size": 1, "epoch": 0.2812, "step": 703, "tokens_per_device": 5114 }, { "epoch": 0.2812, "loss_ce": 0.0047561777755618095, "loss_lvr": 0.4672691822052002, "loss_mode_switch": 0.0, "loss_total": 0.05148309841752052, "step": 703 }, { "batch_size": 4, "epoch": 0.2812, "step": 703, "tokens_per_device": 1456 }, { "epoch": 0.2812, "loss_ce": 0.324822336435318, "loss_lvr": 0.9922852516174316, "loss_mode_switch": 0.0, "loss_total": 0.42405086755752563, "step": 703 }, { "batch_size": 4, "epoch": 0.2812, "step": 703, "tokens_per_device": 5272 }, { "epoch": 0.2812, "loss_ce": 0.09599979221820831, "loss_lvr": 0.6846331357955933, "loss_mode_switch": 0.0, "loss_total": 0.1644631028175354, "step": 703 }, { "batch_size": 1, "epoch": 0.2812, "step": 703, "tokens_per_device": 4832 }, { "epoch": 0.2812, "loss_ce": 0.00783032737672329, "loss_lvr": 0.564026415348053, "loss_mode_switch": 0.0, "loss_total": 0.0642329677939415, "step": 703 }, { "batch_size": 1, "epoch": 0.2812, "step": 703, "tokens_per_device": 4820 }, { "epoch": 0.2812, "loss_ce": 0.049584437161684036, "loss_lvr": 0.26051172614097595, "loss_mode_switch": 0.0, "loss_total": 0.07563561201095581, "step": 703 }, { "batch_size": 4, "epoch": 0.2812, "step": 703, "tokens_per_device": 6376 }, { "epoch": 0.2812, "loss_ce": 0.4615911543369293, "loss_lvr": 0.757231593132019, "loss_mode_switch": 0.0, "loss_total": 0.5373142957687378, "step": 703 }, { "batch_size": 1, "epoch": 0.2812, "step": 703, "tokens_per_device": 5099 }, { "epoch": 0.2812, "loss_ce": 0.14898064732551575, "loss_lvr": 0.3243952691555023, "loss_mode_switch": 0.0, "loss_total": 0.18142017722129822, "step": 703 }, { "epoch": 0.2816, "grad_norm": 1.577262282371521, "learning_rate": 8.42981191570955e-06, "loss": 0.2848, "step": 704 }, { "batch_size": 1, "epoch": 0.2816, "step": 704, "tokens_per_device": 4935 }, { "epoch": 0.2816, "loss_ce": 0.07594893127679825, "loss_lvr": 0.9204472899436951, "loss_mode_switch": 0.0, "loss_total": 0.1679936647415161, "step": 704 }, { "batch_size": 1, "epoch": 0.2816, "step": 704, "tokens_per_device": 5003 }, { "epoch": 0.2816, "loss_ce": 0.016991185024380684, "loss_lvr": 0.6529752612113953, "loss_mode_switch": 0.0, "loss_total": 0.0822887122631073, "step": 704 }, { "batch_size": 4, "epoch": 0.2816, "step": 704, "tokens_per_device": 10220 }, { "epoch": 0.2816, "loss_ce": 0.10866259038448334, "loss_lvr": 0.7714880108833313, "loss_mode_switch": 0.0, "loss_total": 0.18581140041351318, "step": 704 }, { "batch_size": 1, "epoch": 0.2816, "step": 704, "tokens_per_device": 4860 }, { "epoch": 0.2816, "loss_ce": 0.0014883553376421332, "loss_lvr": 0.4685763716697693, "loss_mode_switch": 0.0, "loss_total": 0.04834599047899246, "step": 704 }, { "batch_size": 4, "epoch": 0.2816, "step": 704, "tokens_per_device": 4040 }, { "epoch": 0.2816, "loss_ce": 0.18921931087970734, "loss_lvr": 0.9395734667778015, "loss_mode_switch": 0.0, "loss_total": 0.2831766605377197, "step": 704 }, { "batch_size": 1, "epoch": 0.2816, "step": 704, "tokens_per_device": 5263 }, { "epoch": 0.2816, "loss_ce": 0.0076860785484313965, "loss_lvr": 0.7301124334335327, "loss_mode_switch": 0.0, "loss_total": 0.08069732040166855, "step": 704 }, { "batch_size": 4, "epoch": 0.2816, "step": 704, "tokens_per_device": 9004 }, { "epoch": 0.2816, "loss_ce": 0.19102922081947327, "loss_lvr": 0.5810552835464478, "loss_mode_switch": 0.0, "loss_total": 0.24913474917411804, "step": 704 }, { "batch_size": 4, "epoch": 0.2816, "step": 704, "tokens_per_device": 3764 }, { "epoch": 0.2816, "loss_ce": 0.44758495688438416, "loss_lvr": 0.9873179197311401, "loss_mode_switch": 0.0, "loss_total": 0.5463167428970337, "step": 704 }, { "epoch": 0.282, "grad_norm": 1.326404333114624, "learning_rate": 8.425095762435274e-06, "loss": 0.3043, "step": 705 }, { "batch_size": 4, "epoch": 0.282, "step": 705, "tokens_per_device": 3860 }, { "epoch": 0.282, "loss_ce": 0.3719417452812195, "loss_lvr": 1.614086389541626, "loss_mode_switch": 0.0, "loss_total": 0.53335040807724, "step": 705 }, { "batch_size": 4, "epoch": 0.282, "step": 705, "tokens_per_device": 4144 }, { "epoch": 0.282, "loss_ce": 0.14051830768585205, "loss_lvr": 1.0064365863800049, "loss_mode_switch": 0.0, "loss_total": 0.24116197228431702, "step": 705 }, { "batch_size": 1, "epoch": 0.282, "step": 705, "tokens_per_device": 5147 }, { "epoch": 0.282, "loss_ce": 0.14965569972991943, "loss_lvr": 0.2273082584142685, "loss_mode_switch": 0.0, "loss_total": 0.1723865270614624, "step": 705 }, { "batch_size": 1, "epoch": 0.282, "step": 705, "tokens_per_device": 5110 }, { "epoch": 0.282, "loss_ce": 0.03204271197319031, "loss_lvr": 0.39501845836639404, "loss_mode_switch": 0.0, "loss_total": 0.07154455780982971, "step": 705 }, { "batch_size": 4, "epoch": 0.282, "step": 705, "tokens_per_device": 4748 }, { "epoch": 0.282, "loss_ce": 0.04898770526051521, "loss_lvr": 0.6840733885765076, "loss_mode_switch": 0.0, "loss_total": 0.11739504337310791, "step": 705 }, { "batch_size": 4, "epoch": 0.282, "step": 705, "tokens_per_device": 4220 }, { "epoch": 0.282, "loss_ce": 0.2941192090511322, "loss_lvr": 1.2007849216461182, "loss_mode_switch": 0.0, "loss_total": 0.414197713136673, "step": 705 }, { "batch_size": 4, "epoch": 0.282, "step": 705, "tokens_per_device": 2788 }, { "epoch": 0.282, "loss_ce": 0.3768969774246216, "loss_lvr": 0.860514760017395, "loss_mode_switch": 0.0, "loss_total": 0.46294844150543213, "step": 705 }, { "batch_size": 1, "epoch": 0.282, "step": 705, "tokens_per_device": 5086 }, { "epoch": 0.282, "loss_ce": 0.04758383333683014, "loss_lvr": 0.5069867968559265, "loss_mode_switch": 0.0, "loss_total": 0.09828251600265503, "step": 705 }, { "epoch": 0.2824, "grad_norm": 1.3669906854629517, "learning_rate": 8.420373860735366e-06, "loss": 0.3631, "step": 706 }, { "batch_size": 4, "epoch": 0.2824, "step": 706, "tokens_per_device": 3296 }, { "epoch": 0.2824, "loss_ce": 0.5549596548080444, "loss_lvr": 1.0678086280822754, "loss_mode_switch": 0.0, "loss_total": 0.6617405414581299, "step": 706 }, { "batch_size": 4, "epoch": 0.2824, "step": 706, "tokens_per_device": 3736 }, { "epoch": 0.2824, "loss_ce": 0.4771740436553955, "loss_lvr": 1.1884791851043701, "loss_mode_switch": 0.0, "loss_total": 0.5960219502449036, "step": 706 }, { "batch_size": 4, "epoch": 0.2824, "step": 706, "tokens_per_device": 5080 }, { "epoch": 0.2824, "loss_ce": 0.06060957908630371, "loss_lvr": 0.6798457503318787, "loss_mode_switch": 0.0, "loss_total": 0.12859416007995605, "step": 706 }, { "batch_size": 4, "epoch": 0.2824, "step": 706, "tokens_per_device": 4224 }, { "epoch": 0.2824, "loss_ce": 0.4557175934314728, "loss_lvr": 0.9327448606491089, "loss_mode_switch": 0.0, "loss_total": 0.5489920973777771, "step": 706 }, { "batch_size": 4, "epoch": 0.2824, "step": 706, "tokens_per_device": 4392 }, { "epoch": 0.2824, "loss_ce": 0.14127695560455322, "loss_lvr": 0.919265866279602, "loss_mode_switch": 0.0, "loss_total": 0.23320354521274567, "step": 706 }, { "batch_size": 4, "epoch": 0.2824, "step": 706, "tokens_per_device": 4668 }, { "epoch": 0.2824, "loss_ce": 0.5131105780601501, "loss_lvr": 0.7256106734275818, "loss_mode_switch": 0.0, "loss_total": 0.5856716632843018, "step": 706 }, { "batch_size": 4, "epoch": 0.2824, "step": 706, "tokens_per_device": 2768 }, { "epoch": 0.2824, "loss_ce": 0.12717531621456146, "loss_lvr": 0.8274592161178589, "loss_mode_switch": 0.0, "loss_total": 0.2099212408065796, "step": 706 }, { "batch_size": 4, "epoch": 0.2824, "step": 706, "tokens_per_device": 1300 }, { "epoch": 0.2824, "loss_ce": 0.5802924036979675, "loss_lvr": 0.8315714001655579, "loss_mode_switch": 0.0, "loss_total": 0.6634495258331299, "step": 706 }, { "epoch": 0.2828, "grad_norm": 1.4114030599594116, "learning_rate": 8.415646218534713e-06, "loss": 0.3207, "step": 707 }, { "batch_size": 1, "epoch": 0.2828, "step": 707, "tokens_per_device": 5610 }, { "epoch": 0.2828, "loss_ce": 0.013121554628014565, "loss_lvr": 0.40455740690231323, "loss_mode_switch": 0.0, "loss_total": 0.05357729643583298, "step": 707 }, { "batch_size": 1, "epoch": 0.2828, "step": 707, "tokens_per_device": 5177 }, { "epoch": 0.2828, "loss_ce": 0.09825457632541656, "loss_lvr": 0.43427857756614685, "loss_mode_switch": 0.0, "loss_total": 0.141682431101799, "step": 707 }, { "batch_size": 4, "epoch": 0.2828, "step": 707, "tokens_per_device": 3824 }, { "epoch": 0.2828, "loss_ce": 0.3058513402938843, "loss_lvr": 1.1587785482406616, "loss_mode_switch": 0.0, "loss_total": 0.4217292070388794, "step": 707 }, { "batch_size": 4, "epoch": 0.2828, "step": 707, "tokens_per_device": 4356 }, { "epoch": 0.2828, "loss_ce": 0.2794342637062073, "loss_lvr": 2.2607975006103516, "loss_mode_switch": 0.0, "loss_total": 0.5055140256881714, "step": 707 }, { "batch_size": 1, "epoch": 0.2828, "step": 707, "tokens_per_device": 4882 }, { "epoch": 0.2828, "loss_ce": 0.009303884580731392, "loss_lvr": 0.30411577224731445, "loss_mode_switch": 0.0, "loss_total": 0.03971546143293381, "step": 707 }, { "batch_size": 4, "epoch": 0.2828, "step": 707, "tokens_per_device": 1444 }, { "epoch": 0.2828, "loss_ce": 0.3606385886669159, "loss_lvr": 1.095797061920166, "loss_mode_switch": 0.0, "loss_total": 0.470218300819397, "step": 707 }, { "batch_size": 1, "epoch": 0.2828, "step": 707, "tokens_per_device": 5097 }, { "epoch": 0.2828, "loss_ce": 0.02094896510243416, "loss_lvr": 1.2220555543899536, "loss_mode_switch": 0.0, "loss_total": 0.14315451681613922, "step": 707 }, { "batch_size": 4, "epoch": 0.2828, "step": 707, "tokens_per_device": 4808 }, { "epoch": 0.2828, "loss_ce": 0.40324336290359497, "loss_lvr": 1.0611505508422852, "loss_mode_switch": 0.0, "loss_total": 0.5093584060668945, "step": 707 }, { "epoch": 0.2832, "grad_norm": 1.3491920232772827, "learning_rate": 8.410912843767837e-06, "loss": 0.2921, "step": 708 }, { "batch_size": 4, "epoch": 0.2832, "step": 708, "tokens_per_device": 2620 }, { "epoch": 0.2832, "loss_ce": 0.24912427365779877, "loss_lvr": 1.066800832748413, "loss_mode_switch": 0.0, "loss_total": 0.35580435395240784, "step": 708 }, { "batch_size": 1, "epoch": 0.2832, "step": 708, "tokens_per_device": 4995 }, { "epoch": 0.2832, "loss_ce": 0.34414246678352356, "loss_lvr": 0.6138501167297363, "loss_mode_switch": 0.0, "loss_total": 0.4055274724960327, "step": 708 }, { "batch_size": 4, "epoch": 0.2832, "step": 708, "tokens_per_device": 1552 }, { "epoch": 0.2832, "loss_ce": 0.3317008912563324, "loss_lvr": 0.9565393328666687, "loss_mode_switch": 0.0, "loss_total": 0.4273548126220703, "step": 708 }, { "batch_size": 4, "epoch": 0.2832, "step": 708, "tokens_per_device": 4368 }, { "epoch": 0.2832, "loss_ce": 0.21536947786808014, "loss_lvr": 1.1093418598175049, "loss_mode_switch": 0.0, "loss_total": 0.3263036608695984, "step": 708 }, { "batch_size": 1, "epoch": 0.2832, "step": 708, "tokens_per_device": 4869 }, { "epoch": 0.2832, "loss_ce": 0.03584306314587593, "loss_lvr": 0.19030973315238953, "loss_mode_switch": 0.0, "loss_total": 0.05487403646111488, "step": 708 }, { "batch_size": 4, "epoch": 0.2832, "step": 708, "tokens_per_device": 4228 }, { "epoch": 0.2832, "loss_ce": 0.00448932871222496, "loss_lvr": 0.8534016609191895, "loss_mode_switch": 0.0, "loss_total": 0.08982948958873749, "step": 708 }, { "batch_size": 4, "epoch": 0.2832, "step": 708, "tokens_per_device": 4244 }, { "epoch": 0.2832, "loss_ce": 0.7994521260261536, "loss_lvr": 0.9314924478530884, "loss_mode_switch": 0.0, "loss_total": 0.8926013708114624, "step": 708 }, { "batch_size": 4, "epoch": 0.2832, "step": 708, "tokens_per_device": 1248 }, { "epoch": 0.2832, "loss_ce": 0.3033379018306732, "loss_lvr": 1.106370210647583, "loss_mode_switch": 0.0, "loss_total": 0.41397494077682495, "step": 708 }, { "epoch": 0.2836, "grad_norm": 1.5612595081329346, "learning_rate": 8.406173744378887e-06, "loss": 0.293, "step": 709 }, { "batch_size": 4, "epoch": 0.2836, "step": 709, "tokens_per_device": 12528 }, { "epoch": 0.2836, "loss_ce": 0.08105732500553131, "loss_lvr": 0.8240030407905579, "loss_mode_switch": 0.0, "loss_total": 0.16345763206481934, "step": 709 }, { "batch_size": 4, "epoch": 0.2836, "step": 709, "tokens_per_device": 4284 }, { "epoch": 0.2836, "loss_ce": 0.5264427065849304, "loss_lvr": 0.7689527869224548, "loss_mode_switch": 0.0, "loss_total": 0.6033380031585693, "step": 709 }, { "batch_size": 4, "epoch": 0.2836, "step": 709, "tokens_per_device": 3772 }, { "epoch": 0.2836, "loss_ce": 0.4089395999908447, "loss_lvr": 1.0152969360351562, "loss_mode_switch": 0.0, "loss_total": 0.5104693174362183, "step": 709 }, { "batch_size": 1, "epoch": 0.2836, "step": 709, "tokens_per_device": 5123 }, { "epoch": 0.2836, "loss_ce": 0.11495165526866913, "loss_lvr": 0.7412746548652649, "loss_mode_switch": 0.0, "loss_total": 0.18907912075519562, "step": 709 }, { "batch_size": 4, "epoch": 0.2836, "step": 709, "tokens_per_device": 1352 }, { "epoch": 0.2836, "loss_ce": 0.6156496405601501, "loss_lvr": 1.0688800811767578, "loss_mode_switch": 0.0, "loss_total": 0.722537636756897, "step": 709 }, { "batch_size": 4, "epoch": 0.2836, "step": 709, "tokens_per_device": 10412 }, { "epoch": 0.2836, "loss_ce": 0.4254544675350189, "loss_lvr": 0.8073469996452332, "loss_mode_switch": 0.0, "loss_total": 0.5061891674995422, "step": 709 }, { "batch_size": 1, "epoch": 0.2836, "step": 709, "tokens_per_device": 5063 }, { "epoch": 0.2836, "loss_ce": 0.0037124829832464457, "loss_lvr": 0.45621514320373535, "loss_mode_switch": 0.0, "loss_total": 0.04933399707078934, "step": 709 }, { "batch_size": 4, "epoch": 0.2836, "step": 709, "tokens_per_device": 6800 }, { "epoch": 0.2836, "loss_ce": 0.013242131099104881, "loss_lvr": 0.6142172813415527, "loss_mode_switch": 0.0, "loss_total": 0.07466386258602142, "step": 709 }, { "epoch": 0.284, "grad_norm": 1.4001367092132568, "learning_rate": 8.401428928321607e-06, "loss": 0.322, "step": 710 }, { "batch_size": 4, "epoch": 0.284, "step": 710, "tokens_per_device": 3844 }, { "epoch": 0.284, "loss_ce": 1.0741853713989258, "loss_lvr": 0.8603845834732056, "loss_mode_switch": 0.0, "loss_total": 1.1602238416671753, "step": 710 }, { "batch_size": 4, "epoch": 0.284, "step": 710, "tokens_per_device": 5988 }, { "epoch": 0.284, "loss_ce": 0.4334684908390045, "loss_lvr": 0.7154895067214966, "loss_mode_switch": 0.0, "loss_total": 0.5050174593925476, "step": 710 }, { "batch_size": 4, "epoch": 0.284, "step": 710, "tokens_per_device": 3760 }, { "epoch": 0.284, "loss_ce": 0.39171043038368225, "loss_lvr": 0.9647476077079773, "loss_mode_switch": 0.0, "loss_total": 0.48818519711494446, "step": 710 }, { "batch_size": 4, "epoch": 0.284, "step": 710, "tokens_per_device": 2712 }, { "epoch": 0.284, "loss_ce": 0.24552062153816223, "loss_lvr": 0.7531018853187561, "loss_mode_switch": 0.0, "loss_total": 0.3208308219909668, "step": 710 }, { "batch_size": 4, "epoch": 0.284, "step": 710, "tokens_per_device": 5808 }, { "epoch": 0.284, "loss_ce": 0.024653416126966476, "loss_lvr": 0.8559341430664062, "loss_mode_switch": 0.0, "loss_total": 0.11024683713912964, "step": 710 }, { "batch_size": 4, "epoch": 0.284, "step": 710, "tokens_per_device": 4620 }, { "epoch": 0.284, "loss_ce": 0.3694421052932739, "loss_lvr": 0.8033230304718018, "loss_mode_switch": 0.0, "loss_total": 0.4497744143009186, "step": 710 }, { "batch_size": 4, "epoch": 0.284, "step": 710, "tokens_per_device": 4500 }, { "epoch": 0.284, "loss_ce": 0.3509145975112915, "loss_lvr": 1.1486927270889282, "loss_mode_switch": 0.0, "loss_total": 0.46578386425971985, "step": 710 }, { "batch_size": 1, "epoch": 0.284, "step": 710, "tokens_per_device": 5421 }, { "epoch": 0.284, "loss_ce": 0.005574606359004974, "loss_lvr": 0.683834969997406, "loss_mode_switch": 0.0, "loss_total": 0.07395810633897781, "step": 710 }, { "epoch": 0.2844, "grad_norm": 1.3622556924819946, "learning_rate": 8.396678403559348e-06, "loss": 0.3305, "step": 711 }, { "batch_size": 1, "epoch": 0.2844, "step": 711, "tokens_per_device": 4996 }, { "epoch": 0.2844, "loss_ce": 0.013624184764921665, "loss_lvr": 0.34770265221595764, "loss_mode_switch": 0.0, "loss_total": 0.048394449055194855, "step": 711 }, { "batch_size": 4, "epoch": 0.2844, "step": 711, "tokens_per_device": 4288 }, { "epoch": 0.2844, "loss_ce": 0.256680428981781, "loss_lvr": 0.9313843250274658, "loss_mode_switch": 0.0, "loss_total": 0.3498188555240631, "step": 711 }, { "batch_size": 4, "epoch": 0.2844, "step": 711, "tokens_per_device": 5752 }, { "epoch": 0.2844, "loss_ce": 0.42259863018989563, "loss_lvr": 0.7198679447174072, "loss_mode_switch": 0.0, "loss_total": 0.49458542466163635, "step": 711 }, { "batch_size": 4, "epoch": 0.2844, "step": 711, "tokens_per_device": 6404 }, { "epoch": 0.2844, "loss_ce": 0.7295475006103516, "loss_lvr": 0.6445726752281189, "loss_mode_switch": 0.0, "loss_total": 0.7940047979354858, "step": 711 }, { "batch_size": 4, "epoch": 0.2844, "step": 711, "tokens_per_device": 7208 }, { "epoch": 0.2844, "loss_ce": 0.332665354013443, "loss_lvr": 0.7749749422073364, "loss_mode_switch": 0.0, "loss_total": 0.41016286611557007, "step": 711 }, { "batch_size": 1, "epoch": 0.2844, "step": 711, "tokens_per_device": 5139 }, { "epoch": 0.2844, "loss_ce": 0.1247233897447586, "loss_lvr": 0.40715816617012024, "loss_mode_switch": 0.0, "loss_total": 0.1654392033815384, "step": 711 }, { "batch_size": 1, "epoch": 0.2844, "step": 711, "tokens_per_device": 4171 }, { "epoch": 0.2844, "loss_ce": 0.0025964928790926933, "loss_lvr": 0.41783303022384644, "loss_mode_switch": 0.0, "loss_total": 0.04437979683279991, "step": 711 }, { "batch_size": 4, "epoch": 0.2844, "step": 711, "tokens_per_device": 5468 }, { "epoch": 0.2844, "loss_ce": 0.3867351710796356, "loss_lvr": 0.8310027122497559, "loss_mode_switch": 0.0, "loss_total": 0.46983546018600464, "step": 711 }, { "epoch": 0.2848, "grad_norm": 1.218788981437683, "learning_rate": 8.391922178065037e-06, "loss": 0.2888, "step": 712 }, { "batch_size": 4, "epoch": 0.2848, "step": 712, "tokens_per_device": 3820 }, { "epoch": 0.2848, "loss_ce": 0.3087373971939087, "loss_lvr": 1.1621015071868896, "loss_mode_switch": 0.0, "loss_total": 0.4249475598335266, "step": 712 }, { "batch_size": 4, "epoch": 0.2848, "step": 712, "tokens_per_device": 4336 }, { "epoch": 0.2848, "loss_ce": 0.19242516160011292, "loss_lvr": 0.7362156510353088, "loss_mode_switch": 0.0, "loss_total": 0.2660467326641083, "step": 712 }, { "batch_size": 1, "epoch": 0.2848, "step": 712, "tokens_per_device": 5008 }, { "epoch": 0.2848, "loss_ce": 0.03570547327399254, "loss_lvr": 0.3548262119293213, "loss_mode_switch": 0.0, "loss_total": 0.07118809223175049, "step": 712 }, { "batch_size": 4, "epoch": 0.2848, "step": 712, "tokens_per_device": 2960 }, { "epoch": 0.2848, "loss_ce": 0.10268009454011917, "loss_lvr": 1.0495803356170654, "loss_mode_switch": 0.0, "loss_total": 0.20763812959194183, "step": 712 }, { "batch_size": 1, "epoch": 0.2848, "step": 712, "tokens_per_device": 4120 }, { "epoch": 0.2848, "loss_ce": 0.3632829487323761, "loss_lvr": 0.23354151844978333, "loss_mode_switch": 0.0, "loss_total": 0.3866370916366577, "step": 712 }, { "batch_size": 4, "epoch": 0.2848, "step": 712, "tokens_per_device": 3928 }, { "epoch": 0.2848, "loss_ce": 0.14767473936080933, "loss_lvr": 1.032566785812378, "loss_mode_switch": 0.0, "loss_total": 0.25093141198158264, "step": 712 }, { "batch_size": 1, "epoch": 0.2848, "step": 712, "tokens_per_device": 5374 }, { "epoch": 0.2848, "loss_ce": 0.34431853890419006, "loss_lvr": 0.4186120629310608, "loss_mode_switch": 0.0, "loss_total": 0.38617974519729614, "step": 712 }, { "batch_size": 4, "epoch": 0.2848, "step": 712, "tokens_per_device": 4200 }, { "epoch": 0.2848, "loss_ce": 0.21903441846370697, "loss_lvr": 1.1924059391021729, "loss_mode_switch": 0.0, "loss_total": 0.3382750153541565, "step": 712 }, { "epoch": 0.2852, "grad_norm": 1.445595145225525, "learning_rate": 8.387160259821165e-06, "loss": 0.3384, "step": 713 }, { "batch_size": 1, "epoch": 0.2852, "step": 713, "tokens_per_device": 5043 }, { "epoch": 0.2852, "loss_ce": 0.212090864777565, "loss_lvr": 0.6552237272262573, "loss_mode_switch": 0.0, "loss_total": 0.27761322259902954, "step": 713 }, { "batch_size": 4, "epoch": 0.2852, "step": 713, "tokens_per_device": 2696 }, { "epoch": 0.2852, "loss_ce": 0.24589312076568604, "loss_lvr": 0.7674376964569092, "loss_mode_switch": 0.0, "loss_total": 0.3226369023323059, "step": 713 }, { "batch_size": 4, "epoch": 0.2852, "step": 713, "tokens_per_device": 5300 }, { "epoch": 0.2852, "loss_ce": 0.1468464583158493, "loss_lvr": 0.8463582396507263, "loss_mode_switch": 0.0, "loss_total": 0.23148228228092194, "step": 713 }, { "batch_size": 4, "epoch": 0.2852, "step": 713, "tokens_per_device": 7788 }, { "epoch": 0.2852, "loss_ce": 0.37101784348487854, "loss_lvr": 0.8142531514167786, "loss_mode_switch": 0.0, "loss_total": 0.4524431526660919, "step": 713 }, { "batch_size": 4, "epoch": 0.2852, "step": 713, "tokens_per_device": 3940 }, { "epoch": 0.2852, "loss_ce": 0.14992843568325043, "loss_lvr": 1.0801513195037842, "loss_mode_switch": 0.0, "loss_total": 0.2579435706138611, "step": 713 }, { "batch_size": 4, "epoch": 0.2852, "step": 713, "tokens_per_device": 3444 }, { "epoch": 0.2852, "loss_ce": 0.6785870790481567, "loss_lvr": 0.9311038255691528, "loss_mode_switch": 0.0, "loss_total": 0.771697461605072, "step": 713 }, { "batch_size": 1, "epoch": 0.2852, "step": 713, "tokens_per_device": 5318 }, { "epoch": 0.2852, "loss_ce": 0.2729298174381256, "loss_lvr": 0.6292524337768555, "loss_mode_switch": 0.0, "loss_total": 0.33585506677627563, "step": 713 }, { "batch_size": 4, "epoch": 0.2852, "step": 713, "tokens_per_device": 3808 }, { "epoch": 0.2852, "loss_ce": 0.2833327054977417, "loss_lvr": 1.0370194911956787, "loss_mode_switch": 0.0, "loss_total": 0.38703465461730957, "step": 713 }, { "epoch": 0.2856, "grad_norm": 1.6089072227478027, "learning_rate": 8.382392656819784e-06, "loss": 0.3289, "step": 714 }, { "batch_size": 1, "epoch": 0.2856, "step": 714, "tokens_per_device": 5188 }, { "epoch": 0.2856, "loss_ce": 0.004936231300234795, "loss_lvr": 0.3714383840560913, "loss_mode_switch": 0.0, "loss_total": 0.04208006709814072, "step": 714 }, { "batch_size": 1, "epoch": 0.2856, "step": 714, "tokens_per_device": 4998 }, { "epoch": 0.2856, "loss_ce": 0.36069536209106445, "loss_lvr": 0.2917728126049042, "loss_mode_switch": 0.0, "loss_total": 0.38987264037132263, "step": 714 }, { "batch_size": 4, "epoch": 0.2856, "step": 714, "tokens_per_device": 3856 }, { "epoch": 0.2856, "loss_ce": 0.009115303866565228, "loss_lvr": 0.5728938579559326, "loss_mode_switch": 0.0, "loss_total": 0.06640469282865524, "step": 714 }, { "batch_size": 4, "epoch": 0.2856, "step": 714, "tokens_per_device": 4988 }, { "epoch": 0.2856, "loss_ce": 0.12328353524208069, "loss_lvr": 0.84954833984375, "loss_mode_switch": 0.0, "loss_total": 0.2082383632659912, "step": 714 }, { "batch_size": 1, "epoch": 0.2856, "step": 714, "tokens_per_device": 5158 }, { "epoch": 0.2856, "loss_ce": 0.07980595529079437, "loss_lvr": 0.33854883909225464, "loss_mode_switch": 0.0, "loss_total": 0.11366084218025208, "step": 714 }, { "batch_size": 1, "epoch": 0.2856, "step": 714, "tokens_per_device": 5729 }, { "epoch": 0.2856, "loss_ce": 0.028445787727832794, "loss_lvr": 0.40877988934516907, "loss_mode_switch": 0.0, "loss_total": 0.06932377815246582, "step": 714 }, { "batch_size": 4, "epoch": 0.2856, "step": 714, "tokens_per_device": 4516 }, { "epoch": 0.2856, "loss_ce": 0.5483704209327698, "loss_lvr": 0.889299213886261, "loss_mode_switch": 0.0, "loss_total": 0.6373003721237183, "step": 714 }, { "batch_size": 4, "epoch": 0.2856, "step": 714, "tokens_per_device": 2100 }, { "epoch": 0.2856, "loss_ce": 0.5292537808418274, "loss_lvr": 1.102702260017395, "loss_mode_switch": 0.0, "loss_total": 0.639523983001709, "step": 714 }, { "epoch": 0.286, "grad_norm": 1.893189549446106, "learning_rate": 8.377619377062483e-06, "loss": 0.3599, "step": 715 }, { "batch_size": 4, "epoch": 0.286, "step": 715, "tokens_per_device": 1184 }, { "epoch": 0.286, "loss_ce": 0.054460156708955765, "loss_lvr": 1.2468935251235962, "loss_mode_switch": 0.0, "loss_total": 0.17914950847625732, "step": 715 }, { "batch_size": 4, "epoch": 0.286, "step": 715, "tokens_per_device": 5560 }, { "epoch": 0.286, "loss_ce": 0.12706167995929718, "loss_lvr": 0.7684781551361084, "loss_mode_switch": 0.0, "loss_total": 0.2039094865322113, "step": 715 }, { "batch_size": 4, "epoch": 0.286, "step": 715, "tokens_per_device": 3880 }, { "epoch": 0.286, "loss_ce": 0.3114607632160187, "loss_lvr": 0.9329041242599487, "loss_mode_switch": 0.0, "loss_total": 0.404751181602478, "step": 715 }, { "batch_size": 4, "epoch": 0.286, "step": 715, "tokens_per_device": 5520 }, { "epoch": 0.286, "loss_ce": 0.1348937451839447, "loss_lvr": 1.1172412633895874, "loss_mode_switch": 0.0, "loss_total": 0.2466178834438324, "step": 715 }, { "batch_size": 1, "epoch": 0.286, "step": 715, "tokens_per_device": 5579 }, { "epoch": 0.286, "loss_ce": 0.16693897545337677, "loss_lvr": 0.40612316131591797, "loss_mode_switch": 0.0, "loss_total": 0.20755130052566528, "step": 715 }, { "batch_size": 4, "epoch": 0.286, "step": 715, "tokens_per_device": 5496 }, { "epoch": 0.286, "loss_ce": 0.022380594164133072, "loss_lvr": 0.6160447597503662, "loss_mode_switch": 0.0, "loss_total": 0.08398507535457611, "step": 715 }, { "batch_size": 4, "epoch": 0.286, "step": 715, "tokens_per_device": 12100 }, { "epoch": 0.286, "loss_ce": 0.2947555482387543, "loss_lvr": 0.3336319327354431, "loss_mode_switch": 0.0, "loss_total": 0.3281187415122986, "step": 715 }, { "batch_size": 4, "epoch": 0.286, "step": 715, "tokens_per_device": 1428 }, { "epoch": 0.286, "loss_ce": 0.325063019990921, "loss_lvr": 1.3376375436782837, "loss_mode_switch": 0.0, "loss_total": 0.45882678031921387, "step": 715 }, { "epoch": 0.2864, "grad_norm": 1.4917742013931274, "learning_rate": 8.372840428560379e-06, "loss": 0.2762, "step": 716 }, { "batch_size": 4, "epoch": 0.2864, "step": 716, "tokens_per_device": 2744 }, { "epoch": 0.2864, "loss_ce": 0.17266318202018738, "loss_lvr": 1.0753432512283325, "loss_mode_switch": 0.0, "loss_total": 0.28019750118255615, "step": 716 }, { "batch_size": 4, "epoch": 0.2864, "step": 716, "tokens_per_device": 3776 }, { "epoch": 0.2864, "loss_ce": 0.33075833320617676, "loss_lvr": 0.791312038898468, "loss_mode_switch": 0.0, "loss_total": 0.4098895490169525, "step": 716 }, { "batch_size": 1, "epoch": 0.2864, "step": 716, "tokens_per_device": 5178 }, { "epoch": 0.2864, "loss_ce": 0.03493597358465195, "loss_lvr": 0.6176907420158386, "loss_mode_switch": 0.0, "loss_total": 0.09670504927635193, "step": 716 }, { "batch_size": 4, "epoch": 0.2864, "step": 716, "tokens_per_device": 6068 }, { "epoch": 0.2864, "loss_ce": 0.3303564786911011, "loss_lvr": 0.6653909087181091, "loss_mode_switch": 0.0, "loss_total": 0.3968955874443054, "step": 716 }, { "batch_size": 4, "epoch": 0.2864, "step": 716, "tokens_per_device": 1380 }, { "epoch": 0.2864, "loss_ce": 0.21522028744220734, "loss_lvr": 1.4797130823135376, "loss_mode_switch": 0.0, "loss_total": 0.3631916046142578, "step": 716 }, { "batch_size": 4, "epoch": 0.2864, "step": 716, "tokens_per_device": 4472 }, { "epoch": 0.2864, "loss_ce": 0.2023831307888031, "loss_lvr": 1.0451843738555908, "loss_mode_switch": 0.0, "loss_total": 0.30690157413482666, "step": 716 }, { "batch_size": 1, "epoch": 0.2864, "step": 716, "tokens_per_device": 5184 }, { "epoch": 0.2864, "loss_ce": 0.0016865275101736188, "loss_lvr": 0.3930436968803406, "loss_mode_switch": 0.0, "loss_total": 0.04099090024828911, "step": 716 }, { "batch_size": 4, "epoch": 0.2864, "step": 716, "tokens_per_device": 1216 }, { "epoch": 0.2864, "loss_ce": 0.2742939889431, "loss_lvr": 1.0930328369140625, "loss_mode_switch": 0.0, "loss_total": 0.3835972845554352, "step": 716 }, { "epoch": 0.2868, "grad_norm": 1.2798235416412354, "learning_rate": 8.368055819334101e-06, "loss": 0.2905, "step": 717 }, { "batch_size": 4, "epoch": 0.2868, "step": 717, "tokens_per_device": 14484 }, { "epoch": 0.2868, "loss_ce": 0.1978476345539093, "loss_lvr": 0.7610499858856201, "loss_mode_switch": 0.0, "loss_total": 0.2739526331424713, "step": 717 }, { "batch_size": 4, "epoch": 0.2868, "step": 717, "tokens_per_device": 14796 }, { "epoch": 0.2868, "loss_ce": 0.10045772045850754, "loss_lvr": 0.9851641058921814, "loss_mode_switch": 0.0, "loss_total": 0.1989741325378418, "step": 717 }, { "batch_size": 4, "epoch": 0.2868, "step": 717, "tokens_per_device": 1328 }, { "epoch": 0.2868, "loss_ce": 0.2917894423007965, "loss_lvr": 1.1443629264831543, "loss_mode_switch": 0.0, "loss_total": 0.4062257409095764, "step": 717 }, { "batch_size": 1, "epoch": 0.2868, "step": 717, "tokens_per_device": 4950 }, { "epoch": 0.2868, "loss_ce": 0.0430724211037159, "loss_lvr": 0.49406740069389343, "loss_mode_switch": 0.0, "loss_total": 0.0924791619181633, "step": 717 }, { "batch_size": 4, "epoch": 0.2868, "step": 717, "tokens_per_device": 4428 }, { "epoch": 0.2868, "loss_ce": 0.04673158377408981, "loss_lvr": 0.8431958556175232, "loss_mode_switch": 0.0, "loss_total": 0.131051167845726, "step": 717 }, { "batch_size": 4, "epoch": 0.2868, "step": 717, "tokens_per_device": 5088 }, { "epoch": 0.2868, "loss_ce": 0.023937242105603218, "loss_lvr": 0.7277946472167969, "loss_mode_switch": 0.0, "loss_total": 0.09671670943498611, "step": 717 }, { "batch_size": 4, "epoch": 0.2868, "step": 717, "tokens_per_device": 3004 }, { "epoch": 0.2868, "loss_ce": 0.40310126543045044, "loss_lvr": 0.9192944765090942, "loss_mode_switch": 0.0, "loss_total": 0.4950307011604309, "step": 717 }, { "batch_size": 4, "epoch": 0.2868, "step": 717, "tokens_per_device": 5780 }, { "epoch": 0.2868, "loss_ce": 0.004235235508531332, "loss_lvr": 0.7644330859184265, "loss_mode_switch": 0.0, "loss_total": 0.0806785449385643, "step": 717 }, { "epoch": 0.2872, "grad_norm": 1.2433005571365356, "learning_rate": 8.363265557413786e-06, "loss": 0.2879, "step": 718 }, { "batch_size": 4, "epoch": 0.2872, "step": 718, "tokens_per_device": 4464 }, { "epoch": 0.2872, "loss_ce": 0.38967955112457275, "loss_lvr": 0.5682897567749023, "loss_mode_switch": 0.0, "loss_total": 0.446508526802063, "step": 718 }, { "batch_size": 4, "epoch": 0.2872, "step": 718, "tokens_per_device": 5520 }, { "epoch": 0.2872, "loss_ce": 0.17296001315116882, "loss_lvr": 0.9946839213371277, "loss_mode_switch": 0.0, "loss_total": 0.27242839336395264, "step": 718 }, { "batch_size": 4, "epoch": 0.2872, "step": 718, "tokens_per_device": 3760 }, { "epoch": 0.2872, "loss_ce": 0.08203279227018356, "loss_lvr": 0.9956129789352417, "loss_mode_switch": 0.0, "loss_total": 0.1815940886735916, "step": 718 }, { "batch_size": 1, "epoch": 0.2872, "step": 718, "tokens_per_device": 4382 }, { "epoch": 0.2872, "loss_ce": 0.002717463066801429, "loss_lvr": 0.5181515216827393, "loss_mode_switch": 0.0, "loss_total": 0.05453261360526085, "step": 718 }, { "batch_size": 4, "epoch": 0.2872, "step": 718, "tokens_per_device": 3952 }, { "epoch": 0.2872, "loss_ce": 0.09038341790437698, "loss_lvr": 1.034378170967102, "loss_mode_switch": 0.0, "loss_total": 0.1938212364912033, "step": 718 }, { "batch_size": 4, "epoch": 0.2872, "step": 718, "tokens_per_device": 8112 }, { "epoch": 0.2872, "loss_ce": 0.08520478010177612, "loss_lvr": 0.8855626583099365, "loss_mode_switch": 0.0, "loss_total": 0.1737610399723053, "step": 718 }, { "batch_size": 1, "epoch": 0.2872, "step": 718, "tokens_per_device": 4758 }, { "epoch": 0.2872, "loss_ce": 0.07071080803871155, "loss_lvr": 0.3858339786529541, "loss_mode_switch": 0.0, "loss_total": 0.10929420590400696, "step": 718 }, { "batch_size": 1, "epoch": 0.2872, "step": 718, "tokens_per_device": 4754 }, { "epoch": 0.2872, "loss_ce": 0.043934646993875504, "loss_lvr": 0.3857412040233612, "loss_mode_switch": 0.0, "loss_total": 0.08250877261161804, "step": 718 }, { "epoch": 0.2876, "grad_norm": 1.405380368232727, "learning_rate": 8.358469650839049e-06, "loss": 0.3142, "step": 719 }, { "batch_size": 4, "epoch": 0.2876, "step": 719, "tokens_per_device": 8748 }, { "epoch": 0.2876, "loss_ce": 0.01036821585148573, "loss_lvr": 1.0151400566101074, "loss_mode_switch": 0.0, "loss_total": 0.11188221722841263, "step": 719 }, { "batch_size": 4, "epoch": 0.2876, "step": 719, "tokens_per_device": 1512 }, { "epoch": 0.2876, "loss_ce": 0.5637667775154114, "loss_lvr": 0.9808164238929749, "loss_mode_switch": 0.0, "loss_total": 0.6618484258651733, "step": 719 }, { "batch_size": 1, "epoch": 0.2876, "step": 719, "tokens_per_device": 4891 }, { "epoch": 0.2876, "loss_ce": 0.14811144769191742, "loss_lvr": 0.529327929019928, "loss_mode_switch": 0.0, "loss_total": 0.2010442465543747, "step": 719 }, { "batch_size": 4, "epoch": 0.2876, "step": 719, "tokens_per_device": 5408 }, { "epoch": 0.2876, "loss_ce": 0.4369170069694519, "loss_lvr": 0.6435332298278809, "loss_mode_switch": 0.0, "loss_total": 0.5012703537940979, "step": 719 }, { "batch_size": 4, "epoch": 0.2876, "step": 719, "tokens_per_device": 4152 }, { "epoch": 0.2876, "loss_ce": 0.4293692111968994, "loss_lvr": 0.9227683544158936, "loss_mode_switch": 0.0, "loss_total": 0.5216460227966309, "step": 719 }, { "batch_size": 4, "epoch": 0.2876, "step": 719, "tokens_per_device": 4868 }, { "epoch": 0.2876, "loss_ce": 0.28810328245162964, "loss_lvr": 0.923134982585907, "loss_mode_switch": 0.0, "loss_total": 0.38041678071022034, "step": 719 }, { "batch_size": 4, "epoch": 0.2876, "step": 719, "tokens_per_device": 4392 }, { "epoch": 0.2876, "loss_ce": 0.13395093381404877, "loss_lvr": 1.0170789957046509, "loss_mode_switch": 0.0, "loss_total": 0.23565882444381714, "step": 719 }, { "batch_size": 4, "epoch": 0.2876, "step": 719, "tokens_per_device": 4596 }, { "epoch": 0.2876, "loss_ce": 0.10175583511590958, "loss_lvr": 0.7808391451835632, "loss_mode_switch": 0.0, "loss_total": 0.17983976006507874, "step": 719 }, { "epoch": 0.288, "grad_norm": 1.418599247932434, "learning_rate": 8.353668107658984e-06, "loss": 0.3393, "step": 720 }, { "batch_size": 4, "epoch": 0.288, "step": 720, "tokens_per_device": 5568 }, { "epoch": 0.288, "loss_ce": 0.03912201523780823, "loss_lvr": 0.7632306814193726, "loss_mode_switch": 0.0, "loss_total": 0.1154450848698616, "step": 720 }, { "batch_size": 4, "epoch": 0.288, "step": 720, "tokens_per_device": 3664 }, { "epoch": 0.288, "loss_ce": 0.06344932317733765, "loss_lvr": 0.9016358852386475, "loss_mode_switch": 0.0, "loss_total": 0.1536129117012024, "step": 720 }, { "batch_size": 4, "epoch": 0.288, "step": 720, "tokens_per_device": 2484 }, { "epoch": 0.288, "loss_ce": 0.5028056502342224, "loss_lvr": 0.8677518963813782, "loss_mode_switch": 0.0, "loss_total": 0.5895808339118958, "step": 720 }, { "batch_size": 4, "epoch": 0.288, "step": 720, "tokens_per_device": 2732 }, { "epoch": 0.288, "loss_ce": 0.4728677272796631, "loss_lvr": 0.918267011642456, "loss_mode_switch": 0.0, "loss_total": 0.5646944046020508, "step": 720 }, { "batch_size": 1, "epoch": 0.288, "step": 720, "tokens_per_device": 4876 }, { "epoch": 0.288, "loss_ce": 0.006792424246668816, "loss_lvr": 0.5529076457023621, "loss_mode_switch": 0.0, "loss_total": 0.06208319216966629, "step": 720 }, { "batch_size": 1, "epoch": 0.288, "step": 720, "tokens_per_device": 5086 }, { "epoch": 0.288, "loss_ce": 0.06724712997674942, "loss_lvr": 0.8850972652435303, "loss_mode_switch": 0.0, "loss_total": 0.1557568609714508, "step": 720 }, { "batch_size": 1, "epoch": 0.288, "step": 720, "tokens_per_device": 5013 }, { "epoch": 0.288, "loss_ce": 0.001852752990089357, "loss_lvr": 0.5986999273300171, "loss_mode_switch": 0.0, "loss_total": 0.06172274798154831, "step": 720 }, { "batch_size": 1, "epoch": 0.288, "step": 720, "tokens_per_device": 4859 }, { "epoch": 0.288, "loss_ce": 0.0070765260607004166, "loss_lvr": 0.2418605238199234, "loss_mode_switch": 0.0, "loss_total": 0.03126257658004761, "step": 720 }, { "epoch": 0.2884, "grad_norm": 1.4742777347564697, "learning_rate": 8.348860935932143e-06, "loss": 0.2942, "step": 721 }, { "batch_size": 4, "epoch": 0.2884, "step": 721, "tokens_per_device": 3808 }, { "epoch": 0.2884, "loss_ce": 0.16216379404067993, "loss_lvr": 1.0861684083938599, "loss_mode_switch": 0.0, "loss_total": 0.27078062295913696, "step": 721 }, { "batch_size": 4, "epoch": 0.2884, "step": 721, "tokens_per_device": 1252 }, { "epoch": 0.2884, "loss_ce": 0.43310508131980896, "loss_lvr": 1.1053669452667236, "loss_mode_switch": 0.0, "loss_total": 0.5436418056488037, "step": 721 }, { "batch_size": 1, "epoch": 0.2884, "step": 721, "tokens_per_device": 5168 }, { "epoch": 0.2884, "loss_ce": 0.529387354850769, "loss_lvr": 0.17953400313854218, "loss_mode_switch": 0.0, "loss_total": 0.5473407506942749, "step": 721 }, { "batch_size": 4, "epoch": 0.2884, "step": 721, "tokens_per_device": 4588 }, { "epoch": 0.2884, "loss_ce": 0.03424987569451332, "loss_lvr": 0.819873571395874, "loss_mode_switch": 0.0, "loss_total": 0.11623723804950714, "step": 721 }, { "batch_size": 1, "epoch": 0.2884, "step": 721, "tokens_per_device": 5189 }, { "epoch": 0.2884, "loss_ce": 0.1107206791639328, "loss_lvr": 0.2915591299533844, "loss_mode_switch": 0.0, "loss_total": 0.139876589179039, "step": 721 }, { "batch_size": 4, "epoch": 0.2884, "step": 721, "tokens_per_device": 4708 }, { "epoch": 0.2884, "loss_ce": 0.14162813127040863, "loss_lvr": 0.8464918732643127, "loss_mode_switch": 0.0, "loss_total": 0.22627732157707214, "step": 721 }, { "batch_size": 4, "epoch": 0.2884, "step": 721, "tokens_per_device": 2756 }, { "epoch": 0.2884, "loss_ce": 0.9009724259376526, "loss_lvr": 0.7624476552009583, "loss_mode_switch": 0.0, "loss_total": 0.9772171974182129, "step": 721 }, { "batch_size": 4, "epoch": 0.2884, "step": 721, "tokens_per_device": 4164 }, { "epoch": 0.2884, "loss_ce": 0.10180015861988068, "loss_lvr": 0.8047510981559753, "loss_mode_switch": 0.0, "loss_total": 0.18227526545524597, "step": 721 }, { "epoch": 0.2888, "grad_norm": 1.4535691738128662, "learning_rate": 8.344048143726524e-06, "loss": 0.3057, "step": 722 }, { "batch_size": 4, "epoch": 0.2888, "step": 722, "tokens_per_device": 5744 }, { "epoch": 0.2888, "loss_ce": 0.3483022451400757, "loss_lvr": 1.2231053113937378, "loss_mode_switch": 0.0, "loss_total": 0.4706127643585205, "step": 722 }, { "batch_size": 1, "epoch": 0.2888, "step": 722, "tokens_per_device": 5062 }, { "epoch": 0.2888, "loss_ce": 0.20431707799434662, "loss_lvr": 0.4829573929309845, "loss_mode_switch": 0.0, "loss_total": 0.252612829208374, "step": 722 }, { "batch_size": 4, "epoch": 0.2888, "step": 722, "tokens_per_device": 4508 }, { "epoch": 0.2888, "loss_ce": 0.14942516386508942, "loss_lvr": 0.8876909613609314, "loss_mode_switch": 0.0, "loss_total": 0.23819425702095032, "step": 722 }, { "batch_size": 1, "epoch": 0.2888, "step": 722, "tokens_per_device": 5189 }, { "epoch": 0.2888, "loss_ce": 0.04273561015725136, "loss_lvr": 0.3311535120010376, "loss_mode_switch": 0.0, "loss_total": 0.0758509635925293, "step": 722 }, { "batch_size": 1, "epoch": 0.2888, "step": 722, "tokens_per_device": 5120 }, { "epoch": 0.2888, "loss_ce": 0.005873051006346941, "loss_lvr": 0.5021309852600098, "loss_mode_switch": 0.0, "loss_total": 0.05608614906668663, "step": 722 }, { "batch_size": 4, "epoch": 0.2888, "step": 722, "tokens_per_device": 5848 }, { "epoch": 0.2888, "loss_ce": 0.09039954841136932, "loss_lvr": 0.8672971725463867, "loss_mode_switch": 0.0, "loss_total": 0.17712926864624023, "step": 722 }, { "batch_size": 4, "epoch": 0.2888, "step": 722, "tokens_per_device": 2644 }, { "epoch": 0.2888, "loss_ce": 0.2789301574230194, "loss_lvr": 0.8233997225761414, "loss_mode_switch": 0.0, "loss_total": 0.36127012968063354, "step": 722 }, { "batch_size": 1, "epoch": 0.2888, "step": 722, "tokens_per_device": 6060 }, { "epoch": 0.2888, "loss_ce": 0.00384287117049098, "loss_lvr": 0.3175966739654541, "loss_mode_switch": 0.0, "loss_total": 0.03560253977775574, "step": 722 }, { "epoch": 0.2892, "grad_norm": 2.1388473510742188, "learning_rate": 8.339229739119558e-06, "loss": 0.3362, "step": 723 }, { "batch_size": 4, "epoch": 0.2892, "step": 723, "tokens_per_device": 3924 }, { "epoch": 0.2892, "loss_ce": 0.5727074146270752, "loss_lvr": 1.2319934368133545, "loss_mode_switch": 0.0, "loss_total": 0.6959067583084106, "step": 723 }, { "batch_size": 4, "epoch": 0.2892, "step": 723, "tokens_per_device": 4592 }, { "epoch": 0.2892, "loss_ce": 0.3501497805118561, "loss_lvr": 0.7544204592704773, "loss_mode_switch": 0.0, "loss_total": 0.4255918264389038, "step": 723 }, { "batch_size": 4, "epoch": 0.2892, "step": 723, "tokens_per_device": 12432 }, { "epoch": 0.2892, "loss_ce": 0.1794959455728531, "loss_lvr": 0.968946099281311, "loss_mode_switch": 0.0, "loss_total": 0.27639055252075195, "step": 723 }, { "batch_size": 4, "epoch": 0.2892, "step": 723, "tokens_per_device": 4208 }, { "epoch": 0.2892, "loss_ce": 0.001927221892401576, "loss_lvr": 0.7919761538505554, "loss_mode_switch": 0.0, "loss_total": 0.08112483471632004, "step": 723 }, { "batch_size": 4, "epoch": 0.2892, "step": 723, "tokens_per_device": 3796 }, { "epoch": 0.2892, "loss_ce": 0.49933522939682007, "loss_lvr": 0.8512021899223328, "loss_mode_switch": 0.0, "loss_total": 0.5844554305076599, "step": 723 }, { "batch_size": 4, "epoch": 0.2892, "step": 723, "tokens_per_device": 15388 }, { "epoch": 0.2892, "loss_ce": 0.3838394284248352, "loss_lvr": 0.7785441875457764, "loss_mode_switch": 0.0, "loss_total": 0.4616938531398773, "step": 723 }, { "batch_size": 4, "epoch": 0.2892, "step": 723, "tokens_per_device": 6208 }, { "epoch": 0.2892, "loss_ce": 0.24035611748695374, "loss_lvr": 0.7570178508758545, "loss_mode_switch": 0.0, "loss_total": 0.3160579204559326, "step": 723 }, { "batch_size": 4, "epoch": 0.2892, "step": 723, "tokens_per_device": 6520 }, { "epoch": 0.2892, "loss_ce": 0.5316572189331055, "loss_lvr": 1.0614378452301025, "loss_mode_switch": 0.0, "loss_total": 0.6378009915351868, "step": 723 }, { "epoch": 0.2896, "grad_norm": 1.3445184230804443, "learning_rate": 8.334405730198101e-06, "loss": 0.3157, "step": 724 }, { "batch_size": 1, "epoch": 0.2896, "step": 724, "tokens_per_device": 7296 }, { "epoch": 0.2896, "loss_ce": 0.0019240336259827018, "loss_lvr": 0.5520550608634949, "loss_mode_switch": 0.0, "loss_total": 0.05712953954935074, "step": 724 }, { "batch_size": 4, "epoch": 0.2896, "step": 724, "tokens_per_device": 5240 }, { "epoch": 0.2896, "loss_ce": 0.14991503953933716, "loss_lvr": 0.7298885583877563, "loss_mode_switch": 0.0, "loss_total": 0.22290390729904175, "step": 724 }, { "batch_size": 1, "epoch": 0.2896, "step": 724, "tokens_per_device": 4869 }, { "epoch": 0.2896, "loss_ce": 0.0005925007280893624, "loss_lvr": 0.25147026777267456, "loss_mode_switch": 0.0, "loss_total": 0.025739526376128197, "step": 724 }, { "batch_size": 1, "epoch": 0.2896, "step": 724, "tokens_per_device": 5095 }, { "epoch": 0.2896, "loss_ce": 0.03406941145658493, "loss_lvr": 0.7053683400154114, "loss_mode_switch": 0.0, "loss_total": 0.1046062484383583, "step": 724 }, { "batch_size": 4, "epoch": 0.2896, "step": 724, "tokens_per_device": 4284 }, { "epoch": 0.2896, "loss_ce": 0.3369086980819702, "loss_lvr": 1.1469930410385132, "loss_mode_switch": 0.0, "loss_total": 0.45160800218582153, "step": 724 }, { "batch_size": 1, "epoch": 0.2896, "step": 724, "tokens_per_device": 4912 }, { "epoch": 0.2896, "loss_ce": 0.005872049368917942, "loss_lvr": 0.20743831992149353, "loss_mode_switch": 0.0, "loss_total": 0.02661588042974472, "step": 724 }, { "batch_size": 4, "epoch": 0.2896, "step": 724, "tokens_per_device": 3796 }, { "epoch": 0.2896, "loss_ce": 0.8268750905990601, "loss_lvr": 1.036787748336792, "loss_mode_switch": 0.0, "loss_total": 0.9305538535118103, "step": 724 }, { "batch_size": 4, "epoch": 0.2896, "step": 724, "tokens_per_device": 9512 }, { "epoch": 0.2896, "loss_ce": 0.16444502770900726, "loss_lvr": 0.7873613238334656, "loss_mode_switch": 0.0, "loss_total": 0.24318116903305054, "step": 724 }, { "epoch": 0.29, "grad_norm": 1.4808154106140137, "learning_rate": 8.329576125058406e-06, "loss": 0.2836, "step": 725 }, { "batch_size": 4, "epoch": 0.29, "step": 725, "tokens_per_device": 3840 }, { "epoch": 0.29, "loss_ce": 0.1351565420627594, "loss_lvr": 1.465128779411316, "loss_mode_switch": 0.0, "loss_total": 0.2816694378852844, "step": 725 }, { "batch_size": 4, "epoch": 0.29, "step": 725, "tokens_per_device": 2652 }, { "epoch": 0.29, "loss_ce": 0.1807597279548645, "loss_lvr": 0.8199450969696045, "loss_mode_switch": 0.0, "loss_total": 0.2627542316913605, "step": 725 }, { "batch_size": 4, "epoch": 0.29, "step": 725, "tokens_per_device": 2688 }, { "epoch": 0.29, "loss_ce": 0.5635896325111389, "loss_lvr": 0.8674455285072327, "loss_mode_switch": 0.0, "loss_total": 0.6503341794013977, "step": 725 }, { "batch_size": 1, "epoch": 0.29, "step": 725, "tokens_per_device": 4889 }, { "epoch": 0.29, "loss_ce": 0.18432483077049255, "loss_lvr": 0.42254307866096497, "loss_mode_switch": 0.0, "loss_total": 0.22657914459705353, "step": 725 }, { "batch_size": 4, "epoch": 0.29, "step": 725, "tokens_per_device": 6936 }, { "epoch": 0.29, "loss_ce": 0.22188971936702728, "loss_lvr": 0.8615646362304688, "loss_mode_switch": 0.0, "loss_total": 0.3080461919307709, "step": 725 }, { "batch_size": 4, "epoch": 0.29, "step": 725, "tokens_per_device": 2556 }, { "epoch": 0.29, "loss_ce": 0.6608968377113342, "loss_lvr": 0.9858013391494751, "loss_mode_switch": 0.0, "loss_total": 0.7594769597053528, "step": 725 }, { "batch_size": 4, "epoch": 0.29, "step": 725, "tokens_per_device": 5632 }, { "epoch": 0.29, "loss_ce": 0.08708566427230835, "loss_lvr": 0.9275287985801697, "loss_mode_switch": 0.0, "loss_total": 0.17983853816986084, "step": 725 }, { "batch_size": 4, "epoch": 0.29, "step": 725, "tokens_per_device": 3696 }, { "epoch": 0.29, "loss_ce": 0.05373046547174454, "loss_lvr": 0.44519245624542236, "loss_mode_switch": 0.0, "loss_total": 0.09824971109628677, "step": 725 }, { "epoch": 0.2904, "grad_norm": 1.3247689008712769, "learning_rate": 8.324740931806125e-06, "loss": 0.3106, "step": 726 }, { "batch_size": 4, "epoch": 0.2904, "step": 726, "tokens_per_device": 4592 }, { "epoch": 0.2904, "loss_ce": 0.4961621165275574, "loss_lvr": 0.6466880440711975, "loss_mode_switch": 0.0, "loss_total": 0.5608309507369995, "step": 726 }, { "batch_size": 4, "epoch": 0.2904, "step": 726, "tokens_per_device": 6092 }, { "epoch": 0.2904, "loss_ce": 0.568673312664032, "loss_lvr": 0.8598048090934753, "loss_mode_switch": 0.0, "loss_total": 0.654653787612915, "step": 726 }, { "batch_size": 1, "epoch": 0.2904, "step": 726, "tokens_per_device": 7405 }, { "epoch": 0.2904, "loss_ce": 0.06495603173971176, "loss_lvr": 0.3225099742412567, "loss_mode_switch": 0.0, "loss_total": 0.09720702469348907, "step": 726 }, { "batch_size": 4, "epoch": 0.2904, "step": 726, "tokens_per_device": 2720 }, { "epoch": 0.2904, "loss_ce": 0.05207274854183197, "loss_lvr": 0.7092944979667664, "loss_mode_switch": 0.0, "loss_total": 0.12300220131874084, "step": 726 }, { "batch_size": 4, "epoch": 0.2904, "step": 726, "tokens_per_device": 4264 }, { "epoch": 0.2904, "loss_ce": 0.18782760202884674, "loss_lvr": 0.8721793293952942, "loss_mode_switch": 0.0, "loss_total": 0.2750455439090729, "step": 726 }, { "batch_size": 4, "epoch": 0.2904, "step": 726, "tokens_per_device": 3856 }, { "epoch": 0.2904, "loss_ce": 0.8116663098335266, "loss_lvr": 1.1987104415893555, "loss_mode_switch": 0.0, "loss_total": 0.9315373301506042, "step": 726 }, { "batch_size": 1, "epoch": 0.2904, "step": 726, "tokens_per_device": 4901 }, { "epoch": 0.2904, "loss_ce": 0.07158497720956802, "loss_lvr": 0.15843693912029266, "loss_mode_switch": 0.0, "loss_total": 0.08742867410182953, "step": 726 }, { "batch_size": 4, "epoch": 0.2904, "step": 726, "tokens_per_device": 7744 }, { "epoch": 0.2904, "loss_ce": 0.44294461607933044, "loss_lvr": 0.6026530861854553, "loss_mode_switch": 0.0, "loss_total": 0.5032099485397339, "step": 726 }, { "epoch": 0.2908, "grad_norm": 1.482918381690979, "learning_rate": 8.319900158556285e-06, "loss": 0.3701, "step": 727 }, { "batch_size": 4, "epoch": 0.2908, "step": 727, "tokens_per_device": 4392 }, { "epoch": 0.2908, "loss_ce": 0.165663480758667, "loss_lvr": 1.150644302368164, "loss_mode_switch": 0.0, "loss_total": 0.28072792291641235, "step": 727 }, { "batch_size": 4, "epoch": 0.2908, "step": 727, "tokens_per_device": 4296 }, { "epoch": 0.2908, "loss_ce": 0.08851466327905655, "loss_lvr": 0.8556779623031616, "loss_mode_switch": 0.0, "loss_total": 0.1740824580192566, "step": 727 }, { "batch_size": 4, "epoch": 0.2908, "step": 727, "tokens_per_device": 1472 }, { "epoch": 0.2908, "loss_ce": 0.3840293884277344, "loss_lvr": 1.0473499298095703, "loss_mode_switch": 0.0, "loss_total": 0.48876437544822693, "step": 727 }, { "batch_size": 4, "epoch": 0.2908, "step": 727, "tokens_per_device": 6176 }, { "epoch": 0.2908, "loss_ce": 0.8318272829055786, "loss_lvr": 1.377309799194336, "loss_mode_switch": 0.0, "loss_total": 0.9695582389831543, "step": 727 }, { "batch_size": 1, "epoch": 0.2908, "step": 727, "tokens_per_device": 5036 }, { "epoch": 0.2908, "loss_ce": 0.003243969986215234, "loss_lvr": 0.7087855339050293, "loss_mode_switch": 0.0, "loss_total": 0.07412252575159073, "step": 727 }, { "batch_size": 4, "epoch": 0.2908, "step": 727, "tokens_per_device": 5180 }, { "epoch": 0.2908, "loss_ce": 0.33594146370887756, "loss_lvr": 0.8277382850646973, "loss_mode_switch": 0.0, "loss_total": 0.41871529817581177, "step": 727 }, { "batch_size": 4, "epoch": 0.2908, "step": 727, "tokens_per_device": 1172 }, { "epoch": 0.2908, "loss_ce": 0.3425542712211609, "loss_lvr": 1.092248797416687, "loss_mode_switch": 0.0, "loss_total": 0.45177915692329407, "step": 727 }, { "batch_size": 4, "epoch": 0.2908, "step": 727, "tokens_per_device": 2580 }, { "epoch": 0.2908, "loss_ce": 0.5665391683578491, "loss_lvr": 0.9877780675888062, "loss_mode_switch": 0.0, "loss_total": 0.6653169989585876, "step": 727 }, { "epoch": 0.2912, "grad_norm": 1.3455370664596558, "learning_rate": 8.315053813433279e-06, "loss": 0.3148, "step": 728 }, { "batch_size": 1, "epoch": 0.2912, "step": 728, "tokens_per_device": 5179 }, { "epoch": 0.2912, "loss_ce": 0.0033363725524395704, "loss_lvr": 0.2985169589519501, "loss_mode_switch": 0.0, "loss_total": 0.03318806737661362, "step": 728 }, { "batch_size": 4, "epoch": 0.2912, "step": 728, "tokens_per_device": 11444 }, { "epoch": 0.2912, "loss_ce": 0.2641620934009552, "loss_lvr": 0.9216525554656982, "loss_mode_switch": 0.0, "loss_total": 0.3563273549079895, "step": 728 }, { "batch_size": 4, "epoch": 0.2912, "step": 728, "tokens_per_device": 4348 }, { "epoch": 0.2912, "loss_ce": 0.39288243651390076, "loss_lvr": 0.6280575394630432, "loss_mode_switch": 0.0, "loss_total": 0.4556881785392761, "step": 728 }, { "batch_size": 1, "epoch": 0.2912, "step": 728, "tokens_per_device": 5605 }, { "epoch": 0.2912, "loss_ce": 0.028028687462210655, "loss_lvr": 0.314614862203598, "loss_mode_switch": 0.0, "loss_total": 0.05949017405509949, "step": 728 }, { "batch_size": 1, "epoch": 0.2912, "step": 728, "tokens_per_device": 4925 }, { "epoch": 0.2912, "loss_ce": 0.4398168623447418, "loss_lvr": 0.4821580946445465, "loss_mode_switch": 0.0, "loss_total": 0.48803266882896423, "step": 728 }, { "batch_size": 4, "epoch": 0.2912, "step": 728, "tokens_per_device": 5608 }, { "epoch": 0.2912, "loss_ce": 0.09331325441598892, "loss_lvr": 0.93354332447052, "loss_mode_switch": 0.0, "loss_total": 0.18666759133338928, "step": 728 }, { "batch_size": 4, "epoch": 0.2912, "step": 728, "tokens_per_device": 3452 }, { "epoch": 0.2912, "loss_ce": 0.06677471101284027, "loss_lvr": 1.1038217544555664, "loss_mode_switch": 0.0, "loss_total": 0.17715689539909363, "step": 728 }, { "batch_size": 1, "epoch": 0.2912, "step": 728, "tokens_per_device": 4921 }, { "epoch": 0.2912, "loss_ce": 0.09428445994853973, "loss_lvr": 0.3476020097732544, "loss_mode_switch": 0.0, "loss_total": 0.12904466688632965, "step": 728 }, { "epoch": 0.2916, "grad_norm": 1.3064308166503906, "learning_rate": 8.310201904570853e-06, "loss": 0.2607, "step": 729 }, { "batch_size": 4, "epoch": 0.2916, "step": 729, "tokens_per_device": 3824 }, { "epoch": 0.2916, "loss_ce": 0.09617568552494049, "loss_lvr": 0.8687200546264648, "loss_mode_switch": 0.0, "loss_total": 0.18304768204689026, "step": 729 }, { "batch_size": 1, "epoch": 0.2916, "step": 729, "tokens_per_device": 4909 }, { "epoch": 0.2916, "loss_ce": 0.008343851193785667, "loss_lvr": 0.4707905948162079, "loss_mode_switch": 0.0, "loss_total": 0.055422909557819366, "step": 729 }, { "batch_size": 4, "epoch": 0.2916, "step": 729, "tokens_per_device": 2628 }, { "epoch": 0.2916, "loss_ce": 0.01702234521508217, "loss_lvr": 0.8770227432250977, "loss_mode_switch": 0.0, "loss_total": 0.10472461581230164, "step": 729 }, { "batch_size": 4, "epoch": 0.2916, "step": 729, "tokens_per_device": 1356 }, { "epoch": 0.2916, "loss_ce": 0.3712252676486969, "loss_lvr": 1.1097335815429688, "loss_mode_switch": 0.0, "loss_total": 0.4821986258029938, "step": 729 }, { "batch_size": 4, "epoch": 0.2916, "step": 729, "tokens_per_device": 1560 }, { "epoch": 0.2916, "loss_ce": 0.470466285943985, "loss_lvr": 1.2651551961898804, "loss_mode_switch": 0.0, "loss_total": 0.5969818234443665, "step": 729 }, { "batch_size": 1, "epoch": 0.2916, "step": 729, "tokens_per_device": 4894 }, { "epoch": 0.2916, "loss_ce": 0.03068552166223526, "loss_lvr": 0.7375989556312561, "loss_mode_switch": 0.0, "loss_total": 0.10444542020559311, "step": 729 }, { "batch_size": 1, "epoch": 0.2916, "step": 729, "tokens_per_device": 5873 }, { "epoch": 0.2916, "loss_ce": 0.16003663837909698, "loss_lvr": 0.47280776500701904, "loss_mode_switch": 0.0, "loss_total": 0.20731741189956665, "step": 729 }, { "batch_size": 1, "epoch": 0.2916, "step": 729, "tokens_per_device": 5171 }, { "epoch": 0.2916, "loss_ce": 0.0007387929363176227, "loss_lvr": 0.4898965060710907, "loss_mode_switch": 0.0, "loss_total": 0.04972844198346138, "step": 729 }, { "epoch": 0.292, "grad_norm": 2.7852725982666016, "learning_rate": 8.305344440112089e-06, "loss": 0.3055, "step": 730 }, { "batch_size": 4, "epoch": 0.292, "step": 730, "tokens_per_device": 1592 }, { "epoch": 0.292, "loss_ce": 0.1653943657875061, "loss_lvr": 1.1157333850860596, "loss_mode_switch": 0.0, "loss_total": 0.27696770429611206, "step": 730 }, { "batch_size": 4, "epoch": 0.292, "step": 730, "tokens_per_device": 4208 }, { "epoch": 0.292, "loss_ce": 0.14750215411186218, "loss_lvr": 1.1504242420196533, "loss_mode_switch": 0.0, "loss_total": 0.26254457235336304, "step": 730 }, { "batch_size": 4, "epoch": 0.292, "step": 730, "tokens_per_device": 4784 }, { "epoch": 0.292, "loss_ce": 0.1814224272966385, "loss_lvr": 0.7744713425636292, "loss_mode_switch": 0.0, "loss_total": 0.25886955857276917, "step": 730 }, { "batch_size": 4, "epoch": 0.292, "step": 730, "tokens_per_device": 4708 }, { "epoch": 0.292, "loss_ce": 0.1401010900735855, "loss_lvr": 1.6760088205337524, "loss_mode_switch": 0.0, "loss_total": 0.307701975107193, "step": 730 }, { "batch_size": 4, "epoch": 0.292, "step": 730, "tokens_per_device": 5764 }, { "epoch": 0.292, "loss_ce": 0.2539995312690735, "loss_lvr": 0.9498351216316223, "loss_mode_switch": 0.0, "loss_total": 0.3489830493927002, "step": 730 }, { "batch_size": 4, "epoch": 0.292, "step": 730, "tokens_per_device": 1292 }, { "epoch": 0.292, "loss_ce": 0.41814282536506653, "loss_lvr": 1.031287431716919, "loss_mode_switch": 0.0, "loss_total": 0.5212715864181519, "step": 730 }, { "batch_size": 1, "epoch": 0.292, "step": 730, "tokens_per_device": 5280 }, { "epoch": 0.292, "loss_ce": 0.4019235074520111, "loss_lvr": 0.9775808453559875, "loss_mode_switch": 0.0, "loss_total": 0.49968159198760986, "step": 730 }, { "batch_size": 1, "epoch": 0.292, "step": 730, "tokens_per_device": 7836 }, { "epoch": 0.292, "loss_ce": 0.001136653940193355, "loss_lvr": 0.3809816837310791, "loss_mode_switch": 0.0, "loss_total": 0.03923482075333595, "step": 730 }, { "epoch": 0.2924, "grad_norm": 1.5070053339004517, "learning_rate": 8.300481428209391e-06, "loss": 0.3393, "step": 731 }, { "batch_size": 4, "epoch": 0.2924, "step": 731, "tokens_per_device": 3728 }, { "epoch": 0.2924, "loss_ce": 0.2818601131439209, "loss_lvr": 0.6502665877342224, "loss_mode_switch": 0.0, "loss_total": 0.3468867838382721, "step": 731 }, { "batch_size": 1, "epoch": 0.2924, "step": 731, "tokens_per_device": 5056 }, { "epoch": 0.2924, "loss_ce": 0.013558865524828434, "loss_lvr": 0.9019918441772461, "loss_mode_switch": 0.0, "loss_total": 0.10375805199146271, "step": 731 }, { "batch_size": 4, "epoch": 0.2924, "step": 731, "tokens_per_device": 1364 }, { "epoch": 0.2924, "loss_ce": 0.4385719895362854, "loss_lvr": 1.006345510482788, "loss_mode_switch": 0.0, "loss_total": 0.5392065644264221, "step": 731 }, { "batch_size": 4, "epoch": 0.2924, "step": 731, "tokens_per_device": 9996 }, { "epoch": 0.2924, "loss_ce": 0.0669432207942009, "loss_lvr": 0.7563371062278748, "loss_mode_switch": 0.0, "loss_total": 0.1425769329071045, "step": 731 }, { "batch_size": 4, "epoch": 0.2924, "step": 731, "tokens_per_device": 1712 }, { "epoch": 0.2924, "loss_ce": 0.17380481958389282, "loss_lvr": 0.9915794730186462, "loss_mode_switch": 0.0, "loss_total": 0.2729627788066864, "step": 731 }, { "batch_size": 4, "epoch": 0.2924, "step": 731, "tokens_per_device": 5732 }, { "epoch": 0.2924, "loss_ce": 0.4531235992908478, "loss_lvr": 0.9368221163749695, "loss_mode_switch": 0.0, "loss_total": 0.5468057990074158, "step": 731 }, { "batch_size": 4, "epoch": 0.2924, "step": 731, "tokens_per_device": 2632 }, { "epoch": 0.2924, "loss_ce": 0.30765873193740845, "loss_lvr": 0.9512290954589844, "loss_mode_switch": 0.0, "loss_total": 0.4027816355228424, "step": 731 }, { "batch_size": 4, "epoch": 0.2924, "step": 731, "tokens_per_device": 2644 }, { "epoch": 0.2924, "loss_ce": 0.12973350286483765, "loss_lvr": 0.8260473608970642, "loss_mode_switch": 0.0, "loss_total": 0.21233823895454407, "step": 731 }, { "epoch": 0.2928, "grad_norm": 1.2707394361495972, "learning_rate": 8.295612877024482e-06, "loss": 0.3326, "step": 732 }, { "batch_size": 1, "epoch": 0.2928, "step": 732, "tokens_per_device": 5200 }, { "epoch": 0.2928, "loss_ce": 0.011495540849864483, "loss_lvr": 0.3267255127429962, "loss_mode_switch": 0.0, "loss_total": 0.04416809231042862, "step": 732 }, { "batch_size": 4, "epoch": 0.2928, "step": 732, "tokens_per_device": 2700 }, { "epoch": 0.2928, "loss_ce": 0.5485014319419861, "loss_lvr": 0.6605356335639954, "loss_mode_switch": 0.0, "loss_total": 0.6145550012588501, "step": 732 }, { "batch_size": 4, "epoch": 0.2928, "step": 732, "tokens_per_device": 4352 }, { "epoch": 0.2928, "loss_ce": 0.11075015366077423, "loss_lvr": 1.022257685661316, "loss_mode_switch": 0.0, "loss_total": 0.21297591924667358, "step": 732 }, { "batch_size": 1, "epoch": 0.2928, "step": 732, "tokens_per_device": 5138 }, { "epoch": 0.2928, "loss_ce": 0.03992805629968643, "loss_lvr": 0.39395982027053833, "loss_mode_switch": 0.0, "loss_total": 0.07932403683662415, "step": 732 }, { "batch_size": 4, "epoch": 0.2928, "step": 732, "tokens_per_device": 5680 }, { "epoch": 0.2928, "loss_ce": 0.05944754183292389, "loss_lvr": 0.7642203569412231, "loss_mode_switch": 0.0, "loss_total": 0.1358695775270462, "step": 732 }, { "batch_size": 1, "epoch": 0.2928, "step": 732, "tokens_per_device": 4889 }, { "epoch": 0.2928, "loss_ce": 0.11933685094118118, "loss_lvr": 0.7371000647544861, "loss_mode_switch": 0.0, "loss_total": 0.19304686784744263, "step": 732 }, { "batch_size": 4, "epoch": 0.2928, "step": 732, "tokens_per_device": 4000 }, { "epoch": 0.2928, "loss_ce": 0.4314301013946533, "loss_lvr": 1.623443603515625, "loss_mode_switch": 0.0, "loss_total": 0.5937744379043579, "step": 732 }, { "batch_size": 1, "epoch": 0.2928, "step": 732, "tokens_per_device": 5104 }, { "epoch": 0.2928, "loss_ce": 0.0071379560977220535, "loss_lvr": 0.15825487673282623, "loss_mode_switch": 0.0, "loss_total": 0.022963443771004677, "step": 732 }, { "epoch": 0.2932, "grad_norm": 1.1765556335449219, "learning_rate": 8.290738794728369e-06, "loss": 0.2822, "step": 733 }, { "batch_size": 1, "epoch": 0.2932, "step": 733, "tokens_per_device": 5002 }, { "epoch": 0.2932, "loss_ce": 0.002047034213319421, "loss_lvr": 0.47419074177742004, "loss_mode_switch": 0.0, "loss_total": 0.04946611076593399, "step": 733 }, { "batch_size": 4, "epoch": 0.2932, "step": 733, "tokens_per_device": 4548 }, { "epoch": 0.2932, "loss_ce": 0.01336092408746481, "loss_lvr": 0.8161023259162903, "loss_mode_switch": 0.0, "loss_total": 0.09497115761041641, "step": 733 }, { "batch_size": 1, "epoch": 0.2932, "step": 733, "tokens_per_device": 4954 }, { "epoch": 0.2932, "loss_ce": 0.37232789397239685, "loss_lvr": 0.6378180384635925, "loss_mode_switch": 0.0, "loss_total": 0.4361096918582916, "step": 733 }, { "batch_size": 4, "epoch": 0.2932, "step": 733, "tokens_per_device": 1348 }, { "epoch": 0.2932, "loss_ce": 0.46780410408973694, "loss_lvr": 1.0630935430526733, "loss_mode_switch": 0.0, "loss_total": 0.5741134881973267, "step": 733 }, { "batch_size": 1, "epoch": 0.2932, "step": 733, "tokens_per_device": 5016 }, { "epoch": 0.2932, "loss_ce": 0.009077927097678185, "loss_lvr": 0.5897312164306641, "loss_mode_switch": 0.0, "loss_total": 0.0680510476231575, "step": 733 }, { "batch_size": 4, "epoch": 0.2932, "step": 733, "tokens_per_device": 3948 }, { "epoch": 0.2932, "loss_ce": 0.024205463007092476, "loss_lvr": 0.9572655558586121, "loss_mode_switch": 0.0, "loss_total": 0.11993201822042465, "step": 733 }, { "batch_size": 1, "epoch": 0.2932, "step": 733, "tokens_per_device": 4868 }, { "epoch": 0.2932, "loss_ce": 0.0006739285890944302, "loss_lvr": 1.8914000988006592, "loss_mode_switch": 0.0, "loss_total": 0.18981394171714783, "step": 733 }, { "batch_size": 4, "epoch": 0.2932, "step": 733, "tokens_per_device": 10276 }, { "epoch": 0.2932, "loss_ce": 0.018670545890927315, "loss_lvr": 0.6945799589157104, "loss_mode_switch": 0.0, "loss_total": 0.08812854439020157, "step": 733 }, { "epoch": 0.2936, "grad_norm": 1.359222173690796, "learning_rate": 8.285859189501353e-06, "loss": 0.3006, "step": 734 }, { "batch_size": 4, "epoch": 0.2936, "step": 734, "tokens_per_device": 6336 }, { "epoch": 0.2936, "loss_ce": 0.2903207838535309, "loss_lvr": 0.6907468438148499, "loss_mode_switch": 0.0, "loss_total": 0.35939547419548035, "step": 734 }, { "batch_size": 1, "epoch": 0.2936, "step": 734, "tokens_per_device": 4876 }, { "epoch": 0.2936, "loss_ce": 0.020748214796185493, "loss_lvr": 0.37592869997024536, "loss_mode_switch": 0.0, "loss_total": 0.05834108591079712, "step": 734 }, { "batch_size": 4, "epoch": 0.2936, "step": 734, "tokens_per_device": 2712 }, { "epoch": 0.2936, "loss_ce": 0.1960967630147934, "loss_lvr": 1.242120385169983, "loss_mode_switch": 0.0, "loss_total": 0.3203088045120239, "step": 734 }, { "batch_size": 4, "epoch": 0.2936, "step": 734, "tokens_per_device": 5988 }, { "epoch": 0.2936, "loss_ce": 0.18458299338817596, "loss_lvr": 0.8331133723258972, "loss_mode_switch": 0.0, "loss_total": 0.26789432764053345, "step": 734 }, { "batch_size": 4, "epoch": 0.2936, "step": 734, "tokens_per_device": 4440 }, { "epoch": 0.2936, "loss_ce": 0.14885105192661285, "loss_lvr": 0.8380600214004517, "loss_mode_switch": 0.0, "loss_total": 0.2326570451259613, "step": 734 }, { "batch_size": 4, "epoch": 0.2936, "step": 734, "tokens_per_device": 5124 }, { "epoch": 0.2936, "loss_ce": 0.5220416784286499, "loss_lvr": 0.8970706462860107, "loss_mode_switch": 0.0, "loss_total": 0.6117487549781799, "step": 734 }, { "batch_size": 4, "epoch": 0.2936, "step": 734, "tokens_per_device": 5628 }, { "epoch": 0.2936, "loss_ce": 0.24782007932662964, "loss_lvr": 0.8172087669372559, "loss_mode_switch": 0.0, "loss_total": 0.3295409679412842, "step": 734 }, { "batch_size": 1, "epoch": 0.2936, "step": 734, "tokens_per_device": 4893 }, { "epoch": 0.2936, "loss_ce": 0.011623723432421684, "loss_lvr": 0.4961368143558502, "loss_mode_switch": 0.0, "loss_total": 0.0612374022603035, "step": 734 }, { "epoch": 0.294, "grad_norm": 1.4762208461761475, "learning_rate": 8.280974069532999e-06, "loss": 0.3215, "step": 735 }, { "batch_size": 4, "epoch": 0.294, "step": 735, "tokens_per_device": 5904 }, { "epoch": 0.294, "loss_ce": 0.6503972411155701, "loss_lvr": 0.8406463265419006, "loss_mode_switch": 0.0, "loss_total": 0.7344619035720825, "step": 735 }, { "batch_size": 1, "epoch": 0.294, "step": 735, "tokens_per_device": 5081 }, { "epoch": 0.294, "loss_ce": 2.4940185546875, "loss_lvr": 0.5212885737419128, "loss_mode_switch": 0.0, "loss_total": 2.546147346496582, "step": 735 }, { "batch_size": 4, "epoch": 0.294, "step": 735, "tokens_per_device": 3896 }, { "epoch": 0.294, "loss_ce": 0.7640561461448669, "loss_lvr": 1.0718239545822144, "loss_mode_switch": 0.0, "loss_total": 0.8712385296821594, "step": 735 }, { "batch_size": 1, "epoch": 0.294, "step": 735, "tokens_per_device": 4913 }, { "epoch": 0.294, "loss_ce": 0.10039369761943817, "loss_lvr": 0.40819624066352844, "loss_mode_switch": 0.0, "loss_total": 0.1412133276462555, "step": 735 }, { "batch_size": 1, "epoch": 0.294, "step": 735, "tokens_per_device": 4723 }, { "epoch": 0.294, "loss_ce": 0.1908523440361023, "loss_lvr": 0.34593164920806885, "loss_mode_switch": 0.0, "loss_total": 0.22544550895690918, "step": 735 }, { "batch_size": 4, "epoch": 0.294, "step": 735, "tokens_per_device": 1504 }, { "epoch": 0.294, "loss_ce": 0.2719041705131531, "loss_lvr": 1.2322075366973877, "loss_mode_switch": 0.0, "loss_total": 0.3951249122619629, "step": 735 }, { "batch_size": 4, "epoch": 0.294, "step": 735, "tokens_per_device": 4104 }, { "epoch": 0.294, "loss_ce": 0.12766215205192566, "loss_lvr": 0.7556431889533997, "loss_mode_switch": 0.0, "loss_total": 0.2032264769077301, "step": 735 }, { "batch_size": 4, "epoch": 0.294, "step": 735, "tokens_per_device": 2628 }, { "epoch": 0.294, "loss_ce": 0.267248272895813, "loss_lvr": 0.9721143245697021, "loss_mode_switch": 0.0, "loss_total": 0.36445969343185425, "step": 735 }, { "epoch": 0.2944, "grad_norm": 1.5330942869186401, "learning_rate": 8.276083443022127e-06, "loss": 0.3522, "step": 736 }, { "batch_size": 4, "epoch": 0.2944, "step": 736, "tokens_per_device": 6440 }, { "epoch": 0.2944, "loss_ce": 0.4110882580280304, "loss_lvr": 0.7642518877983093, "loss_mode_switch": 0.0, "loss_total": 0.4875134527683258, "step": 736 }, { "batch_size": 4, "epoch": 0.2944, "step": 736, "tokens_per_device": 3348 }, { "epoch": 0.2944, "loss_ce": 0.7088754773139954, "loss_lvr": 0.8150141835212708, "loss_mode_switch": 0.0, "loss_total": 0.7903769016265869, "step": 736 }, { "batch_size": 4, "epoch": 0.2944, "step": 736, "tokens_per_device": 4252 }, { "epoch": 0.2944, "loss_ce": 0.3270433247089386, "loss_lvr": 1.4814437627792358, "loss_mode_switch": 0.0, "loss_total": 0.4751877188682556, "step": 736 }, { "batch_size": 1, "epoch": 0.2944, "step": 736, "tokens_per_device": 4951 }, { "epoch": 0.2944, "loss_ce": 0.010280204936861992, "loss_lvr": 0.2745419144630432, "loss_mode_switch": 0.0, "loss_total": 0.03773439675569534, "step": 736 }, { "batch_size": 4, "epoch": 0.2944, "step": 736, "tokens_per_device": 5464 }, { "epoch": 0.2944, "loss_ce": 0.02028520591557026, "loss_lvr": 0.9763203859329224, "loss_mode_switch": 0.0, "loss_total": 0.1179172471165657, "step": 736 }, { "batch_size": 4, "epoch": 0.2944, "step": 736, "tokens_per_device": 11128 }, { "epoch": 0.2944, "loss_ce": 0.039833225309848785, "loss_lvr": 0.6828535199165344, "loss_mode_switch": 0.0, "loss_total": 0.10811857879161835, "step": 736 }, { "batch_size": 4, "epoch": 0.2944, "step": 736, "tokens_per_device": 4824 }, { "epoch": 0.2944, "loss_ce": 0.5971660017967224, "loss_lvr": 0.905371904373169, "loss_mode_switch": 0.0, "loss_total": 0.6877031922340393, "step": 736 }, { "batch_size": 1, "epoch": 0.2944, "step": 736, "tokens_per_device": 4854 }, { "epoch": 0.2944, "loss_ce": 0.11638722568750381, "loss_lvr": 0.4284338653087616, "loss_mode_switch": 0.0, "loss_total": 0.15923061966896057, "step": 736 }, { "epoch": 0.2948, "grad_norm": 1.4994020462036133, "learning_rate": 8.271187318176803e-06, "loss": 0.3119, "step": 737 }, { "batch_size": 4, "epoch": 0.2948, "step": 737, "tokens_per_device": 5148 }, { "epoch": 0.2948, "loss_ce": 0.3312515914440155, "loss_lvr": 0.8070099949836731, "loss_mode_switch": 0.0, "loss_total": 0.41195258498191833, "step": 737 }, { "batch_size": 4, "epoch": 0.2948, "step": 737, "tokens_per_device": 4220 }, { "epoch": 0.2948, "loss_ce": 0.05491115525364876, "loss_lvr": 0.8292325139045715, "loss_mode_switch": 0.0, "loss_total": 0.13783441483974457, "step": 737 }, { "batch_size": 4, "epoch": 0.2948, "step": 737, "tokens_per_device": 4428 }, { "epoch": 0.2948, "loss_ce": 0.18387901782989502, "loss_lvr": 0.915435791015625, "loss_mode_switch": 0.0, "loss_total": 0.275422602891922, "step": 737 }, { "batch_size": 1, "epoch": 0.2948, "step": 737, "tokens_per_device": 5383 }, { "epoch": 0.2948, "loss_ce": 0.001786418491974473, "loss_lvr": 0.616206169128418, "loss_mode_switch": 0.0, "loss_total": 0.0634070411324501, "step": 737 }, { "batch_size": 4, "epoch": 0.2948, "step": 737, "tokens_per_device": 3840 }, { "epoch": 0.2948, "loss_ce": 0.13288486003875732, "loss_lvr": 1.2354635000228882, "loss_mode_switch": 0.0, "loss_total": 0.2564312219619751, "step": 737 }, { "batch_size": 4, "epoch": 0.2948, "step": 737, "tokens_per_device": 3328 }, { "epoch": 0.2948, "loss_ce": 0.5480238795280457, "loss_lvr": 1.037875771522522, "loss_mode_switch": 0.0, "loss_total": 0.6518114805221558, "step": 737 }, { "batch_size": 4, "epoch": 0.2948, "step": 737, "tokens_per_device": 3812 }, { "epoch": 0.2948, "loss_ce": 0.3850599527359009, "loss_lvr": 1.011907696723938, "loss_mode_switch": 0.0, "loss_total": 0.48625072836875916, "step": 737 }, { "batch_size": 4, "epoch": 0.2948, "step": 737, "tokens_per_device": 4084 }, { "epoch": 0.2948, "loss_ce": 0.11350621283054352, "loss_lvr": 0.8605010509490967, "loss_mode_switch": 0.0, "loss_total": 0.19955632090568542, "step": 737 }, { "epoch": 0.2952, "grad_norm": 1.2777588367462158, "learning_rate": 8.266285703214315e-06, "loss": 0.3111, "step": 738 }, { "batch_size": 1, "epoch": 0.2952, "step": 738, "tokens_per_device": 4974 }, { "epoch": 0.2952, "loss_ce": 0.009948993101716042, "loss_lvr": 0.4541216492652893, "loss_mode_switch": 0.0, "loss_total": 0.05536115914583206, "step": 738 }, { "batch_size": 4, "epoch": 0.2952, "step": 738, "tokens_per_device": 5180 }, { "epoch": 0.2952, "loss_ce": 0.13590510189533234, "loss_lvr": 1.1562724113464355, "loss_mode_switch": 0.0, "loss_total": 0.25153234601020813, "step": 738 }, { "batch_size": 4, "epoch": 0.2952, "step": 738, "tokens_per_device": 5612 }, { "epoch": 0.2952, "loss_ce": 0.6792752742767334, "loss_lvr": 0.7513432502746582, "loss_mode_switch": 0.0, "loss_total": 0.7544096112251282, "step": 738 }, { "batch_size": 4, "epoch": 0.2952, "step": 738, "tokens_per_device": 4472 }, { "epoch": 0.2952, "loss_ce": 0.2244621217250824, "loss_lvr": 1.1734447479248047, "loss_mode_switch": 0.0, "loss_total": 0.3418065905570984, "step": 738 }, { "batch_size": 1, "epoch": 0.2952, "step": 738, "tokens_per_device": 4732 }, { "epoch": 0.2952, "loss_ce": 0.09687051922082901, "loss_lvr": 0.4571070969104767, "loss_mode_switch": 0.0, "loss_total": 0.14258122444152832, "step": 738 }, { "batch_size": 1, "epoch": 0.2952, "step": 738, "tokens_per_device": 5834 }, { "epoch": 0.2952, "loss_ce": 0.04411459341645241, "loss_lvr": 0.9440807104110718, "loss_mode_switch": 0.0, "loss_total": 0.138522669672966, "step": 738 }, { "batch_size": 4, "epoch": 0.2952, "step": 738, "tokens_per_device": 4460 }, { "epoch": 0.2952, "loss_ce": 0.008707200177013874, "loss_lvr": 0.906997561454773, "loss_mode_switch": 0.0, "loss_total": 0.09940695762634277, "step": 738 }, { "batch_size": 4, "epoch": 0.2952, "step": 738, "tokens_per_device": 6080 }, { "epoch": 0.2952, "loss_ce": 0.3161011338233948, "loss_lvr": 0.906024158000946, "loss_mode_switch": 0.0, "loss_total": 0.40670356154441833, "step": 738 }, { "epoch": 0.2956, "grad_norm": 1.2077748775482178, "learning_rate": 8.261378606361173e-06, "loss": 0.2508, "step": 739 }, { "batch_size": 1, "epoch": 0.2956, "step": 739, "tokens_per_device": 5110 }, { "epoch": 0.2956, "loss_ce": 0.1633061021566391, "loss_lvr": 0.5381945967674255, "loss_mode_switch": 0.0, "loss_total": 0.2171255648136139, "step": 739 }, { "batch_size": 4, "epoch": 0.2956, "step": 739, "tokens_per_device": 2712 }, { "epoch": 0.2956, "loss_ce": 0.1507057547569275, "loss_lvr": 0.8289726376533508, "loss_mode_switch": 0.0, "loss_total": 0.23360303044319153, "step": 739 }, { "batch_size": 4, "epoch": 0.2956, "step": 739, "tokens_per_device": 4212 }, { "epoch": 0.2956, "loss_ce": 0.17418815195560455, "loss_lvr": 1.0152990818023682, "loss_mode_switch": 0.0, "loss_total": 0.2757180631160736, "step": 739 }, { "batch_size": 1, "epoch": 0.2956, "step": 739, "tokens_per_device": 4781 }, { "epoch": 0.2956, "loss_ce": 0.06846394389867783, "loss_lvr": 0.24258922040462494, "loss_mode_switch": 0.0, "loss_total": 0.09272286295890808, "step": 739 }, { "batch_size": 1, "epoch": 0.2956, "step": 739, "tokens_per_device": 5188 }, { "epoch": 0.2956, "loss_ce": 0.08963672071695328, "loss_lvr": 0.6866586804389954, "loss_mode_switch": 0.0, "loss_total": 0.15830259025096893, "step": 739 }, { "batch_size": 4, "epoch": 0.2956, "step": 739, "tokens_per_device": 4228 }, { "epoch": 0.2956, "loss_ce": 0.054261963814496994, "loss_lvr": 0.7977901101112366, "loss_mode_switch": 0.0, "loss_total": 0.1340409815311432, "step": 739 }, { "batch_size": 4, "epoch": 0.2956, "step": 739, "tokens_per_device": 10608 }, { "epoch": 0.2956, "loss_ce": 0.24067427217960358, "loss_lvr": 0.8132826089859009, "loss_mode_switch": 0.0, "loss_total": 0.3220025300979614, "step": 739 }, { "batch_size": 4, "epoch": 0.2956, "step": 739, "tokens_per_device": 5684 }, { "epoch": 0.2956, "loss_ce": 0.03459571674466133, "loss_lvr": 0.829349160194397, "loss_mode_switch": 0.0, "loss_total": 0.11753062903881073, "step": 739 }, { "epoch": 0.296, "grad_norm": 1.370503306388855, "learning_rate": 8.256466035853077e-06, "loss": 0.3132, "step": 740 }, { "batch_size": 4, "epoch": 0.296, "step": 740, "tokens_per_device": 4464 }, { "epoch": 0.296, "loss_ce": 0.06597602367401123, "loss_lvr": 0.8996759057044983, "loss_mode_switch": 0.0, "loss_total": 0.1559436172246933, "step": 740 }, { "batch_size": 4, "epoch": 0.296, "step": 740, "tokens_per_device": 2664 }, { "epoch": 0.296, "loss_ce": 0.16004787385463715, "loss_lvr": 0.6993933320045471, "loss_mode_switch": 0.0, "loss_total": 0.22998720407485962, "step": 740 }, { "batch_size": 4, "epoch": 0.296, "step": 740, "tokens_per_device": 3760 }, { "epoch": 0.296, "loss_ce": 0.25875958800315857, "loss_lvr": 0.9788985252380371, "loss_mode_switch": 0.0, "loss_total": 0.3566494584083557, "step": 740 }, { "batch_size": 1, "epoch": 0.296, "step": 740, "tokens_per_device": 4939 }, { "epoch": 0.296, "loss_ce": 0.12428619712591171, "loss_lvr": 0.23881608247756958, "loss_mode_switch": 0.0, "loss_total": 0.14816780388355255, "step": 740 }, { "batch_size": 1, "epoch": 0.296, "step": 740, "tokens_per_device": 5108 }, { "epoch": 0.296, "loss_ce": 0.05809229239821434, "loss_lvr": 0.4379170835018158, "loss_mode_switch": 0.0, "loss_total": 0.10188400000333786, "step": 740 }, { "batch_size": 1, "epoch": 0.296, "step": 740, "tokens_per_device": 5020 }, { "epoch": 0.296, "loss_ce": 0.03975939750671387, "loss_lvr": 0.6237091422080994, "loss_mode_switch": 0.0, "loss_total": 0.10213030874729156, "step": 740 }, { "batch_size": 1, "epoch": 0.296, "step": 740, "tokens_per_device": 5223 }, { "epoch": 0.296, "loss_ce": 0.0031563176307827234, "loss_lvr": 0.3628776967525482, "loss_mode_switch": 0.0, "loss_total": 0.03944408893585205, "step": 740 }, { "batch_size": 4, "epoch": 0.296, "step": 740, "tokens_per_device": 5672 }, { "epoch": 0.296, "loss_ce": 0.22386719286441803, "loss_lvr": 1.0013641119003296, "loss_mode_switch": 0.0, "loss_total": 0.3240036070346832, "step": 740 }, { "epoch": 0.2964, "grad_norm": 1.5159976482391357, "learning_rate": 8.251547999934924e-06, "loss": 0.3495, "step": 741 }, { "batch_size": 4, "epoch": 0.2964, "step": 741, "tokens_per_device": 3764 }, { "epoch": 0.2964, "loss_ce": 0.14059053361415863, "loss_lvr": 0.835715115070343, "loss_mode_switch": 0.0, "loss_total": 0.2241620421409607, "step": 741 }, { "batch_size": 4, "epoch": 0.2964, "step": 741, "tokens_per_device": 6100 }, { "epoch": 0.2964, "loss_ce": 0.02551843971014023, "loss_lvr": 0.8652092814445496, "loss_mode_switch": 0.0, "loss_total": 0.11203937232494354, "step": 741 }, { "batch_size": 4, "epoch": 0.2964, "step": 741, "tokens_per_device": 4928 }, { "epoch": 0.2964, "loss_ce": 0.3097621202468872, "loss_lvr": 0.8171526193618774, "loss_mode_switch": 0.0, "loss_total": 0.3914773762226105, "step": 741 }, { "batch_size": 4, "epoch": 0.2964, "step": 741, "tokens_per_device": 2752 }, { "epoch": 0.2964, "loss_ce": 0.14910824596881866, "loss_lvr": 0.8065586090087891, "loss_mode_switch": 0.0, "loss_total": 0.22976410388946533, "step": 741 }, { "batch_size": 4, "epoch": 0.2964, "step": 741, "tokens_per_device": 3792 }, { "epoch": 0.2964, "loss_ce": 0.3384135067462921, "loss_lvr": 1.0774633884429932, "loss_mode_switch": 0.0, "loss_total": 0.44615983963012695, "step": 741 }, { "batch_size": 4, "epoch": 0.2964, "step": 741, "tokens_per_device": 4632 }, { "epoch": 0.2964, "loss_ce": 0.4151073396205902, "loss_lvr": 0.8230306506156921, "loss_mode_switch": 0.0, "loss_total": 0.4974104166030884, "step": 741 }, { "batch_size": 1, "epoch": 0.2964, "step": 741, "tokens_per_device": 5105 }, { "epoch": 0.2964, "loss_ce": 0.001584334997460246, "loss_lvr": 0.30971404910087585, "loss_mode_switch": 0.0, "loss_total": 0.03255574032664299, "step": 741 }, { "batch_size": 4, "epoch": 0.2964, "step": 741, "tokens_per_device": 1880 }, { "epoch": 0.2964, "loss_ce": 0.6600592732429504, "loss_lvr": 0.9082163572311401, "loss_mode_switch": 0.0, "loss_total": 0.7508808970451355, "step": 741 }, { "epoch": 0.2968, "grad_norm": 1.7545114755630493, "learning_rate": 8.246624506860779e-06, "loss": 0.3338, "step": 742 }, { "batch_size": 1, "epoch": 0.2968, "step": 742, "tokens_per_device": 5077 }, { "epoch": 0.2968, "loss_ce": 0.001714759157039225, "loss_lvr": 0.2579769194126129, "loss_mode_switch": 0.0, "loss_total": 0.027512451633810997, "step": 742 }, { "batch_size": 4, "epoch": 0.2968, "step": 742, "tokens_per_device": 1704 }, { "epoch": 0.2968, "loss_ce": 0.23080775141716003, "loss_lvr": 1.1721816062927246, "loss_mode_switch": 0.0, "loss_total": 0.348025918006897, "step": 742 }, { "batch_size": 1, "epoch": 0.2968, "step": 742, "tokens_per_device": 4895 }, { "epoch": 0.2968, "loss_ce": 0.039016153663396835, "loss_lvr": 0.6362002491950989, "loss_mode_switch": 0.0, "loss_total": 0.1026361733675003, "step": 742 }, { "batch_size": 4, "epoch": 0.2968, "step": 742, "tokens_per_device": 4240 }, { "epoch": 0.2968, "loss_ce": 0.23816169798374176, "loss_lvr": 1.0889837741851807, "loss_mode_switch": 0.0, "loss_total": 0.34706008434295654, "step": 742 }, { "batch_size": 1, "epoch": 0.2968, "step": 742, "tokens_per_device": 8004 }, { "epoch": 0.2968, "loss_ce": 0.0015172326238825917, "loss_lvr": 0.45730605721473694, "loss_mode_switch": 0.0, "loss_total": 0.047247838228940964, "step": 742 }, { "batch_size": 4, "epoch": 0.2968, "step": 742, "tokens_per_device": 2688 }, { "epoch": 0.2968, "loss_ce": 0.5343276858329773, "loss_lvr": 0.9235210418701172, "loss_mode_switch": 0.0, "loss_total": 0.6266797780990601, "step": 742 }, { "batch_size": 1, "epoch": 0.2968, "step": 742, "tokens_per_device": 4917 }, { "epoch": 0.2968, "loss_ce": 0.01676030084490776, "loss_lvr": 0.7334463596343994, "loss_mode_switch": 0.0, "loss_total": 0.09010493755340576, "step": 742 }, { "batch_size": 1, "epoch": 0.2968, "step": 742, "tokens_per_device": 5100 }, { "epoch": 0.2968, "loss_ce": 0.028298446908593178, "loss_lvr": 0.4122470021247864, "loss_mode_switch": 0.0, "loss_total": 0.0695231482386589, "step": 742 }, { "epoch": 0.2972, "grad_norm": 1.5525118112564087, "learning_rate": 8.241695564893863e-06, "loss": 0.3133, "step": 743 }, { "batch_size": 4, "epoch": 0.2972, "step": 743, "tokens_per_device": 4292 }, { "epoch": 0.2972, "loss_ce": 0.3277653455734253, "loss_lvr": 1.1480053663253784, "loss_mode_switch": 0.0, "loss_total": 0.4425658881664276, "step": 743 }, { "batch_size": 4, "epoch": 0.2972, "step": 743, "tokens_per_device": 3884 }, { "epoch": 0.2972, "loss_ce": 0.09826201945543289, "loss_lvr": 0.9822317361831665, "loss_mode_switch": 0.0, "loss_total": 0.19648519158363342, "step": 743 }, { "batch_size": 1, "epoch": 0.2972, "step": 743, "tokens_per_device": 5118 }, { "epoch": 0.2972, "loss_ce": 0.0010658561950549483, "loss_lvr": 0.3605809509754181, "loss_mode_switch": 0.0, "loss_total": 0.03712395206093788, "step": 743 }, { "batch_size": 4, "epoch": 0.2972, "step": 743, "tokens_per_device": 6732 }, { "epoch": 0.2972, "loss_ce": 0.10152918845415115, "loss_lvr": 0.616002082824707, "loss_mode_switch": 0.0, "loss_total": 0.16312938928604126, "step": 743 }, { "batch_size": 4, "epoch": 0.2972, "step": 743, "tokens_per_device": 4240 }, { "epoch": 0.2972, "loss_ce": 0.6835850477218628, "loss_lvr": 0.6830409169197083, "loss_mode_switch": 0.0, "loss_total": 0.7518891096115112, "step": 743 }, { "batch_size": 4, "epoch": 0.2972, "step": 743, "tokens_per_device": 1780 }, { "epoch": 0.2972, "loss_ce": 0.1442071497440338, "loss_lvr": 1.033719539642334, "loss_mode_switch": 0.0, "loss_total": 0.24757909774780273, "step": 743 }, { "batch_size": 4, "epoch": 0.2972, "step": 743, "tokens_per_device": 7156 }, { "epoch": 0.2972, "loss_ce": 0.012628920376300812, "loss_lvr": 0.932849109172821, "loss_mode_switch": 0.0, "loss_total": 0.10591383278369904, "step": 743 }, { "batch_size": 1, "epoch": 0.2972, "step": 743, "tokens_per_device": 4891 }, { "epoch": 0.2972, "loss_ce": 0.03931761160492897, "loss_lvr": 0.3482489287853241, "loss_mode_switch": 0.0, "loss_total": 0.07414250075817108, "step": 743 }, { "epoch": 0.2976, "grad_norm": 1.5308363437652588, "learning_rate": 8.23676118230655e-06, "loss": 0.3434, "step": 744 }, { "batch_size": 1, "epoch": 0.2976, "step": 744, "tokens_per_device": 5118 }, { "epoch": 0.2976, "loss_ce": 0.3973594307899475, "loss_lvr": 0.21550533175468445, "loss_mode_switch": 0.0, "loss_total": 0.4189099669456482, "step": 744 }, { "batch_size": 4, "epoch": 0.2976, "step": 744, "tokens_per_device": 1428 }, { "epoch": 0.2976, "loss_ce": 0.35773566365242004, "loss_lvr": 0.9646490812301636, "loss_mode_switch": 0.0, "loss_total": 0.4542005658149719, "step": 744 }, { "batch_size": 4, "epoch": 0.2976, "step": 744, "tokens_per_device": 4200 }, { "epoch": 0.2976, "loss_ce": 0.4829860031604767, "loss_lvr": 0.9932937026023865, "loss_mode_switch": 0.0, "loss_total": 0.5823153853416443, "step": 744 }, { "batch_size": 4, "epoch": 0.2976, "step": 744, "tokens_per_device": 4396 }, { "epoch": 0.2976, "loss_ce": 0.05854112282395363, "loss_lvr": 1.6386522054672241, "loss_mode_switch": 0.0, "loss_total": 0.22240634262561798, "step": 744 }, { "batch_size": 1, "epoch": 0.2976, "step": 744, "tokens_per_device": 4882 }, { "epoch": 0.2976, "loss_ce": 0.002016137121245265, "loss_lvr": 0.791966438293457, "loss_mode_switch": 0.0, "loss_total": 0.08121278136968613, "step": 744 }, { "batch_size": 4, "epoch": 0.2976, "step": 744, "tokens_per_device": 3888 }, { "epoch": 0.2976, "loss_ce": 0.13516786694526672, "loss_lvr": 0.887662410736084, "loss_mode_switch": 0.0, "loss_total": 0.2239341139793396, "step": 744 }, { "batch_size": 4, "epoch": 0.2976, "step": 744, "tokens_per_device": 4752 }, { "epoch": 0.2976, "loss_ce": 0.09174757450819016, "loss_lvr": 0.7787700295448303, "loss_mode_switch": 0.0, "loss_total": 0.16962458193302155, "step": 744 }, { "batch_size": 4, "epoch": 0.2976, "step": 744, "tokens_per_device": 5120 }, { "epoch": 0.2976, "loss_ce": 0.5583364367485046, "loss_lvr": 0.8009505271911621, "loss_mode_switch": 0.0, "loss_total": 0.6384314894676208, "step": 744 }, { "epoch": 0.298, "grad_norm": 1.608465313911438, "learning_rate": 8.231821367380335e-06, "loss": 0.2449, "step": 745 }, { "batch_size": 1, "epoch": 0.298, "step": 745, "tokens_per_device": 5124 }, { "epoch": 0.298, "loss_ce": 0.005908209830522537, "loss_lvr": 0.6174592971801758, "loss_mode_switch": 0.0, "loss_total": 0.06765414029359818, "step": 745 }, { "batch_size": 4, "epoch": 0.298, "step": 745, "tokens_per_device": 5596 }, { "epoch": 0.298, "loss_ce": 0.11642488837242126, "loss_lvr": 0.7498298287391663, "loss_mode_switch": 0.0, "loss_total": 0.19140787422657013, "step": 745 }, { "batch_size": 1, "epoch": 0.298, "step": 745, "tokens_per_device": 5138 }, { "epoch": 0.298, "loss_ce": 0.1957724243402481, "loss_lvr": 0.6274260878562927, "loss_mode_switch": 0.0, "loss_total": 0.25851503014564514, "step": 745 }, { "batch_size": 4, "epoch": 0.298, "step": 745, "tokens_per_device": 8548 }, { "epoch": 0.298, "loss_ce": 0.2712627351284027, "loss_lvr": 0.8496460914611816, "loss_mode_switch": 0.0, "loss_total": 0.3562273383140564, "step": 745 }, { "batch_size": 1, "epoch": 0.298, "step": 745, "tokens_per_device": 4870 }, { "epoch": 0.298, "loss_ce": 0.010157483629882336, "loss_lvr": 1.287354588508606, "loss_mode_switch": 0.0, "loss_total": 0.13889294862747192, "step": 745 }, { "batch_size": 4, "epoch": 0.298, "step": 745, "tokens_per_device": 2680 }, { "epoch": 0.298, "loss_ce": 0.44823840260505676, "loss_lvr": 0.7874282002449036, "loss_mode_switch": 0.0, "loss_total": 0.5269812345504761, "step": 745 }, { "batch_size": 4, "epoch": 0.298, "step": 745, "tokens_per_device": 2772 }, { "epoch": 0.298, "loss_ce": 0.12239796668291092, "loss_lvr": 0.47612524032592773, "loss_mode_switch": 0.0, "loss_total": 0.1700104922056198, "step": 745 }, { "batch_size": 4, "epoch": 0.298, "step": 745, "tokens_per_device": 4268 }, { "epoch": 0.298, "loss_ce": 0.18911013007164001, "loss_lvr": 0.9608361124992371, "loss_mode_switch": 0.0, "loss_total": 0.2851937413215637, "step": 745 }, { "epoch": 0.2984, "grad_norm": 2.1143054962158203, "learning_rate": 8.226876128405838e-06, "loss": 0.3119, "step": 746 }, { "batch_size": 4, "epoch": 0.2984, "step": 746, "tokens_per_device": 2816 }, { "epoch": 0.2984, "loss_ce": 0.26019784808158875, "loss_lvr": 0.8219946026802063, "loss_mode_switch": 0.0, "loss_total": 0.3423973023891449, "step": 746 }, { "batch_size": 4, "epoch": 0.2984, "step": 746, "tokens_per_device": 3764 }, { "epoch": 0.2984, "loss_ce": 0.31156137585639954, "loss_lvr": 1.048826813697815, "loss_mode_switch": 0.0, "loss_total": 0.4164440631866455, "step": 746 }, { "batch_size": 1, "epoch": 0.2984, "step": 746, "tokens_per_device": 5084 }, { "epoch": 0.2984, "loss_ce": 0.01095487829297781, "loss_lvr": 0.2154436558485031, "loss_mode_switch": 0.0, "loss_total": 0.03249924257397652, "step": 746 }, { "batch_size": 4, "epoch": 0.2984, "step": 746, "tokens_per_device": 5712 }, { "epoch": 0.2984, "loss_ce": 0.071204774081707, "loss_lvr": 0.9708040356636047, "loss_mode_switch": 0.0, "loss_total": 0.16828517615795135, "step": 746 }, { "batch_size": 4, "epoch": 0.2984, "step": 746, "tokens_per_device": 10372 }, { "epoch": 0.2984, "loss_ce": 0.045549340546131134, "loss_lvr": 0.6951262354850769, "loss_mode_switch": 0.0, "loss_total": 0.11506196856498718, "step": 746 }, { "batch_size": 4, "epoch": 0.2984, "step": 746, "tokens_per_device": 2560 }, { "epoch": 0.2984, "loss_ce": 0.44759488105773926, "loss_lvr": 0.9378379583358765, "loss_mode_switch": 0.0, "loss_total": 0.5413786768913269, "step": 746 }, { "batch_size": 1, "epoch": 0.2984, "step": 746, "tokens_per_device": 5140 }, { "epoch": 0.2984, "loss_ce": 0.011617569252848625, "loss_lvr": 0.39311808347702026, "loss_mode_switch": 0.0, "loss_total": 0.05092937499284744, "step": 746 }, { "batch_size": 4, "epoch": 0.2984, "step": 746, "tokens_per_device": 5292 }, { "epoch": 0.2984, "loss_ce": 0.03040776215493679, "loss_lvr": 0.7683854103088379, "loss_mode_switch": 0.0, "loss_total": 0.10724630951881409, "step": 746 }, { "epoch": 0.2988, "grad_norm": 1.518923282623291, "learning_rate": 8.22192547368278e-06, "loss": 0.3262, "step": 747 }, { "batch_size": 4, "epoch": 0.2988, "step": 747, "tokens_per_device": 6288 }, { "epoch": 0.2988, "loss_ce": 0.09056999534368515, "loss_lvr": 0.7305510640144348, "loss_mode_switch": 0.0, "loss_total": 0.163625106215477, "step": 747 }, { "batch_size": 1, "epoch": 0.2988, "step": 747, "tokens_per_device": 4849 }, { "epoch": 0.2988, "loss_ce": 0.004004436079412699, "loss_lvr": 0.2946215271949768, "loss_mode_switch": 0.0, "loss_total": 0.03346658870577812, "step": 747 }, { "batch_size": 1, "epoch": 0.2988, "step": 747, "tokens_per_device": 5107 }, { "epoch": 0.2988, "loss_ce": 0.00995713658630848, "loss_lvr": 0.38715630769729614, "loss_mode_switch": 0.0, "loss_total": 0.048672765493392944, "step": 747 }, { "batch_size": 4, "epoch": 0.2988, "step": 747, "tokens_per_device": 1808 }, { "epoch": 0.2988, "loss_ce": 0.32848912477493286, "loss_lvr": 1.2466391324996948, "loss_mode_switch": 0.0, "loss_total": 0.4531530439853668, "step": 747 }, { "batch_size": 4, "epoch": 0.2988, "step": 747, "tokens_per_device": 8148 }, { "epoch": 0.2988, "loss_ce": 0.034463513642549515, "loss_lvr": 0.6456571817398071, "loss_mode_switch": 0.0, "loss_total": 0.09902922809123993, "step": 747 }, { "batch_size": 4, "epoch": 0.2988, "step": 747, "tokens_per_device": 4556 }, { "epoch": 0.2988, "loss_ce": 0.24477124214172363, "loss_lvr": 0.8080893158912659, "loss_mode_switch": 0.0, "loss_total": 0.3255801796913147, "step": 747 }, { "batch_size": 1, "epoch": 0.2988, "step": 747, "tokens_per_device": 4919 }, { "epoch": 0.2988, "loss_ce": 0.161119282245636, "loss_lvr": 0.3672499358654022, "loss_mode_switch": 0.0, "loss_total": 0.19784428179264069, "step": 747 }, { "batch_size": 1, "epoch": 0.2988, "step": 747, "tokens_per_device": 4876 }, { "epoch": 0.2988, "loss_ce": 0.011710203252732754, "loss_lvr": 1.6013282537460327, "loss_mode_switch": 0.0, "loss_total": 0.171843022108078, "step": 747 }, { "epoch": 0.2992, "grad_norm": 1.3068602085113525, "learning_rate": 8.216969411519971e-06, "loss": 0.312, "step": 748 }, { "batch_size": 4, "epoch": 0.2992, "step": 748, "tokens_per_device": 6080 }, { "epoch": 0.2992, "loss_ce": 0.4518777132034302, "loss_lvr": 0.8247389197349548, "loss_mode_switch": 0.0, "loss_total": 0.5343515872955322, "step": 748 }, { "batch_size": 4, "epoch": 0.2992, "step": 748, "tokens_per_device": 4980 }, { "epoch": 0.2992, "loss_ce": 0.15601573884487152, "loss_lvr": 0.8726489543914795, "loss_mode_switch": 0.0, "loss_total": 0.24328063428401947, "step": 748 }, { "batch_size": 4, "epoch": 0.2992, "step": 748, "tokens_per_device": 2704 }, { "epoch": 0.2992, "loss_ce": 0.46131113171577454, "loss_lvr": 1.0520597696304321, "loss_mode_switch": 0.0, "loss_total": 0.5665171146392822, "step": 748 }, { "batch_size": 4, "epoch": 0.2992, "step": 748, "tokens_per_device": 8228 }, { "epoch": 0.2992, "loss_ce": 0.44904977083206177, "loss_lvr": 0.7612490653991699, "loss_mode_switch": 0.0, "loss_total": 0.5251746773719788, "step": 748 }, { "batch_size": 4, "epoch": 0.2992, "step": 748, "tokens_per_device": 6952 }, { "epoch": 0.2992, "loss_ce": 0.04841182008385658, "loss_lvr": 0.8820856809616089, "loss_mode_switch": 0.0, "loss_total": 0.1366203874349594, "step": 748 }, { "batch_size": 4, "epoch": 0.2992, "step": 748, "tokens_per_device": 3928 }, { "epoch": 0.2992, "loss_ce": 0.28918012976646423, "loss_lvr": 0.8387531638145447, "loss_mode_switch": 0.0, "loss_total": 0.37305545806884766, "step": 748 }, { "batch_size": 1, "epoch": 0.2992, "step": 748, "tokens_per_device": 5067 }, { "epoch": 0.2992, "loss_ce": 0.007605388760566711, "loss_lvr": 0.6418523788452148, "loss_mode_switch": 0.0, "loss_total": 0.07179062813520432, "step": 748 }, { "batch_size": 4, "epoch": 0.2992, "step": 748, "tokens_per_device": 4948 }, { "epoch": 0.2992, "loss_ce": 0.2797396183013916, "loss_lvr": 0.7905007600784302, "loss_mode_switch": 0.0, "loss_total": 0.35878968238830566, "step": 748 }, { "epoch": 0.2996, "grad_norm": 1.2305819988250732, "learning_rate": 8.212007950235295e-06, "loss": 0.2833, "step": 749 }, { "batch_size": 4, "epoch": 0.2996, "step": 749, "tokens_per_device": 2816 }, { "epoch": 0.2996, "loss_ce": 0.1787741333246231, "loss_lvr": 0.5081284046173096, "loss_mode_switch": 0.0, "loss_total": 0.22958697378635406, "step": 749 }, { "batch_size": 4, "epoch": 0.2996, "step": 749, "tokens_per_device": 4292 }, { "epoch": 0.2996, "loss_ce": 0.2868092358112335, "loss_lvr": 1.0252845287322998, "loss_mode_switch": 0.0, "loss_total": 0.3893376886844635, "step": 749 }, { "batch_size": 4, "epoch": 0.2996, "step": 749, "tokens_per_device": 3704 }, { "epoch": 0.2996, "loss_ce": 0.5904179215431213, "loss_lvr": 0.9521600008010864, "loss_mode_switch": 0.0, "loss_total": 0.6856338977813721, "step": 749 }, { "batch_size": 4, "epoch": 0.2996, "step": 749, "tokens_per_device": 3476 }, { "epoch": 0.2996, "loss_ce": 0.16168655455112457, "loss_lvr": 1.0381218194961548, "loss_mode_switch": 0.0, "loss_total": 0.26549872756004333, "step": 749 }, { "batch_size": 4, "epoch": 0.2996, "step": 749, "tokens_per_device": 4636 }, { "epoch": 0.2996, "loss_ce": 0.17603805661201477, "loss_lvr": 1.0190452337265015, "loss_mode_switch": 0.0, "loss_total": 0.27794259786605835, "step": 749 }, { "batch_size": 4, "epoch": 0.2996, "step": 749, "tokens_per_device": 3872 }, { "epoch": 0.2996, "loss_ce": 0.6286671757698059, "loss_lvr": 0.6226008534431458, "loss_mode_switch": 0.0, "loss_total": 0.690927267074585, "step": 749 }, { "batch_size": 4, "epoch": 0.2996, "step": 749, "tokens_per_device": 4204 }, { "epoch": 0.2996, "loss_ce": 0.290602445602417, "loss_lvr": 1.8680826425552368, "loss_mode_switch": 0.0, "loss_total": 0.4774107336997986, "step": 749 }, { "batch_size": 4, "epoch": 0.2996, "step": 749, "tokens_per_device": 4376 }, { "epoch": 0.2996, "loss_ce": 0.14568348228931427, "loss_lvr": 0.9025941491127014, "loss_mode_switch": 0.0, "loss_total": 0.23594290018081665, "step": 749 }, { "epoch": 0.3, "grad_norm": 1.5365865230560303, "learning_rate": 8.207041098155701e-06, "loss": 0.3232, "step": 750 }, { "batch_size": 1, "epoch": 0.3, "step": 750, "tokens_per_device": 4883 }, { "epoch": 0.3, "loss_ce": 0.0006270991289056838, "loss_lvr": 0.33821263909339905, "loss_mode_switch": 0.0, "loss_total": 0.03444836661219597, "step": 750 }, { "batch_size": 1, "epoch": 0.3, "step": 750, "tokens_per_device": 4748 }, { "epoch": 0.3, "loss_ce": 0.0012717958306893706, "loss_lvr": 0.6199595332145691, "loss_mode_switch": 0.0, "loss_total": 0.06326775252819061, "step": 750 }, { "batch_size": 1, "epoch": 0.3, "step": 750, "tokens_per_device": 5141 }, { "epoch": 0.3, "loss_ce": 0.054999422281980515, "loss_lvr": 0.8013684153556824, "loss_mode_switch": 0.0, "loss_total": 0.13513626158237457, "step": 750 }, { "batch_size": 4, "epoch": 0.3, "step": 750, "tokens_per_device": 1856 }, { "epoch": 0.3, "loss_ce": 0.257730633020401, "loss_lvr": 0.9725703597068787, "loss_mode_switch": 0.0, "loss_total": 0.3549876809120178, "step": 750 }, { "batch_size": 4, "epoch": 0.3, "step": 750, "tokens_per_device": 2772 }, { "epoch": 0.3, "loss_ce": 0.5717914700508118, "loss_lvr": 0.9800472855567932, "loss_mode_switch": 0.0, "loss_total": 0.6697962284088135, "step": 750 }, { "batch_size": 4, "epoch": 0.3, "step": 750, "tokens_per_device": 2736 }, { "epoch": 0.3, "loss_ce": 0.5051840543746948, "loss_lvr": 0.9503101706504822, "loss_mode_switch": 0.0, "loss_total": 0.6002150774002075, "step": 750 }, { "batch_size": 4, "epoch": 0.3, "step": 750, "tokens_per_device": 1976 }, { "epoch": 0.3, "loss_ce": 0.13595469295978546, "loss_lvr": 1.0018951892852783, "loss_mode_switch": 0.0, "loss_total": 0.23614421486854553, "step": 750 }, { "batch_size": 1, "epoch": 0.3, "step": 750, "tokens_per_device": 5192 }, { "epoch": 0.3, "loss_ce": 0.1778956800699234, "loss_lvr": 0.25670918822288513, "loss_mode_switch": 0.0, "loss_total": 0.20356659591197968, "step": 750 }, { "epoch": 0.3004, "grad_norm": 1.1921383142471313, "learning_rate": 8.202068863617183e-06, "loss": 0.2991, "step": 751 }, { "batch_size": 4, "epoch": 0.3004, "step": 751, "tokens_per_device": 4612 }, { "epoch": 0.3004, "loss_ce": 0.24837471544742584, "loss_lvr": 0.8909866213798523, "loss_mode_switch": 0.0, "loss_total": 0.33747339248657227, "step": 751 }, { "batch_size": 4, "epoch": 0.3004, "step": 751, "tokens_per_device": 4264 }, { "epoch": 0.3004, "loss_ce": 0.22150319814682007, "loss_lvr": 0.8168625831604004, "loss_mode_switch": 0.0, "loss_total": 0.3031894564628601, "step": 751 }, { "batch_size": 4, "epoch": 0.3004, "step": 751, "tokens_per_device": 1860 }, { "epoch": 0.3004, "loss_ce": 0.32844334840774536, "loss_lvr": 1.0808101892471313, "loss_mode_switch": 0.0, "loss_total": 0.436524361371994, "step": 751 }, { "batch_size": 4, "epoch": 0.3004, "step": 751, "tokens_per_device": 3776 }, { "epoch": 0.3004, "loss_ce": 0.4609924256801605, "loss_lvr": 0.9222739338874817, "loss_mode_switch": 0.0, "loss_total": 0.5532197952270508, "step": 751 }, { "batch_size": 1, "epoch": 0.3004, "step": 751, "tokens_per_device": 4900 }, { "epoch": 0.3004, "loss_ce": 0.1559571921825409, "loss_lvr": 0.7188612818717957, "loss_mode_switch": 0.0, "loss_total": 0.22784331440925598, "step": 751 }, { "batch_size": 1, "epoch": 0.3004, "step": 751, "tokens_per_device": 6976 }, { "epoch": 0.3004, "loss_ce": 0.8653441667556763, "loss_lvr": 0.4192287027835846, "loss_mode_switch": 0.0, "loss_total": 0.9072670340538025, "step": 751 }, { "batch_size": 4, "epoch": 0.3004, "step": 751, "tokens_per_device": 3780 }, { "epoch": 0.3004, "loss_ce": 0.4371018409729004, "loss_lvr": 0.8134987354278564, "loss_mode_switch": 0.0, "loss_total": 0.5184516906738281, "step": 751 }, { "batch_size": 4, "epoch": 0.3004, "step": 751, "tokens_per_device": 2756 }, { "epoch": 0.3004, "loss_ce": 0.04968288913369179, "loss_lvr": 0.8577567338943481, "loss_mode_switch": 0.0, "loss_total": 0.1354585587978363, "step": 751 }, { "epoch": 0.3008, "grad_norm": 1.5749880075454712, "learning_rate": 8.19709125496477e-06, "loss": 0.348, "step": 752 }, { "batch_size": 4, "epoch": 0.3008, "step": 752, "tokens_per_device": 3768 }, { "epoch": 0.3008, "loss_ce": 0.262367844581604, "loss_lvr": 1.0328624248504639, "loss_mode_switch": 0.0, "loss_total": 0.3656540811061859, "step": 752 }, { "batch_size": 4, "epoch": 0.3008, "step": 752, "tokens_per_device": 4344 }, { "epoch": 0.3008, "loss_ce": 0.4551543593406677, "loss_lvr": 0.900003969669342, "loss_mode_switch": 0.0, "loss_total": 0.5451547503471375, "step": 752 }, { "batch_size": 4, "epoch": 0.3008, "step": 752, "tokens_per_device": 5012 }, { "epoch": 0.3008, "loss_ce": 0.26921331882476807, "loss_lvr": 1.1218994855880737, "loss_mode_switch": 0.0, "loss_total": 0.38140326738357544, "step": 752 }, { "batch_size": 1, "epoch": 0.3008, "step": 752, "tokens_per_device": 5114 }, { "epoch": 0.3008, "loss_ce": 0.0016283931909129024, "loss_lvr": 0.46595486998558044, "loss_mode_switch": 0.0, "loss_total": 0.04822387918829918, "step": 752 }, { "batch_size": 4, "epoch": 0.3008, "step": 752, "tokens_per_device": 1572 }, { "epoch": 0.3008, "loss_ce": 0.11237616091966629, "loss_lvr": 0.9405698776245117, "loss_mode_switch": 0.0, "loss_total": 0.20643314719200134, "step": 752 }, { "batch_size": 4, "epoch": 0.3008, "step": 752, "tokens_per_device": 3784 }, { "epoch": 0.3008, "loss_ce": 0.40431520342826843, "loss_lvr": 1.474513292312622, "loss_mode_switch": 0.0, "loss_total": 0.5517665147781372, "step": 752 }, { "batch_size": 1, "epoch": 0.3008, "step": 752, "tokens_per_device": 4992 }, { "epoch": 0.3008, "loss_ce": 0.02911566197872162, "loss_lvr": 0.826382577419281, "loss_mode_switch": 0.0, "loss_total": 0.1117539182305336, "step": 752 }, { "batch_size": 4, "epoch": 0.3008, "step": 752, "tokens_per_device": 5828 }, { "epoch": 0.3008, "loss_ce": 0.4698666036128998, "loss_lvr": 0.815178394317627, "loss_mode_switch": 0.0, "loss_total": 0.551384449005127, "step": 752 }, { "epoch": 0.3012, "grad_norm": 1.548449158668518, "learning_rate": 8.192108280552507e-06, "loss": 0.383, "step": 753 }, { "batch_size": 1, "epoch": 0.3012, "step": 753, "tokens_per_device": 4751 }, { "epoch": 0.3012, "loss_ce": 0.009864200837910175, "loss_lvr": 0.27419382333755493, "loss_mode_switch": 0.0, "loss_total": 0.03728358447551727, "step": 753 }, { "batch_size": 4, "epoch": 0.3012, "step": 753, "tokens_per_device": 6724 }, { "epoch": 0.3012, "loss_ce": 0.2316981852054596, "loss_lvr": 1.0591771602630615, "loss_mode_switch": 0.0, "loss_total": 0.3376159071922302, "step": 753 }, { "batch_size": 4, "epoch": 0.3012, "step": 753, "tokens_per_device": 2836 }, { "epoch": 0.3012, "loss_ce": 0.8113327622413635, "loss_lvr": 0.7076824307441711, "loss_mode_switch": 0.0, "loss_total": 0.8821009993553162, "step": 753 }, { "batch_size": 4, "epoch": 0.3012, "step": 753, "tokens_per_device": 4284 }, { "epoch": 0.3012, "loss_ce": 0.09388930350542068, "loss_lvr": 1.3123966455459595, "loss_mode_switch": 0.0, "loss_total": 0.22512897849082947, "step": 753 }, { "batch_size": 4, "epoch": 0.3012, "step": 753, "tokens_per_device": 3116 }, { "epoch": 0.3012, "loss_ce": 0.31209471821784973, "loss_lvr": 1.0776934623718262, "loss_mode_switch": 0.0, "loss_total": 0.41986405849456787, "step": 753 }, { "batch_size": 1, "epoch": 0.3012, "step": 753, "tokens_per_device": 8119 }, { "epoch": 0.3012, "loss_ce": 0.000280180131085217, "loss_lvr": 0.33732250332832336, "loss_mode_switch": 0.0, "loss_total": 0.03401242941617966, "step": 753 }, { "batch_size": 4, "epoch": 0.3012, "step": 753, "tokens_per_device": 1608 }, { "epoch": 0.3012, "loss_ce": 0.6000498533248901, "loss_lvr": 0.9528459310531616, "loss_mode_switch": 0.0, "loss_total": 0.6953344345092773, "step": 753 }, { "batch_size": 4, "epoch": 0.3012, "step": 753, "tokens_per_device": 1668 }, { "epoch": 0.3012, "loss_ce": 0.3183166980743408, "loss_lvr": 1.0374748706817627, "loss_mode_switch": 0.0, "loss_total": 0.4220641851425171, "step": 753 }, { "epoch": 0.3016, "grad_norm": 1.4952102899551392, "learning_rate": 8.18711994874345e-06, "loss": 0.3126, "step": 754 }, { "batch_size": 1, "epoch": 0.3016, "step": 754, "tokens_per_device": 4774 }, { "epoch": 0.3016, "loss_ce": 0.16327616572380066, "loss_lvr": 0.3395707309246063, "loss_mode_switch": 0.0, "loss_total": 0.19723324477672577, "step": 754 }, { "batch_size": 1, "epoch": 0.3016, "step": 754, "tokens_per_device": 5028 }, { "epoch": 0.3016, "loss_ce": 0.03513310104608536, "loss_lvr": 0.6046578884124756, "loss_mode_switch": 0.0, "loss_total": 0.09559889137744904, "step": 754 }, { "batch_size": 4, "epoch": 0.3016, "step": 754, "tokens_per_device": 2720 }, { "epoch": 0.3016, "loss_ce": 0.6910005807876587, "loss_lvr": 0.8895902633666992, "loss_mode_switch": 0.0, "loss_total": 0.7799596190452576, "step": 754 }, { "batch_size": 4, "epoch": 0.3016, "step": 754, "tokens_per_device": 1412 }, { "epoch": 0.3016, "loss_ce": 0.15418078005313873, "loss_lvr": 0.9698649644851685, "loss_mode_switch": 0.0, "loss_total": 0.25116726756095886, "step": 754 }, { "batch_size": 1, "epoch": 0.3016, "step": 754, "tokens_per_device": 4879 }, { "epoch": 0.3016, "loss_ce": 0.0024602487683296204, "loss_lvr": 0.5597872138023376, "loss_mode_switch": 0.0, "loss_total": 0.058438971638679504, "step": 754 }, { "batch_size": 4, "epoch": 0.3016, "step": 754, "tokens_per_device": 5688 }, { "epoch": 0.3016, "loss_ce": 0.47468823194503784, "loss_lvr": 1.1845942735671997, "loss_mode_switch": 0.0, "loss_total": 0.5931476354598999, "step": 754 }, { "batch_size": 4, "epoch": 0.3016, "step": 754, "tokens_per_device": 2648 }, { "epoch": 0.3016, "loss_ce": 0.5027099251747131, "loss_lvr": 0.8074826002120972, "loss_mode_switch": 0.0, "loss_total": 0.5834581851959229, "step": 754 }, { "batch_size": 4, "epoch": 0.3016, "step": 754, "tokens_per_device": 2664 }, { "epoch": 0.3016, "loss_ce": 1.0026029348373413, "loss_lvr": 1.9752365350723267, "loss_mode_switch": 0.0, "loss_total": 1.2001266479492188, "step": 754 }, { "epoch": 0.302, "grad_norm": 1.4516721963882446, "learning_rate": 8.182126267909642e-06, "loss": 0.3432, "step": 755 }, { "batch_size": 4, "epoch": 0.302, "step": 755, "tokens_per_device": 7180 }, { "epoch": 0.302, "loss_ce": 0.04238699749112129, "loss_lvr": 0.923958957195282, "loss_mode_switch": 0.0, "loss_total": 0.13478289544582367, "step": 755 }, { "batch_size": 4, "epoch": 0.302, "step": 755, "tokens_per_device": 1672 }, { "epoch": 0.302, "loss_ce": 0.48352766036987305, "loss_lvr": 0.9382216334342957, "loss_mode_switch": 0.0, "loss_total": 0.577349841594696, "step": 755 }, { "batch_size": 4, "epoch": 0.302, "step": 755, "tokens_per_device": 4604 }, { "epoch": 0.302, "loss_ce": 0.14094525575637817, "loss_lvr": 0.7326421141624451, "loss_mode_switch": 0.0, "loss_total": 0.21420946717262268, "step": 755 }, { "batch_size": 4, "epoch": 0.302, "step": 755, "tokens_per_device": 1452 }, { "epoch": 0.302, "loss_ce": 0.5738089680671692, "loss_lvr": 1.0038877725601196, "loss_mode_switch": 0.0, "loss_total": 0.6741977334022522, "step": 755 }, { "batch_size": 4, "epoch": 0.302, "step": 755, "tokens_per_device": 4388 }, { "epoch": 0.302, "loss_ce": 0.25216662883758545, "loss_lvr": 0.9860639572143555, "loss_mode_switch": 0.0, "loss_total": 0.35077303647994995, "step": 755 }, { "batch_size": 4, "epoch": 0.302, "step": 755, "tokens_per_device": 7576 }, { "epoch": 0.302, "loss_ce": 0.04784063249826431, "loss_lvr": 0.7187583446502686, "loss_mode_switch": 0.0, "loss_total": 0.11971646547317505, "step": 755 }, { "batch_size": 1, "epoch": 0.302, "step": 755, "tokens_per_device": 5208 }, { "epoch": 0.302, "loss_ce": 0.0016481562051922083, "loss_lvr": 0.4289425015449524, "loss_mode_switch": 0.0, "loss_total": 0.04454240947961807, "step": 755 }, { "batch_size": 4, "epoch": 0.302, "step": 755, "tokens_per_device": 4232 }, { "epoch": 0.302, "loss_ce": 0.2694057524204254, "loss_lvr": 0.7858017086982727, "loss_mode_switch": 0.0, "loss_total": 0.3479859232902527, "step": 755 }, { "epoch": 0.3024, "grad_norm": 1.5034763813018799, "learning_rate": 8.177127246432105e-06, "loss": 0.3571, "step": 756 }, { "batch_size": 4, "epoch": 0.3024, "step": 756, "tokens_per_device": 4456 }, { "epoch": 0.3024, "loss_ce": 0.3164231777191162, "loss_lvr": 0.9561859965324402, "loss_mode_switch": 0.0, "loss_total": 0.4120417833328247, "step": 756 }, { "batch_size": 1, "epoch": 0.3024, "step": 756, "tokens_per_device": 5115 }, { "epoch": 0.3024, "loss_ce": 0.0849163681268692, "loss_lvr": 0.2872841954231262, "loss_mode_switch": 0.0, "loss_total": 0.1136447861790657, "step": 756 }, { "batch_size": 1, "epoch": 0.3024, "step": 756, "tokens_per_device": 5104 }, { "epoch": 0.3024, "loss_ce": 0.3252193331718445, "loss_lvr": 0.5053821206092834, "loss_mode_switch": 0.0, "loss_total": 0.3757575452327728, "step": 756 }, { "batch_size": 4, "epoch": 0.3024, "step": 756, "tokens_per_device": 4984 }, { "epoch": 0.3024, "loss_ce": 0.25518307089805603, "loss_lvr": 0.6969282031059265, "loss_mode_switch": 0.0, "loss_total": 0.3248758912086487, "step": 756 }, { "batch_size": 1, "epoch": 0.3024, "step": 756, "tokens_per_device": 4756 }, { "epoch": 0.3024, "loss_ce": 0.13650760054588318, "loss_lvr": 0.7630683779716492, "loss_mode_switch": 0.0, "loss_total": 0.21281445026397705, "step": 756 }, { "batch_size": 1, "epoch": 0.3024, "step": 756, "tokens_per_device": 5036 }, { "epoch": 0.3024, "loss_ce": 0.057056672871112823, "loss_lvr": 0.563956618309021, "loss_mode_switch": 0.0, "loss_total": 0.11345233023166656, "step": 756 }, { "batch_size": 4, "epoch": 0.3024, "step": 756, "tokens_per_device": 3352 }, { "epoch": 0.3024, "loss_ce": 0.525945246219635, "loss_lvr": 0.9878831505775452, "loss_mode_switch": 0.0, "loss_total": 0.624733567237854, "step": 756 }, { "batch_size": 1, "epoch": 0.3024, "step": 756, "tokens_per_device": 4899 }, { "epoch": 0.3024, "loss_ce": 0.19025295972824097, "loss_lvr": 0.4309142529964447, "loss_mode_switch": 0.0, "loss_total": 0.23334439098834991, "step": 756 }, { "epoch": 0.3028, "grad_norm": 1.2911651134490967, "learning_rate": 8.172122892700826e-06, "loss": 0.2392, "step": 757 }, { "batch_size": 1, "epoch": 0.3028, "step": 757, "tokens_per_device": 5200 }, { "epoch": 0.3028, "loss_ce": 0.38700664043426514, "loss_lvr": 0.425733357667923, "loss_mode_switch": 0.0, "loss_total": 0.4295799732208252, "step": 757 }, { "batch_size": 4, "epoch": 0.3028, "step": 757, "tokens_per_device": 3308 }, { "epoch": 0.3028, "loss_ce": 0.5452941656112671, "loss_lvr": 1.2410314083099365, "loss_mode_switch": 0.0, "loss_total": 0.6693972945213318, "step": 757 }, { "batch_size": 4, "epoch": 0.3028, "step": 757, "tokens_per_device": 4344 }, { "epoch": 0.3028, "loss_ce": 0.0044050635769963264, "loss_lvr": 0.6399165987968445, "loss_mode_switch": 0.0, "loss_total": 0.06839672476053238, "step": 757 }, { "batch_size": 4, "epoch": 0.3028, "step": 757, "tokens_per_device": 8892 }, { "epoch": 0.3028, "loss_ce": 0.032652366906404495, "loss_lvr": 0.8039076924324036, "loss_mode_switch": 0.0, "loss_total": 0.11304314434528351, "step": 757 }, { "batch_size": 4, "epoch": 0.3028, "step": 757, "tokens_per_device": 1172 }, { "epoch": 0.3028, "loss_ce": 0.2841615378856659, "loss_lvr": 1.448342204093933, "loss_mode_switch": 0.0, "loss_total": 0.4289957582950592, "step": 757 }, { "batch_size": 1, "epoch": 0.3028, "step": 757, "tokens_per_device": 5089 }, { "epoch": 0.3028, "loss_ce": 0.03828340768814087, "loss_lvr": 0.46057409048080444, "loss_mode_switch": 0.0, "loss_total": 0.08434081822633743, "step": 757 }, { "batch_size": 4, "epoch": 0.3028, "step": 757, "tokens_per_device": 4640 }, { "epoch": 0.3028, "loss_ce": 0.6975451111793518, "loss_lvr": 0.6078434586524963, "loss_mode_switch": 0.0, "loss_total": 0.758329451084137, "step": 757 }, { "batch_size": 1, "epoch": 0.3028, "step": 757, "tokens_per_device": 4825 }, { "epoch": 0.3028, "loss_ce": 0.04714369773864746, "loss_lvr": 0.50621098279953, "loss_mode_switch": 0.0, "loss_total": 0.09776479750871658, "step": 757 }, { "epoch": 0.3032, "grad_norm": 2.3540871143341064, "learning_rate": 8.167113215114738e-06, "loss": 0.3308, "step": 758 }, { "batch_size": 1, "epoch": 0.3032, "step": 758, "tokens_per_device": 4872 }, { "epoch": 0.3032, "loss_ce": 0.0788646712899208, "loss_lvr": 0.3069947063922882, "loss_mode_switch": 0.0, "loss_total": 0.10956414043903351, "step": 758 }, { "batch_size": 1, "epoch": 0.3032, "step": 758, "tokens_per_device": 5115 }, { "epoch": 0.3032, "loss_ce": 0.0008493137429468334, "loss_lvr": 0.5927813053131104, "loss_mode_switch": 0.0, "loss_total": 0.06012744456529617, "step": 758 }, { "batch_size": 4, "epoch": 0.3032, "step": 758, "tokens_per_device": 3744 }, { "epoch": 0.3032, "loss_ce": 0.4913173317909241, "loss_lvr": 0.8550574779510498, "loss_mode_switch": 0.0, "loss_total": 0.5768230557441711, "step": 758 }, { "batch_size": 1, "epoch": 0.3032, "step": 758, "tokens_per_device": 5036 }, { "epoch": 0.3032, "loss_ce": 0.024401811882853508, "loss_lvr": 0.4193374812602997, "loss_mode_switch": 0.0, "loss_total": 0.06633555889129639, "step": 758 }, { "batch_size": 4, "epoch": 0.3032, "step": 758, "tokens_per_device": 4208 }, { "epoch": 0.3032, "loss_ce": 0.238480344414711, "loss_lvr": 1.1008172035217285, "loss_mode_switch": 0.0, "loss_total": 0.3485620617866516, "step": 758 }, { "batch_size": 4, "epoch": 0.3032, "step": 758, "tokens_per_device": 3112 }, { "epoch": 0.3032, "loss_ce": 0.23935943841934204, "loss_lvr": 0.6496870517730713, "loss_mode_switch": 0.0, "loss_total": 0.30432814359664917, "step": 758 }, { "batch_size": 4, "epoch": 0.3032, "step": 758, "tokens_per_device": 3980 }, { "epoch": 0.3032, "loss_ce": 1.076088547706604, "loss_lvr": 1.0211477279663086, "loss_mode_switch": 0.0, "loss_total": 1.1782033443450928, "step": 758 }, { "batch_size": 4, "epoch": 0.3032, "step": 758, "tokens_per_device": 4196 }, { "epoch": 0.3032, "loss_ce": 0.0632932111620903, "loss_lvr": 0.7708338499069214, "loss_mode_switch": 0.0, "loss_total": 0.14037659764289856, "step": 758 }, { "epoch": 0.3036, "grad_norm": 1.2635915279388428, "learning_rate": 8.16209822208171e-06, "loss": 0.3036, "step": 759 }, { "batch_size": 4, "epoch": 0.3036, "step": 759, "tokens_per_device": 1560 }, { "epoch": 0.3036, "loss_ce": 0.2982916533946991, "loss_lvr": 0.99024498462677, "loss_mode_switch": 0.0, "loss_total": 0.3973161578178406, "step": 759 }, { "batch_size": 4, "epoch": 0.3036, "step": 759, "tokens_per_device": 6924 }, { "epoch": 0.3036, "loss_ce": 0.4509492516517639, "loss_lvr": 0.7400850057601929, "loss_mode_switch": 0.0, "loss_total": 0.5249577760696411, "step": 759 }, { "batch_size": 4, "epoch": 0.3036, "step": 759, "tokens_per_device": 4204 }, { "epoch": 0.3036, "loss_ce": 0.11617318540811539, "loss_lvr": 1.3137720823287964, "loss_mode_switch": 0.0, "loss_total": 0.24755039811134338, "step": 759 }, { "batch_size": 4, "epoch": 0.3036, "step": 759, "tokens_per_device": 6116 }, { "epoch": 0.3036, "loss_ce": 0.3476800322532654, "loss_lvr": 0.7991880774497986, "loss_mode_switch": 0.0, "loss_total": 0.42759883403778076, "step": 759 }, { "batch_size": 4, "epoch": 0.3036, "step": 759, "tokens_per_device": 2776 }, { "epoch": 0.3036, "loss_ce": 0.26613011956214905, "loss_lvr": 0.8618147373199463, "loss_mode_switch": 0.0, "loss_total": 0.3523116111755371, "step": 759 }, { "batch_size": 4, "epoch": 0.3036, "step": 759, "tokens_per_device": 4460 }, { "epoch": 0.3036, "loss_ce": 0.18764221668243408, "loss_lvr": 0.509109377861023, "loss_mode_switch": 0.0, "loss_total": 0.23855315148830414, "step": 759 }, { "batch_size": 4, "epoch": 0.3036, "step": 759, "tokens_per_device": 3788 }, { "epoch": 0.3036, "loss_ce": 0.4116896688938141, "loss_lvr": 1.519264817237854, "loss_mode_switch": 0.0, "loss_total": 0.563616156578064, "step": 759 }, { "batch_size": 4, "epoch": 0.3036, "step": 759, "tokens_per_device": 4192 }, { "epoch": 0.3036, "loss_ce": 0.48818239569664, "loss_lvr": 1.0623774528503418, "loss_mode_switch": 0.0, "loss_total": 0.5944201350212097, "step": 759 }, { "epoch": 0.304, "grad_norm": 1.2692002058029175, "learning_rate": 8.157077922018537e-06, "loss": 0.3172, "step": 760 }, { "batch_size": 1, "epoch": 0.304, "step": 760, "tokens_per_device": 6738 }, { "epoch": 0.304, "loss_ce": 0.03170285373926163, "loss_lvr": 0.5609302520751953, "loss_mode_switch": 0.0, "loss_total": 0.08779588341712952, "step": 760 }, { "batch_size": 4, "epoch": 0.304, "step": 760, "tokens_per_device": 4248 }, { "epoch": 0.304, "loss_ce": 0.36369526386260986, "loss_lvr": 1.5949746370315552, "loss_mode_switch": 0.0, "loss_total": 0.5231927633285522, "step": 760 }, { "batch_size": 1, "epoch": 0.304, "step": 760, "tokens_per_device": 4879 }, { "epoch": 0.304, "loss_ce": 0.008552107028663158, "loss_lvr": 0.4110850989818573, "loss_mode_switch": 0.0, "loss_total": 0.04966061934828758, "step": 760 }, { "batch_size": 4, "epoch": 0.304, "step": 760, "tokens_per_device": 5384 }, { "epoch": 0.304, "loss_ce": 0.32136815786361694, "loss_lvr": 0.9023638367652893, "loss_mode_switch": 0.0, "loss_total": 0.41160455346107483, "step": 760 }, { "batch_size": 4, "epoch": 0.304, "step": 760, "tokens_per_device": 3752 }, { "epoch": 0.304, "loss_ce": 0.1638357788324356, "loss_lvr": 0.9003412127494812, "loss_mode_switch": 0.0, "loss_total": 0.253869891166687, "step": 760 }, { "batch_size": 1, "epoch": 0.304, "step": 760, "tokens_per_device": 7086 }, { "epoch": 0.304, "loss_ce": 0.008363832719624043, "loss_lvr": 0.45070910453796387, "loss_mode_switch": 0.0, "loss_total": 0.053434744477272034, "step": 760 }, { "batch_size": 4, "epoch": 0.304, "step": 760, "tokens_per_device": 3804 }, { "epoch": 0.304, "loss_ce": 0.01898590289056301, "loss_lvr": 0.7007470726966858, "loss_mode_switch": 0.0, "loss_total": 0.08906061202287674, "step": 760 }, { "batch_size": 1, "epoch": 0.304, "step": 760, "tokens_per_device": 5072 }, { "epoch": 0.304, "loss_ce": 0.003300867509096861, "loss_lvr": 0.7915452718734741, "loss_mode_switch": 0.0, "loss_total": 0.08245539665222168, "step": 760 }, { "epoch": 0.3044, "grad_norm": 1.2386183738708496, "learning_rate": 8.152052323350909e-06, "loss": 0.2725, "step": 761 }, { "batch_size": 4, "epoch": 0.3044, "step": 761, "tokens_per_device": 8820 }, { "epoch": 0.3044, "loss_ce": 0.22950215637683868, "loss_lvr": 0.6028332710266113, "loss_mode_switch": 0.0, "loss_total": 0.2897854745388031, "step": 761 }, { "batch_size": 4, "epoch": 0.3044, "step": 761, "tokens_per_device": 4236 }, { "epoch": 0.3044, "loss_ce": 0.33384963870048523, "loss_lvr": 1.180167317390442, "loss_mode_switch": 0.0, "loss_total": 0.45186638832092285, "step": 761 }, { "batch_size": 4, "epoch": 0.3044, "step": 761, "tokens_per_device": 1428 }, { "epoch": 0.3044, "loss_ce": 0.46616610884666443, "loss_lvr": 1.1871594190597534, "loss_mode_switch": 0.0, "loss_total": 0.5848820209503174, "step": 761 }, { "batch_size": 4, "epoch": 0.3044, "step": 761, "tokens_per_device": 11704 }, { "epoch": 0.3044, "loss_ce": 0.2995525598526001, "loss_lvr": 0.6735782027244568, "loss_mode_switch": 0.0, "loss_total": 0.3669103980064392, "step": 761 }, { "batch_size": 1, "epoch": 0.3044, "step": 761, "tokens_per_device": 4920 }, { "epoch": 0.3044, "loss_ce": 0.15618287026882172, "loss_lvr": 0.41793692111968994, "loss_mode_switch": 0.0, "loss_total": 0.19797655940055847, "step": 761 }, { "batch_size": 4, "epoch": 0.3044, "step": 761, "tokens_per_device": 7332 }, { "epoch": 0.3044, "loss_ce": 0.19610285758972168, "loss_lvr": 0.8699737787246704, "loss_mode_switch": 0.0, "loss_total": 0.2831002473831177, "step": 761 }, { "batch_size": 4, "epoch": 0.3044, "step": 761, "tokens_per_device": 5848 }, { "epoch": 0.3044, "loss_ce": 0.1593295782804489, "loss_lvr": 0.6768677830696106, "loss_mode_switch": 0.0, "loss_total": 0.2270163595676422, "step": 761 }, { "batch_size": 4, "epoch": 0.3044, "step": 761, "tokens_per_device": 1516 }, { "epoch": 0.3044, "loss_ce": 0.5431510806083679, "loss_lvr": 1.0829490423202515, "loss_mode_switch": 0.0, "loss_total": 0.6514459848403931, "step": 761 }, { "epoch": 0.3048, "grad_norm": 1.3043235540390015, "learning_rate": 8.147021434513425e-06, "loss": 0.3247, "step": 762 }, { "batch_size": 4, "epoch": 0.3048, "step": 762, "tokens_per_device": 1428 }, { "epoch": 0.3048, "loss_ce": 0.5602882504463196, "loss_lvr": 0.8566340804100037, "loss_mode_switch": 0.0, "loss_total": 0.6459516286849976, "step": 762 }, { "batch_size": 1, "epoch": 0.3048, "step": 762, "tokens_per_device": 5188 }, { "epoch": 0.3048, "loss_ce": 0.01048946287482977, "loss_lvr": 0.5119260549545288, "loss_mode_switch": 0.0, "loss_total": 0.061682071536779404, "step": 762 }, { "batch_size": 4, "epoch": 0.3048, "step": 762, "tokens_per_device": 3824 }, { "epoch": 0.3048, "loss_ce": 0.04276013374328613, "loss_lvr": 0.9610832929611206, "loss_mode_switch": 0.0, "loss_total": 0.13886846601963043, "step": 762 }, { "batch_size": 4, "epoch": 0.3048, "step": 762, "tokens_per_device": 3772 }, { "epoch": 0.3048, "loss_ce": 0.31126245856285095, "loss_lvr": 1.1226117610931396, "loss_mode_switch": 0.0, "loss_total": 0.4235236346721649, "step": 762 }, { "batch_size": 4, "epoch": 0.3048, "step": 762, "tokens_per_device": 1764 }, { "epoch": 0.3048, "loss_ce": 0.3522384762763977, "loss_lvr": 1.0945765972137451, "loss_mode_switch": 0.0, "loss_total": 0.46169614791870117, "step": 762 }, { "batch_size": 1, "epoch": 0.3048, "step": 762, "tokens_per_device": 4895 }, { "epoch": 0.3048, "loss_ce": 0.03493456169962883, "loss_lvr": 0.8603420853614807, "loss_mode_switch": 0.0, "loss_total": 0.1209687739610672, "step": 762 }, { "batch_size": 4, "epoch": 0.3048, "step": 762, "tokens_per_device": 4108 }, { "epoch": 0.3048, "loss_ce": 0.04711589962244034, "loss_lvr": 1.1027851104736328, "loss_mode_switch": 0.0, "loss_total": 0.1573944091796875, "step": 762 }, { "batch_size": 4, "epoch": 0.3048, "step": 762, "tokens_per_device": 4204 }, { "epoch": 0.3048, "loss_ce": 0.15589383244514465, "loss_lvr": 1.009775161743164, "loss_mode_switch": 0.0, "loss_total": 0.2568713426589966, "step": 762 }, { "epoch": 0.3052, "grad_norm": 1.3632373809814453, "learning_rate": 8.14198526394955e-06, "loss": 0.3151, "step": 763 }, { "batch_size": 1, "epoch": 0.3052, "step": 763, "tokens_per_device": 5104 }, { "epoch": 0.3052, "loss_ce": 0.04824107140302658, "loss_lvr": 0.49456068873405457, "loss_mode_switch": 0.0, "loss_total": 0.09769713878631592, "step": 763 }, { "batch_size": 4, "epoch": 0.3052, "step": 763, "tokens_per_device": 3812 }, { "epoch": 0.3052, "loss_ce": 0.507321834564209, "loss_lvr": 1.113481879234314, "loss_mode_switch": 0.0, "loss_total": 0.6186700463294983, "step": 763 }, { "batch_size": 4, "epoch": 0.3052, "step": 763, "tokens_per_device": 3848 }, { "epoch": 0.3052, "loss_ce": 0.5721607804298401, "loss_lvr": 2.872511148452759, "loss_mode_switch": 0.0, "loss_total": 0.859411895275116, "step": 763 }, { "batch_size": 4, "epoch": 0.3052, "step": 763, "tokens_per_device": 5184 }, { "epoch": 0.3052, "loss_ce": 0.3696799874305725, "loss_lvr": 0.7186089754104614, "loss_mode_switch": 0.0, "loss_total": 0.4415408968925476, "step": 763 }, { "batch_size": 1, "epoch": 0.3052, "step": 763, "tokens_per_device": 4911 }, { "epoch": 0.3052, "loss_ce": 0.031081590801477432, "loss_lvr": 1.0821125507354736, "loss_mode_switch": 0.0, "loss_total": 0.1392928510904312, "step": 763 }, { "batch_size": 1, "epoch": 0.3052, "step": 763, "tokens_per_device": 5150 }, { "epoch": 0.3052, "loss_ce": 0.07164547592401505, "loss_lvr": 0.24308915436267853, "loss_mode_switch": 0.0, "loss_total": 0.09595438838005066, "step": 763 }, { "batch_size": 4, "epoch": 0.3052, "step": 763, "tokens_per_device": 11324 }, { "epoch": 0.3052, "loss_ce": 0.0776243582367897, "loss_lvr": 1.788344144821167, "loss_mode_switch": 0.0, "loss_total": 0.2564587891101837, "step": 763 }, { "batch_size": 4, "epoch": 0.3052, "step": 763, "tokens_per_device": 4756 }, { "epoch": 0.3052, "loss_ce": 0.27457743883132935, "loss_lvr": 1.071373462677002, "loss_mode_switch": 0.0, "loss_total": 0.381714791059494, "step": 763 }, { "epoch": 0.3056, "grad_norm": 1.3334283828735352, "learning_rate": 8.136943820111615e-06, "loss": 0.2982, "step": 764 }, { "batch_size": 4, "epoch": 0.3056, "step": 764, "tokens_per_device": 1936 }, { "epoch": 0.3056, "loss_ce": 1.038950800895691, "loss_lvr": 0.8893007636070251, "loss_mode_switch": 0.0, "loss_total": 1.1278809309005737, "step": 764 }, { "batch_size": 4, "epoch": 0.3056, "step": 764, "tokens_per_device": 3856 }, { "epoch": 0.3056, "loss_ce": 0.2660723626613617, "loss_lvr": 1.0138885974884033, "loss_mode_switch": 0.0, "loss_total": 0.367461234331131, "step": 764 }, { "batch_size": 4, "epoch": 0.3056, "step": 764, "tokens_per_device": 9476 }, { "epoch": 0.3056, "loss_ce": 0.10960659384727478, "loss_lvr": 0.7657680511474609, "loss_mode_switch": 0.0, "loss_total": 0.1861833930015564, "step": 764 }, { "batch_size": 1, "epoch": 0.3056, "step": 764, "tokens_per_device": 4340 }, { "epoch": 0.3056, "loss_ce": 0.20376712083816528, "loss_lvr": 1.5765427350997925, "loss_mode_switch": 0.0, "loss_total": 0.3614214062690735, "step": 764 }, { "batch_size": 4, "epoch": 0.3056, "step": 764, "tokens_per_device": 13792 }, { "epoch": 0.3056, "loss_ce": 0.19658003747463226, "loss_lvr": 0.7003949284553528, "loss_mode_switch": 0.0, "loss_total": 0.2666195333003998, "step": 764 }, { "batch_size": 4, "epoch": 0.3056, "step": 764, "tokens_per_device": 4312 }, { "epoch": 0.3056, "loss_ce": 0.23436982929706573, "loss_lvr": 1.1439568996429443, "loss_mode_switch": 0.0, "loss_total": 0.3487655222415924, "step": 764 }, { "batch_size": 4, "epoch": 0.3056, "step": 764, "tokens_per_device": 3852 }, { "epoch": 0.3056, "loss_ce": 0.10245227813720703, "loss_lvr": 0.9383912682533264, "loss_mode_switch": 0.0, "loss_total": 0.19629141688346863, "step": 764 }, { "batch_size": 1, "epoch": 0.3056, "step": 764, "tokens_per_device": 4586 }, { "epoch": 0.3056, "loss_ce": 0.02215672843158245, "loss_lvr": 0.5127472877502441, "loss_mode_switch": 0.0, "loss_total": 0.07343145459890366, "step": 764 }, { "epoch": 0.306, "grad_norm": 1.6291699409484863, "learning_rate": 8.13189711146081e-06, "loss": 0.3727, "step": 765 }, { "batch_size": 4, "epoch": 0.306, "step": 765, "tokens_per_device": 3940 }, { "epoch": 0.306, "loss_ce": 0.3144738972187042, "loss_lvr": 0.7199557423591614, "loss_mode_switch": 0.0, "loss_total": 0.3864694833755493, "step": 765 }, { "batch_size": 4, "epoch": 0.306, "step": 765, "tokens_per_device": 2584 }, { "epoch": 0.306, "loss_ce": 0.206553116440773, "loss_lvr": 0.8401935696601868, "loss_mode_switch": 0.0, "loss_total": 0.29057246446609497, "step": 765 }, { "batch_size": 1, "epoch": 0.306, "step": 765, "tokens_per_device": 5115 }, { "epoch": 0.306, "loss_ce": 0.0884583443403244, "loss_lvr": 0.23084303736686707, "loss_mode_switch": 0.0, "loss_total": 0.11154264956712723, "step": 765 }, { "batch_size": 4, "epoch": 0.306, "step": 765, "tokens_per_device": 4192 }, { "epoch": 0.306, "loss_ce": 0.41496729850769043, "loss_lvr": 0.8864977359771729, "loss_mode_switch": 0.0, "loss_total": 0.5036170482635498, "step": 765 }, { "batch_size": 4, "epoch": 0.306, "step": 765, "tokens_per_device": 5124 }, { "epoch": 0.306, "loss_ce": 0.20523476600646973, "loss_lvr": 0.6512117981910706, "loss_mode_switch": 0.0, "loss_total": 0.2703559398651123, "step": 765 }, { "batch_size": 4, "epoch": 0.306, "step": 765, "tokens_per_device": 4284 }, { "epoch": 0.306, "loss_ce": 0.2563549280166626, "loss_lvr": 1.177998661994934, "loss_mode_switch": 0.0, "loss_total": 0.37415480613708496, "step": 765 }, { "batch_size": 4, "epoch": 0.306, "step": 765, "tokens_per_device": 4352 }, { "epoch": 0.306, "loss_ce": 0.0839884877204895, "loss_lvr": 0.8908601999282837, "loss_mode_switch": 0.0, "loss_total": 0.17307451367378235, "step": 765 }, { "batch_size": 4, "epoch": 0.306, "step": 765, "tokens_per_device": 1552 }, { "epoch": 0.306, "loss_ce": 0.430741548538208, "loss_lvr": 0.9654003977775574, "loss_mode_switch": 0.0, "loss_total": 0.5272815823554993, "step": 765 }, { "epoch": 0.3064, "grad_norm": 1.9191547632217407, "learning_rate": 8.126845146467151e-06, "loss": 0.3038, "step": 766 }, { "batch_size": 1, "epoch": 0.3064, "step": 766, "tokens_per_device": 5019 }, { "epoch": 0.3064, "loss_ce": 0.0305449441075325, "loss_lvr": 0.6642636060714722, "loss_mode_switch": 0.0, "loss_total": 0.0969713032245636, "step": 766 }, { "batch_size": 1, "epoch": 0.3064, "step": 766, "tokens_per_device": 5183 }, { "epoch": 0.3064, "loss_ce": 0.00593606336042285, "loss_lvr": 0.4784938097000122, "loss_mode_switch": 0.0, "loss_total": 0.05378544703125954, "step": 766 }, { "batch_size": 4, "epoch": 0.3064, "step": 766, "tokens_per_device": 5988 }, { "epoch": 0.3064, "loss_ce": 0.2682003974914551, "loss_lvr": 2.2625348567962646, "loss_mode_switch": 0.0, "loss_total": 0.49445390701293945, "step": 766 }, { "batch_size": 1, "epoch": 0.3064, "step": 766, "tokens_per_device": 4512 }, { "epoch": 0.3064, "loss_ce": 0.00292541878297925, "loss_lvr": 0.43468284606933594, "loss_mode_switch": 0.0, "loss_total": 0.046393703669309616, "step": 766 }, { "batch_size": 4, "epoch": 0.3064, "step": 766, "tokens_per_device": 1204 }, { "epoch": 0.3064, "loss_ce": 0.3061819076538086, "loss_lvr": 1.6139782667160034, "loss_mode_switch": 0.0, "loss_total": 0.46757972240448, "step": 766 }, { "batch_size": 1, "epoch": 0.3064, "step": 766, "tokens_per_device": 5658 }, { "epoch": 0.3064, "loss_ce": 0.21087203919887543, "loss_lvr": 0.6570422649383545, "loss_mode_switch": 0.0, "loss_total": 0.27657628059387207, "step": 766 }, { "batch_size": 1, "epoch": 0.3064, "step": 766, "tokens_per_device": 5137 }, { "epoch": 0.3064, "loss_ce": 0.08728186786174774, "loss_lvr": 1.5591909885406494, "loss_mode_switch": 0.0, "loss_total": 0.24320097267627716, "step": 766 }, { "batch_size": 1, "epoch": 0.3064, "step": 766, "tokens_per_device": 5153 }, { "epoch": 0.3064, "loss_ce": 0.0013777957065030932, "loss_lvr": 0.3926617503166199, "loss_mode_switch": 0.0, "loss_total": 0.04064397141337395, "step": 766 }, { "epoch": 0.3068, "grad_norm": 1.325700044631958, "learning_rate": 8.12178793360948e-06, "loss": 0.3057, "step": 767 }, { "batch_size": 4, "epoch": 0.3068, "step": 767, "tokens_per_device": 1920 }, { "epoch": 0.3068, "loss_ce": 0.47407904267311096, "loss_lvr": 0.9865319728851318, "loss_mode_switch": 0.0, "loss_total": 0.5727322101593018, "step": 767 }, { "batch_size": 4, "epoch": 0.3068, "step": 767, "tokens_per_device": 5676 }, { "epoch": 0.3068, "loss_ce": 0.14304032921791077, "loss_lvr": 0.7240352630615234, "loss_mode_switch": 0.0, "loss_total": 0.21544384956359863, "step": 767 }, { "batch_size": 1, "epoch": 0.3068, "step": 767, "tokens_per_device": 5558 }, { "epoch": 0.3068, "loss_ce": 0.022767364978790283, "loss_lvr": 0.5156745314598083, "loss_mode_switch": 0.0, "loss_total": 0.07433481514453888, "step": 767 }, { "batch_size": 4, "epoch": 0.3068, "step": 767, "tokens_per_device": 1416 }, { "epoch": 0.3068, "loss_ce": 0.4553690254688263, "loss_lvr": 1.0563791990280151, "loss_mode_switch": 0.0, "loss_total": 0.5610069632530212, "step": 767 }, { "batch_size": 4, "epoch": 0.3068, "step": 767, "tokens_per_device": 5820 }, { "epoch": 0.3068, "loss_ce": 0.164778932929039, "loss_lvr": 0.8176644444465637, "loss_mode_switch": 0.0, "loss_total": 0.24654537439346313, "step": 767 }, { "batch_size": 1, "epoch": 0.3068, "step": 767, "tokens_per_device": 5095 }, { "epoch": 0.3068, "loss_ce": 0.0020015237387269735, "loss_lvr": 0.3709971010684967, "loss_mode_switch": 0.0, "loss_total": 0.03910123556852341, "step": 767 }, { "batch_size": 4, "epoch": 0.3068, "step": 767, "tokens_per_device": 3916 }, { "epoch": 0.3068, "loss_ce": 0.17507560551166534, "loss_lvr": 0.773795485496521, "loss_mode_switch": 0.0, "loss_total": 0.2524551451206207, "step": 767 }, { "batch_size": 1, "epoch": 0.3068, "step": 767, "tokens_per_device": 4386 }, { "epoch": 0.3068, "loss_ce": 0.017306623980402946, "loss_lvr": 0.7100327610969543, "loss_mode_switch": 0.0, "loss_total": 0.08830990642309189, "step": 767 }, { "epoch": 0.3072, "grad_norm": 1.251502513885498, "learning_rate": 8.116725481375448e-06, "loss": 0.259, "step": 768 }, { "batch_size": 1, "epoch": 0.3072, "step": 768, "tokens_per_device": 5008 }, { "epoch": 0.3072, "loss_ce": 0.08610200136899948, "loss_lvr": 0.595787525177002, "loss_mode_switch": 0.0, "loss_total": 0.1456807553768158, "step": 768 }, { "batch_size": 4, "epoch": 0.3072, "step": 768, "tokens_per_device": 13248 }, { "epoch": 0.3072, "loss_ce": 0.2634693384170532, "loss_lvr": 1.0042216777801514, "loss_mode_switch": 0.0, "loss_total": 0.36389151215553284, "step": 768 }, { "batch_size": 4, "epoch": 0.3072, "step": 768, "tokens_per_device": 4600 }, { "epoch": 0.3072, "loss_ce": 0.027772387489676476, "loss_lvr": 0.966372013092041, "loss_mode_switch": 0.0, "loss_total": 0.12440959364175797, "step": 768 }, { "batch_size": 4, "epoch": 0.3072, "step": 768, "tokens_per_device": 3864 }, { "epoch": 0.3072, "loss_ce": 0.004886785056442022, "loss_lvr": 0.6920647025108337, "loss_mode_switch": 0.0, "loss_total": 0.07409325242042542, "step": 768 }, { "batch_size": 1, "epoch": 0.3072, "step": 768, "tokens_per_device": 4893 }, { "epoch": 0.3072, "loss_ce": 0.013595253229141235, "loss_lvr": 0.38505712151527405, "loss_mode_switch": 0.0, "loss_total": 0.05210096761584282, "step": 768 }, { "batch_size": 4, "epoch": 0.3072, "step": 768, "tokens_per_device": 4168 }, { "epoch": 0.3072, "loss_ce": 0.23714683949947357, "loss_lvr": 0.8305936455726624, "loss_mode_switch": 0.0, "loss_total": 0.3202061951160431, "step": 768 }, { "batch_size": 4, "epoch": 0.3072, "step": 768, "tokens_per_device": 4296 }, { "epoch": 0.3072, "loss_ce": 0.3210870623588562, "loss_lvr": 0.883292019367218, "loss_mode_switch": 0.0, "loss_total": 0.4094162583351135, "step": 768 }, { "batch_size": 4, "epoch": 0.3072, "step": 768, "tokens_per_device": 7976 }, { "epoch": 0.3072, "loss_ce": 0.2223653793334961, "loss_lvr": 0.8948038220405579, "loss_mode_switch": 0.0, "loss_total": 0.3118457794189453, "step": 768 }, { "epoch": 0.3076, "grad_norm": 1.174282193183899, "learning_rate": 8.111657798261495e-06, "loss": 0.2747, "step": 769 }, { "batch_size": 4, "epoch": 0.3076, "step": 769, "tokens_per_device": 4532 }, { "epoch": 0.3076, "loss_ce": 0.06918596476316452, "loss_lvr": 0.8363956809043884, "loss_mode_switch": 0.0, "loss_total": 0.15282553434371948, "step": 769 }, { "batch_size": 4, "epoch": 0.3076, "step": 769, "tokens_per_device": 3760 }, { "epoch": 0.3076, "loss_ce": 0.2006082832813263, "loss_lvr": 1.1633878946304321, "loss_mode_switch": 0.0, "loss_total": 0.3169470727443695, "step": 769 }, { "batch_size": 1, "epoch": 0.3076, "step": 769, "tokens_per_device": 4886 }, { "epoch": 0.3076, "loss_ce": 0.012037164531648159, "loss_lvr": 0.9959937930107117, "loss_mode_switch": 0.0, "loss_total": 0.11163654923439026, "step": 769 }, { "batch_size": 1, "epoch": 0.3076, "step": 769, "tokens_per_device": 4874 }, { "epoch": 0.3076, "loss_ce": 0.0038203569129109383, "loss_lvr": 0.4868786633014679, "loss_mode_switch": 0.0, "loss_total": 0.05250822380185127, "step": 769 }, { "batch_size": 4, "epoch": 0.3076, "step": 769, "tokens_per_device": 4328 }, { "epoch": 0.3076, "loss_ce": 0.04532770439982414, "loss_lvr": 0.9665694832801819, "loss_mode_switch": 0.0, "loss_total": 0.14198465645313263, "step": 769 }, { "batch_size": 4, "epoch": 0.3076, "step": 769, "tokens_per_device": 3816 }, { "epoch": 0.3076, "loss_ce": 0.07079466432332993, "loss_lvr": 3.2839436531066895, "loss_mode_switch": 0.0, "loss_total": 0.3991890549659729, "step": 769 }, { "batch_size": 4, "epoch": 0.3076, "step": 769, "tokens_per_device": 4964 }, { "epoch": 0.3076, "loss_ce": 0.5197696089744568, "loss_lvr": 0.8865858912467957, "loss_mode_switch": 0.0, "loss_total": 0.6084281802177429, "step": 769 }, { "batch_size": 4, "epoch": 0.3076, "step": 769, "tokens_per_device": 4248 }, { "epoch": 0.3076, "loss_ce": 0.6905110478401184, "loss_lvr": 1.0360320806503296, "loss_mode_switch": 0.0, "loss_total": 0.7941142320632935, "step": 769 }, { "epoch": 0.308, "grad_norm": 1.3344224691390991, "learning_rate": 8.106584892772844e-06, "loss": 0.3377, "step": 770 }, { "batch_size": 1, "epoch": 0.308, "step": 770, "tokens_per_device": 4851 }, { "epoch": 0.308, "loss_ce": 0.02587260492146015, "loss_lvr": 0.2999977469444275, "loss_mode_switch": 0.0, "loss_total": 0.05587238073348999, "step": 770 }, { "batch_size": 4, "epoch": 0.308, "step": 770, "tokens_per_device": 1564 }, { "epoch": 0.308, "loss_ce": 0.5357078909873962, "loss_lvr": 0.9060286283493042, "loss_mode_switch": 0.0, "loss_total": 0.6263107657432556, "step": 770 }, { "batch_size": 4, "epoch": 0.308, "step": 770, "tokens_per_device": 2664 }, { "epoch": 0.308, "loss_ce": 0.04892466217279434, "loss_lvr": 0.7533262372016907, "loss_mode_switch": 0.0, "loss_total": 0.12425728887319565, "step": 770 }, { "batch_size": 4, "epoch": 0.308, "step": 770, "tokens_per_device": 4784 }, { "epoch": 0.308, "loss_ce": 0.8795590400695801, "loss_lvr": 0.7903388738632202, "loss_mode_switch": 0.0, "loss_total": 0.95859295129776, "step": 770 }, { "batch_size": 4, "epoch": 0.308, "step": 770, "tokens_per_device": 5688 }, { "epoch": 0.308, "loss_ce": 0.04744315519928932, "loss_lvr": 0.8558655977249146, "loss_mode_switch": 0.0, "loss_total": 0.13302971422672272, "step": 770 }, { "batch_size": 4, "epoch": 0.308, "step": 770, "tokens_per_device": 7480 }, { "epoch": 0.308, "loss_ce": 0.3646060824394226, "loss_lvr": 0.4612191617488861, "loss_mode_switch": 0.0, "loss_total": 0.41072800755500793, "step": 770 }, { "batch_size": 4, "epoch": 0.308, "step": 770, "tokens_per_device": 4368 }, { "epoch": 0.308, "loss_ce": 0.3107104003429413, "loss_lvr": 0.743480384349823, "loss_mode_switch": 0.0, "loss_total": 0.3850584328174591, "step": 770 }, { "batch_size": 4, "epoch": 0.308, "step": 770, "tokens_per_device": 4376 }, { "epoch": 0.308, "loss_ce": 0.31487277150154114, "loss_lvr": 0.8519654273986816, "loss_mode_switch": 0.0, "loss_total": 0.40006932616233826, "step": 770 }, { "epoch": 0.3084, "grad_norm": 1.3958804607391357, "learning_rate": 8.101506773423484e-06, "loss": 0.315, "step": 771 }, { "batch_size": 4, "epoch": 0.3084, "step": 771, "tokens_per_device": 6780 }, { "epoch": 0.3084, "loss_ce": 0.22399066388607025, "loss_lvr": 0.7425854802131653, "loss_mode_switch": 0.0, "loss_total": 0.298249214887619, "step": 771 }, { "batch_size": 1, "epoch": 0.3084, "step": 771, "tokens_per_device": 5302 }, { "epoch": 0.3084, "loss_ce": 0.3770630657672882, "loss_lvr": 0.7243422269821167, "loss_mode_switch": 0.0, "loss_total": 0.4494972825050354, "step": 771 }, { "batch_size": 1, "epoch": 0.3084, "step": 771, "tokens_per_device": 5123 }, { "epoch": 0.3084, "loss_ce": 0.37885987758636475, "loss_lvr": 0.2182558923959732, "loss_mode_switch": 0.0, "loss_total": 0.40068545937538147, "step": 771 }, { "batch_size": 4, "epoch": 0.3084, "step": 771, "tokens_per_device": 2564 }, { "epoch": 0.3084, "loss_ce": 0.17104746401309967, "loss_lvr": 1.0380103588104248, "loss_mode_switch": 0.0, "loss_total": 0.27484849095344543, "step": 771 }, { "batch_size": 4, "epoch": 0.3084, "step": 771, "tokens_per_device": 4360 }, { "epoch": 0.3084, "loss_ce": 0.6047833561897278, "loss_lvr": 1.0513324737548828, "loss_mode_switch": 0.0, "loss_total": 0.7099165916442871, "step": 771 }, { "batch_size": 1, "epoch": 0.3084, "step": 771, "tokens_per_device": 5171 }, { "epoch": 0.3084, "loss_ce": 0.006112450268119574, "loss_lvr": 0.3434712886810303, "loss_mode_switch": 0.0, "loss_total": 0.04045957699418068, "step": 771 }, { "batch_size": 4, "epoch": 0.3084, "step": 771, "tokens_per_device": 5156 }, { "epoch": 0.3084, "loss_ce": 0.06743138283491135, "loss_lvr": 0.9547798037528992, "loss_mode_switch": 0.0, "loss_total": 0.1629093587398529, "step": 771 }, { "batch_size": 4, "epoch": 0.3084, "step": 771, "tokens_per_device": 9040 }, { "epoch": 0.3084, "loss_ce": 0.4674362242221832, "loss_lvr": 0.5083284378051758, "loss_mode_switch": 0.0, "loss_total": 0.5182690620422363, "step": 771 }, { "epoch": 0.3088, "grad_norm": 1.4642457962036133, "learning_rate": 8.09642344873615e-06, "loss": 0.3292, "step": 772 }, { "batch_size": 1, "epoch": 0.3088, "step": 772, "tokens_per_device": 7637 }, { "epoch": 0.3088, "loss_ce": 0.002624868880957365, "loss_lvr": 0.5192052721977234, "loss_mode_switch": 0.0, "loss_total": 0.05454539880156517, "step": 772 }, { "batch_size": 4, "epoch": 0.3088, "step": 772, "tokens_per_device": 4288 }, { "epoch": 0.3088, "loss_ce": 0.052383676171302795, "loss_lvr": 0.9102563261985779, "loss_mode_switch": 0.0, "loss_total": 0.14340931177139282, "step": 772 }, { "batch_size": 4, "epoch": 0.3088, "step": 772, "tokens_per_device": 9328 }, { "epoch": 0.3088, "loss_ce": 0.8142966628074646, "loss_lvr": 0.8212486505508423, "loss_mode_switch": 0.0, "loss_total": 0.8964215517044067, "step": 772 }, { "batch_size": 4, "epoch": 0.3088, "step": 772, "tokens_per_device": 4676 }, { "epoch": 0.3088, "loss_ce": 0.17889365553855896, "loss_lvr": 0.8960260152816772, "loss_mode_switch": 0.0, "loss_total": 0.2684962749481201, "step": 772 }, { "batch_size": 4, "epoch": 0.3088, "step": 772, "tokens_per_device": 6340 }, { "epoch": 0.3088, "loss_ce": 0.06548949331045151, "loss_lvr": 0.8263565897941589, "loss_mode_switch": 0.0, "loss_total": 0.14812515676021576, "step": 772 }, { "batch_size": 4, "epoch": 0.3088, "step": 772, "tokens_per_device": 6048 }, { "epoch": 0.3088, "loss_ce": 0.035128381103277206, "loss_lvr": 0.6091761589050293, "loss_mode_switch": 0.0, "loss_total": 0.09604600071907043, "step": 772 }, { "batch_size": 1, "epoch": 0.3088, "step": 772, "tokens_per_device": 5147 }, { "epoch": 0.3088, "loss_ce": 0.043637748807668686, "loss_lvr": 0.38829800486564636, "loss_mode_switch": 0.0, "loss_total": 0.08246754854917526, "step": 772 }, { "batch_size": 4, "epoch": 0.3088, "step": 772, "tokens_per_device": 5312 }, { "epoch": 0.3088, "loss_ce": 0.18865004181861877, "loss_lvr": 0.7964138388633728, "loss_mode_switch": 0.0, "loss_total": 0.2682914137840271, "step": 772 }, { "epoch": 0.3092, "grad_norm": 1.6734883785247803, "learning_rate": 8.091334927242315e-06, "loss": 0.3339, "step": 773 }, { "batch_size": 1, "epoch": 0.3092, "step": 773, "tokens_per_device": 5171 }, { "epoch": 0.3092, "loss_ce": 0.016893699765205383, "loss_lvr": 0.5167017579078674, "loss_mode_switch": 0.0, "loss_total": 0.06856387853622437, "step": 773 }, { "batch_size": 4, "epoch": 0.3092, "step": 773, "tokens_per_device": 2520 }, { "epoch": 0.3092, "loss_ce": 0.2729266285896301, "loss_lvr": 1.031582236289978, "loss_mode_switch": 0.0, "loss_total": 0.3760848641395569, "step": 773 }, { "batch_size": 4, "epoch": 0.3092, "step": 773, "tokens_per_device": 4132 }, { "epoch": 0.3092, "loss_ce": 0.186891108751297, "loss_lvr": 0.566345751285553, "loss_mode_switch": 0.0, "loss_total": 0.2435256838798523, "step": 773 }, { "batch_size": 1, "epoch": 0.3092, "step": 773, "tokens_per_device": 5151 }, { "epoch": 0.3092, "loss_ce": 0.00031778786797076464, "loss_lvr": 0.4377152621746063, "loss_mode_switch": 0.0, "loss_total": 0.044089317321777344, "step": 773 }, { "batch_size": 4, "epoch": 0.3092, "step": 773, "tokens_per_device": 2692 }, { "epoch": 0.3092, "loss_ce": 0.4274749755859375, "loss_lvr": 0.9817966222763062, "loss_mode_switch": 0.0, "loss_total": 0.5256546139717102, "step": 773 }, { "batch_size": 4, "epoch": 0.3092, "step": 773, "tokens_per_device": 1484 }, { "epoch": 0.3092, "loss_ce": 0.5966114401817322, "loss_lvr": 0.9933522939682007, "loss_mode_switch": 0.0, "loss_total": 0.6959466934204102, "step": 773 }, { "batch_size": 1, "epoch": 0.3092, "step": 773, "tokens_per_device": 5181 }, { "epoch": 0.3092, "loss_ce": 0.11713656783103943, "loss_lvr": 0.4741494357585907, "loss_mode_switch": 0.0, "loss_total": 0.1645515114068985, "step": 773 }, { "batch_size": 4, "epoch": 0.3092, "step": 773, "tokens_per_device": 1384 }, { "epoch": 0.3092, "loss_ce": 0.30125167965888977, "loss_lvr": 1.0416195392608643, "loss_mode_switch": 0.0, "loss_total": 0.4054136276245117, "step": 773 }, { "epoch": 0.3096, "grad_norm": 1.3799500465393066, "learning_rate": 8.086241217482177e-06, "loss": 0.2911, "step": 774 }, { "batch_size": 1, "epoch": 0.3096, "step": 774, "tokens_per_device": 4913 }, { "epoch": 0.3096, "loss_ce": 0.08003012090921402, "loss_lvr": 0.34543168544769287, "loss_mode_switch": 0.0, "loss_total": 0.11457328498363495, "step": 774 }, { "batch_size": 1, "epoch": 0.3096, "step": 774, "tokens_per_device": 4891 }, { "epoch": 0.3096, "loss_ce": 0.22377833724021912, "loss_lvr": 0.07669869810342789, "loss_mode_switch": 0.0, "loss_total": 0.2314482033252716, "step": 774 }, { "batch_size": 1, "epoch": 0.3096, "step": 774, "tokens_per_device": 7797 }, { "epoch": 0.3096, "loss_ce": 0.005449369084089994, "loss_lvr": 0.3669547438621521, "loss_mode_switch": 0.0, "loss_total": 0.04214484617114067, "step": 774 }, { "batch_size": 4, "epoch": 0.3096, "step": 774, "tokens_per_device": 3844 }, { "epoch": 0.3096, "loss_ce": 0.786066472530365, "loss_lvr": 0.9645742177963257, "loss_mode_switch": 0.0, "loss_total": 0.8825238943099976, "step": 774 }, { "batch_size": 1, "epoch": 0.3096, "step": 774, "tokens_per_device": 5144 }, { "epoch": 0.3096, "loss_ce": 0.010359182953834534, "loss_lvr": 0.28311997652053833, "loss_mode_switch": 0.0, "loss_total": 0.03867118060588837, "step": 774 }, { "batch_size": 4, "epoch": 0.3096, "step": 774, "tokens_per_device": 2528 }, { "epoch": 0.3096, "loss_ce": 0.10903288424015045, "loss_lvr": 0.9836170077323914, "loss_mode_switch": 0.0, "loss_total": 0.2073945850133896, "step": 774 }, { "batch_size": 4, "epoch": 0.3096, "step": 774, "tokens_per_device": 1728 }, { "epoch": 0.3096, "loss_ce": 0.31829962134361267, "loss_lvr": 1.0098878145217896, "loss_mode_switch": 0.0, "loss_total": 0.41928839683532715, "step": 774 }, { "batch_size": 4, "epoch": 0.3096, "step": 774, "tokens_per_device": 5764 }, { "epoch": 0.3096, "loss_ce": 0.1856347620487213, "loss_lvr": 0.8091739416122437, "loss_mode_switch": 0.0, "loss_total": 0.2665521502494812, "step": 774 }, { "epoch": 0.31, "grad_norm": 1.4911011457443237, "learning_rate": 8.081142328004638e-06, "loss": 0.3371, "step": 775 }, { "batch_size": 4, "epoch": 0.31, "step": 775, "tokens_per_device": 4448 }, { "epoch": 0.31, "loss_ce": 0.4328710734844208, "loss_lvr": 0.8722909092903137, "loss_mode_switch": 0.0, "loss_total": 0.5201001763343811, "step": 775 }, { "batch_size": 4, "epoch": 0.31, "step": 775, "tokens_per_device": 3824 }, { "epoch": 0.31, "loss_ce": 0.3956175446510315, "loss_lvr": 0.9525230526924133, "loss_mode_switch": 0.0, "loss_total": 0.4908698499202728, "step": 775 }, { "batch_size": 4, "epoch": 0.31, "step": 775, "tokens_per_device": 4328 }, { "epoch": 0.31, "loss_ce": 0.10901492089033127, "loss_lvr": 0.957897961139679, "loss_mode_switch": 0.0, "loss_total": 0.20480471849441528, "step": 775 }, { "batch_size": 1, "epoch": 0.31, "step": 775, "tokens_per_device": 5302 }, { "epoch": 0.31, "loss_ce": 0.22667133808135986, "loss_lvr": 0.3796980679035187, "loss_mode_switch": 0.0, "loss_total": 0.264641135931015, "step": 775 }, { "batch_size": 4, "epoch": 0.31, "step": 775, "tokens_per_device": 1392 }, { "epoch": 0.31, "loss_ce": 0.35624635219573975, "loss_lvr": 1.0251247882843018, "loss_mode_switch": 0.0, "loss_total": 0.4587588310241699, "step": 775 }, { "batch_size": 1, "epoch": 0.31, "step": 775, "tokens_per_device": 5082 }, { "epoch": 0.31, "loss_ce": 0.09547131508588791, "loss_lvr": 0.5778363943099976, "loss_mode_switch": 0.0, "loss_total": 0.15325495600700378, "step": 775 }, { "batch_size": 4, "epoch": 0.31, "step": 775, "tokens_per_device": 1824 }, { "epoch": 0.31, "loss_ce": 0.634005606174469, "loss_lvr": 0.9202477931976318, "loss_mode_switch": 0.0, "loss_total": 0.7260304093360901, "step": 775 }, { "batch_size": 1, "epoch": 0.31, "step": 775, "tokens_per_device": 5111 }, { "epoch": 0.31, "loss_ce": 0.004906771704554558, "loss_lvr": 1.0633152723312378, "loss_mode_switch": 0.0, "loss_total": 0.11123830080032349, "step": 775 }, { "epoch": 0.3104, "grad_norm": 1.3572556972503662, "learning_rate": 8.076038267367292e-06, "loss": 0.318, "step": 776 }, { "batch_size": 1, "epoch": 0.3104, "step": 776, "tokens_per_device": 5251 }, { "epoch": 0.3104, "loss_ce": 0.042631104588508606, "loss_lvr": 0.302542507648468, "loss_mode_switch": 0.0, "loss_total": 0.07288535684347153, "step": 776 }, { "batch_size": 4, "epoch": 0.3104, "step": 776, "tokens_per_device": 3792 }, { "epoch": 0.3104, "loss_ce": 0.688319206237793, "loss_lvr": 0.8702855110168457, "loss_mode_switch": 0.0, "loss_total": 0.7753477692604065, "step": 776 }, { "batch_size": 4, "epoch": 0.3104, "step": 776, "tokens_per_device": 4956 }, { "epoch": 0.3104, "loss_ce": 0.5775704383850098, "loss_lvr": 0.8526728749275208, "loss_mode_switch": 0.0, "loss_total": 0.6628377437591553, "step": 776 }, { "batch_size": 4, "epoch": 0.3104, "step": 776, "tokens_per_device": 6088 }, { "epoch": 0.3104, "loss_ce": 0.45623353123664856, "loss_lvr": 0.8622193932533264, "loss_mode_switch": 0.0, "loss_total": 0.5424554944038391, "step": 776 }, { "batch_size": 4, "epoch": 0.3104, "step": 776, "tokens_per_device": 4036 }, { "epoch": 0.3104, "loss_ce": 0.03720270097255707, "loss_lvr": 0.8968452215194702, "loss_mode_switch": 0.0, "loss_total": 0.1268872320652008, "step": 776 }, { "batch_size": 1, "epoch": 0.3104, "step": 776, "tokens_per_device": 7774 }, { "epoch": 0.3104, "loss_ce": 0.002283884910866618, "loss_lvr": 0.3131929039955139, "loss_mode_switch": 0.0, "loss_total": 0.03360317647457123, "step": 776 }, { "batch_size": 4, "epoch": 0.3104, "step": 776, "tokens_per_device": 1344 }, { "epoch": 0.3104, "loss_ce": 0.4868997633457184, "loss_lvr": 1.2304797172546387, "loss_mode_switch": 0.0, "loss_total": 0.6099477410316467, "step": 776 }, { "batch_size": 4, "epoch": 0.3104, "step": 776, "tokens_per_device": 2676 }, { "epoch": 0.3104, "loss_ce": 0.4023502469062805, "loss_lvr": 0.6489428281784058, "loss_mode_switch": 0.0, "loss_total": 0.46724453568458557, "step": 776 }, { "epoch": 0.3108, "grad_norm": 1.2058813571929932, "learning_rate": 8.070929044136419e-06, "loss": 0.2855, "step": 777 }, { "batch_size": 1, "epoch": 0.3108, "step": 777, "tokens_per_device": 4876 }, { "epoch": 0.3108, "loss_ce": 0.006126612424850464, "loss_lvr": 0.23477981984615326, "loss_mode_switch": 0.0, "loss_total": 0.02960459515452385, "step": 777 }, { "batch_size": 4, "epoch": 0.3108, "step": 777, "tokens_per_device": 8036 }, { "epoch": 0.3108, "loss_ce": 0.04467462748289108, "loss_lvr": 0.6609266996383667, "loss_mode_switch": 0.0, "loss_total": 0.11076729744672775, "step": 777 }, { "batch_size": 4, "epoch": 0.3108, "step": 777, "tokens_per_device": 1320 }, { "epoch": 0.3108, "loss_ce": 0.5130056738853455, "loss_lvr": 1.129594326019287, "loss_mode_switch": 0.0, "loss_total": 0.6259651184082031, "step": 777 }, { "batch_size": 4, "epoch": 0.3108, "step": 777, "tokens_per_device": 15640 }, { "epoch": 0.3108, "loss_ce": 0.22043751180171967, "loss_lvr": 0.886976957321167, "loss_mode_switch": 0.0, "loss_total": 0.30913519859313965, "step": 777 }, { "batch_size": 4, "epoch": 0.3108, "step": 777, "tokens_per_device": 4396 }, { "epoch": 0.3108, "loss_ce": 0.23549188673496246, "loss_lvr": 0.7916107177734375, "loss_mode_switch": 0.0, "loss_total": 0.3146529495716095, "step": 777 }, { "batch_size": 4, "epoch": 0.3108, "step": 777, "tokens_per_device": 7920 }, { "epoch": 0.3108, "loss_ce": 0.5746847987174988, "loss_lvr": 0.8305686712265015, "loss_mode_switch": 0.0, "loss_total": 0.6577416658401489, "step": 777 }, { "batch_size": 4, "epoch": 0.3108, "step": 777, "tokens_per_device": 4156 }, { "epoch": 0.3108, "loss_ce": 0.022217683494091034, "loss_lvr": 0.8953560590744019, "loss_mode_switch": 0.0, "loss_total": 0.11175329238176346, "step": 777 }, { "batch_size": 4, "epoch": 0.3108, "step": 777, "tokens_per_device": 1244 }, { "epoch": 0.3108, "loss_ce": 0.19397403299808502, "loss_lvr": 1.0763390064239502, "loss_mode_switch": 0.0, "loss_total": 0.3016079366207123, "step": 777 }, { "epoch": 0.3112, "grad_norm": 1.5312610864639282, "learning_rate": 8.065814666886954e-06, "loss": 0.3227, "step": 778 }, { "batch_size": 4, "epoch": 0.3112, "step": 778, "tokens_per_device": 4124 }, { "epoch": 0.3112, "loss_ce": 0.12386351823806763, "loss_lvr": 0.7666027545928955, "loss_mode_switch": 0.0, "loss_total": 0.20052379369735718, "step": 778 }, { "batch_size": 4, "epoch": 0.3112, "step": 778, "tokens_per_device": 1352 }, { "epoch": 0.3112, "loss_ce": 0.1985093504190445, "loss_lvr": 0.9850480556488037, "loss_mode_switch": 0.0, "loss_total": 0.29701414704322815, "step": 778 }, { "batch_size": 4, "epoch": 0.3112, "step": 778, "tokens_per_device": 1492 }, { "epoch": 0.3112, "loss_ce": 0.39409080147743225, "loss_lvr": 1.224949598312378, "loss_mode_switch": 0.0, "loss_total": 0.5165857672691345, "step": 778 }, { "batch_size": 1, "epoch": 0.3112, "step": 778, "tokens_per_device": 4767 }, { "epoch": 0.3112, "loss_ce": 0.029938388615846634, "loss_lvr": 0.531734049320221, "loss_mode_switch": 0.0, "loss_total": 0.08311179280281067, "step": 778 }, { "batch_size": 1, "epoch": 0.3112, "step": 778, "tokens_per_device": 4913 }, { "epoch": 0.3112, "loss_ce": 0.0007733202655799687, "loss_lvr": 0.19160817563533783, "loss_mode_switch": 0.0, "loss_total": 0.019934138283133507, "step": 778 }, { "batch_size": 4, "epoch": 0.3112, "step": 778, "tokens_per_device": 13644 }, { "epoch": 0.3112, "loss_ce": 0.04616308584809303, "loss_lvr": 1.470623254776001, "loss_mode_switch": 0.0, "loss_total": 0.1932254135608673, "step": 778 }, { "batch_size": 4, "epoch": 0.3112, "step": 778, "tokens_per_device": 1940 }, { "epoch": 0.3112, "loss_ce": 0.029436467215418816, "loss_lvr": 0.9281948208808899, "loss_mode_switch": 0.0, "loss_total": 0.12225595116615295, "step": 778 }, { "batch_size": 4, "epoch": 0.3112, "step": 778, "tokens_per_device": 4272 }, { "epoch": 0.3112, "loss_ce": 0.03233645483851433, "loss_lvr": 0.877984881401062, "loss_mode_switch": 0.0, "loss_total": 0.12013494968414307, "step": 778 }, { "epoch": 0.3116, "grad_norm": 1.4453394412994385, "learning_rate": 8.06069514420249e-06, "loss": 0.287, "step": 779 }, { "batch_size": 4, "epoch": 0.3116, "step": 779, "tokens_per_device": 4384 }, { "epoch": 0.3116, "loss_ce": 0.032996516674757004, "loss_lvr": 0.6137111783027649, "loss_mode_switch": 0.0, "loss_total": 0.09436763823032379, "step": 779 }, { "batch_size": 4, "epoch": 0.3116, "step": 779, "tokens_per_device": 4312 }, { "epoch": 0.3116, "loss_ce": 0.353448748588562, "loss_lvr": 0.597655713558197, "loss_mode_switch": 0.0, "loss_total": 0.4132143259048462, "step": 779 }, { "batch_size": 4, "epoch": 0.3116, "step": 779, "tokens_per_device": 5548 }, { "epoch": 0.3116, "loss_ce": 0.3258136808872223, "loss_lvr": 0.9409982562065125, "loss_mode_switch": 0.0, "loss_total": 0.41991350054740906, "step": 779 }, { "batch_size": 4, "epoch": 0.3116, "step": 779, "tokens_per_device": 5792 }, { "epoch": 0.3116, "loss_ce": 0.34627237915992737, "loss_lvr": 0.8525725603103638, "loss_mode_switch": 0.0, "loss_total": 0.4315296411514282, "step": 779 }, { "batch_size": 4, "epoch": 0.3116, "step": 779, "tokens_per_device": 5836 }, { "epoch": 0.3116, "loss_ce": 0.3657568693161011, "loss_lvr": 1.008057713508606, "loss_mode_switch": 0.0, "loss_total": 0.4665626287460327, "step": 779 }, { "batch_size": 4, "epoch": 0.3116, "step": 779, "tokens_per_device": 4708 }, { "epoch": 0.3116, "loss_ce": 0.09140071272850037, "loss_lvr": 0.6662372350692749, "loss_mode_switch": 0.0, "loss_total": 0.15802443027496338, "step": 779 }, { "batch_size": 4, "epoch": 0.3116, "step": 779, "tokens_per_device": 3652 }, { "epoch": 0.3116, "loss_ce": 0.6088711619377136, "loss_lvr": 1.0061527490615845, "loss_mode_switch": 0.0, "loss_total": 0.7094864249229431, "step": 779 }, { "batch_size": 1, "epoch": 0.3116, "step": 779, "tokens_per_device": 5108 }, { "epoch": 0.3116, "loss_ce": 0.01320011354982853, "loss_lvr": 0.29936906695365906, "loss_mode_switch": 0.0, "loss_total": 0.043137021362781525, "step": 779 }, { "epoch": 0.312, "grad_norm": 1.4505282640457153, "learning_rate": 8.055570484675252e-06, "loss": 0.3166, "step": 780 }, { "batch_size": 1, "epoch": 0.312, "step": 780, "tokens_per_device": 4911 }, { "epoch": 0.312, "loss_ce": 0.0690484270453453, "loss_lvr": 0.39313456416130066, "loss_mode_switch": 0.0, "loss_total": 0.10836188495159149, "step": 780 }, { "batch_size": 1, "epoch": 0.312, "step": 780, "tokens_per_device": 4656 }, { "epoch": 0.312, "loss_ce": 0.7182816863059998, "loss_lvr": 0.5684613585472107, "loss_mode_switch": 0.0, "loss_total": 0.7751278281211853, "step": 780 }, { "batch_size": 1, "epoch": 0.312, "step": 780, "tokens_per_device": 5188 }, { "epoch": 0.312, "loss_ce": 0.02061363309621811, "loss_lvr": 0.5370062589645386, "loss_mode_switch": 0.0, "loss_total": 0.07431425899267197, "step": 780 }, { "batch_size": 4, "epoch": 0.312, "step": 780, "tokens_per_device": 13680 }, { "epoch": 0.312, "loss_ce": 0.14796413481235504, "loss_lvr": 0.7515620589256287, "loss_mode_switch": 0.0, "loss_total": 0.2231203317642212, "step": 780 }, { "batch_size": 1, "epoch": 0.312, "step": 780, "tokens_per_device": 5115 }, { "epoch": 0.312, "loss_ce": 0.003919603768736124, "loss_lvr": 0.4358283579349518, "loss_mode_switch": 0.0, "loss_total": 0.04750244319438934, "step": 780 }, { "batch_size": 4, "epoch": 0.312, "step": 780, "tokens_per_device": 7872 }, { "epoch": 0.312, "loss_ce": 0.07464835792779922, "loss_lvr": 0.7172983884811401, "loss_mode_switch": 0.0, "loss_total": 0.14637818932533264, "step": 780 }, { "batch_size": 4, "epoch": 0.312, "step": 780, "tokens_per_device": 14680 }, { "epoch": 0.312, "loss_ce": 0.23469886183738708, "loss_lvr": 0.8549535274505615, "loss_mode_switch": 0.0, "loss_total": 0.32019421458244324, "step": 780 }, { "batch_size": 1, "epoch": 0.312, "step": 780, "tokens_per_device": 4863 }, { "epoch": 0.312, "loss_ce": 0.0026834458112716675, "loss_lvr": 0.20072340965270996, "loss_mode_switch": 0.0, "loss_total": 0.022755786776542664, "step": 780 }, { "epoch": 0.3124, "grad_norm": 1.4668912887573242, "learning_rate": 8.050440696906086e-06, "loss": 0.3276, "step": 781 }, { "batch_size": 4, "epoch": 0.3124, "step": 781, "tokens_per_device": 4092 }, { "epoch": 0.3124, "loss_ce": 0.4166108965873718, "loss_lvr": 1.0019282102584839, "loss_mode_switch": 0.0, "loss_total": 0.5168037414550781, "step": 781 }, { "batch_size": 4, "epoch": 0.3124, "step": 781, "tokens_per_device": 3776 }, { "epoch": 0.3124, "loss_ce": 0.35833224654197693, "loss_lvr": 0.9443263411521912, "loss_mode_switch": 0.0, "loss_total": 0.4527648687362671, "step": 781 }, { "batch_size": 1, "epoch": 0.3124, "step": 781, "tokens_per_device": 5416 }, { "epoch": 0.3124, "loss_ce": 0.026497984305024147, "loss_lvr": 0.6770460605621338, "loss_mode_switch": 0.0, "loss_total": 0.09420259296894073, "step": 781 }, { "batch_size": 1, "epoch": 0.3124, "step": 781, "tokens_per_device": 5338 }, { "epoch": 0.3124, "loss_ce": 0.29090192914009094, "loss_lvr": 0.5024121999740601, "loss_mode_switch": 0.0, "loss_total": 0.3411431610584259, "step": 781 }, { "batch_size": 4, "epoch": 0.3124, "step": 781, "tokens_per_device": 4364 }, { "epoch": 0.3124, "loss_ce": 0.40442517399787903, "loss_lvr": 0.9267956614494324, "loss_mode_switch": 0.0, "loss_total": 0.4971047341823578, "step": 781 }, { "batch_size": 4, "epoch": 0.3124, "step": 781, "tokens_per_device": 1484 }, { "epoch": 0.3124, "loss_ce": 0.35607558488845825, "loss_lvr": 0.9418620467185974, "loss_mode_switch": 0.0, "loss_total": 0.45026180148124695, "step": 781 }, { "batch_size": 4, "epoch": 0.3124, "step": 781, "tokens_per_device": 13332 }, { "epoch": 0.3124, "loss_ce": 0.48908498883247375, "loss_lvr": 0.8955743312835693, "loss_mode_switch": 0.0, "loss_total": 0.5786424279212952, "step": 781 }, { "batch_size": 4, "epoch": 0.3124, "step": 781, "tokens_per_device": 2564 }, { "epoch": 0.3124, "loss_ce": 0.4327910244464874, "loss_lvr": 0.656009316444397, "loss_mode_switch": 0.0, "loss_total": 0.4983919560909271, "step": 781 }, { "epoch": 0.3128, "grad_norm": 1.3740272521972656, "learning_rate": 8.045305789504446e-06, "loss": 0.3111, "step": 782 }, { "batch_size": 4, "epoch": 0.3128, "step": 782, "tokens_per_device": 1388 }, { "epoch": 0.3128, "loss_ce": 0.06934964656829834, "loss_lvr": 1.7148857116699219, "loss_mode_switch": 0.0, "loss_total": 0.2408382147550583, "step": 782 }, { "batch_size": 1, "epoch": 0.3128, "step": 782, "tokens_per_device": 5057 }, { "epoch": 0.3128, "loss_ce": 0.0013918166514486074, "loss_lvr": 0.6507560014724731, "loss_mode_switch": 0.0, "loss_total": 0.06646741181612015, "step": 782 }, { "batch_size": 1, "epoch": 0.3128, "step": 782, "tokens_per_device": 5164 }, { "epoch": 0.3128, "loss_ce": 0.0014637301210314035, "loss_lvr": 0.3015225827693939, "loss_mode_switch": 0.0, "loss_total": 0.03161599114537239, "step": 782 }, { "batch_size": 4, "epoch": 0.3128, "step": 782, "tokens_per_device": 3744 }, { "epoch": 0.3128, "loss_ce": 0.05624326691031456, "loss_lvr": 0.9893840551376343, "loss_mode_switch": 0.0, "loss_total": 0.1551816761493683, "step": 782 }, { "batch_size": 4, "epoch": 0.3128, "step": 782, "tokens_per_device": 1416 }, { "epoch": 0.3128, "loss_ce": 0.6339906454086304, "loss_lvr": 1.1821664571762085, "loss_mode_switch": 0.0, "loss_total": 0.7522072792053223, "step": 782 }, { "batch_size": 4, "epoch": 0.3128, "step": 782, "tokens_per_device": 3972 }, { "epoch": 0.3128, "loss_ce": 0.3239748179912567, "loss_lvr": 0.6384819149971008, "loss_mode_switch": 0.0, "loss_total": 0.3878230154514313, "step": 782 }, { "batch_size": 4, "epoch": 0.3128, "step": 782, "tokens_per_device": 1456 }, { "epoch": 0.3128, "loss_ce": 0.3499237895011902, "loss_lvr": 1.0283821821212769, "loss_mode_switch": 0.0, "loss_total": 0.45276200771331787, "step": 782 }, { "batch_size": 4, "epoch": 0.3128, "step": 782, "tokens_per_device": 2676 }, { "epoch": 0.3128, "loss_ce": 0.46931809186935425, "loss_lvr": 0.7631140351295471, "loss_mode_switch": 0.0, "loss_total": 0.5456295013427734, "step": 782 }, { "epoch": 0.3132, "grad_norm": 1.1682499647140503, "learning_rate": 8.040165771088377e-06, "loss": 0.2952, "step": 783 }, { "batch_size": 4, "epoch": 0.3132, "step": 783, "tokens_per_device": 8460 }, { "epoch": 0.3132, "loss_ce": 0.46556296944618225, "loss_lvr": 0.7333123087882996, "loss_mode_switch": 0.0, "loss_total": 0.5388941764831543, "step": 783 }, { "batch_size": 4, "epoch": 0.3132, "step": 783, "tokens_per_device": 4616 }, { "epoch": 0.3132, "loss_ce": 0.03032793290913105, "loss_lvr": 0.8595146536827087, "loss_mode_switch": 0.0, "loss_total": 0.11627940088510513, "step": 783 }, { "batch_size": 1, "epoch": 0.3132, "step": 783, "tokens_per_device": 5144 }, { "epoch": 0.3132, "loss_ce": 0.004337087273597717, "loss_lvr": 0.5252286195755005, "loss_mode_switch": 0.0, "loss_total": 0.056859951466321945, "step": 783 }, { "batch_size": 4, "epoch": 0.3132, "step": 783, "tokens_per_device": 6140 }, { "epoch": 0.3132, "loss_ce": 0.5158987045288086, "loss_lvr": 0.8221889734268188, "loss_mode_switch": 0.0, "loss_total": 0.5981175899505615, "step": 783 }, { "batch_size": 4, "epoch": 0.3132, "step": 783, "tokens_per_device": 2556 }, { "epoch": 0.3132, "loss_ce": 0.6485081315040588, "loss_lvr": 0.9992993474006653, "loss_mode_switch": 0.0, "loss_total": 0.7484380602836609, "step": 783 }, { "batch_size": 4, "epoch": 0.3132, "step": 783, "tokens_per_device": 10452 }, { "epoch": 0.3132, "loss_ce": 0.10190677642822266, "loss_lvr": 0.7829442620277405, "loss_mode_switch": 0.0, "loss_total": 0.1802012026309967, "step": 783 }, { "batch_size": 1, "epoch": 0.3132, "step": 783, "tokens_per_device": 4739 }, { "epoch": 0.3132, "loss_ce": 0.003269032808020711, "loss_lvr": 0.31713247299194336, "loss_mode_switch": 0.0, "loss_total": 0.03498227894306183, "step": 783 }, { "batch_size": 1, "epoch": 0.3132, "step": 783, "tokens_per_device": 6133 }, { "epoch": 0.3132, "loss_ce": 0.3732534945011139, "loss_lvr": 0.5180184841156006, "loss_mode_switch": 0.0, "loss_total": 0.4250553548336029, "step": 783 }, { "epoch": 0.3136, "grad_norm": 1.3272113800048828, "learning_rate": 8.035020650284507e-06, "loss": 0.3276, "step": 784 }, { "batch_size": 1, "epoch": 0.3136, "step": 784, "tokens_per_device": 4894 }, { "epoch": 0.3136, "loss_ce": 0.6112896203994751, "loss_lvr": 0.3764914274215698, "loss_mode_switch": 0.0, "loss_total": 0.648938775062561, "step": 784 }, { "batch_size": 1, "epoch": 0.3136, "step": 784, "tokens_per_device": 4874 }, { "epoch": 0.3136, "loss_ce": 0.1444431096315384, "loss_lvr": 0.21282191574573517, "loss_mode_switch": 0.0, "loss_total": 0.16572530567646027, "step": 784 }, { "batch_size": 4, "epoch": 0.3136, "step": 784, "tokens_per_device": 4456 }, { "epoch": 0.3136, "loss_ce": 0.028417976573109627, "loss_lvr": 0.7730942368507385, "loss_mode_switch": 0.0, "loss_total": 0.10572739690542221, "step": 784 }, { "batch_size": 4, "epoch": 0.3136, "step": 784, "tokens_per_device": 1620 }, { "epoch": 0.3136, "loss_ce": 0.41258469223976135, "loss_lvr": 1.0284761190414429, "loss_mode_switch": 0.0, "loss_total": 0.5154322981834412, "step": 784 }, { "batch_size": 1, "epoch": 0.3136, "step": 784, "tokens_per_device": 4812 }, { "epoch": 0.3136, "loss_ce": 0.13393616676330566, "loss_lvr": 0.6334644556045532, "loss_mode_switch": 0.0, "loss_total": 0.197282612323761, "step": 784 }, { "batch_size": 1, "epoch": 0.3136, "step": 784, "tokens_per_device": 4892 }, { "epoch": 0.3136, "loss_ce": 0.028867779299616814, "loss_lvr": 0.47448208928108215, "loss_mode_switch": 0.0, "loss_total": 0.0763159915804863, "step": 784 }, { "batch_size": 1, "epoch": 0.3136, "step": 784, "tokens_per_device": 4910 }, { "epoch": 0.3136, "loss_ce": 0.29097887873649597, "loss_lvr": 0.46787363290786743, "loss_mode_switch": 0.0, "loss_total": 0.33776623010635376, "step": 784 }, { "batch_size": 4, "epoch": 0.3136, "step": 784, "tokens_per_device": 5724 }, { "epoch": 0.3136, "loss_ce": 0.4789758324623108, "loss_lvr": 0.8426011800765991, "loss_mode_switch": 0.0, "loss_total": 0.5632359385490417, "step": 784 }, { "epoch": 0.314, "grad_norm": 1.5260875225067139, "learning_rate": 8.029870435728018e-06, "loss": 0.332, "step": 785 }, { "batch_size": 4, "epoch": 0.314, "step": 785, "tokens_per_device": 3832 }, { "epoch": 0.314, "loss_ce": 0.10494396090507507, "loss_lvr": 1.3310623168945312, "loss_mode_switch": 0.0, "loss_total": 0.2380501925945282, "step": 785 }, { "batch_size": 4, "epoch": 0.314, "step": 785, "tokens_per_device": 4060 }, { "epoch": 0.314, "loss_ce": 0.45012104511260986, "loss_lvr": 1.0027551651000977, "loss_mode_switch": 0.0, "loss_total": 0.5503965616226196, "step": 785 }, { "batch_size": 1, "epoch": 0.314, "step": 785, "tokens_per_device": 5118 }, { "epoch": 0.314, "loss_ce": 0.010152879171073437, "loss_lvr": 0.3765108585357666, "loss_mode_switch": 0.0, "loss_total": 0.04780396819114685, "step": 785 }, { "batch_size": 4, "epoch": 0.314, "step": 785, "tokens_per_device": 6128 }, { "epoch": 0.314, "loss_ce": 0.29367974400520325, "loss_lvr": 0.8827336430549622, "loss_mode_switch": 0.0, "loss_total": 0.3819531202316284, "step": 785 }, { "batch_size": 4, "epoch": 0.314, "step": 785, "tokens_per_device": 3304 }, { "epoch": 0.314, "loss_ce": 0.04108182713389397, "loss_lvr": 0.49617043137550354, "loss_mode_switch": 0.0, "loss_total": 0.09069886803627014, "step": 785 }, { "batch_size": 4, "epoch": 0.314, "step": 785, "tokens_per_device": 3824 }, { "epoch": 0.314, "loss_ce": 0.5156033635139465, "loss_lvr": 0.8975418210029602, "loss_mode_switch": 0.0, "loss_total": 0.6053575277328491, "step": 785 }, { "batch_size": 4, "epoch": 0.314, "step": 785, "tokens_per_device": 1748 }, { "epoch": 0.314, "loss_ce": 0.38760673999786377, "loss_lvr": 0.8952788710594177, "loss_mode_switch": 0.0, "loss_total": 0.477134644985199, "step": 785 }, { "batch_size": 4, "epoch": 0.314, "step": 785, "tokens_per_device": 3808 }, { "epoch": 0.314, "loss_ce": 0.3553310036659241, "loss_lvr": 1.1610174179077148, "loss_mode_switch": 0.0, "loss_total": 0.47143274545669556, "step": 785 }, { "epoch": 0.3144, "grad_norm": 1.4529354572296143, "learning_rate": 8.02471513606265e-06, "loss": 0.3527, "step": 786 }, { "batch_size": 4, "epoch": 0.3144, "step": 786, "tokens_per_device": 5980 }, { "epoch": 0.3144, "loss_ce": 0.09106417745351791, "loss_lvr": 0.8575789928436279, "loss_mode_switch": 0.0, "loss_total": 0.17682208120822906, "step": 786 }, { "batch_size": 1, "epoch": 0.3144, "step": 786, "tokens_per_device": 5130 }, { "epoch": 0.3144, "loss_ce": 0.007778956089168787, "loss_lvr": 0.6178905963897705, "loss_mode_switch": 0.0, "loss_total": 0.06956801563501358, "step": 786 }, { "batch_size": 4, "epoch": 0.3144, "step": 786, "tokens_per_device": 4268 }, { "epoch": 0.3144, "loss_ce": 0.5578654408454895, "loss_lvr": 1.0634475946426392, "loss_mode_switch": 0.0, "loss_total": 0.6642102003097534, "step": 786 }, { "batch_size": 4, "epoch": 0.3144, "step": 786, "tokens_per_device": 4676 }, { "epoch": 0.3144, "loss_ce": 0.015179069712758064, "loss_lvr": 0.6745390295982361, "loss_mode_switch": 0.0, "loss_total": 0.08263297379016876, "step": 786 }, { "batch_size": 4, "epoch": 0.3144, "step": 786, "tokens_per_device": 2992 }, { "epoch": 0.3144, "loss_ce": 0.07699935138225555, "loss_lvr": 1.3685503005981445, "loss_mode_switch": 0.0, "loss_total": 0.21385438740253448, "step": 786 }, { "batch_size": 4, "epoch": 0.3144, "step": 786, "tokens_per_device": 5924 }, { "epoch": 0.3144, "loss_ce": 0.006686609238386154, "loss_lvr": 0.5680690407752991, "loss_mode_switch": 0.0, "loss_total": 0.063493512570858, "step": 786 }, { "batch_size": 4, "epoch": 0.3144, "step": 786, "tokens_per_device": 6988 }, { "epoch": 0.3144, "loss_ce": 0.129451721906662, "loss_lvr": 0.6401857137680054, "loss_mode_switch": 0.0, "loss_total": 0.193470299243927, "step": 786 }, { "batch_size": 1, "epoch": 0.3144, "step": 786, "tokens_per_device": 5115 }, { "epoch": 0.3144, "loss_ce": 0.018750129267573357, "loss_lvr": 0.39033064246177673, "loss_mode_switch": 0.0, "loss_total": 0.05778319388628006, "step": 786 }, { "epoch": 0.3148, "grad_norm": 1.664100170135498, "learning_rate": 8.019554759940675e-06, "loss": 0.3431, "step": 787 }, { "batch_size": 4, "epoch": 0.3148, "step": 787, "tokens_per_device": 2568 }, { "epoch": 0.3148, "loss_ce": 0.23363764584064484, "loss_lvr": 1.02004873752594, "loss_mode_switch": 0.0, "loss_total": 0.3356425166130066, "step": 787 }, { "batch_size": 1, "epoch": 0.3148, "step": 787, "tokens_per_device": 4891 }, { "epoch": 0.3148, "loss_ce": 0.022223349660634995, "loss_lvr": 0.8468866944313049, "loss_mode_switch": 0.0, "loss_total": 0.10691201686859131, "step": 787 }, { "batch_size": 4, "epoch": 0.3148, "step": 787, "tokens_per_device": 5612 }, { "epoch": 0.3148, "loss_ce": 0.13197308778762817, "loss_lvr": 0.7448993921279907, "loss_mode_switch": 0.0, "loss_total": 0.2064630389213562, "step": 787 }, { "batch_size": 1, "epoch": 0.3148, "step": 787, "tokens_per_device": 4900 }, { "epoch": 0.3148, "loss_ce": 0.007746517658233643, "loss_lvr": 1.0060163736343384, "loss_mode_switch": 0.0, "loss_total": 0.10834815353155136, "step": 787 }, { "batch_size": 4, "epoch": 0.3148, "step": 787, "tokens_per_device": 10192 }, { "epoch": 0.3148, "loss_ce": 0.050730541348457336, "loss_lvr": 0.6374277472496033, "loss_mode_switch": 0.0, "loss_total": 0.11447332054376602, "step": 787 }, { "batch_size": 4, "epoch": 0.3148, "step": 787, "tokens_per_device": 3800 }, { "epoch": 0.3148, "loss_ce": 0.20912209153175354, "loss_lvr": 0.8938360810279846, "loss_mode_switch": 0.0, "loss_total": 0.2985056936740875, "step": 787 }, { "batch_size": 1, "epoch": 0.3148, "step": 787, "tokens_per_device": 4851 }, { "epoch": 0.3148, "loss_ce": 0.0021150978282094, "loss_lvr": 0.30298396944999695, "loss_mode_switch": 0.0, "loss_total": 0.03241349384188652, "step": 787 }, { "batch_size": 4, "epoch": 0.3148, "step": 787, "tokens_per_device": 4300 }, { "epoch": 0.3148, "loss_ce": 0.22076080739498138, "loss_lvr": 0.9388647079467773, "loss_mode_switch": 0.0, "loss_total": 0.31464728713035583, "step": 787 }, { "epoch": 0.3152, "grad_norm": 1.2970786094665527, "learning_rate": 8.01438931602288e-06, "loss": 0.2994, "step": 788 }, { "batch_size": 4, "epoch": 0.3152, "step": 788, "tokens_per_device": 3820 }, { "epoch": 0.3152, "loss_ce": 0.2581198513507843, "loss_lvr": 0.9674023985862732, "loss_mode_switch": 0.0, "loss_total": 0.3548600971698761, "step": 788 }, { "batch_size": 4, "epoch": 0.3152, "step": 788, "tokens_per_device": 7684 }, { "epoch": 0.3152, "loss_ce": 0.27067291736602783, "loss_lvr": 0.920458972454071, "loss_mode_switch": 0.0, "loss_total": 0.3627188205718994, "step": 788 }, { "batch_size": 1, "epoch": 0.3152, "step": 788, "tokens_per_device": 4987 }, { "epoch": 0.3152, "loss_ce": 0.3150736391544342, "loss_lvr": 0.4361705183982849, "loss_mode_switch": 0.0, "loss_total": 0.35869067907333374, "step": 788 }, { "batch_size": 4, "epoch": 0.3152, "step": 788, "tokens_per_device": 4788 }, { "epoch": 0.3152, "loss_ce": 0.2251049280166626, "loss_lvr": 0.8173936605453491, "loss_mode_switch": 0.0, "loss_total": 0.3068442940711975, "step": 788 }, { "batch_size": 4, "epoch": 0.3152, "step": 788, "tokens_per_device": 12320 }, { "epoch": 0.3152, "loss_ce": 0.11074154078960419, "loss_lvr": 0.953453004360199, "loss_mode_switch": 0.0, "loss_total": 0.20608684420585632, "step": 788 }, { "batch_size": 4, "epoch": 0.3152, "step": 788, "tokens_per_device": 4244 }, { "epoch": 0.3152, "loss_ce": 0.13919396698474884, "loss_lvr": 0.8703234791755676, "loss_mode_switch": 0.0, "loss_total": 0.2262263149023056, "step": 788 }, { "batch_size": 4, "epoch": 0.3152, "step": 788, "tokens_per_device": 2704 }, { "epoch": 0.3152, "loss_ce": 0.32948991656303406, "loss_lvr": 0.7623606324195862, "loss_mode_switch": 0.0, "loss_total": 0.40572598576545715, "step": 788 }, { "batch_size": 4, "epoch": 0.3152, "step": 788, "tokens_per_device": 4520 }, { "epoch": 0.3152, "loss_ce": 0.048864394426345825, "loss_lvr": 0.8870015740394592, "loss_mode_switch": 0.0, "loss_total": 0.137564554810524, "step": 788 }, { "epoch": 0.3156, "grad_norm": 1.1843204498291016, "learning_rate": 8.009218812978567e-06, "loss": 0.2866, "step": 789 }, { "batch_size": 4, "epoch": 0.3156, "step": 789, "tokens_per_device": 4632 }, { "epoch": 0.3156, "loss_ce": 0.1222996786236763, "loss_lvr": 0.8543441295623779, "loss_mode_switch": 0.0, "loss_total": 0.2077340930700302, "step": 789 }, { "batch_size": 1, "epoch": 0.3156, "step": 789, "tokens_per_device": 4573 }, { "epoch": 0.3156, "loss_ce": 0.018477698788046837, "loss_lvr": 0.5767391920089722, "loss_mode_switch": 0.0, "loss_total": 0.07615161687135696, "step": 789 }, { "batch_size": 4, "epoch": 0.3156, "step": 789, "tokens_per_device": 4372 }, { "epoch": 0.3156, "loss_ce": 0.27871933579444885, "loss_lvr": 0.8832406997680664, "loss_mode_switch": 0.0, "loss_total": 0.3670434057712555, "step": 789 }, { "batch_size": 4, "epoch": 0.3156, "step": 789, "tokens_per_device": 2616 }, { "epoch": 0.3156, "loss_ce": 0.3422122895717621, "loss_lvr": 1.1145111322402954, "loss_mode_switch": 0.0, "loss_total": 0.4536634087562561, "step": 789 }, { "batch_size": 1, "epoch": 0.3156, "step": 789, "tokens_per_device": 4899 }, { "epoch": 0.3156, "loss_ce": 0.11392301321029663, "loss_lvr": 0.8664553761482239, "loss_mode_switch": 0.0, "loss_total": 0.2005685567855835, "step": 789 }, { "batch_size": 4, "epoch": 0.3156, "step": 789, "tokens_per_device": 5348 }, { "epoch": 0.3156, "loss_ce": 0.17334483563899994, "loss_lvr": 0.8735727071762085, "loss_mode_switch": 0.0, "loss_total": 0.26070210337638855, "step": 789 }, { "batch_size": 1, "epoch": 0.3156, "step": 789, "tokens_per_device": 4439 }, { "epoch": 0.3156, "loss_ce": 0.08080350607633591, "loss_lvr": 0.8535992503166199, "loss_mode_switch": 0.0, "loss_total": 0.16616342961788177, "step": 789 }, { "batch_size": 1, "epoch": 0.3156, "step": 789, "tokens_per_device": 4926 }, { "epoch": 0.3156, "loss_ce": 0.6486548185348511, "loss_lvr": 0.41840609908103943, "loss_mode_switch": 0.0, "loss_total": 0.6904954314231873, "step": 789 }, { "epoch": 0.316, "grad_norm": 1.2755001783370972, "learning_rate": 8.004043259485519e-06, "loss": 0.2641, "step": 790 }, { "batch_size": 4, "epoch": 0.316, "step": 790, "tokens_per_device": 7952 }, { "epoch": 0.316, "loss_ce": 0.03659550100564957, "loss_lvr": 0.7461283206939697, "loss_mode_switch": 0.0, "loss_total": 0.11120833456516266, "step": 790 }, { "batch_size": 4, "epoch": 0.316, "step": 790, "tokens_per_device": 7136 }, { "epoch": 0.316, "loss_ce": 0.060632944107055664, "loss_lvr": 0.7206469178199768, "loss_mode_switch": 0.0, "loss_total": 0.13269764184951782, "step": 790 }, { "batch_size": 1, "epoch": 0.316, "step": 790, "tokens_per_device": 5030 }, { "epoch": 0.316, "loss_ce": 0.37752383947372437, "loss_lvr": 0.471630722284317, "loss_mode_switch": 0.0, "loss_total": 0.42468690872192383, "step": 790 }, { "batch_size": 4, "epoch": 0.316, "step": 790, "tokens_per_device": 6232 }, { "epoch": 0.316, "loss_ce": 0.11162376403808594, "loss_lvr": 0.48025384545326233, "loss_mode_switch": 0.0, "loss_total": 0.15964914858341217, "step": 790 }, { "batch_size": 4, "epoch": 0.316, "step": 790, "tokens_per_device": 2736 }, { "epoch": 0.316, "loss_ce": 0.5406695008277893, "loss_lvr": 0.9128429293632507, "loss_mode_switch": 0.0, "loss_total": 0.631953775882721, "step": 790 }, { "batch_size": 4, "epoch": 0.316, "step": 790, "tokens_per_device": 3772 }, { "epoch": 0.316, "loss_ce": 0.12888315320014954, "loss_lvr": 0.8219740390777588, "loss_mode_switch": 0.0, "loss_total": 0.21108055114746094, "step": 790 }, { "batch_size": 4, "epoch": 0.316, "step": 790, "tokens_per_device": 5244 }, { "epoch": 0.316, "loss_ce": 0.38603857159614563, "loss_lvr": 1.0025147199630737, "loss_mode_switch": 0.0, "loss_total": 0.4862900376319885, "step": 790 }, { "batch_size": 4, "epoch": 0.316, "step": 790, "tokens_per_device": 1484 }, { "epoch": 0.316, "loss_ce": 0.2545147240161896, "loss_lvr": 2.144383192062378, "loss_mode_switch": 0.0, "loss_total": 0.46895304322242737, "step": 790 }, { "epoch": 0.3164, "grad_norm": 1.2304539680480957, "learning_rate": 7.99886266423e-06, "loss": 0.2785, "step": 791 }, { "batch_size": 1, "epoch": 0.3164, "step": 791, "tokens_per_device": 4877 }, { "epoch": 0.3164, "loss_ce": 0.04052059352397919, "loss_lvr": 0.3283025026321411, "loss_mode_switch": 0.0, "loss_total": 0.07335084676742554, "step": 791 }, { "batch_size": 1, "epoch": 0.3164, "step": 791, "tokens_per_device": 4884 }, { "epoch": 0.3164, "loss_ce": 0.08703388273715973, "loss_lvr": 0.9036169052124023, "loss_mode_switch": 0.0, "loss_total": 0.17739558219909668, "step": 791 }, { "batch_size": 4, "epoch": 0.3164, "step": 791, "tokens_per_device": 4940 }, { "epoch": 0.3164, "loss_ce": 0.062325771898031235, "loss_lvr": 0.7759096026420593, "loss_mode_switch": 0.0, "loss_total": 0.13991673290729523, "step": 791 }, { "batch_size": 4, "epoch": 0.3164, "step": 791, "tokens_per_device": 4204 }, { "epoch": 0.3164, "loss_ce": 0.0187821164727211, "loss_lvr": 0.9575198292732239, "loss_mode_switch": 0.0, "loss_total": 0.11453410238027573, "step": 791 }, { "batch_size": 4, "epoch": 0.3164, "step": 791, "tokens_per_device": 3312 }, { "epoch": 0.3164, "loss_ce": 0.3955520987510681, "loss_lvr": 0.9016489386558533, "loss_mode_switch": 0.0, "loss_total": 0.4857169985771179, "step": 791 }, { "batch_size": 1, "epoch": 0.3164, "step": 791, "tokens_per_device": 4927 }, { "epoch": 0.3164, "loss_ce": 0.04955334588885307, "loss_lvr": 0.43948835134506226, "loss_mode_switch": 0.0, "loss_total": 0.09350217878818512, "step": 791 }, { "batch_size": 4, "epoch": 0.3164, "step": 791, "tokens_per_device": 3320 }, { "epoch": 0.3164, "loss_ce": 0.14619283378124237, "loss_lvr": 1.1500805616378784, "loss_mode_switch": 0.0, "loss_total": 0.2612009048461914, "step": 791 }, { "batch_size": 4, "epoch": 0.3164, "step": 791, "tokens_per_device": 1420 }, { "epoch": 0.3164, "loss_ce": 0.42333701252937317, "loss_lvr": 1.1205098628997803, "loss_mode_switch": 0.0, "loss_total": 0.5353879928588867, "step": 791 }, { "epoch": 0.3168, "grad_norm": 1.2155420780181885, "learning_rate": 7.993677035906734e-06, "loss": 0.2621, "step": 792 }, { "batch_size": 4, "epoch": 0.3168, "step": 792, "tokens_per_device": 4248 }, { "epoch": 0.3168, "loss_ce": 0.02948658913373947, "loss_lvr": 1.0308605432510376, "loss_mode_switch": 0.0, "loss_total": 0.13257265090942383, "step": 792 }, { "batch_size": 4, "epoch": 0.3168, "step": 792, "tokens_per_device": 5852 }, { "epoch": 0.3168, "loss_ce": 0.028677023947238922, "loss_lvr": 0.627170979976654, "loss_mode_switch": 0.0, "loss_total": 0.09139412641525269, "step": 792 }, { "batch_size": 4, "epoch": 0.3168, "step": 792, "tokens_per_device": 6448 }, { "epoch": 0.3168, "loss_ce": 0.42889171838760376, "loss_lvr": 0.7520018219947815, "loss_mode_switch": 0.0, "loss_total": 0.5040919184684753, "step": 792 }, { "batch_size": 4, "epoch": 0.3168, "step": 792, "tokens_per_device": 3808 }, { "epoch": 0.3168, "loss_ce": 0.2648151218891144, "loss_lvr": 0.914945125579834, "loss_mode_switch": 0.0, "loss_total": 0.3563096523284912, "step": 792 }, { "batch_size": 4, "epoch": 0.3168, "step": 792, "tokens_per_device": 3964 }, { "epoch": 0.3168, "loss_ce": 0.024286048486828804, "loss_lvr": 0.7053304314613342, "loss_mode_switch": 0.0, "loss_total": 0.0948190912604332, "step": 792 }, { "batch_size": 4, "epoch": 0.3168, "step": 792, "tokens_per_device": 4264 }, { "epoch": 0.3168, "loss_ce": 0.23133699595928192, "loss_lvr": 0.9509384632110596, "loss_mode_switch": 0.0, "loss_total": 0.3264308571815491, "step": 792 }, { "batch_size": 4, "epoch": 0.3168, "step": 792, "tokens_per_device": 2816 }, { "epoch": 0.3168, "loss_ce": 0.3342517912387848, "loss_lvr": 0.7432054877281189, "loss_mode_switch": 0.0, "loss_total": 0.40857234597206116, "step": 792 }, { "batch_size": 1, "epoch": 0.3168, "step": 792, "tokens_per_device": 5128 }, { "epoch": 0.3168, "loss_ce": 0.07135720551013947, "loss_lvr": 0.5014731884002686, "loss_mode_switch": 0.0, "loss_total": 0.1215045303106308, "step": 792 }, { "epoch": 0.3172, "grad_norm": 1.363958716392517, "learning_rate": 7.988486383218898e-06, "loss": 0.3555, "step": 793 }, { "batch_size": 4, "epoch": 0.3172, "step": 793, "tokens_per_device": 1884 }, { "epoch": 0.3172, "loss_ce": 0.37472954392433167, "loss_lvr": 0.8390862345695496, "loss_mode_switch": 0.0, "loss_total": 0.45863816142082214, "step": 793 }, { "batch_size": 1, "epoch": 0.3172, "step": 793, "tokens_per_device": 5123 }, { "epoch": 0.3172, "loss_ce": 0.0373886376619339, "loss_lvr": 0.3719123303890228, "loss_mode_switch": 0.0, "loss_total": 0.0745798721909523, "step": 793 }, { "batch_size": 4, "epoch": 0.3172, "step": 793, "tokens_per_device": 2524 }, { "epoch": 0.3172, "loss_ce": 0.4932747781276703, "loss_lvr": 1.043650507926941, "loss_mode_switch": 0.0, "loss_total": 0.597639799118042, "step": 793 }, { "batch_size": 4, "epoch": 0.3172, "step": 793, "tokens_per_device": 6228 }, { "epoch": 0.3172, "loss_ce": 0.14102305471897125, "loss_lvr": 0.782347559928894, "loss_mode_switch": 0.0, "loss_total": 0.21925780177116394, "step": 793 }, { "batch_size": 4, "epoch": 0.3172, "step": 793, "tokens_per_device": 5764 }, { "epoch": 0.3172, "loss_ce": 0.38971206545829773, "loss_lvr": 0.975670576095581, "loss_mode_switch": 0.0, "loss_total": 0.48727911710739136, "step": 793 }, { "batch_size": 1, "epoch": 0.3172, "step": 793, "tokens_per_device": 4747 }, { "epoch": 0.3172, "loss_ce": 0.02315680682659149, "loss_lvr": 0.33247315883636475, "loss_mode_switch": 0.0, "loss_total": 0.056404124945402145, "step": 793 }, { "batch_size": 4, "epoch": 0.3172, "step": 793, "tokens_per_device": 1944 }, { "epoch": 0.3172, "loss_ce": 0.035332124680280685, "loss_lvr": 0.9589737057685852, "loss_mode_switch": 0.0, "loss_total": 0.1312294900417328, "step": 793 }, { "batch_size": 1, "epoch": 0.3172, "step": 793, "tokens_per_device": 5041 }, { "epoch": 0.3172, "loss_ce": 0.001003642799332738, "loss_lvr": 0.5266454219818115, "loss_mode_switch": 0.0, "loss_total": 0.05366818606853485, "step": 793 }, { "epoch": 0.3176, "grad_norm": 1.3727569580078125, "learning_rate": 7.98329071487809e-06, "loss": 0.3066, "step": 794 }, { "batch_size": 4, "epoch": 0.3176, "step": 794, "tokens_per_device": 3204 }, { "epoch": 0.3176, "loss_ce": 0.2671196162700653, "loss_lvr": 0.8675902485847473, "loss_mode_switch": 0.0, "loss_total": 0.3538786470890045, "step": 794 }, { "batch_size": 4, "epoch": 0.3176, "step": 794, "tokens_per_device": 2536 }, { "epoch": 0.3176, "loss_ce": 0.16634562611579895, "loss_lvr": 1.122389316558838, "loss_mode_switch": 0.0, "loss_total": 0.2785845696926117, "step": 794 }, { "batch_size": 4, "epoch": 0.3176, "step": 794, "tokens_per_device": 6672 }, { "epoch": 0.3176, "loss_ce": 0.09191121906042099, "loss_lvr": 0.9536087512969971, "loss_mode_switch": 0.0, "loss_total": 0.1872721016407013, "step": 794 }, { "batch_size": 1, "epoch": 0.3176, "step": 794, "tokens_per_device": 4881 }, { "epoch": 0.3176, "loss_ce": 0.7015794515609741, "loss_lvr": 0.8045216202735901, "loss_mode_switch": 0.0, "loss_total": 0.7820315957069397, "step": 794 }, { "batch_size": 4, "epoch": 0.3176, "step": 794, "tokens_per_device": 4748 }, { "epoch": 0.3176, "loss_ce": 0.16206949949264526, "loss_lvr": 0.7393218874931335, "loss_mode_switch": 0.0, "loss_total": 0.23600170016288757, "step": 794 }, { "batch_size": 4, "epoch": 0.3176, "step": 794, "tokens_per_device": 5652 }, { "epoch": 0.3176, "loss_ce": 0.29375046491622925, "loss_lvr": 0.7725126147270203, "loss_mode_switch": 0.0, "loss_total": 0.3710017204284668, "step": 794 }, { "batch_size": 4, "epoch": 0.3176, "step": 794, "tokens_per_device": 4112 }, { "epoch": 0.3176, "loss_ce": 0.24842289090156555, "loss_lvr": 0.936781644821167, "loss_mode_switch": 0.0, "loss_total": 0.3421010673046112, "step": 794 }, { "batch_size": 4, "epoch": 0.3176, "step": 794, "tokens_per_device": 5744 }, { "epoch": 0.3176, "loss_ce": 0.07201937586069107, "loss_lvr": 1.0529141426086426, "loss_mode_switch": 0.0, "loss_total": 0.1773107945919037, "step": 794 }, { "epoch": 0.318, "grad_norm": 1.5152379274368286, "learning_rate": 7.978090039604342e-06, "loss": 0.3339, "step": 795 }, { "batch_size": 4, "epoch": 0.318, "step": 795, "tokens_per_device": 4820 }, { "epoch": 0.318, "loss_ce": 0.009385599754750729, "loss_lvr": 0.7956517934799194, "loss_mode_switch": 0.0, "loss_total": 0.08895078301429749, "step": 795 }, { "batch_size": 4, "epoch": 0.318, "step": 795, "tokens_per_device": 1680 }, { "epoch": 0.318, "loss_ce": 0.4493296444416046, "loss_lvr": 0.8966765403747559, "loss_mode_switch": 0.0, "loss_total": 0.5389972925186157, "step": 795 }, { "batch_size": 1, "epoch": 0.318, "step": 795, "tokens_per_device": 6086 }, { "epoch": 0.318, "loss_ce": 0.0015741455135867, "loss_lvr": 0.4563322067260742, "loss_mode_switch": 0.0, "loss_total": 0.047207366675138474, "step": 795 }, { "batch_size": 1, "epoch": 0.318, "step": 795, "tokens_per_device": 4901 }, { "epoch": 0.318, "loss_ce": 0.00199952838011086, "loss_lvr": 0.5283799171447754, "loss_mode_switch": 0.0, "loss_total": 0.05483752116560936, "step": 795 }, { "batch_size": 1, "epoch": 0.318, "step": 795, "tokens_per_device": 5115 }, { "epoch": 0.318, "loss_ce": 0.030361786484718323, "loss_lvr": 0.31461474299430847, "loss_mode_switch": 0.0, "loss_total": 0.06182326003909111, "step": 795 }, { "batch_size": 4, "epoch": 0.318, "step": 795, "tokens_per_device": 1404 }, { "epoch": 0.318, "loss_ce": 0.22696757316589355, "loss_lvr": 0.958221971988678, "loss_mode_switch": 0.0, "loss_total": 0.3227897882461548, "step": 795 }, { "batch_size": 4, "epoch": 0.318, "step": 795, "tokens_per_device": 4892 }, { "epoch": 0.318, "loss_ce": 0.0794546827673912, "loss_lvr": 0.760521650314331, "loss_mode_switch": 0.0, "loss_total": 0.15550684928894043, "step": 795 }, { "batch_size": 4, "epoch": 0.318, "step": 795, "tokens_per_device": 3952 }, { "epoch": 0.318, "loss_ce": 0.011995907872915268, "loss_lvr": 1.2243623733520508, "loss_mode_switch": 0.0, "loss_total": 0.13443215191364288, "step": 795 }, { "epoch": 0.3184, "grad_norm": 1.3915989398956299, "learning_rate": 7.972884366126072e-06, "loss": 0.3149, "step": 796 }, { "batch_size": 4, "epoch": 0.3184, "step": 796, "tokens_per_device": 1732 }, { "epoch": 0.3184, "loss_ce": 0.4171212315559387, "loss_lvr": 1.1610547304153442, "loss_mode_switch": 0.0, "loss_total": 0.533226728439331, "step": 796 }, { "batch_size": 1, "epoch": 0.3184, "step": 796, "tokens_per_device": 4893 }, { "epoch": 0.3184, "loss_ce": 0.09228547662496567, "loss_lvr": 0.7836769223213196, "loss_mode_switch": 0.0, "loss_total": 0.17065316438674927, "step": 796 }, { "batch_size": 4, "epoch": 0.3184, "step": 796, "tokens_per_device": 3768 }, { "epoch": 0.3184, "loss_ce": 0.16909663379192352, "loss_lvr": 0.9761347770690918, "loss_mode_switch": 0.0, "loss_total": 0.266710102558136, "step": 796 }, { "batch_size": 1, "epoch": 0.3184, "step": 796, "tokens_per_device": 4883 }, { "epoch": 0.3184, "loss_ce": 0.0016174056800082326, "loss_lvr": 0.2787502110004425, "loss_mode_switch": 0.0, "loss_total": 0.02949242666363716, "step": 796 }, { "batch_size": 4, "epoch": 0.3184, "step": 796, "tokens_per_device": 4500 }, { "epoch": 0.3184, "loss_ce": 0.10655628144741058, "loss_lvr": 1.1428861618041992, "loss_mode_switch": 0.0, "loss_total": 0.22084489464759827, "step": 796 }, { "batch_size": 4, "epoch": 0.3184, "step": 796, "tokens_per_device": 5656 }, { "epoch": 0.3184, "loss_ce": 0.038328867405653, "loss_lvr": 0.8733739852905273, "loss_mode_switch": 0.0, "loss_total": 0.12566626071929932, "step": 796 }, { "batch_size": 4, "epoch": 0.3184, "step": 796, "tokens_per_device": 4220 }, { "epoch": 0.3184, "loss_ce": 0.10013213753700256, "loss_lvr": 1.1763755083084106, "loss_mode_switch": 0.0, "loss_total": 0.21776968240737915, "step": 796 }, { "batch_size": 1, "epoch": 0.3184, "step": 796, "tokens_per_device": 4615 }, { "epoch": 0.3184, "loss_ce": 0.6734946966171265, "loss_lvr": 0.6689086556434631, "loss_mode_switch": 0.0, "loss_total": 0.7403855323791504, "step": 796 }, { "epoch": 0.3188, "grad_norm": 1.4297791719436646, "learning_rate": 7.967673703180096e-06, "loss": 0.2825, "step": 797 }, { "batch_size": 4, "epoch": 0.3188, "step": 797, "tokens_per_device": 6004 }, { "epoch": 0.3188, "loss_ce": 0.11530158668756485, "loss_lvr": 1.0687803030014038, "loss_mode_switch": 0.0, "loss_total": 0.2221796214580536, "step": 797 }, { "batch_size": 4, "epoch": 0.3188, "step": 797, "tokens_per_device": 1536 }, { "epoch": 0.3188, "loss_ce": 0.16201895475387573, "loss_lvr": 1.9127767086029053, "loss_mode_switch": 0.0, "loss_total": 0.3532966375350952, "step": 797 }, { "batch_size": 1, "epoch": 0.3188, "step": 797, "tokens_per_device": 4901 }, { "epoch": 0.3188, "loss_ce": 0.0015670402208343148, "loss_lvr": 0.773568332195282, "loss_mode_switch": 0.0, "loss_total": 0.07892388105392456, "step": 797 }, { "batch_size": 4, "epoch": 0.3188, "step": 797, "tokens_per_device": 4280 }, { "epoch": 0.3188, "loss_ce": 0.016327084973454475, "loss_lvr": 0.8575736880302429, "loss_mode_switch": 0.0, "loss_total": 0.1020844504237175, "step": 797 }, { "batch_size": 4, "epoch": 0.3188, "step": 797, "tokens_per_device": 4236 }, { "epoch": 0.3188, "loss_ce": 0.16126494109630585, "loss_lvr": 0.8617474436759949, "loss_mode_switch": 0.0, "loss_total": 0.2474396824836731, "step": 797 }, { "batch_size": 1, "epoch": 0.3188, "step": 797, "tokens_per_device": 5214 }, { "epoch": 0.3188, "loss_ce": 0.01771526411175728, "loss_lvr": 0.20635832846164703, "loss_mode_switch": 0.0, "loss_total": 0.03835109621286392, "step": 797 }, { "batch_size": 4, "epoch": 0.3188, "step": 797, "tokens_per_device": 3804 }, { "epoch": 0.3188, "loss_ce": 0.45413875579833984, "loss_lvr": 1.1809515953063965, "loss_mode_switch": 0.0, "loss_total": 0.5722339153289795, "step": 797 }, { "batch_size": 4, "epoch": 0.3188, "step": 797, "tokens_per_device": 3952 }, { "epoch": 0.3188, "loss_ce": 0.2491735965013504, "loss_lvr": 0.8982402086257935, "loss_mode_switch": 0.0, "loss_total": 0.33899760246276855, "step": 797 }, { "epoch": 0.3192, "grad_norm": 1.2671161890029907, "learning_rate": 7.962458059511607e-06, "loss": 0.3041, "step": 798 }, { "batch_size": 1, "epoch": 0.3192, "step": 798, "tokens_per_device": 4857 }, { "epoch": 0.3192, "loss_ce": 0.011556930840015411, "loss_lvr": 0.46618103981018066, "loss_mode_switch": 0.0, "loss_total": 0.05817503482103348, "step": 798 }, { "batch_size": 1, "epoch": 0.3192, "step": 798, "tokens_per_device": 5171 }, { "epoch": 0.3192, "loss_ce": 0.01445030327886343, "loss_lvr": 0.589467465877533, "loss_mode_switch": 0.0, "loss_total": 0.07339704781770706, "step": 798 }, { "batch_size": 1, "epoch": 0.3192, "step": 798, "tokens_per_device": 4966 }, { "epoch": 0.3192, "loss_ce": 0.017284220084547997, "loss_lvr": 0.5297243595123291, "loss_mode_switch": 0.0, "loss_total": 0.07025665789842606, "step": 798 }, { "batch_size": 4, "epoch": 0.3192, "step": 798, "tokens_per_device": 4404 }, { "epoch": 0.3192, "loss_ce": 0.22999174892902374, "loss_lvr": 0.867376983165741, "loss_mode_switch": 0.0, "loss_total": 0.31672945618629456, "step": 798 }, { "batch_size": 4, "epoch": 0.3192, "step": 798, "tokens_per_device": 4020 }, { "epoch": 0.3192, "loss_ce": 0.29670122265815735, "loss_lvr": 0.9925004839897156, "loss_mode_switch": 0.0, "loss_total": 0.3959512710571289, "step": 798 }, { "batch_size": 4, "epoch": 0.3192, "step": 798, "tokens_per_device": 3828 }, { "epoch": 0.3192, "loss_ce": 0.3156125247478485, "loss_lvr": 0.9764264225959778, "loss_mode_switch": 0.0, "loss_total": 0.41325515508651733, "step": 798 }, { "batch_size": 4, "epoch": 0.3192, "step": 798, "tokens_per_device": 3168 }, { "epoch": 0.3192, "loss_ce": 0.49139687418937683, "loss_lvr": 1.0859706401824951, "loss_mode_switch": 0.0, "loss_total": 0.5999939441680908, "step": 798 }, { "batch_size": 1, "epoch": 0.3192, "step": 798, "tokens_per_device": 4900 }, { "epoch": 0.3192, "loss_ce": 0.20815737545490265, "loss_lvr": 1.2558618783950806, "loss_mode_switch": 0.0, "loss_total": 0.3337435722351074, "step": 798 }, { "epoch": 0.3196, "grad_norm": 1.274861216545105, "learning_rate": 7.957237443874148e-06, "loss": 0.2714, "step": 799 }, { "batch_size": 4, "epoch": 0.3196, "step": 799, "tokens_per_device": 3748 }, { "epoch": 0.3196, "loss_ce": 0.09254854172468185, "loss_lvr": 0.9160829782485962, "loss_mode_switch": 0.0, "loss_total": 0.18415683507919312, "step": 799 }, { "batch_size": 4, "epoch": 0.3196, "step": 799, "tokens_per_device": 4188 }, { "epoch": 0.3196, "loss_ce": 0.4799293577671051, "loss_lvr": 0.9927970170974731, "loss_mode_switch": 0.0, "loss_total": 0.5792090892791748, "step": 799 }, { "batch_size": 1, "epoch": 0.3196, "step": 799, "tokens_per_device": 5110 }, { "epoch": 0.3196, "loss_ce": 0.005464158486574888, "loss_lvr": 0.7870413661003113, "loss_mode_switch": 0.0, "loss_total": 0.08416830003261566, "step": 799 }, { "batch_size": 1, "epoch": 0.3196, "step": 799, "tokens_per_device": 5045 }, { "epoch": 0.3196, "loss_ce": 0.011686371639370918, "loss_lvr": 0.4624609649181366, "loss_mode_switch": 0.0, "loss_total": 0.05793246626853943, "step": 799 }, { "batch_size": 4, "epoch": 0.3196, "step": 799, "tokens_per_device": 4232 }, { "epoch": 0.3196, "loss_ce": 0.04704667255282402, "loss_lvr": 0.7726696729660034, "loss_mode_switch": 0.0, "loss_total": 0.12431363761425018, "step": 799 }, { "batch_size": 1, "epoch": 0.3196, "step": 799, "tokens_per_device": 4874 }, { "epoch": 0.3196, "loss_ce": 0.00697947246953845, "loss_lvr": 0.5811092257499695, "loss_mode_switch": 0.0, "loss_total": 0.06509039551019669, "step": 799 }, { "batch_size": 4, "epoch": 0.3196, "step": 799, "tokens_per_device": 6248 }, { "epoch": 0.3196, "loss_ce": 0.14704658091068268, "loss_lvr": 1.0584712028503418, "loss_mode_switch": 0.0, "loss_total": 0.25289368629455566, "step": 799 }, { "batch_size": 4, "epoch": 0.3196, "step": 799, "tokens_per_device": 4260 }, { "epoch": 0.3196, "loss_ce": 0.46535009145736694, "loss_lvr": 0.9974678158760071, "loss_mode_switch": 0.0, "loss_total": 0.5650968551635742, "step": 799 }, { "epoch": 0.32, "grad_norm": 1.3514552116394043, "learning_rate": 7.952011865029614e-06, "loss": 0.3171, "step": 800 }, { "batch_size": 4, "epoch": 0.32, "step": 800, "tokens_per_device": 6112 }, { "epoch": 0.32, "loss_ce": 0.09483121335506439, "loss_lvr": 0.7596367001533508, "loss_mode_switch": 0.0, "loss_total": 0.17079487442970276, "step": 800 }, { "batch_size": 4, "epoch": 0.32, "step": 800, "tokens_per_device": 7608 }, { "epoch": 0.32, "loss_ce": 0.24677123129367828, "loss_lvr": 0.7359144687652588, "loss_mode_switch": 0.0, "loss_total": 0.3203626871109009, "step": 800 }, { "batch_size": 4, "epoch": 0.32, "step": 800, "tokens_per_device": 13948 }, { "epoch": 0.32, "loss_ce": 0.2877906560897827, "loss_lvr": 0.9073808789253235, "loss_mode_switch": 0.0, "loss_total": 0.37852874398231506, "step": 800 }, { "batch_size": 4, "epoch": 0.32, "step": 800, "tokens_per_device": 5996 }, { "epoch": 0.32, "loss_ce": 0.44505998492240906, "loss_lvr": 0.7850897312164307, "loss_mode_switch": 0.0, "loss_total": 0.5235689878463745, "step": 800 }, { "batch_size": 4, "epoch": 0.32, "step": 800, "tokens_per_device": 5976 }, { "epoch": 0.32, "loss_ce": 0.2136429399251938, "loss_lvr": 0.7514075040817261, "loss_mode_switch": 0.0, "loss_total": 0.2887836992740631, "step": 800 }, { "batch_size": 4, "epoch": 0.32, "step": 800, "tokens_per_device": 4276 }, { "epoch": 0.32, "loss_ce": 0.674684464931488, "loss_lvr": 1.0258963108062744, "loss_mode_switch": 0.0, "loss_total": 0.7772740721702576, "step": 800 }, { "batch_size": 4, "epoch": 0.32, "step": 800, "tokens_per_device": 5264 }, { "epoch": 0.32, "loss_ce": 0.06929117441177368, "loss_lvr": 0.821465790271759, "loss_mode_switch": 0.0, "loss_total": 0.15143775939941406, "step": 800 }, { "batch_size": 1, "epoch": 0.32, "step": 800, "tokens_per_device": 5094 }, { "epoch": 0.32, "loss_ce": 0.1045277789235115, "loss_lvr": 0.5421822667121887, "loss_mode_switch": 0.0, "loss_total": 0.15874600410461426, "step": 800 }, { "epoch": 0.3204, "grad_norm": 1.4721215963363647, "learning_rate": 7.946781331748226e-06, "loss": 0.3287, "step": 801 }, { "batch_size": 4, "epoch": 0.3204, "step": 801, "tokens_per_device": 3840 }, { "epoch": 0.3204, "loss_ce": 0.12488032877445221, "loss_lvr": 1.0229295492172241, "loss_mode_switch": 0.0, "loss_total": 0.22717328369617462, "step": 801 }, { "batch_size": 1, "epoch": 0.3204, "step": 801, "tokens_per_device": 5115 }, { "epoch": 0.3204, "loss_ce": 0.08350531756877899, "loss_lvr": 0.5529621839523315, "loss_mode_switch": 0.0, "loss_total": 0.13880154490470886, "step": 801 }, { "batch_size": 1, "epoch": 0.3204, "step": 801, "tokens_per_device": 4819 }, { "epoch": 0.3204, "loss_ce": 0.000459791102912277, "loss_lvr": 0.3482765555381775, "loss_mode_switch": 0.0, "loss_total": 0.03528744727373123, "step": 801 }, { "batch_size": 1, "epoch": 0.3204, "step": 801, "tokens_per_device": 5140 }, { "epoch": 0.3204, "loss_ce": 0.028161361813545227, "loss_lvr": 0.4265356659889221, "loss_mode_switch": 0.0, "loss_total": 0.07081492990255356, "step": 801 }, { "batch_size": 4, "epoch": 0.3204, "step": 801, "tokens_per_device": 1416 }, { "epoch": 0.3204, "loss_ce": 0.08072931319475174, "loss_lvr": 1.7091437578201294, "loss_mode_switch": 0.0, "loss_total": 0.25164368748664856, "step": 801 }, { "batch_size": 4, "epoch": 0.3204, "step": 801, "tokens_per_device": 3352 }, { "epoch": 0.3204, "loss_ce": 0.298336386680603, "loss_lvr": 0.7941769361495972, "loss_mode_switch": 0.0, "loss_total": 0.3777540922164917, "step": 801 }, { "batch_size": 4, "epoch": 0.3204, "step": 801, "tokens_per_device": 6496 }, { "epoch": 0.3204, "loss_ce": 0.2512800693511963, "loss_lvr": 0.6982936263084412, "loss_mode_switch": 0.0, "loss_total": 0.32110944390296936, "step": 801 }, { "batch_size": 4, "epoch": 0.3204, "step": 801, "tokens_per_device": 2192 }, { "epoch": 0.3204, "loss_ce": 1.0050463676452637, "loss_lvr": 0.8976860642433167, "loss_mode_switch": 0.0, "loss_total": 1.0948150157928467, "step": 801 }, { "epoch": 0.3208, "grad_norm": 1.4186689853668213, "learning_rate": 7.941545852808523e-06, "loss": 0.3109, "step": 802 }, { "batch_size": 4, "epoch": 0.3208, "step": 802, "tokens_per_device": 1572 }, { "epoch": 0.3208, "loss_ce": 0.24762701988220215, "loss_lvr": 1.0584423542022705, "loss_mode_switch": 0.0, "loss_total": 0.3534712493419647, "step": 802 }, { "batch_size": 4, "epoch": 0.3208, "step": 802, "tokens_per_device": 1640 }, { "epoch": 0.3208, "loss_ce": 0.1195845752954483, "loss_lvr": 0.9346299767494202, "loss_mode_switch": 0.0, "loss_total": 0.2130475640296936, "step": 802 }, { "batch_size": 4, "epoch": 0.3208, "step": 802, "tokens_per_device": 2284 }, { "epoch": 0.3208, "loss_ce": 0.532374918460846, "loss_lvr": 1.1123956441879272, "loss_mode_switch": 0.0, "loss_total": 0.6436144709587097, "step": 802 }, { "batch_size": 4, "epoch": 0.3208, "step": 802, "tokens_per_device": 5968 }, { "epoch": 0.3208, "loss_ce": 0.1736765205860138, "loss_lvr": 0.9208979606628418, "loss_mode_switch": 0.0, "loss_total": 0.26576632261276245, "step": 802 }, { "batch_size": 4, "epoch": 0.3208, "step": 802, "tokens_per_device": 4608 }, { "epoch": 0.3208, "loss_ce": 0.2365984320640564, "loss_lvr": 0.7454665899276733, "loss_mode_switch": 0.0, "loss_total": 0.3111450970172882, "step": 802 }, { "batch_size": 4, "epoch": 0.3208, "step": 802, "tokens_per_device": 4268 }, { "epoch": 0.3208, "loss_ce": 0.6261048316955566, "loss_lvr": 0.9618207812309265, "loss_mode_switch": 0.0, "loss_total": 0.7222869396209717, "step": 802 }, { "batch_size": 1, "epoch": 0.3208, "step": 802, "tokens_per_device": 4932 }, { "epoch": 0.3208, "loss_ce": 1.8827037811279297, "loss_lvr": 0.3413250148296356, "loss_mode_switch": 0.0, "loss_total": 1.9168362617492676, "step": 802 }, { "batch_size": 4, "epoch": 0.3208, "step": 802, "tokens_per_device": 2720 }, { "epoch": 0.3208, "loss_ce": 0.2698810398578644, "loss_lvr": 0.7220965623855591, "loss_mode_switch": 0.0, "loss_total": 0.3420906960964203, "step": 802 }, { "epoch": 0.3212, "grad_norm": 1.3870738744735718, "learning_rate": 7.936305436997343e-06, "loss": 0.3384, "step": 803 }, { "batch_size": 4, "epoch": 0.3212, "step": 803, "tokens_per_device": 2700 }, { "epoch": 0.3212, "loss_ce": 0.2793735861778259, "loss_lvr": 0.740201473236084, "loss_mode_switch": 0.0, "loss_total": 0.3533937335014343, "step": 803 }, { "batch_size": 4, "epoch": 0.3212, "step": 803, "tokens_per_device": 5476 }, { "epoch": 0.3212, "loss_ce": 0.14740951359272003, "loss_lvr": 1.7702525854110718, "loss_mode_switch": 0.0, "loss_total": 0.324434757232666, "step": 803 }, { "batch_size": 4, "epoch": 0.3212, "step": 803, "tokens_per_device": 15712 }, { "epoch": 0.3212, "loss_ce": 0.055751875042915344, "loss_lvr": 0.4957982301712036, "loss_mode_switch": 0.0, "loss_total": 0.10533170402050018, "step": 803 }, { "batch_size": 4, "epoch": 0.3212, "step": 803, "tokens_per_device": 4396 }, { "epoch": 0.3212, "loss_ce": 0.21764250099658966, "loss_lvr": 0.7488201856613159, "loss_mode_switch": 0.0, "loss_total": 0.292524516582489, "step": 803 }, { "batch_size": 1, "epoch": 0.3212, "step": 803, "tokens_per_device": 4898 }, { "epoch": 0.3212, "loss_ce": 0.12804225087165833, "loss_lvr": 0.6099680066108704, "loss_mode_switch": 0.0, "loss_total": 0.18903905153274536, "step": 803 }, { "batch_size": 4, "epoch": 0.3212, "step": 803, "tokens_per_device": 5172 }, { "epoch": 0.3212, "loss_ce": 0.10386421531438828, "loss_lvr": 0.7874492406845093, "loss_mode_switch": 0.0, "loss_total": 0.18260914087295532, "step": 803 }, { "batch_size": 4, "epoch": 0.3212, "step": 803, "tokens_per_device": 4900 }, { "epoch": 0.3212, "loss_ce": 0.38098040223121643, "loss_lvr": 0.9772324562072754, "loss_mode_switch": 0.0, "loss_total": 0.47870364785194397, "step": 803 }, { "batch_size": 4, "epoch": 0.3212, "step": 803, "tokens_per_device": 1520 }, { "epoch": 0.3212, "loss_ce": 0.11944673955440521, "loss_lvr": 0.912741482257843, "loss_mode_switch": 0.0, "loss_total": 0.21072089672088623, "step": 803 }, { "epoch": 0.3216, "grad_norm": 1.3588318824768066, "learning_rate": 7.931060093109807e-06, "loss": 0.3083, "step": 804 }, { "batch_size": 4, "epoch": 0.3216, "step": 804, "tokens_per_device": 3872 }, { "epoch": 0.3216, "loss_ce": 0.10348591208457947, "loss_lvr": 0.6214718222618103, "loss_mode_switch": 0.0, "loss_total": 0.16563309729099274, "step": 804 }, { "batch_size": 4, "epoch": 0.3216, "step": 804, "tokens_per_device": 5092 }, { "epoch": 0.3216, "loss_ce": 0.4149850010871887, "loss_lvr": 0.965563178062439, "loss_mode_switch": 0.0, "loss_total": 0.5115413069725037, "step": 804 }, { "batch_size": 1, "epoch": 0.3216, "step": 804, "tokens_per_device": 5047 }, { "epoch": 0.3216, "loss_ce": 0.23462390899658203, "loss_lvr": 0.31793200969696045, "loss_mode_switch": 0.0, "loss_total": 0.26641711592674255, "step": 804 }, { "batch_size": 4, "epoch": 0.3216, "step": 804, "tokens_per_device": 4248 }, { "epoch": 0.3216, "loss_ce": 0.3336259424686432, "loss_lvr": 1.1057718992233276, "loss_mode_switch": 0.0, "loss_total": 0.44420313835144043, "step": 804 }, { "batch_size": 4, "epoch": 0.3216, "step": 804, "tokens_per_device": 5228 }, { "epoch": 0.3216, "loss_ce": 0.20656488835811615, "loss_lvr": 0.7240423560142517, "loss_mode_switch": 0.0, "loss_total": 0.2789691090583801, "step": 804 }, { "batch_size": 4, "epoch": 0.3216, "step": 804, "tokens_per_device": 3880 }, { "epoch": 0.3216, "loss_ce": 0.2225147932767868, "loss_lvr": 0.7868092656135559, "loss_mode_switch": 0.0, "loss_total": 0.3011957108974457, "step": 804 }, { "batch_size": 1, "epoch": 0.3216, "step": 804, "tokens_per_device": 5106 }, { "epoch": 0.3216, "loss_ce": 0.2829737365245819, "loss_lvr": 0.6140186190605164, "loss_mode_switch": 0.0, "loss_total": 0.3443756103515625, "step": 804 }, { "batch_size": 1, "epoch": 0.3216, "step": 804, "tokens_per_device": 5171 }, { "epoch": 0.3216, "loss_ce": 0.008042942732572556, "loss_lvr": 0.805819571018219, "loss_mode_switch": 0.0, "loss_total": 0.08862489461898804, "step": 804 }, { "epoch": 0.322, "grad_norm": 1.358381986618042, "learning_rate": 7.925809829949312e-06, "loss": 0.3118, "step": 805 }, { "batch_size": 4, "epoch": 0.322, "step": 805, "tokens_per_device": 3328 }, { "epoch": 0.322, "loss_ce": 0.6168456673622131, "loss_lvr": 0.783150315284729, "loss_mode_switch": 0.0, "loss_total": 0.6951606869697571, "step": 805 }, { "batch_size": 1, "epoch": 0.322, "step": 805, "tokens_per_device": 4898 }, { "epoch": 0.322, "loss_ce": 0.040044043213129044, "loss_lvr": 0.33753934502601624, "loss_mode_switch": 0.0, "loss_total": 0.07379797846078873, "step": 805 }, { "batch_size": 1, "epoch": 0.322, "step": 805, "tokens_per_device": 5606 }, { "epoch": 0.322, "loss_ce": 0.034592851996421814, "loss_lvr": 0.36022263765335083, "loss_mode_switch": 0.0, "loss_total": 0.07061511278152466, "step": 805 }, { "batch_size": 4, "epoch": 0.322, "step": 805, "tokens_per_device": 6096 }, { "epoch": 0.322, "loss_ce": 0.36348992586135864, "loss_lvr": 0.7606479525566101, "loss_mode_switch": 0.0, "loss_total": 0.43955472111701965, "step": 805 }, { "batch_size": 1, "epoch": 0.322, "step": 805, "tokens_per_device": 4924 }, { "epoch": 0.322, "loss_ce": 0.07332603633403778, "loss_lvr": 0.33924010396003723, "loss_mode_switch": 0.0, "loss_total": 0.10725004971027374, "step": 805 }, { "batch_size": 4, "epoch": 0.322, "step": 805, "tokens_per_device": 4380 }, { "epoch": 0.322, "loss_ce": 0.485054612159729, "loss_lvr": 1.0368667840957642, "loss_mode_switch": 0.0, "loss_total": 0.5887413024902344, "step": 805 }, { "batch_size": 4, "epoch": 0.322, "step": 805, "tokens_per_device": 7052 }, { "epoch": 0.322, "loss_ce": 0.1438656747341156, "loss_lvr": 0.8592853546142578, "loss_mode_switch": 0.0, "loss_total": 0.2297942042350769, "step": 805 }, { "batch_size": 4, "epoch": 0.322, "step": 805, "tokens_per_device": 6352 }, { "epoch": 0.322, "loss_ce": 0.289469838142395, "loss_lvr": 0.7741676568984985, "loss_mode_switch": 0.0, "loss_total": 0.36688661575317383, "step": 805 }, { "epoch": 0.3224, "grad_norm": 1.4033153057098389, "learning_rate": 7.920554656327509e-06, "loss": 0.3317, "step": 806 }, { "batch_size": 4, "epoch": 0.3224, "step": 806, "tokens_per_device": 6984 }, { "epoch": 0.3224, "loss_ce": 0.31552496552467346, "loss_lvr": 0.7296955585479736, "loss_mode_switch": 0.0, "loss_total": 0.3884945213794708, "step": 806 }, { "batch_size": 4, "epoch": 0.3224, "step": 806, "tokens_per_device": 5748 }, { "epoch": 0.3224, "loss_ce": 0.16792844235897064, "loss_lvr": 0.8828181624412537, "loss_mode_switch": 0.0, "loss_total": 0.2562102675437927, "step": 806 }, { "batch_size": 4, "epoch": 0.3224, "step": 806, "tokens_per_device": 2652 }, { "epoch": 0.3224, "loss_ce": 0.3565397560596466, "loss_lvr": 0.9325249195098877, "loss_mode_switch": 0.0, "loss_total": 0.4497922658920288, "step": 806 }, { "batch_size": 4, "epoch": 0.3224, "step": 806, "tokens_per_device": 4316 }, { "epoch": 0.3224, "loss_ce": 0.086143858730793, "loss_lvr": 0.8220384120941162, "loss_mode_switch": 0.0, "loss_total": 0.16834770143032074, "step": 806 }, { "batch_size": 4, "epoch": 0.3224, "step": 806, "tokens_per_device": 4304 }, { "epoch": 0.3224, "loss_ce": 0.19565565884113312, "loss_lvr": 0.8319636583328247, "loss_mode_switch": 0.0, "loss_total": 0.27885201573371887, "step": 806 }, { "batch_size": 4, "epoch": 0.3224, "step": 806, "tokens_per_device": 3852 }, { "epoch": 0.3224, "loss_ce": 0.08216676861047745, "loss_lvr": 0.8294317126274109, "loss_mode_switch": 0.0, "loss_total": 0.16510993242263794, "step": 806 }, { "batch_size": 1, "epoch": 0.3224, "step": 806, "tokens_per_device": 5115 }, { "epoch": 0.3224, "loss_ce": 0.27544525265693665, "loss_lvr": 0.23306424915790558, "loss_mode_switch": 0.0, "loss_total": 0.29875168204307556, "step": 806 }, { "batch_size": 4, "epoch": 0.3224, "step": 806, "tokens_per_device": 10980 }, { "epoch": 0.3224, "loss_ce": 0.11008502542972565, "loss_lvr": 0.8853592276573181, "loss_mode_switch": 0.0, "loss_total": 0.19862094521522522, "step": 806 }, { "epoch": 0.3228, "grad_norm": 1.3909012079238892, "learning_rate": 7.915294581064287e-06, "loss": 0.3428, "step": 807 }, { "batch_size": 1, "epoch": 0.3228, "step": 807, "tokens_per_device": 5150 }, { "epoch": 0.3228, "loss_ce": 0.12667807936668396, "loss_lvr": 0.46303123235702515, "loss_mode_switch": 0.0, "loss_total": 0.17298120260238647, "step": 807 }, { "batch_size": 4, "epoch": 0.3228, "step": 807, "tokens_per_device": 4336 }, { "epoch": 0.3228, "loss_ce": 0.6545668244361877, "loss_lvr": 0.9591989517211914, "loss_mode_switch": 0.0, "loss_total": 0.7504867315292358, "step": 807 }, { "batch_size": 1, "epoch": 0.3228, "step": 807, "tokens_per_device": 4894 }, { "epoch": 0.3228, "loss_ce": 0.3264937698841095, "loss_lvr": 0.3837161958217621, "loss_mode_switch": 0.0, "loss_total": 0.36486539244651794, "step": 807 }, { "batch_size": 1, "epoch": 0.3228, "step": 807, "tokens_per_device": 5251 }, { "epoch": 0.3228, "loss_ce": 0.017684366554021835, "loss_lvr": 0.7570775151252747, "loss_mode_switch": 0.0, "loss_total": 0.09339211881160736, "step": 807 }, { "batch_size": 1, "epoch": 0.3228, "step": 807, "tokens_per_device": 5138 }, { "epoch": 0.3228, "loss_ce": 0.002541144611313939, "loss_lvr": 0.7254397869110107, "loss_mode_switch": 0.0, "loss_total": 0.0750851258635521, "step": 807 }, { "batch_size": 4, "epoch": 0.3228, "step": 807, "tokens_per_device": 2720 }, { "epoch": 0.3228, "loss_ce": 0.3680560886859894, "loss_lvr": 0.6391518115997314, "loss_mode_switch": 0.0, "loss_total": 0.4319712817668915, "step": 807 }, { "batch_size": 4, "epoch": 0.3228, "step": 807, "tokens_per_device": 3952 }, { "epoch": 0.3228, "loss_ce": 0.32705414295196533, "loss_lvr": 1.1231151819229126, "loss_mode_switch": 0.0, "loss_total": 0.4393656551837921, "step": 807 }, { "batch_size": 4, "epoch": 0.3228, "step": 807, "tokens_per_device": 10628 }, { "epoch": 0.3228, "loss_ce": 0.23277831077575684, "loss_lvr": 0.7011392712593079, "loss_mode_switch": 0.0, "loss_total": 0.3028922379016876, "step": 807 }, { "epoch": 0.3232, "grad_norm": 1.2935585975646973, "learning_rate": 7.910029612987766e-06, "loss": 0.2738, "step": 808 }, { "batch_size": 4, "epoch": 0.3232, "step": 808, "tokens_per_device": 6048 }, { "epoch": 0.3232, "loss_ce": 0.11974260956048965, "loss_lvr": 0.8292722105979919, "loss_mode_switch": 0.0, "loss_total": 0.20266982913017273, "step": 808 }, { "batch_size": 1, "epoch": 0.3232, "step": 808, "tokens_per_device": 4909 }, { "epoch": 0.3232, "loss_ce": 0.03509697690606117, "loss_lvr": 0.2727271318435669, "loss_mode_switch": 0.0, "loss_total": 0.0623696893453598, "step": 808 }, { "batch_size": 4, "epoch": 0.3232, "step": 808, "tokens_per_device": 4424 }, { "epoch": 0.3232, "loss_ce": 0.15486589074134827, "loss_lvr": 0.9255624413490295, "loss_mode_switch": 0.0, "loss_total": 0.24742212891578674, "step": 808 }, { "batch_size": 1, "epoch": 0.3232, "step": 808, "tokens_per_device": 5092 }, { "epoch": 0.3232, "loss_ce": 0.007899170741438866, "loss_lvr": 0.48703667521476746, "loss_mode_switch": 0.0, "loss_total": 0.0566028356552124, "step": 808 }, { "batch_size": 1, "epoch": 0.3232, "step": 808, "tokens_per_device": 5071 }, { "epoch": 0.3232, "loss_ce": 0.013216719962656498, "loss_lvr": 0.3600303530693054, "loss_mode_switch": 0.0, "loss_total": 0.049219753593206406, "step": 808 }, { "batch_size": 4, "epoch": 0.3232, "step": 808, "tokens_per_device": 8372 }, { "epoch": 0.3232, "loss_ce": 0.1531306654214859, "loss_lvr": 0.9088001251220703, "loss_mode_switch": 0.0, "loss_total": 0.24401068687438965, "step": 808 }, { "batch_size": 4, "epoch": 0.3232, "step": 808, "tokens_per_device": 3896 }, { "epoch": 0.3232, "loss_ce": 0.3469315767288208, "loss_lvr": 1.1138688325881958, "loss_mode_switch": 0.0, "loss_total": 0.45831847190856934, "step": 808 }, { "batch_size": 1, "epoch": 0.3232, "step": 808, "tokens_per_device": 4892 }, { "epoch": 0.3232, "loss_ce": 0.004567294847220182, "loss_lvr": 0.36809802055358887, "loss_mode_switch": 0.0, "loss_total": 0.041377097368240356, "step": 808 }, { "epoch": 0.3236, "grad_norm": 1.2732659578323364, "learning_rate": 7.90475976093428e-06, "loss": 0.278, "step": 809 }, { "batch_size": 4, "epoch": 0.3236, "step": 809, "tokens_per_device": 3804 }, { "epoch": 0.3236, "loss_ce": 0.4026569128036499, "loss_lvr": 0.8281043767929077, "loss_mode_switch": 0.0, "loss_total": 0.4854673445224762, "step": 809 }, { "batch_size": 4, "epoch": 0.3236, "step": 809, "tokens_per_device": 4076 }, { "epoch": 0.3236, "loss_ce": 0.744877278804779, "loss_lvr": 0.7688166499137878, "loss_mode_switch": 0.0, "loss_total": 0.8217589259147644, "step": 809 }, { "batch_size": 1, "epoch": 0.3236, "step": 809, "tokens_per_device": 4906 }, { "epoch": 0.3236, "loss_ce": 0.006213146727532148, "loss_lvr": 0.5494534969329834, "loss_mode_switch": 0.0, "loss_total": 0.061158496886491776, "step": 809 }, { "batch_size": 4, "epoch": 0.3236, "step": 809, "tokens_per_device": 3864 }, { "epoch": 0.3236, "loss_ce": 0.30451393127441406, "loss_lvr": 1.2856628894805908, "loss_mode_switch": 0.0, "loss_total": 0.4330802261829376, "step": 809 }, { "batch_size": 4, "epoch": 0.3236, "step": 809, "tokens_per_device": 4644 }, { "epoch": 0.3236, "loss_ce": 0.05668600648641586, "loss_lvr": 0.8899165987968445, "loss_mode_switch": 0.0, "loss_total": 0.14567767083644867, "step": 809 }, { "batch_size": 1, "epoch": 0.3236, "step": 809, "tokens_per_device": 4669 }, { "epoch": 0.3236, "loss_ce": 0.0033048647455871105, "loss_lvr": 0.6337495446205139, "loss_mode_switch": 0.0, "loss_total": 0.06667982041835785, "step": 809 }, { "batch_size": 4, "epoch": 0.3236, "step": 809, "tokens_per_device": 6996 }, { "epoch": 0.3236, "loss_ce": 0.18555782735347748, "loss_lvr": 0.8093422651290894, "loss_mode_switch": 0.0, "loss_total": 0.2664920687675476, "step": 809 }, { "batch_size": 4, "epoch": 0.3236, "step": 809, "tokens_per_device": 4304 }, { "epoch": 0.3236, "loss_ce": 0.242567241191864, "loss_lvr": 0.7258670926094055, "loss_mode_switch": 0.0, "loss_total": 0.31515395641326904, "step": 809 }, { "epoch": 0.324, "grad_norm": 1.3629121780395508, "learning_rate": 7.89948503374835e-06, "loss": 0.313, "step": 810 }, { "batch_size": 1, "epoch": 0.324, "step": 810, "tokens_per_device": 4909 }, { "epoch": 0.324, "loss_ce": 0.48860105872154236, "loss_lvr": 0.6761118173599243, "loss_mode_switch": 0.0, "loss_total": 0.5562122464179993, "step": 810 }, { "batch_size": 4, "epoch": 0.324, "step": 810, "tokens_per_device": 4268 }, { "epoch": 0.324, "loss_ce": 0.16026565432548523, "loss_lvr": 0.752493679523468, "loss_mode_switch": 0.0, "loss_total": 0.2355150282382965, "step": 810 }, { "batch_size": 4, "epoch": 0.324, "step": 810, "tokens_per_device": 3856 }, { "epoch": 0.324, "loss_ce": 0.36889126896858215, "loss_lvr": 1.0775971412658691, "loss_mode_switch": 0.0, "loss_total": 0.47665098309516907, "step": 810 }, { "batch_size": 1, "epoch": 0.324, "step": 810, "tokens_per_device": 4861 }, { "epoch": 0.324, "loss_ce": 0.05894111469388008, "loss_lvr": 0.29355937242507935, "loss_mode_switch": 0.0, "loss_total": 0.0882970541715622, "step": 810 }, { "batch_size": 4, "epoch": 0.324, "step": 810, "tokens_per_device": 4220 }, { "epoch": 0.324, "loss_ce": 0.11280865967273712, "loss_lvr": 1.1992523670196533, "loss_mode_switch": 0.0, "loss_total": 0.23273390531539917, "step": 810 }, { "batch_size": 4, "epoch": 0.324, "step": 810, "tokens_per_device": 13592 }, { "epoch": 0.324, "loss_ce": 0.05569540336728096, "loss_lvr": 0.7643532752990723, "loss_mode_switch": 0.0, "loss_total": 0.1321307271718979, "step": 810 }, { "batch_size": 4, "epoch": 0.324, "step": 810, "tokens_per_device": 5132 }, { "epoch": 0.324, "loss_ce": 0.021784130483865738, "loss_lvr": 0.8598312735557556, "loss_mode_switch": 0.0, "loss_total": 0.107767254114151, "step": 810 }, { "batch_size": 4, "epoch": 0.324, "step": 810, "tokens_per_device": 4000 }, { "epoch": 0.324, "loss_ce": 0.42071476578712463, "loss_lvr": 0.9487953186035156, "loss_mode_switch": 0.0, "loss_total": 0.5155943036079407, "step": 810 }, { "epoch": 0.3244, "grad_norm": 1.6218163967132568, "learning_rate": 7.89420544028269e-06, "loss": 0.3185, "step": 811 }, { "batch_size": 4, "epoch": 0.3244, "step": 811, "tokens_per_device": 7952 }, { "epoch": 0.3244, "loss_ce": 0.4925045371055603, "loss_lvr": 0.9770412445068359, "loss_mode_switch": 0.0, "loss_total": 0.5902086496353149, "step": 811 }, { "batch_size": 1, "epoch": 0.3244, "step": 811, "tokens_per_device": 4919 }, { "epoch": 0.3244, "loss_ce": 0.7294282913208008, "loss_lvr": 0.7743719220161438, "loss_mode_switch": 0.0, "loss_total": 0.8068654537200928, "step": 811 }, { "batch_size": 1, "epoch": 0.3244, "step": 811, "tokens_per_device": 4909 }, { "epoch": 0.3244, "loss_ce": 0.0792345404624939, "loss_lvr": 0.2972812354564667, "loss_mode_switch": 0.0, "loss_total": 0.10896266251802444, "step": 811 }, { "batch_size": 4, "epoch": 0.3244, "step": 811, "tokens_per_device": 4132 }, { "epoch": 0.3244, "loss_ce": 0.05928493291139603, "loss_lvr": 0.9957274794578552, "loss_mode_switch": 0.0, "loss_total": 0.15885767340660095, "step": 811 }, { "batch_size": 4, "epoch": 0.3244, "step": 811, "tokens_per_device": 7300 }, { "epoch": 0.3244, "loss_ce": 0.10114538669586182, "loss_lvr": 0.7496371269226074, "loss_mode_switch": 0.0, "loss_total": 0.17610910534858704, "step": 811 }, { "batch_size": 4, "epoch": 0.3244, "step": 811, "tokens_per_device": 4260 }, { "epoch": 0.3244, "loss_ce": 0.26380041241645813, "loss_lvr": 0.9325080513954163, "loss_mode_switch": 0.0, "loss_total": 0.35705122351646423, "step": 811 }, { "batch_size": 4, "epoch": 0.3244, "step": 811, "tokens_per_device": 3888 }, { "epoch": 0.3244, "loss_ce": 0.08400062471628189, "loss_lvr": 0.982471764087677, "loss_mode_switch": 0.0, "loss_total": 0.1822478026151657, "step": 811 }, { "batch_size": 4, "epoch": 0.3244, "step": 811, "tokens_per_device": 8068 }, { "epoch": 0.3244, "loss_ce": 0.09711145609617233, "loss_lvr": 0.6645820736885071, "loss_mode_switch": 0.0, "loss_total": 0.16356965899467468, "step": 811 }, { "epoch": 0.3248, "grad_norm": 1.3611547946929932, "learning_rate": 7.888920989398174e-06, "loss": 0.2927, "step": 812 }, { "batch_size": 4, "epoch": 0.3248, "step": 812, "tokens_per_device": 4852 }, { "epoch": 0.3248, "loss_ce": 0.11868418008089066, "loss_lvr": 0.7061521410942078, "loss_mode_switch": 0.0, "loss_total": 0.18929940462112427, "step": 812 }, { "batch_size": 4, "epoch": 0.3248, "step": 812, "tokens_per_device": 8572 }, { "epoch": 0.3248, "loss_ce": 0.06748249381780624, "loss_lvr": 0.7427656054496765, "loss_mode_switch": 0.0, "loss_total": 0.14175905287265778, "step": 812 }, { "batch_size": 4, "epoch": 0.3248, "step": 812, "tokens_per_device": 1384 }, { "epoch": 0.3248, "loss_ce": 0.8911433815956116, "loss_lvr": 1.116043210029602, "loss_mode_switch": 0.0, "loss_total": 1.002747654914856, "step": 812 }, { "batch_size": 1, "epoch": 0.3248, "step": 812, "tokens_per_device": 4889 }, { "epoch": 0.3248, "loss_ce": 0.3978538513183594, "loss_lvr": 0.1386401355266571, "loss_mode_switch": 0.0, "loss_total": 0.41171786189079285, "step": 812 }, { "batch_size": 1, "epoch": 0.3248, "step": 812, "tokens_per_device": 4863 }, { "epoch": 0.3248, "loss_ce": 0.011748021468520164, "loss_lvr": 0.18245038390159607, "loss_mode_switch": 0.0, "loss_total": 0.02999306097626686, "step": 812 }, { "batch_size": 1, "epoch": 0.3248, "step": 812, "tokens_per_device": 5182 }, { "epoch": 0.3248, "loss_ce": 0.004767275415360928, "loss_lvr": 0.2889959216117859, "loss_mode_switch": 0.0, "loss_total": 0.03366686776280403, "step": 812 }, { "batch_size": 4, "epoch": 0.3248, "step": 812, "tokens_per_device": 4880 }, { "epoch": 0.3248, "loss_ce": 0.18418078124523163, "loss_lvr": 0.7538847327232361, "loss_mode_switch": 0.0, "loss_total": 0.2595692574977875, "step": 812 }, { "batch_size": 4, "epoch": 0.3248, "step": 812, "tokens_per_device": 3340 }, { "epoch": 0.3248, "loss_ce": 0.2670123279094696, "loss_lvr": 0.8331156969070435, "loss_mode_switch": 0.0, "loss_total": 0.3503239154815674, "step": 812 }, { "epoch": 0.3252, "grad_norm": 1.6107465028762817, "learning_rate": 7.883631689963831e-06, "loss": 0.3125, "step": 813 }, { "batch_size": 4, "epoch": 0.3252, "step": 813, "tokens_per_device": 4868 }, { "epoch": 0.3252, "loss_ce": 0.17956791818141937, "loss_lvr": 0.7747396230697632, "loss_mode_switch": 0.0, "loss_total": 0.257041871547699, "step": 813 }, { "batch_size": 4, "epoch": 0.3252, "step": 813, "tokens_per_device": 4552 }, { "epoch": 0.3252, "loss_ce": 0.11149974167346954, "loss_lvr": 0.7893356084823608, "loss_mode_switch": 0.0, "loss_total": 0.1904332935810089, "step": 813 }, { "batch_size": 1, "epoch": 0.3252, "step": 813, "tokens_per_device": 4907 }, { "epoch": 0.3252, "loss_ce": 0.1156635656952858, "loss_lvr": 0.2648226320743561, "loss_mode_switch": 0.0, "loss_total": 0.14214582741260529, "step": 813 }, { "batch_size": 4, "epoch": 0.3252, "step": 813, "tokens_per_device": 2636 }, { "epoch": 0.3252, "loss_ce": 0.3208310306072235, "loss_lvr": 0.7751087546348572, "loss_mode_switch": 0.0, "loss_total": 0.3983418941497803, "step": 813 }, { "batch_size": 4, "epoch": 0.3252, "step": 813, "tokens_per_device": 6828 }, { "epoch": 0.3252, "loss_ce": 0.15626391768455505, "loss_lvr": 0.7379692196846008, "loss_mode_switch": 0.0, "loss_total": 0.23006084561347961, "step": 813 }, { "batch_size": 4, "epoch": 0.3252, "step": 813, "tokens_per_device": 4240 }, { "epoch": 0.3252, "loss_ce": 0.04793264716863632, "loss_lvr": 0.5382492542266846, "loss_mode_switch": 0.0, "loss_total": 0.10175757110118866, "step": 813 }, { "batch_size": 4, "epoch": 0.3252, "step": 813, "tokens_per_device": 6160 }, { "epoch": 0.3252, "loss_ce": 0.19983507692813873, "loss_lvr": 0.8782318234443665, "loss_mode_switch": 0.0, "loss_total": 0.2876582741737366, "step": 813 }, { "batch_size": 4, "epoch": 0.3252, "step": 813, "tokens_per_device": 3800 }, { "epoch": 0.3252, "loss_ce": 0.08438479900360107, "loss_lvr": 1.079188585281372, "loss_mode_switch": 0.0, "loss_total": 0.19230365753173828, "step": 813 }, { "epoch": 0.3256, "grad_norm": 1.3611764907836914, "learning_rate": 7.878337550856829e-06, "loss": 0.2724, "step": 814 }, { "batch_size": 4, "epoch": 0.3256, "step": 814, "tokens_per_device": 5528 }, { "epoch": 0.3256, "loss_ce": 0.04298454523086548, "loss_lvr": 0.8907801508903503, "loss_mode_switch": 0.0, "loss_total": 0.13206255435943604, "step": 814 }, { "batch_size": 4, "epoch": 0.3256, "step": 814, "tokens_per_device": 1468 }, { "epoch": 0.3256, "loss_ce": 0.8721607327461243, "loss_lvr": 0.9527223706245422, "loss_mode_switch": 0.0, "loss_total": 0.967432975769043, "step": 814 }, { "batch_size": 4, "epoch": 0.3256, "step": 814, "tokens_per_device": 4552 }, { "epoch": 0.3256, "loss_ce": 0.1793074756860733, "loss_lvr": 1.148299217224121, "loss_mode_switch": 0.0, "loss_total": 0.2941373884677887, "step": 814 }, { "batch_size": 1, "epoch": 0.3256, "step": 814, "tokens_per_device": 6750 }, { "epoch": 0.3256, "loss_ce": 1.1637492179870605, "loss_lvr": 0.2842857539653778, "loss_mode_switch": 0.0, "loss_total": 1.1921777725219727, "step": 814 }, { "batch_size": 1, "epoch": 0.3256, "step": 814, "tokens_per_device": 4875 }, { "epoch": 0.3256, "loss_ce": 0.12603484094142914, "loss_lvr": 1.4514412879943848, "loss_mode_switch": 0.0, "loss_total": 0.2711789608001709, "step": 814 }, { "batch_size": 1, "epoch": 0.3256, "step": 814, "tokens_per_device": 5026 }, { "epoch": 0.3256, "loss_ce": 0.08886072039604187, "loss_lvr": 1.1430091857910156, "loss_mode_switch": 0.0, "loss_total": 0.20316164195537567, "step": 814 }, { "batch_size": 4, "epoch": 0.3256, "step": 814, "tokens_per_device": 4604 }, { "epoch": 0.3256, "loss_ce": 0.02706410363316536, "loss_lvr": 0.7147050499916077, "loss_mode_switch": 0.0, "loss_total": 0.09853461384773254, "step": 814 }, { "batch_size": 4, "epoch": 0.3256, "step": 814, "tokens_per_device": 1276 }, { "epoch": 0.3256, "loss_ce": 0.1546678990125656, "loss_lvr": 0.9589345455169678, "loss_mode_switch": 0.0, "loss_total": 0.25056135654449463, "step": 814 }, { "epoch": 0.326, "grad_norm": 1.4095441102981567, "learning_rate": 7.873038580962453e-06, "loss": 0.3111, "step": 815 }, { "batch_size": 4, "epoch": 0.326, "step": 815, "tokens_per_device": 1840 }, { "epoch": 0.326, "loss_ce": 0.12276751548051834, "loss_lvr": 0.966854453086853, "loss_mode_switch": 0.0, "loss_total": 0.21945296227931976, "step": 815 }, { "batch_size": 4, "epoch": 0.326, "step": 815, "tokens_per_device": 4204 }, { "epoch": 0.326, "loss_ce": 0.12577979266643524, "loss_lvr": 0.9019038081169128, "loss_mode_switch": 0.0, "loss_total": 0.21597017347812653, "step": 815 }, { "batch_size": 4, "epoch": 0.326, "step": 815, "tokens_per_device": 1528 }, { "epoch": 0.326, "loss_ce": 0.3651212155818939, "loss_lvr": 1.1640607118606567, "loss_mode_switch": 0.0, "loss_total": 0.48152729868888855, "step": 815 }, { "batch_size": 4, "epoch": 0.326, "step": 815, "tokens_per_device": 2768 }, { "epoch": 0.326, "loss_ce": 0.30644816160202026, "loss_lvr": 0.966789722442627, "loss_mode_switch": 0.0, "loss_total": 0.40312713384628296, "step": 815 }, { "batch_size": 4, "epoch": 0.326, "step": 815, "tokens_per_device": 5700 }, { "epoch": 0.326, "loss_ce": 0.3811728060245514, "loss_lvr": 0.5858861804008484, "loss_mode_switch": 0.0, "loss_total": 0.4397614300251007, "step": 815 }, { "batch_size": 4, "epoch": 0.326, "step": 815, "tokens_per_device": 5948 }, { "epoch": 0.326, "loss_ce": 0.2543327808380127, "loss_lvr": 0.8670434951782227, "loss_mode_switch": 0.0, "loss_total": 0.3410371243953705, "step": 815 }, { "batch_size": 1, "epoch": 0.326, "step": 815, "tokens_per_device": 5280 }, { "epoch": 0.326, "loss_ce": 0.009852733463048935, "loss_lvr": 0.618252158164978, "loss_mode_switch": 0.0, "loss_total": 0.07167795300483704, "step": 815 }, { "batch_size": 4, "epoch": 0.326, "step": 815, "tokens_per_device": 3956 }, { "epoch": 0.326, "loss_ce": 0.14899790287017822, "loss_lvr": 0.9445402026176453, "loss_mode_switch": 0.0, "loss_total": 0.24345192313194275, "step": 815 }, { "epoch": 0.3264, "grad_norm": 1.2064069509506226, "learning_rate": 7.867734789174104e-06, "loss": 0.2903, "step": 816 }, { "batch_size": 4, "epoch": 0.3264, "step": 816, "tokens_per_device": 3900 }, { "epoch": 0.3264, "loss_ce": 0.4035540819168091, "loss_lvr": 0.9358345866203308, "loss_mode_switch": 0.0, "loss_total": 0.49713754653930664, "step": 816 }, { "batch_size": 4, "epoch": 0.3264, "step": 816, "tokens_per_device": 5236 }, { "epoch": 0.3264, "loss_ce": 0.6163262128829956, "loss_lvr": 0.6649092435836792, "loss_mode_switch": 0.0, "loss_total": 0.6828171610832214, "step": 816 }, { "batch_size": 4, "epoch": 0.3264, "step": 816, "tokens_per_device": 14192 }, { "epoch": 0.3264, "loss_ce": 0.8203714489936829, "loss_lvr": 0.7438762784004211, "loss_mode_switch": 0.0, "loss_total": 0.8947590589523315, "step": 816 }, { "batch_size": 4, "epoch": 0.3264, "step": 816, "tokens_per_device": 7644 }, { "epoch": 0.3264, "loss_ce": 0.5875803232192993, "loss_lvr": 0.536817729473114, "loss_mode_switch": 0.0, "loss_total": 0.6412621140480042, "step": 816 }, { "batch_size": 1, "epoch": 0.3264, "step": 816, "tokens_per_device": 5181 }, { "epoch": 0.3264, "loss_ce": 0.03692558780312538, "loss_lvr": 0.38894709944725037, "loss_mode_switch": 0.0, "loss_total": 0.07582029700279236, "step": 816 }, { "batch_size": 4, "epoch": 0.3264, "step": 816, "tokens_per_device": 1492 }, { "epoch": 0.3264, "loss_ce": 0.4160829782485962, "loss_lvr": 0.9555174708366394, "loss_mode_switch": 0.0, "loss_total": 0.5116347074508667, "step": 816 }, { "batch_size": 4, "epoch": 0.3264, "step": 816, "tokens_per_device": 4224 }, { "epoch": 0.3264, "loss_ce": 0.0057799918577075005, "loss_lvr": 0.7413709759712219, "loss_mode_switch": 0.0, "loss_total": 0.07991708815097809, "step": 816 }, { "batch_size": 4, "epoch": 0.3264, "step": 816, "tokens_per_device": 15112 }, { "epoch": 0.3264, "loss_ce": 0.3233910799026489, "loss_lvr": 0.3468640446662903, "loss_mode_switch": 0.0, "loss_total": 0.3580774962902069, "step": 816 }, { "epoch": 0.3268, "grad_norm": 1.2772948741912842, "learning_rate": 7.86242618439327e-06, "loss": 0.3391, "step": 817 }, { "batch_size": 1, "epoch": 0.3268, "step": 817, "tokens_per_device": 5116 }, { "epoch": 0.3268, "loss_ce": 0.0012150272959843278, "loss_lvr": 0.7840799689292908, "loss_mode_switch": 0.0, "loss_total": 0.0796230211853981, "step": 817 }, { "batch_size": 1, "epoch": 0.3268, "step": 817, "tokens_per_device": 5108 }, { "epoch": 0.3268, "loss_ce": 0.04248104989528656, "loss_lvr": 0.5929129123687744, "loss_mode_switch": 0.0, "loss_total": 0.10177233815193176, "step": 817 }, { "batch_size": 4, "epoch": 0.3268, "step": 817, "tokens_per_device": 3368 }, { "epoch": 0.3268, "loss_ce": 0.1614728569984436, "loss_lvr": 1.1469007730484009, "loss_mode_switch": 0.0, "loss_total": 0.27616292238235474, "step": 817 }, { "batch_size": 4, "epoch": 0.3268, "step": 817, "tokens_per_device": 2944 }, { "epoch": 0.3268, "loss_ce": 0.4023453891277313, "loss_lvr": 0.8814164400100708, "loss_mode_switch": 0.0, "loss_total": 0.4904870390892029, "step": 817 }, { "batch_size": 4, "epoch": 0.3268, "step": 817, "tokens_per_device": 3760 }, { "epoch": 0.3268, "loss_ce": 0.32833242416381836, "loss_lvr": 1.0539515018463135, "loss_mode_switch": 0.0, "loss_total": 0.43372756242752075, "step": 817 }, { "batch_size": 1, "epoch": 0.3268, "step": 817, "tokens_per_device": 8179 }, { "epoch": 0.3268, "loss_ce": 0.24854809045791626, "loss_lvr": 0.3214718997478485, "loss_mode_switch": 0.0, "loss_total": 0.2806952893733978, "step": 817 }, { "batch_size": 4, "epoch": 0.3268, "step": 817, "tokens_per_device": 5332 }, { "epoch": 0.3268, "loss_ce": 0.11569789052009583, "loss_lvr": 0.7679312825202942, "loss_mode_switch": 0.0, "loss_total": 0.19249102473258972, "step": 817 }, { "batch_size": 4, "epoch": 0.3268, "step": 817, "tokens_per_device": 9716 }, { "epoch": 0.3268, "loss_ce": 0.09022219479084015, "loss_lvr": 0.9088181257247925, "loss_mode_switch": 0.0, "loss_total": 0.18110400438308716, "step": 817 }, { "epoch": 0.3272, "grad_norm": 1.3146390914916992, "learning_rate": 7.857112775529513e-06, "loss": 0.3427, "step": 818 }, { "batch_size": 4, "epoch": 0.3272, "step": 818, "tokens_per_device": 1496 }, { "epoch": 0.3272, "loss_ce": 0.33579713106155396, "loss_lvr": 0.989072859287262, "loss_mode_switch": 0.0, "loss_total": 0.43470442295074463, "step": 818 }, { "batch_size": 4, "epoch": 0.3272, "step": 818, "tokens_per_device": 4896 }, { "epoch": 0.3272, "loss_ce": 0.6547307968139648, "loss_lvr": 0.7118942737579346, "loss_mode_switch": 0.0, "loss_total": 0.7259202003479004, "step": 818 }, { "batch_size": 4, "epoch": 0.3272, "step": 818, "tokens_per_device": 5980 }, { "epoch": 0.3272, "loss_ce": 0.07256122678518295, "loss_lvr": 0.7143493294715881, "loss_mode_switch": 0.0, "loss_total": 0.14399616420269012, "step": 818 }, { "batch_size": 4, "epoch": 0.3272, "step": 818, "tokens_per_device": 1328 }, { "epoch": 0.3272, "loss_ce": 0.38869428634643555, "loss_lvr": 1.0515706539154053, "loss_mode_switch": 0.0, "loss_total": 0.49385136365890503, "step": 818 }, { "batch_size": 1, "epoch": 0.3272, "step": 818, "tokens_per_device": 5030 }, { "epoch": 0.3272, "loss_ce": 0.014125099405646324, "loss_lvr": 0.8495474457740784, "loss_mode_switch": 0.0, "loss_total": 0.09907984733581543, "step": 818 }, { "batch_size": 4, "epoch": 0.3272, "step": 818, "tokens_per_device": 16320 }, { "epoch": 0.3272, "loss_ce": 0.2488146424293518, "loss_lvr": 0.5791321992874146, "loss_mode_switch": 0.0, "loss_total": 0.3067278563976288, "step": 818 }, { "batch_size": 1, "epoch": 0.3272, "step": 818, "tokens_per_device": 4865 }, { "epoch": 0.3272, "loss_ce": 0.0014458937803283334, "loss_lvr": 0.2808438241481781, "loss_mode_switch": 0.0, "loss_total": 0.029530275613069534, "step": 818 }, { "batch_size": 4, "epoch": 0.3272, "step": 818, "tokens_per_device": 1884 }, { "epoch": 0.3272, "loss_ce": 0.7753170728683472, "loss_lvr": 1.0100481510162354, "loss_mode_switch": 0.0, "loss_total": 0.8763219118118286, "step": 818 }, { "epoch": 0.3276, "grad_norm": 1.1761924028396606, "learning_rate": 7.85179457150047e-06, "loss": 0.3156, "step": 819 }, { "batch_size": 4, "epoch": 0.3276, "step": 819, "tokens_per_device": 2544 }, { "epoch": 0.3276, "loss_ce": 0.3250255584716797, "loss_lvr": 1.3208893537521362, "loss_mode_switch": 0.0, "loss_total": 0.4571145176887512, "step": 819 }, { "batch_size": 1, "epoch": 0.3276, "step": 819, "tokens_per_device": 5831 }, { "epoch": 0.3276, "loss_ce": 0.16207556426525116, "loss_lvr": 0.4112144112586975, "loss_mode_switch": 0.0, "loss_total": 0.20319700241088867, "step": 819 }, { "batch_size": 4, "epoch": 0.3276, "step": 819, "tokens_per_device": 6224 }, { "epoch": 0.3276, "loss_ce": 0.08003637194633484, "loss_lvr": 0.7804007530212402, "loss_mode_switch": 0.0, "loss_total": 0.1580764502286911, "step": 819 }, { "batch_size": 4, "epoch": 0.3276, "step": 819, "tokens_per_device": 6028 }, { "epoch": 0.3276, "loss_ce": 0.29911091923713684, "loss_lvr": 1.0349777936935425, "loss_mode_switch": 0.0, "loss_total": 0.4026086926460266, "step": 819 }, { "batch_size": 4, "epoch": 0.3276, "step": 819, "tokens_per_device": 11220 }, { "epoch": 0.3276, "loss_ce": 0.3588808476924896, "loss_lvr": 0.45450499653816223, "loss_mode_switch": 0.0, "loss_total": 0.40433135628700256, "step": 819 }, { "batch_size": 4, "epoch": 0.3276, "step": 819, "tokens_per_device": 1208 }, { "epoch": 0.3276, "loss_ce": 0.19616757333278656, "loss_lvr": 1.300013542175293, "loss_mode_switch": 0.0, "loss_total": 0.3261689245700836, "step": 819 }, { "batch_size": 4, "epoch": 0.3276, "step": 819, "tokens_per_device": 3908 }, { "epoch": 0.3276, "loss_ce": 0.08966612815856934, "loss_lvr": 0.8871820569038391, "loss_mode_switch": 0.0, "loss_total": 0.17838433384895325, "step": 819 }, { "batch_size": 4, "epoch": 0.3276, "step": 819, "tokens_per_device": 4272 }, { "epoch": 0.3276, "loss_ce": 0.7817657589912415, "loss_lvr": 1.0071192979812622, "loss_mode_switch": 0.0, "loss_total": 0.8824777007102966, "step": 819 }, { "epoch": 0.328, "grad_norm": 1.4534469842910767, "learning_rate": 7.846471581231814e-06, "loss": 0.3311, "step": 820 }, { "batch_size": 4, "epoch": 0.328, "step": 820, "tokens_per_device": 2724 }, { "epoch": 0.328, "loss_ce": 0.2349875420331955, "loss_lvr": 0.42568570375442505, "loss_mode_switch": 0.0, "loss_total": 0.2775561213493347, "step": 820 }, { "batch_size": 1, "epoch": 0.328, "step": 820, "tokens_per_device": 4882 }, { "epoch": 0.328, "loss_ce": 0.045338187366724014, "loss_lvr": 0.2004891186952591, "loss_mode_switch": 0.0, "loss_total": 0.06538709998130798, "step": 820 }, { "batch_size": 1, "epoch": 0.328, "step": 820, "tokens_per_device": 5050 }, { "epoch": 0.328, "loss_ce": 0.02858659252524376, "loss_lvr": 0.620673656463623, "loss_mode_switch": 0.0, "loss_total": 0.09065395593643188, "step": 820 }, { "batch_size": 4, "epoch": 0.328, "step": 820, "tokens_per_device": 4348 }, { "epoch": 0.328, "loss_ce": 0.15730568766593933, "loss_lvr": 0.8418473601341248, "loss_mode_switch": 0.0, "loss_total": 0.2414904236793518, "step": 820 }, { "batch_size": 4, "epoch": 0.328, "step": 820, "tokens_per_device": 2532 }, { "epoch": 0.328, "loss_ce": 0.25154927372932434, "loss_lvr": 0.7914206385612488, "loss_mode_switch": 0.0, "loss_total": 0.3306913375854492, "step": 820 }, { "batch_size": 4, "epoch": 0.328, "step": 820, "tokens_per_device": 4920 }, { "epoch": 0.328, "loss_ce": 0.34509384632110596, "loss_lvr": 1.0859471559524536, "loss_mode_switch": 0.0, "loss_total": 0.4536885619163513, "step": 820 }, { "batch_size": 1, "epoch": 0.328, "step": 820, "tokens_per_device": 5180 }, { "epoch": 0.328, "loss_ce": 0.3926481008529663, "loss_lvr": 0.30023178458213806, "loss_mode_switch": 0.0, "loss_total": 0.42267128825187683, "step": 820 }, { "batch_size": 4, "epoch": 0.328, "step": 820, "tokens_per_device": 4228 }, { "epoch": 0.328, "loss_ce": 0.1825074553489685, "loss_lvr": 0.9310370087623596, "loss_mode_switch": 0.0, "loss_total": 0.27561116218566895, "step": 820 }, { "epoch": 0.3284, "grad_norm": 1.3246687650680542, "learning_rate": 7.841143813657257e-06, "loss": 0.2855, "step": 821 }, { "batch_size": 4, "epoch": 0.3284, "step": 821, "tokens_per_device": 5380 }, { "epoch": 0.3284, "loss_ce": 0.25013601779937744, "loss_lvr": 0.7376478910446167, "loss_mode_switch": 0.0, "loss_total": 0.32390081882476807, "step": 821 }, { "batch_size": 4, "epoch": 0.3284, "step": 821, "tokens_per_device": 4236 }, { "epoch": 0.3284, "loss_ce": 0.40298354625701904, "loss_lvr": 0.9694536924362183, "loss_mode_switch": 0.0, "loss_total": 0.49992892146110535, "step": 821 }, { "batch_size": 1, "epoch": 0.3284, "step": 821, "tokens_per_device": 4898 }, { "epoch": 0.3284, "loss_ce": 0.015245764516294003, "loss_lvr": 0.44211918115615845, "loss_mode_switch": 0.0, "loss_total": 0.0594576857984066, "step": 821 }, { "batch_size": 4, "epoch": 0.3284, "step": 821, "tokens_per_device": 5660 }, { "epoch": 0.3284, "loss_ce": 0.15020640194416046, "loss_lvr": 0.7562934160232544, "loss_mode_switch": 0.0, "loss_total": 0.22583574056625366, "step": 821 }, { "batch_size": 4, "epoch": 0.3284, "step": 821, "tokens_per_device": 4240 }, { "epoch": 0.3284, "loss_ce": 0.5496591925621033, "loss_lvr": 0.8986942172050476, "loss_mode_switch": 0.0, "loss_total": 0.6395286321640015, "step": 821 }, { "batch_size": 4, "epoch": 0.3284, "step": 821, "tokens_per_device": 3512 }, { "epoch": 0.3284, "loss_ce": 0.11942648887634277, "loss_lvr": 0.922976016998291, "loss_mode_switch": 0.0, "loss_total": 0.21172410249710083, "step": 821 }, { "batch_size": 1, "epoch": 0.3284, "step": 821, "tokens_per_device": 5176 }, { "epoch": 0.3284, "loss_ce": 0.0035828568506985903, "loss_lvr": 0.6483650207519531, "loss_mode_switch": 0.0, "loss_total": 0.06841935962438583, "step": 821 }, { "batch_size": 4, "epoch": 0.3284, "step": 821, "tokens_per_device": 2716 }, { "epoch": 0.3284, "loss_ce": 0.27847278118133545, "loss_lvr": 1.1402308940887451, "loss_mode_switch": 0.0, "loss_total": 0.39249587059020996, "step": 821 }, { "epoch": 0.3288, "grad_norm": 1.6047680377960205, "learning_rate": 7.835811277718528e-06, "loss": 0.2749, "step": 822 }, { "batch_size": 4, "epoch": 0.3288, "step": 822, "tokens_per_device": 5192 }, { "epoch": 0.3288, "loss_ce": 0.38082727789878845, "loss_lvr": 0.7599466443061829, "loss_mode_switch": 0.0, "loss_total": 0.4568219482898712, "step": 822 }, { "batch_size": 4, "epoch": 0.3288, "step": 822, "tokens_per_device": 9684 }, { "epoch": 0.3288, "loss_ce": 0.0240616537630558, "loss_lvr": 1.0648261308670044, "loss_mode_switch": 0.0, "loss_total": 0.1305442750453949, "step": 822 }, { "batch_size": 4, "epoch": 0.3288, "step": 822, "tokens_per_device": 4104 }, { "epoch": 0.3288, "loss_ce": 0.01304134912788868, "loss_lvr": 0.7794929146766663, "loss_mode_switch": 0.0, "loss_total": 0.09099064022302628, "step": 822 }, { "batch_size": 1, "epoch": 0.3288, "step": 822, "tokens_per_device": 4919 }, { "epoch": 0.3288, "loss_ce": 0.0008218962466344237, "loss_lvr": 0.24487267434597015, "loss_mode_switch": 0.0, "loss_total": 0.025309164077043533, "step": 822 }, { "batch_size": 1, "epoch": 0.3288, "step": 822, "tokens_per_device": 5165 }, { "epoch": 0.3288, "loss_ce": 0.020274151116609573, "loss_lvr": 0.7597599625587463, "loss_mode_switch": 0.0, "loss_total": 0.09625014662742615, "step": 822 }, { "batch_size": 4, "epoch": 0.3288, "step": 822, "tokens_per_device": 6380 }, { "epoch": 0.3288, "loss_ce": 0.03265802934765816, "loss_lvr": 0.9283957481384277, "loss_mode_switch": 0.0, "loss_total": 0.12549760937690735, "step": 822 }, { "batch_size": 1, "epoch": 0.3288, "step": 822, "tokens_per_device": 5297 }, { "epoch": 0.3288, "loss_ce": 0.005417809821665287, "loss_lvr": 0.6629468202590942, "loss_mode_switch": 0.0, "loss_total": 0.07171249389648438, "step": 822 }, { "batch_size": 1, "epoch": 0.3288, "step": 822, "tokens_per_device": 5489 }, { "epoch": 0.3288, "loss_ce": 0.002828156342729926, "loss_lvr": 0.5830315351486206, "loss_mode_switch": 0.0, "loss_total": 0.0611313097178936, "step": 822 }, { "epoch": 0.3292, "grad_norm": 1.4149481058120728, "learning_rate": 7.830473982365355e-06, "loss": 0.282, "step": 823 }, { "batch_size": 1, "epoch": 0.3292, "step": 823, "tokens_per_device": 5123 }, { "epoch": 0.3292, "loss_ce": 0.0094562703743577, "loss_lvr": 0.3800150156021118, "loss_mode_switch": 0.0, "loss_total": 0.047457773238420486, "step": 823 }, { "batch_size": 1, "epoch": 0.3292, "step": 823, "tokens_per_device": 5131 }, { "epoch": 0.3292, "loss_ce": 0.10041418671607971, "loss_lvr": 0.3990143835544586, "loss_mode_switch": 0.0, "loss_total": 0.14031562209129333, "step": 823 }, { "batch_size": 4, "epoch": 0.3292, "step": 823, "tokens_per_device": 5716 }, { "epoch": 0.3292, "loss_ce": 0.09159151464700699, "loss_lvr": 0.8404993414878845, "loss_mode_switch": 0.0, "loss_total": 0.17564144730567932, "step": 823 }, { "batch_size": 4, "epoch": 0.3292, "step": 823, "tokens_per_device": 2252 }, { "epoch": 0.3292, "loss_ce": 0.8500539660453796, "loss_lvr": 0.983496367931366, "loss_mode_switch": 0.0, "loss_total": 0.9484035968780518, "step": 823 }, { "batch_size": 4, "epoch": 0.3292, "step": 823, "tokens_per_device": 6544 }, { "epoch": 0.3292, "loss_ce": 0.27800288796424866, "loss_lvr": 0.9629723429679871, "loss_mode_switch": 0.0, "loss_total": 0.37430012226104736, "step": 823 }, { "batch_size": 1, "epoch": 0.3292, "step": 823, "tokens_per_device": 4887 }, { "epoch": 0.3292, "loss_ce": 0.08308297395706177, "loss_lvr": 0.5641413331031799, "loss_mode_switch": 0.0, "loss_total": 0.13949710130691528, "step": 823 }, { "batch_size": 4, "epoch": 0.3292, "step": 823, "tokens_per_device": 4112 }, { "epoch": 0.3292, "loss_ce": 0.3416559398174286, "loss_lvr": 0.8260465264320374, "loss_mode_switch": 0.0, "loss_total": 0.42426058650016785, "step": 823 }, { "batch_size": 1, "epoch": 0.3292, "step": 823, "tokens_per_device": 5201 }, { "epoch": 0.3292, "loss_ce": 0.09135746955871582, "loss_lvr": 0.4154883027076721, "loss_mode_switch": 0.0, "loss_total": 0.13290630280971527, "step": 823 }, { "epoch": 0.3296, "grad_norm": 1.5701303482055664, "learning_rate": 7.82513193655546e-06, "loss": 0.3143, "step": 824 }, { "batch_size": 4, "epoch": 0.3296, "step": 824, "tokens_per_device": 2620 }, { "epoch": 0.3296, "loss_ce": 0.10886243730783463, "loss_lvr": 1.5164859294891357, "loss_mode_switch": 0.0, "loss_total": 0.26051104068756104, "step": 824 }, { "batch_size": 1, "epoch": 0.3296, "step": 824, "tokens_per_device": 4930 }, { "epoch": 0.3296, "loss_ce": 0.10176367312669754, "loss_lvr": 0.35384583473205566, "loss_mode_switch": 0.0, "loss_total": 0.13714826107025146, "step": 824 }, { "batch_size": 1, "epoch": 0.3296, "step": 824, "tokens_per_device": 5567 }, { "epoch": 0.3296, "loss_ce": 0.02011430263519287, "loss_lvr": 0.5444875955581665, "loss_mode_switch": 0.0, "loss_total": 0.07456306368112564, "step": 824 }, { "batch_size": 1, "epoch": 0.3296, "step": 824, "tokens_per_device": 5122 }, { "epoch": 0.3296, "loss_ce": 0.0005498647224158049, "loss_lvr": 0.426466166973114, "loss_mode_switch": 0.0, "loss_total": 0.043196480721235275, "step": 824 }, { "batch_size": 4, "epoch": 0.3296, "step": 824, "tokens_per_device": 2652 }, { "epoch": 0.3296, "loss_ce": 0.3004293441772461, "loss_lvr": 0.8602137565612793, "loss_mode_switch": 0.0, "loss_total": 0.38645070791244507, "step": 824 }, { "batch_size": 4, "epoch": 0.3296, "step": 824, "tokens_per_device": 3892 }, { "epoch": 0.3296, "loss_ce": 0.12494923919439316, "loss_lvr": 1.0419169664382935, "loss_mode_switch": 0.0, "loss_total": 0.22914093732833862, "step": 824 }, { "batch_size": 1, "epoch": 0.3296, "step": 824, "tokens_per_device": 5149 }, { "epoch": 0.3296, "loss_ce": 0.0031811660155653954, "loss_lvr": 0.7845548391342163, "loss_mode_switch": 0.0, "loss_total": 0.08163665235042572, "step": 824 }, { "batch_size": 4, "epoch": 0.3296, "step": 824, "tokens_per_device": 4528 }, { "epoch": 0.3296, "loss_ce": 0.05987489968538284, "loss_lvr": 0.7659381628036499, "loss_mode_switch": 0.0, "loss_total": 0.13646870851516724, "step": 824 }, { "epoch": 0.33, "grad_norm": 1.3888269662857056, "learning_rate": 7.819785149254534e-06, "loss": 0.3098, "step": 825 }, { "batch_size": 1, "epoch": 0.33, "step": 825, "tokens_per_device": 5249 }, { "epoch": 0.33, "loss_ce": 0.4044932723045349, "loss_lvr": 0.5664964914321899, "loss_mode_switch": 0.0, "loss_total": 0.4611429274082184, "step": 825 }, { "batch_size": 4, "epoch": 0.33, "step": 825, "tokens_per_device": 1652 }, { "epoch": 0.33, "loss_ce": 0.2523556053638458, "loss_lvr": 0.9533713459968567, "loss_mode_switch": 0.0, "loss_total": 0.34769272804260254, "step": 825 }, { "batch_size": 4, "epoch": 0.33, "step": 825, "tokens_per_device": 2680 }, { "epoch": 0.33, "loss_ce": 0.1250687688589096, "loss_lvr": 0.7421163320541382, "loss_mode_switch": 0.0, "loss_total": 0.19928041100502014, "step": 825 }, { "batch_size": 4, "epoch": 0.33, "step": 825, "tokens_per_device": 2940 }, { "epoch": 0.33, "loss_ce": 0.30617430806159973, "loss_lvr": 0.8348355889320374, "loss_mode_switch": 0.0, "loss_total": 0.3896578550338745, "step": 825 }, { "batch_size": 4, "epoch": 0.33, "step": 825, "tokens_per_device": 4772 }, { "epoch": 0.33, "loss_ce": 0.26351526379585266, "loss_lvr": 0.8172215819358826, "loss_mode_switch": 0.0, "loss_total": 0.3452374339103699, "step": 825 }, { "batch_size": 4, "epoch": 0.33, "step": 825, "tokens_per_device": 2752 }, { "epoch": 0.33, "loss_ce": 0.16421212255954742, "loss_lvr": 0.8712804913520813, "loss_mode_switch": 0.0, "loss_total": 0.25134018063545227, "step": 825 }, { "batch_size": 4, "epoch": 0.33, "step": 825, "tokens_per_device": 5744 }, { "epoch": 0.33, "loss_ce": 0.17511215806007385, "loss_lvr": 1.171931505203247, "loss_mode_switch": 0.0, "loss_total": 0.2923053205013275, "step": 825 }, { "batch_size": 1, "epoch": 0.33, "step": 825, "tokens_per_device": 5096 }, { "epoch": 0.33, "loss_ce": 0.08596361428499222, "loss_lvr": 0.8807815909385681, "loss_mode_switch": 0.0, "loss_total": 0.1740417778491974, "step": 825 }, { "epoch": 0.3304, "grad_norm": 2.0703041553497314, "learning_rate": 7.814433629436225e-06, "loss": 0.2892, "step": 826 }, { "batch_size": 4, "epoch": 0.3304, "step": 826, "tokens_per_device": 3780 }, { "epoch": 0.3304, "loss_ce": 0.4461536705493927, "loss_lvr": 1.238429069519043, "loss_mode_switch": 0.0, "loss_total": 0.5699965953826904, "step": 826 }, { "batch_size": 4, "epoch": 0.3304, "step": 826, "tokens_per_device": 3772 }, { "epoch": 0.3304, "loss_ce": 0.5734858512878418, "loss_lvr": 0.7888750433921814, "loss_mode_switch": 0.0, "loss_total": 0.6523733735084534, "step": 826 }, { "batch_size": 4, "epoch": 0.3304, "step": 826, "tokens_per_device": 3776 }, { "epoch": 0.3304, "loss_ce": 0.2526124119758606, "loss_lvr": 1.8424491882324219, "loss_mode_switch": 0.0, "loss_total": 0.43685734272003174, "step": 826 }, { "batch_size": 4, "epoch": 0.3304, "step": 826, "tokens_per_device": 9144 }, { "epoch": 0.3304, "loss_ce": 0.24101385474205017, "loss_lvr": 0.8403759002685547, "loss_mode_switch": 0.0, "loss_total": 0.3250514566898346, "step": 826 }, { "batch_size": 4, "epoch": 0.3304, "step": 826, "tokens_per_device": 5792 }, { "epoch": 0.3304, "loss_ce": 0.129159078001976, "loss_lvr": 0.7773928642272949, "loss_mode_switch": 0.0, "loss_total": 0.20689836144447327, "step": 826 }, { "batch_size": 1, "epoch": 0.3304, "step": 826, "tokens_per_device": 4885 }, { "epoch": 0.3304, "loss_ce": 0.009870294481515884, "loss_lvr": 0.6240431070327759, "loss_mode_switch": 0.0, "loss_total": 0.07227461040019989, "step": 826 }, { "batch_size": 1, "epoch": 0.3304, "step": 826, "tokens_per_device": 5157 }, { "epoch": 0.3304, "loss_ce": 0.014932425692677498, "loss_lvr": 0.9816940426826477, "loss_mode_switch": 0.0, "loss_total": 0.11310183256864548, "step": 826 }, { "batch_size": 4, "epoch": 0.3304, "step": 826, "tokens_per_device": 4372 }, { "epoch": 0.3304, "loss_ce": 0.2017814666032791, "loss_lvr": 0.960615873336792, "loss_mode_switch": 0.0, "loss_total": 0.2978430390357971, "step": 826 }, { "epoch": 0.3308, "grad_norm": 1.277877926826477, "learning_rate": 7.809077386082129e-06, "loss": 0.388, "step": 827 }, { "batch_size": 4, "epoch": 0.3308, "step": 827, "tokens_per_device": 4288 }, { "epoch": 0.3308, "loss_ce": 0.14863811433315277, "loss_lvr": 0.7864207029342651, "loss_mode_switch": 0.0, "loss_total": 0.22728018462657928, "step": 827 }, { "batch_size": 1, "epoch": 0.3308, "step": 827, "tokens_per_device": 5127 }, { "epoch": 0.3308, "loss_ce": 0.044062353670597076, "loss_lvr": 0.6178511381149292, "loss_mode_switch": 0.0, "loss_total": 0.10584746301174164, "step": 827 }, { "batch_size": 4, "epoch": 0.3308, "step": 827, "tokens_per_device": 1892 }, { "epoch": 0.3308, "loss_ce": 0.7534589767456055, "loss_lvr": 0.954580545425415, "loss_mode_switch": 0.0, "loss_total": 0.8489170074462891, "step": 827 }, { "batch_size": 4, "epoch": 0.3308, "step": 827, "tokens_per_device": 3772 }, { "epoch": 0.3308, "loss_ce": 0.10887014865875244, "loss_lvr": 1.030352234840393, "loss_mode_switch": 0.0, "loss_total": 0.211905375123024, "step": 827 }, { "batch_size": 4, "epoch": 0.3308, "step": 827, "tokens_per_device": 4200 }, { "epoch": 0.3308, "loss_ce": 0.2956821620464325, "loss_lvr": 0.8109362721443176, "loss_mode_switch": 0.0, "loss_total": 0.3767758011817932, "step": 827 }, { "batch_size": 4, "epoch": 0.3308, "step": 827, "tokens_per_device": 1580 }, { "epoch": 0.3308, "loss_ce": 0.18413464725017548, "loss_lvr": 0.9843302965164185, "loss_mode_switch": 0.0, "loss_total": 0.28256767988204956, "step": 827 }, { "batch_size": 1, "epoch": 0.3308, "step": 827, "tokens_per_device": 4881 }, { "epoch": 0.3308, "loss_ce": 0.10460034757852554, "loss_lvr": 0.7644743919372559, "loss_mode_switch": 0.0, "loss_total": 0.18104779720306396, "step": 827 }, { "batch_size": 1, "epoch": 0.3308, "step": 827, "tokens_per_device": 5074 }, { "epoch": 0.3308, "loss_ce": 0.0876341387629509, "loss_lvr": 0.3862846791744232, "loss_mode_switch": 0.0, "loss_total": 0.1262626051902771, "step": 827 }, { "epoch": 0.3312, "grad_norm": 1.3334650993347168, "learning_rate": 7.803716428181762e-06, "loss": 0.2643, "step": 828 }, { "batch_size": 1, "epoch": 0.3312, "step": 828, "tokens_per_device": 4891 }, { "epoch": 0.3312, "loss_ce": 0.056620266288518906, "loss_lvr": 0.8741171360015869, "loss_mode_switch": 0.0, "loss_total": 0.14403198659420013, "step": 828 }, { "batch_size": 4, "epoch": 0.3312, "step": 828, "tokens_per_device": 3384 }, { "epoch": 0.3312, "loss_ce": 0.11657615751028061, "loss_lvr": 1.7678903341293335, "loss_mode_switch": 0.0, "loss_total": 0.2933651804924011, "step": 828 }, { "batch_size": 1, "epoch": 0.3312, "step": 828, "tokens_per_device": 5666 }, { "epoch": 0.3312, "loss_ce": 0.2225368767976761, "loss_lvr": 0.5523388385772705, "loss_mode_switch": 0.0, "loss_total": 0.2777707576751709, "step": 828 }, { "batch_size": 4, "epoch": 0.3312, "step": 828, "tokens_per_device": 10192 }, { "epoch": 0.3312, "loss_ce": 0.2568039894104004, "loss_lvr": 0.9748772382736206, "loss_mode_switch": 0.0, "loss_total": 0.354291707277298, "step": 828 }, { "batch_size": 4, "epoch": 0.3312, "step": 828, "tokens_per_device": 6592 }, { "epoch": 0.3312, "loss_ce": 0.4533896744251251, "loss_lvr": 0.8767901062965393, "loss_mode_switch": 0.0, "loss_total": 0.5410686731338501, "step": 828 }, { "batch_size": 4, "epoch": 0.3312, "step": 828, "tokens_per_device": 2892 }, { "epoch": 0.3312, "loss_ce": 0.3164536654949188, "loss_lvr": 0.9528903961181641, "loss_mode_switch": 0.0, "loss_total": 0.4117427170276642, "step": 828 }, { "batch_size": 4, "epoch": 0.3312, "step": 828, "tokens_per_device": 1320 }, { "epoch": 0.3312, "loss_ce": 0.18846839666366577, "loss_lvr": 1.0061240196228027, "loss_mode_switch": 0.0, "loss_total": 0.28908079862594604, "step": 828 }, { "batch_size": 4, "epoch": 0.3312, "step": 828, "tokens_per_device": 4364 }, { "epoch": 0.3312, "loss_ce": 0.37288495898246765, "loss_lvr": 0.9497392773628235, "loss_mode_switch": 0.0, "loss_total": 0.4678588807582855, "step": 828 }, { "epoch": 0.3316, "grad_norm": 1.4115818738937378, "learning_rate": 7.79835076473256e-06, "loss": 0.3288, "step": 829 }, { "batch_size": 1, "epoch": 0.3316, "step": 829, "tokens_per_device": 4119 }, { "epoch": 0.3316, "loss_ce": 0.02756364829838276, "loss_lvr": 0.594431459903717, "loss_mode_switch": 0.0, "loss_total": 0.08700679242610931, "step": 829 }, { "batch_size": 4, "epoch": 0.3316, "step": 829, "tokens_per_device": 1456 }, { "epoch": 0.3316, "loss_ce": 0.5980322957038879, "loss_lvr": 1.0394017696380615, "loss_mode_switch": 0.0, "loss_total": 0.701972484588623, "step": 829 }, { "batch_size": 4, "epoch": 0.3316, "step": 829, "tokens_per_device": 3804 }, { "epoch": 0.3316, "loss_ce": 0.08663727343082428, "loss_lvr": 1.1917279958724976, "loss_mode_switch": 0.0, "loss_total": 0.2058100700378418, "step": 829 }, { "batch_size": 1, "epoch": 0.3316, "step": 829, "tokens_per_device": 4892 }, { "epoch": 0.3316, "loss_ce": 0.15264421701431274, "loss_lvr": 0.37169766426086426, "loss_mode_switch": 0.0, "loss_total": 0.1898139864206314, "step": 829 }, { "batch_size": 4, "epoch": 0.3316, "step": 829, "tokens_per_device": 4000 }, { "epoch": 0.3316, "loss_ce": 0.4590226411819458, "loss_lvr": 0.963753879070282, "loss_mode_switch": 0.0, "loss_total": 0.5553980469703674, "step": 829 }, { "batch_size": 1, "epoch": 0.3316, "step": 829, "tokens_per_device": 5142 }, { "epoch": 0.3316, "loss_ce": 0.024019695818424225, "loss_lvr": 0.2660478353500366, "loss_mode_switch": 0.0, "loss_total": 0.050624482333660126, "step": 829 }, { "batch_size": 1, "epoch": 0.3316, "step": 829, "tokens_per_device": 4876 }, { "epoch": 0.3316, "loss_ce": 0.10309266299009323, "loss_lvr": 0.3889397978782654, "loss_mode_switch": 0.0, "loss_total": 0.1419866383075714, "step": 829 }, { "batch_size": 1, "epoch": 0.3316, "step": 829, "tokens_per_device": 4892 }, { "epoch": 0.3316, "loss_ce": 0.004268605727702379, "loss_lvr": 0.22721581161022186, "loss_mode_switch": 0.0, "loss_total": 0.026990186423063278, "step": 829 }, { "epoch": 0.332, "grad_norm": 1.3521944284439087, "learning_rate": 7.792980404739849e-06, "loss": 0.3607, "step": 830 }, { "batch_size": 4, "epoch": 0.332, "step": 830, "tokens_per_device": 1416 }, { "epoch": 0.332, "loss_ce": 0.8173969984054565, "loss_lvr": 1.1445319652557373, "loss_mode_switch": 0.0, "loss_total": 0.9318501949310303, "step": 830 }, { "batch_size": 4, "epoch": 0.332, "step": 830, "tokens_per_device": 2628 }, { "epoch": 0.332, "loss_ce": 0.7052134871482849, "loss_lvr": 0.8822641968727112, "loss_mode_switch": 0.0, "loss_total": 0.7934399247169495, "step": 830 }, { "batch_size": 4, "epoch": 0.332, "step": 830, "tokens_per_device": 7484 }, { "epoch": 0.332, "loss_ce": 0.01091849897056818, "loss_lvr": 0.8735761642456055, "loss_mode_switch": 0.0, "loss_total": 0.09827611595392227, "step": 830 }, { "batch_size": 4, "epoch": 0.332, "step": 830, "tokens_per_device": 2640 }, { "epoch": 0.332, "loss_ce": 0.046353742480278015, "loss_lvr": 0.9476790428161621, "loss_mode_switch": 0.0, "loss_total": 0.14112165570259094, "step": 830 }, { "batch_size": 4, "epoch": 0.332, "step": 830, "tokens_per_device": 5860 }, { "epoch": 0.332, "loss_ce": 0.15593144297599792, "loss_lvr": 0.650479257106781, "loss_mode_switch": 0.0, "loss_total": 0.22097936272621155, "step": 830 }, { "batch_size": 4, "epoch": 0.332, "step": 830, "tokens_per_device": 3860 }, { "epoch": 0.332, "loss_ce": 1.0316741466522217, "loss_lvr": 0.836719274520874, "loss_mode_switch": 0.0, "loss_total": 1.115346074104309, "step": 830 }, { "batch_size": 1, "epoch": 0.332, "step": 830, "tokens_per_device": 4894 }, { "epoch": 0.332, "loss_ce": 0.2386910766363144, "loss_lvr": 0.43399521708488464, "loss_mode_switch": 0.0, "loss_total": 0.28209060430526733, "step": 830 }, { "batch_size": 4, "epoch": 0.332, "step": 830, "tokens_per_device": 16380 }, { "epoch": 0.332, "loss_ce": 0.0072371503338217735, "loss_lvr": 0.6545810699462891, "loss_mode_switch": 0.0, "loss_total": 0.07269526273012161, "step": 830 }, { "epoch": 0.3324, "grad_norm": 1.3764238357543945, "learning_rate": 7.787605357216843e-06, "loss": 0.3258, "step": 831 }, { "batch_size": 4, "epoch": 0.3324, "step": 831, "tokens_per_device": 3812 }, { "epoch": 0.3324, "loss_ce": 0.466233491897583, "loss_lvr": 0.9325841665267944, "loss_mode_switch": 0.0, "loss_total": 0.5594919323921204, "step": 831 }, { "batch_size": 1, "epoch": 0.3324, "step": 831, "tokens_per_device": 4876 }, { "epoch": 0.3324, "loss_ce": 0.09592841565608978, "loss_lvr": 0.292753130197525, "loss_mode_switch": 0.0, "loss_total": 0.12520372867584229, "step": 831 }, { "batch_size": 4, "epoch": 0.3324, "step": 831, "tokens_per_device": 1620 }, { "epoch": 0.3324, "loss_ce": 0.33213239908218384, "loss_lvr": 1.131618857383728, "loss_mode_switch": 0.0, "loss_total": 0.4452942907810211, "step": 831 }, { "batch_size": 4, "epoch": 0.3324, "step": 831, "tokens_per_device": 5940 }, { "epoch": 0.3324, "loss_ce": 0.15451055765151978, "loss_lvr": 0.7257040739059448, "loss_mode_switch": 0.0, "loss_total": 0.22708097100257874, "step": 831 }, { "batch_size": 4, "epoch": 0.3324, "step": 831, "tokens_per_device": 3760 }, { "epoch": 0.3324, "loss_ce": 0.3262096643447876, "loss_lvr": 1.2109166383743286, "loss_mode_switch": 0.0, "loss_total": 0.44730132818222046, "step": 831 }, { "batch_size": 4, "epoch": 0.3324, "step": 831, "tokens_per_device": 1396 }, { "epoch": 0.3324, "loss_ce": 0.29222235083580017, "loss_lvr": 1.0973697900772095, "loss_mode_switch": 0.0, "loss_total": 0.4019593298435211, "step": 831 }, { "batch_size": 4, "epoch": 0.3324, "step": 831, "tokens_per_device": 2520 }, { "epoch": 0.3324, "loss_ce": 0.09712612628936768, "loss_lvr": 0.8559733629226685, "loss_mode_switch": 0.0, "loss_total": 0.18272346258163452, "step": 831 }, { "batch_size": 4, "epoch": 0.3324, "step": 831, "tokens_per_device": 3088 }, { "epoch": 0.3324, "loss_ce": 0.22659757733345032, "loss_lvr": 1.1499361991882324, "loss_mode_switch": 0.0, "loss_total": 0.3415912091732025, "step": 831 }, { "epoch": 0.3328, "grad_norm": 1.5113376379013062, "learning_rate": 7.782225631184624e-06, "loss": 0.2886, "step": 832 }, { "batch_size": 4, "epoch": 0.3328, "step": 832, "tokens_per_device": 4380 }, { "epoch": 0.3328, "loss_ce": 0.1717391014099121, "loss_lvr": 0.7863782644271851, "loss_mode_switch": 0.0, "loss_total": 0.25037693977355957, "step": 832 }, { "batch_size": 4, "epoch": 0.3328, "step": 832, "tokens_per_device": 4032 }, { "epoch": 0.3328, "loss_ce": 0.05461646616458893, "loss_lvr": 0.9486313462257385, "loss_mode_switch": 0.0, "loss_total": 0.14947959780693054, "step": 832 }, { "batch_size": 4, "epoch": 0.3328, "step": 832, "tokens_per_device": 12712 }, { "epoch": 0.3328, "loss_ce": 0.25597625970840454, "loss_lvr": 0.6866862773895264, "loss_mode_switch": 0.0, "loss_total": 0.32464489340782166, "step": 832 }, { "batch_size": 4, "epoch": 0.3328, "step": 832, "tokens_per_device": 2232 }, { "epoch": 0.3328, "loss_ce": 0.4522576928138733, "loss_lvr": 0.9655649662017822, "loss_mode_switch": 0.0, "loss_total": 0.5488141775131226, "step": 832 }, { "batch_size": 4, "epoch": 0.3328, "step": 832, "tokens_per_device": 5440 }, { "epoch": 0.3328, "loss_ce": 0.17617541551589966, "loss_lvr": 0.6537641286849976, "loss_mode_switch": 0.0, "loss_total": 0.24155183136463165, "step": 832 }, { "batch_size": 1, "epoch": 0.3328, "step": 832, "tokens_per_device": 5126 }, { "epoch": 0.3328, "loss_ce": 0.009108159691095352, "loss_lvr": 0.4874808192253113, "loss_mode_switch": 0.0, "loss_total": 0.0578562431037426, "step": 832 }, { "batch_size": 4, "epoch": 0.3328, "step": 832, "tokens_per_device": 3796 }, { "epoch": 0.3328, "loss_ce": 0.119843490421772, "loss_lvr": 1.0172115564346313, "loss_mode_switch": 0.0, "loss_total": 0.2215646505355835, "step": 832 }, { "batch_size": 1, "epoch": 0.3328, "step": 832, "tokens_per_device": 4994 }, { "epoch": 0.3328, "loss_ce": 0.046476032584905624, "loss_lvr": 0.6126506328582764, "loss_mode_switch": 0.0, "loss_total": 0.1077410951256752, "step": 832 }, { "epoch": 0.3332, "grad_norm": 1.3706555366516113, "learning_rate": 7.776841235672119e-06, "loss": 0.2694, "step": 833 }, { "batch_size": 4, "epoch": 0.3332, "step": 833, "tokens_per_device": 2856 }, { "epoch": 0.3332, "loss_ce": 0.6839454174041748, "loss_lvr": 0.9940512776374817, "loss_mode_switch": 0.0, "loss_total": 0.7833505272865295, "step": 833 }, { "batch_size": 4, "epoch": 0.3332, "step": 833, "tokens_per_device": 5700 }, { "epoch": 0.3332, "loss_ce": 0.05832234397530556, "loss_lvr": 0.9681112766265869, "loss_mode_switch": 0.0, "loss_total": 0.1551334708929062, "step": 833 }, { "batch_size": 1, "epoch": 0.3332, "step": 833, "tokens_per_device": 5076 }, { "epoch": 0.3332, "loss_ce": 0.0054021431133151054, "loss_lvr": 0.6063940525054932, "loss_mode_switch": 0.0, "loss_total": 0.06604155153036118, "step": 833 }, { "batch_size": 4, "epoch": 0.3332, "step": 833, "tokens_per_device": 4264 }, { "epoch": 0.3332, "loss_ce": 0.06174388900399208, "loss_lvr": 0.8250752687454224, "loss_mode_switch": 0.0, "loss_total": 0.14425142109394073, "step": 833 }, { "batch_size": 1, "epoch": 0.3332, "step": 833, "tokens_per_device": 5050 }, { "epoch": 0.3332, "loss_ce": 0.06522291898727417, "loss_lvr": 0.4259220063686371, "loss_mode_switch": 0.0, "loss_total": 0.10781511664390564, "step": 833 }, { "batch_size": 1, "epoch": 0.3332, "step": 833, "tokens_per_device": 5410 }, { "epoch": 0.3332, "loss_ce": 0.0456671416759491, "loss_lvr": 0.34519895911216736, "loss_mode_switch": 0.0, "loss_total": 0.08018703758716583, "step": 833 }, { "batch_size": 1, "epoch": 0.3332, "step": 833, "tokens_per_device": 4878 }, { "epoch": 0.3332, "loss_ce": 0.6154409646987915, "loss_lvr": 0.7244825959205627, "loss_mode_switch": 0.0, "loss_total": 0.6878892183303833, "step": 833 }, { "batch_size": 1, "epoch": 0.3332, "step": 833, "tokens_per_device": 5016 }, { "epoch": 0.3332, "loss_ce": 0.039775632321834564, "loss_lvr": 0.17905743420124054, "loss_mode_switch": 0.0, "loss_total": 0.0576813742518425, "step": 833 }, { "epoch": 0.3336, "grad_norm": 1.3961886167526245, "learning_rate": 7.771452179716099e-06, "loss": 0.3169, "step": 834 }, { "batch_size": 4, "epoch": 0.3336, "step": 834, "tokens_per_device": 4452 }, { "epoch": 0.3336, "loss_ce": 0.2903839945793152, "loss_lvr": 0.8870076537132263, "loss_mode_switch": 0.0, "loss_total": 0.3790847659111023, "step": 834 }, { "batch_size": 4, "epoch": 0.3336, "step": 834, "tokens_per_device": 8548 }, { "epoch": 0.3336, "loss_ce": 0.06946177035570145, "loss_lvr": 0.7434016466140747, "loss_mode_switch": 0.0, "loss_total": 0.14380192756652832, "step": 834 }, { "batch_size": 4, "epoch": 0.3336, "step": 834, "tokens_per_device": 6668 }, { "epoch": 0.3336, "loss_ce": 0.2023674100637436, "loss_lvr": 0.8253875374794006, "loss_mode_switch": 0.0, "loss_total": 0.28490614891052246, "step": 834 }, { "batch_size": 4, "epoch": 0.3336, "step": 834, "tokens_per_device": 4628 }, { "epoch": 0.3336, "loss_ce": 0.11135505884885788, "loss_lvr": 1.0291485786437988, "loss_mode_switch": 0.0, "loss_total": 0.21426992118358612, "step": 834 }, { "batch_size": 4, "epoch": 0.3336, "step": 834, "tokens_per_device": 14032 }, { "epoch": 0.3336, "loss_ce": 0.46759897470474243, "loss_lvr": 0.9848063588142395, "loss_mode_switch": 0.0, "loss_total": 0.5660796165466309, "step": 834 }, { "batch_size": 4, "epoch": 0.3336, "step": 834, "tokens_per_device": 3380 }, { "epoch": 0.3336, "loss_ce": 0.3981685936450958, "loss_lvr": 0.9249626398086548, "loss_mode_switch": 0.0, "loss_total": 0.49066486954689026, "step": 834 }, { "batch_size": 4, "epoch": 0.3336, "step": 834, "tokens_per_device": 1336 }, { "epoch": 0.3336, "loss_ce": 0.40521737933158875, "loss_lvr": 1.0505565404891968, "loss_mode_switch": 0.0, "loss_total": 0.5102730393409729, "step": 834 }, { "batch_size": 4, "epoch": 0.3336, "step": 834, "tokens_per_device": 4924 }, { "epoch": 0.3336, "loss_ce": 0.5580798983573914, "loss_lvr": 0.7265194058418274, "loss_mode_switch": 0.0, "loss_total": 0.6307318210601807, "step": 834 }, { "epoch": 0.334, "grad_norm": 1.5508679151535034, "learning_rate": 7.766058472361154e-06, "loss": 0.3533, "step": 835 }, { "batch_size": 4, "epoch": 0.334, "step": 835, "tokens_per_device": 4968 }, { "epoch": 0.334, "loss_ce": 0.4588488042354584, "loss_lvr": 0.8009232878684998, "loss_mode_switch": 0.0, "loss_total": 0.5389411449432373, "step": 835 }, { "batch_size": 4, "epoch": 0.334, "step": 835, "tokens_per_device": 5688 }, { "epoch": 0.334, "loss_ce": 0.1720058172941208, "loss_lvr": 0.8468689918518066, "loss_mode_switch": 0.0, "loss_total": 0.25669270753860474, "step": 835 }, { "batch_size": 4, "epoch": 0.334, "step": 835, "tokens_per_device": 4368 }, { "epoch": 0.334, "loss_ce": 0.18858292698860168, "loss_lvr": 1.0245896577835083, "loss_mode_switch": 0.0, "loss_total": 0.29104191064834595, "step": 835 }, { "batch_size": 4, "epoch": 0.334, "step": 835, "tokens_per_device": 7768 }, { "epoch": 0.334, "loss_ce": 0.03992936760187149, "loss_lvr": 0.6728019118309021, "loss_mode_switch": 0.0, "loss_total": 0.10720956325531006, "step": 835 }, { "batch_size": 4, "epoch": 0.334, "step": 835, "tokens_per_device": 2544 }, { "epoch": 0.334, "loss_ce": 0.321393221616745, "loss_lvr": 1.099373459815979, "loss_mode_switch": 0.0, "loss_total": 0.4313305616378784, "step": 835 }, { "batch_size": 1, "epoch": 0.334, "step": 835, "tokens_per_device": 5387 }, { "epoch": 0.334, "loss_ce": 0.006454304791986942, "loss_lvr": 0.3697472810745239, "loss_mode_switch": 0.0, "loss_total": 0.04342903196811676, "step": 835 }, { "batch_size": 4, "epoch": 0.334, "step": 835, "tokens_per_device": 5072 }, { "epoch": 0.334, "loss_ce": 0.504206120967865, "loss_lvr": 0.9654366374015808, "loss_mode_switch": 0.0, "loss_total": 0.6007497906684875, "step": 835 }, { "batch_size": 4, "epoch": 0.334, "step": 835, "tokens_per_device": 1328 }, { "epoch": 0.334, "loss_ce": 0.4903576076030731, "loss_lvr": 1.2869436740875244, "loss_mode_switch": 0.0, "loss_total": 0.619051992893219, "step": 835 }, { "epoch": 0.3344, "grad_norm": 1.2329816818237305, "learning_rate": 7.760660122659682e-06, "loss": 0.2963, "step": 836 }, { "batch_size": 4, "epoch": 0.3344, "step": 836, "tokens_per_device": 4168 }, { "epoch": 0.3344, "loss_ce": 0.5736194849014282, "loss_lvr": 0.6313990950584412, "loss_mode_switch": 0.0, "loss_total": 0.6367594003677368, "step": 836 }, { "batch_size": 4, "epoch": 0.3344, "step": 836, "tokens_per_device": 3704 }, { "epoch": 0.3344, "loss_ce": 0.3857695758342743, "loss_lvr": 0.906248152256012, "loss_mode_switch": 0.0, "loss_total": 0.476394385099411, "step": 836 }, { "batch_size": 4, "epoch": 0.3344, "step": 836, "tokens_per_device": 3844 }, { "epoch": 0.3344, "loss_ce": 0.38824304938316345, "loss_lvr": 0.911502480506897, "loss_mode_switch": 0.0, "loss_total": 0.4793933033943176, "step": 836 }, { "batch_size": 4, "epoch": 0.3344, "step": 836, "tokens_per_device": 2568 }, { "epoch": 0.3344, "loss_ce": 0.45572131872177124, "loss_lvr": 1.1929913759231567, "loss_mode_switch": 0.0, "loss_total": 0.575020432472229, "step": 836 }, { "batch_size": 1, "epoch": 0.3344, "step": 836, "tokens_per_device": 5143 }, { "epoch": 0.3344, "loss_ce": 0.1351568102836609, "loss_lvr": 0.4252339005470276, "loss_mode_switch": 0.0, "loss_total": 0.17768019437789917, "step": 836 }, { "batch_size": 1, "epoch": 0.3344, "step": 836, "tokens_per_device": 5077 }, { "epoch": 0.3344, "loss_ce": 0.01861819624900818, "loss_lvr": 0.6304932236671448, "loss_mode_switch": 0.0, "loss_total": 0.08166752010583878, "step": 836 }, { "batch_size": 4, "epoch": 0.3344, "step": 836, "tokens_per_device": 4140 }, { "epoch": 0.3344, "loss_ce": 0.6688543558120728, "loss_lvr": 0.8320568799972534, "loss_mode_switch": 0.0, "loss_total": 0.752060055732727, "step": 836 }, { "batch_size": 4, "epoch": 0.3344, "step": 836, "tokens_per_device": 3448 }, { "epoch": 0.3344, "loss_ce": 0.4157205820083618, "loss_lvr": 0.8599267601966858, "loss_mode_switch": 0.0, "loss_total": 0.5017132759094238, "step": 836 }, { "epoch": 0.3348, "grad_norm": 1.263322114944458, "learning_rate": 7.755257139671868e-06, "loss": 0.3295, "step": 837 }, { "batch_size": 4, "epoch": 0.3348, "step": 837, "tokens_per_device": 4196 }, { "epoch": 0.3348, "loss_ce": 0.5281227827072144, "loss_lvr": 1.114544153213501, "loss_mode_switch": 0.0, "loss_total": 0.6395772099494934, "step": 837 }, { "batch_size": 4, "epoch": 0.3348, "step": 837, "tokens_per_device": 2780 }, { "epoch": 0.3348, "loss_ce": 0.5610005855560303, "loss_lvr": 0.6375295519828796, "loss_mode_switch": 0.0, "loss_total": 0.6247535347938538, "step": 837 }, { "batch_size": 1, "epoch": 0.3348, "step": 837, "tokens_per_device": 5540 }, { "epoch": 0.3348, "loss_ce": 0.0009850264759734273, "loss_lvr": 0.2429489940404892, "loss_mode_switch": 0.0, "loss_total": 0.025279926136136055, "step": 837 }, { "batch_size": 4, "epoch": 0.3348, "step": 837, "tokens_per_device": 5864 }, { "epoch": 0.3348, "loss_ce": 0.25444096326828003, "loss_lvr": 0.6540303230285645, "loss_mode_switch": 0.0, "loss_total": 0.31984400749206543, "step": 837 }, { "batch_size": 4, "epoch": 0.3348, "step": 837, "tokens_per_device": 4156 }, { "epoch": 0.3348, "loss_ce": 0.22768954932689667, "loss_lvr": 1.177943468093872, "loss_mode_switch": 0.0, "loss_total": 0.3454838991165161, "step": 837 }, { "batch_size": 4, "epoch": 0.3348, "step": 837, "tokens_per_device": 4320 }, { "epoch": 0.3348, "loss_ce": 0.12227020412683487, "loss_lvr": 0.917095422744751, "loss_mode_switch": 0.0, "loss_total": 0.21397975087165833, "step": 837 }, { "batch_size": 1, "epoch": 0.3348, "step": 837, "tokens_per_device": 4898 }, { "epoch": 0.3348, "loss_ce": 0.16713237762451172, "loss_lvr": 0.527267336845398, "loss_mode_switch": 0.0, "loss_total": 0.21985910832881927, "step": 837 }, { "batch_size": 4, "epoch": 0.3348, "step": 837, "tokens_per_device": 4384 }, { "epoch": 0.3348, "loss_ce": 0.16095420718193054, "loss_lvr": 0.7293003797531128, "loss_mode_switch": 0.0, "loss_total": 0.23388424515724182, "step": 837 }, { "epoch": 0.3352, "grad_norm": 1.3615707159042358, "learning_rate": 7.749849532465677e-06, "loss": 0.2791, "step": 838 }, { "batch_size": 1, "epoch": 0.3352, "step": 838, "tokens_per_device": 5241 }, { "epoch": 0.3352, "loss_ce": 0.1910228282213211, "loss_lvr": 0.38425126671791077, "loss_mode_switch": 0.0, "loss_total": 0.22944796085357666, "step": 838 }, { "batch_size": 4, "epoch": 0.3352, "step": 838, "tokens_per_device": 3816 }, { "epoch": 0.3352, "loss_ce": 0.2962264120578766, "loss_lvr": 0.8578080534934998, "loss_mode_switch": 0.0, "loss_total": 0.3820072114467621, "step": 838 }, { "batch_size": 1, "epoch": 0.3352, "step": 838, "tokens_per_device": 7331 }, { "epoch": 0.3352, "loss_ce": 0.00028113569715060294, "loss_lvr": 0.3104194402694702, "loss_mode_switch": 0.0, "loss_total": 0.03132307901978493, "step": 838 }, { "batch_size": 4, "epoch": 0.3352, "step": 838, "tokens_per_device": 2644 }, { "epoch": 0.3352, "loss_ce": 0.6293880939483643, "loss_lvr": 0.855201780796051, "loss_mode_switch": 0.0, "loss_total": 0.714908242225647, "step": 838 }, { "batch_size": 4, "epoch": 0.3352, "step": 838, "tokens_per_device": 13564 }, { "epoch": 0.3352, "loss_ce": 0.27299511432647705, "loss_lvr": 0.9248192310333252, "loss_mode_switch": 0.0, "loss_total": 0.3654770255088806, "step": 838 }, { "batch_size": 1, "epoch": 0.3352, "step": 838, "tokens_per_device": 5109 }, { "epoch": 0.3352, "loss_ce": 0.017445530742406845, "loss_lvr": 0.5475520491600037, "loss_mode_switch": 0.0, "loss_total": 0.07220073789358139, "step": 838 }, { "batch_size": 4, "epoch": 0.3352, "step": 838, "tokens_per_device": 2620 }, { "epoch": 0.3352, "loss_ce": 0.34198880195617676, "loss_lvr": 1.0621068477630615, "loss_mode_switch": 0.0, "loss_total": 0.44819948077201843, "step": 838 }, { "batch_size": 4, "epoch": 0.3352, "step": 838, "tokens_per_device": 7684 }, { "epoch": 0.3352, "loss_ce": 0.32496652007102966, "loss_lvr": 0.751241147518158, "loss_mode_switch": 0.0, "loss_total": 0.40009063482284546, "step": 838 }, { "epoch": 0.3356, "grad_norm": 1.2533810138702393, "learning_rate": 7.744437310116837e-06, "loss": 0.2884, "step": 839 }, { "batch_size": 4, "epoch": 0.3356, "step": 839, "tokens_per_device": 4696 }, { "epoch": 0.3356, "loss_ce": 0.07346578687429428, "loss_lvr": 1.0619843006134033, "loss_mode_switch": 0.0, "loss_total": 0.1796642243862152, "step": 839 }, { "batch_size": 1, "epoch": 0.3356, "step": 839, "tokens_per_device": 4893 }, { "epoch": 0.3356, "loss_ce": 0.24572134017944336, "loss_lvr": 0.5312594175338745, "loss_mode_switch": 0.0, "loss_total": 0.2988472878932953, "step": 839 }, { "batch_size": 4, "epoch": 0.3356, "step": 839, "tokens_per_device": 4232 }, { "epoch": 0.3356, "loss_ce": 0.23293080925941467, "loss_lvr": 0.9264490008354187, "loss_mode_switch": 0.0, "loss_total": 0.32557570934295654, "step": 839 }, { "batch_size": 1, "epoch": 0.3356, "step": 839, "tokens_per_device": 5174 }, { "epoch": 0.3356, "loss_ce": 0.010061623528599739, "loss_lvr": 0.29786065220832825, "loss_mode_switch": 0.0, "loss_total": 0.039847686886787415, "step": 839 }, { "batch_size": 1, "epoch": 0.3356, "step": 839, "tokens_per_device": 4587 }, { "epoch": 0.3356, "loss_ce": 0.021728482097387314, "loss_lvr": 0.26668888330459595, "loss_mode_switch": 0.0, "loss_total": 0.04839736968278885, "step": 839 }, { "batch_size": 1, "epoch": 0.3356, "step": 839, "tokens_per_device": 4866 }, { "epoch": 0.3356, "loss_ce": 0.0021621831692755222, "loss_lvr": 0.4169555902481079, "loss_mode_switch": 0.0, "loss_total": 0.043857745826244354, "step": 839 }, { "batch_size": 1, "epoch": 0.3356, "step": 839, "tokens_per_device": 4875 }, { "epoch": 0.3356, "loss_ce": 0.0026716699358075857, "loss_lvr": 0.6035526394844055, "loss_mode_switch": 0.0, "loss_total": 0.06302693486213684, "step": 839 }, { "batch_size": 1, "epoch": 0.3356, "step": 839, "tokens_per_device": 4741 }, { "epoch": 0.3356, "loss_ce": 0.0012561234179884195, "loss_lvr": 0.21531061828136444, "loss_mode_switch": 0.0, "loss_total": 0.02278718538582325, "step": 839 }, { "epoch": 0.336, "grad_norm": 1.326967477798462, "learning_rate": 7.739020481708816e-06, "loss": 0.266, "step": 840 }, { "batch_size": 4, "epoch": 0.336, "step": 840, "tokens_per_device": 4868 }, { "epoch": 0.336, "loss_ce": 0.00699594197794795, "loss_lvr": 0.6492356061935425, "loss_mode_switch": 0.0, "loss_total": 0.0719195008277893, "step": 840 }, { "batch_size": 4, "epoch": 0.336, "step": 840, "tokens_per_device": 1568 }, { "epoch": 0.336, "loss_ce": 0.22214102745056152, "loss_lvr": 0.8824365735054016, "loss_mode_switch": 0.0, "loss_total": 0.31038469076156616, "step": 840 }, { "batch_size": 1, "epoch": 0.336, "step": 840, "tokens_per_device": 5194 }, { "epoch": 0.336, "loss_ce": 0.05872497707605362, "loss_lvr": 0.2844397723674774, "loss_mode_switch": 0.0, "loss_total": 0.08716895431280136, "step": 840 }, { "batch_size": 1, "epoch": 0.336, "step": 840, "tokens_per_device": 5158 }, { "epoch": 0.336, "loss_ce": 0.01785164885222912, "loss_lvr": 0.5191976428031921, "loss_mode_switch": 0.0, "loss_total": 0.0697714164853096, "step": 840 }, { "batch_size": 1, "epoch": 0.336, "step": 840, "tokens_per_device": 5150 }, { "epoch": 0.336, "loss_ce": 0.011975171975791454, "loss_lvr": 0.4047122895717621, "loss_mode_switch": 0.0, "loss_total": 0.0524464026093483, "step": 840 }, { "batch_size": 4, "epoch": 0.336, "step": 840, "tokens_per_device": 1392 }, { "epoch": 0.336, "loss_ce": 0.634218156337738, "loss_lvr": 1.1175072193145752, "loss_mode_switch": 0.0, "loss_total": 0.7459688782691956, "step": 840 }, { "batch_size": 1, "epoch": 0.336, "step": 840, "tokens_per_device": 4929 }, { "epoch": 0.336, "loss_ce": 0.01581568457186222, "loss_lvr": 0.8088338971138, "loss_mode_switch": 0.0, "loss_total": 0.0966990739107132, "step": 840 }, { "batch_size": 4, "epoch": 0.336, "step": 840, "tokens_per_device": 4260 }, { "epoch": 0.336, "loss_ce": 0.14954422414302826, "loss_lvr": 1.2425076961517334, "loss_mode_switch": 0.0, "loss_total": 0.2737950086593628, "step": 840 }, { "epoch": 0.3364, "grad_norm": 1.1756430864334106, "learning_rate": 7.733599056332816e-06, "loss": 0.2486, "step": 841 }, { "batch_size": 4, "epoch": 0.3364, "step": 841, "tokens_per_device": 1428 }, { "epoch": 0.3364, "loss_ce": 0.1257747858762741, "loss_lvr": 1.058132529258728, "loss_mode_switch": 0.0, "loss_total": 0.23158803582191467, "step": 841 }, { "batch_size": 1, "epoch": 0.3364, "step": 841, "tokens_per_device": 5422 }, { "epoch": 0.3364, "loss_ce": 0.0007559954538010061, "loss_lvr": 0.3704932928085327, "loss_mode_switch": 0.0, "loss_total": 0.03780532628297806, "step": 841 }, { "batch_size": 4, "epoch": 0.3364, "step": 841, "tokens_per_device": 7120 }, { "epoch": 0.3364, "loss_ce": 0.05523815006017685, "loss_lvr": 0.7756156325340271, "loss_mode_switch": 0.0, "loss_total": 0.13279971480369568, "step": 841 }, { "batch_size": 4, "epoch": 0.3364, "step": 841, "tokens_per_device": 1296 }, { "epoch": 0.3364, "loss_ce": 0.24446360766887665, "loss_lvr": 1.0029629468917847, "loss_mode_switch": 0.0, "loss_total": 0.34475991129875183, "step": 841 }, { "batch_size": 4, "epoch": 0.3364, "step": 841, "tokens_per_device": 3800 }, { "epoch": 0.3364, "loss_ce": 0.1811763346195221, "loss_lvr": 1.0696617364883423, "loss_mode_switch": 0.0, "loss_total": 0.28814250230789185, "step": 841 }, { "batch_size": 4, "epoch": 0.3364, "step": 841, "tokens_per_device": 6628 }, { "epoch": 0.3364, "loss_ce": 0.2481372356414795, "loss_lvr": 0.8419721722602844, "loss_mode_switch": 0.0, "loss_total": 0.3323344588279724, "step": 841 }, { "batch_size": 4, "epoch": 0.3364, "step": 841, "tokens_per_device": 5168 }, { "epoch": 0.3364, "loss_ce": 0.014803956262767315, "loss_lvr": 0.7120126485824585, "loss_mode_switch": 0.0, "loss_total": 0.08600521832704544, "step": 841 }, { "batch_size": 1, "epoch": 0.3364, "step": 841, "tokens_per_device": 4913 }, { "epoch": 0.3364, "loss_ce": 0.025243820622563362, "loss_lvr": 0.3080262541770935, "loss_mode_switch": 0.0, "loss_total": 0.05604644864797592, "step": 841 }, { "epoch": 0.3368, "grad_norm": 1.1865370273590088, "learning_rate": 7.728173043087754e-06, "loss": 0.2768, "step": 842 }, { "batch_size": 1, "epoch": 0.3368, "step": 842, "tokens_per_device": 4831 }, { "epoch": 0.3368, "loss_ce": 0.006932056043297052, "loss_lvr": 0.18417373299598694, "loss_mode_switch": 0.0, "loss_total": 0.02534942887723446, "step": 842 }, { "batch_size": 1, "epoch": 0.3368, "step": 842, "tokens_per_device": 4895 }, { "epoch": 0.3368, "loss_ce": 0.01865665428340435, "loss_lvr": 0.6777825951576233, "loss_mode_switch": 0.0, "loss_total": 0.08643491566181183, "step": 842 }, { "batch_size": 1, "epoch": 0.3368, "step": 842, "tokens_per_device": 4762 }, { "epoch": 0.3368, "loss_ce": 0.009902013465762138, "loss_lvr": 0.3434290289878845, "loss_mode_switch": 0.0, "loss_total": 0.0442449152469635, "step": 842 }, { "batch_size": 4, "epoch": 0.3368, "step": 842, "tokens_per_device": 2732 }, { "epoch": 0.3368, "loss_ce": 0.4913889169692993, "loss_lvr": 0.8368781208992004, "loss_mode_switch": 0.0, "loss_total": 0.575076699256897, "step": 842 }, { "batch_size": 4, "epoch": 0.3368, "step": 842, "tokens_per_device": 1304 }, { "epoch": 0.3368, "loss_ce": 0.5196909308433533, "loss_lvr": 1.1692458391189575, "loss_mode_switch": 0.0, "loss_total": 0.636615514755249, "step": 842 }, { "batch_size": 1, "epoch": 0.3368, "step": 842, "tokens_per_device": 5187 }, { "epoch": 0.3368, "loss_ce": 0.0018082794267684221, "loss_lvr": 0.44871601462364197, "loss_mode_switch": 0.0, "loss_total": 0.04667988047003746, "step": 842 }, { "batch_size": 4, "epoch": 0.3368, "step": 842, "tokens_per_device": 4240 }, { "epoch": 0.3368, "loss_ce": 0.42991897463798523, "loss_lvr": 1.1233983039855957, "loss_mode_switch": 0.0, "loss_total": 0.5422587990760803, "step": 842 }, { "batch_size": 4, "epoch": 0.3368, "step": 842, "tokens_per_device": 1324 }, { "epoch": 0.3368, "loss_ce": 0.40322965383529663, "loss_lvr": 1.0422427654266357, "loss_mode_switch": 0.0, "loss_total": 0.5074539184570312, "step": 842 }, { "epoch": 0.3372, "grad_norm": 1.630384922027588, "learning_rate": 7.722742451080247e-06, "loss": 0.3569, "step": 843 }, { "batch_size": 1, "epoch": 0.3372, "step": 843, "tokens_per_device": 4899 }, { "epoch": 0.3372, "loss_ce": 0.22521759569644928, "loss_lvr": 0.4787653386592865, "loss_mode_switch": 0.0, "loss_total": 0.273094117641449, "step": 843 }, { "batch_size": 4, "epoch": 0.3372, "step": 843, "tokens_per_device": 5880 }, { "epoch": 0.3372, "loss_ce": 0.49266761541366577, "loss_lvr": 0.9057047367095947, "loss_mode_switch": 0.0, "loss_total": 0.5832380652427673, "step": 843 }, { "batch_size": 4, "epoch": 0.3372, "step": 843, "tokens_per_device": 5192 }, { "epoch": 0.3372, "loss_ce": 0.0735664889216423, "loss_lvr": 0.7424774169921875, "loss_mode_switch": 0.0, "loss_total": 0.14781422913074493, "step": 843 }, { "batch_size": 4, "epoch": 0.3372, "step": 843, "tokens_per_device": 4324 }, { "epoch": 0.3372, "loss_ce": 0.18112121522426605, "loss_lvr": 1.2595707178115845, "loss_mode_switch": 0.0, "loss_total": 0.3070783019065857, "step": 843 }, { "batch_size": 4, "epoch": 0.3372, "step": 843, "tokens_per_device": 5184 }, { "epoch": 0.3372, "loss_ce": 0.21283042430877686, "loss_lvr": 0.9025329947471619, "loss_mode_switch": 0.0, "loss_total": 0.30308371782302856, "step": 843 }, { "batch_size": 4, "epoch": 0.3372, "step": 843, "tokens_per_device": 4244 }, { "epoch": 0.3372, "loss_ce": 0.2918664515018463, "loss_lvr": 0.8258275389671326, "loss_mode_switch": 0.0, "loss_total": 0.3744491934776306, "step": 843 }, { "batch_size": 4, "epoch": 0.3372, "step": 843, "tokens_per_device": 1468 }, { "epoch": 0.3372, "loss_ce": 0.6877380013465881, "loss_lvr": 0.9438440203666687, "loss_mode_switch": 0.0, "loss_total": 0.7821223735809326, "step": 843 }, { "batch_size": 4, "epoch": 0.3372, "step": 843, "tokens_per_device": 4456 }, { "epoch": 0.3372, "loss_ce": 0.21546614170074463, "loss_lvr": 1.4012497663497925, "loss_mode_switch": 0.0, "loss_total": 0.3555911183357239, "step": 843 }, { "epoch": 0.3376, "grad_norm": 1.3612538576126099, "learning_rate": 7.717307289424594e-06, "loss": 0.3004, "step": 844 }, { "batch_size": 4, "epoch": 0.3376, "step": 844, "tokens_per_device": 5340 }, { "epoch": 0.3376, "loss_ce": 0.6306666135787964, "loss_lvr": 0.8286725878715515, "loss_mode_switch": 0.0, "loss_total": 0.713533878326416, "step": 844 }, { "batch_size": 4, "epoch": 0.3376, "step": 844, "tokens_per_device": 4852 }, { "epoch": 0.3376, "loss_ce": 0.07951284945011139, "loss_lvr": 1.1743645668029785, "loss_mode_switch": 0.0, "loss_total": 0.196949303150177, "step": 844 }, { "batch_size": 4, "epoch": 0.3376, "step": 844, "tokens_per_device": 4284 }, { "epoch": 0.3376, "loss_ce": 0.3569517135620117, "loss_lvr": 1.9131591320037842, "loss_mode_switch": 0.0, "loss_total": 0.5482676029205322, "step": 844 }, { "batch_size": 4, "epoch": 0.3376, "step": 844, "tokens_per_device": 4384 }, { "epoch": 0.3376, "loss_ce": 0.4168553948402405, "loss_lvr": 0.6462950706481934, "loss_mode_switch": 0.0, "loss_total": 0.48148488998413086, "step": 844 }, { "batch_size": 4, "epoch": 0.3376, "step": 844, "tokens_per_device": 4892 }, { "epoch": 0.3376, "loss_ce": 0.02194383554160595, "loss_lvr": 1.1798770427703857, "loss_mode_switch": 0.0, "loss_total": 0.1399315446615219, "step": 844 }, { "batch_size": 1, "epoch": 0.3376, "step": 844, "tokens_per_device": 4737 }, { "epoch": 0.3376, "loss_ce": 0.04580792412161827, "loss_lvr": 0.45114579796791077, "loss_mode_switch": 0.0, "loss_total": 0.09092250466346741, "step": 844 }, { "batch_size": 1, "epoch": 0.3376, "step": 844, "tokens_per_device": 5118 }, { "epoch": 0.3376, "loss_ce": 0.009838170371949673, "loss_lvr": 0.2518082857131958, "loss_mode_switch": 0.0, "loss_total": 0.03501899912953377, "step": 844 }, { "batch_size": 4, "epoch": 0.3376, "step": 844, "tokens_per_device": 4328 }, { "epoch": 0.3376, "loss_ce": 0.012103384360671043, "loss_lvr": 0.8624439835548401, "loss_mode_switch": 0.0, "loss_total": 0.09834778308868408, "step": 844 }, { "epoch": 0.338, "grad_norm": 1.7585128545761108, "learning_rate": 7.711867567242769e-06, "loss": 0.3873, "step": 845 }, { "batch_size": 4, "epoch": 0.338, "step": 845, "tokens_per_device": 4060 }, { "epoch": 0.338, "loss_ce": 0.46350812911987305, "loss_lvr": 0.8745055794715881, "loss_mode_switch": 0.0, "loss_total": 0.5509586930274963, "step": 845 }, { "batch_size": 4, "epoch": 0.338, "step": 845, "tokens_per_device": 6848 }, { "epoch": 0.338, "loss_ce": 0.049344390630722046, "loss_lvr": 0.7478407621383667, "loss_mode_switch": 0.0, "loss_total": 0.12412846833467484, "step": 845 }, { "batch_size": 1, "epoch": 0.338, "step": 845, "tokens_per_device": 5041 }, { "epoch": 0.338, "loss_ce": 0.786012589931488, "loss_lvr": 0.20045633614063263, "loss_mode_switch": 0.0, "loss_total": 0.8060582280158997, "step": 845 }, { "batch_size": 4, "epoch": 0.338, "step": 845, "tokens_per_device": 5096 }, { "epoch": 0.338, "loss_ce": 0.5319749116897583, "loss_lvr": 1.0447295904159546, "loss_mode_switch": 0.0, "loss_total": 0.6364478468894958, "step": 845 }, { "batch_size": 4, "epoch": 0.338, "step": 845, "tokens_per_device": 1300 }, { "epoch": 0.338, "loss_ce": 1.0186649560928345, "loss_lvr": 1.0274783372879028, "loss_mode_switch": 0.0, "loss_total": 1.121412754058838, "step": 845 }, { "batch_size": 4, "epoch": 0.338, "step": 845, "tokens_per_device": 4756 }, { "epoch": 0.338, "loss_ce": 0.12750013172626495, "loss_lvr": 0.7707705497741699, "loss_mode_switch": 0.0, "loss_total": 0.20457717776298523, "step": 845 }, { "batch_size": 4, "epoch": 0.338, "step": 845, "tokens_per_device": 2780 }, { "epoch": 0.338, "loss_ce": 0.05224086716771126, "loss_lvr": 0.8758701682090759, "loss_mode_switch": 0.0, "loss_total": 0.1398278921842575, "step": 845 }, { "batch_size": 4, "epoch": 0.338, "step": 845, "tokens_per_device": 5964 }, { "epoch": 0.338, "loss_ce": 0.1633904129266739, "loss_lvr": 0.8192316293716431, "loss_mode_switch": 0.0, "loss_total": 0.2453135848045349, "step": 845 }, { "epoch": 0.3384, "grad_norm": 1.47087824344635, "learning_rate": 7.70642329366439e-06, "loss": 0.3454, "step": 846 }, { "batch_size": 4, "epoch": 0.3384, "step": 846, "tokens_per_device": 3808 }, { "epoch": 0.3384, "loss_ce": 0.35182350873947144, "loss_lvr": 1.0726914405822754, "loss_mode_switch": 0.0, "loss_total": 0.4590926468372345, "step": 846 }, { "batch_size": 1, "epoch": 0.3384, "step": 846, "tokens_per_device": 5188 }, { "epoch": 0.3384, "loss_ce": 0.045698489993810654, "loss_lvr": 0.45178356766700745, "loss_mode_switch": 0.0, "loss_total": 0.09087684750556946, "step": 846 }, { "batch_size": 1, "epoch": 0.3384, "step": 846, "tokens_per_device": 5069 }, { "epoch": 0.3384, "loss_ce": 0.010935359634459019, "loss_lvr": 0.7082172632217407, "loss_mode_switch": 0.0, "loss_total": 0.0817570835351944, "step": 846 }, { "batch_size": 4, "epoch": 0.3384, "step": 846, "tokens_per_device": 15384 }, { "epoch": 0.3384, "loss_ce": 0.07738738507032394, "loss_lvr": 0.7383562922477722, "loss_mode_switch": 0.0, "loss_total": 0.15122301876544952, "step": 846 }, { "batch_size": 4, "epoch": 0.3384, "step": 846, "tokens_per_device": 6128 }, { "epoch": 0.3384, "loss_ce": 0.3463294506072998, "loss_lvr": 0.8941336274147034, "loss_mode_switch": 0.0, "loss_total": 0.4357428252696991, "step": 846 }, { "batch_size": 4, "epoch": 0.3384, "step": 846, "tokens_per_device": 1428 }, { "epoch": 0.3384, "loss_ce": 0.27507635951042175, "loss_lvr": 1.8168622255325317, "loss_mode_switch": 0.0, "loss_total": 0.4567625820636749, "step": 846 }, { "batch_size": 4, "epoch": 0.3384, "step": 846, "tokens_per_device": 10960 }, { "epoch": 0.3384, "loss_ce": 0.7282162308692932, "loss_lvr": 0.5068796873092651, "loss_mode_switch": 0.0, "loss_total": 0.7789041996002197, "step": 846 }, { "batch_size": 1, "epoch": 0.3384, "step": 846, "tokens_per_device": 4973 }, { "epoch": 0.3384, "loss_ce": 0.24817761778831482, "loss_lvr": 0.732844889163971, "loss_mode_switch": 0.0, "loss_total": 0.32146209478378296, "step": 846 }, { "epoch": 0.3388, "grad_norm": 1.3569908142089844, "learning_rate": 7.70097447782673e-06, "loss": 0.2672, "step": 847 }, { "batch_size": 4, "epoch": 0.3388, "step": 847, "tokens_per_device": 5476 }, { "epoch": 0.3388, "loss_ce": 0.12442123889923096, "loss_lvr": 0.777269184589386, "loss_mode_switch": 0.0, "loss_total": 0.2021481692790985, "step": 847 }, { "batch_size": 4, "epoch": 0.3388, "step": 847, "tokens_per_device": 4348 }, { "epoch": 0.3388, "loss_ce": 0.242850199341774, "loss_lvr": 0.8814707398414612, "loss_mode_switch": 0.0, "loss_total": 0.3309972882270813, "step": 847 }, { "batch_size": 4, "epoch": 0.3388, "step": 847, "tokens_per_device": 7304 }, { "epoch": 0.3388, "loss_ce": 0.11818786710500717, "loss_lvr": 0.6967149972915649, "loss_mode_switch": 0.0, "loss_total": 0.18785937130451202, "step": 847 }, { "batch_size": 4, "epoch": 0.3388, "step": 847, "tokens_per_device": 3976 }, { "epoch": 0.3388, "loss_ce": 0.5781505107879639, "loss_lvr": 0.9297364950180054, "loss_mode_switch": 0.0, "loss_total": 0.6711241602897644, "step": 847 }, { "batch_size": 4, "epoch": 0.3388, "step": 847, "tokens_per_device": 3456 }, { "epoch": 0.3388, "loss_ce": 0.31766170263290405, "loss_lvr": 1.018636703491211, "loss_mode_switch": 0.0, "loss_total": 0.4195253849029541, "step": 847 }, { "batch_size": 4, "epoch": 0.3388, "step": 847, "tokens_per_device": 5416 }, { "epoch": 0.3388, "loss_ce": 0.44820281863212585, "loss_lvr": 0.8906062245368958, "loss_mode_switch": 0.0, "loss_total": 0.5372634530067444, "step": 847 }, { "batch_size": 4, "epoch": 0.3388, "step": 847, "tokens_per_device": 4264 }, { "epoch": 0.3388, "loss_ce": 0.2781037986278534, "loss_lvr": 0.92770916223526, "loss_mode_switch": 0.0, "loss_total": 0.37087470293045044, "step": 847 }, { "batch_size": 4, "epoch": 0.3388, "step": 847, "tokens_per_device": 1736 }, { "epoch": 0.3388, "loss_ce": 0.6405571103096008, "loss_lvr": 0.828848123550415, "loss_mode_switch": 0.0, "loss_total": 0.7234418988227844, "step": 847 }, { "epoch": 0.3392, "grad_norm": 1.9453593492507935, "learning_rate": 7.695521128874669e-06, "loss": 0.3094, "step": 848 }, { "batch_size": 4, "epoch": 0.3392, "step": 848, "tokens_per_device": 2568 }, { "epoch": 0.3392, "loss_ce": 0.37313657999038696, "loss_lvr": 0.954509437084198, "loss_mode_switch": 0.0, "loss_total": 0.4685875177383423, "step": 848 }, { "batch_size": 1, "epoch": 0.3392, "step": 848, "tokens_per_device": 4753 }, { "epoch": 0.3392, "loss_ce": 0.022885696962475777, "loss_lvr": 0.34559157490730286, "loss_mode_switch": 0.0, "loss_total": 0.05744485557079315, "step": 848 }, { "batch_size": 4, "epoch": 0.3392, "step": 848, "tokens_per_device": 4356 }, { "epoch": 0.3392, "loss_ce": 0.6371268630027771, "loss_lvr": 0.606350302696228, "loss_mode_switch": 0.0, "loss_total": 0.6977618932723999, "step": 848 }, { "batch_size": 4, "epoch": 0.3392, "step": 848, "tokens_per_device": 4668 }, { "epoch": 0.3392, "loss_ce": 0.1332792490720749, "loss_lvr": 0.9884060025215149, "loss_mode_switch": 0.0, "loss_total": 0.2321198582649231, "step": 848 }, { "batch_size": 1, "epoch": 0.3392, "step": 848, "tokens_per_device": 5034 }, { "epoch": 0.3392, "loss_ce": 0.005932206753641367, "loss_lvr": 1.178722620010376, "loss_mode_switch": 0.0, "loss_total": 0.12380446493625641, "step": 848 }, { "batch_size": 1, "epoch": 0.3392, "step": 848, "tokens_per_device": 4880 }, { "epoch": 0.3392, "loss_ce": 0.040934205055236816, "loss_lvr": 0.41879892349243164, "loss_mode_switch": 0.0, "loss_total": 0.08281409740447998, "step": 848 }, { "batch_size": 4, "epoch": 0.3392, "step": 848, "tokens_per_device": 2728 }, { "epoch": 0.3392, "loss_ce": 0.5362914800643921, "loss_lvr": 0.7321529388427734, "loss_mode_switch": 0.0, "loss_total": 0.6095067858695984, "step": 848 }, { "batch_size": 1, "epoch": 0.3392, "step": 848, "tokens_per_device": 4874 }, { "epoch": 0.3392, "loss_ce": 0.12187252193689346, "loss_lvr": 0.4558798670768738, "loss_mode_switch": 0.0, "loss_total": 0.16746050119400024, "step": 848 }, { "epoch": 0.3396, "grad_norm": 1.4140499830245972, "learning_rate": 7.690063255960702e-06, "loss": 0.3027, "step": 849 }, { "batch_size": 4, "epoch": 0.3396, "step": 849, "tokens_per_device": 5948 }, { "epoch": 0.3396, "loss_ce": 0.0216326043009758, "loss_lvr": 0.8101030588150024, "loss_mode_switch": 0.0, "loss_total": 0.10264290869235992, "step": 849 }, { "batch_size": 4, "epoch": 0.3396, "step": 849, "tokens_per_device": 6776 }, { "epoch": 0.3396, "loss_ce": 0.15282058715820312, "loss_lvr": 0.7554028034210205, "loss_mode_switch": 0.0, "loss_total": 0.2283608615398407, "step": 849 }, { "batch_size": 4, "epoch": 0.3396, "step": 849, "tokens_per_device": 2560 }, { "epoch": 0.3396, "loss_ce": 0.09709543734788895, "loss_lvr": 1.0150041580200195, "loss_mode_switch": 0.0, "loss_total": 0.19859585165977478, "step": 849 }, { "batch_size": 4, "epoch": 0.3396, "step": 849, "tokens_per_device": 4220 }, { "epoch": 0.3396, "loss_ce": 0.023629212751984596, "loss_lvr": 0.9225999712944031, "loss_mode_switch": 0.0, "loss_total": 0.11588920652866364, "step": 849 }, { "batch_size": 1, "epoch": 0.3396, "step": 849, "tokens_per_device": 4768 }, { "epoch": 0.3396, "loss_ce": 0.17436368763446808, "loss_lvr": 0.3530865013599396, "loss_mode_switch": 0.0, "loss_total": 0.20967233180999756, "step": 849 }, { "batch_size": 4, "epoch": 0.3396, "step": 849, "tokens_per_device": 1220 }, { "epoch": 0.3396, "loss_ce": 0.16484424471855164, "loss_lvr": 0.967877984046936, "loss_mode_switch": 0.0, "loss_total": 0.2616320550441742, "step": 849 }, { "batch_size": 4, "epoch": 0.3396, "step": 849, "tokens_per_device": 5000 }, { "epoch": 0.3396, "loss_ce": 0.10790272057056427, "loss_lvr": 0.7545348405838013, "loss_mode_switch": 0.0, "loss_total": 0.18335619568824768, "step": 849 }, { "batch_size": 4, "epoch": 0.3396, "step": 849, "tokens_per_device": 1300 }, { "epoch": 0.3396, "loss_ce": 0.6247756481170654, "loss_lvr": 1.092035174369812, "loss_mode_switch": 0.0, "loss_total": 0.7339791655540466, "step": 849 }, { "epoch": 0.34, "grad_norm": 1.435257077217102, "learning_rate": 7.68460086824492e-06, "loss": 0.3288, "step": 850 }, { "batch_size": 4, "epoch": 0.34, "step": 850, "tokens_per_device": 4280 }, { "epoch": 0.34, "loss_ce": 0.6743955016136169, "loss_lvr": 1.1806082725524902, "loss_mode_switch": 0.0, "loss_total": 0.792456328868866, "step": 850 }, { "batch_size": 1, "epoch": 0.34, "step": 850, "tokens_per_device": 4849 }, { "epoch": 0.34, "loss_ce": 0.12156199663877487, "loss_lvr": 2.0428261756896973, "loss_mode_switch": 0.0, "loss_total": 0.3258446156978607, "step": 850 }, { "batch_size": 1, "epoch": 0.34, "step": 850, "tokens_per_device": 5214 }, { "epoch": 0.34, "loss_ce": 0.5285210609436035, "loss_lvr": 0.43497517704963684, "loss_mode_switch": 0.0, "loss_total": 0.572018563747406, "step": 850 }, { "batch_size": 4, "epoch": 0.34, "step": 850, "tokens_per_device": 5968 }, { "epoch": 0.34, "loss_ce": 0.14491841197013855, "loss_lvr": 0.8646653890609741, "loss_mode_switch": 0.0, "loss_total": 0.23138496279716492, "step": 850 }, { "batch_size": 4, "epoch": 0.34, "step": 850, "tokens_per_device": 7592 }, { "epoch": 0.34, "loss_ce": 0.21444499492645264, "loss_lvr": 0.7746597528457642, "loss_mode_switch": 0.0, "loss_total": 0.29191097617149353, "step": 850 }, { "batch_size": 1, "epoch": 0.34, "step": 850, "tokens_per_device": 5175 }, { "epoch": 0.34, "loss_ce": 1.407686471939087, "loss_lvr": 0.7290331125259399, "loss_mode_switch": 0.0, "loss_total": 1.480589747428894, "step": 850 }, { "batch_size": 1, "epoch": 0.34, "step": 850, "tokens_per_device": 4789 }, { "epoch": 0.34, "loss_ce": 0.33703187108039856, "loss_lvr": 0.5077329277992249, "loss_mode_switch": 0.0, "loss_total": 0.38780516386032104, "step": 850 }, { "batch_size": 4, "epoch": 0.34, "step": 850, "tokens_per_device": 13396 }, { "epoch": 0.34, "loss_ce": 0.8669202923774719, "loss_lvr": 0.5037238001823425, "loss_mode_switch": 0.0, "loss_total": 0.9172926545143127, "step": 850 }, { "epoch": 0.3404, "grad_norm": 1.5710718631744385, "learning_rate": 7.679133974894984e-06, "loss": 0.3335, "step": 851 }, { "batch_size": 4, "epoch": 0.3404, "step": 851, "tokens_per_device": 6468 }, { "epoch": 0.3404, "loss_ce": 0.2061610221862793, "loss_lvr": 0.9269669651985168, "loss_mode_switch": 0.0, "loss_total": 0.298857718706131, "step": 851 }, { "batch_size": 4, "epoch": 0.3404, "step": 851, "tokens_per_device": 4800 }, { "epoch": 0.3404, "loss_ce": 0.16166406869888306, "loss_lvr": 0.7225231528282166, "loss_mode_switch": 0.0, "loss_total": 0.23391638696193695, "step": 851 }, { "batch_size": 4, "epoch": 0.3404, "step": 851, "tokens_per_device": 1300 }, { "epoch": 0.3404, "loss_ce": 0.213088259100914, "loss_lvr": 1.2003436088562012, "loss_mode_switch": 0.0, "loss_total": 0.3331226110458374, "step": 851 }, { "batch_size": 4, "epoch": 0.3404, "step": 851, "tokens_per_device": 7208 }, { "epoch": 0.3404, "loss_ce": 0.16360262036323547, "loss_lvr": 1.0671433210372925, "loss_mode_switch": 0.0, "loss_total": 0.2703169584274292, "step": 851 }, { "batch_size": 4, "epoch": 0.3404, "step": 851, "tokens_per_device": 2688 }, { "epoch": 0.3404, "loss_ce": 0.5168652534484863, "loss_lvr": 0.6105524301528931, "loss_mode_switch": 0.0, "loss_total": 0.5779204964637756, "step": 851 }, { "batch_size": 1, "epoch": 0.3404, "step": 851, "tokens_per_device": 4416 }, { "epoch": 0.3404, "loss_ce": 0.017730120569467545, "loss_lvr": 0.4294460117816925, "loss_mode_switch": 0.0, "loss_total": 0.060674723237752914, "step": 851 }, { "batch_size": 4, "epoch": 0.3404, "step": 851, "tokens_per_device": 1912 }, { "epoch": 0.3404, "loss_ce": 0.6289018988609314, "loss_lvr": 1.041240930557251, "loss_mode_switch": 0.0, "loss_total": 0.7330259680747986, "step": 851 }, { "batch_size": 4, "epoch": 0.3404, "step": 851, "tokens_per_device": 4880 }, { "epoch": 0.3404, "loss_ce": 0.08635176718235016, "loss_lvr": 0.9542903304100037, "loss_mode_switch": 0.0, "loss_total": 0.18178080022335052, "step": 851 }, { "epoch": 0.3408, "grad_norm": 1.2981170415878296, "learning_rate": 7.673662585086123e-06, "loss": 0.3063, "step": 852 }, { "batch_size": 4, "epoch": 0.3408, "step": 852, "tokens_per_device": 3952 }, { "epoch": 0.3408, "loss_ce": 0.20266707241535187, "loss_lvr": 1.2925190925598145, "loss_mode_switch": 0.0, "loss_total": 0.33191898465156555, "step": 852 }, { "batch_size": 4, "epoch": 0.3408, "step": 852, "tokens_per_device": 1456 }, { "epoch": 0.3408, "loss_ce": 0.5415688753128052, "loss_lvr": 0.969668447971344, "loss_mode_switch": 0.0, "loss_total": 0.638535737991333, "step": 852 }, { "batch_size": 4, "epoch": 0.3408, "step": 852, "tokens_per_device": 2560 }, { "epoch": 0.3408, "loss_ce": 0.21835608780384064, "loss_lvr": 0.8954157829284668, "loss_mode_switch": 0.0, "loss_total": 0.3078976571559906, "step": 852 }, { "batch_size": 1, "epoch": 0.3408, "step": 852, "tokens_per_device": 5139 }, { "epoch": 0.3408, "loss_ce": 0.0028780463617295027, "loss_lvr": 0.41759172081947327, "loss_mode_switch": 0.0, "loss_total": 0.04463722184300423, "step": 852 }, { "batch_size": 4, "epoch": 0.3408, "step": 852, "tokens_per_device": 4972 }, { "epoch": 0.3408, "loss_ce": 0.17910397052764893, "loss_lvr": 0.8357031345367432, "loss_mode_switch": 0.0, "loss_total": 0.2626742720603943, "step": 852 }, { "batch_size": 4, "epoch": 0.3408, "step": 852, "tokens_per_device": 6428 }, { "epoch": 0.3408, "loss_ce": 0.3211650550365448, "loss_lvr": 0.5964046120643616, "loss_mode_switch": 0.0, "loss_total": 0.38080552220344543, "step": 852 }, { "batch_size": 1, "epoch": 0.3408, "step": 852, "tokens_per_device": 4136 }, { "epoch": 0.3408, "loss_ce": 2.7007598876953125, "loss_lvr": 0.4538106322288513, "loss_mode_switch": 0.0, "loss_total": 2.746140956878662, "step": 852 }, { "batch_size": 4, "epoch": 0.3408, "step": 852, "tokens_per_device": 5776 }, { "epoch": 0.3408, "loss_ce": 0.14842675626277924, "loss_lvr": 0.9687275290489197, "loss_mode_switch": 0.0, "loss_total": 0.24529951810836792, "step": 852 }, { "epoch": 0.3412, "grad_norm": 1.3719329833984375, "learning_rate": 7.668186708001106e-06, "loss": 0.2956, "step": 853 }, { "batch_size": 1, "epoch": 0.3412, "step": 853, "tokens_per_device": 5093 }, { "epoch": 0.3412, "loss_ce": 0.015290645882487297, "loss_lvr": 0.27913790941238403, "loss_mode_switch": 0.0, "loss_total": 0.04320443794131279, "step": 853 }, { "batch_size": 4, "epoch": 0.3412, "step": 853, "tokens_per_device": 4020 }, { "epoch": 0.3412, "loss_ce": 0.5819152593612671, "loss_lvr": 0.8694693446159363, "loss_mode_switch": 0.0, "loss_total": 0.6688622236251831, "step": 853 }, { "batch_size": 1, "epoch": 0.3412, "step": 853, "tokens_per_device": 4955 }, { "epoch": 0.3412, "loss_ce": 0.1017720103263855, "loss_lvr": 0.4163830876350403, "loss_mode_switch": 0.0, "loss_total": 0.143410325050354, "step": 853 }, { "batch_size": 1, "epoch": 0.3412, "step": 853, "tokens_per_device": 4868 }, { "epoch": 0.3412, "loss_ce": 0.05635157227516174, "loss_lvr": 0.3005751371383667, "loss_mode_switch": 0.0, "loss_total": 0.0864090844988823, "step": 853 }, { "batch_size": 1, "epoch": 0.3412, "step": 853, "tokens_per_device": 5158 }, { "epoch": 0.3412, "loss_ce": 0.07886248826980591, "loss_lvr": 0.4926794469356537, "loss_mode_switch": 0.0, "loss_total": 0.12813043594360352, "step": 853 }, { "batch_size": 4, "epoch": 0.3412, "step": 853, "tokens_per_device": 4384 }, { "epoch": 0.3412, "loss_ce": 0.48907408118247986, "loss_lvr": 0.8573793768882751, "loss_mode_switch": 0.0, "loss_total": 0.5748119950294495, "step": 853 }, { "batch_size": 4, "epoch": 0.3412, "step": 853, "tokens_per_device": 4156 }, { "epoch": 0.3412, "loss_ce": 0.4152822196483612, "loss_lvr": 0.9049482941627502, "loss_mode_switch": 0.0, "loss_total": 0.5057770609855652, "step": 853 }, { "batch_size": 4, "epoch": 0.3412, "step": 853, "tokens_per_device": 3988 }, { "epoch": 0.3412, "loss_ce": 0.10266203433275223, "loss_lvr": 1.5483485460281372, "loss_mode_switch": 0.0, "loss_total": 0.2574968934059143, "step": 853 }, { "epoch": 0.3416, "grad_norm": 1.2579472064971924, "learning_rate": 7.662706352830244e-06, "loss": 0.2895, "step": 854 }, { "batch_size": 1, "epoch": 0.3416, "step": 854, "tokens_per_device": 4927 }, { "epoch": 0.3416, "loss_ce": 0.1965155005455017, "loss_lvr": 0.6119498014450073, "loss_mode_switch": 0.0, "loss_total": 0.2577104866504669, "step": 854 }, { "batch_size": 4, "epoch": 0.3416, "step": 854, "tokens_per_device": 9228 }, { "epoch": 0.3416, "loss_ce": 0.1590805947780609, "loss_lvr": 1.1274231672286987, "loss_mode_switch": 0.0, "loss_total": 0.2718229293823242, "step": 854 }, { "batch_size": 1, "epoch": 0.3416, "step": 854, "tokens_per_device": 5058 }, { "epoch": 0.3416, "loss_ce": 0.004921374376863241, "loss_lvr": 0.1865944266319275, "loss_mode_switch": 0.0, "loss_total": 0.023580817505717278, "step": 854 }, { "batch_size": 4, "epoch": 0.3416, "step": 854, "tokens_per_device": 4284 }, { "epoch": 0.3416, "loss_ce": 0.3800660967826843, "loss_lvr": 0.8814833164215088, "loss_mode_switch": 0.0, "loss_total": 0.4682144224643707, "step": 854 }, { "batch_size": 1, "epoch": 0.3416, "step": 854, "tokens_per_device": 4870 }, { "epoch": 0.3416, "loss_ce": 0.002083170460537076, "loss_lvr": 0.2707882523536682, "loss_mode_switch": 0.0, "loss_total": 0.029161997139453888, "step": 854 }, { "batch_size": 1, "epoch": 0.3416, "step": 854, "tokens_per_device": 4955 }, { "epoch": 0.3416, "loss_ce": 0.3829247057437897, "loss_lvr": 0.2794676125049591, "loss_mode_switch": 0.0, "loss_total": 0.4108714759349823, "step": 854 }, { "batch_size": 1, "epoch": 0.3416, "step": 854, "tokens_per_device": 5111 }, { "epoch": 0.3416, "loss_ce": 0.0016032038256525993, "loss_lvr": 0.31026506423950195, "loss_mode_switch": 0.0, "loss_total": 0.03262970969080925, "step": 854 }, { "batch_size": 4, "epoch": 0.3416, "step": 854, "tokens_per_device": 4148 }, { "epoch": 0.3416, "loss_ce": 0.13958342373371124, "loss_lvr": 0.6884628534317017, "loss_mode_switch": 0.0, "loss_total": 0.2084297090768814, "step": 854 }, { "epoch": 0.342, "grad_norm": 1.57426917552948, "learning_rate": 7.657221528771352e-06, "loss": 0.2935, "step": 855 }, { "batch_size": 4, "epoch": 0.342, "step": 855, "tokens_per_device": 2840 }, { "epoch": 0.342, "loss_ce": 0.11691293120384216, "loss_lvr": 0.7651837468147278, "loss_mode_switch": 0.0, "loss_total": 0.1934313178062439, "step": 855 }, { "batch_size": 1, "epoch": 0.342, "step": 855, "tokens_per_device": 5121 }, { "epoch": 0.342, "loss_ce": 0.01757264882326126, "loss_lvr": 0.4230792820453644, "loss_mode_switch": 0.0, "loss_total": 0.0598805770277977, "step": 855 }, { "batch_size": 4, "epoch": 0.342, "step": 855, "tokens_per_device": 2572 }, { "epoch": 0.342, "loss_ce": 0.09326431155204773, "loss_lvr": 1.0100610256195068, "loss_mode_switch": 0.0, "loss_total": 0.19427041709423065, "step": 855 }, { "batch_size": 4, "epoch": 0.342, "step": 855, "tokens_per_device": 6132 }, { "epoch": 0.342, "loss_ce": 0.14690467715263367, "loss_lvr": 0.9865498542785645, "loss_mode_switch": 0.0, "loss_total": 0.2455596625804901, "step": 855 }, { "batch_size": 4, "epoch": 0.342, "step": 855, "tokens_per_device": 2676 }, { "epoch": 0.342, "loss_ce": 0.5921019315719604, "loss_lvr": 0.8406853675842285, "loss_mode_switch": 0.0, "loss_total": 0.6761704683303833, "step": 855 }, { "batch_size": 4, "epoch": 0.342, "step": 855, "tokens_per_device": 5388 }, { "epoch": 0.342, "loss_ce": 0.5045680999755859, "loss_lvr": 0.3951791822910309, "loss_mode_switch": 0.0, "loss_total": 0.5440860390663147, "step": 855 }, { "batch_size": 1, "epoch": 0.342, "step": 855, "tokens_per_device": 6449 }, { "epoch": 0.342, "loss_ce": 0.027984963729977608, "loss_lvr": 0.4686034321784973, "loss_mode_switch": 0.0, "loss_total": 0.07484530657529831, "step": 855 }, { "batch_size": 4, "epoch": 0.342, "step": 855, "tokens_per_device": 1560 }, { "epoch": 0.342, "loss_ce": 0.45232513546943665, "loss_lvr": 0.9539239406585693, "loss_mode_switch": 0.0, "loss_total": 0.5477175116539001, "step": 855 }, { "epoch": 0.3424, "grad_norm": 1.4175853729248047, "learning_rate": 7.651732245029753e-06, "loss": 0.3481, "step": 856 }, { "batch_size": 4, "epoch": 0.3424, "step": 856, "tokens_per_device": 4256 }, { "epoch": 0.3424, "loss_ce": 0.055440306663513184, "loss_lvr": 0.7217806577682495, "loss_mode_switch": 0.0, "loss_total": 0.12761837244033813, "step": 856 }, { "batch_size": 1, "epoch": 0.3424, "step": 856, "tokens_per_device": 6452 }, { "epoch": 0.3424, "loss_ce": 0.006848943419754505, "loss_lvr": 0.37508758902549744, "loss_mode_switch": 0.0, "loss_total": 0.044357702136039734, "step": 856 }, { "batch_size": 1, "epoch": 0.3424, "step": 856, "tokens_per_device": 4772 }, { "epoch": 0.3424, "loss_ce": 0.01171238161623478, "loss_lvr": 0.527521550655365, "loss_mode_switch": 0.0, "loss_total": 0.06446453928947449, "step": 856 }, { "batch_size": 4, "epoch": 0.3424, "step": 856, "tokens_per_device": 5772 }, { "epoch": 0.3424, "loss_ce": 0.2595096230506897, "loss_lvr": 0.918973982334137, "loss_mode_switch": 0.0, "loss_total": 0.3514070212841034, "step": 856 }, { "batch_size": 1, "epoch": 0.3424, "step": 856, "tokens_per_device": 4920 }, { "epoch": 0.3424, "loss_ce": 0.11446943134069443, "loss_lvr": 0.9566242098808289, "loss_mode_switch": 0.0, "loss_total": 0.21013185381889343, "step": 856 }, { "batch_size": 4, "epoch": 0.3424, "step": 856, "tokens_per_device": 4164 }, { "epoch": 0.3424, "loss_ce": 0.5667263269424438, "loss_lvr": 0.8351839780807495, "loss_mode_switch": 0.0, "loss_total": 0.6502447128295898, "step": 856 }, { "batch_size": 1, "epoch": 0.3424, "step": 856, "tokens_per_device": 4882 }, { "epoch": 0.3424, "loss_ce": 0.007546587847173214, "loss_lvr": 0.6517508625984192, "loss_mode_switch": 0.0, "loss_total": 0.07272167503833771, "step": 856 }, { "batch_size": 4, "epoch": 0.3424, "step": 856, "tokens_per_device": 8372 }, { "epoch": 0.3424, "loss_ce": 0.17191952466964722, "loss_lvr": 1.0560938119888306, "loss_mode_switch": 0.0, "loss_total": 0.27752891182899475, "step": 856 }, { "epoch": 0.3428, "grad_norm": 1.2177342176437378, "learning_rate": 7.646238510818249e-06, "loss": 0.3028, "step": 857 }, { "batch_size": 1, "epoch": 0.3428, "step": 857, "tokens_per_device": 5063 }, { "epoch": 0.3428, "loss_ce": 0.12211739271879196, "loss_lvr": 0.37884852290153503, "loss_mode_switch": 0.0, "loss_total": 0.16000224649906158, "step": 857 }, { "batch_size": 4, "epoch": 0.3428, "step": 857, "tokens_per_device": 1484 }, { "epoch": 0.3428, "loss_ce": 0.40319138765335083, "loss_lvr": 1.8571528196334839, "loss_mode_switch": 0.0, "loss_total": 0.5889066457748413, "step": 857 }, { "batch_size": 1, "epoch": 0.3428, "step": 857, "tokens_per_device": 5011 }, { "epoch": 0.3428, "loss_ce": 0.058719366788864136, "loss_lvr": 0.44195860624313354, "loss_mode_switch": 0.0, "loss_total": 0.10291522741317749, "step": 857 }, { "batch_size": 1, "epoch": 0.3428, "step": 857, "tokens_per_device": 6747 }, { "epoch": 0.3428, "loss_ce": 0.23207229375839233, "loss_lvr": 0.4511035084724426, "loss_mode_switch": 0.0, "loss_total": 0.2771826386451721, "step": 857 }, { "batch_size": 1, "epoch": 0.3428, "step": 857, "tokens_per_device": 4902 }, { "epoch": 0.3428, "loss_ce": 0.11606693267822266, "loss_lvr": 0.2318449467420578, "loss_mode_switch": 0.0, "loss_total": 0.13925142586231232, "step": 857 }, { "batch_size": 4, "epoch": 0.3428, "step": 857, "tokens_per_device": 6216 }, { "epoch": 0.3428, "loss_ce": 0.12058211863040924, "loss_lvr": 0.8627439737319946, "loss_mode_switch": 0.0, "loss_total": 0.20685651898384094, "step": 857 }, { "batch_size": 1, "epoch": 0.3428, "step": 857, "tokens_per_device": 7706 }, { "epoch": 0.3428, "loss_ce": 0.0011335816234350204, "loss_lvr": 0.3500642776489258, "loss_mode_switch": 0.0, "loss_total": 0.03614000976085663, "step": 857 }, { "batch_size": 4, "epoch": 0.3428, "step": 857, "tokens_per_device": 1368 }, { "epoch": 0.3428, "loss_ce": 0.41516992449760437, "loss_lvr": 1.3508857488632202, "loss_mode_switch": 0.0, "loss_total": 0.5502585172653198, "step": 857 }, { "epoch": 0.3432, "grad_norm": 1.4949074983596802, "learning_rate": 7.640740335357116e-06, "loss": 0.3052, "step": 858 }, { "batch_size": 4, "epoch": 0.3432, "step": 858, "tokens_per_device": 1352 }, { "epoch": 0.3432, "loss_ce": 0.24625234305858612, "loss_lvr": 1.1298397779464722, "loss_mode_switch": 0.0, "loss_total": 0.35923632979393005, "step": 858 }, { "batch_size": 4, "epoch": 0.3432, "step": 858, "tokens_per_device": 4272 }, { "epoch": 0.3432, "loss_ce": 0.19497887790203094, "loss_lvr": 1.0221445560455322, "loss_mode_switch": 0.0, "loss_total": 0.29719334840774536, "step": 858 }, { "batch_size": 1, "epoch": 0.3432, "step": 858, "tokens_per_device": 4902 }, { "epoch": 0.3432, "loss_ce": 0.44907477498054504, "loss_lvr": 0.4996114671230316, "loss_mode_switch": 0.0, "loss_total": 0.49903592467308044, "step": 858 }, { "batch_size": 4, "epoch": 0.3432, "step": 858, "tokens_per_device": 6336 }, { "epoch": 0.3432, "loss_ce": 0.2891124486923218, "loss_lvr": 0.5821031928062439, "loss_mode_switch": 0.0, "loss_total": 0.3473227620124817, "step": 858 }, { "batch_size": 4, "epoch": 0.3432, "step": 858, "tokens_per_device": 4240 }, { "epoch": 0.3432, "loss_ce": 0.06554397940635681, "loss_lvr": 0.9372795224189758, "loss_mode_switch": 0.0, "loss_total": 0.15927192568778992, "step": 858 }, { "batch_size": 4, "epoch": 0.3432, "step": 858, "tokens_per_device": 4936 }, { "epoch": 0.3432, "loss_ce": 0.43261009454727173, "loss_lvr": 0.6653631329536438, "loss_mode_switch": 0.0, "loss_total": 0.49914640188217163, "step": 858 }, { "batch_size": 4, "epoch": 0.3432, "step": 858, "tokens_per_device": 4100 }, { "epoch": 0.3432, "loss_ce": 0.2897724509239197, "loss_lvr": 1.1063575744628906, "loss_mode_switch": 0.0, "loss_total": 0.40040820837020874, "step": 858 }, { "batch_size": 1, "epoch": 0.3432, "step": 858, "tokens_per_device": 5141 }, { "epoch": 0.3432, "loss_ce": 0.025203227996826172, "loss_lvr": 0.2987387776374817, "loss_mode_switch": 0.0, "loss_total": 0.05507710576057434, "step": 858 }, { "epoch": 0.3436, "grad_norm": 1.4366612434387207, "learning_rate": 7.635237727874082e-06, "loss": 0.3148, "step": 859 }, { "batch_size": 1, "epoch": 0.3436, "step": 859, "tokens_per_device": 5592 }, { "epoch": 0.3436, "loss_ce": 0.07978975027799606, "loss_lvr": 0.6396753191947937, "loss_mode_switch": 0.0, "loss_total": 0.14375728368759155, "step": 859 }, { "batch_size": 1, "epoch": 0.3436, "step": 859, "tokens_per_device": 5872 }, { "epoch": 0.3436, "loss_ce": 0.01012456975877285, "loss_lvr": 0.4403485953807831, "loss_mode_switch": 0.0, "loss_total": 0.05415943264961243, "step": 859 }, { "batch_size": 4, "epoch": 0.3436, "step": 859, "tokens_per_device": 5244 }, { "epoch": 0.3436, "loss_ce": 0.21373765170574188, "loss_lvr": 0.7533050775527954, "loss_mode_switch": 0.0, "loss_total": 0.28906816244125366, "step": 859 }, { "batch_size": 4, "epoch": 0.3436, "step": 859, "tokens_per_device": 11024 }, { "epoch": 0.3436, "loss_ce": 0.1128857284784317, "loss_lvr": 1.159399390220642, "loss_mode_switch": 0.0, "loss_total": 0.2288256585597992, "step": 859 }, { "batch_size": 4, "epoch": 0.3436, "step": 859, "tokens_per_device": 4212 }, { "epoch": 0.3436, "loss_ce": 0.244576558470726, "loss_lvr": 1.0595594644546509, "loss_mode_switch": 0.0, "loss_total": 0.35053250193595886, "step": 859 }, { "batch_size": 1, "epoch": 0.3436, "step": 859, "tokens_per_device": 4965 }, { "epoch": 0.3436, "loss_ce": 0.00836081150919199, "loss_lvr": 0.2972449064254761, "loss_mode_switch": 0.0, "loss_total": 0.03808530420064926, "step": 859 }, { "batch_size": 1, "epoch": 0.3436, "step": 859, "tokens_per_device": 5044 }, { "epoch": 0.3436, "loss_ce": 0.0006960685132071376, "loss_lvr": 0.5504108667373657, "loss_mode_switch": 0.0, "loss_total": 0.05573715269565582, "step": 859 }, { "batch_size": 1, "epoch": 0.3436, "step": 859, "tokens_per_device": 5153 }, { "epoch": 0.3436, "loss_ce": 0.16680721938610077, "loss_lvr": 0.4210987687110901, "loss_mode_switch": 0.0, "loss_total": 0.20891709625720978, "step": 859 }, { "epoch": 0.344, "grad_norm": 1.2814420461654663, "learning_rate": 7.629730697604314e-06, "loss": 0.3217, "step": 860 }, { "batch_size": 4, "epoch": 0.344, "step": 860, "tokens_per_device": 7696 }, { "epoch": 0.344, "loss_ce": 0.20540283620357513, "loss_lvr": 0.5645901560783386, "loss_mode_switch": 0.0, "loss_total": 0.2618618607521057, "step": 860 }, { "batch_size": 1, "epoch": 0.344, "step": 860, "tokens_per_device": 4887 }, { "epoch": 0.344, "loss_ce": 0.5427088141441345, "loss_lvr": 0.7004749178886414, "loss_mode_switch": 0.0, "loss_total": 0.6127563118934631, "step": 860 }, { "batch_size": 4, "epoch": 0.344, "step": 860, "tokens_per_device": 8692 }, { "epoch": 0.344, "loss_ce": 0.012799692340195179, "loss_lvr": 0.8777186870574951, "loss_mode_switch": 0.0, "loss_total": 0.10057156533002853, "step": 860 }, { "batch_size": 4, "epoch": 0.344, "step": 860, "tokens_per_device": 5524 }, { "epoch": 0.344, "loss_ce": 0.10640685260295868, "loss_lvr": 0.6583917737007141, "loss_mode_switch": 0.0, "loss_total": 0.1722460389137268, "step": 860 }, { "batch_size": 1, "epoch": 0.344, "step": 860, "tokens_per_device": 5652 }, { "epoch": 0.344, "loss_ce": 0.10177678614854813, "loss_lvr": 0.3503881096839905, "loss_mode_switch": 0.0, "loss_total": 0.13681559264659882, "step": 860 }, { "batch_size": 1, "epoch": 0.344, "step": 860, "tokens_per_device": 5102 }, { "epoch": 0.344, "loss_ce": 0.004338583443313837, "loss_lvr": 0.8299480080604553, "loss_mode_switch": 0.0, "loss_total": 0.08733338862657547, "step": 860 }, { "batch_size": 1, "epoch": 0.344, "step": 860, "tokens_per_device": 4981 }, { "epoch": 0.344, "loss_ce": 0.022948021069169044, "loss_lvr": 0.90924471616745, "loss_mode_switch": 0.0, "loss_total": 0.11387249082326889, "step": 860 }, { "batch_size": 1, "epoch": 0.344, "step": 860, "tokens_per_device": 4860 }, { "epoch": 0.344, "loss_ce": 0.0162411667406559, "loss_lvr": 0.46171334385871887, "loss_mode_switch": 0.0, "loss_total": 0.06241250038146973, "step": 860 }, { "epoch": 0.3444, "grad_norm": 1.5536792278289795, "learning_rate": 7.624219253790403e-06, "loss": 0.3614, "step": 861 }, { "batch_size": 4, "epoch": 0.3444, "step": 861, "tokens_per_device": 4212 }, { "epoch": 0.3444, "loss_ce": 0.1060350313782692, "loss_lvr": 0.7670855522155762, "loss_mode_switch": 0.0, "loss_total": 0.18274357914924622, "step": 861 }, { "batch_size": 1, "epoch": 0.3444, "step": 861, "tokens_per_device": 5158 }, { "epoch": 0.3444, "loss_ce": 0.07842111587524414, "loss_lvr": 0.37587589025497437, "loss_mode_switch": 0.0, "loss_total": 0.1160087063908577, "step": 861 }, { "batch_size": 1, "epoch": 0.3444, "step": 861, "tokens_per_device": 4976 }, { "epoch": 0.3444, "loss_ce": 0.01944984681904316, "loss_lvr": 0.7420719265937805, "loss_mode_switch": 0.0, "loss_total": 0.09365703910589218, "step": 861 }, { "batch_size": 4, "epoch": 0.3444, "step": 861, "tokens_per_device": 1492 }, { "epoch": 0.3444, "loss_ce": 0.18519346415996552, "loss_lvr": 1.573903203010559, "loss_mode_switch": 0.0, "loss_total": 0.3425837755203247, "step": 861 }, { "batch_size": 4, "epoch": 0.3444, "step": 861, "tokens_per_device": 5416 }, { "epoch": 0.3444, "loss_ce": 0.23968905210494995, "loss_lvr": 0.6552972793579102, "loss_mode_switch": 0.0, "loss_total": 0.30521878600120544, "step": 861 }, { "batch_size": 4, "epoch": 0.3444, "step": 861, "tokens_per_device": 4944 }, { "epoch": 0.3444, "loss_ce": 0.4579715132713318, "loss_lvr": 0.7320014238357544, "loss_mode_switch": 0.0, "loss_total": 0.5311716794967651, "step": 861 }, { "batch_size": 4, "epoch": 0.3444, "step": 861, "tokens_per_device": 3804 }, { "epoch": 0.3444, "loss_ce": 0.2440270632505417, "loss_lvr": 1.399073839187622, "loss_mode_switch": 0.0, "loss_total": 0.3839344382286072, "step": 861 }, { "batch_size": 4, "epoch": 0.3444, "step": 861, "tokens_per_device": 5112 }, { "epoch": 0.3444, "loss_ce": 0.3826746642589569, "loss_lvr": 0.8087980151176453, "loss_mode_switch": 0.0, "loss_total": 0.4635544717311859, "step": 861 }, { "epoch": 0.3448, "grad_norm": 1.2184460163116455, "learning_rate": 7.618703405682341e-06, "loss": 0.2392, "step": 862 }, { "batch_size": 4, "epoch": 0.3448, "step": 862, "tokens_per_device": 2508 }, { "epoch": 0.3448, "loss_ce": 0.40719541907310486, "loss_lvr": 1.0300397872924805, "loss_mode_switch": 0.0, "loss_total": 0.5101994276046753, "step": 862 }, { "batch_size": 4, "epoch": 0.3448, "step": 862, "tokens_per_device": 5852 }, { "epoch": 0.3448, "loss_ce": 0.42550528049468994, "loss_lvr": 0.7000740766525269, "loss_mode_switch": 0.0, "loss_total": 0.4955126941204071, "step": 862 }, { "batch_size": 4, "epoch": 0.3448, "step": 862, "tokens_per_device": 4596 }, { "epoch": 0.3448, "loss_ce": 0.2067447453737259, "loss_lvr": 1.013410210609436, "loss_mode_switch": 0.0, "loss_total": 0.30808576941490173, "step": 862 }, { "batch_size": 4, "epoch": 0.3448, "step": 862, "tokens_per_device": 4224 }, { "epoch": 0.3448, "loss_ce": 0.2848893702030182, "loss_lvr": 1.0289645195007324, "loss_mode_switch": 0.0, "loss_total": 0.38778582215309143, "step": 862 }, { "batch_size": 4, "epoch": 0.3448, "step": 862, "tokens_per_device": 5772 }, { "epoch": 0.3448, "loss_ce": 0.2728402614593506, "loss_lvr": 0.39969512820243835, "loss_mode_switch": 0.0, "loss_total": 0.3128097653388977, "step": 862 }, { "batch_size": 1, "epoch": 0.3448, "step": 862, "tokens_per_device": 5093 }, { "epoch": 0.3448, "loss_ce": 0.07726329565048218, "loss_lvr": 0.7299061417579651, "loss_mode_switch": 0.0, "loss_total": 0.15025392174720764, "step": 862 }, { "batch_size": 4, "epoch": 0.3448, "step": 862, "tokens_per_device": 4768 }, { "epoch": 0.3448, "loss_ce": 0.011423767544329166, "loss_lvr": 0.554608166217804, "loss_mode_switch": 0.0, "loss_total": 0.0668845847249031, "step": 862 }, { "batch_size": 4, "epoch": 0.3448, "step": 862, "tokens_per_device": 4180 }, { "epoch": 0.3448, "loss_ce": 0.2039521187543869, "loss_lvr": 0.9575029611587524, "loss_mode_switch": 0.0, "loss_total": 0.29970240592956543, "step": 862 }, { "epoch": 0.3452, "grad_norm": 1.2790473699569702, "learning_rate": 7.613183162537521e-06, "loss": 0.3147, "step": 863 }, { "batch_size": 4, "epoch": 0.3452, "step": 863, "tokens_per_device": 4204 }, { "epoch": 0.3452, "loss_ce": 0.40334564447402954, "loss_lvr": 0.965030312538147, "loss_mode_switch": 0.0, "loss_total": 0.4998486638069153, "step": 863 }, { "batch_size": 4, "epoch": 0.3452, "step": 863, "tokens_per_device": 5004 }, { "epoch": 0.3452, "loss_ce": 0.22028081119060516, "loss_lvr": 0.8656124472618103, "loss_mode_switch": 0.0, "loss_total": 0.30684205889701843, "step": 863 }, { "batch_size": 4, "epoch": 0.3452, "step": 863, "tokens_per_device": 5884 }, { "epoch": 0.3452, "loss_ce": 0.4279167652130127, "loss_lvr": 0.7106422185897827, "loss_mode_switch": 0.0, "loss_total": 0.4989809989929199, "step": 863 }, { "batch_size": 1, "epoch": 0.3452, "step": 863, "tokens_per_device": 6777 }, { "epoch": 0.3452, "loss_ce": 0.008287772536277771, "loss_lvr": 0.2824605405330658, "loss_mode_switch": 0.0, "loss_total": 0.03653382509946823, "step": 863 }, { "batch_size": 4, "epoch": 0.3452, "step": 863, "tokens_per_device": 4268 }, { "epoch": 0.3452, "loss_ce": 0.3387267291545868, "loss_lvr": 0.8600722551345825, "loss_mode_switch": 0.0, "loss_total": 0.424733966588974, "step": 863 }, { "batch_size": 4, "epoch": 0.3452, "step": 863, "tokens_per_device": 4636 }, { "epoch": 0.3452, "loss_ce": 0.2824530005455017, "loss_lvr": 0.9263308048248291, "loss_mode_switch": 0.0, "loss_total": 0.37508606910705566, "step": 863 }, { "batch_size": 4, "epoch": 0.3452, "step": 863, "tokens_per_device": 12424 }, { "epoch": 0.3452, "loss_ce": 0.1864139288663864, "loss_lvr": 0.8199172616004944, "loss_mode_switch": 0.0, "loss_total": 0.26840564608573914, "step": 863 }, { "batch_size": 1, "epoch": 0.3452, "step": 863, "tokens_per_device": 5180 }, { "epoch": 0.3452, "loss_ce": 0.002776859328150749, "loss_lvr": 0.4713115394115448, "loss_mode_switch": 0.0, "loss_total": 0.04990801215171814, "step": 863 }, { "epoch": 0.3456, "grad_norm": 1.3276382684707642, "learning_rate": 7.607658533620708e-06, "loss": 0.3444, "step": 864 }, { "batch_size": 4, "epoch": 0.3456, "step": 864, "tokens_per_device": 4980 }, { "epoch": 0.3456, "loss_ce": 0.572761595249176, "loss_lvr": 0.9066164493560791, "loss_mode_switch": 0.0, "loss_total": 0.6634232401847839, "step": 864 }, { "batch_size": 4, "epoch": 0.3456, "step": 864, "tokens_per_device": 4824 }, { "epoch": 0.3456, "loss_ce": 0.4764428734779358, "loss_lvr": 0.9299914240837097, "loss_mode_switch": 0.0, "loss_total": 0.5694420337677002, "step": 864 }, { "batch_size": 4, "epoch": 0.3456, "step": 864, "tokens_per_device": 4532 }, { "epoch": 0.3456, "loss_ce": 0.0024629132822155952, "loss_lvr": 0.7125311493873596, "loss_mode_switch": 0.0, "loss_total": 0.07371602952480316, "step": 864 }, { "batch_size": 4, "epoch": 0.3456, "step": 864, "tokens_per_device": 4248 }, { "epoch": 0.3456, "loss_ce": 0.1381957083940506, "loss_lvr": 0.7290050983428955, "loss_mode_switch": 0.0, "loss_total": 0.21109622716903687, "step": 864 }, { "batch_size": 1, "epoch": 0.3456, "step": 864, "tokens_per_device": 5124 }, { "epoch": 0.3456, "loss_ce": 0.0800946056842804, "loss_lvr": 0.7261735200881958, "loss_mode_switch": 0.0, "loss_total": 0.15271195769309998, "step": 864 }, { "batch_size": 1, "epoch": 0.3456, "step": 864, "tokens_per_device": 4914 }, { "epoch": 0.3456, "loss_ce": 0.06537514179944992, "loss_lvr": 0.5026817321777344, "loss_mode_switch": 0.0, "loss_total": 0.11564331501722336, "step": 864 }, { "batch_size": 4, "epoch": 0.3456, "step": 864, "tokens_per_device": 6696 }, { "epoch": 0.3456, "loss_ce": 0.04044937342405319, "loss_lvr": 0.7966414093971252, "loss_mode_switch": 0.0, "loss_total": 0.12011351436376572, "step": 864 }, { "batch_size": 4, "epoch": 0.3456, "step": 864, "tokens_per_device": 14244 }, { "epoch": 0.3456, "loss_ce": 0.28358420729637146, "loss_lvr": 0.6139565706253052, "loss_mode_switch": 0.0, "loss_total": 0.344979852437973, "step": 864 }, { "epoch": 0.346, "grad_norm": 1.304379940032959, "learning_rate": 7.602129528204023e-06, "loss": 0.2791, "step": 865 }, { "batch_size": 4, "epoch": 0.346, "step": 865, "tokens_per_device": 4236 }, { "epoch": 0.346, "loss_ce": 0.48986709117889404, "loss_lvr": 0.8650985956192017, "loss_mode_switch": 0.0, "loss_total": 0.5763769745826721, "step": 865 }, { "batch_size": 4, "epoch": 0.346, "step": 865, "tokens_per_device": 4200 }, { "epoch": 0.346, "loss_ce": 0.21122069656848907, "loss_lvr": 0.7046958208084106, "loss_mode_switch": 0.0, "loss_total": 0.2816902697086334, "step": 865 }, { "batch_size": 4, "epoch": 0.346, "step": 865, "tokens_per_device": 4192 }, { "epoch": 0.346, "loss_ce": 0.5169895887374878, "loss_lvr": 1.2003978490829468, "loss_mode_switch": 0.0, "loss_total": 0.6370293498039246, "step": 865 }, { "batch_size": 4, "epoch": 0.346, "step": 865, "tokens_per_device": 4228 }, { "epoch": 0.346, "loss_ce": 0.4513149857521057, "loss_lvr": 0.874363124370575, "loss_mode_switch": 0.0, "loss_total": 0.5387513041496277, "step": 865 }, { "batch_size": 4, "epoch": 0.346, "step": 865, "tokens_per_device": 1576 }, { "epoch": 0.346, "loss_ce": 0.4609525501728058, "loss_lvr": 0.9055119752883911, "loss_mode_switch": 0.0, "loss_total": 0.5515037775039673, "step": 865 }, { "batch_size": 1, "epoch": 0.346, "step": 865, "tokens_per_device": 5111 }, { "epoch": 0.346, "loss_ce": 0.011813248507678509, "loss_lvr": 0.4648864269256592, "loss_mode_switch": 0.0, "loss_total": 0.058301892131567, "step": 865 }, { "batch_size": 1, "epoch": 0.346, "step": 865, "tokens_per_device": 4876 }, { "epoch": 0.346, "loss_ce": 0.0006896289414726198, "loss_lvr": 0.39511626958847046, "loss_mode_switch": 0.0, "loss_total": 0.04020125791430473, "step": 865 }, { "batch_size": 1, "epoch": 0.346, "step": 865, "tokens_per_device": 4880 }, { "epoch": 0.346, "loss_ce": 0.0023300531320273876, "loss_lvr": 0.3046348989009857, "loss_mode_switch": 0.0, "loss_total": 0.032793544232845306, "step": 865 }, { "epoch": 0.3464, "grad_norm": 1.1373302936553955, "learning_rate": 7.596596155566942e-06, "loss": 0.2964, "step": 866 }, { "batch_size": 4, "epoch": 0.3464, "step": 866, "tokens_per_device": 3176 }, { "epoch": 0.3464, "loss_ce": 0.20654216408729553, "loss_lvr": 0.9518853425979614, "loss_mode_switch": 0.0, "loss_total": 0.3017306923866272, "step": 866 }, { "batch_size": 1, "epoch": 0.3464, "step": 866, "tokens_per_device": 4874 }, { "epoch": 0.3464, "loss_ce": 0.0005784878158010542, "loss_lvr": 0.3086116909980774, "loss_mode_switch": 0.0, "loss_total": 0.031439658254384995, "step": 866 }, { "batch_size": 1, "epoch": 0.3464, "step": 866, "tokens_per_device": 5003 }, { "epoch": 0.3464, "loss_ce": 0.2063395082950592, "loss_lvr": 0.28336668014526367, "loss_mode_switch": 0.0, "loss_total": 0.23467618227005005, "step": 866 }, { "batch_size": 4, "epoch": 0.3464, "step": 866, "tokens_per_device": 1500 }, { "epoch": 0.3464, "loss_ce": 0.13338683545589447, "loss_lvr": 1.2326111793518066, "loss_mode_switch": 0.0, "loss_total": 0.2566479444503784, "step": 866 }, { "batch_size": 4, "epoch": 0.3464, "step": 866, "tokens_per_device": 4196 }, { "epoch": 0.3464, "loss_ce": 0.1982262134552002, "loss_lvr": 0.9650087952613831, "loss_mode_switch": 0.0, "loss_total": 0.294727087020874, "step": 866 }, { "batch_size": 4, "epoch": 0.3464, "step": 866, "tokens_per_device": 4972 }, { "epoch": 0.3464, "loss_ce": 0.3150109052658081, "loss_lvr": 0.9472825527191162, "loss_mode_switch": 0.0, "loss_total": 0.4097391664981842, "step": 866 }, { "batch_size": 4, "epoch": 0.3464, "step": 866, "tokens_per_device": 6024 }, { "epoch": 0.3464, "loss_ce": 0.4941447377204895, "loss_lvr": 1.0187143087387085, "loss_mode_switch": 0.0, "loss_total": 0.5960161685943604, "step": 866 }, { "batch_size": 4, "epoch": 0.3464, "step": 866, "tokens_per_device": 5884 }, { "epoch": 0.3464, "loss_ce": 0.09333263337612152, "loss_lvr": 1.659523367881775, "loss_mode_switch": 0.0, "loss_total": 0.25928497314453125, "step": 866 }, { "epoch": 0.3468, "grad_norm": 1.3251423835754395, "learning_rate": 7.591058424996264e-06, "loss": 0.2668, "step": 867 }, { "batch_size": 4, "epoch": 0.3468, "step": 867, "tokens_per_device": 4652 }, { "epoch": 0.3468, "loss_ce": 0.10433065891265869, "loss_lvr": 0.6088026165962219, "loss_mode_switch": 0.0, "loss_total": 0.16521091759204865, "step": 867 }, { "batch_size": 4, "epoch": 0.3468, "step": 867, "tokens_per_device": 4448 }, { "epoch": 0.3468, "loss_ce": 0.6664732098579407, "loss_lvr": 1.10264253616333, "loss_mode_switch": 0.0, "loss_total": 0.7767374515533447, "step": 867 }, { "batch_size": 4, "epoch": 0.3468, "step": 867, "tokens_per_device": 1224 }, { "epoch": 0.3468, "loss_ce": 0.3968915343284607, "loss_lvr": 1.237650752067566, "loss_mode_switch": 0.0, "loss_total": 0.5206565856933594, "step": 867 }, { "batch_size": 4, "epoch": 0.3468, "step": 867, "tokens_per_device": 4380 }, { "epoch": 0.3468, "loss_ce": 0.40438035130500793, "loss_lvr": 0.9882857799530029, "loss_mode_switch": 0.0, "loss_total": 0.5032089352607727, "step": 867 }, { "batch_size": 1, "epoch": 0.3468, "step": 867, "tokens_per_device": 4895 }, { "epoch": 0.3468, "loss_ce": 0.17010775208473206, "loss_lvr": 0.43454745411872864, "loss_mode_switch": 0.0, "loss_total": 0.2135625034570694, "step": 867 }, { "batch_size": 1, "epoch": 0.3468, "step": 867, "tokens_per_device": 4957 }, { "epoch": 0.3468, "loss_ce": 1.0887750387191772, "loss_lvr": 0.7652768492698669, "loss_mode_switch": 0.0, "loss_total": 1.1653027534484863, "step": 867 }, { "batch_size": 4, "epoch": 0.3468, "step": 867, "tokens_per_device": 4608 }, { "epoch": 0.3468, "loss_ce": 0.3452608287334442, "loss_lvr": 0.8610356450080872, "loss_mode_switch": 0.0, "loss_total": 0.43136438727378845, "step": 867 }, { "batch_size": 1, "epoch": 0.3468, "step": 867, "tokens_per_device": 5171 }, { "epoch": 0.3468, "loss_ce": 0.000729898049030453, "loss_lvr": 0.27650707960128784, "loss_mode_switch": 0.0, "loss_total": 0.028380606323480606, "step": 867 }, { "epoch": 0.3472, "grad_norm": 1.3697019815444946, "learning_rate": 7.585516345786103e-06, "loss": 0.2969, "step": 868 }, { "batch_size": 4, "epoch": 0.3472, "step": 868, "tokens_per_device": 6224 }, { "epoch": 0.3472, "loss_ce": 0.1949143260717392, "loss_lvr": 0.843084990978241, "loss_mode_switch": 0.0, "loss_total": 0.2792228162288666, "step": 868 }, { "batch_size": 4, "epoch": 0.3472, "step": 868, "tokens_per_device": 4288 }, { "epoch": 0.3472, "loss_ce": 0.11767784506082535, "loss_lvr": 0.9115384221076965, "loss_mode_switch": 0.0, "loss_total": 0.20883169770240784, "step": 868 }, { "batch_size": 4, "epoch": 0.3472, "step": 868, "tokens_per_device": 7696 }, { "epoch": 0.3472, "loss_ce": 0.48154592514038086, "loss_lvr": 0.8806948661804199, "loss_mode_switch": 0.0, "loss_total": 0.5696154236793518, "step": 868 }, { "batch_size": 1, "epoch": 0.3472, "step": 868, "tokens_per_device": 5396 }, { "epoch": 0.3472, "loss_ce": 0.0011246658395975828, "loss_lvr": 1.2208269834518433, "loss_mode_switch": 0.0, "loss_total": 0.12320736795663834, "step": 868 }, { "batch_size": 1, "epoch": 0.3472, "step": 868, "tokens_per_device": 4866 }, { "epoch": 0.3472, "loss_ce": 0.00222405674867332, "loss_lvr": 0.44089144468307495, "loss_mode_switch": 0.0, "loss_total": 0.04631320387125015, "step": 868 }, { "batch_size": 4, "epoch": 0.3472, "step": 868, "tokens_per_device": 4668 }, { "epoch": 0.3472, "loss_ce": 0.3154790699481964, "loss_lvr": 1.0439022779464722, "loss_mode_switch": 0.0, "loss_total": 0.4198693037033081, "step": 868 }, { "batch_size": 4, "epoch": 0.3472, "step": 868, "tokens_per_device": 4516 }, { "epoch": 0.3472, "loss_ce": 0.3493260443210602, "loss_lvr": 1.018597960472107, "loss_mode_switch": 0.0, "loss_total": 0.45118585228919983, "step": 868 }, { "batch_size": 4, "epoch": 0.3472, "step": 868, "tokens_per_device": 5336 }, { "epoch": 0.3472, "loss_ce": 0.12405188381671906, "loss_lvr": 0.8344408869743347, "loss_mode_switch": 0.0, "loss_total": 0.20749597251415253, "step": 868 }, { "epoch": 0.3476, "grad_norm": 1.3219597339630127, "learning_rate": 7.5799699272378715e-06, "loss": 0.3272, "step": 869 }, { "batch_size": 4, "epoch": 0.3476, "step": 869, "tokens_per_device": 4256 }, { "epoch": 0.3476, "loss_ce": 0.2820882797241211, "loss_lvr": 1.2389901876449585, "loss_mode_switch": 0.0, "loss_total": 0.40598729252815247, "step": 869 }, { "batch_size": 4, "epoch": 0.3476, "step": 869, "tokens_per_device": 2688 }, { "epoch": 0.3476, "loss_ce": 0.24030418694019318, "loss_lvr": 0.9440708756446838, "loss_mode_switch": 0.0, "loss_total": 0.3347112834453583, "step": 869 }, { "batch_size": 4, "epoch": 0.3476, "step": 869, "tokens_per_device": 4228 }, { "epoch": 0.3476, "loss_ce": 0.005202151834964752, "loss_lvr": 0.880976140499115, "loss_mode_switch": 0.0, "loss_total": 0.09329976886510849, "step": 869 }, { "batch_size": 4, "epoch": 0.3476, "step": 869, "tokens_per_device": 4656 }, { "epoch": 0.3476, "loss_ce": 0.4818509817123413, "loss_lvr": 0.9485899209976196, "loss_mode_switch": 0.0, "loss_total": 0.5767099857330322, "step": 869 }, { "batch_size": 4, "epoch": 0.3476, "step": 869, "tokens_per_device": 4596 }, { "epoch": 0.3476, "loss_ce": 0.2556152045726776, "loss_lvr": 0.6993950605392456, "loss_mode_switch": 0.0, "loss_total": 0.3255547285079956, "step": 869 }, { "batch_size": 4, "epoch": 0.3476, "step": 869, "tokens_per_device": 4600 }, { "epoch": 0.3476, "loss_ce": 0.10103657096624374, "loss_lvr": 0.9424084424972534, "loss_mode_switch": 0.0, "loss_total": 0.19527742266654968, "step": 869 }, { "batch_size": 1, "epoch": 0.3476, "step": 869, "tokens_per_device": 4979 }, { "epoch": 0.3476, "loss_ce": 0.5607936382293701, "loss_lvr": 0.7209420800209045, "loss_mode_switch": 0.0, "loss_total": 0.6328878402709961, "step": 869 }, { "batch_size": 1, "epoch": 0.3476, "step": 869, "tokens_per_device": 8897 }, { "epoch": 0.3476, "loss_ce": 0.004764936864376068, "loss_lvr": 0.5048759579658508, "loss_mode_switch": 0.0, "loss_total": 0.05525253340601921, "step": 869 }, { "epoch": 0.348, "grad_norm": 1.462066888809204, "learning_rate": 7.574419178660269e-06, "loss": 0.3385, "step": 870 }, { "batch_size": 1, "epoch": 0.348, "step": 870, "tokens_per_device": 4891 }, { "epoch": 0.348, "loss_ce": 1.5795440673828125, "loss_lvr": 1.0579982995986938, "loss_mode_switch": 0.0, "loss_total": 1.685343861579895, "step": 870 }, { "batch_size": 1, "epoch": 0.348, "step": 870, "tokens_per_device": 5032 }, { "epoch": 0.348, "loss_ce": 0.017999209463596344, "loss_lvr": 1.1136916875839233, "loss_mode_switch": 0.0, "loss_total": 0.1293683797121048, "step": 870 }, { "batch_size": 4, "epoch": 0.348, "step": 870, "tokens_per_device": 4728 }, { "epoch": 0.348, "loss_ce": 0.07201813161373138, "loss_lvr": 0.8503658771514893, "loss_mode_switch": 0.0, "loss_total": 0.15705472230911255, "step": 870 }, { "batch_size": 4, "epoch": 0.348, "step": 870, "tokens_per_device": 1420 }, { "epoch": 0.348, "loss_ce": 0.6268234848976135, "loss_lvr": 0.8339813947677612, "loss_mode_switch": 0.0, "loss_total": 0.7102216482162476, "step": 870 }, { "batch_size": 4, "epoch": 0.348, "step": 870, "tokens_per_device": 5700 }, { "epoch": 0.348, "loss_ce": 0.15202414989471436, "loss_lvr": 1.3620871305465698, "loss_mode_switch": 0.0, "loss_total": 0.28823286294937134, "step": 870 }, { "batch_size": 4, "epoch": 0.348, "step": 870, "tokens_per_device": 4256 }, { "epoch": 0.348, "loss_ce": 0.09828191995620728, "loss_lvr": 1.1457535028457642, "loss_mode_switch": 0.0, "loss_total": 0.21285727620124817, "step": 870 }, { "batch_size": 4, "epoch": 0.348, "step": 870, "tokens_per_device": 4780 }, { "epoch": 0.348, "loss_ce": 0.2565016448497772, "loss_lvr": 0.7536577582359314, "loss_mode_switch": 0.0, "loss_total": 0.33186742663383484, "step": 870 }, { "batch_size": 4, "epoch": 0.348, "step": 870, "tokens_per_device": 3780 }, { "epoch": 0.348, "loss_ce": 0.1181199848651886, "loss_lvr": 0.7655306458473206, "loss_mode_switch": 0.0, "loss_total": 0.1946730613708496, "step": 870 }, { "epoch": 0.3484, "grad_norm": 1.3170239925384521, "learning_rate": 7.568864109369252e-06, "loss": 0.3067, "step": 871 }, { "batch_size": 1, "epoch": 0.3484, "step": 871, "tokens_per_device": 5139 }, { "epoch": 0.3484, "loss_ce": 0.002065445529296994, "loss_lvr": 0.5062205791473389, "loss_mode_switch": 0.0, "loss_total": 0.05268750339746475, "step": 871 }, { "batch_size": 1, "epoch": 0.3484, "step": 871, "tokens_per_device": 5069 }, { "epoch": 0.3484, "loss_ce": 0.01784995198249817, "loss_lvr": 0.5183581113815308, "loss_mode_switch": 0.0, "loss_total": 0.06968576461076736, "step": 871 }, { "batch_size": 4, "epoch": 0.3484, "step": 871, "tokens_per_device": 9004 }, { "epoch": 0.3484, "loss_ce": 0.0371515266597271, "loss_lvr": 0.7647913098335266, "loss_mode_switch": 0.0, "loss_total": 0.11363065242767334, "step": 871 }, { "batch_size": 1, "epoch": 0.3484, "step": 871, "tokens_per_device": 5155 }, { "epoch": 0.3484, "loss_ce": 0.02130897529423237, "loss_lvr": 0.44786450266838074, "loss_mode_switch": 0.0, "loss_total": 0.06609542667865753, "step": 871 }, { "batch_size": 4, "epoch": 0.3484, "step": 871, "tokens_per_device": 5780 }, { "epoch": 0.3484, "loss_ce": 0.4859117567539215, "loss_lvr": 0.8405163288116455, "loss_mode_switch": 0.0, "loss_total": 0.5699633955955505, "step": 871 }, { "batch_size": 1, "epoch": 0.3484, "step": 871, "tokens_per_device": 5764 }, { "epoch": 0.3484, "loss_ce": 0.9925321340560913, "loss_lvr": 0.5542274713516235, "loss_mode_switch": 0.0, "loss_total": 1.0479549169540405, "step": 871 }, { "batch_size": 4, "epoch": 0.3484, "step": 871, "tokens_per_device": 2656 }, { "epoch": 0.3484, "loss_ce": 0.24372966587543488, "loss_lvr": 0.9928447008132935, "loss_mode_switch": 0.0, "loss_total": 0.343014121055603, "step": 871 }, { "batch_size": 4, "epoch": 0.3484, "step": 871, "tokens_per_device": 4720 }, { "epoch": 0.3484, "loss_ce": 0.48132839798927307, "loss_lvr": 0.884931206703186, "loss_mode_switch": 0.0, "loss_total": 0.5698215365409851, "step": 871 }, { "epoch": 0.3488, "grad_norm": 1.400207757949829, "learning_rate": 7.56330472868804e-06, "loss": 0.2599, "step": 872 }, { "batch_size": 4, "epoch": 0.3488, "step": 872, "tokens_per_device": 4892 }, { "epoch": 0.3488, "loss_ce": 0.33228597044944763, "loss_lvr": 0.8225499987602234, "loss_mode_switch": 0.0, "loss_total": 0.41454097628593445, "step": 872 }, { "batch_size": 4, "epoch": 0.3488, "step": 872, "tokens_per_device": 4140 }, { "epoch": 0.3488, "loss_ce": 0.09324178099632263, "loss_lvr": 1.0280758142471313, "loss_mode_switch": 0.0, "loss_total": 0.19604936242103577, "step": 872 }, { "batch_size": 1, "epoch": 0.3488, "step": 872, "tokens_per_device": 5168 }, { "epoch": 0.3488, "loss_ce": 0.0006711527821607888, "loss_lvr": 0.47222548723220825, "loss_mode_switch": 0.0, "loss_total": 0.0478937029838562, "step": 872 }, { "batch_size": 4, "epoch": 0.3488, "step": 872, "tokens_per_device": 1416 }, { "epoch": 0.3488, "loss_ce": 0.3893069326877594, "loss_lvr": 1.1297070980072021, "loss_mode_switch": 0.0, "loss_total": 0.5022776126861572, "step": 872 }, { "batch_size": 1, "epoch": 0.3488, "step": 872, "tokens_per_device": 5984 }, { "epoch": 0.3488, "loss_ce": 0.00038538806256838143, "loss_lvr": 0.28524237871170044, "loss_mode_switch": 0.0, "loss_total": 0.028909627348184586, "step": 872 }, { "batch_size": 4, "epoch": 0.3488, "step": 872, "tokens_per_device": 3648 }, { "epoch": 0.3488, "loss_ce": 0.06647071242332458, "loss_lvr": 0.6538963913917542, "loss_mode_switch": 0.0, "loss_total": 0.13186034560203552, "step": 872 }, { "batch_size": 1, "epoch": 0.3488, "step": 872, "tokens_per_device": 4855 }, { "epoch": 0.3488, "loss_ce": 0.046762023121118546, "loss_lvr": 0.29363465309143066, "loss_mode_switch": 0.0, "loss_total": 0.07612548768520355, "step": 872 }, { "batch_size": 4, "epoch": 0.3488, "step": 872, "tokens_per_device": 2692 }, { "epoch": 0.3488, "loss_ce": 0.49324774742126465, "loss_lvr": 1.002984642982483, "loss_mode_switch": 0.0, "loss_total": 0.5935462117195129, "step": 872 }, { "epoch": 0.3492, "grad_norm": 1.2265372276306152, "learning_rate": 7.557741045947082e-06, "loss": 0.2814, "step": 873 }, { "batch_size": 4, "epoch": 0.3492, "step": 873, "tokens_per_device": 4232 }, { "epoch": 0.3492, "loss_ce": 0.06467389315366745, "loss_lvr": 0.9338477253913879, "loss_mode_switch": 0.0, "loss_total": 0.15805867314338684, "step": 873 }, { "batch_size": 1, "epoch": 0.3492, "step": 873, "tokens_per_device": 5186 }, { "epoch": 0.3492, "loss_ce": 0.070415198802948, "loss_lvr": 0.44593167304992676, "loss_mode_switch": 0.0, "loss_total": 0.11500836908817291, "step": 873 }, { "batch_size": 4, "epoch": 0.3492, "step": 873, "tokens_per_device": 4076 }, { "epoch": 0.3492, "loss_ce": 0.08172021806240082, "loss_lvr": 0.9683579206466675, "loss_mode_switch": 0.0, "loss_total": 0.17855601012706757, "step": 873 }, { "batch_size": 4, "epoch": 0.3492, "step": 873, "tokens_per_device": 6264 }, { "epoch": 0.3492, "loss_ce": 0.08041683584451675, "loss_lvr": 0.8333698511123657, "loss_mode_switch": 0.0, "loss_total": 0.16375382244586945, "step": 873 }, { "batch_size": 1, "epoch": 0.3492, "step": 873, "tokens_per_device": 5170 }, { "epoch": 0.3492, "loss_ce": 0.0013383914483711123, "loss_lvr": 0.38667312264442444, "loss_mode_switch": 0.0, "loss_total": 0.04000570625066757, "step": 873 }, { "batch_size": 4, "epoch": 0.3492, "step": 873, "tokens_per_device": 5284 }, { "epoch": 0.3492, "loss_ce": 0.09241805225610733, "loss_lvr": 0.9128691554069519, "loss_mode_switch": 0.0, "loss_total": 0.18370497226715088, "step": 873 }, { "batch_size": 1, "epoch": 0.3492, "step": 873, "tokens_per_device": 5097 }, { "epoch": 0.3492, "loss_ce": 0.10426618903875351, "loss_lvr": 0.7152484059333801, "loss_mode_switch": 0.0, "loss_total": 0.17579102516174316, "step": 873 }, { "batch_size": 4, "epoch": 0.3492, "step": 873, "tokens_per_device": 2680 }, { "epoch": 0.3492, "loss_ce": 0.3622545897960663, "loss_lvr": 0.747870922088623, "loss_mode_switch": 0.0, "loss_total": 0.437041699886322, "step": 873 }, { "epoch": 0.3496, "grad_norm": 1.3409507274627686, "learning_rate": 7.552173070484048e-06, "loss": 0.3279, "step": 874 }, { "batch_size": 4, "epoch": 0.3496, "step": 874, "tokens_per_device": 3756 }, { "epoch": 0.3496, "loss_ce": 0.058331117033958435, "loss_lvr": 0.8427011370658875, "loss_mode_switch": 0.0, "loss_total": 0.14260122179985046, "step": 874 }, { "batch_size": 4, "epoch": 0.3496, "step": 874, "tokens_per_device": 4388 }, { "epoch": 0.3496, "loss_ce": 0.03883056342601776, "loss_lvr": 0.9263938069343567, "loss_mode_switch": 0.0, "loss_total": 0.1314699351787567, "step": 874 }, { "batch_size": 4, "epoch": 0.3496, "step": 874, "tokens_per_device": 2648 }, { "epoch": 0.3496, "loss_ce": 0.09722212702035904, "loss_lvr": 1.0061686038970947, "loss_mode_switch": 0.0, "loss_total": 0.19783899188041687, "step": 874 }, { "batch_size": 4, "epoch": 0.3496, "step": 874, "tokens_per_device": 7484 }, { "epoch": 0.3496, "loss_ce": 0.14061948657035828, "loss_lvr": 0.3373173773288727, "loss_mode_switch": 0.0, "loss_total": 0.17435123026371002, "step": 874 }, { "batch_size": 4, "epoch": 0.3496, "step": 874, "tokens_per_device": 4248 }, { "epoch": 0.3496, "loss_ce": 0.1559312790632248, "loss_lvr": 0.980268657207489, "loss_mode_switch": 0.0, "loss_total": 0.253958135843277, "step": 874 }, { "batch_size": 4, "epoch": 0.3496, "step": 874, "tokens_per_device": 2748 }, { "epoch": 0.3496, "loss_ce": 0.08921864628791809, "loss_lvr": 0.7373543381690979, "loss_mode_switch": 0.0, "loss_total": 0.16295409202575684, "step": 874 }, { "batch_size": 1, "epoch": 0.3496, "step": 874, "tokens_per_device": 5109 }, { "epoch": 0.3496, "loss_ce": 0.0015991672407835722, "loss_lvr": 0.3600490391254425, "loss_mode_switch": 0.0, "loss_total": 0.03760407119989395, "step": 874 }, { "batch_size": 4, "epoch": 0.3496, "step": 874, "tokens_per_device": 4640 }, { "epoch": 0.3496, "loss_ce": 0.34061959385871887, "loss_lvr": 1.101790189743042, "loss_mode_switch": 0.0, "loss_total": 0.4507986307144165, "step": 874 }, { "epoch": 0.35, "grad_norm": 1.3237206935882568, "learning_rate": 7.546600811643816e-06, "loss": 0.298, "step": 875 }, { "batch_size": 1, "epoch": 0.35, "step": 875, "tokens_per_device": 5113 }, { "epoch": 0.35, "loss_ce": 0.026002945378422737, "loss_lvr": 0.8124120831489563, "loss_mode_switch": 0.0, "loss_total": 0.10724415630102158, "step": 875 }, { "batch_size": 4, "epoch": 0.35, "step": 875, "tokens_per_device": 11072 }, { "epoch": 0.35, "loss_ce": 0.04342463240027428, "loss_lvr": 0.9198628067970276, "loss_mode_switch": 0.0, "loss_total": 0.13541091978549957, "step": 875 }, { "batch_size": 1, "epoch": 0.35, "step": 875, "tokens_per_device": 5148 }, { "epoch": 0.35, "loss_ce": 0.04787837713956833, "loss_lvr": 0.7010650038719177, "loss_mode_switch": 0.0, "loss_total": 0.11798487603664398, "step": 875 }, { "batch_size": 1, "epoch": 0.35, "step": 875, "tokens_per_device": 5160 }, { "epoch": 0.35, "loss_ce": 0.18068794906139374, "loss_lvr": 0.9019408822059631, "loss_mode_switch": 0.0, "loss_total": 0.2708820402622223, "step": 875 }, { "batch_size": 1, "epoch": 0.35, "step": 875, "tokens_per_device": 5174 }, { "epoch": 0.35, "loss_ce": 0.0006914827972650528, "loss_lvr": 0.5567061305046082, "loss_mode_switch": 0.0, "loss_total": 0.0563620999455452, "step": 875 }, { "batch_size": 4, "epoch": 0.35, "step": 875, "tokens_per_device": 6400 }, { "epoch": 0.35, "loss_ce": 0.1944247931241989, "loss_lvr": 0.7366430163383484, "loss_mode_switch": 0.0, "loss_total": 0.26808908581733704, "step": 875 }, { "batch_size": 4, "epoch": 0.35, "step": 875, "tokens_per_device": 3760 }, { "epoch": 0.35, "loss_ce": 0.44869133830070496, "loss_lvr": 0.9278593063354492, "loss_mode_switch": 0.0, "loss_total": 0.5414772629737854, "step": 875 }, { "batch_size": 4, "epoch": 0.35, "step": 875, "tokens_per_device": 2736 }, { "epoch": 0.35, "loss_ce": 0.008285125717520714, "loss_lvr": 1.1220442056655884, "loss_mode_switch": 0.0, "loss_total": 0.12048955261707306, "step": 875 }, { "epoch": 0.3504, "grad_norm": 1.4656955003738403, "learning_rate": 7.541024278778446e-06, "loss": 0.2745, "step": 876 }, { "batch_size": 4, "epoch": 0.3504, "step": 876, "tokens_per_device": 5888 }, { "epoch": 0.3504, "loss_ce": 0.8758987188339233, "loss_lvr": 0.8316964507102966, "loss_mode_switch": 0.0, "loss_total": 0.9590683579444885, "step": 876 }, { "batch_size": 1, "epoch": 0.3504, "step": 876, "tokens_per_device": 4892 }, { "epoch": 0.3504, "loss_ce": 0.015386458486318588, "loss_lvr": 0.33465102314949036, "loss_mode_switch": 0.0, "loss_total": 0.048851560801267624, "step": 876 }, { "batch_size": 4, "epoch": 0.3504, "step": 876, "tokens_per_device": 5632 }, { "epoch": 0.3504, "loss_ce": 0.12059982866048813, "loss_lvr": 0.7443531155586243, "loss_mode_switch": 0.0, "loss_total": 0.1950351446866989, "step": 876 }, { "batch_size": 4, "epoch": 0.3504, "step": 876, "tokens_per_device": 4424 }, { "epoch": 0.3504, "loss_ce": 0.03003990463912487, "loss_lvr": 0.7776679992675781, "loss_mode_switch": 0.0, "loss_total": 0.10780670493841171, "step": 876 }, { "batch_size": 4, "epoch": 0.3504, "step": 876, "tokens_per_device": 5888 }, { "epoch": 0.3504, "loss_ce": 0.19417400658130646, "loss_lvr": 0.7037045955657959, "loss_mode_switch": 0.0, "loss_total": 0.26454445719718933, "step": 876 }, { "batch_size": 4, "epoch": 0.3504, "step": 876, "tokens_per_device": 1148 }, { "epoch": 0.3504, "loss_ce": 0.45446130633354187, "loss_lvr": 1.2862409353256226, "loss_mode_switch": 0.0, "loss_total": 0.5830854177474976, "step": 876 }, { "batch_size": 4, "epoch": 0.3504, "step": 876, "tokens_per_device": 4280 }, { "epoch": 0.3504, "loss_ce": 0.07382948696613312, "loss_lvr": 0.986078679561615, "loss_mode_switch": 0.0, "loss_total": 0.17243735492229462, "step": 876 }, { "batch_size": 4, "epoch": 0.3504, "step": 876, "tokens_per_device": 1300 }, { "epoch": 0.3504, "loss_ce": 0.6186320781707764, "loss_lvr": 1.1469002962112427, "loss_mode_switch": 0.0, "loss_total": 0.7333220839500427, "step": 876 }, { "epoch": 0.3508, "grad_norm": 1.4268475770950317, "learning_rate": 7.53544348124718e-06, "loss": 0.2824, "step": 877 }, { "batch_size": 1, "epoch": 0.3508, "step": 877, "tokens_per_device": 4877 }, { "epoch": 0.3508, "loss_ce": 0.0644645243883133, "loss_lvr": 0.3925144076347351, "loss_mode_switch": 0.0, "loss_total": 0.10371597111225128, "step": 877 }, { "batch_size": 4, "epoch": 0.3508, "step": 877, "tokens_per_device": 3804 }, { "epoch": 0.3508, "loss_ce": 0.028251629322767258, "loss_lvr": 0.7590149641036987, "loss_mode_switch": 0.0, "loss_total": 0.10415312647819519, "step": 877 }, { "batch_size": 4, "epoch": 0.3508, "step": 877, "tokens_per_device": 3172 }, { "epoch": 0.3508, "loss_ce": 0.4643750786781311, "loss_lvr": 0.9571034908294678, "loss_mode_switch": 0.0, "loss_total": 0.5600854158401489, "step": 877 }, { "batch_size": 4, "epoch": 0.3508, "step": 877, "tokens_per_device": 2540 }, { "epoch": 0.3508, "loss_ce": 0.41959473490715027, "loss_lvr": 0.9512930512428284, "loss_mode_switch": 0.0, "loss_total": 0.5147240161895752, "step": 877 }, { "batch_size": 4, "epoch": 0.3508, "step": 877, "tokens_per_device": 5704 }, { "epoch": 0.3508, "loss_ce": 0.1354195773601532, "loss_lvr": 1.0880776643753052, "loss_mode_switch": 0.0, "loss_total": 0.2442273497581482, "step": 877 }, { "batch_size": 4, "epoch": 0.3508, "step": 877, "tokens_per_device": 5208 }, { "epoch": 0.3508, "loss_ce": 0.3637225329875946, "loss_lvr": 0.9457222819328308, "loss_mode_switch": 0.0, "loss_total": 0.45829474925994873, "step": 877 }, { "batch_size": 1, "epoch": 0.3508, "step": 877, "tokens_per_device": 4918 }, { "epoch": 0.3508, "loss_ce": 0.0022385986521840096, "loss_lvr": 0.47803932428359985, "loss_mode_switch": 0.0, "loss_total": 0.0500425323843956, "step": 877 }, { "batch_size": 1, "epoch": 0.3508, "step": 877, "tokens_per_device": 4891 }, { "epoch": 0.3508, "loss_ce": 0.07445647567510605, "loss_lvr": 0.2016751915216446, "loss_mode_switch": 0.0, "loss_total": 0.09462399780750275, "step": 877 }, { "epoch": 0.3512, "grad_norm": 1.4510384798049927, "learning_rate": 7.52985842841641e-06, "loss": 0.2882, "step": 878 }, { "batch_size": 4, "epoch": 0.3512, "step": 878, "tokens_per_device": 4612 }, { "epoch": 0.3512, "loss_ce": 0.3302607536315918, "loss_lvr": 1.0853888988494873, "loss_mode_switch": 0.0, "loss_total": 0.438799649477005, "step": 878 }, { "batch_size": 4, "epoch": 0.3512, "step": 878, "tokens_per_device": 5832 }, { "epoch": 0.3512, "loss_ce": 0.14530861377716064, "loss_lvr": 0.7423850893974304, "loss_mode_switch": 0.0, "loss_total": 0.2195471227169037, "step": 878 }, { "batch_size": 4, "epoch": 0.3512, "step": 878, "tokens_per_device": 2680 }, { "epoch": 0.3512, "loss_ce": 0.6947644352912903, "loss_lvr": 0.9696109294891357, "loss_mode_switch": 0.0, "loss_total": 0.7917255163192749, "step": 878 }, { "batch_size": 1, "epoch": 0.3512, "step": 878, "tokens_per_device": 5538 }, { "epoch": 0.3512, "loss_ce": 0.04262879490852356, "loss_lvr": 0.40313848853111267, "loss_mode_switch": 0.0, "loss_total": 0.0829426497220993, "step": 878 }, { "batch_size": 4, "epoch": 0.3512, "step": 878, "tokens_per_device": 4348 }, { "epoch": 0.3512, "loss_ce": 0.21689824759960175, "loss_lvr": 0.901877760887146, "loss_mode_switch": 0.0, "loss_total": 0.3070860207080841, "step": 878 }, { "batch_size": 4, "epoch": 0.3512, "step": 878, "tokens_per_device": 6216 }, { "epoch": 0.3512, "loss_ce": 0.21059300005435944, "loss_lvr": 0.6806156635284424, "loss_mode_switch": 0.0, "loss_total": 0.2786545753479004, "step": 878 }, { "batch_size": 4, "epoch": 0.3512, "step": 878, "tokens_per_device": 9308 }, { "epoch": 0.3512, "loss_ce": 0.02373204566538334, "loss_lvr": 0.5938647985458374, "loss_mode_switch": 0.0, "loss_total": 0.08311852812767029, "step": 878 }, { "batch_size": 1, "epoch": 0.3512, "step": 878, "tokens_per_device": 4937 }, { "epoch": 0.3512, "loss_ce": 0.00754415662959218, "loss_lvr": 0.2985149323940277, "loss_mode_switch": 0.0, "loss_total": 0.0373956523835659, "step": 878 }, { "epoch": 0.3516, "grad_norm": 1.177618145942688, "learning_rate": 7.524269129659674e-06, "loss": 0.3054, "step": 879 }, { "batch_size": 4, "epoch": 0.3516, "step": 879, "tokens_per_device": 5692 }, { "epoch": 0.3516, "loss_ce": 0.23955345153808594, "loss_lvr": 0.8253054022789001, "loss_mode_switch": 0.0, "loss_total": 0.3220840096473694, "step": 879 }, { "batch_size": 4, "epoch": 0.3516, "step": 879, "tokens_per_device": 2392 }, { "epoch": 0.3516, "loss_ce": 0.17114807665348053, "loss_lvr": 0.8028922080993652, "loss_mode_switch": 0.0, "loss_total": 0.25143730640411377, "step": 879 }, { "batch_size": 4, "epoch": 0.3516, "step": 879, "tokens_per_device": 2684 }, { "epoch": 0.3516, "loss_ce": 0.10269840806722641, "loss_lvr": 0.9254335165023804, "loss_mode_switch": 0.0, "loss_total": 0.1952417641878128, "step": 879 }, { "batch_size": 4, "epoch": 0.3516, "step": 879, "tokens_per_device": 3748 }, { "epoch": 0.3516, "loss_ce": 0.49591580033302307, "loss_lvr": 0.9845098257064819, "loss_mode_switch": 0.0, "loss_total": 0.5943667888641357, "step": 879 }, { "batch_size": 1, "epoch": 0.3516, "step": 879, "tokens_per_device": 5042 }, { "epoch": 0.3516, "loss_ce": 0.0028304471634328365, "loss_lvr": 0.41252779960632324, "loss_mode_switch": 0.0, "loss_total": 0.04408322647213936, "step": 879 }, { "batch_size": 4, "epoch": 0.3516, "step": 879, "tokens_per_device": 4596 }, { "epoch": 0.3516, "loss_ce": 0.23400253057479858, "loss_lvr": 0.7859655618667603, "loss_mode_switch": 0.0, "loss_total": 0.3125990927219391, "step": 879 }, { "batch_size": 1, "epoch": 0.3516, "step": 879, "tokens_per_device": 4886 }, { "epoch": 0.3516, "loss_ce": 0.04700321704149246, "loss_lvr": 0.8978288173675537, "loss_mode_switch": 0.0, "loss_total": 0.1367861032485962, "step": 879 }, { "batch_size": 4, "epoch": 0.3516, "step": 879, "tokens_per_device": 4696 }, { "epoch": 0.3516, "loss_ce": 0.28914159536361694, "loss_lvr": 0.7997363805770874, "loss_mode_switch": 0.0, "loss_total": 0.3691152334213257, "step": 879 }, { "epoch": 0.352, "grad_norm": 1.2016087770462036, "learning_rate": 7.5186755943576324e-06, "loss": 0.2532, "step": 880 }, { "batch_size": 1, "epoch": 0.352, "step": 880, "tokens_per_device": 4782 }, { "epoch": 0.352, "loss_ce": 0.031522300094366074, "loss_lvr": 0.7430357336997986, "loss_mode_switch": 0.0, "loss_total": 0.10582587122917175, "step": 880 }, { "batch_size": 4, "epoch": 0.352, "step": 880, "tokens_per_device": 4616 }, { "epoch": 0.352, "loss_ce": 0.4916531443595886, "loss_lvr": 1.046176791191101, "loss_mode_switch": 0.0, "loss_total": 0.5962707996368408, "step": 880 }, { "batch_size": 4, "epoch": 0.352, "step": 880, "tokens_per_device": 11832 }, { "epoch": 0.352, "loss_ce": 0.4370139241218567, "loss_lvr": 0.3949509859085083, "loss_mode_switch": 0.0, "loss_total": 0.4765090346336365, "step": 880 }, { "batch_size": 4, "epoch": 0.352, "step": 880, "tokens_per_device": 3648 }, { "epoch": 0.352, "loss_ce": 0.5759233832359314, "loss_lvr": 0.8962370157241821, "loss_mode_switch": 0.0, "loss_total": 0.6655470728874207, "step": 880 }, { "batch_size": 4, "epoch": 0.352, "step": 880, "tokens_per_device": 4424 }, { "epoch": 0.352, "loss_ce": 0.40001779794692993, "loss_lvr": 0.8233845233917236, "loss_mode_switch": 0.0, "loss_total": 0.4823562502861023, "step": 880 }, { "batch_size": 4, "epoch": 0.352, "step": 880, "tokens_per_device": 10640 }, { "epoch": 0.352, "loss_ce": 0.5788286924362183, "loss_lvr": 0.8097175359725952, "loss_mode_switch": 0.0, "loss_total": 0.6598004698753357, "step": 880 }, { "batch_size": 4, "epoch": 0.352, "step": 880, "tokens_per_device": 5132 }, { "epoch": 0.352, "loss_ce": 0.16046682000160217, "loss_lvr": 0.8329651355743408, "loss_mode_switch": 0.0, "loss_total": 0.24376332759857178, "step": 880 }, { "batch_size": 1, "epoch": 0.352, "step": 880, "tokens_per_device": 4934 }, { "epoch": 0.352, "loss_ce": 0.015042463317513466, "loss_lvr": 0.5098943114280701, "loss_mode_switch": 0.0, "loss_total": 0.06603189557790756, "step": 880 }, { "epoch": 0.3524, "grad_norm": 1.243143081665039, "learning_rate": 7.5130778318980614e-06, "loss": 0.3074, "step": 881 }, { "batch_size": 1, "epoch": 0.3524, "step": 881, "tokens_per_device": 4887 }, { "epoch": 0.3524, "loss_ce": 0.03432854637503624, "loss_lvr": 1.1457817554473877, "loss_mode_switch": 0.0, "loss_total": 0.14890672266483307, "step": 881 }, { "batch_size": 4, "epoch": 0.3524, "step": 881, "tokens_per_device": 9884 }, { "epoch": 0.3524, "loss_ce": 0.12527252733707428, "loss_lvr": 0.8676272630691528, "loss_mode_switch": 0.0, "loss_total": 0.21203525364398956, "step": 881 }, { "batch_size": 4, "epoch": 0.3524, "step": 881, "tokens_per_device": 3732 }, { "epoch": 0.3524, "loss_ce": 0.17478512227535248, "loss_lvr": 0.8771312832832336, "loss_mode_switch": 0.0, "loss_total": 0.26249825954437256, "step": 881 }, { "batch_size": 4, "epoch": 0.3524, "step": 881, "tokens_per_device": 3204 }, { "epoch": 0.3524, "loss_ce": 0.42862290143966675, "loss_lvr": 0.9759235382080078, "loss_mode_switch": 0.0, "loss_total": 0.5262152552604675, "step": 881 }, { "batch_size": 4, "epoch": 0.3524, "step": 881, "tokens_per_device": 5188 }, { "epoch": 0.3524, "loss_ce": 0.1593182533979416, "loss_lvr": 0.7812815308570862, "loss_mode_switch": 0.0, "loss_total": 0.2374463975429535, "step": 881 }, { "batch_size": 4, "epoch": 0.3524, "step": 881, "tokens_per_device": 1540 }, { "epoch": 0.3524, "loss_ce": 0.1298973709344864, "loss_lvr": 0.9089623093605042, "loss_mode_switch": 0.0, "loss_total": 0.22079360485076904, "step": 881 }, { "batch_size": 1, "epoch": 0.3524, "step": 881, "tokens_per_device": 5026 }, { "epoch": 0.3524, "loss_ce": 0.028796978294849396, "loss_lvr": 0.8261737823486328, "loss_mode_switch": 0.0, "loss_total": 0.1114143580198288, "step": 881 }, { "batch_size": 4, "epoch": 0.3524, "step": 881, "tokens_per_device": 3808 }, { "epoch": 0.3524, "loss_ce": 0.2987786531448364, "loss_lvr": 0.9466295838356018, "loss_mode_switch": 0.0, "loss_total": 0.3934416174888611, "step": 881 }, { "epoch": 0.3528, "grad_norm": 1.3488569259643555, "learning_rate": 7.5074758516758276e-06, "loss": 0.2989, "step": 882 }, { "batch_size": 4, "epoch": 0.3528, "step": 882, "tokens_per_device": 1516 }, { "epoch": 0.3528, "loss_ce": 0.3581237196922302, "loss_lvr": 1.1151233911514282, "loss_mode_switch": 0.0, "loss_total": 0.46963605284690857, "step": 882 }, { "batch_size": 4, "epoch": 0.3528, "step": 882, "tokens_per_device": 2756 }, { "epoch": 0.3528, "loss_ce": 0.5089080333709717, "loss_lvr": 1.1108312606811523, "loss_mode_switch": 0.0, "loss_total": 0.6199911832809448, "step": 882 }, { "batch_size": 4, "epoch": 0.3528, "step": 882, "tokens_per_device": 1328 }, { "epoch": 0.3528, "loss_ce": 0.19103221595287323, "loss_lvr": 1.1344653367996216, "loss_mode_switch": 0.0, "loss_total": 0.3044787645339966, "step": 882 }, { "batch_size": 4, "epoch": 0.3528, "step": 882, "tokens_per_device": 1912 }, { "epoch": 0.3528, "loss_ce": 0.09913122653961182, "loss_lvr": 0.9738323092460632, "loss_mode_switch": 0.0, "loss_total": 0.19651445746421814, "step": 882 }, { "batch_size": 1, "epoch": 0.3528, "step": 882, "tokens_per_device": 5108 }, { "epoch": 0.3528, "loss_ce": 0.09863835573196411, "loss_lvr": 0.47802212834358215, "loss_mode_switch": 0.0, "loss_total": 0.1464405655860901, "step": 882 }, { "batch_size": 4, "epoch": 0.3528, "step": 882, "tokens_per_device": 6552 }, { "epoch": 0.3528, "loss_ce": 0.12471074610948563, "loss_lvr": 0.8354390859603882, "loss_mode_switch": 0.0, "loss_total": 0.20825466513633728, "step": 882 }, { "batch_size": 4, "epoch": 0.3528, "step": 882, "tokens_per_device": 4620 }, { "epoch": 0.3528, "loss_ce": 0.17535945773124695, "loss_lvr": 1.0365772247314453, "loss_mode_switch": 0.0, "loss_total": 0.2790171802043915, "step": 882 }, { "batch_size": 4, "epoch": 0.3528, "step": 882, "tokens_per_device": 4288 }, { "epoch": 0.3528, "loss_ce": 0.116269052028656, "loss_lvr": 0.8194051384925842, "loss_mode_switch": 0.0, "loss_total": 0.19820956885814667, "step": 882 }, { "epoch": 0.3532, "grad_norm": 1.3548532724380493, "learning_rate": 7.501869663092875e-06, "loss": 0.3176, "step": 883 }, { "batch_size": 4, "epoch": 0.3532, "step": 883, "tokens_per_device": 8580 }, { "epoch": 0.3532, "loss_ce": 0.17696963250637054, "loss_lvr": 0.9637892842292786, "loss_mode_switch": 0.0, "loss_total": 0.2733485698699951, "step": 883 }, { "batch_size": 4, "epoch": 0.3532, "step": 883, "tokens_per_device": 1540 }, { "epoch": 0.3532, "loss_ce": 0.18256792426109314, "loss_lvr": 0.9997349977493286, "loss_mode_switch": 0.0, "loss_total": 0.282541424036026, "step": 883 }, { "batch_size": 4, "epoch": 0.3532, "step": 883, "tokens_per_device": 3900 }, { "epoch": 0.3532, "loss_ce": 0.3002494275569916, "loss_lvr": 0.9403315186500549, "loss_mode_switch": 0.0, "loss_total": 0.39428257942199707, "step": 883 }, { "batch_size": 1, "epoch": 0.3532, "step": 883, "tokens_per_device": 4737 }, { "epoch": 0.3532, "loss_ce": 0.012254227884113789, "loss_lvr": 0.4628584384918213, "loss_mode_switch": 0.0, "loss_total": 0.05854007229208946, "step": 883 }, { "batch_size": 1, "epoch": 0.3532, "step": 883, "tokens_per_device": 5021 }, { "epoch": 0.3532, "loss_ce": 0.12761905789375305, "loss_lvr": 0.5028669238090515, "loss_mode_switch": 0.0, "loss_total": 0.17790575325489044, "step": 883 }, { "batch_size": 4, "epoch": 0.3532, "step": 883, "tokens_per_device": 4236 }, { "epoch": 0.3532, "loss_ce": 0.5561263561248779, "loss_lvr": 0.857570469379425, "loss_mode_switch": 0.0, "loss_total": 0.641883373260498, "step": 883 }, { "batch_size": 4, "epoch": 0.3532, "step": 883, "tokens_per_device": 6004 }, { "epoch": 0.3532, "loss_ce": 0.0900530219078064, "loss_lvr": 0.8493503332138062, "loss_mode_switch": 0.0, "loss_total": 0.1749880611896515, "step": 883 }, { "batch_size": 1, "epoch": 0.3532, "step": 883, "tokens_per_device": 5113 }, { "epoch": 0.3532, "loss_ce": 0.0019773009698837996, "loss_lvr": 0.3677259683609009, "loss_mode_switch": 0.0, "loss_total": 0.038749899715185165, "step": 883 }, { "epoch": 0.3536, "grad_norm": 1.3243706226348877, "learning_rate": 7.496259275558218e-06, "loss": 0.2621, "step": 884 }, { "batch_size": 1, "epoch": 0.3536, "step": 884, "tokens_per_device": 5419 }, { "epoch": 0.3536, "loss_ce": 0.0062642632983624935, "loss_lvr": 0.34439605474472046, "loss_mode_switch": 0.0, "loss_total": 0.04070386663079262, "step": 884 }, { "batch_size": 4, "epoch": 0.3536, "step": 884, "tokens_per_device": 5440 }, { "epoch": 0.3536, "loss_ce": 0.03778335079550743, "loss_lvr": 0.7819043397903442, "loss_mode_switch": 0.0, "loss_total": 0.11597378551959991, "step": 884 }, { "batch_size": 1, "epoch": 0.3536, "step": 884, "tokens_per_device": 4894 }, { "epoch": 0.3536, "loss_ce": 0.02790958434343338, "loss_lvr": 0.6527600884437561, "loss_mode_switch": 0.0, "loss_total": 0.09318559616804123, "step": 884 }, { "batch_size": 4, "epoch": 0.3536, "step": 884, "tokens_per_device": 3860 }, { "epoch": 0.3536, "loss_ce": 0.31107258796691895, "loss_lvr": 1.0654246807098389, "loss_mode_switch": 0.0, "loss_total": 0.41761505603790283, "step": 884 }, { "batch_size": 4, "epoch": 0.3536, "step": 884, "tokens_per_device": 4556 }, { "epoch": 0.3536, "loss_ce": 0.17629508674144745, "loss_lvr": 0.8301247358322144, "loss_mode_switch": 0.0, "loss_total": 0.2593075633049011, "step": 884 }, { "batch_size": 4, "epoch": 0.3536, "step": 884, "tokens_per_device": 9240 }, { "epoch": 0.3536, "loss_ce": 0.4556008577346802, "loss_lvr": 0.6209656000137329, "loss_mode_switch": 0.0, "loss_total": 0.5176973938941956, "step": 884 }, { "batch_size": 1, "epoch": 0.3536, "step": 884, "tokens_per_device": 4774 }, { "epoch": 0.3536, "loss_ce": 0.004511152859777212, "loss_lvr": 0.27919888496398926, "loss_mode_switch": 0.0, "loss_total": 0.032431039959192276, "step": 884 }, { "batch_size": 4, "epoch": 0.3536, "step": 884, "tokens_per_device": 1464 }, { "epoch": 0.3536, "loss_ce": 0.48003503680229187, "loss_lvr": 1.1336252689361572, "loss_mode_switch": 0.0, "loss_total": 0.5933975577354431, "step": 884 }, { "epoch": 0.354, "grad_norm": 1.3920470476150513, "learning_rate": 7.490644698487909e-06, "loss": 0.2897, "step": 885 }, { "batch_size": 4, "epoch": 0.354, "step": 885, "tokens_per_device": 4296 }, { "epoch": 0.354, "loss_ce": 0.2559117376804352, "loss_lvr": 0.6711258888244629, "loss_mode_switch": 0.0, "loss_total": 0.32302433252334595, "step": 885 }, { "batch_size": 4, "epoch": 0.354, "step": 885, "tokens_per_device": 3892 }, { "epoch": 0.354, "loss_ce": 0.33020612597465515, "loss_lvr": 1.0447304248809814, "loss_mode_switch": 0.0, "loss_total": 0.43467918038368225, "step": 885 }, { "batch_size": 4, "epoch": 0.354, "step": 885, "tokens_per_device": 4840 }, { "epoch": 0.354, "loss_ce": 0.3894759714603424, "loss_lvr": 0.955216646194458, "loss_mode_switch": 0.0, "loss_total": 0.48499763011932373, "step": 885 }, { "batch_size": 1, "epoch": 0.354, "step": 885, "tokens_per_device": 4922 }, { "epoch": 0.354, "loss_ce": 0.0030025627929717302, "loss_lvr": 0.25365734100341797, "loss_mode_switch": 0.0, "loss_total": 0.028368297964334488, "step": 885 }, { "batch_size": 4, "epoch": 0.354, "step": 885, "tokens_per_device": 1520 }, { "epoch": 0.354, "loss_ce": 0.4110530912876129, "loss_lvr": 1.0275224447250366, "loss_mode_switch": 0.0, "loss_total": 0.5138053297996521, "step": 885 }, { "batch_size": 1, "epoch": 0.354, "step": 885, "tokens_per_device": 5080 }, { "epoch": 0.354, "loss_ce": 0.03188064321875572, "loss_lvr": 0.5292264819145203, "loss_mode_switch": 0.0, "loss_total": 0.08480329066514969, "step": 885 }, { "batch_size": 1, "epoch": 0.354, "step": 885, "tokens_per_device": 4858 }, { "epoch": 0.354, "loss_ce": 0.13857100903987885, "loss_lvr": 0.36421334743499756, "loss_mode_switch": 0.0, "loss_total": 0.17499235272407532, "step": 885 }, { "batch_size": 1, "epoch": 0.354, "step": 885, "tokens_per_device": 5140 }, { "epoch": 0.354, "loss_ce": 0.01213359646499157, "loss_lvr": 0.3808537423610687, "loss_mode_switch": 0.0, "loss_total": 0.05021896958351135, "step": 885 }, { "epoch": 0.3544, "grad_norm": 1.4826136827468872, "learning_rate": 7.485025941305036e-06, "loss": 0.2688, "step": 886 }, { "batch_size": 4, "epoch": 0.3544, "step": 886, "tokens_per_device": 2104 }, { "epoch": 0.3544, "loss_ce": 0.1767457276582718, "loss_lvr": 0.9293371438980103, "loss_mode_switch": 0.0, "loss_total": 0.2696794271469116, "step": 886 }, { "batch_size": 1, "epoch": 0.3544, "step": 886, "tokens_per_device": 4901 }, { "epoch": 0.3544, "loss_ce": 0.06689606606960297, "loss_lvr": 0.4587767422199249, "loss_mode_switch": 0.0, "loss_total": 0.11277374625205994, "step": 886 }, { "batch_size": 4, "epoch": 0.3544, "step": 886, "tokens_per_device": 1732 }, { "epoch": 0.3544, "loss_ce": 0.26759594678878784, "loss_lvr": 1.016955852508545, "loss_mode_switch": 0.0, "loss_total": 0.3692915439605713, "step": 886 }, { "batch_size": 1, "epoch": 0.3544, "step": 886, "tokens_per_device": 5128 }, { "epoch": 0.3544, "loss_ce": 0.04353340342640877, "loss_lvr": 0.47774574160575867, "loss_mode_switch": 0.0, "loss_total": 0.09130798280239105, "step": 886 }, { "batch_size": 4, "epoch": 0.3544, "step": 886, "tokens_per_device": 2828 }, { "epoch": 0.3544, "loss_ce": 0.17839910089969635, "loss_lvr": 1.0216007232666016, "loss_mode_switch": 0.0, "loss_total": 0.2805591821670532, "step": 886 }, { "batch_size": 4, "epoch": 0.3544, "step": 886, "tokens_per_device": 14324 }, { "epoch": 0.3544, "loss_ce": 0.3554733097553253, "loss_lvr": 0.7652083039283752, "loss_mode_switch": 0.0, "loss_total": 0.43199414014816284, "step": 886 }, { "batch_size": 4, "epoch": 0.3544, "step": 886, "tokens_per_device": 3928 }, { "epoch": 0.3544, "loss_ce": 0.2506553530693054, "loss_lvr": 0.9228407740592957, "loss_mode_switch": 0.0, "loss_total": 0.34293943643569946, "step": 886 }, { "batch_size": 1, "epoch": 0.3544, "step": 886, "tokens_per_device": 5635 }, { "epoch": 0.3544, "loss_ce": 0.006223954726010561, "loss_lvr": 0.24566121399402618, "loss_mode_switch": 0.0, "loss_total": 0.030790075659751892, "step": 886 }, { "epoch": 0.3548, "grad_norm": 1.2933093309402466, "learning_rate": 7.4794030134397055e-06, "loss": 0.309, "step": 887 }, { "batch_size": 4, "epoch": 0.3548, "step": 887, "tokens_per_device": 1784 }, { "epoch": 0.3548, "loss_ce": 0.4542326033115387, "loss_lvr": 0.809077262878418, "loss_mode_switch": 0.0, "loss_total": 0.535140335559845, "step": 887 }, { "batch_size": 4, "epoch": 0.3548, "step": 887, "tokens_per_device": 4232 }, { "epoch": 0.3548, "loss_ce": 0.3656783103942871, "loss_lvr": 0.8777390122413635, "loss_mode_switch": 0.0, "loss_total": 0.4534522294998169, "step": 887 }, { "batch_size": 4, "epoch": 0.3548, "step": 887, "tokens_per_device": 4220 }, { "epoch": 0.3548, "loss_ce": 0.20948410034179688, "loss_lvr": 0.898121178150177, "loss_mode_switch": 0.0, "loss_total": 0.29929623007774353, "step": 887 }, { "batch_size": 4, "epoch": 0.3548, "step": 887, "tokens_per_device": 5732 }, { "epoch": 0.3548, "loss_ce": 0.27077633142471313, "loss_lvr": 0.9791742563247681, "loss_mode_switch": 0.0, "loss_total": 0.3686937689781189, "step": 887 }, { "batch_size": 4, "epoch": 0.3548, "step": 887, "tokens_per_device": 4252 }, { "epoch": 0.3548, "loss_ce": 0.15726414322853088, "loss_lvr": 0.6986922025680542, "loss_mode_switch": 0.0, "loss_total": 0.2271333634853363, "step": 887 }, { "batch_size": 1, "epoch": 0.3548, "step": 887, "tokens_per_device": 4890 }, { "epoch": 0.3548, "loss_ce": 0.06242603436112404, "loss_lvr": 1.1142964363098145, "loss_mode_switch": 0.0, "loss_total": 0.17385567724704742, "step": 887 }, { "batch_size": 1, "epoch": 0.3548, "step": 887, "tokens_per_device": 4393 }, { "epoch": 0.3548, "loss_ce": 0.018341168761253357, "loss_lvr": 0.855802595615387, "loss_mode_switch": 0.0, "loss_total": 0.10392142832279205, "step": 887 }, { "batch_size": 4, "epoch": 0.3548, "step": 887, "tokens_per_device": 2592 }, { "epoch": 0.3548, "loss_ce": 0.8903533816337585, "loss_lvr": 1.1485564708709717, "loss_mode_switch": 0.0, "loss_total": 1.005208969116211, "step": 887 }, { "epoch": 0.3552, "grad_norm": 1.168035864830017, "learning_rate": 7.473775924329018e-06, "loss": 0.2539, "step": 888 }, { "batch_size": 4, "epoch": 0.3552, "step": 888, "tokens_per_device": 4924 }, { "epoch": 0.3552, "loss_ce": 0.774874746799469, "loss_lvr": 0.9699308276176453, "loss_mode_switch": 0.0, "loss_total": 0.871867835521698, "step": 888 }, { "batch_size": 1, "epoch": 0.3552, "step": 888, "tokens_per_device": 5251 }, { "epoch": 0.3552, "loss_ce": 0.2538658082485199, "loss_lvr": 0.5186436176300049, "loss_mode_switch": 0.0, "loss_total": 0.3057301640510559, "step": 888 }, { "batch_size": 1, "epoch": 0.3552, "step": 888, "tokens_per_device": 5114 }, { "epoch": 0.3552, "loss_ce": 0.06281664222478867, "loss_lvr": 0.6784133911132812, "loss_mode_switch": 0.0, "loss_total": 0.13065798580646515, "step": 888 }, { "batch_size": 4, "epoch": 0.3552, "step": 888, "tokens_per_device": 11268 }, { "epoch": 0.3552, "loss_ce": 0.006194226909428835, "loss_lvr": 0.7367928624153137, "loss_mode_switch": 0.0, "loss_total": 0.07987351715564728, "step": 888 }, { "batch_size": 1, "epoch": 0.3552, "step": 888, "tokens_per_device": 5086 }, { "epoch": 0.3552, "loss_ce": 0.004399394150823355, "loss_lvr": 0.28832945227622986, "loss_mode_switch": 0.0, "loss_total": 0.03323233872652054, "step": 888 }, { "batch_size": 4, "epoch": 0.3552, "step": 888, "tokens_per_device": 4820 }, { "epoch": 0.3552, "loss_ce": 0.08624939620494843, "loss_lvr": 0.7241770625114441, "loss_mode_switch": 0.0, "loss_total": 0.15866710245609283, "step": 888 }, { "batch_size": 4, "epoch": 0.3552, "step": 888, "tokens_per_device": 3496 }, { "epoch": 0.3552, "loss_ce": 0.41105973720550537, "loss_lvr": 0.9098678231239319, "loss_mode_switch": 0.0, "loss_total": 0.502046525478363, "step": 888 }, { "batch_size": 4, "epoch": 0.3552, "step": 888, "tokens_per_device": 10380 }, { "epoch": 0.3552, "loss_ce": 0.20667187869548798, "loss_lvr": 0.9129946827888489, "loss_mode_switch": 0.0, "loss_total": 0.29797133803367615, "step": 888 }, { "epoch": 0.3556, "grad_norm": 1.318596363067627, "learning_rate": 7.468144683417061e-06, "loss": 0.2712, "step": 889 }, { "batch_size": 4, "epoch": 0.3556, "step": 889, "tokens_per_device": 4624 }, { "epoch": 0.3556, "loss_ce": 0.2598191201686859, "loss_lvr": 0.9671381115913391, "loss_mode_switch": 0.0, "loss_total": 0.3565329313278198, "step": 889 }, { "batch_size": 4, "epoch": 0.3556, "step": 889, "tokens_per_device": 4192 }, { "epoch": 0.3556, "loss_ce": 0.44747504591941833, "loss_lvr": 1.0234342813491821, "loss_mode_switch": 0.0, "loss_total": 0.5498184561729431, "step": 889 }, { "batch_size": 4, "epoch": 0.3556, "step": 889, "tokens_per_device": 6236 }, { "epoch": 0.3556, "loss_ce": 0.4135209321975708, "loss_lvr": 0.6078047752380371, "loss_mode_switch": 0.0, "loss_total": 0.47430139780044556, "step": 889 }, { "batch_size": 4, "epoch": 0.3556, "step": 889, "tokens_per_device": 4040 }, { "epoch": 0.3556, "loss_ce": 0.24302330613136292, "loss_lvr": 0.8457964062690735, "loss_mode_switch": 0.0, "loss_total": 0.32760295271873474, "step": 889 }, { "batch_size": 4, "epoch": 0.3556, "step": 889, "tokens_per_device": 9612 }, { "epoch": 0.3556, "loss_ce": 0.015684768557548523, "loss_lvr": 0.5366176962852478, "loss_mode_switch": 0.0, "loss_total": 0.06934653967618942, "step": 889 }, { "batch_size": 4, "epoch": 0.3556, "step": 889, "tokens_per_device": 6244 }, { "epoch": 0.3556, "loss_ce": 0.23740875720977783, "loss_lvr": 1.0558900833129883, "loss_mode_switch": 0.0, "loss_total": 0.3429977595806122, "step": 889 }, { "batch_size": 4, "epoch": 0.3556, "step": 889, "tokens_per_device": 8896 }, { "epoch": 0.3556, "loss_ce": 0.3794509172439575, "loss_lvr": 0.9311023354530334, "loss_mode_switch": 0.0, "loss_total": 0.47256115078926086, "step": 889 }, { "batch_size": 1, "epoch": 0.3556, "step": 889, "tokens_per_device": 4983 }, { "epoch": 0.3556, "loss_ce": 0.01313081942498684, "loss_lvr": 0.25150904059410095, "loss_mode_switch": 0.0, "loss_total": 0.038281723856925964, "step": 889 }, { "epoch": 0.356, "grad_norm": 1.2322148084640503, "learning_rate": 7.462509300154892e-06, "loss": 0.2781, "step": 890 }, { "batch_size": 1, "epoch": 0.356, "step": 890, "tokens_per_device": 5695 }, { "epoch": 0.356, "loss_ce": 0.017464611679315567, "loss_lvr": 0.338837206363678, "loss_mode_switch": 0.0, "loss_total": 0.051348332315683365, "step": 890 }, { "batch_size": 4, "epoch": 0.356, "step": 890, "tokens_per_device": 1272 }, { "epoch": 0.356, "loss_ce": 0.32352933287620544, "loss_lvr": 1.1088913679122925, "loss_mode_switch": 0.0, "loss_total": 0.4344184696674347, "step": 890 }, { "batch_size": 1, "epoch": 0.356, "step": 890, "tokens_per_device": 4924 }, { "epoch": 0.356, "loss_ce": 0.8366226553916931, "loss_lvr": 0.48467981815338135, "loss_mode_switch": 0.0, "loss_total": 0.8850906491279602, "step": 890 }, { "batch_size": 1, "epoch": 0.356, "step": 890, "tokens_per_device": 5410 }, { "epoch": 0.356, "loss_ce": 0.014683468267321587, "loss_lvr": 0.37466344237327576, "loss_mode_switch": 0.0, "loss_total": 0.05214980989694595, "step": 890 }, { "batch_size": 4, "epoch": 0.356, "step": 890, "tokens_per_device": 2628 }, { "epoch": 0.356, "loss_ce": 0.4380250871181488, "loss_lvr": 0.9263234734535217, "loss_mode_switch": 0.0, "loss_total": 0.5306574106216431, "step": 890 }, { "batch_size": 1, "epoch": 0.356, "step": 890, "tokens_per_device": 5162 }, { "epoch": 0.356, "loss_ce": 0.04105820134282112, "loss_lvr": 0.7185532450675964, "loss_mode_switch": 0.0, "loss_total": 0.11291353404521942, "step": 890 }, { "batch_size": 4, "epoch": 0.356, "step": 890, "tokens_per_device": 3820 }, { "epoch": 0.356, "loss_ce": 0.23806403577327728, "loss_lvr": 1.0090835094451904, "loss_mode_switch": 0.0, "loss_total": 0.33897238969802856, "step": 890 }, { "batch_size": 1, "epoch": 0.356, "step": 890, "tokens_per_device": 4909 }, { "epoch": 0.356, "loss_ce": 0.012222906574606895, "loss_lvr": 0.6323642730712891, "loss_mode_switch": 0.0, "loss_total": 0.07545933872461319, "step": 890 }, { "epoch": 0.3564, "grad_norm": 1.278691291809082, "learning_rate": 7.456869784000517e-06, "loss": 0.2795, "step": 891 }, { "batch_size": 4, "epoch": 0.3564, "step": 891, "tokens_per_device": 4256 }, { "epoch": 0.3564, "loss_ce": 0.19886428117752075, "loss_lvr": 0.8018353581428528, "loss_mode_switch": 0.0, "loss_total": 0.27904781699180603, "step": 891 }, { "batch_size": 4, "epoch": 0.3564, "step": 891, "tokens_per_device": 10592 }, { "epoch": 0.3564, "loss_ce": 0.6927980184555054, "loss_lvr": 0.7593468427658081, "loss_mode_switch": 0.0, "loss_total": 0.7687327265739441, "step": 891 }, { "batch_size": 4, "epoch": 0.3564, "step": 891, "tokens_per_device": 4884 }, { "epoch": 0.3564, "loss_ce": 0.25301671028137207, "loss_lvr": 0.9366680979728699, "loss_mode_switch": 0.0, "loss_total": 0.346683531999588, "step": 891 }, { "batch_size": 4, "epoch": 0.3564, "step": 891, "tokens_per_device": 4304 }, { "epoch": 0.3564, "loss_ce": 0.3784143328666687, "loss_lvr": 0.9123296141624451, "loss_mode_switch": 0.0, "loss_total": 0.46964728832244873, "step": 891 }, { "batch_size": 4, "epoch": 0.3564, "step": 891, "tokens_per_device": 3860 }, { "epoch": 0.3564, "loss_ce": 0.2902841567993164, "loss_lvr": 0.9734948873519897, "loss_mode_switch": 0.0, "loss_total": 0.38763365149497986, "step": 891 }, { "batch_size": 4, "epoch": 0.3564, "step": 891, "tokens_per_device": 1332 }, { "epoch": 0.3564, "loss_ce": 0.7334506511688232, "loss_lvr": 0.9466193914413452, "loss_mode_switch": 0.0, "loss_total": 0.8281126022338867, "step": 891 }, { "batch_size": 1, "epoch": 0.3564, "step": 891, "tokens_per_device": 5112 }, { "epoch": 0.3564, "loss_ce": 0.0011496038641780615, "loss_lvr": 0.37604212760925293, "loss_mode_switch": 0.0, "loss_total": 0.03875381499528885, "step": 891 }, { "batch_size": 4, "epoch": 0.3564, "step": 891, "tokens_per_device": 5628 }, { "epoch": 0.3564, "loss_ce": 0.07935931533575058, "loss_lvr": 0.7560107707977295, "loss_mode_switch": 0.0, "loss_total": 0.15496039390563965, "step": 891 }, { "epoch": 0.3568, "grad_norm": 1.2020986080169678, "learning_rate": 7.4512261444188805e-06, "loss": 0.2947, "step": 892 }, { "batch_size": 4, "epoch": 0.3568, "step": 892, "tokens_per_device": 3804 }, { "epoch": 0.3568, "loss_ce": 0.46951764822006226, "loss_lvr": 1.2843531370162964, "loss_mode_switch": 0.0, "loss_total": 0.5979529619216919, "step": 892 }, { "batch_size": 4, "epoch": 0.3568, "step": 892, "tokens_per_device": 5856 }, { "epoch": 0.3568, "loss_ce": 0.023879149928689003, "loss_lvr": 0.7802507877349854, "loss_mode_switch": 0.0, "loss_total": 0.10190422832965851, "step": 892 }, { "batch_size": 4, "epoch": 0.3568, "step": 892, "tokens_per_device": 1468 }, { "epoch": 0.3568, "loss_ce": 0.08474105596542358, "loss_lvr": 0.8711025714874268, "loss_mode_switch": 0.0, "loss_total": 0.17185130715370178, "step": 892 }, { "batch_size": 4, "epoch": 0.3568, "step": 892, "tokens_per_device": 4056 }, { "epoch": 0.3568, "loss_ce": 0.21138016879558563, "loss_lvr": 1.0469070672988892, "loss_mode_switch": 0.0, "loss_total": 0.31607088446617126, "step": 892 }, { "batch_size": 4, "epoch": 0.3568, "step": 892, "tokens_per_device": 1268 }, { "epoch": 0.3568, "loss_ce": 0.26710233092308044, "loss_lvr": 1.1669995784759521, "loss_mode_switch": 0.0, "loss_total": 0.38380229473114014, "step": 892 }, { "batch_size": 4, "epoch": 0.3568, "step": 892, "tokens_per_device": 2824 }, { "epoch": 0.3568, "loss_ce": 0.30542513728141785, "loss_lvr": 0.7795990109443665, "loss_mode_switch": 0.0, "loss_total": 0.38338503241539, "step": 892 }, { "batch_size": 4, "epoch": 0.3568, "step": 892, "tokens_per_device": 4112 }, { "epoch": 0.3568, "loss_ce": 0.024698372930288315, "loss_lvr": 0.7896177768707275, "loss_mode_switch": 0.0, "loss_total": 0.10366015136241913, "step": 892 }, { "batch_size": 4, "epoch": 0.3568, "step": 892, "tokens_per_device": 4276 }, { "epoch": 0.3568, "loss_ce": 0.29011332988739014, "loss_lvr": 0.8244377374649048, "loss_mode_switch": 0.0, "loss_total": 0.3725571036338806, "step": 892 }, { "epoch": 0.3572, "grad_norm": 1.2584882974624634, "learning_rate": 7.445578390881846e-06, "loss": 0.2706, "step": 893 }, { "batch_size": 4, "epoch": 0.3572, "step": 893, "tokens_per_device": 3760 }, { "epoch": 0.3572, "loss_ce": 0.02209155634045601, "loss_lvr": 1.0268303155899048, "loss_mode_switch": 0.0, "loss_total": 0.12477459013462067, "step": 893 }, { "batch_size": 4, "epoch": 0.3572, "step": 893, "tokens_per_device": 1544 }, { "epoch": 0.3572, "loss_ce": 0.19203437864780426, "loss_lvr": 0.9413939118385315, "loss_mode_switch": 0.0, "loss_total": 0.2861737608909607, "step": 893 }, { "batch_size": 4, "epoch": 0.3572, "step": 893, "tokens_per_device": 4096 }, { "epoch": 0.3572, "loss_ce": 0.050880786031484604, "loss_lvr": 0.8878952264785767, "loss_mode_switch": 0.0, "loss_total": 0.13967031240463257, "step": 893 }, { "batch_size": 1, "epoch": 0.3572, "step": 893, "tokens_per_device": 4893 }, { "epoch": 0.3572, "loss_ce": 0.24784936010837555, "loss_lvr": 0.20518971979618073, "loss_mode_switch": 0.0, "loss_total": 0.26836833357810974, "step": 893 }, { "batch_size": 4, "epoch": 0.3572, "step": 893, "tokens_per_device": 2652 }, { "epoch": 0.3572, "loss_ce": 0.10322914272546768, "loss_lvr": 1.1997779607772827, "loss_mode_switch": 0.0, "loss_total": 0.22320693731307983, "step": 893 }, { "batch_size": 4, "epoch": 0.3572, "step": 893, "tokens_per_device": 4472 }, { "epoch": 0.3572, "loss_ce": 0.18514546751976013, "loss_lvr": 1.0138280391693115, "loss_mode_switch": 0.0, "loss_total": 0.2865282893180847, "step": 893 }, { "batch_size": 1, "epoch": 0.3572, "step": 893, "tokens_per_device": 4865 }, { "epoch": 0.3572, "loss_ce": 0.01703181490302086, "loss_lvr": 0.6287813186645508, "loss_mode_switch": 0.0, "loss_total": 0.07990995049476624, "step": 893 }, { "batch_size": 4, "epoch": 0.3572, "step": 893, "tokens_per_device": 4256 }, { "epoch": 0.3572, "loss_ce": 0.4412361681461334, "loss_lvr": 0.8382477760314941, "loss_mode_switch": 0.0, "loss_total": 0.5250609517097473, "step": 893 }, { "epoch": 0.3576, "grad_norm": 2.0643348693847656, "learning_rate": 7.439926532868183e-06, "loss": 0.3205, "step": 894 }, { "batch_size": 4, "epoch": 0.3576, "step": 894, "tokens_per_device": 2424 }, { "epoch": 0.3576, "loss_ce": 0.30211976170539856, "loss_lvr": 1.0646270513534546, "loss_mode_switch": 0.0, "loss_total": 0.408582478761673, "step": 894 }, { "batch_size": 4, "epoch": 0.3576, "step": 894, "tokens_per_device": 4428 }, { "epoch": 0.3576, "loss_ce": 0.27765193581581116, "loss_lvr": 0.8742699027061462, "loss_mode_switch": 0.0, "loss_total": 0.3650789260864258, "step": 894 }, { "batch_size": 1, "epoch": 0.3576, "step": 894, "tokens_per_device": 5054 }, { "epoch": 0.3576, "loss_ce": 0.3802875876426697, "loss_lvr": 0.6928524374961853, "loss_mode_switch": 0.0, "loss_total": 0.4495728313922882, "step": 894 }, { "batch_size": 4, "epoch": 0.3576, "step": 894, "tokens_per_device": 4228 }, { "epoch": 0.3576, "loss_ce": 0.25984200835227966, "loss_lvr": 0.8812747597694397, "loss_mode_switch": 0.0, "loss_total": 0.3479694724082947, "step": 894 }, { "batch_size": 4, "epoch": 0.3576, "step": 894, "tokens_per_device": 3852 }, { "epoch": 0.3576, "loss_ce": 0.56573885679245, "loss_lvr": 0.9520381093025208, "loss_mode_switch": 0.0, "loss_total": 0.6609426736831665, "step": 894 }, { "batch_size": 1, "epoch": 0.3576, "step": 894, "tokens_per_device": 5313 }, { "epoch": 0.3576, "loss_ce": 0.0032263852190226316, "loss_lvr": 0.6966709494590759, "loss_mode_switch": 0.0, "loss_total": 0.07289347797632217, "step": 894 }, { "batch_size": 4, "epoch": 0.3576, "step": 894, "tokens_per_device": 10344 }, { "epoch": 0.3576, "loss_ce": 0.18342381715774536, "loss_lvr": 0.8315384387969971, "loss_mode_switch": 0.0, "loss_total": 0.26657766103744507, "step": 894 }, { "batch_size": 1, "epoch": 0.3576, "step": 894, "tokens_per_device": 4843 }, { "epoch": 0.3576, "loss_ce": 0.04618273675441742, "loss_lvr": 0.4471595883369446, "loss_mode_switch": 0.0, "loss_total": 0.09089869260787964, "step": 894 }, { "epoch": 0.358, "grad_norm": 1.705666422843933, "learning_rate": 7.434270579863549e-06, "loss": 0.313, "step": 895 }, { "batch_size": 1, "epoch": 0.358, "step": 895, "tokens_per_device": 4877 }, { "epoch": 0.358, "loss_ce": 0.6251096129417419, "loss_lvr": 0.582821786403656, "loss_mode_switch": 0.0, "loss_total": 0.683391809463501, "step": 895 }, { "batch_size": 4, "epoch": 0.358, "step": 895, "tokens_per_device": 2500 }, { "epoch": 0.358, "loss_ce": 0.054556090384721756, "loss_lvr": 0.8249308466911316, "loss_mode_switch": 0.0, "loss_total": 0.13704918324947357, "step": 895 }, { "batch_size": 1, "epoch": 0.358, "step": 895, "tokens_per_device": 5117 }, { "epoch": 0.358, "loss_ce": 0.41654062271118164, "loss_lvr": 0.5595298409461975, "loss_mode_switch": 0.0, "loss_total": 0.47249361872673035, "step": 895 }, { "batch_size": 1, "epoch": 0.358, "step": 895, "tokens_per_device": 4763 }, { "epoch": 0.358, "loss_ce": 0.033919963985681534, "loss_lvr": 0.2616446018218994, "loss_mode_switch": 0.0, "loss_total": 0.060084424912929535, "step": 895 }, { "batch_size": 1, "epoch": 0.358, "step": 895, "tokens_per_device": 5108 }, { "epoch": 0.358, "loss_ce": 0.14935487508773804, "loss_lvr": 0.6216201186180115, "loss_mode_switch": 0.0, "loss_total": 0.21151688694953918, "step": 895 }, { "batch_size": 4, "epoch": 0.358, "step": 895, "tokens_per_device": 1988 }, { "epoch": 0.358, "loss_ce": 0.07277242094278336, "loss_lvr": 1.6480215787887573, "loss_mode_switch": 0.0, "loss_total": 0.23757457733154297, "step": 895 }, { "batch_size": 1, "epoch": 0.358, "step": 895, "tokens_per_device": 6279 }, { "epoch": 0.358, "loss_ce": 0.1372041553258896, "loss_lvr": 0.42862895131111145, "loss_mode_switch": 0.0, "loss_total": 0.1800670474767685, "step": 895 }, { "batch_size": 4, "epoch": 0.358, "step": 895, "tokens_per_device": 3856 }, { "epoch": 0.358, "loss_ce": 0.43355128169059753, "loss_lvr": 0.9477742910385132, "loss_mode_switch": 0.0, "loss_total": 0.5283287167549133, "step": 895 }, { "epoch": 0.3584, "grad_norm": 1.5608441829681396, "learning_rate": 7.428610541360475e-06, "loss": 0.3384, "step": 896 }, { "batch_size": 1, "epoch": 0.3584, "step": 896, "tokens_per_device": 5071 }, { "epoch": 0.3584, "loss_ce": 0.1971166431903839, "loss_lvr": 0.48842862248420715, "loss_mode_switch": 0.0, "loss_total": 0.24595950543880463, "step": 896 }, { "batch_size": 1, "epoch": 0.3584, "step": 896, "tokens_per_device": 5104 }, { "epoch": 0.3584, "loss_ce": 0.009392993524670601, "loss_lvr": 0.17960767447948456, "loss_mode_switch": 0.0, "loss_total": 0.027353761717677116, "step": 896 }, { "batch_size": 4, "epoch": 0.3584, "step": 896, "tokens_per_device": 1532 }, { "epoch": 0.3584, "loss_ce": 0.031648434698581696, "loss_lvr": 1.7753677368164062, "loss_mode_switch": 0.0, "loss_total": 0.20918521285057068, "step": 896 }, { "batch_size": 1, "epoch": 0.3584, "step": 896, "tokens_per_device": 5214 }, { "epoch": 0.3584, "loss_ce": 0.12452297657728195, "loss_lvr": 0.29595619440078735, "loss_mode_switch": 0.0, "loss_total": 0.1541185975074768, "step": 896 }, { "batch_size": 1, "epoch": 0.3584, "step": 896, "tokens_per_device": 5691 }, { "epoch": 0.3584, "loss_ce": 0.0008702091872692108, "loss_lvr": 0.46498262882232666, "loss_mode_switch": 0.0, "loss_total": 0.047368474304676056, "step": 896 }, { "batch_size": 4, "epoch": 0.3584, "step": 896, "tokens_per_device": 1892 }, { "epoch": 0.3584, "loss_ce": 0.17353679239749908, "loss_lvr": 1.180660367012024, "loss_mode_switch": 0.0, "loss_total": 0.29160282015800476, "step": 896 }, { "batch_size": 1, "epoch": 0.3584, "step": 896, "tokens_per_device": 4923 }, { "epoch": 0.3584, "loss_ce": 0.06815586984157562, "loss_lvr": 0.1573745310306549, "loss_mode_switch": 0.0, "loss_total": 0.083893321454525, "step": 896 }, { "batch_size": 4, "epoch": 0.3584, "step": 896, "tokens_per_device": 5964 }, { "epoch": 0.3584, "loss_ce": 0.3053695857524872, "loss_lvr": 0.6870858669281006, "loss_mode_switch": 0.0, "loss_total": 0.3740781843662262, "step": 896 }, { "epoch": 0.3588, "grad_norm": 1.3290760517120361, "learning_rate": 7.422946426858346e-06, "loss": 0.3078, "step": 897 }, { "batch_size": 4, "epoch": 0.3588, "step": 897, "tokens_per_device": 4664 }, { "epoch": 0.3588, "loss_ce": 0.27852344512939453, "loss_lvr": 0.762545108795166, "loss_mode_switch": 0.0, "loss_total": 0.3547779619693756, "step": 897 }, { "batch_size": 1, "epoch": 0.3588, "step": 897, "tokens_per_device": 5109 }, { "epoch": 0.3588, "loss_ce": 0.0004578938242048025, "loss_lvr": 0.36661553382873535, "loss_mode_switch": 0.0, "loss_total": 0.03711944818496704, "step": 897 }, { "batch_size": 4, "epoch": 0.3588, "step": 897, "tokens_per_device": 4508 }, { "epoch": 0.3588, "loss_ce": 0.08896984159946442, "loss_lvr": 0.9228784441947937, "loss_mode_switch": 0.0, "loss_total": 0.1812576949596405, "step": 897 }, { "batch_size": 4, "epoch": 0.3588, "step": 897, "tokens_per_device": 3844 }, { "epoch": 0.3588, "loss_ce": 0.24548712372779846, "loss_lvr": 1.088459849357605, "loss_mode_switch": 0.0, "loss_total": 0.3543331027030945, "step": 897 }, { "batch_size": 1, "epoch": 0.3588, "step": 897, "tokens_per_device": 4862 }, { "epoch": 0.3588, "loss_ce": 0.37467333674430847, "loss_lvr": 0.33689865469932556, "loss_mode_switch": 0.0, "loss_total": 0.4083631932735443, "step": 897 }, { "batch_size": 4, "epoch": 0.3588, "step": 897, "tokens_per_device": 4256 }, { "epoch": 0.3588, "loss_ce": 0.11535613983869553, "loss_lvr": 0.830708920955658, "loss_mode_switch": 0.0, "loss_total": 0.19842703640460968, "step": 897 }, { "batch_size": 4, "epoch": 0.3588, "step": 897, "tokens_per_device": 8924 }, { "epoch": 0.3588, "loss_ce": 0.10442367196083069, "loss_lvr": 0.9015669822692871, "loss_mode_switch": 0.0, "loss_total": 0.19458037614822388, "step": 897 }, { "batch_size": 4, "epoch": 0.3588, "step": 897, "tokens_per_device": 2712 }, { "epoch": 0.3588, "loss_ce": 0.40823838114738464, "loss_lvr": 0.9768102169036865, "loss_mode_switch": 0.0, "loss_total": 0.5059193968772888, "step": 897 }, { "epoch": 0.3592, "grad_norm": 1.3723721504211426, "learning_rate": 7.417278245863391e-06, "loss": 0.3392, "step": 898 }, { "batch_size": 4, "epoch": 0.3592, "step": 898, "tokens_per_device": 3848 }, { "epoch": 0.3592, "loss_ce": 0.4222591519355774, "loss_lvr": 0.8814800977706909, "loss_mode_switch": 0.0, "loss_total": 0.5104071497917175, "step": 898 }, { "batch_size": 4, "epoch": 0.3592, "step": 898, "tokens_per_device": 3484 }, { "epoch": 0.3592, "loss_ce": 0.07270681858062744, "loss_lvr": 0.9534366130828857, "loss_mode_switch": 0.0, "loss_total": 0.16805048286914825, "step": 898 }, { "batch_size": 1, "epoch": 0.3592, "step": 898, "tokens_per_device": 4662 }, { "epoch": 0.3592, "loss_ce": 0.16207370162010193, "loss_lvr": 0.5670109987258911, "loss_mode_switch": 0.0, "loss_total": 0.21877479553222656, "step": 898 }, { "batch_size": 4, "epoch": 0.3592, "step": 898, "tokens_per_device": 4260 }, { "epoch": 0.3592, "loss_ce": 0.5226970314979553, "loss_lvr": 1.0730539560317993, "loss_mode_switch": 0.0, "loss_total": 0.6300024390220642, "step": 898 }, { "batch_size": 4, "epoch": 0.3592, "step": 898, "tokens_per_device": 1480 }, { "epoch": 0.3592, "loss_ce": 0.2233080416917801, "loss_lvr": 1.1133915185928345, "loss_mode_switch": 0.0, "loss_total": 0.33464717864990234, "step": 898 }, { "batch_size": 4, "epoch": 0.3592, "step": 898, "tokens_per_device": 3876 }, { "epoch": 0.3592, "loss_ce": 0.6003406643867493, "loss_lvr": 0.9158318042755127, "loss_mode_switch": 0.0, "loss_total": 0.6919238567352295, "step": 898 }, { "batch_size": 1, "epoch": 0.3592, "step": 898, "tokens_per_device": 4900 }, { "epoch": 0.3592, "loss_ce": 0.1431369185447693, "loss_lvr": 0.6542960405349731, "loss_mode_switch": 0.0, "loss_total": 0.20856651663780212, "step": 898 }, { "batch_size": 1, "epoch": 0.3592, "step": 898, "tokens_per_device": 5143 }, { "epoch": 0.3592, "loss_ce": 0.35249119997024536, "loss_lvr": 0.33286213874816895, "loss_mode_switch": 0.0, "loss_total": 0.38577741384506226, "step": 898 }, { "epoch": 0.3596, "grad_norm": 1.4389961957931519, "learning_rate": 7.411606007888665e-06, "loss": 0.3184, "step": 899 }, { "batch_size": 1, "epoch": 0.3596, "step": 899, "tokens_per_device": 5112 }, { "epoch": 0.3596, "loss_ce": 0.12925782799720764, "loss_lvr": 1.0712668895721436, "loss_mode_switch": 0.0, "loss_total": 0.23638451099395752, "step": 899 }, { "batch_size": 4, "epoch": 0.3596, "step": 899, "tokens_per_device": 2608 }, { "epoch": 0.3596, "loss_ce": 0.24053871631622314, "loss_lvr": 1.034157156944275, "loss_mode_switch": 0.0, "loss_total": 0.3439544439315796, "step": 899 }, { "batch_size": 4, "epoch": 0.3596, "step": 899, "tokens_per_device": 5728 }, { "epoch": 0.3596, "loss_ce": 0.09342288225889206, "loss_lvr": 1.0455230474472046, "loss_mode_switch": 0.0, "loss_total": 0.19797518849372864, "step": 899 }, { "batch_size": 4, "epoch": 0.3596, "step": 899, "tokens_per_device": 4456 }, { "epoch": 0.3596, "loss_ce": 0.6834245920181274, "loss_lvr": 0.879527747631073, "loss_mode_switch": 0.0, "loss_total": 0.7713773846626282, "step": 899 }, { "batch_size": 4, "epoch": 0.3596, "step": 899, "tokens_per_device": 8196 }, { "epoch": 0.3596, "loss_ce": 0.5933435559272766, "loss_lvr": 0.8724514842033386, "loss_mode_switch": 0.0, "loss_total": 0.6805887222290039, "step": 899 }, { "batch_size": 4, "epoch": 0.3596, "step": 899, "tokens_per_device": 4180 }, { "epoch": 0.3596, "loss_ce": 0.42958611249923706, "loss_lvr": 0.740264356136322, "loss_mode_switch": 0.0, "loss_total": 0.5036125183105469, "step": 899 }, { "batch_size": 4, "epoch": 0.3596, "step": 899, "tokens_per_device": 5056 }, { "epoch": 0.3596, "loss_ce": 0.3493019640445709, "loss_lvr": 0.8042156100273132, "loss_mode_switch": 0.0, "loss_total": 0.4297235310077667, "step": 899 }, { "batch_size": 4, "epoch": 0.3596, "step": 899, "tokens_per_device": 3752 }, { "epoch": 0.3596, "loss_ce": 0.04387148097157478, "loss_lvr": 1.0123451948165894, "loss_mode_switch": 0.0, "loss_total": 0.1451060026884079, "step": 899 }, { "epoch": 0.36, "grad_norm": 1.422686219215393, "learning_rate": 7.405929722454026e-06, "loss": 0.3481, "step": 900 }, { "batch_size": 4, "epoch": 0.36, "step": 900, "tokens_per_device": 8464 }, { "epoch": 0.36, "loss_ce": 0.5114113688468933, "loss_lvr": 0.8942932486534119, "loss_mode_switch": 0.0, "loss_total": 0.60084068775177, "step": 900 }, { "batch_size": 4, "epoch": 0.36, "step": 900, "tokens_per_device": 3976 }, { "epoch": 0.36, "loss_ce": 0.5227259397506714, "loss_lvr": 0.9667235612869263, "loss_mode_switch": 0.0, "loss_total": 0.619398295879364, "step": 900 }, { "batch_size": 4, "epoch": 0.36, "step": 900, "tokens_per_device": 11364 }, { "epoch": 0.36, "loss_ce": 0.08739388734102249, "loss_lvr": 0.37361061573028564, "loss_mode_switch": 0.0, "loss_total": 0.12475495040416718, "step": 900 }, { "batch_size": 1, "epoch": 0.36, "step": 900, "tokens_per_device": 5094 }, { "epoch": 0.36, "loss_ce": 0.07283879816532135, "loss_lvr": 0.36261749267578125, "loss_mode_switch": 0.0, "loss_total": 0.10910055041313171, "step": 900 }, { "batch_size": 4, "epoch": 0.36, "step": 900, "tokens_per_device": 4640 }, { "epoch": 0.36, "loss_ce": 0.0020156833343207836, "loss_lvr": 0.7804858684539795, "loss_mode_switch": 0.0, "loss_total": 0.08006426692008972, "step": 900 }, { "batch_size": 1, "epoch": 0.36, "step": 900, "tokens_per_device": 4941 }, { "epoch": 0.36, "loss_ce": 0.0860838070511818, "loss_lvr": 0.32304197549819946, "loss_mode_switch": 0.0, "loss_total": 0.11838800460100174, "step": 900 }, { "batch_size": 4, "epoch": 0.36, "step": 900, "tokens_per_device": 3848 }, { "epoch": 0.36, "loss_ce": 0.46193134784698486, "loss_lvr": 0.9414575099945068, "loss_mode_switch": 0.0, "loss_total": 0.5560771226882935, "step": 900 }, { "batch_size": 4, "epoch": 0.36, "step": 900, "tokens_per_device": 3804 }, { "epoch": 0.36, "loss_ce": 0.6020746827125549, "loss_lvr": 1.336158275604248, "loss_mode_switch": 0.0, "loss_total": 0.7356905341148376, "step": 900 }, { "epoch": 0.3604, "grad_norm": 1.388249397277832, "learning_rate": 7.4002493990861314e-06, "loss": 0.3048, "step": 901 }, { "batch_size": 4, "epoch": 0.3604, "step": 901, "tokens_per_device": 1684 }, { "epoch": 0.3604, "loss_ce": 0.3467909097671509, "loss_lvr": 1.0996308326721191, "loss_mode_switch": 0.0, "loss_total": 0.45675399899482727, "step": 901 }, { "batch_size": 4, "epoch": 0.3604, "step": 901, "tokens_per_device": 3640 }, { "epoch": 0.3604, "loss_ce": 0.15696391463279724, "loss_lvr": 1.000272274017334, "loss_mode_switch": 0.0, "loss_total": 0.2569911479949951, "step": 901 }, { "batch_size": 4, "epoch": 0.3604, "step": 901, "tokens_per_device": 6196 }, { "epoch": 0.3604, "loss_ce": 0.062305375933647156, "loss_lvr": 0.9085323214530945, "loss_mode_switch": 0.0, "loss_total": 0.15315860509872437, "step": 901 }, { "batch_size": 4, "epoch": 0.3604, "step": 901, "tokens_per_device": 12544 }, { "epoch": 0.3604, "loss_ce": 0.050390999764204025, "loss_lvr": 0.7410443425178528, "loss_mode_switch": 0.0, "loss_total": 0.12449543178081512, "step": 901 }, { "batch_size": 4, "epoch": 0.3604, "step": 901, "tokens_per_device": 4232 }, { "epoch": 0.3604, "loss_ce": 0.14998483657836914, "loss_lvr": 0.9868984818458557, "loss_mode_switch": 0.0, "loss_total": 0.2486746907234192, "step": 901 }, { "batch_size": 4, "epoch": 0.3604, "step": 901, "tokens_per_device": 3720 }, { "epoch": 0.3604, "loss_ce": 0.2261146605014801, "loss_lvr": 0.8547618389129639, "loss_mode_switch": 0.0, "loss_total": 0.31159085035324097, "step": 901 }, { "batch_size": 4, "epoch": 0.3604, "step": 901, "tokens_per_device": 4536 }, { "epoch": 0.3604, "loss_ce": 0.32280826568603516, "loss_lvr": 0.9949539303779602, "loss_mode_switch": 0.0, "loss_total": 0.4223036766052246, "step": 901 }, { "batch_size": 4, "epoch": 0.3604, "step": 901, "tokens_per_device": 4276 }, { "epoch": 0.3604, "loss_ce": 0.3071781396865845, "loss_lvr": 1.0275267362594604, "loss_mode_switch": 0.0, "loss_total": 0.4099308252334595, "step": 901 }, { "epoch": 0.3608, "grad_norm": 1.438085675239563, "learning_rate": 7.39456504731841e-06, "loss": 0.3625, "step": 902 }, { "batch_size": 1, "epoch": 0.3608, "step": 902, "tokens_per_device": 6842 }, { "epoch": 0.3608, "loss_ce": 0.0024120891466736794, "loss_lvr": 0.5278559923171997, "loss_mode_switch": 0.0, "loss_total": 0.055197689682245255, "step": 902 }, { "batch_size": 4, "epoch": 0.3608, "step": 902, "tokens_per_device": 2684 }, { "epoch": 0.3608, "loss_ce": 0.4062096178531647, "loss_lvr": 1.2431904077529907, "loss_mode_switch": 0.0, "loss_total": 0.5305286645889282, "step": 902 }, { "batch_size": 4, "epoch": 0.3608, "step": 902, "tokens_per_device": 5056 }, { "epoch": 0.3608, "loss_ce": 0.33422279357910156, "loss_lvr": 0.9933168292045593, "loss_mode_switch": 0.0, "loss_total": 0.433554470539093, "step": 902 }, { "batch_size": 1, "epoch": 0.3608, "step": 902, "tokens_per_device": 4901 }, { "epoch": 0.3608, "loss_ce": 0.009501843713223934, "loss_lvr": 0.15236534178256989, "loss_mode_switch": 0.0, "loss_total": 0.024738378822803497, "step": 902 }, { "batch_size": 4, "epoch": 0.3608, "step": 902, "tokens_per_device": 4316 }, { "epoch": 0.3608, "loss_ce": 0.26063814759254456, "loss_lvr": 0.8524261116981506, "loss_mode_switch": 0.0, "loss_total": 0.34588074684143066, "step": 902 }, { "batch_size": 4, "epoch": 0.3608, "step": 902, "tokens_per_device": 4080 }, { "epoch": 0.3608, "loss_ce": 0.5012061595916748, "loss_lvr": 0.8281612992286682, "loss_mode_switch": 0.0, "loss_total": 0.5840222835540771, "step": 902 }, { "batch_size": 4, "epoch": 0.3608, "step": 902, "tokens_per_device": 4248 }, { "epoch": 0.3608, "loss_ce": 0.3951139748096466, "loss_lvr": 0.8658674955368042, "loss_mode_switch": 0.0, "loss_total": 0.48170071840286255, "step": 902 }, { "batch_size": 4, "epoch": 0.3608, "step": 902, "tokens_per_device": 1384 }, { "epoch": 0.3608, "loss_ce": 0.2951807379722595, "loss_lvr": 1.1555253267288208, "loss_mode_switch": 0.0, "loss_total": 0.41073328256607056, "step": 902 }, { "epoch": 0.3612, "grad_norm": 1.1248457431793213, "learning_rate": 7.3888766766910605e-06, "loss": 0.2762, "step": 903 }, { "batch_size": 1, "epoch": 0.3612, "step": 903, "tokens_per_device": 5204 }, { "epoch": 0.3612, "loss_ce": 0.06935340166091919, "loss_lvr": 0.5048553943634033, "loss_mode_switch": 0.0, "loss_total": 0.11983893811702728, "step": 903 }, { "batch_size": 1, "epoch": 0.3612, "step": 903, "tokens_per_device": 5149 }, { "epoch": 0.3612, "loss_ce": 0.09598490595817566, "loss_lvr": 0.31852295994758606, "loss_mode_switch": 0.0, "loss_total": 0.12783721089363098, "step": 903 }, { "batch_size": 4, "epoch": 0.3612, "step": 903, "tokens_per_device": 1416 }, { "epoch": 0.3612, "loss_ce": 0.1521841436624527, "loss_lvr": 1.0335510969161987, "loss_mode_switch": 0.0, "loss_total": 0.2555392384529114, "step": 903 }, { "batch_size": 1, "epoch": 0.3612, "step": 903, "tokens_per_device": 4933 }, { "epoch": 0.3612, "loss_ce": 0.0031045235227793455, "loss_lvr": 0.4588547945022583, "loss_mode_switch": 0.0, "loss_total": 0.048990003764629364, "step": 903 }, { "batch_size": 1, "epoch": 0.3612, "step": 903, "tokens_per_device": 4846 }, { "epoch": 0.3612, "loss_ce": 0.1640348732471466, "loss_lvr": 0.1902473121881485, "loss_mode_switch": 0.0, "loss_total": 0.18305960297584534, "step": 903 }, { "batch_size": 1, "epoch": 0.3612, "step": 903, "tokens_per_device": 4907 }, { "epoch": 0.3612, "loss_ce": 0.028530629351735115, "loss_lvr": 0.3188394606113434, "loss_mode_switch": 0.0, "loss_total": 0.060414575040340424, "step": 903 }, { "batch_size": 4, "epoch": 0.3612, "step": 903, "tokens_per_device": 1608 }, { "epoch": 0.3612, "loss_ce": 0.49053955078125, "loss_lvr": 1.0123506784439087, "loss_mode_switch": 0.0, "loss_total": 0.5917746424674988, "step": 903 }, { "batch_size": 4, "epoch": 0.3612, "step": 903, "tokens_per_device": 1268 }, { "epoch": 0.3612, "loss_ce": 0.13275478780269623, "loss_lvr": 1.6140836477279663, "loss_mode_switch": 0.0, "loss_total": 0.29416316747665405, "step": 903 }, { "epoch": 0.3616, "grad_norm": 1.2874877452850342, "learning_rate": 7.383184296751014e-06, "loss": 0.2735, "step": 904 }, { "batch_size": 1, "epoch": 0.3616, "step": 904, "tokens_per_device": 5154 }, { "epoch": 0.3616, "loss_ce": 0.007626783102750778, "loss_lvr": 0.21037890017032623, "loss_mode_switch": 0.0, "loss_total": 0.02866467274725437, "step": 904 }, { "batch_size": 4, "epoch": 0.3616, "step": 904, "tokens_per_device": 4540 }, { "epoch": 0.3616, "loss_ce": 0.0413203164935112, "loss_lvr": 0.8566995859146118, "loss_mode_switch": 0.0, "loss_total": 0.12699027359485626, "step": 904 }, { "batch_size": 1, "epoch": 0.3616, "step": 904, "tokens_per_device": 4603 }, { "epoch": 0.3616, "loss_ce": 0.1820889115333557, "loss_lvr": 0.8557442426681519, "loss_mode_switch": 0.0, "loss_total": 0.2676633298397064, "step": 904 }, { "batch_size": 4, "epoch": 0.3616, "step": 904, "tokens_per_device": 4204 }, { "epoch": 0.3616, "loss_ce": 0.06827032566070557, "loss_lvr": 1.0345340967178345, "loss_mode_switch": 0.0, "loss_total": 0.17172373831272125, "step": 904 }, { "batch_size": 1, "epoch": 0.3616, "step": 904, "tokens_per_device": 5136 }, { "epoch": 0.3616, "loss_ce": 0.0013948119012638927, "loss_lvr": 0.4470280408859253, "loss_mode_switch": 0.0, "loss_total": 0.04609761759638786, "step": 904 }, { "batch_size": 4, "epoch": 0.3616, "step": 904, "tokens_per_device": 2560 }, { "epoch": 0.3616, "loss_ce": 0.2607196569442749, "loss_lvr": 1.146620273590088, "loss_mode_switch": 0.0, "loss_total": 0.3753816783428192, "step": 904 }, { "batch_size": 4, "epoch": 0.3616, "step": 904, "tokens_per_device": 4736 }, { "epoch": 0.3616, "loss_ce": 0.22357456386089325, "loss_lvr": 0.9180923104286194, "loss_mode_switch": 0.0, "loss_total": 0.31538379192352295, "step": 904 }, { "batch_size": 4, "epoch": 0.3616, "step": 904, "tokens_per_device": 4792 }, { "epoch": 0.3616, "loss_ce": 0.2142314463853836, "loss_lvr": 0.7121464610099792, "loss_mode_switch": 0.0, "loss_total": 0.2854461073875427, "step": 904 }, { "epoch": 0.362, "grad_norm": 1.2270888090133667, "learning_rate": 7.3774879170519386e-06, "loss": 0.3043, "step": 905 }, { "batch_size": 4, "epoch": 0.362, "step": 905, "tokens_per_device": 4648 }, { "epoch": 0.362, "loss_ce": 0.5528526902198792, "loss_lvr": 0.8651197552680969, "loss_mode_switch": 0.0, "loss_total": 0.6393646597862244, "step": 905 }, { "batch_size": 4, "epoch": 0.362, "step": 905, "tokens_per_device": 5988 }, { "epoch": 0.362, "loss_ce": 0.0453108474612236, "loss_lvr": 0.7208226919174194, "loss_mode_switch": 0.0, "loss_total": 0.1173931211233139, "step": 905 }, { "batch_size": 4, "epoch": 0.362, "step": 905, "tokens_per_device": 2788 }, { "epoch": 0.362, "loss_ce": 0.3676736354827881, "loss_lvr": 0.6559483408927917, "loss_mode_switch": 0.0, "loss_total": 0.4332684874534607, "step": 905 }, { "batch_size": 1, "epoch": 0.362, "step": 905, "tokens_per_device": 6245 }, { "epoch": 0.362, "loss_ce": 0.13329516351222992, "loss_lvr": 0.3731606900691986, "loss_mode_switch": 0.0, "loss_total": 0.17061123251914978, "step": 905 }, { "batch_size": 4, "epoch": 0.362, "step": 905, "tokens_per_device": 4304 }, { "epoch": 0.362, "loss_ce": 0.2770915925502777, "loss_lvr": 0.8076220750808716, "loss_mode_switch": 0.0, "loss_total": 0.35785380005836487, "step": 905 }, { "batch_size": 1, "epoch": 0.362, "step": 905, "tokens_per_device": 5143 }, { "epoch": 0.362, "loss_ce": 0.10104035586118698, "loss_lvr": 0.24363155663013458, "loss_mode_switch": 0.0, "loss_total": 0.1254035085439682, "step": 905 }, { "batch_size": 1, "epoch": 0.362, "step": 905, "tokens_per_device": 5263 }, { "epoch": 0.362, "loss_ce": 0.2141427844762802, "loss_lvr": 0.4651482403278351, "loss_mode_switch": 0.0, "loss_total": 0.2606576085090637, "step": 905 }, { "batch_size": 1, "epoch": 0.362, "step": 905, "tokens_per_device": 4685 }, { "epoch": 0.362, "loss_ce": 0.008526208810508251, "loss_lvr": 0.251350462436676, "loss_mode_switch": 0.0, "loss_total": 0.03366125375032425, "step": 905 }, { "epoch": 0.3624, "grad_norm": 1.7743477821350098, "learning_rate": 7.371787547154215e-06, "loss": 0.2912, "step": 906 }, { "batch_size": 4, "epoch": 0.3624, "step": 906, "tokens_per_device": 6068 }, { "epoch": 0.3624, "loss_ce": 0.4682116210460663, "loss_lvr": 0.8427128791809082, "loss_mode_switch": 0.0, "loss_total": 0.5524829030036926, "step": 906 }, { "batch_size": 1, "epoch": 0.3624, "step": 906, "tokens_per_device": 4883 }, { "epoch": 0.3624, "loss_ce": 0.102222740650177, "loss_lvr": 0.29256317019462585, "loss_mode_switch": 0.0, "loss_total": 0.13147905468940735, "step": 906 }, { "batch_size": 1, "epoch": 0.3624, "step": 906, "tokens_per_device": 4872 }, { "epoch": 0.3624, "loss_ce": 0.012560454197227955, "loss_lvr": 0.4861783981323242, "loss_mode_switch": 0.0, "loss_total": 0.0611782930791378, "step": 906 }, { "batch_size": 4, "epoch": 0.3624, "step": 906, "tokens_per_device": 3860 }, { "epoch": 0.3624, "loss_ce": 0.05057663097977638, "loss_lvr": 0.8637526631355286, "loss_mode_switch": 0.0, "loss_total": 0.13695189356803894, "step": 906 }, { "batch_size": 4, "epoch": 0.3624, "step": 906, "tokens_per_device": 1284 }, { "epoch": 0.3624, "loss_ce": 0.45164409279823303, "loss_lvr": 1.0235319137573242, "loss_mode_switch": 0.0, "loss_total": 0.553997278213501, "step": 906 }, { "batch_size": 4, "epoch": 0.3624, "step": 906, "tokens_per_device": 3848 }, { "epoch": 0.3624, "loss_ce": 0.29183974862098694, "loss_lvr": 0.9744881987571716, "loss_mode_switch": 0.0, "loss_total": 0.3892885744571686, "step": 906 }, { "batch_size": 4, "epoch": 0.3624, "step": 906, "tokens_per_device": 2736 }, { "epoch": 0.3624, "loss_ce": 0.33138081431388855, "loss_lvr": 0.6108859777450562, "loss_mode_switch": 0.0, "loss_total": 0.3924694061279297, "step": 906 }, { "batch_size": 4, "epoch": 0.3624, "step": 906, "tokens_per_device": 4424 }, { "epoch": 0.3624, "loss_ce": 0.12073148041963577, "loss_lvr": 0.7720757126808167, "loss_mode_switch": 0.0, "loss_total": 0.19793905317783356, "step": 906 }, { "epoch": 0.3628, "grad_norm": 1.370534062385559, "learning_rate": 7.36608319662492e-06, "loss": 0.3051, "step": 907 }, { "batch_size": 4, "epoch": 0.3628, "step": 907, "tokens_per_device": 1420 }, { "epoch": 0.3628, "loss_ce": 0.7707328200340271, "loss_lvr": 1.0283254384994507, "loss_mode_switch": 0.0, "loss_total": 0.8735653758049011, "step": 907 }, { "batch_size": 4, "epoch": 0.3628, "step": 907, "tokens_per_device": 10372 }, { "epoch": 0.3628, "loss_ce": 0.2363656461238861, "loss_lvr": 0.6353742480278015, "loss_mode_switch": 0.0, "loss_total": 0.2999030649662018, "step": 907 }, { "batch_size": 1, "epoch": 0.3628, "step": 907, "tokens_per_device": 4909 }, { "epoch": 0.3628, "loss_ce": 0.052066393196582794, "loss_lvr": 0.5924779176712036, "loss_mode_switch": 0.0, "loss_total": 0.11131418496370316, "step": 907 }, { "batch_size": 1, "epoch": 0.3628, "step": 907, "tokens_per_device": 4873 }, { "epoch": 0.3628, "loss_ce": 0.02362658455967903, "loss_lvr": 0.27670541405677795, "loss_mode_switch": 0.0, "loss_total": 0.051297128200531006, "step": 907 }, { "batch_size": 4, "epoch": 0.3628, "step": 907, "tokens_per_device": 1392 }, { "epoch": 0.3628, "loss_ce": 0.48499178886413574, "loss_lvr": 1.140582799911499, "loss_mode_switch": 0.0, "loss_total": 0.5990500450134277, "step": 907 }, { "batch_size": 1, "epoch": 0.3628, "step": 907, "tokens_per_device": 4895 }, { "epoch": 0.3628, "loss_ce": 0.7558079957962036, "loss_lvr": 0.559829831123352, "loss_mode_switch": 0.0, "loss_total": 0.8117910027503967, "step": 907 }, { "batch_size": 4, "epoch": 0.3628, "step": 907, "tokens_per_device": 3424 }, { "epoch": 0.3628, "loss_ce": 0.13483595848083496, "loss_lvr": 1.192660927772522, "loss_mode_switch": 0.0, "loss_total": 0.25410205125808716, "step": 907 }, { "batch_size": 4, "epoch": 0.3628, "step": 907, "tokens_per_device": 1596 }, { "epoch": 0.3628, "loss_ce": 0.36299437284469604, "loss_lvr": 0.8593063354492188, "loss_mode_switch": 0.0, "loss_total": 0.4489250183105469, "step": 907 }, { "epoch": 0.3632, "grad_norm": 1.4634581804275513, "learning_rate": 7.36037487503781e-06, "loss": 0.3458, "step": 908 }, { "batch_size": 4, "epoch": 0.3632, "step": 908, "tokens_per_device": 1688 }, { "epoch": 0.3632, "loss_ce": 0.41868481040000916, "loss_lvr": 0.9943044781684875, "loss_mode_switch": 0.0, "loss_total": 0.5181152820587158, "step": 908 }, { "batch_size": 1, "epoch": 0.3632, "step": 908, "tokens_per_device": 4856 }, { "epoch": 0.3632, "loss_ce": 0.05071878433227539, "loss_lvr": 0.4836587607860565, "loss_mode_switch": 0.0, "loss_total": 0.09908466041088104, "step": 908 }, { "batch_size": 4, "epoch": 0.3632, "step": 908, "tokens_per_device": 4320 }, { "epoch": 0.3632, "loss_ce": 0.287977933883667, "loss_lvr": 0.9820688962936401, "loss_mode_switch": 0.0, "loss_total": 0.38618481159210205, "step": 908 }, { "batch_size": 1, "epoch": 0.3632, "step": 908, "tokens_per_device": 7127 }, { "epoch": 0.3632, "loss_ce": 0.7623427510261536, "loss_lvr": 0.7069585919380188, "loss_mode_switch": 0.0, "loss_total": 0.8330386281013489, "step": 908 }, { "batch_size": 1, "epoch": 0.3632, "step": 908, "tokens_per_device": 5107 }, { "epoch": 0.3632, "loss_ce": 0.16487807035446167, "loss_lvr": 0.41778096556663513, "loss_mode_switch": 0.0, "loss_total": 0.20665617287158966, "step": 908 }, { "batch_size": 4, "epoch": 0.3632, "step": 908, "tokens_per_device": 4548 }, { "epoch": 0.3632, "loss_ce": 0.06653343141078949, "loss_lvr": 0.9917561411857605, "loss_mode_switch": 0.0, "loss_total": 0.16570904850959778, "step": 908 }, { "batch_size": 4, "epoch": 0.3632, "step": 908, "tokens_per_device": 5432 }, { "epoch": 0.3632, "loss_ce": 0.024191221222281456, "loss_lvr": 0.7648674845695496, "loss_mode_switch": 0.0, "loss_total": 0.1006779745221138, "step": 908 }, { "batch_size": 4, "epoch": 0.3632, "step": 908, "tokens_per_device": 1496 }, { "epoch": 0.3632, "loss_ce": 0.447081059217453, "loss_lvr": 1.1002418994903564, "loss_mode_switch": 0.0, "loss_total": 0.5571052432060242, "step": 908 }, { "epoch": 0.3636, "grad_norm": 1.362431287765503, "learning_rate": 7.3546625919733065e-06, "loss": 0.2985, "step": 909 }, { "batch_size": 4, "epoch": 0.3636, "step": 909, "tokens_per_device": 2652 }, { "epoch": 0.3636, "loss_ce": 0.6444569826126099, "loss_lvr": 0.8449389338493347, "loss_mode_switch": 0.0, "loss_total": 0.7289508581161499, "step": 909 }, { "batch_size": 4, "epoch": 0.3636, "step": 909, "tokens_per_device": 5700 }, { "epoch": 0.3636, "loss_ce": 0.342620849609375, "loss_lvr": 0.8113020658493042, "loss_mode_switch": 0.0, "loss_total": 0.4237510561943054, "step": 909 }, { "batch_size": 4, "epoch": 0.3636, "step": 909, "tokens_per_device": 2996 }, { "epoch": 0.3636, "loss_ce": 0.48281732201576233, "loss_lvr": 0.6833756566047668, "loss_mode_switch": 0.0, "loss_total": 0.5511549115180969, "step": 909 }, { "batch_size": 4, "epoch": 0.3636, "step": 909, "tokens_per_device": 4456 }, { "epoch": 0.3636, "loss_ce": 0.3637399673461914, "loss_lvr": 1.254741907119751, "loss_mode_switch": 0.0, "loss_total": 0.4892141819000244, "step": 909 }, { "batch_size": 4, "epoch": 0.3636, "step": 909, "tokens_per_device": 6096 }, { "epoch": 0.3636, "loss_ce": 0.14341603219509125, "loss_lvr": 0.41765329241752625, "loss_mode_switch": 0.0, "loss_total": 0.1851813644170761, "step": 909 }, { "batch_size": 4, "epoch": 0.3636, "step": 909, "tokens_per_device": 3272 }, { "epoch": 0.3636, "loss_ce": 0.0786837711930275, "loss_lvr": 0.9079881906509399, "loss_mode_switch": 0.0, "loss_total": 0.16948258876800537, "step": 909 }, { "batch_size": 4, "epoch": 0.3636, "step": 909, "tokens_per_device": 5804 }, { "epoch": 0.3636, "loss_ce": 0.012692713178694248, "loss_lvr": 0.9264768362045288, "loss_mode_switch": 0.0, "loss_total": 0.1053403988480568, "step": 909 }, { "batch_size": 1, "epoch": 0.3636, "step": 909, "tokens_per_device": 4874 }, { "epoch": 0.3636, "loss_ce": 0.14430899918079376, "loss_lvr": 2.3468074798583984, "loss_mode_switch": 0.0, "loss_total": 0.3789897561073303, "step": 909 }, { "epoch": 0.364, "grad_norm": 1.3494421243667603, "learning_rate": 7.348946357018479e-06, "loss": 0.2809, "step": 910 }, { "batch_size": 1, "epoch": 0.364, "step": 910, "tokens_per_device": 5032 }, { "epoch": 0.364, "loss_ce": 0.018510110676288605, "loss_lvr": 0.43650633096694946, "loss_mode_switch": 0.0, "loss_total": 0.06216074526309967, "step": 910 }, { "batch_size": 4, "epoch": 0.364, "step": 910, "tokens_per_device": 4468 }, { "epoch": 0.364, "loss_ce": 0.09937077760696411, "loss_lvr": 0.8373881578445435, "loss_mode_switch": 0.0, "loss_total": 0.1831095963716507, "step": 910 }, { "batch_size": 1, "epoch": 0.364, "step": 910, "tokens_per_device": 5121 }, { "epoch": 0.364, "loss_ce": 0.0003242973471060395, "loss_lvr": 0.7048603892326355, "loss_mode_switch": 0.0, "loss_total": 0.07081033289432526, "step": 910 }, { "batch_size": 1, "epoch": 0.364, "step": 910, "tokens_per_device": 5064 }, { "epoch": 0.364, "loss_ce": 0.0017447532154619694, "loss_lvr": 0.2821805477142334, "loss_mode_switch": 0.0, "loss_total": 0.029962807893753052, "step": 910 }, { "batch_size": 1, "epoch": 0.364, "step": 910, "tokens_per_device": 6236 }, { "epoch": 0.364, "loss_ce": 0.013224871829152107, "loss_lvr": 0.33207425475120544, "loss_mode_switch": 0.0, "loss_total": 0.04643230140209198, "step": 910 }, { "batch_size": 4, "epoch": 0.364, "step": 910, "tokens_per_device": 1364 }, { "epoch": 0.364, "loss_ce": 0.6014180183410645, "loss_lvr": 1.324170470237732, "loss_mode_switch": 0.0, "loss_total": 0.7338351011276245, "step": 910 }, { "batch_size": 4, "epoch": 0.364, "step": 910, "tokens_per_device": 6692 }, { "epoch": 0.364, "loss_ce": 0.08668932318687439, "loss_lvr": 0.6779054999351501, "loss_mode_switch": 0.0, "loss_total": 0.15447987616062164, "step": 910 }, { "batch_size": 4, "epoch": 0.364, "step": 910, "tokens_per_device": 7060 }, { "epoch": 0.364, "loss_ce": 0.25948870182037354, "loss_lvr": 0.41790297627449036, "loss_mode_switch": 0.0, "loss_total": 0.3012790083885193, "step": 910 }, { "epoch": 0.3644, "grad_norm": 1.33576238155365, "learning_rate": 7.343226179767034e-06, "loss": 0.3277, "step": 911 }, { "batch_size": 4, "epoch": 0.3644, "step": 911, "tokens_per_device": 8508 }, { "epoch": 0.3644, "loss_ce": 0.8205080628395081, "loss_lvr": 0.7105571627616882, "loss_mode_switch": 0.0, "loss_total": 0.8915637731552124, "step": 911 }, { "batch_size": 4, "epoch": 0.3644, "step": 911, "tokens_per_device": 4128 }, { "epoch": 0.3644, "loss_ce": 0.4536009728908539, "loss_lvr": 0.8586107492446899, "loss_mode_switch": 0.0, "loss_total": 0.5394620299339294, "step": 911 }, { "batch_size": 4, "epoch": 0.3644, "step": 911, "tokens_per_device": 6596 }, { "epoch": 0.3644, "loss_ce": 0.16237570345401764, "loss_lvr": 0.7584013342857361, "loss_mode_switch": 0.0, "loss_total": 0.238215833902359, "step": 911 }, { "batch_size": 4, "epoch": 0.3644, "step": 911, "tokens_per_device": 4264 }, { "epoch": 0.3644, "loss_ce": 0.9989905953407288, "loss_lvr": 0.8938250541687012, "loss_mode_switch": 0.0, "loss_total": 1.088373064994812, "step": 911 }, { "batch_size": 4, "epoch": 0.3644, "step": 911, "tokens_per_device": 5016 }, { "epoch": 0.3644, "loss_ce": 0.25227490067481995, "loss_lvr": 0.7771411538124084, "loss_mode_switch": 0.0, "loss_total": 0.3299890160560608, "step": 911 }, { "batch_size": 4, "epoch": 0.3644, "step": 911, "tokens_per_device": 10832 }, { "epoch": 0.3644, "loss_ce": 0.1694694459438324, "loss_lvr": 0.6735297441482544, "loss_mode_switch": 0.0, "loss_total": 0.23682242631912231, "step": 911 }, { "batch_size": 4, "epoch": 0.3644, "step": 911, "tokens_per_device": 1284 }, { "epoch": 0.3644, "loss_ce": 0.4742855429649353, "loss_lvr": 1.0552374124526978, "loss_mode_switch": 0.0, "loss_total": 0.579809308052063, "step": 911 }, { "batch_size": 4, "epoch": 0.3644, "step": 911, "tokens_per_device": 1664 }, { "epoch": 0.3644, "loss_ce": 0.17167919874191284, "loss_lvr": 1.2177342176437378, "loss_mode_switch": 0.0, "loss_total": 0.2934526205062866, "step": 911 }, { "epoch": 0.3648, "grad_norm": 1.3116872310638428, "learning_rate": 7.337502069819285e-06, "loss": 0.3486, "step": 912 }, { "batch_size": 4, "epoch": 0.3648, "step": 912, "tokens_per_device": 12504 }, { "epoch": 0.3648, "loss_ce": 0.06464768946170807, "loss_lvr": 0.4619539976119995, "loss_mode_switch": 0.0, "loss_total": 0.11084309220314026, "step": 912 }, { "batch_size": 4, "epoch": 0.3648, "step": 912, "tokens_per_device": 2540 }, { "epoch": 0.3648, "loss_ce": 0.19692084193229675, "loss_lvr": 1.4479115009307861, "loss_mode_switch": 0.0, "loss_total": 0.34171199798583984, "step": 912 }, { "batch_size": 4, "epoch": 0.3648, "step": 912, "tokens_per_device": 8412 }, { "epoch": 0.3648, "loss_ce": 0.5039547681808472, "loss_lvr": 0.9294453263282776, "loss_mode_switch": 0.0, "loss_total": 0.5968992710113525, "step": 912 }, { "batch_size": 1, "epoch": 0.3648, "step": 912, "tokens_per_device": 4877 }, { "epoch": 0.3648, "loss_ce": 0.0019550577271729708, "loss_lvr": 0.3116498589515686, "loss_mode_switch": 0.0, "loss_total": 0.0331200435757637, "step": 912 }, { "batch_size": 4, "epoch": 0.3648, "step": 912, "tokens_per_device": 5768 }, { "epoch": 0.3648, "loss_ce": 0.2518680691719055, "loss_lvr": 0.9616368412971497, "loss_mode_switch": 0.0, "loss_total": 0.34803175926208496, "step": 912 }, { "batch_size": 4, "epoch": 0.3648, "step": 912, "tokens_per_device": 4416 }, { "epoch": 0.3648, "loss_ce": 0.016932107508182526, "loss_lvr": 0.8090578317642212, "loss_mode_switch": 0.0, "loss_total": 0.097837895154953, "step": 912 }, { "batch_size": 4, "epoch": 0.3648, "step": 912, "tokens_per_device": 3664 }, { "epoch": 0.3648, "loss_ce": 0.20095686614513397, "loss_lvr": 0.9176433086395264, "loss_mode_switch": 0.0, "loss_total": 0.2927212119102478, "step": 912 }, { "batch_size": 4, "epoch": 0.3648, "step": 912, "tokens_per_device": 2640 }, { "epoch": 0.3648, "loss_ce": 0.3025674819946289, "loss_lvr": 0.9395039677619934, "loss_mode_switch": 0.0, "loss_total": 0.39651787281036377, "step": 912 }, { "epoch": 0.3652, "grad_norm": 1.3495464324951172, "learning_rate": 7.331774036782158e-06, "loss": 0.3165, "step": 913 }, { "batch_size": 1, "epoch": 0.3652, "step": 913, "tokens_per_device": 5167 }, { "epoch": 0.3652, "loss_ce": 0.05899002030491829, "loss_lvr": 0.7099823951721191, "loss_mode_switch": 0.0, "loss_total": 0.12998826801776886, "step": 913 }, { "batch_size": 4, "epoch": 0.3652, "step": 913, "tokens_per_device": 1448 }, { "epoch": 0.3652, "loss_ce": 0.31868207454681396, "loss_lvr": 1.661011815071106, "loss_mode_switch": 0.0, "loss_total": 0.48478326201438904, "step": 913 }, { "batch_size": 4, "epoch": 0.3652, "step": 913, "tokens_per_device": 5448 }, { "epoch": 0.3652, "loss_ce": 0.09103164076805115, "loss_lvr": 0.6162713766098022, "loss_mode_switch": 0.0, "loss_total": 0.15265877544879913, "step": 913 }, { "batch_size": 4, "epoch": 0.3652, "step": 913, "tokens_per_device": 2572 }, { "epoch": 0.3652, "loss_ce": 0.09986095130443573, "loss_lvr": 1.2166922092437744, "loss_mode_switch": 0.0, "loss_total": 0.22153016924858093, "step": 913 }, { "batch_size": 4, "epoch": 0.3652, "step": 913, "tokens_per_device": 1536 }, { "epoch": 0.3652, "loss_ce": 0.24838414788246155, "loss_lvr": 0.8959924578666687, "loss_mode_switch": 0.0, "loss_total": 0.3379833996295929, "step": 913 }, { "batch_size": 1, "epoch": 0.3652, "step": 913, "tokens_per_device": 5171 }, { "epoch": 0.3652, "loss_ce": 0.0019258292159065604, "loss_lvr": 0.8293330073356628, "loss_mode_switch": 0.0, "loss_total": 0.08485912531614304, "step": 913 }, { "batch_size": 4, "epoch": 0.3652, "step": 913, "tokens_per_device": 6080 }, { "epoch": 0.3652, "loss_ce": 0.17136341333389282, "loss_lvr": 0.6849348545074463, "loss_mode_switch": 0.0, "loss_total": 0.23985689878463745, "step": 913 }, { "batch_size": 1, "epoch": 0.3652, "step": 913, "tokens_per_device": 4955 }, { "epoch": 0.3652, "loss_ce": 0.03211439400911331, "loss_lvr": 0.38357535004615784, "loss_mode_switch": 0.0, "loss_total": 0.07047192752361298, "step": 913 }, { "epoch": 0.3656, "grad_norm": 1.2479077577590942, "learning_rate": 7.326042090269152e-06, "loss": 0.248, "step": 914 }, { "batch_size": 4, "epoch": 0.3656, "step": 914, "tokens_per_device": 4160 }, { "epoch": 0.3656, "loss_ce": 0.13519904017448425, "loss_lvr": 0.7447006702423096, "loss_mode_switch": 0.0, "loss_total": 0.2096691131591797, "step": 914 }, { "batch_size": 1, "epoch": 0.3656, "step": 914, "tokens_per_device": 5145 }, { "epoch": 0.3656, "loss_ce": 0.004669912159442902, "loss_lvr": 0.4838860332965851, "loss_mode_switch": 0.0, "loss_total": 0.05305851623415947, "step": 914 }, { "batch_size": 4, "epoch": 0.3656, "step": 914, "tokens_per_device": 9756 }, { "epoch": 0.3656, "loss_ce": 0.08252009749412537, "loss_lvr": 0.6155003905296326, "loss_mode_switch": 0.0, "loss_total": 0.14407013356685638, "step": 914 }, { "batch_size": 1, "epoch": 0.3656, "step": 914, "tokens_per_device": 4915 }, { "epoch": 0.3656, "loss_ce": 0.1490151733160019, "loss_lvr": 0.7677041292190552, "loss_mode_switch": 0.0, "loss_total": 0.22578558325767517, "step": 914 }, { "batch_size": 4, "epoch": 0.3656, "step": 914, "tokens_per_device": 5020 }, { "epoch": 0.3656, "loss_ce": 0.10652801394462585, "loss_lvr": 0.7857837677001953, "loss_mode_switch": 0.0, "loss_total": 0.18510639667510986, "step": 914 }, { "batch_size": 4, "epoch": 0.3656, "step": 914, "tokens_per_device": 4280 }, { "epoch": 0.3656, "loss_ce": 0.1454293578863144, "loss_lvr": 0.822947084903717, "loss_mode_switch": 0.0, "loss_total": 0.2277240753173828, "step": 914 }, { "batch_size": 4, "epoch": 0.3656, "step": 914, "tokens_per_device": 5936 }, { "epoch": 0.3656, "loss_ce": 0.3777922987937927, "loss_lvr": 0.6442352533340454, "loss_mode_switch": 0.0, "loss_total": 0.44221583008766174, "step": 914 }, { "batch_size": 4, "epoch": 0.3656, "step": 914, "tokens_per_device": 1312 }, { "epoch": 0.3656, "loss_ce": 0.32133224606513977, "loss_lvr": 1.1415071487426758, "loss_mode_switch": 0.0, "loss_total": 0.4354829788208008, "step": 914 }, { "epoch": 0.366, "grad_norm": 1.2628817558288574, "learning_rate": 7.320306239900343e-06, "loss": 0.3144, "step": 915 }, { "batch_size": 4, "epoch": 0.366, "step": 915, "tokens_per_device": 4204 }, { "epoch": 0.366, "loss_ce": 0.26365038752555847, "loss_lvr": 1.2222517728805542, "loss_mode_switch": 0.0, "loss_total": 0.3858755826950073, "step": 915 }, { "batch_size": 4, "epoch": 0.366, "step": 915, "tokens_per_device": 4972 }, { "epoch": 0.366, "loss_ce": 0.6023176312446594, "loss_lvr": 0.8551590442657471, "loss_mode_switch": 0.0, "loss_total": 0.6878335475921631, "step": 915 }, { "batch_size": 1, "epoch": 0.366, "step": 915, "tokens_per_device": 4899 }, { "epoch": 0.366, "loss_ce": 0.03739365190267563, "loss_lvr": 0.433807373046875, "loss_mode_switch": 0.0, "loss_total": 0.08077438920736313, "step": 915 }, { "batch_size": 1, "epoch": 0.366, "step": 915, "tokens_per_device": 4762 }, { "epoch": 0.366, "loss_ce": 0.07765327394008636, "loss_lvr": 0.3926325738430023, "loss_mode_switch": 0.0, "loss_total": 0.11691653728485107, "step": 915 }, { "batch_size": 4, "epoch": 0.366, "step": 915, "tokens_per_device": 2728 }, { "epoch": 0.366, "loss_ce": 0.0033770930022001266, "loss_lvr": 0.3423275947570801, "loss_mode_switch": 0.0, "loss_total": 0.037609852850437164, "step": 915 }, { "batch_size": 4, "epoch": 0.366, "step": 915, "tokens_per_device": 1456 }, { "epoch": 0.366, "loss_ce": 0.11510226130485535, "loss_lvr": 0.9114064574241638, "loss_mode_switch": 0.0, "loss_total": 0.20624291896820068, "step": 915 }, { "batch_size": 1, "epoch": 0.366, "step": 915, "tokens_per_device": 5899 }, { "epoch": 0.366, "loss_ce": 0.28861770033836365, "loss_lvr": 0.47194379568099976, "loss_mode_switch": 0.0, "loss_total": 0.3358120918273926, "step": 915 }, { "batch_size": 1, "epoch": 0.366, "step": 915, "tokens_per_device": 5114 }, { "epoch": 0.366, "loss_ce": 0.00105394353158772, "loss_lvr": 0.32259175181388855, "loss_mode_switch": 0.0, "loss_total": 0.033313121646642685, "step": 915 }, { "epoch": 0.3664, "grad_norm": 1.281174659729004, "learning_rate": 7.314566495302353e-06, "loss": 0.2728, "step": 916 }, { "batch_size": 1, "epoch": 0.3664, "step": 916, "tokens_per_device": 4887 }, { "epoch": 0.3664, "loss_ce": 0.01568762958049774, "loss_lvr": 0.7244777679443359, "loss_mode_switch": 0.0, "loss_total": 0.08813540637493134, "step": 916 }, { "batch_size": 1, "epoch": 0.3664, "step": 916, "tokens_per_device": 5188 }, { "epoch": 0.3664, "loss_ce": 0.11654914915561676, "loss_lvr": 0.30844786763191223, "loss_mode_switch": 0.0, "loss_total": 0.14739394187927246, "step": 916 }, { "batch_size": 4, "epoch": 0.3664, "step": 916, "tokens_per_device": 4252 }, { "epoch": 0.3664, "loss_ce": 0.07589350640773773, "loss_lvr": 1.232918381690979, "loss_mode_switch": 0.0, "loss_total": 0.1991853415966034, "step": 916 }, { "batch_size": 4, "epoch": 0.3664, "step": 916, "tokens_per_device": 4340 }, { "epoch": 0.3664, "loss_ce": 0.4171111285686493, "loss_lvr": 0.9754908084869385, "loss_mode_switch": 0.0, "loss_total": 0.5146602392196655, "step": 916 }, { "batch_size": 1, "epoch": 0.3664, "step": 916, "tokens_per_device": 4919 }, { "epoch": 0.3664, "loss_ce": 0.02537289820611477, "loss_lvr": 0.23761935532093048, "loss_mode_switch": 0.0, "loss_total": 0.049134835600852966, "step": 916 }, { "batch_size": 1, "epoch": 0.3664, "step": 916, "tokens_per_device": 5192 }, { "epoch": 0.3664, "loss_ce": 0.1372903436422348, "loss_lvr": 0.6559742093086243, "loss_mode_switch": 0.0, "loss_total": 0.20288777351379395, "step": 916 }, { "batch_size": 4, "epoch": 0.3664, "step": 916, "tokens_per_device": 2576 }, { "epoch": 0.3664, "loss_ce": 0.3727841079235077, "loss_lvr": 1.1208314895629883, "loss_mode_switch": 0.0, "loss_total": 0.48486727476119995, "step": 916 }, { "batch_size": 1, "epoch": 0.3664, "step": 916, "tokens_per_device": 4753 }, { "epoch": 0.3664, "loss_ce": 0.06360848993062973, "loss_lvr": 0.7821250557899475, "loss_mode_switch": 0.0, "loss_total": 0.1418209969997406, "step": 916 }, { "epoch": 0.3668, "grad_norm": 1.3869431018829346, "learning_rate": 7.308822866108343e-06, "loss": 0.2878, "step": 917 }, { "batch_size": 1, "epoch": 0.3668, "step": 917, "tokens_per_device": 5014 }, { "epoch": 0.3668, "loss_ce": 0.011880919337272644, "loss_lvr": 0.6818453669548035, "loss_mode_switch": 0.0, "loss_total": 0.08006545901298523, "step": 917 }, { "batch_size": 4, "epoch": 0.3668, "step": 917, "tokens_per_device": 1232 }, { "epoch": 0.3668, "loss_ce": 0.2225201427936554, "loss_lvr": 1.4870902299880981, "loss_mode_switch": 0.0, "loss_total": 0.3712291717529297, "step": 917 }, { "batch_size": 4, "epoch": 0.3668, "step": 917, "tokens_per_device": 3900 }, { "epoch": 0.3668, "loss_ce": 0.2110603153705597, "loss_lvr": 0.8959691524505615, "loss_mode_switch": 0.0, "loss_total": 0.3006572425365448, "step": 917 }, { "batch_size": 1, "epoch": 0.3668, "step": 917, "tokens_per_device": 5142 }, { "epoch": 0.3668, "loss_ce": 0.009554757736623287, "loss_lvr": 0.5135683417320251, "loss_mode_switch": 0.0, "loss_total": 0.06091159209609032, "step": 917 }, { "batch_size": 4, "epoch": 0.3668, "step": 917, "tokens_per_device": 3060 }, { "epoch": 0.3668, "loss_ce": 0.5158357620239258, "loss_lvr": 0.6923476457595825, "loss_mode_switch": 0.0, "loss_total": 0.5850705504417419, "step": 917 }, { "batch_size": 4, "epoch": 0.3668, "step": 917, "tokens_per_device": 5756 }, { "epoch": 0.3668, "loss_ce": 0.42952439188957214, "loss_lvr": 0.6980599761009216, "loss_mode_switch": 0.0, "loss_total": 0.49933040142059326, "step": 917 }, { "batch_size": 1, "epoch": 0.3668, "step": 917, "tokens_per_device": 4905 }, { "epoch": 0.3668, "loss_ce": 0.00017924878920894116, "loss_lvr": 0.30969923734664917, "loss_mode_switch": 0.0, "loss_total": 0.03114917129278183, "step": 917 }, { "batch_size": 4, "epoch": 0.3668, "step": 917, "tokens_per_device": 3796 }, { "epoch": 0.3668, "loss_ce": 0.08098003268241882, "loss_lvr": 0.9269617795944214, "loss_mode_switch": 0.0, "loss_total": 0.17367622256278992, "step": 917 }, { "epoch": 0.3672, "grad_norm": 1.4590970277786255, "learning_rate": 7.303075361957992e-06, "loss": 0.3594, "step": 918 }, { "batch_size": 1, "epoch": 0.3672, "step": 918, "tokens_per_device": 4879 }, { "epoch": 0.3672, "loss_ce": 0.018787311390042305, "loss_lvr": 1.5545647144317627, "loss_mode_switch": 0.0, "loss_total": 0.1742437779903412, "step": 918 }, { "batch_size": 4, "epoch": 0.3672, "step": 918, "tokens_per_device": 6176 }, { "epoch": 0.3672, "loss_ce": 0.040358129888772964, "loss_lvr": 0.8431934714317322, "loss_mode_switch": 0.0, "loss_total": 0.12467747926712036, "step": 918 }, { "batch_size": 4, "epoch": 0.3672, "step": 918, "tokens_per_device": 3468 }, { "epoch": 0.3672, "loss_ce": 0.28479039669036865, "loss_lvr": 1.0169856548309326, "loss_mode_switch": 0.0, "loss_total": 0.38648897409439087, "step": 918 }, { "batch_size": 1, "epoch": 0.3672, "step": 918, "tokens_per_device": 4855 }, { "epoch": 0.3672, "loss_ce": 0.0008457156945951283, "loss_lvr": 0.3708341717720032, "loss_mode_switch": 0.0, "loss_total": 0.03792913258075714, "step": 918 }, { "batch_size": 4, "epoch": 0.3672, "step": 918, "tokens_per_device": 1204 }, { "epoch": 0.3672, "loss_ce": 0.30032601952552795, "loss_lvr": 1.026845097541809, "loss_mode_switch": 0.0, "loss_total": 0.4030105471611023, "step": 918 }, { "batch_size": 4, "epoch": 0.3672, "step": 918, "tokens_per_device": 3940 }, { "epoch": 0.3672, "loss_ce": 0.38079193234443665, "loss_lvr": 0.9811255931854248, "loss_mode_switch": 0.0, "loss_total": 0.47890448570251465, "step": 918 }, { "batch_size": 4, "epoch": 0.3672, "step": 918, "tokens_per_device": 1328 }, { "epoch": 0.3672, "loss_ce": 0.2993745803833008, "loss_lvr": 0.9260685443878174, "loss_mode_switch": 0.0, "loss_total": 0.39198142290115356, "step": 918 }, { "batch_size": 4, "epoch": 0.3672, "step": 918, "tokens_per_device": 4292 }, { "epoch": 0.3672, "loss_ce": 0.30848750472068787, "loss_lvr": 1.0475330352783203, "loss_mode_switch": 0.0, "loss_total": 0.41324082016944885, "step": 918 }, { "epoch": 0.3676, "grad_norm": 1.4360401630401611, "learning_rate": 7.297323992497483e-06, "loss": 0.3087, "step": 919 }, { "batch_size": 4, "epoch": 0.3676, "step": 919, "tokens_per_device": 3924 }, { "epoch": 0.3676, "loss_ce": 0.538536787033081, "loss_lvr": 1.0668805837631226, "loss_mode_switch": 0.0, "loss_total": 0.6452248692512512, "step": 919 }, { "batch_size": 1, "epoch": 0.3676, "step": 919, "tokens_per_device": 5102 }, { "epoch": 0.3676, "loss_ce": 0.09432809799909592, "loss_lvr": 0.6757463216781616, "loss_mode_switch": 0.0, "loss_total": 0.16190272569656372, "step": 919 }, { "batch_size": 1, "epoch": 0.3676, "step": 919, "tokens_per_device": 5004 }, { "epoch": 0.3676, "loss_ce": 0.1037682518362999, "loss_lvr": 0.6676722168922424, "loss_mode_switch": 0.0, "loss_total": 0.17053547501564026, "step": 919 }, { "batch_size": 4, "epoch": 0.3676, "step": 919, "tokens_per_device": 1432 }, { "epoch": 0.3676, "loss_ce": 0.5062804818153381, "loss_lvr": 0.7975839972496033, "loss_mode_switch": 0.0, "loss_total": 0.5860388875007629, "step": 919 }, { "batch_size": 4, "epoch": 0.3676, "step": 919, "tokens_per_device": 5820 }, { "epoch": 0.3676, "loss_ce": 0.012465509586036205, "loss_lvr": 0.7019294500350952, "loss_mode_switch": 0.0, "loss_total": 0.08265845477581024, "step": 919 }, { "batch_size": 4, "epoch": 0.3676, "step": 919, "tokens_per_device": 3888 }, { "epoch": 0.3676, "loss_ce": 0.3879255950450897, "loss_lvr": 0.8299034237861633, "loss_mode_switch": 0.0, "loss_total": 0.47091594338417053, "step": 919 }, { "batch_size": 4, "epoch": 0.3676, "step": 919, "tokens_per_device": 1276 }, { "epoch": 0.3676, "loss_ce": 0.3075733780860901, "loss_lvr": 1.0608484745025635, "loss_mode_switch": 0.0, "loss_total": 0.4136582314968109, "step": 919 }, { "batch_size": 4, "epoch": 0.3676, "step": 919, "tokens_per_device": 2564 }, { "epoch": 0.3676, "loss_ce": 0.2916189134120941, "loss_lvr": 1.0770671367645264, "loss_mode_switch": 0.0, "loss_total": 0.3993256390094757, "step": 919 }, { "epoch": 0.368, "grad_norm": 1.3145133256912231, "learning_rate": 7.291568767379484e-06, "loss": 0.3321, "step": 920 }, { "batch_size": 4, "epoch": 0.368, "step": 920, "tokens_per_device": 4504 }, { "epoch": 0.368, "loss_ce": 0.3080788552761078, "loss_lvr": 0.8908387422561646, "loss_mode_switch": 0.0, "loss_total": 0.3971627354621887, "step": 920 }, { "batch_size": 4, "epoch": 0.368, "step": 920, "tokens_per_device": 1436 }, { "epoch": 0.368, "loss_ce": 0.4907020628452301, "loss_lvr": 0.8417207598686218, "loss_mode_switch": 0.0, "loss_total": 0.5748741626739502, "step": 920 }, { "batch_size": 1, "epoch": 0.368, "step": 920, "tokens_per_device": 5166 }, { "epoch": 0.368, "loss_ce": 0.015680288895964622, "loss_lvr": 0.36432138085365295, "loss_mode_switch": 0.0, "loss_total": 0.052112430334091187, "step": 920 }, { "batch_size": 4, "epoch": 0.368, "step": 920, "tokens_per_device": 13936 }, { "epoch": 0.368, "loss_ce": 0.3646346628665924, "loss_lvr": 0.8392789959907532, "loss_mode_switch": 0.0, "loss_total": 0.4485625624656677, "step": 920 }, { "batch_size": 4, "epoch": 0.368, "step": 920, "tokens_per_device": 2776 }, { "epoch": 0.368, "loss_ce": 0.314206600189209, "loss_lvr": 0.9669454097747803, "loss_mode_switch": 0.0, "loss_total": 0.41090112924575806, "step": 920 }, { "batch_size": 4, "epoch": 0.368, "step": 920, "tokens_per_device": 11108 }, { "epoch": 0.368, "loss_ce": 0.0025309263728559017, "loss_lvr": 0.7410293817520142, "loss_mode_switch": 0.0, "loss_total": 0.07663386315107346, "step": 920 }, { "batch_size": 1, "epoch": 0.368, "step": 920, "tokens_per_device": 4891 }, { "epoch": 0.368, "loss_ce": 0.037674546241760254, "loss_lvr": 0.9023154973983765, "loss_mode_switch": 0.0, "loss_total": 0.12790609896183014, "step": 920 }, { "batch_size": 4, "epoch": 0.368, "step": 920, "tokens_per_device": 5200 }, { "epoch": 0.368, "loss_ce": 0.13444100320339203, "loss_lvr": 0.7999094128608704, "loss_mode_switch": 0.0, "loss_total": 0.21443194150924683, "step": 920 }, { "epoch": 0.3684, "grad_norm": 1.3486137390136719, "learning_rate": 7.2858096962631395e-06, "loss": 0.3155, "step": 921 }, { "batch_size": 4, "epoch": 0.3684, "step": 921, "tokens_per_device": 8480 }, { "epoch": 0.3684, "loss_ce": 0.14842253923416138, "loss_lvr": 0.6347500085830688, "loss_mode_switch": 0.0, "loss_total": 0.21189755201339722, "step": 921 }, { "batch_size": 4, "epoch": 0.3684, "step": 921, "tokens_per_device": 7012 }, { "epoch": 0.3684, "loss_ce": 0.09337453544139862, "loss_lvr": 0.8997381329536438, "loss_mode_switch": 0.0, "loss_total": 0.18334835767745972, "step": 921 }, { "batch_size": 1, "epoch": 0.3684, "step": 921, "tokens_per_device": 4749 }, { "epoch": 0.3684, "loss_ce": 0.011119941249489784, "loss_lvr": 0.3585282564163208, "loss_mode_switch": 0.0, "loss_total": 0.046972766518592834, "step": 921 }, { "batch_size": 4, "epoch": 0.3684, "step": 921, "tokens_per_device": 4412 }, { "epoch": 0.3684, "loss_ce": 0.010012130253016949, "loss_lvr": 0.9493837952613831, "loss_mode_switch": 0.0, "loss_total": 0.10495050996541977, "step": 921 }, { "batch_size": 4, "epoch": 0.3684, "step": 921, "tokens_per_device": 1536 }, { "epoch": 0.3684, "loss_ce": 0.6900200247764587, "loss_lvr": 0.9039273262023926, "loss_mode_switch": 0.0, "loss_total": 0.7804127335548401, "step": 921 }, { "batch_size": 4, "epoch": 0.3684, "step": 921, "tokens_per_device": 2656 }, { "epoch": 0.3684, "loss_ce": 0.0758613869547844, "loss_lvr": 0.8161880970001221, "loss_mode_switch": 0.0, "loss_total": 0.15748019516468048, "step": 921 }, { "batch_size": 4, "epoch": 0.3684, "step": 921, "tokens_per_device": 1600 }, { "epoch": 0.3684, "loss_ce": 0.49217987060546875, "loss_lvr": 1.6018515825271606, "loss_mode_switch": 0.0, "loss_total": 0.6523650288581848, "step": 921 }, { "batch_size": 4, "epoch": 0.3684, "step": 921, "tokens_per_device": 4260 }, { "epoch": 0.3684, "loss_ce": 0.12428827583789825, "loss_lvr": 1.9917101860046387, "loss_mode_switch": 0.0, "loss_total": 0.32345929741859436, "step": 921 }, { "epoch": 0.3688, "grad_norm": 1.1865607500076294, "learning_rate": 7.280046788814045e-06, "loss": 0.2827, "step": 922 }, { "batch_size": 1, "epoch": 0.3688, "step": 922, "tokens_per_device": 4871 }, { "epoch": 0.3688, "loss_ce": 0.002259771339595318, "loss_lvr": 0.3455688953399658, "loss_mode_switch": 0.0, "loss_total": 0.036816664040088654, "step": 922 }, { "batch_size": 1, "epoch": 0.3688, "step": 922, "tokens_per_device": 4882 }, { "epoch": 0.3688, "loss_ce": 0.002601848216727376, "loss_lvr": 0.2385331243276596, "loss_mode_switch": 0.0, "loss_total": 0.026455162093043327, "step": 922 }, { "batch_size": 1, "epoch": 0.3688, "step": 922, "tokens_per_device": 4970 }, { "epoch": 0.3688, "loss_ce": 0.3681471347808838, "loss_lvr": 0.8104625940322876, "loss_mode_switch": 0.0, "loss_total": 0.44919338822364807, "step": 922 }, { "batch_size": 4, "epoch": 0.3688, "step": 922, "tokens_per_device": 7052 }, { "epoch": 0.3688, "loss_ce": 0.4000299870967865, "loss_lvr": 0.74215167760849, "loss_mode_switch": 0.0, "loss_total": 0.4742451608181, "step": 922 }, { "batch_size": 4, "epoch": 0.3688, "step": 922, "tokens_per_device": 5712 }, { "epoch": 0.3688, "loss_ce": 0.25760889053344727, "loss_lvr": 0.7979254126548767, "loss_mode_switch": 0.0, "loss_total": 0.33740144968032837, "step": 922 }, { "batch_size": 1, "epoch": 0.3688, "step": 922, "tokens_per_device": 4916 }, { "epoch": 0.3688, "loss_ce": 0.051694151014089584, "loss_lvr": 0.4967925250530243, "loss_mode_switch": 0.0, "loss_total": 0.10137340426445007, "step": 922 }, { "batch_size": 4, "epoch": 0.3688, "step": 922, "tokens_per_device": 5320 }, { "epoch": 0.3688, "loss_ce": 0.14280487596988678, "loss_lvr": 0.8028767108917236, "loss_mode_switch": 0.0, "loss_total": 0.22309255599975586, "step": 922 }, { "batch_size": 4, "epoch": 0.3688, "step": 922, "tokens_per_device": 4908 }, { "epoch": 0.3688, "loss_ce": 0.07459276914596558, "loss_lvr": 1.603660225868225, "loss_mode_switch": 0.0, "loss_total": 0.23495879769325256, "step": 922 }, { "epoch": 0.3692, "grad_norm": 1.3494715690612793, "learning_rate": 7.274280054704232e-06, "loss": 0.3242, "step": 923 }, { "batch_size": 1, "epoch": 0.3692, "step": 923, "tokens_per_device": 5070 }, { "epoch": 0.3692, "loss_ce": 0.013113666325807571, "loss_lvr": 0.4297657608985901, "loss_mode_switch": 0.0, "loss_total": 0.05609024316072464, "step": 923 }, { "batch_size": 4, "epoch": 0.3692, "step": 923, "tokens_per_device": 1936 }, { "epoch": 0.3692, "loss_ce": 0.2651379108428955, "loss_lvr": 0.7996848225593567, "loss_mode_switch": 0.0, "loss_total": 0.3451063930988312, "step": 923 }, { "batch_size": 4, "epoch": 0.3692, "step": 923, "tokens_per_device": 5048 }, { "epoch": 0.3692, "loss_ce": 0.4840969145298004, "loss_lvr": 0.7421775460243225, "loss_mode_switch": 0.0, "loss_total": 0.5583146810531616, "step": 923 }, { "batch_size": 4, "epoch": 0.3692, "step": 923, "tokens_per_device": 4860 }, { "epoch": 0.3692, "loss_ce": 0.04398370906710625, "loss_lvr": 0.8101551532745361, "loss_mode_switch": 0.0, "loss_total": 0.12499922513961792, "step": 923 }, { "batch_size": 4, "epoch": 0.3692, "step": 923, "tokens_per_device": 3808 }, { "epoch": 0.3692, "loss_ce": 0.7322524785995483, "loss_lvr": 0.9086147546768188, "loss_mode_switch": 0.0, "loss_total": 0.8231139779090881, "step": 923 }, { "batch_size": 4, "epoch": 0.3692, "step": 923, "tokens_per_device": 4244 }, { "epoch": 0.3692, "loss_ce": 0.2964341640472412, "loss_lvr": 0.9327219128608704, "loss_mode_switch": 0.0, "loss_total": 0.3897063732147217, "step": 923 }, { "batch_size": 1, "epoch": 0.3692, "step": 923, "tokens_per_device": 5174 }, { "epoch": 0.3692, "loss_ce": 0.007894198410212994, "loss_lvr": 0.5455322861671448, "loss_mode_switch": 0.0, "loss_total": 0.062447428703308105, "step": 923 }, { "batch_size": 1, "epoch": 0.3692, "step": 923, "tokens_per_device": 4746 }, { "epoch": 0.3692, "loss_ce": 0.16155514121055603, "loss_lvr": 1.5480259656906128, "loss_mode_switch": 0.0, "loss_total": 0.31635773181915283, "step": 923 }, { "epoch": 0.3696, "grad_norm": 1.2043393850326538, "learning_rate": 7.268509503612162e-06, "loss": 0.2445, "step": 924 }, { "batch_size": 4, "epoch": 0.3696, "step": 924, "tokens_per_device": 3636 }, { "epoch": 0.3696, "loss_ce": 0.13295984268188477, "loss_lvr": 0.8538198471069336, "loss_mode_switch": 0.0, "loss_total": 0.21834182739257812, "step": 924 }, { "batch_size": 4, "epoch": 0.3696, "step": 924, "tokens_per_device": 6268 }, { "epoch": 0.3696, "loss_ce": 0.015501633286476135, "loss_lvr": 0.784345805644989, "loss_mode_switch": 0.0, "loss_total": 0.09393621236085892, "step": 924 }, { "batch_size": 4, "epoch": 0.3696, "step": 924, "tokens_per_device": 6460 }, { "epoch": 0.3696, "loss_ce": 0.011385196819901466, "loss_lvr": 0.7550579905509949, "loss_mode_switch": 0.0, "loss_total": 0.08689099550247192, "step": 924 }, { "batch_size": 4, "epoch": 0.3696, "step": 924, "tokens_per_device": 6264 }, { "epoch": 0.3696, "loss_ce": 0.11076248437166214, "loss_lvr": 0.9245509505271912, "loss_mode_switch": 0.0, "loss_total": 0.20321758091449738, "step": 924 }, { "batch_size": 1, "epoch": 0.3696, "step": 924, "tokens_per_device": 6948 }, { "epoch": 0.3696, "loss_ce": 0.22031602263450623, "loss_lvr": 0.5516829490661621, "loss_mode_switch": 0.0, "loss_total": 0.2754843235015869, "step": 924 }, { "batch_size": 4, "epoch": 0.3696, "step": 924, "tokens_per_device": 2740 }, { "epoch": 0.3696, "loss_ce": 0.6885791420936584, "loss_lvr": 0.9138321876525879, "loss_mode_switch": 0.0, "loss_total": 0.7799623608589172, "step": 924 }, { "batch_size": 4, "epoch": 0.3696, "step": 924, "tokens_per_device": 3812 }, { "epoch": 0.3696, "loss_ce": 0.14025937020778656, "loss_lvr": 0.8709324598312378, "loss_mode_switch": 0.0, "loss_total": 0.22735261917114258, "step": 924 }, { "batch_size": 4, "epoch": 0.3696, "step": 924, "tokens_per_device": 3800 }, { "epoch": 0.3696, "loss_ce": 0.12194499373435974, "loss_lvr": 1.069005012512207, "loss_mode_switch": 0.0, "loss_total": 0.2288455069065094, "step": 924 }, { "epoch": 0.37, "grad_norm": 1.2987009286880493, "learning_rate": 7.262735145222696e-06, "loss": 0.2855, "step": 925 }, { "batch_size": 1, "epoch": 0.37, "step": 925, "tokens_per_device": 4863 }, { "epoch": 0.37, "loss_ce": 0.03375566750764847, "loss_lvr": 0.312914103269577, "loss_mode_switch": 0.0, "loss_total": 0.06504707783460617, "step": 925 }, { "batch_size": 4, "epoch": 0.37, "step": 925, "tokens_per_device": 3920 }, { "epoch": 0.37, "loss_ce": 0.676486074924469, "loss_lvr": 0.9194969534873962, "loss_mode_switch": 0.0, "loss_total": 0.7684357762336731, "step": 925 }, { "batch_size": 4, "epoch": 0.37, "step": 925, "tokens_per_device": 4308 }, { "epoch": 0.37, "loss_ce": 0.2688680589199066, "loss_lvr": 0.8604117631912231, "loss_mode_switch": 0.0, "loss_total": 0.3549092411994934, "step": 925 }, { "batch_size": 1, "epoch": 0.37, "step": 925, "tokens_per_device": 5565 }, { "epoch": 0.37, "loss_ce": 0.02731359750032425, "loss_lvr": 0.49757084250450134, "loss_mode_switch": 0.0, "loss_total": 0.0770706832408905, "step": 925 }, { "batch_size": 4, "epoch": 0.37, "step": 925, "tokens_per_device": 2612 }, { "epoch": 0.37, "loss_ce": 0.3980799615383148, "loss_lvr": 0.8985201120376587, "loss_mode_switch": 0.0, "loss_total": 0.4879319667816162, "step": 925 }, { "batch_size": 4, "epoch": 0.37, "step": 925, "tokens_per_device": 11056 }, { "epoch": 0.37, "loss_ce": 0.6774871349334717, "loss_lvr": 1.2466319799423218, "loss_mode_switch": 0.0, "loss_total": 0.802150309085846, "step": 925 }, { "batch_size": 1, "epoch": 0.37, "step": 925, "tokens_per_device": 7500 }, { "epoch": 0.37, "loss_ce": 0.0013512298464775085, "loss_lvr": 0.3005017340183258, "loss_mode_switch": 0.0, "loss_total": 0.03140140324831009, "step": 925 }, { "batch_size": 4, "epoch": 0.37, "step": 925, "tokens_per_device": 4148 }, { "epoch": 0.37, "loss_ce": 0.5787555575370789, "loss_lvr": 0.9387843012809753, "loss_mode_switch": 0.0, "loss_total": 0.6726340055465698, "step": 925 }, { "epoch": 0.3704, "grad_norm": 1.2106711864471436, "learning_rate": 7.256956989227084e-06, "loss": 0.2953, "step": 926 }, { "batch_size": 4, "epoch": 0.3704, "step": 926, "tokens_per_device": 3880 }, { "epoch": 0.3704, "loss_ce": 0.5340346097946167, "loss_lvr": 0.7711302042007446, "loss_mode_switch": 0.0, "loss_total": 0.6111476421356201, "step": 926 }, { "batch_size": 1, "epoch": 0.3704, "step": 926, "tokens_per_device": 7096 }, { "epoch": 0.3704, "loss_ce": 0.2479105293750763, "loss_lvr": 0.40160316228866577, "loss_mode_switch": 0.0, "loss_total": 0.2880708575248718, "step": 926 }, { "batch_size": 1, "epoch": 0.3704, "step": 926, "tokens_per_device": 5221 }, { "epoch": 0.3704, "loss_ce": 0.19104111194610596, "loss_lvr": 0.38752931356430054, "loss_mode_switch": 0.0, "loss_total": 0.22979404032230377, "step": 926 }, { "batch_size": 4, "epoch": 0.3704, "step": 926, "tokens_per_device": 4260 }, { "epoch": 0.3704, "loss_ce": 0.33278095722198486, "loss_lvr": 1.0739041566848755, "loss_mode_switch": 0.0, "loss_total": 0.44017136096954346, "step": 926 }, { "batch_size": 4, "epoch": 0.3704, "step": 926, "tokens_per_device": 3996 }, { "epoch": 0.3704, "loss_ce": 0.06583784520626068, "loss_lvr": 0.9364754557609558, "loss_mode_switch": 0.0, "loss_total": 0.15948539972305298, "step": 926 }, { "batch_size": 4, "epoch": 0.3704, "step": 926, "tokens_per_device": 2532 }, { "epoch": 0.3704, "loss_ce": 0.49701863527297974, "loss_lvr": 0.9698647856712341, "loss_mode_switch": 0.0, "loss_total": 0.5940051078796387, "step": 926 }, { "batch_size": 4, "epoch": 0.3704, "step": 926, "tokens_per_device": 2796 }, { "epoch": 0.3704, "loss_ce": 0.30040550231933594, "loss_lvr": 0.7568035125732422, "loss_mode_switch": 0.0, "loss_total": 0.3760858476161957, "step": 926 }, { "batch_size": 1, "epoch": 0.3704, "step": 926, "tokens_per_device": 5116 }, { "epoch": 0.3704, "loss_ce": 0.1733439862728119, "loss_lvr": 0.4318452775478363, "loss_mode_switch": 0.0, "loss_total": 0.21652851998806, "step": 926 }, { "epoch": 0.3708, "grad_norm": 1.4816315174102783, "learning_rate": 7.251175045322959e-06, "loss": 0.2886, "step": 927 }, { "batch_size": 1, "epoch": 0.3708, "step": 927, "tokens_per_device": 5113 }, { "epoch": 0.3708, "loss_ce": 0.007131737656891346, "loss_lvr": 0.43389859795570374, "loss_mode_switch": 0.0, "loss_total": 0.050521597266197205, "step": 927 }, { "batch_size": 4, "epoch": 0.3708, "step": 927, "tokens_per_device": 7484 }, { "epoch": 0.3708, "loss_ce": 0.26768338680267334, "loss_lvr": 0.8009693622589111, "loss_mode_switch": 0.0, "loss_total": 0.3477803170681, "step": 927 }, { "batch_size": 4, "epoch": 0.3708, "step": 927, "tokens_per_device": 2556 }, { "epoch": 0.3708, "loss_ce": 0.018802782520651817, "loss_lvr": 0.9416801333427429, "loss_mode_switch": 0.0, "loss_total": 0.11297079920768738, "step": 927 }, { "batch_size": 4, "epoch": 0.3708, "step": 927, "tokens_per_device": 5740 }, { "epoch": 0.3708, "loss_ce": 0.6560482978820801, "loss_lvr": 0.986809492111206, "loss_mode_switch": 0.0, "loss_total": 0.7547292709350586, "step": 927 }, { "batch_size": 1, "epoch": 0.3708, "step": 927, "tokens_per_device": 4881 }, { "epoch": 0.3708, "loss_ce": 0.0020576510578393936, "loss_lvr": 0.25901690125465393, "loss_mode_switch": 0.0, "loss_total": 0.027959341183304787, "step": 927 }, { "batch_size": 1, "epoch": 0.3708, "step": 927, "tokens_per_device": 4875 }, { "epoch": 0.3708, "loss_ce": 0.011670003645122051, "loss_lvr": 0.4817422330379486, "loss_mode_switch": 0.0, "loss_total": 0.059844229370355606, "step": 927 }, { "batch_size": 4, "epoch": 0.3708, "step": 927, "tokens_per_device": 4252 }, { "epoch": 0.3708, "loss_ce": 0.571931004524231, "loss_lvr": 0.8764835596084595, "loss_mode_switch": 0.0, "loss_total": 0.659579336643219, "step": 927 }, { "batch_size": 4, "epoch": 0.3708, "step": 927, "tokens_per_device": 5908 }, { "epoch": 0.3708, "loss_ce": 0.10575402528047562, "loss_lvr": 0.7497338652610779, "loss_mode_switch": 0.0, "loss_total": 0.18072742223739624, "step": 927 }, { "epoch": 0.3712, "grad_norm": 1.3454351425170898, "learning_rate": 7.245389323214301e-06, "loss": 0.3091, "step": 928 }, { "batch_size": 1, "epoch": 0.3712, "step": 928, "tokens_per_device": 4881 }, { "epoch": 0.3712, "loss_ce": 0.34705692529678345, "loss_lvr": 0.4618259370326996, "loss_mode_switch": 0.0, "loss_total": 0.3932395279407501, "step": 928 }, { "batch_size": 1, "epoch": 0.3712, "step": 928, "tokens_per_device": 4930 }, { "epoch": 0.3712, "loss_ce": 0.5813609957695007, "loss_lvr": 1.0991008281707764, "loss_mode_switch": 0.0, "loss_total": 0.6912710666656494, "step": 928 }, { "batch_size": 1, "epoch": 0.3712, "step": 928, "tokens_per_device": 4776 }, { "epoch": 0.3712, "loss_ce": 0.008856179192662239, "loss_lvr": 0.2231464684009552, "loss_mode_switch": 0.0, "loss_total": 0.03117082640528679, "step": 928 }, { "batch_size": 4, "epoch": 0.3712, "step": 928, "tokens_per_device": 3892 }, { "epoch": 0.3712, "loss_ce": 0.5328096747398376, "loss_lvr": 1.0534119606018066, "loss_mode_switch": 0.0, "loss_total": 0.6381508708000183, "step": 928 }, { "batch_size": 4, "epoch": 0.3712, "step": 928, "tokens_per_device": 12800 }, { "epoch": 0.3712, "loss_ce": 0.38698193430900574, "loss_lvr": 1.2115001678466797, "loss_mode_switch": 0.0, "loss_total": 0.5081319808959961, "step": 928 }, { "batch_size": 4, "epoch": 0.3712, "step": 928, "tokens_per_device": 4748 }, { "epoch": 0.3712, "loss_ce": 0.4625189006328583, "loss_lvr": 0.7789584994316101, "loss_mode_switch": 0.0, "loss_total": 0.5404147505760193, "step": 928 }, { "batch_size": 4, "epoch": 0.3712, "step": 928, "tokens_per_device": 4312 }, { "epoch": 0.3712, "loss_ce": 0.24613890051841736, "loss_lvr": 0.8507846593856812, "loss_mode_switch": 0.0, "loss_total": 0.33121737837791443, "step": 928 }, { "batch_size": 4, "epoch": 0.3712, "step": 928, "tokens_per_device": 4000 }, { "epoch": 0.3712, "loss_ce": 0.19297091662883759, "loss_lvr": 0.7253729701042175, "loss_mode_switch": 0.0, "loss_total": 0.2655082046985626, "step": 928 }, { "epoch": 0.3716, "grad_norm": 1.3172039985656738, "learning_rate": 7.2395998326114345e-06, "loss": 0.3271, "step": 929 }, { "batch_size": 4, "epoch": 0.3716, "step": 929, "tokens_per_device": 1508 }, { "epoch": 0.3716, "loss_ce": 0.639845073223114, "loss_lvr": 1.1734178066253662, "loss_mode_switch": 0.0, "loss_total": 0.7571868300437927, "step": 929 }, { "batch_size": 4, "epoch": 0.3716, "step": 929, "tokens_per_device": 5696 }, { "epoch": 0.3716, "loss_ce": 0.4146299362182617, "loss_lvr": 0.8034157156944275, "loss_mode_switch": 0.0, "loss_total": 0.49497151374816895, "step": 929 }, { "batch_size": 1, "epoch": 0.3716, "step": 929, "tokens_per_device": 4933 }, { "epoch": 0.3716, "loss_ce": 0.013760047033429146, "loss_lvr": 0.3098231256008148, "loss_mode_switch": 0.0, "loss_total": 0.04474236071109772, "step": 929 }, { "batch_size": 4, "epoch": 0.3716, "step": 929, "tokens_per_device": 5444 }, { "epoch": 0.3716, "loss_ce": 0.1479659080505371, "loss_lvr": 0.8877827525138855, "loss_mode_switch": 0.0, "loss_total": 0.23674419522285461, "step": 929 }, { "batch_size": 4, "epoch": 0.3716, "step": 929, "tokens_per_device": 3644 }, { "epoch": 0.3716, "loss_ce": 0.5799322724342346, "loss_lvr": 0.9450610876083374, "loss_mode_switch": 0.0, "loss_total": 0.6744383573532104, "step": 929 }, { "batch_size": 4, "epoch": 0.3716, "step": 929, "tokens_per_device": 3788 }, { "epoch": 0.3716, "loss_ce": 0.3266577124595642, "loss_lvr": 1.1545826196670532, "loss_mode_switch": 0.0, "loss_total": 0.4421159625053406, "step": 929 }, { "batch_size": 1, "epoch": 0.3716, "step": 929, "tokens_per_device": 5099 }, { "epoch": 0.3716, "loss_ce": 0.006575252860784531, "loss_lvr": 0.3204909563064575, "loss_mode_switch": 0.0, "loss_total": 0.0386243499815464, "step": 929 }, { "batch_size": 4, "epoch": 0.3716, "step": 929, "tokens_per_device": 5456 }, { "epoch": 0.3716, "loss_ce": 0.3882249593734741, "loss_lvr": 0.7756497263908386, "loss_mode_switch": 0.0, "loss_total": 0.46578994393348694, "step": 929 }, { "epoch": 0.372, "grad_norm": 1.2655109167099, "learning_rate": 7.233806583231012e-06, "loss": 0.3353, "step": 930 }, { "batch_size": 4, "epoch": 0.372, "step": 930, "tokens_per_device": 8156 }, { "epoch": 0.372, "loss_ce": 0.37997910380363464, "loss_lvr": 0.6884804964065552, "loss_mode_switch": 0.0, "loss_total": 0.4488271474838257, "step": 930 }, { "batch_size": 4, "epoch": 0.372, "step": 930, "tokens_per_device": 2740 }, { "epoch": 0.372, "loss_ce": 0.21921080350875854, "loss_lvr": 1.207790493965149, "loss_mode_switch": 0.0, "loss_total": 0.3399898409843445, "step": 930 }, { "batch_size": 4, "epoch": 0.372, "step": 930, "tokens_per_device": 4256 }, { "epoch": 0.372, "loss_ce": 0.24041715264320374, "loss_lvr": 0.9764010310173035, "loss_mode_switch": 0.0, "loss_total": 0.3380572497844696, "step": 930 }, { "batch_size": 1, "epoch": 0.372, "step": 930, "tokens_per_device": 5150 }, { "epoch": 0.372, "loss_ce": 0.0012617919128388166, "loss_lvr": 0.40087220072746277, "loss_mode_switch": 0.0, "loss_total": 0.04134901240468025, "step": 930 }, { "batch_size": 4, "epoch": 0.372, "step": 930, "tokens_per_device": 2632 }, { "epoch": 0.372, "loss_ce": 0.12167703360319138, "loss_lvr": 1.1091949939727783, "loss_mode_switch": 0.0, "loss_total": 0.2325965315103531, "step": 930 }, { "batch_size": 1, "epoch": 0.372, "step": 930, "tokens_per_device": 5167 }, { "epoch": 0.372, "loss_ce": 0.2628379464149475, "loss_lvr": 0.36905866861343384, "loss_mode_switch": 0.0, "loss_total": 0.29974380135536194, "step": 930 }, { "batch_size": 4, "epoch": 0.372, "step": 930, "tokens_per_device": 2636 }, { "epoch": 0.372, "loss_ce": 0.24178564548492432, "loss_lvr": 0.8853565454483032, "loss_mode_switch": 0.0, "loss_total": 0.3303213119506836, "step": 930 }, { "batch_size": 4, "epoch": 0.372, "step": 930, "tokens_per_device": 6076 }, { "epoch": 0.372, "loss_ce": 0.34779834747314453, "loss_lvr": 0.9175663590431213, "loss_mode_switch": 0.0, "loss_total": 0.43955498933792114, "step": 930 }, { "epoch": 0.3724, "grad_norm": 1.2732292413711548, "learning_rate": 7.22800958479599e-06, "loss": 0.3011, "step": 931 }, { "batch_size": 1, "epoch": 0.3724, "step": 931, "tokens_per_device": 5007 }, { "epoch": 0.3724, "loss_ce": 0.02021588198840618, "loss_lvr": 0.26009294390678406, "loss_mode_switch": 0.0, "loss_total": 0.0462251752614975, "step": 931 }, { "batch_size": 4, "epoch": 0.3724, "step": 931, "tokens_per_device": 1320 }, { "epoch": 0.3724, "loss_ce": 0.09054829925298691, "loss_lvr": 1.0474061965942383, "loss_mode_switch": 0.0, "loss_total": 0.19528892636299133, "step": 931 }, { "batch_size": 1, "epoch": 0.3724, "step": 931, "tokens_per_device": 4838 }, { "epoch": 0.3724, "loss_ce": 0.003982665948569775, "loss_lvr": 0.5025330185890198, "loss_mode_switch": 0.0, "loss_total": 0.05423596873879433, "step": 931 }, { "batch_size": 4, "epoch": 0.3724, "step": 931, "tokens_per_device": 7876 }, { "epoch": 0.3724, "loss_ce": 0.08197546750307083, "loss_lvr": 0.6947458386421204, "loss_mode_switch": 0.0, "loss_total": 0.151450052857399, "step": 931 }, { "batch_size": 4, "epoch": 0.3724, "step": 931, "tokens_per_device": 5404 }, { "epoch": 0.3724, "loss_ce": 0.18351460993289948, "loss_lvr": 0.6629782319068909, "loss_mode_switch": 0.0, "loss_total": 0.24981242418289185, "step": 931 }, { "batch_size": 4, "epoch": 0.3724, "step": 931, "tokens_per_device": 6564 }, { "epoch": 0.3724, "loss_ce": 0.3567354381084442, "loss_lvr": 0.7329467535018921, "loss_mode_switch": 0.0, "loss_total": 0.43003010749816895, "step": 931 }, { "batch_size": 4, "epoch": 0.3724, "step": 931, "tokens_per_device": 9888 }, { "epoch": 0.3724, "loss_ce": 0.24251781404018402, "loss_lvr": 0.7134382128715515, "loss_mode_switch": 0.0, "loss_total": 0.3138616383075714, "step": 931 }, { "batch_size": 4, "epoch": 0.3724, "step": 931, "tokens_per_device": 4200 }, { "epoch": 0.3724, "loss_ce": 0.47608041763305664, "loss_lvr": 1.2108029127120972, "loss_mode_switch": 0.0, "loss_total": 0.5971606969833374, "step": 931 }, { "epoch": 0.3728, "grad_norm": 1.2309033870697021, "learning_rate": 7.222208847035621e-06, "loss": 0.2847, "step": 932 }, { "batch_size": 4, "epoch": 0.3728, "step": 932, "tokens_per_device": 1280 }, { "epoch": 0.3728, "loss_ce": 0.8707219958305359, "loss_lvr": 1.2508821487426758, "loss_mode_switch": 0.0, "loss_total": 0.9958102107048035, "step": 932 }, { "batch_size": 4, "epoch": 0.3728, "step": 932, "tokens_per_device": 4304 }, { "epoch": 0.3728, "loss_ce": 0.5001765489578247, "loss_lvr": 1.0164158344268799, "loss_mode_switch": 0.0, "loss_total": 0.6018181443214417, "step": 932 }, { "batch_size": 4, "epoch": 0.3728, "step": 932, "tokens_per_device": 1640 }, { "epoch": 0.3728, "loss_ce": 0.6904351115226746, "loss_lvr": 0.9695857167243958, "loss_mode_switch": 0.0, "loss_total": 0.7873936891555786, "step": 932 }, { "batch_size": 4, "epoch": 0.3728, "step": 932, "tokens_per_device": 4272 }, { "epoch": 0.3728, "loss_ce": 0.2251344621181488, "loss_lvr": 0.7764207124710083, "loss_mode_switch": 0.0, "loss_total": 0.3027765452861786, "step": 932 }, { "batch_size": 4, "epoch": 0.3728, "step": 932, "tokens_per_device": 4948 }, { "epoch": 0.3728, "loss_ce": 0.5352431535720825, "loss_lvr": 0.9898518919944763, "loss_mode_switch": 0.0, "loss_total": 0.6342283487319946, "step": 932 }, { "batch_size": 4, "epoch": 0.3728, "step": 932, "tokens_per_device": 2648 }, { "epoch": 0.3728, "loss_ce": 0.42175158858299255, "loss_lvr": 0.8135970830917358, "loss_mode_switch": 0.0, "loss_total": 0.5031113028526306, "step": 932 }, { "batch_size": 4, "epoch": 0.3728, "step": 932, "tokens_per_device": 13476 }, { "epoch": 0.3728, "loss_ce": 0.08698705583810806, "loss_lvr": 0.7199499607086182, "loss_mode_switch": 0.0, "loss_total": 0.158982053399086, "step": 932 }, { "batch_size": 4, "epoch": 0.3728, "step": 932, "tokens_per_device": 4176 }, { "epoch": 0.3728, "loss_ce": 0.2535596787929535, "loss_lvr": 1.0353529453277588, "loss_mode_switch": 0.0, "loss_total": 0.35709497332572937, "step": 932 }, { "epoch": 0.3732, "grad_norm": 1.4426696300506592, "learning_rate": 7.216404379685427e-06, "loss": 0.3528, "step": 933 }, { "batch_size": 4, "epoch": 0.3732, "step": 933, "tokens_per_device": 3064 }, { "epoch": 0.3732, "loss_ce": 0.1892055869102478, "loss_lvr": 0.9899940490722656, "loss_mode_switch": 0.0, "loss_total": 0.28820499777793884, "step": 933 }, { "batch_size": 1, "epoch": 0.3732, "step": 933, "tokens_per_device": 4935 }, { "epoch": 0.3732, "loss_ce": 0.0025381711311638355, "loss_lvr": 0.38659462332725525, "loss_mode_switch": 0.0, "loss_total": 0.04119763523340225, "step": 933 }, { "batch_size": 4, "epoch": 0.3732, "step": 933, "tokens_per_device": 3848 }, { "epoch": 0.3732, "loss_ce": 0.15929166972637177, "loss_lvr": 0.8338795900344849, "loss_mode_switch": 0.0, "loss_total": 0.242679625749588, "step": 933 }, { "batch_size": 4, "epoch": 0.3732, "step": 933, "tokens_per_device": 3796 }, { "epoch": 0.3732, "loss_ce": 0.26521608233451843, "loss_lvr": 1.1795191764831543, "loss_mode_switch": 0.0, "loss_total": 0.3831680119037628, "step": 933 }, { "batch_size": 4, "epoch": 0.3732, "step": 933, "tokens_per_device": 2580 }, { "epoch": 0.3732, "loss_ce": 0.010332438163459301, "loss_lvr": 0.9510366320610046, "loss_mode_switch": 0.0, "loss_total": 0.10543610155582428, "step": 933 }, { "batch_size": 1, "epoch": 0.3732, "step": 933, "tokens_per_device": 5106 }, { "epoch": 0.3732, "loss_ce": 0.0006063840701244771, "loss_lvr": 0.26892614364624023, "loss_mode_switch": 0.0, "loss_total": 0.02749899961054325, "step": 933 }, { "batch_size": 1, "epoch": 0.3732, "step": 933, "tokens_per_device": 5113 }, { "epoch": 0.3732, "loss_ce": 0.004071266856044531, "loss_lvr": 0.44866809248924255, "loss_mode_switch": 0.0, "loss_total": 0.0489380769431591, "step": 933 }, { "batch_size": 1, "epoch": 0.3732, "step": 933, "tokens_per_device": 4882 }, { "epoch": 0.3732, "loss_ce": 0.03894319385290146, "loss_lvr": 0.34643515944480896, "loss_mode_switch": 0.0, "loss_total": 0.07358670979738235, "step": 933 }, { "epoch": 0.3736, "grad_norm": 1.327223300933838, "learning_rate": 7.210596192487198e-06, "loss": 0.3041, "step": 934 }, { "batch_size": 4, "epoch": 0.3736, "step": 934, "tokens_per_device": 1332 }, { "epoch": 0.3736, "loss_ce": 0.6332812905311584, "loss_lvr": 1.0949875116348267, "loss_mode_switch": 0.0, "loss_total": 0.7427800297737122, "step": 934 }, { "batch_size": 4, "epoch": 0.3736, "step": 934, "tokens_per_device": 4020 }, { "epoch": 0.3736, "loss_ce": 0.20913368463516235, "loss_lvr": 0.7713548541069031, "loss_mode_switch": 0.0, "loss_total": 0.2862691879272461, "step": 934 }, { "batch_size": 4, "epoch": 0.3736, "step": 934, "tokens_per_device": 3772 }, { "epoch": 0.3736, "loss_ce": 0.2662445604801178, "loss_lvr": 1.1408958435058594, "loss_mode_switch": 0.0, "loss_total": 0.38033413887023926, "step": 934 }, { "batch_size": 1, "epoch": 0.3736, "step": 934, "tokens_per_device": 4873 }, { "epoch": 0.3736, "loss_ce": 0.089024119079113, "loss_lvr": 0.5896180272102356, "loss_mode_switch": 0.0, "loss_total": 0.14798592031002045, "step": 934 }, { "batch_size": 4, "epoch": 0.3736, "step": 934, "tokens_per_device": 5788 }, { "epoch": 0.3736, "loss_ce": 0.1777380257844925, "loss_lvr": 0.9546368718147278, "loss_mode_switch": 0.0, "loss_total": 0.27320170402526855, "step": 934 }, { "batch_size": 4, "epoch": 0.3736, "step": 934, "tokens_per_device": 1460 }, { "epoch": 0.3736, "loss_ce": 0.15241731703281403, "loss_lvr": 0.973095178604126, "loss_mode_switch": 0.0, "loss_total": 0.24972683191299438, "step": 934 }, { "batch_size": 4, "epoch": 0.3736, "step": 934, "tokens_per_device": 5436 }, { "epoch": 0.3736, "loss_ce": 0.5455800294876099, "loss_lvr": 0.9690660834312439, "loss_mode_switch": 0.0, "loss_total": 0.6424866318702698, "step": 934 }, { "batch_size": 4, "epoch": 0.3736, "step": 934, "tokens_per_device": 4388 }, { "epoch": 0.3736, "loss_ce": 0.004440627060830593, "loss_lvr": 1.200947642326355, "loss_mode_switch": 0.0, "loss_total": 0.12453539669513702, "step": 934 }, { "epoch": 0.374, "grad_norm": 1.3990226984024048, "learning_rate": 7.204784295188959e-06, "loss": 0.3367, "step": 935 }, { "batch_size": 1, "epoch": 0.374, "step": 935, "tokens_per_device": 5709 }, { "epoch": 0.374, "loss_ce": 0.002994589740410447, "loss_lvr": 0.5507352948188782, "loss_mode_switch": 0.0, "loss_total": 0.05806811898946762, "step": 935 }, { "batch_size": 4, "epoch": 0.374, "step": 935, "tokens_per_device": 2784 }, { "epoch": 0.374, "loss_ce": 0.6876077651977539, "loss_lvr": 0.8423309922218323, "loss_mode_switch": 0.0, "loss_total": 0.7718408703804016, "step": 935 }, { "batch_size": 1, "epoch": 0.374, "step": 935, "tokens_per_device": 5350 }, { "epoch": 0.374, "loss_ce": 0.0015267871785908937, "loss_lvr": 0.5155335664749146, "loss_mode_switch": 0.0, "loss_total": 0.05308014526963234, "step": 935 }, { "batch_size": 4, "epoch": 0.374, "step": 935, "tokens_per_device": 4404 }, { "epoch": 0.374, "loss_ce": 0.2307588756084442, "loss_lvr": 0.8644508719444275, "loss_mode_switch": 0.0, "loss_total": 0.31720396876335144, "step": 935 }, { "batch_size": 4, "epoch": 0.374, "step": 935, "tokens_per_device": 5768 }, { "epoch": 0.374, "loss_ce": 0.532683253288269, "loss_lvr": 0.7867054343223572, "loss_mode_switch": 0.0, "loss_total": 0.6113538146018982, "step": 935 }, { "batch_size": 4, "epoch": 0.374, "step": 935, "tokens_per_device": 4244 }, { "epoch": 0.374, "loss_ce": 0.10065697133541107, "loss_lvr": 1.1536294221878052, "loss_mode_switch": 0.0, "loss_total": 0.2160199135541916, "step": 935 }, { "batch_size": 4, "epoch": 0.374, "step": 935, "tokens_per_device": 4656 }, { "epoch": 0.374, "loss_ce": 0.42221611738204956, "loss_lvr": 0.7058358192443848, "loss_mode_switch": 0.0, "loss_total": 0.49279969930648804, "step": 935 }, { "batch_size": 4, "epoch": 0.374, "step": 935, "tokens_per_device": 15460 }, { "epoch": 0.374, "loss_ce": 0.014996681362390518, "loss_lvr": 0.7953153848648071, "loss_mode_switch": 0.0, "loss_total": 0.09452822804450989, "step": 935 }, { "epoch": 0.3744, "grad_norm": 1.3502269983291626, "learning_rate": 7.19896869754497e-06, "loss": 0.2905, "step": 936 }, { "batch_size": 4, "epoch": 0.3744, "step": 936, "tokens_per_device": 3364 }, { "epoch": 0.3744, "loss_ce": 0.11805697530508041, "loss_lvr": 1.0586225986480713, "loss_mode_switch": 0.0, "loss_total": 0.22391924262046814, "step": 936 }, { "batch_size": 1, "epoch": 0.3744, "step": 936, "tokens_per_device": 4954 }, { "epoch": 0.3744, "loss_ce": 0.02904714085161686, "loss_lvr": 0.7013303637504578, "loss_mode_switch": 0.0, "loss_total": 0.0991801768541336, "step": 936 }, { "batch_size": 4, "epoch": 0.3744, "step": 936, "tokens_per_device": 4344 }, { "epoch": 0.3744, "loss_ce": 0.1679200381040573, "loss_lvr": 1.0209757089614868, "loss_mode_switch": 0.0, "loss_total": 0.2700176239013672, "step": 936 }, { "batch_size": 4, "epoch": 0.3744, "step": 936, "tokens_per_device": 11504 }, { "epoch": 0.3744, "loss_ce": 0.3285825550556183, "loss_lvr": 0.8454629182815552, "loss_mode_switch": 0.0, "loss_total": 0.4131288528442383, "step": 936 }, { "batch_size": 1, "epoch": 0.3744, "step": 936, "tokens_per_device": 4903 }, { "epoch": 0.3744, "loss_ce": 0.04146701842546463, "loss_lvr": 1.2162652015686035, "loss_mode_switch": 0.0, "loss_total": 0.16309353709220886, "step": 936 }, { "batch_size": 1, "epoch": 0.3744, "step": 936, "tokens_per_device": 5353 }, { "epoch": 0.3744, "loss_ce": 0.027117978781461716, "loss_lvr": 0.5465549826622009, "loss_mode_switch": 0.0, "loss_total": 0.08177347481250763, "step": 936 }, { "batch_size": 1, "epoch": 0.3744, "step": 936, "tokens_per_device": 5185 }, { "epoch": 0.3744, "loss_ce": 0.0306817889213562, "loss_lvr": 0.44828441739082336, "loss_mode_switch": 0.0, "loss_total": 0.07551023364067078, "step": 936 }, { "batch_size": 4, "epoch": 0.3744, "step": 936, "tokens_per_device": 1424 }, { "epoch": 0.3744, "loss_ce": 0.39011350274086, "loss_lvr": 1.4957529306411743, "loss_mode_switch": 0.0, "loss_total": 0.5396888256072998, "step": 936 }, { "epoch": 0.3748, "grad_norm": 1.2334206104278564, "learning_rate": 7.193149409315694e-06, "loss": 0.2785, "step": 937 }, { "batch_size": 1, "epoch": 0.3748, "step": 937, "tokens_per_device": 5147 }, { "epoch": 0.3748, "loss_ce": 0.0020575637463480234, "loss_lvr": 0.6765415072441101, "loss_mode_switch": 0.0, "loss_total": 0.06971172243356705, "step": 937 }, { "batch_size": 4, "epoch": 0.3748, "step": 937, "tokens_per_device": 5324 }, { "epoch": 0.3748, "loss_ce": 0.5420020222663879, "loss_lvr": 0.8325263261795044, "loss_mode_switch": 0.0, "loss_total": 0.6252546310424805, "step": 937 }, { "batch_size": 4, "epoch": 0.3748, "step": 937, "tokens_per_device": 4088 }, { "epoch": 0.3748, "loss_ce": 0.26001307368278503, "loss_lvr": 0.9946637153625488, "loss_mode_switch": 0.0, "loss_total": 0.35947945713996887, "step": 937 }, { "batch_size": 4, "epoch": 0.3748, "step": 937, "tokens_per_device": 3612 }, { "epoch": 0.3748, "loss_ce": 0.4636823832988739, "loss_lvr": 0.7192632555961609, "loss_mode_switch": 0.0, "loss_total": 0.53560870885849, "step": 937 }, { "batch_size": 4, "epoch": 0.3748, "step": 937, "tokens_per_device": 1324 }, { "epoch": 0.3748, "loss_ce": 0.119182288646698, "loss_lvr": 1.04543936252594, "loss_mode_switch": 0.0, "loss_total": 0.22372622787952423, "step": 937 }, { "batch_size": 4, "epoch": 0.3748, "step": 937, "tokens_per_device": 1720 }, { "epoch": 0.3748, "loss_ce": 0.473940908908844, "loss_lvr": 1.0491268634796143, "loss_mode_switch": 0.0, "loss_total": 0.5788536071777344, "step": 937 }, { "batch_size": 4, "epoch": 0.3748, "step": 937, "tokens_per_device": 1428 }, { "epoch": 0.3748, "loss_ce": 0.7668008208274841, "loss_lvr": 1.1278069019317627, "loss_mode_switch": 0.0, "loss_total": 0.8795815110206604, "step": 937 }, { "batch_size": 1, "epoch": 0.3748, "step": 937, "tokens_per_device": 7755 }, { "epoch": 0.3748, "loss_ce": 0.09257587045431137, "loss_lvr": 0.45199963450431824, "loss_mode_switch": 0.0, "loss_total": 0.13777583837509155, "step": 937 }, { "epoch": 0.3752, "grad_norm": 1.6366755962371826, "learning_rate": 7.18732644026779e-06, "loss": 0.3645, "step": 938 }, { "batch_size": 4, "epoch": 0.3752, "step": 938, "tokens_per_device": 9144 }, { "epoch": 0.3752, "loss_ce": 0.1237335205078125, "loss_lvr": 0.765221893787384, "loss_mode_switch": 0.0, "loss_total": 0.20025572180747986, "step": 938 }, { "batch_size": 4, "epoch": 0.3752, "step": 938, "tokens_per_device": 3896 }, { "epoch": 0.3752, "loss_ce": 0.621299147605896, "loss_lvr": 1.0227634906768799, "loss_mode_switch": 0.0, "loss_total": 0.7235754728317261, "step": 938 }, { "batch_size": 1, "epoch": 0.3752, "step": 938, "tokens_per_device": 4887 }, { "epoch": 0.3752, "loss_ce": 0.1776587814092636, "loss_lvr": 0.6119060516357422, "loss_mode_switch": 0.0, "loss_total": 0.23884938657283783, "step": 938 }, { "batch_size": 4, "epoch": 0.3752, "step": 938, "tokens_per_device": 4224 }, { "epoch": 0.3752, "loss_ce": 0.6084188222885132, "loss_lvr": 0.9570969343185425, "loss_mode_switch": 0.0, "loss_total": 0.7041285037994385, "step": 938 }, { "batch_size": 4, "epoch": 0.3752, "step": 938, "tokens_per_device": 4368 }, { "epoch": 0.3752, "loss_ce": 0.25207990407943726, "loss_lvr": 0.7050897479057312, "loss_mode_switch": 0.0, "loss_total": 0.32258889079093933, "step": 938 }, { "batch_size": 4, "epoch": 0.3752, "step": 938, "tokens_per_device": 6612 }, { "epoch": 0.3752, "loss_ce": 0.1570645123720169, "loss_lvr": 0.6278697848320007, "loss_mode_switch": 0.0, "loss_total": 0.21985149383544922, "step": 938 }, { "batch_size": 4, "epoch": 0.3752, "step": 938, "tokens_per_device": 2736 }, { "epoch": 0.3752, "loss_ce": 0.5195023417472839, "loss_lvr": 0.6077979207038879, "loss_mode_switch": 0.0, "loss_total": 0.5802821516990662, "step": 938 }, { "batch_size": 1, "epoch": 0.3752, "step": 938, "tokens_per_device": 5044 }, { "epoch": 0.3752, "loss_ce": 0.009979413822293282, "loss_lvr": 0.6407750248908997, "loss_mode_switch": 0.0, "loss_total": 0.07405691593885422, "step": 938 }, { "epoch": 0.3756, "grad_norm": 1.4142146110534668, "learning_rate": 7.181499800174099e-06, "loss": 0.3377, "step": 939 }, { "batch_size": 1, "epoch": 0.3756, "step": 939, "tokens_per_device": 4874 }, { "epoch": 0.3756, "loss_ce": 0.1664828211069107, "loss_lvr": 0.6481045484542847, "loss_mode_switch": 0.0, "loss_total": 0.23129327595233917, "step": 939 }, { "batch_size": 4, "epoch": 0.3756, "step": 939, "tokens_per_device": 4636 }, { "epoch": 0.3756, "loss_ce": 0.11778785288333893, "loss_lvr": 0.9049782752990723, "loss_mode_switch": 0.0, "loss_total": 0.20828568935394287, "step": 939 }, { "batch_size": 4, "epoch": 0.3756, "step": 939, "tokens_per_device": 1544 }, { "epoch": 0.3756, "loss_ce": 0.39639562368392944, "loss_lvr": 0.930699348449707, "loss_mode_switch": 0.0, "loss_total": 0.4894655644893646, "step": 939 }, { "batch_size": 1, "epoch": 0.3756, "step": 939, "tokens_per_device": 5120 }, { "epoch": 0.3756, "loss_ce": 0.09872975200414658, "loss_lvr": 0.5636922717094421, "loss_mode_switch": 0.0, "loss_total": 0.15509897470474243, "step": 939 }, { "batch_size": 4, "epoch": 0.3756, "step": 939, "tokens_per_device": 1332 }, { "epoch": 0.3756, "loss_ce": 0.6092140078544617, "loss_lvr": 1.370408535003662, "loss_mode_switch": 0.0, "loss_total": 0.7462548613548279, "step": 939 }, { "batch_size": 1, "epoch": 0.3756, "step": 939, "tokens_per_device": 5367 }, { "epoch": 0.3756, "loss_ce": 0.07941526174545288, "loss_lvr": 0.4842817485332489, "loss_mode_switch": 0.0, "loss_total": 0.12784343957901, "step": 939 }, { "batch_size": 1, "epoch": 0.3756, "step": 939, "tokens_per_device": 5024 }, { "epoch": 0.3756, "loss_ce": 0.02410983294248581, "loss_lvr": 0.5768874287605286, "loss_mode_switch": 0.0, "loss_total": 0.08179857581853867, "step": 939 }, { "batch_size": 4, "epoch": 0.3756, "step": 939, "tokens_per_device": 4492 }, { "epoch": 0.3756, "loss_ce": 0.4382108747959137, "loss_lvr": 0.7964509725570679, "loss_mode_switch": 0.0, "loss_total": 0.5178560018539429, "step": 939 }, { "epoch": 0.376, "grad_norm": 1.5314544439315796, "learning_rate": 7.1756694988136165e-06, "loss": 0.3337, "step": 940 }, { "batch_size": 4, "epoch": 0.376, "step": 940, "tokens_per_device": 5620 }, { "epoch": 0.376, "loss_ce": 0.16553206741809845, "loss_lvr": 0.8579826354980469, "loss_mode_switch": 0.0, "loss_total": 0.25133031606674194, "step": 940 }, { "batch_size": 1, "epoch": 0.376, "step": 940, "tokens_per_device": 5223 }, { "epoch": 0.376, "loss_ce": 0.06683836877346039, "loss_lvr": 0.2664378583431244, "loss_mode_switch": 0.0, "loss_total": 0.09348215162754059, "step": 940 }, { "batch_size": 4, "epoch": 0.376, "step": 940, "tokens_per_device": 1424 }, { "epoch": 0.376, "loss_ce": 0.7172830700874329, "loss_lvr": 1.0346336364746094, "loss_mode_switch": 0.0, "loss_total": 0.8207464218139648, "step": 940 }, { "batch_size": 4, "epoch": 0.376, "step": 940, "tokens_per_device": 1804 }, { "epoch": 0.376, "loss_ce": 0.722816526889801, "loss_lvr": 2.173920154571533, "loss_mode_switch": 0.0, "loss_total": 0.9402085542678833, "step": 940 }, { "batch_size": 1, "epoch": 0.376, "step": 940, "tokens_per_device": 5104 }, { "epoch": 0.376, "loss_ce": 0.0037997837644070387, "loss_lvr": 0.4141068160533905, "loss_mode_switch": 0.0, "loss_total": 0.04521046578884125, "step": 940 }, { "batch_size": 4, "epoch": 0.376, "step": 940, "tokens_per_device": 4204 }, { "epoch": 0.376, "loss_ce": 0.20160937309265137, "loss_lvr": 0.8897533416748047, "loss_mode_switch": 0.0, "loss_total": 0.2905847132205963, "step": 940 }, { "batch_size": 4, "epoch": 0.376, "step": 940, "tokens_per_device": 1544 }, { "epoch": 0.376, "loss_ce": 0.10715734958648682, "loss_lvr": 0.9765179753303528, "loss_mode_switch": 0.0, "loss_total": 0.20480915904045105, "step": 940 }, { "batch_size": 4, "epoch": 0.376, "step": 940, "tokens_per_device": 15476 }, { "epoch": 0.376, "loss_ce": 0.07462738454341888, "loss_lvr": 0.5108429193496704, "loss_mode_switch": 0.0, "loss_total": 0.12571167945861816, "step": 940 }, { "epoch": 0.3764, "grad_norm": 1.5539848804473877, "learning_rate": 7.16983554597149e-06, "loss": 0.3497, "step": 941 }, { "batch_size": 4, "epoch": 0.3764, "step": 941, "tokens_per_device": 4588 }, { "epoch": 0.3764, "loss_ce": 0.28971561789512634, "loss_lvr": 0.8438960909843445, "loss_mode_switch": 0.0, "loss_total": 0.37410521507263184, "step": 941 }, { "batch_size": 4, "epoch": 0.3764, "step": 941, "tokens_per_device": 5240 }, { "epoch": 0.3764, "loss_ce": 0.10507379472255707, "loss_lvr": 0.8034575581550598, "loss_mode_switch": 0.0, "loss_total": 0.18541955947875977, "step": 941 }, { "batch_size": 4, "epoch": 0.3764, "step": 941, "tokens_per_device": 3332 }, { "epoch": 0.3764, "loss_ce": 0.13032200932502747, "loss_lvr": 0.9377058148384094, "loss_mode_switch": 0.0, "loss_total": 0.22409260272979736, "step": 941 }, { "batch_size": 4, "epoch": 0.3764, "step": 941, "tokens_per_device": 11068 }, { "epoch": 0.3764, "loss_ce": 0.03972608968615532, "loss_lvr": 0.9708262085914612, "loss_mode_switch": 0.0, "loss_total": 0.13680870831012726, "step": 941 }, { "batch_size": 4, "epoch": 0.3764, "step": 941, "tokens_per_device": 4576 }, { "epoch": 0.3764, "loss_ce": 0.13655351102352142, "loss_lvr": 0.6689242720603943, "loss_mode_switch": 0.0, "loss_total": 0.2034459412097931, "step": 941 }, { "batch_size": 4, "epoch": 0.3764, "step": 941, "tokens_per_device": 3928 }, { "epoch": 0.3764, "loss_ce": 0.5242341160774231, "loss_lvr": 0.983120858669281, "loss_mode_switch": 0.0, "loss_total": 0.6225461959838867, "step": 941 }, { "batch_size": 4, "epoch": 0.3764, "step": 941, "tokens_per_device": 3212 }, { "epoch": 0.3764, "loss_ce": 0.4283323884010315, "loss_lvr": 0.9113683104515076, "loss_mode_switch": 0.0, "loss_total": 0.5194692015647888, "step": 941 }, { "batch_size": 4, "epoch": 0.3764, "step": 941, "tokens_per_device": 5540 }, { "epoch": 0.3764, "loss_ce": 0.20386044681072235, "loss_lvr": 0.6948992013931274, "loss_mode_switch": 0.0, "loss_total": 0.2733503580093384, "step": 941 }, { "epoch": 0.3768, "grad_norm": 1.2627618312835693, "learning_rate": 7.163997951438986e-06, "loss": 0.3193, "step": 942 }, { "batch_size": 1, "epoch": 0.3768, "step": 942, "tokens_per_device": 5175 }, { "epoch": 0.3768, "loss_ce": 0.3426125943660736, "loss_lvr": 0.937222957611084, "loss_mode_switch": 0.0, "loss_total": 0.43633490800857544, "step": 942 }, { "batch_size": 1, "epoch": 0.3768, "step": 942, "tokens_per_device": 5146 }, { "epoch": 0.3768, "loss_ce": 0.03821604326367378, "loss_lvr": 0.2381862998008728, "loss_mode_switch": 0.0, "loss_total": 0.06203467398881912, "step": 942 }, { "batch_size": 1, "epoch": 0.3768, "step": 942, "tokens_per_device": 4970 }, { "epoch": 0.3768, "loss_ce": 0.3023805618286133, "loss_lvr": 0.4801236391067505, "loss_mode_switch": 0.0, "loss_total": 0.3503929376602173, "step": 942 }, { "batch_size": 4, "epoch": 0.3768, "step": 942, "tokens_per_device": 1440 }, { "epoch": 0.3768, "loss_ce": 0.27256953716278076, "loss_lvr": 1.0549055337905884, "loss_mode_switch": 0.0, "loss_total": 0.37806010246276855, "step": 942 }, { "batch_size": 4, "epoch": 0.3768, "step": 942, "tokens_per_device": 6108 }, { "epoch": 0.3768, "loss_ce": 0.05134689807891846, "loss_lvr": 0.6383436918258667, "loss_mode_switch": 0.0, "loss_total": 0.11518126726150513, "step": 942 }, { "batch_size": 4, "epoch": 0.3768, "step": 942, "tokens_per_device": 4864 }, { "epoch": 0.3768, "loss_ce": 0.49156653881073, "loss_lvr": 0.7497357130050659, "loss_mode_switch": 0.0, "loss_total": 0.5665401220321655, "step": 942 }, { "batch_size": 4, "epoch": 0.3768, "step": 942, "tokens_per_device": 4328 }, { "epoch": 0.3768, "loss_ce": 0.16833341121673584, "loss_lvr": 0.8237085938453674, "loss_mode_switch": 0.0, "loss_total": 0.250704288482666, "step": 942 }, { "batch_size": 4, "epoch": 0.3768, "step": 942, "tokens_per_device": 13024 }, { "epoch": 0.3768, "loss_ce": 0.4092419743537903, "loss_lvr": 1.1271579265594482, "loss_mode_switch": 0.0, "loss_total": 0.5219577550888062, "step": 942 }, { "epoch": 0.3772, "grad_norm": 1.313905119895935, "learning_rate": 7.158156725013493e-06, "loss": 0.2945, "step": 943 }, { "batch_size": 4, "epoch": 0.3772, "step": 943, "tokens_per_device": 4428 }, { "epoch": 0.3772, "loss_ce": 0.02594241499900818, "loss_lvr": 0.7683685421943665, "loss_mode_switch": 0.0, "loss_total": 0.10277926921844482, "step": 943 }, { "batch_size": 4, "epoch": 0.3772, "step": 943, "tokens_per_device": 5092 }, { "epoch": 0.3772, "loss_ce": 0.6823538541793823, "loss_lvr": 0.760034441947937, "loss_mode_switch": 0.0, "loss_total": 0.7583572864532471, "step": 943 }, { "batch_size": 4, "epoch": 0.3772, "step": 943, "tokens_per_device": 1500 }, { "epoch": 0.3772, "loss_ce": 0.4131101965904236, "loss_lvr": 1.1812618970870972, "loss_mode_switch": 0.0, "loss_total": 0.5312364101409912, "step": 943 }, { "batch_size": 1, "epoch": 0.3772, "step": 943, "tokens_per_device": 4753 }, { "epoch": 0.3772, "loss_ce": 0.05752187594771385, "loss_lvr": 0.24909022450447083, "loss_mode_switch": 0.0, "loss_total": 0.082430899143219, "step": 943 }, { "batch_size": 1, "epoch": 0.3772, "step": 943, "tokens_per_device": 4888 }, { "epoch": 0.3772, "loss_ce": 0.13832905888557434, "loss_lvr": 0.42896372079849243, "loss_mode_switch": 0.0, "loss_total": 0.18122543394565582, "step": 943 }, { "batch_size": 1, "epoch": 0.3772, "step": 943, "tokens_per_device": 5092 }, { "epoch": 0.3772, "loss_ce": 0.05018572136759758, "loss_lvr": 0.9446025490760803, "loss_mode_switch": 0.0, "loss_total": 0.14464597404003143, "step": 943 }, { "batch_size": 4, "epoch": 0.3772, "step": 943, "tokens_per_device": 4776 }, { "epoch": 0.3772, "loss_ce": 0.4883294701576233, "loss_lvr": 0.7492427229881287, "loss_mode_switch": 0.0, "loss_total": 0.5632537603378296, "step": 943 }, { "batch_size": 4, "epoch": 0.3772, "step": 943, "tokens_per_device": 3620 }, { "epoch": 0.3772, "loss_ce": 0.09579955041408539, "loss_lvr": 0.8973239660263062, "loss_mode_switch": 0.0, "loss_total": 0.18553194403648376, "step": 943 }, { "epoch": 0.3776, "grad_norm": 1.4064351320266724, "learning_rate": 7.152311876498487e-06, "loss": 0.3273, "step": 944 }, { "batch_size": 1, "epoch": 0.3776, "step": 944, "tokens_per_device": 4882 }, { "epoch": 0.3776, "loss_ce": 0.28465530276298523, "loss_lvr": 0.7696371674537659, "loss_mode_switch": 0.0, "loss_total": 0.3616190254688263, "step": 944 }, { "batch_size": 4, "epoch": 0.3776, "step": 944, "tokens_per_device": 9196 }, { "epoch": 0.3776, "loss_ce": 0.26726749539375305, "loss_lvr": 0.933855414390564, "loss_mode_switch": 0.0, "loss_total": 0.3606530427932739, "step": 944 }, { "batch_size": 4, "epoch": 0.3776, "step": 944, "tokens_per_device": 4316 }, { "epoch": 0.3776, "loss_ce": 0.368712842464447, "loss_lvr": 1.1369267702102661, "loss_mode_switch": 0.0, "loss_total": 0.48240551352500916, "step": 944 }, { "batch_size": 4, "epoch": 0.3776, "step": 944, "tokens_per_device": 8380 }, { "epoch": 0.3776, "loss_ce": 0.03499918803572655, "loss_lvr": 1.1144359111785889, "loss_mode_switch": 0.0, "loss_total": 0.14644278585910797, "step": 944 }, { "batch_size": 1, "epoch": 0.3776, "step": 944, "tokens_per_device": 4820 }, { "epoch": 0.3776, "loss_ce": 0.07346083223819733, "loss_lvr": 0.4476044774055481, "loss_mode_switch": 0.0, "loss_total": 0.11822128295898438, "step": 944 }, { "batch_size": 4, "epoch": 0.3776, "step": 944, "tokens_per_device": 2604 }, { "epoch": 0.3776, "loss_ce": 0.24088209867477417, "loss_lvr": 1.0581188201904297, "loss_mode_switch": 0.0, "loss_total": 0.3466939926147461, "step": 944 }, { "batch_size": 4, "epoch": 0.3776, "step": 944, "tokens_per_device": 5336 }, { "epoch": 0.3776, "loss_ce": 0.16520647704601288, "loss_lvr": 0.8791197538375854, "loss_mode_switch": 0.0, "loss_total": 0.25311845541000366, "step": 944 }, { "batch_size": 1, "epoch": 0.3776, "step": 944, "tokens_per_device": 7470 }, { "epoch": 0.3776, "loss_ce": 0.0008094938821159303, "loss_lvr": 0.4138113856315613, "loss_mode_switch": 0.0, "loss_total": 0.042190633714199066, "step": 944 }, { "epoch": 0.378, "grad_norm": 1.3408132791519165, "learning_rate": 7.14646341570353e-06, "loss": 0.2969, "step": 945 }, { "batch_size": 4, "epoch": 0.378, "step": 945, "tokens_per_device": 4348 }, { "epoch": 0.378, "loss_ce": 0.45464909076690674, "loss_lvr": 0.7291315793991089, "loss_mode_switch": 0.0, "loss_total": 0.5275622606277466, "step": 945 }, { "batch_size": 4, "epoch": 0.378, "step": 945, "tokens_per_device": 7660 }, { "epoch": 0.378, "loss_ce": 0.2358250766992569, "loss_lvr": 0.4578627943992615, "loss_mode_switch": 0.0, "loss_total": 0.2816113531589508, "step": 945 }, { "batch_size": 4, "epoch": 0.378, "step": 945, "tokens_per_device": 4552 }, { "epoch": 0.378, "loss_ce": 0.09685904532670975, "loss_lvr": 0.9033864140510559, "loss_mode_switch": 0.0, "loss_total": 0.18719768524169922, "step": 945 }, { "batch_size": 4, "epoch": 0.378, "step": 945, "tokens_per_device": 1648 }, { "epoch": 0.378, "loss_ce": 0.2966800928115845, "loss_lvr": 0.8730054497718811, "loss_mode_switch": 0.0, "loss_total": 0.3839806318283081, "step": 945 }, { "batch_size": 4, "epoch": 0.378, "step": 945, "tokens_per_device": 2624 }, { "epoch": 0.378, "loss_ce": 0.289179265499115, "loss_lvr": 1.1438783407211304, "loss_mode_switch": 0.0, "loss_total": 0.4035671055316925, "step": 945 }, { "batch_size": 4, "epoch": 0.378, "step": 945, "tokens_per_device": 2628 }, { "epoch": 0.378, "loss_ce": 0.1273663491010666, "loss_lvr": 0.9477225542068481, "loss_mode_switch": 0.0, "loss_total": 0.22213861346244812, "step": 945 }, { "batch_size": 4, "epoch": 0.378, "step": 945, "tokens_per_device": 5144 }, { "epoch": 0.378, "loss_ce": 0.28831562399864197, "loss_lvr": 0.7166017293930054, "loss_mode_switch": 0.0, "loss_total": 0.35997581481933594, "step": 945 }, { "batch_size": 4, "epoch": 0.378, "step": 945, "tokens_per_device": 2536 }, { "epoch": 0.378, "loss_ce": 0.6625351309776306, "loss_lvr": 1.1168063879013062, "loss_mode_switch": 0.0, "loss_total": 0.7742157578468323, "step": 945 }, { "epoch": 0.3784, "grad_norm": 1.3434573411941528, "learning_rate": 7.14061135244424e-06, "loss": 0.2989, "step": 946 }, { "batch_size": 4, "epoch": 0.3784, "step": 946, "tokens_per_device": 1864 }, { "epoch": 0.3784, "loss_ce": 0.14941179752349854, "loss_lvr": 1.0074342489242554, "loss_mode_switch": 0.0, "loss_total": 0.2501552104949951, "step": 946 }, { "batch_size": 1, "epoch": 0.3784, "step": 946, "tokens_per_device": 7798 }, { "epoch": 0.3784, "loss_ce": 0.004326855298131704, "loss_lvr": 0.5943617224693298, "loss_mode_switch": 0.0, "loss_total": 0.06376302987337112, "step": 946 }, { "batch_size": 1, "epoch": 0.3784, "step": 946, "tokens_per_device": 5117 }, { "epoch": 0.3784, "loss_ce": 0.1867915838956833, "loss_lvr": 0.5607864856719971, "loss_mode_switch": 0.0, "loss_total": 0.2428702414035797, "step": 946 }, { "batch_size": 1, "epoch": 0.3784, "step": 946, "tokens_per_device": 4887 }, { "epoch": 0.3784, "loss_ce": 0.007919485680758953, "loss_lvr": 0.3352661728858948, "loss_mode_switch": 0.0, "loss_total": 0.041446104645729065, "step": 946 }, { "batch_size": 4, "epoch": 0.3784, "step": 946, "tokens_per_device": 4212 }, { "epoch": 0.3784, "loss_ce": 0.012426702305674553, "loss_lvr": 1.0515936613082886, "loss_mode_switch": 0.0, "loss_total": 0.11758606880903244, "step": 946 }, { "batch_size": 1, "epoch": 0.3784, "step": 946, "tokens_per_device": 4977 }, { "epoch": 0.3784, "loss_ce": 0.0924900472164154, "loss_lvr": 0.2277340441942215, "loss_mode_switch": 0.0, "loss_total": 0.1152634546160698, "step": 946 }, { "batch_size": 4, "epoch": 0.3784, "step": 946, "tokens_per_device": 5732 }, { "epoch": 0.3784, "loss_ce": 0.034128475934267044, "loss_lvr": 0.7317323088645935, "loss_mode_switch": 0.0, "loss_total": 0.10730171203613281, "step": 946 }, { "batch_size": 4, "epoch": 0.3784, "step": 946, "tokens_per_device": 1280 }, { "epoch": 0.3784, "loss_ce": 0.8134105801582336, "loss_lvr": 1.1734930276870728, "loss_mode_switch": 0.0, "loss_total": 0.9307599067687988, "step": 946 }, { "epoch": 0.3788, "grad_norm": 1.3820440769195557, "learning_rate": 7.134755696542286e-06, "loss": 0.296, "step": 947 }, { "batch_size": 4, "epoch": 0.3788, "step": 947, "tokens_per_device": 4376 }, { "epoch": 0.3788, "loss_ce": 0.3415426015853882, "loss_lvr": 0.9570280909538269, "loss_mode_switch": 0.0, "loss_total": 0.4372454285621643, "step": 947 }, { "batch_size": 4, "epoch": 0.3788, "step": 947, "tokens_per_device": 3820 }, { "epoch": 0.3788, "loss_ce": 0.43494081497192383, "loss_lvr": 0.92119300365448, "loss_mode_switch": 0.0, "loss_total": 0.5270600914955139, "step": 947 }, { "batch_size": 4, "epoch": 0.3788, "step": 947, "tokens_per_device": 15248 }, { "epoch": 0.3788, "loss_ce": 0.4717271625995636, "loss_lvr": 1.408299207687378, "loss_mode_switch": 0.0, "loss_total": 0.612557053565979, "step": 947 }, { "batch_size": 4, "epoch": 0.3788, "step": 947, "tokens_per_device": 4288 }, { "epoch": 0.3788, "loss_ce": 0.4881417453289032, "loss_lvr": 1.0282361507415771, "loss_mode_switch": 0.0, "loss_total": 0.5909653902053833, "step": 947 }, { "batch_size": 1, "epoch": 0.3788, "step": 947, "tokens_per_device": 5115 }, { "epoch": 0.3788, "loss_ce": 0.0012592807179316878, "loss_lvr": 0.3937583565711975, "loss_mode_switch": 0.0, "loss_total": 0.0406351201236248, "step": 947 }, { "batch_size": 4, "epoch": 0.3788, "step": 947, "tokens_per_device": 4728 }, { "epoch": 0.3788, "loss_ce": 0.06714390963315964, "loss_lvr": 0.7836564779281616, "loss_mode_switch": 0.0, "loss_total": 0.14550955593585968, "step": 947 }, { "batch_size": 4, "epoch": 0.3788, "step": 947, "tokens_per_device": 1616 }, { "epoch": 0.3788, "loss_ce": 0.5064927935600281, "loss_lvr": 0.9173384308815002, "loss_mode_switch": 0.0, "loss_total": 0.5982266664505005, "step": 947 }, { "batch_size": 4, "epoch": 0.3788, "step": 947, "tokens_per_device": 2688 }, { "epoch": 0.3788, "loss_ce": 0.2837735116481781, "loss_lvr": 0.8518498539924622, "loss_mode_switch": 0.0, "loss_total": 0.3689585030078888, "step": 947 }, { "epoch": 0.3792, "grad_norm": 1.4615293741226196, "learning_rate": 7.128896457825364e-06, "loss": 0.3003, "step": 948 }, { "batch_size": 1, "epoch": 0.3792, "step": 948, "tokens_per_device": 4880 }, { "epoch": 0.3792, "loss_ce": 0.011166452430188656, "loss_lvr": 0.34209445118904114, "loss_mode_switch": 0.0, "loss_total": 0.045375898480415344, "step": 948 }, { "batch_size": 1, "epoch": 0.3792, "step": 948, "tokens_per_device": 4657 }, { "epoch": 0.3792, "loss_ce": 0.020570319145917892, "loss_lvr": 0.12718355655670166, "loss_mode_switch": 0.0, "loss_total": 0.03328867629170418, "step": 948 }, { "batch_size": 4, "epoch": 0.3792, "step": 948, "tokens_per_device": 11184 }, { "epoch": 0.3792, "loss_ce": 0.5725576281547546, "loss_lvr": 0.49866706132888794, "loss_mode_switch": 0.0, "loss_total": 0.6224243640899658, "step": 948 }, { "batch_size": 1, "epoch": 0.3792, "step": 948, "tokens_per_device": 4302 }, { "epoch": 0.3792, "loss_ce": 0.004799072630703449, "loss_lvr": 0.24559280276298523, "loss_mode_switch": 0.0, "loss_total": 0.029358353465795517, "step": 948 }, { "batch_size": 1, "epoch": 0.3792, "step": 948, "tokens_per_device": 5116 }, { "epoch": 0.3792, "loss_ce": 0.010457864962518215, "loss_lvr": 0.37659117579460144, "loss_mode_switch": 0.0, "loss_total": 0.04811698570847511, "step": 948 }, { "batch_size": 4, "epoch": 0.3792, "step": 948, "tokens_per_device": 6304 }, { "epoch": 0.3792, "loss_ce": 0.06183129549026489, "loss_lvr": 0.7961105108261108, "loss_mode_switch": 0.0, "loss_total": 0.14144235849380493, "step": 948 }, { "batch_size": 1, "epoch": 0.3792, "step": 948, "tokens_per_device": 4866 }, { "epoch": 0.3792, "loss_ce": 0.011295393109321594, "loss_lvr": 0.6330738067626953, "loss_mode_switch": 0.0, "loss_total": 0.07460277527570724, "step": 948 }, { "batch_size": 4, "epoch": 0.3792, "step": 948, "tokens_per_device": 5524 }, { "epoch": 0.3792, "loss_ce": 0.036828264594078064, "loss_lvr": 0.7721173167228699, "loss_mode_switch": 0.0, "loss_total": 0.11403999477624893, "step": 948 }, { "epoch": 0.3796, "grad_norm": 2.621499538421631, "learning_rate": 7.123033646127183e-06, "loss": 0.2818, "step": 949 }, { "batch_size": 4, "epoch": 0.3796, "step": 949, "tokens_per_device": 1520 }, { "epoch": 0.3796, "loss_ce": 0.5310092568397522, "loss_lvr": 1.058849811553955, "loss_mode_switch": 0.0, "loss_total": 0.6368942260742188, "step": 949 }, { "batch_size": 4, "epoch": 0.3796, "step": 949, "tokens_per_device": 5896 }, { "epoch": 0.3796, "loss_ce": 0.07105065882205963, "loss_lvr": 0.8499078154563904, "loss_mode_switch": 0.0, "loss_total": 0.1560414433479309, "step": 949 }, { "batch_size": 1, "epoch": 0.3796, "step": 949, "tokens_per_device": 5463 }, { "epoch": 0.3796, "loss_ce": 0.0039842743426561356, "loss_lvr": 0.48773425817489624, "loss_mode_switch": 0.0, "loss_total": 0.05275770276784897, "step": 949 }, { "batch_size": 4, "epoch": 0.3796, "step": 949, "tokens_per_device": 10328 }, { "epoch": 0.3796, "loss_ce": 0.015186375938355923, "loss_lvr": 0.36622947454452515, "loss_mode_switch": 0.0, "loss_total": 0.05180932581424713, "step": 949 }, { "batch_size": 4, "epoch": 0.3796, "step": 949, "tokens_per_device": 4252 }, { "epoch": 0.3796, "loss_ce": 0.34265008568763733, "loss_lvr": 1.043156623840332, "loss_mode_switch": 0.0, "loss_total": 0.446965754032135, "step": 949 }, { "batch_size": 4, "epoch": 0.3796, "step": 949, "tokens_per_device": 2712 }, { "epoch": 0.3796, "loss_ce": 0.287457138299942, "loss_lvr": 0.8107155561447144, "loss_mode_switch": 0.0, "loss_total": 0.36852869391441345, "step": 949 }, { "batch_size": 4, "epoch": 0.3796, "step": 949, "tokens_per_device": 1596 }, { "epoch": 0.3796, "loss_ce": 0.04279935359954834, "loss_lvr": 0.7813019156455994, "loss_mode_switch": 0.0, "loss_total": 0.1209295466542244, "step": 949 }, { "batch_size": 4, "epoch": 0.3796, "step": 949, "tokens_per_device": 12024 }, { "epoch": 0.3796, "loss_ce": 0.37176790833473206, "loss_lvr": 1.0184221267700195, "loss_mode_switch": 0.0, "loss_total": 0.47361013293266296, "step": 949 }, { "epoch": 0.38, "grad_norm": 1.2134214639663696, "learning_rate": 7.117167271287453e-06, "loss": 0.2725, "step": 950 }, { "batch_size": 4, "epoch": 0.38, "step": 950, "tokens_per_device": 1572 }, { "epoch": 0.38, "loss_ce": 0.12245135009288788, "loss_lvr": 1.1415377855300903, "loss_mode_switch": 0.0, "loss_total": 0.23660513758659363, "step": 950 }, { "batch_size": 4, "epoch": 0.38, "step": 950, "tokens_per_device": 4212 }, { "epoch": 0.38, "loss_ce": 0.054449792951345444, "loss_lvr": 0.9084690809249878, "loss_mode_switch": 0.0, "loss_total": 0.14529670774936676, "step": 950 }, { "batch_size": 4, "epoch": 0.38, "step": 950, "tokens_per_device": 1800 }, { "epoch": 0.38, "loss_ce": 0.04521867632865906, "loss_lvr": 0.931140124797821, "loss_mode_switch": 0.0, "loss_total": 0.13833269476890564, "step": 950 }, { "batch_size": 4, "epoch": 0.38, "step": 950, "tokens_per_device": 4392 }, { "epoch": 0.38, "loss_ce": 0.08773607760667801, "loss_lvr": 1.1277269124984741, "loss_mode_switch": 0.0, "loss_total": 0.20050877332687378, "step": 950 }, { "batch_size": 4, "epoch": 0.38, "step": 950, "tokens_per_device": 1316 }, { "epoch": 0.38, "loss_ce": 0.4534604251384735, "loss_lvr": 1.4336450099945068, "loss_mode_switch": 0.0, "loss_total": 0.5968249440193176, "step": 950 }, { "batch_size": 4, "epoch": 0.38, "step": 950, "tokens_per_device": 4936 }, { "epoch": 0.38, "loss_ce": 0.05966617912054062, "loss_lvr": 0.6351226568222046, "loss_mode_switch": 0.0, "loss_total": 0.12317844480276108, "step": 950 }, { "batch_size": 4, "epoch": 0.38, "step": 950, "tokens_per_device": 4360 }, { "epoch": 0.38, "loss_ce": 0.5361400842666626, "loss_lvr": 1.0927231311798096, "loss_mode_switch": 0.0, "loss_total": 0.6454123854637146, "step": 950 }, { "batch_size": 1, "epoch": 0.38, "step": 950, "tokens_per_device": 4835 }, { "epoch": 0.38, "loss_ce": 0.014413253404200077, "loss_lvr": 0.14986102283000946, "loss_mode_switch": 0.0, "loss_total": 0.029399355873465538, "step": 950 }, { "epoch": 0.3804, "grad_norm": 1.2027978897094727, "learning_rate": 7.111297343151854e-06, "loss": 0.3076, "step": 951 }, { "batch_size": 4, "epoch": 0.3804, "step": 951, "tokens_per_device": 4612 }, { "epoch": 0.3804, "loss_ce": 0.09788425266742706, "loss_lvr": 0.7048749923706055, "loss_mode_switch": 0.0, "loss_total": 0.1683717519044876, "step": 951 }, { "batch_size": 4, "epoch": 0.3804, "step": 951, "tokens_per_device": 1668 }, { "epoch": 0.3804, "loss_ce": 0.18482761085033417, "loss_lvr": 0.8267374634742737, "loss_mode_switch": 0.0, "loss_total": 0.2675013542175293, "step": 951 }, { "batch_size": 4, "epoch": 0.3804, "step": 951, "tokens_per_device": 7264 }, { "epoch": 0.3804, "loss_ce": 0.09601571410894394, "loss_lvr": 0.9682034850120544, "loss_mode_switch": 0.0, "loss_total": 0.19283606112003326, "step": 951 }, { "batch_size": 4, "epoch": 0.3804, "step": 951, "tokens_per_device": 5196 }, { "epoch": 0.3804, "loss_ce": 0.1089354157447815, "loss_lvr": 0.635106086730957, "loss_mode_switch": 0.0, "loss_total": 0.17244602739810944, "step": 951 }, { "batch_size": 4, "epoch": 0.3804, "step": 951, "tokens_per_device": 3912 }, { "epoch": 0.3804, "loss_ce": 0.2616514563560486, "loss_lvr": 0.9203703999519348, "loss_mode_switch": 0.0, "loss_total": 0.353688508272171, "step": 951 }, { "batch_size": 4, "epoch": 0.3804, "step": 951, "tokens_per_device": 1640 }, { "epoch": 0.3804, "loss_ce": 0.5187419056892395, "loss_lvr": 1.2685439586639404, "loss_mode_switch": 0.0, "loss_total": 0.6455963253974915, "step": 951 }, { "batch_size": 4, "epoch": 0.3804, "step": 951, "tokens_per_device": 4392 }, { "epoch": 0.3804, "loss_ce": 0.11994597315788269, "loss_lvr": 0.9275487661361694, "loss_mode_switch": 0.0, "loss_total": 0.21270084381103516, "step": 951 }, { "batch_size": 4, "epoch": 0.3804, "step": 951, "tokens_per_device": 4924 }, { "epoch": 0.3804, "loss_ce": 0.24093389511108398, "loss_lvr": 0.7493821382522583, "loss_mode_switch": 0.0, "loss_total": 0.31587210297584534, "step": 951 }, { "epoch": 0.3808, "grad_norm": 1.3243426084518433, "learning_rate": 7.105423871572043e-06, "loss": 0.2617, "step": 952 }, { "batch_size": 4, "epoch": 0.3808, "step": 952, "tokens_per_device": 4240 }, { "epoch": 0.3808, "loss_ce": 0.008239099755883217, "loss_lvr": 0.9602453112602234, "loss_mode_switch": 0.0, "loss_total": 0.10426363348960876, "step": 952 }, { "batch_size": 4, "epoch": 0.3808, "step": 952, "tokens_per_device": 11092 }, { "epoch": 0.3808, "loss_ce": 0.25272050499916077, "loss_lvr": 0.889747679233551, "loss_mode_switch": 0.0, "loss_total": 0.34169527888298035, "step": 952 }, { "batch_size": 4, "epoch": 0.3808, "step": 952, "tokens_per_device": 4488 }, { "epoch": 0.3808, "loss_ce": 0.2113637924194336, "loss_lvr": 1.103065848350525, "loss_mode_switch": 0.0, "loss_total": 0.32167038321495056, "step": 952 }, { "batch_size": 1, "epoch": 0.3808, "step": 952, "tokens_per_device": 4886 }, { "epoch": 0.3808, "loss_ce": 0.038319654762744904, "loss_lvr": 0.46758702397346497, "loss_mode_switch": 0.0, "loss_total": 0.08507835865020752, "step": 952 }, { "batch_size": 1, "epoch": 0.3808, "step": 952, "tokens_per_device": 5156 }, { "epoch": 0.3808, "loss_ce": 0.008555766195058823, "loss_lvr": 0.2778140604496002, "loss_mode_switch": 0.0, "loss_total": 0.036337174475193024, "step": 952 }, { "batch_size": 4, "epoch": 0.3808, "step": 952, "tokens_per_device": 2620 }, { "epoch": 0.3808, "loss_ce": 0.06178104132413864, "loss_lvr": 0.7260331511497498, "loss_mode_switch": 0.0, "loss_total": 0.1343843638896942, "step": 952 }, { "batch_size": 1, "epoch": 0.3808, "step": 952, "tokens_per_device": 5111 }, { "epoch": 0.3808, "loss_ce": 0.04796096310019493, "loss_lvr": 0.496768593788147, "loss_mode_switch": 0.0, "loss_total": 0.09763782471418381, "step": 952 }, { "batch_size": 4, "epoch": 0.3808, "step": 952, "tokens_per_device": 4284 }, { "epoch": 0.3808, "loss_ce": 0.1508849412202835, "loss_lvr": 1.1215715408325195, "loss_mode_switch": 0.0, "loss_total": 0.2630420923233032, "step": 952 }, { "epoch": 0.3812, "grad_norm": 1.2148255109786987, "learning_rate": 7.0995468664056135e-06, "loss": 0.2703, "step": 953 }, { "batch_size": 1, "epoch": 0.3812, "step": 953, "tokens_per_device": 4845 }, { "epoch": 0.3812, "loss_ce": 0.005040499847382307, "loss_lvr": 0.3707921802997589, "loss_mode_switch": 0.0, "loss_total": 0.042119719088077545, "step": 953 }, { "batch_size": 1, "epoch": 0.3812, "step": 953, "tokens_per_device": 4679 }, { "epoch": 0.3812, "loss_ce": 0.051731497049331665, "loss_lvr": 0.9334968328475952, "loss_mode_switch": 0.0, "loss_total": 0.14508119225502014, "step": 953 }, { "batch_size": 4, "epoch": 0.3812, "step": 953, "tokens_per_device": 5056 }, { "epoch": 0.3812, "loss_ce": 0.027472756803035736, "loss_lvr": 0.8285325169563293, "loss_mode_switch": 0.0, "loss_total": 0.11032600700855255, "step": 953 }, { "batch_size": 1, "epoch": 0.3812, "step": 953, "tokens_per_device": 4953 }, { "epoch": 0.3812, "loss_ce": 0.03466642647981644, "loss_lvr": 0.3317107558250427, "loss_mode_switch": 0.0, "loss_total": 0.06783750653266907, "step": 953 }, { "batch_size": 4, "epoch": 0.3812, "step": 953, "tokens_per_device": 4448 }, { "epoch": 0.3812, "loss_ce": 0.653003454208374, "loss_lvr": 0.9925047755241394, "loss_mode_switch": 0.0, "loss_total": 0.7522539496421814, "step": 953 }, { "batch_size": 4, "epoch": 0.3812, "step": 953, "tokens_per_device": 1416 }, { "epoch": 0.3812, "loss_ce": 0.5619854927062988, "loss_lvr": 1.0693178176879883, "loss_mode_switch": 0.0, "loss_total": 0.6689172983169556, "step": 953 }, { "batch_size": 4, "epoch": 0.3812, "step": 953, "tokens_per_device": 2768 }, { "epoch": 0.3812, "loss_ce": 0.21084833145141602, "loss_lvr": 0.680242657661438, "loss_mode_switch": 0.0, "loss_total": 0.27887260913848877, "step": 953 }, { "batch_size": 4, "epoch": 0.3812, "step": 953, "tokens_per_device": 6016 }, { "epoch": 0.3812, "loss_ce": 0.1574956625699997, "loss_lvr": 0.7706167697906494, "loss_mode_switch": 0.0, "loss_total": 0.23455733060836792, "step": 953 }, { "epoch": 0.3816, "grad_norm": 1.395438313484192, "learning_rate": 7.093666337516094e-06, "loss": 0.2788, "step": 954 }, { "batch_size": 4, "epoch": 0.3816, "step": 954, "tokens_per_device": 4556 }, { "epoch": 0.3816, "loss_ce": 0.3050075173377991, "loss_lvr": 0.8482418656349182, "loss_mode_switch": 0.0, "loss_total": 0.3898317217826843, "step": 954 }, { "batch_size": 4, "epoch": 0.3816, "step": 954, "tokens_per_device": 4884 }, { "epoch": 0.3816, "loss_ce": 0.7432773113250732, "loss_lvr": 0.8263476490974426, "loss_mode_switch": 0.0, "loss_total": 0.8259120583534241, "step": 954 }, { "batch_size": 4, "epoch": 0.3816, "step": 954, "tokens_per_device": 9944 }, { "epoch": 0.3816, "loss_ce": 0.31533583998680115, "loss_lvr": 0.5072243213653564, "loss_mode_switch": 0.0, "loss_total": 0.36605826020240784, "step": 954 }, { "batch_size": 4, "epoch": 0.3816, "step": 954, "tokens_per_device": 3972 }, { "epoch": 0.3816, "loss_ce": 0.23327726125717163, "loss_lvr": 0.7604758143424988, "loss_mode_switch": 0.0, "loss_total": 0.30932486057281494, "step": 954 }, { "batch_size": 4, "epoch": 0.3816, "step": 954, "tokens_per_device": 4424 }, { "epoch": 0.3816, "loss_ce": 0.48881882429122925, "loss_lvr": 0.9518231749534607, "loss_mode_switch": 0.0, "loss_total": 0.5840011239051819, "step": 954 }, { "batch_size": 4, "epoch": 0.3816, "step": 954, "tokens_per_device": 4644 }, { "epoch": 0.3816, "loss_ce": 0.21703237295150757, "loss_lvr": 0.8855416178703308, "loss_mode_switch": 0.0, "loss_total": 0.3055865466594696, "step": 954 }, { "batch_size": 4, "epoch": 0.3816, "step": 954, "tokens_per_device": 4552 }, { "epoch": 0.3816, "loss_ce": 0.5954484939575195, "loss_lvr": 1.0400364398956299, "loss_mode_switch": 0.0, "loss_total": 0.6994521617889404, "step": 954 }, { "batch_size": 1, "epoch": 0.3816, "step": 954, "tokens_per_device": 5040 }, { "epoch": 0.3816, "loss_ce": 0.0029053816106170416, "loss_lvr": 0.41369593143463135, "loss_mode_switch": 0.0, "loss_total": 0.04427497461438179, "step": 954 }, { "epoch": 0.382, "grad_norm": 1.4901372194290161, "learning_rate": 7.0877822947729265e-06, "loss": 0.3701, "step": 955 }, { "batch_size": 4, "epoch": 0.382, "step": 955, "tokens_per_device": 3908 }, { "epoch": 0.382, "loss_ce": 0.3756834864616394, "loss_lvr": 0.7826575040817261, "loss_mode_switch": 0.0, "loss_total": 0.4539492428302765, "step": 955 }, { "batch_size": 4, "epoch": 0.382, "step": 955, "tokens_per_device": 6084 }, { "epoch": 0.382, "loss_ce": 0.10383588075637817, "loss_lvr": 0.6818276643753052, "loss_mode_switch": 0.0, "loss_total": 0.1720186471939087, "step": 955 }, { "batch_size": 1, "epoch": 0.382, "step": 955, "tokens_per_device": 4916 }, { "epoch": 0.382, "loss_ce": 0.030033834278583527, "loss_lvr": 1.0765249729156494, "loss_mode_switch": 0.0, "loss_total": 0.1376863420009613, "step": 955 }, { "batch_size": 1, "epoch": 0.382, "step": 955, "tokens_per_device": 5187 }, { "epoch": 0.382, "loss_ce": 0.0030287716072052717, "loss_lvr": 0.6675699353218079, "loss_mode_switch": 0.0, "loss_total": 0.06978576630353928, "step": 955 }, { "batch_size": 4, "epoch": 0.382, "step": 955, "tokens_per_device": 2304 }, { "epoch": 0.382, "loss_ce": 0.5649375915527344, "loss_lvr": 0.8849127888679504, "loss_mode_switch": 0.0, "loss_total": 0.653428852558136, "step": 955 }, { "batch_size": 4, "epoch": 0.382, "step": 955, "tokens_per_device": 9504 }, { "epoch": 0.382, "loss_ce": 0.6569299697875977, "loss_lvr": 0.6706411242485046, "loss_mode_switch": 0.0, "loss_total": 0.7239940762519836, "step": 955 }, { "batch_size": 1, "epoch": 0.382, "step": 955, "tokens_per_device": 4887 }, { "epoch": 0.382, "loss_ce": 1.3677971363067627, "loss_lvr": 0.6796321868896484, "loss_mode_switch": 0.0, "loss_total": 1.4357603788375854, "step": 955 }, { "batch_size": 4, "epoch": 0.382, "step": 955, "tokens_per_device": 3848 }, { "epoch": 0.382, "loss_ce": 0.18339136242866516, "loss_lvr": 1.142805814743042, "loss_mode_switch": 0.0, "loss_total": 0.29767194390296936, "step": 955 }, { "epoch": 0.3824, "grad_norm": 1.4273613691329956, "learning_rate": 7.081894748051451e-06, "loss": 0.3366, "step": 956 }, { "batch_size": 4, "epoch": 0.3824, "step": 956, "tokens_per_device": 6112 }, { "epoch": 0.3824, "loss_ce": 0.020896606147289276, "loss_lvr": 0.6225602030754089, "loss_mode_switch": 0.0, "loss_total": 0.08315262198448181, "step": 956 }, { "batch_size": 4, "epoch": 0.3824, "step": 956, "tokens_per_device": 6952 }, { "epoch": 0.3824, "loss_ce": 0.46995049715042114, "loss_lvr": 0.888151228427887, "loss_mode_switch": 0.0, "loss_total": 0.5587656497955322, "step": 956 }, { "batch_size": 4, "epoch": 0.3824, "step": 956, "tokens_per_device": 2524 }, { "epoch": 0.3824, "loss_ce": 0.10861659795045853, "loss_lvr": 1.11323082447052, "loss_mode_switch": 0.0, "loss_total": 0.2199396789073944, "step": 956 }, { "batch_size": 4, "epoch": 0.3824, "step": 956, "tokens_per_device": 3952 }, { "epoch": 0.3824, "loss_ce": 0.3101961016654968, "loss_lvr": 1.4825778007507324, "loss_mode_switch": 0.0, "loss_total": 0.458453893661499, "step": 956 }, { "batch_size": 4, "epoch": 0.3824, "step": 956, "tokens_per_device": 1328 }, { "epoch": 0.3824, "loss_ce": 0.6063469052314758, "loss_lvr": 0.9903424382209778, "loss_mode_switch": 0.0, "loss_total": 0.7053811550140381, "step": 956 }, { "batch_size": 4, "epoch": 0.3824, "step": 956, "tokens_per_device": 4736 }, { "epoch": 0.3824, "loss_ce": 0.2513464689254761, "loss_lvr": 0.8999412059783936, "loss_mode_switch": 0.0, "loss_total": 0.3413406014442444, "step": 956 }, { "batch_size": 4, "epoch": 0.3824, "step": 956, "tokens_per_device": 5336 }, { "epoch": 0.3824, "loss_ce": 0.650136411190033, "loss_lvr": 1.0763583183288574, "loss_mode_switch": 0.0, "loss_total": 0.7577722668647766, "step": 956 }, { "batch_size": 4, "epoch": 0.3824, "step": 956, "tokens_per_device": 4444 }, { "epoch": 0.3824, "loss_ce": 0.12518395483493805, "loss_lvr": 0.9232175350189209, "loss_mode_switch": 0.0, "loss_total": 0.21750570833683014, "step": 956 }, { "epoch": 0.3828, "grad_norm": 1.508607029914856, "learning_rate": 7.0760037072328855e-06, "loss": 0.3505, "step": 957 }, { "batch_size": 4, "epoch": 0.3828, "step": 957, "tokens_per_device": 1608 }, { "epoch": 0.3828, "loss_ce": 0.2821626365184784, "loss_lvr": 0.8676705360412598, "loss_mode_switch": 0.0, "loss_total": 0.3689296841621399, "step": 957 }, { "batch_size": 4, "epoch": 0.3828, "step": 957, "tokens_per_device": 4388 }, { "epoch": 0.3828, "loss_ce": 0.18426933884620667, "loss_lvr": 0.903678297996521, "loss_mode_switch": 0.0, "loss_total": 0.2746371626853943, "step": 957 }, { "batch_size": 4, "epoch": 0.3828, "step": 957, "tokens_per_device": 3856 }, { "epoch": 0.3828, "loss_ce": 0.10719339549541473, "loss_lvr": 0.9694320559501648, "loss_mode_switch": 0.0, "loss_total": 0.20413661003112793, "step": 957 }, { "batch_size": 4, "epoch": 0.3828, "step": 957, "tokens_per_device": 3752 }, { "epoch": 0.3828, "loss_ce": 0.576008677482605, "loss_lvr": 1.2386205196380615, "loss_mode_switch": 0.0, "loss_total": 0.6998707056045532, "step": 957 }, { "batch_size": 4, "epoch": 0.3828, "step": 957, "tokens_per_device": 2640 }, { "epoch": 0.3828, "loss_ce": 0.6787137389183044, "loss_lvr": 0.9104401469230652, "loss_mode_switch": 0.0, "loss_total": 0.7697577476501465, "step": 957 }, { "batch_size": 4, "epoch": 0.3828, "step": 957, "tokens_per_device": 12484 }, { "epoch": 0.3828, "loss_ce": 0.22691410779953003, "loss_lvr": 0.7780026197433472, "loss_mode_switch": 0.0, "loss_total": 0.3047143816947937, "step": 957 }, { "batch_size": 1, "epoch": 0.3828, "step": 957, "tokens_per_device": 4871 }, { "epoch": 0.3828, "loss_ce": 0.0004147648869547993, "loss_lvr": 0.271562397480011, "loss_mode_switch": 0.0, "loss_total": 0.027571003884077072, "step": 957 }, { "batch_size": 1, "epoch": 0.3828, "step": 957, "tokens_per_device": 5161 }, { "epoch": 0.3828, "loss_ce": 0.00268025160767138, "loss_lvr": 0.5389227867126465, "loss_mode_switch": 0.0, "loss_total": 0.05657253414392471, "step": 957 }, { "epoch": 0.3832, "grad_norm": 1.2944358587265015, "learning_rate": 7.070109182204317e-06, "loss": 0.2964, "step": 958 }, { "batch_size": 1, "epoch": 0.3832, "step": 958, "tokens_per_device": 5123 }, { "epoch": 0.3832, "loss_ce": 0.003544354811310768, "loss_lvr": 0.4372398257255554, "loss_mode_switch": 0.0, "loss_total": 0.0472683385014534, "step": 958 }, { "batch_size": 4, "epoch": 0.3832, "step": 958, "tokens_per_device": 6852 }, { "epoch": 0.3832, "loss_ce": 0.43845081329345703, "loss_lvr": 0.85899418592453, "loss_mode_switch": 0.0, "loss_total": 0.5243502259254456, "step": 958 }, { "batch_size": 1, "epoch": 0.3832, "step": 958, "tokens_per_device": 4854 }, { "epoch": 0.3832, "loss_ce": 0.00825033150613308, "loss_lvr": 0.26485756039619446, "loss_mode_switch": 0.0, "loss_total": 0.034736089408397675, "step": 958 }, { "batch_size": 4, "epoch": 0.3832, "step": 958, "tokens_per_device": 15604 }, { "epoch": 0.3832, "loss_ce": 0.033953506499528885, "loss_lvr": 0.7819328904151917, "loss_mode_switch": 0.0, "loss_total": 0.11214679479598999, "step": 958 }, { "batch_size": 1, "epoch": 0.3832, "step": 958, "tokens_per_device": 5110 }, { "epoch": 0.3832, "loss_ce": 0.0010842481860890985, "loss_lvr": 0.2882941961288452, "loss_mode_switch": 0.0, "loss_total": 0.02991366758942604, "step": 958 }, { "batch_size": 4, "epoch": 0.3832, "step": 958, "tokens_per_device": 4344 }, { "epoch": 0.3832, "loss_ce": 0.05712907761335373, "loss_lvr": 0.8783106207847595, "loss_mode_switch": 0.0, "loss_total": 0.14496013522148132, "step": 958 }, { "batch_size": 4, "epoch": 0.3832, "step": 958, "tokens_per_device": 4720 }, { "epoch": 0.3832, "loss_ce": 0.1522555649280548, "loss_lvr": 0.9891676902770996, "loss_mode_switch": 0.0, "loss_total": 0.25117233395576477, "step": 958 }, { "batch_size": 4, "epoch": 0.3832, "step": 958, "tokens_per_device": 4200 }, { "epoch": 0.3832, "loss_ce": 0.036053624004125595, "loss_lvr": 1.0416449308395386, "loss_mode_switch": 0.0, "loss_total": 0.140218123793602, "step": 958 }, { "epoch": 0.3836, "grad_norm": 1.3311803340911865, "learning_rate": 7.064211182858673e-06, "loss": 0.2589, "step": 959 }, { "batch_size": 4, "epoch": 0.3836, "step": 959, "tokens_per_device": 5084 }, { "epoch": 0.3836, "loss_ce": 0.10948633402585983, "loss_lvr": 0.9863694906234741, "loss_mode_switch": 0.0, "loss_total": 0.20812328159809113, "step": 959 }, { "batch_size": 4, "epoch": 0.3836, "step": 959, "tokens_per_device": 5740 }, { "epoch": 0.3836, "loss_ce": 0.535064160823822, "loss_lvr": 0.7944576144218445, "loss_mode_switch": 0.0, "loss_total": 0.6145099401473999, "step": 959 }, { "batch_size": 1, "epoch": 0.3836, "step": 959, "tokens_per_device": 4930 }, { "epoch": 0.3836, "loss_ce": 0.32542121410369873, "loss_lvr": 0.804326057434082, "loss_mode_switch": 0.0, "loss_total": 0.405853807926178, "step": 959 }, { "batch_size": 4, "epoch": 0.3836, "step": 959, "tokens_per_device": 4240 }, { "epoch": 0.3836, "loss_ce": 0.7766751050949097, "loss_lvr": 0.9404098391532898, "loss_mode_switch": 0.0, "loss_total": 0.8707160949707031, "step": 959 }, { "batch_size": 4, "epoch": 0.3836, "step": 959, "tokens_per_device": 6624 }, { "epoch": 0.3836, "loss_ce": 0.05631104111671448, "loss_lvr": 0.8767356276512146, "loss_mode_switch": 0.0, "loss_total": 0.1439846158027649, "step": 959 }, { "batch_size": 4, "epoch": 0.3836, "step": 959, "tokens_per_device": 3864 }, { "epoch": 0.3836, "loss_ce": 0.06550620496273041, "loss_lvr": 0.9297788739204407, "loss_mode_switch": 0.0, "loss_total": 0.1584841012954712, "step": 959 }, { "batch_size": 1, "epoch": 0.3836, "step": 959, "tokens_per_device": 4879 }, { "epoch": 0.3836, "loss_ce": 0.40440434217453003, "loss_lvr": 0.212471604347229, "loss_mode_switch": 0.0, "loss_total": 0.425651490688324, "step": 959 }, { "batch_size": 4, "epoch": 0.3836, "step": 959, "tokens_per_device": 1840 }, { "epoch": 0.3836, "loss_ce": 0.4112919270992279, "loss_lvr": 0.9790741205215454, "loss_mode_switch": 0.0, "loss_total": 0.509199321269989, "step": 959 }, { "epoch": 0.384, "grad_norm": 1.5291814804077148, "learning_rate": 7.05830971909472e-06, "loss": 0.3121, "step": 960 }, { "batch_size": 4, "epoch": 0.384, "step": 960, "tokens_per_device": 10492 }, { "epoch": 0.384, "loss_ce": 0.24563424289226532, "loss_lvr": 1.0621168613433838, "loss_mode_switch": 0.0, "loss_total": 0.351845920085907, "step": 960 }, { "batch_size": 4, "epoch": 0.384, "step": 960, "tokens_per_device": 9816 }, { "epoch": 0.384, "loss_ce": 0.3564160168170929, "loss_lvr": 1.0323431491851807, "loss_mode_switch": 0.0, "loss_total": 0.45965033769607544, "step": 960 }, { "batch_size": 4, "epoch": 0.384, "step": 960, "tokens_per_device": 5136 }, { "epoch": 0.384, "loss_ce": 0.059018924832344055, "loss_lvr": 0.7964982390403748, "loss_mode_switch": 0.0, "loss_total": 0.1386687457561493, "step": 960 }, { "batch_size": 1, "epoch": 0.384, "step": 960, "tokens_per_device": 5066 }, { "epoch": 0.384, "loss_ce": 0.11384725570678711, "loss_lvr": 0.31053444743156433, "loss_mode_switch": 0.0, "loss_total": 0.14490069448947906, "step": 960 }, { "batch_size": 4, "epoch": 0.384, "step": 960, "tokens_per_device": 1556 }, { "epoch": 0.384, "loss_ce": 0.6058789491653442, "loss_lvr": 1.0568199157714844, "loss_mode_switch": 0.0, "loss_total": 0.7115609645843506, "step": 960 }, { "batch_size": 4, "epoch": 0.384, "step": 960, "tokens_per_device": 5296 }, { "epoch": 0.384, "loss_ce": 0.1093885526061058, "loss_lvr": 0.846405029296875, "loss_mode_switch": 0.0, "loss_total": 0.1940290629863739, "step": 960 }, { "batch_size": 4, "epoch": 0.384, "step": 960, "tokens_per_device": 4424 }, { "epoch": 0.384, "loss_ce": 0.12275396287441254, "loss_lvr": 2.1855366230010986, "loss_mode_switch": 0.0, "loss_total": 0.3413076400756836, "step": 960 }, { "batch_size": 1, "epoch": 0.384, "step": 960, "tokens_per_device": 4889 }, { "epoch": 0.384, "loss_ce": 0.3785878121852875, "loss_lvr": 1.4693374633789062, "loss_mode_switch": 0.0, "loss_total": 0.5255215764045715, "step": 960 }, { "epoch": 0.3844, "grad_norm": 1.4941037893295288, "learning_rate": 7.0524048008170345e-06, "loss": 0.3233, "step": 961 }, { "batch_size": 4, "epoch": 0.3844, "step": 961, "tokens_per_device": 4236 }, { "epoch": 0.3844, "loss_ce": 0.07377681136131287, "loss_lvr": 0.9574885368347168, "loss_mode_switch": 0.0, "loss_total": 0.16952566802501678, "step": 961 }, { "batch_size": 4, "epoch": 0.3844, "step": 961, "tokens_per_device": 3780 }, { "epoch": 0.3844, "loss_ce": 0.12631842494010925, "loss_lvr": 1.126827597618103, "loss_mode_switch": 0.0, "loss_total": 0.23900118470191956, "step": 961 }, { "batch_size": 4, "epoch": 0.3844, "step": 961, "tokens_per_device": 4032 }, { "epoch": 0.3844, "loss_ce": 0.20461750030517578, "loss_lvr": 0.9236095547676086, "loss_mode_switch": 0.0, "loss_total": 0.2969784736633301, "step": 961 }, { "batch_size": 4, "epoch": 0.3844, "step": 961, "tokens_per_device": 1596 }, { "epoch": 0.3844, "loss_ce": 0.2530430853366852, "loss_lvr": 0.9186651706695557, "loss_mode_switch": 0.0, "loss_total": 0.3449096083641052, "step": 961 }, { "batch_size": 4, "epoch": 0.3844, "step": 961, "tokens_per_device": 3240 }, { "epoch": 0.3844, "loss_ce": 0.27236250042915344, "loss_lvr": 0.9844688177108765, "loss_mode_switch": 0.0, "loss_total": 0.3708093762397766, "step": 961 }, { "batch_size": 4, "epoch": 0.3844, "step": 961, "tokens_per_device": 3812 }, { "epoch": 0.3844, "loss_ce": 0.25982293486595154, "loss_lvr": 0.91255122423172, "loss_mode_switch": 0.0, "loss_total": 0.351078063249588, "step": 961 }, { "batch_size": 4, "epoch": 0.3844, "step": 961, "tokens_per_device": 4432 }, { "epoch": 0.3844, "loss_ce": 0.4919441044330597, "loss_lvr": 0.5149883031845093, "loss_mode_switch": 0.0, "loss_total": 0.543442964553833, "step": 961 }, { "batch_size": 4, "epoch": 0.3844, "step": 961, "tokens_per_device": 4576 }, { "epoch": 0.3844, "loss_ce": 0.3048003613948822, "loss_lvr": 0.8948934078216553, "loss_mode_switch": 0.0, "loss_total": 0.39428970217704773, "step": 961 }, { "epoch": 0.3848, "grad_norm": 1.8618065118789673, "learning_rate": 7.04649643793599e-06, "loss": 0.3637, "step": 962 }, { "batch_size": 1, "epoch": 0.3848, "step": 962, "tokens_per_device": 5136 }, { "epoch": 0.3848, "loss_ce": 0.01609499566257, "loss_lvr": 0.35200536251068115, "loss_mode_switch": 0.0, "loss_total": 0.051295533776283264, "step": 962 }, { "batch_size": 4, "epoch": 0.3848, "step": 962, "tokens_per_device": 3756 }, { "epoch": 0.3848, "loss_ce": 0.2833843529224396, "loss_lvr": 0.892638087272644, "loss_mode_switch": 0.0, "loss_total": 0.3726481795310974, "step": 962 }, { "batch_size": 4, "epoch": 0.3848, "step": 962, "tokens_per_device": 1320 }, { "epoch": 0.3848, "loss_ce": 0.4053246080875397, "loss_lvr": 0.9833971858024597, "loss_mode_switch": 0.0, "loss_total": 0.5036643147468567, "step": 962 }, { "batch_size": 1, "epoch": 0.3848, "step": 962, "tokens_per_device": 4881 }, { "epoch": 0.3848, "loss_ce": 0.06548845767974854, "loss_lvr": 0.5295630097389221, "loss_mode_switch": 0.0, "loss_total": 0.11844475567340851, "step": 962 }, { "batch_size": 4, "epoch": 0.3848, "step": 962, "tokens_per_device": 5856 }, { "epoch": 0.3848, "loss_ce": 0.40821436047554016, "loss_lvr": 0.872154712677002, "loss_mode_switch": 0.0, "loss_total": 0.4954298436641693, "step": 962 }, { "batch_size": 4, "epoch": 0.3848, "step": 962, "tokens_per_device": 6604 }, { "epoch": 0.3848, "loss_ce": 0.42596378922462463, "loss_lvr": 0.8791854381561279, "loss_mode_switch": 0.0, "loss_total": 0.5138823390007019, "step": 962 }, { "batch_size": 4, "epoch": 0.3848, "step": 962, "tokens_per_device": 4428 }, { "epoch": 0.3848, "loss_ce": 0.29143816232681274, "loss_lvr": 0.8019698262214661, "loss_mode_switch": 0.0, "loss_total": 0.3716351389884949, "step": 962 }, { "batch_size": 4, "epoch": 0.3848, "step": 962, "tokens_per_device": 4364 }, { "epoch": 0.3848, "loss_ce": 0.10800409317016602, "loss_lvr": 1.0420221090316772, "loss_mode_switch": 0.0, "loss_total": 0.21220630407333374, "step": 962 }, { "epoch": 0.3852, "grad_norm": 1.2808072566986084, "learning_rate": 7.040584640367744e-06, "loss": 0.2991, "step": 963 }, { "batch_size": 1, "epoch": 0.3852, "step": 963, "tokens_per_device": 5596 }, { "epoch": 0.3852, "loss_ce": 0.023534134030342102, "loss_lvr": 0.6947070956230164, "loss_mode_switch": 0.0, "loss_total": 0.09300484508275986, "step": 963 }, { "batch_size": 4, "epoch": 0.3852, "step": 963, "tokens_per_device": 8716 }, { "epoch": 0.3852, "loss_ce": 0.003050216706469655, "loss_lvr": 0.4213196933269501, "loss_mode_switch": 0.0, "loss_total": 0.04518218711018562, "step": 963 }, { "batch_size": 4, "epoch": 0.3852, "step": 963, "tokens_per_device": 4388 }, { "epoch": 0.3852, "loss_ce": 0.05057526379823685, "loss_lvr": 0.7397573590278625, "loss_mode_switch": 0.0, "loss_total": 0.12455099821090698, "step": 963 }, { "batch_size": 1, "epoch": 0.3852, "step": 963, "tokens_per_device": 5624 }, { "epoch": 0.3852, "loss_ce": 0.1190190315246582, "loss_lvr": 0.3953315317630768, "loss_mode_switch": 0.0, "loss_total": 0.15855218470096588, "step": 963 }, { "batch_size": 1, "epoch": 0.3852, "step": 963, "tokens_per_device": 4897 }, { "epoch": 0.3852, "loss_ce": 0.22250786423683167, "loss_lvr": 0.32092252373695374, "loss_mode_switch": 0.0, "loss_total": 0.2546001076698303, "step": 963 }, { "batch_size": 4, "epoch": 0.3852, "step": 963, "tokens_per_device": 5704 }, { "epoch": 0.3852, "loss_ce": 0.023766180500388145, "loss_lvr": 0.8576704859733582, "loss_mode_switch": 0.0, "loss_total": 0.10953323543071747, "step": 963 }, { "batch_size": 4, "epoch": 0.3852, "step": 963, "tokens_per_device": 5648 }, { "epoch": 0.3852, "loss_ce": 0.019559817388653755, "loss_lvr": 0.7996055483818054, "loss_mode_switch": 0.0, "loss_total": 0.09952037036418915, "step": 963 }, { "batch_size": 1, "epoch": 0.3852, "step": 963, "tokens_per_device": 5126 }, { "epoch": 0.3852, "loss_ce": 0.1330062448978424, "loss_lvr": 0.7189633846282959, "loss_mode_switch": 0.0, "loss_total": 0.20490258932113647, "step": 963 }, { "epoch": 0.3856, "grad_norm": 1.1442062854766846, "learning_rate": 7.034669418034217e-06, "loss": 0.2571, "step": 964 }, { "batch_size": 4, "epoch": 0.3856, "step": 964, "tokens_per_device": 3612 }, { "epoch": 0.3856, "loss_ce": 0.07978878170251846, "loss_lvr": 0.7855080962181091, "loss_mode_switch": 0.0, "loss_total": 0.15833958983421326, "step": 964 }, { "batch_size": 1, "epoch": 0.3856, "step": 964, "tokens_per_device": 4954 }, { "epoch": 0.3856, "loss_ce": 0.045431315898895264, "loss_lvr": 0.29537051916122437, "loss_mode_switch": 0.0, "loss_total": 0.0749683678150177, "step": 964 }, { "batch_size": 1, "epoch": 0.3856, "step": 964, "tokens_per_device": 5022 }, { "epoch": 0.3856, "loss_ce": 0.0017713458510115743, "loss_lvr": 0.6735469698905945, "loss_mode_switch": 0.0, "loss_total": 0.06912604719400406, "step": 964 }, { "batch_size": 4, "epoch": 0.3856, "step": 964, "tokens_per_device": 4044 }, { "epoch": 0.3856, "loss_ce": 0.02600872702896595, "loss_lvr": 0.6667754650115967, "loss_mode_switch": 0.0, "loss_total": 0.09268627315759659, "step": 964 }, { "batch_size": 4, "epoch": 0.3856, "step": 964, "tokens_per_device": 4388 }, { "epoch": 0.3856, "loss_ce": 0.009440609253942966, "loss_lvr": 0.8642387390136719, "loss_mode_switch": 0.0, "loss_total": 0.09586448222398758, "step": 964 }, { "batch_size": 4, "epoch": 0.3856, "step": 964, "tokens_per_device": 9984 }, { "epoch": 0.3856, "loss_ce": 0.4875476062297821, "loss_lvr": 0.7459403276443481, "loss_mode_switch": 0.0, "loss_total": 0.5621416568756104, "step": 964 }, { "batch_size": 4, "epoch": 0.3856, "step": 964, "tokens_per_device": 5696 }, { "epoch": 0.3856, "loss_ce": 0.7850415110588074, "loss_lvr": 1.0768775939941406, "loss_mode_switch": 0.0, "loss_total": 0.8927292823791504, "step": 964 }, { "batch_size": 4, "epoch": 0.3856, "step": 964, "tokens_per_device": 4260 }, { "epoch": 0.3856, "loss_ce": 0.04143473133444786, "loss_lvr": 0.7074650526046753, "loss_mode_switch": 0.0, "loss_total": 0.11218123137950897, "step": 964 }, { "epoch": 0.386, "grad_norm": 1.3617382049560547, "learning_rate": 7.028750780863078e-06, "loss": 0.3331, "step": 965 }, { "batch_size": 4, "epoch": 0.386, "step": 965, "tokens_per_device": 4892 }, { "epoch": 0.386, "loss_ce": 0.22506514191627502, "loss_lvr": 0.8791080713272095, "loss_mode_switch": 0.0, "loss_total": 0.3129759430885315, "step": 965 }, { "batch_size": 1, "epoch": 0.386, "step": 965, "tokens_per_device": 5110 }, { "epoch": 0.386, "loss_ce": 0.004367304965853691, "loss_lvr": 0.9960880875587463, "loss_mode_switch": 0.0, "loss_total": 0.10397611558437347, "step": 965 }, { "batch_size": 4, "epoch": 0.386, "step": 965, "tokens_per_device": 4532 }, { "epoch": 0.386, "loss_ce": 0.3367519676685333, "loss_lvr": 0.9875277876853943, "loss_mode_switch": 0.0, "loss_total": 0.4355047345161438, "step": 965 }, { "batch_size": 4, "epoch": 0.386, "step": 965, "tokens_per_device": 2544 }, { "epoch": 0.386, "loss_ce": 0.6362699866294861, "loss_lvr": 1.0535919666290283, "loss_mode_switch": 0.0, "loss_total": 0.7416291832923889, "step": 965 }, { "batch_size": 4, "epoch": 0.386, "step": 965, "tokens_per_device": 11324 }, { "epoch": 0.386, "loss_ce": 0.11730294674634933, "loss_lvr": 0.8257790803909302, "loss_mode_switch": 0.0, "loss_total": 0.19988085329532623, "step": 965 }, { "batch_size": 1, "epoch": 0.386, "step": 965, "tokens_per_device": 5139 }, { "epoch": 0.386, "loss_ce": 0.15513354539871216, "loss_lvr": 1.0025389194488525, "loss_mode_switch": 0.0, "loss_total": 0.25538742542266846, "step": 965 }, { "batch_size": 1, "epoch": 0.386, "step": 965, "tokens_per_device": 4885 }, { "epoch": 0.386, "loss_ce": 0.017174871638417244, "loss_lvr": 0.4735100567340851, "loss_mode_switch": 0.0, "loss_total": 0.06452587991952896, "step": 965 }, { "batch_size": 4, "epoch": 0.386, "step": 965, "tokens_per_device": 3888 }, { "epoch": 0.386, "loss_ce": 0.01118754968047142, "loss_lvr": 1.6408295631408691, "loss_mode_switch": 0.0, "loss_total": 0.17527051270008087, "step": 965 }, { "epoch": 0.3864, "grad_norm": 1.2458350658416748, "learning_rate": 7.022828738787725e-06, "loss": 0.2905, "step": 966 }, { "batch_size": 4, "epoch": 0.3864, "step": 966, "tokens_per_device": 8632 }, { "epoch": 0.3864, "loss_ce": 0.13525034487247467, "loss_lvr": 0.7433162331581116, "loss_mode_switch": 0.0, "loss_total": 0.20958197116851807, "step": 966 }, { "batch_size": 1, "epoch": 0.3864, "step": 966, "tokens_per_device": 5101 }, { "epoch": 0.3864, "loss_ce": 0.0028658355586230755, "loss_lvr": 0.5344502925872803, "loss_mode_switch": 0.0, "loss_total": 0.05631086602807045, "step": 966 }, { "batch_size": 4, "epoch": 0.3864, "step": 966, "tokens_per_device": 4228 }, { "epoch": 0.3864, "loss_ce": 0.5686653852462769, "loss_lvr": 0.9148673415184021, "loss_mode_switch": 0.0, "loss_total": 0.6601521372795105, "step": 966 }, { "batch_size": 4, "epoch": 0.3864, "step": 966, "tokens_per_device": 1716 }, { "epoch": 0.3864, "loss_ce": 0.26956650614738464, "loss_lvr": 0.9743528962135315, "loss_mode_switch": 0.0, "loss_total": 0.36700180172920227, "step": 966 }, { "batch_size": 4, "epoch": 0.3864, "step": 966, "tokens_per_device": 5492 }, { "epoch": 0.3864, "loss_ce": 0.33552995324134827, "loss_lvr": 0.6473870873451233, "loss_mode_switch": 0.0, "loss_total": 0.40026867389678955, "step": 966 }, { "batch_size": 4, "epoch": 0.3864, "step": 966, "tokens_per_device": 4336 }, { "epoch": 0.3864, "loss_ce": 0.10237753391265869, "loss_lvr": 0.7741956114768982, "loss_mode_switch": 0.0, "loss_total": 0.17979709804058075, "step": 966 }, { "batch_size": 1, "epoch": 0.3864, "step": 966, "tokens_per_device": 5102 }, { "epoch": 0.3864, "loss_ce": 0.0035263500176370144, "loss_lvr": 0.46848157048225403, "loss_mode_switch": 0.0, "loss_total": 0.050374507904052734, "step": 966 }, { "batch_size": 4, "epoch": 0.3864, "step": 966, "tokens_per_device": 4444 }, { "epoch": 0.3864, "loss_ce": 0.5016394257545471, "loss_lvr": 0.808904230594635, "loss_mode_switch": 0.0, "loss_total": 0.5825298428535461, "step": 966 }, { "epoch": 0.3868, "grad_norm": 1.1693273782730103, "learning_rate": 7.016903301747275e-06, "loss": 0.2666, "step": 967 }, { "batch_size": 4, "epoch": 0.3868, "step": 967, "tokens_per_device": 1324 }, { "epoch": 0.3868, "loss_ce": 0.5610642433166504, "loss_lvr": 0.9171018004417419, "loss_mode_switch": 0.0, "loss_total": 0.652774453163147, "step": 967 }, { "batch_size": 4, "epoch": 0.3868, "step": 967, "tokens_per_device": 8392 }, { "epoch": 0.3868, "loss_ce": 0.3669455647468567, "loss_lvr": 0.8290070295333862, "loss_mode_switch": 0.0, "loss_total": 0.4498462677001953, "step": 967 }, { "batch_size": 1, "epoch": 0.3868, "step": 967, "tokens_per_device": 4791 }, { "epoch": 0.3868, "loss_ce": 0.0033478843979537487, "loss_lvr": 0.3341694474220276, "loss_mode_switch": 0.0, "loss_total": 0.036764830350875854, "step": 967 }, { "batch_size": 4, "epoch": 0.3868, "step": 967, "tokens_per_device": 4292 }, { "epoch": 0.3868, "loss_ce": 0.07874835282564163, "loss_lvr": 0.8892385363578796, "loss_mode_switch": 0.0, "loss_total": 0.16767221689224243, "step": 967 }, { "batch_size": 4, "epoch": 0.3868, "step": 967, "tokens_per_device": 2596 }, { "epoch": 0.3868, "loss_ce": 0.23748764395713806, "loss_lvr": 0.906058669090271, "loss_mode_switch": 0.0, "loss_total": 0.3280935287475586, "step": 967 }, { "batch_size": 4, "epoch": 0.3868, "step": 967, "tokens_per_device": 4268 }, { "epoch": 0.3868, "loss_ce": 0.17314513027668, "loss_lvr": 0.7912784218788147, "loss_mode_switch": 0.0, "loss_total": 0.25227296352386475, "step": 967 }, { "batch_size": 4, "epoch": 0.3868, "step": 967, "tokens_per_device": 6804 }, { "epoch": 0.3868, "loss_ce": 0.4642944931983948, "loss_lvr": 0.8128141760826111, "loss_mode_switch": 0.0, "loss_total": 0.5455759167671204, "step": 967 }, { "batch_size": 4, "epoch": 0.3868, "step": 967, "tokens_per_device": 4380 }, { "epoch": 0.3868, "loss_ce": 0.1348738670349121, "loss_lvr": 1.051049828529358, "loss_mode_switch": 0.0, "loss_total": 0.2399788498878479, "step": 967 }, { "epoch": 0.3872, "grad_norm": 1.3783941268920898, "learning_rate": 7.010974479686538e-06, "loss": 0.3027, "step": 968 }, { "batch_size": 4, "epoch": 0.3872, "step": 968, "tokens_per_device": 4808 }, { "epoch": 0.3872, "loss_ce": 0.12865719199180603, "loss_lvr": 0.8327564597129822, "loss_mode_switch": 0.0, "loss_total": 0.21193283796310425, "step": 968 }, { "batch_size": 1, "epoch": 0.3872, "step": 968, "tokens_per_device": 5172 }, { "epoch": 0.3872, "loss_ce": 0.0006964004132896662, "loss_lvr": 0.4370148181915283, "loss_mode_switch": 0.0, "loss_total": 0.044397883117198944, "step": 968 }, { "batch_size": 1, "epoch": 0.3872, "step": 968, "tokens_per_device": 4908 }, { "epoch": 0.3872, "loss_ce": 0.015164499171078205, "loss_lvr": 0.5095511078834534, "loss_mode_switch": 0.0, "loss_total": 0.06611961126327515, "step": 968 }, { "batch_size": 1, "epoch": 0.3872, "step": 968, "tokens_per_device": 5158 }, { "epoch": 0.3872, "loss_ce": 0.002488251542672515, "loss_lvr": 0.6292318105697632, "loss_mode_switch": 0.0, "loss_total": 0.06541143357753754, "step": 968 }, { "batch_size": 4, "epoch": 0.3872, "step": 968, "tokens_per_device": 5008 }, { "epoch": 0.3872, "loss_ce": 0.36864084005355835, "loss_lvr": 0.718599259853363, "loss_mode_switch": 0.0, "loss_total": 0.44050076603889465, "step": 968 }, { "batch_size": 1, "epoch": 0.3872, "step": 968, "tokens_per_device": 4745 }, { "epoch": 0.3872, "loss_ce": 0.0005620303563773632, "loss_lvr": 0.6585313677787781, "loss_mode_switch": 0.0, "loss_total": 0.06641516834497452, "step": 968 }, { "batch_size": 1, "epoch": 0.3872, "step": 968, "tokens_per_device": 4755 }, { "epoch": 0.3872, "loss_ce": 0.0017393366433680058, "loss_lvr": 0.25491297245025635, "loss_mode_switch": 0.0, "loss_total": 0.027230633422732353, "step": 968 }, { "batch_size": 4, "epoch": 0.3872, "step": 968, "tokens_per_device": 1564 }, { "epoch": 0.3872, "loss_ce": 0.42256447672843933, "loss_lvr": 1.1380609273910522, "loss_mode_switch": 0.0, "loss_total": 0.536370575428009, "step": 968 }, { "epoch": 0.3876, "grad_norm": 1.1771743297576904, "learning_rate": 7.005042282556009e-06, "loss": 0.2734, "step": 969 }, { "batch_size": 4, "epoch": 0.3876, "step": 969, "tokens_per_device": 9036 }, { "epoch": 0.3876, "loss_ce": 0.07018574327230453, "loss_lvr": 0.6629458069801331, "loss_mode_switch": 0.0, "loss_total": 0.13648033142089844, "step": 969 }, { "batch_size": 1, "epoch": 0.3876, "step": 969, "tokens_per_device": 4873 }, { "epoch": 0.3876, "loss_ce": 0.00042509002378210425, "loss_lvr": 0.5210647583007812, "loss_mode_switch": 0.0, "loss_total": 0.052531566470861435, "step": 969 }, { "batch_size": 4, "epoch": 0.3876, "step": 969, "tokens_per_device": 11060 }, { "epoch": 0.3876, "loss_ce": 0.33906906843185425, "loss_lvr": 0.6761059165000916, "loss_mode_switch": 0.0, "loss_total": 0.4066796600818634, "step": 969 }, { "batch_size": 4, "epoch": 0.3876, "step": 969, "tokens_per_device": 3028 }, { "epoch": 0.3876, "loss_ce": 0.4280796945095062, "loss_lvr": 0.7420547604560852, "loss_mode_switch": 0.0, "loss_total": 0.5022851824760437, "step": 969 }, { "batch_size": 4, "epoch": 0.3876, "step": 969, "tokens_per_device": 3488 }, { "epoch": 0.3876, "loss_ce": 0.03580346703529358, "loss_lvr": 0.8141651749610901, "loss_mode_switch": 0.0, "loss_total": 0.11721998453140259, "step": 969 }, { "batch_size": 4, "epoch": 0.3876, "step": 969, "tokens_per_device": 8156 }, { "epoch": 0.3876, "loss_ce": 0.2763610780239105, "loss_lvr": 0.944940447807312, "loss_mode_switch": 0.0, "loss_total": 0.3708551228046417, "step": 969 }, { "batch_size": 4, "epoch": 0.3876, "step": 969, "tokens_per_device": 4500 }, { "epoch": 0.3876, "loss_ce": 0.28041473031044006, "loss_lvr": 0.7393773794174194, "loss_mode_switch": 0.0, "loss_total": 0.3543524742126465, "step": 969 }, { "batch_size": 1, "epoch": 0.3876, "step": 969, "tokens_per_device": 4981 }, { "epoch": 0.3876, "loss_ce": 0.0030501224100589752, "loss_lvr": 0.44433146715164185, "loss_mode_switch": 0.0, "loss_total": 0.04748326912522316, "step": 969 }, { "epoch": 0.388, "grad_norm": 1.2938222885131836, "learning_rate": 6.999106720311846e-06, "loss": 0.3068, "step": 970 }, { "batch_size": 4, "epoch": 0.388, "step": 970, "tokens_per_device": 2696 }, { "epoch": 0.388, "loss_ce": 0.35186460614204407, "loss_lvr": 0.889526903629303, "loss_mode_switch": 0.0, "loss_total": 0.44081729650497437, "step": 970 }, { "batch_size": 4, "epoch": 0.388, "step": 970, "tokens_per_device": 4248 }, { "epoch": 0.388, "loss_ce": 0.005806503351777792, "loss_lvr": 1.0085084438323975, "loss_mode_switch": 0.0, "loss_total": 0.10665734857320786, "step": 970 }, { "batch_size": 4, "epoch": 0.388, "step": 970, "tokens_per_device": 4352 }, { "epoch": 0.388, "loss_ce": 0.18150529265403748, "loss_lvr": 1.0972579717636108, "loss_mode_switch": 0.0, "loss_total": 0.29123109579086304, "step": 970 }, { "batch_size": 4, "epoch": 0.388, "step": 970, "tokens_per_device": 4352 }, { "epoch": 0.388, "loss_ce": 0.1857188194990158, "loss_lvr": 0.9989586472511292, "loss_mode_switch": 0.0, "loss_total": 0.28561466932296753, "step": 970 }, { "batch_size": 1, "epoch": 0.388, "step": 970, "tokens_per_device": 5154 }, { "epoch": 0.388, "loss_ce": 0.0022137498017400503, "loss_lvr": 0.33566442131996155, "loss_mode_switch": 0.0, "loss_total": 0.03578019142150879, "step": 970 }, { "batch_size": 4, "epoch": 0.388, "step": 970, "tokens_per_device": 2608 }, { "epoch": 0.388, "loss_ce": 0.09054961055517197, "loss_lvr": 0.9487833380699158, "loss_mode_switch": 0.0, "loss_total": 0.1854279488325119, "step": 970 }, { "batch_size": 4, "epoch": 0.388, "step": 970, "tokens_per_device": 4040 }, { "epoch": 0.388, "loss_ce": 0.09478098154067993, "loss_lvr": 0.8907680511474609, "loss_mode_switch": 0.0, "loss_total": 0.18385779857635498, "step": 970 }, { "batch_size": 4, "epoch": 0.388, "step": 970, "tokens_per_device": 2708 }, { "epoch": 0.388, "loss_ce": 0.10790058970451355, "loss_lvr": 0.9292523264884949, "loss_mode_switch": 0.0, "loss_total": 0.20082582533359528, "step": 970 }, { "epoch": 0.3884, "grad_norm": 1.2554744482040405, "learning_rate": 6.993167802915854e-06, "loss": 0.2976, "step": 971 }, { "batch_size": 4, "epoch": 0.3884, "step": 971, "tokens_per_device": 3624 }, { "epoch": 0.3884, "loss_ce": 0.2130631059408188, "loss_lvr": 0.5910400748252869, "loss_mode_switch": 0.0, "loss_total": 0.2721671164035797, "step": 971 }, { "batch_size": 4, "epoch": 0.3884, "step": 971, "tokens_per_device": 2532 }, { "epoch": 0.3884, "loss_ce": 0.19570790231227875, "loss_lvr": 0.9432235360145569, "loss_mode_switch": 0.0, "loss_total": 0.29003024101257324, "step": 971 }, { "batch_size": 1, "epoch": 0.3884, "step": 971, "tokens_per_device": 4912 }, { "epoch": 0.3884, "loss_ce": 0.01725546084344387, "loss_lvr": 1.0464814901351929, "loss_mode_switch": 0.0, "loss_total": 0.12190361320972443, "step": 971 }, { "batch_size": 1, "epoch": 0.3884, "step": 971, "tokens_per_device": 4899 }, { "epoch": 0.3884, "loss_ce": 0.05250293388962746, "loss_lvr": 0.1789214015007019, "loss_mode_switch": 0.0, "loss_total": 0.0703950747847557, "step": 971 }, { "batch_size": 4, "epoch": 0.3884, "step": 971, "tokens_per_device": 4872 }, { "epoch": 0.3884, "loss_ce": 0.04809162765741348, "loss_lvr": 0.9393797516822815, "loss_mode_switch": 0.0, "loss_total": 0.14202961325645447, "step": 971 }, { "batch_size": 1, "epoch": 0.3884, "step": 971, "tokens_per_device": 5108 }, { "epoch": 0.3884, "loss_ce": 0.5703819990158081, "loss_lvr": 0.4478548765182495, "loss_mode_switch": 0.0, "loss_total": 0.615167498588562, "step": 971 }, { "batch_size": 1, "epoch": 0.3884, "step": 971, "tokens_per_device": 5141 }, { "epoch": 0.3884, "loss_ce": 0.0006111637922003865, "loss_lvr": 0.2641298472881317, "loss_mode_switch": 0.0, "loss_total": 0.027024148032069206, "step": 971 }, { "batch_size": 1, "epoch": 0.3884, "step": 971, "tokens_per_device": 4914 }, { "epoch": 0.3884, "loss_ce": 0.04878511279821396, "loss_lvr": 0.36853501200675964, "loss_mode_switch": 0.0, "loss_total": 0.0856386125087738, "step": 971 }, { "epoch": 0.3888, "grad_norm": 1.9216303825378418, "learning_rate": 6.987225540335467e-06, "loss": 0.3008, "step": 972 }, { "batch_size": 4, "epoch": 0.3888, "step": 972, "tokens_per_device": 5816 }, { "epoch": 0.3888, "loss_ce": 0.1003071516752243, "loss_lvr": 0.5865129828453064, "loss_mode_switch": 0.0, "loss_total": 0.15895844995975494, "step": 972 }, { "batch_size": 4, "epoch": 0.3888, "step": 972, "tokens_per_device": 1624 }, { "epoch": 0.3888, "loss_ce": 0.779024064540863, "loss_lvr": 1.085517406463623, "loss_mode_switch": 0.0, "loss_total": 0.8875758051872253, "step": 972 }, { "batch_size": 1, "epoch": 0.3888, "step": 972, "tokens_per_device": 4898 }, { "epoch": 0.3888, "loss_ce": 0.0102112190797925, "loss_lvr": 0.27091991901397705, "loss_mode_switch": 0.0, "loss_total": 0.03730321303009987, "step": 972 }, { "batch_size": 4, "epoch": 0.3888, "step": 972, "tokens_per_device": 5212 }, { "epoch": 0.3888, "loss_ce": 0.6800626516342163, "loss_lvr": 0.7988252639770508, "loss_mode_switch": 0.0, "loss_total": 0.7599451541900635, "step": 972 }, { "batch_size": 4, "epoch": 0.3888, "step": 972, "tokens_per_device": 2632 }, { "epoch": 0.3888, "loss_ce": 0.37268224358558655, "loss_lvr": 0.9292914867401123, "loss_mode_switch": 0.0, "loss_total": 0.46561139822006226, "step": 972 }, { "batch_size": 4, "epoch": 0.3888, "step": 972, "tokens_per_device": 4076 }, { "epoch": 0.3888, "loss_ce": 0.12253213673830032, "loss_lvr": 0.8319578170776367, "loss_mode_switch": 0.0, "loss_total": 0.20572791993618011, "step": 972 }, { "batch_size": 4, "epoch": 0.3888, "step": 972, "tokens_per_device": 9220 }, { "epoch": 0.3888, "loss_ce": 0.2795451283454895, "loss_lvr": 0.8000742793083191, "loss_mode_switch": 0.0, "loss_total": 0.3595525622367859, "step": 972 }, { "batch_size": 4, "epoch": 0.3888, "step": 972, "tokens_per_device": 7916 }, { "epoch": 0.3888, "loss_ce": 0.1438741683959961, "loss_lvr": 0.8152226805686951, "loss_mode_switch": 0.0, "loss_total": 0.22539643943309784, "step": 972 }, { "epoch": 0.3892, "grad_norm": 1.3066176176071167, "learning_rate": 6.981279942543741e-06, "loss": 0.3194, "step": 973 }, { "batch_size": 4, "epoch": 0.3892, "step": 973, "tokens_per_device": 1404 }, { "epoch": 0.3892, "loss_ce": 0.30650681257247925, "loss_lvr": 1.085557460784912, "loss_mode_switch": 0.0, "loss_total": 0.4150625467300415, "step": 973 }, { "batch_size": 4, "epoch": 0.3892, "step": 973, "tokens_per_device": 2608 }, { "epoch": 0.3892, "loss_ce": 0.17568573355674744, "loss_lvr": 0.8849409222602844, "loss_mode_switch": 0.0, "loss_total": 0.2641798257827759, "step": 973 }, { "batch_size": 4, "epoch": 0.3892, "step": 973, "tokens_per_device": 4276 }, { "epoch": 0.3892, "loss_ce": 0.07534637302160263, "loss_lvr": 0.7380912899971008, "loss_mode_switch": 0.0, "loss_total": 0.14915549755096436, "step": 973 }, { "batch_size": 4, "epoch": 0.3892, "step": 973, "tokens_per_device": 3916 }, { "epoch": 0.3892, "loss_ce": 0.10787714272737503, "loss_lvr": 0.6912829875946045, "loss_mode_switch": 0.0, "loss_total": 0.17700543999671936, "step": 973 }, { "batch_size": 1, "epoch": 0.3892, "step": 973, "tokens_per_device": 5612 }, { "epoch": 0.3892, "loss_ce": 0.10466910898685455, "loss_lvr": 0.43729209899902344, "loss_mode_switch": 0.0, "loss_total": 0.14839832484722137, "step": 973 }, { "batch_size": 4, "epoch": 0.3892, "step": 973, "tokens_per_device": 4556 }, { "epoch": 0.3892, "loss_ce": 0.11207699030637741, "loss_lvr": 0.8286452293395996, "loss_mode_switch": 0.0, "loss_total": 0.19494152069091797, "step": 973 }, { "batch_size": 4, "epoch": 0.3892, "step": 973, "tokens_per_device": 4496 }, { "epoch": 0.3892, "loss_ce": 0.22741572558879852, "loss_lvr": 0.9151477217674255, "loss_mode_switch": 0.0, "loss_total": 0.3189305067062378, "step": 973 }, { "batch_size": 4, "epoch": 0.3892, "step": 973, "tokens_per_device": 7904 }, { "epoch": 0.3892, "loss_ce": 0.011019822210073471, "loss_lvr": 1.0658680200576782, "loss_mode_switch": 0.0, "loss_total": 0.11760662496089935, "step": 973 }, { "epoch": 0.3896, "grad_norm": 1.2944245338439941, "learning_rate": 6.975331019519322e-06, "loss": 0.2832, "step": 974 }, { "batch_size": 4, "epoch": 0.3896, "step": 974, "tokens_per_device": 1376 }, { "epoch": 0.3896, "loss_ce": 0.5153288245201111, "loss_lvr": 0.8809216618537903, "loss_mode_switch": 0.0, "loss_total": 0.6034209728240967, "step": 974 }, { "batch_size": 4, "epoch": 0.3896, "step": 974, "tokens_per_device": 12928 }, { "epoch": 0.3896, "loss_ce": 0.19485262036323547, "loss_lvr": 0.555420458316803, "loss_mode_switch": 0.0, "loss_total": 0.25039467215538025, "step": 974 }, { "batch_size": 4, "epoch": 0.3896, "step": 974, "tokens_per_device": 4992 }, { "epoch": 0.3896, "loss_ce": 0.6456727981567383, "loss_lvr": 0.951617956161499, "loss_mode_switch": 0.0, "loss_total": 0.7408345937728882, "step": 974 }, { "batch_size": 1, "epoch": 0.3896, "step": 974, "tokens_per_device": 4886 }, { "epoch": 0.3896, "loss_ce": 0.0034894528798758984, "loss_lvr": 0.3629416227340698, "loss_mode_switch": 0.0, "loss_total": 0.03978361561894417, "step": 974 }, { "batch_size": 4, "epoch": 0.3896, "step": 974, "tokens_per_device": 10900 }, { "epoch": 0.3896, "loss_ce": 0.2516142725944519, "loss_lvr": 0.5368655323982239, "loss_mode_switch": 0.0, "loss_total": 0.30530083179473877, "step": 974 }, { "batch_size": 1, "epoch": 0.3896, "step": 974, "tokens_per_device": 4608 }, { "epoch": 0.3896, "loss_ce": 0.21359966695308685, "loss_lvr": 0.6474933624267578, "loss_mode_switch": 0.0, "loss_total": 0.27834901213645935, "step": 974 }, { "batch_size": 4, "epoch": 0.3896, "step": 974, "tokens_per_device": 1392 }, { "epoch": 0.3896, "loss_ce": 0.3340788185596466, "loss_lvr": 1.0520986318588257, "loss_mode_switch": 0.0, "loss_total": 0.4392886757850647, "step": 974 }, { "batch_size": 4, "epoch": 0.3896, "step": 974, "tokens_per_device": 4728 }, { "epoch": 0.3896, "loss_ce": 0.034759849309921265, "loss_lvr": 1.0099191665649414, "loss_mode_switch": 0.0, "loss_total": 0.13575176894664764, "step": 974 }, { "epoch": 0.39, "grad_norm": 1.3458184003829956, "learning_rate": 6.969378781246436e-06, "loss": 0.2952, "step": 975 }, { "batch_size": 1, "epoch": 0.39, "step": 975, "tokens_per_device": 5022 }, { "epoch": 0.39, "loss_ce": 0.008117037825286388, "loss_lvr": 0.26996323466300964, "loss_mode_switch": 0.0, "loss_total": 0.03511336073279381, "step": 975 }, { "batch_size": 1, "epoch": 0.39, "step": 975, "tokens_per_device": 4875 }, { "epoch": 0.39, "loss_ce": 0.12449538707733154, "loss_lvr": 0.48054540157318115, "loss_mode_switch": 0.0, "loss_total": 0.17254993319511414, "step": 975 }, { "batch_size": 1, "epoch": 0.39, "step": 975, "tokens_per_device": 6756 }, { "epoch": 0.39, "loss_ce": 0.01959080994129181, "loss_lvr": 0.5289852619171143, "loss_mode_switch": 0.0, "loss_total": 0.07248933613300323, "step": 975 }, { "batch_size": 1, "epoch": 0.39, "step": 975, "tokens_per_device": 6514 }, { "epoch": 0.39, "loss_ce": 0.01710258051753044, "loss_lvr": 0.33693650364875793, "loss_mode_switch": 0.0, "loss_total": 0.050796233117580414, "step": 975 }, { "batch_size": 4, "epoch": 0.39, "step": 975, "tokens_per_device": 4484 }, { "epoch": 0.39, "loss_ce": 0.3457743227481842, "loss_lvr": 0.6728562116622925, "loss_mode_switch": 0.0, "loss_total": 0.41305994987487793, "step": 975 }, { "batch_size": 1, "epoch": 0.39, "step": 975, "tokens_per_device": 5257 }, { "epoch": 0.39, "loss_ce": 0.0029426526743918657, "loss_lvr": 0.40705132484436035, "loss_mode_switch": 0.0, "loss_total": 0.04364778473973274, "step": 975 }, { "batch_size": 1, "epoch": 0.39, "step": 975, "tokens_per_device": 4826 }, { "epoch": 0.39, "loss_ce": 0.3747232258319855, "loss_lvr": 0.9403383731842041, "loss_mode_switch": 0.0, "loss_total": 0.4687570631504059, "step": 975 }, { "batch_size": 4, "epoch": 0.39, "step": 975, "tokens_per_device": 4244 }, { "epoch": 0.39, "loss_ce": 0.23533649742603302, "loss_lvr": 0.7615686655044556, "loss_mode_switch": 0.0, "loss_total": 0.3114933669567108, "step": 975 }, { "epoch": 0.3904, "grad_norm": 1.2495357990264893, "learning_rate": 6.9634232377148835e-06, "loss": 0.3176, "step": 976 }, { "batch_size": 1, "epoch": 0.3904, "step": 976, "tokens_per_device": 5175 }, { "epoch": 0.3904, "loss_ce": 0.13251309096813202, "loss_lvr": 0.40129515528678894, "loss_mode_switch": 0.0, "loss_total": 0.17264260351657867, "step": 976 }, { "batch_size": 4, "epoch": 0.3904, "step": 976, "tokens_per_device": 1536 }, { "epoch": 0.3904, "loss_ce": 0.37830308079719543, "loss_lvr": 0.9290620684623718, "loss_mode_switch": 0.0, "loss_total": 0.4712092876434326, "step": 976 }, { "batch_size": 4, "epoch": 0.3904, "step": 976, "tokens_per_device": 2668 }, { "epoch": 0.3904, "loss_ce": 0.09567352384328842, "loss_lvr": 0.8551613688468933, "loss_mode_switch": 0.0, "loss_total": 0.1811896562576294, "step": 976 }, { "batch_size": 4, "epoch": 0.3904, "step": 976, "tokens_per_device": 10544 }, { "epoch": 0.3904, "loss_ce": 0.6424915790557861, "loss_lvr": 1.2089394330978394, "loss_mode_switch": 0.0, "loss_total": 0.763385534286499, "step": 976 }, { "batch_size": 4, "epoch": 0.3904, "step": 976, "tokens_per_device": 6992 }, { "epoch": 0.3904, "loss_ce": 0.4840432405471802, "loss_lvr": 0.7050661444664001, "loss_mode_switch": 0.0, "loss_total": 0.5545498728752136, "step": 976 }, { "batch_size": 4, "epoch": 0.3904, "step": 976, "tokens_per_device": 4316 }, { "epoch": 0.3904, "loss_ce": 0.20021946728229523, "loss_lvr": 0.9732604026794434, "loss_mode_switch": 0.0, "loss_total": 0.29754549264907837, "step": 976 }, { "batch_size": 1, "epoch": 0.3904, "step": 976, "tokens_per_device": 4871 }, { "epoch": 0.3904, "loss_ce": 0.026875359937548637, "loss_lvr": 0.6800957322120667, "loss_mode_switch": 0.0, "loss_total": 0.09488493949174881, "step": 976 }, { "batch_size": 1, "epoch": 0.3904, "step": 976, "tokens_per_device": 5310 }, { "epoch": 0.3904, "loss_ce": 0.04497905448079109, "loss_lvr": 0.4897821843624115, "loss_mode_switch": 0.0, "loss_total": 0.09395727515220642, "step": 976 }, { "epoch": 0.3908, "grad_norm": 1.3519443273544312, "learning_rate": 6.957464398919998e-06, "loss": 0.3322, "step": 977 }, { "batch_size": 4, "epoch": 0.3908, "step": 977, "tokens_per_device": 4224 }, { "epoch": 0.3908, "loss_ce": 0.25055477023124695, "loss_lvr": 0.94179368019104, "loss_mode_switch": 0.0, "loss_total": 0.3447341322898865, "step": 977 }, { "batch_size": 1, "epoch": 0.3908, "step": 977, "tokens_per_device": 5063 }, { "epoch": 0.3908, "loss_ce": 0.2655857801437378, "loss_lvr": 1.191365122795105, "loss_mode_switch": 0.0, "loss_total": 0.3847222924232483, "step": 977 }, { "batch_size": 1, "epoch": 0.3908, "step": 977, "tokens_per_device": 4890 }, { "epoch": 0.3908, "loss_ce": 0.0579494908452034, "loss_lvr": 0.8886143565177917, "loss_mode_switch": 0.0, "loss_total": 0.14681091904640198, "step": 977 }, { "batch_size": 1, "epoch": 0.3908, "step": 977, "tokens_per_device": 4885 }, { "epoch": 0.3908, "loss_ce": 0.0028975254390388727, "loss_lvr": 0.9063462018966675, "loss_mode_switch": 0.0, "loss_total": 0.09353214502334595, "step": 977 }, { "batch_size": 1, "epoch": 0.3908, "step": 977, "tokens_per_device": 5204 }, { "epoch": 0.3908, "loss_ce": 0.021794848144054413, "loss_lvr": 0.446096807718277, "loss_mode_switch": 0.0, "loss_total": 0.06640452891588211, "step": 977 }, { "batch_size": 4, "epoch": 0.3908, "step": 977, "tokens_per_device": 4420 }, { "epoch": 0.3908, "loss_ce": 0.18337613344192505, "loss_lvr": 1.107309341430664, "loss_mode_switch": 0.0, "loss_total": 0.2941070795059204, "step": 977 }, { "batch_size": 1, "epoch": 0.3908, "step": 977, "tokens_per_device": 4895 }, { "epoch": 0.3908, "loss_ce": 0.18976636230945587, "loss_lvr": 0.3503654897212982, "loss_mode_switch": 0.0, "loss_total": 0.2248029112815857, "step": 977 }, { "batch_size": 4, "epoch": 0.3908, "step": 977, "tokens_per_device": 5792 }, { "epoch": 0.3908, "loss_ce": 0.14307084679603577, "loss_lvr": 0.807632327079773, "loss_mode_switch": 0.0, "loss_total": 0.2238340824842453, "step": 977 }, { "epoch": 0.3912, "grad_norm": 1.6755590438842773, "learning_rate": 6.951502274862656e-06, "loss": 0.3153, "step": 978 }, { "batch_size": 4, "epoch": 0.3912, "step": 978, "tokens_per_device": 5032 }, { "epoch": 0.3912, "loss_ce": 0.5907845497131348, "loss_lvr": 0.9132159948348999, "loss_mode_switch": 0.0, "loss_total": 0.6821061372756958, "step": 978 }, { "batch_size": 4, "epoch": 0.3912, "step": 978, "tokens_per_device": 2660 }, { "epoch": 0.3912, "loss_ce": 0.04938174784183502, "loss_lvr": 0.8610347509384155, "loss_mode_switch": 0.0, "loss_total": 0.1354852318763733, "step": 978 }, { "batch_size": 4, "epoch": 0.3912, "step": 978, "tokens_per_device": 3840 }, { "epoch": 0.3912, "loss_ce": 0.2070448249578476, "loss_lvr": 0.7085510492324829, "loss_mode_switch": 0.0, "loss_total": 0.27789992094039917, "step": 978 }, { "batch_size": 4, "epoch": 0.3912, "step": 978, "tokens_per_device": 3932 }, { "epoch": 0.3912, "loss_ce": 0.5616798996925354, "loss_lvr": 1.0202006101608276, "loss_mode_switch": 0.0, "loss_total": 0.6636999845504761, "step": 978 }, { "batch_size": 4, "epoch": 0.3912, "step": 978, "tokens_per_device": 8040 }, { "epoch": 0.3912, "loss_ce": 0.03380557894706726, "loss_lvr": 0.7409896850585938, "loss_mode_switch": 0.0, "loss_total": 0.10790454596281052, "step": 978 }, { "batch_size": 1, "epoch": 0.3912, "step": 978, "tokens_per_device": 4728 }, { "epoch": 0.3912, "loss_ce": 0.009234466589987278, "loss_lvr": 0.5873966217041016, "loss_mode_switch": 0.0, "loss_total": 0.06797412782907486, "step": 978 }, { "batch_size": 4, "epoch": 0.3912, "step": 978, "tokens_per_device": 5412 }, { "epoch": 0.3912, "loss_ce": 0.42058825492858887, "loss_lvr": 0.8527747392654419, "loss_mode_switch": 0.0, "loss_total": 0.505865752696991, "step": 978 }, { "batch_size": 4, "epoch": 0.3912, "step": 978, "tokens_per_device": 4864 }, { "epoch": 0.3912, "loss_ce": 0.393378883600235, "loss_lvr": 1.0497773885726929, "loss_mode_switch": 0.0, "loss_total": 0.4983566403388977, "step": 978 }, { "epoch": 0.3916, "grad_norm": 1.3998634815216064, "learning_rate": 6.945536875549241e-06, "loss": 0.3105, "step": 979 }, { "batch_size": 4, "epoch": 0.3916, "step": 979, "tokens_per_device": 1796 }, { "epoch": 0.3916, "loss_ce": 0.7841362357139587, "loss_lvr": 0.860825777053833, "loss_mode_switch": 0.0, "loss_total": 0.870218813419342, "step": 979 }, { "batch_size": 4, "epoch": 0.3916, "step": 979, "tokens_per_device": 4272 }, { "epoch": 0.3916, "loss_ce": 0.044568877667188644, "loss_lvr": 0.5501235127449036, "loss_mode_switch": 0.0, "loss_total": 0.09958122670650482, "step": 979 }, { "batch_size": 4, "epoch": 0.3916, "step": 979, "tokens_per_device": 2944 }, { "epoch": 0.3916, "loss_ce": 0.3660617470741272, "loss_lvr": 1.0076401233673096, "loss_mode_switch": 0.0, "loss_total": 0.4668257534503937, "step": 979 }, { "batch_size": 4, "epoch": 0.3916, "step": 979, "tokens_per_device": 3628 }, { "epoch": 0.3916, "loss_ce": 0.5363180637359619, "loss_lvr": 1.0533428192138672, "loss_mode_switch": 0.0, "loss_total": 0.6416523456573486, "step": 979 }, { "batch_size": 4, "epoch": 0.3916, "step": 979, "tokens_per_device": 4240 }, { "epoch": 0.3916, "loss_ce": 0.1141933873295784, "loss_lvr": 0.7897550463676453, "loss_mode_switch": 0.0, "loss_total": 0.19316889345645905, "step": 979 }, { "batch_size": 1, "epoch": 0.3916, "step": 979, "tokens_per_device": 4870 }, { "epoch": 0.3916, "loss_ce": 0.0014617261476814747, "loss_lvr": 0.3102533519268036, "loss_mode_switch": 0.0, "loss_total": 0.032487060874700546, "step": 979 }, { "batch_size": 1, "epoch": 0.3916, "step": 979, "tokens_per_device": 5069 }, { "epoch": 0.3916, "loss_ce": 0.34037184715270996, "loss_lvr": 0.5693511962890625, "loss_mode_switch": 0.0, "loss_total": 0.39730697870254517, "step": 979 }, { "batch_size": 4, "epoch": 0.3916, "step": 979, "tokens_per_device": 3820 }, { "epoch": 0.3916, "loss_ce": 0.21952031552791595, "loss_lvr": 0.7802250385284424, "loss_mode_switch": 0.0, "loss_total": 0.2975428104400635, "step": 979 }, { "epoch": 0.392, "grad_norm": 1.8260383605957031, "learning_rate": 6.939568210991633e-06, "loss": 0.3089, "step": 980 }, { "batch_size": 4, "epoch": 0.392, "step": 980, "tokens_per_device": 3928 }, { "epoch": 0.392, "loss_ce": 0.5969595909118652, "loss_lvr": 0.8143753409385681, "loss_mode_switch": 0.0, "loss_total": 0.6783971190452576, "step": 980 }, { "batch_size": 4, "epoch": 0.392, "step": 980, "tokens_per_device": 2360 }, { "epoch": 0.392, "loss_ce": 0.6297206282615662, "loss_lvr": 0.8128839731216431, "loss_mode_switch": 0.0, "loss_total": 0.7110090255737305, "step": 980 }, { "batch_size": 1, "epoch": 0.392, "step": 980, "tokens_per_device": 4737 }, { "epoch": 0.392, "loss_ce": 0.04105355218052864, "loss_lvr": 0.1510448455810547, "loss_mode_switch": 0.0, "loss_total": 0.05615803599357605, "step": 980 }, { "batch_size": 4, "epoch": 0.392, "step": 980, "tokens_per_device": 4280 }, { "epoch": 0.392, "loss_ce": 0.20018459856510162, "loss_lvr": 0.8533645272254944, "loss_mode_switch": 0.0, "loss_total": 0.2855210602283478, "step": 980 }, { "batch_size": 4, "epoch": 0.392, "step": 980, "tokens_per_device": 6604 }, { "epoch": 0.392, "loss_ce": 0.4149113893508911, "loss_lvr": 0.896213710308075, "loss_mode_switch": 0.0, "loss_total": 0.5045327544212341, "step": 980 }, { "batch_size": 4, "epoch": 0.392, "step": 980, "tokens_per_device": 4204 }, { "epoch": 0.392, "loss_ce": 0.015675680711865425, "loss_lvr": 0.9697466492652893, "loss_mode_switch": 0.0, "loss_total": 0.11265034228563309, "step": 980 }, { "batch_size": 4, "epoch": 0.392, "step": 980, "tokens_per_device": 6092 }, { "epoch": 0.392, "loss_ce": 0.03994690254330635, "loss_lvr": 0.7546895742416382, "loss_mode_switch": 0.0, "loss_total": 0.11541585624217987, "step": 980 }, { "batch_size": 4, "epoch": 0.392, "step": 980, "tokens_per_device": 1956 }, { "epoch": 0.392, "loss_ce": 0.4735727608203888, "loss_lvr": 0.9368367195129395, "loss_mode_switch": 0.0, "loss_total": 0.5672564506530762, "step": 980 }, { "epoch": 0.3924, "grad_norm": 1.2519078254699707, "learning_rate": 6.933596291207196e-06, "loss": 0.274, "step": 981 }, { "batch_size": 1, "epoch": 0.3924, "step": 981, "tokens_per_device": 7211 }, { "epoch": 0.3924, "loss_ce": 0.007300544064491987, "loss_lvr": 0.4123443067073822, "loss_mode_switch": 0.0, "loss_total": 0.048534974455833435, "step": 981 }, { "batch_size": 4, "epoch": 0.3924, "step": 981, "tokens_per_device": 5568 }, { "epoch": 0.3924, "loss_ce": 0.4134669005870819, "loss_lvr": 0.6574521660804749, "loss_mode_switch": 0.0, "loss_total": 0.47921210527420044, "step": 981 }, { "batch_size": 1, "epoch": 0.3924, "step": 981, "tokens_per_device": 5110 }, { "epoch": 0.3924, "loss_ce": 0.07621562480926514, "loss_lvr": 0.6339816451072693, "loss_mode_switch": 0.0, "loss_total": 0.1396137923002243, "step": 981 }, { "batch_size": 4, "epoch": 0.3924, "step": 981, "tokens_per_device": 2716 }, { "epoch": 0.3924, "loss_ce": 0.35767650604248047, "loss_lvr": 0.8149321675300598, "loss_mode_switch": 0.0, "loss_total": 0.4391697347164154, "step": 981 }, { "batch_size": 4, "epoch": 0.3924, "step": 981, "tokens_per_device": 3360 }, { "epoch": 0.3924, "loss_ce": 0.5722938776016235, "loss_lvr": 1.031130075454712, "loss_mode_switch": 0.0, "loss_total": 0.6754068732261658, "step": 981 }, { "batch_size": 4, "epoch": 0.3924, "step": 981, "tokens_per_device": 3884 }, { "epoch": 0.3924, "loss_ce": 0.4856627583503723, "loss_lvr": 0.9302837252616882, "loss_mode_switch": 0.0, "loss_total": 0.5786911249160767, "step": 981 }, { "batch_size": 4, "epoch": 0.3924, "step": 981, "tokens_per_device": 4272 }, { "epoch": 0.3924, "loss_ce": 0.3094099164009094, "loss_lvr": 0.8569123148918152, "loss_mode_switch": 0.0, "loss_total": 0.3951011598110199, "step": 981 }, { "batch_size": 4, "epoch": 0.3924, "step": 981, "tokens_per_device": 2948 }, { "epoch": 0.3924, "loss_ce": 0.17239335179328918, "loss_lvr": 0.7748331427574158, "loss_mode_switch": 0.0, "loss_total": 0.24987667798995972, "step": 981 }, { "epoch": 0.3928, "grad_norm": 1.3424586057662964, "learning_rate": 6.927621126218756e-06, "loss": 0.3192, "step": 982 }, { "batch_size": 4, "epoch": 0.3928, "step": 982, "tokens_per_device": 9028 }, { "epoch": 0.3928, "loss_ce": 0.023321330547332764, "loss_lvr": 0.40726691484451294, "loss_mode_switch": 0.0, "loss_total": 0.06404802203178406, "step": 982 }, { "batch_size": 1, "epoch": 0.3928, "step": 982, "tokens_per_device": 4911 }, { "epoch": 0.3928, "loss_ce": 0.015855105593800545, "loss_lvr": 0.5571288466453552, "loss_mode_switch": 0.0, "loss_total": 0.07156798988580704, "step": 982 }, { "batch_size": 4, "epoch": 0.3928, "step": 982, "tokens_per_device": 1852 }, { "epoch": 0.3928, "loss_ce": 0.15049399435520172, "loss_lvr": 1.2165374755859375, "loss_mode_switch": 0.0, "loss_total": 0.2721477448940277, "step": 982 }, { "batch_size": 1, "epoch": 0.3928, "step": 982, "tokens_per_device": 4675 }, { "epoch": 0.3928, "loss_ce": 0.1754453331232071, "loss_lvr": 0.33922073245048523, "loss_mode_switch": 0.0, "loss_total": 0.20936740934848785, "step": 982 }, { "batch_size": 1, "epoch": 0.3928, "step": 982, "tokens_per_device": 4855 }, { "epoch": 0.3928, "loss_ce": 0.0032855088356882334, "loss_lvr": 0.5250687599182129, "loss_mode_switch": 0.0, "loss_total": 0.05579238384962082, "step": 982 }, { "batch_size": 4, "epoch": 0.3928, "step": 982, "tokens_per_device": 13064 }, { "epoch": 0.3928, "loss_ce": 0.22935576736927032, "loss_lvr": 0.7888396978378296, "loss_mode_switch": 0.0, "loss_total": 0.30823972821235657, "step": 982 }, { "batch_size": 1, "epoch": 0.3928, "step": 982, "tokens_per_device": 5173 }, { "epoch": 0.3928, "loss_ce": 0.007315605413168669, "loss_lvr": 0.4855550527572632, "loss_mode_switch": 0.0, "loss_total": 0.055871110409498215, "step": 982 }, { "batch_size": 1, "epoch": 0.3928, "step": 982, "tokens_per_device": 5062 }, { "epoch": 0.3928, "loss_ce": 0.08779621869325638, "loss_lvr": 0.4319160580635071, "loss_mode_switch": 0.0, "loss_total": 0.13098782300949097, "step": 982 }, { "epoch": 0.3932, "grad_norm": 1.3852280378341675, "learning_rate": 6.921642726054583e-06, "loss": 0.2881, "step": 983 }, { "batch_size": 1, "epoch": 0.3932, "step": 983, "tokens_per_device": 4877 }, { "epoch": 0.3932, "loss_ce": 0.15174946188926697, "loss_lvr": 0.30390068888664246, "loss_mode_switch": 0.0, "loss_total": 0.1821395307779312, "step": 983 }, { "batch_size": 1, "epoch": 0.3932, "step": 983, "tokens_per_device": 4871 }, { "epoch": 0.3932, "loss_ce": 0.018197912722826004, "loss_lvr": 0.2796558439731598, "loss_mode_switch": 0.0, "loss_total": 0.04616349935531616, "step": 983 }, { "batch_size": 4, "epoch": 0.3932, "step": 983, "tokens_per_device": 1320 }, { "epoch": 0.3932, "loss_ce": 0.48057302832603455, "loss_lvr": 1.156015157699585, "loss_mode_switch": 0.0, "loss_total": 0.5961745381355286, "step": 983 }, { "batch_size": 1, "epoch": 0.3932, "step": 983, "tokens_per_device": 5178 }, { "epoch": 0.3932, "loss_ce": 0.010740778408944607, "loss_lvr": 0.3268057107925415, "loss_mode_switch": 0.0, "loss_total": 0.04342135041952133, "step": 983 }, { "batch_size": 4, "epoch": 0.3932, "step": 983, "tokens_per_device": 4044 }, { "epoch": 0.3932, "loss_ce": 0.23923276364803314, "loss_lvr": 0.7362143993377686, "loss_mode_switch": 0.0, "loss_total": 0.31285420060157776, "step": 983 }, { "batch_size": 4, "epoch": 0.3932, "step": 983, "tokens_per_device": 3384 }, { "epoch": 0.3932, "loss_ce": 0.8966785669326782, "loss_lvr": 0.6741321086883545, "loss_mode_switch": 0.0, "loss_total": 0.9640917778015137, "step": 983 }, { "batch_size": 4, "epoch": 0.3932, "step": 983, "tokens_per_device": 1808 }, { "epoch": 0.3932, "loss_ce": 0.5440531373023987, "loss_lvr": 1.24135160446167, "loss_mode_switch": 0.0, "loss_total": 0.6681882739067078, "step": 983 }, { "batch_size": 1, "epoch": 0.3932, "step": 983, "tokens_per_device": 5012 }, { "epoch": 0.3932, "loss_ce": 0.14842747151851654, "loss_lvr": 0.6694964170455933, "loss_mode_switch": 0.0, "loss_total": 0.21537712216377258, "step": 983 }, { "epoch": 0.3936, "grad_norm": 1.3577624559402466, "learning_rate": 6.915661100748379e-06, "loss": 0.3205, "step": 984 }, { "batch_size": 4, "epoch": 0.3936, "step": 984, "tokens_per_device": 4384 }, { "epoch": 0.3936, "loss_ce": 0.16899174451828003, "loss_lvr": 0.9290819764137268, "loss_mode_switch": 0.0, "loss_total": 0.2618999481201172, "step": 984 }, { "batch_size": 4, "epoch": 0.3936, "step": 984, "tokens_per_device": 4424 }, { "epoch": 0.3936, "loss_ce": 0.18024538457393646, "loss_lvr": 0.7733901739120483, "loss_mode_switch": 0.0, "loss_total": 0.2575843930244446, "step": 984 }, { "batch_size": 4, "epoch": 0.3936, "step": 984, "tokens_per_device": 4304 }, { "epoch": 0.3936, "loss_ce": 0.6359226703643799, "loss_lvr": 1.0092554092407227, "loss_mode_switch": 0.0, "loss_total": 0.7368482351303101, "step": 984 }, { "batch_size": 1, "epoch": 0.3936, "step": 984, "tokens_per_device": 4883 }, { "epoch": 0.3936, "loss_ce": 0.00023913370387163013, "loss_lvr": 1.0282244682312012, "loss_mode_switch": 0.0, "loss_total": 0.10306157916784286, "step": 984 }, { "batch_size": 1, "epoch": 0.3936, "step": 984, "tokens_per_device": 4916 }, { "epoch": 0.3936, "loss_ce": 0.08073106408119202, "loss_lvr": 0.8229457139968872, "loss_mode_switch": 0.0, "loss_total": 0.1630256474018097, "step": 984 }, { "batch_size": 4, "epoch": 0.3936, "step": 984, "tokens_per_device": 3780 }, { "epoch": 0.3936, "loss_ce": 0.31775930523872375, "loss_lvr": 1.07566237449646, "loss_mode_switch": 0.0, "loss_total": 0.42532554268836975, "step": 984 }, { "batch_size": 4, "epoch": 0.3936, "step": 984, "tokens_per_device": 4204 }, { "epoch": 0.3936, "loss_ce": 0.1608949601650238, "loss_lvr": 0.9283727407455444, "loss_mode_switch": 0.0, "loss_total": 0.25373223423957825, "step": 984 }, { "batch_size": 4, "epoch": 0.3936, "step": 984, "tokens_per_device": 1880 }, { "epoch": 0.3936, "loss_ce": 0.6140175461769104, "loss_lvr": 0.966670036315918, "loss_mode_switch": 0.0, "loss_total": 0.7106845378875732, "step": 984 }, { "epoch": 0.394, "grad_norm": 1.250469446182251, "learning_rate": 6.9096762603392595e-06, "loss": 0.2968, "step": 985 }, { "batch_size": 4, "epoch": 0.394, "step": 985, "tokens_per_device": 1708 }, { "epoch": 0.394, "loss_ce": 0.3546067774295807, "loss_lvr": 0.9365589618682861, "loss_mode_switch": 0.0, "loss_total": 0.44826269149780273, "step": 985 }, { "batch_size": 4, "epoch": 0.394, "step": 985, "tokens_per_device": 4688 }, { "epoch": 0.394, "loss_ce": 0.05136743187904358, "loss_lvr": 0.7559853196144104, "loss_mode_switch": 0.0, "loss_total": 0.1269659698009491, "step": 985 }, { "batch_size": 1, "epoch": 0.394, "step": 985, "tokens_per_device": 5030 }, { "epoch": 0.394, "loss_ce": 0.41889652609825134, "loss_lvr": 0.4377812147140503, "loss_mode_switch": 0.0, "loss_total": 0.46267464756965637, "step": 985 }, { "batch_size": 4, "epoch": 0.394, "step": 985, "tokens_per_device": 7256 }, { "epoch": 0.394, "loss_ce": 0.32843005657196045, "loss_lvr": 0.7212759852409363, "loss_mode_switch": 0.0, "loss_total": 0.40055766701698303, "step": 985 }, { "batch_size": 4, "epoch": 0.394, "step": 985, "tokens_per_device": 15652 }, { "epoch": 0.394, "loss_ce": 0.1259600818157196, "loss_lvr": 1.5285850763320923, "loss_mode_switch": 0.0, "loss_total": 0.27881860733032227, "step": 985 }, { "batch_size": 1, "epoch": 0.394, "step": 985, "tokens_per_device": 5189 }, { "epoch": 0.394, "loss_ce": 0.11000575870275497, "loss_lvr": 0.1818353831768036, "loss_mode_switch": 0.0, "loss_total": 0.1281892955303192, "step": 985 }, { "batch_size": 1, "epoch": 0.394, "step": 985, "tokens_per_device": 4859 }, { "epoch": 0.394, "loss_ce": 0.026906650513410568, "loss_lvr": 0.3127816319465637, "loss_mode_switch": 0.0, "loss_total": 0.05818481370806694, "step": 985 }, { "batch_size": 4, "epoch": 0.394, "step": 985, "tokens_per_device": 4376 }, { "epoch": 0.394, "loss_ce": 0.047376230359077454, "loss_lvr": 1.0380172729492188, "loss_mode_switch": 0.0, "loss_total": 0.15117795765399933, "step": 985 }, { "epoch": 0.3944, "grad_norm": 1.2737163305282593, "learning_rate": 6.903688214871734e-06, "loss": 0.2979, "step": 986 }, { "batch_size": 4, "epoch": 0.3944, "step": 986, "tokens_per_device": 4768 }, { "epoch": 0.3944, "loss_ce": 0.23089919984340668, "loss_lvr": 0.7785914540290833, "loss_mode_switch": 0.0, "loss_total": 0.30875834822654724, "step": 986 }, { "batch_size": 1, "epoch": 0.3944, "step": 986, "tokens_per_device": 7031 }, { "epoch": 0.3944, "loss_ce": 0.018533548340201378, "loss_lvr": 0.2886119782924652, "loss_mode_switch": 0.0, "loss_total": 0.04739474505186081, "step": 986 }, { "batch_size": 4, "epoch": 0.3944, "step": 986, "tokens_per_device": 4216 }, { "epoch": 0.3944, "loss_ce": 0.4586729407310486, "loss_lvr": 0.9396973848342896, "loss_mode_switch": 0.0, "loss_total": 0.5526427030563354, "step": 986 }, { "batch_size": 4, "epoch": 0.3944, "step": 986, "tokens_per_device": 5876 }, { "epoch": 0.3944, "loss_ce": 0.0451115258038044, "loss_lvr": 0.6504418253898621, "loss_mode_switch": 0.0, "loss_total": 0.11015571653842926, "step": 986 }, { "batch_size": 4, "epoch": 0.3944, "step": 986, "tokens_per_device": 4196 }, { "epoch": 0.3944, "loss_ce": 0.1797943413257599, "loss_lvr": 0.7855425477027893, "loss_mode_switch": 0.0, "loss_total": 0.25834858417510986, "step": 986 }, { "batch_size": 4, "epoch": 0.3944, "step": 986, "tokens_per_device": 4272 }, { "epoch": 0.3944, "loss_ce": 0.4987509846687317, "loss_lvr": 1.7522618770599365, "loss_mode_switch": 0.0, "loss_total": 0.6739771962165833, "step": 986 }, { "batch_size": 4, "epoch": 0.3944, "step": 986, "tokens_per_device": 6268 }, { "epoch": 0.3944, "loss_ce": 0.029957758262753487, "loss_lvr": 0.7189438343048096, "loss_mode_switch": 0.0, "loss_total": 0.10185214132070541, "step": 986 }, { "batch_size": 1, "epoch": 0.3944, "step": 986, "tokens_per_device": 4913 }, { "epoch": 0.3944, "loss_ce": 0.14418405294418335, "loss_lvr": 0.399217814207077, "loss_mode_switch": 0.0, "loss_total": 0.18410584330558777, "step": 986 }, { "epoch": 0.3948, "grad_norm": 1.311296820640564, "learning_rate": 6.897696974395691e-06, "loss": 0.2767, "step": 987 }, { "batch_size": 4, "epoch": 0.3948, "step": 987, "tokens_per_device": 5028 }, { "epoch": 0.3948, "loss_ce": 0.10494900494813919, "loss_lvr": 0.8197572827339172, "loss_mode_switch": 0.0, "loss_total": 0.18692472577095032, "step": 987 }, { "batch_size": 4, "epoch": 0.3948, "step": 987, "tokens_per_device": 2676 }, { "epoch": 0.3948, "loss_ce": 0.0692322850227356, "loss_lvr": 0.8650587201118469, "loss_mode_switch": 0.0, "loss_total": 0.15573816001415253, "step": 987 }, { "batch_size": 1, "epoch": 0.3948, "step": 987, "tokens_per_device": 4742 }, { "epoch": 0.3948, "loss_ce": 0.0030394201166927814, "loss_lvr": 0.3307892084121704, "loss_mode_switch": 0.0, "loss_total": 0.036118339747190475, "step": 987 }, { "batch_size": 1, "epoch": 0.3948, "step": 987, "tokens_per_device": 4863 }, { "epoch": 0.3948, "loss_ce": 0.00764663377776742, "loss_lvr": 0.4266777038574219, "loss_mode_switch": 0.0, "loss_total": 0.05031440779566765, "step": 987 }, { "batch_size": 4, "epoch": 0.3948, "step": 987, "tokens_per_device": 4876 }, { "epoch": 0.3948, "loss_ce": 0.024463610723614693, "loss_lvr": 0.7618650197982788, "loss_mode_switch": 0.0, "loss_total": 0.1006501093506813, "step": 987 }, { "batch_size": 4, "epoch": 0.3948, "step": 987, "tokens_per_device": 4316 }, { "epoch": 0.3948, "loss_ce": 0.4085805416107178, "loss_lvr": 0.8395001292228699, "loss_mode_switch": 0.0, "loss_total": 0.49253055453300476, "step": 987 }, { "batch_size": 4, "epoch": 0.3948, "step": 987, "tokens_per_device": 4436 }, { "epoch": 0.3948, "loss_ce": 0.18201258778572083, "loss_lvr": 0.789930522441864, "loss_mode_switch": 0.0, "loss_total": 0.2610056400299072, "step": 987 }, { "batch_size": 4, "epoch": 0.3948, "step": 987, "tokens_per_device": 5808 }, { "epoch": 0.3948, "loss_ce": 0.24076524376869202, "loss_lvr": 0.8659473657608032, "loss_mode_switch": 0.0, "loss_total": 0.32735997438430786, "step": 987 }, { "epoch": 0.3952, "grad_norm": 1.2886849641799927, "learning_rate": 6.891702548966386e-06, "loss": 0.3017, "step": 988 }, { "batch_size": 4, "epoch": 0.3952, "step": 988, "tokens_per_device": 5928 }, { "epoch": 0.3952, "loss_ce": 0.4087429642677307, "loss_lvr": 0.9220407009124756, "loss_mode_switch": 0.0, "loss_total": 0.5009470582008362, "step": 988 }, { "batch_size": 4, "epoch": 0.3952, "step": 988, "tokens_per_device": 9744 }, { "epoch": 0.3952, "loss_ce": 0.2290668487548828, "loss_lvr": 0.4432052671909332, "loss_mode_switch": 0.0, "loss_total": 0.2733873724937439, "step": 988 }, { "batch_size": 4, "epoch": 0.3952, "step": 988, "tokens_per_device": 4304 }, { "epoch": 0.3952, "loss_ce": 0.3498118817806244, "loss_lvr": 0.8624820113182068, "loss_mode_switch": 0.0, "loss_total": 0.4360600709915161, "step": 988 }, { "batch_size": 4, "epoch": 0.3952, "step": 988, "tokens_per_device": 2836 }, { "epoch": 0.3952, "loss_ce": 0.045761965215206146, "loss_lvr": 0.7064545750617981, "loss_mode_switch": 0.0, "loss_total": 0.11640742421150208, "step": 988 }, { "batch_size": 1, "epoch": 0.3952, "step": 988, "tokens_per_device": 5347 }, { "epoch": 0.3952, "loss_ce": 0.06286636739969254, "loss_lvr": 0.4957960546016693, "loss_mode_switch": 0.0, "loss_total": 0.11244597285985947, "step": 988 }, { "batch_size": 4, "epoch": 0.3952, "step": 988, "tokens_per_device": 3836 }, { "epoch": 0.3952, "loss_ce": 0.4651038646697998, "loss_lvr": 0.9981445074081421, "loss_mode_switch": 0.0, "loss_total": 0.5649183392524719, "step": 988 }, { "batch_size": 4, "epoch": 0.3952, "step": 988, "tokens_per_device": 4280 }, { "epoch": 0.3952, "loss_ce": 0.29924556612968445, "loss_lvr": 0.9253200888633728, "loss_mode_switch": 0.0, "loss_total": 0.39177757501602173, "step": 988 }, { "batch_size": 4, "epoch": 0.3952, "step": 988, "tokens_per_device": 3872 }, { "epoch": 0.3952, "loss_ce": 0.22746190428733826, "loss_lvr": 1.0040559768676758, "loss_mode_switch": 0.0, "loss_total": 0.3278675079345703, "step": 988 }, { "epoch": 0.3956, "grad_norm": 1.3601229190826416, "learning_rate": 6.885704948644411e-06, "loss": 0.3007, "step": 989 }, { "batch_size": 4, "epoch": 0.3956, "step": 989, "tokens_per_device": 5760 }, { "epoch": 0.3956, "loss_ce": 0.2442760169506073, "loss_lvr": 0.7424281239509583, "loss_mode_switch": 0.0, "loss_total": 0.31851881742477417, "step": 989 }, { "batch_size": 4, "epoch": 0.3956, "step": 989, "tokens_per_device": 13828 }, { "epoch": 0.3956, "loss_ce": 0.48250553011894226, "loss_lvr": 0.8910537958145142, "loss_mode_switch": 0.0, "loss_total": 0.5716109275817871, "step": 989 }, { "batch_size": 4, "epoch": 0.3956, "step": 989, "tokens_per_device": 1740 }, { "epoch": 0.3956, "loss_ce": 0.742838978767395, "loss_lvr": 1.0882984399795532, "loss_mode_switch": 0.0, "loss_total": 0.8516688346862793, "step": 989 }, { "batch_size": 4, "epoch": 0.3956, "step": 989, "tokens_per_device": 4272 }, { "epoch": 0.3956, "loss_ce": 0.42633479833602905, "loss_lvr": 1.022682785987854, "loss_mode_switch": 0.0, "loss_total": 0.5286030769348145, "step": 989 }, { "batch_size": 4, "epoch": 0.3956, "step": 989, "tokens_per_device": 1656 }, { "epoch": 0.3956, "loss_ce": 0.21060094237327576, "loss_lvr": 1.0320353507995605, "loss_mode_switch": 0.0, "loss_total": 0.3138044774532318, "step": 989 }, { "batch_size": 4, "epoch": 0.3956, "step": 989, "tokens_per_device": 7716 }, { "epoch": 0.3956, "loss_ce": 0.5242626070976257, "loss_lvr": 0.7817907333374023, "loss_mode_switch": 0.0, "loss_total": 0.602441668510437, "step": 989 }, { "batch_size": 4, "epoch": 0.3956, "step": 989, "tokens_per_device": 4296 }, { "epoch": 0.3956, "loss_ce": 0.21136344969272614, "loss_lvr": 0.7693778872489929, "loss_mode_switch": 0.0, "loss_total": 0.2883012294769287, "step": 989 }, { "batch_size": 1, "epoch": 0.3956, "step": 989, "tokens_per_device": 4904 }, { "epoch": 0.3956, "loss_ce": 0.08773333579301834, "loss_lvr": 0.5435398817062378, "loss_mode_switch": 0.0, "loss_total": 0.14208732545375824, "step": 989 }, { "epoch": 0.396, "grad_norm": 1.3704819679260254, "learning_rate": 6.8797041834956955e-06, "loss": 0.3292, "step": 990 }, { "batch_size": 4, "epoch": 0.396, "step": 990, "tokens_per_device": 1460 }, { "epoch": 0.396, "loss_ce": 0.5837835669517517, "loss_lvr": 0.9557288885116577, "loss_mode_switch": 0.0, "loss_total": 0.6793564558029175, "step": 990 }, { "batch_size": 4, "epoch": 0.396, "step": 990, "tokens_per_device": 3512 }, { "epoch": 0.396, "loss_ce": 0.12790802121162415, "loss_lvr": 1.1196935176849365, "loss_mode_switch": 0.0, "loss_total": 0.2398773729801178, "step": 990 }, { "batch_size": 1, "epoch": 0.396, "step": 990, "tokens_per_device": 4911 }, { "epoch": 0.396, "loss_ce": 0.03544265776872635, "loss_lvr": 0.5104222297668457, "loss_mode_switch": 0.0, "loss_total": 0.0864848792552948, "step": 990 }, { "batch_size": 1, "epoch": 0.396, "step": 990, "tokens_per_device": 5071 }, { "epoch": 0.396, "loss_ce": 0.0009386710589751601, "loss_lvr": 0.636800229549408, "loss_mode_switch": 0.0, "loss_total": 0.06461869180202484, "step": 990 }, { "batch_size": 1, "epoch": 0.396, "step": 990, "tokens_per_device": 4891 }, { "epoch": 0.396, "loss_ce": 0.2522565424442291, "loss_lvr": 0.5555921196937561, "loss_mode_switch": 0.0, "loss_total": 0.3078157603740692, "step": 990 }, { "batch_size": 4, "epoch": 0.396, "step": 990, "tokens_per_device": 5028 }, { "epoch": 0.396, "loss_ce": 0.15575461089611053, "loss_lvr": 0.7625763416290283, "loss_mode_switch": 0.0, "loss_total": 0.23201224207878113, "step": 990 }, { "batch_size": 4, "epoch": 0.396, "step": 990, "tokens_per_device": 2876 }, { "epoch": 0.396, "loss_ce": 0.10879446566104889, "loss_lvr": 0.8325560688972473, "loss_mode_switch": 0.0, "loss_total": 0.19205006957054138, "step": 990 }, { "batch_size": 4, "epoch": 0.396, "step": 990, "tokens_per_device": 4332 }, { "epoch": 0.396, "loss_ce": 0.588770866394043, "loss_lvr": 1.0946754217147827, "loss_mode_switch": 0.0, "loss_total": 0.6982384324073792, "step": 990 }, { "epoch": 0.3964, "grad_norm": 1.3391382694244385, "learning_rate": 6.873700263591476e-06, "loss": 0.2895, "step": 991 }, { "batch_size": 4, "epoch": 0.3964, "step": 991, "tokens_per_device": 7980 }, { "epoch": 0.3964, "loss_ce": 0.007592920213937759, "loss_lvr": 0.8674652576446533, "loss_mode_switch": 0.0, "loss_total": 0.09433944523334503, "step": 991 }, { "batch_size": 1, "epoch": 0.3964, "step": 991, "tokens_per_device": 4969 }, { "epoch": 0.3964, "loss_ce": 0.05935017392039299, "loss_lvr": 0.6440860629081726, "loss_mode_switch": 0.0, "loss_total": 0.12375877797603607, "step": 991 }, { "batch_size": 4, "epoch": 0.3964, "step": 991, "tokens_per_device": 5600 }, { "epoch": 0.3964, "loss_ce": 0.010705042630434036, "loss_lvr": 0.6669774651527405, "loss_mode_switch": 0.0, "loss_total": 0.07740278542041779, "step": 991 }, { "batch_size": 1, "epoch": 0.3964, "step": 991, "tokens_per_device": 4912 }, { "epoch": 0.3964, "loss_ce": 0.12408515065908432, "loss_lvr": 0.22496630251407623, "loss_mode_switch": 0.0, "loss_total": 0.14658178389072418, "step": 991 }, { "batch_size": 4, "epoch": 0.3964, "step": 991, "tokens_per_device": 13868 }, { "epoch": 0.3964, "loss_ce": 0.2510508596897125, "loss_lvr": 0.7835693359375, "loss_mode_switch": 0.0, "loss_total": 0.32940781116485596, "step": 991 }, { "batch_size": 4, "epoch": 0.3964, "step": 991, "tokens_per_device": 5500 }, { "epoch": 0.3964, "loss_ce": 0.4679500460624695, "loss_lvr": 0.9952645897865295, "loss_mode_switch": 0.0, "loss_total": 0.5674765110015869, "step": 991 }, { "batch_size": 4, "epoch": 0.3964, "step": 991, "tokens_per_device": 4248 }, { "epoch": 0.3964, "loss_ce": 0.1279028058052063, "loss_lvr": 0.6155508160591125, "loss_mode_switch": 0.0, "loss_total": 0.18945789337158203, "step": 991 }, { "batch_size": 1, "epoch": 0.3964, "step": 991, "tokens_per_device": 4670 }, { "epoch": 0.3964, "loss_ce": 0.07927615195512772, "loss_lvr": 0.7812519669532776, "loss_mode_switch": 0.0, "loss_total": 0.15740135312080383, "step": 991 }, { "epoch": 0.3968, "grad_norm": 1.4028757810592651, "learning_rate": 6.867693199008285e-06, "loss": 0.3051, "step": 992 }, { "batch_size": 1, "epoch": 0.3968, "step": 992, "tokens_per_device": 5135 }, { "epoch": 0.3968, "loss_ce": 0.009194775484502316, "loss_lvr": 0.35491591691970825, "loss_mode_switch": 0.0, "loss_total": 0.044686369597911835, "step": 992 }, { "batch_size": 4, "epoch": 0.3968, "step": 992, "tokens_per_device": 7164 }, { "epoch": 0.3968, "loss_ce": 0.1502750664949417, "loss_lvr": 0.6393316984176636, "loss_mode_switch": 0.0, "loss_total": 0.21420824527740479, "step": 992 }, { "batch_size": 1, "epoch": 0.3968, "step": 992, "tokens_per_device": 5287 }, { "epoch": 0.3968, "loss_ce": 0.005832333117723465, "loss_lvr": 0.5999762415885925, "loss_mode_switch": 0.0, "loss_total": 0.06582996249198914, "step": 992 }, { "batch_size": 4, "epoch": 0.3968, "step": 992, "tokens_per_device": 6620 }, { "epoch": 0.3968, "loss_ce": 0.14464138448238373, "loss_lvr": 0.7430497407913208, "loss_mode_switch": 0.0, "loss_total": 0.21894636750221252, "step": 992 }, { "batch_size": 4, "epoch": 0.3968, "step": 992, "tokens_per_device": 4560 }, { "epoch": 0.3968, "loss_ce": 0.001360977184958756, "loss_lvr": 1.3359278440475464, "loss_mode_switch": 0.0, "loss_total": 0.13495376706123352, "step": 992 }, { "batch_size": 4, "epoch": 0.3968, "step": 992, "tokens_per_device": 2768 }, { "epoch": 0.3968, "loss_ce": 0.16913986206054688, "loss_lvr": 0.7776728868484497, "loss_mode_switch": 0.0, "loss_total": 0.24690714478492737, "step": 992 }, { "batch_size": 4, "epoch": 0.3968, "step": 992, "tokens_per_device": 2940 }, { "epoch": 0.3968, "loss_ce": 0.799930989742279, "loss_lvr": 0.7995544075965881, "loss_mode_switch": 0.0, "loss_total": 0.8798864483833313, "step": 992 }, { "batch_size": 1, "epoch": 0.3968, "step": 992, "tokens_per_device": 5116 }, { "epoch": 0.3968, "loss_ce": 0.3122214376926422, "loss_lvr": 0.6502611041069031, "loss_mode_switch": 0.0, "loss_total": 0.37724754214286804, "step": 992 }, { "epoch": 0.3972, "grad_norm": 1.2941490411758423, "learning_rate": 6.8616829998279295e-06, "loss": 0.2726, "step": 993 }, { "batch_size": 4, "epoch": 0.3972, "step": 993, "tokens_per_device": 2664 }, { "epoch": 0.3972, "loss_ce": 0.22522863745689392, "loss_lvr": 0.9991922974586487, "loss_mode_switch": 0.0, "loss_total": 0.3251478672027588, "step": 993 }, { "batch_size": 1, "epoch": 0.3972, "step": 993, "tokens_per_device": 5188 }, { "epoch": 0.3972, "loss_ce": 0.7335890531539917, "loss_lvr": 0.577297568321228, "loss_mode_switch": 0.0, "loss_total": 0.7913188338279724, "step": 993 }, { "batch_size": 4, "epoch": 0.3972, "step": 993, "tokens_per_device": 2684 }, { "epoch": 0.3972, "loss_ce": 0.69406658411026, "loss_lvr": 0.8224422335624695, "loss_mode_switch": 0.0, "loss_total": 0.7763108015060425, "step": 993 }, { "batch_size": 4, "epoch": 0.3972, "step": 993, "tokens_per_device": 4788 }, { "epoch": 0.3972, "loss_ce": 0.03482306748628616, "loss_lvr": 0.8909206986427307, "loss_mode_switch": 0.0, "loss_total": 0.12391513586044312, "step": 993 }, { "batch_size": 4, "epoch": 0.3972, "step": 993, "tokens_per_device": 5316 }, { "epoch": 0.3972, "loss_ce": 0.10041264444589615, "loss_lvr": 0.7752770781517029, "loss_mode_switch": 0.0, "loss_total": 0.17794035375118256, "step": 993 }, { "batch_size": 4, "epoch": 0.3972, "step": 993, "tokens_per_device": 2528 }, { "epoch": 0.3972, "loss_ce": 0.37344643473625183, "loss_lvr": 0.8396143913269043, "loss_mode_switch": 0.0, "loss_total": 0.4574078917503357, "step": 993 }, { "batch_size": 4, "epoch": 0.3972, "step": 993, "tokens_per_device": 1172 }, { "epoch": 0.3972, "loss_ce": 0.26704689860343933, "loss_lvr": 1.1291611194610596, "loss_mode_switch": 0.0, "loss_total": 0.3799630105495453, "step": 993 }, { "batch_size": 4, "epoch": 0.3972, "step": 993, "tokens_per_device": 3536 }, { "epoch": 0.3972, "loss_ce": 0.5864690542221069, "loss_lvr": 1.134912133216858, "loss_mode_switch": 0.0, "loss_total": 0.6999602913856506, "step": 993 }, { "epoch": 0.3976, "grad_norm": 1.4468239545822144, "learning_rate": 6.8556696761374844e-06, "loss": 0.3028, "step": 994 }, { "batch_size": 4, "epoch": 0.3976, "step": 994, "tokens_per_device": 5668 }, { "epoch": 0.3976, "loss_ce": 0.023699738085269928, "loss_lvr": 0.868274986743927, "loss_mode_switch": 0.0, "loss_total": 0.11052723973989487, "step": 994 }, { "batch_size": 4, "epoch": 0.3976, "step": 994, "tokens_per_device": 3592 }, { "epoch": 0.3976, "loss_ce": 0.011076630093157291, "loss_lvr": 0.6475039720535278, "loss_mode_switch": 0.0, "loss_total": 0.07582702487707138, "step": 994 }, { "batch_size": 4, "epoch": 0.3976, "step": 994, "tokens_per_device": 5868 }, { "epoch": 0.3976, "loss_ce": 0.15814486145973206, "loss_lvr": 0.5770301818847656, "loss_mode_switch": 0.0, "loss_total": 0.21584787964820862, "step": 994 }, { "batch_size": 4, "epoch": 0.3976, "step": 994, "tokens_per_device": 1596 }, { "epoch": 0.3976, "loss_ce": 0.40011370182037354, "loss_lvr": 2.7628448009490967, "loss_mode_switch": 0.0, "loss_total": 0.6763981580734253, "step": 994 }, { "batch_size": 1, "epoch": 0.3976, "step": 994, "tokens_per_device": 5260 }, { "epoch": 0.3976, "loss_ce": 0.02182234264910221, "loss_lvr": 0.4178503155708313, "loss_mode_switch": 0.0, "loss_total": 0.06360737234354019, "step": 994 }, { "batch_size": 4, "epoch": 0.3976, "step": 994, "tokens_per_device": 4564 }, { "epoch": 0.3976, "loss_ce": 0.25210630893707275, "loss_lvr": 1.2387425899505615, "loss_mode_switch": 0.0, "loss_total": 0.37598055601119995, "step": 994 }, { "batch_size": 4, "epoch": 0.3976, "step": 994, "tokens_per_device": 4216 }, { "epoch": 0.3976, "loss_ce": 0.3199715316295624, "loss_lvr": 1.0258945226669312, "loss_mode_switch": 0.0, "loss_total": 0.42256098985671997, "step": 994 }, { "batch_size": 4, "epoch": 0.3976, "step": 994, "tokens_per_device": 4252 }, { "epoch": 0.3976, "loss_ce": 0.12328917533159256, "loss_lvr": 1.1834990978240967, "loss_mode_switch": 0.0, "loss_total": 0.24163907766342163, "step": 994 }, { "epoch": 0.398, "grad_norm": 1.9767197370529175, "learning_rate": 6.849653238029261e-06, "loss": 0.3098, "step": 995 }, { "batch_size": 4, "epoch": 0.398, "step": 995, "tokens_per_device": 4332 }, { "epoch": 0.398, "loss_ce": 0.3787793219089508, "loss_lvr": 1.0415854454040527, "loss_mode_switch": 0.0, "loss_total": 0.48293787240982056, "step": 995 }, { "batch_size": 1, "epoch": 0.398, "step": 995, "tokens_per_device": 4880 }, { "epoch": 0.398, "loss_ce": 0.0011261629406362772, "loss_lvr": 0.34720340371131897, "loss_mode_switch": 0.0, "loss_total": 0.03584650531411171, "step": 995 }, { "batch_size": 4, "epoch": 0.398, "step": 995, "tokens_per_device": 3832 }, { "epoch": 0.398, "loss_ce": 0.1700817495584488, "loss_lvr": 0.9417942762374878, "loss_mode_switch": 0.0, "loss_total": 0.2642611861228943, "step": 995 }, { "batch_size": 1, "epoch": 0.398, "step": 995, "tokens_per_device": 4166 }, { "epoch": 0.398, "loss_ce": 0.0021313223987817764, "loss_lvr": 0.6526251435279846, "loss_mode_switch": 0.0, "loss_total": 0.06739383935928345, "step": 995 }, { "batch_size": 1, "epoch": 0.398, "step": 995, "tokens_per_device": 5094 }, { "epoch": 0.398, "loss_ce": 0.01159489806741476, "loss_lvr": 0.38045641779899597, "loss_mode_switch": 0.0, "loss_total": 0.04964054003357887, "step": 995 }, { "batch_size": 4, "epoch": 0.398, "step": 995, "tokens_per_device": 4436 }, { "epoch": 0.398, "loss_ce": 0.4417973756790161, "loss_lvr": 0.9831279516220093, "loss_mode_switch": 0.0, "loss_total": 0.540110170841217, "step": 995 }, { "batch_size": 4, "epoch": 0.398, "step": 995, "tokens_per_device": 1436 }, { "epoch": 0.398, "loss_ce": 0.4260692596435547, "loss_lvr": 0.9093843698501587, "loss_mode_switch": 0.0, "loss_total": 0.5170077085494995, "step": 995 }, { "batch_size": 4, "epoch": 0.398, "step": 995, "tokens_per_device": 4248 }, { "epoch": 0.398, "loss_ce": 0.014724714681506157, "loss_lvr": 1.131807565689087, "loss_mode_switch": 0.0, "loss_total": 0.12790547311306, "step": 995 }, { "epoch": 0.3984, "grad_norm": 1.3620214462280273, "learning_rate": 6.843633695600802e-06, "loss": 0.3129, "step": 996 }, { "batch_size": 1, "epoch": 0.3984, "step": 996, "tokens_per_device": 5058 }, { "epoch": 0.3984, "loss_ce": 0.10501036792993546, "loss_lvr": 0.5351628065109253, "loss_mode_switch": 0.0, "loss_total": 0.15852664411067963, "step": 996 }, { "batch_size": 4, "epoch": 0.3984, "step": 996, "tokens_per_device": 3880 }, { "epoch": 0.3984, "loss_ce": 0.026828397065401077, "loss_lvr": 0.5879905223846436, "loss_mode_switch": 0.0, "loss_total": 0.08562745153903961, "step": 996 }, { "batch_size": 4, "epoch": 0.3984, "step": 996, "tokens_per_device": 4268 }, { "epoch": 0.3984, "loss_ce": 0.3028426766395569, "loss_lvr": 0.9102292656898499, "loss_mode_switch": 0.0, "loss_total": 0.3938656151294708, "step": 996 }, { "batch_size": 4, "epoch": 0.3984, "step": 996, "tokens_per_device": 13308 }, { "epoch": 0.3984, "loss_ce": 0.001959410961717367, "loss_lvr": 0.7302976250648499, "loss_mode_switch": 0.0, "loss_total": 0.07498917728662491, "step": 996 }, { "batch_size": 4, "epoch": 0.3984, "step": 996, "tokens_per_device": 4240 }, { "epoch": 0.3984, "loss_ce": 0.05755219608545303, "loss_lvr": 0.9520072937011719, "loss_mode_switch": 0.0, "loss_total": 0.15275293588638306, "step": 996 }, { "batch_size": 1, "epoch": 0.3984, "step": 996, "tokens_per_device": 4903 }, { "epoch": 0.3984, "loss_ce": 0.09908227622509003, "loss_lvr": 0.4035260081291199, "loss_mode_switch": 0.0, "loss_total": 0.13943487405776978, "step": 996 }, { "batch_size": 4, "epoch": 0.3984, "step": 996, "tokens_per_device": 5684 }, { "epoch": 0.3984, "loss_ce": 0.07887930423021317, "loss_lvr": 0.9095457196235657, "loss_mode_switch": 0.0, "loss_total": 0.16983386874198914, "step": 996 }, { "batch_size": 4, "epoch": 0.3984, "step": 996, "tokens_per_device": 2600 }, { "epoch": 0.3984, "loss_ce": 0.1383259892463684, "loss_lvr": 0.8136237859725952, "loss_mode_switch": 0.0, "loss_total": 0.21968837082386017, "step": 996 }, { "epoch": 0.3988, "grad_norm": 1.5070781707763672, "learning_rate": 6.837611058954858e-06, "loss": 0.3002, "step": 997 }, { "batch_size": 1, "epoch": 0.3988, "step": 997, "tokens_per_device": 4890 }, { "epoch": 0.3988, "loss_ce": 0.48700663447380066, "loss_lvr": 0.42459678649902344, "loss_mode_switch": 0.0, "loss_total": 0.5294663310050964, "step": 997 }, { "batch_size": 1, "epoch": 0.3988, "step": 997, "tokens_per_device": 4875 }, { "epoch": 0.3988, "loss_ce": 0.00723687931895256, "loss_lvr": 0.8129280805587769, "loss_mode_switch": 0.0, "loss_total": 0.08852969110012054, "step": 997 }, { "batch_size": 1, "epoch": 0.3988, "step": 997, "tokens_per_device": 4926 }, { "epoch": 0.3988, "loss_ce": 0.021153327077627182, "loss_lvr": 0.6435988545417786, "loss_mode_switch": 0.0, "loss_total": 0.08551321923732758, "step": 997 }, { "batch_size": 4, "epoch": 0.3988, "step": 997, "tokens_per_device": 4672 }, { "epoch": 0.3988, "loss_ce": 0.09301406145095825, "loss_lvr": 0.88550865650177, "loss_mode_switch": 0.0, "loss_total": 0.18156492710113525, "step": 997 }, { "batch_size": 4, "epoch": 0.3988, "step": 997, "tokens_per_device": 4384 }, { "epoch": 0.3988, "loss_ce": 0.07006649672985077, "loss_lvr": 1.2509586811065674, "loss_mode_switch": 0.0, "loss_total": 0.19516237080097198, "step": 997 }, { "batch_size": 4, "epoch": 0.3988, "step": 997, "tokens_per_device": 1400 }, { "epoch": 0.3988, "loss_ce": 0.5003994107246399, "loss_lvr": 0.9942870140075684, "loss_mode_switch": 0.0, "loss_total": 0.5998281240463257, "step": 997 }, { "batch_size": 4, "epoch": 0.3988, "step": 997, "tokens_per_device": 4484 }, { "epoch": 0.3988, "loss_ce": 0.23420493304729462, "loss_lvr": 0.9066810011863708, "loss_mode_switch": 0.0, "loss_total": 0.32487303018569946, "step": 997 }, { "batch_size": 1, "epoch": 0.3988, "step": 997, "tokens_per_device": 4884 }, { "epoch": 0.3988, "loss_ce": 0.23741258680820465, "loss_lvr": 0.2586955726146698, "loss_mode_switch": 0.0, "loss_total": 0.2632821500301361, "step": 997 }, { "epoch": 0.3992, "grad_norm": 1.4767091274261475, "learning_rate": 6.831585338199375e-06, "loss": 0.2838, "step": 998 }, { "batch_size": 4, "epoch": 0.3992, "step": 998, "tokens_per_device": 4232 }, { "epoch": 0.3992, "loss_ce": 0.4330381453037262, "loss_lvr": 1.2307112216949463, "loss_mode_switch": 0.0, "loss_total": 0.5561092495918274, "step": 998 }, { "batch_size": 1, "epoch": 0.3992, "step": 998, "tokens_per_device": 4887 }, { "epoch": 0.3992, "loss_ce": 0.1759335994720459, "loss_lvr": 0.5936910510063171, "loss_mode_switch": 0.0, "loss_total": 0.23530270159244537, "step": 998 }, { "batch_size": 4, "epoch": 0.3992, "step": 998, "tokens_per_device": 5472 }, { "epoch": 0.3992, "loss_ce": 0.4022100269794464, "loss_lvr": 0.5345773696899414, "loss_mode_switch": 0.0, "loss_total": 0.45566776394844055, "step": 998 }, { "batch_size": 1, "epoch": 0.3992, "step": 998, "tokens_per_device": 5117 }, { "epoch": 0.3992, "loss_ce": 0.004159464966505766, "loss_lvr": 0.433914452791214, "loss_mode_switch": 0.0, "loss_total": 0.04755091294646263, "step": 998 }, { "batch_size": 4, "epoch": 0.3992, "step": 998, "tokens_per_device": 4140 }, { "epoch": 0.3992, "loss_ce": 0.3750561773777008, "loss_lvr": 0.9015036225318909, "loss_mode_switch": 0.0, "loss_total": 0.4652065336704254, "step": 998 }, { "batch_size": 4, "epoch": 0.3992, "step": 998, "tokens_per_device": 8492 }, { "epoch": 0.3992, "loss_ce": 0.023817213252186775, "loss_lvr": 0.8375223875045776, "loss_mode_switch": 0.0, "loss_total": 0.10756944864988327, "step": 998 }, { "batch_size": 4, "epoch": 0.3992, "step": 998, "tokens_per_device": 4416 }, { "epoch": 0.3992, "loss_ce": 0.11671438813209534, "loss_lvr": 0.9540022611618042, "loss_mode_switch": 0.0, "loss_total": 0.212114617228508, "step": 998 }, { "batch_size": 4, "epoch": 0.3992, "step": 998, "tokens_per_device": 2664 }, { "epoch": 0.3992, "loss_ce": 0.06925126910209656, "loss_lvr": 0.8230225443840027, "loss_mode_switch": 0.0, "loss_total": 0.15155352652072906, "step": 998 }, { "epoch": 0.3996, "grad_norm": 1.2151737213134766, "learning_rate": 6.825556543447476e-06, "loss": 0.2981, "step": 999 }, { "batch_size": 4, "epoch": 0.3996, "step": 999, "tokens_per_device": 4584 }, { "epoch": 0.3996, "loss_ce": 0.438424289226532, "loss_lvr": 0.814018964767456, "loss_mode_switch": 0.0, "loss_total": 0.5198261737823486, "step": 999 }, { "batch_size": 4, "epoch": 0.3996, "step": 999, "tokens_per_device": 7048 }, { "epoch": 0.3996, "loss_ce": 0.5053809881210327, "loss_lvr": 0.9451563358306885, "loss_mode_switch": 0.0, "loss_total": 0.5998966097831726, "step": 999 }, { "batch_size": 1, "epoch": 0.3996, "step": 999, "tokens_per_device": 4875 }, { "epoch": 0.3996, "loss_ce": 0.023341434076428413, "loss_lvr": 0.4440535306930542, "loss_mode_switch": 0.0, "loss_total": 0.06774678826332092, "step": 999 }, { "batch_size": 4, "epoch": 0.3996, "step": 999, "tokens_per_device": 3768 }, { "epoch": 0.3996, "loss_ce": 0.43562397360801697, "loss_lvr": 0.9905261397361755, "loss_mode_switch": 0.0, "loss_total": 0.5346766114234924, "step": 999 }, { "batch_size": 1, "epoch": 0.3996, "step": 999, "tokens_per_device": 5038 }, { "epoch": 0.3996, "loss_ce": 0.16744542121887207, "loss_lvr": 0.5081032514572144, "loss_mode_switch": 0.0, "loss_total": 0.21825574338436127, "step": 999 }, { "batch_size": 4, "epoch": 0.3996, "step": 999, "tokens_per_device": 4668 }, { "epoch": 0.3996, "loss_ce": 0.24928756058216095, "loss_lvr": 0.8812459111213684, "loss_mode_switch": 0.0, "loss_total": 0.33741214871406555, "step": 999 }, { "batch_size": 4, "epoch": 0.3996, "step": 999, "tokens_per_device": 3964 }, { "epoch": 0.3996, "loss_ce": 0.012619547545909882, "loss_lvr": 0.6358755826950073, "loss_mode_switch": 0.0, "loss_total": 0.07620710879564285, "step": 999 }, { "batch_size": 4, "epoch": 0.3996, "step": 999, "tokens_per_device": 4912 }, { "epoch": 0.3996, "loss_ce": 0.006017869338393211, "loss_lvr": 0.7590722441673279, "loss_mode_switch": 0.0, "loss_total": 0.08192509412765503, "step": 999 }, { "epoch": 0.4, "grad_norm": 1.363625407218933, "learning_rate": 6.819524684817439e-06, "loss": 0.3128, "step": 1000 }, { "batch_size": 4, "epoch": 0.4, "step": 1000, "tokens_per_device": 5260 }, { "epoch": 0.4, "loss_ce": 0.40649083256721497, "loss_lvr": 0.9100536704063416, "loss_mode_switch": 0.0, "loss_total": 0.49749618768692017, "step": 1000 }, { "batch_size": 4, "epoch": 0.4, "step": 1000, "tokens_per_device": 3796 }, { "epoch": 0.4, "loss_ce": 0.2528855502605438, "loss_lvr": 0.7274863719940186, "loss_mode_switch": 0.0, "loss_total": 0.3256341814994812, "step": 1000 }, { "batch_size": 4, "epoch": 0.4, "step": 1000, "tokens_per_device": 11068 }, { "epoch": 0.4, "loss_ce": 0.03969332575798035, "loss_lvr": 0.5749158263206482, "loss_mode_switch": 0.0, "loss_total": 0.0971849113702774, "step": 1000 }, { "batch_size": 4, "epoch": 0.4, "step": 1000, "tokens_per_device": 1472 }, { "epoch": 0.4, "loss_ce": 0.33401528000831604, "loss_lvr": 1.1750843524932861, "loss_mode_switch": 0.0, "loss_total": 0.45152372121810913, "step": 1000 }, { "batch_size": 4, "epoch": 0.4, "step": 1000, "tokens_per_device": 1580 }, { "epoch": 0.4, "loss_ce": 0.18497338891029358, "loss_lvr": 1.0148457288742065, "loss_mode_switch": 0.0, "loss_total": 0.28645795583724976, "step": 1000 }, { "batch_size": 4, "epoch": 0.4, "step": 1000, "tokens_per_device": 2920 }, { "epoch": 0.4, "loss_ce": 0.10001419484615326, "loss_lvr": 0.7188662886619568, "loss_mode_switch": 0.0, "loss_total": 0.17190082371234894, "step": 1000 }, { "batch_size": 4, "epoch": 0.4, "step": 1000, "tokens_per_device": 3204 }, { "epoch": 0.4, "loss_ce": 0.24293561279773712, "loss_lvr": 1.0208733081817627, "loss_mode_switch": 0.0, "loss_total": 0.34502294659614563, "step": 1000 }, { "batch_size": 4, "epoch": 0.4, "step": 1000, "tokens_per_device": 4324 }, { "epoch": 0.4, "loss_ce": 0.3093753755092621, "loss_lvr": 0.859203040599823, "loss_mode_switch": 0.0, "loss_total": 0.3952956795692444, "step": 1000 }, { "epoch": 0.4004, "grad_norm": 1.2036546468734741, "learning_rate": 6.8134897724326846e-06, "loss": 0.275, "step": 1001 }, { "batch_size": 4, "epoch": 0.4004, "step": 1001, "tokens_per_device": 2892 }, { "epoch": 0.4004, "loss_ce": 0.33685171604156494, "loss_lvr": 0.9000573754310608, "loss_mode_switch": 0.0, "loss_total": 0.42685747146606445, "step": 1001 }, { "batch_size": 4, "epoch": 0.4004, "step": 1001, "tokens_per_device": 6920 }, { "epoch": 0.4004, "loss_ce": 0.1062488779425621, "loss_lvr": 0.9447477459907532, "loss_mode_switch": 0.0, "loss_total": 0.20072364807128906, "step": 1001 }, { "batch_size": 4, "epoch": 0.4004, "step": 1001, "tokens_per_device": 3752 }, { "epoch": 0.4004, "loss_ce": 0.1689455211162567, "loss_lvr": 1.0666964054107666, "loss_mode_switch": 0.0, "loss_total": 0.2756151556968689, "step": 1001 }, { "batch_size": 4, "epoch": 0.4004, "step": 1001, "tokens_per_device": 1252 }, { "epoch": 0.4004, "loss_ce": 0.16526024043560028, "loss_lvr": 1.1827822923660278, "loss_mode_switch": 0.0, "loss_total": 0.28353846073150635, "step": 1001 }, { "batch_size": 1, "epoch": 0.4004, "step": 1001, "tokens_per_device": 4899 }, { "epoch": 0.4004, "loss_ce": 0.07653883099555969, "loss_lvr": 0.3818179666996002, "loss_mode_switch": 0.0, "loss_total": 0.11472062766551971, "step": 1001 }, { "batch_size": 4, "epoch": 0.4004, "step": 1001, "tokens_per_device": 2732 }, { "epoch": 0.4004, "loss_ce": 0.25571444630622864, "loss_lvr": 0.6498335003852844, "loss_mode_switch": 0.0, "loss_total": 0.3206977844238281, "step": 1001 }, { "batch_size": 4, "epoch": 0.4004, "step": 1001, "tokens_per_device": 12736 }, { "epoch": 0.4004, "loss_ce": 0.12453556060791016, "loss_lvr": 1.0553536415100098, "loss_mode_switch": 0.0, "loss_total": 0.23007091879844666, "step": 1001 }, { "batch_size": 4, "epoch": 0.4004, "step": 1001, "tokens_per_device": 2700 }, { "epoch": 0.4004, "loss_ce": 0.5268036127090454, "loss_lvr": 0.9645638465881348, "loss_mode_switch": 0.0, "loss_total": 0.6232600212097168, "step": 1001 }, { "epoch": 0.4008, "grad_norm": 1.2702258825302124, "learning_rate": 6.807451816421762e-06, "loss": 0.3396, "step": 1002 }, { "batch_size": 4, "epoch": 0.4008, "step": 1002, "tokens_per_device": 3316 }, { "epoch": 0.4008, "loss_ce": 0.08856900781393051, "loss_lvr": 1.161770224571228, "loss_mode_switch": 0.0, "loss_total": 0.2047460377216339, "step": 1002 }, { "batch_size": 1, "epoch": 0.4008, "step": 1002, "tokens_per_device": 4910 }, { "epoch": 0.4008, "loss_ce": 0.037516526877880096, "loss_lvr": 0.7277138233184814, "loss_mode_switch": 0.0, "loss_total": 0.11028791218996048, "step": 1002 }, { "batch_size": 4, "epoch": 0.4008, "step": 1002, "tokens_per_device": 1680 }, { "epoch": 0.4008, "loss_ce": 0.32728397846221924, "loss_lvr": 1.1386528015136719, "loss_mode_switch": 0.0, "loss_total": 0.4411492645740509, "step": 1002 }, { "batch_size": 1, "epoch": 0.4008, "step": 1002, "tokens_per_device": 4907 }, { "epoch": 0.4008, "loss_ce": 0.06405523419380188, "loss_lvr": 0.6570438146591187, "loss_mode_switch": 0.0, "loss_total": 0.12975960969924927, "step": 1002 }, { "batch_size": 4, "epoch": 0.4008, "step": 1002, "tokens_per_device": 16028 }, { "epoch": 0.4008, "loss_ce": 0.259921669960022, "loss_lvr": 0.9210453033447266, "loss_mode_switch": 0.0, "loss_total": 0.35202619433403015, "step": 1002 }, { "batch_size": 1, "epoch": 0.4008, "step": 1002, "tokens_per_device": 4870 }, { "epoch": 0.4008, "loss_ce": 0.019766274839639664, "loss_lvr": 1.218678593635559, "loss_mode_switch": 0.0, "loss_total": 0.14163413643836975, "step": 1002 }, { "batch_size": 4, "epoch": 0.4008, "step": 1002, "tokens_per_device": 8500 }, { "epoch": 0.4008, "loss_ce": 0.2747937738895416, "loss_lvr": 0.9119816422462463, "loss_mode_switch": 0.0, "loss_total": 0.3659919500350952, "step": 1002 }, { "batch_size": 4, "epoch": 0.4008, "step": 1002, "tokens_per_device": 4360 }, { "epoch": 0.4008, "loss_ce": 0.1489170491695404, "loss_lvr": 0.7895693778991699, "loss_mode_switch": 0.0, "loss_total": 0.22787398099899292, "step": 1002 }, { "epoch": 0.4012, "grad_norm": 1.2996602058410645, "learning_rate": 6.801410826918327e-06, "loss": 0.3232, "step": 1003 }, { "batch_size": 1, "epoch": 0.4012, "step": 1003, "tokens_per_device": 5165 }, { "epoch": 0.4012, "loss_ce": 0.0014041687827557325, "loss_lvr": 0.7992857098579407, "loss_mode_switch": 0.0, "loss_total": 0.0813327357172966, "step": 1003 }, { "batch_size": 4, "epoch": 0.4012, "step": 1003, "tokens_per_device": 1408 }, { "epoch": 0.4012, "loss_ce": 0.48955583572387695, "loss_lvr": 1.2839630842208862, "loss_mode_switch": 0.0, "loss_total": 0.6179521679878235, "step": 1003 }, { "batch_size": 4, "epoch": 0.4012, "step": 1003, "tokens_per_device": 4756 }, { "epoch": 0.4012, "loss_ce": 0.033342041075229645, "loss_lvr": 0.796051025390625, "loss_mode_switch": 0.0, "loss_total": 0.11294714361429214, "step": 1003 }, { "batch_size": 1, "epoch": 0.4012, "step": 1003, "tokens_per_device": 5059 }, { "epoch": 0.4012, "loss_ce": 0.0056927623227238655, "loss_lvr": 0.5332337617874146, "loss_mode_switch": 0.0, "loss_total": 0.059016138315200806, "step": 1003 }, { "batch_size": 4, "epoch": 0.4012, "step": 1003, "tokens_per_device": 10828 }, { "epoch": 0.4012, "loss_ce": 0.3737267553806305, "loss_lvr": 0.7837535738945007, "loss_mode_switch": 0.0, "loss_total": 0.4521021246910095, "step": 1003 }, { "batch_size": 4, "epoch": 0.4012, "step": 1003, "tokens_per_device": 13756 }, { "epoch": 0.4012, "loss_ce": 0.12966617941856384, "loss_lvr": 0.8056383728981018, "loss_mode_switch": 0.0, "loss_total": 0.2102300226688385, "step": 1003 }, { "batch_size": 1, "epoch": 0.4012, "step": 1003, "tokens_per_device": 5853 }, { "epoch": 0.4012, "loss_ce": 0.008662653155624866, "loss_lvr": 0.36258795857429504, "loss_mode_switch": 0.0, "loss_total": 0.044921450316905975, "step": 1003 }, { "batch_size": 4, "epoch": 0.4012, "step": 1003, "tokens_per_device": 2756 }, { "epoch": 0.4012, "loss_ce": 0.6320943236351013, "loss_lvr": 0.6627116799354553, "loss_mode_switch": 0.0, "loss_total": 0.6983655095100403, "step": 1003 }, { "epoch": 0.4016, "grad_norm": 1.2601573467254639, "learning_rate": 6.7953668140611264e-06, "loss": 0.2914, "step": 1004 }, { "batch_size": 4, "epoch": 0.4016, "step": 1004, "tokens_per_device": 4156 }, { "epoch": 0.4016, "loss_ce": 0.11989350616931915, "loss_lvr": 1.0113517045974731, "loss_mode_switch": 0.0, "loss_total": 0.22102868556976318, "step": 1004 }, { "batch_size": 4, "epoch": 0.4016, "step": 1004, "tokens_per_device": 5180 }, { "epoch": 0.4016, "loss_ce": 0.13540945947170258, "loss_lvr": 0.7597231268882751, "loss_mode_switch": 0.0, "loss_total": 0.21138176321983337, "step": 1004 }, { "batch_size": 4, "epoch": 0.4016, "step": 1004, "tokens_per_device": 3752 }, { "epoch": 0.4016, "loss_ce": 0.30561161041259766, "loss_lvr": 0.8334059715270996, "loss_mode_switch": 0.0, "loss_total": 0.38895219564437866, "step": 1004 }, { "batch_size": 4, "epoch": 0.4016, "step": 1004, "tokens_per_device": 1336 }, { "epoch": 0.4016, "loss_ce": 0.7696217894554138, "loss_lvr": 1.106197476387024, "loss_mode_switch": 0.0, "loss_total": 0.8802415132522583, "step": 1004 }, { "batch_size": 4, "epoch": 0.4016, "step": 1004, "tokens_per_device": 13620 }, { "epoch": 0.4016, "loss_ce": 0.15208600461483002, "loss_lvr": 0.7949216961860657, "loss_mode_switch": 0.0, "loss_total": 0.23157817125320435, "step": 1004 }, { "batch_size": 4, "epoch": 0.4016, "step": 1004, "tokens_per_device": 1576 }, { "epoch": 0.4016, "loss_ce": 0.2597201466560364, "loss_lvr": 0.7357763648033142, "loss_mode_switch": 0.0, "loss_total": 0.3332977890968323, "step": 1004 }, { "batch_size": 4, "epoch": 0.4016, "step": 1004, "tokens_per_device": 16272 }, { "epoch": 0.4016, "loss_ce": 0.07134449481964111, "loss_lvr": 0.6091100573539734, "loss_mode_switch": 0.0, "loss_total": 0.13225549459457397, "step": 1004 }, { "batch_size": 1, "epoch": 0.4016, "step": 1004, "tokens_per_device": 5090 }, { "epoch": 0.4016, "loss_ce": 0.008712363429367542, "loss_lvr": 0.9567106366157532, "loss_mode_switch": 0.0, "loss_total": 0.1043834313750267, "step": 1004 }, { "epoch": 0.402, "grad_norm": 1.521959900856018, "learning_rate": 6.78931978799398e-06, "loss": 0.3359, "step": 1005 }, { "batch_size": 4, "epoch": 0.402, "step": 1005, "tokens_per_device": 1376 }, { "epoch": 0.402, "loss_ce": 0.6157352328300476, "loss_lvr": 0.9241423606872559, "loss_mode_switch": 0.0, "loss_total": 0.7081494927406311, "step": 1005 }, { "batch_size": 4, "epoch": 0.402, "step": 1005, "tokens_per_device": 1336 }, { "epoch": 0.402, "loss_ce": 0.07225765287876129, "loss_lvr": 1.2074991464614868, "loss_mode_switch": 0.0, "loss_total": 0.19300755858421326, "step": 1005 }, { "batch_size": 4, "epoch": 0.402, "step": 1005, "tokens_per_device": 5668 }, { "epoch": 0.402, "loss_ce": 0.025121552869677544, "loss_lvr": 0.9144638776779175, "loss_mode_switch": 0.0, "loss_total": 0.1165679469704628, "step": 1005 }, { "batch_size": 4, "epoch": 0.402, "step": 1005, "tokens_per_device": 4580 }, { "epoch": 0.402, "loss_ce": 0.35042688250541687, "loss_lvr": 0.8118572235107422, "loss_mode_switch": 0.0, "loss_total": 0.43161261081695557, "step": 1005 }, { "batch_size": 4, "epoch": 0.402, "step": 1005, "tokens_per_device": 3764 }, { "epoch": 0.402, "loss_ce": 0.04913104325532913, "loss_lvr": 0.7804723381996155, "loss_mode_switch": 0.0, "loss_total": 0.12717828154563904, "step": 1005 }, { "batch_size": 4, "epoch": 0.402, "step": 1005, "tokens_per_device": 2752 }, { "epoch": 0.402, "loss_ce": 0.1991620659828186, "loss_lvr": 1.7779992818832397, "loss_mode_switch": 0.0, "loss_total": 0.37696200609207153, "step": 1005 }, { "batch_size": 4, "epoch": 0.402, "step": 1005, "tokens_per_device": 3916 }, { "epoch": 0.402, "loss_ce": 0.6966482400894165, "loss_lvr": 0.9539644718170166, "loss_mode_switch": 0.0, "loss_total": 0.7920446991920471, "step": 1005 }, { "batch_size": 1, "epoch": 0.402, "step": 1005, "tokens_per_device": 4249 }, { "epoch": 0.402, "loss_ce": 0.011498766951262951, "loss_lvr": 0.40241777896881104, "loss_mode_switch": 0.0, "loss_total": 0.05174054577946663, "step": 1005 }, { "epoch": 0.4024, "grad_norm": 1.3298488855361938, "learning_rate": 6.783269758865768e-06, "loss": 0.287, "step": 1006 }, { "batch_size": 4, "epoch": 0.4024, "step": 1006, "tokens_per_device": 4228 }, { "epoch": 0.4024, "loss_ce": 0.37863487005233765, "loss_lvr": 0.8040429353713989, "loss_mode_switch": 0.0, "loss_total": 0.4590391516685486, "step": 1006 }, { "batch_size": 4, "epoch": 0.4024, "step": 1006, "tokens_per_device": 5016 }, { "epoch": 0.4024, "loss_ce": 0.3005748689174652, "loss_lvr": 0.8689408898353577, "loss_mode_switch": 0.0, "loss_total": 0.38746896386146545, "step": 1006 }, { "batch_size": 4, "epoch": 0.4024, "step": 1006, "tokens_per_device": 4068 }, { "epoch": 0.4024, "loss_ce": 0.2715506851673126, "loss_lvr": 0.8654952049255371, "loss_mode_switch": 0.0, "loss_total": 0.35810020565986633, "step": 1006 }, { "batch_size": 1, "epoch": 0.4024, "step": 1006, "tokens_per_device": 4880 }, { "epoch": 0.4024, "loss_ce": 0.07191600650548935, "loss_lvr": 1.2368379831314087, "loss_mode_switch": 0.0, "loss_total": 0.19559980928897858, "step": 1006 }, { "batch_size": 1, "epoch": 0.4024, "step": 1006, "tokens_per_device": 5046 }, { "epoch": 0.4024, "loss_ce": 0.07030592113733292, "loss_lvr": 0.31441637873649597, "loss_mode_switch": 0.0, "loss_total": 0.1017475575208664, "step": 1006 }, { "batch_size": 4, "epoch": 0.4024, "step": 1006, "tokens_per_device": 4292 }, { "epoch": 0.4024, "loss_ce": 0.07877527177333832, "loss_lvr": 0.8756508231163025, "loss_mode_switch": 0.0, "loss_total": 0.16634035110473633, "step": 1006 }, { "batch_size": 4, "epoch": 0.4024, "step": 1006, "tokens_per_device": 4392 }, { "epoch": 0.4024, "loss_ce": 0.3666568398475647, "loss_lvr": 0.7752206921577454, "loss_mode_switch": 0.0, "loss_total": 0.44417890906333923, "step": 1006 }, { "batch_size": 4, "epoch": 0.4024, "step": 1006, "tokens_per_device": 4464 }, { "epoch": 0.4024, "loss_ce": 0.22721070051193237, "loss_lvr": 0.7240629196166992, "loss_mode_switch": 0.0, "loss_total": 0.2996169924736023, "step": 1006 }, { "epoch": 0.4028, "grad_norm": 1.329482913017273, "learning_rate": 6.777216736830409e-06, "loss": 0.3187, "step": 1007 }, { "batch_size": 4, "epoch": 0.4028, "step": 1007, "tokens_per_device": 4240 }, { "epoch": 0.4028, "loss_ce": 0.14259058237075806, "loss_lvr": 0.9421015381813049, "loss_mode_switch": 0.0, "loss_total": 0.23680073022842407, "step": 1007 }, { "batch_size": 4, "epoch": 0.4028, "step": 1007, "tokens_per_device": 1460 }, { "epoch": 0.4028, "loss_ce": 0.6023052930831909, "loss_lvr": 0.9781608581542969, "loss_mode_switch": 0.0, "loss_total": 0.7001214027404785, "step": 1007 }, { "batch_size": 4, "epoch": 0.4028, "step": 1007, "tokens_per_device": 7364 }, { "epoch": 0.4028, "loss_ce": 0.4997096359729767, "loss_lvr": 0.9046297073364258, "loss_mode_switch": 0.0, "loss_total": 0.5901725888252258, "step": 1007 }, { "batch_size": 4, "epoch": 0.4028, "step": 1007, "tokens_per_device": 5740 }, { "epoch": 0.4028, "loss_ce": 0.3609620928764343, "loss_lvr": 1.0002861022949219, "loss_mode_switch": 0.0, "loss_total": 0.46099069714546204, "step": 1007 }, { "batch_size": 4, "epoch": 0.4028, "step": 1007, "tokens_per_device": 3900 }, { "epoch": 0.4028, "loss_ce": 0.5040139555931091, "loss_lvr": 0.9608151316642761, "loss_mode_switch": 0.0, "loss_total": 0.6000954508781433, "step": 1007 }, { "batch_size": 4, "epoch": 0.4028, "step": 1007, "tokens_per_device": 3800 }, { "epoch": 0.4028, "loss_ce": 0.025783495977520943, "loss_lvr": 1.0127137899398804, "loss_mode_switch": 0.0, "loss_total": 0.12705488502979279, "step": 1007 }, { "batch_size": 4, "epoch": 0.4028, "step": 1007, "tokens_per_device": 4232 }, { "epoch": 0.4028, "loss_ce": 0.7548355460166931, "loss_lvr": 0.9135351777076721, "loss_mode_switch": 0.0, "loss_total": 0.8461890816688538, "step": 1007 }, { "batch_size": 4, "epoch": 0.4028, "step": 1007, "tokens_per_device": 7056 }, { "epoch": 0.4028, "loss_ce": 0.13366733491420746, "loss_lvr": 0.663888156414032, "loss_mode_switch": 0.0, "loss_total": 0.20005615055561066, "step": 1007 }, { "epoch": 0.4032, "grad_norm": 1.345774531364441, "learning_rate": 6.77116073204684e-06, "loss": 0.3443, "step": 1008 }, { "batch_size": 4, "epoch": 0.4032, "step": 1008, "tokens_per_device": 1484 }, { "epoch": 0.4032, "loss_ce": 0.32319238781929016, "loss_lvr": 0.9461266994476318, "loss_mode_switch": 0.0, "loss_total": 0.4178050756454468, "step": 1008 }, { "batch_size": 1, "epoch": 0.4032, "step": 1008, "tokens_per_device": 5245 }, { "epoch": 0.4032, "loss_ce": 0.12318779528141022, "loss_lvr": 0.4887135624885559, "loss_mode_switch": 0.0, "loss_total": 0.17205914855003357, "step": 1008 }, { "batch_size": 4, "epoch": 0.4032, "step": 1008, "tokens_per_device": 4212 }, { "epoch": 0.4032, "loss_ce": 0.20337869226932526, "loss_lvr": 0.829084575176239, "loss_mode_switch": 0.0, "loss_total": 0.2862871587276459, "step": 1008 }, { "batch_size": 1, "epoch": 0.4032, "step": 1008, "tokens_per_device": 5137 }, { "epoch": 0.4032, "loss_ce": 0.018502529710531235, "loss_lvr": 0.24701355397701263, "loss_mode_switch": 0.0, "loss_total": 0.04320388287305832, "step": 1008 }, { "batch_size": 1, "epoch": 0.4032, "step": 1008, "tokens_per_device": 4930 }, { "epoch": 0.4032, "loss_ce": 0.059643883258104324, "loss_lvr": 0.3704114854335785, "loss_mode_switch": 0.0, "loss_total": 0.09668503701686859, "step": 1008 }, { "batch_size": 4, "epoch": 0.4032, "step": 1008, "tokens_per_device": 4252 }, { "epoch": 0.4032, "loss_ce": 0.27498146891593933, "loss_lvr": 0.7062747478485107, "loss_mode_switch": 0.0, "loss_total": 0.3456089496612549, "step": 1008 }, { "batch_size": 4, "epoch": 0.4032, "step": 1008, "tokens_per_device": 4580 }, { "epoch": 0.4032, "loss_ce": 0.0635552778840065, "loss_lvr": 0.6564967036247253, "loss_mode_switch": 0.0, "loss_total": 0.12920495867729187, "step": 1008 }, { "batch_size": 4, "epoch": 0.4032, "step": 1008, "tokens_per_device": 6576 }, { "epoch": 0.4032, "loss_ce": 0.05435175821185112, "loss_lvr": 1.0885716676712036, "loss_mode_switch": 0.0, "loss_total": 0.16320893168449402, "step": 1008 }, { "epoch": 0.4036, "grad_norm": 1.439813494682312, "learning_rate": 6.765101754679015e-06, "loss": 0.3161, "step": 1009 }, { "batch_size": 4, "epoch": 0.4036, "step": 1009, "tokens_per_device": 14228 }, { "epoch": 0.4036, "loss_ce": 0.4332914650440216, "loss_lvr": 0.4747593402862549, "loss_mode_switch": 0.0, "loss_total": 0.4807673990726471, "step": 1009 }, { "batch_size": 4, "epoch": 0.4036, "step": 1009, "tokens_per_device": 2796 }, { "epoch": 0.4036, "loss_ce": 0.6263940930366516, "loss_lvr": 0.8068837523460388, "loss_mode_switch": 0.0, "loss_total": 0.7070824503898621, "step": 1009 }, { "batch_size": 1, "epoch": 0.4036, "step": 1009, "tokens_per_device": 5261 }, { "epoch": 0.4036, "loss_ce": 0.004771755542606115, "loss_lvr": 0.4539478123188019, "loss_mode_switch": 0.0, "loss_total": 0.0501665361225605, "step": 1009 }, { "batch_size": 4, "epoch": 0.4036, "step": 1009, "tokens_per_device": 3736 }, { "epoch": 0.4036, "loss_ce": 0.25644227862358093, "loss_lvr": 0.8114745616912842, "loss_mode_switch": 0.0, "loss_total": 0.33758974075317383, "step": 1009 }, { "batch_size": 1, "epoch": 0.4036, "step": 1009, "tokens_per_device": 5037 }, { "epoch": 0.4036, "loss_ce": 0.012631603516638279, "loss_lvr": 0.7905037999153137, "loss_mode_switch": 0.0, "loss_total": 0.09168198704719543, "step": 1009 }, { "batch_size": 4, "epoch": 0.4036, "step": 1009, "tokens_per_device": 3780 }, { "epoch": 0.4036, "loss_ce": 0.2797890305519104, "loss_lvr": 0.8981223106384277, "loss_mode_switch": 0.0, "loss_total": 0.3696012496948242, "step": 1009 }, { "batch_size": 4, "epoch": 0.4036, "step": 1009, "tokens_per_device": 4352 }, { "epoch": 0.4036, "loss_ce": 0.10363583266735077, "loss_lvr": 0.9332129955291748, "loss_mode_switch": 0.0, "loss_total": 0.19695714116096497, "step": 1009 }, { "batch_size": 4, "epoch": 0.4036, "step": 1009, "tokens_per_device": 4268 }, { "epoch": 0.4036, "loss_ce": 0.06887224316596985, "loss_lvr": 0.8484910726547241, "loss_mode_switch": 0.0, "loss_total": 0.15372136235237122, "step": 1009 }, { "epoch": 0.404, "grad_norm": 1.5423190593719482, "learning_rate": 6.7590398148958625e-06, "loss": 0.2961, "step": 1010 }, { "batch_size": 4, "epoch": 0.404, "step": 1010, "tokens_per_device": 4352 }, { "epoch": 0.404, "loss_ce": 0.15462175011634827, "loss_lvr": 1.1322109699249268, "loss_mode_switch": 0.0, "loss_total": 0.2678428590297699, "step": 1010 }, { "batch_size": 4, "epoch": 0.404, "step": 1010, "tokens_per_device": 11072 }, { "epoch": 0.404, "loss_ce": 0.2236298769712448, "loss_lvr": 0.8321583271026611, "loss_mode_switch": 0.0, "loss_total": 0.3068457245826721, "step": 1010 }, { "batch_size": 4, "epoch": 0.404, "step": 1010, "tokens_per_device": 5148 }, { "epoch": 0.404, "loss_ce": 0.37475451827049255, "loss_lvr": 0.8427305221557617, "loss_mode_switch": 0.0, "loss_total": 0.45902758836746216, "step": 1010 }, { "batch_size": 1, "epoch": 0.404, "step": 1010, "tokens_per_device": 5154 }, { "epoch": 0.404, "loss_ce": 0.0013085035607218742, "loss_lvr": 0.2893407642841339, "loss_mode_switch": 0.0, "loss_total": 0.03024258092045784, "step": 1010 }, { "batch_size": 4, "epoch": 0.404, "step": 1010, "tokens_per_device": 5424 }, { "epoch": 0.404, "loss_ce": 0.13820794224739075, "loss_lvr": 0.7959982752799988, "loss_mode_switch": 0.0, "loss_total": 0.21780776977539062, "step": 1010 }, { "batch_size": 4, "epoch": 0.404, "step": 1010, "tokens_per_device": 6040 }, { "epoch": 0.404, "loss_ce": 0.13878317177295685, "loss_lvr": 1.4480702877044678, "loss_mode_switch": 0.0, "loss_total": 0.2835901975631714, "step": 1010 }, { "batch_size": 4, "epoch": 0.404, "step": 1010, "tokens_per_device": 10700 }, { "epoch": 0.404, "loss_ce": 0.00297872768715024, "loss_lvr": 0.8417232036590576, "loss_mode_switch": 0.0, "loss_total": 0.08715105056762695, "step": 1010 }, { "batch_size": 4, "epoch": 0.404, "step": 1010, "tokens_per_device": 4240 }, { "epoch": 0.404, "loss_ce": 0.6614899635314941, "loss_lvr": 0.9793646931648254, "loss_mode_switch": 0.0, "loss_total": 0.7594264149665833, "step": 1010 }, { "epoch": 0.4044, "grad_norm": 1.093377709388733, "learning_rate": 6.7529749228712994e-06, "loss": 0.2843, "step": 1011 }, { "batch_size": 4, "epoch": 0.4044, "step": 1011, "tokens_per_device": 4316 }, { "epoch": 0.4044, "loss_ce": 0.17635072767734528, "loss_lvr": 0.768700897693634, "loss_mode_switch": 0.0, "loss_total": 0.2532208263874054, "step": 1011 }, { "batch_size": 4, "epoch": 0.4044, "step": 1011, "tokens_per_device": 15224 }, { "epoch": 0.4044, "loss_ce": 0.7536008954048157, "loss_lvr": 0.5883575677871704, "loss_mode_switch": 0.0, "loss_total": 0.8124366402626038, "step": 1011 }, { "batch_size": 4, "epoch": 0.4044, "step": 1011, "tokens_per_device": 5440 }, { "epoch": 0.4044, "loss_ce": 0.1904405951499939, "loss_lvr": 0.800043523311615, "loss_mode_switch": 0.0, "loss_total": 0.27044495940208435, "step": 1011 }, { "batch_size": 4, "epoch": 0.4044, "step": 1011, "tokens_per_device": 10796 }, { "epoch": 0.4044, "loss_ce": 0.632692277431488, "loss_lvr": 1.0556522607803345, "loss_mode_switch": 0.0, "loss_total": 0.7382575273513794, "step": 1011 }, { "batch_size": 1, "epoch": 0.4044, "step": 1011, "tokens_per_device": 5168 }, { "epoch": 0.4044, "loss_ce": 0.011233167722821236, "loss_lvr": 0.2826744616031647, "loss_mode_switch": 0.0, "loss_total": 0.03950061649084091, "step": 1011 }, { "batch_size": 4, "epoch": 0.4044, "step": 1011, "tokens_per_device": 5928 }, { "epoch": 0.4044, "loss_ce": 0.13429798185825348, "loss_lvr": 0.6950010657310486, "loss_mode_switch": 0.0, "loss_total": 0.2037980854511261, "step": 1011 }, { "batch_size": 4, "epoch": 0.4044, "step": 1011, "tokens_per_device": 3884 }, { "epoch": 0.4044, "loss_ce": 0.19347690045833588, "loss_lvr": 0.9671805500984192, "loss_mode_switch": 0.0, "loss_total": 0.29019495844841003, "step": 1011 }, { "batch_size": 4, "epoch": 0.4044, "step": 1011, "tokens_per_device": 3804 }, { "epoch": 0.4044, "loss_ce": 0.2349543422460556, "loss_lvr": 1.1628565788269043, "loss_mode_switch": 0.0, "loss_total": 0.35124000906944275, "step": 1011 }, { "epoch": 0.4048, "grad_norm": 1.1717313528060913, "learning_rate": 6.746907088784182e-06, "loss": 0.2774, "step": 1012 }, { "batch_size": 1, "epoch": 0.4048, "step": 1012, "tokens_per_device": 5316 }, { "epoch": 0.4048, "loss_ce": 0.11419945955276489, "loss_lvr": 0.3684690296649933, "loss_mode_switch": 0.0, "loss_total": 0.15104636549949646, "step": 1012 }, { "batch_size": 4, "epoch": 0.4048, "step": 1012, "tokens_per_device": 4064 }, { "epoch": 0.4048, "loss_ce": 0.46673864126205444, "loss_lvr": 0.8242018818855286, "loss_mode_switch": 0.0, "loss_total": 0.5491588115692139, "step": 1012 }, { "batch_size": 4, "epoch": 0.4048, "step": 1012, "tokens_per_device": 4540 }, { "epoch": 0.4048, "loss_ce": 0.00857479963451624, "loss_lvr": 0.7037346959114075, "loss_mode_switch": 0.0, "loss_total": 0.07894826680421829, "step": 1012 }, { "batch_size": 1, "epoch": 0.4048, "step": 1012, "tokens_per_device": 6558 }, { "epoch": 0.4048, "loss_ce": 0.123597651720047, "loss_lvr": 0.38711950182914734, "loss_mode_switch": 0.0, "loss_total": 0.16230960190296173, "step": 1012 }, { "batch_size": 4, "epoch": 0.4048, "step": 1012, "tokens_per_device": 5228 }, { "epoch": 0.4048, "loss_ce": 0.08080103993415833, "loss_lvr": 0.8875316977500916, "loss_mode_switch": 0.0, "loss_total": 0.169554203748703, "step": 1012 }, { "batch_size": 4, "epoch": 0.4048, "step": 1012, "tokens_per_device": 1428 }, { "epoch": 0.4048, "loss_ce": 0.26737844944000244, "loss_lvr": 1.0090070962905884, "loss_mode_switch": 0.0, "loss_total": 0.3682791590690613, "step": 1012 }, { "batch_size": 4, "epoch": 0.4048, "step": 1012, "tokens_per_device": 3776 }, { "epoch": 0.4048, "loss_ce": 0.3448896110057831, "loss_lvr": 1.0495840311050415, "loss_mode_switch": 0.0, "loss_total": 0.4498480260372162, "step": 1012 }, { "batch_size": 1, "epoch": 0.4048, "step": 1012, "tokens_per_device": 5196 }, { "epoch": 0.4048, "loss_ce": 0.035109762102365494, "loss_lvr": 0.6127638816833496, "loss_mode_switch": 0.0, "loss_total": 0.0963861495256424, "step": 1012 }, { "epoch": 0.4052, "grad_norm": 1.2382947206497192, "learning_rate": 6.740836322818314e-06, "loss": 0.2965, "step": 1013 }, { "batch_size": 4, "epoch": 0.4052, "step": 1013, "tokens_per_device": 4724 }, { "epoch": 0.4052, "loss_ce": 0.17795701324939728, "loss_lvr": 0.8103017210960388, "loss_mode_switch": 0.0, "loss_total": 0.2589871883392334, "step": 1013 }, { "batch_size": 4, "epoch": 0.4052, "step": 1013, "tokens_per_device": 4300 }, { "epoch": 0.4052, "loss_ce": 0.3297325670719147, "loss_lvr": 0.9677176475524902, "loss_mode_switch": 0.0, "loss_total": 0.42650434374809265, "step": 1013 }, { "batch_size": 4, "epoch": 0.4052, "step": 1013, "tokens_per_device": 7072 }, { "epoch": 0.4052, "loss_ce": 0.17476128041744232, "loss_lvr": 0.7366930842399597, "loss_mode_switch": 0.0, "loss_total": 0.24843057990074158, "step": 1013 }, { "batch_size": 4, "epoch": 0.4052, "step": 1013, "tokens_per_device": 2800 }, { "epoch": 0.4052, "loss_ce": 0.16782300174236298, "loss_lvr": 0.7939254641532898, "loss_mode_switch": 0.0, "loss_total": 0.24721553921699524, "step": 1013 }, { "batch_size": 4, "epoch": 0.4052, "step": 1013, "tokens_per_device": 1172 }, { "epoch": 0.4052, "loss_ce": 0.49650031328201294, "loss_lvr": 1.3953949213027954, "loss_mode_switch": 0.0, "loss_total": 0.6360397934913635, "step": 1013 }, { "batch_size": 4, "epoch": 0.4052, "step": 1013, "tokens_per_device": 4036 }, { "epoch": 0.4052, "loss_ce": 0.0047933971509337425, "loss_lvr": 1.0163626670837402, "loss_mode_switch": 0.0, "loss_total": 0.10642966628074646, "step": 1013 }, { "batch_size": 4, "epoch": 0.4052, "step": 1013, "tokens_per_device": 1272 }, { "epoch": 0.4052, "loss_ce": 0.2819245457649231, "loss_lvr": 0.9419938921928406, "loss_mode_switch": 0.0, "loss_total": 0.37612393498420715, "step": 1013 }, { "batch_size": 4, "epoch": 0.4052, "step": 1013, "tokens_per_device": 1944 }, { "epoch": 0.4052, "loss_ce": 0.4664188623428345, "loss_lvr": 1.7018216848373413, "loss_mode_switch": 0.0, "loss_total": 0.6366010308265686, "step": 1013 }, { "epoch": 0.4056, "grad_norm": 1.2818129062652588, "learning_rate": 6.734762635162417e-06, "loss": 0.3317, "step": 1014 }, { "batch_size": 4, "epoch": 0.4056, "step": 1014, "tokens_per_device": 3776 }, { "epoch": 0.4056, "loss_ce": 0.07076151669025421, "loss_lvr": 0.7433140277862549, "loss_mode_switch": 0.0, "loss_total": 0.1450929194688797, "step": 1014 }, { "batch_size": 4, "epoch": 0.4056, "step": 1014, "tokens_per_device": 3996 }, { "epoch": 0.4056, "loss_ce": 0.02127637155354023, "loss_lvr": 0.8919112086296082, "loss_mode_switch": 0.0, "loss_total": 0.11046749353408813, "step": 1014 }, { "batch_size": 1, "epoch": 0.4056, "step": 1014, "tokens_per_device": 4872 }, { "epoch": 0.4056, "loss_ce": 0.21060694754123688, "loss_lvr": 0.23666028678417206, "loss_mode_switch": 0.0, "loss_total": 0.23427297174930573, "step": 1014 }, { "batch_size": 1, "epoch": 0.4056, "step": 1014, "tokens_per_device": 5025 }, { "epoch": 0.4056, "loss_ce": 0.029428018257021904, "loss_lvr": 0.45725125074386597, "loss_mode_switch": 0.0, "loss_total": 0.07515314221382141, "step": 1014 }, { "batch_size": 4, "epoch": 0.4056, "step": 1014, "tokens_per_device": 2700 }, { "epoch": 0.4056, "loss_ce": 0.20355729758739471, "loss_lvr": 1.3820085525512695, "loss_mode_switch": 0.0, "loss_total": 0.3417581617832184, "step": 1014 }, { "batch_size": 4, "epoch": 0.4056, "step": 1014, "tokens_per_device": 4800 }, { "epoch": 0.4056, "loss_ce": 0.4217482805252075, "loss_lvr": 0.6678282022476196, "loss_mode_switch": 0.0, "loss_total": 0.48853111267089844, "step": 1014 }, { "batch_size": 4, "epoch": 0.4056, "step": 1014, "tokens_per_device": 11344 }, { "epoch": 0.4056, "loss_ce": 0.08633258938789368, "loss_lvr": 0.8763847351074219, "loss_mode_switch": 0.0, "loss_total": 0.1739710569381714, "step": 1014 }, { "batch_size": 4, "epoch": 0.4056, "step": 1014, "tokens_per_device": 2672 }, { "epoch": 0.4056, "loss_ce": 0.5118682980537415, "loss_lvr": 0.9134358167648315, "loss_mode_switch": 0.0, "loss_total": 0.6032118797302246, "step": 1014 }, { "epoch": 0.406, "grad_norm": 1.428195595741272, "learning_rate": 6.728686036010115e-06, "loss": 0.3116, "step": 1015 }, { "batch_size": 1, "epoch": 0.406, "step": 1015, "tokens_per_device": 5183 }, { "epoch": 0.406, "loss_ce": 0.19046437740325928, "loss_lvr": 0.6707233190536499, "loss_mode_switch": 0.0, "loss_total": 0.25753670930862427, "step": 1015 }, { "batch_size": 4, "epoch": 0.406, "step": 1015, "tokens_per_device": 5580 }, { "epoch": 0.406, "loss_ce": 0.12975487112998962, "loss_lvr": 0.8421914577484131, "loss_mode_switch": 0.0, "loss_total": 0.2139740288257599, "step": 1015 }, { "batch_size": 4, "epoch": 0.406, "step": 1015, "tokens_per_device": 4428 }, { "epoch": 0.406, "loss_ce": 0.08126389980316162, "loss_lvr": 0.6517532467842102, "loss_mode_switch": 0.0, "loss_total": 0.14643922448158264, "step": 1015 }, { "batch_size": 4, "epoch": 0.406, "step": 1015, "tokens_per_device": 10752 }, { "epoch": 0.406, "loss_ce": 0.12288863211870193, "loss_lvr": 0.8311525583267212, "loss_mode_switch": 0.0, "loss_total": 0.20600388944149017, "step": 1015 }, { "batch_size": 4, "epoch": 0.406, "step": 1015, "tokens_per_device": 4468 }, { "epoch": 0.406, "loss_ce": 0.2340611070394516, "loss_lvr": 0.871181845664978, "loss_mode_switch": 0.0, "loss_total": 0.3211793005466461, "step": 1015 }, { "batch_size": 1, "epoch": 0.406, "step": 1015, "tokens_per_device": 5056 }, { "epoch": 0.406, "loss_ce": 0.002412423025816679, "loss_lvr": 0.4608084261417389, "loss_mode_switch": 0.0, "loss_total": 0.048493266105651855, "step": 1015 }, { "batch_size": 4, "epoch": 0.406, "step": 1015, "tokens_per_device": 4584 }, { "epoch": 0.406, "loss_ce": 0.2521204650402069, "loss_lvr": 0.6103606820106506, "loss_mode_switch": 0.0, "loss_total": 0.3131565451622009, "step": 1015 }, { "batch_size": 4, "epoch": 0.406, "step": 1015, "tokens_per_device": 4292 }, { "epoch": 0.406, "loss_ce": 0.3432598114013672, "loss_lvr": 0.9320035576820374, "loss_mode_switch": 0.0, "loss_total": 0.4364601671695709, "step": 1015 }, { "epoch": 0.4064, "grad_norm": 1.4337131977081299, "learning_rate": 6.7226065355599204e-06, "loss": 0.314, "step": 1016 }, { "batch_size": 4, "epoch": 0.4064, "step": 1016, "tokens_per_device": 1524 }, { "epoch": 0.4064, "loss_ce": 0.47379133105278015, "loss_lvr": 1.0269620418548584, "loss_mode_switch": 0.0, "loss_total": 0.5764875411987305, "step": 1016 }, { "batch_size": 1, "epoch": 0.4064, "step": 1016, "tokens_per_device": 5188 }, { "epoch": 0.4064, "loss_ce": 0.023938974365592003, "loss_lvr": 0.21683120727539062, "loss_mode_switch": 0.0, "loss_total": 0.045622095465660095, "step": 1016 }, { "batch_size": 4, "epoch": 0.4064, "step": 1016, "tokens_per_device": 3752 }, { "epoch": 0.4064, "loss_ce": 0.2412721812725067, "loss_lvr": 1.1041926145553589, "loss_mode_switch": 0.0, "loss_total": 0.35169145464897156, "step": 1016 }, { "batch_size": 4, "epoch": 0.4064, "step": 1016, "tokens_per_device": 4328 }, { "epoch": 0.4064, "loss_ce": 0.6133938431739807, "loss_lvr": 1.082468867301941, "loss_mode_switch": 0.0, "loss_total": 0.7216407060623169, "step": 1016 }, { "batch_size": 1, "epoch": 0.4064, "step": 1016, "tokens_per_device": 6377 }, { "epoch": 0.4064, "loss_ce": 0.0664857029914856, "loss_lvr": 0.31277382373809814, "loss_mode_switch": 0.0, "loss_total": 0.09776309132575989, "step": 1016 }, { "batch_size": 4, "epoch": 0.4064, "step": 1016, "tokens_per_device": 13992 }, { "epoch": 0.4064, "loss_ce": 0.19360746443271637, "loss_lvr": 0.4399966895580292, "loss_mode_switch": 0.0, "loss_total": 0.23760713636875153, "step": 1016 }, { "batch_size": 1, "epoch": 0.4064, "step": 1016, "tokens_per_device": 4882 }, { "epoch": 0.4064, "loss_ce": 0.04898687079548836, "loss_lvr": 0.2096252739429474, "loss_mode_switch": 0.0, "loss_total": 0.06994939595460892, "step": 1016 }, { "batch_size": 4, "epoch": 0.4064, "step": 1016, "tokens_per_device": 10564 }, { "epoch": 0.4064, "loss_ce": 0.016563329845666885, "loss_lvr": 0.7632642388343811, "loss_mode_switch": 0.0, "loss_total": 0.09288975596427917, "step": 1016 }, { "epoch": 0.4068, "grad_norm": 1.3831284046173096, "learning_rate": 6.716524144015212e-06, "loss": 0.3131, "step": 1017 }, { "batch_size": 1, "epoch": 0.4068, "step": 1017, "tokens_per_device": 5165 }, { "epoch": 0.4068, "loss_ce": 0.0008207837818190455, "loss_lvr": 0.5234777927398682, "loss_mode_switch": 0.0, "loss_total": 0.053168561309576035, "step": 1017 }, { "batch_size": 4, "epoch": 0.4068, "step": 1017, "tokens_per_device": 4292 }, { "epoch": 0.4068, "loss_ce": 0.22948019206523895, "loss_lvr": 0.9844632148742676, "loss_mode_switch": 0.0, "loss_total": 0.32792651653289795, "step": 1017 }, { "batch_size": 4, "epoch": 0.4068, "step": 1017, "tokens_per_device": 4432 }, { "epoch": 0.4068, "loss_ce": 0.3637962341308594, "loss_lvr": 0.7729551792144775, "loss_mode_switch": 0.0, "loss_total": 0.44109174609184265, "step": 1017 }, { "batch_size": 4, "epoch": 0.4068, "step": 1017, "tokens_per_device": 1608 }, { "epoch": 0.4068, "loss_ce": 0.506851077079773, "loss_lvr": 0.9691652655601501, "loss_mode_switch": 0.0, "loss_total": 0.6037676334381104, "step": 1017 }, { "batch_size": 4, "epoch": 0.4068, "step": 1017, "tokens_per_device": 12688 }, { "epoch": 0.4068, "loss_ce": 0.04261022061109543, "loss_lvr": 0.4821352958679199, "loss_mode_switch": 0.0, "loss_total": 0.09082375466823578, "step": 1017 }, { "batch_size": 1, "epoch": 0.4068, "step": 1017, "tokens_per_device": 4834 }, { "epoch": 0.4068, "loss_ce": 0.0015588526148349047, "loss_lvr": 1.0051518678665161, "loss_mode_switch": 0.0, "loss_total": 0.1020740419626236, "step": 1017 }, { "batch_size": 4, "epoch": 0.4068, "step": 1017, "tokens_per_device": 6544 }, { "epoch": 0.4068, "loss_ce": 0.06709763407707214, "loss_lvr": 0.6644506454467773, "loss_mode_switch": 0.0, "loss_total": 0.13354270160198212, "step": 1017 }, { "batch_size": 1, "epoch": 0.4068, "step": 1017, "tokens_per_device": 4883 }, { "epoch": 0.4068, "loss_ce": 0.01175798662006855, "loss_lvr": 1.7314403057098389, "loss_mode_switch": 0.0, "loss_total": 0.18490201234817505, "step": 1017 }, { "epoch": 0.4072, "grad_norm": 1.2767760753631592, "learning_rate": 6.710438871584225e-06, "loss": 0.2704, "step": 1018 }, { "batch_size": 4, "epoch": 0.4072, "step": 1018, "tokens_per_device": 6120 }, { "epoch": 0.4072, "loss_ce": 0.4490338861942291, "loss_lvr": 0.7197777032852173, "loss_mode_switch": 0.0, "loss_total": 0.5210116505622864, "step": 1018 }, { "batch_size": 1, "epoch": 0.4072, "step": 1018, "tokens_per_device": 5683 }, { "epoch": 0.4072, "loss_ce": 0.0005395713960751891, "loss_lvr": 0.36530745029449463, "loss_mode_switch": 0.0, "loss_total": 0.03707031533122063, "step": 1018 }, { "batch_size": 1, "epoch": 0.4072, "step": 1018, "tokens_per_device": 7425 }, { "epoch": 0.4072, "loss_ce": 0.04221674054861069, "loss_lvr": 0.35521307587623596, "loss_mode_switch": 0.0, "loss_total": 0.07773804664611816, "step": 1018 }, { "batch_size": 4, "epoch": 0.4072, "step": 1018, "tokens_per_device": 4000 }, { "epoch": 0.4072, "loss_ce": 0.19471502304077148, "loss_lvr": 0.7854179739952087, "loss_mode_switch": 0.0, "loss_total": 0.2732568383216858, "step": 1018 }, { "batch_size": 4, "epoch": 0.4072, "step": 1018, "tokens_per_device": 4712 }, { "epoch": 0.4072, "loss_ce": 0.025303957983851433, "loss_lvr": 0.920272171497345, "loss_mode_switch": 0.0, "loss_total": 0.11733117699623108, "step": 1018 }, { "batch_size": 4, "epoch": 0.4072, "step": 1018, "tokens_per_device": 4172 }, { "epoch": 0.4072, "loss_ce": 0.1471889317035675, "loss_lvr": 0.5501462817192078, "loss_mode_switch": 0.0, "loss_total": 0.20220355689525604, "step": 1018 }, { "batch_size": 1, "epoch": 0.4072, "step": 1018, "tokens_per_device": 5875 }, { "epoch": 0.4072, "loss_ce": 0.10785657912492752, "loss_lvr": 0.39517778158187866, "loss_mode_switch": 0.0, "loss_total": 0.14737436175346375, "step": 1018 }, { "batch_size": 4, "epoch": 0.4072, "step": 1018, "tokens_per_device": 2648 }, { "epoch": 0.4072, "loss_ce": 0.2856929898262024, "loss_lvr": 0.7414253950119019, "loss_mode_switch": 0.0, "loss_total": 0.35983553528785706, "step": 1018 }, { "epoch": 0.4076, "grad_norm": 1.2685011625289917, "learning_rate": 6.704350728480026e-06, "loss": 0.3445, "step": 1019 }, { "batch_size": 4, "epoch": 0.4076, "step": 1019, "tokens_per_device": 6360 }, { "epoch": 0.4076, "loss_ce": 0.24569332599639893, "loss_lvr": 0.6758729815483093, "loss_mode_switch": 0.0, "loss_total": 0.3132806420326233, "step": 1019 }, { "batch_size": 1, "epoch": 0.4076, "step": 1019, "tokens_per_device": 5101 }, { "epoch": 0.4076, "loss_ce": 0.5438030958175659, "loss_lvr": 0.5378231406211853, "loss_mode_switch": 0.0, "loss_total": 0.5975854396820068, "step": 1019 }, { "batch_size": 4, "epoch": 0.4076, "step": 1019, "tokens_per_device": 4240 }, { "epoch": 0.4076, "loss_ce": 0.26214417815208435, "loss_lvr": 0.827613353729248, "loss_mode_switch": 0.0, "loss_total": 0.3449055254459381, "step": 1019 }, { "batch_size": 4, "epoch": 0.4076, "step": 1019, "tokens_per_device": 1300 }, { "epoch": 0.4076, "loss_ce": 0.3812853693962097, "loss_lvr": 1.163442850112915, "loss_mode_switch": 0.0, "loss_total": 0.49762964248657227, "step": 1019 }, { "batch_size": 4, "epoch": 0.4076, "step": 1019, "tokens_per_device": 3996 }, { "epoch": 0.4076, "loss_ce": 0.2448100745677948, "loss_lvr": 0.9058230519294739, "loss_mode_switch": 0.0, "loss_total": 0.33539238572120667, "step": 1019 }, { "batch_size": 4, "epoch": 0.4076, "step": 1019, "tokens_per_device": 4372 }, { "epoch": 0.4076, "loss_ce": 0.004183425102382898, "loss_lvr": 0.6949108242988586, "loss_mode_switch": 0.0, "loss_total": 0.0736745074391365, "step": 1019 }, { "batch_size": 4, "epoch": 0.4076, "step": 1019, "tokens_per_device": 4328 }, { "epoch": 0.4076, "loss_ce": 0.7091825604438782, "loss_lvr": 0.7975708246231079, "loss_mode_switch": 0.0, "loss_total": 0.7889396548271179, "step": 1019 }, { "batch_size": 4, "epoch": 0.4076, "step": 1019, "tokens_per_device": 1320 }, { "epoch": 0.4076, "loss_ce": 0.8686839938163757, "loss_lvr": 0.88999342918396, "loss_mode_switch": 0.0, "loss_total": 0.9576833248138428, "step": 1019 }, { "epoch": 0.408, "grad_norm": 1.6942862272262573, "learning_rate": 6.698259724920503e-06, "loss": 0.3709, "step": 1020 }, { "batch_size": 4, "epoch": 0.408, "step": 1020, "tokens_per_device": 5712 }, { "epoch": 0.408, "loss_ce": 0.12047584354877472, "loss_lvr": 0.6445002555847168, "loss_mode_switch": 0.0, "loss_total": 0.1849258691072464, "step": 1020 }, { "batch_size": 4, "epoch": 0.408, "step": 1020, "tokens_per_device": 4820 }, { "epoch": 0.408, "loss_ce": 0.39731407165527344, "loss_lvr": 0.870018720626831, "loss_mode_switch": 0.0, "loss_total": 0.4843159317970276, "step": 1020 }, { "batch_size": 4, "epoch": 0.408, "step": 1020, "tokens_per_device": 14024 }, { "epoch": 0.408, "loss_ce": 0.3991888165473938, "loss_lvr": 1.0049328804016113, "loss_mode_switch": 0.0, "loss_total": 0.49968209862709045, "step": 1020 }, { "batch_size": 4, "epoch": 0.408, "step": 1020, "tokens_per_device": 3768 }, { "epoch": 0.408, "loss_ce": 0.1354645937681198, "loss_lvr": 0.8284228444099426, "loss_mode_switch": 0.0, "loss_total": 0.21830686926841736, "step": 1020 }, { "batch_size": 4, "epoch": 0.408, "step": 1020, "tokens_per_device": 4572 }, { "epoch": 0.408, "loss_ce": 0.3364158570766449, "loss_lvr": 0.8117093443870544, "loss_mode_switch": 0.0, "loss_total": 0.4175868034362793, "step": 1020 }, { "batch_size": 4, "epoch": 0.408, "step": 1020, "tokens_per_device": 4696 }, { "epoch": 0.408, "loss_ce": 0.42092350125312805, "loss_lvr": 0.838089644908905, "loss_mode_switch": 0.0, "loss_total": 0.5047324895858765, "step": 1020 }, { "batch_size": 4, "epoch": 0.408, "step": 1020, "tokens_per_device": 4204 }, { "epoch": 0.408, "loss_ce": 0.21114255487918854, "loss_lvr": 1.0233328342437744, "loss_mode_switch": 0.0, "loss_total": 0.3134758472442627, "step": 1020 }, { "batch_size": 1, "epoch": 0.408, "step": 1020, "tokens_per_device": 4747 }, { "epoch": 0.408, "loss_ce": 0.0011490440228953958, "loss_lvr": 0.6307803988456726, "loss_mode_switch": 0.0, "loss_total": 0.06422708183526993, "step": 1020 }, { "epoch": 0.4084, "grad_norm": 1.3107216358184814, "learning_rate": 6.69216587112834e-06, "loss": 0.3409, "step": 1021 }, { "batch_size": 4, "epoch": 0.4084, "step": 1021, "tokens_per_device": 2796 }, { "epoch": 0.4084, "loss_ce": 0.434019535779953, "loss_lvr": 1.0757883787155151, "loss_mode_switch": 0.0, "loss_total": 0.541598379611969, "step": 1021 }, { "batch_size": 1, "epoch": 0.4084, "step": 1021, "tokens_per_device": 5153 }, { "epoch": 0.4084, "loss_ce": 0.5036652088165283, "loss_lvr": 0.2937246561050415, "loss_mode_switch": 0.0, "loss_total": 0.5330376625061035, "step": 1021 }, { "batch_size": 4, "epoch": 0.4084, "step": 1021, "tokens_per_device": 3776 }, { "epoch": 0.4084, "loss_ce": 0.0673823282122612, "loss_lvr": 1.0348362922668457, "loss_mode_switch": 0.0, "loss_total": 0.1708659529685974, "step": 1021 }, { "batch_size": 4, "epoch": 0.4084, "step": 1021, "tokens_per_device": 1416 }, { "epoch": 0.4084, "loss_ce": 0.8066301345825195, "loss_lvr": 0.8900889754295349, "loss_mode_switch": 0.0, "loss_total": 0.8956390619277954, "step": 1021 }, { "batch_size": 4, "epoch": 0.4084, "step": 1021, "tokens_per_device": 9268 }, { "epoch": 0.4084, "loss_ce": 0.4704184830188751, "loss_lvr": 0.5909974575042725, "loss_mode_switch": 0.0, "loss_total": 0.5295182466506958, "step": 1021 }, { "batch_size": 4, "epoch": 0.4084, "step": 1021, "tokens_per_device": 5248 }, { "epoch": 0.4084, "loss_ce": 0.05784393101930618, "loss_lvr": 0.8242461681365967, "loss_mode_switch": 0.0, "loss_total": 0.14026854932308197, "step": 1021 }, { "batch_size": 4, "epoch": 0.4084, "step": 1021, "tokens_per_device": 1292 }, { "epoch": 0.4084, "loss_ce": 0.35018888115882874, "loss_lvr": 0.9604271054267883, "loss_mode_switch": 0.0, "loss_total": 0.4462316036224365, "step": 1021 }, { "batch_size": 1, "epoch": 0.4084, "step": 1021, "tokens_per_device": 5082 }, { "epoch": 0.4084, "loss_ce": 0.18160852789878845, "loss_lvr": 0.34589269757270813, "loss_mode_switch": 0.0, "loss_total": 0.21619780361652374, "step": 1021 }, { "epoch": 0.4088, "grad_norm": 1.4105418920516968, "learning_rate": 6.686069177331009e-06, "loss": 0.3464, "step": 1022 }, { "batch_size": 4, "epoch": 0.4088, "step": 1022, "tokens_per_device": 5308 }, { "epoch": 0.4088, "loss_ce": 0.30667802691459656, "loss_lvr": 0.7105579376220703, "loss_mode_switch": 0.0, "loss_total": 0.37773382663726807, "step": 1022 }, { "batch_size": 4, "epoch": 0.4088, "step": 1022, "tokens_per_device": 4980 }, { "epoch": 0.4088, "loss_ce": 0.45623284578323364, "loss_lvr": 0.6668013334274292, "loss_mode_switch": 0.0, "loss_total": 0.5229129791259766, "step": 1022 }, { "batch_size": 4, "epoch": 0.4088, "step": 1022, "tokens_per_device": 4652 }, { "epoch": 0.4088, "loss_ce": 0.1522132307291031, "loss_lvr": 0.8428086042404175, "loss_mode_switch": 0.0, "loss_total": 0.23649409413337708, "step": 1022 }, { "batch_size": 4, "epoch": 0.4088, "step": 1022, "tokens_per_device": 5372 }, { "epoch": 0.4088, "loss_ce": 0.036710213869810104, "loss_lvr": 0.8339749574661255, "loss_mode_switch": 0.0, "loss_total": 0.12010771036148071, "step": 1022 }, { "batch_size": 4, "epoch": 0.4088, "step": 1022, "tokens_per_device": 4868 }, { "epoch": 0.4088, "loss_ce": 0.07145117968320847, "loss_lvr": 0.8065723180770874, "loss_mode_switch": 0.0, "loss_total": 0.15210841596126556, "step": 1022 }, { "batch_size": 4, "epoch": 0.4088, "step": 1022, "tokens_per_device": 2692 }, { "epoch": 0.4088, "loss_ce": 0.49407365918159485, "loss_lvr": 0.6170207858085632, "loss_mode_switch": 0.0, "loss_total": 0.5557757616043091, "step": 1022 }, { "batch_size": 1, "epoch": 0.4088, "step": 1022, "tokens_per_device": 5575 }, { "epoch": 0.4088, "loss_ce": 0.042641814798116684, "loss_lvr": 0.5252514481544495, "loss_mode_switch": 0.0, "loss_total": 0.09516695886850357, "step": 1022 }, { "batch_size": 4, "epoch": 0.4088, "step": 1022, "tokens_per_device": 4288 }, { "epoch": 0.4088, "loss_ce": 0.13377255201339722, "loss_lvr": 0.8657355308532715, "loss_mode_switch": 0.0, "loss_total": 0.2203461080789566, "step": 1022 }, { "epoch": 0.4092, "grad_norm": 1.426489233970642, "learning_rate": 6.679969653760747e-06, "loss": 0.2958, "step": 1023 }, { "batch_size": 4, "epoch": 0.4092, "step": 1023, "tokens_per_device": 4216 }, { "epoch": 0.4092, "loss_ce": 0.5029773116111755, "loss_lvr": 0.6458110213279724, "loss_mode_switch": 0.0, "loss_total": 0.5675584077835083, "step": 1023 }, { "batch_size": 4, "epoch": 0.4092, "step": 1023, "tokens_per_device": 3792 }, { "epoch": 0.4092, "loss_ce": 0.29260945320129395, "loss_lvr": 0.9982646703720093, "loss_mode_switch": 0.0, "loss_total": 0.3924359083175659, "step": 1023 }, { "batch_size": 4, "epoch": 0.4092, "step": 1023, "tokens_per_device": 8336 }, { "epoch": 0.4092, "loss_ce": 0.24749265611171722, "loss_lvr": 0.8370664119720459, "loss_mode_switch": 0.0, "loss_total": 0.3311992883682251, "step": 1023 }, { "batch_size": 4, "epoch": 0.4092, "step": 1023, "tokens_per_device": 4220 }, { "epoch": 0.4092, "loss_ce": 0.06258951127529144, "loss_lvr": 0.896208643913269, "loss_mode_switch": 0.0, "loss_total": 0.15221038460731506, "step": 1023 }, { "batch_size": 4, "epoch": 0.4092, "step": 1023, "tokens_per_device": 5052 }, { "epoch": 0.4092, "loss_ce": 0.14471308887004852, "loss_lvr": 0.7705478072166443, "loss_mode_switch": 0.0, "loss_total": 0.2217678725719452, "step": 1023 }, { "batch_size": 4, "epoch": 0.4092, "step": 1023, "tokens_per_device": 16364 }, { "epoch": 0.4092, "loss_ce": 0.059468433260917664, "loss_lvr": 0.8839350342750549, "loss_mode_switch": 0.0, "loss_total": 0.14786192774772644, "step": 1023 }, { "batch_size": 4, "epoch": 0.4092, "step": 1023, "tokens_per_device": 4316 }, { "epoch": 0.4092, "loss_ce": 0.15050934255123138, "loss_lvr": 1.0372697114944458, "loss_mode_switch": 0.0, "loss_total": 0.2542363107204437, "step": 1023 }, { "batch_size": 4, "epoch": 0.4092, "step": 1023, "tokens_per_device": 3304 }, { "epoch": 0.4092, "loss_ce": 0.6132491827011108, "loss_lvr": 1.1196703910827637, "loss_mode_switch": 0.0, "loss_total": 0.7252162098884583, "step": 1023 }, { "epoch": 0.4096, "grad_norm": 1.6218565702438354, "learning_rate": 6.673867310654538e-06, "loss": 0.3518, "step": 1024 }, { "batch_size": 4, "epoch": 0.4096, "step": 1024, "tokens_per_device": 3936 }, { "epoch": 0.4096, "loss_ce": 0.49896013736724854, "loss_lvr": 1.0338449478149414, "loss_mode_switch": 0.0, "loss_total": 0.6023446321487427, "step": 1024 }, { "batch_size": 4, "epoch": 0.4096, "step": 1024, "tokens_per_device": 4808 }, { "epoch": 0.4096, "loss_ce": 0.19059999287128448, "loss_lvr": 0.7207003831863403, "loss_mode_switch": 0.0, "loss_total": 0.26267004013061523, "step": 1024 }, { "batch_size": 1, "epoch": 0.4096, "step": 1024, "tokens_per_device": 5119 }, { "epoch": 0.4096, "loss_ce": 0.750148355960846, "loss_lvr": 0.7277922630310059, "loss_mode_switch": 0.0, "loss_total": 0.8229275941848755, "step": 1024 }, { "batch_size": 1, "epoch": 0.4096, "step": 1024, "tokens_per_device": 5106 }, { "epoch": 0.4096, "loss_ce": 0.009168531745672226, "loss_lvr": 0.7255878448486328, "loss_mode_switch": 0.0, "loss_total": 0.08172731101512909, "step": 1024 }, { "batch_size": 4, "epoch": 0.4096, "step": 1024, "tokens_per_device": 2924 }, { "epoch": 0.4096, "loss_ce": 0.36022597551345825, "loss_lvr": 0.8414335250854492, "loss_mode_switch": 0.0, "loss_total": 0.4443693161010742, "step": 1024 }, { "batch_size": 4, "epoch": 0.4096, "step": 1024, "tokens_per_device": 4292 }, { "epoch": 0.4096, "loss_ce": 0.3738802373409271, "loss_lvr": 0.7546014189720154, "loss_mode_switch": 0.0, "loss_total": 0.4493403732776642, "step": 1024 }, { "batch_size": 1, "epoch": 0.4096, "step": 1024, "tokens_per_device": 4733 }, { "epoch": 0.4096, "loss_ce": 0.006048532202839851, "loss_lvr": 0.35080456733703613, "loss_mode_switch": 0.0, "loss_total": 0.04112899303436279, "step": 1024 }, { "batch_size": 4, "epoch": 0.4096, "step": 1024, "tokens_per_device": 1700 }, { "epoch": 0.4096, "loss_ce": 0.041599467396736145, "loss_lvr": 1.0032771825790405, "loss_mode_switch": 0.0, "loss_total": 0.14192718267440796, "step": 1024 }, { "epoch": 0.41, "grad_norm": 1.2702311277389526, "learning_rate": 6.667762158254104e-06, "loss": 0.3017, "step": 1025 }, { "batch_size": 4, "epoch": 0.41, "step": 1025, "tokens_per_device": 4076 }, { "epoch": 0.41, "loss_ce": 0.11509092152118683, "loss_lvr": 0.9065759181976318, "loss_mode_switch": 0.0, "loss_total": 0.20574851334095, "step": 1025 }, { "batch_size": 4, "epoch": 0.41, "step": 1025, "tokens_per_device": 5632 }, { "epoch": 0.41, "loss_ce": 0.04585728049278259, "loss_lvr": 0.825913667678833, "loss_mode_switch": 0.0, "loss_total": 0.12844865024089813, "step": 1025 }, { "batch_size": 1, "epoch": 0.41, "step": 1025, "tokens_per_device": 4872 }, { "epoch": 0.41, "loss_ce": 0.3139936923980713, "loss_lvr": 0.21924646198749542, "loss_mode_switch": 0.0, "loss_total": 0.3359183371067047, "step": 1025 }, { "batch_size": 1, "epoch": 0.41, "step": 1025, "tokens_per_device": 5090 }, { "epoch": 0.41, "loss_ce": 0.012315674684941769, "loss_lvr": 0.48482468724250793, "loss_mode_switch": 0.0, "loss_total": 0.060798145830631256, "step": 1025 }, { "batch_size": 4, "epoch": 0.41, "step": 1025, "tokens_per_device": 7736 }, { "epoch": 0.41, "loss_ce": 0.3976166546344757, "loss_lvr": 0.779735803604126, "loss_mode_switch": 0.0, "loss_total": 0.47559022903442383, "step": 1025 }, { "batch_size": 4, "epoch": 0.41, "step": 1025, "tokens_per_device": 2584 }, { "epoch": 0.41, "loss_ce": 0.7460587024688721, "loss_lvr": 2.6081643104553223, "loss_mode_switch": 0.0, "loss_total": 1.0068751573562622, "step": 1025 }, { "batch_size": 4, "epoch": 0.41, "step": 1025, "tokens_per_device": 3548 }, { "epoch": 0.41, "loss_ce": 0.05165465921163559, "loss_lvr": 0.6095556616783142, "loss_mode_switch": 0.0, "loss_total": 0.11261022090911865, "step": 1025 }, { "batch_size": 1, "epoch": 0.41, "step": 1025, "tokens_per_device": 5117 }, { "epoch": 0.41, "loss_ce": 0.10787460207939148, "loss_lvr": 0.481585830450058, "loss_mode_switch": 0.0, "loss_total": 0.15603318810462952, "step": 1025 }, { "epoch": 0.4104, "grad_norm": 1.3246833086013794, "learning_rate": 6.661654206805874e-06, "loss": 0.3121, "step": 1026 }, { "batch_size": 1, "epoch": 0.4104, "step": 1026, "tokens_per_device": 5226 }, { "epoch": 0.4104, "loss_ce": 0.0933021679520607, "loss_lvr": 0.4748387336730957, "loss_mode_switch": 0.0, "loss_total": 0.1407860368490219, "step": 1026 }, { "batch_size": 1, "epoch": 0.4104, "step": 1026, "tokens_per_device": 5072 }, { "epoch": 0.4104, "loss_ce": 0.10570617020130157, "loss_lvr": 1.1020190715789795, "loss_mode_switch": 0.0, "loss_total": 0.21590808033943176, "step": 1026 }, { "batch_size": 4, "epoch": 0.4104, "step": 1026, "tokens_per_device": 3012 }, { "epoch": 0.4104, "loss_ce": 0.18533021211624146, "loss_lvr": 0.5635201334953308, "loss_mode_switch": 0.0, "loss_total": 0.241682231426239, "step": 1026 }, { "batch_size": 4, "epoch": 0.4104, "step": 1026, "tokens_per_device": 4324 }, { "epoch": 0.4104, "loss_ce": 0.033016182482242584, "loss_lvr": 0.7959622740745544, "loss_mode_switch": 0.0, "loss_total": 0.11261241137981415, "step": 1026 }, { "batch_size": 1, "epoch": 0.4104, "step": 1026, "tokens_per_device": 4890 }, { "epoch": 0.4104, "loss_ce": 0.13114364445209503, "loss_lvr": 0.17938312888145447, "loss_mode_switch": 0.0, "loss_total": 0.14908196032047272, "step": 1026 }, { "batch_size": 4, "epoch": 0.4104, "step": 1026, "tokens_per_device": 4200 }, { "epoch": 0.4104, "loss_ce": 0.030189601704478264, "loss_lvr": 3.789059638977051, "loss_mode_switch": 0.0, "loss_total": 0.4090955853462219, "step": 1026 }, { "batch_size": 1, "epoch": 0.4104, "step": 1026, "tokens_per_device": 5067 }, { "epoch": 0.4104, "loss_ce": 0.0013929444830864668, "loss_lvr": 0.5500867366790771, "loss_mode_switch": 0.0, "loss_total": 0.05640162155032158, "step": 1026 }, { "batch_size": 1, "epoch": 0.4104, "step": 1026, "tokens_per_device": 5029 }, { "epoch": 0.4104, "loss_ce": 0.057392388582229614, "loss_lvr": 0.40752607583999634, "loss_mode_switch": 0.0, "loss_total": 0.09814499318599701, "step": 1026 }, { "epoch": 0.4108, "grad_norm": 1.2183281183242798, "learning_rate": 6.6555434665609806e-06, "loss": 0.2711, "step": 1027 }, { "batch_size": 4, "epoch": 0.4108, "step": 1027, "tokens_per_device": 4396 }, { "epoch": 0.4108, "loss_ce": 0.3000909388065338, "loss_lvr": 0.7759059071540833, "loss_mode_switch": 0.0, "loss_total": 0.37768152356147766, "step": 1027 }, { "batch_size": 4, "epoch": 0.4108, "step": 1027, "tokens_per_device": 4260 }, { "epoch": 0.4108, "loss_ce": 0.23855558037757874, "loss_lvr": 1.0354846715927124, "loss_mode_switch": 0.0, "loss_total": 0.34210404753685, "step": 1027 }, { "batch_size": 1, "epoch": 0.4108, "step": 1027, "tokens_per_device": 4967 }, { "epoch": 0.4108, "loss_ce": 0.01867004483938217, "loss_lvr": 0.34401148557662964, "loss_mode_switch": 0.0, "loss_total": 0.053071193397045135, "step": 1027 }, { "batch_size": 4, "epoch": 0.4108, "step": 1027, "tokens_per_device": 4856 }, { "epoch": 0.4108, "loss_ce": 0.21778006851673126, "loss_lvr": 1.2937339544296265, "loss_mode_switch": 0.0, "loss_total": 0.3471534848213196, "step": 1027 }, { "batch_size": 4, "epoch": 0.4108, "step": 1027, "tokens_per_device": 6604 }, { "epoch": 0.4108, "loss_ce": 0.3115825951099396, "loss_lvr": 1.023318886756897, "loss_mode_switch": 0.0, "loss_total": 0.4139145016670227, "step": 1027 }, { "batch_size": 4, "epoch": 0.4108, "step": 1027, "tokens_per_device": 4308 }, { "epoch": 0.4108, "loss_ce": 0.04546130821108818, "loss_lvr": 0.9561289548873901, "loss_mode_switch": 0.0, "loss_total": 0.14107421040534973, "step": 1027 }, { "batch_size": 4, "epoch": 0.4108, "step": 1027, "tokens_per_device": 3460 }, { "epoch": 0.4108, "loss_ce": 0.24060535430908203, "loss_lvr": 1.1262176036834717, "loss_mode_switch": 0.0, "loss_total": 0.3532271087169647, "step": 1027 }, { "batch_size": 4, "epoch": 0.4108, "step": 1027, "tokens_per_device": 1368 }, { "epoch": 0.4108, "loss_ce": 0.2843872010707855, "loss_lvr": 1.0213226079940796, "loss_mode_switch": 0.0, "loss_total": 0.3865194618701935, "step": 1027 }, { "epoch": 0.4112, "grad_norm": 1.4084768295288086, "learning_rate": 6.6494299477752364e-06, "loss": 0.2986, "step": 1028 }, { "batch_size": 4, "epoch": 0.4112, "step": 1028, "tokens_per_device": 2780 }, { "epoch": 0.4112, "loss_ce": 0.1475791037082672, "loss_lvr": 0.6024978160858154, "loss_mode_switch": 0.0, "loss_total": 0.20782887935638428, "step": 1028 }, { "batch_size": 4, "epoch": 0.4112, "step": 1028, "tokens_per_device": 4860 }, { "epoch": 0.4112, "loss_ce": 0.0362551249563694, "loss_lvr": 0.8520383238792419, "loss_mode_switch": 0.0, "loss_total": 0.12145896255970001, "step": 1028 }, { "batch_size": 4, "epoch": 0.4112, "step": 1028, "tokens_per_device": 7348 }, { "epoch": 0.4112, "loss_ce": 0.2530566155910492, "loss_lvr": 0.9573875069618225, "loss_mode_switch": 0.0, "loss_total": 0.3487953543663025, "step": 1028 }, { "batch_size": 1, "epoch": 0.4112, "step": 1028, "tokens_per_device": 5156 }, { "epoch": 0.4112, "loss_ce": 0.012351559475064278, "loss_lvr": 0.3883327841758728, "loss_mode_switch": 0.0, "loss_total": 0.05118484050035477, "step": 1028 }, { "batch_size": 4, "epoch": 0.4112, "step": 1028, "tokens_per_device": 6120 }, { "epoch": 0.4112, "loss_ce": 0.10259580612182617, "loss_lvr": 0.8915462493896484, "loss_mode_switch": 0.0, "loss_total": 0.1917504370212555, "step": 1028 }, { "batch_size": 1, "epoch": 0.4112, "step": 1028, "tokens_per_device": 5129 }, { "epoch": 0.4112, "loss_ce": 0.00544479675590992, "loss_lvr": 0.3085979223251343, "loss_mode_switch": 0.0, "loss_total": 0.03630458936095238, "step": 1028 }, { "batch_size": 4, "epoch": 0.4112, "step": 1028, "tokens_per_device": 4452 }, { "epoch": 0.4112, "loss_ce": 0.24667099118232727, "loss_lvr": 1.0608766078948975, "loss_mode_switch": 0.0, "loss_total": 0.35275864601135254, "step": 1028 }, { "batch_size": 4, "epoch": 0.4112, "step": 1028, "tokens_per_device": 2576 }, { "epoch": 0.4112, "loss_ce": 0.36376264691352844, "loss_lvr": 1.0085036754608154, "loss_mode_switch": 0.0, "loss_total": 0.46461302042007446, "step": 1028 }, { "epoch": 0.4116, "grad_norm": 1.2305083274841309, "learning_rate": 6.643313660709114e-06, "loss": 0.2705, "step": 1029 }, { "batch_size": 4, "epoch": 0.4116, "step": 1029, "tokens_per_device": 4252 }, { "epoch": 0.4116, "loss_ce": 0.19367797672748566, "loss_lvr": 1.0091906785964966, "loss_mode_switch": 0.0, "loss_total": 0.2945970296859741, "step": 1029 }, { "batch_size": 4, "epoch": 0.4116, "step": 1029, "tokens_per_device": 5736 }, { "epoch": 0.4116, "loss_ce": 0.0614168755710125, "loss_lvr": 0.9016903638839722, "loss_mode_switch": 0.0, "loss_total": 0.1515859067440033, "step": 1029 }, { "batch_size": 1, "epoch": 0.4116, "step": 1029, "tokens_per_device": 6731 }, { "epoch": 0.4116, "loss_ce": 0.1303393542766571, "loss_lvr": 0.38003337383270264, "loss_mode_switch": 0.0, "loss_total": 0.1683426946401596, "step": 1029 }, { "batch_size": 4, "epoch": 0.4116, "step": 1029, "tokens_per_device": 5212 }, { "epoch": 0.4116, "loss_ce": 0.5460560321807861, "loss_lvr": 0.7703472375869751, "loss_mode_switch": 0.0, "loss_total": 0.6230907440185547, "step": 1029 }, { "batch_size": 1, "epoch": 0.4116, "step": 1029, "tokens_per_device": 4901 }, { "epoch": 0.4116, "loss_ce": 0.04305960610508919, "loss_lvr": 0.5720556974411011, "loss_mode_switch": 0.0, "loss_total": 0.10026517510414124, "step": 1029 }, { "batch_size": 4, "epoch": 0.4116, "step": 1029, "tokens_per_device": 10736 }, { "epoch": 0.4116, "loss_ce": 0.4416275918483734, "loss_lvr": 0.9916415214538574, "loss_mode_switch": 0.0, "loss_total": 0.5407917499542236, "step": 1029 }, { "batch_size": 4, "epoch": 0.4116, "step": 1029, "tokens_per_device": 5140 }, { "epoch": 0.4116, "loss_ce": 0.08092807978391647, "loss_lvr": 1.3443971872329712, "loss_mode_switch": 0.0, "loss_total": 0.21536779403686523, "step": 1029 }, { "batch_size": 1, "epoch": 0.4116, "step": 1029, "tokens_per_device": 5025 }, { "epoch": 0.4116, "loss_ce": 0.004579688888043165, "loss_lvr": 0.4723662734031677, "loss_mode_switch": 0.0, "loss_total": 0.051816318184137344, "step": 1029 }, { "epoch": 0.412, "grad_norm": 1.3562195301055908, "learning_rate": 6.637194615627733e-06, "loss": 0.3015, "step": 1030 }, { "batch_size": 4, "epoch": 0.412, "step": 1030, "tokens_per_device": 1868 }, { "epoch": 0.412, "loss_ce": 0.36734676361083984, "loss_lvr": 1.9456419944763184, "loss_mode_switch": 0.0, "loss_total": 0.5619109869003296, "step": 1030 }, { "batch_size": 4, "epoch": 0.412, "step": 1030, "tokens_per_device": 4568 }, { "epoch": 0.412, "loss_ce": 0.27835872769355774, "loss_lvr": 1.0419974327087402, "loss_mode_switch": 0.0, "loss_total": 0.3825584650039673, "step": 1030 }, { "batch_size": 1, "epoch": 0.412, "step": 1030, "tokens_per_device": 5138 }, { "epoch": 0.412, "loss_ce": 0.012720880098640919, "loss_lvr": 0.18293912708759308, "loss_mode_switch": 0.0, "loss_total": 0.03101479262113571, "step": 1030 }, { "batch_size": 4, "epoch": 0.412, "step": 1030, "tokens_per_device": 4296 }, { "epoch": 0.412, "loss_ce": 0.2817871868610382, "loss_lvr": 0.9808482527732849, "loss_mode_switch": 0.0, "loss_total": 0.37987202405929565, "step": 1030 }, { "batch_size": 4, "epoch": 0.412, "step": 1030, "tokens_per_device": 3936 }, { "epoch": 0.412, "loss_ce": 0.22769245505332947, "loss_lvr": 1.3210700750350952, "loss_mode_switch": 0.0, "loss_total": 0.35979944467544556, "step": 1030 }, { "batch_size": 4, "epoch": 0.412, "step": 1030, "tokens_per_device": 1356 }, { "epoch": 0.412, "loss_ce": 0.6417074203491211, "loss_lvr": 1.1035821437835693, "loss_mode_switch": 0.0, "loss_total": 0.7520656585693359, "step": 1030 }, { "batch_size": 4, "epoch": 0.412, "step": 1030, "tokens_per_device": 1260 }, { "epoch": 0.412, "loss_ce": 0.11221332848072052, "loss_lvr": 1.6372344493865967, "loss_mode_switch": 0.0, "loss_total": 0.2759367823600769, "step": 1030 }, { "batch_size": 4, "epoch": 0.412, "step": 1030, "tokens_per_device": 6272 }, { "epoch": 0.412, "loss_ce": 0.14506895840168, "loss_lvr": 0.7237566709518433, "loss_mode_switch": 0.0, "loss_total": 0.21744462847709656, "step": 1030 }, { "epoch": 0.4124, "grad_norm": 1.335555076599121, "learning_rate": 6.631072822800847e-06, "loss": 0.3703, "step": 1031 }, { "batch_size": 4, "epoch": 0.4124, "step": 1031, "tokens_per_device": 6120 }, { "epoch": 0.4124, "loss_ce": 0.1875450760126114, "loss_lvr": 0.7294094562530518, "loss_mode_switch": 0.0, "loss_total": 0.26048600673675537, "step": 1031 }, { "batch_size": 4, "epoch": 0.4124, "step": 1031, "tokens_per_device": 9224 }, { "epoch": 0.4124, "loss_ce": 0.4632437527179718, "loss_lvr": 0.660270094871521, "loss_mode_switch": 0.0, "loss_total": 0.5292707681655884, "step": 1031 }, { "batch_size": 4, "epoch": 0.4124, "step": 1031, "tokens_per_device": 4024 }, { "epoch": 0.4124, "loss_ce": 0.5383264422416687, "loss_lvr": 0.9274815320968628, "loss_mode_switch": 0.0, "loss_total": 0.6310746073722839, "step": 1031 }, { "batch_size": 1, "epoch": 0.4124, "step": 1031, "tokens_per_device": 5093 }, { "epoch": 0.4124, "loss_ce": 0.001181327155791223, "loss_lvr": 0.45437127351760864, "loss_mode_switch": 0.0, "loss_total": 0.04661845415830612, "step": 1031 }, { "batch_size": 4, "epoch": 0.4124, "step": 1031, "tokens_per_device": 5088 }, { "epoch": 0.4124, "loss_ce": 0.4282064735889435, "loss_lvr": 0.8245248198509216, "loss_mode_switch": 0.0, "loss_total": 0.5106589794158936, "step": 1031 }, { "batch_size": 4, "epoch": 0.4124, "step": 1031, "tokens_per_device": 3808 }, { "epoch": 0.4124, "loss_ce": 0.5937658548355103, "loss_lvr": 1.187022089958191, "loss_mode_switch": 0.0, "loss_total": 0.7124680876731873, "step": 1031 }, { "batch_size": 4, "epoch": 0.4124, "step": 1031, "tokens_per_device": 3060 }, { "epoch": 0.4124, "loss_ce": 0.5425843596458435, "loss_lvr": 0.8699842691421509, "loss_mode_switch": 0.0, "loss_total": 0.6295827627182007, "step": 1031 }, { "batch_size": 4, "epoch": 0.4124, "step": 1031, "tokens_per_device": 4724 }, { "epoch": 0.4124, "loss_ce": 0.42197054624557495, "loss_lvr": 0.8206127285957336, "loss_mode_switch": 0.0, "loss_total": 0.5040318369865417, "step": 1031 }, { "epoch": 0.4128, "grad_norm": 1.36354398727417, "learning_rate": 6.624948292502814e-06, "loss": 0.3321, "step": 1032 }, { "batch_size": 4, "epoch": 0.4128, "step": 1032, "tokens_per_device": 6860 }, { "epoch": 0.4128, "loss_ce": 0.20013666152954102, "loss_lvr": 0.7622373700141907, "loss_mode_switch": 0.0, "loss_total": 0.2763603925704956, "step": 1032 }, { "batch_size": 4, "epoch": 0.4128, "step": 1032, "tokens_per_device": 4276 }, { "epoch": 0.4128, "loss_ce": 0.477059006690979, "loss_lvr": 1.1107462644577026, "loss_mode_switch": 0.0, "loss_total": 0.5881336331367493, "step": 1032 }, { "batch_size": 4, "epoch": 0.4128, "step": 1032, "tokens_per_device": 5084 }, { "epoch": 0.4128, "loss_ce": 0.16805057227611542, "loss_lvr": 0.7828689217567444, "loss_mode_switch": 0.0, "loss_total": 0.24633747339248657, "step": 1032 }, { "batch_size": 4, "epoch": 0.4128, "step": 1032, "tokens_per_device": 4304 }, { "epoch": 0.4128, "loss_ce": 0.07439569383859634, "loss_lvr": 0.9505279660224915, "loss_mode_switch": 0.0, "loss_total": 0.16944849491119385, "step": 1032 }, { "batch_size": 1, "epoch": 0.4128, "step": 1032, "tokens_per_device": 4980 }, { "epoch": 0.4128, "loss_ce": 0.007475125603377819, "loss_lvr": 0.35665246844291687, "loss_mode_switch": 0.0, "loss_total": 0.04314037412405014, "step": 1032 }, { "batch_size": 4, "epoch": 0.4128, "step": 1032, "tokens_per_device": 3764 }, { "epoch": 0.4128, "loss_ce": 0.6525459885597229, "loss_lvr": 1.357890009880066, "loss_mode_switch": 0.0, "loss_total": 0.7883349657058716, "step": 1032 }, { "batch_size": 4, "epoch": 0.4128, "step": 1032, "tokens_per_device": 4332 }, { "epoch": 0.4128, "loss_ce": 0.2888195216655731, "loss_lvr": 0.852474570274353, "loss_mode_switch": 0.0, "loss_total": 0.3740669786930084, "step": 1032 }, { "batch_size": 4, "epoch": 0.4128, "step": 1032, "tokens_per_device": 11044 }, { "epoch": 0.4128, "loss_ce": 0.15291361510753632, "loss_lvr": 0.959011971950531, "loss_mode_switch": 0.0, "loss_total": 0.24881482124328613, "step": 1032 }, { "epoch": 0.4132, "grad_norm": 1.201372742652893, "learning_rate": 6.618821035012591e-06, "loss": 0.3104, "step": 1033 }, { "batch_size": 4, "epoch": 0.4132, "step": 1033, "tokens_per_device": 4196 }, { "epoch": 0.4132, "loss_ce": 0.519707202911377, "loss_lvr": 0.7042167782783508, "loss_mode_switch": 0.0, "loss_total": 0.5901288986206055, "step": 1033 }, { "batch_size": 1, "epoch": 0.4132, "step": 1033, "tokens_per_device": 4878 }, { "epoch": 0.4132, "loss_ce": 0.04744577407836914, "loss_lvr": 0.34797918796539307, "loss_mode_switch": 0.0, "loss_total": 0.08224369585514069, "step": 1033 }, { "batch_size": 1, "epoch": 0.4132, "step": 1033, "tokens_per_device": 7061 }, { "epoch": 0.4132, "loss_ce": 0.18207773566246033, "loss_lvr": 0.22213412821292877, "loss_mode_switch": 0.0, "loss_total": 0.20429114997386932, "step": 1033 }, { "batch_size": 4, "epoch": 0.4132, "step": 1033, "tokens_per_device": 4380 }, { "epoch": 0.4132, "loss_ce": 0.058317672461271286, "loss_lvr": 0.7319717407226562, "loss_mode_switch": 0.0, "loss_total": 0.13151484727859497, "step": 1033 }, { "batch_size": 4, "epoch": 0.4132, "step": 1033, "tokens_per_device": 3812 }, { "epoch": 0.4132, "loss_ce": 0.21661590039730072, "loss_lvr": 0.8131436705589294, "loss_mode_switch": 0.0, "loss_total": 0.2979302704334259, "step": 1033 }, { "batch_size": 4, "epoch": 0.4132, "step": 1033, "tokens_per_device": 3764 }, { "epoch": 0.4132, "loss_ce": 0.34527596831321716, "loss_lvr": 0.8268881440162659, "loss_mode_switch": 0.0, "loss_total": 0.4279647767543793, "step": 1033 }, { "batch_size": 4, "epoch": 0.4132, "step": 1033, "tokens_per_device": 3824 }, { "epoch": 0.4132, "loss_ce": 0.6165376305580139, "loss_lvr": 0.8437835574150085, "loss_mode_switch": 0.0, "loss_total": 0.7009159922599792, "step": 1033 }, { "batch_size": 4, "epoch": 0.4132, "step": 1033, "tokens_per_device": 2288 }, { "epoch": 0.4132, "loss_ce": 0.28112271428108215, "loss_lvr": 0.8491088151931763, "loss_mode_switch": 0.0, "loss_total": 0.3660336136817932, "step": 1033 }, { "epoch": 0.4136, "grad_norm": 1.1869375705718994, "learning_rate": 6.61269106061371e-06, "loss": 0.2475, "step": 1034 }, { "batch_size": 1, "epoch": 0.4136, "step": 1034, "tokens_per_device": 4890 }, { "epoch": 0.4136, "loss_ce": 0.1382257342338562, "loss_lvr": 0.25036129355430603, "loss_mode_switch": 0.0, "loss_total": 0.16326186060905457, "step": 1034 }, { "batch_size": 4, "epoch": 0.4136, "step": 1034, "tokens_per_device": 1876 }, { "epoch": 0.4136, "loss_ce": 0.10726306587457657, "loss_lvr": 1.6177657842636108, "loss_mode_switch": 0.0, "loss_total": 0.26903966069221497, "step": 1034 }, { "batch_size": 4, "epoch": 0.4136, "step": 1034, "tokens_per_device": 3868 }, { "epoch": 0.4136, "loss_ce": 0.5852629542350769, "loss_lvr": 1.0113178491592407, "loss_mode_switch": 0.0, "loss_total": 0.6863947510719299, "step": 1034 }, { "batch_size": 4, "epoch": 0.4136, "step": 1034, "tokens_per_device": 3860 }, { "epoch": 0.4136, "loss_ce": 0.44229656457901, "loss_lvr": 0.9996014833450317, "loss_mode_switch": 0.0, "loss_total": 0.5422567129135132, "step": 1034 }, { "batch_size": 4, "epoch": 0.4136, "step": 1034, "tokens_per_device": 6532 }, { "epoch": 0.4136, "loss_ce": 0.17871515452861786, "loss_lvr": 0.6598173975944519, "loss_mode_switch": 0.0, "loss_total": 0.24469688534736633, "step": 1034 }, { "batch_size": 4, "epoch": 0.4136, "step": 1034, "tokens_per_device": 9616 }, { "epoch": 0.4136, "loss_ce": 0.44744959473609924, "loss_lvr": 1.0112355947494507, "loss_mode_switch": 0.0, "loss_total": 0.5485731363296509, "step": 1034 }, { "batch_size": 4, "epoch": 0.4136, "step": 1034, "tokens_per_device": 4368 }, { "epoch": 0.4136, "loss_ce": 0.4635893702507019, "loss_lvr": 0.36107105016708374, "loss_mode_switch": 0.0, "loss_total": 0.4996964633464813, "step": 1034 }, { "batch_size": 4, "epoch": 0.4136, "step": 1034, "tokens_per_device": 1428 }, { "epoch": 0.4136, "loss_ce": 0.6826806664466858, "loss_lvr": 0.9175925850868225, "loss_mode_switch": 0.0, "loss_total": 0.7744399309158325, "step": 1034 }, { "epoch": 0.414, "grad_norm": 1.331743836402893, "learning_rate": 6.6065583795942625e-06, "loss": 0.348, "step": 1035 }, { "batch_size": 1, "epoch": 0.414, "step": 1035, "tokens_per_device": 4512 }, { "epoch": 0.414, "loss_ce": 0.008360576815903187, "loss_lvr": 0.4892921447753906, "loss_mode_switch": 0.0, "loss_total": 0.057289790362119675, "step": 1035 }, { "batch_size": 4, "epoch": 0.414, "step": 1035, "tokens_per_device": 4372 }, { "epoch": 0.414, "loss_ce": 0.2004111260175705, "loss_lvr": 1.016233205795288, "loss_mode_switch": 0.0, "loss_total": 0.3020344376564026, "step": 1035 }, { "batch_size": 4, "epoch": 0.414, "step": 1035, "tokens_per_device": 4636 }, { "epoch": 0.414, "loss_ce": 0.3536832928657532, "loss_lvr": 0.7344745397567749, "loss_mode_switch": 0.0, "loss_total": 0.4271307587623596, "step": 1035 }, { "batch_size": 4, "epoch": 0.414, "step": 1035, "tokens_per_device": 4056 }, { "epoch": 0.414, "loss_ce": 0.0848400741815567, "loss_lvr": 0.957621157169342, "loss_mode_switch": 0.0, "loss_total": 0.18060219287872314, "step": 1035 }, { "batch_size": 4, "epoch": 0.414, "step": 1035, "tokens_per_device": 15184 }, { "epoch": 0.414, "loss_ce": 0.302560031414032, "loss_lvr": 0.8908226490020752, "loss_mode_switch": 0.0, "loss_total": 0.391642302274704, "step": 1035 }, { "batch_size": 1, "epoch": 0.414, "step": 1035, "tokens_per_device": 4965 }, { "epoch": 0.414, "loss_ce": 0.2819236218929291, "loss_lvr": 0.47383275628089905, "loss_mode_switch": 0.0, "loss_total": 0.3293069005012512, "step": 1035 }, { "batch_size": 4, "epoch": 0.414, "step": 1035, "tokens_per_device": 2608 }, { "epoch": 0.414, "loss_ce": 0.7492167353630066, "loss_lvr": 0.8314346671104431, "loss_mode_switch": 0.0, "loss_total": 0.8323602080345154, "step": 1035 }, { "batch_size": 4, "epoch": 0.414, "step": 1035, "tokens_per_device": 5828 }, { "epoch": 0.414, "loss_ce": 0.14782510697841644, "loss_lvr": 0.7200556993484497, "loss_mode_switch": 0.0, "loss_total": 0.2198306769132614, "step": 1035 }, { "epoch": 0.4144, "grad_norm": 1.1695036888122559, "learning_rate": 6.600423002246885e-06, "loss": 0.2774, "step": 1036 }, { "batch_size": 4, "epoch": 0.4144, "step": 1036, "tokens_per_device": 4228 }, { "epoch": 0.4144, "loss_ce": 0.13692571222782135, "loss_lvr": 1.1695263385772705, "loss_mode_switch": 0.0, "loss_total": 0.2538783550262451, "step": 1036 }, { "batch_size": 4, "epoch": 0.4144, "step": 1036, "tokens_per_device": 5660 }, { "epoch": 0.4144, "loss_ce": 0.24605578184127808, "loss_lvr": 0.8848736882209778, "loss_mode_switch": 0.0, "loss_total": 0.3345431685447693, "step": 1036 }, { "batch_size": 4, "epoch": 0.4144, "step": 1036, "tokens_per_device": 4116 }, { "epoch": 0.4144, "loss_ce": 0.4214922785758972, "loss_lvr": 0.3991377353668213, "loss_mode_switch": 0.0, "loss_total": 0.46140605211257935, "step": 1036 }, { "batch_size": 1, "epoch": 0.4144, "step": 1036, "tokens_per_device": 4881 }, { "epoch": 0.4144, "loss_ce": 0.40119263529777527, "loss_lvr": 0.601179838180542, "loss_mode_switch": 0.0, "loss_total": 0.46131062507629395, "step": 1036 }, { "batch_size": 4, "epoch": 0.4144, "step": 1036, "tokens_per_device": 10376 }, { "epoch": 0.4144, "loss_ce": 0.2805372476577759, "loss_lvr": 0.7599603533744812, "loss_mode_switch": 0.0, "loss_total": 0.3565332889556885, "step": 1036 }, { "batch_size": 4, "epoch": 0.4144, "step": 1036, "tokens_per_device": 2716 }, { "epoch": 0.4144, "loss_ce": 0.5226566791534424, "loss_lvr": 0.9676722884178162, "loss_mode_switch": 0.0, "loss_total": 0.6194239258766174, "step": 1036 }, { "batch_size": 1, "epoch": 0.4144, "step": 1036, "tokens_per_device": 5424 }, { "epoch": 0.4144, "loss_ce": 0.042885925620794296, "loss_lvr": 0.5407801270484924, "loss_mode_switch": 0.0, "loss_total": 0.09696394205093384, "step": 1036 }, { "batch_size": 1, "epoch": 0.4144, "step": 1036, "tokens_per_device": 5071 }, { "epoch": 0.4144, "loss_ce": 0.070982426404953, "loss_lvr": 0.2843264937400818, "loss_mode_switch": 0.0, "loss_total": 0.09941507875919342, "step": 1036 }, { "epoch": 0.4148, "grad_norm": 1.405820369720459, "learning_rate": 6.594284938868737e-06, "loss": 0.3358, "step": 1037 }, { "batch_size": 4, "epoch": 0.4148, "step": 1037, "tokens_per_device": 1268 }, { "epoch": 0.4148, "loss_ce": 0.10397985577583313, "loss_lvr": 1.0801618099212646, "loss_mode_switch": 0.0, "loss_total": 0.21199604868888855, "step": 1037 }, { "batch_size": 1, "epoch": 0.4148, "step": 1037, "tokens_per_device": 4901 }, { "epoch": 0.4148, "loss_ce": 0.09918980300426483, "loss_lvr": 0.3639892637729645, "loss_mode_switch": 0.0, "loss_total": 0.13558873534202576, "step": 1037 }, { "batch_size": 4, "epoch": 0.4148, "step": 1037, "tokens_per_device": 2700 }, { "epoch": 0.4148, "loss_ce": 0.30892038345336914, "loss_lvr": 0.6951438784599304, "loss_mode_switch": 0.0, "loss_total": 0.37843477725982666, "step": 1037 }, { "batch_size": 4, "epoch": 0.4148, "step": 1037, "tokens_per_device": 1480 }, { "epoch": 0.4148, "loss_ce": 0.47169190645217896, "loss_lvr": 0.8900877833366394, "loss_mode_switch": 0.0, "loss_total": 0.5607006549835205, "step": 1037 }, { "batch_size": 4, "epoch": 0.4148, "step": 1037, "tokens_per_device": 2868 }, { "epoch": 0.4148, "loss_ce": 0.09102591872215271, "loss_lvr": 0.5133237242698669, "loss_mode_switch": 0.0, "loss_total": 0.14235828816890717, "step": 1037 }, { "batch_size": 4, "epoch": 0.4148, "step": 1037, "tokens_per_device": 4316 }, { "epoch": 0.4148, "loss_ce": 0.27164989709854126, "loss_lvr": 0.9548747539520264, "loss_mode_switch": 0.0, "loss_total": 0.3671373724937439, "step": 1037 }, { "batch_size": 4, "epoch": 0.4148, "step": 1037, "tokens_per_device": 12792 }, { "epoch": 0.4148, "loss_ce": 0.10939347743988037, "loss_lvr": 0.6086592078208923, "loss_mode_switch": 0.0, "loss_total": 0.17025940120220184, "step": 1037 }, { "batch_size": 1, "epoch": 0.4148, "step": 1037, "tokens_per_device": 4914 }, { "epoch": 0.4148, "loss_ce": 0.034375619143247604, "loss_lvr": 1.3026295900344849, "loss_mode_switch": 0.0, "loss_total": 0.16463857889175415, "step": 1037 }, { "epoch": 0.4152, "grad_norm": 1.2904229164123535, "learning_rate": 6.588144199761487e-06, "loss": 0.287, "step": 1038 }, { "batch_size": 4, "epoch": 0.4152, "step": 1038, "tokens_per_device": 5172 }, { "epoch": 0.4152, "loss_ce": 0.011895298957824707, "loss_lvr": 0.9846178889274597, "loss_mode_switch": 0.0, "loss_total": 0.11035709083080292, "step": 1038 }, { "batch_size": 4, "epoch": 0.4152, "step": 1038, "tokens_per_device": 4856 }, { "epoch": 0.4152, "loss_ce": 0.1330147236585617, "loss_lvr": 0.9256874918937683, "loss_mode_switch": 0.0, "loss_total": 0.22558346390724182, "step": 1038 }, { "batch_size": 1, "epoch": 0.4152, "step": 1038, "tokens_per_device": 4899 }, { "epoch": 0.4152, "loss_ce": 0.13468796014785767, "loss_lvr": 0.44990143179893494, "loss_mode_switch": 0.0, "loss_total": 0.17967811226844788, "step": 1038 }, { "batch_size": 4, "epoch": 0.4152, "step": 1038, "tokens_per_device": 2528 }, { "epoch": 0.4152, "loss_ce": 0.26910898089408875, "loss_lvr": 1.0223302841186523, "loss_mode_switch": 0.0, "loss_total": 0.3713420033454895, "step": 1038 }, { "batch_size": 4, "epoch": 0.4152, "step": 1038, "tokens_per_device": 7316 }, { "epoch": 0.4152, "loss_ce": 0.33953240513801575, "loss_lvr": 0.7049918174743652, "loss_mode_switch": 0.0, "loss_total": 0.41003158688545227, "step": 1038 }, { "batch_size": 4, "epoch": 0.4152, "step": 1038, "tokens_per_device": 1736 }, { "epoch": 0.4152, "loss_ce": 0.3081493079662323, "loss_lvr": 1.054200530052185, "loss_mode_switch": 0.0, "loss_total": 0.4135693609714508, "step": 1038 }, { "batch_size": 4, "epoch": 0.4152, "step": 1038, "tokens_per_device": 5424 }, { "epoch": 0.4152, "loss_ce": 0.3367079496383667, "loss_lvr": 1.006973385810852, "loss_mode_switch": 0.0, "loss_total": 0.4374052882194519, "step": 1038 }, { "batch_size": 4, "epoch": 0.4152, "step": 1038, "tokens_per_device": 4784 }, { "epoch": 0.4152, "loss_ce": 0.10048139095306396, "loss_lvr": 0.7493072152137756, "loss_mode_switch": 0.0, "loss_total": 0.175412118434906, "step": 1038 }, { "epoch": 0.4156, "grad_norm": 1.4043577909469604, "learning_rate": 6.582000795231296e-06, "loss": 0.344, "step": 1039 }, { "batch_size": 4, "epoch": 0.4156, "step": 1039, "tokens_per_device": 5232 }, { "epoch": 0.4156, "loss_ce": 0.16219443082809448, "loss_lvr": 0.8407160639762878, "loss_mode_switch": 0.0, "loss_total": 0.24626603722572327, "step": 1039 }, { "batch_size": 4, "epoch": 0.4156, "step": 1039, "tokens_per_device": 3772 }, { "epoch": 0.4156, "loss_ce": 0.5751797556877136, "loss_lvr": 1.0955531597137451, "loss_mode_switch": 0.0, "loss_total": 0.6847350597381592, "step": 1039 }, { "batch_size": 4, "epoch": 0.4156, "step": 1039, "tokens_per_device": 5308 }, { "epoch": 0.4156, "loss_ce": 0.14015018939971924, "loss_lvr": 0.7277924418449402, "loss_mode_switch": 0.0, "loss_total": 0.21292942762374878, "step": 1039 }, { "batch_size": 4, "epoch": 0.4156, "step": 1039, "tokens_per_device": 4372 }, { "epoch": 0.4156, "loss_ce": 0.011351470835506916, "loss_lvr": 0.9085946679115295, "loss_mode_switch": 0.0, "loss_total": 0.10221093893051147, "step": 1039 }, { "batch_size": 1, "epoch": 0.4156, "step": 1039, "tokens_per_device": 7584 }, { "epoch": 0.4156, "loss_ce": 0.001380270579829812, "loss_lvr": 0.3656429648399353, "loss_mode_switch": 0.0, "loss_total": 0.03794457018375397, "step": 1039 }, { "batch_size": 4, "epoch": 0.4156, "step": 1039, "tokens_per_device": 4740 }, { "epoch": 0.4156, "loss_ce": 0.18843188881874084, "loss_lvr": 0.8295491933822632, "loss_mode_switch": 0.0, "loss_total": 0.2713868021965027, "step": 1039 }, { "batch_size": 4, "epoch": 0.4156, "step": 1039, "tokens_per_device": 3944 }, { "epoch": 0.4156, "loss_ce": 0.364092081785202, "loss_lvr": 0.9585058689117432, "loss_mode_switch": 0.0, "loss_total": 0.45994266867637634, "step": 1039 }, { "batch_size": 4, "epoch": 0.4156, "step": 1039, "tokens_per_device": 1320 }, { "epoch": 0.4156, "loss_ce": 0.4305531084537506, "loss_lvr": 1.0248593091964722, "loss_mode_switch": 0.0, "loss_total": 0.5330390334129333, "step": 1039 }, { "epoch": 0.416, "grad_norm": 1.576751708984375, "learning_rate": 6.5758547355887944e-06, "loss": 0.3337, "step": 1040 }, { "batch_size": 4, "epoch": 0.416, "step": 1040, "tokens_per_device": 4632 }, { "epoch": 0.416, "loss_ce": 0.005674212705343962, "loss_lvr": 0.6039122343063354, "loss_mode_switch": 0.0, "loss_total": 0.06606543809175491, "step": 1040 }, { "batch_size": 4, "epoch": 0.416, "step": 1040, "tokens_per_device": 10512 }, { "epoch": 0.416, "loss_ce": 0.253243088722229, "loss_lvr": 0.8422045111656189, "loss_mode_switch": 0.0, "loss_total": 0.3374635577201843, "step": 1040 }, { "batch_size": 4, "epoch": 0.416, "step": 1040, "tokens_per_device": 4596 }, { "epoch": 0.416, "loss_ce": 0.0802333801984787, "loss_lvr": 0.5296146273612976, "loss_mode_switch": 0.0, "loss_total": 0.13319484889507294, "step": 1040 }, { "batch_size": 4, "epoch": 0.416, "step": 1040, "tokens_per_device": 12288 }, { "epoch": 0.416, "loss_ce": 0.22486238181591034, "loss_lvr": 0.8709125518798828, "loss_mode_switch": 0.0, "loss_total": 0.3119536340236664, "step": 1040 }, { "batch_size": 4, "epoch": 0.416, "step": 1040, "tokens_per_device": 3808 }, { "epoch": 0.416, "loss_ce": 0.41769498586654663, "loss_lvr": 1.0767141580581665, "loss_mode_switch": 0.0, "loss_total": 0.5253664255142212, "step": 1040 }, { "batch_size": 1, "epoch": 0.416, "step": 1040, "tokens_per_device": 5101 }, { "epoch": 0.416, "loss_ce": 0.24710284173488617, "loss_lvr": 0.6084278225898743, "loss_mode_switch": 0.0, "loss_total": 0.3079456090927124, "step": 1040 }, { "batch_size": 4, "epoch": 0.416, "step": 1040, "tokens_per_device": 1232 }, { "epoch": 0.416, "loss_ce": 0.12231164425611496, "loss_lvr": 1.2865303754806519, "loss_mode_switch": 0.0, "loss_total": 0.2509646713733673, "step": 1040 }, { "batch_size": 4, "epoch": 0.416, "step": 1040, "tokens_per_device": 4232 }, { "epoch": 0.416, "loss_ce": 0.20206566154956818, "loss_lvr": 0.7759588956832886, "loss_mode_switch": 0.0, "loss_total": 0.27966153621673584, "step": 1040 }, { "epoch": 0.4164, "grad_norm": 1.2822859287261963, "learning_rate": 6.5697060311490705e-06, "loss": 0.2989, "step": 1041 }, { "batch_size": 1, "epoch": 0.4164, "step": 1041, "tokens_per_device": 5186 }, { "epoch": 0.4164, "loss_ce": 0.0444483645260334, "loss_lvr": 0.5137717127799988, "loss_mode_switch": 0.0, "loss_total": 0.09582553803920746, "step": 1041 }, { "batch_size": 4, "epoch": 0.4164, "step": 1041, "tokens_per_device": 3800 }, { "epoch": 0.4164, "loss_ce": 0.28825655579566956, "loss_lvr": 0.8707247972488403, "loss_mode_switch": 0.0, "loss_total": 0.37532904744148254, "step": 1041 }, { "batch_size": 4, "epoch": 0.4164, "step": 1041, "tokens_per_device": 4820 }, { "epoch": 0.4164, "loss_ce": 0.3865809440612793, "loss_lvr": 0.8444269895553589, "loss_mode_switch": 0.0, "loss_total": 0.47102364897727966, "step": 1041 }, { "batch_size": 4, "epoch": 0.4164, "step": 1041, "tokens_per_device": 7596 }, { "epoch": 0.4164, "loss_ce": 0.2621733546257019, "loss_lvr": 1.079831600189209, "loss_mode_switch": 0.0, "loss_total": 0.37015652656555176, "step": 1041 }, { "batch_size": 4, "epoch": 0.4164, "step": 1041, "tokens_per_device": 4644 }, { "epoch": 0.4164, "loss_ce": 0.34835970401763916, "loss_lvr": 0.6782910227775574, "loss_mode_switch": 0.0, "loss_total": 0.4161888062953949, "step": 1041 }, { "batch_size": 1, "epoch": 0.4164, "step": 1041, "tokens_per_device": 4960 }, { "epoch": 0.4164, "loss_ce": 0.012472039088606834, "loss_lvr": 0.38503527641296387, "loss_mode_switch": 0.0, "loss_total": 0.05097556859254837, "step": 1041 }, { "batch_size": 1, "epoch": 0.4164, "step": 1041, "tokens_per_device": 7450 }, { "epoch": 0.4164, "loss_ce": 0.0009028548956848681, "loss_lvr": 0.31081366539001465, "loss_mode_switch": 0.0, "loss_total": 0.03198422119021416, "step": 1041 }, { "batch_size": 4, "epoch": 0.4164, "step": 1041, "tokens_per_device": 3820 }, { "epoch": 0.4164, "loss_ce": 0.10139483958482742, "loss_lvr": 1.2848066091537476, "loss_mode_switch": 0.0, "loss_total": 0.22987550497055054, "step": 1041 }, { "epoch": 0.4168, "grad_norm": 1.316951870918274, "learning_rate": 6.563554692231655e-06, "loss": 0.2709, "step": 1042 }, { "batch_size": 4, "epoch": 0.4168, "step": 1042, "tokens_per_device": 4244 }, { "epoch": 0.4168, "loss_ce": 0.16046303510665894, "loss_lvr": 1.110023856163025, "loss_mode_switch": 0.0, "loss_total": 0.2714654207229614, "step": 1042 }, { "batch_size": 4, "epoch": 0.4168, "step": 1042, "tokens_per_device": 4960 }, { "epoch": 0.4168, "loss_ce": 0.3140200674533844, "loss_lvr": 0.7928191423416138, "loss_mode_switch": 0.0, "loss_total": 0.39330199360847473, "step": 1042 }, { "batch_size": 1, "epoch": 0.4168, "step": 1042, "tokens_per_device": 5007 }, { "epoch": 0.4168, "loss_ce": 0.004400156904011965, "loss_lvr": 0.39425909519195557, "loss_mode_switch": 0.0, "loss_total": 0.043826065957546234, "step": 1042 }, { "batch_size": 4, "epoch": 0.4168, "step": 1042, "tokens_per_device": 2588 }, { "epoch": 0.4168, "loss_ce": 0.3173396587371826, "loss_lvr": 1.0794793367385864, "loss_mode_switch": 0.0, "loss_total": 0.4252876043319702, "step": 1042 }, { "batch_size": 4, "epoch": 0.4168, "step": 1042, "tokens_per_device": 1708 }, { "epoch": 0.4168, "loss_ce": 0.2586505115032196, "loss_lvr": 0.9026010036468506, "loss_mode_switch": 0.0, "loss_total": 0.3489106297492981, "step": 1042 }, { "batch_size": 4, "epoch": 0.4168, "step": 1042, "tokens_per_device": 8976 }, { "epoch": 0.4168, "loss_ce": 0.22242337465286255, "loss_lvr": 0.45204541087150574, "loss_mode_switch": 0.0, "loss_total": 0.26762792468070984, "step": 1042 }, { "batch_size": 4, "epoch": 0.4168, "step": 1042, "tokens_per_device": 12840 }, { "epoch": 0.4168, "loss_ce": 0.23455014824867249, "loss_lvr": 1.094137191772461, "loss_mode_switch": 0.0, "loss_total": 0.3439638614654541, "step": 1042 }, { "batch_size": 4, "epoch": 0.4168, "step": 1042, "tokens_per_device": 4400 }, { "epoch": 0.4168, "loss_ce": 0.662014365196228, "loss_lvr": 0.8661085367202759, "loss_mode_switch": 0.0, "loss_total": 0.7486252188682556, "step": 1042 }, { "epoch": 0.4172, "grad_norm": 1.27744460105896, "learning_rate": 6.557400729160494e-06, "loss": 0.3074, "step": 1043 }, { "batch_size": 1, "epoch": 0.4172, "step": 1043, "tokens_per_device": 4501 }, { "epoch": 0.4172, "loss_ce": 0.12258598208427429, "loss_lvr": 0.7485732436180115, "loss_mode_switch": 0.0, "loss_total": 0.19744330644607544, "step": 1043 }, { "batch_size": 4, "epoch": 0.4172, "step": 1043, "tokens_per_device": 3768 }, { "epoch": 0.4172, "loss_ce": 0.09190404415130615, "loss_lvr": 1.2161976099014282, "loss_mode_switch": 0.0, "loss_total": 0.21352380514144897, "step": 1043 }, { "batch_size": 4, "epoch": 0.4172, "step": 1043, "tokens_per_device": 4288 }, { "epoch": 0.4172, "loss_ce": 0.3229515254497528, "loss_lvr": 0.8357369303703308, "loss_mode_switch": 0.0, "loss_total": 0.40652522444725037, "step": 1043 }, { "batch_size": 4, "epoch": 0.4172, "step": 1043, "tokens_per_device": 5748 }, { "epoch": 0.4172, "loss_ce": 0.3613831102848053, "loss_lvr": 1.0503324270248413, "loss_mode_switch": 0.0, "loss_total": 0.4664163589477539, "step": 1043 }, { "batch_size": 4, "epoch": 0.4172, "step": 1043, "tokens_per_device": 3796 }, { "epoch": 0.4172, "loss_ce": 0.0061193848960101604, "loss_lvr": 0.7522647380828857, "loss_mode_switch": 0.0, "loss_total": 0.08134586364030838, "step": 1043 }, { "batch_size": 4, "epoch": 0.4172, "step": 1043, "tokens_per_device": 4180 }, { "epoch": 0.4172, "loss_ce": 0.22685964405536652, "loss_lvr": 1.8364273309707642, "loss_mode_switch": 0.0, "loss_total": 0.4105023741722107, "step": 1043 }, { "batch_size": 4, "epoch": 0.4172, "step": 1043, "tokens_per_device": 3712 }, { "epoch": 0.4172, "loss_ce": 0.6802648305892944, "loss_lvr": 0.8565961122512817, "loss_mode_switch": 0.0, "loss_total": 0.7659244537353516, "step": 1043 }, { "batch_size": 4, "epoch": 0.4172, "step": 1043, "tokens_per_device": 3708 }, { "epoch": 0.4172, "loss_ce": 0.1450105905532837, "loss_lvr": 0.9450868368148804, "loss_mode_switch": 0.0, "loss_total": 0.23951926827430725, "step": 1043 }, { "epoch": 0.4176, "grad_norm": 1.3241286277770996, "learning_rate": 6.5512441522639415e-06, "loss": 0.321, "step": 1044 }, { "batch_size": 4, "epoch": 0.4176, "step": 1044, "tokens_per_device": 3836 }, { "epoch": 0.4176, "loss_ce": 0.7068474888801575, "loss_lvr": 1.0268386602401733, "loss_mode_switch": 0.0, "loss_total": 0.8095313310623169, "step": 1044 }, { "batch_size": 1, "epoch": 0.4176, "step": 1044, "tokens_per_device": 5102 }, { "epoch": 0.4176, "loss_ce": 0.003747443901374936, "loss_lvr": 0.26677247881889343, "loss_mode_switch": 0.0, "loss_total": 0.0304246935993433, "step": 1044 }, { "batch_size": 1, "epoch": 0.4176, "step": 1044, "tokens_per_device": 5120 }, { "epoch": 0.4176, "loss_ce": 0.11803493648767471, "loss_lvr": 0.4507051408290863, "loss_mode_switch": 0.0, "loss_total": 0.16310545802116394, "step": 1044 }, { "batch_size": 4, "epoch": 0.4176, "step": 1044, "tokens_per_device": 3316 }, { "epoch": 0.4176, "loss_ce": 0.29481762647628784, "loss_lvr": 1.3228117227554321, "loss_mode_switch": 0.0, "loss_total": 0.42709881067276, "step": 1044 }, { "batch_size": 4, "epoch": 0.4176, "step": 1044, "tokens_per_device": 4592 }, { "epoch": 0.4176, "loss_ce": 0.2910265028476715, "loss_lvr": 0.6025694608688354, "loss_mode_switch": 0.0, "loss_total": 0.351283460855484, "step": 1044 }, { "batch_size": 4, "epoch": 0.4176, "step": 1044, "tokens_per_device": 4604 }, { "epoch": 0.4176, "loss_ce": 0.06561826169490814, "loss_lvr": 0.8084181547164917, "loss_mode_switch": 0.0, "loss_total": 0.14646008610725403, "step": 1044 }, { "batch_size": 4, "epoch": 0.4176, "step": 1044, "tokens_per_device": 3744 }, { "epoch": 0.4176, "loss_ce": 0.18696115911006927, "loss_lvr": 1.1354480981826782, "loss_mode_switch": 0.0, "loss_total": 0.30050596594810486, "step": 1044 }, { "batch_size": 1, "epoch": 0.4176, "step": 1044, "tokens_per_device": 5373 }, { "epoch": 0.4176, "loss_ce": 0.01963876560330391, "loss_lvr": 0.527346670627594, "loss_mode_switch": 0.0, "loss_total": 0.07237343490123749, "step": 1044 }, { "epoch": 0.418, "grad_norm": 1.343848705291748, "learning_rate": 6.545084971874738e-06, "loss": 0.2697, "step": 1045 }, { "batch_size": 4, "epoch": 0.418, "step": 1045, "tokens_per_device": 2520 }, { "epoch": 0.418, "loss_ce": 0.3091426491737366, "loss_lvr": 1.0692988634109497, "loss_mode_switch": 0.0, "loss_total": 0.4160725474357605, "step": 1045 }, { "batch_size": 4, "epoch": 0.418, "step": 1045, "tokens_per_device": 1460 }, { "epoch": 0.418, "loss_ce": 0.1426353007555008, "loss_lvr": 1.2792081832885742, "loss_mode_switch": 0.0, "loss_total": 0.27055612206459045, "step": 1045 }, { "batch_size": 1, "epoch": 0.418, "step": 1045, "tokens_per_device": 4212 }, { "epoch": 0.418, "loss_ce": 0.15995308756828308, "loss_lvr": 0.5348920822143555, "loss_mode_switch": 0.0, "loss_total": 0.21344229578971863, "step": 1045 }, { "batch_size": 4, "epoch": 0.418, "step": 1045, "tokens_per_device": 6276 }, { "epoch": 0.418, "loss_ce": 0.04017523676156998, "loss_lvr": 0.6873432993888855, "loss_mode_switch": 0.0, "loss_total": 0.10890956968069077, "step": 1045 }, { "batch_size": 1, "epoch": 0.418, "step": 1045, "tokens_per_device": 4558 }, { "epoch": 0.418, "loss_ce": 0.10434314608573914, "loss_lvr": 0.273738831281662, "loss_mode_switch": 0.0, "loss_total": 0.1317170262336731, "step": 1045 }, { "batch_size": 4, "epoch": 0.418, "step": 1045, "tokens_per_device": 5684 }, { "epoch": 0.418, "loss_ce": 0.14917507767677307, "loss_lvr": 0.8778139352798462, "loss_mode_switch": 0.0, "loss_total": 0.23695647716522217, "step": 1045 }, { "batch_size": 4, "epoch": 0.418, "step": 1045, "tokens_per_device": 5392 }, { "epoch": 0.418, "loss_ce": 0.22235402464866638, "loss_lvr": 0.7808271050453186, "loss_mode_switch": 0.0, "loss_total": 0.30043673515319824, "step": 1045 }, { "batch_size": 1, "epoch": 0.418, "step": 1045, "tokens_per_device": 4879 }, { "epoch": 0.418, "loss_ce": 0.0004797783913090825, "loss_lvr": 0.26470091938972473, "loss_mode_switch": 0.0, "loss_total": 0.026949871331453323, "step": 1045 }, { "epoch": 0.4184, "grad_norm": 1.2927777767181396, "learning_rate": 6.538923198329993e-06, "loss": 0.2984, "step": 1046 }, { "batch_size": 4, "epoch": 0.4184, "step": 1046, "tokens_per_device": 4376 }, { "epoch": 0.4184, "loss_ce": 0.042464759200811386, "loss_lvr": 0.775276780128479, "loss_mode_switch": 0.0, "loss_total": 0.11999243497848511, "step": 1046 }, { "batch_size": 4, "epoch": 0.4184, "step": 1046, "tokens_per_device": 1456 }, { "epoch": 0.4184, "loss_ce": 0.5400470495223999, "loss_lvr": 0.9659608602523804, "loss_mode_switch": 0.0, "loss_total": 0.63664311170578, "step": 1046 }, { "batch_size": 4, "epoch": 0.4184, "step": 1046, "tokens_per_device": 4060 }, { "epoch": 0.4184, "loss_ce": 0.04418672248721123, "loss_lvr": 0.831685483455658, "loss_mode_switch": 0.0, "loss_total": 0.12735527753829956, "step": 1046 }, { "batch_size": 4, "epoch": 0.4184, "step": 1046, "tokens_per_device": 1488 }, { "epoch": 0.4184, "loss_ce": 0.32135093212127686, "loss_lvr": 1.19196355342865, "loss_mode_switch": 0.0, "loss_total": 0.44054728746414185, "step": 1046 }, { "batch_size": 4, "epoch": 0.4184, "step": 1046, "tokens_per_device": 4200 }, { "epoch": 0.4184, "loss_ce": 0.39461350440979004, "loss_lvr": 1.0703799724578857, "loss_mode_switch": 0.0, "loss_total": 0.5016515254974365, "step": 1046 }, { "batch_size": 4, "epoch": 0.4184, "step": 1046, "tokens_per_device": 1516 }, { "epoch": 0.4184, "loss_ce": 0.16165010631084442, "loss_lvr": 1.0092971324920654, "loss_mode_switch": 0.0, "loss_total": 0.2625798285007477, "step": 1046 }, { "batch_size": 4, "epoch": 0.4184, "step": 1046, "tokens_per_device": 5616 }, { "epoch": 0.4184, "loss_ce": 0.3634166717529297, "loss_lvr": 0.7687883973121643, "loss_mode_switch": 0.0, "loss_total": 0.4402955174446106, "step": 1046 }, { "batch_size": 1, "epoch": 0.4184, "step": 1046, "tokens_per_device": 4887 }, { "epoch": 0.4184, "loss_ce": 0.0006578733446076512, "loss_lvr": 0.8007374405860901, "loss_mode_switch": 0.0, "loss_total": 0.08073161542415619, "step": 1046 }, { "epoch": 0.4188, "grad_norm": 1.358562707901001, "learning_rate": 6.5327588419711695e-06, "loss": 0.308, "step": 1047 }, { "batch_size": 4, "epoch": 0.4188, "step": 1047, "tokens_per_device": 2636 }, { "epoch": 0.4188, "loss_ce": 0.3521316647529602, "loss_lvr": 0.9993411302566528, "loss_mode_switch": 0.0, "loss_total": 0.45206576585769653, "step": 1047 }, { "batch_size": 4, "epoch": 0.4188, "step": 1047, "tokens_per_device": 4808 }, { "epoch": 0.4188, "loss_ce": 0.14488466084003448, "loss_lvr": 0.8647754788398743, "loss_mode_switch": 0.0, "loss_total": 0.2313622087240219, "step": 1047 }, { "batch_size": 4, "epoch": 0.4188, "step": 1047, "tokens_per_device": 5508 }, { "epoch": 0.4188, "loss_ce": 0.100569948554039, "loss_lvr": 0.8009214401245117, "loss_mode_switch": 0.0, "loss_total": 0.1806620955467224, "step": 1047 }, { "batch_size": 1, "epoch": 0.4188, "step": 1047, "tokens_per_device": 5817 }, { "epoch": 0.4188, "loss_ce": 0.031163783743977547, "loss_lvr": 0.30059680342674255, "loss_mode_switch": 0.0, "loss_total": 0.06122346222400665, "step": 1047 }, { "batch_size": 4, "epoch": 0.4188, "step": 1047, "tokens_per_device": 4376 }, { "epoch": 0.4188, "loss_ce": 0.299187570810318, "loss_lvr": 1.1840951442718506, "loss_mode_switch": 0.0, "loss_total": 0.41759708523750305, "step": 1047 }, { "batch_size": 4, "epoch": 0.4188, "step": 1047, "tokens_per_device": 4644 }, { "epoch": 0.4188, "loss_ce": 0.562915563583374, "loss_lvr": 0.9978645443916321, "loss_mode_switch": 0.0, "loss_total": 0.6627020239830017, "step": 1047 }, { "batch_size": 4, "epoch": 0.4188, "step": 1047, "tokens_per_device": 4216 }, { "epoch": 0.4188, "loss_ce": 0.3056178092956543, "loss_lvr": 1.060197114944458, "loss_mode_switch": 0.0, "loss_total": 0.4116375148296356, "step": 1047 }, { "batch_size": 4, "epoch": 0.4188, "step": 1047, "tokens_per_device": 1520 }, { "epoch": 0.4188, "loss_ce": 0.8544471859931946, "loss_lvr": 0.9961757659912109, "loss_mode_switch": 0.0, "loss_total": 0.9540647864341736, "step": 1047 }, { "epoch": 0.4192, "grad_norm": 1.2037807703018188, "learning_rate": 6.526591913144062e-06, "loss": 0.2717, "step": 1048 }, { "batch_size": 4, "epoch": 0.4192, "step": 1048, "tokens_per_device": 4064 }, { "epoch": 0.4192, "loss_ce": 0.323536217212677, "loss_lvr": 0.8306044340133667, "loss_mode_switch": 0.0, "loss_total": 0.40659666061401367, "step": 1048 }, { "batch_size": 4, "epoch": 0.4192, "step": 1048, "tokens_per_device": 8884 }, { "epoch": 0.4192, "loss_ce": 0.48543328046798706, "loss_lvr": 0.9424766898155212, "loss_mode_switch": 0.0, "loss_total": 0.5796809196472168, "step": 1048 }, { "batch_size": 4, "epoch": 0.4192, "step": 1048, "tokens_per_device": 4968 }, { "epoch": 0.4192, "loss_ce": 0.0782393366098404, "loss_lvr": 1.2983397245407104, "loss_mode_switch": 0.0, "loss_total": 0.20807331800460815, "step": 1048 }, { "batch_size": 4, "epoch": 0.4192, "step": 1048, "tokens_per_device": 1484 }, { "epoch": 0.4192, "loss_ce": 0.24220412969589233, "loss_lvr": 1.0889830589294434, "loss_mode_switch": 0.0, "loss_total": 0.35110244154930115, "step": 1048 }, { "batch_size": 4, "epoch": 0.4192, "step": 1048, "tokens_per_device": 4304 }, { "epoch": 0.4192, "loss_ce": 0.39389219880104065, "loss_lvr": 0.7392041087150574, "loss_mode_switch": 0.0, "loss_total": 0.46781259775161743, "step": 1048 }, { "batch_size": 4, "epoch": 0.4192, "step": 1048, "tokens_per_device": 4216 }, { "epoch": 0.4192, "loss_ce": 0.003060939023271203, "loss_lvr": 0.9031952023506165, "loss_mode_switch": 0.0, "loss_total": 0.09338045865297318, "step": 1048 }, { "batch_size": 4, "epoch": 0.4192, "step": 1048, "tokens_per_device": 8316 }, { "epoch": 0.4192, "loss_ce": 0.31886789202690125, "loss_lvr": 0.672319769859314, "loss_mode_switch": 0.0, "loss_total": 0.3860998749732971, "step": 1048 }, { "batch_size": 1, "epoch": 0.4192, "step": 1048, "tokens_per_device": 4887 }, { "epoch": 0.4192, "loss_ce": 0.3979639410972595, "loss_lvr": 0.5528722405433655, "loss_mode_switch": 0.0, "loss_total": 0.4532511532306671, "step": 1048 }, { "epoch": 0.4196, "grad_norm": 1.2150810956954956, "learning_rate": 6.5204224221987864e-06, "loss": 0.2648, "step": 1049 }, { "batch_size": 1, "epoch": 0.4196, "step": 1049, "tokens_per_device": 5037 }, { "epoch": 0.4196, "loss_ce": 0.0005937939858995378, "loss_lvr": 0.29636120796203613, "loss_mode_switch": 0.0, "loss_total": 0.03022991493344307, "step": 1049 }, { "batch_size": 4, "epoch": 0.4196, "step": 1049, "tokens_per_device": 4028 }, { "epoch": 0.4196, "loss_ce": 0.21519486606121063, "loss_lvr": 1.2652499675750732, "loss_mode_switch": 0.0, "loss_total": 0.3417198657989502, "step": 1049 }, { "batch_size": 4, "epoch": 0.4196, "step": 1049, "tokens_per_device": 5612 }, { "epoch": 0.4196, "loss_ce": 0.03023255616426468, "loss_lvr": 1.0381263494491577, "loss_mode_switch": 0.0, "loss_total": 0.13404518365859985, "step": 1049 }, { "batch_size": 1, "epoch": 0.4196, "step": 1049, "tokens_per_device": 5139 }, { "epoch": 0.4196, "loss_ce": 0.0071475752629339695, "loss_lvr": 0.5528736114501953, "loss_mode_switch": 0.0, "loss_total": 0.06243493780493736, "step": 1049 }, { "batch_size": 1, "epoch": 0.4196, "step": 1049, "tokens_per_device": 5164 }, { "epoch": 0.4196, "loss_ce": 0.0007009651744738221, "loss_lvr": 0.5509194731712341, "loss_mode_switch": 0.0, "loss_total": 0.0557929128408432, "step": 1049 }, { "batch_size": 4, "epoch": 0.4196, "step": 1049, "tokens_per_device": 6528 }, { "epoch": 0.4196, "loss_ce": 0.13438987731933594, "loss_lvr": 0.9405965209007263, "loss_mode_switch": 0.0, "loss_total": 0.2284495234489441, "step": 1049 }, { "batch_size": 1, "epoch": 0.4196, "step": 1049, "tokens_per_device": 8147 }, { "epoch": 0.4196, "loss_ce": 0.013200366869568825, "loss_lvr": 0.45879843831062317, "loss_mode_switch": 0.0, "loss_total": 0.05908021330833435, "step": 1049 }, { "batch_size": 4, "epoch": 0.4196, "step": 1049, "tokens_per_device": 3868 }, { "epoch": 0.4196, "loss_ce": 0.3035251796245575, "loss_lvr": 1.993769645690918, "loss_mode_switch": 0.0, "loss_total": 0.5029021501541138, "step": 1049 }, { "epoch": 0.42, "grad_norm": 1.2008291482925415, "learning_rate": 6.514250379489754e-06, "loss": 0.2233, "step": 1050 }, { "batch_size": 1, "epoch": 0.42, "step": 1050, "tokens_per_device": 5024 }, { "epoch": 0.42, "loss_ce": 0.03841790184378624, "loss_lvr": 0.6080677509307861, "loss_mode_switch": 0.0, "loss_total": 0.09922467917203903, "step": 1050 }, { "batch_size": 4, "epoch": 0.42, "step": 1050, "tokens_per_device": 1512 }, { "epoch": 0.42, "loss_ce": 0.21621276438236237, "loss_lvr": 0.9420198202133179, "loss_mode_switch": 0.0, "loss_total": 0.31041473150253296, "step": 1050 }, { "batch_size": 4, "epoch": 0.42, "step": 1050, "tokens_per_device": 4196 }, { "epoch": 0.42, "loss_ce": 0.03537202626466751, "loss_lvr": 1.2059526443481445, "loss_mode_switch": 0.0, "loss_total": 0.15596729516983032, "step": 1050 }, { "batch_size": 4, "epoch": 0.42, "step": 1050, "tokens_per_device": 2220 }, { "epoch": 0.42, "loss_ce": 0.5356878638267517, "loss_lvr": 0.9896592497825623, "loss_mode_switch": 0.0, "loss_total": 0.6346538066864014, "step": 1050 }, { "batch_size": 4, "epoch": 0.42, "step": 1050, "tokens_per_device": 3160 }, { "epoch": 0.42, "loss_ce": 0.1394483894109726, "loss_lvr": 0.7556666135787964, "loss_mode_switch": 0.0, "loss_total": 0.21501505374908447, "step": 1050 }, { "batch_size": 4, "epoch": 0.42, "step": 1050, "tokens_per_device": 1556 }, { "epoch": 0.42, "loss_ce": 0.109708271920681, "loss_lvr": 0.9449064135551453, "loss_mode_switch": 0.0, "loss_total": 0.2041989117860794, "step": 1050 }, { "batch_size": 4, "epoch": 0.42, "step": 1050, "tokens_per_device": 1568 }, { "epoch": 0.42, "loss_ce": 0.30470824241638184, "loss_lvr": 1.3325245380401611, "loss_mode_switch": 0.0, "loss_total": 0.437960684299469, "step": 1050 }, { "batch_size": 4, "epoch": 0.42, "step": 1050, "tokens_per_device": 5072 }, { "epoch": 0.42, "loss_ce": 0.21063777804374695, "loss_lvr": 0.7063567042350769, "loss_mode_switch": 0.0, "loss_total": 0.2812734544277191, "step": 1050 }, { "epoch": 0.4204, "grad_norm": 1.7188489437103271, "learning_rate": 6.508075795375666e-06, "loss": 0.3384, "step": 1051 }, { "batch_size": 1, "epoch": 0.4204, "step": 1051, "tokens_per_device": 4953 }, { "epoch": 0.4204, "loss_ce": 0.001996212638914585, "loss_lvr": 0.263021856546402, "loss_mode_switch": 0.0, "loss_total": 0.028298400342464447, "step": 1051 }, { "batch_size": 1, "epoch": 0.4204, "step": 1051, "tokens_per_device": 5186 }, { "epoch": 0.4204, "loss_ce": 0.015841858461499214, "loss_lvr": 0.5184335708618164, "loss_mode_switch": 0.0, "loss_total": 0.06768521666526794, "step": 1051 }, { "batch_size": 4, "epoch": 0.4204, "step": 1051, "tokens_per_device": 5136 }, { "epoch": 0.4204, "loss_ce": 0.023525213822722435, "loss_lvr": 0.6686134338378906, "loss_mode_switch": 0.0, "loss_total": 0.09038656204938889, "step": 1051 }, { "batch_size": 4, "epoch": 0.4204, "step": 1051, "tokens_per_device": 9564 }, { "epoch": 0.4204, "loss_ce": 0.09933122247457504, "loss_lvr": 0.8650446534156799, "loss_mode_switch": 0.0, "loss_total": 0.18583568930625916, "step": 1051 }, { "batch_size": 4, "epoch": 0.4204, "step": 1051, "tokens_per_device": 2612 }, { "epoch": 0.4204, "loss_ce": 0.406917929649353, "loss_lvr": 0.9418118596076965, "loss_mode_switch": 0.0, "loss_total": 0.5010991096496582, "step": 1051 }, { "batch_size": 4, "epoch": 0.4204, "step": 1051, "tokens_per_device": 14568 }, { "epoch": 0.4204, "loss_ce": 0.24927574396133423, "loss_lvr": 1.1125690937042236, "loss_mode_switch": 0.0, "loss_total": 0.36053264141082764, "step": 1051 }, { "batch_size": 4, "epoch": 0.4204, "step": 1051, "tokens_per_device": 4540 }, { "epoch": 0.4204, "loss_ce": 0.46853920817375183, "loss_lvr": 0.8750881552696228, "loss_mode_switch": 0.0, "loss_total": 0.5560480356216431, "step": 1051 }, { "batch_size": 4, "epoch": 0.4204, "step": 1051, "tokens_per_device": 1288 }, { "epoch": 0.4204, "loss_ce": 0.5191768407821655, "loss_lvr": 1.1863774061203003, "loss_mode_switch": 0.0, "loss_total": 0.6378145813941956, "step": 1051 }, { "epoch": 0.4208, "grad_norm": 1.2630679607391357, "learning_rate": 6.5018986802194805e-06, "loss": 0.2694, "step": 1052 }, { "batch_size": 4, "epoch": 0.4208, "step": 1052, "tokens_per_device": 3864 }, { "epoch": 0.4208, "loss_ce": 0.08497541397809982, "loss_lvr": 0.7236239910125732, "loss_mode_switch": 0.0, "loss_total": 0.15733781456947327, "step": 1052 }, { "batch_size": 4, "epoch": 0.4208, "step": 1052, "tokens_per_device": 5728 }, { "epoch": 0.4208, "loss_ce": 0.4911557137966156, "loss_lvr": 0.5804992914199829, "loss_mode_switch": 0.0, "loss_total": 0.5492056608200073, "step": 1052 }, { "batch_size": 1, "epoch": 0.4208, "step": 1052, "tokens_per_device": 5006 }, { "epoch": 0.4208, "loss_ce": 0.016395602375268936, "loss_lvr": 0.4157346487045288, "loss_mode_switch": 0.0, "loss_total": 0.05796906724572182, "step": 1052 }, { "batch_size": 1, "epoch": 0.4208, "step": 1052, "tokens_per_device": 5023 }, { "epoch": 0.4208, "loss_ce": 0.010919302701950073, "loss_lvr": 0.35631465911865234, "loss_mode_switch": 0.0, "loss_total": 0.04655076935887337, "step": 1052 }, { "batch_size": 4, "epoch": 0.4208, "step": 1052, "tokens_per_device": 5496 }, { "epoch": 0.4208, "loss_ce": 0.015782013535499573, "loss_lvr": 0.7055396437644958, "loss_mode_switch": 0.0, "loss_total": 0.08633597940206528, "step": 1052 }, { "batch_size": 4, "epoch": 0.4208, "step": 1052, "tokens_per_device": 4572 }, { "epoch": 0.4208, "loss_ce": 0.5151135921478271, "loss_lvr": 0.8748682737350464, "loss_mode_switch": 0.0, "loss_total": 0.6026003956794739, "step": 1052 }, { "batch_size": 1, "epoch": 0.4208, "step": 1052, "tokens_per_device": 4931 }, { "epoch": 0.4208, "loss_ce": 0.01838746853172779, "loss_lvr": 0.3526350259780884, "loss_mode_switch": 0.0, "loss_total": 0.05365097522735596, "step": 1052 }, { "batch_size": 1, "epoch": 0.4208, "step": 1052, "tokens_per_device": 4904 }, { "epoch": 0.4208, "loss_ce": 0.1198839321732521, "loss_lvr": 0.20484556257724762, "loss_mode_switch": 0.0, "loss_total": 0.1403684914112091, "step": 1052 }, { "epoch": 0.4212, "grad_norm": 1.2972851991653442, "learning_rate": 6.495719044388409e-06, "loss": 0.2864, "step": 1053 }, { "batch_size": 1, "epoch": 0.4212, "step": 1053, "tokens_per_device": 5096 }, { "epoch": 0.4212, "loss_ce": 0.04586813226342201, "loss_lvr": 0.2960360050201416, "loss_mode_switch": 0.0, "loss_total": 0.07547172904014587, "step": 1053 }, { "batch_size": 1, "epoch": 0.4212, "step": 1053, "tokens_per_device": 5008 }, { "epoch": 0.4212, "loss_ce": 0.03729529306292534, "loss_lvr": 1.6809420585632324, "loss_mode_switch": 0.0, "loss_total": 0.20538949966430664, "step": 1053 }, { "batch_size": 4, "epoch": 0.4212, "step": 1053, "tokens_per_device": 6464 }, { "epoch": 0.4212, "loss_ce": 0.6327721476554871, "loss_lvr": 0.5499283671379089, "loss_mode_switch": 0.0, "loss_total": 0.6877650022506714, "step": 1053 }, { "batch_size": 4, "epoch": 0.4212, "step": 1053, "tokens_per_device": 10152 }, { "epoch": 0.4212, "loss_ce": 0.1943962574005127, "loss_lvr": 0.45438843965530396, "loss_mode_switch": 0.0, "loss_total": 0.23983509838581085, "step": 1053 }, { "batch_size": 4, "epoch": 0.4212, "step": 1053, "tokens_per_device": 8376 }, { "epoch": 0.4212, "loss_ce": 0.15788669884204865, "loss_lvr": 0.8111526966094971, "loss_mode_switch": 0.0, "loss_total": 0.23900195956230164, "step": 1053 }, { "batch_size": 4, "epoch": 0.4212, "step": 1053, "tokens_per_device": 4544 }, { "epoch": 0.4212, "loss_ce": 0.6302369236946106, "loss_lvr": 0.917011022567749, "loss_mode_switch": 0.0, "loss_total": 0.7219380140304565, "step": 1053 }, { "batch_size": 1, "epoch": 0.4212, "step": 1053, "tokens_per_device": 5149 }, { "epoch": 0.4212, "loss_ce": 0.008717752993106842, "loss_lvr": 0.43092361092567444, "loss_mode_switch": 0.0, "loss_total": 0.051810115575790405, "step": 1053 }, { "batch_size": 1, "epoch": 0.4212, "step": 1053, "tokens_per_device": 5032 }, { "epoch": 0.4212, "loss_ce": 0.007240933831781149, "loss_lvr": 0.33268824219703674, "loss_mode_switch": 0.0, "loss_total": 0.04050975665450096, "step": 1053 }, { "epoch": 0.4216, "grad_norm": 1.218768835067749, "learning_rate": 6.489536898253893e-06, "loss": 0.2947, "step": 1054 }, { "batch_size": 4, "epoch": 0.4216, "step": 1054, "tokens_per_device": 5144 }, { "epoch": 0.4216, "loss_ce": 0.2004079967737198, "loss_lvr": 0.9005174040794373, "loss_mode_switch": 0.0, "loss_total": 0.2904597520828247, "step": 1054 }, { "batch_size": 4, "epoch": 0.4216, "step": 1054, "tokens_per_device": 2532 }, { "epoch": 0.4216, "loss_ce": 0.451834499835968, "loss_lvr": 1.0568902492523193, "loss_mode_switch": 0.0, "loss_total": 0.5575235486030579, "step": 1054 }, { "batch_size": 4, "epoch": 0.4216, "step": 1054, "tokens_per_device": 1776 }, { "epoch": 0.4216, "loss_ce": 0.1570100039243698, "loss_lvr": 0.9210522770881653, "loss_mode_switch": 0.0, "loss_total": 0.2491152286529541, "step": 1054 }, { "batch_size": 4, "epoch": 0.4216, "step": 1054, "tokens_per_device": 3804 }, { "epoch": 0.4216, "loss_ce": 0.03763515502214432, "loss_lvr": 0.8521550893783569, "loss_mode_switch": 0.0, "loss_total": 0.12285066395998001, "step": 1054 }, { "batch_size": 4, "epoch": 0.4216, "step": 1054, "tokens_per_device": 1508 }, { "epoch": 0.4216, "loss_ce": 0.11747875809669495, "loss_lvr": 1.333311676979065, "loss_mode_switch": 0.0, "loss_total": 0.250809907913208, "step": 1054 }, { "batch_size": 4, "epoch": 0.4216, "step": 1054, "tokens_per_device": 9072 }, { "epoch": 0.4216, "loss_ce": 0.08878166228532791, "loss_lvr": 0.8409001231193542, "loss_mode_switch": 0.0, "loss_total": 0.1728716790676117, "step": 1054 }, { "batch_size": 4, "epoch": 0.4216, "step": 1054, "tokens_per_device": 3588 }, { "epoch": 0.4216, "loss_ce": 0.1570541113615036, "loss_lvr": 0.9448935389518738, "loss_mode_switch": 0.0, "loss_total": 0.25154346227645874, "step": 1054 }, { "batch_size": 4, "epoch": 0.4216, "step": 1054, "tokens_per_device": 5420 }, { "epoch": 0.4216, "loss_ce": 0.043714769184589386, "loss_lvr": 1.0820707082748413, "loss_mode_switch": 0.0, "loss_total": 0.1519218385219574, "step": 1054 }, { "epoch": 0.422, "grad_norm": 1.3864071369171143, "learning_rate": 6.483352252191585e-06, "loss": 0.2897, "step": 1055 }, { "batch_size": 4, "epoch": 0.422, "step": 1055, "tokens_per_device": 6056 }, { "epoch": 0.422, "loss_ce": 0.024723678827285767, "loss_lvr": 1.326254963874817, "loss_mode_switch": 0.0, "loss_total": 0.15734918415546417, "step": 1055 }, { "batch_size": 1, "epoch": 0.422, "step": 1055, "tokens_per_device": 5069 }, { "epoch": 0.422, "loss_ce": 0.0008949150796979666, "loss_lvr": 0.40759652853012085, "loss_mode_switch": 0.0, "loss_total": 0.041654568165540695, "step": 1055 }, { "batch_size": 4, "epoch": 0.422, "step": 1055, "tokens_per_device": 5244 }, { "epoch": 0.422, "loss_ce": 0.31772246956825256, "loss_lvr": 0.7724752426147461, "loss_mode_switch": 0.0, "loss_total": 0.39496999979019165, "step": 1055 }, { "batch_size": 1, "epoch": 0.422, "step": 1055, "tokens_per_device": 4741 }, { "epoch": 0.422, "loss_ce": 0.001310671679675579, "loss_lvr": 0.2504100501537323, "loss_mode_switch": 0.0, "loss_total": 0.026351675391197205, "step": 1055 }, { "batch_size": 1, "epoch": 0.422, "step": 1055, "tokens_per_device": 4932 }, { "epoch": 0.422, "loss_ce": 0.04632040858268738, "loss_lvr": 0.6710950136184692, "loss_mode_switch": 0.0, "loss_total": 0.11342991143465042, "step": 1055 }, { "batch_size": 4, "epoch": 0.422, "step": 1055, "tokens_per_device": 2552 }, { "epoch": 0.422, "loss_ce": 0.3627990186214447, "loss_lvr": 1.1002522706985474, "loss_mode_switch": 0.0, "loss_total": 0.47282424569129944, "step": 1055 }, { "batch_size": 1, "epoch": 0.422, "step": 1055, "tokens_per_device": 5092 }, { "epoch": 0.422, "loss_ce": 0.06920607388019562, "loss_lvr": 0.3367573916912079, "loss_mode_switch": 0.0, "loss_total": 0.10288181900978088, "step": 1055 }, { "batch_size": 1, "epoch": 0.422, "step": 1055, "tokens_per_device": 5284 }, { "epoch": 0.422, "loss_ce": 0.05131809785962105, "loss_lvr": 0.449474036693573, "loss_mode_switch": 0.0, "loss_total": 0.09626550227403641, "step": 1055 }, { "epoch": 0.4224, "grad_norm": 1.4062350988388062, "learning_rate": 6.4771651165813345e-06, "loss": 0.3328, "step": 1056 }, { "batch_size": 4, "epoch": 0.4224, "step": 1056, "tokens_per_device": 4532 }, { "epoch": 0.4224, "loss_ce": 0.47967270016670227, "loss_lvr": 0.7399605512619019, "loss_mode_switch": 0.0, "loss_total": 0.553668737411499, "step": 1056 }, { "batch_size": 4, "epoch": 0.4224, "step": 1056, "tokens_per_device": 5308 }, { "epoch": 0.4224, "loss_ce": 0.021534254774451256, "loss_lvr": 0.7490469813346863, "loss_mode_switch": 0.0, "loss_total": 0.09643895924091339, "step": 1056 }, { "batch_size": 4, "epoch": 0.4224, "step": 1056, "tokens_per_device": 1388 }, { "epoch": 0.4224, "loss_ce": 0.6333969831466675, "loss_lvr": 1.0078091621398926, "loss_mode_switch": 0.0, "loss_total": 0.7341778874397278, "step": 1056 }, { "batch_size": 4, "epoch": 0.4224, "step": 1056, "tokens_per_device": 5296 }, { "epoch": 0.4224, "loss_ce": 0.26286566257476807, "loss_lvr": 0.7722225189208984, "loss_mode_switch": 0.0, "loss_total": 0.3400879204273224, "step": 1056 }, { "batch_size": 4, "epoch": 0.4224, "step": 1056, "tokens_per_device": 2584 }, { "epoch": 0.4224, "loss_ce": 0.5184623003005981, "loss_lvr": 0.8896241784095764, "loss_mode_switch": 0.0, "loss_total": 0.6074247360229492, "step": 1056 }, { "batch_size": 4, "epoch": 0.4224, "step": 1056, "tokens_per_device": 4648 }, { "epoch": 0.4224, "loss_ce": 0.25567004084587097, "loss_lvr": 0.8086279630661011, "loss_mode_switch": 0.0, "loss_total": 0.3365328311920166, "step": 1056 }, { "batch_size": 4, "epoch": 0.4224, "step": 1056, "tokens_per_device": 4296 }, { "epoch": 0.4224, "loss_ce": 0.5678699016571045, "loss_lvr": 1.0938788652420044, "loss_mode_switch": 0.0, "loss_total": 0.677257776260376, "step": 1056 }, { "batch_size": 4, "epoch": 0.4224, "step": 1056, "tokens_per_device": 8348 }, { "epoch": 0.4224, "loss_ce": 0.3241382837295532, "loss_lvr": 0.5609421133995056, "loss_mode_switch": 0.0, "loss_total": 0.38023248314857483, "step": 1056 }, { "epoch": 0.4228, "grad_norm": 1.4168962240219116, "learning_rate": 6.4709755018071685e-06, "loss": 0.3288, "step": 1057 }, { "batch_size": 4, "epoch": 0.4228, "step": 1057, "tokens_per_device": 4296 }, { "epoch": 0.4228, "loss_ce": 0.43971508741378784, "loss_lvr": 0.6731356978416443, "loss_mode_switch": 0.0, "loss_total": 0.5070286393165588, "step": 1057 }, { "batch_size": 4, "epoch": 0.4228, "step": 1057, "tokens_per_device": 5812 }, { "epoch": 0.4228, "loss_ce": 0.09077198058366776, "loss_lvr": 0.7901532053947449, "loss_mode_switch": 0.0, "loss_total": 0.16978730261325836, "step": 1057 }, { "batch_size": 4, "epoch": 0.4228, "step": 1057, "tokens_per_device": 3788 }, { "epoch": 0.4228, "loss_ce": 0.017011288553476334, "loss_lvr": 0.9568406939506531, "loss_mode_switch": 0.0, "loss_total": 0.1126953661441803, "step": 1057 }, { "batch_size": 4, "epoch": 0.4228, "step": 1057, "tokens_per_device": 6260 }, { "epoch": 0.4228, "loss_ce": 0.08719386160373688, "loss_lvr": 0.5997788906097412, "loss_mode_switch": 0.0, "loss_total": 0.147171750664711, "step": 1057 }, { "batch_size": 1, "epoch": 0.4228, "step": 1057, "tokens_per_device": 5115 }, { "epoch": 0.4228, "loss_ce": 0.09069964289665222, "loss_lvr": 0.3422084152698517, "loss_mode_switch": 0.0, "loss_total": 0.12492048740386963, "step": 1057 }, { "batch_size": 4, "epoch": 0.4228, "step": 1057, "tokens_per_device": 2056 }, { "epoch": 0.4228, "loss_ce": 0.5750018954277039, "loss_lvr": 1.1157073974609375, "loss_mode_switch": 0.0, "loss_total": 0.6865726113319397, "step": 1057 }, { "batch_size": 4, "epoch": 0.4228, "step": 1057, "tokens_per_device": 15228 }, { "epoch": 0.4228, "loss_ce": 0.8256399035453796, "loss_lvr": 0.9144914150238037, "loss_mode_switch": 0.0, "loss_total": 0.91708904504776, "step": 1057 }, { "batch_size": 4, "epoch": 0.4228, "step": 1057, "tokens_per_device": 4860 }, { "epoch": 0.4228, "loss_ce": 0.22673757374286652, "loss_lvr": 0.5262340903282166, "loss_mode_switch": 0.0, "loss_total": 0.27936097979545593, "step": 1057 }, { "epoch": 0.4232, "grad_norm": 1.3871911764144897, "learning_rate": 6.464783418257278e-06, "loss": 0.3471, "step": 1058 }, { "batch_size": 1, "epoch": 0.4232, "step": 1058, "tokens_per_device": 4911 }, { "epoch": 0.4232, "loss_ce": 0.19235080480575562, "loss_lvr": 0.3985384404659271, "loss_mode_switch": 0.0, "loss_total": 0.2322046458721161, "step": 1058 }, { "batch_size": 1, "epoch": 0.4232, "step": 1058, "tokens_per_device": 4741 }, { "epoch": 0.4232, "loss_ce": 0.19595053791999817, "loss_lvr": 0.509061872959137, "loss_mode_switch": 0.0, "loss_total": 0.2468567192554474, "step": 1058 }, { "batch_size": 4, "epoch": 0.4232, "step": 1058, "tokens_per_device": 5696 }, { "epoch": 0.4232, "loss_ce": 0.0920797660946846, "loss_lvr": 0.965283215045929, "loss_mode_switch": 0.0, "loss_total": 0.1886080801486969, "step": 1058 }, { "batch_size": 4, "epoch": 0.4232, "step": 1058, "tokens_per_device": 4224 }, { "epoch": 0.4232, "loss_ce": 0.23993022739887238, "loss_lvr": 1.0647631883621216, "loss_mode_switch": 0.0, "loss_total": 0.3464065492153168, "step": 1058 }, { "batch_size": 1, "epoch": 0.4232, "step": 1058, "tokens_per_device": 5015 }, { "epoch": 0.4232, "loss_ce": 0.014852424152195454, "loss_lvr": 0.6157532930374146, "loss_mode_switch": 0.0, "loss_total": 0.07642775774002075, "step": 1058 }, { "batch_size": 4, "epoch": 0.4232, "step": 1058, "tokens_per_device": 1824 }, { "epoch": 0.4232, "loss_ce": 0.4070638418197632, "loss_lvr": 1.0144002437591553, "loss_mode_switch": 0.0, "loss_total": 0.5085038542747498, "step": 1058 }, { "batch_size": 4, "epoch": 0.4232, "step": 1058, "tokens_per_device": 7412 }, { "epoch": 0.4232, "loss_ce": 0.23045560717582703, "loss_lvr": 1.3426693677902222, "loss_mode_switch": 0.0, "loss_total": 0.3647225499153137, "step": 1058 }, { "batch_size": 4, "epoch": 0.4232, "step": 1058, "tokens_per_device": 4836 }, { "epoch": 0.4232, "loss_ce": 0.37494078278541565, "loss_lvr": 0.8384962677955627, "loss_mode_switch": 0.0, "loss_total": 0.4587904214859009, "step": 1058 }, { "epoch": 0.4236, "grad_norm": 1.2873215675354004, "learning_rate": 6.45858887632399e-06, "loss": 0.3259, "step": 1059 }, { "batch_size": 4, "epoch": 0.4236, "step": 1059, "tokens_per_device": 8012 }, { "epoch": 0.4236, "loss_ce": 0.40596333146095276, "loss_lvr": 0.7419033050537109, "loss_mode_switch": 0.0, "loss_total": 0.4801536798477173, "step": 1059 }, { "batch_size": 1, "epoch": 0.4236, "step": 1059, "tokens_per_device": 5107 }, { "epoch": 0.4236, "loss_ce": 0.012490352615714073, "loss_lvr": 0.37410444021224976, "loss_mode_switch": 0.0, "loss_total": 0.04990079998970032, "step": 1059 }, { "batch_size": 4, "epoch": 0.4236, "step": 1059, "tokens_per_device": 4480 }, { "epoch": 0.4236, "loss_ce": 0.11097672581672668, "loss_lvr": 0.8055575489997864, "loss_mode_switch": 0.0, "loss_total": 0.19153249263763428, "step": 1059 }, { "batch_size": 4, "epoch": 0.4236, "step": 1059, "tokens_per_device": 1360 }, { "epoch": 0.4236, "loss_ce": 0.6430141925811768, "loss_lvr": 1.172229290008545, "loss_mode_switch": 0.0, "loss_total": 0.7602370977401733, "step": 1059 }, { "batch_size": 1, "epoch": 0.4236, "step": 1059, "tokens_per_device": 4796 }, { "epoch": 0.4236, "loss_ce": 0.12680895626544952, "loss_lvr": 0.4283173084259033, "loss_mode_switch": 0.0, "loss_total": 0.1696406900882721, "step": 1059 }, { "batch_size": 4, "epoch": 0.4236, "step": 1059, "tokens_per_device": 6924 }, { "epoch": 0.4236, "loss_ce": 0.14874079823493958, "loss_lvr": 0.7673106789588928, "loss_mode_switch": 0.0, "loss_total": 0.2254718691110611, "step": 1059 }, { "batch_size": 4, "epoch": 0.4236, "step": 1059, "tokens_per_device": 1640 }, { "epoch": 0.4236, "loss_ce": 0.5715179443359375, "loss_lvr": 0.9256909489631653, "loss_mode_switch": 0.0, "loss_total": 0.6640870571136475, "step": 1059 }, { "batch_size": 4, "epoch": 0.4236, "step": 1059, "tokens_per_device": 4368 }, { "epoch": 0.4236, "loss_ce": 0.15439260005950928, "loss_lvr": 0.7394111156463623, "loss_mode_switch": 0.0, "loss_total": 0.2283337116241455, "step": 1059 }, { "epoch": 0.424, "grad_norm": 1.4081941843032837, "learning_rate": 6.452391886403767e-06, "loss": 0.2888, "step": 1060 }, { "batch_size": 1, "epoch": 0.424, "step": 1060, "tokens_per_device": 5128 }, { "epoch": 0.424, "loss_ce": 0.25714588165283203, "loss_lvr": 0.3318420648574829, "loss_mode_switch": 0.0, "loss_total": 0.29033008217811584, "step": 1060 }, { "batch_size": 1, "epoch": 0.424, "step": 1060, "tokens_per_device": 4860 }, { "epoch": 0.424, "loss_ce": 0.011519956402480602, "loss_lvr": 0.19518956542015076, "loss_mode_switch": 0.0, "loss_total": 0.031038913875818253, "step": 1060 }, { "batch_size": 4, "epoch": 0.424, "step": 1060, "tokens_per_device": 3652 }, { "epoch": 0.424, "loss_ce": 0.03469575569033623, "loss_lvr": 0.9092373847961426, "loss_mode_switch": 0.0, "loss_total": 0.12561950087547302, "step": 1060 }, { "batch_size": 4, "epoch": 0.424, "step": 1060, "tokens_per_device": 4248 }, { "epoch": 0.424, "loss_ce": 0.1990753412246704, "loss_lvr": 0.7585856914520264, "loss_mode_switch": 0.0, "loss_total": 0.27493390440940857, "step": 1060 }, { "batch_size": 4, "epoch": 0.424, "step": 1060, "tokens_per_device": 2764 }, { "epoch": 0.424, "loss_ce": 0.41635993123054504, "loss_lvr": 0.8280315399169922, "loss_mode_switch": 0.0, "loss_total": 0.49916309118270874, "step": 1060 }, { "batch_size": 1, "epoch": 0.424, "step": 1060, "tokens_per_device": 5103 }, { "epoch": 0.424, "loss_ce": 0.0011765360832214355, "loss_lvr": 0.2297624796628952, "loss_mode_switch": 0.0, "loss_total": 0.024152783676981926, "step": 1060 }, { "batch_size": 4, "epoch": 0.424, "step": 1060, "tokens_per_device": 4072 }, { "epoch": 0.424, "loss_ce": 0.7665985226631165, "loss_lvr": 0.9614949226379395, "loss_mode_switch": 0.0, "loss_total": 0.8627480268478394, "step": 1060 }, { "batch_size": 4, "epoch": 0.424, "step": 1060, "tokens_per_device": 3704 }, { "epoch": 0.424, "loss_ce": 0.11218422651290894, "loss_lvr": 0.8604094386100769, "loss_mode_switch": 0.0, "loss_total": 0.19822517037391663, "step": 1060 }, { "epoch": 0.4244, "grad_norm": 1.08424973487854, "learning_rate": 6.446192458897174e-06, "loss": 0.2269, "step": 1061 }, { "batch_size": 1, "epoch": 0.4244, "step": 1061, "tokens_per_device": 5120 }, { "epoch": 0.4244, "loss_ce": 0.005708691198378801, "loss_lvr": 0.4372379183769226, "loss_mode_switch": 0.0, "loss_total": 0.049432482570409775, "step": 1061 }, { "batch_size": 4, "epoch": 0.4244, "step": 1061, "tokens_per_device": 5328 }, { "epoch": 0.4244, "loss_ce": 0.2916575074195862, "loss_lvr": 0.7218477129936218, "loss_mode_switch": 0.0, "loss_total": 0.36384227871894836, "step": 1061 }, { "batch_size": 4, "epoch": 0.4244, "step": 1061, "tokens_per_device": 10832 }, { "epoch": 0.4244, "loss_ce": 0.07325562834739685, "loss_lvr": 0.5588325262069702, "loss_mode_switch": 0.0, "loss_total": 0.12913888692855835, "step": 1061 }, { "batch_size": 1, "epoch": 0.4244, "step": 1061, "tokens_per_device": 5118 }, { "epoch": 0.4244, "loss_ce": 0.09223158657550812, "loss_lvr": 0.6759626865386963, "loss_mode_switch": 0.0, "loss_total": 0.15982785820960999, "step": 1061 }, { "batch_size": 4, "epoch": 0.4244, "step": 1061, "tokens_per_device": 4444 }, { "epoch": 0.4244, "loss_ce": 0.02335616946220398, "loss_lvr": 0.4655672311782837, "loss_mode_switch": 0.0, "loss_total": 0.06991289556026459, "step": 1061 }, { "batch_size": 4, "epoch": 0.4244, "step": 1061, "tokens_per_device": 2140 }, { "epoch": 0.4244, "loss_ce": 0.35754984617233276, "loss_lvr": 1.0443061590194702, "loss_mode_switch": 0.0, "loss_total": 0.4619804620742798, "step": 1061 }, { "batch_size": 1, "epoch": 0.4244, "step": 1061, "tokens_per_device": 4994 }, { "epoch": 0.4244, "loss_ce": 0.20130655169487, "loss_lvr": 0.40044915676116943, "loss_mode_switch": 0.0, "loss_total": 0.24135147035121918, "step": 1061 }, { "batch_size": 4, "epoch": 0.4244, "step": 1061, "tokens_per_device": 4548 }, { "epoch": 0.4244, "loss_ce": 0.3696480393409729, "loss_lvr": 1.0068329572677612, "loss_mode_switch": 0.0, "loss_total": 0.4703313410282135, "step": 1061 }, { "epoch": 0.4248, "grad_norm": 1.4616703987121582, "learning_rate": 6.439990604208868e-06, "loss": 0.2731, "step": 1062 }, { "batch_size": 4, "epoch": 0.4248, "step": 1062, "tokens_per_device": 1528 }, { "epoch": 0.4248, "loss_ce": 0.5972234010696411, "loss_lvr": 1.1879194974899292, "loss_mode_switch": 0.0, "loss_total": 0.7160153388977051, "step": 1062 }, { "batch_size": 1, "epoch": 0.4248, "step": 1062, "tokens_per_device": 4893 }, { "epoch": 0.4248, "loss_ce": 0.017040695995092392, "loss_lvr": 0.3045547902584076, "loss_mode_switch": 0.0, "loss_total": 0.04749617725610733, "step": 1062 }, { "batch_size": 4, "epoch": 0.4248, "step": 1062, "tokens_per_device": 4196 }, { "epoch": 0.4248, "loss_ce": 0.09773216396570206, "loss_lvr": 0.8526925444602966, "loss_mode_switch": 0.0, "loss_total": 0.18300142884254456, "step": 1062 }, { "batch_size": 4, "epoch": 0.4248, "step": 1062, "tokens_per_device": 3764 }, { "epoch": 0.4248, "loss_ce": 0.11386680603027344, "loss_lvr": 0.7295949459075928, "loss_mode_switch": 0.0, "loss_total": 0.18682630360126495, "step": 1062 }, { "batch_size": 4, "epoch": 0.4248, "step": 1062, "tokens_per_device": 2320 }, { "epoch": 0.4248, "loss_ce": 0.3208294212818146, "loss_lvr": 0.8822227716445923, "loss_mode_switch": 0.0, "loss_total": 0.40905171632766724, "step": 1062 }, { "batch_size": 4, "epoch": 0.4248, "step": 1062, "tokens_per_device": 1548 }, { "epoch": 0.4248, "loss_ce": 0.4193483591079712, "loss_lvr": 1.0512890815734863, "loss_mode_switch": 0.0, "loss_total": 0.5244772434234619, "step": 1062 }, { "batch_size": 4, "epoch": 0.4248, "step": 1062, "tokens_per_device": 4304 }, { "epoch": 0.4248, "loss_ce": 0.11878939718008041, "loss_lvr": 0.698918342590332, "loss_mode_switch": 0.0, "loss_total": 0.1886812299489975, "step": 1062 }, { "batch_size": 1, "epoch": 0.4248, "step": 1062, "tokens_per_device": 4979 }, { "epoch": 0.4248, "loss_ce": 0.6313456892967224, "loss_lvr": 0.2015310823917389, "loss_mode_switch": 0.0, "loss_total": 0.6514987945556641, "step": 1062 }, { "epoch": 0.4252, "grad_norm": 1.3174630403518677, "learning_rate": 6.433786332747578e-06, "loss": 0.3394, "step": 1063 }, { "batch_size": 4, "epoch": 0.4252, "step": 1063, "tokens_per_device": 2688 }, { "epoch": 0.4252, "loss_ce": 0.19595354795455933, "loss_lvr": 0.837407648563385, "loss_mode_switch": 0.0, "loss_total": 0.2796943187713623, "step": 1063 }, { "batch_size": 4, "epoch": 0.4252, "step": 1063, "tokens_per_device": 8580 }, { "epoch": 0.4252, "loss_ce": 0.17128874361515045, "loss_lvr": 0.7868083715438843, "loss_mode_switch": 0.0, "loss_total": 0.24996957182884216, "step": 1063 }, { "batch_size": 1, "epoch": 0.4252, "step": 1063, "tokens_per_device": 5049 }, { "epoch": 0.4252, "loss_ce": 0.0935344472527504, "loss_lvr": 0.3998620808124542, "loss_mode_switch": 0.0, "loss_total": 0.13352066278457642, "step": 1063 }, { "batch_size": 4, "epoch": 0.4252, "step": 1063, "tokens_per_device": 5140 }, { "epoch": 0.4252, "loss_ce": 0.5302549004554749, "loss_lvr": 0.7729794979095459, "loss_mode_switch": 0.0, "loss_total": 0.6075528264045715, "step": 1063 }, { "batch_size": 4, "epoch": 0.4252, "step": 1063, "tokens_per_device": 13592 }, { "epoch": 0.4252, "loss_ce": 0.12844227254390717, "loss_lvr": 0.452380895614624, "loss_mode_switch": 0.0, "loss_total": 0.1736803650856018, "step": 1063 }, { "batch_size": 1, "epoch": 0.4252, "step": 1063, "tokens_per_device": 6382 }, { "epoch": 0.4252, "loss_ce": 0.10243892669677734, "loss_lvr": 0.4083883464336395, "loss_mode_switch": 0.0, "loss_total": 0.14327776432037354, "step": 1063 }, { "batch_size": 4, "epoch": 0.4252, "step": 1063, "tokens_per_device": 4476 }, { "epoch": 0.4252, "loss_ce": 0.034668952226638794, "loss_lvr": 0.8010162711143494, "loss_mode_switch": 0.0, "loss_total": 0.11477058380842209, "step": 1063 }, { "batch_size": 4, "epoch": 0.4252, "step": 1063, "tokens_per_device": 7736 }, { "epoch": 0.4252, "loss_ce": 0.2563555836677551, "loss_lvr": 0.7313125729560852, "loss_mode_switch": 0.0, "loss_total": 0.3294868469238281, "step": 1063 }, { "epoch": 0.4256, "grad_norm": 1.3854680061340332, "learning_rate": 6.427579654926095e-06, "loss": 0.2533, "step": 1064 }, { "batch_size": 4, "epoch": 0.4256, "step": 1064, "tokens_per_device": 4588 }, { "epoch": 0.4256, "loss_ce": 0.5414785742759705, "loss_lvr": 0.9232439398765564, "loss_mode_switch": 0.0, "loss_total": 0.6338029503822327, "step": 1064 }, { "batch_size": 4, "epoch": 0.4256, "step": 1064, "tokens_per_device": 6232 }, { "epoch": 0.4256, "loss_ce": 0.5373260378837585, "loss_lvr": 0.8632793426513672, "loss_mode_switch": 0.0, "loss_total": 0.6236539483070374, "step": 1064 }, { "batch_size": 1, "epoch": 0.4256, "step": 1064, "tokens_per_device": 5173 }, { "epoch": 0.4256, "loss_ce": 0.28350377082824707, "loss_lvr": 0.655081570148468, "loss_mode_switch": 0.0, "loss_total": 0.34901192784309387, "step": 1064 }, { "batch_size": 4, "epoch": 0.4256, "step": 1064, "tokens_per_device": 3620 }, { "epoch": 0.4256, "loss_ce": 0.39634349942207336, "loss_lvr": 0.43536853790283203, "loss_mode_switch": 0.0, "loss_total": 0.4398803412914276, "step": 1064 }, { "batch_size": 4, "epoch": 0.4256, "step": 1064, "tokens_per_device": 1512 }, { "epoch": 0.4256, "loss_ce": 0.38883426785469055, "loss_lvr": 1.3486248254776, "loss_mode_switch": 0.0, "loss_total": 0.523696780204773, "step": 1064 }, { "batch_size": 4, "epoch": 0.4256, "step": 1064, "tokens_per_device": 10688 }, { "epoch": 0.4256, "loss_ce": 0.4583339989185333, "loss_lvr": 0.7193483114242554, "loss_mode_switch": 0.0, "loss_total": 0.5302688479423523, "step": 1064 }, { "batch_size": 1, "epoch": 0.4256, "step": 1064, "tokens_per_device": 5009 }, { "epoch": 0.4256, "loss_ce": 0.23798425495624542, "loss_lvr": 0.5634657144546509, "loss_mode_switch": 0.0, "loss_total": 0.2943308353424072, "step": 1064 }, { "batch_size": 1, "epoch": 0.4256, "step": 1064, "tokens_per_device": 5147 }, { "epoch": 0.4256, "loss_ce": 0.0015915549593046308, "loss_lvr": 0.35662779211997986, "loss_mode_switch": 0.0, "loss_total": 0.03725433722138405, "step": 1064 }, { "epoch": 0.426, "grad_norm": 1.3871986865997314, "learning_rate": 6.421370581161244e-06, "loss": 0.313, "step": 1065 }, { "batch_size": 1, "epoch": 0.426, "step": 1065, "tokens_per_device": 5064 }, { "epoch": 0.426, "loss_ce": 0.056298352777957916, "loss_lvr": 0.6523832678794861, "loss_mode_switch": 0.0, "loss_total": 0.12153667956590652, "step": 1065 }, { "batch_size": 4, "epoch": 0.426, "step": 1065, "tokens_per_device": 4492 }, { "epoch": 0.426, "loss_ce": 0.36007946729660034, "loss_lvr": 0.8193874359130859, "loss_mode_switch": 0.0, "loss_total": 0.44201821088790894, "step": 1065 }, { "batch_size": 4, "epoch": 0.426, "step": 1065, "tokens_per_device": 8268 }, { "epoch": 0.426, "loss_ce": 0.2725438177585602, "loss_lvr": 1.0708136558532715, "loss_mode_switch": 0.0, "loss_total": 0.37962520122528076, "step": 1065 }, { "batch_size": 4, "epoch": 0.426, "step": 1065, "tokens_per_device": 3784 }, { "epoch": 0.426, "loss_ce": 0.6861570477485657, "loss_lvr": 1.1691036224365234, "loss_mode_switch": 0.0, "loss_total": 0.8030673861503601, "step": 1065 }, { "batch_size": 4, "epoch": 0.426, "step": 1065, "tokens_per_device": 5936 }, { "epoch": 0.426, "loss_ce": 0.40174198150634766, "loss_lvr": 0.7305746078491211, "loss_mode_switch": 0.0, "loss_total": 0.4747994542121887, "step": 1065 }, { "batch_size": 4, "epoch": 0.426, "step": 1065, "tokens_per_device": 5792 }, { "epoch": 0.426, "loss_ce": 0.3382636308670044, "loss_lvr": 0.6059966087341309, "loss_mode_switch": 0.0, "loss_total": 0.398863285779953, "step": 1065 }, { "batch_size": 4, "epoch": 0.426, "step": 1065, "tokens_per_device": 5868 }, { "epoch": 0.426, "loss_ce": 0.08553773909807205, "loss_lvr": 1.3124092817306519, "loss_mode_switch": 0.0, "loss_total": 0.21677866578102112, "step": 1065 }, { "batch_size": 4, "epoch": 0.426, "step": 1065, "tokens_per_device": 4080 }, { "epoch": 0.426, "loss_ce": 0.45514461398124695, "loss_lvr": 0.9794865846633911, "loss_mode_switch": 0.0, "loss_total": 0.5530932545661926, "step": 1065 }, { "epoch": 0.4264, "grad_norm": 1.3534903526306152, "learning_rate": 6.415159121873868e-06, "loss": 0.3147, "step": 1066 }, { "batch_size": 4, "epoch": 0.4264, "step": 1066, "tokens_per_device": 3924 }, { "epoch": 0.4264, "loss_ce": 0.2508520483970642, "loss_lvr": 0.6805726289749146, "loss_mode_switch": 0.0, "loss_total": 0.31890931725502014, "step": 1066 }, { "batch_size": 1, "epoch": 0.4264, "step": 1066, "tokens_per_device": 4548 }, { "epoch": 0.4264, "loss_ce": 0.6242218017578125, "loss_lvr": 0.5151182413101196, "loss_mode_switch": 0.0, "loss_total": 0.6757336258888245, "step": 1066 }, { "batch_size": 4, "epoch": 0.4264, "step": 1066, "tokens_per_device": 5220 }, { "epoch": 0.4264, "loss_ce": 0.6289114356040955, "loss_lvr": 0.7647603750228882, "loss_mode_switch": 0.0, "loss_total": 0.7053874731063843, "step": 1066 }, { "batch_size": 1, "epoch": 0.4264, "step": 1066, "tokens_per_device": 5255 }, { "epoch": 0.4264, "loss_ce": 0.2632109820842743, "loss_lvr": 0.4553605616092682, "loss_mode_switch": 0.0, "loss_total": 0.3087470531463623, "step": 1066 }, { "batch_size": 4, "epoch": 0.4264, "step": 1066, "tokens_per_device": 4928 }, { "epoch": 0.4264, "loss_ce": 0.04338197037577629, "loss_lvr": 1.5072578191757202, "loss_mode_switch": 0.0, "loss_total": 0.1941077560186386, "step": 1066 }, { "batch_size": 1, "epoch": 0.4264, "step": 1066, "tokens_per_device": 5114 }, { "epoch": 0.4264, "loss_ce": 0.04312426596879959, "loss_lvr": 0.35816246271133423, "loss_mode_switch": 0.0, "loss_total": 0.0789405107498169, "step": 1066 }, { "batch_size": 4, "epoch": 0.4264, "step": 1066, "tokens_per_device": 3596 }, { "epoch": 0.4264, "loss_ce": 0.09465907514095306, "loss_lvr": 0.8529481291770935, "loss_mode_switch": 0.0, "loss_total": 0.17995388805866241, "step": 1066 }, { "batch_size": 4, "epoch": 0.4264, "step": 1066, "tokens_per_device": 4224 }, { "epoch": 0.4264, "loss_ce": 0.17491599917411804, "loss_lvr": 1.1378813982009888, "loss_mode_switch": 0.0, "loss_total": 0.28870415687561035, "step": 1066 }, { "epoch": 0.4268, "grad_norm": 1.2546128034591675, "learning_rate": 6.408945287488824e-06, "loss": 0.2957, "step": 1067 }, { "batch_size": 1, "epoch": 0.4268, "step": 1067, "tokens_per_device": 5130 }, { "epoch": 0.4268, "loss_ce": 0.00569173414260149, "loss_lvr": 0.5129624605178833, "loss_mode_switch": 0.0, "loss_total": 0.056987978518009186, "step": 1067 }, { "batch_size": 4, "epoch": 0.4268, "step": 1067, "tokens_per_device": 4140 }, { "epoch": 0.4268, "loss_ce": 0.3322056233882904, "loss_lvr": 0.9152078628540039, "loss_mode_switch": 0.0, "loss_total": 0.4237264096736908, "step": 1067 }, { "batch_size": 1, "epoch": 0.4268, "step": 1067, "tokens_per_device": 4903 }, { "epoch": 0.4268, "loss_ce": 0.0027374224737286568, "loss_lvr": 0.24518723785877228, "loss_mode_switch": 0.0, "loss_total": 0.02725614607334137, "step": 1067 }, { "batch_size": 4, "epoch": 0.4268, "step": 1067, "tokens_per_device": 4588 }, { "epoch": 0.4268, "loss_ce": 0.4617285430431366, "loss_lvr": 0.9337276816368103, "loss_mode_switch": 0.0, "loss_total": 0.5551013350486755, "step": 1067 }, { "batch_size": 4, "epoch": 0.4268, "step": 1067, "tokens_per_device": 3740 }, { "epoch": 0.4268, "loss_ce": 0.0852622389793396, "loss_lvr": 1.2945103645324707, "loss_mode_switch": 0.0, "loss_total": 0.21471327543258667, "step": 1067 }, { "batch_size": 4, "epoch": 0.4268, "step": 1067, "tokens_per_device": 6356 }, { "epoch": 0.4268, "loss_ce": 0.2449774146080017, "loss_lvr": 0.994247555732727, "loss_mode_switch": 0.0, "loss_total": 0.34440216422080994, "step": 1067 }, { "batch_size": 4, "epoch": 0.4268, "step": 1067, "tokens_per_device": 11836 }, { "epoch": 0.4268, "loss_ce": 0.004781719297170639, "loss_lvr": 0.8469452857971191, "loss_mode_switch": 0.0, "loss_total": 0.08947624266147614, "step": 1067 }, { "batch_size": 4, "epoch": 0.4268, "step": 1067, "tokens_per_device": 14644 }, { "epoch": 0.4268, "loss_ce": 0.067025326192379, "loss_lvr": 1.0005007982254028, "loss_mode_switch": 0.0, "loss_total": 0.16707541048526764, "step": 1067 }, { "epoch": 0.4272, "grad_norm": 1.542955756187439, "learning_rate": 6.402729088434942e-06, "loss": 0.2691, "step": 1068 }, { "batch_size": 4, "epoch": 0.4272, "step": 1068, "tokens_per_device": 4164 }, { "epoch": 0.4272, "loss_ce": 0.2617191970348358, "loss_lvr": 0.8584243059158325, "loss_mode_switch": 0.0, "loss_total": 0.34756162762641907, "step": 1068 }, { "batch_size": 1, "epoch": 0.4272, "step": 1068, "tokens_per_device": 4952 }, { "epoch": 0.4272, "loss_ce": 0.038045067340135574, "loss_lvr": 1.3744103908538818, "loss_mode_switch": 0.0, "loss_total": 0.17548610270023346, "step": 1068 }, { "batch_size": 4, "epoch": 0.4272, "step": 1068, "tokens_per_device": 5844 }, { "epoch": 0.4272, "loss_ce": 0.13926056027412415, "loss_lvr": 0.805823028087616, "loss_mode_switch": 0.0, "loss_total": 0.21984286606311798, "step": 1068 }, { "batch_size": 4, "epoch": 0.4272, "step": 1068, "tokens_per_device": 4436 }, { "epoch": 0.4272, "loss_ce": 0.3387533724308014, "loss_lvr": 0.7382237911224365, "loss_mode_switch": 0.0, "loss_total": 0.41257575154304504, "step": 1068 }, { "batch_size": 4, "epoch": 0.4272, "step": 1068, "tokens_per_device": 4680 }, { "epoch": 0.4272, "loss_ce": 0.10999532788991928, "loss_lvr": 0.8934699296951294, "loss_mode_switch": 0.0, "loss_total": 0.19934232532978058, "step": 1068 }, { "batch_size": 4, "epoch": 0.4272, "step": 1068, "tokens_per_device": 3756 }, { "epoch": 0.4272, "loss_ce": 0.21244293451309204, "loss_lvr": 0.8374683856964111, "loss_mode_switch": 0.0, "loss_total": 0.2961897850036621, "step": 1068 }, { "batch_size": 4, "epoch": 0.4272, "step": 1068, "tokens_per_device": 2320 }, { "epoch": 0.4272, "loss_ce": 0.4192695617675781, "loss_lvr": 1.1781342029571533, "loss_mode_switch": 0.0, "loss_total": 0.5370829701423645, "step": 1068 }, { "batch_size": 4, "epoch": 0.4272, "step": 1068, "tokens_per_device": 15580 }, { "epoch": 0.4272, "loss_ce": 0.28606554865837097, "loss_lvr": 0.7209562659263611, "loss_mode_switch": 0.0, "loss_total": 0.35816118121147156, "step": 1068 }, { "epoch": 0.4276, "grad_norm": 1.091521143913269, "learning_rate": 6.396510535145033e-06, "loss": 0.2317, "step": 1069 }, { "batch_size": 1, "epoch": 0.4276, "step": 1069, "tokens_per_device": 5965 }, { "epoch": 0.4276, "loss_ce": 0.0005516010569408536, "loss_lvr": 0.4442993998527527, "loss_mode_switch": 0.0, "loss_total": 0.044981539249420166, "step": 1069 }, { "batch_size": 4, "epoch": 0.4276, "step": 1069, "tokens_per_device": 1640 }, { "epoch": 0.4276, "loss_ce": 0.3757449686527252, "loss_lvr": 1.0474255084991455, "loss_mode_switch": 0.0, "loss_total": 0.48048752546310425, "step": 1069 }, { "batch_size": 4, "epoch": 0.4276, "step": 1069, "tokens_per_device": 1332 }, { "epoch": 0.4276, "loss_ce": 0.40848419070243835, "loss_lvr": 0.9783816337585449, "loss_mode_switch": 0.0, "loss_total": 0.5063223838806152, "step": 1069 }, { "batch_size": 1, "epoch": 0.4276, "step": 1069, "tokens_per_device": 4898 }, { "epoch": 0.4276, "loss_ce": 0.028194956481456757, "loss_lvr": 0.4940294027328491, "loss_mode_switch": 0.0, "loss_total": 0.07759790122509003, "step": 1069 }, { "batch_size": 1, "epoch": 0.4276, "step": 1069, "tokens_per_device": 4896 }, { "epoch": 0.4276, "loss_ce": 0.0010716511169448495, "loss_lvr": 0.24799838662147522, "loss_mode_switch": 0.0, "loss_total": 0.025871489197015762, "step": 1069 }, { "batch_size": 1, "epoch": 0.4276, "step": 1069, "tokens_per_device": 5360 }, { "epoch": 0.4276, "loss_ce": 0.04353976622223854, "loss_lvr": 0.3920695185661316, "loss_mode_switch": 0.0, "loss_total": 0.0827467143535614, "step": 1069 }, { "batch_size": 1, "epoch": 0.4276, "step": 1069, "tokens_per_device": 5173 }, { "epoch": 0.4276, "loss_ce": 0.012466252781450748, "loss_lvr": 0.2653864920139313, "loss_mode_switch": 0.0, "loss_total": 0.03900490328669548, "step": 1069 }, { "batch_size": 1, "epoch": 0.4276, "step": 1069, "tokens_per_device": 5602 }, { "epoch": 0.4276, "loss_ce": 0.0002186770288972184, "loss_lvr": 0.649250864982605, "loss_mode_switch": 0.0, "loss_total": 0.06514376401901245, "step": 1069 }, { "epoch": 0.428, "grad_norm": 1.330411672592163, "learning_rate": 6.390289638055851e-06, "loss": 0.29, "step": 1070 }, { "batch_size": 4, "epoch": 0.428, "step": 1070, "tokens_per_device": 2760 }, { "epoch": 0.428, "loss_ce": 0.38741904497146606, "loss_lvr": 0.8649935722351074, "loss_mode_switch": 0.0, "loss_total": 0.4739184081554413, "step": 1070 }, { "batch_size": 4, "epoch": 0.428, "step": 1070, "tokens_per_device": 2264 }, { "epoch": 0.428, "loss_ce": 0.4283873438835144, "loss_lvr": 0.7693566083908081, "loss_mode_switch": 0.0, "loss_total": 0.5053229928016663, "step": 1070 }, { "batch_size": 4, "epoch": 0.428, "step": 1070, "tokens_per_device": 5000 }, { "epoch": 0.428, "loss_ce": 0.1761292964220047, "loss_lvr": 0.759300947189331, "loss_mode_switch": 0.0, "loss_total": 0.2520594000816345, "step": 1070 }, { "batch_size": 4, "epoch": 0.428, "step": 1070, "tokens_per_device": 4528 }, { "epoch": 0.428, "loss_ce": 0.2448391318321228, "loss_lvr": 1.0097076892852783, "loss_mode_switch": 0.0, "loss_total": 0.3458099067211151, "step": 1070 }, { "batch_size": 4, "epoch": 0.428, "step": 1070, "tokens_per_device": 4808 }, { "epoch": 0.428, "loss_ce": 0.25825560092926025, "loss_lvr": 0.6696749329566956, "loss_mode_switch": 0.0, "loss_total": 0.32522308826446533, "step": 1070 }, { "batch_size": 4, "epoch": 0.428, "step": 1070, "tokens_per_device": 1456 }, { "epoch": 0.428, "loss_ce": 0.0694868266582489, "loss_lvr": 1.393978476524353, "loss_mode_switch": 0.0, "loss_total": 0.20888467133045197, "step": 1070 }, { "batch_size": 4, "epoch": 0.428, "step": 1070, "tokens_per_device": 3848 }, { "epoch": 0.428, "loss_ce": 0.36768898367881775, "loss_lvr": 0.9022998809814453, "loss_mode_switch": 0.0, "loss_total": 0.4579189717769623, "step": 1070 }, { "batch_size": 4, "epoch": 0.428, "step": 1070, "tokens_per_device": 8476 }, { "epoch": 0.428, "loss_ce": 0.1294625997543335, "loss_lvr": 0.8126474618911743, "loss_mode_switch": 0.0, "loss_total": 0.21072734892368317, "step": 1070 }, { "epoch": 0.4284, "grad_norm": 1.3756824731826782, "learning_rate": 6.384066407608087e-06, "loss": 0.3206, "step": 1071 }, { "batch_size": 1, "epoch": 0.4284, "step": 1071, "tokens_per_device": 4907 }, { "epoch": 0.4284, "loss_ce": 0.004143040161579847, "loss_lvr": 0.2618485987186432, "loss_mode_switch": 0.0, "loss_total": 0.030327901244163513, "step": 1071 }, { "batch_size": 4, "epoch": 0.4284, "step": 1071, "tokens_per_device": 4256 }, { "epoch": 0.4284, "loss_ce": 0.6543623805046082, "loss_lvr": 1.886792778968811, "loss_mode_switch": 0.0, "loss_total": 0.8430416584014893, "step": 1071 }, { "batch_size": 1, "epoch": 0.4284, "step": 1071, "tokens_per_device": 4889 }, { "epoch": 0.4284, "loss_ce": 0.5341116189956665, "loss_lvr": 0.7078869938850403, "loss_mode_switch": 0.0, "loss_total": 0.6049003005027771, "step": 1071 }, { "batch_size": 4, "epoch": 0.4284, "step": 1071, "tokens_per_device": 8960 }, { "epoch": 0.4284, "loss_ce": 0.002816846827045083, "loss_lvr": 0.5154180526733398, "loss_mode_switch": 0.0, "loss_total": 0.05435865372419357, "step": 1071 }, { "batch_size": 4, "epoch": 0.4284, "step": 1071, "tokens_per_device": 4400 }, { "epoch": 0.4284, "loss_ce": 0.1834806352853775, "loss_lvr": 0.8409124612808228, "loss_mode_switch": 0.0, "loss_total": 0.2675718665122986, "step": 1071 }, { "batch_size": 4, "epoch": 0.4284, "step": 1071, "tokens_per_device": 3760 }, { "epoch": 0.4284, "loss_ce": 0.2167733758687973, "loss_lvr": 1.1165077686309814, "loss_mode_switch": 0.0, "loss_total": 0.3284241557121277, "step": 1071 }, { "batch_size": 4, "epoch": 0.4284, "step": 1071, "tokens_per_device": 3824 }, { "epoch": 0.4284, "loss_ce": 0.3151766359806061, "loss_lvr": 1.0080852508544922, "loss_mode_switch": 0.0, "loss_total": 0.4159851670265198, "step": 1071 }, { "batch_size": 4, "epoch": 0.4284, "step": 1071, "tokens_per_device": 4032 }, { "epoch": 0.4284, "loss_ce": 0.2317812591791153, "loss_lvr": 0.822088897228241, "loss_mode_switch": 0.0, "loss_total": 0.31399014592170715, "step": 1071 }, { "epoch": 0.4288, "grad_norm": 1.3180829286575317, "learning_rate": 6.377840854246348e-06, "loss": 0.2757, "step": 1072 }, { "batch_size": 4, "epoch": 0.4288, "step": 1072, "tokens_per_device": 4268 }, { "epoch": 0.4288, "loss_ce": 0.2104724794626236, "loss_lvr": 0.9317295551300049, "loss_mode_switch": 0.0, "loss_total": 0.30364543199539185, "step": 1072 }, { "batch_size": 1, "epoch": 0.4288, "step": 1072, "tokens_per_device": 5209 }, { "epoch": 0.4288, "loss_ce": 0.007389542181044817, "loss_lvr": 0.556143045425415, "loss_mode_switch": 0.0, "loss_total": 0.06300384551286697, "step": 1072 }, { "batch_size": 4, "epoch": 0.4288, "step": 1072, "tokens_per_device": 4460 }, { "epoch": 0.4288, "loss_ce": 0.03005032055079937, "loss_lvr": 0.8212373852729797, "loss_mode_switch": 0.0, "loss_total": 0.11217406392097473, "step": 1072 }, { "batch_size": 4, "epoch": 0.4288, "step": 1072, "tokens_per_device": 2968 }, { "epoch": 0.4288, "loss_ce": 0.17467013001441956, "loss_lvr": 0.7892019748687744, "loss_mode_switch": 0.0, "loss_total": 0.25359034538269043, "step": 1072 }, { "batch_size": 1, "epoch": 0.4288, "step": 1072, "tokens_per_device": 4870 }, { "epoch": 0.4288, "loss_ce": 0.11495840549468994, "loss_lvr": 0.2419869601726532, "loss_mode_switch": 0.0, "loss_total": 0.13915710151195526, "step": 1072 }, { "batch_size": 4, "epoch": 0.4288, "step": 1072, "tokens_per_device": 1380 }, { "epoch": 0.4288, "loss_ce": 0.06681246310472488, "loss_lvr": 0.7811415791511536, "loss_mode_switch": 0.0, "loss_total": 0.14492662250995636, "step": 1072 }, { "batch_size": 4, "epoch": 0.4288, "step": 1072, "tokens_per_device": 10852 }, { "epoch": 0.4288, "loss_ce": 0.03766173869371414, "loss_lvr": 0.7053263187408447, "loss_mode_switch": 0.0, "loss_total": 0.10819437354803085, "step": 1072 }, { "batch_size": 1, "epoch": 0.4288, "step": 1072, "tokens_per_device": 5122 }, { "epoch": 0.4288, "loss_ce": 0.0017541060224175453, "loss_lvr": 0.47365739941596985, "loss_mode_switch": 0.0, "loss_total": 0.049119845032691956, "step": 1072 }, { "epoch": 0.4292, "grad_norm": 1.5774047374725342, "learning_rate": 6.371612988419138e-06, "loss": 0.2741, "step": 1073 }, { "batch_size": 1, "epoch": 0.4292, "step": 1073, "tokens_per_device": 5133 }, { "epoch": 0.4292, "loss_ce": 0.11326615512371063, "loss_lvr": 0.2314106971025467, "loss_mode_switch": 0.0, "loss_total": 0.13640722632408142, "step": 1073 }, { "batch_size": 1, "epoch": 0.4292, "step": 1073, "tokens_per_device": 5715 }, { "epoch": 0.4292, "loss_ce": 0.004408194683492184, "loss_lvr": 0.26674723625183105, "loss_mode_switch": 0.0, "loss_total": 0.031082917004823685, "step": 1073 }, { "batch_size": 4, "epoch": 0.4292, "step": 1073, "tokens_per_device": 5624 }, { "epoch": 0.4292, "loss_ce": 0.026665331795811653, "loss_lvr": 0.82294762134552, "loss_mode_switch": 0.0, "loss_total": 0.1089600920677185, "step": 1073 }, { "batch_size": 4, "epoch": 0.4292, "step": 1073, "tokens_per_device": 1332 }, { "epoch": 0.4292, "loss_ce": 0.725957989692688, "loss_lvr": 1.0563600063323975, "loss_mode_switch": 0.0, "loss_total": 0.8315939903259277, "step": 1073 }, { "batch_size": 4, "epoch": 0.4292, "step": 1073, "tokens_per_device": 9760 }, { "epoch": 0.4292, "loss_ce": 0.06954886764287949, "loss_lvr": 0.8770768642425537, "loss_mode_switch": 0.0, "loss_total": 0.15725655853748322, "step": 1073 }, { "batch_size": 1, "epoch": 0.4292, "step": 1073, "tokens_per_device": 4873 }, { "epoch": 0.4292, "loss_ce": 0.0061049023643136024, "loss_lvr": 1.169897198677063, "loss_mode_switch": 0.0, "loss_total": 0.12309462577104568, "step": 1073 }, { "batch_size": 4, "epoch": 0.4292, "step": 1073, "tokens_per_device": 4284 }, { "epoch": 0.4292, "loss_ce": 0.04614541307091713, "loss_lvr": 0.6781105995178223, "loss_mode_switch": 0.0, "loss_total": 0.11395648121833801, "step": 1073 }, { "batch_size": 4, "epoch": 0.4292, "step": 1073, "tokens_per_device": 2664 }, { "epoch": 0.4292, "loss_ce": 0.06996207684278488, "loss_lvr": 1.0183509588241577, "loss_mode_switch": 0.0, "loss_total": 0.17179717123508453, "step": 1073 }, { "epoch": 0.4296, "grad_norm": 1.1809728145599365, "learning_rate": 6.3653828205788445e-06, "loss": 0.2693, "step": 1074 }, { "batch_size": 4, "epoch": 0.4296, "step": 1074, "tokens_per_device": 1984 }, { "epoch": 0.4296, "loss_ce": 0.5411390066146851, "loss_lvr": 0.8103106617927551, "loss_mode_switch": 0.0, "loss_total": 0.622170090675354, "step": 1074 }, { "batch_size": 1, "epoch": 0.4296, "step": 1074, "tokens_per_device": 5132 }, { "epoch": 0.4296, "loss_ce": 0.0011514283251017332, "loss_lvr": 1.2343333959579468, "loss_mode_switch": 0.0, "loss_total": 0.12458477169275284, "step": 1074 }, { "batch_size": 4, "epoch": 0.4296, "step": 1074, "tokens_per_device": 2676 }, { "epoch": 0.4296, "loss_ce": 0.6935642957687378, "loss_lvr": 0.9003170728683472, "loss_mode_switch": 0.0, "loss_total": 0.7835959792137146, "step": 1074 }, { "batch_size": 4, "epoch": 0.4296, "step": 1074, "tokens_per_device": 2544 }, { "epoch": 0.4296, "loss_ce": 0.26979556679725647, "loss_lvr": 1.0197601318359375, "loss_mode_switch": 0.0, "loss_total": 0.37177157402038574, "step": 1074 }, { "batch_size": 4, "epoch": 0.4296, "step": 1074, "tokens_per_device": 2992 }, { "epoch": 0.4296, "loss_ce": 0.5755521655082703, "loss_lvr": 0.9793358445167542, "loss_mode_switch": 0.0, "loss_total": 0.6734857559204102, "step": 1074 }, { "batch_size": 1, "epoch": 0.4296, "step": 1074, "tokens_per_device": 6137 }, { "epoch": 0.4296, "loss_ce": 0.15916883945465088, "loss_lvr": 0.3821125626564026, "loss_mode_switch": 0.0, "loss_total": 0.19738009572029114, "step": 1074 }, { "batch_size": 4, "epoch": 0.4296, "step": 1074, "tokens_per_device": 2692 }, { "epoch": 0.4296, "loss_ce": 0.34871676564216614, "loss_lvr": 0.8683377504348755, "loss_mode_switch": 0.0, "loss_total": 0.4355505406856537, "step": 1074 }, { "batch_size": 4, "epoch": 0.4296, "step": 1074, "tokens_per_device": 1624 }, { "epoch": 0.4296, "loss_ce": 0.5305761694908142, "loss_lvr": 0.7600032687187195, "loss_mode_switch": 0.0, "loss_total": 0.6065765023231506, "step": 1074 }, { "epoch": 0.43, "grad_norm": 1.4146968126296997, "learning_rate": 6.3591503611817155e-06, "loss": 0.3299, "step": 1075 }, { "batch_size": 4, "epoch": 0.43, "step": 1075, "tokens_per_device": 3764 }, { "epoch": 0.43, "loss_ce": 0.3183346390724182, "loss_lvr": 1.0883334875106812, "loss_mode_switch": 0.0, "loss_total": 0.42716798186302185, "step": 1075 }, { "batch_size": 4, "epoch": 0.43, "step": 1075, "tokens_per_device": 1248 }, { "epoch": 0.43, "loss_ce": 0.4964967370033264, "loss_lvr": 1.1681751012802124, "loss_mode_switch": 0.0, "loss_total": 0.6133142709732056, "step": 1075 }, { "batch_size": 4, "epoch": 0.43, "step": 1075, "tokens_per_device": 3012 }, { "epoch": 0.43, "loss_ce": 0.26282835006713867, "loss_lvr": 0.623622715473175, "loss_mode_switch": 0.0, "loss_total": 0.32519063353538513, "step": 1075 }, { "batch_size": 4, "epoch": 0.43, "step": 1075, "tokens_per_device": 1396 }, { "epoch": 0.43, "loss_ce": 0.673263430595398, "loss_lvr": 1.081808090209961, "loss_mode_switch": 0.0, "loss_total": 0.781444251537323, "step": 1075 }, { "batch_size": 4, "epoch": 0.43, "step": 1075, "tokens_per_device": 4196 }, { "epoch": 0.43, "loss_ce": 0.26650163531303406, "loss_lvr": 1.4123432636260986, "loss_mode_switch": 0.0, "loss_total": 0.4077359437942505, "step": 1075 }, { "batch_size": 4, "epoch": 0.43, "step": 1075, "tokens_per_device": 7068 }, { "epoch": 0.43, "loss_ce": 0.20506133139133453, "loss_lvr": 0.6302157640457153, "loss_mode_switch": 0.0, "loss_total": 0.2680829167366028, "step": 1075 }, { "batch_size": 4, "epoch": 0.43, "step": 1075, "tokens_per_device": 5128 }, { "epoch": 0.43, "loss_ce": 0.11038745939731598, "loss_lvr": 0.8304267525672913, "loss_mode_switch": 0.0, "loss_total": 0.1934301257133484, "step": 1075 }, { "batch_size": 1, "epoch": 0.43, "step": 1075, "tokens_per_device": 5145 }, { "epoch": 0.43, "loss_ce": 0.09500300884246826, "loss_lvr": 0.7583460211753845, "loss_mode_switch": 0.0, "loss_total": 0.1708376109600067, "step": 1075 }, { "epoch": 0.4304, "grad_norm": 1.3568373918533325, "learning_rate": 6.352915620687848e-06, "loss": 0.2734, "step": 1076 }, { "batch_size": 1, "epoch": 0.4304, "step": 1076, "tokens_per_device": 5213 }, { "epoch": 0.4304, "loss_ce": 0.0014571218052878976, "loss_lvr": 0.4423457980155945, "loss_mode_switch": 0.0, "loss_total": 0.045691702514886856, "step": 1076 }, { "batch_size": 4, "epoch": 0.4304, "step": 1076, "tokens_per_device": 2772 }, { "epoch": 0.4304, "loss_ce": 0.6313725709915161, "loss_lvr": 0.6131205558776855, "loss_mode_switch": 0.0, "loss_total": 0.6926846504211426, "step": 1076 }, { "batch_size": 4, "epoch": 0.4304, "step": 1076, "tokens_per_device": 2556 }, { "epoch": 0.4304, "loss_ce": 0.32696211338043213, "loss_lvr": 1.1556755304336548, "loss_mode_switch": 0.0, "loss_total": 0.44252967834472656, "step": 1076 }, { "batch_size": 4, "epoch": 0.4304, "step": 1076, "tokens_per_device": 4188 }, { "epoch": 0.4304, "loss_ce": 0.012586037628352642, "loss_lvr": 0.9042195677757263, "loss_mode_switch": 0.0, "loss_total": 0.10300799459218979, "step": 1076 }, { "batch_size": 4, "epoch": 0.4304, "step": 1076, "tokens_per_device": 1476 }, { "epoch": 0.4304, "loss_ce": 0.4887906312942505, "loss_lvr": 1.1321980953216553, "loss_mode_switch": 0.0, "loss_total": 0.6020104289054871, "step": 1076 }, { "batch_size": 4, "epoch": 0.4304, "step": 1076, "tokens_per_device": 3820 }, { "epoch": 0.4304, "loss_ce": 0.02425679750740528, "loss_lvr": 0.9784276485443115, "loss_mode_switch": 0.0, "loss_total": 0.12209956347942352, "step": 1076 }, { "batch_size": 4, "epoch": 0.4304, "step": 1076, "tokens_per_device": 4588 }, { "epoch": 0.4304, "loss_ce": 0.16421173512935638, "loss_lvr": 0.86979079246521, "loss_mode_switch": 0.0, "loss_total": 0.25119081139564514, "step": 1076 }, { "batch_size": 4, "epoch": 0.4304, "step": 1076, "tokens_per_device": 9780 }, { "epoch": 0.4304, "loss_ce": 0.13880163431167603, "loss_lvr": 1.1438941955566406, "loss_mode_switch": 0.0, "loss_total": 0.2531910538673401, "step": 1076 }, { "epoch": 0.4308, "grad_norm": 1.1322487592697144, "learning_rate": 6.346678609561166e-06, "loss": 0.2771, "step": 1077 }, { "batch_size": 4, "epoch": 0.4308, "step": 1077, "tokens_per_device": 11940 }, { "epoch": 0.4308, "loss_ce": 0.20472103357315063, "loss_lvr": 0.4689774513244629, "loss_mode_switch": 0.0, "loss_total": 0.25161877274513245, "step": 1077 }, { "batch_size": 4, "epoch": 0.4308, "step": 1077, "tokens_per_device": 3776 }, { "epoch": 0.4308, "loss_ce": 0.09389188140630722, "loss_lvr": 1.1530485153198242, "loss_mode_switch": 0.0, "loss_total": 0.20919673144817352, "step": 1077 }, { "batch_size": 4, "epoch": 0.4308, "step": 1077, "tokens_per_device": 4232 }, { "epoch": 0.4308, "loss_ce": 0.5885640978813171, "loss_lvr": 1.1184805631637573, "loss_mode_switch": 0.0, "loss_total": 0.7004121541976929, "step": 1077 }, { "batch_size": 4, "epoch": 0.4308, "step": 1077, "tokens_per_device": 3548 }, { "epoch": 0.4308, "loss_ce": 0.3950807750225067, "loss_lvr": 0.8499013781547546, "loss_mode_switch": 0.0, "loss_total": 0.48007091879844666, "step": 1077 }, { "batch_size": 4, "epoch": 0.4308, "step": 1077, "tokens_per_device": 2732 }, { "epoch": 0.4308, "loss_ce": 0.19818180799484253, "loss_lvr": 0.7307841181755066, "loss_mode_switch": 0.0, "loss_total": 0.27126023173332214, "step": 1077 }, { "batch_size": 4, "epoch": 0.4308, "step": 1077, "tokens_per_device": 5740 }, { "epoch": 0.4308, "loss_ce": 0.2952139675617218, "loss_lvr": 0.8961799740791321, "loss_mode_switch": 0.0, "loss_total": 0.384831964969635, "step": 1077 }, { "batch_size": 1, "epoch": 0.4308, "step": 1077, "tokens_per_device": 5039 }, { "epoch": 0.4308, "loss_ce": 0.01680353470146656, "loss_lvr": 0.461374431848526, "loss_mode_switch": 0.0, "loss_total": 0.06294097751379013, "step": 1077 }, { "batch_size": 4, "epoch": 0.4308, "step": 1077, "tokens_per_device": 2620 }, { "epoch": 0.4308, "loss_ce": 0.11563390493392944, "loss_lvr": 0.9243476986885071, "loss_mode_switch": 0.0, "loss_total": 0.20806866884231567, "step": 1077 }, { "epoch": 0.4312, "grad_norm": 1.4071623086929321, "learning_rate": 6.340439338269402e-06, "loss": 0.3147, "step": 1078 }, { "batch_size": 4, "epoch": 0.4312, "step": 1078, "tokens_per_device": 1388 }, { "epoch": 0.4312, "loss_ce": 0.3023609220981598, "loss_lvr": 1.8811863660812378, "loss_mode_switch": 0.0, "loss_total": 0.49047955870628357, "step": 1078 }, { "batch_size": 1, "epoch": 0.4312, "step": 1078, "tokens_per_device": 4883 }, { "epoch": 0.4312, "loss_ce": 0.5449322462081909, "loss_lvr": 0.3654176592826843, "loss_mode_switch": 0.0, "loss_total": 0.5814740061759949, "step": 1078 }, { "batch_size": 1, "epoch": 0.4312, "step": 1078, "tokens_per_device": 4885 }, { "epoch": 0.4312, "loss_ce": 0.0007788179791532457, "loss_lvr": 0.2315901815891266, "loss_mode_switch": 0.0, "loss_total": 0.023937836289405823, "step": 1078 }, { "batch_size": 4, "epoch": 0.4312, "step": 1078, "tokens_per_device": 3956 }, { "epoch": 0.4312, "loss_ce": 0.7206466794013977, "loss_lvr": 1.0130479335784912, "loss_mode_switch": 0.0, "loss_total": 0.8219514489173889, "step": 1078 }, { "batch_size": 1, "epoch": 0.4312, "step": 1078, "tokens_per_device": 5339 }, { "epoch": 0.4312, "loss_ce": 0.009950647130608559, "loss_lvr": 0.37338685989379883, "loss_mode_switch": 0.0, "loss_total": 0.04728933423757553, "step": 1078 }, { "batch_size": 4, "epoch": 0.4312, "step": 1078, "tokens_per_device": 4928 }, { "epoch": 0.4312, "loss_ce": 0.09274810552597046, "loss_lvr": 0.5923946499824524, "loss_mode_switch": 0.0, "loss_total": 0.15198756754398346, "step": 1078 }, { "batch_size": 1, "epoch": 0.4312, "step": 1078, "tokens_per_device": 5194 }, { "epoch": 0.4312, "loss_ce": 0.39450961351394653, "loss_lvr": 0.3546387851238251, "loss_mode_switch": 0.0, "loss_total": 0.4299734830856323, "step": 1078 }, { "batch_size": 4, "epoch": 0.4312, "step": 1078, "tokens_per_device": 4932 }, { "epoch": 0.4312, "loss_ce": 0.16648538410663605, "loss_lvr": 1.011329174041748, "loss_mode_switch": 0.0, "loss_total": 0.2676182985305786, "step": 1078 }, { "epoch": 0.4316, "grad_norm": 1.295484185218811, "learning_rate": 6.3341978172840875e-06, "loss": 0.3025, "step": 1079 }, { "batch_size": 4, "epoch": 0.4316, "step": 1079, "tokens_per_device": 1592 }, { "epoch": 0.4316, "loss_ce": 0.4103498160839081, "loss_lvr": 1.0301792621612549, "loss_mode_switch": 0.0, "loss_total": 0.513367772102356, "step": 1079 }, { "batch_size": 4, "epoch": 0.4316, "step": 1079, "tokens_per_device": 4292 }, { "epoch": 0.4316, "loss_ce": 0.43135547637939453, "loss_lvr": 0.7488874197006226, "loss_mode_switch": 0.0, "loss_total": 0.5062442421913147, "step": 1079 }, { "batch_size": 1, "epoch": 0.4316, "step": 1079, "tokens_per_device": 4866 }, { "epoch": 0.4316, "loss_ce": 0.0033863538410514593, "loss_lvr": 0.4057261645793915, "loss_mode_switch": 0.0, "loss_total": 0.04395896941423416, "step": 1079 }, { "batch_size": 1, "epoch": 0.4316, "step": 1079, "tokens_per_device": 5123 }, { "epoch": 0.4316, "loss_ce": 0.008236570283770561, "loss_lvr": 0.1851266622543335, "loss_mode_switch": 0.0, "loss_total": 0.02674923650920391, "step": 1079 }, { "batch_size": 4, "epoch": 0.4316, "step": 1079, "tokens_per_device": 4308 }, { "epoch": 0.4316, "loss_ce": 0.0644623264670372, "loss_lvr": 0.8169058561325073, "loss_mode_switch": 0.0, "loss_total": 0.14615291357040405, "step": 1079 }, { "batch_size": 1, "epoch": 0.4316, "step": 1079, "tokens_per_device": 7080 }, { "epoch": 0.4316, "loss_ce": 0.13806934654712677, "loss_lvr": 0.38748615980148315, "loss_mode_switch": 0.0, "loss_total": 0.17681796848773956, "step": 1079 }, { "batch_size": 4, "epoch": 0.4316, "step": 1079, "tokens_per_device": 4324 }, { "epoch": 0.4316, "loss_ce": 0.23068499565124512, "loss_lvr": 0.8684437274932861, "loss_mode_switch": 0.0, "loss_total": 0.3175293803215027, "step": 1079 }, { "batch_size": 4, "epoch": 0.4316, "step": 1079, "tokens_per_device": 3160 }, { "epoch": 0.4316, "loss_ce": 0.34268584847450256, "loss_lvr": 0.8865353465080261, "loss_mode_switch": 0.0, "loss_total": 0.4313393831253052, "step": 1079 }, { "epoch": 0.432, "grad_norm": 1.5705538988113403, "learning_rate": 6.3279540570805265e-06, "loss": 0.2959, "step": 1080 }, { "batch_size": 1, "epoch": 0.432, "step": 1080, "tokens_per_device": 5108 }, { "epoch": 0.432, "loss_ce": 0.0009395665838383138, "loss_lvr": 0.5259463787078857, "loss_mode_switch": 0.0, "loss_total": 0.05353420600295067, "step": 1080 }, { "batch_size": 4, "epoch": 0.432, "step": 1080, "tokens_per_device": 4224 }, { "epoch": 0.432, "loss_ce": 0.576888382434845, "loss_lvr": 1.0388145446777344, "loss_mode_switch": 0.0, "loss_total": 0.6807698607444763, "step": 1080 }, { "batch_size": 4, "epoch": 0.432, "step": 1080, "tokens_per_device": 13572 }, { "epoch": 0.432, "loss_ce": 0.4461876451969147, "loss_lvr": 0.9768872261047363, "loss_mode_switch": 0.0, "loss_total": 0.5438763499259949, "step": 1080 }, { "batch_size": 4, "epoch": 0.432, "step": 1080, "tokens_per_device": 1472 }, { "epoch": 0.432, "loss_ce": 0.13109487295150757, "loss_lvr": 0.9128458499908447, "loss_mode_switch": 0.0, "loss_total": 0.22237946093082428, "step": 1080 }, { "batch_size": 1, "epoch": 0.432, "step": 1080, "tokens_per_device": 4906 }, { "epoch": 0.432, "loss_ce": 0.0032323156483471394, "loss_lvr": 0.22440029680728912, "loss_mode_switch": 0.0, "loss_total": 0.025672344490885735, "step": 1080 }, { "batch_size": 4, "epoch": 0.432, "step": 1080, "tokens_per_device": 4232 }, { "epoch": 0.432, "loss_ce": 0.232528954744339, "loss_lvr": 0.9914378523826599, "loss_mode_switch": 0.0, "loss_total": 0.331672728061676, "step": 1080 }, { "batch_size": 4, "epoch": 0.432, "step": 1080, "tokens_per_device": 4200 }, { "epoch": 0.432, "loss_ce": 0.11117781698703766, "loss_lvr": 0.8498409986495972, "loss_mode_switch": 0.0, "loss_total": 0.1961619257926941, "step": 1080 }, { "batch_size": 1, "epoch": 0.432, "step": 1080, "tokens_per_device": 5115 }, { "epoch": 0.432, "loss_ce": 0.097932368516922, "loss_lvr": 0.3874736726284027, "loss_mode_switch": 0.0, "loss_total": 0.1366797387599945, "step": 1080 }, { "epoch": 0.4324, "grad_norm": 1.4046058654785156, "learning_rate": 6.321708068137778e-06, "loss": 0.2883, "step": 1081 }, { "batch_size": 1, "epoch": 0.4324, "step": 1081, "tokens_per_device": 4904 }, { "epoch": 0.4324, "loss_ce": 0.014794735237956047, "loss_lvr": 0.36903953552246094, "loss_mode_switch": 0.0, "loss_total": 0.05169869214296341, "step": 1081 }, { "batch_size": 4, "epoch": 0.4324, "step": 1081, "tokens_per_device": 3020 }, { "epoch": 0.4324, "loss_ce": 0.3258528709411621, "loss_lvr": 0.7111291289329529, "loss_mode_switch": 0.0, "loss_total": 0.39696580171585083, "step": 1081 }, { "batch_size": 4, "epoch": 0.4324, "step": 1081, "tokens_per_device": 1560 }, { "epoch": 0.4324, "loss_ce": 0.27516359090805054, "loss_lvr": 1.0556559562683105, "loss_mode_switch": 0.0, "loss_total": 0.38072919845581055, "step": 1081 }, { "batch_size": 4, "epoch": 0.4324, "step": 1081, "tokens_per_device": 1436 }, { "epoch": 0.4324, "loss_ce": 0.44436702132225037, "loss_lvr": 0.9850261211395264, "loss_mode_switch": 0.0, "loss_total": 0.5428696274757385, "step": 1081 }, { "batch_size": 4, "epoch": 0.4324, "step": 1081, "tokens_per_device": 4324 }, { "epoch": 0.4324, "loss_ce": 0.14024385809898376, "loss_lvr": 0.7925476431846619, "loss_mode_switch": 0.0, "loss_total": 0.2194986343383789, "step": 1081 }, { "batch_size": 1, "epoch": 0.4324, "step": 1081, "tokens_per_device": 4700 }, { "epoch": 0.4324, "loss_ce": 0.01796828769147396, "loss_lvr": 0.6444528102874756, "loss_mode_switch": 0.0, "loss_total": 0.08241356909275055, "step": 1081 }, { "batch_size": 4, "epoch": 0.4324, "step": 1081, "tokens_per_device": 6688 }, { "epoch": 0.4324, "loss_ce": 0.5843079090118408, "loss_lvr": 0.8186421394348145, "loss_mode_switch": 0.0, "loss_total": 0.6661721467971802, "step": 1081 }, { "batch_size": 4, "epoch": 0.4324, "step": 1081, "tokens_per_device": 2596 }, { "epoch": 0.4324, "loss_ce": 0.16519275307655334, "loss_lvr": 1.0463298559188843, "loss_mode_switch": 0.0, "loss_total": 0.2698257565498352, "step": 1081 }, { "epoch": 0.4328, "grad_norm": 1.2299107313156128, "learning_rate": 6.315459860938649e-06, "loss": 0.3011, "step": 1082 }, { "batch_size": 4, "epoch": 0.4328, "step": 1082, "tokens_per_device": 5036 }, { "epoch": 0.4328, "loss_ce": 0.26758307218551636, "loss_lvr": 0.9133172035217285, "loss_mode_switch": 0.0, "loss_total": 0.3589147925376892, "step": 1082 }, { "batch_size": 1, "epoch": 0.4328, "step": 1082, "tokens_per_device": 4867 }, { "epoch": 0.4328, "loss_ce": 0.020988700911402702, "loss_lvr": 0.8535359501838684, "loss_mode_switch": 0.0, "loss_total": 0.10634230077266693, "step": 1082 }, { "batch_size": 1, "epoch": 0.4328, "step": 1082, "tokens_per_device": 4958 }, { "epoch": 0.4328, "loss_ce": 0.10920456796884537, "loss_lvr": 0.4887990653514862, "loss_mode_switch": 0.0, "loss_total": 0.15808448195457458, "step": 1082 }, { "batch_size": 1, "epoch": 0.4328, "step": 1082, "tokens_per_device": 4868 }, { "epoch": 0.4328, "loss_ce": 0.0028947372920811176, "loss_lvr": 0.3283900022506714, "loss_mode_switch": 0.0, "loss_total": 0.03573373705148697, "step": 1082 }, { "batch_size": 1, "epoch": 0.4328, "step": 1082, "tokens_per_device": 6951 }, { "epoch": 0.4328, "loss_ce": 0.12656304240226746, "loss_lvr": 0.2748744487762451, "loss_mode_switch": 0.0, "loss_total": 0.15405048429965973, "step": 1082 }, { "batch_size": 4, "epoch": 0.4328, "step": 1082, "tokens_per_device": 1460 }, { "epoch": 0.4328, "loss_ce": 0.235944002866745, "loss_lvr": 1.0862183570861816, "loss_mode_switch": 0.0, "loss_total": 0.34456583857536316, "step": 1082 }, { "batch_size": 4, "epoch": 0.4328, "step": 1082, "tokens_per_device": 4848 }, { "epoch": 0.4328, "loss_ce": 0.41927817463874817, "loss_lvr": 0.804807722568512, "loss_mode_switch": 0.0, "loss_total": 0.4997589588165283, "step": 1082 }, { "batch_size": 4, "epoch": 0.4328, "step": 1082, "tokens_per_device": 5676 }, { "epoch": 0.4328, "loss_ce": 0.20931629836559296, "loss_lvr": 1.0531529188156128, "loss_mode_switch": 0.0, "loss_total": 0.3146315813064575, "step": 1082 }, { "epoch": 0.4332, "grad_norm": 1.1268317699432373, "learning_rate": 6.30920944596966e-06, "loss": 0.2774, "step": 1083 }, { "batch_size": 4, "epoch": 0.4332, "step": 1083, "tokens_per_device": 3660 }, { "epoch": 0.4332, "loss_ce": 0.4567398726940155, "loss_lvr": 0.5484551191329956, "loss_mode_switch": 0.0, "loss_total": 0.5115853548049927, "step": 1083 }, { "batch_size": 4, "epoch": 0.4332, "step": 1083, "tokens_per_device": 3808 }, { "epoch": 0.4332, "loss_ce": 0.12860698997974396, "loss_lvr": 0.7603702545166016, "loss_mode_switch": 0.0, "loss_total": 0.20464402437210083, "step": 1083 }, { "batch_size": 4, "epoch": 0.4332, "step": 1083, "tokens_per_device": 9160 }, { "epoch": 0.4332, "loss_ce": 0.0009619055781513453, "loss_lvr": 0.6669560074806213, "loss_mode_switch": 0.0, "loss_total": 0.06765750795602798, "step": 1083 }, { "batch_size": 4, "epoch": 0.4332, "step": 1083, "tokens_per_device": 4940 }, { "epoch": 0.4332, "loss_ce": 0.22960864007472992, "loss_lvr": 0.7904427647590637, "loss_mode_switch": 0.0, "loss_total": 0.3086529076099396, "step": 1083 }, { "batch_size": 4, "epoch": 0.4332, "step": 1083, "tokens_per_device": 6020 }, { "epoch": 0.4332, "loss_ce": 0.2856050133705139, "loss_lvr": 0.76559978723526, "loss_mode_switch": 0.0, "loss_total": 0.36216500401496887, "step": 1083 }, { "batch_size": 4, "epoch": 0.4332, "step": 1083, "tokens_per_device": 4592 }, { "epoch": 0.4332, "loss_ce": 0.15640737116336823, "loss_lvr": 0.6976040601730347, "loss_mode_switch": 0.0, "loss_total": 0.22616776823997498, "step": 1083 }, { "batch_size": 1, "epoch": 0.4332, "step": 1083, "tokens_per_device": 5101 }, { "epoch": 0.4332, "loss_ce": 0.023079009726643562, "loss_lvr": 0.7337122559547424, "loss_mode_switch": 0.0, "loss_total": 0.09645023196935654, "step": 1083 }, { "batch_size": 1, "epoch": 0.4332, "step": 1083, "tokens_per_device": 4819 }, { "epoch": 0.4332, "loss_ce": 0.2272396981716156, "loss_lvr": 0.2302578091621399, "loss_mode_switch": 0.0, "loss_total": 0.2502654790878296, "step": 1083 }, { "epoch": 0.4336, "grad_norm": 1.446488380432129, "learning_rate": 6.302956833721048e-06, "loss": 0.2817, "step": 1084 }, { "batch_size": 4, "epoch": 0.4336, "step": 1084, "tokens_per_device": 1716 }, { "epoch": 0.4336, "loss_ce": 0.1520819067955017, "loss_lvr": 1.055594801902771, "loss_mode_switch": 0.0, "loss_total": 0.25764137506484985, "step": 1084 }, { "batch_size": 4, "epoch": 0.4336, "step": 1084, "tokens_per_device": 9700 }, { "epoch": 0.4336, "loss_ce": 0.38724005222320557, "loss_lvr": 0.6392637491226196, "loss_mode_switch": 0.0, "loss_total": 0.45116642117500305, "step": 1084 }, { "batch_size": 4, "epoch": 0.4336, "step": 1084, "tokens_per_device": 7312 }, { "epoch": 0.4336, "loss_ce": 0.09536395967006683, "loss_lvr": 0.5880495309829712, "loss_mode_switch": 0.0, "loss_total": 0.15416891872882843, "step": 1084 }, { "batch_size": 1, "epoch": 0.4336, "step": 1084, "tokens_per_device": 4981 }, { "epoch": 0.4336, "loss_ce": 0.038113754242658615, "loss_lvr": 0.40676644444465637, "loss_mode_switch": 0.0, "loss_total": 0.07879039645195007, "step": 1084 }, { "batch_size": 4, "epoch": 0.4336, "step": 1084, "tokens_per_device": 3872 }, { "epoch": 0.4336, "loss_ce": 0.3184379041194916, "loss_lvr": 0.6927130818367004, "loss_mode_switch": 0.0, "loss_total": 0.38770920038223267, "step": 1084 }, { "batch_size": 1, "epoch": 0.4336, "step": 1084, "tokens_per_device": 5196 }, { "epoch": 0.4336, "loss_ce": 0.0013998394133523107, "loss_lvr": 0.6140267848968506, "loss_mode_switch": 0.0, "loss_total": 0.0628025159239769, "step": 1084 }, { "batch_size": 1, "epoch": 0.4336, "step": 1084, "tokens_per_device": 4846 }, { "epoch": 0.4336, "loss_ce": 0.039661698043346405, "loss_lvr": 0.2758913040161133, "loss_mode_switch": 0.0, "loss_total": 0.06725083291530609, "step": 1084 }, { "batch_size": 1, "epoch": 0.4336, "step": 1084, "tokens_per_device": 4951 }, { "epoch": 0.4336, "loss_ce": 0.14202123880386353, "loss_lvr": 0.6274757385253906, "loss_mode_switch": 0.0, "loss_total": 0.2047688066959381, "step": 1084 }, { "epoch": 0.434, "grad_norm": 1.2807481288909912, "learning_rate": 6.296702034686726e-06, "loss": 0.2776, "step": 1085 }, { "batch_size": 4, "epoch": 0.434, "step": 1085, "tokens_per_device": 2392 }, { "epoch": 0.434, "loss_ce": 0.07197778671979904, "loss_lvr": 0.7601839303970337, "loss_mode_switch": 0.0, "loss_total": 0.147996187210083, "step": 1085 }, { "batch_size": 4, "epoch": 0.434, "step": 1085, "tokens_per_device": 3892 }, { "epoch": 0.434, "loss_ce": 0.31831374764442444, "loss_lvr": 0.9892911314964294, "loss_mode_switch": 0.0, "loss_total": 0.4172428548336029, "step": 1085 }, { "batch_size": 4, "epoch": 0.434, "step": 1085, "tokens_per_device": 4424 }, { "epoch": 0.434, "loss_ce": 0.1602126657962799, "loss_lvr": 0.9419091939926147, "loss_mode_switch": 0.0, "loss_total": 0.25440359115600586, "step": 1085 }, { "batch_size": 4, "epoch": 0.434, "step": 1085, "tokens_per_device": 3796 }, { "epoch": 0.434, "loss_ce": 0.16349340975284576, "loss_lvr": 0.8677821755409241, "loss_mode_switch": 0.0, "loss_total": 0.25027161836624146, "step": 1085 }, { "batch_size": 4, "epoch": 0.434, "step": 1085, "tokens_per_device": 6092 }, { "epoch": 0.434, "loss_ce": 0.1578015238046646, "loss_lvr": 0.7784596681594849, "loss_mode_switch": 0.0, "loss_total": 0.23564749956130981, "step": 1085 }, { "batch_size": 4, "epoch": 0.434, "step": 1085, "tokens_per_device": 4256 }, { "epoch": 0.434, "loss_ce": 0.13154225051403046, "loss_lvr": 0.7212246060371399, "loss_mode_switch": 0.0, "loss_total": 0.20366472005844116, "step": 1085 }, { "batch_size": 4, "epoch": 0.434, "step": 1085, "tokens_per_device": 11072 }, { "epoch": 0.434, "loss_ce": 0.2498406171798706, "loss_lvr": 0.7727640271186829, "loss_mode_switch": 0.0, "loss_total": 0.32711702585220337, "step": 1085 }, { "batch_size": 4, "epoch": 0.434, "step": 1085, "tokens_per_device": 3532 }, { "epoch": 0.434, "loss_ce": 0.006653361488133669, "loss_lvr": 1.1979061365127563, "loss_mode_switch": 0.0, "loss_total": 0.1264439821243286, "step": 1085 }, { "epoch": 0.4344, "grad_norm": 1.2707823514938354, "learning_rate": 6.290445059364286e-06, "loss": 0.2748, "step": 1086 }, { "batch_size": 1, "epoch": 0.4344, "step": 1086, "tokens_per_device": 4864 }, { "epoch": 0.4344, "loss_ce": 0.1198151633143425, "loss_lvr": 0.2132815271615982, "loss_mode_switch": 0.0, "loss_total": 0.1411433219909668, "step": 1086 }, { "batch_size": 1, "epoch": 0.4344, "step": 1086, "tokens_per_device": 6288 }, { "epoch": 0.4344, "loss_ce": 0.023390082642436028, "loss_lvr": 0.3805612027645111, "loss_mode_switch": 0.0, "loss_total": 0.06144620478153229, "step": 1086 }, { "batch_size": 4, "epoch": 0.4344, "step": 1086, "tokens_per_device": 1668 }, { "epoch": 0.4344, "loss_ce": 0.487755686044693, "loss_lvr": 0.9464501142501831, "loss_mode_switch": 0.0, "loss_total": 0.5824006795883179, "step": 1086 }, { "batch_size": 1, "epoch": 0.4344, "step": 1086, "tokens_per_device": 4643 }, { "epoch": 0.4344, "loss_ce": 0.062292058020830154, "loss_lvr": 0.27268657088279724, "loss_mode_switch": 0.0, "loss_total": 0.08956071734428406, "step": 1086 }, { "batch_size": 4, "epoch": 0.4344, "step": 1086, "tokens_per_device": 1536 }, { "epoch": 0.4344, "loss_ce": 0.747244656085968, "loss_lvr": 1.113349437713623, "loss_mode_switch": 0.0, "loss_total": 0.8585795760154724, "step": 1086 }, { "batch_size": 4, "epoch": 0.4344, "step": 1086, "tokens_per_device": 12280 }, { "epoch": 0.4344, "loss_ce": 0.1163860484957695, "loss_lvr": 0.6839224696159363, "loss_mode_switch": 0.0, "loss_total": 0.18477830290794373, "step": 1086 }, { "batch_size": 4, "epoch": 0.4344, "step": 1086, "tokens_per_device": 3912 }, { "epoch": 0.4344, "loss_ce": 0.2625647187232971, "loss_lvr": 0.7255249619483948, "loss_mode_switch": 0.0, "loss_total": 0.3351172208786011, "step": 1086 }, { "batch_size": 1, "epoch": 0.4344, "step": 1086, "tokens_per_device": 4870 }, { "epoch": 0.4344, "loss_ce": 0.0013174189953133464, "loss_lvr": 0.2484370917081833, "loss_mode_switch": 0.0, "loss_total": 0.026161128655076027, "step": 1086 }, { "epoch": 0.4348, "grad_norm": 1.2663021087646484, "learning_rate": 6.284185918254968e-06, "loss": 0.2829, "step": 1087 }, { "batch_size": 4, "epoch": 0.4348, "step": 1087, "tokens_per_device": 12204 }, { "epoch": 0.4348, "loss_ce": 0.019474003463983536, "loss_lvr": 1.0516693592071533, "loss_mode_switch": 0.0, "loss_total": 0.12464094161987305, "step": 1087 }, { "batch_size": 4, "epoch": 0.4348, "step": 1087, "tokens_per_device": 1884 }, { "epoch": 0.4348, "loss_ce": 0.3628709316253662, "loss_lvr": 1.0078667402267456, "loss_mode_switch": 0.0, "loss_total": 0.4636576175689697, "step": 1087 }, { "batch_size": 4, "epoch": 0.4348, "step": 1087, "tokens_per_device": 4296 }, { "epoch": 0.4348, "loss_ce": 0.5740459561347961, "loss_lvr": 0.8414478898048401, "loss_mode_switch": 0.0, "loss_total": 0.6581907272338867, "step": 1087 }, { "batch_size": 4, "epoch": 0.4348, "step": 1087, "tokens_per_device": 4260 }, { "epoch": 0.4348, "loss_ce": 0.1628100723028183, "loss_lvr": 0.9420852065086365, "loss_mode_switch": 0.0, "loss_total": 0.2570185959339142, "step": 1087 }, { "batch_size": 4, "epoch": 0.4348, "step": 1087, "tokens_per_device": 1316 }, { "epoch": 0.4348, "loss_ce": 0.266062468290329, "loss_lvr": 0.8882483839988708, "loss_mode_switch": 0.0, "loss_total": 0.35488730669021606, "step": 1087 }, { "batch_size": 1, "epoch": 0.4348, "step": 1087, "tokens_per_device": 4906 }, { "epoch": 0.4348, "loss_ce": 0.041862159967422485, "loss_lvr": 0.45017117261886597, "loss_mode_switch": 0.0, "loss_total": 0.08687928318977356, "step": 1087 }, { "batch_size": 4, "epoch": 0.4348, "step": 1087, "tokens_per_device": 4688 }, { "epoch": 0.4348, "loss_ce": 0.2688852846622467, "loss_lvr": 1.1254103183746338, "loss_mode_switch": 0.0, "loss_total": 0.3814263343811035, "step": 1087 }, { "batch_size": 1, "epoch": 0.4348, "step": 1087, "tokens_per_device": 5124 }, { "epoch": 0.4348, "loss_ce": 0.00537874037399888, "loss_lvr": 0.49590450525283813, "loss_mode_switch": 0.0, "loss_total": 0.054969191551208496, "step": 1087 }, { "epoch": 0.4352, "grad_norm": 1.3509753942489624, "learning_rate": 6.277924621863649e-06, "loss": 0.3151, "step": 1088 }, { "batch_size": 4, "epoch": 0.4352, "step": 1088, "tokens_per_device": 1500 }, { "epoch": 0.4352, "loss_ce": 0.7127577662467957, "loss_lvr": 1.0423791408538818, "loss_mode_switch": 0.0, "loss_total": 0.8169956803321838, "step": 1088 }, { "batch_size": 1, "epoch": 0.4352, "step": 1088, "tokens_per_device": 5059 }, { "epoch": 0.4352, "loss_ce": 0.01613139547407627, "loss_lvr": 0.3085361421108246, "loss_mode_switch": 0.0, "loss_total": 0.04698500782251358, "step": 1088 }, { "batch_size": 1, "epoch": 0.4352, "step": 1088, "tokens_per_device": 5109 }, { "epoch": 0.4352, "loss_ce": 0.06729167699813843, "loss_lvr": 0.5168330073356628, "loss_mode_switch": 0.0, "loss_total": 0.11897498369216919, "step": 1088 }, { "batch_size": 4, "epoch": 0.4352, "step": 1088, "tokens_per_device": 5828 }, { "epoch": 0.4352, "loss_ce": 0.05173567309975624, "loss_lvr": 0.6885213255882263, "loss_mode_switch": 0.0, "loss_total": 0.12058781087398529, "step": 1088 }, { "batch_size": 4, "epoch": 0.4352, "step": 1088, "tokens_per_device": 4376 }, { "epoch": 0.4352, "loss_ce": 0.05496100336313248, "loss_lvr": 0.7616450190544128, "loss_mode_switch": 0.0, "loss_total": 0.13112550973892212, "step": 1088 }, { "batch_size": 4, "epoch": 0.4352, "step": 1088, "tokens_per_device": 3984 }, { "epoch": 0.4352, "loss_ce": 0.5614941120147705, "loss_lvr": 1.1342352628707886, "loss_mode_switch": 0.0, "loss_total": 0.6749176383018494, "step": 1088 }, { "batch_size": 4, "epoch": 0.4352, "step": 1088, "tokens_per_device": 5808 }, { "epoch": 0.4352, "loss_ce": 0.11136709153652191, "loss_lvr": 0.9554992318153381, "loss_mode_switch": 0.0, "loss_total": 0.20691701769828796, "step": 1088 }, { "batch_size": 4, "epoch": 0.4352, "step": 1088, "tokens_per_device": 2660 }, { "epoch": 0.4352, "loss_ce": 0.5370756983757019, "loss_lvr": 2.0682458877563477, "loss_mode_switch": 0.0, "loss_total": 0.7439002990722656, "step": 1088 }, { "epoch": 0.4356, "grad_norm": 1.5933266878128052, "learning_rate": 6.271661180698824e-06, "loss": 0.3558, "step": 1089 }, { "batch_size": 4, "epoch": 0.4356, "step": 1089, "tokens_per_device": 5708 }, { "epoch": 0.4356, "loss_ce": 0.019240710884332657, "loss_lvr": 0.8886001110076904, "loss_mode_switch": 0.0, "loss_total": 0.10810072720050812, "step": 1089 }, { "batch_size": 4, "epoch": 0.4356, "step": 1089, "tokens_per_device": 5092 }, { "epoch": 0.4356, "loss_ce": 0.346046507358551, "loss_lvr": 1.0441722869873047, "loss_mode_switch": 0.0, "loss_total": 0.45046374201774597, "step": 1089 }, { "batch_size": 1, "epoch": 0.4356, "step": 1089, "tokens_per_device": 6825 }, { "epoch": 0.4356, "loss_ce": 0.0007268527406267822, "loss_lvr": 0.4559755027294159, "loss_mode_switch": 0.0, "loss_total": 0.04632440209388733, "step": 1089 }, { "batch_size": 4, "epoch": 0.4356, "step": 1089, "tokens_per_device": 1860 }, { "epoch": 0.4356, "loss_ce": 0.1857772022485733, "loss_lvr": 0.84844571352005, "loss_mode_switch": 0.0, "loss_total": 0.27062177658081055, "step": 1089 }, { "batch_size": 1, "epoch": 0.4356, "step": 1089, "tokens_per_device": 5027 }, { "epoch": 0.4356, "loss_ce": 0.15660390257835388, "loss_lvr": 0.2498881071805954, "loss_mode_switch": 0.0, "loss_total": 0.18159271776676178, "step": 1089 }, { "batch_size": 4, "epoch": 0.4356, "step": 1089, "tokens_per_device": 5792 }, { "epoch": 0.4356, "loss_ce": 0.0022863843478262424, "loss_lvr": 1.039527416229248, "loss_mode_switch": 0.0, "loss_total": 0.10623912513256073, "step": 1089 }, { "batch_size": 1, "epoch": 0.4356, "step": 1089, "tokens_per_device": 5173 }, { "epoch": 0.4356, "loss_ce": 0.0009644509991630912, "loss_lvr": 0.4784546196460724, "loss_mode_switch": 0.0, "loss_total": 0.04880991578102112, "step": 1089 }, { "batch_size": 1, "epoch": 0.4356, "step": 1089, "tokens_per_device": 4826 }, { "epoch": 0.4356, "loss_ce": 0.03123847208917141, "loss_lvr": 0.24668943881988525, "loss_mode_switch": 0.0, "loss_total": 0.055907417088747025, "step": 1089 }, { "epoch": 0.436, "grad_norm": 1.5200973749160767, "learning_rate": 6.265395605272581e-06, "loss": 0.3024, "step": 1090 }, { "batch_size": 4, "epoch": 0.436, "step": 1090, "tokens_per_device": 5572 }, { "epoch": 0.436, "loss_ce": 0.04087423160672188, "loss_lvr": 0.6985200643539429, "loss_mode_switch": 0.0, "loss_total": 0.1107262372970581, "step": 1090 }, { "batch_size": 4, "epoch": 0.436, "step": 1090, "tokens_per_device": 4212 }, { "epoch": 0.436, "loss_ce": 0.45011624693870544, "loss_lvr": 0.5998589396476746, "loss_mode_switch": 0.0, "loss_total": 0.5101021528244019, "step": 1090 }, { "batch_size": 4, "epoch": 0.436, "step": 1090, "tokens_per_device": 3972 }, { "epoch": 0.436, "loss_ce": 0.12378685921430588, "loss_lvr": 0.6313722133636475, "loss_mode_switch": 0.0, "loss_total": 0.18692408502101898, "step": 1090 }, { "batch_size": 4, "epoch": 0.436, "step": 1090, "tokens_per_device": 4276 }, { "epoch": 0.436, "loss_ce": 0.017590317875146866, "loss_lvr": 0.6360187530517578, "loss_mode_switch": 0.0, "loss_total": 0.08119219541549683, "step": 1090 }, { "batch_size": 1, "epoch": 0.436, "step": 1090, "tokens_per_device": 4465 }, { "epoch": 0.436, "loss_ce": 0.04683643579483032, "loss_lvr": 0.5597979426383972, "loss_mode_switch": 0.0, "loss_total": 0.10281623154878616, "step": 1090 }, { "batch_size": 4, "epoch": 0.436, "step": 1090, "tokens_per_device": 3856 }, { "epoch": 0.436, "loss_ce": 0.09900087118148804, "loss_lvr": 0.9500827193260193, "loss_mode_switch": 0.0, "loss_total": 0.19400915503501892, "step": 1090 }, { "batch_size": 4, "epoch": 0.436, "step": 1090, "tokens_per_device": 4040 }, { "epoch": 0.436, "loss_ce": 0.4984607398509979, "loss_lvr": 0.9282662868499756, "loss_mode_switch": 0.0, "loss_total": 0.59128737449646, "step": 1090 }, { "batch_size": 1, "epoch": 0.436, "step": 1090, "tokens_per_device": 4878 }, { "epoch": 0.436, "loss_ce": 0.9682703018188477, "loss_lvr": 0.3117651343345642, "loss_mode_switch": 0.0, "loss_total": 0.9994468092918396, "step": 1090 }, { "epoch": 0.4364, "grad_norm": 1.5232837200164795, "learning_rate": 6.259127906100601e-06, "loss": 0.32, "step": 1091 }, { "batch_size": 1, "epoch": 0.4364, "step": 1091, "tokens_per_device": 5032 }, { "epoch": 0.4364, "loss_ce": 0.07497163116931915, "loss_lvr": 0.27360856533050537, "loss_mode_switch": 0.0, "loss_total": 0.10233248770236969, "step": 1091 }, { "batch_size": 4, "epoch": 0.4364, "step": 1091, "tokens_per_device": 2752 }, { "epoch": 0.4364, "loss_ce": 0.2152429223060608, "loss_lvr": 0.7423235774040222, "loss_mode_switch": 0.0, "loss_total": 0.28947529196739197, "step": 1091 }, { "batch_size": 1, "epoch": 0.4364, "step": 1091, "tokens_per_device": 5188 }, { "epoch": 0.4364, "loss_ce": 0.01620483584702015, "loss_lvr": 0.7065378427505493, "loss_mode_switch": 0.0, "loss_total": 0.08685862272977829, "step": 1091 }, { "batch_size": 4, "epoch": 0.4364, "step": 1091, "tokens_per_device": 1380 }, { "epoch": 0.4364, "loss_ce": 0.1767769753932953, "loss_lvr": 0.9040932059288025, "loss_mode_switch": 0.0, "loss_total": 0.2671862840652466, "step": 1091 }, { "batch_size": 4, "epoch": 0.4364, "step": 1091, "tokens_per_device": 1656 }, { "epoch": 0.4364, "loss_ce": 0.12150952965021133, "loss_lvr": 0.9164395928382874, "loss_mode_switch": 0.0, "loss_total": 0.21315348148345947, "step": 1091 }, { "batch_size": 1, "epoch": 0.4364, "step": 1091, "tokens_per_device": 5191 }, { "epoch": 0.4364, "loss_ce": 0.004897038917988539, "loss_lvr": 0.41563794016838074, "loss_mode_switch": 0.0, "loss_total": 0.0464608334004879, "step": 1091 }, { "batch_size": 4, "epoch": 0.4364, "step": 1091, "tokens_per_device": 7008 }, { "epoch": 0.4364, "loss_ce": 0.6752756237983704, "loss_lvr": 1.0794823169708252, "loss_mode_switch": 0.0, "loss_total": 0.7832238674163818, "step": 1091 }, { "batch_size": 4, "epoch": 0.4364, "step": 1091, "tokens_per_device": 4536 }, { "epoch": 0.4364, "loss_ce": 0.4408353567123413, "loss_lvr": 0.8491145968437195, "loss_mode_switch": 0.0, "loss_total": 0.5257468223571777, "step": 1091 }, { "epoch": 0.4368, "grad_norm": 1.4472225904464722, "learning_rate": 6.252858093702121e-06, "loss": 0.2709, "step": 1092 }, { "batch_size": 4, "epoch": 0.4368, "step": 1092, "tokens_per_device": 2884 }, { "epoch": 0.4368, "loss_ce": 0.18134541809558868, "loss_lvr": 0.6843828558921814, "loss_mode_switch": 0.0, "loss_total": 0.2497836947441101, "step": 1092 }, { "batch_size": 1, "epoch": 0.4368, "step": 1092, "tokens_per_device": 4885 }, { "epoch": 0.4368, "loss_ce": 0.010811001993715763, "loss_lvr": 0.42141348123550415, "loss_mode_switch": 0.0, "loss_total": 0.052952349185943604, "step": 1092 }, { "batch_size": 4, "epoch": 0.4368, "step": 1092, "tokens_per_device": 2700 }, { "epoch": 0.4368, "loss_ce": 0.623436450958252, "loss_lvr": 0.46665939688682556, "loss_mode_switch": 0.0, "loss_total": 0.6701024174690247, "step": 1092 }, { "batch_size": 4, "epoch": 0.4368, "step": 1092, "tokens_per_device": 1444 }, { "epoch": 0.4368, "loss_ce": 0.48251253366470337, "loss_lvr": 0.9132869839668274, "loss_mode_switch": 0.0, "loss_total": 0.5738412141799927, "step": 1092 }, { "batch_size": 4, "epoch": 0.4368, "step": 1092, "tokens_per_device": 2688 }, { "epoch": 0.4368, "loss_ce": 0.28963127732276917, "loss_lvr": 0.7885804176330566, "loss_mode_switch": 0.0, "loss_total": 0.3684893250465393, "step": 1092 }, { "batch_size": 4, "epoch": 0.4368, "step": 1092, "tokens_per_device": 4244 }, { "epoch": 0.4368, "loss_ce": 0.40869930386543274, "loss_lvr": 1.2117478847503662, "loss_mode_switch": 0.0, "loss_total": 0.5298740863800049, "step": 1092 }, { "batch_size": 4, "epoch": 0.4368, "step": 1092, "tokens_per_device": 3360 }, { "epoch": 0.4368, "loss_ce": 0.19021397829055786, "loss_lvr": 0.9845739006996155, "loss_mode_switch": 0.0, "loss_total": 0.2886713743209839, "step": 1092 }, { "batch_size": 1, "epoch": 0.4368, "step": 1092, "tokens_per_device": 5187 }, { "epoch": 0.4368, "loss_ce": 0.0026200597640126944, "loss_lvr": 0.6579774022102356, "loss_mode_switch": 0.0, "loss_total": 0.06841779500246048, "step": 1092 }, { "epoch": 0.4372, "grad_norm": 1.2665231227874756, "learning_rate": 6.246586178599928e-06, "loss": 0.2917, "step": 1093 }, { "batch_size": 4, "epoch": 0.4372, "step": 1093, "tokens_per_device": 2688 }, { "epoch": 0.4372, "loss_ce": 0.43820834159851074, "loss_lvr": 0.730193018913269, "loss_mode_switch": 0.0, "loss_total": 0.5112276673316956, "step": 1093 }, { "batch_size": 1, "epoch": 0.4372, "step": 1093, "tokens_per_device": 4916 }, { "epoch": 0.4372, "loss_ce": 1.038576602935791, "loss_lvr": 0.37693560123443604, "loss_mode_switch": 0.0, "loss_total": 1.0762701034545898, "step": 1093 }, { "batch_size": 4, "epoch": 0.4372, "step": 1093, "tokens_per_device": 3756 }, { "epoch": 0.4372, "loss_ce": 0.17025645077228546, "loss_lvr": 0.7757808566093445, "loss_mode_switch": 0.0, "loss_total": 0.24783453345298767, "step": 1093 }, { "batch_size": 4, "epoch": 0.4372, "step": 1093, "tokens_per_device": 4532 }, { "epoch": 0.4372, "loss_ce": 0.08706687390804291, "loss_lvr": 0.98847895860672, "loss_mode_switch": 0.0, "loss_total": 0.1859147697687149, "step": 1093 }, { "batch_size": 4, "epoch": 0.4372, "step": 1093, "tokens_per_device": 4588 }, { "epoch": 0.4372, "loss_ce": 0.3551238477230072, "loss_lvr": 0.7546591758728027, "loss_mode_switch": 0.0, "loss_total": 0.4305897653102875, "step": 1093 }, { "batch_size": 4, "epoch": 0.4372, "step": 1093, "tokens_per_device": 4240 }, { "epoch": 0.4372, "loss_ce": 0.36226075887680054, "loss_lvr": 1.22623610496521, "loss_mode_switch": 0.0, "loss_total": 0.4848843812942505, "step": 1093 }, { "batch_size": 1, "epoch": 0.4372, "step": 1093, "tokens_per_device": 5272 }, { "epoch": 0.4372, "loss_ce": 1.0664104223251343, "loss_lvr": 0.2990296185016632, "loss_mode_switch": 0.0, "loss_total": 1.0963133573532104, "step": 1093 }, { "batch_size": 4, "epoch": 0.4372, "step": 1093, "tokens_per_device": 5580 }, { "epoch": 0.4372, "loss_ce": 0.1932615041732788, "loss_lvr": 0.8417796492576599, "loss_mode_switch": 0.0, "loss_total": 0.2774394750595093, "step": 1093 }, { "epoch": 0.4376, "grad_norm": 1.5351948738098145, "learning_rate": 6.240312171320336e-06, "loss": 0.3237, "step": 1094 }, { "batch_size": 4, "epoch": 0.4376, "step": 1094, "tokens_per_device": 9588 }, { "epoch": 0.4376, "loss_ce": 0.24101829528808594, "loss_lvr": 0.48551419377326965, "loss_mode_switch": 0.0, "loss_total": 0.2895697057247162, "step": 1094 }, { "batch_size": 4, "epoch": 0.4376, "step": 1094, "tokens_per_device": 4484 }, { "epoch": 0.4376, "loss_ce": 0.1707385927438736, "loss_lvr": 0.828018844127655, "loss_mode_switch": 0.0, "loss_total": 0.2535404860973358, "step": 1094 }, { "batch_size": 4, "epoch": 0.4376, "step": 1094, "tokens_per_device": 7028 }, { "epoch": 0.4376, "loss_ce": 0.33769136667251587, "loss_lvr": 0.6167599558830261, "loss_mode_switch": 0.0, "loss_total": 0.3993673622608185, "step": 1094 }, { "batch_size": 4, "epoch": 0.4376, "step": 1094, "tokens_per_device": 4336 }, { "epoch": 0.4376, "loss_ce": 0.25035420060157776, "loss_lvr": 0.7919504642486572, "loss_mode_switch": 0.0, "loss_total": 0.32954925298690796, "step": 1094 }, { "batch_size": 4, "epoch": 0.4376, "step": 1094, "tokens_per_device": 1612 }, { "epoch": 0.4376, "loss_ce": 0.7393385171890259, "loss_lvr": 0.8224318623542786, "loss_mode_switch": 0.0, "loss_total": 0.8215817213058472, "step": 1094 }, { "batch_size": 4, "epoch": 0.4376, "step": 1094, "tokens_per_device": 5568 }, { "epoch": 0.4376, "loss_ce": 0.05451493337750435, "loss_lvr": 0.7128252983093262, "loss_mode_switch": 0.0, "loss_total": 0.12579746544361115, "step": 1094 }, { "batch_size": 4, "epoch": 0.4376, "step": 1094, "tokens_per_device": 1348 }, { "epoch": 0.4376, "loss_ce": 0.32960081100463867, "loss_lvr": 1.1384719610214233, "loss_mode_switch": 0.0, "loss_total": 0.443448007106781, "step": 1094 }, { "batch_size": 1, "epoch": 0.4376, "step": 1094, "tokens_per_device": 5159 }, { "epoch": 0.4376, "loss_ce": 0.003038430819287896, "loss_lvr": 0.6308242678642273, "loss_mode_switch": 0.0, "loss_total": 0.06612085551023483, "step": 1094 }, { "epoch": 0.438, "grad_norm": 1.1989065408706665, "learning_rate": 6.234036082393171e-06, "loss": 0.2682, "step": 1095 }, { "batch_size": 4, "epoch": 0.438, "step": 1095, "tokens_per_device": 9240 }, { "epoch": 0.438, "loss_ce": 0.23241543769836426, "loss_lvr": 0.7338337302207947, "loss_mode_switch": 0.0, "loss_total": 0.30579882860183716, "step": 1095 }, { "batch_size": 4, "epoch": 0.438, "step": 1095, "tokens_per_device": 5236 }, { "epoch": 0.438, "loss_ce": 0.32417428493499756, "loss_lvr": 0.7933602333068848, "loss_mode_switch": 0.0, "loss_total": 0.40351030230522156, "step": 1095 }, { "batch_size": 4, "epoch": 0.438, "step": 1095, "tokens_per_device": 4264 }, { "epoch": 0.438, "loss_ce": 0.06109175086021423, "loss_lvr": 0.8705615401268005, "loss_mode_switch": 0.0, "loss_total": 0.14814791083335876, "step": 1095 }, { "batch_size": 4, "epoch": 0.438, "step": 1095, "tokens_per_device": 3968 }, { "epoch": 0.438, "loss_ce": 0.25782015919685364, "loss_lvr": 0.8633844256401062, "loss_mode_switch": 0.0, "loss_total": 0.3441585898399353, "step": 1095 }, { "batch_size": 1, "epoch": 0.438, "step": 1095, "tokens_per_device": 5049 }, { "epoch": 0.438, "loss_ce": 0.002380939433351159, "loss_lvr": 0.5853748321533203, "loss_mode_switch": 0.0, "loss_total": 0.060918424278497696, "step": 1095 }, { "batch_size": 4, "epoch": 0.438, "step": 1095, "tokens_per_device": 5704 }, { "epoch": 0.438, "loss_ce": 0.4230988323688507, "loss_lvr": 1.1093932390213013, "loss_mode_switch": 0.0, "loss_total": 0.5340381860733032, "step": 1095 }, { "batch_size": 1, "epoch": 0.438, "step": 1095, "tokens_per_device": 5052 }, { "epoch": 0.438, "loss_ce": 0.03592715412378311, "loss_lvr": 0.4185827076435089, "loss_mode_switch": 0.0, "loss_total": 0.077785424888134, "step": 1095 }, { "batch_size": 1, "epoch": 0.438, "step": 1095, "tokens_per_device": 5122 }, { "epoch": 0.438, "loss_ce": 0.02620001882314682, "loss_lvr": 0.30481407046318054, "loss_mode_switch": 0.0, "loss_total": 0.056681424379348755, "step": 1095 }, { "epoch": 0.4384, "grad_norm": 1.524359107017517, "learning_rate": 6.227757922351756e-06, "loss": 0.2852, "step": 1096 }, { "batch_size": 4, "epoch": 0.4384, "step": 1096, "tokens_per_device": 4396 }, { "epoch": 0.4384, "loss_ce": 0.2573942244052887, "loss_lvr": 0.751162052154541, "loss_mode_switch": 0.0, "loss_total": 0.33251044154167175, "step": 1096 }, { "batch_size": 4, "epoch": 0.4384, "step": 1096, "tokens_per_device": 8156 }, { "epoch": 0.4384, "loss_ce": 0.11464671790599823, "loss_lvr": 0.6278693079948425, "loss_mode_switch": 0.0, "loss_total": 0.17743363976478577, "step": 1096 }, { "batch_size": 4, "epoch": 0.4384, "step": 1096, "tokens_per_device": 4280 }, { "epoch": 0.4384, "loss_ce": 0.03963466361165047, "loss_lvr": 0.9664744734764099, "loss_mode_switch": 0.0, "loss_total": 0.13628211617469788, "step": 1096 }, { "batch_size": 4, "epoch": 0.4384, "step": 1096, "tokens_per_device": 12012 }, { "epoch": 0.4384, "loss_ce": 0.10691414773464203, "loss_lvr": 0.9965750575065613, "loss_mode_switch": 0.0, "loss_total": 0.20657165348529816, "step": 1096 }, { "batch_size": 4, "epoch": 0.4384, "step": 1096, "tokens_per_device": 1268 }, { "epoch": 0.4384, "loss_ce": 0.07700060307979584, "loss_lvr": 1.320548415184021, "loss_mode_switch": 0.0, "loss_total": 0.20905545353889465, "step": 1096 }, { "batch_size": 4, "epoch": 0.4384, "step": 1096, "tokens_per_device": 4200 }, { "epoch": 0.4384, "loss_ce": 0.060847774147987366, "loss_lvr": 0.8843334913253784, "loss_mode_switch": 0.0, "loss_total": 0.1492811143398285, "step": 1096 }, { "batch_size": 1, "epoch": 0.4384, "step": 1096, "tokens_per_device": 4387 }, { "epoch": 0.4384, "loss_ce": 0.028183963149785995, "loss_lvr": 0.5472408533096313, "loss_mode_switch": 0.0, "loss_total": 0.08290804922580719, "step": 1096 }, { "batch_size": 4, "epoch": 0.4384, "step": 1096, "tokens_per_device": 5756 }, { "epoch": 0.4384, "loss_ce": 0.5244163274765015, "loss_lvr": 0.9472115635871887, "loss_mode_switch": 0.0, "loss_total": 0.6191374659538269, "step": 1096 }, { "epoch": 0.4388, "grad_norm": 1.147673487663269, "learning_rate": 6.221477701732884e-06, "loss": 0.2568, "step": 1097 }, { "batch_size": 4, "epoch": 0.4388, "step": 1097, "tokens_per_device": 1940 }, { "epoch": 0.4388, "loss_ce": 0.6593789458274841, "loss_lvr": 0.8409364223480225, "loss_mode_switch": 0.0, "loss_total": 0.7434725761413574, "step": 1097 }, { "batch_size": 1, "epoch": 0.4388, "step": 1097, "tokens_per_device": 4754 }, { "epoch": 0.4388, "loss_ce": 0.20998631417751312, "loss_lvr": 0.2875273823738098, "loss_mode_switch": 0.0, "loss_total": 0.23873905837535858, "step": 1097 }, { "batch_size": 1, "epoch": 0.4388, "step": 1097, "tokens_per_device": 7688 }, { "epoch": 0.4388, "loss_ce": 0.0007659952389076352, "loss_lvr": 0.34018561244010925, "loss_mode_switch": 0.0, "loss_total": 0.034784555435180664, "step": 1097 }, { "batch_size": 1, "epoch": 0.4388, "step": 1097, "tokens_per_device": 5234 }, { "epoch": 0.4388, "loss_ce": 1.2327947616577148, "loss_lvr": 0.32334399223327637, "loss_mode_switch": 0.0, "loss_total": 1.2651292085647583, "step": 1097 }, { "batch_size": 4, "epoch": 0.4388, "step": 1097, "tokens_per_device": 4544 }, { "epoch": 0.4388, "loss_ce": 0.15927229821681976, "loss_lvr": 0.7990829348564148, "loss_mode_switch": 0.0, "loss_total": 0.23918059468269348, "step": 1097 }, { "batch_size": 1, "epoch": 0.4388, "step": 1097, "tokens_per_device": 5124 }, { "epoch": 0.4388, "loss_ce": 0.6505813598632812, "loss_lvr": 0.7424001097679138, "loss_mode_switch": 0.0, "loss_total": 0.7248213887214661, "step": 1097 }, { "batch_size": 4, "epoch": 0.4388, "step": 1097, "tokens_per_device": 5996 }, { "epoch": 0.4388, "loss_ce": 0.0240486953407526, "loss_lvr": 0.8075090646743774, "loss_mode_switch": 0.0, "loss_total": 0.10479959845542908, "step": 1097 }, { "batch_size": 1, "epoch": 0.4388, "step": 1097, "tokens_per_device": 4909 }, { "epoch": 0.4388, "loss_ce": 0.05655495077371597, "loss_lvr": 0.3453816771507263, "loss_mode_switch": 0.0, "loss_total": 0.09109312295913696, "step": 1097 }, { "epoch": 0.4392, "grad_norm": 1.5983006954193115, "learning_rate": 6.215195431076813e-06, "loss": 0.3241, "step": 1098 }, { "batch_size": 4, "epoch": 0.4392, "step": 1098, "tokens_per_device": 10092 }, { "epoch": 0.4392, "loss_ce": 0.08310822397470474, "loss_lvr": 0.823884904384613, "loss_mode_switch": 0.0, "loss_total": 0.16549670696258545, "step": 1098 }, { "batch_size": 4, "epoch": 0.4392, "step": 1098, "tokens_per_device": 5128 }, { "epoch": 0.4392, "loss_ce": 0.21596871316432953, "loss_lvr": 0.8289372324943542, "loss_mode_switch": 0.0, "loss_total": 0.29886242747306824, "step": 1098 }, { "batch_size": 4, "epoch": 0.4392, "step": 1098, "tokens_per_device": 4344 }, { "epoch": 0.4392, "loss_ce": 0.09282758831977844, "loss_lvr": 0.9443520307540894, "loss_mode_switch": 0.0, "loss_total": 0.18726280331611633, "step": 1098 }, { "batch_size": 1, "epoch": 0.4392, "step": 1098, "tokens_per_device": 5026 }, { "epoch": 0.4392, "loss_ce": 0.07250120490789413, "loss_lvr": 0.26381272077560425, "loss_mode_switch": 0.0, "loss_total": 0.09888248145580292, "step": 1098 }, { "batch_size": 4, "epoch": 0.4392, "step": 1098, "tokens_per_device": 4172 }, { "epoch": 0.4392, "loss_ce": 0.04653934761881828, "loss_lvr": 0.8233070373535156, "loss_mode_switch": 0.0, "loss_total": 0.12887005507946014, "step": 1098 }, { "batch_size": 4, "epoch": 0.4392, "step": 1098, "tokens_per_device": 4280 }, { "epoch": 0.4392, "loss_ce": 0.08375029265880585, "loss_lvr": 0.9377955794334412, "loss_mode_switch": 0.0, "loss_total": 0.17752984166145325, "step": 1098 }, { "batch_size": 4, "epoch": 0.4392, "step": 1098, "tokens_per_device": 2604 }, { "epoch": 0.4392, "loss_ce": 0.07515932619571686, "loss_lvr": 0.6885446310043335, "loss_mode_switch": 0.0, "loss_total": 0.14401379227638245, "step": 1098 }, { "batch_size": 4, "epoch": 0.4392, "step": 1098, "tokens_per_device": 3688 }, { "epoch": 0.4392, "loss_ce": 0.30617645382881165, "loss_lvr": 0.8786239624023438, "loss_mode_switch": 0.0, "loss_total": 0.3940388560295105, "step": 1098 }, { "epoch": 0.4396, "grad_norm": 1.3074357509613037, "learning_rate": 6.208911120927233e-06, "loss": 0.2951, "step": 1099 }, { "batch_size": 4, "epoch": 0.4396, "step": 1099, "tokens_per_device": 5376 }, { "epoch": 0.4396, "loss_ce": 0.4124598205089569, "loss_lvr": 1.1594287157058716, "loss_mode_switch": 0.0, "loss_total": 0.5284026861190796, "step": 1099 }, { "batch_size": 1, "epoch": 0.4396, "step": 1099, "tokens_per_device": 5179 }, { "epoch": 0.4396, "loss_ce": 0.20154006779193878, "loss_lvr": 0.23509852588176727, "loss_mode_switch": 0.0, "loss_total": 0.2250499129295349, "step": 1099 }, { "batch_size": 4, "epoch": 0.4396, "step": 1099, "tokens_per_device": 3756 }, { "epoch": 0.4396, "loss_ce": 0.2993599772453308, "loss_lvr": 0.955833375453949, "loss_mode_switch": 0.0, "loss_total": 0.39494332671165466, "step": 1099 }, { "batch_size": 4, "epoch": 0.4396, "step": 1099, "tokens_per_device": 4284 }, { "epoch": 0.4396, "loss_ce": 0.03292842209339142, "loss_lvr": 0.8440502285957336, "loss_mode_switch": 0.0, "loss_total": 0.11733344942331314, "step": 1099 }, { "batch_size": 4, "epoch": 0.4396, "step": 1099, "tokens_per_device": 15180 }, { "epoch": 0.4396, "loss_ce": 0.08320049196481705, "loss_lvr": 0.8192573189735413, "loss_mode_switch": 0.0, "loss_total": 0.165126234292984, "step": 1099 }, { "batch_size": 4, "epoch": 0.4396, "step": 1099, "tokens_per_device": 4708 }, { "epoch": 0.4396, "loss_ce": 0.4035380482673645, "loss_lvr": 0.8578632473945618, "loss_mode_switch": 0.0, "loss_total": 0.4893243908882141, "step": 1099 }, { "batch_size": 4, "epoch": 0.4396, "step": 1099, "tokens_per_device": 14356 }, { "epoch": 0.4396, "loss_ce": 0.06466422230005264, "loss_lvr": 0.6709632873535156, "loss_mode_switch": 0.0, "loss_total": 0.13176055252552032, "step": 1099 }, { "batch_size": 1, "epoch": 0.4396, "step": 1099, "tokens_per_device": 4646 }, { "epoch": 0.4396, "loss_ce": 0.007241418119519949, "loss_lvr": 0.4254164397716522, "loss_mode_switch": 0.0, "loss_total": 0.04978306218981743, "step": 1099 }, { "epoch": 0.44, "grad_norm": 1.302219033241272, "learning_rate": 6.202624781831269e-06, "loss": 0.2921, "step": 1100 }, { "batch_size": 4, "epoch": 0.44, "step": 1100, "tokens_per_device": 3940 }, { "epoch": 0.44, "loss_ce": 0.11692490428686142, "loss_lvr": 0.8516203165054321, "loss_mode_switch": 0.0, "loss_total": 0.202086940407753, "step": 1100 }, { "batch_size": 4, "epoch": 0.44, "step": 1100, "tokens_per_device": 5136 }, { "epoch": 0.44, "loss_ce": 0.1519336700439453, "loss_lvr": 0.7402085065841675, "loss_mode_switch": 0.0, "loss_total": 0.22595453262329102, "step": 1100 }, { "batch_size": 4, "epoch": 0.44, "step": 1100, "tokens_per_device": 4220 }, { "epoch": 0.44, "loss_ce": 0.31942665576934814, "loss_lvr": 0.9072061777114868, "loss_mode_switch": 0.0, "loss_total": 0.4101472795009613, "step": 1100 }, { "batch_size": 1, "epoch": 0.44, "step": 1100, "tokens_per_device": 5186 }, { "epoch": 0.44, "loss_ce": 0.0017574441153556108, "loss_lvr": 0.25436684489250183, "loss_mode_switch": 0.0, "loss_total": 0.027194129303097725, "step": 1100 }, { "batch_size": 4, "epoch": 0.44, "step": 1100, "tokens_per_device": 1712 }, { "epoch": 0.44, "loss_ce": 0.6397217512130737, "loss_lvr": 0.8750333189964294, "loss_mode_switch": 0.0, "loss_total": 0.7272250652313232, "step": 1100 }, { "batch_size": 1, "epoch": 0.44, "step": 1100, "tokens_per_device": 4881 }, { "epoch": 0.44, "loss_ce": 0.24711595475673676, "loss_lvr": 0.4399893581867218, "loss_mode_switch": 0.0, "loss_total": 0.2911148965358734, "step": 1100 }, { "batch_size": 4, "epoch": 0.44, "step": 1100, "tokens_per_device": 5088 }, { "epoch": 0.44, "loss_ce": 0.2485361099243164, "loss_lvr": 0.7397724986076355, "loss_mode_switch": 0.0, "loss_total": 0.3225133717060089, "step": 1100 }, { "batch_size": 4, "epoch": 0.44, "step": 1100, "tokens_per_device": 9476 }, { "epoch": 0.44, "loss_ce": 0.15634600818157196, "loss_lvr": 0.4735255837440491, "loss_mode_switch": 0.0, "loss_total": 0.20369857549667358, "step": 1100 }, { "epoch": 0.4404, "grad_norm": 1.1824792623519897, "learning_rate": 6.1963364243394386e-06, "loss": 0.2492, "step": 1101 }, { "batch_size": 4, "epoch": 0.4404, "step": 1101, "tokens_per_device": 1636 }, { "epoch": 0.4404, "loss_ce": 0.10458940267562866, "loss_lvr": 0.884260892868042, "loss_mode_switch": 0.0, "loss_total": 0.19301548600196838, "step": 1101 }, { "batch_size": 4, "epoch": 0.4404, "step": 1101, "tokens_per_device": 4048 }, { "epoch": 0.4404, "loss_ce": 0.020838487893342972, "loss_lvr": 0.51141357421875, "loss_mode_switch": 0.0, "loss_total": 0.07197985053062439, "step": 1101 }, { "batch_size": 1, "epoch": 0.4404, "step": 1101, "tokens_per_device": 5173 }, { "epoch": 0.4404, "loss_ce": 0.0003734768251888454, "loss_lvr": 0.2980020046234131, "loss_mode_switch": 0.0, "loss_total": 0.03017367795109749, "step": 1101 }, { "batch_size": 1, "epoch": 0.4404, "step": 1101, "tokens_per_device": 5713 }, { "epoch": 0.4404, "loss_ce": 0.003754823002964258, "loss_lvr": 0.5421906113624573, "loss_mode_switch": 0.0, "loss_total": 0.05797388777136803, "step": 1101 }, { "batch_size": 4, "epoch": 0.4404, "step": 1101, "tokens_per_device": 2540 }, { "epoch": 0.4404, "loss_ce": 0.6948923468589783, "loss_lvr": 0.8843731880187988, "loss_mode_switch": 0.0, "loss_total": 0.7833296656608582, "step": 1101 }, { "batch_size": 4, "epoch": 0.4404, "step": 1101, "tokens_per_device": 1924 }, { "epoch": 0.4404, "loss_ce": 0.19243735074996948, "loss_lvr": 0.8923432230949402, "loss_mode_switch": 0.0, "loss_total": 0.2816716730594635, "step": 1101 }, { "batch_size": 1, "epoch": 0.4404, "step": 1101, "tokens_per_device": 5258 }, { "epoch": 0.4404, "loss_ce": 0.03336187079548836, "loss_lvr": 0.508156418800354, "loss_mode_switch": 0.0, "loss_total": 0.08417750895023346, "step": 1101 }, { "batch_size": 4, "epoch": 0.4404, "step": 1101, "tokens_per_device": 7172 }, { "epoch": 0.4404, "loss_ce": 0.8392773270606995, "loss_lvr": 0.5176231861114502, "loss_mode_switch": 0.0, "loss_total": 0.8910396695137024, "step": 1101 }, { "epoch": 0.4408, "grad_norm": 1.3110066652297974, "learning_rate": 6.190046059005655e-06, "loss": 0.2947, "step": 1102 }, { "batch_size": 4, "epoch": 0.4408, "step": 1102, "tokens_per_device": 7320 }, { "epoch": 0.4408, "loss_ce": 0.37897542119026184, "loss_lvr": 0.7207990288734436, "loss_mode_switch": 0.0, "loss_total": 0.4510553181171417, "step": 1102 }, { "batch_size": 4, "epoch": 0.4408, "step": 1102, "tokens_per_device": 5548 }, { "epoch": 0.4408, "loss_ce": 0.4268958568572998, "loss_lvr": 0.6694704294204712, "loss_mode_switch": 0.0, "loss_total": 0.4938428997993469, "step": 1102 }, { "batch_size": 4, "epoch": 0.4408, "step": 1102, "tokens_per_device": 4320 }, { "epoch": 0.4408, "loss_ce": 0.43193870782852173, "loss_lvr": 0.8619951605796814, "loss_mode_switch": 0.0, "loss_total": 0.5181382298469543, "step": 1102 }, { "batch_size": 1, "epoch": 0.4408, "step": 1102, "tokens_per_device": 4972 }, { "epoch": 0.4408, "loss_ce": 0.008554567582905293, "loss_lvr": 0.49168333411216736, "loss_mode_switch": 0.0, "loss_total": 0.057722900062799454, "step": 1102 }, { "batch_size": 4, "epoch": 0.4408, "step": 1102, "tokens_per_device": 1392 }, { "epoch": 0.4408, "loss_ce": 0.330471932888031, "loss_lvr": 0.8801145553588867, "loss_mode_switch": 0.0, "loss_total": 0.4184833765029907, "step": 1102 }, { "batch_size": 4, "epoch": 0.4408, "step": 1102, "tokens_per_device": 5068 }, { "epoch": 0.4408, "loss_ce": 0.10013668239116669, "loss_lvr": 0.9596896171569824, "loss_mode_switch": 0.0, "loss_total": 0.19610564410686493, "step": 1102 }, { "batch_size": 4, "epoch": 0.4408, "step": 1102, "tokens_per_device": 4276 }, { "epoch": 0.4408, "loss_ce": 0.05704919248819351, "loss_lvr": 1.0190038681030273, "loss_mode_switch": 0.0, "loss_total": 0.1589495837688446, "step": 1102 }, { "batch_size": 1, "epoch": 0.4408, "step": 1102, "tokens_per_device": 5133 }, { "epoch": 0.4408, "loss_ce": 0.2776505947113037, "loss_lvr": 0.24006444215774536, "loss_mode_switch": 0.0, "loss_total": 0.3016570508480072, "step": 1102 }, { "epoch": 0.4412, "grad_norm": 1.4527124166488647, "learning_rate": 6.183753696387199e-06, "loss": 0.3291, "step": 1103 }, { "batch_size": 4, "epoch": 0.4412, "step": 1103, "tokens_per_device": 5028 }, { "epoch": 0.4412, "loss_ce": 0.20326420664787292, "loss_lvr": 0.693142294883728, "loss_mode_switch": 0.0, "loss_total": 0.2725784480571747, "step": 1103 }, { "batch_size": 1, "epoch": 0.4412, "step": 1103, "tokens_per_device": 5111 }, { "epoch": 0.4412, "loss_ce": 0.0011319101322442293, "loss_lvr": 0.327788382768631, "loss_mode_switch": 0.0, "loss_total": 0.03391075134277344, "step": 1103 }, { "batch_size": 4, "epoch": 0.4412, "step": 1103, "tokens_per_device": 1164 }, { "epoch": 0.4412, "loss_ce": 0.1264534592628479, "loss_lvr": 0.8364927172660828, "loss_mode_switch": 0.0, "loss_total": 0.21010273694992065, "step": 1103 }, { "batch_size": 1, "epoch": 0.4412, "step": 1103, "tokens_per_device": 5224 }, { "epoch": 0.4412, "loss_ce": 0.025942271575331688, "loss_lvr": 0.6014256477355957, "loss_mode_switch": 0.0, "loss_total": 0.08608483523130417, "step": 1103 }, { "batch_size": 4, "epoch": 0.4412, "step": 1103, "tokens_per_device": 5108 }, { "epoch": 0.4412, "loss_ce": 0.5134769082069397, "loss_lvr": 0.8357818722724915, "loss_mode_switch": 0.0, "loss_total": 0.5970550775527954, "step": 1103 }, { "batch_size": 4, "epoch": 0.4412, "step": 1103, "tokens_per_device": 3828 }, { "epoch": 0.4412, "loss_ce": 0.3547755181789398, "loss_lvr": 0.879604160785675, "loss_mode_switch": 0.0, "loss_total": 0.4427359402179718, "step": 1103 }, { "batch_size": 4, "epoch": 0.4412, "step": 1103, "tokens_per_device": 5488 }, { "epoch": 0.4412, "loss_ce": 0.12760229408740997, "loss_lvr": 0.6990938782691956, "loss_mode_switch": 0.0, "loss_total": 0.1975116729736328, "step": 1103 }, { "batch_size": 4, "epoch": 0.4412, "step": 1103, "tokens_per_device": 1860 }, { "epoch": 0.4412, "loss_ce": 0.3206242322921753, "loss_lvr": 1.2634320259094238, "loss_mode_switch": 0.0, "loss_total": 0.4469674229621887, "step": 1103 }, { "epoch": 0.4416, "grad_norm": 1.2742568254470825, "learning_rate": 6.177459347044703e-06, "loss": 0.2986, "step": 1104 }, { "batch_size": 4, "epoch": 0.4416, "step": 1104, "tokens_per_device": 3848 }, { "epoch": 0.4416, "loss_ce": 0.3541376292705536, "loss_lvr": 1.1620852947235107, "loss_mode_switch": 0.0, "loss_total": 0.4703461527824402, "step": 1104 }, { "batch_size": 4, "epoch": 0.4416, "step": 1104, "tokens_per_device": 9632 }, { "epoch": 0.4416, "loss_ce": 0.02666134014725685, "loss_lvr": 0.6865816712379456, "loss_mode_switch": 0.0, "loss_total": 0.09531950950622559, "step": 1104 }, { "batch_size": 1, "epoch": 0.4416, "step": 1104, "tokens_per_device": 4896 }, { "epoch": 0.4416, "loss_ce": 0.02852003090083599, "loss_lvr": 0.6334741115570068, "loss_mode_switch": 0.0, "loss_total": 0.09186744689941406, "step": 1104 }, { "batch_size": 4, "epoch": 0.4416, "step": 1104, "tokens_per_device": 4216 }, { "epoch": 0.4416, "loss_ce": 0.04851814731955528, "loss_lvr": 0.6970148682594299, "loss_mode_switch": 0.0, "loss_total": 0.11821962893009186, "step": 1104 }, { "batch_size": 4, "epoch": 0.4416, "step": 1104, "tokens_per_device": 4820 }, { "epoch": 0.4416, "loss_ce": 0.031786490231752396, "loss_lvr": 0.6400390863418579, "loss_mode_switch": 0.0, "loss_total": 0.09579040110111237, "step": 1104 }, { "batch_size": 4, "epoch": 0.4416, "step": 1104, "tokens_per_device": 3720 }, { "epoch": 0.4416, "loss_ce": 0.10530557483434677, "loss_lvr": 1.026816487312317, "loss_mode_switch": 0.0, "loss_total": 0.2079872190952301, "step": 1104 }, { "batch_size": 4, "epoch": 0.4416, "step": 1104, "tokens_per_device": 5000 }, { "epoch": 0.4416, "loss_ce": 0.02398082986474037, "loss_lvr": 0.8167061805725098, "loss_mode_switch": 0.0, "loss_total": 0.10565145313739777, "step": 1104 }, { "batch_size": 1, "epoch": 0.4416, "step": 1104, "tokens_per_device": 4546 }, { "epoch": 0.4416, "loss_ce": 0.105557382106781, "loss_lvr": 0.35685741901397705, "loss_mode_switch": 0.0, "loss_total": 0.1412431299686432, "step": 1104 }, { "epoch": 0.442, "grad_norm": 1.210742712020874, "learning_rate": 6.171163021542134e-06, "loss": 0.2859, "step": 1105 }, { "batch_size": 4, "epoch": 0.442, "step": 1105, "tokens_per_device": 2588 }, { "epoch": 0.442, "loss_ce": 0.2139437049627304, "loss_lvr": 0.9939603209495544, "loss_mode_switch": 0.0, "loss_total": 0.3133397400379181, "step": 1105 }, { "batch_size": 1, "epoch": 0.442, "step": 1105, "tokens_per_device": 5105 }, { "epoch": 0.442, "loss_ce": 0.05597483366727829, "loss_lvr": 0.5444193482398987, "loss_mode_switch": 0.0, "loss_total": 0.11041676998138428, "step": 1105 }, { "batch_size": 1, "epoch": 0.442, "step": 1105, "tokens_per_device": 4898 }, { "epoch": 0.442, "loss_ce": 0.17839688062667847, "loss_lvr": 0.20669616758823395, "loss_mode_switch": 0.0, "loss_total": 0.19906648993492126, "step": 1105 }, { "batch_size": 1, "epoch": 0.442, "step": 1105, "tokens_per_device": 5291 }, { "epoch": 0.442, "loss_ce": 0.17906804382801056, "loss_lvr": 0.5221130847930908, "loss_mode_switch": 0.0, "loss_total": 0.23127935826778412, "step": 1105 }, { "batch_size": 1, "epoch": 0.442, "step": 1105, "tokens_per_device": 4164 }, { "epoch": 0.442, "loss_ce": 1.4929451942443848, "loss_lvr": 0.40062063932418823, "loss_mode_switch": 0.0, "loss_total": 1.533007264137268, "step": 1105 }, { "batch_size": 1, "epoch": 0.442, "step": 1105, "tokens_per_device": 5103 }, { "epoch": 0.442, "loss_ce": 0.14463678002357483, "loss_lvr": 0.2684382200241089, "loss_mode_switch": 0.0, "loss_total": 0.17148059606552124, "step": 1105 }, { "batch_size": 1, "epoch": 0.442, "step": 1105, "tokens_per_device": 5160 }, { "epoch": 0.442, "loss_ce": 0.026868540793657303, "loss_lvr": 0.39503559470176697, "loss_mode_switch": 0.0, "loss_total": 0.0663720965385437, "step": 1105 }, { "batch_size": 4, "epoch": 0.442, "step": 1105, "tokens_per_device": 1584 }, { "epoch": 0.442, "loss_ce": 0.434365451335907, "loss_lvr": 1.3417664766311646, "loss_mode_switch": 0.0, "loss_total": 0.5685421228408813, "step": 1105 }, { "epoch": 0.4424, "grad_norm": 1.2808231115341187, "learning_rate": 6.164864730446776e-06, "loss": 0.3212, "step": 1106 }, { "batch_size": 1, "epoch": 0.4424, "step": 1106, "tokens_per_device": 4870 }, { "epoch": 0.4424, "loss_ce": 0.0023972855415195227, "loss_lvr": 0.40810948610305786, "loss_mode_switch": 0.0, "loss_total": 0.04320823401212692, "step": 1106 }, { "batch_size": 4, "epoch": 0.4424, "step": 1106, "tokens_per_device": 4088 }, { "epoch": 0.4424, "loss_ce": 0.3762357831001282, "loss_lvr": 0.9605830311775208, "loss_mode_switch": 0.0, "loss_total": 0.4722940921783447, "step": 1106 }, { "batch_size": 4, "epoch": 0.4424, "step": 1106, "tokens_per_device": 1380 }, { "epoch": 0.4424, "loss_ce": 0.17726781964302063, "loss_lvr": 0.9874016642570496, "loss_mode_switch": 0.0, "loss_total": 0.2760079801082611, "step": 1106 }, { "batch_size": 4, "epoch": 0.4424, "step": 1106, "tokens_per_device": 4248 }, { "epoch": 0.4424, "loss_ce": 0.3841429352760315, "loss_lvr": 0.9279084801673889, "loss_mode_switch": 0.0, "loss_total": 0.4769337773323059, "step": 1106 }, { "batch_size": 4, "epoch": 0.4424, "step": 1106, "tokens_per_device": 5572 }, { "epoch": 0.4424, "loss_ce": 0.11550344526767731, "loss_lvr": 1.1309422254562378, "loss_mode_switch": 0.0, "loss_total": 0.22859767079353333, "step": 1106 }, { "batch_size": 4, "epoch": 0.4424, "step": 1106, "tokens_per_device": 2700 }, { "epoch": 0.4424, "loss_ce": 0.8239930272102356, "loss_lvr": 0.8342523574829102, "loss_mode_switch": 0.0, "loss_total": 0.9074182510375977, "step": 1106 }, { "batch_size": 4, "epoch": 0.4424, "step": 1106, "tokens_per_device": 5736 }, { "epoch": 0.4424, "loss_ce": 0.26173505187034607, "loss_lvr": 0.7291378378868103, "loss_mode_switch": 0.0, "loss_total": 0.33464884757995605, "step": 1106 }, { "batch_size": 1, "epoch": 0.4424, "step": 1106, "tokens_per_device": 6686 }, { "epoch": 0.4424, "loss_ce": 0.22631171345710754, "loss_lvr": 0.28502607345581055, "loss_mode_switch": 0.0, "loss_total": 0.2548143267631531, "step": 1106 }, { "epoch": 0.4428, "grad_norm": 1.483127474784851, "learning_rate": 6.158564484329212e-06, "loss": 0.3161, "step": 1107 }, { "batch_size": 4, "epoch": 0.4428, "step": 1107, "tokens_per_device": 4232 }, { "epoch": 0.4428, "loss_ce": 0.28662148118019104, "loss_lvr": 1.1524465084075928, "loss_mode_switch": 0.0, "loss_total": 0.4018661379814148, "step": 1107 }, { "batch_size": 1, "epoch": 0.4428, "step": 1107, "tokens_per_device": 4865 }, { "epoch": 0.4428, "loss_ce": 0.011181263253092766, "loss_lvr": 0.3045148551464081, "loss_mode_switch": 0.0, "loss_total": 0.041632749140262604, "step": 1107 }, { "batch_size": 4, "epoch": 0.4428, "step": 1107, "tokens_per_device": 5768 }, { "epoch": 0.4428, "loss_ce": 0.07878664135932922, "loss_lvr": 0.952042281627655, "loss_mode_switch": 0.0, "loss_total": 0.1739908754825592, "step": 1107 }, { "batch_size": 4, "epoch": 0.4428, "step": 1107, "tokens_per_device": 2644 }, { "epoch": 0.4428, "loss_ce": 0.6873494386672974, "loss_lvr": 0.7678310871124268, "loss_mode_switch": 0.0, "loss_total": 0.764132559299469, "step": 1107 }, { "batch_size": 1, "epoch": 0.4428, "step": 1107, "tokens_per_device": 5044 }, { "epoch": 0.4428, "loss_ce": 0.0016031175618991256, "loss_lvr": 0.3925665020942688, "loss_mode_switch": 0.0, "loss_total": 0.04085977002978325, "step": 1107 }, { "batch_size": 4, "epoch": 0.4428, "step": 1107, "tokens_per_device": 2756 }, { "epoch": 0.4428, "loss_ce": 0.08613896369934082, "loss_lvr": 0.7763088345527649, "loss_mode_switch": 0.0, "loss_total": 0.16376984119415283, "step": 1107 }, { "batch_size": 4, "epoch": 0.4428, "step": 1107, "tokens_per_device": 6008 }, { "epoch": 0.4428, "loss_ce": 0.09487124532461166, "loss_lvr": 1.112755298614502, "loss_mode_switch": 0.0, "loss_total": 0.20614677667617798, "step": 1107 }, { "batch_size": 1, "epoch": 0.4428, "step": 1107, "tokens_per_device": 5107 }, { "epoch": 0.4428, "loss_ce": 0.044960811734199524, "loss_lvr": 0.1947372853755951, "loss_mode_switch": 0.0, "loss_total": 0.06443454325199127, "step": 1107 }, { "epoch": 0.4432, "grad_norm": 1.3561201095581055, "learning_rate": 6.1522622937633044e-06, "loss": 0.2551, "step": 1108 }, { "batch_size": 4, "epoch": 0.4432, "step": 1108, "tokens_per_device": 4368 }, { "epoch": 0.4432, "loss_ce": 0.3583281934261322, "loss_lvr": 0.9579573273658752, "loss_mode_switch": 0.0, "loss_total": 0.45412391424179077, "step": 1108 }, { "batch_size": 1, "epoch": 0.4432, "step": 1108, "tokens_per_device": 4888 }, { "epoch": 0.4432, "loss_ce": 0.007454004138708115, "loss_lvr": 0.6844927072525024, "loss_mode_switch": 0.0, "loss_total": 0.0759032815694809, "step": 1108 }, { "batch_size": 4, "epoch": 0.4432, "step": 1108, "tokens_per_device": 14048 }, { "epoch": 0.4432, "loss_ce": 0.012209242209792137, "loss_lvr": 1.0046230554580688, "loss_mode_switch": 0.0, "loss_total": 0.11267155408859253, "step": 1108 }, { "batch_size": 1, "epoch": 0.4432, "step": 1108, "tokens_per_device": 4905 }, { "epoch": 0.4432, "loss_ce": 0.04205907881259918, "loss_lvr": 0.29970401525497437, "loss_mode_switch": 0.0, "loss_total": 0.0720294788479805, "step": 1108 }, { "batch_size": 4, "epoch": 0.4432, "step": 1108, "tokens_per_device": 4076 }, { "epoch": 0.4432, "loss_ce": 0.21097345650196075, "loss_lvr": 0.9040488004684448, "loss_mode_switch": 0.0, "loss_total": 0.3013783395290375, "step": 1108 }, { "batch_size": 4, "epoch": 0.4432, "step": 1108, "tokens_per_device": 3760 }, { "epoch": 0.4432, "loss_ce": 0.2906091809272766, "loss_lvr": 0.5993214845657349, "loss_mode_switch": 0.0, "loss_total": 0.3505413234233856, "step": 1108 }, { "batch_size": 1, "epoch": 0.4432, "step": 1108, "tokens_per_device": 5106 }, { "epoch": 0.4432, "loss_ce": 0.04322019964456558, "loss_lvr": 0.4851109981536865, "loss_mode_switch": 0.0, "loss_total": 0.09173129498958588, "step": 1108 }, { "batch_size": 4, "epoch": 0.4432, "step": 1108, "tokens_per_device": 6096 }, { "epoch": 0.4432, "loss_ce": 0.06695288419723511, "loss_lvr": 0.8307891488075256, "loss_mode_switch": 0.0, "loss_total": 0.15003180503845215, "step": 1108 }, { "epoch": 0.4436, "grad_norm": 1.2690201997756958, "learning_rate": 6.1459581693261825e-06, "loss": 0.3111, "step": 1109 }, { "batch_size": 4, "epoch": 0.4436, "step": 1109, "tokens_per_device": 6300 }, { "epoch": 0.4436, "loss_ce": 0.19610054790973663, "loss_lvr": 0.7307212352752686, "loss_mode_switch": 0.0, "loss_total": 0.26917266845703125, "step": 1109 }, { "batch_size": 4, "epoch": 0.4436, "step": 1109, "tokens_per_device": 2536 }, { "epoch": 0.4436, "loss_ce": 0.27374744415283203, "loss_lvr": 0.6658734679222107, "loss_mode_switch": 0.0, "loss_total": 0.34033480286598206, "step": 1109 }, { "batch_size": 4, "epoch": 0.4436, "step": 1109, "tokens_per_device": 4880 }, { "epoch": 0.4436, "loss_ce": 0.25225603580474854, "loss_lvr": 0.8516372442245483, "loss_mode_switch": 0.0, "loss_total": 0.3374197483062744, "step": 1109 }, { "batch_size": 4, "epoch": 0.4436, "step": 1109, "tokens_per_device": 4360 }, { "epoch": 0.4436, "loss_ce": 0.16687628626823425, "loss_lvr": 0.9521415829658508, "loss_mode_switch": 0.0, "loss_total": 0.26209044456481934, "step": 1109 }, { "batch_size": 1, "epoch": 0.4436, "step": 1109, "tokens_per_device": 4818 }, { "epoch": 0.4436, "loss_ce": 0.0011006058193743229, "loss_lvr": 0.36426684260368347, "loss_mode_switch": 0.0, "loss_total": 0.03752729296684265, "step": 1109 }, { "batch_size": 4, "epoch": 0.4436, "step": 1109, "tokens_per_device": 4484 }, { "epoch": 0.4436, "loss_ce": 0.4623090326786041, "loss_lvr": 0.9102938771247864, "loss_mode_switch": 0.0, "loss_total": 0.5533384084701538, "step": 1109 }, { "batch_size": 4, "epoch": 0.4436, "step": 1109, "tokens_per_device": 1244 }, { "epoch": 0.4436, "loss_ce": 0.8905138969421387, "loss_lvr": 1.0408275127410889, "loss_mode_switch": 0.0, "loss_total": 0.9945966601371765, "step": 1109 }, { "batch_size": 4, "epoch": 0.4436, "step": 1109, "tokens_per_device": 7260 }, { "epoch": 0.4436, "loss_ce": 0.08138822019100189, "loss_lvr": 0.6908166408538818, "loss_mode_switch": 0.0, "loss_total": 0.15046988427639008, "step": 1109 }, { "epoch": 0.444, "grad_norm": 1.32771635055542, "learning_rate": 6.139652121598219e-06, "loss": 0.3373, "step": 1110 }, { "batch_size": 4, "epoch": 0.444, "step": 1110, "tokens_per_device": 10028 }, { "epoch": 0.444, "loss_ce": 0.45733845233917236, "loss_lvr": 1.2684376239776611, "loss_mode_switch": 0.0, "loss_total": 0.5841822028160095, "step": 1110 }, { "batch_size": 4, "epoch": 0.444, "step": 1110, "tokens_per_device": 4780 }, { "epoch": 0.444, "loss_ce": 0.3739743232727051, "loss_lvr": 0.8523138165473938, "loss_mode_switch": 0.0, "loss_total": 0.4592057168483734, "step": 1110 }, { "batch_size": 1, "epoch": 0.444, "step": 1110, "tokens_per_device": 5111 }, { "epoch": 0.444, "loss_ce": 0.009023883379995823, "loss_lvr": 0.3392724394798279, "loss_mode_switch": 0.0, "loss_total": 0.04295112565159798, "step": 1110 }, { "batch_size": 1, "epoch": 0.444, "step": 1110, "tokens_per_device": 5057 }, { "epoch": 0.444, "loss_ce": 0.3847213089466095, "loss_lvr": 0.6590086817741394, "loss_mode_switch": 0.0, "loss_total": 0.45062217116355896, "step": 1110 }, { "batch_size": 4, "epoch": 0.444, "step": 1110, "tokens_per_device": 4908 }, { "epoch": 0.444, "loss_ce": 0.17148873209953308, "loss_lvr": 0.9121224284172058, "loss_mode_switch": 0.0, "loss_total": 0.26270097494125366, "step": 1110 }, { "batch_size": 4, "epoch": 0.444, "step": 1110, "tokens_per_device": 5416 }, { "epoch": 0.444, "loss_ce": 0.49899834394454956, "loss_lvr": 0.8597325682640076, "loss_mode_switch": 0.0, "loss_total": 0.5849716067314148, "step": 1110 }, { "batch_size": 4, "epoch": 0.444, "step": 1110, "tokens_per_device": 1504 }, { "epoch": 0.444, "loss_ce": 0.2560514211654663, "loss_lvr": 1.067548394203186, "loss_mode_switch": 0.0, "loss_total": 0.3628062605857849, "step": 1110 }, { "batch_size": 1, "epoch": 0.444, "step": 1110, "tokens_per_device": 5119 }, { "epoch": 0.444, "loss_ce": 0.05804373323917389, "loss_lvr": 0.42944684624671936, "loss_mode_switch": 0.0, "loss_total": 0.10098841786384583, "step": 1110 }, { "epoch": 0.4444, "grad_norm": 1.7094494104385376, "learning_rate": 6.133344161163012e-06, "loss": 0.3166, "step": 1111 }, { "batch_size": 4, "epoch": 0.4444, "step": 1111, "tokens_per_device": 4616 }, { "epoch": 0.4444, "loss_ce": 0.6279132962226868, "loss_lvr": 1.0080218315124512, "loss_mode_switch": 0.0, "loss_total": 0.7287154793739319, "step": 1111 }, { "batch_size": 4, "epoch": 0.4444, "step": 1111, "tokens_per_device": 4248 }, { "epoch": 0.4444, "loss_ce": 0.17936132848262787, "loss_lvr": 1.1960889101028442, "loss_mode_switch": 0.0, "loss_total": 0.29897022247314453, "step": 1111 }, { "batch_size": 4, "epoch": 0.4444, "step": 1111, "tokens_per_device": 2668 }, { "epoch": 0.4444, "loss_ce": 0.3156382739543915, "loss_lvr": 0.6497024297714233, "loss_mode_switch": 0.0, "loss_total": 0.38060852885246277, "step": 1111 }, { "batch_size": 1, "epoch": 0.4444, "step": 1111, "tokens_per_device": 6416 }, { "epoch": 0.4444, "loss_ce": 0.006519929505884647, "loss_lvr": 0.3588096797466278, "loss_mode_switch": 0.0, "loss_total": 0.042400896549224854, "step": 1111 }, { "batch_size": 1, "epoch": 0.4444, "step": 1111, "tokens_per_device": 4906 }, { "epoch": 0.4444, "loss_ce": 0.05880238115787506, "loss_lvr": 0.4317556619644165, "loss_mode_switch": 0.0, "loss_total": 0.10197794437408447, "step": 1111 }, { "batch_size": 1, "epoch": 0.4444, "step": 1111, "tokens_per_device": 4865 }, { "epoch": 0.4444, "loss_ce": 0.33009248971939087, "loss_lvr": 0.22892913222312927, "loss_mode_switch": 0.0, "loss_total": 0.3529854118824005, "step": 1111 }, { "batch_size": 4, "epoch": 0.4444, "step": 1111, "tokens_per_device": 2692 }, { "epoch": 0.4444, "loss_ce": 0.2843005657196045, "loss_lvr": 0.8524417281150818, "loss_mode_switch": 0.0, "loss_total": 0.36954474449157715, "step": 1111 }, { "batch_size": 4, "epoch": 0.4444, "step": 1111, "tokens_per_device": 2692 }, { "epoch": 0.4444, "loss_ce": 0.230068638920784, "loss_lvr": 0.7573824524879456, "loss_mode_switch": 0.0, "loss_total": 0.30580687522888184, "step": 1111 }, { "epoch": 0.4448, "grad_norm": 1.1808844804763794, "learning_rate": 6.127034298607375e-06, "loss": 0.2761, "step": 1112 }, { "batch_size": 1, "epoch": 0.4448, "step": 1112, "tokens_per_device": 4905 }, { "epoch": 0.4448, "loss_ce": 0.011288349516689777, "loss_lvr": 0.2690786123275757, "loss_mode_switch": 0.0, "loss_total": 0.03819620981812477, "step": 1112 }, { "batch_size": 4, "epoch": 0.4448, "step": 1112, "tokens_per_device": 2920 }, { "epoch": 0.4448, "loss_ce": 0.3652169704437256, "loss_lvr": 0.9615995287895203, "loss_mode_switch": 0.0, "loss_total": 0.46137693524360657, "step": 1112 }, { "batch_size": 4, "epoch": 0.4448, "step": 1112, "tokens_per_device": 5968 }, { "epoch": 0.4448, "loss_ce": 0.42125189304351807, "loss_lvr": 0.7530377507209778, "loss_mode_switch": 0.0, "loss_total": 0.4965556859970093, "step": 1112 }, { "batch_size": 4, "epoch": 0.4448, "step": 1112, "tokens_per_device": 6600 }, { "epoch": 0.4448, "loss_ce": 0.21715658903121948, "loss_lvr": 0.9205679893493652, "loss_mode_switch": 0.0, "loss_total": 0.30921339988708496, "step": 1112 }, { "batch_size": 4, "epoch": 0.4448, "step": 1112, "tokens_per_device": 5232 }, { "epoch": 0.4448, "loss_ce": 0.31629490852355957, "loss_lvr": 0.7114794850349426, "loss_mode_switch": 0.0, "loss_total": 0.38744285702705383, "step": 1112 }, { "batch_size": 4, "epoch": 0.4448, "step": 1112, "tokens_per_device": 9640 }, { "epoch": 0.4448, "loss_ce": 0.07964037358760834, "loss_lvr": 1.0214126110076904, "loss_mode_switch": 0.0, "loss_total": 0.18178163468837738, "step": 1112 }, { "batch_size": 1, "epoch": 0.4448, "step": 1112, "tokens_per_device": 4938 }, { "epoch": 0.4448, "loss_ce": 0.39438092708587646, "loss_lvr": 0.6625819206237793, "loss_mode_switch": 0.0, "loss_total": 0.4606391191482544, "step": 1112 }, { "batch_size": 1, "epoch": 0.4448, "step": 1112, "tokens_per_device": 4850 }, { "epoch": 0.4448, "loss_ce": 0.21477508544921875, "loss_lvr": 0.21999289095401764, "loss_mode_switch": 0.0, "loss_total": 0.23677437007427216, "step": 1112 }, { "epoch": 0.4452, "grad_norm": 1.4788830280303955, "learning_rate": 6.120722544521312e-06, "loss": 0.2924, "step": 1113 }, { "batch_size": 4, "epoch": 0.4452, "step": 1113, "tokens_per_device": 5236 }, { "epoch": 0.4452, "loss_ce": 0.42227768898010254, "loss_lvr": 0.5729466676712036, "loss_mode_switch": 0.0, "loss_total": 0.4795723557472229, "step": 1113 }, { "batch_size": 4, "epoch": 0.4452, "step": 1113, "tokens_per_device": 2624 }, { "epoch": 0.4452, "loss_ce": 0.5339805483818054, "loss_lvr": 0.9992223381996155, "loss_mode_switch": 0.0, "loss_total": 0.6339027881622314, "step": 1113 }, { "batch_size": 1, "epoch": 0.4452, "step": 1113, "tokens_per_device": 4934 }, { "epoch": 0.4452, "loss_ce": 0.07048685848712921, "loss_lvr": 0.8455536961555481, "loss_mode_switch": 0.0, "loss_total": 0.15504223108291626, "step": 1113 }, { "batch_size": 4, "epoch": 0.4452, "step": 1113, "tokens_per_device": 4340 }, { "epoch": 0.4452, "loss_ce": 0.5440807938575745, "loss_lvr": 1.02298104763031, "loss_mode_switch": 0.0, "loss_total": 0.6463788747787476, "step": 1113 }, { "batch_size": 4, "epoch": 0.4452, "step": 1113, "tokens_per_device": 2560 }, { "epoch": 0.4452, "loss_ce": 0.2507741153240204, "loss_lvr": 0.8963724970817566, "loss_mode_switch": 0.0, "loss_total": 0.34041136503219604, "step": 1113 }, { "batch_size": 1, "epoch": 0.4452, "step": 1113, "tokens_per_device": 5144 }, { "epoch": 0.4452, "loss_ce": 0.0035861146170645952, "loss_lvr": 0.47134485840797424, "loss_mode_switch": 0.0, "loss_total": 0.05072059854865074, "step": 1113 }, { "batch_size": 4, "epoch": 0.4452, "step": 1113, "tokens_per_device": 2628 }, { "epoch": 0.4452, "loss_ce": 0.15669701993465424, "loss_lvr": 0.9485464692115784, "loss_mode_switch": 0.0, "loss_total": 0.25155165791511536, "step": 1113 }, { "batch_size": 4, "epoch": 0.4452, "step": 1113, "tokens_per_device": 4252 }, { "epoch": 0.4452, "loss_ce": 0.26322275400161743, "loss_lvr": 1.1031047105789185, "loss_mode_switch": 0.0, "loss_total": 0.3735332190990448, "step": 1113 }, { "epoch": 0.4456, "grad_norm": 1.26278817653656, "learning_rate": 6.114408909497999e-06, "loss": 0.2761, "step": 1114 }, { "batch_size": 4, "epoch": 0.4456, "step": 1114, "tokens_per_device": 2596 }, { "epoch": 0.4456, "loss_ce": 0.2582750916481018, "loss_lvr": 0.8572698831558228, "loss_mode_switch": 0.0, "loss_total": 0.3440020680427551, "step": 1114 }, { "batch_size": 4, "epoch": 0.4456, "step": 1114, "tokens_per_device": 2632 }, { "epoch": 0.4456, "loss_ce": 0.3779442012310028, "loss_lvr": 0.6997079849243164, "loss_mode_switch": 0.0, "loss_total": 0.4479150176048279, "step": 1114 }, { "batch_size": 4, "epoch": 0.4456, "step": 1114, "tokens_per_device": 14140 }, { "epoch": 0.4456, "loss_ce": 0.43001386523246765, "loss_lvr": 1.0763564109802246, "loss_mode_switch": 0.0, "loss_total": 0.5376495122909546, "step": 1114 }, { "batch_size": 1, "epoch": 0.4456, "step": 1114, "tokens_per_device": 5111 }, { "epoch": 0.4456, "loss_ce": 0.05241771042346954, "loss_lvr": 0.5246394276618958, "loss_mode_switch": 0.0, "loss_total": 0.1048816591501236, "step": 1114 }, { "batch_size": 4, "epoch": 0.4456, "step": 1114, "tokens_per_device": 5832 }, { "epoch": 0.4456, "loss_ce": 0.17544113099575043, "loss_lvr": 0.6569278836250305, "loss_mode_switch": 0.0, "loss_total": 0.2411339282989502, "step": 1114 }, { "batch_size": 4, "epoch": 0.4456, "step": 1114, "tokens_per_device": 2820 }, { "epoch": 0.4456, "loss_ce": 0.3785013258457184, "loss_lvr": 0.7777734398841858, "loss_mode_switch": 0.0, "loss_total": 0.4562786817550659, "step": 1114 }, { "batch_size": 1, "epoch": 0.4456, "step": 1114, "tokens_per_device": 5158 }, { "epoch": 0.4456, "loss_ce": 0.0533420667052269, "loss_lvr": 0.5283359289169312, "loss_mode_switch": 0.0, "loss_total": 0.10617566108703613, "step": 1114 }, { "batch_size": 4, "epoch": 0.4456, "step": 1114, "tokens_per_device": 9776 }, { "epoch": 0.4456, "loss_ce": 0.314124196767807, "loss_lvr": 0.5466040968894958, "loss_mode_switch": 0.0, "loss_total": 0.3687846064567566, "step": 1114 }, { "epoch": 0.446, "grad_norm": 1.320359230041504, "learning_rate": 6.108093404133772e-06, "loss": 0.3474, "step": 1115 }, { "batch_size": 4, "epoch": 0.446, "step": 1115, "tokens_per_device": 4268 }, { "epoch": 0.446, "loss_ce": 0.15596014261245728, "loss_lvr": 0.9811294078826904, "loss_mode_switch": 0.0, "loss_total": 0.2540730834007263, "step": 1115 }, { "batch_size": 1, "epoch": 0.446, "step": 1115, "tokens_per_device": 4905 }, { "epoch": 0.446, "loss_ce": 0.16914187371730804, "loss_lvr": 1.7329882383346558, "loss_mode_switch": 0.0, "loss_total": 0.3424406945705414, "step": 1115 }, { "batch_size": 1, "epoch": 0.446, "step": 1115, "tokens_per_device": 5104 }, { "epoch": 0.446, "loss_ce": 0.012493627145886421, "loss_lvr": 0.3502158224582672, "loss_mode_switch": 0.0, "loss_total": 0.04751521348953247, "step": 1115 }, { "batch_size": 4, "epoch": 0.446, "step": 1115, "tokens_per_device": 1672 }, { "epoch": 0.446, "loss_ce": 0.6737828254699707, "loss_lvr": 0.902283251285553, "loss_mode_switch": 0.0, "loss_total": 0.7640111446380615, "step": 1115 }, { "batch_size": 1, "epoch": 0.446, "step": 1115, "tokens_per_device": 4885 }, { "epoch": 0.446, "loss_ce": 0.07740914076566696, "loss_lvr": 0.4271024465560913, "loss_mode_switch": 0.0, "loss_total": 0.12011938542127609, "step": 1115 }, { "batch_size": 4, "epoch": 0.446, "step": 1115, "tokens_per_device": 8648 }, { "epoch": 0.446, "loss_ce": 0.29271697998046875, "loss_lvr": 0.7066671848297119, "loss_mode_switch": 0.0, "loss_total": 0.3633837103843689, "step": 1115 }, { "batch_size": 1, "epoch": 0.446, "step": 1115, "tokens_per_device": 5115 }, { "epoch": 0.446, "loss_ce": 0.014096193946897984, "loss_lvr": 0.37159252166748047, "loss_mode_switch": 0.0, "loss_total": 0.051255445927381516, "step": 1115 }, { "batch_size": 1, "epoch": 0.446, "step": 1115, "tokens_per_device": 4191 }, { "epoch": 0.446, "loss_ce": 0.09227211773395538, "loss_lvr": 0.44817987084388733, "loss_mode_switch": 0.0, "loss_total": 0.13709010183811188, "step": 1115 }, { "epoch": 0.4464, "grad_norm": 1.2390329837799072, "learning_rate": 6.101776039028104e-06, "loss": 0.3078, "step": 1116 }, { "batch_size": 1, "epoch": 0.4464, "step": 1116, "tokens_per_device": 5155 }, { "epoch": 0.4464, "loss_ce": 0.25337207317352295, "loss_lvr": 1.0948295593261719, "loss_mode_switch": 0.0, "loss_total": 0.3628550171852112, "step": 1116 }, { "batch_size": 4, "epoch": 0.4464, "step": 1116, "tokens_per_device": 14992 }, { "epoch": 0.4464, "loss_ce": 0.09083252400159836, "loss_lvr": 0.6532405018806458, "loss_mode_switch": 0.0, "loss_total": 0.15615656971931458, "step": 1116 }, { "batch_size": 1, "epoch": 0.4464, "step": 1116, "tokens_per_device": 5160 }, { "epoch": 0.4464, "loss_ce": 0.09197010099887848, "loss_lvr": 0.4887799024581909, "loss_mode_switch": 0.0, "loss_total": 0.1408481001853943, "step": 1116 }, { "batch_size": 1, "epoch": 0.4464, "step": 1116, "tokens_per_device": 5412 }, { "epoch": 0.4464, "loss_ce": 0.18653517961502075, "loss_lvr": 0.3936430513858795, "loss_mode_switch": 0.0, "loss_total": 0.22589948773384094, "step": 1116 }, { "batch_size": 4, "epoch": 0.4464, "step": 1116, "tokens_per_device": 4924 }, { "epoch": 0.4464, "loss_ce": 0.7063978314399719, "loss_lvr": 0.8829274773597717, "loss_mode_switch": 0.0, "loss_total": 0.7946906089782715, "step": 1116 }, { "batch_size": 1, "epoch": 0.4464, "step": 1116, "tokens_per_device": 4761 }, { "epoch": 0.4464, "loss_ce": 0.023133328184485435, "loss_lvr": 0.23663124442100525, "loss_mode_switch": 0.0, "loss_total": 0.04679645225405693, "step": 1116 }, { "batch_size": 4, "epoch": 0.4464, "step": 1116, "tokens_per_device": 4236 }, { "epoch": 0.4464, "loss_ce": 0.05075285956263542, "loss_lvr": 0.7770035862922668, "loss_mode_switch": 0.0, "loss_total": 0.12845322489738464, "step": 1116 }, { "batch_size": 4, "epoch": 0.4464, "step": 1116, "tokens_per_device": 7720 }, { "epoch": 0.4464, "loss_ce": 0.5924046039581299, "loss_lvr": 0.7011151909828186, "loss_mode_switch": 0.0, "loss_total": 0.6625161170959473, "step": 1116 }, { "epoch": 0.4468, "grad_norm": 1.3573106527328491, "learning_rate": 6.095456824783592e-06, "loss": 0.32, "step": 1117 }, { "batch_size": 1, "epoch": 0.4468, "step": 1117, "tokens_per_device": 5070 }, { "epoch": 0.4468, "loss_ce": 0.01336854137480259, "loss_lvr": 0.6635656356811523, "loss_mode_switch": 0.0, "loss_total": 0.07972510159015656, "step": 1117 }, { "batch_size": 1, "epoch": 0.4468, "step": 1117, "tokens_per_device": 4885 }, { "epoch": 0.4468, "loss_ce": 0.022564221173524857, "loss_lvr": 0.9850649833679199, "loss_mode_switch": 0.0, "loss_total": 0.1210707277059555, "step": 1117 }, { "batch_size": 1, "epoch": 0.4468, "step": 1117, "tokens_per_device": 4858 }, { "epoch": 0.4468, "loss_ce": 0.0019857946317642927, "loss_lvr": 0.4004250466823578, "loss_mode_switch": 0.0, "loss_total": 0.04202830046415329, "step": 1117 }, { "batch_size": 4, "epoch": 0.4468, "step": 1117, "tokens_per_device": 10532 }, { "epoch": 0.4468, "loss_ce": 0.04806814715266228, "loss_lvr": 0.7098414301872253, "loss_mode_switch": 0.0, "loss_total": 0.11905229091644287, "step": 1117 }, { "batch_size": 1, "epoch": 0.4468, "step": 1117, "tokens_per_device": 4897 }, { "epoch": 0.4468, "loss_ce": 0.013630256988108158, "loss_lvr": 0.2601848840713501, "loss_mode_switch": 0.0, "loss_total": 0.03964874520897865, "step": 1117 }, { "batch_size": 4, "epoch": 0.4468, "step": 1117, "tokens_per_device": 1480 }, { "epoch": 0.4468, "loss_ce": 0.5102938413619995, "loss_lvr": 0.9663202166557312, "loss_mode_switch": 0.0, "loss_total": 0.6069258451461792, "step": 1117 }, { "batch_size": 1, "epoch": 0.4468, "step": 1117, "tokens_per_device": 5237 }, { "epoch": 0.4468, "loss_ce": 0.07474806159734726, "loss_lvr": 0.5296062231063843, "loss_mode_switch": 0.0, "loss_total": 0.12770868837833405, "step": 1117 }, { "batch_size": 4, "epoch": 0.4468, "step": 1117, "tokens_per_device": 5052 }, { "epoch": 0.4468, "loss_ce": 0.29530033469200134, "loss_lvr": 1.0535888671875, "loss_mode_switch": 0.0, "loss_total": 0.4006592333316803, "step": 1117 }, { "epoch": 0.4472, "grad_norm": 1.3149265050888062, "learning_rate": 6.089135772005932e-06, "loss": 0.3133, "step": 1118 }, { "batch_size": 4, "epoch": 0.4472, "step": 1118, "tokens_per_device": 2804 }, { "epoch": 0.4472, "loss_ce": 0.31598666310310364, "loss_lvr": 0.8314645290374756, "loss_mode_switch": 0.0, "loss_total": 0.3991331160068512, "step": 1118 }, { "batch_size": 4, "epoch": 0.4472, "step": 1118, "tokens_per_device": 2816 }, { "epoch": 0.4472, "loss_ce": 0.2074039727449417, "loss_lvr": 0.7621623873710632, "loss_mode_switch": 0.0, "loss_total": 0.2836202085018158, "step": 1118 }, { "batch_size": 4, "epoch": 0.4472, "step": 1118, "tokens_per_device": 2796 }, { "epoch": 0.4472, "loss_ce": 0.402530312538147, "loss_lvr": 1.0408014059066772, "loss_mode_switch": 0.0, "loss_total": 0.5066104531288147, "step": 1118 }, { "batch_size": 4, "epoch": 0.4472, "step": 1118, "tokens_per_device": 5468 }, { "epoch": 0.4472, "loss_ce": 0.07567472755908966, "loss_lvr": 0.5919433832168579, "loss_mode_switch": 0.0, "loss_total": 0.1348690688610077, "step": 1118 }, { "batch_size": 4, "epoch": 0.4472, "step": 1118, "tokens_per_device": 2616 }, { "epoch": 0.4472, "loss_ce": 0.5537890791893005, "loss_lvr": 0.8511514067649841, "loss_mode_switch": 0.0, "loss_total": 0.6389042139053345, "step": 1118 }, { "batch_size": 4, "epoch": 0.4472, "step": 1118, "tokens_per_device": 4488 }, { "epoch": 0.4472, "loss_ce": 0.42006340622901917, "loss_lvr": 0.7019317150115967, "loss_mode_switch": 0.0, "loss_total": 0.49025657773017883, "step": 1118 }, { "batch_size": 4, "epoch": 0.4472, "step": 1118, "tokens_per_device": 1380 }, { "epoch": 0.4472, "loss_ce": 0.09055352210998535, "loss_lvr": 0.9757746458053589, "loss_mode_switch": 0.0, "loss_total": 0.18813098967075348, "step": 1118 }, { "batch_size": 4, "epoch": 0.4472, "step": 1118, "tokens_per_device": 4208 }, { "epoch": 0.4472, "loss_ce": 0.027924660593271255, "loss_lvr": 1.0594488382339478, "loss_mode_switch": 0.0, "loss_total": 0.13386954367160797, "step": 1118 }, { "epoch": 0.4476, "grad_norm": 2.993147611618042, "learning_rate": 6.0828128913039085e-06, "loss": 0.3247, "step": 1119 }, { "batch_size": 4, "epoch": 0.4476, "step": 1119, "tokens_per_device": 3928 }, { "epoch": 0.4476, "loss_ce": 0.16830790042877197, "loss_lvr": 1.0360091924667358, "loss_mode_switch": 0.0, "loss_total": 0.27190881967544556, "step": 1119 }, { "batch_size": 1, "epoch": 0.4476, "step": 1119, "tokens_per_device": 5255 }, { "epoch": 0.4476, "loss_ce": 0.11557776480913162, "loss_lvr": 0.4493209421634674, "loss_mode_switch": 0.0, "loss_total": 0.16050985455513, "step": 1119 }, { "batch_size": 1, "epoch": 0.4476, "step": 1119, "tokens_per_device": 5007 }, { "epoch": 0.4476, "loss_ce": 1.1659117937088013, "loss_lvr": 0.6682834625244141, "loss_mode_switch": 0.0, "loss_total": 1.2327401638031006, "step": 1119 }, { "batch_size": 4, "epoch": 0.4476, "step": 1119, "tokens_per_device": 4584 }, { "epoch": 0.4476, "loss_ce": 0.32676249742507935, "loss_lvr": 1.0269654989242554, "loss_mode_switch": 0.0, "loss_total": 0.4294590353965759, "step": 1119 }, { "batch_size": 4, "epoch": 0.4476, "step": 1119, "tokens_per_device": 2704 }, { "epoch": 0.4476, "loss_ce": 0.09873895347118378, "loss_lvr": 0.7120720148086548, "loss_mode_switch": 0.0, "loss_total": 0.16994616389274597, "step": 1119 }, { "batch_size": 1, "epoch": 0.4476, "step": 1119, "tokens_per_device": 4873 }, { "epoch": 0.4476, "loss_ce": 0.09516013413667679, "loss_lvr": 0.1644161194562912, "loss_mode_switch": 0.0, "loss_total": 0.11160174757242203, "step": 1119 }, { "batch_size": 4, "epoch": 0.4476, "step": 1119, "tokens_per_device": 4632 }, { "epoch": 0.4476, "loss_ce": 0.02465054765343666, "loss_lvr": 1.0967379808425903, "loss_mode_switch": 0.0, "loss_total": 0.1343243420124054, "step": 1119 }, { "batch_size": 4, "epoch": 0.4476, "step": 1119, "tokens_per_device": 9652 }, { "epoch": 0.4476, "loss_ce": 0.2999078035354614, "loss_lvr": 0.7655569314956665, "loss_mode_switch": 0.0, "loss_total": 0.37646350264549255, "step": 1119 }, { "epoch": 0.448, "grad_norm": 1.4986943006515503, "learning_rate": 6.076488193289375e-06, "loss": 0.3014, "step": 1120 }, { "batch_size": 1, "epoch": 0.448, "step": 1120, "tokens_per_device": 4906 }, { "epoch": 0.448, "loss_ce": 0.2539229691028595, "loss_lvr": 0.7249690890312195, "loss_mode_switch": 0.0, "loss_total": 0.3264198899269104, "step": 1120 }, { "batch_size": 4, "epoch": 0.448, "step": 1120, "tokens_per_device": 6812 }, { "epoch": 0.448, "loss_ce": 0.4331570565700531, "loss_lvr": 0.8428550958633423, "loss_mode_switch": 0.0, "loss_total": 0.5174425840377808, "step": 1120 }, { "batch_size": 4, "epoch": 0.448, "step": 1120, "tokens_per_device": 6264 }, { "epoch": 0.448, "loss_ce": 0.3612515926361084, "loss_lvr": 0.6740816831588745, "loss_mode_switch": 0.0, "loss_total": 0.4286597669124603, "step": 1120 }, { "batch_size": 4, "epoch": 0.448, "step": 1120, "tokens_per_device": 2720 }, { "epoch": 0.448, "loss_ce": 0.4643974006175995, "loss_lvr": 0.884239137172699, "loss_mode_switch": 0.0, "loss_total": 0.5528213381767273, "step": 1120 }, { "batch_size": 4, "epoch": 0.448, "step": 1120, "tokens_per_device": 10828 }, { "epoch": 0.448, "loss_ce": 0.2643308937549591, "loss_lvr": 0.6464073657989502, "loss_mode_switch": 0.0, "loss_total": 0.32897162437438965, "step": 1120 }, { "batch_size": 4, "epoch": 0.448, "step": 1120, "tokens_per_device": 3836 }, { "epoch": 0.448, "loss_ce": 0.168696328997612, "loss_lvr": 1.0785881280899048, "loss_mode_switch": 0.0, "loss_total": 0.2765551507472992, "step": 1120 }, { "batch_size": 4, "epoch": 0.448, "step": 1120, "tokens_per_device": 1456 }, { "epoch": 0.448, "loss_ce": 0.32586580514907837, "loss_lvr": 1.0240719318389893, "loss_mode_switch": 0.0, "loss_total": 0.4282729923725128, "step": 1120 }, { "batch_size": 4, "epoch": 0.448, "step": 1120, "tokens_per_device": 4292 }, { "epoch": 0.448, "loss_ce": 0.3912396728992462, "loss_lvr": 0.9859801530838013, "loss_mode_switch": 0.0, "loss_total": 0.4898377060890198, "step": 1120 }, { "epoch": 0.4484, "grad_norm": 1.2507176399230957, "learning_rate": 6.070161688577233e-06, "loss": 0.2712, "step": 1121 }, { "batch_size": 4, "epoch": 0.4484, "step": 1121, "tokens_per_device": 5328 }, { "epoch": 0.4484, "loss_ce": 0.391826868057251, "loss_lvr": 1.1786603927612305, "loss_mode_switch": 0.0, "loss_total": 0.509692907333374, "step": 1121 }, { "batch_size": 4, "epoch": 0.4484, "step": 1121, "tokens_per_device": 3828 }, { "epoch": 0.4484, "loss_ce": 0.07597475498914719, "loss_lvr": 1.1508897542953491, "loss_mode_switch": 0.0, "loss_total": 0.19106373190879822, "step": 1121 }, { "batch_size": 4, "epoch": 0.4484, "step": 1121, "tokens_per_device": 3748 }, { "epoch": 0.4484, "loss_ce": 0.018432872369885445, "loss_lvr": 0.9596155881881714, "loss_mode_switch": 0.0, "loss_total": 0.11439443379640579, "step": 1121 }, { "batch_size": 1, "epoch": 0.4484, "step": 1121, "tokens_per_device": 5138 }, { "epoch": 0.4484, "loss_ce": 0.00034404711914248765, "loss_lvr": 0.23224511742591858, "loss_mode_switch": 0.0, "loss_total": 0.02356855943799019, "step": 1121 }, { "batch_size": 4, "epoch": 0.4484, "step": 1121, "tokens_per_device": 1924 }, { "epoch": 0.4484, "loss_ce": 0.20199541747570038, "loss_lvr": 1.0047158002853394, "loss_mode_switch": 0.0, "loss_total": 0.3024669885635376, "step": 1121 }, { "batch_size": 1, "epoch": 0.4484, "step": 1121, "tokens_per_device": 4850 }, { "epoch": 0.4484, "loss_ce": 0.005148422904312611, "loss_lvr": 0.28680935502052307, "loss_mode_switch": 0.0, "loss_total": 0.03382935747504234, "step": 1121 }, { "batch_size": 4, "epoch": 0.4484, "step": 1121, "tokens_per_device": 4116 }, { "epoch": 0.4484, "loss_ce": 0.23290294408798218, "loss_lvr": 1.1068321466445923, "loss_mode_switch": 0.0, "loss_total": 0.34358614683151245, "step": 1121 }, { "batch_size": 1, "epoch": 0.4484, "step": 1121, "tokens_per_device": 4899 }, { "epoch": 0.4484, "loss_ce": 0.05809244513511658, "loss_lvr": 0.36646899580955505, "loss_mode_switch": 0.0, "loss_total": 0.09473934769630432, "step": 1121 }, { "epoch": 0.4488, "grad_norm": 1.2568604946136475, "learning_rate": 6.0638333877854185e-06, "loss": 0.2857, "step": 1122 }, { "batch_size": 4, "epoch": 0.4488, "step": 1122, "tokens_per_device": 4196 }, { "epoch": 0.4488, "loss_ce": 0.06724915653467178, "loss_lvr": 0.7264693379402161, "loss_mode_switch": 0.0, "loss_total": 0.13989609479904175, "step": 1122 }, { "batch_size": 1, "epoch": 0.4488, "step": 1122, "tokens_per_device": 4943 }, { "epoch": 0.4488, "loss_ce": 0.0032779036555439234, "loss_lvr": 0.5646726489067078, "loss_mode_switch": 0.0, "loss_total": 0.059745170176029205, "step": 1122 }, { "batch_size": 1, "epoch": 0.4488, "step": 1122, "tokens_per_device": 5162 }, { "epoch": 0.4488, "loss_ce": 0.02161998860538006, "loss_lvr": 0.3606165647506714, "loss_mode_switch": 0.0, "loss_total": 0.05768164247274399, "step": 1122 }, { "batch_size": 4, "epoch": 0.4488, "step": 1122, "tokens_per_device": 5420 }, { "epoch": 0.4488, "loss_ce": 0.08229852467775345, "loss_lvr": 0.6631654500961304, "loss_mode_switch": 0.0, "loss_total": 0.1486150622367859, "step": 1122 }, { "batch_size": 4, "epoch": 0.4488, "step": 1122, "tokens_per_device": 5076 }, { "epoch": 0.4488, "loss_ce": 0.5702622532844543, "loss_lvr": 0.7989829182624817, "loss_mode_switch": 0.0, "loss_total": 0.650160551071167, "step": 1122 }, { "batch_size": 1, "epoch": 0.4488, "step": 1122, "tokens_per_device": 5457 }, { "epoch": 0.4488, "loss_ce": 0.004145990591496229, "loss_lvr": 0.583990752696991, "loss_mode_switch": 0.0, "loss_total": 0.06254506856203079, "step": 1122 }, { "batch_size": 4, "epoch": 0.4488, "step": 1122, "tokens_per_device": 1496 }, { "epoch": 0.4488, "loss_ce": 0.9493494629859924, "loss_lvr": 0.9208176136016846, "loss_mode_switch": 0.0, "loss_total": 1.041431188583374, "step": 1122 }, { "batch_size": 4, "epoch": 0.4488, "step": 1122, "tokens_per_device": 1252 }, { "epoch": 0.4488, "loss_ce": 0.40060943365097046, "loss_lvr": 1.3630746603012085, "loss_mode_switch": 0.0, "loss_total": 0.5369169116020203, "step": 1122 }, { "epoch": 0.4492, "grad_norm": 1.3659989833831787, "learning_rate": 6.057503301534875e-06, "loss": 0.3182, "step": 1123 }, { "batch_size": 4, "epoch": 0.4492, "step": 1123, "tokens_per_device": 1488 }, { "epoch": 0.4492, "loss_ce": 0.31155288219451904, "loss_lvr": 0.9514811038970947, "loss_mode_switch": 0.0, "loss_total": 0.406700998544693, "step": 1123 }, { "batch_size": 4, "epoch": 0.4492, "step": 1123, "tokens_per_device": 1364 }, { "epoch": 0.4492, "loss_ce": 0.3038843274116516, "loss_lvr": 1.060302972793579, "loss_mode_switch": 0.0, "loss_total": 0.40991461277008057, "step": 1123 }, { "batch_size": 4, "epoch": 0.4492, "step": 1123, "tokens_per_device": 1416 }, { "epoch": 0.4492, "loss_ce": 0.20912288129329681, "loss_lvr": 0.8383790254592896, "loss_mode_switch": 0.0, "loss_total": 0.2929607927799225, "step": 1123 }, { "batch_size": 4, "epoch": 0.4492, "step": 1123, "tokens_per_device": 4112 }, { "epoch": 0.4492, "loss_ce": 0.10940489917993546, "loss_lvr": 0.7090309858322144, "loss_mode_switch": 0.0, "loss_total": 0.180307999253273, "step": 1123 }, { "batch_size": 4, "epoch": 0.4492, "step": 1123, "tokens_per_device": 3744 }, { "epoch": 0.4492, "loss_ce": 0.22546762228012085, "loss_lvr": 1.0966758728027344, "loss_mode_switch": 0.0, "loss_total": 0.33513522148132324, "step": 1123 }, { "batch_size": 4, "epoch": 0.4492, "step": 1123, "tokens_per_device": 4260 }, { "epoch": 0.4492, "loss_ce": 0.16267505288124084, "loss_lvr": 0.6668334007263184, "loss_mode_switch": 0.0, "loss_total": 0.22935840487480164, "step": 1123 }, { "batch_size": 4, "epoch": 0.4492, "step": 1123, "tokens_per_device": 14076 }, { "epoch": 0.4492, "loss_ce": 0.2143363058567047, "loss_lvr": 0.9842475056648254, "loss_mode_switch": 0.0, "loss_total": 0.3127610683441162, "step": 1123 }, { "batch_size": 4, "epoch": 0.4492, "step": 1123, "tokens_per_device": 3824 }, { "epoch": 0.4492, "loss_ce": 0.3826196491718292, "loss_lvr": 0.936424195766449, "loss_mode_switch": 0.0, "loss_total": 0.47626206278800964, "step": 1123 }, { "epoch": 0.4496, "grad_norm": 1.2406811714172363, "learning_rate": 6.051171440449555e-06, "loss": 0.2822, "step": 1124 }, { "batch_size": 1, "epoch": 0.4496, "step": 1124, "tokens_per_device": 4935 }, { "epoch": 0.4496, "loss_ce": 0.11813843250274658, "loss_lvr": 0.3180544078350067, "loss_mode_switch": 0.0, "loss_total": 0.14994387328624725, "step": 1124 }, { "batch_size": 1, "epoch": 0.4496, "step": 1124, "tokens_per_device": 5088 }, { "epoch": 0.4496, "loss_ce": 0.00933138933032751, "loss_lvr": 0.6325027346611023, "loss_mode_switch": 0.0, "loss_total": 0.07258166372776031, "step": 1124 }, { "batch_size": 1, "epoch": 0.4496, "step": 1124, "tokens_per_device": 4740 }, { "epoch": 0.4496, "loss_ce": 0.01322279591113329, "loss_lvr": 0.19624987244606018, "loss_mode_switch": 0.0, "loss_total": 0.03284778445959091, "step": 1124 }, { "batch_size": 4, "epoch": 0.4496, "step": 1124, "tokens_per_device": 4212 }, { "epoch": 0.4496, "loss_ce": 0.26001518964767456, "loss_lvr": 0.93886399269104, "loss_mode_switch": 0.0, "loss_total": 0.35390159487724304, "step": 1124 }, { "batch_size": 1, "epoch": 0.4496, "step": 1124, "tokens_per_device": 5175 }, { "epoch": 0.4496, "loss_ce": 0.06140566244721413, "loss_lvr": 1.0999464988708496, "loss_mode_switch": 0.0, "loss_total": 0.1714003086090088, "step": 1124 }, { "batch_size": 4, "epoch": 0.4496, "step": 1124, "tokens_per_device": 4040 }, { "epoch": 0.4496, "loss_ce": 0.21024547517299652, "loss_lvr": 0.946442723274231, "loss_mode_switch": 0.0, "loss_total": 0.3048897385597229, "step": 1124 }, { "batch_size": 4, "epoch": 0.4496, "step": 1124, "tokens_per_device": 4432 }, { "epoch": 0.4496, "loss_ce": 0.4383338391780853, "loss_lvr": 0.6149542927742004, "loss_mode_switch": 0.0, "loss_total": 0.4998292624950409, "step": 1124 }, { "batch_size": 1, "epoch": 0.4496, "step": 1124, "tokens_per_device": 4892 }, { "epoch": 0.4496, "loss_ce": 0.19458317756652832, "loss_lvr": 0.5462732315063477, "loss_mode_switch": 0.0, "loss_total": 0.24921050667762756, "step": 1124 }, { "epoch": 0.45, "grad_norm": 1.3412092924118042, "learning_rate": 6.044837815156377e-06, "loss": 0.2626, "step": 1125 }, { "batch_size": 4, "epoch": 0.45, "step": 1125, "tokens_per_device": 9016 }, { "epoch": 0.45, "loss_ce": 0.10278363525867462, "loss_lvr": 0.7432579398155212, "loss_mode_switch": 0.0, "loss_total": 0.17710942029953003, "step": 1125 }, { "batch_size": 4, "epoch": 0.45, "step": 1125, "tokens_per_device": 4744 }, { "epoch": 0.45, "loss_ce": 0.08956855535507202, "loss_lvr": 0.8783450126647949, "loss_mode_switch": 0.0, "loss_total": 0.177403062582016, "step": 1125 }, { "batch_size": 4, "epoch": 0.45, "step": 1125, "tokens_per_device": 2212 }, { "epoch": 0.45, "loss_ce": 0.5955352187156677, "loss_lvr": 0.7692118883132935, "loss_mode_switch": 0.0, "loss_total": 0.6724563837051392, "step": 1125 }, { "batch_size": 4, "epoch": 0.45, "step": 1125, "tokens_per_device": 2720 }, { "epoch": 0.45, "loss_ce": 0.618076741695404, "loss_lvr": 0.882770299911499, "loss_mode_switch": 0.0, "loss_total": 0.7063537836074829, "step": 1125 }, { "batch_size": 4, "epoch": 0.45, "step": 1125, "tokens_per_device": 11032 }, { "epoch": 0.45, "loss_ce": 0.13346344232559204, "loss_lvr": 0.7251911163330078, "loss_mode_switch": 0.0, "loss_total": 0.20598256587982178, "step": 1125 }, { "batch_size": 4, "epoch": 0.45, "step": 1125, "tokens_per_device": 11008 }, { "epoch": 0.45, "loss_ce": 0.029164474457502365, "loss_lvr": 0.6800840497016907, "loss_mode_switch": 0.0, "loss_total": 0.09717288613319397, "step": 1125 }, { "batch_size": 4, "epoch": 0.45, "step": 1125, "tokens_per_device": 4208 }, { "epoch": 0.45, "loss_ce": 0.018219104036688805, "loss_lvr": 1.2444428205490112, "loss_mode_switch": 0.0, "loss_total": 0.1426633894443512, "step": 1125 }, { "batch_size": 4, "epoch": 0.45, "step": 1125, "tokens_per_device": 2680 }, { "epoch": 0.45, "loss_ce": 0.4108063578605652, "loss_lvr": 1.312200665473938, "loss_mode_switch": 0.0, "loss_total": 0.5420264005661011, "step": 1125 }, { "epoch": 0.4504, "grad_norm": 1.5196174383163452, "learning_rate": 6.038502436285227e-06, "loss": 0.2822, "step": 1126 }, { "batch_size": 4, "epoch": 0.4504, "step": 1126, "tokens_per_device": 3208 }, { "epoch": 0.4504, "loss_ce": 0.13060924410820007, "loss_lvr": 0.9613748788833618, "loss_mode_switch": 0.0, "loss_total": 0.22674673795700073, "step": 1126 }, { "batch_size": 1, "epoch": 0.4504, "step": 1126, "tokens_per_device": 5724 }, { "epoch": 0.4504, "loss_ce": 0.31482839584350586, "loss_lvr": 0.5324968695640564, "loss_mode_switch": 0.0, "loss_total": 0.3680780827999115, "step": 1126 }, { "batch_size": 4, "epoch": 0.4504, "step": 1126, "tokens_per_device": 4240 }, { "epoch": 0.4504, "loss_ce": 0.3447074592113495, "loss_lvr": 1.3040474653244019, "loss_mode_switch": 0.0, "loss_total": 0.4751121997833252, "step": 1126 }, { "batch_size": 4, "epoch": 0.4504, "step": 1126, "tokens_per_device": 2632 }, { "epoch": 0.4504, "loss_ce": 0.28971800208091736, "loss_lvr": 0.816838800907135, "loss_mode_switch": 0.0, "loss_total": 0.3714018762111664, "step": 1126 }, { "batch_size": 1, "epoch": 0.4504, "step": 1126, "tokens_per_device": 4889 }, { "epoch": 0.4504, "loss_ce": 0.011843920685350895, "loss_lvr": 0.33549800515174866, "loss_mode_switch": 0.0, "loss_total": 0.045393720269203186, "step": 1126 }, { "batch_size": 4, "epoch": 0.4504, "step": 1126, "tokens_per_device": 4836 }, { "epoch": 0.4504, "loss_ce": 0.09291687607765198, "loss_lvr": 0.8867552876472473, "loss_mode_switch": 0.0, "loss_total": 0.1815924048423767, "step": 1126 }, { "batch_size": 1, "epoch": 0.4504, "step": 1126, "tokens_per_device": 4864 }, { "epoch": 0.4504, "loss_ce": 0.015724310651421547, "loss_lvr": 0.277042031288147, "loss_mode_switch": 0.0, "loss_total": 0.043428514152765274, "step": 1126 }, { "batch_size": 4, "epoch": 0.4504, "step": 1126, "tokens_per_device": 4276 }, { "epoch": 0.4504, "loss_ce": 0.6036017537117004, "loss_lvr": 0.6955233216285706, "loss_mode_switch": 0.0, "loss_total": 0.6731541156768799, "step": 1126 }, { "epoch": 0.4508, "grad_norm": 1.2792953252792358, "learning_rate": 6.032165314468935e-06, "loss": 0.28, "step": 1127 }, { "batch_size": 4, "epoch": 0.4508, "step": 1127, "tokens_per_device": 3428 }, { "epoch": 0.4508, "loss_ce": 0.4723990261554718, "loss_lvr": 0.843078076839447, "loss_mode_switch": 0.0, "loss_total": 0.5567068457603455, "step": 1127 }, { "batch_size": 1, "epoch": 0.4508, "step": 1127, "tokens_per_device": 5108 }, { "epoch": 0.4508, "loss_ce": 0.002005329355597496, "loss_lvr": 0.3243296146392822, "loss_mode_switch": 0.0, "loss_total": 0.03443828970193863, "step": 1127 }, { "batch_size": 1, "epoch": 0.4508, "step": 1127, "tokens_per_device": 5014 }, { "epoch": 0.4508, "loss_ce": 0.0436408706009388, "loss_lvr": 1.0753589868545532, "loss_mode_switch": 0.0, "loss_total": 0.15117676556110382, "step": 1127 }, { "batch_size": 4, "epoch": 0.4508, "step": 1127, "tokens_per_device": 4572 }, { "epoch": 0.4508, "loss_ce": 0.061783090233802795, "loss_lvr": 1.1811072826385498, "loss_mode_switch": 0.0, "loss_total": 0.17989382147789001, "step": 1127 }, { "batch_size": 4, "epoch": 0.4508, "step": 1127, "tokens_per_device": 1924 }, { "epoch": 0.4508, "loss_ce": 0.5660059452056885, "loss_lvr": 1.0894702672958374, "loss_mode_switch": 0.0, "loss_total": 0.6749529838562012, "step": 1127 }, { "batch_size": 4, "epoch": 0.4508, "step": 1127, "tokens_per_device": 2680 }, { "epoch": 0.4508, "loss_ce": 0.11564821749925613, "loss_lvr": 0.8338658213615417, "loss_mode_switch": 0.0, "loss_total": 0.19903481006622314, "step": 1127 }, { "batch_size": 4, "epoch": 0.4508, "step": 1127, "tokens_per_device": 5896 }, { "epoch": 0.4508, "loss_ce": 0.364045649766922, "loss_lvr": 0.8013330101966858, "loss_mode_switch": 0.0, "loss_total": 0.4441789388656616, "step": 1127 }, { "batch_size": 4, "epoch": 0.4508, "step": 1127, "tokens_per_device": 4148 }, { "epoch": 0.4508, "loss_ce": 0.22403214871883392, "loss_lvr": 0.7541830539703369, "loss_mode_switch": 0.0, "loss_total": 0.29945045709609985, "step": 1127 }, { "epoch": 0.4512, "grad_norm": 1.4092092514038086, "learning_rate": 6.025826460343252e-06, "loss": 0.2987, "step": 1128 }, { "batch_size": 4, "epoch": 0.4512, "step": 1128, "tokens_per_device": 4260 }, { "epoch": 0.4512, "loss_ce": 0.466202974319458, "loss_lvr": 0.8626462817192078, "loss_mode_switch": 0.0, "loss_total": 0.5524675846099854, "step": 1128 }, { "batch_size": 4, "epoch": 0.4512, "step": 1128, "tokens_per_device": 1508 }, { "epoch": 0.4512, "loss_ce": 0.5404914617538452, "loss_lvr": 1.0046197175979614, "loss_mode_switch": 0.0, "loss_total": 0.6409534215927124, "step": 1128 }, { "batch_size": 4, "epoch": 0.4512, "step": 1128, "tokens_per_device": 4260 }, { "epoch": 0.4512, "loss_ce": 0.5845950245857239, "loss_lvr": 0.887083113193512, "loss_mode_switch": 0.0, "loss_total": 0.6733033657073975, "step": 1128 }, { "batch_size": 1, "epoch": 0.4512, "step": 1128, "tokens_per_device": 5046 }, { "epoch": 0.4512, "loss_ce": 0.031082237139344215, "loss_lvr": 0.4250524342060089, "loss_mode_switch": 0.0, "loss_total": 0.07358748465776443, "step": 1128 }, { "batch_size": 4, "epoch": 0.4512, "step": 1128, "tokens_per_device": 2624 }, { "epoch": 0.4512, "loss_ce": 0.14809459447860718, "loss_lvr": 0.9628721475601196, "loss_mode_switch": 0.0, "loss_total": 0.24438181519508362, "step": 1128 }, { "batch_size": 4, "epoch": 0.4512, "step": 1128, "tokens_per_device": 4324 }, { "epoch": 0.4512, "loss_ce": 0.3375832140445709, "loss_lvr": 0.8547165989875793, "loss_mode_switch": 0.0, "loss_total": 0.42305487394332886, "step": 1128 }, { "batch_size": 4, "epoch": 0.4512, "step": 1128, "tokens_per_device": 4436 }, { "epoch": 0.4512, "loss_ce": 0.7708277702331543, "loss_lvr": 0.8770949840545654, "loss_mode_switch": 0.0, "loss_total": 0.8585372567176819, "step": 1128 }, { "batch_size": 4, "epoch": 0.4512, "step": 1128, "tokens_per_device": 4676 }, { "epoch": 0.4512, "loss_ce": 0.4950065314769745, "loss_lvr": 0.946452796459198, "loss_mode_switch": 0.0, "loss_total": 0.5896518230438232, "step": 1128 }, { "epoch": 0.4516, "grad_norm": 1.2248127460479736, "learning_rate": 6.0194858845468425e-06, "loss": 0.3075, "step": 1129 }, { "batch_size": 4, "epoch": 0.4516, "step": 1129, "tokens_per_device": 2584 }, { "epoch": 0.4516, "loss_ce": 0.04184768721461296, "loss_lvr": 0.8306637406349182, "loss_mode_switch": 0.0, "loss_total": 0.12491406500339508, "step": 1129 }, { "batch_size": 1, "epoch": 0.4516, "step": 1129, "tokens_per_device": 4873 }, { "epoch": 0.4516, "loss_ce": 0.00480966130271554, "loss_lvr": 0.9911512136459351, "loss_mode_switch": 0.0, "loss_total": 0.10392478853464127, "step": 1129 }, { "batch_size": 4, "epoch": 0.4516, "step": 1129, "tokens_per_device": 4272 }, { "epoch": 0.4516, "loss_ce": 0.037028372287750244, "loss_lvr": 1.0269334316253662, "loss_mode_switch": 0.0, "loss_total": 0.13972172141075134, "step": 1129 }, { "batch_size": 4, "epoch": 0.4516, "step": 1129, "tokens_per_device": 3988 }, { "epoch": 0.4516, "loss_ce": 0.017267746850848198, "loss_lvr": 0.8185770511627197, "loss_mode_switch": 0.0, "loss_total": 0.0991254523396492, "step": 1129 }, { "batch_size": 4, "epoch": 0.4516, "step": 1129, "tokens_per_device": 8056 }, { "epoch": 0.4516, "loss_ce": 0.14632351696491241, "loss_lvr": 0.8147099614143372, "loss_mode_switch": 0.0, "loss_total": 0.22779451310634613, "step": 1129 }, { "batch_size": 4, "epoch": 0.4516, "step": 1129, "tokens_per_device": 5136 }, { "epoch": 0.4516, "loss_ce": 0.0006723181577399373, "loss_lvr": 0.5074055194854736, "loss_mode_switch": 0.0, "loss_total": 0.05141286924481392, "step": 1129 }, { "batch_size": 4, "epoch": 0.4516, "step": 1129, "tokens_per_device": 4092 }, { "epoch": 0.4516, "loss_ce": 0.6234183311462402, "loss_lvr": 0.8570572733879089, "loss_mode_switch": 0.0, "loss_total": 0.7091240882873535, "step": 1129 }, { "batch_size": 4, "epoch": 0.4516, "step": 1129, "tokens_per_device": 1492 }, { "epoch": 0.4516, "loss_ce": 0.22768336534500122, "loss_lvr": 1.091336965560913, "loss_mode_switch": 0.0, "loss_total": 0.33681705594062805, "step": 1129 }, { "epoch": 0.452, "grad_norm": 1.2087818384170532, "learning_rate": 6.013143597721252e-06, "loss": 0.2783, "step": 1130 }, { "batch_size": 4, "epoch": 0.452, "step": 1130, "tokens_per_device": 8476 }, { "epoch": 0.452, "loss_ce": 0.22965985536575317, "loss_lvr": 0.9828601479530334, "loss_mode_switch": 0.0, "loss_total": 0.32794588804244995, "step": 1130 }, { "batch_size": 4, "epoch": 0.452, "step": 1130, "tokens_per_device": 4900 }, { "epoch": 0.452, "loss_ce": 0.07746121287345886, "loss_lvr": 0.8123681545257568, "loss_mode_switch": 0.0, "loss_total": 0.15869802236557007, "step": 1130 }, { "batch_size": 4, "epoch": 0.452, "step": 1130, "tokens_per_device": 5944 }, { "epoch": 0.452, "loss_ce": 0.7061774730682373, "loss_lvr": 0.7350157499313354, "loss_mode_switch": 0.0, "loss_total": 0.7796790599822998, "step": 1130 }, { "batch_size": 1, "epoch": 0.452, "step": 1130, "tokens_per_device": 4887 }, { "epoch": 0.452, "loss_ce": 0.025884615257382393, "loss_lvr": 0.5505647659301758, "loss_mode_switch": 0.0, "loss_total": 0.0809410959482193, "step": 1130 }, { "batch_size": 1, "epoch": 0.452, "step": 1130, "tokens_per_device": 4949 }, { "epoch": 0.452, "loss_ce": 1.0086934566497803, "loss_lvr": 0.6271729469299316, "loss_mode_switch": 0.0, "loss_total": 1.0714107751846313, "step": 1130 }, { "batch_size": 4, "epoch": 0.452, "step": 1130, "tokens_per_device": 1404 }, { "epoch": 0.452, "loss_ce": 0.31497228145599365, "loss_lvr": 0.9279292225837708, "loss_mode_switch": 0.0, "loss_total": 0.4077652096748352, "step": 1130 }, { "batch_size": 1, "epoch": 0.452, "step": 1130, "tokens_per_device": 4902 }, { "epoch": 0.452, "loss_ce": 0.38393455743789673, "loss_lvr": 0.45417678356170654, "loss_mode_switch": 0.0, "loss_total": 0.4293522238731384, "step": 1130 }, { "batch_size": 4, "epoch": 0.452, "step": 1130, "tokens_per_device": 3788 }, { "epoch": 0.452, "loss_ce": 0.18258267641067505, "loss_lvr": 0.9700767993927002, "loss_mode_switch": 0.0, "loss_total": 0.279590368270874, "step": 1130 }, { "epoch": 0.4524, "grad_norm": 1.8462023735046387, "learning_rate": 6.006799610510905e-06, "loss": 0.3507, "step": 1131 }, { "batch_size": 1, "epoch": 0.4524, "step": 1131, "tokens_per_device": 4895 }, { "epoch": 0.4524, "loss_ce": 0.18983806669712067, "loss_lvr": 0.4212546646595001, "loss_mode_switch": 0.0, "loss_total": 0.23196353018283844, "step": 1131 }, { "batch_size": 1, "epoch": 0.4524, "step": 1131, "tokens_per_device": 5185 }, { "epoch": 0.4524, "loss_ce": 0.03270544856786728, "loss_lvr": 0.5115962624549866, "loss_mode_switch": 0.0, "loss_total": 0.08386507630348206, "step": 1131 }, { "batch_size": 1, "epoch": 0.4524, "step": 1131, "tokens_per_device": 4876 }, { "epoch": 0.4524, "loss_ce": 0.00621907040476799, "loss_lvr": 0.1476055234670639, "loss_mode_switch": 0.0, "loss_total": 0.0209796242415905, "step": 1131 }, { "batch_size": 4, "epoch": 0.4524, "step": 1131, "tokens_per_device": 4212 }, { "epoch": 0.4524, "loss_ce": 0.4031263291835785, "loss_lvr": 0.9158163666725159, "loss_mode_switch": 0.0, "loss_total": 0.49470797181129456, "step": 1131 }, { "batch_size": 4, "epoch": 0.4524, "step": 1131, "tokens_per_device": 5820 }, { "epoch": 0.4524, "loss_ce": 0.26792237162590027, "loss_lvr": 1.1013493537902832, "loss_mode_switch": 0.0, "loss_total": 0.3780573010444641, "step": 1131 }, { "batch_size": 1, "epoch": 0.4524, "step": 1131, "tokens_per_device": 4741 }, { "epoch": 0.4524, "loss_ce": 0.0034952983260154724, "loss_lvr": 0.35033509135246277, "loss_mode_switch": 0.0, "loss_total": 0.03852880746126175, "step": 1131 }, { "batch_size": 1, "epoch": 0.4524, "step": 1131, "tokens_per_device": 4893 }, { "epoch": 0.4524, "loss_ce": 0.1561591923236847, "loss_lvr": 0.21275624632835388, "loss_mode_switch": 0.0, "loss_total": 0.17743481695652008, "step": 1131 }, { "batch_size": 4, "epoch": 0.4524, "step": 1131, "tokens_per_device": 1316 }, { "epoch": 0.4524, "loss_ce": 0.06744575500488281, "loss_lvr": 1.0039767026901245, "loss_mode_switch": 0.0, "loss_total": 0.16784343123435974, "step": 1131 }, { "epoch": 0.4528, "grad_norm": 1.2732008695602417, "learning_rate": 6.000453933563075e-06, "loss": 0.2883, "step": 1132 }, { "batch_size": 4, "epoch": 0.4528, "step": 1132, "tokens_per_device": 8152 }, { "epoch": 0.4528, "loss_ce": 0.24922487139701843, "loss_lvr": 0.8243896961212158, "loss_mode_switch": 0.0, "loss_total": 0.3316638469696045, "step": 1132 }, { "batch_size": 4, "epoch": 0.4528, "step": 1132, "tokens_per_device": 4224 }, { "epoch": 0.4528, "loss_ce": 0.5110427737236023, "loss_lvr": 0.9198247194290161, "loss_mode_switch": 0.0, "loss_total": 0.6030252575874329, "step": 1132 }, { "batch_size": 4, "epoch": 0.4528, "step": 1132, "tokens_per_device": 1620 }, { "epoch": 0.4528, "loss_ce": 0.3593294620513916, "loss_lvr": 1.0405728816986084, "loss_mode_switch": 0.0, "loss_total": 0.46338674426078796, "step": 1132 }, { "batch_size": 4, "epoch": 0.4528, "step": 1132, "tokens_per_device": 4216 }, { "epoch": 0.4528, "loss_ce": 0.04225797951221466, "loss_lvr": 0.797683596611023, "loss_mode_switch": 0.0, "loss_total": 0.12202633917331696, "step": 1132 }, { "batch_size": 4, "epoch": 0.4528, "step": 1132, "tokens_per_device": 4652 }, { "epoch": 0.4528, "loss_ce": 0.18622742593288422, "loss_lvr": 0.7382999062538147, "loss_mode_switch": 0.0, "loss_total": 0.2600574195384979, "step": 1132 }, { "batch_size": 4, "epoch": 0.4528, "step": 1132, "tokens_per_device": 6984 }, { "epoch": 0.4528, "loss_ce": 0.16449730098247528, "loss_lvr": 0.8867077231407166, "loss_mode_switch": 0.0, "loss_total": 0.2531680762767792, "step": 1132 }, { "batch_size": 4, "epoch": 0.4528, "step": 1132, "tokens_per_device": 6896 }, { "epoch": 0.4528, "loss_ce": 0.0673694983124733, "loss_lvr": 0.5698768496513367, "loss_mode_switch": 0.0, "loss_total": 0.1243571788072586, "step": 1132 }, { "batch_size": 4, "epoch": 0.4528, "step": 1132, "tokens_per_device": 2620 }, { "epoch": 0.4528, "loss_ce": 0.24736936390399933, "loss_lvr": 0.6944669485092163, "loss_mode_switch": 0.0, "loss_total": 0.3168160617351532, "step": 1132 }, { "epoch": 0.4532, "grad_norm": 1.3520785570144653, "learning_rate": 5.994106577527877e-06, "loss": 0.3044, "step": 1133 }, { "batch_size": 4, "epoch": 0.4532, "step": 1133, "tokens_per_device": 4308 }, { "epoch": 0.4532, "loss_ce": 0.21880961954593658, "loss_lvr": 1.0146279335021973, "loss_mode_switch": 0.0, "loss_total": 0.32027241587638855, "step": 1133 }, { "batch_size": 1, "epoch": 0.4532, "step": 1133, "tokens_per_device": 4908 }, { "epoch": 0.4532, "loss_ce": 0.017147962003946304, "loss_lvr": 0.4090758264064789, "loss_mode_switch": 0.0, "loss_total": 0.05805554613471031, "step": 1133 }, { "batch_size": 4, "epoch": 0.4532, "step": 1133, "tokens_per_device": 4228 }, { "epoch": 0.4532, "loss_ce": 0.7591156959533691, "loss_lvr": 1.1398283243179321, "loss_mode_switch": 0.0, "loss_total": 0.8730985522270203, "step": 1133 }, { "batch_size": 4, "epoch": 0.4532, "step": 1133, "tokens_per_device": 2576 }, { "epoch": 0.4532, "loss_ce": 0.22781674563884735, "loss_lvr": 1.1618746519088745, "loss_mode_switch": 0.0, "loss_total": 0.34400421380996704, "step": 1133 }, { "batch_size": 1, "epoch": 0.4532, "step": 1133, "tokens_per_device": 5057 }, { "epoch": 0.4532, "loss_ce": 0.2086358666419983, "loss_lvr": 0.3524082899093628, "loss_mode_switch": 0.0, "loss_total": 0.24387669563293457, "step": 1133 }, { "batch_size": 1, "epoch": 0.4532, "step": 1133, "tokens_per_device": 4857 }, { "epoch": 0.4532, "loss_ce": 0.008804119192063808, "loss_lvr": 0.21203027665615082, "loss_mode_switch": 0.0, "loss_total": 0.030007146298885345, "step": 1133 }, { "batch_size": 4, "epoch": 0.4532, "step": 1133, "tokens_per_device": 5696 }, { "epoch": 0.4532, "loss_ce": 0.2697339355945587, "loss_lvr": 0.7870887517929077, "loss_mode_switch": 0.0, "loss_total": 0.34844282269477844, "step": 1133 }, { "batch_size": 4, "epoch": 0.4532, "step": 1133, "tokens_per_device": 1208 }, { "epoch": 0.4532, "loss_ce": 0.06075095385313034, "loss_lvr": 1.1125967502593994, "loss_mode_switch": 0.0, "loss_total": 0.1720106303691864, "step": 1133 }, { "epoch": 0.4536, "grad_norm": 1.1451865434646606, "learning_rate": 5.987757553058236e-06, "loss": 0.2309, "step": 1134 }, { "batch_size": 4, "epoch": 0.4536, "step": 1134, "tokens_per_device": 6660 }, { "epoch": 0.4536, "loss_ce": 0.16906936466693878, "loss_lvr": 0.45002683997154236, "loss_mode_switch": 0.0, "loss_total": 0.21407204866409302, "step": 1134 }, { "batch_size": 4, "epoch": 0.4536, "step": 1134, "tokens_per_device": 11052 }, { "epoch": 0.4536, "loss_ce": 0.12141966074705124, "loss_lvr": 0.6394789218902588, "loss_mode_switch": 0.0, "loss_total": 0.18536755442619324, "step": 1134 }, { "batch_size": 1, "epoch": 0.4536, "step": 1134, "tokens_per_device": 4655 }, { "epoch": 0.4536, "loss_ce": 0.0007777741411700845, "loss_lvr": 0.5204026699066162, "loss_mode_switch": 0.0, "loss_total": 0.052818041294813156, "step": 1134 }, { "batch_size": 1, "epoch": 0.4536, "step": 1134, "tokens_per_device": 5115 }, { "epoch": 0.4536, "loss_ce": 0.0038028727285563946, "loss_lvr": 0.23341256380081177, "loss_mode_switch": 0.0, "loss_total": 0.02714413031935692, "step": 1134 }, { "batch_size": 4, "epoch": 0.4536, "step": 1134, "tokens_per_device": 5068 }, { "epoch": 0.4536, "loss_ce": 0.4840807616710663, "loss_lvr": 0.8928013443946838, "loss_mode_switch": 0.0, "loss_total": 0.5733609199523926, "step": 1134 }, { "batch_size": 4, "epoch": 0.4536, "step": 1134, "tokens_per_device": 10692 }, { "epoch": 0.4536, "loss_ce": 0.42173436284065247, "loss_lvr": 1.016533613204956, "loss_mode_switch": 0.0, "loss_total": 0.5233877301216125, "step": 1134 }, { "batch_size": 1, "epoch": 0.4536, "step": 1134, "tokens_per_device": 6755 }, { "epoch": 0.4536, "loss_ce": 0.14663225412368774, "loss_lvr": 0.3299829959869385, "loss_mode_switch": 0.0, "loss_total": 0.17963054776191711, "step": 1134 }, { "batch_size": 4, "epoch": 0.4536, "step": 1134, "tokens_per_device": 3684 }, { "epoch": 0.4536, "loss_ce": 0.5098504424095154, "loss_lvr": 0.9092419743537903, "loss_mode_switch": 0.0, "loss_total": 0.6007746458053589, "step": 1134 }, { "epoch": 0.454, "grad_norm": 1.7347288131713867, "learning_rate": 5.981406870809889e-06, "loss": 0.3594, "step": 1135 }, { "batch_size": 4, "epoch": 0.454, "step": 1135, "tokens_per_device": 1296 }, { "epoch": 0.454, "loss_ce": 0.12881438434123993, "loss_lvr": 1.1951745748519897, "loss_mode_switch": 0.0, "loss_total": 0.24833184480667114, "step": 1135 }, { "batch_size": 4, "epoch": 0.454, "step": 1135, "tokens_per_device": 3024 }, { "epoch": 0.454, "loss_ce": 0.378499835729599, "loss_lvr": 0.6381654143333435, "loss_mode_switch": 0.0, "loss_total": 0.4423163831233978, "step": 1135 }, { "batch_size": 1, "epoch": 0.454, "step": 1135, "tokens_per_device": 4752 }, { "epoch": 0.454, "loss_ce": 0.02968255989253521, "loss_lvr": 0.3417363464832306, "loss_mode_switch": 0.0, "loss_total": 0.06385619193315506, "step": 1135 }, { "batch_size": 4, "epoch": 0.454, "step": 1135, "tokens_per_device": 8880 }, { "epoch": 0.454, "loss_ce": 0.18430328369140625, "loss_lvr": 0.6418318152427673, "loss_mode_switch": 0.0, "loss_total": 0.2484864592552185, "step": 1135 }, { "batch_size": 1, "epoch": 0.454, "step": 1135, "tokens_per_device": 4692 }, { "epoch": 0.454, "loss_ce": 0.020769990980625153, "loss_lvr": 0.8889697790145874, "loss_mode_switch": 0.0, "loss_total": 0.10966697335243225, "step": 1135 }, { "batch_size": 4, "epoch": 0.454, "step": 1135, "tokens_per_device": 11488 }, { "epoch": 0.454, "loss_ce": 0.41468939185142517, "loss_lvr": 0.876112699508667, "loss_mode_switch": 0.0, "loss_total": 0.5023006796836853, "step": 1135 }, { "batch_size": 4, "epoch": 0.454, "step": 1135, "tokens_per_device": 7052 }, { "epoch": 0.454, "loss_ce": 0.4532691240310669, "loss_lvr": 0.6643432378768921, "loss_mode_switch": 0.0, "loss_total": 0.5197034478187561, "step": 1135 }, { "batch_size": 4, "epoch": 0.454, "step": 1135, "tokens_per_device": 8628 }, { "epoch": 0.454, "loss_ce": 0.017366209998726845, "loss_lvr": 0.8177526593208313, "loss_mode_switch": 0.0, "loss_total": 0.09914147853851318, "step": 1135 }, { "epoch": 0.4544, "grad_norm": 1.2608492374420166, "learning_rate": 5.9750545414413405e-06, "loss": 0.3233, "step": 1136 }, { "batch_size": 4, "epoch": 0.4544, "step": 1136, "tokens_per_device": 11204 }, { "epoch": 0.4544, "loss_ce": 0.5299434065818787, "loss_lvr": 0.6320205926895142, "loss_mode_switch": 0.0, "loss_total": 0.593145489692688, "step": 1136 }, { "batch_size": 4, "epoch": 0.4544, "step": 1136, "tokens_per_device": 4220 }, { "epoch": 0.4544, "loss_ce": 0.27178412675857544, "loss_lvr": 0.8102257251739502, "loss_mode_switch": 0.0, "loss_total": 0.3528066873550415, "step": 1136 }, { "batch_size": 4, "epoch": 0.4544, "step": 1136, "tokens_per_device": 5844 }, { "epoch": 0.4544, "loss_ce": 0.2001299113035202, "loss_lvr": 0.6718998551368713, "loss_mode_switch": 0.0, "loss_total": 0.2673198878765106, "step": 1136 }, { "batch_size": 1, "epoch": 0.4544, "step": 1136, "tokens_per_device": 6755 }, { "epoch": 0.4544, "loss_ce": 0.2597888708114624, "loss_lvr": 0.3127569854259491, "loss_mode_switch": 0.0, "loss_total": 0.2910645604133606, "step": 1136 }, { "batch_size": 1, "epoch": 0.4544, "step": 1136, "tokens_per_device": 4945 }, { "epoch": 0.4544, "loss_ce": 0.15016096830368042, "loss_lvr": 0.529450535774231, "loss_mode_switch": 0.0, "loss_total": 0.20310601592063904, "step": 1136 }, { "batch_size": 4, "epoch": 0.4544, "step": 1136, "tokens_per_device": 3812 }, { "epoch": 0.4544, "loss_ce": 0.13723036646842957, "loss_lvr": 1.65645170211792, "loss_mode_switch": 0.0, "loss_total": 0.3028755187988281, "step": 1136 }, { "batch_size": 4, "epoch": 0.4544, "step": 1136, "tokens_per_device": 5248 }, { "epoch": 0.4544, "loss_ce": 0.721676230430603, "loss_lvr": 1.1269162893295288, "loss_mode_switch": 0.0, "loss_total": 0.8343678712844849, "step": 1136 }, { "batch_size": 4, "epoch": 0.4544, "step": 1136, "tokens_per_device": 5752 }, { "epoch": 0.4544, "loss_ce": 0.18652504682540894, "loss_lvr": 0.813923716545105, "loss_mode_switch": 0.0, "loss_total": 0.2679174244403839, "step": 1136 }, { "epoch": 0.4548, "grad_norm": 1.2448424100875854, "learning_rate": 5.96870057561387e-06, "loss": 0.2829, "step": 1137 }, { "batch_size": 4, "epoch": 0.4548, "step": 1137, "tokens_per_device": 4260 }, { "epoch": 0.4548, "loss_ce": 0.3302413821220398, "loss_lvr": 0.8820832967758179, "loss_mode_switch": 0.0, "loss_total": 0.4184496998786926, "step": 1137 }, { "batch_size": 4, "epoch": 0.4548, "step": 1137, "tokens_per_device": 3996 }, { "epoch": 0.4548, "loss_ce": 0.47631406784057617, "loss_lvr": 0.9270046353340149, "loss_mode_switch": 0.0, "loss_total": 0.5690145492553711, "step": 1137 }, { "batch_size": 1, "epoch": 0.4548, "step": 1137, "tokens_per_device": 5064 }, { "epoch": 0.4548, "loss_ce": 0.015477295033633709, "loss_lvr": 0.37557318806648254, "loss_mode_switch": 0.0, "loss_total": 0.05303461477160454, "step": 1137 }, { "batch_size": 1, "epoch": 0.4548, "step": 1137, "tokens_per_device": 5116 }, { "epoch": 0.4548, "loss_ce": 0.004564845934510231, "loss_lvr": 0.20943963527679443, "loss_mode_switch": 0.0, "loss_total": 0.025508809834718704, "step": 1137 }, { "batch_size": 1, "epoch": 0.4548, "step": 1137, "tokens_per_device": 4742 }, { "epoch": 0.4548, "loss_ce": 0.017217816784977913, "loss_lvr": 0.28105077147483826, "loss_mode_switch": 0.0, "loss_total": 0.04532289505004883, "step": 1137 }, { "batch_size": 4, "epoch": 0.4548, "step": 1137, "tokens_per_device": 5708 }, { "epoch": 0.4548, "loss_ce": 0.3465758264064789, "loss_lvr": 0.7072722911834717, "loss_mode_switch": 0.0, "loss_total": 0.41730305552482605, "step": 1137 }, { "batch_size": 1, "epoch": 0.4548, "step": 1137, "tokens_per_device": 5829 }, { "epoch": 0.4548, "loss_ce": 0.0007803210173733532, "loss_lvr": 0.4090363681316376, "loss_mode_switch": 0.0, "loss_total": 0.04168396070599556, "step": 1137 }, { "batch_size": 4, "epoch": 0.4548, "step": 1137, "tokens_per_device": 4948 }, { "epoch": 0.4548, "loss_ce": 0.29920023679733276, "loss_lvr": 0.9955642223358154, "loss_mode_switch": 0.0, "loss_total": 0.39875665307044983, "step": 1137 }, { "epoch": 0.4552, "grad_norm": 1.497707724571228, "learning_rate": 5.962344983991503e-06, "loss": 0.2901, "step": 1138 }, { "batch_size": 1, "epoch": 0.4552, "step": 1138, "tokens_per_device": 4914 }, { "epoch": 0.4552, "loss_ce": 0.3700765073299408, "loss_lvr": 0.32057663798332214, "loss_mode_switch": 0.0, "loss_total": 0.4021341800689697, "step": 1138 }, { "batch_size": 1, "epoch": 0.4552, "step": 1138, "tokens_per_device": 4183 }, { "epoch": 0.4552, "loss_ce": 0.014883887954056263, "loss_lvr": 0.3479517698287964, "loss_mode_switch": 0.0, "loss_total": 0.04967906326055527, "step": 1138 }, { "batch_size": 4, "epoch": 0.4552, "step": 1138, "tokens_per_device": 4280 }, { "epoch": 0.4552, "loss_ce": 0.07668974250555038, "loss_lvr": 0.9926730990409851, "loss_mode_switch": 0.0, "loss_total": 0.17595705389976501, "step": 1138 }, { "batch_size": 4, "epoch": 0.4552, "step": 1138, "tokens_per_device": 16200 }, { "epoch": 0.4552, "loss_ce": 0.09684360027313232, "loss_lvr": 0.7829873561859131, "loss_mode_switch": 0.0, "loss_total": 0.1751423478126526, "step": 1138 }, { "batch_size": 4, "epoch": 0.4552, "step": 1138, "tokens_per_device": 5588 }, { "epoch": 0.4552, "loss_ce": 0.007770128548145294, "loss_lvr": 0.7697790861129761, "loss_mode_switch": 0.0, "loss_total": 0.0847480371594429, "step": 1138 }, { "batch_size": 4, "epoch": 0.4552, "step": 1138, "tokens_per_device": 1480 }, { "epoch": 0.4552, "loss_ce": 0.28856149315834045, "loss_lvr": 0.9242924451828003, "loss_mode_switch": 0.0, "loss_total": 0.38099074363708496, "step": 1138 }, { "batch_size": 4, "epoch": 0.4552, "step": 1138, "tokens_per_device": 4264 }, { "epoch": 0.4552, "loss_ce": 0.06868826597929001, "loss_lvr": 0.922417402267456, "loss_mode_switch": 0.0, "loss_total": 0.16093000769615173, "step": 1138 }, { "batch_size": 1, "epoch": 0.4552, "step": 1138, "tokens_per_device": 5034 }, { "epoch": 0.4552, "loss_ce": 0.057069938629865646, "loss_lvr": 0.6423529386520386, "loss_mode_switch": 0.0, "loss_total": 0.12130522727966309, "step": 1138 }, { "epoch": 0.4556, "grad_norm": 1.5626826286315918, "learning_rate": 5.955987777240985e-06, "loss": 0.3333, "step": 1139 }, { "batch_size": 4, "epoch": 0.4556, "step": 1139, "tokens_per_device": 2524 }, { "epoch": 0.4556, "loss_ce": 0.4980980455875397, "loss_lvr": 0.8105636239051819, "loss_mode_switch": 0.0, "loss_total": 0.5791544318199158, "step": 1139 }, { "batch_size": 4, "epoch": 0.4556, "step": 1139, "tokens_per_device": 4304 }, { "epoch": 0.4556, "loss_ce": 0.0004793382540810853, "loss_lvr": 0.7019848823547363, "loss_mode_switch": 0.0, "loss_total": 0.07067783176898956, "step": 1139 }, { "batch_size": 4, "epoch": 0.4556, "step": 1139, "tokens_per_device": 4168 }, { "epoch": 0.4556, "loss_ce": 0.10292449593544006, "loss_lvr": 0.5680076479911804, "loss_mode_switch": 0.0, "loss_total": 0.15972526371479034, "step": 1139 }, { "batch_size": 4, "epoch": 0.4556, "step": 1139, "tokens_per_device": 2784 }, { "epoch": 0.4556, "loss_ce": 0.48100948333740234, "loss_lvr": 0.7486911416053772, "loss_mode_switch": 0.0, "loss_total": 0.5558785796165466, "step": 1139 }, { "batch_size": 1, "epoch": 0.4556, "step": 1139, "tokens_per_device": 4910 }, { "epoch": 0.4556, "loss_ce": 0.022261254489421844, "loss_lvr": 0.46385812759399414, "loss_mode_switch": 0.0, "loss_total": 0.06864707171916962, "step": 1139 }, { "batch_size": 1, "epoch": 0.4556, "step": 1139, "tokens_per_device": 4907 }, { "epoch": 0.4556, "loss_ce": 0.1882459819316864, "loss_lvr": 0.4936429560184479, "loss_mode_switch": 0.0, "loss_total": 0.23761028051376343, "step": 1139 }, { "batch_size": 4, "epoch": 0.4556, "step": 1139, "tokens_per_device": 4248 }, { "epoch": 0.4556, "loss_ce": 0.003896596608683467, "loss_lvr": 0.9450675845146179, "loss_mode_switch": 0.0, "loss_total": 0.09840335696935654, "step": 1139 }, { "batch_size": 1, "epoch": 0.4556, "step": 1139, "tokens_per_device": 5117 }, { "epoch": 0.4556, "loss_ce": 0.0008747646352276206, "loss_lvr": 0.5689980387687683, "loss_mode_switch": 0.0, "loss_total": 0.05777456983923912, "step": 1139 }, { "epoch": 0.456, "grad_norm": 1.51164972782135, "learning_rate": 5.949628966031785e-06, "loss": 0.2613, "step": 1140 }, { "batch_size": 1, "epoch": 0.456, "step": 1140, "tokens_per_device": 5162 }, { "epoch": 0.456, "loss_ce": 0.002879560925066471, "loss_lvr": 0.23121221363544464, "loss_mode_switch": 0.0, "loss_total": 0.02600078284740448, "step": 1140 }, { "batch_size": 1, "epoch": 0.456, "step": 1140, "tokens_per_device": 4832 }, { "epoch": 0.456, "loss_ce": 0.1014900803565979, "loss_lvr": 0.3577515184879303, "loss_mode_switch": 0.0, "loss_total": 0.13726523518562317, "step": 1140 }, { "batch_size": 4, "epoch": 0.456, "step": 1140, "tokens_per_device": 1700 }, { "epoch": 0.456, "loss_ce": 0.4087786078453064, "loss_lvr": 1.219834804534912, "loss_mode_switch": 0.0, "loss_total": 0.5307620763778687, "step": 1140 }, { "batch_size": 1, "epoch": 0.456, "step": 1140, "tokens_per_device": 4444 }, { "epoch": 0.456, "loss_ce": 0.003243369748815894, "loss_lvr": 0.5947350263595581, "loss_mode_switch": 0.0, "loss_total": 0.06271687150001526, "step": 1140 }, { "batch_size": 4, "epoch": 0.456, "step": 1140, "tokens_per_device": 4836 }, { "epoch": 0.456, "loss_ce": 0.2344619482755661, "loss_lvr": 0.7932385802268982, "loss_mode_switch": 0.0, "loss_total": 0.3137857913970947, "step": 1140 }, { "batch_size": 4, "epoch": 0.456, "step": 1140, "tokens_per_device": 4508 }, { "epoch": 0.456, "loss_ce": 0.16794267296791077, "loss_lvr": 1.2048108577728271, "loss_mode_switch": 0.0, "loss_total": 0.2884237766265869, "step": 1140 }, { "batch_size": 4, "epoch": 0.456, "step": 1140, "tokens_per_device": 4036 }, { "epoch": 0.456, "loss_ce": 0.22029712796211243, "loss_lvr": 1.1849063634872437, "loss_mode_switch": 0.0, "loss_total": 0.3387877643108368, "step": 1140 }, { "batch_size": 4, "epoch": 0.456, "step": 1140, "tokens_per_device": 1612 }, { "epoch": 0.456, "loss_ce": 0.1583927869796753, "loss_lvr": 0.9675546884536743, "loss_mode_switch": 0.0, "loss_total": 0.2551482617855072, "step": 1140 }, { "epoch": 0.4564, "grad_norm": 1.4536104202270508, "learning_rate": 5.943268561036053e-06, "loss": 0.2655, "step": 1141 }, { "batch_size": 4, "epoch": 0.4564, "step": 1141, "tokens_per_device": 5832 }, { "epoch": 0.4564, "loss_ce": 0.5585606694221497, "loss_lvr": 1.017057180404663, "loss_mode_switch": 0.0, "loss_total": 0.6602663993835449, "step": 1141 }, { "batch_size": 4, "epoch": 0.4564, "step": 1141, "tokens_per_device": 4552 }, { "epoch": 0.4564, "loss_ce": 0.02070450782775879, "loss_lvr": 0.9157145023345947, "loss_mode_switch": 0.0, "loss_total": 0.11227595806121826, "step": 1141 }, { "batch_size": 4, "epoch": 0.4564, "step": 1141, "tokens_per_device": 4568 }, { "epoch": 0.4564, "loss_ce": 0.22716008126735687, "loss_lvr": 1.0041190385818481, "loss_mode_switch": 0.0, "loss_total": 0.3275719881057739, "step": 1141 }, { "batch_size": 4, "epoch": 0.4564, "step": 1141, "tokens_per_device": 5684 }, { "epoch": 0.4564, "loss_ce": 0.4288131892681122, "loss_lvr": 1.1051173210144043, "loss_mode_switch": 0.0, "loss_total": 0.539324939250946, "step": 1141 }, { "batch_size": 4, "epoch": 0.4564, "step": 1141, "tokens_per_device": 1832 }, { "epoch": 0.4564, "loss_ce": 0.5820720791816711, "loss_lvr": 0.9049969911575317, "loss_mode_switch": 0.0, "loss_total": 0.6725717782974243, "step": 1141 }, { "batch_size": 4, "epoch": 0.4564, "step": 1141, "tokens_per_device": 10160 }, { "epoch": 0.4564, "loss_ce": 0.06924648582935333, "loss_lvr": 0.748221218585968, "loss_mode_switch": 0.0, "loss_total": 0.14406859874725342, "step": 1141 }, { "batch_size": 4, "epoch": 0.4564, "step": 1141, "tokens_per_device": 4572 }, { "epoch": 0.4564, "loss_ce": 0.2264835685491562, "loss_lvr": 0.8585084080696106, "loss_mode_switch": 0.0, "loss_total": 0.31233441829681396, "step": 1141 }, { "batch_size": 1, "epoch": 0.4564, "step": 1141, "tokens_per_device": 5663 }, { "epoch": 0.4564, "loss_ce": 0.27124160528182983, "loss_lvr": 0.2736297845840454, "loss_mode_switch": 0.0, "loss_total": 0.2986045777797699, "step": 1141 }, { "epoch": 0.4568, "grad_norm": 1.5907766819000244, "learning_rate": 5.936906572928625e-06, "loss": 0.3101, "step": 1142 }, { "batch_size": 4, "epoch": 0.4568, "step": 1142, "tokens_per_device": 5564 }, { "epoch": 0.4568, "loss_ce": 0.09501796215772629, "loss_lvr": 1.161879062652588, "loss_mode_switch": 0.0, "loss_total": 0.2112058699131012, "step": 1142 }, { "batch_size": 1, "epoch": 0.4568, "step": 1142, "tokens_per_device": 5219 }, { "epoch": 0.4568, "loss_ce": 0.030573982745409012, "loss_lvr": 0.4364941120147705, "loss_mode_switch": 0.0, "loss_total": 0.07422339916229248, "step": 1142 }, { "batch_size": 4, "epoch": 0.4568, "step": 1142, "tokens_per_device": 4296 }, { "epoch": 0.4568, "loss_ce": 0.26046040654182434, "loss_lvr": 0.8086906671524048, "loss_mode_switch": 0.0, "loss_total": 0.3413294851779938, "step": 1142 }, { "batch_size": 4, "epoch": 0.4568, "step": 1142, "tokens_per_device": 4892 }, { "epoch": 0.4568, "loss_ce": 0.5481979250907898, "loss_lvr": 1.2429996728897095, "loss_mode_switch": 0.0, "loss_total": 0.6724978685379028, "step": 1142 }, { "batch_size": 4, "epoch": 0.4568, "step": 1142, "tokens_per_device": 4296 }, { "epoch": 0.4568, "loss_ce": 0.46209296584129333, "loss_lvr": 1.1352721452713013, "loss_mode_switch": 0.0, "loss_total": 0.575620174407959, "step": 1142 }, { "batch_size": 4, "epoch": 0.4568, "step": 1142, "tokens_per_device": 1368 }, { "epoch": 0.4568, "loss_ce": 0.5450726747512817, "loss_lvr": 0.9102612137794495, "loss_mode_switch": 0.0, "loss_total": 0.6360988020896912, "step": 1142 }, { "batch_size": 4, "epoch": 0.4568, "step": 1142, "tokens_per_device": 2900 }, { "epoch": 0.4568, "loss_ce": 0.5721556544303894, "loss_lvr": 1.3053202629089355, "loss_mode_switch": 0.0, "loss_total": 0.702687680721283, "step": 1142 }, { "batch_size": 4, "epoch": 0.4568, "step": 1142, "tokens_per_device": 4512 }, { "epoch": 0.4568, "loss_ce": 0.4106220304965973, "loss_lvr": 1.0003849267959595, "loss_mode_switch": 0.0, "loss_total": 0.5106605291366577, "step": 1142 }, { "epoch": 0.4572, "grad_norm": 1.336107611656189, "learning_rate": 5.930543012386981e-06, "loss": 0.3253, "step": 1143 }, { "batch_size": 4, "epoch": 0.4572, "step": 1143, "tokens_per_device": 5156 }, { "epoch": 0.4572, "loss_ce": 0.09814389795064926, "loss_lvr": 0.8239454030990601, "loss_mode_switch": 0.0, "loss_total": 0.18053844571113586, "step": 1143 }, { "batch_size": 4, "epoch": 0.4572, "step": 1143, "tokens_per_device": 2524 }, { "epoch": 0.4572, "loss_ce": 0.14415980875492096, "loss_lvr": 1.3027704954147339, "loss_mode_switch": 0.0, "loss_total": 0.2744368612766266, "step": 1143 }, { "batch_size": 4, "epoch": 0.4572, "step": 1143, "tokens_per_device": 3760 }, { "epoch": 0.4572, "loss_ce": 0.3561728596687317, "loss_lvr": 0.8470566272735596, "loss_mode_switch": 0.0, "loss_total": 0.4408785104751587, "step": 1143 }, { "batch_size": 4, "epoch": 0.4572, "step": 1143, "tokens_per_device": 1508 }, { "epoch": 0.4572, "loss_ce": 0.4085855185985565, "loss_lvr": 1.0152310132980347, "loss_mode_switch": 0.0, "loss_total": 0.5101085901260376, "step": 1143 }, { "batch_size": 4, "epoch": 0.4572, "step": 1143, "tokens_per_device": 5228 }, { "epoch": 0.4572, "loss_ce": 0.1851581335067749, "loss_lvr": 0.7970650792121887, "loss_mode_switch": 0.0, "loss_total": 0.26486465334892273, "step": 1143 }, { "batch_size": 4, "epoch": 0.4572, "step": 1143, "tokens_per_device": 1232 }, { "epoch": 0.4572, "loss_ce": 0.26334312558174133, "loss_lvr": 0.9243679046630859, "loss_mode_switch": 0.0, "loss_total": 0.3557799160480499, "step": 1143 }, { "batch_size": 4, "epoch": 0.4572, "step": 1143, "tokens_per_device": 5812 }, { "epoch": 0.4572, "loss_ce": 0.24508078396320343, "loss_lvr": 0.8373213410377502, "loss_mode_switch": 0.0, "loss_total": 0.32881292700767517, "step": 1143 }, { "batch_size": 1, "epoch": 0.4572, "step": 1143, "tokens_per_device": 4742 }, { "epoch": 0.4572, "loss_ce": 0.04155684635043144, "loss_lvr": 0.37809184193611145, "loss_mode_switch": 0.0, "loss_total": 0.07936602830886841, "step": 1143 }, { "epoch": 0.4576, "grad_norm": 1.42678701877594, "learning_rate": 5.924177890091251e-06, "loss": 0.2993, "step": 1144 }, { "batch_size": 1, "epoch": 0.4576, "step": 1144, "tokens_per_device": 5165 }, { "epoch": 0.4576, "loss_ce": 0.05053864046931267, "loss_lvr": 0.3738809823989868, "loss_mode_switch": 0.0, "loss_total": 0.08792673796415329, "step": 1144 }, { "batch_size": 1, "epoch": 0.4576, "step": 1144, "tokens_per_device": 4878 }, { "epoch": 0.4576, "loss_ce": 0.28328239917755127, "loss_lvr": 0.5942589044570923, "loss_mode_switch": 0.0, "loss_total": 0.3427082896232605, "step": 1144 }, { "batch_size": 4, "epoch": 0.4576, "step": 1144, "tokens_per_device": 4200 }, { "epoch": 0.4576, "loss_ce": 0.1684250682592392, "loss_lvr": 0.749722957611084, "loss_mode_switch": 0.0, "loss_total": 0.24339735507965088, "step": 1144 }, { "batch_size": 1, "epoch": 0.4576, "step": 1144, "tokens_per_device": 5668 }, { "epoch": 0.4576, "loss_ce": 0.4686504602432251, "loss_lvr": 0.6746938228607178, "loss_mode_switch": 0.0, "loss_total": 0.536119818687439, "step": 1144 }, { "batch_size": 4, "epoch": 0.4576, "step": 1144, "tokens_per_device": 4276 }, { "epoch": 0.4576, "loss_ce": 0.17546218633651733, "loss_lvr": 0.7466210722923279, "loss_mode_switch": 0.0, "loss_total": 0.2501243054866791, "step": 1144 }, { "batch_size": 1, "epoch": 0.4576, "step": 1144, "tokens_per_device": 5145 }, { "epoch": 0.4576, "loss_ce": 0.007367471233010292, "loss_lvr": 0.36700570583343506, "loss_mode_switch": 0.0, "loss_total": 0.044068045914173126, "step": 1144 }, { "batch_size": 4, "epoch": 0.4576, "step": 1144, "tokens_per_device": 4236 }, { "epoch": 0.4576, "loss_ce": 0.0591055229306221, "loss_lvr": 1.1408500671386719, "loss_mode_switch": 0.0, "loss_total": 0.17319053411483765, "step": 1144 }, { "batch_size": 4, "epoch": 0.4576, "step": 1144, "tokens_per_device": 5328 }, { "epoch": 0.4576, "loss_ce": 0.3223240077495575, "loss_lvr": 0.7552527189254761, "loss_mode_switch": 0.0, "loss_total": 0.39784929156303406, "step": 1144 }, { "epoch": 0.458, "grad_norm": 1.3971717357635498, "learning_rate": 5.9178112167241805e-06, "loss": 0.2856, "step": 1145 }, { "batch_size": 4, "epoch": 0.458, "step": 1145, "tokens_per_device": 4932 }, { "epoch": 0.458, "loss_ce": 0.05560476705431938, "loss_lvr": 0.8706054091453552, "loss_mode_switch": 0.0, "loss_total": 0.1426653116941452, "step": 1145 }, { "batch_size": 4, "epoch": 0.458, "step": 1145, "tokens_per_device": 4092 }, { "epoch": 0.458, "loss_ce": 0.331528902053833, "loss_lvr": 1.0258086919784546, "loss_mode_switch": 0.0, "loss_total": 0.43410977721214294, "step": 1145 }, { "batch_size": 1, "epoch": 0.458, "step": 1145, "tokens_per_device": 5051 }, { "epoch": 0.458, "loss_ce": 0.06523162871599197, "loss_lvr": 0.40052005648612976, "loss_mode_switch": 0.0, "loss_total": 0.10528363287448883, "step": 1145 }, { "batch_size": 1, "epoch": 0.458, "step": 1145, "tokens_per_device": 4230 }, { "epoch": 0.458, "loss_ce": 0.0020886389538645744, "loss_lvr": 0.3710700571537018, "loss_mode_switch": 0.0, "loss_total": 0.03919564560055733, "step": 1145 }, { "batch_size": 4, "epoch": 0.458, "step": 1145, "tokens_per_device": 3828 }, { "epoch": 0.458, "loss_ce": 0.6430785059928894, "loss_lvr": 0.9266045093536377, "loss_mode_switch": 0.0, "loss_total": 0.7357389330863953, "step": 1145 }, { "batch_size": 4, "epoch": 0.458, "step": 1145, "tokens_per_device": 1824 }, { "epoch": 0.458, "loss_ce": 0.6949787139892578, "loss_lvr": 0.8980736136436462, "loss_mode_switch": 0.0, "loss_total": 0.7847861051559448, "step": 1145 }, { "batch_size": 4, "epoch": 0.458, "step": 1145, "tokens_per_device": 3920 }, { "epoch": 0.458, "loss_ce": 0.5940269827842712, "loss_lvr": 0.8328619003295898, "loss_mode_switch": 0.0, "loss_total": 0.6773131489753723, "step": 1145 }, { "batch_size": 4, "epoch": 0.458, "step": 1145, "tokens_per_device": 12144 }, { "epoch": 0.458, "loss_ce": 0.30954697728157043, "loss_lvr": 0.6754001379013062, "loss_mode_switch": 0.0, "loss_total": 0.3770869970321655, "step": 1145 }, { "epoch": 0.4584, "grad_norm": 1.2747737169265747, "learning_rate": 5.911443002971122e-06, "loss": 0.3067, "step": 1146 }, { "batch_size": 4, "epoch": 0.4584, "step": 1146, "tokens_per_device": 1524 }, { "epoch": 0.4584, "loss_ce": 0.09781026840209961, "loss_lvr": 1.8675264120101929, "loss_mode_switch": 0.0, "loss_total": 0.2845629155635834, "step": 1146 }, { "batch_size": 1, "epoch": 0.4584, "step": 1146, "tokens_per_device": 5159 }, { "epoch": 0.4584, "loss_ce": 0.0011007341090589762, "loss_lvr": 0.801313579082489, "loss_mode_switch": 0.0, "loss_total": 0.08123209327459335, "step": 1146 }, { "batch_size": 4, "epoch": 0.4584, "step": 1146, "tokens_per_device": 1224 }, { "epoch": 0.4584, "loss_ce": 0.3610849976539612, "loss_lvr": 1.2129496335983276, "loss_mode_switch": 0.0, "loss_total": 0.4823799729347229, "step": 1146 }, { "batch_size": 4, "epoch": 0.4584, "step": 1146, "tokens_per_device": 3780 }, { "epoch": 0.4584, "loss_ce": 0.3641502261161804, "loss_lvr": 0.9786773920059204, "loss_mode_switch": 0.0, "loss_total": 0.4620179533958435, "step": 1146 }, { "batch_size": 1, "epoch": 0.4584, "step": 1146, "tokens_per_device": 4832 }, { "epoch": 0.4584, "loss_ce": 0.6439988017082214, "loss_lvr": 0.3320717513561249, "loss_mode_switch": 0.0, "loss_total": 0.6772059798240662, "step": 1146 }, { "batch_size": 1, "epoch": 0.4584, "step": 1146, "tokens_per_device": 5139 }, { "epoch": 0.4584, "loss_ce": 0.010155337862670422, "loss_lvr": 0.32692310214042664, "loss_mode_switch": 0.0, "loss_total": 0.0428476482629776, "step": 1146 }, { "batch_size": 4, "epoch": 0.4584, "step": 1146, "tokens_per_device": 3840 }, { "epoch": 0.4584, "loss_ce": 0.4059915542602539, "loss_lvr": 1.1562869548797607, "loss_mode_switch": 0.0, "loss_total": 0.5216202735900879, "step": 1146 }, { "batch_size": 4, "epoch": 0.4584, "step": 1146, "tokens_per_device": 5196 }, { "epoch": 0.4584, "loss_ce": 0.4908689856529236, "loss_lvr": 0.6986591815948486, "loss_mode_switch": 0.0, "loss_total": 0.5607349276542664, "step": 1146 }, { "epoch": 0.4588, "grad_norm": 1.5948885679244995, "learning_rate": 5.905073259520011e-06, "loss": 0.3727, "step": 1147 }, { "batch_size": 1, "epoch": 0.4588, "step": 1147, "tokens_per_device": 4425 }, { "epoch": 0.4588, "loss_ce": 0.01270940899848938, "loss_lvr": 0.46815571188926697, "loss_mode_switch": 0.0, "loss_total": 0.05952497944235802, "step": 1147 }, { "batch_size": 1, "epoch": 0.4588, "step": 1147, "tokens_per_device": 5108 }, { "epoch": 0.4588, "loss_ce": 0.007148109842091799, "loss_lvr": 0.3183879554271698, "loss_mode_switch": 0.0, "loss_total": 0.03898690640926361, "step": 1147 }, { "batch_size": 4, "epoch": 0.4588, "step": 1147, "tokens_per_device": 2508 }, { "epoch": 0.4588, "loss_ce": 0.3907420039176941, "loss_lvr": 1.1099718809127808, "loss_mode_switch": 0.0, "loss_total": 0.5017392039299011, "step": 1147 }, { "batch_size": 4, "epoch": 0.4588, "step": 1147, "tokens_per_device": 5808 }, { "epoch": 0.4588, "loss_ce": 0.21874873340129852, "loss_lvr": 1.1098990440368652, "loss_mode_switch": 0.0, "loss_total": 0.32973864674568176, "step": 1147 }, { "batch_size": 1, "epoch": 0.4588, "step": 1147, "tokens_per_device": 4901 }, { "epoch": 0.4588, "loss_ce": 0.003915184177458286, "loss_lvr": 0.21910779178142548, "loss_mode_switch": 0.0, "loss_total": 0.02582596242427826, "step": 1147 }, { "batch_size": 4, "epoch": 0.4588, "step": 1147, "tokens_per_device": 1356 }, { "epoch": 0.4588, "loss_ce": 0.62619948387146, "loss_lvr": 1.0396941900253296, "loss_mode_switch": 0.0, "loss_total": 0.730168879032135, "step": 1147 }, { "batch_size": 4, "epoch": 0.4588, "step": 1147, "tokens_per_device": 4692 }, { "epoch": 0.4588, "loss_ce": 0.27074772119522095, "loss_lvr": 0.7924733757972717, "loss_mode_switch": 0.0, "loss_total": 0.34999507665634155, "step": 1147 }, { "batch_size": 1, "epoch": 0.4588, "step": 1147, "tokens_per_device": 5000 }, { "epoch": 0.4588, "loss_ce": 0.011276650242507458, "loss_lvr": 0.7849525213241577, "loss_mode_switch": 0.0, "loss_total": 0.08977190405130386, "step": 1147 }, { "epoch": 0.4592, "grad_norm": 1.294521689414978, "learning_rate": 5.898701997061349e-06, "loss": 0.3042, "step": 1148 }, { "batch_size": 1, "epoch": 0.4592, "step": 1148, "tokens_per_device": 4876 }, { "epoch": 0.4592, "loss_ce": 0.009478731080889702, "loss_lvr": 0.5694382786750793, "loss_mode_switch": 0.0, "loss_total": 0.06642255932092667, "step": 1148 }, { "batch_size": 4, "epoch": 0.4592, "step": 1148, "tokens_per_device": 4212 }, { "epoch": 0.4592, "loss_ce": 0.16947047412395477, "loss_lvr": 0.599347710609436, "loss_mode_switch": 0.0, "loss_total": 0.2294052541255951, "step": 1148 }, { "batch_size": 1, "epoch": 0.4592, "step": 1148, "tokens_per_device": 5098 }, { "epoch": 0.4592, "loss_ce": 0.12184298783540726, "loss_lvr": 0.3450872600078583, "loss_mode_switch": 0.0, "loss_total": 0.1563517153263092, "step": 1148 }, { "batch_size": 1, "epoch": 0.4592, "step": 1148, "tokens_per_device": 4932 }, { "epoch": 0.4592, "loss_ce": 0.0017621752340346575, "loss_lvr": 0.3120930790901184, "loss_mode_switch": 0.0, "loss_total": 0.03297148272395134, "step": 1148 }, { "batch_size": 1, "epoch": 0.4592, "step": 1148, "tokens_per_device": 6225 }, { "epoch": 0.4592, "loss_ce": 0.16186921298503876, "loss_lvr": 0.2746199071407318, "loss_mode_switch": 0.0, "loss_total": 0.18933120369911194, "step": 1148 }, { "batch_size": 4, "epoch": 0.4592, "step": 1148, "tokens_per_device": 4360 }, { "epoch": 0.4592, "loss_ce": 0.07722296565771103, "loss_lvr": 0.8702797889709473, "loss_mode_switch": 0.0, "loss_total": 0.1642509400844574, "step": 1148 }, { "batch_size": 1, "epoch": 0.4592, "step": 1148, "tokens_per_device": 5019 }, { "epoch": 0.4592, "loss_ce": 0.02067725732922554, "loss_lvr": 0.28615549206733704, "loss_mode_switch": 0.0, "loss_total": 0.049292806535959244, "step": 1148 }, { "batch_size": 4, "epoch": 0.4592, "step": 1148, "tokens_per_device": 4468 }, { "epoch": 0.4592, "loss_ce": 0.13250844180583954, "loss_lvr": 0.8897934556007385, "loss_mode_switch": 0.0, "loss_total": 0.22148779034614563, "step": 1148 }, { "epoch": 0.4596, "grad_norm": 1.3005082607269287, "learning_rate": 5.89232922628819e-06, "loss": 0.28, "step": 1149 }, { "batch_size": 4, "epoch": 0.4596, "step": 1149, "tokens_per_device": 13956 }, { "epoch": 0.4596, "loss_ce": 0.026374176144599915, "loss_lvr": 1.3709884881973267, "loss_mode_switch": 0.0, "loss_total": 0.16347302496433258, "step": 1149 }, { "batch_size": 1, "epoch": 0.4596, "step": 1149, "tokens_per_device": 5212 }, { "epoch": 0.4596, "loss_ce": 0.0012079095467925072, "loss_lvr": 0.31009846925735474, "loss_mode_switch": 0.0, "loss_total": 0.032217755913734436, "step": 1149 }, { "batch_size": 4, "epoch": 0.4596, "step": 1149, "tokens_per_device": 1244 }, { "epoch": 0.4596, "loss_ce": 0.19494716823101044, "loss_lvr": 1.3427822589874268, "loss_mode_switch": 0.0, "loss_total": 0.3292253911495209, "step": 1149 }, { "batch_size": 4, "epoch": 0.4596, "step": 1149, "tokens_per_device": 4248 }, { "epoch": 0.4596, "loss_ce": 0.06713224947452545, "loss_lvr": 1.1311465501785278, "loss_mode_switch": 0.0, "loss_total": 0.18024690449237823, "step": 1149 }, { "batch_size": 1, "epoch": 0.4596, "step": 1149, "tokens_per_device": 4744 }, { "epoch": 0.4596, "loss_ce": 0.0015437804395332932, "loss_lvr": 0.31345322728157043, "loss_mode_switch": 0.0, "loss_total": 0.03288910165429115, "step": 1149 }, { "batch_size": 4, "epoch": 0.4596, "step": 1149, "tokens_per_device": 4040 }, { "epoch": 0.4596, "loss_ce": 0.24983102083206177, "loss_lvr": 0.8586174249649048, "loss_mode_switch": 0.0, "loss_total": 0.33569276332855225, "step": 1149 }, { "batch_size": 4, "epoch": 0.4596, "step": 1149, "tokens_per_device": 11256 }, { "epoch": 0.4596, "loss_ce": 0.26389047503471375, "loss_lvr": 0.714006781578064, "loss_mode_switch": 0.0, "loss_total": 0.33529114723205566, "step": 1149 }, { "batch_size": 1, "epoch": 0.4596, "step": 1149, "tokens_per_device": 5166 }, { "epoch": 0.4596, "loss_ce": 0.012887676246464252, "loss_lvr": 0.5564025640487671, "loss_mode_switch": 0.0, "loss_total": 0.0685279369354248, "step": 1149 }, { "epoch": 0.46, "grad_norm": 1.2057584524154663, "learning_rate": 5.885954957896115e-06, "loss": 0.2657, "step": 1150 }, { "batch_size": 1, "epoch": 0.46, "step": 1150, "tokens_per_device": 5319 }, { "epoch": 0.46, "loss_ce": 0.01784679852426052, "loss_lvr": 0.27258655428886414, "loss_mode_switch": 0.0, "loss_total": 0.045105453580617905, "step": 1150 }, { "batch_size": 4, "epoch": 0.46, "step": 1150, "tokens_per_device": 3760 }, { "epoch": 0.46, "loss_ce": 0.00027464283630251884, "loss_lvr": 0.6533287167549133, "loss_mode_switch": 0.0, "loss_total": 0.06560751795768738, "step": 1150 }, { "batch_size": 1, "epoch": 0.46, "step": 1150, "tokens_per_device": 4618 }, { "epoch": 0.46, "loss_ce": 0.7140775918960571, "loss_lvr": 0.6223694086074829, "loss_mode_switch": 0.0, "loss_total": 0.7763145565986633, "step": 1150 }, { "batch_size": 4, "epoch": 0.46, "step": 1150, "tokens_per_device": 1440 }, { "epoch": 0.46, "loss_ce": 0.6704601049423218, "loss_lvr": 0.9808753728866577, "loss_mode_switch": 0.0, "loss_total": 0.7685476541519165, "step": 1150 }, { "batch_size": 4, "epoch": 0.46, "step": 1150, "tokens_per_device": 6808 }, { "epoch": 0.46, "loss_ce": 0.005697931163012981, "loss_lvr": 0.7119377255439758, "loss_mode_switch": 0.0, "loss_total": 0.0768917053937912, "step": 1150 }, { "batch_size": 4, "epoch": 0.46, "step": 1150, "tokens_per_device": 4364 }, { "epoch": 0.46, "loss_ce": 0.5702783465385437, "loss_lvr": 1.2924116849899292, "loss_mode_switch": 0.0, "loss_total": 0.6995195150375366, "step": 1150 }, { "batch_size": 4, "epoch": 0.46, "step": 1150, "tokens_per_device": 6680 }, { "epoch": 0.46, "loss_ce": 0.28575509786605835, "loss_lvr": 0.8146128058433533, "loss_mode_switch": 0.0, "loss_total": 0.3672163784503937, "step": 1150 }, { "batch_size": 1, "epoch": 0.46, "step": 1150, "tokens_per_device": 4964 }, { "epoch": 0.46, "loss_ce": 0.48683223128318787, "loss_lvr": 0.4021129906177521, "loss_mode_switch": 0.0, "loss_total": 0.5270435214042664, "step": 1150 }, { "epoch": 0.4604, "grad_norm": 1.2326467037200928, "learning_rate": 5.879579202583223e-06, "loss": 0.2657, "step": 1151 }, { "batch_size": 4, "epoch": 0.4604, "step": 1151, "tokens_per_device": 5796 }, { "epoch": 0.4604, "loss_ce": 0.2337200790643692, "loss_lvr": 0.7676355242729187, "loss_mode_switch": 0.0, "loss_total": 0.3104836344718933, "step": 1151 }, { "batch_size": 4, "epoch": 0.4604, "step": 1151, "tokens_per_device": 2704 }, { "epoch": 0.4604, "loss_ce": 0.33171942830085754, "loss_lvr": 1.016845941543579, "loss_mode_switch": 0.0, "loss_total": 0.43340402841567993, "step": 1151 }, { "batch_size": 4, "epoch": 0.4604, "step": 1151, "tokens_per_device": 4236 }, { "epoch": 0.4604, "loss_ce": 0.1811162680387497, "loss_lvr": 1.058013677597046, "loss_mode_switch": 0.0, "loss_total": 0.28691762685775757, "step": 1151 }, { "batch_size": 4, "epoch": 0.4604, "step": 1151, "tokens_per_device": 3772 }, { "epoch": 0.4604, "loss_ce": 0.0519271083176136, "loss_lvr": 0.9241846203804016, "loss_mode_switch": 0.0, "loss_total": 0.14434556663036346, "step": 1151 }, { "batch_size": 1, "epoch": 0.4604, "step": 1151, "tokens_per_device": 6066 }, { "epoch": 0.4604, "loss_ce": 0.13627466559410095, "loss_lvr": 0.2979869544506073, "loss_mode_switch": 0.0, "loss_total": 0.16607336699962616, "step": 1151 }, { "batch_size": 4, "epoch": 0.4604, "step": 1151, "tokens_per_device": 4968 }, { "epoch": 0.4604, "loss_ce": 0.14892876148223877, "loss_lvr": 0.9297208786010742, "loss_mode_switch": 0.0, "loss_total": 0.24190086126327515, "step": 1151 }, { "batch_size": 1, "epoch": 0.4604, "step": 1151, "tokens_per_device": 4887 }, { "epoch": 0.4604, "loss_ce": 0.7512317299842834, "loss_lvr": 0.4174867570400238, "loss_mode_switch": 0.0, "loss_total": 0.792980432510376, "step": 1151 }, { "batch_size": 4, "epoch": 0.4604, "step": 1151, "tokens_per_device": 7748 }, { "epoch": 0.4604, "loss_ce": 0.006157898344099522, "loss_lvr": 0.7224616408348083, "loss_mode_switch": 0.0, "loss_total": 0.07840406149625778, "step": 1151 }, { "epoch": 0.4608, "grad_norm": 1.4826661348342896, "learning_rate": 5.8732019710501075e-06, "loss": 0.3514, "step": 1152 }, { "batch_size": 1, "epoch": 0.4608, "step": 1152, "tokens_per_device": 4802 }, { "epoch": 0.4608, "loss_ce": 0.03916335105895996, "loss_lvr": 0.36302292346954346, "loss_mode_switch": 0.0, "loss_total": 0.07546564936637878, "step": 1152 }, { "batch_size": 1, "epoch": 0.4608, "step": 1152, "tokens_per_device": 4453 }, { "epoch": 0.4608, "loss_ce": 0.0048636882565915585, "loss_lvr": 0.4817095100879669, "loss_mode_switch": 0.0, "loss_total": 0.05303463712334633, "step": 1152 }, { "batch_size": 1, "epoch": 0.4608, "step": 1152, "tokens_per_device": 4532 }, { "epoch": 0.4608, "loss_ce": 0.5115567445755005, "loss_lvr": 0.3542190492153168, "loss_mode_switch": 0.0, "loss_total": 0.5469786524772644, "step": 1152 }, { "batch_size": 4, "epoch": 0.4608, "step": 1152, "tokens_per_device": 4364 }, { "epoch": 0.4608, "loss_ce": 0.09087922424077988, "loss_lvr": 0.9334754347801208, "loss_mode_switch": 0.0, "loss_total": 0.18422676622867584, "step": 1152 }, { "batch_size": 4, "epoch": 0.4608, "step": 1152, "tokens_per_device": 4232 }, { "epoch": 0.4608, "loss_ce": 0.04347221925854683, "loss_lvr": 1.7585718631744385, "loss_mode_switch": 0.0, "loss_total": 0.21932940185070038, "step": 1152 }, { "batch_size": 4, "epoch": 0.4608, "step": 1152, "tokens_per_device": 1392 }, { "epoch": 0.4608, "loss_ce": 0.30633264780044556, "loss_lvr": 1.2997889518737793, "loss_mode_switch": 0.0, "loss_total": 0.4363115429878235, "step": 1152 }, { "batch_size": 4, "epoch": 0.4608, "step": 1152, "tokens_per_device": 4176 }, { "epoch": 0.4608, "loss_ce": 0.04808323085308075, "loss_lvr": 0.7182034254074097, "loss_mode_switch": 0.0, "loss_total": 0.1199035719037056, "step": 1152 }, { "batch_size": 4, "epoch": 0.4608, "step": 1152, "tokens_per_device": 3772 }, { "epoch": 0.4608, "loss_ce": 0.38258010149002075, "loss_lvr": 0.7350078225135803, "loss_mode_switch": 0.0, "loss_total": 0.4560808837413788, "step": 1152 }, { "epoch": 0.4612, "grad_norm": 1.2671560049057007, "learning_rate": 5.866823273999839e-06, "loss": 0.2657, "step": 1153 }, { "batch_size": 4, "epoch": 0.4612, "step": 1153, "tokens_per_device": 4200 }, { "epoch": 0.4612, "loss_ce": 0.2483278065919876, "loss_lvr": 0.9958549737930298, "loss_mode_switch": 0.0, "loss_total": 0.34791329503059387, "step": 1153 }, { "batch_size": 4, "epoch": 0.4612, "step": 1153, "tokens_per_device": 13108 }, { "epoch": 0.4612, "loss_ce": 0.4491730332374573, "loss_lvr": 0.41719773411750793, "loss_mode_switch": 0.0, "loss_total": 0.49089279770851135, "step": 1153 }, { "batch_size": 1, "epoch": 0.4612, "step": 1153, "tokens_per_device": 4973 }, { "epoch": 0.4612, "loss_ce": 0.006377984303981066, "loss_lvr": 0.38516008853912354, "loss_mode_switch": 0.0, "loss_total": 0.04489399492740631, "step": 1153 }, { "batch_size": 4, "epoch": 0.4612, "step": 1153, "tokens_per_device": 2680 }, { "epoch": 0.4612, "loss_ce": 0.9830681085586548, "loss_lvr": 0.9017500877380371, "loss_mode_switch": 0.0, "loss_total": 1.0732431411743164, "step": 1153 }, { "batch_size": 1, "epoch": 0.4612, "step": 1153, "tokens_per_device": 5186 }, { "epoch": 0.4612, "loss_ce": 0.0009921836899593472, "loss_lvr": 0.5414858460426331, "loss_mode_switch": 0.0, "loss_total": 0.055140767246484756, "step": 1153 }, { "batch_size": 4, "epoch": 0.4612, "step": 1153, "tokens_per_device": 4220 }, { "epoch": 0.4612, "loss_ce": 0.19429340958595276, "loss_lvr": 1.115140676498413, "loss_mode_switch": 0.0, "loss_total": 0.3058074712753296, "step": 1153 }, { "batch_size": 1, "epoch": 0.4612, "step": 1153, "tokens_per_device": 4848 }, { "epoch": 0.4612, "loss_ce": 0.48225700855255127, "loss_lvr": 0.5974984765052795, "loss_mode_switch": 0.0, "loss_total": 0.5420068502426147, "step": 1153 }, { "batch_size": 1, "epoch": 0.4612, "step": 1153, "tokens_per_device": 5180 }, { "epoch": 0.4612, "loss_ce": 0.06653502583503723, "loss_lvr": 0.2794721722602844, "loss_mode_switch": 0.0, "loss_total": 0.09448224306106567, "step": 1153 }, { "epoch": 0.4616, "grad_norm": 1.7481615543365479, "learning_rate": 5.860443122137946e-06, "loss": 0.2809, "step": 1154 }, { "batch_size": 1, "epoch": 0.4616, "step": 1154, "tokens_per_device": 4985 }, { "epoch": 0.4616, "loss_ce": 0.010535857640206814, "loss_lvr": 0.2243204414844513, "loss_mode_switch": 0.0, "loss_total": 0.03296790271997452, "step": 1154 }, { "batch_size": 4, "epoch": 0.4616, "step": 1154, "tokens_per_device": 9544 }, { "epoch": 0.4616, "loss_ce": 0.17021897435188293, "loss_lvr": 0.6480066776275635, "loss_mode_switch": 0.0, "loss_total": 0.23501965403556824, "step": 1154 }, { "batch_size": 4, "epoch": 0.4616, "step": 1154, "tokens_per_device": 1456 }, { "epoch": 0.4616, "loss_ce": 0.2199530452489853, "loss_lvr": 0.9615508317947388, "loss_mode_switch": 0.0, "loss_total": 0.3161081373691559, "step": 1154 }, { "batch_size": 4, "epoch": 0.4616, "step": 1154, "tokens_per_device": 5180 }, { "epoch": 0.4616, "loss_ce": 0.25795477628707886, "loss_lvr": 0.9093466401100159, "loss_mode_switch": 0.0, "loss_total": 0.34888944029808044, "step": 1154 }, { "batch_size": 4, "epoch": 0.4616, "step": 1154, "tokens_per_device": 4592 }, { "epoch": 0.4616, "loss_ce": 0.2109440267086029, "loss_lvr": 0.7620139718055725, "loss_mode_switch": 0.0, "loss_total": 0.2871454358100891, "step": 1154 }, { "batch_size": 4, "epoch": 0.4616, "step": 1154, "tokens_per_device": 1212 }, { "epoch": 0.4616, "loss_ce": 0.22581900656223297, "loss_lvr": 1.2079815864562988, "loss_mode_switch": 0.0, "loss_total": 0.3466171622276306, "step": 1154 }, { "batch_size": 4, "epoch": 0.4616, "step": 1154, "tokens_per_device": 5556 }, { "epoch": 0.4616, "loss_ce": 0.144191712141037, "loss_lvr": 0.7360280156135559, "loss_mode_switch": 0.0, "loss_total": 0.2177945077419281, "step": 1154 }, { "batch_size": 1, "epoch": 0.4616, "step": 1154, "tokens_per_device": 5144 }, { "epoch": 0.4616, "loss_ce": 0.03623542562127113, "loss_lvr": 0.32397395372390747, "loss_mode_switch": 0.0, "loss_total": 0.0686328262090683, "step": 1154 }, { "epoch": 0.462, "grad_norm": 1.2888532876968384, "learning_rate": 5.854061526172402e-06, "loss": 0.2765, "step": 1155 }, { "batch_size": 4, "epoch": 0.462, "step": 1155, "tokens_per_device": 4544 }, { "epoch": 0.462, "loss_ce": 0.04115903005003929, "loss_lvr": 0.866542398929596, "loss_mode_switch": 0.0, "loss_total": 0.12781326472759247, "step": 1155 }, { "batch_size": 1, "epoch": 0.462, "step": 1155, "tokens_per_device": 4893 }, { "epoch": 0.462, "loss_ce": 0.2266061156988144, "loss_lvr": 0.39067479968070984, "loss_mode_switch": 0.0, "loss_total": 0.26567360758781433, "step": 1155 }, { "batch_size": 4, "epoch": 0.462, "step": 1155, "tokens_per_device": 9768 }, { "epoch": 0.462, "loss_ce": 0.3732205927371979, "loss_lvr": 0.8289322257041931, "loss_mode_switch": 0.0, "loss_total": 0.4561138153076172, "step": 1155 }, { "batch_size": 4, "epoch": 0.462, "step": 1155, "tokens_per_device": 1244 }, { "epoch": 0.462, "loss_ce": 0.25764644145965576, "loss_lvr": 0.9423067569732666, "loss_mode_switch": 0.0, "loss_total": 0.3518771231174469, "step": 1155 }, { "batch_size": 1, "epoch": 0.462, "step": 1155, "tokens_per_device": 5013 }, { "epoch": 0.462, "loss_ce": 0.0007879922632128, "loss_lvr": 0.7357856035232544, "loss_mode_switch": 0.0, "loss_total": 0.07436655461788177, "step": 1155 }, { "batch_size": 4, "epoch": 0.462, "step": 1155, "tokens_per_device": 3812 }, { "epoch": 0.462, "loss_ce": 0.4552813768386841, "loss_lvr": 0.9327276349067688, "loss_mode_switch": 0.0, "loss_total": 0.5485541224479675, "step": 1155 }, { "batch_size": 4, "epoch": 0.462, "step": 1155, "tokens_per_device": 2788 }, { "epoch": 0.462, "loss_ce": 0.5407870411872864, "loss_lvr": 0.7362825274467468, "loss_mode_switch": 0.0, "loss_total": 0.6144152879714966, "step": 1155 }, { "batch_size": 4, "epoch": 0.462, "step": 1155, "tokens_per_device": 5028 }, { "epoch": 0.462, "loss_ce": 0.15971814095973969, "loss_lvr": 0.6747965812683105, "loss_mode_switch": 0.0, "loss_total": 0.2271977961063385, "step": 1155 }, { "epoch": 0.4624, "grad_norm": 1.3715898990631104, "learning_rate": 5.847678496813601e-06, "loss": 0.2477, "step": 1156 }, { "batch_size": 4, "epoch": 0.4624, "step": 1156, "tokens_per_device": 3788 }, { "epoch": 0.4624, "loss_ce": 0.4312991201877594, "loss_lvr": 0.7963308095932007, "loss_mode_switch": 0.0, "loss_total": 0.510932207107544, "step": 1156 }, { "batch_size": 1, "epoch": 0.4624, "step": 1156, "tokens_per_device": 5120 }, { "epoch": 0.4624, "loss_ce": 0.0026832076255232096, "loss_lvr": 0.523227334022522, "loss_mode_switch": 0.0, "loss_total": 0.05500594154000282, "step": 1156 }, { "batch_size": 4, "epoch": 0.4624, "step": 1156, "tokens_per_device": 4600 }, { "epoch": 0.4624, "loss_ce": 0.17238563299179077, "loss_lvr": 0.9341941475868225, "loss_mode_switch": 0.0, "loss_total": 0.26580506563186646, "step": 1156 }, { "batch_size": 1, "epoch": 0.4624, "step": 1156, "tokens_per_device": 5104 }, { "epoch": 0.4624, "loss_ce": 0.01433646772056818, "loss_lvr": 0.7164193987846375, "loss_mode_switch": 0.0, "loss_total": 0.08597841113805771, "step": 1156 }, { "batch_size": 4, "epoch": 0.4624, "step": 1156, "tokens_per_device": 2648 }, { "epoch": 0.4624, "loss_ce": 0.04414292052388191, "loss_lvr": 0.6522937417030334, "loss_mode_switch": 0.0, "loss_total": 0.10937230288982391, "step": 1156 }, { "batch_size": 1, "epoch": 0.4624, "step": 1156, "tokens_per_device": 5069 }, { "epoch": 0.4624, "loss_ce": 0.007473950739949942, "loss_lvr": 0.5425166487693787, "loss_mode_switch": 0.0, "loss_total": 0.061725616455078125, "step": 1156 }, { "batch_size": 4, "epoch": 0.4624, "step": 1156, "tokens_per_device": 1260 }, { "epoch": 0.4624, "loss_ce": 0.2179279923439026, "loss_lvr": 1.1200535297393799, "loss_mode_switch": 0.0, "loss_total": 0.3299333453178406, "step": 1156 }, { "batch_size": 1, "epoch": 0.4624, "step": 1156, "tokens_per_device": 5084 }, { "epoch": 0.4624, "loss_ce": 0.0003692788886837661, "loss_lvr": 0.4309699237346649, "loss_mode_switch": 0.0, "loss_total": 0.043466273695230484, "step": 1156 }, { "epoch": 0.4628, "grad_norm": 1.1330082416534424, "learning_rate": 5.841294044774346e-06, "loss": 0.2774, "step": 1157 }, { "batch_size": 1, "epoch": 0.4628, "step": 1157, "tokens_per_device": 5097 }, { "epoch": 0.4628, "loss_ce": 0.04056641831994057, "loss_lvr": 0.4958251416683197, "loss_mode_switch": 0.0, "loss_total": 0.0901489332318306, "step": 1157 }, { "batch_size": 1, "epoch": 0.4628, "step": 1157, "tokens_per_device": 4768 }, { "epoch": 0.4628, "loss_ce": 0.14901013672351837, "loss_lvr": 0.451145201921463, "loss_mode_switch": 0.0, "loss_total": 0.19412465393543243, "step": 1157 }, { "batch_size": 4, "epoch": 0.4628, "step": 1157, "tokens_per_device": 3788 }, { "epoch": 0.4628, "loss_ce": 0.23601138591766357, "loss_lvr": 1.0397928953170776, "loss_mode_switch": 0.0, "loss_total": 0.33999067544937134, "step": 1157 }, { "batch_size": 4, "epoch": 0.4628, "step": 1157, "tokens_per_device": 3856 }, { "epoch": 0.4628, "loss_ce": 0.4487682282924652, "loss_lvr": 0.8415692448616028, "loss_mode_switch": 0.0, "loss_total": 0.5329251289367676, "step": 1157 }, { "batch_size": 4, "epoch": 0.4628, "step": 1157, "tokens_per_device": 3816 }, { "epoch": 0.4628, "loss_ce": 0.47342753410339355, "loss_lvr": 0.8499150276184082, "loss_mode_switch": 0.0, "loss_total": 0.5584190487861633, "step": 1157 }, { "batch_size": 4, "epoch": 0.4628, "step": 1157, "tokens_per_device": 5752 }, { "epoch": 0.4628, "loss_ce": 0.4570549428462982, "loss_lvr": 0.7288291454315186, "loss_mode_switch": 0.0, "loss_total": 0.5299378633499146, "step": 1157 }, { "batch_size": 1, "epoch": 0.4628, "step": 1157, "tokens_per_device": 5237 }, { "epoch": 0.4628, "loss_ce": 0.028854137286543846, "loss_lvr": 0.38088738918304443, "loss_mode_switch": 0.0, "loss_total": 0.06694287806749344, "step": 1157 }, { "batch_size": 4, "epoch": 0.4628, "step": 1157, "tokens_per_device": 1228 }, { "epoch": 0.4628, "loss_ce": 0.11324761807918549, "loss_lvr": 0.9198242425918579, "loss_mode_switch": 0.0, "loss_total": 0.20523004233837128, "step": 1157 }, { "epoch": 0.4632, "grad_norm": 1.3610173463821411, "learning_rate": 5.834908180769824e-06, "loss": 0.2951, "step": 1158 }, { "batch_size": 1, "epoch": 0.4632, "step": 1158, "tokens_per_device": 4971 }, { "epoch": 0.4632, "loss_ce": 0.08164606243371964, "loss_lvr": 0.6800042390823364, "loss_mode_switch": 0.0, "loss_total": 0.14964649081230164, "step": 1158 }, { "batch_size": 4, "epoch": 0.4632, "step": 1158, "tokens_per_device": 4548 }, { "epoch": 0.4632, "loss_ce": 0.18271639943122864, "loss_lvr": 0.8905636668205261, "loss_mode_switch": 0.0, "loss_total": 0.2717727720737457, "step": 1158 }, { "batch_size": 4, "epoch": 0.4632, "step": 1158, "tokens_per_device": 6804 }, { "epoch": 0.4632, "loss_ce": 0.34179383516311646, "loss_lvr": 0.6623014211654663, "loss_mode_switch": 0.0, "loss_total": 0.40802398324012756, "step": 1158 }, { "batch_size": 1, "epoch": 0.4632, "step": 1158, "tokens_per_device": 4765 }, { "epoch": 0.4632, "loss_ce": 0.09861084818840027, "loss_lvr": 0.4424227476119995, "loss_mode_switch": 0.0, "loss_total": 0.14285312592983246, "step": 1158 }, { "batch_size": 1, "epoch": 0.4632, "step": 1158, "tokens_per_device": 4384 }, { "epoch": 0.4632, "loss_ce": 0.23062382638454437, "loss_lvr": 0.44844573736190796, "loss_mode_switch": 0.0, "loss_total": 0.2754684090614319, "step": 1158 }, { "batch_size": 1, "epoch": 0.4632, "step": 1158, "tokens_per_device": 6357 }, { "epoch": 0.4632, "loss_ce": 0.014086728915572166, "loss_lvr": 0.2762153446674347, "loss_mode_switch": 0.0, "loss_total": 0.041708264499902725, "step": 1158 }, { "batch_size": 4, "epoch": 0.4632, "step": 1158, "tokens_per_device": 1412 }, { "epoch": 0.4632, "loss_ce": 0.1795598566532135, "loss_lvr": 1.5745913982391357, "loss_mode_switch": 0.0, "loss_total": 0.3370189964771271, "step": 1158 }, { "batch_size": 4, "epoch": 0.4632, "step": 1158, "tokens_per_device": 3860 }, { "epoch": 0.4632, "loss_ce": 0.4925641715526581, "loss_lvr": 0.9396536946296692, "loss_mode_switch": 0.0, "loss_total": 0.586529552936554, "step": 1158 }, { "epoch": 0.4636, "grad_norm": 1.4052621126174927, "learning_rate": 5.828520915517593e-06, "loss": 0.3009, "step": 1159 }, { "batch_size": 4, "epoch": 0.4636, "step": 1159, "tokens_per_device": 4868 }, { "epoch": 0.4636, "loss_ce": 0.17296531796455383, "loss_lvr": 0.8093742728233337, "loss_mode_switch": 0.0, "loss_total": 0.25390273332595825, "step": 1159 }, { "batch_size": 1, "epoch": 0.4636, "step": 1159, "tokens_per_device": 5151 }, { "epoch": 0.4636, "loss_ce": 0.01673761010169983, "loss_lvr": 0.22242684662342072, "loss_mode_switch": 0.0, "loss_total": 0.03898029774427414, "step": 1159 }, { "batch_size": 4, "epoch": 0.4636, "step": 1159, "tokens_per_device": 4752 }, { "epoch": 0.4636, "loss_ce": 0.4801420569419861, "loss_lvr": 0.8438882231712341, "loss_mode_switch": 0.0, "loss_total": 0.5645308494567871, "step": 1159 }, { "batch_size": 4, "epoch": 0.4636, "step": 1159, "tokens_per_device": 1568 }, { "epoch": 0.4636, "loss_ce": 0.2341182380914688, "loss_lvr": 0.8664387464523315, "loss_mode_switch": 0.0, "loss_total": 0.32076209783554077, "step": 1159 }, { "batch_size": 4, "epoch": 0.4636, "step": 1159, "tokens_per_device": 5564 }, { "epoch": 0.4636, "loss_ce": 0.245143324136734, "loss_lvr": 0.6116431355476379, "loss_mode_switch": 0.0, "loss_total": 0.3063076436519623, "step": 1159 }, { "batch_size": 1, "epoch": 0.4636, "step": 1159, "tokens_per_device": 4905 }, { "epoch": 0.4636, "loss_ce": 0.015936922281980515, "loss_lvr": 0.7611832022666931, "loss_mode_switch": 0.0, "loss_total": 0.09205524623394012, "step": 1159 }, { "batch_size": 4, "epoch": 0.4636, "step": 1159, "tokens_per_device": 1328 }, { "epoch": 0.4636, "loss_ce": 0.185787171125412, "loss_lvr": 1.0950161218643188, "loss_mode_switch": 0.0, "loss_total": 0.2952888011932373, "step": 1159 }, { "batch_size": 4, "epoch": 0.4636, "step": 1159, "tokens_per_device": 3896 }, { "epoch": 0.4636, "loss_ce": 0.45892977714538574, "loss_lvr": 0.8567330241203308, "loss_mode_switch": 0.0, "loss_total": 0.5446031093597412, "step": 1159 }, { "epoch": 0.464, "grad_norm": 1.342966079711914, "learning_rate": 5.822132259737565e-06, "loss": 0.2903, "step": 1160 }, { "batch_size": 1, "epoch": 0.464, "step": 1160, "tokens_per_device": 4871 }, { "epoch": 0.464, "loss_ce": 0.010572138242423534, "loss_lvr": 0.34441277384757996, "loss_mode_switch": 0.0, "loss_total": 0.045013416558504105, "step": 1160 }, { "batch_size": 4, "epoch": 0.464, "step": 1160, "tokens_per_device": 6076 }, { "epoch": 0.464, "loss_ce": 0.3025659918785095, "loss_lvr": 0.7184258103370667, "loss_mode_switch": 0.0, "loss_total": 0.3744085729122162, "step": 1160 }, { "batch_size": 1, "epoch": 0.464, "step": 1160, "tokens_per_device": 4912 }, { "epoch": 0.464, "loss_ce": 0.04795978590846062, "loss_lvr": 0.5124215483665466, "loss_mode_switch": 0.0, "loss_total": 0.09920194000005722, "step": 1160 }, { "batch_size": 4, "epoch": 0.464, "step": 1160, "tokens_per_device": 4612 }, { "epoch": 0.464, "loss_ce": 0.07902882993221283, "loss_lvr": 0.8397417664527893, "loss_mode_switch": 0.0, "loss_total": 0.16300299763679504, "step": 1160 }, { "batch_size": 4, "epoch": 0.464, "step": 1160, "tokens_per_device": 4416 }, { "epoch": 0.464, "loss_ce": 0.30507373809814453, "loss_lvr": 0.9468852877616882, "loss_mode_switch": 0.0, "loss_total": 0.39976227283477783, "step": 1160 }, { "batch_size": 4, "epoch": 0.464, "step": 1160, "tokens_per_device": 3612 }, { "epoch": 0.464, "loss_ce": 0.5209053158760071, "loss_lvr": 0.8392143249511719, "loss_mode_switch": 0.0, "loss_total": 0.6048267483711243, "step": 1160 }, { "batch_size": 1, "epoch": 0.464, "step": 1160, "tokens_per_device": 4779 }, { "epoch": 0.464, "loss_ce": 0.0026289757806807756, "loss_lvr": 0.45138564705848694, "loss_mode_switch": 0.0, "loss_total": 0.04776753857731819, "step": 1160 }, { "batch_size": 4, "epoch": 0.464, "step": 1160, "tokens_per_device": 5772 }, { "epoch": 0.464, "loss_ce": 0.2719040513038635, "loss_lvr": 0.6836040019989014, "loss_mode_switch": 0.0, "loss_total": 0.3402644395828247, "step": 1160 }, { "epoch": 0.4644, "grad_norm": 1.348775029182434, "learning_rate": 5.815742224151982e-06, "loss": 0.289, "step": 1161 }, { "batch_size": 4, "epoch": 0.4644, "step": 1161, "tokens_per_device": 4256 }, { "epoch": 0.4644, "loss_ce": 0.0023358024191111326, "loss_lvr": 0.7437164783477783, "loss_mode_switch": 0.0, "loss_total": 0.07670745253562927, "step": 1161 }, { "batch_size": 4, "epoch": 0.4644, "step": 1161, "tokens_per_device": 6048 }, { "epoch": 0.4644, "loss_ce": 0.5230787396430969, "loss_lvr": 0.8384429812431335, "loss_mode_switch": 0.0, "loss_total": 0.6069230437278748, "step": 1161 }, { "batch_size": 4, "epoch": 0.4644, "step": 1161, "tokens_per_device": 1876 }, { "epoch": 0.4644, "loss_ce": 0.7784652709960938, "loss_lvr": 0.9160242080688477, "loss_mode_switch": 0.0, "loss_total": 0.8700677156448364, "step": 1161 }, { "batch_size": 1, "epoch": 0.4644, "step": 1161, "tokens_per_device": 4943 }, { "epoch": 0.4644, "loss_ce": 0.2968415319919586, "loss_lvr": 0.5420133471488953, "loss_mode_switch": 0.0, "loss_total": 0.35104286670684814, "step": 1161 }, { "batch_size": 4, "epoch": 0.4644, "step": 1161, "tokens_per_device": 1412 }, { "epoch": 0.4644, "loss_ce": 0.5523726940155029, "loss_lvr": 1.803769826889038, "loss_mode_switch": 0.0, "loss_total": 0.7327497005462646, "step": 1161 }, { "batch_size": 4, "epoch": 0.4644, "step": 1161, "tokens_per_device": 6464 }, { "epoch": 0.4644, "loss_ce": 0.02940310910344124, "loss_lvr": 0.7275006771087646, "loss_mode_switch": 0.0, "loss_total": 0.10215318202972412, "step": 1161 }, { "batch_size": 4, "epoch": 0.4644, "step": 1161, "tokens_per_device": 1928 }, { "epoch": 0.4644, "loss_ce": 0.1891125738620758, "loss_lvr": 1.2543498277664185, "loss_mode_switch": 0.0, "loss_total": 0.3145475387573242, "step": 1161 }, { "batch_size": 4, "epoch": 0.4644, "step": 1161, "tokens_per_device": 4700 }, { "epoch": 0.4644, "loss_ce": 0.07512710988521576, "loss_lvr": 0.7364553809165955, "loss_mode_switch": 0.0, "loss_total": 0.14877265691757202, "step": 1161 }, { "epoch": 0.4648, "grad_norm": 1.3426390886306763, "learning_rate": 5.809350819485408e-06, "loss": 0.2939, "step": 1162 }, { "batch_size": 4, "epoch": 0.4648, "step": 1162, "tokens_per_device": 5184 }, { "epoch": 0.4648, "loss_ce": 0.014322749339044094, "loss_lvr": 0.8623445630073547, "loss_mode_switch": 0.0, "loss_total": 0.10055720806121826, "step": 1162 }, { "batch_size": 4, "epoch": 0.4648, "step": 1162, "tokens_per_device": 5640 }, { "epoch": 0.4648, "loss_ce": 0.04584691300988197, "loss_lvr": 0.7979046106338501, "loss_mode_switch": 0.0, "loss_total": 0.12563738226890564, "step": 1162 }, { "batch_size": 1, "epoch": 0.4648, "step": 1162, "tokens_per_device": 5059 }, { "epoch": 0.4648, "loss_ce": 0.11575255542993546, "loss_lvr": 0.2192990928888321, "loss_mode_switch": 0.0, "loss_total": 0.1376824676990509, "step": 1162 }, { "batch_size": 1, "epoch": 0.4648, "step": 1162, "tokens_per_device": 5148 }, { "epoch": 0.4648, "loss_ce": 0.1296900063753128, "loss_lvr": 0.3218827247619629, "loss_mode_switch": 0.0, "loss_total": 0.1618782877922058, "step": 1162 }, { "batch_size": 4, "epoch": 0.4648, "step": 1162, "tokens_per_device": 2980 }, { "epoch": 0.4648, "loss_ce": 0.3156748414039612, "loss_lvr": 0.528618574142456, "loss_mode_switch": 0.0, "loss_total": 0.36853671073913574, "step": 1162 }, { "batch_size": 1, "epoch": 0.4648, "step": 1162, "tokens_per_device": 5052 }, { "epoch": 0.4648, "loss_ce": 0.020817995071411133, "loss_lvr": 0.1709788739681244, "loss_mode_switch": 0.0, "loss_total": 0.03791588544845581, "step": 1162 }, { "batch_size": 4, "epoch": 0.4648, "step": 1162, "tokens_per_device": 9136 }, { "epoch": 0.4648, "loss_ce": 0.04297061264514923, "loss_lvr": 0.6948668956756592, "loss_mode_switch": 0.0, "loss_total": 0.11245730519294739, "step": 1162 }, { "batch_size": 1, "epoch": 0.4648, "step": 1162, "tokens_per_device": 5878 }, { "epoch": 0.4648, "loss_ce": 0.002103835577145219, "loss_lvr": 0.38518911600112915, "loss_mode_switch": 0.0, "loss_total": 0.04062274843454361, "step": 1162 }, { "epoch": 0.4652, "grad_norm": 1.2537815570831299, "learning_rate": 5.802958056464694e-06, "loss": 0.2519, "step": 1163 }, { "batch_size": 4, "epoch": 0.4652, "step": 1163, "tokens_per_device": 1252 }, { "epoch": 0.4652, "loss_ce": 0.2756716310977936, "loss_lvr": 1.138005018234253, "loss_mode_switch": 0.0, "loss_total": 0.3894721269607544, "step": 1163 }, { "batch_size": 4, "epoch": 0.4652, "step": 1163, "tokens_per_device": 5988 }, { "epoch": 0.4652, "loss_ce": 0.029602443799376488, "loss_lvr": 0.7574336528778076, "loss_mode_switch": 0.0, "loss_total": 0.10534581542015076, "step": 1163 }, { "batch_size": 4, "epoch": 0.4652, "step": 1163, "tokens_per_device": 15660 }, { "epoch": 0.4652, "loss_ce": 0.2654741704463959, "loss_lvr": 0.7120023369789124, "loss_mode_switch": 0.0, "loss_total": 0.33667439222335815, "step": 1163 }, { "batch_size": 4, "epoch": 0.4652, "step": 1163, "tokens_per_device": 4276 }, { "epoch": 0.4652, "loss_ce": 0.16100233793258667, "loss_lvr": 1.0318101644515991, "loss_mode_switch": 0.0, "loss_total": 0.2641833424568176, "step": 1163 }, { "batch_size": 1, "epoch": 0.4652, "step": 1163, "tokens_per_device": 5114 }, { "epoch": 0.4652, "loss_ce": 0.009538199752569199, "loss_lvr": 0.4839058220386505, "loss_mode_switch": 0.0, "loss_total": 0.05792878195643425, "step": 1163 }, { "batch_size": 4, "epoch": 0.4652, "step": 1163, "tokens_per_device": 4312 }, { "epoch": 0.4652, "loss_ce": 0.12360669672489166, "loss_lvr": 0.7005561590194702, "loss_mode_switch": 0.0, "loss_total": 0.19366231560707092, "step": 1163 }, { "batch_size": 1, "epoch": 0.4652, "step": 1163, "tokens_per_device": 7223 }, { "epoch": 0.4652, "loss_ce": 0.0008178381831385195, "loss_lvr": 0.25701192021369934, "loss_mode_switch": 0.0, "loss_total": 0.026519032195210457, "step": 1163 }, { "batch_size": 4, "epoch": 0.4652, "step": 1163, "tokens_per_device": 5968 }, { "epoch": 0.4652, "loss_ce": 0.29225364327430725, "loss_lvr": 0.8482795357704163, "loss_mode_switch": 0.0, "loss_total": 0.37708160281181335, "step": 1163 }, { "epoch": 0.4656, "grad_norm": 1.3127739429473877, "learning_rate": 5.796563945818984e-06, "loss": 0.2798, "step": 1164 }, { "batch_size": 4, "epoch": 0.4656, "step": 1164, "tokens_per_device": 3776 }, { "epoch": 0.4656, "loss_ce": 0.3059084117412567, "loss_lvr": 1.0961174964904785, "loss_mode_switch": 0.0, "loss_total": 0.41552016139030457, "step": 1164 }, { "batch_size": 4, "epoch": 0.4656, "step": 1164, "tokens_per_device": 13376 }, { "epoch": 0.4656, "loss_ce": 0.4185166656970978, "loss_lvr": 0.8276747465133667, "loss_mode_switch": 0.0, "loss_total": 0.501284122467041, "step": 1164 }, { "batch_size": 4, "epoch": 0.4656, "step": 1164, "tokens_per_device": 6672 }, { "epoch": 0.4656, "loss_ce": 0.7783690094947815, "loss_lvr": 0.7708211541175842, "loss_mode_switch": 0.0, "loss_total": 0.8554511070251465, "step": 1164 }, { "batch_size": 4, "epoch": 0.4656, "step": 1164, "tokens_per_device": 3224 }, { "epoch": 0.4656, "loss_ce": 0.40764814615249634, "loss_lvr": 0.9235190749168396, "loss_mode_switch": 0.0, "loss_total": 0.5000000596046448, "step": 1164 }, { "batch_size": 4, "epoch": 0.4656, "step": 1164, "tokens_per_device": 4336 }, { "epoch": 0.4656, "loss_ce": 0.5225158333778381, "loss_lvr": 1.1843150854110718, "loss_mode_switch": 0.0, "loss_total": 0.6409473419189453, "step": 1164 }, { "batch_size": 1, "epoch": 0.4656, "step": 1164, "tokens_per_device": 4879 }, { "epoch": 0.4656, "loss_ce": 0.08198294788599014, "loss_lvr": 0.2581199109554291, "loss_mode_switch": 0.0, "loss_total": 0.10779494047164917, "step": 1164 }, { "batch_size": 4, "epoch": 0.4656, "step": 1164, "tokens_per_device": 1616 }, { "epoch": 0.4656, "loss_ce": 0.1233268529176712, "loss_lvr": 1.0954244136810303, "loss_mode_switch": 0.0, "loss_total": 0.23286929726600647, "step": 1164 }, { "batch_size": 1, "epoch": 0.4656, "step": 1164, "tokens_per_device": 4627 }, { "epoch": 0.4656, "loss_ce": 0.6314985752105713, "loss_lvr": 0.507029116153717, "loss_mode_switch": 0.0, "loss_total": 0.6822015047073364, "step": 1164 }, { "epoch": 0.466, "grad_norm": 1.3278748989105225, "learning_rate": 5.7901684982796716e-06, "loss": 0.3218, "step": 1165 }, { "batch_size": 4, "epoch": 0.466, "step": 1165, "tokens_per_device": 5064 }, { "epoch": 0.466, "loss_ce": 0.43201011419296265, "loss_lvr": 0.9069691896438599, "loss_mode_switch": 0.0, "loss_total": 0.5227070450782776, "step": 1165 }, { "batch_size": 1, "epoch": 0.466, "step": 1165, "tokens_per_device": 5161 }, { "epoch": 0.466, "loss_ce": 0.22167038917541504, "loss_lvr": 0.23557274043560028, "loss_mode_switch": 0.0, "loss_total": 0.2452276647090912, "step": 1165 }, { "batch_size": 4, "epoch": 0.466, "step": 1165, "tokens_per_device": 2584 }, { "epoch": 0.466, "loss_ce": 0.7891861200332642, "loss_lvr": 0.9987167119979858, "loss_mode_switch": 0.0, "loss_total": 0.8890578150749207, "step": 1165 }, { "batch_size": 4, "epoch": 0.466, "step": 1165, "tokens_per_device": 1420 }, { "epoch": 0.466, "loss_ce": 0.5387890338897705, "loss_lvr": 1.056296706199646, "loss_mode_switch": 0.0, "loss_total": 0.6444187164306641, "step": 1165 }, { "batch_size": 1, "epoch": 0.466, "step": 1165, "tokens_per_device": 5497 }, { "epoch": 0.466, "loss_ce": 0.12337184697389603, "loss_lvr": 0.3836003541946411, "loss_mode_switch": 0.0, "loss_total": 0.16173188388347626, "step": 1165 }, { "batch_size": 1, "epoch": 0.466, "step": 1165, "tokens_per_device": 5117 }, { "epoch": 0.466, "loss_ce": 0.0159650519490242, "loss_lvr": 0.5505523085594177, "loss_mode_switch": 0.0, "loss_total": 0.07102028280496597, "step": 1165 }, { "batch_size": 1, "epoch": 0.466, "step": 1165, "tokens_per_device": 5080 }, { "epoch": 0.466, "loss_ce": 0.08234503120183945, "loss_lvr": 0.34985387325286865, "loss_mode_switch": 0.0, "loss_total": 0.11733041703701019, "step": 1165 }, { "batch_size": 1, "epoch": 0.466, "step": 1165, "tokens_per_device": 4908 }, { "epoch": 0.466, "loss_ce": 0.2277701497077942, "loss_lvr": 0.4938768744468689, "loss_mode_switch": 0.0, "loss_total": 0.27715784311294556, "step": 1165 }, { "epoch": 0.4664, "grad_norm": 1.2819541692733765, "learning_rate": 5.783771724580405e-06, "loss": 0.3367, "step": 1166 }, { "batch_size": 1, "epoch": 0.4664, "step": 1166, "tokens_per_device": 4879 }, { "epoch": 0.4664, "loss_ce": 0.02322002500295639, "loss_lvr": 0.2186528444290161, "loss_mode_switch": 0.0, "loss_total": 0.04508531093597412, "step": 1166 }, { "batch_size": 4, "epoch": 0.4664, "step": 1166, "tokens_per_device": 2716 }, { "epoch": 0.4664, "loss_ce": 0.284649133682251, "loss_lvr": 0.71900874376297, "loss_mode_switch": 0.0, "loss_total": 0.356550008058548, "step": 1166 }, { "batch_size": 4, "epoch": 0.4664, "step": 1166, "tokens_per_device": 3772 }, { "epoch": 0.4664, "loss_ce": 0.016539521515369415, "loss_lvr": 0.821743369102478, "loss_mode_switch": 0.0, "loss_total": 0.09871385991573334, "step": 1166 }, { "batch_size": 4, "epoch": 0.4664, "step": 1166, "tokens_per_device": 9048 }, { "epoch": 0.4664, "loss_ce": 0.16779208183288574, "loss_lvr": 0.578248918056488, "loss_mode_switch": 0.0, "loss_total": 0.22561697661876678, "step": 1166 }, { "batch_size": 4, "epoch": 0.4664, "step": 1166, "tokens_per_device": 4936 }, { "epoch": 0.4664, "loss_ce": 0.02549484372138977, "loss_lvr": 0.6469316482543945, "loss_mode_switch": 0.0, "loss_total": 0.09018801152706146, "step": 1166 }, { "batch_size": 4, "epoch": 0.4664, "step": 1166, "tokens_per_device": 1664 }, { "epoch": 0.4664, "loss_ce": 0.6027984023094177, "loss_lvr": 1.1379512548446655, "loss_mode_switch": 0.0, "loss_total": 0.7165935039520264, "step": 1166 }, { "batch_size": 4, "epoch": 0.4664, "step": 1166, "tokens_per_device": 11056 }, { "epoch": 0.4664, "loss_ce": 0.054471470415592194, "loss_lvr": 0.8545602560043335, "loss_mode_switch": 0.0, "loss_total": 0.13992750644683838, "step": 1166 }, { "batch_size": 1, "epoch": 0.4664, "step": 1166, "tokens_per_device": 4880 }, { "epoch": 0.4664, "loss_ce": 0.00994951743632555, "loss_lvr": 0.8475629091262817, "loss_mode_switch": 0.0, "loss_total": 0.09470581263303757, "step": 1166 }, { "epoch": 0.4668, "grad_norm": 1.3350952863693237, "learning_rate": 5.777373635457049e-06, "loss": 0.3294, "step": 1167 }, { "batch_size": 1, "epoch": 0.4668, "step": 1167, "tokens_per_device": 5176 }, { "epoch": 0.4668, "loss_ce": 0.04328956827521324, "loss_lvr": 0.17203839123249054, "loss_mode_switch": 0.0, "loss_total": 0.060493409633636475, "step": 1167 }, { "batch_size": 4, "epoch": 0.4668, "step": 1167, "tokens_per_device": 5024 }, { "epoch": 0.4668, "loss_ce": 0.2498672753572464, "loss_lvr": 0.7987656593322754, "loss_mode_switch": 0.0, "loss_total": 0.3297438323497772, "step": 1167 }, { "batch_size": 4, "epoch": 0.4668, "step": 1167, "tokens_per_device": 3892 }, { "epoch": 0.4668, "loss_ce": 0.3166552186012268, "loss_lvr": 1.112572431564331, "loss_mode_switch": 0.0, "loss_total": 0.42791247367858887, "step": 1167 }, { "batch_size": 1, "epoch": 0.4668, "step": 1167, "tokens_per_device": 5168 }, { "epoch": 0.4668, "loss_ce": 0.005765560083091259, "loss_lvr": 0.5365300178527832, "loss_mode_switch": 0.0, "loss_total": 0.059418562799692154, "step": 1167 }, { "batch_size": 4, "epoch": 0.4668, "step": 1167, "tokens_per_device": 4468 }, { "epoch": 0.4668, "loss_ce": 0.4596650302410126, "loss_lvr": 0.9957123398780823, "loss_mode_switch": 0.0, "loss_total": 0.5592362880706787, "step": 1167 }, { "batch_size": 4, "epoch": 0.4668, "step": 1167, "tokens_per_device": 3516 }, { "epoch": 0.4668, "loss_ce": 0.2436530739068985, "loss_lvr": 0.47400742769241333, "loss_mode_switch": 0.0, "loss_total": 0.291053831577301, "step": 1167 }, { "batch_size": 4, "epoch": 0.4668, "step": 1167, "tokens_per_device": 1280 }, { "epoch": 0.4668, "loss_ce": 0.3283466696739197, "loss_lvr": 1.0029191970825195, "loss_mode_switch": 0.0, "loss_total": 0.4286385774612427, "step": 1167 }, { "batch_size": 4, "epoch": 0.4668, "step": 1167, "tokens_per_device": 1216 }, { "epoch": 0.4668, "loss_ce": 0.2836412787437439, "loss_lvr": 1.046790361404419, "loss_mode_switch": 0.0, "loss_total": 0.38832032680511475, "step": 1167 }, { "epoch": 0.4672, "grad_norm": 1.2284612655639648, "learning_rate": 5.770974241647679e-06, "loss": 0.3151, "step": 1168 }, { "batch_size": 4, "epoch": 0.4672, "step": 1168, "tokens_per_device": 4200 }, { "epoch": 0.4672, "loss_ce": 0.6055582165718079, "loss_lvr": 0.7688146233558655, "loss_mode_switch": 0.0, "loss_total": 0.6824396848678589, "step": 1168 }, { "batch_size": 4, "epoch": 0.4672, "step": 1168, "tokens_per_device": 4632 }, { "epoch": 0.4672, "loss_ce": 0.45109954476356506, "loss_lvr": 0.8222798705101013, "loss_mode_switch": 0.0, "loss_total": 0.5333275198936462, "step": 1168 }, { "batch_size": 4, "epoch": 0.4672, "step": 1168, "tokens_per_device": 3780 }, { "epoch": 0.4672, "loss_ce": 0.03466995060443878, "loss_lvr": 0.42727261781692505, "loss_mode_switch": 0.0, "loss_total": 0.07739721238613129, "step": 1168 }, { "batch_size": 4, "epoch": 0.4672, "step": 1168, "tokens_per_device": 12068 }, { "epoch": 0.4672, "loss_ce": 0.7561547160148621, "loss_lvr": 0.994306206703186, "loss_mode_switch": 0.0, "loss_total": 0.8555853366851807, "step": 1168 }, { "batch_size": 4, "epoch": 0.4672, "step": 1168, "tokens_per_device": 1352 }, { "epoch": 0.4672, "loss_ce": 0.4927023947238922, "loss_lvr": 0.9437665343284607, "loss_mode_switch": 0.0, "loss_total": 0.5870790481567383, "step": 1168 }, { "batch_size": 4, "epoch": 0.4672, "step": 1168, "tokens_per_device": 3340 }, { "epoch": 0.4672, "loss_ce": 0.14252488315105438, "loss_lvr": 0.8091223239898682, "loss_mode_switch": 0.0, "loss_total": 0.2234371155500412, "step": 1168 }, { "batch_size": 4, "epoch": 0.4672, "step": 1168, "tokens_per_device": 3780 }, { "epoch": 0.4672, "loss_ce": 0.8524569869041443, "loss_lvr": 1.1077375411987305, "loss_mode_switch": 0.0, "loss_total": 0.9632307291030884, "step": 1168 }, { "batch_size": 1, "epoch": 0.4672, "step": 1168, "tokens_per_device": 5195 }, { "epoch": 0.4672, "loss_ce": 0.016307564452290535, "loss_lvr": 0.582001268863678, "loss_mode_switch": 0.0, "loss_total": 0.0745076909661293, "step": 1168 }, { "epoch": 0.4676, "grad_norm": 1.4046127796173096, "learning_rate": 5.764573553892564e-06, "loss": 0.3458, "step": 1169 }, { "batch_size": 4, "epoch": 0.4676, "step": 1169, "tokens_per_device": 4276 }, { "epoch": 0.4676, "loss_ce": 0.44835135340690613, "loss_lvr": 0.923205554485321, "loss_mode_switch": 0.0, "loss_total": 0.5406718850135803, "step": 1169 }, { "batch_size": 4, "epoch": 0.4676, "step": 1169, "tokens_per_device": 2652 }, { "epoch": 0.4676, "loss_ce": 0.2824079096317291, "loss_lvr": 1.189106822013855, "loss_mode_switch": 0.0, "loss_total": 0.40131860971450806, "step": 1169 }, { "batch_size": 4, "epoch": 0.4676, "step": 1169, "tokens_per_device": 4180 }, { "epoch": 0.4676, "loss_ce": 0.26019278168678284, "loss_lvr": 0.7622448801994324, "loss_mode_switch": 0.0, "loss_total": 0.3364172577857971, "step": 1169 }, { "batch_size": 1, "epoch": 0.4676, "step": 1169, "tokens_per_device": 4823 }, { "epoch": 0.4676, "loss_ce": 0.0011108586331829429, "loss_lvr": 0.2587064206600189, "loss_mode_switch": 0.0, "loss_total": 0.026981500908732414, "step": 1169 }, { "batch_size": 1, "epoch": 0.4676, "step": 1169, "tokens_per_device": 6412 }, { "epoch": 0.4676, "loss_ce": 0.0071119265630841255, "loss_lvr": 0.44239169359207153, "loss_mode_switch": 0.0, "loss_total": 0.051351096481084824, "step": 1169 }, { "batch_size": 4, "epoch": 0.4676, "step": 1169, "tokens_per_device": 4244 }, { "epoch": 0.4676, "loss_ce": 0.08502528071403503, "loss_lvr": 0.8431956768035889, "loss_mode_switch": 0.0, "loss_total": 0.16934484243392944, "step": 1169 }, { "batch_size": 1, "epoch": 0.4676, "step": 1169, "tokens_per_device": 5118 }, { "epoch": 0.4676, "loss_ce": 0.02929624542593956, "loss_lvr": 0.6857863068580627, "loss_mode_switch": 0.0, "loss_total": 0.09787487983703613, "step": 1169 }, { "batch_size": 4, "epoch": 0.4676, "step": 1169, "tokens_per_device": 5216 }, { "epoch": 0.4676, "loss_ce": 0.10230140388011932, "loss_lvr": 0.5329391360282898, "loss_mode_switch": 0.0, "loss_total": 0.1555953174829483, "step": 1169 }, { "epoch": 0.468, "grad_norm": 1.50879967212677, "learning_rate": 5.75817158293414e-06, "loss": 0.3088, "step": 1170 }, { "batch_size": 4, "epoch": 0.468, "step": 1170, "tokens_per_device": 5872 }, { "epoch": 0.468, "loss_ce": 0.2991173267364502, "loss_lvr": 0.8803356885910034, "loss_mode_switch": 0.0, "loss_total": 0.3871508836746216, "step": 1170 }, { "batch_size": 4, "epoch": 0.468, "step": 1170, "tokens_per_device": 3776 }, { "epoch": 0.468, "loss_ce": 0.1739698350429535, "loss_lvr": 0.9561283588409424, "loss_mode_switch": 0.0, "loss_total": 0.26958268880844116, "step": 1170 }, { "batch_size": 4, "epoch": 0.468, "step": 1170, "tokens_per_device": 4956 }, { "epoch": 0.468, "loss_ce": 0.230390265583992, "loss_lvr": 1.2387442588806152, "loss_mode_switch": 0.0, "loss_total": 0.35426467657089233, "step": 1170 }, { "batch_size": 4, "epoch": 0.468, "step": 1170, "tokens_per_device": 4272 }, { "epoch": 0.468, "loss_ce": 0.18568967282772064, "loss_lvr": 0.8200317621231079, "loss_mode_switch": 0.0, "loss_total": 0.2676928639411926, "step": 1170 }, { "batch_size": 1, "epoch": 0.468, "step": 1170, "tokens_per_device": 4543 }, { "epoch": 0.468, "loss_ce": 0.038508445024490356, "loss_lvr": 0.19944363832473755, "loss_mode_switch": 0.0, "loss_total": 0.05845280736684799, "step": 1170 }, { "batch_size": 4, "epoch": 0.468, "step": 1170, "tokens_per_device": 4848 }, { "epoch": 0.468, "loss_ce": 0.4587669372558594, "loss_lvr": 0.9337102770805359, "loss_mode_switch": 0.0, "loss_total": 0.5521379709243774, "step": 1170 }, { "batch_size": 4, "epoch": 0.468, "step": 1170, "tokens_per_device": 4008 }, { "epoch": 0.468, "loss_ce": 0.11045795679092407, "loss_lvr": 0.7542425990104675, "loss_mode_switch": 0.0, "loss_total": 0.18588221073150635, "step": 1170 }, { "batch_size": 1, "epoch": 0.468, "step": 1170, "tokens_per_device": 4870 }, { "epoch": 0.468, "loss_ce": 1.0259073972702026, "loss_lvr": 0.16811297833919525, "loss_mode_switch": 0.0, "loss_total": 1.0427186489105225, "step": 1170 }, { "epoch": 0.4684, "grad_norm": 1.7958645820617676, "learning_rate": 5.751768339517e-06, "loss": 0.3552, "step": 1171 }, { "batch_size": 4, "epoch": 0.4684, "step": 1171, "tokens_per_device": 1420 }, { "epoch": 0.4684, "loss_ce": 0.3439985513687134, "loss_lvr": 0.8761753439903259, "loss_mode_switch": 0.0, "loss_total": 0.4316160976886749, "step": 1171 }, { "batch_size": 4, "epoch": 0.4684, "step": 1171, "tokens_per_device": 3456 }, { "epoch": 0.4684, "loss_ce": 0.39269137382507324, "loss_lvr": 0.9317592978477478, "loss_mode_switch": 0.0, "loss_total": 0.48586732149124146, "step": 1171 }, { "batch_size": 1, "epoch": 0.4684, "step": 1171, "tokens_per_device": 5499 }, { "epoch": 0.4684, "loss_ce": 0.008419351652264595, "loss_lvr": 0.33700644969940186, "loss_mode_switch": 0.0, "loss_total": 0.04211999475955963, "step": 1171 }, { "batch_size": 1, "epoch": 0.4684, "step": 1171, "tokens_per_device": 5369 }, { "epoch": 0.4684, "loss_ce": 0.011800470761954784, "loss_lvr": 0.40947651863098145, "loss_mode_switch": 0.0, "loss_total": 0.05274812504649162, "step": 1171 }, { "batch_size": 4, "epoch": 0.4684, "step": 1171, "tokens_per_device": 2668 }, { "epoch": 0.4684, "loss_ce": 0.030570929870009422, "loss_lvr": 0.7576906085014343, "loss_mode_switch": 0.0, "loss_total": 0.10633999109268188, "step": 1171 }, { "batch_size": 4, "epoch": 0.4684, "step": 1171, "tokens_per_device": 1504 }, { "epoch": 0.4684, "loss_ce": 0.09530059248209, "loss_lvr": 1.2279952764511108, "loss_mode_switch": 0.0, "loss_total": 0.21810013055801392, "step": 1171 }, { "batch_size": 1, "epoch": 0.4684, "step": 1171, "tokens_per_device": 4894 }, { "epoch": 0.4684, "loss_ce": 0.009343346580862999, "loss_lvr": 0.3705404996871948, "loss_mode_switch": 0.0, "loss_total": 0.04639739543199539, "step": 1171 }, { "batch_size": 4, "epoch": 0.4684, "step": 1171, "tokens_per_device": 13832 }, { "epoch": 0.4684, "loss_ce": 0.12979525327682495, "loss_lvr": 0.47043082118034363, "loss_mode_switch": 0.0, "loss_total": 0.17683833837509155, "step": 1171 }, { "epoch": 0.4688, "grad_norm": 1.1974096298217773, "learning_rate": 5.745363834387867e-06, "loss": 0.3064, "step": 1172 }, { "batch_size": 1, "epoch": 0.4688, "step": 1172, "tokens_per_device": 4859 }, { "epoch": 0.4688, "loss_ce": 0.0076242471113801, "loss_lvr": 0.27581530809402466, "loss_mode_switch": 0.0, "loss_total": 0.03520577773451805, "step": 1172 }, { "batch_size": 4, "epoch": 0.4688, "step": 1172, "tokens_per_device": 3632 }, { "epoch": 0.4688, "loss_ce": 0.4081309735774994, "loss_lvr": 0.873753011226654, "loss_mode_switch": 0.0, "loss_total": 0.49550628662109375, "step": 1172 }, { "batch_size": 4, "epoch": 0.4688, "step": 1172, "tokens_per_device": 1356 }, { "epoch": 0.4688, "loss_ce": 0.3831406831741333, "loss_lvr": 2.1385045051574707, "loss_mode_switch": 0.0, "loss_total": 0.5969911217689514, "step": 1172 }, { "batch_size": 4, "epoch": 0.4688, "step": 1172, "tokens_per_device": 4244 }, { "epoch": 0.4688, "loss_ce": 0.253212034702301, "loss_lvr": 0.4446336328983307, "loss_mode_switch": 0.0, "loss_total": 0.29767540097236633, "step": 1172 }, { "batch_size": 4, "epoch": 0.4688, "step": 1172, "tokens_per_device": 7288 }, { "epoch": 0.4688, "loss_ce": 0.19670581817626953, "loss_lvr": 0.7806572914123535, "loss_mode_switch": 0.0, "loss_total": 0.2747715413570404, "step": 1172 }, { "batch_size": 1, "epoch": 0.4688, "step": 1172, "tokens_per_device": 5108 }, { "epoch": 0.4688, "loss_ce": 0.027476809918880463, "loss_lvr": 0.9434776306152344, "loss_mode_switch": 0.0, "loss_total": 0.12182457745075226, "step": 1172 }, { "batch_size": 4, "epoch": 0.4688, "step": 1172, "tokens_per_device": 4176 }, { "epoch": 0.4688, "loss_ce": 0.25241926312446594, "loss_lvr": 1.0521124601364136, "loss_mode_switch": 0.0, "loss_total": 0.35763052105903625, "step": 1172 }, { "batch_size": 1, "epoch": 0.4688, "step": 1172, "tokens_per_device": 5179 }, { "epoch": 0.4688, "loss_ce": 0.002224092371761799, "loss_lvr": 0.3632732033729553, "loss_mode_switch": 0.0, "loss_total": 0.038551412522792816, "step": 1172 }, { "epoch": 0.4692, "grad_norm": 1.2575311660766602, "learning_rate": 5.73895807829559e-06, "loss": 0.2864, "step": 1173 }, { "batch_size": 4, "epoch": 0.4692, "step": 1173, "tokens_per_device": 5712 }, { "epoch": 0.4692, "loss_ce": 0.23896093666553497, "loss_lvr": 1.078514814376831, "loss_mode_switch": 0.0, "loss_total": 0.3468124270439148, "step": 1173 }, { "batch_size": 1, "epoch": 0.4692, "step": 1173, "tokens_per_device": 5189 }, { "epoch": 0.4692, "loss_ce": 0.09774939715862274, "loss_lvr": 0.43715324997901917, "loss_mode_switch": 0.0, "loss_total": 0.1414647251367569, "step": 1173 }, { "batch_size": 4, "epoch": 0.4692, "step": 1173, "tokens_per_device": 4228 }, { "epoch": 0.4692, "loss_ce": 0.35008588433265686, "loss_lvr": 1.0870429277420044, "loss_mode_switch": 0.0, "loss_total": 0.4587901830673218, "step": 1173 }, { "batch_size": 1, "epoch": 0.4692, "step": 1173, "tokens_per_device": 4835 }, { "epoch": 0.4692, "loss_ce": 0.007218723651021719, "loss_lvr": 0.32822683453559875, "loss_mode_switch": 0.0, "loss_total": 0.04004140570759773, "step": 1173 }, { "batch_size": 4, "epoch": 0.4692, "step": 1173, "tokens_per_device": 8952 }, { "epoch": 0.4692, "loss_ce": 0.3202820420265198, "loss_lvr": 0.888708233833313, "loss_mode_switch": 0.0, "loss_total": 0.4091528654098511, "step": 1173 }, { "batch_size": 4, "epoch": 0.4692, "step": 1173, "tokens_per_device": 5984 }, { "epoch": 0.4692, "loss_ce": 0.25416475534439087, "loss_lvr": 0.6622405052185059, "loss_mode_switch": 0.0, "loss_total": 0.3203887939453125, "step": 1173 }, { "batch_size": 1, "epoch": 0.4692, "step": 1173, "tokens_per_device": 4904 }, { "epoch": 0.4692, "loss_ce": 0.07514292001724243, "loss_lvr": 0.7719988226890564, "loss_mode_switch": 0.0, "loss_total": 0.1523427963256836, "step": 1173 }, { "batch_size": 4, "epoch": 0.4692, "step": 1173, "tokens_per_device": 4300 }, { "epoch": 0.4692, "loss_ce": 0.6763257384300232, "loss_lvr": 0.646588921546936, "loss_mode_switch": 0.0, "loss_total": 0.7409846186637878, "step": 1173 }, { "epoch": 0.4696, "grad_norm": 1.4101849794387817, "learning_rate": 5.732551081991109e-06, "loss": 0.2984, "step": 1174 }, { "batch_size": 4, "epoch": 0.4696, "step": 1174, "tokens_per_device": 5708 }, { "epoch": 0.4696, "loss_ce": 0.3682911992073059, "loss_lvr": 0.4775402545928955, "loss_mode_switch": 0.0, "loss_total": 0.416045218706131, "step": 1174 }, { "batch_size": 4, "epoch": 0.4696, "step": 1174, "tokens_per_device": 3756 }, { "epoch": 0.4696, "loss_ce": 0.06894180178642273, "loss_lvr": 0.8349576592445374, "loss_mode_switch": 0.0, "loss_total": 0.15243756771087646, "step": 1174 }, { "batch_size": 4, "epoch": 0.4696, "step": 1174, "tokens_per_device": 3832 }, { "epoch": 0.4696, "loss_ce": 0.46796926856040955, "loss_lvr": 0.9047445058822632, "loss_mode_switch": 0.0, "loss_total": 0.5584437251091003, "step": 1174 }, { "batch_size": 4, "epoch": 0.4696, "step": 1174, "tokens_per_device": 1268 }, { "epoch": 0.4696, "loss_ce": 0.2484072893857956, "loss_lvr": 1.0030477046966553, "loss_mode_switch": 0.0, "loss_total": 0.3487120568752289, "step": 1174 }, { "batch_size": 1, "epoch": 0.4696, "step": 1174, "tokens_per_device": 4859 }, { "epoch": 0.4696, "loss_ce": 0.37456008791923523, "loss_lvr": 0.4436115622520447, "loss_mode_switch": 0.0, "loss_total": 0.41892123222351074, "step": 1174 }, { "batch_size": 1, "epoch": 0.4696, "step": 1174, "tokens_per_device": 5101 }, { "epoch": 0.4696, "loss_ce": 0.0030883424915373325, "loss_lvr": 0.6067367196083069, "loss_mode_switch": 0.0, "loss_total": 0.06376201659440994, "step": 1174 }, { "batch_size": 1, "epoch": 0.4696, "step": 1174, "tokens_per_device": 4583 }, { "epoch": 0.4696, "loss_ce": 0.27560195326805115, "loss_lvr": 0.6293700933456421, "loss_mode_switch": 0.0, "loss_total": 0.3385389745235443, "step": 1174 }, { "batch_size": 1, "epoch": 0.4696, "step": 1174, "tokens_per_device": 5129 }, { "epoch": 0.4696, "loss_ce": 0.0038844048976898193, "loss_lvr": 0.3156067728996277, "loss_mode_switch": 0.0, "loss_total": 0.03544508293271065, "step": 1174 }, { "epoch": 0.47, "grad_norm": 1.415592074394226, "learning_rate": 5.726142856227453e-06, "loss": 0.3314, "step": 1175 }, { "batch_size": 1, "epoch": 0.47, "step": 1175, "tokens_per_device": 7815 }, { "epoch": 0.47, "loss_ce": 0.012439193204045296, "loss_lvr": 0.27503088116645813, "loss_mode_switch": 0.0, "loss_total": 0.03994227945804596, "step": 1175 }, { "batch_size": 1, "epoch": 0.47, "step": 1175, "tokens_per_device": 4917 }, { "epoch": 0.47, "loss_ce": 0.05265692248940468, "loss_lvr": 0.3273935317993164, "loss_mode_switch": 0.0, "loss_total": 0.08539627492427826, "step": 1175 }, { "batch_size": 4, "epoch": 0.47, "step": 1175, "tokens_per_device": 5088 }, { "epoch": 0.47, "loss_ce": 0.5114720463752747, "loss_lvr": 0.8015775084495544, "loss_mode_switch": 0.0, "loss_total": 0.5916298031806946, "step": 1175 }, { "batch_size": 1, "epoch": 0.47, "step": 1175, "tokens_per_device": 6277 }, { "epoch": 0.47, "loss_ce": 0.007395902182906866, "loss_lvr": 0.39414918422698975, "loss_mode_switch": 0.0, "loss_total": 0.0468108206987381, "step": 1175 }, { "batch_size": 4, "epoch": 0.47, "step": 1175, "tokens_per_device": 2772 }, { "epoch": 0.47, "loss_ce": 0.1783919483423233, "loss_lvr": 0.6182345151901245, "loss_mode_switch": 0.0, "loss_total": 0.24021540582180023, "step": 1175 }, { "batch_size": 1, "epoch": 0.47, "step": 1175, "tokens_per_device": 5463 }, { "epoch": 0.47, "loss_ce": 0.5874155759811401, "loss_lvr": 0.26346445083618164, "loss_mode_switch": 0.0, "loss_total": 0.6137620210647583, "step": 1175 }, { "batch_size": 4, "epoch": 0.47, "step": 1175, "tokens_per_device": 4484 }, { "epoch": 0.47, "loss_ce": 0.2480233907699585, "loss_lvr": 0.7883562445640564, "loss_mode_switch": 0.0, "loss_total": 0.3268590271472931, "step": 1175 }, { "batch_size": 4, "epoch": 0.47, "step": 1175, "tokens_per_device": 2748 }, { "epoch": 0.47, "loss_ce": 0.06797504425048828, "loss_lvr": 1.043994665145874, "loss_mode_switch": 0.0, "loss_total": 0.17237451672554016, "step": 1175 }, { "epoch": 0.4704, "grad_norm": 1.6407289505004883, "learning_rate": 5.719733411759707e-06, "loss": 0.2944, "step": 1176 }, { "batch_size": 4, "epoch": 0.4704, "step": 1176, "tokens_per_device": 2896 }, { "epoch": 0.4704, "loss_ce": 0.1487039476633072, "loss_lvr": 0.7693802118301392, "loss_mode_switch": 0.0, "loss_total": 0.22564196586608887, "step": 1176 }, { "batch_size": 4, "epoch": 0.4704, "step": 1176, "tokens_per_device": 5688 }, { "epoch": 0.4704, "loss_ce": 0.06712882965803146, "loss_lvr": 0.7921520471572876, "loss_mode_switch": 0.0, "loss_total": 0.14634403586387634, "step": 1176 }, { "batch_size": 4, "epoch": 0.4704, "step": 1176, "tokens_per_device": 4200 }, { "epoch": 0.4704, "loss_ce": 0.5800750851631165, "loss_lvr": 1.015394926071167, "loss_mode_switch": 0.0, "loss_total": 0.6816145777702332, "step": 1176 }, { "batch_size": 4, "epoch": 0.4704, "step": 1176, "tokens_per_device": 2644 }, { "epoch": 0.4704, "loss_ce": 0.4609244465827942, "loss_lvr": 0.9097051024436951, "loss_mode_switch": 0.0, "loss_total": 0.5518949627876282, "step": 1176 }, { "batch_size": 1, "epoch": 0.4704, "step": 1176, "tokens_per_device": 5079 }, { "epoch": 0.4704, "loss_ce": 0.0174806397408247, "loss_lvr": 0.28186872601509094, "loss_mode_switch": 0.0, "loss_total": 0.04566751420497894, "step": 1176 }, { "batch_size": 1, "epoch": 0.4704, "step": 1176, "tokens_per_device": 5137 }, { "epoch": 0.4704, "loss_ce": 0.10411746799945831, "loss_lvr": 0.4812083840370178, "loss_mode_switch": 0.0, "loss_total": 0.15223830938339233, "step": 1176 }, { "batch_size": 1, "epoch": 0.4704, "step": 1176, "tokens_per_device": 5703 }, { "epoch": 0.4704, "loss_ce": 0.0005967506440356374, "loss_lvr": 0.33300483226776123, "loss_mode_switch": 0.0, "loss_total": 0.03389723598957062, "step": 1176 }, { "batch_size": 4, "epoch": 0.4704, "step": 1176, "tokens_per_device": 4388 }, { "epoch": 0.4704, "loss_ce": 0.5312271118164062, "loss_lvr": 1.0003838539123535, "loss_mode_switch": 0.0, "loss_total": 0.6312655210494995, "step": 1176 }, { "epoch": 0.4708, "grad_norm": 1.4415392875671387, "learning_rate": 5.713322759345008e-06, "loss": 0.3494, "step": 1177 }, { "batch_size": 4, "epoch": 0.4708, "step": 1177, "tokens_per_device": 1560 }, { "epoch": 0.4708, "loss_ce": 0.2760176360607147, "loss_lvr": 0.8420796394348145, "loss_mode_switch": 0.0, "loss_total": 0.3602256178855896, "step": 1177 }, { "batch_size": 4, "epoch": 0.4708, "step": 1177, "tokens_per_device": 3816 }, { "epoch": 0.4708, "loss_ce": 0.4582269489765167, "loss_lvr": 0.982390820980072, "loss_mode_switch": 0.0, "loss_total": 0.5564660429954529, "step": 1177 }, { "batch_size": 4, "epoch": 0.4708, "step": 1177, "tokens_per_device": 4208 }, { "epoch": 0.4708, "loss_ce": 0.5276231169700623, "loss_lvr": 1.0624054670333862, "loss_mode_switch": 0.0, "loss_total": 0.6338636875152588, "step": 1177 }, { "batch_size": 4, "epoch": 0.4708, "step": 1177, "tokens_per_device": 3424 }, { "epoch": 0.4708, "loss_ce": 0.17992271482944489, "loss_lvr": 0.9558379650115967, "loss_mode_switch": 0.0, "loss_total": 0.27550649642944336, "step": 1177 }, { "batch_size": 4, "epoch": 0.4708, "step": 1177, "tokens_per_device": 5284 }, { "epoch": 0.4708, "loss_ce": 0.021233512088656425, "loss_lvr": 0.7887588739395142, "loss_mode_switch": 0.0, "loss_total": 0.10010940581560135, "step": 1177 }, { "batch_size": 4, "epoch": 0.4708, "step": 1177, "tokens_per_device": 2552 }, { "epoch": 0.4708, "loss_ce": 0.09484763443470001, "loss_lvr": 1.109775424003601, "loss_mode_switch": 0.0, "loss_total": 0.20582517981529236, "step": 1177 }, { "batch_size": 4, "epoch": 0.4708, "step": 1177, "tokens_per_device": 4708 }, { "epoch": 0.4708, "loss_ce": 0.07943196594715118, "loss_lvr": 1.3023006916046143, "loss_mode_switch": 0.0, "loss_total": 0.2096620351076126, "step": 1177 }, { "batch_size": 4, "epoch": 0.4708, "step": 1177, "tokens_per_device": 2556 }, { "epoch": 0.4708, "loss_ce": 0.42379918694496155, "loss_lvr": 0.9554257988929749, "loss_mode_switch": 0.0, "loss_total": 0.519341766834259, "step": 1177 }, { "epoch": 0.4712, "grad_norm": 1.2123732566833496, "learning_rate": 5.7069109097425176e-06, "loss": 0.2799, "step": 1178 }, { "batch_size": 1, "epoch": 0.4712, "step": 1178, "tokens_per_device": 4862 }, { "epoch": 0.4712, "loss_ce": 0.0009484856855124235, "loss_lvr": 0.24411310255527496, "loss_mode_switch": 0.0, "loss_total": 0.02535979636013508, "step": 1178 }, { "batch_size": 4, "epoch": 0.4712, "step": 1178, "tokens_per_device": 4440 }, { "epoch": 0.4712, "loss_ce": 0.20716248452663422, "loss_lvr": 0.8609620332717896, "loss_mode_switch": 0.0, "loss_total": 0.2932586967945099, "step": 1178 }, { "batch_size": 4, "epoch": 0.4712, "step": 1178, "tokens_per_device": 4776 }, { "epoch": 0.4712, "loss_ce": 0.2533881664276123, "loss_lvr": 0.7496646642684937, "loss_mode_switch": 0.0, "loss_total": 0.3283546268939972, "step": 1178 }, { "batch_size": 1, "epoch": 0.4712, "step": 1178, "tokens_per_device": 4881 }, { "epoch": 0.4712, "loss_ce": 0.020564138889312744, "loss_lvr": 0.3337251842021942, "loss_mode_switch": 0.0, "loss_total": 0.053936656564474106, "step": 1178 }, { "batch_size": 1, "epoch": 0.4712, "step": 1178, "tokens_per_device": 4923 }, { "epoch": 0.4712, "loss_ce": 0.018830182030797005, "loss_lvr": 0.35209134221076965, "loss_mode_switch": 0.0, "loss_total": 0.05403931438922882, "step": 1178 }, { "batch_size": 1, "epoch": 0.4712, "step": 1178, "tokens_per_device": 5169 }, { "epoch": 0.4712, "loss_ce": 0.0272138100117445, "loss_lvr": 0.42399632930755615, "loss_mode_switch": 0.0, "loss_total": 0.06961344182491302, "step": 1178 }, { "batch_size": 4, "epoch": 0.4712, "step": 1178, "tokens_per_device": 13320 }, { "epoch": 0.4712, "loss_ce": 0.038211215287446976, "loss_lvr": 0.6465465426445007, "loss_mode_switch": 0.0, "loss_total": 0.10286587476730347, "step": 1178 }, { "batch_size": 4, "epoch": 0.4712, "step": 1178, "tokens_per_device": 4820 }, { "epoch": 0.4712, "loss_ce": 0.07711683958768845, "loss_lvr": 0.7511169910430908, "loss_mode_switch": 0.0, "loss_total": 0.15222853422164917, "step": 1178 }, { "epoch": 0.4716, "grad_norm": 1.1015193462371826, "learning_rate": 5.700497873713405e-06, "loss": 0.2529, "step": 1179 }, { "batch_size": 4, "epoch": 0.4716, "step": 1179, "tokens_per_device": 4228 }, { "epoch": 0.4716, "loss_ce": 0.1548955738544464, "loss_lvr": 1.0045548677444458, "loss_mode_switch": 0.0, "loss_total": 0.25535106658935547, "step": 1179 }, { "batch_size": 1, "epoch": 0.4716, "step": 1179, "tokens_per_device": 4582 }, { "epoch": 0.4716, "loss_ce": 0.0016982498345896602, "loss_lvr": 0.17118924856185913, "loss_mode_switch": 0.0, "loss_total": 0.018817175179719925, "step": 1179 }, { "batch_size": 1, "epoch": 0.4716, "step": 1179, "tokens_per_device": 4953 }, { "epoch": 0.4716, "loss_ce": 0.023553501814603806, "loss_lvr": 0.4039645493030548, "loss_mode_switch": 0.0, "loss_total": 0.06394995748996735, "step": 1179 }, { "batch_size": 4, "epoch": 0.4716, "step": 1179, "tokens_per_device": 5104 }, { "epoch": 0.4716, "loss_ce": 0.03351078927516937, "loss_lvr": 0.6299210786819458, "loss_mode_switch": 0.0, "loss_total": 0.09650290012359619, "step": 1179 }, { "batch_size": 1, "epoch": 0.4716, "step": 1179, "tokens_per_device": 5084 }, { "epoch": 0.4716, "loss_ce": 0.00686533423140645, "loss_lvr": 0.300869345664978, "loss_mode_switch": 0.0, "loss_total": 0.036952268332242966, "step": 1179 }, { "batch_size": 4, "epoch": 0.4716, "step": 1179, "tokens_per_device": 5116 }, { "epoch": 0.4716, "loss_ce": 0.09580741077661514, "loss_lvr": 0.7314145565032959, "loss_mode_switch": 0.0, "loss_total": 0.16894885897636414, "step": 1179 }, { "batch_size": 4, "epoch": 0.4716, "step": 1179, "tokens_per_device": 4364 }, { "epoch": 0.4716, "loss_ce": 0.02931191585958004, "loss_lvr": 0.9087397456169128, "loss_mode_switch": 0.0, "loss_total": 0.12018589675426483, "step": 1179 }, { "batch_size": 4, "epoch": 0.4716, "step": 1179, "tokens_per_device": 4720 }, { "epoch": 0.4716, "loss_ce": 0.12214631587266922, "loss_lvr": 0.7975510954856873, "loss_mode_switch": 0.0, "loss_total": 0.20190143585205078, "step": 1179 }, { "epoch": 0.472, "grad_norm": 1.129455804824829, "learning_rate": 5.694083662020835e-06, "loss": 0.2373, "step": 1180 }, { "batch_size": 4, "epoch": 0.472, "step": 1180, "tokens_per_device": 3780 }, { "epoch": 0.472, "loss_ce": 0.32976076006889343, "loss_lvr": 1.1496739387512207, "loss_mode_switch": 0.0, "loss_total": 0.44472816586494446, "step": 1180 }, { "batch_size": 4, "epoch": 0.472, "step": 1180, "tokens_per_device": 1292 }, { "epoch": 0.472, "loss_ce": 0.5512521862983704, "loss_lvr": 0.9409053921699524, "loss_mode_switch": 0.0, "loss_total": 0.6453427076339722, "step": 1180 }, { "batch_size": 1, "epoch": 0.472, "step": 1180, "tokens_per_device": 5098 }, { "epoch": 0.472, "loss_ce": 0.6527771949768066, "loss_lvr": 0.49121126532554626, "loss_mode_switch": 0.0, "loss_total": 0.7018983364105225, "step": 1180 }, { "batch_size": 1, "epoch": 0.472, "step": 1180, "tokens_per_device": 4913 }, { "epoch": 0.472, "loss_ce": 0.093452088534832, "loss_lvr": 0.2860545516014099, "loss_mode_switch": 0.0, "loss_total": 0.12205754220485687, "step": 1180 }, { "batch_size": 4, "epoch": 0.472, "step": 1180, "tokens_per_device": 1408 }, { "epoch": 0.472, "loss_ce": 0.4461170434951782, "loss_lvr": 1.0852717161178589, "loss_mode_switch": 0.0, "loss_total": 0.5546442270278931, "step": 1180 }, { "batch_size": 4, "epoch": 0.472, "step": 1180, "tokens_per_device": 1304 }, { "epoch": 0.472, "loss_ce": 0.885672926902771, "loss_lvr": 1.18284273147583, "loss_mode_switch": 0.0, "loss_total": 1.0039571523666382, "step": 1180 }, { "batch_size": 1, "epoch": 0.472, "step": 1180, "tokens_per_device": 8415 }, { "epoch": 0.472, "loss_ce": 0.1136479526758194, "loss_lvr": 0.4919377565383911, "loss_mode_switch": 0.0, "loss_total": 0.16284173727035522, "step": 1180 }, { "batch_size": 4, "epoch": 0.472, "step": 1180, "tokens_per_device": 6144 }, { "epoch": 0.472, "loss_ce": 0.15977084636688232, "loss_lvr": 0.7106522917747498, "loss_mode_switch": 0.0, "loss_total": 0.23083607852458954, "step": 1180 }, { "epoch": 0.4724, "grad_norm": 1.3424557447433472, "learning_rate": 5.6876682854299385e-06, "loss": 0.305, "step": 1181 }, { "batch_size": 4, "epoch": 0.4724, "step": 1181, "tokens_per_device": 1452 }, { "epoch": 0.4724, "loss_ce": 0.18288683891296387, "loss_lvr": 1.027487874031067, "loss_mode_switch": 0.0, "loss_total": 0.2856356203556061, "step": 1181 }, { "batch_size": 1, "epoch": 0.4724, "step": 1181, "tokens_per_device": 5462 }, { "epoch": 0.4724, "loss_ce": 0.004396552219986916, "loss_lvr": 0.28524988889694214, "loss_mode_switch": 0.0, "loss_total": 0.03292154148221016, "step": 1181 }, { "batch_size": 4, "epoch": 0.4724, "step": 1181, "tokens_per_device": 3776 }, { "epoch": 0.4724, "loss_ce": 0.21854203939437866, "loss_lvr": 0.8936024904251099, "loss_mode_switch": 0.0, "loss_total": 0.3079022765159607, "step": 1181 }, { "batch_size": 1, "epoch": 0.4724, "step": 1181, "tokens_per_device": 4900 }, { "epoch": 0.4724, "loss_ce": 0.002636237069964409, "loss_lvr": 0.4218173027038574, "loss_mode_switch": 0.0, "loss_total": 0.0448179692029953, "step": 1181 }, { "batch_size": 4, "epoch": 0.4724, "step": 1181, "tokens_per_device": 1172 }, { "epoch": 0.4724, "loss_ce": 0.2952175438404083, "loss_lvr": 1.116506576538086, "loss_mode_switch": 0.0, "loss_total": 0.40686821937561035, "step": 1181 }, { "batch_size": 4, "epoch": 0.4724, "step": 1181, "tokens_per_device": 2888 }, { "epoch": 0.4724, "loss_ce": 0.29721853137016296, "loss_lvr": 0.6454746723175049, "loss_mode_switch": 0.0, "loss_total": 0.3617660105228424, "step": 1181 }, { "batch_size": 4, "epoch": 0.4724, "step": 1181, "tokens_per_device": 12128 }, { "epoch": 0.4724, "loss_ce": 0.16035196185112, "loss_lvr": 0.5726233124732971, "loss_mode_switch": 0.0, "loss_total": 0.2176142930984497, "step": 1181 }, { "batch_size": 4, "epoch": 0.4724, "step": 1181, "tokens_per_device": 1496 }, { "epoch": 0.4724, "loss_ce": 0.5807148218154907, "loss_lvr": 1.2997500896453857, "loss_mode_switch": 0.0, "loss_total": 0.7106898427009583, "step": 1181 }, { "epoch": 0.4728, "grad_norm": 1.2296897172927856, "learning_rate": 5.68125175470781e-06, "loss": 0.2716, "step": 1182 }, { "batch_size": 4, "epoch": 0.4728, "step": 1182, "tokens_per_device": 8288 }, { "epoch": 0.4728, "loss_ce": 0.322992205619812, "loss_lvr": 0.6968821287155151, "loss_mode_switch": 0.0, "loss_total": 0.39268040657043457, "step": 1182 }, { "batch_size": 4, "epoch": 0.4728, "step": 1182, "tokens_per_device": 1672 }, { "epoch": 0.4728, "loss_ce": 0.5455362200737, "loss_lvr": 0.9446711540222168, "loss_mode_switch": 0.0, "loss_total": 0.6400033235549927, "step": 1182 }, { "batch_size": 4, "epoch": 0.4728, "step": 1182, "tokens_per_device": 4188 }, { "epoch": 0.4728, "loss_ce": 0.20706143975257874, "loss_lvr": 0.7341371774673462, "loss_mode_switch": 0.0, "loss_total": 0.2804751694202423, "step": 1182 }, { "batch_size": 4, "epoch": 0.4728, "step": 1182, "tokens_per_device": 4040 }, { "epoch": 0.4728, "loss_ce": 0.36293160915374756, "loss_lvr": 0.9359852075576782, "loss_mode_switch": 0.0, "loss_total": 0.4565301239490509, "step": 1182 }, { "batch_size": 4, "epoch": 0.4728, "step": 1182, "tokens_per_device": 4356 }, { "epoch": 0.4728, "loss_ce": 0.07241347432136536, "loss_lvr": 0.6141598224639893, "loss_mode_switch": 0.0, "loss_total": 0.13382945954799652, "step": 1182 }, { "batch_size": 1, "epoch": 0.4728, "step": 1182, "tokens_per_device": 4911 }, { "epoch": 0.4728, "loss_ce": 0.022104965522885323, "loss_lvr": 0.2826055884361267, "loss_mode_switch": 0.0, "loss_total": 0.050365522503852844, "step": 1182 }, { "batch_size": 4, "epoch": 0.4728, "step": 1182, "tokens_per_device": 5620 }, { "epoch": 0.4728, "loss_ce": 0.37104442715644836, "loss_lvr": 1.073099136352539, "loss_mode_switch": 0.0, "loss_total": 0.4783543348312378, "step": 1182 }, { "batch_size": 4, "epoch": 0.4728, "step": 1182, "tokens_per_device": 5760 }, { "epoch": 0.4728, "loss_ce": 0.21240085363388062, "loss_lvr": 0.804331362247467, "loss_mode_switch": 0.0, "loss_total": 0.29283398389816284, "step": 1182 }, { "epoch": 0.4732, "grad_norm": 1.3455177545547485, "learning_rate": 5.674834080623472e-06, "loss": 0.2833, "step": 1183 }, { "batch_size": 1, "epoch": 0.4732, "step": 1183, "tokens_per_device": 5168 }, { "epoch": 0.4732, "loss_ce": 0.002568403957411647, "loss_lvr": 0.5448895692825317, "loss_mode_switch": 0.0, "loss_total": 0.05705736204981804, "step": 1183 }, { "batch_size": 4, "epoch": 0.4732, "step": 1183, "tokens_per_device": 5744 }, { "epoch": 0.4732, "loss_ce": 0.03844962269067764, "loss_lvr": 1.0690491199493408, "loss_mode_switch": 0.0, "loss_total": 0.14535453915596008, "step": 1183 }, { "batch_size": 1, "epoch": 0.4732, "step": 1183, "tokens_per_device": 4924 }, { "epoch": 0.4732, "loss_ce": 0.5902649164199829, "loss_lvr": 0.5809593200683594, "loss_mode_switch": 0.0, "loss_total": 0.6483608484268188, "step": 1183 }, { "batch_size": 1, "epoch": 0.4732, "step": 1183, "tokens_per_device": 4883 }, { "epoch": 0.4732, "loss_ce": 0.13834872841835022, "loss_lvr": 1.011690616607666, "loss_mode_switch": 0.0, "loss_total": 0.23951779305934906, "step": 1183 }, { "batch_size": 4, "epoch": 0.4732, "step": 1183, "tokens_per_device": 7528 }, { "epoch": 0.4732, "loss_ce": 0.02488592080771923, "loss_lvr": 0.7792931199073792, "loss_mode_switch": 0.0, "loss_total": 0.10281523317098618, "step": 1183 }, { "batch_size": 1, "epoch": 0.4732, "step": 1183, "tokens_per_device": 5303 }, { "epoch": 0.4732, "loss_ce": 0.04057500511407852, "loss_lvr": 0.4358232021331787, "loss_mode_switch": 0.0, "loss_total": 0.08415732532739639, "step": 1183 }, { "batch_size": 4, "epoch": 0.4732, "step": 1183, "tokens_per_device": 4052 }, { "epoch": 0.4732, "loss_ce": 0.21429462730884552, "loss_lvr": 1.0837968587875366, "loss_mode_switch": 0.0, "loss_total": 0.32267430424690247, "step": 1183 }, { "batch_size": 4, "epoch": 0.4732, "step": 1183, "tokens_per_device": 1292 }, { "epoch": 0.4732, "loss_ce": 0.5253663063049316, "loss_lvr": 1.1899385452270508, "loss_mode_switch": 0.0, "loss_total": 0.6443601846694946, "step": 1183 }, { "epoch": 0.4736, "grad_norm": 1.4839222431182861, "learning_rate": 5.668415273947876e-06, "loss": 0.2998, "step": 1184 }, { "batch_size": 1, "epoch": 0.4736, "step": 1184, "tokens_per_device": 5188 }, { "epoch": 0.4736, "loss_ce": 0.004517764784395695, "loss_lvr": 0.32030120491981506, "loss_mode_switch": 0.0, "loss_total": 0.036547884345054626, "step": 1184 }, { "batch_size": 4, "epoch": 0.4736, "step": 1184, "tokens_per_device": 5808 }, { "epoch": 0.4736, "loss_ce": 0.04801977798342705, "loss_lvr": 0.7730405926704407, "loss_mode_switch": 0.0, "loss_total": 0.1253238320350647, "step": 1184 }, { "batch_size": 4, "epoch": 0.4736, "step": 1184, "tokens_per_device": 3040 }, { "epoch": 0.4736, "loss_ce": 0.019416138529777527, "loss_lvr": 0.6820046305656433, "loss_mode_switch": 0.0, "loss_total": 0.08761660009622574, "step": 1184 }, { "batch_size": 4, "epoch": 0.4736, "step": 1184, "tokens_per_device": 5768 }, { "epoch": 0.4736, "loss_ce": 0.023464765399694443, "loss_lvr": 0.8182708024978638, "loss_mode_switch": 0.0, "loss_total": 0.10529184341430664, "step": 1184 }, { "batch_size": 4, "epoch": 0.4736, "step": 1184, "tokens_per_device": 11136 }, { "epoch": 0.4736, "loss_ce": 0.3803606927394867, "loss_lvr": 1.1605294942855835, "loss_mode_switch": 0.0, "loss_total": 0.4964136481285095, "step": 1184 }, { "batch_size": 1, "epoch": 0.4736, "step": 1184, "tokens_per_device": 4872 }, { "epoch": 0.4736, "loss_ce": 0.2315596640110016, "loss_lvr": 0.7904204726219177, "loss_mode_switch": 0.0, "loss_total": 0.31060171127319336, "step": 1184 }, { "batch_size": 4, "epoch": 0.4736, "step": 1184, "tokens_per_device": 1344 }, { "epoch": 0.4736, "loss_ce": 0.680051863193512, "loss_lvr": 0.9908975958824158, "loss_mode_switch": 0.0, "loss_total": 0.7791416049003601, "step": 1184 }, { "batch_size": 4, "epoch": 0.4736, "step": 1184, "tokens_per_device": 3908 }, { "epoch": 0.4736, "loss_ce": 0.0726056918501854, "loss_lvr": 0.6958167552947998, "loss_mode_switch": 0.0, "loss_total": 0.14218737185001373, "step": 1184 }, { "epoch": 0.474, "grad_norm": 1.1983273029327393, "learning_rate": 5.661995345453867e-06, "loss": 0.293, "step": 1185 }, { "batch_size": 4, "epoch": 0.474, "step": 1185, "tokens_per_device": 4540 }, { "epoch": 0.474, "loss_ce": 0.06271584331989288, "loss_lvr": 0.9096465706825256, "loss_mode_switch": 0.0, "loss_total": 0.15368050336837769, "step": 1185 }, { "batch_size": 1, "epoch": 0.474, "step": 1185, "tokens_per_device": 4875 }, { "epoch": 0.474, "loss_ce": 0.1166883185505867, "loss_lvr": 0.36933788657188416, "loss_mode_switch": 0.0, "loss_total": 0.153622105717659, "step": 1185 }, { "batch_size": 1, "epoch": 0.474, "step": 1185, "tokens_per_device": 4862 }, { "epoch": 0.474, "loss_ce": 0.028868375346064568, "loss_lvr": 0.369757741689682, "loss_mode_switch": 0.0, "loss_total": 0.06584414839744568, "step": 1185 }, { "batch_size": 4, "epoch": 0.474, "step": 1185, "tokens_per_device": 4456 }, { "epoch": 0.474, "loss_ce": 0.2531777322292328, "loss_lvr": 0.7766481041908264, "loss_mode_switch": 0.0, "loss_total": 0.3308425545692444, "step": 1185 }, { "batch_size": 4, "epoch": 0.474, "step": 1185, "tokens_per_device": 5464 }, { "epoch": 0.474, "loss_ce": 0.1341332197189331, "loss_lvr": 0.5450916290283203, "loss_mode_switch": 0.0, "loss_total": 0.18864238262176514, "step": 1185 }, { "batch_size": 4, "epoch": 0.474, "step": 1185, "tokens_per_device": 5008 }, { "epoch": 0.474, "loss_ce": 0.21118533611297607, "loss_lvr": 0.8607178330421448, "loss_mode_switch": 0.0, "loss_total": 0.29725712537765503, "step": 1185 }, { "batch_size": 1, "epoch": 0.474, "step": 1185, "tokens_per_device": 5001 }, { "epoch": 0.474, "loss_ce": 0.4895569384098053, "loss_lvr": 0.9434534907341003, "loss_mode_switch": 0.0, "loss_total": 0.5839022994041443, "step": 1185 }, { "batch_size": 4, "epoch": 0.474, "step": 1185, "tokens_per_device": 5656 }, { "epoch": 0.474, "loss_ce": 0.8956539630889893, "loss_lvr": 0.8923028707504272, "loss_mode_switch": 0.0, "loss_total": 0.9848842620849609, "step": 1185 }, { "epoch": 0.4744, "grad_norm": 1.6200742721557617, "learning_rate": 5.655574305916173e-06, "loss": 0.3252, "step": 1186 }, { "batch_size": 4, "epoch": 0.4744, "step": 1186, "tokens_per_device": 1456 }, { "epoch": 0.4744, "loss_ce": 0.49629151821136475, "loss_lvr": 1.0091947317123413, "loss_mode_switch": 0.0, "loss_total": 0.5972110033035278, "step": 1186 }, { "batch_size": 4, "epoch": 0.4744, "step": 1186, "tokens_per_device": 8472 }, { "epoch": 0.4744, "loss_ce": 0.21268658339977264, "loss_lvr": 0.587042510509491, "loss_mode_switch": 0.0, "loss_total": 0.271390825510025, "step": 1186 }, { "batch_size": 4, "epoch": 0.4744, "step": 1186, "tokens_per_device": 5396 }, { "epoch": 0.4744, "loss_ce": 0.42135849595069885, "loss_lvr": 0.7944180369377136, "loss_mode_switch": 0.0, "loss_total": 0.5008003115653992, "step": 1186 }, { "batch_size": 4, "epoch": 0.4744, "step": 1186, "tokens_per_device": 3968 }, { "epoch": 0.4744, "loss_ce": 0.1143663302063942, "loss_lvr": 0.9154290556907654, "loss_mode_switch": 0.0, "loss_total": 0.20590923726558685, "step": 1186 }, { "batch_size": 4, "epoch": 0.4744, "step": 1186, "tokens_per_device": 3760 }, { "epoch": 0.4744, "loss_ce": 0.35370340943336487, "loss_lvr": 1.055184006690979, "loss_mode_switch": 0.0, "loss_total": 0.45922181010246277, "step": 1186 }, { "batch_size": 4, "epoch": 0.4744, "step": 1186, "tokens_per_device": 1184 }, { "epoch": 0.4744, "loss_ce": 0.2866879403591156, "loss_lvr": 0.7843891382217407, "loss_mode_switch": 0.0, "loss_total": 0.3651268482208252, "step": 1186 }, { "batch_size": 4, "epoch": 0.4744, "step": 1186, "tokens_per_device": 4364 }, { "epoch": 0.4744, "loss_ce": 0.06859298795461655, "loss_lvr": 0.7577086687088013, "loss_mode_switch": 0.0, "loss_total": 0.14436385035514832, "step": 1186 }, { "batch_size": 4, "epoch": 0.4744, "step": 1186, "tokens_per_device": 3584 }, { "epoch": 0.4744, "loss_ce": 0.26367509365081787, "loss_lvr": 0.8323085904121399, "loss_mode_switch": 0.0, "loss_total": 0.3469059467315674, "step": 1186 }, { "epoch": 0.4748, "grad_norm": 1.3909168243408203, "learning_rate": 5.64915216611139e-06, "loss": 0.3338, "step": 1187 }, { "batch_size": 1, "epoch": 0.4748, "step": 1187, "tokens_per_device": 4904 }, { "epoch": 0.4748, "loss_ce": 0.004427686333656311, "loss_lvr": 0.21507528424263, "loss_mode_switch": 0.0, "loss_total": 0.0259352158755064, "step": 1187 }, { "batch_size": 4, "epoch": 0.4748, "step": 1187, "tokens_per_device": 4088 }, { "epoch": 0.4748, "loss_ce": 0.3206271231174469, "loss_lvr": 0.7743127942085266, "loss_mode_switch": 0.0, "loss_total": 0.3980584144592285, "step": 1187 }, { "batch_size": 4, "epoch": 0.4748, "step": 1187, "tokens_per_device": 2620 }, { "epoch": 0.4748, "loss_ce": 0.26685845851898193, "loss_lvr": 0.9232771992683411, "loss_mode_switch": 0.0, "loss_total": 0.35918617248535156, "step": 1187 }, { "batch_size": 4, "epoch": 0.4748, "step": 1187, "tokens_per_device": 2648 }, { "epoch": 0.4748, "loss_ce": 0.22826918959617615, "loss_lvr": 0.9010835886001587, "loss_mode_switch": 0.0, "loss_total": 0.3183775544166565, "step": 1187 }, { "batch_size": 1, "epoch": 0.4748, "step": 1187, "tokens_per_device": 4993 }, { "epoch": 0.4748, "loss_ce": 0.01145605742931366, "loss_lvr": 0.3955564498901367, "loss_mode_switch": 0.0, "loss_total": 0.05101170390844345, "step": 1187 }, { "batch_size": 1, "epoch": 0.4748, "step": 1187, "tokens_per_device": 7368 }, { "epoch": 0.4748, "loss_ce": 0.1154688373208046, "loss_lvr": 0.25394466519355774, "loss_mode_switch": 0.0, "loss_total": 0.140863299369812, "step": 1187 }, { "batch_size": 4, "epoch": 0.4748, "step": 1187, "tokens_per_device": 4352 }, { "epoch": 0.4748, "loss_ce": 0.1273774802684784, "loss_lvr": 0.8700103759765625, "loss_mode_switch": 0.0, "loss_total": 0.21437852084636688, "step": 1187 }, { "batch_size": 4, "epoch": 0.4748, "step": 1187, "tokens_per_device": 2708 }, { "epoch": 0.4748, "loss_ce": 0.2099657952785492, "loss_lvr": 0.8406130075454712, "loss_mode_switch": 0.0, "loss_total": 0.29402709007263184, "step": 1187 }, { "epoch": 0.4752, "grad_norm": 1.4588522911071777, "learning_rate": 5.64272893681796e-06, "loss": 0.2965, "step": 1188 }, { "batch_size": 4, "epoch": 0.4752, "step": 1188, "tokens_per_device": 1432 }, { "epoch": 0.4752, "loss_ce": 0.5304921865463257, "loss_lvr": 0.8592312335968018, "loss_mode_switch": 0.0, "loss_total": 0.6164153218269348, "step": 1188 }, { "batch_size": 4, "epoch": 0.4752, "step": 1188, "tokens_per_device": 7556 }, { "epoch": 0.4752, "loss_ce": 0.14347536861896515, "loss_lvr": 0.7289015650749207, "loss_mode_switch": 0.0, "loss_total": 0.2163655161857605, "step": 1188 }, { "batch_size": 4, "epoch": 0.4752, "step": 1188, "tokens_per_device": 2660 }, { "epoch": 0.4752, "loss_ce": 0.434598445892334, "loss_lvr": 0.7552955150604248, "loss_mode_switch": 0.0, "loss_total": 0.5101280212402344, "step": 1188 }, { "batch_size": 1, "epoch": 0.4752, "step": 1188, "tokens_per_device": 5123 }, { "epoch": 0.4752, "loss_ce": 0.08354946970939636, "loss_lvr": 0.353802889585495, "loss_mode_switch": 0.0, "loss_total": 0.11892975866794586, "step": 1188 }, { "batch_size": 1, "epoch": 0.4752, "step": 1188, "tokens_per_device": 5261 }, { "epoch": 0.4752, "loss_ce": 0.0015499584842473269, "loss_lvr": 0.36728158593177795, "loss_mode_switch": 0.0, "loss_total": 0.03827811777591705, "step": 1188 }, { "batch_size": 4, "epoch": 0.4752, "step": 1188, "tokens_per_device": 7400 }, { "epoch": 0.4752, "loss_ce": 0.14205916225910187, "loss_lvr": 0.6950165033340454, "loss_mode_switch": 0.0, "loss_total": 0.21156081557273865, "step": 1188 }, { "batch_size": 1, "epoch": 0.4752, "step": 1188, "tokens_per_device": 4883 }, { "epoch": 0.4752, "loss_ce": 0.001537061994895339, "loss_lvr": 0.2642531394958496, "loss_mode_switch": 0.0, "loss_total": 0.027962377294898033, "step": 1188 }, { "batch_size": 4, "epoch": 0.4752, "step": 1188, "tokens_per_device": 6000 }, { "epoch": 0.4752, "loss_ce": 0.07910117506980896, "loss_lvr": 0.818651556968689, "loss_mode_switch": 0.0, "loss_total": 0.16096633672714233, "step": 1188 }, { "epoch": 0.4756, "grad_norm": 1.4426130056381226, "learning_rate": 5.636304628816153e-06, "loss": 0.2903, "step": 1189 }, { "batch_size": 4, "epoch": 0.4756, "step": 1189, "tokens_per_device": 2624 }, { "epoch": 0.4756, "loss_ce": 0.056475136429071426, "loss_lvr": 0.8710243701934814, "loss_mode_switch": 0.0, "loss_total": 0.14357757568359375, "step": 1189 }, { "batch_size": 1, "epoch": 0.4756, "step": 1189, "tokens_per_device": 4890 }, { "epoch": 0.4756, "loss_ce": 0.00035348249366506934, "loss_lvr": 0.5703259706497192, "loss_mode_switch": 0.0, "loss_total": 0.05738607794046402, "step": 1189 }, { "batch_size": 4, "epoch": 0.4756, "step": 1189, "tokens_per_device": 4916 }, { "epoch": 0.4756, "loss_ce": 0.37904033064842224, "loss_lvr": 1.026682734489441, "loss_mode_switch": 0.0, "loss_total": 0.4817086160182953, "step": 1189 }, { "batch_size": 4, "epoch": 0.4756, "step": 1189, "tokens_per_device": 3956 }, { "epoch": 0.4756, "loss_ce": 0.30689355731010437, "loss_lvr": 0.7439401745796204, "loss_mode_switch": 0.0, "loss_total": 0.3812875747680664, "step": 1189 }, { "batch_size": 4, "epoch": 0.4756, "step": 1189, "tokens_per_device": 1340 }, { "epoch": 0.4756, "loss_ce": 0.14793752133846283, "loss_lvr": 1.1599754095077515, "loss_mode_switch": 0.0, "loss_total": 0.26393505930900574, "step": 1189 }, { "batch_size": 4, "epoch": 0.4756, "step": 1189, "tokens_per_device": 5100 }, { "epoch": 0.4756, "loss_ce": 0.2273644357919693, "loss_lvr": 0.8886609673500061, "loss_mode_switch": 0.0, "loss_total": 0.31623053550720215, "step": 1189 }, { "batch_size": 4, "epoch": 0.4756, "step": 1189, "tokens_per_device": 5884 }, { "epoch": 0.4756, "loss_ce": 0.01756029762327671, "loss_lvr": 0.8413712978363037, "loss_mode_switch": 0.0, "loss_total": 0.10169743001461029, "step": 1189 }, { "batch_size": 4, "epoch": 0.4756, "step": 1189, "tokens_per_device": 3408 }, { "epoch": 0.4756, "loss_ce": 0.09817790985107422, "loss_lvr": 0.9743383526802063, "loss_mode_switch": 0.0, "loss_total": 0.19561174511909485, "step": 1189 }, { "epoch": 0.476, "grad_norm": 1.218389630317688, "learning_rate": 5.629879252888046e-06, "loss": 0.2643, "step": 1190 }, { "batch_size": 4, "epoch": 0.476, "step": 1190, "tokens_per_device": 4040 }, { "epoch": 0.476, "loss_ce": 0.13936327397823334, "loss_lvr": 0.9760921001434326, "loss_mode_switch": 0.0, "loss_total": 0.23697248101234436, "step": 1190 }, { "batch_size": 4, "epoch": 0.476, "step": 1190, "tokens_per_device": 4324 }, { "epoch": 0.476, "loss_ce": 0.16978254914283752, "loss_lvr": 0.8727537393569946, "loss_mode_switch": 0.0, "loss_total": 0.25705793499946594, "step": 1190 }, { "batch_size": 4, "epoch": 0.476, "step": 1190, "tokens_per_device": 4932 }, { "epoch": 0.476, "loss_ce": 0.36887267231941223, "loss_lvr": 1.0028295516967773, "loss_mode_switch": 0.0, "loss_total": 0.4691556394100189, "step": 1190 }, { "batch_size": 1, "epoch": 0.476, "step": 1190, "tokens_per_device": 4931 }, { "epoch": 0.476, "loss_ce": 0.014428161084651947, "loss_lvr": 0.5512005686759949, "loss_mode_switch": 0.0, "loss_total": 0.06954821944236755, "step": 1190 }, { "batch_size": 4, "epoch": 0.476, "step": 1190, "tokens_per_device": 4756 }, { "epoch": 0.476, "loss_ce": 0.04941224679350853, "loss_lvr": 0.7641943693161011, "loss_mode_switch": 0.0, "loss_total": 0.12583167850971222, "step": 1190 }, { "batch_size": 1, "epoch": 0.476, "step": 1190, "tokens_per_device": 4909 }, { "epoch": 0.476, "loss_ce": 0.09905007481575012, "loss_lvr": 0.23770643770694733, "loss_mode_switch": 0.0, "loss_total": 0.12282072007656097, "step": 1190 }, { "batch_size": 4, "epoch": 0.476, "step": 1190, "tokens_per_device": 5016 }, { "epoch": 0.476, "loss_ce": 0.5079154968261719, "loss_lvr": 0.8939600586891174, "loss_mode_switch": 0.0, "loss_total": 0.5973114967346191, "step": 1190 }, { "batch_size": 4, "epoch": 0.476, "step": 1190, "tokens_per_device": 3380 }, { "epoch": 0.476, "loss_ce": 0.8071382641792297, "loss_lvr": 0.8956919312477112, "loss_mode_switch": 0.0, "loss_total": 0.8967074751853943, "step": 1190 }, { "epoch": 0.4764, "grad_norm": 1.6548100709915161, "learning_rate": 5.623452819817514e-06, "loss": 0.3117, "step": 1191 }, { "batch_size": 4, "epoch": 0.4764, "step": 1191, "tokens_per_device": 2752 }, { "epoch": 0.4764, "loss_ce": 0.2365199774503708, "loss_lvr": 1.1041873693466187, "loss_mode_switch": 0.0, "loss_total": 0.34693872928619385, "step": 1191 }, { "batch_size": 4, "epoch": 0.4764, "step": 1191, "tokens_per_device": 1420 }, { "epoch": 0.4764, "loss_ce": 0.4718465209007263, "loss_lvr": 0.9379723072052002, "loss_mode_switch": 0.0, "loss_total": 0.5656437277793884, "step": 1191 }, { "batch_size": 4, "epoch": 0.4764, "step": 1191, "tokens_per_device": 2772 }, { "epoch": 0.4764, "loss_ce": 0.4735596776008606, "loss_lvr": 0.8734831213951111, "loss_mode_switch": 0.0, "loss_total": 0.5609079599380493, "step": 1191 }, { "batch_size": 4, "epoch": 0.4764, "step": 1191, "tokens_per_device": 5684 }, { "epoch": 0.4764, "loss_ce": 0.08461654931306839, "loss_lvr": 0.7736207842826843, "loss_mode_switch": 0.0, "loss_total": 0.16197863221168518, "step": 1191 }, { "batch_size": 4, "epoch": 0.4764, "step": 1191, "tokens_per_device": 3896 }, { "epoch": 0.4764, "loss_ce": 0.19557255506515503, "loss_lvr": 0.9394754767417908, "loss_mode_switch": 0.0, "loss_total": 0.28952011466026306, "step": 1191 }, { "batch_size": 4, "epoch": 0.4764, "step": 1191, "tokens_per_device": 4272 }, { "epoch": 0.4764, "loss_ce": 0.14764662086963654, "loss_lvr": 0.8402978777885437, "loss_mode_switch": 0.0, "loss_total": 0.2316763997077942, "step": 1191 }, { "batch_size": 1, "epoch": 0.4764, "step": 1191, "tokens_per_device": 4914 }, { "epoch": 0.4764, "loss_ce": 0.2092529535293579, "loss_lvr": 0.20788803696632385, "loss_mode_switch": 0.0, "loss_total": 0.2300417572259903, "step": 1191 }, { "batch_size": 1, "epoch": 0.4764, "step": 1191, "tokens_per_device": 4814 }, { "epoch": 0.4764, "loss_ce": 0.009450326673686504, "loss_lvr": 0.331745445728302, "loss_mode_switch": 0.0, "loss_total": 0.04262487217783928, "step": 1191 }, { "epoch": 0.4768, "grad_norm": 1.9704383611679077, "learning_rate": 5.617025340390203e-06, "loss": 0.2571, "step": 1192 }, { "batch_size": 4, "epoch": 0.4768, "step": 1192, "tokens_per_device": 4424 }, { "epoch": 0.4768, "loss_ce": 0.48699092864990234, "loss_lvr": 0.8954570293426514, "loss_mode_switch": 0.0, "loss_total": 0.5765366554260254, "step": 1192 }, { "batch_size": 1, "epoch": 0.4768, "step": 1192, "tokens_per_device": 5124 }, { "epoch": 0.4768, "loss_ce": 0.25462499260902405, "loss_lvr": 0.3311119079589844, "loss_mode_switch": 0.0, "loss_total": 0.287736177444458, "step": 1192 }, { "batch_size": 1, "epoch": 0.4768, "step": 1192, "tokens_per_device": 4878 }, { "epoch": 0.4768, "loss_ce": 0.0025400021113455296, "loss_lvr": 0.4292326867580414, "loss_mode_switch": 0.0, "loss_total": 0.04546327143907547, "step": 1192 }, { "batch_size": 1, "epoch": 0.4768, "step": 1192, "tokens_per_device": 6566 }, { "epoch": 0.4768, "loss_ce": 0.0007353996043093503, "loss_lvr": 0.29740428924560547, "loss_mode_switch": 0.0, "loss_total": 0.030475828796625137, "step": 1192 }, { "batch_size": 4, "epoch": 0.4768, "step": 1192, "tokens_per_device": 14720 }, { "epoch": 0.4768, "loss_ce": 0.35441046953201294, "loss_lvr": 0.7124771475791931, "loss_mode_switch": 0.0, "loss_total": 0.4256581962108612, "step": 1192 }, { "batch_size": 4, "epoch": 0.4768, "step": 1192, "tokens_per_device": 2664 }, { "epoch": 0.4768, "loss_ce": 0.6061453223228455, "loss_lvr": 0.7677539587020874, "loss_mode_switch": 0.0, "loss_total": 0.6829206943511963, "step": 1192 }, { "batch_size": 1, "epoch": 0.4768, "step": 1192, "tokens_per_device": 4861 }, { "epoch": 0.4768, "loss_ce": 0.020352447405457497, "loss_lvr": 0.45154446363449097, "loss_mode_switch": 0.0, "loss_total": 0.06550689786672592, "step": 1192 }, { "batch_size": 4, "epoch": 0.4768, "step": 1192, "tokens_per_device": 3332 }, { "epoch": 0.4768, "loss_ce": 0.17797629535198212, "loss_lvr": 0.9224890470504761, "loss_mode_switch": 0.0, "loss_total": 0.2702251970767975, "step": 1192 }, { "epoch": 0.4772, "grad_norm": 1.2384637594223022, "learning_rate": 5.610596825393516e-06, "loss": 0.2764, "step": 1193 }, { "batch_size": 4, "epoch": 0.4772, "step": 1193, "tokens_per_device": 4204 }, { "epoch": 0.4772, "loss_ce": 0.16039925813674927, "loss_lvr": 0.7886924743652344, "loss_mode_switch": 0.0, "loss_total": 0.23926851153373718, "step": 1193 }, { "batch_size": 4, "epoch": 0.4772, "step": 1193, "tokens_per_device": 3360 }, { "epoch": 0.4772, "loss_ce": 0.10592702031135559, "loss_lvr": 0.7557319402694702, "loss_mode_switch": 0.0, "loss_total": 0.18150022625923157, "step": 1193 }, { "batch_size": 4, "epoch": 0.4772, "step": 1193, "tokens_per_device": 7920 }, { "epoch": 0.4772, "loss_ce": 0.4195683002471924, "loss_lvr": 0.8789070844650269, "loss_mode_switch": 0.0, "loss_total": 0.5074589848518372, "step": 1193 }, { "batch_size": 4, "epoch": 0.4772, "step": 1193, "tokens_per_device": 1468 }, { "epoch": 0.4772, "loss_ce": 0.2547410726547241, "loss_lvr": 0.9977947473526001, "loss_mode_switch": 0.0, "loss_total": 0.3545205593109131, "step": 1193 }, { "batch_size": 4, "epoch": 0.4772, "step": 1193, "tokens_per_device": 1408 }, { "epoch": 0.4772, "loss_ce": 0.41895779967308044, "loss_lvr": 0.8462271690368652, "loss_mode_switch": 0.0, "loss_total": 0.5035805106163025, "step": 1193 }, { "batch_size": 4, "epoch": 0.4772, "step": 1193, "tokens_per_device": 1208 }, { "epoch": 0.4772, "loss_ce": 0.5240106582641602, "loss_lvr": 1.0402792692184448, "loss_mode_switch": 0.0, "loss_total": 0.6280385851860046, "step": 1193 }, { "batch_size": 4, "epoch": 0.4772, "step": 1193, "tokens_per_device": 3792 }, { "epoch": 0.4772, "loss_ce": 0.17783120274543762, "loss_lvr": 1.3090920448303223, "loss_mode_switch": 0.0, "loss_total": 0.30874040722846985, "step": 1193 }, { "batch_size": 4, "epoch": 0.4772, "step": 1193, "tokens_per_device": 3816 }, { "epoch": 0.4772, "loss_ce": 0.15929222106933594, "loss_lvr": 1.0664480924606323, "loss_mode_switch": 0.0, "loss_total": 0.26593703031539917, "step": 1193 }, { "epoch": 0.4776, "grad_norm": 1.3196910619735718, "learning_rate": 5.604167285616593e-06, "loss": 0.3221, "step": 1194 }, { "batch_size": 4, "epoch": 0.4776, "step": 1194, "tokens_per_device": 4828 }, { "epoch": 0.4776, "loss_ce": 0.05575107783079147, "loss_lvr": 0.820122480392456, "loss_mode_switch": 0.0, "loss_total": 0.13776332139968872, "step": 1194 }, { "batch_size": 4, "epoch": 0.4776, "step": 1194, "tokens_per_device": 3824 }, { "epoch": 0.4776, "loss_ce": 0.2168131172657013, "loss_lvr": 0.9641864895820618, "loss_mode_switch": 0.0, "loss_total": 0.31323176622390747, "step": 1194 }, { "batch_size": 1, "epoch": 0.4776, "step": 1194, "tokens_per_device": 4980 }, { "epoch": 0.4776, "loss_ce": 0.36390137672424316, "loss_lvr": 0.20985622704029083, "loss_mode_switch": 0.0, "loss_total": 0.3848870098590851, "step": 1194 }, { "batch_size": 4, "epoch": 0.4776, "step": 1194, "tokens_per_device": 2540 }, { "epoch": 0.4776, "loss_ce": 0.45618048310279846, "loss_lvr": 1.0032821893692017, "loss_mode_switch": 0.0, "loss_total": 0.5565087199211121, "step": 1194 }, { "batch_size": 4, "epoch": 0.4776, "step": 1194, "tokens_per_device": 4344 }, { "epoch": 0.4776, "loss_ce": 0.6395138502120972, "loss_lvr": 1.1917643547058105, "loss_mode_switch": 0.0, "loss_total": 0.7586902976036072, "step": 1194 }, { "batch_size": 1, "epoch": 0.4776, "step": 1194, "tokens_per_device": 4794 }, { "epoch": 0.4776, "loss_ce": 0.0003553895221557468, "loss_lvr": 0.44170546531677246, "loss_mode_switch": 0.0, "loss_total": 0.044525936245918274, "step": 1194 }, { "batch_size": 4, "epoch": 0.4776, "step": 1194, "tokens_per_device": 4236 }, { "epoch": 0.4776, "loss_ce": 0.2156018316745758, "loss_lvr": 0.8839285969734192, "loss_mode_switch": 0.0, "loss_total": 0.30399468541145325, "step": 1194 }, { "batch_size": 4, "epoch": 0.4776, "step": 1194, "tokens_per_device": 5348 }, { "epoch": 0.4776, "loss_ce": 0.029574809595942497, "loss_lvr": 0.80402672290802, "loss_mode_switch": 0.0, "loss_total": 0.10997748374938965, "step": 1194 }, { "epoch": 0.478, "grad_norm": 1.314452886581421, "learning_rate": 5.597736731850295e-06, "loss": 0.307, "step": 1195 }, { "batch_size": 4, "epoch": 0.478, "step": 1195, "tokens_per_device": 4512 }, { "epoch": 0.478, "loss_ce": 0.30092674493789673, "loss_lvr": 0.6601333618164062, "loss_mode_switch": 0.0, "loss_total": 0.36694008111953735, "step": 1195 }, { "batch_size": 1, "epoch": 0.478, "step": 1195, "tokens_per_device": 4974 }, { "epoch": 0.478, "loss_ce": 0.015755172818899155, "loss_lvr": 0.1940246820449829, "loss_mode_switch": 0.0, "loss_total": 0.035157643258571625, "step": 1195 }, { "batch_size": 1, "epoch": 0.478, "step": 1195, "tokens_per_device": 5143 }, { "epoch": 0.478, "loss_ce": 0.0007809103699401021, "loss_lvr": 0.4724271893501282, "loss_mode_switch": 0.0, "loss_total": 0.04802362993359566, "step": 1195 }, { "batch_size": 1, "epoch": 0.478, "step": 1195, "tokens_per_device": 6609 }, { "epoch": 0.478, "loss_ce": 0.00033665975206531584, "loss_lvr": 0.4731247127056122, "loss_mode_switch": 0.0, "loss_total": 0.04764913022518158, "step": 1195 }, { "batch_size": 1, "epoch": 0.478, "step": 1195, "tokens_per_device": 5036 }, { "epoch": 0.478, "loss_ce": 0.06537467986345291, "loss_lvr": 0.4834216833114624, "loss_mode_switch": 0.0, "loss_total": 0.11371684819459915, "step": 1195 }, { "batch_size": 1, "epoch": 0.478, "step": 1195, "tokens_per_device": 4760 }, { "epoch": 0.478, "loss_ce": 0.020068803802132607, "loss_lvr": 0.37396562099456787, "loss_mode_switch": 0.0, "loss_total": 0.05746536701917648, "step": 1195 }, { "batch_size": 4, "epoch": 0.478, "step": 1195, "tokens_per_device": 4708 }, { "epoch": 0.478, "loss_ce": 0.17082823812961578, "loss_lvr": 0.7149679660797119, "loss_mode_switch": 0.0, "loss_total": 0.2423250377178192, "step": 1195 }, { "batch_size": 1, "epoch": 0.478, "step": 1195, "tokens_per_device": 4953 }, { "epoch": 0.478, "loss_ce": 0.03864236921072006, "loss_lvr": 0.3778010904788971, "loss_mode_switch": 0.0, "loss_total": 0.07642248272895813, "step": 1195 }, { "epoch": 0.4784, "grad_norm": 1.1461918354034424, "learning_rate": 5.591305174887185e-06, "loss": 0.2621, "step": 1196 }, { "batch_size": 4, "epoch": 0.4784, "step": 1196, "tokens_per_device": 4376 }, { "epoch": 0.4784, "loss_ce": 0.09838299453258514, "loss_lvr": 1.0752191543579102, "loss_mode_switch": 0.0, "loss_total": 0.20590490102767944, "step": 1196 }, { "batch_size": 4, "epoch": 0.4784, "step": 1196, "tokens_per_device": 3116 }, { "epoch": 0.4784, "loss_ce": 0.31322145462036133, "loss_lvr": 1.0247758626937866, "loss_mode_switch": 0.0, "loss_total": 0.4156990349292755, "step": 1196 }, { "batch_size": 4, "epoch": 0.4784, "step": 1196, "tokens_per_device": 2668 }, { "epoch": 0.4784, "loss_ce": 0.169399693608284, "loss_lvr": 0.8807752132415771, "loss_mode_switch": 0.0, "loss_total": 0.2574772238731384, "step": 1196 }, { "batch_size": 4, "epoch": 0.4784, "step": 1196, "tokens_per_device": 4296 }, { "epoch": 0.4784, "loss_ce": 0.3932530879974365, "loss_lvr": 0.9200636744499207, "loss_mode_switch": 0.0, "loss_total": 0.485259473323822, "step": 1196 }, { "batch_size": 4, "epoch": 0.4784, "step": 1196, "tokens_per_device": 5300 }, { "epoch": 0.4784, "loss_ce": 0.16359707713127136, "loss_lvr": 0.9909408092498779, "loss_mode_switch": 0.0, "loss_total": 0.2626911699771881, "step": 1196 }, { "batch_size": 4, "epoch": 0.4784, "step": 1196, "tokens_per_device": 4320 }, { "epoch": 0.4784, "loss_ce": 0.14627854526042938, "loss_lvr": 0.8154686689376831, "loss_mode_switch": 0.0, "loss_total": 0.22782540321350098, "step": 1196 }, { "batch_size": 4, "epoch": 0.4784, "step": 1196, "tokens_per_device": 5212 }, { "epoch": 0.4784, "loss_ce": 0.1362190991640091, "loss_lvr": 0.8095578551292419, "loss_mode_switch": 0.0, "loss_total": 0.21717488765716553, "step": 1196 }, { "batch_size": 4, "epoch": 0.4784, "step": 1196, "tokens_per_device": 5560 }, { "epoch": 0.4784, "loss_ce": 0.0014507850864902139, "loss_lvr": 0.9233024716377258, "loss_mode_switch": 0.0, "loss_total": 0.09378103166818619, "step": 1196 }, { "epoch": 0.4788, "grad_norm": 1.16895592212677, "learning_rate": 5.58487262552151e-06, "loss": 0.2856, "step": 1197 }, { "batch_size": 4, "epoch": 0.4788, "step": 1197, "tokens_per_device": 2640 }, { "epoch": 0.4788, "loss_ce": 0.3008791208267212, "loss_lvr": 0.8798336982727051, "loss_mode_switch": 0.0, "loss_total": 0.3888624906539917, "step": 1197 }, { "batch_size": 4, "epoch": 0.4788, "step": 1197, "tokens_per_device": 5148 }, { "epoch": 0.4788, "loss_ce": 0.04510626569390297, "loss_lvr": 0.7885844707489014, "loss_mode_switch": 0.0, "loss_total": 0.12396471202373505, "step": 1197 }, { "batch_size": 4, "epoch": 0.4788, "step": 1197, "tokens_per_device": 4640 }, { "epoch": 0.4788, "loss_ce": 0.1989128440618515, "loss_lvr": 1.024808645248413, "loss_mode_switch": 0.0, "loss_total": 0.3013937175273895, "step": 1197 }, { "batch_size": 4, "epoch": 0.4788, "step": 1197, "tokens_per_device": 1304 }, { "epoch": 0.4788, "loss_ce": 0.018024494871497154, "loss_lvr": 0.9513944983482361, "loss_mode_switch": 0.0, "loss_total": 0.11316394805908203, "step": 1197 }, { "batch_size": 4, "epoch": 0.4788, "step": 1197, "tokens_per_device": 4944 }, { "epoch": 0.4788, "loss_ce": 0.23997844755649567, "loss_lvr": 0.7476099729537964, "loss_mode_switch": 0.0, "loss_total": 0.3147394359111786, "step": 1197 }, { "batch_size": 4, "epoch": 0.4788, "step": 1197, "tokens_per_device": 3744 }, { "epoch": 0.4788, "loss_ce": 0.36759552359580994, "loss_lvr": 0.8815022110939026, "loss_mode_switch": 0.0, "loss_total": 0.45574575662612915, "step": 1197 }, { "batch_size": 1, "epoch": 0.4788, "step": 1197, "tokens_per_device": 6255 }, { "epoch": 0.4788, "loss_ce": 0.2784709334373474, "loss_lvr": 0.41724416613578796, "loss_mode_switch": 0.0, "loss_total": 0.32019534707069397, "step": 1197 }, { "batch_size": 4, "epoch": 0.4788, "step": 1197, "tokens_per_device": 6180 }, { "epoch": 0.4788, "loss_ce": 0.12873154878616333, "loss_lvr": 0.639224112033844, "loss_mode_switch": 0.0, "loss_total": 0.19265395402908325, "step": 1197 }, { "epoch": 0.4792, "grad_norm": 1.2606141567230225, "learning_rate": 5.5784390945491784e-06, "loss": 0.3181, "step": 1198 }, { "batch_size": 4, "epoch": 0.4792, "step": 1198, "tokens_per_device": 3624 }, { "epoch": 0.4792, "loss_ce": 0.36282795667648315, "loss_lvr": 0.8961144685745239, "loss_mode_switch": 0.0, "loss_total": 0.45243939757347107, "step": 1198 }, { "batch_size": 4, "epoch": 0.4792, "step": 1198, "tokens_per_device": 7260 }, { "epoch": 0.4792, "loss_ce": 0.035239335149526596, "loss_lvr": 0.7432675957679749, "loss_mode_switch": 0.0, "loss_total": 0.1095660924911499, "step": 1198 }, { "batch_size": 4, "epoch": 0.4792, "step": 1198, "tokens_per_device": 4424 }, { "epoch": 0.4792, "loss_ce": 0.7361338138580322, "loss_lvr": 0.7927187085151672, "loss_mode_switch": 0.0, "loss_total": 0.8154056668281555, "step": 1198 }, { "batch_size": 1, "epoch": 0.4792, "step": 1198, "tokens_per_device": 4854 }, { "epoch": 0.4792, "loss_ce": 0.016982456669211388, "loss_lvr": 0.23041343688964844, "loss_mode_switch": 0.0, "loss_total": 0.0400237999856472, "step": 1198 }, { "batch_size": 4, "epoch": 0.4792, "step": 1198, "tokens_per_device": 1456 }, { "epoch": 0.4792, "loss_ce": 0.31357550621032715, "loss_lvr": 1.067297339439392, "loss_mode_switch": 0.0, "loss_total": 0.4203052520751953, "step": 1198 }, { "batch_size": 1, "epoch": 0.4792, "step": 1198, "tokens_per_device": 4822 }, { "epoch": 0.4792, "loss_ce": 0.17881974577903748, "loss_lvr": 0.6123332381248474, "loss_mode_switch": 0.0, "loss_total": 0.24005307257175446, "step": 1198 }, { "batch_size": 1, "epoch": 0.4792, "step": 1198, "tokens_per_device": 4273 }, { "epoch": 0.4792, "loss_ce": 1.071234941482544, "loss_lvr": 0.6405642628669739, "loss_mode_switch": 0.0, "loss_total": 1.135291337966919, "step": 1198 }, { "batch_size": 4, "epoch": 0.4792, "step": 1198, "tokens_per_device": 4768 }, { "epoch": 0.4792, "loss_ce": 0.20110513269901276, "loss_lvr": 0.7482956647872925, "loss_mode_switch": 0.0, "loss_total": 0.27593469619750977, "step": 1198 }, { "epoch": 0.4796, "grad_norm": 1.239230990409851, "learning_rate": 5.572004592767755e-06, "loss": 0.2827, "step": 1199 }, { "batch_size": 4, "epoch": 0.4796, "step": 1199, "tokens_per_device": 10576 }, { "epoch": 0.4796, "loss_ce": 0.38075825572013855, "loss_lvr": 1.3178586959838867, "loss_mode_switch": 0.0, "loss_total": 0.5125441551208496, "step": 1199 }, { "batch_size": 4, "epoch": 0.4796, "step": 1199, "tokens_per_device": 2864 }, { "epoch": 0.4796, "loss_ce": 0.6858187317848206, "loss_lvr": 0.7826880216598511, "loss_mode_switch": 0.0, "loss_total": 0.7640875577926636, "step": 1199 }, { "batch_size": 4, "epoch": 0.4796, "step": 1199, "tokens_per_device": 1976 }, { "epoch": 0.4796, "loss_ce": 0.6720824241638184, "loss_lvr": 1.0175703763961792, "loss_mode_switch": 0.0, "loss_total": 0.7738394737243652, "step": 1199 }, { "batch_size": 4, "epoch": 0.4796, "step": 1199, "tokens_per_device": 5592 }, { "epoch": 0.4796, "loss_ce": 0.05094167962670326, "loss_lvr": 0.7779450416564941, "loss_mode_switch": 0.0, "loss_total": 0.12873618304729462, "step": 1199 }, { "batch_size": 4, "epoch": 0.4796, "step": 1199, "tokens_per_device": 2764 }, { "epoch": 0.4796, "loss_ce": 0.4050334692001343, "loss_lvr": 0.9407066106796265, "loss_mode_switch": 0.0, "loss_total": 0.4991041421890259, "step": 1199 }, { "batch_size": 4, "epoch": 0.4796, "step": 1199, "tokens_per_device": 2672 }, { "epoch": 0.4796, "loss_ce": 0.12327636033296585, "loss_lvr": 0.895684003829956, "loss_mode_switch": 0.0, "loss_total": 0.21284475922584534, "step": 1199 }, { "batch_size": 4, "epoch": 0.4796, "step": 1199, "tokens_per_device": 6856 }, { "epoch": 0.4796, "loss_ce": 0.2690519392490387, "loss_lvr": 0.48262834548950195, "loss_mode_switch": 0.0, "loss_total": 0.3173147737979889, "step": 1199 }, { "batch_size": 1, "epoch": 0.4796, "step": 1199, "tokens_per_device": 5096 }, { "epoch": 0.4796, "loss_ce": 0.0019990012515336275, "loss_lvr": 0.46037784218788147, "loss_mode_switch": 0.0, "loss_total": 0.048036787658929825, "step": 1199 }, { "epoch": 0.48, "grad_norm": 1.494361400604248, "learning_rate": 5.5655691309764225e-06, "loss": 0.3117, "step": 1200 }, { "batch_size": 1, "epoch": 0.48, "step": 1200, "tokens_per_device": 5537 }, { "epoch": 0.48, "loss_ce": 0.05716193839907646, "loss_lvr": 0.5626987814903259, "loss_mode_switch": 0.0, "loss_total": 0.11343181878328323, "step": 1200 }, { "batch_size": 4, "epoch": 0.48, "step": 1200, "tokens_per_device": 2844 }, { "epoch": 0.48, "loss_ce": 0.3761642575263977, "loss_lvr": 0.6408262252807617, "loss_mode_switch": 0.0, "loss_total": 0.4402468800544739, "step": 1200 }, { "batch_size": 4, "epoch": 0.48, "step": 1200, "tokens_per_device": 1260 }, { "epoch": 0.48, "loss_ce": 0.25367534160614014, "loss_lvr": 1.1481608152389526, "loss_mode_switch": 0.0, "loss_total": 0.36849141120910645, "step": 1200 }, { "batch_size": 4, "epoch": 0.48, "step": 1200, "tokens_per_device": 2224 }, { "epoch": 0.48, "loss_ce": 0.3650158941745758, "loss_lvr": 0.9177759289741516, "loss_mode_switch": 0.0, "loss_total": 0.45679348707199097, "step": 1200 }, { "batch_size": 4, "epoch": 0.48, "step": 1200, "tokens_per_device": 3992 }, { "epoch": 0.48, "loss_ce": 0.32479777932167053, "loss_lvr": 0.6799757480621338, "loss_mode_switch": 0.0, "loss_total": 0.3927953541278839, "step": 1200 }, { "batch_size": 1, "epoch": 0.48, "step": 1200, "tokens_per_device": 5234 }, { "epoch": 0.48, "loss_ce": 0.008521015755832195, "loss_lvr": 0.6784204244613647, "loss_mode_switch": 0.0, "loss_total": 0.07636305689811707, "step": 1200 }, { "batch_size": 4, "epoch": 0.48, "step": 1200, "tokens_per_device": 1656 }, { "epoch": 0.48, "loss_ce": 0.5380966663360596, "loss_lvr": 0.7909587025642395, "loss_mode_switch": 0.0, "loss_total": 0.6171925067901611, "step": 1200 }, { "batch_size": 4, "epoch": 0.48, "step": 1200, "tokens_per_device": 3728 }, { "epoch": 0.48, "loss_ce": 0.11816678196191788, "loss_lvr": 0.890373170375824, "loss_mode_switch": 0.0, "loss_total": 0.20720410346984863, "step": 1200 }, { "epoch": 0.4804, "grad_norm": 1.538864016532898, "learning_rate": 5.559132719975984e-06, "loss": 0.3661, "step": 1201 }, { "batch_size": 4, "epoch": 0.4804, "step": 1201, "tokens_per_device": 4300 }, { "epoch": 0.4804, "loss_ce": 0.31800875067710876, "loss_lvr": 0.9353832602500916, "loss_mode_switch": 0.0, "loss_total": 0.41154706478118896, "step": 1201 }, { "batch_size": 4, "epoch": 0.4804, "step": 1201, "tokens_per_device": 4772 }, { "epoch": 0.4804, "loss_ce": 0.03748202323913574, "loss_lvr": 0.8570608496665955, "loss_mode_switch": 0.0, "loss_total": 0.12318810820579529, "step": 1201 }, { "batch_size": 1, "epoch": 0.4804, "step": 1201, "tokens_per_device": 5127 }, { "epoch": 0.4804, "loss_ce": 0.0002797577762976289, "loss_lvr": 0.2675534188747406, "loss_mode_switch": 0.0, "loss_total": 0.027035100385546684, "step": 1201 }, { "batch_size": 4, "epoch": 0.4804, "step": 1201, "tokens_per_device": 2676 }, { "epoch": 0.4804, "loss_ce": 0.7229916453361511, "loss_lvr": 0.773643434047699, "loss_mode_switch": 0.0, "loss_total": 0.8003559708595276, "step": 1201 }, { "batch_size": 1, "epoch": 0.4804, "step": 1201, "tokens_per_device": 5208 }, { "epoch": 0.4804, "loss_ce": 0.04041290283203125, "loss_lvr": 0.5158168077468872, "loss_mode_switch": 0.0, "loss_total": 0.09199458360671997, "step": 1201 }, { "batch_size": 4, "epoch": 0.4804, "step": 1201, "tokens_per_device": 5892 }, { "epoch": 0.4804, "loss_ce": 0.1988123506307602, "loss_lvr": 0.7964844703674316, "loss_mode_switch": 0.0, "loss_total": 0.2784608006477356, "step": 1201 }, { "batch_size": 4, "epoch": 0.4804, "step": 1201, "tokens_per_device": 5436 }, { "epoch": 0.4804, "loss_ce": 0.1105087548494339, "loss_lvr": 0.8496024012565613, "loss_mode_switch": 0.0, "loss_total": 0.1954689919948578, "step": 1201 }, { "batch_size": 1, "epoch": 0.4804, "step": 1201, "tokens_per_device": 5104 }, { "epoch": 0.4804, "loss_ce": 0.00041060676448978484, "loss_lvr": 0.2805885076522827, "loss_mode_switch": 0.0, "loss_total": 0.02846945822238922, "step": 1201 }, { "epoch": 0.4808, "grad_norm": 1.2256383895874023, "learning_rate": 5.55269537056883e-06, "loss": 0.3022, "step": 1202 }, { "batch_size": 4, "epoch": 0.4808, "step": 1202, "tokens_per_device": 3792 }, { "epoch": 0.4808, "loss_ce": 0.06690744310617447, "loss_lvr": 2.3482351303100586, "loss_mode_switch": 0.0, "loss_total": 0.3017309606075287, "step": 1202 }, { "batch_size": 4, "epoch": 0.4808, "step": 1202, "tokens_per_device": 4216 }, { "epoch": 0.4808, "loss_ce": 0.44276803731918335, "loss_lvr": 0.9154427647590637, "loss_mode_switch": 0.0, "loss_total": 0.5343123078346252, "step": 1202 }, { "batch_size": 4, "epoch": 0.4808, "step": 1202, "tokens_per_device": 10832 }, { "epoch": 0.4808, "loss_ce": 0.10114631056785583, "loss_lvr": 0.6929726004600525, "loss_mode_switch": 0.0, "loss_total": 0.1704435646533966, "step": 1202 }, { "batch_size": 1, "epoch": 0.4808, "step": 1202, "tokens_per_device": 4875 }, { "epoch": 0.4808, "loss_ce": 0.16749340295791626, "loss_lvr": 0.46057558059692383, "loss_mode_switch": 0.0, "loss_total": 0.21355095505714417, "step": 1202 }, { "batch_size": 4, "epoch": 0.4808, "step": 1202, "tokens_per_device": 6976 }, { "epoch": 0.4808, "loss_ce": 0.2919171452522278, "loss_lvr": 1.3941478729248047, "loss_mode_switch": 0.0, "loss_total": 0.43133193254470825, "step": 1202 }, { "batch_size": 4, "epoch": 0.4808, "step": 1202, "tokens_per_device": 3880 }, { "epoch": 0.4808, "loss_ce": 0.3461609482765198, "loss_lvr": 0.8715179562568665, "loss_mode_switch": 0.0, "loss_total": 0.4333127439022064, "step": 1202 }, { "batch_size": 4, "epoch": 0.4808, "step": 1202, "tokens_per_device": 4380 }, { "epoch": 0.4808, "loss_ce": 0.0839986503124237, "loss_lvr": 0.9305952191352844, "loss_mode_switch": 0.0, "loss_total": 0.1770581752061844, "step": 1202 }, { "batch_size": 4, "epoch": 0.4808, "step": 1202, "tokens_per_device": 2648 }, { "epoch": 0.4808, "loss_ce": 0.371843159198761, "loss_lvr": 0.8662608861923218, "loss_mode_switch": 0.0, "loss_total": 0.4584692418575287, "step": 1202 }, { "epoch": 0.4812, "grad_norm": 1.3069195747375488, "learning_rate": 5.546257093558932e-06, "loss": 0.2745, "step": 1203 }, { "batch_size": 4, "epoch": 0.4812, "step": 1203, "tokens_per_device": 3784 }, { "epoch": 0.4812, "loss_ce": 0.5322430729866028, "loss_lvr": 1.000622034072876, "loss_mode_switch": 0.0, "loss_total": 0.6323052644729614, "step": 1203 }, { "batch_size": 1, "epoch": 0.4812, "step": 1203, "tokens_per_device": 5122 }, { "epoch": 0.4812, "loss_ce": 0.008023837581276894, "loss_lvr": 0.3338390588760376, "loss_mode_switch": 0.0, "loss_total": 0.041407741606235504, "step": 1203 }, { "batch_size": 4, "epoch": 0.4812, "step": 1203, "tokens_per_device": 3788 }, { "epoch": 0.4812, "loss_ce": 0.23594054579734802, "loss_lvr": 0.8805947303771973, "loss_mode_switch": 0.0, "loss_total": 0.3240000307559967, "step": 1203 }, { "batch_size": 4, "epoch": 0.4812, "step": 1203, "tokens_per_device": 4220 }, { "epoch": 0.4812, "loss_ce": 0.061320960521698, "loss_lvr": 0.9968780279159546, "loss_mode_switch": 0.0, "loss_total": 0.1610087752342224, "step": 1203 }, { "batch_size": 4, "epoch": 0.4812, "step": 1203, "tokens_per_device": 1580 }, { "epoch": 0.4812, "loss_ce": 0.14604482054710388, "loss_lvr": 0.9372565150260925, "loss_mode_switch": 0.0, "loss_total": 0.23977047204971313, "step": 1203 }, { "batch_size": 4, "epoch": 0.4812, "step": 1203, "tokens_per_device": 4456 }, { "epoch": 0.4812, "loss_ce": 0.5259408950805664, "loss_lvr": 0.8200443387031555, "loss_mode_switch": 0.0, "loss_total": 0.6079453229904175, "step": 1203 }, { "batch_size": 4, "epoch": 0.4812, "step": 1203, "tokens_per_device": 6404 }, { "epoch": 0.4812, "loss_ce": 0.2679668962955475, "loss_lvr": 0.829513430595398, "loss_mode_switch": 0.0, "loss_total": 0.3509182333946228, "step": 1203 }, { "batch_size": 1, "epoch": 0.4812, "step": 1203, "tokens_per_device": 4883 }, { "epoch": 0.4812, "loss_ce": 0.03826303780078888, "loss_lvr": 0.15102848410606384, "loss_mode_switch": 0.0, "loss_total": 0.053365886211395264, "step": 1203 }, { "epoch": 0.4816, "grad_norm": 1.0808677673339844, "learning_rate": 5.539817899751813e-06, "loss": 0.2343, "step": 1204 }, { "batch_size": 1, "epoch": 0.4816, "step": 1204, "tokens_per_device": 4893 }, { "epoch": 0.4816, "loss_ce": 0.15763972699642181, "loss_lvr": 0.3189876675605774, "loss_mode_switch": 0.0, "loss_total": 0.18953849375247955, "step": 1204 }, { "batch_size": 4, "epoch": 0.4816, "step": 1204, "tokens_per_device": 1792 }, { "epoch": 0.4816, "loss_ce": 0.34603530168533325, "loss_lvr": 1.1009732484817505, "loss_mode_switch": 0.0, "loss_total": 0.4561326205730438, "step": 1204 }, { "batch_size": 4, "epoch": 0.4816, "step": 1204, "tokens_per_device": 3740 }, { "epoch": 0.4816, "loss_ce": 0.3124517798423767, "loss_lvr": 0.7921848297119141, "loss_mode_switch": 0.0, "loss_total": 0.39167025685310364, "step": 1204 }, { "batch_size": 4, "epoch": 0.4816, "step": 1204, "tokens_per_device": 3788 }, { "epoch": 0.4816, "loss_ce": 0.1386379450559616, "loss_lvr": 1.052281141281128, "loss_mode_switch": 0.0, "loss_total": 0.24386605620384216, "step": 1204 }, { "batch_size": 4, "epoch": 0.4816, "step": 1204, "tokens_per_device": 4224 }, { "epoch": 0.4816, "loss_ce": 0.036684028804302216, "loss_lvr": 0.8083603382110596, "loss_mode_switch": 0.0, "loss_total": 0.11752006411552429, "step": 1204 }, { "batch_size": 4, "epoch": 0.4816, "step": 1204, "tokens_per_device": 4372 }, { "epoch": 0.4816, "loss_ce": 0.04649697244167328, "loss_lvr": 0.5759845972061157, "loss_mode_switch": 0.0, "loss_total": 0.10409542918205261, "step": 1204 }, { "batch_size": 1, "epoch": 0.4816, "step": 1204, "tokens_per_device": 5110 }, { "epoch": 0.4816, "loss_ce": 0.003994735423475504, "loss_lvr": 0.28282690048217773, "loss_mode_switch": 0.0, "loss_total": 0.0322774276137352, "step": 1204 }, { "batch_size": 4, "epoch": 0.4816, "step": 1204, "tokens_per_device": 1444 }, { "epoch": 0.4816, "loss_ce": 0.6195699572563171, "loss_lvr": 0.9751887917518616, "loss_mode_switch": 0.0, "loss_total": 0.7170888185501099, "step": 1204 }, { "epoch": 0.482, "grad_norm": 1.945726752281189, "learning_rate": 5.533377799954532e-06, "loss": 0.2894, "step": 1205 }, { "batch_size": 4, "epoch": 0.482, "step": 1205, "tokens_per_device": 1516 }, { "epoch": 0.482, "loss_ce": 0.10528045892715454, "loss_lvr": 1.340101957321167, "loss_mode_switch": 0.0, "loss_total": 0.23929065465927124, "step": 1205 }, { "batch_size": 4, "epoch": 0.482, "step": 1205, "tokens_per_device": 12472 }, { "epoch": 0.482, "loss_ce": 0.023255525156855583, "loss_lvr": 0.7773929834365845, "loss_mode_switch": 0.0, "loss_total": 0.10099482536315918, "step": 1205 }, { "batch_size": 4, "epoch": 0.482, "step": 1205, "tokens_per_device": 6268 }, { "epoch": 0.482, "loss_ce": 0.5860053300857544, "loss_lvr": 0.5842387676239014, "loss_mode_switch": 0.0, "loss_total": 0.6444292068481445, "step": 1205 }, { "batch_size": 1, "epoch": 0.482, "step": 1205, "tokens_per_device": 4972 }, { "epoch": 0.482, "loss_ce": 0.5487481951713562, "loss_lvr": 0.8295819759368896, "loss_mode_switch": 0.0, "loss_total": 0.6317064166069031, "step": 1205 }, { "batch_size": 1, "epoch": 0.482, "step": 1205, "tokens_per_device": 4911 }, { "epoch": 0.482, "loss_ce": 0.025184620171785355, "loss_lvr": 0.4449353814125061, "loss_mode_switch": 0.0, "loss_total": 0.0696781575679779, "step": 1205 }, { "batch_size": 4, "epoch": 0.482, "step": 1205, "tokens_per_device": 4336 }, { "epoch": 0.482, "loss_ce": 0.30377909541130066, "loss_lvr": 0.7862962484359741, "loss_mode_switch": 0.0, "loss_total": 0.3824087381362915, "step": 1205 }, { "batch_size": 1, "epoch": 0.482, "step": 1205, "tokens_per_device": 4889 }, { "epoch": 0.482, "loss_ce": 0.04294276982545853, "loss_lvr": 0.41032880544662476, "loss_mode_switch": 0.0, "loss_total": 0.083975650370121, "step": 1205 }, { "batch_size": 4, "epoch": 0.482, "step": 1205, "tokens_per_device": 5240 }, { "epoch": 0.482, "loss_ce": 0.000704774574842304, "loss_lvr": 0.494286447763443, "loss_mode_switch": 0.0, "loss_total": 0.050133418291807175, "step": 1205 }, { "epoch": 0.4824, "grad_norm": 1.21684730052948, "learning_rate": 5.526936804975681e-06, "loss": 0.3112, "step": 1206 }, { "batch_size": 4, "epoch": 0.4824, "step": 1206, "tokens_per_device": 4252 }, { "epoch": 0.4824, "loss_ce": 0.003148927353322506, "loss_lvr": 0.9792031645774841, "loss_mode_switch": 0.0, "loss_total": 0.10106924921274185, "step": 1206 }, { "batch_size": 4, "epoch": 0.4824, "step": 1206, "tokens_per_device": 5648 }, { "epoch": 0.4824, "loss_ce": 0.0612536184489727, "loss_lvr": 0.7524973750114441, "loss_mode_switch": 0.0, "loss_total": 0.13650335371494293, "step": 1206 }, { "batch_size": 4, "epoch": 0.4824, "step": 1206, "tokens_per_device": 3276 }, { "epoch": 0.4824, "loss_ce": 0.11373898386955261, "loss_lvr": 0.7971468567848206, "loss_mode_switch": 0.0, "loss_total": 0.19345366954803467, "step": 1206 }, { "batch_size": 1, "epoch": 0.4824, "step": 1206, "tokens_per_device": 4865 }, { "epoch": 0.4824, "loss_ce": 0.03261213377118111, "loss_lvr": 0.3316959738731384, "loss_mode_switch": 0.0, "loss_total": 0.06578172743320465, "step": 1206 }, { "batch_size": 4, "epoch": 0.4824, "step": 1206, "tokens_per_device": 7136 }, { "epoch": 0.4824, "loss_ce": 0.1655379682779312, "loss_lvr": 0.655580997467041, "loss_mode_switch": 0.0, "loss_total": 0.2310960590839386, "step": 1206 }, { "batch_size": 4, "epoch": 0.4824, "step": 1206, "tokens_per_device": 7804 }, { "epoch": 0.4824, "loss_ce": 0.31272417306900024, "loss_lvr": 0.6136314272880554, "loss_mode_switch": 0.0, "loss_total": 0.37408730387687683, "step": 1206 }, { "batch_size": 4, "epoch": 0.4824, "step": 1206, "tokens_per_device": 4860 }, { "epoch": 0.4824, "loss_ce": 0.07364822179079056, "loss_lvr": 0.8558064103126526, "loss_mode_switch": 0.0, "loss_total": 0.1592288613319397, "step": 1206 }, { "batch_size": 4, "epoch": 0.4824, "step": 1206, "tokens_per_device": 1772 }, { "epoch": 0.4824, "loss_ce": 0.38981202244758606, "loss_lvr": 1.1480435132980347, "loss_mode_switch": 0.0, "loss_total": 0.504616379737854, "step": 1206 }, { "epoch": 0.4828, "grad_norm": 1.2485082149505615, "learning_rate": 5.520494925625339e-06, "loss": 0.283, "step": 1207 }, { "batch_size": 4, "epoch": 0.4828, "step": 1207, "tokens_per_device": 4228 }, { "epoch": 0.4828, "loss_ce": 0.2756057381629944, "loss_lvr": 0.9836261868476868, "loss_mode_switch": 0.0, "loss_total": 0.37396836280822754, "step": 1207 }, { "batch_size": 4, "epoch": 0.4828, "step": 1207, "tokens_per_device": 2576 }, { "epoch": 0.4828, "loss_ce": 0.24723076820373535, "loss_lvr": 0.9381872415542603, "loss_mode_switch": 0.0, "loss_total": 0.3410494923591614, "step": 1207 }, { "batch_size": 4, "epoch": 0.4828, "step": 1207, "tokens_per_device": 4196 }, { "epoch": 0.4828, "loss_ce": 0.27635711431503296, "loss_lvr": 0.9258841872215271, "loss_mode_switch": 0.0, "loss_total": 0.36894553899765015, "step": 1207 }, { "batch_size": 4, "epoch": 0.4828, "step": 1207, "tokens_per_device": 5220 }, { "epoch": 0.4828, "loss_ce": 0.0005624754121527076, "loss_lvr": 0.6459743976593018, "loss_mode_switch": 0.0, "loss_total": 0.06515991687774658, "step": 1207 }, { "batch_size": 4, "epoch": 0.4828, "step": 1207, "tokens_per_device": 5756 }, { "epoch": 0.4828, "loss_ce": 0.49835920333862305, "loss_lvr": 1.0910872220993042, "loss_mode_switch": 0.0, "loss_total": 0.6074679493904114, "step": 1207 }, { "batch_size": 4, "epoch": 0.4828, "step": 1207, "tokens_per_device": 3076 }, { "epoch": 0.4828, "loss_ce": 0.8378692269325256, "loss_lvr": 1.0265464782714844, "loss_mode_switch": 0.0, "loss_total": 0.9405238628387451, "step": 1207 }, { "batch_size": 4, "epoch": 0.4828, "step": 1207, "tokens_per_device": 5872 }, { "epoch": 0.4828, "loss_ce": 0.6138788461685181, "loss_lvr": 0.551927924156189, "loss_mode_switch": 0.0, "loss_total": 0.669071614742279, "step": 1207 }, { "batch_size": 4, "epoch": 0.4828, "step": 1207, "tokens_per_device": 2936 }, { "epoch": 0.4828, "loss_ce": 0.5415364503860474, "loss_lvr": 0.9781420826911926, "loss_mode_switch": 0.0, "loss_total": 0.6393506526947021, "step": 1207 }, { "epoch": 0.4832, "grad_norm": 1.166178584098816, "learning_rate": 5.5140521727150805e-06, "loss": 0.3006, "step": 1208 }, { "batch_size": 1, "epoch": 0.4832, "step": 1208, "tokens_per_device": 5135 }, { "epoch": 0.4832, "loss_ce": 0.0013186249416321516, "loss_lvr": 0.5165455937385559, "loss_mode_switch": 0.0, "loss_total": 0.0529731847345829, "step": 1208 }, { "batch_size": 4, "epoch": 0.4832, "step": 1208, "tokens_per_device": 4344 }, { "epoch": 0.4832, "loss_ce": 0.4181554615497589, "loss_lvr": 0.6624554395675659, "loss_mode_switch": 0.0, "loss_total": 0.48440101742744446, "step": 1208 }, { "batch_size": 4, "epoch": 0.4832, "step": 1208, "tokens_per_device": 8920 }, { "epoch": 0.4832, "loss_ce": 0.5242434144020081, "loss_lvr": 0.619920015335083, "loss_mode_switch": 0.0, "loss_total": 0.5862354040145874, "step": 1208 }, { "batch_size": 1, "epoch": 0.4832, "step": 1208, "tokens_per_device": 5528 }, { "epoch": 0.4832, "loss_ce": 0.0019416527356952429, "loss_lvr": 0.32059380412101746, "loss_mode_switch": 0.0, "loss_total": 0.03400103375315666, "step": 1208 }, { "batch_size": 4, "epoch": 0.4832, "step": 1208, "tokens_per_device": 4916 }, { "epoch": 0.4832, "loss_ce": 0.12899117171764374, "loss_lvr": 0.776145339012146, "loss_mode_switch": 0.0, "loss_total": 0.2066057026386261, "step": 1208 }, { "batch_size": 1, "epoch": 0.4832, "step": 1208, "tokens_per_device": 5105 }, { "epoch": 0.4832, "loss_ce": 0.002445363439619541, "loss_lvr": 0.45600518584251404, "loss_mode_switch": 0.0, "loss_total": 0.04804588109254837, "step": 1208 }, { "batch_size": 4, "epoch": 0.4832, "step": 1208, "tokens_per_device": 3848 }, { "epoch": 0.4832, "loss_ce": 0.3409148156642914, "loss_lvr": 1.0207349061965942, "loss_mode_switch": 0.0, "loss_total": 0.4429883062839508, "step": 1208 }, { "batch_size": 4, "epoch": 0.4832, "step": 1208, "tokens_per_device": 4176 }, { "epoch": 0.4832, "loss_ce": 0.4476666748523712, "loss_lvr": 0.8487103581428528, "loss_mode_switch": 0.0, "loss_total": 0.5325376987457275, "step": 1208 }, { "epoch": 0.4836, "grad_norm": 1.3676753044128418, "learning_rate": 5.507608557057942e-06, "loss": 0.3119, "step": 1209 }, { "batch_size": 4, "epoch": 0.4836, "step": 1209, "tokens_per_device": 4064 }, { "epoch": 0.4836, "loss_ce": 0.8079642057418823, "loss_lvr": 0.7442180514335632, "loss_mode_switch": 0.0, "loss_total": 0.8823860287666321, "step": 1209 }, { "batch_size": 4, "epoch": 0.4836, "step": 1209, "tokens_per_device": 4916 }, { "epoch": 0.4836, "loss_ce": 0.3272174000740051, "loss_lvr": 0.8843085765838623, "loss_mode_switch": 0.0, "loss_total": 0.4156482517719269, "step": 1209 }, { "batch_size": 1, "epoch": 0.4836, "step": 1209, "tokens_per_device": 4914 }, { "epoch": 0.4836, "loss_ce": 0.03943927213549614, "loss_lvr": 0.434887170791626, "loss_mode_switch": 0.0, "loss_total": 0.08292798697948456, "step": 1209 }, { "batch_size": 4, "epoch": 0.4836, "step": 1209, "tokens_per_device": 4112 }, { "epoch": 0.4836, "loss_ce": 0.5194130539894104, "loss_lvr": 0.6856966018676758, "loss_mode_switch": 0.0, "loss_total": 0.587982714176178, "step": 1209 }, { "batch_size": 1, "epoch": 0.4836, "step": 1209, "tokens_per_device": 5072 }, { "epoch": 0.4836, "loss_ce": 0.06009689345955849, "loss_lvr": 0.31570833921432495, "loss_mode_switch": 0.0, "loss_total": 0.09166772663593292, "step": 1209 }, { "batch_size": 4, "epoch": 0.4836, "step": 1209, "tokens_per_device": 5400 }, { "epoch": 0.4836, "loss_ce": 0.15418292582035065, "loss_lvr": 0.900722861289978, "loss_mode_switch": 0.0, "loss_total": 0.2442552149295807, "step": 1209 }, { "batch_size": 1, "epoch": 0.4836, "step": 1209, "tokens_per_device": 4881 }, { "epoch": 0.4836, "loss_ce": 0.22231966257095337, "loss_lvr": 0.9992323517799377, "loss_mode_switch": 0.0, "loss_total": 0.3222429156303406, "step": 1209 }, { "batch_size": 4, "epoch": 0.4836, "step": 1209, "tokens_per_device": 4580 }, { "epoch": 0.4836, "loss_ce": 0.4469432532787323, "loss_lvr": 0.8074215650558472, "loss_mode_switch": 0.0, "loss_total": 0.5276854038238525, "step": 1209 }, { "epoch": 0.484, "grad_norm": 1.2281943559646606, "learning_rate": 5.501164089468406e-06, "loss": 0.2881, "step": 1210 }, { "batch_size": 4, "epoch": 0.484, "step": 1210, "tokens_per_device": 3824 }, { "epoch": 0.484, "loss_ce": 0.03848382830619812, "loss_lvr": 0.9562129974365234, "loss_mode_switch": 0.0, "loss_total": 0.1341051310300827, "step": 1210 }, { "batch_size": 4, "epoch": 0.484, "step": 1210, "tokens_per_device": 4916 }, { "epoch": 0.484, "loss_ce": 0.061825040727853775, "loss_lvr": 0.8892056941986084, "loss_mode_switch": 0.0, "loss_total": 0.15074561536312103, "step": 1210 }, { "batch_size": 1, "epoch": 0.484, "step": 1210, "tokens_per_device": 4792 }, { "epoch": 0.484, "loss_ce": 0.00206826813519001, "loss_lvr": 0.28674080967903137, "loss_mode_switch": 0.0, "loss_total": 0.030742349103093147, "step": 1210 }, { "batch_size": 1, "epoch": 0.484, "step": 1210, "tokens_per_device": 5095 }, { "epoch": 0.484, "loss_ce": 0.13677754998207092, "loss_lvr": 0.26114118099212646, "loss_mode_switch": 0.0, "loss_total": 0.1628916710615158, "step": 1210 }, { "batch_size": 4, "epoch": 0.484, "step": 1210, "tokens_per_device": 1388 }, { "epoch": 0.484, "loss_ce": 0.3365703523159027, "loss_lvr": 0.9196464419364929, "loss_mode_switch": 0.0, "loss_total": 0.42853498458862305, "step": 1210 }, { "batch_size": 4, "epoch": 0.484, "step": 1210, "tokens_per_device": 3408 }, { "epoch": 0.484, "loss_ce": 0.3142094612121582, "loss_lvr": 0.9847537279129028, "loss_mode_switch": 0.0, "loss_total": 0.412684828042984, "step": 1210 }, { "batch_size": 4, "epoch": 0.484, "step": 1210, "tokens_per_device": 4476 }, { "epoch": 0.484, "loss_ce": 0.3184642493724823, "loss_lvr": 0.7342334389686584, "loss_mode_switch": 0.0, "loss_total": 0.3918876051902771, "step": 1210 }, { "batch_size": 1, "epoch": 0.484, "step": 1210, "tokens_per_device": 5249 }, { "epoch": 0.484, "loss_ce": 0.0025605016853660345, "loss_lvr": 0.6806296706199646, "loss_mode_switch": 0.0, "loss_total": 0.0706234723329544, "step": 1210 }, { "epoch": 0.4844, "grad_norm": 1.2022035121917725, "learning_rate": 5.494718780762388e-06, "loss": 0.2809, "step": 1211 }, { "batch_size": 1, "epoch": 0.4844, "step": 1211, "tokens_per_device": 4948 }, { "epoch": 0.4844, "loss_ce": 0.06593629717826843, "loss_lvr": 0.2783903181552887, "loss_mode_switch": 0.0, "loss_total": 0.09377533197402954, "step": 1211 }, { "batch_size": 4, "epoch": 0.4844, "step": 1211, "tokens_per_device": 4228 }, { "epoch": 0.4844, "loss_ce": 0.10860691964626312, "loss_lvr": 0.7482520937919617, "loss_mode_switch": 0.0, "loss_total": 0.18343213200569153, "step": 1211 }, { "batch_size": 4, "epoch": 0.4844, "step": 1211, "tokens_per_device": 2612 }, { "epoch": 0.4844, "loss_ce": 0.21316924691200256, "loss_lvr": 0.9435380101203918, "loss_mode_switch": 0.0, "loss_total": 0.30752304196357727, "step": 1211 }, { "batch_size": 1, "epoch": 0.4844, "step": 1211, "tokens_per_device": 5131 }, { "epoch": 0.4844, "loss_ce": 0.029686959460377693, "loss_lvr": 0.3439614474773407, "loss_mode_switch": 0.0, "loss_total": 0.06408310681581497, "step": 1211 }, { "batch_size": 4, "epoch": 0.4844, "step": 1211, "tokens_per_device": 4284 }, { "epoch": 0.4844, "loss_ce": 0.7242655158042908, "loss_lvr": 1.3127069473266602, "loss_mode_switch": 0.0, "loss_total": 0.8555362224578857, "step": 1211 }, { "batch_size": 1, "epoch": 0.4844, "step": 1211, "tokens_per_device": 5188 }, { "epoch": 0.4844, "loss_ce": 0.07952824980020523, "loss_lvr": 0.45666009187698364, "loss_mode_switch": 0.0, "loss_total": 0.125194251537323, "step": 1211 }, { "batch_size": 1, "epoch": 0.4844, "step": 1211, "tokens_per_device": 4849 }, { "epoch": 0.4844, "loss_ce": 0.11024738103151321, "loss_lvr": 0.34526756405830383, "loss_mode_switch": 0.0, "loss_total": 0.14477413892745972, "step": 1211 }, { "batch_size": 4, "epoch": 0.4844, "step": 1211, "tokens_per_device": 5436 }, { "epoch": 0.4844, "loss_ce": 0.3180347979068756, "loss_lvr": 0.8029740452766418, "loss_mode_switch": 0.0, "loss_total": 0.3983322083950043, "step": 1211 }, { "epoch": 0.4848, "grad_norm": 1.2746702432632446, "learning_rate": 5.488272641757215e-06, "loss": 0.2608, "step": 1212 }, { "batch_size": 1, "epoch": 0.4848, "step": 1212, "tokens_per_device": 5181 }, { "epoch": 0.4848, "loss_ce": 0.03203340992331505, "loss_lvr": 0.2957570552825928, "loss_mode_switch": 0.0, "loss_total": 0.061609115451574326, "step": 1212 }, { "batch_size": 1, "epoch": 0.4848, "step": 1212, "tokens_per_device": 4860 }, { "epoch": 0.4848, "loss_ce": 0.006060650106519461, "loss_lvr": 0.35859644412994385, "loss_mode_switch": 0.0, "loss_total": 0.041920293122529984, "step": 1212 }, { "batch_size": 1, "epoch": 0.4848, "step": 1212, "tokens_per_device": 5163 }, { "epoch": 0.4848, "loss_ce": 0.014181794598698616, "loss_lvr": 0.3995093107223511, "loss_mode_switch": 0.0, "loss_total": 0.05413272976875305, "step": 1212 }, { "batch_size": 4, "epoch": 0.4848, "step": 1212, "tokens_per_device": 3812 }, { "epoch": 0.4848, "loss_ce": 0.17574593424797058, "loss_lvr": 1.0673553943634033, "loss_mode_switch": 0.0, "loss_total": 0.28248149156570435, "step": 1212 }, { "batch_size": 1, "epoch": 0.4848, "step": 1212, "tokens_per_device": 4254 }, { "epoch": 0.4848, "loss_ce": 0.10557752847671509, "loss_lvr": 0.1953258067369461, "loss_mode_switch": 0.0, "loss_total": 0.12511010468006134, "step": 1212 }, { "batch_size": 4, "epoch": 0.4848, "step": 1212, "tokens_per_device": 4300 }, { "epoch": 0.4848, "loss_ce": 0.17658892273902893, "loss_lvr": 0.7172032594680786, "loss_mode_switch": 0.0, "loss_total": 0.24830925464630127, "step": 1212 }, { "batch_size": 4, "epoch": 0.4848, "step": 1212, "tokens_per_device": 3044 }, { "epoch": 0.4848, "loss_ce": 0.06497389823198318, "loss_lvr": 0.5929385423660278, "loss_mode_switch": 0.0, "loss_total": 0.12426775693893433, "step": 1212 }, { "batch_size": 4, "epoch": 0.4848, "step": 1212, "tokens_per_device": 1464 }, { "epoch": 0.4848, "loss_ce": 0.13188958168029785, "loss_lvr": 0.7925282716751099, "loss_mode_switch": 0.0, "loss_total": 0.2111424207687378, "step": 1212 }, { "epoch": 0.4852, "grad_norm": 1.2718946933746338, "learning_rate": 5.481825683271607e-06, "loss": 0.26, "step": 1213 }, { "batch_size": 4, "epoch": 0.4852, "step": 1213, "tokens_per_device": 4592 }, { "epoch": 0.4852, "loss_ce": 0.10818956047296524, "loss_lvr": 0.707568347454071, "loss_mode_switch": 0.0, "loss_total": 0.17894640564918518, "step": 1213 }, { "batch_size": 4, "epoch": 0.4852, "step": 1213, "tokens_per_device": 3864 }, { "epoch": 0.4852, "loss_ce": 0.3773278295993805, "loss_lvr": 0.993746280670166, "loss_mode_switch": 0.0, "loss_total": 0.4767024517059326, "step": 1213 }, { "batch_size": 4, "epoch": 0.4852, "step": 1213, "tokens_per_device": 7524 }, { "epoch": 0.4852, "loss_ce": 0.02821110561490059, "loss_lvr": 0.882719099521637, "loss_mode_switch": 0.0, "loss_total": 0.11648301780223846, "step": 1213 }, { "batch_size": 4, "epoch": 0.4852, "step": 1213, "tokens_per_device": 4132 }, { "epoch": 0.4852, "loss_ce": 0.6718023419380188, "loss_lvr": 0.7610325217247009, "loss_mode_switch": 0.0, "loss_total": 0.7479056119918823, "step": 1213 }, { "batch_size": 4, "epoch": 0.4852, "step": 1213, "tokens_per_device": 1308 }, { "epoch": 0.4852, "loss_ce": 0.05210570991039276, "loss_lvr": 1.091505527496338, "loss_mode_switch": 0.0, "loss_total": 0.16125625371932983, "step": 1213 }, { "batch_size": 1, "epoch": 0.4852, "step": 1213, "tokens_per_device": 4877 }, { "epoch": 0.4852, "loss_ce": 0.000123677818919532, "loss_lvr": 0.20812009274959564, "loss_mode_switch": 0.0, "loss_total": 0.02093568816781044, "step": 1213 }, { "batch_size": 4, "epoch": 0.4852, "step": 1213, "tokens_per_device": 4272 }, { "epoch": 0.4852, "loss_ce": 0.0023539464455097914, "loss_lvr": 0.9347782731056213, "loss_mode_switch": 0.0, "loss_total": 0.09583177417516708, "step": 1213 }, { "batch_size": 1, "epoch": 0.4852, "step": 1213, "tokens_per_device": 5058 }, { "epoch": 0.4852, "loss_ce": 0.240955650806427, "loss_lvr": 0.4383539855480194, "loss_mode_switch": 0.0, "loss_total": 0.2847910523414612, "step": 1213 }, { "epoch": 0.4856, "grad_norm": 1.2616429328918457, "learning_rate": 5.475377916125655e-06, "loss": 0.2562, "step": 1214 }, { "batch_size": 1, "epoch": 0.4856, "step": 1214, "tokens_per_device": 5557 }, { "epoch": 0.4856, "loss_ce": 0.06579547375440598, "loss_lvr": 0.7663974761962891, "loss_mode_switch": 0.0, "loss_total": 0.142435222864151, "step": 1214 }, { "batch_size": 1, "epoch": 0.4856, "step": 1214, "tokens_per_device": 5235 }, { "epoch": 0.4856, "loss_ce": 0.05829795077443123, "loss_lvr": 0.2722228169441223, "loss_mode_switch": 0.0, "loss_total": 0.08552023023366928, "step": 1214 }, { "batch_size": 4, "epoch": 0.4856, "step": 1214, "tokens_per_device": 4832 }, { "epoch": 0.4856, "loss_ce": 0.00751511799171567, "loss_lvr": 1.043662190437317, "loss_mode_switch": 0.0, "loss_total": 0.11188133805990219, "step": 1214 }, { "batch_size": 4, "epoch": 0.4856, "step": 1214, "tokens_per_device": 5648 }, { "epoch": 0.4856, "loss_ce": 0.2905031144618988, "loss_lvr": 0.944116473197937, "loss_mode_switch": 0.0, "loss_total": 0.384914755821228, "step": 1214 }, { "batch_size": 1, "epoch": 0.4856, "step": 1214, "tokens_per_device": 4867 }, { "epoch": 0.4856, "loss_ce": 0.30989372730255127, "loss_lvr": 0.3554816246032715, "loss_mode_switch": 0.0, "loss_total": 0.34544187784194946, "step": 1214 }, { "batch_size": 4, "epoch": 0.4856, "step": 1214, "tokens_per_device": 5652 }, { "epoch": 0.4856, "loss_ce": 0.18085376918315887, "loss_lvr": 0.7393320798873901, "loss_mode_switch": 0.0, "loss_total": 0.25478696823120117, "step": 1214 }, { "batch_size": 4, "epoch": 0.4856, "step": 1214, "tokens_per_device": 4632 }, { "epoch": 0.4856, "loss_ce": 0.03863298520445824, "loss_lvr": 0.6971269249916077, "loss_mode_switch": 0.0, "loss_total": 0.10834567248821259, "step": 1214 }, { "batch_size": 4, "epoch": 0.4856, "step": 1214, "tokens_per_device": 4272 }, { "epoch": 0.4856, "loss_ce": 0.09983054548501968, "loss_lvr": 0.9625726938247681, "loss_mode_switch": 0.0, "loss_total": 0.1960878074169159, "step": 1214 }, { "epoch": 0.486, "grad_norm": 1.2647532224655151, "learning_rate": 5.4689293511408155e-06, "loss": 0.2734, "step": 1215 }, { "batch_size": 1, "epoch": 0.486, "step": 1215, "tokens_per_device": 6486 }, { "epoch": 0.486, "loss_ce": 0.1253049522638321, "loss_lvr": 0.28052210807800293, "loss_mode_switch": 0.0, "loss_total": 0.15335716307163239, "step": 1215 }, { "batch_size": 4, "epoch": 0.486, "step": 1215, "tokens_per_device": 4304 }, { "epoch": 0.486, "loss_ce": 0.352202445268631, "loss_lvr": 0.9488434195518494, "loss_mode_switch": 0.0, "loss_total": 0.44708678126335144, "step": 1215 }, { "batch_size": 4, "epoch": 0.486, "step": 1215, "tokens_per_device": 5132 }, { "epoch": 0.486, "loss_ce": 0.25492632389068604, "loss_lvr": 1.2626539468765259, "loss_mode_switch": 0.0, "loss_total": 0.3811917304992676, "step": 1215 }, { "batch_size": 4, "epoch": 0.486, "step": 1215, "tokens_per_device": 7136 }, { "epoch": 0.486, "loss_ce": 0.02856459654867649, "loss_lvr": 0.6879178285598755, "loss_mode_switch": 0.0, "loss_total": 0.09735637903213501, "step": 1215 }, { "batch_size": 4, "epoch": 0.486, "step": 1215, "tokens_per_device": 6264 }, { "epoch": 0.486, "loss_ce": 0.2765761911869049, "loss_lvr": 0.7647676467895508, "loss_mode_switch": 0.0, "loss_total": 0.3530529737472534, "step": 1215 }, { "batch_size": 1, "epoch": 0.486, "step": 1215, "tokens_per_device": 5175 }, { "epoch": 0.486, "loss_ce": 0.26249662041664124, "loss_lvr": 0.4036255478858948, "loss_mode_switch": 0.0, "loss_total": 0.30285918712615967, "step": 1215 }, { "batch_size": 1, "epoch": 0.486, "step": 1215, "tokens_per_device": 5113 }, { "epoch": 0.486, "loss_ce": 0.00029776155133731663, "loss_lvr": 0.2666768729686737, "loss_mode_switch": 0.0, "loss_total": 0.02696545049548149, "step": 1215 }, { "batch_size": 1, "epoch": 0.486, "step": 1215, "tokens_per_device": 5132 }, { "epoch": 0.486, "loss_ce": 0.00041665451135486364, "loss_lvr": 0.27488863468170166, "loss_mode_switch": 0.0, "loss_total": 0.02790551818907261, "step": 1215 }, { "epoch": 0.4864, "grad_norm": 1.5039256811141968, "learning_rate": 5.462479999139877e-06, "loss": 0.2886, "step": 1216 }, { "batch_size": 1, "epoch": 0.4864, "step": 1216, "tokens_per_device": 4291 }, { "epoch": 0.4864, "loss_ce": 0.008294863626360893, "loss_lvr": 0.4648468792438507, "loss_mode_switch": 0.0, "loss_total": 0.054779551923274994, "step": 1216 }, { "batch_size": 4, "epoch": 0.4864, "step": 1216, "tokens_per_device": 3812 }, { "epoch": 0.4864, "loss_ce": 0.04764951393008232, "loss_lvr": 1.0108164548873901, "loss_mode_switch": 0.0, "loss_total": 0.14873115718364716, "step": 1216 }, { "batch_size": 1, "epoch": 0.4864, "step": 1216, "tokens_per_device": 4892 }, { "epoch": 0.4864, "loss_ce": 0.08237681537866592, "loss_lvr": 0.3191238045692444, "loss_mode_switch": 0.0, "loss_total": 0.11428919434547424, "step": 1216 }, { "batch_size": 4, "epoch": 0.4864, "step": 1216, "tokens_per_device": 3772 }, { "epoch": 0.4864, "loss_ce": 0.09236456453800201, "loss_lvr": 0.7323430180549622, "loss_mode_switch": 0.0, "loss_total": 0.16559886932373047, "step": 1216 }, { "batch_size": 1, "epoch": 0.4864, "step": 1216, "tokens_per_device": 5038 }, { "epoch": 0.4864, "loss_ce": 0.001248657237738371, "loss_lvr": 0.4496476352214813, "loss_mode_switch": 0.0, "loss_total": 0.04621342197060585, "step": 1216 }, { "batch_size": 4, "epoch": 0.4864, "step": 1216, "tokens_per_device": 1928 }, { "epoch": 0.4864, "loss_ce": 0.11236357688903809, "loss_lvr": 1.0354970693588257, "loss_mode_switch": 0.0, "loss_total": 0.2159132957458496, "step": 1216 }, { "batch_size": 4, "epoch": 0.4864, "step": 1216, "tokens_per_device": 4220 }, { "epoch": 0.4864, "loss_ce": 0.22374854981899261, "loss_lvr": 1.3328168392181396, "loss_mode_switch": 0.0, "loss_total": 0.3570302426815033, "step": 1216 }, { "batch_size": 4, "epoch": 0.4864, "step": 1216, "tokens_per_device": 4892 }, { "epoch": 0.4864, "loss_ce": 0.1357710212469101, "loss_lvr": 0.9989179372787476, "loss_mode_switch": 0.0, "loss_total": 0.2356628179550171, "step": 1216 }, { "epoch": 0.4868, "grad_norm": 1.3393027782440186, "learning_rate": 5.456029870946954e-06, "loss": 0.266, "step": 1217 }, { "batch_size": 4, "epoch": 0.4868, "step": 1217, "tokens_per_device": 3772 }, { "epoch": 0.4868, "loss_ce": 0.3709244132041931, "loss_lvr": 1.0516767501831055, "loss_mode_switch": 0.0, "loss_total": 0.4760921001434326, "step": 1217 }, { "batch_size": 4, "epoch": 0.4868, "step": 1217, "tokens_per_device": 5704 }, { "epoch": 0.4868, "loss_ce": 0.4098018705844879, "loss_lvr": 0.7043951749801636, "loss_mode_switch": 0.0, "loss_total": 0.4802413880825043, "step": 1217 }, { "batch_size": 1, "epoch": 0.4868, "step": 1217, "tokens_per_device": 4771 }, { "epoch": 0.4868, "loss_ce": 0.010325348004698753, "loss_lvr": 0.23152349889278412, "loss_mode_switch": 0.0, "loss_total": 0.033477697521448135, "step": 1217 }, { "batch_size": 1, "epoch": 0.4868, "step": 1217, "tokens_per_device": 4878 }, { "epoch": 0.4868, "loss_ce": 0.02417816035449505, "loss_lvr": 0.9422639608383179, "loss_mode_switch": 0.0, "loss_total": 0.1184045597910881, "step": 1217 }, { "batch_size": 4, "epoch": 0.4868, "step": 1217, "tokens_per_device": 2644 }, { "epoch": 0.4868, "loss_ce": 0.08993656933307648, "loss_lvr": 0.9300679564476013, "loss_mode_switch": 0.0, "loss_total": 0.18294337391853333, "step": 1217 }, { "batch_size": 4, "epoch": 0.4868, "step": 1217, "tokens_per_device": 2848 }, { "epoch": 0.4868, "loss_ce": 0.1836284101009369, "loss_lvr": 0.7367748618125916, "loss_mode_switch": 0.0, "loss_total": 0.25730589032173157, "step": 1217 }, { "batch_size": 4, "epoch": 0.4868, "step": 1217, "tokens_per_device": 5504 }, { "epoch": 0.4868, "loss_ce": 0.04433393105864525, "loss_lvr": 0.8926446437835693, "loss_mode_switch": 0.0, "loss_total": 0.13359840214252472, "step": 1217 }, { "batch_size": 1, "epoch": 0.4868, "step": 1217, "tokens_per_device": 5141 }, { "epoch": 0.4868, "loss_ce": 0.062334898859262466, "loss_lvr": 0.2478463053703308, "loss_mode_switch": 0.0, "loss_total": 0.08711952716112137, "step": 1217 }, { "epoch": 0.4872, "grad_norm": 1.3662502765655518, "learning_rate": 5.44957897738746e-06, "loss": 0.2696, "step": 1218 }, { "batch_size": 4, "epoch": 0.4872, "step": 1218, "tokens_per_device": 5828 }, { "epoch": 0.4872, "loss_ce": 0.20520149171352386, "loss_lvr": 0.8045624494552612, "loss_mode_switch": 0.0, "loss_total": 0.28565773367881775, "step": 1218 }, { "batch_size": 4, "epoch": 0.4872, "step": 1218, "tokens_per_device": 1148 }, { "epoch": 0.4872, "loss_ce": 0.06053611636161804, "loss_lvr": 1.1538463830947876, "loss_mode_switch": 0.0, "loss_total": 0.1759207546710968, "step": 1218 }, { "batch_size": 4, "epoch": 0.4872, "step": 1218, "tokens_per_device": 1772 }, { "epoch": 0.4872, "loss_ce": 0.5335143208503723, "loss_lvr": 1.0764479637145996, "loss_mode_switch": 0.0, "loss_total": 0.6411591172218323, "step": 1218 }, { "batch_size": 1, "epoch": 0.4872, "step": 1218, "tokens_per_device": 5108 }, { "epoch": 0.4872, "loss_ce": 0.16147461533546448, "loss_lvr": 0.5071521997451782, "loss_mode_switch": 0.0, "loss_total": 0.21218983829021454, "step": 1218 }, { "batch_size": 1, "epoch": 0.4872, "step": 1218, "tokens_per_device": 5751 }, { "epoch": 0.4872, "loss_ce": 0.08639678359031677, "loss_lvr": 0.43377959728240967, "loss_mode_switch": 0.0, "loss_total": 0.12977474927902222, "step": 1218 }, { "batch_size": 4, "epoch": 0.4872, "step": 1218, "tokens_per_device": 3820 }, { "epoch": 0.4872, "loss_ce": 0.21572889387607574, "loss_lvr": 0.8362657427787781, "loss_mode_switch": 0.0, "loss_total": 0.29935547709465027, "step": 1218 }, { "batch_size": 1, "epoch": 0.4872, "step": 1218, "tokens_per_device": 4889 }, { "epoch": 0.4872, "loss_ce": 0.033455125987529755, "loss_lvr": 0.6222826838493347, "loss_mode_switch": 0.0, "loss_total": 0.09568339586257935, "step": 1218 }, { "batch_size": 4, "epoch": 0.4872, "step": 1218, "tokens_per_device": 6084 }, { "epoch": 0.4872, "loss_ce": 0.32208871841430664, "loss_lvr": 0.8334051966667175, "loss_mode_switch": 0.0, "loss_total": 0.40542924404144287, "step": 1218 }, { "epoch": 0.4876, "grad_norm": 1.5107967853546143, "learning_rate": 5.443127329288093e-06, "loss": 0.336, "step": 1219 }, { "batch_size": 1, "epoch": 0.4876, "step": 1219, "tokens_per_device": 5123 }, { "epoch": 0.4876, "loss_ce": 0.004367683082818985, "loss_lvr": 0.19044262170791626, "loss_mode_switch": 0.0, "loss_total": 0.0234119463711977, "step": 1219 }, { "batch_size": 4, "epoch": 0.4876, "step": 1219, "tokens_per_device": 2560 }, { "epoch": 0.4876, "loss_ce": 0.022130144760012627, "loss_lvr": 0.9346187710762024, "loss_mode_switch": 0.0, "loss_total": 0.11559202522039413, "step": 1219 }, { "batch_size": 4, "epoch": 0.4876, "step": 1219, "tokens_per_device": 1368 }, { "epoch": 0.4876, "loss_ce": 0.5322479605674744, "loss_lvr": 0.9858341217041016, "loss_mode_switch": 0.0, "loss_total": 0.6308313608169556, "step": 1219 }, { "batch_size": 4, "epoch": 0.4876, "step": 1219, "tokens_per_device": 2956 }, { "epoch": 0.4876, "loss_ce": 0.021016540005803108, "loss_lvr": 0.5181602239608765, "loss_mode_switch": 0.0, "loss_total": 0.07283256202936172, "step": 1219 }, { "batch_size": 4, "epoch": 0.4876, "step": 1219, "tokens_per_device": 3936 }, { "epoch": 0.4876, "loss_ce": 0.4151713252067566, "loss_lvr": 0.8571803569793701, "loss_mode_switch": 0.0, "loss_total": 0.5008893609046936, "step": 1219 }, { "batch_size": 1, "epoch": 0.4876, "step": 1219, "tokens_per_device": 4840 }, { "epoch": 0.4876, "loss_ce": 0.013453864492475986, "loss_lvr": 0.3949992060661316, "loss_mode_switch": 0.0, "loss_total": 0.05295378342270851, "step": 1219 }, { "batch_size": 4, "epoch": 0.4876, "step": 1219, "tokens_per_device": 4468 }, { "epoch": 0.4876, "loss_ce": 0.351095050573349, "loss_lvr": 0.740145742893219, "loss_mode_switch": 0.0, "loss_total": 0.4251096248626709, "step": 1219 }, { "batch_size": 1, "epoch": 0.4876, "step": 1219, "tokens_per_device": 6097 }, { "epoch": 0.4876, "loss_ce": 0.006176704075187445, "loss_lvr": 0.26487451791763306, "loss_mode_switch": 0.0, "loss_total": 0.03266415745019913, "step": 1219 }, { "epoch": 0.488, "grad_norm": 1.2738463878631592, "learning_rate": 5.43667493747682e-06, "loss": 0.2659, "step": 1220 }, { "batch_size": 4, "epoch": 0.488, "step": 1220, "tokens_per_device": 3972 }, { "epoch": 0.488, "loss_ce": 0.11750810593366623, "loss_lvr": 0.8700142502784729, "loss_mode_switch": 0.0, "loss_total": 0.20450952649116516, "step": 1220 }, { "batch_size": 1, "epoch": 0.488, "step": 1220, "tokens_per_device": 5430 }, { "epoch": 0.488, "loss_ce": 0.053453318774700165, "loss_lvr": 0.4067409932613373, "loss_mode_switch": 0.0, "loss_total": 0.09412741661071777, "step": 1220 }, { "batch_size": 1, "epoch": 0.488, "step": 1220, "tokens_per_device": 5050 }, { "epoch": 0.488, "loss_ce": 0.40928927063941956, "loss_lvr": 0.41425445675849915, "loss_mode_switch": 0.0, "loss_total": 0.45071470737457275, "step": 1220 }, { "batch_size": 4, "epoch": 0.488, "step": 1220, "tokens_per_device": 4224 }, { "epoch": 0.488, "loss_ce": 0.0069634318351745605, "loss_lvr": 1.0515085458755493, "loss_mode_switch": 0.0, "loss_total": 0.11211428791284561, "step": 1220 }, { "batch_size": 1, "epoch": 0.488, "step": 1220, "tokens_per_device": 4839 }, { "epoch": 0.488, "loss_ce": 1.100295901298523, "loss_lvr": 0.3856643736362457, "loss_mode_switch": 0.0, "loss_total": 1.1388623714447021, "step": 1220 }, { "batch_size": 4, "epoch": 0.488, "step": 1220, "tokens_per_device": 4200 }, { "epoch": 0.488, "loss_ce": 0.5509272217750549, "loss_lvr": 1.0412546396255493, "loss_mode_switch": 0.0, "loss_total": 0.655052661895752, "step": 1220 }, { "batch_size": 4, "epoch": 0.488, "step": 1220, "tokens_per_device": 4724 }, { "epoch": 0.488, "loss_ce": 0.08095332980155945, "loss_lvr": 0.7891958355903625, "loss_mode_switch": 0.0, "loss_total": 0.15987291932106018, "step": 1220 }, { "batch_size": 1, "epoch": 0.488, "step": 1220, "tokens_per_device": 5105 }, { "epoch": 0.488, "loss_ce": 0.0011142572620883584, "loss_lvr": 0.33184269070625305, "loss_mode_switch": 0.0, "loss_total": 0.034298527985811234, "step": 1220 }, { "epoch": 0.4884, "grad_norm": 1.4054946899414062, "learning_rate": 5.430221812782856e-06, "loss": 0.3098, "step": 1221 }, { "batch_size": 1, "epoch": 0.4884, "step": 1221, "tokens_per_device": 5127 }, { "epoch": 0.4884, "loss_ce": 0.15353529155254364, "loss_lvr": 0.6780616641044617, "loss_mode_switch": 0.0, "loss_total": 0.22134146094322205, "step": 1221 }, { "batch_size": 4, "epoch": 0.4884, "step": 1221, "tokens_per_device": 4336 }, { "epoch": 0.4884, "loss_ce": 0.08320195227861404, "loss_lvr": 0.9581937193870544, "loss_mode_switch": 0.0, "loss_total": 0.17902132868766785, "step": 1221 }, { "batch_size": 4, "epoch": 0.4884, "step": 1221, "tokens_per_device": 4364 }, { "epoch": 0.4884, "loss_ce": 0.5210416913032532, "loss_lvr": 0.8091462254524231, "loss_mode_switch": 0.0, "loss_total": 0.601956307888031, "step": 1221 }, { "batch_size": 4, "epoch": 0.4884, "step": 1221, "tokens_per_device": 2564 }, { "epoch": 0.4884, "loss_ce": 0.7056556940078735, "loss_lvr": 1.3196425437927246, "loss_mode_switch": 0.0, "loss_total": 0.837619960308075, "step": 1221 }, { "batch_size": 4, "epoch": 0.4884, "step": 1221, "tokens_per_device": 5104 }, { "epoch": 0.4884, "loss_ce": 0.6632007956504822, "loss_lvr": 0.8489402532577515, "loss_mode_switch": 0.0, "loss_total": 0.7480947971343994, "step": 1221 }, { "batch_size": 4, "epoch": 0.4884, "step": 1221, "tokens_per_device": 4196 }, { "epoch": 0.4884, "loss_ce": 0.42515265941619873, "loss_lvr": 0.8186721801757812, "loss_mode_switch": 0.0, "loss_total": 0.5070198774337769, "step": 1221 }, { "batch_size": 4, "epoch": 0.4884, "step": 1221, "tokens_per_device": 4400 }, { "epoch": 0.4884, "loss_ce": 0.1332426518201828, "loss_lvr": 0.9171351790428162, "loss_mode_switch": 0.0, "loss_total": 0.22495616972446442, "step": 1221 }, { "batch_size": 4, "epoch": 0.4884, "step": 1221, "tokens_per_device": 4064 }, { "epoch": 0.4884, "loss_ce": 0.09664644300937653, "loss_lvr": 0.8359248638153076, "loss_mode_switch": 0.0, "loss_total": 0.18023893237113953, "step": 1221 }, { "epoch": 0.4888, "grad_norm": 1.319310188293457, "learning_rate": 5.423767966036644e-06, "loss": 0.3187, "step": 1222 }, { "batch_size": 4, "epoch": 0.4888, "step": 1222, "tokens_per_device": 4232 }, { "epoch": 0.4888, "loss_ce": 0.23088860511779785, "loss_lvr": 1.7976391315460205, "loss_mode_switch": 0.0, "loss_total": 0.4106525182723999, "step": 1222 }, { "batch_size": 4, "epoch": 0.4888, "step": 1222, "tokens_per_device": 2924 }, { "epoch": 0.4888, "loss_ce": 0.743932843208313, "loss_lvr": 0.8489940166473389, "loss_mode_switch": 0.0, "loss_total": 0.8288322687149048, "step": 1222 }, { "batch_size": 4, "epoch": 0.4888, "step": 1222, "tokens_per_device": 5028 }, { "epoch": 0.4888, "loss_ce": 0.04594980552792549, "loss_lvr": 0.7683466672897339, "loss_mode_switch": 0.0, "loss_total": 0.12278448045253754, "step": 1222 }, { "batch_size": 1, "epoch": 0.4888, "step": 1222, "tokens_per_device": 5491 }, { "epoch": 0.4888, "loss_ce": 0.0009809290058910847, "loss_lvr": 0.3367118537425995, "loss_mode_switch": 0.0, "loss_total": 0.034652113914489746, "step": 1222 }, { "batch_size": 1, "epoch": 0.4888, "step": 1222, "tokens_per_device": 4532 }, { "epoch": 0.4888, "loss_ce": 0.27371007204055786, "loss_lvr": 0.5796456336975098, "loss_mode_switch": 0.0, "loss_total": 0.33167463541030884, "step": 1222 }, { "batch_size": 4, "epoch": 0.4888, "step": 1222, "tokens_per_device": 4216 }, { "epoch": 0.4888, "loss_ce": 0.08392462879419327, "loss_lvr": 1.017478585243225, "loss_mode_switch": 0.0, "loss_total": 0.18567249178886414, "step": 1222 }, { "batch_size": 1, "epoch": 0.4888, "step": 1222, "tokens_per_device": 4894 }, { "epoch": 0.4888, "loss_ce": 0.043619535863399506, "loss_lvr": 0.487509548664093, "loss_mode_switch": 0.0, "loss_total": 0.09237049520015717, "step": 1222 }, { "batch_size": 4, "epoch": 0.4888, "step": 1222, "tokens_per_device": 4656 }, { "epoch": 0.4888, "loss_ce": 0.0013445408549159765, "loss_lvr": 0.6983544826507568, "loss_mode_switch": 0.0, "loss_total": 0.07117998600006104, "step": 1222 }, { "epoch": 0.4892, "grad_norm": 1.3767350912094116, "learning_rate": 5.417313408069839e-06, "loss": 0.3012, "step": 1223 }, { "batch_size": 4, "epoch": 0.4892, "step": 1223, "tokens_per_device": 4740 }, { "epoch": 0.4892, "loss_ce": 0.031530287116765976, "loss_lvr": 0.7671220302581787, "loss_mode_switch": 0.0, "loss_total": 0.10824249684810638, "step": 1223 }, { "batch_size": 1, "epoch": 0.4892, "step": 1223, "tokens_per_device": 4942 }, { "epoch": 0.4892, "loss_ce": 0.0020794046577066183, "loss_lvr": 0.4346608519554138, "loss_mode_switch": 0.0, "loss_total": 0.04554549232125282, "step": 1223 }, { "batch_size": 4, "epoch": 0.4892, "step": 1223, "tokens_per_device": 3088 }, { "epoch": 0.4892, "loss_ce": 0.5738087892532349, "loss_lvr": 0.9621379971504211, "loss_mode_switch": 0.0, "loss_total": 0.6700226068496704, "step": 1223 }, { "batch_size": 4, "epoch": 0.4892, "step": 1223, "tokens_per_device": 7308 }, { "epoch": 0.4892, "loss_ce": 0.22078970074653625, "loss_lvr": 1.0057458877563477, "loss_mode_switch": 0.0, "loss_total": 0.32136428356170654, "step": 1223 }, { "batch_size": 4, "epoch": 0.4892, "step": 1223, "tokens_per_device": 1248 }, { "epoch": 0.4892, "loss_ce": 0.3324680030345917, "loss_lvr": 1.3368662595748901, "loss_mode_switch": 0.0, "loss_total": 0.46615463495254517, "step": 1223 }, { "batch_size": 4, "epoch": 0.4892, "step": 1223, "tokens_per_device": 6332 }, { "epoch": 0.4892, "loss_ce": 0.0863656997680664, "loss_lvr": 0.8782813549041748, "loss_mode_switch": 0.0, "loss_total": 0.1741938292980194, "step": 1223 }, { "batch_size": 1, "epoch": 0.4892, "step": 1223, "tokens_per_device": 5129 }, { "epoch": 0.4892, "loss_ce": 0.023102672770619392, "loss_lvr": 1.0670127868652344, "loss_mode_switch": 0.0, "loss_total": 0.12980395555496216, "step": 1223 }, { "batch_size": 1, "epoch": 0.4892, "step": 1223, "tokens_per_device": 5191 }, { "epoch": 0.4892, "loss_ce": 0.008294917643070221, "loss_lvr": 0.35902920365333557, "loss_mode_switch": 0.0, "loss_total": 0.04419783875346184, "step": 1223 }, { "epoch": 0.4896, "grad_norm": 1.1819877624511719, "learning_rate": 5.410858149715289e-06, "loss": 0.2685, "step": 1224 }, { "batch_size": 1, "epoch": 0.4896, "step": 1224, "tokens_per_device": 4761 }, { "epoch": 0.4896, "loss_ce": 0.06548625230789185, "loss_lvr": 0.23456040024757385, "loss_mode_switch": 0.0, "loss_total": 0.08894228935241699, "step": 1224 }, { "batch_size": 4, "epoch": 0.4896, "step": 1224, "tokens_per_device": 4476 }, { "epoch": 0.4896, "loss_ce": 0.3957519829273224, "loss_lvr": 0.8710457682609558, "loss_mode_switch": 0.0, "loss_total": 0.4828565716743469, "step": 1224 }, { "batch_size": 4, "epoch": 0.4896, "step": 1224, "tokens_per_device": 9368 }, { "epoch": 0.4896, "loss_ce": 0.13427896797657013, "loss_lvr": 0.7284227013587952, "loss_mode_switch": 0.0, "loss_total": 0.20712123811244965, "step": 1224 }, { "batch_size": 4, "epoch": 0.4896, "step": 1224, "tokens_per_device": 3820 }, { "epoch": 0.4896, "loss_ce": 0.028851209208369255, "loss_lvr": 0.9410421848297119, "loss_mode_switch": 0.0, "loss_total": 0.12295543402433395, "step": 1224 }, { "batch_size": 1, "epoch": 0.4896, "step": 1224, "tokens_per_device": 4819 }, { "epoch": 0.4896, "loss_ce": 0.5201443433761597, "loss_lvr": 1.9963068962097168, "loss_mode_switch": 0.0, "loss_total": 0.7197750210762024, "step": 1224 }, { "batch_size": 4, "epoch": 0.4896, "step": 1224, "tokens_per_device": 4204 }, { "epoch": 0.4896, "loss_ce": 0.2293255627155304, "loss_lvr": 1.1343876123428345, "loss_mode_switch": 0.0, "loss_total": 0.34276431798934937, "step": 1224 }, { "batch_size": 4, "epoch": 0.4896, "step": 1224, "tokens_per_device": 4204 }, { "epoch": 0.4896, "loss_ce": 0.197931170463562, "loss_lvr": 0.903229832649231, "loss_mode_switch": 0.0, "loss_total": 0.28825414180755615, "step": 1224 }, { "batch_size": 4, "epoch": 0.4896, "step": 1224, "tokens_per_device": 5748 }, { "epoch": 0.4896, "loss_ce": 0.043946586549282074, "loss_lvr": 0.9909417629241943, "loss_mode_switch": 0.0, "loss_total": 0.1430407613515854, "step": 1224 }, { "epoch": 0.49, "grad_norm": 1.2403876781463623, "learning_rate": 5.404402201807022e-06, "loss": 0.2638, "step": 1225 }, { "batch_size": 1, "epoch": 0.49, "step": 1225, "tokens_per_device": 5117 }, { "epoch": 0.49, "loss_ce": 0.0919816792011261, "loss_lvr": 0.39290860295295715, "loss_mode_switch": 0.0, "loss_total": 0.13127253949642181, "step": 1225 }, { "batch_size": 4, "epoch": 0.49, "step": 1225, "tokens_per_device": 4108 }, { "epoch": 0.49, "loss_ce": 0.31806257367134094, "loss_lvr": 0.8602209687232971, "loss_mode_switch": 0.0, "loss_total": 0.4040846824645996, "step": 1225 }, { "batch_size": 1, "epoch": 0.49, "step": 1225, "tokens_per_device": 5115 }, { "epoch": 0.49, "loss_ce": 0.058003053069114685, "loss_lvr": 0.4822618067264557, "loss_mode_switch": 0.0, "loss_total": 0.10622923076152802, "step": 1225 }, { "batch_size": 4, "epoch": 0.49, "step": 1225, "tokens_per_device": 4084 }, { "epoch": 0.49, "loss_ce": 0.08669324219226837, "loss_lvr": 0.9893559813499451, "loss_mode_switch": 0.0, "loss_total": 0.18562883138656616, "step": 1225 }, { "batch_size": 4, "epoch": 0.49, "step": 1225, "tokens_per_device": 1428 }, { "epoch": 0.49, "loss_ce": 0.7277605533599854, "loss_lvr": 1.7307236194610596, "loss_mode_switch": 0.0, "loss_total": 0.9008328914642334, "step": 1225 }, { "batch_size": 1, "epoch": 0.49, "step": 1225, "tokens_per_device": 4893 }, { "epoch": 0.49, "loss_ce": 0.08507085591554642, "loss_lvr": 0.6522454619407654, "loss_mode_switch": 0.0, "loss_total": 0.1502954065799713, "step": 1225 }, { "batch_size": 4, "epoch": 0.49, "step": 1225, "tokens_per_device": 4376 }, { "epoch": 0.49, "loss_ce": 0.36061999201774597, "loss_lvr": 0.7955490350723267, "loss_mode_switch": 0.0, "loss_total": 0.4401749074459076, "step": 1225 }, { "batch_size": 1, "epoch": 0.49, "step": 1225, "tokens_per_device": 4902 }, { "epoch": 0.49, "loss_ce": 0.022239932790398598, "loss_lvr": 0.5499156713485718, "loss_mode_switch": 0.0, "loss_total": 0.0772315040230751, "step": 1225 }, { "epoch": 0.4904, "grad_norm": 1.1697003841400146, "learning_rate": 5.3979455751802175e-06, "loss": 0.2809, "step": 1226 }, { "batch_size": 1, "epoch": 0.4904, "step": 1226, "tokens_per_device": 5140 }, { "epoch": 0.4904, "loss_ce": 0.005276428535580635, "loss_lvr": 0.3143123686313629, "loss_mode_switch": 0.0, "loss_total": 0.036707669496536255, "step": 1226 }, { "batch_size": 4, "epoch": 0.4904, "step": 1226, "tokens_per_device": 4300 }, { "epoch": 0.4904, "loss_ce": 0.08885493874549866, "loss_lvr": 0.9513319134712219, "loss_mode_switch": 0.0, "loss_total": 0.18398812413215637, "step": 1226 }, { "batch_size": 4, "epoch": 0.4904, "step": 1226, "tokens_per_device": 2928 }, { "epoch": 0.4904, "loss_ce": 0.18108724057674408, "loss_lvr": 0.9245646595954895, "loss_mode_switch": 0.0, "loss_total": 0.27354371547698975, "step": 1226 }, { "batch_size": 4, "epoch": 0.4904, "step": 1226, "tokens_per_device": 3860 }, { "epoch": 0.4904, "loss_ce": 0.281327486038208, "loss_lvr": 0.674377977848053, "loss_mode_switch": 0.0, "loss_total": 0.3487652838230133, "step": 1226 }, { "batch_size": 1, "epoch": 0.4904, "step": 1226, "tokens_per_device": 5050 }, { "epoch": 0.4904, "loss_ce": 0.004836765117943287, "loss_lvr": 0.5626862049102783, "loss_mode_switch": 0.0, "loss_total": 0.061105385422706604, "step": 1226 }, { "batch_size": 4, "epoch": 0.4904, "step": 1226, "tokens_per_device": 4780 }, { "epoch": 0.4904, "loss_ce": 0.44855257868766785, "loss_lvr": 0.9830175638198853, "loss_mode_switch": 0.0, "loss_total": 0.5468543171882629, "step": 1226 }, { "batch_size": 4, "epoch": 0.4904, "step": 1226, "tokens_per_device": 5744 }, { "epoch": 0.4904, "loss_ce": 0.42004263401031494, "loss_lvr": 0.6680096387863159, "loss_mode_switch": 0.0, "loss_total": 0.4868435859680176, "step": 1226 }, { "batch_size": 4, "epoch": 0.4904, "step": 1226, "tokens_per_device": 3908 }, { "epoch": 0.4904, "loss_ce": 0.006931380368769169, "loss_lvr": 0.3648300766944885, "loss_mode_switch": 0.0, "loss_total": 0.043414387851953506, "step": 1226 }, { "epoch": 0.4908, "grad_norm": 1.2136085033416748, "learning_rate": 5.391488280671199e-06, "loss": 0.2754, "step": 1227 }, { "batch_size": 4, "epoch": 0.4908, "step": 1227, "tokens_per_device": 5452 }, { "epoch": 0.4908, "loss_ce": 0.20757120847702026, "loss_lvr": 0.8437827229499817, "loss_mode_switch": 0.0, "loss_total": 0.29194948077201843, "step": 1227 }, { "batch_size": 4, "epoch": 0.4908, "step": 1227, "tokens_per_device": 4232 }, { "epoch": 0.4908, "loss_ce": 0.5027464628219604, "loss_lvr": 0.7664711475372314, "loss_mode_switch": 0.0, "loss_total": 0.5793935656547546, "step": 1227 }, { "batch_size": 4, "epoch": 0.4908, "step": 1227, "tokens_per_device": 1676 }, { "epoch": 0.4908, "loss_ce": 0.9362116456031799, "loss_lvr": 0.9570775628089905, "loss_mode_switch": 0.0, "loss_total": 1.0319193601608276, "step": 1227 }, { "batch_size": 4, "epoch": 0.4908, "step": 1227, "tokens_per_device": 2716 }, { "epoch": 0.4908, "loss_ce": 0.14097106456756592, "loss_lvr": 0.8667017221450806, "loss_mode_switch": 0.0, "loss_total": 0.2276412397623062, "step": 1227 }, { "batch_size": 1, "epoch": 0.4908, "step": 1227, "tokens_per_device": 4124 }, { "epoch": 0.4908, "loss_ce": 0.1259891539812088, "loss_lvr": 0.40027034282684326, "loss_mode_switch": 0.0, "loss_total": 0.16601619124412537, "step": 1227 }, { "batch_size": 1, "epoch": 0.4908, "step": 1227, "tokens_per_device": 5129 }, { "epoch": 0.4908, "loss_ce": 0.024023879319429398, "loss_lvr": 0.6379991769790649, "loss_mode_switch": 0.0, "loss_total": 0.0878237932920456, "step": 1227 }, { "batch_size": 1, "epoch": 0.4908, "step": 1227, "tokens_per_device": 6942 }, { "epoch": 0.4908, "loss_ce": 0.007357660215348005, "loss_lvr": 0.44942349195480347, "loss_mode_switch": 0.0, "loss_total": 0.05230000987648964, "step": 1227 }, { "batch_size": 1, "epoch": 0.4908, "step": 1227, "tokens_per_device": 5131 }, { "epoch": 0.4908, "loss_ce": 0.011137732304632664, "loss_lvr": 0.34020477533340454, "loss_mode_switch": 0.0, "loss_total": 0.04515821114182472, "step": 1227 }, { "epoch": 0.4912, "grad_norm": 1.385764718055725, "learning_rate": 5.3850303291174076e-06, "loss": 0.3071, "step": 1228 }, { "batch_size": 1, "epoch": 0.4912, "step": 1228, "tokens_per_device": 5341 }, { "epoch": 0.4912, "loss_ce": 0.07277016341686249, "loss_lvr": 0.5883280634880066, "loss_mode_switch": 0.0, "loss_total": 0.13160297274589539, "step": 1228 }, { "batch_size": 1, "epoch": 0.4912, "step": 1228, "tokens_per_device": 5133 }, { "epoch": 0.4912, "loss_ce": 0.01610364019870758, "loss_lvr": 0.34066417813301086, "loss_mode_switch": 0.0, "loss_total": 0.050170060247182846, "step": 1228 }, { "batch_size": 4, "epoch": 0.4912, "step": 1228, "tokens_per_device": 5780 }, { "epoch": 0.4912, "loss_ce": 0.028308073058724403, "loss_lvr": 0.8015356659889221, "loss_mode_switch": 0.0, "loss_total": 0.1084616407752037, "step": 1228 }, { "batch_size": 4, "epoch": 0.4912, "step": 1228, "tokens_per_device": 3792 }, { "epoch": 0.4912, "loss_ce": 0.6034832000732422, "loss_lvr": 0.7110751867294312, "loss_mode_switch": 0.0, "loss_total": 0.6745907068252563, "step": 1228 }, { "batch_size": 4, "epoch": 0.4912, "step": 1228, "tokens_per_device": 1496 }, { "epoch": 0.4912, "loss_ce": 0.29535892605781555, "loss_lvr": 1.0945955514907837, "loss_mode_switch": 0.0, "loss_total": 0.40481847524642944, "step": 1228 }, { "batch_size": 4, "epoch": 0.4912, "step": 1228, "tokens_per_device": 2788 }, { "epoch": 0.4912, "loss_ce": 0.06563373655080795, "loss_lvr": 0.721064567565918, "loss_mode_switch": 0.0, "loss_total": 0.13774019479751587, "step": 1228 }, { "batch_size": 4, "epoch": 0.4912, "step": 1228, "tokens_per_device": 4272 }, { "epoch": 0.4912, "loss_ce": 0.4158279001712799, "loss_lvr": 0.4658224582672119, "loss_mode_switch": 0.0, "loss_total": 0.4624101519584656, "step": 1228 }, { "batch_size": 4, "epoch": 0.4912, "step": 1228, "tokens_per_device": 5144 }, { "epoch": 0.4912, "loss_ce": 0.1896885335445404, "loss_lvr": 0.7473652958869934, "loss_mode_switch": 0.0, "loss_total": 0.2644250690937042, "step": 1228 }, { "epoch": 0.4916, "grad_norm": 1.3194897174835205, "learning_rate": 5.378571731357388e-06, "loss": 0.3176, "step": 1229 }, { "batch_size": 4, "epoch": 0.4916, "step": 1229, "tokens_per_device": 5832 }, { "epoch": 0.4916, "loss_ce": 0.36992475390434265, "loss_lvr": 1.0189532041549683, "loss_mode_switch": 0.0, "loss_total": 0.47182008624076843, "step": 1229 }, { "batch_size": 4, "epoch": 0.4916, "step": 1229, "tokens_per_device": 2832 }, { "epoch": 0.4916, "loss_ce": 0.09618747234344482, "loss_lvr": 0.6561774611473083, "loss_mode_switch": 0.0, "loss_total": 0.16180521249771118, "step": 1229 }, { "batch_size": 4, "epoch": 0.4916, "step": 1229, "tokens_per_device": 4240 }, { "epoch": 0.4916, "loss_ce": 0.23285624384880066, "loss_lvr": 0.9870951175689697, "loss_mode_switch": 0.0, "loss_total": 0.3315657675266266, "step": 1229 }, { "batch_size": 4, "epoch": 0.4916, "step": 1229, "tokens_per_device": 6708 }, { "epoch": 0.4916, "loss_ce": 0.00934254378080368, "loss_lvr": 0.8529141545295715, "loss_mode_switch": 0.0, "loss_total": 0.09463395923376083, "step": 1229 }, { "batch_size": 1, "epoch": 0.4916, "step": 1229, "tokens_per_device": 4878 }, { "epoch": 0.4916, "loss_ce": 0.0019493637373670936, "loss_lvr": 0.3110714554786682, "loss_mode_switch": 0.0, "loss_total": 0.033056508749723434, "step": 1229 }, { "batch_size": 1, "epoch": 0.4916, "step": 1229, "tokens_per_device": 4884 }, { "epoch": 0.4916, "loss_ce": 0.10411006212234497, "loss_lvr": 0.2840557098388672, "loss_mode_switch": 0.0, "loss_total": 0.13251563906669617, "step": 1229 }, { "batch_size": 1, "epoch": 0.4916, "step": 1229, "tokens_per_device": 4911 }, { "epoch": 0.4916, "loss_ce": 0.0376194529235363, "loss_lvr": 0.7569622993469238, "loss_mode_switch": 0.0, "loss_total": 0.11331568658351898, "step": 1229 }, { "batch_size": 1, "epoch": 0.4916, "step": 1229, "tokens_per_device": 4950 }, { "epoch": 0.4916, "loss_ce": 0.2735351622104645, "loss_lvr": 0.27226290106773376, "loss_mode_switch": 0.0, "loss_total": 0.30076146125793457, "step": 1229 }, { "epoch": 0.492, "grad_norm": 1.231785774230957, "learning_rate": 5.372112498230771e-06, "loss": 0.2785, "step": 1230 }, { "batch_size": 1, "epoch": 0.492, "step": 1230, "tokens_per_device": 4785 }, { "epoch": 0.492, "loss_ce": 0.5434022545814514, "loss_lvr": 1.3848282098770142, "loss_mode_switch": 0.0, "loss_total": 0.6818850636482239, "step": 1230 }, { "batch_size": 1, "epoch": 0.492, "step": 1230, "tokens_per_device": 5042 }, { "epoch": 0.492, "loss_ce": 0.014678023755550385, "loss_lvr": 0.3706233501434326, "loss_mode_switch": 0.0, "loss_total": 0.051740359514951706, "step": 1230 }, { "batch_size": 4, "epoch": 0.492, "step": 1230, "tokens_per_device": 3036 }, { "epoch": 0.492, "loss_ce": 0.13976024091243744, "loss_lvr": 1.0657325983047485, "loss_mode_switch": 0.0, "loss_total": 0.246333509683609, "step": 1230 }, { "batch_size": 1, "epoch": 0.492, "step": 1230, "tokens_per_device": 4923 }, { "epoch": 0.492, "loss_ce": 0.05431678891181946, "loss_lvr": 0.26245832443237305, "loss_mode_switch": 0.0, "loss_total": 0.08056262135505676, "step": 1230 }, { "batch_size": 4, "epoch": 0.492, "step": 1230, "tokens_per_device": 1360 }, { "epoch": 0.492, "loss_ce": 0.506571352481842, "loss_lvr": 1.1745530366897583, "loss_mode_switch": 0.0, "loss_total": 0.6240266561508179, "step": 1230 }, { "batch_size": 4, "epoch": 0.492, "step": 1230, "tokens_per_device": 5368 }, { "epoch": 0.492, "loss_ce": 0.5770080089569092, "loss_lvr": 0.47278597950935364, "loss_mode_switch": 0.0, "loss_total": 0.6242865920066833, "step": 1230 }, { "batch_size": 4, "epoch": 0.492, "step": 1230, "tokens_per_device": 4196 }, { "epoch": 0.492, "loss_ce": 0.2571994960308075, "loss_lvr": 0.9405633211135864, "loss_mode_switch": 0.0, "loss_total": 0.3512558341026306, "step": 1230 }, { "batch_size": 4, "epoch": 0.492, "step": 1230, "tokens_per_device": 5504 }, { "epoch": 0.492, "loss_ce": 0.2076301872730255, "loss_lvr": 0.759030818939209, "loss_mode_switch": 0.0, "loss_total": 0.2835332751274109, "step": 1230 }, { "epoch": 0.4924, "grad_norm": 1.3739100694656372, "learning_rate": 5.365652640578249e-06, "loss": 0.2695, "step": 1231 }, { "batch_size": 1, "epoch": 0.4924, "step": 1231, "tokens_per_device": 4906 }, { "epoch": 0.4924, "loss_ce": 0.273062139749527, "loss_lvr": 0.6612650752067566, "loss_mode_switch": 0.0, "loss_total": 0.3391886353492737, "step": 1231 }, { "batch_size": 4, "epoch": 0.4924, "step": 1231, "tokens_per_device": 3788 }, { "epoch": 0.4924, "loss_ce": 0.03935980051755905, "loss_lvr": 0.5834802985191345, "loss_mode_switch": 0.0, "loss_total": 0.0977078303694725, "step": 1231 }, { "batch_size": 4, "epoch": 0.4924, "step": 1231, "tokens_per_device": 5728 }, { "epoch": 0.4924, "loss_ce": 0.2436833679676056, "loss_lvr": 0.6692982316017151, "loss_mode_switch": 0.0, "loss_total": 0.3106131851673126, "step": 1231 }, { "batch_size": 4, "epoch": 0.4924, "step": 1231, "tokens_per_device": 1268 }, { "epoch": 0.4924, "loss_ce": 0.6425780653953552, "loss_lvr": 0.9942044019699097, "loss_mode_switch": 0.0, "loss_total": 0.7419984936714172, "step": 1231 }, { "batch_size": 1, "epoch": 0.4924, "step": 1231, "tokens_per_device": 4918 }, { "epoch": 0.4924, "loss_ce": 0.009939704090356827, "loss_lvr": 0.7034444212913513, "loss_mode_switch": 0.0, "loss_total": 0.08028414845466614, "step": 1231 }, { "batch_size": 4, "epoch": 0.4924, "step": 1231, "tokens_per_device": 2728 }, { "epoch": 0.4924, "loss_ce": 0.3774785101413727, "loss_lvr": 0.7705808877944946, "loss_mode_switch": 0.0, "loss_total": 0.4545366168022156, "step": 1231 }, { "batch_size": 1, "epoch": 0.4924, "step": 1231, "tokens_per_device": 4954 }, { "epoch": 0.4924, "loss_ce": 0.03296515718102455, "loss_lvr": 0.41522958874702454, "loss_mode_switch": 0.0, "loss_total": 0.07448811829090118, "step": 1231 }, { "batch_size": 4, "epoch": 0.4924, "step": 1231, "tokens_per_device": 1540 }, { "epoch": 0.4924, "loss_ce": 0.3931940197944641, "loss_lvr": 1.1709462404251099, "loss_mode_switch": 0.0, "loss_total": 0.510288655757904, "step": 1231 }, { "epoch": 0.4928, "grad_norm": 1.3491783142089844, "learning_rate": 5.3591921692415706e-06, "loss": 0.3234, "step": 1232 }, { "batch_size": 4, "epoch": 0.4928, "step": 1232, "tokens_per_device": 4564 }, { "epoch": 0.4928, "loss_ce": 0.4041294455528259, "loss_lvr": 0.7400904297828674, "loss_mode_switch": 0.0, "loss_total": 0.4781385064125061, "step": 1232 }, { "batch_size": 4, "epoch": 0.4928, "step": 1232, "tokens_per_device": 4612 }, { "epoch": 0.4928, "loss_ce": 0.11982493102550507, "loss_lvr": 0.9764169454574585, "loss_mode_switch": 0.0, "loss_total": 0.21746662259101868, "step": 1232 }, { "batch_size": 4, "epoch": 0.4928, "step": 1232, "tokens_per_device": 5560 }, { "epoch": 0.4928, "loss_ce": 0.13920779526233673, "loss_lvr": 0.6669262051582336, "loss_mode_switch": 0.0, "loss_total": 0.2059004157781601, "step": 1232 }, { "batch_size": 4, "epoch": 0.4928, "step": 1232, "tokens_per_device": 1208 }, { "epoch": 0.4928, "loss_ce": 0.5881161093711853, "loss_lvr": 1.0444555282592773, "loss_mode_switch": 0.0, "loss_total": 0.692561686038971, "step": 1232 }, { "batch_size": 4, "epoch": 0.4928, "step": 1232, "tokens_per_device": 4868 }, { "epoch": 0.4928, "loss_ce": 0.14082476496696472, "loss_lvr": 0.9234501719474792, "loss_mode_switch": 0.0, "loss_total": 0.2331697940826416, "step": 1232 }, { "batch_size": 1, "epoch": 0.4928, "step": 1232, "tokens_per_device": 4765 }, { "epoch": 0.4928, "loss_ce": 0.00162549561355263, "loss_lvr": 0.7214143872261047, "loss_mode_switch": 0.0, "loss_total": 0.07376693189144135, "step": 1232 }, { "batch_size": 4, "epoch": 0.4928, "step": 1232, "tokens_per_device": 3772 }, { "epoch": 0.4928, "loss_ce": 0.42908212542533875, "loss_lvr": 1.0937098264694214, "loss_mode_switch": 0.0, "loss_total": 0.5384531021118164, "step": 1232 }, { "batch_size": 4, "epoch": 0.4928, "step": 1232, "tokens_per_device": 4368 }, { "epoch": 0.4928, "loss_ce": 0.2181701362133026, "loss_lvr": 0.7552844882011414, "loss_mode_switch": 0.0, "loss_total": 0.29369857907295227, "step": 1232 }, { "epoch": 0.4932, "grad_norm": 1.3020241260528564, "learning_rate": 5.352731095063506e-06, "loss": 0.2562, "step": 1233 }, { "batch_size": 4, "epoch": 0.4932, "step": 1233, "tokens_per_device": 5476 }, { "epoch": 0.4932, "loss_ce": 0.7739030122756958, "loss_lvr": 0.8966230750083923, "loss_mode_switch": 0.0, "loss_total": 0.8635653257369995, "step": 1233 }, { "batch_size": 1, "epoch": 0.4932, "step": 1233, "tokens_per_device": 5035 }, { "epoch": 0.4932, "loss_ce": 0.14625518023967743, "loss_lvr": 0.3533567786216736, "loss_mode_switch": 0.0, "loss_total": 0.18159085512161255, "step": 1233 }, { "batch_size": 4, "epoch": 0.4932, "step": 1233, "tokens_per_device": 2924 }, { "epoch": 0.4932, "loss_ce": 0.09931983053684235, "loss_lvr": 0.9151158332824707, "loss_mode_switch": 0.0, "loss_total": 0.19083142280578613, "step": 1233 }, { "batch_size": 1, "epoch": 0.4932, "step": 1233, "tokens_per_device": 5564 }, { "epoch": 0.4932, "loss_ce": 0.0037619839422404766, "loss_lvr": 0.28747427463531494, "loss_mode_switch": 0.0, "loss_total": 0.03250941261649132, "step": 1233 }, { "batch_size": 4, "epoch": 0.4932, "step": 1233, "tokens_per_device": 14340 }, { "epoch": 0.4932, "loss_ce": 0.6246837973594666, "loss_lvr": 0.864042341709137, "loss_mode_switch": 0.0, "loss_total": 0.7110880613327026, "step": 1233 }, { "batch_size": 1, "epoch": 0.4932, "step": 1233, "tokens_per_device": 5153 }, { "epoch": 0.4932, "loss_ce": 0.006634491495788097, "loss_lvr": 0.2093586027622223, "loss_mode_switch": 0.0, "loss_total": 0.02757035195827484, "step": 1233 }, { "batch_size": 1, "epoch": 0.4932, "step": 1233, "tokens_per_device": 6804 }, { "epoch": 0.4932, "loss_ce": 0.0002977658878080547, "loss_lvr": 0.44071176648139954, "loss_mode_switch": 0.0, "loss_total": 0.04436894506216049, "step": 1233 }, { "batch_size": 4, "epoch": 0.4932, "step": 1233, "tokens_per_device": 5968 }, { "epoch": 0.4932, "loss_ce": 0.016935888677835464, "loss_lvr": 0.7081031203269958, "loss_mode_switch": 0.0, "loss_total": 0.08774620294570923, "step": 1233 }, { "epoch": 0.4936, "grad_norm": 1.132805585861206, "learning_rate": 5.346269428887843e-06, "loss": 0.2555, "step": 1234 }, { "batch_size": 4, "epoch": 0.4936, "step": 1234, "tokens_per_device": 4240 }, { "epoch": 0.4936, "loss_ce": 0.590560257434845, "loss_lvr": 1.0379058122634888, "loss_mode_switch": 0.0, "loss_total": 0.6943508386611938, "step": 1234 }, { "batch_size": 4, "epoch": 0.4936, "step": 1234, "tokens_per_device": 3804 }, { "epoch": 0.4936, "loss_ce": 0.24382708966732025, "loss_lvr": 1.1348518133163452, "loss_mode_switch": 0.0, "loss_total": 0.35731226205825806, "step": 1234 }, { "batch_size": 4, "epoch": 0.4936, "step": 1234, "tokens_per_device": 4244 }, { "epoch": 0.4936, "loss_ce": 0.09407398104667664, "loss_lvr": 0.9712573885917664, "loss_mode_switch": 0.0, "loss_total": 0.19119971990585327, "step": 1234 }, { "batch_size": 1, "epoch": 0.4936, "step": 1234, "tokens_per_device": 4915 }, { "epoch": 0.4936, "loss_ce": 0.05069996044039726, "loss_lvr": 0.49374520778656006, "loss_mode_switch": 0.0, "loss_total": 0.10007448494434357, "step": 1234 }, { "batch_size": 1, "epoch": 0.4936, "step": 1234, "tokens_per_device": 5140 }, { "epoch": 0.4936, "loss_ce": 0.004633709322661161, "loss_lvr": 0.18097317218780518, "loss_mode_switch": 0.0, "loss_total": 0.022731026634573936, "step": 1234 }, { "batch_size": 4, "epoch": 0.4936, "step": 1234, "tokens_per_device": 2688 }, { "epoch": 0.4936, "loss_ce": 0.3590681552886963, "loss_lvr": 0.6204808354377747, "loss_mode_switch": 0.0, "loss_total": 0.4211162328720093, "step": 1234 }, { "batch_size": 4, "epoch": 0.4936, "step": 1234, "tokens_per_device": 2628 }, { "epoch": 0.4936, "loss_ce": 0.37974441051483154, "loss_lvr": 0.7695066332817078, "loss_mode_switch": 0.0, "loss_total": 0.4566950798034668, "step": 1234 }, { "batch_size": 4, "epoch": 0.4936, "step": 1234, "tokens_per_device": 8208 }, { "epoch": 0.4936, "loss_ce": 0.28833436965942383, "loss_lvr": 1.066912055015564, "loss_mode_switch": 0.0, "loss_total": 0.3950255811214447, "step": 1234 }, { "epoch": 0.494, "grad_norm": 1.3966004848480225, "learning_rate": 5.339807181559359e-06, "loss": 0.348, "step": 1235 }, { "batch_size": 4, "epoch": 0.494, "step": 1235, "tokens_per_device": 6064 }, { "epoch": 0.494, "loss_ce": 0.30373185873031616, "loss_lvr": 0.7705607414245605, "loss_mode_switch": 0.0, "loss_total": 0.3807879388332367, "step": 1235 }, { "batch_size": 4, "epoch": 0.494, "step": 1235, "tokens_per_device": 4248 }, { "epoch": 0.494, "loss_ce": 0.20885659754276276, "loss_lvr": 0.3726986348628998, "loss_mode_switch": 0.0, "loss_total": 0.2461264580488205, "step": 1235 }, { "batch_size": 4, "epoch": 0.494, "step": 1235, "tokens_per_device": 3780 }, { "epoch": 0.494, "loss_ce": 0.3532227873802185, "loss_lvr": 0.7669331431388855, "loss_mode_switch": 0.0, "loss_total": 0.429916113615036, "step": 1235 }, { "batch_size": 4, "epoch": 0.494, "step": 1235, "tokens_per_device": 4344 }, { "epoch": 0.494, "loss_ce": 0.027150345966219902, "loss_lvr": 0.6679295301437378, "loss_mode_switch": 0.0, "loss_total": 0.09394330531358719, "step": 1235 }, { "batch_size": 4, "epoch": 0.494, "step": 1235, "tokens_per_device": 3796 }, { "epoch": 0.494, "loss_ce": 0.4652794897556305, "loss_lvr": 1.0382896661758423, "loss_mode_switch": 0.0, "loss_total": 0.5691084861755371, "step": 1235 }, { "batch_size": 1, "epoch": 0.494, "step": 1235, "tokens_per_device": 4918 }, { "epoch": 0.494, "loss_ce": 0.03054753504693508, "loss_lvr": 0.7518468499183655, "loss_mode_switch": 0.0, "loss_total": 0.10573222488164902, "step": 1235 }, { "batch_size": 1, "epoch": 0.494, "step": 1235, "tokens_per_device": 4707 }, { "epoch": 0.494, "loss_ce": 0.05342696234583855, "loss_lvr": 0.5524239540100098, "loss_mode_switch": 0.0, "loss_total": 0.10866935551166534, "step": 1235 }, { "batch_size": 4, "epoch": 0.494, "step": 1235, "tokens_per_device": 5496 }, { "epoch": 0.494, "loss_ce": 0.14845158159732819, "loss_lvr": 0.7478120923042297, "loss_mode_switch": 0.0, "loss_total": 0.22323279082775116, "step": 1235 }, { "epoch": 0.4944, "grad_norm": 1.1357641220092773, "learning_rate": 5.33334436392381e-06, "loss": 0.2734, "step": 1236 }, { "batch_size": 4, "epoch": 0.4944, "step": 1236, "tokens_per_device": 4940 }, { "epoch": 0.4944, "loss_ce": 0.3120759427547455, "loss_lvr": 0.6824156641960144, "loss_mode_switch": 0.0, "loss_total": 0.3803175091743469, "step": 1236 }, { "batch_size": 1, "epoch": 0.4944, "step": 1236, "tokens_per_device": 5306 }, { "epoch": 0.4944, "loss_ce": 0.006727555766701698, "loss_lvr": 0.36845463514328003, "loss_mode_switch": 0.0, "loss_total": 0.04357302188873291, "step": 1236 }, { "batch_size": 1, "epoch": 0.4944, "step": 1236, "tokens_per_device": 6160 }, { "epoch": 0.4944, "loss_ce": 0.003861021716147661, "loss_lvr": 0.45521122217178345, "loss_mode_switch": 0.0, "loss_total": 0.04938214272260666, "step": 1236 }, { "batch_size": 1, "epoch": 0.4944, "step": 1236, "tokens_per_device": 4873 }, { "epoch": 0.4944, "loss_ce": 0.1434164196252823, "loss_lvr": 0.24470914900302887, "loss_mode_switch": 0.0, "loss_total": 0.16788733005523682, "step": 1236 }, { "batch_size": 1, "epoch": 0.4944, "step": 1236, "tokens_per_device": 5293 }, { "epoch": 0.4944, "loss_ce": 0.022228633984923363, "loss_lvr": 0.355223149061203, "loss_mode_switch": 0.0, "loss_total": 0.057750947773456573, "step": 1236 }, { "batch_size": 4, "epoch": 0.4944, "step": 1236, "tokens_per_device": 3940 }, { "epoch": 0.4944, "loss_ce": 0.5024127960205078, "loss_lvr": 0.9405519962310791, "loss_mode_switch": 0.0, "loss_total": 0.5964679718017578, "step": 1236 }, { "batch_size": 1, "epoch": 0.4944, "step": 1236, "tokens_per_device": 4810 }, { "epoch": 0.4944, "loss_ce": 0.08121205866336823, "loss_lvr": 0.3910546898841858, "loss_mode_switch": 0.0, "loss_total": 0.12031753361225128, "step": 1236 }, { "batch_size": 4, "epoch": 0.4944, "step": 1236, "tokens_per_device": 1732 }, { "epoch": 0.4944, "loss_ce": 0.1220998466014862, "loss_lvr": 0.9743006229400635, "loss_mode_switch": 0.0, "loss_total": 0.2195299118757248, "step": 1236 }, { "epoch": 0.4948, "grad_norm": 1.3094326257705688, "learning_rate": 5.3268809868279095e-06, "loss": 0.2892, "step": 1237 }, { "batch_size": 4, "epoch": 0.4948, "step": 1237, "tokens_per_device": 3776 }, { "epoch": 0.4948, "loss_ce": 0.32482877373695374, "loss_lvr": 0.8634954690933228, "loss_mode_switch": 0.0, "loss_total": 0.411178320646286, "step": 1237 }, { "batch_size": 4, "epoch": 0.4948, "step": 1237, "tokens_per_device": 4144 }, { "epoch": 0.4948, "loss_ce": 0.27119407057762146, "loss_lvr": 1.4105465412139893, "loss_mode_switch": 0.0, "loss_total": 0.41224873065948486, "step": 1237 }, { "batch_size": 4, "epoch": 0.4948, "step": 1237, "tokens_per_device": 5768 }, { "epoch": 0.4948, "loss_ce": 0.13336004316806793, "loss_lvr": 0.7216578125953674, "loss_mode_switch": 0.0, "loss_total": 0.20552581548690796, "step": 1237 }, { "batch_size": 1, "epoch": 0.4948, "step": 1237, "tokens_per_device": 4918 }, { "epoch": 0.4948, "loss_ce": 0.2488599419593811, "loss_lvr": 0.4140810966491699, "loss_mode_switch": 0.0, "loss_total": 0.29026806354522705, "step": 1237 }, { "batch_size": 4, "epoch": 0.4948, "step": 1237, "tokens_per_device": 9288 }, { "epoch": 0.4948, "loss_ce": 0.6944669485092163, "loss_lvr": 0.7514836192131042, "loss_mode_switch": 0.0, "loss_total": 0.7696152925491333, "step": 1237 }, { "batch_size": 1, "epoch": 0.4948, "step": 1237, "tokens_per_device": 5100 }, { "epoch": 0.4948, "loss_ce": 0.0011570153292268515, "loss_lvr": 0.3929648697376251, "loss_mode_switch": 0.0, "loss_total": 0.040453504770994186, "step": 1237 }, { "batch_size": 4, "epoch": 0.4948, "step": 1237, "tokens_per_device": 1380 }, { "epoch": 0.4948, "loss_ce": 0.3032061755657196, "loss_lvr": 1.2445683479309082, "loss_mode_switch": 0.0, "loss_total": 0.42766302824020386, "step": 1237 }, { "batch_size": 4, "epoch": 0.4948, "step": 1237, "tokens_per_device": 4432 }, { "epoch": 0.4948, "loss_ce": 0.11997866630554199, "loss_lvr": 0.8029869794845581, "loss_mode_switch": 0.0, "loss_total": 0.20027735829353333, "step": 1237 }, { "epoch": 0.4952, "grad_norm": 1.3779040575027466, "learning_rate": 5.320417061119303e-06, "loss": 0.2991, "step": 1238 }, { "batch_size": 4, "epoch": 0.4952, "step": 1238, "tokens_per_device": 4384 }, { "epoch": 0.4952, "loss_ce": 0.031124314293265343, "loss_lvr": 0.8359864354133606, "loss_mode_switch": 0.0, "loss_total": 0.11472295969724655, "step": 1238 }, { "batch_size": 1, "epoch": 0.4952, "step": 1238, "tokens_per_device": 5148 }, { "epoch": 0.4952, "loss_ce": 0.0006916266866028309, "loss_lvr": 0.24198094010353088, "loss_mode_switch": 0.0, "loss_total": 0.024889720603823662, "step": 1238 }, { "batch_size": 4, "epoch": 0.4952, "step": 1238, "tokens_per_device": 8960 }, { "epoch": 0.4952, "loss_ce": 0.29199346899986267, "loss_lvr": 0.941218376159668, "loss_mode_switch": 0.0, "loss_total": 0.38611531257629395, "step": 1238 }, { "batch_size": 4, "epoch": 0.4952, "step": 1238, "tokens_per_device": 4664 }, { "epoch": 0.4952, "loss_ce": 0.20093326270580292, "loss_lvr": 0.8151364922523499, "loss_mode_switch": 0.0, "loss_total": 0.2824469208717346, "step": 1238 }, { "batch_size": 1, "epoch": 0.4952, "step": 1238, "tokens_per_device": 4825 }, { "epoch": 0.4952, "loss_ce": 0.005188945680856705, "loss_lvr": 0.23790106177330017, "loss_mode_switch": 0.0, "loss_total": 0.028979051858186722, "step": 1238 }, { "batch_size": 4, "epoch": 0.4952, "step": 1238, "tokens_per_device": 7220 }, { "epoch": 0.4952, "loss_ce": 0.38646766543388367, "loss_lvr": 0.6388261914253235, "loss_mode_switch": 0.0, "loss_total": 0.450350284576416, "step": 1238 }, { "batch_size": 1, "epoch": 0.4952, "step": 1238, "tokens_per_device": 5076 }, { "epoch": 0.4952, "loss_ce": 0.040167760103940964, "loss_lvr": 0.4989670515060425, "loss_mode_switch": 0.0, "loss_total": 0.09006446599960327, "step": 1238 }, { "batch_size": 1, "epoch": 0.4952, "step": 1238, "tokens_per_device": 4520 }, { "epoch": 0.4952, "loss_ce": 0.32385191321372986, "loss_lvr": 0.43760213255882263, "loss_mode_switch": 0.0, "loss_total": 0.3676121234893799, "step": 1238 }, { "epoch": 0.4956, "grad_norm": 1.243693232536316, "learning_rate": 5.3139525976465675e-06, "loss": 0.3131, "step": 1239 }, { "batch_size": 4, "epoch": 0.4956, "step": 1239, "tokens_per_device": 4220 }, { "epoch": 0.4956, "loss_ce": 0.17756110429763794, "loss_lvr": 1.1673154830932617, "loss_mode_switch": 0.0, "loss_total": 0.2942926585674286, "step": 1239 }, { "batch_size": 4, "epoch": 0.4956, "step": 1239, "tokens_per_device": 1424 }, { "epoch": 0.4956, "loss_ce": 0.33779940009117126, "loss_lvr": 0.8709957003593445, "loss_mode_switch": 0.0, "loss_total": 0.42489898204803467, "step": 1239 }, { "batch_size": 4, "epoch": 0.4956, "step": 1239, "tokens_per_device": 1536 }, { "epoch": 0.4956, "loss_ce": 0.2268495410680771, "loss_lvr": 0.9645548462867737, "loss_mode_switch": 0.0, "loss_total": 0.32330501079559326, "step": 1239 }, { "batch_size": 1, "epoch": 0.4956, "step": 1239, "tokens_per_device": 4897 }, { "epoch": 0.4956, "loss_ce": 0.013330355286598206, "loss_lvr": 1.1248639822006226, "loss_mode_switch": 0.0, "loss_total": 0.12581676244735718, "step": 1239 }, { "batch_size": 4, "epoch": 0.4956, "step": 1239, "tokens_per_device": 3780 }, { "epoch": 0.4956, "loss_ce": 0.5533484220504761, "loss_lvr": 0.869320809841156, "loss_mode_switch": 0.0, "loss_total": 0.6402804851531982, "step": 1239 }, { "batch_size": 4, "epoch": 0.4956, "step": 1239, "tokens_per_device": 5052 }, { "epoch": 0.4956, "loss_ce": 0.12134832888841629, "loss_lvr": 0.5839600563049316, "loss_mode_switch": 0.0, "loss_total": 0.17974433302879333, "step": 1239 }, { "batch_size": 1, "epoch": 0.4956, "step": 1239, "tokens_per_device": 4764 }, { "epoch": 0.4956, "loss_ce": 0.028778597712516785, "loss_lvr": 0.307903915643692, "loss_mode_switch": 0.0, "loss_total": 0.059568990021944046, "step": 1239 }, { "batch_size": 1, "epoch": 0.4956, "step": 1239, "tokens_per_device": 4934 }, { "epoch": 0.4956, "loss_ce": 0.1328364610671997, "loss_lvr": 0.7100460529327393, "loss_mode_switch": 0.0, "loss_total": 0.20384106040000916, "step": 1239 }, { "epoch": 0.496, "grad_norm": 1.2315467596054077, "learning_rate": 5.307487607259175e-06, "loss": 0.2651, "step": 1240 }, { "batch_size": 4, "epoch": 0.496, "step": 1240, "tokens_per_device": 4324 }, { "epoch": 0.496, "loss_ce": 0.35045361518859863, "loss_lvr": 0.7772385478019714, "loss_mode_switch": 0.0, "loss_total": 0.42817747592926025, "step": 1240 }, { "batch_size": 4, "epoch": 0.496, "step": 1240, "tokens_per_device": 5064 }, { "epoch": 0.496, "loss_ce": 0.6004520654678345, "loss_lvr": 0.7881489396095276, "loss_mode_switch": 0.0, "loss_total": 0.6792669296264648, "step": 1240 }, { "batch_size": 1, "epoch": 0.496, "step": 1240, "tokens_per_device": 4862 }, { "epoch": 0.496, "loss_ce": 0.0060561178252100945, "loss_lvr": 0.4782012403011322, "loss_mode_switch": 0.0, "loss_total": 0.05387624353170395, "step": 1240 }, { "batch_size": 1, "epoch": 0.496, "step": 1240, "tokens_per_device": 4444 }, { "epoch": 0.496, "loss_ce": 0.05027386173605919, "loss_lvr": 0.8198438286781311, "loss_mode_switch": 0.0, "loss_total": 0.13225825130939484, "step": 1240 }, { "batch_size": 4, "epoch": 0.496, "step": 1240, "tokens_per_device": 5244 }, { "epoch": 0.496, "loss_ce": 0.18801307678222656, "loss_lvr": 0.9393030405044556, "loss_mode_switch": 0.0, "loss_total": 0.2819433808326721, "step": 1240 }, { "batch_size": 1, "epoch": 0.496, "step": 1240, "tokens_per_device": 5035 }, { "epoch": 0.496, "loss_ce": 0.0005801463848911226, "loss_lvr": 0.527850329875946, "loss_mode_switch": 0.0, "loss_total": 0.053365182131528854, "step": 1240 }, { "batch_size": 4, "epoch": 0.496, "step": 1240, "tokens_per_device": 5448 }, { "epoch": 0.496, "loss_ce": 0.0035170300398021936, "loss_lvr": 0.7507083415985107, "loss_mode_switch": 0.0, "loss_total": 0.07858786731958389, "step": 1240 }, { "batch_size": 1, "epoch": 0.496, "step": 1240, "tokens_per_device": 5214 }, { "epoch": 0.496, "loss_ce": 0.10769661515951157, "loss_lvr": 0.27326804399490356, "loss_mode_switch": 0.0, "loss_total": 0.13502341508865356, "step": 1240 }, { "epoch": 0.4964, "grad_norm": 1.7377034425735474, "learning_rate": 5.301022100807482e-06, "loss": 0.3242, "step": 1241 }, { "batch_size": 4, "epoch": 0.4964, "step": 1241, "tokens_per_device": 4292 }, { "epoch": 0.4964, "loss_ce": 0.2612498104572296, "loss_lvr": 0.9142168760299683, "loss_mode_switch": 0.0, "loss_total": 0.3526715040206909, "step": 1241 }, { "batch_size": 4, "epoch": 0.4964, "step": 1241, "tokens_per_device": 1816 }, { "epoch": 0.4964, "loss_ce": 0.08513093739748001, "loss_lvr": 1.0634238719940186, "loss_mode_switch": 0.0, "loss_total": 0.1914733350276947, "step": 1241 }, { "batch_size": 4, "epoch": 0.4964, "step": 1241, "tokens_per_device": 1388 }, { "epoch": 0.4964, "loss_ce": 0.37539881467819214, "loss_lvr": 0.9231286644935608, "loss_mode_switch": 0.0, "loss_total": 0.4677116870880127, "step": 1241 }, { "batch_size": 1, "epoch": 0.4964, "step": 1241, "tokens_per_device": 5033 }, { "epoch": 0.4964, "loss_ce": 0.038702964782714844, "loss_lvr": 0.5235671401023865, "loss_mode_switch": 0.0, "loss_total": 0.09105968475341797, "step": 1241 }, { "batch_size": 4, "epoch": 0.4964, "step": 1241, "tokens_per_device": 11328 }, { "epoch": 0.4964, "loss_ce": 0.17347507178783417, "loss_lvr": 0.7189015746116638, "loss_mode_switch": 0.0, "loss_total": 0.2453652322292328, "step": 1241 }, { "batch_size": 1, "epoch": 0.4964, "step": 1241, "tokens_per_device": 4886 }, { "epoch": 0.4964, "loss_ce": 0.23794519901275635, "loss_lvr": 2.621750831604004, "loss_mode_switch": 0.0, "loss_total": 0.5001202821731567, "step": 1241 }, { "batch_size": 4, "epoch": 0.4964, "step": 1241, "tokens_per_device": 5148 }, { "epoch": 0.4964, "loss_ce": 0.5724630951881409, "loss_lvr": 0.6650137901306152, "loss_mode_switch": 0.0, "loss_total": 0.6389644742012024, "step": 1241 }, { "batch_size": 4, "epoch": 0.4964, "step": 1241, "tokens_per_device": 4088 }, { "epoch": 0.4964, "loss_ce": 0.48535412549972534, "loss_lvr": 0.949650764465332, "loss_mode_switch": 0.0, "loss_total": 0.5803192257881165, "step": 1241 }, { "epoch": 0.4968, "grad_norm": 1.2092549800872803, "learning_rate": 5.294556089142716e-06, "loss": 0.299, "step": 1242 }, { "batch_size": 4, "epoch": 0.4968, "step": 1242, "tokens_per_device": 4240 }, { "epoch": 0.4968, "loss_ce": 0.10742317885160446, "loss_lvr": 0.8986888527870178, "loss_mode_switch": 0.0, "loss_total": 0.19729205965995789, "step": 1242 }, { "batch_size": 4, "epoch": 0.4968, "step": 1242, "tokens_per_device": 7064 }, { "epoch": 0.4968, "loss_ce": 0.3826224207878113, "loss_lvr": 0.5128467679023743, "loss_mode_switch": 0.0, "loss_total": 0.43390709161758423, "step": 1242 }, { "batch_size": 4, "epoch": 0.4968, "step": 1242, "tokens_per_device": 4980 }, { "epoch": 0.4968, "loss_ce": 0.298713356256485, "loss_lvr": 0.9816011190414429, "loss_mode_switch": 0.0, "loss_total": 0.39687347412109375, "step": 1242 }, { "batch_size": 1, "epoch": 0.4968, "step": 1242, "tokens_per_device": 4876 }, { "epoch": 0.4968, "loss_ce": 0.10615590214729309, "loss_lvr": 0.6345979571342468, "loss_mode_switch": 0.0, "loss_total": 0.16961570084095, "step": 1242 }, { "batch_size": 1, "epoch": 0.4968, "step": 1242, "tokens_per_device": 7035 }, { "epoch": 0.4968, "loss_ce": 0.005917111877351999, "loss_lvr": 0.26790472865104675, "loss_mode_switch": 0.0, "loss_total": 0.032707586884498596, "step": 1242 }, { "batch_size": 4, "epoch": 0.4968, "step": 1242, "tokens_per_device": 4092 }, { "epoch": 0.4968, "loss_ce": 0.018211977556347847, "loss_lvr": 1.120461344718933, "loss_mode_switch": 0.0, "loss_total": 0.13025811314582825, "step": 1242 }, { "batch_size": 1, "epoch": 0.4968, "step": 1242, "tokens_per_device": 4859 }, { "epoch": 0.4968, "loss_ce": 0.004392482340335846, "loss_lvr": 0.2051514983177185, "loss_mode_switch": 0.0, "loss_total": 0.024907631799578667, "step": 1242 }, { "batch_size": 1, "epoch": 0.4968, "step": 1242, "tokens_per_device": 4197 }, { "epoch": 0.4968, "loss_ce": 0.003388036275282502, "loss_lvr": 0.39716631174087524, "loss_mode_switch": 0.0, "loss_total": 0.04310466721653938, "step": 1242 }, { "epoch": 0.4972, "grad_norm": 1.3920351266860962, "learning_rate": 5.2880895831169476e-06, "loss": 0.316, "step": 1243 }, { "batch_size": 4, "epoch": 0.4972, "step": 1243, "tokens_per_device": 4196 }, { "epoch": 0.4972, "loss_ce": 0.22911757230758667, "loss_lvr": 0.7626020908355713, "loss_mode_switch": 0.0, "loss_total": 0.3053777813911438, "step": 1243 }, { "batch_size": 1, "epoch": 0.4972, "step": 1243, "tokens_per_device": 4913 }, { "epoch": 0.4972, "loss_ce": 0.029691265895962715, "loss_lvr": 0.7754141092300415, "loss_mode_switch": 0.0, "loss_total": 0.10723267495632172, "step": 1243 }, { "batch_size": 4, "epoch": 0.4972, "step": 1243, "tokens_per_device": 5576 }, { "epoch": 0.4972, "loss_ce": 0.20227676630020142, "loss_lvr": 0.694484531879425, "loss_mode_switch": 0.0, "loss_total": 0.27172523736953735, "step": 1243 }, { "batch_size": 4, "epoch": 0.4972, "step": 1243, "tokens_per_device": 7332 }, { "epoch": 0.4972, "loss_ce": 0.20295748114585876, "loss_lvr": 0.6276217699050903, "loss_mode_switch": 0.0, "loss_total": 0.2657196521759033, "step": 1243 }, { "batch_size": 4, "epoch": 0.4972, "step": 1243, "tokens_per_device": 4300 }, { "epoch": 0.4972, "loss_ce": 0.0453580766916275, "loss_lvr": 0.9063847661018372, "loss_mode_switch": 0.0, "loss_total": 0.13599655032157898, "step": 1243 }, { "batch_size": 4, "epoch": 0.4972, "step": 1243, "tokens_per_device": 1364 }, { "epoch": 0.4972, "loss_ce": 0.3240968585014343, "loss_lvr": 1.0094153881072998, "loss_mode_switch": 0.0, "loss_total": 0.4250383973121643, "step": 1243 }, { "batch_size": 4, "epoch": 0.4972, "step": 1243, "tokens_per_device": 5200 }, { "epoch": 0.4972, "loss_ce": 0.09930874407291412, "loss_lvr": 0.8119685649871826, "loss_mode_switch": 0.0, "loss_total": 0.18050560355186462, "step": 1243 }, { "batch_size": 4, "epoch": 0.4972, "step": 1243, "tokens_per_device": 2768 }, { "epoch": 0.4972, "loss_ce": 0.37298715114593506, "loss_lvr": 0.8510331511497498, "loss_mode_switch": 0.0, "loss_total": 0.45809048414230347, "step": 1243 }, { "epoch": 0.4976, "grad_norm": 1.2636910676956177, "learning_rate": 5.28162259358308e-06, "loss": 0.2688, "step": 1244 }, { "batch_size": 4, "epoch": 0.4976, "step": 1244, "tokens_per_device": 4636 }, { "epoch": 0.4976, "loss_ce": 0.5468968749046326, "loss_lvr": 0.8354803323745728, "loss_mode_switch": 0.0, "loss_total": 0.6304448843002319, "step": 1244 }, { "batch_size": 4, "epoch": 0.4976, "step": 1244, "tokens_per_device": 4288 }, { "epoch": 0.4976, "loss_ce": 0.8264918923377991, "loss_lvr": 0.8860799670219421, "loss_mode_switch": 0.0, "loss_total": 0.9150998592376709, "step": 1244 }, { "batch_size": 4, "epoch": 0.4976, "step": 1244, "tokens_per_device": 4352 }, { "epoch": 0.4976, "loss_ce": 0.2468089461326599, "loss_lvr": 0.944337010383606, "loss_mode_switch": 0.0, "loss_total": 0.34124264121055603, "step": 1244 }, { "batch_size": 1, "epoch": 0.4976, "step": 1244, "tokens_per_device": 5060 }, { "epoch": 0.4976, "loss_ce": 0.004189789295196533, "loss_lvr": 0.7070363163948059, "loss_mode_switch": 0.0, "loss_total": 0.07489342242479324, "step": 1244 }, { "batch_size": 1, "epoch": 0.4976, "step": 1244, "tokens_per_device": 4942 }, { "epoch": 0.4976, "loss_ce": 0.08689697086811066, "loss_lvr": 0.9682724475860596, "loss_mode_switch": 0.0, "loss_total": 0.18372422456741333, "step": 1244 }, { "batch_size": 1, "epoch": 0.4976, "step": 1244, "tokens_per_device": 5101 }, { "epoch": 0.4976, "loss_ce": 0.052484218031167984, "loss_lvr": 0.4193393290042877, "loss_mode_switch": 0.0, "loss_total": 0.09441815316677094, "step": 1244 }, { "batch_size": 4, "epoch": 0.4976, "step": 1244, "tokens_per_device": 4284 }, { "epoch": 0.4976, "loss_ce": 0.6680669188499451, "loss_lvr": 0.7735938429832458, "loss_mode_switch": 0.0, "loss_total": 0.7454262971878052, "step": 1244 }, { "batch_size": 1, "epoch": 0.4976, "step": 1244, "tokens_per_device": 5296 }, { "epoch": 0.4976, "loss_ce": 0.000667280750349164, "loss_lvr": 0.2888478934764862, "loss_mode_switch": 0.0, "loss_total": 0.029552070423960686, "step": 1244 }, { "epoch": 0.498, "grad_norm": 1.181543231010437, "learning_rate": 5.275155131394825e-06, "loss": 0.2436, "step": 1245 }, { "batch_size": 4, "epoch": 0.498, "step": 1245, "tokens_per_device": 4204 }, { "epoch": 0.498, "loss_ce": 0.41099458932876587, "loss_lvr": 0.5975401997566223, "loss_mode_switch": 0.0, "loss_total": 0.4707486033439636, "step": 1245 }, { "batch_size": 4, "epoch": 0.498, "step": 1245, "tokens_per_device": 5940 }, { "epoch": 0.498, "loss_ce": 0.06874580681324005, "loss_lvr": 0.8353888988494873, "loss_mode_switch": 0.0, "loss_total": 0.15228469669818878, "step": 1245 }, { "batch_size": 4, "epoch": 0.498, "step": 1245, "tokens_per_device": 2672 }, { "epoch": 0.498, "loss_ce": 0.12587934732437134, "loss_lvr": 0.9457457661628723, "loss_mode_switch": 0.0, "loss_total": 0.2204539179801941, "step": 1245 }, { "batch_size": 1, "epoch": 0.498, "step": 1245, "tokens_per_device": 4980 }, { "epoch": 0.498, "loss_ce": 0.11786909401416779, "loss_lvr": 0.31982171535491943, "loss_mode_switch": 0.0, "loss_total": 0.1498512625694275, "step": 1245 }, { "batch_size": 4, "epoch": 0.498, "step": 1245, "tokens_per_device": 3356 }, { "epoch": 0.498, "loss_ce": 0.13474303483963013, "loss_lvr": 1.0893833637237549, "loss_mode_switch": 0.0, "loss_total": 0.24368137121200562, "step": 1245 }, { "batch_size": 4, "epoch": 0.498, "step": 1245, "tokens_per_device": 3764 }, { "epoch": 0.498, "loss_ce": 0.2594258189201355, "loss_lvr": 0.5693626999855042, "loss_mode_switch": 0.0, "loss_total": 0.31636208295822144, "step": 1245 }, { "batch_size": 4, "epoch": 0.498, "step": 1245, "tokens_per_device": 3892 }, { "epoch": 0.498, "loss_ce": 0.25756990909576416, "loss_lvr": 0.8294551372528076, "loss_mode_switch": 0.0, "loss_total": 0.3405154347419739, "step": 1245 }, { "batch_size": 4, "epoch": 0.498, "step": 1245, "tokens_per_device": 4652 }, { "epoch": 0.498, "loss_ce": 0.6181386113166809, "loss_lvr": 0.6484659314155579, "loss_mode_switch": 0.0, "loss_total": 0.6829851865768433, "step": 1245 }, { "epoch": 0.4984, "grad_norm": 1.5471491813659668, "learning_rate": 5.268687207406692e-06, "loss": 0.3329, "step": 1246 }, { "batch_size": 1, "epoch": 0.4984, "step": 1246, "tokens_per_device": 7981 }, { "epoch": 0.4984, "loss_ce": 0.0002370612637605518, "loss_lvr": 0.3410356938838959, "loss_mode_switch": 0.0, "loss_total": 0.03434063121676445, "step": 1246 }, { "batch_size": 4, "epoch": 0.4984, "step": 1246, "tokens_per_device": 1520 }, { "epoch": 0.4984, "loss_ce": 0.8213537335395813, "loss_lvr": 0.9768050909042358, "loss_mode_switch": 0.0, "loss_total": 0.9190342426300049, "step": 1246 }, { "batch_size": 4, "epoch": 0.4984, "step": 1246, "tokens_per_device": 4252 }, { "epoch": 0.4984, "loss_ce": 0.1447584331035614, "loss_lvr": 0.6019987463951111, "loss_mode_switch": 0.0, "loss_total": 0.20495830476284027, "step": 1246 }, { "batch_size": 4, "epoch": 0.4984, "step": 1246, "tokens_per_device": 5928 }, { "epoch": 0.4984, "loss_ce": 0.021860109642148018, "loss_lvr": 0.7140215635299683, "loss_mode_switch": 0.0, "loss_total": 0.09326226264238358, "step": 1246 }, { "batch_size": 1, "epoch": 0.4984, "step": 1246, "tokens_per_device": 5044 }, { "epoch": 0.4984, "loss_ce": 0.028432242572307587, "loss_lvr": 0.46844682097435, "loss_mode_switch": 0.0, "loss_total": 0.0752769261598587, "step": 1246 }, { "batch_size": 1, "epoch": 0.4984, "step": 1246, "tokens_per_device": 7190 }, { "epoch": 0.4984, "loss_ce": 0.021274980157613754, "loss_lvr": 0.29580458998680115, "loss_mode_switch": 0.0, "loss_total": 0.05085543915629387, "step": 1246 }, { "batch_size": 4, "epoch": 0.4984, "step": 1246, "tokens_per_device": 4352 }, { "epoch": 0.4984, "loss_ce": 0.44852644205093384, "loss_lvr": 0.6157640218734741, "loss_mode_switch": 0.0, "loss_total": 0.5101028680801392, "step": 1246 }, { "batch_size": 1, "epoch": 0.4984, "step": 1246, "tokens_per_device": 5148 }, { "epoch": 0.4984, "loss_ce": 0.022894781082868576, "loss_lvr": 0.4267564117908478, "loss_mode_switch": 0.0, "loss_total": 0.0655704215168953, "step": 1246 }, { "epoch": 0.4988, "grad_norm": 1.27628755569458, "learning_rate": 5.262218832473959e-06, "loss": 0.3158, "step": 1247 }, { "batch_size": 4, "epoch": 0.4988, "step": 1247, "tokens_per_device": 5572 }, { "epoch": 0.4988, "loss_ce": 0.01852642558515072, "loss_lvr": 0.6600750088691711, "loss_mode_switch": 0.0, "loss_total": 0.0845339298248291, "step": 1247 }, { "batch_size": 4, "epoch": 0.4988, "step": 1247, "tokens_per_device": 5100 }, { "epoch": 0.4988, "loss_ce": 0.19210439920425415, "loss_lvr": 0.8322521448135376, "loss_mode_switch": 0.0, "loss_total": 0.2753296196460724, "step": 1247 }, { "batch_size": 4, "epoch": 0.4988, "step": 1247, "tokens_per_device": 8852 }, { "epoch": 0.4988, "loss_ce": 0.03801480308175087, "loss_lvr": 0.8377705216407776, "loss_mode_switch": 0.0, "loss_total": 0.12179185450077057, "step": 1247 }, { "batch_size": 4, "epoch": 0.4988, "step": 1247, "tokens_per_device": 4248 }, { "epoch": 0.4988, "loss_ce": 0.36715373396873474, "loss_lvr": 1.0505729913711548, "loss_mode_switch": 0.0, "loss_total": 0.4722110331058502, "step": 1247 }, { "batch_size": 4, "epoch": 0.4988, "step": 1247, "tokens_per_device": 4316 }, { "epoch": 0.4988, "loss_ce": 0.16276447474956512, "loss_lvr": 0.7796010375022888, "loss_mode_switch": 0.0, "loss_total": 0.240724578499794, "step": 1247 }, { "batch_size": 4, "epoch": 0.4988, "step": 1247, "tokens_per_device": 2956 }, { "epoch": 0.4988, "loss_ce": 0.059381406754255295, "loss_lvr": 0.7104125618934631, "loss_mode_switch": 0.0, "loss_total": 0.1304226666688919, "step": 1247 }, { "batch_size": 4, "epoch": 0.4988, "step": 1247, "tokens_per_device": 9364 }, { "epoch": 0.4988, "loss_ce": 0.12095944583415985, "loss_lvr": 1.0475213527679443, "loss_mode_switch": 0.0, "loss_total": 0.22571158409118652, "step": 1247 }, { "batch_size": 1, "epoch": 0.4988, "step": 1247, "tokens_per_device": 5012 }, { "epoch": 0.4988, "loss_ce": 0.027497220784425735, "loss_lvr": 0.7213423848152161, "loss_mode_switch": 0.0, "loss_total": 0.09963145852088928, "step": 1247 }, { "epoch": 0.4992, "grad_norm": 1.2895753383636475, "learning_rate": 5.25575001745267e-06, "loss": 0.2557, "step": 1248 }, { "batch_size": 4, "epoch": 0.4992, "step": 1248, "tokens_per_device": 1272 }, { "epoch": 0.4992, "loss_ce": 0.0777403861284256, "loss_lvr": 0.9605162143707275, "loss_mode_switch": 0.0, "loss_total": 0.1737920045852661, "step": 1248 }, { "batch_size": 1, "epoch": 0.4992, "step": 1248, "tokens_per_device": 4895 }, { "epoch": 0.4992, "loss_ce": 0.019415022805333138, "loss_lvr": 0.3557272255420685, "loss_mode_switch": 0.0, "loss_total": 0.054987743496894836, "step": 1248 }, { "batch_size": 4, "epoch": 0.4992, "step": 1248, "tokens_per_device": 4344 }, { "epoch": 0.4992, "loss_ce": 0.4350099563598633, "loss_lvr": 0.8113477230072021, "loss_mode_switch": 0.0, "loss_total": 0.5161447525024414, "step": 1248 }, { "batch_size": 4, "epoch": 0.4992, "step": 1248, "tokens_per_device": 1248 }, { "epoch": 0.4992, "loss_ce": 0.7711216807365417, "loss_lvr": 1.0745664834976196, "loss_mode_switch": 0.0, "loss_total": 0.8785783052444458, "step": 1248 }, { "batch_size": 4, "epoch": 0.4992, "step": 1248, "tokens_per_device": 4792 }, { "epoch": 0.4992, "loss_ce": 0.3504844605922699, "loss_lvr": 0.8723660707473755, "loss_mode_switch": 0.0, "loss_total": 0.4377210736274719, "step": 1248 }, { "batch_size": 1, "epoch": 0.4992, "step": 1248, "tokens_per_device": 5115 }, { "epoch": 0.4992, "loss_ce": 0.06420686841011047, "loss_lvr": 0.47057604789733887, "loss_mode_switch": 0.0, "loss_total": 0.11126447468996048, "step": 1248 }, { "batch_size": 1, "epoch": 0.4992, "step": 1248, "tokens_per_device": 5531 }, { "epoch": 0.4992, "loss_ce": 0.0005356725887395442, "loss_lvr": 0.5836381316184998, "loss_mode_switch": 0.0, "loss_total": 0.05889948830008507, "step": 1248 }, { "batch_size": 4, "epoch": 0.4992, "step": 1248, "tokens_per_device": 4496 }, { "epoch": 0.4992, "loss_ce": 0.0777752622961998, "loss_lvr": 0.8324698805809021, "loss_mode_switch": 0.0, "loss_total": 0.16102224588394165, "step": 1248 }, { "epoch": 0.4996, "grad_norm": 1.5425537824630737, "learning_rate": 5.249280773199597e-06, "loss": 0.3254, "step": 1249 }, { "batch_size": 1, "epoch": 0.4996, "step": 1249, "tokens_per_device": 5153 }, { "epoch": 0.4996, "loss_ce": 0.03951818495988846, "loss_lvr": 0.23770178854465485, "loss_mode_switch": 0.0, "loss_total": 0.0632883608341217, "step": 1249 }, { "batch_size": 1, "epoch": 0.4996, "step": 1249, "tokens_per_device": 5246 }, { "epoch": 0.4996, "loss_ce": 0.003658676752820611, "loss_lvr": 0.6340430974960327, "loss_mode_switch": 0.0, "loss_total": 0.06706298887729645, "step": 1249 }, { "batch_size": 4, "epoch": 0.4996, "step": 1249, "tokens_per_device": 5156 }, { "epoch": 0.4996, "loss_ce": 0.4014081358909607, "loss_lvr": 0.8701867461204529, "loss_mode_switch": 0.0, "loss_total": 0.4884268045425415, "step": 1249 }, { "batch_size": 4, "epoch": 0.4996, "step": 1249, "tokens_per_device": 8472 }, { "epoch": 0.4996, "loss_ce": 0.35237228870391846, "loss_lvr": 0.6260218620300293, "loss_mode_switch": 0.0, "loss_total": 0.41497448086738586, "step": 1249 }, { "batch_size": 4, "epoch": 0.4996, "step": 1249, "tokens_per_device": 2672 }, { "epoch": 0.4996, "loss_ce": 0.37093570828437805, "loss_lvr": 0.6752346754074097, "loss_mode_switch": 0.0, "loss_total": 0.438459187746048, "step": 1249 }, { "batch_size": 4, "epoch": 0.4996, "step": 1249, "tokens_per_device": 4804 }, { "epoch": 0.4996, "loss_ce": 0.05511622875928879, "loss_lvr": 0.7785107493400574, "loss_mode_switch": 0.0, "loss_total": 0.13296730816364288, "step": 1249 }, { "batch_size": 4, "epoch": 0.4996, "step": 1249, "tokens_per_device": 1484 }, { "epoch": 0.4996, "loss_ce": 0.29534241557121277, "loss_lvr": 1.2065306901931763, "loss_mode_switch": 0.0, "loss_total": 0.4159954786300659, "step": 1249 }, { "batch_size": 4, "epoch": 0.4996, "step": 1249, "tokens_per_device": 4832 }, { "epoch": 0.4996, "loss_ce": 0.4886007606983185, "loss_lvr": 0.7621701955795288, "loss_mode_switch": 0.0, "loss_total": 0.5648177862167358, "step": 1249 }, { "epoch": 0.5, "grad_norm": 1.3784716129302979, "learning_rate": 5.242811110572243e-06, "loss": 0.3328, "step": 1250 }, { "batch_size": 4, "epoch": 0.5, "step": 1250, "tokens_per_device": 8228 }, { "epoch": 0.5, "loss_ce": 0.22604897618293762, "loss_lvr": 1.1459050178527832, "loss_mode_switch": 0.0, "loss_total": 0.34063947200775146, "step": 1250 }, { "batch_size": 4, "epoch": 0.5, "step": 1250, "tokens_per_device": 1708 }, { "epoch": 0.5, "loss_ce": 0.28875577449798584, "loss_lvr": 1.0965567827224731, "loss_mode_switch": 0.0, "loss_total": 0.39841145277023315, "step": 1250 }, { "batch_size": 1, "epoch": 0.5, "step": 1250, "tokens_per_device": 4617 }, { "epoch": 0.5, "loss_ce": 0.062446050345897675, "loss_lvr": 0.36540359258651733, "loss_mode_switch": 0.0, "loss_total": 0.09898640960454941, "step": 1250 }, { "batch_size": 4, "epoch": 0.5, "step": 1250, "tokens_per_device": 4252 }, { "epoch": 0.5, "loss_ce": 0.561145007610321, "loss_lvr": 0.6709060668945312, "loss_mode_switch": 0.0, "loss_total": 0.6282356381416321, "step": 1250 }, { "batch_size": 4, "epoch": 0.5, "step": 1250, "tokens_per_device": 1464 }, { "epoch": 0.5, "loss_ce": 0.49953824281692505, "loss_lvr": 0.978360652923584, "loss_mode_switch": 0.0, "loss_total": 0.5973743200302124, "step": 1250 }, { "batch_size": 4, "epoch": 0.5, "step": 1250, "tokens_per_device": 4368 }, { "epoch": 0.5, "loss_ce": 0.25967398285865784, "loss_lvr": 0.8923323154449463, "loss_mode_switch": 0.0, "loss_total": 0.3489072322845459, "step": 1250 }, { "batch_size": 4, "epoch": 0.5, "step": 1250, "tokens_per_device": 1564 }, { "epoch": 0.5, "loss_ce": 0.3136129379272461, "loss_lvr": 0.9974479079246521, "loss_mode_switch": 0.0, "loss_total": 0.4133577346801758, "step": 1250 }, { "batch_size": 4, "epoch": 0.5, "step": 1250, "tokens_per_device": 4308 }, { "epoch": 0.5, "loss_ce": 0.11077512800693512, "loss_lvr": 0.959376871585846, "loss_mode_switch": 0.0, "loss_total": 0.20671281218528748, "step": 1250 }, { "epoch": 0.5004, "grad_norm": 1.3916656970977783, "learning_rate": 5.236341040428803e-06, "loss": 0.3204, "step": 1251 }, { "batch_size": 4, "epoch": 0.5004, "step": 1251, "tokens_per_device": 3184 }, { "epoch": 0.5004, "loss_ce": 0.3265575170516968, "loss_lvr": 0.7771726250648499, "loss_mode_switch": 0.0, "loss_total": 0.4042747914791107, "step": 1251 }, { "batch_size": 1, "epoch": 0.5004, "step": 1251, "tokens_per_device": 4291 }, { "epoch": 0.5004, "loss_ce": 0.026311252266168594, "loss_lvr": 0.4750370681285858, "loss_mode_switch": 0.0, "loss_total": 0.07381495833396912, "step": 1251 }, { "batch_size": 4, "epoch": 0.5004, "step": 1251, "tokens_per_device": 2888 }, { "epoch": 0.5004, "loss_ce": 0.004078263882547617, "loss_lvr": 0.5878511071205139, "loss_mode_switch": 0.0, "loss_total": 0.06286337226629257, "step": 1251 }, { "batch_size": 4, "epoch": 0.5004, "step": 1251, "tokens_per_device": 1600 }, { "epoch": 0.5004, "loss_ce": 0.4257087707519531, "loss_lvr": 0.9407214522361755, "loss_mode_switch": 0.0, "loss_total": 0.5197809338569641, "step": 1251 }, { "batch_size": 1, "epoch": 0.5004, "step": 1251, "tokens_per_device": 5143 }, { "epoch": 0.5004, "loss_ce": 0.0023690483067184687, "loss_lvr": 0.3142128586769104, "loss_mode_switch": 0.0, "loss_total": 0.033790335059165955, "step": 1251 }, { "batch_size": 4, "epoch": 0.5004, "step": 1251, "tokens_per_device": 6580 }, { "epoch": 0.5004, "loss_ce": 0.12103872001171112, "loss_lvr": 0.805871844291687, "loss_mode_switch": 0.0, "loss_total": 0.20162591338157654, "step": 1251 }, { "batch_size": 4, "epoch": 0.5004, "step": 1251, "tokens_per_device": 4860 }, { "epoch": 0.5004, "loss_ce": 0.44671398401260376, "loss_lvr": 0.7396539449691772, "loss_mode_switch": 0.0, "loss_total": 0.5206793546676636, "step": 1251 }, { "batch_size": 1, "epoch": 0.5004, "step": 1251, "tokens_per_device": 5180 }, { "epoch": 0.5004, "loss_ce": 0.004907268565148115, "loss_lvr": 0.2691340446472168, "loss_mode_switch": 0.0, "loss_total": 0.03182067349553108, "step": 1251 }, { "epoch": 0.5008, "grad_norm": 1.3153295516967773, "learning_rate": 5.229870573628163e-06, "loss": 0.2638, "step": 1252 }, { "batch_size": 1, "epoch": 0.5008, "step": 1252, "tokens_per_device": 4892 }, { "epoch": 0.5008, "loss_ce": 0.006343592423945665, "loss_lvr": 0.21927228569984436, "loss_mode_switch": 0.0, "loss_total": 0.028270820155739784, "step": 1252 }, { "batch_size": 4, "epoch": 0.5008, "step": 1252, "tokens_per_device": 3924 }, { "epoch": 0.5008, "loss_ce": 0.11740535497665405, "loss_lvr": 0.918813169002533, "loss_mode_switch": 0.0, "loss_total": 0.2092866748571396, "step": 1252 }, { "batch_size": 4, "epoch": 0.5008, "step": 1252, "tokens_per_device": 6104 }, { "epoch": 0.5008, "loss_ce": 0.33369410037994385, "loss_lvr": 0.7799661755561829, "loss_mode_switch": 0.0, "loss_total": 0.41169071197509766, "step": 1252 }, { "batch_size": 4, "epoch": 0.5008, "step": 1252, "tokens_per_device": 4988 }, { "epoch": 0.5008, "loss_ce": 0.23440293967723846, "loss_lvr": 0.7788331508636475, "loss_mode_switch": 0.0, "loss_total": 0.31228625774383545, "step": 1252 }, { "batch_size": 1, "epoch": 0.5008, "step": 1252, "tokens_per_device": 5366 }, { "epoch": 0.5008, "loss_ce": 0.18261060118675232, "loss_lvr": 0.4899314045906067, "loss_mode_switch": 0.0, "loss_total": 0.231603741645813, "step": 1252 }, { "batch_size": 4, "epoch": 0.5008, "step": 1252, "tokens_per_device": 1280 }, { "epoch": 0.5008, "loss_ce": 0.3210667669773102, "loss_lvr": 1.0700165033340454, "loss_mode_switch": 0.0, "loss_total": 0.4280684292316437, "step": 1252 }, { "batch_size": 4, "epoch": 0.5008, "step": 1252, "tokens_per_device": 5040 }, { "epoch": 0.5008, "loss_ce": 0.04006190225481987, "loss_lvr": 1.0864124298095703, "loss_mode_switch": 0.0, "loss_total": 0.14870314300060272, "step": 1252 }, { "batch_size": 4, "epoch": 0.5008, "step": 1252, "tokens_per_device": 3892 }, { "epoch": 0.5008, "loss_ce": 0.6332770586013794, "loss_lvr": 0.9530247449874878, "loss_mode_switch": 0.0, "loss_total": 0.7285795211791992, "step": 1252 }, { "epoch": 0.5012, "grad_norm": 1.2528384923934937, "learning_rate": 5.223399721029875e-06, "loss": 0.3072, "step": 1253 }, { "batch_size": 4, "epoch": 0.5012, "step": 1253, "tokens_per_device": 5740 }, { "epoch": 0.5012, "loss_ce": 0.6455405950546265, "loss_lvr": 1.2464030981063843, "loss_mode_switch": 0.0, "loss_total": 0.770180881023407, "step": 1253 }, { "batch_size": 1, "epoch": 0.5012, "step": 1253, "tokens_per_device": 5171 }, { "epoch": 0.5012, "loss_ce": 0.04731523618102074, "loss_lvr": 0.5821588635444641, "loss_mode_switch": 0.0, "loss_total": 0.10553112626075745, "step": 1253 }, { "batch_size": 1, "epoch": 0.5012, "step": 1253, "tokens_per_device": 5114 }, { "epoch": 0.5012, "loss_ce": 0.0031488046515733004, "loss_lvr": 0.3511491119861603, "loss_mode_switch": 0.0, "loss_total": 0.0382637158036232, "step": 1253 }, { "batch_size": 4, "epoch": 0.5012, "step": 1253, "tokens_per_device": 7312 }, { "epoch": 0.5012, "loss_ce": 0.0020361854694783688, "loss_lvr": 0.7278597950935364, "loss_mode_switch": 0.0, "loss_total": 0.07482216507196426, "step": 1253 }, { "batch_size": 1, "epoch": 0.5012, "step": 1253, "tokens_per_device": 4891 }, { "epoch": 0.5012, "loss_ce": 0.14595098793506622, "loss_lvr": 0.2830473780632019, "loss_mode_switch": 0.0, "loss_total": 0.17425572872161865, "step": 1253 }, { "batch_size": 4, "epoch": 0.5012, "step": 1253, "tokens_per_device": 4228 }, { "epoch": 0.5012, "loss_ce": 0.18217244744300842, "loss_lvr": 1.0613020658493042, "loss_mode_switch": 0.0, "loss_total": 0.2883026599884033, "step": 1253 }, { "batch_size": 1, "epoch": 0.5012, "step": 1253, "tokens_per_device": 5123 }, { "epoch": 0.5012, "loss_ce": 0.01256992481648922, "loss_lvr": 0.22645212709903717, "loss_mode_switch": 0.0, "loss_total": 0.035215139389038086, "step": 1253 }, { "batch_size": 1, "epoch": 0.5012, "step": 1253, "tokens_per_device": 5207 }, { "epoch": 0.5012, "loss_ce": 0.054147180169820786, "loss_lvr": 0.43992650508880615, "loss_mode_switch": 0.0, "loss_total": 0.09813982993364334, "step": 1253 }, { "epoch": 0.5016, "grad_norm": 1.1812769174575806, "learning_rate": 5.21692849349413e-06, "loss": 0.2683, "step": 1254 }, { "batch_size": 4, "epoch": 0.5016, "step": 1254, "tokens_per_device": 5856 }, { "epoch": 0.5016, "loss_ce": 0.12117300182580948, "loss_lvr": 0.9260243773460388, "loss_mode_switch": 0.0, "loss_total": 0.21377544105052948, "step": 1254 }, { "batch_size": 4, "epoch": 0.5016, "step": 1254, "tokens_per_device": 3760 }, { "epoch": 0.5016, "loss_ce": 0.49770423769950867, "loss_lvr": 1.0915919542312622, "loss_mode_switch": 0.0, "loss_total": 0.6068634390830994, "step": 1254 }, { "batch_size": 4, "epoch": 0.5016, "step": 1254, "tokens_per_device": 4580 }, { "epoch": 0.5016, "loss_ce": 0.3372803032398224, "loss_lvr": 1.2871683835983276, "loss_mode_switch": 0.0, "loss_total": 0.4659971594810486, "step": 1254 }, { "batch_size": 4, "epoch": 0.5016, "step": 1254, "tokens_per_device": 4652 }, { "epoch": 0.5016, "loss_ce": 0.4550212621688843, "loss_lvr": 0.866235613822937, "loss_mode_switch": 0.0, "loss_total": 0.541644811630249, "step": 1254 }, { "batch_size": 4, "epoch": 0.5016, "step": 1254, "tokens_per_device": 5260 }, { "epoch": 0.5016, "loss_ce": 0.45060110092163086, "loss_lvr": 0.8866965174674988, "loss_mode_switch": 0.0, "loss_total": 0.5392707586288452, "step": 1254 }, { "batch_size": 4, "epoch": 0.5016, "step": 1254, "tokens_per_device": 5432 }, { "epoch": 0.5016, "loss_ce": 0.2514331042766571, "loss_lvr": 0.776005208492279, "loss_mode_switch": 0.0, "loss_total": 0.32903361320495605, "step": 1254 }, { "batch_size": 4, "epoch": 0.5016, "step": 1254, "tokens_per_device": 4996 }, { "epoch": 0.5016, "loss_ce": 0.5486473441123962, "loss_lvr": 0.8708540797233582, "loss_mode_switch": 0.0, "loss_total": 0.6357327699661255, "step": 1254 }, { "batch_size": 4, "epoch": 0.5016, "step": 1254, "tokens_per_device": 4864 }, { "epoch": 0.5016, "loss_ce": 0.11619716137647629, "loss_lvr": 0.6362982392311096, "loss_mode_switch": 0.0, "loss_total": 0.1798269897699356, "step": 1254 }, { "epoch": 0.502, "grad_norm": 1.3112138509750366, "learning_rate": 5.210456901881761e-06, "loss": 0.333, "step": 1255 }, { "batch_size": 1, "epoch": 0.502, "step": 1255, "tokens_per_device": 5114 }, { "epoch": 0.502, "loss_ce": 0.008483792655169964, "loss_lvr": 0.7861527800559998, "loss_mode_switch": 0.0, "loss_total": 0.08709906786680222, "step": 1255 }, { "batch_size": 4, "epoch": 0.502, "step": 1255, "tokens_per_device": 1364 }, { "epoch": 0.502, "loss_ce": 0.1684625893831253, "loss_lvr": 1.5333967208862305, "loss_mode_switch": 0.0, "loss_total": 0.3218022584915161, "step": 1255 }, { "batch_size": 4, "epoch": 0.502, "step": 1255, "tokens_per_device": 4356 }, { "epoch": 0.502, "loss_ce": 0.20025146007537842, "loss_lvr": 0.7728555202484131, "loss_mode_switch": 0.0, "loss_total": 0.2775370180606842, "step": 1255 }, { "batch_size": 4, "epoch": 0.502, "step": 1255, "tokens_per_device": 4704 }, { "epoch": 0.502, "loss_ce": 0.6709469556808472, "loss_lvr": 0.8783740401268005, "loss_mode_switch": 0.0, "loss_total": 0.7587843537330627, "step": 1255 }, { "batch_size": 1, "epoch": 0.502, "step": 1255, "tokens_per_device": 5132 }, { "epoch": 0.502, "loss_ce": 0.015737643465399742, "loss_lvr": 0.21205675601959229, "loss_mode_switch": 0.0, "loss_total": 0.03694332018494606, "step": 1255 }, { "batch_size": 4, "epoch": 0.502, "step": 1255, "tokens_per_device": 4344 }, { "epoch": 0.502, "loss_ce": 0.38608768582344055, "loss_lvr": 0.8554596900939941, "loss_mode_switch": 0.0, "loss_total": 0.4716336727142334, "step": 1255 }, { "batch_size": 4, "epoch": 0.502, "step": 1255, "tokens_per_device": 4216 }, { "epoch": 0.502, "loss_ce": 0.10889651626348495, "loss_lvr": 0.7285365462303162, "loss_mode_switch": 0.0, "loss_total": 0.18175017833709717, "step": 1255 }, { "batch_size": 4, "epoch": 0.502, "step": 1255, "tokens_per_device": 1256 }, { "epoch": 0.502, "loss_ce": 0.3095278739929199, "loss_lvr": 1.0999044179916382, "loss_mode_switch": 0.0, "loss_total": 0.4195183217525482, "step": 1255 }, { "epoch": 0.5024, "grad_norm": 1.331618309020996, "learning_rate": 5.2039849570542e-06, "loss": 0.282, "step": 1256 }, { "batch_size": 4, "epoch": 0.5024, "step": 1256, "tokens_per_device": 2168 }, { "epoch": 0.5024, "loss_ce": 0.28311723470687866, "loss_lvr": 2.159708023071289, "loss_mode_switch": 0.0, "loss_total": 0.4990880489349365, "step": 1256 }, { "batch_size": 4, "epoch": 0.5024, "step": 1256, "tokens_per_device": 5016 }, { "epoch": 0.5024, "loss_ce": 0.3907753527164459, "loss_lvr": 0.7901150584220886, "loss_mode_switch": 0.0, "loss_total": 0.4697868525981903, "step": 1256 }, { "batch_size": 4, "epoch": 0.5024, "step": 1256, "tokens_per_device": 4216 }, { "epoch": 0.5024, "loss_ce": 0.12976551055908203, "loss_lvr": 0.6035853624343872, "loss_mode_switch": 0.0, "loss_total": 0.190124049782753, "step": 1256 }, { "batch_size": 4, "epoch": 0.5024, "step": 1256, "tokens_per_device": 4248 }, { "epoch": 0.5024, "loss_ce": 0.5652429461479187, "loss_lvr": 2.009814739227295, "loss_mode_switch": 0.0, "loss_total": 0.7662244439125061, "step": 1256 }, { "batch_size": 4, "epoch": 0.5024, "step": 1256, "tokens_per_device": 8740 }, { "epoch": 0.5024, "loss_ce": 0.3083380162715912, "loss_lvr": 0.491288959980011, "loss_mode_switch": 0.0, "loss_total": 0.3574669063091278, "step": 1256 }, { "batch_size": 4, "epoch": 0.5024, "step": 1256, "tokens_per_device": 1408 }, { "epoch": 0.5024, "loss_ce": 0.25305503606796265, "loss_lvr": 1.0180675983428955, "loss_mode_switch": 0.0, "loss_total": 0.3548617959022522, "step": 1256 }, { "batch_size": 1, "epoch": 0.5024, "step": 1256, "tokens_per_device": 4876 }, { "epoch": 0.5024, "loss_ce": 0.004140698350965977, "loss_lvr": 0.7778338193893433, "loss_mode_switch": 0.0, "loss_total": 0.08192408084869385, "step": 1256 }, { "batch_size": 1, "epoch": 0.5024, "step": 1256, "tokens_per_device": 4250 }, { "epoch": 0.5024, "loss_ce": 0.07910090684890747, "loss_lvr": 0.2715303897857666, "loss_mode_switch": 0.0, "loss_total": 0.10625394433736801, "step": 1256 }, { "epoch": 0.5028, "grad_norm": 1.1806361675262451, "learning_rate": 5.197512669873482e-06, "loss": 0.2867, "step": 1257 }, { "batch_size": 4, "epoch": 0.5028, "step": 1257, "tokens_per_device": 5004 }, { "epoch": 0.5028, "loss_ce": 0.006204810459166765, "loss_lvr": 1.0219591856002808, "loss_mode_switch": 0.0, "loss_total": 0.10840073227882385, "step": 1257 }, { "batch_size": 4, "epoch": 0.5028, "step": 1257, "tokens_per_device": 4952 }, { "epoch": 0.5028, "loss_ce": 0.11326325684785843, "loss_lvr": 0.8895863890647888, "loss_mode_switch": 0.0, "loss_total": 0.20222190022468567, "step": 1257 }, { "batch_size": 4, "epoch": 0.5028, "step": 1257, "tokens_per_device": 2628 }, { "epoch": 0.5028, "loss_ce": 0.26335862278938293, "loss_lvr": 0.7268317341804504, "loss_mode_switch": 0.0, "loss_total": 0.33604180812835693, "step": 1257 }, { "batch_size": 4, "epoch": 0.5028, "step": 1257, "tokens_per_device": 6964 }, { "epoch": 0.5028, "loss_ce": 0.31481611728668213, "loss_lvr": 0.6403909921646118, "loss_mode_switch": 0.0, "loss_total": 0.37885522842407227, "step": 1257 }, { "batch_size": 4, "epoch": 0.5028, "step": 1257, "tokens_per_device": 1476 }, { "epoch": 0.5028, "loss_ce": 0.37623852491378784, "loss_lvr": 1.0325617790222168, "loss_mode_switch": 0.0, "loss_total": 0.47949469089508057, "step": 1257 }, { "batch_size": 1, "epoch": 0.5028, "step": 1257, "tokens_per_device": 4892 }, { "epoch": 0.5028, "loss_ce": 0.03903854638338089, "loss_lvr": 0.8244606256484985, "loss_mode_switch": 0.0, "loss_total": 0.12148460745811462, "step": 1257 }, { "batch_size": 4, "epoch": 0.5028, "step": 1257, "tokens_per_device": 1532 }, { "epoch": 0.5028, "loss_ce": 0.44841688871383667, "loss_lvr": 0.9207038879394531, "loss_mode_switch": 0.0, "loss_total": 0.5404872894287109, "step": 1257 }, { "batch_size": 4, "epoch": 0.5028, "step": 1257, "tokens_per_device": 10296 }, { "epoch": 0.5028, "loss_ce": 0.39096057415008545, "loss_lvr": 0.632689893245697, "loss_mode_switch": 0.0, "loss_total": 0.45422956347465515, "step": 1257 }, { "epoch": 0.5032, "grad_norm": 1.3166435956954956, "learning_rate": 5.1910400512022084e-06, "loss": 0.2761, "step": 1258 }, { "batch_size": 4, "epoch": 0.5032, "step": 1258, "tokens_per_device": 1560 }, { "epoch": 0.5032, "loss_ce": 0.04365115240216255, "loss_lvr": 0.960932731628418, "loss_mode_switch": 0.0, "loss_total": 0.13974443078041077, "step": 1258 }, { "batch_size": 1, "epoch": 0.5032, "step": 1258, "tokens_per_device": 4889 }, { "epoch": 0.5032, "loss_ce": 0.009085755795240402, "loss_lvr": 0.5215235352516174, "loss_mode_switch": 0.0, "loss_total": 0.061238110065460205, "step": 1258 }, { "batch_size": 4, "epoch": 0.5032, "step": 1258, "tokens_per_device": 2720 }, { "epoch": 0.5032, "loss_ce": 0.21910341084003448, "loss_lvr": 0.8787679672241211, "loss_mode_switch": 0.0, "loss_total": 0.3069801926612854, "step": 1258 }, { "batch_size": 1, "epoch": 0.5032, "step": 1258, "tokens_per_device": 4786 }, { "epoch": 0.5032, "loss_ce": 0.08086292445659637, "loss_lvr": 0.18355785310268402, "loss_mode_switch": 0.0, "loss_total": 0.0992187112569809, "step": 1258 }, { "batch_size": 4, "epoch": 0.5032, "step": 1258, "tokens_per_device": 3944 }, { "epoch": 0.5032, "loss_ce": 0.07225674390792847, "loss_lvr": 0.9843127131462097, "loss_mode_switch": 0.0, "loss_total": 0.17068801820278168, "step": 1258 }, { "batch_size": 4, "epoch": 0.5032, "step": 1258, "tokens_per_device": 2728 }, { "epoch": 0.5032, "loss_ce": 0.08985793590545654, "loss_lvr": 1.0868899822235107, "loss_mode_switch": 0.0, "loss_total": 0.19854694604873657, "step": 1258 }, { "batch_size": 4, "epoch": 0.5032, "step": 1258, "tokens_per_device": 5412 }, { "epoch": 0.5032, "loss_ce": 0.6038070917129517, "loss_lvr": 0.6710597276687622, "loss_mode_switch": 0.0, "loss_total": 0.67091304063797, "step": 1258 }, { "batch_size": 4, "epoch": 0.5032, "step": 1258, "tokens_per_device": 3812 }, { "epoch": 0.5032, "loss_ce": 0.1347212791442871, "loss_lvr": 0.7896716594696045, "loss_mode_switch": 0.0, "loss_total": 0.2136884480714798, "step": 1258 }, { "epoch": 0.5036, "grad_norm": 1.0852380990982056, "learning_rate": 5.184567111903541e-06, "loss": 0.2542, "step": 1259 }, { "batch_size": 4, "epoch": 0.5036, "step": 1259, "tokens_per_device": 4408 }, { "epoch": 0.5036, "loss_ce": 0.028900256380438805, "loss_lvr": 0.831727921962738, "loss_mode_switch": 0.0, "loss_total": 0.11207304894924164, "step": 1259 }, { "batch_size": 4, "epoch": 0.5036, "step": 1259, "tokens_per_device": 6776 }, { "epoch": 0.5036, "loss_ce": 0.0974356085062027, "loss_lvr": 0.7651461958885193, "loss_mode_switch": 0.0, "loss_total": 0.1739502251148224, "step": 1259 }, { "batch_size": 4, "epoch": 0.5036, "step": 1259, "tokens_per_device": 5352 }, { "epoch": 0.5036, "loss_ce": 0.4919074475765228, "loss_lvr": 0.7759349346160889, "loss_mode_switch": 0.0, "loss_total": 0.5695009231567383, "step": 1259 }, { "batch_size": 4, "epoch": 0.5036, "step": 1259, "tokens_per_device": 2640 }, { "epoch": 0.5036, "loss_ce": 0.07774710655212402, "loss_lvr": 0.916477382183075, "loss_mode_switch": 0.0, "loss_total": 0.169394850730896, "step": 1259 }, { "batch_size": 1, "epoch": 0.5036, "step": 1259, "tokens_per_device": 4884 }, { "epoch": 0.5036, "loss_ce": 0.0012651234865188599, "loss_lvr": 0.41936758160591125, "loss_mode_switch": 0.0, "loss_total": 0.043201882392168045, "step": 1259 }, { "batch_size": 4, "epoch": 0.5036, "step": 1259, "tokens_per_device": 2540 }, { "epoch": 0.5036, "loss_ce": 0.46269071102142334, "loss_lvr": 1.023860216140747, "loss_mode_switch": 0.0, "loss_total": 0.5650767087936401, "step": 1259 }, { "batch_size": 1, "epoch": 0.5036, "step": 1259, "tokens_per_device": 4900 }, { "epoch": 0.5036, "loss_ce": 0.006661882158368826, "loss_lvr": 0.2281992882490158, "loss_mode_switch": 0.0, "loss_total": 0.029481811448931694, "step": 1259 }, { "batch_size": 1, "epoch": 0.5036, "step": 1259, "tokens_per_device": 4922 }, { "epoch": 0.5036, "loss_ce": 0.12804323434829712, "loss_lvr": 0.3284967541694641, "loss_mode_switch": 0.0, "loss_total": 0.16089290380477905, "step": 1259 }, { "epoch": 0.504, "grad_norm": 1.3190491199493408, "learning_rate": 5.1780938628411795e-06, "loss": 0.2968, "step": 1260 }, { "batch_size": 4, "epoch": 0.504, "step": 1260, "tokens_per_device": 2540 }, { "epoch": 0.504, "loss_ce": 0.18196609616279602, "loss_lvr": 0.9073417782783508, "loss_mode_switch": 0.0, "loss_total": 0.2727002799510956, "step": 1260 }, { "batch_size": 4, "epoch": 0.504, "step": 1260, "tokens_per_device": 2600 }, { "epoch": 0.504, "loss_ce": 0.15507782995700836, "loss_lvr": 0.7158122658729553, "loss_mode_switch": 0.0, "loss_total": 0.22665905952453613, "step": 1260 }, { "batch_size": 1, "epoch": 0.504, "step": 1260, "tokens_per_device": 4915 }, { "epoch": 0.504, "loss_ce": 0.015443583019077778, "loss_lvr": 0.5625868439674377, "loss_mode_switch": 0.0, "loss_total": 0.0717022716999054, "step": 1260 }, { "batch_size": 4, "epoch": 0.504, "step": 1260, "tokens_per_device": 4548 }, { "epoch": 0.504, "loss_ce": 0.07921433448791504, "loss_lvr": 1.024945855140686, "loss_mode_switch": 0.0, "loss_total": 0.1817089319229126, "step": 1260 }, { "batch_size": 4, "epoch": 0.504, "step": 1260, "tokens_per_device": 2596 }, { "epoch": 0.504, "loss_ce": 0.34897008538246155, "loss_lvr": 0.821329653263092, "loss_mode_switch": 0.0, "loss_total": 0.43110305070877075, "step": 1260 }, { "batch_size": 1, "epoch": 0.504, "step": 1260, "tokens_per_device": 4889 }, { "epoch": 0.504, "loss_ce": 0.5297210216522217, "loss_lvr": 1.0590251684188843, "loss_mode_switch": 0.0, "loss_total": 0.6356235146522522, "step": 1260 }, { "batch_size": 1, "epoch": 0.504, "step": 1260, "tokens_per_device": 4311 }, { "epoch": 0.504, "loss_ce": 0.29354017972946167, "loss_lvr": 0.5285886526107788, "loss_mode_switch": 0.0, "loss_total": 0.3463990390300751, "step": 1260 }, { "batch_size": 4, "epoch": 0.504, "step": 1260, "tokens_per_device": 7272 }, { "epoch": 0.504, "loss_ce": 0.19035625457763672, "loss_lvr": 0.6375179886817932, "loss_mode_switch": 0.0, "loss_total": 0.2541080713272095, "step": 1260 }, { "epoch": 0.5044, "grad_norm": 1.308645248413086, "learning_rate": 5.171620314879342e-06, "loss": 0.264, "step": 1261 }, { "batch_size": 4, "epoch": 0.5044, "step": 1261, "tokens_per_device": 5856 }, { "epoch": 0.5044, "loss_ce": 0.42187684774398804, "loss_lvr": 0.7875457406044006, "loss_mode_switch": 0.0, "loss_total": 0.5006314516067505, "step": 1261 }, { "batch_size": 4, "epoch": 0.5044, "step": 1261, "tokens_per_device": 4396 }, { "epoch": 0.5044, "loss_ce": 0.37570783495903015, "loss_lvr": 1.11569082736969, "loss_mode_switch": 0.0, "loss_total": 0.48727691173553467, "step": 1261 }, { "batch_size": 1, "epoch": 0.5044, "step": 1261, "tokens_per_device": 5131 }, { "epoch": 0.5044, "loss_ce": 0.31034502387046814, "loss_lvr": 1.2319937944412231, "loss_mode_switch": 0.0, "loss_total": 0.433544397354126, "step": 1261 }, { "batch_size": 4, "epoch": 0.5044, "step": 1261, "tokens_per_device": 1416 }, { "epoch": 0.5044, "loss_ce": 0.7300043702125549, "loss_lvr": 0.9005862474441528, "loss_mode_switch": 0.0, "loss_total": 0.8200629949569702, "step": 1261 }, { "batch_size": 4, "epoch": 0.5044, "step": 1261, "tokens_per_device": 13316 }, { "epoch": 0.5044, "loss_ce": 0.15480639040470123, "loss_lvr": 0.7511373162269592, "loss_mode_switch": 0.0, "loss_total": 0.22992011904716492, "step": 1261 }, { "batch_size": 4, "epoch": 0.5044, "step": 1261, "tokens_per_device": 1380 }, { "epoch": 0.5044, "loss_ce": 0.7911098599433899, "loss_lvr": 1.1956555843353271, "loss_mode_switch": 0.0, "loss_total": 0.9106754064559937, "step": 1261 }, { "batch_size": 4, "epoch": 0.5044, "step": 1261, "tokens_per_device": 1356 }, { "epoch": 0.5044, "loss_ce": 0.13795693218708038, "loss_lvr": 1.0571330785751343, "loss_mode_switch": 0.0, "loss_total": 0.2436702400445938, "step": 1261 }, { "batch_size": 1, "epoch": 0.5044, "step": 1261, "tokens_per_device": 4789 }, { "epoch": 0.5044, "loss_ce": 0.0010108177084475756, "loss_lvr": 0.325005441904068, "loss_mode_switch": 0.0, "loss_total": 0.033511362969875336, "step": 1261 }, { "epoch": 0.5048, "grad_norm": 1.3229902982711792, "learning_rate": 5.165146478882751e-06, "loss": 0.2987, "step": 1262 }, { "batch_size": 4, "epoch": 0.5048, "step": 1262, "tokens_per_device": 11300 }, { "epoch": 0.5048, "loss_ce": 0.25106489658355713, "loss_lvr": 1.3025234937667847, "loss_mode_switch": 0.0, "loss_total": 0.38131725788116455, "step": 1262 }, { "batch_size": 1, "epoch": 0.5048, "step": 1262, "tokens_per_device": 4894 }, { "epoch": 0.5048, "loss_ce": 0.06602291762828827, "loss_lvr": 0.694060742855072, "loss_mode_switch": 0.0, "loss_total": 0.1354289948940277, "step": 1262 }, { "batch_size": 4, "epoch": 0.5048, "step": 1262, "tokens_per_device": 4416 }, { "epoch": 0.5048, "loss_ce": 0.27481046319007874, "loss_lvr": 0.7362586259841919, "loss_mode_switch": 0.0, "loss_total": 0.3484363257884979, "step": 1262 }, { "batch_size": 4, "epoch": 0.5048, "step": 1262, "tokens_per_device": 2704 }, { "epoch": 0.5048, "loss_ce": 0.47599998116493225, "loss_lvr": 1.492125153541565, "loss_mode_switch": 0.0, "loss_total": 0.6252124905586243, "step": 1262 }, { "batch_size": 4, "epoch": 0.5048, "step": 1262, "tokens_per_device": 3752 }, { "epoch": 0.5048, "loss_ce": 0.021041657775640488, "loss_lvr": 0.7546491026878357, "loss_mode_switch": 0.0, "loss_total": 0.09650656580924988, "step": 1262 }, { "batch_size": 1, "epoch": 0.5048, "step": 1262, "tokens_per_device": 4905 }, { "epoch": 0.5048, "loss_ce": 0.30630192160606384, "loss_lvr": 1.2542093992233276, "loss_mode_switch": 0.0, "loss_total": 0.43172287940979004, "step": 1262 }, { "batch_size": 4, "epoch": 0.5048, "step": 1262, "tokens_per_device": 4440 }, { "epoch": 0.5048, "loss_ce": 0.28468725085258484, "loss_lvr": 0.8724362254142761, "loss_mode_switch": 0.0, "loss_total": 0.371930867433548, "step": 1262 }, { "batch_size": 4, "epoch": 0.5048, "step": 1262, "tokens_per_device": 6036 }, { "epoch": 0.5048, "loss_ce": 0.07788404077291489, "loss_lvr": 0.7462394833564758, "loss_mode_switch": 0.0, "loss_total": 0.1525079905986786, "step": 1262 }, { "epoch": 0.5052, "grad_norm": 1.2447381019592285, "learning_rate": 5.158672365716608e-06, "loss": 0.2963, "step": 1263 }, { "batch_size": 1, "epoch": 0.5052, "step": 1263, "tokens_per_device": 4902 }, { "epoch": 0.5052, "loss_ce": 0.00027544525801204145, "loss_lvr": 0.4456852674484253, "loss_mode_switch": 0.0, "loss_total": 0.044843971729278564, "step": 1263 }, { "batch_size": 1, "epoch": 0.5052, "step": 1263, "tokens_per_device": 7613 }, { "epoch": 0.5052, "loss_ce": 0.04164501279592514, "loss_lvr": 0.3685140013694763, "loss_mode_switch": 0.0, "loss_total": 0.07849641144275665, "step": 1263 }, { "batch_size": 4, "epoch": 0.5052, "step": 1263, "tokens_per_device": 5720 }, { "epoch": 0.5052, "loss_ce": 0.21741074323654175, "loss_lvr": 0.781130850315094, "loss_mode_switch": 0.0, "loss_total": 0.29552382230758667, "step": 1263 }, { "batch_size": 4, "epoch": 0.5052, "step": 1263, "tokens_per_device": 2716 }, { "epoch": 0.5052, "loss_ce": 0.2955683469772339, "loss_lvr": 0.9382742047309875, "loss_mode_switch": 0.0, "loss_total": 0.3893957734107971, "step": 1263 }, { "batch_size": 1, "epoch": 0.5052, "step": 1263, "tokens_per_device": 5136 }, { "epoch": 0.5052, "loss_ce": 0.20623481273651123, "loss_lvr": 0.42014098167419434, "loss_mode_switch": 0.0, "loss_total": 0.2482489049434662, "step": 1263 }, { "batch_size": 4, "epoch": 0.5052, "step": 1263, "tokens_per_device": 8040 }, { "epoch": 0.5052, "loss_ce": 0.2122013419866562, "loss_lvr": 1.022398829460144, "loss_mode_switch": 0.0, "loss_total": 0.3144412338733673, "step": 1263 }, { "batch_size": 1, "epoch": 0.5052, "step": 1263, "tokens_per_device": 5150 }, { "epoch": 0.5052, "loss_ce": 0.07984582334756851, "loss_lvr": 0.5275067687034607, "loss_mode_switch": 0.0, "loss_total": 0.13259649276733398, "step": 1263 }, { "batch_size": 4, "epoch": 0.5052, "step": 1263, "tokens_per_device": 10872 }, { "epoch": 0.5052, "loss_ce": 0.6900545358657837, "loss_lvr": 0.5428199172019958, "loss_mode_switch": 0.0, "loss_total": 0.7443365454673767, "step": 1263 }, { "epoch": 0.5056, "grad_norm": 1.4384100437164307, "learning_rate": 5.152197986246586e-06, "loss": 0.3155, "step": 1264 }, { "batch_size": 4, "epoch": 0.5056, "step": 1264, "tokens_per_device": 4620 }, { "epoch": 0.5056, "loss_ce": 0.053827572613954544, "loss_lvr": 0.7864295840263367, "loss_mode_switch": 0.0, "loss_total": 0.1324705332517624, "step": 1264 }, { "batch_size": 4, "epoch": 0.5056, "step": 1264, "tokens_per_device": 4500 }, { "epoch": 0.5056, "loss_ce": 0.0579533725976944, "loss_lvr": 0.8176563382148743, "loss_mode_switch": 0.0, "loss_total": 0.13971900939941406, "step": 1264 }, { "batch_size": 1, "epoch": 0.5056, "step": 1264, "tokens_per_device": 6695 }, { "epoch": 0.5056, "loss_ce": 0.06036053225398064, "loss_lvr": 0.2871420383453369, "loss_mode_switch": 0.0, "loss_total": 0.08907473832368851, "step": 1264 }, { "batch_size": 1, "epoch": 0.5056, "step": 1264, "tokens_per_device": 4860 }, { "epoch": 0.5056, "loss_ce": 0.00021657490287907422, "loss_lvr": 0.2171432077884674, "loss_mode_switch": 0.0, "loss_total": 0.021930895745754242, "step": 1264 }, { "batch_size": 1, "epoch": 0.5056, "step": 1264, "tokens_per_device": 5066 }, { "epoch": 0.5056, "loss_ce": 0.1218426525592804, "loss_lvr": 0.29740750789642334, "loss_mode_switch": 0.0, "loss_total": 0.15158340334892273, "step": 1264 }, { "batch_size": 1, "epoch": 0.5056, "step": 1264, "tokens_per_device": 4901 }, { "epoch": 0.5056, "loss_ce": 0.1478678286075592, "loss_lvr": 0.13849817216396332, "loss_mode_switch": 0.0, "loss_total": 0.16171765327453613, "step": 1264 }, { "batch_size": 4, "epoch": 0.5056, "step": 1264, "tokens_per_device": 3484 }, { "epoch": 0.5056, "loss_ce": 0.21198607981204987, "loss_lvr": 0.7603420615196228, "loss_mode_switch": 0.0, "loss_total": 0.2880202829837799, "step": 1264 }, { "batch_size": 4, "epoch": 0.5056, "step": 1264, "tokens_per_device": 4276 }, { "epoch": 0.5056, "loss_ce": 0.006740679033100605, "loss_lvr": 0.8943547606468201, "loss_mode_switch": 0.0, "loss_total": 0.0961761623620987, "step": 1264 }, { "epoch": 0.506, "grad_norm": 1.4652471542358398, "learning_rate": 5.145723351338799e-06, "loss": 0.3163, "step": 1265 }, { "batch_size": 4, "epoch": 0.506, "step": 1265, "tokens_per_device": 1348 }, { "epoch": 0.506, "loss_ce": 0.27145734429359436, "loss_lvr": 0.9314621090888977, "loss_mode_switch": 0.0, "loss_total": 0.36460354924201965, "step": 1265 }, { "batch_size": 4, "epoch": 0.506, "step": 1265, "tokens_per_device": 5728 }, { "epoch": 0.506, "loss_ce": 0.06315673887729645, "loss_lvr": 0.5930163264274597, "loss_mode_switch": 0.0, "loss_total": 0.12245836853981018, "step": 1265 }, { "batch_size": 4, "epoch": 0.506, "step": 1265, "tokens_per_device": 6272 }, { "epoch": 0.506, "loss_ce": 0.3696948289871216, "loss_lvr": 0.8485847115516663, "loss_mode_switch": 0.0, "loss_total": 0.4545533061027527, "step": 1265 }, { "batch_size": 4, "epoch": 0.506, "step": 1265, "tokens_per_device": 4116 }, { "epoch": 0.506, "loss_ce": 0.6385741233825684, "loss_lvr": 0.601850688457489, "loss_mode_switch": 0.0, "loss_total": 0.6987591981887817, "step": 1265 }, { "batch_size": 1, "epoch": 0.506, "step": 1265, "tokens_per_device": 5107 }, { "epoch": 0.506, "loss_ce": 0.01580525003373623, "loss_lvr": 0.4094506800174713, "loss_mode_switch": 0.0, "loss_total": 0.05675031989812851, "step": 1265 }, { "batch_size": 4, "epoch": 0.506, "step": 1265, "tokens_per_device": 1492 }, { "epoch": 0.506, "loss_ce": 0.663709282875061, "loss_lvr": 0.8989574313163757, "loss_mode_switch": 0.0, "loss_total": 0.7536050081253052, "step": 1265 }, { "batch_size": 4, "epoch": 0.506, "step": 1265, "tokens_per_device": 4380 }, { "epoch": 0.506, "loss_ce": 0.05032387748360634, "loss_lvr": 0.6879316568374634, "loss_mode_switch": 0.0, "loss_total": 0.11911705136299133, "step": 1265 }, { "batch_size": 1, "epoch": 0.506, "step": 1265, "tokens_per_device": 5622 }, { "epoch": 0.506, "loss_ce": 0.000288401119178161, "loss_lvr": 0.3847065567970276, "loss_mode_switch": 0.0, "loss_total": 0.038759056478738785, "step": 1265 }, { "epoch": 0.5064, "grad_norm": 1.2258492708206177, "learning_rate": 5.139248471859793e-06, "loss": 0.2691, "step": 1266 }, { "batch_size": 1, "epoch": 0.5064, "step": 1266, "tokens_per_device": 5112 }, { "epoch": 0.5064, "loss_ce": 0.32079073786735535, "loss_lvr": 0.6145823001861572, "loss_mode_switch": 0.0, "loss_total": 0.38224896788597107, "step": 1266 }, { "batch_size": 4, "epoch": 0.5064, "step": 1266, "tokens_per_device": 6224 }, { "epoch": 0.5064, "loss_ce": 0.03914586454629898, "loss_lvr": 0.8047150373458862, "loss_mode_switch": 0.0, "loss_total": 0.11961737275123596, "step": 1266 }, { "batch_size": 1, "epoch": 0.5064, "step": 1266, "tokens_per_device": 6817 }, { "epoch": 0.5064, "loss_ce": 0.44931548833847046, "loss_lvr": 0.3524642288684845, "loss_mode_switch": 0.0, "loss_total": 0.4845619201660156, "step": 1266 }, { "batch_size": 4, "epoch": 0.5064, "step": 1266, "tokens_per_device": 6888 }, { "epoch": 0.5064, "loss_ce": 0.21521113812923431, "loss_lvr": 0.7914589047431946, "loss_mode_switch": 0.0, "loss_total": 0.294357031583786, "step": 1266 }, { "batch_size": 4, "epoch": 0.5064, "step": 1266, "tokens_per_device": 5092 }, { "epoch": 0.5064, "loss_ce": 0.07468894124031067, "loss_lvr": 0.6535876393318176, "loss_mode_switch": 0.0, "loss_total": 0.14004769921302795, "step": 1266 }, { "batch_size": 1, "epoch": 0.5064, "step": 1266, "tokens_per_device": 4372 }, { "epoch": 0.5064, "loss_ce": 0.003643054747954011, "loss_lvr": 0.5002180337905884, "loss_mode_switch": 0.0, "loss_total": 0.053664859384298325, "step": 1266 }, { "batch_size": 4, "epoch": 0.5064, "step": 1266, "tokens_per_device": 5928 }, { "epoch": 0.5064, "loss_ce": 0.007242200896143913, "loss_lvr": 0.7924142479896545, "loss_mode_switch": 0.0, "loss_total": 0.08648362755775452, "step": 1266 }, { "batch_size": 4, "epoch": 0.5064, "step": 1266, "tokens_per_device": 1300 }, { "epoch": 0.5064, "loss_ce": 0.9021437764167786, "loss_lvr": 1.1266148090362549, "loss_mode_switch": 0.0, "loss_total": 1.0148053169250488, "step": 1266 }, { "epoch": 0.5068, "grad_norm": 1.5224446058273315, "learning_rate": 5.1327733586765205e-06, "loss": 0.2989, "step": 1267 }, { "batch_size": 4, "epoch": 0.5068, "step": 1267, "tokens_per_device": 4936 }, { "epoch": 0.5068, "loss_ce": 0.2776879370212555, "loss_lvr": 1.0133016109466553, "loss_mode_switch": 0.0, "loss_total": 0.379018098115921, "step": 1267 }, { "batch_size": 4, "epoch": 0.5068, "step": 1267, "tokens_per_device": 3856 }, { "epoch": 0.5068, "loss_ce": 0.5122328996658325, "loss_lvr": 0.9015990495681763, "loss_mode_switch": 0.0, "loss_total": 0.6023927927017212, "step": 1267 }, { "batch_size": 1, "epoch": 0.5068, "step": 1267, "tokens_per_device": 4885 }, { "epoch": 0.5068, "loss_ce": 0.11971403658390045, "loss_lvr": 0.16102075576782227, "loss_mode_switch": 0.0, "loss_total": 0.13581611216068268, "step": 1267 }, { "batch_size": 4, "epoch": 0.5068, "step": 1267, "tokens_per_device": 8220 }, { "epoch": 0.5068, "loss_ce": 0.040695857256650925, "loss_lvr": 0.6416444778442383, "loss_mode_switch": 0.0, "loss_total": 0.10486030578613281, "step": 1267 }, { "batch_size": 1, "epoch": 0.5068, "step": 1267, "tokens_per_device": 5112 }, { "epoch": 0.5068, "loss_ce": 0.02419336326420307, "loss_lvr": 0.33834731578826904, "loss_mode_switch": 0.0, "loss_total": 0.058028094470500946, "step": 1267 }, { "batch_size": 4, "epoch": 0.5068, "step": 1267, "tokens_per_device": 2700 }, { "epoch": 0.5068, "loss_ce": 0.33209750056266785, "loss_lvr": 0.9653916358947754, "loss_mode_switch": 0.0, "loss_total": 0.42863667011260986, "step": 1267 }, { "batch_size": 4, "epoch": 0.5068, "step": 1267, "tokens_per_device": 8620 }, { "epoch": 0.5068, "loss_ce": 0.21183960139751434, "loss_lvr": 0.653827965259552, "loss_mode_switch": 0.0, "loss_total": 0.2772223949432373, "step": 1267 }, { "batch_size": 1, "epoch": 0.5068, "step": 1267, "tokens_per_device": 5125 }, { "epoch": 0.5068, "loss_ce": 0.03729165345430374, "loss_lvr": 0.46637970209121704, "loss_mode_switch": 0.0, "loss_total": 0.0839296281337738, "step": 1267 }, { "epoch": 0.5072, "grad_norm": 1.2823283672332764, "learning_rate": 5.126298022656333e-06, "loss": 0.2599, "step": 1268 }, { "batch_size": 4, "epoch": 0.5072, "step": 1268, "tokens_per_device": 2664 }, { "epoch": 0.5072, "loss_ce": 0.17339475452899933, "loss_lvr": 0.8237941265106201, "loss_mode_switch": 0.0, "loss_total": 0.2557741701602936, "step": 1268 }, { "batch_size": 4, "epoch": 0.5072, "step": 1268, "tokens_per_device": 3792 }, { "epoch": 0.5072, "loss_ce": 0.0822274386882782, "loss_lvr": 0.9330429434776306, "loss_mode_switch": 0.0, "loss_total": 0.17553174495697021, "step": 1268 }, { "batch_size": 4, "epoch": 0.5072, "step": 1268, "tokens_per_device": 8960 }, { "epoch": 0.5072, "loss_ce": 0.022916987538337708, "loss_lvr": 0.5662114024162292, "loss_mode_switch": 0.0, "loss_total": 0.07953812927007675, "step": 1268 }, { "batch_size": 4, "epoch": 0.5072, "step": 1268, "tokens_per_device": 1528 }, { "epoch": 0.5072, "loss_ce": 0.12706227600574493, "loss_lvr": 0.9714037179946899, "loss_mode_switch": 0.0, "loss_total": 0.22420264780521393, "step": 1268 }, { "batch_size": 1, "epoch": 0.5072, "step": 1268, "tokens_per_device": 7501 }, { "epoch": 0.5072, "loss_ce": 0.13631796836853027, "loss_lvr": 0.24175925552845, "loss_mode_switch": 0.0, "loss_total": 0.1604938954114914, "step": 1268 }, { "batch_size": 4, "epoch": 0.5072, "step": 1268, "tokens_per_device": 3380 }, { "epoch": 0.5072, "loss_ce": 0.11961787194013596, "loss_lvr": 0.9938883781433105, "loss_mode_switch": 0.0, "loss_total": 0.2190067172050476, "step": 1268 }, { "batch_size": 1, "epoch": 0.5072, "step": 1268, "tokens_per_device": 5439 }, { "epoch": 0.5072, "loss_ce": 0.005197342950850725, "loss_lvr": 0.26682737469673157, "loss_mode_switch": 0.0, "loss_total": 0.031880080699920654, "step": 1268 }, { "batch_size": 4, "epoch": 0.5072, "step": 1268, "tokens_per_device": 7264 }, { "epoch": 0.5072, "loss_ce": 0.0013846077490597963, "loss_lvr": 0.9784705638885498, "loss_mode_switch": 0.0, "loss_total": 0.09923166781663895, "step": 1268 }, { "epoch": 0.5076, "grad_norm": 1.4343621730804443, "learning_rate": 5.11982247466695e-06, "loss": 0.2752, "step": 1269 }, { "batch_size": 4, "epoch": 0.5076, "step": 1269, "tokens_per_device": 3884 }, { "epoch": 0.5076, "loss_ce": 0.16558055579662323, "loss_lvr": 1.0545902252197266, "loss_mode_switch": 0.0, "loss_total": 0.27103957533836365, "step": 1269 }, { "batch_size": 4, "epoch": 0.5076, "step": 1269, "tokens_per_device": 2700 }, { "epoch": 0.5076, "loss_ce": 0.0506214015185833, "loss_lvr": 0.8413512110710144, "loss_mode_switch": 0.0, "loss_total": 0.13475652039051056, "step": 1269 }, { "batch_size": 4, "epoch": 0.5076, "step": 1269, "tokens_per_device": 1384 }, { "epoch": 0.5076, "loss_ce": 0.4588789641857147, "loss_lvr": 1.160364031791687, "loss_mode_switch": 0.0, "loss_total": 0.57491534948349, "step": 1269 }, { "batch_size": 1, "epoch": 0.5076, "step": 1269, "tokens_per_device": 5170 }, { "epoch": 0.5076, "loss_ce": 0.09285897016525269, "loss_lvr": 0.3660220503807068, "loss_mode_switch": 0.0, "loss_total": 0.1294611692428589, "step": 1269 }, { "batch_size": 4, "epoch": 0.5076, "step": 1269, "tokens_per_device": 4044 }, { "epoch": 0.5076, "loss_ce": 0.16119356453418732, "loss_lvr": 0.8574228286743164, "loss_mode_switch": 0.0, "loss_total": 0.24693584442138672, "step": 1269 }, { "batch_size": 4, "epoch": 0.5076, "step": 1269, "tokens_per_device": 1252 }, { "epoch": 0.5076, "loss_ce": 0.4543575048446655, "loss_lvr": 1.1772687435150146, "loss_mode_switch": 0.0, "loss_total": 0.572084367275238, "step": 1269 }, { "batch_size": 4, "epoch": 0.5076, "step": 1269, "tokens_per_device": 3752 }, { "epoch": 0.5076, "loss_ce": 0.1375872939825058, "loss_lvr": 0.8595456480979919, "loss_mode_switch": 0.0, "loss_total": 0.22354185581207275, "step": 1269 }, { "batch_size": 4, "epoch": 0.5076, "step": 1269, "tokens_per_device": 3808 }, { "epoch": 0.5076, "loss_ce": 0.075222909450531, "loss_lvr": 1.537243127822876, "loss_mode_switch": 0.0, "loss_total": 0.2289472222328186, "step": 1269 }, { "epoch": 0.508, "grad_norm": 3.183234214782715, "learning_rate": 5.11334672557645e-06, "loss": 0.3062, "step": 1270 }, { "batch_size": 4, "epoch": 0.508, "step": 1270, "tokens_per_device": 4228 }, { "epoch": 0.508, "loss_ce": 0.7902899384498596, "loss_lvr": 0.8874250054359436, "loss_mode_switch": 0.0, "loss_total": 0.8790324330329895, "step": 1270 }, { "batch_size": 4, "epoch": 0.508, "step": 1270, "tokens_per_device": 9332 }, { "epoch": 0.508, "loss_ce": 0.02517230622470379, "loss_lvr": 0.3867686688899994, "loss_mode_switch": 0.0, "loss_total": 0.06384917348623276, "step": 1270 }, { "batch_size": 4, "epoch": 0.508, "step": 1270, "tokens_per_device": 1940 }, { "epoch": 0.508, "loss_ce": 0.354679673910141, "loss_lvr": 0.7899735569953918, "loss_mode_switch": 0.0, "loss_total": 0.4336770176887512, "step": 1270 }, { "batch_size": 4, "epoch": 0.508, "step": 1270, "tokens_per_device": 4288 }, { "epoch": 0.508, "loss_ce": 0.17376938462257385, "loss_lvr": 1.9891973733901978, "loss_mode_switch": 0.0, "loss_total": 0.3726891279220581, "step": 1270 }, { "batch_size": 1, "epoch": 0.508, "step": 1270, "tokens_per_device": 5650 }, { "epoch": 0.508, "loss_ce": 0.00047073629684746265, "loss_lvr": 0.5431533455848694, "loss_mode_switch": 0.0, "loss_total": 0.0547860711812973, "step": 1270 }, { "batch_size": 4, "epoch": 0.508, "step": 1270, "tokens_per_device": 11440 }, { "epoch": 0.508, "loss_ce": 0.06263289600610733, "loss_lvr": 0.8133083581924438, "loss_mode_switch": 0.0, "loss_total": 0.14396372437477112, "step": 1270 }, { "batch_size": 4, "epoch": 0.508, "step": 1270, "tokens_per_device": 6532 }, { "epoch": 0.508, "loss_ce": 0.09618807584047318, "loss_lvr": 0.6912235617637634, "loss_mode_switch": 0.0, "loss_total": 0.16531044244766235, "step": 1270 }, { "batch_size": 1, "epoch": 0.508, "step": 1270, "tokens_per_device": 5077 }, { "epoch": 0.508, "loss_ce": 0.01234061922878027, "loss_lvr": 0.19766725599765778, "loss_mode_switch": 0.0, "loss_total": 0.03210734575986862, "step": 1270 }, { "epoch": 0.5084, "grad_norm": 1.2022548913955688, "learning_rate": 5.106870786253248e-06, "loss": 0.2596, "step": 1271 }, { "batch_size": 4, "epoch": 0.5084, "step": 1271, "tokens_per_device": 1224 }, { "epoch": 0.5084, "loss_ce": 0.2930412292480469, "loss_lvr": 1.1545069217681885, "loss_mode_switch": 0.0, "loss_total": 0.40849190950393677, "step": 1271 }, { "batch_size": 1, "epoch": 0.5084, "step": 1271, "tokens_per_device": 4887 }, { "epoch": 0.5084, "loss_ce": 0.018362585455179214, "loss_lvr": 0.31852391362190247, "loss_mode_switch": 0.0, "loss_total": 0.0502149760723114, "step": 1271 }, { "batch_size": 1, "epoch": 0.5084, "step": 1271, "tokens_per_device": 4867 }, { "epoch": 0.5084, "loss_ce": 0.006921404507011175, "loss_lvr": 0.34361082315444946, "loss_mode_switch": 0.0, "loss_total": 0.04128248617053032, "step": 1271 }, { "batch_size": 4, "epoch": 0.5084, "step": 1271, "tokens_per_device": 1208 }, { "epoch": 0.5084, "loss_ce": 0.46840015053749084, "loss_lvr": 1.035049557685852, "loss_mode_switch": 0.0, "loss_total": 0.5719051361083984, "step": 1271 }, { "batch_size": 1, "epoch": 0.5084, "step": 1271, "tokens_per_device": 4771 }, { "epoch": 0.5084, "loss_ce": 0.17288343608379364, "loss_lvr": 0.7676389217376709, "loss_mode_switch": 0.0, "loss_total": 0.249647319316864, "step": 1271 }, { "batch_size": 1, "epoch": 0.5084, "step": 1271, "tokens_per_device": 4935 }, { "epoch": 0.5084, "loss_ce": 0.0584629587829113, "loss_lvr": 0.36337175965309143, "loss_mode_switch": 0.0, "loss_total": 0.09480013698339462, "step": 1271 }, { "batch_size": 4, "epoch": 0.5084, "step": 1271, "tokens_per_device": 5220 }, { "epoch": 0.5084, "loss_ce": 0.05314326286315918, "loss_lvr": 0.8496310114860535, "loss_mode_switch": 0.0, "loss_total": 0.13810637593269348, "step": 1271 }, { "batch_size": 1, "epoch": 0.5084, "step": 1271, "tokens_per_device": 8282 }, { "epoch": 0.5084, "loss_ce": 0.040004413574934006, "loss_lvr": 0.35002601146698, "loss_mode_switch": 0.0, "loss_total": 0.07500701397657394, "step": 1271 }, { "epoch": 0.5088, "grad_norm": 1.1105464696884155, "learning_rate": 5.100394667566079e-06, "loss": 0.2498, "step": 1272 }, { "batch_size": 1, "epoch": 0.5088, "step": 1272, "tokens_per_device": 4872 }, { "epoch": 0.5088, "loss_ce": 0.001985873095691204, "loss_lvr": 0.30093926191329956, "loss_mode_switch": 0.0, "loss_total": 0.032079800963401794, "step": 1272 }, { "batch_size": 4, "epoch": 0.5088, "step": 1272, "tokens_per_device": 13080 }, { "epoch": 0.5088, "loss_ce": 0.05671103671193123, "loss_lvr": 0.765380322933197, "loss_mode_switch": 0.0, "loss_total": 0.13324907422065735, "step": 1272 }, { "batch_size": 4, "epoch": 0.5088, "step": 1272, "tokens_per_device": 1284 }, { "epoch": 0.5088, "loss_ce": 0.16810746490955353, "loss_lvr": 1.0118706226348877, "loss_mode_switch": 0.0, "loss_total": 0.26929453015327454, "step": 1272 }, { "batch_size": 1, "epoch": 0.5088, "step": 1272, "tokens_per_device": 5110 }, { "epoch": 0.5088, "loss_ce": 0.02496185712516308, "loss_lvr": 0.7285646200180054, "loss_mode_switch": 0.0, "loss_total": 0.09781832247972488, "step": 1272 }, { "batch_size": 4, "epoch": 0.5088, "step": 1272, "tokens_per_device": 1708 }, { "epoch": 0.5088, "loss_ce": 0.18945974111557007, "loss_lvr": 0.8377695679664612, "loss_mode_switch": 0.0, "loss_total": 0.2732366919517517, "step": 1272 }, { "batch_size": 1, "epoch": 0.5088, "step": 1272, "tokens_per_device": 4794 }, { "epoch": 0.5088, "loss_ce": 0.2852870225906372, "loss_lvr": 0.5331560373306274, "loss_mode_switch": 0.0, "loss_total": 0.33860263228416443, "step": 1272 }, { "batch_size": 4, "epoch": 0.5088, "step": 1272, "tokens_per_device": 5000 }, { "epoch": 0.5088, "loss_ce": 0.024865780025720596, "loss_lvr": 0.8474813103675842, "loss_mode_switch": 0.0, "loss_total": 0.10961391031742096, "step": 1272 }, { "batch_size": 1, "epoch": 0.5088, "step": 1272, "tokens_per_device": 4841 }, { "epoch": 0.5088, "loss_ce": 0.0031478796154260635, "loss_lvr": 0.43073341250419617, "loss_mode_switch": 0.0, "loss_total": 0.04622121900320053, "step": 1272 }, { "epoch": 0.5092, "grad_norm": 1.3750724792480469, "learning_rate": 5.093918380383977e-06, "loss": 0.2545, "step": 1273 }, { "batch_size": 4, "epoch": 0.5092, "step": 1273, "tokens_per_device": 1248 }, { "epoch": 0.5092, "loss_ce": 0.5231658816337585, "loss_lvr": 0.830098569393158, "loss_mode_switch": 0.0, "loss_total": 0.6061757206916809, "step": 1273 }, { "batch_size": 1, "epoch": 0.5092, "step": 1273, "tokens_per_device": 5107 }, { "epoch": 0.5092, "loss_ce": 0.06710556149482727, "loss_lvr": 0.32974952459335327, "loss_mode_switch": 0.0, "loss_total": 0.10008051991462708, "step": 1273 }, { "batch_size": 1, "epoch": 0.5092, "step": 1273, "tokens_per_device": 4873 }, { "epoch": 0.5092, "loss_ce": 0.022825300693511963, "loss_lvr": 0.3526305556297302, "loss_mode_switch": 0.0, "loss_total": 0.058088358491659164, "step": 1273 }, { "batch_size": 4, "epoch": 0.5092, "step": 1273, "tokens_per_device": 2864 }, { "epoch": 0.5092, "loss_ce": 0.0771963968873024, "loss_lvr": 0.6794595718383789, "loss_mode_switch": 0.0, "loss_total": 0.1451423466205597, "step": 1273 }, { "batch_size": 4, "epoch": 0.5092, "step": 1273, "tokens_per_device": 6228 }, { "epoch": 0.5092, "loss_ce": 0.5359070301055908, "loss_lvr": 0.7187036275863647, "loss_mode_switch": 0.0, "loss_total": 0.6077774167060852, "step": 1273 }, { "batch_size": 4, "epoch": 0.5092, "step": 1273, "tokens_per_device": 6920 }, { "epoch": 0.5092, "loss_ce": 0.08385664969682693, "loss_lvr": 0.8575237989425659, "loss_mode_switch": 0.0, "loss_total": 0.16960904002189636, "step": 1273 }, { "batch_size": 4, "epoch": 0.5092, "step": 1273, "tokens_per_device": 4336 }, { "epoch": 0.5092, "loss_ce": 0.3823407292366028, "loss_lvr": 0.9086768627166748, "loss_mode_switch": 0.0, "loss_total": 0.4732084274291992, "step": 1273 }, { "batch_size": 4, "epoch": 0.5092, "step": 1273, "tokens_per_device": 4292 }, { "epoch": 0.5092, "loss_ce": 0.24102026224136353, "loss_lvr": 1.2093164920806885, "loss_mode_switch": 0.0, "loss_total": 0.36195191740989685, "step": 1273 }, { "epoch": 0.5096, "grad_norm": 1.3103421926498413, "learning_rate": 5.0874419355762585e-06, "loss": 0.2701, "step": 1274 }, { "batch_size": 4, "epoch": 0.5096, "step": 1274, "tokens_per_device": 4624 }, { "epoch": 0.5096, "loss_ce": 0.4789743423461914, "loss_lvr": 0.7734417915344238, "loss_mode_switch": 0.0, "loss_total": 0.5563185214996338, "step": 1274 }, { "batch_size": 4, "epoch": 0.5096, "step": 1274, "tokens_per_device": 1376 }, { "epoch": 0.5096, "loss_ce": 0.14425386488437653, "loss_lvr": 1.106744647026062, "loss_mode_switch": 0.0, "loss_total": 0.254928320646286, "step": 1274 }, { "batch_size": 4, "epoch": 0.5096, "step": 1274, "tokens_per_device": 5356 }, { "epoch": 0.5096, "loss_ce": 0.12664169073104858, "loss_lvr": 1.2325021028518677, "loss_mode_switch": 0.0, "loss_total": 0.24989190697669983, "step": 1274 }, { "batch_size": 1, "epoch": 0.5096, "step": 1274, "tokens_per_device": 4883 }, { "epoch": 0.5096, "loss_ce": 1.1278005838394165, "loss_lvr": 0.28801342844963074, "loss_mode_switch": 0.0, "loss_total": 1.156601905822754, "step": 1274 }, { "batch_size": 1, "epoch": 0.5096, "step": 1274, "tokens_per_device": 4521 }, { "epoch": 0.5096, "loss_ce": 0.26919952034950256, "loss_lvr": 0.33372607827186584, "loss_mode_switch": 0.0, "loss_total": 0.3025721311569214, "step": 1274 }, { "batch_size": 4, "epoch": 0.5096, "step": 1274, "tokens_per_device": 4372 }, { "epoch": 0.5096, "loss_ce": 0.49498045444488525, "loss_lvr": 0.7765728831291199, "loss_mode_switch": 0.0, "loss_total": 0.5726377367973328, "step": 1274 }, { "batch_size": 4, "epoch": 0.5096, "step": 1274, "tokens_per_device": 3844 }, { "epoch": 0.5096, "loss_ce": 0.6151897311210632, "loss_lvr": 0.8913717865943909, "loss_mode_switch": 0.0, "loss_total": 0.7043269276618958, "step": 1274 }, { "batch_size": 4, "epoch": 0.5096, "step": 1274, "tokens_per_device": 1544 }, { "epoch": 0.5096, "loss_ce": 0.41673871874809265, "loss_lvr": 0.905116856098175, "loss_mode_switch": 0.0, "loss_total": 0.5072504281997681, "step": 1274 }, { "epoch": 0.51, "grad_norm": 1.6424709558486938, "learning_rate": 5.080965344012509e-06, "loss": 0.3088, "step": 1275 }, { "batch_size": 4, "epoch": 0.51, "step": 1275, "tokens_per_device": 2652 }, { "epoch": 0.51, "loss_ce": 0.21393023431301117, "loss_lvr": 0.8965915441513062, "loss_mode_switch": 0.0, "loss_total": 0.303589403629303, "step": 1275 }, { "batch_size": 1, "epoch": 0.51, "step": 1275, "tokens_per_device": 4963 }, { "epoch": 0.51, "loss_ce": 0.26117971539497375, "loss_lvr": 0.5157744288444519, "loss_mode_switch": 0.0, "loss_total": 0.3127571642398834, "step": 1275 }, { "batch_size": 4, "epoch": 0.51, "step": 1275, "tokens_per_device": 3848 }, { "epoch": 0.51, "loss_ce": 0.3726564645767212, "loss_lvr": 0.9326916933059692, "loss_mode_switch": 0.0, "loss_total": 0.4659256339073181, "step": 1275 }, { "batch_size": 4, "epoch": 0.51, "step": 1275, "tokens_per_device": 1572 }, { "epoch": 0.51, "loss_ce": 0.07193659991025925, "loss_lvr": 0.991331934928894, "loss_mode_switch": 0.0, "loss_total": 0.17106980085372925, "step": 1275 }, { "batch_size": 4, "epoch": 0.51, "step": 1275, "tokens_per_device": 3948 }, { "epoch": 0.51, "loss_ce": 0.2547544538974762, "loss_lvr": 0.7388928532600403, "loss_mode_switch": 0.0, "loss_total": 0.3286437392234802, "step": 1275 }, { "batch_size": 4, "epoch": 0.51, "step": 1275, "tokens_per_device": 4224 }, { "epoch": 0.51, "loss_ce": 0.24564994871616364, "loss_lvr": 0.7579814195632935, "loss_mode_switch": 0.0, "loss_total": 0.32144808769226074, "step": 1275 }, { "batch_size": 4, "epoch": 0.51, "step": 1275, "tokens_per_device": 15200 }, { "epoch": 0.51, "loss_ce": 0.10914503782987595, "loss_lvr": 0.7062638401985168, "loss_mode_switch": 0.0, "loss_total": 0.17977142333984375, "step": 1275 }, { "batch_size": 4, "epoch": 0.51, "step": 1275, "tokens_per_device": 3736 }, { "epoch": 0.51, "loss_ce": 0.0451994314789772, "loss_lvr": 0.88960862159729, "loss_mode_switch": 0.0, "loss_total": 0.13416029512882233, "step": 1275 }, { "epoch": 0.5104, "grad_norm": 1.6550589799880981, "learning_rate": 5.074488616562555e-06, "loss": 0.3098, "step": 1276 }, { "batch_size": 1, "epoch": 0.5104, "step": 1276, "tokens_per_device": 4975 }, { "epoch": 0.5104, "loss_ce": 0.07419231534004211, "loss_lvr": 0.5773751735687256, "loss_mode_switch": 0.0, "loss_total": 0.13192982971668243, "step": 1276 }, { "batch_size": 4, "epoch": 0.5104, "step": 1276, "tokens_per_device": 3048 }, { "epoch": 0.5104, "loss_ce": 0.17429369688034058, "loss_lvr": 0.9225950241088867, "loss_mode_switch": 0.0, "loss_total": 0.26655319333076477, "step": 1276 }, { "batch_size": 4, "epoch": 0.5104, "step": 1276, "tokens_per_device": 5436 }, { "epoch": 0.5104, "loss_ce": 0.25146806240081787, "loss_lvr": 0.9200962781906128, "loss_mode_switch": 0.0, "loss_total": 0.34347769618034363, "step": 1276 }, { "batch_size": 4, "epoch": 0.5104, "step": 1276, "tokens_per_device": 4748 }, { "epoch": 0.5104, "loss_ce": 0.062365252524614334, "loss_lvr": 0.8164611458778381, "loss_mode_switch": 0.0, "loss_total": 0.14401136338710785, "step": 1276 }, { "batch_size": 4, "epoch": 0.5104, "step": 1276, "tokens_per_device": 3944 }, { "epoch": 0.5104, "loss_ce": 0.20200908184051514, "loss_lvr": 0.9551939964294434, "loss_mode_switch": 0.0, "loss_total": 0.297528475522995, "step": 1276 }, { "batch_size": 4, "epoch": 0.5104, "step": 1276, "tokens_per_device": 4492 }, { "epoch": 0.5104, "loss_ce": 0.15222690999507904, "loss_lvr": 0.9681586623191833, "loss_mode_switch": 0.0, "loss_total": 0.24904277920722961, "step": 1276 }, { "batch_size": 1, "epoch": 0.5104, "step": 1276, "tokens_per_device": 4894 }, { "epoch": 0.5104, "loss_ce": 0.0274362675845623, "loss_lvr": 0.13291938602924347, "loss_mode_switch": 0.0, "loss_total": 0.04072820767760277, "step": 1276 }, { "batch_size": 4, "epoch": 0.5104, "step": 1276, "tokens_per_device": 6644 }, { "epoch": 0.5104, "loss_ce": 0.5647566318511963, "loss_lvr": 0.7191994190216064, "loss_mode_switch": 0.0, "loss_total": 0.636676549911499, "step": 1276 }, { "epoch": 0.5108, "grad_norm": 1.3797965049743652, "learning_rate": 5.068011764096455e-06, "loss": 0.3053, "step": 1277 }, { "batch_size": 4, "epoch": 0.5108, "step": 1277, "tokens_per_device": 4888 }, { "epoch": 0.5108, "loss_ce": 0.4700269103050232, "loss_lvr": 0.6799284219741821, "loss_mode_switch": 0.0, "loss_total": 0.5380197763442993, "step": 1277 }, { "batch_size": 4, "epoch": 0.5108, "step": 1277, "tokens_per_device": 4252 }, { "epoch": 0.5108, "loss_ce": 0.05283118411898613, "loss_lvr": 0.7039176821708679, "loss_mode_switch": 0.0, "loss_total": 0.1232229471206665, "step": 1277 }, { "batch_size": 4, "epoch": 0.5108, "step": 1277, "tokens_per_device": 1340 }, { "epoch": 0.5108, "loss_ce": 0.36399129033088684, "loss_lvr": 1.094024896621704, "loss_mode_switch": 0.0, "loss_total": 0.4733937978744507, "step": 1277 }, { "batch_size": 4, "epoch": 0.5108, "step": 1277, "tokens_per_device": 5516 }, { "epoch": 0.5108, "loss_ce": 0.6859694719314575, "loss_lvr": 0.7798871994018555, "loss_mode_switch": 0.0, "loss_total": 0.763958215713501, "step": 1277 }, { "batch_size": 4, "epoch": 0.5108, "step": 1277, "tokens_per_device": 1376 }, { "epoch": 0.5108, "loss_ce": 0.18626584112644196, "loss_lvr": 1.0331919193267822, "loss_mode_switch": 0.0, "loss_total": 0.28958502411842346, "step": 1277 }, { "batch_size": 4, "epoch": 0.5108, "step": 1277, "tokens_per_device": 2560 }, { "epoch": 0.5108, "loss_ce": 0.18643814325332642, "loss_lvr": 0.879941999912262, "loss_mode_switch": 0.0, "loss_total": 0.27443236112594604, "step": 1277 }, { "batch_size": 4, "epoch": 0.5108, "step": 1277, "tokens_per_device": 3896 }, { "epoch": 0.5108, "loss_ce": 0.20022276043891907, "loss_lvr": 1.0241791009902954, "loss_mode_switch": 0.0, "loss_total": 0.3026406764984131, "step": 1277 }, { "batch_size": 4, "epoch": 0.5108, "step": 1277, "tokens_per_device": 3360 }, { "epoch": 0.5108, "loss_ce": 0.18175779283046722, "loss_lvr": 0.8998106718063354, "loss_mode_switch": 0.0, "loss_total": 0.27173885703086853, "step": 1277 }, { "epoch": 0.5112, "grad_norm": 1.2108126878738403, "learning_rate": 5.061534797484476e-06, "loss": 0.318, "step": 1278 }, { "batch_size": 1, "epoch": 0.5112, "step": 1278, "tokens_per_device": 5041 }, { "epoch": 0.5112, "loss_ce": 0.052495911717414856, "loss_lvr": 1.3084893226623535, "loss_mode_switch": 0.0, "loss_total": 0.18334484100341797, "step": 1278 }, { "batch_size": 4, "epoch": 0.5112, "step": 1278, "tokens_per_device": 3960 }, { "epoch": 0.5112, "loss_ce": 0.5836117267608643, "loss_lvr": 0.954876184463501, "loss_mode_switch": 0.0, "loss_total": 0.6790993213653564, "step": 1278 }, { "batch_size": 4, "epoch": 0.5112, "step": 1278, "tokens_per_device": 15700 }, { "epoch": 0.5112, "loss_ce": 0.036538608372211456, "loss_lvr": 0.784526526927948, "loss_mode_switch": 0.0, "loss_total": 0.11499126255512238, "step": 1278 }, { "batch_size": 1, "epoch": 0.5112, "step": 1278, "tokens_per_device": 5129 }, { "epoch": 0.5112, "loss_ce": 0.007004117127507925, "loss_lvr": 0.2518053352832794, "loss_mode_switch": 0.0, "loss_total": 0.032184649258852005, "step": 1278 }, { "batch_size": 1, "epoch": 0.5112, "step": 1278, "tokens_per_device": 5514 }, { "epoch": 0.5112, "loss_ce": 0.2076643407344818, "loss_lvr": 0.37333759665489197, "loss_mode_switch": 0.0, "loss_total": 0.24499809741973877, "step": 1278 }, { "batch_size": 4, "epoch": 0.5112, "step": 1278, "tokens_per_device": 1232 }, { "epoch": 0.5112, "loss_ce": 0.3766399919986725, "loss_lvr": 1.1155160665512085, "loss_mode_switch": 0.0, "loss_total": 0.4881916046142578, "step": 1278 }, { "batch_size": 4, "epoch": 0.5112, "step": 1278, "tokens_per_device": 4320 }, { "epoch": 0.5112, "loss_ce": 0.23051929473876953, "loss_lvr": 1.1446871757507324, "loss_mode_switch": 0.0, "loss_total": 0.34498801827430725, "step": 1278 }, { "batch_size": 4, "epoch": 0.5112, "step": 1278, "tokens_per_device": 1348 }, { "epoch": 0.5112, "loss_ce": 0.38410696387290955, "loss_lvr": 0.9673624038696289, "loss_mode_switch": 0.0, "loss_total": 0.4808432161808014, "step": 1278 }, { "epoch": 0.5116, "grad_norm": 1.2617907524108887, "learning_rate": 5.055057727597078e-06, "loss": 0.2902, "step": 1279 }, { "batch_size": 4, "epoch": 0.5116, "step": 1279, "tokens_per_device": 4920 }, { "epoch": 0.5116, "loss_ce": 0.21931439638137817, "loss_lvr": 1.151748776435852, "loss_mode_switch": 0.0, "loss_total": 0.33448928594589233, "step": 1279 }, { "batch_size": 1, "epoch": 0.5116, "step": 1279, "tokens_per_device": 4729 }, { "epoch": 0.5116, "loss_ce": 0.08110950142145157, "loss_lvr": 0.6205717325210571, "loss_mode_switch": 0.0, "loss_total": 0.1431666761636734, "step": 1279 }, { "batch_size": 4, "epoch": 0.5116, "step": 1279, "tokens_per_device": 1512 }, { "epoch": 0.5116, "loss_ce": 0.458368182182312, "loss_lvr": 0.9937252998352051, "loss_mode_switch": 0.0, "loss_total": 0.5577406883239746, "step": 1279 }, { "batch_size": 4, "epoch": 0.5116, "step": 1279, "tokens_per_device": 4064 }, { "epoch": 0.5116, "loss_ce": 0.3595663905143738, "loss_lvr": 0.8600327968597412, "loss_mode_switch": 0.0, "loss_total": 0.4455696642398834, "step": 1279 }, { "batch_size": 4, "epoch": 0.5116, "step": 1279, "tokens_per_device": 4236 }, { "epoch": 0.5116, "loss_ce": 0.3074733018875122, "loss_lvr": 0.9967679977416992, "loss_mode_switch": 0.0, "loss_total": 0.4071500897407532, "step": 1279 }, { "batch_size": 4, "epoch": 0.5116, "step": 1279, "tokens_per_device": 6624 }, { "epoch": 0.5116, "loss_ce": 0.345241904258728, "loss_lvr": 0.8305166959762573, "loss_mode_switch": 0.0, "loss_total": 0.4282935857772827, "step": 1279 }, { "batch_size": 4, "epoch": 0.5116, "step": 1279, "tokens_per_device": 4000 }, { "epoch": 0.5116, "loss_ce": 0.15323229134082794, "loss_lvr": 0.7756285667419434, "loss_mode_switch": 0.0, "loss_total": 0.23079514503479004, "step": 1279 }, { "batch_size": 4, "epoch": 0.5116, "step": 1279, "tokens_per_device": 3864 }, { "epoch": 0.5116, "loss_ce": 0.13955511152744293, "loss_lvr": 0.9295509457588196, "loss_mode_switch": 0.0, "loss_total": 0.23251020908355713, "step": 1279 }, { "epoch": 0.512, "grad_norm": 1.514872431755066, "learning_rate": 5.048580565304887e-06, "loss": 0.3403, "step": 1280 }, { "batch_size": 4, "epoch": 0.512, "step": 1280, "tokens_per_device": 3856 }, { "epoch": 0.512, "loss_ce": 0.01676911860704422, "loss_lvr": 0.8258395791053772, "loss_mode_switch": 0.0, "loss_total": 0.09935307502746582, "step": 1280 }, { "batch_size": 4, "epoch": 0.512, "step": 1280, "tokens_per_device": 2652 }, { "epoch": 0.512, "loss_ce": 0.3532732427120209, "loss_lvr": 0.8039459586143494, "loss_mode_switch": 0.0, "loss_total": 0.4336678385734558, "step": 1280 }, { "batch_size": 4, "epoch": 0.512, "step": 1280, "tokens_per_device": 5148 }, { "epoch": 0.512, "loss_ce": 0.35932523012161255, "loss_lvr": 0.7158617973327637, "loss_mode_switch": 0.0, "loss_total": 0.43091142177581787, "step": 1280 }, { "batch_size": 1, "epoch": 0.512, "step": 1280, "tokens_per_device": 7183 }, { "epoch": 0.512, "loss_ce": 0.3772071301937103, "loss_lvr": 0.46047407388687134, "loss_mode_switch": 0.0, "loss_total": 0.4232545495033264, "step": 1280 }, { "batch_size": 4, "epoch": 0.512, "step": 1280, "tokens_per_device": 1652 }, { "epoch": 0.512, "loss_ce": 0.11608052253723145, "loss_lvr": 0.9957224726676941, "loss_mode_switch": 0.0, "loss_total": 0.21565276384353638, "step": 1280 }, { "batch_size": 4, "epoch": 0.512, "step": 1280, "tokens_per_device": 3536 }, { "epoch": 0.512, "loss_ce": 0.36111700534820557, "loss_lvr": 1.168547511100769, "loss_mode_switch": 0.0, "loss_total": 0.47797176241874695, "step": 1280 }, { "batch_size": 4, "epoch": 0.512, "step": 1280, "tokens_per_device": 3980 }, { "epoch": 0.512, "loss_ce": 0.22798459231853485, "loss_lvr": 1.0382118225097656, "loss_mode_switch": 0.0, "loss_total": 0.3318057656288147, "step": 1280 }, { "batch_size": 1, "epoch": 0.512, "step": 1280, "tokens_per_device": 5016 }, { "epoch": 0.512, "loss_ce": 0.16012340784072876, "loss_lvr": 0.19117210805416107, "loss_mode_switch": 0.0, "loss_total": 0.1792406141757965, "step": 1280 }, { "epoch": 0.5124, "grad_norm": 1.4922131299972534, "learning_rate": 5.0421033214786965e-06, "loss": 0.2909, "step": 1281 }, { "batch_size": 4, "epoch": 0.5124, "step": 1281, "tokens_per_device": 2688 }, { "epoch": 0.5124, "loss_ce": 0.3080146014690399, "loss_lvr": 0.750956654548645, "loss_mode_switch": 0.0, "loss_total": 0.38311028480529785, "step": 1281 }, { "batch_size": 4, "epoch": 0.5124, "step": 1281, "tokens_per_device": 2704 }, { "epoch": 0.5124, "loss_ce": 0.3169361352920532, "loss_lvr": 0.8919091820716858, "loss_mode_switch": 0.0, "loss_total": 0.40612706542015076, "step": 1281 }, { "batch_size": 4, "epoch": 0.5124, "step": 1281, "tokens_per_device": 3732 }, { "epoch": 0.5124, "loss_ce": 0.11881958693265915, "loss_lvr": 0.9046245217323303, "loss_mode_switch": 0.0, "loss_total": 0.2092820405960083, "step": 1281 }, { "batch_size": 4, "epoch": 0.5124, "step": 1281, "tokens_per_device": 7316 }, { "epoch": 0.5124, "loss_ce": 0.2556305527687073, "loss_lvr": 0.5705572962760925, "loss_mode_switch": 0.0, "loss_total": 0.3126862943172455, "step": 1281 }, { "batch_size": 1, "epoch": 0.5124, "step": 1281, "tokens_per_device": 4865 }, { "epoch": 0.5124, "loss_ce": 0.0022094512823969126, "loss_lvr": 0.21601703763008118, "loss_mode_switch": 0.0, "loss_total": 0.023811155930161476, "step": 1281 }, { "batch_size": 4, "epoch": 0.5124, "step": 1281, "tokens_per_device": 3844 }, { "epoch": 0.5124, "loss_ce": 0.2835472524166107, "loss_lvr": 1.0543984174728394, "loss_mode_switch": 0.0, "loss_total": 0.38898709416389465, "step": 1281 }, { "batch_size": 4, "epoch": 0.5124, "step": 1281, "tokens_per_device": 1740 }, { "epoch": 0.5124, "loss_ce": 0.7298918962478638, "loss_lvr": 0.7830883860588074, "loss_mode_switch": 0.0, "loss_total": 0.8082007169723511, "step": 1281 }, { "batch_size": 1, "epoch": 0.5124, "step": 1281, "tokens_per_device": 4921 }, { "epoch": 0.5124, "loss_ce": 0.0028152232989668846, "loss_lvr": 0.8882700800895691, "loss_mode_switch": 0.0, "loss_total": 0.09164223074913025, "step": 1281 }, { "epoch": 0.5128, "grad_norm": 1.3294860124588013, "learning_rate": 5.035626006989426e-06, "loss": 0.251, "step": 1282 }, { "batch_size": 4, "epoch": 0.5128, "step": 1282, "tokens_per_device": 8244 }, { "epoch": 0.5128, "loss_ce": 0.2114129215478897, "loss_lvr": 0.7856552600860596, "loss_mode_switch": 0.0, "loss_total": 0.2899784445762634, "step": 1282 }, { "batch_size": 4, "epoch": 0.5128, "step": 1282, "tokens_per_device": 6576 }, { "epoch": 0.5128, "loss_ce": 1.14326810836792, "loss_lvr": 0.8296072483062744, "loss_mode_switch": 0.0, "loss_total": 1.2262288331985474, "step": 1282 }, { "batch_size": 1, "epoch": 0.5128, "step": 1282, "tokens_per_device": 5029 }, { "epoch": 0.5128, "loss_ce": 0.021943002939224243, "loss_lvr": 0.2838299870491028, "loss_mode_switch": 0.0, "loss_total": 0.05032600462436676, "step": 1282 }, { "batch_size": 4, "epoch": 0.5128, "step": 1282, "tokens_per_device": 4252 }, { "epoch": 0.5128, "loss_ce": 0.30896613001823425, "loss_lvr": 0.9594013094902039, "loss_mode_switch": 0.0, "loss_total": 0.4049062728881836, "step": 1282 }, { "batch_size": 1, "epoch": 0.5128, "step": 1282, "tokens_per_device": 6905 }, { "epoch": 0.5128, "loss_ce": 0.008090698160231113, "loss_lvr": 0.260616660118103, "loss_mode_switch": 0.0, "loss_total": 0.03415236622095108, "step": 1282 }, { "batch_size": 4, "epoch": 0.5128, "step": 1282, "tokens_per_device": 13160 }, { "epoch": 0.5128, "loss_ce": 0.13041147589683533, "loss_lvr": 0.8738839030265808, "loss_mode_switch": 0.0, "loss_total": 0.21779987215995789, "step": 1282 }, { "batch_size": 4, "epoch": 0.5128, "step": 1282, "tokens_per_device": 5884 }, { "epoch": 0.5128, "loss_ce": 0.015861371532082558, "loss_lvr": 0.704306960105896, "loss_mode_switch": 0.0, "loss_total": 0.08629206568002701, "step": 1282 }, { "batch_size": 4, "epoch": 0.5128, "step": 1282, "tokens_per_device": 4236 }, { "epoch": 0.5128, "loss_ce": 0.28816378116607666, "loss_lvr": 0.7161626219749451, "loss_mode_switch": 0.0, "loss_total": 0.35978004336357117, "step": 1282 }, { "epoch": 0.5132, "grad_norm": 1.0996617078781128, "learning_rate": 5.029148632708117e-06, "loss": 0.2465, "step": 1283 }, { "batch_size": 4, "epoch": 0.5132, "step": 1283, "tokens_per_device": 13760 }, { "epoch": 0.5132, "loss_ce": 0.2524723410606384, "loss_lvr": 0.44620481133461, "loss_mode_switch": 0.0, "loss_total": 0.29709282517433167, "step": 1283 }, { "batch_size": 1, "epoch": 0.5132, "step": 1283, "tokens_per_device": 5202 }, { "epoch": 0.5132, "loss_ce": 0.020536845549941063, "loss_lvr": 0.3272099792957306, "loss_mode_switch": 0.0, "loss_total": 0.05325784534215927, "step": 1283 }, { "batch_size": 4, "epoch": 0.5132, "step": 1283, "tokens_per_device": 9076 }, { "epoch": 0.5132, "loss_ce": 0.4029979109764099, "loss_lvr": 0.8932188153266907, "loss_mode_switch": 0.0, "loss_total": 0.492319792509079, "step": 1283 }, { "batch_size": 4, "epoch": 0.5132, "step": 1283, "tokens_per_device": 5780 }, { "epoch": 0.5132, "loss_ce": 0.14701257646083832, "loss_lvr": 0.8005284667015076, "loss_mode_switch": 0.0, "loss_total": 0.22706541419029236, "step": 1283 }, { "batch_size": 4, "epoch": 0.5132, "step": 1283, "tokens_per_device": 4304 }, { "epoch": 0.5132, "loss_ce": 0.07979748398065567, "loss_lvr": 0.5737583637237549, "loss_mode_switch": 0.0, "loss_total": 0.13717332482337952, "step": 1283 }, { "batch_size": 4, "epoch": 0.5132, "step": 1283, "tokens_per_device": 4232 }, { "epoch": 0.5132, "loss_ce": 0.31616881489753723, "loss_lvr": 1.202786922454834, "loss_mode_switch": 0.0, "loss_total": 0.43644750118255615, "step": 1283 }, { "batch_size": 4, "epoch": 0.5132, "step": 1283, "tokens_per_device": 5380 }, { "epoch": 0.5132, "loss_ce": 0.11493727564811707, "loss_lvr": 3.405416488647461, "loss_mode_switch": 0.0, "loss_total": 0.4554789364337921, "step": 1283 }, { "batch_size": 4, "epoch": 0.5132, "step": 1283, "tokens_per_device": 4620 }, { "epoch": 0.5132, "loss_ce": 0.4985690414905548, "loss_lvr": 1.557052493095398, "loss_mode_switch": 0.0, "loss_total": 0.6542742848396301, "step": 1283 }, { "epoch": 0.5136, "grad_norm": 1.3039920330047607, "learning_rate": 5.022671209505916e-06, "loss": 0.287, "step": 1284 }, { "batch_size": 1, "epoch": 0.5136, "step": 1284, "tokens_per_device": 5179 }, { "epoch": 0.5136, "loss_ce": 0.005570293869823217, "loss_lvr": 0.38955771923065186, "loss_mode_switch": 0.0, "loss_total": 0.04452606663107872, "step": 1284 }, { "batch_size": 4, "epoch": 0.5136, "step": 1284, "tokens_per_device": 1840 }, { "epoch": 0.5136, "loss_ce": 0.3722529709339142, "loss_lvr": 0.9804630875587463, "loss_mode_switch": 0.0, "loss_total": 0.47029927372932434, "step": 1284 }, { "batch_size": 4, "epoch": 0.5136, "step": 1284, "tokens_per_device": 3780 }, { "epoch": 0.5136, "loss_ce": 0.1316155195236206, "loss_lvr": 1.1189618110656738, "loss_mode_switch": 0.0, "loss_total": 0.24351170659065247, "step": 1284 }, { "batch_size": 4, "epoch": 0.5136, "step": 1284, "tokens_per_device": 4444 }, { "epoch": 0.5136, "loss_ce": 0.16798919439315796, "loss_lvr": 1.2299531698226929, "loss_mode_switch": 0.0, "loss_total": 0.29098451137542725, "step": 1284 }, { "batch_size": 4, "epoch": 0.5136, "step": 1284, "tokens_per_device": 5180 }, { "epoch": 0.5136, "loss_ce": 0.28930291533470154, "loss_lvr": 0.8184342980384827, "loss_mode_switch": 0.0, "loss_total": 0.3711463510990143, "step": 1284 }, { "batch_size": 1, "epoch": 0.5136, "step": 1284, "tokens_per_device": 5183 }, { "epoch": 0.5136, "loss_ce": 0.09369830787181854, "loss_lvr": 0.41413846611976624, "loss_mode_switch": 0.0, "loss_total": 0.13511215150356293, "step": 1284 }, { "batch_size": 4, "epoch": 0.5136, "step": 1284, "tokens_per_device": 3784 }, { "epoch": 0.5136, "loss_ce": 0.1793273389339447, "loss_lvr": 0.8675445318222046, "loss_mode_switch": 0.0, "loss_total": 0.2660818099975586, "step": 1284 }, { "batch_size": 1, "epoch": 0.5136, "step": 1284, "tokens_per_device": 7823 }, { "epoch": 0.5136, "loss_ce": 0.19189593195915222, "loss_lvr": 0.38092002272605896, "loss_mode_switch": 0.0, "loss_total": 0.22998793423175812, "step": 1284 }, { "epoch": 0.514, "grad_norm": 1.5535403490066528, "learning_rate": 5.016193748254045e-06, "loss": 0.3329, "step": 1285 }, { "batch_size": 1, "epoch": 0.514, "step": 1285, "tokens_per_device": 5123 }, { "epoch": 0.514, "loss_ce": 0.00770680233836174, "loss_lvr": 0.33545130491256714, "loss_mode_switch": 0.0, "loss_total": 0.04125193506479263, "step": 1285 }, { "batch_size": 1, "epoch": 0.514, "step": 1285, "tokens_per_device": 5184 }, { "epoch": 0.514, "loss_ce": 0.03447279706597328, "loss_lvr": 0.3464415371417999, "loss_mode_switch": 0.0, "loss_total": 0.06911695003509521, "step": 1285 }, { "batch_size": 4, "epoch": 0.514, "step": 1285, "tokens_per_device": 2708 }, { "epoch": 0.514, "loss_ce": 0.3278692662715912, "loss_lvr": 0.8551576733589172, "loss_mode_switch": 0.0, "loss_total": 0.4133850336074829, "step": 1285 }, { "batch_size": 4, "epoch": 0.514, "step": 1285, "tokens_per_device": 4948 }, { "epoch": 0.514, "loss_ce": 0.18292440474033356, "loss_lvr": 0.9653065800666809, "loss_mode_switch": 0.0, "loss_total": 0.2794550657272339, "step": 1285 }, { "batch_size": 1, "epoch": 0.514, "step": 1285, "tokens_per_device": 4854 }, { "epoch": 0.514, "loss_ce": 0.0068470174446702, "loss_lvr": 0.26485684514045715, "loss_mode_switch": 0.0, "loss_total": 0.0333327017724514, "step": 1285 }, { "batch_size": 4, "epoch": 0.514, "step": 1285, "tokens_per_device": 5800 }, { "epoch": 0.514, "loss_ce": 0.6505632996559143, "loss_lvr": 0.7942886352539062, "loss_mode_switch": 0.0, "loss_total": 0.729992151260376, "step": 1285 }, { "batch_size": 1, "epoch": 0.514, "step": 1285, "tokens_per_device": 4766 }, { "epoch": 0.514, "loss_ce": 0.027733011171221733, "loss_lvr": 0.6253754496574402, "loss_mode_switch": 0.0, "loss_total": 0.09027055650949478, "step": 1285 }, { "batch_size": 4, "epoch": 0.514, "step": 1285, "tokens_per_device": 4516 }, { "epoch": 0.514, "loss_ce": 0.6800726056098938, "loss_lvr": 0.9392341375350952, "loss_mode_switch": 0.0, "loss_total": 0.7739959955215454, "step": 1285 }, { "epoch": 0.5144, "grad_norm": 1.176144003868103, "learning_rate": 5.009716259823792e-06, "loss": 0.2949, "step": 1286 }, { "batch_size": 4, "epoch": 0.5144, "step": 1286, "tokens_per_device": 3752 }, { "epoch": 0.5144, "loss_ce": 0.06930846720933914, "loss_lvr": 0.8547163605690002, "loss_mode_switch": 0.0, "loss_total": 0.15478010475635529, "step": 1286 }, { "batch_size": 4, "epoch": 0.5144, "step": 1286, "tokens_per_device": 3628 }, { "epoch": 0.5144, "loss_ce": 0.31954479217529297, "loss_lvr": 0.8843360543251038, "loss_mode_switch": 0.0, "loss_total": 0.4079784154891968, "step": 1286 }, { "batch_size": 4, "epoch": 0.5144, "step": 1286, "tokens_per_device": 4452 }, { "epoch": 0.5144, "loss_ce": 0.2516404688358307, "loss_lvr": 0.8990260362625122, "loss_mode_switch": 0.0, "loss_total": 0.3415430784225464, "step": 1286 }, { "batch_size": 4, "epoch": 0.5144, "step": 1286, "tokens_per_device": 1500 }, { "epoch": 0.5144, "loss_ce": 0.2475457787513733, "loss_lvr": 0.9442991614341736, "loss_mode_switch": 0.0, "loss_total": 0.34197568893432617, "step": 1286 }, { "batch_size": 4, "epoch": 0.5144, "step": 1286, "tokens_per_device": 3896 }, { "epoch": 0.5144, "loss_ce": 0.1290268450975418, "loss_lvr": 1.0608981847763062, "loss_mode_switch": 0.0, "loss_total": 0.23511666059494019, "step": 1286 }, { "batch_size": 1, "epoch": 0.5144, "step": 1286, "tokens_per_device": 4654 }, { "epoch": 0.5144, "loss_ce": 0.21024499833583832, "loss_lvr": 0.30817732214927673, "loss_mode_switch": 0.0, "loss_total": 0.241062730550766, "step": 1286 }, { "batch_size": 4, "epoch": 0.5144, "step": 1286, "tokens_per_device": 6872 }, { "epoch": 0.5144, "loss_ce": 0.00654848525300622, "loss_lvr": 0.8222886323928833, "loss_mode_switch": 0.0, "loss_total": 0.08877734839916229, "step": 1286 }, { "batch_size": 4, "epoch": 0.5144, "step": 1286, "tokens_per_device": 1772 }, { "epoch": 0.5144, "loss_ce": 0.7230058908462524, "loss_lvr": 0.8878653645515442, "loss_mode_switch": 0.0, "loss_total": 0.8117924332618713, "step": 1286 }, { "epoch": 0.5148, "grad_norm": 1.2363181114196777, "learning_rate": 5.003238755086492e-06, "loss": 0.2587, "step": 1287 }, { "batch_size": 4, "epoch": 0.5148, "step": 1287, "tokens_per_device": 1248 }, { "epoch": 0.5148, "loss_ce": 0.39944878220558167, "loss_lvr": 1.120638370513916, "loss_mode_switch": 0.0, "loss_total": 0.5115126371383667, "step": 1287 }, { "batch_size": 4, "epoch": 0.5148, "step": 1287, "tokens_per_device": 4412 }, { "epoch": 0.5148, "loss_ce": 0.2896958589553833, "loss_lvr": 0.7558489441871643, "loss_mode_switch": 0.0, "loss_total": 0.36528074741363525, "step": 1287 }, { "batch_size": 4, "epoch": 0.5148, "step": 1287, "tokens_per_device": 1444 }, { "epoch": 0.5148, "loss_ce": 0.08867250382900238, "loss_lvr": 0.9611715078353882, "loss_mode_switch": 0.0, "loss_total": 0.18478965759277344, "step": 1287 }, { "batch_size": 4, "epoch": 0.5148, "step": 1287, "tokens_per_device": 3764 }, { "epoch": 0.5148, "loss_ce": 0.4109979271888733, "loss_lvr": 0.8983654379844666, "loss_mode_switch": 0.0, "loss_total": 0.5008344650268555, "step": 1287 }, { "batch_size": 4, "epoch": 0.5148, "step": 1287, "tokens_per_device": 3812 }, { "epoch": 0.5148, "loss_ce": 0.513308048248291, "loss_lvr": 0.8598856329917908, "loss_mode_switch": 0.0, "loss_total": 0.5992966294288635, "step": 1287 }, { "batch_size": 4, "epoch": 0.5148, "step": 1287, "tokens_per_device": 1192 }, { "epoch": 0.5148, "loss_ce": 0.29300686717033386, "loss_lvr": 1.1116507053375244, "loss_mode_switch": 0.0, "loss_total": 0.4041719436645508, "step": 1287 }, { "batch_size": 4, "epoch": 0.5148, "step": 1287, "tokens_per_device": 3728 }, { "epoch": 0.5148, "loss_ce": 0.2113119512796402, "loss_lvr": 0.9433167576789856, "loss_mode_switch": 0.0, "loss_total": 0.30564361810684204, "step": 1287 }, { "batch_size": 4, "epoch": 0.5148, "step": 1287, "tokens_per_device": 4644 }, { "epoch": 0.5148, "loss_ce": 0.012814640067517757, "loss_lvr": 0.6441303491592407, "loss_mode_switch": 0.0, "loss_total": 0.07722767442464828, "step": 1287 }, { "epoch": 0.5152, "grad_norm": 1.4107081890106201, "learning_rate": 4.996761244913508e-06, "loss": 0.2927, "step": 1288 }, { "batch_size": 4, "epoch": 0.5152, "step": 1288, "tokens_per_device": 6160 }, { "epoch": 0.5152, "loss_ce": 0.005106227472424507, "loss_lvr": 0.8332115411758423, "loss_mode_switch": 0.0, "loss_total": 0.08842737972736359, "step": 1288 }, { "batch_size": 4, "epoch": 0.5152, "step": 1288, "tokens_per_device": 4336 }, { "epoch": 0.5152, "loss_ce": 0.11342686414718628, "loss_lvr": 0.6450840830802917, "loss_mode_switch": 0.0, "loss_total": 0.17793527245521545, "step": 1288 }, { "batch_size": 4, "epoch": 0.5152, "step": 1288, "tokens_per_device": 5236 }, { "epoch": 0.5152, "loss_ce": 0.47343650460243225, "loss_lvr": 0.8273610472679138, "loss_mode_switch": 0.0, "loss_total": 0.5561726093292236, "step": 1288 }, { "batch_size": 4, "epoch": 0.5152, "step": 1288, "tokens_per_device": 5832 }, { "epoch": 0.5152, "loss_ce": 0.7155537009239197, "loss_lvr": 0.6835084557533264, "loss_mode_switch": 0.0, "loss_total": 0.7839045524597168, "step": 1288 }, { "batch_size": 1, "epoch": 0.5152, "step": 1288, "tokens_per_device": 4622 }, { "epoch": 0.5152, "loss_ce": 0.2804526090621948, "loss_lvr": 0.44397222995758057, "loss_mode_switch": 0.0, "loss_total": 0.32484984397888184, "step": 1288 }, { "batch_size": 1, "epoch": 0.5152, "step": 1288, "tokens_per_device": 5024 }, { "epoch": 0.5152, "loss_ce": 0.31756505370140076, "loss_lvr": 0.9459049701690674, "loss_mode_switch": 0.0, "loss_total": 0.4121555685997009, "step": 1288 }, { "batch_size": 4, "epoch": 0.5152, "step": 1288, "tokens_per_device": 4208 }, { "epoch": 0.5152, "loss_ce": 0.17782989144325256, "loss_lvr": 0.955653190612793, "loss_mode_switch": 0.0, "loss_total": 0.27339521050453186, "step": 1288 }, { "batch_size": 4, "epoch": 0.5152, "step": 1288, "tokens_per_device": 5516 }, { "epoch": 0.5152, "loss_ce": 0.05648728832602501, "loss_lvr": 0.9838877320289612, "loss_mode_switch": 0.0, "loss_total": 0.15487606823444366, "step": 1288 }, { "epoch": 0.5156, "grad_norm": 1.5045279264450073, "learning_rate": 4.9902837401762085e-06, "loss": 0.324, "step": 1289 }, { "batch_size": 4, "epoch": 0.5156, "step": 1289, "tokens_per_device": 3804 }, { "epoch": 0.5156, "loss_ce": 0.6464967727661133, "loss_lvr": 0.9445120692253113, "loss_mode_switch": 0.0, "loss_total": 0.740947961807251, "step": 1289 }, { "batch_size": 4, "epoch": 0.5156, "step": 1289, "tokens_per_device": 5636 }, { "epoch": 0.5156, "loss_ce": 0.17544934153556824, "loss_lvr": 0.8610577583312988, "loss_mode_switch": 0.0, "loss_total": 0.26155513525009155, "step": 1289 }, { "batch_size": 1, "epoch": 0.5156, "step": 1289, "tokens_per_device": 5033 }, { "epoch": 0.5156, "loss_ce": 0.014056825079023838, "loss_lvr": 0.5145212411880493, "loss_mode_switch": 0.0, "loss_total": 0.06550895422697067, "step": 1289 }, { "batch_size": 1, "epoch": 0.5156, "step": 1289, "tokens_per_device": 4939 }, { "epoch": 0.5156, "loss_ce": 0.22080358862876892, "loss_lvr": 0.3829234540462494, "loss_mode_switch": 0.0, "loss_total": 0.2590959370136261, "step": 1289 }, { "batch_size": 1, "epoch": 0.5156, "step": 1289, "tokens_per_device": 4741 }, { "epoch": 0.5156, "loss_ce": 0.013581578619778156, "loss_lvr": 0.45967867970466614, "loss_mode_switch": 0.0, "loss_total": 0.059549447149038315, "step": 1289 }, { "batch_size": 1, "epoch": 0.5156, "step": 1289, "tokens_per_device": 5176 }, { "epoch": 0.5156, "loss_ce": 0.13984733819961548, "loss_lvr": 0.26074060797691345, "loss_mode_switch": 0.0, "loss_total": 0.1659214049577713, "step": 1289 }, { "batch_size": 1, "epoch": 0.5156, "step": 1289, "tokens_per_device": 4487 }, { "epoch": 0.5156, "loss_ce": 0.07700391858816147, "loss_lvr": 0.3399084806442261, "loss_mode_switch": 0.0, "loss_total": 0.11099477112293243, "step": 1289 }, { "batch_size": 4, "epoch": 0.5156, "step": 1289, "tokens_per_device": 5960 }, { "epoch": 0.5156, "loss_ce": 0.359002947807312, "loss_lvr": 0.9867845773696899, "loss_mode_switch": 0.0, "loss_total": 0.45768141746520996, "step": 1289 }, { "epoch": 0.516, "grad_norm": 1.8189918994903564, "learning_rate": 4.983806251745958e-06, "loss": 0.2822, "step": 1290 }, { "batch_size": 1, "epoch": 0.516, "step": 1290, "tokens_per_device": 5103 }, { "epoch": 0.516, "loss_ce": 0.004117182455956936, "loss_lvr": 0.5104507803916931, "loss_mode_switch": 0.0, "loss_total": 0.05516226217150688, "step": 1290 }, { "batch_size": 4, "epoch": 0.516, "step": 1290, "tokens_per_device": 3788 }, { "epoch": 0.516, "loss_ce": 0.3726056218147278, "loss_lvr": 1.1823694705963135, "loss_mode_switch": 0.0, "loss_total": 0.4908425807952881, "step": 1290 }, { "batch_size": 4, "epoch": 0.516, "step": 1290, "tokens_per_device": 5108 }, { "epoch": 0.516, "loss_ce": 0.12295582890510559, "loss_lvr": 0.5722666382789612, "loss_mode_switch": 0.0, "loss_total": 0.18018248677253723, "step": 1290 }, { "batch_size": 4, "epoch": 0.516, "step": 1290, "tokens_per_device": 4244 }, { "epoch": 0.516, "loss_ce": 0.13165965676307678, "loss_lvr": 1.191750168800354, "loss_mode_switch": 0.0, "loss_total": 0.2508346736431122, "step": 1290 }, { "batch_size": 4, "epoch": 0.516, "step": 1290, "tokens_per_device": 3844 }, { "epoch": 0.516, "loss_ce": 0.13257654011249542, "loss_lvr": 1.0058867931365967, "loss_mode_switch": 0.0, "loss_total": 0.2331652194261551, "step": 1290 }, { "batch_size": 4, "epoch": 0.516, "step": 1290, "tokens_per_device": 5696 }, { "epoch": 0.516, "loss_ce": 0.11528169363737106, "loss_lvr": 1.0991188287734985, "loss_mode_switch": 0.0, "loss_total": 0.2251935750246048, "step": 1290 }, { "batch_size": 4, "epoch": 0.516, "step": 1290, "tokens_per_device": 1388 }, { "epoch": 0.516, "loss_ce": 0.5090110898017883, "loss_lvr": 0.9917720556259155, "loss_mode_switch": 0.0, "loss_total": 0.608188271522522, "step": 1290 }, { "batch_size": 4, "epoch": 0.516, "step": 1290, "tokens_per_device": 5752 }, { "epoch": 0.516, "loss_ce": 0.42778733372688293, "loss_lvr": 0.9932057857513428, "loss_mode_switch": 0.0, "loss_total": 0.5271078944206238, "step": 1290 }, { "epoch": 0.5164, "grad_norm": 1.2268527746200562, "learning_rate": 4.9773287904940856e-06, "loss": 0.2954, "step": 1291 }, { "batch_size": 4, "epoch": 0.5164, "step": 1291, "tokens_per_device": 1520 }, { "epoch": 0.5164, "loss_ce": 0.8163583278656006, "loss_lvr": 1.4142392873764038, "loss_mode_switch": 0.0, "loss_total": 0.9577822685241699, "step": 1291 }, { "batch_size": 4, "epoch": 0.5164, "step": 1291, "tokens_per_device": 5264 }, { "epoch": 0.5164, "loss_ce": 0.38258257508277893, "loss_lvr": 0.8092019557952881, "loss_mode_switch": 0.0, "loss_total": 0.46350276470184326, "step": 1291 }, { "batch_size": 1, "epoch": 0.5164, "step": 1291, "tokens_per_device": 4890 }, { "epoch": 0.5164, "loss_ce": 0.002659625606611371, "loss_lvr": 0.7235657572746277, "loss_mode_switch": 0.0, "loss_total": 0.07501620054244995, "step": 1291 }, { "batch_size": 4, "epoch": 0.5164, "step": 1291, "tokens_per_device": 4484 }, { "epoch": 0.5164, "loss_ce": 0.06446798890829086, "loss_lvr": 1.105326771736145, "loss_mode_switch": 0.0, "loss_total": 0.17500066757202148, "step": 1291 }, { "batch_size": 4, "epoch": 0.5164, "step": 1291, "tokens_per_device": 4448 }, { "epoch": 0.5164, "loss_ce": 0.05902630090713501, "loss_lvr": 1.0129096508026123, "loss_mode_switch": 0.0, "loss_total": 0.16031727194786072, "step": 1291 }, { "batch_size": 4, "epoch": 0.5164, "step": 1291, "tokens_per_device": 3952 }, { "epoch": 0.5164, "loss_ce": 0.20821653306484222, "loss_lvr": 0.7389209866523743, "loss_mode_switch": 0.0, "loss_total": 0.2821086347103119, "step": 1291 }, { "batch_size": 4, "epoch": 0.5164, "step": 1291, "tokens_per_device": 4280 }, { "epoch": 0.5164, "loss_ce": 0.42177140712738037, "loss_lvr": 0.7404722571372986, "loss_mode_switch": 0.0, "loss_total": 0.4958186447620392, "step": 1291 }, { "batch_size": 4, "epoch": 0.5164, "step": 1291, "tokens_per_device": 4372 }, { "epoch": 0.5164, "loss_ce": 0.47146666049957275, "loss_lvr": 1.0585932731628418, "loss_mode_switch": 0.0, "loss_total": 0.5773259997367859, "step": 1291 }, { "epoch": 0.5168, "grad_norm": 1.3045885562896729, "learning_rate": 4.9708513672918854e-06, "loss": 0.3032, "step": 1292 }, { "batch_size": 1, "epoch": 0.5168, "step": 1292, "tokens_per_device": 4629 }, { "epoch": 0.5168, "loss_ce": 0.0013010402908548713, "loss_lvr": 0.43732666969299316, "loss_mode_switch": 0.0, "loss_total": 0.04503370821475983, "step": 1292 }, { "batch_size": 4, "epoch": 0.5168, "step": 1292, "tokens_per_device": 4116 }, { "epoch": 0.5168, "loss_ce": 0.2870217263698578, "loss_lvr": 0.901536762714386, "loss_mode_switch": 0.0, "loss_total": 0.37717539072036743, "step": 1292 }, { "batch_size": 4, "epoch": 0.5168, "step": 1292, "tokens_per_device": 5944 }, { "epoch": 0.5168, "loss_ce": 0.36534416675567627, "loss_lvr": 0.8652639389038086, "loss_mode_switch": 0.0, "loss_total": 0.45187056064605713, "step": 1292 }, { "batch_size": 4, "epoch": 0.5168, "step": 1292, "tokens_per_device": 2656 }, { "epoch": 0.5168, "loss_ce": 0.20319370925426483, "loss_lvr": 0.625343918800354, "loss_mode_switch": 0.0, "loss_total": 0.2657281160354614, "step": 1292 }, { "batch_size": 4, "epoch": 0.5168, "step": 1292, "tokens_per_device": 3812 }, { "epoch": 0.5168, "loss_ce": 0.4346265494823456, "loss_lvr": 0.7790856957435608, "loss_mode_switch": 0.0, "loss_total": 0.5125350952148438, "step": 1292 }, { "batch_size": 4, "epoch": 0.5168, "step": 1292, "tokens_per_device": 16188 }, { "epoch": 0.5168, "loss_ce": 0.3135433793067932, "loss_lvr": 1.1283401250839233, "loss_mode_switch": 0.0, "loss_total": 0.42637738585472107, "step": 1292 }, { "batch_size": 4, "epoch": 0.5168, "step": 1292, "tokens_per_device": 3776 }, { "epoch": 0.5168, "loss_ce": 0.40669047832489014, "loss_lvr": 0.8863723278045654, "loss_mode_switch": 0.0, "loss_total": 0.4953277111053467, "step": 1292 }, { "batch_size": 4, "epoch": 0.5168, "step": 1292, "tokens_per_device": 3432 }, { "epoch": 0.5168, "loss_ce": 0.4527600109577179, "loss_lvr": 1.1287063360214233, "loss_mode_switch": 0.0, "loss_total": 0.5656306743621826, "step": 1292 }, { "epoch": 0.5172, "grad_norm": 1.277977705001831, "learning_rate": 4.964373993010576e-06, "loss": 0.3336, "step": 1293 }, { "batch_size": 4, "epoch": 0.5172, "step": 1293, "tokens_per_device": 6292 }, { "epoch": 0.5172, "loss_ce": 0.03513025492429733, "loss_lvr": 0.6631338000297546, "loss_mode_switch": 0.0, "loss_total": 0.10144363343715668, "step": 1293 }, { "batch_size": 4, "epoch": 0.5172, "step": 1293, "tokens_per_device": 4680 }, { "epoch": 0.5172, "loss_ce": 0.25782841444015503, "loss_lvr": 0.8149662613868713, "loss_mode_switch": 0.0, "loss_total": 0.33932504057884216, "step": 1293 }, { "batch_size": 4, "epoch": 0.5172, "step": 1293, "tokens_per_device": 4476 }, { "epoch": 0.5172, "loss_ce": 0.10097245126962662, "loss_lvr": 0.8759114146232605, "loss_mode_switch": 0.0, "loss_total": 0.18856358528137207, "step": 1293 }, { "batch_size": 4, "epoch": 0.5172, "step": 1293, "tokens_per_device": 4808 }, { "epoch": 0.5172, "loss_ce": 0.04133619740605354, "loss_lvr": 0.8722325563430786, "loss_mode_switch": 0.0, "loss_total": 0.12855945527553558, "step": 1293 }, { "batch_size": 4, "epoch": 0.5172, "step": 1293, "tokens_per_device": 4700 }, { "epoch": 0.5172, "loss_ce": 0.3964037597179413, "loss_lvr": 0.7589119672775269, "loss_mode_switch": 0.0, "loss_total": 0.47229495644569397, "step": 1293 }, { "batch_size": 1, "epoch": 0.5172, "step": 1293, "tokens_per_device": 5115 }, { "epoch": 0.5172, "loss_ce": 0.005828887224197388, "loss_lvr": 0.5365452766418457, "loss_mode_switch": 0.0, "loss_total": 0.05948341637849808, "step": 1293 }, { "batch_size": 1, "epoch": 0.5172, "step": 1293, "tokens_per_device": 5576 }, { "epoch": 0.5172, "loss_ce": 0.1871270388364792, "loss_lvr": 0.5092682838439941, "loss_mode_switch": 0.0, "loss_total": 0.23805387318134308, "step": 1293 }, { "batch_size": 4, "epoch": 0.5172, "step": 1293, "tokens_per_device": 1184 }, { "epoch": 0.5172, "loss_ce": 0.08791512995958328, "loss_lvr": 1.2467079162597656, "loss_mode_switch": 0.0, "loss_total": 0.2125859260559082, "step": 1293 }, { "epoch": 0.5176, "grad_norm": 1.35210382938385, "learning_rate": 4.957896678521305e-06, "loss": 0.2899, "step": 1294 }, { "batch_size": 4, "epoch": 0.5176, "step": 1294, "tokens_per_device": 2836 }, { "epoch": 0.5176, "loss_ce": 0.24058283865451813, "loss_lvr": 0.5641214847564697, "loss_mode_switch": 0.0, "loss_total": 0.29699498414993286, "step": 1294 }, { "batch_size": 4, "epoch": 0.5176, "step": 1294, "tokens_per_device": 1452 }, { "epoch": 0.5176, "loss_ce": 0.7315446138381958, "loss_lvr": 1.0434049367904663, "loss_mode_switch": 0.0, "loss_total": 0.8358851075172424, "step": 1294 }, { "batch_size": 4, "epoch": 0.5176, "step": 1294, "tokens_per_device": 2636 }, { "epoch": 0.5176, "loss_ce": 0.3491877615451813, "loss_lvr": 0.7217907309532166, "loss_mode_switch": 0.0, "loss_total": 0.4213668406009674, "step": 1294 }, { "batch_size": 4, "epoch": 0.5176, "step": 1294, "tokens_per_device": 4488 }, { "epoch": 0.5176, "loss_ce": 0.09692295640707016, "loss_lvr": 0.80218505859375, "loss_mode_switch": 0.0, "loss_total": 0.1771414577960968, "step": 1294 }, { "batch_size": 4, "epoch": 0.5176, "step": 1294, "tokens_per_device": 4272 }, { "epoch": 0.5176, "loss_ce": 0.17732788622379303, "loss_lvr": 1.270929217338562, "loss_mode_switch": 0.0, "loss_total": 0.3044208288192749, "step": 1294 }, { "batch_size": 4, "epoch": 0.5176, "step": 1294, "tokens_per_device": 5568 }, { "epoch": 0.5176, "loss_ce": 0.22661326825618744, "loss_lvr": 0.7718063592910767, "loss_mode_switch": 0.0, "loss_total": 0.30379390716552734, "step": 1294 }, { "batch_size": 4, "epoch": 0.5176, "step": 1294, "tokens_per_device": 3456 }, { "epoch": 0.5176, "loss_ce": 0.08346019685268402, "loss_lvr": 0.6205946207046509, "loss_mode_switch": 0.0, "loss_total": 0.1455196589231491, "step": 1294 }, { "batch_size": 4, "epoch": 0.5176, "step": 1294, "tokens_per_device": 4932 }, { "epoch": 0.5176, "loss_ce": 0.007227360270917416, "loss_lvr": 0.8492361307144165, "loss_mode_switch": 0.0, "loss_total": 0.09215097874403, "step": 1294 }, { "epoch": 0.518, "grad_norm": 1.364978551864624, "learning_rate": 4.951419434695115e-06, "loss": 0.2858, "step": 1295 }, { "batch_size": 4, "epoch": 0.518, "step": 1295, "tokens_per_device": 4540 }, { "epoch": 0.518, "loss_ce": 0.3061574101448059, "loss_lvr": 1.0854734182357788, "loss_mode_switch": 0.0, "loss_total": 0.41470474004745483, "step": 1295 }, { "batch_size": 4, "epoch": 0.518, "step": 1295, "tokens_per_device": 1548 }, { "epoch": 0.518, "loss_ce": 0.6160905361175537, "loss_lvr": 2.351187229156494, "loss_mode_switch": 0.0, "loss_total": 0.851209282875061, "step": 1295 }, { "batch_size": 1, "epoch": 0.518, "step": 1295, "tokens_per_device": 4884 }, { "epoch": 0.518, "loss_ce": 0.19895607233047485, "loss_lvr": 0.28757360577583313, "loss_mode_switch": 0.0, "loss_total": 0.2277134358882904, "step": 1295 }, { "batch_size": 4, "epoch": 0.518, "step": 1295, "tokens_per_device": 4292 }, { "epoch": 0.518, "loss_ce": 0.236944779753685, "loss_lvr": 0.8572705984115601, "loss_mode_switch": 0.0, "loss_total": 0.3226718306541443, "step": 1295 }, { "batch_size": 4, "epoch": 0.518, "step": 1295, "tokens_per_device": 3760 }, { "epoch": 0.518, "loss_ce": 0.17768608033657074, "loss_lvr": 0.4067903161048889, "loss_mode_switch": 0.0, "loss_total": 0.2183651179075241, "step": 1295 }, { "batch_size": 1, "epoch": 0.518, "step": 1295, "tokens_per_device": 7062 }, { "epoch": 0.518, "loss_ce": 0.0005391820450313389, "loss_lvr": 0.39069390296936035, "loss_mode_switch": 0.0, "loss_total": 0.03960857540369034, "step": 1295 }, { "batch_size": 4, "epoch": 0.518, "step": 1295, "tokens_per_device": 4384 }, { "epoch": 0.518, "loss_ce": 0.1734263300895691, "loss_lvr": 0.8179015517234802, "loss_mode_switch": 0.0, "loss_total": 0.25521647930145264, "step": 1295 }, { "batch_size": 4, "epoch": 0.518, "step": 1295, "tokens_per_device": 5332 }, { "epoch": 0.518, "loss_ce": 0.5447366833686829, "loss_lvr": 0.8347457647323608, "loss_mode_switch": 0.0, "loss_total": 0.628211259841919, "step": 1295 }, { "epoch": 0.5184, "grad_norm": 1.360050916671753, "learning_rate": 4.944942272402925e-06, "loss": 0.3232, "step": 1296 }, { "batch_size": 4, "epoch": 0.5184, "step": 1296, "tokens_per_device": 2576 }, { "epoch": 0.5184, "loss_ce": 0.4352578818798065, "loss_lvr": 1.0586774349212646, "loss_mode_switch": 0.0, "loss_total": 0.5411256551742554, "step": 1296 }, { "batch_size": 4, "epoch": 0.5184, "step": 1296, "tokens_per_device": 15016 }, { "epoch": 0.5184, "loss_ce": 0.04371752217411995, "loss_lvr": 0.689774215221405, "loss_mode_switch": 0.0, "loss_total": 0.11269494891166687, "step": 1296 }, { "batch_size": 4, "epoch": 0.5184, "step": 1296, "tokens_per_device": 1408 }, { "epoch": 0.5184, "loss_ce": 0.3199961185455322, "loss_lvr": 0.967447817325592, "loss_mode_switch": 0.0, "loss_total": 0.41674089431762695, "step": 1296 }, { "batch_size": 1, "epoch": 0.5184, "step": 1296, "tokens_per_device": 4864 }, { "epoch": 0.5184, "loss_ce": 0.0008425777195952833, "loss_lvr": 0.2612558901309967, "loss_mode_switch": 0.0, "loss_total": 0.02696816623210907, "step": 1296 }, { "batch_size": 1, "epoch": 0.5184, "step": 1296, "tokens_per_device": 5749 }, { "epoch": 0.5184, "loss_ce": 0.007062193937599659, "loss_lvr": 0.37817832827568054, "loss_mode_switch": 0.0, "loss_total": 0.04488002508878708, "step": 1296 }, { "batch_size": 4, "epoch": 0.5184, "step": 1296, "tokens_per_device": 4104 }, { "epoch": 0.5184, "loss_ce": 0.2512281537055969, "loss_lvr": 0.8677842617034912, "loss_mode_switch": 0.0, "loss_total": 0.3380065858364105, "step": 1296 }, { "batch_size": 4, "epoch": 0.5184, "step": 1296, "tokens_per_device": 1372 }, { "epoch": 0.5184, "loss_ce": 0.23739226162433624, "loss_lvr": 0.919110119342804, "loss_mode_switch": 0.0, "loss_total": 0.3293032646179199, "step": 1296 }, { "batch_size": 4, "epoch": 0.5184, "step": 1296, "tokens_per_device": 4188 }, { "epoch": 0.5184, "loss_ce": 0.15946519374847412, "loss_lvr": 0.8993377089500427, "loss_mode_switch": 0.0, "loss_total": 0.24939897656440735, "step": 1296 }, { "epoch": 0.5188, "grad_norm": 1.2994561195373535, "learning_rate": 4.938465202515524e-06, "loss": 0.2835, "step": 1297 }, { "batch_size": 1, "epoch": 0.5188, "step": 1297, "tokens_per_device": 5016 }, { "epoch": 0.5188, "loss_ce": 0.16417694091796875, "loss_lvr": 0.4480263888835907, "loss_mode_switch": 0.0, "loss_total": 0.20897957682609558, "step": 1297 }, { "batch_size": 1, "epoch": 0.5188, "step": 1297, "tokens_per_device": 4882 }, { "epoch": 0.5188, "loss_ce": 0.04052797704935074, "loss_lvr": 0.66623455286026, "loss_mode_switch": 0.0, "loss_total": 0.10715143382549286, "step": 1297 }, { "batch_size": 1, "epoch": 0.5188, "step": 1297, "tokens_per_device": 6829 }, { "epoch": 0.5188, "loss_ce": 0.08385727554559708, "loss_lvr": 0.33041107654571533, "loss_mode_switch": 0.0, "loss_total": 0.11689838767051697, "step": 1297 }, { "batch_size": 4, "epoch": 0.5188, "step": 1297, "tokens_per_device": 2676 }, { "epoch": 0.5188, "loss_ce": 0.3006559908390045, "loss_lvr": 0.8207573294639587, "loss_mode_switch": 0.0, "loss_total": 0.38273173570632935, "step": 1297 }, { "batch_size": 4, "epoch": 0.5188, "step": 1297, "tokens_per_device": 7084 }, { "epoch": 0.5188, "loss_ce": 0.5708049535751343, "loss_lvr": 0.7802651524543762, "loss_mode_switch": 0.0, "loss_total": 0.6488314867019653, "step": 1297 }, { "batch_size": 4, "epoch": 0.5188, "step": 1297, "tokens_per_device": 2712 }, { "epoch": 0.5188, "loss_ce": 0.0702367052435875, "loss_lvr": 1.1463623046875, "loss_mode_switch": 0.0, "loss_total": 0.18487294018268585, "step": 1297 }, { "batch_size": 4, "epoch": 0.5188, "step": 1297, "tokens_per_device": 3752 }, { "epoch": 0.5188, "loss_ce": 0.08424151688814163, "loss_lvr": 0.9737675189971924, "loss_mode_switch": 0.0, "loss_total": 0.18161827325820923, "step": 1297 }, { "batch_size": 4, "epoch": 0.5188, "step": 1297, "tokens_per_device": 2672 }, { "epoch": 0.5188, "loss_ce": 0.2686689496040344, "loss_lvr": 0.8400059342384338, "loss_mode_switch": 0.0, "loss_total": 0.35266953706741333, "step": 1297 }, { "epoch": 0.5192, "grad_norm": 1.2519313097000122, "learning_rate": 4.931988235903545e-06, "loss": 0.3099, "step": 1298 }, { "batch_size": 4, "epoch": 0.5192, "step": 1298, "tokens_per_device": 4808 }, { "epoch": 0.5192, "loss_ce": 0.025245847180485725, "loss_lvr": 1.2421740293502808, "loss_mode_switch": 0.0, "loss_total": 0.1494632512331009, "step": 1298 }, { "batch_size": 4, "epoch": 0.5192, "step": 1298, "tokens_per_device": 5000 }, { "epoch": 0.5192, "loss_ce": 0.06412411481142044, "loss_lvr": 0.9691705107688904, "loss_mode_switch": 0.0, "loss_total": 0.16104117035865784, "step": 1298 }, { "batch_size": 1, "epoch": 0.5192, "step": 1298, "tokens_per_device": 5150 }, { "epoch": 0.5192, "loss_ce": 0.04068618640303612, "loss_lvr": 0.26722824573516846, "loss_mode_switch": 0.0, "loss_total": 0.06740900874137878, "step": 1298 }, { "batch_size": 4, "epoch": 0.5192, "step": 1298, "tokens_per_device": 13088 }, { "epoch": 0.5192, "loss_ce": 0.3028572201728821, "loss_lvr": 0.8707371354103088, "loss_mode_switch": 0.0, "loss_total": 0.38993093371391296, "step": 1298 }, { "batch_size": 4, "epoch": 0.5192, "step": 1298, "tokens_per_device": 4436 }, { "epoch": 0.5192, "loss_ce": 0.2785763442516327, "loss_lvr": 0.6062856316566467, "loss_mode_switch": 0.0, "loss_total": 0.33920490741729736, "step": 1298 }, { "batch_size": 1, "epoch": 0.5192, "step": 1298, "tokens_per_device": 5097 }, { "epoch": 0.5192, "loss_ce": 0.1432068794965744, "loss_lvr": 0.43372228741645813, "loss_mode_switch": 0.0, "loss_total": 0.18657910823822021, "step": 1298 }, { "batch_size": 4, "epoch": 0.5192, "step": 1298, "tokens_per_device": 5752 }, { "epoch": 0.5192, "loss_ce": 0.13888561725616455, "loss_lvr": 0.888340175151825, "loss_mode_switch": 0.0, "loss_total": 0.22771963477134705, "step": 1298 }, { "batch_size": 4, "epoch": 0.5192, "step": 1298, "tokens_per_device": 4384 }, { "epoch": 0.5192, "loss_ce": 0.02269417978823185, "loss_lvr": 0.708467423915863, "loss_mode_switch": 0.0, "loss_total": 0.09354092180728912, "step": 1298 }, { "epoch": 0.5196, "grad_norm": 1.2135697603225708, "learning_rate": 4.925511383437446e-06, "loss": 0.26, "step": 1299 }, { "batch_size": 1, "epoch": 0.5196, "step": 1299, "tokens_per_device": 7173 }, { "epoch": 0.5196, "loss_ce": 0.0003085703938268125, "loss_lvr": 0.30001938343048096, "loss_mode_switch": 0.0, "loss_total": 0.030310507863759995, "step": 1299 }, { "batch_size": 4, "epoch": 0.5196, "step": 1299, "tokens_per_device": 4016 }, { "epoch": 0.5196, "loss_ce": 0.39850476384162903, "loss_lvr": 0.6931395530700684, "loss_mode_switch": 0.0, "loss_total": 0.4678187370300293, "step": 1299 }, { "batch_size": 1, "epoch": 0.5196, "step": 1299, "tokens_per_device": 4865 }, { "epoch": 0.5196, "loss_ce": 0.0009357539820484817, "loss_lvr": 0.6670386791229248, "loss_mode_switch": 0.0, "loss_total": 0.06763962656259537, "step": 1299 }, { "batch_size": 1, "epoch": 0.5196, "step": 1299, "tokens_per_device": 4864 }, { "epoch": 0.5196, "loss_ce": 0.0009963930351659656, "loss_lvr": 0.33871060609817505, "loss_mode_switch": 0.0, "loss_total": 0.03486745432019234, "step": 1299 }, { "batch_size": 1, "epoch": 0.5196, "step": 1299, "tokens_per_device": 4445 }, { "epoch": 0.5196, "loss_ce": 0.002826275071129203, "loss_lvr": 0.7079146504402161, "loss_mode_switch": 0.0, "loss_total": 0.07361774146556854, "step": 1299 }, { "batch_size": 4, "epoch": 0.5196, "step": 1299, "tokens_per_device": 6292 }, { "epoch": 0.5196, "loss_ce": 0.10257654637098312, "loss_lvr": 0.7134675979614258, "loss_mode_switch": 0.0, "loss_total": 0.1739233136177063, "step": 1299 }, { "batch_size": 4, "epoch": 0.5196, "step": 1299, "tokens_per_device": 2632 }, { "epoch": 0.5196, "loss_ce": 0.22541303932666779, "loss_lvr": 3.030404806137085, "loss_mode_switch": 0.0, "loss_total": 0.528453528881073, "step": 1299 }, { "batch_size": 4, "epoch": 0.5196, "step": 1299, "tokens_per_device": 5792 }, { "epoch": 0.5196, "loss_ce": 0.14546367526054382, "loss_lvr": 0.8493525981903076, "loss_mode_switch": 0.0, "loss_total": 0.23039893805980682, "step": 1299 }, { "epoch": 0.52, "grad_norm": 1.2665818929672241, "learning_rate": 4.919034655987493e-06, "loss": 0.2642, "step": 1300 }, { "batch_size": 4, "epoch": 0.52, "step": 1300, "tokens_per_device": 6356 }, { "epoch": 0.52, "loss_ce": 0.5751441121101379, "loss_lvr": 0.6321559548377991, "loss_mode_switch": 0.0, "loss_total": 0.6383597254753113, "step": 1300 }, { "batch_size": 4, "epoch": 0.52, "step": 1300, "tokens_per_device": 5776 }, { "epoch": 0.52, "loss_ce": 0.31272852420806885, "loss_lvr": 0.9527928829193115, "loss_mode_switch": 0.0, "loss_total": 0.40800780057907104, "step": 1300 }, { "batch_size": 4, "epoch": 0.52, "step": 1300, "tokens_per_device": 6020 }, { "epoch": 0.52, "loss_ce": 0.12384013086557388, "loss_lvr": 1.0106008052825928, "loss_mode_switch": 0.0, "loss_total": 0.22490021586418152, "step": 1300 }, { "batch_size": 4, "epoch": 0.52, "step": 1300, "tokens_per_device": 1376 }, { "epoch": 0.52, "loss_ce": 0.4606068730354309, "loss_lvr": 1.0166794061660767, "loss_mode_switch": 0.0, "loss_total": 0.5622748136520386, "step": 1300 }, { "batch_size": 4, "epoch": 0.52, "step": 1300, "tokens_per_device": 4056 }, { "epoch": 0.52, "loss_ce": 0.18312843143939972, "loss_lvr": 0.5380371809005737, "loss_mode_switch": 0.0, "loss_total": 0.2369321584701538, "step": 1300 }, { "batch_size": 1, "epoch": 0.52, "step": 1300, "tokens_per_device": 5524 }, { "epoch": 0.52, "loss_ce": 0.02039717137813568, "loss_lvr": 0.48842838406562805, "loss_mode_switch": 0.0, "loss_total": 0.0692400112748146, "step": 1300 }, { "batch_size": 4, "epoch": 0.52, "step": 1300, "tokens_per_device": 9872 }, { "epoch": 0.52, "loss_ce": 0.03945968300104141, "loss_lvr": 0.6819707751274109, "loss_mode_switch": 0.0, "loss_total": 0.10765676200389862, "step": 1300 }, { "batch_size": 4, "epoch": 0.52, "step": 1300, "tokens_per_device": 1436 }, { "epoch": 0.52, "loss_ce": 0.2055167257785797, "loss_lvr": 0.9634732007980347, "loss_mode_switch": 0.0, "loss_total": 0.30186405777931213, "step": 1300 }, { "epoch": 0.5204, "grad_norm": 1.3433012962341309, "learning_rate": 4.912558064423744e-06, "loss": 0.2675, "step": 1301 }, { "batch_size": 4, "epoch": 0.5204, "step": 1301, "tokens_per_device": 3428 }, { "epoch": 0.5204, "loss_ce": 0.29820516705513, "loss_lvr": 0.7933116555213928, "loss_mode_switch": 0.0, "loss_total": 0.3775363266468048, "step": 1301 }, { "batch_size": 1, "epoch": 0.5204, "step": 1301, "tokens_per_device": 4109 }, { "epoch": 0.5204, "loss_ce": 0.021226678043603897, "loss_lvr": 0.5762436389923096, "loss_mode_switch": 0.0, "loss_total": 0.07885104417800903, "step": 1301 }, { "batch_size": 1, "epoch": 0.5204, "step": 1301, "tokens_per_device": 5169 }, { "epoch": 0.5204, "loss_ce": 0.19178104400634766, "loss_lvr": 0.388092964887619, "loss_mode_switch": 0.0, "loss_total": 0.2305903434753418, "step": 1301 }, { "batch_size": 1, "epoch": 0.5204, "step": 1301, "tokens_per_device": 4897 }, { "epoch": 0.5204, "loss_ce": 0.0002289600670337677, "loss_lvr": 0.164082333445549, "loss_mode_switch": 0.0, "loss_total": 0.01663719303905964, "step": 1301 }, { "batch_size": 4, "epoch": 0.5204, "step": 1301, "tokens_per_device": 6688 }, { "epoch": 0.5204, "loss_ce": 0.45945027470588684, "loss_lvr": 0.8726824522018433, "loss_mode_switch": 0.0, "loss_total": 0.5467185378074646, "step": 1301 }, { "batch_size": 4, "epoch": 0.5204, "step": 1301, "tokens_per_device": 7076 }, { "epoch": 0.5204, "loss_ce": 0.09903012216091156, "loss_lvr": 0.4604122042655945, "loss_mode_switch": 0.0, "loss_total": 0.145071342587471, "step": 1301 }, { "batch_size": 4, "epoch": 0.5204, "step": 1301, "tokens_per_device": 4224 }, { "epoch": 0.5204, "loss_ce": 0.04042154178023338, "loss_lvr": 0.6548628807067871, "loss_mode_switch": 0.0, "loss_total": 0.10590782761573792, "step": 1301 }, { "batch_size": 1, "epoch": 0.5204, "step": 1301, "tokens_per_device": 4905 }, { "epoch": 0.5204, "loss_ce": 0.011826405301690102, "loss_lvr": 0.23153738677501678, "loss_mode_switch": 0.0, "loss_total": 0.03498014435172081, "step": 1301 }, { "epoch": 0.5208, "grad_norm": 1.2544221878051758, "learning_rate": 4.906081619616026e-06, "loss": 0.2549, "step": 1302 }, { "batch_size": 4, "epoch": 0.5208, "step": 1302, "tokens_per_device": 1504 }, { "epoch": 0.5208, "loss_ce": 0.4977160692214966, "loss_lvr": 0.7732920050621033, "loss_mode_switch": 0.0, "loss_total": 0.5750452876091003, "step": 1302 }, { "batch_size": 4, "epoch": 0.5208, "step": 1302, "tokens_per_device": 4332 }, { "epoch": 0.5208, "loss_ce": 0.15547488629817963, "loss_lvr": 0.5388184785842896, "loss_mode_switch": 0.0, "loss_total": 0.20935674011707306, "step": 1302 }, { "batch_size": 1, "epoch": 0.5208, "step": 1302, "tokens_per_device": 4904 }, { "epoch": 0.5208, "loss_ce": 0.1421343833208084, "loss_lvr": 0.3277232050895691, "loss_mode_switch": 0.0, "loss_total": 0.17490670084953308, "step": 1302 }, { "batch_size": 4, "epoch": 0.5208, "step": 1302, "tokens_per_device": 4276 }, { "epoch": 0.5208, "loss_ce": 0.5225706696510315, "loss_lvr": 0.8618022799491882, "loss_mode_switch": 0.0, "loss_total": 0.6087508797645569, "step": 1302 }, { "batch_size": 4, "epoch": 0.5208, "step": 1302, "tokens_per_device": 1280 }, { "epoch": 0.5208, "loss_ce": 0.22188782691955566, "loss_lvr": 1.0241276025772095, "loss_mode_switch": 0.0, "loss_total": 0.3243005871772766, "step": 1302 }, { "batch_size": 4, "epoch": 0.5208, "step": 1302, "tokens_per_device": 1592 }, { "epoch": 0.5208, "loss_ce": 0.4690761864185333, "loss_lvr": 1.4229403734207153, "loss_mode_switch": 0.0, "loss_total": 0.6113702058792114, "step": 1302 }, { "batch_size": 1, "epoch": 0.5208, "step": 1302, "tokens_per_device": 5102 }, { "epoch": 0.5208, "loss_ce": 0.033142123371362686, "loss_lvr": 0.22691485285758972, "loss_mode_switch": 0.0, "loss_total": 0.0558336079120636, "step": 1302 }, { "batch_size": 4, "epoch": 0.5208, "step": 1302, "tokens_per_device": 4172 }, { "epoch": 0.5208, "loss_ce": 0.3828699290752411, "loss_lvr": 1.046985149383545, "loss_mode_switch": 0.0, "loss_total": 0.4875684380531311, "step": 1302 }, { "epoch": 0.5212, "grad_norm": 1.3359447717666626, "learning_rate": 4.899605332433922e-06, "loss": 0.3261, "step": 1303 }, { "batch_size": 4, "epoch": 0.5212, "step": 1303, "tokens_per_device": 1252 }, { "epoch": 0.5212, "loss_ce": 0.1203935518860817, "loss_lvr": 1.227323055267334, "loss_mode_switch": 0.0, "loss_total": 0.24312585592269897, "step": 1303 }, { "batch_size": 1, "epoch": 0.5212, "step": 1303, "tokens_per_device": 7406 }, { "epoch": 0.5212, "loss_ce": 0.05445922911167145, "loss_lvr": 0.3726640045642853, "loss_mode_switch": 0.0, "loss_total": 0.09172563254833221, "step": 1303 }, { "batch_size": 4, "epoch": 0.5212, "step": 1303, "tokens_per_device": 5604 }, { "epoch": 0.5212, "loss_ce": 0.405527800321579, "loss_lvr": 0.7468575239181519, "loss_mode_switch": 0.0, "loss_total": 0.48021355271339417, "step": 1303 }, { "batch_size": 4, "epoch": 0.5212, "step": 1303, "tokens_per_device": 5796 }, { "epoch": 0.5212, "loss_ce": 0.1616837978363037, "loss_lvr": 0.7205566763877869, "loss_mode_switch": 0.0, "loss_total": 0.2337394654750824, "step": 1303 }, { "batch_size": 4, "epoch": 0.5212, "step": 1303, "tokens_per_device": 5344 }, { "epoch": 0.5212, "loss_ce": 0.3103296458721161, "loss_lvr": 0.9247984886169434, "loss_mode_switch": 0.0, "loss_total": 0.4028095006942749, "step": 1303 }, { "batch_size": 4, "epoch": 0.5212, "step": 1303, "tokens_per_device": 5144 }, { "epoch": 0.5212, "loss_ce": 0.04709264636039734, "loss_lvr": 0.8925711512565613, "loss_mode_switch": 0.0, "loss_total": 0.13634976744651794, "step": 1303 }, { "batch_size": 4, "epoch": 0.5212, "step": 1303, "tokens_per_device": 1544 }, { "epoch": 0.5212, "loss_ce": 0.2917054295539856, "loss_lvr": 0.8623661398887634, "loss_mode_switch": 0.0, "loss_total": 0.3779420554637909, "step": 1303 }, { "batch_size": 4, "epoch": 0.5212, "step": 1303, "tokens_per_device": 4296 }, { "epoch": 0.5212, "loss_ce": 0.5025737285614014, "loss_lvr": 1.0274794101715088, "loss_mode_switch": 0.0, "loss_total": 0.6053216457366943, "step": 1303 }, { "epoch": 0.5216, "grad_norm": 1.3309653997421265, "learning_rate": 4.8931292137467525e-06, "loss": 0.2815, "step": 1304 }, { "batch_size": 4, "epoch": 0.5216, "step": 1304, "tokens_per_device": 5064 }, { "epoch": 0.5216, "loss_ce": 0.12516887485980988, "loss_lvr": 0.8695307970046997, "loss_mode_switch": 0.0, "loss_total": 0.21212196350097656, "step": 1304 }, { "batch_size": 4, "epoch": 0.5216, "step": 1304, "tokens_per_device": 3996 }, { "epoch": 0.5216, "loss_ce": 0.08597972244024277, "loss_lvr": 0.627242922782898, "loss_mode_switch": 0.0, "loss_total": 0.14870402216911316, "step": 1304 }, { "batch_size": 4, "epoch": 0.5216, "step": 1304, "tokens_per_device": 1292 }, { "epoch": 0.5216, "loss_ce": 0.06671301275491714, "loss_lvr": 0.9265918135643005, "loss_mode_switch": 0.0, "loss_total": 0.15937219560146332, "step": 1304 }, { "batch_size": 4, "epoch": 0.5216, "step": 1304, "tokens_per_device": 4868 }, { "epoch": 0.5216, "loss_ce": 0.26745420694351196, "loss_lvr": 0.6895604729652405, "loss_mode_switch": 0.0, "loss_total": 0.336410254240036, "step": 1304 }, { "batch_size": 1, "epoch": 0.5216, "step": 1304, "tokens_per_device": 5078 }, { "epoch": 0.5216, "loss_ce": 0.0012690700823441148, "loss_lvr": 0.8253244757652283, "loss_mode_switch": 0.0, "loss_total": 0.0838015228509903, "step": 1304 }, { "batch_size": 1, "epoch": 0.5216, "step": 1304, "tokens_per_device": 5177 }, { "epoch": 0.5216, "loss_ce": 0.13038188219070435, "loss_lvr": 0.4295142889022827, "loss_mode_switch": 0.0, "loss_total": 0.1733333170413971, "step": 1304 }, { "batch_size": 1, "epoch": 0.5216, "step": 1304, "tokens_per_device": 5114 }, { "epoch": 0.5216, "loss_ce": 0.08933696895837784, "loss_lvr": 0.37736040353775024, "loss_mode_switch": 0.0, "loss_total": 0.1270730048418045, "step": 1304 }, { "batch_size": 1, "epoch": 0.5216, "step": 1304, "tokens_per_device": 5519 }, { "epoch": 0.5216, "loss_ce": 0.22056543827056885, "loss_lvr": 0.49095356464385986, "loss_mode_switch": 0.0, "loss_total": 0.2696608006954193, "step": 1304 }, { "epoch": 0.522, "grad_norm": 1.4002306461334229, "learning_rate": 4.886653274423551e-06, "loss": 0.2673, "step": 1305 }, { "batch_size": 4, "epoch": 0.522, "step": 1305, "tokens_per_device": 4216 }, { "epoch": 0.522, "loss_ce": 0.17881213128566742, "loss_lvr": 0.9464051127433777, "loss_mode_switch": 0.0, "loss_total": 0.27345263957977295, "step": 1305 }, { "batch_size": 4, "epoch": 0.522, "step": 1305, "tokens_per_device": 3108 }, { "epoch": 0.522, "loss_ce": 0.09982259571552277, "loss_lvr": 1.0917431116104126, "loss_mode_switch": 0.0, "loss_total": 0.20899690687656403, "step": 1305 }, { "batch_size": 4, "epoch": 0.522, "step": 1305, "tokens_per_device": 3924 }, { "epoch": 0.522, "loss_ce": 0.21683724224567413, "loss_lvr": 0.8902897238731384, "loss_mode_switch": 0.0, "loss_total": 0.30586621165275574, "step": 1305 }, { "batch_size": 1, "epoch": 0.522, "step": 1305, "tokens_per_device": 4558 }, { "epoch": 0.522, "loss_ce": 0.00045444496208801866, "loss_lvr": 0.42364344000816345, "loss_mode_switch": 0.0, "loss_total": 0.042818788439035416, "step": 1305 }, { "batch_size": 4, "epoch": 0.522, "step": 1305, "tokens_per_device": 4352 }, { "epoch": 0.522, "loss_ce": 0.09258221834897995, "loss_lvr": 0.7804810404777527, "loss_mode_switch": 0.0, "loss_total": 0.1706303209066391, "step": 1305 }, { "batch_size": 4, "epoch": 0.522, "step": 1305, "tokens_per_device": 1964 }, { "epoch": 0.522, "loss_ce": 0.2801361680030823, "loss_lvr": 0.9798712134361267, "loss_mode_switch": 0.0, "loss_total": 0.37812328338623047, "step": 1305 }, { "batch_size": 4, "epoch": 0.522, "step": 1305, "tokens_per_device": 3304 }, { "epoch": 0.522, "loss_ce": 0.21787306666374207, "loss_lvr": 0.8774206638336182, "loss_mode_switch": 0.0, "loss_total": 0.3056151270866394, "step": 1305 }, { "batch_size": 1, "epoch": 0.522, "step": 1305, "tokens_per_device": 4931 }, { "epoch": 0.522, "loss_ce": 0.039698656648397446, "loss_lvr": 0.5189934968948364, "loss_mode_switch": 0.0, "loss_total": 0.09159800410270691, "step": 1305 }, { "epoch": 0.5224, "grad_norm": 1.2461298704147339, "learning_rate": 4.880177525333051e-06, "loss": 0.2792, "step": 1306 }, { "batch_size": 4, "epoch": 0.5224, "step": 1306, "tokens_per_device": 4128 }, { "epoch": 0.5224, "loss_ce": 0.1380588263273239, "loss_lvr": 1.0767183303833008, "loss_mode_switch": 0.0, "loss_total": 0.2457306683063507, "step": 1306 }, { "batch_size": 4, "epoch": 0.5224, "step": 1306, "tokens_per_device": 3760 }, { "epoch": 0.5224, "loss_ce": 0.43190497159957886, "loss_lvr": 0.9756117463111877, "loss_mode_switch": 0.0, "loss_total": 0.5294661521911621, "step": 1306 }, { "batch_size": 1, "epoch": 0.5224, "step": 1306, "tokens_per_device": 4263 }, { "epoch": 0.5224, "loss_ce": 0.0009225767571479082, "loss_lvr": 0.34298720955848694, "loss_mode_switch": 0.0, "loss_total": 0.035221297293901443, "step": 1306 }, { "batch_size": 4, "epoch": 0.5224, "step": 1306, "tokens_per_device": 5672 }, { "epoch": 0.5224, "loss_ce": 0.28961533308029175, "loss_lvr": 0.9524214267730713, "loss_mode_switch": 0.0, "loss_total": 0.3848574757575989, "step": 1306 }, { "batch_size": 1, "epoch": 0.5224, "step": 1306, "tokens_per_device": 4642 }, { "epoch": 0.5224, "loss_ce": 0.0049290950410068035, "loss_lvr": 1.6486766338348389, "loss_mode_switch": 0.0, "loss_total": 0.16979676485061646, "step": 1306 }, { "batch_size": 4, "epoch": 0.5224, "step": 1306, "tokens_per_device": 3932 }, { "epoch": 0.5224, "loss_ce": 0.3630084693431854, "loss_lvr": 0.9993470311164856, "loss_mode_switch": 0.0, "loss_total": 0.4629431664943695, "step": 1306 }, { "batch_size": 1, "epoch": 0.5224, "step": 1306, "tokens_per_device": 4914 }, { "epoch": 0.5224, "loss_ce": 0.07258016616106033, "loss_lvr": 0.3764340281486511, "loss_mode_switch": 0.0, "loss_total": 0.11022356897592545, "step": 1306 }, { "batch_size": 4, "epoch": 0.5224, "step": 1306, "tokens_per_device": 3400 }, { "epoch": 0.5224, "loss_ce": 0.08406006544828415, "loss_lvr": 0.9266835451126099, "loss_mode_switch": 0.0, "loss_total": 0.17672842741012573, "step": 1306 }, { "epoch": 0.5228, "grad_norm": 1.1449503898620605, "learning_rate": 4.873701977343667e-06, "loss": 0.2826, "step": 1307 }, { "batch_size": 1, "epoch": 0.5228, "step": 1307, "tokens_per_device": 4857 }, { "epoch": 0.5228, "loss_ce": 0.0005234675481915474, "loss_lvr": 0.36250871419906616, "loss_mode_switch": 0.0, "loss_total": 0.03677433729171753, "step": 1307 }, { "batch_size": 4, "epoch": 0.5228, "step": 1307, "tokens_per_device": 5668 }, { "epoch": 0.5228, "loss_ce": 0.18447710573673248, "loss_lvr": 0.7678259611129761, "loss_mode_switch": 0.0, "loss_total": 0.26125970482826233, "step": 1307 }, { "batch_size": 1, "epoch": 0.5228, "step": 1307, "tokens_per_device": 5276 }, { "epoch": 0.5228, "loss_ce": 0.04502798989415169, "loss_lvr": 0.30449312925338745, "loss_mode_switch": 0.0, "loss_total": 0.07547730207443237, "step": 1307 }, { "batch_size": 4, "epoch": 0.5228, "step": 1307, "tokens_per_device": 4512 }, { "epoch": 0.5228, "loss_ce": 0.2854871451854706, "loss_lvr": 0.9419378042221069, "loss_mode_switch": 0.0, "loss_total": 0.37968093156814575, "step": 1307 }, { "batch_size": 4, "epoch": 0.5228, "step": 1307, "tokens_per_device": 1572 }, { "epoch": 0.5228, "loss_ce": 0.7137339115142822, "loss_lvr": 0.9006138443946838, "loss_mode_switch": 0.0, "loss_total": 0.8037952780723572, "step": 1307 }, { "batch_size": 4, "epoch": 0.5228, "step": 1307, "tokens_per_device": 1960 }, { "epoch": 0.5228, "loss_ce": 0.4339141547679901, "loss_lvr": 1.4629981517791748, "loss_mode_switch": 0.0, "loss_total": 0.5802139639854431, "step": 1307 }, { "batch_size": 1, "epoch": 0.5228, "step": 1307, "tokens_per_device": 5616 }, { "epoch": 0.5228, "loss_ce": 0.007182449102401733, "loss_lvr": 0.3356468975543976, "loss_mode_switch": 0.0, "loss_total": 0.04074713960289955, "step": 1307 }, { "batch_size": 1, "epoch": 0.5228, "step": 1307, "tokens_per_device": 4822 }, { "epoch": 0.5228, "loss_ce": 0.0003798821126110852, "loss_lvr": 0.12719383835792542, "loss_mode_switch": 0.0, "loss_total": 0.013099266216158867, "step": 1307 }, { "epoch": 0.5232, "grad_norm": 1.2592910528182983, "learning_rate": 4.867226641323481e-06, "loss": 0.2968, "step": 1308 }, { "batch_size": 1, "epoch": 0.5232, "step": 1308, "tokens_per_device": 4933 }, { "epoch": 0.5232, "loss_ce": 0.043964486569166183, "loss_lvr": 0.5505694150924683, "loss_mode_switch": 0.0, "loss_total": 0.09902142733335495, "step": 1308 }, { "batch_size": 1, "epoch": 0.5232, "step": 1308, "tokens_per_device": 5165 }, { "epoch": 0.5232, "loss_ce": 0.006282842252403498, "loss_lvr": 0.6162218451499939, "loss_mode_switch": 0.0, "loss_total": 0.06790502369403839, "step": 1308 }, { "batch_size": 4, "epoch": 0.5232, "step": 1308, "tokens_per_device": 10576 }, { "epoch": 0.5232, "loss_ce": 0.33868274092674255, "loss_lvr": 0.8529160022735596, "loss_mode_switch": 0.0, "loss_total": 0.42397433519363403, "step": 1308 }, { "batch_size": 4, "epoch": 0.5232, "step": 1308, "tokens_per_device": 4276 }, { "epoch": 0.5232, "loss_ce": 0.29394444823265076, "loss_lvr": 0.8084201216697693, "loss_mode_switch": 0.0, "loss_total": 0.37478646636009216, "step": 1308 }, { "batch_size": 4, "epoch": 0.5232, "step": 1308, "tokens_per_device": 2296 }, { "epoch": 0.5232, "loss_ce": 0.29562821984291077, "loss_lvr": 0.7611331343650818, "loss_mode_switch": 0.0, "loss_total": 0.37174153327941895, "step": 1308 }, { "batch_size": 4, "epoch": 0.5232, "step": 1308, "tokens_per_device": 11020 }, { "epoch": 0.5232, "loss_ce": 0.20013326406478882, "loss_lvr": 0.8672370314598083, "loss_mode_switch": 0.0, "loss_total": 0.2868569791316986, "step": 1308 }, { "batch_size": 4, "epoch": 0.5232, "step": 1308, "tokens_per_device": 9212 }, { "epoch": 0.5232, "loss_ce": 0.26991838216781616, "loss_lvr": 1.1239798069000244, "loss_mode_switch": 0.0, "loss_total": 0.38231635093688965, "step": 1308 }, { "batch_size": 4, "epoch": 0.5232, "step": 1308, "tokens_per_device": 4308 }, { "epoch": 0.5232, "loss_ce": 0.565119743347168, "loss_lvr": 0.9379798173904419, "loss_mode_switch": 0.0, "loss_total": 0.6589177250862122, "step": 1308 }, { "epoch": 0.5236, "grad_norm": 1.7396042346954346, "learning_rate": 4.860751528140209e-06, "loss": 0.2611, "step": 1309 }, { "batch_size": 4, "epoch": 0.5236, "step": 1309, "tokens_per_device": 6172 }, { "epoch": 0.5236, "loss_ce": 0.24890772998332977, "loss_lvr": 0.7104283571243286, "loss_mode_switch": 0.0, "loss_total": 0.31995058059692383, "step": 1309 }, { "batch_size": 4, "epoch": 0.5236, "step": 1309, "tokens_per_device": 7200 }, { "epoch": 0.5236, "loss_ce": 0.2259756624698639, "loss_lvr": 0.9955344200134277, "loss_mode_switch": 0.0, "loss_total": 0.3255290985107422, "step": 1309 }, { "batch_size": 4, "epoch": 0.5236, "step": 1309, "tokens_per_device": 4248 }, { "epoch": 0.5236, "loss_ce": 0.1645488440990448, "loss_lvr": 1.090438961982727, "loss_mode_switch": 0.0, "loss_total": 0.2735927402973175, "step": 1309 }, { "batch_size": 4, "epoch": 0.5236, "step": 1309, "tokens_per_device": 4480 }, { "epoch": 0.5236, "loss_ce": 0.09499441832304001, "loss_lvr": 0.7784069776535034, "loss_mode_switch": 0.0, "loss_total": 0.172835111618042, "step": 1309 }, { "batch_size": 4, "epoch": 0.5236, "step": 1309, "tokens_per_device": 2332 }, { "epoch": 0.5236, "loss_ce": 0.28523916006088257, "loss_lvr": 0.8433293700218201, "loss_mode_switch": 0.0, "loss_total": 0.36957210302352905, "step": 1309 }, { "batch_size": 4, "epoch": 0.5236, "step": 1309, "tokens_per_device": 13804 }, { "epoch": 0.5236, "loss_ce": 0.09206157177686691, "loss_lvr": 0.7251240015029907, "loss_mode_switch": 0.0, "loss_total": 0.16457396745681763, "step": 1309 }, { "batch_size": 1, "epoch": 0.5236, "step": 1309, "tokens_per_device": 5092 }, { "epoch": 0.5236, "loss_ce": 0.02991081401705742, "loss_lvr": 0.12333928048610687, "loss_mode_switch": 0.0, "loss_total": 0.042244743555784225, "step": 1309 }, { "batch_size": 4, "epoch": 0.5236, "step": 1309, "tokens_per_device": 9460 }, { "epoch": 0.5236, "loss_ce": 0.2002931535243988, "loss_lvr": 0.9574525952339172, "loss_mode_switch": 0.0, "loss_total": 0.296038419008255, "step": 1309 }, { "epoch": 0.524, "grad_norm": 1.6177089214324951, "learning_rate": 4.8542766486612035e-06, "loss": 0.273, "step": 1310 }, { "batch_size": 1, "epoch": 0.524, "step": 1310, "tokens_per_device": 4904 }, { "epoch": 0.524, "loss_ce": 0.006424732971936464, "loss_lvr": 0.9470155835151672, "loss_mode_switch": 0.0, "loss_total": 0.1011262908577919, "step": 1310 }, { "batch_size": 1, "epoch": 0.524, "step": 1310, "tokens_per_device": 4894 }, { "epoch": 0.524, "loss_ce": 0.0009474402177147567, "loss_lvr": 0.34745949506759644, "loss_mode_switch": 0.0, "loss_total": 0.035693392157554626, "step": 1310 }, { "batch_size": 4, "epoch": 0.524, "step": 1310, "tokens_per_device": 3912 }, { "epoch": 0.524, "loss_ce": 0.13927923142910004, "loss_lvr": 0.8125774264335632, "loss_mode_switch": 0.0, "loss_total": 0.2205369770526886, "step": 1310 }, { "batch_size": 4, "epoch": 0.524, "step": 1310, "tokens_per_device": 6168 }, { "epoch": 0.524, "loss_ce": 0.17430974543094635, "loss_lvr": 0.8005385994911194, "loss_mode_switch": 0.0, "loss_total": 0.2543635964393616, "step": 1310 }, { "batch_size": 4, "epoch": 0.524, "step": 1310, "tokens_per_device": 5520 }, { "epoch": 0.524, "loss_ce": 0.04309326782822609, "loss_lvr": 0.9508199095726013, "loss_mode_switch": 0.0, "loss_total": 0.13817526400089264, "step": 1310 }, { "batch_size": 4, "epoch": 0.524, "step": 1310, "tokens_per_device": 5704 }, { "epoch": 0.524, "loss_ce": 0.10825636982917786, "loss_lvr": 0.6694634556770325, "loss_mode_switch": 0.0, "loss_total": 0.17520272731781006, "step": 1310 }, { "batch_size": 1, "epoch": 0.524, "step": 1310, "tokens_per_device": 5122 }, { "epoch": 0.524, "loss_ce": 0.0977834016084671, "loss_lvr": 0.7539692521095276, "loss_mode_switch": 0.0, "loss_total": 0.17318032681941986, "step": 1310 }, { "batch_size": 1, "epoch": 0.524, "step": 1310, "tokens_per_device": 4964 }, { "epoch": 0.524, "loss_ce": 0.00532975560054183, "loss_lvr": 0.8388146162033081, "loss_mode_switch": 0.0, "loss_total": 0.08921121805906296, "step": 1310 }, { "epoch": 0.5244, "grad_norm": 1.3915791511535645, "learning_rate": 4.847802013753415e-06, "loss": 0.28, "step": 1311 }, { "batch_size": 4, "epoch": 0.5244, "step": 1311, "tokens_per_device": 3940 }, { "epoch": 0.5244, "loss_ce": 0.2988484799861908, "loss_lvr": 0.9637842178344727, "loss_mode_switch": 0.0, "loss_total": 0.3952268958091736, "step": 1311 }, { "batch_size": 4, "epoch": 0.5244, "step": 1311, "tokens_per_device": 4200 }, { "epoch": 0.5244, "loss_ce": 0.05518524721264839, "loss_lvr": 0.8930599689483643, "loss_mode_switch": 0.0, "loss_total": 0.14449124038219452, "step": 1311 }, { "batch_size": 4, "epoch": 0.5244, "step": 1311, "tokens_per_device": 1372 }, { "epoch": 0.5244, "loss_ce": 0.4365363121032715, "loss_lvr": 0.7774926424026489, "loss_mode_switch": 0.0, "loss_total": 0.5142855644226074, "step": 1311 }, { "batch_size": 4, "epoch": 0.5244, "step": 1311, "tokens_per_device": 5484 }, { "epoch": 0.5244, "loss_ce": 0.45191749930381775, "loss_lvr": 0.590546190738678, "loss_mode_switch": 0.0, "loss_total": 0.5109721422195435, "step": 1311 }, { "batch_size": 4, "epoch": 0.5244, "step": 1311, "tokens_per_device": 6852 }, { "epoch": 0.5244, "loss_ce": 0.16554391384124756, "loss_lvr": 0.8877963423728943, "loss_mode_switch": 0.0, "loss_total": 0.2543235421180725, "step": 1311 }, { "batch_size": 4, "epoch": 0.5244, "step": 1311, "tokens_per_device": 7288 }, { "epoch": 0.5244, "loss_ce": 0.0866720974445343, "loss_lvr": 0.7347792387008667, "loss_mode_switch": 0.0, "loss_total": 0.16015002131462097, "step": 1311 }, { "batch_size": 4, "epoch": 0.5244, "step": 1311, "tokens_per_device": 4224 }, { "epoch": 0.5244, "loss_ce": 0.1600596010684967, "loss_lvr": 0.7145724892616272, "loss_mode_switch": 0.0, "loss_total": 0.23151685297489166, "step": 1311 }, { "batch_size": 1, "epoch": 0.5244, "step": 1311, "tokens_per_device": 5065 }, { "epoch": 0.5244, "loss_ce": 0.28357720375061035, "loss_lvr": 0.3817897140979767, "loss_mode_switch": 0.0, "loss_total": 0.32175618410110474, "step": 1311 }, { "epoch": 0.5248, "grad_norm": 1.2314119338989258, "learning_rate": 4.841327634283392e-06, "loss": 0.2659, "step": 1312 }, { "batch_size": 1, "epoch": 0.5248, "step": 1312, "tokens_per_device": 4835 }, { "epoch": 0.5248, "loss_ce": 0.007328706793487072, "loss_lvr": 0.35331594944000244, "loss_mode_switch": 0.0, "loss_total": 0.04266030341386795, "step": 1312 }, { "batch_size": 1, "epoch": 0.5248, "step": 1312, "tokens_per_device": 4883 }, { "epoch": 0.5248, "loss_ce": 0.002326439833268523, "loss_lvr": 0.3514515459537506, "loss_mode_switch": 0.0, "loss_total": 0.03747159615159035, "step": 1312 }, { "batch_size": 1, "epoch": 0.5248, "step": 1312, "tokens_per_device": 5242 }, { "epoch": 0.5248, "loss_ce": 0.013916085474193096, "loss_lvr": 0.35027414560317993, "loss_mode_switch": 0.0, "loss_total": 0.048943500965833664, "step": 1312 }, { "batch_size": 1, "epoch": 0.5248, "step": 1312, "tokens_per_device": 4864 }, { "epoch": 0.5248, "loss_ce": 0.1500396430492401, "loss_lvr": 0.1855599582195282, "loss_mode_switch": 0.0, "loss_total": 0.16859564185142517, "step": 1312 }, { "batch_size": 4, "epoch": 0.5248, "step": 1312, "tokens_per_device": 2900 }, { "epoch": 0.5248, "loss_ce": 0.6054635047912598, "loss_lvr": 0.8252657651901245, "loss_mode_switch": 0.0, "loss_total": 0.6879900693893433, "step": 1312 }, { "batch_size": 4, "epoch": 0.5248, "step": 1312, "tokens_per_device": 5712 }, { "epoch": 0.5248, "loss_ce": 0.4998513162136078, "loss_lvr": 0.877440869808197, "loss_mode_switch": 0.0, "loss_total": 0.5875954031944275, "step": 1312 }, { "batch_size": 1, "epoch": 0.5248, "step": 1312, "tokens_per_device": 4890 }, { "epoch": 0.5248, "loss_ce": 0.17606845498085022, "loss_lvr": 0.29060959815979004, "loss_mode_switch": 0.0, "loss_total": 0.20512941479682922, "step": 1312 }, { "batch_size": 4, "epoch": 0.5248, "step": 1312, "tokens_per_device": 2528 }, { "epoch": 0.5248, "loss_ce": 0.45650091767311096, "loss_lvr": 1.1611533164978027, "loss_mode_switch": 0.0, "loss_total": 0.5726162195205688, "step": 1312 }, { "epoch": 0.5252, "grad_norm": 1.3309009075164795, "learning_rate": 4.834853521117251e-06, "loss": 0.2747, "step": 1313 }, { "batch_size": 4, "epoch": 0.5252, "step": 1313, "tokens_per_device": 2876 }, { "epoch": 0.5252, "loss_ce": 0.35142600536346436, "loss_lvr": 0.54709392786026, "loss_mode_switch": 0.0, "loss_total": 0.4061354100704193, "step": 1313 }, { "batch_size": 4, "epoch": 0.5252, "step": 1313, "tokens_per_device": 4284 }, { "epoch": 0.5252, "loss_ce": 0.49093204736709595, "loss_lvr": 0.6916158199310303, "loss_mode_switch": 0.0, "loss_total": 0.5600936412811279, "step": 1313 }, { "batch_size": 4, "epoch": 0.5252, "step": 1313, "tokens_per_device": 5036 }, { "epoch": 0.5252, "loss_ce": 0.13047081232070923, "loss_lvr": 0.8989426493644714, "loss_mode_switch": 0.0, "loss_total": 0.22036507725715637, "step": 1313 }, { "batch_size": 4, "epoch": 0.5252, "step": 1313, "tokens_per_device": 5056 }, { "epoch": 0.5252, "loss_ce": 0.6050385236740112, "loss_lvr": 0.7424412369728088, "loss_mode_switch": 0.0, "loss_total": 0.6792826652526855, "step": 1313 }, { "batch_size": 4, "epoch": 0.5252, "step": 1313, "tokens_per_device": 3836 }, { "epoch": 0.5252, "loss_ce": 0.5814791321754456, "loss_lvr": 1.0431822538375854, "loss_mode_switch": 0.0, "loss_total": 0.6857973337173462, "step": 1313 }, { "batch_size": 4, "epoch": 0.5252, "step": 1313, "tokens_per_device": 9632 }, { "epoch": 0.5252, "loss_ce": 0.5197076797485352, "loss_lvr": 0.7685743570327759, "loss_mode_switch": 0.0, "loss_total": 0.5965651273727417, "step": 1313 }, { "batch_size": 4, "epoch": 0.5252, "step": 1313, "tokens_per_device": 3936 }, { "epoch": 0.5252, "loss_ce": 0.18007996678352356, "loss_lvr": 2.0030205249786377, "loss_mode_switch": 0.0, "loss_total": 0.3803820013999939, "step": 1313 }, { "batch_size": 4, "epoch": 0.5252, "step": 1313, "tokens_per_device": 5212 }, { "epoch": 0.5252, "loss_ce": 0.06922518461942673, "loss_lvr": 0.5622938275337219, "loss_mode_switch": 0.0, "loss_total": 0.12545457482337952, "step": 1313 }, { "epoch": 0.5256, "grad_norm": 1.5197467803955078, "learning_rate": 4.828379685120659e-06, "loss": 0.3342, "step": 1314 }, { "batch_size": 4, "epoch": 0.5256, "step": 1314, "tokens_per_device": 2644 }, { "epoch": 0.5256, "loss_ce": 0.4355192184448242, "loss_lvr": 0.8849655389785767, "loss_mode_switch": 0.0, "loss_total": 0.5240157842636108, "step": 1314 }, { "batch_size": 4, "epoch": 0.5256, "step": 1314, "tokens_per_device": 4096 }, { "epoch": 0.5256, "loss_ce": 0.5095131993293762, "loss_lvr": 0.9412026405334473, "loss_mode_switch": 0.0, "loss_total": 0.603633463382721, "step": 1314 }, { "batch_size": 4, "epoch": 0.5256, "step": 1314, "tokens_per_device": 1300 }, { "epoch": 0.5256, "loss_ce": 0.34195655584335327, "loss_lvr": 1.0349767208099365, "loss_mode_switch": 0.0, "loss_total": 0.4454542398452759, "step": 1314 }, { "batch_size": 4, "epoch": 0.5256, "step": 1314, "tokens_per_device": 5084 }, { "epoch": 0.5256, "loss_ce": 0.6995028853416443, "loss_lvr": 0.6976193785667419, "loss_mode_switch": 0.0, "loss_total": 0.769264817237854, "step": 1314 }, { "batch_size": 1, "epoch": 0.5256, "step": 1314, "tokens_per_device": 4886 }, { "epoch": 0.5256, "loss_ce": 0.3607892692089081, "loss_lvr": 0.18120084702968597, "loss_mode_switch": 0.0, "loss_total": 0.3789093494415283, "step": 1314 }, { "batch_size": 1, "epoch": 0.5256, "step": 1314, "tokens_per_device": 4711 }, { "epoch": 0.5256, "loss_ce": 0.2503705322742462, "loss_lvr": 0.35830968618392944, "loss_mode_switch": 0.0, "loss_total": 0.28620150685310364, "step": 1314 }, { "batch_size": 4, "epoch": 0.5256, "step": 1314, "tokens_per_device": 4220 }, { "epoch": 0.5256, "loss_ce": 0.4631069600582123, "loss_lvr": 0.8461882472038269, "loss_mode_switch": 0.0, "loss_total": 0.5477257966995239, "step": 1314 }, { "batch_size": 4, "epoch": 0.5256, "step": 1314, "tokens_per_device": 2636 }, { "epoch": 0.5256, "loss_ce": 0.24048282206058502, "loss_lvr": 1.0042065382003784, "loss_mode_switch": 0.0, "loss_total": 0.34090346097946167, "step": 1314 }, { "epoch": 0.526, "grad_norm": 1.2848104238510132, "learning_rate": 4.821906137158822e-06, "loss": 0.2979, "step": 1315 }, { "batch_size": 4, "epoch": 0.526, "step": 1315, "tokens_per_device": 7460 }, { "epoch": 0.526, "loss_ce": 0.17141640186309814, "loss_lvr": 1.0915576219558716, "loss_mode_switch": 0.0, "loss_total": 0.28057217597961426, "step": 1315 }, { "batch_size": 4, "epoch": 0.526, "step": 1315, "tokens_per_device": 7012 }, { "epoch": 0.526, "loss_ce": 0.4554525315761566, "loss_lvr": 0.7890079617500305, "loss_mode_switch": 0.0, "loss_total": 0.5343533158302307, "step": 1315 }, { "batch_size": 4, "epoch": 0.526, "step": 1315, "tokens_per_device": 12032 }, { "epoch": 0.526, "loss_ce": 0.2995482087135315, "loss_lvr": 0.3402467668056488, "loss_mode_switch": 0.0, "loss_total": 0.3335728943347931, "step": 1315 }, { "batch_size": 1, "epoch": 0.526, "step": 1315, "tokens_per_device": 4731 }, { "epoch": 0.526, "loss_ce": 0.00018180902407038957, "loss_lvr": 0.23857831954956055, "loss_mode_switch": 0.0, "loss_total": 0.02403964102268219, "step": 1315 }, { "batch_size": 4, "epoch": 0.526, "step": 1315, "tokens_per_device": 4052 }, { "epoch": 0.526, "loss_ce": 0.17542581260204315, "loss_lvr": 0.9809847474098206, "loss_mode_switch": 0.0, "loss_total": 0.27352428436279297, "step": 1315 }, { "batch_size": 4, "epoch": 0.526, "step": 1315, "tokens_per_device": 4864 }, { "epoch": 0.526, "loss_ce": 0.1857953816652298, "loss_lvr": 0.9074702858924866, "loss_mode_switch": 0.0, "loss_total": 0.27654242515563965, "step": 1315 }, { "batch_size": 4, "epoch": 0.526, "step": 1315, "tokens_per_device": 2580 }, { "epoch": 0.526, "loss_ce": 0.4348866641521454, "loss_lvr": 0.942147970199585, "loss_mode_switch": 0.0, "loss_total": 0.5291014909744263, "step": 1315 }, { "batch_size": 1, "epoch": 0.526, "step": 1315, "tokens_per_device": 5109 }, { "epoch": 0.526, "loss_ce": 0.07800241559743881, "loss_lvr": 0.45926153659820557, "loss_mode_switch": 0.0, "loss_total": 0.12392856925725937, "step": 1315 }, { "epoch": 0.5264, "grad_norm": 1.223312258720398, "learning_rate": 4.815432888096459e-06, "loss": 0.3083, "step": 1316 }, { "batch_size": 1, "epoch": 0.5264, "step": 1316, "tokens_per_device": 4847 }, { "epoch": 0.5264, "loss_ce": 0.0018898795824497938, "loss_lvr": 0.22427554428577423, "loss_mode_switch": 0.0, "loss_total": 0.024317434057593346, "step": 1316 }, { "batch_size": 4, "epoch": 0.5264, "step": 1316, "tokens_per_device": 2668 }, { "epoch": 0.5264, "loss_ce": 0.37265104055404663, "loss_lvr": 0.9217472076416016, "loss_mode_switch": 0.0, "loss_total": 0.46482574939727783, "step": 1316 }, { "batch_size": 1, "epoch": 0.5264, "step": 1316, "tokens_per_device": 4875 }, { "epoch": 0.5264, "loss_ce": 0.007292419672012329, "loss_lvr": 0.24168133735656738, "loss_mode_switch": 0.0, "loss_total": 0.03146055340766907, "step": 1316 }, { "batch_size": 4, "epoch": 0.5264, "step": 1316, "tokens_per_device": 4260 }, { "epoch": 0.5264, "loss_ce": 0.313874214887619, "loss_lvr": 0.8755505084991455, "loss_mode_switch": 0.0, "loss_total": 0.40142926573753357, "step": 1316 }, { "batch_size": 4, "epoch": 0.5264, "step": 1316, "tokens_per_device": 15088 }, { "epoch": 0.5264, "loss_ce": 0.5754006505012512, "loss_lvr": 0.786399245262146, "loss_mode_switch": 0.0, "loss_total": 0.6540405750274658, "step": 1316 }, { "batch_size": 4, "epoch": 0.5264, "step": 1316, "tokens_per_device": 6916 }, { "epoch": 0.5264, "loss_ce": 0.5482397079467773, "loss_lvr": 0.7343340516090393, "loss_mode_switch": 0.0, "loss_total": 0.6216731071472168, "step": 1316 }, { "batch_size": 4, "epoch": 0.5264, "step": 1316, "tokens_per_device": 6052 }, { "epoch": 0.5264, "loss_ce": 0.1052127331495285, "loss_lvr": 0.677141547203064, "loss_mode_switch": 0.0, "loss_total": 0.1729268878698349, "step": 1316 }, { "batch_size": 1, "epoch": 0.5264, "step": 1316, "tokens_per_device": 4898 }, { "epoch": 0.5264, "loss_ce": 0.07840968668460846, "loss_lvr": 0.4354046583175659, "loss_mode_switch": 0.0, "loss_total": 0.12195014953613281, "step": 1316 }, { "epoch": 0.5268, "grad_norm": 1.2552363872528076, "learning_rate": 4.808959948797793e-06, "loss": 0.28, "step": 1317 }, { "batch_size": 4, "epoch": 0.5268, "step": 1317, "tokens_per_device": 4260 }, { "epoch": 0.5268, "loss_ce": 0.0739348977804184, "loss_lvr": 0.949444055557251, "loss_mode_switch": 0.0, "loss_total": 0.16887930035591125, "step": 1317 }, { "batch_size": 4, "epoch": 0.5268, "step": 1317, "tokens_per_device": 3808 }, { "epoch": 0.5268, "loss_ce": 0.22168558835983276, "loss_lvr": 0.953662097454071, "loss_mode_switch": 0.0, "loss_total": 0.31705179810523987, "step": 1317 }, { "batch_size": 4, "epoch": 0.5268, "step": 1317, "tokens_per_device": 4504 }, { "epoch": 0.5268, "loss_ce": 0.08820410817861557, "loss_lvr": 0.7311446070671082, "loss_mode_switch": 0.0, "loss_total": 0.1613185703754425, "step": 1317 }, { "batch_size": 4, "epoch": 0.5268, "step": 1317, "tokens_per_device": 4292 }, { "epoch": 0.5268, "loss_ce": 0.34758713841438293, "loss_lvr": 0.8706008791923523, "loss_mode_switch": 0.0, "loss_total": 0.43464723229408264, "step": 1317 }, { "batch_size": 1, "epoch": 0.5268, "step": 1317, "tokens_per_device": 4846 }, { "epoch": 0.5268, "loss_ce": 0.0016934624873101711, "loss_lvr": 0.5990480780601501, "loss_mode_switch": 0.0, "loss_total": 0.0615982711315155, "step": 1317 }, { "batch_size": 4, "epoch": 0.5268, "step": 1317, "tokens_per_device": 4496 }, { "epoch": 0.5268, "loss_ce": 0.29988226294517517, "loss_lvr": 1.0245333909988403, "loss_mode_switch": 0.0, "loss_total": 0.40233561396598816, "step": 1317 }, { "batch_size": 4, "epoch": 0.5268, "step": 1317, "tokens_per_device": 4288 }, { "epoch": 0.5268, "loss_ce": 0.7536452412605286, "loss_lvr": 0.8064622282981873, "loss_mode_switch": 0.0, "loss_total": 0.8342914581298828, "step": 1317 }, { "batch_size": 4, "epoch": 0.5268, "step": 1317, "tokens_per_device": 2044 }, { "epoch": 0.5268, "loss_ce": 0.23984023928642273, "loss_lvr": 0.8177553415298462, "loss_mode_switch": 0.0, "loss_total": 0.3216157853603363, "step": 1317 }, { "epoch": 0.5272, "grad_norm": 1.3713370561599731, "learning_rate": 4.802487330126519e-06, "loss": 0.3684, "step": 1318 }, { "batch_size": 4, "epoch": 0.5272, "step": 1318, "tokens_per_device": 8288 }, { "epoch": 0.5272, "loss_ce": 0.14621254801750183, "loss_lvr": 0.8826863169670105, "loss_mode_switch": 0.0, "loss_total": 0.23448118567466736, "step": 1318 }, { "batch_size": 4, "epoch": 0.5272, "step": 1318, "tokens_per_device": 1720 }, { "epoch": 0.5272, "loss_ce": 0.0812687948346138, "loss_lvr": 0.9127999544143677, "loss_mode_switch": 0.0, "loss_total": 0.1725488007068634, "step": 1318 }, { "batch_size": 1, "epoch": 0.5272, "step": 1318, "tokens_per_device": 4298 }, { "epoch": 0.5272, "loss_ce": 0.013918467797338963, "loss_lvr": 0.41111722588539124, "loss_mode_switch": 0.0, "loss_total": 0.05503018945455551, "step": 1318 }, { "batch_size": 4, "epoch": 0.5272, "step": 1318, "tokens_per_device": 3932 }, { "epoch": 0.5272, "loss_ce": 0.3990509808063507, "loss_lvr": 1.1053487062454224, "loss_mode_switch": 0.0, "loss_total": 0.5095858573913574, "step": 1318 }, { "batch_size": 4, "epoch": 0.5272, "step": 1318, "tokens_per_device": 2552 }, { "epoch": 0.5272, "loss_ce": 0.04438125714659691, "loss_lvr": 0.831896185874939, "loss_mode_switch": 0.0, "loss_total": 0.12757088243961334, "step": 1318 }, { "batch_size": 1, "epoch": 0.5272, "step": 1318, "tokens_per_device": 5016 }, { "epoch": 0.5272, "loss_ce": 0.00028329575434327126, "loss_lvr": 1.0588629245758057, "loss_mode_switch": 0.0, "loss_total": 0.10616958886384964, "step": 1318 }, { "batch_size": 4, "epoch": 0.5272, "step": 1318, "tokens_per_device": 7572 }, { "epoch": 0.5272, "loss_ce": 0.2916727066040039, "loss_lvr": 0.8501372337341309, "loss_mode_switch": 0.0, "loss_total": 0.3766864240169525, "step": 1318 }, { "batch_size": 1, "epoch": 0.5272, "step": 1318, "tokens_per_device": 5036 }, { "epoch": 0.5272, "loss_ce": 0.031942617148160934, "loss_lvr": 0.5700129866600037, "loss_mode_switch": 0.0, "loss_total": 0.08894391357898712, "step": 1318 }, { "epoch": 0.5276, "grad_norm": 1.2105075120925903, "learning_rate": 4.796015042945801e-06, "loss": 0.2582, "step": 1319 }, { "batch_size": 4, "epoch": 0.5276, "step": 1319, "tokens_per_device": 7184 }, { "epoch": 0.5276, "loss_ce": 0.49570611119270325, "loss_lvr": 0.7768037915229797, "loss_mode_switch": 0.0, "loss_total": 0.5733864903450012, "step": 1319 }, { "batch_size": 4, "epoch": 0.5276, "step": 1319, "tokens_per_device": 2532 }, { "epoch": 0.5276, "loss_ce": 0.1617279201745987, "loss_lvr": 1.0125949382781982, "loss_mode_switch": 0.0, "loss_total": 0.2629874050617218, "step": 1319 }, { "batch_size": 1, "epoch": 0.5276, "step": 1319, "tokens_per_device": 4916 }, { "epoch": 0.5276, "loss_ce": 0.07191196084022522, "loss_lvr": 0.30506232380867004, "loss_mode_switch": 0.0, "loss_total": 0.1024181917309761, "step": 1319 }, { "batch_size": 4, "epoch": 0.5276, "step": 1319, "tokens_per_device": 5792 }, { "epoch": 0.5276, "loss_ce": 0.30502113699913025, "loss_lvr": 0.8222042322158813, "loss_mode_switch": 0.0, "loss_total": 0.38724157214164734, "step": 1319 }, { "batch_size": 4, "epoch": 0.5276, "step": 1319, "tokens_per_device": 5820 }, { "epoch": 0.5276, "loss_ce": 0.5265090465545654, "loss_lvr": 0.8069211840629578, "loss_mode_switch": 0.0, "loss_total": 0.6072011590003967, "step": 1319 }, { "batch_size": 4, "epoch": 0.5276, "step": 1319, "tokens_per_device": 3844 }, { "epoch": 0.5276, "loss_ce": 0.09170721471309662, "loss_lvr": 0.8118284940719604, "loss_mode_switch": 0.0, "loss_total": 0.1728900671005249, "step": 1319 }, { "batch_size": 1, "epoch": 0.5276, "step": 1319, "tokens_per_device": 5252 }, { "epoch": 0.5276, "loss_ce": 0.2564130425453186, "loss_lvr": 0.5792549252510071, "loss_mode_switch": 0.0, "loss_total": 0.3143385350704193, "step": 1319 }, { "batch_size": 4, "epoch": 0.5276, "step": 1319, "tokens_per_device": 3876 }, { "epoch": 0.5276, "loss_ce": 0.06222354620695114, "loss_lvr": 0.9738048315048218, "loss_mode_switch": 0.0, "loss_total": 0.1596040278673172, "step": 1319 }, { "epoch": 0.528, "grad_norm": 1.2393943071365356, "learning_rate": 4.7895430981182415e-06, "loss": 0.2642, "step": 1320 }, { "batch_size": 4, "epoch": 0.528, "step": 1320, "tokens_per_device": 5756 }, { "epoch": 0.528, "loss_ce": 0.2415803074836731, "loss_lvr": 0.758113443851471, "loss_mode_switch": 0.0, "loss_total": 0.31739166378974915, "step": 1320 }, { "batch_size": 4, "epoch": 0.528, "step": 1320, "tokens_per_device": 4376 }, { "epoch": 0.528, "loss_ce": 0.06280156224966049, "loss_lvr": 0.9878144860267639, "loss_mode_switch": 0.0, "loss_total": 0.16158300638198853, "step": 1320 }, { "batch_size": 1, "epoch": 0.528, "step": 1320, "tokens_per_device": 4900 }, { "epoch": 0.528, "loss_ce": 0.004254867788404226, "loss_lvr": 0.3107944130897522, "loss_mode_switch": 0.0, "loss_total": 0.035334307700395584, "step": 1320 }, { "batch_size": 4, "epoch": 0.528, "step": 1320, "tokens_per_device": 4096 }, { "epoch": 0.528, "loss_ce": 0.35376694798469543, "loss_lvr": 0.9172393679618835, "loss_mode_switch": 0.0, "loss_total": 0.44549089670181274, "step": 1320 }, { "batch_size": 4, "epoch": 0.528, "step": 1320, "tokens_per_device": 4176 }, { "epoch": 0.528, "loss_ce": 0.37242987751960754, "loss_lvr": 0.7623090147972107, "loss_mode_switch": 0.0, "loss_total": 0.44866079092025757, "step": 1320 }, { "batch_size": 1, "epoch": 0.528, "step": 1320, "tokens_per_device": 5058 }, { "epoch": 0.528, "loss_ce": 0.4211714565753937, "loss_lvr": 0.5905174612998962, "loss_mode_switch": 0.0, "loss_total": 0.4802232086658478, "step": 1320 }, { "batch_size": 1, "epoch": 0.528, "step": 1320, "tokens_per_device": 4762 }, { "epoch": 0.528, "loss_ce": 0.033008914440870285, "loss_lvr": 0.39922621846199036, "loss_mode_switch": 0.0, "loss_total": 0.07293153554201126, "step": 1320 }, { "batch_size": 4, "epoch": 0.528, "step": 1320, "tokens_per_device": 4272 }, { "epoch": 0.528, "loss_ce": 0.3876967132091522, "loss_lvr": 0.8780679702758789, "loss_mode_switch": 0.0, "loss_total": 0.47550350427627563, "step": 1320 }, { "epoch": 0.5284, "grad_norm": 1.827856421470642, "learning_rate": 4.78307150650587e-06, "loss": 0.3022, "step": 1321 }, { "batch_size": 1, "epoch": 0.5284, "step": 1321, "tokens_per_device": 5236 }, { "epoch": 0.5284, "loss_ce": 0.03978146240115166, "loss_lvr": 0.29866841435432434, "loss_mode_switch": 0.0, "loss_total": 0.06964830309152603, "step": 1321 }, { "batch_size": 4, "epoch": 0.5284, "step": 1321, "tokens_per_device": 1452 }, { "epoch": 0.5284, "loss_ce": 0.4175362288951874, "loss_lvr": 1.0471739768981934, "loss_mode_switch": 0.0, "loss_total": 0.5222536325454712, "step": 1321 }, { "batch_size": 1, "epoch": 0.5284, "step": 1321, "tokens_per_device": 4780 }, { "epoch": 0.5284, "loss_ce": 0.008353341370821, "loss_lvr": 0.5035185217857361, "loss_mode_switch": 0.0, "loss_total": 0.05870519578456879, "step": 1321 }, { "batch_size": 4, "epoch": 0.5284, "step": 1321, "tokens_per_device": 4628 }, { "epoch": 0.5284, "loss_ce": 0.035499557852745056, "loss_lvr": 0.7776691913604736, "loss_mode_switch": 0.0, "loss_total": 0.1132664754986763, "step": 1321 }, { "batch_size": 1, "epoch": 0.5284, "step": 1321, "tokens_per_device": 4888 }, { "epoch": 0.5284, "loss_ce": 0.00014693517005071044, "loss_lvr": 0.2864326238632202, "loss_mode_switch": 0.0, "loss_total": 0.028790198266506195, "step": 1321 }, { "batch_size": 1, "epoch": 0.5284, "step": 1321, "tokens_per_device": 4932 }, { "epoch": 0.5284, "loss_ce": 0.001893664593808353, "loss_lvr": 0.4052152633666992, "loss_mode_switch": 0.0, "loss_total": 0.04241519421339035, "step": 1321 }, { "batch_size": 4, "epoch": 0.5284, "step": 1321, "tokens_per_device": 6436 }, { "epoch": 0.5284, "loss_ce": 0.0743381679058075, "loss_lvr": 0.7458547949790955, "loss_mode_switch": 0.0, "loss_total": 0.14892365038394928, "step": 1321 }, { "batch_size": 4, "epoch": 0.5284, "step": 1321, "tokens_per_device": 4056 }, { "epoch": 0.5284, "loss_ce": 0.4008340835571289, "loss_lvr": 0.7134017944335938, "loss_mode_switch": 0.0, "loss_total": 0.4721742570400238, "step": 1321 }, { "epoch": 0.5288, "grad_norm": 1.4826056957244873, "learning_rate": 4.776600278970127e-06, "loss": 0.2721, "step": 1322 }, { "batch_size": 1, "epoch": 0.5288, "step": 1322, "tokens_per_device": 4762 }, { "epoch": 0.5288, "loss_ce": 0.02022302895784378, "loss_lvr": 0.3856324553489685, "loss_mode_switch": 0.0, "loss_total": 0.05878627672791481, "step": 1322 }, { "batch_size": 1, "epoch": 0.5288, "step": 1322, "tokens_per_device": 4984 }, { "epoch": 0.5288, "loss_ce": 0.32500219345092773, "loss_lvr": 0.6737932562828064, "loss_mode_switch": 0.0, "loss_total": 0.3923815190792084, "step": 1322 }, { "batch_size": 4, "epoch": 0.5288, "step": 1322, "tokens_per_device": 13272 }, { "epoch": 0.5288, "loss_ce": 0.43369969725608826, "loss_lvr": 0.6749399900436401, "loss_mode_switch": 0.0, "loss_total": 0.5011937022209167, "step": 1322 }, { "batch_size": 4, "epoch": 0.5288, "step": 1322, "tokens_per_device": 4624 }, { "epoch": 0.5288, "loss_ce": 0.04343370348215103, "loss_lvr": 0.84710294008255, "loss_mode_switch": 0.0, "loss_total": 0.12814399600028992, "step": 1322 }, { "batch_size": 1, "epoch": 0.5288, "step": 1322, "tokens_per_device": 4918 }, { "epoch": 0.5288, "loss_ce": 0.23119314014911652, "loss_lvr": 0.4632588326931, "loss_mode_switch": 0.0, "loss_total": 0.27751901745796204, "step": 1322 }, { "batch_size": 4, "epoch": 0.5288, "step": 1322, "tokens_per_device": 1352 }, { "epoch": 0.5288, "loss_ce": 0.4745452404022217, "loss_lvr": 0.986622154712677, "loss_mode_switch": 0.0, "loss_total": 0.573207437992096, "step": 1322 }, { "batch_size": 1, "epoch": 0.5288, "step": 1322, "tokens_per_device": 4744 }, { "epoch": 0.5288, "loss_ce": 0.023391081020236015, "loss_lvr": 0.3472311496734619, "loss_mode_switch": 0.0, "loss_total": 0.058114193379879, "step": 1322 }, { "batch_size": 1, "epoch": 0.5288, "step": 1322, "tokens_per_device": 4883 }, { "epoch": 0.5288, "loss_ce": 0.01169500034302473, "loss_lvr": 0.5593058466911316, "loss_mode_switch": 0.0, "loss_total": 0.06762558221817017, "step": 1322 }, { "epoch": 0.5292, "grad_norm": 1.4456955194473267, "learning_rate": 4.770129426371838e-06, "loss": 0.3143, "step": 1323 }, { "batch_size": 1, "epoch": 0.5292, "step": 1323, "tokens_per_device": 7734 }, { "epoch": 0.5292, "loss_ce": 0.003417219966650009, "loss_lvr": 0.34964659810066223, "loss_mode_switch": 0.0, "loss_total": 0.03838188201189041, "step": 1323 }, { "batch_size": 4, "epoch": 0.5292, "step": 1323, "tokens_per_device": 3788 }, { "epoch": 0.5292, "loss_ce": 0.14108318090438843, "loss_lvr": 1.0358256101608276, "loss_mode_switch": 0.0, "loss_total": 0.2446657419204712, "step": 1323 }, { "batch_size": 4, "epoch": 0.5292, "step": 1323, "tokens_per_device": 10932 }, { "epoch": 0.5292, "loss_ce": 0.32041823863983154, "loss_lvr": 0.9333899021148682, "loss_mode_switch": 0.0, "loss_total": 0.41375723481178284, "step": 1323 }, { "batch_size": 4, "epoch": 0.5292, "step": 1323, "tokens_per_device": 9860 }, { "epoch": 0.5292, "loss_ce": 0.4135754406452179, "loss_lvr": 0.6472213268280029, "loss_mode_switch": 0.0, "loss_total": 0.4782975912094116, "step": 1323 }, { "batch_size": 4, "epoch": 0.5292, "step": 1323, "tokens_per_device": 4568 }, { "epoch": 0.5292, "loss_ce": 0.4998357594013214, "loss_lvr": 0.7730701565742493, "loss_mode_switch": 0.0, "loss_total": 0.5771427750587463, "step": 1323 }, { "batch_size": 1, "epoch": 0.5292, "step": 1323, "tokens_per_device": 4891 }, { "epoch": 0.5292, "loss_ce": 0.0004306787741370499, "loss_lvr": 0.42571601271629333, "loss_mode_switch": 0.0, "loss_total": 0.04300227761268616, "step": 1323 }, { "batch_size": 4, "epoch": 0.5292, "step": 1323, "tokens_per_device": 3804 }, { "epoch": 0.5292, "loss_ce": 0.1312294900417328, "loss_lvr": 0.8271923065185547, "loss_mode_switch": 0.0, "loss_total": 0.21394872665405273, "step": 1323 }, { "batch_size": 4, "epoch": 0.5292, "step": 1323, "tokens_per_device": 6424 }, { "epoch": 0.5292, "loss_ce": 0.44358953833580017, "loss_lvr": 0.8181747794151306, "loss_mode_switch": 0.0, "loss_total": 0.5254070162773132, "step": 1323 }, { "epoch": 0.5296, "grad_norm": 1.3558839559555054, "learning_rate": 4.763658959571199e-06, "loss": 0.3035, "step": 1324 }, { "batch_size": 4, "epoch": 0.5296, "step": 1324, "tokens_per_device": 4064 }, { "epoch": 0.5296, "loss_ce": 0.06959190964698792, "loss_lvr": 0.8797391653060913, "loss_mode_switch": 0.0, "loss_total": 0.15756583213806152, "step": 1324 }, { "batch_size": 4, "epoch": 0.5296, "step": 1324, "tokens_per_device": 3768 }, { "epoch": 0.5296, "loss_ce": 0.6545867919921875, "loss_lvr": 0.7121562957763672, "loss_mode_switch": 0.0, "loss_total": 0.7258024215698242, "step": 1324 }, { "batch_size": 1, "epoch": 0.5296, "step": 1324, "tokens_per_device": 5137 }, { "epoch": 0.5296, "loss_ce": 0.0003716823412105441, "loss_lvr": 0.2386719435453415, "loss_mode_switch": 0.0, "loss_total": 0.02423887699842453, "step": 1324 }, { "batch_size": 4, "epoch": 0.5296, "step": 1324, "tokens_per_device": 4420 }, { "epoch": 0.5296, "loss_ce": 0.02310938760638237, "loss_lvr": 0.7079890370368958, "loss_mode_switch": 0.0, "loss_total": 0.09390829503536224, "step": 1324 }, { "batch_size": 1, "epoch": 0.5296, "step": 1324, "tokens_per_device": 5047 }, { "epoch": 0.5296, "loss_ce": 0.015508508309721947, "loss_lvr": 0.3814956545829773, "loss_mode_switch": 0.0, "loss_total": 0.053658075630664825, "step": 1324 }, { "batch_size": 4, "epoch": 0.5296, "step": 1324, "tokens_per_device": 5056 }, { "epoch": 0.5296, "loss_ce": 0.4677324891090393, "loss_lvr": 0.6696755886077881, "loss_mode_switch": 0.0, "loss_total": 0.5347000360488892, "step": 1324 }, { "batch_size": 1, "epoch": 0.5296, "step": 1324, "tokens_per_device": 6634 }, { "epoch": 0.5296, "loss_ce": 0.11373524367809296, "loss_lvr": 0.4316018223762512, "loss_mode_switch": 0.0, "loss_total": 0.15689542889595032, "step": 1324 }, { "batch_size": 1, "epoch": 0.5296, "step": 1324, "tokens_per_device": 5174 }, { "epoch": 0.5296, "loss_ce": 0.014784849248826504, "loss_lvr": 0.536965548992157, "loss_mode_switch": 0.0, "loss_total": 0.06848140060901642, "step": 1324 }, { "epoch": 0.53, "grad_norm": 1.372248888015747, "learning_rate": 4.757188889427761e-06, "loss": 0.3526, "step": 1325 }, { "batch_size": 4, "epoch": 0.53, "step": 1325, "tokens_per_device": 5912 }, { "epoch": 0.53, "loss_ce": 0.7315364480018616, "loss_lvr": 0.788080632686615, "loss_mode_switch": 0.0, "loss_total": 0.8103445172309875, "step": 1325 }, { "batch_size": 1, "epoch": 0.53, "step": 1325, "tokens_per_device": 4880 }, { "epoch": 0.53, "loss_ce": 0.392366498708725, "loss_lvr": 0.32136061787605286, "loss_mode_switch": 0.0, "loss_total": 0.42450255155563354, "step": 1325 }, { "batch_size": 4, "epoch": 0.53, "step": 1325, "tokens_per_device": 7440 }, { "epoch": 0.53, "loss_ce": 0.2017982453107834, "loss_lvr": 0.8282298445701599, "loss_mode_switch": 0.0, "loss_total": 0.2846212387084961, "step": 1325 }, { "batch_size": 4, "epoch": 0.53, "step": 1325, "tokens_per_device": 12740 }, { "epoch": 0.53, "loss_ce": 0.15636520087718964, "loss_lvr": 0.636971116065979, "loss_mode_switch": 0.0, "loss_total": 0.22006231546401978, "step": 1325 }, { "batch_size": 1, "epoch": 0.53, "step": 1325, "tokens_per_device": 5889 }, { "epoch": 0.53, "loss_ce": 0.005194578319787979, "loss_lvr": 0.5723411440849304, "loss_mode_switch": 0.0, "loss_total": 0.06242869421839714, "step": 1325 }, { "batch_size": 4, "epoch": 0.53, "step": 1325, "tokens_per_device": 4328 }, { "epoch": 0.53, "loss_ce": 0.33861663937568665, "loss_lvr": 0.8412940502166748, "loss_mode_switch": 0.0, "loss_total": 0.42274606227874756, "step": 1325 }, { "batch_size": 4, "epoch": 0.53, "step": 1325, "tokens_per_device": 3768 }, { "epoch": 0.53, "loss_ce": 0.14119002223014832, "loss_lvr": 1.1460365056991577, "loss_mode_switch": 0.0, "loss_total": 0.2557936906814575, "step": 1325 }, { "batch_size": 4, "epoch": 0.53, "step": 1325, "tokens_per_device": 6348 }, { "epoch": 0.53, "loss_ce": 0.042514022439718246, "loss_lvr": 0.9413893818855286, "loss_mode_switch": 0.0, "loss_total": 0.13665296137332916, "step": 1325 }, { "epoch": 0.5304, "grad_norm": 1.2526488304138184, "learning_rate": 4.750719226800404e-06, "loss": 0.304, "step": 1326 }, { "batch_size": 1, "epoch": 0.5304, "step": 1326, "tokens_per_device": 5072 }, { "epoch": 0.5304, "loss_ce": 0.1470428705215454, "loss_lvr": 0.8369481563568115, "loss_mode_switch": 0.0, "loss_total": 0.23073768615722656, "step": 1326 }, { "batch_size": 4, "epoch": 0.5304, "step": 1326, "tokens_per_device": 3792 }, { "epoch": 0.5304, "loss_ce": 0.47403502464294434, "loss_lvr": 1.2837283611297607, "loss_mode_switch": 0.0, "loss_total": 0.6024078726768494, "step": 1326 }, { "batch_size": 4, "epoch": 0.5304, "step": 1326, "tokens_per_device": 6700 }, { "epoch": 0.5304, "loss_ce": 0.07168859243392944, "loss_lvr": 0.6885070204734802, "loss_mode_switch": 0.0, "loss_total": 0.140539288520813, "step": 1326 }, { "batch_size": 1, "epoch": 0.5304, "step": 1326, "tokens_per_device": 4336 }, { "epoch": 0.5304, "loss_ce": 0.22921255230903625, "loss_lvr": 0.4044920802116394, "loss_mode_switch": 0.0, "loss_total": 0.2696617543697357, "step": 1326 }, { "batch_size": 4, "epoch": 0.5304, "step": 1326, "tokens_per_device": 5908 }, { "epoch": 0.5304, "loss_ce": 0.10324505716562271, "loss_lvr": 0.7981462478637695, "loss_mode_switch": 0.0, "loss_total": 0.1830596923828125, "step": 1326 }, { "batch_size": 1, "epoch": 0.5304, "step": 1326, "tokens_per_device": 4753 }, { "epoch": 0.5304, "loss_ce": 0.001626939163543284, "loss_lvr": 0.40056556463241577, "loss_mode_switch": 0.0, "loss_total": 0.04168349504470825, "step": 1326 }, { "batch_size": 4, "epoch": 0.5304, "step": 1326, "tokens_per_device": 4304 }, { "epoch": 0.5304, "loss_ce": 0.45238345861434937, "loss_lvr": 0.9309465289115906, "loss_mode_switch": 0.0, "loss_total": 0.545478105545044, "step": 1326 }, { "batch_size": 1, "epoch": 0.5304, "step": 1326, "tokens_per_device": 4878 }, { "epoch": 0.5304, "loss_ce": 0.002044122200459242, "loss_lvr": 1.1570637226104736, "loss_mode_switch": 0.0, "loss_total": 0.11775049567222595, "step": 1326 }, { "epoch": 0.5308, "grad_norm": 1.2857075929641724, "learning_rate": 4.744249982547332e-06, "loss": 0.3214, "step": 1327 }, { "batch_size": 4, "epoch": 0.5308, "step": 1327, "tokens_per_device": 1504 }, { "epoch": 0.5308, "loss_ce": 0.3340534567832947, "loss_lvr": 1.264197587966919, "loss_mode_switch": 0.0, "loss_total": 0.4604732394218445, "step": 1327 }, { "batch_size": 1, "epoch": 0.5308, "step": 1327, "tokens_per_device": 6275 }, { "epoch": 0.5308, "loss_ce": 0.07881100475788116, "loss_lvr": 0.39050528407096863, "loss_mode_switch": 0.0, "loss_total": 0.1178615391254425, "step": 1327 }, { "batch_size": 1, "epoch": 0.5308, "step": 1327, "tokens_per_device": 5330 }, { "epoch": 0.5308, "loss_ce": 0.2573661804199219, "loss_lvr": 0.405188649892807, "loss_mode_switch": 0.0, "loss_total": 0.29788506031036377, "step": 1327 }, { "batch_size": 4, "epoch": 0.5308, "step": 1327, "tokens_per_device": 7660 }, { "epoch": 0.5308, "loss_ce": 0.5808964371681213, "loss_lvr": 0.6431549787521362, "loss_mode_switch": 0.0, "loss_total": 0.645211935043335, "step": 1327 }, { "batch_size": 1, "epoch": 0.5308, "step": 1327, "tokens_per_device": 5234 }, { "epoch": 0.5308, "loss_ce": 0.059406865388154984, "loss_lvr": 0.3616175055503845, "loss_mode_switch": 0.0, "loss_total": 0.09556861221790314, "step": 1327 }, { "batch_size": 4, "epoch": 0.5308, "step": 1327, "tokens_per_device": 3944 }, { "epoch": 0.5308, "loss_ce": 0.23781314492225647, "loss_lvr": 0.8788028955459595, "loss_mode_switch": 0.0, "loss_total": 0.32569342851638794, "step": 1327 }, { "batch_size": 1, "epoch": 0.5308, "step": 1327, "tokens_per_device": 5094 }, { "epoch": 0.5308, "loss_ce": 0.6340668201446533, "loss_lvr": 0.11822488903999329, "loss_mode_switch": 0.0, "loss_total": 0.6458892822265625, "step": 1327 }, { "batch_size": 4, "epoch": 0.5308, "step": 1327, "tokens_per_device": 2636 }, { "epoch": 0.5308, "loss_ce": 0.42016494274139404, "loss_lvr": 0.7254367470741272, "loss_mode_switch": 0.0, "loss_total": 0.49270862340927124, "step": 1327 }, { "epoch": 0.5312, "grad_norm": 1.7084659337997437, "learning_rate": 4.737781167526043e-06, "loss": 0.3198, "step": 1328 }, { "batch_size": 1, "epoch": 0.5312, "step": 1328, "tokens_per_device": 5188 }, { "epoch": 0.5312, "loss_ce": 0.03768927976489067, "loss_lvr": 0.3605194687843323, "loss_mode_switch": 0.0, "loss_total": 0.07374122738838196, "step": 1328 }, { "batch_size": 4, "epoch": 0.5312, "step": 1328, "tokens_per_device": 2628 }, { "epoch": 0.5312, "loss_ce": 0.33453378081321716, "loss_lvr": 0.8784239888191223, "loss_mode_switch": 0.0, "loss_total": 0.42237618565559387, "step": 1328 }, { "batch_size": 1, "epoch": 0.5312, "step": 1328, "tokens_per_device": 6443 }, { "epoch": 0.5312, "loss_ce": 0.0008108980255201459, "loss_lvr": 0.3915662467479706, "loss_mode_switch": 0.0, "loss_total": 0.039967525750398636, "step": 1328 }, { "batch_size": 4, "epoch": 0.5312, "step": 1328, "tokens_per_device": 15616 }, { "epoch": 0.5312, "loss_ce": 0.20714759826660156, "loss_lvr": 0.6809951066970825, "loss_mode_switch": 0.0, "loss_total": 0.27524709701538086, "step": 1328 }, { "batch_size": 1, "epoch": 0.5312, "step": 1328, "tokens_per_device": 5120 }, { "epoch": 0.5312, "loss_ce": 0.15361252427101135, "loss_lvr": 0.2747303545475006, "loss_mode_switch": 0.0, "loss_total": 0.18108555674552917, "step": 1328 }, { "batch_size": 4, "epoch": 0.5312, "step": 1328, "tokens_per_device": 2652 }, { "epoch": 0.5312, "loss_ce": 0.6648973822593689, "loss_lvr": 0.8238488435745239, "loss_mode_switch": 0.0, "loss_total": 0.7472822666168213, "step": 1328 }, { "batch_size": 4, "epoch": 0.5312, "step": 1328, "tokens_per_device": 4884 }, { "epoch": 0.5312, "loss_ce": 0.4603317081928253, "loss_lvr": 1.2397804260253906, "loss_mode_switch": 0.0, "loss_total": 0.5843097567558289, "step": 1328 }, { "batch_size": 1, "epoch": 0.5312, "step": 1328, "tokens_per_device": 6962 }, { "epoch": 0.5312, "loss_ce": 0.059835728257894516, "loss_lvr": 0.284902423620224, "loss_mode_switch": 0.0, "loss_total": 0.08832596987485886, "step": 1328 }, { "epoch": 0.5316, "grad_norm": 1.5306944847106934, "learning_rate": 4.731312792593311e-06, "loss": 0.3525, "step": 1329 }, { "batch_size": 4, "epoch": 0.5316, "step": 1329, "tokens_per_device": 5844 }, { "epoch": 0.5316, "loss_ce": 0.13306987285614014, "loss_lvr": 0.7328492403030396, "loss_mode_switch": 0.0, "loss_total": 0.2063547968864441, "step": 1329 }, { "batch_size": 1, "epoch": 0.5316, "step": 1329, "tokens_per_device": 4917 }, { "epoch": 0.5316, "loss_ce": 0.06721509248018265, "loss_lvr": 0.2780066132545471, "loss_mode_switch": 0.0, "loss_total": 0.0950157567858696, "step": 1329 }, { "batch_size": 4, "epoch": 0.5316, "step": 1329, "tokens_per_device": 9080 }, { "epoch": 0.5316, "loss_ce": 0.12257631868124008, "loss_lvr": 0.7398399710655212, "loss_mode_switch": 0.0, "loss_total": 0.1965603232383728, "step": 1329 }, { "batch_size": 4, "epoch": 0.5316, "step": 1329, "tokens_per_device": 5388 }, { "epoch": 0.5316, "loss_ce": 0.1340024620294571, "loss_lvr": 0.6730726957321167, "loss_mode_switch": 0.0, "loss_total": 0.20130974054336548, "step": 1329 }, { "batch_size": 4, "epoch": 0.5316, "step": 1329, "tokens_per_device": 8864 }, { "epoch": 0.5316, "loss_ce": 0.39725977182388306, "loss_lvr": 0.8886337876319885, "loss_mode_switch": 0.0, "loss_total": 0.48612314462661743, "step": 1329 }, { "batch_size": 4, "epoch": 0.5316, "step": 1329, "tokens_per_device": 4052 }, { "epoch": 0.5316, "loss_ce": 0.6116935610771179, "loss_lvr": 0.9651064872741699, "loss_mode_switch": 0.0, "loss_total": 0.7082042098045349, "step": 1329 }, { "batch_size": 1, "epoch": 0.5316, "step": 1329, "tokens_per_device": 4999 }, { "epoch": 0.5316, "loss_ce": 0.00572938472032547, "loss_lvr": 0.43081235885620117, "loss_mode_switch": 0.0, "loss_total": 0.04881061986088753, "step": 1329 }, { "batch_size": 4, "epoch": 0.5316, "step": 1329, "tokens_per_device": 1736 }, { "epoch": 0.5316, "loss_ce": 0.1427692174911499, "loss_lvr": 0.8979921340942383, "loss_mode_switch": 0.0, "loss_total": 0.23256844282150269, "step": 1329 }, { "epoch": 0.532, "grad_norm": 1.270910382270813, "learning_rate": 4.724844868605176e-06, "loss": 0.2587, "step": 1330 }, { "batch_size": 4, "epoch": 0.532, "step": 1330, "tokens_per_device": 2716 }, { "epoch": 0.532, "loss_ce": 0.49014371633529663, "loss_lvr": 0.9233912825584412, "loss_mode_switch": 0.0, "loss_total": 0.5824828147888184, "step": 1330 }, { "batch_size": 4, "epoch": 0.532, "step": 1330, "tokens_per_device": 4316 }, { "epoch": 0.532, "loss_ce": 0.34928613901138306, "loss_lvr": 0.8022623658180237, "loss_mode_switch": 0.0, "loss_total": 0.4295123815536499, "step": 1330 }, { "batch_size": 1, "epoch": 0.532, "step": 1330, "tokens_per_device": 5110 }, { "epoch": 0.532, "loss_ce": 0.0019647260196506977, "loss_lvr": 0.3253714442253113, "loss_mode_switch": 0.0, "loss_total": 0.03450186923146248, "step": 1330 }, { "batch_size": 1, "epoch": 0.532, "step": 1330, "tokens_per_device": 6440 }, { "epoch": 0.532, "loss_ce": 0.08522748202085495, "loss_lvr": 0.5272806882858276, "loss_mode_switch": 0.0, "loss_total": 0.13795554637908936, "step": 1330 }, { "batch_size": 1, "epoch": 0.532, "step": 1330, "tokens_per_device": 4868 }, { "epoch": 0.532, "loss_ce": 0.008363600820302963, "loss_lvr": 0.26136958599090576, "loss_mode_switch": 0.0, "loss_total": 0.03450056165456772, "step": 1330 }, { "batch_size": 4, "epoch": 0.532, "step": 1330, "tokens_per_device": 3764 }, { "epoch": 0.532, "loss_ce": 0.25819042325019836, "loss_lvr": 1.0510385036468506, "loss_mode_switch": 0.0, "loss_total": 0.3632942736148834, "step": 1330 }, { "batch_size": 4, "epoch": 0.532, "step": 1330, "tokens_per_device": 4212 }, { "epoch": 0.532, "loss_ce": 0.24928463995456696, "loss_lvr": 1.0399481058120728, "loss_mode_switch": 0.0, "loss_total": 0.3532794415950775, "step": 1330 }, { "batch_size": 4, "epoch": 0.532, "step": 1330, "tokens_per_device": 4480 }, { "epoch": 0.532, "loss_ce": 0.4691806733608246, "loss_lvr": 0.7698538899421692, "loss_mode_switch": 0.0, "loss_total": 0.5461660623550415, "step": 1330 }, { "epoch": 0.5324, "grad_norm": 1.3823378086090088, "learning_rate": 4.7183774064169215e-06, "loss": 0.2856, "step": 1331 }, { "batch_size": 4, "epoch": 0.5324, "step": 1331, "tokens_per_device": 4624 }, { "epoch": 0.5324, "loss_ce": 0.2546220123767853, "loss_lvr": 0.8183789253234863, "loss_mode_switch": 0.0, "loss_total": 0.3364599049091339, "step": 1331 }, { "batch_size": 4, "epoch": 0.5324, "step": 1331, "tokens_per_device": 1456 }, { "epoch": 0.5324, "loss_ce": 0.4036184251308441, "loss_lvr": 1.0600411891937256, "loss_mode_switch": 0.0, "loss_total": 0.5096225738525391, "step": 1331 }, { "batch_size": 4, "epoch": 0.5324, "step": 1331, "tokens_per_device": 5708 }, { "epoch": 0.5324, "loss_ce": 0.07525388151407242, "loss_lvr": 0.9015710949897766, "loss_mode_switch": 0.0, "loss_total": 0.16541099548339844, "step": 1331 }, { "batch_size": 1, "epoch": 0.5324, "step": 1331, "tokens_per_device": 4884 }, { "epoch": 0.5324, "loss_ce": 0.00042756536277011037, "loss_lvr": 0.2447451949119568, "loss_mode_switch": 0.0, "loss_total": 0.024902084842324257, "step": 1331 }, { "batch_size": 4, "epoch": 0.5324, "step": 1331, "tokens_per_device": 6812 }, { "epoch": 0.5324, "loss_ce": 0.28885048627853394, "loss_lvr": 1.0026836395263672, "loss_mode_switch": 0.0, "loss_total": 0.38911885023117065, "step": 1331 }, { "batch_size": 1, "epoch": 0.5324, "step": 1331, "tokens_per_device": 5137 }, { "epoch": 0.5324, "loss_ce": 0.05440644174814224, "loss_lvr": 0.4851973056793213, "loss_mode_switch": 0.0, "loss_total": 0.10292617231607437, "step": 1331 }, { "batch_size": 1, "epoch": 0.5324, "step": 1331, "tokens_per_device": 5100 }, { "epoch": 0.5324, "loss_ce": 0.03825252130627632, "loss_lvr": 0.3345050811767578, "loss_mode_switch": 0.0, "loss_total": 0.07170303165912628, "step": 1331 }, { "batch_size": 4, "epoch": 0.5324, "step": 1331, "tokens_per_device": 3860 }, { "epoch": 0.5324, "loss_ce": 0.3138989210128784, "loss_lvr": 0.996636152267456, "loss_mode_switch": 0.0, "loss_total": 0.413562536239624, "step": 1331 }, { "epoch": 0.5328, "grad_norm": 1.5130094289779663, "learning_rate": 4.711910416883054e-06, "loss": 0.3058, "step": 1332 }, { "batch_size": 4, "epoch": 0.5328, "step": 1332, "tokens_per_device": 6972 }, { "epoch": 0.5328, "loss_ce": 0.5717752575874329, "loss_lvr": 0.7290191054344177, "loss_mode_switch": 0.0, "loss_total": 0.6446771621704102, "step": 1332 }, { "batch_size": 4, "epoch": 0.5328, "step": 1332, "tokens_per_device": 4528 }, { "epoch": 0.5328, "loss_ce": 0.11765839159488678, "loss_lvr": 0.6267692446708679, "loss_mode_switch": 0.0, "loss_total": 0.18033531308174133, "step": 1332 }, { "batch_size": 4, "epoch": 0.5328, "step": 1332, "tokens_per_device": 2836 }, { "epoch": 0.5328, "loss_ce": 0.40041685104370117, "loss_lvr": 0.6399022340774536, "loss_mode_switch": 0.0, "loss_total": 0.4644070863723755, "step": 1332 }, { "batch_size": 1, "epoch": 0.5328, "step": 1332, "tokens_per_device": 5174 }, { "epoch": 0.5328, "loss_ce": 0.28633707761764526, "loss_lvr": 0.3766467273235321, "loss_mode_switch": 0.0, "loss_total": 0.3240017592906952, "step": 1332 }, { "batch_size": 4, "epoch": 0.5328, "step": 1332, "tokens_per_device": 6944 }, { "epoch": 0.5328, "loss_ce": 0.1949252337217331, "loss_lvr": 0.8913867473602295, "loss_mode_switch": 0.0, "loss_total": 0.2840639054775238, "step": 1332 }, { "batch_size": 1, "epoch": 0.5328, "step": 1332, "tokens_per_device": 5159 }, { "epoch": 0.5328, "loss_ce": 0.005537760443985462, "loss_lvr": 0.31840354204177856, "loss_mode_switch": 0.0, "loss_total": 0.037378113716840744, "step": 1332 }, { "batch_size": 4, "epoch": 0.5328, "step": 1332, "tokens_per_device": 3360 }, { "epoch": 0.5328, "loss_ce": 0.15662987530231476, "loss_lvr": 0.9420480728149414, "loss_mode_switch": 0.0, "loss_total": 0.2508346736431122, "step": 1332 }, { "batch_size": 1, "epoch": 0.5328, "step": 1332, "tokens_per_device": 4797 }, { "epoch": 0.5328, "loss_ce": 0.0006442859885282815, "loss_lvr": 0.2271582931280136, "loss_mode_switch": 0.0, "loss_total": 0.02336011454463005, "step": 1332 }, { "epoch": 0.5332, "grad_norm": 1.2809008359909058, "learning_rate": 4.7054439108572856e-06, "loss": 0.2997, "step": 1333 }, { "batch_size": 1, "epoch": 0.5332, "step": 1333, "tokens_per_device": 4909 }, { "epoch": 0.5332, "loss_ce": 0.007924679666757584, "loss_lvr": 0.47564804553985596, "loss_mode_switch": 0.0, "loss_total": 0.05548948422074318, "step": 1333 }, { "batch_size": 1, "epoch": 0.5332, "step": 1333, "tokens_per_device": 4879 }, { "epoch": 0.5332, "loss_ce": 0.0005263460916467011, "loss_lvr": 0.9776304364204407, "loss_mode_switch": 0.0, "loss_total": 0.09828939288854599, "step": 1333 }, { "batch_size": 1, "epoch": 0.5332, "step": 1333, "tokens_per_device": 4892 }, { "epoch": 0.5332, "loss_ce": 0.005628631915897131, "loss_lvr": 0.6090298891067505, "loss_mode_switch": 0.0, "loss_total": 0.06653162091970444, "step": 1333 }, { "batch_size": 4, "epoch": 0.5332, "step": 1333, "tokens_per_device": 9544 }, { "epoch": 0.5332, "loss_ce": 0.017581939697265625, "loss_lvr": 0.7742282152175903, "loss_mode_switch": 0.0, "loss_total": 0.09500475972890854, "step": 1333 }, { "batch_size": 1, "epoch": 0.5332, "step": 1333, "tokens_per_device": 4878 }, { "epoch": 0.5332, "loss_ce": 0.0038934911135584116, "loss_lvr": 0.705015242099762, "loss_mode_switch": 0.0, "loss_total": 0.07439502328634262, "step": 1333 }, { "batch_size": 4, "epoch": 0.5332, "step": 1333, "tokens_per_device": 5196 }, { "epoch": 0.5332, "loss_ce": 0.2830421030521393, "loss_lvr": 0.6661754846572876, "loss_mode_switch": 0.0, "loss_total": 0.34965965151786804, "step": 1333 }, { "batch_size": 4, "epoch": 0.5332, "step": 1333, "tokens_per_device": 4680 }, { "epoch": 0.5332, "loss_ce": 0.605460524559021, "loss_lvr": 0.7218253016471863, "loss_mode_switch": 0.0, "loss_total": 0.6776430606842041, "step": 1333 }, { "batch_size": 1, "epoch": 0.5332, "step": 1333, "tokens_per_device": 4872 }, { "epoch": 0.5332, "loss_ce": 0.0005323392688296735, "loss_lvr": 0.4987446367740631, "loss_mode_switch": 0.0, "loss_total": 0.0504068061709404, "step": 1333 }, { "epoch": 0.5336, "grad_norm": 1.2668092250823975, "learning_rate": 4.69897789919252e-06, "loss": 0.3119, "step": 1334 }, { "batch_size": 4, "epoch": 0.5336, "step": 1334, "tokens_per_device": 16080 }, { "epoch": 0.5336, "loss_ce": 0.2698687016963959, "loss_lvr": 0.4398377239704132, "loss_mode_switch": 0.0, "loss_total": 0.3138524889945984, "step": 1334 }, { "batch_size": 1, "epoch": 0.5336, "step": 1334, "tokens_per_device": 4882 }, { "epoch": 0.5336, "loss_ce": 0.005552432965487242, "loss_lvr": 0.20818595588207245, "loss_mode_switch": 0.0, "loss_total": 0.026371030136942863, "step": 1334 }, { "batch_size": 1, "epoch": 0.5336, "step": 1334, "tokens_per_device": 5258 }, { "epoch": 0.5336, "loss_ce": 0.08087529987096786, "loss_lvr": 0.6412897706031799, "loss_mode_switch": 0.0, "loss_total": 0.1450042724609375, "step": 1334 }, { "batch_size": 4, "epoch": 0.5336, "step": 1334, "tokens_per_device": 4240 }, { "epoch": 0.5336, "loss_ce": 0.43712836503982544, "loss_lvr": 0.7007596492767334, "loss_mode_switch": 0.0, "loss_total": 0.5072043538093567, "step": 1334 }, { "batch_size": 4, "epoch": 0.5336, "step": 1334, "tokens_per_device": 1292 }, { "epoch": 0.5336, "loss_ce": 0.5433076024055481, "loss_lvr": 1.1244001388549805, "loss_mode_switch": 0.0, "loss_total": 0.6557475924491882, "step": 1334 }, { "batch_size": 4, "epoch": 0.5336, "step": 1334, "tokens_per_device": 4440 }, { "epoch": 0.5336, "loss_ce": 0.030429145321249962, "loss_lvr": 0.8514966368675232, "loss_mode_switch": 0.0, "loss_total": 0.11557881534099579, "step": 1334 }, { "batch_size": 4, "epoch": 0.5336, "step": 1334, "tokens_per_device": 1396 }, { "epoch": 0.5336, "loss_ce": 0.5780414342880249, "loss_lvr": 0.9632517099380493, "loss_mode_switch": 0.0, "loss_total": 0.6743665933609009, "step": 1334 }, { "batch_size": 4, "epoch": 0.5336, "step": 1334, "tokens_per_device": 3980 }, { "epoch": 0.5336, "loss_ce": 0.36175984144210815, "loss_lvr": 1.0851675271987915, "loss_mode_switch": 0.0, "loss_total": 0.4702765941619873, "step": 1334 }, { "epoch": 0.534, "grad_norm": 1.3475035429000854, "learning_rate": 4.6925123927408265e-06, "loss": 0.3032, "step": 1335 }, { "batch_size": 1, "epoch": 0.534, "step": 1335, "tokens_per_device": 4895 }, { "epoch": 0.534, "loss_ce": 0.038567397743463516, "loss_lvr": 0.603705883026123, "loss_mode_switch": 0.0, "loss_total": 0.09893798828125, "step": 1335 }, { "batch_size": 4, "epoch": 0.534, "step": 1335, "tokens_per_device": 4552 }, { "epoch": 0.534, "loss_ce": 0.23920737206935883, "loss_lvr": 0.5725582242012024, "loss_mode_switch": 0.0, "loss_total": 0.2964631915092468, "step": 1335 }, { "batch_size": 4, "epoch": 0.534, "step": 1335, "tokens_per_device": 4400 }, { "epoch": 0.534, "loss_ce": 0.2388058453798294, "loss_lvr": 0.845707893371582, "loss_mode_switch": 0.0, "loss_total": 0.3233766257762909, "step": 1335 }, { "batch_size": 1, "epoch": 0.534, "step": 1335, "tokens_per_device": 5069 }, { "epoch": 0.534, "loss_ce": 0.021812185645103455, "loss_lvr": 0.3736124038696289, "loss_mode_switch": 0.0, "loss_total": 0.059173427522182465, "step": 1335 }, { "batch_size": 4, "epoch": 0.534, "step": 1335, "tokens_per_device": 8876 }, { "epoch": 0.534, "loss_ce": 0.23847228288650513, "loss_lvr": 0.768752932548523, "loss_mode_switch": 0.0, "loss_total": 0.3153475821018219, "step": 1335 }, { "batch_size": 4, "epoch": 0.534, "step": 1335, "tokens_per_device": 1460 }, { "epoch": 0.534, "loss_ce": 0.6189773678779602, "loss_lvr": 0.7489508390426636, "loss_mode_switch": 0.0, "loss_total": 0.6938724517822266, "step": 1335 }, { "batch_size": 4, "epoch": 0.534, "step": 1335, "tokens_per_device": 5112 }, { "epoch": 0.534, "loss_ce": 0.35717520117759705, "loss_lvr": 0.7570071816444397, "loss_mode_switch": 0.0, "loss_total": 0.43287593126296997, "step": 1335 }, { "batch_size": 1, "epoch": 0.534, "step": 1335, "tokens_per_device": 6113 }, { "epoch": 0.534, "loss_ce": 0.22790104150772095, "loss_lvr": 0.33219829201698303, "loss_mode_switch": 0.0, "loss_total": 0.26112085580825806, "step": 1335 }, { "epoch": 0.5344, "grad_norm": 1.3623442649841309, "learning_rate": 4.686047402353433e-06, "loss": 0.2824, "step": 1336 }, { "batch_size": 1, "epoch": 0.5344, "step": 1336, "tokens_per_device": 4528 }, { "epoch": 0.5344, "loss_ce": 0.009214455261826515, "loss_lvr": 0.44842496514320374, "loss_mode_switch": 0.0, "loss_total": 0.05405694991350174, "step": 1336 }, { "batch_size": 4, "epoch": 0.5344, "step": 1336, "tokens_per_device": 9308 }, { "epoch": 0.5344, "loss_ce": 0.4008304476737976, "loss_lvr": 0.7901304364204407, "loss_mode_switch": 0.0, "loss_total": 0.47984349727630615, "step": 1336 }, { "batch_size": 4, "epoch": 0.5344, "step": 1336, "tokens_per_device": 1276 }, { "epoch": 0.5344, "loss_ce": 0.4194537103176117, "loss_lvr": 1.0361709594726562, "loss_mode_switch": 0.0, "loss_total": 0.5230708122253418, "step": 1336 }, { "batch_size": 1, "epoch": 0.5344, "step": 1336, "tokens_per_device": 4896 }, { "epoch": 0.5344, "loss_ce": 0.007877581752836704, "loss_lvr": 0.28333553671836853, "loss_mode_switch": 0.0, "loss_total": 0.03621113672852516, "step": 1336 }, { "batch_size": 4, "epoch": 0.5344, "step": 1336, "tokens_per_device": 3784 }, { "epoch": 0.5344, "loss_ce": 0.4514624774456024, "loss_lvr": 1.0459064245224, "loss_mode_switch": 0.0, "loss_total": 0.556053102016449, "step": 1336 }, { "batch_size": 1, "epoch": 0.5344, "step": 1336, "tokens_per_device": 5133 }, { "epoch": 0.5344, "loss_ce": 0.07057101279497147, "loss_lvr": 0.2462339997291565, "loss_mode_switch": 0.0, "loss_total": 0.09519441425800323, "step": 1336 }, { "batch_size": 4, "epoch": 0.5344, "step": 1336, "tokens_per_device": 3944 }, { "epoch": 0.5344, "loss_ce": 0.0079201590269804, "loss_lvr": 0.8348609209060669, "loss_mode_switch": 0.0, "loss_total": 0.09140625596046448, "step": 1336 }, { "batch_size": 4, "epoch": 0.5344, "step": 1336, "tokens_per_device": 5876 }, { "epoch": 0.5344, "loss_ce": 0.1747778356075287, "loss_lvr": 0.8670109510421753, "loss_mode_switch": 0.0, "loss_total": 0.2614789307117462, "step": 1336 }, { "epoch": 0.5348, "grad_norm": 1.4657763242721558, "learning_rate": 4.679582938880698e-06, "loss": 0.3026, "step": 1337 }, { "batch_size": 1, "epoch": 0.5348, "step": 1337, "tokens_per_device": 4855 }, { "epoch": 0.5348, "loss_ce": 0.008480570279061794, "loss_lvr": 0.45007479190826416, "loss_mode_switch": 0.0, "loss_total": 0.053488049656152725, "step": 1337 }, { "batch_size": 1, "epoch": 0.5348, "step": 1337, "tokens_per_device": 4897 }, { "epoch": 0.5348, "loss_ce": 0.0020099736284464598, "loss_lvr": 0.6116971969604492, "loss_mode_switch": 0.0, "loss_total": 0.06317969411611557, "step": 1337 }, { "batch_size": 4, "epoch": 0.5348, "step": 1337, "tokens_per_device": 3836 }, { "epoch": 0.5348, "loss_ce": 0.3750177323818207, "loss_lvr": 1.0926454067230225, "loss_mode_switch": 0.0, "loss_total": 0.4842822849750519, "step": 1337 }, { "batch_size": 4, "epoch": 0.5348, "step": 1337, "tokens_per_device": 2536 }, { "epoch": 0.5348, "loss_ce": 0.021598514169454575, "loss_lvr": 0.8367799520492554, "loss_mode_switch": 0.0, "loss_total": 0.10527651011943817, "step": 1337 }, { "batch_size": 1, "epoch": 0.5348, "step": 1337, "tokens_per_device": 5134 }, { "epoch": 0.5348, "loss_ce": 0.008440128527581692, "loss_lvr": 0.3496495485305786, "loss_mode_switch": 0.0, "loss_total": 0.04340508580207825, "step": 1337 }, { "batch_size": 4, "epoch": 0.5348, "step": 1337, "tokens_per_device": 6400 }, { "epoch": 0.5348, "loss_ce": 0.3641751706600189, "loss_lvr": 1.1185466051101685, "loss_mode_switch": 0.0, "loss_total": 0.4760298430919647, "step": 1337 }, { "batch_size": 4, "epoch": 0.5348, "step": 1337, "tokens_per_device": 3788 }, { "epoch": 0.5348, "loss_ce": 0.052113912999629974, "loss_lvr": 0.9255543351173401, "loss_mode_switch": 0.0, "loss_total": 0.14466935396194458, "step": 1337 }, { "batch_size": 4, "epoch": 0.5348, "step": 1337, "tokens_per_device": 1480 }, { "epoch": 0.5348, "loss_ce": 0.13959653675556183, "loss_lvr": 1.0084309577941895, "loss_mode_switch": 0.0, "loss_total": 0.24043962359428406, "step": 1337 }, { "epoch": 0.5352, "grad_norm": 1.1581228971481323, "learning_rate": 4.673119013172093e-06, "loss": 0.2976, "step": 1338 }, { "batch_size": 4, "epoch": 0.5352, "step": 1338, "tokens_per_device": 5592 }, { "epoch": 0.5352, "loss_ce": 0.41882404685020447, "loss_lvr": 0.4755917191505432, "loss_mode_switch": 0.0, "loss_total": 0.4663832187652588, "step": 1338 }, { "batch_size": 4, "epoch": 0.5352, "step": 1338, "tokens_per_device": 5176 }, { "epoch": 0.5352, "loss_ce": 0.3395925760269165, "loss_lvr": 0.6830294728279114, "loss_mode_switch": 0.0, "loss_total": 0.4078955352306366, "step": 1338 }, { "batch_size": 4, "epoch": 0.5352, "step": 1338, "tokens_per_device": 4784 }, { "epoch": 0.5352, "loss_ce": 0.09350588172674179, "loss_lvr": 0.5334570407867432, "loss_mode_switch": 0.0, "loss_total": 0.1468515843153, "step": 1338 }, { "batch_size": 1, "epoch": 0.5352, "step": 1338, "tokens_per_device": 4911 }, { "epoch": 0.5352, "loss_ce": 0.018811875954270363, "loss_lvr": 0.8991699814796448, "loss_mode_switch": 0.0, "loss_total": 0.10872887074947357, "step": 1338 }, { "batch_size": 4, "epoch": 0.5352, "step": 1338, "tokens_per_device": 4728 }, { "epoch": 0.5352, "loss_ce": 0.22684554755687714, "loss_lvr": 0.7041946649551392, "loss_mode_switch": 0.0, "loss_total": 0.29726502299308777, "step": 1338 }, { "batch_size": 4, "epoch": 0.5352, "step": 1338, "tokens_per_device": 4348 }, { "epoch": 0.5352, "loss_ce": 0.28049153089523315, "loss_lvr": 0.9413889646530151, "loss_mode_switch": 0.0, "loss_total": 0.3746304214000702, "step": 1338 }, { "batch_size": 4, "epoch": 0.5352, "step": 1338, "tokens_per_device": 15196 }, { "epoch": 0.5352, "loss_ce": 0.3744397461414337, "loss_lvr": 0.9043317437171936, "loss_mode_switch": 0.0, "loss_total": 0.46487292647361755, "step": 1338 }, { "batch_size": 1, "epoch": 0.5352, "step": 1338, "tokens_per_device": 5100 }, { "epoch": 0.5352, "loss_ce": 0.05369158089160919, "loss_lvr": 0.2401401251554489, "loss_mode_switch": 0.0, "loss_total": 0.07770559191703796, "step": 1338 }, { "epoch": 0.5356, "grad_norm": 1.354832649230957, "learning_rate": 4.6666556360761925e-06, "loss": 0.3153, "step": 1339 }, { "batch_size": 1, "epoch": 0.5356, "step": 1339, "tokens_per_device": 5135 }, { "epoch": 0.5356, "loss_ce": 0.0012110084062442183, "loss_lvr": 0.9261845350265503, "loss_mode_switch": 0.0, "loss_total": 0.09382946789264679, "step": 1339 }, { "batch_size": 1, "epoch": 0.5356, "step": 1339, "tokens_per_device": 5066 }, { "epoch": 0.5356, "loss_ce": 0.0018139297608286142, "loss_lvr": 0.2700464725494385, "loss_mode_switch": 0.0, "loss_total": 0.028818577527999878, "step": 1339 }, { "batch_size": 1, "epoch": 0.5356, "step": 1339, "tokens_per_device": 5169 }, { "epoch": 0.5356, "loss_ce": 0.0006512948893941939, "loss_lvr": 0.7649251818656921, "loss_mode_switch": 0.0, "loss_total": 0.07714381068944931, "step": 1339 }, { "batch_size": 1, "epoch": 0.5356, "step": 1339, "tokens_per_device": 4874 }, { "epoch": 0.5356, "loss_ce": 0.16862590610980988, "loss_lvr": 0.2193790227174759, "loss_mode_switch": 0.0, "loss_total": 0.19056381285190582, "step": 1339 }, { "batch_size": 4, "epoch": 0.5356, "step": 1339, "tokens_per_device": 2748 }, { "epoch": 0.5356, "loss_ce": 0.1456032395362854, "loss_lvr": 0.940305769443512, "loss_mode_switch": 0.0, "loss_total": 0.23963382840156555, "step": 1339 }, { "batch_size": 1, "epoch": 0.5356, "step": 1339, "tokens_per_device": 4873 }, { "epoch": 0.5356, "loss_ce": 0.010380305349826813, "loss_lvr": 0.1680402010679245, "loss_mode_switch": 0.0, "loss_total": 0.027184326201677322, "step": 1339 }, { "batch_size": 4, "epoch": 0.5356, "step": 1339, "tokens_per_device": 1476 }, { "epoch": 0.5356, "loss_ce": 0.4277944266796112, "loss_lvr": 0.8793131113052368, "loss_mode_switch": 0.0, "loss_total": 0.5157257318496704, "step": 1339 }, { "batch_size": 4, "epoch": 0.5356, "step": 1339, "tokens_per_device": 4180 }, { "epoch": 0.5356, "loss_ce": 0.4206309914588928, "loss_lvr": 0.9422574043273926, "loss_mode_switch": 0.0, "loss_total": 0.51485675573349, "step": 1339 }, { "epoch": 0.536, "grad_norm": 1.6036709547042847, "learning_rate": 4.660192818440642e-06, "loss": 0.3124, "step": 1340 }, { "batch_size": 4, "epoch": 0.536, "step": 1340, "tokens_per_device": 6308 }, { "epoch": 0.536, "loss_ce": 0.24986079335212708, "loss_lvr": 1.0029140710830688, "loss_mode_switch": 0.0, "loss_total": 0.3501521944999695, "step": 1340 }, { "batch_size": 4, "epoch": 0.536, "step": 1340, "tokens_per_device": 5768 }, { "epoch": 0.536, "loss_ce": 0.0016501976642757654, "loss_lvr": 1.365998387336731, "loss_mode_switch": 0.0, "loss_total": 0.13825003802776337, "step": 1340 }, { "batch_size": 4, "epoch": 0.536, "step": 1340, "tokens_per_device": 10728 }, { "epoch": 0.536, "loss_ce": 0.08483993262052536, "loss_lvr": 0.5801264643669128, "loss_mode_switch": 0.0, "loss_total": 0.1428525745868683, "step": 1340 }, { "batch_size": 4, "epoch": 0.536, "step": 1340, "tokens_per_device": 4896 }, { "epoch": 0.536, "loss_ce": 0.1854453682899475, "loss_lvr": 0.8978539109230042, "loss_mode_switch": 0.0, "loss_total": 0.2752307653427124, "step": 1340 }, { "batch_size": 4, "epoch": 0.536, "step": 1340, "tokens_per_device": 3864 }, { "epoch": 0.536, "loss_ce": 0.41018107533454895, "loss_lvr": 0.726270854473114, "loss_mode_switch": 0.0, "loss_total": 0.4828081727027893, "step": 1340 }, { "batch_size": 4, "epoch": 0.536, "step": 1340, "tokens_per_device": 4196 }, { "epoch": 0.536, "loss_ce": 0.2313702404499054, "loss_lvr": 0.8522855639457703, "loss_mode_switch": 0.0, "loss_total": 0.3165988028049469, "step": 1340 }, { "batch_size": 4, "epoch": 0.536, "step": 1340, "tokens_per_device": 8140 }, { "epoch": 0.536, "loss_ce": 0.12818065285682678, "loss_lvr": 0.7040894031524658, "loss_mode_switch": 0.0, "loss_total": 0.19858959317207336, "step": 1340 }, { "batch_size": 1, "epoch": 0.536, "step": 1340, "tokens_per_device": 4938 }, { "epoch": 0.536, "loss_ce": 0.006318447645753622, "loss_lvr": 0.3639317750930786, "loss_mode_switch": 0.0, "loss_total": 0.04271162301301956, "step": 1340 }, { "epoch": 0.5364, "grad_norm": 1.2648519277572632, "learning_rate": 4.653730571112159e-06, "loss": 0.278, "step": 1341 }, { "batch_size": 4, "epoch": 0.5364, "step": 1341, "tokens_per_device": 3788 }, { "epoch": 0.5364, "loss_ce": 0.18229687213897705, "loss_lvr": 1.1753870248794556, "loss_mode_switch": 0.0, "loss_total": 0.29983556270599365, "step": 1341 }, { "batch_size": 4, "epoch": 0.5364, "step": 1341, "tokens_per_device": 4552 }, { "epoch": 0.5364, "loss_ce": 0.037984348833560944, "loss_lvr": 0.587145209312439, "loss_mode_switch": 0.0, "loss_total": 0.09669886529445648, "step": 1341 }, { "batch_size": 4, "epoch": 0.5364, "step": 1341, "tokens_per_device": 4276 }, { "epoch": 0.5364, "loss_ce": 0.3880836069583893, "loss_lvr": 0.7837276458740234, "loss_mode_switch": 0.0, "loss_total": 0.4664563834667206, "step": 1341 }, { "batch_size": 4, "epoch": 0.5364, "step": 1341, "tokens_per_device": 2680 }, { "epoch": 0.5364, "loss_ce": 0.1490568369626999, "loss_lvr": 0.984275758266449, "loss_mode_switch": 0.0, "loss_total": 0.24748441576957703, "step": 1341 }, { "batch_size": 4, "epoch": 0.5364, "step": 1341, "tokens_per_device": 6228 }, { "epoch": 0.5364, "loss_ce": 0.5903028249740601, "loss_lvr": 0.9167641997337341, "loss_mode_switch": 0.0, "loss_total": 0.681979238986969, "step": 1341 }, { "batch_size": 1, "epoch": 0.5364, "step": 1341, "tokens_per_device": 5025 }, { "epoch": 0.5364, "loss_ce": 0.011556783691048622, "loss_lvr": 0.39342471957206726, "loss_mode_switch": 0.0, "loss_total": 0.050899259746074677, "step": 1341 }, { "batch_size": 4, "epoch": 0.5364, "step": 1341, "tokens_per_device": 4328 }, { "epoch": 0.5364, "loss_ce": 0.13700008392333984, "loss_lvr": 0.9926404356956482, "loss_mode_switch": 0.0, "loss_total": 0.23626413941383362, "step": 1341 }, { "batch_size": 4, "epoch": 0.5364, "step": 1341, "tokens_per_device": 2540 }, { "epoch": 0.5364, "loss_ce": 0.48300105333328247, "loss_lvr": 0.8508368730545044, "loss_mode_switch": 0.0, "loss_total": 0.568084716796875, "step": 1341 }, { "epoch": 0.5368, "grad_norm": 1.3724288940429688, "learning_rate": 4.647268904936495e-06, "loss": 0.2891, "step": 1342 }, { "batch_size": 4, "epoch": 0.5368, "step": 1342, "tokens_per_device": 4264 }, { "epoch": 0.5368, "loss_ce": 0.46250417828559875, "loss_lvr": 0.7760810256004333, "loss_mode_switch": 0.0, "loss_total": 0.5401122570037842, "step": 1342 }, { "batch_size": 4, "epoch": 0.5368, "step": 1342, "tokens_per_device": 3844 }, { "epoch": 0.5368, "loss_ce": 0.38564154505729675, "loss_lvr": 0.7957501411437988, "loss_mode_switch": 0.0, "loss_total": 0.46521657705307007, "step": 1342 }, { "batch_size": 1, "epoch": 0.5368, "step": 1342, "tokens_per_device": 5194 }, { "epoch": 0.5368, "loss_ce": 0.05641121417284012, "loss_lvr": 0.6051446199417114, "loss_mode_switch": 0.0, "loss_total": 0.1169256716966629, "step": 1342 }, { "batch_size": 4, "epoch": 0.5368, "step": 1342, "tokens_per_device": 5916 }, { "epoch": 0.5368, "loss_ce": 0.032297275960445404, "loss_lvr": 0.7653933763504028, "loss_mode_switch": 0.0, "loss_total": 0.10883661359548569, "step": 1342 }, { "batch_size": 4, "epoch": 0.5368, "step": 1342, "tokens_per_device": 4068 }, { "epoch": 0.5368, "loss_ce": 0.52889084815979, "loss_lvr": 0.9883405566215515, "loss_mode_switch": 0.0, "loss_total": 0.6277248859405518, "step": 1342 }, { "batch_size": 4, "epoch": 0.5368, "step": 1342, "tokens_per_device": 4052 }, { "epoch": 0.5368, "loss_ce": 0.2448679804801941, "loss_lvr": 0.8105040788650513, "loss_mode_switch": 0.0, "loss_total": 0.32591837644577026, "step": 1342 }, { "batch_size": 4, "epoch": 0.5368, "step": 1342, "tokens_per_device": 3880 }, { "epoch": 0.5368, "loss_ce": 0.12371785938739777, "loss_lvr": 1.0773369073867798, "loss_mode_switch": 0.0, "loss_total": 0.23145154118537903, "step": 1342 }, { "batch_size": 1, "epoch": 0.5368, "step": 1342, "tokens_per_device": 4877 }, { "epoch": 0.5368, "loss_ce": 0.00165538489818573, "loss_lvr": 0.5990737676620483, "loss_mode_switch": 0.0, "loss_total": 0.061562761664390564, "step": 1342 }, { "epoch": 0.5372, "grad_norm": 1.4025791883468628, "learning_rate": 4.640807830758433e-06, "loss": 0.3118, "step": 1343 }, { "batch_size": 4, "epoch": 0.5372, "step": 1343, "tokens_per_device": 5120 }, { "epoch": 0.5372, "loss_ce": 0.5005684494972229, "loss_lvr": 0.703357994556427, "loss_mode_switch": 0.0, "loss_total": 0.5709042549133301, "step": 1343 }, { "batch_size": 1, "epoch": 0.5372, "step": 1343, "tokens_per_device": 4933 }, { "epoch": 0.5372, "loss_ce": 0.13311246037483215, "loss_lvr": 0.5823668837547302, "loss_mode_switch": 0.0, "loss_total": 0.19134914875030518, "step": 1343 }, { "batch_size": 4, "epoch": 0.5372, "step": 1343, "tokens_per_device": 3992 }, { "epoch": 0.5372, "loss_ce": 0.18780995905399323, "loss_lvr": 0.7527908682823181, "loss_mode_switch": 0.0, "loss_total": 0.26308906078338623, "step": 1343 }, { "batch_size": 1, "epoch": 0.5372, "step": 1343, "tokens_per_device": 4898 }, { "epoch": 0.5372, "loss_ce": 0.03953808918595314, "loss_lvr": 0.7719129323959351, "loss_mode_switch": 0.0, "loss_total": 0.11672937870025635, "step": 1343 }, { "batch_size": 1, "epoch": 0.5372, "step": 1343, "tokens_per_device": 4907 }, { "epoch": 0.5372, "loss_ce": 0.08489319682121277, "loss_lvr": 0.9029848575592041, "loss_mode_switch": 0.0, "loss_total": 0.17519168555736542, "step": 1343 }, { "batch_size": 1, "epoch": 0.5372, "step": 1343, "tokens_per_device": 4814 }, { "epoch": 0.5372, "loss_ce": 0.11765580624341965, "loss_lvr": 0.4515249729156494, "loss_mode_switch": 0.0, "loss_total": 0.16280829906463623, "step": 1343 }, { "batch_size": 4, "epoch": 0.5372, "step": 1343, "tokens_per_device": 1160 }, { "epoch": 0.5372, "loss_ce": 0.3154178559780121, "loss_lvr": 1.0453580617904663, "loss_mode_switch": 0.0, "loss_total": 0.41995367407798767, "step": 1343 }, { "batch_size": 4, "epoch": 0.5372, "step": 1343, "tokens_per_device": 1316 }, { "epoch": 0.5372, "loss_ce": 0.5503261685371399, "loss_lvr": 1.2296087741851807, "loss_mode_switch": 0.0, "loss_total": 0.673287034034729, "step": 1343 }, { "epoch": 0.5376, "grad_norm": 1.13593590259552, "learning_rate": 4.6343473594217515e-06, "loss": 0.2313, "step": 1344 }, { "batch_size": 4, "epoch": 0.5376, "step": 1344, "tokens_per_device": 4328 }, { "epoch": 0.5376, "loss_ce": 0.3520607650279999, "loss_lvr": 0.942805826663971, "loss_mode_switch": 0.0, "loss_total": 0.446341335773468, "step": 1344 }, { "batch_size": 1, "epoch": 0.5376, "step": 1344, "tokens_per_device": 5050 }, { "epoch": 0.5376, "loss_ce": 0.20723417401313782, "loss_lvr": 0.3355475664138794, "loss_mode_switch": 0.0, "loss_total": 0.24078893661499023, "step": 1344 }, { "batch_size": 4, "epoch": 0.5376, "step": 1344, "tokens_per_device": 2592 }, { "epoch": 0.5376, "loss_ce": 0.35099276900291443, "loss_lvr": 0.9039879441261292, "loss_mode_switch": 0.0, "loss_total": 0.44139155745506287, "step": 1344 }, { "batch_size": 4, "epoch": 0.5376, "step": 1344, "tokens_per_device": 2064 }, { "epoch": 0.5376, "loss_ce": 0.7692855000495911, "loss_lvr": 0.8890140056610107, "loss_mode_switch": 0.0, "loss_total": 0.8581869006156921, "step": 1344 }, { "batch_size": 4, "epoch": 0.5376, "step": 1344, "tokens_per_device": 3784 }, { "epoch": 0.5376, "loss_ce": 0.15034320950508118, "loss_lvr": 1.401432991027832, "loss_mode_switch": 0.0, "loss_total": 0.29048651456832886, "step": 1344 }, { "batch_size": 4, "epoch": 0.5376, "step": 1344, "tokens_per_device": 2864 }, { "epoch": 0.5376, "loss_ce": 0.2905787229537964, "loss_lvr": 0.7510329484939575, "loss_mode_switch": 0.0, "loss_total": 0.3656820058822632, "step": 1344 }, { "batch_size": 4, "epoch": 0.5376, "step": 1344, "tokens_per_device": 2572 }, { "epoch": 0.5376, "loss_ce": 0.11888812482357025, "loss_lvr": 0.788224458694458, "loss_mode_switch": 0.0, "loss_total": 0.1977105736732483, "step": 1344 }, { "batch_size": 1, "epoch": 0.5376, "step": 1344, "tokens_per_device": 4967 }, { "epoch": 0.5376, "loss_ce": 0.008471496403217316, "loss_lvr": 0.9773212671279907, "loss_mode_switch": 0.0, "loss_total": 0.10620362311601639, "step": 1344 }, { "epoch": 0.538, "grad_norm": 1.3765912055969238, "learning_rate": 4.627887501769231e-06, "loss": 0.3164, "step": 1345 }, { "batch_size": 4, "epoch": 0.538, "step": 1345, "tokens_per_device": 2744 }, { "epoch": 0.538, "loss_ce": 0.03163131698966026, "loss_lvr": 0.44610291719436646, "loss_mode_switch": 0.0, "loss_total": 0.07624161243438721, "step": 1345 }, { "batch_size": 4, "epoch": 0.538, "step": 1345, "tokens_per_device": 9216 }, { "epoch": 0.538, "loss_ce": 0.3928866982460022, "loss_lvr": 0.8412814140319824, "loss_mode_switch": 0.0, "loss_total": 0.47701483964920044, "step": 1345 }, { "batch_size": 1, "epoch": 0.538, "step": 1345, "tokens_per_device": 5215 }, { "epoch": 0.538, "loss_ce": 0.05150838568806648, "loss_lvr": 0.42017096281051636, "loss_mode_switch": 0.0, "loss_total": 0.0935254842042923, "step": 1345 }, { "batch_size": 1, "epoch": 0.538, "step": 1345, "tokens_per_device": 5131 }, { "epoch": 0.538, "loss_ce": 0.005429589655250311, "loss_lvr": 0.5196936726570129, "loss_mode_switch": 0.0, "loss_total": 0.0573989562690258, "step": 1345 }, { "batch_size": 4, "epoch": 0.538, "step": 1345, "tokens_per_device": 2688 }, { "epoch": 0.538, "loss_ce": 0.10428227484226227, "loss_lvr": 0.7040575742721558, "loss_mode_switch": 0.0, "loss_total": 0.17468804121017456, "step": 1345 }, { "batch_size": 4, "epoch": 0.538, "step": 1345, "tokens_per_device": 4212 }, { "epoch": 0.538, "loss_ce": 0.4255351424217224, "loss_lvr": 1.0299526453018188, "loss_mode_switch": 0.0, "loss_total": 0.5285304188728333, "step": 1345 }, { "batch_size": 1, "epoch": 0.538, "step": 1345, "tokens_per_device": 5132 }, { "epoch": 0.538, "loss_ce": 0.1854514628648758, "loss_lvr": 0.6590157747268677, "loss_mode_switch": 0.0, "loss_total": 0.25135302543640137, "step": 1345 }, { "batch_size": 4, "epoch": 0.538, "step": 1345, "tokens_per_device": 4200 }, { "epoch": 0.538, "loss_ce": 0.4121027886867523, "loss_lvr": 0.9014714360237122, "loss_mode_switch": 0.0, "loss_total": 0.5022499561309814, "step": 1345 }, { "epoch": 0.5384, "grad_norm": 1.3413852453231812, "learning_rate": 4.621428268642613e-06, "loss": 0.2925, "step": 1346 }, { "batch_size": 1, "epoch": 0.5384, "step": 1346, "tokens_per_device": 5159 }, { "epoch": 0.5384, "loss_ce": 0.0010065946262329817, "loss_lvr": 0.3295198678970337, "loss_mode_switch": 0.0, "loss_total": 0.03395858407020569, "step": 1346 }, { "batch_size": 1, "epoch": 0.5384, "step": 1346, "tokens_per_device": 5171 }, { "epoch": 0.5384, "loss_ce": 0.009727653115987778, "loss_lvr": 0.4712940752506256, "loss_mode_switch": 0.0, "loss_total": 0.05685706064105034, "step": 1346 }, { "batch_size": 4, "epoch": 0.5384, "step": 1346, "tokens_per_device": 7052 }, { "epoch": 0.5384, "loss_ce": 0.43121880292892456, "loss_lvr": 0.7253859639167786, "loss_mode_switch": 0.0, "loss_total": 0.5037574172019958, "step": 1346 }, { "batch_size": 4, "epoch": 0.5384, "step": 1346, "tokens_per_device": 4256 }, { "epoch": 0.5384, "loss_ce": 0.414762020111084, "loss_lvr": 1.8317939043045044, "loss_mode_switch": 0.0, "loss_total": 0.5979413986206055, "step": 1346 }, { "batch_size": 4, "epoch": 0.5384, "step": 1346, "tokens_per_device": 1420 }, { "epoch": 0.5384, "loss_ce": 0.39988642930984497, "loss_lvr": 0.887322187423706, "loss_mode_switch": 0.0, "loss_total": 0.4886186420917511, "step": 1346 }, { "batch_size": 4, "epoch": 0.5384, "step": 1346, "tokens_per_device": 7612 }, { "epoch": 0.5384, "loss_ce": 0.599528431892395, "loss_lvr": 0.489408016204834, "loss_mode_switch": 0.0, "loss_total": 0.6484692096710205, "step": 1346 }, { "batch_size": 4, "epoch": 0.5384, "step": 1346, "tokens_per_device": 7848 }, { "epoch": 0.5384, "loss_ce": 0.05580311268568039, "loss_lvr": 0.5715852975845337, "loss_mode_switch": 0.0, "loss_total": 0.11296164244413376, "step": 1346 }, { "batch_size": 1, "epoch": 0.5384, "step": 1346, "tokens_per_device": 4899 }, { "epoch": 0.5384, "loss_ce": 0.025724325329065323, "loss_lvr": 0.9081394076347351, "loss_mode_switch": 0.0, "loss_total": 0.11653827130794525, "step": 1346 }, { "epoch": 0.5388, "grad_norm": 1.3571363687515259, "learning_rate": 4.614969670882594e-06, "loss": 0.302, "step": 1347 }, { "batch_size": 4, "epoch": 0.5388, "step": 1347, "tokens_per_device": 3908 }, { "epoch": 0.5388, "loss_ce": 0.23632803559303284, "loss_lvr": 0.6370241045951843, "loss_mode_switch": 0.0, "loss_total": 0.3000304400920868, "step": 1347 }, { "batch_size": 1, "epoch": 0.5388, "step": 1347, "tokens_per_device": 5108 }, { "epoch": 0.5388, "loss_ce": 0.012021016329526901, "loss_lvr": 0.8049963116645813, "loss_mode_switch": 0.0, "loss_total": 0.09252065420150757, "step": 1347 }, { "batch_size": 1, "epoch": 0.5388, "step": 1347, "tokens_per_device": 4767 }, { "epoch": 0.5388, "loss_ce": 0.11320256441831589, "loss_lvr": 0.23212061822414398, "loss_mode_switch": 0.0, "loss_total": 0.13641463220119476, "step": 1347 }, { "batch_size": 1, "epoch": 0.5388, "step": 1347, "tokens_per_device": 4772 }, { "epoch": 0.5388, "loss_ce": 0.0033976142294704914, "loss_lvr": 0.4692189395427704, "loss_mode_switch": 0.0, "loss_total": 0.05031950771808624, "step": 1347 }, { "batch_size": 4, "epoch": 0.5388, "step": 1347, "tokens_per_device": 4232 }, { "epoch": 0.5388, "loss_ce": 0.33791032433509827, "loss_lvr": 0.9571715593338013, "loss_mode_switch": 0.0, "loss_total": 0.43362748622894287, "step": 1347 }, { "batch_size": 1, "epoch": 0.5388, "step": 1347, "tokens_per_device": 4815 }, { "epoch": 0.5388, "loss_ce": 0.01651151292026043, "loss_lvr": 0.37046775221824646, "loss_mode_switch": 0.0, "loss_total": 0.053558290004730225, "step": 1347 }, { "batch_size": 4, "epoch": 0.5388, "step": 1347, "tokens_per_device": 4720 }, { "epoch": 0.5388, "loss_ce": 0.4566432237625122, "loss_lvr": 0.9293012022972107, "loss_mode_switch": 0.0, "loss_total": 0.5495733618736267, "step": 1347 }, { "batch_size": 1, "epoch": 0.5388, "step": 1347, "tokens_per_device": 5102 }, { "epoch": 0.5388, "loss_ce": 0.0026807221584022045, "loss_lvr": 0.4154365360736847, "loss_mode_switch": 0.0, "loss_total": 0.04422437772154808, "step": 1347 }, { "epoch": 0.5392, "grad_norm": 1.6788231134414673, "learning_rate": 4.608511719328803e-06, "loss": 0.3174, "step": 1348 }, { "batch_size": 4, "epoch": 0.5392, "step": 1348, "tokens_per_device": 12644 }, { "epoch": 0.5392, "loss_ce": 0.014654500409960747, "loss_lvr": 0.7341636419296265, "loss_mode_switch": 0.0, "loss_total": 0.08807086944580078, "step": 1348 }, { "batch_size": 4, "epoch": 0.5392, "step": 1348, "tokens_per_device": 4696 }, { "epoch": 0.5392, "loss_ce": 0.3797101676464081, "loss_lvr": 0.6076700091362, "loss_mode_switch": 0.0, "loss_total": 0.4404771625995636, "step": 1348 }, { "batch_size": 4, "epoch": 0.5392, "step": 1348, "tokens_per_device": 4244 }, { "epoch": 0.5392, "loss_ce": 0.4282599985599518, "loss_lvr": 1.152707815170288, "loss_mode_switch": 0.0, "loss_total": 0.5435307621955872, "step": 1348 }, { "batch_size": 4, "epoch": 0.5392, "step": 1348, "tokens_per_device": 4184 }, { "epoch": 0.5392, "loss_ce": 0.2909833788871765, "loss_lvr": 0.779290497303009, "loss_mode_switch": 0.0, "loss_total": 0.3689124286174774, "step": 1348 }, { "batch_size": 1, "epoch": 0.5392, "step": 1348, "tokens_per_device": 4875 }, { "epoch": 0.5392, "loss_ce": 0.0003042455646209419, "loss_lvr": 0.2597261071205139, "loss_mode_switch": 0.0, "loss_total": 0.026276856660842896, "step": 1348 }, { "batch_size": 1, "epoch": 0.5392, "step": 1348, "tokens_per_device": 5100 }, { "epoch": 0.5392, "loss_ce": 0.0008858671644702554, "loss_lvr": 0.7353482842445374, "loss_mode_switch": 0.0, "loss_total": 0.07442069798707962, "step": 1348 }, { "batch_size": 4, "epoch": 0.5392, "step": 1348, "tokens_per_device": 5112 }, { "epoch": 0.5392, "loss_ce": 0.19069485366344452, "loss_lvr": 0.7506851553916931, "loss_mode_switch": 0.0, "loss_total": 0.26576337218284607, "step": 1348 }, { "batch_size": 4, "epoch": 0.5392, "step": 1348, "tokens_per_device": 8312 }, { "epoch": 0.5392, "loss_ce": 0.030055036768317223, "loss_lvr": 0.7302414774894714, "loss_mode_switch": 0.0, "loss_total": 0.1030791848897934, "step": 1348 }, { "epoch": 0.5396, "grad_norm": 1.1930713653564453, "learning_rate": 4.602054424819782e-06, "loss": 0.2877, "step": 1349 }, { "batch_size": 4, "epoch": 0.5396, "step": 1349, "tokens_per_device": 8216 }, { "epoch": 0.5396, "loss_ce": 0.007925674319267273, "loss_lvr": 1.251299262046814, "loss_mode_switch": 0.0, "loss_total": 0.13305559754371643, "step": 1349 }, { "batch_size": 4, "epoch": 0.5396, "step": 1349, "tokens_per_device": 4776 }, { "epoch": 0.5396, "loss_ce": 0.594915509223938, "loss_lvr": 0.8766830563545227, "loss_mode_switch": 0.0, "loss_total": 0.6825838088989258, "step": 1349 }, { "batch_size": 1, "epoch": 0.5396, "step": 1349, "tokens_per_device": 4366 }, { "epoch": 0.5396, "loss_ce": 0.012818682007491589, "loss_lvr": 0.4976581335067749, "loss_mode_switch": 0.0, "loss_total": 0.06258449703454971, "step": 1349 }, { "batch_size": 1, "epoch": 0.5396, "step": 1349, "tokens_per_device": 4924 }, { "epoch": 0.5396, "loss_ce": 0.11841733753681183, "loss_lvr": 0.36080607771873474, "loss_mode_switch": 0.0, "loss_total": 0.15449795126914978, "step": 1349 }, { "batch_size": 4, "epoch": 0.5396, "step": 1349, "tokens_per_device": 5116 }, { "epoch": 0.5396, "loss_ce": 0.09332691878080368, "loss_lvr": 1.05814528465271, "loss_mode_switch": 0.0, "loss_total": 0.19914144277572632, "step": 1349 }, { "batch_size": 4, "epoch": 0.5396, "step": 1349, "tokens_per_device": 1956 }, { "epoch": 0.5396, "loss_ce": 0.3784247934818268, "loss_lvr": 1.0482457876205444, "loss_mode_switch": 0.0, "loss_total": 0.48324936628341675, "step": 1349 }, { "batch_size": 4, "epoch": 0.5396, "step": 1349, "tokens_per_device": 2648 }, { "epoch": 0.5396, "loss_ce": 0.4664563536643982, "loss_lvr": 0.8013486266136169, "loss_mode_switch": 0.0, "loss_total": 0.5465912222862244, "step": 1349 }, { "batch_size": 1, "epoch": 0.5396, "step": 1349, "tokens_per_device": 5134 }, { "epoch": 0.5396, "loss_ce": 0.0035834461450576782, "loss_lvr": 0.4047107696533203, "loss_mode_switch": 0.0, "loss_total": 0.04405452311038971, "step": 1349 }, { "epoch": 0.54, "grad_norm": 1.4065864086151123, "learning_rate": 4.59559779819298e-06, "loss": 0.307, "step": 1350 }, { "batch_size": 1, "epoch": 0.54, "step": 1350, "tokens_per_device": 5097 }, { "epoch": 0.54, "loss_ce": 0.049224384129047394, "loss_lvr": 0.29351115226745605, "loss_mode_switch": 0.0, "loss_total": 0.078575499355793, "step": 1350 }, { "batch_size": 4, "epoch": 0.54, "step": 1350, "tokens_per_device": 8456 }, { "epoch": 0.54, "loss_ce": 0.19487127661705017, "loss_lvr": 0.5569570660591125, "loss_mode_switch": 0.0, "loss_total": 0.2505669891834259, "step": 1350 }, { "batch_size": 4, "epoch": 0.54, "step": 1350, "tokens_per_device": 6264 }, { "epoch": 0.54, "loss_ce": 0.04479638487100601, "loss_lvr": 0.6689234375953674, "loss_mode_switch": 0.0, "loss_total": 0.11168873310089111, "step": 1350 }, { "batch_size": 4, "epoch": 0.54, "step": 1350, "tokens_per_device": 4772 }, { "epoch": 0.54, "loss_ce": 0.1028619334101677, "loss_lvr": 0.953861653804779, "loss_mode_switch": 0.0, "loss_total": 0.19824810326099396, "step": 1350 }, { "batch_size": 4, "epoch": 0.54, "step": 1350, "tokens_per_device": 4416 }, { "epoch": 0.54, "loss_ce": 0.1826593577861786, "loss_lvr": 0.7788008451461792, "loss_mode_switch": 0.0, "loss_total": 0.2605394423007965, "step": 1350 }, { "batch_size": 1, "epoch": 0.54, "step": 1350, "tokens_per_device": 4885 }, { "epoch": 0.54, "loss_ce": 0.1712127923965454, "loss_lvr": 0.11689324676990509, "loss_mode_switch": 0.0, "loss_total": 0.18290211260318756, "step": 1350 }, { "batch_size": 4, "epoch": 0.54, "step": 1350, "tokens_per_device": 4404 }, { "epoch": 0.54, "loss_ce": 0.24339257180690765, "loss_lvr": 0.8006194829940796, "loss_mode_switch": 0.0, "loss_total": 0.32345452904701233, "step": 1350 }, { "batch_size": 4, "epoch": 0.54, "step": 1350, "tokens_per_device": 3712 }, { "epoch": 0.54, "loss_ce": 0.3140243589878082, "loss_lvr": 0.8759337663650513, "loss_mode_switch": 0.0, "loss_total": 0.40161773562431335, "step": 1350 }, { "epoch": 0.5404, "grad_norm": 1.6756703853607178, "learning_rate": 4.589141850284712e-06, "loss": 0.2632, "step": 1351 }, { "batch_size": 4, "epoch": 0.5404, "step": 1351, "tokens_per_device": 1288 }, { "epoch": 0.5404, "loss_ce": 0.11039070039987564, "loss_lvr": 0.9165763258934021, "loss_mode_switch": 0.0, "loss_total": 0.20204833149909973, "step": 1351 }, { "batch_size": 4, "epoch": 0.5404, "step": 1351, "tokens_per_device": 4228 }, { "epoch": 0.5404, "loss_ce": 0.05567101016640663, "loss_lvr": 0.8838735222816467, "loss_mode_switch": 0.0, "loss_total": 0.14405836164951324, "step": 1351 }, { "batch_size": 1, "epoch": 0.5404, "step": 1351, "tokens_per_device": 5149 }, { "epoch": 0.5404, "loss_ce": 0.2004503458738327, "loss_lvr": 0.13887812197208405, "loss_mode_switch": 0.0, "loss_total": 0.21433815360069275, "step": 1351 }, { "batch_size": 4, "epoch": 0.5404, "step": 1351, "tokens_per_device": 3312 }, { "epoch": 0.5404, "loss_ce": 0.5848017930984497, "loss_lvr": 0.9200897216796875, "loss_mode_switch": 0.0, "loss_total": 0.6768107414245605, "step": 1351 }, { "batch_size": 4, "epoch": 0.5404, "step": 1351, "tokens_per_device": 4260 }, { "epoch": 0.5404, "loss_ce": 0.5171127915382385, "loss_lvr": 0.9295265674591064, "loss_mode_switch": 0.0, "loss_total": 0.6100654602050781, "step": 1351 }, { "batch_size": 4, "epoch": 0.5404, "step": 1351, "tokens_per_device": 6292 }, { "epoch": 0.5404, "loss_ce": 0.002026007976382971, "loss_lvr": 1.0926051139831543, "loss_mode_switch": 0.0, "loss_total": 0.11128652095794678, "step": 1351 }, { "batch_size": 4, "epoch": 0.5404, "step": 1351, "tokens_per_device": 5680 }, { "epoch": 0.5404, "loss_ce": 0.35618582367897034, "loss_lvr": 0.6756252646446228, "loss_mode_switch": 0.0, "loss_total": 0.42374834418296814, "step": 1351 }, { "batch_size": 1, "epoch": 0.5404, "step": 1351, "tokens_per_device": 5177 }, { "epoch": 0.5404, "loss_ce": 0.04425423964858055, "loss_lvr": 0.7936477065086365, "loss_mode_switch": 0.0, "loss_total": 0.12361900508403778, "step": 1351 }, { "epoch": 0.5408, "grad_norm": 1.2301665544509888, "learning_rate": 4.5826865919301645e-06, "loss": 0.2983, "step": 1352 }, { "batch_size": 1, "epoch": 0.5408, "step": 1352, "tokens_per_device": 5642 }, { "epoch": 0.5408, "loss_ce": 0.043586865067481995, "loss_lvr": 0.43312087655067444, "loss_mode_switch": 0.0, "loss_total": 0.08689895272254944, "step": 1352 }, { "batch_size": 4, "epoch": 0.5408, "step": 1352, "tokens_per_device": 4088 }, { "epoch": 0.5408, "loss_ce": 0.2318880558013916, "loss_lvr": 0.9016867280006409, "loss_mode_switch": 0.0, "loss_total": 0.32205674052238464, "step": 1352 }, { "batch_size": 4, "epoch": 0.5408, "step": 1352, "tokens_per_device": 7340 }, { "epoch": 0.5408, "loss_ce": 0.14763294160366058, "loss_lvr": 0.7401054501533508, "loss_mode_switch": 0.0, "loss_total": 0.22164347767829895, "step": 1352 }, { "batch_size": 4, "epoch": 0.5408, "step": 1352, "tokens_per_device": 4260 }, { "epoch": 0.5408, "loss_ce": 0.4007563889026642, "loss_lvr": 1.2144864797592163, "loss_mode_switch": 0.0, "loss_total": 0.5222050547599792, "step": 1352 }, { "batch_size": 1, "epoch": 0.5408, "step": 1352, "tokens_per_device": 4878 }, { "epoch": 0.5408, "loss_ce": 0.24463312327861786, "loss_lvr": 0.4628828763961792, "loss_mode_switch": 0.0, "loss_total": 0.2909214198589325, "step": 1352 }, { "batch_size": 4, "epoch": 0.5408, "step": 1352, "tokens_per_device": 4392 }, { "epoch": 0.5408, "loss_ce": 0.5759196281433105, "loss_lvr": 0.8793075084686279, "loss_mode_switch": 0.0, "loss_total": 0.6638503670692444, "step": 1352 }, { "batch_size": 4, "epoch": 0.5408, "step": 1352, "tokens_per_device": 1292 }, { "epoch": 0.5408, "loss_ce": 0.2614629566669464, "loss_lvr": 0.871786892414093, "loss_mode_switch": 0.0, "loss_total": 0.34864163398742676, "step": 1352 }, { "batch_size": 4, "epoch": 0.5408, "step": 1352, "tokens_per_device": 8272 }, { "epoch": 0.5408, "loss_ce": 0.08316680043935776, "loss_lvr": 0.7587776780128479, "loss_mode_switch": 0.0, "loss_total": 0.1590445637702942, "step": 1352 }, { "epoch": 0.5412, "grad_norm": 1.5628001689910889, "learning_rate": 4.5762320339633585e-06, "loss": 0.3207, "step": 1353 }, { "batch_size": 1, "epoch": 0.5412, "step": 1353, "tokens_per_device": 5153 }, { "epoch": 0.5412, "loss_ce": 0.005577476695179939, "loss_lvr": 0.356949120759964, "loss_mode_switch": 0.0, "loss_total": 0.04127238690853119, "step": 1353 }, { "batch_size": 4, "epoch": 0.5412, "step": 1353, "tokens_per_device": 1708 }, { "epoch": 0.5412, "loss_ce": 0.19998185336589813, "loss_lvr": 0.8833602070808411, "loss_mode_switch": 0.0, "loss_total": 0.28831785917282104, "step": 1353 }, { "batch_size": 1, "epoch": 0.5412, "step": 1353, "tokens_per_device": 4875 }, { "epoch": 0.5412, "loss_ce": 0.003382973140105605, "loss_lvr": 1.1949318647384644, "loss_mode_switch": 0.0, "loss_total": 0.12287615984678268, "step": 1353 }, { "batch_size": 1, "epoch": 0.5412, "step": 1353, "tokens_per_device": 7212 }, { "epoch": 0.5412, "loss_ce": 0.0011724366340786219, "loss_lvr": 0.4460090398788452, "loss_mode_switch": 0.0, "loss_total": 0.04577334225177765, "step": 1353 }, { "batch_size": 1, "epoch": 0.5412, "step": 1353, "tokens_per_device": 7010 }, { "epoch": 0.5412, "loss_ce": 0.004588875453919172, "loss_lvr": 0.38033053278923035, "loss_mode_switch": 0.0, "loss_total": 0.042621929198503494, "step": 1353 }, { "batch_size": 1, "epoch": 0.5412, "step": 1353, "tokens_per_device": 4600 }, { "epoch": 0.5412, "loss_ce": 0.0008208686485886574, "loss_lvr": 0.515777587890625, "loss_mode_switch": 0.0, "loss_total": 0.05239862576127052, "step": 1353 }, { "batch_size": 1, "epoch": 0.5412, "step": 1353, "tokens_per_device": 4823 }, { "epoch": 0.5412, "loss_ce": 0.008663604967296124, "loss_lvr": 0.5436667799949646, "loss_mode_switch": 0.0, "loss_total": 0.06303028017282486, "step": 1353 }, { "batch_size": 1, "epoch": 0.5412, "step": 1353, "tokens_per_device": 4858 }, { "epoch": 0.5412, "loss_ce": 0.05642015114426613, "loss_lvr": 0.21310202777385712, "loss_mode_switch": 0.0, "loss_total": 0.07773035764694214, "step": 1353 }, { "epoch": 0.5416, "grad_norm": 1.149611234664917, "learning_rate": 4.569778187217144e-06, "loss": 0.2603, "step": 1354 }, { "batch_size": 4, "epoch": 0.5416, "step": 1354, "tokens_per_device": 4188 }, { "epoch": 0.5416, "loss_ce": 0.1633327156305313, "loss_lvr": 0.8172999024391174, "loss_mode_switch": 0.0, "loss_total": 0.2450627088546753, "step": 1354 }, { "batch_size": 4, "epoch": 0.5416, "step": 1354, "tokens_per_device": 4060 }, { "epoch": 0.5416, "loss_ce": 0.6563096046447754, "loss_lvr": 0.7536584138870239, "loss_mode_switch": 0.0, "loss_total": 0.7316754460334778, "step": 1354 }, { "batch_size": 4, "epoch": 0.5416, "step": 1354, "tokens_per_device": 3320 }, { "epoch": 0.5416, "loss_ce": 0.4387771785259247, "loss_lvr": 0.8941735029220581, "loss_mode_switch": 0.0, "loss_total": 0.5281945466995239, "step": 1354 }, { "batch_size": 4, "epoch": 0.5416, "step": 1354, "tokens_per_device": 4348 }, { "epoch": 0.5416, "loss_ce": 0.4300476312637329, "loss_lvr": 0.7670031189918518, "loss_mode_switch": 0.0, "loss_total": 0.5067479610443115, "step": 1354 }, { "batch_size": 1, "epoch": 0.5416, "step": 1354, "tokens_per_device": 5052 }, { "epoch": 0.5416, "loss_ce": 0.030340876430273056, "loss_lvr": 0.5906273126602173, "loss_mode_switch": 0.0, "loss_total": 0.08940360695123672, "step": 1354 }, { "batch_size": 4, "epoch": 0.5416, "step": 1354, "tokens_per_device": 4720 }, { "epoch": 0.5416, "loss_ce": 0.46979790925979614, "loss_lvr": 0.8438627123832703, "loss_mode_switch": 0.0, "loss_total": 0.5541841983795166, "step": 1354 }, { "batch_size": 4, "epoch": 0.5416, "step": 1354, "tokens_per_device": 5312 }, { "epoch": 0.5416, "loss_ce": 0.10177736729383469, "loss_lvr": 0.8531948328018188, "loss_mode_switch": 0.0, "loss_total": 0.18709684908390045, "step": 1354 }, { "batch_size": 4, "epoch": 0.5416, "step": 1354, "tokens_per_device": 1760 }, { "epoch": 0.5416, "loss_ce": 0.6487231850624084, "loss_lvr": 0.9634501934051514, "loss_mode_switch": 0.0, "loss_total": 0.7450681924819946, "step": 1354 }, { "epoch": 0.542, "grad_norm": 1.811728835105896, "learning_rate": 4.5633250625231806e-06, "loss": 0.3161, "step": 1355 }, { "batch_size": 4, "epoch": 0.542, "step": 1355, "tokens_per_device": 4200 }, { "epoch": 0.542, "loss_ce": 0.09490972757339478, "loss_lvr": 0.8504708409309387, "loss_mode_switch": 0.0, "loss_total": 0.1799568235874176, "step": 1355 }, { "batch_size": 4, "epoch": 0.542, "step": 1355, "tokens_per_device": 3528 }, { "epoch": 0.542, "loss_ce": 0.3012639582157135, "loss_lvr": 0.8475229144096375, "loss_mode_switch": 0.0, "loss_total": 0.38601624965667725, "step": 1355 }, { "batch_size": 1, "epoch": 0.542, "step": 1355, "tokens_per_device": 5184 }, { "epoch": 0.542, "loss_ce": 1.3160616159439087, "loss_lvr": 0.6108736991882324, "loss_mode_switch": 0.0, "loss_total": 1.377148985862732, "step": 1355 }, { "batch_size": 1, "epoch": 0.542, "step": 1355, "tokens_per_device": 4889 }, { "epoch": 0.542, "loss_ce": 0.0050061349757015705, "loss_lvr": 0.9116207361221313, "loss_mode_switch": 0.0, "loss_total": 0.09616821259260178, "step": 1355 }, { "batch_size": 4, "epoch": 0.542, "step": 1355, "tokens_per_device": 3768 }, { "epoch": 0.542, "loss_ce": 0.142299085855484, "loss_lvr": 0.9733040928840637, "loss_mode_switch": 0.0, "loss_total": 0.23962950706481934, "step": 1355 }, { "batch_size": 1, "epoch": 0.542, "step": 1355, "tokens_per_device": 5211 }, { "epoch": 0.542, "loss_ce": 0.0011125752935186028, "loss_lvr": 0.4562876522541046, "loss_mode_switch": 0.0, "loss_total": 0.04674134403467178, "step": 1355 }, { "batch_size": 4, "epoch": 0.542, "step": 1355, "tokens_per_device": 4384 }, { "epoch": 0.542, "loss_ce": 0.276662141084671, "loss_lvr": 0.9810232520103455, "loss_mode_switch": 0.0, "loss_total": 0.37476447224617004, "step": 1355 }, { "batch_size": 4, "epoch": 0.542, "step": 1355, "tokens_per_device": 5544 }, { "epoch": 0.542, "loss_ce": 0.09572036564350128, "loss_lvr": 0.8445669412612915, "loss_mode_switch": 0.0, "loss_total": 0.18017706274986267, "step": 1355 }, { "epoch": 0.5424, "grad_norm": 1.4018999338150024, "learning_rate": 4.556872670711908e-06, "loss": 0.3183, "step": 1356 }, { "batch_size": 4, "epoch": 0.5424, "step": 1356, "tokens_per_device": 1328 }, { "epoch": 0.5424, "loss_ce": 0.15724192559719086, "loss_lvr": 0.8820320963859558, "loss_mode_switch": 0.0, "loss_total": 0.2454451322555542, "step": 1356 }, { "batch_size": 1, "epoch": 0.5424, "step": 1356, "tokens_per_device": 4884 }, { "epoch": 0.5424, "loss_ce": 0.25205233693122864, "loss_lvr": 0.4547263979911804, "loss_mode_switch": 0.0, "loss_total": 0.29752498865127563, "step": 1356 }, { "batch_size": 4, "epoch": 0.5424, "step": 1356, "tokens_per_device": 5792 }, { "epoch": 0.5424, "loss_ce": 0.2785400152206421, "loss_lvr": 0.43975555896759033, "loss_mode_switch": 0.0, "loss_total": 0.3225155770778656, "step": 1356 }, { "batch_size": 1, "epoch": 0.5424, "step": 1356, "tokens_per_device": 4866 }, { "epoch": 0.5424, "loss_ce": 0.2873503565788269, "loss_lvr": 0.24959377944469452, "loss_mode_switch": 0.0, "loss_total": 0.31230974197387695, "step": 1356 }, { "batch_size": 4, "epoch": 0.5424, "step": 1356, "tokens_per_device": 4880 }, { "epoch": 0.5424, "loss_ce": 0.3098682463169098, "loss_lvr": 0.7237539291381836, "loss_mode_switch": 0.0, "loss_total": 0.38224363327026367, "step": 1356 }, { "batch_size": 4, "epoch": 0.5424, "step": 1356, "tokens_per_device": 5728 }, { "epoch": 0.5424, "loss_ce": 0.035973865538835526, "loss_lvr": 0.9717943072319031, "loss_mode_switch": 0.0, "loss_total": 0.1331533044576645, "step": 1356 }, { "batch_size": 1, "epoch": 0.5424, "step": 1356, "tokens_per_device": 4946 }, { "epoch": 0.5424, "loss_ce": 0.12061072140932083, "loss_lvr": 0.44009220600128174, "loss_mode_switch": 0.0, "loss_total": 0.16461993753910065, "step": 1356 }, { "batch_size": 1, "epoch": 0.5424, "step": 1356, "tokens_per_device": 4741 }, { "epoch": 0.5424, "loss_ce": 0.02085822820663452, "loss_lvr": 0.22973236441612244, "loss_mode_switch": 0.0, "loss_total": 0.043831467628479004, "step": 1356 }, { "epoch": 0.5428, "grad_norm": 1.4040623903274536, "learning_rate": 4.550421022612542e-06, "loss": 0.288, "step": 1357 }, { "batch_size": 4, "epoch": 0.5428, "step": 1357, "tokens_per_device": 1392 }, { "epoch": 0.5428, "loss_ce": 0.284574031829834, "loss_lvr": 1.0175964832305908, "loss_mode_switch": 0.0, "loss_total": 0.3863336741924286, "step": 1357 }, { "batch_size": 1, "epoch": 0.5428, "step": 1357, "tokens_per_device": 5271 }, { "epoch": 0.5428, "loss_ce": 0.0014796718023717403, "loss_lvr": 0.6090553402900696, "loss_mode_switch": 0.0, "loss_total": 0.062385205179452896, "step": 1357 }, { "batch_size": 4, "epoch": 0.5428, "step": 1357, "tokens_per_device": 1308 }, { "epoch": 0.5428, "loss_ce": 0.14812316000461578, "loss_lvr": 1.125455379486084, "loss_mode_switch": 0.0, "loss_total": 0.26066869497299194, "step": 1357 }, { "batch_size": 4, "epoch": 0.5428, "step": 1357, "tokens_per_device": 4300 }, { "epoch": 0.5428, "loss_ce": 0.17724238336086273, "loss_lvr": 1.1394764184951782, "loss_mode_switch": 0.0, "loss_total": 0.2911900281906128, "step": 1357 }, { "batch_size": 4, "epoch": 0.5428, "step": 1357, "tokens_per_device": 4464 }, { "epoch": 0.5428, "loss_ce": 0.04178815335035324, "loss_lvr": 0.7532625198364258, "loss_mode_switch": 0.0, "loss_total": 0.11711440980434418, "step": 1357 }, { "batch_size": 4, "epoch": 0.5428, "step": 1357, "tokens_per_device": 4260 }, { "epoch": 0.5428, "loss_ce": 0.0315123088657856, "loss_lvr": 0.7459709644317627, "loss_mode_switch": 0.0, "loss_total": 0.10610941052436829, "step": 1357 }, { "batch_size": 1, "epoch": 0.5428, "step": 1357, "tokens_per_device": 5418 }, { "epoch": 0.5428, "loss_ce": 0.5079542398452759, "loss_lvr": 0.4861355423927307, "loss_mode_switch": 0.0, "loss_total": 0.5565677881240845, "step": 1357 }, { "batch_size": 4, "epoch": 0.5428, "step": 1357, "tokens_per_device": 11484 }, { "epoch": 0.5428, "loss_ce": 0.0013483648654073477, "loss_lvr": 0.8738032579421997, "loss_mode_switch": 0.0, "loss_total": 0.08872868865728378, "step": 1357 }, { "epoch": 0.5432, "grad_norm": 1.9565755128860474, "learning_rate": 4.543970129053047e-06, "loss": 0.347, "step": 1358 }, { "batch_size": 4, "epoch": 0.5432, "step": 1358, "tokens_per_device": 4256 }, { "epoch": 0.5432, "loss_ce": 0.011635362170636654, "loss_lvr": 1.0175985097885132, "loss_mode_switch": 0.0, "loss_total": 0.11339521408081055, "step": 1358 }, { "batch_size": 4, "epoch": 0.5432, "step": 1358, "tokens_per_device": 2960 }, { "epoch": 0.5432, "loss_ce": 0.39349761605262756, "loss_lvr": 0.9876108169555664, "loss_mode_switch": 0.0, "loss_total": 0.4922586977481842, "step": 1358 }, { "batch_size": 1, "epoch": 0.5432, "step": 1358, "tokens_per_device": 6471 }, { "epoch": 0.5432, "loss_ce": 0.03044377639889717, "loss_lvr": 0.5436496734619141, "loss_mode_switch": 0.0, "loss_total": 0.08480874449014664, "step": 1358 }, { "batch_size": 1, "epoch": 0.5432, "step": 1358, "tokens_per_device": 4942 }, { "epoch": 0.5432, "loss_ce": 0.0004387391381897032, "loss_lvr": 0.5765509605407715, "loss_mode_switch": 0.0, "loss_total": 0.0580938346683979, "step": 1358 }, { "batch_size": 4, "epoch": 0.5432, "step": 1358, "tokens_per_device": 6788 }, { "epoch": 0.5432, "loss_ce": 0.618602454662323, "loss_lvr": 0.7983291149139404, "loss_mode_switch": 0.0, "loss_total": 0.698435366153717, "step": 1358 }, { "batch_size": 4, "epoch": 0.5432, "step": 1358, "tokens_per_device": 5016 }, { "epoch": 0.5432, "loss_ce": 0.1037774309515953, "loss_lvr": 0.6444840431213379, "loss_mode_switch": 0.0, "loss_total": 0.16822583973407745, "step": 1358 }, { "batch_size": 4, "epoch": 0.5432, "step": 1358, "tokens_per_device": 4564 }, { "epoch": 0.5432, "loss_ce": 0.18863342702388763, "loss_lvr": 0.786817193031311, "loss_mode_switch": 0.0, "loss_total": 0.267315149307251, "step": 1358 }, { "batch_size": 4, "epoch": 0.5432, "step": 1358, "tokens_per_device": 5096 }, { "epoch": 0.5432, "loss_ce": 0.004713715985417366, "loss_lvr": 0.7854867577552795, "loss_mode_switch": 0.0, "loss_total": 0.08326239138841629, "step": 1358 }, { "epoch": 0.5436, "grad_norm": 1.3820383548736572, "learning_rate": 4.537520000860124e-06, "loss": 0.2955, "step": 1359 }, { "batch_size": 1, "epoch": 0.5436, "step": 1359, "tokens_per_device": 4410 }, { "epoch": 0.5436, "loss_ce": 0.0577443428337574, "loss_lvr": 0.7479633092880249, "loss_mode_switch": 0.0, "loss_total": 0.13254067301750183, "step": 1359 }, { "batch_size": 4, "epoch": 0.5436, "step": 1359, "tokens_per_device": 2668 }, { "epoch": 0.5436, "loss_ce": 0.16255077719688416, "loss_lvr": 0.8616169691085815, "loss_mode_switch": 0.0, "loss_total": 0.2487124800682068, "step": 1359 }, { "batch_size": 4, "epoch": 0.5436, "step": 1359, "tokens_per_device": 5776 }, { "epoch": 0.5436, "loss_ce": 0.07795354723930359, "loss_lvr": 0.8949270844459534, "loss_mode_switch": 0.0, "loss_total": 0.16744625568389893, "step": 1359 }, { "batch_size": 4, "epoch": 0.5436, "step": 1359, "tokens_per_device": 10120 }, { "epoch": 0.5436, "loss_ce": 0.3352772295475006, "loss_lvr": 0.6593518853187561, "loss_mode_switch": 0.0, "loss_total": 0.4012124240398407, "step": 1359 }, { "batch_size": 1, "epoch": 0.5436, "step": 1359, "tokens_per_device": 4934 }, { "epoch": 0.5436, "loss_ce": 0.006472561042755842, "loss_lvr": 0.32431724667549133, "loss_mode_switch": 0.0, "loss_total": 0.03890428692102432, "step": 1359 }, { "batch_size": 1, "epoch": 0.5436, "step": 1359, "tokens_per_device": 5088 }, { "epoch": 0.5436, "loss_ce": 0.011135539971292019, "loss_lvr": 0.3258107900619507, "loss_mode_switch": 0.0, "loss_total": 0.04371662065386772, "step": 1359 }, { "batch_size": 4, "epoch": 0.5436, "step": 1359, "tokens_per_device": 4256 }, { "epoch": 0.5436, "loss_ce": 0.21272149682044983, "loss_lvr": 1.1477693319320679, "loss_mode_switch": 0.0, "loss_total": 0.3274984359741211, "step": 1359 }, { "batch_size": 4, "epoch": 0.5436, "step": 1359, "tokens_per_device": 11096 }, { "epoch": 0.5436, "loss_ce": 0.15777850151062012, "loss_lvr": 0.7214366793632507, "loss_mode_switch": 0.0, "loss_total": 0.22992217540740967, "step": 1359 }, { "epoch": 0.544, "grad_norm": 1.2282267808914185, "learning_rate": 4.531070648859186e-06, "loss": 0.3043, "step": 1360 }, { "batch_size": 4, "epoch": 0.544, "step": 1360, "tokens_per_device": 1472 }, { "epoch": 0.544, "loss_ce": 0.5659956336021423, "loss_lvr": 1.1034289598464966, "loss_mode_switch": 0.0, "loss_total": 0.6763385534286499, "step": 1360 }, { "batch_size": 1, "epoch": 0.544, "step": 1360, "tokens_per_device": 5089 }, { "epoch": 0.544, "loss_ce": 0.003490571863949299, "loss_lvr": 0.3546918034553528, "loss_mode_switch": 0.0, "loss_total": 0.03895975276827812, "step": 1360 }, { "batch_size": 4, "epoch": 0.544, "step": 1360, "tokens_per_device": 3916 }, { "epoch": 0.544, "loss_ce": 0.772463858127594, "loss_lvr": 1.0123034715652466, "loss_mode_switch": 0.0, "loss_total": 0.8736941814422607, "step": 1360 }, { "batch_size": 4, "epoch": 0.544, "step": 1360, "tokens_per_device": 3948 }, { "epoch": 0.544, "loss_ce": 0.06749258190393448, "loss_lvr": 1.3259594440460205, "loss_mode_switch": 0.0, "loss_total": 0.2000885307788849, "step": 1360 }, { "batch_size": 4, "epoch": 0.544, "step": 1360, "tokens_per_device": 4760 }, { "epoch": 0.544, "loss_ce": 0.16030456125736237, "loss_lvr": 0.8196914792060852, "loss_mode_switch": 0.0, "loss_total": 0.2422737181186676, "step": 1360 }, { "batch_size": 4, "epoch": 0.544, "step": 1360, "tokens_per_device": 4280 }, { "epoch": 0.544, "loss_ce": 0.01266813836991787, "loss_lvr": 0.9016059637069702, "loss_mode_switch": 0.0, "loss_total": 0.1028287410736084, "step": 1360 }, { "batch_size": 1, "epoch": 0.544, "step": 1360, "tokens_per_device": 6125 }, { "epoch": 0.544, "loss_ce": 0.06035846099257469, "loss_lvr": 0.5226190090179443, "loss_mode_switch": 0.0, "loss_total": 0.11262036114931107, "step": 1360 }, { "batch_size": 4, "epoch": 0.544, "step": 1360, "tokens_per_device": 4232 }, { "epoch": 0.544, "loss_ce": 0.45744213461875916, "loss_lvr": 0.9755550622940063, "loss_mode_switch": 0.0, "loss_total": 0.5549976229667664, "step": 1360 }, { "epoch": 0.5444, "grad_norm": 1.3413879871368408, "learning_rate": 4.524622083874347e-06, "loss": 0.3021, "step": 1361 }, { "batch_size": 4, "epoch": 0.5444, "step": 1361, "tokens_per_device": 11992 }, { "epoch": 0.5444, "loss_ce": 0.02324916422367096, "loss_lvr": 0.7109082341194153, "loss_mode_switch": 0.0, "loss_total": 0.09433998912572861, "step": 1361 }, { "batch_size": 4, "epoch": 0.5444, "step": 1361, "tokens_per_device": 9896 }, { "epoch": 0.5444, "loss_ce": 0.1499820500612259, "loss_lvr": 0.6761994957923889, "loss_mode_switch": 0.0, "loss_total": 0.21760199964046478, "step": 1361 }, { "batch_size": 4, "epoch": 0.5444, "step": 1361, "tokens_per_device": 3896 }, { "epoch": 0.5444, "loss_ce": 0.21607057750225067, "loss_lvr": 0.9009712934494019, "loss_mode_switch": 0.0, "loss_total": 0.30616772174835205, "step": 1361 }, { "batch_size": 4, "epoch": 0.5444, "step": 1361, "tokens_per_device": 3832 }, { "epoch": 0.5444, "loss_ce": 0.5997117161750793, "loss_lvr": 0.8799281120300293, "loss_mode_switch": 0.0, "loss_total": 0.6877045035362244, "step": 1361 }, { "batch_size": 1, "epoch": 0.5444, "step": 1361, "tokens_per_device": 5351 }, { "epoch": 0.5444, "loss_ce": 0.000308448972646147, "loss_lvr": 0.22774742543697357, "loss_mode_switch": 0.0, "loss_total": 0.023083191365003586, "step": 1361 }, { "batch_size": 1, "epoch": 0.5444, "step": 1361, "tokens_per_device": 4793 }, { "epoch": 0.5444, "loss_ce": 0.000784281175583601, "loss_lvr": 0.4337772727012634, "loss_mode_switch": 0.0, "loss_total": 0.04416200891137123, "step": 1361 }, { "batch_size": 4, "epoch": 0.5444, "step": 1361, "tokens_per_device": 3824 }, { "epoch": 0.5444, "loss_ce": 0.34881851077079773, "loss_lvr": 0.9140722751617432, "loss_mode_switch": 0.0, "loss_total": 0.440225750207901, "step": 1361 }, { "batch_size": 4, "epoch": 0.5444, "step": 1361, "tokens_per_device": 4256 }, { "epoch": 0.5444, "loss_ce": 0.008190049789845943, "loss_lvr": 0.9058080315589905, "loss_mode_switch": 0.0, "loss_total": 0.0987708568572998, "step": 1361 }, { "epoch": 0.5448, "grad_norm": 1.4268540143966675, "learning_rate": 4.518174316728396e-06, "loss": 0.3233, "step": 1362 }, { "batch_size": 4, "epoch": 0.5448, "step": 1362, "tokens_per_device": 4568 }, { "epoch": 0.5448, "loss_ce": 0.1285899579524994, "loss_lvr": 0.8739628195762634, "loss_mode_switch": 0.0, "loss_total": 0.2159862518310547, "step": 1362 }, { "batch_size": 4, "epoch": 0.5448, "step": 1362, "tokens_per_device": 4880 }, { "epoch": 0.5448, "loss_ce": 0.05105263367295265, "loss_lvr": 0.8343024849891663, "loss_mode_switch": 0.0, "loss_total": 0.13448289036750793, "step": 1362 }, { "batch_size": 4, "epoch": 0.5448, "step": 1362, "tokens_per_device": 11920 }, { "epoch": 0.5448, "loss_ce": 0.21091628074645996, "loss_lvr": 0.6085466742515564, "loss_mode_switch": 0.0, "loss_total": 0.2717709541320801, "step": 1362 }, { "batch_size": 1, "epoch": 0.5448, "step": 1362, "tokens_per_device": 4757 }, { "epoch": 0.5448, "loss_ce": 0.21399615705013275, "loss_lvr": 0.558241069316864, "loss_mode_switch": 0.0, "loss_total": 0.26982027292251587, "step": 1362 }, { "batch_size": 4, "epoch": 0.5448, "step": 1362, "tokens_per_device": 4228 }, { "epoch": 0.5448, "loss_ce": 0.18418358266353607, "loss_lvr": 1.0201284885406494, "loss_mode_switch": 0.0, "loss_total": 0.28619644045829773, "step": 1362 }, { "batch_size": 4, "epoch": 0.5448, "step": 1362, "tokens_per_device": 1788 }, { "epoch": 0.5448, "loss_ce": 0.26338979601860046, "loss_lvr": 0.8691603541374207, "loss_mode_switch": 0.0, "loss_total": 0.35030582547187805, "step": 1362 }, { "batch_size": 1, "epoch": 0.5448, "step": 1362, "tokens_per_device": 4158 }, { "epoch": 0.5448, "loss_ce": 0.6866340637207031, "loss_lvr": 1.2239404916763306, "loss_mode_switch": 0.0, "loss_total": 0.8090280890464783, "step": 1362 }, { "batch_size": 1, "epoch": 0.5448, "step": 1362, "tokens_per_device": 5080 }, { "epoch": 0.5448, "loss_ce": 0.05041929706931114, "loss_lvr": 0.4228108525276184, "loss_mode_switch": 0.0, "loss_total": 0.09270038455724716, "step": 1362 }, { "epoch": 0.5452, "grad_norm": 1.399324655532837, "learning_rate": 4.511727358242786e-06, "loss": 0.2833, "step": 1363 }, { "batch_size": 4, "epoch": 0.5452, "step": 1363, "tokens_per_device": 3864 }, { "epoch": 0.5452, "loss_ce": 0.1249440461397171, "loss_lvr": 0.7601722478866577, "loss_mode_switch": 0.0, "loss_total": 0.20096126198768616, "step": 1363 }, { "batch_size": 4, "epoch": 0.5452, "step": 1363, "tokens_per_device": 3364 }, { "epoch": 0.5452, "loss_ce": 0.40101251006126404, "loss_lvr": 0.9950951933860779, "loss_mode_switch": 0.0, "loss_total": 0.5005220174789429, "step": 1363 }, { "batch_size": 4, "epoch": 0.5452, "step": 1363, "tokens_per_device": 12560 }, { "epoch": 0.5452, "loss_ce": 0.3473723828792572, "loss_lvr": 0.737176239490509, "loss_mode_switch": 0.0, "loss_total": 0.4210900068283081, "step": 1363 }, { "batch_size": 1, "epoch": 0.5452, "step": 1363, "tokens_per_device": 5194 }, { "epoch": 0.5452, "loss_ce": 0.003351435763761401, "loss_lvr": 0.31079667806625366, "loss_mode_switch": 0.0, "loss_total": 0.034431103616952896, "step": 1363 }, { "batch_size": 1, "epoch": 0.5452, "step": 1363, "tokens_per_device": 5091 }, { "epoch": 0.5452, "loss_ce": 0.001527262618765235, "loss_lvr": 0.5689231157302856, "loss_mode_switch": 0.0, "loss_total": 0.05841957405209541, "step": 1363 }, { "batch_size": 1, "epoch": 0.5452, "step": 1363, "tokens_per_device": 4894 }, { "epoch": 0.5452, "loss_ce": 0.0007805479108355939, "loss_lvr": 0.326354444026947, "loss_mode_switch": 0.0, "loss_total": 0.03341599553823471, "step": 1363 }, { "batch_size": 4, "epoch": 0.5452, "step": 1363, "tokens_per_device": 3856 }, { "epoch": 0.5452, "loss_ce": 0.20076081156730652, "loss_lvr": 1.0221668481826782, "loss_mode_switch": 0.0, "loss_total": 0.3029775023460388, "step": 1363 }, { "batch_size": 1, "epoch": 0.5452, "step": 1363, "tokens_per_device": 5115 }, { "epoch": 0.5452, "loss_ce": 0.0008915448561310768, "loss_lvr": 0.6969309449195862, "loss_mode_switch": 0.0, "loss_total": 0.07058463990688324, "step": 1363 }, { "epoch": 0.5456, "grad_norm": 1.3369156122207642, "learning_rate": 4.505281219237613e-06, "loss": 0.3006, "step": 1364 }, { "batch_size": 4, "epoch": 0.5456, "step": 1364, "tokens_per_device": 3844 }, { "epoch": 0.5456, "loss_ce": 0.19068390130996704, "loss_lvr": 0.8436597585678101, "loss_mode_switch": 0.0, "loss_total": 0.2750498652458191, "step": 1364 }, { "batch_size": 1, "epoch": 0.5456, "step": 1364, "tokens_per_device": 4893 }, { "epoch": 0.5456, "loss_ce": 0.057782188057899475, "loss_lvr": 0.5563129186630249, "loss_mode_switch": 0.0, "loss_total": 0.1134134829044342, "step": 1364 }, { "batch_size": 1, "epoch": 0.5456, "step": 1364, "tokens_per_device": 5073 }, { "epoch": 0.5456, "loss_ce": 0.01980644464492798, "loss_lvr": 0.9832413196563721, "loss_mode_switch": 0.0, "loss_total": 0.11813057959079742, "step": 1364 }, { "batch_size": 4, "epoch": 0.5456, "step": 1364, "tokens_per_device": 1296 }, { "epoch": 0.5456, "loss_ce": 0.5147891640663147, "loss_lvr": 0.8469140529632568, "loss_mode_switch": 0.0, "loss_total": 0.5994805693626404, "step": 1364 }, { "batch_size": 1, "epoch": 0.5456, "step": 1364, "tokens_per_device": 4403 }, { "epoch": 0.5456, "loss_ce": 0.013162628747522831, "loss_lvr": 0.37711942195892334, "loss_mode_switch": 0.0, "loss_total": 0.05087457224726677, "step": 1364 }, { "batch_size": 4, "epoch": 0.5456, "step": 1364, "tokens_per_device": 3772 }, { "epoch": 0.5456, "loss_ce": 0.3978421986103058, "loss_lvr": 1.3391609191894531, "loss_mode_switch": 0.0, "loss_total": 0.5317583084106445, "step": 1364 }, { "batch_size": 1, "epoch": 0.5456, "step": 1364, "tokens_per_device": 4875 }, { "epoch": 0.5456, "loss_ce": 0.0031204987317323685, "loss_lvr": 0.5539520978927612, "loss_mode_switch": 0.0, "loss_total": 0.05851571261882782, "step": 1364 }, { "batch_size": 4, "epoch": 0.5456, "step": 1364, "tokens_per_device": 4404 }, { "epoch": 0.5456, "loss_ce": 0.09086347371339798, "loss_lvr": 0.9229796528816223, "loss_mode_switch": 0.0, "loss_total": 0.1831614375114441, "step": 1364 }, { "epoch": 0.546, "grad_norm": 1.2644439935684204, "learning_rate": 4.498835910531595e-06, "loss": 0.2629, "step": 1365 }, { "batch_size": 4, "epoch": 0.546, "step": 1365, "tokens_per_device": 2728 }, { "epoch": 0.546, "loss_ce": 0.2860299348831177, "loss_lvr": 0.5848932266235352, "loss_mode_switch": 0.0, "loss_total": 0.3445192575454712, "step": 1365 }, { "batch_size": 4, "epoch": 0.546, "step": 1365, "tokens_per_device": 1732 }, { "epoch": 0.546, "loss_ce": 0.19842106103897095, "loss_lvr": 0.9828926920890808, "loss_mode_switch": 0.0, "loss_total": 0.296710342168808, "step": 1365 }, { "batch_size": 4, "epoch": 0.546, "step": 1365, "tokens_per_device": 5712 }, { "epoch": 0.546, "loss_ce": 0.5126156806945801, "loss_lvr": 0.9825184345245361, "loss_mode_switch": 0.0, "loss_total": 0.6108675003051758, "step": 1365 }, { "batch_size": 4, "epoch": 0.546, "step": 1365, "tokens_per_device": 2556 }, { "epoch": 0.546, "loss_ce": 0.46421870589256287, "loss_lvr": 0.7642623782157898, "loss_mode_switch": 0.0, "loss_total": 0.5406449437141418, "step": 1365 }, { "batch_size": 4, "epoch": 0.546, "step": 1365, "tokens_per_device": 4308 }, { "epoch": 0.546, "loss_ce": 0.5076809525489807, "loss_lvr": 1.0647200345993042, "loss_mode_switch": 0.0, "loss_total": 0.6141529679298401, "step": 1365 }, { "batch_size": 1, "epoch": 0.546, "step": 1365, "tokens_per_device": 4886 }, { "epoch": 0.546, "loss_ce": 0.027630550786852837, "loss_lvr": 0.35166698694229126, "loss_mode_switch": 0.0, "loss_total": 0.06279724836349487, "step": 1365 }, { "batch_size": 4, "epoch": 0.546, "step": 1365, "tokens_per_device": 5792 }, { "epoch": 0.546, "loss_ce": 0.30317363142967224, "loss_lvr": 0.9335851669311523, "loss_mode_switch": 0.0, "loss_total": 0.3965321481227875, "step": 1365 }, { "batch_size": 1, "epoch": 0.546, "step": 1365, "tokens_per_device": 5118 }, { "epoch": 0.546, "loss_ce": 0.011518283747136593, "loss_lvr": 0.5925233364105225, "loss_mode_switch": 0.0, "loss_total": 0.07077061384916306, "step": 1365 }, { "epoch": 0.5464, "grad_norm": 1.3070980310440063, "learning_rate": 4.4923914429420595e-06, "loss": 0.3594, "step": 1366 }, { "batch_size": 4, "epoch": 0.5464, "step": 1366, "tokens_per_device": 3720 }, { "epoch": 0.5464, "loss_ce": 0.10884261876344681, "loss_lvr": 0.8405965566635132, "loss_mode_switch": 0.0, "loss_total": 0.19290226697921753, "step": 1366 }, { "batch_size": 4, "epoch": 0.5464, "step": 1366, "tokens_per_device": 1400 }, { "epoch": 0.5464, "loss_ce": 0.6889544725418091, "loss_lvr": 0.9130356311798096, "loss_mode_switch": 0.0, "loss_total": 0.780258059501648, "step": 1366 }, { "batch_size": 1, "epoch": 0.5464, "step": 1366, "tokens_per_device": 4687 }, { "epoch": 0.5464, "loss_ce": 0.27539902925491333, "loss_lvr": 0.6427053213119507, "loss_mode_switch": 0.0, "loss_total": 0.3396695554256439, "step": 1366 }, { "batch_size": 1, "epoch": 0.5464, "step": 1366, "tokens_per_device": 4917 }, { "epoch": 0.5464, "loss_ce": 0.07677190005779266, "loss_lvr": 0.414433091878891, "loss_mode_switch": 0.0, "loss_total": 0.11821521073579788, "step": 1366 }, { "batch_size": 4, "epoch": 0.5464, "step": 1366, "tokens_per_device": 1524 }, { "epoch": 0.5464, "loss_ce": 0.5378102660179138, "loss_lvr": 0.8797997832298279, "loss_mode_switch": 0.0, "loss_total": 0.6257902383804321, "step": 1366 }, { "batch_size": 4, "epoch": 0.5464, "step": 1366, "tokens_per_device": 4216 }, { "epoch": 0.5464, "loss_ce": 0.24611210823059082, "loss_lvr": 1.0368573665618896, "loss_mode_switch": 0.0, "loss_total": 0.3497978448867798, "step": 1366 }, { "batch_size": 4, "epoch": 0.5464, "step": 1366, "tokens_per_device": 5980 }, { "epoch": 0.5464, "loss_ce": 0.07684172689914703, "loss_lvr": 0.6689625382423401, "loss_mode_switch": 0.0, "loss_total": 0.14373797178268433, "step": 1366 }, { "batch_size": 4, "epoch": 0.5464, "step": 1366, "tokens_per_device": 2612 }, { "epoch": 0.5464, "loss_ce": 0.6946349740028381, "loss_lvr": 0.8950650691986084, "loss_mode_switch": 0.0, "loss_total": 0.784141480922699, "step": 1366 }, { "epoch": 0.5468, "grad_norm": 1.517102837562561, "learning_rate": 4.485947827284921e-06, "loss": 0.3219, "step": 1367 }, { "batch_size": 4, "epoch": 0.5468, "step": 1367, "tokens_per_device": 7088 }, { "epoch": 0.5468, "loss_ce": 0.3305754065513611, "loss_lvr": 0.7372711896896362, "loss_mode_switch": 0.0, "loss_total": 0.40430253744125366, "step": 1367 }, { "batch_size": 1, "epoch": 0.5468, "step": 1367, "tokens_per_device": 5116 }, { "epoch": 0.5468, "loss_ce": 0.06671889126300812, "loss_lvr": 0.35479867458343506, "loss_mode_switch": 0.0, "loss_total": 0.1021987646818161, "step": 1367 }, { "batch_size": 4, "epoch": 0.5468, "step": 1367, "tokens_per_device": 4364 }, { "epoch": 0.5468, "loss_ce": 0.17546617984771729, "loss_lvr": 0.6777880191802979, "loss_mode_switch": 0.0, "loss_total": 0.2432449758052826, "step": 1367 }, { "batch_size": 1, "epoch": 0.5468, "step": 1367, "tokens_per_device": 5102 }, { "epoch": 0.5468, "loss_ce": 0.16888399422168732, "loss_lvr": 0.32839101552963257, "loss_mode_switch": 0.0, "loss_total": 0.2017230987548828, "step": 1367 }, { "batch_size": 1, "epoch": 0.5468, "step": 1367, "tokens_per_device": 4839 }, { "epoch": 0.5468, "loss_ce": 0.0031732397619634867, "loss_lvr": 0.32847118377685547, "loss_mode_switch": 0.0, "loss_total": 0.03602035716176033, "step": 1367 }, { "batch_size": 4, "epoch": 0.5468, "step": 1367, "tokens_per_device": 4428 }, { "epoch": 0.5468, "loss_ce": 0.133236363530159, "loss_lvr": 0.8451395034790039, "loss_mode_switch": 0.0, "loss_total": 0.21775031089782715, "step": 1367 }, { "batch_size": 1, "epoch": 0.5468, "step": 1367, "tokens_per_device": 5116 }, { "epoch": 0.5468, "loss_ce": 0.00040864228503778577, "loss_lvr": 0.5771282315254211, "loss_mode_switch": 0.0, "loss_total": 0.058121465146541595, "step": 1367 }, { "batch_size": 4, "epoch": 0.5468, "step": 1367, "tokens_per_device": 7440 }, { "epoch": 0.5468, "loss_ce": 0.22062841057777405, "loss_lvr": 1.0157932043075562, "loss_mode_switch": 0.0, "loss_total": 0.3222077488899231, "step": 1367 }, { "epoch": 0.5472, "grad_norm": 1.4221223592758179, "learning_rate": 4.479505074374662e-06, "loss": 0.2574, "step": 1368 }, { "batch_size": 4, "epoch": 0.5472, "step": 1368, "tokens_per_device": 3876 }, { "epoch": 0.5472, "loss_ce": 0.052172113209962845, "loss_lvr": 1.2468868494033813, "loss_mode_switch": 0.0, "loss_total": 0.17686079442501068, "step": 1368 }, { "batch_size": 4, "epoch": 0.5472, "step": 1368, "tokens_per_device": 4848 }, { "epoch": 0.5472, "loss_ce": 0.6423467993736267, "loss_lvr": 0.8725342154502869, "loss_mode_switch": 0.0, "loss_total": 0.729600191116333, "step": 1368 }, { "batch_size": 4, "epoch": 0.5472, "step": 1368, "tokens_per_device": 3816 }, { "epoch": 0.5472, "loss_ce": 0.14775586128234863, "loss_lvr": 0.7834556698799133, "loss_mode_switch": 0.0, "loss_total": 0.22610142827033997, "step": 1368 }, { "batch_size": 4, "epoch": 0.5472, "step": 1368, "tokens_per_device": 2672 }, { "epoch": 0.5472, "loss_ce": 0.12039490789175034, "loss_lvr": 0.8494972586631775, "loss_mode_switch": 0.0, "loss_total": 0.20534463226795197, "step": 1368 }, { "batch_size": 4, "epoch": 0.5472, "step": 1368, "tokens_per_device": 1240 }, { "epoch": 0.5472, "loss_ce": 0.20132876932621002, "loss_lvr": 1.0438647270202637, "loss_mode_switch": 0.0, "loss_total": 0.3057152330875397, "step": 1368 }, { "batch_size": 1, "epoch": 0.5472, "step": 1368, "tokens_per_device": 5102 }, { "epoch": 0.5472, "loss_ce": 0.041948333382606506, "loss_lvr": 0.5195782780647278, "loss_mode_switch": 0.0, "loss_total": 0.09390616416931152, "step": 1368 }, { "batch_size": 4, "epoch": 0.5472, "step": 1368, "tokens_per_device": 4740 }, { "epoch": 0.5472, "loss_ce": 0.07483787834644318, "loss_lvr": 0.975334644317627, "loss_mode_switch": 0.0, "loss_total": 0.17237134277820587, "step": 1368 }, { "batch_size": 4, "epoch": 0.5472, "step": 1368, "tokens_per_device": 1564 }, { "epoch": 0.5472, "loss_ce": 0.19065600633621216, "loss_lvr": 1.0007822513580322, "loss_mode_switch": 0.0, "loss_total": 0.2907342314720154, "step": 1368 }, { "epoch": 0.5476, "grad_norm": 1.266611099243164, "learning_rate": 4.47306319502432e-06, "loss": 0.2622, "step": 1369 }, { "batch_size": 1, "epoch": 0.5476, "step": 1369, "tokens_per_device": 5098 }, { "epoch": 0.5476, "loss_ce": 0.040907274931669235, "loss_lvr": 0.37992244958877563, "loss_mode_switch": 0.0, "loss_total": 0.07889951765537262, "step": 1369 }, { "batch_size": 4, "epoch": 0.5476, "step": 1369, "tokens_per_device": 1776 }, { "epoch": 0.5476, "loss_ce": 0.11278343200683594, "loss_lvr": 1.9613797664642334, "loss_mode_switch": 0.0, "loss_total": 0.3089213967323303, "step": 1369 }, { "batch_size": 4, "epoch": 0.5476, "step": 1369, "tokens_per_device": 13588 }, { "epoch": 0.5476, "loss_ce": 0.3307492136955261, "loss_lvr": 0.8455601930618286, "loss_mode_switch": 0.0, "loss_total": 0.4153052270412445, "step": 1369 }, { "batch_size": 1, "epoch": 0.5476, "step": 1369, "tokens_per_device": 4443 }, { "epoch": 0.5476, "loss_ce": 0.007158965803682804, "loss_lvr": 0.6034917235374451, "loss_mode_switch": 0.0, "loss_total": 0.06750813871622086, "step": 1369 }, { "batch_size": 1, "epoch": 0.5476, "step": 1369, "tokens_per_device": 4875 }, { "epoch": 0.5476, "loss_ce": 0.013883192092180252, "loss_lvr": 0.19287891685962677, "loss_mode_switch": 0.0, "loss_total": 0.03317108377814293, "step": 1369 }, { "batch_size": 4, "epoch": 0.5476, "step": 1369, "tokens_per_device": 3760 }, { "epoch": 0.5476, "loss_ce": 0.05814250558614731, "loss_lvr": 0.7956967949867249, "loss_mode_switch": 0.0, "loss_total": 0.13771218061447144, "step": 1369 }, { "batch_size": 4, "epoch": 0.5476, "step": 1369, "tokens_per_device": 5052 }, { "epoch": 0.5476, "loss_ce": 0.36725112795829773, "loss_lvr": 0.651324450969696, "loss_mode_switch": 0.0, "loss_total": 0.43238356709480286, "step": 1369 }, { "batch_size": 1, "epoch": 0.5476, "step": 1369, "tokens_per_device": 4963 }, { "epoch": 0.5476, "loss_ce": 0.03806028142571449, "loss_lvr": 0.21467171609401703, "loss_mode_switch": 0.0, "loss_total": 0.059527453035116196, "step": 1369 }, { "epoch": 0.548, "grad_norm": 1.3529413938522339, "learning_rate": 4.4666222000454685e-06, "loss": 0.2998, "step": 1370 }, { "batch_size": 4, "epoch": 0.548, "step": 1370, "tokens_per_device": 4240 }, { "epoch": 0.548, "loss_ce": 0.2944648563861847, "loss_lvr": 1.128121256828308, "loss_mode_switch": 0.0, "loss_total": 0.40727698802948, "step": 1370 }, { "batch_size": 4, "epoch": 0.548, "step": 1370, "tokens_per_device": 13460 }, { "epoch": 0.548, "loss_ce": 0.22644934058189392, "loss_lvr": 1.0277694463729858, "loss_mode_switch": 0.0, "loss_total": 0.3292262852191925, "step": 1370 }, { "batch_size": 4, "epoch": 0.548, "step": 1370, "tokens_per_device": 4348 }, { "epoch": 0.548, "loss_ce": 0.5236430764198303, "loss_lvr": 0.8222096562385559, "loss_mode_switch": 0.0, "loss_total": 0.6058640480041504, "step": 1370 }, { "batch_size": 1, "epoch": 0.548, "step": 1370, "tokens_per_device": 4973 }, { "epoch": 0.548, "loss_ce": 0.08282509446144104, "loss_lvr": 0.4503891170024872, "loss_mode_switch": 0.0, "loss_total": 0.12786400318145752, "step": 1370 }, { "batch_size": 1, "epoch": 0.548, "step": 1370, "tokens_per_device": 5488 }, { "epoch": 0.548, "loss_ce": 0.24659812450408936, "loss_lvr": 0.5054407119750977, "loss_mode_switch": 0.0, "loss_total": 0.2971422076225281, "step": 1370 }, { "batch_size": 1, "epoch": 0.548, "step": 1370, "tokens_per_device": 5007 }, { "epoch": 0.548, "loss_ce": 0.015326223336160183, "loss_lvr": 0.3793894648551941, "loss_mode_switch": 0.0, "loss_total": 0.053265172988176346, "step": 1370 }, { "batch_size": 1, "epoch": 0.548, "step": 1370, "tokens_per_device": 4876 }, { "epoch": 0.548, "loss_ce": 0.003502441104501486, "loss_lvr": 0.5025049448013306, "loss_mode_switch": 0.0, "loss_total": 0.05375293642282486, "step": 1370 }, { "batch_size": 4, "epoch": 0.548, "step": 1370, "tokens_per_device": 5016 }, { "epoch": 0.548, "loss_ce": 0.06753338128328323, "loss_lvr": 0.8827248215675354, "loss_mode_switch": 0.0, "loss_total": 0.15580585598945618, "step": 1370 }, { "epoch": 0.5484, "grad_norm": 1.344396710395813, "learning_rate": 4.46018210024819e-06, "loss": 0.3017, "step": 1371 }, { "batch_size": 1, "epoch": 0.5484, "step": 1371, "tokens_per_device": 5125 }, { "epoch": 0.5484, "loss_ce": 0.008348433300852776, "loss_lvr": 0.3120993971824646, "loss_mode_switch": 0.0, "loss_total": 0.039558373391628265, "step": 1371 }, { "batch_size": 4, "epoch": 0.5484, "step": 1371, "tokens_per_device": 4840 }, { "epoch": 0.5484, "loss_ce": 0.04982003569602966, "loss_lvr": 0.6213428974151611, "loss_mode_switch": 0.0, "loss_total": 0.11195433139801025, "step": 1371 }, { "batch_size": 1, "epoch": 0.5484, "step": 1371, "tokens_per_device": 4881 }, { "epoch": 0.5484, "loss_ce": 0.012912285514175892, "loss_lvr": 0.2679468095302582, "loss_mode_switch": 0.0, "loss_total": 0.039706967771053314, "step": 1371 }, { "batch_size": 1, "epoch": 0.5484, "step": 1371, "tokens_per_device": 5100 }, { "epoch": 0.5484, "loss_ce": 0.20792299509048462, "loss_lvr": 0.4366176426410675, "loss_mode_switch": 0.0, "loss_total": 0.2515847682952881, "step": 1371 }, { "batch_size": 1, "epoch": 0.5484, "step": 1371, "tokens_per_device": 5563 }, { "epoch": 0.5484, "loss_ce": 0.04190970957279205, "loss_lvr": 0.29360100626945496, "loss_mode_switch": 0.0, "loss_total": 0.07126981019973755, "step": 1371 }, { "batch_size": 4, "epoch": 0.5484, "step": 1371, "tokens_per_device": 3764 }, { "epoch": 0.5484, "loss_ce": 0.3208601474761963, "loss_lvr": 0.9315970540046692, "loss_mode_switch": 0.0, "loss_total": 0.4140198528766632, "step": 1371 }, { "batch_size": 4, "epoch": 0.5484, "step": 1371, "tokens_per_device": 10208 }, { "epoch": 0.5484, "loss_ce": 0.1376747339963913, "loss_lvr": 0.9971173405647278, "loss_mode_switch": 0.0, "loss_total": 0.23738646507263184, "step": 1371 }, { "batch_size": 4, "epoch": 0.5484, "step": 1371, "tokens_per_device": 6708 }, { "epoch": 0.5484, "loss_ce": 0.011171557940542698, "loss_lvr": 0.7333763241767883, "loss_mode_switch": 0.0, "loss_total": 0.08450919389724731, "step": 1371 }, { "epoch": 0.5488, "grad_norm": 1.3591570854187012, "learning_rate": 4.4537429064410685e-06, "loss": 0.3006, "step": 1372 }, { "batch_size": 4, "epoch": 0.5488, "step": 1372, "tokens_per_device": 6252 }, { "epoch": 0.5488, "loss_ce": 0.09727290272712708, "loss_lvr": 0.6060740351676941, "loss_mode_switch": 0.0, "loss_total": 0.15788030624389648, "step": 1372 }, { "batch_size": 4, "epoch": 0.5488, "step": 1372, "tokens_per_device": 2916 }, { "epoch": 0.5488, "loss_ce": 0.18049733340740204, "loss_lvr": 0.5418796539306641, "loss_mode_switch": 0.0, "loss_total": 0.23468530178070068, "step": 1372 }, { "batch_size": 4, "epoch": 0.5488, "step": 1372, "tokens_per_device": 2804 }, { "epoch": 0.5488, "loss_ce": 0.20680741965770721, "loss_lvr": 1.2536993026733398, "loss_mode_switch": 0.0, "loss_total": 0.3321773409843445, "step": 1372 }, { "batch_size": 4, "epoch": 0.5488, "step": 1372, "tokens_per_device": 1396 }, { "epoch": 0.5488, "loss_ce": 0.6015300750732422, "loss_lvr": 1.0461231470108032, "loss_mode_switch": 0.0, "loss_total": 0.7061423659324646, "step": 1372 }, { "batch_size": 4, "epoch": 0.5488, "step": 1372, "tokens_per_device": 4476 }, { "epoch": 0.5488, "loss_ce": 0.276190847158432, "loss_lvr": 0.7049404382705688, "loss_mode_switch": 0.0, "loss_total": 0.34668490290641785, "step": 1372 }, { "batch_size": 4, "epoch": 0.5488, "step": 1372, "tokens_per_device": 5924 }, { "epoch": 0.5488, "loss_ce": 0.11439473927021027, "loss_lvr": 0.7658728361129761, "loss_mode_switch": 0.0, "loss_total": 0.19098201394081116, "step": 1372 }, { "batch_size": 4, "epoch": 0.5488, "step": 1372, "tokens_per_device": 4228 }, { "epoch": 0.5488, "loss_ce": 0.2130284160375595, "loss_lvr": 0.6644335389137268, "loss_mode_switch": 0.0, "loss_total": 0.279471755027771, "step": 1372 }, { "batch_size": 1, "epoch": 0.5488, "step": 1372, "tokens_per_device": 5171 }, { "epoch": 0.5488, "loss_ce": 0.0007103244424797595, "loss_lvr": 0.4772886633872986, "loss_mode_switch": 0.0, "loss_total": 0.04843918979167938, "step": 1372 }, { "epoch": 0.5492, "grad_norm": 1.3103641271591187, "learning_rate": 4.44730462943117e-06, "loss": 0.31, "step": 1373 }, { "batch_size": 4, "epoch": 0.5492, "step": 1373, "tokens_per_device": 1388 }, { "epoch": 0.5492, "loss_ce": 0.26177218556404114, "loss_lvr": 1.0848253965377808, "loss_mode_switch": 0.0, "loss_total": 0.3702547252178192, "step": 1373 }, { "batch_size": 1, "epoch": 0.5492, "step": 1373, "tokens_per_device": 4864 }, { "epoch": 0.5492, "loss_ce": 0.0005288632237352431, "loss_lvr": 0.3693775534629822, "loss_mode_switch": 0.0, "loss_total": 0.0374666191637516, "step": 1373 }, { "batch_size": 4, "epoch": 0.5492, "step": 1373, "tokens_per_device": 8848 }, { "epoch": 0.5492, "loss_ce": 0.21451345086097717, "loss_lvr": 0.755096435546875, "loss_mode_switch": 0.0, "loss_total": 0.2900230884552002, "step": 1373 }, { "batch_size": 4, "epoch": 0.5492, "step": 1373, "tokens_per_device": 2804 }, { "epoch": 0.5492, "loss_ce": 0.17362520098686218, "loss_lvr": 0.8955367803573608, "loss_mode_switch": 0.0, "loss_total": 0.26317888498306274, "step": 1373 }, { "batch_size": 1, "epoch": 0.5492, "step": 1373, "tokens_per_device": 4903 }, { "epoch": 0.5492, "loss_ce": 0.006151162553578615, "loss_lvr": 0.2593560814857483, "loss_mode_switch": 0.0, "loss_total": 0.03208677098155022, "step": 1373 }, { "batch_size": 4, "epoch": 0.5492, "step": 1373, "tokens_per_device": 3792 }, { "epoch": 0.5492, "loss_ce": 0.10940730571746826, "loss_lvr": 0.9189581274986267, "loss_mode_switch": 0.0, "loss_total": 0.2013031244277954, "step": 1373 }, { "batch_size": 1, "epoch": 0.5492, "step": 1373, "tokens_per_device": 4859 }, { "epoch": 0.5492, "loss_ce": 0.007357796188443899, "loss_lvr": 2.0120785236358643, "loss_mode_switch": 0.0, "loss_total": 0.20856565237045288, "step": 1373 }, { "batch_size": 4, "epoch": 0.5492, "step": 1373, "tokens_per_device": 7712 }, { "epoch": 0.5492, "loss_ce": 0.15381519496440887, "loss_lvr": 1.0167522430419922, "loss_mode_switch": 0.0, "loss_total": 0.25549042224884033, "step": 1373 }, { "epoch": 0.5496, "grad_norm": 1.4668539762496948, "learning_rate": 4.4408672800240185e-06, "loss": 0.333, "step": 1374 }, { "batch_size": 4, "epoch": 0.5496, "step": 1374, "tokens_per_device": 4220 }, { "epoch": 0.5496, "loss_ce": 0.5829258561134338, "loss_lvr": 1.0100592374801636, "loss_mode_switch": 0.0, "loss_total": 0.6839317679405212, "step": 1374 }, { "batch_size": 1, "epoch": 0.5496, "step": 1374, "tokens_per_device": 5169 }, { "epoch": 0.5496, "loss_ce": 0.05629406124353409, "loss_lvr": 0.30868056416511536, "loss_mode_switch": 0.0, "loss_total": 0.08716212213039398, "step": 1374 }, { "batch_size": 4, "epoch": 0.5496, "step": 1374, "tokens_per_device": 4368 }, { "epoch": 0.5496, "loss_ce": 0.4276930093765259, "loss_lvr": 1.0425137281417847, "loss_mode_switch": 0.0, "loss_total": 0.5319443941116333, "step": 1374 }, { "batch_size": 4, "epoch": 0.5496, "step": 1374, "tokens_per_device": 3772 }, { "epoch": 0.5496, "loss_ce": 0.266082763671875, "loss_lvr": 0.4781852662563324, "loss_mode_switch": 0.0, "loss_total": 0.31390130519866943, "step": 1374 }, { "batch_size": 4, "epoch": 0.5496, "step": 1374, "tokens_per_device": 6348 }, { "epoch": 0.5496, "loss_ce": 0.11450228095054626, "loss_lvr": 0.6735251545906067, "loss_mode_switch": 0.0, "loss_total": 0.18185479938983917, "step": 1374 }, { "batch_size": 1, "epoch": 0.5496, "step": 1374, "tokens_per_device": 5044 }, { "epoch": 0.5496, "loss_ce": 0.30030396580696106, "loss_lvr": 0.4097006618976593, "loss_mode_switch": 0.0, "loss_total": 0.3412740230560303, "step": 1374 }, { "batch_size": 4, "epoch": 0.5496, "step": 1374, "tokens_per_device": 7216 }, { "epoch": 0.5496, "loss_ce": 0.06986058503389359, "loss_lvr": 0.7674562335014343, "loss_mode_switch": 0.0, "loss_total": 0.1466062068939209, "step": 1374 }, { "batch_size": 4, "epoch": 0.5496, "step": 1374, "tokens_per_device": 4304 }, { "epoch": 0.5496, "loss_ce": 0.12449554353952408, "loss_lvr": 0.7510880827903748, "loss_mode_switch": 0.0, "loss_total": 0.1996043622493744, "step": 1374 }, { "epoch": 0.55, "grad_norm": 1.2209304571151733, "learning_rate": 4.434430869023579e-06, "loss": 0.2889, "step": 1375 }, { "batch_size": 4, "epoch": 0.55, "step": 1375, "tokens_per_device": 6940 }, { "epoch": 0.55, "loss_ce": 0.43825381994247437, "loss_lvr": 0.5457366704940796, "loss_mode_switch": 0.0, "loss_total": 0.49282747507095337, "step": 1375 }, { "batch_size": 4, "epoch": 0.55, "step": 1375, "tokens_per_device": 4268 }, { "epoch": 0.55, "loss_ce": 0.3197149932384491, "loss_lvr": 0.8651925921440125, "loss_mode_switch": 0.0, "loss_total": 0.4062342643737793, "step": 1375 }, { "batch_size": 1, "epoch": 0.55, "step": 1375, "tokens_per_device": 5020 }, { "epoch": 0.55, "loss_ce": 0.47243309020996094, "loss_lvr": 0.7149417400360107, "loss_mode_switch": 0.0, "loss_total": 0.5439272522926331, "step": 1375 }, { "batch_size": 1, "epoch": 0.55, "step": 1375, "tokens_per_device": 5084 }, { "epoch": 0.55, "loss_ce": 0.04326620325446129, "loss_lvr": 0.506392776966095, "loss_mode_switch": 0.0, "loss_total": 0.0939054787158966, "step": 1375 }, { "batch_size": 4, "epoch": 0.55, "step": 1375, "tokens_per_device": 2796 }, { "epoch": 0.55, "loss_ce": 0.2631258964538574, "loss_lvr": 0.538887083530426, "loss_mode_switch": 0.0, "loss_total": 0.3170146048069, "step": 1375 }, { "batch_size": 1, "epoch": 0.55, "step": 1375, "tokens_per_device": 4898 }, { "epoch": 0.55, "loss_ce": 0.22623571753501892, "loss_lvr": 0.3078238070011139, "loss_mode_switch": 0.0, "loss_total": 0.2570180892944336, "step": 1375 }, { "batch_size": 4, "epoch": 0.55, "step": 1375, "tokens_per_device": 3700 }, { "epoch": 0.55, "loss_ce": 0.0029793938156217337, "loss_lvr": 0.8248268961906433, "loss_mode_switch": 0.0, "loss_total": 0.08546207845211029, "step": 1375 }, { "batch_size": 4, "epoch": 0.55, "step": 1375, "tokens_per_device": 1556 }, { "epoch": 0.55, "loss_ce": 0.2862101197242737, "loss_lvr": 0.8946298360824585, "loss_mode_switch": 0.0, "loss_total": 0.3756731152534485, "step": 1375 }, { "epoch": 0.5504, "grad_norm": 1.2434550523757935, "learning_rate": 4.4279954072322486e-06, "loss": 0.2662, "step": 1376 }, { "batch_size": 1, "epoch": 0.5504, "step": 1376, "tokens_per_device": 6369 }, { "epoch": 0.5504, "loss_ce": 0.027599364519119263, "loss_lvr": 0.2834455072879791, "loss_mode_switch": 0.0, "loss_total": 0.055943913757801056, "step": 1376 }, { "batch_size": 4, "epoch": 0.5504, "step": 1376, "tokens_per_device": 1904 }, { "epoch": 0.5504, "loss_ce": 0.3551172912120819, "loss_lvr": 0.9095718264579773, "loss_mode_switch": 0.0, "loss_total": 0.4460744857788086, "step": 1376 }, { "batch_size": 4, "epoch": 0.5504, "step": 1376, "tokens_per_device": 3800 }, { "epoch": 0.5504, "loss_ce": 0.30558422207832336, "loss_lvr": 0.548745334148407, "loss_mode_switch": 0.0, "loss_total": 0.36045876145362854, "step": 1376 }, { "batch_size": 1, "epoch": 0.5504, "step": 1376, "tokens_per_device": 5399 }, { "epoch": 0.5504, "loss_ce": 0.04222828522324562, "loss_lvr": 0.27847686409950256, "loss_mode_switch": 0.0, "loss_total": 0.07007597386837006, "step": 1376 }, { "batch_size": 4, "epoch": 0.5504, "step": 1376, "tokens_per_device": 3820 }, { "epoch": 0.5504, "loss_ce": 0.00583419855684042, "loss_lvr": 0.8512797951698303, "loss_mode_switch": 0.0, "loss_total": 0.09096217900514603, "step": 1376 }, { "batch_size": 4, "epoch": 0.5504, "step": 1376, "tokens_per_device": 13628 }, { "epoch": 0.5504, "loss_ce": 0.5259096026420593, "loss_lvr": 0.861361563205719, "loss_mode_switch": 0.0, "loss_total": 0.6120457649230957, "step": 1376 }, { "batch_size": 1, "epoch": 0.5504, "step": 1376, "tokens_per_device": 4925 }, { "epoch": 0.5504, "loss_ce": 0.5611796379089355, "loss_lvr": 0.42749670147895813, "loss_mode_switch": 0.0, "loss_total": 0.6039292812347412, "step": 1376 }, { "batch_size": 1, "epoch": 0.5504, "step": 1376, "tokens_per_device": 4873 }, { "epoch": 0.5504, "loss_ce": 0.08694577217102051, "loss_lvr": 0.3566139340400696, "loss_mode_switch": 0.0, "loss_total": 0.12260717153549194, "step": 1376 }, { "epoch": 0.5508, "grad_norm": 1.2498269081115723, "learning_rate": 4.4215609054508215e-06, "loss": 0.2545, "step": 1377 }, { "batch_size": 4, "epoch": 0.5508, "step": 1377, "tokens_per_device": 2852 }, { "epoch": 0.5508, "loss_ce": 0.2615608274936676, "loss_lvr": 0.7028642892837524, "loss_mode_switch": 0.0, "loss_total": 0.33184725046157837, "step": 1377 }, { "batch_size": 4, "epoch": 0.5508, "step": 1377, "tokens_per_device": 1608 }, { "epoch": 0.5508, "loss_ce": 0.4474323093891144, "loss_lvr": 1.246987223625183, "loss_mode_switch": 0.0, "loss_total": 0.5721310377120972, "step": 1377 }, { "batch_size": 1, "epoch": 0.5508, "step": 1377, "tokens_per_device": 5363 }, { "epoch": 0.5508, "loss_ce": 0.1730716973543167, "loss_lvr": 0.5093348622322083, "loss_mode_switch": 0.0, "loss_total": 0.22400519251823425, "step": 1377 }, { "batch_size": 4, "epoch": 0.5508, "step": 1377, "tokens_per_device": 6092 }, { "epoch": 0.5508, "loss_ce": 0.4208027124404907, "loss_lvr": 1.017025351524353, "loss_mode_switch": 0.0, "loss_total": 0.5225052237510681, "step": 1377 }, { "batch_size": 4, "epoch": 0.5508, "step": 1377, "tokens_per_device": 4700 }, { "epoch": 0.5508, "loss_ce": 0.21695342659950256, "loss_lvr": 0.738254189491272, "loss_mode_switch": 0.0, "loss_total": 0.29077884554862976, "step": 1377 }, { "batch_size": 1, "epoch": 0.5508, "step": 1377, "tokens_per_device": 6380 }, { "epoch": 0.5508, "loss_ce": 0.00035368913086131215, "loss_lvr": 0.2709280848503113, "loss_mode_switch": 0.0, "loss_total": 0.027446497231721878, "step": 1377 }, { "batch_size": 4, "epoch": 0.5508, "step": 1377, "tokens_per_device": 3896 }, { "epoch": 0.5508, "loss_ce": 0.0012784956488758326, "loss_lvr": 1.0209242105484009, "loss_mode_switch": 0.0, "loss_total": 0.10337091982364655, "step": 1377 }, { "batch_size": 4, "epoch": 0.5508, "step": 1377, "tokens_per_device": 4456 }, { "epoch": 0.5508, "loss_ce": 0.11975854635238647, "loss_lvr": 0.6673116087913513, "loss_mode_switch": 0.0, "loss_total": 0.18648970127105713, "step": 1377 }, { "epoch": 0.5512, "grad_norm": 1.1733883619308472, "learning_rate": 4.415127374478491e-06, "loss": 0.2425, "step": 1378 }, { "batch_size": 4, "epoch": 0.5512, "step": 1378, "tokens_per_device": 4372 }, { "epoch": 0.5512, "loss_ce": 0.02569393254816532, "loss_lvr": 0.44905993342399597, "loss_mode_switch": 0.0, "loss_total": 0.07059992849826813, "step": 1378 }, { "batch_size": 4, "epoch": 0.5512, "step": 1378, "tokens_per_device": 1288 }, { "epoch": 0.5512, "loss_ce": 0.525326132774353, "loss_lvr": 0.9830835461616516, "loss_mode_switch": 0.0, "loss_total": 0.6236344575881958, "step": 1378 }, { "batch_size": 4, "epoch": 0.5512, "step": 1378, "tokens_per_device": 6216 }, { "epoch": 0.5512, "loss_ce": 0.10804902017116547, "loss_lvr": 0.7638595700263977, "loss_mode_switch": 0.0, "loss_total": 0.18443498015403748, "step": 1378 }, { "batch_size": 4, "epoch": 0.5512, "step": 1378, "tokens_per_device": 4180 }, { "epoch": 0.5512, "loss_ce": 0.14707151055335999, "loss_lvr": 0.6760563850402832, "loss_mode_switch": 0.0, "loss_total": 0.21467715501785278, "step": 1378 }, { "batch_size": 1, "epoch": 0.5512, "step": 1378, "tokens_per_device": 4896 }, { "epoch": 0.5512, "loss_ce": 0.13498647511005402, "loss_lvr": 0.2702100872993469, "loss_mode_switch": 0.0, "loss_total": 0.16200748085975647, "step": 1378 }, { "batch_size": 4, "epoch": 0.5512, "step": 1378, "tokens_per_device": 4944 }, { "epoch": 0.5512, "loss_ce": 0.286421537399292, "loss_lvr": 0.7321882843971252, "loss_mode_switch": 0.0, "loss_total": 0.35964035987854004, "step": 1378 }, { "batch_size": 4, "epoch": 0.5512, "step": 1378, "tokens_per_device": 3316 }, { "epoch": 0.5512, "loss_ce": 0.21002115309238434, "loss_lvr": 1.2766402959823608, "loss_mode_switch": 0.0, "loss_total": 0.33768516778945923, "step": 1378 }, { "batch_size": 1, "epoch": 0.5512, "step": 1378, "tokens_per_device": 4941 }, { "epoch": 0.5512, "loss_ce": 0.03173074126243591, "loss_lvr": 0.2784263491630554, "loss_mode_switch": 0.0, "loss_total": 0.059573374688625336, "step": 1378 }, { "epoch": 0.5516, "grad_norm": 1.2451133728027344, "learning_rate": 4.4086948251128155e-06, "loss": 0.2727, "step": 1379 }, { "batch_size": 4, "epoch": 0.5516, "step": 1379, "tokens_per_device": 2612 }, { "epoch": 0.5516, "loss_ce": 0.5487328171730042, "loss_lvr": 0.7802917957305908, "loss_mode_switch": 0.0, "loss_total": 0.6267619729042053, "step": 1379 }, { "batch_size": 4, "epoch": 0.5516, "step": 1379, "tokens_per_device": 4380 }, { "epoch": 0.5516, "loss_ce": 0.24698278307914734, "loss_lvr": 0.5680766701698303, "loss_mode_switch": 0.0, "loss_total": 0.30379045009613037, "step": 1379 }, { "batch_size": 4, "epoch": 0.5516, "step": 1379, "tokens_per_device": 4364 }, { "epoch": 0.5516, "loss_ce": 0.24072222411632538, "loss_lvr": 0.4373416602611542, "loss_mode_switch": 0.0, "loss_total": 0.28445640206336975, "step": 1379 }, { "batch_size": 1, "epoch": 0.5516, "step": 1379, "tokens_per_device": 7285 }, { "epoch": 0.5516, "loss_ce": 0.2657308578491211, "loss_lvr": 0.33477964997291565, "loss_mode_switch": 0.0, "loss_total": 0.2992088198661804, "step": 1379 }, { "batch_size": 4, "epoch": 0.5516, "step": 1379, "tokens_per_device": 6436 }, { "epoch": 0.5516, "loss_ce": 0.0716298297047615, "loss_lvr": 0.6386327147483826, "loss_mode_switch": 0.0, "loss_total": 0.13549309968948364, "step": 1379 }, { "batch_size": 4, "epoch": 0.5516, "step": 1379, "tokens_per_device": 8636 }, { "epoch": 0.5516, "loss_ce": 0.3050386607646942, "loss_lvr": 1.1366311311721802, "loss_mode_switch": 0.0, "loss_total": 0.41870176792144775, "step": 1379 }, { "batch_size": 4, "epoch": 0.5516, "step": 1379, "tokens_per_device": 1300 }, { "epoch": 0.5516, "loss_ce": 0.3558451533317566, "loss_lvr": 1.0057491064071655, "loss_mode_switch": 0.0, "loss_total": 0.45642006397247314, "step": 1379 }, { "batch_size": 1, "epoch": 0.5516, "step": 1379, "tokens_per_device": 4877 }, { "epoch": 0.5516, "loss_ce": 0.05108965188264847, "loss_lvr": 0.547844648361206, "loss_mode_switch": 0.0, "loss_total": 0.10587412118911743, "step": 1379 }, { "epoch": 0.552, "grad_norm": 1.3472914695739746, "learning_rate": 4.402263268149707e-06, "loss": 0.3255, "step": 1380 }, { "batch_size": 4, "epoch": 0.552, "step": 1380, "tokens_per_device": 5792 }, { "epoch": 0.552, "loss_ce": 0.7068973779678345, "loss_lvr": 0.7830165028572083, "loss_mode_switch": 0.0, "loss_total": 0.7851990461349487, "step": 1380 }, { "batch_size": 4, "epoch": 0.552, "step": 1380, "tokens_per_device": 4236 }, { "epoch": 0.552, "loss_ce": 0.2662357985973358, "loss_lvr": 1.0290385484695435, "loss_mode_switch": 0.0, "loss_total": 0.3691396713256836, "step": 1380 }, { "batch_size": 1, "epoch": 0.552, "step": 1380, "tokens_per_device": 4921 }, { "epoch": 0.552, "loss_ce": 0.10511329770088196, "loss_lvr": 0.22675785422325134, "loss_mode_switch": 0.0, "loss_total": 0.12778908014297485, "step": 1380 }, { "batch_size": 4, "epoch": 0.552, "step": 1380, "tokens_per_device": 2652 }, { "epoch": 0.552, "loss_ce": 0.20065076649188995, "loss_lvr": 0.823395311832428, "loss_mode_switch": 0.0, "loss_total": 0.28299030661582947, "step": 1380 }, { "batch_size": 1, "epoch": 0.552, "step": 1380, "tokens_per_device": 7128 }, { "epoch": 0.552, "loss_ce": 0.020295584574341774, "loss_lvr": 0.3483397960662842, "loss_mode_switch": 0.0, "loss_total": 0.05512956529855728, "step": 1380 }, { "batch_size": 1, "epoch": 0.552, "step": 1380, "tokens_per_device": 4879 }, { "epoch": 0.552, "loss_ce": 0.08931980282068253, "loss_lvr": 0.3786454498767853, "loss_mode_switch": 0.0, "loss_total": 0.12718434631824493, "step": 1380 }, { "batch_size": 4, "epoch": 0.552, "step": 1380, "tokens_per_device": 1220 }, { "epoch": 0.552, "loss_ce": 0.10499455779790878, "loss_lvr": 1.0543192625045776, "loss_mode_switch": 0.0, "loss_total": 0.2104264795780182, "step": 1380 }, { "batch_size": 4, "epoch": 0.552, "step": 1380, "tokens_per_device": 2676 }, { "epoch": 0.552, "loss_ce": 0.1972629576921463, "loss_lvr": 0.8849090337753296, "loss_mode_switch": 0.0, "loss_total": 0.28575384616851807, "step": 1380 }, { "epoch": 0.5524, "grad_norm": 1.166766881942749, "learning_rate": 4.39583271438341e-06, "loss": 0.2856, "step": 1381 }, { "batch_size": 4, "epoch": 0.5524, "step": 1381, "tokens_per_device": 6644 }, { "epoch": 0.5524, "loss_ce": 0.016998322680592537, "loss_lvr": 0.7520483136177063, "loss_mode_switch": 0.0, "loss_total": 0.09220315515995026, "step": 1381 }, { "batch_size": 4, "epoch": 0.5524, "step": 1381, "tokens_per_device": 4372 }, { "epoch": 0.5524, "loss_ce": 0.6207565665245056, "loss_lvr": 0.9130886197090149, "loss_mode_switch": 0.0, "loss_total": 0.7120654582977295, "step": 1381 }, { "batch_size": 4, "epoch": 0.5524, "step": 1381, "tokens_per_device": 4240 }, { "epoch": 0.5524, "loss_ce": 0.07619035243988037, "loss_lvr": 1.1038657426834106, "loss_mode_switch": 0.0, "loss_total": 0.1865769326686859, "step": 1381 }, { "batch_size": 4, "epoch": 0.5524, "step": 1381, "tokens_per_device": 4204 }, { "epoch": 0.5524, "loss_ce": 0.5212470293045044, "loss_lvr": 0.7766954898834229, "loss_mode_switch": 0.0, "loss_total": 0.5989165902137756, "step": 1381 }, { "batch_size": 4, "epoch": 0.5524, "step": 1381, "tokens_per_device": 15816 }, { "epoch": 0.5524, "loss_ce": 0.14046981930732727, "loss_lvr": 0.760750412940979, "loss_mode_switch": 0.0, "loss_total": 0.21654486656188965, "step": 1381 }, { "batch_size": 4, "epoch": 0.5524, "step": 1381, "tokens_per_device": 2232 }, { "epoch": 0.5524, "loss_ce": 0.3895419239997864, "loss_lvr": 1.838638186454773, "loss_mode_switch": 0.0, "loss_total": 0.5734057426452637, "step": 1381 }, { "batch_size": 1, "epoch": 0.5524, "step": 1381, "tokens_per_device": 5065 }, { "epoch": 0.5524, "loss_ce": 0.043794531375169754, "loss_lvr": 0.5224698185920715, "loss_mode_switch": 0.0, "loss_total": 0.09604151546955109, "step": 1381 }, { "batch_size": 1, "epoch": 0.5524, "step": 1381, "tokens_per_device": 6370 }, { "epoch": 0.5524, "loss_ce": 0.00676191970705986, "loss_lvr": 0.341174453496933, "loss_mode_switch": 0.0, "loss_total": 0.04087936505675316, "step": 1381 }, { "epoch": 0.5528, "grad_norm": 1.324542760848999, "learning_rate": 4.389403174606484e-06, "loss": 0.2901, "step": 1382 }, { "batch_size": 4, "epoch": 0.5528, "step": 1382, "tokens_per_device": 1432 }, { "epoch": 0.5528, "loss_ce": 0.5706638693809509, "loss_lvr": 1.133277177810669, "loss_mode_switch": 0.0, "loss_total": 0.6839916110038757, "step": 1382 }, { "batch_size": 1, "epoch": 0.5528, "step": 1382, "tokens_per_device": 4918 }, { "epoch": 0.5528, "loss_ce": 0.20981182157993317, "loss_lvr": 0.3725212812423706, "loss_mode_switch": 0.0, "loss_total": 0.24706394970417023, "step": 1382 }, { "batch_size": 1, "epoch": 0.5528, "step": 1382, "tokens_per_device": 5130 }, { "epoch": 0.5528, "loss_ce": 0.01528837624937296, "loss_lvr": 0.4466916620731354, "loss_mode_switch": 0.0, "loss_total": 0.05995754152536392, "step": 1382 }, { "batch_size": 1, "epoch": 0.5528, "step": 1382, "tokens_per_device": 4886 }, { "epoch": 0.5528, "loss_ce": 0.017139626666903496, "loss_lvr": 0.3238103985786438, "loss_mode_switch": 0.0, "loss_total": 0.04952066391706467, "step": 1382 }, { "batch_size": 4, "epoch": 0.5528, "step": 1382, "tokens_per_device": 4264 }, { "epoch": 0.5528, "loss_ce": 0.666984498500824, "loss_lvr": 0.6487852334976196, "loss_mode_switch": 0.0, "loss_total": 0.7318630218505859, "step": 1382 }, { "batch_size": 4, "epoch": 0.5528, "step": 1382, "tokens_per_device": 3772 }, { "epoch": 0.5528, "loss_ce": 0.5807086825370789, "loss_lvr": 1.0870029926300049, "loss_mode_switch": 0.0, "loss_total": 0.6894089579582214, "step": 1382 }, { "batch_size": 4, "epoch": 0.5528, "step": 1382, "tokens_per_device": 4940 }, { "epoch": 0.5528, "loss_ce": 0.1981828808784485, "loss_lvr": 0.7363094091415405, "loss_mode_switch": 0.0, "loss_total": 0.2718138098716736, "step": 1382 }, { "batch_size": 4, "epoch": 0.5528, "step": 1382, "tokens_per_device": 4948 }, { "epoch": 0.5528, "loss_ce": 0.14286263287067413, "loss_lvr": 0.8265959024429321, "loss_mode_switch": 0.0, "loss_total": 0.2255222201347351, "step": 1382 }, { "epoch": 0.5532, "grad_norm": 1.2432304620742798, "learning_rate": 4.3829746596097975e-06, "loss": 0.2939, "step": 1383 }, { "batch_size": 4, "epoch": 0.5532, "step": 1383, "tokens_per_device": 4268 }, { "epoch": 0.5532, "loss_ce": 0.3252854645252228, "loss_lvr": 2.0896291732788086, "loss_mode_switch": 0.0, "loss_total": 0.5342483520507812, "step": 1383 }, { "batch_size": 4, "epoch": 0.5532, "step": 1383, "tokens_per_device": 6976 }, { "epoch": 0.5532, "loss_ce": 0.4980373680591583, "loss_lvr": 0.8843486309051514, "loss_mode_switch": 0.0, "loss_total": 0.58647221326828, "step": 1383 }, { "batch_size": 4, "epoch": 0.5532, "step": 1383, "tokens_per_device": 4256 }, { "epoch": 0.5532, "loss_ce": 0.22413407266139984, "loss_lvr": 0.7842739224433899, "loss_mode_switch": 0.0, "loss_total": 0.3025614619255066, "step": 1383 }, { "batch_size": 4, "epoch": 0.5532, "step": 1383, "tokens_per_device": 4280 }, { "epoch": 0.5532, "loss_ce": 0.30463334918022156, "loss_lvr": 1.081028699874878, "loss_mode_switch": 0.0, "loss_total": 0.4127362370491028, "step": 1383 }, { "batch_size": 4, "epoch": 0.5532, "step": 1383, "tokens_per_device": 1480 }, { "epoch": 0.5532, "loss_ce": 0.430603563785553, "loss_lvr": 0.9703438878059387, "loss_mode_switch": 0.0, "loss_total": 0.5276379585266113, "step": 1383 }, { "batch_size": 4, "epoch": 0.5532, "step": 1383, "tokens_per_device": 4816 }, { "epoch": 0.5532, "loss_ce": 0.17847970128059387, "loss_lvr": 0.6240535378456116, "loss_mode_switch": 0.0, "loss_total": 0.24088504910469055, "step": 1383 }, { "batch_size": 4, "epoch": 0.5532, "step": 1383, "tokens_per_device": 4628 }, { "epoch": 0.5532, "loss_ce": 0.18058860301971436, "loss_lvr": 1.0467857122421265, "loss_mode_switch": 0.0, "loss_total": 0.285267174243927, "step": 1383 }, { "batch_size": 4, "epoch": 0.5532, "step": 1383, "tokens_per_device": 4680 }, { "epoch": 0.5532, "loss_ce": 0.24667689204216003, "loss_lvr": 0.844955325126648, "loss_mode_switch": 0.0, "loss_total": 0.3311724364757538, "step": 1383 }, { "epoch": 0.5536, "grad_norm": 1.3521934747695923, "learning_rate": 4.3765471801824865e-06, "loss": 0.3227, "step": 1384 }, { "batch_size": 1, "epoch": 0.5536, "step": 1384, "tokens_per_device": 4315 }, { "epoch": 0.5536, "loss_ce": 0.011509395204484463, "loss_lvr": 1.3295342922210693, "loss_mode_switch": 0.0, "loss_total": 0.14446282386779785, "step": 1384 }, { "batch_size": 4, "epoch": 0.5536, "step": 1384, "tokens_per_device": 1476 }, { "epoch": 0.5536, "loss_ce": 0.6645969152450562, "loss_lvr": 0.906960666179657, "loss_mode_switch": 0.0, "loss_total": 0.7552930116653442, "step": 1384 }, { "batch_size": 4, "epoch": 0.5536, "step": 1384, "tokens_per_device": 2792 }, { "epoch": 0.5536, "loss_ce": 0.8026935458183289, "loss_lvr": 1.1365528106689453, "loss_mode_switch": 0.0, "loss_total": 0.9163488149642944, "step": 1384 }, { "batch_size": 1, "epoch": 0.5536, "step": 1384, "tokens_per_device": 4948 }, { "epoch": 0.5536, "loss_ce": 0.01678398624062538, "loss_lvr": 0.6629502177238464, "loss_mode_switch": 0.0, "loss_total": 0.0830790102481842, "step": 1384 }, { "batch_size": 1, "epoch": 0.5536, "step": 1384, "tokens_per_device": 4887 }, { "epoch": 0.5536, "loss_ce": 0.0007270878413692117, "loss_lvr": 0.5114001631736755, "loss_mode_switch": 0.0, "loss_total": 0.051867105066776276, "step": 1384 }, { "batch_size": 1, "epoch": 0.5536, "step": 1384, "tokens_per_device": 5176 }, { "epoch": 0.5536, "loss_ce": 0.21412551403045654, "loss_lvr": 0.48566508293151855, "loss_mode_switch": 0.0, "loss_total": 0.26269203424453735, "step": 1384 }, { "batch_size": 4, "epoch": 0.5536, "step": 1384, "tokens_per_device": 1460 }, { "epoch": 0.5536, "loss_ce": 0.0950811505317688, "loss_lvr": 0.9298281073570251, "loss_mode_switch": 0.0, "loss_total": 0.18806396424770355, "step": 1384 }, { "batch_size": 1, "epoch": 0.5536, "step": 1384, "tokens_per_device": 4926 }, { "epoch": 0.5536, "loss_ce": 0.186265766620636, "loss_lvr": 0.33629149198532104, "loss_mode_switch": 0.0, "loss_total": 0.2198949158191681, "step": 1384 }, { "epoch": 0.554, "grad_norm": 1.2554441690444946, "learning_rate": 4.370120747111956e-06, "loss": 0.293, "step": 1385 }, { "batch_size": 4, "epoch": 0.554, "step": 1385, "tokens_per_device": 4260 }, { "epoch": 0.554, "loss_ce": 0.4813705384731293, "loss_lvr": 0.7841672897338867, "loss_mode_switch": 0.0, "loss_total": 0.5597872734069824, "step": 1385 }, { "batch_size": 1, "epoch": 0.554, "step": 1385, "tokens_per_device": 5150 }, { "epoch": 0.554, "loss_ce": 0.04599332809448242, "loss_lvr": 0.33975714445114136, "loss_mode_switch": 0.0, "loss_total": 0.07996904850006104, "step": 1385 }, { "batch_size": 4, "epoch": 0.554, "step": 1385, "tokens_per_device": 3876 }, { "epoch": 0.554, "loss_ce": 0.3697541058063507, "loss_lvr": 0.5895949602127075, "loss_mode_switch": 0.0, "loss_total": 0.4287135899066925, "step": 1385 }, { "batch_size": 1, "epoch": 0.554, "step": 1385, "tokens_per_device": 4874 }, { "epoch": 0.554, "loss_ce": 1.579871654510498, "loss_lvr": 1.781942367553711, "loss_mode_switch": 0.0, "loss_total": 1.758065938949585, "step": 1385 }, { "batch_size": 4, "epoch": 0.554, "step": 1385, "tokens_per_device": 4588 }, { "epoch": 0.554, "loss_ce": 0.006114301737397909, "loss_lvr": 0.6900153756141663, "loss_mode_switch": 0.0, "loss_total": 0.07511584460735321, "step": 1385 }, { "batch_size": 1, "epoch": 0.554, "step": 1385, "tokens_per_device": 5163 }, { "epoch": 0.554, "loss_ce": 0.0005884255515411496, "loss_lvr": 0.3770117461681366, "loss_mode_switch": 0.0, "loss_total": 0.03828959912061691, "step": 1385 }, { "batch_size": 4, "epoch": 0.554, "step": 1385, "tokens_per_device": 1268 }, { "epoch": 0.554, "loss_ce": 0.3389475345611572, "loss_lvr": 1.0689891576766968, "loss_mode_switch": 0.0, "loss_total": 0.44584643840789795, "step": 1385 }, { "batch_size": 4, "epoch": 0.554, "step": 1385, "tokens_per_device": 4516 }, { "epoch": 0.554, "loss_ce": 0.33768582344055176, "loss_lvr": 0.6695327162742615, "loss_mode_switch": 0.0, "loss_total": 0.4046390950679779, "step": 1385 }, { "epoch": 0.5544, "grad_norm": 1.345654845237732, "learning_rate": 4.363695371183849e-06, "loss": 0.2738, "step": 1386 }, { "batch_size": 4, "epoch": 0.5544, "step": 1386, "tokens_per_device": 6652 }, { "epoch": 0.5544, "loss_ce": 0.11730415374040604, "loss_lvr": 0.7212596535682678, "loss_mode_switch": 0.0, "loss_total": 0.1894301176071167, "step": 1386 }, { "batch_size": 4, "epoch": 0.5544, "step": 1386, "tokens_per_device": 1340 }, { "epoch": 0.5544, "loss_ce": 0.36673447489738464, "loss_lvr": 0.9837706685066223, "loss_mode_switch": 0.0, "loss_total": 0.46511155366897583, "step": 1386 }, { "batch_size": 4, "epoch": 0.5544, "step": 1386, "tokens_per_device": 4236 }, { "epoch": 0.5544, "loss_ce": 0.04438329488039017, "loss_lvr": 0.6980387568473816, "loss_mode_switch": 0.0, "loss_total": 0.11418717354536057, "step": 1386 }, { "batch_size": 4, "epoch": 0.5544, "step": 1386, "tokens_per_device": 7592 }, { "epoch": 0.5544, "loss_ce": 0.6692079901695251, "loss_lvr": 0.556090235710144, "loss_mode_switch": 0.0, "loss_total": 0.7248170375823975, "step": 1386 }, { "batch_size": 4, "epoch": 0.5544, "step": 1386, "tokens_per_device": 5884 }, { "epoch": 0.5544, "loss_ce": 0.5361179709434509, "loss_lvr": 0.8665004968643188, "loss_mode_switch": 0.0, "loss_total": 0.6227680444717407, "step": 1386 }, { "batch_size": 1, "epoch": 0.5544, "step": 1386, "tokens_per_device": 5123 }, { "epoch": 0.5544, "loss_ce": 0.021648243069648743, "loss_lvr": 0.7119730710983276, "loss_mode_switch": 0.0, "loss_total": 0.09284555166959763, "step": 1386 }, { "batch_size": 4, "epoch": 0.5544, "step": 1386, "tokens_per_device": 4408 }, { "epoch": 0.5544, "loss_ce": 0.006978072691708803, "loss_lvr": 0.9111320376396179, "loss_mode_switch": 0.0, "loss_total": 0.09809127449989319, "step": 1386 }, { "batch_size": 4, "epoch": 0.5544, "step": 1386, "tokens_per_device": 6116 }, { "epoch": 0.5544, "loss_ce": 0.05487951636314392, "loss_lvr": 0.7787878513336182, "loss_mode_switch": 0.0, "loss_total": 0.13275830447673798, "step": 1386 }, { "epoch": 0.5548, "grad_norm": 1.8276879787445068, "learning_rate": 4.35727106318204e-06, "loss": 0.3331, "step": 1387 }, { "batch_size": 4, "epoch": 0.5548, "step": 1387, "tokens_per_device": 2760 }, { "epoch": 0.5548, "loss_ce": 0.4688223898410797, "loss_lvr": 0.6372882723808289, "loss_mode_switch": 0.0, "loss_total": 0.5325512290000916, "step": 1387 }, { "batch_size": 1, "epoch": 0.5548, "step": 1387, "tokens_per_device": 4897 }, { "epoch": 0.5548, "loss_ce": 0.03295873478055, "loss_lvr": 0.47437751293182373, "loss_mode_switch": 0.0, "loss_total": 0.08039648830890656, "step": 1387 }, { "batch_size": 4, "epoch": 0.5548, "step": 1387, "tokens_per_device": 4244 }, { "epoch": 0.5548, "loss_ce": 0.09469710290431976, "loss_lvr": 1.0472921133041382, "loss_mode_switch": 0.0, "loss_total": 0.1994263231754303, "step": 1387 }, { "batch_size": 4, "epoch": 0.5548, "step": 1387, "tokens_per_device": 5916 }, { "epoch": 0.5548, "loss_ce": 0.085594043135643, "loss_lvr": 0.7022355198860168, "loss_mode_switch": 0.0, "loss_total": 0.15581759810447693, "step": 1387 }, { "batch_size": 4, "epoch": 0.5548, "step": 1387, "tokens_per_device": 4600 }, { "epoch": 0.5548, "loss_ce": 0.0039442977868020535, "loss_lvr": 0.7412976622581482, "loss_mode_switch": 0.0, "loss_total": 0.07807406783103943, "step": 1387 }, { "batch_size": 4, "epoch": 0.5548, "step": 1387, "tokens_per_device": 3916 }, { "epoch": 0.5548, "loss_ce": 0.671319305896759, "loss_lvr": 0.8248420357704163, "loss_mode_switch": 0.0, "loss_total": 0.7538034915924072, "step": 1387 }, { "batch_size": 4, "epoch": 0.5548, "step": 1387, "tokens_per_device": 1272 }, { "epoch": 0.5548, "loss_ce": 0.2553330063819885, "loss_lvr": 0.9341775178909302, "loss_mode_switch": 0.0, "loss_total": 0.3487507700920105, "step": 1387 }, { "batch_size": 1, "epoch": 0.5548, "step": 1387, "tokens_per_device": 5179 }, { "epoch": 0.5548, "loss_ce": 0.03398965671658516, "loss_lvr": 0.3034522831439972, "loss_mode_switch": 0.0, "loss_total": 0.06433488428592682, "step": 1387 }, { "epoch": 0.5552, "grad_norm": 1.2936816215515137, "learning_rate": 4.3508478338886105e-06, "loss": 0.3048, "step": 1388 }, { "batch_size": 4, "epoch": 0.5552, "step": 1388, "tokens_per_device": 7300 }, { "epoch": 0.5552, "loss_ce": 0.04101835936307907, "loss_lvr": 0.7023895382881165, "loss_mode_switch": 0.0, "loss_total": 0.11125731468200684, "step": 1388 }, { "batch_size": 4, "epoch": 0.5552, "step": 1388, "tokens_per_device": 5736 }, { "epoch": 0.5552, "loss_ce": 0.2662770748138428, "loss_lvr": 0.7246936559677124, "loss_mode_switch": 0.0, "loss_total": 0.33874642848968506, "step": 1388 }, { "batch_size": 1, "epoch": 0.5552, "step": 1388, "tokens_per_device": 5148 }, { "epoch": 0.5552, "loss_ce": 0.014424553140997887, "loss_lvr": 0.7726563215255737, "loss_mode_switch": 0.0, "loss_total": 0.09169019013643265, "step": 1388 }, { "batch_size": 4, "epoch": 0.5552, "step": 1388, "tokens_per_device": 4588 }, { "epoch": 0.5552, "loss_ce": 0.04802576079964638, "loss_lvr": 0.6899515986442566, "loss_mode_switch": 0.0, "loss_total": 0.11702091991901398, "step": 1388 }, { "batch_size": 1, "epoch": 0.5552, "step": 1388, "tokens_per_device": 4897 }, { "epoch": 0.5552, "loss_ce": 0.016826864331960678, "loss_lvr": 0.4330046474933624, "loss_mode_switch": 0.0, "loss_total": 0.06012732908129692, "step": 1388 }, { "batch_size": 4, "epoch": 0.5552, "step": 1388, "tokens_per_device": 3792 }, { "epoch": 0.5552, "loss_ce": 0.34347957372665405, "loss_lvr": 1.5296674966812134, "loss_mode_switch": 0.0, "loss_total": 0.49644631147384644, "step": 1388 }, { "batch_size": 4, "epoch": 0.5552, "step": 1388, "tokens_per_device": 4216 }, { "epoch": 0.5552, "loss_ce": 0.5243068337440491, "loss_lvr": 0.8861976861953735, "loss_mode_switch": 0.0, "loss_total": 0.6129266023635864, "step": 1388 }, { "batch_size": 4, "epoch": 0.5552, "step": 1388, "tokens_per_device": 1708 }, { "epoch": 0.5552, "loss_ce": 0.03344051539897919, "loss_lvr": 0.7671489715576172, "loss_mode_switch": 0.0, "loss_total": 0.11015541106462479, "step": 1388 }, { "epoch": 0.5556, "grad_norm": 1.1873040199279785, "learning_rate": 4.344425694083829e-06, "loss": 0.2566, "step": 1389 }, { "batch_size": 1, "epoch": 0.5556, "step": 1389, "tokens_per_device": 4737 }, { "epoch": 0.5556, "loss_ce": 0.00407623453065753, "loss_lvr": 0.5002973079681396, "loss_mode_switch": 0.0, "loss_total": 0.0541059672832489, "step": 1389 }, { "batch_size": 4, "epoch": 0.5556, "step": 1389, "tokens_per_device": 3784 }, { "epoch": 0.5556, "loss_ce": 0.4072205722332001, "loss_lvr": 1.0250117778778076, "loss_mode_switch": 0.0, "loss_total": 0.5097217559814453, "step": 1389 }, { "batch_size": 4, "epoch": 0.5556, "step": 1389, "tokens_per_device": 4860 }, { "epoch": 0.5556, "loss_ce": 0.33448880910873413, "loss_lvr": 0.86789470911026, "loss_mode_switch": 0.0, "loss_total": 0.42127829790115356, "step": 1389 }, { "batch_size": 4, "epoch": 0.5556, "step": 1389, "tokens_per_device": 2572 }, { "epoch": 0.5556, "loss_ce": 0.5482423901557922, "loss_lvr": 0.6420838236808777, "loss_mode_switch": 0.0, "loss_total": 0.6124507784843445, "step": 1389 }, { "batch_size": 1, "epoch": 0.5556, "step": 1389, "tokens_per_device": 7282 }, { "epoch": 0.5556, "loss_ce": 0.01613028161227703, "loss_lvr": 0.35870087146759033, "loss_mode_switch": 0.0, "loss_total": 0.052000366151332855, "step": 1389 }, { "batch_size": 4, "epoch": 0.5556, "step": 1389, "tokens_per_device": 4228 }, { "epoch": 0.5556, "loss_ce": 0.2670351564884186, "loss_lvr": 0.95854252576828, "loss_mode_switch": 0.0, "loss_total": 0.3628894090652466, "step": 1389 }, { "batch_size": 1, "epoch": 0.5556, "step": 1389, "tokens_per_device": 4127 }, { "epoch": 0.5556, "loss_ce": 0.3347233235836029, "loss_lvr": 0.4100326597690582, "loss_mode_switch": 0.0, "loss_total": 0.375726580619812, "step": 1389 }, { "batch_size": 4, "epoch": 0.5556, "step": 1389, "tokens_per_device": 3964 }, { "epoch": 0.5556, "loss_ce": 0.4373794198036194, "loss_lvr": 1.0126547813415527, "loss_mode_switch": 0.0, "loss_total": 0.5386449098587036, "step": 1389 }, { "epoch": 0.556, "grad_norm": 1.5587692260742188, "learning_rate": 4.338004654546136e-06, "loss": 0.3085, "step": 1390 }, { "batch_size": 4, "epoch": 0.556, "step": 1390, "tokens_per_device": 4536 }, { "epoch": 0.556, "loss_ce": 0.2666212022304535, "loss_lvr": 1.0755423307418823, "loss_mode_switch": 0.0, "loss_total": 0.37417542934417725, "step": 1390 }, { "batch_size": 4, "epoch": 0.556, "step": 1390, "tokens_per_device": 5672 }, { "epoch": 0.556, "loss_ce": 0.01503200363367796, "loss_lvr": 0.8028509020805359, "loss_mode_switch": 0.0, "loss_total": 0.09531709551811218, "step": 1390 }, { "batch_size": 4, "epoch": 0.556, "step": 1390, "tokens_per_device": 2584 }, { "epoch": 0.556, "loss_ce": 0.12732279300689697, "loss_lvr": 0.7325970530509949, "loss_mode_switch": 0.0, "loss_total": 0.20058250427246094, "step": 1390 }, { "batch_size": 1, "epoch": 0.556, "step": 1390, "tokens_per_device": 5185 }, { "epoch": 0.556, "loss_ce": 0.0023867469280958176, "loss_lvr": 0.3828679919242859, "loss_mode_switch": 0.0, "loss_total": 0.040673546493053436, "step": 1390 }, { "batch_size": 4, "epoch": 0.556, "step": 1390, "tokens_per_device": 4232 }, { "epoch": 0.556, "loss_ce": 0.2767969071865082, "loss_lvr": 0.9233080148696899, "loss_mode_switch": 0.0, "loss_total": 0.36912772059440613, "step": 1390 }, { "batch_size": 4, "epoch": 0.556, "step": 1390, "tokens_per_device": 6424 }, { "epoch": 0.556, "loss_ce": 0.18623517453670502, "loss_lvr": 0.7018498182296753, "loss_mode_switch": 0.0, "loss_total": 0.25642016530036926, "step": 1390 }, { "batch_size": 4, "epoch": 0.556, "step": 1390, "tokens_per_device": 1688 }, { "epoch": 0.556, "loss_ce": 0.2819133996963501, "loss_lvr": 1.0918676853179932, "loss_mode_switch": 0.0, "loss_total": 0.3911001682281494, "step": 1390 }, { "batch_size": 1, "epoch": 0.556, "step": 1390, "tokens_per_device": 4097 }, { "epoch": 0.556, "loss_ce": 0.027908513322472572, "loss_lvr": 0.36870402097702026, "loss_mode_switch": 0.0, "loss_total": 0.06477891653776169, "step": 1390 }, { "epoch": 0.5564, "grad_norm": 1.1490552425384521, "learning_rate": 4.331584726052124e-06, "loss": 0.295, "step": 1391 }, { "batch_size": 4, "epoch": 0.5564, "step": 1391, "tokens_per_device": 5180 }, { "epoch": 0.5564, "loss_ce": 0.23448637127876282, "loss_lvr": 0.9049882888793945, "loss_mode_switch": 0.0, "loss_total": 0.32498520612716675, "step": 1391 }, { "batch_size": 4, "epoch": 0.5564, "step": 1391, "tokens_per_device": 2700 }, { "epoch": 0.5564, "loss_ce": 0.36879923939704895, "loss_lvr": 0.8313522338867188, "loss_mode_switch": 0.0, "loss_total": 0.45193445682525635, "step": 1391 }, { "batch_size": 4, "epoch": 0.5564, "step": 1391, "tokens_per_device": 3776 }, { "epoch": 0.5564, "loss_ce": 0.21052433550357819, "loss_lvr": 0.7468647360801697, "loss_mode_switch": 0.0, "loss_total": 0.28521081805229187, "step": 1391 }, { "batch_size": 4, "epoch": 0.5564, "step": 1391, "tokens_per_device": 7448 }, { "epoch": 0.5564, "loss_ce": 0.16809743642807007, "loss_lvr": 0.8024040460586548, "loss_mode_switch": 0.0, "loss_total": 0.24833783507347107, "step": 1391 }, { "batch_size": 1, "epoch": 0.5564, "step": 1391, "tokens_per_device": 5782 }, { "epoch": 0.5564, "loss_ce": 0.0444321408867836, "loss_lvr": 0.5052885413169861, "loss_mode_switch": 0.0, "loss_total": 0.09496099501848221, "step": 1391 }, { "batch_size": 4, "epoch": 0.5564, "step": 1391, "tokens_per_device": 3744 }, { "epoch": 0.5564, "loss_ce": 0.16629928350448608, "loss_lvr": 0.8807054162025452, "loss_mode_switch": 0.0, "loss_total": 0.2543698251247406, "step": 1391 }, { "batch_size": 4, "epoch": 0.5564, "step": 1391, "tokens_per_device": 2640 }, { "epoch": 0.5564, "loss_ce": 0.6756009459495544, "loss_lvr": 0.845920205116272, "loss_mode_switch": 0.0, "loss_total": 0.7601929903030396, "step": 1391 }, { "batch_size": 1, "epoch": 0.5564, "step": 1391, "tokens_per_device": 5211 }, { "epoch": 0.5564, "loss_ce": 0.0008014562772586942, "loss_lvr": 0.3240337371826172, "loss_mode_switch": 0.0, "loss_total": 0.033204831182956696, "step": 1391 }, { "epoch": 0.5568, "grad_norm": 1.2672404050827026, "learning_rate": 4.325165919376528e-06, "loss": 0.2869, "step": 1392 }, { "batch_size": 1, "epoch": 0.5568, "step": 1392, "tokens_per_device": 5175 }, { "epoch": 0.5568, "loss_ce": 0.0020576498936861753, "loss_lvr": 0.34668946266174316, "loss_mode_switch": 0.0, "loss_total": 0.03672659769654274, "step": 1392 }, { "batch_size": 1, "epoch": 0.5568, "step": 1392, "tokens_per_device": 5135 }, { "epoch": 0.5568, "loss_ce": 0.019547145813703537, "loss_lvr": 0.346952885389328, "loss_mode_switch": 0.0, "loss_total": 0.05424243584275246, "step": 1392 }, { "batch_size": 4, "epoch": 0.5568, "step": 1392, "tokens_per_device": 6168 }, { "epoch": 0.5568, "loss_ce": 0.2773825526237488, "loss_lvr": 0.8578820824623108, "loss_mode_switch": 0.0, "loss_total": 0.3631707727909088, "step": 1392 }, { "batch_size": 1, "epoch": 0.5568, "step": 1392, "tokens_per_device": 4884 }, { "epoch": 0.5568, "loss_ce": 0.007853728719055653, "loss_lvr": 0.5407412648200989, "loss_mode_switch": 0.0, "loss_total": 0.061927855014801025, "step": 1392 }, { "batch_size": 4, "epoch": 0.5568, "step": 1392, "tokens_per_device": 5724 }, { "epoch": 0.5568, "loss_ce": 0.21615265309810638, "loss_lvr": 0.8114640712738037, "loss_mode_switch": 0.0, "loss_total": 0.2972990572452545, "step": 1392 }, { "batch_size": 1, "epoch": 0.5568, "step": 1392, "tokens_per_device": 4893 }, { "epoch": 0.5568, "loss_ce": 0.14466719329357147, "loss_lvr": 0.22984197735786438, "loss_mode_switch": 0.0, "loss_total": 0.16765138506889343, "step": 1392 }, { "batch_size": 4, "epoch": 0.5568, "step": 1392, "tokens_per_device": 4248 }, { "epoch": 0.5568, "loss_ce": 0.164043128490448, "loss_lvr": 1.016061782836914, "loss_mode_switch": 0.0, "loss_total": 0.26564931869506836, "step": 1392 }, { "batch_size": 4, "epoch": 0.5568, "step": 1392, "tokens_per_device": 1472 }, { "epoch": 0.5568, "loss_ce": 0.579838752746582, "loss_lvr": 1.126426100730896, "loss_mode_switch": 0.0, "loss_total": 0.6924813389778137, "step": 1392 }, { "epoch": 0.5572, "grad_norm": 1.207886815071106, "learning_rate": 4.318748245292193e-06, "loss": 0.2741, "step": 1393 }, { "batch_size": 1, "epoch": 0.5572, "step": 1393, "tokens_per_device": 4887 }, { "epoch": 0.5572, "loss_ce": 0.027404384687542915, "loss_lvr": 0.19490550458431244, "loss_mode_switch": 0.0, "loss_total": 0.04689493775367737, "step": 1393 }, { "batch_size": 4, "epoch": 0.5572, "step": 1393, "tokens_per_device": 8060 }, { "epoch": 0.5572, "loss_ce": 0.22812406718730927, "loss_lvr": 1.116908073425293, "loss_mode_switch": 0.0, "loss_total": 0.3398148715496063, "step": 1393 }, { "batch_size": 4, "epoch": 0.5572, "step": 1393, "tokens_per_device": 4380 }, { "epoch": 0.5572, "loss_ce": 0.2774944603443146, "loss_lvr": 0.8800886869430542, "loss_mode_switch": 0.0, "loss_total": 0.36550334095954895, "step": 1393 }, { "batch_size": 4, "epoch": 0.5572, "step": 1393, "tokens_per_device": 5456 }, { "epoch": 0.5572, "loss_ce": 0.2220579832792282, "loss_lvr": 0.5636276006698608, "loss_mode_switch": 0.0, "loss_total": 0.27842074632644653, "step": 1393 }, { "batch_size": 1, "epoch": 0.5572, "step": 1393, "tokens_per_device": 5062 }, { "epoch": 0.5572, "loss_ce": 0.008437586016952991, "loss_lvr": 0.44742104411125183, "loss_mode_switch": 0.0, "loss_total": 0.05317968875169754, "step": 1393 }, { "batch_size": 4, "epoch": 0.5572, "step": 1393, "tokens_per_device": 4308 }, { "epoch": 0.5572, "loss_ce": 0.5682064890861511, "loss_lvr": 0.7873514890670776, "loss_mode_switch": 0.0, "loss_total": 0.6469416618347168, "step": 1393 }, { "batch_size": 4, "epoch": 0.5572, "step": 1393, "tokens_per_device": 2572 }, { "epoch": 0.5572, "loss_ce": 0.18712925910949707, "loss_lvr": 0.9158157706260681, "loss_mode_switch": 0.0, "loss_total": 0.27871084213256836, "step": 1393 }, { "batch_size": 4, "epoch": 0.5572, "step": 1393, "tokens_per_device": 8176 }, { "epoch": 0.5572, "loss_ce": 0.09691224992275238, "loss_lvr": 1.0446597337722778, "loss_mode_switch": 0.0, "loss_total": 0.2013782262802124, "step": 1393 }, { "epoch": 0.5576, "grad_norm": 1.2737526893615723, "learning_rate": 4.312331714570064e-06, "loss": 0.2756, "step": 1394 }, { "batch_size": 4, "epoch": 0.5576, "step": 1394, "tokens_per_device": 4716 }, { "epoch": 0.5576, "loss_ce": 0.19234015047550201, "loss_lvr": 0.8189361095428467, "loss_mode_switch": 0.0, "loss_total": 0.27423375844955444, "step": 1394 }, { "batch_size": 4, "epoch": 0.5576, "step": 1394, "tokens_per_device": 3812 }, { "epoch": 0.5576, "loss_ce": 0.292349249124527, "loss_lvr": 0.9743889570236206, "loss_mode_switch": 0.0, "loss_total": 0.3897881507873535, "step": 1394 }, { "batch_size": 4, "epoch": 0.5576, "step": 1394, "tokens_per_device": 5060 }, { "epoch": 0.5576, "loss_ce": 0.029545502737164497, "loss_lvr": 0.9117284417152405, "loss_mode_switch": 0.0, "loss_total": 0.1207183450460434, "step": 1394 }, { "batch_size": 1, "epoch": 0.5576, "step": 1394, "tokens_per_device": 5131 }, { "epoch": 0.5576, "loss_ce": 0.0006283298134803772, "loss_lvr": 0.2476237714290619, "loss_mode_switch": 0.0, "loss_total": 0.025390706956386566, "step": 1394 }, { "batch_size": 1, "epoch": 0.5576, "step": 1394, "tokens_per_device": 4571 }, { "epoch": 0.5576, "loss_ce": 0.017847349867224693, "loss_lvr": 0.5025296807289124, "loss_mode_switch": 0.0, "loss_total": 0.06810031831264496, "step": 1394 }, { "batch_size": 4, "epoch": 0.5576, "step": 1394, "tokens_per_device": 4112 }, { "epoch": 0.5576, "loss_ce": 0.12263123691082001, "loss_lvr": 0.808796226978302, "loss_mode_switch": 0.0, "loss_total": 0.2035108506679535, "step": 1394 }, { "batch_size": 1, "epoch": 0.5576, "step": 1394, "tokens_per_device": 4325 }, { "epoch": 0.5576, "loss_ce": 0.11776617169380188, "loss_lvr": 0.43984299898147583, "loss_mode_switch": 0.0, "loss_total": 0.16175046563148499, "step": 1394 }, { "batch_size": 4, "epoch": 0.5576, "step": 1394, "tokens_per_device": 1364 }, { "epoch": 0.5576, "loss_ce": 0.347507506608963, "loss_lvr": 1.1431673765182495, "loss_mode_switch": 0.0, "loss_total": 0.4618242383003235, "step": 1394 }, { "epoch": 0.558, "grad_norm": 1.3328264951705933, "learning_rate": 4.3059163379791676e-06, "loss": 0.3075, "step": 1395 }, { "batch_size": 4, "epoch": 0.558, "step": 1395, "tokens_per_device": 2576 }, { "epoch": 0.558, "loss_ce": 0.506845235824585, "loss_lvr": 0.7331796884536743, "loss_mode_switch": 0.0, "loss_total": 0.5801631808280945, "step": 1395 }, { "batch_size": 4, "epoch": 0.558, "step": 1395, "tokens_per_device": 5776 }, { "epoch": 0.558, "loss_ce": 0.3525697588920593, "loss_lvr": 1.0451754331588745, "loss_mode_switch": 0.0, "loss_total": 0.45708730816841125, "step": 1395 }, { "batch_size": 4, "epoch": 0.558, "step": 1395, "tokens_per_device": 3760 }, { "epoch": 0.558, "loss_ce": 0.06008101999759674, "loss_lvr": 0.9608179926872253, "loss_mode_switch": 0.0, "loss_total": 0.156162828207016, "step": 1395 }, { "batch_size": 4, "epoch": 0.558, "step": 1395, "tokens_per_device": 1416 }, { "epoch": 0.558, "loss_ce": 0.5904771089553833, "loss_lvr": 0.9672536849975586, "loss_mode_switch": 0.0, "loss_total": 0.6872024536132812, "step": 1395 }, { "batch_size": 4, "epoch": 0.558, "step": 1395, "tokens_per_device": 4756 }, { "epoch": 0.558, "loss_ce": 0.04284243658185005, "loss_lvr": 0.7175494432449341, "loss_mode_switch": 0.0, "loss_total": 0.1145973801612854, "step": 1395 }, { "batch_size": 4, "epoch": 0.558, "step": 1395, "tokens_per_device": 3760 }, { "epoch": 0.558, "loss_ce": 0.4569118916988373, "loss_lvr": 0.8677013516426086, "loss_mode_switch": 0.0, "loss_total": 0.5436820387840271, "step": 1395 }, { "batch_size": 1, "epoch": 0.558, "step": 1395, "tokens_per_device": 5435 }, { "epoch": 0.558, "loss_ce": 0.0026445940602570772, "loss_lvr": 0.38893282413482666, "loss_mode_switch": 0.0, "loss_total": 0.041537877172231674, "step": 1395 }, { "batch_size": 4, "epoch": 0.558, "step": 1395, "tokens_per_device": 5364 }, { "epoch": 0.558, "loss_ce": 0.11858958005905151, "loss_lvr": 1.2392734289169312, "loss_mode_switch": 0.0, "loss_total": 0.24251693487167358, "step": 1395 }, { "epoch": 0.5584, "grad_norm": 1.286857008934021, "learning_rate": 4.299502126286596e-06, "loss": 0.3082, "step": 1396 }, { "batch_size": 1, "epoch": 0.5584, "step": 1396, "tokens_per_device": 4752 }, { "epoch": 0.5584, "loss_ce": 0.00664680078625679, "loss_lvr": 0.33702346682548523, "loss_mode_switch": 0.0, "loss_total": 0.04034914821386337, "step": 1396 }, { "batch_size": 4, "epoch": 0.5584, "step": 1396, "tokens_per_device": 1560 }, { "epoch": 0.5584, "loss_ce": 0.4565192759037018, "loss_lvr": 1.1668273210525513, "loss_mode_switch": 0.0, "loss_total": 0.5732020139694214, "step": 1396 }, { "batch_size": 4, "epoch": 0.5584, "step": 1396, "tokens_per_device": 4316 }, { "epoch": 0.5584, "loss_ce": 0.2856988310813904, "loss_lvr": 1.079113245010376, "loss_mode_switch": 0.0, "loss_total": 0.3936101496219635, "step": 1396 }, { "batch_size": 4, "epoch": 0.5584, "step": 1396, "tokens_per_device": 5532 }, { "epoch": 0.5584, "loss_ce": 0.21445868909358978, "loss_lvr": 0.7639269828796387, "loss_mode_switch": 0.0, "loss_total": 0.2908513844013214, "step": 1396 }, { "batch_size": 1, "epoch": 0.5584, "step": 1396, "tokens_per_device": 5181 }, { "epoch": 0.5584, "loss_ce": 0.009532761760056019, "loss_lvr": 0.2739550769329071, "loss_mode_switch": 0.0, "loss_total": 0.036928270012140274, "step": 1396 }, { "batch_size": 4, "epoch": 0.5584, "step": 1396, "tokens_per_device": 3820 }, { "epoch": 0.5584, "loss_ce": 0.5979986786842346, "loss_lvr": 0.8309053182601929, "loss_mode_switch": 0.0, "loss_total": 0.6810892224311829, "step": 1396 }, { "batch_size": 4, "epoch": 0.5584, "step": 1396, "tokens_per_device": 4868 }, { "epoch": 0.5584, "loss_ce": 0.31762394309043884, "loss_lvr": 0.6792702078819275, "loss_mode_switch": 0.0, "loss_total": 0.38555097579956055, "step": 1396 }, { "batch_size": 4, "epoch": 0.5584, "step": 1396, "tokens_per_device": 6492 }, { "epoch": 0.5584, "loss_ce": 0.6802530288696289, "loss_lvr": 0.6757270693778992, "loss_mode_switch": 0.0, "loss_total": 0.7478257417678833, "step": 1396 }, { "epoch": 0.5588, "grad_norm": 1.4186371564865112, "learning_rate": 4.293089090257484e-06, "loss": 0.3058, "step": 1397 }, { "batch_size": 4, "epoch": 0.5588, "step": 1397, "tokens_per_device": 6576 }, { "epoch": 0.5588, "loss_ce": 0.35284218192100525, "loss_lvr": 0.7105811238288879, "loss_mode_switch": 0.0, "loss_total": 0.423900306224823, "step": 1397 }, { "batch_size": 4, "epoch": 0.5588, "step": 1397, "tokens_per_device": 1400 }, { "epoch": 0.5588, "loss_ce": 0.15189692378044128, "loss_lvr": 1.0495827198028564, "loss_mode_switch": 0.0, "loss_total": 0.25685518980026245, "step": 1397 }, { "batch_size": 4, "epoch": 0.5588, "step": 1397, "tokens_per_device": 4232 }, { "epoch": 0.5588, "loss_ce": 0.8528661727905273, "loss_lvr": 0.9372653961181641, "loss_mode_switch": 0.0, "loss_total": 0.9465926885604858, "step": 1397 }, { "batch_size": 4, "epoch": 0.5588, "step": 1397, "tokens_per_device": 10952 }, { "epoch": 0.5588, "loss_ce": 0.16932477056980133, "loss_lvr": 1.1594356298446655, "loss_mode_switch": 0.0, "loss_total": 0.2852683365345001, "step": 1397 }, { "batch_size": 1, "epoch": 0.5588, "step": 1397, "tokens_per_device": 8166 }, { "epoch": 0.5588, "loss_ce": 0.00041798833990469575, "loss_lvr": 0.2579798996448517, "loss_mode_switch": 0.0, "loss_total": 0.02621597982943058, "step": 1397 }, { "batch_size": 4, "epoch": 0.5588, "step": 1397, "tokens_per_device": 1544 }, { "epoch": 0.5588, "loss_ce": 0.5352938771247864, "loss_lvr": 0.9031031727790833, "loss_mode_switch": 0.0, "loss_total": 0.6256042122840881, "step": 1397 }, { "batch_size": 4, "epoch": 0.5588, "step": 1397, "tokens_per_device": 1608 }, { "epoch": 0.5588, "loss_ce": 0.5860474705696106, "loss_lvr": 0.9844423532485962, "loss_mode_switch": 0.0, "loss_total": 0.6844916939735413, "step": 1397 }, { "batch_size": 4, "epoch": 0.5588, "step": 1397, "tokens_per_device": 5712 }, { "epoch": 0.5588, "loss_ce": 0.5178777575492859, "loss_lvr": 0.8910806775093079, "loss_mode_switch": 0.0, "loss_total": 0.6069858074188232, "step": 1397 }, { "epoch": 0.5592, "grad_norm": 1.3434438705444336, "learning_rate": 4.286677240654993e-06, "loss": 0.3441, "step": 1398 }, { "batch_size": 1, "epoch": 0.5592, "step": 1398, "tokens_per_device": 4658 }, { "epoch": 0.5592, "loss_ce": 0.019212301820516586, "loss_lvr": 0.1854459047317505, "loss_mode_switch": 0.0, "loss_total": 0.037756890058517456, "step": 1398 }, { "batch_size": 4, "epoch": 0.5592, "step": 1398, "tokens_per_device": 1328 }, { "epoch": 0.5592, "loss_ce": 0.6394421458244324, "loss_lvr": 1.0630260705947876, "loss_mode_switch": 0.0, "loss_total": 0.7457447648048401, "step": 1398 }, { "batch_size": 1, "epoch": 0.5592, "step": 1398, "tokens_per_device": 4487 }, { "epoch": 0.5592, "loss_ce": 0.09330803155899048, "loss_lvr": 0.5517367720603943, "loss_mode_switch": 0.0, "loss_total": 0.14848171174526215, "step": 1398 }, { "batch_size": 4, "epoch": 0.5592, "step": 1398, "tokens_per_device": 5684 }, { "epoch": 0.5592, "loss_ce": 0.07072348892688751, "loss_lvr": 0.7182085514068604, "loss_mode_switch": 0.0, "loss_total": 0.14254434406757355, "step": 1398 }, { "batch_size": 1, "epoch": 0.5592, "step": 1398, "tokens_per_device": 4881 }, { "epoch": 0.5592, "loss_ce": 0.010827158577740192, "loss_lvr": 0.2819763720035553, "loss_mode_switch": 0.0, "loss_total": 0.03902479633688927, "step": 1398 }, { "batch_size": 1, "epoch": 0.5592, "step": 1398, "tokens_per_device": 5542 }, { "epoch": 0.5592, "loss_ce": 0.04584210366010666, "loss_lvr": 0.28463199734687805, "loss_mode_switch": 0.0, "loss_total": 0.07430530339479446, "step": 1398 }, { "batch_size": 4, "epoch": 0.5592, "step": 1398, "tokens_per_device": 2756 }, { "epoch": 0.5592, "loss_ce": 0.48324820399284363, "loss_lvr": 1.0868215560913086, "loss_mode_switch": 0.0, "loss_total": 0.5919303894042969, "step": 1398 }, { "batch_size": 4, "epoch": 0.5592, "step": 1398, "tokens_per_device": 4376 }, { "epoch": 0.5592, "loss_ce": 0.14291392266750336, "loss_lvr": 1.0365394353866577, "loss_mode_switch": 0.0, "loss_total": 0.24656787514686584, "step": 1398 }, { "epoch": 0.5596, "grad_norm": 1.3715709447860718, "learning_rate": 4.280266588240294e-06, "loss": 0.312, "step": 1399 }, { "batch_size": 4, "epoch": 0.5596, "step": 1399, "tokens_per_device": 7984 }, { "epoch": 0.5596, "loss_ce": 0.21247823536396027, "loss_lvr": 0.7468228936195374, "loss_mode_switch": 0.0, "loss_total": 0.2871605157852173, "step": 1399 }, { "batch_size": 4, "epoch": 0.5596, "step": 1399, "tokens_per_device": 4528 }, { "epoch": 0.5596, "loss_ce": 0.19834992289543152, "loss_lvr": 0.7810953259468079, "loss_mode_switch": 0.0, "loss_total": 0.2764594554901123, "step": 1399 }, { "batch_size": 4, "epoch": 0.5596, "step": 1399, "tokens_per_device": 2512 }, { "epoch": 0.5596, "loss_ce": 0.13057692348957062, "loss_lvr": 1.1337436437606812, "loss_mode_switch": 0.0, "loss_total": 0.24395129084587097, "step": 1399 }, { "batch_size": 1, "epoch": 0.5596, "step": 1399, "tokens_per_device": 5100 }, { "epoch": 0.5596, "loss_ce": 0.0035792668350040913, "loss_lvr": 0.2593468427658081, "loss_mode_switch": 0.0, "loss_total": 0.029513951390981674, "step": 1399 }, { "batch_size": 4, "epoch": 0.5596, "step": 1399, "tokens_per_device": 4220 }, { "epoch": 0.5596, "loss_ce": 0.22188246250152588, "loss_lvr": 0.6083754301071167, "loss_mode_switch": 0.0, "loss_total": 0.28271999955177307, "step": 1399 }, { "batch_size": 4, "epoch": 0.5596, "step": 1399, "tokens_per_device": 4384 }, { "epoch": 0.5596, "loss_ce": 0.12930232286453247, "loss_lvr": 0.6199958920478821, "loss_mode_switch": 0.0, "loss_total": 0.19130191206932068, "step": 1399 }, { "batch_size": 1, "epoch": 0.5596, "step": 1399, "tokens_per_device": 5108 }, { "epoch": 0.5596, "loss_ce": 0.019713934510946274, "loss_lvr": 0.2706916928291321, "loss_mode_switch": 0.0, "loss_total": 0.04678310453891754, "step": 1399 }, { "batch_size": 4, "epoch": 0.5596, "step": 1399, "tokens_per_device": 4492 }, { "epoch": 0.5596, "loss_ce": 0.11782549321651459, "loss_lvr": 0.7690432667732239, "loss_mode_switch": 0.0, "loss_total": 0.19472981989383698, "step": 1399 }, { "epoch": 0.56, "grad_norm": 1.3809846639633179, "learning_rate": 4.27385714377255e-06, "loss": 0.2931, "step": 1400 }, { "batch_size": 1, "epoch": 0.56, "step": 1400, "tokens_per_device": 4865 }, { "epoch": 0.56, "loss_ce": 0.001863441662862897, "loss_lvr": 0.3912034332752228, "loss_mode_switch": 0.0, "loss_total": 0.040983784943819046, "step": 1400 }, { "batch_size": 4, "epoch": 0.56, "step": 1400, "tokens_per_device": 2952 }, { "epoch": 0.56, "loss_ce": 0.10375615954399109, "loss_lvr": 1.1701815128326416, "loss_mode_switch": 0.0, "loss_total": 0.2207743227481842, "step": 1400 }, { "batch_size": 4, "epoch": 0.56, "step": 1400, "tokens_per_device": 4368 }, { "epoch": 0.56, "loss_ce": 0.19826319813728333, "loss_lvr": 0.8245236873626709, "loss_mode_switch": 0.0, "loss_total": 0.28071558475494385, "step": 1400 }, { "batch_size": 4, "epoch": 0.56, "step": 1400, "tokens_per_device": 6568 }, { "epoch": 0.56, "loss_ce": 0.46462804079055786, "loss_lvr": 0.795211911201477, "loss_mode_switch": 0.0, "loss_total": 0.5441492199897766, "step": 1400 }, { "batch_size": 4, "epoch": 0.56, "step": 1400, "tokens_per_device": 4044 }, { "epoch": 0.56, "loss_ce": 0.3101586401462555, "loss_lvr": 0.7187903523445129, "loss_mode_switch": 0.0, "loss_total": 0.3820376694202423, "step": 1400 }, { "batch_size": 4, "epoch": 0.56, "step": 1400, "tokens_per_device": 5704 }, { "epoch": 0.56, "loss_ce": 0.14557555317878723, "loss_lvr": 0.4976500868797302, "loss_mode_switch": 0.0, "loss_total": 0.19534055888652802, "step": 1400 }, { "batch_size": 1, "epoch": 0.56, "step": 1400, "tokens_per_device": 4936 }, { "epoch": 0.56, "loss_ce": 0.5855559706687927, "loss_lvr": 0.78435879945755, "loss_mode_switch": 0.0, "loss_total": 0.6639918684959412, "step": 1400 }, { "batch_size": 1, "epoch": 0.56, "step": 1400, "tokens_per_device": 7417 }, { "epoch": 0.56, "loss_ce": 0.0006445045000873506, "loss_lvr": 0.2606358826160431, "loss_mode_switch": 0.0, "loss_total": 0.026708094403147697, "step": 1400 }, { "epoch": 0.5604, "grad_norm": 1.3247286081314087, "learning_rate": 4.267448918008892e-06, "loss": 0.293, "step": 1401 }, { "batch_size": 4, "epoch": 0.5604, "step": 1401, "tokens_per_device": 3468 }, { "epoch": 0.5604, "loss_ce": 0.27597177028656006, "loss_lvr": 1.0205793380737305, "loss_mode_switch": 0.0, "loss_total": 0.3780297040939331, "step": 1401 }, { "batch_size": 4, "epoch": 0.5604, "step": 1401, "tokens_per_device": 4292 }, { "epoch": 0.5604, "loss_ce": 0.2158529907464981, "loss_lvr": 1.0633431673049927, "loss_mode_switch": 0.0, "loss_total": 0.32218730449676514, "step": 1401 }, { "batch_size": 1, "epoch": 0.5604, "step": 1401, "tokens_per_device": 6251 }, { "epoch": 0.5604, "loss_ce": 0.001317972899414599, "loss_lvr": 0.39994698762893677, "loss_mode_switch": 0.0, "loss_total": 0.041312672197818756, "step": 1401 }, { "batch_size": 1, "epoch": 0.5604, "step": 1401, "tokens_per_device": 4891 }, { "epoch": 0.5604, "loss_ce": 0.005559225101023912, "loss_lvr": 0.16753186285495758, "loss_mode_switch": 0.0, "loss_total": 0.022312412038445473, "step": 1401 }, { "batch_size": 1, "epoch": 0.5604, "step": 1401, "tokens_per_device": 4730 }, { "epoch": 0.5604, "loss_ce": 0.0022380920127034187, "loss_lvr": 0.5707868933677673, "loss_mode_switch": 0.0, "loss_total": 0.05931678041815758, "step": 1401 }, { "batch_size": 4, "epoch": 0.5604, "step": 1401, "tokens_per_device": 5268 }, { "epoch": 0.5604, "loss_ce": 0.3256619870662689, "loss_lvr": 0.7629260420799255, "loss_mode_switch": 0.0, "loss_total": 0.4019545912742615, "step": 1401 }, { "batch_size": 1, "epoch": 0.5604, "step": 1401, "tokens_per_device": 4897 }, { "epoch": 0.5604, "loss_ce": 0.036299336701631546, "loss_lvr": 0.18813928961753845, "loss_mode_switch": 0.0, "loss_total": 0.05511326342821121, "step": 1401 }, { "batch_size": 4, "epoch": 0.5604, "step": 1401, "tokens_per_device": 4268 }, { "epoch": 0.5604, "loss_ce": 0.195020854473114, "loss_lvr": 1.7720357179641724, "loss_mode_switch": 0.0, "loss_total": 0.37222445011138916, "step": 1401 }, { "epoch": 0.5608, "grad_norm": 1.2234703302383423, "learning_rate": 4.2610419217044115e-06, "loss": 0.2995, "step": 1402 }, { "batch_size": 4, "epoch": 0.5608, "step": 1402, "tokens_per_device": 1436 }, { "epoch": 0.5608, "loss_ce": 0.40929776430130005, "loss_lvr": 0.9139484167098999, "loss_mode_switch": 0.0, "loss_total": 0.50069260597229, "step": 1402 }, { "batch_size": 4, "epoch": 0.5608, "step": 1402, "tokens_per_device": 1904 }, { "epoch": 0.5608, "loss_ce": 0.3700755834579468, "loss_lvr": 1.0057079792022705, "loss_mode_switch": 0.0, "loss_total": 0.47064638137817383, "step": 1402 }, { "batch_size": 1, "epoch": 0.5608, "step": 1402, "tokens_per_device": 4903 }, { "epoch": 0.5608, "loss_ce": 0.021104075014591217, "loss_lvr": 1.1292719841003418, "loss_mode_switch": 0.0, "loss_total": 0.1340312659740448, "step": 1402 }, { "batch_size": 4, "epoch": 0.5608, "step": 1402, "tokens_per_device": 4492 }, { "epoch": 0.5608, "loss_ce": 0.004472311586141586, "loss_lvr": 0.5743716359138489, "loss_mode_switch": 0.0, "loss_total": 0.061909474432468414, "step": 1402 }, { "batch_size": 4, "epoch": 0.5608, "step": 1402, "tokens_per_device": 3372 }, { "epoch": 0.5608, "loss_ce": 0.10741623491048813, "loss_lvr": 1.0676804780960083, "loss_mode_switch": 0.0, "loss_total": 0.21418428421020508, "step": 1402 }, { "batch_size": 1, "epoch": 0.5608, "step": 1402, "tokens_per_device": 4742 }, { "epoch": 0.5608, "loss_ce": 0.0005958224646747112, "loss_lvr": 0.37974902987480164, "loss_mode_switch": 0.0, "loss_total": 0.03857072815299034, "step": 1402 }, { "batch_size": 4, "epoch": 0.5608, "step": 1402, "tokens_per_device": 14620 }, { "epoch": 0.5608, "loss_ce": 0.25291571021080017, "loss_lvr": 1.3999650478363037, "loss_mode_switch": 0.0, "loss_total": 0.39291220903396606, "step": 1402 }, { "batch_size": 4, "epoch": 0.5608, "step": 1402, "tokens_per_device": 5932 }, { "epoch": 0.5608, "loss_ce": 0.5258662700653076, "loss_lvr": 0.7692819237709045, "loss_mode_switch": 0.0, "loss_total": 0.6027944684028625, "step": 1402 }, { "epoch": 0.5612, "grad_norm": 1.3237683773040771, "learning_rate": 4.254636165612135e-06, "loss": 0.2783, "step": 1403 }, { "batch_size": 1, "epoch": 0.5612, "step": 1403, "tokens_per_device": 4891 }, { "epoch": 0.5612, "loss_ce": 0.06075190380215645, "loss_lvr": 0.24278146028518677, "loss_mode_switch": 0.0, "loss_total": 0.08503004908561707, "step": 1403 }, { "batch_size": 4, "epoch": 0.5612, "step": 1403, "tokens_per_device": 4440 }, { "epoch": 0.5612, "loss_ce": 0.07522714138031006, "loss_lvr": 0.9940058588981628, "loss_mode_switch": 0.0, "loss_total": 0.17462772130966187, "step": 1403 }, { "batch_size": 4, "epoch": 0.5612, "step": 1403, "tokens_per_device": 7836 }, { "epoch": 0.5612, "loss_ce": 0.435819536447525, "loss_lvr": 0.8854604959487915, "loss_mode_switch": 0.0, "loss_total": 0.5243656039237976, "step": 1403 }, { "batch_size": 4, "epoch": 0.5612, "step": 1403, "tokens_per_device": 2812 }, { "epoch": 0.5612, "loss_ce": 0.2840302288532257, "loss_lvr": 0.7714678049087524, "loss_mode_switch": 0.0, "loss_total": 0.3611770272254944, "step": 1403 }, { "batch_size": 4, "epoch": 0.5612, "step": 1403, "tokens_per_device": 5892 }, { "epoch": 0.5612, "loss_ce": 0.05129440128803253, "loss_lvr": 0.7944496870040894, "loss_mode_switch": 0.0, "loss_total": 0.13073936104774475, "step": 1403 }, { "batch_size": 4, "epoch": 0.5612, "step": 1403, "tokens_per_device": 1540 }, { "epoch": 0.5612, "loss_ce": 0.3786957263946533, "loss_lvr": 1.236783504486084, "loss_mode_switch": 0.0, "loss_total": 0.5023740530014038, "step": 1403 }, { "batch_size": 4, "epoch": 0.5612, "step": 1403, "tokens_per_device": 3816 }, { "epoch": 0.5612, "loss_ce": 0.33603715896606445, "loss_lvr": 0.8804751038551331, "loss_mode_switch": 0.0, "loss_total": 0.4240846633911133, "step": 1403 }, { "batch_size": 4, "epoch": 0.5612, "step": 1403, "tokens_per_device": 5736 }, { "epoch": 0.5612, "loss_ce": 0.30096831917762756, "loss_lvr": 0.8783282041549683, "loss_mode_switch": 0.0, "loss_total": 0.3888011574745178, "step": 1403 }, { "epoch": 0.5616, "grad_norm": 1.504142165184021, "learning_rate": 4.248231660483002e-06, "loss": 0.3385, "step": 1404 }, { "batch_size": 1, "epoch": 0.5616, "step": 1404, "tokens_per_device": 5103 }, { "epoch": 0.5616, "loss_ce": 0.1112542599439621, "loss_lvr": 0.4955703318119049, "loss_mode_switch": 0.0, "loss_total": 0.16081129014492035, "step": 1404 }, { "batch_size": 4, "epoch": 0.5616, "step": 1404, "tokens_per_device": 9360 }, { "epoch": 0.5616, "loss_ce": 0.28528743982315063, "loss_lvr": 0.86855548620224, "loss_mode_switch": 0.0, "loss_total": 0.3721430003643036, "step": 1404 }, { "batch_size": 1, "epoch": 0.5616, "step": 1404, "tokens_per_device": 5172 }, { "epoch": 0.5616, "loss_ce": 0.009927133098244667, "loss_lvr": 0.40250226855278015, "loss_mode_switch": 0.0, "loss_total": 0.05017735809087753, "step": 1404 }, { "batch_size": 1, "epoch": 0.5616, "step": 1404, "tokens_per_device": 5121 }, { "epoch": 0.5616, "loss_ce": 0.0021260136272758245, "loss_lvr": 0.5101525187492371, "loss_mode_switch": 0.0, "loss_total": 0.053141266107559204, "step": 1404 }, { "batch_size": 4, "epoch": 0.5616, "step": 1404, "tokens_per_device": 4180 }, { "epoch": 0.5616, "loss_ce": 0.27107536792755127, "loss_lvr": 0.9179742932319641, "loss_mode_switch": 0.0, "loss_total": 0.36287280917167664, "step": 1404 }, { "batch_size": 4, "epoch": 0.5616, "step": 1404, "tokens_per_device": 11080 }, { "epoch": 0.5616, "loss_ce": 0.04547679424285889, "loss_lvr": 0.9646007418632507, "loss_mode_switch": 0.0, "loss_total": 0.14193686842918396, "step": 1404 }, { "batch_size": 4, "epoch": 0.5616, "step": 1404, "tokens_per_device": 1932 }, { "epoch": 0.5616, "loss_ce": 0.16503815352916718, "loss_lvr": 0.9555715322494507, "loss_mode_switch": 0.0, "loss_total": 0.26059532165527344, "step": 1404 }, { "batch_size": 4, "epoch": 0.5616, "step": 1404, "tokens_per_device": 6564 }, { "epoch": 0.5616, "loss_ce": 0.045979853719472885, "loss_lvr": 0.6582716107368469, "loss_mode_switch": 0.0, "loss_total": 0.11180701851844788, "step": 1404 }, { "epoch": 0.562, "grad_norm": 1.3313902616500854, "learning_rate": 4.24182841706586e-06, "loss": 0.2778, "step": 1405 }, { "batch_size": 4, "epoch": 0.562, "step": 1405, "tokens_per_device": 1428 }, { "epoch": 0.562, "loss_ce": 0.2914141118526459, "loss_lvr": 1.068338394165039, "loss_mode_switch": 0.0, "loss_total": 0.39824795722961426, "step": 1405 }, { "batch_size": 1, "epoch": 0.562, "step": 1405, "tokens_per_device": 4890 }, { "epoch": 0.562, "loss_ce": 0.027967436239123344, "loss_lvr": 0.34637171030044556, "loss_mode_switch": 0.0, "loss_total": 0.06260460615158081, "step": 1405 }, { "batch_size": 4, "epoch": 0.562, "step": 1405, "tokens_per_device": 4212 }, { "epoch": 0.562, "loss_ce": 0.2765139043331146, "loss_lvr": 0.8253902196884155, "loss_mode_switch": 0.0, "loss_total": 0.3590529263019562, "step": 1405 }, { "batch_size": 4, "epoch": 0.562, "step": 1405, "tokens_per_device": 2724 }, { "epoch": 0.562, "loss_ce": 0.30659767985343933, "loss_lvr": 0.7465040683746338, "loss_mode_switch": 0.0, "loss_total": 0.3812480866909027, "step": 1405 }, { "batch_size": 4, "epoch": 0.562, "step": 1405, "tokens_per_device": 2896 }, { "epoch": 0.562, "loss_ce": 0.059750035405159, "loss_lvr": 0.6524408459663391, "loss_mode_switch": 0.0, "loss_total": 0.12499412149190903, "step": 1405 }, { "batch_size": 4, "epoch": 0.562, "step": 1405, "tokens_per_device": 2612 }, { "epoch": 0.562, "loss_ce": 0.7212440371513367, "loss_lvr": 1.2804709672927856, "loss_mode_switch": 0.0, "loss_total": 0.8492911458015442, "step": 1405 }, { "batch_size": 4, "epoch": 0.562, "step": 1405, "tokens_per_device": 4440 }, { "epoch": 0.562, "loss_ce": 0.2804509997367859, "loss_lvr": 0.8346677422523499, "loss_mode_switch": 0.0, "loss_total": 0.3639177680015564, "step": 1405 }, { "batch_size": 4, "epoch": 0.562, "step": 1405, "tokens_per_device": 5696 }, { "epoch": 0.562, "loss_ce": 0.15562669932842255, "loss_lvr": 0.8007151484489441, "loss_mode_switch": 0.0, "loss_total": 0.23569822311401367, "step": 1405 }, { "epoch": 0.5624, "grad_norm": 1.352304458618164, "learning_rate": 4.235426446107437e-06, "loss": 0.3068, "step": 1406 }, { "batch_size": 1, "epoch": 0.5624, "step": 1406, "tokens_per_device": 5159 }, { "epoch": 0.5624, "loss_ce": 0.07944805920124054, "loss_lvr": 0.5093977451324463, "loss_mode_switch": 0.0, "loss_total": 0.13038784265518188, "step": 1406 }, { "batch_size": 4, "epoch": 0.5624, "step": 1406, "tokens_per_device": 5100 }, { "epoch": 0.5624, "loss_ce": 0.2085060030221939, "loss_lvr": 0.6881610155105591, "loss_mode_switch": 0.0, "loss_total": 0.27732211351394653, "step": 1406 }, { "batch_size": 1, "epoch": 0.5624, "step": 1406, "tokens_per_device": 5107 }, { "epoch": 0.5624, "loss_ce": 0.006621482316404581, "loss_lvr": 0.2952646315097809, "loss_mode_switch": 0.0, "loss_total": 0.03614794462919235, "step": 1406 }, { "batch_size": 4, "epoch": 0.5624, "step": 1406, "tokens_per_device": 1332 }, { "epoch": 0.5624, "loss_ce": 0.6145386695861816, "loss_lvr": 1.0072439908981323, "loss_mode_switch": 0.0, "loss_total": 0.7152630686759949, "step": 1406 }, { "batch_size": 4, "epoch": 0.5624, "step": 1406, "tokens_per_device": 6352 }, { "epoch": 0.5624, "loss_ce": 0.7000148892402649, "loss_lvr": 1.2967157363891602, "loss_mode_switch": 0.0, "loss_total": 0.8296864628791809, "step": 1406 }, { "batch_size": 4, "epoch": 0.5624, "step": 1406, "tokens_per_device": 1876 }, { "epoch": 0.5624, "loss_ce": 0.22334380447864532, "loss_lvr": 1.0471707582473755, "loss_mode_switch": 0.0, "loss_total": 0.3280608654022217, "step": 1406 }, { "batch_size": 4, "epoch": 0.5624, "step": 1406, "tokens_per_device": 4216 }, { "epoch": 0.5624, "loss_ce": 0.4697993993759155, "loss_lvr": 0.615278422832489, "loss_mode_switch": 0.0, "loss_total": 0.5313272476196289, "step": 1406 }, { "batch_size": 1, "epoch": 0.5624, "step": 1406, "tokens_per_device": 5056 }, { "epoch": 0.5624, "loss_ce": 0.15715990960597992, "loss_lvr": 1.042075276374817, "loss_mode_switch": 0.0, "loss_total": 0.26136744022369385, "step": 1406 }, { "epoch": 0.5628, "grad_norm": 1.6574431657791138, "learning_rate": 4.229025758352322e-06, "loss": 0.3352, "step": 1407 }, { "batch_size": 4, "epoch": 0.5628, "step": 1407, "tokens_per_device": 4472 }, { "epoch": 0.5628, "loss_ce": 0.17753157019615173, "loss_lvr": 0.5365193486213684, "loss_mode_switch": 0.0, "loss_total": 0.2311834990978241, "step": 1407 }, { "batch_size": 4, "epoch": 0.5628, "step": 1407, "tokens_per_device": 4292 }, { "epoch": 0.5628, "loss_ce": 0.250230997800827, "loss_lvr": 0.8511174917221069, "loss_mode_switch": 0.0, "loss_total": 0.33534276485443115, "step": 1407 }, { "batch_size": 4, "epoch": 0.5628, "step": 1407, "tokens_per_device": 4908 }, { "epoch": 0.5628, "loss_ce": 0.16591012477874756, "loss_lvr": 0.9175328612327576, "loss_mode_switch": 0.0, "loss_total": 0.25766342878341675, "step": 1407 }, { "batch_size": 4, "epoch": 0.5628, "step": 1407, "tokens_per_device": 4352 }, { "epoch": 0.5628, "loss_ce": 0.2464413046836853, "loss_lvr": 0.9852445721626282, "loss_mode_switch": 0.0, "loss_total": 0.34496575593948364, "step": 1407 }, { "batch_size": 4, "epoch": 0.5628, "step": 1407, "tokens_per_device": 3816 }, { "epoch": 0.5628, "loss_ce": 0.6287040114402771, "loss_lvr": 1.3664050102233887, "loss_mode_switch": 0.0, "loss_total": 0.765344500541687, "step": 1407 }, { "batch_size": 4, "epoch": 0.5628, "step": 1407, "tokens_per_device": 5088 }, { "epoch": 0.5628, "loss_ce": 0.35655295848846436, "loss_lvr": 0.7705724239349365, "loss_mode_switch": 0.0, "loss_total": 0.433610200881958, "step": 1407 }, { "batch_size": 1, "epoch": 0.5628, "step": 1407, "tokens_per_device": 5199 }, { "epoch": 0.5628, "loss_ce": 0.00046253486652858555, "loss_lvr": 0.46557459235191345, "loss_mode_switch": 0.0, "loss_total": 0.047019995748996735, "step": 1407 }, { "batch_size": 4, "epoch": 0.5628, "step": 1407, "tokens_per_device": 4256 }, { "epoch": 0.5628, "loss_ce": 0.06444770097732544, "loss_lvr": 0.7680298686027527, "loss_mode_switch": 0.0, "loss_total": 0.14125069975852966, "step": 1407 }, { "epoch": 0.5632, "grad_norm": 1.2752084732055664, "learning_rate": 4.2226263645429536e-06, "loss": 0.3129, "step": 1408 }, { "batch_size": 4, "epoch": 0.5632, "step": 1408, "tokens_per_device": 4068 }, { "epoch": 0.5632, "loss_ce": 0.10392510890960693, "loss_lvr": 0.9436850547790527, "loss_mode_switch": 0.0, "loss_total": 0.19829362630844116, "step": 1408 }, { "batch_size": 4, "epoch": 0.5632, "step": 1408, "tokens_per_device": 4292 }, { "epoch": 0.5632, "loss_ce": 0.14980663359165192, "loss_lvr": 0.9021155834197998, "loss_mode_switch": 0.0, "loss_total": 0.24001818895339966, "step": 1408 }, { "batch_size": 4, "epoch": 0.5632, "step": 1408, "tokens_per_device": 10056 }, { "epoch": 0.5632, "loss_ce": 0.12171149998903275, "loss_lvr": 0.691410481929779, "loss_mode_switch": 0.0, "loss_total": 0.190852552652359, "step": 1408 }, { "batch_size": 4, "epoch": 0.5632, "step": 1408, "tokens_per_device": 5920 }, { "epoch": 0.5632, "loss_ce": 0.010050144046545029, "loss_lvr": 0.714398980140686, "loss_mode_switch": 0.0, "loss_total": 0.08149003982543945, "step": 1408 }, { "batch_size": 4, "epoch": 0.5632, "step": 1408, "tokens_per_device": 6084 }, { "epoch": 0.5632, "loss_ce": 0.27979177236557007, "loss_lvr": 0.5680266618728638, "loss_mode_switch": 0.0, "loss_total": 0.33659443259239197, "step": 1408 }, { "batch_size": 4, "epoch": 0.5632, "step": 1408, "tokens_per_device": 10164 }, { "epoch": 0.5632, "loss_ce": 0.2713650166988373, "loss_lvr": 0.7818512320518494, "loss_mode_switch": 0.0, "loss_total": 0.34955012798309326, "step": 1408 }, { "batch_size": 1, "epoch": 0.5632, "step": 1408, "tokens_per_device": 5023 }, { "epoch": 0.5632, "loss_ce": 0.15621808171272278, "loss_lvr": 0.5314728021621704, "loss_mode_switch": 0.0, "loss_total": 0.2093653678894043, "step": 1408 }, { "batch_size": 1, "epoch": 0.5632, "step": 1408, "tokens_per_device": 4778 }, { "epoch": 0.5632, "loss_ce": 0.09564445912837982, "loss_lvr": 0.3446239233016968, "loss_mode_switch": 0.0, "loss_total": 0.1301068514585495, "step": 1408 }, { "epoch": 0.5636, "grad_norm": 1.2271127700805664, "learning_rate": 4.216228275419598e-06, "loss": 0.2977, "step": 1409 }, { "batch_size": 4, "epoch": 0.5636, "step": 1409, "tokens_per_device": 3636 }, { "epoch": 0.5636, "loss_ce": 0.025947056710720062, "loss_lvr": 0.8301697969436646, "loss_mode_switch": 0.0, "loss_total": 0.10896404087543488, "step": 1409 }, { "batch_size": 4, "epoch": 0.5636, "step": 1409, "tokens_per_device": 10940 }, { "epoch": 0.5636, "loss_ce": 0.6266902685165405, "loss_lvr": 1.0140573978424072, "loss_mode_switch": 0.0, "loss_total": 0.7280960083007812, "step": 1409 }, { "batch_size": 4, "epoch": 0.5636, "step": 1409, "tokens_per_device": 4240 }, { "epoch": 0.5636, "loss_ce": 0.47935402393341064, "loss_lvr": 0.9106521010398865, "loss_mode_switch": 0.0, "loss_total": 0.5704192519187927, "step": 1409 }, { "batch_size": 1, "epoch": 0.5636, "step": 1409, "tokens_per_device": 4881 }, { "epoch": 0.5636, "loss_ce": 0.11058198660612106, "loss_lvr": 0.30162692070007324, "loss_mode_switch": 0.0, "loss_total": 0.14074468612670898, "step": 1409 }, { "batch_size": 1, "epoch": 0.5636, "step": 1409, "tokens_per_device": 5088 }, { "epoch": 0.5636, "loss_ce": 0.07803717255592346, "loss_lvr": 0.17759175598621368, "loss_mode_switch": 0.0, "loss_total": 0.09579634666442871, "step": 1409 }, { "batch_size": 4, "epoch": 0.5636, "step": 1409, "tokens_per_device": 1760 }, { "epoch": 0.5636, "loss_ce": 0.3755696713924408, "loss_lvr": 0.9511927962303162, "loss_mode_switch": 0.0, "loss_total": 0.47068893909454346, "step": 1409 }, { "batch_size": 4, "epoch": 0.5636, "step": 1409, "tokens_per_device": 5052 }, { "epoch": 0.5636, "loss_ce": 0.2101963758468628, "loss_lvr": 0.793491780757904, "loss_mode_switch": 0.0, "loss_total": 0.28954556584358215, "step": 1409 }, { "batch_size": 1, "epoch": 0.5636, "step": 1409, "tokens_per_device": 5013 }, { "epoch": 0.5636, "loss_ce": 0.033923860639333725, "loss_lvr": 0.49660319089889526, "loss_mode_switch": 0.0, "loss_total": 0.08358418196439743, "step": 1409 }, { "epoch": 0.564, "grad_norm": 1.365234136581421, "learning_rate": 4.209831501720328e-06, "loss": 0.2973, "step": 1410 }, { "batch_size": 4, "epoch": 0.564, "step": 1410, "tokens_per_device": 4208 }, { "epoch": 0.564, "loss_ce": 0.24526770412921906, "loss_lvr": 0.7606627941131592, "loss_mode_switch": 0.0, "loss_total": 0.32133397459983826, "step": 1410 }, { "batch_size": 4, "epoch": 0.564, "step": 1410, "tokens_per_device": 2560 }, { "epoch": 0.564, "loss_ce": 0.44067826867103577, "loss_lvr": 0.9524871706962585, "loss_mode_switch": 0.0, "loss_total": 0.5359269976615906, "step": 1410 }, { "batch_size": 1, "epoch": 0.564, "step": 1410, "tokens_per_device": 5126 }, { "epoch": 0.564, "loss_ce": 0.027032481506466866, "loss_lvr": 0.4366165101528168, "loss_mode_switch": 0.0, "loss_total": 0.07069413363933563, "step": 1410 }, { "batch_size": 4, "epoch": 0.564, "step": 1410, "tokens_per_device": 9564 }, { "epoch": 0.564, "loss_ce": 0.14163637161254883, "loss_lvr": 0.7134753465652466, "loss_mode_switch": 0.0, "loss_total": 0.2129839062690735, "step": 1410 }, { "batch_size": 4, "epoch": 0.564, "step": 1410, "tokens_per_device": 4216 }, { "epoch": 0.564, "loss_ce": 0.11410316079854965, "loss_lvr": 0.8110047578811646, "loss_mode_switch": 0.0, "loss_total": 0.19520363211631775, "step": 1410 }, { "batch_size": 4, "epoch": 0.564, "step": 1410, "tokens_per_device": 2936 }, { "epoch": 0.564, "loss_ce": 0.40710100531578064, "loss_lvr": 1.1219652891159058, "loss_mode_switch": 0.0, "loss_total": 0.5192975401878357, "step": 1410 }, { "batch_size": 1, "epoch": 0.564, "step": 1410, "tokens_per_device": 5011 }, { "epoch": 0.564, "loss_ce": 0.012311255559325218, "loss_lvr": 0.6149311065673828, "loss_mode_switch": 0.0, "loss_total": 0.07380436360836029, "step": 1410 }, { "batch_size": 4, "epoch": 0.564, "step": 1410, "tokens_per_device": 1464 }, { "epoch": 0.564, "loss_ce": 0.42652589082717896, "loss_lvr": 0.9876840710639954, "loss_mode_switch": 0.0, "loss_total": 0.525294303894043, "step": 1410 }, { "epoch": 0.5644, "grad_norm": 1.4170589447021484, "learning_rate": 4.203436054181017e-06, "loss": 0.2646, "step": 1411 }, { "batch_size": 4, "epoch": 0.5644, "step": 1411, "tokens_per_device": 5220 }, { "epoch": 0.5644, "loss_ce": 0.35311388969421387, "loss_lvr": 0.7080555558204651, "loss_mode_switch": 0.0, "loss_total": 0.4239194393157959, "step": 1411 }, { "batch_size": 1, "epoch": 0.5644, "step": 1411, "tokens_per_device": 4879 }, { "epoch": 0.5644, "loss_ce": 0.0002612337120808661, "loss_lvr": 1.1517618894577026, "loss_mode_switch": 0.0, "loss_total": 0.11543742567300797, "step": 1411 }, { "batch_size": 4, "epoch": 0.5644, "step": 1411, "tokens_per_device": 4428 }, { "epoch": 0.5644, "loss_ce": 0.25059279799461365, "loss_lvr": 0.8075523972511292, "loss_mode_switch": 0.0, "loss_total": 0.3313480317592621, "step": 1411 }, { "batch_size": 1, "epoch": 0.5644, "step": 1411, "tokens_per_device": 5048 }, { "epoch": 0.5644, "loss_ce": 0.0067139011807739735, "loss_lvr": 0.32396355271339417, "loss_mode_switch": 0.0, "loss_total": 0.03911025822162628, "step": 1411 }, { "batch_size": 4, "epoch": 0.5644, "step": 1411, "tokens_per_device": 1388 }, { "epoch": 0.5644, "loss_ce": 0.2342013120651245, "loss_lvr": 0.8905861973762512, "loss_mode_switch": 0.0, "loss_total": 0.32325994968414307, "step": 1411 }, { "batch_size": 4, "epoch": 0.5644, "step": 1411, "tokens_per_device": 5616 }, { "epoch": 0.5644, "loss_ce": 0.35162511467933655, "loss_lvr": 1.0070334672927856, "loss_mode_switch": 0.0, "loss_total": 0.45232847332954407, "step": 1411 }, { "batch_size": 4, "epoch": 0.5644, "step": 1411, "tokens_per_device": 4316 }, { "epoch": 0.5644, "loss_ce": 0.08279900997877121, "loss_lvr": 0.8588375449180603, "loss_mode_switch": 0.0, "loss_total": 0.1686827689409256, "step": 1411 }, { "batch_size": 1, "epoch": 0.5644, "step": 1411, "tokens_per_device": 4902 }, { "epoch": 0.5644, "loss_ce": 0.38841015100479126, "loss_lvr": 0.5525132417678833, "loss_mode_switch": 0.0, "loss_total": 0.44366148114204407, "step": 1411 }, { "epoch": 0.5648, "grad_norm": 1.3981965780258179, "learning_rate": 4.197041943535307e-06, "loss": 0.2867, "step": 1412 }, { "batch_size": 4, "epoch": 0.5648, "step": 1412, "tokens_per_device": 4540 }, { "epoch": 0.5648, "loss_ce": 0.03297843784093857, "loss_lvr": 0.848063051700592, "loss_mode_switch": 0.0, "loss_total": 0.11778474599123001, "step": 1412 }, { "batch_size": 4, "epoch": 0.5648, "step": 1412, "tokens_per_device": 4768 }, { "epoch": 0.5648, "loss_ce": 0.29024577140808105, "loss_lvr": 0.7504218816757202, "loss_mode_switch": 0.0, "loss_total": 0.3652879595756531, "step": 1412 }, { "batch_size": 1, "epoch": 0.5648, "step": 1412, "tokens_per_device": 5071 }, { "epoch": 0.5648, "loss_ce": 0.0035111617762595415, "loss_lvr": 0.278735876083374, "loss_mode_switch": 0.0, "loss_total": 0.031384751200675964, "step": 1412 }, { "batch_size": 1, "epoch": 0.5648, "step": 1412, "tokens_per_device": 5138 }, { "epoch": 0.5648, "loss_ce": 0.00318303145468235, "loss_lvr": 0.3939191401004791, "loss_mode_switch": 0.0, "loss_total": 0.04257494956254959, "step": 1412 }, { "batch_size": 1, "epoch": 0.5648, "step": 1412, "tokens_per_device": 5029 }, { "epoch": 0.5648, "loss_ce": 0.050473652780056, "loss_lvr": 0.4149753153324127, "loss_mode_switch": 0.0, "loss_total": 0.09197118878364563, "step": 1412 }, { "batch_size": 4, "epoch": 0.5648, "step": 1412, "tokens_per_device": 2656 }, { "epoch": 0.5648, "loss_ce": 0.2858611047267914, "loss_lvr": 0.8612509369850159, "loss_mode_switch": 0.0, "loss_total": 0.3719862103462219, "step": 1412 }, { "batch_size": 4, "epoch": 0.5648, "step": 1412, "tokens_per_device": 4288 }, { "epoch": 0.5648, "loss_ce": 0.018319979310035706, "loss_lvr": 0.8959735035896301, "loss_mode_switch": 0.0, "loss_total": 0.10791733115911484, "step": 1412 }, { "batch_size": 4, "epoch": 0.5648, "step": 1412, "tokens_per_device": 4388 }, { "epoch": 0.5648, "loss_ce": 0.29377445578575134, "loss_lvr": 1.3329638242721558, "loss_mode_switch": 0.0, "loss_total": 0.42707085609436035, "step": 1412 }, { "epoch": 0.5652, "grad_norm": 1.3726457357406616, "learning_rate": 4.190649180514595e-06, "loss": 0.266, "step": 1413 }, { "batch_size": 1, "epoch": 0.5652, "step": 1413, "tokens_per_device": 4419 }, { "epoch": 0.5652, "loss_ce": 0.037239812314510345, "loss_lvr": 0.8303986191749573, "loss_mode_switch": 0.0, "loss_total": 0.12027967721223831, "step": 1413 }, { "batch_size": 4, "epoch": 0.5652, "step": 1413, "tokens_per_device": 4212 }, { "epoch": 0.5652, "loss_ce": 0.16314561665058136, "loss_lvr": 0.9460452198982239, "loss_mode_switch": 0.0, "loss_total": 0.25775015354156494, "step": 1413 }, { "batch_size": 4, "epoch": 0.5652, "step": 1413, "tokens_per_device": 1580 }, { "epoch": 0.5652, "loss_ce": 0.3737306594848633, "loss_lvr": 1.0097203254699707, "loss_mode_switch": 0.0, "loss_total": 0.4747026860713959, "step": 1413 }, { "batch_size": 4, "epoch": 0.5652, "step": 1413, "tokens_per_device": 4500 }, { "epoch": 0.5652, "loss_ce": 0.2761119604110718, "loss_lvr": 0.8019330501556396, "loss_mode_switch": 0.0, "loss_total": 0.3563052713871002, "step": 1413 }, { "batch_size": 4, "epoch": 0.5652, "step": 1413, "tokens_per_device": 4668 }, { "epoch": 0.5652, "loss_ce": 0.039323821663856506, "loss_lvr": 0.769875168800354, "loss_mode_switch": 0.0, "loss_total": 0.11631134152412415, "step": 1413 }, { "batch_size": 4, "epoch": 0.5652, "step": 1413, "tokens_per_device": 2896 }, { "epoch": 0.5652, "loss_ce": 0.02131914533674717, "loss_lvr": 0.3865860402584076, "loss_mode_switch": 0.0, "loss_total": 0.05997774749994278, "step": 1413 }, { "batch_size": 1, "epoch": 0.5652, "step": 1413, "tokens_per_device": 5143 }, { "epoch": 0.5652, "loss_ce": 0.13530264794826508, "loss_lvr": 0.19965817034244537, "loss_mode_switch": 0.0, "loss_total": 0.15526846051216125, "step": 1413 }, { "batch_size": 1, "epoch": 0.5652, "step": 1413, "tokens_per_device": 5725 }, { "epoch": 0.5652, "loss_ce": 0.007578810676932335, "loss_lvr": 0.35318753123283386, "loss_mode_switch": 0.0, "loss_total": 0.04289756715297699, "step": 1413 }, { "epoch": 0.5656, "grad_norm": 1.6349241733551025, "learning_rate": 4.18425777584802e-06, "loss": 0.367, "step": 1414 }, { "batch_size": 1, "epoch": 0.5656, "step": 1414, "tokens_per_device": 4880 }, { "epoch": 0.5656, "loss_ce": 0.04167143628001213, "loss_lvr": 0.5707333087921143, "loss_mode_switch": 0.0, "loss_total": 0.09874476492404938, "step": 1414 }, { "batch_size": 1, "epoch": 0.5656, "step": 1414, "tokens_per_device": 5001 }, { "epoch": 0.5656, "loss_ce": 0.21745260059833527, "loss_lvr": 0.49094903469085693, "loss_mode_switch": 0.0, "loss_total": 0.2665475010871887, "step": 1414 }, { "batch_size": 4, "epoch": 0.5656, "step": 1414, "tokens_per_device": 3960 }, { "epoch": 0.5656, "loss_ce": 0.1456523835659027, "loss_lvr": 0.9559112191200256, "loss_mode_switch": 0.0, "loss_total": 0.24124351143836975, "step": 1414 }, { "batch_size": 1, "epoch": 0.5656, "step": 1414, "tokens_per_device": 4918 }, { "epoch": 0.5656, "loss_ce": 0.0704592689871788, "loss_lvr": 0.32338786125183105, "loss_mode_switch": 0.0, "loss_total": 0.10279805958271027, "step": 1414 }, { "batch_size": 1, "epoch": 0.5656, "step": 1414, "tokens_per_device": 5118 }, { "epoch": 0.5656, "loss_ce": 0.0320737399160862, "loss_lvr": 0.8908947706222534, "loss_mode_switch": 0.0, "loss_total": 0.12116321921348572, "step": 1414 }, { "batch_size": 4, "epoch": 0.5656, "step": 1414, "tokens_per_device": 2668 }, { "epoch": 0.5656, "loss_ce": 0.09243805706501007, "loss_lvr": 1.0088458061218262, "loss_mode_switch": 0.0, "loss_total": 0.19332262873649597, "step": 1414 }, { "batch_size": 4, "epoch": 0.5656, "step": 1414, "tokens_per_device": 4008 }, { "epoch": 0.5656, "loss_ce": 0.09813007712364197, "loss_lvr": 0.7854776382446289, "loss_mode_switch": 0.0, "loss_total": 0.1766778528690338, "step": 1414 }, { "batch_size": 1, "epoch": 0.5656, "step": 1414, "tokens_per_device": 5139 }, { "epoch": 0.5656, "loss_ce": 0.2304869294166565, "loss_lvr": 0.27001920342445374, "loss_mode_switch": 0.0, "loss_total": 0.25748884677886963, "step": 1414 }, { "epoch": 0.566, "grad_norm": 1.2977898120880127, "learning_rate": 4.177867740262437e-06, "loss": 0.2637, "step": 1415 }, { "batch_size": 4, "epoch": 0.566, "step": 1415, "tokens_per_device": 1440 }, { "epoch": 0.566, "loss_ce": 0.33561697602272034, "loss_lvr": 1.0394307374954224, "loss_mode_switch": 0.0, "loss_total": 0.43956005573272705, "step": 1415 }, { "batch_size": 1, "epoch": 0.566, "step": 1415, "tokens_per_device": 4817 }, { "epoch": 0.566, "loss_ce": 0.1823815554380417, "loss_lvr": 0.4471186399459839, "loss_mode_switch": 0.0, "loss_total": 0.2270934283733368, "step": 1415 }, { "batch_size": 4, "epoch": 0.566, "step": 1415, "tokens_per_device": 3824 }, { "epoch": 0.566, "loss_ce": 0.3469277024269104, "loss_lvr": 1.25956392288208, "loss_mode_switch": 0.0, "loss_total": 0.4728841185569763, "step": 1415 }, { "batch_size": 1, "epoch": 0.566, "step": 1415, "tokens_per_device": 4701 }, { "epoch": 0.566, "loss_ce": 0.01970299892127514, "loss_lvr": 0.6711313724517822, "loss_mode_switch": 0.0, "loss_total": 0.08681613951921463, "step": 1415 }, { "batch_size": 1, "epoch": 0.566, "step": 1415, "tokens_per_device": 5110 }, { "epoch": 0.566, "loss_ce": 0.0008033796329982579, "loss_lvr": 0.3594927489757538, "loss_mode_switch": 0.0, "loss_total": 0.03675265610218048, "step": 1415 }, { "batch_size": 4, "epoch": 0.566, "step": 1415, "tokens_per_device": 4008 }, { "epoch": 0.566, "loss_ce": 0.025073062628507614, "loss_lvr": 0.8621525168418884, "loss_mode_switch": 0.0, "loss_total": 0.11128830909729004, "step": 1415 }, { "batch_size": 4, "epoch": 0.566, "step": 1415, "tokens_per_device": 4656 }, { "epoch": 0.566, "loss_ce": 0.16060645878314972, "loss_lvr": 0.9088408350944519, "loss_mode_switch": 0.0, "loss_total": 0.2514905333518982, "step": 1415 }, { "batch_size": 4, "epoch": 0.566, "step": 1415, "tokens_per_device": 4848 }, { "epoch": 0.566, "loss_ce": 0.33611586689949036, "loss_lvr": 0.8263828754425049, "loss_mode_switch": 0.0, "loss_total": 0.4187541604042053, "step": 1415 }, { "epoch": 0.5664, "grad_norm": 1.1758102178573608, "learning_rate": 4.171479084482408e-06, "loss": 0.2891, "step": 1416 }, { "batch_size": 4, "epoch": 0.5664, "step": 1416, "tokens_per_device": 7680 }, { "epoch": 0.5664, "loss_ce": 0.0628158375620842, "loss_lvr": 0.8642441630363464, "loss_mode_switch": 0.0, "loss_total": 0.14924025535583496, "step": 1416 }, { "batch_size": 1, "epoch": 0.5664, "step": 1416, "tokens_per_device": 4888 }, { "epoch": 0.5664, "loss_ce": 0.06322554498910904, "loss_lvr": 0.21458092331886292, "loss_mode_switch": 0.0, "loss_total": 0.08468364179134369, "step": 1416 }, { "batch_size": 4, "epoch": 0.5664, "step": 1416, "tokens_per_device": 2752 }, { "epoch": 0.5664, "loss_ce": 0.26120057702064514, "loss_lvr": 0.7557662129402161, "loss_mode_switch": 0.0, "loss_total": 0.3367772102355957, "step": 1416 }, { "batch_size": 1, "epoch": 0.5664, "step": 1416, "tokens_per_device": 5067 }, { "epoch": 0.5664, "loss_ce": 0.002249932149425149, "loss_lvr": 0.33192259073257446, "loss_mode_switch": 0.0, "loss_total": 0.03544219210743904, "step": 1416 }, { "batch_size": 4, "epoch": 0.5664, "step": 1416, "tokens_per_device": 5048 }, { "epoch": 0.5664, "loss_ce": 0.2402455061674118, "loss_lvr": 0.6808791756629944, "loss_mode_switch": 0.0, "loss_total": 0.3083334267139435, "step": 1416 }, { "batch_size": 4, "epoch": 0.5664, "step": 1416, "tokens_per_device": 3788 }, { "epoch": 0.5664, "loss_ce": 0.2909010350704193, "loss_lvr": 0.9551641345024109, "loss_mode_switch": 0.0, "loss_total": 0.3864174485206604, "step": 1416 }, { "batch_size": 1, "epoch": 0.5664, "step": 1416, "tokens_per_device": 5028 }, { "epoch": 0.5664, "loss_ce": 0.6728482246398926, "loss_lvr": 1.3034822940826416, "loss_mode_switch": 0.0, "loss_total": 0.8031964302062988, "step": 1416 }, { "batch_size": 1, "epoch": 0.5664, "step": 1416, "tokens_per_device": 5007 }, { "epoch": 0.5664, "loss_ce": 0.031124966219067574, "loss_lvr": 0.2337058186531067, "loss_mode_switch": 0.0, "loss_total": 0.05449555069208145, "step": 1416 }, { "epoch": 0.5668, "grad_norm": 1.5666522979736328, "learning_rate": 4.165091819230178e-06, "loss": 0.2957, "step": 1417 }, { "batch_size": 4, "epoch": 0.5668, "step": 1417, "tokens_per_device": 4212 }, { "epoch": 0.5668, "loss_ce": 0.0028496370650827885, "loss_lvr": 0.8058846592903137, "loss_mode_switch": 0.0, "loss_total": 0.08343810588121414, "step": 1417 }, { "batch_size": 4, "epoch": 0.5668, "step": 1417, "tokens_per_device": 4900 }, { "epoch": 0.5668, "loss_ce": 0.08652711659669876, "loss_lvr": 0.7705880403518677, "loss_mode_switch": 0.0, "loss_total": 0.16358593106269836, "step": 1417 }, { "batch_size": 4, "epoch": 0.5668, "step": 1417, "tokens_per_device": 2764 }, { "epoch": 0.5668, "loss_ce": 0.5778101086616516, "loss_lvr": 0.7392634749412537, "loss_mode_switch": 0.0, "loss_total": 0.6517364382743835, "step": 1417 }, { "batch_size": 4, "epoch": 0.5668, "step": 1417, "tokens_per_device": 4452 }, { "epoch": 0.5668, "loss_ce": 0.07930564135313034, "loss_lvr": 0.7860696315765381, "loss_mode_switch": 0.0, "loss_total": 0.15791261196136475, "step": 1417 }, { "batch_size": 4, "epoch": 0.5668, "step": 1417, "tokens_per_device": 3856 }, { "epoch": 0.5668, "loss_ce": 0.29007014632225037, "loss_lvr": 0.8755319118499756, "loss_mode_switch": 0.0, "loss_total": 0.3776233494281769, "step": 1417 }, { "batch_size": 1, "epoch": 0.5668, "step": 1417, "tokens_per_device": 5103 }, { "epoch": 0.5668, "loss_ce": 0.04854421690106392, "loss_lvr": 0.6640875935554504, "loss_mode_switch": 0.0, "loss_total": 0.11495298147201538, "step": 1417 }, { "batch_size": 4, "epoch": 0.5668, "step": 1417, "tokens_per_device": 2816 }, { "epoch": 0.5668, "loss_ce": 0.12395544350147247, "loss_lvr": 0.6307041049003601, "loss_mode_switch": 0.0, "loss_total": 0.18702584505081177, "step": 1417 }, { "batch_size": 1, "epoch": 0.5668, "step": 1417, "tokens_per_device": 4866 }, { "epoch": 0.5668, "loss_ce": 0.00037003745092079043, "loss_lvr": 0.2826146185398102, "loss_mode_switch": 0.0, "loss_total": 0.02863149903714657, "step": 1417 }, { "epoch": 0.5672, "grad_norm": 1.1290234327316284, "learning_rate": 4.1587059552256566e-06, "loss": 0.2515, "step": 1418 }, { "batch_size": 4, "epoch": 0.5672, "step": 1418, "tokens_per_device": 3792 }, { "epoch": 0.5672, "loss_ce": 0.1827377825975418, "loss_lvr": 1.0974491834640503, "loss_mode_switch": 0.0, "loss_total": 0.2924827039241791, "step": 1418 }, { "batch_size": 4, "epoch": 0.5672, "step": 1418, "tokens_per_device": 4212 }, { "epoch": 0.5672, "loss_ce": 0.2647944390773773, "loss_lvr": 0.8011741042137146, "loss_mode_switch": 0.0, "loss_total": 0.3449118435382843, "step": 1418 }, { "batch_size": 4, "epoch": 0.5672, "step": 1418, "tokens_per_device": 11904 }, { "epoch": 0.5672, "loss_ce": 0.5188950300216675, "loss_lvr": 0.932327151298523, "loss_mode_switch": 0.0, "loss_total": 0.6121277213096619, "step": 1418 }, { "batch_size": 1, "epoch": 0.5672, "step": 1418, "tokens_per_device": 7469 }, { "epoch": 0.5672, "loss_ce": 0.0005827732384204865, "loss_lvr": 0.5175737142562866, "loss_mode_switch": 0.0, "loss_total": 0.05234014615416527, "step": 1418 }, { "batch_size": 1, "epoch": 0.5672, "step": 1418, "tokens_per_device": 4858 }, { "epoch": 0.5672, "loss_ce": 0.003743477165699005, "loss_lvr": 0.20774446427822113, "loss_mode_switch": 0.0, "loss_total": 0.024517923593521118, "step": 1418 }, { "batch_size": 4, "epoch": 0.5672, "step": 1418, "tokens_per_device": 2964 }, { "epoch": 0.5672, "loss_ce": 0.03544548153877258, "loss_lvr": 0.6138841509819031, "loss_mode_switch": 0.0, "loss_total": 0.09683389961719513, "step": 1418 }, { "batch_size": 4, "epoch": 0.5672, "step": 1418, "tokens_per_device": 4476 }, { "epoch": 0.5672, "loss_ce": 0.48262524604797363, "loss_lvr": 1.1056644916534424, "loss_mode_switch": 0.0, "loss_total": 0.5931916832923889, "step": 1418 }, { "batch_size": 1, "epoch": 0.5672, "step": 1418, "tokens_per_device": 4879 }, { "epoch": 0.5672, "loss_ce": 0.06194017082452774, "loss_lvr": 0.30961719155311584, "loss_mode_switch": 0.0, "loss_total": 0.09290189296007156, "step": 1418 }, { "epoch": 0.5676, "grad_norm": 1.1746633052825928, "learning_rate": 4.152321503186399e-06, "loss": 0.279, "step": 1419 }, { "batch_size": 1, "epoch": 0.5676, "step": 1419, "tokens_per_device": 4884 }, { "epoch": 0.5676, "loss_ce": 0.7539946436882019, "loss_lvr": 0.241837278008461, "loss_mode_switch": 0.0, "loss_total": 0.7781783938407898, "step": 1419 }, { "batch_size": 4, "epoch": 0.5676, "step": 1419, "tokens_per_device": 5428 }, { "epoch": 0.5676, "loss_ce": 0.14873221516609192, "loss_lvr": 0.8450950980186462, "loss_mode_switch": 0.0, "loss_total": 0.2332417368888855, "step": 1419 }, { "batch_size": 4, "epoch": 0.5676, "step": 1419, "tokens_per_device": 2672 }, { "epoch": 0.5676, "loss_ce": 0.2009180635213852, "loss_lvr": 0.7411725521087646, "loss_mode_switch": 0.0, "loss_total": 0.2750353217124939, "step": 1419 }, { "batch_size": 4, "epoch": 0.5676, "step": 1419, "tokens_per_device": 4268 }, { "epoch": 0.5676, "loss_ce": 0.3513711094856262, "loss_lvr": 0.9285151958465576, "loss_mode_switch": 0.0, "loss_total": 0.444222629070282, "step": 1419 }, { "batch_size": 4, "epoch": 0.5676, "step": 1419, "tokens_per_device": 4512 }, { "epoch": 0.5676, "loss_ce": 0.06024187058210373, "loss_lvr": 1.004737138748169, "loss_mode_switch": 0.0, "loss_total": 0.16071557998657227, "step": 1419 }, { "batch_size": 1, "epoch": 0.5676, "step": 1419, "tokens_per_device": 4757 }, { "epoch": 0.5676, "loss_ce": 0.003132196143269539, "loss_lvr": 0.41315552592277527, "loss_mode_switch": 0.0, "loss_total": 0.044447749853134155, "step": 1419 }, { "batch_size": 4, "epoch": 0.5676, "step": 1419, "tokens_per_device": 11276 }, { "epoch": 0.5676, "loss_ce": 0.703283429145813, "loss_lvr": 0.8247755169868469, "loss_mode_switch": 0.0, "loss_total": 0.7857609987258911, "step": 1419 }, { "batch_size": 4, "epoch": 0.5676, "step": 1419, "tokens_per_device": 2584 }, { "epoch": 0.5676, "loss_ce": 0.14466014504432678, "loss_lvr": 0.979775607585907, "loss_mode_switch": 0.0, "loss_total": 0.24263770878314972, "step": 1419 }, { "epoch": 0.568, "grad_norm": 1.274754524230957, "learning_rate": 4.145938473827598e-06, "loss": 0.2584, "step": 1420 }, { "batch_size": 4, "epoch": 0.568, "step": 1420, "tokens_per_device": 3820 }, { "epoch": 0.568, "loss_ce": 0.22306805849075317, "loss_lvr": 0.9033472537994385, "loss_mode_switch": 0.0, "loss_total": 0.31340277194976807, "step": 1420 }, { "batch_size": 1, "epoch": 0.568, "step": 1420, "tokens_per_device": 4861 }, { "epoch": 0.568, "loss_ce": 0.00199686735868454, "loss_lvr": 0.37637782096862793, "loss_mode_switch": 0.0, "loss_total": 0.03963464871048927, "step": 1420 }, { "batch_size": 4, "epoch": 0.568, "step": 1420, "tokens_per_device": 4680 }, { "epoch": 0.568, "loss_ce": 0.08700163662433624, "loss_lvr": 0.5345882177352905, "loss_mode_switch": 0.0, "loss_total": 0.14046046137809753, "step": 1420 }, { "batch_size": 4, "epoch": 0.568, "step": 1420, "tokens_per_device": 9020 }, { "epoch": 0.568, "loss_ce": 0.5390862822532654, "loss_lvr": 0.6426891088485718, "loss_mode_switch": 0.0, "loss_total": 0.6033551692962646, "step": 1420 }, { "batch_size": 4, "epoch": 0.568, "step": 1420, "tokens_per_device": 4832 }, { "epoch": 0.568, "loss_ce": 0.08755557239055634, "loss_lvr": 0.7341593503952026, "loss_mode_switch": 0.0, "loss_total": 0.1609715074300766, "step": 1420 }, { "batch_size": 4, "epoch": 0.568, "step": 1420, "tokens_per_device": 2812 }, { "epoch": 0.568, "loss_ce": 0.18584707379341125, "loss_lvr": 0.7025730609893799, "loss_mode_switch": 0.0, "loss_total": 0.25610437989234924, "step": 1420 }, { "batch_size": 1, "epoch": 0.568, "step": 1420, "tokens_per_device": 5109 }, { "epoch": 0.568, "loss_ce": 0.05576765537261963, "loss_lvr": 0.4316747486591339, "loss_mode_switch": 0.0, "loss_total": 0.09893512725830078, "step": 1420 }, { "batch_size": 4, "epoch": 0.568, "step": 1420, "tokens_per_device": 2776 }, { "epoch": 0.568, "loss_ce": 0.1376774162054062, "loss_lvr": 0.7854089140892029, "loss_mode_switch": 0.0, "loss_total": 0.21621830761432648, "step": 1420 }, { "epoch": 0.5684, "grad_norm": 1.5638127326965332, "learning_rate": 4.139556877862055e-06, "loss": 0.29, "step": 1421 }, { "batch_size": 4, "epoch": 0.5684, "step": 1421, "tokens_per_device": 8876 }, { "epoch": 0.5684, "loss_ce": 0.30408501625061035, "loss_lvr": 0.6999470591545105, "loss_mode_switch": 0.0, "loss_total": 0.37407973408699036, "step": 1421 }, { "batch_size": 4, "epoch": 0.5684, "step": 1421, "tokens_per_device": 4244 }, { "epoch": 0.5684, "loss_ce": 0.5782183408737183, "loss_lvr": 1.0391654968261719, "loss_mode_switch": 0.0, "loss_total": 0.6821348667144775, "step": 1421 }, { "batch_size": 4, "epoch": 0.5684, "step": 1421, "tokens_per_device": 4212 }, { "epoch": 0.5684, "loss_ce": 0.6835362911224365, "loss_lvr": 0.9412476420402527, "loss_mode_switch": 0.0, "loss_total": 0.7776610851287842, "step": 1421 }, { "batch_size": 4, "epoch": 0.5684, "step": 1421, "tokens_per_device": 4244 }, { "epoch": 0.5684, "loss_ce": 0.0816810205578804, "loss_lvr": 0.8272127509117126, "loss_mode_switch": 0.0, "loss_total": 0.1644023060798645, "step": 1421 }, { "batch_size": 4, "epoch": 0.5684, "step": 1421, "tokens_per_device": 10168 }, { "epoch": 0.5684, "loss_ce": 0.4660051763057709, "loss_lvr": 0.728507399559021, "loss_mode_switch": 0.0, "loss_total": 0.5388559103012085, "step": 1421 }, { "batch_size": 1, "epoch": 0.5684, "step": 1421, "tokens_per_device": 5943 }, { "epoch": 0.5684, "loss_ce": 0.03564584255218506, "loss_lvr": 0.44716253876686096, "loss_mode_switch": 0.0, "loss_total": 0.08036209642887115, "step": 1421 }, { "batch_size": 4, "epoch": 0.5684, "step": 1421, "tokens_per_device": 3856 }, { "epoch": 0.5684, "loss_ce": 0.3794011175632477, "loss_lvr": 0.8450893759727478, "loss_mode_switch": 0.0, "loss_total": 0.4639100432395935, "step": 1421 }, { "batch_size": 4, "epoch": 0.5684, "step": 1421, "tokens_per_device": 1296 }, { "epoch": 0.5684, "loss_ce": 0.3729107975959778, "loss_lvr": 1.159970760345459, "loss_mode_switch": 0.0, "loss_total": 0.4889078736305237, "step": 1421 }, { "epoch": 0.5688, "grad_norm": 1.342029333114624, "learning_rate": 4.133176726000163e-06, "loss": 0.2835, "step": 1422 }, { "batch_size": 4, "epoch": 0.5688, "step": 1422, "tokens_per_device": 7300 }, { "epoch": 0.5688, "loss_ce": 0.11810341477394104, "loss_lvr": 0.7186284065246582, "loss_mode_switch": 0.0, "loss_total": 0.18996626138687134, "step": 1422 }, { "batch_size": 1, "epoch": 0.5688, "step": 1422, "tokens_per_device": 6680 }, { "epoch": 0.5688, "loss_ce": 0.0030188090167939663, "loss_lvr": 0.8399299383163452, "loss_mode_switch": 0.0, "loss_total": 0.08701180666685104, "step": 1422 }, { "batch_size": 4, "epoch": 0.5688, "step": 1422, "tokens_per_device": 5444 }, { "epoch": 0.5688, "loss_ce": 0.2734141945838928, "loss_lvr": 1.0323888063430786, "loss_mode_switch": 0.0, "loss_total": 0.3766530752182007, "step": 1422 }, { "batch_size": 4, "epoch": 0.5688, "step": 1422, "tokens_per_device": 2884 }, { "epoch": 0.5688, "loss_ce": 0.07533044368028641, "loss_lvr": 0.7563576102256775, "loss_mode_switch": 0.0, "loss_total": 0.15096619725227356, "step": 1422 }, { "batch_size": 1, "epoch": 0.5688, "step": 1422, "tokens_per_device": 5121 }, { "epoch": 0.5688, "loss_ce": 0.018465762957930565, "loss_lvr": 0.44804465770721436, "loss_mode_switch": 0.0, "loss_total": 0.06327022612094879, "step": 1422 }, { "batch_size": 4, "epoch": 0.5688, "step": 1422, "tokens_per_device": 1444 }, { "epoch": 0.5688, "loss_ce": 0.07922340929508209, "loss_lvr": 0.9478626251220703, "loss_mode_switch": 0.0, "loss_total": 0.17400968074798584, "step": 1422 }, { "batch_size": 1, "epoch": 0.5688, "step": 1422, "tokens_per_device": 5186 }, { "epoch": 0.5688, "loss_ce": 0.06790381669998169, "loss_lvr": 0.3056682348251343, "loss_mode_switch": 0.0, "loss_total": 0.09847064316272736, "step": 1422 }, { "batch_size": 4, "epoch": 0.5688, "step": 1422, "tokens_per_device": 1764 }, { "epoch": 0.5688, "loss_ce": 0.2342797964811325, "loss_lvr": 2.6119933128356934, "loss_mode_switch": 0.0, "loss_total": 0.49547910690307617, "step": 1422 }, { "epoch": 0.5692, "grad_norm": 1.3455969095230103, "learning_rate": 4.126798028949894e-06, "loss": 0.318, "step": 1423 }, { "batch_size": 4, "epoch": 0.5692, "step": 1423, "tokens_per_device": 2692 }, { "epoch": 0.5692, "loss_ce": 0.14475154876708984, "loss_lvr": 0.6685338616371155, "loss_mode_switch": 0.0, "loss_total": 0.21160493791103363, "step": 1423 }, { "batch_size": 4, "epoch": 0.5692, "step": 1423, "tokens_per_device": 4228 }, { "epoch": 0.5692, "loss_ce": 0.0439065620303154, "loss_lvr": 0.7666395902633667, "loss_mode_switch": 0.0, "loss_total": 0.12057052552700043, "step": 1423 }, { "batch_size": 1, "epoch": 0.5692, "step": 1423, "tokens_per_device": 5017 }, { "epoch": 0.5692, "loss_ce": 0.0044876751489937305, "loss_lvr": 0.6536428332328796, "loss_mode_switch": 0.0, "loss_total": 0.06985196471214294, "step": 1423 }, { "batch_size": 1, "epoch": 0.5692, "step": 1423, "tokens_per_device": 7603 }, { "epoch": 0.5692, "loss_ce": 0.08672989904880524, "loss_lvr": 0.3775189518928528, "loss_mode_switch": 0.0, "loss_total": 0.12448179721832275, "step": 1423 }, { "batch_size": 4, "epoch": 0.5692, "step": 1423, "tokens_per_device": 4384 }, { "epoch": 0.5692, "loss_ce": 0.18048697710037231, "loss_lvr": 0.7160177826881409, "loss_mode_switch": 0.0, "loss_total": 0.2520887553691864, "step": 1423 }, { "batch_size": 4, "epoch": 0.5692, "step": 1423, "tokens_per_device": 4540 }, { "epoch": 0.5692, "loss_ce": 0.39339786767959595, "loss_lvr": 0.49555638432502747, "loss_mode_switch": 0.0, "loss_total": 0.442953497171402, "step": 1423 }, { "batch_size": 4, "epoch": 0.5692, "step": 1423, "tokens_per_device": 5972 }, { "epoch": 0.5692, "loss_ce": 0.17982950806617737, "loss_lvr": 0.777579128742218, "loss_mode_switch": 0.0, "loss_total": 0.2575874328613281, "step": 1423 }, { "batch_size": 4, "epoch": 0.5692, "step": 1423, "tokens_per_device": 1388 }, { "epoch": 0.5692, "loss_ce": 0.14896279573440552, "loss_lvr": 1.0553367137908936, "loss_mode_switch": 0.0, "loss_total": 0.2544964551925659, "step": 1423 }, { "epoch": 0.5696, "grad_norm": 1.2655622959136963, "learning_rate": 4.120420797416777e-06, "loss": 0.2727, "step": 1424 }, { "batch_size": 4, "epoch": 0.5696, "step": 1424, "tokens_per_device": 3840 }, { "epoch": 0.5696, "loss_ce": 0.020590728148818016, "loss_lvr": 0.9595797061920166, "loss_mode_switch": 0.0, "loss_total": 0.11654870212078094, "step": 1424 }, { "batch_size": 4, "epoch": 0.5696, "step": 1424, "tokens_per_device": 4196 }, { "epoch": 0.5696, "loss_ce": 0.29333144426345825, "loss_lvr": 1.0660542249679565, "loss_mode_switch": 0.0, "loss_total": 0.39993685483932495, "step": 1424 }, { "batch_size": 4, "epoch": 0.5696, "step": 1424, "tokens_per_device": 6224 }, { "epoch": 0.5696, "loss_ce": 0.25465288758277893, "loss_lvr": 1.010117530822754, "loss_mode_switch": 0.0, "loss_total": 0.3556646406650543, "step": 1424 }, { "batch_size": 4, "epoch": 0.5696, "step": 1424, "tokens_per_device": 5160 }, { "epoch": 0.5696, "loss_ce": 0.3193761706352234, "loss_lvr": 0.7140158414840698, "loss_mode_switch": 0.0, "loss_total": 0.3907777667045593, "step": 1424 }, { "batch_size": 4, "epoch": 0.5696, "step": 1424, "tokens_per_device": 4268 }, { "epoch": 0.5696, "loss_ce": 0.17819258570671082, "loss_lvr": 0.8264285326004028, "loss_mode_switch": 0.0, "loss_total": 0.2608354389667511, "step": 1424 }, { "batch_size": 4, "epoch": 0.5696, "step": 1424, "tokens_per_device": 3904 }, { "epoch": 0.5696, "loss_ce": 0.014560600742697716, "loss_lvr": 1.3360766172409058, "loss_mode_switch": 0.0, "loss_total": 0.14816826581954956, "step": 1424 }, { "batch_size": 1, "epoch": 0.5696, "step": 1424, "tokens_per_device": 7517 }, { "epoch": 0.5696, "loss_ce": 0.0019472012063488364, "loss_lvr": 0.31351378560066223, "loss_mode_switch": 0.0, "loss_total": 0.03329858183860779, "step": 1424 }, { "batch_size": 4, "epoch": 0.5696, "step": 1424, "tokens_per_device": 1440 }, { "epoch": 0.5696, "loss_ce": 0.7167599201202393, "loss_lvr": 1.0421472787857056, "loss_mode_switch": 0.0, "loss_total": 0.8209746479988098, "step": 1424 }, { "epoch": 0.57, "grad_norm": 1.4235306978225708, "learning_rate": 4.1140450421038865e-06, "loss": 0.3071, "step": 1425 }, { "batch_size": 1, "epoch": 0.57, "step": 1425, "tokens_per_device": 5115 }, { "epoch": 0.57, "loss_ce": 0.0006819856935180724, "loss_lvr": 0.4090670049190521, "loss_mode_switch": 0.0, "loss_total": 0.041588686406612396, "step": 1425 }, { "batch_size": 1, "epoch": 0.57, "step": 1425, "tokens_per_device": 5103 }, { "epoch": 0.57, "loss_ce": 0.02470255270600319, "loss_lvr": 0.5839365720748901, "loss_mode_switch": 0.0, "loss_total": 0.0830962061882019, "step": 1425 }, { "batch_size": 4, "epoch": 0.57, "step": 1425, "tokens_per_device": 1552 }, { "epoch": 0.57, "loss_ce": 0.929111123085022, "loss_lvr": 0.8540704846382141, "loss_mode_switch": 0.0, "loss_total": 1.014518141746521, "step": 1425 }, { "batch_size": 1, "epoch": 0.57, "step": 1425, "tokens_per_device": 5039 }, { "epoch": 0.57, "loss_ce": 0.03101193904876709, "loss_lvr": 0.4831155836582184, "loss_mode_switch": 0.0, "loss_total": 0.07932350039482117, "step": 1425 }, { "batch_size": 4, "epoch": 0.57, "step": 1425, "tokens_per_device": 7684 }, { "epoch": 0.57, "loss_ce": 0.36635226011276245, "loss_lvr": 0.881485641002655, "loss_mode_switch": 0.0, "loss_total": 0.45450082421302795, "step": 1425 }, { "batch_size": 4, "epoch": 0.57, "step": 1425, "tokens_per_device": 5836 }, { "epoch": 0.57, "loss_ce": 0.11978033185005188, "loss_lvr": 1.1150559186935425, "loss_mode_switch": 0.0, "loss_total": 0.2312859296798706, "step": 1425 }, { "batch_size": 1, "epoch": 0.57, "step": 1425, "tokens_per_device": 4888 }, { "epoch": 0.57, "loss_ce": 0.0008236668654717505, "loss_lvr": 0.2323625087738037, "loss_mode_switch": 0.0, "loss_total": 0.024059917777776718, "step": 1425 }, { "batch_size": 1, "epoch": 0.57, "step": 1425, "tokens_per_device": 4916 }, { "epoch": 0.57, "loss_ce": 0.004856063984334469, "loss_lvr": 0.25079241394996643, "loss_mode_switch": 0.0, "loss_total": 0.029935304075479507, "step": 1425 }, { "epoch": 0.5704, "grad_norm": 1.2403068542480469, "learning_rate": 4.107670773711812e-06, "loss": 0.2792, "step": 1426 }, { "batch_size": 1, "epoch": 0.5704, "step": 1426, "tokens_per_device": 4913 }, { "epoch": 0.5704, "loss_ce": 0.010994752869009972, "loss_lvr": 0.43418779969215393, "loss_mode_switch": 0.0, "loss_total": 0.054413534700870514, "step": 1426 }, { "batch_size": 4, "epoch": 0.5704, "step": 1426, "tokens_per_device": 1160 }, { "epoch": 0.5704, "loss_ce": 0.2588944137096405, "loss_lvr": 0.9898319244384766, "loss_mode_switch": 0.0, "loss_total": 0.35787761211395264, "step": 1426 }, { "batch_size": 1, "epoch": 0.5704, "step": 1426, "tokens_per_device": 5081 }, { "epoch": 0.5704, "loss_ce": 0.0048064314760267735, "loss_lvr": 0.37249496579170227, "loss_mode_switch": 0.0, "loss_total": 0.04205593094229698, "step": 1426 }, { "batch_size": 4, "epoch": 0.5704, "step": 1426, "tokens_per_device": 3604 }, { "epoch": 0.5704, "loss_ce": 0.6072379350662231, "loss_lvr": 0.7332347631454468, "loss_mode_switch": 0.0, "loss_total": 0.6805614233016968, "step": 1426 }, { "batch_size": 1, "epoch": 0.5704, "step": 1426, "tokens_per_device": 5092 }, { "epoch": 0.5704, "loss_ce": 0.24651744961738586, "loss_lvr": 0.45200031995773315, "loss_mode_switch": 0.0, "loss_total": 0.2917174696922302, "step": 1426 }, { "batch_size": 1, "epoch": 0.5704, "step": 1426, "tokens_per_device": 5077 }, { "epoch": 0.5704, "loss_ce": 0.07884342968463898, "loss_lvr": 0.18533778190612793, "loss_mode_switch": 0.0, "loss_total": 0.09737721085548401, "step": 1426 }, { "batch_size": 4, "epoch": 0.5704, "step": 1426, "tokens_per_device": 4200 }, { "epoch": 0.5704, "loss_ce": 0.505126953125, "loss_lvr": 1.0380879640579224, "loss_mode_switch": 0.0, "loss_total": 0.6089357733726501, "step": 1426 }, { "batch_size": 4, "epoch": 0.5704, "step": 1426, "tokens_per_device": 5108 }, { "epoch": 0.5704, "loss_ce": 0.23284699022769928, "loss_lvr": 0.9181395769119263, "loss_mode_switch": 0.0, "loss_total": 0.3246609568595886, "step": 1426 }, { "epoch": 0.5708, "grad_norm": 1.13076913356781, "learning_rate": 4.101298002938653e-06, "loss": 0.2459, "step": 1427 }, { "batch_size": 4, "epoch": 0.5708, "step": 1427, "tokens_per_device": 4188 }, { "epoch": 0.5708, "loss_ce": 0.5756866931915283, "loss_lvr": 0.9084003567695618, "loss_mode_switch": 0.0, "loss_total": 0.666526734828949, "step": 1427 }, { "batch_size": 4, "epoch": 0.5708, "step": 1427, "tokens_per_device": 4268 }, { "epoch": 0.5708, "loss_ce": 0.4614783823490143, "loss_lvr": 0.8851913213729858, "loss_mode_switch": 0.0, "loss_total": 0.5499975085258484, "step": 1427 }, { "batch_size": 1, "epoch": 0.5708, "step": 1427, "tokens_per_device": 7656 }, { "epoch": 0.5708, "loss_ce": 0.23463377356529236, "loss_lvr": 0.23534421622753143, "loss_mode_switch": 0.0, "loss_total": 0.25816819071769714, "step": 1427 }, { "batch_size": 4, "epoch": 0.5708, "step": 1427, "tokens_per_device": 4592 }, { "epoch": 0.5708, "loss_ce": 0.3688109219074249, "loss_lvr": 0.8067622780799866, "loss_mode_switch": 0.0, "loss_total": 0.4494871497154236, "step": 1427 }, { "batch_size": 4, "epoch": 0.5708, "step": 1427, "tokens_per_device": 3808 }, { "epoch": 0.5708, "loss_ce": 0.01153483521193266, "loss_lvr": 0.7501474618911743, "loss_mode_switch": 0.0, "loss_total": 0.08654958009719849, "step": 1427 }, { "batch_size": 4, "epoch": 0.5708, "step": 1427, "tokens_per_device": 4716 }, { "epoch": 0.5708, "loss_ce": 0.3361600935459137, "loss_lvr": 0.7599620819091797, "loss_mode_switch": 0.0, "loss_total": 0.4121563136577606, "step": 1427 }, { "batch_size": 4, "epoch": 0.5708, "step": 1427, "tokens_per_device": 4264 }, { "epoch": 0.5708, "loss_ce": 0.11427591741085052, "loss_lvr": 1.2453205585479736, "loss_mode_switch": 0.0, "loss_total": 0.23880797624588013, "step": 1427 }, { "batch_size": 4, "epoch": 0.5708, "step": 1427, "tokens_per_device": 2680 }, { "epoch": 0.5708, "loss_ce": 0.04804171621799469, "loss_lvr": 0.9987717866897583, "loss_mode_switch": 0.0, "loss_total": 0.14791889488697052, "step": 1427 }, { "epoch": 0.5712, "grad_norm": 1.3621996641159058, "learning_rate": 4.094926740479991e-06, "loss": 0.2879, "step": 1428 }, { "batch_size": 4, "epoch": 0.5712, "step": 1428, "tokens_per_device": 8964 }, { "epoch": 0.5712, "loss_ce": 0.5678861141204834, "loss_lvr": 0.7584297060966492, "loss_mode_switch": 0.0, "loss_total": 0.6437290906906128, "step": 1428 }, { "batch_size": 4, "epoch": 0.5712, "step": 1428, "tokens_per_device": 5324 }, { "epoch": 0.5712, "loss_ce": 0.09914153069257736, "loss_lvr": 1.2866679430007935, "loss_mode_switch": 0.0, "loss_total": 0.22780832648277283, "step": 1428 }, { "batch_size": 4, "epoch": 0.5712, "step": 1428, "tokens_per_device": 4268 }, { "epoch": 0.5712, "loss_ce": 0.1536455899477005, "loss_lvr": 0.744921863079071, "loss_mode_switch": 0.0, "loss_total": 0.2281377762556076, "step": 1428 }, { "batch_size": 4, "epoch": 0.5712, "step": 1428, "tokens_per_device": 5160 }, { "epoch": 0.5712, "loss_ce": 0.5020580291748047, "loss_lvr": 0.6871780157089233, "loss_mode_switch": 0.0, "loss_total": 0.5707758069038391, "step": 1428 }, { "batch_size": 4, "epoch": 0.5712, "step": 1428, "tokens_per_device": 5060 }, { "epoch": 0.5712, "loss_ce": 0.12913785874843597, "loss_lvr": 0.9628652930259705, "loss_mode_switch": 0.0, "loss_total": 0.2254243791103363, "step": 1428 }, { "batch_size": 1, "epoch": 0.5712, "step": 1428, "tokens_per_device": 4669 }, { "epoch": 0.5712, "loss_ce": 0.006168865133076906, "loss_lvr": 0.576471209526062, "loss_mode_switch": 0.0, "loss_total": 0.06381598860025406, "step": 1428 }, { "batch_size": 4, "epoch": 0.5712, "step": 1428, "tokens_per_device": 1400 }, { "epoch": 0.5712, "loss_ce": 0.2223420888185501, "loss_lvr": 0.9881871342658997, "loss_mode_switch": 0.0, "loss_total": 0.32116079330444336, "step": 1428 }, { "batch_size": 1, "epoch": 0.5712, "step": 1428, "tokens_per_device": 4774 }, { "epoch": 0.5712, "loss_ce": 0.02990674413740635, "loss_lvr": 0.43514561653137207, "loss_mode_switch": 0.0, "loss_total": 0.07342130690813065, "step": 1428 }, { "epoch": 0.5716, "grad_norm": 1.1706531047821045, "learning_rate": 4.088556997028878e-06, "loss": 0.2496, "step": 1429 }, { "batch_size": 1, "epoch": 0.5716, "step": 1429, "tokens_per_device": 4875 }, { "epoch": 0.5716, "loss_ce": 0.12388114631175995, "loss_lvr": 0.43233922123908997, "loss_mode_switch": 0.0, "loss_total": 0.16711506247520447, "step": 1429 }, { "batch_size": 1, "epoch": 0.5716, "step": 1429, "tokens_per_device": 4774 }, { "epoch": 0.5716, "loss_ce": 0.4337087571620941, "loss_lvr": 0.4217863976955414, "loss_mode_switch": 0.0, "loss_total": 0.47588738799095154, "step": 1429 }, { "batch_size": 4, "epoch": 0.5716, "step": 1429, "tokens_per_device": 1268 }, { "epoch": 0.5716, "loss_ce": 0.26943114399909973, "loss_lvr": 0.9801625609397888, "loss_mode_switch": 0.0, "loss_total": 0.3674474060535431, "step": 1429 }, { "batch_size": 4, "epoch": 0.5716, "step": 1429, "tokens_per_device": 2964 }, { "epoch": 0.5716, "loss_ce": 0.213950514793396, "loss_lvr": 1.0282917022705078, "loss_mode_switch": 0.0, "loss_total": 0.3167796730995178, "step": 1429 }, { "batch_size": 4, "epoch": 0.5716, "step": 1429, "tokens_per_device": 5436 }, { "epoch": 0.5716, "loss_ce": 0.2598012685775757, "loss_lvr": 0.6072904467582703, "loss_mode_switch": 0.0, "loss_total": 0.32053032517433167, "step": 1429 }, { "batch_size": 4, "epoch": 0.5716, "step": 1429, "tokens_per_device": 2832 }, { "epoch": 0.5716, "loss_ce": 0.2711695432662964, "loss_lvr": 1.0463228225708008, "loss_mode_switch": 0.0, "loss_total": 0.37580183148384094, "step": 1429 }, { "batch_size": 4, "epoch": 0.5716, "step": 1429, "tokens_per_device": 3756 }, { "epoch": 0.5716, "loss_ce": 0.2688971161842346, "loss_lvr": 0.7667076587677002, "loss_mode_switch": 0.0, "loss_total": 0.34556788206100464, "step": 1429 }, { "batch_size": 4, "epoch": 0.5716, "step": 1429, "tokens_per_device": 1832 }, { "epoch": 0.5716, "loss_ce": 0.6183959245681763, "loss_lvr": 0.817531406879425, "loss_mode_switch": 0.0, "loss_total": 0.7001490592956543, "step": 1429 }, { "epoch": 0.572, "grad_norm": 1.258871078491211, "learning_rate": 4.08218878327582e-06, "loss": 0.3177, "step": 1430 }, { "batch_size": 4, "epoch": 0.572, "step": 1430, "tokens_per_device": 3820 }, { "epoch": 0.572, "loss_ce": 0.3528994917869568, "loss_lvr": 0.85505211353302, "loss_mode_switch": 0.0, "loss_total": 0.43840470910072327, "step": 1430 }, { "batch_size": 4, "epoch": 0.572, "step": 1430, "tokens_per_device": 3820 }, { "epoch": 0.572, "loss_ce": 0.43848252296447754, "loss_lvr": 0.9952466487884521, "loss_mode_switch": 0.0, "loss_total": 0.5380071997642517, "step": 1430 }, { "batch_size": 4, "epoch": 0.572, "step": 1430, "tokens_per_device": 4336 }, { "epoch": 0.572, "loss_ce": 0.6485474109649658, "loss_lvr": 1.035965085029602, "loss_mode_switch": 0.0, "loss_total": 0.752143919467926, "step": 1430 }, { "batch_size": 4, "epoch": 0.572, "step": 1430, "tokens_per_device": 1348 }, { "epoch": 0.572, "loss_ce": 0.36228224635124207, "loss_lvr": 0.9279361367225647, "loss_mode_switch": 0.0, "loss_total": 0.45507586002349854, "step": 1430 }, { "batch_size": 4, "epoch": 0.572, "step": 1430, "tokens_per_device": 4304 }, { "epoch": 0.572, "loss_ce": 0.46961843967437744, "loss_lvr": 1.2084746360778809, "loss_mode_switch": 0.0, "loss_total": 0.5904659032821655, "step": 1430 }, { "batch_size": 4, "epoch": 0.572, "step": 1430, "tokens_per_device": 2592 }, { "epoch": 0.572, "loss_ce": 0.777675986289978, "loss_lvr": 1.032125473022461, "loss_mode_switch": 0.0, "loss_total": 0.8808885216712952, "step": 1430 }, { "batch_size": 4, "epoch": 0.572, "step": 1430, "tokens_per_device": 2664 }, { "epoch": 0.572, "loss_ce": 0.3096926212310791, "loss_lvr": 0.8118798732757568, "loss_mode_switch": 0.0, "loss_total": 0.39088061451911926, "step": 1430 }, { "batch_size": 1, "epoch": 0.572, "step": 1430, "tokens_per_device": 4895 }, { "epoch": 0.572, "loss_ce": 0.06264179944992065, "loss_lvr": 0.38100770115852356, "loss_mode_switch": 0.0, "loss_total": 0.10074257105588913, "step": 1430 }, { "epoch": 0.5724, "grad_norm": 1.1392136812210083, "learning_rate": 4.07582210990875e-06, "loss": 0.2791, "step": 1431 }, { "batch_size": 4, "epoch": 0.5724, "step": 1431, "tokens_per_device": 1524 }, { "epoch": 0.5724, "loss_ce": 0.503816545009613, "loss_lvr": 0.9546515941619873, "loss_mode_switch": 0.0, "loss_total": 0.5992817282676697, "step": 1431 }, { "batch_size": 4, "epoch": 0.5724, "step": 1431, "tokens_per_device": 4232 }, { "epoch": 0.5724, "loss_ce": 0.183632493019104, "loss_lvr": 0.6316052675247192, "loss_mode_switch": 0.0, "loss_total": 0.24679303169250488, "step": 1431 }, { "batch_size": 4, "epoch": 0.5724, "step": 1431, "tokens_per_device": 5948 }, { "epoch": 0.5724, "loss_ce": 0.14193709194660187, "loss_lvr": 1.4173661470413208, "loss_mode_switch": 0.0, "loss_total": 0.2836737036705017, "step": 1431 }, { "batch_size": 4, "epoch": 0.5724, "step": 1431, "tokens_per_device": 5780 }, { "epoch": 0.5724, "loss_ce": 0.009842781350016594, "loss_lvr": 0.5868165493011475, "loss_mode_switch": 0.0, "loss_total": 0.06852443516254425, "step": 1431 }, { "batch_size": 4, "epoch": 0.5724, "step": 1431, "tokens_per_device": 1488 }, { "epoch": 0.5724, "loss_ce": 0.6466094851493835, "loss_lvr": 0.99465012550354, "loss_mode_switch": 0.0, "loss_total": 0.7460744976997375, "step": 1431 }, { "batch_size": 4, "epoch": 0.5724, "step": 1431, "tokens_per_device": 4388 }, { "epoch": 0.5724, "loss_ce": 0.2572791874408722, "loss_lvr": 0.8436947464942932, "loss_mode_switch": 0.0, "loss_total": 0.341648668050766, "step": 1431 }, { "batch_size": 1, "epoch": 0.5724, "step": 1431, "tokens_per_device": 6924 }, { "epoch": 0.5724, "loss_ce": 0.13279969990253448, "loss_lvr": 0.41184383630752563, "loss_mode_switch": 0.0, "loss_total": 0.1739840805530548, "step": 1431 }, { "batch_size": 4, "epoch": 0.5724, "step": 1431, "tokens_per_device": 4396 }, { "epoch": 0.5724, "loss_ce": 0.007478297688066959, "loss_lvr": 0.6937637329101562, "loss_mode_switch": 0.0, "loss_total": 0.07685466855764389, "step": 1431 }, { "epoch": 0.5728, "grad_norm": 1.247776746749878, "learning_rate": 4.069456987613022e-06, "loss": 0.2849, "step": 1432 }, { "batch_size": 4, "epoch": 0.5728, "step": 1432, "tokens_per_device": 3924 }, { "epoch": 0.5728, "loss_ce": 0.3091375231742859, "loss_lvr": 0.967343807220459, "loss_mode_switch": 0.0, "loss_total": 0.4058718979358673, "step": 1432 }, { "batch_size": 4, "epoch": 0.5728, "step": 1432, "tokens_per_device": 7672 }, { "epoch": 0.5728, "loss_ce": 0.39739400148391724, "loss_lvr": 0.783383846282959, "loss_mode_switch": 0.0, "loss_total": 0.47573238611221313, "step": 1432 }, { "batch_size": 4, "epoch": 0.5728, "step": 1432, "tokens_per_device": 4512 }, { "epoch": 0.5728, "loss_ce": 0.5003785490989685, "loss_lvr": 0.9631351232528687, "loss_mode_switch": 0.0, "loss_total": 0.5966920852661133, "step": 1432 }, { "batch_size": 4, "epoch": 0.5728, "step": 1432, "tokens_per_device": 16300 }, { "epoch": 0.5728, "loss_ce": 0.3578968048095703, "loss_lvr": 0.6820546388626099, "loss_mode_switch": 0.0, "loss_total": 0.42610228061676025, "step": 1432 }, { "batch_size": 4, "epoch": 0.5728, "step": 1432, "tokens_per_device": 3680 }, { "epoch": 0.5728, "loss_ce": 0.5340160131454468, "loss_lvr": 0.8060742616653442, "loss_mode_switch": 0.0, "loss_total": 0.6146234273910522, "step": 1432 }, { "batch_size": 4, "epoch": 0.5728, "step": 1432, "tokens_per_device": 5460 }, { "epoch": 0.5728, "loss_ce": 0.25378188490867615, "loss_lvr": 0.6592113971710205, "loss_mode_switch": 0.0, "loss_total": 0.31970304250717163, "step": 1432 }, { "batch_size": 1, "epoch": 0.5728, "step": 1432, "tokens_per_device": 4898 }, { "epoch": 0.5728, "loss_ce": 0.017505785450339317, "loss_lvr": 0.5125892162322998, "loss_mode_switch": 0.0, "loss_total": 0.06876470893621445, "step": 1432 }, { "batch_size": 4, "epoch": 0.5728, "step": 1432, "tokens_per_device": 5064 }, { "epoch": 0.5728, "loss_ce": 0.9024295210838318, "loss_lvr": 0.7453253865242004, "loss_mode_switch": 0.0, "loss_total": 0.9769620895385742, "step": 1432 }, { "epoch": 0.5732, "grad_norm": 1.3415772914886475, "learning_rate": 4.063093427071376e-06, "loss": 0.2981, "step": 1433 }, { "batch_size": 1, "epoch": 0.5732, "step": 1433, "tokens_per_device": 4957 }, { "epoch": 0.5732, "loss_ce": 0.10867180675268173, "loss_lvr": 0.31099048256874084, "loss_mode_switch": 0.0, "loss_total": 0.13977085053920746, "step": 1433 }, { "batch_size": 4, "epoch": 0.5732, "step": 1433, "tokens_per_device": 4276 }, { "epoch": 0.5732, "loss_ce": 0.45575371384620667, "loss_lvr": 0.7862273454666138, "loss_mode_switch": 0.0, "loss_total": 0.5343764424324036, "step": 1433 }, { "batch_size": 4, "epoch": 0.5732, "step": 1433, "tokens_per_device": 4288 }, { "epoch": 0.5732, "loss_ce": 0.1739736646413803, "loss_lvr": 0.896828293800354, "loss_mode_switch": 0.0, "loss_total": 0.26365649700164795, "step": 1433 }, { "batch_size": 1, "epoch": 0.5732, "step": 1433, "tokens_per_device": 4745 }, { "epoch": 0.5732, "loss_ce": 0.010347291827201843, "loss_lvr": 0.3244889974594116, "loss_mode_switch": 0.0, "loss_total": 0.042796190828084946, "step": 1433 }, { "batch_size": 4, "epoch": 0.5732, "step": 1433, "tokens_per_device": 4408 }, { "epoch": 0.5732, "loss_ce": 0.14900138974189758, "loss_lvr": 1.1203787326812744, "loss_mode_switch": 0.0, "loss_total": 0.26103925704956055, "step": 1433 }, { "batch_size": 1, "epoch": 0.5732, "step": 1433, "tokens_per_device": 4916 }, { "epoch": 0.5732, "loss_ce": 0.5037398338317871, "loss_lvr": 0.492531955242157, "loss_mode_switch": 0.0, "loss_total": 0.5529930591583252, "step": 1433 }, { "batch_size": 4, "epoch": 0.5732, "step": 1433, "tokens_per_device": 3760 }, { "epoch": 0.5732, "loss_ce": 0.4108596742153168, "loss_lvr": 0.8159927129745483, "loss_mode_switch": 0.0, "loss_total": 0.49245893955230713, "step": 1433 }, { "batch_size": 4, "epoch": 0.5732, "step": 1433, "tokens_per_device": 4284 }, { "epoch": 0.5732, "loss_ce": 0.336147665977478, "loss_lvr": 0.9294984340667725, "loss_mode_switch": 0.0, "loss_total": 0.4290975034236908, "step": 1433 }, { "epoch": 0.5736, "grad_norm": 1.5666462182998657, "learning_rate": 4.056731438963947e-06, "loss": 0.3264, "step": 1434 }, { "batch_size": 4, "epoch": 0.5736, "step": 1434, "tokens_per_device": 1396 }, { "epoch": 0.5736, "loss_ce": 0.38073986768722534, "loss_lvr": 1.1642311811447144, "loss_mode_switch": 0.0, "loss_total": 0.49716299772262573, "step": 1434 }, { "batch_size": 1, "epoch": 0.5736, "step": 1434, "tokens_per_device": 4903 }, { "epoch": 0.5736, "loss_ce": 0.07458488643169403, "loss_lvr": 0.28135234117507935, "loss_mode_switch": 0.0, "loss_total": 0.10272011905908585, "step": 1434 }, { "batch_size": 4, "epoch": 0.5736, "step": 1434, "tokens_per_device": 3708 }, { "epoch": 0.5736, "loss_ce": 0.3990243077278137, "loss_lvr": 0.9301472902297974, "loss_mode_switch": 0.0, "loss_total": 0.4920390248298645, "step": 1434 }, { "batch_size": 4, "epoch": 0.5736, "step": 1434, "tokens_per_device": 3780 }, { "epoch": 0.5736, "loss_ce": 0.0764036625623703, "loss_lvr": 0.5678095817565918, "loss_mode_switch": 0.0, "loss_total": 0.13318462669849396, "step": 1434 }, { "batch_size": 4, "epoch": 0.5736, "step": 1434, "tokens_per_device": 2248 }, { "epoch": 0.5736, "loss_ce": 0.30876195430755615, "loss_lvr": 0.9971787929534912, "loss_mode_switch": 0.0, "loss_total": 0.40847983956336975, "step": 1434 }, { "batch_size": 1, "epoch": 0.5736, "step": 1434, "tokens_per_device": 5014 }, { "epoch": 0.5736, "loss_ce": 0.01757499948143959, "loss_lvr": 0.5382365584373474, "loss_mode_switch": 0.0, "loss_total": 0.07139866054058075, "step": 1434 }, { "batch_size": 4, "epoch": 0.5736, "step": 1434, "tokens_per_device": 1904 }, { "epoch": 0.5736, "loss_ce": 0.1760629415512085, "loss_lvr": 0.8455827832221985, "loss_mode_switch": 0.0, "loss_total": 0.26062121987342834, "step": 1434 }, { "batch_size": 1, "epoch": 0.5736, "step": 1434, "tokens_per_device": 4946 }, { "epoch": 0.5736, "loss_ce": 0.0006485064513981342, "loss_lvr": 0.45274195075035095, "loss_mode_switch": 0.0, "loss_total": 0.04592270031571388, "step": 1434 }, { "epoch": 0.574, "grad_norm": 1.3478167057037354, "learning_rate": 4.050371033968216e-06, "loss": 0.3111, "step": 1435 }, { "batch_size": 4, "epoch": 0.574, "step": 1435, "tokens_per_device": 4200 }, { "epoch": 0.574, "loss_ce": 0.01789730228483677, "loss_lvr": 0.8533852100372314, "loss_mode_switch": 0.0, "loss_total": 0.10323582589626312, "step": 1435 }, { "batch_size": 1, "epoch": 0.574, "step": 1435, "tokens_per_device": 4960 }, { "epoch": 0.574, "loss_ce": 0.01910698413848877, "loss_lvr": 0.13764578104019165, "loss_mode_switch": 0.0, "loss_total": 0.032871562987565994, "step": 1435 }, { "batch_size": 4, "epoch": 0.574, "step": 1435, "tokens_per_device": 5220 }, { "epoch": 0.574, "loss_ce": 0.26358357071876526, "loss_lvr": 0.7011491656303406, "loss_mode_switch": 0.0, "loss_total": 0.33369848132133484, "step": 1435 }, { "batch_size": 4, "epoch": 0.574, "step": 1435, "tokens_per_device": 3316 }, { "epoch": 0.574, "loss_ce": 0.13332563638687134, "loss_lvr": 0.8992285132408142, "loss_mode_switch": 0.0, "loss_total": 0.22324848175048828, "step": 1435 }, { "batch_size": 4, "epoch": 0.574, "step": 1435, "tokens_per_device": 8380 }, { "epoch": 0.574, "loss_ce": 0.04443482682108879, "loss_lvr": 0.6638806462287903, "loss_mode_switch": 0.0, "loss_total": 0.1108228862285614, "step": 1435 }, { "batch_size": 4, "epoch": 0.574, "step": 1435, "tokens_per_device": 4264 }, { "epoch": 0.574, "loss_ce": 0.01686234399676323, "loss_lvr": 1.0046403408050537, "loss_mode_switch": 0.0, "loss_total": 0.11732637882232666, "step": 1435 }, { "batch_size": 4, "epoch": 0.574, "step": 1435, "tokens_per_device": 9792 }, { "epoch": 0.574, "loss_ce": 0.21066312491893768, "loss_lvr": 0.7157507538795471, "loss_mode_switch": 0.0, "loss_total": 0.2822381854057312, "step": 1435 }, { "batch_size": 4, "epoch": 0.574, "step": 1435, "tokens_per_device": 1464 }, { "epoch": 0.574, "loss_ce": 0.5712644457817078, "loss_lvr": 1.0152686834335327, "loss_mode_switch": 0.0, "loss_total": 0.6727913022041321, "step": 1435 }, { "epoch": 0.5744, "grad_norm": 1.2740204334259033, "learning_rate": 4.044012222759016e-06, "loss": 0.3177, "step": 1436 }, { "batch_size": 1, "epoch": 0.5744, "step": 1436, "tokens_per_device": 4565 }, { "epoch": 0.5744, "loss_ce": 1.1702896356582642, "loss_lvr": 0.4209305942058563, "loss_mode_switch": 0.0, "loss_total": 1.2123826742172241, "step": 1436 }, { "batch_size": 4, "epoch": 0.5744, "step": 1436, "tokens_per_device": 5820 }, { "epoch": 0.5744, "loss_ce": 0.3818700313568115, "loss_lvr": 0.6520971059799194, "loss_mode_switch": 0.0, "loss_total": 0.44707974791526794, "step": 1436 }, { "batch_size": 4, "epoch": 0.5744, "step": 1436, "tokens_per_device": 1464 }, { "epoch": 0.5744, "loss_ce": 0.3816932737827301, "loss_lvr": 1.0338597297668457, "loss_mode_switch": 0.0, "loss_total": 0.48507925868034363, "step": 1436 }, { "batch_size": 1, "epoch": 0.5744, "step": 1436, "tokens_per_device": 4867 }, { "epoch": 0.5744, "loss_ce": 0.0024567507207393646, "loss_lvr": 0.5940102338790894, "loss_mode_switch": 0.0, "loss_total": 0.06185777485370636, "step": 1436 }, { "batch_size": 1, "epoch": 0.5744, "step": 1436, "tokens_per_device": 4869 }, { "epoch": 0.5744, "loss_ce": 0.000712428882252425, "loss_lvr": 0.3249853849411011, "loss_mode_switch": 0.0, "loss_total": 0.03321096673607826, "step": 1436 }, { "batch_size": 1, "epoch": 0.5744, "step": 1436, "tokens_per_device": 4957 }, { "epoch": 0.5744, "loss_ce": 0.016618933528661728, "loss_lvr": 0.2318962812423706, "loss_mode_switch": 0.0, "loss_total": 0.03980856388807297, "step": 1436 }, { "batch_size": 1, "epoch": 0.5744, "step": 1436, "tokens_per_device": 4267 }, { "epoch": 0.5744, "loss_ce": 0.009995031170547009, "loss_lvr": 0.6851572394371033, "loss_mode_switch": 0.0, "loss_total": 0.07851075381040573, "step": 1436 }, { "batch_size": 4, "epoch": 0.5744, "step": 1436, "tokens_per_device": 3792 }, { "epoch": 0.5744, "loss_ce": 0.5910351872444153, "loss_lvr": 0.9723358750343323, "loss_mode_switch": 0.0, "loss_total": 0.688268780708313, "step": 1436 }, { "epoch": 0.5748, "grad_norm": 1.4103978872299194, "learning_rate": 4.0376550160085e-06, "loss": 0.3362, "step": 1437 }, { "batch_size": 1, "epoch": 0.5748, "step": 1437, "tokens_per_device": 4858 }, { "epoch": 0.5748, "loss_ce": 0.00027761192177422345, "loss_lvr": 0.2572590410709381, "loss_mode_switch": 0.0, "loss_total": 0.02600351721048355, "step": 1437 }, { "batch_size": 1, "epoch": 0.5748, "step": 1437, "tokens_per_device": 5163 }, { "epoch": 0.5748, "loss_ce": 0.0006178372423164546, "loss_lvr": 0.3815368711948395, "loss_mode_switch": 0.0, "loss_total": 0.03877152502536774, "step": 1437 }, { "batch_size": 4, "epoch": 0.5748, "step": 1437, "tokens_per_device": 2916 }, { "epoch": 0.5748, "loss_ce": 0.0007744565955363214, "loss_lvr": 0.5419468879699707, "loss_mode_switch": 0.0, "loss_total": 0.054969146847724915, "step": 1437 }, { "batch_size": 1, "epoch": 0.5748, "step": 1437, "tokens_per_device": 5113 }, { "epoch": 0.5748, "loss_ce": 0.018310027197003365, "loss_lvr": 0.4447922110557556, "loss_mode_switch": 0.0, "loss_total": 0.06278924643993378, "step": 1437 }, { "batch_size": 4, "epoch": 0.5748, "step": 1437, "tokens_per_device": 5820 }, { "epoch": 0.5748, "loss_ce": 0.002672325586900115, "loss_lvr": 1.1482118368148804, "loss_mode_switch": 0.0, "loss_total": 0.11749351024627686, "step": 1437 }, { "batch_size": 1, "epoch": 0.5748, "step": 1437, "tokens_per_device": 5103 }, { "epoch": 0.5748, "loss_ce": 0.012803207151591778, "loss_lvr": 0.5591347813606262, "loss_mode_switch": 0.0, "loss_total": 0.06871668249368668, "step": 1437 }, { "batch_size": 1, "epoch": 0.5748, "step": 1437, "tokens_per_device": 5224 }, { "epoch": 0.5748, "loss_ce": 0.006749071646481752, "loss_lvr": 0.31582093238830566, "loss_mode_switch": 0.0, "loss_total": 0.03833116590976715, "step": 1437 }, { "batch_size": 4, "epoch": 0.5748, "step": 1437, "tokens_per_device": 10848 }, { "epoch": 0.5748, "loss_ce": 0.013761987909674644, "loss_lvr": 0.7067943215370178, "loss_mode_switch": 0.0, "loss_total": 0.0844414234161377, "step": 1437 }, { "epoch": 0.5752, "grad_norm": 1.1363729238510132, "learning_rate": 4.0312994243861304e-06, "loss": 0.2285, "step": 1438 }, { "batch_size": 4, "epoch": 0.5752, "step": 1438, "tokens_per_device": 3628 }, { "epoch": 0.5752, "loss_ce": 0.4651981294155121, "loss_lvr": 0.7966487407684326, "loss_mode_switch": 0.0, "loss_total": 0.5448629856109619, "step": 1438 }, { "batch_size": 4, "epoch": 0.5752, "step": 1438, "tokens_per_device": 2712 }, { "epoch": 0.5752, "loss_ce": 0.4995954930782318, "loss_lvr": 0.6594181060791016, "loss_mode_switch": 0.0, "loss_total": 0.5655373334884644, "step": 1438 }, { "batch_size": 1, "epoch": 0.5752, "step": 1438, "tokens_per_device": 4892 }, { "epoch": 0.5752, "loss_ce": 0.13423460721969604, "loss_lvr": 0.25038862228393555, "loss_mode_switch": 0.0, "loss_total": 0.15927347540855408, "step": 1438 }, { "batch_size": 1, "epoch": 0.5752, "step": 1438, "tokens_per_device": 4914 }, { "epoch": 0.5752, "loss_ce": 0.8984430432319641, "loss_lvr": 0.9309180974960327, "loss_mode_switch": 0.0, "loss_total": 0.9915348291397095, "step": 1438 }, { "batch_size": 1, "epoch": 0.5752, "step": 1438, "tokens_per_device": 4654 }, { "epoch": 0.5752, "loss_ce": 0.06088852882385254, "loss_lvr": 0.3304881453514099, "loss_mode_switch": 0.0, "loss_total": 0.09393734484910965, "step": 1438 }, { "batch_size": 4, "epoch": 0.5752, "step": 1438, "tokens_per_device": 1484 }, { "epoch": 0.5752, "loss_ce": 0.5089914798736572, "loss_lvr": 0.834039032459259, "loss_mode_switch": 0.0, "loss_total": 0.5923953652381897, "step": 1438 }, { "batch_size": 4, "epoch": 0.5752, "step": 1438, "tokens_per_device": 3760 }, { "epoch": 0.5752, "loss_ce": 0.31695234775543213, "loss_lvr": 1.0897175073623657, "loss_mode_switch": 0.0, "loss_total": 0.4259240925312042, "step": 1438 }, { "batch_size": 1, "epoch": 0.5752, "step": 1438, "tokens_per_device": 4866 }, { "epoch": 0.5752, "loss_ce": 0.012209328822791576, "loss_lvr": 0.5282007455825806, "loss_mode_switch": 0.0, "loss_total": 0.06502940505743027, "step": 1438 }, { "epoch": 0.5756, "grad_norm": 1.3752753734588623, "learning_rate": 4.024945458558661e-06, "loss": 0.328, "step": 1439 }, { "batch_size": 4, "epoch": 0.5756, "step": 1439, "tokens_per_device": 3452 }, { "epoch": 0.5756, "loss_ce": 0.1899317055940628, "loss_lvr": 1.0214046239852905, "loss_mode_switch": 0.0, "loss_total": 0.2920721769332886, "step": 1439 }, { "batch_size": 4, "epoch": 0.5756, "step": 1439, "tokens_per_device": 4320 }, { "epoch": 0.5756, "loss_ce": 0.08033359795808792, "loss_lvr": 0.5821181535720825, "loss_mode_switch": 0.0, "loss_total": 0.13854540884494781, "step": 1439 }, { "batch_size": 4, "epoch": 0.5756, "step": 1439, "tokens_per_device": 2632 }, { "epoch": 0.5756, "loss_ce": 0.31926584243774414, "loss_lvr": 0.801044225692749, "loss_mode_switch": 0.0, "loss_total": 0.3993702530860901, "step": 1439 }, { "batch_size": 4, "epoch": 0.5756, "step": 1439, "tokens_per_device": 1192 }, { "epoch": 0.5756, "loss_ce": 0.5225397944450378, "loss_lvr": 1.1078040599822998, "loss_mode_switch": 0.0, "loss_total": 0.6333202123641968, "step": 1439 }, { "batch_size": 4, "epoch": 0.5756, "step": 1439, "tokens_per_device": 3844 }, { "epoch": 0.5756, "loss_ce": 0.023556923493742943, "loss_lvr": 0.9074455499649048, "loss_mode_switch": 0.0, "loss_total": 0.11430148035287857, "step": 1439 }, { "batch_size": 4, "epoch": 0.5756, "step": 1439, "tokens_per_device": 1708 }, { "epoch": 0.5756, "loss_ce": 0.08986522257328033, "loss_lvr": 0.8610250949859619, "loss_mode_switch": 0.0, "loss_total": 0.1759677231311798, "step": 1439 }, { "batch_size": 1, "epoch": 0.5756, "step": 1439, "tokens_per_device": 5230 }, { "epoch": 0.5756, "loss_ce": 0.06729593873023987, "loss_lvr": 0.30359265208244324, "loss_mode_switch": 0.0, "loss_total": 0.09765520691871643, "step": 1439 }, { "batch_size": 4, "epoch": 0.5756, "step": 1439, "tokens_per_device": 4276 }, { "epoch": 0.5756, "loss_ce": 0.3235332667827606, "loss_lvr": 1.136989712715149, "loss_mode_switch": 0.0, "loss_total": 0.43723225593566895, "step": 1439 }, { "epoch": 0.576, "grad_norm": 1.3306912183761597, "learning_rate": 4.018593129190113e-06, "loss": 0.3135, "step": 1440 }, { "batch_size": 4, "epoch": 0.576, "step": 1440, "tokens_per_device": 3676 }, { "epoch": 0.576, "loss_ce": 0.46589815616607666, "loss_lvr": 0.8526486158370972, "loss_mode_switch": 0.0, "loss_total": 0.5511630177497864, "step": 1440 }, { "batch_size": 4, "epoch": 0.576, "step": 1440, "tokens_per_device": 2656 }, { "epoch": 0.576, "loss_ce": 0.16031217575073242, "loss_lvr": 0.826233446598053, "loss_mode_switch": 0.0, "loss_total": 0.24293552339076996, "step": 1440 }, { "batch_size": 1, "epoch": 0.576, "step": 1440, "tokens_per_device": 4876 }, { "epoch": 0.576, "loss_ce": 0.0016814350383356214, "loss_lvr": 0.4573332667350769, "loss_mode_switch": 0.0, "loss_total": 0.047414764761924744, "step": 1440 }, { "batch_size": 4, "epoch": 0.576, "step": 1440, "tokens_per_device": 4476 }, { "epoch": 0.576, "loss_ce": 0.022957710549235344, "loss_lvr": 0.6140662431716919, "loss_mode_switch": 0.0, "loss_total": 0.08436433225870132, "step": 1440 }, { "batch_size": 1, "epoch": 0.576, "step": 1440, "tokens_per_device": 4753 }, { "epoch": 0.576, "loss_ce": 0.009574176743626595, "loss_lvr": 0.5518411993980408, "loss_mode_switch": 0.0, "loss_total": 0.06475830078125, "step": 1440 }, { "batch_size": 4, "epoch": 0.576, "step": 1440, "tokens_per_device": 15432 }, { "epoch": 0.576, "loss_ce": 0.12292251735925674, "loss_lvr": 0.7369824051856995, "loss_mode_switch": 0.0, "loss_total": 0.19662076234817505, "step": 1440 }, { "batch_size": 4, "epoch": 0.576, "step": 1440, "tokens_per_device": 9684 }, { "epoch": 0.576, "loss_ce": 0.06499750912189484, "loss_lvr": 0.7664613723754883, "loss_mode_switch": 0.0, "loss_total": 0.14164364337921143, "step": 1440 }, { "batch_size": 1, "epoch": 0.576, "step": 1440, "tokens_per_device": 5156 }, { "epoch": 0.576, "loss_ce": 0.5266501903533936, "loss_lvr": 0.2573024034500122, "loss_mode_switch": 0.0, "loss_total": 0.5523804426193237, "step": 1440 }, { "epoch": 0.5764, "grad_norm": 1.5055656433105469, "learning_rate": 4.012242446941765e-06, "loss": 0.2743, "step": 1441 }, { "batch_size": 1, "epoch": 0.5764, "step": 1441, "tokens_per_device": 4897 }, { "epoch": 0.5764, "loss_ce": 0.01231556199491024, "loss_lvr": 0.3432178497314453, "loss_mode_switch": 0.0, "loss_total": 0.04663734883069992, "step": 1441 }, { "batch_size": 4, "epoch": 0.5764, "step": 1441, "tokens_per_device": 4248 }, { "epoch": 0.5764, "loss_ce": 0.04343540221452713, "loss_lvr": 0.8917211294174194, "loss_mode_switch": 0.0, "loss_total": 0.13260751962661743, "step": 1441 }, { "batch_size": 4, "epoch": 0.5764, "step": 1441, "tokens_per_device": 1500 }, { "epoch": 0.5764, "loss_ce": 0.7376054525375366, "loss_lvr": 0.8853891491889954, "loss_mode_switch": 0.0, "loss_total": 0.8261443376541138, "step": 1441 }, { "batch_size": 1, "epoch": 0.5764, "step": 1441, "tokens_per_device": 4876 }, { "epoch": 0.5764, "loss_ce": 0.0040837498381733894, "loss_lvr": 0.5034196972846985, "loss_mode_switch": 0.0, "loss_total": 0.05442572012543678, "step": 1441 }, { "batch_size": 4, "epoch": 0.5764, "step": 1441, "tokens_per_device": 4716 }, { "epoch": 0.5764, "loss_ce": 0.1658950001001358, "loss_lvr": 0.7914022207260132, "loss_mode_switch": 0.0, "loss_total": 0.24503523111343384, "step": 1441 }, { "batch_size": 4, "epoch": 0.5764, "step": 1441, "tokens_per_device": 1228 }, { "epoch": 0.5764, "loss_ce": 0.536763608455658, "loss_lvr": 1.3057538270950317, "loss_mode_switch": 0.0, "loss_total": 0.6673389673233032, "step": 1441 }, { "batch_size": 4, "epoch": 0.5764, "step": 1441, "tokens_per_device": 12420 }, { "epoch": 0.5764, "loss_ce": 0.09265054017305374, "loss_lvr": 0.735177755355835, "loss_mode_switch": 0.0, "loss_total": 0.16616831719875336, "step": 1441 }, { "batch_size": 4, "epoch": 0.5764, "step": 1441, "tokens_per_device": 1628 }, { "epoch": 0.5764, "loss_ce": 0.031224779784679413, "loss_lvr": 0.9837675094604492, "loss_mode_switch": 0.0, "loss_total": 0.12960153818130493, "step": 1441 }, { "epoch": 0.5768, "grad_norm": 1.1248102188110352, "learning_rate": 4.005893422472125e-06, "loss": 0.2614, "step": 1442 }, { "batch_size": 4, "epoch": 0.5768, "step": 1442, "tokens_per_device": 1452 }, { "epoch": 0.5768, "loss_ce": 0.06538895517587662, "loss_lvr": 1.0304404497146606, "loss_mode_switch": 0.0, "loss_total": 0.16843301057815552, "step": 1442 }, { "batch_size": 1, "epoch": 0.5768, "step": 1442, "tokens_per_device": 5026 }, { "epoch": 0.5768, "loss_ce": 1.3057273626327515, "loss_lvr": 1.0340042114257812, "loss_mode_switch": 0.0, "loss_total": 1.4091278314590454, "step": 1442 }, { "batch_size": 1, "epoch": 0.5768, "step": 1442, "tokens_per_device": 5732 }, { "epoch": 0.5768, "loss_ce": 0.049775637686252594, "loss_lvr": 0.3567029535770416, "loss_mode_switch": 0.0, "loss_total": 0.08544593304395676, "step": 1442 }, { "batch_size": 1, "epoch": 0.5768, "step": 1442, "tokens_per_device": 6782 }, { "epoch": 0.5768, "loss_ce": 0.08133971691131592, "loss_lvr": 0.3682376742362976, "loss_mode_switch": 0.0, "loss_total": 0.11816348135471344, "step": 1442 }, { "batch_size": 4, "epoch": 0.5768, "step": 1442, "tokens_per_device": 3872 }, { "epoch": 0.5768, "loss_ce": 0.206288680434227, "loss_lvr": 0.840437650680542, "loss_mode_switch": 0.0, "loss_total": 0.2903324365615845, "step": 1442 }, { "batch_size": 4, "epoch": 0.5768, "step": 1442, "tokens_per_device": 4820 }, { "epoch": 0.5768, "loss_ce": 0.35134202241897583, "loss_lvr": 0.8921810984611511, "loss_mode_switch": 0.0, "loss_total": 0.44056013226509094, "step": 1442 }, { "batch_size": 4, "epoch": 0.5768, "step": 1442, "tokens_per_device": 1488 }, { "epoch": 0.5768, "loss_ce": 0.5997143983840942, "loss_lvr": 0.8565994501113892, "loss_mode_switch": 0.0, "loss_total": 0.6853743195533752, "step": 1442 }, { "batch_size": 4, "epoch": 0.5768, "step": 1442, "tokens_per_device": 4204 }, { "epoch": 0.5768, "loss_ce": 0.07159052044153214, "loss_lvr": 1.0981823205947876, "loss_mode_switch": 0.0, "loss_total": 0.18140876293182373, "step": 1442 }, { "epoch": 0.5772, "grad_norm": 1.426367998123169, "learning_rate": 3.9995460664369254e-06, "loss": 0.3366, "step": 1443 }, { "batch_size": 4, "epoch": 0.5772, "step": 1443, "tokens_per_device": 4224 }, { "epoch": 0.5772, "loss_ce": 0.06909427791833878, "loss_lvr": 0.8592715859413147, "loss_mode_switch": 0.0, "loss_total": 0.15502142906188965, "step": 1443 }, { "batch_size": 4, "epoch": 0.5772, "step": 1443, "tokens_per_device": 3924 }, { "epoch": 0.5772, "loss_ce": 0.28772494196891785, "loss_lvr": 0.7621371746063232, "loss_mode_switch": 0.0, "loss_total": 0.36393865942955017, "step": 1443 }, { "batch_size": 1, "epoch": 0.5772, "step": 1443, "tokens_per_device": 4776 }, { "epoch": 0.5772, "loss_ce": 0.06456417590379715, "loss_lvr": 0.36405059695243835, "loss_mode_switch": 0.0, "loss_total": 0.10096924006938934, "step": 1443 }, { "batch_size": 4, "epoch": 0.5772, "step": 1443, "tokens_per_device": 4272 }, { "epoch": 0.5772, "loss_ce": 0.43941760063171387, "loss_lvr": 1.0625383853912354, "loss_mode_switch": 0.0, "loss_total": 0.5456714630126953, "step": 1443 }, { "batch_size": 4, "epoch": 0.5772, "step": 1443, "tokens_per_device": 13256 }, { "epoch": 0.5772, "loss_ce": 0.43365854024887085, "loss_lvr": 0.8064229488372803, "loss_mode_switch": 0.0, "loss_total": 0.5143008232116699, "step": 1443 }, { "batch_size": 1, "epoch": 0.5772, "step": 1443, "tokens_per_device": 5127 }, { "epoch": 0.5772, "loss_ce": 0.007921812124550343, "loss_lvr": 0.1643553227186203, "loss_mode_switch": 0.0, "loss_total": 0.024357344955205917, "step": 1443 }, { "batch_size": 1, "epoch": 0.5772, "step": 1443, "tokens_per_device": 6273 }, { "epoch": 0.5772, "loss_ce": 0.06481834501028061, "loss_lvr": 0.2615963816642761, "loss_mode_switch": 0.0, "loss_total": 0.0909779816865921, "step": 1443 }, { "batch_size": 4, "epoch": 0.5772, "step": 1443, "tokens_per_device": 4236 }, { "epoch": 0.5772, "loss_ce": 0.2876480519771576, "loss_lvr": 0.856581449508667, "loss_mode_switch": 0.0, "loss_total": 0.3733062148094177, "step": 1443 }, { "epoch": 0.5776, "grad_norm": 1.256171703338623, "learning_rate": 3.993200389489096e-06, "loss": 0.2691, "step": 1444 }, { "batch_size": 4, "epoch": 0.5776, "step": 1444, "tokens_per_device": 4456 }, { "epoch": 0.5776, "loss_ce": 0.2374708205461502, "loss_lvr": 0.896929919719696, "loss_mode_switch": 0.0, "loss_total": 0.32716381549835205, "step": 1444 }, { "batch_size": 4, "epoch": 0.5776, "step": 1444, "tokens_per_device": 2824 }, { "epoch": 0.5776, "loss_ce": 0.4422229528427124, "loss_lvr": 0.8194237947463989, "loss_mode_switch": 0.0, "loss_total": 0.5241653323173523, "step": 1444 }, { "batch_size": 4, "epoch": 0.5776, "step": 1444, "tokens_per_device": 2660 }, { "epoch": 0.5776, "loss_ce": 0.251962810754776, "loss_lvr": 0.616851270198822, "loss_mode_switch": 0.0, "loss_total": 0.31364792585372925, "step": 1444 }, { "batch_size": 4, "epoch": 0.5776, "step": 1444, "tokens_per_device": 4676 }, { "epoch": 0.5776, "loss_ce": 0.31837084889411926, "loss_lvr": 0.43404334783554077, "loss_mode_switch": 0.0, "loss_total": 0.3617751896381378, "step": 1444 }, { "batch_size": 4, "epoch": 0.5776, "step": 1444, "tokens_per_device": 4412 }, { "epoch": 0.5776, "loss_ce": 0.03131138160824776, "loss_lvr": 0.9865302443504333, "loss_mode_switch": 0.0, "loss_total": 0.1299644112586975, "step": 1444 }, { "batch_size": 4, "epoch": 0.5776, "step": 1444, "tokens_per_device": 5256 }, { "epoch": 0.5776, "loss_ce": 0.40755826234817505, "loss_lvr": 0.824691116809845, "loss_mode_switch": 0.0, "loss_total": 0.49002736806869507, "step": 1444 }, { "batch_size": 1, "epoch": 0.5776, "step": 1444, "tokens_per_device": 5115 }, { "epoch": 0.5776, "loss_ce": 0.042938556522130966, "loss_lvr": 0.3684235215187073, "loss_mode_switch": 0.0, "loss_total": 0.07978090643882751, "step": 1444 }, { "batch_size": 1, "epoch": 0.5776, "step": 1444, "tokens_per_device": 5160 }, { "epoch": 0.5776, "loss_ce": 0.00954121258109808, "loss_lvr": 0.20625783503055573, "loss_mode_switch": 0.0, "loss_total": 0.030166994780302048, "step": 1444 }, { "epoch": 0.578, "grad_norm": 1.2256006002426147, "learning_rate": 3.98685640227875e-06, "loss": 0.2861, "step": 1445 }, { "batch_size": 1, "epoch": 0.578, "step": 1445, "tokens_per_device": 4899 }, { "epoch": 0.578, "loss_ce": 0.006129802670329809, "loss_lvr": 0.6294403076171875, "loss_mode_switch": 0.0, "loss_total": 0.06907383352518082, "step": 1445 }, { "batch_size": 4, "epoch": 0.578, "step": 1445, "tokens_per_device": 4392 }, { "epoch": 0.578, "loss_ce": 0.2750936448574066, "loss_lvr": 0.8692413568496704, "loss_mode_switch": 0.0, "loss_total": 0.36201778054237366, "step": 1445 }, { "batch_size": 1, "epoch": 0.578, "step": 1445, "tokens_per_device": 4988 }, { "epoch": 0.578, "loss_ce": 0.008379108272492886, "loss_lvr": 0.5410100221633911, "loss_mode_switch": 0.0, "loss_total": 0.06248011067509651, "step": 1445 }, { "batch_size": 1, "epoch": 0.578, "step": 1445, "tokens_per_device": 4874 }, { "epoch": 0.578, "loss_ce": 0.0002532204380258918, "loss_lvr": 0.8292284607887268, "loss_mode_switch": 0.0, "loss_total": 0.08317606896162033, "step": 1445 }, { "batch_size": 4, "epoch": 0.578, "step": 1445, "tokens_per_device": 4600 }, { "epoch": 0.578, "loss_ce": 0.31109297275543213, "loss_lvr": 0.9007512331008911, "loss_mode_switch": 0.0, "loss_total": 0.4011681079864502, "step": 1445 }, { "batch_size": 4, "epoch": 0.578, "step": 1445, "tokens_per_device": 2632 }, { "epoch": 0.578, "loss_ce": 0.1317678987979889, "loss_lvr": 0.8447809815406799, "loss_mode_switch": 0.0, "loss_total": 0.21624600887298584, "step": 1445 }, { "batch_size": 4, "epoch": 0.578, "step": 1445, "tokens_per_device": 3128 }, { "epoch": 0.578, "loss_ce": 0.2989330291748047, "loss_lvr": 0.932259738445282, "loss_mode_switch": 0.0, "loss_total": 0.39215901494026184, "step": 1445 }, { "batch_size": 4, "epoch": 0.578, "step": 1445, "tokens_per_device": 7360 }, { "epoch": 0.578, "loss_ce": 0.012516360729932785, "loss_lvr": 1.6044797897338867, "loss_mode_switch": 0.0, "loss_total": 0.17296434938907623, "step": 1445 }, { "epoch": 0.5784, "grad_norm": 1.4376089572906494, "learning_rate": 3.98051411545316e-06, "loss": 0.3068, "step": 1446 }, { "batch_size": 1, "epoch": 0.5784, "step": 1446, "tokens_per_device": 4868 }, { "epoch": 0.5784, "loss_ce": 0.003085243981331587, "loss_lvr": 0.33329907059669495, "loss_mode_switch": 0.0, "loss_total": 0.03641515225172043, "step": 1446 }, { "batch_size": 4, "epoch": 0.5784, "step": 1446, "tokens_per_device": 3408 }, { "epoch": 0.5784, "loss_ce": 0.20096759498119354, "loss_lvr": 0.9216251373291016, "loss_mode_switch": 0.0, "loss_total": 0.293130099773407, "step": 1446 }, { "batch_size": 1, "epoch": 0.5784, "step": 1446, "tokens_per_device": 5124 }, { "epoch": 0.5784, "loss_ce": 0.0010997226927429438, "loss_lvr": 0.5558450222015381, "loss_mode_switch": 0.0, "loss_total": 0.0566842257976532, "step": 1446 }, { "batch_size": 4, "epoch": 0.5784, "step": 1446, "tokens_per_device": 4172 }, { "epoch": 0.5784, "loss_ce": 0.46102625131607056, "loss_lvr": 0.8953158259391785, "loss_mode_switch": 0.0, "loss_total": 0.5505578517913818, "step": 1446 }, { "batch_size": 1, "epoch": 0.5784, "step": 1446, "tokens_per_device": 5100 }, { "epoch": 0.5784, "loss_ce": 0.05226054787635803, "loss_lvr": 1.5063596963882446, "loss_mode_switch": 0.0, "loss_total": 0.20289652049541473, "step": 1446 }, { "batch_size": 4, "epoch": 0.5784, "step": 1446, "tokens_per_device": 3772 }, { "epoch": 0.5784, "loss_ce": 0.2079709768295288, "loss_lvr": 0.9609051942825317, "loss_mode_switch": 0.0, "loss_total": 0.30406150221824646, "step": 1446 }, { "batch_size": 4, "epoch": 0.5784, "step": 1446, "tokens_per_device": 5664 }, { "epoch": 0.5784, "loss_ce": 0.009815986268222332, "loss_lvr": 0.6769182085990906, "loss_mode_switch": 0.0, "loss_total": 0.07750780880451202, "step": 1446 }, { "batch_size": 1, "epoch": 0.5784, "step": 1446, "tokens_per_device": 5167 }, { "epoch": 0.5784, "loss_ce": 0.029554784297943115, "loss_lvr": 0.3664466440677643, "loss_mode_switch": 0.0, "loss_total": 0.06619945168495178, "step": 1446 }, { "epoch": 0.5788, "grad_norm": 1.1369205713272095, "learning_rate": 3.974173539656747e-06, "loss": 0.2889, "step": 1447 }, { "batch_size": 4, "epoch": 0.5788, "step": 1447, "tokens_per_device": 8676 }, { "epoch": 0.5788, "loss_ce": 0.20610561966896057, "loss_lvr": 0.7605685591697693, "loss_mode_switch": 0.0, "loss_total": 0.28216248750686646, "step": 1447 }, { "batch_size": 1, "epoch": 0.5788, "step": 1447, "tokens_per_device": 5447 }, { "epoch": 0.5788, "loss_ce": 0.5997105836868286, "loss_lvr": 0.2251998782157898, "loss_mode_switch": 0.0, "loss_total": 0.622230589389801, "step": 1447 }, { "batch_size": 4, "epoch": 0.5788, "step": 1447, "tokens_per_device": 2784 }, { "epoch": 0.5788, "loss_ce": 0.4568520188331604, "loss_lvr": 1.9590145349502563, "loss_mode_switch": 0.0, "loss_total": 0.652753472328186, "step": 1447 }, { "batch_size": 1, "epoch": 0.5788, "step": 1447, "tokens_per_device": 4918 }, { "epoch": 0.5788, "loss_ce": 0.11555416882038116, "loss_lvr": 0.6136837005615234, "loss_mode_switch": 0.0, "loss_total": 0.17692254483699799, "step": 1447 }, { "batch_size": 4, "epoch": 0.5788, "step": 1447, "tokens_per_device": 3828 }, { "epoch": 0.5788, "loss_ce": 0.0831495150923729, "loss_lvr": 0.7741478681564331, "loss_mode_switch": 0.0, "loss_total": 0.16056430339813232, "step": 1447 }, { "batch_size": 4, "epoch": 0.5788, "step": 1447, "tokens_per_device": 4300 }, { "epoch": 0.5788, "loss_ce": 0.47806689143180847, "loss_lvr": 1.1156421899795532, "loss_mode_switch": 0.0, "loss_total": 0.5896310806274414, "step": 1447 }, { "batch_size": 4, "epoch": 0.5788, "step": 1447, "tokens_per_device": 1420 }, { "epoch": 0.5788, "loss_ce": 0.17449548840522766, "loss_lvr": 1.180010199546814, "loss_mode_switch": 0.0, "loss_total": 0.2924965023994446, "step": 1447 }, { "batch_size": 4, "epoch": 0.5788, "step": 1447, "tokens_per_device": 4100 }, { "epoch": 0.5788, "loss_ce": 0.6885941028594971, "loss_lvr": 0.5921689867973328, "loss_mode_switch": 0.0, "loss_total": 0.7478110194206238, "step": 1447 }, { "epoch": 0.5792, "grad_norm": 1.2564278841018677, "learning_rate": 3.967834685531066e-06, "loss": 0.2981, "step": 1448 }, { "batch_size": 4, "epoch": 0.5792, "step": 1448, "tokens_per_device": 4244 }, { "epoch": 0.5792, "loss_ce": 0.018238723278045654, "loss_lvr": 1.0808571577072144, "loss_mode_switch": 0.0, "loss_total": 0.12632444500923157, "step": 1448 }, { "batch_size": 4, "epoch": 0.5792, "step": 1448, "tokens_per_device": 6288 }, { "epoch": 0.5792, "loss_ce": 0.06032385677099228, "loss_lvr": 0.7780681848526001, "loss_mode_switch": 0.0, "loss_total": 0.13813067972660065, "step": 1448 }, { "batch_size": 1, "epoch": 0.5792, "step": 1448, "tokens_per_device": 7245 }, { "epoch": 0.5792, "loss_ce": 0.01486473809927702, "loss_lvr": 0.2533165514469147, "loss_mode_switch": 0.0, "loss_total": 0.04019639268517494, "step": 1448 }, { "batch_size": 1, "epoch": 0.5792, "step": 1448, "tokens_per_device": 5086 }, { "epoch": 0.5792, "loss_ce": 0.06966809928417206, "loss_lvr": 1.2072612047195435, "loss_mode_switch": 0.0, "loss_total": 0.19039422273635864, "step": 1448 }, { "batch_size": 4, "epoch": 0.5792, "step": 1448, "tokens_per_device": 1272 }, { "epoch": 0.5792, "loss_ce": 0.17721033096313477, "loss_lvr": 1.0575923919677734, "loss_mode_switch": 0.0, "loss_total": 0.28296956419944763, "step": 1448 }, { "batch_size": 4, "epoch": 0.5792, "step": 1448, "tokens_per_device": 5024 }, { "epoch": 0.5792, "loss_ce": 0.31407681107521057, "loss_lvr": 0.7385554909706116, "loss_mode_switch": 0.0, "loss_total": 0.38793236017227173, "step": 1448 }, { "batch_size": 4, "epoch": 0.5792, "step": 1448, "tokens_per_device": 4968 }, { "epoch": 0.5792, "loss_ce": 0.21378761529922485, "loss_lvr": 1.0440784692764282, "loss_mode_switch": 0.0, "loss_total": 0.3181954622268677, "step": 1448 }, { "batch_size": 1, "epoch": 0.5792, "step": 1448, "tokens_per_device": 5219 }, { "epoch": 0.5792, "loss_ce": 0.23464412987232208, "loss_lvr": 0.3964008390903473, "loss_mode_switch": 0.0, "loss_total": 0.2742842137813568, "step": 1448 }, { "epoch": 0.5796, "grad_norm": 1.411678433418274, "learning_rate": 3.961497563714774e-06, "loss": 0.295, "step": 1449 }, { "batch_size": 1, "epoch": 0.5796, "step": 1449, "tokens_per_device": 4969 }, { "epoch": 0.5796, "loss_ce": 0.007799108047038317, "loss_lvr": 0.26228806376457214, "loss_mode_switch": 0.0, "loss_total": 0.03402791544795036, "step": 1449 }, { "batch_size": 4, "epoch": 0.5796, "step": 1449, "tokens_per_device": 5820 }, { "epoch": 0.5796, "loss_ce": 0.5149223804473877, "loss_lvr": 1.0310344696044922, "loss_mode_switch": 0.0, "loss_total": 0.6180258393287659, "step": 1449 }, { "batch_size": 4, "epoch": 0.5796, "step": 1449, "tokens_per_device": 8056 }, { "epoch": 0.5796, "loss_ce": 0.08342619985342026, "loss_lvr": 0.604155957698822, "loss_mode_switch": 0.0, "loss_total": 0.14384180307388306, "step": 1449 }, { "batch_size": 4, "epoch": 0.5796, "step": 1449, "tokens_per_device": 5252 }, { "epoch": 0.5796, "loss_ce": 0.1090720146894455, "loss_lvr": 0.8102192878723145, "loss_mode_switch": 0.0, "loss_total": 0.19009393453598022, "step": 1449 }, { "batch_size": 4, "epoch": 0.5796, "step": 1449, "tokens_per_device": 4540 }, { "epoch": 0.5796, "loss_ce": 0.02375011146068573, "loss_lvr": 1.0831364393234253, "loss_mode_switch": 0.0, "loss_total": 0.13206374645233154, "step": 1449 }, { "batch_size": 1, "epoch": 0.5796, "step": 1449, "tokens_per_device": 4888 }, { "epoch": 0.5796, "loss_ce": 0.002278541447594762, "loss_lvr": 1.7086853981018066, "loss_mode_switch": 0.0, "loss_total": 0.1731470823287964, "step": 1449 }, { "batch_size": 1, "epoch": 0.5796, "step": 1449, "tokens_per_device": 4923 }, { "epoch": 0.5796, "loss_ce": 0.01297579426318407, "loss_lvr": 0.487457811832428, "loss_mode_switch": 0.0, "loss_total": 0.061721574515104294, "step": 1449 }, { "batch_size": 4, "epoch": 0.5796, "step": 1449, "tokens_per_device": 4376 }, { "epoch": 0.5796, "loss_ce": 0.2956599295139313, "loss_lvr": 0.882224440574646, "loss_mode_switch": 0.0, "loss_total": 0.3838823735713959, "step": 1449 }, { "epoch": 0.58, "grad_norm": 1.176271677017212, "learning_rate": 3.955162184843625e-06, "loss": 0.2512, "step": 1450 }, { "batch_size": 4, "epoch": 0.58, "step": 1450, "tokens_per_device": 2672 }, { "epoch": 0.58, "loss_ce": 0.31470999121665955, "loss_lvr": 1.6834384202957153, "loss_mode_switch": 0.0, "loss_total": 0.4830538332462311, "step": 1450 }, { "batch_size": 1, "epoch": 0.58, "step": 1450, "tokens_per_device": 4904 }, { "epoch": 0.58, "loss_ce": 0.21819446980953217, "loss_lvr": 0.6802489757537842, "loss_mode_switch": 0.0, "loss_total": 0.28621935844421387, "step": 1450 }, { "batch_size": 1, "epoch": 0.58, "step": 1450, "tokens_per_device": 4946 }, { "epoch": 0.58, "loss_ce": 0.0431315116584301, "loss_lvr": 0.49444225430488586, "loss_mode_switch": 0.0, "loss_total": 0.09257573634386063, "step": 1450 }, { "batch_size": 4, "epoch": 0.58, "step": 1450, "tokens_per_device": 6088 }, { "epoch": 0.58, "loss_ce": 0.04299803823232651, "loss_lvr": 0.437399297952652, "loss_mode_switch": 0.0, "loss_total": 0.0867379680275917, "step": 1450 }, { "batch_size": 4, "epoch": 0.58, "step": 1450, "tokens_per_device": 3100 }, { "epoch": 0.58, "loss_ce": 0.23256102204322815, "loss_lvr": 0.861619770526886, "loss_mode_switch": 0.0, "loss_total": 0.31872299313545227, "step": 1450 }, { "batch_size": 4, "epoch": 0.58, "step": 1450, "tokens_per_device": 4428 }, { "epoch": 0.58, "loss_ce": 0.44518041610717773, "loss_lvr": 0.9097790718078613, "loss_mode_switch": 0.0, "loss_total": 0.5361583232879639, "step": 1450 }, { "batch_size": 4, "epoch": 0.58, "step": 1450, "tokens_per_device": 3860 }, { "epoch": 0.58, "loss_ce": 0.31864482164382935, "loss_lvr": 1.0789545774459839, "loss_mode_switch": 0.0, "loss_total": 0.4265402853488922, "step": 1450 }, { "batch_size": 1, "epoch": 0.58, "step": 1450, "tokens_per_device": 4905 }, { "epoch": 0.58, "loss_ce": 0.007420609705150127, "loss_lvr": 0.2846924662590027, "loss_mode_switch": 0.0, "loss_total": 0.03588985651731491, "step": 1450 }, { "epoch": 0.5804, "grad_norm": 1.2725985050201416, "learning_rate": 3.948828559550448e-06, "loss": 0.2884, "step": 1451 }, { "batch_size": 4, "epoch": 0.5804, "step": 1451, "tokens_per_device": 4428 }, { "epoch": 0.5804, "loss_ce": 0.15952247381210327, "loss_lvr": 0.7942331433296204, "loss_mode_switch": 0.0, "loss_total": 0.23894578218460083, "step": 1451 }, { "batch_size": 1, "epoch": 0.5804, "step": 1451, "tokens_per_device": 4907 }, { "epoch": 0.5804, "loss_ce": 0.0484158955514431, "loss_lvr": 0.30312222242355347, "loss_mode_switch": 0.0, "loss_total": 0.07872811704874039, "step": 1451 }, { "batch_size": 4, "epoch": 0.5804, "step": 1451, "tokens_per_device": 9604 }, { "epoch": 0.5804, "loss_ce": 0.23040716350078583, "loss_lvr": 0.980594277381897, "loss_mode_switch": 0.0, "loss_total": 0.32846659421920776, "step": 1451 }, { "batch_size": 4, "epoch": 0.5804, "step": 1451, "tokens_per_device": 1300 }, { "epoch": 0.5804, "loss_ce": 0.20489467680454254, "loss_lvr": 1.1015986204147339, "loss_mode_switch": 0.0, "loss_total": 0.3150545358657837, "step": 1451 }, { "batch_size": 1, "epoch": 0.5804, "step": 1451, "tokens_per_device": 5111 }, { "epoch": 0.5804, "loss_ce": 0.10189402103424072, "loss_lvr": 0.24703948199748993, "loss_mode_switch": 0.0, "loss_total": 0.12659797072410583, "step": 1451 }, { "batch_size": 4, "epoch": 0.5804, "step": 1451, "tokens_per_device": 4360 }, { "epoch": 0.5804, "loss_ce": 0.07105034589767456, "loss_lvr": 0.7649926543235779, "loss_mode_switch": 0.0, "loss_total": 0.1475496143102646, "step": 1451 }, { "batch_size": 1, "epoch": 0.5804, "step": 1451, "tokens_per_device": 5082 }, { "epoch": 0.5804, "loss_ce": 0.00499916123226285, "loss_lvr": 0.2834910750389099, "loss_mode_switch": 0.0, "loss_total": 0.03334826976060867, "step": 1451 }, { "batch_size": 4, "epoch": 0.5804, "step": 1451, "tokens_per_device": 2656 }, { "epoch": 0.5804, "loss_ce": 0.16028665006160736, "loss_lvr": 0.9118282794952393, "loss_mode_switch": 0.0, "loss_total": 0.2514694929122925, "step": 1451 }, { "epoch": 0.5808, "grad_norm": 1.5361253023147583, "learning_rate": 3.942496698465125e-06, "loss": 0.2294, "step": 1452 }, { "batch_size": 4, "epoch": 0.5808, "step": 1452, "tokens_per_device": 4260 }, { "epoch": 0.5808, "loss_ce": 0.19776469469070435, "loss_lvr": 0.7059106826782227, "loss_mode_switch": 0.0, "loss_total": 0.26835575699806213, "step": 1452 }, { "batch_size": 4, "epoch": 0.5808, "step": 1452, "tokens_per_device": 2640 }, { "epoch": 0.5808, "loss_ce": 0.13942095637321472, "loss_lvr": 0.7456711530685425, "loss_mode_switch": 0.0, "loss_total": 0.2139880657196045, "step": 1452 }, { "batch_size": 1, "epoch": 0.5808, "step": 1452, "tokens_per_device": 5105 }, { "epoch": 0.5808, "loss_ce": 0.0037695025093853474, "loss_lvr": 0.40495702624320984, "loss_mode_switch": 0.0, "loss_total": 0.04426520690321922, "step": 1452 }, { "batch_size": 1, "epoch": 0.5808, "step": 1452, "tokens_per_device": 5607 }, { "epoch": 0.5808, "loss_ce": 0.11097750067710876, "loss_lvr": 0.24356704950332642, "loss_mode_switch": 0.0, "loss_total": 0.13533420860767365, "step": 1452 }, { "batch_size": 1, "epoch": 0.5808, "step": 1452, "tokens_per_device": 5108 }, { "epoch": 0.5808, "loss_ce": 0.7150737047195435, "loss_lvr": 0.5766963958740234, "loss_mode_switch": 0.0, "loss_total": 0.7727433443069458, "step": 1452 }, { "batch_size": 1, "epoch": 0.5808, "step": 1452, "tokens_per_device": 5198 }, { "epoch": 0.5808, "loss_ce": 0.003275650553405285, "loss_lvr": 0.36021688580513, "loss_mode_switch": 0.0, "loss_total": 0.03929734230041504, "step": 1452 }, { "batch_size": 4, "epoch": 0.5808, "step": 1452, "tokens_per_device": 2644 }, { "epoch": 0.5808, "loss_ce": 0.10370878130197525, "loss_lvr": 0.7640649080276489, "loss_mode_switch": 0.0, "loss_total": 0.18011528253555298, "step": 1452 }, { "batch_size": 4, "epoch": 0.5808, "step": 1452, "tokens_per_device": 3376 }, { "epoch": 0.5808, "loss_ce": 0.37526553869247437, "loss_lvr": 1.134037733078003, "loss_mode_switch": 0.0, "loss_total": 0.4886693060398102, "step": 1452 }, { "epoch": 0.5812, "grad_norm": 1.3857202529907227, "learning_rate": 3.936166612214583e-06, "loss": 0.2889, "step": 1453 }, { "batch_size": 4, "epoch": 0.5812, "step": 1453, "tokens_per_device": 4944 }, { "epoch": 0.5812, "loss_ce": 0.021284194663167, "loss_lvr": 0.844830334186554, "loss_mode_switch": 0.0, "loss_total": 0.10576722770929337, "step": 1453 }, { "batch_size": 4, "epoch": 0.5812, "step": 1453, "tokens_per_device": 3876 }, { "epoch": 0.5812, "loss_ce": 0.045572444796562195, "loss_lvr": 0.6508605480194092, "loss_mode_switch": 0.0, "loss_total": 0.11065850406885147, "step": 1453 }, { "batch_size": 4, "epoch": 0.5812, "step": 1453, "tokens_per_device": 4304 }, { "epoch": 0.5812, "loss_ce": 0.3290949761867523, "loss_lvr": 0.885898232460022, "loss_mode_switch": 0.0, "loss_total": 0.41768479347229004, "step": 1453 }, { "batch_size": 4, "epoch": 0.5812, "step": 1453, "tokens_per_device": 13036 }, { "epoch": 0.5812, "loss_ce": 0.5414450168609619, "loss_lvr": 0.7691332697868347, "loss_mode_switch": 0.0, "loss_total": 0.6183583736419678, "step": 1453 }, { "batch_size": 4, "epoch": 0.5812, "step": 1453, "tokens_per_device": 2004 }, { "epoch": 0.5812, "loss_ce": 0.7870396375656128, "loss_lvr": 0.8935691118240356, "loss_mode_switch": 0.0, "loss_total": 0.8763965368270874, "step": 1453 }, { "batch_size": 4, "epoch": 0.5812, "step": 1453, "tokens_per_device": 10648 }, { "epoch": 0.5812, "loss_ce": 0.02145194076001644, "loss_lvr": 0.8472621440887451, "loss_mode_switch": 0.0, "loss_total": 0.1061781570315361, "step": 1453 }, { "batch_size": 4, "epoch": 0.5812, "step": 1453, "tokens_per_device": 4872 }, { "epoch": 0.5812, "loss_ce": 0.16438715159893036, "loss_lvr": 0.9253931641578674, "loss_mode_switch": 0.0, "loss_total": 0.2569264769554138, "step": 1453 }, { "batch_size": 1, "epoch": 0.5812, "step": 1453, "tokens_per_device": 4925 }, { "epoch": 0.5812, "loss_ce": 0.45654618740081787, "loss_lvr": 0.35757434368133545, "loss_mode_switch": 0.0, "loss_total": 0.49230360984802246, "step": 1453 }, { "epoch": 0.5816, "grad_norm": 1.3184326887130737, "learning_rate": 3.9298383114227675e-06, "loss": 0.2878, "step": 1454 }, { "batch_size": 4, "epoch": 0.5816, "step": 1454, "tokens_per_device": 2008 }, { "epoch": 0.5816, "loss_ce": 0.25129416584968567, "loss_lvr": 0.9156534671783447, "loss_mode_switch": 0.0, "loss_total": 0.34285950660705566, "step": 1454 }, { "batch_size": 4, "epoch": 0.5816, "step": 1454, "tokens_per_device": 3108 }, { "epoch": 0.5816, "loss_ce": 0.13113637268543243, "loss_lvr": 0.9261196255683899, "loss_mode_switch": 0.0, "loss_total": 0.2237483263015747, "step": 1454 }, { "batch_size": 4, "epoch": 0.5816, "step": 1454, "tokens_per_device": 2572 }, { "epoch": 0.5816, "loss_ce": 0.15832598507404327, "loss_lvr": 0.8322586417198181, "loss_mode_switch": 0.0, "loss_total": 0.24155184626579285, "step": 1454 }, { "batch_size": 4, "epoch": 0.5816, "step": 1454, "tokens_per_device": 7308 }, { "epoch": 0.5816, "loss_ce": 0.28147196769714355, "loss_lvr": 0.6323334574699402, "loss_mode_switch": 0.0, "loss_total": 0.3447053134441376, "step": 1454 }, { "batch_size": 4, "epoch": 0.5816, "step": 1454, "tokens_per_device": 13068 }, { "epoch": 0.5816, "loss_ce": 0.4458521604537964, "loss_lvr": 0.5715933442115784, "loss_mode_switch": 0.0, "loss_total": 0.5030114650726318, "step": 1454 }, { "batch_size": 4, "epoch": 0.5816, "step": 1454, "tokens_per_device": 4332 }, { "epoch": 0.5816, "loss_ce": 0.053394172340631485, "loss_lvr": 0.7617297172546387, "loss_mode_switch": 0.0, "loss_total": 0.12956714630126953, "step": 1454 }, { "batch_size": 4, "epoch": 0.5816, "step": 1454, "tokens_per_device": 4128 }, { "epoch": 0.5816, "loss_ce": 0.39602333307266235, "loss_lvr": 0.7636247873306274, "loss_mode_switch": 0.0, "loss_total": 0.47238582372665405, "step": 1454 }, { "batch_size": 4, "epoch": 0.5816, "step": 1454, "tokens_per_device": 3988 }, { "epoch": 0.5816, "loss_ce": 0.21146103739738464, "loss_lvr": 0.6691829562187195, "loss_mode_switch": 0.0, "loss_total": 0.27837932109832764, "step": 1454 }, { "epoch": 0.582, "grad_norm": 1.1704645156860352, "learning_rate": 3.9235118067106255e-06, "loss": 0.2706, "step": 1455 }, { "batch_size": 4, "epoch": 0.582, "step": 1455, "tokens_per_device": 7264 }, { "epoch": 0.582, "loss_ce": 0.18140867352485657, "loss_lvr": 0.6661916375160217, "loss_mode_switch": 0.0, "loss_total": 0.24802783131599426, "step": 1455 }, { "batch_size": 4, "epoch": 0.582, "step": 1455, "tokens_per_device": 11240 }, { "epoch": 0.582, "loss_ce": 0.4728274643421173, "loss_lvr": 0.8455978631973267, "loss_mode_switch": 0.0, "loss_total": 0.5573872327804565, "step": 1455 }, { "batch_size": 1, "epoch": 0.582, "step": 1455, "tokens_per_device": 5119 }, { "epoch": 0.582, "loss_ce": 0.0019628421869128942, "loss_lvr": 0.3445212244987488, "loss_mode_switch": 0.0, "loss_total": 0.03641496226191521, "step": 1455 }, { "batch_size": 4, "epoch": 0.582, "step": 1455, "tokens_per_device": 4656 }, { "epoch": 0.582, "loss_ce": 0.04652978107333183, "loss_lvr": 0.5941017270088196, "loss_mode_switch": 0.0, "loss_total": 0.10593995451927185, "step": 1455 }, { "batch_size": 4, "epoch": 0.582, "step": 1455, "tokens_per_device": 2544 }, { "epoch": 0.582, "loss_ce": 0.06440365314483643, "loss_lvr": 1.1159610748291016, "loss_mode_switch": 0.0, "loss_total": 0.17599976062774658, "step": 1455 }, { "batch_size": 1, "epoch": 0.582, "step": 1455, "tokens_per_device": 5040 }, { "epoch": 0.582, "loss_ce": 0.45414361357688904, "loss_lvr": 0.3598364591598511, "loss_mode_switch": 0.0, "loss_total": 0.4901272654533386, "step": 1455 }, { "batch_size": 4, "epoch": 0.582, "step": 1455, "tokens_per_device": 5180 }, { "epoch": 0.582, "loss_ce": 0.02907472848892212, "loss_lvr": 0.7477611899375916, "loss_mode_switch": 0.0, "loss_total": 0.1038508489727974, "step": 1455 }, { "batch_size": 1, "epoch": 0.582, "step": 1455, "tokens_per_device": 4171 }, { "epoch": 0.582, "loss_ce": 0.0025293156504631042, "loss_lvr": 0.31154516339302063, "loss_mode_switch": 0.0, "loss_total": 0.03368383273482323, "step": 1455 }, { "epoch": 0.5824, "grad_norm": 1.305485725402832, "learning_rate": 3.917187108696094e-06, "loss": 0.2504, "step": 1456 }, { "batch_size": 1, "epoch": 0.5824, "step": 1456, "tokens_per_device": 5091 }, { "epoch": 0.5824, "loss_ce": 0.2340056598186493, "loss_lvr": 0.5260623693466187, "loss_mode_switch": 0.0, "loss_total": 0.2866118848323822, "step": 1456 }, { "batch_size": 1, "epoch": 0.5824, "step": 1456, "tokens_per_device": 4878 }, { "epoch": 0.5824, "loss_ce": 0.02174713835120201, "loss_lvr": 0.21794241666793823, "loss_mode_switch": 0.0, "loss_total": 0.043541379272937775, "step": 1456 }, { "batch_size": 4, "epoch": 0.5824, "step": 1456, "tokens_per_device": 7560 }, { "epoch": 0.5824, "loss_ce": 0.012261460535228252, "loss_lvr": 0.8426356315612793, "loss_mode_switch": 0.0, "loss_total": 0.09652502089738846, "step": 1456 }, { "batch_size": 1, "epoch": 0.5824, "step": 1456, "tokens_per_device": 4943 }, { "epoch": 0.5824, "loss_ce": 0.34995830059051514, "loss_lvr": 0.42122146487236023, "loss_mode_switch": 0.0, "loss_total": 0.3920804560184479, "step": 1456 }, { "batch_size": 1, "epoch": 0.5824, "step": 1456, "tokens_per_device": 5113 }, { "epoch": 0.5824, "loss_ce": 0.12168307602405548, "loss_lvr": 0.32418695092201233, "loss_mode_switch": 0.0, "loss_total": 0.15410177409648895, "step": 1456 }, { "batch_size": 4, "epoch": 0.5824, "step": 1456, "tokens_per_device": 8124 }, { "epoch": 0.5824, "loss_ce": 0.6224623322486877, "loss_lvr": 0.5707080960273743, "loss_mode_switch": 0.0, "loss_total": 0.6795331239700317, "step": 1456 }, { "batch_size": 1, "epoch": 0.5824, "step": 1456, "tokens_per_device": 4823 }, { "epoch": 0.5824, "loss_ce": 0.003530059941112995, "loss_lvr": 0.7216542363166809, "loss_mode_switch": 0.0, "loss_total": 0.07569548487663269, "step": 1456 }, { "batch_size": 4, "epoch": 0.5824, "step": 1456, "tokens_per_device": 4528 }, { "epoch": 0.5824, "loss_ce": 0.20950527489185333, "loss_lvr": 0.7088296413421631, "loss_mode_switch": 0.0, "loss_total": 0.2803882360458374, "step": 1456 }, { "epoch": 0.5828, "grad_norm": 1.3347687721252441, "learning_rate": 3.91086422799407e-06, "loss": 0.2602, "step": 1457 }, { "batch_size": 4, "epoch": 0.5828, "step": 1457, "tokens_per_device": 1232 }, { "epoch": 0.5828, "loss_ce": 0.2665635347366333, "loss_lvr": 1.2121633291244507, "loss_mode_switch": 0.0, "loss_total": 0.3877798616886139, "step": 1457 }, { "batch_size": 1, "epoch": 0.5828, "step": 1457, "tokens_per_device": 6079 }, { "epoch": 0.5828, "loss_ce": 0.03185630589723587, "loss_lvr": 0.2634319067001343, "loss_mode_switch": 0.0, "loss_total": 0.05819949507713318, "step": 1457 }, { "batch_size": 1, "epoch": 0.5828, "step": 1457, "tokens_per_device": 4965 }, { "epoch": 0.5828, "loss_ce": 0.0921449214220047, "loss_lvr": 0.72802734375, "loss_mode_switch": 0.0, "loss_total": 0.16494765877723694, "step": 1457 }, { "batch_size": 4, "epoch": 0.5828, "step": 1457, "tokens_per_device": 2868 }, { "epoch": 0.5828, "loss_ce": 0.3171737790107727, "loss_lvr": 0.5624961853027344, "loss_mode_switch": 0.0, "loss_total": 0.37342339754104614, "step": 1457 }, { "batch_size": 4, "epoch": 0.5828, "step": 1457, "tokens_per_device": 2572 }, { "epoch": 0.5828, "loss_ce": 0.19826550781726837, "loss_lvr": 0.9246118664741516, "loss_mode_switch": 0.0, "loss_total": 0.2907266914844513, "step": 1457 }, { "batch_size": 4, "epoch": 0.5828, "step": 1457, "tokens_per_device": 5988 }, { "epoch": 0.5828, "loss_ce": 0.026328032836318016, "loss_lvr": 0.7799548506736755, "loss_mode_switch": 0.0, "loss_total": 0.10432352125644684, "step": 1457 }, { "batch_size": 4, "epoch": 0.5828, "step": 1457, "tokens_per_device": 7680 }, { "epoch": 0.5828, "loss_ce": 0.2751491963863373, "loss_lvr": 0.4787631332874298, "loss_mode_switch": 0.0, "loss_total": 0.32302552461624146, "step": 1457 }, { "batch_size": 4, "epoch": 0.5828, "step": 1457, "tokens_per_device": 16024 }, { "epoch": 0.5828, "loss_ce": 0.07045040279626846, "loss_lvr": 0.6770864725112915, "loss_mode_switch": 0.0, "loss_total": 0.13815905153751373, "step": 1457 }, { "epoch": 0.5832, "grad_norm": 1.3284329175949097, "learning_rate": 3.90454317521641e-06, "loss": 0.303, "step": 1458 }, { "batch_size": 4, "epoch": 0.5832, "step": 1458, "tokens_per_device": 4388 }, { "epoch": 0.5832, "loss_ce": 0.0671176165342331, "loss_lvr": 0.8025632500648499, "loss_mode_switch": 0.0, "loss_total": 0.14737394452095032, "step": 1458 }, { "batch_size": 4, "epoch": 0.5832, "step": 1458, "tokens_per_device": 2920 }, { "epoch": 0.5832, "loss_ce": 0.10908997058868408, "loss_lvr": 0.8364933729171753, "loss_mode_switch": 0.0, "loss_total": 0.1927393078804016, "step": 1458 }, { "batch_size": 4, "epoch": 0.5832, "step": 1458, "tokens_per_device": 4908 }, { "epoch": 0.5832, "loss_ce": 0.023276537656784058, "loss_lvr": 0.7483760118484497, "loss_mode_switch": 0.0, "loss_total": 0.09811414033174515, "step": 1458 }, { "batch_size": 4, "epoch": 0.5832, "step": 1458, "tokens_per_device": 12084 }, { "epoch": 0.5832, "loss_ce": 0.3978080749511719, "loss_lvr": 0.47281450033187866, "loss_mode_switch": 0.0, "loss_total": 0.44508951902389526, "step": 1458 }, { "batch_size": 4, "epoch": 0.5832, "step": 1458, "tokens_per_device": 4236 }, { "epoch": 0.5832, "loss_ce": 0.012925270944833755, "loss_lvr": 0.7865588068962097, "loss_mode_switch": 0.0, "loss_total": 0.09158115088939667, "step": 1458 }, { "batch_size": 4, "epoch": 0.5832, "step": 1458, "tokens_per_device": 4232 }, { "epoch": 0.5832, "loss_ce": 0.3206706643104553, "loss_lvr": 0.8790538311004639, "loss_mode_switch": 0.0, "loss_total": 0.40857604146003723, "step": 1458 }, { "batch_size": 4, "epoch": 0.5832, "step": 1458, "tokens_per_device": 4368 }, { "epoch": 0.5832, "loss_ce": 0.026687944307923317, "loss_lvr": 1.1163227558135986, "loss_mode_switch": 0.0, "loss_total": 0.1383202224969864, "step": 1458 }, { "batch_size": 1, "epoch": 0.5832, "step": 1458, "tokens_per_device": 4962 }, { "epoch": 0.5832, "loss_ce": 0.0007635154179297388, "loss_lvr": 0.3617052435874939, "loss_mode_switch": 0.0, "loss_total": 0.03693404048681259, "step": 1458 }, { "epoch": 0.5836, "grad_norm": 1.3065165281295776, "learning_rate": 3.8982239609718965e-06, "loss": 0.2904, "step": 1459 }, { "batch_size": 4, "epoch": 0.5836, "step": 1459, "tokens_per_device": 5876 }, { "epoch": 0.5836, "loss_ce": 0.1259201020002365, "loss_lvr": 0.6334460973739624, "loss_mode_switch": 0.0, "loss_total": 0.189264714717865, "step": 1459 }, { "batch_size": 4, "epoch": 0.5836, "step": 1459, "tokens_per_device": 1332 }, { "epoch": 0.5836, "loss_ce": 0.09297055751085281, "loss_lvr": 1.0988788604736328, "loss_mode_switch": 0.0, "loss_total": 0.20285844802856445, "step": 1459 }, { "batch_size": 1, "epoch": 0.5836, "step": 1459, "tokens_per_device": 4903 }, { "epoch": 0.5836, "loss_ce": 0.030569884926080704, "loss_lvr": 0.26519203186035156, "loss_mode_switch": 0.0, "loss_total": 0.05708909034729004, "step": 1459 }, { "batch_size": 1, "epoch": 0.5836, "step": 1459, "tokens_per_device": 4929 }, { "epoch": 0.5836, "loss_ce": 0.12629354000091553, "loss_lvr": 0.6993247866630554, "loss_mode_switch": 0.0, "loss_total": 0.19622603058815002, "step": 1459 }, { "batch_size": 4, "epoch": 0.5836, "step": 1459, "tokens_per_device": 6532 }, { "epoch": 0.5836, "loss_ce": 0.0885024294257164, "loss_lvr": 0.8777276873588562, "loss_mode_switch": 0.0, "loss_total": 0.17627519369125366, "step": 1459 }, { "batch_size": 4, "epoch": 0.5836, "step": 1459, "tokens_per_device": 4456 }, { "epoch": 0.5836, "loss_ce": 0.1788025200366974, "loss_lvr": 0.7100480198860168, "loss_mode_switch": 0.0, "loss_total": 0.24980732798576355, "step": 1459 }, { "batch_size": 4, "epoch": 0.5836, "step": 1459, "tokens_per_device": 3080 }, { "epoch": 0.5836, "loss_ce": 0.38453763723373413, "loss_lvr": 0.911497175693512, "loss_mode_switch": 0.0, "loss_total": 0.4756873548030853, "step": 1459 }, { "batch_size": 1, "epoch": 0.5836, "step": 1459, "tokens_per_device": 4907 }, { "epoch": 0.5836, "loss_ce": 0.012714866548776627, "loss_lvr": 0.2904738485813141, "loss_mode_switch": 0.0, "loss_total": 0.041762251406908035, "step": 1459 }, { "epoch": 0.584, "grad_norm": 1.3392292261123657, "learning_rate": 3.89190659586623e-06, "loss": 0.29, "step": 1460 }, { "batch_size": 4, "epoch": 0.584, "step": 1460, "tokens_per_device": 4392 }, { "epoch": 0.584, "loss_ce": 0.38051339983940125, "loss_lvr": 0.8175304532051086, "loss_mode_switch": 0.0, "loss_total": 0.4622664451599121, "step": 1460 }, { "batch_size": 4, "epoch": 0.584, "step": 1460, "tokens_per_device": 2704 }, { "epoch": 0.584, "loss_ce": 0.40339499711990356, "loss_lvr": 0.5695081949234009, "loss_mode_switch": 0.0, "loss_total": 0.4603458046913147, "step": 1460 }, { "batch_size": 4, "epoch": 0.584, "step": 1460, "tokens_per_device": 3972 }, { "epoch": 0.584, "loss_ce": 0.08133238554000854, "loss_lvr": 0.8335689306259155, "loss_mode_switch": 0.0, "loss_total": 0.16468927264213562, "step": 1460 }, { "batch_size": 1, "epoch": 0.584, "step": 1460, "tokens_per_device": 5880 }, { "epoch": 0.584, "loss_ce": 0.08080439269542694, "loss_lvr": 0.49127206206321716, "loss_mode_switch": 0.0, "loss_total": 0.12993159890174866, "step": 1460 }, { "batch_size": 4, "epoch": 0.584, "step": 1460, "tokens_per_device": 4216 }, { "epoch": 0.584, "loss_ce": 0.18023806810379028, "loss_lvr": 0.6660826206207275, "loss_mode_switch": 0.0, "loss_total": 0.24684633314609528, "step": 1460 }, { "batch_size": 1, "epoch": 0.584, "step": 1460, "tokens_per_device": 5915 }, { "epoch": 0.584, "loss_ce": 0.0025781551375985146, "loss_lvr": 0.31306374073028564, "loss_mode_switch": 0.0, "loss_total": 0.033884529024362564, "step": 1460 }, { "batch_size": 1, "epoch": 0.584, "step": 1460, "tokens_per_device": 5176 }, { "epoch": 0.584, "loss_ce": 0.016368795186281204, "loss_lvr": 0.3105546534061432, "loss_mode_switch": 0.0, "loss_total": 0.04742426052689552, "step": 1460 }, { "batch_size": 4, "epoch": 0.584, "step": 1460, "tokens_per_device": 7256 }, { "epoch": 0.584, "loss_ce": 0.17057499289512634, "loss_lvr": 0.6459908485412598, "loss_mode_switch": 0.0, "loss_total": 0.23517408967018127, "step": 1460 }, { "epoch": 0.5844, "grad_norm": 1.2698845863342285, "learning_rate": 3.885591090502003e-06, "loss": 0.2908, "step": 1461 }, { "batch_size": 1, "epoch": 0.5844, "step": 1461, "tokens_per_device": 5058 }, { "epoch": 0.5844, "loss_ce": 0.5642644762992859, "loss_lvr": 0.7504584193229675, "loss_mode_switch": 0.0, "loss_total": 0.6393103003501892, "step": 1461 }, { "batch_size": 4, "epoch": 0.5844, "step": 1461, "tokens_per_device": 8444 }, { "epoch": 0.5844, "loss_ce": 0.13895584642887115, "loss_lvr": 0.3605858385562897, "loss_mode_switch": 0.0, "loss_total": 0.1750144362449646, "step": 1461 }, { "batch_size": 1, "epoch": 0.5844, "step": 1461, "tokens_per_device": 5179 }, { "epoch": 0.5844, "loss_ce": 0.04619557410478592, "loss_lvr": 0.3327619433403015, "loss_mode_switch": 0.0, "loss_total": 0.07947176694869995, "step": 1461 }, { "batch_size": 4, "epoch": 0.5844, "step": 1461, "tokens_per_device": 5048 }, { "epoch": 0.5844, "loss_ce": 0.23378688097000122, "loss_lvr": 0.7195587158203125, "loss_mode_switch": 0.0, "loss_total": 0.3057427406311035, "step": 1461 }, { "batch_size": 4, "epoch": 0.5844, "step": 1461, "tokens_per_device": 4708 }, { "epoch": 0.5844, "loss_ce": 0.4021434783935547, "loss_lvr": 0.9325161576271057, "loss_mode_switch": 0.0, "loss_total": 0.49539509415626526, "step": 1461 }, { "batch_size": 4, "epoch": 0.5844, "step": 1461, "tokens_per_device": 2644 }, { "epoch": 0.5844, "loss_ce": 0.32333967089653015, "loss_lvr": 0.9941875338554382, "loss_mode_switch": 0.0, "loss_total": 0.42275843024253845, "step": 1461 }, { "batch_size": 4, "epoch": 0.5844, "step": 1461, "tokens_per_device": 3784 }, { "epoch": 0.5844, "loss_ce": 0.06737619638442993, "loss_lvr": 1.0715049505233765, "loss_mode_switch": 0.0, "loss_total": 0.17452669143676758, "step": 1461 }, { "batch_size": 1, "epoch": 0.5844, "step": 1461, "tokens_per_device": 4894 }, { "epoch": 0.5844, "loss_ce": 0.23130635917186737, "loss_lvr": 0.1730918437242508, "loss_mode_switch": 0.0, "loss_total": 0.2486155480146408, "step": 1461 }, { "epoch": 0.5848, "grad_norm": 1.3368929624557495, "learning_rate": 3.879277455478689e-06, "loss": 0.2542, "step": 1462 }, { "batch_size": 4, "epoch": 0.5848, "step": 1462, "tokens_per_device": 9640 }, { "epoch": 0.5848, "loss_ce": 0.19957122206687927, "loss_lvr": 1.0493704080581665, "loss_mode_switch": 0.0, "loss_total": 0.3045082688331604, "step": 1462 }, { "batch_size": 1, "epoch": 0.5848, "step": 1462, "tokens_per_device": 4964 }, { "epoch": 0.5848, "loss_ce": 0.6016086935997009, "loss_lvr": 0.41920286417007446, "loss_mode_switch": 0.0, "loss_total": 0.6435289978981018, "step": 1462 }, { "batch_size": 4, "epoch": 0.5848, "step": 1462, "tokens_per_device": 5272 }, { "epoch": 0.5848, "loss_ce": 0.32273346185684204, "loss_lvr": 0.8518431782722473, "loss_mode_switch": 0.0, "loss_total": 0.4079177975654602, "step": 1462 }, { "batch_size": 4, "epoch": 0.5848, "step": 1462, "tokens_per_device": 7664 }, { "epoch": 0.5848, "loss_ce": 0.11424846947193146, "loss_lvr": 0.3204364478588104, "loss_mode_switch": 0.0, "loss_total": 0.14629212021827698, "step": 1462 }, { "batch_size": 4, "epoch": 0.5848, "step": 1462, "tokens_per_device": 5508 }, { "epoch": 0.5848, "loss_ce": 0.5455659627914429, "loss_lvr": 0.7363251447677612, "loss_mode_switch": 0.0, "loss_total": 0.6191985011100769, "step": 1462 }, { "batch_size": 4, "epoch": 0.5848, "step": 1462, "tokens_per_device": 4212 }, { "epoch": 0.5848, "loss_ce": 0.17927637696266174, "loss_lvr": 0.6101569533348083, "loss_mode_switch": 0.0, "loss_total": 0.24029207229614258, "step": 1462 }, { "batch_size": 4, "epoch": 0.5848, "step": 1462, "tokens_per_device": 5564 }, { "epoch": 0.5848, "loss_ce": 0.5235550999641418, "loss_lvr": 0.9859810471534729, "loss_mode_switch": 0.0, "loss_total": 0.6221532225608826, "step": 1462 }, { "batch_size": 4, "epoch": 0.5848, "step": 1462, "tokens_per_device": 10856 }, { "epoch": 0.5848, "loss_ce": 0.1154099777340889, "loss_lvr": 0.6563891172409058, "loss_mode_switch": 0.0, "loss_total": 0.1810488998889923, "step": 1462 }, { "epoch": 0.5852, "grad_norm": 1.3540109395980835, "learning_rate": 3.872965701392626e-06, "loss": 0.3079, "step": 1463 }, { "batch_size": 4, "epoch": 0.5852, "step": 1463, "tokens_per_device": 2632 }, { "epoch": 0.5852, "loss_ce": 0.12567685544490814, "loss_lvr": 0.8339725732803345, "loss_mode_switch": 0.0, "loss_total": 0.20907410979270935, "step": 1463 }, { "batch_size": 4, "epoch": 0.5852, "step": 1463, "tokens_per_device": 4252 }, { "epoch": 0.5852, "loss_ce": 0.17273929715156555, "loss_lvr": 0.7687715888023376, "loss_mode_switch": 0.0, "loss_total": 0.24961645901203156, "step": 1463 }, { "batch_size": 4, "epoch": 0.5852, "step": 1463, "tokens_per_device": 3756 }, { "epoch": 0.5852, "loss_ce": 0.2268456220626831, "loss_lvr": 0.8124712705612183, "loss_mode_switch": 0.0, "loss_total": 0.30809274315834045, "step": 1463 }, { "batch_size": 4, "epoch": 0.5852, "step": 1463, "tokens_per_device": 4516 }, { "epoch": 0.5852, "loss_ce": 0.3810156285762787, "loss_lvr": 0.6686863303184509, "loss_mode_switch": 0.0, "loss_total": 0.4478842616081238, "step": 1463 }, { "batch_size": 1, "epoch": 0.5852, "step": 1463, "tokens_per_device": 4903 }, { "epoch": 0.5852, "loss_ce": 0.36892029643058777, "loss_lvr": 0.44784581661224365, "loss_mode_switch": 0.0, "loss_total": 0.41370487213134766, "step": 1463 }, { "batch_size": 4, "epoch": 0.5852, "step": 1463, "tokens_per_device": 1312 }, { "epoch": 0.5852, "loss_ce": 0.40230315923690796, "loss_lvr": 1.0415407419204712, "loss_mode_switch": 0.0, "loss_total": 0.5064572095870972, "step": 1463 }, { "batch_size": 1, "epoch": 0.5852, "step": 1463, "tokens_per_device": 5682 }, { "epoch": 0.5852, "loss_ce": 0.00018365612777415663, "loss_lvr": 0.31499752402305603, "loss_mode_switch": 0.0, "loss_total": 0.031683411449193954, "step": 1463 }, { "batch_size": 1, "epoch": 0.5852, "step": 1463, "tokens_per_device": 4883 }, { "epoch": 0.5852, "loss_ce": 0.011412571184337139, "loss_lvr": 0.27266910672187805, "loss_mode_switch": 0.0, "loss_total": 0.03867948055267334, "step": 1463 }, { "epoch": 0.5856, "grad_norm": 1.2314233779907227, "learning_rate": 3.8666558388369895e-06, "loss": 0.2703, "step": 1464 }, { "batch_size": 1, "epoch": 0.5856, "step": 1464, "tokens_per_device": 4832 }, { "epoch": 0.5856, "loss_ce": 0.010257672518491745, "loss_lvr": 0.3602563142776489, "loss_mode_switch": 0.0, "loss_total": 0.0462833046913147, "step": 1464 }, { "batch_size": 4, "epoch": 0.5856, "step": 1464, "tokens_per_device": 1676 }, { "epoch": 0.5856, "loss_ce": 0.40856772661209106, "loss_lvr": 0.924371600151062, "loss_mode_switch": 0.0, "loss_total": 0.5010048747062683, "step": 1464 }, { "batch_size": 1, "epoch": 0.5856, "step": 1464, "tokens_per_device": 6636 }, { "epoch": 0.5856, "loss_ce": 0.0008160659926943481, "loss_lvr": 0.2702351212501526, "loss_mode_switch": 0.0, "loss_total": 0.027839578688144684, "step": 1464 }, { "batch_size": 1, "epoch": 0.5856, "step": 1464, "tokens_per_device": 4995 }, { "epoch": 0.5856, "loss_ce": 0.03086346574127674, "loss_lvr": 0.46791163086891174, "loss_mode_switch": 0.0, "loss_total": 0.077654629945755, "step": 1464 }, { "batch_size": 4, "epoch": 0.5856, "step": 1464, "tokens_per_device": 6224 }, { "epoch": 0.5856, "loss_ce": 0.533054769039154, "loss_lvr": 0.6269150972366333, "loss_mode_switch": 0.0, "loss_total": 0.5957462787628174, "step": 1464 }, { "batch_size": 4, "epoch": 0.5856, "step": 1464, "tokens_per_device": 5700 }, { "epoch": 0.5856, "loss_ce": 0.3080785572528839, "loss_lvr": 0.9040468335151672, "loss_mode_switch": 0.0, "loss_total": 0.3984832465648651, "step": 1464 }, { "batch_size": 1, "epoch": 0.5856, "step": 1464, "tokens_per_device": 5002 }, { "epoch": 0.5856, "loss_ce": 0.38775137066841125, "loss_lvr": 0.3611670434474945, "loss_mode_switch": 0.0, "loss_total": 0.4238680601119995, "step": 1464 }, { "batch_size": 4, "epoch": 0.5856, "step": 1464, "tokens_per_device": 6292 }, { "epoch": 0.5856, "loss_ce": 0.6203551888465881, "loss_lvr": 0.8732345700263977, "loss_mode_switch": 0.0, "loss_total": 0.7076786756515503, "step": 1464 }, { "epoch": 0.586, "grad_norm": 1.2723661661148071, "learning_rate": 3.8603478784017845e-06, "loss": 0.296, "step": 1465 }, { "batch_size": 4, "epoch": 0.586, "step": 1465, "tokens_per_device": 4184 }, { "epoch": 0.586, "loss_ce": 0.17635111510753632, "loss_lvr": 0.7495140433311462, "loss_mode_switch": 0.0, "loss_total": 0.2513025104999542, "step": 1465 }, { "batch_size": 1, "epoch": 0.586, "step": 1465, "tokens_per_device": 5089 }, { "epoch": 0.586, "loss_ce": 0.03155633062124252, "loss_lvr": 0.3952762484550476, "loss_mode_switch": 0.0, "loss_total": 0.07108395546674728, "step": 1465 }, { "batch_size": 4, "epoch": 0.586, "step": 1465, "tokens_per_device": 4364 }, { "epoch": 0.586, "loss_ce": 0.2969161570072174, "loss_lvr": 1.4100888967514038, "loss_mode_switch": 0.0, "loss_total": 0.4379250407218933, "step": 1465 }, { "batch_size": 4, "epoch": 0.586, "step": 1465, "tokens_per_device": 2988 }, { "epoch": 0.586, "loss_ce": 0.15814581513404846, "loss_lvr": 0.8276419043540955, "loss_mode_switch": 0.0, "loss_total": 0.24091000854969025, "step": 1465 }, { "batch_size": 4, "epoch": 0.586, "step": 1465, "tokens_per_device": 4600 }, { "epoch": 0.586, "loss_ce": 0.15662865340709686, "loss_lvr": 0.8181638121604919, "loss_mode_switch": 0.0, "loss_total": 0.23844504356384277, "step": 1465 }, { "batch_size": 1, "epoch": 0.586, "step": 1465, "tokens_per_device": 4890 }, { "epoch": 0.586, "loss_ce": 0.10377229750156403, "loss_lvr": 0.26141157746315, "loss_mode_switch": 0.0, "loss_total": 0.12991344928741455, "step": 1465 }, { "batch_size": 4, "epoch": 0.586, "step": 1465, "tokens_per_device": 2192 }, { "epoch": 0.586, "loss_ce": 0.11992968618869781, "loss_lvr": 0.8708084225654602, "loss_mode_switch": 0.0, "loss_total": 0.20701053738594055, "step": 1465 }, { "batch_size": 4, "epoch": 0.586, "step": 1465, "tokens_per_device": 12196 }, { "epoch": 0.586, "loss_ce": 0.0822567492723465, "loss_lvr": 1.0254729986190796, "loss_mode_switch": 0.0, "loss_total": 0.1848040521144867, "step": 1465 }, { "epoch": 0.5864, "grad_norm": 1.3009804487228394, "learning_rate": 3.854041830673818e-06, "loss": 0.2785, "step": 1466 }, { "batch_size": 4, "epoch": 0.5864, "step": 1466, "tokens_per_device": 3904 }, { "epoch": 0.5864, "loss_ce": 0.15454240143299103, "loss_lvr": 0.6133190393447876, "loss_mode_switch": 0.0, "loss_total": 0.2158743143081665, "step": 1466 }, { "batch_size": 4, "epoch": 0.5864, "step": 1466, "tokens_per_device": 1536 }, { "epoch": 0.5864, "loss_ce": 0.1773434728384018, "loss_lvr": 1.0463091135025024, "loss_mode_switch": 0.0, "loss_total": 0.2819743752479553, "step": 1466 }, { "batch_size": 4, "epoch": 0.5864, "step": 1466, "tokens_per_device": 6160 }, { "epoch": 0.5864, "loss_ce": 0.09969860315322876, "loss_lvr": 0.3704879581928253, "loss_mode_switch": 0.0, "loss_total": 0.13674740493297577, "step": 1466 }, { "batch_size": 4, "epoch": 0.5864, "step": 1466, "tokens_per_device": 4244 }, { "epoch": 0.5864, "loss_ce": 0.2375904619693756, "loss_lvr": 0.8897601366043091, "loss_mode_switch": 0.0, "loss_total": 0.3265664875507355, "step": 1466 }, { "batch_size": 4, "epoch": 0.5864, "step": 1466, "tokens_per_device": 5572 }, { "epoch": 0.5864, "loss_ce": 0.020032979547977448, "loss_lvr": 0.8342959880828857, "loss_mode_switch": 0.0, "loss_total": 0.1034625768661499, "step": 1466 }, { "batch_size": 4, "epoch": 0.5864, "step": 1466, "tokens_per_device": 3892 }, { "epoch": 0.5864, "loss_ce": 0.48329851031303406, "loss_lvr": 0.9833016991615295, "loss_mode_switch": 0.0, "loss_total": 0.581628680229187, "step": 1466 }, { "batch_size": 4, "epoch": 0.5864, "step": 1466, "tokens_per_device": 4480 }, { "epoch": 0.5864, "loss_ce": 0.4251839518547058, "loss_lvr": 0.7990426421165466, "loss_mode_switch": 0.0, "loss_total": 0.505088210105896, "step": 1466 }, { "batch_size": 1, "epoch": 0.5864, "step": 1466, "tokens_per_device": 5202 }, { "epoch": 0.5864, "loss_ce": 0.03568178787827492, "loss_lvr": 0.2590997517108917, "loss_mode_switch": 0.0, "loss_total": 0.06159176304936409, "step": 1466 }, { "epoch": 0.5868, "grad_norm": 1.3830153942108154, "learning_rate": 3.847737706236696e-06, "loss": 0.3109, "step": 1467 }, { "batch_size": 4, "epoch": 0.5868, "step": 1467, "tokens_per_device": 4440 }, { "epoch": 0.5868, "loss_ce": 0.6158161163330078, "loss_lvr": 0.8523901700973511, "loss_mode_switch": 0.0, "loss_total": 0.701055109500885, "step": 1467 }, { "batch_size": 4, "epoch": 0.5868, "step": 1467, "tokens_per_device": 11896 }, { "epoch": 0.5868, "loss_ce": 0.07758674025535583, "loss_lvr": 0.687591552734375, "loss_mode_switch": 0.0, "loss_total": 0.14634589850902557, "step": 1467 }, { "batch_size": 4, "epoch": 0.5868, "step": 1467, "tokens_per_device": 5232 }, { "epoch": 0.5868, "loss_ce": 0.13547676801681519, "loss_lvr": 0.7889183759689331, "loss_mode_switch": 0.0, "loss_total": 0.21436861157417297, "step": 1467 }, { "batch_size": 1, "epoch": 0.5868, "step": 1467, "tokens_per_device": 4856 }, { "epoch": 0.5868, "loss_ce": 0.04367136210203171, "loss_lvr": 0.20306338369846344, "loss_mode_switch": 0.0, "loss_total": 0.06397770345211029, "step": 1467 }, { "batch_size": 4, "epoch": 0.5868, "step": 1467, "tokens_per_device": 3980 }, { "epoch": 0.5868, "loss_ce": 0.22768501937389374, "loss_lvr": 0.9066659212112427, "loss_mode_switch": 0.0, "loss_total": 0.3183516263961792, "step": 1467 }, { "batch_size": 4, "epoch": 0.5868, "step": 1467, "tokens_per_device": 4384 }, { "epoch": 0.5868, "loss_ce": 0.08328582346439362, "loss_lvr": 0.8890379071235657, "loss_mode_switch": 0.0, "loss_total": 0.1721896231174469, "step": 1467 }, { "batch_size": 4, "epoch": 0.5868, "step": 1467, "tokens_per_device": 4320 }, { "epoch": 0.5868, "loss_ce": 0.6205306649208069, "loss_lvr": 2.2425639629364014, "loss_mode_switch": 0.0, "loss_total": 0.844787061214447, "step": 1467 }, { "batch_size": 4, "epoch": 0.5868, "step": 1467, "tokens_per_device": 1552 }, { "epoch": 0.5868, "loss_ce": 0.4853332042694092, "loss_lvr": 0.9095214605331421, "loss_mode_switch": 0.0, "loss_total": 0.5762853622436523, "step": 1467 }, { "epoch": 0.5872, "grad_norm": 1.2639280557632446, "learning_rate": 3.8414355156707895e-06, "loss": 0.3035, "step": 1468 }, { "batch_size": 1, "epoch": 0.5872, "step": 1468, "tokens_per_device": 5009 }, { "epoch": 0.5872, "loss_ce": 0.04329034686088562, "loss_lvr": 0.5428000092506409, "loss_mode_switch": 0.0, "loss_total": 0.09757034480571747, "step": 1468 }, { "batch_size": 4, "epoch": 0.5872, "step": 1468, "tokens_per_device": 4220 }, { "epoch": 0.5872, "loss_ce": 0.10193747282028198, "loss_lvr": 0.8582562208175659, "loss_mode_switch": 0.0, "loss_total": 0.18776309490203857, "step": 1468 }, { "batch_size": 4, "epoch": 0.5872, "step": 1468, "tokens_per_device": 4612 }, { "epoch": 0.5872, "loss_ce": 0.3311240077018738, "loss_lvr": 0.9065399169921875, "loss_mode_switch": 0.0, "loss_total": 0.42177799344062805, "step": 1468 }, { "batch_size": 4, "epoch": 0.5872, "step": 1468, "tokens_per_device": 4952 }, { "epoch": 0.5872, "loss_ce": 0.3412728011608124, "loss_lvr": 0.799944281578064, "loss_mode_switch": 0.0, "loss_total": 0.42126724123954773, "step": 1468 }, { "batch_size": 4, "epoch": 0.5872, "step": 1468, "tokens_per_device": 6068 }, { "epoch": 0.5872, "loss_ce": 0.0016797761199995875, "loss_lvr": 1.0212093591690063, "loss_mode_switch": 0.0, "loss_total": 0.1038007140159607, "step": 1468 }, { "batch_size": 4, "epoch": 0.5872, "step": 1468, "tokens_per_device": 2588 }, { "epoch": 0.5872, "loss_ce": 0.24230997264385223, "loss_lvr": 1.5898741483688354, "loss_mode_switch": 0.0, "loss_total": 0.401297390460968, "step": 1468 }, { "batch_size": 1, "epoch": 0.5872, "step": 1468, "tokens_per_device": 4817 }, { "epoch": 0.5872, "loss_ce": 0.1015559509396553, "loss_lvr": 0.45492780208587646, "loss_mode_switch": 0.0, "loss_total": 0.1470487266778946, "step": 1468 }, { "batch_size": 1, "epoch": 0.5872, "step": 1468, "tokens_per_device": 4963 }, { "epoch": 0.5872, "loss_ce": 0.032984089106321335, "loss_lvr": 0.2527219355106354, "loss_mode_switch": 0.0, "loss_total": 0.05825628340244293, "step": 1468 }, { "epoch": 0.5876, "grad_norm": 1.4688149690628052, "learning_rate": 3.835135269553226e-06, "loss": 0.2696, "step": 1469 }, { "batch_size": 4, "epoch": 0.5876, "step": 1469, "tokens_per_device": 3760 }, { "epoch": 0.5876, "loss_ce": 0.1255888193845749, "loss_lvr": 1.0241053104400635, "loss_mode_switch": 0.0, "loss_total": 0.22799935936927795, "step": 1469 }, { "batch_size": 1, "epoch": 0.5876, "step": 1469, "tokens_per_device": 5180 }, { "epoch": 0.5876, "loss_ce": 0.0002963141305372119, "loss_lvr": 0.520805835723877, "loss_mode_switch": 0.0, "loss_total": 0.052376896142959595, "step": 1469 }, { "batch_size": 1, "epoch": 0.5876, "step": 1469, "tokens_per_device": 7071 }, { "epoch": 0.5876, "loss_ce": 0.0031410555820912123, "loss_lvr": 0.29841163754463196, "loss_mode_switch": 0.0, "loss_total": 0.03298221901059151, "step": 1469 }, { "batch_size": 4, "epoch": 0.5876, "step": 1469, "tokens_per_device": 4664 }, { "epoch": 0.5876, "loss_ce": 0.09860958158969879, "loss_lvr": 0.737942636013031, "loss_mode_switch": 0.0, "loss_total": 0.17240384221076965, "step": 1469 }, { "batch_size": 4, "epoch": 0.5876, "step": 1469, "tokens_per_device": 3920 }, { "epoch": 0.5876, "loss_ce": 0.0029815060552209616, "loss_lvr": 0.5870406031608582, "loss_mode_switch": 0.0, "loss_total": 0.06168556585907936, "step": 1469 }, { "batch_size": 1, "epoch": 0.5876, "step": 1469, "tokens_per_device": 5123 }, { "epoch": 0.5876, "loss_ce": 0.018336357548832893, "loss_lvr": 0.4234037399291992, "loss_mode_switch": 0.0, "loss_total": 0.060676731169223785, "step": 1469 }, { "batch_size": 4, "epoch": 0.5876, "step": 1469, "tokens_per_device": 6232 }, { "epoch": 0.5876, "loss_ce": 0.46205493807792664, "loss_lvr": 0.7285348773002625, "loss_mode_switch": 0.0, "loss_total": 0.5349084138870239, "step": 1469 }, { "batch_size": 1, "epoch": 0.5876, "step": 1469, "tokens_per_device": 5135 }, { "epoch": 0.5876, "loss_ce": 0.03798884525895119, "loss_lvr": 0.23812881112098694, "loss_mode_switch": 0.0, "loss_total": 0.0618017241358757, "step": 1469 }, { "epoch": 0.588, "grad_norm": 1.3771836757659912, "learning_rate": 3.828836978457868e-06, "loss": 0.2753, "step": 1470 }, { "batch_size": 1, "epoch": 0.588, "step": 1470, "tokens_per_device": 4867 }, { "epoch": 0.588, "loss_ce": 0.0021889847703278065, "loss_lvr": 0.2108490914106369, "loss_mode_switch": 0.0, "loss_total": 0.0232738945633173, "step": 1470 }, { "batch_size": 4, "epoch": 0.588, "step": 1470, "tokens_per_device": 4288 }, { "epoch": 0.588, "loss_ce": 0.3286346197128296, "loss_lvr": 0.598357617855072, "loss_mode_switch": 0.0, "loss_total": 0.3884703814983368, "step": 1470 }, { "batch_size": 4, "epoch": 0.588, "step": 1470, "tokens_per_device": 3900 }, { "epoch": 0.588, "loss_ce": 0.4113118052482605, "loss_lvr": 1.0163261890411377, "loss_mode_switch": 0.0, "loss_total": 0.5129444003105164, "step": 1470 }, { "batch_size": 1, "epoch": 0.588, "step": 1470, "tokens_per_device": 7971 }, { "epoch": 0.588, "loss_ce": 0.1806747317314148, "loss_lvr": 0.3351926803588867, "loss_mode_switch": 0.0, "loss_total": 0.21419399976730347, "step": 1470 }, { "batch_size": 4, "epoch": 0.588, "step": 1470, "tokens_per_device": 5904 }, { "epoch": 0.588, "loss_ce": 0.1021033450961113, "loss_lvr": 0.7103806138038635, "loss_mode_switch": 0.0, "loss_total": 0.17314140498638153, "step": 1470 }, { "batch_size": 4, "epoch": 0.588, "step": 1470, "tokens_per_device": 4212 }, { "epoch": 0.588, "loss_ce": 0.1706574708223343, "loss_lvr": 0.9237542748451233, "loss_mode_switch": 0.0, "loss_total": 0.2630329132080078, "step": 1470 }, { "batch_size": 1, "epoch": 0.588, "step": 1470, "tokens_per_device": 5142 }, { "epoch": 0.588, "loss_ce": 0.012427138164639473, "loss_lvr": 0.22078895568847656, "loss_mode_switch": 0.0, "loss_total": 0.03450603410601616, "step": 1470 }, { "batch_size": 4, "epoch": 0.588, "step": 1470, "tokens_per_device": 3828 }, { "epoch": 0.588, "loss_ce": 0.4155636429786682, "loss_lvr": 0.6423022747039795, "loss_mode_switch": 0.0, "loss_total": 0.47979387640953064, "step": 1470 }, { "epoch": 0.5884, "grad_norm": 1.4462164640426636, "learning_rate": 3.822540652955298e-06, "loss": 0.2974, "step": 1471 }, { "batch_size": 4, "epoch": 0.5884, "step": 1471, "tokens_per_device": 4104 }, { "epoch": 0.5884, "loss_ce": 0.21198488771915436, "loss_lvr": 0.43358123302459717, "loss_mode_switch": 0.0, "loss_total": 0.2553430199623108, "step": 1471 }, { "batch_size": 1, "epoch": 0.5884, "step": 1471, "tokens_per_device": 6143 }, { "epoch": 0.5884, "loss_ce": 0.011183519847691059, "loss_lvr": 0.28461959958076477, "loss_mode_switch": 0.0, "loss_total": 0.0396454818546772, "step": 1471 }, { "batch_size": 1, "epoch": 0.5884, "step": 1471, "tokens_per_device": 4878 }, { "epoch": 0.5884, "loss_ce": 0.02933088131248951, "loss_lvr": 2.714146375656128, "loss_mode_switch": 0.0, "loss_total": 0.30074551701545715, "step": 1471 }, { "batch_size": 4, "epoch": 0.5884, "step": 1471, "tokens_per_device": 3752 }, { "epoch": 0.5884, "loss_ce": 0.5882328152656555, "loss_lvr": 1.1346849203109741, "loss_mode_switch": 0.0, "loss_total": 0.701701283454895, "step": 1471 }, { "batch_size": 1, "epoch": 0.5884, "step": 1471, "tokens_per_device": 4866 }, { "epoch": 0.5884, "loss_ce": 0.00044574736966751516, "loss_lvr": 0.4557844400405884, "loss_mode_switch": 0.0, "loss_total": 0.04602419212460518, "step": 1471 }, { "batch_size": 1, "epoch": 0.5884, "step": 1471, "tokens_per_device": 4749 }, { "epoch": 0.5884, "loss_ce": 0.0031065333168953657, "loss_lvr": 0.42907899618148804, "loss_mode_switch": 0.0, "loss_total": 0.04601443558931351, "step": 1471 }, { "batch_size": 4, "epoch": 0.5884, "step": 1471, "tokens_per_device": 3788 }, { "epoch": 0.5884, "loss_ce": 0.35617637634277344, "loss_lvr": 0.9542725682258606, "loss_mode_switch": 0.0, "loss_total": 0.45160365104675293, "step": 1471 }, { "batch_size": 4, "epoch": 0.5884, "step": 1471, "tokens_per_device": 4344 }, { "epoch": 0.5884, "loss_ce": 0.04915805906057358, "loss_lvr": 0.7261040210723877, "loss_mode_switch": 0.0, "loss_total": 0.12176845967769623, "step": 1471 }, { "epoch": 0.5888, "grad_norm": 1.1086108684539795, "learning_rate": 3.816246303612802e-06, "loss": 0.2922, "step": 1472 }, { "batch_size": 4, "epoch": 0.5888, "step": 1472, "tokens_per_device": 4032 }, { "epoch": 0.5888, "loss_ce": 0.33883461356163025, "loss_lvr": 1.482879638671875, "loss_mode_switch": 0.0, "loss_total": 0.4871225953102112, "step": 1472 }, { "batch_size": 4, "epoch": 0.5888, "step": 1472, "tokens_per_device": 4356 }, { "epoch": 0.5888, "loss_ce": 0.029451757669448853, "loss_lvr": 1.0242557525634766, "loss_mode_switch": 0.0, "loss_total": 0.1318773329257965, "step": 1472 }, { "batch_size": 4, "epoch": 0.5888, "step": 1472, "tokens_per_device": 4252 }, { "epoch": 0.5888, "loss_ce": 0.3754882216453552, "loss_lvr": 0.7171424627304077, "loss_mode_switch": 0.0, "loss_total": 0.4472024738788605, "step": 1472 }, { "batch_size": 4, "epoch": 0.5888, "step": 1472, "tokens_per_device": 4688 }, { "epoch": 0.5888, "loss_ce": 0.05491410940885544, "loss_lvr": 0.8377703428268433, "loss_mode_switch": 0.0, "loss_total": 0.13869114220142365, "step": 1472 }, { "batch_size": 4, "epoch": 0.5888, "step": 1472, "tokens_per_device": 1464 }, { "epoch": 0.5888, "loss_ce": 0.11805184930562973, "loss_lvr": 1.0151081085205078, "loss_mode_switch": 0.0, "loss_total": 0.21956266462802887, "step": 1472 }, { "batch_size": 4, "epoch": 0.5888, "step": 1472, "tokens_per_device": 2540 }, { "epoch": 0.5888, "loss_ce": 0.23669826984405518, "loss_lvr": 1.0292798280715942, "loss_mode_switch": 0.0, "loss_total": 0.3396262526512146, "step": 1472 }, { "batch_size": 4, "epoch": 0.5888, "step": 1472, "tokens_per_device": 4804 }, { "epoch": 0.5888, "loss_ce": 0.14587466418743134, "loss_lvr": 1.06875479221344, "loss_mode_switch": 0.0, "loss_total": 0.2527501583099365, "step": 1472 }, { "batch_size": 4, "epoch": 0.5888, "step": 1472, "tokens_per_device": 5880 }, { "epoch": 0.5888, "loss_ce": 0.5079797506332397, "loss_lvr": 0.6698882579803467, "loss_mode_switch": 0.0, "loss_total": 0.5749685764312744, "step": 1472 }, { "epoch": 0.5892, "grad_norm": 1.1653436422348022, "learning_rate": 3.809953940994346e-06, "loss": 0.2664, "step": 1473 }, { "batch_size": 4, "epoch": 0.5892, "step": 1473, "tokens_per_device": 4804 }, { "epoch": 0.5892, "loss_ce": 0.1396356225013733, "loss_lvr": 0.9145408868789673, "loss_mode_switch": 0.0, "loss_total": 0.23108971118927002, "step": 1473 }, { "batch_size": 4, "epoch": 0.5892, "step": 1473, "tokens_per_device": 8028 }, { "epoch": 0.5892, "loss_ce": 0.014194166287779808, "loss_lvr": 0.8009899854660034, "loss_mode_switch": 0.0, "loss_total": 0.09429316967725754, "step": 1473 }, { "batch_size": 1, "epoch": 0.5892, "step": 1473, "tokens_per_device": 5172 }, { "epoch": 0.5892, "loss_ce": 0.011247965507209301, "loss_lvr": 0.5276383757591248, "loss_mode_switch": 0.0, "loss_total": 0.06401180475950241, "step": 1473 }, { "batch_size": 4, "epoch": 0.5892, "step": 1473, "tokens_per_device": 4924 }, { "epoch": 0.5892, "loss_ce": 0.2783277630805969, "loss_lvr": 0.8640056252479553, "loss_mode_switch": 0.0, "loss_total": 0.36472833156585693, "step": 1473 }, { "batch_size": 4, "epoch": 0.5892, "step": 1473, "tokens_per_device": 1668 }, { "epoch": 0.5892, "loss_ce": 0.11286433786153793, "loss_lvr": 1.8160287141799927, "loss_mode_switch": 0.0, "loss_total": 0.2944672107696533, "step": 1473 }, { "batch_size": 1, "epoch": 0.5892, "step": 1473, "tokens_per_device": 5109 }, { "epoch": 0.5892, "loss_ce": 0.001713020377792418, "loss_lvr": 0.27156347036361694, "loss_mode_switch": 0.0, "loss_total": 0.028869368135929108, "step": 1473 }, { "batch_size": 1, "epoch": 0.5892, "step": 1473, "tokens_per_device": 5990 }, { "epoch": 0.5892, "loss_ce": 0.1736065149307251, "loss_lvr": 0.3425326347351074, "loss_mode_switch": 0.0, "loss_total": 0.20785978436470032, "step": 1473 }, { "batch_size": 4, "epoch": 0.5892, "step": 1473, "tokens_per_device": 7680 }, { "epoch": 0.5892, "loss_ce": 0.3477655053138733, "loss_lvr": 1.079246163368225, "loss_mode_switch": 0.0, "loss_total": 0.4556901156902313, "step": 1473 }, { "epoch": 0.5896, "grad_norm": 1.1211012601852417, "learning_rate": 3.8036635756605635e-06, "loss": 0.2658, "step": 1474 }, { "batch_size": 4, "epoch": 0.5896, "step": 1474, "tokens_per_device": 4264 }, { "epoch": 0.5896, "loss_ce": 0.0324849933385849, "loss_lvr": 0.7692169547080994, "loss_mode_switch": 0.0, "loss_total": 0.10940668731927872, "step": 1474 }, { "batch_size": 4, "epoch": 0.5896, "step": 1474, "tokens_per_device": 5492 }, { "epoch": 0.5896, "loss_ce": 0.05906546488404274, "loss_lvr": 0.8512193560600281, "loss_mode_switch": 0.0, "loss_total": 0.14418740570545197, "step": 1474 }, { "batch_size": 4, "epoch": 0.5896, "step": 1474, "tokens_per_device": 4236 }, { "epoch": 0.5896, "loss_ce": 0.11486129462718964, "loss_lvr": 0.7080679535865784, "loss_mode_switch": 0.0, "loss_total": 0.18566808104515076, "step": 1474 }, { "batch_size": 4, "epoch": 0.5896, "step": 1474, "tokens_per_device": 4292 }, { "epoch": 0.5896, "loss_ce": 0.12107579410076141, "loss_lvr": 0.6348666548728943, "loss_mode_switch": 0.0, "loss_total": 0.18456245958805084, "step": 1474 }, { "batch_size": 4, "epoch": 0.5896, "step": 1474, "tokens_per_device": 13188 }, { "epoch": 0.5896, "loss_ce": 0.018287327140569687, "loss_lvr": 0.5356326103210449, "loss_mode_switch": 0.0, "loss_total": 0.07185059040784836, "step": 1474 }, { "batch_size": 4, "epoch": 0.5896, "step": 1474, "tokens_per_device": 4304 }, { "epoch": 0.5896, "loss_ce": 0.2318376749753952, "loss_lvr": 1.0625717639923096, "loss_mode_switch": 0.0, "loss_total": 0.3380948603153229, "step": 1474 }, { "batch_size": 4, "epoch": 0.5896, "step": 1474, "tokens_per_device": 1484 }, { "epoch": 0.5896, "loss_ce": 0.07546014338731766, "loss_lvr": 0.8840566277503967, "loss_mode_switch": 0.0, "loss_total": 0.1638658046722412, "step": 1474 }, { "batch_size": 1, "epoch": 0.5896, "step": 1474, "tokens_per_device": 5079 }, { "epoch": 0.5896, "loss_ce": 0.002047202317044139, "loss_lvr": 0.3382960855960846, "loss_mode_switch": 0.0, "loss_total": 0.035876814275979996, "step": 1474 }, { "epoch": 0.59, "grad_norm": 1.14730966091156, "learning_rate": 3.7973752181687336e-06, "loss": 0.2635, "step": 1475 }, { "batch_size": 1, "epoch": 0.59, "step": 1475, "tokens_per_device": 4961 }, { "epoch": 0.59, "loss_ce": 0.01640188880264759, "loss_lvr": 0.31941547989845276, "loss_mode_switch": 0.0, "loss_total": 0.04834343492984772, "step": 1475 }, { "batch_size": 4, "epoch": 0.59, "step": 1475, "tokens_per_device": 3928 }, { "epoch": 0.59, "loss_ce": 0.3490307033061981, "loss_lvr": 0.8891341686248779, "loss_mode_switch": 0.0, "loss_total": 0.43794411420822144, "step": 1475 }, { "batch_size": 4, "epoch": 0.59, "step": 1475, "tokens_per_device": 4444 }, { "epoch": 0.59, "loss_ce": 0.328443318605423, "loss_lvr": 0.7759671807289124, "loss_mode_switch": 0.0, "loss_total": 0.4060400426387787, "step": 1475 }, { "batch_size": 4, "epoch": 0.59, "step": 1475, "tokens_per_device": 1528 }, { "epoch": 0.59, "loss_ce": 0.7075269222259521, "loss_lvr": 0.9622790217399597, "loss_mode_switch": 0.0, "loss_total": 0.8037548065185547, "step": 1475 }, { "batch_size": 4, "epoch": 0.59, "step": 1475, "tokens_per_device": 4284 }, { "epoch": 0.59, "loss_ce": 0.0766986608505249, "loss_lvr": 0.5447654724121094, "loss_mode_switch": 0.0, "loss_total": 0.1311752051115036, "step": 1475 }, { "batch_size": 4, "epoch": 0.59, "step": 1475, "tokens_per_device": 5456 }, { "epoch": 0.59, "loss_ce": 0.24248521029949188, "loss_lvr": 1.0521074533462524, "loss_mode_switch": 0.0, "loss_total": 0.3476959466934204, "step": 1475 }, { "batch_size": 4, "epoch": 0.59, "step": 1475, "tokens_per_device": 2720 }, { "epoch": 0.59, "loss_ce": 0.39658123254776, "loss_lvr": 0.3666824400424957, "loss_mode_switch": 0.0, "loss_total": 0.43324947357177734, "step": 1475 }, { "batch_size": 4, "epoch": 0.59, "step": 1475, "tokens_per_device": 3908 }, { "epoch": 0.59, "loss_ce": 0.4243282377719879, "loss_lvr": 0.8607175350189209, "loss_mode_switch": 0.0, "loss_total": 0.5103999972343445, "step": 1475 }, { "epoch": 0.5904, "grad_norm": 1.3387632369995117, "learning_rate": 3.791088879072766e-06, "loss": 0.26, "step": 1476 }, { "batch_size": 4, "epoch": 0.5904, "step": 1476, "tokens_per_device": 4060 }, { "epoch": 0.5904, "loss_ce": 0.06090737134218216, "loss_lvr": 0.8144887685775757, "loss_mode_switch": 0.0, "loss_total": 0.1423562467098236, "step": 1476 }, { "batch_size": 4, "epoch": 0.5904, "step": 1476, "tokens_per_device": 3796 }, { "epoch": 0.5904, "loss_ce": 0.11090469360351562, "loss_lvr": 0.7035940289497375, "loss_mode_switch": 0.0, "loss_total": 0.18126410245895386, "step": 1476 }, { "batch_size": 4, "epoch": 0.5904, "step": 1476, "tokens_per_device": 3824 }, { "epoch": 0.5904, "loss_ce": 0.08814729005098343, "loss_lvr": 0.5366415977478027, "loss_mode_switch": 0.0, "loss_total": 0.14181144535541534, "step": 1476 }, { "batch_size": 4, "epoch": 0.5904, "step": 1476, "tokens_per_device": 4244 }, { "epoch": 0.5904, "loss_ce": 0.09239661693572998, "loss_lvr": 0.7693544030189514, "loss_mode_switch": 0.0, "loss_total": 0.16933205723762512, "step": 1476 }, { "batch_size": 1, "epoch": 0.5904, "step": 1476, "tokens_per_device": 4970 }, { "epoch": 0.5904, "loss_ce": 0.09226319938898087, "loss_lvr": 0.2614594101905823, "loss_mode_switch": 0.0, "loss_total": 0.11840914189815521, "step": 1476 }, { "batch_size": 4, "epoch": 0.5904, "step": 1476, "tokens_per_device": 4100 }, { "epoch": 0.5904, "loss_ce": 0.5112688541412354, "loss_lvr": 1.4611155986785889, "loss_mode_switch": 0.0, "loss_total": 0.6573804020881653, "step": 1476 }, { "batch_size": 4, "epoch": 0.5904, "step": 1476, "tokens_per_device": 4556 }, { "epoch": 0.5904, "loss_ce": 0.1456824094057083, "loss_lvr": 0.8122824430465698, "loss_mode_switch": 0.0, "loss_total": 0.22691065073013306, "step": 1476 }, { "batch_size": 4, "epoch": 0.5904, "step": 1476, "tokens_per_device": 2812 }, { "epoch": 0.5904, "loss_ce": 0.77773118019104, "loss_lvr": 0.5937411785125732, "loss_mode_switch": 0.0, "loss_total": 0.8371052742004395, "step": 1476 }, { "epoch": 0.5908, "grad_norm": 1.2219464778900146, "learning_rate": 3.784804568923188e-06, "loss": 0.2781, "step": 1477 }, { "batch_size": 4, "epoch": 0.5908, "step": 1477, "tokens_per_device": 4572 }, { "epoch": 0.5908, "loss_ce": 0.1486758589744568, "loss_lvr": 0.6065051555633545, "loss_mode_switch": 0.0, "loss_total": 0.20932637155056, "step": 1477 }, { "batch_size": 4, "epoch": 0.5908, "step": 1477, "tokens_per_device": 4240 }, { "epoch": 0.5908, "loss_ce": 0.6237326860427856, "loss_lvr": 1.1237698793411255, "loss_mode_switch": 0.0, "loss_total": 0.7361096739768982, "step": 1477 }, { "batch_size": 4, "epoch": 0.5908, "step": 1477, "tokens_per_device": 3856 }, { "epoch": 0.5908, "loss_ce": 0.23984579741954803, "loss_lvr": 0.47961536049842834, "loss_mode_switch": 0.0, "loss_total": 0.2878073453903198, "step": 1477 }, { "batch_size": 4, "epoch": 0.5908, "step": 1477, "tokens_per_device": 6296 }, { "epoch": 0.5908, "loss_ce": 0.02545791305601597, "loss_lvr": 0.7780990600585938, "loss_mode_switch": 0.0, "loss_total": 0.10326781868934631, "step": 1477 }, { "batch_size": 4, "epoch": 0.5908, "step": 1477, "tokens_per_device": 3816 }, { "epoch": 0.5908, "loss_ce": 0.14571818709373474, "loss_lvr": 1.2812182903289795, "loss_mode_switch": 0.0, "loss_total": 0.2738400101661682, "step": 1477 }, { "batch_size": 4, "epoch": 0.5908, "step": 1477, "tokens_per_device": 4696 }, { "epoch": 0.5908, "loss_ce": 0.2890225648880005, "loss_lvr": 0.7346780896186829, "loss_mode_switch": 0.0, "loss_total": 0.36249038577079773, "step": 1477 }, { "batch_size": 4, "epoch": 0.5908, "step": 1477, "tokens_per_device": 5096 }, { "epoch": 0.5908, "loss_ce": 0.47445330023765564, "loss_lvr": 0.870878279209137, "loss_mode_switch": 0.0, "loss_total": 0.5615411400794983, "step": 1477 }, { "batch_size": 4, "epoch": 0.5908, "step": 1477, "tokens_per_device": 5608 }, { "epoch": 0.5908, "loss_ce": 0.050279874354600906, "loss_lvr": 0.9478172659873962, "loss_mode_switch": 0.0, "loss_total": 0.14506159722805023, "step": 1477 }, { "epoch": 0.5912, "grad_norm": 1.455647349357605, "learning_rate": 3.778522298267117e-06, "loss": 0.3713, "step": 1478 }, { "batch_size": 1, "epoch": 0.5912, "step": 1478, "tokens_per_device": 4898 }, { "epoch": 0.5912, "loss_ce": 0.14536383748054504, "loss_lvr": 0.3079996705055237, "loss_mode_switch": 0.0, "loss_total": 0.17616380751132965, "step": 1478 }, { "batch_size": 4, "epoch": 0.5912, "step": 1478, "tokens_per_device": 5852 }, { "epoch": 0.5912, "loss_ce": 0.06573467701673508, "loss_lvr": 0.8515055179595947, "loss_mode_switch": 0.0, "loss_total": 0.1508852243423462, "step": 1478 }, { "batch_size": 4, "epoch": 0.5912, "step": 1478, "tokens_per_device": 8908 }, { "epoch": 0.5912, "loss_ce": 0.18435311317443848, "loss_lvr": 0.7807042598724365, "loss_mode_switch": 0.0, "loss_total": 0.2624235451221466, "step": 1478 }, { "batch_size": 1, "epoch": 0.5912, "step": 1478, "tokens_per_device": 5135 }, { "epoch": 0.5912, "loss_ce": 0.08421233296394348, "loss_lvr": 0.3162822723388672, "loss_mode_switch": 0.0, "loss_total": 0.11584056168794632, "step": 1478 }, { "batch_size": 4, "epoch": 0.5912, "step": 1478, "tokens_per_device": 5916 }, { "epoch": 0.5912, "loss_ce": 0.6188920736312866, "loss_lvr": 0.914034903049469, "loss_mode_switch": 0.0, "loss_total": 0.710295557975769, "step": 1478 }, { "batch_size": 4, "epoch": 0.5912, "step": 1478, "tokens_per_device": 4568 }, { "epoch": 0.5912, "loss_ce": 0.05071162059903145, "loss_lvr": 0.7441035509109497, "loss_mode_switch": 0.0, "loss_total": 0.12512198090553284, "step": 1478 }, { "batch_size": 1, "epoch": 0.5912, "step": 1478, "tokens_per_device": 4780 }, { "epoch": 0.5912, "loss_ce": 0.3363170623779297, "loss_lvr": 0.14867845177650452, "loss_mode_switch": 0.0, "loss_total": 0.3511849045753479, "step": 1478 }, { "batch_size": 4, "epoch": 0.5912, "step": 1478, "tokens_per_device": 3792 }, { "epoch": 0.5912, "loss_ce": 0.2505040764808655, "loss_lvr": 0.7295138239860535, "loss_mode_switch": 0.0, "loss_total": 0.32345545291900635, "step": 1478 }, { "epoch": 0.5916, "grad_norm": 1.2810574769973755, "learning_rate": 3.772242077648246e-06, "loss": 0.3199, "step": 1479 }, { "batch_size": 4, "epoch": 0.5916, "step": 1479, "tokens_per_device": 3832 }, { "epoch": 0.5916, "loss_ce": 0.19035962224006653, "loss_lvr": 1.1347798109054565, "loss_mode_switch": 0.0, "loss_total": 0.3038375973701477, "step": 1479 }, { "batch_size": 4, "epoch": 0.5916, "step": 1479, "tokens_per_device": 4008 }, { "epoch": 0.5916, "loss_ce": 0.36803191900253296, "loss_lvr": 0.7426156401634216, "loss_mode_switch": 0.0, "loss_total": 0.4422934949398041, "step": 1479 }, { "batch_size": 4, "epoch": 0.5916, "step": 1479, "tokens_per_device": 1384 }, { "epoch": 0.5916, "loss_ce": 0.7033595442771912, "loss_lvr": 1.0155476331710815, "loss_mode_switch": 0.0, "loss_total": 0.8049142956733704, "step": 1479 }, { "batch_size": 4, "epoch": 0.5916, "step": 1479, "tokens_per_device": 1884 }, { "epoch": 0.5916, "loss_ce": 0.0895426943898201, "loss_lvr": 0.9538469314575195, "loss_mode_switch": 0.0, "loss_total": 0.18492738902568817, "step": 1479 }, { "batch_size": 4, "epoch": 0.5916, "step": 1479, "tokens_per_device": 3604 }, { "epoch": 0.5916, "loss_ce": 0.3240307569503784, "loss_lvr": 0.8456429839134216, "loss_mode_switch": 0.0, "loss_total": 0.4085950553417206, "step": 1479 }, { "batch_size": 4, "epoch": 0.5916, "step": 1479, "tokens_per_device": 1264 }, { "epoch": 0.5916, "loss_ce": 0.7879626750946045, "loss_lvr": 1.0353400707244873, "loss_mode_switch": 0.0, "loss_total": 0.8914966583251953, "step": 1479 }, { "batch_size": 4, "epoch": 0.5916, "step": 1479, "tokens_per_device": 3936 }, { "epoch": 0.5916, "loss_ce": 0.14964909851551056, "loss_lvr": 0.841585099697113, "loss_mode_switch": 0.0, "loss_total": 0.23380760848522186, "step": 1479 }, { "batch_size": 4, "epoch": 0.5916, "step": 1479, "tokens_per_device": 1780 }, { "epoch": 0.5916, "loss_ce": 0.46807539463043213, "loss_lvr": 0.840467631816864, "loss_mode_switch": 0.0, "loss_total": 0.552122175693512, "step": 1479 }, { "epoch": 0.592, "grad_norm": 1.2978017330169678, "learning_rate": 3.7659639176068287e-06, "loss": 0.299, "step": 1480 }, { "batch_size": 4, "epoch": 0.592, "step": 1480, "tokens_per_device": 4172 }, { "epoch": 0.592, "loss_ce": 0.2422645390033722, "loss_lvr": 0.8726804852485657, "loss_mode_switch": 0.0, "loss_total": 0.32953259348869324, "step": 1480 }, { "batch_size": 1, "epoch": 0.592, "step": 1480, "tokens_per_device": 4892 }, { "epoch": 0.592, "loss_ce": 0.04065828397870064, "loss_lvr": 0.5456328392028809, "loss_mode_switch": 0.0, "loss_total": 0.09522156417369843, "step": 1480 }, { "batch_size": 1, "epoch": 0.592, "step": 1480, "tokens_per_device": 5117 }, { "epoch": 0.592, "loss_ce": 0.12944820523262024, "loss_lvr": 0.3318585455417633, "loss_mode_switch": 0.0, "loss_total": 0.16263405978679657, "step": 1480 }, { "batch_size": 1, "epoch": 0.592, "step": 1480, "tokens_per_device": 4892 }, { "epoch": 0.592, "loss_ce": 0.06294557452201843, "loss_lvr": 0.4270031750202179, "loss_mode_switch": 0.0, "loss_total": 0.10564589500427246, "step": 1480 }, { "batch_size": 4, "epoch": 0.592, "step": 1480, "tokens_per_device": 6764 }, { "epoch": 0.592, "loss_ce": 0.18506599962711334, "loss_lvr": 0.7255891561508179, "loss_mode_switch": 0.0, "loss_total": 0.25762492418289185, "step": 1480 }, { "batch_size": 4, "epoch": 0.592, "step": 1480, "tokens_per_device": 1644 }, { "epoch": 0.592, "loss_ce": 0.3254932463169098, "loss_lvr": 0.916619062423706, "loss_mode_switch": 0.0, "loss_total": 0.4171551465988159, "step": 1480 }, { "batch_size": 4, "epoch": 0.592, "step": 1480, "tokens_per_device": 1540 }, { "epoch": 0.592, "loss_ce": 0.26060250401496887, "loss_lvr": 0.9476065039634705, "loss_mode_switch": 0.0, "loss_total": 0.3553631603717804, "step": 1480 }, { "batch_size": 4, "epoch": 0.592, "step": 1480, "tokens_per_device": 13428 }, { "epoch": 0.592, "loss_ce": 0.3590743839740753, "loss_lvr": 0.6360134482383728, "loss_mode_switch": 0.0, "loss_total": 0.4226757287979126, "step": 1480 }, { "epoch": 0.5924, "grad_norm": 1.4534096717834473, "learning_rate": 3.7596878286796657e-06, "loss": 0.3171, "step": 1481 }, { "batch_size": 4, "epoch": 0.5924, "step": 1481, "tokens_per_device": 3916 }, { "epoch": 0.5924, "loss_ce": 0.23754513263702393, "loss_lvr": 0.7392623424530029, "loss_mode_switch": 0.0, "loss_total": 0.3114713728427887, "step": 1481 }, { "batch_size": 4, "epoch": 0.5924, "step": 1481, "tokens_per_device": 4264 }, { "epoch": 0.5924, "loss_ce": 0.5856093168258667, "loss_lvr": 1.3233768939971924, "loss_mode_switch": 0.0, "loss_total": 0.7179470062255859, "step": 1481 }, { "batch_size": 4, "epoch": 0.5924, "step": 1481, "tokens_per_device": 4736 }, { "epoch": 0.5924, "loss_ce": 0.047774311155080795, "loss_lvr": 0.692125678062439, "loss_mode_switch": 0.0, "loss_total": 0.11698688566684723, "step": 1481 }, { "batch_size": 4, "epoch": 0.5924, "step": 1481, "tokens_per_device": 4284 }, { "epoch": 0.5924, "loss_ce": 0.539379358291626, "loss_lvr": 1.0016816854476929, "loss_mode_switch": 0.0, "loss_total": 0.6395475268363953, "step": 1481 }, { "batch_size": 1, "epoch": 0.5924, "step": 1481, "tokens_per_device": 4576 }, { "epoch": 0.5924, "loss_ce": 0.0006785866571590304, "loss_lvr": 0.7192864418029785, "loss_mode_switch": 0.0, "loss_total": 0.07260722666978836, "step": 1481 }, { "batch_size": 4, "epoch": 0.5924, "step": 1481, "tokens_per_device": 4592 }, { "epoch": 0.5924, "loss_ce": 0.7204324007034302, "loss_lvr": 0.8562046885490417, "loss_mode_switch": 0.0, "loss_total": 0.8060528635978699, "step": 1481 }, { "batch_size": 4, "epoch": 0.5924, "step": 1481, "tokens_per_device": 5684 }, { "epoch": 0.5924, "loss_ce": 0.3139917552471161, "loss_lvr": 0.8124988675117493, "loss_mode_switch": 0.0, "loss_total": 0.3952416479587555, "step": 1481 }, { "batch_size": 4, "epoch": 0.5924, "step": 1481, "tokens_per_device": 6588 }, { "epoch": 0.5924, "loss_ce": 0.12744218111038208, "loss_lvr": 0.5585720539093018, "loss_mode_switch": 0.0, "loss_total": 0.18329939246177673, "step": 1481 }, { "epoch": 0.5928, "grad_norm": 1.5168172121047974, "learning_rate": 3.753413821400073e-06, "loss": 0.3599, "step": 1482 }, { "batch_size": 1, "epoch": 0.5928, "step": 1482, "tokens_per_device": 5176 }, { "epoch": 0.5928, "loss_ce": 0.014495317824184895, "loss_lvr": 0.5683957934379578, "loss_mode_switch": 0.0, "loss_total": 0.07133489847183228, "step": 1482 }, { "batch_size": 4, "epoch": 0.5928, "step": 1482, "tokens_per_device": 11408 }, { "epoch": 0.5928, "loss_ce": 0.22742290794849396, "loss_lvr": 0.450529009103775, "loss_mode_switch": 0.0, "loss_total": 0.27247580885887146, "step": 1482 }, { "batch_size": 4, "epoch": 0.5928, "step": 1482, "tokens_per_device": 3908 }, { "epoch": 0.5928, "loss_ce": 0.02069343440234661, "loss_lvr": 0.761454164981842, "loss_mode_switch": 0.0, "loss_total": 0.09683885425329208, "step": 1482 }, { "batch_size": 1, "epoch": 0.5928, "step": 1482, "tokens_per_device": 4988 }, { "epoch": 0.5928, "loss_ce": 0.0023928461596369743, "loss_lvr": 0.44505247473716736, "loss_mode_switch": 0.0, "loss_total": 0.046898096799850464, "step": 1482 }, { "batch_size": 4, "epoch": 0.5928, "step": 1482, "tokens_per_device": 7568 }, { "epoch": 0.5928, "loss_ce": 0.009980159811675549, "loss_lvr": 0.6176643371582031, "loss_mode_switch": 0.0, "loss_total": 0.0717465952038765, "step": 1482 }, { "batch_size": 4, "epoch": 0.5928, "step": 1482, "tokens_per_device": 2800 }, { "epoch": 0.5928, "loss_ce": 0.4754205346107483, "loss_lvr": 0.7354018688201904, "loss_mode_switch": 0.0, "loss_total": 0.5489607453346252, "step": 1482 }, { "batch_size": 4, "epoch": 0.5928, "step": 1482, "tokens_per_device": 4036 }, { "epoch": 0.5928, "loss_ce": 0.07270871847867966, "loss_lvr": 0.6742925643920898, "loss_mode_switch": 0.0, "loss_total": 0.14013797044754028, "step": 1482 }, { "batch_size": 1, "epoch": 0.5928, "step": 1482, "tokens_per_device": 4877 }, { "epoch": 0.5928, "loss_ce": 0.004288656637072563, "loss_lvr": 0.18379122018814087, "loss_mode_switch": 0.0, "loss_total": 0.02266777865588665, "step": 1482 }, { "epoch": 0.5932, "grad_norm": 1.1900781393051147, "learning_rate": 3.74714190629788e-06, "loss": 0.2487, "step": 1483 }, { "batch_size": 4, "epoch": 0.5932, "step": 1483, "tokens_per_device": 7304 }, { "epoch": 0.5932, "loss_ce": 0.0008594297105446458, "loss_lvr": 0.6757096648216248, "loss_mode_switch": 0.0, "loss_total": 0.06843040138483047, "step": 1483 }, { "batch_size": 4, "epoch": 0.5932, "step": 1483, "tokens_per_device": 1380 }, { "epoch": 0.5932, "loss_ce": 0.2848421633243561, "loss_lvr": 1.245244026184082, "loss_mode_switch": 0.0, "loss_total": 0.40936657786369324, "step": 1483 }, { "batch_size": 1, "epoch": 0.5932, "step": 1483, "tokens_per_device": 5100 }, { "epoch": 0.5932, "loss_ce": 0.00033071127836592495, "loss_lvr": 0.3685748875141144, "loss_mode_switch": 0.0, "loss_total": 0.037188202142715454, "step": 1483 }, { "batch_size": 4, "epoch": 0.5932, "step": 1483, "tokens_per_device": 2040 }, { "epoch": 0.5932, "loss_ce": 0.7017431855201721, "loss_lvr": 0.7124804258346558, "loss_mode_switch": 0.0, "loss_total": 0.7729912400245667, "step": 1483 }, { "batch_size": 4, "epoch": 0.5932, "step": 1483, "tokens_per_device": 1536 }, { "epoch": 0.5932, "loss_ce": 0.12872274219989777, "loss_lvr": 0.8826366066932678, "loss_mode_switch": 0.0, "loss_total": 0.21698640286922455, "step": 1483 }, { "batch_size": 4, "epoch": 0.5932, "step": 1483, "tokens_per_device": 3968 }, { "epoch": 0.5932, "loss_ce": 0.7014175057411194, "loss_lvr": 1.0582013130187988, "loss_mode_switch": 0.0, "loss_total": 0.8072376251220703, "step": 1483 }, { "batch_size": 4, "epoch": 0.5932, "step": 1483, "tokens_per_device": 4160 }, { "epoch": 0.5932, "loss_ce": 0.2621878981590271, "loss_lvr": 0.9707871079444885, "loss_mode_switch": 0.0, "loss_total": 0.35926660895347595, "step": 1483 }, { "batch_size": 4, "epoch": 0.5932, "step": 1483, "tokens_per_device": 3832 }, { "epoch": 0.5932, "loss_ce": 0.10399337857961655, "loss_lvr": 0.794857382774353, "loss_mode_switch": 0.0, "loss_total": 0.18347911536693573, "step": 1483 }, { "epoch": 0.5936, "grad_norm": 1.150343894958496, "learning_rate": 3.7408720938994003e-06, "loss": 0.2543, "step": 1484 }, { "batch_size": 4, "epoch": 0.5936, "step": 1484, "tokens_per_device": 1684 }, { "epoch": 0.5936, "loss_ce": 0.12016669660806656, "loss_lvr": 0.6968387365341187, "loss_mode_switch": 0.0, "loss_total": 0.1898505687713623, "step": 1484 }, { "batch_size": 4, "epoch": 0.5936, "step": 1484, "tokens_per_device": 4528 }, { "epoch": 0.5936, "loss_ce": 0.2008672058582306, "loss_lvr": 0.7811447381973267, "loss_mode_switch": 0.0, "loss_total": 0.27898168563842773, "step": 1484 }, { "batch_size": 1, "epoch": 0.5936, "step": 1484, "tokens_per_device": 5066 }, { "epoch": 0.5936, "loss_ce": 0.019452640786767006, "loss_lvr": 1.5852911472320557, "loss_mode_switch": 0.0, "loss_total": 0.17798176407814026, "step": 1484 }, { "batch_size": 1, "epoch": 0.5936, "step": 1484, "tokens_per_device": 5163 }, { "epoch": 0.5936, "loss_ce": 0.0057151103392243385, "loss_lvr": 0.36691245436668396, "loss_mode_switch": 0.0, "loss_total": 0.0424063540995121, "step": 1484 }, { "batch_size": 4, "epoch": 0.5936, "step": 1484, "tokens_per_device": 6584 }, { "epoch": 0.5936, "loss_ce": 0.1349605768918991, "loss_lvr": 0.6361717581748962, "loss_mode_switch": 0.0, "loss_total": 0.19857776165008545, "step": 1484 }, { "batch_size": 1, "epoch": 0.5936, "step": 1484, "tokens_per_device": 5094 }, { "epoch": 0.5936, "loss_ce": 0.011865133419632912, "loss_lvr": 0.39101991057395935, "loss_mode_switch": 0.0, "loss_total": 0.050967127084732056, "step": 1484 }, { "batch_size": 1, "epoch": 0.5936, "step": 1484, "tokens_per_device": 5073 }, { "epoch": 0.5936, "loss_ce": 0.0026951211038976908, "loss_lvr": 0.4324275255203247, "loss_mode_switch": 0.0, "loss_total": 0.04593787342309952, "step": 1484 }, { "batch_size": 4, "epoch": 0.5936, "step": 1484, "tokens_per_device": 4636 }, { "epoch": 0.5936, "loss_ce": 0.18165627121925354, "loss_lvr": 0.5980016589164734, "loss_mode_switch": 0.0, "loss_total": 0.24145643413066864, "step": 1484 }, { "epoch": 0.594, "grad_norm": 1.0850456953048706, "learning_rate": 3.734604394727419e-06, "loss": 0.2172, "step": 1485 }, { "batch_size": 4, "epoch": 0.594, "step": 1485, "tokens_per_device": 5788 }, { "epoch": 0.594, "loss_ce": 0.04298647493124008, "loss_lvr": 0.8896073698997498, "loss_mode_switch": 0.0, "loss_total": 0.13194721937179565, "step": 1485 }, { "batch_size": 1, "epoch": 0.594, "step": 1485, "tokens_per_device": 4893 }, { "epoch": 0.594, "loss_ce": 0.01014716736972332, "loss_lvr": 0.4026015102863312, "loss_mode_switch": 0.0, "loss_total": 0.05040732026100159, "step": 1485 }, { "batch_size": 4, "epoch": 0.594, "step": 1485, "tokens_per_device": 4604 }, { "epoch": 0.594, "loss_ce": 0.05812531337141991, "loss_lvr": 0.7569806575775146, "loss_mode_switch": 0.0, "loss_total": 0.13382337987422943, "step": 1485 }, { "batch_size": 4, "epoch": 0.594, "step": 1485, "tokens_per_device": 3912 }, { "epoch": 0.594, "loss_ce": 0.2392311841249466, "loss_lvr": 0.9508402347564697, "loss_mode_switch": 0.0, "loss_total": 0.3343152105808258, "step": 1485 }, { "batch_size": 1, "epoch": 0.594, "step": 1485, "tokens_per_device": 5151 }, { "epoch": 0.594, "loss_ce": 0.023230819031596184, "loss_lvr": 0.21353940665721893, "loss_mode_switch": 0.0, "loss_total": 0.04458475857973099, "step": 1485 }, { "batch_size": 1, "epoch": 0.594, "step": 1485, "tokens_per_device": 4751 }, { "epoch": 0.594, "loss_ce": 0.0797085091471672, "loss_lvr": 0.6433619856834412, "loss_mode_switch": 0.0, "loss_total": 0.14404471218585968, "step": 1485 }, { "batch_size": 4, "epoch": 0.594, "step": 1485, "tokens_per_device": 4380 }, { "epoch": 0.594, "loss_ce": 0.020258862525224686, "loss_lvr": 0.8369651436805725, "loss_mode_switch": 0.0, "loss_total": 0.10395537316799164, "step": 1485 }, { "batch_size": 1, "epoch": 0.594, "step": 1485, "tokens_per_device": 5469 }, { "epoch": 0.594, "loss_ce": 0.09546346962451935, "loss_lvr": 0.5312566161155701, "loss_mode_switch": 0.0, "loss_total": 0.1485891342163086, "step": 1485 }, { "epoch": 0.5944, "grad_norm": 1.3044495582580566, "learning_rate": 3.7283388193011776e-06, "loss": 0.2956, "step": 1486 }, { "batch_size": 4, "epoch": 0.5944, "step": 1486, "tokens_per_device": 4256 }, { "epoch": 0.5944, "loss_ce": 0.5168511867523193, "loss_lvr": 0.9672455787658691, "loss_mode_switch": 0.0, "loss_total": 0.6135757565498352, "step": 1486 }, { "batch_size": 4, "epoch": 0.5944, "step": 1486, "tokens_per_device": 5700 }, { "epoch": 0.5944, "loss_ce": 0.04257594421505928, "loss_lvr": 1.134379506111145, "loss_mode_switch": 0.0, "loss_total": 0.15601389110088348, "step": 1486 }, { "batch_size": 1, "epoch": 0.5944, "step": 1486, "tokens_per_device": 4918 }, { "epoch": 0.5944, "loss_ce": 0.005713499151170254, "loss_lvr": 0.2343151718378067, "loss_mode_switch": 0.0, "loss_total": 0.0291450172662735, "step": 1486 }, { "batch_size": 4, "epoch": 0.5944, "step": 1486, "tokens_per_device": 2732 }, { "epoch": 0.5944, "loss_ce": 0.11822883039712906, "loss_lvr": 0.8109627962112427, "loss_mode_switch": 0.0, "loss_total": 0.19932511448860168, "step": 1486 }, { "batch_size": 4, "epoch": 0.5944, "step": 1486, "tokens_per_device": 9184 }, { "epoch": 0.5944, "loss_ce": 0.09814541041851044, "loss_lvr": 0.9657562971115112, "loss_mode_switch": 0.0, "loss_total": 0.1947210431098938, "step": 1486 }, { "batch_size": 4, "epoch": 0.5944, "step": 1486, "tokens_per_device": 1340 }, { "epoch": 0.5944, "loss_ce": 0.7377216219902039, "loss_lvr": 1.125787377357483, "loss_mode_switch": 0.0, "loss_total": 0.8503003716468811, "step": 1486 }, { "batch_size": 1, "epoch": 0.5944, "step": 1486, "tokens_per_device": 4778 }, { "epoch": 0.5944, "loss_ce": 0.20809274911880493, "loss_lvr": 0.2935340106487274, "loss_mode_switch": 0.0, "loss_total": 0.2374461442232132, "step": 1486 }, { "batch_size": 4, "epoch": 0.5944, "step": 1486, "tokens_per_device": 1760 }, { "epoch": 0.5944, "loss_ce": 0.20794878900051117, "loss_lvr": 0.9887571334838867, "loss_mode_switch": 0.0, "loss_total": 0.3068245053291321, "step": 1486 }, { "epoch": 0.5948, "grad_norm": 1.3828078508377075, "learning_rate": 3.722075378136352e-06, "loss": 0.3353, "step": 1487 }, { "batch_size": 4, "epoch": 0.5948, "step": 1487, "tokens_per_device": 4388 }, { "epoch": 0.5948, "loss_ce": 0.05902140960097313, "loss_lvr": 0.7504512667655945, "loss_mode_switch": 0.0, "loss_total": 0.13406653702259064, "step": 1487 }, { "batch_size": 4, "epoch": 0.5948, "step": 1487, "tokens_per_device": 1452 }, { "epoch": 0.5948, "loss_ce": 0.39994338154792786, "loss_lvr": 0.8165715932846069, "loss_mode_switch": 0.0, "loss_total": 0.4816005527973175, "step": 1487 }, { "batch_size": 4, "epoch": 0.5948, "step": 1487, "tokens_per_device": 7528 }, { "epoch": 0.5948, "loss_ce": 0.11419022083282471, "loss_lvr": 0.5516631007194519, "loss_mode_switch": 0.0, "loss_total": 0.16935652494430542, "step": 1487 }, { "batch_size": 4, "epoch": 0.5948, "step": 1487, "tokens_per_device": 5256 }, { "epoch": 0.5948, "loss_ce": 0.022213447839021683, "loss_lvr": 0.9021247029304504, "loss_mode_switch": 0.0, "loss_total": 0.11242592334747314, "step": 1487 }, { "batch_size": 1, "epoch": 0.5948, "step": 1487, "tokens_per_device": 4896 }, { "epoch": 0.5948, "loss_ce": 0.06711732596158981, "loss_lvr": 0.4078887403011322, "loss_mode_switch": 0.0, "loss_total": 0.10790619999170303, "step": 1487 }, { "batch_size": 4, "epoch": 0.5948, "step": 1487, "tokens_per_device": 5976 }, { "epoch": 0.5948, "loss_ce": 0.18026776611804962, "loss_lvr": 0.5951525568962097, "loss_mode_switch": 0.0, "loss_total": 0.23978301882743835, "step": 1487 }, { "batch_size": 4, "epoch": 0.5948, "step": 1487, "tokens_per_device": 5796 }, { "epoch": 0.5948, "loss_ce": 0.24731259047985077, "loss_lvr": 0.9188287854194641, "loss_mode_switch": 0.0, "loss_total": 0.33919546008110046, "step": 1487 }, { "batch_size": 4, "epoch": 0.5948, "step": 1487, "tokens_per_device": 4004 }, { "epoch": 0.5948, "loss_ce": 0.09731049090623856, "loss_lvr": 0.7770269513130188, "loss_mode_switch": 0.0, "loss_total": 0.17501318454742432, "step": 1487 }, { "epoch": 0.5952, "grad_norm": 1.6727832555770874, "learning_rate": 3.7158140817450335e-06, "loss": 0.2695, "step": 1488 }, { "batch_size": 1, "epoch": 0.5952, "step": 1488, "tokens_per_device": 5322 }, { "epoch": 0.5952, "loss_ce": 0.04723203927278519, "loss_lvr": 0.441650390625, "loss_mode_switch": 0.0, "loss_total": 0.09139707684516907, "step": 1488 }, { "batch_size": 1, "epoch": 0.5952, "step": 1488, "tokens_per_device": 7710 }, { "epoch": 0.5952, "loss_ce": 0.0007402509218081832, "loss_lvr": 0.2823539972305298, "loss_mode_switch": 0.0, "loss_total": 0.028975650668144226, "step": 1488 }, { "batch_size": 4, "epoch": 0.5952, "step": 1488, "tokens_per_device": 2644 }, { "epoch": 0.5952, "loss_ce": 0.03284134715795517, "loss_lvr": 1.6296581029891968, "loss_mode_switch": 0.0, "loss_total": 0.19580715894699097, "step": 1488 }, { "batch_size": 4, "epoch": 0.5952, "step": 1488, "tokens_per_device": 4428 }, { "epoch": 0.5952, "loss_ce": 0.23494885861873627, "loss_lvr": 0.892659604549408, "loss_mode_switch": 0.0, "loss_total": 0.3242148160934448, "step": 1488 }, { "batch_size": 1, "epoch": 0.5952, "step": 1488, "tokens_per_device": 4514 }, { "epoch": 0.5952, "loss_ce": 0.0024984467308968306, "loss_lvr": 0.2956714630126953, "loss_mode_switch": 0.0, "loss_total": 0.03206559270620346, "step": 1488 }, { "batch_size": 4, "epoch": 0.5952, "step": 1488, "tokens_per_device": 3324 }, { "epoch": 0.5952, "loss_ce": 0.1673387885093689, "loss_lvr": 0.8952530026435852, "loss_mode_switch": 0.0, "loss_total": 0.25686410069465637, "step": 1488 }, { "batch_size": 4, "epoch": 0.5952, "step": 1488, "tokens_per_device": 5368 }, { "epoch": 0.5952, "loss_ce": 0.8299149870872498, "loss_lvr": 0.7740428447723389, "loss_mode_switch": 0.0, "loss_total": 0.9073192477226257, "step": 1488 }, { "batch_size": 4, "epoch": 0.5952, "step": 1488, "tokens_per_device": 2672 }, { "epoch": 0.5952, "loss_ce": 0.40308552980422974, "loss_lvr": 0.7864806652069092, "loss_mode_switch": 0.0, "loss_total": 0.4817335903644562, "step": 1488 }, { "epoch": 0.5956, "grad_norm": 1.128183364868164, "learning_rate": 3.709554940635717e-06, "loss": 0.2504, "step": 1489 }, { "batch_size": 4, "epoch": 0.5956, "step": 1489, "tokens_per_device": 3812 }, { "epoch": 0.5956, "loss_ce": 0.19614240527153015, "loss_lvr": 0.8020904660224915, "loss_mode_switch": 0.0, "loss_total": 0.2763514518737793, "step": 1489 }, { "batch_size": 4, "epoch": 0.5956, "step": 1489, "tokens_per_device": 1764 }, { "epoch": 0.5956, "loss_ce": 0.13679157197475433, "loss_lvr": 0.9324292540550232, "loss_mode_switch": 0.0, "loss_total": 0.2300345003604889, "step": 1489 }, { "batch_size": 4, "epoch": 0.5956, "step": 1489, "tokens_per_device": 1824 }, { "epoch": 0.5956, "loss_ce": 0.2157176434993744, "loss_lvr": 0.8979154229164124, "loss_mode_switch": 0.0, "loss_total": 0.30550917983055115, "step": 1489 }, { "batch_size": 4, "epoch": 0.5956, "step": 1489, "tokens_per_device": 1904 }, { "epoch": 0.5956, "loss_ce": 0.45385345816612244, "loss_lvr": 0.9040709137916565, "loss_mode_switch": 0.0, "loss_total": 0.544260561466217, "step": 1489 }, { "batch_size": 4, "epoch": 0.5956, "step": 1489, "tokens_per_device": 3724 }, { "epoch": 0.5956, "loss_ce": 0.012442891485989094, "loss_lvr": 0.8807501196861267, "loss_mode_switch": 0.0, "loss_total": 0.10051790624856949, "step": 1489 }, { "batch_size": 4, "epoch": 0.5956, "step": 1489, "tokens_per_device": 3516 }, { "epoch": 0.5956, "loss_ce": 0.34897708892822266, "loss_lvr": 0.9408477544784546, "loss_mode_switch": 0.0, "loss_total": 0.44306185841560364, "step": 1489 }, { "batch_size": 1, "epoch": 0.5956, "step": 1489, "tokens_per_device": 4759 }, { "epoch": 0.5956, "loss_ce": 0.03322906419634819, "loss_lvr": 0.4760627746582031, "loss_mode_switch": 0.0, "loss_total": 0.08083534240722656, "step": 1489 }, { "batch_size": 4, "epoch": 0.5956, "step": 1489, "tokens_per_device": 1460 }, { "epoch": 0.5956, "loss_ce": 0.18460950255393982, "loss_lvr": 0.8277485966682434, "loss_mode_switch": 0.0, "loss_total": 0.2673843502998352, "step": 1489 }, { "epoch": 0.596, "grad_norm": 1.2099841833114624, "learning_rate": 3.703297965313275e-06, "loss": 0.2766, "step": 1490 }, { "batch_size": 4, "epoch": 0.596, "step": 1490, "tokens_per_device": 3908 }, { "epoch": 0.596, "loss_ce": 0.33084750175476074, "loss_lvr": 0.8081768155097961, "loss_mode_switch": 0.0, "loss_total": 0.4116652011871338, "step": 1490 }, { "batch_size": 4, "epoch": 0.596, "step": 1490, "tokens_per_device": 4320 }, { "epoch": 0.596, "loss_ce": 0.04525842145085335, "loss_lvr": 0.8958607912063599, "loss_mode_switch": 0.0, "loss_total": 0.13484449684619904, "step": 1490 }, { "batch_size": 4, "epoch": 0.596, "step": 1490, "tokens_per_device": 3828 }, { "epoch": 0.596, "loss_ce": 0.41603022813796997, "loss_lvr": 1.2180181741714478, "loss_mode_switch": 0.0, "loss_total": 0.5378320217132568, "step": 1490 }, { "batch_size": 1, "epoch": 0.596, "step": 1490, "tokens_per_device": 5173 }, { "epoch": 0.596, "loss_ce": 0.004593001212924719, "loss_lvr": 0.7741101384162903, "loss_mode_switch": 0.0, "loss_total": 0.08200401812791824, "step": 1490 }, { "batch_size": 4, "epoch": 0.596, "step": 1490, "tokens_per_device": 4496 }, { "epoch": 0.596, "loss_ce": 0.489521324634552, "loss_lvr": 1.0523788928985596, "loss_mode_switch": 0.0, "loss_total": 0.5947592258453369, "step": 1490 }, { "batch_size": 4, "epoch": 0.596, "step": 1490, "tokens_per_device": 2608 }, { "epoch": 0.596, "loss_ce": 0.2210252285003662, "loss_lvr": 0.6229987740516663, "loss_mode_switch": 0.0, "loss_total": 0.28332510590553284, "step": 1490 }, { "batch_size": 4, "epoch": 0.596, "step": 1490, "tokens_per_device": 4344 }, { "epoch": 0.596, "loss_ce": 0.23566168546676636, "loss_lvr": 1.0504494905471802, "loss_mode_switch": 0.0, "loss_total": 0.34070664644241333, "step": 1490 }, { "batch_size": 4, "epoch": 0.596, "step": 1490, "tokens_per_device": 4368 }, { "epoch": 0.596, "loss_ce": 0.002542334608733654, "loss_lvr": 0.7701054811477661, "loss_mode_switch": 0.0, "loss_total": 0.07955288141965866, "step": 1490 }, { "epoch": 0.5964, "grad_norm": 1.247204065322876, "learning_rate": 3.6970431662789534e-06, "loss": 0.2928, "step": 1491 }, { "batch_size": 4, "epoch": 0.5964, "step": 1491, "tokens_per_device": 4524 }, { "epoch": 0.5964, "loss_ce": 0.08361049741506577, "loss_lvr": 0.9006329774856567, "loss_mode_switch": 0.0, "loss_total": 0.17367379367351532, "step": 1491 }, { "batch_size": 4, "epoch": 0.5964, "step": 1491, "tokens_per_device": 4280 }, { "epoch": 0.5964, "loss_ce": 0.029289720579981804, "loss_lvr": 1.0074589252471924, "loss_mode_switch": 0.0, "loss_total": 0.1300356090068817, "step": 1491 }, { "batch_size": 4, "epoch": 0.5964, "step": 1491, "tokens_per_device": 10664 }, { "epoch": 0.5964, "loss_ce": 0.19069598615169525, "loss_lvr": 0.6576461791992188, "loss_mode_switch": 0.0, "loss_total": 0.25646060705184937, "step": 1491 }, { "batch_size": 4, "epoch": 0.5964, "step": 1491, "tokens_per_device": 4800 }, { "epoch": 0.5964, "loss_ce": 0.04532251134514809, "loss_lvr": 0.8797933459281921, "loss_mode_switch": 0.0, "loss_total": 0.13330185413360596, "step": 1491 }, { "batch_size": 4, "epoch": 0.5964, "step": 1491, "tokens_per_device": 16052 }, { "epoch": 0.5964, "loss_ce": 0.25744402408599854, "loss_lvr": 1.103087306022644, "loss_mode_switch": 0.0, "loss_total": 0.3677527606487274, "step": 1491 }, { "batch_size": 4, "epoch": 0.5964, "step": 1491, "tokens_per_device": 7084 }, { "epoch": 0.5964, "loss_ce": 0.017724020406603813, "loss_lvr": 0.8287311792373657, "loss_mode_switch": 0.0, "loss_total": 0.10059714317321777, "step": 1491 }, { "batch_size": 4, "epoch": 0.5964, "step": 1491, "tokens_per_device": 3796 }, { "epoch": 0.5964, "loss_ce": 0.07944422215223312, "loss_lvr": 0.962442934513092, "loss_mode_switch": 0.0, "loss_total": 0.17568852007389069, "step": 1491 }, { "batch_size": 4, "epoch": 0.5964, "step": 1491, "tokens_per_device": 3052 }, { "epoch": 0.5964, "loss_ce": 0.024356868118047714, "loss_lvr": 0.7340366840362549, "loss_mode_switch": 0.0, "loss_total": 0.09776054322719574, "step": 1491 }, { "epoch": 0.5968, "grad_norm": 1.2165257930755615, "learning_rate": 3.69079055403034e-06, "loss": 0.2673, "step": 1492 }, { "batch_size": 4, "epoch": 0.5968, "step": 1492, "tokens_per_device": 15864 }, { "epoch": 0.5968, "loss_ce": 0.14224080741405487, "loss_lvr": 0.6964166760444641, "loss_mode_switch": 0.0, "loss_total": 0.21188247203826904, "step": 1492 }, { "batch_size": 4, "epoch": 0.5968, "step": 1492, "tokens_per_device": 1468 }, { "epoch": 0.5968, "loss_ce": 0.1042596846818924, "loss_lvr": 0.9014946222305298, "loss_mode_switch": 0.0, "loss_total": 0.19440914690494537, "step": 1492 }, { "batch_size": 1, "epoch": 0.5968, "step": 1492, "tokens_per_device": 4754 }, { "epoch": 0.5968, "loss_ce": 0.012011980637907982, "loss_lvr": 0.26146307587623596, "loss_mode_switch": 0.0, "loss_total": 0.03815829008817673, "step": 1492 }, { "batch_size": 4, "epoch": 0.5968, "step": 1492, "tokens_per_device": 1528 }, { "epoch": 0.5968, "loss_ce": 0.06515181064605713, "loss_lvr": 1.0123034715652466, "loss_mode_switch": 0.0, "loss_total": 0.16638216376304626, "step": 1492 }, { "batch_size": 4, "epoch": 0.5968, "step": 1492, "tokens_per_device": 4612 }, { "epoch": 0.5968, "loss_ce": 0.024584608152508736, "loss_lvr": 0.9962470531463623, "loss_mode_switch": 0.0, "loss_total": 0.12420931458473206, "step": 1492 }, { "batch_size": 1, "epoch": 0.5968, "step": 1492, "tokens_per_device": 4898 }, { "epoch": 0.5968, "loss_ce": 0.01436538528650999, "loss_lvr": 0.5550430417060852, "loss_mode_switch": 0.0, "loss_total": 0.06986968964338303, "step": 1492 }, { "batch_size": 4, "epoch": 0.5968, "step": 1492, "tokens_per_device": 3644 }, { "epoch": 0.5968, "loss_ce": 0.4931815564632416, "loss_lvr": 0.8422006964683533, "loss_mode_switch": 0.0, "loss_total": 0.5774016380310059, "step": 1492 }, { "batch_size": 4, "epoch": 0.5968, "step": 1492, "tokens_per_device": 3672 }, { "epoch": 0.5968, "loss_ce": 0.8159070014953613, "loss_lvr": 0.9597806930541992, "loss_mode_switch": 0.0, "loss_total": 0.9118850827217102, "step": 1492 }, { "epoch": 0.5972, "grad_norm": 1.3299634456634521, "learning_rate": 3.6845401390613528e-06, "loss": 0.2694, "step": 1493 }, { "batch_size": 1, "epoch": 0.5972, "step": 1493, "tokens_per_device": 4602 }, { "epoch": 0.5972, "loss_ce": 0.005288552492856979, "loss_lvr": 0.5166312456130981, "loss_mode_switch": 0.0, "loss_total": 0.05695167928934097, "step": 1493 }, { "batch_size": 4, "epoch": 0.5972, "step": 1493, "tokens_per_device": 1884 }, { "epoch": 0.5972, "loss_ce": 0.08751833438873291, "loss_lvr": 0.9638084769248962, "loss_mode_switch": 0.0, "loss_total": 0.1838991940021515, "step": 1493 }, { "batch_size": 4, "epoch": 0.5972, "step": 1493, "tokens_per_device": 3804 }, { "epoch": 0.5972, "loss_ce": 0.38040220737457275, "loss_lvr": 0.9407005906105042, "loss_mode_switch": 0.0, "loss_total": 0.4744722843170166, "step": 1493 }, { "batch_size": 4, "epoch": 0.5972, "step": 1493, "tokens_per_device": 1656 }, { "epoch": 0.5972, "loss_ce": 0.415910005569458, "loss_lvr": 1.022652506828308, "loss_mode_switch": 0.0, "loss_total": 0.5181752443313599, "step": 1493 }, { "batch_size": 4, "epoch": 0.5972, "step": 1493, "tokens_per_device": 4240 }, { "epoch": 0.5972, "loss_ce": 0.5203709006309509, "loss_lvr": 1.282745599746704, "loss_mode_switch": 0.0, "loss_total": 0.6486454606056213, "step": 1493 }, { "batch_size": 4, "epoch": 0.5972, "step": 1493, "tokens_per_device": 4248 }, { "epoch": 0.5972, "loss_ce": 0.25008511543273926, "loss_lvr": 1.0055428743362427, "loss_mode_switch": 0.0, "loss_total": 0.3506394028663635, "step": 1493 }, { "batch_size": 1, "epoch": 0.5972, "step": 1493, "tokens_per_device": 5114 }, { "epoch": 0.5972, "loss_ce": 0.07351886481046677, "loss_lvr": 0.48018592596054077, "loss_mode_switch": 0.0, "loss_total": 0.1215374618768692, "step": 1493 }, { "batch_size": 4, "epoch": 0.5972, "step": 1493, "tokens_per_device": 4324 }, { "epoch": 0.5972, "loss_ce": 0.041983719915151596, "loss_lvr": 0.8771504759788513, "loss_mode_switch": 0.0, "loss_total": 0.1296987682580948, "step": 1493 }, { "epoch": 0.5976, "grad_norm": 1.757830023765564, "learning_rate": 3.678291931862221e-06, "loss": 0.2547, "step": 1494 }, { "batch_size": 4, "epoch": 0.5976, "step": 1494, "tokens_per_device": 4084 }, { "epoch": 0.5976, "loss_ce": 0.1078656017780304, "loss_lvr": 0.8945538997650146, "loss_mode_switch": 0.0, "loss_total": 0.19732099771499634, "step": 1494 }, { "batch_size": 4, "epoch": 0.5976, "step": 1494, "tokens_per_device": 4204 }, { "epoch": 0.5976, "loss_ce": 0.11981526017189026, "loss_lvr": 0.855717658996582, "loss_mode_switch": 0.0, "loss_total": 0.20538702607154846, "step": 1494 }, { "batch_size": 4, "epoch": 0.5976, "step": 1494, "tokens_per_device": 9084 }, { "epoch": 0.5976, "loss_ce": 0.4474492073059082, "loss_lvr": 0.8339797854423523, "loss_mode_switch": 0.0, "loss_total": 0.5308471918106079, "step": 1494 }, { "batch_size": 4, "epoch": 0.5976, "step": 1494, "tokens_per_device": 4392 }, { "epoch": 0.5976, "loss_ce": 0.39617204666137695, "loss_lvr": 0.9335713386535645, "loss_mode_switch": 0.0, "loss_total": 0.48952919244766235, "step": 1494 }, { "batch_size": 4, "epoch": 0.5976, "step": 1494, "tokens_per_device": 3960 }, { "epoch": 0.5976, "loss_ce": 0.5098715424537659, "loss_lvr": 0.8474836945533752, "loss_mode_switch": 0.0, "loss_total": 0.5946199297904968, "step": 1494 }, { "batch_size": 4, "epoch": 0.5976, "step": 1494, "tokens_per_device": 3820 }, { "epoch": 0.5976, "loss_ce": 0.3467310965061188, "loss_lvr": 0.8936924338340759, "loss_mode_switch": 0.0, "loss_total": 0.4361003339290619, "step": 1494 }, { "batch_size": 4, "epoch": 0.5976, "step": 1494, "tokens_per_device": 11240 }, { "epoch": 0.5976, "loss_ce": 0.07035551220178604, "loss_lvr": 0.7776821255683899, "loss_mode_switch": 0.0, "loss_total": 0.14812372624874115, "step": 1494 }, { "batch_size": 4, "epoch": 0.5976, "step": 1494, "tokens_per_device": 1384 }, { "epoch": 0.5976, "loss_ce": 0.46274444460868835, "loss_lvr": 1.1998052597045898, "loss_mode_switch": 0.0, "loss_total": 0.5827249884605408, "step": 1494 }, { "epoch": 0.598, "grad_norm": 1.417160153388977, "learning_rate": 3.6720459429194743e-06, "loss": 0.2895, "step": 1495 }, { "batch_size": 1, "epoch": 0.598, "step": 1495, "tokens_per_device": 4884 }, { "epoch": 0.598, "loss_ce": 0.004990146961063147, "loss_lvr": 0.16349253058433533, "loss_mode_switch": 0.0, "loss_total": 0.021339401602745056, "step": 1495 }, { "batch_size": 4, "epoch": 0.598, "step": 1495, "tokens_per_device": 1964 }, { "epoch": 0.598, "loss_ce": 0.2541346848011017, "loss_lvr": 0.9258694052696228, "loss_mode_switch": 0.0, "loss_total": 0.3467216193675995, "step": 1495 }, { "batch_size": 4, "epoch": 0.598, "step": 1495, "tokens_per_device": 4404 }, { "epoch": 0.598, "loss_ce": 0.15494389832019806, "loss_lvr": 0.749658465385437, "loss_mode_switch": 0.0, "loss_total": 0.229909747838974, "step": 1495 }, { "batch_size": 1, "epoch": 0.598, "step": 1495, "tokens_per_device": 5191 }, { "epoch": 0.598, "loss_ce": 0.004732957109808922, "loss_lvr": 0.3713820278644562, "loss_mode_switch": 0.0, "loss_total": 0.04187116026878357, "step": 1495 }, { "batch_size": 4, "epoch": 0.598, "step": 1495, "tokens_per_device": 7268 }, { "epoch": 0.598, "loss_ce": 0.14037223160266876, "loss_lvr": 0.8824439644813538, "loss_mode_switch": 0.0, "loss_total": 0.2286166250705719, "step": 1495 }, { "batch_size": 4, "epoch": 0.598, "step": 1495, "tokens_per_device": 3804 }, { "epoch": 0.598, "loss_ce": 0.48467591404914856, "loss_lvr": 0.8378728628158569, "loss_mode_switch": 0.0, "loss_total": 0.5684632062911987, "step": 1495 }, { "batch_size": 1, "epoch": 0.598, "step": 1495, "tokens_per_device": 4881 }, { "epoch": 0.598, "loss_ce": 0.009927441366016865, "loss_lvr": 0.16837388277053833, "loss_mode_switch": 0.0, "loss_total": 0.026764828711748123, "step": 1495 }, { "batch_size": 1, "epoch": 0.598, "step": 1495, "tokens_per_device": 4913 }, { "epoch": 0.598, "loss_ce": 0.036198295652866364, "loss_lvr": 0.5615181922912598, "loss_mode_switch": 0.0, "loss_total": 0.09235011041164398, "step": 1495 }, { "epoch": 0.5984, "grad_norm": 1.2432392835617065, "learning_rate": 3.665802182715913e-06, "loss": 0.2678, "step": 1496 }, { "batch_size": 4, "epoch": 0.5984, "step": 1496, "tokens_per_device": 4352 }, { "epoch": 0.5984, "loss_ce": 0.43527936935424805, "loss_lvr": 0.9165332913398743, "loss_mode_switch": 0.0, "loss_total": 0.5269327163696289, "step": 1496 }, { "batch_size": 4, "epoch": 0.5984, "step": 1496, "tokens_per_device": 2620 }, { "epoch": 0.5984, "loss_ce": 0.31216856837272644, "loss_lvr": 0.8045809864997864, "loss_mode_switch": 0.0, "loss_total": 0.39262667298316956, "step": 1496 }, { "batch_size": 4, "epoch": 0.5984, "step": 1496, "tokens_per_device": 4800 }, { "epoch": 0.5984, "loss_ce": 0.41718044877052307, "loss_lvr": 0.8186231255531311, "loss_mode_switch": 0.0, "loss_total": 0.4990427494049072, "step": 1496 }, { "batch_size": 4, "epoch": 0.5984, "step": 1496, "tokens_per_device": 4276 }, { "epoch": 0.5984, "loss_ce": 0.27453458309173584, "loss_lvr": 0.9037556648254395, "loss_mode_switch": 0.0, "loss_total": 0.36491015553474426, "step": 1496 }, { "batch_size": 4, "epoch": 0.5984, "step": 1496, "tokens_per_device": 4208 }, { "epoch": 0.5984, "loss_ce": 0.10236913710832596, "loss_lvr": 0.7532989382743835, "loss_mode_switch": 0.0, "loss_total": 0.1776990294456482, "step": 1496 }, { "batch_size": 4, "epoch": 0.5984, "step": 1496, "tokens_per_device": 7044 }, { "epoch": 0.5984, "loss_ce": 0.19879841804504395, "loss_lvr": 0.42606067657470703, "loss_mode_switch": 0.0, "loss_total": 0.2414044886827469, "step": 1496 }, { "batch_size": 4, "epoch": 0.5984, "step": 1496, "tokens_per_device": 1352 }, { "epoch": 0.5984, "loss_ce": 0.11089416593313217, "loss_lvr": 0.9320897459983826, "loss_mode_switch": 0.0, "loss_total": 0.20410314202308655, "step": 1496 }, { "batch_size": 4, "epoch": 0.5984, "step": 1496, "tokens_per_device": 4396 }, { "epoch": 0.5984, "loss_ce": 0.27442583441734314, "loss_lvr": 0.7523604035377502, "loss_mode_switch": 0.0, "loss_total": 0.3496618866920471, "step": 1496 }, { "epoch": 0.5988, "grad_norm": 1.3101578950881958, "learning_rate": 3.659560661730599e-06, "loss": 0.2859, "step": 1497 }, { "batch_size": 4, "epoch": 0.5988, "step": 1497, "tokens_per_device": 4256 }, { "epoch": 0.5988, "loss_ce": 0.5534544587135315, "loss_lvr": 1.0873751640319824, "loss_mode_switch": 0.0, "loss_total": 0.6621919870376587, "step": 1497 }, { "batch_size": 4, "epoch": 0.5988, "step": 1497, "tokens_per_device": 5144 }, { "epoch": 0.5988, "loss_ce": 0.2921978235244751, "loss_lvr": 0.7732138633728027, "loss_mode_switch": 0.0, "loss_total": 0.3695192039012909, "step": 1497 }, { "batch_size": 4, "epoch": 0.5988, "step": 1497, "tokens_per_device": 3860 }, { "epoch": 0.5988, "loss_ce": 0.1335846185684204, "loss_lvr": 1.0094069242477417, "loss_mode_switch": 0.0, "loss_total": 0.23452532291412354, "step": 1497 }, { "batch_size": 4, "epoch": 0.5988, "step": 1497, "tokens_per_device": 3564 }, { "epoch": 0.5988, "loss_ce": 0.14215949177742004, "loss_lvr": 1.2101784944534302, "loss_mode_switch": 0.0, "loss_total": 0.2631773352622986, "step": 1497 }, { "batch_size": 4, "epoch": 0.5988, "step": 1497, "tokens_per_device": 4256 }, { "epoch": 0.5988, "loss_ce": 0.07100406289100647, "loss_lvr": 0.8485876321792603, "loss_mode_switch": 0.0, "loss_total": 0.15586283802986145, "step": 1497 }, { "batch_size": 4, "epoch": 0.5988, "step": 1497, "tokens_per_device": 1800 }, { "epoch": 0.5988, "loss_ce": 0.3325049579143524, "loss_lvr": 1.512639045715332, "loss_mode_switch": 0.0, "loss_total": 0.48376888036727905, "step": 1497 }, { "batch_size": 1, "epoch": 0.5988, "step": 1497, "tokens_per_device": 4757 }, { "epoch": 0.5988, "loss_ce": 0.12416303157806396, "loss_lvr": 0.22436657547950745, "loss_mode_switch": 0.0, "loss_total": 0.1465996950864792, "step": 1497 }, { "batch_size": 1, "epoch": 0.5988, "step": 1497, "tokens_per_device": 4910 }, { "epoch": 0.5988, "loss_ce": 0.0028336660470813513, "loss_lvr": 0.1910731941461563, "loss_mode_switch": 0.0, "loss_total": 0.021940985694527626, "step": 1497 }, { "epoch": 0.5992, "grad_norm": 1.1946437358856201, "learning_rate": 3.6533213904388377e-06, "loss": 0.2814, "step": 1498 }, { "batch_size": 4, "epoch": 0.5992, "step": 1498, "tokens_per_device": 5868 }, { "epoch": 0.5992, "loss_ce": 0.2839106321334839, "loss_lvr": 0.9023193120956421, "loss_mode_switch": 0.0, "loss_total": 0.3741425573825836, "step": 1498 }, { "batch_size": 4, "epoch": 0.5992, "step": 1498, "tokens_per_device": 1420 }, { "epoch": 0.5992, "loss_ce": 0.12761478126049042, "loss_lvr": 1.0272860527038574, "loss_mode_switch": 0.0, "loss_total": 0.23034338653087616, "step": 1498 }, { "batch_size": 4, "epoch": 0.5992, "step": 1498, "tokens_per_device": 4672 }, { "epoch": 0.5992, "loss_ce": 0.18559697270393372, "loss_lvr": 0.5455700159072876, "loss_mode_switch": 0.0, "loss_total": 0.240153968334198, "step": 1498 }, { "batch_size": 4, "epoch": 0.5992, "step": 1498, "tokens_per_device": 4196 }, { "epoch": 0.5992, "loss_ce": 0.15289050340652466, "loss_lvr": 1.098798155784607, "loss_mode_switch": 0.0, "loss_total": 0.26277032494544983, "step": 1498 }, { "batch_size": 4, "epoch": 0.5992, "step": 1498, "tokens_per_device": 5056 }, { "epoch": 0.5992, "loss_ce": 0.14189013838768005, "loss_lvr": 0.8824884295463562, "loss_mode_switch": 0.0, "loss_total": 0.23013898730278015, "step": 1498 }, { "batch_size": 4, "epoch": 0.5992, "step": 1498, "tokens_per_device": 3868 }, { "epoch": 0.5992, "loss_ce": 0.4198715090751648, "loss_lvr": 0.9365690350532532, "loss_mode_switch": 0.0, "loss_total": 0.5135284066200256, "step": 1498 }, { "batch_size": 4, "epoch": 0.5992, "step": 1498, "tokens_per_device": 2624 }, { "epoch": 0.5992, "loss_ce": 0.4138944149017334, "loss_lvr": 0.9373458623886108, "loss_mode_switch": 0.0, "loss_total": 0.5076289772987366, "step": 1498 }, { "batch_size": 4, "epoch": 0.5992, "step": 1498, "tokens_per_device": 5680 }, { "epoch": 0.5992, "loss_ce": 0.03302301466464996, "loss_lvr": 0.6933820843696594, "loss_mode_switch": 0.0, "loss_total": 0.10236122459173203, "step": 1498 }, { "epoch": 0.5996, "grad_norm": 1.564774990081787, "learning_rate": 3.6470843793121536e-06, "loss": 0.2892, "step": 1499 }, { "batch_size": 4, "epoch": 0.5996, "step": 1499, "tokens_per_device": 9040 }, { "epoch": 0.5996, "loss_ce": 0.3007604479789734, "loss_lvr": 0.8704776167869568, "loss_mode_switch": 0.0, "loss_total": 0.3878082036972046, "step": 1499 }, { "batch_size": 4, "epoch": 0.5996, "step": 1499, "tokens_per_device": 1620 }, { "epoch": 0.5996, "loss_ce": 0.701969563961029, "loss_lvr": 0.8568406701087952, "loss_mode_switch": 0.0, "loss_total": 0.7876536250114441, "step": 1499 }, { "batch_size": 1, "epoch": 0.5996, "step": 1499, "tokens_per_device": 5115 }, { "epoch": 0.5996, "loss_ce": 0.027569644153118134, "loss_lvr": 0.3594135642051697, "loss_mode_switch": 0.0, "loss_total": 0.06351099908351898, "step": 1499 }, { "batch_size": 1, "epoch": 0.5996, "step": 1499, "tokens_per_device": 4862 }, { "epoch": 0.5996, "loss_ce": 0.03518867865204811, "loss_lvr": 0.38746121525764465, "loss_mode_switch": 0.0, "loss_total": 0.07393480092287064, "step": 1499 }, { "batch_size": 4, "epoch": 0.5996, "step": 1499, "tokens_per_device": 4316 }, { "epoch": 0.5996, "loss_ce": 0.247030571103096, "loss_lvr": 0.9408982992172241, "loss_mode_switch": 0.0, "loss_total": 0.3411203920841217, "step": 1499 }, { "batch_size": 4, "epoch": 0.5996, "step": 1499, "tokens_per_device": 6844 }, { "epoch": 0.5996, "loss_ce": 0.25908541679382324, "loss_lvr": 0.7886351346969604, "loss_mode_switch": 0.0, "loss_total": 0.33794891834259033, "step": 1499 }, { "batch_size": 1, "epoch": 0.5996, "step": 1499, "tokens_per_device": 5036 }, { "epoch": 0.5996, "loss_ce": 0.014203074388206005, "loss_lvr": 0.39943185448646545, "loss_mode_switch": 0.0, "loss_total": 0.054146260023117065, "step": 1499 }, { "batch_size": 4, "epoch": 0.5996, "step": 1499, "tokens_per_device": 2564 }, { "epoch": 0.5996, "loss_ce": 0.0389646477997303, "loss_lvr": 1.1732059717178345, "loss_mode_switch": 0.0, "loss_total": 0.15628524124622345, "step": 1499 }, { "epoch": 0.6, "grad_norm": 1.3254021406173706, "learning_rate": 3.6408496388182857e-06, "loss": 0.2961, "step": 1500 }, { "batch_size": 1, "epoch": 0.6, "step": 1500, "tokens_per_device": 4510 }, { "epoch": 0.6, "loss_ce": 0.05788334459066391, "loss_lvr": 0.22542530298233032, "loss_mode_switch": 0.0, "loss_total": 0.08042587339878082, "step": 1500 }, { "batch_size": 4, "epoch": 0.6, "step": 1500, "tokens_per_device": 5784 }, { "epoch": 0.6, "loss_ce": 0.18444722890853882, "loss_lvr": 0.8325763940811157, "loss_mode_switch": 0.0, "loss_total": 0.26770487427711487, "step": 1500 }, { "batch_size": 1, "epoch": 0.6, "step": 1500, "tokens_per_device": 4860 }, { "epoch": 0.6, "loss_ce": 0.016585955396294594, "loss_lvr": 0.3761437237262726, "loss_mode_switch": 0.0, "loss_total": 0.05420032888650894, "step": 1500 }, { "batch_size": 4, "epoch": 0.6, "step": 1500, "tokens_per_device": 3860 }, { "epoch": 0.6, "loss_ce": 0.39712655544281006, "loss_lvr": 0.9189769625663757, "loss_mode_switch": 0.0, "loss_total": 0.48902425169944763, "step": 1500 }, { "batch_size": 1, "epoch": 0.6, "step": 1500, "tokens_per_device": 4812 }, { "epoch": 0.6, "loss_ce": 0.0112870829179883, "loss_lvr": 0.4093188941478729, "loss_mode_switch": 0.0, "loss_total": 0.0522189736366272, "step": 1500 }, { "batch_size": 4, "epoch": 0.6, "step": 1500, "tokens_per_device": 6432 }, { "epoch": 0.6, "loss_ce": 0.07514992356300354, "loss_lvr": 0.6003081202507019, "loss_mode_switch": 0.0, "loss_total": 0.1351807415485382, "step": 1500 }, { "batch_size": 4, "epoch": 0.6, "step": 1500, "tokens_per_device": 5024 }, { "epoch": 0.6, "loss_ce": 0.013446486555039883, "loss_lvr": 0.7520467638969421, "loss_mode_switch": 0.0, "loss_total": 0.08865116536617279, "step": 1500 }, { "batch_size": 4, "epoch": 0.6, "step": 1500, "tokens_per_device": 1588 }, { "epoch": 0.6, "loss_ce": 0.28381821513175964, "loss_lvr": 0.9277377128601074, "loss_mode_switch": 0.0, "loss_total": 0.3765919804573059, "step": 1500 }, { "epoch": 0.6004, "grad_norm": 1.3302794694900513, "learning_rate": 3.634617179421157e-06, "loss": 0.2932, "step": 1501 }, { "batch_size": 4, "epoch": 0.6004, "step": 1501, "tokens_per_device": 10000 }, { "epoch": 0.6004, "loss_ce": 0.04460962861776352, "loss_lvr": 0.6458788514137268, "loss_mode_switch": 0.0, "loss_total": 0.10919751226902008, "step": 1501 }, { "batch_size": 1, "epoch": 0.6004, "step": 1501, "tokens_per_device": 4864 }, { "epoch": 0.6004, "loss_ce": 0.0530654601752758, "loss_lvr": 0.2413574457168579, "loss_mode_switch": 0.0, "loss_total": 0.07720120251178741, "step": 1501 }, { "batch_size": 4, "epoch": 0.6004, "step": 1501, "tokens_per_device": 3432 }, { "epoch": 0.6004, "loss_ce": 0.30446937680244446, "loss_lvr": 1.0805797576904297, "loss_mode_switch": 0.0, "loss_total": 0.4125273525714874, "step": 1501 }, { "batch_size": 4, "epoch": 0.6004, "step": 1501, "tokens_per_device": 1336 }, { "epoch": 0.6004, "loss_ce": 0.06498134881258011, "loss_lvr": 0.8338924646377563, "loss_mode_switch": 0.0, "loss_total": 0.14837059378623962, "step": 1501 }, { "batch_size": 4, "epoch": 0.6004, "step": 1501, "tokens_per_device": 4196 }, { "epoch": 0.6004, "loss_ce": 0.09512756019830704, "loss_lvr": 1.0246623754501343, "loss_mode_switch": 0.0, "loss_total": 0.1975938081741333, "step": 1501 }, { "batch_size": 4, "epoch": 0.6004, "step": 1501, "tokens_per_device": 4184 }, { "epoch": 0.6004, "loss_ce": 0.09726425260305405, "loss_lvr": 0.9668100476264954, "loss_mode_switch": 0.0, "loss_total": 0.1939452588558197, "step": 1501 }, { "batch_size": 4, "epoch": 0.6004, "step": 1501, "tokens_per_device": 2708 }, { "epoch": 0.6004, "loss_ce": 0.2539588510990143, "loss_lvr": 0.6017840504646301, "loss_mode_switch": 0.0, "loss_total": 0.3141372501850128, "step": 1501 }, { "batch_size": 4, "epoch": 0.6004, "step": 1501, "tokens_per_device": 3504 }, { "epoch": 0.6004, "loss_ce": 0.199492946267128, "loss_lvr": 0.8975294828414917, "loss_mode_switch": 0.0, "loss_total": 0.2892459034919739, "step": 1501 }, { "epoch": 0.6008, "grad_norm": 1.5812819004058838, "learning_rate": 3.628387011580864e-06, "loss": 0.3008, "step": 1502 }, { "batch_size": 1, "epoch": 0.6008, "step": 1502, "tokens_per_device": 5177 }, { "epoch": 0.6008, "loss_ce": 0.0037312584463506937, "loss_lvr": 0.2781204581260681, "loss_mode_switch": 0.0, "loss_total": 0.0315433070063591, "step": 1502 }, { "batch_size": 4, "epoch": 0.6008, "step": 1502, "tokens_per_device": 1712 }, { "epoch": 0.6008, "loss_ce": 0.5261144638061523, "loss_lvr": 0.8437559604644775, "loss_mode_switch": 0.0, "loss_total": 0.610490083694458, "step": 1502 }, { "batch_size": 1, "epoch": 0.6008, "step": 1502, "tokens_per_device": 4842 }, { "epoch": 0.6008, "loss_ce": 0.22820046544075012, "loss_lvr": 0.7720909714698792, "loss_mode_switch": 0.0, "loss_total": 0.3054095506668091, "step": 1502 }, { "batch_size": 4, "epoch": 0.6008, "step": 1502, "tokens_per_device": 4296 }, { "epoch": 0.6008, "loss_ce": 0.659723699092865, "loss_lvr": 0.9058859944343567, "loss_mode_switch": 0.0, "loss_total": 0.750312328338623, "step": 1502 }, { "batch_size": 4, "epoch": 0.6008, "step": 1502, "tokens_per_device": 1684 }, { "epoch": 0.6008, "loss_ce": 0.5928065776824951, "loss_lvr": 0.911286473274231, "loss_mode_switch": 0.0, "loss_total": 0.6839352250099182, "step": 1502 }, { "batch_size": 4, "epoch": 0.6008, "step": 1502, "tokens_per_device": 13020 }, { "epoch": 0.6008, "loss_ce": 0.10162728279829025, "loss_lvr": 1.064672589302063, "loss_mode_switch": 0.0, "loss_total": 0.2080945372581482, "step": 1502 }, { "batch_size": 1, "epoch": 0.6008, "step": 1502, "tokens_per_device": 5109 }, { "epoch": 0.6008, "loss_ce": 0.002038421109318733, "loss_lvr": 0.4476700723171234, "loss_mode_switch": 0.0, "loss_total": 0.046805426478385925, "step": 1502 }, { "batch_size": 1, "epoch": 0.6008, "step": 1502, "tokens_per_device": 5060 }, { "epoch": 0.6008, "loss_ce": 0.0009414848173037171, "loss_lvr": 0.5196599364280701, "loss_mode_switch": 0.0, "loss_total": 0.05290747806429863, "step": 1502 }, { "epoch": 0.6012, "grad_norm": 1.4668313264846802, "learning_rate": 3.622159145753654e-06, "loss": 0.2932, "step": 1503 }, { "batch_size": 4, "epoch": 0.6012, "step": 1503, "tokens_per_device": 11036 }, { "epoch": 0.6012, "loss_ce": 0.03919452428817749, "loss_lvr": 0.8362011313438416, "loss_mode_switch": 0.0, "loss_total": 0.12281464040279388, "step": 1503 }, { "batch_size": 4, "epoch": 0.6012, "step": 1503, "tokens_per_device": 1280 }, { "epoch": 0.6012, "loss_ce": 0.4095490276813507, "loss_lvr": 0.9646268486976624, "loss_mode_switch": 0.0, "loss_total": 0.5060117244720459, "step": 1503 }, { "batch_size": 1, "epoch": 0.6012, "step": 1503, "tokens_per_device": 5117 }, { "epoch": 0.6012, "loss_ce": 0.0026044801343232393, "loss_lvr": 0.22779296338558197, "loss_mode_switch": 0.0, "loss_total": 0.025383777916431427, "step": 1503 }, { "batch_size": 1, "epoch": 0.6012, "step": 1503, "tokens_per_device": 4897 }, { "epoch": 0.6012, "loss_ce": 0.0009540350292809308, "loss_lvr": 0.4481397569179535, "loss_mode_switch": 0.0, "loss_total": 0.04576801136136055, "step": 1503 }, { "batch_size": 1, "epoch": 0.6012, "step": 1503, "tokens_per_device": 5191 }, { "epoch": 0.6012, "loss_ce": 0.04289735481142998, "loss_lvr": 0.6374521851539612, "loss_mode_switch": 0.0, "loss_total": 0.10664257407188416, "step": 1503 }, { "batch_size": 1, "epoch": 0.6012, "step": 1503, "tokens_per_device": 4874 }, { "epoch": 0.6012, "loss_ce": 0.043766818940639496, "loss_lvr": 0.6520823836326599, "loss_mode_switch": 0.0, "loss_total": 0.10897506028413773, "step": 1503 }, { "batch_size": 1, "epoch": 0.6012, "step": 1503, "tokens_per_device": 5056 }, { "epoch": 0.6012, "loss_ce": 0.22152331471443176, "loss_lvr": 0.6206157207489014, "loss_mode_switch": 0.0, "loss_total": 0.2835848927497864, "step": 1503 }, { "batch_size": 1, "epoch": 0.6012, "step": 1503, "tokens_per_device": 5166 }, { "epoch": 0.6012, "loss_ce": 0.022018274292349815, "loss_lvr": 0.41918739676475525, "loss_mode_switch": 0.0, "loss_total": 0.06393701583147049, "step": 1503 }, { "epoch": 0.6016, "grad_norm": 1.2011748552322388, "learning_rate": 3.615933592391913e-06, "loss": 0.2659, "step": 1504 }, { "batch_size": 4, "epoch": 0.6016, "step": 1504, "tokens_per_device": 2560 }, { "epoch": 0.6016, "loss_ce": 0.39586955308914185, "loss_lvr": 0.9790471196174622, "loss_mode_switch": 0.0, "loss_total": 0.49377426505088806, "step": 1504 }, { "batch_size": 4, "epoch": 0.6016, "step": 1504, "tokens_per_device": 4368 }, { "epoch": 0.6016, "loss_ce": 0.008207581005990505, "loss_lvr": 0.9633459448814392, "loss_mode_switch": 0.0, "loss_total": 0.10454218089580536, "step": 1504 }, { "batch_size": 4, "epoch": 0.6016, "step": 1504, "tokens_per_device": 3812 }, { "epoch": 0.6016, "loss_ce": 0.4096299111843109, "loss_lvr": 0.813529372215271, "loss_mode_switch": 0.0, "loss_total": 0.49098286032676697, "step": 1504 }, { "batch_size": 4, "epoch": 0.6016, "step": 1504, "tokens_per_device": 1460 }, { "epoch": 0.6016, "loss_ce": 0.0769076868891716, "loss_lvr": 0.9871770739555359, "loss_mode_switch": 0.0, "loss_total": 0.17562539875507355, "step": 1504 }, { "batch_size": 1, "epoch": 0.6016, "step": 1504, "tokens_per_device": 5160 }, { "epoch": 0.6016, "loss_ce": 0.06934957206249237, "loss_lvr": 0.4709221124649048, "loss_mode_switch": 0.0, "loss_total": 0.11644178628921509, "step": 1504 }, { "batch_size": 4, "epoch": 0.6016, "step": 1504, "tokens_per_device": 5872 }, { "epoch": 0.6016, "loss_ce": 0.20976312458515167, "loss_lvr": 0.9265773892402649, "loss_mode_switch": 0.0, "loss_total": 0.30242085456848145, "step": 1504 }, { "batch_size": 4, "epoch": 0.6016, "step": 1504, "tokens_per_device": 4692 }, { "epoch": 0.6016, "loss_ce": 0.10276512801647186, "loss_lvr": 1.0202807188034058, "loss_mode_switch": 0.0, "loss_total": 0.20479319989681244, "step": 1504 }, { "batch_size": 4, "epoch": 0.6016, "step": 1504, "tokens_per_device": 4216 }, { "epoch": 0.6016, "loss_ce": 0.002508596982806921, "loss_lvr": 1.1422713994979858, "loss_mode_switch": 0.0, "loss_total": 0.11673573404550552, "step": 1504 }, { "epoch": 0.602, "grad_norm": 1.9843934774398804, "learning_rate": 3.6097103619441505e-06, "loss": 0.272, "step": 1505 }, { "batch_size": 4, "epoch": 0.602, "step": 1505, "tokens_per_device": 4840 }, { "epoch": 0.602, "loss_ce": 0.17766202986240387, "loss_lvr": 0.8090188503265381, "loss_mode_switch": 0.0, "loss_total": 0.25856390595436096, "step": 1505 }, { "batch_size": 4, "epoch": 0.602, "step": 1505, "tokens_per_device": 10380 }, { "epoch": 0.602, "loss_ce": 0.16507349908351898, "loss_lvr": 0.9113309383392334, "loss_mode_switch": 0.0, "loss_total": 0.25620660185813904, "step": 1505 }, { "batch_size": 4, "epoch": 0.602, "step": 1505, "tokens_per_device": 2592 }, { "epoch": 0.602, "loss_ce": 0.22149652242660522, "loss_lvr": 0.7639297842979431, "loss_mode_switch": 0.0, "loss_total": 0.29788950085639954, "step": 1505 }, { "batch_size": 1, "epoch": 0.602, "step": 1505, "tokens_per_device": 5178 }, { "epoch": 0.602, "loss_ce": 0.18524964153766632, "loss_lvr": 0.3008349537849426, "loss_mode_switch": 0.0, "loss_total": 0.21533313393592834, "step": 1505 }, { "batch_size": 1, "epoch": 0.602, "step": 1505, "tokens_per_device": 4932 }, { "epoch": 0.602, "loss_ce": 0.0012443209998309612, "loss_lvr": 0.41706594824790955, "loss_mode_switch": 0.0, "loss_total": 0.04295091703534126, "step": 1505 }, { "batch_size": 1, "epoch": 0.602, "step": 1505, "tokens_per_device": 6468 }, { "epoch": 0.602, "loss_ce": 0.031076936051249504, "loss_lvr": 0.5105559825897217, "loss_mode_switch": 0.0, "loss_total": 0.08213253319263458, "step": 1505 }, { "batch_size": 1, "epoch": 0.602, "step": 1505, "tokens_per_device": 4886 }, { "epoch": 0.602, "loss_ce": 0.4433906078338623, "loss_lvr": 0.21895776689052582, "loss_mode_switch": 0.0, "loss_total": 0.46528637409210205, "step": 1505 }, { "batch_size": 4, "epoch": 0.602, "step": 1505, "tokens_per_device": 4072 }, { "epoch": 0.602, "loss_ce": 0.23240505158901215, "loss_lvr": 1.4429130554199219, "loss_mode_switch": 0.0, "loss_total": 0.3766963481903076, "step": 1505 }, { "epoch": 0.6024, "grad_norm": 1.269065022468567, "learning_rate": 3.6034894648549677e-06, "loss": 0.2665, "step": 1506 }, { "batch_size": 1, "epoch": 0.6024, "step": 1506, "tokens_per_device": 5212 }, { "epoch": 0.6024, "loss_ce": 0.022611312568187714, "loss_lvr": 0.2193755805492401, "loss_mode_switch": 0.0, "loss_total": 0.044548869132995605, "step": 1506 }, { "batch_size": 4, "epoch": 0.6024, "step": 1506, "tokens_per_device": 4264 }, { "epoch": 0.6024, "loss_ce": 0.12731195986270905, "loss_lvr": 0.810647189617157, "loss_mode_switch": 0.0, "loss_total": 0.2083766758441925, "step": 1506 }, { "batch_size": 1, "epoch": 0.6024, "step": 1506, "tokens_per_device": 4741 }, { "epoch": 0.6024, "loss_ce": 0.000498843495734036, "loss_lvr": 0.3680685758590698, "loss_mode_switch": 0.0, "loss_total": 0.03730570152401924, "step": 1506 }, { "batch_size": 4, "epoch": 0.6024, "step": 1506, "tokens_per_device": 6008 }, { "epoch": 0.6024, "loss_ce": 0.18056224286556244, "loss_lvr": 0.7433336973190308, "loss_mode_switch": 0.0, "loss_total": 0.2548956274986267, "step": 1506 }, { "batch_size": 4, "epoch": 0.6024, "step": 1506, "tokens_per_device": 4364 }, { "epoch": 0.6024, "loss_ce": 0.009907185100018978, "loss_lvr": 0.7052155137062073, "loss_mode_switch": 0.0, "loss_total": 0.08042874187231064, "step": 1506 }, { "batch_size": 1, "epoch": 0.6024, "step": 1506, "tokens_per_device": 5023 }, { "epoch": 0.6024, "loss_ce": 0.044713523238897324, "loss_lvr": 0.39898017048835754, "loss_mode_switch": 0.0, "loss_total": 0.08461154252290726, "step": 1506 }, { "batch_size": 4, "epoch": 0.6024, "step": 1506, "tokens_per_device": 3312 }, { "epoch": 0.6024, "loss_ce": 0.013405944220721722, "loss_lvr": 0.8825321197509766, "loss_mode_switch": 0.0, "loss_total": 0.10165915638208389, "step": 1506 }, { "batch_size": 4, "epoch": 0.6024, "step": 1506, "tokens_per_device": 5144 }, { "epoch": 0.6024, "loss_ce": 0.20896844565868378, "loss_lvr": 0.7036367654800415, "loss_mode_switch": 0.0, "loss_total": 0.27933213114738464, "step": 1506 }, { "epoch": 0.6028, "grad_norm": 2.0578441619873047, "learning_rate": 3.5972709115650594e-06, "loss": 0.2693, "step": 1507 }, { "batch_size": 4, "epoch": 0.6028, "step": 1507, "tokens_per_device": 3324 }, { "epoch": 0.6028, "loss_ce": 0.42863088846206665, "loss_lvr": 0.6508378386497498, "loss_mode_switch": 0.0, "loss_total": 0.49371469020843506, "step": 1507 }, { "batch_size": 1, "epoch": 0.6028, "step": 1507, "tokens_per_device": 5679 }, { "epoch": 0.6028, "loss_ce": 0.39095956087112427, "loss_lvr": 0.8805078864097595, "loss_mode_switch": 0.0, "loss_total": 0.47901034355163574, "step": 1507 }, { "batch_size": 4, "epoch": 0.6028, "step": 1507, "tokens_per_device": 7560 }, { "epoch": 0.6028, "loss_ce": 0.3435884118080139, "loss_lvr": 0.5529142618179321, "loss_mode_switch": 0.0, "loss_total": 0.3988798260688782, "step": 1507 }, { "batch_size": 4, "epoch": 0.6028, "step": 1507, "tokens_per_device": 3844 }, { "epoch": 0.6028, "loss_ce": 0.591129720211029, "loss_lvr": 1.0957471132278442, "loss_mode_switch": 0.0, "loss_total": 0.7007044553756714, "step": 1507 }, { "batch_size": 1, "epoch": 0.6028, "step": 1507, "tokens_per_device": 5069 }, { "epoch": 0.6028, "loss_ce": 0.12890031933784485, "loss_lvr": 0.39750203490257263, "loss_mode_switch": 0.0, "loss_total": 0.1686505228281021, "step": 1507 }, { "batch_size": 4, "epoch": 0.6028, "step": 1507, "tokens_per_device": 4716 }, { "epoch": 0.6028, "loss_ce": 0.09035510569810867, "loss_lvr": 1.1388638019561768, "loss_mode_switch": 0.0, "loss_total": 0.20424148440361023, "step": 1507 }, { "batch_size": 1, "epoch": 0.6028, "step": 1507, "tokens_per_device": 4956 }, { "epoch": 0.6028, "loss_ce": 0.09042271971702576, "loss_lvr": 0.7307606935501099, "loss_mode_switch": 0.0, "loss_total": 0.16349878907203674, "step": 1507 }, { "batch_size": 4, "epoch": 0.6028, "step": 1507, "tokens_per_device": 5096 }, { "epoch": 0.6028, "loss_ce": 0.16136027872562408, "loss_lvr": 0.5400463938713074, "loss_mode_switch": 0.0, "loss_total": 0.21536491811275482, "step": 1507 }, { "epoch": 0.6032, "grad_norm": 1.2976264953613281, "learning_rate": 3.5910547125111783e-06, "loss": 0.2823, "step": 1508 }, { "batch_size": 1, "epoch": 0.6032, "step": 1508, "tokens_per_device": 5115 }, { "epoch": 0.6032, "loss_ce": 0.006623685359954834, "loss_lvr": 0.2643359303474426, "loss_mode_switch": 0.0, "loss_total": 0.033057279884815216, "step": 1508 }, { "batch_size": 1, "epoch": 0.6032, "step": 1508, "tokens_per_device": 4342 }, { "epoch": 0.6032, "loss_ce": 0.2831829786300659, "loss_lvr": 0.5448533892631531, "loss_mode_switch": 0.0, "loss_total": 0.3376683294773102, "step": 1508 }, { "batch_size": 4, "epoch": 0.6032, "step": 1508, "tokens_per_device": 3424 }, { "epoch": 0.6032, "loss_ce": 0.7384682297706604, "loss_lvr": 1.2260602712631226, "loss_mode_switch": 0.0, "loss_total": 0.8610742688179016, "step": 1508 }, { "batch_size": 1, "epoch": 0.6032, "step": 1508, "tokens_per_device": 4942 }, { "epoch": 0.6032, "loss_ce": 0.3012375831604004, "loss_lvr": 0.4340682625770569, "loss_mode_switch": 0.0, "loss_total": 0.3446443974971771, "step": 1508 }, { "batch_size": 4, "epoch": 0.6032, "step": 1508, "tokens_per_device": 4196 }, { "epoch": 0.6032, "loss_ce": 0.0027377218939363956, "loss_lvr": 0.7457379698753357, "loss_mode_switch": 0.0, "loss_total": 0.07731152325868607, "step": 1508 }, { "batch_size": 4, "epoch": 0.6032, "step": 1508, "tokens_per_device": 1336 }, { "epoch": 0.6032, "loss_ce": 0.26712146401405334, "loss_lvr": 0.9890369772911072, "loss_mode_switch": 0.0, "loss_total": 0.3660251498222351, "step": 1508 }, { "batch_size": 1, "epoch": 0.6032, "step": 1508, "tokens_per_device": 4872 }, { "epoch": 0.6032, "loss_ce": 0.0005678891902789474, "loss_lvr": 0.3579752743244171, "loss_mode_switch": 0.0, "loss_total": 0.03636541962623596, "step": 1508 }, { "batch_size": 4, "epoch": 0.6032, "step": 1508, "tokens_per_device": 4516 }, { "epoch": 0.6032, "loss_ce": 0.6132301092147827, "loss_lvr": 0.7877291440963745, "loss_mode_switch": 0.0, "loss_total": 0.6920030117034912, "step": 1508 }, { "epoch": 0.6036, "grad_norm": 1.330979347229004, "learning_rate": 3.5848408781261323e-06, "loss": 0.3008, "step": 1509 }, { "batch_size": 4, "epoch": 0.6036, "step": 1509, "tokens_per_device": 1384 }, { "epoch": 0.6036, "loss_ce": 0.5227286219596863, "loss_lvr": 0.9796916842460632, "loss_mode_switch": 0.0, "loss_total": 0.6206977963447571, "step": 1509 }, { "batch_size": 4, "epoch": 0.6036, "step": 1509, "tokens_per_device": 4344 }, { "epoch": 0.6036, "loss_ce": 0.23334325850009918, "loss_lvr": 0.9743966460227966, "loss_mode_switch": 0.0, "loss_total": 0.3307829201221466, "step": 1509 }, { "batch_size": 4, "epoch": 0.6036, "step": 1509, "tokens_per_device": 4220 }, { "epoch": 0.6036, "loss_ce": 0.07575033605098724, "loss_lvr": 0.6368275880813599, "loss_mode_switch": 0.0, "loss_total": 0.1394330859184265, "step": 1509 }, { "batch_size": 4, "epoch": 0.6036, "step": 1509, "tokens_per_device": 4244 }, { "epoch": 0.6036, "loss_ce": 0.1164589449763298, "loss_lvr": 0.8143945336341858, "loss_mode_switch": 0.0, "loss_total": 0.19789840281009674, "step": 1509 }, { "batch_size": 1, "epoch": 0.6036, "step": 1509, "tokens_per_device": 4866 }, { "epoch": 0.6036, "loss_ce": 0.018055399879813194, "loss_lvr": 0.21367377042770386, "loss_mode_switch": 0.0, "loss_total": 0.03942277655005455, "step": 1509 }, { "batch_size": 4, "epoch": 0.6036, "step": 1509, "tokens_per_device": 4356 }, { "epoch": 0.6036, "loss_ce": 0.14263083040714264, "loss_lvr": 0.732881486415863, "loss_mode_switch": 0.0, "loss_total": 0.21591898798942566, "step": 1509 }, { "batch_size": 4, "epoch": 0.6036, "step": 1509, "tokens_per_device": 5124 }, { "epoch": 0.6036, "loss_ce": 0.08701205998659134, "loss_lvr": 0.6868340969085693, "loss_mode_switch": 0.0, "loss_total": 0.15569546818733215, "step": 1509 }, { "batch_size": 4, "epoch": 0.6036, "step": 1509, "tokens_per_device": 4004 }, { "epoch": 0.6036, "loss_ce": 0.3796551823616028, "loss_lvr": 0.8260543346405029, "loss_mode_switch": 0.0, "loss_total": 0.4622606039047241, "step": 1509 }, { "epoch": 0.604, "grad_norm": 1.33846116065979, "learning_rate": 3.578629418838757e-06, "loss": 0.3224, "step": 1510 }, { "batch_size": 4, "epoch": 0.604, "step": 1510, "tokens_per_device": 1584 }, { "epoch": 0.604, "loss_ce": 0.476529598236084, "loss_lvr": 0.9200518131256104, "loss_mode_switch": 0.0, "loss_total": 0.568534791469574, "step": 1510 }, { "batch_size": 4, "epoch": 0.604, "step": 1510, "tokens_per_device": 4680 }, { "epoch": 0.604, "loss_ce": 0.013576842844486237, "loss_lvr": 0.9581255912780762, "loss_mode_switch": 0.0, "loss_total": 0.10938940197229385, "step": 1510 }, { "batch_size": 4, "epoch": 0.604, "step": 1510, "tokens_per_device": 4896 }, { "epoch": 0.604, "loss_ce": 0.3516591191291809, "loss_lvr": 0.7493743896484375, "loss_mode_switch": 0.0, "loss_total": 0.4265965521335602, "step": 1510 }, { "batch_size": 4, "epoch": 0.604, "step": 1510, "tokens_per_device": 9004 }, { "epoch": 0.604, "loss_ce": 0.1924983263015747, "loss_lvr": 0.3886537253856659, "loss_mode_switch": 0.0, "loss_total": 0.2313636988401413, "step": 1510 }, { "batch_size": 4, "epoch": 0.604, "step": 1510, "tokens_per_device": 3636 }, { "epoch": 0.604, "loss_ce": 0.5915638208389282, "loss_lvr": 1.0946398973464966, "loss_mode_switch": 0.0, "loss_total": 0.7010278105735779, "step": 1510 }, { "batch_size": 4, "epoch": 0.604, "step": 1510, "tokens_per_device": 4524 }, { "epoch": 0.604, "loss_ce": 0.10586488246917725, "loss_lvr": 1.1567052602767944, "loss_mode_switch": 0.0, "loss_total": 0.22153541445732117, "step": 1510 }, { "batch_size": 4, "epoch": 0.604, "step": 1510, "tokens_per_device": 3892 }, { "epoch": 0.604, "loss_ce": 0.580639123916626, "loss_lvr": 0.9288615584373474, "loss_mode_switch": 0.0, "loss_total": 0.6735252737998962, "step": 1510 }, { "batch_size": 1, "epoch": 0.604, "step": 1510, "tokens_per_device": 4875 }, { "epoch": 0.604, "loss_ce": 0.0009760205866768956, "loss_lvr": 0.4621800184249878, "loss_mode_switch": 0.0, "loss_total": 0.04719402641057968, "step": 1510 }, { "epoch": 0.6044, "grad_norm": 1.3572790622711182, "learning_rate": 3.572420345073906e-06, "loss": 0.3181, "step": 1511 }, { "batch_size": 1, "epoch": 0.6044, "step": 1511, "tokens_per_device": 4889 }, { "epoch": 0.6044, "loss_ce": 0.004966366104781628, "loss_lvr": 0.5149780511856079, "loss_mode_switch": 0.0, "loss_total": 0.05646417289972305, "step": 1511 }, { "batch_size": 4, "epoch": 0.6044, "step": 1511, "tokens_per_device": 2700 }, { "epoch": 0.6044, "loss_ce": 0.238942950963974, "loss_lvr": 1.4446401596069336, "loss_mode_switch": 0.0, "loss_total": 0.38340696692466736, "step": 1511 }, { "batch_size": 1, "epoch": 0.6044, "step": 1511, "tokens_per_device": 5147 }, { "epoch": 0.6044, "loss_ce": 0.11261869966983795, "loss_lvr": 0.37837526202201843, "loss_mode_switch": 0.0, "loss_total": 0.15045621991157532, "step": 1511 }, { "batch_size": 1, "epoch": 0.6044, "step": 1511, "tokens_per_device": 4611 }, { "epoch": 0.6044, "loss_ce": 0.029739970341324806, "loss_lvr": 0.49480774998664856, "loss_mode_switch": 0.0, "loss_total": 0.07922074943780899, "step": 1511 }, { "batch_size": 4, "epoch": 0.6044, "step": 1511, "tokens_per_device": 3752 }, { "epoch": 0.6044, "loss_ce": 0.227937713265419, "loss_lvr": 0.9934414625167847, "loss_mode_switch": 0.0, "loss_total": 0.3272818624973297, "step": 1511 }, { "batch_size": 1, "epoch": 0.6044, "step": 1511, "tokens_per_device": 5247 }, { "epoch": 0.6044, "loss_ce": 0.09290081262588501, "loss_lvr": 0.5705223679542542, "loss_mode_switch": 0.0, "loss_total": 0.14995305240154266, "step": 1511 }, { "batch_size": 4, "epoch": 0.6044, "step": 1511, "tokens_per_device": 3804 }, { "epoch": 0.6044, "loss_ce": 0.5068044662475586, "loss_lvr": 1.0190621614456177, "loss_mode_switch": 0.0, "loss_total": 0.6087107062339783, "step": 1511 }, { "batch_size": 1, "epoch": 0.6044, "step": 1511, "tokens_per_device": 4897 }, { "epoch": 0.6044, "loss_ce": 0.10122965276241302, "loss_lvr": 0.36263254284858704, "loss_mode_switch": 0.0, "loss_total": 0.13749291002750397, "step": 1511 }, { "epoch": 0.6048, "grad_norm": 1.4541326761245728, "learning_rate": 3.566213667252423e-06, "loss": 0.3291, "step": 1512 }, { "batch_size": 4, "epoch": 0.6048, "step": 1512, "tokens_per_device": 4192 }, { "epoch": 0.6048, "loss_ce": 0.07832421362400055, "loss_lvr": 1.0738332271575928, "loss_mode_switch": 0.0, "loss_total": 0.18570753931999207, "step": 1512 }, { "batch_size": 4, "epoch": 0.6048, "step": 1512, "tokens_per_device": 7472 }, { "epoch": 0.6048, "loss_ce": 0.013718850910663605, "loss_lvr": 0.5562888383865356, "loss_mode_switch": 0.0, "loss_total": 0.06934773921966553, "step": 1512 }, { "batch_size": 4, "epoch": 0.6048, "step": 1512, "tokens_per_device": 6588 }, { "epoch": 0.6048, "loss_ce": 0.040035318583250046, "loss_lvr": 0.7240997552871704, "loss_mode_switch": 0.0, "loss_total": 0.11244529485702515, "step": 1512 }, { "batch_size": 1, "epoch": 0.6048, "step": 1512, "tokens_per_device": 7040 }, { "epoch": 0.6048, "loss_ce": 0.16719384491443634, "loss_lvr": 0.3247537910938263, "loss_mode_switch": 0.0, "loss_total": 0.1996692270040512, "step": 1512 }, { "batch_size": 1, "epoch": 0.6048, "step": 1512, "tokens_per_device": 5114 }, { "epoch": 0.6048, "loss_ce": 0.009199178777635098, "loss_lvr": 0.5658822059631348, "loss_mode_switch": 0.0, "loss_total": 0.06578739732503891, "step": 1512 }, { "batch_size": 4, "epoch": 0.6048, "step": 1512, "tokens_per_device": 7872 }, { "epoch": 0.6048, "loss_ce": 0.05071113258600235, "loss_lvr": 2.679018974304199, "loss_mode_switch": 0.0, "loss_total": 0.3186130225658417, "step": 1512 }, { "batch_size": 1, "epoch": 0.6048, "step": 1512, "tokens_per_device": 5414 }, { "epoch": 0.6048, "loss_ce": 0.07621128857135773, "loss_lvr": 0.32045379281044006, "loss_mode_switch": 0.0, "loss_total": 0.10825666785240173, "step": 1512 }, { "batch_size": 1, "epoch": 0.6048, "step": 1512, "tokens_per_device": 5105 }, { "epoch": 0.6048, "loss_ce": 0.014683336019515991, "loss_lvr": 0.36881914734840393, "loss_mode_switch": 0.0, "loss_total": 0.051565252244472504, "step": 1512 }, { "epoch": 0.6052, "grad_norm": 1.2472554445266724, "learning_rate": 3.560009395791133e-06, "loss": 0.2588, "step": 1513 }, { "batch_size": 1, "epoch": 0.6052, "step": 1513, "tokens_per_device": 5171 }, { "epoch": 0.6052, "loss_ce": 0.011905370280146599, "loss_lvr": 0.36504364013671875, "loss_mode_switch": 0.0, "loss_total": 0.04840973764657974, "step": 1513 }, { "batch_size": 1, "epoch": 0.6052, "step": 1513, "tokens_per_device": 5166 }, { "epoch": 0.6052, "loss_ce": 0.2341848760843277, "loss_lvr": 0.25271105766296387, "loss_mode_switch": 0.0, "loss_total": 0.25945597887039185, "step": 1513 }, { "batch_size": 4, "epoch": 0.6052, "step": 1513, "tokens_per_device": 5768 }, { "epoch": 0.6052, "loss_ce": 0.3774610161781311, "loss_lvr": 0.7578730583190918, "loss_mode_switch": 0.0, "loss_total": 0.4532483220100403, "step": 1513 }, { "batch_size": 4, "epoch": 0.6052, "step": 1513, "tokens_per_device": 1216 }, { "epoch": 0.6052, "loss_ce": 0.19765086472034454, "loss_lvr": 1.0609537363052368, "loss_mode_switch": 0.0, "loss_total": 0.30374622344970703, "step": 1513 }, { "batch_size": 1, "epoch": 0.6052, "step": 1513, "tokens_per_device": 5108 }, { "epoch": 0.6052, "loss_ce": 0.0005157290725037456, "loss_lvr": 0.4829154312610626, "loss_mode_switch": 0.0, "loss_total": 0.04880727455019951, "step": 1513 }, { "batch_size": 1, "epoch": 0.6052, "step": 1513, "tokens_per_device": 4856 }, { "epoch": 0.6052, "loss_ce": 0.008859588764607906, "loss_lvr": 0.5586819052696228, "loss_mode_switch": 0.0, "loss_total": 0.0647277757525444, "step": 1513 }, { "batch_size": 4, "epoch": 0.6052, "step": 1513, "tokens_per_device": 5160 }, { "epoch": 0.6052, "loss_ce": 0.13998307287693024, "loss_lvr": 0.9362894296646118, "loss_mode_switch": 0.0, "loss_total": 0.23361201584339142, "step": 1513 }, { "batch_size": 4, "epoch": 0.6052, "step": 1513, "tokens_per_device": 5644 }, { "epoch": 0.6052, "loss_ce": 0.49912866950035095, "loss_lvr": 0.9014659523963928, "loss_mode_switch": 0.0, "loss_total": 0.5892752408981323, "step": 1513 }, { "epoch": 0.6056, "grad_norm": 1.3448634147644043, "learning_rate": 3.553807541102827e-06, "loss": 0.2367, "step": 1514 }, { "batch_size": 4, "epoch": 0.6056, "step": 1514, "tokens_per_device": 6924 }, { "epoch": 0.6056, "loss_ce": 0.46776658296585083, "loss_lvr": 0.7540104389190674, "loss_mode_switch": 0.0, "loss_total": 0.5431676506996155, "step": 1514 }, { "batch_size": 4, "epoch": 0.6056, "step": 1514, "tokens_per_device": 13324 }, { "epoch": 0.6056, "loss_ce": 0.3098956346511841, "loss_lvr": 0.5245023369789124, "loss_mode_switch": 0.0, "loss_total": 0.3623458743095398, "step": 1514 }, { "batch_size": 4, "epoch": 0.6056, "step": 1514, "tokens_per_device": 8900 }, { "epoch": 0.6056, "loss_ce": 0.11647029966115952, "loss_lvr": 0.7457001805305481, "loss_mode_switch": 0.0, "loss_total": 0.19104032218456268, "step": 1514 }, { "batch_size": 1, "epoch": 0.6056, "step": 1514, "tokens_per_device": 5106 }, { "epoch": 0.6056, "loss_ce": 0.08870254456996918, "loss_lvr": 0.48110711574554443, "loss_mode_switch": 0.0, "loss_total": 0.13681325316429138, "step": 1514 }, { "batch_size": 4, "epoch": 0.6056, "step": 1514, "tokens_per_device": 4476 }, { "epoch": 0.6056, "loss_ce": 0.4489096403121948, "loss_lvr": 0.8723521828651428, "loss_mode_switch": 0.0, "loss_total": 0.5361448526382446, "step": 1514 }, { "batch_size": 4, "epoch": 0.6056, "step": 1514, "tokens_per_device": 5016 }, { "epoch": 0.6056, "loss_ce": 0.08405063301324844, "loss_lvr": 0.9892982244491577, "loss_mode_switch": 0.0, "loss_total": 0.18298044800758362, "step": 1514 }, { "batch_size": 4, "epoch": 0.6056, "step": 1514, "tokens_per_device": 4428 }, { "epoch": 0.6056, "loss_ce": 0.6634641289710999, "loss_lvr": 0.7224368453025818, "loss_mode_switch": 0.0, "loss_total": 0.7357078194618225, "step": 1514 }, { "batch_size": 1, "epoch": 0.6056, "step": 1514, "tokens_per_device": 4865 }, { "epoch": 0.6056, "loss_ce": 0.002994461450725794, "loss_lvr": 0.8054670691490173, "loss_mode_switch": 0.0, "loss_total": 0.08354116976261139, "step": 1514 }, { "epoch": 0.606, "grad_norm": 1.175689458847046, "learning_rate": 3.5476081135962335e-06, "loss": 0.2747, "step": 1515 }, { "batch_size": 1, "epoch": 0.606, "step": 1515, "tokens_per_device": 4867 }, { "epoch": 0.606, "loss_ce": 0.03420216217637062, "loss_lvr": 0.482339471578598, "loss_mode_switch": 0.0, "loss_total": 0.08243611454963684, "step": 1515 }, { "batch_size": 4, "epoch": 0.606, "step": 1515, "tokens_per_device": 3816 }, { "epoch": 0.606, "loss_ce": 0.2447669357061386, "loss_lvr": 1.1852779388427734, "loss_mode_switch": 0.0, "loss_total": 0.36329472064971924, "step": 1515 }, { "batch_size": 4, "epoch": 0.606, "step": 1515, "tokens_per_device": 4616 }, { "epoch": 0.606, "loss_ce": 0.10483518242835999, "loss_lvr": 0.7670484781265259, "loss_mode_switch": 0.0, "loss_total": 0.18154004216194153, "step": 1515 }, { "batch_size": 4, "epoch": 0.606, "step": 1515, "tokens_per_device": 2668 }, { "epoch": 0.606, "loss_ce": 0.06411788612604141, "loss_lvr": 2.2549450397491455, "loss_mode_switch": 0.0, "loss_total": 0.28961238265037537, "step": 1515 }, { "batch_size": 4, "epoch": 0.606, "step": 1515, "tokens_per_device": 2712 }, { "epoch": 0.606, "loss_ce": 0.5463653802871704, "loss_lvr": 0.7432142496109009, "loss_mode_switch": 0.0, "loss_total": 0.6206868290901184, "step": 1515 }, { "batch_size": 1, "epoch": 0.606, "step": 1515, "tokens_per_device": 5115 }, { "epoch": 0.606, "loss_ce": 0.003273996524512768, "loss_lvr": 0.20638762414455414, "loss_mode_switch": 0.0, "loss_total": 0.023912757635116577, "step": 1515 }, { "batch_size": 4, "epoch": 0.606, "step": 1515, "tokens_per_device": 4660 }, { "epoch": 0.606, "loss_ce": 0.12196129560470581, "loss_lvr": 0.9413197040557861, "loss_mode_switch": 0.0, "loss_total": 0.2160932719707489, "step": 1515 }, { "batch_size": 4, "epoch": 0.606, "step": 1515, "tokens_per_device": 3904 }, { "epoch": 0.606, "loss_ce": 0.1026197150349617, "loss_lvr": 0.49847474694252014, "loss_mode_switch": 0.0, "loss_total": 0.15246719121932983, "step": 1515 }, { "epoch": 0.6064, "grad_norm": 1.19524085521698, "learning_rate": 3.541411123676012e-06, "loss": 0.2912, "step": 1516 }, { "batch_size": 4, "epoch": 0.6064, "step": 1516, "tokens_per_device": 4464 }, { "epoch": 0.6064, "loss_ce": 0.03403545543551445, "loss_lvr": 0.6256418228149414, "loss_mode_switch": 0.0, "loss_total": 0.09659963846206665, "step": 1516 }, { "batch_size": 4, "epoch": 0.6064, "step": 1516, "tokens_per_device": 4200 }, { "epoch": 0.6064, "loss_ce": 0.8418688774108887, "loss_lvr": 0.9113198518753052, "loss_mode_switch": 0.0, "loss_total": 0.9330008625984192, "step": 1516 }, { "batch_size": 4, "epoch": 0.6064, "step": 1516, "tokens_per_device": 6272 }, { "epoch": 0.6064, "loss_ce": 0.07861335575580597, "loss_lvr": 0.7094653248786926, "loss_mode_switch": 0.0, "loss_total": 0.149559885263443, "step": 1516 }, { "batch_size": 4, "epoch": 0.6064, "step": 1516, "tokens_per_device": 2712 }, { "epoch": 0.6064, "loss_ce": 0.22729229927062988, "loss_lvr": 0.7488647699356079, "loss_mode_switch": 0.0, "loss_total": 0.3021787703037262, "step": 1516 }, { "batch_size": 1, "epoch": 0.6064, "step": 1516, "tokens_per_device": 4885 }, { "epoch": 0.6064, "loss_ce": 0.0019495915621519089, "loss_lvr": 0.22782522439956665, "loss_mode_switch": 0.0, "loss_total": 0.024732114747166634, "step": 1516 }, { "batch_size": 4, "epoch": 0.6064, "step": 1516, "tokens_per_device": 10112 }, { "epoch": 0.6064, "loss_ce": 0.7170962691307068, "loss_lvr": 0.710361659526825, "loss_mode_switch": 0.0, "loss_total": 0.7881324291229248, "step": 1516 }, { "batch_size": 4, "epoch": 0.6064, "step": 1516, "tokens_per_device": 4248 }, { "epoch": 0.6064, "loss_ce": 0.4104226529598236, "loss_lvr": 0.7570292949676514, "loss_mode_switch": 0.0, "loss_total": 0.4861255884170532, "step": 1516 }, { "batch_size": 1, "epoch": 0.6064, "step": 1516, "tokens_per_device": 5147 }, { "epoch": 0.6064, "loss_ce": 0.013043319806456566, "loss_lvr": 0.5575937628746033, "loss_mode_switch": 0.0, "loss_total": 0.06880269944667816, "step": 1516 }, { "epoch": 0.6068, "grad_norm": 1.1325188875198364, "learning_rate": 3.5352165817427255e-06, "loss": 0.2944, "step": 1517 }, { "batch_size": 1, "epoch": 0.6068, "step": 1517, "tokens_per_device": 4622 }, { "epoch": 0.6068, "loss_ce": 0.09696320444345474, "loss_lvr": 0.4660258889198303, "loss_mode_switch": 0.0, "loss_total": 0.14356578886508942, "step": 1517 }, { "batch_size": 4, "epoch": 0.6068, "step": 1517, "tokens_per_device": 4204 }, { "epoch": 0.6068, "loss_ce": 0.06647939234972, "loss_lvr": 0.8601078391075134, "loss_mode_switch": 0.0, "loss_total": 0.15249016880989075, "step": 1517 }, { "batch_size": 1, "epoch": 0.6068, "step": 1517, "tokens_per_device": 5090 }, { "epoch": 0.6068, "loss_ce": 0.14470212161540985, "loss_lvr": 0.46185436844825745, "loss_mode_switch": 0.0, "loss_total": 0.19088755548000336, "step": 1517 }, { "batch_size": 4, "epoch": 0.6068, "step": 1517, "tokens_per_device": 4440 }, { "epoch": 0.6068, "loss_ce": 0.02998301200568676, "loss_lvr": 0.8484938144683838, "loss_mode_switch": 0.0, "loss_total": 0.11483239382505417, "step": 1517 }, { "batch_size": 4, "epoch": 0.6068, "step": 1517, "tokens_per_device": 6264 }, { "epoch": 0.6068, "loss_ce": 0.4350690245628357, "loss_lvr": 0.6640638113021851, "loss_mode_switch": 0.0, "loss_total": 0.5014753937721252, "step": 1517 }, { "batch_size": 1, "epoch": 0.6068, "step": 1517, "tokens_per_device": 5122 }, { "epoch": 0.6068, "loss_ce": 0.1164088249206543, "loss_lvr": 0.3069072365760803, "loss_mode_switch": 0.0, "loss_total": 0.1470995545387268, "step": 1517 }, { "batch_size": 4, "epoch": 0.6068, "step": 1517, "tokens_per_device": 3988 }, { "epoch": 0.6068, "loss_ce": 0.07916399836540222, "loss_lvr": 1.1433507204055786, "loss_mode_switch": 0.0, "loss_total": 0.19349907338619232, "step": 1517 }, { "batch_size": 4, "epoch": 0.6068, "step": 1517, "tokens_per_device": 3204 }, { "epoch": 0.6068, "loss_ce": 0.2945365309715271, "loss_lvr": 0.8968592286109924, "loss_mode_switch": 0.0, "loss_total": 0.38422244787216187, "step": 1517 }, { "epoch": 0.6072, "grad_norm": 1.2794506549835205, "learning_rate": 3.5290244981928323e-06, "loss": 0.2809, "step": 1518 }, { "batch_size": 4, "epoch": 0.6072, "step": 1518, "tokens_per_device": 4212 }, { "epoch": 0.6072, "loss_ce": 0.44694918394088745, "loss_lvr": 0.7499451637268066, "loss_mode_switch": 0.0, "loss_total": 0.5219436883926392, "step": 1518 }, { "batch_size": 1, "epoch": 0.6072, "step": 1518, "tokens_per_device": 5118 }, { "epoch": 0.6072, "loss_ce": 0.0014307305682450533, "loss_lvr": 0.6824504733085632, "loss_mode_switch": 0.0, "loss_total": 0.06967577338218689, "step": 1518 }, { "batch_size": 1, "epoch": 0.6072, "step": 1518, "tokens_per_device": 5540 }, { "epoch": 0.6072, "loss_ce": 0.013272648677229881, "loss_lvr": 0.3585435748100281, "loss_mode_switch": 0.0, "loss_total": 0.0491270050406456, "step": 1518 }, { "batch_size": 4, "epoch": 0.6072, "step": 1518, "tokens_per_device": 4396 }, { "epoch": 0.6072, "loss_ce": 0.06575684249401093, "loss_lvr": 0.9353431463241577, "loss_mode_switch": 0.0, "loss_total": 0.15929114818572998, "step": 1518 }, { "batch_size": 4, "epoch": 0.6072, "step": 1518, "tokens_per_device": 4664 }, { "epoch": 0.6072, "loss_ce": 0.06610171496868134, "loss_lvr": 0.7808684706687927, "loss_mode_switch": 0.0, "loss_total": 0.1441885530948639, "step": 1518 }, { "batch_size": 4, "epoch": 0.6072, "step": 1518, "tokens_per_device": 2692 }, { "epoch": 0.6072, "loss_ce": 0.21892409026622772, "loss_lvr": 0.6372796893119812, "loss_mode_switch": 0.0, "loss_total": 0.2826520502567291, "step": 1518 }, { "batch_size": 1, "epoch": 0.6072, "step": 1518, "tokens_per_device": 4872 }, { "epoch": 0.6072, "loss_ce": 0.007536331191658974, "loss_lvr": 0.5239979028701782, "loss_mode_switch": 0.0, "loss_total": 0.059936121106147766, "step": 1518 }, { "batch_size": 4, "epoch": 0.6072, "step": 1518, "tokens_per_device": 3788 }, { "epoch": 0.6072, "loss_ce": 0.28060829639434814, "loss_lvr": 1.0912898778915405, "loss_mode_switch": 0.0, "loss_total": 0.3897372782230377, "step": 1518 }, { "epoch": 0.6076, "grad_norm": 1.2026656866073608, "learning_rate": 3.5228348834186663e-06, "loss": 0.2803, "step": 1519 }, { "batch_size": 4, "epoch": 0.6076, "step": 1519, "tokens_per_device": 10732 }, { "epoch": 0.6076, "loss_ce": 0.22449742257595062, "loss_lvr": 0.9619704484939575, "loss_mode_switch": 0.0, "loss_total": 0.3206944763660431, "step": 1519 }, { "batch_size": 1, "epoch": 0.6076, "step": 1519, "tokens_per_device": 4918 }, { "epoch": 0.6076, "loss_ce": 0.7110627293586731, "loss_lvr": 1.476580262184143, "loss_mode_switch": 0.0, "loss_total": 0.8587207794189453, "step": 1519 }, { "batch_size": 1, "epoch": 0.6076, "step": 1519, "tokens_per_device": 5037 }, { "epoch": 0.6076, "loss_ce": 0.0007601289544254541, "loss_lvr": 0.3825371563434601, "loss_mode_switch": 0.0, "loss_total": 0.03901384770870209, "step": 1519 }, { "batch_size": 1, "epoch": 0.6076, "step": 1519, "tokens_per_device": 4888 }, { "epoch": 0.6076, "loss_ce": 0.005689412355422974, "loss_lvr": 1.0829110145568848, "loss_mode_switch": 0.0, "loss_total": 0.11398051679134369, "step": 1519 }, { "batch_size": 4, "epoch": 0.6076, "step": 1519, "tokens_per_device": 4320 }, { "epoch": 0.6076, "loss_ce": 0.5423384308815002, "loss_lvr": 1.047580599784851, "loss_mode_switch": 0.0, "loss_total": 0.6470965147018433, "step": 1519 }, { "batch_size": 1, "epoch": 0.6076, "step": 1519, "tokens_per_device": 5034 }, { "epoch": 0.6076, "loss_ce": 0.6326265931129456, "loss_lvr": 0.49870410561561584, "loss_mode_switch": 0.0, "loss_total": 0.6824970245361328, "step": 1519 }, { "batch_size": 1, "epoch": 0.6076, "step": 1519, "tokens_per_device": 5156 }, { "epoch": 0.6076, "loss_ce": 0.02158590778708458, "loss_lvr": 0.2752370238304138, "loss_mode_switch": 0.0, "loss_total": 0.04910960793495178, "step": 1519 }, { "batch_size": 4, "epoch": 0.6076, "step": 1519, "tokens_per_device": 4224 }, { "epoch": 0.6076, "loss_ce": 0.079697385430336, "loss_lvr": 1.626712679862976, "loss_mode_switch": 0.0, "loss_total": 0.2423686534166336, "step": 1519 }, { "epoch": 0.608, "grad_norm": 1.3852518796920776, "learning_rate": 3.516647747808417e-06, "loss": 0.2961, "step": 1520 }, { "batch_size": 1, "epoch": 0.608, "step": 1520, "tokens_per_device": 4467 }, { "epoch": 0.608, "loss_ce": 0.12273186445236206, "loss_lvr": 0.5148044228553772, "loss_mode_switch": 0.0, "loss_total": 0.17421230673789978, "step": 1520 }, { "batch_size": 1, "epoch": 0.608, "step": 1520, "tokens_per_device": 5114 }, { "epoch": 0.608, "loss_ce": 0.010125478729605675, "loss_lvr": 0.2993134558200836, "loss_mode_switch": 0.0, "loss_total": 0.040056824684143066, "step": 1520 }, { "batch_size": 4, "epoch": 0.608, "step": 1520, "tokens_per_device": 5732 }, { "epoch": 0.608, "loss_ce": 0.07239092886447906, "loss_lvr": 0.9325435757637024, "loss_mode_switch": 0.0, "loss_total": 0.1656452864408493, "step": 1520 }, { "batch_size": 4, "epoch": 0.608, "step": 1520, "tokens_per_device": 5668 }, { "epoch": 0.608, "loss_ce": 0.14814385771751404, "loss_lvr": 0.8909457921981812, "loss_mode_switch": 0.0, "loss_total": 0.23723843693733215, "step": 1520 }, { "batch_size": 4, "epoch": 0.608, "step": 1520, "tokens_per_device": 4252 }, { "epoch": 0.608, "loss_ce": 0.09278790652751923, "loss_lvr": 0.7475120425224304, "loss_mode_switch": 0.0, "loss_total": 0.16753911972045898, "step": 1520 }, { "batch_size": 4, "epoch": 0.608, "step": 1520, "tokens_per_device": 2596 }, { "epoch": 0.608, "loss_ce": 0.23716701567173004, "loss_lvr": 0.8016307353973389, "loss_mode_switch": 0.0, "loss_total": 0.31733009219169617, "step": 1520 }, { "batch_size": 4, "epoch": 0.608, "step": 1520, "tokens_per_device": 3792 }, { "epoch": 0.608, "loss_ce": 0.2900755703449249, "loss_lvr": 0.9815165400505066, "loss_mode_switch": 0.0, "loss_total": 0.3882272243499756, "step": 1520 }, { "batch_size": 4, "epoch": 0.608, "step": 1520, "tokens_per_device": 2716 }, { "epoch": 0.608, "loss_ce": 0.3316618800163269, "loss_lvr": 0.7382175922393799, "loss_mode_switch": 0.0, "loss_total": 0.4054836332798004, "step": 1520 }, { "epoch": 0.6084, "grad_norm": 1.3703784942626953, "learning_rate": 3.510463101746109e-06, "loss": 0.2836, "step": 1521 }, { "batch_size": 4, "epoch": 0.6084, "step": 1521, "tokens_per_device": 5852 }, { "epoch": 0.6084, "loss_ce": 0.0007731170044280589, "loss_lvr": 0.6897220015525818, "loss_mode_switch": 0.0, "loss_total": 0.06974531710147858, "step": 1521 }, { "batch_size": 4, "epoch": 0.6084, "step": 1521, "tokens_per_device": 3836 }, { "epoch": 0.6084, "loss_ce": 0.5969754457473755, "loss_lvr": 0.8577621579170227, "loss_mode_switch": 0.0, "loss_total": 0.6827516555786133, "step": 1521 }, { "batch_size": 1, "epoch": 0.6084, "step": 1521, "tokens_per_device": 5042 }, { "epoch": 0.6084, "loss_ce": 0.03305499628186226, "loss_lvr": 0.43073779344558716, "loss_mode_switch": 0.0, "loss_total": 0.07612878084182739, "step": 1521 }, { "batch_size": 4, "epoch": 0.6084, "step": 1521, "tokens_per_device": 5720 }, { "epoch": 0.6084, "loss_ce": 0.06303100287914276, "loss_lvr": 0.8819328546524048, "loss_mode_switch": 0.0, "loss_total": 0.151224285364151, "step": 1521 }, { "batch_size": 4, "epoch": 0.6084, "step": 1521, "tokens_per_device": 1520 }, { "epoch": 0.6084, "loss_ce": 0.34859636425971985, "loss_lvr": 0.8864182233810425, "loss_mode_switch": 0.0, "loss_total": 0.4372381865978241, "step": 1521 }, { "batch_size": 4, "epoch": 0.6084, "step": 1521, "tokens_per_device": 1580 }, { "epoch": 0.6084, "loss_ce": 0.39113080501556396, "loss_lvr": 1.1468127965927124, "loss_mode_switch": 0.0, "loss_total": 0.5058121085166931, "step": 1521 }, { "batch_size": 4, "epoch": 0.6084, "step": 1521, "tokens_per_device": 9868 }, { "epoch": 0.6084, "loss_ce": 0.16798889636993408, "loss_lvr": 0.74657142162323, "loss_mode_switch": 0.0, "loss_total": 0.24264603853225708, "step": 1521 }, { "batch_size": 4, "epoch": 0.6084, "step": 1521, "tokens_per_device": 5388 }, { "epoch": 0.6084, "loss_ce": 0.056416600942611694, "loss_lvr": 0.6743985414505005, "loss_mode_switch": 0.0, "loss_total": 0.12385645508766174, "step": 1521 }, { "epoch": 0.6088, "grad_norm": 1.2639237642288208, "learning_rate": 3.504280955611593e-06, "loss": 0.2549, "step": 1522 }, { "batch_size": 4, "epoch": 0.6088, "step": 1522, "tokens_per_device": 3932 }, { "epoch": 0.6088, "loss_ce": 0.07860457897186279, "loss_lvr": 0.5569294691085815, "loss_mode_switch": 0.0, "loss_total": 0.13429751992225647, "step": 1522 }, { "batch_size": 4, "epoch": 0.6088, "step": 1522, "tokens_per_device": 3932 }, { "epoch": 0.6088, "loss_ce": 0.2913995087146759, "loss_lvr": 1.2078583240509033, "loss_mode_switch": 0.0, "loss_total": 0.41218534111976624, "step": 1522 }, { "batch_size": 4, "epoch": 0.6088, "step": 1522, "tokens_per_device": 2704 }, { "epoch": 0.6088, "loss_ce": 0.3429054617881775, "loss_lvr": 0.7822358012199402, "loss_mode_switch": 0.0, "loss_total": 0.421129047870636, "step": 1522 }, { "batch_size": 4, "epoch": 0.6088, "step": 1522, "tokens_per_device": 5028 }, { "epoch": 0.6088, "loss_ce": 0.17669397592544556, "loss_lvr": 0.7723734378814697, "loss_mode_switch": 0.0, "loss_total": 0.25393131375312805, "step": 1522 }, { "batch_size": 4, "epoch": 0.6088, "step": 1522, "tokens_per_device": 1316 }, { "epoch": 0.6088, "loss_ce": 0.5405071973800659, "loss_lvr": 1.17084538936615, "loss_mode_switch": 0.0, "loss_total": 0.6575917601585388, "step": 1522 }, { "batch_size": 1, "epoch": 0.6088, "step": 1522, "tokens_per_device": 4883 }, { "epoch": 0.6088, "loss_ce": 0.05555841326713562, "loss_lvr": 0.5628832578659058, "loss_mode_switch": 0.0, "loss_total": 0.11184674501419067, "step": 1522 }, { "batch_size": 4, "epoch": 0.6088, "step": 1522, "tokens_per_device": 7468 }, { "epoch": 0.6088, "loss_ce": 0.1604621708393097, "loss_lvr": 0.823424220085144, "loss_mode_switch": 0.0, "loss_total": 0.24280458688735962, "step": 1522 }, { "batch_size": 4, "epoch": 0.6088, "step": 1522, "tokens_per_device": 3868 }, { "epoch": 0.6088, "loss_ce": 0.4243107736110687, "loss_lvr": 0.9660246968269348, "loss_mode_switch": 0.0, "loss_total": 0.5209132432937622, "step": 1522 }, { "epoch": 0.6092, "grad_norm": 1.2548660039901733, "learning_rate": 3.4981013197805208e-06, "loss": 0.3098, "step": 1523 }, { "batch_size": 4, "epoch": 0.6092, "step": 1523, "tokens_per_device": 7004 }, { "epoch": 0.6092, "loss_ce": 0.2764960825443268, "loss_lvr": 0.8405207991600037, "loss_mode_switch": 0.0, "loss_total": 0.3605481684207916, "step": 1523 }, { "batch_size": 4, "epoch": 0.6092, "step": 1523, "tokens_per_device": 8976 }, { "epoch": 0.6092, "loss_ce": 0.5416582226753235, "loss_lvr": 0.938464343547821, "loss_mode_switch": 0.0, "loss_total": 0.6355046629905701, "step": 1523 }, { "batch_size": 1, "epoch": 0.6092, "step": 1523, "tokens_per_device": 5650 }, { "epoch": 0.6092, "loss_ce": 0.036170635372400284, "loss_lvr": 0.2664797306060791, "loss_mode_switch": 0.0, "loss_total": 0.06281860917806625, "step": 1523 }, { "batch_size": 1, "epoch": 0.6092, "step": 1523, "tokens_per_device": 5107 }, { "epoch": 0.6092, "loss_ce": 0.03120281733572483, "loss_lvr": 0.14743490517139435, "loss_mode_switch": 0.0, "loss_total": 0.045946307480335236, "step": 1523 }, { "batch_size": 4, "epoch": 0.6092, "step": 1523, "tokens_per_device": 4968 }, { "epoch": 0.6092, "loss_ce": 0.20766420662403107, "loss_lvr": 0.8869426250457764, "loss_mode_switch": 0.0, "loss_total": 0.29635846614837646, "step": 1523 }, { "batch_size": 4, "epoch": 0.6092, "step": 1523, "tokens_per_device": 4412 }, { "epoch": 0.6092, "loss_ce": 0.13102185726165771, "loss_lvr": 1.0591777563095093, "loss_mode_switch": 0.0, "loss_total": 0.23693963885307312, "step": 1523 }, { "batch_size": 4, "epoch": 0.6092, "step": 1523, "tokens_per_device": 5448 }, { "epoch": 0.6092, "loss_ce": 0.010020368732511997, "loss_lvr": 0.8724618554115295, "loss_mode_switch": 0.0, "loss_total": 0.0972665548324585, "step": 1523 }, { "batch_size": 4, "epoch": 0.6092, "step": 1523, "tokens_per_device": 4908 }, { "epoch": 0.6092, "loss_ce": 0.5679031014442444, "loss_lvr": 0.6029250025749207, "loss_mode_switch": 0.0, "loss_total": 0.628195583820343, "step": 1523 }, { "epoch": 0.6096, "grad_norm": 1.1985136270523071, "learning_rate": 3.491924204624336e-06, "loss": 0.3067, "step": 1524 }, { "batch_size": 1, "epoch": 0.6096, "step": 1524, "tokens_per_device": 4857 }, { "epoch": 0.6096, "loss_ce": 0.004422908183187246, "loss_lvr": 0.4104105830192566, "loss_mode_switch": 0.0, "loss_total": 0.045463964343070984, "step": 1524 }, { "batch_size": 4, "epoch": 0.6096, "step": 1524, "tokens_per_device": 4248 }, { "epoch": 0.6096, "loss_ce": 0.257295161485672, "loss_lvr": 0.7085832357406616, "loss_mode_switch": 0.0, "loss_total": 0.32815349102020264, "step": 1524 }, { "batch_size": 1, "epoch": 0.6096, "step": 1524, "tokens_per_device": 4942 }, { "epoch": 0.6096, "loss_ce": 0.0056994897313416, "loss_lvr": 0.31632205843925476, "loss_mode_switch": 0.0, "loss_total": 0.03733169659972191, "step": 1524 }, { "batch_size": 4, "epoch": 0.6096, "step": 1524, "tokens_per_device": 4252 }, { "epoch": 0.6096, "loss_ce": 0.06068864464759827, "loss_lvr": 0.8748337030410767, "loss_mode_switch": 0.0, "loss_total": 0.1481720209121704, "step": 1524 }, { "batch_size": 4, "epoch": 0.6096, "step": 1524, "tokens_per_device": 1660 }, { "epoch": 0.6096, "loss_ce": 0.26880794763565063, "loss_lvr": 1.082148790359497, "loss_mode_switch": 0.0, "loss_total": 0.3770228326320648, "step": 1524 }, { "batch_size": 4, "epoch": 0.6096, "step": 1524, "tokens_per_device": 4228 }, { "epoch": 0.6096, "loss_ce": 0.03495233878493309, "loss_lvr": 0.9331178069114685, "loss_mode_switch": 0.0, "loss_total": 0.12826411426067352, "step": 1524 }, { "batch_size": 1, "epoch": 0.6096, "step": 1524, "tokens_per_device": 5188 }, { "epoch": 0.6096, "loss_ce": 0.1422526240348816, "loss_lvr": 0.5496340990066528, "loss_mode_switch": 0.0, "loss_total": 0.19721603393554688, "step": 1524 }, { "batch_size": 4, "epoch": 0.6096, "step": 1524, "tokens_per_device": 4200 }, { "epoch": 0.6096, "loss_ce": 0.41265517473220825, "loss_lvr": 0.876880943775177, "loss_mode_switch": 0.0, "loss_total": 0.5003432631492615, "step": 1524 }, { "epoch": 0.61, "grad_norm": 1.3221948146820068, "learning_rate": 3.4857496205102475e-06, "loss": 0.3014, "step": 1525 }, { "batch_size": 4, "epoch": 0.61, "step": 1525, "tokens_per_device": 4168 }, { "epoch": 0.61, "loss_ce": 0.16562926769256592, "loss_lvr": 0.8693680763244629, "loss_mode_switch": 0.0, "loss_total": 0.25256606936454773, "step": 1525 }, { "batch_size": 4, "epoch": 0.61, "step": 1525, "tokens_per_device": 2820 }, { "epoch": 0.61, "loss_ce": 0.3209429085254669, "loss_lvr": 0.7820428013801575, "loss_mode_switch": 0.0, "loss_total": 0.3991471827030182, "step": 1525 }, { "batch_size": 4, "epoch": 0.61, "step": 1525, "tokens_per_device": 5536 }, { "epoch": 0.61, "loss_ce": 0.22584229707717896, "loss_lvr": 0.7859688401222229, "loss_mode_switch": 0.0, "loss_total": 0.3044391870498657, "step": 1525 }, { "batch_size": 4, "epoch": 0.61, "step": 1525, "tokens_per_device": 4256 }, { "epoch": 0.61, "loss_ce": 0.19161029160022736, "loss_lvr": 0.7111404538154602, "loss_mode_switch": 0.0, "loss_total": 0.2627243399620056, "step": 1525 }, { "batch_size": 1, "epoch": 0.61, "step": 1525, "tokens_per_device": 5127 }, { "epoch": 0.61, "loss_ce": 0.007926490157842636, "loss_lvr": 0.30898410081863403, "loss_mode_switch": 0.0, "loss_total": 0.0388249009847641, "step": 1525 }, { "batch_size": 4, "epoch": 0.61, "step": 1525, "tokens_per_device": 8944 }, { "epoch": 0.61, "loss_ce": 0.31567785143852234, "loss_lvr": 0.594129204750061, "loss_mode_switch": 0.0, "loss_total": 0.3750907778739929, "step": 1525 }, { "batch_size": 4, "epoch": 0.61, "step": 1525, "tokens_per_device": 4484 }, { "epoch": 0.61, "loss_ce": 0.07491172105073929, "loss_lvr": 0.7661932110786438, "loss_mode_switch": 0.0, "loss_total": 0.15153104066848755, "step": 1525 }, { "batch_size": 1, "epoch": 0.61, "step": 1525, "tokens_per_device": 5163 }, { "epoch": 0.61, "loss_ce": 0.006329506170004606, "loss_lvr": 0.6736005544662476, "loss_mode_switch": 0.0, "loss_total": 0.07368956506252289, "step": 1525 }, { "epoch": 0.6104, "grad_norm": 1.2323564291000366, "learning_rate": 3.4795775778012165e-06, "loss": 0.2935, "step": 1526 }, { "batch_size": 4, "epoch": 0.6104, "step": 1526, "tokens_per_device": 5292 }, { "epoch": 0.6104, "loss_ce": 0.567335844039917, "loss_lvr": 0.7536020278930664, "loss_mode_switch": 0.0, "loss_total": 0.6426960229873657, "step": 1526 }, { "batch_size": 1, "epoch": 0.6104, "step": 1526, "tokens_per_device": 5134 }, { "epoch": 0.6104, "loss_ce": 0.002840586705133319, "loss_lvr": 0.394876092672348, "loss_mode_switch": 0.0, "loss_total": 0.04232819750905037, "step": 1526 }, { "batch_size": 4, "epoch": 0.6104, "step": 1526, "tokens_per_device": 1404 }, { "epoch": 0.6104, "loss_ce": 0.6253883242607117, "loss_lvr": 0.9412440061569214, "loss_mode_switch": 0.0, "loss_total": 0.7195127010345459, "step": 1526 }, { "batch_size": 4, "epoch": 0.6104, "step": 1526, "tokens_per_device": 1372 }, { "epoch": 0.6104, "loss_ce": 0.380891352891922, "loss_lvr": 0.7468382120132446, "loss_mode_switch": 0.0, "loss_total": 0.455575168132782, "step": 1526 }, { "batch_size": 4, "epoch": 0.6104, "step": 1526, "tokens_per_device": 3832 }, { "epoch": 0.6104, "loss_ce": 0.15203578770160675, "loss_lvr": 1.076141357421875, "loss_mode_switch": 0.0, "loss_total": 0.25964993238449097, "step": 1526 }, { "batch_size": 4, "epoch": 0.6104, "step": 1526, "tokens_per_device": 2616 }, { "epoch": 0.6104, "loss_ce": 0.6665442585945129, "loss_lvr": 0.9743349552154541, "loss_mode_switch": 0.0, "loss_total": 0.7639777660369873, "step": 1526 }, { "batch_size": 4, "epoch": 0.6104, "step": 1526, "tokens_per_device": 2660 }, { "epoch": 0.6104, "loss_ce": 0.2209347039461136, "loss_lvr": 0.9064833521842957, "loss_mode_switch": 0.0, "loss_total": 0.3115830421447754, "step": 1526 }, { "batch_size": 4, "epoch": 0.6104, "step": 1526, "tokens_per_device": 8880 }, { "epoch": 0.6104, "loss_ce": 0.3432461619377136, "loss_lvr": 0.8191467523574829, "loss_mode_switch": 0.0, "loss_total": 0.42516082525253296, "step": 1526 }, { "epoch": 0.6108, "grad_norm": 1.5173571109771729, "learning_rate": 3.473408086855939e-06, "loss": 0.3817, "step": 1527 }, { "batch_size": 4, "epoch": 0.6108, "step": 1527, "tokens_per_device": 6272 }, { "epoch": 0.6108, "loss_ce": 0.1645451784133911, "loss_lvr": 0.7763023376464844, "loss_mode_switch": 0.0, "loss_total": 0.2421754151582718, "step": 1527 }, { "batch_size": 1, "epoch": 0.6108, "step": 1527, "tokens_per_device": 5112 }, { "epoch": 0.6108, "loss_ce": 0.0009962900076061487, "loss_lvr": 0.39008358120918274, "loss_mode_switch": 0.0, "loss_total": 0.04000465199351311, "step": 1527 }, { "batch_size": 4, "epoch": 0.6108, "step": 1527, "tokens_per_device": 1696 }, { "epoch": 0.6108, "loss_ce": 0.10615593194961548, "loss_lvr": 0.8640619516372681, "loss_mode_switch": 0.0, "loss_total": 0.19256213307380676, "step": 1527 }, { "batch_size": 4, "epoch": 0.6108, "step": 1527, "tokens_per_device": 4088 }, { "epoch": 0.6108, "loss_ce": 0.6332576870918274, "loss_lvr": 0.639679491519928, "loss_mode_switch": 0.0, "loss_total": 0.6972256302833557, "step": 1527 }, { "batch_size": 1, "epoch": 0.6108, "step": 1527, "tokens_per_device": 5502 }, { "epoch": 0.6108, "loss_ce": 0.2475118190050125, "loss_lvr": 0.5484838485717773, "loss_mode_switch": 0.0, "loss_total": 0.3023602068424225, "step": 1527 }, { "batch_size": 4, "epoch": 0.6108, "step": 1527, "tokens_per_device": 6292 }, { "epoch": 0.6108, "loss_ce": 0.0003363478754181415, "loss_lvr": 0.8280510306358337, "loss_mode_switch": 0.0, "loss_total": 0.08314145356416702, "step": 1527 }, { "batch_size": 4, "epoch": 0.6108, "step": 1527, "tokens_per_device": 1684 }, { "epoch": 0.6108, "loss_ce": 0.04204914718866348, "loss_lvr": 1.0148992538452148, "loss_mode_switch": 0.0, "loss_total": 0.14353907108306885, "step": 1527 }, { "batch_size": 1, "epoch": 0.6108, "step": 1527, "tokens_per_device": 5096 }, { "epoch": 0.6108, "loss_ce": 0.0032938465010374784, "loss_lvr": 0.4296269714832306, "loss_mode_switch": 0.0, "loss_total": 0.04625654220581055, "step": 1527 }, { "epoch": 0.6112, "grad_norm": 1.5670777559280396, "learning_rate": 3.4672411580288313e-06, "loss": 0.3341, "step": 1528 }, { "batch_size": 1, "epoch": 0.6112, "step": 1528, "tokens_per_device": 5061 }, { "epoch": 0.6112, "loss_ce": 0.04406804218888283, "loss_lvr": 0.3228551745414734, "loss_mode_switch": 0.0, "loss_total": 0.07635356485843658, "step": 1528 }, { "batch_size": 4, "epoch": 0.6112, "step": 1528, "tokens_per_device": 4064 }, { "epoch": 0.6112, "loss_ce": 0.19522929191589355, "loss_lvr": 0.9520404934883118, "loss_mode_switch": 0.0, "loss_total": 0.2904333472251892, "step": 1528 }, { "batch_size": 1, "epoch": 0.6112, "step": 1528, "tokens_per_device": 4900 }, { "epoch": 0.6112, "loss_ce": 0.015726853162050247, "loss_lvr": 0.5682152509689331, "loss_mode_switch": 0.0, "loss_total": 0.07254837453365326, "step": 1528 }, { "batch_size": 1, "epoch": 0.6112, "step": 1528, "tokens_per_device": 4924 }, { "epoch": 0.6112, "loss_ce": 0.5776689648628235, "loss_lvr": 0.7264847159385681, "loss_mode_switch": 0.0, "loss_total": 0.6503174304962158, "step": 1528 }, { "batch_size": 1, "epoch": 0.6112, "step": 1528, "tokens_per_device": 5241 }, { "epoch": 0.6112, "loss_ce": 0.06085620075464249, "loss_lvr": 0.3752325773239136, "loss_mode_switch": 0.0, "loss_total": 0.0983794629573822, "step": 1528 }, { "batch_size": 1, "epoch": 0.6112, "step": 1528, "tokens_per_device": 5112 }, { "epoch": 0.6112, "loss_ce": 0.046549923717975616, "loss_lvr": 0.23251356184482574, "loss_mode_switch": 0.0, "loss_total": 0.06980127841234207, "step": 1528 }, { "batch_size": 4, "epoch": 0.6112, "step": 1528, "tokens_per_device": 1552 }, { "epoch": 0.6112, "loss_ce": 0.1847134381532669, "loss_lvr": 1.0057824850082397, "loss_mode_switch": 0.0, "loss_total": 0.2852916717529297, "step": 1528 }, { "batch_size": 4, "epoch": 0.6112, "step": 1528, "tokens_per_device": 6024 }, { "epoch": 0.6112, "loss_ce": 0.20073996484279633, "loss_lvr": 0.9186868667602539, "loss_mode_switch": 0.0, "loss_total": 0.2926086485385895, "step": 1528 }, { "epoch": 0.6116, "grad_norm": 1.5768550634384155, "learning_rate": 3.461076801670008e-06, "loss": 0.3383, "step": 1529 }, { "batch_size": 4, "epoch": 0.6116, "step": 1529, "tokens_per_device": 1560 }, { "epoch": 0.6116, "loss_ce": 0.3944474756717682, "loss_lvr": 1.2047537565231323, "loss_mode_switch": 0.0, "loss_total": 0.5149228572845459, "step": 1529 }, { "batch_size": 4, "epoch": 0.6116, "step": 1529, "tokens_per_device": 10708 }, { "epoch": 0.6116, "loss_ce": 0.6280916929244995, "loss_lvr": 0.7130230069160461, "loss_mode_switch": 0.0, "loss_total": 0.6993939876556396, "step": 1529 }, { "batch_size": 4, "epoch": 0.6116, "step": 1529, "tokens_per_device": 1384 }, { "epoch": 0.6116, "loss_ce": 0.39676007628440857, "loss_lvr": 1.1124681234359741, "loss_mode_switch": 0.0, "loss_total": 0.5080068707466125, "step": 1529 }, { "batch_size": 1, "epoch": 0.6116, "step": 1529, "tokens_per_device": 4890 }, { "epoch": 0.6116, "loss_ce": 0.060757413506507874, "loss_lvr": 0.3202775716781616, "loss_mode_switch": 0.0, "loss_total": 0.09278517216444016, "step": 1529 }, { "batch_size": 4, "epoch": 0.6116, "step": 1529, "tokens_per_device": 4324 }, { "epoch": 0.6116, "loss_ce": 0.17305730283260345, "loss_lvr": 0.9070482850074768, "loss_mode_switch": 0.0, "loss_total": 0.26376211643218994, "step": 1529 }, { "batch_size": 4, "epoch": 0.6116, "step": 1529, "tokens_per_device": 4864 }, { "epoch": 0.6116, "loss_ce": 0.28352561593055725, "loss_lvr": 0.7530691027641296, "loss_mode_switch": 0.0, "loss_total": 0.35883253812789917, "step": 1529 }, { "batch_size": 4, "epoch": 0.6116, "step": 1529, "tokens_per_device": 4632 }, { "epoch": 0.6116, "loss_ce": 0.2211126685142517, "loss_lvr": 0.6776278018951416, "loss_mode_switch": 0.0, "loss_total": 0.2888754606246948, "step": 1529 }, { "batch_size": 4, "epoch": 0.6116, "step": 1529, "tokens_per_device": 4184 }, { "epoch": 0.6116, "loss_ce": 0.35236304998397827, "loss_lvr": 0.8096438050270081, "loss_mode_switch": 0.0, "loss_total": 0.43332743644714355, "step": 1529 }, { "epoch": 0.612, "grad_norm": 1.426160454750061, "learning_rate": 3.4549150281252635e-06, "loss": 0.3337, "step": 1530 }, { "batch_size": 1, "epoch": 0.612, "step": 1530, "tokens_per_device": 5090 }, { "epoch": 0.612, "loss_ce": 0.048300694674253464, "loss_lvr": 0.3291339874267578, "loss_mode_switch": 0.0, "loss_total": 0.08121409267187119, "step": 1530 }, { "batch_size": 4, "epoch": 0.612, "step": 1530, "tokens_per_device": 1828 }, { "epoch": 0.612, "loss_ce": 0.35757115483283997, "loss_lvr": 0.8001264929771423, "loss_mode_switch": 0.0, "loss_total": 0.4375838041305542, "step": 1530 }, { "batch_size": 1, "epoch": 0.612, "step": 1530, "tokens_per_device": 4843 }, { "epoch": 0.612, "loss_ce": 0.8972499370574951, "loss_lvr": 0.5874865651130676, "loss_mode_switch": 0.0, "loss_total": 0.9559985995292664, "step": 1530 }, { "batch_size": 4, "epoch": 0.612, "step": 1530, "tokens_per_device": 3788 }, { "epoch": 0.612, "loss_ce": 0.11948827654123306, "loss_lvr": 1.0655686855316162, "loss_mode_switch": 0.0, "loss_total": 0.2260451465845108, "step": 1530 }, { "batch_size": 4, "epoch": 0.612, "step": 1530, "tokens_per_device": 11132 }, { "epoch": 0.612, "loss_ce": 0.019327884539961815, "loss_lvr": 0.6304035782814026, "loss_mode_switch": 0.0, "loss_total": 0.08236824721097946, "step": 1530 }, { "batch_size": 4, "epoch": 0.612, "step": 1530, "tokens_per_device": 1548 }, { "epoch": 0.612, "loss_ce": 0.6497912406921387, "loss_lvr": 1.2749006748199463, "loss_mode_switch": 0.0, "loss_total": 0.7772812843322754, "step": 1530 }, { "batch_size": 4, "epoch": 0.612, "step": 1530, "tokens_per_device": 5272 }, { "epoch": 0.612, "loss_ce": 0.012103564105927944, "loss_lvr": 0.6902053952217102, "loss_mode_switch": 0.0, "loss_total": 0.08112410455942154, "step": 1530 }, { "batch_size": 4, "epoch": 0.612, "step": 1530, "tokens_per_device": 11712 }, { "epoch": 0.612, "loss_ce": 0.07741706818342209, "loss_lvr": 0.8544380068778992, "loss_mode_switch": 0.0, "loss_total": 0.16286087036132812, "step": 1530 }, { "epoch": 0.6124, "grad_norm": 1.399624228477478, "learning_rate": 3.448755847736062e-06, "loss": 0.3222, "step": 1531 }, { "batch_size": 4, "epoch": 0.6124, "step": 1531, "tokens_per_device": 4544 }, { "epoch": 0.6124, "loss_ce": 0.0014931836631149054, "loss_lvr": 0.8111898899078369, "loss_mode_switch": 0.0, "loss_total": 0.08261217921972275, "step": 1531 }, { "batch_size": 4, "epoch": 0.6124, "step": 1531, "tokens_per_device": 1352 }, { "epoch": 0.6124, "loss_ce": 0.4728523790836334, "loss_lvr": 0.9777727127075195, "loss_mode_switch": 0.0, "loss_total": 0.5706296563148499, "step": 1531 }, { "batch_size": 4, "epoch": 0.6124, "step": 1531, "tokens_per_device": 6672 }, { "epoch": 0.6124, "loss_ce": 0.040596649050712585, "loss_lvr": 0.6430633068084717, "loss_mode_switch": 0.0, "loss_total": 0.10490298271179199, "step": 1531 }, { "batch_size": 4, "epoch": 0.6124, "step": 1531, "tokens_per_device": 5540 }, { "epoch": 0.6124, "loss_ce": 0.0539717860519886, "loss_lvr": 0.7100971937179565, "loss_mode_switch": 0.0, "loss_total": 0.12498150765895844, "step": 1531 }, { "batch_size": 4, "epoch": 0.6124, "step": 1531, "tokens_per_device": 5908 }, { "epoch": 0.6124, "loss_ce": 0.03613878786563873, "loss_lvr": 0.84496009349823, "loss_mode_switch": 0.0, "loss_total": 0.12063480168581009, "step": 1531 }, { "batch_size": 1, "epoch": 0.6124, "step": 1531, "tokens_per_device": 5077 }, { "epoch": 0.6124, "loss_ce": 0.13650086522102356, "loss_lvr": 0.505859375, "loss_mode_switch": 0.0, "loss_total": 0.1870868057012558, "step": 1531 }, { "batch_size": 4, "epoch": 0.6124, "step": 1531, "tokens_per_device": 3372 }, { "epoch": 0.6124, "loss_ce": 0.4384506344795227, "loss_lvr": 1.0234835147857666, "loss_mode_switch": 0.0, "loss_total": 0.5407989621162415, "step": 1531 }, { "batch_size": 4, "epoch": 0.6124, "step": 1531, "tokens_per_device": 3736 }, { "epoch": 0.6124, "loss_ce": 0.22165237367153168, "loss_lvr": 0.9309207201004028, "loss_mode_switch": 0.0, "loss_total": 0.3147444427013397, "step": 1531 }, { "epoch": 0.6128, "grad_norm": 1.3226186037063599, "learning_rate": 3.442599270839508e-06, "loss": 0.2806, "step": 1532 }, { "batch_size": 1, "epoch": 0.6128, "step": 1532, "tokens_per_device": 5666 }, { "epoch": 0.6128, "loss_ce": 0.013302294537425041, "loss_lvr": 0.4450168013572693, "loss_mode_switch": 0.0, "loss_total": 0.05780397355556488, "step": 1532 }, { "batch_size": 4, "epoch": 0.6128, "step": 1532, "tokens_per_device": 1556 }, { "epoch": 0.6128, "loss_ce": 0.5545612573623657, "loss_lvr": 0.9209199547767639, "loss_mode_switch": 0.0, "loss_total": 0.6466532349586487, "step": 1532 }, { "batch_size": 1, "epoch": 0.6128, "step": 1532, "tokens_per_device": 4872 }, { "epoch": 0.6128, "loss_ce": 0.004057739395648241, "loss_lvr": 0.29955366253852844, "loss_mode_switch": 0.0, "loss_total": 0.03401310741901398, "step": 1532 }, { "batch_size": 4, "epoch": 0.6128, "step": 1532, "tokens_per_device": 4284 }, { "epoch": 0.6128, "loss_ce": 0.1442173272371292, "loss_lvr": 0.813564121723175, "loss_mode_switch": 0.0, "loss_total": 0.22557374835014343, "step": 1532 }, { "batch_size": 4, "epoch": 0.6128, "step": 1532, "tokens_per_device": 5000 }, { "epoch": 0.6128, "loss_ce": 0.4688100516796112, "loss_lvr": 0.8442890644073486, "loss_mode_switch": 0.0, "loss_total": 0.5532389879226685, "step": 1532 }, { "batch_size": 1, "epoch": 0.6128, "step": 1532, "tokens_per_device": 4892 }, { "epoch": 0.6128, "loss_ce": 0.14464592933654785, "loss_lvr": 0.315481573343277, "loss_mode_switch": 0.0, "loss_total": 0.17619408667087555, "step": 1532 }, { "batch_size": 4, "epoch": 0.6128, "step": 1532, "tokens_per_device": 4192 }, { "epoch": 0.6128, "loss_ce": 0.05106194689869881, "loss_lvr": 0.9623924493789673, "loss_mode_switch": 0.0, "loss_total": 0.14730119705200195, "step": 1532 }, { "batch_size": 4, "epoch": 0.6128, "step": 1532, "tokens_per_device": 6768 }, { "epoch": 0.6128, "loss_ce": 0.013678464107215405, "loss_lvr": 0.7147984504699707, "loss_mode_switch": 0.0, "loss_total": 0.08515831083059311, "step": 1532 }, { "epoch": 0.6132, "grad_norm": 1.316760778427124, "learning_rate": 3.436445307768347e-06, "loss": 0.2691, "step": 1533 }, { "batch_size": 1, "epoch": 0.6132, "step": 1533, "tokens_per_device": 4891 }, { "epoch": 0.6132, "loss_ce": 0.0011503417044878006, "loss_lvr": 0.7525445222854614, "loss_mode_switch": 0.0, "loss_total": 0.07640479505062103, "step": 1533 }, { "batch_size": 4, "epoch": 0.6132, "step": 1533, "tokens_per_device": 3908 }, { "epoch": 0.6132, "loss_ce": 0.0510442778468132, "loss_lvr": 1.8995720148086548, "loss_mode_switch": 0.0, "loss_total": 0.24100148677825928, "step": 1533 }, { "batch_size": 4, "epoch": 0.6132, "step": 1533, "tokens_per_device": 1292 }, { "epoch": 0.6132, "loss_ce": 0.574243426322937, "loss_lvr": 1.0966410636901855, "loss_mode_switch": 0.0, "loss_total": 0.6839075088500977, "step": 1533 }, { "batch_size": 1, "epoch": 0.6132, "step": 1533, "tokens_per_device": 5183 }, { "epoch": 0.6132, "loss_ce": 0.054100994020700455, "loss_lvr": 0.3030795753002167, "loss_mode_switch": 0.0, "loss_total": 0.0844089537858963, "step": 1533 }, { "batch_size": 4, "epoch": 0.6132, "step": 1533, "tokens_per_device": 3588 }, { "epoch": 0.6132, "loss_ce": 0.08509805798530579, "loss_lvr": 0.9324525594711304, "loss_mode_switch": 0.0, "loss_total": 0.17834332585334778, "step": 1533 }, { "batch_size": 1, "epoch": 0.6132, "step": 1533, "tokens_per_device": 5092 }, { "epoch": 0.6132, "loss_ce": 0.002045237459242344, "loss_lvr": 0.3230898082256317, "loss_mode_switch": 0.0, "loss_total": 0.03435421735048294, "step": 1533 }, { "batch_size": 1, "epoch": 0.6132, "step": 1533, "tokens_per_device": 5184 }, { "epoch": 0.6132, "loss_ce": 0.003548968117684126, "loss_lvr": 0.5551618337631226, "loss_mode_switch": 0.0, "loss_total": 0.05906515195965767, "step": 1533 }, { "batch_size": 4, "epoch": 0.6132, "step": 1533, "tokens_per_device": 4568 }, { "epoch": 0.6132, "loss_ce": 0.20131409168243408, "loss_lvr": 0.7624626755714417, "loss_mode_switch": 0.0, "loss_total": 0.27756035327911377, "step": 1533 }, { "epoch": 0.6136, "grad_norm": 1.052121639251709, "learning_rate": 3.430293968850931e-06, "loss": 0.2243, "step": 1534 }, { "batch_size": 1, "epoch": 0.6136, "step": 1534, "tokens_per_device": 6564 }, { "epoch": 0.6136, "loss_ce": 0.16545091569423676, "loss_lvr": 0.3980843722820282, "loss_mode_switch": 0.0, "loss_total": 0.20525935292243958, "step": 1534 }, { "batch_size": 1, "epoch": 0.6136, "step": 1534, "tokens_per_device": 4966 }, { "epoch": 0.6136, "loss_ce": 0.003350168466567993, "loss_lvr": 0.35012009739875793, "loss_mode_switch": 0.0, "loss_total": 0.038362178951501846, "step": 1534 }, { "batch_size": 4, "epoch": 0.6136, "step": 1534, "tokens_per_device": 10132 }, { "epoch": 0.6136, "loss_ce": 0.31185558438301086, "loss_lvr": 0.9736952781677246, "loss_mode_switch": 0.0, "loss_total": 0.40922510623931885, "step": 1534 }, { "batch_size": 4, "epoch": 0.6136, "step": 1534, "tokens_per_device": 2700 }, { "epoch": 0.6136, "loss_ce": 0.2619084119796753, "loss_lvr": 0.8084497451782227, "loss_mode_switch": 0.0, "loss_total": 0.3427533805370331, "step": 1534 }, { "batch_size": 4, "epoch": 0.6136, "step": 1534, "tokens_per_device": 2548 }, { "epoch": 0.6136, "loss_ce": 0.1555468887090683, "loss_lvr": 0.9446967244148254, "loss_mode_switch": 0.0, "loss_total": 0.25001657009124756, "step": 1534 }, { "batch_size": 1, "epoch": 0.6136, "step": 1534, "tokens_per_device": 4871 }, { "epoch": 0.6136, "loss_ce": 0.12343239039182663, "loss_lvr": 0.294847309589386, "loss_mode_switch": 0.0, "loss_total": 0.15291711688041687, "step": 1534 }, { "batch_size": 4, "epoch": 0.6136, "step": 1534, "tokens_per_device": 1256 }, { "epoch": 0.6136, "loss_ce": 0.0707547515630722, "loss_lvr": 0.9519063234329224, "loss_mode_switch": 0.0, "loss_total": 0.1659453809261322, "step": 1534 }, { "batch_size": 4, "epoch": 0.6136, "step": 1534, "tokens_per_device": 3884 }, { "epoch": 0.6136, "loss_ce": 0.4510183036327362, "loss_lvr": 0.7885643243789673, "loss_mode_switch": 0.0, "loss_total": 0.5298747420310974, "step": 1534 }, { "epoch": 0.614, "grad_norm": 1.4429004192352295, "learning_rate": 3.4241452644112085e-06, "loss": 0.2939, "step": 1535 }, { "batch_size": 4, "epoch": 0.614, "step": 1535, "tokens_per_device": 6040 }, { "epoch": 0.614, "loss_ce": 0.31341588497161865, "loss_lvr": 0.6674187779426575, "loss_mode_switch": 0.0, "loss_total": 0.3801577687263489, "step": 1535 }, { "batch_size": 1, "epoch": 0.614, "step": 1535, "tokens_per_device": 5094 }, { "epoch": 0.614, "loss_ce": 0.04916517436504364, "loss_lvr": 0.5813204050064087, "loss_mode_switch": 0.0, "loss_total": 0.10729721188545227, "step": 1535 }, { "batch_size": 1, "epoch": 0.614, "step": 1535, "tokens_per_device": 4896 }, { "epoch": 0.614, "loss_ce": 0.04094743728637695, "loss_lvr": 0.31522929668426514, "loss_mode_switch": 0.0, "loss_total": 0.07247036695480347, "step": 1535 }, { "batch_size": 4, "epoch": 0.614, "step": 1535, "tokens_per_device": 4228 }, { "epoch": 0.614, "loss_ce": 0.5087000727653503, "loss_lvr": 0.9066678881645203, "loss_mode_switch": 0.0, "loss_total": 0.5993668437004089, "step": 1535 }, { "batch_size": 4, "epoch": 0.614, "step": 1535, "tokens_per_device": 4132 }, { "epoch": 0.614, "loss_ce": 0.3158917725086212, "loss_lvr": 0.979870617389679, "loss_mode_switch": 0.0, "loss_total": 0.41387882828712463, "step": 1535 }, { "batch_size": 4, "epoch": 0.614, "step": 1535, "tokens_per_device": 4152 }, { "epoch": 0.614, "loss_ce": 0.1330992579460144, "loss_lvr": 0.8332599997520447, "loss_mode_switch": 0.0, "loss_total": 0.21642526984214783, "step": 1535 }, { "batch_size": 4, "epoch": 0.614, "step": 1535, "tokens_per_device": 6116 }, { "epoch": 0.614, "loss_ce": 0.26732027530670166, "loss_lvr": 0.7225024104118347, "loss_mode_switch": 0.0, "loss_total": 0.3395705223083496, "step": 1535 }, { "batch_size": 4, "epoch": 0.614, "step": 1535, "tokens_per_device": 10336 }, { "epoch": 0.614, "loss_ce": 0.24623537063598633, "loss_lvr": 0.6478418707847595, "loss_mode_switch": 0.0, "loss_total": 0.31101956963539124, "step": 1535 }, { "epoch": 0.6144, "grad_norm": 1.1518774032592773, "learning_rate": 3.417999204768706e-06, "loss": 0.2714, "step": 1536 }, { "batch_size": 4, "epoch": 0.6144, "step": 1536, "tokens_per_device": 7308 }, { "epoch": 0.6144, "loss_ce": 0.09482844918966293, "loss_lvr": 0.5445517897605896, "loss_mode_switch": 0.0, "loss_total": 0.14928363263607025, "step": 1536 }, { "batch_size": 4, "epoch": 0.6144, "step": 1536, "tokens_per_device": 4036 }, { "epoch": 0.6144, "loss_ce": 0.631464421749115, "loss_lvr": 0.703676700592041, "loss_mode_switch": 0.0, "loss_total": 0.701832115650177, "step": 1536 }, { "batch_size": 1, "epoch": 0.6144, "step": 1536, "tokens_per_device": 4908 }, { "epoch": 0.6144, "loss_ce": 0.013171769678592682, "loss_lvr": 0.710090160369873, "loss_mode_switch": 0.0, "loss_total": 0.0841807872056961, "step": 1536 }, { "batch_size": 4, "epoch": 0.6144, "step": 1536, "tokens_per_device": 4244 }, { "epoch": 0.6144, "loss_ce": 0.08111556619405746, "loss_lvr": 0.7704137563705444, "loss_mode_switch": 0.0, "loss_total": 0.15815694630146027, "step": 1536 }, { "batch_size": 4, "epoch": 0.6144, "step": 1536, "tokens_per_device": 3816 }, { "epoch": 0.6144, "loss_ce": 0.116655133664608, "loss_lvr": 0.5101668834686279, "loss_mode_switch": 0.0, "loss_total": 0.1676718294620514, "step": 1536 }, { "batch_size": 1, "epoch": 0.6144, "step": 1536, "tokens_per_device": 5195 }, { "epoch": 0.6144, "loss_ce": 0.05422671511769295, "loss_lvr": 0.19972236454486847, "loss_mode_switch": 0.0, "loss_total": 0.07419895380735397, "step": 1536 }, { "batch_size": 1, "epoch": 0.6144, "step": 1536, "tokens_per_device": 4872 }, { "epoch": 0.6144, "loss_ce": 0.25801920890808105, "loss_lvr": 0.20367370545864105, "loss_mode_switch": 0.0, "loss_total": 0.27838659286499023, "step": 1536 }, { "batch_size": 4, "epoch": 0.6144, "step": 1536, "tokens_per_device": 8568 }, { "epoch": 0.6144, "loss_ce": 0.03795349597930908, "loss_lvr": 0.8478149175643921, "loss_mode_switch": 0.0, "loss_total": 0.12273498624563217, "step": 1536 }, { "epoch": 0.6148, "grad_norm": 1.303286075592041, "learning_rate": 3.4118558002385127e-06, "loss": 0.2772, "step": 1537 }, { "batch_size": 4, "epoch": 0.6148, "step": 1537, "tokens_per_device": 1904 }, { "epoch": 0.6148, "loss_ce": 0.015440743416547775, "loss_lvr": 0.9294060468673706, "loss_mode_switch": 0.0, "loss_total": 0.10838134586811066, "step": 1537 }, { "batch_size": 1, "epoch": 0.6148, "step": 1537, "tokens_per_device": 5200 }, { "epoch": 0.6148, "loss_ce": 0.02169547602534294, "loss_lvr": 0.4499360918998718, "loss_mode_switch": 0.0, "loss_total": 0.06668908894062042, "step": 1537 }, { "batch_size": 1, "epoch": 0.6148, "step": 1537, "tokens_per_device": 4966 }, { "epoch": 0.6148, "loss_ce": 0.019045116379857063, "loss_lvr": 0.30947741866111755, "loss_mode_switch": 0.0, "loss_total": 0.04999285936355591, "step": 1537 }, { "batch_size": 4, "epoch": 0.6148, "step": 1537, "tokens_per_device": 2632 }, { "epoch": 0.6148, "loss_ce": 0.04303809627890587, "loss_lvr": 0.8769925236701965, "loss_mode_switch": 0.0, "loss_total": 0.13073734939098358, "step": 1537 }, { "batch_size": 1, "epoch": 0.6148, "step": 1537, "tokens_per_device": 5895 }, { "epoch": 0.6148, "loss_ce": 0.002946604276075959, "loss_lvr": 0.4529574513435364, "loss_mode_switch": 0.0, "loss_total": 0.04824234917759895, "step": 1537 }, { "batch_size": 4, "epoch": 0.6148, "step": 1537, "tokens_per_device": 1760 }, { "epoch": 0.6148, "loss_ce": 0.26246050000190735, "loss_lvr": 0.8512223958969116, "loss_mode_switch": 0.0, "loss_total": 0.34758275747299194, "step": 1537 }, { "batch_size": 4, "epoch": 0.6148, "step": 1537, "tokens_per_device": 3952 }, { "epoch": 0.6148, "loss_ce": 0.3379107117652893, "loss_lvr": 0.9754183888435364, "loss_mode_switch": 0.0, "loss_total": 0.43545255064964294, "step": 1537 }, { "batch_size": 4, "epoch": 0.6148, "step": 1537, "tokens_per_device": 4536 }, { "epoch": 0.6148, "loss_ce": 0.052616048604249954, "loss_lvr": 0.7416317462921143, "loss_mode_switch": 0.0, "loss_total": 0.1267792284488678, "step": 1537 }, { "epoch": 0.6152, "grad_norm": 1.5207680463790894, "learning_rate": 3.4057150611312644e-06, "loss": 0.2876, "step": 1538 }, { "batch_size": 4, "epoch": 0.6152, "step": 1538, "tokens_per_device": 1548 }, { "epoch": 0.6152, "loss_ce": 0.27174288034439087, "loss_lvr": 0.8306920528411865, "loss_mode_switch": 0.0, "loss_total": 0.3548120856285095, "step": 1538 }, { "batch_size": 4, "epoch": 0.6152, "step": 1538, "tokens_per_device": 6740 }, { "epoch": 0.6152, "loss_ce": 0.25788652896881104, "loss_lvr": 0.9319479465484619, "loss_mode_switch": 0.0, "loss_total": 0.35108131170272827, "step": 1538 }, { "batch_size": 4, "epoch": 0.6152, "step": 1538, "tokens_per_device": 4772 }, { "epoch": 0.6152, "loss_ce": 0.03244933858513832, "loss_lvr": 1.4960492849349976, "loss_mode_switch": 0.0, "loss_total": 0.18205426633358002, "step": 1538 }, { "batch_size": 4, "epoch": 0.6152, "step": 1538, "tokens_per_device": 5412 }, { "epoch": 0.6152, "loss_ce": 0.6435028910636902, "loss_lvr": 0.6696272492408752, "loss_mode_switch": 0.0, "loss_total": 0.7104656100273132, "step": 1538 }, { "batch_size": 4, "epoch": 0.6152, "step": 1538, "tokens_per_device": 10988 }, { "epoch": 0.6152, "loss_ce": 0.2578062415122986, "loss_lvr": 0.6905157566070557, "loss_mode_switch": 0.0, "loss_total": 0.3268578052520752, "step": 1538 }, { "batch_size": 4, "epoch": 0.6152, "step": 1538, "tokens_per_device": 11204 }, { "epoch": 0.6152, "loss_ce": 0.20104171335697174, "loss_lvr": 0.5064471364021301, "loss_mode_switch": 0.0, "loss_total": 0.2516864240169525, "step": 1538 }, { "batch_size": 1, "epoch": 0.6152, "step": 1538, "tokens_per_device": 4891 }, { "epoch": 0.6152, "loss_ce": 0.011817850172519684, "loss_lvr": 0.21295109391212463, "loss_mode_switch": 0.0, "loss_total": 0.03311295807361603, "step": 1538 }, { "batch_size": 4, "epoch": 0.6152, "step": 1538, "tokens_per_device": 4640 }, { "epoch": 0.6152, "loss_ce": 0.4453800916671753, "loss_lvr": 0.7099928855895996, "loss_mode_switch": 0.0, "loss_total": 0.5163793563842773, "step": 1538 }, { "epoch": 0.6156, "grad_norm": 1.3437385559082031, "learning_rate": 3.399576997753117e-06, "loss": 0.3156, "step": 1539 }, { "batch_size": 4, "epoch": 0.6156, "step": 1539, "tokens_per_device": 1216 }, { "epoch": 0.6156, "loss_ce": 0.2972234785556793, "loss_lvr": 0.9827788472175598, "loss_mode_switch": 0.0, "loss_total": 0.39550137519836426, "step": 1539 }, { "batch_size": 4, "epoch": 0.6156, "step": 1539, "tokens_per_device": 5736 }, { "epoch": 0.6156, "loss_ce": 0.4313000440597534, "loss_lvr": 0.9903168678283691, "loss_mode_switch": 0.0, "loss_total": 0.5303317308425903, "step": 1539 }, { "batch_size": 4, "epoch": 0.6156, "step": 1539, "tokens_per_device": 1684 }, { "epoch": 0.6156, "loss_ce": 0.1018018126487732, "loss_lvr": 1.0544158220291138, "loss_mode_switch": 0.0, "loss_total": 0.2072433978319168, "step": 1539 }, { "batch_size": 1, "epoch": 0.6156, "step": 1539, "tokens_per_device": 5127 }, { "epoch": 0.6156, "loss_ce": 0.0882214829325676, "loss_lvr": 0.47660765051841736, "loss_mode_switch": 0.0, "loss_total": 0.13588224351406097, "step": 1539 }, { "batch_size": 1, "epoch": 0.6156, "step": 1539, "tokens_per_device": 5174 }, { "epoch": 0.6156, "loss_ce": 0.056364864110946655, "loss_lvr": 0.27532142400741577, "loss_mode_switch": 0.0, "loss_total": 0.08389700949192047, "step": 1539 }, { "batch_size": 4, "epoch": 0.6156, "step": 1539, "tokens_per_device": 1856 }, { "epoch": 0.6156, "loss_ce": 0.3201480805873871, "loss_lvr": 0.8496948480606079, "loss_mode_switch": 0.0, "loss_total": 0.40511757135391235, "step": 1539 }, { "batch_size": 4, "epoch": 0.6156, "step": 1539, "tokens_per_device": 2776 }, { "epoch": 0.6156, "loss_ce": 0.1061592698097229, "loss_lvr": 0.7166715264320374, "loss_mode_switch": 0.0, "loss_total": 0.1778264343738556, "step": 1539 }, { "batch_size": 4, "epoch": 0.6156, "step": 1539, "tokens_per_device": 4628 }, { "epoch": 0.6156, "loss_ce": 0.07821067422628403, "loss_lvr": 0.8749586939811707, "loss_mode_switch": 0.0, "loss_total": 0.1657065451145172, "step": 1539 }, { "epoch": 0.616, "grad_norm": 1.3599573373794556, "learning_rate": 3.3934416204057396e-06, "loss": 0.2924, "step": 1540 }, { "batch_size": 4, "epoch": 0.616, "step": 1540, "tokens_per_device": 5824 }, { "epoch": 0.616, "loss_ce": 0.4657903015613556, "loss_lvr": 0.33578720688819885, "loss_mode_switch": 0.0, "loss_total": 0.4993690252304077, "step": 1540 }, { "batch_size": 1, "epoch": 0.616, "step": 1540, "tokens_per_device": 5331 }, { "epoch": 0.616, "loss_ce": 0.9579368829727173, "loss_lvr": 0.6079455018043518, "loss_mode_switch": 0.0, "loss_total": 1.0187314748764038, "step": 1540 }, { "batch_size": 1, "epoch": 0.616, "step": 1540, "tokens_per_device": 5151 }, { "epoch": 0.616, "loss_ce": 0.0005605267360806465, "loss_lvr": 0.3173103332519531, "loss_mode_switch": 0.0, "loss_total": 0.032291561365127563, "step": 1540 }, { "batch_size": 4, "epoch": 0.616, "step": 1540, "tokens_per_device": 4356 }, { "epoch": 0.616, "loss_ce": 0.6771222949028015, "loss_lvr": 0.9142699837684631, "loss_mode_switch": 0.0, "loss_total": 0.7685493230819702, "step": 1540 }, { "batch_size": 4, "epoch": 0.616, "step": 1540, "tokens_per_device": 4796 }, { "epoch": 0.616, "loss_ce": 0.3440658450126648, "loss_lvr": 0.9192923903465271, "loss_mode_switch": 0.0, "loss_total": 0.43599510192871094, "step": 1540 }, { "batch_size": 1, "epoch": 0.616, "step": 1540, "tokens_per_device": 5095 }, { "epoch": 0.616, "loss_ce": 0.003371968399733305, "loss_lvr": 0.3963898718357086, "loss_mode_switch": 0.0, "loss_total": 0.043010957539081573, "step": 1540 }, { "batch_size": 4, "epoch": 0.616, "step": 1540, "tokens_per_device": 4044 }, { "epoch": 0.616, "loss_ce": 0.377541184425354, "loss_lvr": 0.8303405046463013, "loss_mode_switch": 0.0, "loss_total": 0.4605752229690552, "step": 1540 }, { "batch_size": 4, "epoch": 0.616, "step": 1540, "tokens_per_device": 1704 }, { "epoch": 0.616, "loss_ce": 0.7103749513626099, "loss_lvr": 0.9251236319541931, "loss_mode_switch": 0.0, "loss_total": 0.8028873205184937, "step": 1540 }, { "epoch": 0.6164, "grad_norm": 1.414792537689209, "learning_rate": 3.387308939386291e-06, "loss": 0.3097, "step": 1541 }, { "batch_size": 1, "epoch": 0.6164, "step": 1541, "tokens_per_device": 5216 }, { "epoch": 0.6164, "loss_ce": 0.03578304126858711, "loss_lvr": 0.3402429223060608, "loss_mode_switch": 0.0, "loss_total": 0.06980733573436737, "step": 1541 }, { "batch_size": 4, "epoch": 0.6164, "step": 1541, "tokens_per_device": 5048 }, { "epoch": 0.6164, "loss_ce": 0.10743365436792374, "loss_lvr": 0.7600641250610352, "loss_mode_switch": 0.0, "loss_total": 0.18344005942344666, "step": 1541 }, { "batch_size": 1, "epoch": 0.6164, "step": 1541, "tokens_per_device": 4933 }, { "epoch": 0.6164, "loss_ce": 0.20855668187141418, "loss_lvr": 0.3636256158351898, "loss_mode_switch": 0.0, "loss_total": 0.24491924047470093, "step": 1541 }, { "batch_size": 1, "epoch": 0.6164, "step": 1541, "tokens_per_device": 5121 }, { "epoch": 0.6164, "loss_ce": 0.00038890913128852844, "loss_lvr": 0.4151090979576111, "loss_mode_switch": 0.0, "loss_total": 0.04189981892704964, "step": 1541 }, { "batch_size": 4, "epoch": 0.6164, "step": 1541, "tokens_per_device": 6288 }, { "epoch": 0.6164, "loss_ce": 0.7864429354667664, "loss_lvr": 0.7987672686576843, "loss_mode_switch": 0.0, "loss_total": 0.8663196563720703, "step": 1541 }, { "batch_size": 1, "epoch": 0.6164, "step": 1541, "tokens_per_device": 4818 }, { "epoch": 0.6164, "loss_ce": 0.02056354284286499, "loss_lvr": 0.5596088171005249, "loss_mode_switch": 0.0, "loss_total": 0.07652442157268524, "step": 1541 }, { "batch_size": 4, "epoch": 0.6164, "step": 1541, "tokens_per_device": 4604 }, { "epoch": 0.6164, "loss_ce": 0.529327929019928, "loss_lvr": 0.6595474481582642, "loss_mode_switch": 0.0, "loss_total": 0.5952826738357544, "step": 1541 }, { "batch_size": 1, "epoch": 0.6164, "step": 1541, "tokens_per_device": 5993 }, { "epoch": 0.6164, "loss_ce": 0.18348310887813568, "loss_lvr": 0.5103090405464172, "loss_mode_switch": 0.0, "loss_total": 0.2345140129327774, "step": 1541 }, { "epoch": 0.6168, "grad_norm": 1.498437762260437, "learning_rate": 3.38117896498741e-06, "loss": 0.3015, "step": 1542 }, { "batch_size": 4, "epoch": 0.6168, "step": 1542, "tokens_per_device": 12700 }, { "epoch": 0.6168, "loss_ce": 0.2831190526485443, "loss_lvr": 0.40546348690986633, "loss_mode_switch": 0.0, "loss_total": 0.32366541028022766, "step": 1542 }, { "batch_size": 4, "epoch": 0.6168, "step": 1542, "tokens_per_device": 2676 }, { "epoch": 0.6168, "loss_ce": 0.3800073266029358, "loss_lvr": 0.8566166162490845, "loss_mode_switch": 0.0, "loss_total": 0.4656689763069153, "step": 1542 }, { "batch_size": 4, "epoch": 0.6168, "step": 1542, "tokens_per_device": 5704 }, { "epoch": 0.6168, "loss_ce": 0.13724415004253387, "loss_lvr": 1.0606999397277832, "loss_mode_switch": 0.0, "loss_total": 0.24331414699554443, "step": 1542 }, { "batch_size": 1, "epoch": 0.6168, "step": 1542, "tokens_per_device": 4896 }, { "epoch": 0.6168, "loss_ce": 1.4921187162399292, "loss_lvr": 0.4118196964263916, "loss_mode_switch": 0.0, "loss_total": 1.5333006381988525, "step": 1542 }, { "batch_size": 1, "epoch": 0.6168, "step": 1542, "tokens_per_device": 5104 }, { "epoch": 0.6168, "loss_ce": 0.4453928470611572, "loss_lvr": 0.7923890352249146, "loss_mode_switch": 0.0, "loss_total": 0.5246317386627197, "step": 1542 }, { "batch_size": 4, "epoch": 0.6168, "step": 1542, "tokens_per_device": 3760 }, { "epoch": 0.6168, "loss_ce": 0.18201473355293274, "loss_lvr": 0.7665307521820068, "loss_mode_switch": 0.0, "loss_total": 0.25866782665252686, "step": 1542 }, { "batch_size": 1, "epoch": 0.6168, "step": 1542, "tokens_per_device": 5161 }, { "epoch": 0.6168, "loss_ce": 0.17715875804424286, "loss_lvr": 0.6035134792327881, "loss_mode_switch": 0.0, "loss_total": 0.23751011490821838, "step": 1542 }, { "batch_size": 4, "epoch": 0.6168, "step": 1542, "tokens_per_device": 10976 }, { "epoch": 0.6168, "loss_ce": 0.05437379330396652, "loss_lvr": 0.9928107857704163, "loss_mode_switch": 0.0, "loss_total": 0.15365487337112427, "step": 1542 }, { "epoch": 0.6172, "grad_norm": 1.56838858127594, "learning_rate": 3.375051707497187e-06, "loss": 0.3905, "step": 1543 }, { "batch_size": 4, "epoch": 0.6172, "step": 1543, "tokens_per_device": 1348 }, { "epoch": 0.6172, "loss_ce": 0.29574719071388245, "loss_lvr": 1.157165288925171, "loss_mode_switch": 0.0, "loss_total": 0.41146373748779297, "step": 1543 }, { "batch_size": 4, "epoch": 0.6172, "step": 1543, "tokens_per_device": 5292 }, { "epoch": 0.6172, "loss_ce": 0.1278880089521408, "loss_lvr": 0.7012187838554382, "loss_mode_switch": 0.0, "loss_total": 0.19800987839698792, "step": 1543 }, { "batch_size": 4, "epoch": 0.6172, "step": 1543, "tokens_per_device": 4252 }, { "epoch": 0.6172, "loss_ce": 0.056826021522283554, "loss_lvr": 0.8389915227890015, "loss_mode_switch": 0.0, "loss_total": 0.14072518050670624, "step": 1543 }, { "batch_size": 1, "epoch": 0.6172, "step": 1543, "tokens_per_device": 4911 }, { "epoch": 0.6172, "loss_ce": 0.11410484462976456, "loss_lvr": 0.524706244468689, "loss_mode_switch": 0.0, "loss_total": 0.16657546162605286, "step": 1543 }, { "batch_size": 4, "epoch": 0.6172, "step": 1543, "tokens_per_device": 5900 }, { "epoch": 0.6172, "loss_ce": 0.15179221332073212, "loss_lvr": 0.8035624027252197, "loss_mode_switch": 0.0, "loss_total": 0.2321484535932541, "step": 1543 }, { "batch_size": 1, "epoch": 0.6172, "step": 1543, "tokens_per_device": 5249 }, { "epoch": 0.6172, "loss_ce": 0.00031328885233961046, "loss_lvr": 0.3881555497646332, "loss_mode_switch": 0.0, "loss_total": 0.03912884369492531, "step": 1543 }, { "batch_size": 4, "epoch": 0.6172, "step": 1543, "tokens_per_device": 4992 }, { "epoch": 0.6172, "loss_ce": 0.26910004019737244, "loss_lvr": 0.8520100116729736, "loss_mode_switch": 0.0, "loss_total": 0.3543010354042053, "step": 1543 }, { "batch_size": 4, "epoch": 0.6172, "step": 1543, "tokens_per_device": 11544 }, { "epoch": 0.6172, "loss_ce": 0.03471933677792549, "loss_lvr": 0.498693585395813, "loss_mode_switch": 0.0, "loss_total": 0.08458869159221649, "step": 1543 }, { "epoch": 0.6176, "grad_norm": 1.0480067729949951, "learning_rate": 3.368927177199154e-06, "loss": 0.2233, "step": 1544 }, { "batch_size": 4, "epoch": 0.6176, "step": 1544, "tokens_per_device": 2824 }, { "epoch": 0.6176, "loss_ce": 0.17833632230758667, "loss_lvr": 0.7070402503013611, "loss_mode_switch": 0.0, "loss_total": 0.24904035031795502, "step": 1544 }, { "batch_size": 1, "epoch": 0.6176, "step": 1544, "tokens_per_device": 5134 }, { "epoch": 0.6176, "loss_ce": 0.3228380084037781, "loss_lvr": 0.31375136971473694, "loss_mode_switch": 0.0, "loss_total": 0.354213148355484, "step": 1544 }, { "batch_size": 4, "epoch": 0.6176, "step": 1544, "tokens_per_device": 4296 }, { "epoch": 0.6176, "loss_ce": 0.3038962781429291, "loss_lvr": 0.993659257888794, "loss_mode_switch": 0.0, "loss_total": 0.403262197971344, "step": 1544 }, { "batch_size": 4, "epoch": 0.6176, "step": 1544, "tokens_per_device": 1412 }, { "epoch": 0.6176, "loss_ce": 0.8020592927932739, "loss_lvr": 1.0945382118225098, "loss_mode_switch": 0.0, "loss_total": 0.911513090133667, "step": 1544 }, { "batch_size": 1, "epoch": 0.6176, "step": 1544, "tokens_per_device": 4865 }, { "epoch": 0.6176, "loss_ce": 0.08937961608171463, "loss_lvr": 0.31929394602775574, "loss_mode_switch": 0.0, "loss_total": 0.12130901217460632, "step": 1544 }, { "batch_size": 4, "epoch": 0.6176, "step": 1544, "tokens_per_device": 4296 }, { "epoch": 0.6176, "loss_ce": 0.09409037977457047, "loss_lvr": 0.9025987982749939, "loss_mode_switch": 0.0, "loss_total": 0.18435025215148926, "step": 1544 }, { "batch_size": 4, "epoch": 0.6176, "step": 1544, "tokens_per_device": 3780 }, { "epoch": 0.6176, "loss_ce": 0.06004100665450096, "loss_lvr": 0.8676297664642334, "loss_mode_switch": 0.0, "loss_total": 0.14680399000644684, "step": 1544 }, { "batch_size": 4, "epoch": 0.6176, "step": 1544, "tokens_per_device": 4816 }, { "epoch": 0.6176, "loss_ce": 0.09701411426067352, "loss_lvr": 0.829773485660553, "loss_mode_switch": 0.0, "loss_total": 0.1799914538860321, "step": 1544 }, { "epoch": 0.618, "grad_norm": 1.4783753156661987, "learning_rate": 3.3628053843722674e-06, "loss": 0.3207, "step": 1545 }, { "batch_size": 1, "epoch": 0.618, "step": 1545, "tokens_per_device": 4865 }, { "epoch": 0.618, "loss_ce": 0.0007544162799604237, "loss_lvr": 0.4759320318698883, "loss_mode_switch": 0.0, "loss_total": 0.04834761843085289, "step": 1545 }, { "batch_size": 4, "epoch": 0.618, "step": 1545, "tokens_per_device": 4548 }, { "epoch": 0.618, "loss_ce": 0.1450773924589157, "loss_lvr": 0.8252606391906738, "loss_mode_switch": 0.0, "loss_total": 0.2276034653186798, "step": 1545 }, { "batch_size": 4, "epoch": 0.618, "step": 1545, "tokens_per_device": 4596 }, { "epoch": 0.618, "loss_ce": 0.07830440253019333, "loss_lvr": 0.7864643335342407, "loss_mode_switch": 0.0, "loss_total": 0.15695083141326904, "step": 1545 }, { "batch_size": 4, "epoch": 0.618, "step": 1545, "tokens_per_device": 3940 }, { "epoch": 0.618, "loss_ce": 0.6452966332435608, "loss_lvr": 0.9313005805015564, "loss_mode_switch": 0.0, "loss_total": 0.738426685333252, "step": 1545 }, { "batch_size": 4, "epoch": 0.618, "step": 1545, "tokens_per_device": 2544 }, { "epoch": 0.618, "loss_ce": 0.625900149345398, "loss_lvr": 0.954430103302002, "loss_mode_switch": 0.0, "loss_total": 0.7213431596755981, "step": 1545 }, { "batch_size": 4, "epoch": 0.618, "step": 1545, "tokens_per_device": 5508 }, { "epoch": 0.618, "loss_ce": 0.07979569584131241, "loss_lvr": 0.7211506366729736, "loss_mode_switch": 0.0, "loss_total": 0.15191075205802917, "step": 1545 }, { "batch_size": 4, "epoch": 0.618, "step": 1545, "tokens_per_device": 11832 }, { "epoch": 0.618, "loss_ce": 0.2404409497976303, "loss_lvr": 0.6256653070449829, "loss_mode_switch": 0.0, "loss_total": 0.30300748348236084, "step": 1545 }, { "batch_size": 4, "epoch": 0.618, "step": 1545, "tokens_per_device": 4236 }, { "epoch": 0.618, "loss_ce": 0.0620197094976902, "loss_lvr": 0.874886691570282, "loss_mode_switch": 0.0, "loss_total": 0.14950838685035706, "step": 1545 }, { "epoch": 0.6184, "grad_norm": 1.1925102472305298, "learning_rate": 3.3566863392908864e-06, "loss": 0.2507, "step": 1546 }, { "batch_size": 4, "epoch": 0.6184, "step": 1546, "tokens_per_device": 3808 }, { "epoch": 0.6184, "loss_ce": 0.37386563420295715, "loss_lvr": 0.9679847955703735, "loss_mode_switch": 0.0, "loss_total": 0.4706641137599945, "step": 1546 }, { "batch_size": 4, "epoch": 0.6184, "step": 1546, "tokens_per_device": 4596 }, { "epoch": 0.6184, "loss_ce": 0.41069111227989197, "loss_lvr": 0.8407977223396301, "loss_mode_switch": 0.0, "loss_total": 0.494770884513855, "step": 1546 }, { "batch_size": 1, "epoch": 0.6184, "step": 1546, "tokens_per_device": 4927 }, { "epoch": 0.6184, "loss_ce": 0.012608873657882214, "loss_lvr": 0.6289348602294922, "loss_mode_switch": 0.0, "loss_total": 0.07550235837697983, "step": 1546 }, { "batch_size": 4, "epoch": 0.6184, "step": 1546, "tokens_per_device": 2668 }, { "epoch": 0.6184, "loss_ce": 0.10159202665090561, "loss_lvr": 1.2075564861297607, "loss_mode_switch": 0.0, "loss_total": 0.2223476767539978, "step": 1546 }, { "batch_size": 4, "epoch": 0.6184, "step": 1546, "tokens_per_device": 5672 }, { "epoch": 0.6184, "loss_ce": 0.1502089500427246, "loss_lvr": 0.6888329386711121, "loss_mode_switch": 0.0, "loss_total": 0.2190922498703003, "step": 1546 }, { "batch_size": 1, "epoch": 0.6184, "step": 1546, "tokens_per_device": 4819 }, { "epoch": 0.6184, "loss_ce": 0.0011637816205620766, "loss_lvr": 0.3061588406562805, "loss_mode_switch": 0.0, "loss_total": 0.03177966549992561, "step": 1546 }, { "batch_size": 4, "epoch": 0.6184, "step": 1546, "tokens_per_device": 6500 }, { "epoch": 0.6184, "loss_ce": 0.2706570625305176, "loss_lvr": 0.8257336616516113, "loss_mode_switch": 0.0, "loss_total": 0.35323041677474976, "step": 1546 }, { "batch_size": 4, "epoch": 0.6184, "step": 1546, "tokens_per_device": 7140 }, { "epoch": 0.6184, "loss_ce": 0.5148032903671265, "loss_lvr": 1.0894830226898193, "loss_mode_switch": 0.0, "loss_total": 0.6237515807151794, "step": 1546 }, { "epoch": 0.6188, "grad_norm": 1.3399831056594849, "learning_rate": 3.3505700522247652e-06, "loss": 0.3037, "step": 1547 }, { "batch_size": 4, "epoch": 0.6188, "step": 1547, "tokens_per_device": 2604 }, { "epoch": 0.6188, "loss_ce": 0.004755374509841204, "loss_lvr": 0.8400755524635315, "loss_mode_switch": 0.0, "loss_total": 0.08876293152570724, "step": 1547 }, { "batch_size": 4, "epoch": 0.6188, "step": 1547, "tokens_per_device": 4224 }, { "epoch": 0.6188, "loss_ce": 0.2964717149734497, "loss_lvr": 0.8993072509765625, "loss_mode_switch": 0.0, "loss_total": 0.386402428150177, "step": 1547 }, { "batch_size": 4, "epoch": 0.6188, "step": 1547, "tokens_per_device": 1192 }, { "epoch": 0.6188, "loss_ce": 0.4451574981212616, "loss_lvr": 0.9779460430145264, "loss_mode_switch": 0.0, "loss_total": 0.5429521203041077, "step": 1547 }, { "batch_size": 4, "epoch": 0.6188, "step": 1547, "tokens_per_device": 2672 }, { "epoch": 0.6188, "loss_ce": 0.13647013902664185, "loss_lvr": 0.6917510032653809, "loss_mode_switch": 0.0, "loss_total": 0.20564523339271545, "step": 1547 }, { "batch_size": 1, "epoch": 0.6188, "step": 1547, "tokens_per_device": 5170 }, { "epoch": 0.6188, "loss_ce": 0.1867220252752304, "loss_lvr": 0.3898279070854187, "loss_mode_switch": 0.0, "loss_total": 0.22570481896400452, "step": 1547 }, { "batch_size": 4, "epoch": 0.6188, "step": 1547, "tokens_per_device": 4252 }, { "epoch": 0.6188, "loss_ce": 0.3145460784435272, "loss_lvr": 1.1382538080215454, "loss_mode_switch": 0.0, "loss_total": 0.42837145924568176, "step": 1547 }, { "batch_size": 4, "epoch": 0.6188, "step": 1547, "tokens_per_device": 4284 }, { "epoch": 0.6188, "loss_ce": 0.5902050137519836, "loss_lvr": 0.8389907479286194, "loss_mode_switch": 0.0, "loss_total": 0.6741040945053101, "step": 1547 }, { "batch_size": 4, "epoch": 0.6188, "step": 1547, "tokens_per_device": 1340 }, { "epoch": 0.6188, "loss_ce": 0.387248158454895, "loss_lvr": 0.8824081420898438, "loss_mode_switch": 0.0, "loss_total": 0.47548896074295044, "step": 1547 }, { "epoch": 0.6192, "grad_norm": 1.4895957708358765, "learning_rate": 3.34445653343902e-06, "loss": 0.3156, "step": 1548 }, { "batch_size": 1, "epoch": 0.6192, "step": 1548, "tokens_per_device": 4855 }, { "epoch": 0.6192, "loss_ce": 0.0008567353361286223, "loss_lvr": 0.4413561224937439, "loss_mode_switch": 0.0, "loss_total": 0.044992346316576004, "step": 1548 }, { "batch_size": 4, "epoch": 0.6192, "step": 1548, "tokens_per_device": 4696 }, { "epoch": 0.6192, "loss_ce": 0.27016937732696533, "loss_lvr": 0.7094348669052124, "loss_mode_switch": 0.0, "loss_total": 0.3411128520965576, "step": 1548 }, { "batch_size": 4, "epoch": 0.6192, "step": 1548, "tokens_per_device": 4220 }, { "epoch": 0.6192, "loss_ce": 0.24154062569141388, "loss_lvr": 0.6607353687286377, "loss_mode_switch": 0.0, "loss_total": 0.30761414766311646, "step": 1548 }, { "batch_size": 4, "epoch": 0.6192, "step": 1548, "tokens_per_device": 5092 }, { "epoch": 0.6192, "loss_ce": 0.2808607816696167, "loss_lvr": 0.7798194289207458, "loss_mode_switch": 0.0, "loss_total": 0.35884273052215576, "step": 1548 }, { "batch_size": 4, "epoch": 0.6192, "step": 1548, "tokens_per_device": 2672 }, { "epoch": 0.6192, "loss_ce": 0.1415034830570221, "loss_lvr": 0.7224247455596924, "loss_mode_switch": 0.0, "loss_total": 0.21374595165252686, "step": 1548 }, { "batch_size": 4, "epoch": 0.6192, "step": 1548, "tokens_per_device": 1752 }, { "epoch": 0.6192, "loss_ce": 0.23691707849502563, "loss_lvr": 0.9018107056617737, "loss_mode_switch": 0.0, "loss_total": 0.32709816098213196, "step": 1548 }, { "batch_size": 4, "epoch": 0.6192, "step": 1548, "tokens_per_device": 3736 }, { "epoch": 0.6192, "loss_ce": 0.38471290469169617, "loss_lvr": 0.8089784383773804, "loss_mode_switch": 0.0, "loss_total": 0.4656107425689697, "step": 1548 }, { "batch_size": 4, "epoch": 0.6192, "step": 1548, "tokens_per_device": 4268 }, { "epoch": 0.6192, "loss_ce": 0.36666563153266907, "loss_lvr": 0.9625553488731384, "loss_mode_switch": 0.0, "loss_total": 0.4629211723804474, "step": 1548 }, { "epoch": 0.6196, "grad_norm": 1.229231357574463, "learning_rate": 3.3383457931941275e-06, "loss": 0.2897, "step": 1549 }, { "batch_size": 4, "epoch": 0.6196, "step": 1549, "tokens_per_device": 5448 }, { "epoch": 0.6196, "loss_ce": 0.4790560007095337, "loss_lvr": 0.7933544516563416, "loss_mode_switch": 0.0, "loss_total": 0.5583914518356323, "step": 1549 }, { "batch_size": 4, "epoch": 0.6196, "step": 1549, "tokens_per_device": 4444 }, { "epoch": 0.6196, "loss_ce": 0.1663181632757187, "loss_lvr": 0.7349556684494019, "loss_mode_switch": 0.0, "loss_total": 0.23981373012065887, "step": 1549 }, { "batch_size": 4, "epoch": 0.6196, "step": 1549, "tokens_per_device": 4460 }, { "epoch": 0.6196, "loss_ce": 0.4428332448005676, "loss_lvr": 1.2031446695327759, "loss_mode_switch": 0.0, "loss_total": 0.5631477236747742, "step": 1549 }, { "batch_size": 4, "epoch": 0.6196, "step": 1549, "tokens_per_device": 3744 }, { "epoch": 0.6196, "loss_ce": 0.18085715174674988, "loss_lvr": 0.7173983454704285, "loss_mode_switch": 0.0, "loss_total": 0.25259697437286377, "step": 1549 }, { "batch_size": 4, "epoch": 0.6196, "step": 1549, "tokens_per_device": 1428 }, { "epoch": 0.6196, "loss_ce": 0.5292093753814697, "loss_lvr": 1.007004737854004, "loss_mode_switch": 0.0, "loss_total": 0.629909873008728, "step": 1549 }, { "batch_size": 4, "epoch": 0.6196, "step": 1549, "tokens_per_device": 5944 }, { "epoch": 0.6196, "loss_ce": 0.04175957292318344, "loss_lvr": 0.6337477564811707, "loss_mode_switch": 0.0, "loss_total": 0.10513435304164886, "step": 1549 }, { "batch_size": 4, "epoch": 0.6196, "step": 1549, "tokens_per_device": 3828 }, { "epoch": 0.6196, "loss_ce": 0.39021047949790955, "loss_lvr": 0.9382250308990479, "loss_mode_switch": 0.0, "loss_total": 0.4840329885482788, "step": 1549 }, { "batch_size": 4, "epoch": 0.6196, "step": 1549, "tokens_per_device": 4836 }, { "epoch": 0.6196, "loss_ce": 0.2675883173942566, "loss_lvr": 0.9732000827789307, "loss_mode_switch": 0.0, "loss_total": 0.3649083375930786, "step": 1549 }, { "epoch": 0.62, "grad_norm": 1.210998296737671, "learning_rate": 3.3322378417458985e-06, "loss": 0.2849, "step": 1550 }, { "batch_size": 1, "epoch": 0.62, "step": 1550, "tokens_per_device": 5722 }, { "epoch": 0.62, "loss_ce": 0.24164701998233795, "loss_lvr": 0.37303003668785095, "loss_mode_switch": 0.0, "loss_total": 0.278950035572052, "step": 1550 }, { "batch_size": 4, "epoch": 0.62, "step": 1550, "tokens_per_device": 4716 }, { "epoch": 0.62, "loss_ce": 0.0588361956179142, "loss_lvr": 0.5196257829666138, "loss_mode_switch": 0.0, "loss_total": 0.11079877614974976, "step": 1550 }, { "batch_size": 4, "epoch": 0.62, "step": 1550, "tokens_per_device": 6688 }, { "epoch": 0.62, "loss_ce": 0.13096678256988525, "loss_lvr": 0.6069013476371765, "loss_mode_switch": 0.0, "loss_total": 0.1916569173336029, "step": 1550 }, { "batch_size": 4, "epoch": 0.62, "step": 1550, "tokens_per_device": 4720 }, { "epoch": 0.62, "loss_ce": 0.38184982538223267, "loss_lvr": 0.7017723321914673, "loss_mode_switch": 0.0, "loss_total": 0.4520270526409149, "step": 1550 }, { "batch_size": 4, "epoch": 0.62, "step": 1550, "tokens_per_device": 2656 }, { "epoch": 0.62, "loss_ce": 0.2853158712387085, "loss_lvr": 0.8084017038345337, "loss_mode_switch": 0.0, "loss_total": 0.36615604162216187, "step": 1550 }, { "batch_size": 4, "epoch": 0.62, "step": 1550, "tokens_per_device": 4120 }, { "epoch": 0.62, "loss_ce": 0.23127931356430054, "loss_lvr": 0.934954822063446, "loss_mode_switch": 0.0, "loss_total": 0.3247748017311096, "step": 1550 }, { "batch_size": 4, "epoch": 0.62, "step": 1550, "tokens_per_device": 7340 }, { "epoch": 0.62, "loss_ce": 0.4909299910068512, "loss_lvr": 0.7413828372955322, "loss_mode_switch": 0.0, "loss_total": 0.565068244934082, "step": 1550 }, { "batch_size": 4, "epoch": 0.62, "step": 1550, "tokens_per_device": 4452 }, { "epoch": 0.62, "loss_ce": 0.3381159007549286, "loss_lvr": 0.8583793044090271, "loss_mode_switch": 0.0, "loss_total": 0.4239538311958313, "step": 1550 }, { "epoch": 0.6204, "grad_norm": 1.467578649520874, "learning_rate": 3.3261326893454617e-06, "loss": 0.3172, "step": 1551 }, { "batch_size": 4, "epoch": 0.6204, "step": 1551, "tokens_per_device": 2180 }, { "epoch": 0.6204, "loss_ce": 0.11500603705644608, "loss_lvr": 0.9192157983779907, "loss_mode_switch": 0.0, "loss_total": 0.20692762732505798, "step": 1551 }, { "batch_size": 4, "epoch": 0.6204, "step": 1551, "tokens_per_device": 4984 }, { "epoch": 0.6204, "loss_ce": 0.2719692885875702, "loss_lvr": 0.92301344871521, "loss_mode_switch": 0.0, "loss_total": 0.3642706274986267, "step": 1551 }, { "batch_size": 4, "epoch": 0.6204, "step": 1551, "tokens_per_device": 1548 }, { "epoch": 0.6204, "loss_ce": 0.04274030029773712, "loss_lvr": 1.218533992767334, "loss_mode_switch": 0.0, "loss_total": 0.16459369659423828, "step": 1551 }, { "batch_size": 4, "epoch": 0.6204, "step": 1551, "tokens_per_device": 2864 }, { "epoch": 0.6204, "loss_ce": 0.7384718060493469, "loss_lvr": 0.863312840461731, "loss_mode_switch": 0.0, "loss_total": 0.8248031139373779, "step": 1551 }, { "batch_size": 1, "epoch": 0.6204, "step": 1551, "tokens_per_device": 4888 }, { "epoch": 0.6204, "loss_ce": 0.05216219276189804, "loss_lvr": 2.5969083309173584, "loss_mode_switch": 0.0, "loss_total": 0.3118530511856079, "step": 1551 }, { "batch_size": 4, "epoch": 0.6204, "step": 1551, "tokens_per_device": 5320 }, { "epoch": 0.6204, "loss_ce": 0.09883438050746918, "loss_lvr": 0.8689469695091248, "loss_mode_switch": 0.0, "loss_total": 0.18572908639907837, "step": 1551 }, { "batch_size": 4, "epoch": 0.6204, "step": 1551, "tokens_per_device": 2784 }, { "epoch": 0.6204, "loss_ce": 0.34027692675590515, "loss_lvr": 0.6888810396194458, "loss_mode_switch": 0.0, "loss_total": 0.40916502475738525, "step": 1551 }, { "batch_size": 1, "epoch": 0.6204, "step": 1551, "tokens_per_device": 4348 }, { "epoch": 0.6204, "loss_ce": 0.08401766419410706, "loss_lvr": 0.5181776285171509, "loss_mode_switch": 0.0, "loss_total": 0.1358354240655899, "step": 1551 }, { "epoch": 0.6208, "grad_norm": 1.5949931144714355, "learning_rate": 3.3200303462392548e-06, "loss": 0.2942, "step": 1552 }, { "batch_size": 4, "epoch": 0.6208, "step": 1552, "tokens_per_device": 2576 }, { "epoch": 0.6208, "loss_ce": 0.2786431610584259, "loss_lvr": 0.8666664958000183, "loss_mode_switch": 0.0, "loss_total": 0.36530980467796326, "step": 1552 }, { "batch_size": 1, "epoch": 0.6208, "step": 1552, "tokens_per_device": 4746 }, { "epoch": 0.6208, "loss_ce": 0.16168735921382904, "loss_lvr": 0.2823387384414673, "loss_mode_switch": 0.0, "loss_total": 0.18992123007774353, "step": 1552 }, { "batch_size": 4, "epoch": 0.6208, "step": 1552, "tokens_per_device": 2584 }, { "epoch": 0.6208, "loss_ce": 0.07305576652288437, "loss_lvr": 1.8430050611495972, "loss_mode_switch": 0.0, "loss_total": 0.25735628604888916, "step": 1552 }, { "batch_size": 4, "epoch": 0.6208, "step": 1552, "tokens_per_device": 8744 }, { "epoch": 0.6208, "loss_ce": 0.6631481051445007, "loss_lvr": 0.8262338638305664, "loss_mode_switch": 0.0, "loss_total": 0.7457714676856995, "step": 1552 }, { "batch_size": 4, "epoch": 0.6208, "step": 1552, "tokens_per_device": 6580 }, { "epoch": 0.6208, "loss_ce": 0.17875711619853973, "loss_lvr": 0.5198707580566406, "loss_mode_switch": 0.0, "loss_total": 0.23074419796466827, "step": 1552 }, { "batch_size": 4, "epoch": 0.6208, "step": 1552, "tokens_per_device": 12768 }, { "epoch": 0.6208, "loss_ce": 0.19405823945999146, "loss_lvr": 0.8572003841400146, "loss_mode_switch": 0.0, "loss_total": 0.27977827191352844, "step": 1552 }, { "batch_size": 4, "epoch": 0.6208, "step": 1552, "tokens_per_device": 5240 }, { "epoch": 0.6208, "loss_ce": 0.31758370995521545, "loss_lvr": 0.7658483982086182, "loss_mode_switch": 0.0, "loss_total": 0.39416855573654175, "step": 1552 }, { "batch_size": 4, "epoch": 0.6208, "step": 1552, "tokens_per_device": 2660 }, { "epoch": 0.6208, "loss_ce": 0.2957053780555725, "loss_lvr": 0.7656846642494202, "loss_mode_switch": 0.0, "loss_total": 0.37227386236190796, "step": 1552 }, { "epoch": 0.6212, "grad_norm": 1.434706449508667, "learning_rate": 3.313930822668992e-06, "loss": 0.3168, "step": 1553 }, { "batch_size": 4, "epoch": 0.6212, "step": 1553, "tokens_per_device": 6620 }, { "epoch": 0.6212, "loss_ce": 0.20019027590751648, "loss_lvr": 0.7001479864120483, "loss_mode_switch": 0.0, "loss_total": 0.2702050805091858, "step": 1553 }, { "batch_size": 4, "epoch": 0.6212, "step": 1553, "tokens_per_device": 3836 }, { "epoch": 0.6212, "loss_ce": 0.5193908214569092, "loss_lvr": 0.8424978256225586, "loss_mode_switch": 0.0, "loss_total": 0.603640615940094, "step": 1553 }, { "batch_size": 4, "epoch": 0.6212, "step": 1553, "tokens_per_device": 2932 }, { "epoch": 0.6212, "loss_ce": 0.35240602493286133, "loss_lvr": 0.8021076917648315, "loss_mode_switch": 0.0, "loss_total": 0.43261680006980896, "step": 1553 }, { "batch_size": 1, "epoch": 0.6212, "step": 1553, "tokens_per_device": 5137 }, { "epoch": 0.6212, "loss_ce": 0.08308913558721542, "loss_lvr": 0.6371265053749084, "loss_mode_switch": 0.0, "loss_total": 0.14680178463459015, "step": 1553 }, { "batch_size": 4, "epoch": 0.6212, "step": 1553, "tokens_per_device": 5728 }, { "epoch": 0.6212, "loss_ce": 0.24881543219089508, "loss_lvr": 0.9077873826026917, "loss_mode_switch": 0.0, "loss_total": 0.33959418535232544, "step": 1553 }, { "batch_size": 1, "epoch": 0.6212, "step": 1553, "tokens_per_device": 5523 }, { "epoch": 0.6212, "loss_ce": 0.00014758468023501337, "loss_lvr": 0.37608617544174194, "loss_mode_switch": 0.0, "loss_total": 0.03775620460510254, "step": 1553 }, { "batch_size": 4, "epoch": 0.6212, "step": 1553, "tokens_per_device": 4416 }, { "epoch": 0.6212, "loss_ce": 0.7431197166442871, "loss_lvr": 0.7627211809158325, "loss_mode_switch": 0.0, "loss_total": 0.8193918466567993, "step": 1553 }, { "batch_size": 4, "epoch": 0.6212, "step": 1553, "tokens_per_device": 4832 }, { "epoch": 0.6212, "loss_ce": 0.23657499253749847, "loss_lvr": 0.7138282060623169, "loss_mode_switch": 0.0, "loss_total": 0.30795782804489136, "step": 1553 }, { "epoch": 0.6216, "grad_norm": 1.3320053815841675, "learning_rate": 3.307834128871661e-06, "loss": 0.2959, "step": 1554 }, { "batch_size": 4, "epoch": 0.6216, "step": 1554, "tokens_per_device": 10184 }, { "epoch": 0.6216, "loss_ce": 0.40213969349861145, "loss_lvr": 0.5330353379249573, "loss_mode_switch": 0.0, "loss_total": 0.45544323325157166, "step": 1554 }, { "batch_size": 4, "epoch": 0.6216, "step": 1554, "tokens_per_device": 3488 }, { "epoch": 0.6216, "loss_ce": 0.24184632301330566, "loss_lvr": 0.9529374837875366, "loss_mode_switch": 0.0, "loss_total": 0.3371400833129883, "step": 1554 }, { "batch_size": 1, "epoch": 0.6216, "step": 1554, "tokens_per_device": 7406 }, { "epoch": 0.6216, "loss_ce": 0.0018047555349767208, "loss_lvr": 0.23170462250709534, "loss_mode_switch": 0.0, "loss_total": 0.024975217878818512, "step": 1554 }, { "batch_size": 4, "epoch": 0.6216, "step": 1554, "tokens_per_device": 4340 }, { "epoch": 0.6216, "loss_ce": 0.01137769315391779, "loss_lvr": 0.6707094311714172, "loss_mode_switch": 0.0, "loss_total": 0.07844863831996918, "step": 1554 }, { "batch_size": 4, "epoch": 0.6216, "step": 1554, "tokens_per_device": 6012 }, { "epoch": 0.6216, "loss_ce": 0.23065544664859772, "loss_lvr": 0.7373494505882263, "loss_mode_switch": 0.0, "loss_total": 0.30439040064811707, "step": 1554 }, { "batch_size": 4, "epoch": 0.6216, "step": 1554, "tokens_per_device": 4308 }, { "epoch": 0.6216, "loss_ce": 0.19961953163146973, "loss_lvr": 1.0446157455444336, "loss_mode_switch": 0.0, "loss_total": 0.30408111214637756, "step": 1554 }, { "batch_size": 4, "epoch": 0.6216, "step": 1554, "tokens_per_device": 5624 }, { "epoch": 0.6216, "loss_ce": 0.15293589234352112, "loss_lvr": 1.0567758083343506, "loss_mode_switch": 0.0, "loss_total": 0.2586134672164917, "step": 1554 }, { "batch_size": 4, "epoch": 0.6216, "step": 1554, "tokens_per_device": 4792 }, { "epoch": 0.6216, "loss_ce": 0.5180479288101196, "loss_lvr": 0.7059386968612671, "loss_mode_switch": 0.0, "loss_total": 0.5886418223381042, "step": 1554 }, { "epoch": 0.622, "grad_norm": 1.2459737062454224, "learning_rate": 3.3017402750794976e-06, "loss": 0.295, "step": 1555 }, { "batch_size": 4, "epoch": 0.622, "step": 1555, "tokens_per_device": 5108 }, { "epoch": 0.622, "loss_ce": 0.17500001192092896, "loss_lvr": 1.060050368309021, "loss_mode_switch": 0.0, "loss_total": 0.28100505471229553, "step": 1555 }, { "batch_size": 4, "epoch": 0.622, "step": 1555, "tokens_per_device": 3324 }, { "epoch": 0.622, "loss_ce": 0.027470525354146957, "loss_lvr": 1.2805089950561523, "loss_mode_switch": 0.0, "loss_total": 0.1555214375257492, "step": 1555 }, { "batch_size": 4, "epoch": 0.622, "step": 1555, "tokens_per_device": 4140 }, { "epoch": 0.622, "loss_ce": 0.3118082284927368, "loss_lvr": 0.8448377847671509, "loss_mode_switch": 0.0, "loss_total": 0.3962920010089874, "step": 1555 }, { "batch_size": 4, "epoch": 0.622, "step": 1555, "tokens_per_device": 4560 }, { "epoch": 0.622, "loss_ce": 0.18143810331821442, "loss_lvr": 0.7005980014801025, "loss_mode_switch": 0.0, "loss_total": 0.25149789452552795, "step": 1555 }, { "batch_size": 4, "epoch": 0.622, "step": 1555, "tokens_per_device": 11028 }, { "epoch": 0.622, "loss_ce": 0.06239050254225731, "loss_lvr": 0.758320152759552, "loss_mode_switch": 0.0, "loss_total": 0.13822251558303833, "step": 1555 }, { "batch_size": 4, "epoch": 0.622, "step": 1555, "tokens_per_device": 9700 }, { "epoch": 0.622, "loss_ce": 0.2343451976776123, "loss_lvr": 0.7822965979576111, "loss_mode_switch": 0.0, "loss_total": 0.3125748634338379, "step": 1555 }, { "batch_size": 1, "epoch": 0.622, "step": 1555, "tokens_per_device": 4714 }, { "epoch": 0.622, "loss_ce": 0.11994168162345886, "loss_lvr": 0.5629185438156128, "loss_mode_switch": 0.0, "loss_total": 0.17623353004455566, "step": 1555 }, { "batch_size": 4, "epoch": 0.622, "step": 1555, "tokens_per_device": 11988 }, { "epoch": 0.622, "loss_ce": 0.6621715426445007, "loss_lvr": 0.3933984935283661, "loss_mode_switch": 0.0, "loss_total": 0.7015113830566406, "step": 1555 }, { "epoch": 0.6224, "grad_norm": 1.1912232637405396, "learning_rate": 3.2956492715199744e-06, "loss": 0.2682, "step": 1556 }, { "batch_size": 4, "epoch": 0.6224, "step": 1556, "tokens_per_device": 4268 }, { "epoch": 0.6224, "loss_ce": 0.03969646245241165, "loss_lvr": 1.2255626916885376, "loss_mode_switch": 0.0, "loss_total": 0.16225272417068481, "step": 1556 }, { "batch_size": 1, "epoch": 0.6224, "step": 1556, "tokens_per_device": 5052 }, { "epoch": 0.6224, "loss_ce": 0.003646537195891142, "loss_lvr": 0.43311232328414917, "loss_mode_switch": 0.0, "loss_total": 0.046957772225141525, "step": 1556 }, { "batch_size": 4, "epoch": 0.6224, "step": 1556, "tokens_per_device": 1544 }, { "epoch": 0.6224, "loss_ce": 0.09147172421216965, "loss_lvr": 0.7272070050239563, "loss_mode_switch": 0.0, "loss_total": 0.16419242322444916, "step": 1556 }, { "batch_size": 1, "epoch": 0.6224, "step": 1556, "tokens_per_device": 7447 }, { "epoch": 0.6224, "loss_ce": 0.18349505960941315, "loss_lvr": 0.3403359055519104, "loss_mode_switch": 0.0, "loss_total": 0.21752865612506866, "step": 1556 }, { "batch_size": 4, "epoch": 0.6224, "step": 1556, "tokens_per_device": 4904 }, { "epoch": 0.6224, "loss_ce": 0.09491138905286789, "loss_lvr": 0.7340835928916931, "loss_mode_switch": 0.0, "loss_total": 0.16831974685192108, "step": 1556 }, { "batch_size": 1, "epoch": 0.6224, "step": 1556, "tokens_per_device": 5096 }, { "epoch": 0.6224, "loss_ce": 0.020886961370706558, "loss_lvr": 0.13934168219566345, "loss_mode_switch": 0.0, "loss_total": 0.03482113033533096, "step": 1556 }, { "batch_size": 4, "epoch": 0.6224, "step": 1556, "tokens_per_device": 16200 }, { "epoch": 0.6224, "loss_ce": 0.39673230051994324, "loss_lvr": 0.4372420907020569, "loss_mode_switch": 0.0, "loss_total": 0.4404565095901489, "step": 1556 }, { "batch_size": 4, "epoch": 0.6224, "step": 1556, "tokens_per_device": 14032 }, { "epoch": 0.6224, "loss_ce": 0.7504766583442688, "loss_lvr": 0.9892562627792358, "loss_mode_switch": 0.0, "loss_total": 0.8494023084640503, "step": 1556 }, { "epoch": 0.6228, "grad_norm": 1.3082550764083862, "learning_rate": 3.2895611284157757e-06, "loss": 0.2781, "step": 1557 }, { "batch_size": 4, "epoch": 0.6228, "step": 1557, "tokens_per_device": 5172 }, { "epoch": 0.6228, "loss_ce": 0.31909430027008057, "loss_lvr": 0.6688718199729919, "loss_mode_switch": 0.0, "loss_total": 0.3859815001487732, "step": 1557 }, { "batch_size": 4, "epoch": 0.6228, "step": 1557, "tokens_per_device": 1776 }, { "epoch": 0.6228, "loss_ce": 0.13187803328037262, "loss_lvr": 1.0058400630950928, "loss_mode_switch": 0.0, "loss_total": 0.2324620485305786, "step": 1557 }, { "batch_size": 4, "epoch": 0.6228, "step": 1557, "tokens_per_device": 5976 }, { "epoch": 0.6228, "loss_ce": 0.24795065820217133, "loss_lvr": 0.869545042514801, "loss_mode_switch": 0.0, "loss_total": 0.33490514755249023, "step": 1557 }, { "batch_size": 4, "epoch": 0.6228, "step": 1557, "tokens_per_device": 7260 }, { "epoch": 0.6228, "loss_ce": 0.4925052523612976, "loss_lvr": 1.0771441459655762, "loss_mode_switch": 0.0, "loss_total": 0.6002196669578552, "step": 1557 }, { "batch_size": 4, "epoch": 0.6228, "step": 1557, "tokens_per_device": 1816 }, { "epoch": 0.6228, "loss_ce": 0.13859035074710846, "loss_lvr": 0.8036671876907349, "loss_mode_switch": 0.0, "loss_total": 0.2189570665359497, "step": 1557 }, { "batch_size": 4, "epoch": 0.6228, "step": 1557, "tokens_per_device": 1352 }, { "epoch": 0.6228, "loss_ce": 0.24355162680149078, "loss_lvr": 0.9220701456069946, "loss_mode_switch": 0.0, "loss_total": 0.33575862646102905, "step": 1557 }, { "batch_size": 4, "epoch": 0.6228, "step": 1557, "tokens_per_device": 1528 }, { "epoch": 0.6228, "loss_ce": 0.11646745353937149, "loss_lvr": 0.8732313513755798, "loss_mode_switch": 0.0, "loss_total": 0.2037905901670456, "step": 1557 }, { "batch_size": 4, "epoch": 0.6228, "step": 1557, "tokens_per_device": 11004 }, { "epoch": 0.6228, "loss_ce": 0.15700773894786835, "loss_lvr": 0.5794486999511719, "loss_mode_switch": 0.0, "loss_total": 0.21495261788368225, "step": 1557 }, { "epoch": 0.6232, "grad_norm": 1.4068323373794556, "learning_rate": 3.2834758559847903e-06, "loss": 0.3476, "step": 1558 }, { "batch_size": 4, "epoch": 0.6232, "step": 1558, "tokens_per_device": 4532 }, { "epoch": 0.6232, "loss_ce": 0.0034657390788197517, "loss_lvr": 0.6362153887748718, "loss_mode_switch": 0.0, "loss_total": 0.06708728522062302, "step": 1558 }, { "batch_size": 4, "epoch": 0.6232, "step": 1558, "tokens_per_device": 5700 }, { "epoch": 0.6232, "loss_ce": 0.16792310774326324, "loss_lvr": 1.0899693965911865, "loss_mode_switch": 0.0, "loss_total": 0.27692005038261414, "step": 1558 }, { "batch_size": 4, "epoch": 0.6232, "step": 1558, "tokens_per_device": 4256 }, { "epoch": 0.6232, "loss_ce": 0.1130102351307869, "loss_lvr": 0.963699221611023, "loss_mode_switch": 0.0, "loss_total": 0.2093801498413086, "step": 1558 }, { "batch_size": 1, "epoch": 0.6232, "step": 1558, "tokens_per_device": 5033 }, { "epoch": 0.6232, "loss_ce": 0.03605091944336891, "loss_lvr": 0.5796332955360413, "loss_mode_switch": 0.0, "loss_total": 0.0940142497420311, "step": 1558 }, { "batch_size": 1, "epoch": 0.6232, "step": 1558, "tokens_per_device": 4891 }, { "epoch": 0.6232, "loss_ce": 0.0033595487475395203, "loss_lvr": 0.5088614821434021, "loss_mode_switch": 0.0, "loss_total": 0.05424569919705391, "step": 1558 }, { "batch_size": 4, "epoch": 0.6232, "step": 1558, "tokens_per_device": 2648 }, { "epoch": 0.6232, "loss_ce": 0.20108242332935333, "loss_lvr": 0.7602888345718384, "loss_mode_switch": 0.0, "loss_total": 0.277111291885376, "step": 1558 }, { "batch_size": 1, "epoch": 0.6232, "step": 1558, "tokens_per_device": 5089 }, { "epoch": 0.6232, "loss_ce": 0.001724849222227931, "loss_lvr": 0.8696051836013794, "loss_mode_switch": 0.0, "loss_total": 0.08868536353111267, "step": 1558 }, { "batch_size": 1, "epoch": 0.6232, "step": 1558, "tokens_per_device": 5121 }, { "epoch": 0.6232, "loss_ce": 0.07283170521259308, "loss_lvr": 0.22844025492668152, "loss_mode_switch": 0.0, "loss_total": 0.09567572921514511, "step": 1558 }, { "epoch": 0.6236, "grad_norm": 1.1782163381576538, "learning_rate": 3.2773934644400825e-06, "loss": 0.306, "step": 1559 }, { "batch_size": 4, "epoch": 0.6236, "step": 1559, "tokens_per_device": 4328 }, { "epoch": 0.6236, "loss_ce": 0.23987245559692383, "loss_lvr": 0.7805802226066589, "loss_mode_switch": 0.0, "loss_total": 0.3179304897785187, "step": 1559 }, { "batch_size": 4, "epoch": 0.6236, "step": 1559, "tokens_per_device": 10664 }, { "epoch": 0.6236, "loss_ce": 0.2083757519721985, "loss_lvr": 0.8762663006782532, "loss_mode_switch": 0.0, "loss_total": 0.2960023880004883, "step": 1559 }, { "batch_size": 4, "epoch": 0.6236, "step": 1559, "tokens_per_device": 4800 }, { "epoch": 0.6236, "loss_ce": 0.010053735226392746, "loss_lvr": 0.7779620289802551, "loss_mode_switch": 0.0, "loss_total": 0.0878499448299408, "step": 1559 }, { "batch_size": 1, "epoch": 0.6236, "step": 1559, "tokens_per_device": 5110 }, { "epoch": 0.6236, "loss_ce": 0.147216796875, "loss_lvr": 0.28435808420181274, "loss_mode_switch": 0.0, "loss_total": 0.1756526082754135, "step": 1559 }, { "batch_size": 4, "epoch": 0.6236, "step": 1559, "tokens_per_device": 1500 }, { "epoch": 0.6236, "loss_ce": 0.5526391267776489, "loss_lvr": 1.0399432182312012, "loss_mode_switch": 0.0, "loss_total": 0.6566334366798401, "step": 1559 }, { "batch_size": 4, "epoch": 0.6236, "step": 1559, "tokens_per_device": 12120 }, { "epoch": 0.6236, "loss_ce": 0.35961711406707764, "loss_lvr": 0.8846513032913208, "loss_mode_switch": 0.0, "loss_total": 0.44808223843574524, "step": 1559 }, { "batch_size": 1, "epoch": 0.6236, "step": 1559, "tokens_per_device": 4953 }, { "epoch": 0.6236, "loss_ce": 0.003885109443217516, "loss_lvr": 0.7910864353179932, "loss_mode_switch": 0.0, "loss_total": 0.0829937607049942, "step": 1559 }, { "batch_size": 1, "epoch": 0.6236, "step": 1559, "tokens_per_device": 4894 }, { "epoch": 0.6236, "loss_ce": 0.02003401704132557, "loss_lvr": 0.6693859100341797, "loss_mode_switch": 0.0, "loss_total": 0.08697260916233063, "step": 1559 }, { "epoch": 0.624, "grad_norm": 1.2313271760940552, "learning_rate": 3.271313963989886e-06, "loss": 0.2921, "step": 1560 }, { "batch_size": 4, "epoch": 0.624, "step": 1560, "tokens_per_device": 4224 }, { "epoch": 0.624, "loss_ce": 0.015004660934209824, "loss_lvr": 0.5731691122055054, "loss_mode_switch": 0.0, "loss_total": 0.0723215714097023, "step": 1560 }, { "batch_size": 1, "epoch": 0.624, "step": 1560, "tokens_per_device": 5218 }, { "epoch": 0.624, "loss_ce": 1.6614803075790405, "loss_lvr": 0.5436350703239441, "loss_mode_switch": 0.0, "loss_total": 1.7158437967300415, "step": 1560 }, { "batch_size": 4, "epoch": 0.624, "step": 1560, "tokens_per_device": 3616 }, { "epoch": 0.624, "loss_ce": 0.23872101306915283, "loss_lvr": 0.6627947688102722, "loss_mode_switch": 0.0, "loss_total": 0.3050004839897156, "step": 1560 }, { "batch_size": 1, "epoch": 0.624, "step": 1560, "tokens_per_device": 4926 }, { "epoch": 0.624, "loss_ce": 0.00031088394462130964, "loss_lvr": 0.3366478681564331, "loss_mode_switch": 0.0, "loss_total": 0.03397567197680473, "step": 1560 }, { "batch_size": 1, "epoch": 0.624, "step": 1560, "tokens_per_device": 6375 }, { "epoch": 0.624, "loss_ce": 0.0002952190989162773, "loss_lvr": 0.34598830342292786, "loss_mode_switch": 0.0, "loss_total": 0.03489404916763306, "step": 1560 }, { "batch_size": 4, "epoch": 0.624, "step": 1560, "tokens_per_device": 2748 }, { "epoch": 0.624, "loss_ce": 0.17322827875614166, "loss_lvr": 0.5008339881896973, "loss_mode_switch": 0.0, "loss_total": 0.2233116775751114, "step": 1560 }, { "batch_size": 4, "epoch": 0.624, "step": 1560, "tokens_per_device": 8376 }, { "epoch": 0.624, "loss_ce": 0.18871097266674042, "loss_lvr": 0.9484952092170715, "loss_mode_switch": 0.0, "loss_total": 0.28356048464775085, "step": 1560 }, { "batch_size": 1, "epoch": 0.624, "step": 1560, "tokens_per_device": 5094 }, { "epoch": 0.624, "loss_ce": 0.004242603667080402, "loss_lvr": 0.40943199396133423, "loss_mode_switch": 0.0, "loss_total": 0.04518580436706543, "step": 1560 }, { "epoch": 0.6244, "grad_norm": 1.460776448249817, "learning_rate": 3.2652373648375836e-06, "loss": 0.316, "step": 1561 }, { "batch_size": 4, "epoch": 0.6244, "step": 1561, "tokens_per_device": 1272 }, { "epoch": 0.6244, "loss_ce": 0.8054819107055664, "loss_lvr": 0.8985227942466736, "loss_mode_switch": 0.0, "loss_total": 0.8953341841697693, "step": 1561 }, { "batch_size": 1, "epoch": 0.6244, "step": 1561, "tokens_per_device": 4929 }, { "epoch": 0.6244, "loss_ce": 0.0031096856109797955, "loss_lvr": 0.37239354848861694, "loss_mode_switch": 0.0, "loss_total": 0.04034904018044472, "step": 1561 }, { "batch_size": 1, "epoch": 0.6244, "step": 1561, "tokens_per_device": 5151 }, { "epoch": 0.6244, "loss_ce": 0.08324181288480759, "loss_lvr": 0.2737630307674408, "loss_mode_switch": 0.0, "loss_total": 0.11061811447143555, "step": 1561 }, { "batch_size": 4, "epoch": 0.6244, "step": 1561, "tokens_per_device": 2660 }, { "epoch": 0.6244, "loss_ce": 0.18949882686138153, "loss_lvr": 0.8590245842933655, "loss_mode_switch": 0.0, "loss_total": 0.2754012942314148, "step": 1561 }, { "batch_size": 4, "epoch": 0.6244, "step": 1561, "tokens_per_device": 5200 }, { "epoch": 0.6244, "loss_ce": 0.11467434465885162, "loss_lvr": 0.7872359156608582, "loss_mode_switch": 0.0, "loss_total": 0.19339793920516968, "step": 1561 }, { "batch_size": 4, "epoch": 0.6244, "step": 1561, "tokens_per_device": 5592 }, { "epoch": 0.6244, "loss_ce": 0.05188377946615219, "loss_lvr": 0.8847013115882874, "loss_mode_switch": 0.0, "loss_total": 0.14035391807556152, "step": 1561 }, { "batch_size": 4, "epoch": 0.6244, "step": 1561, "tokens_per_device": 4732 }, { "epoch": 0.6244, "loss_ce": 0.6326177716255188, "loss_lvr": 0.856702983379364, "loss_mode_switch": 0.0, "loss_total": 0.7182880640029907, "step": 1561 }, { "batch_size": 4, "epoch": 0.6244, "step": 1561, "tokens_per_device": 9112 }, { "epoch": 0.6244, "loss_ce": 0.011887138709425926, "loss_lvr": 0.7772576808929443, "loss_mode_switch": 0.0, "loss_total": 0.08961290866136551, "step": 1561 }, { "epoch": 0.6248, "grad_norm": 1.4173554182052612, "learning_rate": 3.259163677181687e-06, "loss": 0.3261, "step": 1562 }, { "batch_size": 1, "epoch": 0.6248, "step": 1562, "tokens_per_device": 5041 }, { "epoch": 0.6248, "loss_ce": 0.09788360446691513, "loss_lvr": 0.8271226286888123, "loss_mode_switch": 0.0, "loss_total": 0.18059587478637695, "step": 1562 }, { "batch_size": 4, "epoch": 0.6248, "step": 1562, "tokens_per_device": 5016 }, { "epoch": 0.6248, "loss_ce": 0.290176659822464, "loss_lvr": 0.8120886087417603, "loss_mode_switch": 0.0, "loss_total": 0.37138551473617554, "step": 1562 }, { "batch_size": 1, "epoch": 0.6248, "step": 1562, "tokens_per_device": 4892 }, { "epoch": 0.6248, "loss_ce": 0.02908124029636383, "loss_lvr": 0.5283747315406799, "loss_mode_switch": 0.0, "loss_total": 0.08191871643066406, "step": 1562 }, { "batch_size": 1, "epoch": 0.6248, "step": 1562, "tokens_per_device": 5175 }, { "epoch": 0.6248, "loss_ce": 0.00938799511641264, "loss_lvr": 0.329249769449234, "loss_mode_switch": 0.0, "loss_total": 0.042312972247600555, "step": 1562 }, { "batch_size": 4, "epoch": 0.6248, "step": 1562, "tokens_per_device": 3880 }, { "epoch": 0.6248, "loss_ce": 0.11327799409627914, "loss_lvr": 0.9300415515899658, "loss_mode_switch": 0.0, "loss_total": 0.20628215372562408, "step": 1562 }, { "batch_size": 1, "epoch": 0.6248, "step": 1562, "tokens_per_device": 5065 }, { "epoch": 0.6248, "loss_ce": 0.10386538505554199, "loss_lvr": 0.31760966777801514, "loss_mode_switch": 0.0, "loss_total": 0.13562634587287903, "step": 1562 }, { "batch_size": 1, "epoch": 0.6248, "step": 1562, "tokens_per_device": 4852 }, { "epoch": 0.6248, "loss_ce": 0.00019256502855569124, "loss_lvr": 0.40115469694137573, "loss_mode_switch": 0.0, "loss_total": 0.04030803591012955, "step": 1562 }, { "batch_size": 4, "epoch": 0.6248, "step": 1562, "tokens_per_device": 4556 }, { "epoch": 0.6248, "loss_ce": 0.03173534944653511, "loss_lvr": 0.7271662950515747, "loss_mode_switch": 0.0, "loss_total": 0.104451984167099, "step": 1562 }, { "epoch": 0.6252, "grad_norm": 1.3511496782302856, "learning_rate": 3.2530929112158194e-06, "loss": 0.303, "step": 1563 }, { "batch_size": 4, "epoch": 0.6252, "step": 1563, "tokens_per_device": 2668 }, { "epoch": 0.6252, "loss_ce": 0.4295233190059662, "loss_lvr": 0.7821241617202759, "loss_mode_switch": 0.0, "loss_total": 0.5077357292175293, "step": 1563 }, { "batch_size": 1, "epoch": 0.6252, "step": 1563, "tokens_per_device": 4895 }, { "epoch": 0.6252, "loss_ce": 0.3890337347984314, "loss_lvr": 1.443031668663025, "loss_mode_switch": 0.0, "loss_total": 0.533336877822876, "step": 1563 }, { "batch_size": 4, "epoch": 0.6252, "step": 1563, "tokens_per_device": 4636 }, { "epoch": 0.6252, "loss_ce": 0.5862690806388855, "loss_lvr": 0.7848570346832275, "loss_mode_switch": 0.0, "loss_total": 0.6647548079490662, "step": 1563 }, { "batch_size": 1, "epoch": 0.6252, "step": 1563, "tokens_per_device": 4897 }, { "epoch": 0.6252, "loss_ce": 0.007973696105182171, "loss_lvr": 0.4357643127441406, "loss_mode_switch": 0.0, "loss_total": 0.05155012756586075, "step": 1563 }, { "batch_size": 4, "epoch": 0.6252, "step": 1563, "tokens_per_device": 4272 }, { "epoch": 0.6252, "loss_ce": 0.13036422431468964, "loss_lvr": 1.1207388639450073, "loss_mode_switch": 0.0, "loss_total": 0.24243810772895813, "step": 1563 }, { "batch_size": 4, "epoch": 0.6252, "step": 1563, "tokens_per_device": 2280 }, { "epoch": 0.6252, "loss_ce": 0.2131827175617218, "loss_lvr": 0.8049901723861694, "loss_mode_switch": 0.0, "loss_total": 0.2936817407608032, "step": 1563 }, { "batch_size": 4, "epoch": 0.6252, "step": 1563, "tokens_per_device": 5016 }, { "epoch": 0.6252, "loss_ce": 0.4261956512928009, "loss_lvr": 0.6581847071647644, "loss_mode_switch": 0.0, "loss_total": 0.4920141100883484, "step": 1563 }, { "batch_size": 4, "epoch": 0.6252, "step": 1563, "tokens_per_device": 3840 }, { "epoch": 0.6252, "loss_ce": 0.18020740151405334, "loss_lvr": 0.9429110288619995, "loss_mode_switch": 0.0, "loss_total": 0.27449852228164673, "step": 1563 }, { "epoch": 0.6256, "grad_norm": 1.3425991535186768, "learning_rate": 3.247025077128704e-06, "loss": 0.3246, "step": 1564 }, { "batch_size": 4, "epoch": 0.6256, "step": 1564, "tokens_per_device": 1872 }, { "epoch": 0.6256, "loss_ce": 0.47452741861343384, "loss_lvr": 0.8186222314834595, "loss_mode_switch": 0.0, "loss_total": 0.5563896298408508, "step": 1564 }, { "batch_size": 4, "epoch": 0.6256, "step": 1564, "tokens_per_device": 4648 }, { "epoch": 0.6256, "loss_ce": 0.41905084252357483, "loss_lvr": 0.7691567540168762, "loss_mode_switch": 0.0, "loss_total": 0.49596652388572693, "step": 1564 }, { "batch_size": 4, "epoch": 0.6256, "step": 1564, "tokens_per_device": 1672 }, { "epoch": 0.6256, "loss_ce": 0.7426823377609253, "loss_lvr": 0.9996490478515625, "loss_mode_switch": 0.0, "loss_total": 0.8426472544670105, "step": 1564 }, { "batch_size": 4, "epoch": 0.6256, "step": 1564, "tokens_per_device": 1384 }, { "epoch": 0.6256, "loss_ce": 0.1146966814994812, "loss_lvr": 1.218334674835205, "loss_mode_switch": 0.0, "loss_total": 0.2365301549434662, "step": 1564 }, { "batch_size": 4, "epoch": 0.6256, "step": 1564, "tokens_per_device": 1220 }, { "epoch": 0.6256, "loss_ce": 0.03695639222860336, "loss_lvr": 0.9748251438140869, "loss_mode_switch": 0.0, "loss_total": 0.1344389021396637, "step": 1564 }, { "batch_size": 1, "epoch": 0.6256, "step": 1564, "tokens_per_device": 7282 }, { "epoch": 0.6256, "loss_ce": 0.0024494524113833904, "loss_lvr": 0.3786064088344574, "loss_mode_switch": 0.0, "loss_total": 0.0403100959956646, "step": 1564 }, { "batch_size": 4, "epoch": 0.6256, "step": 1564, "tokens_per_device": 2680 }, { "epoch": 0.6256, "loss_ce": 0.463983952999115, "loss_lvr": 0.777656614780426, "loss_mode_switch": 0.0, "loss_total": 0.5417495965957642, "step": 1564 }, { "batch_size": 4, "epoch": 0.6256, "step": 1564, "tokens_per_device": 5752 }, { "epoch": 0.6256, "loss_ce": 0.6282462477684021, "loss_lvr": 0.6012064218521118, "loss_mode_switch": 0.0, "loss_total": 0.6883668899536133, "step": 1564 }, { "epoch": 0.626, "grad_norm": 1.2710177898406982, "learning_rate": 3.240960185104137e-06, "loss": 0.2804, "step": 1565 }, { "batch_size": 4, "epoch": 0.626, "step": 1565, "tokens_per_device": 1556 }, { "epoch": 0.626, "loss_ce": 0.5359421968460083, "loss_lvr": 1.0137897729873657, "loss_mode_switch": 0.0, "loss_total": 0.6373211741447449, "step": 1565 }, { "batch_size": 4, "epoch": 0.626, "step": 1565, "tokens_per_device": 4324 }, { "epoch": 0.626, "loss_ce": 0.7215012907981873, "loss_lvr": 0.937027633190155, "loss_mode_switch": 0.0, "loss_total": 0.8152040243148804, "step": 1565 }, { "batch_size": 1, "epoch": 0.626, "step": 1565, "tokens_per_device": 4909 }, { "epoch": 0.626, "loss_ce": 0.011923768557608128, "loss_lvr": 0.4652557373046875, "loss_mode_switch": 0.0, "loss_total": 0.05844934284687042, "step": 1565 }, { "batch_size": 1, "epoch": 0.626, "step": 1565, "tokens_per_device": 5686 }, { "epoch": 0.626, "loss_ce": 0.0008016461506485939, "loss_lvr": 0.3436426818370819, "loss_mode_switch": 0.0, "loss_total": 0.03516591340303421, "step": 1565 }, { "batch_size": 4, "epoch": 0.626, "step": 1565, "tokens_per_device": 5160 }, { "epoch": 0.626, "loss_ce": 0.0009329508757218719, "loss_lvr": 0.5554311275482178, "loss_mode_switch": 0.0, "loss_total": 0.05647606402635574, "step": 1565 }, { "batch_size": 4, "epoch": 0.626, "step": 1565, "tokens_per_device": 10732 }, { "epoch": 0.626, "loss_ce": 0.03258134424686432, "loss_lvr": 0.6954857110977173, "loss_mode_switch": 0.0, "loss_total": 0.10212991386651993, "step": 1565 }, { "batch_size": 4, "epoch": 0.626, "step": 1565, "tokens_per_device": 1408 }, { "epoch": 0.626, "loss_ce": 0.22645537555217743, "loss_lvr": 0.7130147814750671, "loss_mode_switch": 0.0, "loss_total": 0.2977568507194519, "step": 1565 }, { "batch_size": 1, "epoch": 0.626, "step": 1565, "tokens_per_device": 6434 }, { "epoch": 0.626, "loss_ce": 0.003519251476973295, "loss_lvr": 0.22744137048721313, "loss_mode_switch": 0.0, "loss_total": 0.026263389736413956, "step": 1565 }, { "epoch": 0.6264, "grad_norm": 1.2757772207260132, "learning_rate": 3.234898245320987e-06, "loss": 0.2761, "step": 1566 }, { "batch_size": 4, "epoch": 0.6264, "step": 1566, "tokens_per_device": 4180 }, { "epoch": 0.6264, "loss_ce": 0.592078685760498, "loss_lvr": 0.8578545451164246, "loss_mode_switch": 0.0, "loss_total": 0.677864134311676, "step": 1566 }, { "batch_size": 4, "epoch": 0.6264, "step": 1566, "tokens_per_device": 4224 }, { "epoch": 0.6264, "loss_ce": 0.06523831933736801, "loss_lvr": 0.8696448802947998, "loss_mode_switch": 0.0, "loss_total": 0.1522028148174286, "step": 1566 }, { "batch_size": 1, "epoch": 0.6264, "step": 1566, "tokens_per_device": 5334 }, { "epoch": 0.6264, "loss_ce": 0.039156753569841385, "loss_lvr": 0.49887171387672424, "loss_mode_switch": 0.0, "loss_total": 0.08904393017292023, "step": 1566 }, { "batch_size": 1, "epoch": 0.6264, "step": 1566, "tokens_per_device": 4608 }, { "epoch": 0.6264, "loss_ce": 0.010670969262719154, "loss_lvr": 0.23366935551166534, "loss_mode_switch": 0.0, "loss_total": 0.03403790295124054, "step": 1566 }, { "batch_size": 4, "epoch": 0.6264, "step": 1566, "tokens_per_device": 4260 }, { "epoch": 0.6264, "loss_ce": 0.17239931225776672, "loss_lvr": 0.9766542315483093, "loss_mode_switch": 0.0, "loss_total": 0.27006474137306213, "step": 1566 }, { "batch_size": 1, "epoch": 0.6264, "step": 1566, "tokens_per_device": 4723 }, { "epoch": 0.6264, "loss_ce": 0.01698848232626915, "loss_lvr": 0.5936359763145447, "loss_mode_switch": 0.0, "loss_total": 0.0763520821928978, "step": 1566 }, { "batch_size": 4, "epoch": 0.6264, "step": 1566, "tokens_per_device": 7676 }, { "epoch": 0.6264, "loss_ce": 0.08530261367559433, "loss_lvr": 0.8517037630081177, "loss_mode_switch": 0.0, "loss_total": 0.17047299444675446, "step": 1566 }, { "batch_size": 1, "epoch": 0.6264, "step": 1566, "tokens_per_device": 5110 }, { "epoch": 0.6264, "loss_ce": 0.0040570953860878944, "loss_lvr": 0.36621811985969543, "loss_mode_switch": 0.0, "loss_total": 0.04067890718579292, "step": 1566 }, { "epoch": 0.6268, "grad_norm": 1.2604941129684448, "learning_rate": 3.2288392679531612e-06, "loss": 0.2844, "step": 1567 }, { "batch_size": 1, "epoch": 0.6268, "step": 1567, "tokens_per_device": 5386 }, { "epoch": 0.6268, "loss_ce": 0.00022813394025433809, "loss_lvr": 0.3524516224861145, "loss_mode_switch": 0.0, "loss_total": 0.0354732945561409, "step": 1567 }, { "batch_size": 4, "epoch": 0.6268, "step": 1567, "tokens_per_device": 5756 }, { "epoch": 0.6268, "loss_ce": 0.07201167196035385, "loss_lvr": 0.876114547252655, "loss_mode_switch": 0.0, "loss_total": 0.1596231311559677, "step": 1567 }, { "batch_size": 4, "epoch": 0.6268, "step": 1567, "tokens_per_device": 4484 }, { "epoch": 0.6268, "loss_ce": 0.15375502407550812, "loss_lvr": 0.8015828728675842, "loss_mode_switch": 0.0, "loss_total": 0.23391330242156982, "step": 1567 }, { "batch_size": 4, "epoch": 0.6268, "step": 1567, "tokens_per_device": 4208 }, { "epoch": 0.6268, "loss_ce": 0.05323184281587601, "loss_lvr": 0.7803186774253845, "loss_mode_switch": 0.0, "loss_total": 0.13126370310783386, "step": 1567 }, { "batch_size": 1, "epoch": 0.6268, "step": 1567, "tokens_per_device": 4870 }, { "epoch": 0.6268, "loss_ce": 0.09810798615217209, "loss_lvr": 0.38775336742401123, "loss_mode_switch": 0.0, "loss_total": 0.13688331842422485, "step": 1567 }, { "batch_size": 4, "epoch": 0.6268, "step": 1567, "tokens_per_device": 2580 }, { "epoch": 0.6268, "loss_ce": 0.050980471074581146, "loss_lvr": 1.1172130107879639, "loss_mode_switch": 0.0, "loss_total": 0.1627017706632614, "step": 1567 }, { "batch_size": 4, "epoch": 0.6268, "step": 1567, "tokens_per_device": 2596 }, { "epoch": 0.6268, "loss_ce": 0.045785482972860336, "loss_lvr": 0.8978502750396729, "loss_mode_switch": 0.0, "loss_total": 0.13557051122188568, "step": 1567 }, { "batch_size": 4, "epoch": 0.6268, "step": 1567, "tokens_per_device": 4196 }, { "epoch": 0.6268, "loss_ce": 0.2041935920715332, "loss_lvr": 0.7632663249969482, "loss_mode_switch": 0.0, "loss_total": 0.2805202305316925, "step": 1567 }, { "epoch": 0.6272, "grad_norm": 1.3775560855865479, "learning_rate": 3.2227832631695936e-06, "loss": 0.2829, "step": 1568 }, { "batch_size": 4, "epoch": 0.6272, "step": 1568, "tokens_per_device": 6740 }, { "epoch": 0.6272, "loss_ce": 0.20244383811950684, "loss_lvr": 0.6808801293373108, "loss_mode_switch": 0.0, "loss_total": 0.27053186297416687, "step": 1568 }, { "batch_size": 4, "epoch": 0.6272, "step": 1568, "tokens_per_device": 6644 }, { "epoch": 0.6272, "loss_ce": 0.3914784789085388, "loss_lvr": 0.5649220943450928, "loss_mode_switch": 0.0, "loss_total": 0.4479706883430481, "step": 1568 }, { "batch_size": 4, "epoch": 0.6272, "step": 1568, "tokens_per_device": 4372 }, { "epoch": 0.6272, "loss_ce": 0.4347170293331146, "loss_lvr": 0.8642142415046692, "loss_mode_switch": 0.0, "loss_total": 0.5211384296417236, "step": 1568 }, { "batch_size": 4, "epoch": 0.6272, "step": 1568, "tokens_per_device": 5116 }, { "epoch": 0.6272, "loss_ce": 0.25338342785835266, "loss_lvr": 0.6566600203514099, "loss_mode_switch": 0.0, "loss_total": 0.3190494179725647, "step": 1568 }, { "batch_size": 4, "epoch": 0.6272, "step": 1568, "tokens_per_device": 3512 }, { "epoch": 0.6272, "loss_ce": 0.2547685205936432, "loss_lvr": 0.8685834407806396, "loss_mode_switch": 0.0, "loss_total": 0.3416268825531006, "step": 1568 }, { "batch_size": 4, "epoch": 0.6272, "step": 1568, "tokens_per_device": 3964 }, { "epoch": 0.6272, "loss_ce": 0.4735119938850403, "loss_lvr": 0.8181278705596924, "loss_mode_switch": 0.0, "loss_total": 0.5553247928619385, "step": 1568 }, { "batch_size": 4, "epoch": 0.6272, "step": 1568, "tokens_per_device": 2696 }, { "epoch": 0.6272, "loss_ce": 0.3504084646701813, "loss_lvr": 0.7414732575416565, "loss_mode_switch": 0.0, "loss_total": 0.42455577850341797, "step": 1568 }, { "batch_size": 4, "epoch": 0.6272, "step": 1568, "tokens_per_device": 2584 }, { "epoch": 0.6272, "loss_ce": 0.16990704834461212, "loss_lvr": 0.9970868825912476, "loss_mode_switch": 0.0, "loss_total": 0.2696157395839691, "step": 1568 }, { "epoch": 0.6276, "grad_norm": 1.8206082582473755, "learning_rate": 3.2167302411342322e-06, "loss": 0.2699, "step": 1569 }, { "batch_size": 4, "epoch": 0.6276, "step": 1569, "tokens_per_device": 4124 }, { "epoch": 0.6276, "loss_ce": 0.5874807834625244, "loss_lvr": 0.7173507213592529, "loss_mode_switch": 0.0, "loss_total": 0.6592158675193787, "step": 1569 }, { "batch_size": 1, "epoch": 0.6276, "step": 1569, "tokens_per_device": 5134 }, { "epoch": 0.6276, "loss_ce": 0.3578585088253021, "loss_lvr": 0.15356916189193726, "loss_mode_switch": 0.0, "loss_total": 0.3732154369354248, "step": 1569 }, { "batch_size": 4, "epoch": 0.6276, "step": 1569, "tokens_per_device": 6048 }, { "epoch": 0.6276, "loss_ce": 0.032249320298433304, "loss_lvr": 0.7067897319793701, "loss_mode_switch": 0.0, "loss_total": 0.1029282957315445, "step": 1569 }, { "batch_size": 4, "epoch": 0.6276, "step": 1569, "tokens_per_device": 1576 }, { "epoch": 0.6276, "loss_ce": 0.6137128472328186, "loss_lvr": 0.8957769870758057, "loss_mode_switch": 0.0, "loss_total": 0.7032905220985413, "step": 1569 }, { "batch_size": 1, "epoch": 0.6276, "step": 1569, "tokens_per_device": 5107 }, { "epoch": 0.6276, "loss_ce": 0.18875345587730408, "loss_lvr": 0.7379044890403748, "loss_mode_switch": 0.0, "loss_total": 0.2625439167022705, "step": 1569 }, { "batch_size": 1, "epoch": 0.6276, "step": 1569, "tokens_per_device": 5273 }, { "epoch": 0.6276, "loss_ce": 0.045068491250276566, "loss_lvr": 0.7428006529808044, "loss_mode_switch": 0.0, "loss_total": 0.11934855580329895, "step": 1569 }, { "batch_size": 4, "epoch": 0.6276, "step": 1569, "tokens_per_device": 5804 }, { "epoch": 0.6276, "loss_ce": 0.250262051820755, "loss_lvr": 1.1220108270645142, "loss_mode_switch": 0.0, "loss_total": 0.3624631464481354, "step": 1569 }, { "batch_size": 4, "epoch": 0.6276, "step": 1569, "tokens_per_device": 4092 }, { "epoch": 0.6276, "loss_ce": 0.5202879905700684, "loss_lvr": 0.855103611946106, "loss_mode_switch": 0.0, "loss_total": 0.6057983636856079, "step": 1569 }, { "epoch": 0.628, "grad_norm": 1.306748390197754, "learning_rate": 3.2106802120060197e-06, "loss": 0.2513, "step": 1570 }, { "batch_size": 4, "epoch": 0.628, "step": 1570, "tokens_per_device": 3820 }, { "epoch": 0.628, "loss_ce": 0.26843854784965515, "loss_lvr": 0.7421347498893738, "loss_mode_switch": 0.0, "loss_total": 0.34265202283859253, "step": 1570 }, { "batch_size": 4, "epoch": 0.628, "step": 1570, "tokens_per_device": 1148 }, { "epoch": 0.628, "loss_ce": 0.5161985754966736, "loss_lvr": 1.1408416032791138, "loss_mode_switch": 0.0, "loss_total": 0.6302827596664429, "step": 1570 }, { "batch_size": 1, "epoch": 0.628, "step": 1570, "tokens_per_device": 4884 }, { "epoch": 0.628, "loss_ce": 0.15252576768398285, "loss_lvr": 0.6593363285064697, "loss_mode_switch": 0.0, "loss_total": 0.21845939755439758, "step": 1570 }, { "batch_size": 4, "epoch": 0.628, "step": 1570, "tokens_per_device": 2324 }, { "epoch": 0.628, "loss_ce": 0.29166728258132935, "loss_lvr": 0.658719003200531, "loss_mode_switch": 0.0, "loss_total": 0.35753917694091797, "step": 1570 }, { "batch_size": 4, "epoch": 0.628, "step": 1570, "tokens_per_device": 5152 }, { "epoch": 0.628, "loss_ce": 0.08340192586183548, "loss_lvr": 0.8079795241355896, "loss_mode_switch": 0.0, "loss_total": 0.16419988870620728, "step": 1570 }, { "batch_size": 1, "epoch": 0.628, "step": 1570, "tokens_per_device": 4630 }, { "epoch": 0.628, "loss_ce": 0.7580156922340393, "loss_lvr": 0.22934558987617493, "loss_mode_switch": 0.0, "loss_total": 0.7809502482414246, "step": 1570 }, { "batch_size": 4, "epoch": 0.628, "step": 1570, "tokens_per_device": 4316 }, { "epoch": 0.628, "loss_ce": 0.45446687936782837, "loss_lvr": 0.8729203939437866, "loss_mode_switch": 0.0, "loss_total": 0.5417588949203491, "step": 1570 }, { "batch_size": 4, "epoch": 0.628, "step": 1570, "tokens_per_device": 4312 }, { "epoch": 0.628, "loss_ce": 0.01650671660900116, "loss_lvr": 0.7028985619544983, "loss_mode_switch": 0.0, "loss_total": 0.08679657429456711, "step": 1570 }, { "epoch": 0.6284, "grad_norm": 1.4109913110733032, "learning_rate": 3.2046331859388757e-06, "loss": 0.3095, "step": 1571 }, { "batch_size": 1, "epoch": 0.6284, "step": 1571, "tokens_per_device": 4890 }, { "epoch": 0.6284, "loss_ce": 0.02880224399268627, "loss_lvr": 0.22335465252399445, "loss_mode_switch": 0.0, "loss_total": 0.05113770812749863, "step": 1571 }, { "batch_size": 1, "epoch": 0.6284, "step": 1571, "tokens_per_device": 4906 }, { "epoch": 0.6284, "loss_ce": 0.03322579711675644, "loss_lvr": 0.5206292867660522, "loss_mode_switch": 0.0, "loss_total": 0.08528872579336166, "step": 1571 }, { "batch_size": 4, "epoch": 0.6284, "step": 1571, "tokens_per_device": 3896 }, { "epoch": 0.6284, "loss_ce": 0.20895454287528992, "loss_lvr": 0.8826424479484558, "loss_mode_switch": 0.0, "loss_total": 0.29721879959106445, "step": 1571 }, { "batch_size": 4, "epoch": 0.6284, "step": 1571, "tokens_per_device": 10568 }, { "epoch": 0.6284, "loss_ce": 0.12108349055051804, "loss_lvr": 0.5170734524726868, "loss_mode_switch": 0.0, "loss_total": 0.17279084026813507, "step": 1571 }, { "batch_size": 4, "epoch": 0.6284, "step": 1571, "tokens_per_device": 1340 }, { "epoch": 0.6284, "loss_ce": 0.33814746141433716, "loss_lvr": 1.3410370349884033, "loss_mode_switch": 0.0, "loss_total": 0.47225117683410645, "step": 1571 }, { "batch_size": 4, "epoch": 0.6284, "step": 1571, "tokens_per_device": 4208 }, { "epoch": 0.6284, "loss_ce": 0.08991933614015579, "loss_lvr": 0.7961427569389343, "loss_mode_switch": 0.0, "loss_total": 0.1695336103439331, "step": 1571 }, { "batch_size": 4, "epoch": 0.6284, "step": 1571, "tokens_per_device": 1412 }, { "epoch": 0.6284, "loss_ce": 0.10844064503908157, "loss_lvr": 0.915217399597168, "loss_mode_switch": 0.0, "loss_total": 0.19996237754821777, "step": 1571 }, { "batch_size": 4, "epoch": 0.6284, "step": 1571, "tokens_per_device": 4544 }, { "epoch": 0.6284, "loss_ce": 0.07428494840860367, "loss_lvr": 0.7809752225875854, "loss_mode_switch": 0.0, "loss_total": 0.15238246321678162, "step": 1571 }, { "epoch": 0.6288, "grad_norm": 1.2753231525421143, "learning_rate": 3.198589173081674e-06, "loss": 0.263, "step": 1572 }, { "batch_size": 4, "epoch": 0.6288, "step": 1572, "tokens_per_device": 2776 }, { "epoch": 0.6288, "loss_ce": 0.17291322350502014, "loss_lvr": 0.7720960974693298, "loss_mode_switch": 0.0, "loss_total": 0.2501228451728821, "step": 1572 }, { "batch_size": 4, "epoch": 0.6288, "step": 1572, "tokens_per_device": 5112 }, { "epoch": 0.6288, "loss_ce": 0.4435245990753174, "loss_lvr": 0.7175623774528503, "loss_mode_switch": 0.0, "loss_total": 0.5152808427810669, "step": 1572 }, { "batch_size": 1, "epoch": 0.6288, "step": 1572, "tokens_per_device": 5152 }, { "epoch": 0.6288, "loss_ce": 0.004081338178366423, "loss_lvr": 0.4638379216194153, "loss_mode_switch": 0.0, "loss_total": 0.05046513304114342, "step": 1572 }, { "batch_size": 1, "epoch": 0.6288, "step": 1572, "tokens_per_device": 5088 }, { "epoch": 0.6288, "loss_ce": 0.002082430524751544, "loss_lvr": 0.31195953488349915, "loss_mode_switch": 0.0, "loss_total": 0.03327838331460953, "step": 1572 }, { "batch_size": 1, "epoch": 0.6288, "step": 1572, "tokens_per_device": 5497 }, { "epoch": 0.6288, "loss_ce": 0.08704522252082825, "loss_lvr": 0.28754764795303345, "loss_mode_switch": 0.0, "loss_total": 0.11579998582601547, "step": 1572 }, { "batch_size": 4, "epoch": 0.6288, "step": 1572, "tokens_per_device": 4236 }, { "epoch": 0.6288, "loss_ce": 0.16549643874168396, "loss_lvr": 0.600576639175415, "loss_mode_switch": 0.0, "loss_total": 0.22555410861968994, "step": 1572 }, { "batch_size": 4, "epoch": 0.6288, "step": 1572, "tokens_per_device": 2544 }, { "epoch": 0.6288, "loss_ce": 0.8441736102104187, "loss_lvr": 0.7357004880905151, "loss_mode_switch": 0.0, "loss_total": 0.9177436828613281, "step": 1572 }, { "batch_size": 4, "epoch": 0.6288, "step": 1572, "tokens_per_device": 14492 }, { "epoch": 0.6288, "loss_ce": 0.18076226115226746, "loss_lvr": 0.8367428183555603, "loss_mode_switch": 0.0, "loss_total": 0.2644365429878235, "step": 1572 }, { "epoch": 0.6292, "grad_norm": 1.4590245485305786, "learning_rate": 3.19254818357824e-06, "loss": 0.2895, "step": 1573 }, { "batch_size": 1, "epoch": 0.6292, "step": 1573, "tokens_per_device": 5132 }, { "epoch": 0.6292, "loss_ce": 0.00816580280661583, "loss_lvr": 0.2702553868293762, "loss_mode_switch": 0.0, "loss_total": 0.03519134223461151, "step": 1573 }, { "batch_size": 1, "epoch": 0.6292, "step": 1573, "tokens_per_device": 4937 }, { "epoch": 0.6292, "loss_ce": 0.282755583524704, "loss_lvr": 0.2981472611427307, "loss_mode_switch": 0.0, "loss_total": 0.3125703036785126, "step": 1573 }, { "batch_size": 1, "epoch": 0.6292, "step": 1573, "tokens_per_device": 4925 }, { "epoch": 0.6292, "loss_ce": 0.0051452117040753365, "loss_lvr": 0.30728307366371155, "loss_mode_switch": 0.0, "loss_total": 0.035873521119356155, "step": 1573 }, { "batch_size": 4, "epoch": 0.6292, "step": 1573, "tokens_per_device": 5736 }, { "epoch": 0.6292, "loss_ce": 0.2877516746520996, "loss_lvr": 0.6867773532867432, "loss_mode_switch": 0.0, "loss_total": 0.35642939805984497, "step": 1573 }, { "batch_size": 4, "epoch": 0.6292, "step": 1573, "tokens_per_device": 3808 }, { "epoch": 0.6292, "loss_ce": 0.4698006212711334, "loss_lvr": 1.1502854824066162, "loss_mode_switch": 0.0, "loss_total": 0.5848291516304016, "step": 1573 }, { "batch_size": 1, "epoch": 0.6292, "step": 1573, "tokens_per_device": 4892 }, { "epoch": 0.6292, "loss_ce": 0.0016132863238453865, "loss_lvr": 0.20237095654010773, "loss_mode_switch": 0.0, "loss_total": 0.021850381046533585, "step": 1573 }, { "batch_size": 1, "epoch": 0.6292, "step": 1573, "tokens_per_device": 4883 }, { "epoch": 0.6292, "loss_ce": 0.0012704783584922552, "loss_lvr": 1.0380232334136963, "loss_mode_switch": 0.0, "loss_total": 0.10507280379533768, "step": 1573 }, { "batch_size": 4, "epoch": 0.6292, "step": 1573, "tokens_per_device": 3876 }, { "epoch": 0.6292, "loss_ce": 0.07337598502635956, "loss_lvr": 0.8533865213394165, "loss_mode_switch": 0.0, "loss_total": 0.1587146371603012, "step": 1573 }, { "epoch": 0.6296, "grad_norm": 1.2936716079711914, "learning_rate": 3.1865102275673167e-06, "loss": 0.2883, "step": 1574 }, { "batch_size": 4, "epoch": 0.6296, "step": 1574, "tokens_per_device": 4264 }, { "epoch": 0.6296, "loss_ce": 0.001882888376712799, "loss_lvr": 0.680966317653656, "loss_mode_switch": 0.0, "loss_total": 0.06997951865196228, "step": 1574 }, { "batch_size": 4, "epoch": 0.6296, "step": 1574, "tokens_per_device": 4232 }, { "epoch": 0.6296, "loss_ce": 0.09998583048582077, "loss_lvr": 0.6778593063354492, "loss_mode_switch": 0.0, "loss_total": 0.16777175664901733, "step": 1574 }, { "batch_size": 4, "epoch": 0.6296, "step": 1574, "tokens_per_device": 3776 }, { "epoch": 0.6296, "loss_ce": 0.10247728228569031, "loss_lvr": 0.9172208905220032, "loss_mode_switch": 0.0, "loss_total": 0.19419938325881958, "step": 1574 }, { "batch_size": 4, "epoch": 0.6296, "step": 1574, "tokens_per_device": 8356 }, { "epoch": 0.6296, "loss_ce": 0.10544656962156296, "loss_lvr": 0.8475185036659241, "loss_mode_switch": 0.0, "loss_total": 0.19019842147827148, "step": 1574 }, { "batch_size": 4, "epoch": 0.6296, "step": 1574, "tokens_per_device": 4236 }, { "epoch": 0.6296, "loss_ce": 0.39436376094818115, "loss_lvr": 0.923143744468689, "loss_mode_switch": 0.0, "loss_total": 0.4866781234741211, "step": 1574 }, { "batch_size": 4, "epoch": 0.6296, "step": 1574, "tokens_per_device": 5492 }, { "epoch": 0.6296, "loss_ce": 0.03064323589205742, "loss_lvr": 0.7299626469612122, "loss_mode_switch": 0.0, "loss_total": 0.10363949835300446, "step": 1574 }, { "batch_size": 1, "epoch": 0.6296, "step": 1574, "tokens_per_device": 4860 }, { "epoch": 0.6296, "loss_ce": 0.0005325431120581925, "loss_lvr": 0.2910124957561493, "loss_mode_switch": 0.0, "loss_total": 0.029633793979883194, "step": 1574 }, { "batch_size": 1, "epoch": 0.6296, "step": 1574, "tokens_per_device": 5147 }, { "epoch": 0.6296, "loss_ce": 0.11941184848546982, "loss_lvr": 0.3969380259513855, "loss_mode_switch": 0.0, "loss_total": 0.15910565853118896, "step": 1574 }, { "epoch": 0.63, "grad_norm": 1.2186659574508667, "learning_rate": 3.180475315182563e-06, "loss": 0.2643, "step": 1575 }, { "batch_size": 4, "epoch": 0.63, "step": 1575, "tokens_per_device": 4676 }, { "epoch": 0.63, "loss_ce": 0.16540300846099854, "loss_lvr": 0.7877193093299866, "loss_mode_switch": 0.0, "loss_total": 0.24417494237422943, "step": 1575 }, { "batch_size": 4, "epoch": 0.63, "step": 1575, "tokens_per_device": 4292 }, { "epoch": 0.63, "loss_ce": 0.20601806044578552, "loss_lvr": 1.0739655494689941, "loss_mode_switch": 0.0, "loss_total": 0.31341463327407837, "step": 1575 }, { "batch_size": 4, "epoch": 0.63, "step": 1575, "tokens_per_device": 4432 }, { "epoch": 0.63, "loss_ce": 0.30725476145744324, "loss_lvr": 0.8103553652763367, "loss_mode_switch": 0.0, "loss_total": 0.38829028606414795, "step": 1575 }, { "batch_size": 1, "epoch": 0.63, "step": 1575, "tokens_per_device": 5176 }, { "epoch": 0.63, "loss_ce": 0.002207039622589946, "loss_lvr": 0.40431395173072815, "loss_mode_switch": 0.0, "loss_total": 0.04263843595981598, "step": 1575 }, { "batch_size": 4, "epoch": 0.63, "step": 1575, "tokens_per_device": 5260 }, { "epoch": 0.63, "loss_ce": 0.12333273142576218, "loss_lvr": 0.6498422622680664, "loss_mode_switch": 0.0, "loss_total": 0.1883169561624527, "step": 1575 }, { "batch_size": 4, "epoch": 0.63, "step": 1575, "tokens_per_device": 10692 }, { "epoch": 0.63, "loss_ce": 0.4547346234321594, "loss_lvr": 0.5683689117431641, "loss_mode_switch": 0.0, "loss_total": 0.5115715265274048, "step": 1575 }, { "batch_size": 4, "epoch": 0.63, "step": 1575, "tokens_per_device": 2828 }, { "epoch": 0.63, "loss_ce": 0.3154700994491577, "loss_lvr": 0.8413531184196472, "loss_mode_switch": 0.0, "loss_total": 0.3996054232120514, "step": 1575 }, { "batch_size": 1, "epoch": 0.63, "step": 1575, "tokens_per_device": 4905 }, { "epoch": 0.63, "loss_ce": 0.011698484420776367, "loss_lvr": 0.47609826922416687, "loss_mode_switch": 0.0, "loss_total": 0.059308312833309174, "step": 1575 }, { "epoch": 0.6304, "grad_norm": 1.2268491983413696, "learning_rate": 3.1744434565525252e-06, "loss": 0.3003, "step": 1576 }, { "batch_size": 4, "epoch": 0.6304, "step": 1576, "tokens_per_device": 3744 }, { "epoch": 0.6304, "loss_ce": 0.0018146493239328265, "loss_lvr": 1.5674933195114136, "loss_mode_switch": 0.0, "loss_total": 0.1585639864206314, "step": 1576 }, { "batch_size": 4, "epoch": 0.6304, "step": 1576, "tokens_per_device": 1796 }, { "epoch": 0.6304, "loss_ce": 0.3344614803791046, "loss_lvr": 1.0248740911483765, "loss_mode_switch": 0.0, "loss_total": 0.43694889545440674, "step": 1576 }, { "batch_size": 4, "epoch": 0.6304, "step": 1576, "tokens_per_device": 4352 }, { "epoch": 0.6304, "loss_ce": 0.22669871151447296, "loss_lvr": 0.8515079021453857, "loss_mode_switch": 0.0, "loss_total": 0.3118495047092438, "step": 1576 }, { "batch_size": 1, "epoch": 0.6304, "step": 1576, "tokens_per_device": 4869 }, { "epoch": 0.6304, "loss_ce": 0.0017226178897544742, "loss_lvr": 0.42926302552223206, "loss_mode_switch": 0.0, "loss_total": 0.044648922979831696, "step": 1576 }, { "batch_size": 1, "epoch": 0.6304, "step": 1576, "tokens_per_device": 5081 }, { "epoch": 0.6304, "loss_ce": 0.025812286883592606, "loss_lvr": 0.20521332323551178, "loss_mode_switch": 0.0, "loss_total": 0.046333618462085724, "step": 1576 }, { "batch_size": 1, "epoch": 0.6304, "step": 1576, "tokens_per_device": 4906 }, { "epoch": 0.6304, "loss_ce": 0.07835283875465393, "loss_lvr": 0.29727819561958313, "loss_mode_switch": 0.0, "loss_total": 0.10808065533638, "step": 1576 }, { "batch_size": 4, "epoch": 0.6304, "step": 1576, "tokens_per_device": 1476 }, { "epoch": 0.6304, "loss_ce": 0.4270114004611969, "loss_lvr": 0.8312323093414307, "loss_mode_switch": 0.0, "loss_total": 0.5101346373558044, "step": 1576 }, { "batch_size": 4, "epoch": 0.6304, "step": 1576, "tokens_per_device": 3828 }, { "epoch": 0.6304, "loss_ce": 0.21559137105941772, "loss_lvr": 0.9976814985275269, "loss_mode_switch": 0.0, "loss_total": 0.31535953283309937, "step": 1576 }, { "epoch": 0.6308, "grad_norm": 1.6221801042556763, "learning_rate": 3.168414661800625e-06, "loss": 0.3013, "step": 1577 }, { "batch_size": 4, "epoch": 0.6308, "step": 1577, "tokens_per_device": 3852 }, { "epoch": 0.6308, "loss_ce": 0.2147296518087387, "loss_lvr": 1.1090811491012573, "loss_mode_switch": 0.0, "loss_total": 0.3256377577781677, "step": 1577 }, { "batch_size": 4, "epoch": 0.6308, "step": 1577, "tokens_per_device": 4928 }, { "epoch": 0.6308, "loss_ce": 0.04027148336172104, "loss_lvr": 0.7542587518692017, "loss_mode_switch": 0.0, "loss_total": 0.11569736152887344, "step": 1577 }, { "batch_size": 4, "epoch": 0.6308, "step": 1577, "tokens_per_device": 5352 }, { "epoch": 0.6308, "loss_ce": 0.0612407885491848, "loss_lvr": 0.9501601457595825, "loss_mode_switch": 0.0, "loss_total": 0.1562568098306656, "step": 1577 }, { "batch_size": 4, "epoch": 0.6308, "step": 1577, "tokens_per_device": 5476 }, { "epoch": 0.6308, "loss_ce": 0.5598099231719971, "loss_lvr": 0.8616918325424194, "loss_mode_switch": 0.0, "loss_total": 0.645979106426239, "step": 1577 }, { "batch_size": 4, "epoch": 0.6308, "step": 1577, "tokens_per_device": 6336 }, { "epoch": 0.6308, "loss_ce": 0.14812125265598297, "loss_lvr": 0.4762253165245056, "loss_mode_switch": 0.0, "loss_total": 0.19574378430843353, "step": 1577 }, { "batch_size": 4, "epoch": 0.6308, "step": 1577, "tokens_per_device": 3944 }, { "epoch": 0.6308, "loss_ce": 0.4011441469192505, "loss_lvr": 1.193277359008789, "loss_mode_switch": 0.0, "loss_total": 0.5204718708992004, "step": 1577 }, { "batch_size": 1, "epoch": 0.6308, "step": 1577, "tokens_per_device": 5209 }, { "epoch": 0.6308, "loss_ce": 0.10501790046691895, "loss_lvr": 0.22607922554016113, "loss_mode_switch": 0.0, "loss_total": 0.12762582302093506, "step": 1577 }, { "batch_size": 4, "epoch": 0.6308, "step": 1577, "tokens_per_device": 4396 }, { "epoch": 0.6308, "loss_ce": 0.20726129412651062, "loss_lvr": 0.9382421374320984, "loss_mode_switch": 0.0, "loss_total": 0.301085501909256, "step": 1577 }, { "epoch": 0.6312, "grad_norm": 1.2509539127349854, "learning_rate": 3.1623889410451435e-06, "loss": 0.2764, "step": 1578 }, { "batch_size": 4, "epoch": 0.6312, "step": 1578, "tokens_per_device": 4636 }, { "epoch": 0.6312, "loss_ce": 0.7880945205688477, "loss_lvr": 0.8012579679489136, "loss_mode_switch": 0.0, "loss_total": 0.868220329284668, "step": 1578 }, { "batch_size": 1, "epoch": 0.6312, "step": 1578, "tokens_per_device": 5032 }, { "epoch": 0.6312, "loss_ce": 0.1568370759487152, "loss_lvr": 0.3378252387046814, "loss_mode_switch": 0.0, "loss_total": 0.1906196027994156, "step": 1578 }, { "batch_size": 4, "epoch": 0.6312, "step": 1578, "tokens_per_device": 5776 }, { "epoch": 0.6312, "loss_ce": 0.13468627631664276, "loss_lvr": 0.861961305141449, "loss_mode_switch": 0.0, "loss_total": 0.22088241577148438, "step": 1578 }, { "batch_size": 4, "epoch": 0.6312, "step": 1578, "tokens_per_device": 3740 }, { "epoch": 0.6312, "loss_ce": 0.19780874252319336, "loss_lvr": 1.0849484205245972, "loss_mode_switch": 0.0, "loss_total": 0.30630359053611755, "step": 1578 }, { "batch_size": 4, "epoch": 0.6312, "step": 1578, "tokens_per_device": 4664 }, { "epoch": 0.6312, "loss_ce": 0.18145982921123505, "loss_lvr": 0.7666882872581482, "loss_mode_switch": 0.0, "loss_total": 0.25812864303588867, "step": 1578 }, { "batch_size": 4, "epoch": 0.6312, "step": 1578, "tokens_per_device": 1600 }, { "epoch": 0.6312, "loss_ce": 0.365874707698822, "loss_lvr": 1.062741756439209, "loss_mode_switch": 0.0, "loss_total": 0.4721488952636719, "step": 1578 }, { "batch_size": 4, "epoch": 0.6312, "step": 1578, "tokens_per_device": 4552 }, { "epoch": 0.6312, "loss_ce": 0.04249977320432663, "loss_lvr": 0.8247617483139038, "loss_mode_switch": 0.0, "loss_total": 0.12497594952583313, "step": 1578 }, { "batch_size": 4, "epoch": 0.6312, "step": 1578, "tokens_per_device": 4652 }, { "epoch": 0.6312, "loss_ce": 0.14078488945960999, "loss_lvr": 0.9196827411651611, "loss_mode_switch": 0.0, "loss_total": 0.23275315761566162, "step": 1578 }, { "epoch": 0.6316, "grad_norm": 1.7326688766479492, "learning_rate": 3.1563663043991987e-06, "loss": 0.3373, "step": 1579 }, { "batch_size": 4, "epoch": 0.6316, "step": 1579, "tokens_per_device": 2664 }, { "epoch": 0.6316, "loss_ce": 0.021089715883135796, "loss_lvr": 0.5826974511146545, "loss_mode_switch": 0.0, "loss_total": 0.07935946434736252, "step": 1579 }, { "batch_size": 4, "epoch": 0.6316, "step": 1579, "tokens_per_device": 1344 }, { "epoch": 0.6316, "loss_ce": 0.2283160388469696, "loss_lvr": 0.8639986515045166, "loss_mode_switch": 0.0, "loss_total": 0.3147159218788147, "step": 1579 }, { "batch_size": 1, "epoch": 0.6316, "step": 1579, "tokens_per_device": 5042 }, { "epoch": 0.6316, "loss_ce": 0.0040186080150306225, "loss_lvr": 0.2709274888038635, "loss_mode_switch": 0.0, "loss_total": 0.031111357733607292, "step": 1579 }, { "batch_size": 1, "epoch": 0.6316, "step": 1579, "tokens_per_device": 4899 }, { "epoch": 0.6316, "loss_ce": 0.002283575711771846, "loss_lvr": 0.3619009554386139, "loss_mode_switch": 0.0, "loss_total": 0.03847367316484451, "step": 1579 }, { "batch_size": 1, "epoch": 0.6316, "step": 1579, "tokens_per_device": 5192 }, { "epoch": 0.6316, "loss_ce": 0.015024380758404732, "loss_lvr": 0.22364521026611328, "loss_mode_switch": 0.0, "loss_total": 0.03738890215754509, "step": 1579 }, { "batch_size": 4, "epoch": 0.6316, "step": 1579, "tokens_per_device": 2996 }, { "epoch": 0.6316, "loss_ce": 0.3341885209083557, "loss_lvr": 0.9657877087593079, "loss_mode_switch": 0.0, "loss_total": 0.430767297744751, "step": 1579 }, { "batch_size": 4, "epoch": 0.6316, "step": 1579, "tokens_per_device": 6448 }, { "epoch": 0.6316, "loss_ce": 0.023068873211741447, "loss_lvr": 0.5926014184951782, "loss_mode_switch": 0.0, "loss_total": 0.08232901245355606, "step": 1579 }, { "batch_size": 4, "epoch": 0.6316, "step": 1579, "tokens_per_device": 4220 }, { "epoch": 0.6316, "loss_ce": 0.3785105049610138, "loss_lvr": 1.2358816862106323, "loss_mode_switch": 0.0, "loss_total": 0.5020986795425415, "step": 1579 }, { "epoch": 0.632, "grad_norm": 1.3996436595916748, "learning_rate": 3.1503467619707407e-06, "loss": 0.2966, "step": 1580 }, { "batch_size": 4, "epoch": 0.632, "step": 1580, "tokens_per_device": 2672 }, { "epoch": 0.632, "loss_ce": 0.22524215281009674, "loss_lvr": 1.0324182510375977, "loss_mode_switch": 0.0, "loss_total": 0.3284839689731598, "step": 1580 }, { "batch_size": 1, "epoch": 0.632, "step": 1580, "tokens_per_device": 4264 }, { "epoch": 0.632, "loss_ce": 0.045595716685056686, "loss_lvr": 0.5788795948028564, "loss_mode_switch": 0.0, "loss_total": 0.10348367691040039, "step": 1580 }, { "batch_size": 4, "epoch": 0.632, "step": 1580, "tokens_per_device": 3852 }, { "epoch": 0.632, "loss_ce": 0.1477433741092682, "loss_lvr": 0.8942806124687195, "loss_mode_switch": 0.0, "loss_total": 0.23717144131660461, "step": 1580 }, { "batch_size": 4, "epoch": 0.632, "step": 1580, "tokens_per_device": 5968 }, { "epoch": 0.632, "loss_ce": 0.00760929100215435, "loss_lvr": 0.8280177712440491, "loss_mode_switch": 0.0, "loss_total": 0.09041107445955276, "step": 1580 }, { "batch_size": 4, "epoch": 0.632, "step": 1580, "tokens_per_device": 6512 }, { "epoch": 0.632, "loss_ce": 0.2514602839946747, "loss_lvr": 0.8023378849029541, "loss_mode_switch": 0.0, "loss_total": 0.3316940665245056, "step": 1580 }, { "batch_size": 4, "epoch": 0.632, "step": 1580, "tokens_per_device": 6016 }, { "epoch": 0.632, "loss_ce": 0.27321910858154297, "loss_lvr": 0.8829478621482849, "loss_mode_switch": 0.0, "loss_total": 0.3615139126777649, "step": 1580 }, { "batch_size": 4, "epoch": 0.632, "step": 1580, "tokens_per_device": 3768 }, { "epoch": 0.632, "loss_ce": 0.19847552478313446, "loss_lvr": 0.5760068297386169, "loss_mode_switch": 0.0, "loss_total": 0.25607621669769287, "step": 1580 }, { "batch_size": 4, "epoch": 0.632, "step": 1580, "tokens_per_device": 10836 }, { "epoch": 0.632, "loss_ce": 0.30319687724113464, "loss_lvr": 0.49683135747909546, "loss_mode_switch": 0.0, "loss_total": 0.35288000106811523, "step": 1580 }, { "epoch": 0.6324, "grad_norm": 1.3301289081573486, "learning_rate": 3.1443303238625172e-06, "loss": 0.2686, "step": 1581 }, { "batch_size": 1, "epoch": 0.6324, "step": 1581, "tokens_per_device": 4894 }, { "epoch": 0.6324, "loss_ce": 0.04259815067052841, "loss_lvr": 0.24367745220661163, "loss_mode_switch": 0.0, "loss_total": 0.06696589291095734, "step": 1581 }, { "batch_size": 4, "epoch": 0.6324, "step": 1581, "tokens_per_device": 2712 }, { "epoch": 0.6324, "loss_ce": 0.4626832902431488, "loss_lvr": 0.9973815083503723, "loss_mode_switch": 0.0, "loss_total": 0.562421441078186, "step": 1581 }, { "batch_size": 4, "epoch": 0.6324, "step": 1581, "tokens_per_device": 1700 }, { "epoch": 0.6324, "loss_ce": 0.27715998888015747, "loss_lvr": 1.4138392210006714, "loss_mode_switch": 0.0, "loss_total": 0.4185439348220825, "step": 1581 }, { "batch_size": 4, "epoch": 0.6324, "step": 1581, "tokens_per_device": 4224 }, { "epoch": 0.6324, "loss_ce": 0.2488173395395279, "loss_lvr": 0.5157783627510071, "loss_mode_switch": 0.0, "loss_total": 0.3003951907157898, "step": 1581 }, { "batch_size": 4, "epoch": 0.6324, "step": 1581, "tokens_per_device": 2240 }, { "epoch": 0.6324, "loss_ce": 0.21397709846496582, "loss_lvr": 0.8745605945587158, "loss_mode_switch": 0.0, "loss_total": 0.30143314599990845, "step": 1581 }, { "batch_size": 1, "epoch": 0.6324, "step": 1581, "tokens_per_device": 4861 }, { "epoch": 0.6324, "loss_ce": 0.00671169301494956, "loss_lvr": 0.19825725257396698, "loss_mode_switch": 0.0, "loss_total": 0.026537418365478516, "step": 1581 }, { "batch_size": 4, "epoch": 0.6324, "step": 1581, "tokens_per_device": 3784 }, { "epoch": 0.6324, "loss_ce": 0.07698158919811249, "loss_lvr": 1.2339787483215332, "loss_mode_switch": 0.0, "loss_total": 0.20037946105003357, "step": 1581 }, { "batch_size": 4, "epoch": 0.6324, "step": 1581, "tokens_per_device": 4392 }, { "epoch": 0.6324, "loss_ce": 0.04347009211778641, "loss_lvr": 0.7218477129936218, "loss_mode_switch": 0.0, "loss_total": 0.11565486341714859, "step": 1581 }, { "epoch": 0.6328, "grad_norm": 1.136989712715149, "learning_rate": 3.138317000172072e-06, "loss": 0.2546, "step": 1582 }, { "batch_size": 4, "epoch": 0.6328, "step": 1582, "tokens_per_device": 1712 }, { "epoch": 0.6328, "loss_ce": 0.14307253062725067, "loss_lvr": 1.4273982048034668, "loss_mode_switch": 0.0, "loss_total": 0.2858123481273651, "step": 1582 }, { "batch_size": 4, "epoch": 0.6328, "step": 1582, "tokens_per_device": 4596 }, { "epoch": 0.6328, "loss_ce": 0.13410992920398712, "loss_lvr": 0.8585341572761536, "loss_mode_switch": 0.0, "loss_total": 0.21996334195137024, "step": 1582 }, { "batch_size": 1, "epoch": 0.6328, "step": 1582, "tokens_per_device": 4880 }, { "epoch": 0.6328, "loss_ce": 0.11710131168365479, "loss_lvr": 0.2526434361934662, "loss_mode_switch": 0.0, "loss_total": 0.14236564934253693, "step": 1582 }, { "batch_size": 1, "epoch": 0.6328, "step": 1582, "tokens_per_device": 4869 }, { "epoch": 0.6328, "loss_ce": 0.0027330711018294096, "loss_lvr": 0.49253201484680176, "loss_mode_switch": 0.0, "loss_total": 0.051986273378133774, "step": 1582 }, { "batch_size": 1, "epoch": 0.6328, "step": 1582, "tokens_per_device": 5465 }, { "epoch": 0.6328, "loss_ce": 0.05070677772164345, "loss_lvr": 0.6560246348381042, "loss_mode_switch": 0.0, "loss_total": 0.11630924046039581, "step": 1582 }, { "batch_size": 4, "epoch": 0.6328, "step": 1582, "tokens_per_device": 9808 }, { "epoch": 0.6328, "loss_ce": 0.3824474811553955, "loss_lvr": 0.9054540395736694, "loss_mode_switch": 0.0, "loss_total": 0.4729928970336914, "step": 1582 }, { "batch_size": 4, "epoch": 0.6328, "step": 1582, "tokens_per_device": 4488 }, { "epoch": 0.6328, "loss_ce": 0.03319322690367699, "loss_lvr": 0.9016163349151611, "loss_mode_switch": 0.0, "loss_total": 0.12335486710071564, "step": 1582 }, { "batch_size": 4, "epoch": 0.6328, "step": 1582, "tokens_per_device": 5536 }, { "epoch": 0.6328, "loss_ce": 0.4120410084724426, "loss_lvr": 0.8345414400100708, "loss_mode_switch": 0.0, "loss_total": 0.49549514055252075, "step": 1582 }, { "epoch": 0.6332, "grad_norm": 1.285135269165039, "learning_rate": 3.1323068009917174e-06, "loss": 0.2834, "step": 1583 }, { "batch_size": 4, "epoch": 0.6332, "step": 1583, "tokens_per_device": 2464 }, { "epoch": 0.6332, "loss_ce": 0.5239322781562805, "loss_lvr": 0.7486557364463806, "loss_mode_switch": 0.0, "loss_total": 0.5987978577613831, "step": 1583 }, { "batch_size": 4, "epoch": 0.6332, "step": 1583, "tokens_per_device": 5204 }, { "epoch": 0.6332, "loss_ce": 0.3567652404308319, "loss_lvr": 1.0029118061065674, "loss_mode_switch": 0.0, "loss_total": 0.4570564329624176, "step": 1583 }, { "batch_size": 4, "epoch": 0.6332, "step": 1583, "tokens_per_device": 4216 }, { "epoch": 0.6332, "loss_ce": 0.3310238718986511, "loss_lvr": 0.6732146143913269, "loss_mode_switch": 0.0, "loss_total": 0.39834535121917725, "step": 1583 }, { "batch_size": 4, "epoch": 0.6332, "step": 1583, "tokens_per_device": 8140 }, { "epoch": 0.6332, "loss_ce": 0.2009933739900589, "loss_lvr": 0.9407733678817749, "loss_mode_switch": 0.0, "loss_total": 0.29507070779800415, "step": 1583 }, { "batch_size": 4, "epoch": 0.6332, "step": 1583, "tokens_per_device": 2648 }, { "epoch": 0.6332, "loss_ce": 0.3338848054409027, "loss_lvr": 0.8668572306632996, "loss_mode_switch": 0.0, "loss_total": 0.4205705225467682, "step": 1583 }, { "batch_size": 4, "epoch": 0.6332, "step": 1583, "tokens_per_device": 4244 }, { "epoch": 0.6332, "loss_ce": 0.023035939782857895, "loss_lvr": 0.9914312958717346, "loss_mode_switch": 0.0, "loss_total": 0.1221790760755539, "step": 1583 }, { "batch_size": 4, "epoch": 0.6332, "step": 1583, "tokens_per_device": 2748 }, { "epoch": 0.6332, "loss_ce": 0.43053966760635376, "loss_lvr": 0.5150658488273621, "loss_mode_switch": 0.0, "loss_total": 0.4820462465286255, "step": 1583 }, { "batch_size": 1, "epoch": 0.6332, "step": 1583, "tokens_per_device": 5324 }, { "epoch": 0.6332, "loss_ce": 0.16248539090156555, "loss_lvr": 0.49989184737205505, "loss_mode_switch": 0.0, "loss_total": 0.21247458457946777, "step": 1583 }, { "epoch": 0.6336, "grad_norm": 1.367774486541748, "learning_rate": 3.1262997364085248e-06, "loss": 0.2851, "step": 1584 }, { "batch_size": 1, "epoch": 0.6336, "step": 1584, "tokens_per_device": 4862 }, { "epoch": 0.6336, "loss_ce": 0.0011300700716674328, "loss_lvr": 0.48952654004096985, "loss_mode_switch": 0.0, "loss_total": 0.050082724541425705, "step": 1584 }, { "batch_size": 4, "epoch": 0.6336, "step": 1584, "tokens_per_device": 2648 }, { "epoch": 0.6336, "loss_ce": 0.06386134773492813, "loss_lvr": 0.3426631987094879, "loss_mode_switch": 0.0, "loss_total": 0.09812766313552856, "step": 1584 }, { "batch_size": 4, "epoch": 0.6336, "step": 1584, "tokens_per_device": 8936 }, { "epoch": 0.6336, "loss_ce": 0.0017450677696615458, "loss_lvr": 0.6986382007598877, "loss_mode_switch": 0.0, "loss_total": 0.07160888612270355, "step": 1584 }, { "batch_size": 4, "epoch": 0.6336, "step": 1584, "tokens_per_device": 8232 }, { "epoch": 0.6336, "loss_ce": 0.3075421154499054, "loss_lvr": 0.793898344039917, "loss_mode_switch": 0.0, "loss_total": 0.3869319558143616, "step": 1584 }, { "batch_size": 4, "epoch": 0.6336, "step": 1584, "tokens_per_device": 1520 }, { "epoch": 0.6336, "loss_ce": 0.29044240713119507, "loss_lvr": 0.8914487361907959, "loss_mode_switch": 0.0, "loss_total": 0.3795872926712036, "step": 1584 }, { "batch_size": 1, "epoch": 0.6336, "step": 1584, "tokens_per_device": 4974 }, { "epoch": 0.6336, "loss_ce": 0.13984724879264832, "loss_lvr": 0.37971457839012146, "loss_mode_switch": 0.0, "loss_total": 0.17781871557235718, "step": 1584 }, { "batch_size": 4, "epoch": 0.6336, "step": 1584, "tokens_per_device": 4020 }, { "epoch": 0.6336, "loss_ce": 0.002451694803312421, "loss_lvr": 2.5905206203460693, "loss_mode_switch": 0.0, "loss_total": 0.26150375604629517, "step": 1584 }, { "batch_size": 4, "epoch": 0.6336, "step": 1584, "tokens_per_device": 4256 }, { "epoch": 0.6336, "loss_ce": 0.13229475915431976, "loss_lvr": 1.0271046161651611, "loss_mode_switch": 0.0, "loss_total": 0.2350052297115326, "step": 1584 }, { "epoch": 0.634, "grad_norm": 1.153821587562561, "learning_rate": 3.1202958165043053e-06, "loss": 0.2506, "step": 1585 }, { "batch_size": 4, "epoch": 0.634, "step": 1585, "tokens_per_device": 1380 }, { "epoch": 0.634, "loss_ce": 0.484857439994812, "loss_lvr": 0.9193538427352905, "loss_mode_switch": 0.0, "loss_total": 0.57679283618927, "step": 1585 }, { "batch_size": 1, "epoch": 0.634, "step": 1585, "tokens_per_device": 4952 }, { "epoch": 0.634, "loss_ce": 0.05243475362658501, "loss_lvr": 0.47120940685272217, "loss_mode_switch": 0.0, "loss_total": 0.09955569356679916, "step": 1585 }, { "batch_size": 4, "epoch": 0.634, "step": 1585, "tokens_per_device": 5160 }, { "epoch": 0.634, "loss_ce": 0.186482235789299, "loss_lvr": 0.839834451675415, "loss_mode_switch": 0.0, "loss_total": 0.2704656720161438, "step": 1585 }, { "batch_size": 4, "epoch": 0.634, "step": 1585, "tokens_per_device": 4372 }, { "epoch": 0.634, "loss_ce": 0.09227510541677475, "loss_lvr": 1.590135097503662, "loss_mode_switch": 0.0, "loss_total": 0.25128862261772156, "step": 1585 }, { "batch_size": 1, "epoch": 0.634, "step": 1585, "tokens_per_device": 4755 }, { "epoch": 0.634, "loss_ce": 0.18531104922294617, "loss_lvr": 0.772235631942749, "loss_mode_switch": 0.0, "loss_total": 0.26253461837768555, "step": 1585 }, { "batch_size": 4, "epoch": 0.634, "step": 1585, "tokens_per_device": 6660 }, { "epoch": 0.634, "loss_ce": 0.1177167296409607, "loss_lvr": 0.7865692973136902, "loss_mode_switch": 0.0, "loss_total": 0.19637367129325867, "step": 1585 }, { "batch_size": 4, "epoch": 0.634, "step": 1585, "tokens_per_device": 4536 }, { "epoch": 0.634, "loss_ce": 0.019174698740243912, "loss_lvr": 0.904681384563446, "loss_mode_switch": 0.0, "loss_total": 0.10964283347129822, "step": 1585 }, { "batch_size": 1, "epoch": 0.634, "step": 1585, "tokens_per_device": 4831 }, { "epoch": 0.634, "loss_ce": 0.02342802658677101, "loss_lvr": 0.4670250117778778, "loss_mode_switch": 0.0, "loss_total": 0.07013052701950073, "step": 1585 }, { "epoch": 0.6344, "grad_norm": 1.2396210432052612, "learning_rate": 3.1142950513555903e-06, "loss": 0.2918, "step": 1586 }, { "batch_size": 1, "epoch": 0.6344, "step": 1586, "tokens_per_device": 5216 }, { "epoch": 0.6344, "loss_ce": 0.11305814236402512, "loss_lvr": 0.42658036947250366, "loss_mode_switch": 0.0, "loss_total": 0.1557161808013916, "step": 1586 }, { "batch_size": 4, "epoch": 0.6344, "step": 1586, "tokens_per_device": 1824 }, { "epoch": 0.6344, "loss_ce": 0.278973788022995, "loss_lvr": 0.8365269303321838, "loss_mode_switch": 0.0, "loss_total": 0.36262649297714233, "step": 1586 }, { "batch_size": 4, "epoch": 0.6344, "step": 1586, "tokens_per_device": 3780 }, { "epoch": 0.6344, "loss_ce": 0.3025786578655243, "loss_lvr": 0.8579704761505127, "loss_mode_switch": 0.0, "loss_total": 0.3883756995201111, "step": 1586 }, { "batch_size": 4, "epoch": 0.6344, "step": 1586, "tokens_per_device": 4260 }, { "epoch": 0.6344, "loss_ce": 0.4041527509689331, "loss_lvr": 0.8196957111358643, "loss_mode_switch": 0.0, "loss_total": 0.4861223101615906, "step": 1586 }, { "batch_size": 4, "epoch": 0.6344, "step": 1586, "tokens_per_device": 6728 }, { "epoch": 0.6344, "loss_ce": 0.552358090877533, "loss_lvr": 0.7913635969161987, "loss_mode_switch": 0.0, "loss_total": 0.6314944624900818, "step": 1586 }, { "batch_size": 4, "epoch": 0.6344, "step": 1586, "tokens_per_device": 4348 }, { "epoch": 0.6344, "loss_ce": 0.31174200773239136, "loss_lvr": 0.8817594051361084, "loss_mode_switch": 0.0, "loss_total": 0.39991796016693115, "step": 1586 }, { "batch_size": 4, "epoch": 0.6344, "step": 1586, "tokens_per_device": 9816 }, { "epoch": 0.6344, "loss_ce": 0.07891732454299927, "loss_lvr": 0.5893428921699524, "loss_mode_switch": 0.0, "loss_total": 0.13785161077976227, "step": 1586 }, { "batch_size": 4, "epoch": 0.6344, "step": 1586, "tokens_per_device": 1428 }, { "epoch": 0.6344, "loss_ce": 0.1110171377658844, "loss_lvr": 0.7374634146690369, "loss_mode_switch": 0.0, "loss_total": 0.18476349115371704, "step": 1586 }, { "epoch": 0.6348, "grad_norm": 1.333533763885498, "learning_rate": 3.1082974510336163e-06, "loss": 0.3166, "step": 1587 }, { "batch_size": 4, "epoch": 0.6348, "step": 1587, "tokens_per_device": 3804 }, { "epoch": 0.6348, "loss_ce": 0.0955633595585823, "loss_lvr": 0.8308804035186768, "loss_mode_switch": 0.0, "loss_total": 0.17865139245986938, "step": 1587 }, { "batch_size": 4, "epoch": 0.6348, "step": 1587, "tokens_per_device": 7656 }, { "epoch": 0.6348, "loss_ce": 0.141275092959404, "loss_lvr": 0.7215796709060669, "loss_mode_switch": 0.0, "loss_total": 0.21343305706977844, "step": 1587 }, { "batch_size": 4, "epoch": 0.6348, "step": 1587, "tokens_per_device": 1372 }, { "epoch": 0.6348, "loss_ce": 0.8906404376029968, "loss_lvr": 0.8510295152664185, "loss_mode_switch": 0.0, "loss_total": 0.9757434129714966, "step": 1587 }, { "batch_size": 4, "epoch": 0.6348, "step": 1587, "tokens_per_device": 4544 }, { "epoch": 0.6348, "loss_ce": 0.1062602773308754, "loss_lvr": 0.9398718476295471, "loss_mode_switch": 0.0, "loss_total": 0.20024746656417847, "step": 1587 }, { "batch_size": 4, "epoch": 0.6348, "step": 1587, "tokens_per_device": 10072 }, { "epoch": 0.6348, "loss_ce": 0.19006666541099548, "loss_lvr": 0.37089598178863525, "loss_mode_switch": 0.0, "loss_total": 0.22715626657009125, "step": 1587 }, { "batch_size": 4, "epoch": 0.6348, "step": 1587, "tokens_per_device": 2520 }, { "epoch": 0.6348, "loss_ce": 0.07402344048023224, "loss_lvr": 0.7383345365524292, "loss_mode_switch": 0.0, "loss_total": 0.14785689115524292, "step": 1587 }, { "batch_size": 4, "epoch": 0.6348, "step": 1587, "tokens_per_device": 3872 }, { "epoch": 0.6348, "loss_ce": 0.009530236013233662, "loss_lvr": 0.9158178567886353, "loss_mode_switch": 0.0, "loss_total": 0.10111202299594879, "step": 1587 }, { "batch_size": 1, "epoch": 0.6348, "step": 1587, "tokens_per_device": 5126 }, { "epoch": 0.6348, "loss_ce": 0.12518924474716187, "loss_lvr": 0.33736732602119446, "loss_mode_switch": 0.0, "loss_total": 0.15892598032951355, "step": 1587 }, { "epoch": 0.6352, "grad_norm": 1.376846432685852, "learning_rate": 3.1023030256043087e-06, "loss": 0.2903, "step": 1588 }, { "batch_size": 1, "epoch": 0.6352, "step": 1588, "tokens_per_device": 5049 }, { "epoch": 0.6352, "loss_ce": 0.10147815197706223, "loss_lvr": 0.5344300866127014, "loss_mode_switch": 0.0, "loss_total": 0.15492115914821625, "step": 1588 }, { "batch_size": 4, "epoch": 0.6352, "step": 1588, "tokens_per_device": 8152 }, { "epoch": 0.6352, "loss_ce": 0.2801986336708069, "loss_lvr": 1.0685030221939087, "loss_mode_switch": 0.0, "loss_total": 0.3870489299297333, "step": 1588 }, { "batch_size": 4, "epoch": 0.6352, "step": 1588, "tokens_per_device": 2704 }, { "epoch": 0.6352, "loss_ce": 0.24632707238197327, "loss_lvr": 0.769187867641449, "loss_mode_switch": 0.0, "loss_total": 0.3232458531856537, "step": 1588 }, { "batch_size": 4, "epoch": 0.6352, "step": 1588, "tokens_per_device": 4468 }, { "epoch": 0.6352, "loss_ce": 0.01044460479170084, "loss_lvr": 0.7959882020950317, "loss_mode_switch": 0.0, "loss_total": 0.09004342555999756, "step": 1588 }, { "batch_size": 4, "epoch": 0.6352, "step": 1588, "tokens_per_device": 3784 }, { "epoch": 0.6352, "loss_ce": 0.15388251841068268, "loss_lvr": 0.9091024398803711, "loss_mode_switch": 0.0, "loss_total": 0.24479275941848755, "step": 1588 }, { "batch_size": 4, "epoch": 0.6352, "step": 1588, "tokens_per_device": 2712 }, { "epoch": 0.6352, "loss_ce": 0.26615676283836365, "loss_lvr": 0.7673283219337463, "loss_mode_switch": 0.0, "loss_total": 0.34288960695266724, "step": 1588 }, { "batch_size": 1, "epoch": 0.6352, "step": 1588, "tokens_per_device": 4930 }, { "epoch": 0.6352, "loss_ce": 0.024271681904792786, "loss_lvr": 0.9396829605102539, "loss_mode_switch": 0.0, "loss_total": 0.11823997646570206, "step": 1588 }, { "batch_size": 1, "epoch": 0.6352, "step": 1588, "tokens_per_device": 4888 }, { "epoch": 0.6352, "loss_ce": 0.017117079347372055, "loss_lvr": 0.3542175889015198, "loss_mode_switch": 0.0, "loss_total": 0.05253883823752403, "step": 1588 }, { "epoch": 0.6356, "grad_norm": 1.2681140899658203, "learning_rate": 3.0963117851282677e-06, "loss": 0.2686, "step": 1589 }, { "batch_size": 1, "epoch": 0.6356, "step": 1589, "tokens_per_device": 4699 }, { "epoch": 0.6356, "loss_ce": 0.00023718281590845436, "loss_lvr": 0.5839136838912964, "loss_mode_switch": 0.0, "loss_total": 0.05862855166196823, "step": 1589 }, { "batch_size": 4, "epoch": 0.6356, "step": 1589, "tokens_per_device": 4668 }, { "epoch": 0.6356, "loss_ce": 0.4650315046310425, "loss_lvr": 2.3979296684265137, "loss_mode_switch": 0.0, "loss_total": 0.7048244476318359, "step": 1589 }, { "batch_size": 4, "epoch": 0.6356, "step": 1589, "tokens_per_device": 4020 }, { "epoch": 0.6356, "loss_ce": 0.5456113219261169, "loss_lvr": 0.8035115003585815, "loss_mode_switch": 0.0, "loss_total": 0.625962495803833, "step": 1589 }, { "batch_size": 4, "epoch": 0.6356, "step": 1589, "tokens_per_device": 4236 }, { "epoch": 0.6356, "loss_ce": 0.28961652517318726, "loss_lvr": 0.8596133589744568, "loss_mode_switch": 0.0, "loss_total": 0.3755778670310974, "step": 1589 }, { "batch_size": 4, "epoch": 0.6356, "step": 1589, "tokens_per_device": 1596 }, { "epoch": 0.6356, "loss_ce": 0.0647202581167221, "loss_lvr": 0.9488226771354675, "loss_mode_switch": 0.0, "loss_total": 0.15960252285003662, "step": 1589 }, { "batch_size": 4, "epoch": 0.6356, "step": 1589, "tokens_per_device": 4360 }, { "epoch": 0.6356, "loss_ce": 0.03371953219175339, "loss_lvr": 0.6916590332984924, "loss_mode_switch": 0.0, "loss_total": 0.10288543999195099, "step": 1589 }, { "batch_size": 4, "epoch": 0.6356, "step": 1589, "tokens_per_device": 12412 }, { "epoch": 0.6356, "loss_ce": 0.36459219455718994, "loss_lvr": 1.0334707498550415, "loss_mode_switch": 0.0, "loss_total": 0.46793925762176514, "step": 1589 }, { "batch_size": 4, "epoch": 0.6356, "step": 1589, "tokens_per_device": 4240 }, { "epoch": 0.6356, "loss_ce": 0.6807334423065186, "loss_lvr": 0.9465618133544922, "loss_mode_switch": 0.0, "loss_total": 0.7753896117210388, "step": 1589 }, { "epoch": 0.636, "grad_norm": 1.2419453859329224, "learning_rate": 3.090323739660742e-06, "loss": 0.3085, "step": 1590 }, { "batch_size": 4, "epoch": 0.636, "step": 1590, "tokens_per_device": 5972 }, { "epoch": 0.636, "loss_ce": 0.4839983880519867, "loss_lvr": 0.7740952968597412, "loss_mode_switch": 0.0, "loss_total": 0.5614079236984253, "step": 1590 }, { "batch_size": 4, "epoch": 0.636, "step": 1590, "tokens_per_device": 5340 }, { "epoch": 0.636, "loss_ce": 0.12443840503692627, "loss_lvr": 0.7328550219535828, "loss_mode_switch": 0.0, "loss_total": 0.19772391021251678, "step": 1590 }, { "batch_size": 4, "epoch": 0.636, "step": 1590, "tokens_per_device": 4364 }, { "epoch": 0.636, "loss_ce": 0.4297618269920349, "loss_lvr": 1.2700393199920654, "loss_mode_switch": 0.0, "loss_total": 0.5567657947540283, "step": 1590 }, { "batch_size": 4, "epoch": 0.636, "step": 1590, "tokens_per_device": 15580 }, { "epoch": 0.636, "loss_ce": 0.08062057942152023, "loss_lvr": 0.4497649669647217, "loss_mode_switch": 0.0, "loss_total": 0.12559707462787628, "step": 1590 }, { "batch_size": 4, "epoch": 0.636, "step": 1590, "tokens_per_device": 3792 }, { "epoch": 0.636, "loss_ce": 0.07612739503383636, "loss_lvr": 0.8230324983596802, "loss_mode_switch": 0.0, "loss_total": 0.15843063592910767, "step": 1590 }, { "batch_size": 1, "epoch": 0.636, "step": 1590, "tokens_per_device": 4323 }, { "epoch": 0.636, "loss_ce": 0.0007761499145999551, "loss_lvr": 0.5068116188049316, "loss_mode_switch": 0.0, "loss_total": 0.05145731195807457, "step": 1590 }, { "batch_size": 1, "epoch": 0.636, "step": 1590, "tokens_per_device": 4938 }, { "epoch": 0.636, "loss_ce": 0.0016103620873764157, "loss_lvr": 0.3926011919975281, "loss_mode_switch": 0.0, "loss_total": 0.04087048023939133, "step": 1590 }, { "batch_size": 4, "epoch": 0.636, "step": 1590, "tokens_per_device": 2544 }, { "epoch": 0.636, "loss_ce": 0.5065891742706299, "loss_lvr": 1.0490535497665405, "loss_mode_switch": 0.0, "loss_total": 0.6114945411682129, "step": 1590 }, { "epoch": 0.6364, "grad_norm": 1.4209479093551636, "learning_rate": 3.084338899251623e-06, "loss": 0.3159, "step": 1591 }, { "batch_size": 1, "epoch": 0.6364, "step": 1591, "tokens_per_device": 5127 }, { "epoch": 0.6364, "loss_ce": 0.013970647007226944, "loss_lvr": 0.260038822889328, "loss_mode_switch": 0.0, "loss_total": 0.039974529296159744, "step": 1591 }, { "batch_size": 4, "epoch": 0.6364, "step": 1591, "tokens_per_device": 3980 }, { "epoch": 0.6364, "loss_ce": 0.5306594371795654, "loss_lvr": 0.7548176646232605, "loss_mode_switch": 0.0, "loss_total": 0.606141209602356, "step": 1591 }, { "batch_size": 4, "epoch": 0.6364, "step": 1591, "tokens_per_device": 5956 }, { "epoch": 0.6364, "loss_ce": 0.547542154788971, "loss_lvr": 0.6549322605133057, "loss_mode_switch": 0.0, "loss_total": 0.6130353808403015, "step": 1591 }, { "batch_size": 1, "epoch": 0.6364, "step": 1591, "tokens_per_device": 4774 }, { "epoch": 0.6364, "loss_ce": 0.017105115577578545, "loss_lvr": 0.5829446315765381, "loss_mode_switch": 0.0, "loss_total": 0.07539957761764526, "step": 1591 }, { "batch_size": 1, "epoch": 0.6364, "step": 1591, "tokens_per_device": 4883 }, { "epoch": 0.6364, "loss_ce": 0.011173570528626442, "loss_lvr": 0.21249128878116608, "loss_mode_switch": 0.0, "loss_total": 0.03242269903421402, "step": 1591 }, { "batch_size": 4, "epoch": 0.6364, "step": 1591, "tokens_per_device": 1556 }, { "epoch": 0.6364, "loss_ce": 0.07144949585199356, "loss_lvr": 1.1864650249481201, "loss_mode_switch": 0.0, "loss_total": 0.19009599089622498, "step": 1591 }, { "batch_size": 4, "epoch": 0.6364, "step": 1591, "tokens_per_device": 11000 }, { "epoch": 0.6364, "loss_ce": 0.05126161128282547, "loss_lvr": 1.072532296180725, "loss_mode_switch": 0.0, "loss_total": 0.1585148423910141, "step": 1591 }, { "batch_size": 1, "epoch": 0.6364, "step": 1591, "tokens_per_device": 4898 }, { "epoch": 0.6364, "loss_ce": 0.005013621412217617, "loss_lvr": 0.19810445606708527, "loss_mode_switch": 0.0, "loss_total": 0.024824067950248718, "step": 1591 }, { "epoch": 0.6368, "grad_norm": 1.1974568367004395, "learning_rate": 3.078357273945419e-06, "loss": 0.2754, "step": 1592 }, { "batch_size": 4, "epoch": 0.6368, "step": 1592, "tokens_per_device": 3796 }, { "epoch": 0.6368, "loss_ce": 0.33607006072998047, "loss_lvr": 1.064170002937317, "loss_mode_switch": 0.0, "loss_total": 0.44248706102371216, "step": 1592 }, { "batch_size": 1, "epoch": 0.6368, "step": 1592, "tokens_per_device": 5110 }, { "epoch": 0.6368, "loss_ce": 0.018940025940537453, "loss_lvr": 0.47243738174438477, "loss_mode_switch": 0.0, "loss_total": 0.06618376821279526, "step": 1592 }, { "batch_size": 1, "epoch": 0.6368, "step": 1592, "tokens_per_device": 8139 }, { "epoch": 0.6368, "loss_ce": 0.28710293769836426, "loss_lvr": 0.29980453848838806, "loss_mode_switch": 0.0, "loss_total": 0.3170833885669708, "step": 1592 }, { "batch_size": 4, "epoch": 0.6368, "step": 1592, "tokens_per_device": 4100 }, { "epoch": 0.6368, "loss_ce": 0.4181458652019501, "loss_lvr": 0.9114652276039124, "loss_mode_switch": 0.0, "loss_total": 0.5092923641204834, "step": 1592 }, { "batch_size": 1, "epoch": 0.6368, "step": 1592, "tokens_per_device": 4887 }, { "epoch": 0.6368, "loss_ce": 0.37357276678085327, "loss_lvr": 0.3871135115623474, "loss_mode_switch": 0.0, "loss_total": 0.41228410601615906, "step": 1592 }, { "batch_size": 4, "epoch": 0.6368, "step": 1592, "tokens_per_device": 4452 }, { "epoch": 0.6368, "loss_ce": 0.4122898280620575, "loss_lvr": 0.9784485101699829, "loss_mode_switch": 0.0, "loss_total": 0.5101346969604492, "step": 1592 }, { "batch_size": 4, "epoch": 0.6368, "step": 1592, "tokens_per_device": 1852 }, { "epoch": 0.6368, "loss_ce": 0.3309682011604309, "loss_lvr": 0.9186397194862366, "loss_mode_switch": 0.0, "loss_total": 0.422832190990448, "step": 1592 }, { "batch_size": 1, "epoch": 0.6368, "step": 1592, "tokens_per_device": 4972 }, { "epoch": 0.6368, "loss_ce": 0.13402403891086578, "loss_lvr": 0.2934182286262512, "loss_mode_switch": 0.0, "loss_total": 0.16336585581302643, "step": 1592 }, { "epoch": 0.6372, "grad_norm": 1.6202954053878784, "learning_rate": 3.072378873781245e-06, "loss": 0.3402, "step": 1593 }, { "batch_size": 1, "epoch": 0.6372, "step": 1593, "tokens_per_device": 4883 }, { "epoch": 0.6372, "loss_ce": 0.013211546465754509, "loss_lvr": 0.3440778851509094, "loss_mode_switch": 0.0, "loss_total": 0.04761933535337448, "step": 1593 }, { "batch_size": 4, "epoch": 0.6372, "step": 1593, "tokens_per_device": 1684 }, { "epoch": 0.6372, "loss_ce": 0.20065632462501526, "loss_lvr": 1.236136555671692, "loss_mode_switch": 0.0, "loss_total": 0.32426998019218445, "step": 1593 }, { "batch_size": 4, "epoch": 0.6372, "step": 1593, "tokens_per_device": 2132 }, { "epoch": 0.6372, "loss_ce": 0.18924476206302643, "loss_lvr": 0.7898286581039429, "loss_mode_switch": 0.0, "loss_total": 0.26822763681411743, "step": 1593 }, { "batch_size": 4, "epoch": 0.6372, "step": 1593, "tokens_per_device": 7240 }, { "epoch": 0.6372, "loss_ce": 0.15502716600894928, "loss_lvr": 0.8758370876312256, "loss_mode_switch": 0.0, "loss_total": 0.2426108717918396, "step": 1593 }, { "batch_size": 4, "epoch": 0.6372, "step": 1593, "tokens_per_device": 2680 }, { "epoch": 0.6372, "loss_ce": 0.18994086980819702, "loss_lvr": 0.7842411398887634, "loss_mode_switch": 0.0, "loss_total": 0.2683649957180023, "step": 1593 }, { "batch_size": 4, "epoch": 0.6372, "step": 1593, "tokens_per_device": 3780 }, { "epoch": 0.6372, "loss_ce": 0.3354909420013428, "loss_lvr": 1.0735517740249634, "loss_mode_switch": 0.0, "loss_total": 0.4428461194038391, "step": 1593 }, { "batch_size": 4, "epoch": 0.6372, "step": 1593, "tokens_per_device": 2876 }, { "epoch": 0.6372, "loss_ce": 0.05475220829248428, "loss_lvr": 0.8457508087158203, "loss_mode_switch": 0.0, "loss_total": 0.1393272876739502, "step": 1593 }, { "batch_size": 4, "epoch": 0.6372, "step": 1593, "tokens_per_device": 4052 }, { "epoch": 0.6372, "loss_ce": 0.05048179626464844, "loss_lvr": 0.5766781568527222, "loss_mode_switch": 0.0, "loss_total": 0.10814961791038513, "step": 1593 }, { "epoch": 0.6376, "grad_norm": 1.483472228050232, "learning_rate": 3.066403708792805e-06, "loss": 0.3483, "step": 1594 }, { "batch_size": 4, "epoch": 0.6376, "step": 1594, "tokens_per_device": 6972 }, { "epoch": 0.6376, "loss_ce": 0.12842001020908356, "loss_lvr": 0.7089591026306152, "loss_mode_switch": 0.0, "loss_total": 0.19931592047214508, "step": 1594 }, { "batch_size": 1, "epoch": 0.6376, "step": 1594, "tokens_per_device": 5119 }, { "epoch": 0.6376, "loss_ce": 0.3063953220844269, "loss_lvr": 0.779092013835907, "loss_mode_switch": 0.0, "loss_total": 0.3843045234680176, "step": 1594 }, { "batch_size": 4, "epoch": 0.6376, "step": 1594, "tokens_per_device": 2668 }, { "epoch": 0.6376, "loss_ce": 0.3153238594532013, "loss_lvr": 0.7653166651725769, "loss_mode_switch": 0.0, "loss_total": 0.39185553789138794, "step": 1594 }, { "batch_size": 4, "epoch": 0.6376, "step": 1594, "tokens_per_device": 2792 }, { "epoch": 0.6376, "loss_ce": 0.24023833870887756, "loss_lvr": 0.5801833868026733, "loss_mode_switch": 0.0, "loss_total": 0.29825666546821594, "step": 1594 }, { "batch_size": 4, "epoch": 0.6376, "step": 1594, "tokens_per_device": 2584 }, { "epoch": 0.6376, "loss_ce": 0.28672024607658386, "loss_lvr": 0.913844645023346, "loss_mode_switch": 0.0, "loss_total": 0.37810471653938293, "step": 1594 }, { "batch_size": 4, "epoch": 0.6376, "step": 1594, "tokens_per_device": 6080 }, { "epoch": 0.6376, "loss_ce": 0.10141244530677795, "loss_lvr": 0.8141615986824036, "loss_mode_switch": 0.0, "loss_total": 0.1828286051750183, "step": 1594 }, { "batch_size": 4, "epoch": 0.6376, "step": 1594, "tokens_per_device": 3920 }, { "epoch": 0.6376, "loss_ce": 0.192877858877182, "loss_lvr": 0.8047048449516296, "loss_mode_switch": 0.0, "loss_total": 0.273348331451416, "step": 1594 }, { "batch_size": 4, "epoch": 0.6376, "step": 1594, "tokens_per_device": 1648 }, { "epoch": 0.6376, "loss_ce": 0.20369455218315125, "loss_lvr": 1.411518931388855, "loss_mode_switch": 0.0, "loss_total": 0.3448464274406433, "step": 1594 }, { "epoch": 0.638, "grad_norm": 1.2897909879684448, "learning_rate": 3.060431789008368e-06, "loss": 0.2671, "step": 1595 }, { "batch_size": 4, "epoch": 0.638, "step": 1595, "tokens_per_device": 4304 }, { "epoch": 0.638, "loss_ce": 0.07618090510368347, "loss_lvr": 1.0840891599655151, "loss_mode_switch": 0.0, "loss_total": 0.18458983302116394, "step": 1595 }, { "batch_size": 1, "epoch": 0.638, "step": 1595, "tokens_per_device": 4877 }, { "epoch": 0.638, "loss_ce": 0.00019807624630630016, "loss_lvr": 0.18934424221515656, "loss_mode_switch": 0.0, "loss_total": 0.019132500514388084, "step": 1595 }, { "batch_size": 1, "epoch": 0.638, "step": 1595, "tokens_per_device": 5095 }, { "epoch": 0.638, "loss_ce": 0.0035735745914280415, "loss_lvr": 0.3888607621192932, "loss_mode_switch": 0.0, "loss_total": 0.042459651827812195, "step": 1595 }, { "batch_size": 1, "epoch": 0.638, "step": 1595, "tokens_per_device": 4766 }, { "epoch": 0.638, "loss_ce": 0.027517400681972504, "loss_lvr": 0.32240086793899536, "loss_mode_switch": 0.0, "loss_total": 0.05975748971104622, "step": 1595 }, { "batch_size": 1, "epoch": 0.638, "step": 1595, "tokens_per_device": 5184 }, { "epoch": 0.638, "loss_ce": 0.000436347967479378, "loss_lvr": 0.47231805324554443, "loss_mode_switch": 0.0, "loss_total": 0.047668151557445526, "step": 1595 }, { "batch_size": 1, "epoch": 0.638, "step": 1595, "tokens_per_device": 4891 }, { "epoch": 0.638, "loss_ce": 0.13773949444293976, "loss_lvr": 0.196743443608284, "loss_mode_switch": 0.0, "loss_total": 0.15741384029388428, "step": 1595 }, { "batch_size": 4, "epoch": 0.638, "step": 1595, "tokens_per_device": 4072 }, { "epoch": 0.638, "loss_ce": 0.14932677149772644, "loss_lvr": 1.0242854356765747, "loss_mode_switch": 0.0, "loss_total": 0.25175532698631287, "step": 1595 }, { "batch_size": 1, "epoch": 0.638, "step": 1595, "tokens_per_device": 5355 }, { "epoch": 0.638, "loss_ce": 0.0021177527960389853, "loss_lvr": 0.4722231924533844, "loss_mode_switch": 0.0, "loss_total": 0.04934007301926613, "step": 1595 }, { "epoch": 0.6384, "grad_norm": 1.2968589067459106, "learning_rate": 3.0544631244507607e-06, "loss": 0.2696, "step": 1596 }, { "batch_size": 4, "epoch": 0.6384, "step": 1596, "tokens_per_device": 2800 }, { "epoch": 0.6384, "loss_ce": 0.16028033196926117, "loss_lvr": 1.7791175842285156, "loss_mode_switch": 0.0, "loss_total": 0.3381921052932739, "step": 1596 }, { "batch_size": 4, "epoch": 0.6384, "step": 1596, "tokens_per_device": 2652 }, { "epoch": 0.6384, "loss_ce": 0.24191024899482727, "loss_lvr": 0.7678484320640564, "loss_mode_switch": 0.0, "loss_total": 0.3186950981616974, "step": 1596 }, { "batch_size": 4, "epoch": 0.6384, "step": 1596, "tokens_per_device": 4032 }, { "epoch": 0.6384, "loss_ce": 0.3556298017501831, "loss_lvr": 1.0466516017913818, "loss_mode_switch": 0.0, "loss_total": 0.4602949619293213, "step": 1596 }, { "batch_size": 4, "epoch": 0.6384, "step": 1596, "tokens_per_device": 7368 }, { "epoch": 0.6384, "loss_ce": 0.1121416836977005, "loss_lvr": 1.3937523365020752, "loss_mode_switch": 0.0, "loss_total": 0.2515169382095337, "step": 1596 }, { "batch_size": 4, "epoch": 0.6384, "step": 1596, "tokens_per_device": 1320 }, { "epoch": 0.6384, "loss_ce": 0.6012539267539978, "loss_lvr": 0.9982743263244629, "loss_mode_switch": 0.0, "loss_total": 0.7010813355445862, "step": 1596 }, { "batch_size": 4, "epoch": 0.6384, "step": 1596, "tokens_per_device": 3448 }, { "epoch": 0.6384, "loss_ce": 0.7756106853485107, "loss_lvr": 0.9486268758773804, "loss_mode_switch": 0.0, "loss_total": 0.8704733848571777, "step": 1596 }, { "batch_size": 4, "epoch": 0.6384, "step": 1596, "tokens_per_device": 3800 }, { "epoch": 0.6384, "loss_ce": 0.06660226732492447, "loss_lvr": 0.5577483177185059, "loss_mode_switch": 0.0, "loss_total": 0.12237709760665894, "step": 1596 }, { "batch_size": 4, "epoch": 0.6384, "step": 1596, "tokens_per_device": 4272 }, { "epoch": 0.6384, "loss_ce": 0.23771481215953827, "loss_lvr": 0.6985654830932617, "loss_mode_switch": 0.0, "loss_total": 0.3075713515281677, "step": 1596 }, { "epoch": 0.6388, "grad_norm": 1.6196595430374146, "learning_rate": 3.0484977251373458e-06, "loss": 0.3402, "step": 1597 }, { "batch_size": 4, "epoch": 0.6388, "step": 1597, "tokens_per_device": 4728 }, { "epoch": 0.6388, "loss_ce": 0.21647389233112335, "loss_lvr": 0.7005889415740967, "loss_mode_switch": 0.0, "loss_total": 0.28653278946876526, "step": 1597 }, { "batch_size": 1, "epoch": 0.6388, "step": 1597, "tokens_per_device": 4888 }, { "epoch": 0.6388, "loss_ce": 0.04036371037364006, "loss_lvr": 0.6632007956504822, "loss_mode_switch": 0.0, "loss_total": 0.10668379068374634, "step": 1597 }, { "batch_size": 1, "epoch": 0.6388, "step": 1597, "tokens_per_device": 4871 }, { "epoch": 0.6388, "loss_ce": 0.00020916092034894973, "loss_lvr": 1.6016002893447876, "loss_mode_switch": 0.0, "loss_total": 0.16036920249462128, "step": 1597 }, { "batch_size": 4, "epoch": 0.6388, "step": 1597, "tokens_per_device": 2912 }, { "epoch": 0.6388, "loss_ce": 0.25183582305908203, "loss_lvr": 1.1126070022583008, "loss_mode_switch": 0.0, "loss_total": 0.36309653520584106, "step": 1597 }, { "batch_size": 4, "epoch": 0.6388, "step": 1597, "tokens_per_device": 1460 }, { "epoch": 0.6388, "loss_ce": 0.7142088413238525, "loss_lvr": 0.905463695526123, "loss_mode_switch": 0.0, "loss_total": 0.8047552108764648, "step": 1597 }, { "batch_size": 4, "epoch": 0.6388, "step": 1597, "tokens_per_device": 4672 }, { "epoch": 0.6388, "loss_ce": 0.06051500141620636, "loss_lvr": 0.7479625940322876, "loss_mode_switch": 0.0, "loss_total": 0.13531126081943512, "step": 1597 }, { "batch_size": 1, "epoch": 0.6388, "step": 1597, "tokens_per_device": 5187 }, { "epoch": 0.6388, "loss_ce": 0.033109959214925766, "loss_lvr": 0.455859899520874, "loss_mode_switch": 0.0, "loss_total": 0.07869595289230347, "step": 1597 }, { "batch_size": 1, "epoch": 0.6388, "step": 1597, "tokens_per_device": 4873 }, { "epoch": 0.6388, "loss_ce": 0.12467066198587418, "loss_lvr": 0.7226071953773499, "loss_mode_switch": 0.0, "loss_total": 0.196931391954422, "step": 1597 }, { "epoch": 0.6392, "grad_norm": 1.2325453758239746, "learning_rate": 3.0425356010800022e-06, "loss": 0.2549, "step": 1598 }, { "batch_size": 4, "epoch": 0.6392, "step": 1598, "tokens_per_device": 4200 }, { "epoch": 0.6392, "loss_ce": 0.17497393488883972, "loss_lvr": 0.572456419467926, "loss_mode_switch": 0.0, "loss_total": 0.23221957683563232, "step": 1598 }, { "batch_size": 4, "epoch": 0.6392, "step": 1598, "tokens_per_device": 4420 }, { "epoch": 0.6392, "loss_ce": 0.05257751792669296, "loss_lvr": 0.7771822810173035, "loss_mode_switch": 0.0, "loss_total": 0.1302957534790039, "step": 1598 }, { "batch_size": 4, "epoch": 0.6392, "step": 1598, "tokens_per_device": 7268 }, { "epoch": 0.6392, "loss_ce": 0.07015080004930496, "loss_lvr": 0.664075493812561, "loss_mode_switch": 0.0, "loss_total": 0.13655835390090942, "step": 1598 }, { "batch_size": 4, "epoch": 0.6392, "step": 1598, "tokens_per_device": 15548 }, { "epoch": 0.6392, "loss_ce": 0.3165043890476227, "loss_lvr": 0.6561155915260315, "loss_mode_switch": 0.0, "loss_total": 0.3821159601211548, "step": 1598 }, { "batch_size": 4, "epoch": 0.6392, "step": 1598, "tokens_per_device": 4324 }, { "epoch": 0.6392, "loss_ce": 0.16937388479709625, "loss_lvr": 0.935200572013855, "loss_mode_switch": 0.0, "loss_total": 0.262893944978714, "step": 1598 }, { "batch_size": 1, "epoch": 0.6392, "step": 1598, "tokens_per_device": 4944 }, { "epoch": 0.6392, "loss_ce": 0.6076403260231018, "loss_lvr": 0.7577918767929077, "loss_mode_switch": 0.0, "loss_total": 0.6834195256233215, "step": 1598 }, { "batch_size": 1, "epoch": 0.6392, "step": 1598, "tokens_per_device": 5157 }, { "epoch": 0.6392, "loss_ce": 0.01237375196069479, "loss_lvr": 0.42466360330581665, "loss_mode_switch": 0.0, "loss_total": 0.05484011396765709, "step": 1598 }, { "batch_size": 4, "epoch": 0.6392, "step": 1598, "tokens_per_device": 2636 }, { "epoch": 0.6392, "loss_ce": 0.067075215280056, "loss_lvr": 0.6883055567741394, "loss_mode_switch": 0.0, "loss_total": 0.13590577244758606, "step": 1598 }, { "epoch": 0.6396, "grad_norm": 1.5634863376617432, "learning_rate": 3.036576762285118e-06, "loss": 0.2827, "step": 1599 }, { "batch_size": 1, "epoch": 0.6396, "step": 1599, "tokens_per_device": 5253 }, { "epoch": 0.6396, "loss_ce": 0.024091746658086777, "loss_lvr": 0.4268812835216522, "loss_mode_switch": 0.0, "loss_total": 0.06677987426519394, "step": 1599 }, { "batch_size": 4, "epoch": 0.6396, "step": 1599, "tokens_per_device": 5376 }, { "epoch": 0.6396, "loss_ce": 0.1977076381444931, "loss_lvr": 0.7913663983345032, "loss_mode_switch": 0.0, "loss_total": 0.2768442630767822, "step": 1599 }, { "batch_size": 4, "epoch": 0.6396, "step": 1599, "tokens_per_device": 10928 }, { "epoch": 0.6396, "loss_ce": 0.007615950424224138, "loss_lvr": 0.6797365546226501, "loss_mode_switch": 0.0, "loss_total": 0.0755896121263504, "step": 1599 }, { "batch_size": 4, "epoch": 0.6396, "step": 1599, "tokens_per_device": 3984 }, { "epoch": 0.6396, "loss_ce": 0.46649429202079773, "loss_lvr": 0.7369884252548218, "loss_mode_switch": 0.0, "loss_total": 0.5401931405067444, "step": 1599 }, { "batch_size": 4, "epoch": 0.6396, "step": 1599, "tokens_per_device": 4412 }, { "epoch": 0.6396, "loss_ce": 0.220097154378891, "loss_lvr": 1.0856717824935913, "loss_mode_switch": 0.0, "loss_total": 0.3286643326282501, "step": 1599 }, { "batch_size": 4, "epoch": 0.6396, "step": 1599, "tokens_per_device": 2528 }, { "epoch": 0.6396, "loss_ce": 0.05991250276565552, "loss_lvr": 0.7362897396087646, "loss_mode_switch": 0.0, "loss_total": 0.13354147970676422, "step": 1599 }, { "batch_size": 4, "epoch": 0.6396, "step": 1599, "tokens_per_device": 9972 }, { "epoch": 0.6396, "loss_ce": 0.7598826289176941, "loss_lvr": 0.7840963006019592, "loss_mode_switch": 0.0, "loss_total": 0.8382922410964966, "step": 1599 }, { "batch_size": 4, "epoch": 0.6396, "step": 1599, "tokens_per_device": 2720 }, { "epoch": 0.6396, "loss_ce": 0.6406334042549133, "loss_lvr": 0.6968432664871216, "loss_mode_switch": 0.0, "loss_total": 0.7103177309036255, "step": 1599 }, { "epoch": 0.64, "grad_norm": 1.4820283651351929, "learning_rate": 3.0306212187535653e-06, "loss": 0.3337, "step": 1600 }, { "batch_size": 4, "epoch": 0.64, "step": 1600, "tokens_per_device": 5780 }, { "epoch": 0.64, "loss_ce": 0.07508658617734909, "loss_lvr": 0.5238894820213318, "loss_mode_switch": 0.0, "loss_total": 0.1274755299091339, "step": 1600 }, { "batch_size": 4, "epoch": 0.64, "step": 1600, "tokens_per_device": 2608 }, { "epoch": 0.64, "loss_ce": 0.2632657587528229, "loss_lvr": 1.2258940935134888, "loss_mode_switch": 0.0, "loss_total": 0.38585516810417175, "step": 1600 }, { "batch_size": 4, "epoch": 0.64, "step": 1600, "tokens_per_device": 2516 }, { "epoch": 0.64, "loss_ce": 0.6076599955558777, "loss_lvr": 0.9175541996955872, "loss_mode_switch": 0.0, "loss_total": 0.6994154453277588, "step": 1600 }, { "batch_size": 4, "epoch": 0.64, "step": 1600, "tokens_per_device": 4516 }, { "epoch": 0.64, "loss_ce": 0.7362657189369202, "loss_lvr": 0.9514851570129395, "loss_mode_switch": 0.0, "loss_total": 0.8314142227172852, "step": 1600 }, { "batch_size": 4, "epoch": 0.64, "step": 1600, "tokens_per_device": 4284 }, { "epoch": 0.64, "loss_ce": 0.5920102596282959, "loss_lvr": 0.8530238270759583, "loss_mode_switch": 0.0, "loss_total": 0.6773126125335693, "step": 1600 }, { "batch_size": 4, "epoch": 0.64, "step": 1600, "tokens_per_device": 4400 }, { "epoch": 0.64, "loss_ce": 0.45207563042640686, "loss_lvr": 0.7500395178794861, "loss_mode_switch": 0.0, "loss_total": 0.5270795822143555, "step": 1600 }, { "batch_size": 4, "epoch": 0.64, "step": 1600, "tokens_per_device": 4440 }, { "epoch": 0.64, "loss_ce": 0.34389248490333557, "loss_lvr": 0.718887448310852, "loss_mode_switch": 0.0, "loss_total": 0.4157812297344208, "step": 1600 }, { "batch_size": 1, "epoch": 0.64, "step": 1600, "tokens_per_device": 6771 }, { "epoch": 0.64, "loss_ce": 0.00027291045989841223, "loss_lvr": 0.3151111900806427, "loss_mode_switch": 0.0, "loss_total": 0.03178403154015541, "step": 1600 }, { "epoch": 0.6404, "grad_norm": 1.4051618576049805, "learning_rate": 3.024668980480681e-06, "loss": 0.2914, "step": 1601 }, { "batch_size": 4, "epoch": 0.6404, "step": 1601, "tokens_per_device": 2624 }, { "epoch": 0.6404, "loss_ce": 0.019181478768587112, "loss_lvr": 0.6564958691596985, "loss_mode_switch": 0.0, "loss_total": 0.08483107388019562, "step": 1601 }, { "batch_size": 4, "epoch": 0.6404, "step": 1601, "tokens_per_device": 11488 }, { "epoch": 0.6404, "loss_ce": 0.061075326055288315, "loss_lvr": 0.8986470699310303, "loss_mode_switch": 0.0, "loss_total": 0.15094003081321716, "step": 1601 }, { "batch_size": 4, "epoch": 0.6404, "step": 1601, "tokens_per_device": 5056 }, { "epoch": 0.6404, "loss_ce": 0.020421458408236504, "loss_lvr": 0.6205730438232422, "loss_mode_switch": 0.0, "loss_total": 0.08247876167297363, "step": 1601 }, { "batch_size": 1, "epoch": 0.6404, "step": 1601, "tokens_per_device": 5123 }, { "epoch": 0.6404, "loss_ce": 0.16528236865997314, "loss_lvr": 0.2592374086380005, "loss_mode_switch": 0.0, "loss_total": 0.19120611250400543, "step": 1601 }, { "batch_size": 4, "epoch": 0.6404, "step": 1601, "tokens_per_device": 8276 }, { "epoch": 0.6404, "loss_ce": 0.014019102789461613, "loss_lvr": 0.7477893233299255, "loss_mode_switch": 0.0, "loss_total": 0.08879803866147995, "step": 1601 }, { "batch_size": 4, "epoch": 0.6404, "step": 1601, "tokens_per_device": 4428 }, { "epoch": 0.6404, "loss_ce": 0.1287238448858261, "loss_lvr": 0.8403516411781311, "loss_mode_switch": 0.0, "loss_total": 0.21275901794433594, "step": 1601 }, { "batch_size": 1, "epoch": 0.6404, "step": 1601, "tokens_per_device": 4887 }, { "epoch": 0.6404, "loss_ce": 0.007303483318537474, "loss_lvr": 0.955378532409668, "loss_mode_switch": 0.0, "loss_total": 0.1028413400053978, "step": 1601 }, { "batch_size": 4, "epoch": 0.6404, "step": 1601, "tokens_per_device": 6108 }, { "epoch": 0.6404, "loss_ce": 0.019153809174895287, "loss_lvr": 0.7214207649230957, "loss_mode_switch": 0.0, "loss_total": 0.09129589051008224, "step": 1601 }, { "epoch": 0.6408, "grad_norm": 1.2955368757247925, "learning_rate": 3.0187200574562605e-06, "loss": 0.3258, "step": 1602 }, { "batch_size": 4, "epoch": 0.6408, "step": 1602, "tokens_per_device": 4844 }, { "epoch": 0.6408, "loss_ce": 0.6428055167198181, "loss_lvr": 0.8455097079277039, "loss_mode_switch": 0.0, "loss_total": 0.727356493473053, "step": 1602 }, { "batch_size": 4, "epoch": 0.6408, "step": 1602, "tokens_per_device": 13868 }, { "epoch": 0.6408, "loss_ce": 0.4780161678791046, "loss_lvr": 0.7735011577606201, "loss_mode_switch": 0.0, "loss_total": 0.5553662776947021, "step": 1602 }, { "batch_size": 1, "epoch": 0.6408, "step": 1602, "tokens_per_device": 4869 }, { "epoch": 0.6408, "loss_ce": 0.018408458679914474, "loss_lvr": 0.15273597836494446, "loss_mode_switch": 0.0, "loss_total": 0.03368205577135086, "step": 1602 }, { "batch_size": 1, "epoch": 0.6408, "step": 1602, "tokens_per_device": 6508 }, { "epoch": 0.6408, "loss_ce": 0.21782219409942627, "loss_lvr": 0.2758176624774933, "loss_mode_switch": 0.0, "loss_total": 0.2454039603471756, "step": 1602 }, { "batch_size": 4, "epoch": 0.6408, "step": 1602, "tokens_per_device": 2740 }, { "epoch": 0.6408, "loss_ce": 0.04750704765319824, "loss_lvr": 0.660912275314331, "loss_mode_switch": 0.0, "loss_total": 0.1135982796549797, "step": 1602 }, { "batch_size": 4, "epoch": 0.6408, "step": 1602, "tokens_per_device": 4244 }, { "epoch": 0.6408, "loss_ce": 0.2937691807746887, "loss_lvr": 1.0078247785568237, "loss_mode_switch": 0.0, "loss_total": 0.39455166459083557, "step": 1602 }, { "batch_size": 4, "epoch": 0.6408, "step": 1602, "tokens_per_device": 5100 }, { "epoch": 0.6408, "loss_ce": 0.25980642437934875, "loss_lvr": 0.6797910332679749, "loss_mode_switch": 0.0, "loss_total": 0.32778552174568176, "step": 1602 }, { "batch_size": 4, "epoch": 0.6408, "step": 1602, "tokens_per_device": 5304 }, { "epoch": 0.6408, "loss_ce": 0.7672567367553711, "loss_lvr": 0.8769934773445129, "loss_mode_switch": 0.0, "loss_total": 0.8549560904502869, "step": 1602 }, { "epoch": 0.6412, "grad_norm": 1.492215871810913, "learning_rate": 3.0127744596645337e-06, "loss": 0.304, "step": 1603 }, { "batch_size": 4, "epoch": 0.6412, "step": 1603, "tokens_per_device": 2584 }, { "epoch": 0.6412, "loss_ce": 0.47394219040870667, "loss_lvr": 0.8674507141113281, "loss_mode_switch": 0.0, "loss_total": 0.560687243938446, "step": 1603 }, { "batch_size": 4, "epoch": 0.6412, "step": 1603, "tokens_per_device": 4364 }, { "epoch": 0.6412, "loss_ce": 0.2267322838306427, "loss_lvr": 0.9108859896659851, "loss_mode_switch": 0.0, "loss_total": 0.31782087683677673, "step": 1603 }, { "batch_size": 4, "epoch": 0.6412, "step": 1603, "tokens_per_device": 4744 }, { "epoch": 0.6412, "loss_ce": 0.8511890769004822, "loss_lvr": 0.7881619930267334, "loss_mode_switch": 0.0, "loss_total": 0.9300052523612976, "step": 1603 }, { "batch_size": 4, "epoch": 0.6412, "step": 1603, "tokens_per_device": 3328 }, { "epoch": 0.6412, "loss_ce": 0.41034698486328125, "loss_lvr": 0.9456780552864075, "loss_mode_switch": 0.0, "loss_total": 0.5049147605895996, "step": 1603 }, { "batch_size": 4, "epoch": 0.6412, "step": 1603, "tokens_per_device": 3924 }, { "epoch": 0.6412, "loss_ce": 0.010481199249625206, "loss_lvr": 0.8916323781013489, "loss_mode_switch": 0.0, "loss_total": 0.09964443743228912, "step": 1603 }, { "batch_size": 1, "epoch": 0.6412, "step": 1603, "tokens_per_device": 5736 }, { "epoch": 0.6412, "loss_ce": 0.03933798149228096, "loss_lvr": 0.24165639281272888, "loss_mode_switch": 0.0, "loss_total": 0.06350362300872803, "step": 1603 }, { "batch_size": 4, "epoch": 0.6412, "step": 1603, "tokens_per_device": 4308 }, { "epoch": 0.6412, "loss_ce": 0.03698953986167908, "loss_lvr": 0.7686617970466614, "loss_mode_switch": 0.0, "loss_total": 0.11385571956634521, "step": 1603 }, { "batch_size": 4, "epoch": 0.6412, "step": 1603, "tokens_per_device": 6416 }, { "epoch": 0.6412, "loss_ce": 0.27463483810424805, "loss_lvr": 0.6518656015396118, "loss_mode_switch": 0.0, "loss_total": 0.33982139825820923, "step": 1603 }, { "epoch": 0.6416, "grad_norm": 1.3372931480407715, "learning_rate": 3.0068321970841484e-06, "loss": 0.3235, "step": 1604 }, { "batch_size": 4, "epoch": 0.6416, "step": 1604, "tokens_per_device": 3760 }, { "epoch": 0.6416, "loss_ce": 0.19192539155483246, "loss_lvr": 0.8259372711181641, "loss_mode_switch": 0.0, "loss_total": 0.2745191156864166, "step": 1604 }, { "batch_size": 4, "epoch": 0.6416, "step": 1604, "tokens_per_device": 3668 }, { "epoch": 0.6416, "loss_ce": 0.2337626814842224, "loss_lvr": 0.6853265762329102, "loss_mode_switch": 0.0, "loss_total": 0.3022953271865845, "step": 1604 }, { "batch_size": 4, "epoch": 0.6416, "step": 1604, "tokens_per_device": 4820 }, { "epoch": 0.6416, "loss_ce": 0.26257359981536865, "loss_lvr": 0.7795693874359131, "loss_mode_switch": 0.0, "loss_total": 0.34053054451942444, "step": 1604 }, { "batch_size": 4, "epoch": 0.6416, "step": 1604, "tokens_per_device": 1260 }, { "epoch": 0.6416, "loss_ce": 0.2820487320423126, "loss_lvr": 1.0592182874679565, "loss_mode_switch": 0.0, "loss_total": 0.38797056674957275, "step": 1604 }, { "batch_size": 4, "epoch": 0.6416, "step": 1604, "tokens_per_device": 6392 }, { "epoch": 0.6416, "loss_ce": 0.018808843567967415, "loss_lvr": 0.6684161424636841, "loss_mode_switch": 0.0, "loss_total": 0.08565045893192291, "step": 1604 }, { "batch_size": 4, "epoch": 0.6416, "step": 1604, "tokens_per_device": 4440 }, { "epoch": 0.6416, "loss_ce": 0.14739538729190826, "loss_lvr": 0.5135524272918701, "loss_mode_switch": 0.0, "loss_total": 0.19875063002109528, "step": 1604 }, { "batch_size": 1, "epoch": 0.6416, "step": 1604, "tokens_per_device": 7505 }, { "epoch": 0.6416, "loss_ce": 0.03888203576207161, "loss_lvr": 0.3038446009159088, "loss_mode_switch": 0.0, "loss_total": 0.06926649808883667, "step": 1604 }, { "batch_size": 4, "epoch": 0.6416, "step": 1604, "tokens_per_device": 3944 }, { "epoch": 0.6416, "loss_ce": 0.2643888294696808, "loss_lvr": 1.2591357231140137, "loss_mode_switch": 0.0, "loss_total": 0.3903024196624756, "step": 1604 }, { "epoch": 0.642, "grad_norm": 1.2998497486114502, "learning_rate": 3.000893279688155e-06, "loss": 0.2766, "step": 1605 }, { "batch_size": 4, "epoch": 0.642, "step": 1605, "tokens_per_device": 1220 }, { "epoch": 0.642, "loss_ce": 0.49743375182151794, "loss_lvr": 0.9316942691802979, "loss_mode_switch": 0.0, "loss_total": 0.5906031727790833, "step": 1605 }, { "batch_size": 1, "epoch": 0.642, "step": 1605, "tokens_per_device": 5117 }, { "epoch": 0.642, "loss_ce": 0.045428354293107986, "loss_lvr": 0.5195272564888, "loss_mode_switch": 0.0, "loss_total": 0.09738108515739441, "step": 1605 }, { "batch_size": 4, "epoch": 0.642, "step": 1605, "tokens_per_device": 1536 }, { "epoch": 0.642, "loss_ce": 0.5139052867889404, "loss_lvr": 0.8342563509941101, "loss_mode_switch": 0.0, "loss_total": 0.5973309278488159, "step": 1605 }, { "batch_size": 1, "epoch": 0.642, "step": 1605, "tokens_per_device": 4770 }, { "epoch": 0.642, "loss_ce": 0.22421133518218994, "loss_lvr": 0.5993093252182007, "loss_mode_switch": 0.0, "loss_total": 0.28414225578308105, "step": 1605 }, { "batch_size": 4, "epoch": 0.642, "step": 1605, "tokens_per_device": 4184 }, { "epoch": 0.642, "loss_ce": 0.5050868988037109, "loss_lvr": 0.8503111004829407, "loss_mode_switch": 0.0, "loss_total": 0.5901179909706116, "step": 1605 }, { "batch_size": 4, "epoch": 0.642, "step": 1605, "tokens_per_device": 3500 }, { "epoch": 0.642, "loss_ce": 0.1320185661315918, "loss_lvr": 0.9391685724258423, "loss_mode_switch": 0.0, "loss_total": 0.2259354293346405, "step": 1605 }, { "batch_size": 4, "epoch": 0.642, "step": 1605, "tokens_per_device": 7260 }, { "epoch": 0.642, "loss_ce": 0.007123258430510759, "loss_lvr": 0.7519802451133728, "loss_mode_switch": 0.0, "loss_total": 0.08232128620147705, "step": 1605 }, { "batch_size": 4, "epoch": 0.642, "step": 1605, "tokens_per_device": 6388 }, { "epoch": 0.642, "loss_ce": 0.3410956561565399, "loss_lvr": 0.8909525871276855, "loss_mode_switch": 0.0, "loss_total": 0.43019092082977295, "step": 1605 }, { "epoch": 0.6424, "grad_norm": 1.4571539163589478, "learning_rate": 2.9949577174439926e-06, "loss": 0.2726, "step": 1606 }, { "batch_size": 4, "epoch": 0.6424, "step": 1606, "tokens_per_device": 14876 }, { "epoch": 0.6424, "loss_ce": 0.24770483374595642, "loss_lvr": 0.48873063921928406, "loss_mode_switch": 0.0, "loss_total": 0.29657790064811707, "step": 1606 }, { "batch_size": 4, "epoch": 0.6424, "step": 1606, "tokens_per_device": 4556 }, { "epoch": 0.6424, "loss_ce": 0.4271893799304962, "loss_lvr": 0.7117484211921692, "loss_mode_switch": 0.0, "loss_total": 0.4983642101287842, "step": 1606 }, { "batch_size": 4, "epoch": 0.6424, "step": 1606, "tokens_per_device": 6188 }, { "epoch": 0.6424, "loss_ce": 0.38842856884002686, "loss_lvr": 0.748124361038208, "loss_mode_switch": 0.0, "loss_total": 0.46324101090431213, "step": 1606 }, { "batch_size": 4, "epoch": 0.6424, "step": 1606, "tokens_per_device": 5852 }, { "epoch": 0.6424, "loss_ce": 0.18281598389148712, "loss_lvr": 0.7582343220710754, "loss_mode_switch": 0.0, "loss_total": 0.2586394250392914, "step": 1606 }, { "batch_size": 4, "epoch": 0.6424, "step": 1606, "tokens_per_device": 6432 }, { "epoch": 0.6424, "loss_ce": 0.23498645424842834, "loss_lvr": 0.8380956649780273, "loss_mode_switch": 0.0, "loss_total": 0.3187960386276245, "step": 1606 }, { "batch_size": 4, "epoch": 0.6424, "step": 1606, "tokens_per_device": 6232 }, { "epoch": 0.6424, "loss_ce": 0.1860646903514862, "loss_lvr": 0.7379872798919678, "loss_mode_switch": 0.0, "loss_total": 0.2598634362220764, "step": 1606 }, { "batch_size": 4, "epoch": 0.6424, "step": 1606, "tokens_per_device": 4424 }, { "epoch": 0.6424, "loss_ce": 0.035273224115371704, "loss_lvr": 0.8153091669082642, "loss_mode_switch": 0.0, "loss_total": 0.11680414527654648, "step": 1606 }, { "batch_size": 4, "epoch": 0.6424, "step": 1606, "tokens_per_device": 5200 }, { "epoch": 0.6424, "loss_ce": 0.6816785931587219, "loss_lvr": 0.6962782740592957, "loss_mode_switch": 0.0, "loss_total": 0.751306414604187, "step": 1606 }, { "epoch": 0.6428, "grad_norm": 1.5099616050720215, "learning_rate": 2.9890255203134622e-06, "loss": 0.3075, "step": 1607 }, { "batch_size": 4, "epoch": 0.6428, "step": 1607, "tokens_per_device": 3764 }, { "epoch": 0.6428, "loss_ce": 0.22791413962841034, "loss_lvr": 0.7357218861579895, "loss_mode_switch": 0.0, "loss_total": 0.3014863133430481, "step": 1607 }, { "batch_size": 4, "epoch": 0.6428, "step": 1607, "tokens_per_device": 4260 }, { "epoch": 0.6428, "loss_ce": 0.07246152311563492, "loss_lvr": 0.7971289753913879, "loss_mode_switch": 0.0, "loss_total": 0.15217441320419312, "step": 1607 }, { "batch_size": 1, "epoch": 0.6428, "step": 1607, "tokens_per_device": 5090 }, { "epoch": 0.6428, "loss_ce": 0.3622591495513916, "loss_lvr": 0.30227968096733093, "loss_mode_switch": 0.0, "loss_total": 0.392487108707428, "step": 1607 }, { "batch_size": 4, "epoch": 0.6428, "step": 1607, "tokens_per_device": 5960 }, { "epoch": 0.6428, "loss_ce": 0.047763630747795105, "loss_lvr": 1.0650538206100464, "loss_mode_switch": 0.0, "loss_total": 0.1542690098285675, "step": 1607 }, { "batch_size": 1, "epoch": 0.6428, "step": 1607, "tokens_per_device": 4876 }, { "epoch": 0.6428, "loss_ce": 0.015443270094692707, "loss_lvr": 0.45945483446121216, "loss_mode_switch": 0.0, "loss_total": 0.06138875335454941, "step": 1607 }, { "batch_size": 4, "epoch": 0.6428, "step": 1607, "tokens_per_device": 6660 }, { "epoch": 0.6428, "loss_ce": 0.23927591741085052, "loss_lvr": 0.7765774130821228, "loss_mode_switch": 0.0, "loss_total": 0.31693366169929504, "step": 1607 }, { "batch_size": 4, "epoch": 0.6428, "step": 1607, "tokens_per_device": 1228 }, { "epoch": 0.6428, "loss_ce": 0.9829427003860474, "loss_lvr": 1.2268376350402832, "loss_mode_switch": 0.0, "loss_total": 1.1056264638900757, "step": 1607 }, { "batch_size": 4, "epoch": 0.6428, "step": 1607, "tokens_per_device": 4928 }, { "epoch": 0.6428, "loss_ce": 0.26935991644859314, "loss_lvr": 0.6380113363265991, "loss_mode_switch": 0.0, "loss_total": 0.33316105604171753, "step": 1607 }, { "epoch": 0.6432, "grad_norm": 1.5689964294433594, "learning_rate": 2.983096698252726e-06, "loss": 0.3526, "step": 1608 }, { "batch_size": 1, "epoch": 0.6432, "step": 1608, "tokens_per_device": 4965 }, { "epoch": 0.6432, "loss_ce": 0.04372892528772354, "loss_lvr": 0.31286612153053284, "loss_mode_switch": 0.0, "loss_total": 0.07501553744077682, "step": 1608 }, { "batch_size": 4, "epoch": 0.6432, "step": 1608, "tokens_per_device": 4104 }, { "epoch": 0.6432, "loss_ce": 0.23092064261436462, "loss_lvr": 0.8701585531234741, "loss_mode_switch": 0.0, "loss_total": 0.317936509847641, "step": 1608 }, { "batch_size": 1, "epoch": 0.6432, "step": 1608, "tokens_per_device": 5205 }, { "epoch": 0.6432, "loss_ce": 0.19941553473472595, "loss_lvr": 0.2956286370754242, "loss_mode_switch": 0.0, "loss_total": 0.22897839546203613, "step": 1608 }, { "batch_size": 4, "epoch": 0.6432, "step": 1608, "tokens_per_device": 2744 }, { "epoch": 0.6432, "loss_ce": 0.02489650249481201, "loss_lvr": 0.7739055752754211, "loss_mode_switch": 0.0, "loss_total": 0.10228706151247025, "step": 1608 }, { "batch_size": 1, "epoch": 0.6432, "step": 1608, "tokens_per_device": 4848 }, { "epoch": 0.6432, "loss_ce": 0.004389461595565081, "loss_lvr": 0.7851600050926208, "loss_mode_switch": 0.0, "loss_total": 0.08290546387434006, "step": 1608 }, { "batch_size": 4, "epoch": 0.6432, "step": 1608, "tokens_per_device": 4072 }, { "epoch": 0.6432, "loss_ce": 0.46842435002326965, "loss_lvr": 0.8770791888237, "loss_mode_switch": 0.0, "loss_total": 0.5561322569847107, "step": 1608 }, { "batch_size": 4, "epoch": 0.6432, "step": 1608, "tokens_per_device": 1336 }, { "epoch": 0.6432, "loss_ce": 0.18862302601337433, "loss_lvr": 0.9554185271263123, "loss_mode_switch": 0.0, "loss_total": 0.2841648757457733, "step": 1608 }, { "batch_size": 4, "epoch": 0.6432, "step": 1608, "tokens_per_device": 3968 }, { "epoch": 0.6432, "loss_ce": 0.04805838689208031, "loss_lvr": 0.819664716720581, "loss_mode_switch": 0.0, "loss_total": 0.13002486526966095, "step": 1608 }, { "epoch": 0.6436, "grad_norm": 1.2182210683822632, "learning_rate": 2.9771712612122765e-06, "loss": 0.2718, "step": 1609 }, { "batch_size": 1, "epoch": 0.6436, "step": 1609, "tokens_per_device": 5087 }, { "epoch": 0.6436, "loss_ce": 0.01121730450540781, "loss_lvr": 0.21833202242851257, "loss_mode_switch": 0.0, "loss_total": 0.03305050730705261, "step": 1609 }, { "batch_size": 1, "epoch": 0.6436, "step": 1609, "tokens_per_device": 4754 }, { "epoch": 0.6436, "loss_ce": 0.0008120875572785735, "loss_lvr": 0.43120983242988586, "loss_mode_switch": 0.0, "loss_total": 0.043933071196079254, "step": 1609 }, { "batch_size": 4, "epoch": 0.6436, "step": 1609, "tokens_per_device": 4236 }, { "epoch": 0.6436, "loss_ce": 0.3957397937774658, "loss_lvr": 1.0862843990325928, "loss_mode_switch": 0.0, "loss_total": 0.504368245601654, "step": 1609 }, { "batch_size": 4, "epoch": 0.6436, "step": 1609, "tokens_per_device": 4108 }, { "epoch": 0.6436, "loss_ce": 0.1942485272884369, "loss_lvr": 0.8200783133506775, "loss_mode_switch": 0.0, "loss_total": 0.27625635266304016, "step": 1609 }, { "batch_size": 4, "epoch": 0.6436, "step": 1609, "tokens_per_device": 1460 }, { "epoch": 0.6436, "loss_ce": 0.679236650466919, "loss_lvr": 0.9246689081192017, "loss_mode_switch": 0.0, "loss_total": 0.7717035412788391, "step": 1609 }, { "batch_size": 1, "epoch": 0.6436, "step": 1609, "tokens_per_device": 4865 }, { "epoch": 0.6436, "loss_ce": 0.021251888945698738, "loss_lvr": 0.47705134749412537, "loss_mode_switch": 0.0, "loss_total": 0.06895702332258224, "step": 1609 }, { "batch_size": 4, "epoch": 0.6436, "step": 1609, "tokens_per_device": 4252 }, { "epoch": 0.6436, "loss_ce": 0.6296020150184631, "loss_lvr": 0.6827077269554138, "loss_mode_switch": 0.0, "loss_total": 0.6978727579116821, "step": 1609 }, { "batch_size": 1, "epoch": 0.6436, "step": 1609, "tokens_per_device": 5203 }, { "epoch": 0.6436, "loss_ce": 0.027483418583869934, "loss_lvr": 0.28152593970298767, "loss_mode_switch": 0.0, "loss_total": 0.05563601106405258, "step": 1609 }, { "epoch": 0.644, "grad_norm": 1.2650147676467896, "learning_rate": 2.9712492191369245e-06, "loss": 0.2625, "step": 1610 }, { "batch_size": 1, "epoch": 0.644, "step": 1610, "tokens_per_device": 5201 }, { "epoch": 0.644, "loss_ce": 0.07058646529912949, "loss_lvr": 0.3434789180755615, "loss_mode_switch": 0.0, "loss_total": 0.10493435710668564, "step": 1610 }, { "batch_size": 4, "epoch": 0.644, "step": 1610, "tokens_per_device": 3972 }, { "epoch": 0.644, "loss_ce": 0.1704065054655075, "loss_lvr": 0.7705516219139099, "loss_mode_switch": 0.0, "loss_total": 0.24746167659759521, "step": 1610 }, { "batch_size": 4, "epoch": 0.644, "step": 1610, "tokens_per_device": 1556 }, { "epoch": 0.644, "loss_ce": 0.4593967795372009, "loss_lvr": 1.1080816984176636, "loss_mode_switch": 0.0, "loss_total": 0.5702049732208252, "step": 1610 }, { "batch_size": 4, "epoch": 0.644, "step": 1610, "tokens_per_device": 5596 }, { "epoch": 0.644, "loss_ce": 0.17855507135391235, "loss_lvr": 0.7917144894599915, "loss_mode_switch": 0.0, "loss_total": 0.2577265202999115, "step": 1610 }, { "batch_size": 4, "epoch": 0.644, "step": 1610, "tokens_per_device": 1216 }, { "epoch": 0.644, "loss_ce": 0.25305044651031494, "loss_lvr": 0.9356812238693237, "loss_mode_switch": 0.0, "loss_total": 0.34661856293678284, "step": 1610 }, { "batch_size": 4, "epoch": 0.644, "step": 1610, "tokens_per_device": 5708 }, { "epoch": 0.644, "loss_ce": 0.2706257402896881, "loss_lvr": 0.8228015303611755, "loss_mode_switch": 0.0, "loss_total": 0.35290589928627014, "step": 1610 }, { "batch_size": 4, "epoch": 0.644, "step": 1610, "tokens_per_device": 4364 }, { "epoch": 0.644, "loss_ce": 0.1307944655418396, "loss_lvr": 0.8939783573150635, "loss_mode_switch": 0.0, "loss_total": 0.2201923131942749, "step": 1610 }, { "batch_size": 4, "epoch": 0.644, "step": 1610, "tokens_per_device": 7772 }, { "epoch": 0.644, "loss_ce": 0.1142277866601944, "loss_lvr": 0.5859076976776123, "loss_mode_switch": 0.0, "loss_total": 0.17281855642795563, "step": 1610 }, { "epoch": 0.6444, "grad_norm": 1.445379614830017, "learning_rate": 2.965330581965786e-06, "loss": 0.3066, "step": 1611 }, { "batch_size": 1, "epoch": 0.6444, "step": 1611, "tokens_per_device": 4877 }, { "epoch": 0.6444, "loss_ce": 0.0874367207288742, "loss_lvr": 0.36468857526779175, "loss_mode_switch": 0.0, "loss_total": 0.12390558421611786, "step": 1611 }, { "batch_size": 1, "epoch": 0.6444, "step": 1611, "tokens_per_device": 4933 }, { "epoch": 0.6444, "loss_ce": 0.016245873644948006, "loss_lvr": 0.4638102352619171, "loss_mode_switch": 0.0, "loss_total": 0.0626268982887268, "step": 1611 }, { "batch_size": 1, "epoch": 0.6444, "step": 1611, "tokens_per_device": 5064 }, { "epoch": 0.6444, "loss_ce": 0.0033922225702553988, "loss_lvr": 0.3691309988498688, "loss_mode_switch": 0.0, "loss_total": 0.04030532389879227, "step": 1611 }, { "batch_size": 4, "epoch": 0.6444, "step": 1611, "tokens_per_device": 3936 }, { "epoch": 0.6444, "loss_ce": 0.27210983633995056, "loss_lvr": 0.9814172387123108, "loss_mode_switch": 0.0, "loss_total": 0.3702515661716461, "step": 1611 }, { "batch_size": 4, "epoch": 0.6444, "step": 1611, "tokens_per_device": 3764 }, { "epoch": 0.6444, "loss_ce": 0.5197420120239258, "loss_lvr": 0.8608062863349915, "loss_mode_switch": 0.0, "loss_total": 0.6058226227760315, "step": 1611 }, { "batch_size": 4, "epoch": 0.6444, "step": 1611, "tokens_per_device": 3944 }, { "epoch": 0.6444, "loss_ce": 0.03454267978668213, "loss_lvr": 0.8615168333053589, "loss_mode_switch": 0.0, "loss_total": 0.1206943616271019, "step": 1611 }, { "batch_size": 4, "epoch": 0.6444, "step": 1611, "tokens_per_device": 3400 }, { "epoch": 0.6444, "loss_ce": 0.36573827266693115, "loss_lvr": 1.240512728691101, "loss_mode_switch": 0.0, "loss_total": 0.48978954553604126, "step": 1611 }, { "batch_size": 1, "epoch": 0.6444, "step": 1611, "tokens_per_device": 4885 }, { "epoch": 0.6444, "loss_ce": 0.2107924073934555, "loss_lvr": 0.2665369510650635, "loss_mode_switch": 0.0, "loss_total": 0.23744609951972961, "step": 1611 }, { "epoch": 0.6448, "grad_norm": 1.788849115371704, "learning_rate": 2.959415359632257e-06, "loss": 0.2873, "step": 1612 }, { "batch_size": 4, "epoch": 0.6448, "step": 1612, "tokens_per_device": 4464 }, { "epoch": 0.6448, "loss_ce": 0.4103335440158844, "loss_lvr": 0.8443180918693542, "loss_mode_switch": 0.0, "loss_total": 0.49476534128189087, "step": 1612 }, { "batch_size": 1, "epoch": 0.6448, "step": 1612, "tokens_per_device": 5973 }, { "epoch": 0.6448, "loss_ce": 0.03521302714943886, "loss_lvr": 0.32793128490448, "loss_mode_switch": 0.0, "loss_total": 0.06800615787506104, "step": 1612 }, { "batch_size": 4, "epoch": 0.6448, "step": 1612, "tokens_per_device": 1552 }, { "epoch": 0.6448, "loss_ce": 0.5101246237754822, "loss_lvr": 0.893369197845459, "loss_mode_switch": 0.0, "loss_total": 0.599461555480957, "step": 1612 }, { "batch_size": 4, "epoch": 0.6448, "step": 1612, "tokens_per_device": 3812 }, { "epoch": 0.6448, "loss_ce": 0.16004930436611176, "loss_lvr": 0.8175866603851318, "loss_mode_switch": 0.0, "loss_total": 0.2418079674243927, "step": 1612 }, { "batch_size": 1, "epoch": 0.6448, "step": 1612, "tokens_per_device": 4864 }, { "epoch": 0.6448, "loss_ce": 0.004129818640649319, "loss_lvr": 0.36086633801460266, "loss_mode_switch": 0.0, "loss_total": 0.04021645337343216, "step": 1612 }, { "batch_size": 4, "epoch": 0.6448, "step": 1612, "tokens_per_device": 2620 }, { "epoch": 0.6448, "loss_ce": 0.2704601585865021, "loss_lvr": 0.9062113761901855, "loss_mode_switch": 0.0, "loss_total": 0.3610813021659851, "step": 1612 }, { "batch_size": 1, "epoch": 0.6448, "step": 1612, "tokens_per_device": 5220 }, { "epoch": 0.6448, "loss_ce": 0.02156406082212925, "loss_lvr": 0.5333095788955688, "loss_mode_switch": 0.0, "loss_total": 0.07489501684904099, "step": 1612 }, { "batch_size": 4, "epoch": 0.6448, "step": 1612, "tokens_per_device": 3760 }, { "epoch": 0.6448, "loss_ce": 0.31527987122535706, "loss_lvr": 0.6193738579750061, "loss_mode_switch": 0.0, "loss_total": 0.37721726298332214, "step": 1612 }, { "epoch": 0.6452, "grad_norm": 1.1925376653671265, "learning_rate": 2.9535035620640117e-06, "loss": 0.2778, "step": 1613 }, { "batch_size": 1, "epoch": 0.6452, "step": 1613, "tokens_per_device": 4870 }, { "epoch": 0.6452, "loss_ce": 0.10549940168857574, "loss_lvr": 0.415401816368103, "loss_mode_switch": 0.0, "loss_total": 0.14703959226608276, "step": 1613 }, { "batch_size": 1, "epoch": 0.6452, "step": 1613, "tokens_per_device": 5978 }, { "epoch": 0.6452, "loss_ce": 0.025654643774032593, "loss_lvr": 0.42113393545150757, "loss_mode_switch": 0.0, "loss_total": 0.06776803731918335, "step": 1613 }, { "batch_size": 4, "epoch": 0.6452, "step": 1613, "tokens_per_device": 4544 }, { "epoch": 0.6452, "loss_ce": 0.3373616635799408, "loss_lvr": 0.6905781626701355, "loss_mode_switch": 0.0, "loss_total": 0.4064194858074188, "step": 1613 }, { "batch_size": 1, "epoch": 0.6452, "step": 1613, "tokens_per_device": 8694 }, { "epoch": 0.6452, "loss_ce": 0.009688960388302803, "loss_lvr": 0.3400381803512573, "loss_mode_switch": 0.0, "loss_total": 0.043692782521247864, "step": 1613 }, { "batch_size": 4, "epoch": 0.6452, "step": 1613, "tokens_per_device": 3948 }, { "epoch": 0.6452, "loss_ce": 0.11758765578269958, "loss_lvr": 0.8422031402587891, "loss_mode_switch": 0.0, "loss_total": 0.20180797576904297, "step": 1613 }, { "batch_size": 4, "epoch": 0.6452, "step": 1613, "tokens_per_device": 2660 }, { "epoch": 0.6452, "loss_ce": 0.5767756700515747, "loss_lvr": 0.8738874197006226, "loss_mode_switch": 0.0, "loss_total": 0.6641644239425659, "step": 1613 }, { "batch_size": 4, "epoch": 0.6452, "step": 1613, "tokens_per_device": 7400 }, { "epoch": 0.6452, "loss_ce": 0.23224246501922607, "loss_lvr": 0.7850275635719299, "loss_mode_switch": 0.0, "loss_total": 0.3107452392578125, "step": 1613 }, { "batch_size": 4, "epoch": 0.6452, "step": 1613, "tokens_per_device": 4200 }, { "epoch": 0.6452, "loss_ce": 0.08910789340734482, "loss_lvr": 0.8820223212242126, "loss_mode_switch": 0.0, "loss_total": 0.17731012403964996, "step": 1613 }, { "epoch": 0.6456, "grad_norm": 1.2515991926193237, "learning_rate": 2.9475951991829676e-06, "loss": 0.267, "step": 1614 }, { "batch_size": 4, "epoch": 0.6456, "step": 1614, "tokens_per_device": 4268 }, { "epoch": 0.6456, "loss_ce": 0.258157879114151, "loss_lvr": 0.9572431445121765, "loss_mode_switch": 0.0, "loss_total": 0.35388219356536865, "step": 1614 }, { "batch_size": 4, "epoch": 0.6456, "step": 1614, "tokens_per_device": 1508 }, { "epoch": 0.6456, "loss_ce": 0.39035117626190186, "loss_lvr": 1.1613049507141113, "loss_mode_switch": 0.0, "loss_total": 0.5064816474914551, "step": 1614 }, { "batch_size": 1, "epoch": 0.6456, "step": 1614, "tokens_per_device": 5096 }, { "epoch": 0.6456, "loss_ce": 0.0006522121839225292, "loss_lvr": 0.35092732310295105, "loss_mode_switch": 0.0, "loss_total": 0.03574494644999504, "step": 1614 }, { "batch_size": 4, "epoch": 0.6456, "step": 1614, "tokens_per_device": 8852 }, { "epoch": 0.6456, "loss_ce": 0.27580708265304565, "loss_lvr": 0.8156121373176575, "loss_mode_switch": 0.0, "loss_total": 0.3573682904243469, "step": 1614 }, { "batch_size": 4, "epoch": 0.6456, "step": 1614, "tokens_per_device": 3828 }, { "epoch": 0.6456, "loss_ce": 0.1960436999797821, "loss_lvr": 0.7896043062210083, "loss_mode_switch": 0.0, "loss_total": 0.27500414848327637, "step": 1614 }, { "batch_size": 4, "epoch": 0.6456, "step": 1614, "tokens_per_device": 14672 }, { "epoch": 0.6456, "loss_ce": 0.7195912599563599, "loss_lvr": 0.8236164450645447, "loss_mode_switch": 0.0, "loss_total": 0.8019528985023499, "step": 1614 }, { "batch_size": 1, "epoch": 0.6456, "step": 1614, "tokens_per_device": 4868 }, { "epoch": 0.6456, "loss_ce": 0.0008203028701245785, "loss_lvr": 0.460875928401947, "loss_mode_switch": 0.0, "loss_total": 0.04690789431333542, "step": 1614 }, { "batch_size": 4, "epoch": 0.6456, "step": 1614, "tokens_per_device": 4692 }, { "epoch": 0.6456, "loss_ce": 0.15090970695018768, "loss_lvr": 0.9183858633041382, "loss_mode_switch": 0.0, "loss_total": 0.24274829030036926, "step": 1614 }, { "epoch": 0.646, "grad_norm": 1.3338751792907715, "learning_rate": 2.9416902809052817e-06, "loss": 0.3099, "step": 1615 }, { "batch_size": 1, "epoch": 0.646, "step": 1615, "tokens_per_device": 6217 }, { "epoch": 0.646, "loss_ce": 0.008090381510555744, "loss_lvr": 0.589674711227417, "loss_mode_switch": 0.0, "loss_total": 0.06705785542726517, "step": 1615 }, { "batch_size": 4, "epoch": 0.646, "step": 1615, "tokens_per_device": 2532 }, { "epoch": 0.646, "loss_ce": 0.49010202288627625, "loss_lvr": 0.8746029734611511, "loss_mode_switch": 0.0, "loss_total": 0.5775623321533203, "step": 1615 }, { "batch_size": 4, "epoch": 0.646, "step": 1615, "tokens_per_device": 4240 }, { "epoch": 0.646, "loss_ce": 0.009641803801059723, "loss_lvr": 0.9473960399627686, "loss_mode_switch": 0.0, "loss_total": 0.10438141226768494, "step": 1615 }, { "batch_size": 4, "epoch": 0.646, "step": 1615, "tokens_per_device": 3424 }, { "epoch": 0.646, "loss_ce": 0.21934574842453003, "loss_lvr": 0.8326421976089478, "loss_mode_switch": 0.0, "loss_total": 0.30260998010635376, "step": 1615 }, { "batch_size": 4, "epoch": 0.646, "step": 1615, "tokens_per_device": 1364 }, { "epoch": 0.646, "loss_ce": 0.7187835574150085, "loss_lvr": 1.0213029384613037, "loss_mode_switch": 0.0, "loss_total": 0.8209138512611389, "step": 1615 }, { "batch_size": 4, "epoch": 0.646, "step": 1615, "tokens_per_device": 2648 }, { "epoch": 0.646, "loss_ce": 0.3934590816497803, "loss_lvr": 0.7697488069534302, "loss_mode_switch": 0.0, "loss_total": 0.47043395042419434, "step": 1615 }, { "batch_size": 4, "epoch": 0.646, "step": 1615, "tokens_per_device": 4052 }, { "epoch": 0.646, "loss_ce": 0.08461392670869827, "loss_lvr": 0.9516130685806274, "loss_mode_switch": 0.0, "loss_total": 0.17977523803710938, "step": 1615 }, { "batch_size": 4, "epoch": 0.646, "step": 1615, "tokens_per_device": 5436 }, { "epoch": 0.646, "loss_ce": 0.50532066822052, "loss_lvr": 0.8944639563560486, "loss_mode_switch": 0.0, "loss_total": 0.5947670936584473, "step": 1615 }, { "epoch": 0.6464, "grad_norm": 1.2391220331192017, "learning_rate": 2.9357888171413273e-06, "loss": 0.2385, "step": 1616 }, { "batch_size": 1, "epoch": 0.6464, "step": 1616, "tokens_per_device": 5342 }, { "epoch": 0.6464, "loss_ce": 0.14866194128990173, "loss_lvr": 0.3064666986465454, "loss_mode_switch": 0.0, "loss_total": 0.17930860817432404, "step": 1616 }, { "batch_size": 4, "epoch": 0.6464, "step": 1616, "tokens_per_device": 4228 }, { "epoch": 0.6464, "loss_ce": 0.36841076612472534, "loss_lvr": 0.8875049948692322, "loss_mode_switch": 0.0, "loss_total": 0.4571612775325775, "step": 1616 }, { "batch_size": 1, "epoch": 0.6464, "step": 1616, "tokens_per_device": 5128 }, { "epoch": 0.6464, "loss_ce": 0.13987939059734344, "loss_lvr": 0.3333280384540558, "loss_mode_switch": 0.0, "loss_total": 0.1732122004032135, "step": 1616 }, { "batch_size": 1, "epoch": 0.6464, "step": 1616, "tokens_per_device": 4915 }, { "epoch": 0.6464, "loss_ce": 0.034871719777584076, "loss_lvr": 0.32151085138320923, "loss_mode_switch": 0.0, "loss_total": 0.06702280044555664, "step": 1616 }, { "batch_size": 4, "epoch": 0.6464, "step": 1616, "tokens_per_device": 4252 }, { "epoch": 0.6464, "loss_ce": 0.13303349912166595, "loss_lvr": 0.9441284537315369, "loss_mode_switch": 0.0, "loss_total": 0.22744634747505188, "step": 1616 }, { "batch_size": 4, "epoch": 0.6464, "step": 1616, "tokens_per_device": 8224 }, { "epoch": 0.6464, "loss_ce": 0.3710033893585205, "loss_lvr": 0.6512603759765625, "loss_mode_switch": 0.0, "loss_total": 0.4361294209957123, "step": 1616 }, { "batch_size": 1, "epoch": 0.6464, "step": 1616, "tokens_per_device": 4904 }, { "epoch": 0.6464, "loss_ce": 0.1284361183643341, "loss_lvr": 0.2413482666015625, "loss_mode_switch": 0.0, "loss_total": 0.1525709480047226, "step": 1616 }, { "batch_size": 1, "epoch": 0.6464, "step": 1616, "tokens_per_device": 5013 }, { "epoch": 0.6464, "loss_ce": 0.04596748575568199, "loss_lvr": 0.7637719511985779, "loss_mode_switch": 0.0, "loss_total": 0.12234468758106232, "step": 1616 }, { "epoch": 0.6468, "grad_norm": 1.4549188613891602, "learning_rate": 2.9298908177956843e-06, "loss": 0.2659, "step": 1617 }, { "batch_size": 1, "epoch": 0.6468, "step": 1617, "tokens_per_device": 5150 }, { "epoch": 0.6468, "loss_ce": 0.0007080481736920774, "loss_lvr": 0.40729087591171265, "loss_mode_switch": 0.0, "loss_total": 0.04143713414669037, "step": 1617 }, { "batch_size": 4, "epoch": 0.6468, "step": 1617, "tokens_per_device": 2060 }, { "epoch": 0.6468, "loss_ce": 0.5218526721000671, "loss_lvr": 0.8891867995262146, "loss_mode_switch": 0.0, "loss_total": 0.6107713580131531, "step": 1617 }, { "batch_size": 4, "epoch": 0.6468, "step": 1617, "tokens_per_device": 3788 }, { "epoch": 0.6468, "loss_ce": 0.7025004625320435, "loss_lvr": 0.9729615449905396, "loss_mode_switch": 0.0, "loss_total": 0.7997966408729553, "step": 1617 }, { "batch_size": 4, "epoch": 0.6468, "step": 1617, "tokens_per_device": 4452 }, { "epoch": 0.6468, "loss_ce": 0.35345572233200073, "loss_lvr": 0.7368815541267395, "loss_mode_switch": 0.0, "loss_total": 0.4271438717842102, "step": 1617 }, { "batch_size": 4, "epoch": 0.6468, "step": 1617, "tokens_per_device": 5736 }, { "epoch": 0.6468, "loss_ce": 0.007158613298088312, "loss_lvr": 1.120474934577942, "loss_mode_switch": 0.0, "loss_total": 0.11920610815286636, "step": 1617 }, { "batch_size": 4, "epoch": 0.6468, "step": 1617, "tokens_per_device": 4376 }, { "epoch": 0.6468, "loss_ce": 0.08078254759311676, "loss_lvr": 0.8416846990585327, "loss_mode_switch": 0.0, "loss_total": 0.16495102643966675, "step": 1617 }, { "batch_size": 4, "epoch": 0.6468, "step": 1617, "tokens_per_device": 5692 }, { "epoch": 0.6468, "loss_ce": 0.18376494944095612, "loss_lvr": 0.8279603719711304, "loss_mode_switch": 0.0, "loss_total": 0.26656097173690796, "step": 1617 }, { "batch_size": 4, "epoch": 0.6468, "step": 1617, "tokens_per_device": 13884 }, { "epoch": 0.6468, "loss_ce": 0.09990575164556503, "loss_lvr": 0.9119536280632019, "loss_mode_switch": 0.0, "loss_total": 0.19110111892223358, "step": 1617 }, { "epoch": 0.6472, "grad_norm": 1.4001553058624268, "learning_rate": 2.923996292767115e-06, "loss": 0.3185, "step": 1618 }, { "batch_size": 4, "epoch": 0.6472, "step": 1618, "tokens_per_device": 3328 }, { "epoch": 0.6472, "loss_ce": 0.19705452024936676, "loss_lvr": 1.5685077905654907, "loss_mode_switch": 0.0, "loss_total": 0.3539053201675415, "step": 1618 }, { "batch_size": 1, "epoch": 0.6472, "step": 1618, "tokens_per_device": 4886 }, { "epoch": 0.6472, "loss_ce": 0.00510065583512187, "loss_lvr": 0.20325075089931488, "loss_mode_switch": 0.0, "loss_total": 0.025425732135772705, "step": 1618 }, { "batch_size": 4, "epoch": 0.6472, "step": 1618, "tokens_per_device": 2664 }, { "epoch": 0.6472, "loss_ce": 0.35345178842544556, "loss_lvr": 0.7288548350334167, "loss_mode_switch": 0.0, "loss_total": 0.42633727192878723, "step": 1618 }, { "batch_size": 1, "epoch": 0.6472, "step": 1618, "tokens_per_device": 5071 }, { "epoch": 0.6472, "loss_ce": 0.002773196902126074, "loss_lvr": 0.7410208582878113, "loss_mode_switch": 0.0, "loss_total": 0.07687528431415558, "step": 1618 }, { "batch_size": 1, "epoch": 0.6472, "step": 1618, "tokens_per_device": 5282 }, { "epoch": 0.6472, "loss_ce": 0.1759338229894638, "loss_lvr": 0.3045307397842407, "loss_mode_switch": 0.0, "loss_total": 0.20638689398765564, "step": 1618 }, { "batch_size": 4, "epoch": 0.6472, "step": 1618, "tokens_per_device": 4300 }, { "epoch": 0.6472, "loss_ce": 0.11985595524311066, "loss_lvr": 1.0125319957733154, "loss_mode_switch": 0.0, "loss_total": 0.22110915184020996, "step": 1618 }, { "batch_size": 1, "epoch": 0.6472, "step": 1618, "tokens_per_device": 5070 }, { "epoch": 0.6472, "loss_ce": 0.04171523079276085, "loss_lvr": 0.38689860701560974, "loss_mode_switch": 0.0, "loss_total": 0.080405093729496, "step": 1618 }, { "batch_size": 4, "epoch": 0.6472, "step": 1618, "tokens_per_device": 4192 }, { "epoch": 0.6472, "loss_ce": 0.34286412596702576, "loss_lvr": 0.8850200176239014, "loss_mode_switch": 0.0, "loss_total": 0.4313661456108093, "step": 1618 }, { "epoch": 0.6476, "grad_norm": 1.3282076120376587, "learning_rate": 2.9181052519485496e-06, "loss": 0.2902, "step": 1619 }, { "batch_size": 4, "epoch": 0.6476, "step": 1619, "tokens_per_device": 3456 }, { "epoch": 0.6476, "loss_ce": 0.35489892959594727, "loss_lvr": 1.0108393430709839, "loss_mode_switch": 0.0, "loss_total": 0.45598286390304565, "step": 1619 }, { "batch_size": 4, "epoch": 0.6476, "step": 1619, "tokens_per_device": 5048 }, { "epoch": 0.6476, "loss_ce": 0.019691575318574905, "loss_lvr": 0.6401907801628113, "loss_mode_switch": 0.0, "loss_total": 0.08371065557003021, "step": 1619 }, { "batch_size": 4, "epoch": 0.6476, "step": 1619, "tokens_per_device": 2656 }, { "epoch": 0.6476, "loss_ce": 0.33666762709617615, "loss_lvr": 0.7560154795646667, "loss_mode_switch": 0.0, "loss_total": 0.4122691750526428, "step": 1619 }, { "batch_size": 4, "epoch": 0.6476, "step": 1619, "tokens_per_device": 4208 }, { "epoch": 0.6476, "loss_ce": 0.048961713910102844, "loss_lvr": 0.7843494415283203, "loss_mode_switch": 0.0, "loss_total": 0.12739665806293488, "step": 1619 }, { "batch_size": 4, "epoch": 0.6476, "step": 1619, "tokens_per_device": 4236 }, { "epoch": 0.6476, "loss_ce": 0.193569615483284, "loss_lvr": 0.8498072028160095, "loss_mode_switch": 0.0, "loss_total": 0.27855032682418823, "step": 1619 }, { "batch_size": 4, "epoch": 0.6476, "step": 1619, "tokens_per_device": 4268 }, { "epoch": 0.6476, "loss_ce": 0.3604724705219269, "loss_lvr": 1.0698035955429077, "loss_mode_switch": 0.0, "loss_total": 0.4674528241157532, "step": 1619 }, { "batch_size": 1, "epoch": 0.6476, "step": 1619, "tokens_per_device": 4867 }, { "epoch": 0.6476, "loss_ce": 0.024936523288488388, "loss_lvr": 0.8422082662582397, "loss_mode_switch": 0.0, "loss_total": 0.10915735363960266, "step": 1619 }, { "batch_size": 1, "epoch": 0.6476, "step": 1619, "tokens_per_device": 4841 }, { "epoch": 0.6476, "loss_ce": 0.005096354987472296, "loss_lvr": 0.691006600856781, "loss_mode_switch": 0.0, "loss_total": 0.07419701665639877, "step": 1619 }, { "epoch": 0.648, "grad_norm": 1.1099252700805664, "learning_rate": 2.912217705227075e-06, "loss": 0.272, "step": 1620 }, { "batch_size": 4, "epoch": 0.648, "step": 1620, "tokens_per_device": 2572 }, { "epoch": 0.648, "loss_ce": 0.17422613501548767, "loss_lvr": 1.1411809921264648, "loss_mode_switch": 0.0, "loss_total": 0.28834423422813416, "step": 1620 }, { "batch_size": 1, "epoch": 0.648, "step": 1620, "tokens_per_device": 4869 }, { "epoch": 0.648, "loss_ce": 0.0014655160484835505, "loss_lvr": 0.23408865928649902, "loss_mode_switch": 0.0, "loss_total": 0.024874381721019745, "step": 1620 }, { "batch_size": 4, "epoch": 0.648, "step": 1620, "tokens_per_device": 4664 }, { "epoch": 0.648, "loss_ce": 0.4081834852695465, "loss_lvr": 0.9342501759529114, "loss_mode_switch": 0.0, "loss_total": 0.5016084909439087, "step": 1620 }, { "batch_size": 4, "epoch": 0.648, "step": 1620, "tokens_per_device": 4264 }, { "epoch": 0.648, "loss_ce": 0.31075629591941833, "loss_lvr": 0.8320325016975403, "loss_mode_switch": 0.0, "loss_total": 0.39395955204963684, "step": 1620 }, { "batch_size": 4, "epoch": 0.648, "step": 1620, "tokens_per_device": 5568 }, { "epoch": 0.648, "loss_ce": 0.33929887413978577, "loss_lvr": 0.7501232624053955, "loss_mode_switch": 0.0, "loss_total": 0.4143112003803253, "step": 1620 }, { "batch_size": 1, "epoch": 0.648, "step": 1620, "tokens_per_device": 4887 }, { "epoch": 0.648, "loss_ce": 0.0003071320243179798, "loss_lvr": 0.36027759313583374, "loss_mode_switch": 0.0, "loss_total": 0.036334890872240067, "step": 1620 }, { "batch_size": 4, "epoch": 0.648, "step": 1620, "tokens_per_device": 4116 }, { "epoch": 0.648, "loss_ce": 0.38987112045288086, "loss_lvr": 0.6676650047302246, "loss_mode_switch": 0.0, "loss_total": 0.4566376209259033, "step": 1620 }, { "batch_size": 4, "epoch": 0.648, "step": 1620, "tokens_per_device": 4868 }, { "epoch": 0.648, "loss_ce": 0.14885756373405457, "loss_lvr": 0.7560780048370361, "loss_mode_switch": 0.0, "loss_total": 0.22446537017822266, "step": 1620 }, { "epoch": 0.6484, "grad_norm": 1.2837637662887573, "learning_rate": 2.9063336624839065e-06, "loss": 0.2589, "step": 1621 }, { "batch_size": 1, "epoch": 0.6484, "step": 1621, "tokens_per_device": 7045 }, { "epoch": 0.6484, "loss_ce": 0.0004952976596541703, "loss_lvr": 0.3278440535068512, "loss_mode_switch": 0.0, "loss_total": 0.03327970206737518, "step": 1621 }, { "batch_size": 4, "epoch": 0.6484, "step": 1621, "tokens_per_device": 4272 }, { "epoch": 0.6484, "loss_ce": 0.4084271788597107, "loss_lvr": 0.8023167848587036, "loss_mode_switch": 0.0, "loss_total": 0.4886588454246521, "step": 1621 }, { "batch_size": 1, "epoch": 0.6484, "step": 1621, "tokens_per_device": 5201 }, { "epoch": 0.6484, "loss_ce": 0.0779605284333229, "loss_lvr": 0.23196052014827728, "loss_mode_switch": 0.0, "loss_total": 0.1011565774679184, "step": 1621 }, { "batch_size": 4, "epoch": 0.6484, "step": 1621, "tokens_per_device": 1460 }, { "epoch": 0.6484, "loss_ce": 0.4857769012451172, "loss_lvr": 1.0352290868759155, "loss_mode_switch": 0.0, "loss_total": 0.5892997980117798, "step": 1621 }, { "batch_size": 4, "epoch": 0.6484, "step": 1621, "tokens_per_device": 3960 }, { "epoch": 0.6484, "loss_ce": 0.7697803974151611, "loss_lvr": 0.8417597413063049, "loss_mode_switch": 0.0, "loss_total": 0.8539563417434692, "step": 1621 }, { "batch_size": 1, "epoch": 0.6484, "step": 1621, "tokens_per_device": 4732 }, { "epoch": 0.6484, "loss_ce": 0.3657975196838379, "loss_lvr": 0.7865281701087952, "loss_mode_switch": 0.0, "loss_total": 0.44445034861564636, "step": 1621 }, { "batch_size": 1, "epoch": 0.6484, "step": 1621, "tokens_per_device": 5108 }, { "epoch": 0.6484, "loss_ce": 0.014910436235368252, "loss_lvr": 0.7484362721443176, "loss_mode_switch": 0.0, "loss_total": 0.08975406736135483, "step": 1621 }, { "batch_size": 4, "epoch": 0.6484, "step": 1621, "tokens_per_device": 1744 }, { "epoch": 0.6484, "loss_ce": 0.6766891479492188, "loss_lvr": 0.9336423277854919, "loss_mode_switch": 0.0, "loss_total": 0.7700533866882324, "step": 1621 }, { "epoch": 0.6488, "grad_norm": 1.3566253185272217, "learning_rate": 2.9004531335943865e-06, "loss": 0.3338, "step": 1622 }, { "batch_size": 1, "epoch": 0.6488, "step": 1622, "tokens_per_device": 4921 }, { "epoch": 0.6488, "loss_ce": 0.5874868035316467, "loss_lvr": 0.36911115050315857, "loss_mode_switch": 0.0, "loss_total": 0.6243979334831238, "step": 1622 }, { "batch_size": 4, "epoch": 0.6488, "step": 1622, "tokens_per_device": 4476 }, { "epoch": 0.6488, "loss_ce": 0.0427359864115715, "loss_lvr": 1.557604193687439, "loss_mode_switch": 0.0, "loss_total": 0.19849640130996704, "step": 1622 }, { "batch_size": 4, "epoch": 0.6488, "step": 1622, "tokens_per_device": 5812 }, { "epoch": 0.6488, "loss_ce": 0.07598055154085159, "loss_lvr": 0.7463034391403198, "loss_mode_switch": 0.0, "loss_total": 0.15061089396476746, "step": 1622 }, { "batch_size": 4, "epoch": 0.6488, "step": 1622, "tokens_per_device": 5164 }, { "epoch": 0.6488, "loss_ce": 0.038290828466415405, "loss_lvr": 0.7264200448989868, "loss_mode_switch": 0.0, "loss_total": 0.1109328344464302, "step": 1622 }, { "batch_size": 4, "epoch": 0.6488, "step": 1622, "tokens_per_device": 4964 }, { "epoch": 0.6488, "loss_ce": 0.06983042508363724, "loss_lvr": 0.8180214166641235, "loss_mode_switch": 0.0, "loss_total": 0.15163257718086243, "step": 1622 }, { "batch_size": 4, "epoch": 0.6488, "step": 1622, "tokens_per_device": 4436 }, { "epoch": 0.6488, "loss_ce": 0.18266533315181732, "loss_lvr": 1.332446813583374, "loss_mode_switch": 0.0, "loss_total": 0.3159100115299225, "step": 1622 }, { "batch_size": 4, "epoch": 0.6488, "step": 1622, "tokens_per_device": 1328 }, { "epoch": 0.6488, "loss_ce": 0.5231791734695435, "loss_lvr": 0.9490809440612793, "loss_mode_switch": 0.0, "loss_total": 0.6180872917175293, "step": 1622 }, { "batch_size": 4, "epoch": 0.6488, "step": 1622, "tokens_per_device": 3792 }, { "epoch": 0.6488, "loss_ce": 0.21891340613365173, "loss_lvr": 1.0710371732711792, "loss_mode_switch": 0.0, "loss_total": 0.3260171413421631, "step": 1622 }, { "epoch": 0.6492, "grad_norm": 1.3694345951080322, "learning_rate": 2.8945761284279583e-06, "loss": 0.3127, "step": 1623 }, { "batch_size": 4, "epoch": 0.6492, "step": 1623, "tokens_per_device": 1340 }, { "epoch": 0.6492, "loss_ce": 0.38778209686279297, "loss_lvr": 0.9134658575057983, "loss_mode_switch": 0.0, "loss_total": 0.4791286885738373, "step": 1623 }, { "batch_size": 1, "epoch": 0.6492, "step": 1623, "tokens_per_device": 4758 }, { "epoch": 0.6492, "loss_ce": 0.11364767700433731, "loss_lvr": 0.28547126054763794, "loss_mode_switch": 0.0, "loss_total": 0.14219480752944946, "step": 1623 }, { "batch_size": 1, "epoch": 0.6492, "step": 1623, "tokens_per_device": 4262 }, { "epoch": 0.6492, "loss_ce": 0.03770352527499199, "loss_lvr": 0.26301854848861694, "loss_mode_switch": 0.0, "loss_total": 0.06400538235902786, "step": 1623 }, { "batch_size": 4, "epoch": 0.6492, "step": 1623, "tokens_per_device": 1416 }, { "epoch": 0.6492, "loss_ce": 0.3050590753555298, "loss_lvr": 0.8402259349822998, "loss_mode_switch": 0.0, "loss_total": 0.3890816569328308, "step": 1623 }, { "batch_size": 4, "epoch": 0.6492, "step": 1623, "tokens_per_device": 4256 }, { "epoch": 0.6492, "loss_ce": 0.06914196163415909, "loss_lvr": 1.0573583841323853, "loss_mode_switch": 0.0, "loss_total": 0.17487779259681702, "step": 1623 }, { "batch_size": 1, "epoch": 0.6492, "step": 1623, "tokens_per_device": 4697 }, { "epoch": 0.6492, "loss_ce": 0.023941632360219955, "loss_lvr": 0.16076071560382843, "loss_mode_switch": 0.0, "loss_total": 0.04001770168542862, "step": 1623 }, { "batch_size": 1, "epoch": 0.6492, "step": 1623, "tokens_per_device": 4636 }, { "epoch": 0.6492, "loss_ce": 0.1309938132762909, "loss_lvr": 0.3873966336250305, "loss_mode_switch": 0.0, "loss_total": 0.16973347961902618, "step": 1623 }, { "batch_size": 4, "epoch": 0.6492, "step": 1623, "tokens_per_device": 5112 }, { "epoch": 0.6492, "loss_ce": 0.1663387417793274, "loss_lvr": 0.7795965075492859, "loss_mode_switch": 0.0, "loss_total": 0.24429839849472046, "step": 1623 }, { "epoch": 0.6496, "grad_norm": 1.495842456817627, "learning_rate": 2.888702656848147e-06, "loss": 0.2552, "step": 1624 }, { "batch_size": 4, "epoch": 0.6496, "step": 1624, "tokens_per_device": 5980 }, { "epoch": 0.6496, "loss_ce": 0.2950081527233124, "loss_lvr": 0.7727500796318054, "loss_mode_switch": 0.0, "loss_total": 0.3722831606864929, "step": 1624 }, { "batch_size": 4, "epoch": 0.6496, "step": 1624, "tokens_per_device": 2852 }, { "epoch": 0.6496, "loss_ce": 0.3865368068218231, "loss_lvr": 0.5482888221740723, "loss_mode_switch": 0.0, "loss_total": 0.44136568903923035, "step": 1624 }, { "batch_size": 4, "epoch": 0.6496, "step": 1624, "tokens_per_device": 13316 }, { "epoch": 0.6496, "loss_ce": 0.40128037333488464, "loss_lvr": 0.716543436050415, "loss_mode_switch": 0.0, "loss_total": 0.4729347229003906, "step": 1624 }, { "batch_size": 1, "epoch": 0.6496, "step": 1624, "tokens_per_device": 5122 }, { "epoch": 0.6496, "loss_ce": 0.1475035399198532, "loss_lvr": 0.13848266005516052, "loss_mode_switch": 0.0, "loss_total": 0.16135179996490479, "step": 1624 }, { "batch_size": 1, "epoch": 0.6496, "step": 1624, "tokens_per_device": 5434 }, { "epoch": 0.6496, "loss_ce": 1.6674342155456543, "loss_lvr": 0.5560545325279236, "loss_mode_switch": 0.0, "loss_total": 1.7230396270751953, "step": 1624 }, { "batch_size": 4, "epoch": 0.6496, "step": 1624, "tokens_per_device": 9620 }, { "epoch": 0.6496, "loss_ce": 0.00845591351389885, "loss_lvr": 0.9564208388328552, "loss_mode_switch": 0.0, "loss_total": 0.10409799218177795, "step": 1624 }, { "batch_size": 4, "epoch": 0.6496, "step": 1624, "tokens_per_device": 4452 }, { "epoch": 0.6496, "loss_ce": 0.26011109352111816, "loss_lvr": 0.8281958699226379, "loss_mode_switch": 0.0, "loss_total": 0.3429306745529175, "step": 1624 }, { "batch_size": 4, "epoch": 0.6496, "step": 1624, "tokens_per_device": 4584 }, { "epoch": 0.6496, "loss_ce": 0.0018881482537835836, "loss_lvr": 0.6138813495635986, "loss_mode_switch": 0.0, "loss_total": 0.06327628344297409, "step": 1624 }, { "epoch": 0.65, "grad_norm": 1.4248390197753906, "learning_rate": 2.882832728712551e-06, "loss": 0.2918, "step": 1625 }, { "batch_size": 4, "epoch": 0.65, "step": 1625, "tokens_per_device": 9264 }, { "epoch": 0.65, "loss_ce": 0.25638481974601746, "loss_lvr": 0.8051568865776062, "loss_mode_switch": 0.0, "loss_total": 0.3369005024433136, "step": 1625 }, { "batch_size": 4, "epoch": 0.65, "step": 1625, "tokens_per_device": 4740 }, { "epoch": 0.65, "loss_ce": 0.4348854422569275, "loss_lvr": 0.7948704957962036, "loss_mode_switch": 0.0, "loss_total": 0.5143724679946899, "step": 1625 }, { "batch_size": 4, "epoch": 0.65, "step": 1625, "tokens_per_device": 1480 }, { "epoch": 0.65, "loss_ce": 0.37275993824005127, "loss_lvr": 1.4213194847106934, "loss_mode_switch": 0.0, "loss_total": 0.5148918628692627, "step": 1625 }, { "batch_size": 4, "epoch": 0.65, "step": 1625, "tokens_per_device": 1640 }, { "epoch": 0.65, "loss_ce": 0.2843082845211029, "loss_lvr": 0.9201458692550659, "loss_mode_switch": 0.0, "loss_total": 0.376322865486145, "step": 1625 }, { "batch_size": 4, "epoch": 0.65, "step": 1625, "tokens_per_device": 6100 }, { "epoch": 0.65, "loss_ce": 0.1906706988811493, "loss_lvr": 0.7294780015945435, "loss_mode_switch": 0.0, "loss_total": 0.26361849904060364, "step": 1625 }, { "batch_size": 4, "epoch": 0.65, "step": 1625, "tokens_per_device": 4076 }, { "epoch": 0.65, "loss_ce": 0.008314824663102627, "loss_lvr": 0.6051627993583679, "loss_mode_switch": 0.0, "loss_total": 0.06883110105991364, "step": 1625 }, { "batch_size": 4, "epoch": 0.65, "step": 1625, "tokens_per_device": 4276 }, { "epoch": 0.65, "loss_ce": 0.23805999755859375, "loss_lvr": 0.8055477738380432, "loss_mode_switch": 0.0, "loss_total": 0.31861478090286255, "step": 1625 }, { "batch_size": 1, "epoch": 0.65, "step": 1625, "tokens_per_device": 5152 }, { "epoch": 0.65, "loss_ce": 0.0006177930627018213, "loss_lvr": 0.354439914226532, "loss_mode_switch": 0.0, "loss_total": 0.036061786115169525, "step": 1625 }, { "epoch": 0.6504, "grad_norm": 1.2893093824386597, "learning_rate": 2.8769663538728174e-06, "loss": 0.287, "step": 1626 }, { "batch_size": 1, "epoch": 0.6504, "step": 1626, "tokens_per_device": 5178 }, { "epoch": 0.6504, "loss_ce": 0.005748154129832983, "loss_lvr": 0.5064001083374023, "loss_mode_switch": 0.0, "loss_total": 0.056388165801763535, "step": 1626 }, { "batch_size": 1, "epoch": 0.6504, "step": 1626, "tokens_per_device": 5515 }, { "epoch": 0.6504, "loss_ce": 0.0006037302664481103, "loss_lvr": 0.48609012365341187, "loss_mode_switch": 0.0, "loss_total": 0.049212746322155, "step": 1626 }, { "batch_size": 4, "epoch": 0.6504, "step": 1626, "tokens_per_device": 2192 }, { "epoch": 0.6504, "loss_ce": 0.7049789428710938, "loss_lvr": 0.926419734954834, "loss_mode_switch": 0.0, "loss_total": 0.7976208925247192, "step": 1626 }, { "batch_size": 4, "epoch": 0.6504, "step": 1626, "tokens_per_device": 7944 }, { "epoch": 0.6504, "loss_ce": 0.05820620805025101, "loss_lvr": 0.8028052449226379, "loss_mode_switch": 0.0, "loss_total": 0.13848674297332764, "step": 1626 }, { "batch_size": 4, "epoch": 0.6504, "step": 1626, "tokens_per_device": 6128 }, { "epoch": 0.6504, "loss_ce": 0.3948117196559906, "loss_lvr": 0.6448583602905273, "loss_mode_switch": 0.0, "loss_total": 0.4592975676059723, "step": 1626 }, { "batch_size": 1, "epoch": 0.6504, "step": 1626, "tokens_per_device": 5168 }, { "epoch": 0.6504, "loss_ce": 0.07084733247756958, "loss_lvr": 0.242149218916893, "loss_mode_switch": 0.0, "loss_total": 0.095062255859375, "step": 1626 }, { "batch_size": 1, "epoch": 0.6504, "step": 1626, "tokens_per_device": 4857 }, { "epoch": 0.6504, "loss_ce": 0.0014306548982858658, "loss_lvr": 0.5901798605918884, "loss_mode_switch": 0.0, "loss_total": 0.06044863909482956, "step": 1626 }, { "batch_size": 1, "epoch": 0.6504, "step": 1626, "tokens_per_device": 4883 }, { "epoch": 0.6504, "loss_ce": 0.009982481598854065, "loss_lvr": 0.7208656668663025, "loss_mode_switch": 0.0, "loss_total": 0.0820690467953682, "step": 1626 }, { "epoch": 0.6508, "grad_norm": 1.2706869840621948, "learning_rate": 2.871103542174637e-06, "loss": 0.2696, "step": 1627 }, { "batch_size": 4, "epoch": 0.6508, "step": 1627, "tokens_per_device": 4504 }, { "epoch": 0.6508, "loss_ce": 0.6060007214546204, "loss_lvr": 0.7462708950042725, "loss_mode_switch": 0.0, "loss_total": 0.6806278228759766, "step": 1627 }, { "batch_size": 4, "epoch": 0.6508, "step": 1627, "tokens_per_device": 3720 }, { "epoch": 0.6508, "loss_ce": 0.12111838161945343, "loss_lvr": 0.877410352230072, "loss_mode_switch": 0.0, "loss_total": 0.2088594138622284, "step": 1627 }, { "batch_size": 4, "epoch": 0.6508, "step": 1627, "tokens_per_device": 3444 }, { "epoch": 0.6508, "loss_ce": 0.16972056031227112, "loss_lvr": 0.9657289385795593, "loss_mode_switch": 0.0, "loss_total": 0.266293466091156, "step": 1627 }, { "batch_size": 4, "epoch": 0.6508, "step": 1627, "tokens_per_device": 7108 }, { "epoch": 0.6508, "loss_ce": 0.02433539181947708, "loss_lvr": 0.9149144291877747, "loss_mode_switch": 0.0, "loss_total": 0.11582683771848679, "step": 1627 }, { "batch_size": 1, "epoch": 0.6508, "step": 1627, "tokens_per_device": 5148 }, { "epoch": 0.6508, "loss_ce": 0.0018173150019720197, "loss_lvr": 0.4357112646102905, "loss_mode_switch": 0.0, "loss_total": 0.045388441532850266, "step": 1627 }, { "batch_size": 4, "epoch": 0.6508, "step": 1627, "tokens_per_device": 4716 }, { "epoch": 0.6508, "loss_ce": 0.2119879275560379, "loss_lvr": 0.6453949809074402, "loss_mode_switch": 0.0, "loss_total": 0.27652743458747864, "step": 1627 }, { "batch_size": 4, "epoch": 0.6508, "step": 1627, "tokens_per_device": 4236 }, { "epoch": 0.6508, "loss_ce": 0.12104865163564682, "loss_lvr": 0.8764124512672424, "loss_mode_switch": 0.0, "loss_total": 0.20868989825248718, "step": 1627 }, { "batch_size": 4, "epoch": 0.6508, "step": 1627, "tokens_per_device": 1480 }, { "epoch": 0.6508, "loss_ce": 0.13399319350719452, "loss_lvr": 0.8901268243789673, "loss_mode_switch": 0.0, "loss_total": 0.22300587594509125, "step": 1627 }, { "epoch": 0.6512, "grad_norm": 1.4976621866226196, "learning_rate": 2.865244303457715e-06, "loss": 0.2969, "step": 1628 }, { "batch_size": 4, "epoch": 0.6512, "step": 1628, "tokens_per_device": 1572 }, { "epoch": 0.6512, "loss_ce": 0.691779375076294, "loss_lvr": 0.8155203461647034, "loss_mode_switch": 0.0, "loss_total": 0.7733314037322998, "step": 1628 }, { "batch_size": 4, "epoch": 0.6512, "step": 1628, "tokens_per_device": 5580 }, { "epoch": 0.6512, "loss_ce": 0.023395534604787827, "loss_lvr": 0.8285338282585144, "loss_mode_switch": 0.0, "loss_total": 0.10624891519546509, "step": 1628 }, { "batch_size": 4, "epoch": 0.6512, "step": 1628, "tokens_per_device": 4268 }, { "epoch": 0.6512, "loss_ce": 0.1951792687177658, "loss_lvr": 1.007744312286377, "loss_mode_switch": 0.0, "loss_total": 0.2959536910057068, "step": 1628 }, { "batch_size": 4, "epoch": 0.6512, "step": 1628, "tokens_per_device": 5300 }, { "epoch": 0.6512, "loss_ce": 0.33218398690223694, "loss_lvr": 1.3896178007125854, "loss_mode_switch": 0.0, "loss_total": 0.47114574909210205, "step": 1628 }, { "batch_size": 1, "epoch": 0.6512, "step": 1628, "tokens_per_device": 4969 }, { "epoch": 0.6512, "loss_ce": 0.11076965928077698, "loss_lvr": 0.3695153295993805, "loss_mode_switch": 0.0, "loss_total": 0.14772120118141174, "step": 1628 }, { "batch_size": 1, "epoch": 0.6512, "step": 1628, "tokens_per_device": 4733 }, { "epoch": 0.6512, "loss_ce": 0.4619826376438141, "loss_lvr": 0.4322293996810913, "loss_mode_switch": 0.0, "loss_total": 0.5052055716514587, "step": 1628 }, { "batch_size": 1, "epoch": 0.6512, "step": 1628, "tokens_per_device": 4900 }, { "epoch": 0.6512, "loss_ce": 0.006362765561789274, "loss_lvr": 0.24981217086315155, "loss_mode_switch": 0.0, "loss_total": 0.0313439816236496, "step": 1628 }, { "batch_size": 1, "epoch": 0.6512, "step": 1628, "tokens_per_device": 5927 }, { "epoch": 0.6512, "loss_ce": 0.001055686385370791, "loss_lvr": 0.3022158741950989, "loss_mode_switch": 0.0, "loss_total": 0.03127727285027504, "step": 1628 }, { "epoch": 0.6516, "grad_norm": 1.4336460828781128, "learning_rate": 2.859388647555762e-06, "loss": 0.3052, "step": 1629 }, { "batch_size": 4, "epoch": 0.6516, "step": 1629, "tokens_per_device": 5564 }, { "epoch": 0.6516, "loss_ce": 0.13967657089233398, "loss_lvr": 0.6481732726097107, "loss_mode_switch": 0.0, "loss_total": 0.204493910074234, "step": 1629 }, { "batch_size": 4, "epoch": 0.6516, "step": 1629, "tokens_per_device": 1544 }, { "epoch": 0.6516, "loss_ce": 0.5063947439193726, "loss_lvr": 1.1482642889022827, "loss_mode_switch": 0.0, "loss_total": 0.6212211847305298, "step": 1629 }, { "batch_size": 4, "epoch": 0.6516, "step": 1629, "tokens_per_device": 1576 }, { "epoch": 0.6516, "loss_ce": 1.0105950832366943, "loss_lvr": 0.8984717130661011, "loss_mode_switch": 0.0, "loss_total": 1.1004422903060913, "step": 1629 }, { "batch_size": 4, "epoch": 0.6516, "step": 1629, "tokens_per_device": 5552 }, { "epoch": 0.6516, "loss_ce": 0.10855800658464432, "loss_lvr": 0.8929734230041504, "loss_mode_switch": 0.0, "loss_total": 0.19785535335540771, "step": 1629 }, { "batch_size": 4, "epoch": 0.6516, "step": 1629, "tokens_per_device": 4328 }, { "epoch": 0.6516, "loss_ce": 0.1553628146648407, "loss_lvr": 0.9577675461769104, "loss_mode_switch": 0.0, "loss_total": 0.2511395812034607, "step": 1629 }, { "batch_size": 4, "epoch": 0.6516, "step": 1629, "tokens_per_device": 4700 }, { "epoch": 0.6516, "loss_ce": 0.0005542756407521665, "loss_lvr": 0.7601543664932251, "loss_mode_switch": 0.0, "loss_total": 0.07656971365213394, "step": 1629 }, { "batch_size": 4, "epoch": 0.6516, "step": 1629, "tokens_per_device": 4488 }, { "epoch": 0.6516, "loss_ce": 0.25325852632522583, "loss_lvr": 0.6771829128265381, "loss_mode_switch": 0.0, "loss_total": 0.3209768235683441, "step": 1629 }, { "batch_size": 4, "epoch": 0.6516, "step": 1629, "tokens_per_device": 2588 }, { "epoch": 0.6516, "loss_ce": 0.29867836833000183, "loss_lvr": 0.9308326840400696, "loss_mode_switch": 0.0, "loss_total": 0.3917616307735443, "step": 1629 }, { "epoch": 0.652, "grad_norm": 1.2672892808914185, "learning_rate": 2.8535365842964713e-06, "loss": 0.3117, "step": 1630 }, { "batch_size": 4, "epoch": 0.652, "step": 1630, "tokens_per_device": 4228 }, { "epoch": 0.652, "loss_ce": 0.16872556507587433, "loss_lvr": 0.7064493298530579, "loss_mode_switch": 0.0, "loss_total": 0.23937049508094788, "step": 1630 }, { "batch_size": 4, "epoch": 0.652, "step": 1630, "tokens_per_device": 2752 }, { "epoch": 0.652, "loss_ce": 0.7033188343048096, "loss_lvr": 0.9494360685348511, "loss_mode_switch": 0.0, "loss_total": 0.7982624173164368, "step": 1630 }, { "batch_size": 1, "epoch": 0.652, "step": 1630, "tokens_per_device": 4745 }, { "epoch": 0.652, "loss_ce": 0.02116125449538231, "loss_lvr": 0.2909349203109741, "loss_mode_switch": 0.0, "loss_total": 0.05025474727153778, "step": 1630 }, { "batch_size": 4, "epoch": 0.652, "step": 1630, "tokens_per_device": 10412 }, { "epoch": 0.652, "loss_ce": 0.1963675320148468, "loss_lvr": 0.6528384685516357, "loss_mode_switch": 0.0, "loss_total": 0.2616513967514038, "step": 1630 }, { "batch_size": 1, "epoch": 0.652, "step": 1630, "tokens_per_device": 4908 }, { "epoch": 0.652, "loss_ce": 0.027198312804102898, "loss_lvr": 0.29674476385116577, "loss_mode_switch": 0.0, "loss_total": 0.056872788816690445, "step": 1630 }, { "batch_size": 4, "epoch": 0.652, "step": 1630, "tokens_per_device": 1444 }, { "epoch": 0.652, "loss_ce": 0.8122515082359314, "loss_lvr": 1.0972394943237305, "loss_mode_switch": 0.0, "loss_total": 0.9219754338264465, "step": 1630 }, { "batch_size": 4, "epoch": 0.652, "step": 1630, "tokens_per_device": 2640 }, { "epoch": 0.652, "loss_ce": 0.22953318059444427, "loss_lvr": 0.8531646132469177, "loss_mode_switch": 0.0, "loss_total": 0.3148496448993683, "step": 1630 }, { "batch_size": 4, "epoch": 0.652, "step": 1630, "tokens_per_device": 4356 }, { "epoch": 0.652, "loss_ce": 0.017153941094875336, "loss_lvr": 0.6561911702156067, "loss_mode_switch": 0.0, "loss_total": 0.08277305960655212, "step": 1630 }, { "epoch": 0.6524, "grad_norm": 1.1813013553619385, "learning_rate": 2.8476881235015126e-06, "loss": 0.2604, "step": 1631 }, { "batch_size": 1, "epoch": 0.6524, "step": 1631, "tokens_per_device": 4989 }, { "epoch": 0.6524, "loss_ce": 0.6538950800895691, "loss_lvr": 0.4051536023616791, "loss_mode_switch": 0.0, "loss_total": 0.6944104433059692, "step": 1631 }, { "batch_size": 4, "epoch": 0.6524, "step": 1631, "tokens_per_device": 2648 }, { "epoch": 0.6524, "loss_ce": 0.11566361784934998, "loss_lvr": 0.9563805460929871, "loss_mode_switch": 0.0, "loss_total": 0.21130168437957764, "step": 1631 }, { "batch_size": 4, "epoch": 0.6524, "step": 1631, "tokens_per_device": 2428 }, { "epoch": 0.6524, "loss_ce": 0.2732445001602173, "loss_lvr": 0.9847412109375, "loss_mode_switch": 0.0, "loss_total": 0.3717186152935028, "step": 1631 }, { "batch_size": 4, "epoch": 0.6524, "step": 1631, "tokens_per_device": 2684 }, { "epoch": 0.6524, "loss_ce": 0.5581092834472656, "loss_lvr": 0.8621295094490051, "loss_mode_switch": 0.0, "loss_total": 0.6443222165107727, "step": 1631 }, { "batch_size": 4, "epoch": 0.6524, "step": 1631, "tokens_per_device": 4812 }, { "epoch": 0.6524, "loss_ce": 0.5411229133605957, "loss_lvr": 0.7688036561012268, "loss_mode_switch": 0.0, "loss_total": 0.618003249168396, "step": 1631 }, { "batch_size": 4, "epoch": 0.6524, "step": 1631, "tokens_per_device": 2812 }, { "epoch": 0.6524, "loss_ce": 0.5163706541061401, "loss_lvr": 0.7799937129020691, "loss_mode_switch": 0.0, "loss_total": 0.5943700075149536, "step": 1631 }, { "batch_size": 4, "epoch": 0.6524, "step": 1631, "tokens_per_device": 3760 }, { "epoch": 0.6524, "loss_ce": 0.41306257247924805, "loss_lvr": 0.7655628323554993, "loss_mode_switch": 0.0, "loss_total": 0.48961886763572693, "step": 1631 }, { "batch_size": 1, "epoch": 0.6524, "step": 1631, "tokens_per_device": 5251 }, { "epoch": 0.6524, "loss_ce": 0.10712344199419022, "loss_lvr": 0.27081167697906494, "loss_mode_switch": 0.0, "loss_total": 0.13420461118221283, "step": 1631 }, { "epoch": 0.6528, "grad_norm": 1.3417026996612549, "learning_rate": 2.841843274986509e-06, "loss": 0.3213, "step": 1632 }, { "batch_size": 4, "epoch": 0.6528, "step": 1632, "tokens_per_device": 3876 }, { "epoch": 0.6528, "loss_ce": 0.13279478251934052, "loss_lvr": 0.9989349842071533, "loss_mode_switch": 0.0, "loss_total": 0.2326882779598236, "step": 1632 }, { "batch_size": 4, "epoch": 0.6528, "step": 1632, "tokens_per_device": 4332 }, { "epoch": 0.6528, "loss_ce": 0.37803560495376587, "loss_lvr": 0.7024140357971191, "loss_mode_switch": 0.0, "loss_total": 0.44827699661254883, "step": 1632 }, { "batch_size": 4, "epoch": 0.6528, "step": 1632, "tokens_per_device": 4592 }, { "epoch": 0.6528, "loss_ce": 0.01708856411278248, "loss_lvr": 0.8444997668266296, "loss_mode_switch": 0.0, "loss_total": 0.10153853893280029, "step": 1632 }, { "batch_size": 1, "epoch": 0.6528, "step": 1632, "tokens_per_device": 5135 }, { "epoch": 0.6528, "loss_ce": 0.009321040473878384, "loss_lvr": 0.36327478289604187, "loss_mode_switch": 0.0, "loss_total": 0.045648518949747086, "step": 1632 }, { "batch_size": 4, "epoch": 0.6528, "step": 1632, "tokens_per_device": 4380 }, { "epoch": 0.6528, "loss_ce": 0.1489863246679306, "loss_lvr": 0.9241011738777161, "loss_mode_switch": 0.0, "loss_total": 0.2413964420557022, "step": 1632 }, { "batch_size": 4, "epoch": 0.6528, "step": 1632, "tokens_per_device": 4760 }, { "epoch": 0.6528, "loss_ce": 0.19895200431346893, "loss_lvr": 0.8788841366767883, "loss_mode_switch": 0.0, "loss_total": 0.28684040904045105, "step": 1632 }, { "batch_size": 1, "epoch": 0.6528, "step": 1632, "tokens_per_device": 4910 }, { "epoch": 0.6528, "loss_ce": 0.07403826713562012, "loss_lvr": 0.4032147228717804, "loss_mode_switch": 0.0, "loss_total": 0.11435973644256592, "step": 1632 }, { "batch_size": 4, "epoch": 0.6528, "step": 1632, "tokens_per_device": 5284 }, { "epoch": 0.6528, "loss_ce": 0.17820435762405396, "loss_lvr": 0.9843021035194397, "loss_mode_switch": 0.0, "loss_total": 0.2766345739364624, "step": 1632 }, { "epoch": 0.6532, "grad_norm": 1.3974496126174927, "learning_rate": 2.8360020485610164e-06, "loss": 0.2495, "step": 1633 }, { "batch_size": 4, "epoch": 0.6532, "step": 1633, "tokens_per_device": 14132 }, { "epoch": 0.6532, "loss_ce": 0.01174034085124731, "loss_lvr": 0.5471184849739075, "loss_mode_switch": 0.0, "loss_total": 0.06645219027996063, "step": 1633 }, { "batch_size": 1, "epoch": 0.6532, "step": 1633, "tokens_per_device": 5175 }, { "epoch": 0.6532, "loss_ce": 0.06539538502693176, "loss_lvr": 0.2715863883495331, "loss_mode_switch": 0.0, "loss_total": 0.09255402535200119, "step": 1633 }, { "batch_size": 1, "epoch": 0.6532, "step": 1633, "tokens_per_device": 5199 }, { "epoch": 0.6532, "loss_ce": 0.530746340751648, "loss_lvr": 0.25951457023620605, "loss_mode_switch": 0.0, "loss_total": 0.5566977858543396, "step": 1633 }, { "batch_size": 1, "epoch": 0.6532, "step": 1633, "tokens_per_device": 5161 }, { "epoch": 0.6532, "loss_ce": 0.0009790770709514618, "loss_lvr": 0.353015273809433, "loss_mode_switch": 0.0, "loss_total": 0.03628060594201088, "step": 1633 }, { "batch_size": 4, "epoch": 0.6532, "step": 1633, "tokens_per_device": 5156 }, { "epoch": 0.6532, "loss_ce": 0.17026321589946747, "loss_lvr": 0.6953720450401306, "loss_mode_switch": 0.0, "loss_total": 0.23980042338371277, "step": 1633 }, { "batch_size": 1, "epoch": 0.6532, "step": 1633, "tokens_per_device": 4868 }, { "epoch": 0.6532, "loss_ce": 0.002292454708367586, "loss_lvr": 0.24501508474349976, "loss_mode_switch": 0.0, "loss_total": 0.026793962344527245, "step": 1633 }, { "batch_size": 4, "epoch": 0.6532, "step": 1633, "tokens_per_device": 6372 }, { "epoch": 0.6532, "loss_ce": 0.1171293556690216, "loss_lvr": 0.6414116621017456, "loss_mode_switch": 0.0, "loss_total": 0.1812705248594284, "step": 1633 }, { "batch_size": 4, "epoch": 0.6532, "step": 1633, "tokens_per_device": 6004 }, { "epoch": 0.6532, "loss_ce": 0.5980030298233032, "loss_lvr": 0.7825580835342407, "loss_mode_switch": 0.0, "loss_total": 0.6762588620185852, "step": 1633 }, { "epoch": 0.6536, "grad_norm": 1.3617802858352661, "learning_rate": 2.8301644540285137e-06, "loss": 0.276, "step": 1634 }, { "batch_size": 4, "epoch": 0.6536, "step": 1634, "tokens_per_device": 2788 }, { "epoch": 0.6536, "loss_ce": 0.15523476898670197, "loss_lvr": 1.0173176527023315, "loss_mode_switch": 0.0, "loss_total": 0.2569665312767029, "step": 1634 }, { "batch_size": 4, "epoch": 0.6536, "step": 1634, "tokens_per_device": 10736 }, { "epoch": 0.6536, "loss_ce": 0.006420566234737635, "loss_lvr": 1.2947368621826172, "loss_mode_switch": 0.0, "loss_total": 0.13589425384998322, "step": 1634 }, { "batch_size": 1, "epoch": 0.6536, "step": 1634, "tokens_per_device": 4892 }, { "epoch": 0.6536, "loss_ce": 0.05736219882965088, "loss_lvr": 0.5780609846115112, "loss_mode_switch": 0.0, "loss_total": 0.11516830325126648, "step": 1634 }, { "batch_size": 4, "epoch": 0.6536, "step": 1634, "tokens_per_device": 9864 }, { "epoch": 0.6536, "loss_ce": 0.09606153517961502, "loss_lvr": 0.5568369030952454, "loss_mode_switch": 0.0, "loss_total": 0.15174522995948792, "step": 1634 }, { "batch_size": 1, "epoch": 0.6536, "step": 1634, "tokens_per_device": 5104 }, { "epoch": 0.6536, "loss_ce": 0.07697267830371857, "loss_lvr": 0.3513563871383667, "loss_mode_switch": 0.0, "loss_total": 0.11210831999778748, "step": 1634 }, { "batch_size": 4, "epoch": 0.6536, "step": 1634, "tokens_per_device": 4372 }, { "epoch": 0.6536, "loss_ce": 0.14026358723640442, "loss_lvr": 0.8340734243392944, "loss_mode_switch": 0.0, "loss_total": 0.22367092967033386, "step": 1634 }, { "batch_size": 4, "epoch": 0.6536, "step": 1634, "tokens_per_device": 1356 }, { "epoch": 0.6536, "loss_ce": 0.33442434668540955, "loss_lvr": 1.0187076330184937, "loss_mode_switch": 0.0, "loss_total": 0.43629512190818787, "step": 1634 }, { "batch_size": 4, "epoch": 0.6536, "step": 1634, "tokens_per_device": 4648 }, { "epoch": 0.6536, "loss_ce": 0.2943071126937866, "loss_lvr": 0.6298618316650391, "loss_mode_switch": 0.0, "loss_total": 0.3572933077812195, "step": 1634 }, { "epoch": 0.654, "grad_norm": 1.2959572076797485, "learning_rate": 2.8243305011863843e-06, "loss": 0.2577, "step": 1635 }, { "batch_size": 4, "epoch": 0.654, "step": 1635, "tokens_per_device": 3864 }, { "epoch": 0.654, "loss_ce": 0.2943130135536194, "loss_lvr": 0.8187612295150757, "loss_mode_switch": 0.0, "loss_total": 0.37618914246559143, "step": 1635 }, { "batch_size": 4, "epoch": 0.654, "step": 1635, "tokens_per_device": 2612 }, { "epoch": 0.654, "loss_ce": 0.15286359190940857, "loss_lvr": 0.8748475313186646, "loss_mode_switch": 0.0, "loss_total": 0.24034833908081055, "step": 1635 }, { "batch_size": 4, "epoch": 0.654, "step": 1635, "tokens_per_device": 6932 }, { "epoch": 0.654, "loss_ce": 0.031700052320957184, "loss_lvr": 0.9874575734138489, "loss_mode_switch": 0.0, "loss_total": 0.13044580817222595, "step": 1635 }, { "batch_size": 1, "epoch": 0.654, "step": 1635, "tokens_per_device": 4866 }, { "epoch": 0.654, "loss_ce": 0.2527168095111847, "loss_lvr": 0.32032662630081177, "loss_mode_switch": 0.0, "loss_total": 0.28474947810173035, "step": 1635 }, { "batch_size": 4, "epoch": 0.654, "step": 1635, "tokens_per_device": 4540 }, { "epoch": 0.654, "loss_ce": 0.1629629284143448, "loss_lvr": 0.9466544985771179, "loss_mode_switch": 0.0, "loss_total": 0.2576283812522888, "step": 1635 }, { "batch_size": 4, "epoch": 0.654, "step": 1635, "tokens_per_device": 3760 }, { "epoch": 0.654, "loss_ce": 0.23590491712093353, "loss_lvr": 0.48752209544181824, "loss_mode_switch": 0.0, "loss_total": 0.2846571207046509, "step": 1635 }, { "batch_size": 1, "epoch": 0.654, "step": 1635, "tokens_per_device": 5105 }, { "epoch": 0.654, "loss_ce": 0.06040727347135544, "loss_lvr": 0.4173344373703003, "loss_mode_switch": 0.0, "loss_total": 0.10214071720838547, "step": 1635 }, { "batch_size": 4, "epoch": 0.654, "step": 1635, "tokens_per_device": 1520 }, { "epoch": 0.654, "loss_ce": 0.5319980978965759, "loss_lvr": 1.3211710453033447, "loss_mode_switch": 0.0, "loss_total": 0.6641151905059814, "step": 1635 }, { "epoch": 0.6544, "grad_norm": 1.301452398300171, "learning_rate": 2.818500199825902e-06, "loss": 0.2911, "step": 1636 }, { "batch_size": 1, "epoch": 0.6544, "step": 1636, "tokens_per_device": 4890 }, { "epoch": 0.6544, "loss_ce": 0.08556725829839706, "loss_lvr": 0.7004058361053467, "loss_mode_switch": 0.0, "loss_total": 0.15560784935951233, "step": 1636 }, { "batch_size": 4, "epoch": 0.6544, "step": 1636, "tokens_per_device": 3008 }, { "epoch": 0.6544, "loss_ce": 0.24537675082683563, "loss_lvr": 0.6915385127067566, "loss_mode_switch": 0.0, "loss_total": 0.314530611038208, "step": 1636 }, { "batch_size": 4, "epoch": 0.6544, "step": 1636, "tokens_per_device": 5988 }, { "epoch": 0.6544, "loss_ce": 0.20541012287139893, "loss_lvr": 0.8009451627731323, "loss_mode_switch": 0.0, "loss_total": 0.28550463914871216, "step": 1636 }, { "batch_size": 4, "epoch": 0.6544, "step": 1636, "tokens_per_device": 5444 }, { "epoch": 0.6544, "loss_ce": 1.0253167152404785, "loss_lvr": 0.7828425168991089, "loss_mode_switch": 0.0, "loss_total": 1.1036009788513184, "step": 1636 }, { "batch_size": 4, "epoch": 0.6544, "step": 1636, "tokens_per_device": 4680 }, { "epoch": 0.6544, "loss_ce": 0.05582813546061516, "loss_lvr": 0.8717520236968994, "loss_mode_switch": 0.0, "loss_total": 0.14300334453582764, "step": 1636 }, { "batch_size": 1, "epoch": 0.6544, "step": 1636, "tokens_per_device": 4885 }, { "epoch": 0.6544, "loss_ce": 0.135409414768219, "loss_lvr": 0.18143419921398163, "loss_mode_switch": 0.0, "loss_total": 0.1535528302192688, "step": 1636 }, { "batch_size": 4, "epoch": 0.6544, "step": 1636, "tokens_per_device": 2540 }, { "epoch": 0.6544, "loss_ce": 0.5201817154884338, "loss_lvr": 1.310591220855713, "loss_mode_switch": 0.0, "loss_total": 0.6512408256530762, "step": 1636 }, { "batch_size": 4, "epoch": 0.6544, "step": 1636, "tokens_per_device": 11024 }, { "epoch": 0.6544, "loss_ce": 0.4637398421764374, "loss_lvr": 0.4800255596637726, "loss_mode_switch": 0.0, "loss_total": 0.5117424130439758, "step": 1636 }, { "epoch": 0.6548, "grad_norm": 1.2072333097457886, "learning_rate": 2.812673559732211e-06, "loss": 0.2902, "step": 1637 }, { "batch_size": 4, "epoch": 0.6548, "step": 1637, "tokens_per_device": 3868 }, { "epoch": 0.6548, "loss_ce": 0.3351214826107025, "loss_lvr": 0.6018874645233154, "loss_mode_switch": 0.0, "loss_total": 0.3953102231025696, "step": 1637 }, { "batch_size": 4, "epoch": 0.6548, "step": 1637, "tokens_per_device": 4260 }, { "epoch": 0.6548, "loss_ce": 0.30744484066963196, "loss_lvr": 0.5906704068183899, "loss_mode_switch": 0.0, "loss_total": 0.36651188135147095, "step": 1637 }, { "batch_size": 4, "epoch": 0.6548, "step": 1637, "tokens_per_device": 4208 }, { "epoch": 0.6548, "loss_ce": 0.1211899071931839, "loss_lvr": 0.951970100402832, "loss_mode_switch": 0.0, "loss_total": 0.21638691425323486, "step": 1637 }, { "batch_size": 4, "epoch": 0.6548, "step": 1637, "tokens_per_device": 3740 }, { "epoch": 0.6548, "loss_ce": 0.0008386993431486189, "loss_lvr": 0.7418043613433838, "loss_mode_switch": 0.0, "loss_total": 0.07501913607120514, "step": 1637 }, { "batch_size": 4, "epoch": 0.6548, "step": 1637, "tokens_per_device": 4280 }, { "epoch": 0.6548, "loss_ce": 0.4562641978263855, "loss_lvr": 0.7277971506118774, "loss_mode_switch": 0.0, "loss_total": 0.5290439128875732, "step": 1637 }, { "batch_size": 4, "epoch": 0.6548, "step": 1637, "tokens_per_device": 14748 }, { "epoch": 0.6548, "loss_ce": 0.49845126271247864, "loss_lvr": 0.7853884100914001, "loss_mode_switch": 0.0, "loss_total": 0.5769901275634766, "step": 1637 }, { "batch_size": 4, "epoch": 0.6548, "step": 1637, "tokens_per_device": 4716 }, { "epoch": 0.6548, "loss_ce": 0.21307319402694702, "loss_lvr": 0.8186793327331543, "loss_mode_switch": 0.0, "loss_total": 0.29494112730026245, "step": 1637 }, { "batch_size": 4, "epoch": 0.6548, "step": 1637, "tokens_per_device": 4424 }, { "epoch": 0.6548, "loss_ce": 0.3074944019317627, "loss_lvr": 0.7709941864013672, "loss_mode_switch": 0.0, "loss_total": 0.38459381461143494, "step": 1637 }, { "epoch": 0.6552, "grad_norm": 1.383615493774414, "learning_rate": 2.806850590684309e-06, "loss": 0.3171, "step": 1638 }, { "batch_size": 4, "epoch": 0.6552, "step": 1638, "tokens_per_device": 4700 }, { "epoch": 0.6552, "loss_ce": 0.78656005859375, "loss_lvr": 0.7660253643989563, "loss_mode_switch": 0.0, "loss_total": 0.8631625771522522, "step": 1638 }, { "batch_size": 4, "epoch": 0.6552, "step": 1638, "tokens_per_device": 5028 }, { "epoch": 0.6552, "loss_ce": 0.6009004712104797, "loss_lvr": 0.5677400827407837, "loss_mode_switch": 0.0, "loss_total": 0.6576744914054871, "step": 1638 }, { "batch_size": 1, "epoch": 0.6552, "step": 1638, "tokens_per_device": 5187 }, { "epoch": 0.6552, "loss_ce": 0.00924930814653635, "loss_lvr": 0.3419494032859802, "loss_mode_switch": 0.0, "loss_total": 0.04344424977898598, "step": 1638 }, { "batch_size": 1, "epoch": 0.6552, "step": 1638, "tokens_per_device": 5100 }, { "epoch": 0.6552, "loss_ce": 0.004599888343364, "loss_lvr": 0.330802321434021, "loss_mode_switch": 0.0, "loss_total": 0.03768011927604675, "step": 1638 }, { "batch_size": 4, "epoch": 0.6552, "step": 1638, "tokens_per_device": 1676 }, { "epoch": 0.6552, "loss_ce": 0.24958659708499908, "loss_lvr": 0.8747005462646484, "loss_mode_switch": 0.0, "loss_total": 0.33705663681030273, "step": 1638 }, { "batch_size": 1, "epoch": 0.6552, "step": 1638, "tokens_per_device": 6255 }, { "epoch": 0.6552, "loss_ce": 0.008299590088427067, "loss_lvr": 0.2138926088809967, "loss_mode_switch": 0.0, "loss_total": 0.029688850045204163, "step": 1638 }, { "batch_size": 4, "epoch": 0.6552, "step": 1638, "tokens_per_device": 1616 }, { "epoch": 0.6552, "loss_ce": 0.14104671776294708, "loss_lvr": 0.9912333488464355, "loss_mode_switch": 0.0, "loss_total": 0.24017006158828735, "step": 1638 }, { "batch_size": 4, "epoch": 0.6552, "step": 1638, "tokens_per_device": 3808 }, { "epoch": 0.6552, "loss_ce": 0.7122543454170227, "loss_lvr": 0.9311248660087585, "loss_mode_switch": 0.0, "loss_total": 0.8053668141365051, "step": 1638 }, { "epoch": 0.6556, "grad_norm": 1.2297258377075195, "learning_rate": 2.801031302455032e-06, "loss": 0.3074, "step": 1639 }, { "batch_size": 4, "epoch": 0.6556, "step": 1639, "tokens_per_device": 1680 }, { "epoch": 0.6556, "loss_ce": 0.17870406806468964, "loss_lvr": 0.7981322407722473, "loss_mode_switch": 0.0, "loss_total": 0.2585172951221466, "step": 1639 }, { "batch_size": 1, "epoch": 0.6556, "step": 1639, "tokens_per_device": 5107 }, { "epoch": 0.6556, "loss_ce": 0.005506409332156181, "loss_lvr": 0.29382309317588806, "loss_mode_switch": 0.0, "loss_total": 0.03488871827721596, "step": 1639 }, { "batch_size": 4, "epoch": 0.6556, "step": 1639, "tokens_per_device": 3868 }, { "epoch": 0.6556, "loss_ce": 0.3038999140262604, "loss_lvr": 1.0848770141601562, "loss_mode_switch": 0.0, "loss_total": 0.4123876094818115, "step": 1639 }, { "batch_size": 4, "epoch": 0.6556, "step": 1639, "tokens_per_device": 2648 }, { "epoch": 0.6556, "loss_ce": 0.2898233234882355, "loss_lvr": 0.713668942451477, "loss_mode_switch": 0.0, "loss_total": 0.36119022965431213, "step": 1639 }, { "batch_size": 4, "epoch": 0.6556, "step": 1639, "tokens_per_device": 4740 }, { "epoch": 0.6556, "loss_ce": 0.07393965870141983, "loss_lvr": 0.7426666617393494, "loss_mode_switch": 0.0, "loss_total": 0.14820632338523865, "step": 1639 }, { "batch_size": 4, "epoch": 0.6556, "step": 1639, "tokens_per_device": 4320 }, { "epoch": 0.6556, "loss_ce": 0.26114434003829956, "loss_lvr": 0.630212128162384, "loss_mode_switch": 0.0, "loss_total": 0.32416555285453796, "step": 1639 }, { "batch_size": 1, "epoch": 0.6556, "step": 1639, "tokens_per_device": 6643 }, { "epoch": 0.6556, "loss_ce": 0.0005709616816602647, "loss_lvr": 0.34107181429862976, "loss_mode_switch": 0.0, "loss_total": 0.0346781425178051, "step": 1639 }, { "batch_size": 4, "epoch": 0.6556, "step": 1639, "tokens_per_device": 5112 }, { "epoch": 0.6556, "loss_ce": 0.22456158697605133, "loss_lvr": 0.7675204277038574, "loss_mode_switch": 0.0, "loss_total": 0.3013136386871338, "step": 1639 }, { "epoch": 0.656, "grad_norm": 1.323964238166809, "learning_rate": 2.7952157048110406e-06, "loss": 0.2583, "step": 1640 }, { "batch_size": 4, "epoch": 0.656, "step": 1640, "tokens_per_device": 4592 }, { "epoch": 0.656, "loss_ce": 0.22680789232254028, "loss_lvr": 0.8359348773956299, "loss_mode_switch": 0.0, "loss_total": 0.31040138006210327, "step": 1640 }, { "batch_size": 4, "epoch": 0.656, "step": 1640, "tokens_per_device": 3896 }, { "epoch": 0.656, "loss_ce": 0.0733482763171196, "loss_lvr": 0.921278715133667, "loss_mode_switch": 0.0, "loss_total": 0.16547614336013794, "step": 1640 }, { "batch_size": 4, "epoch": 0.656, "step": 1640, "tokens_per_device": 3588 }, { "epoch": 0.656, "loss_ce": 0.09071233123540878, "loss_lvr": 0.8338819742202759, "loss_mode_switch": 0.0, "loss_total": 0.17410053312778473, "step": 1640 }, { "batch_size": 4, "epoch": 0.656, "step": 1640, "tokens_per_device": 10592 }, { "epoch": 0.656, "loss_ce": 0.27031341195106506, "loss_lvr": 0.9142740964889526, "loss_mode_switch": 0.0, "loss_total": 0.3617408275604248, "step": 1640 }, { "batch_size": 4, "epoch": 0.656, "step": 1640, "tokens_per_device": 1960 }, { "epoch": 0.656, "loss_ce": 0.134248748421669, "loss_lvr": 0.8204613924026489, "loss_mode_switch": 0.0, "loss_total": 0.21629488468170166, "step": 1640 }, { "batch_size": 1, "epoch": 0.656, "step": 1640, "tokens_per_device": 4828 }, { "epoch": 0.656, "loss_ce": 0.03285248577594757, "loss_lvr": 0.47796276211738586, "loss_mode_switch": 0.0, "loss_total": 0.0806487649679184, "step": 1640 }, { "batch_size": 4, "epoch": 0.656, "step": 1640, "tokens_per_device": 2640 }, { "epoch": 0.656, "loss_ce": 0.4252488315105438, "loss_lvr": 0.7926915287971497, "loss_mode_switch": 0.0, "loss_total": 0.5045179724693298, "step": 1640 }, { "batch_size": 4, "epoch": 0.656, "step": 1640, "tokens_per_device": 9420 }, { "epoch": 0.656, "loss_ce": 0.17563696205615997, "loss_lvr": 0.8155292272567749, "loss_mode_switch": 0.0, "loss_total": 0.25718986988067627, "step": 1640 }, { "epoch": 0.6564, "grad_norm": 1.3144766092300415, "learning_rate": 2.7894038075128038e-06, "loss": 0.2676, "step": 1641 }, { "batch_size": 1, "epoch": 0.6564, "step": 1641, "tokens_per_device": 5546 }, { "epoch": 0.6564, "loss_ce": 0.00038053709431551397, "loss_lvr": 0.37810763716697693, "loss_mode_switch": 0.0, "loss_total": 0.0381913036108017, "step": 1641 }, { "batch_size": 4, "epoch": 0.6564, "step": 1641, "tokens_per_device": 5364 }, { "epoch": 0.6564, "loss_ce": 0.05779043957591057, "loss_lvr": 0.7353283762931824, "loss_mode_switch": 0.0, "loss_total": 0.13132327795028687, "step": 1641 }, { "batch_size": 4, "epoch": 0.6564, "step": 1641, "tokens_per_device": 4252 }, { "epoch": 0.6564, "loss_ce": 0.3204990327358246, "loss_lvr": 0.8896278142929077, "loss_mode_switch": 0.0, "loss_total": 0.4094618260860443, "step": 1641 }, { "batch_size": 1, "epoch": 0.6564, "step": 1641, "tokens_per_device": 4951 }, { "epoch": 0.6564, "loss_ce": 0.01710783690214157, "loss_lvr": 0.48159462213516235, "loss_mode_switch": 0.0, "loss_total": 0.06526729464530945, "step": 1641 }, { "batch_size": 4, "epoch": 0.6564, "step": 1641, "tokens_per_device": 14464 }, { "epoch": 0.6564, "loss_ce": 0.10212244093418121, "loss_lvr": 0.383175253868103, "loss_mode_switch": 0.0, "loss_total": 0.140439972281456, "step": 1641 }, { "batch_size": 4, "epoch": 0.6564, "step": 1641, "tokens_per_device": 4280 }, { "epoch": 0.6564, "loss_ce": 0.43900376558303833, "loss_lvr": 0.836394727230072, "loss_mode_switch": 0.0, "loss_total": 0.5226432085037231, "step": 1641 }, { "batch_size": 1, "epoch": 0.6564, "step": 1641, "tokens_per_device": 4958 }, { "epoch": 0.6564, "loss_ce": 0.38690951466560364, "loss_lvr": 0.2846134305000305, "loss_mode_switch": 0.0, "loss_total": 0.4153708517551422, "step": 1641 }, { "batch_size": 4, "epoch": 0.6564, "step": 1641, "tokens_per_device": 1288 }, { "epoch": 0.6564, "loss_ce": 0.3844541609287262, "loss_lvr": 1.24323570728302, "loss_mode_switch": 0.0, "loss_total": 0.5087777376174927, "step": 1641 }, { "epoch": 0.6568, "grad_norm": 1.2930039167404175, "learning_rate": 2.7835956203145754e-06, "loss": 0.2638, "step": 1642 }, { "batch_size": 4, "epoch": 0.6568, "step": 1642, "tokens_per_device": 6292 }, { "epoch": 0.6568, "loss_ce": 0.530096709728241, "loss_lvr": 1.5199531316757202, "loss_mode_switch": 0.0, "loss_total": 0.682092010974884, "step": 1642 }, { "batch_size": 4, "epoch": 0.6568, "step": 1642, "tokens_per_device": 2624 }, { "epoch": 0.6568, "loss_ce": 0.3574220836162567, "loss_lvr": 0.8119237422943115, "loss_mode_switch": 0.0, "loss_total": 0.43861445784568787, "step": 1642 }, { "batch_size": 4, "epoch": 0.6568, "step": 1642, "tokens_per_device": 3912 }, { "epoch": 0.6568, "loss_ce": 0.0006251182057894766, "loss_lvr": 0.5509505867958069, "loss_mode_switch": 0.0, "loss_total": 0.05572017654776573, "step": 1642 }, { "batch_size": 4, "epoch": 0.6568, "step": 1642, "tokens_per_device": 4224 }, { "epoch": 0.6568, "loss_ce": 0.11144404858350754, "loss_lvr": 0.4137714207172394, "loss_mode_switch": 0.0, "loss_total": 0.15282118320465088, "step": 1642 }, { "batch_size": 4, "epoch": 0.6568, "step": 1642, "tokens_per_device": 2548 }, { "epoch": 0.6568, "loss_ce": 0.28134122490882874, "loss_lvr": 0.92595374584198, "loss_mode_switch": 0.0, "loss_total": 0.37393659353256226, "step": 1642 }, { "batch_size": 4, "epoch": 0.6568, "step": 1642, "tokens_per_device": 15568 }, { "epoch": 0.6568, "loss_ce": 0.22122102975845337, "loss_lvr": 0.49284738302230835, "loss_mode_switch": 0.0, "loss_total": 0.27050575613975525, "step": 1642 }, { "batch_size": 4, "epoch": 0.6568, "step": 1642, "tokens_per_device": 11796 }, { "epoch": 0.6568, "loss_ce": 0.322153776884079, "loss_lvr": 0.8959413766860962, "loss_mode_switch": 0.0, "loss_total": 0.41174793243408203, "step": 1642 }, { "batch_size": 1, "epoch": 0.6568, "step": 1642, "tokens_per_device": 4858 }, { "epoch": 0.6568, "loss_ce": 0.034854013472795486, "loss_lvr": 0.17111040651798248, "loss_mode_switch": 0.0, "loss_total": 0.051965054124593735, "step": 1642 }, { "epoch": 0.6572, "grad_norm": 1.2519869804382324, "learning_rate": 2.777791152964383e-06, "loss": 0.2959, "step": 1643 }, { "batch_size": 4, "epoch": 0.6572, "step": 1643, "tokens_per_device": 1592 }, { "epoch": 0.6572, "loss_ce": 0.1517106145620346, "loss_lvr": 0.9161854386329651, "loss_mode_switch": 0.0, "loss_total": 0.24332916736602783, "step": 1643 }, { "batch_size": 1, "epoch": 0.6572, "step": 1643, "tokens_per_device": 5102 }, { "epoch": 0.6572, "loss_ce": 0.004977753385901451, "loss_lvr": 0.31337660551071167, "loss_mode_switch": 0.0, "loss_total": 0.03631541132926941, "step": 1643 }, { "batch_size": 1, "epoch": 0.6572, "step": 1643, "tokens_per_device": 4841 }, { "epoch": 0.6572, "loss_ce": 0.0016791936941444874, "loss_lvr": 0.7459201812744141, "loss_mode_switch": 0.0, "loss_total": 0.07627121359109879, "step": 1643 }, { "batch_size": 4, "epoch": 0.6572, "step": 1643, "tokens_per_device": 4240 }, { "epoch": 0.6572, "loss_ce": 0.022807473316788673, "loss_lvr": 0.9731190800666809, "loss_mode_switch": 0.0, "loss_total": 0.1201193779706955, "step": 1643 }, { "batch_size": 1, "epoch": 0.6572, "step": 1643, "tokens_per_device": 4952 }, { "epoch": 0.6572, "loss_ce": 0.2086389660835266, "loss_lvr": 0.19307799637317657, "loss_mode_switch": 0.0, "loss_total": 0.22794675827026367, "step": 1643 }, { "batch_size": 4, "epoch": 0.6572, "step": 1643, "tokens_per_device": 2564 }, { "epoch": 0.6572, "loss_ce": 0.5295429825782776, "loss_lvr": 1.2603408098220825, "loss_mode_switch": 0.0, "loss_total": 0.6555770635604858, "step": 1643 }, { "batch_size": 4, "epoch": 0.6572, "step": 1643, "tokens_per_device": 4996 }, { "epoch": 0.6572, "loss_ce": 0.0019875059369951487, "loss_lvr": 0.5622196793556213, "loss_mode_switch": 0.0, "loss_total": 0.05820947512984276, "step": 1643 }, { "batch_size": 4, "epoch": 0.6572, "step": 1643, "tokens_per_device": 8284 }, { "epoch": 0.6572, "loss_ce": 0.08486814051866531, "loss_lvr": 0.7452068328857422, "loss_mode_switch": 0.0, "loss_total": 0.15938882529735565, "step": 1643 }, { "epoch": 0.6576, "grad_norm": 1.243377923965454, "learning_rate": 2.7719904152040112e-06, "loss": 0.2924, "step": 1644 }, { "batch_size": 1, "epoch": 0.6576, "step": 1644, "tokens_per_device": 4742 }, { "epoch": 0.6576, "loss_ce": 0.00531802698969841, "loss_lvr": 0.40091100335121155, "loss_mode_switch": 0.0, "loss_total": 0.045409128069877625, "step": 1644 }, { "batch_size": 4, "epoch": 0.6576, "step": 1644, "tokens_per_device": 3940 }, { "epoch": 0.6576, "loss_ce": 0.8304216861724854, "loss_lvr": 0.8397322297096252, "loss_mode_switch": 0.0, "loss_total": 0.9143949151039124, "step": 1644 }, { "batch_size": 4, "epoch": 0.6576, "step": 1644, "tokens_per_device": 3808 }, { "epoch": 0.6576, "loss_ce": 0.07895846664905548, "loss_lvr": 0.8276264071464539, "loss_mode_switch": 0.0, "loss_total": 0.1617211103439331, "step": 1644 }, { "batch_size": 4, "epoch": 0.6576, "step": 1644, "tokens_per_device": 8040 }, { "epoch": 0.6576, "loss_ce": 0.25215715169906616, "loss_lvr": 0.8624605536460876, "loss_mode_switch": 0.0, "loss_total": 0.33840322494506836, "step": 1644 }, { "batch_size": 4, "epoch": 0.6576, "step": 1644, "tokens_per_device": 3020 }, { "epoch": 0.6576, "loss_ce": 0.3564477860927582, "loss_lvr": 0.6468557119369507, "loss_mode_switch": 0.0, "loss_total": 0.4211333692073822, "step": 1644 }, { "batch_size": 4, "epoch": 0.6576, "step": 1644, "tokens_per_device": 8204 }, { "epoch": 0.6576, "loss_ce": 0.32109618186950684, "loss_lvr": 0.35189250111579895, "loss_mode_switch": 0.0, "loss_total": 0.35628542304039, "step": 1644 }, { "batch_size": 4, "epoch": 0.6576, "step": 1644, "tokens_per_device": 10464 }, { "epoch": 0.6576, "loss_ce": 0.4026867747306824, "loss_lvr": 0.8961853981018066, "loss_mode_switch": 0.0, "loss_total": 0.49230530858039856, "step": 1644 }, { "batch_size": 4, "epoch": 0.6576, "step": 1644, "tokens_per_device": 5688 }, { "epoch": 0.6576, "loss_ce": 0.13774967193603516, "loss_lvr": 0.7282170057296753, "loss_mode_switch": 0.0, "loss_total": 0.21057137846946716, "step": 1644 }, { "epoch": 0.658, "grad_norm": 1.5238221883773804, "learning_rate": 2.7661934167689887e-06, "loss": 0.3307, "step": 1645 }, { "batch_size": 4, "epoch": 0.658, "step": 1645, "tokens_per_device": 4564 }, { "epoch": 0.658, "loss_ce": 0.08367882668972015, "loss_lvr": 0.7071024775505066, "loss_mode_switch": 0.0, "loss_total": 0.15438908338546753, "step": 1645 }, { "batch_size": 4, "epoch": 0.658, "step": 1645, "tokens_per_device": 8468 }, { "epoch": 0.658, "loss_ce": 0.0017347057582810521, "loss_lvr": 0.4326651394367218, "loss_mode_switch": 0.0, "loss_total": 0.04500122368335724, "step": 1645 }, { "batch_size": 1, "epoch": 0.658, "step": 1645, "tokens_per_device": 4684 }, { "epoch": 0.658, "loss_ce": 0.019795384258031845, "loss_lvr": 0.2357521802186966, "loss_mode_switch": 0.0, "loss_total": 0.043370604515075684, "step": 1645 }, { "batch_size": 4, "epoch": 0.658, "step": 1645, "tokens_per_device": 2344 }, { "epoch": 0.658, "loss_ce": 0.13933579623699188, "loss_lvr": 0.8280179500579834, "loss_mode_switch": 0.0, "loss_total": 0.22213760018348694, "step": 1645 }, { "batch_size": 1, "epoch": 0.658, "step": 1645, "tokens_per_device": 4748 }, { "epoch": 0.658, "loss_ce": 0.04862625151872635, "loss_lvr": 0.42986807227134705, "loss_mode_switch": 0.0, "loss_total": 0.0916130542755127, "step": 1645 }, { "batch_size": 1, "epoch": 0.658, "step": 1645, "tokens_per_device": 5107 }, { "epoch": 0.658, "loss_ce": 0.0723399817943573, "loss_lvr": 0.27580949664115906, "loss_mode_switch": 0.0, "loss_total": 0.09992092847824097, "step": 1645 }, { "batch_size": 4, "epoch": 0.658, "step": 1645, "tokens_per_device": 2708 }, { "epoch": 0.658, "loss_ce": 0.3597792387008667, "loss_lvr": 0.5663637518882751, "loss_mode_switch": 0.0, "loss_total": 0.41641560196876526, "step": 1645 }, { "batch_size": 4, "epoch": 0.658, "step": 1645, "tokens_per_device": 5860 }, { "epoch": 0.658, "loss_ce": 0.1881505846977234, "loss_lvr": 1.0301367044448853, "loss_mode_switch": 0.0, "loss_total": 0.29116424918174744, "step": 1645 }, { "epoch": 0.6584, "grad_norm": 1.217589020729065, "learning_rate": 2.760400167388566e-06, "loss": 0.2761, "step": 1646 }, { "batch_size": 4, "epoch": 0.6584, "step": 1646, "tokens_per_device": 4076 }, { "epoch": 0.6584, "loss_ce": 0.01490236259996891, "loss_lvr": 0.7841355800628662, "loss_mode_switch": 0.0, "loss_total": 0.09331592172384262, "step": 1646 }, { "batch_size": 1, "epoch": 0.6584, "step": 1646, "tokens_per_device": 4624 }, { "epoch": 0.6584, "loss_ce": 0.018179873004555702, "loss_lvr": 0.5385485291481018, "loss_mode_switch": 0.0, "loss_total": 0.07203472405672073, "step": 1646 }, { "batch_size": 1, "epoch": 0.6584, "step": 1646, "tokens_per_device": 5157 }, { "epoch": 0.6584, "loss_ce": 0.0006091602263040841, "loss_lvr": 0.4202998876571655, "loss_mode_switch": 0.0, "loss_total": 0.042639147490262985, "step": 1646 }, { "batch_size": 4, "epoch": 0.6584, "step": 1646, "tokens_per_device": 1612 }, { "epoch": 0.6584, "loss_ce": 0.13736897706985474, "loss_lvr": 0.9804226160049438, "loss_mode_switch": 0.0, "loss_total": 0.23541124165058136, "step": 1646 }, { "batch_size": 4, "epoch": 0.6584, "step": 1646, "tokens_per_device": 1792 }, { "epoch": 0.6584, "loss_ce": 0.09528417885303497, "loss_lvr": 1.0237029790878296, "loss_mode_switch": 0.0, "loss_total": 0.19765448570251465, "step": 1646 }, { "batch_size": 4, "epoch": 0.6584, "step": 1646, "tokens_per_device": 2628 }, { "epoch": 0.6584, "loss_ce": 0.267648845911026, "loss_lvr": 0.7508316040039062, "loss_mode_switch": 0.0, "loss_total": 0.3427320122718811, "step": 1646 }, { "batch_size": 4, "epoch": 0.6584, "step": 1646, "tokens_per_device": 4720 }, { "epoch": 0.6584, "loss_ce": 0.09117648750543594, "loss_lvr": 0.7024968266487122, "loss_mode_switch": 0.0, "loss_total": 0.16142617166042328, "step": 1646 }, { "batch_size": 4, "epoch": 0.6584, "step": 1646, "tokens_per_device": 2568 }, { "epoch": 0.6584, "loss_ce": 0.526948094367981, "loss_lvr": 0.9316253066062927, "loss_mode_switch": 0.0, "loss_total": 0.6201106309890747, "step": 1646 }, { "epoch": 0.6588, "grad_norm": 1.3057001829147339, "learning_rate": 2.754610676785702e-06, "loss": 0.2862, "step": 1647 }, { "batch_size": 1, "epoch": 0.6588, "step": 1647, "tokens_per_device": 5012 }, { "epoch": 0.6588, "loss_ce": 0.7142851948738098, "loss_lvr": 0.3460804522037506, "loss_mode_switch": 0.0, "loss_total": 0.7488932609558105, "step": 1647 }, { "batch_size": 1, "epoch": 0.6588, "step": 1647, "tokens_per_device": 4899 }, { "epoch": 0.6588, "loss_ce": 0.00023269388475455344, "loss_lvr": 0.26572486758232117, "loss_mode_switch": 0.0, "loss_total": 0.026805181056261063, "step": 1647 }, { "batch_size": 4, "epoch": 0.6588, "step": 1647, "tokens_per_device": 2596 }, { "epoch": 0.6588, "loss_ce": 0.6990939378738403, "loss_lvr": 1.14548659324646, "loss_mode_switch": 0.0, "loss_total": 0.8136426210403442, "step": 1647 }, { "batch_size": 4, "epoch": 0.6588, "step": 1647, "tokens_per_device": 4312 }, { "epoch": 0.6588, "loss_ce": 0.8172523975372314, "loss_lvr": 0.6610003113746643, "loss_mode_switch": 0.0, "loss_total": 0.8833523988723755, "step": 1647 }, { "batch_size": 4, "epoch": 0.6588, "step": 1647, "tokens_per_device": 1644 }, { "epoch": 0.6588, "loss_ce": 0.5629427433013916, "loss_lvr": 0.8673050999641418, "loss_mode_switch": 0.0, "loss_total": 0.6496732234954834, "step": 1647 }, { "batch_size": 4, "epoch": 0.6588, "step": 1647, "tokens_per_device": 3768 }, { "epoch": 0.6588, "loss_ce": 0.42278537154197693, "loss_lvr": 0.8793638944625854, "loss_mode_switch": 0.0, "loss_total": 0.510721743106842, "step": 1647 }, { "batch_size": 4, "epoch": 0.6588, "step": 1647, "tokens_per_device": 4488 }, { "epoch": 0.6588, "loss_ce": 0.24718748033046722, "loss_lvr": 0.8237699866294861, "loss_mode_switch": 0.0, "loss_total": 0.32956448197364807, "step": 1647 }, { "batch_size": 1, "epoch": 0.6588, "step": 1647, "tokens_per_device": 4775 }, { "epoch": 0.6588, "loss_ce": 0.0003067263460252434, "loss_lvr": 0.3516141474246979, "loss_mode_switch": 0.0, "loss_total": 0.03546814247965813, "step": 1647 }, { "epoch": 0.6592, "grad_norm": 1.3710044622421265, "learning_rate": 2.748824954677042e-06, "loss": 0.3183, "step": 1648 }, { "batch_size": 4, "epoch": 0.6592, "step": 1648, "tokens_per_device": 4260 }, { "epoch": 0.6592, "loss_ce": 0.14621956646442413, "loss_lvr": 0.9165267944335938, "loss_mode_switch": 0.0, "loss_total": 0.23787224292755127, "step": 1648 }, { "batch_size": 4, "epoch": 0.6592, "step": 1648, "tokens_per_device": 1564 }, { "epoch": 0.6592, "loss_ce": 0.19905465841293335, "loss_lvr": 1.1033726930618286, "loss_mode_switch": 0.0, "loss_total": 0.30939191579818726, "step": 1648 }, { "batch_size": 4, "epoch": 0.6592, "step": 1648, "tokens_per_device": 12376 }, { "epoch": 0.6592, "loss_ce": 0.2703714668750763, "loss_lvr": 0.9405708312988281, "loss_mode_switch": 0.0, "loss_total": 0.3644285500049591, "step": 1648 }, { "batch_size": 4, "epoch": 0.6592, "step": 1648, "tokens_per_device": 9576 }, { "epoch": 0.6592, "loss_ce": 0.037169620394706726, "loss_lvr": 0.7765867114067078, "loss_mode_switch": 0.0, "loss_total": 0.11482829600572586, "step": 1648 }, { "batch_size": 4, "epoch": 0.6592, "step": 1648, "tokens_per_device": 8636 }, { "epoch": 0.6592, "loss_ce": 0.10429748892784119, "loss_lvr": 0.6903572082519531, "loss_mode_switch": 0.0, "loss_total": 0.17333321273326874, "step": 1648 }, { "batch_size": 4, "epoch": 0.6592, "step": 1648, "tokens_per_device": 9812 }, { "epoch": 0.6592, "loss_ce": 0.040204476565122604, "loss_lvr": 0.8313494920730591, "loss_mode_switch": 0.0, "loss_total": 0.12333942949771881, "step": 1648 }, { "batch_size": 4, "epoch": 0.6592, "step": 1648, "tokens_per_device": 5028 }, { "epoch": 0.6592, "loss_ce": 0.3172731101512909, "loss_lvr": 0.7935349941253662, "loss_mode_switch": 0.0, "loss_total": 0.39662662148475647, "step": 1648 }, { "batch_size": 1, "epoch": 0.6592, "step": 1648, "tokens_per_device": 4611 }, { "epoch": 0.6592, "loss_ce": 0.031303953379392624, "loss_lvr": 0.3634379506111145, "loss_mode_switch": 0.0, "loss_total": 0.06764774769544601, "step": 1648 }, { "epoch": 0.6596, "grad_norm": 1.423043966293335, "learning_rate": 2.7430430107729144e-06, "loss": 0.3036, "step": 1649 }, { "batch_size": 1, "epoch": 0.6596, "step": 1649, "tokens_per_device": 4877 }, { "epoch": 0.6596, "loss_ce": 0.020284822210669518, "loss_lvr": 0.3818729519844055, "loss_mode_switch": 0.0, "loss_total": 0.05847211927175522, "step": 1649 }, { "batch_size": 4, "epoch": 0.6596, "step": 1649, "tokens_per_device": 4840 }, { "epoch": 0.6596, "loss_ce": 0.348753958940506, "loss_lvr": 0.7828469276428223, "loss_mode_switch": 0.0, "loss_total": 0.42703866958618164, "step": 1649 }, { "batch_size": 1, "epoch": 0.6596, "step": 1649, "tokens_per_device": 5106 }, { "epoch": 0.6596, "loss_ce": 0.005857823882251978, "loss_lvr": 0.14713676273822784, "loss_mode_switch": 0.0, "loss_total": 0.020571500062942505, "step": 1649 }, { "batch_size": 4, "epoch": 0.6596, "step": 1649, "tokens_per_device": 4032 }, { "epoch": 0.6596, "loss_ce": 0.859469473361969, "loss_lvr": 0.7564290165901184, "loss_mode_switch": 0.0, "loss_total": 0.9351123571395874, "step": 1649 }, { "batch_size": 4, "epoch": 0.6596, "step": 1649, "tokens_per_device": 5352 }, { "epoch": 0.6596, "loss_ce": 0.017949221655726433, "loss_lvr": 0.6616622805595398, "loss_mode_switch": 0.0, "loss_total": 0.08411545306444168, "step": 1649 }, { "batch_size": 4, "epoch": 0.6596, "step": 1649, "tokens_per_device": 4704 }, { "epoch": 0.6596, "loss_ce": 0.08004740625619888, "loss_lvr": 0.4730440080165863, "loss_mode_switch": 0.0, "loss_total": 0.1273518055677414, "step": 1649 }, { "batch_size": 4, "epoch": 0.6596, "step": 1649, "tokens_per_device": 3884 }, { "epoch": 0.6596, "loss_ce": 0.09038139134645462, "loss_lvr": 0.7882746458053589, "loss_mode_switch": 0.0, "loss_total": 0.1692088544368744, "step": 1649 }, { "batch_size": 4, "epoch": 0.6596, "step": 1649, "tokens_per_device": 12656 }, { "epoch": 0.6596, "loss_ce": 0.17591512203216553, "loss_lvr": 1.0648523569107056, "loss_mode_switch": 0.0, "loss_total": 0.28240036964416504, "step": 1649 }, { "epoch": 0.66, "grad_norm": 1.2120909690856934, "learning_rate": 2.7372648547773063e-06, "loss": 0.2573, "step": 1650 }, { "batch_size": 4, "epoch": 0.66, "step": 1650, "tokens_per_device": 16212 }, { "epoch": 0.66, "loss_ce": 0.026701519265770912, "loss_lvr": 0.4915168285369873, "loss_mode_switch": 0.0, "loss_total": 0.07585320621728897, "step": 1650 }, { "batch_size": 4, "epoch": 0.66, "step": 1650, "tokens_per_device": 3724 }, { "epoch": 0.66, "loss_ce": 0.12139579653739929, "loss_lvr": 0.6173415780067444, "loss_mode_switch": 0.0, "loss_total": 0.1831299513578415, "step": 1650 }, { "batch_size": 4, "epoch": 0.66, "step": 1650, "tokens_per_device": 12760 }, { "epoch": 0.66, "loss_ce": 0.22995558381080627, "loss_lvr": 0.6622846126556396, "loss_mode_switch": 0.0, "loss_total": 0.29618406295776367, "step": 1650 }, { "batch_size": 1, "epoch": 0.66, "step": 1650, "tokens_per_device": 5069 }, { "epoch": 0.66, "loss_ce": 0.43049007654190063, "loss_lvr": 0.42630141973495483, "loss_mode_switch": 0.0, "loss_total": 0.47312021255493164, "step": 1650 }, { "batch_size": 4, "epoch": 0.66, "step": 1650, "tokens_per_device": 1512 }, { "epoch": 0.66, "loss_ce": 0.17842425405979156, "loss_lvr": 0.9464790225028992, "loss_mode_switch": 0.0, "loss_total": 0.27307215332984924, "step": 1650 }, { "batch_size": 4, "epoch": 0.66, "step": 1650, "tokens_per_device": 4244 }, { "epoch": 0.66, "loss_ce": 0.5853615403175354, "loss_lvr": 0.9316849708557129, "loss_mode_switch": 0.0, "loss_total": 0.6785300374031067, "step": 1650 }, { "batch_size": 4, "epoch": 0.66, "step": 1650, "tokens_per_device": 4252 }, { "epoch": 0.66, "loss_ce": 0.039768390357494354, "loss_lvr": 0.9580504894256592, "loss_mode_switch": 0.0, "loss_total": 0.13557344675064087, "step": 1650 }, { "batch_size": 4, "epoch": 0.66, "step": 1650, "tokens_per_device": 6092 }, { "epoch": 0.66, "loss_ce": 0.34672626852989197, "loss_lvr": 0.719137966632843, "loss_mode_switch": 0.0, "loss_total": 0.4186400771141052, "step": 1650 }, { "epoch": 0.6604, "grad_norm": 1.4288995265960693, "learning_rate": 2.7314904963878397e-06, "loss": 0.3162, "step": 1651 }, { "batch_size": 1, "epoch": 0.6604, "step": 1651, "tokens_per_device": 5115 }, { "epoch": 0.6604, "loss_ce": 0.06640096753835678, "loss_lvr": 0.2601160705089569, "loss_mode_switch": 0.0, "loss_total": 0.09241257607936859, "step": 1651 }, { "batch_size": 1, "epoch": 0.6604, "step": 1651, "tokens_per_device": 4896 }, { "epoch": 0.6604, "loss_ce": 0.04044313356280327, "loss_lvr": 0.26795506477355957, "loss_mode_switch": 0.0, "loss_total": 0.06723864376544952, "step": 1651 }, { "batch_size": 4, "epoch": 0.6604, "step": 1651, "tokens_per_device": 4512 }, { "epoch": 0.6604, "loss_ce": 0.1569104939699173, "loss_lvr": 0.7657964825630188, "loss_mode_switch": 0.0, "loss_total": 0.23349013924598694, "step": 1651 }, { "batch_size": 4, "epoch": 0.6604, "step": 1651, "tokens_per_device": 1724 }, { "epoch": 0.6604, "loss_ce": 0.49114254117012024, "loss_lvr": 0.7714551091194153, "loss_mode_switch": 0.0, "loss_total": 0.5682880282402039, "step": 1651 }, { "batch_size": 1, "epoch": 0.6604, "step": 1651, "tokens_per_device": 4874 }, { "epoch": 0.6604, "loss_ce": 0.03854620084166527, "loss_lvr": 0.8914617300033569, "loss_mode_switch": 0.0, "loss_total": 0.12769237160682678, "step": 1651 }, { "batch_size": 4, "epoch": 0.6604, "step": 1651, "tokens_per_device": 9916 }, { "epoch": 0.6604, "loss_ce": 0.08941170573234558, "loss_lvr": 0.36885344982147217, "loss_mode_switch": 0.0, "loss_total": 0.12629705667495728, "step": 1651 }, { "batch_size": 4, "epoch": 0.6604, "step": 1651, "tokens_per_device": 1440 }, { "epoch": 0.6604, "loss_ce": 0.027278253808617592, "loss_lvr": 2.1361582279205322, "loss_mode_switch": 0.0, "loss_total": 0.24089407920837402, "step": 1651 }, { "batch_size": 4, "epoch": 0.6604, "step": 1651, "tokens_per_device": 4024 }, { "epoch": 0.6604, "loss_ce": 0.04250233992934227, "loss_lvr": 1.4982205629348755, "loss_mode_switch": 0.0, "loss_total": 0.19232439994812012, "step": 1651 }, { "epoch": 0.6608, "grad_norm": 1.3463308811187744, "learning_rate": 2.7257199452957693e-06, "loss": 0.2486, "step": 1652 }, { "batch_size": 1, "epoch": 0.6608, "step": 1652, "tokens_per_device": 5238 }, { "epoch": 0.6608, "loss_ce": 0.04806382209062576, "loss_lvr": 0.2603856325149536, "loss_mode_switch": 0.0, "loss_total": 0.07410238683223724, "step": 1652 }, { "batch_size": 4, "epoch": 0.6608, "step": 1652, "tokens_per_device": 2628 }, { "epoch": 0.6608, "loss_ce": 0.11638233810663223, "loss_lvr": 0.745593249797821, "loss_mode_switch": 0.0, "loss_total": 0.19094166159629822, "step": 1652 }, { "batch_size": 4, "epoch": 0.6608, "step": 1652, "tokens_per_device": 5136 }, { "epoch": 0.6608, "loss_ce": 0.1425926238298416, "loss_lvr": 0.7272880673408508, "loss_mode_switch": 0.0, "loss_total": 0.21532142162322998, "step": 1652 }, { "batch_size": 4, "epoch": 0.6608, "step": 1652, "tokens_per_device": 3880 }, { "epoch": 0.6608, "loss_ce": 0.16776807606220245, "loss_lvr": 0.4817826747894287, "loss_mode_switch": 0.0, "loss_total": 0.21594634652137756, "step": 1652 }, { "batch_size": 4, "epoch": 0.6608, "step": 1652, "tokens_per_device": 4280 }, { "epoch": 0.6608, "loss_ce": 0.09542367607355118, "loss_lvr": 0.8805311918258667, "loss_mode_switch": 0.0, "loss_total": 0.18347680568695068, "step": 1652 }, { "batch_size": 4, "epoch": 0.6608, "step": 1652, "tokens_per_device": 5768 }, { "epoch": 0.6608, "loss_ce": 0.15738050639629364, "loss_lvr": 0.8240853548049927, "loss_mode_switch": 0.0, "loss_total": 0.23978903889656067, "step": 1652 }, { "batch_size": 4, "epoch": 0.6608, "step": 1652, "tokens_per_device": 3708 }, { "epoch": 0.6608, "loss_ce": 0.4745675027370453, "loss_lvr": 0.8574374914169312, "loss_mode_switch": 0.0, "loss_total": 0.5603112578392029, "step": 1652 }, { "batch_size": 4, "epoch": 0.6608, "step": 1652, "tokens_per_device": 5128 }, { "epoch": 0.6608, "loss_ce": 0.4606735408306122, "loss_lvr": 0.7802883386611938, "loss_mode_switch": 0.0, "loss_total": 0.5387023687362671, "step": 1652 }, { "epoch": 0.6612, "grad_norm": 1.3933093547821045, "learning_rate": 2.719953211185957e-06, "loss": 0.2674, "step": 1653 }, { "batch_size": 4, "epoch": 0.6612, "step": 1653, "tokens_per_device": 5920 }, { "epoch": 0.6612, "loss_ce": 0.2894347012042999, "loss_lvr": 0.8702282905578613, "loss_mode_switch": 0.0, "loss_total": 0.376457542181015, "step": 1653 }, { "batch_size": 1, "epoch": 0.6612, "step": 1653, "tokens_per_device": 5256 }, { "epoch": 0.6612, "loss_ce": 0.0009114974527619779, "loss_lvr": 0.3745390772819519, "loss_mode_switch": 0.0, "loss_total": 0.038365405052900314, "step": 1653 }, { "batch_size": 4, "epoch": 0.6612, "step": 1653, "tokens_per_device": 2688 }, { "epoch": 0.6612, "loss_ce": 0.38638201355934143, "loss_lvr": 0.9247246384620667, "loss_mode_switch": 0.0, "loss_total": 0.4788544774055481, "step": 1653 }, { "batch_size": 4, "epoch": 0.6612, "step": 1653, "tokens_per_device": 4244 }, { "epoch": 0.6612, "loss_ce": 0.1980942040681839, "loss_lvr": 0.7461919188499451, "loss_mode_switch": 0.0, "loss_total": 0.27271339297294617, "step": 1653 }, { "batch_size": 4, "epoch": 0.6612, "step": 1653, "tokens_per_device": 4072 }, { "epoch": 0.6612, "loss_ce": 0.5766916871070862, "loss_lvr": 0.5014638304710388, "loss_mode_switch": 0.0, "loss_total": 0.6268380880355835, "step": 1653 }, { "batch_size": 4, "epoch": 0.6612, "step": 1653, "tokens_per_device": 5340 }, { "epoch": 0.6612, "loss_ce": 0.2308999001979828, "loss_lvr": 0.697809100151062, "loss_mode_switch": 0.0, "loss_total": 0.30068081617355347, "step": 1653 }, { "batch_size": 4, "epoch": 0.6612, "step": 1653, "tokens_per_device": 2548 }, { "epoch": 0.6612, "loss_ce": 0.01853054389357567, "loss_lvr": 0.8964277505874634, "loss_mode_switch": 0.0, "loss_total": 0.10817332565784454, "step": 1653 }, { "batch_size": 1, "epoch": 0.6612, "step": 1653, "tokens_per_device": 4888 }, { "epoch": 0.6612, "loss_ce": 0.12423215806484222, "loss_lvr": 1.621521234512329, "loss_mode_switch": 0.0, "loss_total": 0.2863842844963074, "step": 1653 }, { "epoch": 0.6616, "grad_norm": 1.4134217500686646, "learning_rate": 2.71419030373686e-06, "loss": 0.2697, "step": 1654 }, { "batch_size": 4, "epoch": 0.6616, "step": 1654, "tokens_per_device": 5764 }, { "epoch": 0.6616, "loss_ce": 0.40853404998779297, "loss_lvr": 1.014641284942627, "loss_mode_switch": 0.0, "loss_total": 0.5099982023239136, "step": 1654 }, { "batch_size": 4, "epoch": 0.6616, "step": 1654, "tokens_per_device": 1696 }, { "epoch": 0.6616, "loss_ce": 0.48873451352119446, "loss_lvr": 1.0960102081298828, "loss_mode_switch": 0.0, "loss_total": 0.5983355045318604, "step": 1654 }, { "batch_size": 1, "epoch": 0.6616, "step": 1654, "tokens_per_device": 4872 }, { "epoch": 0.6616, "loss_ce": 0.000563529203645885, "loss_lvr": 0.4276920557022095, "loss_mode_switch": 0.0, "loss_total": 0.04333273321390152, "step": 1654 }, { "batch_size": 1, "epoch": 0.6616, "step": 1654, "tokens_per_device": 6264 }, { "epoch": 0.6616, "loss_ce": 0.05572640895843506, "loss_lvr": 0.3517461121082306, "loss_mode_switch": 0.0, "loss_total": 0.09090101718902588, "step": 1654 }, { "batch_size": 1, "epoch": 0.6616, "step": 1654, "tokens_per_device": 5113 }, { "epoch": 0.6616, "loss_ce": 0.029984714463353157, "loss_lvr": 0.33647269010543823, "loss_mode_switch": 0.0, "loss_total": 0.06363198161125183, "step": 1654 }, { "batch_size": 4, "epoch": 0.6616, "step": 1654, "tokens_per_device": 1416 }, { "epoch": 0.6616, "loss_ce": 0.5890626907348633, "loss_lvr": 0.9470269083976746, "loss_mode_switch": 0.0, "loss_total": 0.6837654113769531, "step": 1654 }, { "batch_size": 4, "epoch": 0.6616, "step": 1654, "tokens_per_device": 5860 }, { "epoch": 0.6616, "loss_ce": 0.5771580934524536, "loss_lvr": 0.8389768004417419, "loss_mode_switch": 0.0, "loss_total": 0.6610558032989502, "step": 1654 }, { "batch_size": 4, "epoch": 0.6616, "step": 1654, "tokens_per_device": 4904 }, { "epoch": 0.6616, "loss_ce": 0.25281962752342224, "loss_lvr": 0.8233432769775391, "loss_mode_switch": 0.0, "loss_total": 0.3351539671421051, "step": 1654 }, { "epoch": 0.662, "grad_norm": 1.2733025550842285, "learning_rate": 2.7084312326205164e-06, "loss": 0.3129, "step": 1655 }, { "batch_size": 4, "epoch": 0.662, "step": 1655, "tokens_per_device": 2596 }, { "epoch": 0.662, "loss_ce": 0.15640662610530853, "loss_lvr": 1.1316626071929932, "loss_mode_switch": 0.0, "loss_total": 0.2695728838443756, "step": 1655 }, { "batch_size": 1, "epoch": 0.662, "step": 1655, "tokens_per_device": 5107 }, { "epoch": 0.662, "loss_ce": 0.013386482372879982, "loss_lvr": 0.31804966926574707, "loss_mode_switch": 0.0, "loss_total": 0.0451914519071579, "step": 1655 }, { "batch_size": 1, "epoch": 0.662, "step": 1655, "tokens_per_device": 5171 }, { "epoch": 0.662, "loss_ce": 0.11831774562597275, "loss_lvr": 0.3890584409236908, "loss_mode_switch": 0.0, "loss_total": 0.15722358226776123, "step": 1655 }, { "batch_size": 1, "epoch": 0.662, "step": 1655, "tokens_per_device": 5001 }, { "epoch": 0.662, "loss_ce": 0.1651570200920105, "loss_lvr": 0.25970402359962463, "loss_mode_switch": 0.0, "loss_total": 0.19112741947174072, "step": 1655 }, { "batch_size": 4, "epoch": 0.662, "step": 1655, "tokens_per_device": 5148 }, { "epoch": 0.662, "loss_ce": 0.2771560847759247, "loss_lvr": 0.7637750506401062, "loss_mode_switch": 0.0, "loss_total": 0.3535335958003998, "step": 1655 }, { "batch_size": 1, "epoch": 0.662, "step": 1655, "tokens_per_device": 5103 }, { "epoch": 0.662, "loss_ce": 0.0022185416892170906, "loss_lvr": 0.45681482553482056, "loss_mode_switch": 0.0, "loss_total": 0.04790002480149269, "step": 1655 }, { "batch_size": 4, "epoch": 0.662, "step": 1655, "tokens_per_device": 6656 }, { "epoch": 0.662, "loss_ce": 0.05372173711657524, "loss_lvr": 0.9402877688407898, "loss_mode_switch": 0.0, "loss_total": 0.14775051176548004, "step": 1655 }, { "batch_size": 4, "epoch": 0.662, "step": 1655, "tokens_per_device": 2720 }, { "epoch": 0.662, "loss_ce": 0.03724994137883186, "loss_lvr": 0.9177600145339966, "loss_mode_switch": 0.0, "loss_total": 0.12902595102787018, "step": 1655 }, { "epoch": 0.6624, "grad_norm": 1.2502862215042114, "learning_rate": 2.7026760075025195e-06, "loss": 0.2903, "step": 1656 }, { "batch_size": 1, "epoch": 0.6624, "step": 1656, "tokens_per_device": 5103 }, { "epoch": 0.6624, "loss_ce": 0.049417831003665924, "loss_lvr": 0.6035789251327515, "loss_mode_switch": 0.0, "loss_total": 0.10977572202682495, "step": 1656 }, { "batch_size": 4, "epoch": 0.6624, "step": 1656, "tokens_per_device": 6448 }, { "epoch": 0.6624, "loss_ce": 0.18739329278469086, "loss_lvr": 0.7725397348403931, "loss_mode_switch": 0.0, "loss_total": 0.2646472752094269, "step": 1656 }, { "batch_size": 1, "epoch": 0.6624, "step": 1656, "tokens_per_device": 4906 }, { "epoch": 0.6624, "loss_ce": 0.015282279811799526, "loss_lvr": 0.22337651252746582, "loss_mode_switch": 0.0, "loss_total": 0.037619929760694504, "step": 1656 }, { "batch_size": 4, "epoch": 0.6624, "step": 1656, "tokens_per_device": 4904 }, { "epoch": 0.6624, "loss_ce": 0.12814132869243622, "loss_lvr": 0.7436153888702393, "loss_mode_switch": 0.0, "loss_total": 0.20250287652015686, "step": 1656 }, { "batch_size": 1, "epoch": 0.6624, "step": 1656, "tokens_per_device": 5146 }, { "epoch": 0.6624, "loss_ce": 0.014382004737854004, "loss_lvr": 0.19342316687107086, "loss_mode_switch": 0.0, "loss_total": 0.03372432291507721, "step": 1656 }, { "batch_size": 4, "epoch": 0.6624, "step": 1656, "tokens_per_device": 3384 }, { "epoch": 0.6624, "loss_ce": 0.32054635882377625, "loss_lvr": 1.674076795578003, "loss_mode_switch": 0.0, "loss_total": 0.4879540205001831, "step": 1656 }, { "batch_size": 4, "epoch": 0.6624, "step": 1656, "tokens_per_device": 2088 }, { "epoch": 0.6624, "loss_ce": 0.16988898813724518, "loss_lvr": 0.7833005785942078, "loss_mode_switch": 0.0, "loss_total": 0.24821904301643372, "step": 1656 }, { "batch_size": 4, "epoch": 0.6624, "step": 1656, "tokens_per_device": 4272 }, { "epoch": 0.6624, "loss_ce": 0.26645326614379883, "loss_lvr": 0.8445202708244324, "loss_mode_switch": 0.0, "loss_total": 0.35090529918670654, "step": 1656 }, { "epoch": 0.6628, "grad_norm": 1.4361492395401, "learning_rate": 2.6969246380420088e-06, "loss": 0.2702, "step": 1657 }, { "batch_size": 4, "epoch": 0.6628, "step": 1657, "tokens_per_device": 4956 }, { "epoch": 0.6628, "loss_ce": 0.050408802926540375, "loss_lvr": 0.7534893751144409, "loss_mode_switch": 0.0, "loss_total": 0.12575773894786835, "step": 1657 }, { "batch_size": 4, "epoch": 0.6628, "step": 1657, "tokens_per_device": 2552 }, { "epoch": 0.6628, "loss_ce": 0.1558840572834015, "loss_lvr": 0.8436473608016968, "loss_mode_switch": 0.0, "loss_total": 0.24024879932403564, "step": 1657 }, { "batch_size": 4, "epoch": 0.6628, "step": 1657, "tokens_per_device": 2664 }, { "epoch": 0.6628, "loss_ce": 0.42630860209465027, "loss_lvr": 0.8957122564315796, "loss_mode_switch": 0.0, "loss_total": 0.5158798098564148, "step": 1657 }, { "batch_size": 1, "epoch": 0.6628, "step": 1657, "tokens_per_device": 6186 }, { "epoch": 0.6628, "loss_ce": 0.034994252026081085, "loss_lvr": 0.3324498236179352, "loss_mode_switch": 0.0, "loss_total": 0.0682392343878746, "step": 1657 }, { "batch_size": 1, "epoch": 0.6628, "step": 1657, "tokens_per_device": 4831 }, { "epoch": 0.6628, "loss_ce": 0.08364526182413101, "loss_lvr": 0.3819928467273712, "loss_mode_switch": 0.0, "loss_total": 0.12184454500675201, "step": 1657 }, { "batch_size": 4, "epoch": 0.6628, "step": 1657, "tokens_per_device": 1680 }, { "epoch": 0.6628, "loss_ce": 0.4040515422821045, "loss_lvr": 0.9829527735710144, "loss_mode_switch": 0.0, "loss_total": 0.5023468136787415, "step": 1657 }, { "batch_size": 1, "epoch": 0.6628, "step": 1657, "tokens_per_device": 4843 }, { "epoch": 0.6628, "loss_ce": 0.13944672048091888, "loss_lvr": 0.38389021158218384, "loss_mode_switch": 0.0, "loss_total": 0.17783574759960175, "step": 1657 }, { "batch_size": 1, "epoch": 0.6628, "step": 1657, "tokens_per_device": 5466 }, { "epoch": 0.6628, "loss_ce": 0.3280960023403168, "loss_lvr": 0.5785990953445435, "loss_mode_switch": 0.0, "loss_total": 0.38595589995384216, "step": 1657 }, { "epoch": 0.6632, "grad_norm": 1.5033849477767944, "learning_rate": 2.691177133891658e-06, "loss": 0.2993, "step": 1658 }, { "batch_size": 1, "epoch": 0.6632, "step": 1658, "tokens_per_device": 4164 }, { "epoch": 0.6632, "loss_ce": 0.0006710097659379244, "loss_lvr": 0.3370840847492218, "loss_mode_switch": 0.0, "loss_total": 0.034379418939352036, "step": 1658 }, { "batch_size": 4, "epoch": 0.6632, "step": 1658, "tokens_per_device": 5176 }, { "epoch": 0.6632, "loss_ce": 0.17672011256217957, "loss_lvr": 0.9993235468864441, "loss_mode_switch": 0.0, "loss_total": 0.276652455329895, "step": 1658 }, { "batch_size": 4, "epoch": 0.6632, "step": 1658, "tokens_per_device": 1444 }, { "epoch": 0.6632, "loss_ce": 0.26762259006500244, "loss_lvr": 1.0579655170440674, "loss_mode_switch": 0.0, "loss_total": 0.3734191358089447, "step": 1658 }, { "batch_size": 4, "epoch": 0.6632, "step": 1658, "tokens_per_device": 3576 }, { "epoch": 0.6632, "loss_ce": 0.43925097584724426, "loss_lvr": 0.9064416885375977, "loss_mode_switch": 0.0, "loss_total": 0.5298951268196106, "step": 1658 }, { "batch_size": 4, "epoch": 0.6632, "step": 1658, "tokens_per_device": 2560 }, { "epoch": 0.6632, "loss_ce": 0.3437376320362091, "loss_lvr": 0.8464689254760742, "loss_mode_switch": 0.0, "loss_total": 0.42838454246520996, "step": 1658 }, { "batch_size": 4, "epoch": 0.6632, "step": 1658, "tokens_per_device": 5072 }, { "epoch": 0.6632, "loss_ce": 0.06058830767869949, "loss_lvr": 0.7628021836280823, "loss_mode_switch": 0.0, "loss_total": 0.13686853647232056, "step": 1658 }, { "batch_size": 4, "epoch": 0.6632, "step": 1658, "tokens_per_device": 4544 }, { "epoch": 0.6632, "loss_ce": 0.0020669957157224417, "loss_lvr": 0.6510509252548218, "loss_mode_switch": 0.0, "loss_total": 0.0671720877289772, "step": 1658 }, { "batch_size": 4, "epoch": 0.6632, "step": 1658, "tokens_per_device": 8708 }, { "epoch": 0.6632, "loss_ce": 0.11680345237255096, "loss_lvr": 0.8680906891822815, "loss_mode_switch": 0.0, "loss_total": 0.2036125212907791, "step": 1658 }, { "epoch": 0.6636, "grad_norm": 1.2539887428283691, "learning_rate": 2.685433504697647e-06, "loss": 0.2793, "step": 1659 }, { "batch_size": 1, "epoch": 0.6636, "step": 1659, "tokens_per_device": 4911 }, { "epoch": 0.6636, "loss_ce": 0.0002371586742810905, "loss_lvr": 0.4979017376899719, "loss_mode_switch": 0.0, "loss_total": 0.050027333199977875, "step": 1659 }, { "batch_size": 1, "epoch": 0.6636, "step": 1659, "tokens_per_device": 5466 }, { "epoch": 0.6636, "loss_ce": 0.0006021871813572943, "loss_lvr": 0.361685574054718, "loss_mode_switch": 0.0, "loss_total": 0.036770742386579514, "step": 1659 }, { "batch_size": 1, "epoch": 0.6636, "step": 1659, "tokens_per_device": 4958 }, { "epoch": 0.6636, "loss_ce": 0.00548303360119462, "loss_lvr": 0.21890988945960999, "loss_mode_switch": 0.0, "loss_total": 0.0273740217089653, "step": 1659 }, { "batch_size": 4, "epoch": 0.6636, "step": 1659, "tokens_per_device": 3828 }, { "epoch": 0.6636, "loss_ce": 0.2163509726524353, "loss_lvr": 1.0748403072357178, "loss_mode_switch": 0.0, "loss_total": 0.32383501529693604, "step": 1659 }, { "batch_size": 4, "epoch": 0.6636, "step": 1659, "tokens_per_device": 14720 }, { "epoch": 0.6636, "loss_ce": 0.08058822900056839, "loss_lvr": 0.7137993574142456, "loss_mode_switch": 0.0, "loss_total": 0.15196816623210907, "step": 1659 }, { "batch_size": 4, "epoch": 0.6636, "step": 1659, "tokens_per_device": 5792 }, { "epoch": 0.6636, "loss_ce": 0.2855628728866577, "loss_lvr": 0.7152634859085083, "loss_mode_switch": 0.0, "loss_total": 0.35708922147750854, "step": 1659 }, { "batch_size": 1, "epoch": 0.6636, "step": 1659, "tokens_per_device": 4869 }, { "epoch": 0.6636, "loss_ce": 0.008192047476768494, "loss_lvr": 0.26716023683547974, "loss_mode_switch": 0.0, "loss_total": 0.03490807116031647, "step": 1659 }, { "batch_size": 4, "epoch": 0.6636, "step": 1659, "tokens_per_device": 7472 }, { "epoch": 0.6636, "loss_ce": 0.0872223898768425, "loss_lvr": 0.8457421064376831, "loss_mode_switch": 0.0, "loss_total": 0.17179660499095917, "step": 1659 }, { "epoch": 0.664, "grad_norm": 1.6420453786849976, "learning_rate": 2.6796937600996587e-06, "loss": 0.3107, "step": 1660 }, { "batch_size": 4, "epoch": 0.664, "step": 1660, "tokens_per_device": 3832 }, { "epoch": 0.664, "loss_ce": 0.1374405026435852, "loss_lvr": 0.6003324389457703, "loss_mode_switch": 0.0, "loss_total": 0.19747374951839447, "step": 1660 }, { "batch_size": 4, "epoch": 0.664, "step": 1660, "tokens_per_device": 3864 }, { "epoch": 0.664, "loss_ce": 0.1734568327665329, "loss_lvr": 0.9939014315605164, "loss_mode_switch": 0.0, "loss_total": 0.2728469669818878, "step": 1660 }, { "batch_size": 4, "epoch": 0.664, "step": 1660, "tokens_per_device": 1412 }, { "epoch": 0.664, "loss_ce": 0.15729406476020813, "loss_lvr": 0.8270297646522522, "loss_mode_switch": 0.0, "loss_total": 0.2399970442056656, "step": 1660 }, { "batch_size": 4, "epoch": 0.664, "step": 1660, "tokens_per_device": 1292 }, { "epoch": 0.664, "loss_ce": 0.7317952513694763, "loss_lvr": 0.9660143256187439, "loss_mode_switch": 0.0, "loss_total": 0.8283966779708862, "step": 1660 }, { "batch_size": 4, "epoch": 0.664, "step": 1660, "tokens_per_device": 3740 }, { "epoch": 0.664, "loss_ce": 0.38669589161872864, "loss_lvr": 0.8520925641059875, "loss_mode_switch": 0.0, "loss_total": 0.4719051420688629, "step": 1660 }, { "batch_size": 1, "epoch": 0.664, "step": 1660, "tokens_per_device": 4885 }, { "epoch": 0.664, "loss_ce": 0.011384417302906513, "loss_lvr": 0.2753998339176178, "loss_mode_switch": 0.0, "loss_total": 0.03892439976334572, "step": 1660 }, { "batch_size": 4, "epoch": 0.664, "step": 1660, "tokens_per_device": 8256 }, { "epoch": 0.664, "loss_ce": 0.5343356132507324, "loss_lvr": 0.5223104357719421, "loss_mode_switch": 0.0, "loss_total": 0.586566686630249, "step": 1660 }, { "batch_size": 4, "epoch": 0.664, "step": 1660, "tokens_per_device": 2808 }, { "epoch": 0.664, "loss_ce": 0.26263102889060974, "loss_lvr": 0.5975489616394043, "loss_mode_switch": 0.0, "loss_total": 0.3223859369754791, "step": 1660 }, { "epoch": 0.6644, "grad_norm": 1.3227119445800781, "learning_rate": 2.67395790973085e-06, "loss": 0.2652, "step": 1661 }, { "batch_size": 4, "epoch": 0.6644, "step": 1661, "tokens_per_device": 1888 }, { "epoch": 0.6644, "loss_ce": 0.7705737352371216, "loss_lvr": 0.9091429710388184, "loss_mode_switch": 0.0, "loss_total": 0.8614880442619324, "step": 1661 }, { "batch_size": 4, "epoch": 0.6644, "step": 1661, "tokens_per_device": 5560 }, { "epoch": 0.6644, "loss_ce": 0.1503005176782608, "loss_lvr": 1.1364690065383911, "loss_mode_switch": 0.0, "loss_total": 0.26394742727279663, "step": 1661 }, { "batch_size": 4, "epoch": 0.6644, "step": 1661, "tokens_per_device": 2620 }, { "epoch": 0.6644, "loss_ce": 0.5307214260101318, "loss_lvr": 0.9207984209060669, "loss_mode_switch": 0.0, "loss_total": 0.6228012442588806, "step": 1661 }, { "batch_size": 4, "epoch": 0.6644, "step": 1661, "tokens_per_device": 4728 }, { "epoch": 0.6644, "loss_ce": 0.3686721920967102, "loss_lvr": 0.8372697234153748, "loss_mode_switch": 0.0, "loss_total": 0.4523991644382477, "step": 1661 }, { "batch_size": 1, "epoch": 0.6644, "step": 1661, "tokens_per_device": 4873 }, { "epoch": 0.6644, "loss_ce": 0.021930547431111336, "loss_lvr": 0.18348997831344604, "loss_mode_switch": 0.0, "loss_total": 0.04027954488992691, "step": 1661 }, { "batch_size": 4, "epoch": 0.6644, "step": 1661, "tokens_per_device": 4332 }, { "epoch": 0.6644, "loss_ce": 0.14184917509555817, "loss_lvr": 0.9530852437019348, "loss_mode_switch": 0.0, "loss_total": 0.2371577024459839, "step": 1661 }, { "batch_size": 1, "epoch": 0.6644, "step": 1661, "tokens_per_device": 7208 }, { "epoch": 0.6644, "loss_ce": 0.07433093339204788, "loss_lvr": 0.3318117558956146, "loss_mode_switch": 0.0, "loss_total": 0.10751210898160934, "step": 1661 }, { "batch_size": 1, "epoch": 0.6644, "step": 1661, "tokens_per_device": 4976 }, { "epoch": 0.6644, "loss_ce": 0.29995545744895935, "loss_lvr": 0.5741848349571228, "loss_mode_switch": 0.0, "loss_total": 0.3573739528656006, "step": 1661 }, { "epoch": 0.6648, "grad_norm": 1.6274950504302979, "learning_rate": 2.668225963217844e-06, "loss": 0.332, "step": 1662 }, { "batch_size": 4, "epoch": 0.6648, "step": 1662, "tokens_per_device": 1676 }, { "epoch": 0.6648, "loss_ce": 0.751950204372406, "loss_lvr": 0.9189547896385193, "loss_mode_switch": 0.0, "loss_total": 0.8438456654548645, "step": 1662 }, { "batch_size": 4, "epoch": 0.6648, "step": 1662, "tokens_per_device": 3860 }, { "epoch": 0.6648, "loss_ce": 0.6406406760215759, "loss_lvr": 0.8236667513847351, "loss_mode_switch": 0.0, "loss_total": 0.723007321357727, "step": 1662 }, { "batch_size": 4, "epoch": 0.6648, "step": 1662, "tokens_per_device": 10900 }, { "epoch": 0.6648, "loss_ce": 0.25883978605270386, "loss_lvr": 0.5329465866088867, "loss_mode_switch": 0.0, "loss_total": 0.31213444471359253, "step": 1662 }, { "batch_size": 1, "epoch": 0.6648, "step": 1662, "tokens_per_device": 4896 }, { "epoch": 0.6648, "loss_ce": 0.03410583361983299, "loss_lvr": 0.8140403628349304, "loss_mode_switch": 0.0, "loss_total": 0.11550986766815186, "step": 1662 }, { "batch_size": 4, "epoch": 0.6648, "step": 1662, "tokens_per_device": 6208 }, { "epoch": 0.6648, "loss_ce": 0.08446354418992996, "loss_lvr": 0.6857671141624451, "loss_mode_switch": 0.0, "loss_total": 0.15304026007652283, "step": 1662 }, { "batch_size": 4, "epoch": 0.6648, "step": 1662, "tokens_per_device": 6244 }, { "epoch": 0.6648, "loss_ce": 0.0963965654373169, "loss_lvr": 0.5868720412254333, "loss_mode_switch": 0.0, "loss_total": 0.1550837755203247, "step": 1662 }, { "batch_size": 4, "epoch": 0.6648, "step": 1662, "tokens_per_device": 4512 }, { "epoch": 0.6648, "loss_ce": 0.0325382761657238, "loss_lvr": 0.7230223417282104, "loss_mode_switch": 0.0, "loss_total": 0.10484051704406738, "step": 1662 }, { "batch_size": 1, "epoch": 0.6648, "step": 1662, "tokens_per_device": 5101 }, { "epoch": 0.6648, "loss_ce": 0.010996103286743164, "loss_lvr": 0.3679269552230835, "loss_mode_switch": 0.0, "loss_total": 0.047788798809051514, "step": 1662 }, { "epoch": 0.6652, "grad_norm": 1.4378764629364014, "learning_rate": 2.662497930180715e-06, "loss": 0.3223, "step": 1663 }, { "batch_size": 4, "epoch": 0.6652, "step": 1663, "tokens_per_device": 5648 }, { "epoch": 0.6652, "loss_ce": 0.8417845368385315, "loss_lvr": 0.829304575920105, "loss_mode_switch": 0.0, "loss_total": 0.924714982509613, "step": 1663 }, { "batch_size": 4, "epoch": 0.6652, "step": 1663, "tokens_per_device": 4276 }, { "epoch": 0.6652, "loss_ce": 0.28834274411201477, "loss_lvr": 0.7147474884986877, "loss_mode_switch": 0.0, "loss_total": 0.3598175048828125, "step": 1663 }, { "batch_size": 4, "epoch": 0.6652, "step": 1663, "tokens_per_device": 4848 }, { "epoch": 0.6652, "loss_ce": 0.4138522148132324, "loss_lvr": 0.7611791491508484, "loss_mode_switch": 0.0, "loss_total": 0.4899701476097107, "step": 1663 }, { "batch_size": 4, "epoch": 0.6652, "step": 1663, "tokens_per_device": 1436 }, { "epoch": 0.6652, "loss_ce": 0.43326157331466675, "loss_lvr": 1.1489120721817017, "loss_mode_switch": 0.0, "loss_total": 0.5481528043746948, "step": 1663 }, { "batch_size": 4, "epoch": 0.6652, "step": 1663, "tokens_per_device": 2404 }, { "epoch": 0.6652, "loss_ce": 0.5590890645980835, "loss_lvr": 0.9826940298080444, "loss_mode_switch": 0.0, "loss_total": 0.6573584675788879, "step": 1663 }, { "batch_size": 4, "epoch": 0.6652, "step": 1663, "tokens_per_device": 2524 }, { "epoch": 0.6652, "loss_ce": 0.4120456576347351, "loss_lvr": 1.0070464611053467, "loss_mode_switch": 0.0, "loss_total": 0.5127503275871277, "step": 1663 }, { "batch_size": 4, "epoch": 0.6652, "step": 1663, "tokens_per_device": 3852 }, { "epoch": 0.6652, "loss_ce": 0.04662308469414711, "loss_lvr": 0.8409003019332886, "loss_mode_switch": 0.0, "loss_total": 0.13071312010288239, "step": 1663 }, { "batch_size": 4, "epoch": 0.6652, "step": 1663, "tokens_per_device": 3796 }, { "epoch": 0.6652, "loss_ce": 0.23514685034751892, "loss_lvr": 0.9598379135131836, "loss_mode_switch": 0.0, "loss_total": 0.33113065361976624, "step": 1663 }, { "epoch": 0.6656, "grad_norm": 1.578210711479187, "learning_rate": 2.6567738202329684e-06, "loss": 0.2997, "step": 1664 }, { "batch_size": 4, "epoch": 0.6656, "step": 1664, "tokens_per_device": 4212 }, { "epoch": 0.6656, "loss_ce": 0.09828121960163116, "loss_lvr": 1.0627412796020508, "loss_mode_switch": 0.0, "loss_total": 0.20455534756183624, "step": 1664 }, { "batch_size": 4, "epoch": 0.6656, "step": 1664, "tokens_per_device": 5940 }, { "epoch": 0.6656, "loss_ce": 0.12758418917655945, "loss_lvr": 0.6305144429206848, "loss_mode_switch": 0.0, "loss_total": 0.19063563644886017, "step": 1664 }, { "batch_size": 4, "epoch": 0.6656, "step": 1664, "tokens_per_device": 1184 }, { "epoch": 0.6656, "loss_ce": 0.1623382270336151, "loss_lvr": 1.110960602760315, "loss_mode_switch": 0.0, "loss_total": 0.27343428134918213, "step": 1664 }, { "batch_size": 4, "epoch": 0.6656, "step": 1664, "tokens_per_device": 1376 }, { "epoch": 0.6656, "loss_ce": 0.3409343659877777, "loss_lvr": 0.9389601349830627, "loss_mode_switch": 0.0, "loss_total": 0.43483036756515503, "step": 1664 }, { "batch_size": 4, "epoch": 0.6656, "step": 1664, "tokens_per_device": 2924 }, { "epoch": 0.6656, "loss_ce": 0.021924659609794617, "loss_lvr": 0.8112665414810181, "loss_mode_switch": 0.0, "loss_total": 0.1030513122677803, "step": 1664 }, { "batch_size": 4, "epoch": 0.6656, "step": 1664, "tokens_per_device": 4668 }, { "epoch": 0.6656, "loss_ce": 0.17666184902191162, "loss_lvr": 0.8228522539138794, "loss_mode_switch": 0.0, "loss_total": 0.25894707441329956, "step": 1664 }, { "batch_size": 1, "epoch": 0.6656, "step": 1664, "tokens_per_device": 5133 }, { "epoch": 0.6656, "loss_ce": 0.05377105623483658, "loss_lvr": 0.40953582525253296, "loss_mode_switch": 0.0, "loss_total": 0.094724640250206, "step": 1664 }, { "batch_size": 4, "epoch": 0.6656, "step": 1664, "tokens_per_device": 1500 }, { "epoch": 0.6656, "loss_ce": 0.6533191204071045, "loss_lvr": 0.7682216167449951, "loss_mode_switch": 0.0, "loss_total": 0.730141282081604, "step": 1664 }, { "epoch": 0.666, "grad_norm": 1.277389645576477, "learning_rate": 2.6510536429815224e-06, "loss": 0.2742, "step": 1665 }, { "batch_size": 4, "epoch": 0.666, "step": 1665, "tokens_per_device": 4352 }, { "epoch": 0.666, "loss_ce": 0.5525189638137817, "loss_lvr": 0.8700913786888123, "loss_mode_switch": 0.0, "loss_total": 0.6395280957221985, "step": 1665 }, { "batch_size": 1, "epoch": 0.666, "step": 1665, "tokens_per_device": 4874 }, { "epoch": 0.666, "loss_ce": 0.005935211665928364, "loss_lvr": 0.23316021263599396, "loss_mode_switch": 0.0, "loss_total": 0.029251232743263245, "step": 1665 }, { "batch_size": 4, "epoch": 0.666, "step": 1665, "tokens_per_device": 12644 }, { "epoch": 0.666, "loss_ce": 0.15096548199653625, "loss_lvr": 0.6367199420928955, "loss_mode_switch": 0.0, "loss_total": 0.21463748812675476, "step": 1665 }, { "batch_size": 4, "epoch": 0.666, "step": 1665, "tokens_per_device": 4384 }, { "epoch": 0.666, "loss_ce": 0.023279262706637383, "loss_lvr": 0.7126429080963135, "loss_mode_switch": 0.0, "loss_total": 0.09454355388879776, "step": 1665 }, { "batch_size": 4, "epoch": 0.666, "step": 1665, "tokens_per_device": 5732 }, { "epoch": 0.666, "loss_ce": 0.003044778248295188, "loss_lvr": 0.4139527380466461, "loss_mode_switch": 0.0, "loss_total": 0.044440049678087234, "step": 1665 }, { "batch_size": 1, "epoch": 0.666, "step": 1665, "tokens_per_device": 5073 }, { "epoch": 0.666, "loss_ce": 0.002070059534162283, "loss_lvr": 0.24418412148952484, "loss_mode_switch": 0.0, "loss_total": 0.026488471776247025, "step": 1665 }, { "batch_size": 4, "epoch": 0.666, "step": 1665, "tokens_per_device": 6000 }, { "epoch": 0.666, "loss_ce": 0.22263331711292267, "loss_lvr": 0.8216849565505981, "loss_mode_switch": 0.0, "loss_total": 0.3048018217086792, "step": 1665 }, { "batch_size": 4, "epoch": 0.666, "step": 1665, "tokens_per_device": 9236 }, { "epoch": 0.666, "loss_ce": 0.38677534461021423, "loss_lvr": 0.6629906892776489, "loss_mode_switch": 0.0, "loss_total": 0.4530744254589081, "step": 1665 }, { "epoch": 0.6664, "grad_norm": 1.277658224105835, "learning_rate": 2.6453374080266947e-06, "loss": 0.2662, "step": 1666 }, { "batch_size": 4, "epoch": 0.6664, "step": 1666, "tokens_per_device": 4292 }, { "epoch": 0.6664, "loss_ce": 0.4170069694519043, "loss_lvr": 1.0127654075622559, "loss_mode_switch": 0.0, "loss_total": 0.518283486366272, "step": 1666 }, { "batch_size": 4, "epoch": 0.6664, "step": 1666, "tokens_per_device": 4804 }, { "epoch": 0.6664, "loss_ce": 0.3443143665790558, "loss_lvr": 0.46535995602607727, "loss_mode_switch": 0.0, "loss_total": 0.39085036516189575, "step": 1666 }, { "batch_size": 4, "epoch": 0.6664, "step": 1666, "tokens_per_device": 4460 }, { "epoch": 0.6664, "loss_ce": 0.5414130091667175, "loss_lvr": 1.0937479734420776, "loss_mode_switch": 0.0, "loss_total": 0.6507878303527832, "step": 1666 }, { "batch_size": 1, "epoch": 0.6664, "step": 1666, "tokens_per_device": 4858 }, { "epoch": 0.6664, "loss_ce": 0.283869206905365, "loss_lvr": 0.5730423331260681, "loss_mode_switch": 0.0, "loss_total": 0.3411734402179718, "step": 1666 }, { "batch_size": 1, "epoch": 0.6664, "step": 1666, "tokens_per_device": 4730 }, { "epoch": 0.6664, "loss_ce": 0.00018064315372612327, "loss_lvr": 0.29767292737960815, "loss_mode_switch": 0.0, "loss_total": 0.029947936534881592, "step": 1666 }, { "batch_size": 4, "epoch": 0.6664, "step": 1666, "tokens_per_device": 4196 }, { "epoch": 0.6664, "loss_ce": 0.4037107527256012, "loss_lvr": 0.8197965621948242, "loss_mode_switch": 0.0, "loss_total": 0.4856904149055481, "step": 1666 }, { "batch_size": 4, "epoch": 0.6664, "step": 1666, "tokens_per_device": 5868 }, { "epoch": 0.6664, "loss_ce": 0.21776148676872253, "loss_lvr": 0.6784245371818542, "loss_mode_switch": 0.0, "loss_total": 0.28560394048690796, "step": 1666 }, { "batch_size": 4, "epoch": 0.6664, "step": 1666, "tokens_per_device": 4256 }, { "epoch": 0.6664, "loss_ce": 0.5166294574737549, "loss_lvr": 0.9824328422546387, "loss_mode_switch": 0.0, "loss_total": 0.6148727536201477, "step": 1666 }, { "epoch": 0.6668, "grad_norm": 1.5390031337738037, "learning_rate": 2.639625124962192e-06, "loss": 0.3367, "step": 1667 }, { "batch_size": 1, "epoch": 0.6668, "step": 1667, "tokens_per_device": 4896 }, { "epoch": 0.6668, "loss_ce": 0.21834278106689453, "loss_lvr": 0.6500478982925415, "loss_mode_switch": 0.0, "loss_total": 0.28334757685661316, "step": 1667 }, { "batch_size": 4, "epoch": 0.6668, "step": 1667, "tokens_per_device": 5848 }, { "epoch": 0.6668, "loss_ce": 0.184463769197464, "loss_lvr": 0.9619571566581726, "loss_mode_switch": 0.0, "loss_total": 0.2806594967842102, "step": 1667 }, { "batch_size": 4, "epoch": 0.6668, "step": 1667, "tokens_per_device": 15232 }, { "epoch": 0.6668, "loss_ce": 0.3276939392089844, "loss_lvr": 0.7802294492721558, "loss_mode_switch": 0.0, "loss_total": 0.4057168960571289, "step": 1667 }, { "batch_size": 4, "epoch": 0.6668, "step": 1667, "tokens_per_device": 1400 }, { "epoch": 0.6668, "loss_ce": 0.2404462844133377, "loss_lvr": 1.0620285272598267, "loss_mode_switch": 0.0, "loss_total": 0.3466491401195526, "step": 1667 }, { "batch_size": 1, "epoch": 0.6668, "step": 1667, "tokens_per_device": 4865 }, { "epoch": 0.6668, "loss_ce": 0.00038133631460368633, "loss_lvr": 0.33206379413604736, "loss_mode_switch": 0.0, "loss_total": 0.03358771651983261, "step": 1667 }, { "batch_size": 4, "epoch": 0.6668, "step": 1667, "tokens_per_device": 4696 }, { "epoch": 0.6668, "loss_ce": 0.0880047082901001, "loss_lvr": 0.8634339570999146, "loss_mode_switch": 0.0, "loss_total": 0.1743481159210205, "step": 1667 }, { "batch_size": 4, "epoch": 0.6668, "step": 1667, "tokens_per_device": 1468 }, { "epoch": 0.6668, "loss_ce": 0.240301713347435, "loss_lvr": 1.012714147567749, "loss_mode_switch": 0.0, "loss_total": 0.3415731191635132, "step": 1667 }, { "batch_size": 4, "epoch": 0.6668, "step": 1667, "tokens_per_device": 3792 }, { "epoch": 0.6668, "loss_ce": 0.1878872513771057, "loss_lvr": 1.0299714803695679, "loss_mode_switch": 0.0, "loss_total": 0.290884405374527, "step": 1667 }, { "epoch": 0.6672, "grad_norm": 1.3845765590667725, "learning_rate": 2.63391680337508e-06, "loss": 0.2916, "step": 1668 }, { "batch_size": 4, "epoch": 0.6672, "step": 1668, "tokens_per_device": 4592 }, { "epoch": 0.6672, "loss_ce": 0.2795969247817993, "loss_lvr": 0.7356548309326172, "loss_mode_switch": 0.0, "loss_total": 0.35316240787506104, "step": 1668 }, { "batch_size": 4, "epoch": 0.6672, "step": 1668, "tokens_per_device": 4012 }, { "epoch": 0.6672, "loss_ce": 0.37140583992004395, "loss_lvr": 0.87089604139328, "loss_mode_switch": 0.0, "loss_total": 0.45849543809890747, "step": 1668 }, { "batch_size": 4, "epoch": 0.6672, "step": 1668, "tokens_per_device": 1648 }, { "epoch": 0.6672, "loss_ce": 0.7284088134765625, "loss_lvr": 0.9758971929550171, "loss_mode_switch": 0.0, "loss_total": 0.8259985446929932, "step": 1668 }, { "batch_size": 4, "epoch": 0.6672, "step": 1668, "tokens_per_device": 4956 }, { "epoch": 0.6672, "loss_ce": 0.36153823137283325, "loss_lvr": 0.7838584780693054, "loss_mode_switch": 0.0, "loss_total": 0.43992409110069275, "step": 1668 }, { "batch_size": 1, "epoch": 0.6672, "step": 1668, "tokens_per_device": 4856 }, { "epoch": 0.6672, "loss_ce": 0.00018592625565361232, "loss_lvr": 0.36692607402801514, "loss_mode_switch": 0.0, "loss_total": 0.03687853366136551, "step": 1668 }, { "batch_size": 4, "epoch": 0.6672, "step": 1668, "tokens_per_device": 5828 }, { "epoch": 0.6672, "loss_ce": 0.16196608543395996, "loss_lvr": 0.608981192111969, "loss_mode_switch": 0.0, "loss_total": 0.22286421060562134, "step": 1668 }, { "batch_size": 1, "epoch": 0.6672, "step": 1668, "tokens_per_device": 4904 }, { "epoch": 0.6672, "loss_ce": 0.3442431390285492, "loss_lvr": 0.8435587882995605, "loss_mode_switch": 0.0, "loss_total": 0.4285990297794342, "step": 1668 }, { "batch_size": 1, "epoch": 0.6672, "step": 1668, "tokens_per_device": 5262 }, { "epoch": 0.6672, "loss_ce": 0.06954315304756165, "loss_lvr": 0.30599355697631836, "loss_mode_switch": 0.0, "loss_total": 0.10014250874519348, "step": 1668 }, { "epoch": 0.6676, "grad_norm": 1.2602216005325317, "learning_rate": 2.6282124528457852e-06, "loss": 0.3228, "step": 1669 }, { "batch_size": 4, "epoch": 0.6676, "step": 1669, "tokens_per_device": 6376 }, { "epoch": 0.6676, "loss_ce": 0.057792793959379196, "loss_lvr": 0.9299432039260864, "loss_mode_switch": 0.0, "loss_total": 0.1507871150970459, "step": 1669 }, { "batch_size": 1, "epoch": 0.6676, "step": 1669, "tokens_per_device": 5239 }, { "epoch": 0.6676, "loss_ce": 0.0007587889558635652, "loss_lvr": 0.22094130516052246, "loss_mode_switch": 0.0, "loss_total": 0.02285291999578476, "step": 1669 }, { "batch_size": 1, "epoch": 0.6676, "step": 1669, "tokens_per_device": 4864 }, { "epoch": 0.6676, "loss_ce": 0.0001361554313916713, "loss_lvr": 0.24419817328453064, "loss_mode_switch": 0.0, "loss_total": 0.024555973708629608, "step": 1669 }, { "batch_size": 1, "epoch": 0.6676, "step": 1669, "tokens_per_device": 5680 }, { "epoch": 0.6676, "loss_ce": 0.009172427468001842, "loss_lvr": 0.20920738577842712, "loss_mode_switch": 0.0, "loss_total": 0.03009316697716713, "step": 1669 }, { "batch_size": 4, "epoch": 0.6676, "step": 1669, "tokens_per_device": 5688 }, { "epoch": 0.6676, "loss_ce": 0.20385797321796417, "loss_lvr": 1.124066710472107, "loss_mode_switch": 0.0, "loss_total": 0.31626462936401367, "step": 1669 }, { "batch_size": 1, "epoch": 0.6676, "step": 1669, "tokens_per_device": 5467 }, { "epoch": 0.6676, "loss_ce": 0.02819995954632759, "loss_lvr": 0.44863465428352356, "loss_mode_switch": 0.0, "loss_total": 0.073063425719738, "step": 1669 }, { "batch_size": 4, "epoch": 0.6676, "step": 1669, "tokens_per_device": 11168 }, { "epoch": 0.6676, "loss_ce": 0.20749902725219727, "loss_lvr": 0.683491051197052, "loss_mode_switch": 0.0, "loss_total": 0.2758481502532959, "step": 1669 }, { "batch_size": 1, "epoch": 0.6676, "step": 1669, "tokens_per_device": 4894 }, { "epoch": 0.6676, "loss_ce": 0.12246657907962799, "loss_lvr": 0.2429150938987732, "loss_mode_switch": 0.0, "loss_total": 0.1467580944299698, "step": 1669 }, { "epoch": 0.668, "grad_norm": 1.2463732957839966, "learning_rate": 2.622512082948063e-06, "loss": 0.2525, "step": 1670 }, { "batch_size": 4, "epoch": 0.668, "step": 1670, "tokens_per_device": 5432 }, { "epoch": 0.668, "loss_ce": 0.034555841237306595, "loss_lvr": 0.5853559374809265, "loss_mode_switch": 0.0, "loss_total": 0.0930914357304573, "step": 1670 }, { "batch_size": 4, "epoch": 0.668, "step": 1670, "tokens_per_device": 3468 }, { "epoch": 0.668, "loss_ce": 0.5274102687835693, "loss_lvr": 1.0344799757003784, "loss_mode_switch": 0.0, "loss_total": 0.6308582425117493, "step": 1670 }, { "batch_size": 4, "epoch": 0.668, "step": 1670, "tokens_per_device": 4476 }, { "epoch": 0.668, "loss_ce": 0.33232948184013367, "loss_lvr": 0.8062976598739624, "loss_mode_switch": 0.0, "loss_total": 0.4129592478275299, "step": 1670 }, { "batch_size": 4, "epoch": 0.668, "step": 1670, "tokens_per_device": 9016 }, { "epoch": 0.668, "loss_ce": 0.12281126528978348, "loss_lvr": 0.8676038980484009, "loss_mode_switch": 0.0, "loss_total": 0.20957165956497192, "step": 1670 }, { "batch_size": 1, "epoch": 0.668, "step": 1670, "tokens_per_device": 4880 }, { "epoch": 0.668, "loss_ce": 0.0034757175017148256, "loss_lvr": 0.38213256001472473, "loss_mode_switch": 0.0, "loss_total": 0.04168897494673729, "step": 1670 }, { "batch_size": 1, "epoch": 0.668, "step": 1670, "tokens_per_device": 4907 }, { "epoch": 0.668, "loss_ce": 0.02447005733847618, "loss_lvr": 0.6657149791717529, "loss_mode_switch": 0.0, "loss_total": 0.09104155004024506, "step": 1670 }, { "batch_size": 4, "epoch": 0.668, "step": 1670, "tokens_per_device": 4972 }, { "epoch": 0.668, "loss_ce": 0.32323524355888367, "loss_lvr": 0.7471078038215637, "loss_mode_switch": 0.0, "loss_total": 0.3979460299015045, "step": 1670 }, { "batch_size": 4, "epoch": 0.668, "step": 1670, "tokens_per_device": 4460 }, { "epoch": 0.668, "loss_ce": 0.1370742917060852, "loss_lvr": 0.8255600929260254, "loss_mode_switch": 0.0, "loss_total": 0.21963030099868774, "step": 1670 }, { "epoch": 0.6684, "grad_norm": 1.374148964881897, "learning_rate": 2.6168157032489883e-06, "loss": 0.2872, "step": 1671 }, { "batch_size": 1, "epoch": 0.6684, "step": 1671, "tokens_per_device": 4971 }, { "epoch": 0.6684, "loss_ce": 0.00012186261301394552, "loss_lvr": 0.5750405192375183, "loss_mode_switch": 0.0, "loss_total": 0.057625915855169296, "step": 1671 }, { "batch_size": 4, "epoch": 0.6684, "step": 1671, "tokens_per_device": 2768 }, { "epoch": 0.6684, "loss_ce": 0.2907998263835907, "loss_lvr": 0.8222371935844421, "loss_mode_switch": 0.0, "loss_total": 0.37302353978157043, "step": 1671 }, { "batch_size": 4, "epoch": 0.6684, "step": 1671, "tokens_per_device": 5528 }, { "epoch": 0.6684, "loss_ce": 0.08409179002046585, "loss_lvr": 0.9140524864196777, "loss_mode_switch": 0.0, "loss_total": 0.17549704015254974, "step": 1671 }, { "batch_size": 4, "epoch": 0.6684, "step": 1671, "tokens_per_device": 1420 }, { "epoch": 0.6684, "loss_ce": 0.3851358890533447, "loss_lvr": 0.8085111975669861, "loss_mode_switch": 0.0, "loss_total": 0.46598702669143677, "step": 1671 }, { "batch_size": 4, "epoch": 0.6684, "step": 1671, "tokens_per_device": 3884 }, { "epoch": 0.6684, "loss_ce": 0.11330509185791016, "loss_lvr": 1.2264982461929321, "loss_mode_switch": 0.0, "loss_total": 0.2359549105167389, "step": 1671 }, { "batch_size": 4, "epoch": 0.6684, "step": 1671, "tokens_per_device": 15380 }, { "epoch": 0.6684, "loss_ce": 0.07331615686416626, "loss_lvr": 0.9942638278007507, "loss_mode_switch": 0.0, "loss_total": 0.1727425456047058, "step": 1671 }, { "batch_size": 1, "epoch": 0.6684, "step": 1671, "tokens_per_device": 4867 }, { "epoch": 0.6684, "loss_ce": 0.009198260493576527, "loss_lvr": 0.2650512754917145, "loss_mode_switch": 0.0, "loss_total": 0.0357033871114254, "step": 1671 }, { "batch_size": 4, "epoch": 0.6684, "step": 1671, "tokens_per_device": 4988 }, { "epoch": 0.6684, "loss_ce": 0.023315537720918655, "loss_lvr": 0.4839670956134796, "loss_mode_switch": 0.0, "loss_total": 0.07171224802732468, "step": 1671 }, { "epoch": 0.6688, "grad_norm": 1.2912800312042236, "learning_rate": 2.611123323308943e-06, "loss": 0.264, "step": 1672 }, { "batch_size": 4, "epoch": 0.6688, "step": 1672, "tokens_per_device": 4256 }, { "epoch": 0.6688, "loss_ce": 0.13233889639377594, "loss_lvr": 0.9860975742340088, "loss_mode_switch": 0.0, "loss_total": 0.23094865679740906, "step": 1672 }, { "batch_size": 1, "epoch": 0.6688, "step": 1672, "tokens_per_device": 4890 }, { "epoch": 0.6688, "loss_ce": 0.3736761808395386, "loss_lvr": 0.9452940821647644, "loss_mode_switch": 0.0, "loss_total": 0.46820560097694397, "step": 1672 }, { "batch_size": 1, "epoch": 0.6688, "step": 1672, "tokens_per_device": 4881 }, { "epoch": 0.6688, "loss_ce": 0.008516051806509495, "loss_lvr": 0.3848613202571869, "loss_mode_switch": 0.0, "loss_total": 0.04700218513607979, "step": 1672 }, { "batch_size": 4, "epoch": 0.6688, "step": 1672, "tokens_per_device": 1852 }, { "epoch": 0.6688, "loss_ce": 0.058757483959198, "loss_lvr": 1.0130484104156494, "loss_mode_switch": 0.0, "loss_total": 0.16006232798099518, "step": 1672 }, { "batch_size": 4, "epoch": 0.6688, "step": 1672, "tokens_per_device": 4968 }, { "epoch": 0.6688, "loss_ce": 0.3858531415462494, "loss_lvr": 0.8093816637992859, "loss_mode_switch": 0.0, "loss_total": 0.4667913019657135, "step": 1672 }, { "batch_size": 4, "epoch": 0.6688, "step": 1672, "tokens_per_device": 4340 }, { "epoch": 0.6688, "loss_ce": 0.24174495041370392, "loss_lvr": 0.7158987522125244, "loss_mode_switch": 0.0, "loss_total": 0.3133348226547241, "step": 1672 }, { "batch_size": 4, "epoch": 0.6688, "step": 1672, "tokens_per_device": 1380 }, { "epoch": 0.6688, "loss_ce": 0.5886885523796082, "loss_lvr": 1.0047340393066406, "loss_mode_switch": 0.0, "loss_total": 0.6891619563102722, "step": 1672 }, { "batch_size": 4, "epoch": 0.6688, "step": 1672, "tokens_per_device": 1212 }, { "epoch": 0.6688, "loss_ce": 0.33138561248779297, "loss_lvr": 1.0616309642791748, "loss_mode_switch": 0.0, "loss_total": 0.4375486969947815, "step": 1672 }, { "epoch": 0.6692, "grad_norm": 1.333026647567749, "learning_rate": 2.605434952681589e-06, "loss": 0.3204, "step": 1673 }, { "batch_size": 4, "epoch": 0.6692, "step": 1673, "tokens_per_device": 1812 }, { "epoch": 0.6692, "loss_ce": 0.24014872312545776, "loss_lvr": 0.8838685154914856, "loss_mode_switch": 0.0, "loss_total": 0.3285355865955353, "step": 1673 }, { "batch_size": 4, "epoch": 0.6692, "step": 1673, "tokens_per_device": 4168 }, { "epoch": 0.6692, "loss_ce": 0.12521328032016754, "loss_lvr": 0.7764821648597717, "loss_mode_switch": 0.0, "loss_total": 0.202861487865448, "step": 1673 }, { "batch_size": 4, "epoch": 0.6692, "step": 1673, "tokens_per_device": 5112 }, { "epoch": 0.6692, "loss_ce": 0.3876343369483948, "loss_lvr": 0.712246835231781, "loss_mode_switch": 0.0, "loss_total": 0.45885902643203735, "step": 1673 }, { "batch_size": 1, "epoch": 0.6692, "step": 1673, "tokens_per_device": 4666 }, { "epoch": 0.6692, "loss_ce": 0.049336981028318405, "loss_lvr": 0.45744824409484863, "loss_mode_switch": 0.0, "loss_total": 0.09508180618286133, "step": 1673 }, { "batch_size": 1, "epoch": 0.6692, "step": 1673, "tokens_per_device": 7483 }, { "epoch": 0.6692, "loss_ce": 0.08120061457157135, "loss_lvr": 0.34743624925613403, "loss_mode_switch": 0.0, "loss_total": 0.11594423651695251, "step": 1673 }, { "batch_size": 4, "epoch": 0.6692, "step": 1673, "tokens_per_device": 4680 }, { "epoch": 0.6692, "loss_ce": 0.35065537691116333, "loss_lvr": 0.7893335223197937, "loss_mode_switch": 0.0, "loss_total": 0.4295887351036072, "step": 1673 }, { "batch_size": 4, "epoch": 0.6692, "step": 1673, "tokens_per_device": 2256 }, { "epoch": 0.6692, "loss_ce": 0.08117936551570892, "loss_lvr": 0.813019335269928, "loss_mode_switch": 0.0, "loss_total": 0.16248130798339844, "step": 1673 }, { "batch_size": 4, "epoch": 0.6692, "step": 1673, "tokens_per_device": 4256 }, { "epoch": 0.6692, "loss_ce": 0.9944144487380981, "loss_lvr": 0.8627150654792786, "loss_mode_switch": 0.0, "loss_total": 1.0806859731674194, "step": 1673 }, { "epoch": 0.6696, "grad_norm": 1.3280938863754272, "learning_rate": 2.5997506009138707e-06, "loss": 0.2931, "step": 1674 }, { "batch_size": 1, "epoch": 0.6696, "step": 1674, "tokens_per_device": 4750 }, { "epoch": 0.6696, "loss_ce": 0.014006746001541615, "loss_lvr": 0.3446018397808075, "loss_mode_switch": 0.0, "loss_total": 0.04846692830324173, "step": 1674 }, { "batch_size": 1, "epoch": 0.6696, "step": 1674, "tokens_per_device": 5327 }, { "epoch": 0.6696, "loss_ce": 0.0008527635945938528, "loss_lvr": 0.5417525172233582, "loss_mode_switch": 0.0, "loss_total": 0.05502801761031151, "step": 1674 }, { "batch_size": 4, "epoch": 0.6696, "step": 1674, "tokens_per_device": 4188 }, { "epoch": 0.6696, "loss_ce": 0.02572994865477085, "loss_lvr": 0.7898163199424744, "loss_mode_switch": 0.0, "loss_total": 0.10471157729625702, "step": 1674 }, { "batch_size": 4, "epoch": 0.6696, "step": 1674, "tokens_per_device": 5744 }, { "epoch": 0.6696, "loss_ce": 0.3871721029281616, "loss_lvr": 0.9427870512008667, "loss_mode_switch": 0.0, "loss_total": 0.48145079612731934, "step": 1674 }, { "batch_size": 4, "epoch": 0.6696, "step": 1674, "tokens_per_device": 3820 }, { "epoch": 0.6696, "loss_ce": 0.03427528589963913, "loss_lvr": 0.923337459564209, "loss_mode_switch": 0.0, "loss_total": 0.12660902738571167, "step": 1674 }, { "batch_size": 1, "epoch": 0.6696, "step": 1674, "tokens_per_device": 4892 }, { "epoch": 0.6696, "loss_ce": 0.2211875468492508, "loss_lvr": 0.7304299473762512, "loss_mode_switch": 0.0, "loss_total": 0.29423055052757263, "step": 1674 }, { "batch_size": 4, "epoch": 0.6696, "step": 1674, "tokens_per_device": 1608 }, { "epoch": 0.6696, "loss_ce": 0.13485407829284668, "loss_lvr": 0.8856116533279419, "loss_mode_switch": 0.0, "loss_total": 0.22341525554656982, "step": 1674 }, { "batch_size": 1, "epoch": 0.6696, "step": 1674, "tokens_per_device": 5120 }, { "epoch": 0.6696, "loss_ce": 0.06456909328699112, "loss_lvr": 0.28907692432403564, "loss_mode_switch": 0.0, "loss_total": 0.0934767872095108, "step": 1674 }, { "epoch": 0.67, "grad_norm": 1.362773060798645, "learning_rate": 2.594070277545975e-06, "loss": 0.2759, "step": 1675 }, { "batch_size": 4, "epoch": 0.67, "step": 1675, "tokens_per_device": 7304 }, { "epoch": 0.67, "loss_ce": 0.029099294915795326, "loss_lvr": 0.763549268245697, "loss_mode_switch": 0.0, "loss_total": 0.105454221367836, "step": 1675 }, { "batch_size": 4, "epoch": 0.67, "step": 1675, "tokens_per_device": 4996 }, { "epoch": 0.67, "loss_ce": 0.05386605113744736, "loss_lvr": 0.7593804597854614, "loss_mode_switch": 0.0, "loss_total": 0.1298041045665741, "step": 1675 }, { "batch_size": 4, "epoch": 0.67, "step": 1675, "tokens_per_device": 1256 }, { "epoch": 0.67, "loss_ce": 0.1528119295835495, "loss_lvr": 0.9881739020347595, "loss_mode_switch": 0.0, "loss_total": 0.2516293227672577, "step": 1675 }, { "batch_size": 1, "epoch": 0.67, "step": 1675, "tokens_per_device": 5140 }, { "epoch": 0.67, "loss_ce": 0.0004407138912938535, "loss_lvr": 0.5652657151222229, "loss_mode_switch": 0.0, "loss_total": 0.05696728453040123, "step": 1675 }, { "batch_size": 4, "epoch": 0.67, "step": 1675, "tokens_per_device": 2620 }, { "epoch": 0.67, "loss_ce": 0.53583824634552, "loss_lvr": 0.6117520332336426, "loss_mode_switch": 0.0, "loss_total": 0.5970134735107422, "step": 1675 }, { "batch_size": 4, "epoch": 0.67, "step": 1675, "tokens_per_device": 5776 }, { "epoch": 0.67, "loss_ce": 0.20851144194602966, "loss_lvr": 0.6339588165283203, "loss_mode_switch": 0.0, "loss_total": 0.27190732955932617, "step": 1675 }, { "batch_size": 4, "epoch": 0.67, "step": 1675, "tokens_per_device": 1748 }, { "epoch": 0.67, "loss_ce": 0.10998544842004776, "loss_lvr": 0.9858091473579407, "loss_mode_switch": 0.0, "loss_total": 0.20856636762619019, "step": 1675 }, { "batch_size": 1, "epoch": 0.67, "step": 1675, "tokens_per_device": 4843 }, { "epoch": 0.67, "loss_ce": 0.009502743370831013, "loss_lvr": 0.38611289858818054, "loss_mode_switch": 0.0, "loss_total": 0.04811403155326843, "step": 1675 }, { "epoch": 0.6704, "grad_norm": 1.299631953239441, "learning_rate": 2.5883939921113373e-06, "loss": 0.2634, "step": 1676 }, { "batch_size": 4, "epoch": 0.6704, "step": 1676, "tokens_per_device": 5252 }, { "epoch": 0.6704, "loss_ce": 0.1367178112268448, "loss_lvr": 0.800327718257904, "loss_mode_switch": 0.0, "loss_total": 0.2167505919933319, "step": 1676 }, { "batch_size": 4, "epoch": 0.6704, "step": 1676, "tokens_per_device": 5580 }, { "epoch": 0.6704, "loss_ce": 0.13717173039913177, "loss_lvr": 0.7756419777870178, "loss_mode_switch": 0.0, "loss_total": 0.21473592519760132, "step": 1676 }, { "batch_size": 1, "epoch": 0.6704, "step": 1676, "tokens_per_device": 4889 }, { "epoch": 0.6704, "loss_ce": 0.446114182472229, "loss_lvr": 0.6738170385360718, "loss_mode_switch": 0.0, "loss_total": 0.5134958624839783, "step": 1676 }, { "batch_size": 1, "epoch": 0.6704, "step": 1676, "tokens_per_device": 5107 }, { "epoch": 0.6704, "loss_ce": 0.1674320548772812, "loss_lvr": 0.5046800374984741, "loss_mode_switch": 0.0, "loss_total": 0.21790006756782532, "step": 1676 }, { "batch_size": 1, "epoch": 0.6704, "step": 1676, "tokens_per_device": 4884 }, { "epoch": 0.6704, "loss_ce": 0.01901833713054657, "loss_lvr": 0.23433846235275269, "loss_mode_switch": 0.0, "loss_total": 0.04245218634605408, "step": 1676 }, { "batch_size": 4, "epoch": 0.6704, "step": 1676, "tokens_per_device": 5260 }, { "epoch": 0.6704, "loss_ce": 0.15577426552772522, "loss_lvr": 0.7721905708312988, "loss_mode_switch": 0.0, "loss_total": 0.23299333453178406, "step": 1676 }, { "batch_size": 4, "epoch": 0.6704, "step": 1676, "tokens_per_device": 2920 }, { "epoch": 0.6704, "loss_ce": 0.0005024111596867442, "loss_lvr": 0.31644660234451294, "loss_mode_switch": 0.0, "loss_total": 0.03214707225561142, "step": 1676 }, { "batch_size": 4, "epoch": 0.6704, "step": 1676, "tokens_per_device": 8112 }, { "epoch": 0.6704, "loss_ce": 0.196266308426857, "loss_lvr": 0.623540997505188, "loss_mode_switch": 0.0, "loss_total": 0.25862041115760803, "step": 1676 }, { "epoch": 0.6708, "grad_norm": 1.4601259231567383, "learning_rate": 2.582721754136609e-06, "loss": 0.2983, "step": 1677 }, { "batch_size": 1, "epoch": 0.6708, "step": 1677, "tokens_per_device": 4923 }, { "epoch": 0.6708, "loss_ce": 0.038577061146497726, "loss_lvr": 0.20549191534519196, "loss_mode_switch": 0.0, "loss_total": 0.05912625044584274, "step": 1677 }, { "batch_size": 4, "epoch": 0.6708, "step": 1677, "tokens_per_device": 2572 }, { "epoch": 0.6708, "loss_ce": 0.3626870810985565, "loss_lvr": 1.0585006475448608, "loss_mode_switch": 0.0, "loss_total": 0.4685371518135071, "step": 1677 }, { "batch_size": 4, "epoch": 0.6708, "step": 1677, "tokens_per_device": 4388 }, { "epoch": 0.6708, "loss_ce": 0.305462121963501, "loss_lvr": 1.0313682556152344, "loss_mode_switch": 0.0, "loss_total": 0.40859895944595337, "step": 1677 }, { "batch_size": 4, "epoch": 0.6708, "step": 1677, "tokens_per_device": 2624 }, { "epoch": 0.6708, "loss_ce": 0.29078418016433716, "loss_lvr": 0.7818892598152161, "loss_mode_switch": 0.0, "loss_total": 0.36897310614585876, "step": 1677 }, { "batch_size": 1, "epoch": 0.6708, "step": 1677, "tokens_per_device": 5728 }, { "epoch": 0.6708, "loss_ce": 0.011477059684693813, "loss_lvr": 0.341774046421051, "loss_mode_switch": 0.0, "loss_total": 0.04565446451306343, "step": 1677 }, { "batch_size": 4, "epoch": 0.6708, "step": 1677, "tokens_per_device": 5556 }, { "epoch": 0.6708, "loss_ce": 0.014075041748583317, "loss_lvr": 0.7664647698402405, "loss_mode_switch": 0.0, "loss_total": 0.09072151780128479, "step": 1677 }, { "batch_size": 4, "epoch": 0.6708, "step": 1677, "tokens_per_device": 6888 }, { "epoch": 0.6708, "loss_ce": 0.04387768730521202, "loss_lvr": 0.8525479435920715, "loss_mode_switch": 0.0, "loss_total": 0.129132479429245, "step": 1677 }, { "batch_size": 1, "epoch": 0.6708, "step": 1677, "tokens_per_device": 5117 }, { "epoch": 0.6708, "loss_ce": 0.015898434445261955, "loss_lvr": 0.31245288252830505, "loss_mode_switch": 0.0, "loss_total": 0.04714372381567955, "step": 1677 }, { "epoch": 0.6712, "grad_norm": 1.3181216716766357, "learning_rate": 2.5770535731416556e-06, "loss": 0.2763, "step": 1678 }, { "batch_size": 1, "epoch": 0.6712, "step": 1678, "tokens_per_device": 4845 }, { "epoch": 0.6712, "loss_ce": 0.02491598017513752, "loss_lvr": 0.15912270545959473, "loss_mode_switch": 0.0, "loss_total": 0.04082825034856796, "step": 1678 }, { "batch_size": 4, "epoch": 0.6712, "step": 1678, "tokens_per_device": 5432 }, { "epoch": 0.6712, "loss_ce": 0.313351035118103, "loss_lvr": 0.7544530630111694, "loss_mode_switch": 0.0, "loss_total": 0.388796329498291, "step": 1678 }, { "batch_size": 1, "epoch": 0.6712, "step": 1678, "tokens_per_device": 4909 }, { "epoch": 0.6712, "loss_ce": 0.2767728865146637, "loss_lvr": 0.6535102128982544, "loss_mode_switch": 0.0, "loss_total": 0.34212392568588257, "step": 1678 }, { "batch_size": 1, "epoch": 0.6712, "step": 1678, "tokens_per_device": 6236 }, { "epoch": 0.6712, "loss_ce": 0.01726609095931053, "loss_lvr": 0.3016294538974762, "loss_mode_switch": 0.0, "loss_total": 0.04742903634905815, "step": 1678 }, { "batch_size": 4, "epoch": 0.6712, "step": 1678, "tokens_per_device": 5156 }, { "epoch": 0.6712, "loss_ce": 0.4651806056499481, "loss_lvr": 0.7491065859794617, "loss_mode_switch": 0.0, "loss_total": 0.5400912761688232, "step": 1678 }, { "batch_size": 1, "epoch": 0.6712, "step": 1678, "tokens_per_device": 5228 }, { "epoch": 0.6712, "loss_ce": 0.1206682026386261, "loss_lvr": 0.8334816098213196, "loss_mode_switch": 0.0, "loss_total": 0.20401635766029358, "step": 1678 }, { "batch_size": 1, "epoch": 0.6712, "step": 1678, "tokens_per_device": 6326 }, { "epoch": 0.6712, "loss_ce": 0.00021276595361996442, "loss_lvr": 0.46192997694015503, "loss_mode_switch": 0.0, "loss_total": 0.046405766159296036, "step": 1678 }, { "batch_size": 1, "epoch": 0.6712, "step": 1678, "tokens_per_device": 5043 }, { "epoch": 0.6712, "loss_ce": 0.01618492230772972, "loss_lvr": 0.18231762945652008, "loss_mode_switch": 0.0, "loss_total": 0.03441668301820755, "step": 1678 }, { "epoch": 0.6716, "grad_norm": 1.1677517890930176, "learning_rate": 2.5713894586395282e-06, "loss": 0.2445, "step": 1679 }, { "batch_size": 4, "epoch": 0.6716, "step": 1679, "tokens_per_device": 11056 }, { "epoch": 0.6716, "loss_ce": 0.22214433550834656, "loss_lvr": 0.6862480044364929, "loss_mode_switch": 0.0, "loss_total": 0.29076912999153137, "step": 1679 }, { "batch_size": 4, "epoch": 0.6716, "step": 1679, "tokens_per_device": 4472 }, { "epoch": 0.6716, "loss_ce": 0.08270329236984253, "loss_lvr": 1.0840699672698975, "loss_mode_switch": 0.0, "loss_total": 0.1911102831363678, "step": 1679 }, { "batch_size": 4, "epoch": 0.6716, "step": 1679, "tokens_per_device": 4176 }, { "epoch": 0.6716, "loss_ce": 0.016852986067533493, "loss_lvr": 0.7810096144676208, "loss_mode_switch": 0.0, "loss_total": 0.09495395421981812, "step": 1679 }, { "batch_size": 4, "epoch": 0.6716, "step": 1679, "tokens_per_device": 8416 }, { "epoch": 0.6716, "loss_ce": 0.18282878398895264, "loss_lvr": 0.8092042207717896, "loss_mode_switch": 0.0, "loss_total": 0.26374921202659607, "step": 1679 }, { "batch_size": 1, "epoch": 0.6716, "step": 1679, "tokens_per_device": 4748 }, { "epoch": 0.6716, "loss_ce": 0.009509149007499218, "loss_lvr": 0.32455262541770935, "loss_mode_switch": 0.0, "loss_total": 0.04196441173553467, "step": 1679 }, { "batch_size": 4, "epoch": 0.6716, "step": 1679, "tokens_per_device": 5712 }, { "epoch": 0.6716, "loss_ce": 0.2744589149951935, "loss_lvr": 0.9953134059906006, "loss_mode_switch": 0.0, "loss_total": 0.3739902675151825, "step": 1679 }, { "batch_size": 4, "epoch": 0.6716, "step": 1679, "tokens_per_device": 3808 }, { "epoch": 0.6716, "loss_ce": 0.020002037286758423, "loss_lvr": 0.8848631978034973, "loss_mode_switch": 0.0, "loss_total": 0.10848835855722427, "step": 1679 }, { "batch_size": 4, "epoch": 0.6716, "step": 1679, "tokens_per_device": 3900 }, { "epoch": 0.6716, "loss_ce": 0.4507204592227936, "loss_lvr": 0.8244404792785645, "loss_mode_switch": 0.0, "loss_total": 0.5331645011901855, "step": 1679 }, { "epoch": 0.672, "grad_norm": 1.2654742002487183, "learning_rate": 2.5657294201364526e-06, "loss": 0.2722, "step": 1680 }, { "batch_size": 1, "epoch": 0.672, "step": 1680, "tokens_per_device": 5038 }, { "epoch": 0.672, "loss_ce": 0.11657537519931793, "loss_lvr": 0.3154211938381195, "loss_mode_switch": 0.0, "loss_total": 0.14811749756336212, "step": 1680 }, { "batch_size": 4, "epoch": 0.672, "step": 1680, "tokens_per_device": 2704 }, { "epoch": 0.672, "loss_ce": 0.5420002341270447, "loss_lvr": 0.7316797971725464, "loss_mode_switch": 0.0, "loss_total": 0.6151682138442993, "step": 1680 }, { "batch_size": 4, "epoch": 0.672, "step": 1680, "tokens_per_device": 2692 }, { "epoch": 0.672, "loss_ce": 0.15530981123447418, "loss_lvr": 0.6999368667602539, "loss_mode_switch": 0.0, "loss_total": 0.2253035008907318, "step": 1680 }, { "batch_size": 4, "epoch": 0.672, "step": 1680, "tokens_per_device": 9812 }, { "epoch": 0.672, "loss_ce": 0.21091023087501526, "loss_lvr": 0.4537416100502014, "loss_mode_switch": 0.0, "loss_total": 0.2562843859195709, "step": 1680 }, { "batch_size": 1, "epoch": 0.672, "step": 1680, "tokens_per_device": 4866 }, { "epoch": 0.672, "loss_ce": 0.00748843839392066, "loss_lvr": 0.16252551972866058, "loss_mode_switch": 0.0, "loss_total": 0.023740991950035095, "step": 1680 }, { "batch_size": 1, "epoch": 0.672, "step": 1680, "tokens_per_device": 4758 }, { "epoch": 0.672, "loss_ce": 0.00027485686587169766, "loss_lvr": 0.47219178080558777, "loss_mode_switch": 0.0, "loss_total": 0.047494035214185715, "step": 1680 }, { "batch_size": 1, "epoch": 0.672, "step": 1680, "tokens_per_device": 5035 }, { "epoch": 0.672, "loss_ce": 0.040717292577028275, "loss_lvr": 0.5442429780960083, "loss_mode_switch": 0.0, "loss_total": 0.09514158964157104, "step": 1680 }, { "batch_size": 1, "epoch": 0.672, "step": 1680, "tokens_per_device": 4900 }, { "epoch": 0.672, "loss_ce": 0.18287046253681183, "loss_lvr": 0.8157221078872681, "loss_mode_switch": 0.0, "loss_total": 0.26444268226623535, "step": 1680 }, { "epoch": 0.6724, "grad_norm": 1.3920528888702393, "learning_rate": 2.560073467131819e-06, "loss": 0.2638, "step": 1681 }, { "batch_size": 4, "epoch": 0.6724, "step": 1681, "tokens_per_device": 4228 }, { "epoch": 0.6724, "loss_ce": 0.12463193386793137, "loss_lvr": 0.879098653793335, "loss_mode_switch": 0.0, "loss_total": 0.21254180371761322, "step": 1681 }, { "batch_size": 4, "epoch": 0.6724, "step": 1681, "tokens_per_device": 1284 }, { "epoch": 0.6724, "loss_ce": 0.38811030983924866, "loss_lvr": 0.8910872340202332, "loss_mode_switch": 0.0, "loss_total": 0.4772190451622009, "step": 1681 }, { "batch_size": 1, "epoch": 0.6724, "step": 1681, "tokens_per_device": 4876 }, { "epoch": 0.6724, "loss_ce": 0.00426362082362175, "loss_lvr": 0.9032650589942932, "loss_mode_switch": 0.0, "loss_total": 0.09459012746810913, "step": 1681 }, { "batch_size": 4, "epoch": 0.6724, "step": 1681, "tokens_per_device": 4692 }, { "epoch": 0.6724, "loss_ce": 0.010747011750936508, "loss_lvr": 0.8183190226554871, "loss_mode_switch": 0.0, "loss_total": 0.09257891774177551, "step": 1681 }, { "batch_size": 4, "epoch": 0.6724, "step": 1681, "tokens_per_device": 2640 }, { "epoch": 0.6724, "loss_ce": 0.41158628463745117, "loss_lvr": 0.7644356489181519, "loss_mode_switch": 0.0, "loss_total": 0.4880298376083374, "step": 1681 }, { "batch_size": 4, "epoch": 0.6724, "step": 1681, "tokens_per_device": 5992 }, { "epoch": 0.6724, "loss_ce": 0.3052273988723755, "loss_lvr": 0.8027002811431885, "loss_mode_switch": 0.0, "loss_total": 0.38549742102622986, "step": 1681 }, { "batch_size": 1, "epoch": 0.6724, "step": 1681, "tokens_per_device": 7503 }, { "epoch": 0.6724, "loss_ce": 0.0005131839425303042, "loss_lvr": 0.32405853271484375, "loss_mode_switch": 0.0, "loss_total": 0.03291903808712959, "step": 1681 }, { "batch_size": 4, "epoch": 0.6724, "step": 1681, "tokens_per_device": 4240 }, { "epoch": 0.6724, "loss_ce": 0.3087916672229767, "loss_lvr": 0.8542186617851257, "loss_mode_switch": 0.0, "loss_total": 0.3942135274410248, "step": 1681 }, { "epoch": 0.6728, "grad_norm": 1.2188117504119873, "learning_rate": 2.554421609118155e-06, "loss": 0.2811, "step": 1682 }, { "batch_size": 1, "epoch": 0.6728, "step": 1682, "tokens_per_device": 4892 }, { "epoch": 0.6728, "loss_ce": 0.0035647216718643904, "loss_lvr": 0.49627208709716797, "loss_mode_switch": 0.0, "loss_total": 0.053191933780908585, "step": 1682 }, { "batch_size": 4, "epoch": 0.6728, "step": 1682, "tokens_per_device": 1472 }, { "epoch": 0.6728, "loss_ce": 1.146929144859314, "loss_lvr": 1.0114083290100098, "loss_mode_switch": 0.0, "loss_total": 1.2480700016021729, "step": 1682 }, { "batch_size": 1, "epoch": 0.6728, "step": 1682, "tokens_per_device": 4835 }, { "epoch": 0.6728, "loss_ce": 0.5105053186416626, "loss_lvr": 0.5494614839553833, "loss_mode_switch": 0.0, "loss_total": 0.565451443195343, "step": 1682 }, { "batch_size": 1, "epoch": 0.6728, "step": 1682, "tokens_per_device": 4887 }, { "epoch": 0.6728, "loss_ce": 0.005779893137514591, "loss_lvr": 0.32889556884765625, "loss_mode_switch": 0.0, "loss_total": 0.03866944834589958, "step": 1682 }, { "batch_size": 4, "epoch": 0.6728, "step": 1682, "tokens_per_device": 5932 }, { "epoch": 0.6728, "loss_ce": 0.1276000440120697, "loss_lvr": 0.4807450771331787, "loss_mode_switch": 0.0, "loss_total": 0.17567455768585205, "step": 1682 }, { "batch_size": 4, "epoch": 0.6728, "step": 1682, "tokens_per_device": 1588 }, { "epoch": 0.6728, "loss_ce": 0.16065773367881775, "loss_lvr": 0.9503791928291321, "loss_mode_switch": 0.0, "loss_total": 0.255695641040802, "step": 1682 }, { "batch_size": 4, "epoch": 0.6728, "step": 1682, "tokens_per_device": 9492 }, { "epoch": 0.6728, "loss_ce": 0.2835221290588379, "loss_lvr": 0.7758284211158752, "loss_mode_switch": 0.0, "loss_total": 0.36110496520996094, "step": 1682 }, { "batch_size": 1, "epoch": 0.6728, "step": 1682, "tokens_per_device": 5099 }, { "epoch": 0.6728, "loss_ce": 0.0005290159024298191, "loss_lvr": 0.2606961131095886, "loss_mode_switch": 0.0, "loss_total": 0.026598626747727394, "step": 1682 }, { "epoch": 0.6732, "grad_norm": 1.3558956384658813, "learning_rate": 2.5487738555811215e-06, "loss": 0.2708, "step": 1683 }, { "batch_size": 4, "epoch": 0.6732, "step": 1683, "tokens_per_device": 4400 }, { "epoch": 0.6732, "loss_ce": 0.48632627725601196, "loss_lvr": 0.7727378010749817, "loss_mode_switch": 0.0, "loss_total": 0.5636000633239746, "step": 1683 }, { "batch_size": 1, "epoch": 0.6732, "step": 1683, "tokens_per_device": 5136 }, { "epoch": 0.6732, "loss_ce": 0.019124440848827362, "loss_lvr": 0.17248676717281342, "loss_mode_switch": 0.0, "loss_total": 0.036373116075992584, "step": 1683 }, { "batch_size": 4, "epoch": 0.6732, "step": 1683, "tokens_per_device": 3796 }, { "epoch": 0.6732, "loss_ce": 0.18759861588478088, "loss_lvr": 0.9399350881576538, "loss_mode_switch": 0.0, "loss_total": 0.28159213066101074, "step": 1683 }, { "batch_size": 4, "epoch": 0.6732, "step": 1683, "tokens_per_device": 4056 }, { "epoch": 0.6732, "loss_ce": 0.17914308607578278, "loss_lvr": 0.8140362501144409, "loss_mode_switch": 0.0, "loss_total": 0.2605467140674591, "step": 1683 }, { "batch_size": 4, "epoch": 0.6732, "step": 1683, "tokens_per_device": 4316 }, { "epoch": 0.6732, "loss_ce": 0.18077868223190308, "loss_lvr": 0.9460208415985107, "loss_mode_switch": 0.0, "loss_total": 0.2753807604312897, "step": 1683 }, { "batch_size": 1, "epoch": 0.6732, "step": 1683, "tokens_per_device": 5122 }, { "epoch": 0.6732, "loss_ce": 0.06900583952665329, "loss_lvr": 0.38820046186447144, "loss_mode_switch": 0.0, "loss_total": 0.10782589018344879, "step": 1683 }, { "batch_size": 1, "epoch": 0.6732, "step": 1683, "tokens_per_device": 5806 }, { "epoch": 0.6732, "loss_ce": 0.01484003011137247, "loss_lvr": 0.3708798885345459, "loss_mode_switch": 0.0, "loss_total": 0.051928017288446426, "step": 1683 }, { "batch_size": 4, "epoch": 0.6732, "step": 1683, "tokens_per_device": 4652 }, { "epoch": 0.6732, "loss_ce": 0.4222222566604614, "loss_lvr": 0.6865994930267334, "loss_mode_switch": 0.0, "loss_total": 0.4908822178840637, "step": 1683 }, { "epoch": 0.6736, "grad_norm": 1.4221454858779907, "learning_rate": 2.5431302159994835e-06, "loss": 0.3024, "step": 1684 }, { "batch_size": 1, "epoch": 0.6736, "step": 1684, "tokens_per_device": 5194 }, { "epoch": 0.6736, "loss_ce": 0.05412674322724342, "loss_lvr": 0.24379634857177734, "loss_mode_switch": 0.0, "loss_total": 0.07850638031959534, "step": 1684 }, { "batch_size": 4, "epoch": 0.6736, "step": 1684, "tokens_per_device": 5712 }, { "epoch": 0.6736, "loss_ce": 0.12834735214710236, "loss_lvr": 0.794907808303833, "loss_mode_switch": 0.0, "loss_total": 0.20783813297748566, "step": 1684 }, { "batch_size": 4, "epoch": 0.6736, "step": 1684, "tokens_per_device": 3972 }, { "epoch": 0.6736, "loss_ce": 0.22540923953056335, "loss_lvr": 1.1889082193374634, "loss_mode_switch": 0.0, "loss_total": 0.3443000614643097, "step": 1684 }, { "batch_size": 1, "epoch": 0.6736, "step": 1684, "tokens_per_device": 4889 }, { "epoch": 0.6736, "loss_ce": 0.04118875414133072, "loss_lvr": 0.2893387973308563, "loss_mode_switch": 0.0, "loss_total": 0.07012263685464859, "step": 1684 }, { "batch_size": 4, "epoch": 0.6736, "step": 1684, "tokens_per_device": 1776 }, { "epoch": 0.6736, "loss_ce": 0.3608046770095825, "loss_lvr": 0.778961181640625, "loss_mode_switch": 0.0, "loss_total": 0.438700795173645, "step": 1684 }, { "batch_size": 4, "epoch": 0.6736, "step": 1684, "tokens_per_device": 4236 }, { "epoch": 0.6736, "loss_ce": 0.24728673696517944, "loss_lvr": 0.8364199995994568, "loss_mode_switch": 0.0, "loss_total": 0.3309287428855896, "step": 1684 }, { "batch_size": 1, "epoch": 0.6736, "step": 1684, "tokens_per_device": 4877 }, { "epoch": 0.6736, "loss_ce": 0.0025610229931771755, "loss_lvr": 0.3404431641101837, "loss_mode_switch": 0.0, "loss_total": 0.036605339497327805, "step": 1684 }, { "batch_size": 4, "epoch": 0.6736, "step": 1684, "tokens_per_device": 4032 }, { "epoch": 0.6736, "loss_ce": 0.15124967694282532, "loss_lvr": 0.6904237866401672, "loss_mode_switch": 0.0, "loss_total": 0.22029206156730652, "step": 1684 }, { "epoch": 0.674, "grad_norm": 1.4048068523406982, "learning_rate": 2.5374906998451094e-06, "loss": 0.2996, "step": 1685 }, { "batch_size": 4, "epoch": 0.674, "step": 1685, "tokens_per_device": 3808 }, { "epoch": 0.674, "loss_ce": 0.46262505650520325, "loss_lvr": 0.818512499332428, "loss_mode_switch": 0.0, "loss_total": 0.544476330280304, "step": 1685 }, { "batch_size": 1, "epoch": 0.674, "step": 1685, "tokens_per_device": 4885 }, { "epoch": 0.674, "loss_ce": 0.0011259913444519043, "loss_lvr": 0.552224338054657, "loss_mode_switch": 0.0, "loss_total": 0.05634842440485954, "step": 1685 }, { "batch_size": 1, "epoch": 0.674, "step": 1685, "tokens_per_device": 5193 }, { "epoch": 0.674, "loss_ce": 0.0034806954208761454, "loss_lvr": 0.3631875813007355, "loss_mode_switch": 0.0, "loss_total": 0.03979945555329323, "step": 1685 }, { "batch_size": 4, "epoch": 0.674, "step": 1685, "tokens_per_device": 6380 }, { "epoch": 0.674, "loss_ce": 0.0012581327464431524, "loss_lvr": 0.7188214063644409, "loss_mode_switch": 0.0, "loss_total": 0.07314027845859528, "step": 1685 }, { "batch_size": 4, "epoch": 0.674, "step": 1685, "tokens_per_device": 6100 }, { "epoch": 0.674, "loss_ce": 0.032346758991479874, "loss_lvr": 0.8625360727310181, "loss_mode_switch": 0.0, "loss_total": 0.11860036849975586, "step": 1685 }, { "batch_size": 4, "epoch": 0.674, "step": 1685, "tokens_per_device": 2664 }, { "epoch": 0.674, "loss_ce": 0.30994680523872375, "loss_lvr": 0.9569317698478699, "loss_mode_switch": 0.0, "loss_total": 0.40563997626304626, "step": 1685 }, { "batch_size": 4, "epoch": 0.674, "step": 1685, "tokens_per_device": 4504 }, { "epoch": 0.674, "loss_ce": 0.14926324784755707, "loss_lvr": 0.8154820203781128, "loss_mode_switch": 0.0, "loss_total": 0.2308114469051361, "step": 1685 }, { "batch_size": 4, "epoch": 0.674, "step": 1685, "tokens_per_device": 5780 }, { "epoch": 0.674, "loss_ce": 0.1890723556280136, "loss_lvr": 0.8055859208106995, "loss_mode_switch": 0.0, "loss_total": 0.26963093876838684, "step": 1685 }, { "epoch": 0.6744, "grad_norm": 1.1643083095550537, "learning_rate": 2.5318553165829407e-06, "loss": 0.2882, "step": 1686 }, { "batch_size": 4, "epoch": 0.6744, "step": 1686, "tokens_per_device": 3776 }, { "epoch": 0.6744, "loss_ce": 0.029412904754281044, "loss_lvr": 0.761172890663147, "loss_mode_switch": 0.0, "loss_total": 0.10553019493818283, "step": 1686 }, { "batch_size": 4, "epoch": 0.6744, "step": 1686, "tokens_per_device": 2604 }, { "epoch": 0.6744, "loss_ce": 0.37354525923728943, "loss_lvr": 0.9273627400398254, "loss_mode_switch": 0.0, "loss_total": 0.466281533241272, "step": 1686 }, { "batch_size": 4, "epoch": 0.6744, "step": 1686, "tokens_per_device": 4520 }, { "epoch": 0.6744, "loss_ce": 0.33211231231689453, "loss_lvr": 0.976453959941864, "loss_mode_switch": 0.0, "loss_total": 0.4297577142715454, "step": 1686 }, { "batch_size": 4, "epoch": 0.6744, "step": 1686, "tokens_per_device": 2532 }, { "epoch": 0.6744, "loss_ce": 0.36371463537216187, "loss_lvr": 1.058836579322815, "loss_mode_switch": 0.0, "loss_total": 0.46959829330444336, "step": 1686 }, { "batch_size": 4, "epoch": 0.6744, "step": 1686, "tokens_per_device": 4392 }, { "epoch": 0.6744, "loss_ce": 0.10585960000753403, "loss_lvr": 0.8348482847213745, "loss_mode_switch": 0.0, "loss_total": 0.18934443593025208, "step": 1686 }, { "batch_size": 4, "epoch": 0.6744, "step": 1686, "tokens_per_device": 8476 }, { "epoch": 0.6744, "loss_ce": 0.16744183003902435, "loss_lvr": 1.146215558052063, "loss_mode_switch": 0.0, "loss_total": 0.28206339478492737, "step": 1686 }, { "batch_size": 4, "epoch": 0.6744, "step": 1686, "tokens_per_device": 7860 }, { "epoch": 0.6744, "loss_ce": 0.6676977276802063, "loss_lvr": 0.6390214562416077, "loss_mode_switch": 0.0, "loss_total": 0.7315998673439026, "step": 1686 }, { "batch_size": 1, "epoch": 0.6744, "step": 1686, "tokens_per_device": 5065 }, { "epoch": 0.6744, "loss_ce": 0.0011508172610774636, "loss_lvr": 0.6938613653182983, "loss_mode_switch": 0.0, "loss_total": 0.07053695619106293, "step": 1686 }, { "epoch": 0.6748, "grad_norm": 1.3969660997390747, "learning_rate": 2.5262240756709838e-06, "loss": 0.3292, "step": 1687 }, { "batch_size": 4, "epoch": 0.6748, "step": 1687, "tokens_per_device": 3824 }, { "epoch": 0.6748, "loss_ce": 0.0021661738865077496, "loss_lvr": 0.7190935611724854, "loss_mode_switch": 0.0, "loss_total": 0.07407553493976593, "step": 1687 }, { "batch_size": 4, "epoch": 0.6748, "step": 1687, "tokens_per_device": 2672 }, { "epoch": 0.6748, "loss_ce": 0.12551790475845337, "loss_lvr": 0.8872855305671692, "loss_mode_switch": 0.0, "loss_total": 0.2142464518547058, "step": 1687 }, { "batch_size": 4, "epoch": 0.6748, "step": 1687, "tokens_per_device": 4040 }, { "epoch": 0.6748, "loss_ce": 0.255300909280777, "loss_lvr": 0.9403852224349976, "loss_mode_switch": 0.0, "loss_total": 0.34933942556381226, "step": 1687 }, { "batch_size": 4, "epoch": 0.6748, "step": 1687, "tokens_per_device": 3872 }, { "epoch": 0.6748, "loss_ce": 0.3278104364871979, "loss_lvr": 0.9468112587928772, "loss_mode_switch": 0.0, "loss_total": 0.42249155044555664, "step": 1687 }, { "batch_size": 4, "epoch": 0.6748, "step": 1687, "tokens_per_device": 4372 }, { "epoch": 0.6748, "loss_ce": 0.45766204595565796, "loss_lvr": 0.8823304772377014, "loss_mode_switch": 0.0, "loss_total": 0.5458950996398926, "step": 1687 }, { "batch_size": 1, "epoch": 0.6748, "step": 1687, "tokens_per_device": 4907 }, { "epoch": 0.6748, "loss_ce": 0.04238611087203026, "loss_lvr": 0.9014795422554016, "loss_mode_switch": 0.0, "loss_total": 0.13253407180309296, "step": 1687 }, { "batch_size": 1, "epoch": 0.6748, "step": 1687, "tokens_per_device": 5162 }, { "epoch": 0.6748, "loss_ce": 0.05109238997101784, "loss_lvr": 0.5303041934967041, "loss_mode_switch": 0.0, "loss_total": 0.10412281006574631, "step": 1687 }, { "batch_size": 4, "epoch": 0.6748, "step": 1687, "tokens_per_device": 2624 }, { "epoch": 0.6748, "loss_ce": 0.1364785134792328, "loss_lvr": 1.080955147743225, "loss_mode_switch": 0.0, "loss_total": 0.24457404017448425, "step": 1687 }, { "epoch": 0.6752, "grad_norm": 1.3071825504302979, "learning_rate": 2.5205969865602974e-06, "loss": 0.3079, "step": 1688 }, { "batch_size": 1, "epoch": 0.6752, "step": 1688, "tokens_per_device": 5132 }, { "epoch": 0.6752, "loss_ce": 0.05777793005108833, "loss_lvr": 0.5286564826965332, "loss_mode_switch": 0.0, "loss_total": 0.11064358055591583, "step": 1688 }, { "batch_size": 4, "epoch": 0.6752, "step": 1688, "tokens_per_device": 4024 }, { "epoch": 0.6752, "loss_ce": 0.07526258379220963, "loss_lvr": 0.8434778451919556, "loss_mode_switch": 0.0, "loss_total": 0.15961036086082458, "step": 1688 }, { "batch_size": 1, "epoch": 0.6752, "step": 1688, "tokens_per_device": 4869 }, { "epoch": 0.6752, "loss_ce": 0.027200045064091682, "loss_lvr": 0.5048045516014099, "loss_mode_switch": 0.0, "loss_total": 0.07768049836158752, "step": 1688 }, { "batch_size": 1, "epoch": 0.6752, "step": 1688, "tokens_per_device": 4994 }, { "epoch": 0.6752, "loss_ce": 0.07833123952150345, "loss_lvr": 0.7705851197242737, "loss_mode_switch": 0.0, "loss_total": 0.15538975596427917, "step": 1688 }, { "batch_size": 4, "epoch": 0.6752, "step": 1688, "tokens_per_device": 3864 }, { "epoch": 0.6752, "loss_ce": 0.02682238258421421, "loss_lvr": 0.7969239354133606, "loss_mode_switch": 0.0, "loss_total": 0.10651477426290512, "step": 1688 }, { "batch_size": 4, "epoch": 0.6752, "step": 1688, "tokens_per_device": 4616 }, { "epoch": 0.6752, "loss_ce": 0.22955596446990967, "loss_lvr": 0.8154595494270325, "loss_mode_switch": 0.0, "loss_total": 0.31110191345214844, "step": 1688 }, { "batch_size": 4, "epoch": 0.6752, "step": 1688, "tokens_per_device": 5724 }, { "epoch": 0.6752, "loss_ce": 0.5412968397140503, "loss_lvr": 0.9633505344390869, "loss_mode_switch": 0.0, "loss_total": 0.637631893157959, "step": 1688 }, { "batch_size": 1, "epoch": 0.6752, "step": 1688, "tokens_per_device": 5119 }, { "epoch": 0.6752, "loss_ce": 0.003903494216501713, "loss_lvr": 0.5119964480400085, "loss_mode_switch": 0.0, "loss_total": 0.05510313808917999, "step": 1688 }, { "epoch": 0.6756, "grad_norm": 1.347572684288025, "learning_rate": 2.514974058694965e-06, "loss": 0.296, "step": 1689 }, { "batch_size": 4, "epoch": 0.6756, "step": 1689, "tokens_per_device": 13616 }, { "epoch": 0.6756, "loss_ce": 0.011409725062549114, "loss_lvr": 0.40719732642173767, "loss_mode_switch": 0.0, "loss_total": 0.052129458636045456, "step": 1689 }, { "batch_size": 4, "epoch": 0.6756, "step": 1689, "tokens_per_device": 4252 }, { "epoch": 0.6756, "loss_ce": 0.16391277313232422, "loss_lvr": 0.8397921919822693, "loss_mode_switch": 0.0, "loss_total": 0.24789199233055115, "step": 1689 }, { "batch_size": 1, "epoch": 0.6756, "step": 1689, "tokens_per_device": 4894 }, { "epoch": 0.6756, "loss_ce": 0.9641574621200562, "loss_lvr": 0.8218449950218201, "loss_mode_switch": 0.0, "loss_total": 1.0463420152664185, "step": 1689 }, { "batch_size": 4, "epoch": 0.6756, "step": 1689, "tokens_per_device": 5292 }, { "epoch": 0.6756, "loss_ce": 0.5345104336738586, "loss_lvr": 0.9346433281898499, "loss_mode_switch": 0.0, "loss_total": 0.6279747486114502, "step": 1689 }, { "batch_size": 4, "epoch": 0.6756, "step": 1689, "tokens_per_device": 5036 }, { "epoch": 0.6756, "loss_ce": 0.11695487797260284, "loss_lvr": 0.45906826853752136, "loss_mode_switch": 0.0, "loss_total": 0.16286170482635498, "step": 1689 }, { "batch_size": 4, "epoch": 0.6756, "step": 1689, "tokens_per_device": 1536 }, { "epoch": 0.6756, "loss_ce": 0.12404866516590118, "loss_lvr": 1.0113061666488647, "loss_mode_switch": 0.0, "loss_total": 0.2251792848110199, "step": 1689 }, { "batch_size": 4, "epoch": 0.6756, "step": 1689, "tokens_per_device": 5108 }, { "epoch": 0.6756, "loss_ce": 0.4958459436893463, "loss_lvr": 0.678493857383728, "loss_mode_switch": 0.0, "loss_total": 0.5636953115463257, "step": 1689 }, { "batch_size": 4, "epoch": 0.6756, "step": 1689, "tokens_per_device": 4600 }, { "epoch": 0.6756, "loss_ce": 0.03176122531294823, "loss_lvr": 0.7000146508216858, "loss_mode_switch": 0.0, "loss_total": 0.10176269710063934, "step": 1689 }, { "epoch": 0.676, "grad_norm": 1.3216261863708496, "learning_rate": 2.5093553015120937e-06, "loss": 0.3041, "step": 1690 }, { "batch_size": 4, "epoch": 0.676, "step": 1690, "tokens_per_device": 4288 }, { "epoch": 0.676, "loss_ce": 0.04468405991792679, "loss_lvr": 0.8612184524536133, "loss_mode_switch": 0.0, "loss_total": 0.13080590963363647, "step": 1690 }, { "batch_size": 4, "epoch": 0.676, "step": 1690, "tokens_per_device": 3924 }, { "epoch": 0.676, "loss_ce": 0.539890706539154, "loss_lvr": 0.9084644317626953, "loss_mode_switch": 0.0, "loss_total": 0.6307371258735657, "step": 1690 }, { "batch_size": 4, "epoch": 0.676, "step": 1690, "tokens_per_device": 4116 }, { "epoch": 0.676, "loss_ce": 0.24844714999198914, "loss_lvr": 0.793698787689209, "loss_mode_switch": 0.0, "loss_total": 0.32781702280044556, "step": 1690 }, { "batch_size": 4, "epoch": 0.676, "step": 1690, "tokens_per_device": 3792 }, { "epoch": 0.676, "loss_ce": 0.15321296453475952, "loss_lvr": 0.8413281440734863, "loss_mode_switch": 0.0, "loss_total": 0.23734578490257263, "step": 1690 }, { "batch_size": 4, "epoch": 0.676, "step": 1690, "tokens_per_device": 3756 }, { "epoch": 0.676, "loss_ce": 0.18088017404079437, "loss_lvr": 0.9051570296287537, "loss_mode_switch": 0.0, "loss_total": 0.27139586210250854, "step": 1690 }, { "batch_size": 4, "epoch": 0.676, "step": 1690, "tokens_per_device": 1600 }, { "epoch": 0.676, "loss_ce": 0.07638701796531677, "loss_lvr": 0.8789080381393433, "loss_mode_switch": 0.0, "loss_total": 0.1642778217792511, "step": 1690 }, { "batch_size": 4, "epoch": 0.676, "step": 1690, "tokens_per_device": 4540 }, { "epoch": 0.676, "loss_ce": 0.07647541910409927, "loss_lvr": 0.7578421235084534, "loss_mode_switch": 0.0, "loss_total": 0.15225963294506073, "step": 1690 }, { "batch_size": 4, "epoch": 0.676, "step": 1690, "tokens_per_device": 5780 }, { "epoch": 0.676, "loss_ce": 0.10436355322599411, "loss_lvr": 0.7649784684181213, "loss_mode_switch": 0.0, "loss_total": 0.18086139857769012, "step": 1690 }, { "epoch": 0.6764, "grad_norm": 1.4431583881378174, "learning_rate": 2.5037407244417834e-06, "loss": 0.2715, "step": 1691 }, { "batch_size": 4, "epoch": 0.6764, "step": 1691, "tokens_per_device": 2116 }, { "epoch": 0.6764, "loss_ce": 0.2729753255844116, "loss_lvr": 0.7703142166137695, "loss_mode_switch": 0.0, "loss_total": 0.35000675916671753, "step": 1691 }, { "batch_size": 1, "epoch": 0.6764, "step": 1691, "tokens_per_device": 5096 }, { "epoch": 0.6764, "loss_ce": 0.0034682254772633314, "loss_lvr": 0.21370449662208557, "loss_mode_switch": 0.0, "loss_total": 0.024838674813508987, "step": 1691 }, { "batch_size": 4, "epoch": 0.6764, "step": 1691, "tokens_per_device": 4352 }, { "epoch": 0.6764, "loss_ce": 0.16232794523239136, "loss_lvr": 0.9930720925331116, "loss_mode_switch": 0.0, "loss_total": 0.2616351544857025, "step": 1691 }, { "batch_size": 4, "epoch": 0.6764, "step": 1691, "tokens_per_device": 4504 }, { "epoch": 0.6764, "loss_ce": 0.18232809007167816, "loss_lvr": 0.7088111639022827, "loss_mode_switch": 0.0, "loss_total": 0.2532092034816742, "step": 1691 }, { "batch_size": 1, "epoch": 0.6764, "step": 1691, "tokens_per_device": 4866 }, { "epoch": 0.6764, "loss_ce": 0.00034764964948408306, "loss_lvr": 0.2431042343378067, "loss_mode_switch": 0.0, "loss_total": 0.024658072739839554, "step": 1691 }, { "batch_size": 1, "epoch": 0.6764, "step": 1691, "tokens_per_device": 5431 }, { "epoch": 0.6764, "loss_ce": 0.06535732746124268, "loss_lvr": 0.3565676212310791, "loss_mode_switch": 0.0, "loss_total": 0.10101409256458282, "step": 1691 }, { "batch_size": 1, "epoch": 0.6764, "step": 1691, "tokens_per_device": 5050 }, { "epoch": 0.6764, "loss_ce": 0.14434435963630676, "loss_lvr": 0.35290998220443726, "loss_mode_switch": 0.0, "loss_total": 0.17963536083698273, "step": 1691 }, { "batch_size": 4, "epoch": 0.6764, "step": 1691, "tokens_per_device": 4304 }, { "epoch": 0.6764, "loss_ce": 0.45729362964630127, "loss_lvr": 0.7812280654907227, "loss_mode_switch": 0.0, "loss_total": 0.5354164242744446, "step": 1691 }, { "epoch": 0.6768, "grad_norm": 1.3052031993865967, "learning_rate": 2.498130336907125e-06, "loss": 0.2626, "step": 1692 }, { "batch_size": 1, "epoch": 0.6768, "step": 1692, "tokens_per_device": 5068 }, { "epoch": 0.6768, "loss_ce": 0.004497685935348272, "loss_lvr": 0.4263448715209961, "loss_mode_switch": 0.0, "loss_total": 0.04713217169046402, "step": 1692 }, { "batch_size": 4, "epoch": 0.6768, "step": 1692, "tokens_per_device": 2580 }, { "epoch": 0.6768, "loss_ce": 0.3395307660102844, "loss_lvr": 0.7503886222839355, "loss_mode_switch": 0.0, "loss_total": 0.414569616317749, "step": 1692 }, { "batch_size": 4, "epoch": 0.6768, "step": 1692, "tokens_per_device": 3840 }, { "epoch": 0.6768, "loss_ce": 0.168585866689682, "loss_lvr": 0.8171592950820923, "loss_mode_switch": 0.0, "loss_total": 0.2503018081188202, "step": 1692 }, { "batch_size": 4, "epoch": 0.6768, "step": 1692, "tokens_per_device": 3344 }, { "epoch": 0.6768, "loss_ce": 0.18937350809574127, "loss_lvr": 0.9633219838142395, "loss_mode_switch": 0.0, "loss_total": 0.28570571541786194, "step": 1692 }, { "batch_size": 4, "epoch": 0.6768, "step": 1692, "tokens_per_device": 2888 }, { "epoch": 0.6768, "loss_ce": 0.275147944688797, "loss_lvr": 1.0789092779159546, "loss_mode_switch": 0.0, "loss_total": 0.38303887844085693, "step": 1692 }, { "batch_size": 4, "epoch": 0.6768, "step": 1692, "tokens_per_device": 4664 }, { "epoch": 0.6768, "loss_ce": 0.7970068454742432, "loss_lvr": 0.6819433569908142, "loss_mode_switch": 0.0, "loss_total": 0.8652011752128601, "step": 1692 }, { "batch_size": 4, "epoch": 0.6768, "step": 1692, "tokens_per_device": 4576 }, { "epoch": 0.6768, "loss_ce": 0.48089009523391724, "loss_lvr": 0.6128575801849365, "loss_mode_switch": 0.0, "loss_total": 0.542175829410553, "step": 1692 }, { "batch_size": 1, "epoch": 0.6768, "step": 1692, "tokens_per_device": 5142 }, { "epoch": 0.6768, "loss_ce": 0.01182246021926403, "loss_lvr": 0.5714854001998901, "loss_mode_switch": 0.0, "loss_total": 0.06897100061178207, "step": 1692 }, { "epoch": 0.6772, "grad_norm": 1.375560998916626, "learning_rate": 2.492524148324173e-06, "loss": 0.2537, "step": 1693 }, { "batch_size": 4, "epoch": 0.6772, "step": 1693, "tokens_per_device": 1292 }, { "epoch": 0.6772, "loss_ce": 0.5987652540206909, "loss_lvr": 1.177108883857727, "loss_mode_switch": 0.0, "loss_total": 0.7164761424064636, "step": 1693 }, { "batch_size": 4, "epoch": 0.6772, "step": 1693, "tokens_per_device": 4472 }, { "epoch": 0.6772, "loss_ce": 0.05534111708402634, "loss_lvr": 0.8703455328941345, "loss_mode_switch": 0.0, "loss_total": 0.14237567782402039, "step": 1693 }, { "batch_size": 1, "epoch": 0.6772, "step": 1693, "tokens_per_device": 5176 }, { "epoch": 0.6772, "loss_ce": 0.1557537168264389, "loss_lvr": 0.2950061559677124, "loss_mode_switch": 0.0, "loss_total": 0.18525433540344238, "step": 1693 }, { "batch_size": 4, "epoch": 0.6772, "step": 1693, "tokens_per_device": 13388 }, { "epoch": 0.6772, "loss_ce": 0.1868618130683899, "loss_lvr": 0.839371383190155, "loss_mode_switch": 0.0, "loss_total": 0.2707989513874054, "step": 1693 }, { "batch_size": 4, "epoch": 0.6772, "step": 1693, "tokens_per_device": 4752 }, { "epoch": 0.6772, "loss_ce": 0.2544853091239929, "loss_lvr": 0.5907658934593201, "loss_mode_switch": 0.0, "loss_total": 0.31356188654899597, "step": 1693 }, { "batch_size": 1, "epoch": 0.6772, "step": 1693, "tokens_per_device": 5471 }, { "epoch": 0.6772, "loss_ce": 0.00016655599756632, "loss_lvr": 0.8331074118614197, "loss_mode_switch": 0.0, "loss_total": 0.08347730338573456, "step": 1693 }, { "batch_size": 1, "epoch": 0.6772, "step": 1693, "tokens_per_device": 8971 }, { "epoch": 0.6772, "loss_ce": 0.001329108257777989, "loss_lvr": 0.26405924558639526, "loss_mode_switch": 0.0, "loss_total": 0.027735034003853798, "step": 1693 }, { "batch_size": 1, "epoch": 0.6772, "step": 1693, "tokens_per_device": 5122 }, { "epoch": 0.6772, "loss_ce": 0.00024035740352701396, "loss_lvr": 0.4077538847923279, "loss_mode_switch": 0.0, "loss_total": 0.04101574420928955, "step": 1693 }, { "epoch": 0.6776, "grad_norm": 1.2980060577392578, "learning_rate": 2.4869221681019394e-06, "loss": 0.2933, "step": 1694 }, { "batch_size": 1, "epoch": 0.6776, "step": 1694, "tokens_per_device": 5175 }, { "epoch": 0.6776, "loss_ce": 0.008062858134508133, "loss_lvr": 0.3495344817638397, "loss_mode_switch": 0.0, "loss_total": 0.043016307055950165, "step": 1694 }, { "batch_size": 1, "epoch": 0.6776, "step": 1694, "tokens_per_device": 5029 }, { "epoch": 0.6776, "loss_ce": 0.14786775410175323, "loss_lvr": 0.5517415404319763, "loss_mode_switch": 0.0, "loss_total": 0.2030419111251831, "step": 1694 }, { "batch_size": 4, "epoch": 0.6776, "step": 1694, "tokens_per_device": 6228 }, { "epoch": 0.6776, "loss_ce": 0.22786785662174225, "loss_lvr": 0.6098552942276001, "loss_mode_switch": 0.0, "loss_total": 0.28885337710380554, "step": 1694 }, { "batch_size": 1, "epoch": 0.6776, "step": 1694, "tokens_per_device": 4915 }, { "epoch": 0.6776, "loss_ce": 0.0011007613502442837, "loss_lvr": 0.3112267255783081, "loss_mode_switch": 0.0, "loss_total": 0.03222343325614929, "step": 1694 }, { "batch_size": 4, "epoch": 0.6776, "step": 1694, "tokens_per_device": 4336 }, { "epoch": 0.6776, "loss_ce": 0.1185368001461029, "loss_lvr": 0.8125177025794983, "loss_mode_switch": 0.0, "loss_total": 0.19978857040405273, "step": 1694 }, { "batch_size": 1, "epoch": 0.6776, "step": 1694, "tokens_per_device": 5835 }, { "epoch": 0.6776, "loss_ce": 0.4724445044994354, "loss_lvr": 0.32312333583831787, "loss_mode_switch": 0.0, "loss_total": 0.5047568082809448, "step": 1694 }, { "batch_size": 4, "epoch": 0.6776, "step": 1694, "tokens_per_device": 3432 }, { "epoch": 0.6776, "loss_ce": 0.4000861942768097, "loss_lvr": 0.7001246809959412, "loss_mode_switch": 0.0, "loss_total": 0.47009867429733276, "step": 1694 }, { "batch_size": 4, "epoch": 0.6776, "step": 1694, "tokens_per_device": 5612 }, { "epoch": 0.6776, "loss_ce": 0.27756011486053467, "loss_lvr": 0.7799224853515625, "loss_mode_switch": 0.0, "loss_total": 0.3555523753166199, "step": 1694 }, { "epoch": 0.678, "grad_norm": 1.364485502243042, "learning_rate": 2.4813244056423692e-06, "loss": 0.3212, "step": 1695 }, { "batch_size": 4, "epoch": 0.678, "step": 1695, "tokens_per_device": 3880 }, { "epoch": 0.678, "loss_ce": 0.010320674628019333, "loss_lvr": 0.8216595649719238, "loss_mode_switch": 0.0, "loss_total": 0.09248663485050201, "step": 1695 }, { "batch_size": 1, "epoch": 0.678, "step": 1695, "tokens_per_device": 4977 }, { "epoch": 0.678, "loss_ce": 0.03475867956876755, "loss_lvr": 0.3400343060493469, "loss_mode_switch": 0.0, "loss_total": 0.06876210868358612, "step": 1695 }, { "batch_size": 1, "epoch": 0.678, "step": 1695, "tokens_per_device": 4902 }, { "epoch": 0.678, "loss_ce": 0.0005310402484610677, "loss_lvr": 0.3751486837863922, "loss_mode_switch": 0.0, "loss_total": 0.03804590925574303, "step": 1695 }, { "batch_size": 4, "epoch": 0.678, "step": 1695, "tokens_per_device": 2680 }, { "epoch": 0.678, "loss_ce": 0.6535011529922485, "loss_lvr": 0.6559645533561707, "loss_mode_switch": 0.0, "loss_total": 0.7190976142883301, "step": 1695 }, { "batch_size": 1, "epoch": 0.678, "step": 1695, "tokens_per_device": 4860 }, { "epoch": 0.678, "loss_ce": 0.002092902548611164, "loss_lvr": 0.22954301536083221, "loss_mode_switch": 0.0, "loss_total": 0.02504720538854599, "step": 1695 }, { "batch_size": 1, "epoch": 0.678, "step": 1695, "tokens_per_device": 5234 }, { "epoch": 0.678, "loss_ce": 0.0023716737050563097, "loss_lvr": 0.34049689769744873, "loss_mode_switch": 0.0, "loss_total": 0.036421362310647964, "step": 1695 }, { "batch_size": 4, "epoch": 0.678, "step": 1695, "tokens_per_device": 4064 }, { "epoch": 0.678, "loss_ce": 0.06279376894235611, "loss_lvr": 0.6163836121559143, "loss_mode_switch": 0.0, "loss_total": 0.12443213164806366, "step": 1695 }, { "batch_size": 1, "epoch": 0.678, "step": 1695, "tokens_per_device": 4874 }, { "epoch": 0.678, "loss_ce": 0.0009507256327196956, "loss_lvr": 0.4013729393482208, "loss_mode_switch": 0.0, "loss_total": 0.04108801856637001, "step": 1695 }, { "epoch": 0.6784, "grad_norm": 1.3754239082336426, "learning_rate": 2.4757308703403275e-06, "loss": 0.2811, "step": 1696 }, { "batch_size": 4, "epoch": 0.6784, "step": 1696, "tokens_per_device": 6824 }, { "epoch": 0.6784, "loss_ce": 0.32096338272094727, "loss_lvr": 0.8327757120132446, "loss_mode_switch": 0.0, "loss_total": 0.4042409658432007, "step": 1696 }, { "batch_size": 4, "epoch": 0.6784, "step": 1696, "tokens_per_device": 4668 }, { "epoch": 0.6784, "loss_ce": 0.3997591435909271, "loss_lvr": 0.7691237330436707, "loss_mode_switch": 0.0, "loss_total": 0.4766715168952942, "step": 1696 }, { "batch_size": 4, "epoch": 0.6784, "step": 1696, "tokens_per_device": 6020 }, { "epoch": 0.6784, "loss_ce": 0.03684895858168602, "loss_lvr": 0.6588914394378662, "loss_mode_switch": 0.0, "loss_total": 0.10273809731006622, "step": 1696 }, { "batch_size": 4, "epoch": 0.6784, "step": 1696, "tokens_per_device": 16368 }, { "epoch": 0.6784, "loss_ce": 0.26620450615882874, "loss_lvr": 0.8108766078948975, "loss_mode_switch": 0.0, "loss_total": 0.3472921848297119, "step": 1696 }, { "batch_size": 4, "epoch": 0.6784, "step": 1696, "tokens_per_device": 4248 }, { "epoch": 0.6784, "loss_ce": 0.16699109971523285, "loss_lvr": 1.0690051317214966, "loss_mode_switch": 0.0, "loss_total": 0.2738916277885437, "step": 1696 }, { "batch_size": 4, "epoch": 0.6784, "step": 1696, "tokens_per_device": 4240 }, { "epoch": 0.6784, "loss_ce": 0.7675275802612305, "loss_lvr": 0.9232734441757202, "loss_mode_switch": 0.0, "loss_total": 0.8598549365997314, "step": 1696 }, { "batch_size": 4, "epoch": 0.6784, "step": 1696, "tokens_per_device": 1640 }, { "epoch": 0.6784, "loss_ce": 0.23057863116264343, "loss_lvr": 1.0030522346496582, "loss_mode_switch": 0.0, "loss_total": 0.33088386058807373, "step": 1696 }, { "batch_size": 4, "epoch": 0.6784, "step": 1696, "tokens_per_device": 5748 }, { "epoch": 0.6784, "loss_ce": 0.23506613075733185, "loss_lvr": 0.8182429671287537, "loss_mode_switch": 0.0, "loss_total": 0.3168904185295105, "step": 1696 }, { "epoch": 0.6788, "grad_norm": 1.296406626701355, "learning_rate": 2.4701415715835917e-06, "loss": 0.2991, "step": 1697 }, { "batch_size": 4, "epoch": 0.6788, "step": 1697, "tokens_per_device": 1424 }, { "epoch": 0.6788, "loss_ce": 0.36081790924072266, "loss_lvr": 1.0994378328323364, "loss_mode_switch": 0.0, "loss_total": 0.4707616865634918, "step": 1697 }, { "batch_size": 4, "epoch": 0.6788, "step": 1697, "tokens_per_device": 1544 }, { "epoch": 0.6788, "loss_ce": 0.19269560277462006, "loss_lvr": 0.9076216816902161, "loss_mode_switch": 0.0, "loss_total": 0.28345775604248047, "step": 1697 }, { "batch_size": 4, "epoch": 0.6788, "step": 1697, "tokens_per_device": 4336 }, { "epoch": 0.6788, "loss_ce": 0.4621284008026123, "loss_lvr": 0.7999043464660645, "loss_mode_switch": 0.0, "loss_total": 0.5421188473701477, "step": 1697 }, { "batch_size": 1, "epoch": 0.6788, "step": 1697, "tokens_per_device": 5114 }, { "epoch": 0.6788, "loss_ce": 0.00413489667698741, "loss_lvr": 0.4123622477054596, "loss_mode_switch": 0.0, "loss_total": 0.045371122658252716, "step": 1697 }, { "batch_size": 1, "epoch": 0.6788, "step": 1697, "tokens_per_device": 4967 }, { "epoch": 0.6788, "loss_ce": 0.20037460327148438, "loss_lvr": 0.7719534039497375, "loss_mode_switch": 0.0, "loss_total": 0.2775699496269226, "step": 1697 }, { "batch_size": 4, "epoch": 0.6788, "step": 1697, "tokens_per_device": 4716 }, { "epoch": 0.6788, "loss_ce": 0.4881657660007477, "loss_lvr": 0.8722319006919861, "loss_mode_switch": 0.0, "loss_total": 0.5753889679908752, "step": 1697 }, { "batch_size": 4, "epoch": 0.6788, "step": 1697, "tokens_per_device": 1776 }, { "epoch": 0.6788, "loss_ce": 0.2519569396972656, "loss_lvr": 0.9687488675117493, "loss_mode_switch": 0.0, "loss_total": 0.34883183240890503, "step": 1697 }, { "batch_size": 4, "epoch": 0.6788, "step": 1697, "tokens_per_device": 1440 }, { "epoch": 0.6788, "loss_ce": 0.5590041279792786, "loss_lvr": 0.9112198948860168, "loss_mode_switch": 0.0, "loss_total": 0.6501260995864868, "step": 1697 }, { "epoch": 0.6792, "grad_norm": 1.3291890621185303, "learning_rate": 2.464556518752821e-06, "loss": 0.3087, "step": 1698 }, { "batch_size": 4, "epoch": 0.6792, "step": 1698, "tokens_per_device": 5664 }, { "epoch": 0.6792, "loss_ce": 0.2538507878780365, "loss_lvr": 0.919391393661499, "loss_mode_switch": 0.0, "loss_total": 0.34578993916511536, "step": 1698 }, { "batch_size": 1, "epoch": 0.6792, "step": 1698, "tokens_per_device": 5171 }, { "epoch": 0.6792, "loss_ce": 0.0002644359483383596, "loss_lvr": 0.779236376285553, "loss_mode_switch": 0.0, "loss_total": 0.07818807661533356, "step": 1698 }, { "batch_size": 1, "epoch": 0.6792, "step": 1698, "tokens_per_device": 5332 }, { "epoch": 0.6792, "loss_ce": 0.0008765133679844439, "loss_lvr": 0.35031259059906006, "loss_mode_switch": 0.0, "loss_total": 0.035907771438360214, "step": 1698 }, { "batch_size": 1, "epoch": 0.6792, "step": 1698, "tokens_per_device": 5087 }, { "epoch": 0.6792, "loss_ce": 0.03780914470553398, "loss_lvr": 0.8521050214767456, "loss_mode_switch": 0.0, "loss_total": 0.12301965057849884, "step": 1698 }, { "batch_size": 4, "epoch": 0.6792, "step": 1698, "tokens_per_device": 3740 }, { "epoch": 0.6792, "loss_ce": 0.7719801068305969, "loss_lvr": 1.0578656196594238, "loss_mode_switch": 0.0, "loss_total": 0.8777666687965393, "step": 1698 }, { "batch_size": 1, "epoch": 0.6792, "step": 1698, "tokens_per_device": 4952 }, { "epoch": 0.6792, "loss_ce": 0.018122481182217598, "loss_lvr": 0.46439769864082336, "loss_mode_switch": 0.0, "loss_total": 0.06456225365400314, "step": 1698 }, { "batch_size": 4, "epoch": 0.6792, "step": 1698, "tokens_per_device": 14420 }, { "epoch": 0.6792, "loss_ce": 0.007179671432822943, "loss_lvr": 0.8885113596916199, "loss_mode_switch": 0.0, "loss_total": 0.0960308089852333, "step": 1698 }, { "batch_size": 4, "epoch": 0.6792, "step": 1698, "tokens_per_device": 4392 }, { "epoch": 0.6792, "loss_ce": 0.12863793969154358, "loss_lvr": 0.8193203210830688, "loss_mode_switch": 0.0, "loss_total": 0.21056997776031494, "step": 1698 }, { "epoch": 0.6796, "grad_norm": 1.2342313528060913, "learning_rate": 2.458975721221555e-06, "loss": 0.2514, "step": 1699 }, { "batch_size": 1, "epoch": 0.6796, "step": 1699, "tokens_per_device": 5148 }, { "epoch": 0.6796, "loss_ce": 0.026387564837932587, "loss_lvr": 0.4724728763103485, "loss_mode_switch": 0.0, "loss_total": 0.07363484799861908, "step": 1699 }, { "batch_size": 4, "epoch": 0.6796, "step": 1699, "tokens_per_device": 4024 }, { "epoch": 0.6796, "loss_ce": 0.5653727054595947, "loss_lvr": 0.8337944746017456, "loss_mode_switch": 0.0, "loss_total": 0.6487521529197693, "step": 1699 }, { "batch_size": 4, "epoch": 0.6796, "step": 1699, "tokens_per_device": 2620 }, { "epoch": 0.6796, "loss_ce": 0.5497459173202515, "loss_lvr": 0.6405184864997864, "loss_mode_switch": 0.0, "loss_total": 0.6137977838516235, "step": 1699 }, { "batch_size": 1, "epoch": 0.6796, "step": 1699, "tokens_per_device": 5104 }, { "epoch": 0.6796, "loss_ce": 0.09859272837638855, "loss_lvr": 0.3344942331314087, "loss_mode_switch": 0.0, "loss_total": 0.13204215466976166, "step": 1699 }, { "batch_size": 4, "epoch": 0.6796, "step": 1699, "tokens_per_device": 4188 }, { "epoch": 0.6796, "loss_ce": 0.14700350165367126, "loss_lvr": 0.543464183807373, "loss_mode_switch": 0.0, "loss_total": 0.2013499140739441, "step": 1699 }, { "batch_size": 4, "epoch": 0.6796, "step": 1699, "tokens_per_device": 4272 }, { "epoch": 0.6796, "loss_ce": 0.3440123200416565, "loss_lvr": 0.6798744797706604, "loss_mode_switch": 0.0, "loss_total": 0.41199976205825806, "step": 1699 }, { "batch_size": 1, "epoch": 0.6796, "step": 1699, "tokens_per_device": 4884 }, { "epoch": 0.6796, "loss_ce": 0.008682078681886196, "loss_lvr": 0.20179791748523712, "loss_mode_switch": 0.0, "loss_total": 0.028861869126558304, "step": 1699 }, { "batch_size": 1, "epoch": 0.6796, "step": 1699, "tokens_per_device": 4875 }, { "epoch": 0.6796, "loss_ce": 0.004421760328114033, "loss_lvr": 0.4613918662071228, "loss_mode_switch": 0.0, "loss_total": 0.05056094750761986, "step": 1699 }, { "epoch": 0.68, "grad_norm": 1.318697452545166, "learning_rate": 2.4533991883561868e-06, "loss": 0.2828, "step": 1700 }, { "batch_size": 4, "epoch": 0.68, "step": 1700, "tokens_per_device": 7652 }, { "epoch": 0.68, "loss_ce": 0.11948087811470032, "loss_lvr": 0.6222051382064819, "loss_mode_switch": 0.0, "loss_total": 0.1817013919353485, "step": 1700 }, { "batch_size": 1, "epoch": 0.68, "step": 1700, "tokens_per_device": 5258 }, { "epoch": 0.68, "loss_ce": 0.025574341416358948, "loss_lvr": 0.2876741886138916, "loss_mode_switch": 0.0, "loss_total": 0.05434176325798035, "step": 1700 }, { "batch_size": 4, "epoch": 0.68, "step": 1700, "tokens_per_device": 1644 }, { "epoch": 0.68, "loss_ce": 0.03146371990442276, "loss_lvr": 1.2623710632324219, "loss_mode_switch": 0.0, "loss_total": 0.15770083665847778, "step": 1700 }, { "batch_size": 4, "epoch": 0.68, "step": 1700, "tokens_per_device": 8800 }, { "epoch": 0.68, "loss_ce": 0.31205177307128906, "loss_lvr": 0.747743546962738, "loss_mode_switch": 0.0, "loss_total": 0.38682612776756287, "step": 1700 }, { "batch_size": 4, "epoch": 0.68, "step": 1700, "tokens_per_device": 3732 }, { "epoch": 0.68, "loss_ce": 0.19256186485290527, "loss_lvr": 1.0163955688476562, "loss_mode_switch": 0.0, "loss_total": 0.29420143365859985, "step": 1700 }, { "batch_size": 1, "epoch": 0.68, "step": 1700, "tokens_per_device": 4878 }, { "epoch": 0.68, "loss_ce": 0.027012092992663383, "loss_lvr": 0.22440141439437866, "loss_mode_switch": 0.0, "loss_total": 0.04945223405957222, "step": 1700 }, { "batch_size": 4, "epoch": 0.68, "step": 1700, "tokens_per_device": 3776 }, { "epoch": 0.68, "loss_ce": 0.25194185972213745, "loss_lvr": 0.9940851330757141, "loss_mode_switch": 0.0, "loss_total": 0.3513503670692444, "step": 1700 }, { "batch_size": 4, "epoch": 0.68, "step": 1700, "tokens_per_device": 1852 }, { "epoch": 0.68, "loss_ce": 0.32325297594070435, "loss_lvr": 0.9381154179573059, "loss_mode_switch": 0.0, "loss_total": 0.41706451773643494, "step": 1700 }, { "epoch": 0.6804, "grad_norm": 1.3766783475875854, "learning_rate": 2.447826929515953e-06, "loss": 0.2719, "step": 1701 }, { "batch_size": 4, "epoch": 0.6804, "step": 1701, "tokens_per_device": 1652 }, { "epoch": 0.6804, "loss_ce": 0.6453912854194641, "loss_lvr": 0.8615047931671143, "loss_mode_switch": 0.0, "loss_total": 0.7315417528152466, "step": 1701 }, { "batch_size": 4, "epoch": 0.6804, "step": 1701, "tokens_per_device": 4380 }, { "epoch": 0.6804, "loss_ce": 0.2771020829677582, "loss_lvr": 0.6583823561668396, "loss_mode_switch": 0.0, "loss_total": 0.3429403305053711, "step": 1701 }, { "batch_size": 1, "epoch": 0.6804, "step": 1701, "tokens_per_device": 5202 }, { "epoch": 0.6804, "loss_ce": 0.00801395159214735, "loss_lvr": 0.3627434968948364, "loss_mode_switch": 0.0, "loss_total": 0.04428830370306969, "step": 1701 }, { "batch_size": 1, "epoch": 0.6804, "step": 1701, "tokens_per_device": 4321 }, { "epoch": 0.6804, "loss_ce": 0.2434169203042984, "loss_lvr": 0.6748923063278198, "loss_mode_switch": 0.0, "loss_total": 0.31090614199638367, "step": 1701 }, { "batch_size": 4, "epoch": 0.6804, "step": 1701, "tokens_per_device": 8208 }, { "epoch": 0.6804, "loss_ce": 0.110892653465271, "loss_lvr": 0.44520241022109985, "loss_mode_switch": 0.0, "loss_total": 0.15541289746761322, "step": 1701 }, { "batch_size": 1, "epoch": 0.6804, "step": 1701, "tokens_per_device": 5171 }, { "epoch": 0.6804, "loss_ce": 0.04755500704050064, "loss_lvr": 0.8562456965446472, "loss_mode_switch": 0.0, "loss_total": 0.13317957520484924, "step": 1701 }, { "batch_size": 4, "epoch": 0.6804, "step": 1701, "tokens_per_device": 9120 }, { "epoch": 0.6804, "loss_ce": 0.3621982932090759, "loss_lvr": 0.6815022230148315, "loss_mode_switch": 0.0, "loss_total": 0.4303485155105591, "step": 1701 }, { "batch_size": 4, "epoch": 0.6804, "step": 1701, "tokens_per_device": 4476 }, { "epoch": 0.6804, "loss_ce": 0.07149520516395569, "loss_lvr": 0.844605565071106, "loss_mode_switch": 0.0, "loss_total": 0.15595576167106628, "step": 1701 }, { "epoch": 0.6808, "grad_norm": 1.3930354118347168, "learning_rate": 2.4422589540529187e-06, "loss": 0.268, "step": 1702 }, { "batch_size": 4, "epoch": 0.6808, "step": 1702, "tokens_per_device": 4952 }, { "epoch": 0.6808, "loss_ce": 0.4462141990661621, "loss_lvr": 0.968177080154419, "loss_mode_switch": 0.0, "loss_total": 0.5430319309234619, "step": 1702 }, { "batch_size": 4, "epoch": 0.6808, "step": 1702, "tokens_per_device": 3852 }, { "epoch": 0.6808, "loss_ce": 0.49319031834602356, "loss_lvr": 1.0779032707214355, "loss_mode_switch": 0.0, "loss_total": 0.6009806394577026, "step": 1702 }, { "batch_size": 4, "epoch": 0.6808, "step": 1702, "tokens_per_device": 6348 }, { "epoch": 0.6808, "loss_ce": 0.3275599479675293, "loss_lvr": 0.9300070405006409, "loss_mode_switch": 0.0, "loss_total": 0.42056065797805786, "step": 1702 }, { "batch_size": 4, "epoch": 0.6808, "step": 1702, "tokens_per_device": 1400 }, { "epoch": 0.6808, "loss_ce": 0.7218884825706482, "loss_lvr": 1.1644684076309204, "loss_mode_switch": 0.0, "loss_total": 0.8383353352546692, "step": 1702 }, { "batch_size": 1, "epoch": 0.6808, "step": 1702, "tokens_per_device": 4948 }, { "epoch": 0.6808, "loss_ce": 0.004652175586670637, "loss_lvr": 0.22578099370002747, "loss_mode_switch": 0.0, "loss_total": 0.0272302757948637, "step": 1702 }, { "batch_size": 4, "epoch": 0.6808, "step": 1702, "tokens_per_device": 4120 }, { "epoch": 0.6808, "loss_ce": 0.1192440316081047, "loss_lvr": 0.7177510261535645, "loss_mode_switch": 0.0, "loss_total": 0.19101913273334503, "step": 1702 }, { "batch_size": 4, "epoch": 0.6808, "step": 1702, "tokens_per_device": 4240 }, { "epoch": 0.6808, "loss_ce": 0.005202047061175108, "loss_lvr": 0.6700266003608704, "loss_mode_switch": 0.0, "loss_total": 0.07220470905303955, "step": 1702 }, { "batch_size": 4, "epoch": 0.6808, "step": 1702, "tokens_per_device": 5700 }, { "epoch": 0.6808, "loss_ce": 0.04996640607714653, "loss_lvr": 0.6874203681945801, "loss_mode_switch": 0.0, "loss_total": 0.11870844662189484, "step": 1702 }, { "epoch": 0.6812, "grad_norm": 1.2530896663665771, "learning_rate": 2.436695271311961e-06, "loss": 0.2608, "step": 1703 }, { "batch_size": 4, "epoch": 0.6812, "step": 1703, "tokens_per_device": 4376 }, { "epoch": 0.6812, "loss_ce": 0.3825005888938904, "loss_lvr": 1.0680372714996338, "loss_mode_switch": 0.0, "loss_total": 0.4893043041229248, "step": 1703 }, { "batch_size": 1, "epoch": 0.6812, "step": 1703, "tokens_per_device": 5761 }, { "epoch": 0.6812, "loss_ce": 0.520724356174469, "loss_lvr": 0.5624307990074158, "loss_mode_switch": 0.0, "loss_total": 0.5769674181938171, "step": 1703 }, { "batch_size": 1, "epoch": 0.6812, "step": 1703, "tokens_per_device": 4880 }, { "epoch": 0.6812, "loss_ce": 0.2369076907634735, "loss_lvr": 0.6454205513000488, "loss_mode_switch": 0.0, "loss_total": 0.3014497458934784, "step": 1703 }, { "batch_size": 1, "epoch": 0.6812, "step": 1703, "tokens_per_device": 5208 }, { "epoch": 0.6812, "loss_ce": 0.000944271800108254, "loss_lvr": 0.3307762145996094, "loss_mode_switch": 0.0, "loss_total": 0.03402189537882805, "step": 1703 }, { "batch_size": 4, "epoch": 0.6812, "step": 1703, "tokens_per_device": 5372 }, { "epoch": 0.6812, "loss_ce": 0.06492601335048676, "loss_lvr": 0.8367552757263184, "loss_mode_switch": 0.0, "loss_total": 0.14860153198242188, "step": 1703 }, { "batch_size": 4, "epoch": 0.6812, "step": 1703, "tokens_per_device": 10460 }, { "epoch": 0.6812, "loss_ce": 0.060291044414043427, "loss_lvr": 0.8450276255607605, "loss_mode_switch": 0.0, "loss_total": 0.1447938084602356, "step": 1703 }, { "batch_size": 4, "epoch": 0.6812, "step": 1703, "tokens_per_device": 4256 }, { "epoch": 0.6812, "loss_ce": 0.06369543820619583, "loss_lvr": 1.14344322681427, "loss_mode_switch": 0.0, "loss_total": 0.1780397593975067, "step": 1703 }, { "batch_size": 4, "epoch": 0.6812, "step": 1703, "tokens_per_device": 4268 }, { "epoch": 0.6812, "loss_ce": 0.15581747889518738, "loss_lvr": 1.1275629997253418, "loss_mode_switch": 0.0, "loss_total": 0.2685737907886505, "step": 1703 }, { "epoch": 0.6816, "grad_norm": 1.831222653388977, "learning_rate": 2.43113589063075e-06, "loss": 0.3154, "step": 1704 }, { "batch_size": 1, "epoch": 0.6816, "step": 1704, "tokens_per_device": 4900 }, { "epoch": 0.6816, "loss_ce": 1.207395076751709, "loss_lvr": 1.1839100122451782, "loss_mode_switch": 0.0, "loss_total": 1.3257861137390137, "step": 1704 }, { "batch_size": 4, "epoch": 0.6816, "step": 1704, "tokens_per_device": 4188 }, { "epoch": 0.6816, "loss_ce": 0.2708733081817627, "loss_lvr": 0.523577868938446, "loss_mode_switch": 0.0, "loss_total": 0.3232311010360718, "step": 1704 }, { "batch_size": 4, "epoch": 0.6816, "step": 1704, "tokens_per_device": 4260 }, { "epoch": 0.6816, "loss_ce": 0.18928901851177216, "loss_lvr": 0.9453801512718201, "loss_mode_switch": 0.0, "loss_total": 0.2838270366191864, "step": 1704 }, { "batch_size": 1, "epoch": 0.6816, "step": 1704, "tokens_per_device": 5036 }, { "epoch": 0.6816, "loss_ce": 0.03418494015932083, "loss_lvr": 0.45685458183288574, "loss_mode_switch": 0.0, "loss_total": 0.07987040281295776, "step": 1704 }, { "batch_size": 4, "epoch": 0.6816, "step": 1704, "tokens_per_device": 3852 }, { "epoch": 0.6816, "loss_ce": 0.1676352322101593, "loss_lvr": 0.9205837249755859, "loss_mode_switch": 0.0, "loss_total": 0.25969362258911133, "step": 1704 }, { "batch_size": 4, "epoch": 0.6816, "step": 1704, "tokens_per_device": 4284 }, { "epoch": 0.6816, "loss_ce": 0.014101129025220871, "loss_lvr": 0.6869591474533081, "loss_mode_switch": 0.0, "loss_total": 0.08279705047607422, "step": 1704 }, { "batch_size": 4, "epoch": 0.6816, "step": 1704, "tokens_per_device": 3876 }, { "epoch": 0.6816, "loss_ce": 0.05151165649294853, "loss_lvr": 0.9102780818939209, "loss_mode_switch": 0.0, "loss_total": 0.14253947138786316, "step": 1704 }, { "batch_size": 1, "epoch": 0.6816, "step": 1704, "tokens_per_device": 5263 }, { "epoch": 0.6816, "loss_ce": 0.001490783877670765, "loss_lvr": 0.25011298060417175, "loss_mode_switch": 0.0, "loss_total": 0.026502083986997604, "step": 1704 }, { "epoch": 0.682, "grad_norm": 1.4074939489364624, "learning_rate": 2.425580821339733e-06, "loss": 0.3242, "step": 1705 }, { "batch_size": 4, "epoch": 0.682, "step": 1705, "tokens_per_device": 4548 }, { "epoch": 0.682, "loss_ce": 0.15476427972316742, "loss_lvr": 0.6098483800888062, "loss_mode_switch": 0.0, "loss_total": 0.2157491147518158, "step": 1705 }, { "batch_size": 1, "epoch": 0.682, "step": 1705, "tokens_per_device": 5145 }, { "epoch": 0.682, "loss_ce": 0.08473736047744751, "loss_lvr": 0.3655431866645813, "loss_mode_switch": 0.0, "loss_total": 0.12129168212413788, "step": 1705 }, { "batch_size": 1, "epoch": 0.682, "step": 1705, "tokens_per_device": 4962 }, { "epoch": 0.682, "loss_ce": 0.15066169202327728, "loss_lvr": 0.6543631553649902, "loss_mode_switch": 0.0, "loss_total": 0.21609801054000854, "step": 1705 }, { "batch_size": 4, "epoch": 0.682, "step": 1705, "tokens_per_device": 2696 }, { "epoch": 0.682, "loss_ce": 0.6654622554779053, "loss_lvr": 0.43577688932418823, "loss_mode_switch": 0.0, "loss_total": 0.7090399265289307, "step": 1705 }, { "batch_size": 4, "epoch": 0.682, "step": 1705, "tokens_per_device": 1500 }, { "epoch": 0.682, "loss_ce": 0.5492957234382629, "loss_lvr": 0.9523636698722839, "loss_mode_switch": 0.0, "loss_total": 0.6445320844650269, "step": 1705 }, { "batch_size": 4, "epoch": 0.682, "step": 1705, "tokens_per_device": 2588 }, { "epoch": 0.682, "loss_ce": 0.3624723255634308, "loss_lvr": 1.0169473886489868, "loss_mode_switch": 0.0, "loss_total": 0.464167058467865, "step": 1705 }, { "batch_size": 4, "epoch": 0.682, "step": 1705, "tokens_per_device": 4232 }, { "epoch": 0.682, "loss_ce": 0.13590578734874725, "loss_lvr": 0.9663255214691162, "loss_mode_switch": 0.0, "loss_total": 0.2325383424758911, "step": 1705 }, { "batch_size": 4, "epoch": 0.682, "step": 1705, "tokens_per_device": 4392 }, { "epoch": 0.682, "loss_ce": 0.4980863332748413, "loss_lvr": 0.9150058627128601, "loss_mode_switch": 0.0, "loss_total": 0.5895869135856628, "step": 1705 }, { "epoch": 0.6824, "grad_norm": 1.295583963394165, "learning_rate": 2.420030072762127e-06, "loss": 0.2814, "step": 1706 }, { "batch_size": 1, "epoch": 0.6824, "step": 1706, "tokens_per_device": 4870 }, { "epoch": 0.6824, "loss_ce": 0.0009462953894399107, "loss_lvr": 0.2959998846054077, "loss_mode_switch": 0.0, "loss_total": 0.0305462833493948, "step": 1706 }, { "batch_size": 4, "epoch": 0.6824, "step": 1706, "tokens_per_device": 4212 }, { "epoch": 0.6824, "loss_ce": 0.023387037217617035, "loss_lvr": 0.5441823601722717, "loss_mode_switch": 0.0, "loss_total": 0.07780527323484421, "step": 1706 }, { "batch_size": 4, "epoch": 0.6824, "step": 1706, "tokens_per_device": 5636 }, { "epoch": 0.6824, "loss_ce": 0.10755399614572525, "loss_lvr": 0.5422040224075317, "loss_mode_switch": 0.0, "loss_total": 0.1617743968963623, "step": 1706 }, { "batch_size": 1, "epoch": 0.6824, "step": 1706, "tokens_per_device": 4799 }, { "epoch": 0.6824, "loss_ce": 0.15315976738929749, "loss_lvr": 0.2235214114189148, "loss_mode_switch": 0.0, "loss_total": 0.1755119115114212, "step": 1706 }, { "batch_size": 4, "epoch": 0.6824, "step": 1706, "tokens_per_device": 1280 }, { "epoch": 0.6824, "loss_ce": 0.21899224817752838, "loss_lvr": 0.9160294532775879, "loss_mode_switch": 0.0, "loss_total": 0.31059518456459045, "step": 1706 }, { "batch_size": 1, "epoch": 0.6824, "step": 1706, "tokens_per_device": 5100 }, { "epoch": 0.6824, "loss_ce": 0.0008638596045784652, "loss_lvr": 0.2529256343841553, "loss_mode_switch": 0.0, "loss_total": 0.02615642361342907, "step": 1706 }, { "batch_size": 1, "epoch": 0.6824, "step": 1706, "tokens_per_device": 5101 }, { "epoch": 0.6824, "loss_ce": 0.029146375134587288, "loss_lvr": 0.22173115611076355, "loss_mode_switch": 0.0, "loss_total": 0.05131949111819267, "step": 1706 }, { "batch_size": 1, "epoch": 0.6824, "step": 1706, "tokens_per_device": 5089 }, { "epoch": 0.6824, "loss_ce": 0.05707308650016785, "loss_lvr": 0.7359804511070251, "loss_mode_switch": 0.0, "loss_total": 0.13067114353179932, "step": 1706 }, { "epoch": 0.6828, "grad_norm": 1.2112782001495361, "learning_rate": 2.4144836542138975e-06, "loss": 0.259, "step": 1707 }, { "batch_size": 4, "epoch": 0.6828, "step": 1707, "tokens_per_device": 8556 }, { "epoch": 0.6828, "loss_ce": 0.03014947660267353, "loss_lvr": 0.5880509614944458, "loss_mode_switch": 0.0, "loss_total": 0.08895457535982132, "step": 1707 }, { "batch_size": 4, "epoch": 0.6828, "step": 1707, "tokens_per_device": 1384 }, { "epoch": 0.6828, "loss_ce": 0.5059906840324402, "loss_lvr": 0.9126613140106201, "loss_mode_switch": 0.0, "loss_total": 0.5972568392753601, "step": 1707 }, { "batch_size": 1, "epoch": 0.6828, "step": 1707, "tokens_per_device": 6608 }, { "epoch": 0.6828, "loss_ce": 0.06420670449733734, "loss_lvr": 0.3984258472919464, "loss_mode_switch": 0.0, "loss_total": 0.10404929518699646, "step": 1707 }, { "batch_size": 4, "epoch": 0.6828, "step": 1707, "tokens_per_device": 5696 }, { "epoch": 0.6828, "loss_ce": 0.145921528339386, "loss_lvr": 0.7845309376716614, "loss_mode_switch": 0.0, "loss_total": 0.22437462210655212, "step": 1707 }, { "batch_size": 1, "epoch": 0.6828, "step": 1707, "tokens_per_device": 5111 }, { "epoch": 0.6828, "loss_ce": 0.0012960624881088734, "loss_lvr": 0.32975006103515625, "loss_mode_switch": 0.0, "loss_total": 0.03427106887102127, "step": 1707 }, { "batch_size": 4, "epoch": 0.6828, "step": 1707, "tokens_per_device": 6264 }, { "epoch": 0.6828, "loss_ce": 0.1321171522140503, "loss_lvr": 0.8862596154212952, "loss_mode_switch": 0.0, "loss_total": 0.2207431197166443, "step": 1707 }, { "batch_size": 4, "epoch": 0.6828, "step": 1707, "tokens_per_device": 4556 }, { "epoch": 0.6828, "loss_ce": 0.0236496701836586, "loss_lvr": 0.8246458172798157, "loss_mode_switch": 0.0, "loss_total": 0.10611425340175629, "step": 1707 }, { "batch_size": 4, "epoch": 0.6828, "step": 1707, "tokens_per_device": 5152 }, { "epoch": 0.6828, "loss_ce": 0.08206859976053238, "loss_lvr": 0.9672210812568665, "loss_mode_switch": 0.0, "loss_total": 0.17879071831703186, "step": 1707 }, { "epoch": 0.6832, "grad_norm": 1.4531782865524292, "learning_rate": 2.408941575003737e-06, "loss": 0.2656, "step": 1708 }, { "batch_size": 1, "epoch": 0.6832, "step": 1708, "tokens_per_device": 6249 }, { "epoch": 0.6832, "loss_ce": 0.006488795392215252, "loss_lvr": 0.2432761937379837, "loss_mode_switch": 0.0, "loss_total": 0.030816413462162018, "step": 1708 }, { "batch_size": 4, "epoch": 0.6832, "step": 1708, "tokens_per_device": 4344 }, { "epoch": 0.6832, "loss_ce": 0.7126732468605042, "loss_lvr": 0.823852002620697, "loss_mode_switch": 0.0, "loss_total": 0.7950584292411804, "step": 1708 }, { "batch_size": 1, "epoch": 0.6832, "step": 1708, "tokens_per_device": 5151 }, { "epoch": 0.6832, "loss_ce": 0.02641329914331436, "loss_lvr": 0.8084379434585571, "loss_mode_switch": 0.0, "loss_total": 0.10725709795951843, "step": 1708 }, { "batch_size": 4, "epoch": 0.6832, "step": 1708, "tokens_per_device": 5784 }, { "epoch": 0.6832, "loss_ce": 0.040967632085084915, "loss_lvr": 0.7459082007408142, "loss_mode_switch": 0.0, "loss_total": 0.11555846035480499, "step": 1708 }, { "batch_size": 4, "epoch": 0.6832, "step": 1708, "tokens_per_device": 3460 }, { "epoch": 0.6832, "loss_ce": 0.09109736233949661, "loss_lvr": 0.7875319719314575, "loss_mode_switch": 0.0, "loss_total": 0.16985055804252625, "step": 1708 }, { "batch_size": 1, "epoch": 0.6832, "step": 1708, "tokens_per_device": 4879 }, { "epoch": 0.6832, "loss_ce": 0.011090882122516632, "loss_lvr": 0.47689393162727356, "loss_mode_switch": 0.0, "loss_total": 0.05878027528524399, "step": 1708 }, { "batch_size": 4, "epoch": 0.6832, "step": 1708, "tokens_per_device": 6788 }, { "epoch": 0.6832, "loss_ce": 0.3192107379436493, "loss_lvr": 0.8307623267173767, "loss_mode_switch": 0.0, "loss_total": 0.40228697657585144, "step": 1708 }, { "batch_size": 1, "epoch": 0.6832, "step": 1708, "tokens_per_device": 4853 }, { "epoch": 0.6832, "loss_ce": 0.0009683822281658649, "loss_lvr": 0.3477773368358612, "loss_mode_switch": 0.0, "loss_total": 0.03574611619114876, "step": 1708 }, { "epoch": 0.6836, "grad_norm": 1.456118106842041, "learning_rate": 2.4034038444330597e-06, "loss": 0.2784, "step": 1709 }, { "batch_size": 4, "epoch": 0.6836, "step": 1709, "tokens_per_device": 5300 }, { "epoch": 0.6836, "loss_ce": 0.19051003456115723, "loss_lvr": 0.8416165709495544, "loss_mode_switch": 0.0, "loss_total": 0.2746717035770416, "step": 1709 }, { "batch_size": 1, "epoch": 0.6836, "step": 1709, "tokens_per_device": 7385 }, { "epoch": 0.6836, "loss_ce": 0.0003959587775170803, "loss_lvr": 0.31448814272880554, "loss_mode_switch": 0.0, "loss_total": 0.03184477239847183, "step": 1709 }, { "batch_size": 4, "epoch": 0.6836, "step": 1709, "tokens_per_device": 3364 }, { "epoch": 0.6836, "loss_ce": 0.09198837727308273, "loss_lvr": 0.5375010371208191, "loss_mode_switch": 0.0, "loss_total": 0.14573848247528076, "step": 1709 }, { "batch_size": 4, "epoch": 0.6836, "step": 1709, "tokens_per_device": 6272 }, { "epoch": 0.6836, "loss_ce": 0.03254384547472, "loss_lvr": 0.9619250893592834, "loss_mode_switch": 0.0, "loss_total": 0.12873634696006775, "step": 1709 }, { "batch_size": 4, "epoch": 0.6836, "step": 1709, "tokens_per_device": 2768 }, { "epoch": 0.6836, "loss_ce": 0.3334276080131531, "loss_lvr": 0.6552178859710693, "loss_mode_switch": 0.0, "loss_total": 0.39894938468933105, "step": 1709 }, { "batch_size": 1, "epoch": 0.6836, "step": 1709, "tokens_per_device": 4741 }, { "epoch": 0.6836, "loss_ce": 0.06718304753303528, "loss_lvr": 0.4465716481208801, "loss_mode_switch": 0.0, "loss_total": 0.11184021830558777, "step": 1709 }, { "batch_size": 1, "epoch": 0.6836, "step": 1709, "tokens_per_device": 5138 }, { "epoch": 0.6836, "loss_ce": 0.0013146336423233151, "loss_lvr": 0.4391172230243683, "loss_mode_switch": 0.0, "loss_total": 0.04522635415196419, "step": 1709 }, { "batch_size": 4, "epoch": 0.6836, "step": 1709, "tokens_per_device": 4520 }, { "epoch": 0.6836, "loss_ce": 0.1427030712366104, "loss_lvr": 0.8422987461090088, "loss_mode_switch": 0.0, "loss_total": 0.22693294286727905, "step": 1709 }, { "epoch": 0.684, "grad_norm": 1.4683120250701904, "learning_rate": 2.3978704717959777e-06, "loss": 0.2856, "step": 1710 }, { "batch_size": 4, "epoch": 0.684, "step": 1710, "tokens_per_device": 1796 }, { "epoch": 0.684, "loss_ce": 0.3104884922504425, "loss_lvr": 0.8373309373855591, "loss_mode_switch": 0.0, "loss_total": 0.39422160387039185, "step": 1710 }, { "batch_size": 4, "epoch": 0.684, "step": 1710, "tokens_per_device": 6200 }, { "epoch": 0.684, "loss_ce": 0.09323803335428238, "loss_lvr": 0.888963520526886, "loss_mode_switch": 0.0, "loss_total": 0.18213438987731934, "step": 1710 }, { "batch_size": 1, "epoch": 0.684, "step": 1710, "tokens_per_device": 5179 }, { "epoch": 0.684, "loss_ce": 0.0021299519576132298, "loss_lvr": 0.4273645877838135, "loss_mode_switch": 0.0, "loss_total": 0.0448664128780365, "step": 1710 }, { "batch_size": 4, "epoch": 0.684, "step": 1710, "tokens_per_device": 14976 }, { "epoch": 0.684, "loss_ce": 0.06349773705005646, "loss_lvr": 0.7889431715011597, "loss_mode_switch": 0.0, "loss_total": 0.14239205420017242, "step": 1710 }, { "batch_size": 4, "epoch": 0.684, "step": 1710, "tokens_per_device": 6120 }, { "epoch": 0.684, "loss_ce": 0.1555090695619583, "loss_lvr": 0.7333633899688721, "loss_mode_switch": 0.0, "loss_total": 0.22884541749954224, "step": 1710 }, { "batch_size": 4, "epoch": 0.684, "step": 1710, "tokens_per_device": 2800 }, { "epoch": 0.684, "loss_ce": 0.35275009274482727, "loss_lvr": 0.7631688714027405, "loss_mode_switch": 0.0, "loss_total": 0.4290669858455658, "step": 1710 }, { "batch_size": 4, "epoch": 0.684, "step": 1710, "tokens_per_device": 5032 }, { "epoch": 0.684, "loss_ce": 0.22833342850208282, "loss_lvr": 0.8244530558586121, "loss_mode_switch": 0.0, "loss_total": 0.31077873706817627, "step": 1710 }, { "batch_size": 1, "epoch": 0.684, "step": 1710, "tokens_per_device": 5135 }, { "epoch": 0.684, "loss_ce": 0.11499598622322083, "loss_lvr": 0.35396885871887207, "loss_mode_switch": 0.0, "loss_total": 0.15039287507534027, "step": 1710 }, { "epoch": 0.6844, "grad_norm": 1.364238977432251, "learning_rate": 2.392341466379294e-06, "loss": 0.2834, "step": 1711 }, { "batch_size": 4, "epoch": 0.6844, "step": 1711, "tokens_per_device": 6920 }, { "epoch": 0.6844, "loss_ce": 0.1523863673210144, "loss_lvr": 0.5193084478378296, "loss_mode_switch": 0.0, "loss_total": 0.20431721210479736, "step": 1711 }, { "batch_size": 4, "epoch": 0.6844, "step": 1711, "tokens_per_device": 3804 }, { "epoch": 0.6844, "loss_ce": 0.22869758307933807, "loss_lvr": 0.9281401634216309, "loss_mode_switch": 0.0, "loss_total": 0.3215115964412689, "step": 1711 }, { "batch_size": 4, "epoch": 0.6844, "step": 1711, "tokens_per_device": 2620 }, { "epoch": 0.6844, "loss_ce": 0.004749948624521494, "loss_lvr": 0.8722324371337891, "loss_mode_switch": 0.0, "loss_total": 0.0919731929898262, "step": 1711 }, { "batch_size": 1, "epoch": 0.6844, "step": 1711, "tokens_per_device": 4899 }, { "epoch": 0.6844, "loss_ce": 0.011230857111513615, "loss_lvr": 0.298229843378067, "loss_mode_switch": 0.0, "loss_total": 0.04105384275317192, "step": 1711 }, { "batch_size": 4, "epoch": 0.6844, "step": 1711, "tokens_per_device": 9668 }, { "epoch": 0.6844, "loss_ce": 0.34184741973876953, "loss_lvr": 0.818952739238739, "loss_mode_switch": 0.0, "loss_total": 0.42374271154403687, "step": 1711 }, { "batch_size": 4, "epoch": 0.6844, "step": 1711, "tokens_per_device": 8572 }, { "epoch": 0.6844, "loss_ce": 0.17510823905467987, "loss_lvr": 1.0082992315292358, "loss_mode_switch": 0.0, "loss_total": 0.27593815326690674, "step": 1711 }, { "batch_size": 4, "epoch": 0.6844, "step": 1711, "tokens_per_device": 10288 }, { "epoch": 0.6844, "loss_ce": 0.07181007415056229, "loss_lvr": 0.781094491481781, "loss_mode_switch": 0.0, "loss_total": 0.1499195247888565, "step": 1711 }, { "batch_size": 1, "epoch": 0.6844, "step": 1711, "tokens_per_device": 5116 }, { "epoch": 0.6844, "loss_ce": 0.04240494221448898, "loss_lvr": 0.5185495615005493, "loss_mode_switch": 0.0, "loss_total": 0.09425990283489227, "step": 1711 }, { "epoch": 0.6848, "grad_norm": 1.2710442543029785, "learning_rate": 2.38681683746248e-06, "loss": 0.2761, "step": 1712 }, { "batch_size": 4, "epoch": 0.6848, "step": 1712, "tokens_per_device": 2320 }, { "epoch": 0.6848, "loss_ce": 0.15669982135295868, "loss_lvr": 0.7224286198616028, "loss_mode_switch": 0.0, "loss_total": 0.22894269227981567, "step": 1712 }, { "batch_size": 4, "epoch": 0.6848, "step": 1712, "tokens_per_device": 2656 }, { "epoch": 0.6848, "loss_ce": 0.27781832218170166, "loss_lvr": 0.6276520490646362, "loss_mode_switch": 0.0, "loss_total": 0.34058353304862976, "step": 1712 }, { "batch_size": 4, "epoch": 0.6848, "step": 1712, "tokens_per_device": 1468 }, { "epoch": 0.6848, "loss_ce": 0.4143417775630951, "loss_lvr": 0.9015397429466248, "loss_mode_switch": 0.0, "loss_total": 0.5044957399368286, "step": 1712 }, { "batch_size": 1, "epoch": 0.6848, "step": 1712, "tokens_per_device": 5006 }, { "epoch": 0.6848, "loss_ce": 0.07204200327396393, "loss_lvr": 0.4676821231842041, "loss_mode_switch": 0.0, "loss_total": 0.11881022155284882, "step": 1712 }, { "batch_size": 4, "epoch": 0.6848, "step": 1712, "tokens_per_device": 2636 }, { "epoch": 0.6848, "loss_ce": 0.5880061984062195, "loss_lvr": 0.8152527809143066, "loss_mode_switch": 0.0, "loss_total": 0.6695314645767212, "step": 1712 }, { "batch_size": 1, "epoch": 0.6848, "step": 1712, "tokens_per_device": 5129 }, { "epoch": 0.6848, "loss_ce": 0.011987665668129921, "loss_lvr": 0.17073412239551544, "loss_mode_switch": 0.0, "loss_total": 0.029061079025268555, "step": 1712 }, { "batch_size": 4, "epoch": 0.6848, "step": 1712, "tokens_per_device": 7000 }, { "epoch": 0.6848, "loss_ce": 0.08409188687801361, "loss_lvr": 0.80745530128479, "loss_mode_switch": 0.0, "loss_total": 0.16483741998672485, "step": 1712 }, { "batch_size": 4, "epoch": 0.6848, "step": 1712, "tokens_per_device": 12716 }, { "epoch": 0.6848, "loss_ce": 0.3890343904495239, "loss_lvr": 0.5905646681785583, "loss_mode_switch": 0.0, "loss_total": 0.4480908513069153, "step": 1712 }, { "epoch": 0.6852, "grad_norm": 1.3492307662963867, "learning_rate": 2.3812965943176608e-06, "loss": 0.2914, "step": 1713 }, { "batch_size": 4, "epoch": 0.6852, "step": 1713, "tokens_per_device": 5332 }, { "epoch": 0.6852, "loss_ce": 0.173990398645401, "loss_lvr": 0.7200812697410583, "loss_mode_switch": 0.0, "loss_total": 0.2459985315799713, "step": 1713 }, { "batch_size": 4, "epoch": 0.6852, "step": 1713, "tokens_per_device": 4552 }, { "epoch": 0.6852, "loss_ce": 0.11035338789224625, "loss_lvr": 0.732189953327179, "loss_mode_switch": 0.0, "loss_total": 0.18357238173484802, "step": 1713 }, { "batch_size": 1, "epoch": 0.6852, "step": 1713, "tokens_per_device": 4904 }, { "epoch": 0.6852, "loss_ce": 0.05106841400265694, "loss_lvr": 0.5449174046516418, "loss_mode_switch": 0.0, "loss_total": 0.10556015372276306, "step": 1713 }, { "batch_size": 4, "epoch": 0.6852, "step": 1713, "tokens_per_device": 7508 }, { "epoch": 0.6852, "loss_ce": 0.09620759636163712, "loss_lvr": 0.30117297172546387, "loss_mode_switch": 0.0, "loss_total": 0.12632489204406738, "step": 1713 }, { "batch_size": 4, "epoch": 0.6852, "step": 1713, "tokens_per_device": 5776 }, { "epoch": 0.6852, "loss_ce": 0.07197350263595581, "loss_lvr": 1.0147422552108765, "loss_mode_switch": 0.0, "loss_total": 0.17344772815704346, "step": 1713 }, { "batch_size": 1, "epoch": 0.6852, "step": 1713, "tokens_per_device": 5217 }, { "epoch": 0.6852, "loss_ce": 0.22636574506759644, "loss_lvr": 0.6350475549697876, "loss_mode_switch": 0.0, "loss_total": 0.2898705005645752, "step": 1713 }, { "batch_size": 4, "epoch": 0.6852, "step": 1713, "tokens_per_device": 4372 }, { "epoch": 0.6852, "loss_ce": 0.45639047026634216, "loss_lvr": 1.0840667486190796, "loss_mode_switch": 0.0, "loss_total": 0.5647971630096436, "step": 1713 }, { "batch_size": 4, "epoch": 0.6852, "step": 1713, "tokens_per_device": 3348 }, { "epoch": 0.6852, "loss_ce": 0.4118993878364563, "loss_lvr": 0.9223979115486145, "loss_mode_switch": 0.0, "loss_total": 0.5041391849517822, "step": 1713 }, { "epoch": 0.6856, "grad_norm": 1.3173333406448364, "learning_rate": 2.3757807462096013e-06, "loss": 0.2821, "step": 1714 }, { "batch_size": 4, "epoch": 0.6856, "step": 1714, "tokens_per_device": 4316 }, { "epoch": 0.6856, "loss_ce": 0.5014795064926147, "loss_lvr": 0.8797648549079895, "loss_mode_switch": 0.0, "loss_total": 0.5894559621810913, "step": 1714 }, { "batch_size": 4, "epoch": 0.6856, "step": 1714, "tokens_per_device": 4236 }, { "epoch": 0.6856, "loss_ce": 0.42773371934890747, "loss_lvr": 0.9498823881149292, "loss_mode_switch": 0.0, "loss_total": 0.5227219462394714, "step": 1714 }, { "batch_size": 4, "epoch": 0.6856, "step": 1714, "tokens_per_device": 6932 }, { "epoch": 0.6856, "loss_ce": 0.1377294957637787, "loss_lvr": 0.6858720183372498, "loss_mode_switch": 0.0, "loss_total": 0.20631670951843262, "step": 1714 }, { "batch_size": 1, "epoch": 0.6856, "step": 1714, "tokens_per_device": 4172 }, { "epoch": 0.6856, "loss_ce": 0.010364913381636143, "loss_lvr": 0.4468289911746979, "loss_mode_switch": 0.0, "loss_total": 0.055047813802957535, "step": 1714 }, { "batch_size": 1, "epoch": 0.6856, "step": 1714, "tokens_per_device": 5169 }, { "epoch": 0.6856, "loss_ce": 0.01034572347998619, "loss_lvr": 0.46596759557724, "loss_mode_switch": 0.0, "loss_total": 0.05694248527288437, "step": 1714 }, { "batch_size": 4, "epoch": 0.6856, "step": 1714, "tokens_per_device": 4244 }, { "epoch": 0.6856, "loss_ce": 0.3661429286003113, "loss_lvr": 0.9525538086891174, "loss_mode_switch": 0.0, "loss_total": 0.46139830350875854, "step": 1714 }, { "batch_size": 4, "epoch": 0.6856, "step": 1714, "tokens_per_device": 3888 }, { "epoch": 0.6856, "loss_ce": 0.3119290769100189, "loss_lvr": 0.9471316337585449, "loss_mode_switch": 0.0, "loss_total": 0.40664225816726685, "step": 1714 }, { "batch_size": 4, "epoch": 0.6856, "step": 1714, "tokens_per_device": 2612 }, { "epoch": 0.6856, "loss_ce": 0.6285143494606018, "loss_lvr": 0.842889666557312, "loss_mode_switch": 0.0, "loss_total": 0.712803304195404, "step": 1714 }, { "epoch": 0.686, "grad_norm": 1.6292473077774048, "learning_rate": 2.3702693023956853e-06, "loss": 0.3333, "step": 1715 }, { "batch_size": 1, "epoch": 0.686, "step": 1715, "tokens_per_device": 4842 }, { "epoch": 0.686, "loss_ce": 0.023882851004600525, "loss_lvr": 0.7251247763633728, "loss_mode_switch": 0.0, "loss_total": 0.0963953286409378, "step": 1715 }, { "batch_size": 4, "epoch": 0.686, "step": 1715, "tokens_per_device": 5772 }, { "epoch": 0.686, "loss_ce": 0.23265209794044495, "loss_lvr": 0.8171824216842651, "loss_mode_switch": 0.0, "loss_total": 0.314370334148407, "step": 1715 }, { "batch_size": 1, "epoch": 0.686, "step": 1715, "tokens_per_device": 5141 }, { "epoch": 0.686, "loss_ce": 0.0013066802639514208, "loss_lvr": 0.24506624042987823, "loss_mode_switch": 0.0, "loss_total": 0.025813305750489235, "step": 1715 }, { "batch_size": 4, "epoch": 0.686, "step": 1715, "tokens_per_device": 1308 }, { "epoch": 0.686, "loss_ce": 0.5131213665008545, "loss_lvr": 0.9371705055236816, "loss_mode_switch": 0.0, "loss_total": 0.6068384051322937, "step": 1715 }, { "batch_size": 1, "epoch": 0.686, "step": 1715, "tokens_per_device": 4889 }, { "epoch": 0.686, "loss_ce": 0.423737496137619, "loss_lvr": 0.3058082163333893, "loss_mode_switch": 0.0, "loss_total": 0.4543183147907257, "step": 1715 }, { "batch_size": 1, "epoch": 0.686, "step": 1715, "tokens_per_device": 5083 }, { "epoch": 0.686, "loss_ce": 0.00018091598758473992, "loss_lvr": 0.22522631287574768, "loss_mode_switch": 0.0, "loss_total": 0.02270354889333248, "step": 1715 }, { "batch_size": 4, "epoch": 0.686, "step": 1715, "tokens_per_device": 5412 }, { "epoch": 0.686, "loss_ce": 0.12982045114040375, "loss_lvr": 0.7668327689170837, "loss_mode_switch": 0.0, "loss_total": 0.2065037190914154, "step": 1715 }, { "batch_size": 4, "epoch": 0.686, "step": 1715, "tokens_per_device": 1344 }, { "epoch": 0.686, "loss_ce": 0.695311963558197, "loss_lvr": 1.0976454019546509, "loss_mode_switch": 0.0, "loss_total": 0.8050764799118042, "step": 1715 }, { "epoch": 0.6864, "grad_norm": 1.440255880355835, "learning_rate": 2.3647622721259184e-06, "loss": 0.2993, "step": 1716 }, { "batch_size": 1, "epoch": 0.6864, "step": 1716, "tokens_per_device": 5670 }, { "epoch": 0.6864, "loss_ce": 0.029168443754315376, "loss_lvr": 0.3122761845588684, "loss_mode_switch": 0.0, "loss_total": 0.06039606034755707, "step": 1716 }, { "batch_size": 4, "epoch": 0.6864, "step": 1716, "tokens_per_device": 8096 }, { "epoch": 0.6864, "loss_ce": 0.0295171607285738, "loss_lvr": 0.7397594451904297, "loss_mode_switch": 0.0, "loss_total": 0.1034931018948555, "step": 1716 }, { "batch_size": 4, "epoch": 0.6864, "step": 1716, "tokens_per_device": 16272 }, { "epoch": 0.6864, "loss_ce": 0.2052854597568512, "loss_lvr": 0.7088987827301025, "loss_mode_switch": 0.0, "loss_total": 0.2761753499507904, "step": 1716 }, { "batch_size": 4, "epoch": 0.6864, "step": 1716, "tokens_per_device": 1712 }, { "epoch": 0.6864, "loss_ce": 0.22822022438049316, "loss_lvr": 0.7640679478645325, "loss_mode_switch": 0.0, "loss_total": 0.30462703108787537, "step": 1716 }, { "batch_size": 4, "epoch": 0.6864, "step": 1716, "tokens_per_device": 14796 }, { "epoch": 0.6864, "loss_ce": 0.18824730813503265, "loss_lvr": 0.8959985375404358, "loss_mode_switch": 0.0, "loss_total": 0.27784717082977295, "step": 1716 }, { "batch_size": 4, "epoch": 0.6864, "step": 1716, "tokens_per_device": 8412 }, { "epoch": 0.6864, "loss_ce": 0.3514817953109741, "loss_lvr": 0.7317186594009399, "loss_mode_switch": 0.0, "loss_total": 0.42465364933013916, "step": 1716 }, { "batch_size": 4, "epoch": 0.6864, "step": 1716, "tokens_per_device": 4924 }, { "epoch": 0.6864, "loss_ce": 0.6542655229568481, "loss_lvr": 0.8111774921417236, "loss_mode_switch": 0.0, "loss_total": 0.7353832721710205, "step": 1716 }, { "batch_size": 4, "epoch": 0.6864, "step": 1716, "tokens_per_device": 4556 }, { "epoch": 0.6864, "loss_ce": 0.27640777826309204, "loss_lvr": 0.8876364827156067, "loss_mode_switch": 0.0, "loss_total": 0.3651714324951172, "step": 1716 }, { "epoch": 0.6868, "grad_norm": 1.5668869018554688, "learning_rate": 2.3592596646428855e-06, "loss": 0.3401, "step": 1717 }, { "batch_size": 4, "epoch": 0.6868, "step": 1717, "tokens_per_device": 4364 }, { "epoch": 0.6868, "loss_ce": 0.5075463056564331, "loss_lvr": 0.8132190108299255, "loss_mode_switch": 0.0, "loss_total": 0.5888682007789612, "step": 1717 }, { "batch_size": 1, "epoch": 0.6868, "step": 1717, "tokens_per_device": 5118 }, { "epoch": 0.6868, "loss_ce": 0.007632496301084757, "loss_lvr": 0.46121731400489807, "loss_mode_switch": 0.0, "loss_total": 0.053754229098558426, "step": 1717 }, { "batch_size": 4, "epoch": 0.6868, "step": 1717, "tokens_per_device": 6512 }, { "epoch": 0.6868, "loss_ce": 0.07577838003635406, "loss_lvr": 0.8214197754859924, "loss_mode_switch": 0.0, "loss_total": 0.15792036056518555, "step": 1717 }, { "batch_size": 4, "epoch": 0.6868, "step": 1717, "tokens_per_device": 3484 }, { "epoch": 0.6868, "loss_ce": 0.4393085241317749, "loss_lvr": 0.9308951497077942, "loss_mode_switch": 0.0, "loss_total": 0.5323980450630188, "step": 1717 }, { "batch_size": 4, "epoch": 0.6868, "step": 1717, "tokens_per_device": 5792 }, { "epoch": 0.6868, "loss_ce": 0.3496786057949066, "loss_lvr": 0.9434688687324524, "loss_mode_switch": 0.0, "loss_total": 0.4440254867076874, "step": 1717 }, { "batch_size": 1, "epoch": 0.6868, "step": 1717, "tokens_per_device": 6186 }, { "epoch": 0.6868, "loss_ce": 0.08036213368177414, "loss_lvr": 0.2717498540878296, "loss_mode_switch": 0.0, "loss_total": 0.10753712058067322, "step": 1717 }, { "batch_size": 4, "epoch": 0.6868, "step": 1717, "tokens_per_device": 3728 }, { "epoch": 0.6868, "loss_ce": 0.2039727419614792, "loss_lvr": 1.7676185369491577, "loss_mode_switch": 0.0, "loss_total": 0.3807345926761627, "step": 1717 }, { "batch_size": 4, "epoch": 0.6868, "step": 1717, "tokens_per_device": 4316 }, { "epoch": 0.6868, "loss_ce": 0.18689577281475067, "loss_lvr": 0.669704258441925, "loss_mode_switch": 0.0, "loss_total": 0.25386619567871094, "step": 1717 }, { "epoch": 0.6872, "grad_norm": 1.2381407022476196, "learning_rate": 2.353761489181754e-06, "loss": 0.2563, "step": 1718 }, { "batch_size": 4, "epoch": 0.6872, "step": 1718, "tokens_per_device": 5076 }, { "epoch": 0.6872, "loss_ce": 0.13978640735149384, "loss_lvr": 0.7744890451431274, "loss_mode_switch": 0.0, "loss_total": 0.21723531186580658, "step": 1718 }, { "batch_size": 4, "epoch": 0.6872, "step": 1718, "tokens_per_device": 4876 }, { "epoch": 0.6872, "loss_ce": 0.2263929396867752, "loss_lvr": 0.7512481212615967, "loss_mode_switch": 0.0, "loss_total": 0.3015177547931671, "step": 1718 }, { "batch_size": 4, "epoch": 0.6872, "step": 1718, "tokens_per_device": 3888 }, { "epoch": 0.6872, "loss_ce": 0.4791790246963501, "loss_lvr": 1.0076406002044678, "loss_mode_switch": 0.0, "loss_total": 0.579943060874939, "step": 1718 }, { "batch_size": 4, "epoch": 0.6872, "step": 1718, "tokens_per_device": 5204 }, { "epoch": 0.6872, "loss_ce": 0.34652605652809143, "loss_lvr": 1.0377928018569946, "loss_mode_switch": 0.0, "loss_total": 0.45030534267425537, "step": 1718 }, { "batch_size": 1, "epoch": 0.6872, "step": 1718, "tokens_per_device": 4739 }, { "epoch": 0.6872, "loss_ce": 0.0005271817790344357, "loss_lvr": 0.3930421471595764, "loss_mode_switch": 0.0, "loss_total": 0.03983139619231224, "step": 1718 }, { "batch_size": 1, "epoch": 0.6872, "step": 1718, "tokens_per_device": 5097 }, { "epoch": 0.6872, "loss_ce": 0.002861869288608432, "loss_lvr": 0.2523432970046997, "loss_mode_switch": 0.0, "loss_total": 0.02809619903564453, "step": 1718 }, { "batch_size": 4, "epoch": 0.6872, "step": 1718, "tokens_per_device": 5372 }, { "epoch": 0.6872, "loss_ce": 0.06796399503946304, "loss_lvr": 0.8627090454101562, "loss_mode_switch": 0.0, "loss_total": 0.1542349010705948, "step": 1718 }, { "batch_size": 4, "epoch": 0.6872, "step": 1718, "tokens_per_device": 6228 }, { "epoch": 0.6872, "loss_ce": 0.2765672206878662, "loss_lvr": 0.7530906796455383, "loss_mode_switch": 0.0, "loss_total": 0.35187628865242004, "step": 1718 }, { "epoch": 0.6876, "grad_norm": 1.4024235010147095, "learning_rate": 2.3482677549702493e-06, "loss": 0.3025, "step": 1719 }, { "batch_size": 1, "epoch": 0.6876, "step": 1719, "tokens_per_device": 5119 }, { "epoch": 0.6876, "loss_ce": 0.13147547841072083, "loss_lvr": 0.32802248001098633, "loss_mode_switch": 0.0, "loss_total": 0.16427773237228394, "step": 1719 }, { "batch_size": 4, "epoch": 0.6876, "step": 1719, "tokens_per_device": 1504 }, { "epoch": 0.6876, "loss_ce": 0.4921667277812958, "loss_lvr": 1.074642300605774, "loss_mode_switch": 0.0, "loss_total": 0.5996309518814087, "step": 1719 }, { "batch_size": 1, "epoch": 0.6876, "step": 1719, "tokens_per_device": 4382 }, { "epoch": 0.6876, "loss_ce": 0.06119474023580551, "loss_lvr": 0.2675250768661499, "loss_mode_switch": 0.0, "loss_total": 0.08794724941253662, "step": 1719 }, { "batch_size": 4, "epoch": 0.6876, "step": 1719, "tokens_per_device": 2740 }, { "epoch": 0.6876, "loss_ce": 0.2695213854312897, "loss_lvr": 0.7805855870246887, "loss_mode_switch": 0.0, "loss_total": 0.3475799560546875, "step": 1719 }, { "batch_size": 4, "epoch": 0.6876, "step": 1719, "tokens_per_device": 5300 }, { "epoch": 0.6876, "loss_ce": 0.4395270049571991, "loss_lvr": 0.6998350024223328, "loss_mode_switch": 0.0, "loss_total": 0.5095105171203613, "step": 1719 }, { "batch_size": 1, "epoch": 0.6876, "step": 1719, "tokens_per_device": 4890 }, { "epoch": 0.6876, "loss_ce": 0.009979753755033016, "loss_lvr": 0.4524601697921753, "loss_mode_switch": 0.0, "loss_total": 0.05522577092051506, "step": 1719 }, { "batch_size": 4, "epoch": 0.6876, "step": 1719, "tokens_per_device": 1252 }, { "epoch": 0.6876, "loss_ce": 0.3223239481449127, "loss_lvr": 0.8573018312454224, "loss_mode_switch": 0.0, "loss_total": 0.4080541431903839, "step": 1719 }, { "batch_size": 4, "epoch": 0.6876, "step": 1719, "tokens_per_device": 4836 }, { "epoch": 0.6876, "loss_ce": 0.133114755153656, "loss_lvr": 0.6382433772087097, "loss_mode_switch": 0.0, "loss_total": 0.19693909585475922, "step": 1719 }, { "epoch": 0.688, "grad_norm": 1.1470286846160889, "learning_rate": 2.342778471228648e-06, "loss": 0.2469, "step": 1720 }, { "batch_size": 1, "epoch": 0.688, "step": 1720, "tokens_per_device": 5623 }, { "epoch": 0.688, "loss_ce": 0.08536077290773392, "loss_lvr": 0.2438237965106964, "loss_mode_switch": 0.0, "loss_total": 0.1097431555390358, "step": 1720 }, { "batch_size": 1, "epoch": 0.688, "step": 1720, "tokens_per_device": 5164 }, { "epoch": 0.688, "loss_ce": 0.014012248255312443, "loss_lvr": 0.3554023504257202, "loss_mode_switch": 0.0, "loss_total": 0.04955248162150383, "step": 1720 }, { "batch_size": 4, "epoch": 0.688, "step": 1720, "tokens_per_device": 1644 }, { "epoch": 0.688, "loss_ce": 0.05573117733001709, "loss_lvr": 1.0429792404174805, "loss_mode_switch": 0.0, "loss_total": 0.1600291132926941, "step": 1720 }, { "batch_size": 1, "epoch": 0.688, "step": 1720, "tokens_per_device": 5105 }, { "epoch": 0.688, "loss_ce": 0.006025827489793301, "loss_lvr": 0.5698603987693787, "loss_mode_switch": 0.0, "loss_total": 0.06301186978816986, "step": 1720 }, { "batch_size": 1, "epoch": 0.688, "step": 1720, "tokens_per_device": 4709 }, { "epoch": 0.688, "loss_ce": 0.10947885364294052, "loss_lvr": 0.5113914608955383, "loss_mode_switch": 0.0, "loss_total": 0.16061800718307495, "step": 1720 }, { "batch_size": 4, "epoch": 0.688, "step": 1720, "tokens_per_device": 1556 }, { "epoch": 0.688, "loss_ce": 0.5063816905021667, "loss_lvr": 1.0282500982284546, "loss_mode_switch": 0.0, "loss_total": 0.6092066764831543, "step": 1720 }, { "batch_size": 1, "epoch": 0.688, "step": 1720, "tokens_per_device": 4950 }, { "epoch": 0.688, "loss_ce": 0.02318131923675537, "loss_lvr": 0.33117204904556274, "loss_mode_switch": 0.0, "loss_total": 0.056298524141311646, "step": 1720 }, { "batch_size": 4, "epoch": 0.688, "step": 1720, "tokens_per_device": 4204 }, { "epoch": 0.688, "loss_ce": 0.051770661026239395, "loss_lvr": 1.045649528503418, "loss_mode_switch": 0.0, "loss_total": 0.15633562207221985, "step": 1720 }, { "epoch": 0.6884, "grad_norm": 1.4959474802017212, "learning_rate": 2.3372936471697564e-06, "loss": 0.3016, "step": 1721 }, { "batch_size": 4, "epoch": 0.6884, "step": 1721, "tokens_per_device": 5248 }, { "epoch": 0.6884, "loss_ce": 0.030264252796769142, "loss_lvr": 0.8640494346618652, "loss_mode_switch": 0.0, "loss_total": 0.1166691929101944, "step": 1721 }, { "batch_size": 4, "epoch": 0.6884, "step": 1721, "tokens_per_device": 1988 }, { "epoch": 0.6884, "loss_ce": 0.5415360331535339, "loss_lvr": 1.0328240394592285, "loss_mode_switch": 0.0, "loss_total": 0.6448184251785278, "step": 1721 }, { "batch_size": 1, "epoch": 0.6884, "step": 1721, "tokens_per_device": 4897 }, { "epoch": 0.6884, "loss_ce": 0.0007926774560473859, "loss_lvr": 0.2689334750175476, "loss_mode_switch": 0.0, "loss_total": 0.027686024084687233, "step": 1721 }, { "batch_size": 4, "epoch": 0.6884, "step": 1721, "tokens_per_device": 4344 }, { "epoch": 0.6884, "loss_ce": 0.071555957198143, "loss_lvr": 0.9607236385345459, "loss_mode_switch": 0.0, "loss_total": 0.16762831807136536, "step": 1721 }, { "batch_size": 4, "epoch": 0.6884, "step": 1721, "tokens_per_device": 12460 }, { "epoch": 0.6884, "loss_ce": 0.4725971817970276, "loss_lvr": 0.7479856014251709, "loss_mode_switch": 0.0, "loss_total": 0.5473957657814026, "step": 1721 }, { "batch_size": 4, "epoch": 0.6884, "step": 1721, "tokens_per_device": 3896 }, { "epoch": 0.6884, "loss_ce": 0.43874359130859375, "loss_lvr": 0.9230010509490967, "loss_mode_switch": 0.0, "loss_total": 0.5310437083244324, "step": 1721 }, { "batch_size": 4, "epoch": 0.6884, "step": 1721, "tokens_per_device": 1572 }, { "epoch": 0.6884, "loss_ce": 0.5803829431533813, "loss_lvr": 0.8375213742256165, "loss_mode_switch": 0.0, "loss_total": 0.6641350984573364, "step": 1721 }, { "batch_size": 1, "epoch": 0.6884, "step": 1721, "tokens_per_device": 4914 }, { "epoch": 0.6884, "loss_ce": 0.0710756704211235, "loss_lvr": 0.6958231329917908, "loss_mode_switch": 0.0, "loss_total": 0.14065799117088318, "step": 1721 }, { "epoch": 0.6888, "grad_norm": 1.217930793762207, "learning_rate": 2.3318132919988944e-06, "loss": 0.282, "step": 1722 }, { "batch_size": 4, "epoch": 0.6888, "step": 1722, "tokens_per_device": 4584 }, { "epoch": 0.6888, "loss_ce": 0.42764613032341003, "loss_lvr": 1.0263776779174805, "loss_mode_switch": 0.0, "loss_total": 0.5302839279174805, "step": 1722 }, { "batch_size": 4, "epoch": 0.6888, "step": 1722, "tokens_per_device": 3852 }, { "epoch": 0.6888, "loss_ce": 0.6088135242462158, "loss_lvr": 0.861768901348114, "loss_mode_switch": 0.0, "loss_total": 0.6949903964996338, "step": 1722 }, { "batch_size": 1, "epoch": 0.6888, "step": 1722, "tokens_per_device": 4947 }, { "epoch": 0.6888, "loss_ce": 0.2613584101200104, "loss_lvr": 0.19386360049247742, "loss_mode_switch": 0.0, "loss_total": 0.2807447612285614, "step": 1722 }, { "batch_size": 1, "epoch": 0.6888, "step": 1722, "tokens_per_device": 4856 }, { "epoch": 0.6888, "loss_ce": 0.2517877221107483, "loss_lvr": 0.4202912449836731, "loss_mode_switch": 0.0, "loss_total": 0.29381683468818665, "step": 1722 }, { "batch_size": 4, "epoch": 0.6888, "step": 1722, "tokens_per_device": 1604 }, { "epoch": 0.6888, "loss_ce": 0.03895632550120354, "loss_lvr": 0.854127824306488, "loss_mode_switch": 0.0, "loss_total": 0.12436911463737488, "step": 1722 }, { "batch_size": 4, "epoch": 0.6888, "step": 1722, "tokens_per_device": 4220 }, { "epoch": 0.6888, "loss_ce": 0.11161204427480698, "loss_lvr": 1.0567985773086548, "loss_mode_switch": 0.0, "loss_total": 0.2172919064760208, "step": 1722 }, { "batch_size": 4, "epoch": 0.6888, "step": 1722, "tokens_per_device": 2724 }, { "epoch": 0.6888, "loss_ce": 0.3147578239440918, "loss_lvr": 0.855255663394928, "loss_mode_switch": 0.0, "loss_total": 0.4002833962440491, "step": 1722 }, { "batch_size": 4, "epoch": 0.6888, "step": 1722, "tokens_per_device": 1380 }, { "epoch": 0.6888, "loss_ce": 0.44438713788986206, "loss_lvr": 0.9067714810371399, "loss_mode_switch": 0.0, "loss_total": 0.5350642800331116, "step": 1722 }, { "epoch": 0.6892, "grad_norm": 1.4093681573867798, "learning_rate": 2.326337414913881e-06, "loss": 0.3018, "step": 1723 }, { "batch_size": 1, "epoch": 0.6892, "step": 1723, "tokens_per_device": 5487 }, { "epoch": 0.6892, "loss_ce": 0.05100977048277855, "loss_lvr": 0.3191035985946655, "loss_mode_switch": 0.0, "loss_total": 0.0829201340675354, "step": 1723 }, { "batch_size": 4, "epoch": 0.6892, "step": 1723, "tokens_per_device": 1236 }, { "epoch": 0.6892, "loss_ce": 0.11120270192623138, "loss_lvr": 1.0095155239105225, "loss_mode_switch": 0.0, "loss_total": 0.21215425431728363, "step": 1723 }, { "batch_size": 4, "epoch": 0.6892, "step": 1723, "tokens_per_device": 5688 }, { "epoch": 0.6892, "loss_ce": 0.11985574662685394, "loss_lvr": 0.5970702767372131, "loss_mode_switch": 0.0, "loss_total": 0.1795627772808075, "step": 1723 }, { "batch_size": 1, "epoch": 0.6892, "step": 1723, "tokens_per_device": 5052 }, { "epoch": 0.6892, "loss_ce": 0.004846458788961172, "loss_lvr": 0.29266980290412903, "loss_mode_switch": 0.0, "loss_total": 0.03411344066262245, "step": 1723 }, { "batch_size": 4, "epoch": 0.6892, "step": 1723, "tokens_per_device": 4708 }, { "epoch": 0.6892, "loss_ce": 0.021845897659659386, "loss_lvr": 0.8209699988365173, "loss_mode_switch": 0.0, "loss_total": 0.10394290089607239, "step": 1723 }, { "batch_size": 4, "epoch": 0.6892, "step": 1723, "tokens_per_device": 5164 }, { "epoch": 0.6892, "loss_ce": 0.08858337253332138, "loss_lvr": 0.6775228381156921, "loss_mode_switch": 0.0, "loss_total": 0.15633565187454224, "step": 1723 }, { "batch_size": 4, "epoch": 0.6892, "step": 1723, "tokens_per_device": 4256 }, { "epoch": 0.6892, "loss_ce": 0.33455193042755127, "loss_lvr": 0.78631192445755, "loss_mode_switch": 0.0, "loss_total": 0.4131831228733063, "step": 1723 }, { "batch_size": 4, "epoch": 0.6892, "step": 1723, "tokens_per_device": 4264 }, { "epoch": 0.6892, "loss_ce": 0.052044160664081573, "loss_lvr": 0.8190985321998596, "loss_mode_switch": 0.0, "loss_total": 0.1339540183544159, "step": 1723 }, { "epoch": 0.6896, "grad_norm": 1.2293552160263062, "learning_rate": 2.320866025105016e-06, "loss": 0.2743, "step": 1724 }, { "batch_size": 4, "epoch": 0.6896, "step": 1724, "tokens_per_device": 7828 }, { "epoch": 0.6896, "loss_ce": 0.2841854989528656, "loss_lvr": 0.9739087820053101, "loss_mode_switch": 0.0, "loss_total": 0.38157638907432556, "step": 1724 }, { "batch_size": 4, "epoch": 0.6896, "step": 1724, "tokens_per_device": 3932 }, { "epoch": 0.6896, "loss_ce": 0.11131554841995239, "loss_lvr": 0.9747698307037354, "loss_mode_switch": 0.0, "loss_total": 0.2087925374507904, "step": 1724 }, { "batch_size": 1, "epoch": 0.6896, "step": 1724, "tokens_per_device": 4679 }, { "epoch": 0.6896, "loss_ce": 0.0037430753000080585, "loss_lvr": 0.3509688079357147, "loss_mode_switch": 0.0, "loss_total": 0.038839954882860184, "step": 1724 }, { "batch_size": 1, "epoch": 0.6896, "step": 1724, "tokens_per_device": 5122 }, { "epoch": 0.6896, "loss_ce": 0.2783026397228241, "loss_lvr": 0.3335840404033661, "loss_mode_switch": 0.0, "loss_total": 0.311661034822464, "step": 1724 }, { "batch_size": 4, "epoch": 0.6896, "step": 1724, "tokens_per_device": 4488 }, { "epoch": 0.6896, "loss_ce": 0.19241347908973694, "loss_lvr": 0.9513574242591858, "loss_mode_switch": 0.0, "loss_total": 0.28754922747612, "step": 1724 }, { "batch_size": 1, "epoch": 0.6896, "step": 1724, "tokens_per_device": 5115 }, { "epoch": 0.6896, "loss_ce": 0.027730343863368034, "loss_lvr": 0.8665809035301208, "loss_mode_switch": 0.0, "loss_total": 0.11438843607902527, "step": 1724 }, { "batch_size": 4, "epoch": 0.6896, "step": 1724, "tokens_per_device": 12596 }, { "epoch": 0.6896, "loss_ce": 0.08360611647367477, "loss_lvr": 0.7382659912109375, "loss_mode_switch": 0.0, "loss_total": 0.15743272006511688, "step": 1724 }, { "batch_size": 4, "epoch": 0.6896, "step": 1724, "tokens_per_device": 11068 }, { "epoch": 0.6896, "loss_ce": 0.2042609304189682, "loss_lvr": 0.8949574828147888, "loss_mode_switch": 0.0, "loss_total": 0.2937566637992859, "step": 1724 }, { "epoch": 0.69, "grad_norm": 1.598006010055542, "learning_rate": 2.315399131755081e-06, "loss": 0.3105, "step": 1725 }, { "batch_size": 1, "epoch": 0.69, "step": 1725, "tokens_per_device": 5036 }, { "epoch": 0.69, "loss_ce": 0.0319790281355381, "loss_lvr": 0.5358645915985107, "loss_mode_switch": 0.0, "loss_total": 0.0855654925107956, "step": 1725 }, { "batch_size": 4, "epoch": 0.69, "step": 1725, "tokens_per_device": 1816 }, { "epoch": 0.69, "loss_ce": 0.5339535474777222, "loss_lvr": 1.065303087234497, "loss_mode_switch": 0.0, "loss_total": 0.6404838562011719, "step": 1725 }, { "batch_size": 4, "epoch": 0.69, "step": 1725, "tokens_per_device": 4420 }, { "epoch": 0.69, "loss_ce": 0.1349581927061081, "loss_lvr": 0.9414317011833191, "loss_mode_switch": 0.0, "loss_total": 0.22910135984420776, "step": 1725 }, { "batch_size": 4, "epoch": 0.69, "step": 1725, "tokens_per_device": 4552 }, { "epoch": 0.69, "loss_ce": 0.35848304629325867, "loss_lvr": 0.7267247438430786, "loss_mode_switch": 0.0, "loss_total": 0.4311555325984955, "step": 1725 }, { "batch_size": 1, "epoch": 0.69, "step": 1725, "tokens_per_device": 5190 }, { "epoch": 0.69, "loss_ce": 0.0042052362114191055, "loss_lvr": 0.6693108677864075, "loss_mode_switch": 0.0, "loss_total": 0.07113632559776306, "step": 1725 }, { "batch_size": 4, "epoch": 0.69, "step": 1725, "tokens_per_device": 4004 }, { "epoch": 0.69, "loss_ce": 0.39669209718704224, "loss_lvr": 0.8238394260406494, "loss_mode_switch": 0.0, "loss_total": 0.4790760278701782, "step": 1725 }, { "batch_size": 4, "epoch": 0.69, "step": 1725, "tokens_per_device": 1352 }, { "epoch": 0.69, "loss_ce": 0.5082683563232422, "loss_lvr": 1.1050125360488892, "loss_mode_switch": 0.0, "loss_total": 0.6187695860862732, "step": 1725 }, { "batch_size": 1, "epoch": 0.69, "step": 1725, "tokens_per_device": 5129 }, { "epoch": 0.69, "loss_ce": 0.003545530140399933, "loss_lvr": 0.380443811416626, "loss_mode_switch": 0.0, "loss_total": 0.04158991202712059, "step": 1725 }, { "epoch": 0.6904, "grad_norm": 1.5246495008468628, "learning_rate": 2.3099367440392985e-06, "loss": 0.306, "step": 1726 }, { "batch_size": 4, "epoch": 0.6904, "step": 1726, "tokens_per_device": 2760 }, { "epoch": 0.6904, "loss_ce": 0.09017091989517212, "loss_lvr": 0.6430452466011047, "loss_mode_switch": 0.0, "loss_total": 0.15447545051574707, "step": 1726 }, { "batch_size": 1, "epoch": 0.6904, "step": 1726, "tokens_per_device": 4958 }, { "epoch": 0.6904, "loss_ce": 0.029925452545285225, "loss_lvr": 0.4211951196193695, "loss_mode_switch": 0.0, "loss_total": 0.0720449686050415, "step": 1726 }, { "batch_size": 1, "epoch": 0.6904, "step": 1726, "tokens_per_device": 5691 }, { "epoch": 0.6904, "loss_ce": 0.10868241637945175, "loss_lvr": 0.2959807813167572, "loss_mode_switch": 0.0, "loss_total": 0.1382804960012436, "step": 1726 }, { "batch_size": 1, "epoch": 0.6904, "step": 1726, "tokens_per_device": 4907 }, { "epoch": 0.6904, "loss_ce": 0.01855439506471157, "loss_lvr": 0.2224760800600052, "loss_mode_switch": 0.0, "loss_total": 0.040802001953125, "step": 1726 }, { "batch_size": 4, "epoch": 0.6904, "step": 1726, "tokens_per_device": 1292 }, { "epoch": 0.6904, "loss_ce": 0.1334065943956375, "loss_lvr": 0.9036389589309692, "loss_mode_switch": 0.0, "loss_total": 0.22377049922943115, "step": 1726 }, { "batch_size": 4, "epoch": 0.6904, "step": 1726, "tokens_per_device": 7620 }, { "epoch": 0.6904, "loss_ce": 0.05386718362569809, "loss_lvr": 1.2384029626846313, "loss_mode_switch": 0.0, "loss_total": 0.1777074784040451, "step": 1726 }, { "batch_size": 4, "epoch": 0.6904, "step": 1726, "tokens_per_device": 2608 }, { "epoch": 0.6904, "loss_ce": 0.19841863214969635, "loss_lvr": 0.9249288439750671, "loss_mode_switch": 0.0, "loss_total": 0.2909115254878998, "step": 1726 }, { "batch_size": 1, "epoch": 0.6904, "step": 1726, "tokens_per_device": 4760 }, { "epoch": 0.6904, "loss_ce": 0.00191908935084939, "loss_lvr": 0.2982664108276367, "loss_mode_switch": 0.0, "loss_total": 0.031745731830596924, "step": 1726 }, { "epoch": 0.6908, "grad_norm": 1.344599723815918, "learning_rate": 2.3044788711253336e-06, "loss": 0.2786, "step": 1727 }, { "batch_size": 4, "epoch": 0.6908, "step": 1727, "tokens_per_device": 4360 }, { "epoch": 0.6908, "loss_ce": 0.2426438182592392, "loss_lvr": 0.712871789932251, "loss_mode_switch": 0.0, "loss_total": 0.3139309883117676, "step": 1727 }, { "batch_size": 4, "epoch": 0.6908, "step": 1727, "tokens_per_device": 4316 }, { "epoch": 0.6908, "loss_ce": 0.10644397884607315, "loss_lvr": 0.5968604683876038, "loss_mode_switch": 0.0, "loss_total": 0.16613002121448517, "step": 1727 }, { "batch_size": 4, "epoch": 0.6908, "step": 1727, "tokens_per_device": 4712 }, { "epoch": 0.6908, "loss_ce": 0.19648931920528412, "loss_lvr": 0.7306305766105652, "loss_mode_switch": 0.0, "loss_total": 0.2695523798465729, "step": 1727 }, { "batch_size": 4, "epoch": 0.6908, "step": 1727, "tokens_per_device": 2536 }, { "epoch": 0.6908, "loss_ce": 0.5308091044425964, "loss_lvr": 0.8001502752304077, "loss_mode_switch": 0.0, "loss_total": 0.6108241081237793, "step": 1727 }, { "batch_size": 4, "epoch": 0.6908, "step": 1727, "tokens_per_device": 3864 }, { "epoch": 0.6908, "loss_ce": 0.27009955048561096, "loss_lvr": 1.3351715803146362, "loss_mode_switch": 0.0, "loss_total": 0.403616726398468, "step": 1727 }, { "batch_size": 1, "epoch": 0.6908, "step": 1727, "tokens_per_device": 5173 }, { "epoch": 0.6908, "loss_ce": 0.002319911727681756, "loss_lvr": 0.5002287030220032, "loss_mode_switch": 0.0, "loss_total": 0.05234278365969658, "step": 1727 }, { "batch_size": 4, "epoch": 0.6908, "step": 1727, "tokens_per_device": 4388 }, { "epoch": 0.6908, "loss_ce": 0.15646645426750183, "loss_lvr": 0.6994975209236145, "loss_mode_switch": 0.0, "loss_total": 0.2264162003993988, "step": 1727 }, { "batch_size": 1, "epoch": 0.6908, "step": 1727, "tokens_per_device": 5216 }, { "epoch": 0.6908, "loss_ce": 0.0007992864702828228, "loss_lvr": 0.5462319254875183, "loss_mode_switch": 0.0, "loss_total": 0.05542248114943504, "step": 1727 }, { "epoch": 0.6912, "grad_norm": 1.3592737913131714, "learning_rate": 2.299025522173271e-06, "loss": 0.301, "step": 1728 }, { "batch_size": 1, "epoch": 0.6912, "step": 1728, "tokens_per_device": 4889 }, { "epoch": 0.6912, "loss_ce": 0.0005768504925072193, "loss_lvr": 0.5887324810028076, "loss_mode_switch": 0.0, "loss_total": 0.059450097382068634, "step": 1728 }, { "batch_size": 4, "epoch": 0.6912, "step": 1728, "tokens_per_device": 4596 }, { "epoch": 0.6912, "loss_ce": 0.21496883034706116, "loss_lvr": 0.7928781509399414, "loss_mode_switch": 0.0, "loss_total": 0.29425665736198425, "step": 1728 }, { "batch_size": 1, "epoch": 0.6912, "step": 1728, "tokens_per_device": 5021 }, { "epoch": 0.6912, "loss_ce": 0.015643097460269928, "loss_lvr": 0.4144882559776306, "loss_mode_switch": 0.0, "loss_total": 0.05709192529320717, "step": 1728 }, { "batch_size": 4, "epoch": 0.6912, "step": 1728, "tokens_per_device": 4236 }, { "epoch": 0.6912, "loss_ce": 0.1900315135717392, "loss_lvr": 1.0603705644607544, "loss_mode_switch": 0.0, "loss_total": 0.29606857895851135, "step": 1728 }, { "batch_size": 1, "epoch": 0.6912, "step": 1728, "tokens_per_device": 7256 }, { "epoch": 0.6912, "loss_ce": 0.011681189760565758, "loss_lvr": 0.44190922379493713, "loss_mode_switch": 0.0, "loss_total": 0.0558721125125885, "step": 1728 }, { "batch_size": 4, "epoch": 0.6912, "step": 1728, "tokens_per_device": 5848 }, { "epoch": 0.6912, "loss_ce": 0.20587119460105896, "loss_lvr": 0.7303125858306885, "loss_mode_switch": 0.0, "loss_total": 0.27890247106552124, "step": 1728 }, { "batch_size": 4, "epoch": 0.6912, "step": 1728, "tokens_per_device": 4884 }, { "epoch": 0.6912, "loss_ce": 0.05843214690685272, "loss_lvr": 0.7441447973251343, "loss_mode_switch": 0.0, "loss_total": 0.1328466236591339, "step": 1728 }, { "batch_size": 4, "epoch": 0.6912, "step": 1728, "tokens_per_device": 4552 }, { "epoch": 0.6912, "loss_ce": 0.48941656947135925, "loss_lvr": 0.7685709595680237, "loss_mode_switch": 0.0, "loss_total": 0.5662736892700195, "step": 1728 }, { "epoch": 0.6916, "grad_norm": 1.2223875522613525, "learning_rate": 2.2935767063356084e-06, "loss": 0.3001, "step": 1729 }, { "batch_size": 1, "epoch": 0.6916, "step": 1729, "tokens_per_device": 5202 }, { "epoch": 0.6916, "loss_ce": 0.9626960754394531, "loss_lvr": 0.5062501430511475, "loss_mode_switch": 0.0, "loss_total": 1.013321042060852, "step": 1729 }, { "batch_size": 4, "epoch": 0.6916, "step": 1729, "tokens_per_device": 4300 }, { "epoch": 0.6916, "loss_ce": 0.10540255159139633, "loss_lvr": 0.9099516272544861, "loss_mode_switch": 0.0, "loss_total": 0.19639772176742554, "step": 1729 }, { "batch_size": 1, "epoch": 0.6916, "step": 1729, "tokens_per_device": 5147 }, { "epoch": 0.6916, "loss_ce": 0.01859128102660179, "loss_lvr": 0.4462736248970032, "loss_mode_switch": 0.0, "loss_total": 0.06321864575147629, "step": 1729 }, { "batch_size": 4, "epoch": 0.6916, "step": 1729, "tokens_per_device": 3812 }, { "epoch": 0.6916, "loss_ce": 0.07981589436531067, "loss_lvr": 1.0865062475204468, "loss_mode_switch": 0.0, "loss_total": 0.18846651911735535, "step": 1729 }, { "batch_size": 4, "epoch": 0.6916, "step": 1729, "tokens_per_device": 5016 }, { "epoch": 0.6916, "loss_ce": 0.7474865317344666, "loss_lvr": 0.8459503054618835, "loss_mode_switch": 0.0, "loss_total": 0.8320815563201904, "step": 1729 }, { "batch_size": 1, "epoch": 0.6916, "step": 1729, "tokens_per_device": 5182 }, { "epoch": 0.6916, "loss_ce": 0.0010041060158982873, "loss_lvr": 0.5223991274833679, "loss_mode_switch": 0.0, "loss_total": 0.05324402078986168, "step": 1729 }, { "batch_size": 4, "epoch": 0.6916, "step": 1729, "tokens_per_device": 5340 }, { "epoch": 0.6916, "loss_ce": 0.1163492351770401, "loss_lvr": 0.7638193368911743, "loss_mode_switch": 0.0, "loss_total": 0.19273117184638977, "step": 1729 }, { "batch_size": 4, "epoch": 0.6916, "step": 1729, "tokens_per_device": 2692 }, { "epoch": 0.6916, "loss_ce": 0.3717012405395508, "loss_lvr": 0.8339008092880249, "loss_mode_switch": 0.0, "loss_total": 0.45509132742881775, "step": 1729 }, { "epoch": 0.692, "grad_norm": 1.3200854063034058, "learning_rate": 2.2881324327572336e-06, "loss": 0.2843, "step": 1730 }, { "batch_size": 4, "epoch": 0.692, "step": 1730, "tokens_per_device": 4380 }, { "epoch": 0.692, "loss_ce": 0.2879018783569336, "loss_lvr": 0.9337102770805359, "loss_mode_switch": 0.0, "loss_total": 0.38127291202545166, "step": 1730 }, { "batch_size": 4, "epoch": 0.692, "step": 1730, "tokens_per_device": 3808 }, { "epoch": 0.692, "loss_ce": 0.002746488666161895, "loss_lvr": 0.8166418075561523, "loss_mode_switch": 0.0, "loss_total": 0.0844106674194336, "step": 1730 }, { "batch_size": 4, "epoch": 0.692, "step": 1730, "tokens_per_device": 2696 }, { "epoch": 0.692, "loss_ce": 0.22280089557170868, "loss_lvr": 0.5223323702812195, "loss_mode_switch": 0.0, "loss_total": 0.2750341296195984, "step": 1730 }, { "batch_size": 1, "epoch": 0.692, "step": 1730, "tokens_per_device": 5090 }, { "epoch": 0.692, "loss_ce": 0.014899739995598793, "loss_lvr": 0.6145712733268738, "loss_mode_switch": 0.0, "loss_total": 0.07635686546564102, "step": 1730 }, { "batch_size": 4, "epoch": 0.692, "step": 1730, "tokens_per_device": 1388 }, { "epoch": 0.692, "loss_ce": 0.24386011064052582, "loss_lvr": 1.0054603815078735, "loss_mode_switch": 0.0, "loss_total": 0.3444061577320099, "step": 1730 }, { "batch_size": 4, "epoch": 0.692, "step": 1730, "tokens_per_device": 2596 }, { "epoch": 0.692, "loss_ce": 0.17323803901672363, "loss_lvr": 0.8949307203292847, "loss_mode_switch": 0.0, "loss_total": 0.2627311050891876, "step": 1730 }, { "batch_size": 1, "epoch": 0.692, "step": 1730, "tokens_per_device": 5174 }, { "epoch": 0.692, "loss_ce": 0.025763370096683502, "loss_lvr": 0.3086043894290924, "loss_mode_switch": 0.0, "loss_total": 0.05662380903959274, "step": 1730 }, { "batch_size": 4, "epoch": 0.692, "step": 1730, "tokens_per_device": 5484 }, { "epoch": 0.692, "loss_ce": 0.1701103001832962, "loss_lvr": 0.8314100503921509, "loss_mode_switch": 0.0, "loss_total": 0.253251314163208, "step": 1730 }, { "epoch": 0.6924, "grad_norm": 1.285150408744812, "learning_rate": 2.2826927105754083e-06, "loss": 0.2849, "step": 1731 }, { "batch_size": 1, "epoch": 0.6924, "step": 1731, "tokens_per_device": 5530 }, { "epoch": 0.6924, "loss_ce": 0.039972979575395584, "loss_lvr": 0.4690060317516327, "loss_mode_switch": 0.0, "loss_total": 0.08687358349561691, "step": 1731 }, { "batch_size": 1, "epoch": 0.6924, "step": 1731, "tokens_per_device": 4534 }, { "epoch": 0.6924, "loss_ce": 0.001811955706216395, "loss_lvr": 0.5092083811759949, "loss_mode_switch": 0.0, "loss_total": 0.05273279547691345, "step": 1731 }, { "batch_size": 4, "epoch": 0.6924, "step": 1731, "tokens_per_device": 5108 }, { "epoch": 0.6924, "loss_ce": 0.12509028613567352, "loss_lvr": 0.8158581256866455, "loss_mode_switch": 0.0, "loss_total": 0.20667609572410583, "step": 1731 }, { "batch_size": 4, "epoch": 0.6924, "step": 1731, "tokens_per_device": 3812 }, { "epoch": 0.6924, "loss_ce": 0.234291210770607, "loss_lvr": 1.1422631740570068, "loss_mode_switch": 0.0, "loss_total": 0.3485175371170044, "step": 1731 }, { "batch_size": 4, "epoch": 0.6924, "step": 1731, "tokens_per_device": 5740 }, { "epoch": 0.6924, "loss_ce": 0.22380346059799194, "loss_lvr": 0.7034587860107422, "loss_mode_switch": 0.0, "loss_total": 0.29414933919906616, "step": 1731 }, { "batch_size": 4, "epoch": 0.6924, "step": 1731, "tokens_per_device": 4056 }, { "epoch": 0.6924, "loss_ce": 0.5487463474273682, "loss_lvr": 0.8726234436035156, "loss_mode_switch": 0.0, "loss_total": 0.6360086798667908, "step": 1731 }, { "batch_size": 4, "epoch": 0.6924, "step": 1731, "tokens_per_device": 3484 }, { "epoch": 0.6924, "loss_ce": 0.14347264170646667, "loss_lvr": 0.7676941156387329, "loss_mode_switch": 0.0, "loss_total": 0.22024205327033997, "step": 1731 }, { "batch_size": 4, "epoch": 0.6924, "step": 1731, "tokens_per_device": 1444 }, { "epoch": 0.6924, "loss_ce": 0.6559412479400635, "loss_lvr": 0.9096252918243408, "loss_mode_switch": 0.0, "loss_total": 0.7469037771224976, "step": 1731 }, { "epoch": 0.6928, "grad_norm": 1.2820382118225098, "learning_rate": 2.2772575489197553e-06, "loss": 0.2524, "step": 1732 }, { "batch_size": 4, "epoch": 0.6928, "step": 1732, "tokens_per_device": 3744 }, { "epoch": 0.6928, "loss_ce": 0.08824680745601654, "loss_lvr": 0.9615187644958496, "loss_mode_switch": 0.0, "loss_total": 0.18439868092536926, "step": 1732 }, { "batch_size": 4, "epoch": 0.6928, "step": 1732, "tokens_per_device": 5148 }, { "epoch": 0.6928, "loss_ce": 0.08848896622657776, "loss_lvr": 0.7184972167015076, "loss_mode_switch": 0.0, "loss_total": 0.16033869981765747, "step": 1732 }, { "batch_size": 4, "epoch": 0.6928, "step": 1732, "tokens_per_device": 4616 }, { "epoch": 0.6928, "loss_ce": 0.3183783292770386, "loss_lvr": 0.8576360940933228, "loss_mode_switch": 0.0, "loss_total": 0.40414193272590637, "step": 1732 }, { "batch_size": 4, "epoch": 0.6928, "step": 1732, "tokens_per_device": 3904 }, { "epoch": 0.6928, "loss_ce": 0.21551457047462463, "loss_lvr": 0.6839515566825867, "loss_mode_switch": 0.0, "loss_total": 0.28390973806381226, "step": 1732 }, { "batch_size": 4, "epoch": 0.6928, "step": 1732, "tokens_per_device": 13160 }, { "epoch": 0.6928, "loss_ce": 0.007685126271098852, "loss_lvr": 0.9860528111457825, "loss_mode_switch": 0.0, "loss_total": 0.10629040747880936, "step": 1732 }, { "batch_size": 4, "epoch": 0.6928, "step": 1732, "tokens_per_device": 4920 }, { "epoch": 0.6928, "loss_ce": 0.6026392579078674, "loss_lvr": 0.9642826318740845, "loss_mode_switch": 0.0, "loss_total": 0.6990675330162048, "step": 1732 }, { "batch_size": 1, "epoch": 0.6928, "step": 1732, "tokens_per_device": 4822 }, { "epoch": 0.6928, "loss_ce": 0.10167983919382095, "loss_lvr": 0.7245069146156311, "loss_mode_switch": 0.0, "loss_total": 0.17413052916526794, "step": 1732 }, { "batch_size": 1, "epoch": 0.6928, "step": 1732, "tokens_per_device": 4980 }, { "epoch": 0.6928, "loss_ce": 0.5673123002052307, "loss_lvr": 0.42198166251182556, "loss_mode_switch": 0.0, "loss_total": 0.6095104813575745, "step": 1732 }, { "epoch": 0.6932, "grad_norm": 1.3247989416122437, "learning_rate": 2.271826956912248e-06, "loss": 0.2981, "step": 1733 }, { "batch_size": 4, "epoch": 0.6932, "step": 1733, "tokens_per_device": 4504 }, { "epoch": 0.6932, "loss_ce": 0.014382691122591496, "loss_lvr": 0.6996279954910278, "loss_mode_switch": 0.0, "loss_total": 0.0843454897403717, "step": 1733 }, { "batch_size": 1, "epoch": 0.6932, "step": 1733, "tokens_per_device": 5195 }, { "epoch": 0.6932, "loss_ce": 0.05788993462920189, "loss_lvr": 0.40205052495002747, "loss_mode_switch": 0.0, "loss_total": 0.09809498488903046, "step": 1733 }, { "batch_size": 4, "epoch": 0.6932, "step": 1733, "tokens_per_device": 5128 }, { "epoch": 0.6932, "loss_ce": 0.3145035207271576, "loss_lvr": 0.7191978693008423, "loss_mode_switch": 0.0, "loss_total": 0.3864233195781708, "step": 1733 }, { "batch_size": 4, "epoch": 0.6932, "step": 1733, "tokens_per_device": 6252 }, { "epoch": 0.6932, "loss_ce": 0.01275610737502575, "loss_lvr": 1.0154435634613037, "loss_mode_switch": 0.0, "loss_total": 0.11430046707391739, "step": 1733 }, { "batch_size": 4, "epoch": 0.6932, "step": 1733, "tokens_per_device": 4260 }, { "epoch": 0.6932, "loss_ce": 0.1817190945148468, "loss_lvr": 0.8811941742897034, "loss_mode_switch": 0.0, "loss_total": 0.26983851194381714, "step": 1733 }, { "batch_size": 4, "epoch": 0.6932, "step": 1733, "tokens_per_device": 4664 }, { "epoch": 0.6932, "loss_ce": 0.60706627368927, "loss_lvr": 0.7657128572463989, "loss_mode_switch": 0.0, "loss_total": 0.6836375594139099, "step": 1733 }, { "batch_size": 4, "epoch": 0.6932, "step": 1733, "tokens_per_device": 1544 }, { "epoch": 0.6932, "loss_ce": 0.3313262164592743, "loss_lvr": 0.9921785593032837, "loss_mode_switch": 0.0, "loss_total": 0.43054407835006714, "step": 1733 }, { "batch_size": 4, "epoch": 0.6932, "step": 1733, "tokens_per_device": 5948 }, { "epoch": 0.6932, "loss_ce": 0.5598757266998291, "loss_lvr": 0.8341314792633057, "loss_mode_switch": 0.0, "loss_total": 0.6432888507843018, "step": 1733 }, { "epoch": 0.6936, "grad_norm": 1.4623279571533203, "learning_rate": 2.266400943667185e-06, "loss": 0.3294, "step": 1734 }, { "batch_size": 4, "epoch": 0.6936, "step": 1734, "tokens_per_device": 4316 }, { "epoch": 0.6936, "loss_ce": 0.21344412863254547, "loss_lvr": 0.9279746413230896, "loss_mode_switch": 0.0, "loss_total": 0.30624160170555115, "step": 1734 }, { "batch_size": 4, "epoch": 0.6936, "step": 1734, "tokens_per_device": 1816 }, { "epoch": 0.6936, "loss_ce": 0.0817100927233696, "loss_lvr": 0.8170690536499023, "loss_mode_switch": 0.0, "loss_total": 0.1634169965982437, "step": 1734 }, { "batch_size": 1, "epoch": 0.6936, "step": 1734, "tokens_per_device": 5217 }, { "epoch": 0.6936, "loss_ce": 0.09925227612257004, "loss_lvr": 0.49036216735839844, "loss_mode_switch": 0.0, "loss_total": 0.14828848838806152, "step": 1734 }, { "batch_size": 4, "epoch": 0.6936, "step": 1734, "tokens_per_device": 4360 }, { "epoch": 0.6936, "loss_ce": 0.2553982436656952, "loss_lvr": 0.9816684126853943, "loss_mode_switch": 0.0, "loss_total": 0.3535650968551636, "step": 1734 }, { "batch_size": 4, "epoch": 0.6936, "step": 1734, "tokens_per_device": 4448 }, { "epoch": 0.6936, "loss_ce": 0.5858277082443237, "loss_lvr": 0.8753132224082947, "loss_mode_switch": 0.0, "loss_total": 0.6733590364456177, "step": 1734 }, { "batch_size": 4, "epoch": 0.6936, "step": 1734, "tokens_per_device": 1584 }, { "epoch": 0.6936, "loss_ce": 0.5743155479431152, "loss_lvr": 1.0353267192840576, "loss_mode_switch": 0.0, "loss_total": 0.677848219871521, "step": 1734 }, { "batch_size": 1, "epoch": 0.6936, "step": 1734, "tokens_per_device": 4741 }, { "epoch": 0.6936, "loss_ce": 0.02402155101299286, "loss_lvr": 0.5420275330543518, "loss_mode_switch": 0.0, "loss_total": 0.0782243013381958, "step": 1734 }, { "batch_size": 4, "epoch": 0.6936, "step": 1734, "tokens_per_device": 5540 }, { "epoch": 0.6936, "loss_ce": 0.015825804322957993, "loss_lvr": 0.9119904041290283, "loss_mode_switch": 0.0, "loss_total": 0.10702484846115112, "step": 1734 }, { "epoch": 0.694, "grad_norm": 1.3063586950302124, "learning_rate": 2.260979518291186e-06, "loss": 0.3315, "step": 1735 }, { "batch_size": 4, "epoch": 0.694, "step": 1735, "tokens_per_device": 5748 }, { "epoch": 0.694, "loss_ce": 0.7223607301712036, "loss_lvr": 0.7900072336196899, "loss_mode_switch": 0.0, "loss_total": 0.8013614416122437, "step": 1735 }, { "batch_size": 4, "epoch": 0.694, "step": 1735, "tokens_per_device": 4104 }, { "epoch": 0.694, "loss_ce": 0.14699998497962952, "loss_lvr": 0.7222157716751099, "loss_mode_switch": 0.0, "loss_total": 0.2192215621471405, "step": 1735 }, { "batch_size": 4, "epoch": 0.694, "step": 1735, "tokens_per_device": 2752 }, { "epoch": 0.694, "loss_ce": 0.25664037466049194, "loss_lvr": 0.7326185703277588, "loss_mode_switch": 0.0, "loss_total": 0.3299022316932678, "step": 1735 }, { "batch_size": 1, "epoch": 0.694, "step": 1735, "tokens_per_device": 4897 }, { "epoch": 0.694, "loss_ce": 0.045659154653549194, "loss_lvr": 0.27972012758255005, "loss_mode_switch": 0.0, "loss_total": 0.0736311674118042, "step": 1735 }, { "batch_size": 4, "epoch": 0.694, "step": 1735, "tokens_per_device": 5108 }, { "epoch": 0.694, "loss_ce": 0.06316030025482178, "loss_lvr": 0.8306586742401123, "loss_mode_switch": 0.0, "loss_total": 0.146226167678833, "step": 1735 }, { "batch_size": 1, "epoch": 0.694, "step": 1735, "tokens_per_device": 5072 }, { "epoch": 0.694, "loss_ce": 0.011465203016996384, "loss_lvr": 0.693115770816803, "loss_mode_switch": 0.0, "loss_total": 0.08077678084373474, "step": 1735 }, { "batch_size": 4, "epoch": 0.694, "step": 1735, "tokens_per_device": 4364 }, { "epoch": 0.694, "loss_ce": 0.06202530115842819, "loss_lvr": 0.9893918037414551, "loss_mode_switch": 0.0, "loss_total": 0.1609644889831543, "step": 1735 }, { "batch_size": 4, "epoch": 0.694, "step": 1735, "tokens_per_device": 7156 }, { "epoch": 0.694, "loss_ce": 0.3036176860332489, "loss_lvr": 0.9102481007575989, "loss_mode_switch": 0.0, "loss_total": 0.39464250206947327, "step": 1735 }, { "epoch": 0.6944, "grad_norm": 1.2267072200775146, "learning_rate": 2.255562689883166e-06, "loss": 0.2542, "step": 1736 }, { "batch_size": 1, "epoch": 0.6944, "step": 1736, "tokens_per_device": 4938 }, { "epoch": 0.6944, "loss_ce": 0.0003471837262623012, "loss_lvr": 0.434336394071579, "loss_mode_switch": 0.0, "loss_total": 0.043780822306871414, "step": 1736 }, { "batch_size": 1, "epoch": 0.6944, "step": 1736, "tokens_per_device": 4899 }, { "epoch": 0.6944, "loss_ce": 0.2617221474647522, "loss_lvr": 0.49699604511260986, "loss_mode_switch": 0.0, "loss_total": 0.3114217519760132, "step": 1736 }, { "batch_size": 4, "epoch": 0.6944, "step": 1736, "tokens_per_device": 4224 }, { "epoch": 0.6944, "loss_ce": 0.19196763634681702, "loss_lvr": 1.1134463548660278, "loss_mode_switch": 0.0, "loss_total": 0.3033122718334198, "step": 1736 }, { "batch_size": 4, "epoch": 0.6944, "step": 1736, "tokens_per_device": 4660 }, { "epoch": 0.6944, "loss_ce": 0.3034558892250061, "loss_lvr": 0.8186892867088318, "loss_mode_switch": 0.0, "loss_total": 0.3853248357772827, "step": 1736 }, { "batch_size": 4, "epoch": 0.6944, "step": 1736, "tokens_per_device": 4540 }, { "epoch": 0.6944, "loss_ce": 0.20129211246967316, "loss_lvr": 0.8160322308540344, "loss_mode_switch": 0.0, "loss_total": 0.2828953266143799, "step": 1736 }, { "batch_size": 4, "epoch": 0.6944, "step": 1736, "tokens_per_device": 3012 }, { "epoch": 0.6944, "loss_ce": 0.11515035480260849, "loss_lvr": 0.6100525259971619, "loss_mode_switch": 0.0, "loss_total": 0.17615561187267303, "step": 1736 }, { "batch_size": 4, "epoch": 0.6944, "step": 1736, "tokens_per_device": 5608 }, { "epoch": 0.6944, "loss_ce": 0.4162049889564514, "loss_lvr": 0.7070382833480835, "loss_mode_switch": 0.0, "loss_total": 0.48690882325172424, "step": 1736 }, { "batch_size": 1, "epoch": 0.6944, "step": 1736, "tokens_per_device": 5090 }, { "epoch": 0.6944, "loss_ce": 0.06590790301561356, "loss_lvr": 0.4921778440475464, "loss_mode_switch": 0.0, "loss_total": 0.11512568593025208, "step": 1736 }, { "epoch": 0.6948, "grad_norm": 1.4183986186981201, "learning_rate": 2.2501504675343237e-06, "loss": 0.3022, "step": 1737 }, { "batch_size": 4, "epoch": 0.6948, "step": 1737, "tokens_per_device": 4948 }, { "epoch": 0.6948, "loss_ce": 0.48016685247421265, "loss_lvr": 0.9045790433883667, "loss_mode_switch": 0.0, "loss_total": 0.5706247687339783, "step": 1737 }, { "batch_size": 4, "epoch": 0.6948, "step": 1737, "tokens_per_device": 4548 }, { "epoch": 0.6948, "loss_ce": 0.29168373346328735, "loss_lvr": 0.9907408952713013, "loss_mode_switch": 0.0, "loss_total": 0.39075782895088196, "step": 1737 }, { "batch_size": 4, "epoch": 0.6948, "step": 1737, "tokens_per_device": 5260 }, { "epoch": 0.6948, "loss_ce": 0.36924776434898376, "loss_lvr": 0.7523139715194702, "loss_mode_switch": 0.0, "loss_total": 0.44447916746139526, "step": 1737 }, { "batch_size": 1, "epoch": 0.6948, "step": 1737, "tokens_per_device": 5190 }, { "epoch": 0.6948, "loss_ce": 0.015196351334452629, "loss_lvr": 0.4078318476676941, "loss_mode_switch": 0.0, "loss_total": 0.05597953498363495, "step": 1737 }, { "batch_size": 4, "epoch": 0.6948, "step": 1737, "tokens_per_device": 11448 }, { "epoch": 0.6948, "loss_ce": 0.33348044753074646, "loss_lvr": 0.8035219311714172, "loss_mode_switch": 0.0, "loss_total": 0.4138326346874237, "step": 1737 }, { "batch_size": 1, "epoch": 0.6948, "step": 1737, "tokens_per_device": 4926 }, { "epoch": 0.6948, "loss_ce": 0.09988188743591309, "loss_lvr": 0.2698166072368622, "loss_mode_switch": 0.0, "loss_total": 0.12686355412006378, "step": 1737 }, { "batch_size": 1, "epoch": 0.6948, "step": 1737, "tokens_per_device": 4891 }, { "epoch": 0.6948, "loss_ce": 0.23703719675540924, "loss_lvr": 0.6446784138679504, "loss_mode_switch": 0.0, "loss_total": 0.30150502920150757, "step": 1737 }, { "batch_size": 4, "epoch": 0.6948, "step": 1737, "tokens_per_device": 2636 }, { "epoch": 0.6948, "loss_ce": 0.5115467309951782, "loss_lvr": 0.6718202233314514, "loss_mode_switch": 0.0, "loss_total": 0.5787287354469299, "step": 1737 }, { "epoch": 0.6952, "grad_norm": 1.332268238067627, "learning_rate": 2.244742860328133e-06, "loss": 0.3039, "step": 1738 }, { "batch_size": 4, "epoch": 0.6952, "step": 1738, "tokens_per_device": 3448 }, { "epoch": 0.6952, "loss_ce": 0.20407262444496155, "loss_lvr": 0.7712498903274536, "loss_mode_switch": 0.0, "loss_total": 0.28119760751724243, "step": 1738 }, { "batch_size": 4, "epoch": 0.6952, "step": 1738, "tokens_per_device": 6588 }, { "epoch": 0.6952, "loss_ce": 0.32465168833732605, "loss_lvr": 0.6563809514045715, "loss_mode_switch": 0.0, "loss_total": 0.3902897834777832, "step": 1738 }, { "batch_size": 1, "epoch": 0.6952, "step": 1738, "tokens_per_device": 5859 }, { "epoch": 0.6952, "loss_ce": 0.013181236572563648, "loss_lvr": 0.2847945988178253, "loss_mode_switch": 0.0, "loss_total": 0.041660696268081665, "step": 1738 }, { "batch_size": 1, "epoch": 0.6952, "step": 1738, "tokens_per_device": 7635 }, { "epoch": 0.6952, "loss_ce": 0.01097224373370409, "loss_lvr": 0.26057857275009155, "loss_mode_switch": 0.0, "loss_total": 0.03703010082244873, "step": 1738 }, { "batch_size": 4, "epoch": 0.6952, "step": 1738, "tokens_per_device": 4860 }, { "epoch": 0.6952, "loss_ce": 0.2563789188861847, "loss_lvr": 0.7889389395713806, "loss_mode_switch": 0.0, "loss_total": 0.33527281880378723, "step": 1738 }, { "batch_size": 4, "epoch": 0.6952, "step": 1738, "tokens_per_device": 4532 }, { "epoch": 0.6952, "loss_ce": 0.2271580547094345, "loss_lvr": 0.8267402052879333, "loss_mode_switch": 0.0, "loss_total": 0.30983206629753113, "step": 1738 }, { "batch_size": 4, "epoch": 0.6952, "step": 1738, "tokens_per_device": 3728 }, { "epoch": 0.6952, "loss_ce": 0.11112573742866516, "loss_lvr": 0.9017432332038879, "loss_mode_switch": 0.0, "loss_total": 0.20130005478858948, "step": 1738 }, { "batch_size": 4, "epoch": 0.6952, "step": 1738, "tokens_per_device": 6832 }, { "epoch": 0.6952, "loss_ce": 0.46312543749809265, "loss_lvr": 0.738742470741272, "loss_mode_switch": 0.0, "loss_total": 0.5369997024536133, "step": 1738 }, { "epoch": 0.6956, "grad_norm": 1.2763365507125854, "learning_rate": 2.2393398773403196e-06, "loss": 0.2711, "step": 1739 }, { "batch_size": 4, "epoch": 0.6956, "step": 1739, "tokens_per_device": 1228 }, { "epoch": 0.6956, "loss_ce": 0.3979742228984833, "loss_lvr": 1.1012688875198364, "loss_mode_switch": 0.0, "loss_total": 0.5081011056900024, "step": 1739 }, { "batch_size": 1, "epoch": 0.6956, "step": 1739, "tokens_per_device": 5094 }, { "epoch": 0.6956, "loss_ce": 0.002805430209264159, "loss_lvr": 0.9495956897735596, "loss_mode_switch": 0.0, "loss_total": 0.0977649986743927, "step": 1739 }, { "batch_size": 4, "epoch": 0.6956, "step": 1739, "tokens_per_device": 3964 }, { "epoch": 0.6956, "loss_ce": 0.42184898257255554, "loss_lvr": 0.850968599319458, "loss_mode_switch": 0.0, "loss_total": 0.5069458484649658, "step": 1739 }, { "batch_size": 1, "epoch": 0.6956, "step": 1739, "tokens_per_device": 5339 }, { "epoch": 0.6956, "loss_ce": 0.003643057309091091, "loss_lvr": 0.33060336112976074, "loss_mode_switch": 0.0, "loss_total": 0.03670339658856392, "step": 1739 }, { "batch_size": 4, "epoch": 0.6956, "step": 1739, "tokens_per_device": 1428 }, { "epoch": 0.6956, "loss_ce": 0.6566916704177856, "loss_lvr": 0.952271580696106, "loss_mode_switch": 0.0, "loss_total": 0.7519188523292542, "step": 1739 }, { "batch_size": 1, "epoch": 0.6956, "step": 1739, "tokens_per_device": 5198 }, { "epoch": 0.6956, "loss_ce": 0.2120652198791504, "loss_lvr": 0.7955421209335327, "loss_mode_switch": 0.0, "loss_total": 0.2916194200515747, "step": 1739 }, { "batch_size": 4, "epoch": 0.6956, "step": 1739, "tokens_per_device": 5608 }, { "epoch": 0.6956, "loss_ce": 0.3382922112941742, "loss_lvr": 0.7031459808349609, "loss_mode_switch": 0.0, "loss_total": 0.4086068272590637, "step": 1739 }, { "batch_size": 4, "epoch": 0.6956, "step": 1739, "tokens_per_device": 4236 }, { "epoch": 0.6956, "loss_ce": 0.027335405349731445, "loss_lvr": 0.8849471211433411, "loss_mode_switch": 0.0, "loss_total": 0.11583011597394943, "step": 1739 }, { "epoch": 0.696, "grad_norm": 1.4578808546066284, "learning_rate": 2.233941527638848e-06, "loss": 0.3017, "step": 1740 }, { "batch_size": 4, "epoch": 0.696, "step": 1740, "tokens_per_device": 1332 }, { "epoch": 0.696, "loss_ce": 0.4061465859413147, "loss_lvr": 1.0195715427398682, "loss_mode_switch": 0.0, "loss_total": 0.5081037282943726, "step": 1740 }, { "batch_size": 4, "epoch": 0.696, "step": 1740, "tokens_per_device": 5784 }, { "epoch": 0.696, "loss_ce": 0.22682182490825653, "loss_lvr": 0.8466328978538513, "loss_mode_switch": 0.0, "loss_total": 0.3114851117134094, "step": 1740 }, { "batch_size": 1, "epoch": 0.696, "step": 1740, "tokens_per_device": 7310 }, { "epoch": 0.696, "loss_ce": 0.27498918771743774, "loss_lvr": 0.3103289306163788, "loss_mode_switch": 0.0, "loss_total": 0.3060220777988434, "step": 1740 }, { "batch_size": 4, "epoch": 0.696, "step": 1740, "tokens_per_device": 10644 }, { "epoch": 0.696, "loss_ce": 0.3045203387737274, "loss_lvr": 0.8935646414756775, "loss_mode_switch": 0.0, "loss_total": 0.3938767910003662, "step": 1740 }, { "batch_size": 4, "epoch": 0.696, "step": 1740, "tokens_per_device": 2612 }, { "epoch": 0.696, "loss_ce": 0.24053335189819336, "loss_lvr": 1.1165697574615479, "loss_mode_switch": 0.0, "loss_total": 0.3521903157234192, "step": 1740 }, { "batch_size": 4, "epoch": 0.696, "step": 1740, "tokens_per_device": 1568 }, { "epoch": 0.696, "loss_ce": 0.19717469811439514, "loss_lvr": 1.0306042432785034, "loss_mode_switch": 0.0, "loss_total": 0.3002351224422455, "step": 1740 }, { "batch_size": 4, "epoch": 0.696, "step": 1740, "tokens_per_device": 5560 }, { "epoch": 0.696, "loss_ce": 0.2438458800315857, "loss_lvr": 0.5381373167037964, "loss_mode_switch": 0.0, "loss_total": 0.29765960574150085, "step": 1740 }, { "batch_size": 4, "epoch": 0.696, "step": 1740, "tokens_per_device": 5332 }, { "epoch": 0.696, "loss_ce": 0.23269027471542358, "loss_lvr": 0.7892454862594604, "loss_mode_switch": 0.0, "loss_total": 0.3116148114204407, "step": 1740 }, { "epoch": 0.6964, "grad_norm": 1.28517484664917, "learning_rate": 2.228547820283902e-06, "loss": 0.2892, "step": 1741 }, { "batch_size": 1, "epoch": 0.6964, "step": 1741, "tokens_per_device": 4886 }, { "epoch": 0.6964, "loss_ce": 0.06736962497234344, "loss_lvr": 0.7186617255210876, "loss_mode_switch": 0.0, "loss_total": 0.13923579454421997, "step": 1741 }, { "batch_size": 4, "epoch": 0.6964, "step": 1741, "tokens_per_device": 3856 }, { "epoch": 0.6964, "loss_ce": 0.6059470176696777, "loss_lvr": 0.8628536462783813, "loss_mode_switch": 0.0, "loss_total": 0.6922323703765869, "step": 1741 }, { "batch_size": 4, "epoch": 0.6964, "step": 1741, "tokens_per_device": 1320 }, { "epoch": 0.6964, "loss_ce": 0.23023651540279388, "loss_lvr": 0.9260419011116028, "loss_mode_switch": 0.0, "loss_total": 0.32284069061279297, "step": 1741 }, { "batch_size": 4, "epoch": 0.6964, "step": 1741, "tokens_per_device": 2708 }, { "epoch": 0.6964, "loss_ce": 0.2216530740261078, "loss_lvr": 0.7659522891044617, "loss_mode_switch": 0.0, "loss_total": 0.298248291015625, "step": 1741 }, { "batch_size": 1, "epoch": 0.6964, "step": 1741, "tokens_per_device": 5116 }, { "epoch": 0.6964, "loss_ce": 0.00542510487139225, "loss_lvr": 0.6218900084495544, "loss_mode_switch": 0.0, "loss_total": 0.0676141083240509, "step": 1741 }, { "batch_size": 1, "epoch": 0.6964, "step": 1741, "tokens_per_device": 4892 }, { "epoch": 0.6964, "loss_ce": 0.47999781370162964, "loss_lvr": 0.3761829137802124, "loss_mode_switch": 0.0, "loss_total": 0.5176160931587219, "step": 1741 }, { "batch_size": 4, "epoch": 0.6964, "step": 1741, "tokens_per_device": 4252 }, { "epoch": 0.6964, "loss_ce": 0.2395739108324051, "loss_lvr": 0.8627187013626099, "loss_mode_switch": 0.0, "loss_total": 0.32584577798843384, "step": 1741 }, { "batch_size": 1, "epoch": 0.6964, "step": 1741, "tokens_per_device": 7950 }, { "epoch": 0.6964, "loss_ce": 0.0006226021214388311, "loss_lvr": 0.2988375127315521, "loss_mode_switch": 0.0, "loss_total": 0.030506353825330734, "step": 1741 }, { "epoch": 0.6968, "grad_norm": 1.6022133827209473, "learning_rate": 2.2231587643278827e-06, "loss": 0.3099, "step": 1742 }, { "batch_size": 4, "epoch": 0.6968, "step": 1742, "tokens_per_device": 5060 }, { "epoch": 0.6968, "loss_ce": 0.5088393688201904, "loss_lvr": 0.7827008962631226, "loss_mode_switch": 0.0, "loss_total": 0.5871094465255737, "step": 1742 }, { "batch_size": 4, "epoch": 0.6968, "step": 1742, "tokens_per_device": 1536 }, { "epoch": 0.6968, "loss_ce": 0.44660940766334534, "loss_lvr": 1.005053997039795, "loss_mode_switch": 0.0, "loss_total": 0.5471147894859314, "step": 1742 }, { "batch_size": 1, "epoch": 0.6968, "step": 1742, "tokens_per_device": 5107 }, { "epoch": 0.6968, "loss_ce": 0.058896470814943314, "loss_lvr": 0.18005390465259552, "loss_mode_switch": 0.0, "loss_total": 0.0769018605351448, "step": 1742 }, { "batch_size": 4, "epoch": 0.6968, "step": 1742, "tokens_per_device": 4956 }, { "epoch": 0.6968, "loss_ce": 0.2781845033168793, "loss_lvr": 1.2180589437484741, "loss_mode_switch": 0.0, "loss_total": 0.39999040961265564, "step": 1742 }, { "batch_size": 4, "epoch": 0.6968, "step": 1742, "tokens_per_device": 1692 }, { "epoch": 0.6968, "loss_ce": 0.08866862207651138, "loss_lvr": 1.4770711660385132, "loss_mode_switch": 0.0, "loss_total": 0.23637574911117554, "step": 1742 }, { "batch_size": 4, "epoch": 0.6968, "step": 1742, "tokens_per_device": 3804 }, { "epoch": 0.6968, "loss_ce": 0.021164046600461006, "loss_lvr": 0.6777512431144714, "loss_mode_switch": 0.0, "loss_total": 0.08893916755914688, "step": 1742 }, { "batch_size": 1, "epoch": 0.6968, "step": 1742, "tokens_per_device": 5117 }, { "epoch": 0.6968, "loss_ce": 0.014210094697773457, "loss_lvr": 0.3113865852355957, "loss_mode_switch": 0.0, "loss_total": 0.04534875229001045, "step": 1742 }, { "batch_size": 1, "epoch": 0.6968, "step": 1742, "tokens_per_device": 4913 }, { "epoch": 0.6968, "loss_ce": 0.03472214937210083, "loss_lvr": 0.5810607075691223, "loss_mode_switch": 0.0, "loss_total": 0.09282822161912918, "step": 1742 }, { "epoch": 0.6972, "grad_norm": 1.4442253112792969, "learning_rate": 2.217774368815378e-06, "loss": 0.3168, "step": 1743 }, { "batch_size": 4, "epoch": 0.6972, "step": 1743, "tokens_per_device": 1524 }, { "epoch": 0.6972, "loss_ce": 0.5118539929389954, "loss_lvr": 0.8559777736663818, "loss_mode_switch": 0.0, "loss_total": 0.5974517464637756, "step": 1743 }, { "batch_size": 4, "epoch": 0.6972, "step": 1743, "tokens_per_device": 4964 }, { "epoch": 0.6972, "loss_ce": 0.6563910841941833, "loss_lvr": 0.7816551923751831, "loss_mode_switch": 0.0, "loss_total": 0.7345566153526306, "step": 1743 }, { "batch_size": 4, "epoch": 0.6972, "step": 1743, "tokens_per_device": 15828 }, { "epoch": 0.6972, "loss_ce": 0.087236687541008, "loss_lvr": 0.883966326713562, "loss_mode_switch": 0.0, "loss_total": 0.17563331127166748, "step": 1743 }, { "batch_size": 4, "epoch": 0.6972, "step": 1743, "tokens_per_device": 4252 }, { "epoch": 0.6972, "loss_ce": 0.24389447271823883, "loss_lvr": 1.0176351070404053, "loss_mode_switch": 0.0, "loss_total": 0.34565797448158264, "step": 1743 }, { "batch_size": 4, "epoch": 0.6972, "step": 1743, "tokens_per_device": 8944 }, { "epoch": 0.6972, "loss_ce": 0.08244816958904266, "loss_lvr": 0.7985816597938538, "loss_mode_switch": 0.0, "loss_total": 0.16230633854866028, "step": 1743 }, { "batch_size": 1, "epoch": 0.6972, "step": 1743, "tokens_per_device": 4866 }, { "epoch": 0.6972, "loss_ce": 0.034774504601955414, "loss_lvr": 0.3806818127632141, "loss_mode_switch": 0.0, "loss_total": 0.07284268736839294, "step": 1743 }, { "batch_size": 1, "epoch": 0.6972, "step": 1743, "tokens_per_device": 5183 }, { "epoch": 0.6972, "loss_ce": 0.13996337354183197, "loss_lvr": 0.1667935699224472, "loss_mode_switch": 0.0, "loss_total": 0.15664273500442505, "step": 1743 }, { "batch_size": 4, "epoch": 0.6972, "step": 1743, "tokens_per_device": 3828 }, { "epoch": 0.6972, "loss_ce": 0.1143568754196167, "loss_lvr": 1.0353559255599976, "loss_mode_switch": 0.0, "loss_total": 0.21789246797561646, "step": 1743 }, { "epoch": 0.6976, "grad_norm": 1.3230180740356445, "learning_rate": 2.2123946427831582e-06, "loss": 0.2811, "step": 1744 }, { "batch_size": 1, "epoch": 0.6976, "step": 1744, "tokens_per_device": 4965 }, { "epoch": 0.6976, "loss_ce": 0.6879621744155884, "loss_lvr": 0.5304103493690491, "loss_mode_switch": 0.0, "loss_total": 0.7410032153129578, "step": 1744 }, { "batch_size": 4, "epoch": 0.6976, "step": 1744, "tokens_per_device": 4200 }, { "epoch": 0.6976, "loss_ce": 0.07642072439193726, "loss_lvr": 0.821218729019165, "loss_mode_switch": 0.0, "loss_total": 0.15854260325431824, "step": 1744 }, { "batch_size": 4, "epoch": 0.6976, "step": 1744, "tokens_per_device": 7136 }, { "epoch": 0.6976, "loss_ce": 0.6421098709106445, "loss_lvr": 0.7692732214927673, "loss_mode_switch": 0.0, "loss_total": 0.7190371751785278, "step": 1744 }, { "batch_size": 4, "epoch": 0.6976, "step": 1744, "tokens_per_device": 4308 }, { "epoch": 0.6976, "loss_ce": 0.028003407642245293, "loss_lvr": 0.9734006524085999, "loss_mode_switch": 0.0, "loss_total": 0.1253434717655182, "step": 1744 }, { "batch_size": 1, "epoch": 0.6976, "step": 1744, "tokens_per_device": 5000 }, { "epoch": 0.6976, "loss_ce": 0.00610500480979681, "loss_lvr": 0.537878692150116, "loss_mode_switch": 0.0, "loss_total": 0.05989287421107292, "step": 1744 }, { "batch_size": 1, "epoch": 0.6976, "step": 1744, "tokens_per_device": 4430 }, { "epoch": 0.6976, "loss_ce": 0.019824128597974777, "loss_lvr": 0.47896039485931396, "loss_mode_switch": 0.0, "loss_total": 0.06772016733884811, "step": 1744 }, { "batch_size": 4, "epoch": 0.6976, "step": 1744, "tokens_per_device": 3784 }, { "epoch": 0.6976, "loss_ce": 0.7046669721603394, "loss_lvr": 0.7808564305305481, "loss_mode_switch": 0.0, "loss_total": 0.7827526330947876, "step": 1744 }, { "batch_size": 4, "epoch": 0.6976, "step": 1744, "tokens_per_device": 4376 }, { "epoch": 0.6976, "loss_ce": 0.20852838456630707, "loss_lvr": 0.9413020014762878, "loss_mode_switch": 0.0, "loss_total": 0.3026585876941681, "step": 1744 }, { "epoch": 0.698, "grad_norm": 1.2161376476287842, "learning_rate": 2.207019595260154e-06, "loss": 0.2432, "step": 1745 }, { "batch_size": 4, "epoch": 0.698, "step": 1745, "tokens_per_device": 5844 }, { "epoch": 0.698, "loss_ce": 0.14705654978752136, "loss_lvr": 1.0695492029190063, "loss_mode_switch": 0.0, "loss_total": 0.25401148200035095, "step": 1745 }, { "batch_size": 4, "epoch": 0.698, "step": 1745, "tokens_per_device": 4368 }, { "epoch": 0.698, "loss_ce": 0.5103999376296997, "loss_lvr": 1.193213701248169, "loss_mode_switch": 0.0, "loss_total": 0.6297212839126587, "step": 1745 }, { "batch_size": 4, "epoch": 0.698, "step": 1745, "tokens_per_device": 4940 }, { "epoch": 0.698, "loss_ce": 0.24556618928909302, "loss_lvr": 0.8195098638534546, "loss_mode_switch": 0.0, "loss_total": 0.32751718163490295, "step": 1745 }, { "batch_size": 4, "epoch": 0.698, "step": 1745, "tokens_per_device": 5264 }, { "epoch": 0.698, "loss_ce": 0.31685322523117065, "loss_lvr": 0.7449328899383545, "loss_mode_switch": 0.0, "loss_total": 0.3913465142250061, "step": 1745 }, { "batch_size": 4, "epoch": 0.698, "step": 1745, "tokens_per_device": 4520 }, { "epoch": 0.698, "loss_ce": 0.26114583015441895, "loss_lvr": 0.7950404286384583, "loss_mode_switch": 0.0, "loss_total": 0.34064987301826477, "step": 1745 }, { "batch_size": 4, "epoch": 0.698, "step": 1745, "tokens_per_device": 12492 }, { "epoch": 0.698, "loss_ce": 0.19899405539035797, "loss_lvr": 0.5110160112380981, "loss_mode_switch": 0.0, "loss_total": 0.2500956654548645, "step": 1745 }, { "batch_size": 1, "epoch": 0.698, "step": 1745, "tokens_per_device": 4824 }, { "epoch": 0.698, "loss_ce": 0.00023305478680413216, "loss_lvr": 0.3111003041267395, "loss_mode_switch": 0.0, "loss_total": 0.031343087553977966, "step": 1745 }, { "batch_size": 4, "epoch": 0.698, "step": 1745, "tokens_per_device": 1620 }, { "epoch": 0.698, "loss_ce": 0.41993609070777893, "loss_lvr": 0.9595611095428467, "loss_mode_switch": 0.0, "loss_total": 0.5158922076225281, "step": 1745 }, { "epoch": 0.6984, "grad_norm": 1.48527193069458, "learning_rate": 2.2016492352674424e-06, "loss": 0.2879, "step": 1746 }, { "batch_size": 4, "epoch": 0.6984, "step": 1746, "tokens_per_device": 5024 }, { "epoch": 0.6984, "loss_ce": 0.07263819873332977, "loss_lvr": 0.885060727596283, "loss_mode_switch": 0.0, "loss_total": 0.16114427149295807, "step": 1746 }, { "batch_size": 1, "epoch": 0.6984, "step": 1746, "tokens_per_device": 5129 }, { "epoch": 0.6984, "loss_ce": 0.013877683319151402, "loss_lvr": 1.4539598226547241, "loss_mode_switch": 0.0, "loss_total": 0.1592736691236496, "step": 1746 }, { "batch_size": 1, "epoch": 0.6984, "step": 1746, "tokens_per_device": 8756 }, { "epoch": 0.6984, "loss_ce": 0.005053339991718531, "loss_lvr": 0.4367828071117401, "loss_mode_switch": 0.0, "loss_total": 0.048731621354818344, "step": 1746 }, { "batch_size": 4, "epoch": 0.6984, "step": 1746, "tokens_per_device": 4820 }, { "epoch": 0.6984, "loss_ce": 0.1400730311870575, "loss_lvr": 0.7336850762367249, "loss_mode_switch": 0.0, "loss_total": 0.21344155073165894, "step": 1746 }, { "batch_size": 4, "epoch": 0.6984, "step": 1746, "tokens_per_device": 4544 }, { "epoch": 0.6984, "loss_ce": 0.0714341253042221, "loss_lvr": 0.8497021794319153, "loss_mode_switch": 0.0, "loss_total": 0.15640434622764587, "step": 1746 }, { "batch_size": 4, "epoch": 0.6984, "step": 1746, "tokens_per_device": 2612 }, { "epoch": 0.6984, "loss_ce": 0.2540608048439026, "loss_lvr": 0.8028796911239624, "loss_mode_switch": 0.0, "loss_total": 0.33434876799583435, "step": 1746 }, { "batch_size": 1, "epoch": 0.6984, "step": 1746, "tokens_per_device": 4897 }, { "epoch": 0.6984, "loss_ce": 0.1917913407087326, "loss_lvr": 0.7351475358009338, "loss_mode_switch": 0.0, "loss_total": 0.2653060853481293, "step": 1746 }, { "batch_size": 4, "epoch": 0.6984, "step": 1746, "tokens_per_device": 4716 }, { "epoch": 0.6984, "loss_ce": 0.8649104833602905, "loss_lvr": 0.9975838661193848, "loss_mode_switch": 0.0, "loss_total": 0.964668869972229, "step": 1746 }, { "epoch": 0.6988, "grad_norm": 1.3645131587982178, "learning_rate": 2.19628357181824e-06, "loss": 0.3254, "step": 1747 }, { "batch_size": 1, "epoch": 0.6988, "step": 1747, "tokens_per_device": 4866 }, { "epoch": 0.6988, "loss_ce": 0.16873273253440857, "loss_lvr": 0.21400125324726105, "loss_mode_switch": 0.0, "loss_total": 0.19013285636901855, "step": 1747 }, { "batch_size": 4, "epoch": 0.6988, "step": 1747, "tokens_per_device": 1420 }, { "epoch": 0.6988, "loss_ce": 0.2612074017524719, "loss_lvr": 0.9572779536247253, "loss_mode_switch": 0.0, "loss_total": 0.35693520307540894, "step": 1747 }, { "batch_size": 4, "epoch": 0.6988, "step": 1747, "tokens_per_device": 4584 }, { "epoch": 0.6988, "loss_ce": 0.19717073440551758, "loss_lvr": 0.719676673412323, "loss_mode_switch": 0.0, "loss_total": 0.2691383957862854, "step": 1747 }, { "batch_size": 4, "epoch": 0.6988, "step": 1747, "tokens_per_device": 3832 }, { "epoch": 0.6988, "loss_ce": 0.28403985500335693, "loss_lvr": 0.6016486883163452, "loss_mode_switch": 0.0, "loss_total": 0.34420472383499146, "step": 1747 }, { "batch_size": 4, "epoch": 0.6988, "step": 1747, "tokens_per_device": 5016 }, { "epoch": 0.6988, "loss_ce": 0.04177841916680336, "loss_lvr": 0.8337608575820923, "loss_mode_switch": 0.0, "loss_total": 0.125154510140419, "step": 1747 }, { "batch_size": 4, "epoch": 0.6988, "step": 1747, "tokens_per_device": 1280 }, { "epoch": 0.6988, "loss_ce": 0.431535005569458, "loss_lvr": 1.0762513875961304, "loss_mode_switch": 0.0, "loss_total": 0.5391601324081421, "step": 1747 }, { "batch_size": 4, "epoch": 0.6988, "step": 1747, "tokens_per_device": 4332 }, { "epoch": 0.6988, "loss_ce": 0.12386306375265121, "loss_lvr": 0.6359426379203796, "loss_mode_switch": 0.0, "loss_total": 0.18745732307434082, "step": 1747 }, { "batch_size": 1, "epoch": 0.6988, "step": 1747, "tokens_per_device": 5204 }, { "epoch": 0.6988, "loss_ce": 0.24334485828876495, "loss_lvr": 0.5080154538154602, "loss_mode_switch": 0.0, "loss_total": 0.29414641857147217, "step": 1747 }, { "epoch": 0.6992, "grad_norm": 1.2473958730697632, "learning_rate": 2.1909226139178723e-06, "loss": 0.2557, "step": 1748 }, { "batch_size": 4, "epoch": 0.6992, "step": 1748, "tokens_per_device": 3808 }, { "epoch": 0.6992, "loss_ce": 0.23404359817504883, "loss_lvr": 0.9507167935371399, "loss_mode_switch": 0.0, "loss_total": 0.32911527156829834, "step": 1748 }, { "batch_size": 1, "epoch": 0.6992, "step": 1748, "tokens_per_device": 4985 }, { "epoch": 0.6992, "loss_ce": 0.6071068644523621, "loss_lvr": 0.18359045684337616, "loss_mode_switch": 0.0, "loss_total": 0.6254659295082092, "step": 1748 }, { "batch_size": 4, "epoch": 0.6992, "step": 1748, "tokens_per_device": 1328 }, { "epoch": 0.6992, "loss_ce": 0.2856934666633606, "loss_lvr": 0.9913133978843689, "loss_mode_switch": 0.0, "loss_total": 0.38482481241226196, "step": 1748 }, { "batch_size": 4, "epoch": 0.6992, "step": 1748, "tokens_per_device": 4244 }, { "epoch": 0.6992, "loss_ce": 0.3810133934020996, "loss_lvr": 0.9379744529724121, "loss_mode_switch": 0.0, "loss_total": 0.4748108386993408, "step": 1748 }, { "batch_size": 4, "epoch": 0.6992, "step": 1748, "tokens_per_device": 2508 }, { "epoch": 0.6992, "loss_ce": 0.0339653380215168, "loss_lvr": 0.9735950231552124, "loss_mode_switch": 0.0, "loss_total": 0.13132484257221222, "step": 1748 }, { "batch_size": 4, "epoch": 0.6992, "step": 1748, "tokens_per_device": 4480 }, { "epoch": 0.6992, "loss_ce": 0.13686221837997437, "loss_lvr": 0.9005804061889648, "loss_mode_switch": 0.0, "loss_total": 0.2269202619791031, "step": 1748 }, { "batch_size": 4, "epoch": 0.6992, "step": 1748, "tokens_per_device": 15220 }, { "epoch": 0.6992, "loss_ce": 0.015589870512485504, "loss_lvr": 0.7309033870697021, "loss_mode_switch": 0.0, "loss_total": 0.0886802077293396, "step": 1748 }, { "batch_size": 1, "epoch": 0.6992, "step": 1748, "tokens_per_device": 4907 }, { "epoch": 0.6992, "loss_ce": 0.02821069210767746, "loss_lvr": 0.7521301507949829, "loss_mode_switch": 0.0, "loss_total": 0.10342370718717575, "step": 1748 }, { "epoch": 0.6996, "grad_norm": 1.213621735572815, "learning_rate": 2.1855663705637763e-06, "loss": 0.2677, "step": 1749 }, { "batch_size": 4, "epoch": 0.6996, "step": 1749, "tokens_per_device": 4304 }, { "epoch": 0.6996, "loss_ce": 0.23683686554431915, "loss_lvr": 0.8344677686691284, "loss_mode_switch": 0.0, "loss_total": 0.3202836513519287, "step": 1749 }, { "batch_size": 4, "epoch": 0.6996, "step": 1749, "tokens_per_device": 1244 }, { "epoch": 0.6996, "loss_ce": 0.021613096818327904, "loss_lvr": 1.0725817680358887, "loss_mode_switch": 0.0, "loss_total": 0.12887127697467804, "step": 1749 }, { "batch_size": 4, "epoch": 0.6996, "step": 1749, "tokens_per_device": 4788 }, { "epoch": 0.6996, "loss_ce": 0.3152373731136322, "loss_lvr": 0.732307493686676, "loss_mode_switch": 0.0, "loss_total": 0.3884681165218353, "step": 1749 }, { "batch_size": 4, "epoch": 0.6996, "step": 1749, "tokens_per_device": 1412 }, { "epoch": 0.6996, "loss_ce": 0.7537657022476196, "loss_lvr": 0.8507053256034851, "loss_mode_switch": 0.0, "loss_total": 0.8388362526893616, "step": 1749 }, { "batch_size": 4, "epoch": 0.6996, "step": 1749, "tokens_per_device": 12824 }, { "epoch": 0.6996, "loss_ce": 0.017104143276810646, "loss_lvr": 0.618302583694458, "loss_mode_switch": 0.0, "loss_total": 0.07893440127372742, "step": 1749 }, { "batch_size": 1, "epoch": 0.6996, "step": 1749, "tokens_per_device": 5186 }, { "epoch": 0.6996, "loss_ce": 0.14411449432373047, "loss_lvr": 0.3250313997268677, "loss_mode_switch": 0.0, "loss_total": 0.17661763727664948, "step": 1749 }, { "batch_size": 1, "epoch": 0.6996, "step": 1749, "tokens_per_device": 4960 }, { "epoch": 0.6996, "loss_ce": 0.018832365050911903, "loss_lvr": 0.473662793636322, "loss_mode_switch": 0.0, "loss_total": 0.06619864702224731, "step": 1749 }, { "batch_size": 1, "epoch": 0.6996, "step": 1749, "tokens_per_device": 4896 }, { "epoch": 0.6996, "loss_ce": 0.005187665577977896, "loss_lvr": 0.6767864227294922, "loss_mode_switch": 0.0, "loss_total": 0.07286631315946579, "step": 1749 }, { "epoch": 0.7, "grad_norm": 1.2111284732818604, "learning_rate": 2.1802148507454675e-06, "loss": 0.2684, "step": 1750 }, { "batch_size": 4, "epoch": 0.7, "step": 1750, "tokens_per_device": 4264 }, { "epoch": 0.7, "loss_ce": 0.007376207038760185, "loss_lvr": 0.8852207660675049, "loss_mode_switch": 0.0, "loss_total": 0.09589828550815582, "step": 1750 }, { "batch_size": 4, "epoch": 0.7, "step": 1750, "tokens_per_device": 2632 }, { "epoch": 0.7, "loss_ce": 0.5333443284034729, "loss_lvr": 0.8633975386619568, "loss_mode_switch": 0.0, "loss_total": 0.619684100151062, "step": 1750 }, { "batch_size": 4, "epoch": 0.7, "step": 1750, "tokens_per_device": 3016 }, { "epoch": 0.7, "loss_ce": 0.06391710788011551, "loss_lvr": 0.9526215195655823, "loss_mode_switch": 0.0, "loss_total": 0.15917927026748657, "step": 1750 }, { "batch_size": 4, "epoch": 0.7, "step": 1750, "tokens_per_device": 5752 }, { "epoch": 0.7, "loss_ce": 0.10028928518295288, "loss_lvr": 0.79509437084198, "loss_mode_switch": 0.0, "loss_total": 0.17979872226715088, "step": 1750 }, { "batch_size": 1, "epoch": 0.7, "step": 1750, "tokens_per_device": 5112 }, { "epoch": 0.7, "loss_ce": 0.0047253514640033245, "loss_lvr": 0.29907411336898804, "loss_mode_switch": 0.0, "loss_total": 0.034632761031389236, "step": 1750 }, { "batch_size": 4, "epoch": 0.7, "step": 1750, "tokens_per_device": 14052 }, { "epoch": 0.7, "loss_ce": 0.08072055876255035, "loss_lvr": 0.7333225011825562, "loss_mode_switch": 0.0, "loss_total": 0.15405280888080597, "step": 1750 }, { "batch_size": 4, "epoch": 0.7, "step": 1750, "tokens_per_device": 2584 }, { "epoch": 0.7, "loss_ce": 0.6401337385177612, "loss_lvr": 0.9181726574897766, "loss_mode_switch": 0.0, "loss_total": 0.7319509983062744, "step": 1750 }, { "batch_size": 4, "epoch": 0.7, "step": 1750, "tokens_per_device": 6328 }, { "epoch": 0.7, "loss_ce": 0.31593725085258484, "loss_lvr": 0.7406011819839478, "loss_mode_switch": 0.0, "loss_total": 0.38999736309051514, "step": 1750 }, { "epoch": 0.7004, "grad_norm": 1.5024102926254272, "learning_rate": 2.174868063444542e-06, "loss": 0.3195, "step": 1751 }, { "batch_size": 4, "epoch": 0.7004, "step": 1751, "tokens_per_device": 4412 }, { "epoch": 0.7004, "loss_ce": 0.14440898597240448, "loss_lvr": 0.6178798079490662, "loss_mode_switch": 0.0, "loss_total": 0.20619696378707886, "step": 1751 }, { "batch_size": 1, "epoch": 0.7004, "step": 1751, "tokens_per_device": 4622 }, { "epoch": 0.7004, "loss_ce": 0.0025319275446236134, "loss_lvr": 0.5116668343544006, "loss_mode_switch": 0.0, "loss_total": 0.05369861051440239, "step": 1751 }, { "batch_size": 4, "epoch": 0.7004, "step": 1751, "tokens_per_device": 1404 }, { "epoch": 0.7004, "loss_ce": 0.5060851573944092, "loss_lvr": 1.278570532798767, "loss_mode_switch": 0.0, "loss_total": 0.6339422464370728, "step": 1751 }, { "batch_size": 4, "epoch": 0.7004, "step": 1751, "tokens_per_device": 4204 }, { "epoch": 0.7004, "loss_ce": 0.39143216609954834, "loss_lvr": 0.9574096202850342, "loss_mode_switch": 0.0, "loss_total": 0.4871731400489807, "step": 1751 }, { "batch_size": 4, "epoch": 0.7004, "step": 1751, "tokens_per_device": 4624 }, { "epoch": 0.7004, "loss_ce": 0.6242851614952087, "loss_lvr": 0.8561038970947266, "loss_mode_switch": 0.0, "loss_total": 0.7098955512046814, "step": 1751 }, { "batch_size": 1, "epoch": 0.7004, "step": 1751, "tokens_per_device": 4897 }, { "epoch": 0.7004, "loss_ce": 0.012254077941179276, "loss_lvr": 0.29121121764183044, "loss_mode_switch": 0.0, "loss_total": 0.04137519747018814, "step": 1751 }, { "batch_size": 1, "epoch": 0.7004, "step": 1751, "tokens_per_device": 4735 }, { "epoch": 0.7004, "loss_ce": 0.005152491852641106, "loss_lvr": 0.33293554186820984, "loss_mode_switch": 0.0, "loss_total": 0.03844604641199112, "step": 1751 }, { "batch_size": 4, "epoch": 0.7004, "step": 1751, "tokens_per_device": 5956 }, { "epoch": 0.7004, "loss_ce": 0.38577908277511597, "loss_lvr": 0.6939637660980225, "loss_mode_switch": 0.0, "loss_total": 0.4551754593849182, "step": 1751 }, { "epoch": 0.7008, "grad_norm": 1.2169522047042847, "learning_rate": 2.1695260176346453e-06, "loss": 0.2711, "step": 1752 }, { "batch_size": 1, "epoch": 0.7008, "step": 1752, "tokens_per_device": 7140 }, { "epoch": 0.7008, "loss_ce": 0.00021170705440454185, "loss_lvr": 0.3061887323856354, "loss_mode_switch": 0.0, "loss_total": 0.03083058074116707, "step": 1752 }, { "batch_size": 1, "epoch": 0.7008, "step": 1752, "tokens_per_device": 4894 }, { "epoch": 0.7008, "loss_ce": 1.2084957361221313, "loss_lvr": 0.8011269569396973, "loss_mode_switch": 0.0, "loss_total": 1.288608431816101, "step": 1752 }, { "batch_size": 4, "epoch": 0.7008, "step": 1752, "tokens_per_device": 1788 }, { "epoch": 0.7008, "loss_ce": 0.5508299469947815, "loss_lvr": 0.9705255627632141, "loss_mode_switch": 0.0, "loss_total": 0.6478825211524963, "step": 1752 }, { "batch_size": 1, "epoch": 0.7008, "step": 1752, "tokens_per_device": 4677 }, { "epoch": 0.7008, "loss_ce": 0.034198205918073654, "loss_lvr": 0.44106003642082214, "loss_mode_switch": 0.0, "loss_total": 0.07830420881509781, "step": 1752 }, { "batch_size": 4, "epoch": 0.7008, "step": 1752, "tokens_per_device": 4964 }, { "epoch": 0.7008, "loss_ce": 0.273686021566391, "loss_lvr": 0.8893056511878967, "loss_mode_switch": 0.0, "loss_total": 0.3626165986061096, "step": 1752 }, { "batch_size": 4, "epoch": 0.7008, "step": 1752, "tokens_per_device": 3344 }, { "epoch": 0.7008, "loss_ce": 0.07545725256204605, "loss_lvr": 0.5822209119796753, "loss_mode_switch": 0.0, "loss_total": 0.1336793452501297, "step": 1752 }, { "batch_size": 4, "epoch": 0.7008, "step": 1752, "tokens_per_device": 4640 }, { "epoch": 0.7008, "loss_ce": 0.1176433265209198, "loss_lvr": 0.779166042804718, "loss_mode_switch": 0.0, "loss_total": 0.19555993378162384, "step": 1752 }, { "batch_size": 4, "epoch": 0.7008, "step": 1752, "tokens_per_device": 5632 }, { "epoch": 0.7008, "loss_ce": 0.06640961766242981, "loss_lvr": 0.7414665222167969, "loss_mode_switch": 0.0, "loss_total": 0.14055627584457397, "step": 1752 }, { "epoch": 0.7012, "grad_norm": 1.2906421422958374, "learning_rate": 2.164188722281474e-06, "loss": 0.2647, "step": 1753 }, { "batch_size": 4, "epoch": 0.7012, "step": 1753, "tokens_per_device": 1676 }, { "epoch": 0.7012, "loss_ce": 0.7067642211914062, "loss_lvr": 0.9686050415039062, "loss_mode_switch": 0.0, "loss_total": 0.8036247491836548, "step": 1753 }, { "batch_size": 4, "epoch": 0.7012, "step": 1753, "tokens_per_device": 2648 }, { "epoch": 0.7012, "loss_ce": 0.09980625659227371, "loss_lvr": 0.5834019184112549, "loss_mode_switch": 0.0, "loss_total": 0.1581464409828186, "step": 1753 }, { "batch_size": 1, "epoch": 0.7012, "step": 1753, "tokens_per_device": 5224 }, { "epoch": 0.7012, "loss_ce": 0.18087930977344513, "loss_lvr": 0.35785746574401855, "loss_mode_switch": 0.0, "loss_total": 0.21666505932807922, "step": 1753 }, { "batch_size": 4, "epoch": 0.7012, "step": 1753, "tokens_per_device": 3940 }, { "epoch": 0.7012, "loss_ce": 0.09898153692483902, "loss_lvr": 0.7596766352653503, "loss_mode_switch": 0.0, "loss_total": 0.17494919896125793, "step": 1753 }, { "batch_size": 4, "epoch": 0.7012, "step": 1753, "tokens_per_device": 4864 }, { "epoch": 0.7012, "loss_ce": 0.29356464743614197, "loss_lvr": 0.6346104741096497, "loss_mode_switch": 0.0, "loss_total": 0.357025682926178, "step": 1753 }, { "batch_size": 1, "epoch": 0.7012, "step": 1753, "tokens_per_device": 4940 }, { "epoch": 0.7012, "loss_ce": 0.7958665490150452, "loss_lvr": 0.2790297567844391, "loss_mode_switch": 0.0, "loss_total": 0.8237695097923279, "step": 1753 }, { "batch_size": 4, "epoch": 0.7012, "step": 1753, "tokens_per_device": 8456 }, { "epoch": 0.7012, "loss_ce": 0.722668468952179, "loss_lvr": 0.9535718560218811, "loss_mode_switch": 0.0, "loss_total": 0.8180256485939026, "step": 1753 }, { "batch_size": 4, "epoch": 0.7012, "step": 1753, "tokens_per_device": 3976 }, { "epoch": 0.7012, "loss_ce": 0.48171672224998474, "loss_lvr": 0.8379581570625305, "loss_mode_switch": 0.0, "loss_total": 0.5655125379562378, "step": 1753 }, { "epoch": 0.7016, "grad_norm": 1.2819287776947021, "learning_rate": 2.158856186342745e-06, "loss": 0.2663, "step": 1754 }, { "batch_size": 4, "epoch": 0.7016, "step": 1754, "tokens_per_device": 5168 }, { "epoch": 0.7016, "loss_ce": 0.6554867625236511, "loss_lvr": 0.8442501425743103, "loss_mode_switch": 0.0, "loss_total": 0.7399117946624756, "step": 1754 }, { "batch_size": 4, "epoch": 0.7016, "step": 1754, "tokens_per_device": 1540 }, { "epoch": 0.7016, "loss_ce": 0.14419081807136536, "loss_lvr": 0.9024916291236877, "loss_mode_switch": 0.0, "loss_total": 0.23443998396396637, "step": 1754 }, { "batch_size": 1, "epoch": 0.7016, "step": 1754, "tokens_per_device": 5133 }, { "epoch": 0.7016, "loss_ce": 0.025897962972521782, "loss_lvr": 0.3449588716030121, "loss_mode_switch": 0.0, "loss_total": 0.06039384752511978, "step": 1754 }, { "batch_size": 4, "epoch": 0.7016, "step": 1754, "tokens_per_device": 1608 }, { "epoch": 0.7016, "loss_ce": 0.31164008378982544, "loss_lvr": 0.8557978272438049, "loss_mode_switch": 0.0, "loss_total": 0.39721986651420593, "step": 1754 }, { "batch_size": 4, "epoch": 0.7016, "step": 1754, "tokens_per_device": 1300 }, { "epoch": 0.7016, "loss_ce": 0.7599443197250366, "loss_lvr": 1.2442864179611206, "loss_mode_switch": 0.0, "loss_total": 0.8843729496002197, "step": 1754 }, { "batch_size": 4, "epoch": 0.7016, "step": 1754, "tokens_per_device": 4124 }, { "epoch": 0.7016, "loss_ce": 0.4903087615966797, "loss_lvr": 0.7979131937026978, "loss_mode_switch": 0.0, "loss_total": 0.5701000690460205, "step": 1754 }, { "batch_size": 4, "epoch": 0.7016, "step": 1754, "tokens_per_device": 7692 }, { "epoch": 0.7016, "loss_ce": 0.20264053344726562, "loss_lvr": 0.8044205904006958, "loss_mode_switch": 0.0, "loss_total": 0.28308260440826416, "step": 1754 }, { "batch_size": 4, "epoch": 0.7016, "step": 1754, "tokens_per_device": 2612 }, { "epoch": 0.7016, "loss_ce": 0.2354668825864792, "loss_lvr": 0.8557668328285217, "loss_mode_switch": 0.0, "loss_total": 0.32104355096817017, "step": 1754 }, { "epoch": 0.702, "grad_norm": 1.2144516706466675, "learning_rate": 2.1535284187681866e-06, "loss": 0.2725, "step": 1755 }, { "batch_size": 1, "epoch": 0.702, "step": 1755, "tokens_per_device": 4876 }, { "epoch": 0.702, "loss_ce": 0.7827950716018677, "loss_lvr": 0.5431286096572876, "loss_mode_switch": 0.0, "loss_total": 0.8371079564094543, "step": 1755 }, { "batch_size": 1, "epoch": 0.702, "step": 1755, "tokens_per_device": 4771 }, { "epoch": 0.702, "loss_ce": 0.009094025939702988, "loss_lvr": 0.1880977749824524, "loss_mode_switch": 0.0, "loss_total": 0.027903804555535316, "step": 1755 }, { "batch_size": 4, "epoch": 0.702, "step": 1755, "tokens_per_device": 4048 }, { "epoch": 0.702, "loss_ce": 0.036880604922771454, "loss_lvr": 0.9195858240127563, "loss_mode_switch": 0.0, "loss_total": 0.12883919477462769, "step": 1755 }, { "batch_size": 4, "epoch": 0.702, "step": 1755, "tokens_per_device": 6508 }, { "epoch": 0.702, "loss_ce": 0.40413856506347656, "loss_lvr": 0.6562101244926453, "loss_mode_switch": 0.0, "loss_total": 0.46975958347320557, "step": 1755 }, { "batch_size": 4, "epoch": 0.702, "step": 1755, "tokens_per_device": 3984 }, { "epoch": 0.702, "loss_ce": 0.5524190664291382, "loss_lvr": 0.9616245627403259, "loss_mode_switch": 0.0, "loss_total": 0.6485815048217773, "step": 1755 }, { "batch_size": 4, "epoch": 0.702, "step": 1755, "tokens_per_device": 4484 }, { "epoch": 0.702, "loss_ce": 0.2773425281047821, "loss_lvr": 0.44173508882522583, "loss_mode_switch": 0.0, "loss_total": 0.3215160369873047, "step": 1755 }, { "batch_size": 4, "epoch": 0.702, "step": 1755, "tokens_per_device": 2828 }, { "epoch": 0.702, "loss_ce": 0.4759114682674408, "loss_lvr": 0.6956717371940613, "loss_mode_switch": 0.0, "loss_total": 0.5454786419868469, "step": 1755 }, { "batch_size": 4, "epoch": 0.702, "step": 1755, "tokens_per_device": 1228 }, { "epoch": 0.702, "loss_ce": 0.29357069730758667, "loss_lvr": 1.010621190071106, "loss_mode_switch": 0.0, "loss_total": 0.39463281631469727, "step": 1755 }, { "epoch": 0.7024, "grad_norm": 1.5967090129852295, "learning_rate": 2.148205428499531e-06, "loss": 0.3417, "step": 1756 }, { "batch_size": 4, "epoch": 0.7024, "step": 1756, "tokens_per_device": 3980 }, { "epoch": 0.7024, "loss_ce": 0.4456803798675537, "loss_lvr": 1.054648995399475, "loss_mode_switch": 0.0, "loss_total": 0.5511452555656433, "step": 1756 }, { "batch_size": 1, "epoch": 0.7024, "step": 1756, "tokens_per_device": 4862 }, { "epoch": 0.7024, "loss_ce": 0.009669557213783264, "loss_lvr": 0.3715592920780182, "loss_mode_switch": 0.0, "loss_total": 0.04682548716664314, "step": 1756 }, { "batch_size": 4, "epoch": 0.7024, "step": 1756, "tokens_per_device": 10316 }, { "epoch": 0.7024, "loss_ce": 0.15347819030284882, "loss_lvr": 0.7066283226013184, "loss_mode_switch": 0.0, "loss_total": 0.22414103150367737, "step": 1756 }, { "batch_size": 1, "epoch": 0.7024, "step": 1756, "tokens_per_device": 4148 }, { "epoch": 0.7024, "loss_ce": 0.008493566885590553, "loss_lvr": 0.30918940901756287, "loss_mode_switch": 0.0, "loss_total": 0.03941250592470169, "step": 1756 }, { "batch_size": 4, "epoch": 0.7024, "step": 1756, "tokens_per_device": 11788 }, { "epoch": 0.7024, "loss_ce": 0.2466588020324707, "loss_lvr": 0.4557502865791321, "loss_mode_switch": 0.0, "loss_total": 0.29223382472991943, "step": 1756 }, { "batch_size": 1, "epoch": 0.7024, "step": 1756, "tokens_per_device": 4873 }, { "epoch": 0.7024, "loss_ce": 0.029756419360637665, "loss_lvr": 0.3227332532405853, "loss_mode_switch": 0.0, "loss_total": 0.06202974542975426, "step": 1756 }, { "batch_size": 4, "epoch": 0.7024, "step": 1756, "tokens_per_device": 5512 }, { "epoch": 0.7024, "loss_ce": 0.21061377227306366, "loss_lvr": 0.5382145047187805, "loss_mode_switch": 0.0, "loss_total": 0.2644352316856384, "step": 1756 }, { "batch_size": 1, "epoch": 0.7024, "step": 1756, "tokens_per_device": 5040 }, { "epoch": 0.7024, "loss_ce": 0.030671199783682823, "loss_lvr": 0.2608402669429779, "loss_mode_switch": 0.0, "loss_total": 0.056755226105451584, "step": 1756 }, { "epoch": 0.7028, "grad_norm": 1.2223780155181885, "learning_rate": 2.1428872244704862e-06, "loss": 0.2745, "step": 1757 }, { "batch_size": 4, "epoch": 0.7028, "step": 1757, "tokens_per_device": 4328 }, { "epoch": 0.7028, "loss_ce": 0.15758194029331207, "loss_lvr": 0.7962031364440918, "loss_mode_switch": 0.0, "loss_total": 0.2372022569179535, "step": 1757 }, { "batch_size": 1, "epoch": 0.7028, "step": 1757, "tokens_per_device": 4853 }, { "epoch": 0.7028, "loss_ce": 0.16234715282917023, "loss_lvr": 0.32528573274612427, "loss_mode_switch": 0.0, "loss_total": 0.19487573206424713, "step": 1757 }, { "batch_size": 4, "epoch": 0.7028, "step": 1757, "tokens_per_device": 9280 }, { "epoch": 0.7028, "loss_ce": 0.4208712577819824, "loss_lvr": 0.36006492376327515, "loss_mode_switch": 0.0, "loss_total": 0.456877738237381, "step": 1757 }, { "batch_size": 4, "epoch": 0.7028, "step": 1757, "tokens_per_device": 6600 }, { "epoch": 0.7028, "loss_ce": 0.22510409355163574, "loss_lvr": 0.6977260112762451, "loss_mode_switch": 0.0, "loss_total": 0.29487669467926025, "step": 1757 }, { "batch_size": 4, "epoch": 0.7028, "step": 1757, "tokens_per_device": 1296 }, { "epoch": 0.7028, "loss_ce": 0.9463084936141968, "loss_lvr": 1.039223074913025, "loss_mode_switch": 0.0, "loss_total": 1.0502307415008545, "step": 1757 }, { "batch_size": 4, "epoch": 0.7028, "step": 1757, "tokens_per_device": 5940 }, { "epoch": 0.7028, "loss_ce": 0.5398862361907959, "loss_lvr": 0.846329927444458, "loss_mode_switch": 0.0, "loss_total": 0.6245192289352417, "step": 1757 }, { "batch_size": 4, "epoch": 0.7028, "step": 1757, "tokens_per_device": 1360 }, { "epoch": 0.7028, "loss_ce": 0.30701813101768494, "loss_lvr": 1.0174534320831299, "loss_mode_switch": 0.0, "loss_total": 0.40876346826553345, "step": 1757 }, { "batch_size": 4, "epoch": 0.7028, "step": 1757, "tokens_per_device": 1612 }, { "epoch": 0.7028, "loss_ce": 0.25788792967796326, "loss_lvr": 1.8005924224853516, "loss_mode_switch": 0.0, "loss_total": 0.437947154045105, "step": 1757 }, { "epoch": 0.7032, "grad_norm": 1.4118205308914185, "learning_rate": 2.1375738156067327e-06, "loss": 0.3496, "step": 1758 }, { "batch_size": 4, "epoch": 0.7032, "step": 1758, "tokens_per_device": 4080 }, { "epoch": 0.7032, "loss_ce": 0.29437875747680664, "loss_lvr": 0.5903205275535583, "loss_mode_switch": 0.0, "loss_total": 0.3534108102321625, "step": 1758 }, { "batch_size": 4, "epoch": 0.7032, "step": 1758, "tokens_per_device": 3952 }, { "epoch": 0.7032, "loss_ce": 0.48948773741722107, "loss_lvr": 0.789055347442627, "loss_mode_switch": 0.0, "loss_total": 0.5683932900428772, "step": 1758 }, { "batch_size": 1, "epoch": 0.7032, "step": 1758, "tokens_per_device": 5112 }, { "epoch": 0.7032, "loss_ce": 0.006246950943022966, "loss_lvr": 0.24670390784740448, "loss_mode_switch": 0.0, "loss_total": 0.030917340889573097, "step": 1758 }, { "batch_size": 1, "epoch": 0.7032, "step": 1758, "tokens_per_device": 5058 }, { "epoch": 0.7032, "loss_ce": 0.15123961865901947, "loss_lvr": 0.36568132042884827, "loss_mode_switch": 0.0, "loss_total": 0.18780775368213654, "step": 1758 }, { "batch_size": 4, "epoch": 0.7032, "step": 1758, "tokens_per_device": 8264 }, { "epoch": 0.7032, "loss_ce": 0.18943460285663605, "loss_lvr": 0.5168633460998535, "loss_mode_switch": 0.0, "loss_total": 0.24112093448638916, "step": 1758 }, { "batch_size": 4, "epoch": 0.7032, "step": 1758, "tokens_per_device": 2848 }, { "epoch": 0.7032, "loss_ce": 0.22671696543693542, "loss_lvr": 0.6321751475334167, "loss_mode_switch": 0.0, "loss_total": 0.2899344861507416, "step": 1758 }, { "batch_size": 1, "epoch": 0.7032, "step": 1758, "tokens_per_device": 4815 }, { "epoch": 0.7032, "loss_ce": 0.07814303040504456, "loss_lvr": 0.20809218287467957, "loss_mode_switch": 0.0, "loss_total": 0.09895224869251251, "step": 1758 }, { "batch_size": 4, "epoch": 0.7032, "step": 1758, "tokens_per_device": 2668 }, { "epoch": 0.7032, "loss_ce": 0.27810904383659363, "loss_lvr": 0.694737434387207, "loss_mode_switch": 0.0, "loss_total": 0.34758278727531433, "step": 1758 }, { "epoch": 0.7036, "grad_norm": 1.1996287107467651, "learning_rate": 2.132265210825896e-06, "loss": 0.2716, "step": 1759 }, { "batch_size": 4, "epoch": 0.7036, "step": 1759, "tokens_per_device": 4272 }, { "epoch": 0.7036, "loss_ce": 0.07560727000236511, "loss_lvr": 0.9168890714645386, "loss_mode_switch": 0.0, "loss_total": 0.1672961711883545, "step": 1759 }, { "batch_size": 4, "epoch": 0.7036, "step": 1759, "tokens_per_device": 6012 }, { "epoch": 0.7036, "loss_ce": 0.4517151117324829, "loss_lvr": 0.674239456653595, "loss_mode_switch": 0.0, "loss_total": 0.5191390514373779, "step": 1759 }, { "batch_size": 4, "epoch": 0.7036, "step": 1759, "tokens_per_device": 3852 }, { "epoch": 0.7036, "loss_ce": 0.016532886773347855, "loss_lvr": 0.780316174030304, "loss_mode_switch": 0.0, "loss_total": 0.0945645123720169, "step": 1759 }, { "batch_size": 1, "epoch": 0.7036, "step": 1759, "tokens_per_device": 5109 }, { "epoch": 0.7036, "loss_ce": 0.2742878496646881, "loss_lvr": 0.3688996434211731, "loss_mode_switch": 0.0, "loss_total": 0.3111778199672699, "step": 1759 }, { "batch_size": 4, "epoch": 0.7036, "step": 1759, "tokens_per_device": 5696 }, { "epoch": 0.7036, "loss_ce": 0.1361023336648941, "loss_lvr": 0.7756600975990295, "loss_mode_switch": 0.0, "loss_total": 0.2136683464050293, "step": 1759 }, { "batch_size": 1, "epoch": 0.7036, "step": 1759, "tokens_per_device": 5096 }, { "epoch": 0.7036, "loss_ce": 0.18390198051929474, "loss_lvr": 0.48635241389274597, "loss_mode_switch": 0.0, "loss_total": 0.23253722488880157, "step": 1759 }, { "batch_size": 4, "epoch": 0.7036, "step": 1759, "tokens_per_device": 4076 }, { "epoch": 0.7036, "loss_ce": 0.011867803521454334, "loss_lvr": 0.9298549294471741, "loss_mode_switch": 0.0, "loss_total": 0.1048533022403717, "step": 1759 }, { "batch_size": 1, "epoch": 0.7036, "step": 1759, "tokens_per_device": 4893 }, { "epoch": 0.7036, "loss_ce": 0.0010801743483170867, "loss_lvr": 0.4514313340187073, "loss_mode_switch": 0.0, "loss_total": 0.04622330889105797, "step": 1759 }, { "epoch": 0.704, "grad_norm": 1.1699198484420776, "learning_rate": 2.1269614190375477e-06, "loss": 0.2462, "step": 1760 }, { "batch_size": 4, "epoch": 0.704, "step": 1760, "tokens_per_device": 4744 }, { "epoch": 0.704, "loss_ce": 0.5317344069480896, "loss_lvr": 0.8402113318443298, "loss_mode_switch": 0.0, "loss_total": 0.615755558013916, "step": 1760 }, { "batch_size": 4, "epoch": 0.704, "step": 1760, "tokens_per_device": 14120 }, { "epoch": 0.704, "loss_ce": 0.1469876915216446, "loss_lvr": 0.9456987977027893, "loss_mode_switch": 0.0, "loss_total": 0.24155756831169128, "step": 1760 }, { "batch_size": 1, "epoch": 0.704, "step": 1760, "tokens_per_device": 5103 }, { "epoch": 0.704, "loss_ce": 0.00459762429818511, "loss_lvr": 0.31457996368408203, "loss_mode_switch": 0.0, "loss_total": 0.03605562075972557, "step": 1760 }, { "batch_size": 4, "epoch": 0.704, "step": 1760, "tokens_per_device": 3848 }, { "epoch": 0.704, "loss_ce": 0.15756380558013916, "loss_lvr": 0.7993460893630981, "loss_mode_switch": 0.0, "loss_total": 0.2374984174966812, "step": 1760 }, { "batch_size": 4, "epoch": 0.704, "step": 1760, "tokens_per_device": 4824 }, { "epoch": 0.704, "loss_ce": 0.08717595040798187, "loss_lvr": 0.9507900476455688, "loss_mode_switch": 0.0, "loss_total": 0.18225495517253876, "step": 1760 }, { "batch_size": 4, "epoch": 0.704, "step": 1760, "tokens_per_device": 5612 }, { "epoch": 0.704, "loss_ce": 0.11206185072660446, "loss_lvr": 0.7692531943321228, "loss_mode_switch": 0.0, "loss_total": 0.18898716568946838, "step": 1760 }, { "batch_size": 4, "epoch": 0.704, "step": 1760, "tokens_per_device": 4488 }, { "epoch": 0.704, "loss_ce": 0.3343411087989807, "loss_lvr": 0.9093561768531799, "loss_mode_switch": 0.0, "loss_total": 0.4252767264842987, "step": 1760 }, { "batch_size": 4, "epoch": 0.704, "step": 1760, "tokens_per_device": 2616 }, { "epoch": 0.704, "loss_ce": 0.08303304761648178, "loss_lvr": 0.9408308863639832, "loss_mode_switch": 0.0, "loss_total": 0.17711614072322845, "step": 1760 }, { "epoch": 0.7044, "grad_norm": 1.41753089427948, "learning_rate": 2.1216624491431744e-06, "loss": 0.3406, "step": 1761 }, { "batch_size": 1, "epoch": 0.7044, "step": 1761, "tokens_per_device": 5270 }, { "epoch": 0.7044, "loss_ce": 0.00970504991710186, "loss_lvr": 0.5109882354736328, "loss_mode_switch": 0.0, "loss_total": 0.06080387532711029, "step": 1761 }, { "batch_size": 4, "epoch": 0.7044, "step": 1761, "tokens_per_device": 4192 }, { "epoch": 0.7044, "loss_ce": 0.30194681882858276, "loss_lvr": 0.9366898536682129, "loss_mode_switch": 0.0, "loss_total": 0.395615816116333, "step": 1761 }, { "batch_size": 1, "epoch": 0.7044, "step": 1761, "tokens_per_device": 5102 }, { "epoch": 0.7044, "loss_ce": 0.0026894030161201954, "loss_lvr": 0.6150428056716919, "loss_mode_switch": 0.0, "loss_total": 0.06419368833303452, "step": 1761 }, { "batch_size": 4, "epoch": 0.7044, "step": 1761, "tokens_per_device": 4072 }, { "epoch": 0.7044, "loss_ce": 0.24647821485996246, "loss_lvr": 0.8662354946136475, "loss_mode_switch": 0.0, "loss_total": 0.333101749420166, "step": 1761 }, { "batch_size": 4, "epoch": 0.7044, "step": 1761, "tokens_per_device": 4548 }, { "epoch": 0.7044, "loss_ce": 0.2567749619483948, "loss_lvr": 0.9045836329460144, "loss_mode_switch": 0.0, "loss_total": 0.3472333252429962, "step": 1761 }, { "batch_size": 4, "epoch": 0.7044, "step": 1761, "tokens_per_device": 4716 }, { "epoch": 0.7044, "loss_ce": 0.5401284694671631, "loss_lvr": 0.8322834968566895, "loss_mode_switch": 0.0, "loss_total": 0.623356819152832, "step": 1761 }, { "batch_size": 1, "epoch": 0.7044, "step": 1761, "tokens_per_device": 4921 }, { "epoch": 0.7044, "loss_ce": 0.23666657507419586, "loss_lvr": 0.4959670305252075, "loss_mode_switch": 0.0, "loss_total": 0.28626328706741333, "step": 1761 }, { "batch_size": 4, "epoch": 0.7044, "step": 1761, "tokens_per_device": 5832 }, { "epoch": 0.7044, "loss_ce": 0.23687131702899933, "loss_lvr": 0.68767249584198, "loss_mode_switch": 0.0, "loss_total": 0.30563855171203613, "step": 1761 }, { "epoch": 0.7048, "grad_norm": 1.3025301694869995, "learning_rate": 2.1163683100361702e-06, "loss": 0.3157, "step": 1762 }, { "batch_size": 1, "epoch": 0.7048, "step": 1762, "tokens_per_device": 4874 }, { "epoch": 0.7048, "loss_ce": 0.16610100865364075, "loss_lvr": 0.8811870813369751, "loss_mode_switch": 0.0, "loss_total": 0.2542197108268738, "step": 1762 }, { "batch_size": 4, "epoch": 0.7048, "step": 1762, "tokens_per_device": 4552 }, { "epoch": 0.7048, "loss_ce": 0.1193428486585617, "loss_lvr": 0.905828058719635, "loss_mode_switch": 0.0, "loss_total": 0.20992565155029297, "step": 1762 }, { "batch_size": 4, "epoch": 0.7048, "step": 1762, "tokens_per_device": 5524 }, { "epoch": 0.7048, "loss_ce": 0.31187868118286133, "loss_lvr": 0.8319716453552246, "loss_mode_switch": 0.0, "loss_total": 0.39507585763931274, "step": 1762 }, { "batch_size": 4, "epoch": 0.7048, "step": 1762, "tokens_per_device": 4564 }, { "epoch": 0.7048, "loss_ce": 0.14215488731861115, "loss_lvr": 0.8533586859703064, "loss_mode_switch": 0.0, "loss_total": 0.22749075293540955, "step": 1762 }, { "batch_size": 4, "epoch": 0.7048, "step": 1762, "tokens_per_device": 5992 }, { "epoch": 0.7048, "loss_ce": 0.03484668955206871, "loss_lvr": 1.3049180507659912, "loss_mode_switch": 0.0, "loss_total": 0.16533850133419037, "step": 1762 }, { "batch_size": 4, "epoch": 0.7048, "step": 1762, "tokens_per_device": 4356 }, { "epoch": 0.7048, "loss_ce": 0.5257628560066223, "loss_lvr": 0.8146227598190308, "loss_mode_switch": 0.0, "loss_total": 0.6072251200675964, "step": 1762 }, { "batch_size": 4, "epoch": 0.7048, "step": 1762, "tokens_per_device": 4184 }, { "epoch": 0.7048, "loss_ce": 0.27080512046813965, "loss_lvr": 0.9686532616615295, "loss_mode_switch": 0.0, "loss_total": 0.3676704466342926, "step": 1762 }, { "batch_size": 1, "epoch": 0.7048, "step": 1762, "tokens_per_device": 4941 }, { "epoch": 0.7048, "loss_ce": 0.429027259349823, "loss_lvr": 0.23846372961997986, "loss_mode_switch": 0.0, "loss_total": 0.4528736472129822, "step": 1762 }, { "epoch": 0.7052, "grad_norm": 1.352030873298645, "learning_rate": 2.1110790106018286e-06, "loss": 0.2877, "step": 1763 }, { "batch_size": 4, "epoch": 0.7052, "step": 1763, "tokens_per_device": 3796 }, { "epoch": 0.7052, "loss_ce": 0.04456206411123276, "loss_lvr": 0.8476436138153076, "loss_mode_switch": 0.0, "loss_total": 0.12932643294334412, "step": 1763 }, { "batch_size": 1, "epoch": 0.7052, "step": 1763, "tokens_per_device": 5093 }, { "epoch": 0.7052, "loss_ce": 0.025088835507631302, "loss_lvr": 0.7420016527175903, "loss_mode_switch": 0.0, "loss_total": 0.09928900003433228, "step": 1763 }, { "batch_size": 4, "epoch": 0.7052, "step": 1763, "tokens_per_device": 1228 }, { "epoch": 0.7052, "loss_ce": 0.5686330199241638, "loss_lvr": 1.1307145357131958, "loss_mode_switch": 0.0, "loss_total": 0.6817044615745544, "step": 1763 }, { "batch_size": 4, "epoch": 0.7052, "step": 1763, "tokens_per_device": 4036 }, { "epoch": 0.7052, "loss_ce": 0.24387210607528687, "loss_lvr": 0.9235767722129822, "loss_mode_switch": 0.0, "loss_total": 0.3362298011779785, "step": 1763 }, { "batch_size": 4, "epoch": 0.7052, "step": 1763, "tokens_per_device": 4280 }, { "epoch": 0.7052, "loss_ce": 0.5542896389961243, "loss_lvr": 0.7385669946670532, "loss_mode_switch": 0.0, "loss_total": 0.6281463503837585, "step": 1763 }, { "batch_size": 1, "epoch": 0.7052, "step": 1763, "tokens_per_device": 5273 }, { "epoch": 0.7052, "loss_ce": 0.02237665094435215, "loss_lvr": 0.5769162178039551, "loss_mode_switch": 0.0, "loss_total": 0.08006827533245087, "step": 1763 }, { "batch_size": 1, "epoch": 0.7052, "step": 1763, "tokens_per_device": 5214 }, { "epoch": 0.7052, "loss_ce": 0.001560308039188385, "loss_lvr": 0.3723069131374359, "loss_mode_switch": 0.0, "loss_total": 0.038791000843048096, "step": 1763 }, { "batch_size": 4, "epoch": 0.7052, "step": 1763, "tokens_per_device": 2692 }, { "epoch": 0.7052, "loss_ce": 0.020622044801712036, "loss_lvr": 0.8166735172271729, "loss_mode_switch": 0.0, "loss_total": 0.10228940099477768, "step": 1763 }, { "epoch": 0.7056, "grad_norm": 1.2191228866577148, "learning_rate": 2.105794559717311e-06, "loss": 0.2823, "step": 1764 }, { "batch_size": 4, "epoch": 0.7056, "step": 1764, "tokens_per_device": 15356 }, { "epoch": 0.7056, "loss_ce": 0.3742292821407318, "loss_lvr": 0.48897069692611694, "loss_mode_switch": 0.0, "loss_total": 0.42312633991241455, "step": 1764 }, { "batch_size": 4, "epoch": 0.7056, "step": 1764, "tokens_per_device": 1352 }, { "epoch": 0.7056, "loss_ce": 0.873839259147644, "loss_lvr": 0.8852862119674683, "loss_mode_switch": 0.0, "loss_total": 0.9623678922653198, "step": 1764 }, { "batch_size": 4, "epoch": 0.7056, "step": 1764, "tokens_per_device": 3976 }, { "epoch": 0.7056, "loss_ce": 0.04112095758318901, "loss_lvr": 1.5395829677581787, "loss_mode_switch": 0.0, "loss_total": 0.1950792670249939, "step": 1764 }, { "batch_size": 4, "epoch": 0.7056, "step": 1764, "tokens_per_device": 5972 }, { "epoch": 0.7056, "loss_ce": 0.25900235772132874, "loss_lvr": 0.6762453317642212, "loss_mode_switch": 0.0, "loss_total": 0.32662689685821533, "step": 1764 }, { "batch_size": 1, "epoch": 0.7056, "step": 1764, "tokens_per_device": 4900 }, { "epoch": 0.7056, "loss_ce": 0.014510630629956722, "loss_lvr": 0.8487234711647034, "loss_mode_switch": 0.0, "loss_total": 0.09938298165798187, "step": 1764 }, { "batch_size": 4, "epoch": 0.7056, "step": 1764, "tokens_per_device": 4920 }, { "epoch": 0.7056, "loss_ce": 0.2632966637611389, "loss_lvr": 0.9542499780654907, "loss_mode_switch": 0.0, "loss_total": 0.35872167348861694, "step": 1764 }, { "batch_size": 4, "epoch": 0.7056, "step": 1764, "tokens_per_device": 5180 }, { "epoch": 0.7056, "loss_ce": 0.2891950011253357, "loss_lvr": 0.8148635029792786, "loss_mode_switch": 0.0, "loss_total": 0.3706813454627991, "step": 1764 }, { "batch_size": 4, "epoch": 0.7056, "step": 1764, "tokens_per_device": 13840 }, { "epoch": 0.7056, "loss_ce": 0.1583232879638672, "loss_lvr": 0.45691171288490295, "loss_mode_switch": 0.0, "loss_total": 0.20401446521282196, "step": 1764 }, { "epoch": 0.706, "grad_norm": 1.364709734916687, "learning_rate": 2.1005149662516517e-06, "loss": 0.3479, "step": 1765 }, { "batch_size": 1, "epoch": 0.706, "step": 1765, "tokens_per_device": 4873 }, { "epoch": 0.706, "loss_ce": 0.003952538128942251, "loss_lvr": 0.3183184564113617, "loss_mode_switch": 0.0, "loss_total": 0.03578438237309456, "step": 1765 }, { "batch_size": 4, "epoch": 0.706, "step": 1765, "tokens_per_device": 6012 }, { "epoch": 0.706, "loss_ce": 0.3912115693092346, "loss_lvr": 0.8692566156387329, "loss_mode_switch": 0.0, "loss_total": 0.47813722491264343, "step": 1765 }, { "batch_size": 4, "epoch": 0.706, "step": 1765, "tokens_per_device": 8796 }, { "epoch": 0.706, "loss_ce": 0.13381844758987427, "loss_lvr": 0.9372475743293762, "loss_mode_switch": 0.0, "loss_total": 0.2275432050228119, "step": 1765 }, { "batch_size": 4, "epoch": 0.706, "step": 1765, "tokens_per_device": 6560 }, { "epoch": 0.706, "loss_ce": 0.3893342912197113, "loss_lvr": 0.31760483980178833, "loss_mode_switch": 0.0, "loss_total": 0.42109477519989014, "step": 1765 }, { "batch_size": 4, "epoch": 0.706, "step": 1765, "tokens_per_device": 3876 }, { "epoch": 0.706, "loss_ce": 0.06960912048816681, "loss_lvr": 0.7272825241088867, "loss_mode_switch": 0.0, "loss_total": 0.1423373818397522, "step": 1765 }, { "batch_size": 1, "epoch": 0.706, "step": 1765, "tokens_per_device": 4871 }, { "epoch": 0.706, "loss_ce": 0.02735542133450508, "loss_lvr": 0.22304007411003113, "loss_mode_switch": 0.0, "loss_total": 0.04965943098068237, "step": 1765 }, { "batch_size": 4, "epoch": 0.706, "step": 1765, "tokens_per_device": 4240 }, { "epoch": 0.706, "loss_ce": 0.45687738060951233, "loss_lvr": 0.7465164661407471, "loss_mode_switch": 0.0, "loss_total": 0.5315290093421936, "step": 1765 }, { "batch_size": 4, "epoch": 0.706, "step": 1765, "tokens_per_device": 5812 }, { "epoch": 0.706, "loss_ce": 0.14519239962100983, "loss_lvr": 0.5932426452636719, "loss_mode_switch": 0.0, "loss_total": 0.20451666414737701, "step": 1765 }, { "epoch": 0.7064, "grad_norm": 1.2065593004226685, "learning_rate": 2.0952402390657215e-06, "loss": 0.2532, "step": 1766 }, { "batch_size": 1, "epoch": 0.7064, "step": 1766, "tokens_per_device": 5106 }, { "epoch": 0.7064, "loss_ce": 0.004133693408221006, "loss_lvr": 0.4678336977958679, "loss_mode_switch": 0.0, "loss_total": 0.050917062908411026, "step": 1766 }, { "batch_size": 1, "epoch": 0.7064, "step": 1766, "tokens_per_device": 4825 }, { "epoch": 0.7064, "loss_ce": 0.0013415059074759483, "loss_lvr": 0.2285442352294922, "loss_mode_switch": 0.0, "loss_total": 0.024195928126573563, "step": 1766 }, { "batch_size": 4, "epoch": 0.7064, "step": 1766, "tokens_per_device": 1412 }, { "epoch": 0.7064, "loss_ce": 0.10990317165851593, "loss_lvr": 1.5340570211410522, "loss_mode_switch": 0.0, "loss_total": 0.26330888271331787, "step": 1766 }, { "batch_size": 4, "epoch": 0.7064, "step": 1766, "tokens_per_device": 1572 }, { "epoch": 0.7064, "loss_ce": 0.23929305374622345, "loss_lvr": 1.0633466243743896, "loss_mode_switch": 0.0, "loss_total": 0.34562772512435913, "step": 1766 }, { "batch_size": 1, "epoch": 0.7064, "step": 1766, "tokens_per_device": 5128 }, { "epoch": 0.7064, "loss_ce": 0.006677298806607723, "loss_lvr": 0.46256566047668457, "loss_mode_switch": 0.0, "loss_total": 0.052933868020772934, "step": 1766 }, { "batch_size": 1, "epoch": 0.7064, "step": 1766, "tokens_per_device": 5090 }, { "epoch": 0.7064, "loss_ce": 0.07261781394481659, "loss_lvr": 0.2677021324634552, "loss_mode_switch": 0.0, "loss_total": 0.09938802570104599, "step": 1766 }, { "batch_size": 1, "epoch": 0.7064, "step": 1766, "tokens_per_device": 5183 }, { "epoch": 0.7064, "loss_ce": 0.0017446153797209263, "loss_lvr": 0.2649059295654297, "loss_mode_switch": 0.0, "loss_total": 0.028235208243131638, "step": 1766 }, { "batch_size": 4, "epoch": 0.7064, "step": 1766, "tokens_per_device": 4916 }, { "epoch": 0.7064, "loss_ce": 0.3989129066467285, "loss_lvr": 0.6670661568641663, "loss_mode_switch": 0.0, "loss_total": 0.4656195342540741, "step": 1766 }, { "epoch": 0.7068, "grad_norm": 1.117587924003601, "learning_rate": 2.0899703870122347e-06, "loss": 0.2413, "step": 1767 }, { "batch_size": 4, "epoch": 0.7068, "step": 1767, "tokens_per_device": 2600 }, { "epoch": 0.7068, "loss_ce": 0.5310553908348083, "loss_lvr": 0.8643208742141724, "loss_mode_switch": 0.0, "loss_total": 0.6174874901771545, "step": 1767 }, { "batch_size": 4, "epoch": 0.7068, "step": 1767, "tokens_per_device": 5016 }, { "epoch": 0.7068, "loss_ce": 0.18766622245311737, "loss_lvr": 0.9734570384025574, "loss_mode_switch": 0.0, "loss_total": 0.2850119173526764, "step": 1767 }, { "batch_size": 1, "epoch": 0.7068, "step": 1767, "tokens_per_device": 4874 }, { "epoch": 0.7068, "loss_ce": 0.33609122037887573, "loss_lvr": 0.40942269563674927, "loss_mode_switch": 0.0, "loss_total": 0.3770335018634796, "step": 1767 }, { "batch_size": 4, "epoch": 0.7068, "step": 1767, "tokens_per_device": 1944 }, { "epoch": 0.7068, "loss_ce": 0.41427338123321533, "loss_lvr": 0.8693772554397583, "loss_mode_switch": 0.0, "loss_total": 0.5012111067771912, "step": 1767 }, { "batch_size": 4, "epoch": 0.7068, "step": 1767, "tokens_per_device": 2624 }, { "epoch": 0.7068, "loss_ce": 0.24519194662570953, "loss_lvr": 0.9857112169265747, "loss_mode_switch": 0.0, "loss_total": 0.3437630534172058, "step": 1767 }, { "batch_size": 4, "epoch": 0.7068, "step": 1767, "tokens_per_device": 3784 }, { "epoch": 0.7068, "loss_ce": 0.381258487701416, "loss_lvr": 0.7716881632804871, "loss_mode_switch": 0.0, "loss_total": 0.4584273099899292, "step": 1767 }, { "batch_size": 1, "epoch": 0.7068, "step": 1767, "tokens_per_device": 5144 }, { "epoch": 0.7068, "loss_ce": 0.06147187575697899, "loss_lvr": 0.3568998873233795, "loss_mode_switch": 0.0, "loss_total": 0.09716186672449112, "step": 1767 }, { "batch_size": 1, "epoch": 0.7068, "step": 1767, "tokens_per_device": 4971 }, { "epoch": 0.7068, "loss_ce": 0.35099685192108154, "loss_lvr": 0.3381516635417938, "loss_mode_switch": 0.0, "loss_total": 0.38481202721595764, "step": 1767 }, { "epoch": 0.7072, "grad_norm": 1.3637274503707886, "learning_rate": 2.0847054189357136e-06, "loss": 0.3429, "step": 1768 }, { "batch_size": 1, "epoch": 0.7072, "step": 1768, "tokens_per_device": 5038 }, { "epoch": 0.7072, "loss_ce": 0.4495786428451538, "loss_lvr": 0.4375261068344116, "loss_mode_switch": 0.0, "loss_total": 0.49333125352859497, "step": 1768 }, { "batch_size": 1, "epoch": 0.7072, "step": 1768, "tokens_per_device": 4872 }, { "epoch": 0.7072, "loss_ce": 0.007564218249171972, "loss_lvr": 0.6427363753318787, "loss_mode_switch": 0.0, "loss_total": 0.07183785736560822, "step": 1768 }, { "batch_size": 1, "epoch": 0.7072, "step": 1768, "tokens_per_device": 6678 }, { "epoch": 0.7072, "loss_ce": 0.04789675027132034, "loss_lvr": 0.2929238975048065, "loss_mode_switch": 0.0, "loss_total": 0.077189140021801, "step": 1768 }, { "batch_size": 4, "epoch": 0.7072, "step": 1768, "tokens_per_device": 4188 }, { "epoch": 0.7072, "loss_ce": 0.12026207149028778, "loss_lvr": 1.1912283897399902, "loss_mode_switch": 0.0, "loss_total": 0.23938491940498352, "step": 1768 }, { "batch_size": 4, "epoch": 0.7072, "step": 1768, "tokens_per_device": 1604 }, { "epoch": 0.7072, "loss_ce": 0.2940082848072052, "loss_lvr": 0.8420042395591736, "loss_mode_switch": 0.0, "loss_total": 0.3782086968421936, "step": 1768 }, { "batch_size": 4, "epoch": 0.7072, "step": 1768, "tokens_per_device": 4224 }, { "epoch": 0.7072, "loss_ce": 0.05310191959142685, "loss_lvr": 1.2232674360275269, "loss_mode_switch": 0.0, "loss_total": 0.17542865872383118, "step": 1768 }, { "batch_size": 1, "epoch": 0.7072, "step": 1768, "tokens_per_device": 4902 }, { "epoch": 0.7072, "loss_ce": 0.03103039227426052, "loss_lvr": 0.3866470456123352, "loss_mode_switch": 0.0, "loss_total": 0.06969510018825531, "step": 1768 }, { "batch_size": 4, "epoch": 0.7072, "step": 1768, "tokens_per_device": 3160 }, { "epoch": 0.7072, "loss_ce": 0.37481024861335754, "loss_lvr": 0.7009057998657227, "loss_mode_switch": 0.0, "loss_total": 0.44490084052085876, "step": 1768 }, { "epoch": 0.7076, "grad_norm": 1.3413646221160889, "learning_rate": 2.079445343672493e-06, "loss": 0.3074, "step": 1769 }, { "batch_size": 1, "epoch": 0.7076, "step": 1769, "tokens_per_device": 7380 }, { "epoch": 0.7076, "loss_ce": 0.06123768910765648, "loss_lvr": 0.23458009958267212, "loss_mode_switch": 0.0, "loss_total": 0.08469569683074951, "step": 1769 }, { "batch_size": 4, "epoch": 0.7076, "step": 1769, "tokens_per_device": 4260 }, { "epoch": 0.7076, "loss_ce": 0.0867648795247078, "loss_lvr": 0.836319088935852, "loss_mode_switch": 0.0, "loss_total": 0.17039678990840912, "step": 1769 }, { "batch_size": 4, "epoch": 0.7076, "step": 1769, "tokens_per_device": 2536 }, { "epoch": 0.7076, "loss_ce": 0.015375208109617233, "loss_lvr": 0.8606550097465515, "loss_mode_switch": 0.0, "loss_total": 0.10144071280956268, "step": 1769 }, { "batch_size": 4, "epoch": 0.7076, "step": 1769, "tokens_per_device": 2624 }, { "epoch": 0.7076, "loss_ce": 0.4970633089542389, "loss_lvr": 0.788903534412384, "loss_mode_switch": 0.0, "loss_total": 0.5759536623954773, "step": 1769 }, { "batch_size": 4, "epoch": 0.7076, "step": 1769, "tokens_per_device": 4420 }, { "epoch": 0.7076, "loss_ce": 0.22234627604484558, "loss_lvr": 1.9736111164093018, "loss_mode_switch": 0.0, "loss_total": 0.41970738768577576, "step": 1769 }, { "batch_size": 1, "epoch": 0.7076, "step": 1769, "tokens_per_device": 5150 }, { "epoch": 0.7076, "loss_ce": 0.0014951862394809723, "loss_lvr": 0.5255037546157837, "loss_mode_switch": 0.0, "loss_total": 0.05404556170105934, "step": 1769 }, { "batch_size": 4, "epoch": 0.7076, "step": 1769, "tokens_per_device": 4332 }, { "epoch": 0.7076, "loss_ce": 0.06534065306186676, "loss_lvr": 0.8189095258712769, "loss_mode_switch": 0.0, "loss_total": 0.14723160862922668, "step": 1769 }, { "batch_size": 1, "epoch": 0.7076, "step": 1769, "tokens_per_device": 5116 }, { "epoch": 0.7076, "loss_ce": 0.022843334823846817, "loss_lvr": 0.23173917829990387, "loss_mode_switch": 0.0, "loss_total": 0.046017251908779144, "step": 1769 }, { "epoch": 0.708, "grad_norm": 1.1895593404769897, "learning_rate": 2.07419017005069e-06, "loss": 0.2424, "step": 1770 }, { "batch_size": 1, "epoch": 0.708, "step": 1770, "tokens_per_device": 4904 }, { "epoch": 0.708, "loss_ce": 0.08049128949642181, "loss_lvr": 0.35330668091773987, "loss_mode_switch": 0.0, "loss_total": 0.1158219575881958, "step": 1770 }, { "batch_size": 4, "epoch": 0.708, "step": 1770, "tokens_per_device": 4192 }, { "epoch": 0.708, "loss_ce": 0.22307811677455902, "loss_lvr": 0.8061525821685791, "loss_mode_switch": 0.0, "loss_total": 0.30369338393211365, "step": 1770 }, { "batch_size": 4, "epoch": 0.708, "step": 1770, "tokens_per_device": 5872 }, { "epoch": 0.708, "loss_ce": 0.15665169060230255, "loss_lvr": 0.7384357452392578, "loss_mode_switch": 0.0, "loss_total": 0.23049527406692505, "step": 1770 }, { "batch_size": 4, "epoch": 0.708, "step": 1770, "tokens_per_device": 5488 }, { "epoch": 0.708, "loss_ce": 0.09907781332731247, "loss_lvr": 0.7581729292869568, "loss_mode_switch": 0.0, "loss_total": 0.17489510774612427, "step": 1770 }, { "batch_size": 4, "epoch": 0.708, "step": 1770, "tokens_per_device": 4488 }, { "epoch": 0.708, "loss_ce": 0.19723863899707794, "loss_lvr": 0.8688303232192993, "loss_mode_switch": 0.0, "loss_total": 0.28412166237831116, "step": 1770 }, { "batch_size": 4, "epoch": 0.708, "step": 1770, "tokens_per_device": 2640 }, { "epoch": 0.708, "loss_ce": 0.20318394899368286, "loss_lvr": 0.7615860104560852, "loss_mode_switch": 0.0, "loss_total": 0.27934256196022034, "step": 1770 }, { "batch_size": 1, "epoch": 0.708, "step": 1770, "tokens_per_device": 5095 }, { "epoch": 0.708, "loss_ce": 0.0002191385137848556, "loss_lvr": 0.39795467257499695, "loss_mode_switch": 0.0, "loss_total": 0.040014609694480896, "step": 1770 }, { "batch_size": 4, "epoch": 0.708, "step": 1770, "tokens_per_device": 1416 }, { "epoch": 0.708, "loss_ce": 0.20252349972724915, "loss_lvr": 0.9003497958183289, "loss_mode_switch": 0.0, "loss_total": 0.292558491230011, "step": 1770 }, { "epoch": 0.7084, "grad_norm": 1.2585933208465576, "learning_rate": 2.068939906890194e-06, "loss": 0.2624, "step": 1771 }, { "batch_size": 4, "epoch": 0.7084, "step": 1771, "tokens_per_device": 4572 }, { "epoch": 0.7084, "loss_ce": 0.15315906703472137, "loss_lvr": 0.7853173613548279, "loss_mode_switch": 0.0, "loss_total": 0.23169079422950745, "step": 1771 }, { "batch_size": 4, "epoch": 0.7084, "step": 1771, "tokens_per_device": 2592 }, { "epoch": 0.7084, "loss_ce": 1.1090246438980103, "loss_lvr": 0.7843238711357117, "loss_mode_switch": 0.0, "loss_total": 1.1874570846557617, "step": 1771 }, { "batch_size": 4, "epoch": 0.7084, "step": 1771, "tokens_per_device": 5700 }, { "epoch": 0.7084, "loss_ce": 0.5057133436203003, "loss_lvr": 0.7037402391433716, "loss_mode_switch": 0.0, "loss_total": 0.5760873556137085, "step": 1771 }, { "batch_size": 1, "epoch": 0.7084, "step": 1771, "tokens_per_device": 4911 }, { "epoch": 0.7084, "loss_ce": 0.005984235554933548, "loss_lvr": 0.2579370439052582, "loss_mode_switch": 0.0, "loss_total": 0.031777940690517426, "step": 1771 }, { "batch_size": 4, "epoch": 0.7084, "step": 1771, "tokens_per_device": 4264 }, { "epoch": 0.7084, "loss_ce": 0.2694871127605438, "loss_lvr": 0.8304386734962463, "loss_mode_switch": 0.0, "loss_total": 0.35253098607063293, "step": 1771 }, { "batch_size": 4, "epoch": 0.7084, "step": 1771, "tokens_per_device": 4488 }, { "epoch": 0.7084, "loss_ce": 0.6268053650856018, "loss_lvr": 0.8299543857574463, "loss_mode_switch": 0.0, "loss_total": 0.7098007798194885, "step": 1771 }, { "batch_size": 4, "epoch": 0.7084, "step": 1771, "tokens_per_device": 2760 }, { "epoch": 0.7084, "loss_ce": 0.3234364688396454, "loss_lvr": 0.6963894963264465, "loss_mode_switch": 0.0, "loss_total": 0.3930754065513611, "step": 1771 }, { "batch_size": 4, "epoch": 0.7084, "step": 1771, "tokens_per_device": 2696 }, { "epoch": 0.7084, "loss_ce": 0.060070883482694626, "loss_lvr": 0.838909387588501, "loss_mode_switch": 0.0, "loss_total": 0.1439618170261383, "step": 1771 }, { "epoch": 0.7088, "grad_norm": 1.3912529945373535, "learning_rate": 2.0636945630026594e-06, "loss": 0.3376, "step": 1772 }, { "batch_size": 4, "epoch": 0.7088, "step": 1772, "tokens_per_device": 1496 }, { "epoch": 0.7088, "loss_ce": 0.7969399690628052, "loss_lvr": 0.9258221387863159, "loss_mode_switch": 0.0, "loss_total": 0.8895221948623657, "step": 1772 }, { "batch_size": 4, "epoch": 0.7088, "step": 1772, "tokens_per_device": 4924 }, { "epoch": 0.7088, "loss_ce": 0.35462233424186707, "loss_lvr": 0.7165469527244568, "loss_mode_switch": 0.0, "loss_total": 0.4262770414352417, "step": 1772 }, { "batch_size": 1, "epoch": 0.7088, "step": 1772, "tokens_per_device": 4949 }, { "epoch": 0.7088, "loss_ce": 0.05467259883880615, "loss_lvr": 0.44822242856025696, "loss_mode_switch": 0.0, "loss_total": 0.09949484467506409, "step": 1772 }, { "batch_size": 4, "epoch": 0.7088, "step": 1772, "tokens_per_device": 1780 }, { "epoch": 0.7088, "loss_ce": 0.4642452597618103, "loss_lvr": 0.8665286302566528, "loss_mode_switch": 0.0, "loss_total": 0.5508981347084045, "step": 1772 }, { "batch_size": 4, "epoch": 0.7088, "step": 1772, "tokens_per_device": 4252 }, { "epoch": 0.7088, "loss_ce": 0.4996224641799927, "loss_lvr": 0.8532552123069763, "loss_mode_switch": 0.0, "loss_total": 0.5849480032920837, "step": 1772 }, { "batch_size": 4, "epoch": 0.7088, "step": 1772, "tokens_per_device": 3948 }, { "epoch": 0.7088, "loss_ce": 0.429700642824173, "loss_lvr": 1.2480474710464478, "loss_mode_switch": 0.0, "loss_total": 0.5545054078102112, "step": 1772 }, { "batch_size": 1, "epoch": 0.7088, "step": 1772, "tokens_per_device": 4886 }, { "epoch": 0.7088, "loss_ce": 0.0010191920446231961, "loss_lvr": 1.0078704357147217, "loss_mode_switch": 0.0, "loss_total": 0.10180623829364777, "step": 1772 }, { "batch_size": 4, "epoch": 0.7088, "step": 1772, "tokens_per_device": 6004 }, { "epoch": 0.7088, "loss_ce": 0.31743621826171875, "loss_lvr": 0.8707526326179504, "loss_mode_switch": 0.0, "loss_total": 0.4045114815235138, "step": 1772 }, { "epoch": 0.7092, "grad_norm": 1.2314170598983765, "learning_rate": 2.058454147191478e-06, "loss": 0.2771, "step": 1773 }, { "batch_size": 4, "epoch": 0.7092, "step": 1773, "tokens_per_device": 5948 }, { "epoch": 0.7092, "loss_ce": 0.004755678586661816, "loss_lvr": 0.9494585394859314, "loss_mode_switch": 0.0, "loss_total": 0.09970153123140335, "step": 1773 }, { "batch_size": 4, "epoch": 0.7092, "step": 1773, "tokens_per_device": 2632 }, { "epoch": 0.7092, "loss_ce": 0.48985061049461365, "loss_lvr": 0.8602769374847412, "loss_mode_switch": 0.0, "loss_total": 0.5758783221244812, "step": 1773 }, { "batch_size": 4, "epoch": 0.7092, "step": 1773, "tokens_per_device": 4432 }, { "epoch": 0.7092, "loss_ce": 0.0059296065010130405, "loss_lvr": 0.7164009213447571, "loss_mode_switch": 0.0, "loss_total": 0.07756970077753067, "step": 1773 }, { "batch_size": 4, "epoch": 0.7092, "step": 1773, "tokens_per_device": 6024 }, { "epoch": 0.7092, "loss_ce": 0.024079537019133568, "loss_lvr": 0.6502125859260559, "loss_mode_switch": 0.0, "loss_total": 0.08910080045461655, "step": 1773 }, { "batch_size": 4, "epoch": 0.7092, "step": 1773, "tokens_per_device": 3764 }, { "epoch": 0.7092, "loss_ce": 0.05437857285141945, "loss_lvr": 0.8887664079666138, "loss_mode_switch": 0.0, "loss_total": 0.14325521886348724, "step": 1773 }, { "batch_size": 1, "epoch": 0.7092, "step": 1773, "tokens_per_device": 6511 }, { "epoch": 0.7092, "loss_ce": 0.006257025990635157, "loss_lvr": 0.20043088495731354, "loss_mode_switch": 0.0, "loss_total": 0.026300113648176193, "step": 1773 }, { "batch_size": 4, "epoch": 0.7092, "step": 1773, "tokens_per_device": 1160 }, { "epoch": 0.7092, "loss_ce": 0.476837158203125, "loss_lvr": 1.0480313301086426, "loss_mode_switch": 0.0, "loss_total": 0.5816403031349182, "step": 1773 }, { "batch_size": 1, "epoch": 0.7092, "step": 1773, "tokens_per_device": 5140 }, { "epoch": 0.7092, "loss_ce": 0.026318084448575974, "loss_lvr": 0.1808117777109146, "loss_mode_switch": 0.0, "loss_total": 0.044399261474609375, "step": 1773 }, { "epoch": 0.7096, "grad_norm": 1.4633667469024658, "learning_rate": 2.053218668251775e-06, "loss": 0.317, "step": 1774 }, { "batch_size": 1, "epoch": 0.7096, "step": 1774, "tokens_per_device": 4869 }, { "epoch": 0.7096, "loss_ce": 0.07730992883443832, "loss_lvr": 0.45507049560546875, "loss_mode_switch": 0.0, "loss_total": 0.12281697988510132, "step": 1774 }, { "batch_size": 4, "epoch": 0.7096, "step": 1774, "tokens_per_device": 4276 }, { "epoch": 0.7096, "loss_ce": 0.1570892035961151, "loss_lvr": 1.046091914176941, "loss_mode_switch": 0.0, "loss_total": 0.2616983950138092, "step": 1774 }, { "batch_size": 4, "epoch": 0.7096, "step": 1774, "tokens_per_device": 2140 }, { "epoch": 0.7096, "loss_ce": 0.15890443325042725, "loss_lvr": 0.9024797081947327, "loss_mode_switch": 0.0, "loss_total": 0.24915240705013275, "step": 1774 }, { "batch_size": 1, "epoch": 0.7096, "step": 1774, "tokens_per_device": 5096 }, { "epoch": 0.7096, "loss_ce": 0.07847586274147034, "loss_lvr": 0.35062873363494873, "loss_mode_switch": 0.0, "loss_total": 0.11353874206542969, "step": 1774 }, { "batch_size": 1, "epoch": 0.7096, "step": 1774, "tokens_per_device": 4648 }, { "epoch": 0.7096, "loss_ce": 0.040428806096315384, "loss_lvr": 0.20939286053180695, "loss_mode_switch": 0.0, "loss_total": 0.06136809289455414, "step": 1774 }, { "batch_size": 4, "epoch": 0.7096, "step": 1774, "tokens_per_device": 1500 }, { "epoch": 0.7096, "loss_ce": 0.4899543225765228, "loss_lvr": 0.8855124115943909, "loss_mode_switch": 0.0, "loss_total": 0.5785055756568909, "step": 1774 }, { "batch_size": 4, "epoch": 0.7096, "step": 1774, "tokens_per_device": 3588 }, { "epoch": 0.7096, "loss_ce": 0.0060850027948617935, "loss_lvr": 1.2832776308059692, "loss_mode_switch": 0.0, "loss_total": 0.13441278040409088, "step": 1774 }, { "batch_size": 1, "epoch": 0.7096, "step": 1774, "tokens_per_device": 5105 }, { "epoch": 0.7096, "loss_ce": 0.08325731009244919, "loss_lvr": 1.3352099657058716, "loss_mode_switch": 0.0, "loss_total": 0.21677830815315247, "step": 1774 }, { "epoch": 0.71, "grad_norm": 2.3959500789642334, "learning_rate": 2.0479881349703885e-06, "loss": 0.2993, "step": 1775 }, { "batch_size": 4, "epoch": 0.71, "step": 1775, "tokens_per_device": 4320 }, { "epoch": 0.71, "loss_ce": 0.03831371292471886, "loss_lvr": 0.9459145665168762, "loss_mode_switch": 0.0, "loss_total": 0.13290517032146454, "step": 1775 }, { "batch_size": 4, "epoch": 0.71, "step": 1775, "tokens_per_device": 3844 }, { "epoch": 0.71, "loss_ce": 0.16428375244140625, "loss_lvr": 0.8691484928131104, "loss_mode_switch": 0.0, "loss_total": 0.25119858980178833, "step": 1775 }, { "batch_size": 1, "epoch": 0.71, "step": 1775, "tokens_per_device": 5166 }, { "epoch": 0.71, "loss_ce": 0.05272041633725166, "loss_lvr": 0.48276737332344055, "loss_mode_switch": 0.0, "loss_total": 0.10099714994430542, "step": 1775 }, { "batch_size": 4, "epoch": 0.71, "step": 1775, "tokens_per_device": 5076 }, { "epoch": 0.71, "loss_ce": 0.1554768681526184, "loss_lvr": 0.5480608940124512, "loss_mode_switch": 0.0, "loss_total": 0.21028295159339905, "step": 1775 }, { "batch_size": 4, "epoch": 0.71, "step": 1775, "tokens_per_device": 1240 }, { "epoch": 0.71, "loss_ce": 0.2303130030632019, "loss_lvr": 0.9474380612373352, "loss_mode_switch": 0.0, "loss_total": 0.3250568211078644, "step": 1775 }, { "batch_size": 4, "epoch": 0.71, "step": 1775, "tokens_per_device": 5104 }, { "epoch": 0.71, "loss_ce": 0.14478197693824768, "loss_lvr": 0.8141198754310608, "loss_mode_switch": 0.0, "loss_total": 0.22619396448135376, "step": 1775 }, { "batch_size": 4, "epoch": 0.71, "step": 1775, "tokens_per_device": 1436 }, { "epoch": 0.71, "loss_ce": 0.8999636173248291, "loss_lvr": 1.067488431930542, "loss_mode_switch": 0.0, "loss_total": 1.0067124366760254, "step": 1775 }, { "batch_size": 4, "epoch": 0.71, "step": 1775, "tokens_per_device": 5132 }, { "epoch": 0.71, "loss_ce": 0.20639458298683167, "loss_lvr": 0.8024404644966125, "loss_mode_switch": 0.0, "loss_total": 0.28663861751556396, "step": 1775 }, { "epoch": 0.7104, "grad_norm": 1.3094269037246704, "learning_rate": 2.042762556125853e-06, "loss": 0.2817, "step": 1776 }, { "batch_size": 4, "epoch": 0.7104, "step": 1776, "tokens_per_device": 4320 }, { "epoch": 0.7104, "loss_ce": 0.10546153038740158, "loss_lvr": 1.1405682563781738, "loss_mode_switch": 0.0, "loss_total": 0.21951836347579956, "step": 1776 }, { "batch_size": 1, "epoch": 0.7104, "step": 1776, "tokens_per_device": 5121 }, { "epoch": 0.7104, "loss_ce": 0.07097184658050537, "loss_lvr": 0.35051625967025757, "loss_mode_switch": 0.0, "loss_total": 0.10602347552776337, "step": 1776 }, { "batch_size": 4, "epoch": 0.7104, "step": 1776, "tokens_per_device": 2724 }, { "epoch": 0.7104, "loss_ce": 0.5952993631362915, "loss_lvr": 0.8182129859924316, "loss_mode_switch": 0.0, "loss_total": 0.6771206855773926, "step": 1776 }, { "batch_size": 4, "epoch": 0.7104, "step": 1776, "tokens_per_device": 4864 }, { "epoch": 0.7104, "loss_ce": 0.5416531562805176, "loss_lvr": 0.8849034905433655, "loss_mode_switch": 0.0, "loss_total": 0.6301435232162476, "step": 1776 }, { "batch_size": 1, "epoch": 0.7104, "step": 1776, "tokens_per_device": 5112 }, { "epoch": 0.7104, "loss_ce": 0.0008774884045124054, "loss_lvr": 0.34082916378974915, "loss_mode_switch": 0.0, "loss_total": 0.03496040403842926, "step": 1776 }, { "batch_size": 4, "epoch": 0.7104, "step": 1776, "tokens_per_device": 1480 }, { "epoch": 0.7104, "loss_ce": 0.4157554805278778, "loss_lvr": 1.0483567714691162, "loss_mode_switch": 0.0, "loss_total": 0.520591139793396, "step": 1776 }, { "batch_size": 4, "epoch": 0.7104, "step": 1776, "tokens_per_device": 3056 }, { "epoch": 0.7104, "loss_ce": 0.041878849267959595, "loss_lvr": 0.32454603910446167, "loss_mode_switch": 0.0, "loss_total": 0.07433345913887024, "step": 1776 }, { "batch_size": 4, "epoch": 0.7104, "step": 1776, "tokens_per_device": 4296 }, { "epoch": 0.7104, "loss_ce": 0.16228266060352325, "loss_lvr": 0.7427314519882202, "loss_mode_switch": 0.0, "loss_total": 0.236555814743042, "step": 1776 }, { "epoch": 0.7108, "grad_norm": 1.4513074159622192, "learning_rate": 2.0375419404883938e-06, "loss": 0.2947, "step": 1777 }, { "batch_size": 4, "epoch": 0.7108, "step": 1777, "tokens_per_device": 4340 }, { "epoch": 0.7108, "loss_ce": 0.07204773277044296, "loss_lvr": 0.7437936067581177, "loss_mode_switch": 0.0, "loss_total": 0.14642709493637085, "step": 1777 }, { "batch_size": 1, "epoch": 0.7108, "step": 1777, "tokens_per_device": 4911 }, { "epoch": 0.7108, "loss_ce": 0.08133874833583832, "loss_lvr": 1.1954914331436157, "loss_mode_switch": 0.0, "loss_total": 0.20088788866996765, "step": 1777 }, { "batch_size": 1, "epoch": 0.7108, "step": 1777, "tokens_per_device": 4589 }, { "epoch": 0.7108, "loss_ce": 0.2538355886936188, "loss_lvr": 0.5460741519927979, "loss_mode_switch": 0.0, "loss_total": 0.30844300985336304, "step": 1777 }, { "batch_size": 1, "epoch": 0.7108, "step": 1777, "tokens_per_device": 5171 }, { "epoch": 0.7108, "loss_ce": 0.013277736492455006, "loss_lvr": 0.33576101064682007, "loss_mode_switch": 0.0, "loss_total": 0.04685383662581444, "step": 1777 }, { "batch_size": 4, "epoch": 0.7108, "step": 1777, "tokens_per_device": 1368 }, { "epoch": 0.7108, "loss_ce": 0.39618217945098877, "loss_lvr": 0.9253876805305481, "loss_mode_switch": 0.0, "loss_total": 0.48872095346450806, "step": 1777 }, { "batch_size": 1, "epoch": 0.7108, "step": 1777, "tokens_per_device": 7164 }, { "epoch": 0.7108, "loss_ce": 0.03914656117558479, "loss_lvr": 0.370714008808136, "loss_mode_switch": 0.0, "loss_total": 0.07621796429157257, "step": 1777 }, { "batch_size": 1, "epoch": 0.7108, "step": 1777, "tokens_per_device": 4698 }, { "epoch": 0.7108, "loss_ce": 0.19272643327713013, "loss_lvr": 0.3523898124694824, "loss_mode_switch": 0.0, "loss_total": 0.22796541452407837, "step": 1777 }, { "batch_size": 1, "epoch": 0.7108, "step": 1777, "tokens_per_device": 4976 }, { "epoch": 0.7108, "loss_ce": 0.26781195402145386, "loss_lvr": 0.25197622179985046, "loss_mode_switch": 0.0, "loss_total": 0.29300957918167114, "step": 1777 }, { "epoch": 0.7112, "grad_norm": 1.4271854162216187, "learning_rate": 2.0323262968199043e-06, "loss": 0.2965, "step": 1778 }, { "batch_size": 4, "epoch": 0.7112, "step": 1778, "tokens_per_device": 4252 }, { "epoch": 0.7112, "loss_ce": 0.27797752618789673, "loss_lvr": 0.8995838165283203, "loss_mode_switch": 0.0, "loss_total": 0.3679358959197998, "step": 1778 }, { "batch_size": 4, "epoch": 0.7112, "step": 1778, "tokens_per_device": 5752 }, { "epoch": 0.7112, "loss_ce": 0.48541751503944397, "loss_lvr": 0.8894466161727905, "loss_mode_switch": 0.0, "loss_total": 0.5743621587753296, "step": 1778 }, { "batch_size": 4, "epoch": 0.7112, "step": 1778, "tokens_per_device": 4348 }, { "epoch": 0.7112, "loss_ce": 0.3336380124092102, "loss_lvr": 0.8482062816619873, "loss_mode_switch": 0.0, "loss_total": 0.41845864057540894, "step": 1778 }, { "batch_size": 1, "epoch": 0.7112, "step": 1778, "tokens_per_device": 5115 }, { "epoch": 0.7112, "loss_ce": 0.006552261300384998, "loss_lvr": 0.5236790180206299, "loss_mode_switch": 0.0, "loss_total": 0.05892016366124153, "step": 1778 }, { "batch_size": 1, "epoch": 0.7112, "step": 1778, "tokens_per_device": 7870 }, { "epoch": 0.7112, "loss_ce": 0.15697865188121796, "loss_lvr": 0.29456403851509094, "loss_mode_switch": 0.0, "loss_total": 0.1864350587129593, "step": 1778 }, { "batch_size": 4, "epoch": 0.7112, "step": 1778, "tokens_per_device": 3928 }, { "epoch": 0.7112, "loss_ce": 0.3547746241092682, "loss_lvr": 0.6600375175476074, "loss_mode_switch": 0.0, "loss_total": 0.42077839374542236, "step": 1778 }, { "batch_size": 4, "epoch": 0.7112, "step": 1778, "tokens_per_device": 5648 }, { "epoch": 0.7112, "loss_ce": 0.6795526742935181, "loss_lvr": 0.6991591453552246, "loss_mode_switch": 0.0, "loss_total": 0.7494685649871826, "step": 1778 }, { "batch_size": 4, "epoch": 0.7112, "step": 1778, "tokens_per_device": 3836 }, { "epoch": 0.7112, "loss_ce": 0.08574605733156204, "loss_lvr": 0.8329469561576843, "loss_mode_switch": 0.0, "loss_total": 0.1690407544374466, "step": 1778 }, { "epoch": 0.7116, "grad_norm": 1.4033230543136597, "learning_rate": 2.027115633873931e-06, "loss": 0.3228, "step": 1779 }, { "batch_size": 1, "epoch": 0.7116, "step": 1779, "tokens_per_device": 4854 }, { "epoch": 0.7116, "loss_ce": 0.0008943257271312177, "loss_lvr": 0.39791128039360046, "loss_mode_switch": 0.0, "loss_total": 0.04068545624613762, "step": 1779 }, { "batch_size": 4, "epoch": 0.7116, "step": 1779, "tokens_per_device": 5028 }, { "epoch": 0.7116, "loss_ce": 0.1047029197216034, "loss_lvr": 0.8863296508789062, "loss_mode_switch": 0.0, "loss_total": 0.1933358907699585, "step": 1779 }, { "batch_size": 4, "epoch": 0.7116, "step": 1779, "tokens_per_device": 4228 }, { "epoch": 0.7116, "loss_ce": 0.0454837903380394, "loss_lvr": 0.8478690981864929, "loss_mode_switch": 0.0, "loss_total": 0.13027070462703705, "step": 1779 }, { "batch_size": 4, "epoch": 0.7116, "step": 1779, "tokens_per_device": 4368 }, { "epoch": 0.7116, "loss_ce": 0.36953309178352356, "loss_lvr": 0.8213086724281311, "loss_mode_switch": 0.0, "loss_total": 0.4516639709472656, "step": 1779 }, { "batch_size": 4, "epoch": 0.7116, "step": 1779, "tokens_per_device": 2564 }, { "epoch": 0.7116, "loss_ce": 0.2422286868095398, "loss_lvr": 0.8313649296760559, "loss_mode_switch": 0.0, "loss_total": 0.32536518573760986, "step": 1779 }, { "batch_size": 1, "epoch": 0.7116, "step": 1779, "tokens_per_device": 4763 }, { "epoch": 0.7116, "loss_ce": 0.04477811977267265, "loss_lvr": 0.292477011680603, "loss_mode_switch": 0.0, "loss_total": 0.07402582466602325, "step": 1779 }, { "batch_size": 4, "epoch": 0.7116, "step": 1779, "tokens_per_device": 4204 }, { "epoch": 0.7116, "loss_ce": 0.6356863379478455, "loss_lvr": 0.7752202749252319, "loss_mode_switch": 0.0, "loss_total": 0.7132083773612976, "step": 1779 }, { "batch_size": 4, "epoch": 0.7116, "step": 1779, "tokens_per_device": 4492 }, { "epoch": 0.7116, "loss_ce": 0.3000797629356384, "loss_lvr": 1.1094236373901367, "loss_mode_switch": 0.0, "loss_total": 0.4110221266746521, "step": 1779 }, { "epoch": 0.712, "grad_norm": 1.7110153436660767, "learning_rate": 2.021909960395661e-06, "loss": 0.2583, "step": 1780 }, { "batch_size": 4, "epoch": 0.712, "step": 1780, "tokens_per_device": 3820 }, { "epoch": 0.712, "loss_ce": 0.14452828466892242, "loss_lvr": 0.9949216246604919, "loss_mode_switch": 0.0, "loss_total": 0.24402044713497162, "step": 1780 }, { "batch_size": 1, "epoch": 0.712, "step": 1780, "tokens_per_device": 4708 }, { "epoch": 0.712, "loss_ce": 0.8183750510215759, "loss_lvr": 0.6464893817901611, "loss_mode_switch": 0.0, "loss_total": 0.8830239772796631, "step": 1780 }, { "batch_size": 4, "epoch": 0.712, "step": 1780, "tokens_per_device": 6160 }, { "epoch": 0.712, "loss_ce": 0.0014353615697473288, "loss_lvr": 0.7703490257263184, "loss_mode_switch": 0.0, "loss_total": 0.07847026735544205, "step": 1780 }, { "batch_size": 4, "epoch": 0.712, "step": 1780, "tokens_per_device": 4456 }, { "epoch": 0.712, "loss_ce": 0.3144069314002991, "loss_lvr": 0.700783908367157, "loss_mode_switch": 0.0, "loss_total": 0.3844853341579437, "step": 1780 }, { "batch_size": 4, "epoch": 0.712, "step": 1780, "tokens_per_device": 3756 }, { "epoch": 0.712, "loss_ce": 0.00073424750007689, "loss_lvr": 0.4671383798122406, "loss_mode_switch": 0.0, "loss_total": 0.047448087483644485, "step": 1780 }, { "batch_size": 4, "epoch": 0.712, "step": 1780, "tokens_per_device": 4536 }, { "epoch": 0.712, "loss_ce": 0.3242025375366211, "loss_lvr": 0.7490041851997375, "loss_mode_switch": 0.0, "loss_total": 0.39910295605659485, "step": 1780 }, { "batch_size": 1, "epoch": 0.712, "step": 1780, "tokens_per_device": 5125 }, { "epoch": 0.712, "loss_ce": 0.003934809938073158, "loss_lvr": 0.5313467979431152, "loss_mode_switch": 0.0, "loss_total": 0.05706948786973953, "step": 1780 }, { "batch_size": 1, "epoch": 0.712, "step": 1780, "tokens_per_device": 5210 }, { "epoch": 0.712, "loss_ce": 0.017828669399023056, "loss_lvr": 0.4657686948776245, "loss_mode_switch": 0.0, "loss_total": 0.06440553814172745, "step": 1780 }, { "epoch": 0.7124, "grad_norm": 1.2384812831878662, "learning_rate": 2.0167092851219094e-06, "loss": 0.2604, "step": 1781 }, { "batch_size": 4, "epoch": 0.7124, "step": 1781, "tokens_per_device": 1544 }, { "epoch": 0.7124, "loss_ce": 0.4059298634529114, "loss_lvr": 0.9144653081893921, "loss_mode_switch": 0.0, "loss_total": 0.49737638235092163, "step": 1781 }, { "batch_size": 4, "epoch": 0.7124, "step": 1781, "tokens_per_device": 4588 }, { "epoch": 0.7124, "loss_ce": 0.0027528677601367235, "loss_lvr": 0.9533875584602356, "loss_mode_switch": 0.0, "loss_total": 0.09809162467718124, "step": 1781 }, { "batch_size": 1, "epoch": 0.7124, "step": 1781, "tokens_per_device": 5300 }, { "epoch": 0.7124, "loss_ce": 0.02217753417789936, "loss_lvr": 0.3405317962169647, "loss_mode_switch": 0.0, "loss_total": 0.05623071640729904, "step": 1781 }, { "batch_size": 1, "epoch": 0.7124, "step": 1781, "tokens_per_device": 5144 }, { "epoch": 0.7124, "loss_ce": 0.0013161106035113335, "loss_lvr": 0.409359872341156, "loss_mode_switch": 0.0, "loss_total": 0.04225210100412369, "step": 1781 }, { "batch_size": 4, "epoch": 0.7124, "step": 1781, "tokens_per_device": 4192 }, { "epoch": 0.7124, "loss_ce": 0.6182238459587097, "loss_lvr": 0.9112251400947571, "loss_mode_switch": 0.0, "loss_total": 0.709346354007721, "step": 1781 }, { "batch_size": 1, "epoch": 0.7124, "step": 1781, "tokens_per_device": 4896 }, { "epoch": 0.7124, "loss_ce": 0.0074697150848805904, "loss_lvr": 0.1956641972064972, "loss_mode_switch": 0.0, "loss_total": 0.027036136016249657, "step": 1781 }, { "batch_size": 4, "epoch": 0.7124, "step": 1781, "tokens_per_device": 2588 }, { "epoch": 0.7124, "loss_ce": 0.3553504943847656, "loss_lvr": 1.1575349569320679, "loss_mode_switch": 0.0, "loss_total": 0.4711039960384369, "step": 1781 }, { "batch_size": 4, "epoch": 0.7124, "step": 1781, "tokens_per_device": 3896 }, { "epoch": 0.7124, "loss_ce": 0.2413792461156845, "loss_lvr": 0.7526155114173889, "loss_mode_switch": 0.0, "loss_total": 0.31664079427719116, "step": 1781 }, { "epoch": 0.7128, "grad_norm": 1.388495683670044, "learning_rate": 2.0115136167811033e-06, "loss": 0.2947, "step": 1782 }, { "batch_size": 4, "epoch": 0.7128, "step": 1782, "tokens_per_device": 2728 }, { "epoch": 0.7128, "loss_ce": 0.9281875491142273, "loss_lvr": 0.7594574689865112, "loss_mode_switch": 0.0, "loss_total": 1.0041333436965942, "step": 1782 }, { "batch_size": 4, "epoch": 0.7128, "step": 1782, "tokens_per_device": 2744 }, { "epoch": 0.7128, "loss_ce": 0.39286962151527405, "loss_lvr": 0.8462651968002319, "loss_mode_switch": 0.0, "loss_total": 0.4774961471557617, "step": 1782 }, { "batch_size": 4, "epoch": 0.7128, "step": 1782, "tokens_per_device": 1488 }, { "epoch": 0.7128, "loss_ce": 0.3946237862110138, "loss_lvr": 1.136274814605713, "loss_mode_switch": 0.0, "loss_total": 0.5082512497901917, "step": 1782 }, { "batch_size": 4, "epoch": 0.7128, "step": 1782, "tokens_per_device": 4256 }, { "epoch": 0.7128, "loss_ce": 0.5748921632766724, "loss_lvr": 0.905903160572052, "loss_mode_switch": 0.0, "loss_total": 0.6654824614524841, "step": 1782 }, { "batch_size": 4, "epoch": 0.7128, "step": 1782, "tokens_per_device": 4996 }, { "epoch": 0.7128, "loss_ce": 0.0752362385392189, "loss_lvr": 0.5195106863975525, "loss_mode_switch": 0.0, "loss_total": 0.1271873116493225, "step": 1782 }, { "batch_size": 4, "epoch": 0.7128, "step": 1782, "tokens_per_device": 3320 }, { "epoch": 0.7128, "loss_ce": 0.5000286102294922, "loss_lvr": 0.7299883365631104, "loss_mode_switch": 0.0, "loss_total": 0.5730274319648743, "step": 1782 }, { "batch_size": 4, "epoch": 0.7128, "step": 1782, "tokens_per_device": 2700 }, { "epoch": 0.7128, "loss_ce": 0.2259918451309204, "loss_lvr": 1.4213197231292725, "loss_mode_switch": 0.0, "loss_total": 0.3681238293647766, "step": 1782 }, { "batch_size": 4, "epoch": 0.7128, "step": 1782, "tokens_per_device": 3536 }, { "epoch": 0.7128, "loss_ce": 0.07303401082754135, "loss_lvr": 0.8044646382331848, "loss_mode_switch": 0.0, "loss_total": 0.15348047018051147, "step": 1782 }, { "epoch": 0.7132, "grad_norm": 1.3821940422058105, "learning_rate": 2.0063229640932664e-06, "loss": 0.3293, "step": 1783 }, { "batch_size": 4, "epoch": 0.7132, "step": 1783, "tokens_per_device": 2816 }, { "epoch": 0.7132, "loss_ce": 0.28175103664398193, "loss_lvr": 1.139743685722351, "loss_mode_switch": 0.0, "loss_total": 0.39572539925575256, "step": 1783 }, { "batch_size": 4, "epoch": 0.7132, "step": 1783, "tokens_per_device": 15024 }, { "epoch": 0.7132, "loss_ce": 0.15167665481567383, "loss_lvr": 0.5546004772186279, "loss_mode_switch": 0.0, "loss_total": 0.20713670551776886, "step": 1783 }, { "batch_size": 4, "epoch": 0.7132, "step": 1783, "tokens_per_device": 4404 }, { "epoch": 0.7132, "loss_ce": 0.13548244535923004, "loss_lvr": 0.9862236976623535, "loss_mode_switch": 0.0, "loss_total": 0.23410481214523315, "step": 1783 }, { "batch_size": 4, "epoch": 0.7132, "step": 1783, "tokens_per_device": 11096 }, { "epoch": 0.7132, "loss_ce": 0.3542656898498535, "loss_lvr": 0.7712641358375549, "loss_mode_switch": 0.0, "loss_total": 0.431392103433609, "step": 1783 }, { "batch_size": 1, "epoch": 0.7132, "step": 1783, "tokens_per_device": 4943 }, { "epoch": 0.7132, "loss_ce": 0.06904090195894241, "loss_lvr": 0.2809993028640747, "loss_mode_switch": 0.0, "loss_total": 0.097140833735466, "step": 1783 }, { "batch_size": 4, "epoch": 0.7132, "step": 1783, "tokens_per_device": 1368 }, { "epoch": 0.7132, "loss_ce": 0.25529733300209045, "loss_lvr": 1.459055781364441, "loss_mode_switch": 0.0, "loss_total": 0.401202917098999, "step": 1783 }, { "batch_size": 1, "epoch": 0.7132, "step": 1783, "tokens_per_device": 4876 }, { "epoch": 0.7132, "loss_ce": 0.011996908113360405, "loss_lvr": 0.2992802560329437, "loss_mode_switch": 0.0, "loss_total": 0.04192493483424187, "step": 1783 }, { "batch_size": 4, "epoch": 0.7132, "step": 1783, "tokens_per_device": 5204 }, { "epoch": 0.7132, "loss_ce": 0.4401445984840393, "loss_lvr": 0.7823337912559509, "loss_mode_switch": 0.0, "loss_total": 0.518377959728241, "step": 1783 }, { "epoch": 0.7136, "grad_norm": 1.5949995517730713, "learning_rate": 2.001137335770003e-06, "loss": 0.3039, "step": 1784 }, { "batch_size": 4, "epoch": 0.7136, "step": 1784, "tokens_per_device": 3680 }, { "epoch": 0.7136, "loss_ce": 0.3453967273235321, "loss_lvr": 0.8151220083236694, "loss_mode_switch": 0.0, "loss_total": 0.426908940076828, "step": 1784 }, { "batch_size": 4, "epoch": 0.7136, "step": 1784, "tokens_per_device": 2692 }, { "epoch": 0.7136, "loss_ce": 0.11098858714103699, "loss_lvr": 0.7458080053329468, "loss_mode_switch": 0.0, "loss_total": 0.1855693906545639, "step": 1784 }, { "batch_size": 4, "epoch": 0.7136, "step": 1784, "tokens_per_device": 4416 }, { "epoch": 0.7136, "loss_ce": 0.032173819839954376, "loss_lvr": 0.978324830532074, "loss_mode_switch": 0.0, "loss_total": 0.1300063133239746, "step": 1784 }, { "batch_size": 4, "epoch": 0.7136, "step": 1784, "tokens_per_device": 11388 }, { "epoch": 0.7136, "loss_ce": 0.32466161251068115, "loss_lvr": 0.7810336351394653, "loss_mode_switch": 0.0, "loss_total": 0.4027649760246277, "step": 1784 }, { "batch_size": 1, "epoch": 0.7136, "step": 1784, "tokens_per_device": 6572 }, { "epoch": 0.7136, "loss_ce": 0.002886450383812189, "loss_lvr": 0.32074326276779175, "loss_mode_switch": 0.0, "loss_total": 0.034960780292749405, "step": 1784 }, { "batch_size": 4, "epoch": 0.7136, "step": 1784, "tokens_per_device": 7184 }, { "epoch": 0.7136, "loss_ce": 0.01855495572090149, "loss_lvr": 0.43159955739974976, "loss_mode_switch": 0.0, "loss_total": 0.061714913696050644, "step": 1784 }, { "batch_size": 1, "epoch": 0.7136, "step": 1784, "tokens_per_device": 4886 }, { "epoch": 0.7136, "loss_ce": 0.19629088044166565, "loss_lvr": 0.36334407329559326, "loss_mode_switch": 0.0, "loss_total": 0.23262529075145721, "step": 1784 }, { "batch_size": 1, "epoch": 0.7136, "step": 1784, "tokens_per_device": 4859 }, { "epoch": 0.7136, "loss_ce": 0.026923969388008118, "loss_lvr": 0.17857302725315094, "loss_mode_switch": 0.0, "loss_total": 0.04478127509355545, "step": 1784 }, { "epoch": 0.714, "grad_norm": 1.2008333206176758, "learning_rate": 1.9959567405144825e-06, "loss": 0.275, "step": 1785 }, { "batch_size": 1, "epoch": 0.714, "step": 1785, "tokens_per_device": 5139 }, { "epoch": 0.714, "loss_ce": 0.05619731917977333, "loss_lvr": 0.24521374702453613, "loss_mode_switch": 0.0, "loss_total": 0.08071869611740112, "step": 1785 }, { "batch_size": 4, "epoch": 0.714, "step": 1785, "tokens_per_device": 4244 }, { "epoch": 0.714, "loss_ce": 0.441339373588562, "loss_lvr": 0.8275766372680664, "loss_mode_switch": 0.0, "loss_total": 0.5240970253944397, "step": 1785 }, { "batch_size": 4, "epoch": 0.714, "step": 1785, "tokens_per_device": 4388 }, { "epoch": 0.714, "loss_ce": 0.03332400694489479, "loss_lvr": 1.2838581800460815, "loss_mode_switch": 0.0, "loss_total": 0.16170983016490936, "step": 1785 }, { "batch_size": 4, "epoch": 0.714, "step": 1785, "tokens_per_device": 3776 }, { "epoch": 0.714, "loss_ce": 0.05764946714043617, "loss_lvr": 0.7626081109046936, "loss_mode_switch": 0.0, "loss_total": 0.13391028344631195, "step": 1785 }, { "batch_size": 4, "epoch": 0.714, "step": 1785, "tokens_per_device": 4752 }, { "epoch": 0.714, "loss_ce": 0.10955679416656494, "loss_lvr": 0.9098595380783081, "loss_mode_switch": 0.0, "loss_total": 0.20054274797439575, "step": 1785 }, { "batch_size": 1, "epoch": 0.714, "step": 1785, "tokens_per_device": 5165 }, { "epoch": 0.714, "loss_ce": 0.014935674145817757, "loss_lvr": 0.4015573263168335, "loss_mode_switch": 0.0, "loss_total": 0.055091410875320435, "step": 1785 }, { "batch_size": 1, "epoch": 0.714, "step": 1785, "tokens_per_device": 4802 }, { "epoch": 0.714, "loss_ce": 0.05716019496321678, "loss_lvr": 0.7613596320152283, "loss_mode_switch": 0.0, "loss_total": 0.1332961618900299, "step": 1785 }, { "batch_size": 4, "epoch": 0.714, "step": 1785, "tokens_per_device": 3792 }, { "epoch": 0.714, "loss_ce": 0.3140092194080353, "loss_lvr": 0.848199725151062, "loss_mode_switch": 0.0, "loss_total": 0.3988291919231415, "step": 1785 }, { "epoch": 0.7144, "grad_norm": 1.7988803386688232, "learning_rate": 1.9907811870214334e-06, "loss": 0.2824, "step": 1786 }, { "batch_size": 4, "epoch": 0.7144, "step": 1786, "tokens_per_device": 4236 }, { "epoch": 0.7144, "loss_ce": 0.497290700674057, "loss_lvr": 0.9649100303649902, "loss_mode_switch": 0.0, "loss_total": 0.5937817096710205, "step": 1786 }, { "batch_size": 1, "epoch": 0.7144, "step": 1786, "tokens_per_device": 5152 }, { "epoch": 0.7144, "loss_ce": 0.0006737421499565244, "loss_lvr": 0.48769837617874146, "loss_mode_switch": 0.0, "loss_total": 0.04944358021020889, "step": 1786 }, { "batch_size": 1, "epoch": 0.7144, "step": 1786, "tokens_per_device": 4744 }, { "epoch": 0.7144, "loss_ce": 0.20434069633483887, "loss_lvr": 0.40553340315818787, "loss_mode_switch": 0.0, "loss_total": 0.24489404261112213, "step": 1786 }, { "batch_size": 4, "epoch": 0.7144, "step": 1786, "tokens_per_device": 14928 }, { "epoch": 0.7144, "loss_ce": 0.1549951583147049, "loss_lvr": 0.5603840351104736, "loss_mode_switch": 0.0, "loss_total": 0.21103356778621674, "step": 1786 }, { "batch_size": 4, "epoch": 0.7144, "step": 1786, "tokens_per_device": 5120 }, { "epoch": 0.7144, "loss_ce": 0.46767252683639526, "loss_lvr": 0.9333400726318359, "loss_mode_switch": 0.0, "loss_total": 0.5610065460205078, "step": 1786 }, { "batch_size": 1, "epoch": 0.7144, "step": 1786, "tokens_per_device": 4994 }, { "epoch": 0.7144, "loss_ce": 0.003561523510143161, "loss_lvr": 0.44490891695022583, "loss_mode_switch": 0.0, "loss_total": 0.04805241525173187, "step": 1786 }, { "batch_size": 4, "epoch": 0.7144, "step": 1786, "tokens_per_device": 3780 }, { "epoch": 0.7144, "loss_ce": 0.32201576232910156, "loss_lvr": 0.8297964930534363, "loss_mode_switch": 0.0, "loss_total": 0.4049954116344452, "step": 1786 }, { "batch_size": 4, "epoch": 0.7144, "step": 1786, "tokens_per_device": 3488 }, { "epoch": 0.7144, "loss_ce": 0.09622994810342789, "loss_lvr": 0.5702120065689087, "loss_mode_switch": 0.0, "loss_total": 0.15325114130973816, "step": 1786 }, { "epoch": 0.7148, "grad_norm": 1.2920907735824585, "learning_rate": 1.9856106839771194e-06, "loss": 0.2702, "step": 1787 }, { "batch_size": 1, "epoch": 0.7148, "step": 1787, "tokens_per_device": 4879 }, { "epoch": 0.7148, "loss_ce": 0.17636969685554504, "loss_lvr": 0.21541492640972137, "loss_mode_switch": 0.0, "loss_total": 0.19791118800640106, "step": 1787 }, { "batch_size": 4, "epoch": 0.7148, "step": 1787, "tokens_per_device": 7696 }, { "epoch": 0.7148, "loss_ce": 0.26478978991508484, "loss_lvr": 0.802649974822998, "loss_mode_switch": 0.0, "loss_total": 0.3450548052787781, "step": 1787 }, { "batch_size": 4, "epoch": 0.7148, "step": 1787, "tokens_per_device": 2652 }, { "epoch": 0.7148, "loss_ce": 0.4620961844921112, "loss_lvr": 0.9496312737464905, "loss_mode_switch": 0.0, "loss_total": 0.5570592880249023, "step": 1787 }, { "batch_size": 4, "epoch": 0.7148, "step": 1787, "tokens_per_device": 4224 }, { "epoch": 0.7148, "loss_ce": 0.3271244764328003, "loss_lvr": 0.59280925989151, "loss_mode_switch": 0.0, "loss_total": 0.38640540838241577, "step": 1787 }, { "batch_size": 1, "epoch": 0.7148, "step": 1787, "tokens_per_device": 5739 }, { "epoch": 0.7148, "loss_ce": 0.002990051871165633, "loss_lvr": 0.5295649170875549, "loss_mode_switch": 0.0, "loss_total": 0.05594654381275177, "step": 1787 }, { "batch_size": 4, "epoch": 0.7148, "step": 1787, "tokens_per_device": 2044 }, { "epoch": 0.7148, "loss_ce": 0.3714110553264618, "loss_lvr": 0.9306117296218872, "loss_mode_switch": 0.0, "loss_total": 0.464472234249115, "step": 1787 }, { "batch_size": 4, "epoch": 0.7148, "step": 1787, "tokens_per_device": 4128 }, { "epoch": 0.7148, "loss_ce": 0.16229568421840668, "loss_lvr": 0.7666684985160828, "loss_mode_switch": 0.0, "loss_total": 0.23896253108978271, "step": 1787 }, { "batch_size": 4, "epoch": 0.7148, "step": 1787, "tokens_per_device": 4592 }, { "epoch": 0.7148, "loss_ce": 0.5212850570678711, "loss_lvr": 1.0129951238632202, "loss_mode_switch": 0.0, "loss_total": 0.6225845813751221, "step": 1787 }, { "epoch": 0.7152, "grad_norm": 1.2860989570617676, "learning_rate": 1.9804452400593265e-06, "loss": 0.2781, "step": 1788 }, { "batch_size": 4, "epoch": 0.7152, "step": 1788, "tokens_per_device": 2668 }, { "epoch": 0.7152, "loss_ce": 0.1279398798942566, "loss_lvr": 0.8444621562957764, "loss_mode_switch": 0.0, "loss_total": 0.2123861014842987, "step": 1788 }, { "batch_size": 4, "epoch": 0.7152, "step": 1788, "tokens_per_device": 7728 }, { "epoch": 0.7152, "loss_ce": 0.39921051263809204, "loss_lvr": 0.8077555894851685, "loss_mode_switch": 0.0, "loss_total": 0.4799860715866089, "step": 1788 }, { "batch_size": 4, "epoch": 0.7152, "step": 1788, "tokens_per_device": 2680 }, { "epoch": 0.7152, "loss_ce": 0.5450671911239624, "loss_lvr": 0.9806300401687622, "loss_mode_switch": 0.0, "loss_total": 0.6431301832199097, "step": 1788 }, { "batch_size": 1, "epoch": 0.7152, "step": 1788, "tokens_per_device": 6736 }, { "epoch": 0.7152, "loss_ce": 0.021195674315094948, "loss_lvr": 0.345796674489975, "loss_mode_switch": 0.0, "loss_total": 0.055775344371795654, "step": 1788 }, { "batch_size": 4, "epoch": 0.7152, "step": 1788, "tokens_per_device": 1824 }, { "epoch": 0.7152, "loss_ce": 0.3360033333301544, "loss_lvr": 0.8609117865562439, "loss_mode_switch": 0.0, "loss_total": 0.42209452390670776, "step": 1788 }, { "batch_size": 4, "epoch": 0.7152, "step": 1788, "tokens_per_device": 2532 }, { "epoch": 0.7152, "loss_ce": 0.2716558873653412, "loss_lvr": 0.8059924244880676, "loss_mode_switch": 0.0, "loss_total": 0.3522551357746124, "step": 1788 }, { "batch_size": 4, "epoch": 0.7152, "step": 1788, "tokens_per_device": 1192 }, { "epoch": 0.7152, "loss_ce": 0.302792489528656, "loss_lvr": 1.07221519947052, "loss_mode_switch": 0.0, "loss_total": 0.41001400351524353, "step": 1788 }, { "batch_size": 1, "epoch": 0.7152, "step": 1788, "tokens_per_device": 5210 }, { "epoch": 0.7152, "loss_ce": 0.04566110298037529, "loss_lvr": 0.3776048421859741, "loss_mode_switch": 0.0, "loss_total": 0.08342158794403076, "step": 1788 }, { "epoch": 0.7156, "grad_norm": 1.4807779788970947, "learning_rate": 1.975284863937352e-06, "loss": 0.3254, "step": 1789 }, { "batch_size": 1, "epoch": 0.7156, "step": 1789, "tokens_per_device": 5182 }, { "epoch": 0.7156, "loss_ce": 0.0003999464097432792, "loss_lvr": 0.4694885313510895, "loss_mode_switch": 0.0, "loss_total": 0.04734880104660988, "step": 1789 }, { "batch_size": 4, "epoch": 0.7156, "step": 1789, "tokens_per_device": 4588 }, { "epoch": 0.7156, "loss_ce": 0.2797544300556183, "loss_lvr": 0.9566028118133545, "loss_mode_switch": 0.0, "loss_total": 0.37541472911834717, "step": 1789 }, { "batch_size": 4, "epoch": 0.7156, "step": 1789, "tokens_per_device": 1720 }, { "epoch": 0.7156, "loss_ce": 0.49612435698509216, "loss_lvr": 1.0540980100631714, "loss_mode_switch": 0.0, "loss_total": 0.6015341281890869, "step": 1789 }, { "batch_size": 4, "epoch": 0.7156, "step": 1789, "tokens_per_device": 4316 }, { "epoch": 0.7156, "loss_ce": 0.40260282158851624, "loss_lvr": 0.8312495946884155, "loss_mode_switch": 0.0, "loss_total": 0.48572778701782227, "step": 1789 }, { "batch_size": 1, "epoch": 0.7156, "step": 1789, "tokens_per_device": 4906 }, { "epoch": 0.7156, "loss_ce": 0.003923510201275349, "loss_lvr": 0.17765414714813232, "loss_mode_switch": 0.0, "loss_total": 0.021688926964998245, "step": 1789 }, { "batch_size": 1, "epoch": 0.7156, "step": 1789, "tokens_per_device": 4895 }, { "epoch": 0.7156, "loss_ce": 0.011188517324626446, "loss_lvr": 0.6786456108093262, "loss_mode_switch": 0.0, "loss_total": 0.07905307412147522, "step": 1789 }, { "batch_size": 1, "epoch": 0.7156, "step": 1789, "tokens_per_device": 5014 }, { "epoch": 0.7156, "loss_ce": 0.008584371767938137, "loss_lvr": 0.45361751317977905, "loss_mode_switch": 0.0, "loss_total": 0.053946126252412796, "step": 1789 }, { "batch_size": 4, "epoch": 0.7156, "step": 1789, "tokens_per_device": 2736 }, { "epoch": 0.7156, "loss_ce": 0.3849976360797882, "loss_lvr": 0.693360447883606, "loss_mode_switch": 0.0, "loss_total": 0.45433369278907776, "step": 1789 }, { "epoch": 0.716, "grad_norm": 1.3258392810821533, "learning_rate": 1.9701295642719836e-06, "loss": 0.2991, "step": 1790 }, { "batch_size": 4, "epoch": 0.716, "step": 1790, "tokens_per_device": 4236 }, { "epoch": 0.716, "loss_ce": 0.4145825505256653, "loss_lvr": 1.337157964706421, "loss_mode_switch": 0.0, "loss_total": 0.5482983589172363, "step": 1790 }, { "batch_size": 4, "epoch": 0.716, "step": 1790, "tokens_per_device": 6700 }, { "epoch": 0.716, "loss_ce": 0.19459810853004456, "loss_lvr": 0.8730063438415527, "loss_mode_switch": 0.0, "loss_total": 0.28189873695373535, "step": 1790 }, { "batch_size": 1, "epoch": 0.716, "step": 1790, "tokens_per_device": 5174 }, { "epoch": 0.716, "loss_ce": 0.0028433697298169136, "loss_lvr": 0.3847626745700836, "loss_mode_switch": 0.0, "loss_total": 0.04131963849067688, "step": 1790 }, { "batch_size": 4, "epoch": 0.716, "step": 1790, "tokens_per_device": 9448 }, { "epoch": 0.716, "loss_ce": 0.011758853681385517, "loss_lvr": 0.6578043699264526, "loss_mode_switch": 0.0, "loss_total": 0.07753929495811462, "step": 1790 }, { "batch_size": 4, "epoch": 0.716, "step": 1790, "tokens_per_device": 4232 }, { "epoch": 0.716, "loss_ce": 0.25522658228874207, "loss_lvr": 0.8628894686698914, "loss_mode_switch": 0.0, "loss_total": 0.34151554107666016, "step": 1790 }, { "batch_size": 4, "epoch": 0.716, "step": 1790, "tokens_per_device": 5284 }, { "epoch": 0.716, "loss_ce": 0.0445830337703228, "loss_lvr": 0.7076642513275146, "loss_mode_switch": 0.0, "loss_total": 0.11534945666790009, "step": 1790 }, { "batch_size": 4, "epoch": 0.716, "step": 1790, "tokens_per_device": 6148 }, { "epoch": 0.716, "loss_ce": 0.07251647114753723, "loss_lvr": 0.6973031759262085, "loss_mode_switch": 0.0, "loss_total": 0.1422467827796936, "step": 1790 }, { "batch_size": 4, "epoch": 0.716, "step": 1790, "tokens_per_device": 9128 }, { "epoch": 0.716, "loss_ce": 0.36017030477523804, "loss_lvr": 0.8506393432617188, "loss_mode_switch": 0.0, "loss_total": 0.4452342391014099, "step": 1790 }, { "epoch": 0.7164, "grad_norm": 1.690218448638916, "learning_rate": 1.9649793497154953e-06, "loss": 0.2756, "step": 1791 }, { "batch_size": 1, "epoch": 0.7164, "step": 1791, "tokens_per_device": 5018 }, { "epoch": 0.7164, "loss_ce": 0.21101093292236328, "loss_lvr": 0.309415340423584, "loss_mode_switch": 0.0, "loss_total": 0.24195246398448944, "step": 1791 }, { "batch_size": 1, "epoch": 0.7164, "step": 1791, "tokens_per_device": 4924 }, { "epoch": 0.7164, "loss_ce": 0.5430828332901001, "loss_lvr": 0.37995025515556335, "loss_mode_switch": 0.0, "loss_total": 0.5810778737068176, "step": 1791 }, { "batch_size": 1, "epoch": 0.7164, "step": 1791, "tokens_per_device": 4942 }, { "epoch": 0.7164, "loss_ce": 0.17890797555446625, "loss_lvr": 0.7969184517860413, "loss_mode_switch": 0.0, "loss_total": 0.25859981775283813, "step": 1791 }, { "batch_size": 4, "epoch": 0.7164, "step": 1791, "tokens_per_device": 1428 }, { "epoch": 0.7164, "loss_ce": 0.31994324922561646, "loss_lvr": 0.8678925633430481, "loss_mode_switch": 0.0, "loss_total": 0.4067324995994568, "step": 1791 }, { "batch_size": 4, "epoch": 0.7164, "step": 1791, "tokens_per_device": 2624 }, { "epoch": 0.7164, "loss_ce": 0.21256744861602783, "loss_lvr": 1.0247458219528198, "loss_mode_switch": 0.0, "loss_total": 0.31504201889038086, "step": 1791 }, { "batch_size": 4, "epoch": 0.7164, "step": 1791, "tokens_per_device": 4192 }, { "epoch": 0.7164, "loss_ce": 0.13099974393844604, "loss_lvr": 1.7570956945419312, "loss_mode_switch": 0.0, "loss_total": 0.30670931935310364, "step": 1791 }, { "batch_size": 4, "epoch": 0.7164, "step": 1791, "tokens_per_device": 6028 }, { "epoch": 0.7164, "loss_ce": 0.04993583634495735, "loss_lvr": 0.797110915184021, "loss_mode_switch": 0.0, "loss_total": 0.1296469271183014, "step": 1791 }, { "batch_size": 4, "epoch": 0.7164, "step": 1791, "tokens_per_device": 2656 }, { "epoch": 0.7164, "loss_ce": 0.07697851210832596, "loss_lvr": 1.0148481130599976, "loss_mode_switch": 0.0, "loss_total": 0.17846332490444183, "step": 1791 }, { "epoch": 0.7168, "grad_norm": 1.4377254247665405, "learning_rate": 1.959834228911624e-06, "loss": 0.2659, "step": 1792 }, { "batch_size": 4, "epoch": 0.7168, "step": 1792, "tokens_per_device": 2344 }, { "epoch": 0.7168, "loss_ce": 0.12338527292013168, "loss_lvr": 0.9124197959899902, "loss_mode_switch": 0.0, "loss_total": 0.2146272510290146, "step": 1792 }, { "batch_size": 1, "epoch": 0.7168, "step": 1792, "tokens_per_device": 4906 }, { "epoch": 0.7168, "loss_ce": 0.46668732166290283, "loss_lvr": 0.6691646575927734, "loss_mode_switch": 0.0, "loss_total": 0.5336037874221802, "step": 1792 }, { "batch_size": 4, "epoch": 0.7168, "step": 1792, "tokens_per_device": 5392 }, { "epoch": 0.7168, "loss_ce": 0.43254369497299194, "loss_lvr": 0.6574980020523071, "loss_mode_switch": 0.0, "loss_total": 0.4982934892177582, "step": 1792 }, { "batch_size": 4, "epoch": 0.7168, "step": 1792, "tokens_per_device": 5680 }, { "epoch": 0.7168, "loss_ce": 0.15688848495483398, "loss_lvr": 0.7782231569290161, "loss_mode_switch": 0.0, "loss_total": 0.23471081256866455, "step": 1792 }, { "batch_size": 1, "epoch": 0.7168, "step": 1792, "tokens_per_device": 4889 }, { "epoch": 0.7168, "loss_ce": 0.04974537715315819, "loss_lvr": 0.551598846912384, "loss_mode_switch": 0.0, "loss_total": 0.10490526258945465, "step": 1792 }, { "batch_size": 4, "epoch": 0.7168, "step": 1792, "tokens_per_device": 4252 }, { "epoch": 0.7168, "loss_ce": 0.22296175360679626, "loss_lvr": 0.7020492553710938, "loss_mode_switch": 0.0, "loss_total": 0.2931666970252991, "step": 1792 }, { "batch_size": 1, "epoch": 0.7168, "step": 1792, "tokens_per_device": 5084 }, { "epoch": 0.7168, "loss_ce": 0.05825074017047882, "loss_lvr": 0.46643391251564026, "loss_mode_switch": 0.0, "loss_total": 0.10489413142204285, "step": 1792 }, { "batch_size": 4, "epoch": 0.7168, "step": 1792, "tokens_per_device": 5688 }, { "epoch": 0.7168, "loss_ce": 0.38284504413604736, "loss_lvr": 0.8115602135658264, "loss_mode_switch": 0.0, "loss_total": 0.4640010595321655, "step": 1792 }, { "epoch": 0.7172, "grad_norm": 1.320093035697937, "learning_rate": 1.9546942104955567e-06, "loss": 0.2906, "step": 1793 }, { "batch_size": 4, "epoch": 0.7172, "step": 1793, "tokens_per_device": 4000 }, { "epoch": 0.7172, "loss_ce": 0.15426771342754364, "loss_lvr": 0.9436189532279968, "loss_mode_switch": 0.0, "loss_total": 0.2486295998096466, "step": 1793 }, { "batch_size": 4, "epoch": 0.7172, "step": 1793, "tokens_per_device": 5124 }, { "epoch": 0.7172, "loss_ce": 0.09366901218891144, "loss_lvr": 0.7771365642547607, "loss_mode_switch": 0.0, "loss_total": 0.17138266563415527, "step": 1793 }, { "batch_size": 4, "epoch": 0.7172, "step": 1793, "tokens_per_device": 1220 }, { "epoch": 0.7172, "loss_ce": 0.3637497127056122, "loss_lvr": 1.1958448886871338, "loss_mode_switch": 0.0, "loss_total": 0.4833342134952545, "step": 1793 }, { "batch_size": 1, "epoch": 0.7172, "step": 1793, "tokens_per_device": 4874 }, { "epoch": 0.7172, "loss_ce": 0.023525739088654518, "loss_lvr": 0.2680647075176239, "loss_mode_switch": 0.0, "loss_total": 0.050332210958004, "step": 1793 }, { "batch_size": 4, "epoch": 0.7172, "step": 1793, "tokens_per_device": 3932 }, { "epoch": 0.7172, "loss_ce": 0.47616761922836304, "loss_lvr": 0.779060959815979, "loss_mode_switch": 0.0, "loss_total": 0.554073691368103, "step": 1793 }, { "batch_size": 4, "epoch": 0.7172, "step": 1793, "tokens_per_device": 2664 }, { "epoch": 0.7172, "loss_ce": 0.3640934228897095, "loss_lvr": 0.7767912149429321, "loss_mode_switch": 0.0, "loss_total": 0.44177255034446716, "step": 1793 }, { "batch_size": 4, "epoch": 0.7172, "step": 1793, "tokens_per_device": 1540 }, { "epoch": 0.7172, "loss_ce": 0.2594732642173767, "loss_lvr": 1.1926848888397217, "loss_mode_switch": 0.0, "loss_total": 0.3787417411804199, "step": 1793 }, { "batch_size": 4, "epoch": 0.7172, "step": 1793, "tokens_per_device": 13904 }, { "epoch": 0.7172, "loss_ce": 0.0797196552157402, "loss_lvr": 0.7865285873413086, "loss_mode_switch": 0.0, "loss_total": 0.15837252140045166, "step": 1793 }, { "epoch": 0.7176, "grad_norm": 1.6491984128952026, "learning_rate": 1.949559303093916e-06, "loss": 0.2864, "step": 1794 }, { "batch_size": 4, "epoch": 0.7176, "step": 1794, "tokens_per_device": 5884 }, { "epoch": 0.7176, "loss_ce": 0.3092273473739624, "loss_lvr": 0.8340094685554504, "loss_mode_switch": 0.0, "loss_total": 0.3926283121109009, "step": 1794 }, { "batch_size": 4, "epoch": 0.7176, "step": 1794, "tokens_per_device": 2584 }, { "epoch": 0.7176, "loss_ce": 0.45424291491508484, "loss_lvr": 1.1137577295303345, "loss_mode_switch": 0.0, "loss_total": 0.5656186938285828, "step": 1794 }, { "batch_size": 4, "epoch": 0.7176, "step": 1794, "tokens_per_device": 1656 }, { "epoch": 0.7176, "loss_ce": 0.04063371196389198, "loss_lvr": 2.1825802326202393, "loss_mode_switch": 0.0, "loss_total": 0.2588917315006256, "step": 1794 }, { "batch_size": 4, "epoch": 0.7176, "step": 1794, "tokens_per_device": 4252 }, { "epoch": 0.7176, "loss_ce": 0.2538314461708069, "loss_lvr": 0.749069094657898, "loss_mode_switch": 0.0, "loss_total": 0.32873836159706116, "step": 1794 }, { "batch_size": 4, "epoch": 0.7176, "step": 1794, "tokens_per_device": 5032 }, { "epoch": 0.7176, "loss_ce": 0.001532080234028399, "loss_lvr": 0.8289458751678467, "loss_mode_switch": 0.0, "loss_total": 0.08442666381597519, "step": 1794 }, { "batch_size": 1, "epoch": 0.7176, "step": 1794, "tokens_per_device": 4735 }, { "epoch": 0.7176, "loss_ce": 0.009264168329536915, "loss_lvr": 0.3245663642883301, "loss_mode_switch": 0.0, "loss_total": 0.04172080382704735, "step": 1794 }, { "batch_size": 4, "epoch": 0.7176, "step": 1794, "tokens_per_device": 6640 }, { "epoch": 0.7176, "loss_ce": 0.03984212875366211, "loss_lvr": 0.8898875713348389, "loss_mode_switch": 0.0, "loss_total": 0.12883087992668152, "step": 1794 }, { "batch_size": 1, "epoch": 0.7176, "step": 1794, "tokens_per_device": 4986 }, { "epoch": 0.7176, "loss_ce": 0.3625811040401459, "loss_lvr": 0.3724447786808014, "loss_mode_switch": 0.0, "loss_total": 0.3998255729675293, "step": 1794 }, { "epoch": 0.718, "grad_norm": 1.3298159837722778, "learning_rate": 1.944429515324749e-06, "loss": 0.2343, "step": 1795 }, { "batch_size": 4, "epoch": 0.718, "step": 1795, "tokens_per_device": 3488 }, { "epoch": 0.718, "loss_ce": 0.5929558873176575, "loss_lvr": 0.8131378889083862, "loss_mode_switch": 0.0, "loss_total": 0.6742696762084961, "step": 1795 }, { "batch_size": 4, "epoch": 0.718, "step": 1795, "tokens_per_device": 4264 }, { "epoch": 0.718, "loss_ce": 0.09300102293491364, "loss_lvr": 0.8597028255462646, "loss_mode_switch": 0.0, "loss_total": 0.1789713054895401, "step": 1795 }, { "batch_size": 1, "epoch": 0.718, "step": 1795, "tokens_per_device": 5087 }, { "epoch": 0.718, "loss_ce": 0.015214493498206139, "loss_lvr": 0.39786186814308167, "loss_mode_switch": 0.0, "loss_total": 0.055000677704811096, "step": 1795 }, { "batch_size": 4, "epoch": 0.718, "step": 1795, "tokens_per_device": 4216 }, { "epoch": 0.718, "loss_ce": 0.7333062887191772, "loss_lvr": 0.8647220730781555, "loss_mode_switch": 0.0, "loss_total": 0.8197785019874573, "step": 1795 }, { "batch_size": 4, "epoch": 0.718, "step": 1795, "tokens_per_device": 4356 }, { "epoch": 0.718, "loss_ce": 0.22651846706867218, "loss_lvr": 0.8597287535667419, "loss_mode_switch": 0.0, "loss_total": 0.31249135732650757, "step": 1795 }, { "batch_size": 1, "epoch": 0.718, "step": 1795, "tokens_per_device": 4758 }, { "epoch": 0.718, "loss_ce": 0.011819826439023018, "loss_lvr": 0.36942964792251587, "loss_mode_switch": 0.0, "loss_total": 0.048762790858745575, "step": 1795 }, { "batch_size": 1, "epoch": 0.718, "step": 1795, "tokens_per_device": 6045 }, { "epoch": 0.718, "loss_ce": 0.007366623729467392, "loss_lvr": 0.4533403515815735, "loss_mode_switch": 0.0, "loss_total": 0.05270066112279892, "step": 1795 }, { "batch_size": 1, "epoch": 0.718, "step": 1795, "tokens_per_device": 5195 }, { "epoch": 0.718, "loss_ce": 0.041947610676288605, "loss_lvr": 0.39060530066490173, "loss_mode_switch": 0.0, "loss_total": 0.08100813627243042, "step": 1795 }, { "epoch": 0.7184, "grad_norm": 1.3266931772232056, "learning_rate": 1.939304855797511e-06, "loss": 0.3273, "step": 1796 }, { "batch_size": 1, "epoch": 0.7184, "step": 1796, "tokens_per_device": 5115 }, { "epoch": 0.7184, "loss_ce": 0.0018019680865108967, "loss_lvr": 0.499971866607666, "loss_mode_switch": 0.0, "loss_total": 0.05179915577173233, "step": 1796 }, { "batch_size": 4, "epoch": 0.7184, "step": 1796, "tokens_per_device": 1892 }, { "epoch": 0.7184, "loss_ce": 0.12519347667694092, "loss_lvr": 0.8769165277481079, "loss_mode_switch": 0.0, "loss_total": 0.21288514137268066, "step": 1796 }, { "batch_size": 4, "epoch": 0.7184, "step": 1796, "tokens_per_device": 8504 }, { "epoch": 0.7184, "loss_ce": 0.2628682255744934, "loss_lvr": 0.9294321537017822, "loss_mode_switch": 0.0, "loss_total": 0.3558114469051361, "step": 1796 }, { "batch_size": 4, "epoch": 0.7184, "step": 1796, "tokens_per_device": 13888 }, { "epoch": 0.7184, "loss_ce": 0.342474102973938, "loss_lvr": 0.9408480525016785, "loss_mode_switch": 0.0, "loss_total": 0.43655890226364136, "step": 1796 }, { "batch_size": 4, "epoch": 0.7184, "step": 1796, "tokens_per_device": 3788 }, { "epoch": 0.7184, "loss_ce": 0.308144748210907, "loss_lvr": 0.9764323234558105, "loss_mode_switch": 0.0, "loss_total": 0.40578797459602356, "step": 1796 }, { "batch_size": 4, "epoch": 0.7184, "step": 1796, "tokens_per_device": 1924 }, { "epoch": 0.7184, "loss_ce": 0.3052135407924652, "loss_lvr": 0.8034206628799438, "loss_mode_switch": 0.0, "loss_total": 0.385555624961853, "step": 1796 }, { "batch_size": 1, "epoch": 0.7184, "step": 1796, "tokens_per_device": 4937 }, { "epoch": 0.7184, "loss_ce": 0.08431228250265121, "loss_lvr": 0.4691375195980072, "loss_mode_switch": 0.0, "loss_total": 0.13122603297233582, "step": 1796 }, { "batch_size": 4, "epoch": 0.7184, "step": 1796, "tokens_per_device": 5444 }, { "epoch": 0.7184, "loss_ce": 0.031119532883167267, "loss_lvr": 0.59150630235672, "loss_mode_switch": 0.0, "loss_total": 0.09027016162872314, "step": 1796 }, { "epoch": 0.7188, "grad_norm": 1.2245889902114868, "learning_rate": 1.9341853331130472e-06, "loss": 0.2577, "step": 1797 }, { "batch_size": 4, "epoch": 0.7188, "step": 1797, "tokens_per_device": 3816 }, { "epoch": 0.7188, "loss_ce": 0.504164457321167, "loss_lvr": 1.0320167541503906, "loss_mode_switch": 0.0, "loss_total": 0.607366144657135, "step": 1797 }, { "batch_size": 4, "epoch": 0.7188, "step": 1797, "tokens_per_device": 4896 }, { "epoch": 0.7188, "loss_ce": 0.17592953145503998, "loss_lvr": 0.9234630465507507, "loss_mode_switch": 0.0, "loss_total": 0.26827582716941833, "step": 1797 }, { "batch_size": 1, "epoch": 0.7188, "step": 1797, "tokens_per_device": 4707 }, { "epoch": 0.7188, "loss_ce": 0.04602857679128647, "loss_lvr": 0.12015973031520844, "loss_mode_switch": 0.0, "loss_total": 0.05804454907774925, "step": 1797 }, { "batch_size": 4, "epoch": 0.7188, "step": 1797, "tokens_per_device": 3792 }, { "epoch": 0.7188, "loss_ce": 0.11361158639192581, "loss_lvr": 0.9240947365760803, "loss_mode_switch": 0.0, "loss_total": 0.20602107048034668, "step": 1797 }, { "batch_size": 1, "epoch": 0.7188, "step": 1797, "tokens_per_device": 4308 }, { "epoch": 0.7188, "loss_ce": 0.8074365258216858, "loss_lvr": 0.16909685730934143, "loss_mode_switch": 0.0, "loss_total": 0.8243461847305298, "step": 1797 }, { "batch_size": 1, "epoch": 0.7188, "step": 1797, "tokens_per_device": 5552 }, { "epoch": 0.7188, "loss_ce": 0.1388261765241623, "loss_lvr": 0.4273046851158142, "loss_mode_switch": 0.0, "loss_total": 0.18155664205551147, "step": 1797 }, { "batch_size": 4, "epoch": 0.7188, "step": 1797, "tokens_per_device": 6152 }, { "epoch": 0.7188, "loss_ce": 0.2984604239463806, "loss_lvr": 1.0219104290008545, "loss_mode_switch": 0.0, "loss_total": 0.4006514549255371, "step": 1797 }, { "batch_size": 4, "epoch": 0.7188, "step": 1797, "tokens_per_device": 1192 }, { "epoch": 0.7188, "loss_ce": 0.23513402044773102, "loss_lvr": 0.9351754188537598, "loss_mode_switch": 0.0, "loss_total": 0.3286515474319458, "step": 1797 }, { "epoch": 0.7192, "grad_norm": 1.1637850999832153, "learning_rate": 1.929070955863584e-06, "loss": 0.2759, "step": 1798 }, { "batch_size": 4, "epoch": 0.7192, "step": 1798, "tokens_per_device": 4000 }, { "epoch": 0.7192, "loss_ce": 0.19664068520069122, "loss_lvr": 0.8494246006011963, "loss_mode_switch": 0.0, "loss_total": 0.28158313035964966, "step": 1798 }, { "batch_size": 4, "epoch": 0.7192, "step": 1798, "tokens_per_device": 4180 }, { "epoch": 0.7192, "loss_ce": 0.4454749524593353, "loss_lvr": 0.7425463795661926, "loss_mode_switch": 0.0, "loss_total": 0.5197296142578125, "step": 1798 }, { "batch_size": 1, "epoch": 0.7192, "step": 1798, "tokens_per_device": 5169 }, { "epoch": 0.7192, "loss_ce": 0.007471733260899782, "loss_lvr": 0.44095534086227417, "loss_mode_switch": 0.0, "loss_total": 0.05156726762652397, "step": 1798 }, { "batch_size": 4, "epoch": 0.7192, "step": 1798, "tokens_per_device": 2764 }, { "epoch": 0.7192, "loss_ce": 0.7333957552909851, "loss_lvr": 0.7305013537406921, "loss_mode_switch": 0.0, "loss_total": 0.8064458966255188, "step": 1798 }, { "batch_size": 4, "epoch": 0.7192, "step": 1798, "tokens_per_device": 4672 }, { "epoch": 0.7192, "loss_ce": 0.7535578012466431, "loss_lvr": 0.989122211933136, "loss_mode_switch": 0.0, "loss_total": 0.8524700403213501, "step": 1798 }, { "batch_size": 4, "epoch": 0.7192, "step": 1798, "tokens_per_device": 6080 }, { "epoch": 0.7192, "loss_ce": 0.10530800372362137, "loss_lvr": 0.8108497262001038, "loss_mode_switch": 0.0, "loss_total": 0.18639297783374786, "step": 1798 }, { "batch_size": 4, "epoch": 0.7192, "step": 1798, "tokens_per_device": 4264 }, { "epoch": 0.7192, "loss_ce": 0.1889593005180359, "loss_lvr": 1.1061331033706665, "loss_mode_switch": 0.0, "loss_total": 0.299572616815567, "step": 1798 }, { "batch_size": 4, "epoch": 0.7192, "step": 1798, "tokens_per_device": 4232 }, { "epoch": 0.7192, "loss_ce": 0.3438129425048828, "loss_lvr": 1.012273907661438, "loss_mode_switch": 0.0, "loss_total": 0.44504034519195557, "step": 1798 }, { "epoch": 0.7196, "grad_norm": 1.13339364528656, "learning_rate": 1.923961732632709e-06, "loss": 0.2624, "step": 1799 }, { "batch_size": 4, "epoch": 0.7196, "step": 1799, "tokens_per_device": 5308 }, { "epoch": 0.7196, "loss_ce": 0.1679905503988266, "loss_lvr": 0.6339755654335022, "loss_mode_switch": 0.0, "loss_total": 0.23138810694217682, "step": 1799 }, { "batch_size": 4, "epoch": 0.7196, "step": 1799, "tokens_per_device": 5108 }, { "epoch": 0.7196, "loss_ce": 0.023834044113755226, "loss_lvr": 0.8952464461326599, "loss_mode_switch": 0.0, "loss_total": 0.11335869133472443, "step": 1799 }, { "batch_size": 4, "epoch": 0.7196, "step": 1799, "tokens_per_device": 3888 }, { "epoch": 0.7196, "loss_ce": 0.2435382604598999, "loss_lvr": 1.4753552675247192, "loss_mode_switch": 0.0, "loss_total": 0.3910737931728363, "step": 1799 }, { "batch_size": 4, "epoch": 0.7196, "step": 1799, "tokens_per_device": 4880 }, { "epoch": 0.7196, "loss_ce": 0.4622288942337036, "loss_lvr": 0.8149030804634094, "loss_mode_switch": 0.0, "loss_total": 0.5437191724777222, "step": 1799 }, { "batch_size": 4, "epoch": 0.7196, "step": 1799, "tokens_per_device": 4236 }, { "epoch": 0.7196, "loss_ce": 0.6991447806358337, "loss_lvr": 1.0943210124969482, "loss_mode_switch": 0.0, "loss_total": 0.8085768818855286, "step": 1799 }, { "batch_size": 1, "epoch": 0.7196, "step": 1799, "tokens_per_device": 5029 }, { "epoch": 0.7196, "loss_ce": 0.07839205116033554, "loss_lvr": 0.2749873399734497, "loss_mode_switch": 0.0, "loss_total": 0.10589078813791275, "step": 1799 }, { "batch_size": 1, "epoch": 0.7196, "step": 1799, "tokens_per_device": 5223 }, { "epoch": 0.7196, "loss_ce": 0.06971440464258194, "loss_lvr": 0.3508525490760803, "loss_mode_switch": 0.0, "loss_total": 0.10479965806007385, "step": 1799 }, { "batch_size": 1, "epoch": 0.7196, "step": 1799, "tokens_per_device": 5118 }, { "epoch": 0.7196, "loss_ce": 0.03881995379924774, "loss_lvr": 0.26000821590423584, "loss_mode_switch": 0.0, "loss_total": 0.0648207738995552, "step": 1799 }, { "epoch": 0.72, "grad_norm": 1.188171148300171, "learning_rate": 1.9188576719953635e-06, "loss": 0.2812, "step": 1800 }, { "batch_size": 4, "epoch": 0.72, "step": 1800, "tokens_per_device": 5304 }, { "epoch": 0.72, "loss_ce": 0.30926528573036194, "loss_lvr": 0.9037474393844604, "loss_mode_switch": 0.0, "loss_total": 0.3996400237083435, "step": 1800 }, { "batch_size": 1, "epoch": 0.72, "step": 1800, "tokens_per_device": 5142 }, { "epoch": 0.72, "loss_ce": 0.014146699570119381, "loss_lvr": 0.5504493117332458, "loss_mode_switch": 0.0, "loss_total": 0.06919162720441818, "step": 1800 }, { "batch_size": 4, "epoch": 0.72, "step": 1800, "tokens_per_device": 4608 }, { "epoch": 0.72, "loss_ce": 0.24702517688274384, "loss_lvr": 0.9165489673614502, "loss_mode_switch": 0.0, "loss_total": 0.33868008852005005, "step": 1800 }, { "batch_size": 1, "epoch": 0.72, "step": 1800, "tokens_per_device": 5065 }, { "epoch": 0.72, "loss_ce": 0.03236238285899162, "loss_lvr": 0.3095768094062805, "loss_mode_switch": 0.0, "loss_total": 0.06332006305456161, "step": 1800 }, { "batch_size": 4, "epoch": 0.72, "step": 1800, "tokens_per_device": 5000 }, { "epoch": 0.72, "loss_ce": 0.462220162153244, "loss_lvr": 0.7298151254653931, "loss_mode_switch": 0.0, "loss_total": 0.5352016687393188, "step": 1800 }, { "batch_size": 4, "epoch": 0.72, "step": 1800, "tokens_per_device": 4308 }, { "epoch": 0.72, "loss_ce": 0.3149374723434448, "loss_lvr": 0.7366325855255127, "loss_mode_switch": 0.0, "loss_total": 0.38860073685646057, "step": 1800 }, { "batch_size": 4, "epoch": 0.72, "step": 1800, "tokens_per_device": 1532 }, { "epoch": 0.72, "loss_ce": 0.26346084475517273, "loss_lvr": 0.8746880888938904, "loss_mode_switch": 0.0, "loss_total": 0.3509296476840973, "step": 1800 }, { "batch_size": 4, "epoch": 0.72, "step": 1800, "tokens_per_device": 8680 }, { "epoch": 0.72, "loss_ce": 0.12142603099346161, "loss_lvr": 0.9883880615234375, "loss_mode_switch": 0.0, "loss_total": 0.22026483714580536, "step": 1800 }, { "epoch": 0.7204, "grad_norm": 1.2502086162567139, "learning_rate": 1.9137587825178243e-06, "loss": 0.3117, "step": 1801 }, { "batch_size": 4, "epoch": 0.7204, "step": 1801, "tokens_per_device": 3784 }, { "epoch": 0.7204, "loss_ce": 0.5285930633544922, "loss_lvr": 1.0244688987731934, "loss_mode_switch": 0.0, "loss_total": 0.6310399770736694, "step": 1801 }, { "batch_size": 4, "epoch": 0.7204, "step": 1801, "tokens_per_device": 5672 }, { "epoch": 0.7204, "loss_ce": 0.21440725028514862, "loss_lvr": 0.8701440691947937, "loss_mode_switch": 0.0, "loss_total": 0.3014216423034668, "step": 1801 }, { "batch_size": 1, "epoch": 0.7204, "step": 1801, "tokens_per_device": 6731 }, { "epoch": 0.7204, "loss_ce": 0.0009637767216190696, "loss_lvr": 0.2956947982311249, "loss_mode_switch": 0.0, "loss_total": 0.030533257871866226, "step": 1801 }, { "batch_size": 4, "epoch": 0.7204, "step": 1801, "tokens_per_device": 5080 }, { "epoch": 0.7204, "loss_ce": 0.323423832654953, "loss_lvr": 0.8498170375823975, "loss_mode_switch": 0.0, "loss_total": 0.4084055423736572, "step": 1801 }, { "batch_size": 4, "epoch": 0.7204, "step": 1801, "tokens_per_device": 12656 }, { "epoch": 0.7204, "loss_ce": 0.15134483575820923, "loss_lvr": 0.6071107387542725, "loss_mode_switch": 0.0, "loss_total": 0.21205590665340424, "step": 1801 }, { "batch_size": 1, "epoch": 0.7204, "step": 1801, "tokens_per_device": 5159 }, { "epoch": 0.7204, "loss_ce": 0.00053039425984025, "loss_lvr": 0.3603786826133728, "loss_mode_switch": 0.0, "loss_total": 0.03656826540827751, "step": 1801 }, { "batch_size": 4, "epoch": 0.7204, "step": 1801, "tokens_per_device": 4416 }, { "epoch": 0.7204, "loss_ce": 0.08307037502527237, "loss_lvr": 0.8670881986618042, "loss_mode_switch": 0.0, "loss_total": 0.1697791963815689, "step": 1801 }, { "batch_size": 4, "epoch": 0.7204, "step": 1801, "tokens_per_device": 4632 }, { "epoch": 0.7204, "loss_ce": 0.09593348950147629, "loss_lvr": 0.8722600340843201, "loss_mode_switch": 0.0, "loss_total": 0.1831595003604889, "step": 1801 }, { "epoch": 0.7208, "grad_norm": 1.2556170225143433, "learning_rate": 1.908665072757687e-06, "loss": 0.2708, "step": 1802 }, { "batch_size": 4, "epoch": 0.7208, "step": 1802, "tokens_per_device": 5940 }, { "epoch": 0.7208, "loss_ce": 0.1037852093577385, "loss_lvr": 0.9747210144996643, "loss_mode_switch": 0.0, "loss_total": 0.20125731825828552, "step": 1802 }, { "batch_size": 1, "epoch": 0.7208, "step": 1802, "tokens_per_device": 4789 }, { "epoch": 0.7208, "loss_ce": 0.13271306455135345, "loss_lvr": 0.2025807946920395, "loss_mode_switch": 0.0, "loss_total": 0.15297114849090576, "step": 1802 }, { "batch_size": 1, "epoch": 0.7208, "step": 1802, "tokens_per_device": 4880 }, { "epoch": 0.7208, "loss_ce": 0.006253224331885576, "loss_lvr": 0.3155136704444885, "loss_mode_switch": 0.0, "loss_total": 0.03780459240078926, "step": 1802 }, { "batch_size": 4, "epoch": 0.7208, "step": 1802, "tokens_per_device": 2656 }, { "epoch": 0.7208, "loss_ce": 0.39801740646362305, "loss_lvr": 0.7003777623176575, "loss_mode_switch": 0.0, "loss_total": 0.46805518865585327, "step": 1802 }, { "batch_size": 4, "epoch": 0.7208, "step": 1802, "tokens_per_device": 10792 }, { "epoch": 0.7208, "loss_ce": 0.08933147042989731, "loss_lvr": 0.7355367541313171, "loss_mode_switch": 0.0, "loss_total": 0.1628851443529129, "step": 1802 }, { "batch_size": 1, "epoch": 0.7208, "step": 1802, "tokens_per_device": 5095 }, { "epoch": 0.7208, "loss_ce": 0.11111968755722046, "loss_lvr": 0.3866909444332123, "loss_mode_switch": 0.0, "loss_total": 0.1497887820005417, "step": 1802 }, { "batch_size": 4, "epoch": 0.7208, "step": 1802, "tokens_per_device": 9440 }, { "epoch": 0.7208, "loss_ce": 0.49625349044799805, "loss_lvr": 0.7704935669898987, "loss_mode_switch": 0.0, "loss_total": 0.5733028650283813, "step": 1802 }, { "batch_size": 1, "epoch": 0.7208, "step": 1802, "tokens_per_device": 5167 }, { "epoch": 0.7208, "loss_ce": 0.07069779932498932, "loss_lvr": 0.37375280261039734, "loss_mode_switch": 0.0, "loss_total": 0.10807308554649353, "step": 1802 }, { "epoch": 0.7212, "grad_norm": 1.4384032487869263, "learning_rate": 1.903576551263852e-06, "loss": 0.3342, "step": 1803 }, { "batch_size": 4, "epoch": 0.7212, "step": 1803, "tokens_per_device": 5744 }, { "epoch": 0.7212, "loss_ce": 0.24440667033195496, "loss_lvr": 0.9407362341880798, "loss_mode_switch": 0.0, "loss_total": 0.33848029375076294, "step": 1803 }, { "batch_size": 4, "epoch": 0.7212, "step": 1803, "tokens_per_device": 5672 }, { "epoch": 0.7212, "loss_ce": 0.03192088380455971, "loss_lvr": 0.6638543009757996, "loss_mode_switch": 0.0, "loss_total": 0.0983063131570816, "step": 1803 }, { "batch_size": 1, "epoch": 0.7212, "step": 1803, "tokens_per_device": 5097 }, { "epoch": 0.7212, "loss_ce": 0.0006028336938470602, "loss_lvr": 0.8671817779541016, "loss_mode_switch": 0.0, "loss_total": 0.08732101321220398, "step": 1803 }, { "batch_size": 4, "epoch": 0.7212, "step": 1803, "tokens_per_device": 4352 }, { "epoch": 0.7212, "loss_ce": 0.42031845450401306, "loss_lvr": 0.7032518982887268, "loss_mode_switch": 0.0, "loss_total": 0.4906436502933502, "step": 1803 }, { "batch_size": 4, "epoch": 0.7212, "step": 1803, "tokens_per_device": 3788 }, { "epoch": 0.7212, "loss_ce": 0.27838706970214844, "loss_lvr": 0.7223100662231445, "loss_mode_switch": 0.0, "loss_total": 0.35061806440353394, "step": 1803 }, { "batch_size": 1, "epoch": 0.7212, "step": 1803, "tokens_per_device": 4878 }, { "epoch": 0.7212, "loss_ce": 0.5248039960861206, "loss_lvr": 0.6394862532615662, "loss_mode_switch": 0.0, "loss_total": 0.5887526273727417, "step": 1803 }, { "batch_size": 4, "epoch": 0.7212, "step": 1803, "tokens_per_device": 3784 }, { "epoch": 0.7212, "loss_ce": 0.7411195039749146, "loss_lvr": 1.2528789043426514, "loss_mode_switch": 0.0, "loss_total": 0.8664073944091797, "step": 1803 }, { "batch_size": 4, "epoch": 0.7212, "step": 1803, "tokens_per_device": 2908 }, { "epoch": 0.7212, "loss_ce": 0.6148862838745117, "loss_lvr": 1.6848212480545044, "loss_mode_switch": 0.0, "loss_total": 0.7833684086799622, "step": 1803 }, { "epoch": 0.7216, "grad_norm": 1.2939209938049316, "learning_rate": 1.8984932265765172e-06, "loss": 0.2889, "step": 1804 }, { "batch_size": 4, "epoch": 0.7216, "step": 1804, "tokens_per_device": 3844 }, { "epoch": 0.7216, "loss_ce": 0.3468623757362366, "loss_lvr": 0.8868784308433533, "loss_mode_switch": 0.0, "loss_total": 0.4355502128601074, "step": 1804 }, { "batch_size": 4, "epoch": 0.7216, "step": 1804, "tokens_per_device": 1344 }, { "epoch": 0.7216, "loss_ce": 0.34140118956565857, "loss_lvr": 1.0161373615264893, "loss_mode_switch": 0.0, "loss_total": 0.443014919757843, "step": 1804 }, { "batch_size": 4, "epoch": 0.7216, "step": 1804, "tokens_per_device": 13188 }, { "epoch": 0.7216, "loss_ce": 0.22171221673488617, "loss_lvr": 0.6570738554000854, "loss_mode_switch": 0.0, "loss_total": 0.2874196171760559, "step": 1804 }, { "batch_size": 1, "epoch": 0.7216, "step": 1804, "tokens_per_device": 4888 }, { "epoch": 0.7216, "loss_ce": 0.006057138089090586, "loss_lvr": 0.8614751696586609, "loss_mode_switch": 0.0, "loss_total": 0.09220465272665024, "step": 1804 }, { "batch_size": 1, "epoch": 0.7216, "step": 1804, "tokens_per_device": 5171 }, { "epoch": 0.7216, "loss_ce": 0.0037635352928191423, "loss_lvr": 0.40848827362060547, "loss_mode_switch": 0.0, "loss_total": 0.04461236298084259, "step": 1804 }, { "batch_size": 4, "epoch": 0.7216, "step": 1804, "tokens_per_device": 3740 }, { "epoch": 0.7216, "loss_ce": 0.34525537490844727, "loss_lvr": 0.8834533095359802, "loss_mode_switch": 0.0, "loss_total": 0.4336007237434387, "step": 1804 }, { "batch_size": 4, "epoch": 0.7216, "step": 1804, "tokens_per_device": 4208 }, { "epoch": 0.7216, "loss_ce": 0.06919941306114197, "loss_lvr": 0.8983452320098877, "loss_mode_switch": 0.0, "loss_total": 0.15903393924236298, "step": 1804 }, { "batch_size": 4, "epoch": 0.7216, "step": 1804, "tokens_per_device": 4512 }, { "epoch": 0.7216, "loss_ce": 0.3586830198764801, "loss_lvr": 0.7209506630897522, "loss_mode_switch": 0.0, "loss_total": 0.4307780861854553, "step": 1804 }, { "epoch": 0.722, "grad_norm": 1.1840420961380005, "learning_rate": 1.8934151072271573e-06, "loss": 0.2523, "step": 1805 }, { "batch_size": 4, "epoch": 0.722, "step": 1805, "tokens_per_device": 4596 }, { "epoch": 0.722, "loss_ce": 0.4542897939682007, "loss_lvr": 0.693102240562439, "loss_mode_switch": 0.0, "loss_total": 0.5236000418663025, "step": 1805 }, { "batch_size": 4, "epoch": 0.722, "step": 1805, "tokens_per_device": 4776 }, { "epoch": 0.722, "loss_ce": 0.13621622323989868, "loss_lvr": 0.4917882978916168, "loss_mode_switch": 0.0, "loss_total": 0.18539506196975708, "step": 1805 }, { "batch_size": 4, "epoch": 0.722, "step": 1805, "tokens_per_device": 1528 }, { "epoch": 0.722, "loss_ce": 0.21545153856277466, "loss_lvr": 0.8953701257705688, "loss_mode_switch": 0.0, "loss_total": 0.3049885630607605, "step": 1805 }, { "batch_size": 4, "epoch": 0.722, "step": 1805, "tokens_per_device": 3744 }, { "epoch": 0.722, "loss_ce": 0.24624699354171753, "loss_lvr": 0.9265462756156921, "loss_mode_switch": 0.0, "loss_total": 0.3389016389846802, "step": 1805 }, { "batch_size": 4, "epoch": 0.722, "step": 1805, "tokens_per_device": 1380 }, { "epoch": 0.722, "loss_ce": 0.3824089467525482, "loss_lvr": 0.978413462638855, "loss_mode_switch": 0.0, "loss_total": 0.4802502989768982, "step": 1805 }, { "batch_size": 4, "epoch": 0.722, "step": 1805, "tokens_per_device": 3760 }, { "epoch": 0.722, "loss_ce": 0.11772861331701279, "loss_lvr": 0.9042572379112244, "loss_mode_switch": 0.0, "loss_total": 0.2081543356180191, "step": 1805 }, { "batch_size": 4, "epoch": 0.722, "step": 1805, "tokens_per_device": 4292 }, { "epoch": 0.722, "loss_ce": 0.4510815739631653, "loss_lvr": 0.8862505555152893, "loss_mode_switch": 0.0, "loss_total": 0.5397066473960876, "step": 1805 }, { "batch_size": 1, "epoch": 0.722, "step": 1805, "tokens_per_device": 5094 }, { "epoch": 0.722, "loss_ce": 0.6824413537979126, "loss_lvr": 0.36962538957595825, "loss_mode_switch": 0.0, "loss_total": 0.719403862953186, "step": 1805 }, { "epoch": 0.7224, "grad_norm": 1.4070501327514648, "learning_rate": 1.8883422017385078e-06, "loss": 0.3213, "step": 1806 }, { "batch_size": 4, "epoch": 0.7224, "step": 1806, "tokens_per_device": 4204 }, { "epoch": 0.7224, "loss_ce": 0.14161072671413422, "loss_lvr": 0.8662477731704712, "loss_mode_switch": 0.0, "loss_total": 0.22823551297187805, "step": 1806 }, { "batch_size": 4, "epoch": 0.7224, "step": 1806, "tokens_per_device": 5096 }, { "epoch": 0.7224, "loss_ce": 0.2619784474372864, "loss_lvr": 0.7537451386451721, "loss_mode_switch": 0.0, "loss_total": 0.3373529613018036, "step": 1806 }, { "batch_size": 4, "epoch": 0.7224, "step": 1806, "tokens_per_device": 5292 }, { "epoch": 0.7224, "loss_ce": 0.016174716874957085, "loss_lvr": 0.664866030216217, "loss_mode_switch": 0.0, "loss_total": 0.08266132324934006, "step": 1806 }, { "batch_size": 1, "epoch": 0.7224, "step": 1806, "tokens_per_device": 4775 }, { "epoch": 0.7224, "loss_ce": 0.12138994038105011, "loss_lvr": 0.22541305422782898, "loss_mode_switch": 0.0, "loss_total": 0.14393123984336853, "step": 1806 }, { "batch_size": 4, "epoch": 0.7224, "step": 1806, "tokens_per_device": 2716 }, { "epoch": 0.7224, "loss_ce": 0.7372657656669617, "loss_lvr": 0.6388523578643799, "loss_mode_switch": 0.0, "loss_total": 0.8011509776115417, "step": 1806 }, { "batch_size": 1, "epoch": 0.7224, "step": 1806, "tokens_per_device": 4935 }, { "epoch": 0.7224, "loss_ce": 0.017567720264196396, "loss_lvr": 0.6439751386642456, "loss_mode_switch": 0.0, "loss_total": 0.08196523785591125, "step": 1806 }, { "batch_size": 1, "epoch": 0.7224, "step": 1806, "tokens_per_device": 5109 }, { "epoch": 0.7224, "loss_ce": 0.00290106562897563, "loss_lvr": 0.32880353927612305, "loss_mode_switch": 0.0, "loss_total": 0.03578142076730728, "step": 1806 }, { "batch_size": 4, "epoch": 0.7224, "step": 1806, "tokens_per_device": 2656 }, { "epoch": 0.7224, "loss_ce": 0.4824078679084778, "loss_lvr": 0.8154881596565247, "loss_mode_switch": 0.0, "loss_total": 0.5639566779136658, "step": 1806 }, { "epoch": 0.7228, "grad_norm": 1.4832077026367188, "learning_rate": 1.883274518624556e-06, "loss": 0.3073, "step": 1807 }, { "batch_size": 1, "epoch": 0.7228, "step": 1807, "tokens_per_device": 4852 }, { "epoch": 0.7228, "loss_ce": 0.009128283709287643, "loss_lvr": 0.38742339611053467, "loss_mode_switch": 0.0, "loss_total": 0.04787062481045723, "step": 1807 }, { "batch_size": 4, "epoch": 0.7228, "step": 1807, "tokens_per_device": 2664 }, { "epoch": 0.7228, "loss_ce": 0.2903619110584259, "loss_lvr": 0.786098301410675, "loss_mode_switch": 0.0, "loss_total": 0.36897173523902893, "step": 1807 }, { "batch_size": 4, "epoch": 0.7228, "step": 1807, "tokens_per_device": 1412 }, { "epoch": 0.7228, "loss_ce": 0.18847984075546265, "loss_lvr": 0.9699251651763916, "loss_mode_switch": 0.0, "loss_total": 0.2854723632335663, "step": 1807 }, { "batch_size": 4, "epoch": 0.7228, "step": 1807, "tokens_per_device": 1268 }, { "epoch": 0.7228, "loss_ce": 0.30845728516578674, "loss_lvr": 1.131529450416565, "loss_mode_switch": 0.0, "loss_total": 0.4216102361679077, "step": 1807 }, { "batch_size": 4, "epoch": 0.7228, "step": 1807, "tokens_per_device": 3940 }, { "epoch": 0.7228, "loss_ce": 0.20810148119926453, "loss_lvr": 0.8949018120765686, "loss_mode_switch": 0.0, "loss_total": 0.2975916564464569, "step": 1807 }, { "batch_size": 1, "epoch": 0.7228, "step": 1807, "tokens_per_device": 4894 }, { "epoch": 0.7228, "loss_ce": 0.22361640632152557, "loss_lvr": 0.7255698442459106, "loss_mode_switch": 0.0, "loss_total": 0.2961733937263489, "step": 1807 }, { "batch_size": 4, "epoch": 0.7228, "step": 1807, "tokens_per_device": 4280 }, { "epoch": 0.7228, "loss_ce": 0.24652409553527832, "loss_lvr": 0.8944474458694458, "loss_mode_switch": 0.0, "loss_total": 0.33596885204315186, "step": 1807 }, { "batch_size": 4, "epoch": 0.7228, "step": 1807, "tokens_per_device": 4712 }, { "epoch": 0.7228, "loss_ce": 0.3940828740596771, "loss_lvr": 1.2291938066482544, "loss_mode_switch": 0.0, "loss_total": 0.5170022249221802, "step": 1807 }, { "epoch": 0.7232, "grad_norm": 1.3665755987167358, "learning_rate": 1.8782120663905218e-06, "loss": 0.2985, "step": 1808 }, { "batch_size": 4, "epoch": 0.7232, "step": 1808, "tokens_per_device": 4416 }, { "epoch": 0.7232, "loss_ce": 0.03087005764245987, "loss_lvr": 0.9033183455467224, "loss_mode_switch": 0.0, "loss_total": 0.12120189517736435, "step": 1808 }, { "batch_size": 4, "epoch": 0.7232, "step": 1808, "tokens_per_device": 1436 }, { "epoch": 0.7232, "loss_ce": 0.6560802459716797, "loss_lvr": 1.0076565742492676, "loss_mode_switch": 0.0, "loss_total": 0.7568458914756775, "step": 1808 }, { "batch_size": 4, "epoch": 0.7232, "step": 1808, "tokens_per_device": 4208 }, { "epoch": 0.7232, "loss_ce": 0.4621763527393341, "loss_lvr": 1.076617956161499, "loss_mode_switch": 0.0, "loss_total": 0.5698381662368774, "step": 1808 }, { "batch_size": 1, "epoch": 0.7232, "step": 1808, "tokens_per_device": 4755 }, { "epoch": 0.7232, "loss_ce": 0.11756972223520279, "loss_lvr": 0.4106624722480774, "loss_mode_switch": 0.0, "loss_total": 0.1586359739303589, "step": 1808 }, { "batch_size": 4, "epoch": 0.7232, "step": 1808, "tokens_per_device": 4180 }, { "epoch": 0.7232, "loss_ce": 0.41045957803726196, "loss_lvr": 0.9145803451538086, "loss_mode_switch": 0.0, "loss_total": 0.5019176006317139, "step": 1808 }, { "batch_size": 4, "epoch": 0.7232, "step": 1808, "tokens_per_device": 3780 }, { "epoch": 0.7232, "loss_ce": 0.29921796917915344, "loss_lvr": 0.8122944235801697, "loss_mode_switch": 0.0, "loss_total": 0.3804474174976349, "step": 1808 }, { "batch_size": 1, "epoch": 0.7232, "step": 1808, "tokens_per_device": 5096 }, { "epoch": 0.7232, "loss_ce": 0.48866620659828186, "loss_lvr": 0.42309361696243286, "loss_mode_switch": 0.0, "loss_total": 0.5309755802154541, "step": 1808 }, { "batch_size": 4, "epoch": 0.7232, "step": 1808, "tokens_per_device": 1904 }, { "epoch": 0.7232, "loss_ce": 0.36454567313194275, "loss_lvr": 0.8583644032478333, "loss_mode_switch": 0.0, "loss_total": 0.4503821134567261, "step": 1808 }, { "epoch": 0.7236, "grad_norm": 1.393337368965149, "learning_rate": 1.8731548535328497e-06, "loss": 0.2999, "step": 1809 }, { "batch_size": 4, "epoch": 0.7236, "step": 1809, "tokens_per_device": 4220 }, { "epoch": 0.7236, "loss_ce": 0.2703971266746521, "loss_lvr": 0.8224542140960693, "loss_mode_switch": 0.0, "loss_total": 0.3526425361633301, "step": 1809 }, { "batch_size": 4, "epoch": 0.7236, "step": 1809, "tokens_per_device": 2664 }, { "epoch": 0.7236, "loss_ce": 0.37122592329978943, "loss_lvr": 0.757668137550354, "loss_mode_switch": 0.0, "loss_total": 0.44699275493621826, "step": 1809 }, { "batch_size": 1, "epoch": 0.7236, "step": 1809, "tokens_per_device": 4801 }, { "epoch": 0.7236, "loss_ce": 0.13026443123817444, "loss_lvr": 0.43530943989753723, "loss_mode_switch": 0.0, "loss_total": 0.17379537224769592, "step": 1809 }, { "batch_size": 1, "epoch": 0.7236, "step": 1809, "tokens_per_device": 5158 }, { "epoch": 0.7236, "loss_ce": 0.0003090747050009668, "loss_lvr": 0.39066773653030396, "loss_mode_switch": 0.0, "loss_total": 0.039375849068164825, "step": 1809 }, { "batch_size": 1, "epoch": 0.7236, "step": 1809, "tokens_per_device": 5047 }, { "epoch": 0.7236, "loss_ce": 0.12459111213684082, "loss_lvr": 0.5685986876487732, "loss_mode_switch": 0.0, "loss_total": 0.1814509779214859, "step": 1809 }, { "batch_size": 1, "epoch": 0.7236, "step": 1809, "tokens_per_device": 4893 }, { "epoch": 0.7236, "loss_ce": 0.006009550299495459, "loss_lvr": 0.36842241883277893, "loss_mode_switch": 0.0, "loss_total": 0.04285179078578949, "step": 1809 }, { "batch_size": 4, "epoch": 0.7236, "step": 1809, "tokens_per_device": 1780 }, { "epoch": 0.7236, "loss_ce": 1.003005027770996, "loss_lvr": 0.9210188984870911, "loss_mode_switch": 0.0, "loss_total": 1.0951069593429565, "step": 1809 }, { "batch_size": 1, "epoch": 0.7236, "step": 1809, "tokens_per_device": 5163 }, { "epoch": 0.7236, "loss_ce": 0.012851308099925518, "loss_lvr": 0.3553503155708313, "loss_mode_switch": 0.0, "loss_total": 0.0483863428235054, "step": 1809 }, { "epoch": 0.724, "grad_norm": 1.7589207887649536, "learning_rate": 1.8681028885391905e-06, "loss": 0.3106, "step": 1810 }, { "batch_size": 4, "epoch": 0.724, "step": 1810, "tokens_per_device": 3800 }, { "epoch": 0.724, "loss_ce": 0.20651964843273163, "loss_lvr": 0.9522821307182312, "loss_mode_switch": 0.0, "loss_total": 0.3017478585243225, "step": 1810 }, { "batch_size": 1, "epoch": 0.724, "step": 1810, "tokens_per_device": 4674 }, { "epoch": 0.724, "loss_ce": 0.0038232088554650545, "loss_lvr": 0.4125484228134155, "loss_mode_switch": 0.0, "loss_total": 0.04507805407047272, "step": 1810 }, { "batch_size": 1, "epoch": 0.724, "step": 1810, "tokens_per_device": 5237 }, { "epoch": 0.724, "loss_ce": 0.9198205471038818, "loss_lvr": 0.5647730827331543, "loss_mode_switch": 0.0, "loss_total": 0.9762978553771973, "step": 1810 }, { "batch_size": 4, "epoch": 0.724, "step": 1810, "tokens_per_device": 3788 }, { "epoch": 0.724, "loss_ce": 0.1882900446653366, "loss_lvr": 0.9181340336799622, "loss_mode_switch": 0.0, "loss_total": 0.2801034450531006, "step": 1810 }, { "batch_size": 4, "epoch": 0.724, "step": 1810, "tokens_per_device": 1628 }, { "epoch": 0.724, "loss_ce": 0.5288482904434204, "loss_lvr": 0.9810501933097839, "loss_mode_switch": 0.0, "loss_total": 0.6269533038139343, "step": 1810 }, { "batch_size": 1, "epoch": 0.724, "step": 1810, "tokens_per_device": 4851 }, { "epoch": 0.724, "loss_ce": 0.24699048697948456, "loss_lvr": 0.853179931640625, "loss_mode_switch": 0.0, "loss_total": 0.33230847120285034, "step": 1810 }, { "batch_size": 4, "epoch": 0.724, "step": 1810, "tokens_per_device": 10656 }, { "epoch": 0.724, "loss_ce": 0.23725727200508118, "loss_lvr": 0.45632222294807434, "loss_mode_switch": 0.0, "loss_total": 0.2828894853591919, "step": 1810 }, { "batch_size": 4, "epoch": 0.724, "step": 1810, "tokens_per_device": 4324 }, { "epoch": 0.724, "loss_ce": 0.15357770025730133, "loss_lvr": 0.8580069541931152, "loss_mode_switch": 0.0, "loss_total": 0.23937839269638062, "step": 1810 }, { "epoch": 0.7244, "grad_norm": 1.2846757173538208, "learning_rate": 1.863056179888385e-06, "loss": 0.2759, "step": 1811 }, { "batch_size": 4, "epoch": 0.7244, "step": 1811, "tokens_per_device": 4748 }, { "epoch": 0.7244, "loss_ce": 0.2161683589220047, "loss_lvr": 0.7131657600402832, "loss_mode_switch": 0.0, "loss_total": 0.28748494386672974, "step": 1811 }, { "batch_size": 4, "epoch": 0.7244, "step": 1811, "tokens_per_device": 2664 }, { "epoch": 0.7244, "loss_ce": 0.5757491588592529, "loss_lvr": 0.7005059123039246, "loss_mode_switch": 0.0, "loss_total": 0.6457997560501099, "step": 1811 }, { "batch_size": 4, "epoch": 0.7244, "step": 1811, "tokens_per_device": 12004 }, { "epoch": 0.7244, "loss_ce": 0.15963467955589294, "loss_lvr": 0.4592514932155609, "loss_mode_switch": 0.0, "loss_total": 0.2055598348379135, "step": 1811 }, { "batch_size": 1, "epoch": 0.7244, "step": 1811, "tokens_per_device": 5109 }, { "epoch": 0.7244, "loss_ce": 0.02036684937775135, "loss_lvr": 0.37668436765670776, "loss_mode_switch": 0.0, "loss_total": 0.05803528428077698, "step": 1811 }, { "batch_size": 4, "epoch": 0.7244, "step": 1811, "tokens_per_device": 4388 }, { "epoch": 0.7244, "loss_ce": 0.0653778612613678, "loss_lvr": 0.7070683836936951, "loss_mode_switch": 0.0, "loss_total": 0.13608470559120178, "step": 1811 }, { "batch_size": 4, "epoch": 0.7244, "step": 1811, "tokens_per_device": 4356 }, { "epoch": 0.7244, "loss_ce": 0.23629257082939148, "loss_lvr": 0.8468459248542786, "loss_mode_switch": 0.0, "loss_total": 0.3209771513938904, "step": 1811 }, { "batch_size": 4, "epoch": 0.7244, "step": 1811, "tokens_per_device": 3976 }, { "epoch": 0.7244, "loss_ce": 0.28841596841812134, "loss_lvr": 0.956588625907898, "loss_mode_switch": 0.0, "loss_total": 0.3840748369693756, "step": 1811 }, { "batch_size": 4, "epoch": 0.7244, "step": 1811, "tokens_per_device": 2752 }, { "epoch": 0.7244, "loss_ce": 0.5288329124450684, "loss_lvr": 0.9299162030220032, "loss_mode_switch": 0.0, "loss_total": 0.6218245029449463, "step": 1811 }, { "epoch": 0.7248, "grad_norm": 1.2309963703155518, "learning_rate": 1.8580147360504513e-06, "loss": 0.2648, "step": 1812 }, { "batch_size": 4, "epoch": 0.7248, "step": 1812, "tokens_per_device": 2636 }, { "epoch": 0.7248, "loss_ce": 0.5338687896728516, "loss_lvr": 0.8441909551620483, "loss_mode_switch": 0.0, "loss_total": 0.6182878613471985, "step": 1812 }, { "batch_size": 1, "epoch": 0.7248, "step": 1812, "tokens_per_device": 4745 }, { "epoch": 0.7248, "loss_ce": 0.0048090494237840176, "loss_lvr": 0.5342937111854553, "loss_mode_switch": 0.0, "loss_total": 0.05823842063546181, "step": 1812 }, { "batch_size": 4, "epoch": 0.7248, "step": 1812, "tokens_per_device": 12860 }, { "epoch": 0.7248, "loss_ce": 0.03814277425408363, "loss_lvr": 0.5923868417739868, "loss_mode_switch": 0.0, "loss_total": 0.09738145768642426, "step": 1812 }, { "batch_size": 1, "epoch": 0.7248, "step": 1812, "tokens_per_device": 5018 }, { "epoch": 0.7248, "loss_ce": 0.0006713533075526357, "loss_lvr": 0.45382875204086304, "loss_mode_switch": 0.0, "loss_total": 0.04605422914028168, "step": 1812 }, { "batch_size": 1, "epoch": 0.7248, "step": 1812, "tokens_per_device": 4965 }, { "epoch": 0.7248, "loss_ce": 0.2402525544166565, "loss_lvr": 0.23761892318725586, "loss_mode_switch": 0.0, "loss_total": 0.26401445269584656, "step": 1812 }, { "batch_size": 1, "epoch": 0.7248, "step": 1812, "tokens_per_device": 5689 }, { "epoch": 0.7248, "loss_ce": 0.05447991564869881, "loss_lvr": 0.2844955027103424, "loss_mode_switch": 0.0, "loss_total": 0.08292946219444275, "step": 1812 }, { "batch_size": 4, "epoch": 0.7248, "step": 1812, "tokens_per_device": 4244 }, { "epoch": 0.7248, "loss_ce": 0.0028431727550923824, "loss_lvr": 0.3554708659648895, "loss_mode_switch": 0.0, "loss_total": 0.03839026018977165, "step": 1812 }, { "batch_size": 1, "epoch": 0.7248, "step": 1812, "tokens_per_device": 4851 }, { "epoch": 0.7248, "loss_ce": 0.007160334847867489, "loss_lvr": 0.5979946851730347, "loss_mode_switch": 0.0, "loss_total": 0.06695980578660965, "step": 1812 }, { "epoch": 0.7252, "grad_norm": 1.4787607192993164, "learning_rate": 1.8529785654865744e-06, "loss": 0.2871, "step": 1813 }, { "batch_size": 1, "epoch": 0.7252, "step": 1813, "tokens_per_device": 5654 }, { "epoch": 0.7252, "loss_ce": 0.07416924089193344, "loss_lvr": 0.2957012355327606, "loss_mode_switch": 0.0, "loss_total": 0.10373936593532562, "step": 1813 }, { "batch_size": 4, "epoch": 0.7252, "step": 1813, "tokens_per_device": 1876 }, { "epoch": 0.7252, "loss_ce": 0.24226771295070648, "loss_lvr": 0.9805063605308533, "loss_mode_switch": 0.0, "loss_total": 0.34031835198402405, "step": 1813 }, { "batch_size": 4, "epoch": 0.7252, "step": 1813, "tokens_per_device": 5692 }, { "epoch": 0.7252, "loss_ce": 0.04852178320288658, "loss_lvr": 0.746674120426178, "loss_mode_switch": 0.0, "loss_total": 0.12318919599056244, "step": 1813 }, { "batch_size": 1, "epoch": 0.7252, "step": 1813, "tokens_per_device": 4852 }, { "epoch": 0.7252, "loss_ce": 0.015380428172647953, "loss_lvr": 0.29719892144203186, "loss_mode_switch": 0.0, "loss_total": 0.045100320130586624, "step": 1813 }, { "batch_size": 4, "epoch": 0.7252, "step": 1813, "tokens_per_device": 2684 }, { "epoch": 0.7252, "loss_ce": 0.09181620925664902, "loss_lvr": 0.7219402194023132, "loss_mode_switch": 0.0, "loss_total": 0.16401022672653198, "step": 1813 }, { "batch_size": 4, "epoch": 0.7252, "step": 1813, "tokens_per_device": 4240 }, { "epoch": 0.7252, "loss_ce": 0.3310309946537018, "loss_lvr": 0.8067207336425781, "loss_mode_switch": 0.0, "loss_total": 0.41170307993888855, "step": 1813 }, { "batch_size": 1, "epoch": 0.7252, "step": 1813, "tokens_per_device": 4896 }, { "epoch": 0.7252, "loss_ce": 0.014350137673318386, "loss_lvr": 0.45223960280418396, "loss_mode_switch": 0.0, "loss_total": 0.059574101120233536, "step": 1813 }, { "batch_size": 4, "epoch": 0.7252, "step": 1813, "tokens_per_device": 4240 }, { "epoch": 0.7252, "loss_ce": 0.09271307289600372, "loss_lvr": 0.9530247449874878, "loss_mode_switch": 0.0, "loss_total": 0.18801555037498474, "step": 1813 }, { "epoch": 0.7256, "grad_norm": 1.3664236068725586, "learning_rate": 1.84794767664909e-06, "loss": 0.3273, "step": 1814 }, { "batch_size": 4, "epoch": 0.7256, "step": 1814, "tokens_per_device": 10984 }, { "epoch": 0.7256, "loss_ce": 0.8425661325454712, "loss_lvr": 1.0580084323883057, "loss_mode_switch": 0.0, "loss_total": 0.9483669996261597, "step": 1814 }, { "batch_size": 4, "epoch": 0.7256, "step": 1814, "tokens_per_device": 4392 }, { "epoch": 0.7256, "loss_ce": 0.2036857306957245, "loss_lvr": 0.689638078212738, "loss_mode_switch": 0.0, "loss_total": 0.27264952659606934, "step": 1814 }, { "batch_size": 1, "epoch": 0.7256, "step": 1814, "tokens_per_device": 5158 }, { "epoch": 0.7256, "loss_ce": 0.06536342203617096, "loss_lvr": 0.2911878526210785, "loss_mode_switch": 0.0, "loss_total": 0.09448220580816269, "step": 1814 }, { "batch_size": 4, "epoch": 0.7256, "step": 1814, "tokens_per_device": 2768 }, { "epoch": 0.7256, "loss_ce": 0.668402910232544, "loss_lvr": 0.747693657875061, "loss_mode_switch": 0.0, "loss_total": 0.743172287940979, "step": 1814 }, { "batch_size": 4, "epoch": 0.7256, "step": 1814, "tokens_per_device": 2712 }, { "epoch": 0.7256, "loss_ce": 0.08722180128097534, "loss_lvr": 0.8217840790748596, "loss_mode_switch": 0.0, "loss_total": 0.16940021514892578, "step": 1814 }, { "batch_size": 4, "epoch": 0.7256, "step": 1814, "tokens_per_device": 11416 }, { "epoch": 0.7256, "loss_ce": 0.04544253647327423, "loss_lvr": 0.5686842799186707, "loss_mode_switch": 0.0, "loss_total": 0.10231097042560577, "step": 1814 }, { "batch_size": 4, "epoch": 0.7256, "step": 1814, "tokens_per_device": 1448 }, { "epoch": 0.7256, "loss_ce": 0.5107317566871643, "loss_lvr": 0.816554605960846, "loss_mode_switch": 0.0, "loss_total": 0.5923871994018555, "step": 1814 }, { "batch_size": 4, "epoch": 0.7256, "step": 1814, "tokens_per_device": 7268 }, { "epoch": 0.7256, "loss_ce": 0.065752774477005, "loss_lvr": 0.6820330023765564, "loss_mode_switch": 0.0, "loss_total": 0.13395607471466064, "step": 1814 }, { "epoch": 0.726, "grad_norm": 1.3692076206207275, "learning_rate": 1.8429220779814654e-06, "loss": 0.326, "step": 1815 }, { "batch_size": 4, "epoch": 0.726, "step": 1815, "tokens_per_device": 5056 }, { "epoch": 0.726, "loss_ce": 0.011902688071131706, "loss_lvr": 0.7379127144813538, "loss_mode_switch": 0.0, "loss_total": 0.08569396287202835, "step": 1815 }, { "batch_size": 4, "epoch": 0.726, "step": 1815, "tokens_per_device": 4248 }, { "epoch": 0.726, "loss_ce": 0.10252886265516281, "loss_lvr": 1.1142396926879883, "loss_mode_switch": 0.0, "loss_total": 0.21395283937454224, "step": 1815 }, { "batch_size": 4, "epoch": 0.726, "step": 1815, "tokens_per_device": 4340 }, { "epoch": 0.726, "loss_ce": 0.6220927834510803, "loss_lvr": 0.8058332800865173, "loss_mode_switch": 0.0, "loss_total": 0.7026761174201965, "step": 1815 }, { "batch_size": 4, "epoch": 0.726, "step": 1815, "tokens_per_device": 1400 }, { "epoch": 0.726, "loss_ce": 0.4967091977596283, "loss_lvr": 1.1193692684173584, "loss_mode_switch": 0.0, "loss_total": 0.6086461544036865, "step": 1815 }, { "batch_size": 4, "epoch": 0.726, "step": 1815, "tokens_per_device": 4540 }, { "epoch": 0.726, "loss_ce": 0.3717697858810425, "loss_lvr": 0.8792411088943481, "loss_mode_switch": 0.0, "loss_total": 0.45969390869140625, "step": 1815 }, { "batch_size": 4, "epoch": 0.726, "step": 1815, "tokens_per_device": 6028 }, { "epoch": 0.726, "loss_ce": 0.6013154983520508, "loss_lvr": 0.7015622854232788, "loss_mode_switch": 0.0, "loss_total": 0.6714717149734497, "step": 1815 }, { "batch_size": 4, "epoch": 0.726, "step": 1815, "tokens_per_device": 5576 }, { "epoch": 0.726, "loss_ce": 0.0530482642352581, "loss_lvr": 0.717933177947998, "loss_mode_switch": 0.0, "loss_total": 0.1248415857553482, "step": 1815 }, { "batch_size": 4, "epoch": 0.726, "step": 1815, "tokens_per_device": 5296 }, { "epoch": 0.726, "loss_ce": 0.008891632780432701, "loss_lvr": 0.7590838670730591, "loss_mode_switch": 0.0, "loss_total": 0.08480001986026764, "step": 1815 }, { "epoch": 0.7264, "grad_norm": 1.5765546560287476, "learning_rate": 1.837901777918291e-06, "loss": 0.3952, "step": 1816 }, { "batch_size": 4, "epoch": 0.7264, "step": 1816, "tokens_per_device": 4316 }, { "epoch": 0.7264, "loss_ce": 0.0011264914646744728, "loss_lvr": 0.538856029510498, "loss_mode_switch": 0.0, "loss_total": 0.05501209571957588, "step": 1816 }, { "batch_size": 4, "epoch": 0.7264, "step": 1816, "tokens_per_device": 7068 }, { "epoch": 0.7264, "loss_ce": 0.18337561190128326, "loss_lvr": 0.7322279810905457, "loss_mode_switch": 0.0, "loss_total": 0.25659841299057007, "step": 1816 }, { "batch_size": 4, "epoch": 0.7264, "step": 1816, "tokens_per_device": 1376 }, { "epoch": 0.7264, "loss_ce": 0.7335084676742554, "loss_lvr": 0.9734782576560974, "loss_mode_switch": 0.0, "loss_total": 0.8308563232421875, "step": 1816 }, { "batch_size": 4, "epoch": 0.7264, "step": 1816, "tokens_per_device": 12604 }, { "epoch": 0.7264, "loss_ce": 0.0021801558323204517, "loss_lvr": 0.6356419324874878, "loss_mode_switch": 0.0, "loss_total": 0.06574435532093048, "step": 1816 }, { "batch_size": 4, "epoch": 0.7264, "step": 1816, "tokens_per_device": 1472 }, { "epoch": 0.7264, "loss_ce": 0.6678410172462463, "loss_lvr": 0.9849986433982849, "loss_mode_switch": 0.0, "loss_total": 0.7663408517837524, "step": 1816 }, { "batch_size": 1, "epoch": 0.7264, "step": 1816, "tokens_per_device": 4835 }, { "epoch": 0.7264, "loss_ce": 0.6532803773880005, "loss_lvr": 0.4545882046222687, "loss_mode_switch": 0.0, "loss_total": 0.6987391710281372, "step": 1816 }, { "batch_size": 4, "epoch": 0.7264, "step": 1816, "tokens_per_device": 1536 }, { "epoch": 0.7264, "loss_ce": 0.1260324865579605, "loss_lvr": 1.065366268157959, "loss_mode_switch": 0.0, "loss_total": 0.2325691133737564, "step": 1816 }, { "batch_size": 1, "epoch": 0.7264, "step": 1816, "tokens_per_device": 5129 }, { "epoch": 0.7264, "loss_ce": 0.2060088813304901, "loss_lvr": 0.19657549262046814, "loss_mode_switch": 0.0, "loss_total": 0.22566643357276917, "step": 1816 }, { "epoch": 0.7268, "grad_norm": 1.3380473852157593, "learning_rate": 1.8328867848852633e-06, "loss": 0.2712, "step": 1817 }, { "batch_size": 1, "epoch": 0.7268, "step": 1817, "tokens_per_device": 5090 }, { "epoch": 0.7268, "loss_ce": 0.5443357229232788, "loss_lvr": 0.21620851755142212, "loss_mode_switch": 0.0, "loss_total": 0.5659565925598145, "step": 1817 }, { "batch_size": 4, "epoch": 0.7268, "step": 1817, "tokens_per_device": 4856 }, { "epoch": 0.7268, "loss_ce": 0.03623088076710701, "loss_lvr": 0.591191828250885, "loss_mode_switch": 0.0, "loss_total": 0.09535006433725357, "step": 1817 }, { "batch_size": 4, "epoch": 0.7268, "step": 1817, "tokens_per_device": 2564 }, { "epoch": 0.7268, "loss_ce": 0.9436783790588379, "loss_lvr": 0.9082327485084534, "loss_mode_switch": 0.0, "loss_total": 1.0345016717910767, "step": 1817 }, { "batch_size": 4, "epoch": 0.7268, "step": 1817, "tokens_per_device": 8676 }, { "epoch": 0.7268, "loss_ce": 0.48169589042663574, "loss_lvr": 0.7990948557853699, "loss_mode_switch": 0.0, "loss_total": 0.5616053938865662, "step": 1817 }, { "batch_size": 4, "epoch": 0.7268, "step": 1817, "tokens_per_device": 4732 }, { "epoch": 0.7268, "loss_ce": 0.3999946415424347, "loss_lvr": 0.7461544275283813, "loss_mode_switch": 0.0, "loss_total": 0.4746100902557373, "step": 1817 }, { "batch_size": 4, "epoch": 0.7268, "step": 1817, "tokens_per_device": 4060 }, { "epoch": 0.7268, "loss_ce": 0.017689021304249763, "loss_lvr": 0.7335793375968933, "loss_mode_switch": 0.0, "loss_total": 0.09104695171117783, "step": 1817 }, { "batch_size": 1, "epoch": 0.7268, "step": 1817, "tokens_per_device": 5108 }, { "epoch": 0.7268, "loss_ce": 0.008283893577754498, "loss_lvr": 0.2628297507762909, "loss_mode_switch": 0.0, "loss_total": 0.03456686809659004, "step": 1817 }, { "batch_size": 4, "epoch": 0.7268, "step": 1817, "tokens_per_device": 5772 }, { "epoch": 0.7268, "loss_ce": 0.0513533391058445, "loss_lvr": 0.7103337049484253, "loss_mode_switch": 0.0, "loss_total": 0.12238670885562897, "step": 1817 }, { "epoch": 0.7272, "grad_norm": 1.3012226819992065, "learning_rate": 1.8278771072991748e-06, "loss": 0.2966, "step": 1818 }, { "batch_size": 4, "epoch": 0.7272, "step": 1818, "tokens_per_device": 5848 }, { "epoch": 0.7272, "loss_ce": 0.26518306136131287, "loss_lvr": 0.8267040848731995, "loss_mode_switch": 0.0, "loss_total": 0.34785348176956177, "step": 1818 }, { "batch_size": 1, "epoch": 0.7272, "step": 1818, "tokens_per_device": 4904 }, { "epoch": 0.7272, "loss_ce": 0.015712980180978775, "loss_lvr": 0.27845075726509094, "loss_mode_switch": 0.0, "loss_total": 0.04355805367231369, "step": 1818 }, { "batch_size": 1, "epoch": 0.7272, "step": 1818, "tokens_per_device": 4945 }, { "epoch": 0.7272, "loss_ce": 1.4116932153701782, "loss_lvr": 0.7508109211921692, "loss_mode_switch": 0.0, "loss_total": 1.4867743253707886, "step": 1818 }, { "batch_size": 1, "epoch": 0.7272, "step": 1818, "tokens_per_device": 5002 }, { "epoch": 0.7272, "loss_ce": 0.4095015227794647, "loss_lvr": 0.3859044909477234, "loss_mode_switch": 0.0, "loss_total": 0.448091983795166, "step": 1818 }, { "batch_size": 4, "epoch": 0.7272, "step": 1818, "tokens_per_device": 1348 }, { "epoch": 0.7272, "loss_ce": 0.06671254336833954, "loss_lvr": 0.8152068853378296, "loss_mode_switch": 0.0, "loss_total": 0.14823323488235474, "step": 1818 }, { "batch_size": 4, "epoch": 0.7272, "step": 1818, "tokens_per_device": 3004 }, { "epoch": 0.7272, "loss_ce": 0.1521356999874115, "loss_lvr": 0.9903990030288696, "loss_mode_switch": 0.0, "loss_total": 0.2511756122112274, "step": 1818 }, { "batch_size": 1, "epoch": 0.7272, "step": 1818, "tokens_per_device": 6274 }, { "epoch": 0.7272, "loss_ce": 0.011005413718521595, "loss_lvr": 0.27869126200675964, "loss_mode_switch": 0.0, "loss_total": 0.038874540477991104, "step": 1818 }, { "batch_size": 1, "epoch": 0.7272, "step": 1818, "tokens_per_device": 5109 }, { "epoch": 0.7272, "loss_ce": 0.005634487606585026, "loss_lvr": 0.3831734359264374, "loss_mode_switch": 0.0, "loss_total": 0.04395183175802231, "step": 1818 }, { "epoch": 0.7276, "grad_norm": 1.3456288576126099, "learning_rate": 1.8228727535678959e-06, "loss": 0.2807, "step": 1819 }, { "batch_size": 4, "epoch": 0.7276, "step": 1819, "tokens_per_device": 4092 }, { "epoch": 0.7276, "loss_ce": 0.19923782348632812, "loss_lvr": 0.7547523975372314, "loss_mode_switch": 0.0, "loss_total": 0.27471306920051575, "step": 1819 }, { "batch_size": 1, "epoch": 0.7276, "step": 1819, "tokens_per_device": 4766 }, { "epoch": 0.7276, "loss_ce": 0.0005655995337292552, "loss_lvr": 0.34146860241889954, "loss_mode_switch": 0.0, "loss_total": 0.03471245989203453, "step": 1819 }, { "batch_size": 4, "epoch": 0.7276, "step": 1819, "tokens_per_device": 2760 }, { "epoch": 0.7276, "loss_ce": 0.15266230702400208, "loss_lvr": 0.8456271886825562, "loss_mode_switch": 0.0, "loss_total": 0.2372250258922577, "step": 1819 }, { "batch_size": 4, "epoch": 0.7276, "step": 1819, "tokens_per_device": 5720 }, { "epoch": 0.7276, "loss_ce": 0.26701563596725464, "loss_lvr": 0.8143388032913208, "loss_mode_switch": 0.0, "loss_total": 0.3484495282173157, "step": 1819 }, { "batch_size": 1, "epoch": 0.7276, "step": 1819, "tokens_per_device": 5238 }, { "epoch": 0.7276, "loss_ce": 0.006143998820334673, "loss_lvr": 0.3636684715747833, "loss_mode_switch": 0.0, "loss_total": 0.04251084476709366, "step": 1819 }, { "batch_size": 4, "epoch": 0.7276, "step": 1819, "tokens_per_device": 7004 }, { "epoch": 0.7276, "loss_ce": 0.16362982988357544, "loss_lvr": 0.8587655425071716, "loss_mode_switch": 0.0, "loss_total": 0.2495063841342926, "step": 1819 }, { "batch_size": 4, "epoch": 0.7276, "step": 1819, "tokens_per_device": 1596 }, { "epoch": 0.7276, "loss_ce": 0.4982798397541046, "loss_lvr": 0.9539303183555603, "loss_mode_switch": 0.0, "loss_total": 0.5936728715896606, "step": 1819 }, { "batch_size": 4, "epoch": 0.7276, "step": 1819, "tokens_per_device": 4192 }, { "epoch": 0.7276, "loss_ce": 0.054996099323034286, "loss_lvr": 0.8588327765464783, "loss_mode_switch": 0.0, "loss_total": 0.14087937772274017, "step": 1819 }, { "epoch": 0.728, "grad_norm": 1.2112854719161987, "learning_rate": 1.81787373209036e-06, "loss": 0.2416, "step": 1820 }, { "batch_size": 4, "epoch": 0.728, "step": 1820, "tokens_per_device": 4728 }, { "epoch": 0.728, "loss_ce": 0.23384851217269897, "loss_lvr": 0.8593152165412903, "loss_mode_switch": 0.0, "loss_total": 0.31978005170822144, "step": 1820 }, { "batch_size": 1, "epoch": 0.728, "step": 1820, "tokens_per_device": 4875 }, { "epoch": 0.728, "loss_ce": 0.0002555738901719451, "loss_lvr": 0.7860742211341858, "loss_mode_switch": 0.0, "loss_total": 0.0788630023598671, "step": 1820 }, { "batch_size": 4, "epoch": 0.728, "step": 1820, "tokens_per_device": 2688 }, { "epoch": 0.728, "loss_ce": 0.3009914457798004, "loss_lvr": 0.6990169882774353, "loss_mode_switch": 0.0, "loss_total": 0.3708931505680084, "step": 1820 }, { "batch_size": 4, "epoch": 0.728, "step": 1820, "tokens_per_device": 5784 }, { "epoch": 0.728, "loss_ce": 0.21018528938293457, "loss_lvr": 0.7494435906410217, "loss_mode_switch": 0.0, "loss_total": 0.2851296663284302, "step": 1820 }, { "batch_size": 4, "epoch": 0.728, "step": 1820, "tokens_per_device": 2804 }, { "epoch": 0.728, "loss_ce": 0.5852259397506714, "loss_lvr": 0.9245772361755371, "loss_mode_switch": 0.0, "loss_total": 0.6776836514472961, "step": 1820 }, { "batch_size": 4, "epoch": 0.728, "step": 1820, "tokens_per_device": 4352 }, { "epoch": 0.728, "loss_ce": 0.32264819741249084, "loss_lvr": 0.8710406422615051, "loss_mode_switch": 0.0, "loss_total": 0.4097522497177124, "step": 1820 }, { "batch_size": 1, "epoch": 0.728, "step": 1820, "tokens_per_device": 5208 }, { "epoch": 0.728, "loss_ce": 0.062149517238140106, "loss_lvr": 0.5053237676620483, "loss_mode_switch": 0.0, "loss_total": 0.11268189549446106, "step": 1820 }, { "batch_size": 4, "epoch": 0.728, "step": 1820, "tokens_per_device": 1320 }, { "epoch": 0.728, "loss_ce": 0.24123917520046234, "loss_lvr": 0.8787164688110352, "loss_mode_switch": 0.0, "loss_total": 0.3291108310222626, "step": 1820 }, { "epoch": 0.7284, "grad_norm": 1.3346728086471558, "learning_rate": 1.8128800512565514e-06, "loss": 0.2884, "step": 1821 }, { "batch_size": 4, "epoch": 0.7284, "step": 1821, "tokens_per_device": 2544 }, { "epoch": 0.7284, "loss_ce": 0.1362430602312088, "loss_lvr": 1.2636022567749023, "loss_mode_switch": 0.0, "loss_total": 0.2626032829284668, "step": 1821 }, { "batch_size": 4, "epoch": 0.7284, "step": 1821, "tokens_per_device": 1684 }, { "epoch": 0.7284, "loss_ce": 0.27431201934814453, "loss_lvr": 0.8126891851425171, "loss_mode_switch": 0.0, "loss_total": 0.3555809259414673, "step": 1821 }, { "batch_size": 4, "epoch": 0.7284, "step": 1821, "tokens_per_device": 1440 }, { "epoch": 0.7284, "loss_ce": 0.4113161265850067, "loss_lvr": 0.8813645243644714, "loss_mode_switch": 0.0, "loss_total": 0.4994525909423828, "step": 1821 }, { "batch_size": 4, "epoch": 0.7284, "step": 1821, "tokens_per_device": 12636 }, { "epoch": 0.7284, "loss_ce": 0.004112980794161558, "loss_lvr": 0.4571123719215393, "loss_mode_switch": 0.0, "loss_total": 0.049824219197034836, "step": 1821 }, { "batch_size": 4, "epoch": 0.7284, "step": 1821, "tokens_per_device": 5088 }, { "epoch": 0.7284, "loss_ce": 0.11356563121080399, "loss_lvr": 0.5679972171783447, "loss_mode_switch": 0.0, "loss_total": 0.1703653484582901, "step": 1821 }, { "batch_size": 4, "epoch": 0.7284, "step": 1821, "tokens_per_device": 6312 }, { "epoch": 0.7284, "loss_ce": 0.11358845233917236, "loss_lvr": 0.8294629454612732, "loss_mode_switch": 0.0, "loss_total": 0.19653475284576416, "step": 1821 }, { "batch_size": 4, "epoch": 0.7284, "step": 1821, "tokens_per_device": 5884 }, { "epoch": 0.7284, "loss_ce": 0.4723196029663086, "loss_lvr": 0.685979425907135, "loss_mode_switch": 0.0, "loss_total": 0.5409175157546997, "step": 1821 }, { "batch_size": 4, "epoch": 0.7284, "step": 1821, "tokens_per_device": 10740 }, { "epoch": 0.7284, "loss_ce": 0.10347012430429459, "loss_lvr": 0.6302433609962463, "loss_mode_switch": 0.0, "loss_total": 0.1664944589138031, "step": 1821 }, { "epoch": 0.7288, "grad_norm": 1.4393620491027832, "learning_rate": 1.8078917194474954e-06, "loss": 0.2856, "step": 1822 }, { "batch_size": 4, "epoch": 0.7288, "step": 1822, "tokens_per_device": 1320 }, { "epoch": 0.7288, "loss_ce": 0.25875863432884216, "loss_lvr": 1.0941073894500732, "loss_mode_switch": 0.0, "loss_total": 0.368169367313385, "step": 1822 }, { "batch_size": 1, "epoch": 0.7288, "step": 1822, "tokens_per_device": 5462 }, { "epoch": 0.7288, "loss_ce": 0.19787347316741943, "loss_lvr": 0.6302233338356018, "loss_mode_switch": 0.0, "loss_total": 0.26089581847190857, "step": 1822 }, { "batch_size": 1, "epoch": 0.7288, "step": 1822, "tokens_per_device": 5157 }, { "epoch": 0.7288, "loss_ce": 0.0006962598999962211, "loss_lvr": 0.6346493363380432, "loss_mode_switch": 0.0, "loss_total": 0.06416118890047073, "step": 1822 }, { "batch_size": 4, "epoch": 0.7288, "step": 1822, "tokens_per_device": 1468 }, { "epoch": 0.7288, "loss_ce": 0.7145505547523499, "loss_lvr": 1.1075408458709717, "loss_mode_switch": 0.0, "loss_total": 0.8253046274185181, "step": 1822 }, { "batch_size": 4, "epoch": 0.7288, "step": 1822, "tokens_per_device": 4196 }, { "epoch": 0.7288, "loss_ce": 0.1305861920118332, "loss_lvr": 0.8484571576118469, "loss_mode_switch": 0.0, "loss_total": 0.21543189883232117, "step": 1822 }, { "batch_size": 1, "epoch": 0.7288, "step": 1822, "tokens_per_device": 5204 }, { "epoch": 0.7288, "loss_ce": 0.5718861222267151, "loss_lvr": 0.29380002617836, "loss_mode_switch": 0.0, "loss_total": 0.6012661457061768, "step": 1822 }, { "batch_size": 4, "epoch": 0.7288, "step": 1822, "tokens_per_device": 4200 }, { "epoch": 0.7288, "loss_ce": 0.1765725314617157, "loss_lvr": 1.1747195720672607, "loss_mode_switch": 0.0, "loss_total": 0.29404449462890625, "step": 1822 }, { "batch_size": 4, "epoch": 0.7288, "step": 1822, "tokens_per_device": 3848 }, { "epoch": 0.7288, "loss_ce": 0.3079976439476013, "loss_lvr": 0.888606607913971, "loss_mode_switch": 0.0, "loss_total": 0.3968583047389984, "step": 1822 }, { "epoch": 0.7292, "grad_norm": 1.3658936023712158, "learning_rate": 1.8029087450352323e-06, "loss": 0.3018, "step": 1823 }, { "batch_size": 1, "epoch": 0.7292, "step": 1823, "tokens_per_device": 4793 }, { "epoch": 0.7292, "loss_ce": 0.00899780448526144, "loss_lvr": 0.2703237533569336, "loss_mode_switch": 0.0, "loss_total": 0.036030180752277374, "step": 1823 }, { "batch_size": 4, "epoch": 0.7292, "step": 1823, "tokens_per_device": 3844 }, { "epoch": 0.7292, "loss_ce": 0.4588218033313751, "loss_lvr": 2.072831153869629, "loss_mode_switch": 0.0, "loss_total": 0.6661049127578735, "step": 1823 }, { "batch_size": 4, "epoch": 0.7292, "step": 1823, "tokens_per_device": 5320 }, { "epoch": 0.7292, "loss_ce": 0.38859182596206665, "loss_lvr": 0.7689500451087952, "loss_mode_switch": 0.0, "loss_total": 0.4654868245124817, "step": 1823 }, { "batch_size": 4, "epoch": 0.7292, "step": 1823, "tokens_per_device": 3796 }, { "epoch": 0.7292, "loss_ce": 0.11680442094802856, "loss_lvr": 0.9416185617446899, "loss_mode_switch": 0.0, "loss_total": 0.2109662890434265, "step": 1823 }, { "batch_size": 4, "epoch": 0.7292, "step": 1823, "tokens_per_device": 4408 }, { "epoch": 0.7292, "loss_ce": 0.2899373471736908, "loss_lvr": 0.7018849849700928, "loss_mode_switch": 0.0, "loss_total": 0.3601258397102356, "step": 1823 }, { "batch_size": 1, "epoch": 0.7292, "step": 1823, "tokens_per_device": 5125 }, { "epoch": 0.7292, "loss_ce": 0.02901783585548401, "loss_lvr": 0.32321712374687195, "loss_mode_switch": 0.0, "loss_total": 0.06133954972028732, "step": 1823 }, { "batch_size": 1, "epoch": 0.7292, "step": 1823, "tokens_per_device": 4887 }, { "epoch": 0.7292, "loss_ce": 0.004599207546561956, "loss_lvr": 0.17154255509376526, "loss_mode_switch": 0.0, "loss_total": 0.0217534638941288, "step": 1823 }, { "batch_size": 4, "epoch": 0.7292, "step": 1823, "tokens_per_device": 4700 }, { "epoch": 0.7292, "loss_ce": 0.13641932606697083, "loss_lvr": 0.9789829850196838, "loss_mode_switch": 0.0, "loss_total": 0.2343176305294037, "step": 1823 }, { "epoch": 0.7296, "grad_norm": 1.1841422319412231, "learning_rate": 1.797931136382819e-06, "loss": 0.2654, "step": 1824 }, { "batch_size": 4, "epoch": 0.7296, "step": 1824, "tokens_per_device": 4212 }, { "epoch": 0.7296, "loss_ce": 0.08590401709079742, "loss_lvr": 0.9893372654914856, "loss_mode_switch": 0.0, "loss_total": 0.18483774363994598, "step": 1824 }, { "batch_size": 1, "epoch": 0.7296, "step": 1824, "tokens_per_device": 4892 }, { "epoch": 0.7296, "loss_ce": 0.013903009705245495, "loss_lvr": 0.41871148347854614, "loss_mode_switch": 0.0, "loss_total": 0.05577415972948074, "step": 1824 }, { "batch_size": 4, "epoch": 0.7296, "step": 1824, "tokens_per_device": 3764 }, { "epoch": 0.7296, "loss_ce": 0.44458115100860596, "loss_lvr": 0.7142453193664551, "loss_mode_switch": 0.0, "loss_total": 0.5160056948661804, "step": 1824 }, { "batch_size": 1, "epoch": 0.7296, "step": 1824, "tokens_per_device": 4949 }, { "epoch": 0.7296, "loss_ce": 0.5352749228477478, "loss_lvr": 0.6514767408370972, "loss_mode_switch": 0.0, "loss_total": 0.6004226207733154, "step": 1824 }, { "batch_size": 4, "epoch": 0.7296, "step": 1824, "tokens_per_device": 2568 }, { "epoch": 0.7296, "loss_ce": 0.15941011905670166, "loss_lvr": 0.887951135635376, "loss_mode_switch": 0.0, "loss_total": 0.2482052445411682, "step": 1824 }, { "batch_size": 4, "epoch": 0.7296, "step": 1824, "tokens_per_device": 6552 }, { "epoch": 0.7296, "loss_ce": 0.40567904710769653, "loss_lvr": 0.7628055214881897, "loss_mode_switch": 0.0, "loss_total": 0.48195961117744446, "step": 1824 }, { "batch_size": 4, "epoch": 0.7296, "step": 1824, "tokens_per_device": 4312 }, { "epoch": 0.7296, "loss_ce": 0.133839413523674, "loss_lvr": 0.9209542870521545, "loss_mode_switch": 0.0, "loss_total": 0.22593483328819275, "step": 1824 }, { "batch_size": 4, "epoch": 0.7296, "step": 1824, "tokens_per_device": 15004 }, { "epoch": 0.7296, "loss_ce": 0.16938172280788422, "loss_lvr": 0.6941143870353699, "loss_mode_switch": 0.0, "loss_total": 0.23879316449165344, "step": 1824 }, { "epoch": 0.73, "grad_norm": 1.3878601789474487, "learning_rate": 1.7929589018443016e-06, "loss": 0.3162, "step": 1825 }, { "batch_size": 4, "epoch": 0.73, "step": 1825, "tokens_per_device": 4272 }, { "epoch": 0.73, "loss_ce": 0.37770622968673706, "loss_lvr": 0.8662660717964172, "loss_mode_switch": 0.0, "loss_total": 0.46433284878730774, "step": 1825 }, { "batch_size": 4, "epoch": 0.73, "step": 1825, "tokens_per_device": 4212 }, { "epoch": 0.73, "loss_ce": 0.0016101342625916004, "loss_lvr": 0.5792871117591858, "loss_mode_switch": 0.0, "loss_total": 0.05953884497284889, "step": 1825 }, { "batch_size": 4, "epoch": 0.73, "step": 1825, "tokens_per_device": 2640 }, { "epoch": 0.73, "loss_ce": 0.07991896569728851, "loss_lvr": 0.5868598222732544, "loss_mode_switch": 0.0, "loss_total": 0.13860495388507843, "step": 1825 }, { "batch_size": 4, "epoch": 0.73, "step": 1825, "tokens_per_device": 3944 }, { "epoch": 0.73, "loss_ce": 0.12665337324142456, "loss_lvr": 0.7945433259010315, "loss_mode_switch": 0.0, "loss_total": 0.2061077058315277, "step": 1825 }, { "batch_size": 4, "epoch": 0.73, "step": 1825, "tokens_per_device": 1892 }, { "epoch": 0.73, "loss_ce": 0.14514616131782532, "loss_lvr": 0.7770519256591797, "loss_mode_switch": 0.0, "loss_total": 0.22285136580467224, "step": 1825 }, { "batch_size": 4, "epoch": 0.73, "step": 1825, "tokens_per_device": 2708 }, { "epoch": 0.73, "loss_ce": 0.16845037043094635, "loss_lvr": 0.765143096446991, "loss_mode_switch": 0.0, "loss_total": 0.24496468901634216, "step": 1825 }, { "batch_size": 4, "epoch": 0.73, "step": 1825, "tokens_per_device": 2572 }, { "epoch": 0.73, "loss_ce": 0.537528932094574, "loss_lvr": 0.8570495843887329, "loss_mode_switch": 0.0, "loss_total": 0.6232339143753052, "step": 1825 }, { "batch_size": 1, "epoch": 0.73, "step": 1825, "tokens_per_device": 5013 }, { "epoch": 0.73, "loss_ce": 0.011285403743386269, "loss_lvr": 0.25467562675476074, "loss_mode_switch": 0.0, "loss_total": 0.03675296902656555, "step": 1825 }, { "epoch": 0.7304, "grad_norm": 1.3258758783340454, "learning_rate": 1.7879920497647068e-06, "loss": 0.2842, "step": 1826 }, { "batch_size": 1, "epoch": 0.7304, "step": 1826, "tokens_per_device": 4949 }, { "epoch": 0.7304, "loss_ce": 0.11049135774374008, "loss_lvr": 0.5444990396499634, "loss_mode_switch": 0.0, "loss_total": 0.16494126617908478, "step": 1826 }, { "batch_size": 1, "epoch": 0.7304, "step": 1826, "tokens_per_device": 4948 }, { "epoch": 0.7304, "loss_ce": 0.010001703165471554, "loss_lvr": 0.17691807448863983, "loss_mode_switch": 0.0, "loss_total": 0.027693510055541992, "step": 1826 }, { "batch_size": 4, "epoch": 0.7304, "step": 1826, "tokens_per_device": 2584 }, { "epoch": 0.7304, "loss_ce": 0.2525848150253296, "loss_lvr": 0.8854607343673706, "loss_mode_switch": 0.0, "loss_total": 0.3411308825016022, "step": 1826 }, { "batch_size": 4, "epoch": 0.7304, "step": 1826, "tokens_per_device": 5880 }, { "epoch": 0.7304, "loss_ce": 0.3280426561832428, "loss_lvr": 0.6237359046936035, "loss_mode_switch": 0.0, "loss_total": 0.3904162347316742, "step": 1826 }, { "batch_size": 4, "epoch": 0.7304, "step": 1826, "tokens_per_device": 3312 }, { "epoch": 0.7304, "loss_ce": 0.2420724332332611, "loss_lvr": 0.91225266456604, "loss_mode_switch": 0.0, "loss_total": 0.3332976996898651, "step": 1826 }, { "batch_size": 4, "epoch": 0.7304, "step": 1826, "tokens_per_device": 9944 }, { "epoch": 0.7304, "loss_ce": 0.2690297067165375, "loss_lvr": 0.479460746049881, "loss_mode_switch": 0.0, "loss_total": 0.31697577238082886, "step": 1826 }, { "batch_size": 4, "epoch": 0.7304, "step": 1826, "tokens_per_device": 1648 }, { "epoch": 0.7304, "loss_ce": 0.6907579302787781, "loss_lvr": 1.0687354803085327, "loss_mode_switch": 0.0, "loss_total": 0.7976315021514893, "step": 1826 }, { "batch_size": 1, "epoch": 0.7304, "step": 1826, "tokens_per_device": 5101 }, { "epoch": 0.7304, "loss_ce": 0.01080870907753706, "loss_lvr": 0.7180209755897522, "loss_mode_switch": 0.0, "loss_total": 0.08261080831289291, "step": 1826 }, { "epoch": 0.7308, "grad_norm": 1.2534769773483276, "learning_rate": 1.7830305884800302e-06, "loss": 0.2765, "step": 1827 }, { "batch_size": 4, "epoch": 0.7308, "step": 1827, "tokens_per_device": 4296 }, { "epoch": 0.7308, "loss_ce": 0.646809995174408, "loss_lvr": 0.9216272830963135, "loss_mode_switch": 0.0, "loss_total": 0.7389727234840393, "step": 1827 }, { "batch_size": 4, "epoch": 0.7308, "step": 1827, "tokens_per_device": 3364 }, { "epoch": 0.7308, "loss_ce": 0.14809603989124298, "loss_lvr": 0.9448996782302856, "loss_mode_switch": 0.0, "loss_total": 0.24258601665496826, "step": 1827 }, { "batch_size": 4, "epoch": 0.7308, "step": 1827, "tokens_per_device": 1612 }, { "epoch": 0.7308, "loss_ce": 0.6450686454772949, "loss_lvr": 1.0135045051574707, "loss_mode_switch": 0.0, "loss_total": 0.7464190721511841, "step": 1827 }, { "batch_size": 1, "epoch": 0.7308, "step": 1827, "tokens_per_device": 5124 }, { "epoch": 0.7308, "loss_ce": 0.0007537572528235614, "loss_lvr": 0.7033429145812988, "loss_mode_switch": 0.0, "loss_total": 0.07108805328607559, "step": 1827 }, { "batch_size": 4, "epoch": 0.7308, "step": 1827, "tokens_per_device": 4532 }, { "epoch": 0.7308, "loss_ce": 0.013163570314645767, "loss_lvr": 0.4425608217716217, "loss_mode_switch": 0.0, "loss_total": 0.05741965398192406, "step": 1827 }, { "batch_size": 4, "epoch": 0.7308, "step": 1827, "tokens_per_device": 4216 }, { "epoch": 0.7308, "loss_ce": 0.2955220937728882, "loss_lvr": 1.0461457967758179, "loss_mode_switch": 0.0, "loss_total": 0.40013667941093445, "step": 1827 }, { "batch_size": 4, "epoch": 0.7308, "step": 1827, "tokens_per_device": 4392 }, { "epoch": 0.7308, "loss_ce": 0.012488173320889473, "loss_lvr": 0.8936450481414795, "loss_mode_switch": 0.0, "loss_total": 0.10185267776250839, "step": 1827 }, { "batch_size": 4, "epoch": 0.7308, "step": 1827, "tokens_per_device": 4108 }, { "epoch": 0.7308, "loss_ce": 0.8743259906768799, "loss_lvr": 0.692010760307312, "loss_mode_switch": 0.0, "loss_total": 0.9435270428657532, "step": 1827 }, { "epoch": 0.7312, "grad_norm": 1.3920038938522339, "learning_rate": 1.7780745263172216e-06, "loss": 0.3168, "step": 1828 }, { "batch_size": 4, "epoch": 0.7312, "step": 1828, "tokens_per_device": 4296 }, { "epoch": 0.7312, "loss_ce": 0.11586394906044006, "loss_lvr": 0.7969502210617065, "loss_mode_switch": 0.0, "loss_total": 0.19555896520614624, "step": 1828 }, { "batch_size": 4, "epoch": 0.7312, "step": 1828, "tokens_per_device": 4336 }, { "epoch": 0.7312, "loss_ce": 0.24189692735671997, "loss_lvr": 0.5609740614891052, "loss_mode_switch": 0.0, "loss_total": 0.29799434542655945, "step": 1828 }, { "batch_size": 4, "epoch": 0.7312, "step": 1828, "tokens_per_device": 9172 }, { "epoch": 0.7312, "loss_ce": 0.04943959042429924, "loss_lvr": 0.7049412131309509, "loss_mode_switch": 0.0, "loss_total": 0.11993370950222015, "step": 1828 }, { "batch_size": 4, "epoch": 0.7312, "step": 1828, "tokens_per_device": 4440 }, { "epoch": 0.7312, "loss_ce": 0.5270829796791077, "loss_lvr": 0.953668475151062, "loss_mode_switch": 0.0, "loss_total": 0.6224498152732849, "step": 1828 }, { "batch_size": 1, "epoch": 0.7312, "step": 1828, "tokens_per_device": 5165 }, { "epoch": 0.7312, "loss_ce": 0.03283051773905754, "loss_lvr": 0.33858203887939453, "loss_mode_switch": 0.0, "loss_total": 0.06668872386217117, "step": 1828 }, { "batch_size": 4, "epoch": 0.7312, "step": 1828, "tokens_per_device": 1420 }, { "epoch": 0.7312, "loss_ce": 0.1104472279548645, "loss_lvr": 1.067031741142273, "loss_mode_switch": 0.0, "loss_total": 0.21715040504932404, "step": 1828 }, { "batch_size": 4, "epoch": 0.7312, "step": 1828, "tokens_per_device": 5688 }, { "epoch": 0.7312, "loss_ce": 0.17017492651939392, "loss_lvr": 0.6581417322158813, "loss_mode_switch": 0.0, "loss_total": 0.23598909378051758, "step": 1828 }, { "batch_size": 4, "epoch": 0.7312, "step": 1828, "tokens_per_device": 2776 }, { "epoch": 0.7312, "loss_ce": 0.1045171320438385, "loss_lvr": 0.7021540403366089, "loss_mode_switch": 0.0, "loss_total": 0.1747325360774994, "step": 1828 }, { "epoch": 0.7316, "grad_norm": 1.2518165111541748, "learning_rate": 1.773123871594164e-06, "loss": 0.2858, "step": 1829 }, { "batch_size": 1, "epoch": 0.7316, "step": 1829, "tokens_per_device": 5186 }, { "epoch": 0.7316, "loss_ce": 0.017773358151316643, "loss_lvr": 0.23574718832969666, "loss_mode_switch": 0.0, "loss_total": 0.04134807735681534, "step": 1829 }, { "batch_size": 4, "epoch": 0.7316, "step": 1829, "tokens_per_device": 5012 }, { "epoch": 0.7316, "loss_ce": 0.092292420566082, "loss_lvr": 0.6950007081031799, "loss_mode_switch": 0.0, "loss_total": 0.16179248690605164, "step": 1829 }, { "batch_size": 4, "epoch": 0.7316, "step": 1829, "tokens_per_device": 5060 }, { "epoch": 0.7316, "loss_ce": 0.07382351160049438, "loss_lvr": 0.8024484515190125, "loss_mode_switch": 0.0, "loss_total": 0.15406835079193115, "step": 1829 }, { "batch_size": 4, "epoch": 0.7316, "step": 1829, "tokens_per_device": 4760 }, { "epoch": 0.7316, "loss_ce": 0.17603880167007446, "loss_lvr": 0.7838067412376404, "loss_mode_switch": 0.0, "loss_total": 0.2544194757938385, "step": 1829 }, { "batch_size": 1, "epoch": 0.7316, "step": 1829, "tokens_per_device": 5389 }, { "epoch": 0.7316, "loss_ce": 0.000403836922487244, "loss_lvr": 0.24480879306793213, "loss_mode_switch": 0.0, "loss_total": 0.024884715676307678, "step": 1829 }, { "batch_size": 4, "epoch": 0.7316, "step": 1829, "tokens_per_device": 4536 }, { "epoch": 0.7316, "loss_ce": 0.011959383264183998, "loss_lvr": 1.116697072982788, "loss_mode_switch": 0.0, "loss_total": 0.12362909317016602, "step": 1829 }, { "batch_size": 1, "epoch": 0.7316, "step": 1829, "tokens_per_device": 4865 }, { "epoch": 0.7316, "loss_ce": 0.02439538761973381, "loss_lvr": 0.27576276659965515, "loss_mode_switch": 0.0, "loss_total": 0.051971666514873505, "step": 1829 }, { "batch_size": 4, "epoch": 0.7316, "step": 1829, "tokens_per_device": 2240 }, { "epoch": 0.7316, "loss_ce": 0.14542704820632935, "loss_lvr": 0.9625826478004456, "loss_mode_switch": 0.0, "loss_total": 0.24168531596660614, "step": 1829 }, { "epoch": 0.732, "grad_norm": 1.2267664670944214, "learning_rate": 1.7681786326196665e-06, "loss": 0.2474, "step": 1830 }, { "batch_size": 4, "epoch": 0.732, "step": 1830, "tokens_per_device": 4240 }, { "epoch": 0.732, "loss_ce": 0.024513885378837585, "loss_lvr": 0.7276992797851562, "loss_mode_switch": 0.0, "loss_total": 0.09728381782770157, "step": 1830 }, { "batch_size": 4, "epoch": 0.732, "step": 1830, "tokens_per_device": 1924 }, { "epoch": 0.732, "loss_ce": 0.4140934646129608, "loss_lvr": 0.9649189710617065, "loss_mode_switch": 0.0, "loss_total": 0.510585367679596, "step": 1830 }, { "batch_size": 4, "epoch": 0.732, "step": 1830, "tokens_per_device": 4224 }, { "epoch": 0.732, "loss_ce": 0.08371493220329285, "loss_lvr": 0.7810660004615784, "loss_mode_switch": 0.0, "loss_total": 0.16182154417037964, "step": 1830 }, { "batch_size": 4, "epoch": 0.732, "step": 1830, "tokens_per_device": 1564 }, { "epoch": 0.732, "loss_ce": 0.23422102630138397, "loss_lvr": 0.8040780425071716, "loss_mode_switch": 0.0, "loss_total": 0.31462883949279785, "step": 1830 }, { "batch_size": 4, "epoch": 0.732, "step": 1830, "tokens_per_device": 4792 }, { "epoch": 0.732, "loss_ce": 0.2009068727493286, "loss_lvr": 0.7612692713737488, "loss_mode_switch": 0.0, "loss_total": 0.27703380584716797, "step": 1830 }, { "batch_size": 4, "epoch": 0.732, "step": 1830, "tokens_per_device": 15352 }, { "epoch": 0.732, "loss_ce": 0.09561802446842194, "loss_lvr": 0.7175531387329102, "loss_mode_switch": 0.0, "loss_total": 0.16737332940101624, "step": 1830 }, { "batch_size": 4, "epoch": 0.732, "step": 1830, "tokens_per_device": 4168 }, { "epoch": 0.732, "loss_ce": 0.451643168926239, "loss_lvr": 1.1822022199630737, "loss_mode_switch": 0.0, "loss_total": 0.5698633790016174, "step": 1830 }, { "batch_size": 4, "epoch": 0.732, "step": 1830, "tokens_per_device": 3860 }, { "epoch": 0.732, "loss_ce": 0.27462244033813477, "loss_lvr": 0.7818015813827515, "loss_mode_switch": 0.0, "loss_total": 0.3528026044368744, "step": 1830 }, { "epoch": 0.7324, "grad_norm": 1.3948777914047241, "learning_rate": 1.7632388176934523e-06, "loss": 0.2324, "step": 1831 }, { "batch_size": 1, "epoch": 0.7324, "step": 1831, "tokens_per_device": 4792 }, { "epoch": 0.7324, "loss_ce": 0.04638507962226868, "loss_lvr": 0.18958508968353271, "loss_mode_switch": 0.0, "loss_total": 0.06534358859062195, "step": 1831 }, { "batch_size": 4, "epoch": 0.7324, "step": 1831, "tokens_per_device": 15996 }, { "epoch": 0.7324, "loss_ce": 0.23818601667881012, "loss_lvr": 0.7768911123275757, "loss_mode_switch": 0.0, "loss_total": 0.3158751130104065, "step": 1831 }, { "batch_size": 1, "epoch": 0.7324, "step": 1831, "tokens_per_device": 4885 }, { "epoch": 0.7324, "loss_ce": 0.261153906583786, "loss_lvr": 0.6637998223304749, "loss_mode_switch": 0.0, "loss_total": 0.32753390073776245, "step": 1831 }, { "batch_size": 4, "epoch": 0.7324, "step": 1831, "tokens_per_device": 5900 }, { "epoch": 0.7324, "loss_ce": 0.061342690140008926, "loss_lvr": 1.011026382446289, "loss_mode_switch": 0.0, "loss_total": 0.1624453365802765, "step": 1831 }, { "batch_size": 4, "epoch": 0.7324, "step": 1831, "tokens_per_device": 4332 }, { "epoch": 0.7324, "loss_ce": 0.4127751886844635, "loss_lvr": 0.7790365219116211, "loss_mode_switch": 0.0, "loss_total": 0.4906788468360901, "step": 1831 }, { "batch_size": 4, "epoch": 0.7324, "step": 1831, "tokens_per_device": 11064 }, { "epoch": 0.7324, "loss_ce": 0.026306580752134323, "loss_lvr": 0.39177221059799194, "loss_mode_switch": 0.0, "loss_total": 0.06548380106687546, "step": 1831 }, { "batch_size": 4, "epoch": 0.7324, "step": 1831, "tokens_per_device": 15660 }, { "epoch": 0.7324, "loss_ce": 0.2106151431798935, "loss_lvr": 0.5261238217353821, "loss_mode_switch": 0.0, "loss_total": 0.26322752237319946, "step": 1831 }, { "batch_size": 1, "epoch": 0.7324, "step": 1831, "tokens_per_device": 4862 }, { "epoch": 0.7324, "loss_ce": 0.10540984570980072, "loss_lvr": 0.3212531805038452, "loss_mode_switch": 0.0, "loss_total": 0.13753516972064972, "step": 1831 }, { "epoch": 0.7328, "grad_norm": 1.2302014827728271, "learning_rate": 1.7583044351061369e-06, "loss": 0.2438, "step": 1832 }, { "batch_size": 4, "epoch": 0.7328, "step": 1832, "tokens_per_device": 5184 }, { "epoch": 0.7328, "loss_ce": 0.14170989394187927, "loss_lvr": 0.7261068820953369, "loss_mode_switch": 0.0, "loss_total": 0.2143205851316452, "step": 1832 }, { "batch_size": 4, "epoch": 0.7328, "step": 1832, "tokens_per_device": 5712 }, { "epoch": 0.7328, "loss_ce": 0.07373131811618805, "loss_lvr": 0.7702569961547852, "loss_mode_switch": 0.0, "loss_total": 0.15075701475143433, "step": 1832 }, { "batch_size": 4, "epoch": 0.7328, "step": 1832, "tokens_per_device": 3408 }, { "epoch": 0.7328, "loss_ce": 0.0583624541759491, "loss_lvr": 1.1652966737747192, "loss_mode_switch": 0.0, "loss_total": 0.1748921275138855, "step": 1832 }, { "batch_size": 4, "epoch": 0.7328, "step": 1832, "tokens_per_device": 4292 }, { "epoch": 0.7328, "loss_ce": 0.07356630265712738, "loss_lvr": 0.8954218029975891, "loss_mode_switch": 0.0, "loss_total": 0.1631084829568863, "step": 1832 }, { "batch_size": 1, "epoch": 0.7328, "step": 1832, "tokens_per_device": 5143 }, { "epoch": 0.7328, "loss_ce": 0.014481447637081146, "loss_lvr": 0.27084001898765564, "loss_mode_switch": 0.0, "loss_total": 0.04156544804573059, "step": 1832 }, { "batch_size": 4, "epoch": 0.7328, "step": 1832, "tokens_per_device": 2556 }, { "epoch": 0.7328, "loss_ce": 0.3878288269042969, "loss_lvr": 0.9725896120071411, "loss_mode_switch": 0.0, "loss_total": 0.4850877821445465, "step": 1832 }, { "batch_size": 1, "epoch": 0.7328, "step": 1832, "tokens_per_device": 4887 }, { "epoch": 0.7328, "loss_ce": 0.021762369200587273, "loss_lvr": 0.21874815225601196, "loss_mode_switch": 0.0, "loss_total": 0.04363718628883362, "step": 1832 }, { "batch_size": 4, "epoch": 0.7328, "step": 1832, "tokens_per_device": 2696 }, { "epoch": 0.7328, "loss_ce": 0.400354266166687, "loss_lvr": 0.601390540599823, "loss_mode_switch": 0.0, "loss_total": 0.4604933261871338, "step": 1832 }, { "epoch": 0.7332, "grad_norm": 1.2488869428634644, "learning_rate": 1.7533754931392227e-06, "loss": 0.2909, "step": 1833 }, { "batch_size": 4, "epoch": 0.7332, "step": 1833, "tokens_per_device": 4792 }, { "epoch": 0.7332, "loss_ce": 0.0871826708316803, "loss_lvr": 0.8135353922843933, "loss_mode_switch": 0.0, "loss_total": 0.1685362160205841, "step": 1833 }, { "batch_size": 1, "epoch": 0.7332, "step": 1833, "tokens_per_device": 4949 }, { "epoch": 0.7332, "loss_ce": 0.0014443343970924616, "loss_lvr": 0.6396819353103638, "loss_mode_switch": 0.0, "loss_total": 0.06541252881288528, "step": 1833 }, { "batch_size": 4, "epoch": 0.7332, "step": 1833, "tokens_per_device": 3836 }, { "epoch": 0.7332, "loss_ce": 0.2429707795381546, "loss_lvr": 0.5483481287956238, "loss_mode_switch": 0.0, "loss_total": 0.2978056073188782, "step": 1833 }, { "batch_size": 1, "epoch": 0.7332, "step": 1833, "tokens_per_device": 5103 }, { "epoch": 0.7332, "loss_ce": 0.0004892388824373484, "loss_lvr": 0.5016146898269653, "loss_mode_switch": 0.0, "loss_total": 0.0506507083773613, "step": 1833 }, { "batch_size": 1, "epoch": 0.7332, "step": 1833, "tokens_per_device": 7482 }, { "epoch": 0.7332, "loss_ce": 0.014963784255087376, "loss_lvr": 0.2468004673719406, "loss_mode_switch": 0.0, "loss_total": 0.03964383155107498, "step": 1833 }, { "batch_size": 4, "epoch": 0.7332, "step": 1833, "tokens_per_device": 4260 }, { "epoch": 0.7332, "loss_ce": 0.07933858036994934, "loss_lvr": 0.8525540828704834, "loss_mode_switch": 0.0, "loss_total": 0.16459399461746216, "step": 1833 }, { "batch_size": 4, "epoch": 0.7332, "step": 1833, "tokens_per_device": 1328 }, { "epoch": 0.7332, "loss_ce": 0.05194401368498802, "loss_lvr": 1.717208981513977, "loss_mode_switch": 0.0, "loss_total": 0.22366492450237274, "step": 1833 }, { "batch_size": 1, "epoch": 0.7332, "step": 1833, "tokens_per_device": 4123 }, { "epoch": 0.7332, "loss_ce": 0.0012388250324875116, "loss_lvr": 0.3734455406665802, "loss_mode_switch": 0.0, "loss_total": 0.038583382964134216, "step": 1833 }, { "epoch": 0.7336, "grad_norm": 1.4701303243637085, "learning_rate": 1.7484520000650757e-06, "loss": 0.3085, "step": 1834 }, { "batch_size": 1, "epoch": 0.7336, "step": 1834, "tokens_per_device": 4103 }, { "epoch": 0.7336, "loss_ce": 0.0029671108350157738, "loss_lvr": 0.21714837849140167, "loss_mode_switch": 0.0, "loss_total": 0.024681948125362396, "step": 1834 }, { "batch_size": 1, "epoch": 0.7336, "step": 1834, "tokens_per_device": 5116 }, { "epoch": 0.7336, "loss_ce": 0.016552859917283058, "loss_lvr": 0.6364089250564575, "loss_mode_switch": 0.0, "loss_total": 0.08019375056028366, "step": 1834 }, { "batch_size": 1, "epoch": 0.7336, "step": 1834, "tokens_per_device": 4911 }, { "epoch": 0.7336, "loss_ce": 1.865045189857483, "loss_lvr": 2.5609960556030273, "loss_mode_switch": 0.0, "loss_total": 2.1211447715759277, "step": 1834 }, { "batch_size": 4, "epoch": 0.7336, "step": 1834, "tokens_per_device": 4368 }, { "epoch": 0.7336, "loss_ce": 0.20340320467948914, "loss_lvr": 0.8313617706298828, "loss_mode_switch": 0.0, "loss_total": 0.28653937578201294, "step": 1834 }, { "batch_size": 4, "epoch": 0.7336, "step": 1834, "tokens_per_device": 4556 }, { "epoch": 0.7336, "loss_ce": 0.15234148502349854, "loss_lvr": 0.829563558101654, "loss_mode_switch": 0.0, "loss_total": 0.23529784381389618, "step": 1834 }, { "batch_size": 4, "epoch": 0.7336, "step": 1834, "tokens_per_device": 3772 }, { "epoch": 0.7336, "loss_ce": 0.30871012806892395, "loss_lvr": 0.6376655101776123, "loss_mode_switch": 0.0, "loss_total": 0.3724766969680786, "step": 1834 }, { "batch_size": 1, "epoch": 0.7336, "step": 1834, "tokens_per_device": 5012 }, { "epoch": 0.7336, "loss_ce": 0.008836514316499233, "loss_lvr": 0.5284447073936462, "loss_mode_switch": 0.0, "loss_total": 0.06168098747730255, "step": 1834 }, { "batch_size": 4, "epoch": 0.7336, "step": 1834, "tokens_per_device": 5944 }, { "epoch": 0.7336, "loss_ce": 0.23375073075294495, "loss_lvr": 0.8973879814147949, "loss_mode_switch": 0.0, "loss_total": 0.32348954677581787, "step": 1834 }, { "epoch": 0.734, "grad_norm": 1.82779061794281, "learning_rate": 1.743533964146924e-06, "loss": 0.3331, "step": 1835 }, { "batch_size": 4, "epoch": 0.734, "step": 1835, "tokens_per_device": 12132 }, { "epoch": 0.734, "loss_ce": 0.08907023817300797, "loss_lvr": 1.056723952293396, "loss_mode_switch": 0.0, "loss_total": 0.19474263489246368, "step": 1835 }, { "batch_size": 4, "epoch": 0.734, "step": 1835, "tokens_per_device": 1552 }, { "epoch": 0.734, "loss_ce": 0.7027863264083862, "loss_lvr": 1.0698416233062744, "loss_mode_switch": 0.0, "loss_total": 0.8097704648971558, "step": 1835 }, { "batch_size": 4, "epoch": 0.734, "step": 1835, "tokens_per_device": 3804 }, { "epoch": 0.734, "loss_ce": 0.21693387627601624, "loss_lvr": 0.8252840638160706, "loss_mode_switch": 0.0, "loss_total": 0.29946228861808777, "step": 1835 }, { "batch_size": 4, "epoch": 0.734, "step": 1835, "tokens_per_device": 13480 }, { "epoch": 0.734, "loss_ce": 0.04240833967924118, "loss_lvr": 0.3780072033405304, "loss_mode_switch": 0.0, "loss_total": 0.08020906150341034, "step": 1835 }, { "batch_size": 4, "epoch": 0.734, "step": 1835, "tokens_per_device": 1740 }, { "epoch": 0.734, "loss_ce": 0.7419289946556091, "loss_lvr": 0.8432145714759827, "loss_mode_switch": 0.0, "loss_total": 0.826250433921814, "step": 1835 }, { "batch_size": 4, "epoch": 0.734, "step": 1835, "tokens_per_device": 4448 }, { "epoch": 0.734, "loss_ce": 0.06213986873626709, "loss_lvr": 0.7008252739906311, "loss_mode_switch": 0.0, "loss_total": 0.13222239911556244, "step": 1835 }, { "batch_size": 4, "epoch": 0.734, "step": 1835, "tokens_per_device": 5784 }, { "epoch": 0.734, "loss_ce": 0.2502111792564392, "loss_lvr": 0.7846137285232544, "loss_mode_switch": 0.0, "loss_total": 0.3286725580692291, "step": 1835 }, { "batch_size": 1, "epoch": 0.734, "step": 1835, "tokens_per_device": 4862 }, { "epoch": 0.734, "loss_ce": 0.0025488799437880516, "loss_lvr": 0.24804125726222992, "loss_mode_switch": 0.0, "loss_total": 0.027353007346391678, "step": 1835 }, { "epoch": 0.7344, "grad_norm": 1.2346619367599487, "learning_rate": 1.7386213936388303e-06, "loss": 0.2782, "step": 1836 }, { "batch_size": 1, "epoch": 0.7344, "step": 1836, "tokens_per_device": 5134 }, { "epoch": 0.7344, "loss_ce": 0.004015089478343725, "loss_lvr": 0.7047243714332581, "loss_mode_switch": 0.0, "loss_total": 0.07448752969503403, "step": 1836 }, { "batch_size": 4, "epoch": 0.7344, "step": 1836, "tokens_per_device": 4856 }, { "epoch": 0.7344, "loss_ce": 0.2524067163467407, "loss_lvr": 0.9366078972816467, "loss_mode_switch": 0.0, "loss_total": 0.34606751799583435, "step": 1836 }, { "batch_size": 4, "epoch": 0.7344, "step": 1836, "tokens_per_device": 4624 }, { "epoch": 0.7344, "loss_ce": 0.25230827927589417, "loss_lvr": 0.8319674730300903, "loss_mode_switch": 0.0, "loss_total": 0.33550503849983215, "step": 1836 }, { "batch_size": 1, "epoch": 0.7344, "step": 1836, "tokens_per_device": 5121 }, { "epoch": 0.7344, "loss_ce": 0.0008239569724537432, "loss_lvr": 0.4867018759250641, "loss_mode_switch": 0.0, "loss_total": 0.049494143575429916, "step": 1836 }, { "batch_size": 1, "epoch": 0.7344, "step": 1836, "tokens_per_device": 4197 }, { "epoch": 0.7344, "loss_ce": 0.1312868744134903, "loss_lvr": 0.3196808695793152, "loss_mode_switch": 0.0, "loss_total": 0.16325496137142181, "step": 1836 }, { "batch_size": 4, "epoch": 0.7344, "step": 1836, "tokens_per_device": 1292 }, { "epoch": 0.7344, "loss_ce": 0.32322680950164795, "loss_lvr": 1.0341417789459229, "loss_mode_switch": 0.0, "loss_total": 0.42664098739624023, "step": 1836 }, { "batch_size": 4, "epoch": 0.7344, "step": 1836, "tokens_per_device": 1716 }, { "epoch": 0.7344, "loss_ce": 0.13858279585838318, "loss_lvr": 0.9749279022216797, "loss_mode_switch": 0.0, "loss_total": 0.23607558012008667, "step": 1836 }, { "batch_size": 4, "epoch": 0.7344, "step": 1836, "tokens_per_device": 5036 }, { "epoch": 0.7344, "loss_ce": 0.37850314378738403, "loss_lvr": 0.4947291910648346, "loss_mode_switch": 0.0, "loss_total": 0.4279760718345642, "step": 1836 }, { "epoch": 0.7348, "grad_norm": 1.2047529220581055, "learning_rate": 1.7337142967856857e-06, "loss": 0.2779, "step": 1837 }, { "batch_size": 1, "epoch": 0.7348, "step": 1837, "tokens_per_device": 5118 }, { "epoch": 0.7348, "loss_ce": 0.2060040384531021, "loss_lvr": 0.22883237898349762, "loss_mode_switch": 0.0, "loss_total": 0.22888727486133575, "step": 1837 }, { "batch_size": 4, "epoch": 0.7348, "step": 1837, "tokens_per_device": 5580 }, { "epoch": 0.7348, "loss_ce": 0.2056591659784317, "loss_lvr": 0.7258208990097046, "loss_mode_switch": 0.0, "loss_total": 0.27824124693870544, "step": 1837 }, { "batch_size": 4, "epoch": 0.7348, "step": 1837, "tokens_per_device": 1440 }, { "epoch": 0.7348, "loss_ce": 0.08538934588432312, "loss_lvr": 0.9567482471466064, "loss_mode_switch": 0.0, "loss_total": 0.181064173579216, "step": 1837 }, { "batch_size": 1, "epoch": 0.7348, "step": 1837, "tokens_per_device": 4947 }, { "epoch": 0.7348, "loss_ce": 0.11607541888952255, "loss_lvr": 0.3694758415222168, "loss_mode_switch": 0.0, "loss_total": 0.15302300453186035, "step": 1837 }, { "batch_size": 1, "epoch": 0.7348, "step": 1837, "tokens_per_device": 5008 }, { "epoch": 0.7348, "loss_ce": 0.8556755185127258, "loss_lvr": 0.273493230342865, "loss_mode_switch": 0.0, "loss_total": 0.8830248117446899, "step": 1837 }, { "batch_size": 4, "epoch": 0.7348, "step": 1837, "tokens_per_device": 7740 }, { "epoch": 0.7348, "loss_ce": 0.28431475162506104, "loss_lvr": 0.7313368320465088, "loss_mode_switch": 0.0, "loss_total": 0.35744842886924744, "step": 1837 }, { "batch_size": 1, "epoch": 0.7348, "step": 1837, "tokens_per_device": 5161 }, { "epoch": 0.7348, "loss_ce": 0.00042870789184235036, "loss_lvr": 0.35528823733329773, "loss_mode_switch": 0.0, "loss_total": 0.03595753014087677, "step": 1837 }, { "batch_size": 1, "epoch": 0.7348, "step": 1837, "tokens_per_device": 5032 }, { "epoch": 0.7348, "loss_ce": 0.03691057115793228, "loss_lvr": 0.19986100494861603, "loss_mode_switch": 0.0, "loss_total": 0.056896671652793884, "step": 1837 }, { "epoch": 0.7352, "grad_norm": 1.4030910730361938, "learning_rate": 1.7288126818231998e-06, "loss": 0.2794, "step": 1838 }, { "batch_size": 4, "epoch": 0.7352, "step": 1838, "tokens_per_device": 1512 }, { "epoch": 0.7352, "loss_ce": 0.1289680302143097, "loss_lvr": 0.9195806384086609, "loss_mode_switch": 0.0, "loss_total": 0.22092610597610474, "step": 1838 }, { "batch_size": 4, "epoch": 0.7352, "step": 1838, "tokens_per_device": 4244 }, { "epoch": 0.7352, "loss_ce": 0.41141828894615173, "loss_lvr": 0.9567167162895203, "loss_mode_switch": 0.0, "loss_total": 0.5070899724960327, "step": 1838 }, { "batch_size": 4, "epoch": 0.7352, "step": 1838, "tokens_per_device": 5872 }, { "epoch": 0.7352, "loss_ce": 0.4054630696773529, "loss_lvr": 0.6965993046760559, "loss_mode_switch": 0.0, "loss_total": 0.47512298822402954, "step": 1838 }, { "batch_size": 4, "epoch": 0.7352, "step": 1838, "tokens_per_device": 8836 }, { "epoch": 0.7352, "loss_ce": 0.7167516946792603, "loss_lvr": 0.3663017451763153, "loss_mode_switch": 0.0, "loss_total": 0.7533818483352661, "step": 1838 }, { "batch_size": 4, "epoch": 0.7352, "step": 1838, "tokens_per_device": 6240 }, { "epoch": 0.7352, "loss_ce": 0.4550303518772125, "loss_lvr": 0.5656379461288452, "loss_mode_switch": 0.0, "loss_total": 0.5115941762924194, "step": 1838 }, { "batch_size": 4, "epoch": 0.7352, "step": 1838, "tokens_per_device": 4312 }, { "epoch": 0.7352, "loss_ce": 0.23034320771694183, "loss_lvr": 0.9914279580116272, "loss_mode_switch": 0.0, "loss_total": 0.32948601245880127, "step": 1838 }, { "batch_size": 4, "epoch": 0.7352, "step": 1838, "tokens_per_device": 5316 }, { "epoch": 0.7352, "loss_ce": 0.12042887508869171, "loss_lvr": 0.6981255412101746, "loss_mode_switch": 0.0, "loss_total": 0.19024142622947693, "step": 1838 }, { "batch_size": 4, "epoch": 0.7352, "step": 1838, "tokens_per_device": 4232 }, { "epoch": 0.7352, "loss_ce": 0.96915602684021, "loss_lvr": 1.1434258222579956, "loss_mode_switch": 0.0, "loss_total": 1.0834985971450806, "step": 1838 }, { "epoch": 0.7356, "grad_norm": 1.591351866722107, "learning_rate": 1.7239165569778738e-06, "loss": 0.3986, "step": 1839 }, { "batch_size": 4, "epoch": 0.7356, "step": 1839, "tokens_per_device": 1604 }, { "epoch": 0.7356, "loss_ce": 0.5622686743736267, "loss_lvr": 0.9567553400993347, "loss_mode_switch": 0.0, "loss_total": 0.6579442024230957, "step": 1839 }, { "batch_size": 4, "epoch": 0.7356, "step": 1839, "tokens_per_device": 4428 }, { "epoch": 0.7356, "loss_ce": 0.02448953688144684, "loss_lvr": 0.7190226912498474, "loss_mode_switch": 0.0, "loss_total": 0.09639180451631546, "step": 1839 }, { "batch_size": 4, "epoch": 0.7356, "step": 1839, "tokens_per_device": 1264 }, { "epoch": 0.7356, "loss_ce": 0.3509330451488495, "loss_lvr": 0.8978838920593262, "loss_mode_switch": 0.0, "loss_total": 0.44072145223617554, "step": 1839 }, { "batch_size": 4, "epoch": 0.7356, "step": 1839, "tokens_per_device": 1264 }, { "epoch": 0.7356, "loss_ce": 0.36388036608695984, "loss_lvr": 0.8451820611953735, "loss_mode_switch": 0.0, "loss_total": 0.4483985900878906, "step": 1839 }, { "batch_size": 4, "epoch": 0.7356, "step": 1839, "tokens_per_device": 4272 }, { "epoch": 0.7356, "loss_ce": 0.2295254021883011, "loss_lvr": 0.8918545842170715, "loss_mode_switch": 0.0, "loss_total": 0.3187108635902405, "step": 1839 }, { "batch_size": 1, "epoch": 0.7356, "step": 1839, "tokens_per_device": 5130 }, { "epoch": 0.7356, "loss_ce": 0.06500330567359924, "loss_lvr": 0.30911365151405334, "loss_mode_switch": 0.0, "loss_total": 0.09591466933488846, "step": 1839 }, { "batch_size": 4, "epoch": 0.7356, "step": 1839, "tokens_per_device": 4300 }, { "epoch": 0.7356, "loss_ce": 0.2961716949939728, "loss_lvr": 0.5497141480445862, "loss_mode_switch": 0.0, "loss_total": 0.35114312171936035, "step": 1839 }, { "batch_size": 1, "epoch": 0.7356, "step": 1839, "tokens_per_device": 4829 }, { "epoch": 0.7356, "loss_ce": 0.005399278365075588, "loss_lvr": 0.40394288301467896, "loss_mode_switch": 0.0, "loss_total": 0.045793566852808, "step": 1839 }, { "epoch": 0.736, "grad_norm": 1.3329179286956787, "learning_rate": 1.7190259304670038e-06, "loss": 0.2973, "step": 1840 }, { "batch_size": 4, "epoch": 0.736, "step": 1840, "tokens_per_device": 4504 }, { "epoch": 0.736, "loss_ce": 0.20055130124092102, "loss_lvr": 0.7810072302818298, "loss_mode_switch": 0.0, "loss_total": 0.27865201234817505, "step": 1840 }, { "batch_size": 1, "epoch": 0.736, "step": 1840, "tokens_per_device": 6031 }, { "epoch": 0.736, "loss_ce": 0.17523504793643951, "loss_lvr": 0.41792359948158264, "loss_mode_switch": 0.0, "loss_total": 0.21702741086483002, "step": 1840 }, { "batch_size": 4, "epoch": 0.736, "step": 1840, "tokens_per_device": 1396 }, { "epoch": 0.736, "loss_ce": 0.4137086570262909, "loss_lvr": 0.966777503490448, "loss_mode_switch": 0.0, "loss_total": 0.5103864073753357, "step": 1840 }, { "batch_size": 4, "epoch": 0.736, "step": 1840, "tokens_per_device": 4320 }, { "epoch": 0.736, "loss_ce": 0.09040369093418121, "loss_lvr": 0.5848267078399658, "loss_mode_switch": 0.0, "loss_total": 0.14888636767864227, "step": 1840 }, { "batch_size": 4, "epoch": 0.736, "step": 1840, "tokens_per_device": 4128 }, { "epoch": 0.736, "loss_ce": 0.4421079158782959, "loss_lvr": 1.1880772113800049, "loss_mode_switch": 0.0, "loss_total": 0.5609156489372253, "step": 1840 }, { "batch_size": 1, "epoch": 0.736, "step": 1840, "tokens_per_device": 4972 }, { "epoch": 0.736, "loss_ce": 0.3436371088027954, "loss_lvr": 0.4197673797607422, "loss_mode_switch": 0.0, "loss_total": 0.3856138586997986, "step": 1840 }, { "batch_size": 1, "epoch": 0.736, "step": 1840, "tokens_per_device": 4894 }, { "epoch": 0.736, "loss_ce": 0.17181608080863953, "loss_lvr": 0.6170511841773987, "loss_mode_switch": 0.0, "loss_total": 0.23352119326591492, "step": 1840 }, { "batch_size": 4, "epoch": 0.736, "step": 1840, "tokens_per_device": 1500 }, { "epoch": 0.736, "loss_ce": 0.4189721345901489, "loss_lvr": 0.956558108329773, "loss_mode_switch": 0.0, "loss_total": 0.5146279335021973, "step": 1840 }, { "epoch": 0.7364, "grad_norm": 1.4026622772216797, "learning_rate": 1.714140810498648e-06, "loss": 0.2443, "step": 1841 }, { "batch_size": 4, "epoch": 0.7364, "step": 1841, "tokens_per_device": 11748 }, { "epoch": 0.7364, "loss_ce": 0.2801017463207245, "loss_lvr": 0.9761402010917664, "loss_mode_switch": 0.0, "loss_total": 0.3777157664299011, "step": 1841 }, { "batch_size": 4, "epoch": 0.7364, "step": 1841, "tokens_per_device": 6548 }, { "epoch": 0.7364, "loss_ce": 0.29595479369163513, "loss_lvr": 0.8012473583221436, "loss_mode_switch": 0.0, "loss_total": 0.3760795295238495, "step": 1841 }, { "batch_size": 4, "epoch": 0.7364, "step": 1841, "tokens_per_device": 6796 }, { "epoch": 0.7364, "loss_ce": 0.4820161461830139, "loss_lvr": 1.0561037063598633, "loss_mode_switch": 0.0, "loss_total": 0.5876265168190002, "step": 1841 }, { "batch_size": 4, "epoch": 0.7364, "step": 1841, "tokens_per_device": 2776 }, { "epoch": 0.7364, "loss_ce": 0.1780160367488861, "loss_lvr": 0.830225944519043, "loss_mode_switch": 0.0, "loss_total": 0.2610386312007904, "step": 1841 }, { "batch_size": 4, "epoch": 0.7364, "step": 1841, "tokens_per_device": 4424 }, { "epoch": 0.7364, "loss_ce": 0.2312590777873993, "loss_lvr": 0.777831494808197, "loss_mode_switch": 0.0, "loss_total": 0.30904221534729004, "step": 1841 }, { "batch_size": 4, "epoch": 0.7364, "step": 1841, "tokens_per_device": 3852 }, { "epoch": 0.7364, "loss_ce": 0.5534837245941162, "loss_lvr": 0.8055362105369568, "loss_mode_switch": 0.0, "loss_total": 0.6340373754501343, "step": 1841 }, { "batch_size": 4, "epoch": 0.7364, "step": 1841, "tokens_per_device": 3848 }, { "epoch": 0.7364, "loss_ce": 0.05722622945904732, "loss_lvr": 1.0078576803207397, "loss_mode_switch": 0.0, "loss_total": 0.1580120027065277, "step": 1841 }, { "batch_size": 4, "epoch": 0.7364, "step": 1841, "tokens_per_device": 1164 }, { "epoch": 0.7364, "loss_ce": 0.12943299114704132, "loss_lvr": 0.9513091444969177, "loss_mode_switch": 0.0, "loss_total": 0.22456389665603638, "step": 1841 }, { "epoch": 0.7368, "grad_norm": 1.4221158027648926, "learning_rate": 1.709261205271633e-06, "loss": 0.3101, "step": 1842 }, { "batch_size": 1, "epoch": 0.7368, "step": 1842, "tokens_per_device": 4860 }, { "epoch": 0.7368, "loss_ce": 0.004483070690184832, "loss_lvr": 0.3338477313518524, "loss_mode_switch": 0.0, "loss_total": 0.037867844104766846, "step": 1842 }, { "batch_size": 4, "epoch": 0.7368, "step": 1842, "tokens_per_device": 6696 }, { "epoch": 0.7368, "loss_ce": 0.249603733420372, "loss_lvr": 0.8827295303344727, "loss_mode_switch": 0.0, "loss_total": 0.33787667751312256, "step": 1842 }, { "batch_size": 1, "epoch": 0.7368, "step": 1842, "tokens_per_device": 6530 }, { "epoch": 0.7368, "loss_ce": 0.0018087588250637054, "loss_lvr": 0.22243714332580566, "loss_mode_switch": 0.0, "loss_total": 0.024052472785115242, "step": 1842 }, { "batch_size": 4, "epoch": 0.7368, "step": 1842, "tokens_per_device": 3728 }, { "epoch": 0.7368, "loss_ce": 0.20252655446529388, "loss_lvr": 0.9034336805343628, "loss_mode_switch": 0.0, "loss_total": 0.2928699254989624, "step": 1842 }, { "batch_size": 4, "epoch": 0.7368, "step": 1842, "tokens_per_device": 4876 }, { "epoch": 0.7368, "loss_ce": 0.10559085011482239, "loss_lvr": 0.6893452405929565, "loss_mode_switch": 0.0, "loss_total": 0.17452538013458252, "step": 1842 }, { "batch_size": 1, "epoch": 0.7368, "step": 1842, "tokens_per_device": 5043 }, { "epoch": 0.7368, "loss_ce": 0.24929119646549225, "loss_lvr": 0.5972225069999695, "loss_mode_switch": 0.0, "loss_total": 0.3090134561061859, "step": 1842 }, { "batch_size": 4, "epoch": 0.7368, "step": 1842, "tokens_per_device": 4252 }, { "epoch": 0.7368, "loss_ce": 0.30013173818588257, "loss_lvr": 0.923975944519043, "loss_mode_switch": 0.0, "loss_total": 0.39252933859825134, "step": 1842 }, { "batch_size": 4, "epoch": 0.7368, "step": 1842, "tokens_per_device": 4148 }, { "epoch": 0.7368, "loss_ce": 0.13986244797706604, "loss_lvr": 0.6588010787963867, "loss_mode_switch": 0.0, "loss_total": 0.20574256777763367, "step": 1842 }, { "epoch": 0.7372, "grad_norm": 1.3325424194335938, "learning_rate": 1.7043871229755198e-06, "loss": 0.311, "step": 1843 }, { "batch_size": 4, "epoch": 0.7372, "step": 1843, "tokens_per_device": 4924 }, { "epoch": 0.7372, "loss_ce": 0.19064606726169586, "loss_lvr": 0.7057703733444214, "loss_mode_switch": 0.0, "loss_total": 0.26122310757637024, "step": 1843 }, { "batch_size": 4, "epoch": 0.7372, "step": 1843, "tokens_per_device": 4848 }, { "epoch": 0.7372, "loss_ce": 0.012753669172525406, "loss_lvr": 0.8206015825271606, "loss_mode_switch": 0.0, "loss_total": 0.09481382369995117, "step": 1843 }, { "batch_size": 4, "epoch": 0.7372, "step": 1843, "tokens_per_device": 4508 }, { "epoch": 0.7372, "loss_ce": 0.027180172502994537, "loss_lvr": 0.8842081427574158, "loss_mode_switch": 0.0, "loss_total": 0.11560098826885223, "step": 1843 }, { "batch_size": 1, "epoch": 0.7372, "step": 1843, "tokens_per_device": 4714 }, { "epoch": 0.7372, "loss_ce": 0.23023714125156403, "loss_lvr": 0.5737740993499756, "loss_mode_switch": 0.0, "loss_total": 0.2876145541667938, "step": 1843 }, { "batch_size": 1, "epoch": 0.7372, "step": 1843, "tokens_per_device": 5328 }, { "epoch": 0.7372, "loss_ce": 0.0007072353619150817, "loss_lvr": 0.2848120629787445, "loss_mode_switch": 0.0, "loss_total": 0.02918844111263752, "step": 1843 }, { "batch_size": 1, "epoch": 0.7372, "step": 1843, "tokens_per_device": 4881 }, { "epoch": 0.7372, "loss_ce": 0.10758321732282639, "loss_lvr": 0.2656613886356354, "loss_mode_switch": 0.0, "loss_total": 0.13414935767650604, "step": 1843 }, { "batch_size": 1, "epoch": 0.7372, "step": 1843, "tokens_per_device": 5095 }, { "epoch": 0.7372, "loss_ce": 0.07488222420215607, "loss_lvr": 0.21099990606307983, "loss_mode_switch": 0.0, "loss_total": 0.09598221629858017, "step": 1843 }, { "batch_size": 4, "epoch": 0.7372, "step": 1843, "tokens_per_device": 3776 }, { "epoch": 0.7372, "loss_ce": 0.15327167510986328, "loss_lvr": 0.9545210003852844, "loss_mode_switch": 0.0, "loss_total": 0.24872377514839172, "step": 1843 }, { "epoch": 0.7376, "grad_norm": 1.2867275476455688, "learning_rate": 1.6995185717906092e-06, "loss": 0.2661, "step": 1844 }, { "batch_size": 1, "epoch": 0.7376, "step": 1844, "tokens_per_device": 4870 }, { "epoch": 0.7376, "loss_ce": 0.012140167877078056, "loss_lvr": 0.28326746821403503, "loss_mode_switch": 0.0, "loss_total": 0.04046691581606865, "step": 1844 }, { "batch_size": 4, "epoch": 0.7376, "step": 1844, "tokens_per_device": 7260 }, { "epoch": 0.7376, "loss_ce": 0.058470726013183594, "loss_lvr": 0.6318217515945435, "loss_mode_switch": 0.0, "loss_total": 0.12165290117263794, "step": 1844 }, { "batch_size": 1, "epoch": 0.7376, "step": 1844, "tokens_per_device": 5114 }, { "epoch": 0.7376, "loss_ce": 0.005926550831645727, "loss_lvr": 0.2980763912200928, "loss_mode_switch": 0.0, "loss_total": 0.03573419153690338, "step": 1844 }, { "batch_size": 4, "epoch": 0.7376, "step": 1844, "tokens_per_device": 9504 }, { "epoch": 0.7376, "loss_ce": 0.06257377564907074, "loss_lvr": 0.8123969435691833, "loss_mode_switch": 0.0, "loss_total": 0.14381346106529236, "step": 1844 }, { "batch_size": 4, "epoch": 0.7376, "step": 1844, "tokens_per_device": 5520 }, { "epoch": 0.7376, "loss_ce": 0.24086689949035645, "loss_lvr": 0.667763888835907, "loss_mode_switch": 0.0, "loss_total": 0.3076432943344116, "step": 1844 }, { "batch_size": 4, "epoch": 0.7376, "step": 1844, "tokens_per_device": 2652 }, { "epoch": 0.7376, "loss_ce": 0.24573209881782532, "loss_lvr": 1.055942416191101, "loss_mode_switch": 0.0, "loss_total": 0.3513263463973999, "step": 1844 }, { "batch_size": 1, "epoch": 0.7376, "step": 1844, "tokens_per_device": 5229 }, { "epoch": 0.7376, "loss_ce": 1.349261999130249, "loss_lvr": 0.47255951166152954, "loss_mode_switch": 0.0, "loss_total": 1.3965179920196533, "step": 1844 }, { "batch_size": 4, "epoch": 0.7376, "step": 1844, "tokens_per_device": 9900 }, { "epoch": 0.7376, "loss_ce": 0.23459859192371368, "loss_lvr": 0.7912009358406067, "loss_mode_switch": 0.0, "loss_total": 0.31371867656707764, "step": 1844 }, { "epoch": 0.738, "grad_norm": 1.3789221048355103, "learning_rate": 1.6946555598879138e-06, "loss": 0.315, "step": 1845 }, { "batch_size": 1, "epoch": 0.738, "step": 1845, "tokens_per_device": 5183 }, { "epoch": 0.738, "loss_ce": 0.03987101837992668, "loss_lvr": 0.9526798129081726, "loss_mode_switch": 0.0, "loss_total": 0.13513900339603424, "step": 1845 }, { "batch_size": 4, "epoch": 0.738, "step": 1845, "tokens_per_device": 4308 }, { "epoch": 0.738, "loss_ce": 0.4317246973514557, "loss_lvr": 1.0808215141296387, "loss_mode_switch": 0.0, "loss_total": 0.5398068428039551, "step": 1845 }, { "batch_size": 4, "epoch": 0.738, "step": 1845, "tokens_per_device": 6300 }, { "epoch": 0.738, "loss_ce": 0.8870461583137512, "loss_lvr": 0.6799970269203186, "loss_mode_switch": 0.0, "loss_total": 0.9550458788871765, "step": 1845 }, { "batch_size": 4, "epoch": 0.738, "step": 1845, "tokens_per_device": 5880 }, { "epoch": 0.738, "loss_ce": 0.18171143531799316, "loss_lvr": 0.8810553550720215, "loss_mode_switch": 0.0, "loss_total": 0.26981696486473083, "step": 1845 }, { "batch_size": 4, "epoch": 0.738, "step": 1845, "tokens_per_device": 1888 }, { "epoch": 0.738, "loss_ce": 0.23272442817687988, "loss_lvr": 0.9165560603141785, "loss_mode_switch": 0.0, "loss_total": 0.3243800401687622, "step": 1845 }, { "batch_size": 4, "epoch": 0.738, "step": 1845, "tokens_per_device": 1340 }, { "epoch": 0.738, "loss_ce": 0.4963585436344147, "loss_lvr": 0.9753665328025818, "loss_mode_switch": 0.0, "loss_total": 0.5938951969146729, "step": 1845 }, { "batch_size": 4, "epoch": 0.738, "step": 1845, "tokens_per_device": 3792 }, { "epoch": 0.738, "loss_ce": 0.21355466544628143, "loss_lvr": 1.1842020750045776, "loss_mode_switch": 0.0, "loss_total": 0.3319748640060425, "step": 1845 }, { "batch_size": 1, "epoch": 0.738, "step": 1845, "tokens_per_device": 5187 }, { "epoch": 0.738, "loss_ce": 0.03284734860062599, "loss_lvr": 0.42321112751960754, "loss_mode_switch": 0.0, "loss_total": 0.07516846060752869, "step": 1845 }, { "epoch": 0.7384, "grad_norm": 1.2815759181976318, "learning_rate": 1.6897980954291483e-06, "loss": 0.2826, "step": 1846 }, { "batch_size": 4, "epoch": 0.7384, "step": 1846, "tokens_per_device": 4620 }, { "epoch": 0.7384, "loss_ce": 0.01039982307702303, "loss_lvr": 1.124790072441101, "loss_mode_switch": 0.0, "loss_total": 0.12287883460521698, "step": 1846 }, { "batch_size": 1, "epoch": 0.7384, "step": 1846, "tokens_per_device": 4906 }, { "epoch": 0.7384, "loss_ce": 0.11154801398515701, "loss_lvr": 0.4994150400161743, "loss_mode_switch": 0.0, "loss_total": 0.16148951649665833, "step": 1846 }, { "batch_size": 4, "epoch": 0.7384, "step": 1846, "tokens_per_device": 9552 }, { "epoch": 0.7384, "loss_ce": 0.510831892490387, "loss_lvr": 0.7611135840415955, "loss_mode_switch": 0.0, "loss_total": 0.5869432687759399, "step": 1846 }, { "batch_size": 1, "epoch": 0.7384, "step": 1846, "tokens_per_device": 4890 }, { "epoch": 0.7384, "loss_ce": 0.8034583926200867, "loss_lvr": 0.7847692370414734, "loss_mode_switch": 0.0, "loss_total": 0.8819352984428406, "step": 1846 }, { "batch_size": 4, "epoch": 0.7384, "step": 1846, "tokens_per_device": 6876 }, { "epoch": 0.7384, "loss_ce": 0.11110541969537735, "loss_lvr": 0.9116281867027283, "loss_mode_switch": 0.0, "loss_total": 0.20226824283599854, "step": 1846 }, { "batch_size": 1, "epoch": 0.7384, "step": 1846, "tokens_per_device": 4867 }, { "epoch": 0.7384, "loss_ce": 0.00981207937002182, "loss_lvr": 0.17024753987789154, "loss_mode_switch": 0.0, "loss_total": 0.026836832985281944, "step": 1846 }, { "batch_size": 4, "epoch": 0.7384, "step": 1846, "tokens_per_device": 3864 }, { "epoch": 0.7384, "loss_ce": 0.41265812516212463, "loss_lvr": 1.0630030632019043, "loss_mode_switch": 0.0, "loss_total": 0.5189584493637085, "step": 1846 }, { "batch_size": 1, "epoch": 0.7384, "step": 1846, "tokens_per_device": 4881 }, { "epoch": 0.7384, "loss_ce": 0.00011203267786186188, "loss_lvr": 0.22356833517551422, "loss_mode_switch": 0.0, "loss_total": 0.022468866780400276, "step": 1846 }, { "epoch": 0.7388, "grad_norm": 1.493525743484497, "learning_rate": 1.6849461865667226e-06, "loss": 0.3253, "step": 1847 }, { "batch_size": 4, "epoch": 0.7388, "step": 1847, "tokens_per_device": 4764 }, { "epoch": 0.7388, "loss_ce": 0.058681756258010864, "loss_lvr": 0.7486591339111328, "loss_mode_switch": 0.0, "loss_total": 0.13354766368865967, "step": 1847 }, { "batch_size": 4, "epoch": 0.7388, "step": 1847, "tokens_per_device": 5932 }, { "epoch": 0.7388, "loss_ce": 0.2418871819972992, "loss_lvr": 0.7198935747146606, "loss_mode_switch": 0.0, "loss_total": 0.31387653946876526, "step": 1847 }, { "batch_size": 4, "epoch": 0.7388, "step": 1847, "tokens_per_device": 4220 }, { "epoch": 0.7388, "loss_ce": 0.062046535313129425, "loss_lvr": 0.9694749116897583, "loss_mode_switch": 0.0, "loss_total": 0.15899401903152466, "step": 1847 }, { "batch_size": 1, "epoch": 0.7388, "step": 1847, "tokens_per_device": 4911 }, { "epoch": 0.7388, "loss_ce": 0.0017678404692560434, "loss_lvr": 0.15905974805355072, "loss_mode_switch": 0.0, "loss_total": 0.017673814669251442, "step": 1847 }, { "batch_size": 4, "epoch": 0.7388, "step": 1847, "tokens_per_device": 4720 }, { "epoch": 0.7388, "loss_ce": 0.309836208820343, "loss_lvr": 0.7155439257621765, "loss_mode_switch": 0.0, "loss_total": 0.38139060139656067, "step": 1847 }, { "batch_size": 4, "epoch": 0.7388, "step": 1847, "tokens_per_device": 4228 }, { "epoch": 0.7388, "loss_ce": 0.3501673936843872, "loss_lvr": 0.9341898560523987, "loss_mode_switch": 0.0, "loss_total": 0.4435863792896271, "step": 1847 }, { "batch_size": 1, "epoch": 0.7388, "step": 1847, "tokens_per_device": 4641 }, { "epoch": 0.7388, "loss_ce": 0.008870052173733711, "loss_lvr": 0.241620734333992, "loss_mode_switch": 0.0, "loss_total": 0.03303212672472, "step": 1847 }, { "batch_size": 1, "epoch": 0.7388, "step": 1847, "tokens_per_device": 5165 }, { "epoch": 0.7388, "loss_ce": 0.13044624030590057, "loss_lvr": 0.3166121244430542, "loss_mode_switch": 0.0, "loss_total": 0.162107452750206, "step": 1847 }, { "epoch": 0.7392, "grad_norm": 1.3067325353622437, "learning_rate": 1.6800998414437165e-06, "loss": 0.2687, "step": 1848 }, { "batch_size": 4, "epoch": 0.7392, "step": 1848, "tokens_per_device": 4004 }, { "epoch": 0.7392, "loss_ce": 0.042507968842983246, "loss_lvr": 1.3175774812698364, "loss_mode_switch": 0.0, "loss_total": 0.17426571249961853, "step": 1848 }, { "batch_size": 4, "epoch": 0.7392, "step": 1848, "tokens_per_device": 5896 }, { "epoch": 0.7392, "loss_ce": 0.5016714930534363, "loss_lvr": 0.8735805749893188, "loss_mode_switch": 0.0, "loss_total": 0.5890295505523682, "step": 1848 }, { "batch_size": 1, "epoch": 0.7392, "step": 1848, "tokens_per_device": 6218 }, { "epoch": 0.7392, "loss_ce": 0.014989221468567848, "loss_lvr": 0.21789349615573883, "loss_mode_switch": 0.0, "loss_total": 0.03677856922149658, "step": 1848 }, { "batch_size": 4, "epoch": 0.7392, "step": 1848, "tokens_per_device": 3048 }, { "epoch": 0.7392, "loss_ce": 0.058449048548936844, "loss_lvr": 0.5661620497703552, "loss_mode_switch": 0.0, "loss_total": 0.11506525427103043, "step": 1848 }, { "batch_size": 4, "epoch": 0.7392, "step": 1848, "tokens_per_device": 6592 }, { "epoch": 0.7392, "loss_ce": 0.299862802028656, "loss_lvr": 0.498912513256073, "loss_mode_switch": 0.0, "loss_total": 0.34975406527519226, "step": 1848 }, { "batch_size": 4, "epoch": 0.7392, "step": 1848, "tokens_per_device": 2628 }, { "epoch": 0.7392, "loss_ce": 0.38580322265625, "loss_lvr": 0.8499568104743958, "loss_mode_switch": 0.0, "loss_total": 0.47079890966415405, "step": 1848 }, { "batch_size": 4, "epoch": 0.7392, "step": 1848, "tokens_per_device": 5708 }, { "epoch": 0.7392, "loss_ce": 0.18057863414287567, "loss_lvr": 0.8074691295623779, "loss_mode_switch": 0.0, "loss_total": 0.26132553815841675, "step": 1848 }, { "batch_size": 4, "epoch": 0.7392, "step": 1848, "tokens_per_device": 4264 }, { "epoch": 0.7392, "loss_ce": 0.04484907165169716, "loss_lvr": 1.0207496881484985, "loss_mode_switch": 0.0, "loss_total": 0.14692404866218567, "step": 1848 }, { "epoch": 0.7396, "grad_norm": 1.402109146118164, "learning_rate": 1.6752590681938768e-06, "loss": 0.3136, "step": 1849 }, { "batch_size": 4, "epoch": 0.7396, "step": 1849, "tokens_per_device": 1592 }, { "epoch": 0.7396, "loss_ce": 0.2850639522075653, "loss_lvr": 0.9616333246231079, "loss_mode_switch": 0.0, "loss_total": 0.3812272846698761, "step": 1849 }, { "batch_size": 4, "epoch": 0.7396, "step": 1849, "tokens_per_device": 11004 }, { "epoch": 0.7396, "loss_ce": 0.11529748886823654, "loss_lvr": 0.8413812518119812, "loss_mode_switch": 0.0, "loss_total": 0.19943562150001526, "step": 1849 }, { "batch_size": 4, "epoch": 0.7396, "step": 1849, "tokens_per_device": 5016 }, { "epoch": 0.7396, "loss_ce": 0.02753564901649952, "loss_lvr": 0.986047625541687, "loss_mode_switch": 0.0, "loss_total": 0.12614041566848755, "step": 1849 }, { "batch_size": 4, "epoch": 0.7396, "step": 1849, "tokens_per_device": 5972 }, { "epoch": 0.7396, "loss_ce": 0.3010247051715851, "loss_lvr": 1.0211678743362427, "loss_mode_switch": 0.0, "loss_total": 0.40314149856567383, "step": 1849 }, { "batch_size": 4, "epoch": 0.7396, "step": 1849, "tokens_per_device": 4440 }, { "epoch": 0.7396, "loss_ce": 0.4636279046535492, "loss_lvr": 0.7488815188407898, "loss_mode_switch": 0.0, "loss_total": 0.5385160446166992, "step": 1849 }, { "batch_size": 4, "epoch": 0.7396, "step": 1849, "tokens_per_device": 6116 }, { "epoch": 0.7396, "loss_ce": 0.05391573905944824, "loss_lvr": 0.7803230285644531, "loss_mode_switch": 0.0, "loss_total": 0.1319480538368225, "step": 1849 }, { "batch_size": 4, "epoch": 0.7396, "step": 1849, "tokens_per_device": 5576 }, { "epoch": 0.7396, "loss_ce": 0.12486233562231064, "loss_lvr": 0.872565507888794, "loss_mode_switch": 0.0, "loss_total": 0.21211889386177063, "step": 1849 }, { "batch_size": 4, "epoch": 0.7396, "step": 1849, "tokens_per_device": 4288 }, { "epoch": 0.7396, "loss_ce": 0.08301056921482086, "loss_lvr": 0.5981835126876831, "loss_mode_switch": 0.0, "loss_total": 0.14282892644405365, "step": 1849 }, { "epoch": 0.74, "grad_norm": 1.2327302694320679, "learning_rate": 1.6704238749415958e-06, "loss": 0.3129, "step": 1850 }, { "batch_size": 4, "epoch": 0.74, "step": 1850, "tokens_per_device": 1568 }, { "epoch": 0.74, "loss_ce": 0.023726968094706535, "loss_lvr": 1.225403904914856, "loss_mode_switch": 0.0, "loss_total": 0.1462673544883728, "step": 1850 }, { "batch_size": 4, "epoch": 0.74, "step": 1850, "tokens_per_device": 6940 }, { "epoch": 0.74, "loss_ce": 0.08888967335224152, "loss_lvr": 0.6775373816490173, "loss_mode_switch": 0.0, "loss_total": 0.15664342045783997, "step": 1850 }, { "batch_size": 4, "epoch": 0.74, "step": 1850, "tokens_per_device": 4716 }, { "epoch": 0.74, "loss_ce": 0.2241411805152893, "loss_lvr": 0.8241890668869019, "loss_mode_switch": 0.0, "loss_total": 0.30656009912490845, "step": 1850 }, { "batch_size": 4, "epoch": 0.74, "step": 1850, "tokens_per_device": 4704 }, { "epoch": 0.74, "loss_ce": 0.4798836410045624, "loss_lvr": 0.6645785570144653, "loss_mode_switch": 0.0, "loss_total": 0.5463414788246155, "step": 1850 }, { "batch_size": 4, "epoch": 0.74, "step": 1850, "tokens_per_device": 1960 }, { "epoch": 0.74, "loss_ce": 0.5585260987281799, "loss_lvr": 0.9122920036315918, "loss_mode_switch": 0.0, "loss_total": 0.6497552990913391, "step": 1850 }, { "batch_size": 4, "epoch": 0.74, "step": 1850, "tokens_per_device": 3968 }, { "epoch": 0.74, "loss_ce": 0.02012641541659832, "loss_lvr": 1.580039143562317, "loss_mode_switch": 0.0, "loss_total": 0.17813032865524292, "step": 1850 }, { "batch_size": 4, "epoch": 0.74, "step": 1850, "tokens_per_device": 3828 }, { "epoch": 0.74, "loss_ce": 0.054972048848867416, "loss_lvr": 0.8314878344535828, "loss_mode_switch": 0.0, "loss_total": 0.1381208300590515, "step": 1850 }, { "batch_size": 1, "epoch": 0.74, "step": 1850, "tokens_per_device": 4907 }, { "epoch": 0.74, "loss_ce": 0.15392373502254486, "loss_lvr": 0.386248916387558, "loss_mode_switch": 0.0, "loss_total": 0.19254863262176514, "step": 1850 }, { "epoch": 0.7404, "grad_norm": 1.4471591711044312, "learning_rate": 1.6655942698019001e-06, "loss": 0.3191, "step": 1851 }, { "batch_size": 4, "epoch": 0.7404, "step": 1851, "tokens_per_device": 4532 }, { "epoch": 0.7404, "loss_ce": 0.2203981876373291, "loss_lvr": 1.072114109992981, "loss_mode_switch": 0.0, "loss_total": 0.3276095986366272, "step": 1851 }, { "batch_size": 1, "epoch": 0.7404, "step": 1851, "tokens_per_device": 5097 }, { "epoch": 0.7404, "loss_ce": 0.00545716704800725, "loss_lvr": 0.9337339997291565, "loss_mode_switch": 0.0, "loss_total": 0.09883057326078415, "step": 1851 }, { "batch_size": 4, "epoch": 0.7404, "step": 1851, "tokens_per_device": 8728 }, { "epoch": 0.7404, "loss_ce": 0.10399164259433746, "loss_lvr": 0.6684296727180481, "loss_mode_switch": 0.0, "loss_total": 0.17083460092544556, "step": 1851 }, { "batch_size": 4, "epoch": 0.7404, "step": 1851, "tokens_per_device": 3744 }, { "epoch": 0.7404, "loss_ce": 0.0055013252422213554, "loss_lvr": 0.4757765233516693, "loss_mode_switch": 0.0, "loss_total": 0.05307897925376892, "step": 1851 }, { "batch_size": 4, "epoch": 0.7404, "step": 1851, "tokens_per_device": 4244 }, { "epoch": 0.7404, "loss_ce": 0.23878346383571625, "loss_lvr": 0.8282009959220886, "loss_mode_switch": 0.0, "loss_total": 0.32160356640815735, "step": 1851 }, { "batch_size": 1, "epoch": 0.7404, "step": 1851, "tokens_per_device": 4879 }, { "epoch": 0.7404, "loss_ce": 0.009337734431028366, "loss_lvr": 0.2269834578037262, "loss_mode_switch": 0.0, "loss_total": 0.032036080956459045, "step": 1851 }, { "batch_size": 4, "epoch": 0.7404, "step": 1851, "tokens_per_device": 4680 }, { "epoch": 0.7404, "loss_ce": 0.15103231370449066, "loss_lvr": 1.019625186920166, "loss_mode_switch": 0.0, "loss_total": 0.2529948353767395, "step": 1851 }, { "batch_size": 4, "epoch": 0.7404, "step": 1851, "tokens_per_device": 15056 }, { "epoch": 0.7404, "loss_ce": 0.06749562919139862, "loss_lvr": 0.583773672580719, "loss_mode_switch": 0.0, "loss_total": 0.12587299942970276, "step": 1851 }, { "epoch": 0.7408, "grad_norm": 1.245722770690918, "learning_rate": 1.6607702608804416e-06, "loss": 0.2536, "step": 1852 }, { "batch_size": 4, "epoch": 0.7408, "step": 1852, "tokens_per_device": 5568 }, { "epoch": 0.7408, "loss_ce": 0.0208289697766304, "loss_lvr": 0.9762184619903564, "loss_mode_switch": 0.0, "loss_total": 0.1184508204460144, "step": 1852 }, { "batch_size": 4, "epoch": 0.7408, "step": 1852, "tokens_per_device": 5708 }, { "epoch": 0.7408, "loss_ce": 0.28874969482421875, "loss_lvr": 0.9435761570930481, "loss_mode_switch": 0.0, "loss_total": 0.3831073045730591, "step": 1852 }, { "batch_size": 4, "epoch": 0.7408, "step": 1852, "tokens_per_device": 4244 }, { "epoch": 0.7408, "loss_ce": 0.3351110517978668, "loss_lvr": 0.7567009925842285, "loss_mode_switch": 0.0, "loss_total": 0.4107811450958252, "step": 1852 }, { "batch_size": 4, "epoch": 0.7408, "step": 1852, "tokens_per_device": 3508 }, { "epoch": 0.7408, "loss_ce": 0.04491836950182915, "loss_lvr": 0.7101025581359863, "loss_mode_switch": 0.0, "loss_total": 0.11592862010002136, "step": 1852 }, { "batch_size": 1, "epoch": 0.7408, "step": 1852, "tokens_per_device": 5015 }, { "epoch": 0.7408, "loss_ce": 0.0022039723116904497, "loss_lvr": 0.23788096010684967, "loss_mode_switch": 0.0, "loss_total": 0.025992069393396378, "step": 1852 }, { "batch_size": 4, "epoch": 0.7408, "step": 1852, "tokens_per_device": 11868 }, { "epoch": 0.7408, "loss_ce": 1.0940910577774048, "loss_lvr": 0.669937014579773, "loss_mode_switch": 0.0, "loss_total": 1.161084771156311, "step": 1852 }, { "batch_size": 4, "epoch": 0.7408, "step": 1852, "tokens_per_device": 4112 }, { "epoch": 0.7408, "loss_ce": 0.05623628944158554, "loss_lvr": 0.8145257830619812, "loss_mode_switch": 0.0, "loss_total": 0.13768887519836426, "step": 1852 }, { "batch_size": 4, "epoch": 0.7408, "step": 1852, "tokens_per_device": 3808 }, { "epoch": 0.7408, "loss_ce": 0.18383930623531342, "loss_lvr": 1.2216464281082153, "loss_mode_switch": 0.0, "loss_total": 0.30600395798683167, "step": 1852 }, { "epoch": 0.7412, "grad_norm": 1.2945085763931274, "learning_rate": 1.6559518562734777e-06, "loss": 0.3111, "step": 1853 }, { "batch_size": 4, "epoch": 0.7412, "step": 1853, "tokens_per_device": 4612 }, { "epoch": 0.7412, "loss_ce": 0.19348198175430298, "loss_lvr": 0.7839062213897705, "loss_mode_switch": 0.0, "loss_total": 0.2718726098537445, "step": 1853 }, { "batch_size": 4, "epoch": 0.7412, "step": 1853, "tokens_per_device": 4356 }, { "epoch": 0.7412, "loss_ce": 0.18985044956207275, "loss_lvr": 1.0248572826385498, "loss_mode_switch": 0.0, "loss_total": 0.2923361659049988, "step": 1853 }, { "batch_size": 4, "epoch": 0.7412, "step": 1853, "tokens_per_device": 1396 }, { "epoch": 0.7412, "loss_ce": 0.5733020901679993, "loss_lvr": 1.0730291604995728, "loss_mode_switch": 0.0, "loss_total": 0.6806049942970276, "step": 1853 }, { "batch_size": 4, "epoch": 0.7412, "step": 1853, "tokens_per_device": 1364 }, { "epoch": 0.7412, "loss_ce": 0.9526112079620361, "loss_lvr": 0.9806397557258606, "loss_mode_switch": 0.0, "loss_total": 1.0506751537322998, "step": 1853 }, { "batch_size": 4, "epoch": 0.7412, "step": 1853, "tokens_per_device": 4160 }, { "epoch": 0.7412, "loss_ce": 0.28197476267814636, "loss_lvr": 0.7958869934082031, "loss_mode_switch": 0.0, "loss_total": 0.36156347393989563, "step": 1853 }, { "batch_size": 1, "epoch": 0.7412, "step": 1853, "tokens_per_device": 4940 }, { "epoch": 0.7412, "loss_ce": 0.0309382863342762, "loss_lvr": 0.31246286630630493, "loss_mode_switch": 0.0, "loss_total": 0.06218457221984863, "step": 1853 }, { "batch_size": 4, "epoch": 0.7412, "step": 1853, "tokens_per_device": 3044 }, { "epoch": 0.7412, "loss_ce": 0.24285024404525757, "loss_lvr": 0.6018856167793274, "loss_mode_switch": 0.0, "loss_total": 0.3030388057231903, "step": 1853 }, { "batch_size": 4, "epoch": 0.7412, "step": 1853, "tokens_per_device": 3372 }, { "epoch": 0.7412, "loss_ce": 0.23039990663528442, "loss_lvr": 1.0929251909255981, "loss_mode_switch": 0.0, "loss_total": 0.3396924138069153, "step": 1853 }, { "epoch": 0.7416, "grad_norm": 1.3836134672164917, "learning_rate": 1.6511390640678592e-06, "loss": 0.2903, "step": 1854 }, { "batch_size": 4, "epoch": 0.7416, "step": 1854, "tokens_per_device": 10384 }, { "epoch": 0.7416, "loss_ce": 0.22126908600330353, "loss_lvr": 0.7408779859542847, "loss_mode_switch": 0.0, "loss_total": 0.2953568696975708, "step": 1854 }, { "batch_size": 1, "epoch": 0.7416, "step": 1854, "tokens_per_device": 5017 }, { "epoch": 0.7416, "loss_ce": 0.013204795308411121, "loss_lvr": 0.2984650731086731, "loss_mode_switch": 0.0, "loss_total": 0.043051302433013916, "step": 1854 }, { "batch_size": 1, "epoch": 0.7416, "step": 1854, "tokens_per_device": 4814 }, { "epoch": 0.7416, "loss_ce": 0.001286149024963379, "loss_lvr": 0.19603197276592255, "loss_mode_switch": 0.0, "loss_total": 0.020889347419142723, "step": 1854 }, { "batch_size": 4, "epoch": 0.7416, "step": 1854, "tokens_per_device": 4240 }, { "epoch": 0.7416, "loss_ce": 0.1630067378282547, "loss_lvr": 0.6372902989387512, "loss_mode_switch": 0.0, "loss_total": 0.22673577070236206, "step": 1854 }, { "batch_size": 1, "epoch": 0.7416, "step": 1854, "tokens_per_device": 4901 }, { "epoch": 0.7416, "loss_ce": 0.005562239792197943, "loss_lvr": 0.6434444189071655, "loss_mode_switch": 0.0, "loss_total": 0.06990668177604675, "step": 1854 }, { "batch_size": 1, "epoch": 0.7416, "step": 1854, "tokens_per_device": 5288 }, { "epoch": 0.7416, "loss_ce": 0.06049926206469536, "loss_lvr": 0.2869495749473572, "loss_mode_switch": 0.0, "loss_total": 0.08919422328472137, "step": 1854 }, { "batch_size": 4, "epoch": 0.7416, "step": 1854, "tokens_per_device": 4188 }, { "epoch": 0.7416, "loss_ce": 0.013561555184423923, "loss_lvr": 0.8700975775718689, "loss_mode_switch": 0.0, "loss_total": 0.10057131201028824, "step": 1854 }, { "batch_size": 4, "epoch": 0.7416, "step": 1854, "tokens_per_device": 4840 }, { "epoch": 0.7416, "loss_ce": 0.0756504237651825, "loss_lvr": 0.630335807800293, "loss_mode_switch": 0.0, "loss_total": 0.1386840045452118, "step": 1854 }, { "epoch": 0.742, "grad_norm": 1.324222207069397, "learning_rate": 1.6463318923410183e-06, "loss": 0.2909, "step": 1855 }, { "batch_size": 1, "epoch": 0.742, "step": 1855, "tokens_per_device": 5075 }, { "epoch": 0.742, "loss_ce": 0.004039580933749676, "loss_lvr": 0.3134017586708069, "loss_mode_switch": 0.0, "loss_total": 0.03537975996732712, "step": 1855 }, { "batch_size": 4, "epoch": 0.742, "step": 1855, "tokens_per_device": 2584 }, { "epoch": 0.742, "loss_ce": 0.06367836147546768, "loss_lvr": 0.8245711326599121, "loss_mode_switch": 0.0, "loss_total": 0.14613547921180725, "step": 1855 }, { "batch_size": 4, "epoch": 0.742, "step": 1855, "tokens_per_device": 3936 }, { "epoch": 0.742, "loss_ce": 0.30723321437835693, "loss_lvr": 1.49875807762146, "loss_mode_switch": 0.0, "loss_total": 0.4571090340614319, "step": 1855 }, { "batch_size": 1, "epoch": 0.742, "step": 1855, "tokens_per_device": 4917 }, { "epoch": 0.742, "loss_ce": 0.07499868422746658, "loss_lvr": 0.7576331496238708, "loss_mode_switch": 0.0, "loss_total": 0.15076199173927307, "step": 1855 }, { "batch_size": 4, "epoch": 0.742, "step": 1855, "tokens_per_device": 2676 }, { "epoch": 0.742, "loss_ce": 0.16999708116054535, "loss_lvr": 0.7561261057853699, "loss_mode_switch": 0.0, "loss_total": 0.24560970067977905, "step": 1855 }, { "batch_size": 4, "epoch": 0.742, "step": 1855, "tokens_per_device": 4376 }, { "epoch": 0.742, "loss_ce": 0.15889886021614075, "loss_lvr": 0.8893527388572693, "loss_mode_switch": 0.0, "loss_total": 0.24783414602279663, "step": 1855 }, { "batch_size": 4, "epoch": 0.742, "step": 1855, "tokens_per_device": 2852 }, { "epoch": 0.742, "loss_ce": 0.10660223662853241, "loss_lvr": 0.7772538661956787, "loss_mode_switch": 0.0, "loss_total": 0.184327632188797, "step": 1855 }, { "batch_size": 1, "epoch": 0.742, "step": 1855, "tokens_per_device": 4866 }, { "epoch": 0.742, "loss_ce": 0.015181516297161579, "loss_lvr": 0.261735200881958, "loss_mode_switch": 0.0, "loss_total": 0.041355036199092865, "step": 1855 }, { "epoch": 0.7424, "grad_norm": 1.1669667959213257, "learning_rate": 1.6415303491609519e-06, "loss": 0.2896, "step": 1856 }, { "batch_size": 4, "epoch": 0.7424, "step": 1856, "tokens_per_device": 1636 }, { "epoch": 0.7424, "loss_ce": 0.17962683737277985, "loss_lvr": 0.8621165156364441, "loss_mode_switch": 0.0, "loss_total": 0.26583850383758545, "step": 1856 }, { "batch_size": 4, "epoch": 0.7424, "step": 1856, "tokens_per_device": 4368 }, { "epoch": 0.7424, "loss_ce": 0.37898796796798706, "loss_lvr": 0.8238105177879333, "loss_mode_switch": 0.0, "loss_total": 0.46136903762817383, "step": 1856 }, { "batch_size": 4, "epoch": 0.7424, "step": 1856, "tokens_per_device": 1436 }, { "epoch": 0.7424, "loss_ce": 0.7943920493125916, "loss_lvr": 1.0258525609970093, "loss_mode_switch": 0.0, "loss_total": 0.8969773054122925, "step": 1856 }, { "batch_size": 4, "epoch": 0.7424, "step": 1856, "tokens_per_device": 4764 }, { "epoch": 0.7424, "loss_ce": 0.15659822523593903, "loss_lvr": 0.8958878517150879, "loss_mode_switch": 0.0, "loss_total": 0.2461870014667511, "step": 1856 }, { "batch_size": 4, "epoch": 0.7424, "step": 1856, "tokens_per_device": 13296 }, { "epoch": 0.7424, "loss_ce": 0.04927543178200722, "loss_lvr": 0.5148237347602844, "loss_mode_switch": 0.0, "loss_total": 0.10075780749320984, "step": 1856 }, { "batch_size": 1, "epoch": 0.7424, "step": 1856, "tokens_per_device": 5122 }, { "epoch": 0.7424, "loss_ce": 0.00046660538646392524, "loss_lvr": 0.38255977630615234, "loss_mode_switch": 0.0, "loss_total": 0.038722582161426544, "step": 1856 }, { "batch_size": 1, "epoch": 0.7424, "step": 1856, "tokens_per_device": 5091 }, { "epoch": 0.7424, "loss_ce": 0.014303433708846569, "loss_lvr": 0.4643267095088959, "loss_mode_switch": 0.0, "loss_total": 0.06073610484600067, "step": 1856 }, { "batch_size": 1, "epoch": 0.7424, "step": 1856, "tokens_per_device": 4675 }, { "epoch": 0.7424, "loss_ce": 0.22097419202327728, "loss_lvr": 0.39410269260406494, "loss_mode_switch": 0.0, "loss_total": 0.2603844702243805, "step": 1856 }, { "epoch": 0.7428, "grad_norm": 1.290130853652954, "learning_rate": 1.6367344425862136e-06, "loss": 0.303, "step": 1857 }, { "batch_size": 4, "epoch": 0.7428, "step": 1857, "tokens_per_device": 4328 }, { "epoch": 0.7428, "loss_ce": 0.4423966109752655, "loss_lvr": 0.9824250936508179, "loss_mode_switch": 0.0, "loss_total": 0.5406391024589539, "step": 1857 }, { "batch_size": 1, "epoch": 0.7428, "step": 1857, "tokens_per_device": 5124 }, { "epoch": 0.7428, "loss_ce": 0.04193990305066109, "loss_lvr": 0.34474608302116394, "loss_mode_switch": 0.0, "loss_total": 0.07641451060771942, "step": 1857 }, { "batch_size": 4, "epoch": 0.7428, "step": 1857, "tokens_per_device": 2244 }, { "epoch": 0.7428, "loss_ce": 0.28723305463790894, "loss_lvr": 1.0060811042785645, "loss_mode_switch": 0.0, "loss_total": 0.3878411650657654, "step": 1857 }, { "batch_size": 4, "epoch": 0.7428, "step": 1857, "tokens_per_device": 2896 }, { "epoch": 0.7428, "loss_ce": 0.18495042622089386, "loss_lvr": 0.922303318977356, "loss_mode_switch": 0.0, "loss_total": 0.2771807610988617, "step": 1857 }, { "batch_size": 4, "epoch": 0.7428, "step": 1857, "tokens_per_device": 4200 }, { "epoch": 0.7428, "loss_ce": 0.07394880056381226, "loss_lvr": 1.2231172323226929, "loss_mode_switch": 0.0, "loss_total": 0.19626052677631378, "step": 1857 }, { "batch_size": 4, "epoch": 0.7428, "step": 1857, "tokens_per_device": 1828 }, { "epoch": 0.7428, "loss_ce": 0.052580833435058594, "loss_lvr": 1.338823676109314, "loss_mode_switch": 0.0, "loss_total": 0.18646320700645447, "step": 1857 }, { "batch_size": 4, "epoch": 0.7428, "step": 1857, "tokens_per_device": 4724 }, { "epoch": 0.7428, "loss_ce": 0.03116670995950699, "loss_lvr": 1.3193308115005493, "loss_mode_switch": 0.0, "loss_total": 0.16309979557991028, "step": 1857 }, { "batch_size": 4, "epoch": 0.7428, "step": 1857, "tokens_per_device": 1324 }, { "epoch": 0.7428, "loss_ce": 0.2827904522418976, "loss_lvr": 0.9527725577354431, "loss_mode_switch": 0.0, "loss_total": 0.3780677020549774, "step": 1857 }, { "epoch": 0.7432, "grad_norm": 1.2113900184631348, "learning_rate": 1.6319441806658987e-06, "loss": 0.2793, "step": 1858 }, { "batch_size": 1, "epoch": 0.7432, "step": 1858, "tokens_per_device": 5234 }, { "epoch": 0.7432, "loss_ce": 0.4237096905708313, "loss_lvr": 0.46221476793289185, "loss_mode_switch": 0.0, "loss_total": 0.46993115544319153, "step": 1858 }, { "batch_size": 4, "epoch": 0.7432, "step": 1858, "tokens_per_device": 4596 }, { "epoch": 0.7432, "loss_ce": 0.17026767134666443, "loss_lvr": 0.7331207990646362, "loss_mode_switch": 0.0, "loss_total": 0.24357974529266357, "step": 1858 }, { "batch_size": 4, "epoch": 0.7432, "step": 1858, "tokens_per_device": 5020 }, { "epoch": 0.7432, "loss_ce": 0.0886664092540741, "loss_lvr": 0.7728898525238037, "loss_mode_switch": 0.0, "loss_total": 0.16595539450645447, "step": 1858 }, { "batch_size": 1, "epoch": 0.7432, "step": 1858, "tokens_per_device": 5043 }, { "epoch": 0.7432, "loss_ce": 0.01368620153516531, "loss_lvr": 0.41235625743865967, "loss_mode_switch": 0.0, "loss_total": 0.05492182821035385, "step": 1858 }, { "batch_size": 4, "epoch": 0.7432, "step": 1858, "tokens_per_device": 4216 }, { "epoch": 0.7432, "loss_ce": 0.2684132158756256, "loss_lvr": 0.8676494359970093, "loss_mode_switch": 0.0, "loss_total": 0.35517817735671997, "step": 1858 }, { "batch_size": 4, "epoch": 0.7432, "step": 1858, "tokens_per_device": 6080 }, { "epoch": 0.7432, "loss_ce": 0.0644344612956047, "loss_lvr": 0.7758837938308716, "loss_mode_switch": 0.0, "loss_total": 0.14202284812927246, "step": 1858 }, { "batch_size": 1, "epoch": 0.7432, "step": 1858, "tokens_per_device": 5046 }, { "epoch": 0.7432, "loss_ce": 0.0038491867017000914, "loss_lvr": 0.29010993242263794, "loss_mode_switch": 0.0, "loss_total": 0.03286018222570419, "step": 1858 }, { "batch_size": 4, "epoch": 0.7432, "step": 1858, "tokens_per_device": 1256 }, { "epoch": 0.7432, "loss_ce": 0.0928938165307045, "loss_lvr": 1.11006498336792, "loss_mode_switch": 0.0, "loss_total": 0.2039003074169159, "step": 1858 }, { "epoch": 0.7436, "grad_norm": 1.2459369897842407, "learning_rate": 1.6271595714396233e-06, "loss": 0.3039, "step": 1859 }, { "batch_size": 4, "epoch": 0.7436, "step": 1859, "tokens_per_device": 3752 }, { "epoch": 0.7436, "loss_ce": 0.11539370566606522, "loss_lvr": 0.8221073746681213, "loss_mode_switch": 0.0, "loss_total": 0.1976044476032257, "step": 1859 }, { "batch_size": 4, "epoch": 0.7436, "step": 1859, "tokens_per_device": 5452 }, { "epoch": 0.7436, "loss_ce": 0.15026386082172394, "loss_lvr": 0.7810691595077515, "loss_mode_switch": 0.0, "loss_total": 0.2283707857131958, "step": 1859 }, { "batch_size": 1, "epoch": 0.7436, "step": 1859, "tokens_per_device": 4885 }, { "epoch": 0.7436, "loss_ce": 0.004018652252852917, "loss_lvr": 0.25143200159072876, "loss_mode_switch": 0.0, "loss_total": 0.029161851853132248, "step": 1859 }, { "batch_size": 1, "epoch": 0.7436, "step": 1859, "tokens_per_device": 5470 }, { "epoch": 0.7436, "loss_ce": 0.019424835219979286, "loss_lvr": 0.3339020311832428, "loss_mode_switch": 0.0, "loss_total": 0.052815042436122894, "step": 1859 }, { "batch_size": 1, "epoch": 0.7436, "step": 1859, "tokens_per_device": 7277 }, { "epoch": 0.7436, "loss_ce": 0.001364428666420281, "loss_lvr": 0.26771754026412964, "loss_mode_switch": 0.0, "loss_total": 0.028136182576417923, "step": 1859 }, { "batch_size": 4, "epoch": 0.7436, "step": 1859, "tokens_per_device": 4772 }, { "epoch": 0.7436, "loss_ce": 0.16922126710414886, "loss_lvr": 0.8101467490196228, "loss_mode_switch": 0.0, "loss_total": 0.2502359449863434, "step": 1859 }, { "batch_size": 1, "epoch": 0.7436, "step": 1859, "tokens_per_device": 4931 }, { "epoch": 0.7436, "loss_ce": 0.0007621294353157282, "loss_lvr": 0.34034663438796997, "loss_mode_switch": 0.0, "loss_total": 0.03479679673910141, "step": 1859 }, { "batch_size": 4, "epoch": 0.7436, "step": 1859, "tokens_per_device": 1548 }, { "epoch": 0.7436, "loss_ce": 0.6807897686958313, "loss_lvr": 1.01093590259552, "loss_mode_switch": 0.0, "loss_total": 0.7818833589553833, "step": 1859 }, { "epoch": 0.744, "grad_norm": 1.2667553424835205, "learning_rate": 1.6223806229375182e-06, "loss": 0.2938, "step": 1860 }, { "batch_size": 4, "epoch": 0.744, "step": 1860, "tokens_per_device": 2696 }, { "epoch": 0.744, "loss_ce": 0.1366872936487198, "loss_lvr": 0.7249460220336914, "loss_mode_switch": 0.0, "loss_total": 0.20918190479278564, "step": 1860 }, { "batch_size": 1, "epoch": 0.744, "step": 1860, "tokens_per_device": 4120 }, { "epoch": 0.744, "loss_ce": 0.013967243954539299, "loss_lvr": 0.32499417662620544, "loss_mode_switch": 0.0, "loss_total": 0.04646666347980499, "step": 1860 }, { "batch_size": 4, "epoch": 0.744, "step": 1860, "tokens_per_device": 3976 }, { "epoch": 0.744, "loss_ce": 0.04494169354438782, "loss_lvr": 2.109950542449951, "loss_mode_switch": 0.0, "loss_total": 0.25593674182891846, "step": 1860 }, { "batch_size": 4, "epoch": 0.744, "step": 1860, "tokens_per_device": 4280 }, { "epoch": 0.744, "loss_ce": 0.3615057170391083, "loss_lvr": 0.8612669110298157, "loss_mode_switch": 0.0, "loss_total": 0.44763240218162537, "step": 1860 }, { "batch_size": 4, "epoch": 0.744, "step": 1860, "tokens_per_device": 4148 }, { "epoch": 0.744, "loss_ce": 0.1266000121831894, "loss_lvr": 0.8431879281997681, "loss_mode_switch": 0.0, "loss_total": 0.21091881394386292, "step": 1860 }, { "batch_size": 4, "epoch": 0.744, "step": 1860, "tokens_per_device": 4216 }, { "epoch": 0.744, "loss_ce": 0.2308879792690277, "loss_lvr": 0.8781306743621826, "loss_mode_switch": 0.0, "loss_total": 0.3187010586261749, "step": 1860 }, { "batch_size": 4, "epoch": 0.744, "step": 1860, "tokens_per_device": 4692 }, { "epoch": 0.744, "loss_ce": 0.20348918437957764, "loss_lvr": 1.0381176471710205, "loss_mode_switch": 0.0, "loss_total": 0.30730095505714417, "step": 1860 }, { "batch_size": 4, "epoch": 0.744, "step": 1860, "tokens_per_device": 3232 }, { "epoch": 0.744, "loss_ce": 0.02371242083609104, "loss_lvr": 0.8256447911262512, "loss_mode_switch": 0.0, "loss_total": 0.10627689957618713, "step": 1860 }, { "epoch": 0.7444, "grad_norm": 1.4748010635375977, "learning_rate": 1.6176073431802158e-06, "loss": 0.2843, "step": 1861 }, { "batch_size": 1, "epoch": 0.7444, "step": 1861, "tokens_per_device": 4881 }, { "epoch": 0.7444, "loss_ce": 0.12408718466758728, "loss_lvr": 0.2139623761177063, "loss_mode_switch": 0.0, "loss_total": 0.14548341929912567, "step": 1861 }, { "batch_size": 4, "epoch": 0.7444, "step": 1861, "tokens_per_device": 2644 }, { "epoch": 0.7444, "loss_ce": 0.21437019109725952, "loss_lvr": 0.7542203664779663, "loss_mode_switch": 0.0, "loss_total": 0.2897922396659851, "step": 1861 }, { "batch_size": 4, "epoch": 0.7444, "step": 1861, "tokens_per_device": 8740 }, { "epoch": 0.7444, "loss_ce": 0.2054128348827362, "loss_lvr": 0.6649953126907349, "loss_mode_switch": 0.0, "loss_total": 0.2719123661518097, "step": 1861 }, { "batch_size": 1, "epoch": 0.7444, "step": 1861, "tokens_per_device": 4895 }, { "epoch": 0.7444, "loss_ce": 0.035663917660713196, "loss_lvr": 0.6977424621582031, "loss_mode_switch": 0.0, "loss_total": 0.10543816536664963, "step": 1861 }, { "batch_size": 4, "epoch": 0.7444, "step": 1861, "tokens_per_device": 2076 }, { "epoch": 0.7444, "loss_ce": 0.10171087831258774, "loss_lvr": 0.7951757311820984, "loss_mode_switch": 0.0, "loss_total": 0.18122845888137817, "step": 1861 }, { "batch_size": 4, "epoch": 0.7444, "step": 1861, "tokens_per_device": 3788 }, { "epoch": 0.7444, "loss_ce": 0.4679061472415924, "loss_lvr": 0.8920077085494995, "loss_mode_switch": 0.0, "loss_total": 0.5571069121360779, "step": 1861 }, { "batch_size": 4, "epoch": 0.7444, "step": 1861, "tokens_per_device": 4376 }, { "epoch": 0.7444, "loss_ce": 0.6580805778503418, "loss_lvr": 0.470593124628067, "loss_mode_switch": 0.0, "loss_total": 0.7051398754119873, "step": 1861 }, { "batch_size": 4, "epoch": 0.7444, "step": 1861, "tokens_per_device": 15872 }, { "epoch": 0.7444, "loss_ce": 0.36728936433792114, "loss_lvr": 0.6324158906936646, "loss_mode_switch": 0.0, "loss_total": 0.43053096532821655, "step": 1861 }, { "epoch": 0.7448, "grad_norm": 1.3891135454177856, "learning_rate": 1.6128397401788353e-06, "loss": 0.2849, "step": 1862 }, { "batch_size": 4, "epoch": 0.7448, "step": 1862, "tokens_per_device": 7036 }, { "epoch": 0.7448, "loss_ce": 0.12206950038671494, "loss_lvr": 0.6622291803359985, "loss_mode_switch": 0.0, "loss_total": 0.18829241394996643, "step": 1862 }, { "batch_size": 1, "epoch": 0.7448, "step": 1862, "tokens_per_device": 4860 }, { "epoch": 0.7448, "loss_ce": 0.12429428845643997, "loss_lvr": 0.3866053819656372, "loss_mode_switch": 0.0, "loss_total": 0.16295482218265533, "step": 1862 }, { "batch_size": 1, "epoch": 0.7448, "step": 1862, "tokens_per_device": 5948 }, { "epoch": 0.7448, "loss_ce": 0.00866860244423151, "loss_lvr": 0.5549395680427551, "loss_mode_switch": 0.0, "loss_total": 0.06416255980730057, "step": 1862 }, { "batch_size": 4, "epoch": 0.7448, "step": 1862, "tokens_per_device": 4352 }, { "epoch": 0.7448, "loss_ce": 0.2930753827095032, "loss_lvr": 0.7095418572425842, "loss_mode_switch": 0.0, "loss_total": 0.36402958631515503, "step": 1862 }, { "batch_size": 4, "epoch": 0.7448, "step": 1862, "tokens_per_device": 5552 }, { "epoch": 0.7448, "loss_ce": 0.03909177705645561, "loss_lvr": 0.6624442338943481, "loss_mode_switch": 0.0, "loss_total": 0.10533620417118073, "step": 1862 }, { "batch_size": 4, "epoch": 0.7448, "step": 1862, "tokens_per_device": 8192 }, { "epoch": 0.7448, "loss_ce": 0.009228399954736233, "loss_lvr": 0.8534057140350342, "loss_mode_switch": 0.0, "loss_total": 0.09456897526979446, "step": 1862 }, { "batch_size": 1, "epoch": 0.7448, "step": 1862, "tokens_per_device": 4985 }, { "epoch": 0.7448, "loss_ce": 0.06864148378372192, "loss_lvr": 0.3731380105018616, "loss_mode_switch": 0.0, "loss_total": 0.10595528781414032, "step": 1862 }, { "batch_size": 1, "epoch": 0.7448, "step": 1862, "tokens_per_device": 4947 }, { "epoch": 0.7448, "loss_ce": 0.5120745897293091, "loss_lvr": 0.29238441586494446, "loss_mode_switch": 0.0, "loss_total": 0.5413130521774292, "step": 1862 }, { "epoch": 0.7452, "grad_norm": 1.3139152526855469, "learning_rate": 1.6080778219349652e-06, "loss": 0.2558, "step": 1863 }, { "batch_size": 4, "epoch": 0.7452, "step": 1863, "tokens_per_device": 5788 }, { "epoch": 0.7452, "loss_ce": 0.6046821475028992, "loss_lvr": 0.8497866988182068, "loss_mode_switch": 0.0, "loss_total": 0.6896607875823975, "step": 1863 }, { "batch_size": 1, "epoch": 0.7452, "step": 1863, "tokens_per_device": 5061 }, { "epoch": 0.7452, "loss_ce": 0.0033533237874507904, "loss_lvr": 1.209858775138855, "loss_mode_switch": 0.0, "loss_total": 0.12433920800685883, "step": 1863 }, { "batch_size": 4, "epoch": 0.7452, "step": 1863, "tokens_per_device": 4200 }, { "epoch": 0.7452, "loss_ce": 0.0676734670996666, "loss_lvr": 1.1684482097625732, "loss_mode_switch": 0.0, "loss_total": 0.18451829254627228, "step": 1863 }, { "batch_size": 4, "epoch": 0.7452, "step": 1863, "tokens_per_device": 5016 }, { "epoch": 0.7452, "loss_ce": 0.1563582569360733, "loss_lvr": 0.8840056657791138, "loss_mode_switch": 0.0, "loss_total": 0.24475881457328796, "step": 1863 }, { "batch_size": 4, "epoch": 0.7452, "step": 1863, "tokens_per_device": 12244 }, { "epoch": 0.7452, "loss_ce": 0.08817236870527267, "loss_lvr": 0.4991914629936218, "loss_mode_switch": 0.0, "loss_total": 0.13809151947498322, "step": 1863 }, { "batch_size": 1, "epoch": 0.7452, "step": 1863, "tokens_per_device": 4718 }, { "epoch": 0.7452, "loss_ce": 0.02159987948834896, "loss_lvr": 0.9126019477844238, "loss_mode_switch": 0.0, "loss_total": 0.11286007612943649, "step": 1863 }, { "batch_size": 4, "epoch": 0.7452, "step": 1863, "tokens_per_device": 7308 }, { "epoch": 0.7452, "loss_ce": 0.026550795882940292, "loss_lvr": 0.7244541645050049, "loss_mode_switch": 0.0, "loss_total": 0.09899620711803436, "step": 1863 }, { "batch_size": 4, "epoch": 0.7452, "step": 1863, "tokens_per_device": 1184 }, { "epoch": 0.7452, "loss_ce": 0.14827261865139008, "loss_lvr": 1.2972553968429565, "loss_mode_switch": 0.0, "loss_total": 0.277998149394989, "step": 1863 }, { "epoch": 0.7456, "grad_norm": 1.1969083547592163, "learning_rate": 1.6033215964406534e-06, "loss": 0.2524, "step": 1864 }, { "batch_size": 4, "epoch": 0.7456, "step": 1864, "tokens_per_device": 5968 }, { "epoch": 0.7456, "loss_ce": 0.17848387360572815, "loss_lvr": 0.6858929395675659, "loss_mode_switch": 0.0, "loss_total": 0.24707317352294922, "step": 1864 }, { "batch_size": 4, "epoch": 0.7456, "step": 1864, "tokens_per_device": 4260 }, { "epoch": 0.7456, "loss_ce": 0.4272679388523102, "loss_lvr": 1.0104564428329468, "loss_mode_switch": 0.0, "loss_total": 0.5283135771751404, "step": 1864 }, { "batch_size": 1, "epoch": 0.7456, "step": 1864, "tokens_per_device": 5063 }, { "epoch": 0.7456, "loss_ce": 0.2605477273464203, "loss_lvr": 0.5462160706520081, "loss_mode_switch": 0.0, "loss_total": 0.3151693344116211, "step": 1864 }, { "batch_size": 4, "epoch": 0.7456, "step": 1864, "tokens_per_device": 3788 }, { "epoch": 0.7456, "loss_ce": 0.6716177463531494, "loss_lvr": 0.972164511680603, "loss_mode_switch": 0.0, "loss_total": 0.7688341736793518, "step": 1864 }, { "batch_size": 1, "epoch": 0.7456, "step": 1864, "tokens_per_device": 5125 }, { "epoch": 0.7456, "loss_ce": 0.03800561651587486, "loss_lvr": 0.4791560471057892, "loss_mode_switch": 0.0, "loss_total": 0.08592122048139572, "step": 1864 }, { "batch_size": 4, "epoch": 0.7456, "step": 1864, "tokens_per_device": 4732 }, { "epoch": 0.7456, "loss_ce": 0.055412594228982925, "loss_lvr": 0.8835224509239197, "loss_mode_switch": 0.0, "loss_total": 0.14376483857631683, "step": 1864 }, { "batch_size": 1, "epoch": 0.7456, "step": 1864, "tokens_per_device": 5388 }, { "epoch": 0.7456, "loss_ce": 0.0836348906159401, "loss_lvr": 0.37517648935317993, "loss_mode_switch": 0.0, "loss_total": 0.12115253508090973, "step": 1864 }, { "batch_size": 1, "epoch": 0.7456, "step": 1864, "tokens_per_device": 4818 }, { "epoch": 0.7456, "loss_ce": 0.04049612581729889, "loss_lvr": 0.5279448628425598, "loss_mode_switch": 0.0, "loss_total": 0.09329061210155487, "step": 1864 }, { "epoch": 0.746, "grad_norm": 1.442518711090088, "learning_rate": 1.5985710716783936e-06, "loss": 0.3022, "step": 1865 }, { "batch_size": 1, "epoch": 0.746, "step": 1865, "tokens_per_device": 4885 }, { "epoch": 0.746, "loss_ce": 0.03387625142931938, "loss_lvr": 0.2994628846645355, "loss_mode_switch": 0.0, "loss_total": 0.06382253766059875, "step": 1865 }, { "batch_size": 1, "epoch": 0.746, "step": 1865, "tokens_per_device": 7572 }, { "epoch": 0.746, "loss_ce": 0.0007992471219040453, "loss_lvr": 0.2567117512226105, "loss_mode_switch": 0.0, "loss_total": 0.026470422744750977, "step": 1865 }, { "batch_size": 1, "epoch": 0.746, "step": 1865, "tokens_per_device": 5176 }, { "epoch": 0.746, "loss_ce": 0.0008562027360312641, "loss_lvr": 0.44518589973449707, "loss_mode_switch": 0.0, "loss_total": 0.0453747920691967, "step": 1865 }, { "batch_size": 4, "epoch": 0.746, "step": 1865, "tokens_per_device": 2668 }, { "epoch": 0.746, "loss_ce": 0.23670390248298645, "loss_lvr": 0.8910744190216064, "loss_mode_switch": 0.0, "loss_total": 0.32581135630607605, "step": 1865 }, { "batch_size": 4, "epoch": 0.746, "step": 1865, "tokens_per_device": 4164 }, { "epoch": 0.746, "loss_ce": 0.4797059893608093, "loss_lvr": 1.09091055393219, "loss_mode_switch": 0.0, "loss_total": 0.5887970328330994, "step": 1865 }, { "batch_size": 1, "epoch": 0.746, "step": 1865, "tokens_per_device": 5048 }, { "epoch": 0.746, "loss_ce": 0.14506477117538452, "loss_lvr": 0.3118838667869568, "loss_mode_switch": 0.0, "loss_total": 0.17625315487384796, "step": 1865 }, { "batch_size": 4, "epoch": 0.746, "step": 1865, "tokens_per_device": 4388 }, { "epoch": 0.746, "loss_ce": 0.561860203742981, "loss_lvr": 0.731994092464447, "loss_mode_switch": 0.0, "loss_total": 0.6350595951080322, "step": 1865 }, { "batch_size": 4, "epoch": 0.746, "step": 1865, "tokens_per_device": 9536 }, { "epoch": 0.746, "loss_ce": 0.06437806040048599, "loss_lvr": 0.3671598732471466, "loss_mode_switch": 0.0, "loss_total": 0.10109405219554901, "step": 1865 }, { "epoch": 0.7464, "grad_norm": 1.1206748485565186, "learning_rate": 1.5938262556211142e-06, "loss": 0.2418, "step": 1866 }, { "batch_size": 4, "epoch": 0.7464, "step": 1866, "tokens_per_device": 3784 }, { "epoch": 0.7464, "loss_ce": 0.1804322898387909, "loss_lvr": 1.0897696018218994, "loss_mode_switch": 0.0, "loss_total": 0.28940925002098083, "step": 1866 }, { "batch_size": 4, "epoch": 0.7464, "step": 1866, "tokens_per_device": 3316 }, { "epoch": 0.7464, "loss_ce": 0.05528309941291809, "loss_lvr": 1.8352184295654297, "loss_mode_switch": 0.0, "loss_total": 0.23880495131015778, "step": 1866 }, { "batch_size": 4, "epoch": 0.7464, "step": 1866, "tokens_per_device": 3792 }, { "epoch": 0.7464, "loss_ce": 0.17385290563106537, "loss_lvr": 0.8335872888565063, "loss_mode_switch": 0.0, "loss_total": 0.2572116255760193, "step": 1866 }, { "batch_size": 4, "epoch": 0.7464, "step": 1866, "tokens_per_device": 8584 }, { "epoch": 0.7464, "loss_ce": 0.18681195378303528, "loss_lvr": 0.6909540295600891, "loss_mode_switch": 0.0, "loss_total": 0.2559073567390442, "step": 1866 }, { "batch_size": 1, "epoch": 0.7464, "step": 1866, "tokens_per_device": 4866 }, { "epoch": 0.7464, "loss_ce": 0.01585368812084198, "loss_lvr": 0.35631391406059265, "loss_mode_switch": 0.0, "loss_total": 0.051485080271959305, "step": 1866 }, { "batch_size": 4, "epoch": 0.7464, "step": 1866, "tokens_per_device": 1584 }, { "epoch": 0.7464, "loss_ce": 0.4389277994632721, "loss_lvr": 0.9854416847229004, "loss_mode_switch": 0.0, "loss_total": 0.5374719500541687, "step": 1866 }, { "batch_size": 1, "epoch": 0.7464, "step": 1866, "tokens_per_device": 4980 }, { "epoch": 0.7464, "loss_ce": 0.0006806981400586665, "loss_lvr": 0.37889131903648376, "loss_mode_switch": 0.0, "loss_total": 0.03856983408331871, "step": 1866 }, { "batch_size": 4, "epoch": 0.7464, "step": 1866, "tokens_per_device": 4820 }, { "epoch": 0.7464, "loss_ce": 0.09733282774686813, "loss_lvr": 0.8093679547309875, "loss_mode_switch": 0.0, "loss_total": 0.178269624710083, "step": 1866 }, { "epoch": 0.7468, "grad_norm": 1.220435380935669, "learning_rate": 1.589087156232163e-06, "loss": 0.291, "step": 1867 }, { "batch_size": 4, "epoch": 0.7468, "step": 1867, "tokens_per_device": 4608 }, { "epoch": 0.7468, "loss_ce": 0.14822691679000854, "loss_lvr": 1.133959412574768, "loss_mode_switch": 0.0, "loss_total": 0.2616228461265564, "step": 1867 }, { "batch_size": 4, "epoch": 0.7468, "step": 1867, "tokens_per_device": 4592 }, { "epoch": 0.7468, "loss_ce": 0.6958171725273132, "loss_lvr": 0.6832889318466187, "loss_mode_switch": 0.0, "loss_total": 0.764146089553833, "step": 1867 }, { "batch_size": 4, "epoch": 0.7468, "step": 1867, "tokens_per_device": 4432 }, { "epoch": 0.7468, "loss_ce": 0.2736571729183197, "loss_lvr": 0.9736424088478088, "loss_mode_switch": 0.0, "loss_total": 0.37102141976356506, "step": 1867 }, { "batch_size": 1, "epoch": 0.7468, "step": 1867, "tokens_per_device": 4906 }, { "epoch": 0.7468, "loss_ce": 0.00148340396117419, "loss_lvr": 0.2758110761642456, "loss_mode_switch": 0.0, "loss_total": 0.029064511880278587, "step": 1867 }, { "batch_size": 1, "epoch": 0.7468, "step": 1867, "tokens_per_device": 5094 }, { "epoch": 0.7468, "loss_ce": 0.08604849874973297, "loss_lvr": 0.37665075063705444, "loss_mode_switch": 0.0, "loss_total": 0.12371357530355453, "step": 1867 }, { "batch_size": 1, "epoch": 0.7468, "step": 1867, "tokens_per_device": 4878 }, { "epoch": 0.7468, "loss_ce": 0.003009382402524352, "loss_lvr": 0.5077030062675476, "loss_mode_switch": 0.0, "loss_total": 0.053779684007167816, "step": 1867 }, { "batch_size": 4, "epoch": 0.7468, "step": 1867, "tokens_per_device": 3884 }, { "epoch": 0.7468, "loss_ce": 0.35419777035713196, "loss_lvr": 0.9388065338134766, "loss_mode_switch": 0.0, "loss_total": 0.4480784237384796, "step": 1867 }, { "batch_size": 4, "epoch": 0.7468, "step": 1867, "tokens_per_device": 2660 }, { "epoch": 0.7468, "loss_ce": 0.08755771815776825, "loss_lvr": 1.3280062675476074, "loss_mode_switch": 0.0, "loss_total": 0.22035834193229675, "step": 1867 }, { "epoch": 0.7472, "grad_norm": 1.3372180461883545, "learning_rate": 1.5843537814652894e-06, "loss": 0.2955, "step": 1868 }, { "batch_size": 4, "epoch": 0.7472, "step": 1868, "tokens_per_device": 6208 }, { "epoch": 0.7472, "loss_ce": 0.060841064900159836, "loss_lvr": 0.7806634902954102, "loss_mode_switch": 0.0, "loss_total": 0.13890741765499115, "step": 1868 }, { "batch_size": 1, "epoch": 0.7472, "step": 1868, "tokens_per_device": 6396 }, { "epoch": 0.7472, "loss_ce": 0.020870182663202286, "loss_lvr": 0.3302758038043976, "loss_mode_switch": 0.0, "loss_total": 0.05389776453375816, "step": 1868 }, { "batch_size": 4, "epoch": 0.7472, "step": 1868, "tokens_per_device": 15772 }, { "epoch": 0.7472, "loss_ce": 0.41947874426841736, "loss_lvr": 0.34923383593559265, "loss_mode_switch": 0.0, "loss_total": 0.4544021189212799, "step": 1868 }, { "batch_size": 4, "epoch": 0.7472, "step": 1868, "tokens_per_device": 5684 }, { "epoch": 0.7472, "loss_ce": 0.5375120639801025, "loss_lvr": 0.931574821472168, "loss_mode_switch": 0.0, "loss_total": 0.6306695342063904, "step": 1868 }, { "batch_size": 1, "epoch": 0.7472, "step": 1868, "tokens_per_device": 5018 }, { "epoch": 0.7472, "loss_ce": 0.04773981124162674, "loss_lvr": 0.3053553104400635, "loss_mode_switch": 0.0, "loss_total": 0.07827534526586533, "step": 1868 }, { "batch_size": 4, "epoch": 0.7472, "step": 1868, "tokens_per_device": 5336 }, { "epoch": 0.7472, "loss_ce": 0.05636503919959068, "loss_lvr": 0.7311709523200989, "loss_mode_switch": 0.0, "loss_total": 0.12948213517665863, "step": 1868 }, { "batch_size": 1, "epoch": 0.7472, "step": 1868, "tokens_per_device": 4819 }, { "epoch": 0.7472, "loss_ce": 0.011836613528430462, "loss_lvr": 0.20250774919986725, "loss_mode_switch": 0.0, "loss_total": 0.03208738937973976, "step": 1868 }, { "batch_size": 4, "epoch": 0.7472, "step": 1868, "tokens_per_device": 4340 }, { "epoch": 0.7472, "loss_ce": 0.4614015817642212, "loss_lvr": 0.9597867727279663, "loss_mode_switch": 0.0, "loss_total": 0.5573802590370178, "step": 1868 }, { "epoch": 0.7476, "grad_norm": 1.2230441570281982, "learning_rate": 1.5796261392646357e-06, "loss": 0.2672, "step": 1869 }, { "batch_size": 4, "epoch": 0.7476, "step": 1869, "tokens_per_device": 15532 }, { "epoch": 0.7476, "loss_ce": 0.5026416778564453, "loss_lvr": 0.8789387941360474, "loss_mode_switch": 0.0, "loss_total": 0.590535581111908, "step": 1869 }, { "batch_size": 1, "epoch": 0.7476, "step": 1869, "tokens_per_device": 5024 }, { "epoch": 0.7476, "loss_ce": 0.0002781820949167013, "loss_lvr": 0.5166140794754028, "loss_mode_switch": 0.0, "loss_total": 0.05193959176540375, "step": 1869 }, { "batch_size": 4, "epoch": 0.7476, "step": 1869, "tokens_per_device": 1592 }, { "epoch": 0.7476, "loss_ce": 0.6402199268341064, "loss_lvr": 0.9100236296653748, "loss_mode_switch": 0.0, "loss_total": 0.7312222719192505, "step": 1869 }, { "batch_size": 4, "epoch": 0.7476, "step": 1869, "tokens_per_device": 5896 }, { "epoch": 0.7476, "loss_ce": 0.05923215299844742, "loss_lvr": 0.8812318444252014, "loss_mode_switch": 0.0, "loss_total": 0.1473553478717804, "step": 1869 }, { "batch_size": 4, "epoch": 0.7476, "step": 1869, "tokens_per_device": 4296 }, { "epoch": 0.7476, "loss_ce": 0.3081417381763458, "loss_lvr": 0.8500968217849731, "loss_mode_switch": 0.0, "loss_total": 0.3931514322757721, "step": 1869 }, { "batch_size": 1, "epoch": 0.7476, "step": 1869, "tokens_per_device": 5088 }, { "epoch": 0.7476, "loss_ce": 0.016553333029150963, "loss_lvr": 0.25866085290908813, "loss_mode_switch": 0.0, "loss_total": 0.042419418692588806, "step": 1869 }, { "batch_size": 4, "epoch": 0.7476, "step": 1869, "tokens_per_device": 7004 }, { "epoch": 0.7476, "loss_ce": 0.01043263915926218, "loss_lvr": 1.503979206085205, "loss_mode_switch": 0.0, "loss_total": 0.1608305722475052, "step": 1869 }, { "batch_size": 1, "epoch": 0.7476, "step": 1869, "tokens_per_device": 4882 }, { "epoch": 0.7476, "loss_ce": 0.009230642579495907, "loss_lvr": 0.2655000388622284, "loss_mode_switch": 0.0, "loss_total": 0.0357806459069252, "step": 1869 }, { "epoch": 0.748, "grad_norm": 1.3799859285354614, "learning_rate": 1.5749042375647261e-06, "loss": 0.3142, "step": 1870 }, { "batch_size": 4, "epoch": 0.748, "step": 1870, "tokens_per_device": 3640 }, { "epoch": 0.748, "loss_ce": 0.29956790804862976, "loss_lvr": 0.8656476140022278, "loss_mode_switch": 0.0, "loss_total": 0.3861326575279236, "step": 1870 }, { "batch_size": 4, "epoch": 0.748, "step": 1870, "tokens_per_device": 2660 }, { "epoch": 0.748, "loss_ce": 0.31962478160858154, "loss_lvr": 0.895041286945343, "loss_mode_switch": 0.0, "loss_total": 0.40912890434265137, "step": 1870 }, { "batch_size": 1, "epoch": 0.748, "step": 1870, "tokens_per_device": 4874 }, { "epoch": 0.748, "loss_ce": 0.012358260340988636, "loss_lvr": 0.4475024342536926, "loss_mode_switch": 0.0, "loss_total": 0.057108502835035324, "step": 1870 }, { "batch_size": 1, "epoch": 0.748, "step": 1870, "tokens_per_device": 4871 }, { "epoch": 0.748, "loss_ce": 0.058756135404109955, "loss_lvr": 0.21534444391727448, "loss_mode_switch": 0.0, "loss_total": 0.08029057830572128, "step": 1870 }, { "batch_size": 4, "epoch": 0.748, "step": 1870, "tokens_per_device": 8108 }, { "epoch": 0.748, "loss_ce": 0.07560179382562637, "loss_lvr": 0.7311080098152161, "loss_mode_switch": 0.0, "loss_total": 0.14871260523796082, "step": 1870 }, { "batch_size": 4, "epoch": 0.748, "step": 1870, "tokens_per_device": 5004 }, { "epoch": 0.748, "loss_ce": 0.03635850548744202, "loss_lvr": 0.8972817659378052, "loss_mode_switch": 0.0, "loss_total": 0.12608668208122253, "step": 1870 }, { "batch_size": 1, "epoch": 0.748, "step": 1870, "tokens_per_device": 4098 }, { "epoch": 0.748, "loss_ce": 0.007930092513561249, "loss_lvr": 0.49147117137908936, "loss_mode_switch": 0.0, "loss_total": 0.057077210396528244, "step": 1870 }, { "batch_size": 4, "epoch": 0.748, "step": 1870, "tokens_per_device": 15572 }, { "epoch": 0.748, "loss_ce": 0.26966139674186707, "loss_lvr": 0.6882034540176392, "loss_mode_switch": 0.0, "loss_total": 0.33848175406455994, "step": 1870 }, { "epoch": 0.7484, "grad_norm": 1.0814690589904785, "learning_rate": 1.5701880842904503e-06, "loss": 0.2206, "step": 1871 }, { "batch_size": 4, "epoch": 0.7484, "step": 1871, "tokens_per_device": 4376 }, { "epoch": 0.7484, "loss_ce": 0.054204195737838745, "loss_lvr": 0.817280650138855, "loss_mode_switch": 0.0, "loss_total": 0.13593226671218872, "step": 1871 }, { "batch_size": 4, "epoch": 0.7484, "step": 1871, "tokens_per_device": 4240 }, { "epoch": 0.7484, "loss_ce": 0.270389586687088, "loss_lvr": 0.8552735447883606, "loss_mode_switch": 0.0, "loss_total": 0.35591694712638855, "step": 1871 }, { "batch_size": 1, "epoch": 0.7484, "step": 1871, "tokens_per_device": 4894 }, { "epoch": 0.7484, "loss_ce": 0.0051412079483270645, "loss_lvr": 0.4832818806171417, "loss_mode_switch": 0.0, "loss_total": 0.053469397127628326, "step": 1871 }, { "batch_size": 1, "epoch": 0.7484, "step": 1871, "tokens_per_device": 4833 }, { "epoch": 0.7484, "loss_ce": 0.047367461025714874, "loss_lvr": 0.38427597284317017, "loss_mode_switch": 0.0, "loss_total": 0.08579505980014801, "step": 1871 }, { "batch_size": 4, "epoch": 0.7484, "step": 1871, "tokens_per_device": 4440 }, { "epoch": 0.7484, "loss_ce": 0.2034887969493866, "loss_lvr": 0.7452680468559265, "loss_mode_switch": 0.0, "loss_total": 0.2780156135559082, "step": 1871 }, { "batch_size": 4, "epoch": 0.7484, "step": 1871, "tokens_per_device": 5032 }, { "epoch": 0.7484, "loss_ce": 0.137010857462883, "loss_lvr": 0.7114635109901428, "loss_mode_switch": 0.0, "loss_total": 0.20815721154212952, "step": 1871 }, { "batch_size": 1, "epoch": 0.7484, "step": 1871, "tokens_per_device": 5266 }, { "epoch": 0.7484, "loss_ce": 0.0030982571188360453, "loss_lvr": 0.497587114572525, "loss_mode_switch": 0.0, "loss_total": 0.05285697057843208, "step": 1871 }, { "batch_size": 4, "epoch": 0.7484, "step": 1871, "tokens_per_device": 2644 }, { "epoch": 0.7484, "loss_ce": 0.3650238513946533, "loss_lvr": 0.849856436252594, "loss_mode_switch": 0.0, "loss_total": 0.4500094950199127, "step": 1871 }, { "epoch": 0.7488, "grad_norm": 1.3899060487747192, "learning_rate": 1.565477687357047e-06, "loss": 0.296, "step": 1872 }, { "batch_size": 1, "epoch": 0.7488, "step": 1872, "tokens_per_device": 5724 }, { "epoch": 0.7488, "loss_ce": 0.11693675071001053, "loss_lvr": 0.2875411808490753, "loss_mode_switch": 0.0, "loss_total": 0.14569087326526642, "step": 1872 }, { "batch_size": 4, "epoch": 0.7488, "step": 1872, "tokens_per_device": 4328 }, { "epoch": 0.7488, "loss_ce": 0.03115788660943508, "loss_lvr": 0.7811911106109619, "loss_mode_switch": 0.0, "loss_total": 0.10927700251340866, "step": 1872 }, { "batch_size": 4, "epoch": 0.7488, "step": 1872, "tokens_per_device": 4144 }, { "epoch": 0.7488, "loss_ce": 0.035850174725055695, "loss_lvr": 0.806036651134491, "loss_mode_switch": 0.0, "loss_total": 0.11645384132862091, "step": 1872 }, { "batch_size": 4, "epoch": 0.7488, "step": 1872, "tokens_per_device": 8612 }, { "epoch": 0.7488, "loss_ce": 0.2410772442817688, "loss_lvr": 0.7704674601554871, "loss_mode_switch": 0.0, "loss_total": 0.318123996257782, "step": 1872 }, { "batch_size": 4, "epoch": 0.7488, "step": 1872, "tokens_per_device": 4284 }, { "epoch": 0.7488, "loss_ce": 0.11738421767950058, "loss_lvr": 0.9389882683753967, "loss_mode_switch": 0.0, "loss_total": 0.21128304302692413, "step": 1872 }, { "batch_size": 4, "epoch": 0.7488, "step": 1872, "tokens_per_device": 3812 }, { "epoch": 0.7488, "loss_ce": 0.0012462573358789086, "loss_lvr": 0.501729428768158, "loss_mode_switch": 0.0, "loss_total": 0.051419202238321304, "step": 1872 }, { "batch_size": 4, "epoch": 0.7488, "step": 1872, "tokens_per_device": 4304 }, { "epoch": 0.7488, "loss_ce": 0.06000738963484764, "loss_lvr": 0.8382686972618103, "loss_mode_switch": 0.0, "loss_total": 0.14383426308631897, "step": 1872 }, { "batch_size": 4, "epoch": 0.7488, "step": 1872, "tokens_per_device": 2796 }, { "epoch": 0.7488, "loss_ce": 0.11614422500133514, "loss_lvr": 0.6513460874557495, "loss_mode_switch": 0.0, "loss_total": 0.18127882480621338, "step": 1872 }, { "epoch": 0.7492, "grad_norm": 1.3375170230865479, "learning_rate": 1.5607730546700956e-06, "loss": 0.2645, "step": 1873 }, { "batch_size": 4, "epoch": 0.7492, "step": 1873, "tokens_per_device": 3772 }, { "epoch": 0.7492, "loss_ce": 0.1149221807718277, "loss_lvr": 0.85948646068573, "loss_mode_switch": 0.0, "loss_total": 0.2008708268404007, "step": 1873 }, { "batch_size": 4, "epoch": 0.7492, "step": 1873, "tokens_per_device": 5600 }, { "epoch": 0.7492, "loss_ce": 0.09238272905349731, "loss_lvr": 0.7708612680435181, "loss_mode_switch": 0.0, "loss_total": 0.16946884989738464, "step": 1873 }, { "batch_size": 4, "epoch": 0.7492, "step": 1873, "tokens_per_device": 3620 }, { "epoch": 0.7492, "loss_ce": 0.1188935711979866, "loss_lvr": 0.7231698036193848, "loss_mode_switch": 0.0, "loss_total": 0.1912105530500412, "step": 1873 }, { "batch_size": 4, "epoch": 0.7492, "step": 1873, "tokens_per_device": 2648 }, { "epoch": 0.7492, "loss_ce": 0.46015265583992004, "loss_lvr": 0.7589812874794006, "loss_mode_switch": 0.0, "loss_total": 0.5360507965087891, "step": 1873 }, { "batch_size": 4, "epoch": 0.7492, "step": 1873, "tokens_per_device": 4728 }, { "epoch": 0.7492, "loss_ce": 0.055984701961278915, "loss_lvr": 0.8096520900726318, "loss_mode_switch": 0.0, "loss_total": 0.13694991171360016, "step": 1873 }, { "batch_size": 1, "epoch": 0.7492, "step": 1873, "tokens_per_device": 4928 }, { "epoch": 0.7492, "loss_ce": 0.12722015380859375, "loss_lvr": 0.3790298104286194, "loss_mode_switch": 0.0, "loss_total": 0.1651231348514557, "step": 1873 }, { "batch_size": 4, "epoch": 0.7492, "step": 1873, "tokens_per_device": 7776 }, { "epoch": 0.7492, "loss_ce": 0.43501266837120056, "loss_lvr": 0.7850879430770874, "loss_mode_switch": 0.0, "loss_total": 0.5135214328765869, "step": 1873 }, { "batch_size": 4, "epoch": 0.7492, "step": 1873, "tokens_per_device": 6768 }, { "epoch": 0.7492, "loss_ce": 0.23562999069690704, "loss_lvr": 0.7128484845161438, "loss_mode_switch": 0.0, "loss_total": 0.3069148361682892, "step": 1873 }, { "epoch": 0.7496, "grad_norm": 1.6831704378128052, "learning_rate": 1.5560741941254998e-06, "loss": 0.2845, "step": 1874 }, { "batch_size": 4, "epoch": 0.7496, "step": 1874, "tokens_per_device": 4260 }, { "epoch": 0.7496, "loss_ce": 0.012856580317020416, "loss_lvr": 0.855983555316925, "loss_mode_switch": 0.0, "loss_total": 0.09845493733882904, "step": 1874 }, { "batch_size": 1, "epoch": 0.7496, "step": 1874, "tokens_per_device": 5018 }, { "epoch": 0.7496, "loss_ce": 0.039184216409921646, "loss_lvr": 0.23675931990146637, "loss_mode_switch": 0.0, "loss_total": 0.0628601461648941, "step": 1874 }, { "batch_size": 4, "epoch": 0.7496, "step": 1874, "tokens_per_device": 5936 }, { "epoch": 0.7496, "loss_ce": 0.06588879972696304, "loss_lvr": 0.9088390469551086, "loss_mode_switch": 0.0, "loss_total": 0.1567727029323578, "step": 1874 }, { "batch_size": 1, "epoch": 0.7496, "step": 1874, "tokens_per_device": 4550 }, { "epoch": 0.7496, "loss_ce": 0.00011676998110488057, "loss_lvr": 0.286953330039978, "loss_mode_switch": 0.0, "loss_total": 0.02881210297346115, "step": 1874 }, { "batch_size": 4, "epoch": 0.7496, "step": 1874, "tokens_per_device": 14436 }, { "epoch": 0.7496, "loss_ce": 0.29101306200027466, "loss_lvr": 0.7678260207176208, "loss_mode_switch": 0.0, "loss_total": 0.3677956759929657, "step": 1874 }, { "batch_size": 4, "epoch": 0.7496, "step": 1874, "tokens_per_device": 2568 }, { "epoch": 0.7496, "loss_ce": 0.43816351890563965, "loss_lvr": 0.9756126403808594, "loss_mode_switch": 0.0, "loss_total": 0.5357247591018677, "step": 1874 }, { "batch_size": 4, "epoch": 0.7496, "step": 1874, "tokens_per_device": 2632 }, { "epoch": 0.7496, "loss_ce": 0.5478243827819824, "loss_lvr": 0.8683094382286072, "loss_mode_switch": 0.0, "loss_total": 0.6346553564071655, "step": 1874 }, { "batch_size": 4, "epoch": 0.7496, "step": 1874, "tokens_per_device": 4248 }, { "epoch": 0.7496, "loss_ce": 0.4780920445919037, "loss_lvr": 0.8000964522361755, "loss_mode_switch": 0.0, "loss_total": 0.5581017136573792, "step": 1874 }, { "epoch": 0.75, "grad_norm": 1.2199366092681885, "learning_rate": 1.5513811136094786e-06, "loss": 0.2776, "step": 1875 }, { "batch_size": 4, "epoch": 0.75, "step": 1875, "tokens_per_device": 2808 }, { "epoch": 0.75, "loss_ce": 0.34935301542282104, "loss_lvr": 0.5769771933555603, "loss_mode_switch": 0.0, "loss_total": 0.4070507287979126, "step": 1875 }, { "batch_size": 1, "epoch": 0.75, "step": 1875, "tokens_per_device": 4904 }, { "epoch": 0.75, "loss_ce": 0.03356173262000084, "loss_lvr": 0.8639638423919678, "loss_mode_switch": 0.0, "loss_total": 0.11995811760425568, "step": 1875 }, { "batch_size": 4, "epoch": 0.75, "step": 1875, "tokens_per_device": 4296 }, { "epoch": 0.75, "loss_ce": 0.3661525547504425, "loss_lvr": 0.76900315284729, "loss_mode_switch": 0.0, "loss_total": 0.44305288791656494, "step": 1875 }, { "batch_size": 4, "epoch": 0.75, "step": 1875, "tokens_per_device": 5752 }, { "epoch": 0.75, "loss_ce": 0.11825642734766006, "loss_lvr": 0.9394627809524536, "loss_mode_switch": 0.0, "loss_total": 0.21220269799232483, "step": 1875 }, { "batch_size": 4, "epoch": 0.75, "step": 1875, "tokens_per_device": 5716 }, { "epoch": 0.75, "loss_ce": 0.15181802213191986, "loss_lvr": 1.2482788562774658, "loss_mode_switch": 0.0, "loss_total": 0.2766458988189697, "step": 1875 }, { "batch_size": 4, "epoch": 0.75, "step": 1875, "tokens_per_device": 11108 }, { "epoch": 0.75, "loss_ce": 0.1820732206106186, "loss_lvr": 0.6200767755508423, "loss_mode_switch": 0.0, "loss_total": 0.24408090114593506, "step": 1875 }, { "batch_size": 4, "epoch": 0.75, "step": 1875, "tokens_per_device": 2776 }, { "epoch": 0.75, "loss_ce": 0.23020131886005402, "loss_lvr": 0.6632090210914612, "loss_mode_switch": 0.0, "loss_total": 0.29652222990989685, "step": 1875 }, { "batch_size": 4, "epoch": 0.75, "step": 1875, "tokens_per_device": 5148 }, { "epoch": 0.75, "loss_ce": 0.36545810103416443, "loss_lvr": 0.7318033576011658, "loss_mode_switch": 0.0, "loss_total": 0.43863844871520996, "step": 1875 }, { "epoch": 0.7504, "grad_norm": 1.3109363317489624, "learning_rate": 1.5466938209985504e-06, "loss": 0.2514, "step": 1876 }, { "batch_size": 4, "epoch": 0.7504, "step": 1876, "tokens_per_device": 4316 }, { "epoch": 0.7504, "loss_ce": 0.007383849937468767, "loss_lvr": 1.5587855577468872, "loss_mode_switch": 0.0, "loss_total": 0.16326241195201874, "step": 1876 }, { "batch_size": 4, "epoch": 0.7504, "step": 1876, "tokens_per_device": 2696 }, { "epoch": 0.7504, "loss_ce": 0.12160588055849075, "loss_lvr": 0.7459033727645874, "loss_mode_switch": 0.0, "loss_total": 0.19619622826576233, "step": 1876 }, { "batch_size": 4, "epoch": 0.7504, "step": 1876, "tokens_per_device": 4464 }, { "epoch": 0.7504, "loss_ce": 0.35516688227653503, "loss_lvr": 0.7252238988876343, "loss_mode_switch": 0.0, "loss_total": 0.4276892840862274, "step": 1876 }, { "batch_size": 1, "epoch": 0.7504, "step": 1876, "tokens_per_device": 4849 }, { "epoch": 0.7504, "loss_ce": 0.27685853838920593, "loss_lvr": 0.39469680190086365, "loss_mode_switch": 0.0, "loss_total": 0.316328227519989, "step": 1876 }, { "batch_size": 4, "epoch": 0.7504, "step": 1876, "tokens_per_device": 5324 }, { "epoch": 0.7504, "loss_ce": 0.02501486800611019, "loss_lvr": 0.6732288599014282, "loss_mode_switch": 0.0, "loss_total": 0.09233775734901428, "step": 1876 }, { "batch_size": 4, "epoch": 0.7504, "step": 1876, "tokens_per_device": 1448 }, { "epoch": 0.7504, "loss_ce": 0.4398627281188965, "loss_lvr": 1.0391736030578613, "loss_mode_switch": 0.0, "loss_total": 0.5437800884246826, "step": 1876 }, { "batch_size": 4, "epoch": 0.7504, "step": 1876, "tokens_per_device": 5076 }, { "epoch": 0.7504, "loss_ce": 0.010507977567613125, "loss_lvr": 0.627005934715271, "loss_mode_switch": 0.0, "loss_total": 0.07320857048034668, "step": 1876 }, { "batch_size": 4, "epoch": 0.7504, "step": 1876, "tokens_per_device": 4036 }, { "epoch": 0.7504, "loss_ce": 0.1749560534954071, "loss_lvr": 0.9619686603546143, "loss_mode_switch": 0.0, "loss_total": 0.27115291357040405, "step": 1876 }, { "epoch": 0.7508, "grad_norm": 1.5244330167770386, "learning_rate": 1.5420123241595169e-06, "loss": 0.3336, "step": 1877 }, { "batch_size": 4, "epoch": 0.7508, "step": 1877, "tokens_per_device": 1352 }, { "epoch": 0.7508, "loss_ce": 0.4510692358016968, "loss_lvr": 1.0980230569839478, "loss_mode_switch": 0.0, "loss_total": 0.5608715415000916, "step": 1877 }, { "batch_size": 4, "epoch": 0.7508, "step": 1877, "tokens_per_device": 1528 }, { "epoch": 0.7508, "loss_ce": 0.3424456715583801, "loss_lvr": 0.9587652087211609, "loss_mode_switch": 0.0, "loss_total": 0.43832218647003174, "step": 1877 }, { "batch_size": 4, "epoch": 0.7508, "step": 1877, "tokens_per_device": 5424 }, { "epoch": 0.7508, "loss_ce": 0.029897259548306465, "loss_lvr": 0.5414624810218811, "loss_mode_switch": 0.0, "loss_total": 0.08404351025819778, "step": 1877 }, { "batch_size": 4, "epoch": 0.7508, "step": 1877, "tokens_per_device": 1488 }, { "epoch": 0.7508, "loss_ce": 0.7757894992828369, "loss_lvr": 0.919800877571106, "loss_mode_switch": 0.0, "loss_total": 0.8677695989608765, "step": 1877 }, { "batch_size": 4, "epoch": 0.7508, "step": 1877, "tokens_per_device": 9676 }, { "epoch": 0.7508, "loss_ce": 0.3653535842895508, "loss_lvr": 0.9283851385116577, "loss_mode_switch": 0.0, "loss_total": 0.4581921100616455, "step": 1877 }, { "batch_size": 4, "epoch": 0.7508, "step": 1877, "tokens_per_device": 2076 }, { "epoch": 0.7508, "loss_ce": 0.2951285243034363, "loss_lvr": 0.6734533309936523, "loss_mode_switch": 0.0, "loss_total": 0.36247384548187256, "step": 1877 }, { "batch_size": 4, "epoch": 0.7508, "step": 1877, "tokens_per_device": 1504 }, { "epoch": 0.7508, "loss_ce": 0.6728150248527527, "loss_lvr": 0.8576716184616089, "loss_mode_switch": 0.0, "loss_total": 0.7585821747779846, "step": 1877 }, { "batch_size": 4, "epoch": 0.7508, "step": 1877, "tokens_per_device": 4528 }, { "epoch": 0.7508, "loss_ce": 0.49809449911117554, "loss_lvr": 1.052878499031067, "loss_mode_switch": 0.0, "loss_total": 0.6033823490142822, "step": 1877 }, { "epoch": 0.7512, "grad_norm": 1.3178491592407227, "learning_rate": 1.5373366309494515e-06, "loss": 0.2994, "step": 1878 }, { "batch_size": 4, "epoch": 0.7512, "step": 1878, "tokens_per_device": 1356 }, { "epoch": 0.7512, "loss_ce": 0.1133565753698349, "loss_lvr": 0.9011102914810181, "loss_mode_switch": 0.0, "loss_total": 0.20346760749816895, "step": 1878 }, { "batch_size": 1, "epoch": 0.7512, "step": 1878, "tokens_per_device": 4862 }, { "epoch": 0.7512, "loss_ce": 0.09366952627897263, "loss_lvr": 0.17965693771839142, "loss_mode_switch": 0.0, "loss_total": 0.111635223031044, "step": 1878 }, { "batch_size": 1, "epoch": 0.7512, "step": 1878, "tokens_per_device": 4855 }, { "epoch": 0.7512, "loss_ce": 0.00032083503901958466, "loss_lvr": 0.7017896175384521, "loss_mode_switch": 0.0, "loss_total": 0.07049980014562607, "step": 1878 }, { "batch_size": 4, "epoch": 0.7512, "step": 1878, "tokens_per_device": 7748 }, { "epoch": 0.7512, "loss_ce": 0.03933935984969139, "loss_lvr": 0.6277521848678589, "loss_mode_switch": 0.0, "loss_total": 0.10211457312107086, "step": 1878 }, { "batch_size": 4, "epoch": 0.7512, "step": 1878, "tokens_per_device": 2620 }, { "epoch": 0.7512, "loss_ce": 0.23930436372756958, "loss_lvr": 0.7552390098571777, "loss_mode_switch": 0.0, "loss_total": 0.3148282766342163, "step": 1878 }, { "batch_size": 1, "epoch": 0.7512, "step": 1878, "tokens_per_device": 4764 }, { "epoch": 0.7512, "loss_ce": 0.012732338160276413, "loss_lvr": 0.7143772840499878, "loss_mode_switch": 0.0, "loss_total": 0.08417007327079773, "step": 1878 }, { "batch_size": 1, "epoch": 0.7512, "step": 1878, "tokens_per_device": 7434 }, { "epoch": 0.7512, "loss_ce": 0.19558008015155792, "loss_lvr": 0.4673709273338318, "loss_mode_switch": 0.0, "loss_total": 0.24231716990470886, "step": 1878 }, { "batch_size": 4, "epoch": 0.7512, "step": 1878, "tokens_per_device": 2880 }, { "epoch": 0.7512, "loss_ce": 0.06650105863809586, "loss_lvr": 0.5513297915458679, "loss_mode_switch": 0.0, "loss_total": 0.12163403630256653, "step": 1878 }, { "epoch": 0.7516, "grad_norm": 1.4611458778381348, "learning_rate": 1.5326667492156905e-06, "loss": 0.2843, "step": 1879 }, { "batch_size": 4, "epoch": 0.7516, "step": 1879, "tokens_per_device": 4252 }, { "epoch": 0.7516, "loss_ce": 0.7152698040008545, "loss_lvr": 0.8513779044151306, "loss_mode_switch": 0.0, "loss_total": 0.8004075884819031, "step": 1879 }, { "batch_size": 4, "epoch": 0.7516, "step": 1879, "tokens_per_device": 4548 }, { "epoch": 0.7516, "loss_ce": 0.18938791751861572, "loss_lvr": 0.5699881315231323, "loss_mode_switch": 0.0, "loss_total": 0.24638673663139343, "step": 1879 }, { "batch_size": 4, "epoch": 0.7516, "step": 1879, "tokens_per_device": 5244 }, { "epoch": 0.7516, "loss_ce": 0.47147631645202637, "loss_lvr": 0.8082188963890076, "loss_mode_switch": 0.0, "loss_total": 0.5522981882095337, "step": 1879 }, { "batch_size": 1, "epoch": 0.7516, "step": 1879, "tokens_per_device": 5166 }, { "epoch": 0.7516, "loss_ce": 0.00999755971133709, "loss_lvr": 0.3486296534538269, "loss_mode_switch": 0.0, "loss_total": 0.04486052691936493, "step": 1879 }, { "batch_size": 1, "epoch": 0.7516, "step": 1879, "tokens_per_device": 5643 }, { "epoch": 0.7516, "loss_ce": 0.047168124467134476, "loss_lvr": 0.3823956251144409, "loss_mode_switch": 0.0, "loss_total": 0.08540768921375275, "step": 1879 }, { "batch_size": 4, "epoch": 0.7516, "step": 1879, "tokens_per_device": 13816 }, { "epoch": 0.7516, "loss_ce": 0.22413361072540283, "loss_lvr": 0.37194347381591797, "loss_mode_switch": 0.0, "loss_total": 0.26132795214653015, "step": 1879 }, { "batch_size": 1, "epoch": 0.7516, "step": 1879, "tokens_per_device": 5539 }, { "epoch": 0.7516, "loss_ce": 0.010820104740560055, "loss_lvr": 0.323673278093338, "loss_mode_switch": 0.0, "loss_total": 0.04318743571639061, "step": 1879 }, { "batch_size": 4, "epoch": 0.7516, "step": 1879, "tokens_per_device": 3676 }, { "epoch": 0.7516, "loss_ce": 0.3928084969520569, "loss_lvr": 0.8771544098854065, "loss_mode_switch": 0.0, "loss_total": 0.480523943901062, "step": 1879 }, { "epoch": 0.752, "grad_norm": 1.4193353652954102, "learning_rate": 1.5280026867958186e-06, "loss": 0.2737, "step": 1880 }, { "batch_size": 4, "epoch": 0.752, "step": 1880, "tokens_per_device": 4692 }, { "epoch": 0.752, "loss_ce": 0.510862410068512, "loss_lvr": 0.7043740153312683, "loss_mode_switch": 0.0, "loss_total": 0.5812997817993164, "step": 1880 }, { "batch_size": 4, "epoch": 0.752, "step": 1880, "tokens_per_device": 4408 }, { "epoch": 0.752, "loss_ce": 0.0743572935461998, "loss_lvr": 0.6668931245803833, "loss_mode_switch": 0.0, "loss_total": 0.14104661345481873, "step": 1880 }, { "batch_size": 4, "epoch": 0.752, "step": 1880, "tokens_per_device": 1628 }, { "epoch": 0.752, "loss_ce": 0.04518868401646614, "loss_lvr": 0.9602611660957336, "loss_mode_switch": 0.0, "loss_total": 0.14121480286121368, "step": 1880 }, { "batch_size": 4, "epoch": 0.752, "step": 1880, "tokens_per_device": 4416 }, { "epoch": 0.752, "loss_ce": 0.10601797699928284, "loss_lvr": 0.789393424987793, "loss_mode_switch": 0.0, "loss_total": 0.1849573254585266, "step": 1880 }, { "batch_size": 4, "epoch": 0.752, "step": 1880, "tokens_per_device": 5460 }, { "epoch": 0.752, "loss_ce": 0.24341264367103577, "loss_lvr": 0.7863000631332397, "loss_mode_switch": 0.0, "loss_total": 0.32204264402389526, "step": 1880 }, { "batch_size": 1, "epoch": 0.752, "step": 1880, "tokens_per_device": 4878 }, { "epoch": 0.752, "loss_ce": 0.02098051644861698, "loss_lvr": 0.9620643258094788, "loss_mode_switch": 0.0, "loss_total": 0.11718694865703583, "step": 1880 }, { "batch_size": 4, "epoch": 0.752, "step": 1880, "tokens_per_device": 4444 }, { "epoch": 0.752, "loss_ce": 0.34758836030960083, "loss_lvr": 0.7282111048698425, "loss_mode_switch": 0.0, "loss_total": 0.4204094707965851, "step": 1880 }, { "batch_size": 4, "epoch": 0.752, "step": 1880, "tokens_per_device": 14148 }, { "epoch": 0.752, "loss_ce": 0.2061477154493332, "loss_lvr": 1.1123251914978027, "loss_mode_switch": 0.0, "loss_total": 0.31738024950027466, "step": 1880 }, { "epoch": 0.7524, "grad_norm": 1.2588869333267212, "learning_rate": 1.5233444515176488e-06, "loss": 0.3096, "step": 1881 }, { "batch_size": 1, "epoch": 0.7524, "step": 1881, "tokens_per_device": 5121 }, { "epoch": 0.7524, "loss_ce": 0.04287474974989891, "loss_lvr": 0.5417041778564453, "loss_mode_switch": 0.0, "loss_total": 0.0970451682806015, "step": 1881 }, { "batch_size": 4, "epoch": 0.7524, "step": 1881, "tokens_per_device": 4372 }, { "epoch": 0.7524, "loss_ce": 0.7847480177879333, "loss_lvr": 0.5879046320915222, "loss_mode_switch": 0.0, "loss_total": 0.8435384631156921, "step": 1881 }, { "batch_size": 1, "epoch": 0.7524, "step": 1881, "tokens_per_device": 5135 }, { "epoch": 0.7524, "loss_ce": 0.01543042715638876, "loss_lvr": 0.2405194640159607, "loss_mode_switch": 0.0, "loss_total": 0.039482373744249344, "step": 1881 }, { "batch_size": 1, "epoch": 0.7524, "step": 1881, "tokens_per_device": 4889 }, { "epoch": 0.7524, "loss_ce": 0.003137382213026285, "loss_lvr": 0.26519152522087097, "loss_mode_switch": 0.0, "loss_total": 0.029656535014510155, "step": 1881 }, { "batch_size": 1, "epoch": 0.7524, "step": 1881, "tokens_per_device": 4916 }, { "epoch": 0.7524, "loss_ce": 0.017782293260097504, "loss_lvr": 0.23069323599338531, "loss_mode_switch": 0.0, "loss_total": 0.040851615369319916, "step": 1881 }, { "batch_size": 4, "epoch": 0.7524, "step": 1881, "tokens_per_device": 10948 }, { "epoch": 0.7524, "loss_ce": 0.13265764713287354, "loss_lvr": 0.9932835698127747, "loss_mode_switch": 0.0, "loss_total": 0.23198601603507996, "step": 1881 }, { "batch_size": 4, "epoch": 0.7524, "step": 1881, "tokens_per_device": 2548 }, { "epoch": 0.7524, "loss_ce": 0.6231249570846558, "loss_lvr": 0.956739068031311, "loss_mode_switch": 0.0, "loss_total": 0.7187988758087158, "step": 1881 }, { "batch_size": 4, "epoch": 0.7524, "step": 1881, "tokens_per_device": 3948 }, { "epoch": 0.7524, "loss_ce": 0.27521058917045593, "loss_lvr": 0.797911524772644, "loss_mode_switch": 0.0, "loss_total": 0.3550017476081848, "step": 1881 }, { "epoch": 0.7528, "grad_norm": 1.3783847093582153, "learning_rate": 1.5186920511992154e-06, "loss": 0.282, "step": 1882 }, { "batch_size": 4, "epoch": 0.7528, "step": 1882, "tokens_per_device": 4268 }, { "epoch": 0.7528, "loss_ce": 0.2831589877605438, "loss_lvr": 1.0085091590881348, "loss_mode_switch": 0.0, "loss_total": 0.3840098977088928, "step": 1882 }, { "batch_size": 1, "epoch": 0.7528, "step": 1882, "tokens_per_device": 4851 }, { "epoch": 0.7528, "loss_ce": 0.003730732947587967, "loss_lvr": 0.36139383912086487, "loss_mode_switch": 0.0, "loss_total": 0.039870116859674454, "step": 1882 }, { "batch_size": 1, "epoch": 0.7528, "step": 1882, "tokens_per_device": 7218 }, { "epoch": 0.7528, "loss_ce": 0.11942108720541, "loss_lvr": 0.21964192390441895, "loss_mode_switch": 0.0, "loss_total": 0.1413852870464325, "step": 1882 }, { "batch_size": 4, "epoch": 0.7528, "step": 1882, "tokens_per_device": 1484 }, { "epoch": 0.7528, "loss_ce": 0.46815821528434753, "loss_lvr": 0.7968502640724182, "loss_mode_switch": 0.0, "loss_total": 0.5478432178497314, "step": 1882 }, { "batch_size": 4, "epoch": 0.7528, "step": 1882, "tokens_per_device": 4296 }, { "epoch": 0.7528, "loss_ce": 0.17022833228111267, "loss_lvr": 0.8380426168441772, "loss_mode_switch": 0.0, "loss_total": 0.25403261184692383, "step": 1882 }, { "batch_size": 1, "epoch": 0.7528, "step": 1882, "tokens_per_device": 4896 }, { "epoch": 0.7528, "loss_ce": 0.0023504772689193487, "loss_lvr": 0.4760534167289734, "loss_mode_switch": 0.0, "loss_total": 0.0499558188021183, "step": 1882 }, { "batch_size": 4, "epoch": 0.7528, "step": 1882, "tokens_per_device": 3740 }, { "epoch": 0.7528, "loss_ce": 0.18264836072921753, "loss_lvr": 0.7927442193031311, "loss_mode_switch": 0.0, "loss_total": 0.26192277669906616, "step": 1882 }, { "batch_size": 1, "epoch": 0.7528, "step": 1882, "tokens_per_device": 4939 }, { "epoch": 0.7528, "loss_ce": 0.20778587460517883, "loss_lvr": 0.28327426314353943, "loss_mode_switch": 0.0, "loss_total": 0.2361132949590683, "step": 1882 }, { "epoch": 0.7532, "grad_norm": 1.365029215812683, "learning_rate": 1.5140454936487597e-06, "loss": 0.2923, "step": 1883 }, { "batch_size": 1, "epoch": 0.7532, "step": 1883, "tokens_per_device": 4911 }, { "epoch": 0.7532, "loss_ce": 0.00032679204014129937, "loss_lvr": 0.2596268653869629, "loss_mode_switch": 0.0, "loss_total": 0.026289477944374084, "step": 1883 }, { "batch_size": 4, "epoch": 0.7532, "step": 1883, "tokens_per_device": 2608 }, { "epoch": 0.7532, "loss_ce": 0.35774585604667664, "loss_lvr": 0.912716269493103, "loss_mode_switch": 0.0, "loss_total": 0.4490174949169159, "step": 1883 }, { "batch_size": 4, "epoch": 0.7532, "step": 1883, "tokens_per_device": 8560 }, { "epoch": 0.7532, "loss_ce": 0.054841361939907074, "loss_lvr": 0.7741420269012451, "loss_mode_switch": 0.0, "loss_total": 0.13225556910037994, "step": 1883 }, { "batch_size": 4, "epoch": 0.7532, "step": 1883, "tokens_per_device": 3928 }, { "epoch": 0.7532, "loss_ce": 0.08481363207101822, "loss_lvr": 0.5569040775299072, "loss_mode_switch": 0.0, "loss_total": 0.14050403237342834, "step": 1883 }, { "batch_size": 1, "epoch": 0.7532, "step": 1883, "tokens_per_device": 5688 }, { "epoch": 0.7532, "loss_ce": 0.006547778844833374, "loss_lvr": 0.18818873167037964, "loss_mode_switch": 0.0, "loss_total": 0.025366652756929398, "step": 1883 }, { "batch_size": 4, "epoch": 0.7532, "step": 1883, "tokens_per_device": 2644 }, { "epoch": 0.7532, "loss_ce": 0.4801171123981476, "loss_lvr": 0.962396502494812, "loss_mode_switch": 0.0, "loss_total": 0.5763567686080933, "step": 1883 }, { "batch_size": 4, "epoch": 0.7532, "step": 1883, "tokens_per_device": 4404 }, { "epoch": 0.7532, "loss_ce": 0.15304158627986908, "loss_lvr": 1.0053684711456299, "loss_mode_switch": 0.0, "loss_total": 0.25357842445373535, "step": 1883 }, { "batch_size": 4, "epoch": 0.7532, "step": 1883, "tokens_per_device": 4252 }, { "epoch": 0.7532, "loss_ce": 0.00649844016879797, "loss_lvr": 0.8529089093208313, "loss_mode_switch": 0.0, "loss_total": 0.09178933501243591, "step": 1883 }, { "epoch": 0.7536, "grad_norm": 1.1577428579330444, "learning_rate": 1.5094047866647194e-06, "loss": 0.2743, "step": 1884 }, { "batch_size": 4, "epoch": 0.7536, "step": 1884, "tokens_per_device": 5756 }, { "epoch": 0.7536, "loss_ce": 0.0949464812874794, "loss_lvr": 0.6114227175712585, "loss_mode_switch": 0.0, "loss_total": 0.15608875453472137, "step": 1884 }, { "batch_size": 1, "epoch": 0.7536, "step": 1884, "tokens_per_device": 5021 }, { "epoch": 0.7536, "loss_ce": 0.06859570741653442, "loss_lvr": 0.37088003754615784, "loss_mode_switch": 0.0, "loss_total": 0.10568371415138245, "step": 1884 }, { "batch_size": 4, "epoch": 0.7536, "step": 1884, "tokens_per_device": 7092 }, { "epoch": 0.7536, "loss_ce": 0.14125116169452667, "loss_lvr": 0.952741801738739, "loss_mode_switch": 0.0, "loss_total": 0.23652534186840057, "step": 1884 }, { "batch_size": 1, "epoch": 0.7536, "step": 1884, "tokens_per_device": 5258 }, { "epoch": 0.7536, "loss_ce": 0.05192195996642113, "loss_lvr": 0.3402455449104309, "loss_mode_switch": 0.0, "loss_total": 0.08594651520252228, "step": 1884 }, { "batch_size": 4, "epoch": 0.7536, "step": 1884, "tokens_per_device": 5112 }, { "epoch": 0.7536, "loss_ce": 0.8923653960227966, "loss_lvr": 0.5699819922447205, "loss_mode_switch": 0.0, "loss_total": 0.9493635892868042, "step": 1884 }, { "batch_size": 4, "epoch": 0.7536, "step": 1884, "tokens_per_device": 1812 }, { "epoch": 0.7536, "loss_ce": 0.12658068537712097, "loss_lvr": 0.8018171787261963, "loss_mode_switch": 0.0, "loss_total": 0.2067624032497406, "step": 1884 }, { "batch_size": 4, "epoch": 0.7536, "step": 1884, "tokens_per_device": 4528 }, { "epoch": 0.7536, "loss_ce": 0.4929642677307129, "loss_lvr": 0.8819049000740051, "loss_mode_switch": 0.0, "loss_total": 0.5811547636985779, "step": 1884 }, { "batch_size": 1, "epoch": 0.7536, "step": 1884, "tokens_per_device": 4554 }, { "epoch": 0.7536, "loss_ce": 0.0172325000166893, "loss_lvr": 0.2702082395553589, "loss_mode_switch": 0.0, "loss_total": 0.04425332695245743, "step": 1884 }, { "epoch": 0.754, "grad_norm": 1.2639418840408325, "learning_rate": 1.5047699380357134e-06, "loss": 0.2988, "step": 1885 }, { "batch_size": 1, "epoch": 0.754, "step": 1885, "tokens_per_device": 6573 }, { "epoch": 0.754, "loss_ce": 0.0015479697613045573, "loss_lvr": 0.2905034124851227, "loss_mode_switch": 0.0, "loss_total": 0.030598310753703117, "step": 1885 }, { "batch_size": 4, "epoch": 0.754, "step": 1885, "tokens_per_device": 3748 }, { "epoch": 0.754, "loss_ce": 0.2002219706773758, "loss_lvr": 0.8078314661979675, "loss_mode_switch": 0.0, "loss_total": 0.2810051143169403, "step": 1885 }, { "batch_size": 1, "epoch": 0.754, "step": 1885, "tokens_per_device": 4896 }, { "epoch": 0.754, "loss_ce": 0.30557337403297424, "loss_lvr": 0.6046304702758789, "loss_mode_switch": 0.0, "loss_total": 0.36603641510009766, "step": 1885 }, { "batch_size": 4, "epoch": 0.754, "step": 1885, "tokens_per_device": 8664 }, { "epoch": 0.754, "loss_ce": 0.09997826814651489, "loss_lvr": 0.8494395017623901, "loss_mode_switch": 0.0, "loss_total": 0.1849222183227539, "step": 1885 }, { "batch_size": 4, "epoch": 0.754, "step": 1885, "tokens_per_device": 4208 }, { "epoch": 0.754, "loss_ce": 0.21632833778858185, "loss_lvr": 0.8600316047668457, "loss_mode_switch": 0.0, "loss_total": 0.30233150720596313, "step": 1885 }, { "batch_size": 4, "epoch": 0.754, "step": 1885, "tokens_per_device": 4212 }, { "epoch": 0.754, "loss_ce": 0.6874457597732544, "loss_lvr": 0.8911035060882568, "loss_mode_switch": 0.0, "loss_total": 0.776556134223938, "step": 1885 }, { "batch_size": 4, "epoch": 0.754, "step": 1885, "tokens_per_device": 3848 }, { "epoch": 0.754, "loss_ce": 0.311306357383728, "loss_lvr": 0.9723148941993713, "loss_mode_switch": 0.0, "loss_total": 0.4085378646850586, "step": 1885 }, { "batch_size": 4, "epoch": 0.754, "step": 1885, "tokens_per_device": 4400 }, { "epoch": 0.754, "loss_ce": 0.18040253221988678, "loss_lvr": 1.152646780014038, "loss_mode_switch": 0.0, "loss_total": 0.29566720128059387, "step": 1885 }, { "epoch": 0.7544, "grad_norm": 1.4414607286453247, "learning_rate": 1.5001409555405238e-06, "loss": 0.3202, "step": 1886 }, { "batch_size": 1, "epoch": 0.7544, "step": 1886, "tokens_per_device": 4880 }, { "epoch": 0.7544, "loss_ce": 0.004455359186977148, "loss_lvr": 0.2971128821372986, "loss_mode_switch": 0.0, "loss_total": 0.03416664898395538, "step": 1886 }, { "batch_size": 1, "epoch": 0.7544, "step": 1886, "tokens_per_device": 4864 }, { "epoch": 0.7544, "loss_ce": 0.00035959103843197227, "loss_lvr": 0.2492833137512207, "loss_mode_switch": 0.0, "loss_total": 0.025287922471761703, "step": 1886 }, { "batch_size": 1, "epoch": 0.7544, "step": 1886, "tokens_per_device": 4911 }, { "epoch": 0.7544, "loss_ce": 0.22709722816944122, "loss_lvr": 0.5454520583152771, "loss_mode_switch": 0.0, "loss_total": 0.28164243698120117, "step": 1886 }, { "batch_size": 4, "epoch": 0.7544, "step": 1886, "tokens_per_device": 2592 }, { "epoch": 0.7544, "loss_ce": 0.48033297061920166, "loss_lvr": 0.8567363619804382, "loss_mode_switch": 0.0, "loss_total": 0.566006600856781, "step": 1886 }, { "batch_size": 1, "epoch": 0.7544, "step": 1886, "tokens_per_device": 5998 }, { "epoch": 0.7544, "loss_ce": 0.004554205574095249, "loss_lvr": 0.26672354340553284, "loss_mode_switch": 0.0, "loss_total": 0.031226560473442078, "step": 1886 }, { "batch_size": 4, "epoch": 0.7544, "step": 1886, "tokens_per_device": 9348 }, { "epoch": 0.7544, "loss_ce": 0.3334186375141144, "loss_lvr": 0.6518042087554932, "loss_mode_switch": 0.0, "loss_total": 0.3985990583896637, "step": 1886 }, { "batch_size": 1, "epoch": 0.7544, "step": 1886, "tokens_per_device": 5066 }, { "epoch": 0.7544, "loss_ce": 0.0002751727879513055, "loss_lvr": 0.4205666780471802, "loss_mode_switch": 0.0, "loss_total": 0.042331840842962265, "step": 1886 }, { "batch_size": 4, "epoch": 0.7544, "step": 1886, "tokens_per_device": 4324 }, { "epoch": 0.7544, "loss_ce": 0.05437268316745758, "loss_lvr": 0.8596932291984558, "loss_mode_switch": 0.0, "loss_total": 0.14034199714660645, "step": 1886 }, { "epoch": 0.7548, "grad_norm": 1.1860207319259644, "learning_rate": 1.4955178469480891e-06, "loss": 0.2553, "step": 1887 }, { "batch_size": 4, "epoch": 0.7548, "step": 1887, "tokens_per_device": 4420 }, { "epoch": 0.7548, "loss_ce": 0.14594846963882446, "loss_lvr": 0.7468243837356567, "loss_mode_switch": 0.0, "loss_total": 0.22063091397285461, "step": 1887 }, { "batch_size": 4, "epoch": 0.7548, "step": 1887, "tokens_per_device": 4500 }, { "epoch": 0.7548, "loss_ce": 0.17330124974250793, "loss_lvr": 0.7610533833503723, "loss_mode_switch": 0.0, "loss_total": 0.2494065910577774, "step": 1887 }, { "batch_size": 4, "epoch": 0.7548, "step": 1887, "tokens_per_device": 1308 }, { "epoch": 0.7548, "loss_ce": 0.33070558309555054, "loss_lvr": 0.9746475219726562, "loss_mode_switch": 0.0, "loss_total": 0.4281703233718872, "step": 1887 }, { "batch_size": 4, "epoch": 0.7548, "step": 1887, "tokens_per_device": 1224 }, { "epoch": 0.7548, "loss_ce": 0.4148384928703308, "loss_lvr": 1.0817614793777466, "loss_mode_switch": 0.0, "loss_total": 0.5230146646499634, "step": 1887 }, { "batch_size": 4, "epoch": 0.7548, "step": 1887, "tokens_per_device": 3860 }, { "epoch": 0.7548, "loss_ce": 0.15432022511959076, "loss_lvr": 1.304115653038025, "loss_mode_switch": 0.0, "loss_total": 0.28473180532455444, "step": 1887 }, { "batch_size": 4, "epoch": 0.7548, "step": 1887, "tokens_per_device": 1576 }, { "epoch": 0.7548, "loss_ce": 0.2917226552963257, "loss_lvr": 0.9496996402740479, "loss_mode_switch": 0.0, "loss_total": 0.386692613363266, "step": 1887 }, { "batch_size": 4, "epoch": 0.7548, "step": 1887, "tokens_per_device": 1408 }, { "epoch": 0.7548, "loss_ce": 0.604444682598114, "loss_lvr": 1.1141787767410278, "loss_mode_switch": 0.0, "loss_total": 0.7158625721931458, "step": 1887 }, { "batch_size": 4, "epoch": 0.7548, "step": 1887, "tokens_per_device": 4300 }, { "epoch": 0.7548, "loss_ce": 0.8009394407272339, "loss_lvr": 0.8332485556602478, "loss_mode_switch": 0.0, "loss_total": 0.8842642903327942, "step": 1887 }, { "epoch": 0.7552, "grad_norm": 1.2654621601104736, "learning_rate": 1.4909006200174924e-06, "loss": 0.318, "step": 1888 }, { "batch_size": 4, "epoch": 0.7552, "step": 1888, "tokens_per_device": 3792 }, { "epoch": 0.7552, "loss_ce": 0.38738152384757996, "loss_lvr": 0.784650981426239, "loss_mode_switch": 0.0, "loss_total": 0.46584662795066833, "step": 1888 }, { "batch_size": 4, "epoch": 0.7552, "step": 1888, "tokens_per_device": 4276 }, { "epoch": 0.7552, "loss_ce": 0.40576356649398804, "loss_lvr": 0.5335734486579895, "loss_mode_switch": 0.0, "loss_total": 0.45912089943885803, "step": 1888 }, { "batch_size": 4, "epoch": 0.7552, "step": 1888, "tokens_per_device": 5336 }, { "epoch": 0.7552, "loss_ce": 0.2890300154685974, "loss_lvr": 0.8227630257606506, "loss_mode_switch": 0.0, "loss_total": 0.37130632996559143, "step": 1888 }, { "batch_size": 4, "epoch": 0.7552, "step": 1888, "tokens_per_device": 3596 }, { "epoch": 0.7552, "loss_ce": 0.07870253920555115, "loss_lvr": 1.0803301334381104, "loss_mode_switch": 0.0, "loss_total": 0.18673555552959442, "step": 1888 }, { "batch_size": 1, "epoch": 0.7552, "step": 1888, "tokens_per_device": 4866 }, { "epoch": 0.7552, "loss_ce": 0.00017203025345224887, "loss_lvr": 0.27233004570007324, "loss_mode_switch": 0.0, "loss_total": 0.027405034750699997, "step": 1888 }, { "batch_size": 4, "epoch": 0.7552, "step": 1888, "tokens_per_device": 4380 }, { "epoch": 0.7552, "loss_ce": 0.3470851182937622, "loss_lvr": 0.7938862442970276, "loss_mode_switch": 0.0, "loss_total": 0.4264737367630005, "step": 1888 }, { "batch_size": 4, "epoch": 0.7552, "step": 1888, "tokens_per_device": 4648 }, { "epoch": 0.7552, "loss_ce": 0.14877766370773315, "loss_lvr": 0.8123723268508911, "loss_mode_switch": 0.0, "loss_total": 0.2300148904323578, "step": 1888 }, { "batch_size": 1, "epoch": 0.7552, "step": 1888, "tokens_per_device": 4905 }, { "epoch": 0.7552, "loss_ce": 0.11069056391716003, "loss_lvr": 0.5891748666763306, "loss_mode_switch": 0.0, "loss_total": 0.16960805654525757, "step": 1888 }, { "epoch": 0.7556, "grad_norm": 1.398630142211914, "learning_rate": 1.4862892824979448e-06, "loss": 0.3147, "step": 1889 }, { "batch_size": 4, "epoch": 0.7556, "step": 1889, "tokens_per_device": 4268 }, { "epoch": 0.7556, "loss_ce": 0.280502587556839, "loss_lvr": 0.8752433061599731, "loss_mode_switch": 0.0, "loss_total": 0.3680269122123718, "step": 1889 }, { "batch_size": 1, "epoch": 0.7556, "step": 1889, "tokens_per_device": 5124 }, { "epoch": 0.7556, "loss_ce": 0.18020498752593994, "loss_lvr": 0.509300172328949, "loss_mode_switch": 0.0, "loss_total": 0.23113501071929932, "step": 1889 }, { "batch_size": 1, "epoch": 0.7556, "step": 1889, "tokens_per_device": 4877 }, { "epoch": 0.7556, "loss_ce": 0.04305576905608177, "loss_lvr": 0.16302141547203064, "loss_mode_switch": 0.0, "loss_total": 0.059357911348342896, "step": 1889 }, { "batch_size": 4, "epoch": 0.7556, "step": 1889, "tokens_per_device": 7304 }, { "epoch": 0.7556, "loss_ce": 0.5767231583595276, "loss_lvr": 0.7559241056442261, "loss_mode_switch": 0.0, "loss_total": 0.6523155570030212, "step": 1889 }, { "batch_size": 4, "epoch": 0.7556, "step": 1889, "tokens_per_device": 4816 }, { "epoch": 0.7556, "loss_ce": 0.40115591883659363, "loss_lvr": 0.6569299101829529, "loss_mode_switch": 0.0, "loss_total": 0.4668489098548889, "step": 1889 }, { "batch_size": 4, "epoch": 0.7556, "step": 1889, "tokens_per_device": 5320 }, { "epoch": 0.7556, "loss_ce": 0.046033743768930435, "loss_lvr": 0.5274208188056946, "loss_mode_switch": 0.0, "loss_total": 0.09877582639455795, "step": 1889 }, { "batch_size": 4, "epoch": 0.7556, "step": 1889, "tokens_per_device": 4300 }, { "epoch": 0.7556, "loss_ce": 0.11002103239297867, "loss_lvr": 0.893979012966156, "loss_mode_switch": 0.0, "loss_total": 0.19941893219947815, "step": 1889 }, { "batch_size": 1, "epoch": 0.7556, "step": 1889, "tokens_per_device": 5091 }, { "epoch": 0.7556, "loss_ce": 0.026145784184336662, "loss_lvr": 0.19700020551681519, "loss_mode_switch": 0.0, "loss_total": 0.04584580659866333, "step": 1889 }, { "epoch": 0.756, "grad_norm": 1.5449045896530151, "learning_rate": 1.4816838421287693e-06, "loss": 0.3034, "step": 1890 }, { "batch_size": 1, "epoch": 0.756, "step": 1890, "tokens_per_device": 5161 }, { "epoch": 0.756, "loss_ce": 0.1393374502658844, "loss_lvr": 0.41803452372550964, "loss_mode_switch": 0.0, "loss_total": 0.18114089965820312, "step": 1890 }, { "batch_size": 4, "epoch": 0.756, "step": 1890, "tokens_per_device": 4304 }, { "epoch": 0.756, "loss_ce": 0.0640743225812912, "loss_lvr": 0.7934778928756714, "loss_mode_switch": 0.0, "loss_total": 0.14342211186885834, "step": 1890 }, { "batch_size": 4, "epoch": 0.756, "step": 1890, "tokens_per_device": 2836 }, { "epoch": 0.756, "loss_ce": 0.20643708109855652, "loss_lvr": 0.7114821076393127, "loss_mode_switch": 0.0, "loss_total": 0.27758529782295227, "step": 1890 }, { "batch_size": 4, "epoch": 0.756, "step": 1890, "tokens_per_device": 1568 }, { "epoch": 0.756, "loss_ce": 0.5104544758796692, "loss_lvr": 1.1913433074951172, "loss_mode_switch": 0.0, "loss_total": 0.629588782787323, "step": 1890 }, { "batch_size": 1, "epoch": 0.756, "step": 1890, "tokens_per_device": 5161 }, { "epoch": 0.756, "loss_ce": 0.0002894142235163599, "loss_lvr": 0.5068610906600952, "loss_mode_switch": 0.0, "loss_total": 0.05097552388906479, "step": 1890 }, { "batch_size": 1, "epoch": 0.756, "step": 1890, "tokens_per_device": 5195 }, { "epoch": 0.756, "loss_ce": 0.005824296269565821, "loss_lvr": 0.5284776091575623, "loss_mode_switch": 0.0, "loss_total": 0.05867205932736397, "step": 1890 }, { "batch_size": 4, "epoch": 0.756, "step": 1890, "tokens_per_device": 4404 }, { "epoch": 0.756, "loss_ce": 0.15217788517475128, "loss_lvr": 0.9256128072738647, "loss_mode_switch": 0.0, "loss_total": 0.24473917484283447, "step": 1890 }, { "batch_size": 4, "epoch": 0.756, "step": 1890, "tokens_per_device": 4204 }, { "epoch": 0.756, "loss_ce": 0.21093660593032837, "loss_lvr": 1.2070202827453613, "loss_mode_switch": 0.0, "loss_total": 0.3316386342048645, "step": 1890 }, { "epoch": 0.7564, "grad_norm": 1.4039667844772339, "learning_rate": 1.4770843066393954e-06, "loss": 0.2702, "step": 1891 }, { "batch_size": 1, "epoch": 0.7564, "step": 1891, "tokens_per_device": 5125 }, { "epoch": 0.7564, "loss_ce": 0.5836014151573181, "loss_lvr": 0.2728723883628845, "loss_mode_switch": 0.0, "loss_total": 0.610888659954071, "step": 1891 }, { "batch_size": 4, "epoch": 0.7564, "step": 1891, "tokens_per_device": 2580 }, { "epoch": 0.7564, "loss_ce": 0.5492795705795288, "loss_lvr": 0.9612152576446533, "loss_mode_switch": 0.0, "loss_total": 0.645401120185852, "step": 1891 }, { "batch_size": 4, "epoch": 0.7564, "step": 1891, "tokens_per_device": 3952 }, { "epoch": 0.7564, "loss_ce": 0.01208884920924902, "loss_lvr": 0.8512842059135437, "loss_mode_switch": 0.0, "loss_total": 0.09721726924180984, "step": 1891 }, { "batch_size": 1, "epoch": 0.7564, "step": 1891, "tokens_per_device": 5000 }, { "epoch": 0.7564, "loss_ce": 0.014539862982928753, "loss_lvr": 0.6511592864990234, "loss_mode_switch": 0.0, "loss_total": 0.07965578883886337, "step": 1891 }, { "batch_size": 4, "epoch": 0.7564, "step": 1891, "tokens_per_device": 4900 }, { "epoch": 0.7564, "loss_ce": 0.297026127576828, "loss_lvr": 1.133166790008545, "loss_mode_switch": 0.0, "loss_total": 0.410342812538147, "step": 1891 }, { "batch_size": 1, "epoch": 0.7564, "step": 1891, "tokens_per_device": 4902 }, { "epoch": 0.7564, "loss_ce": 0.026759330183267593, "loss_lvr": 0.38533079624176025, "loss_mode_switch": 0.0, "loss_total": 0.06529241055250168, "step": 1891 }, { "batch_size": 4, "epoch": 0.7564, "step": 1891, "tokens_per_device": 2548 }, { "epoch": 0.7564, "loss_ce": 0.5093110799789429, "loss_lvr": 0.7529520988464355, "loss_mode_switch": 0.0, "loss_total": 0.5846062898635864, "step": 1891 }, { "batch_size": 4, "epoch": 0.7564, "step": 1891, "tokens_per_device": 2716 }, { "epoch": 0.7564, "loss_ce": 0.14679190516471863, "loss_lvr": 0.7337355613708496, "loss_mode_switch": 0.0, "loss_total": 0.2201654613018036, "step": 1891 }, { "epoch": 0.7568, "grad_norm": 1.4506103992462158, "learning_rate": 1.4724906837493386e-06, "loss": 0.3136, "step": 1892 }, { "batch_size": 4, "epoch": 0.7568, "step": 1892, "tokens_per_device": 7180 }, { "epoch": 0.7568, "loss_ce": 0.22970744967460632, "loss_lvr": 0.6331830620765686, "loss_mode_switch": 0.0, "loss_total": 0.29302576184272766, "step": 1892 }, { "batch_size": 1, "epoch": 0.7568, "step": 1892, "tokens_per_device": 5180 }, { "epoch": 0.7568, "loss_ce": 0.012871913611888885, "loss_lvr": 0.7353928685188293, "loss_mode_switch": 0.0, "loss_total": 0.08641120046377182, "step": 1892 }, { "batch_size": 4, "epoch": 0.7568, "step": 1892, "tokens_per_device": 5968 }, { "epoch": 0.7568, "loss_ce": 0.811316728591919, "loss_lvr": 0.8155961632728577, "loss_mode_switch": 0.0, "loss_total": 0.8928763270378113, "step": 1892 }, { "batch_size": 4, "epoch": 0.7568, "step": 1892, "tokens_per_device": 4348 }, { "epoch": 0.7568, "loss_ce": 0.35661906003952026, "loss_lvr": 0.8417152166366577, "loss_mode_switch": 0.0, "loss_total": 0.440790593624115, "step": 1892 }, { "batch_size": 4, "epoch": 0.7568, "step": 1892, "tokens_per_device": 5940 }, { "epoch": 0.7568, "loss_ce": 0.030433308333158493, "loss_lvr": 0.5979468822479248, "loss_mode_switch": 0.0, "loss_total": 0.09022799879312515, "step": 1892 }, { "batch_size": 4, "epoch": 0.7568, "step": 1892, "tokens_per_device": 4496 }, { "epoch": 0.7568, "loss_ce": 0.17321735620498657, "loss_lvr": 0.7579300403594971, "loss_mode_switch": 0.0, "loss_total": 0.2490103542804718, "step": 1892 }, { "batch_size": 1, "epoch": 0.7568, "step": 1892, "tokens_per_device": 5159 }, { "epoch": 0.7568, "loss_ce": 0.00044122495455667377, "loss_lvr": 0.4543364942073822, "loss_mode_switch": 0.0, "loss_total": 0.04587487503886223, "step": 1892 }, { "batch_size": 4, "epoch": 0.7568, "step": 1892, "tokens_per_device": 4272 }, { "epoch": 0.7568, "loss_ce": 0.22674807906150818, "loss_lvr": 0.8757392764091492, "loss_mode_switch": 0.0, "loss_total": 0.31432199478149414, "step": 1892 }, { "epoch": 0.7572, "grad_norm": 1.2399797439575195, "learning_rate": 1.467902981168195e-06, "loss": 0.2507, "step": 1893 }, { "batch_size": 4, "epoch": 0.7572, "step": 1893, "tokens_per_device": 1660 }, { "epoch": 0.7572, "loss_ce": 0.3397410213947296, "loss_lvr": 0.764950156211853, "loss_mode_switch": 0.0, "loss_total": 0.4162360429763794, "step": 1893 }, { "batch_size": 4, "epoch": 0.7572, "step": 1893, "tokens_per_device": 4304 }, { "epoch": 0.7572, "loss_ce": 0.0015489704674109817, "loss_lvr": 0.8047465085983276, "loss_mode_switch": 0.0, "loss_total": 0.08202362060546875, "step": 1893 }, { "batch_size": 4, "epoch": 0.7572, "step": 1893, "tokens_per_device": 2916 }, { "epoch": 0.7572, "loss_ce": 0.18302614986896515, "loss_lvr": 0.6439949870109558, "loss_mode_switch": 0.0, "loss_total": 0.2474256455898285, "step": 1893 }, { "batch_size": 4, "epoch": 0.7572, "step": 1893, "tokens_per_device": 4296 }, { "epoch": 0.7572, "loss_ce": 0.029878243803977966, "loss_lvr": 0.8699550032615662, "loss_mode_switch": 0.0, "loss_total": 0.11687374860048294, "step": 1893 }, { "batch_size": 1, "epoch": 0.7572, "step": 1893, "tokens_per_device": 5122 }, { "epoch": 0.7572, "loss_ce": 0.0006480079027824104, "loss_lvr": 0.6163316965103149, "loss_mode_switch": 0.0, "loss_total": 0.06228117644786835, "step": 1893 }, { "batch_size": 4, "epoch": 0.7572, "step": 1893, "tokens_per_device": 4324 }, { "epoch": 0.7572, "loss_ce": 0.41216278076171875, "loss_lvr": 0.7718466520309448, "loss_mode_switch": 0.0, "loss_total": 0.4893474578857422, "step": 1893 }, { "batch_size": 4, "epoch": 0.7572, "step": 1893, "tokens_per_device": 3788 }, { "epoch": 0.7572, "loss_ce": 0.2856384813785553, "loss_lvr": 0.5199720859527588, "loss_mode_switch": 0.0, "loss_total": 0.33763569593429565, "step": 1893 }, { "batch_size": 1, "epoch": 0.7572, "step": 1893, "tokens_per_device": 5028 }, { "epoch": 0.7572, "loss_ce": 0.04128355532884598, "loss_lvr": 0.35021811723709106, "loss_mode_switch": 0.0, "loss_total": 0.07630536705255508, "step": 1893 }, { "epoch": 0.7576, "grad_norm": 1.548758864402771, "learning_rate": 1.4633212065956248e-06, "loss": 0.2997, "step": 1894 }, { "batch_size": 4, "epoch": 0.7576, "step": 1894, "tokens_per_device": 4352 }, { "epoch": 0.7576, "loss_ce": 0.02091926708817482, "loss_lvr": 0.7236059904098511, "loss_mode_switch": 0.0, "loss_total": 0.0932798683643341, "step": 1894 }, { "batch_size": 1, "epoch": 0.7576, "step": 1894, "tokens_per_device": 6352 }, { "epoch": 0.7576, "loss_ce": 0.6869024038314819, "loss_lvr": 0.3549603223800659, "loss_mode_switch": 0.0, "loss_total": 0.7223984599113464, "step": 1894 }, { "batch_size": 4, "epoch": 0.7576, "step": 1894, "tokens_per_device": 1296 }, { "epoch": 0.7576, "loss_ce": 0.23168140649795532, "loss_lvr": 1.2054016590118408, "loss_mode_switch": 0.0, "loss_total": 0.3522215783596039, "step": 1894 }, { "batch_size": 4, "epoch": 0.7576, "step": 1894, "tokens_per_device": 6104 }, { "epoch": 0.7576, "loss_ce": 0.13054752349853516, "loss_lvr": 0.781982958316803, "loss_mode_switch": 0.0, "loss_total": 0.2087458223104477, "step": 1894 }, { "batch_size": 4, "epoch": 0.7576, "step": 1894, "tokens_per_device": 1928 }, { "epoch": 0.7576, "loss_ce": 0.3994095027446747, "loss_lvr": 0.9548106789588928, "loss_mode_switch": 0.0, "loss_total": 0.49489057064056396, "step": 1894 }, { "batch_size": 1, "epoch": 0.7576, "step": 1894, "tokens_per_device": 7719 }, { "epoch": 0.7576, "loss_ce": 0.00011766697571147233, "loss_lvr": 0.3016708195209503, "loss_mode_switch": 0.0, "loss_total": 0.03028474934399128, "step": 1894 }, { "batch_size": 4, "epoch": 0.7576, "step": 1894, "tokens_per_device": 2688 }, { "epoch": 0.7576, "loss_ce": 1.0211694240570068, "loss_lvr": 0.6886841654777527, "loss_mode_switch": 0.0, "loss_total": 1.0900378227233887, "step": 1894 }, { "batch_size": 1, "epoch": 0.7576, "step": 1894, "tokens_per_device": 4890 }, { "epoch": 0.7576, "loss_ce": 0.0052013518288731575, "loss_lvr": 0.47578516602516174, "loss_mode_switch": 0.0, "loss_total": 0.05277986824512482, "step": 1894 }, { "epoch": 0.758, "grad_norm": 1.7588574886322021, "learning_rate": 1.4587453677213348e-06, "loss": 0.3182, "step": 1895 }, { "batch_size": 4, "epoch": 0.758, "step": 1895, "tokens_per_device": 3932 }, { "epoch": 0.758, "loss_ce": 0.19365240633487701, "loss_lvr": 0.7928906679153442, "loss_mode_switch": 0.0, "loss_total": 0.2729414701461792, "step": 1895 }, { "batch_size": 4, "epoch": 0.758, "step": 1895, "tokens_per_device": 2624 }, { "epoch": 0.758, "loss_ce": 0.5031473636627197, "loss_lvr": 0.9074428081512451, "loss_mode_switch": 0.0, "loss_total": 0.5938916206359863, "step": 1895 }, { "batch_size": 1, "epoch": 0.758, "step": 1895, "tokens_per_device": 6200 }, { "epoch": 0.758, "loss_ce": 0.043064676225185394, "loss_lvr": 0.3037002980709076, "loss_mode_switch": 0.0, "loss_total": 0.07343471050262451, "step": 1895 }, { "batch_size": 4, "epoch": 0.758, "step": 1895, "tokens_per_device": 2712 }, { "epoch": 0.758, "loss_ce": 0.14252464473247528, "loss_lvr": 0.5060248374938965, "loss_mode_switch": 0.0, "loss_total": 0.1931271255016327, "step": 1895 }, { "batch_size": 1, "epoch": 0.758, "step": 1895, "tokens_per_device": 5117 }, { "epoch": 0.758, "loss_ce": 0.009937924332916737, "loss_lvr": 0.6814108490943909, "loss_mode_switch": 0.0, "loss_total": 0.07807901501655579, "step": 1895 }, { "batch_size": 4, "epoch": 0.758, "step": 1895, "tokens_per_device": 5676 }, { "epoch": 0.758, "loss_ce": 0.40309348702430725, "loss_lvr": 0.9515007734298706, "loss_mode_switch": 0.0, "loss_total": 0.4982435703277588, "step": 1895 }, { "batch_size": 4, "epoch": 0.758, "step": 1895, "tokens_per_device": 4312 }, { "epoch": 0.758, "loss_ce": 0.04649995267391205, "loss_lvr": 0.6735830903053284, "loss_mode_switch": 0.0, "loss_total": 0.11385826021432877, "step": 1895 }, { "batch_size": 4, "epoch": 0.758, "step": 1895, "tokens_per_device": 2644 }, { "epoch": 0.758, "loss_ce": 0.22464461624622345, "loss_lvr": 0.8183244466781616, "loss_mode_switch": 0.0, "loss_total": 0.30647706985473633, "step": 1895 }, { "epoch": 0.7584, "grad_norm": 1.1859194040298462, "learning_rate": 1.4541754722250716e-06, "loss": 0.237, "step": 1896 }, { "batch_size": 1, "epoch": 0.7584, "step": 1896, "tokens_per_device": 4952 }, { "epoch": 0.7584, "loss_ce": 0.00860816054046154, "loss_lvr": 0.26391029357910156, "loss_mode_switch": 0.0, "loss_total": 0.034999191761016846, "step": 1896 }, { "batch_size": 4, "epoch": 0.7584, "step": 1896, "tokens_per_device": 7256 }, { "epoch": 0.7584, "loss_ce": 0.0018167909001931548, "loss_lvr": 0.7157933115959167, "loss_mode_switch": 0.0, "loss_total": 0.07339612394571304, "step": 1896 }, { "batch_size": 4, "epoch": 0.7584, "step": 1896, "tokens_per_device": 1936 }, { "epoch": 0.7584, "loss_ce": 0.6547155976295471, "loss_lvr": 0.9675015807151794, "loss_mode_switch": 0.0, "loss_total": 0.7514657378196716, "step": 1896 }, { "batch_size": 1, "epoch": 0.7584, "step": 1896, "tokens_per_device": 4876 }, { "epoch": 0.7584, "loss_ce": 1.463346242904663, "loss_lvr": 0.6746639013290405, "loss_mode_switch": 0.0, "loss_total": 1.5308126211166382, "step": 1896 }, { "batch_size": 1, "epoch": 0.7584, "step": 1896, "tokens_per_device": 5153 }, { "epoch": 0.7584, "loss_ce": 0.0009665982215665281, "loss_lvr": 0.19814105331897736, "loss_mode_switch": 0.0, "loss_total": 0.02078070305287838, "step": 1896 }, { "batch_size": 1, "epoch": 0.7584, "step": 1896, "tokens_per_device": 4876 }, { "epoch": 0.7584, "loss_ce": 0.022150540724396706, "loss_lvr": 0.25098273158073425, "loss_mode_switch": 0.0, "loss_total": 0.04724881425499916, "step": 1896 }, { "batch_size": 4, "epoch": 0.7584, "step": 1896, "tokens_per_device": 6316 }, { "epoch": 0.7584, "loss_ce": 0.35550814867019653, "loss_lvr": 0.8593255281448364, "loss_mode_switch": 0.0, "loss_total": 0.4414407014846802, "step": 1896 }, { "batch_size": 4, "epoch": 0.7584, "step": 1896, "tokens_per_device": 4184 }, { "epoch": 0.7584, "loss_ce": 0.013647746294736862, "loss_lvr": 0.6366411447525024, "loss_mode_switch": 0.0, "loss_total": 0.07731185853481293, "step": 1896 }, { "epoch": 0.7588, "grad_norm": 1.3972220420837402, "learning_rate": 1.4496115277766105e-06, "loss": 0.3081, "step": 1897 }, { "batch_size": 1, "epoch": 0.7588, "step": 1897, "tokens_per_device": 4670 }, { "epoch": 0.7588, "loss_ce": 0.024190831929445267, "loss_lvr": 0.5267468690872192, "loss_mode_switch": 0.0, "loss_total": 0.07686552405357361, "step": 1897 }, { "batch_size": 1, "epoch": 0.7588, "step": 1897, "tokens_per_device": 4563 }, { "epoch": 0.7588, "loss_ce": 0.2568884491920471, "loss_lvr": 0.692697286605835, "loss_mode_switch": 0.0, "loss_total": 0.32615816593170166, "step": 1897 }, { "batch_size": 1, "epoch": 0.7588, "step": 1897, "tokens_per_device": 4886 }, { "epoch": 0.7588, "loss_ce": 0.0017436916241422296, "loss_lvr": 0.24792957305908203, "loss_mode_switch": 0.0, "loss_total": 0.026536649093031883, "step": 1897 }, { "batch_size": 4, "epoch": 0.7588, "step": 1897, "tokens_per_device": 1820 }, { "epoch": 0.7588, "loss_ce": 0.038514137268066406, "loss_lvr": 0.9088971614837646, "loss_mode_switch": 0.0, "loss_total": 0.12940385937690735, "step": 1897 }, { "batch_size": 4, "epoch": 0.7588, "step": 1897, "tokens_per_device": 3744 }, { "epoch": 0.7588, "loss_ce": 0.6923167109489441, "loss_lvr": 0.8006952404975891, "loss_mode_switch": 0.0, "loss_total": 0.7723862528800964, "step": 1897 }, { "batch_size": 4, "epoch": 0.7588, "step": 1897, "tokens_per_device": 4164 }, { "epoch": 0.7588, "loss_ce": 0.04765160754323006, "loss_lvr": 0.8365928530693054, "loss_mode_switch": 0.0, "loss_total": 0.13131089508533478, "step": 1897 }, { "batch_size": 1, "epoch": 0.7588, "step": 1897, "tokens_per_device": 4869 }, { "epoch": 0.7588, "loss_ce": 0.18265919387340546, "loss_lvr": 0.3180108368396759, "loss_mode_switch": 0.0, "loss_total": 0.21446028351783752, "step": 1897 }, { "batch_size": 4, "epoch": 0.7588, "step": 1897, "tokens_per_device": 3780 }, { "epoch": 0.7588, "loss_ce": 0.1485058218240738, "loss_lvr": 0.6759061813354492, "loss_mode_switch": 0.0, "loss_total": 0.216096431016922, "step": 1897 }, { "epoch": 0.7592, "grad_norm": 1.4137405157089233, "learning_rate": 1.4450535420357325e-06, "loss": 0.2795, "step": 1898 }, { "batch_size": 4, "epoch": 0.7592, "step": 1898, "tokens_per_device": 3788 }, { "epoch": 0.7592, "loss_ce": 0.3161669671535492, "loss_lvr": 0.9605656266212463, "loss_mode_switch": 0.0, "loss_total": 0.4122235178947449, "step": 1898 }, { "batch_size": 4, "epoch": 0.7592, "step": 1898, "tokens_per_device": 4412 }, { "epoch": 0.7592, "loss_ce": 0.22638855874538422, "loss_lvr": 0.8295188546180725, "loss_mode_switch": 0.0, "loss_total": 0.3093404471874237, "step": 1898 }, { "batch_size": 4, "epoch": 0.7592, "step": 1898, "tokens_per_device": 3992 }, { "epoch": 0.7592, "loss_ce": 0.04328884184360504, "loss_lvr": 0.9420738816261292, "loss_mode_switch": 0.0, "loss_total": 0.1374962329864502, "step": 1898 }, { "batch_size": 4, "epoch": 0.7592, "step": 1898, "tokens_per_device": 4280 }, { "epoch": 0.7592, "loss_ce": 0.32683366537094116, "loss_lvr": 0.7719846963882446, "loss_mode_switch": 0.0, "loss_total": 0.4040321409702301, "step": 1898 }, { "batch_size": 1, "epoch": 0.7592, "step": 1898, "tokens_per_device": 4907 }, { "epoch": 0.7592, "loss_ce": 0.011465217918157578, "loss_lvr": 0.44321209192276, "loss_mode_switch": 0.0, "loss_total": 0.05578642711043358, "step": 1898 }, { "batch_size": 4, "epoch": 0.7592, "step": 1898, "tokens_per_device": 1808 }, { "epoch": 0.7592, "loss_ce": 0.1314437985420227, "loss_lvr": 0.8008916974067688, "loss_mode_switch": 0.0, "loss_total": 0.21153298020362854, "step": 1898 }, { "batch_size": 4, "epoch": 0.7592, "step": 1898, "tokens_per_device": 2888 }, { "epoch": 0.7592, "loss_ce": 0.18767084181308746, "loss_lvr": 0.7876992225646973, "loss_mode_switch": 0.0, "loss_total": 0.266440749168396, "step": 1898 }, { "batch_size": 1, "epoch": 0.7592, "step": 1898, "tokens_per_device": 5128 }, { "epoch": 0.7592, "loss_ce": 0.004956464283168316, "loss_lvr": 0.4815555512905121, "loss_mode_switch": 0.0, "loss_total": 0.05311202257871628, "step": 1898 }, { "epoch": 0.7596, "grad_norm": 1.397111177444458, "learning_rate": 1.440501522652224e-06, "loss": 0.3184, "step": 1899 }, { "batch_size": 4, "epoch": 0.7596, "step": 1899, "tokens_per_device": 3836 }, { "epoch": 0.7596, "loss_ce": 0.3180416524410248, "loss_lvr": 0.840378999710083, "loss_mode_switch": 0.0, "loss_total": 0.4020795524120331, "step": 1899 }, { "batch_size": 1, "epoch": 0.7596, "step": 1899, "tokens_per_device": 4854 }, { "epoch": 0.7596, "loss_ce": 0.00030994691769592464, "loss_lvr": 0.49311259388923645, "loss_mode_switch": 0.0, "loss_total": 0.049621209502220154, "step": 1899 }, { "batch_size": 1, "epoch": 0.7596, "step": 1899, "tokens_per_device": 4899 }, { "epoch": 0.7596, "loss_ce": 0.13409346342086792, "loss_lvr": 0.27718043327331543, "loss_mode_switch": 0.0, "loss_total": 0.16181150078773499, "step": 1899 }, { "batch_size": 1, "epoch": 0.7596, "step": 1899, "tokens_per_device": 5954 }, { "epoch": 0.7596, "loss_ce": 0.011733478866517544, "loss_lvr": 0.3732845187187195, "loss_mode_switch": 0.0, "loss_total": 0.04906193166971207, "step": 1899 }, { "batch_size": 4, "epoch": 0.7596, "step": 1899, "tokens_per_device": 1384 }, { "epoch": 0.7596, "loss_ce": 0.47731149196624756, "loss_lvr": 0.8576169013977051, "loss_mode_switch": 0.0, "loss_total": 0.5630731582641602, "step": 1899 }, { "batch_size": 4, "epoch": 0.7596, "step": 1899, "tokens_per_device": 6048 }, { "epoch": 0.7596, "loss_ce": 0.14164140820503235, "loss_lvr": 0.7631567120552063, "loss_mode_switch": 0.0, "loss_total": 0.21795707941055298, "step": 1899 }, { "batch_size": 4, "epoch": 0.7596, "step": 1899, "tokens_per_device": 3976 }, { "epoch": 0.7596, "loss_ce": 0.08748212456703186, "loss_lvr": 0.757172167301178, "loss_mode_switch": 0.0, "loss_total": 0.16319933533668518, "step": 1899 }, { "batch_size": 1, "epoch": 0.7596, "step": 1899, "tokens_per_device": 5040 }, { "epoch": 0.7596, "loss_ce": 0.004884436726570129, "loss_lvr": 0.3882579207420349, "loss_mode_switch": 0.0, "loss_total": 0.04371022805571556, "step": 1899 }, { "epoch": 0.76, "grad_norm": 1.3314766883850098, "learning_rate": 1.4359554772658551e-06, "loss": 0.302, "step": 1900 }, { "batch_size": 1, "epoch": 0.76, "step": 1900, "tokens_per_device": 4869 }, { "epoch": 0.76, "loss_ce": 0.00613839365541935, "loss_lvr": 0.2029171884059906, "loss_mode_switch": 0.0, "loss_total": 0.02643011324107647, "step": 1900 }, { "batch_size": 4, "epoch": 0.76, "step": 1900, "tokens_per_device": 4260 }, { "epoch": 0.76, "loss_ce": 0.3144340217113495, "loss_lvr": 0.8802909851074219, "loss_mode_switch": 0.0, "loss_total": 0.4024631381034851, "step": 1900 }, { "batch_size": 1, "epoch": 0.76, "step": 1900, "tokens_per_device": 4487 }, { "epoch": 0.76, "loss_ce": 0.05970826372504234, "loss_lvr": 0.3923093378543854, "loss_mode_switch": 0.0, "loss_total": 0.0989391952753067, "step": 1900 }, { "batch_size": 1, "epoch": 0.76, "step": 1900, "tokens_per_device": 4113 }, { "epoch": 0.76, "loss_ce": 0.11756118386983871, "loss_lvr": 0.438328355550766, "loss_mode_switch": 0.0, "loss_total": 0.16139401495456696, "step": 1900 }, { "batch_size": 1, "epoch": 0.76, "step": 1900, "tokens_per_device": 4874 }, { "epoch": 0.76, "loss_ce": 0.00017103712889365852, "loss_lvr": 0.4317660629749298, "loss_mode_switch": 0.0, "loss_total": 0.043347641825675964, "step": 1900 }, { "batch_size": 1, "epoch": 0.76, "step": 1900, "tokens_per_device": 4854 }, { "epoch": 0.76, "loss_ce": 0.00382452760823071, "loss_lvr": 0.3568597733974457, "loss_mode_switch": 0.0, "loss_total": 0.03951050713658333, "step": 1900 }, { "batch_size": 4, "epoch": 0.76, "step": 1900, "tokens_per_device": 6464 }, { "epoch": 0.76, "loss_ce": 0.19298751652240753, "loss_lvr": 0.6942257881164551, "loss_mode_switch": 0.0, "loss_total": 0.26241010427474976, "step": 1900 }, { "batch_size": 4, "epoch": 0.76, "step": 1900, "tokens_per_device": 11812 }, { "epoch": 0.76, "loss_ce": 0.08264865726232529, "loss_lvr": 0.9434976577758789, "loss_mode_switch": 0.0, "loss_total": 0.17699842154979706, "step": 1900 }, { "epoch": 0.7604, "grad_norm": 1.3462684154510498, "learning_rate": 1.4314154135063668e-06, "loss": 0.274, "step": 1901 }, { "batch_size": 4, "epoch": 0.7604, "step": 1901, "tokens_per_device": 8444 }, { "epoch": 0.7604, "loss_ce": 0.1223054975271225, "loss_lvr": 0.7716241478919983, "loss_mode_switch": 0.0, "loss_total": 0.19946791231632233, "step": 1901 }, { "batch_size": 4, "epoch": 0.7604, "step": 1901, "tokens_per_device": 4244 }, { "epoch": 0.7604, "loss_ce": 0.31498339772224426, "loss_lvr": 1.322381615638733, "loss_mode_switch": 0.0, "loss_total": 0.447221577167511, "step": 1901 }, { "batch_size": 4, "epoch": 0.7604, "step": 1901, "tokens_per_device": 9888 }, { "epoch": 0.7604, "loss_ce": 0.07402396947145462, "loss_lvr": 0.6513975262641907, "loss_mode_switch": 0.0, "loss_total": 0.13916373252868652, "step": 1901 }, { "batch_size": 1, "epoch": 0.7604, "step": 1901, "tokens_per_device": 4892 }, { "epoch": 0.7604, "loss_ce": 0.014657610096037388, "loss_lvr": 0.3195365369319916, "loss_mode_switch": 0.0, "loss_total": 0.04661126434803009, "step": 1901 }, { "batch_size": 4, "epoch": 0.7604, "step": 1901, "tokens_per_device": 3884 }, { "epoch": 0.7604, "loss_ce": 0.10742907971143723, "loss_lvr": 0.7778984308242798, "loss_mode_switch": 0.0, "loss_total": 0.1852189302444458, "step": 1901 }, { "batch_size": 1, "epoch": 0.7604, "step": 1901, "tokens_per_device": 4878 }, { "epoch": 0.7604, "loss_ce": 0.0191606767475605, "loss_lvr": 0.21475155651569366, "loss_mode_switch": 0.0, "loss_total": 0.04063583165407181, "step": 1901 }, { "batch_size": 4, "epoch": 0.7604, "step": 1901, "tokens_per_device": 5444 }, { "epoch": 0.7604, "loss_ce": 0.8244994282722473, "loss_lvr": 0.8341832160949707, "loss_mode_switch": 0.0, "loss_total": 0.9079177379608154, "step": 1901 }, { "batch_size": 4, "epoch": 0.7604, "step": 1901, "tokens_per_device": 4212 }, { "epoch": 0.7604, "loss_ce": 0.4562658369541168, "loss_lvr": 0.8403529524803162, "loss_mode_switch": 0.0, "loss_total": 0.5403011441230774, "step": 1901 }, { "epoch": 0.7608, "grad_norm": 1.4357986450195312, "learning_rate": 1.426881338993466e-06, "loss": 0.278, "step": 1902 }, { "batch_size": 4, "epoch": 0.7608, "step": 1902, "tokens_per_device": 4164 }, { "epoch": 0.7608, "loss_ce": 0.5011299848556519, "loss_lvr": 0.7524213194847107, "loss_mode_switch": 0.0, "loss_total": 0.5763721466064453, "step": 1902 }, { "batch_size": 1, "epoch": 0.7608, "step": 1902, "tokens_per_device": 5123 }, { "epoch": 0.7608, "loss_ce": 0.027573950588703156, "loss_lvr": 0.4915897250175476, "loss_mode_switch": 0.0, "loss_total": 0.07673291862010956, "step": 1902 }, { "batch_size": 4, "epoch": 0.7608, "step": 1902, "tokens_per_device": 9444 }, { "epoch": 0.7608, "loss_ce": 0.20828066766262054, "loss_lvr": 0.9293637871742249, "loss_mode_switch": 0.0, "loss_total": 0.30121704936027527, "step": 1902 }, { "batch_size": 4, "epoch": 0.7608, "step": 1902, "tokens_per_device": 2756 }, { "epoch": 0.7608, "loss_ce": 0.3383556604385376, "loss_lvr": 0.7938133478164673, "loss_mode_switch": 0.0, "loss_total": 0.4177370071411133, "step": 1902 }, { "batch_size": 4, "epoch": 0.7608, "step": 1902, "tokens_per_device": 1380 }, { "epoch": 0.7608, "loss_ce": 0.15650542080402374, "loss_lvr": 1.0104910135269165, "loss_mode_switch": 0.0, "loss_total": 0.2575545310974121, "step": 1902 }, { "batch_size": 4, "epoch": 0.7608, "step": 1902, "tokens_per_device": 5372 }, { "epoch": 0.7608, "loss_ce": 0.2514145076274872, "loss_lvr": 0.4856693148612976, "loss_mode_switch": 0.0, "loss_total": 0.2999814450740814, "step": 1902 }, { "batch_size": 4, "epoch": 0.7608, "step": 1902, "tokens_per_device": 5700 }, { "epoch": 0.7608, "loss_ce": 0.2703050374984741, "loss_lvr": 0.6690711975097656, "loss_mode_switch": 0.0, "loss_total": 0.33721214532852173, "step": 1902 }, { "batch_size": 4, "epoch": 0.7608, "step": 1902, "tokens_per_device": 10004 }, { "epoch": 0.7608, "loss_ce": 0.5353100299835205, "loss_lvr": 0.5838359594345093, "loss_mode_switch": 0.0, "loss_total": 0.5936936140060425, "step": 1902 }, { "epoch": 0.7612, "grad_norm": 2.2532215118408203, "learning_rate": 1.422353261336808e-06, "loss": 0.3189, "step": 1903 }, { "batch_size": 4, "epoch": 0.7612, "step": 1903, "tokens_per_device": 5156 }, { "epoch": 0.7612, "loss_ce": 0.12278007715940475, "loss_lvr": 0.9191983938217163, "loss_mode_switch": 0.0, "loss_total": 0.21469992399215698, "step": 1903 }, { "batch_size": 1, "epoch": 0.7612, "step": 1903, "tokens_per_device": 5485 }, { "epoch": 0.7612, "loss_ce": 0.007103991694748402, "loss_lvr": 0.33336418867111206, "loss_mode_switch": 0.0, "loss_total": 0.04044041037559509, "step": 1903 }, { "batch_size": 4, "epoch": 0.7612, "step": 1903, "tokens_per_device": 3812 }, { "epoch": 0.7612, "loss_ce": 0.644988477230072, "loss_lvr": 0.894264280796051, "loss_mode_switch": 0.0, "loss_total": 0.7344149351119995, "step": 1903 }, { "batch_size": 1, "epoch": 0.7612, "step": 1903, "tokens_per_device": 5121 }, { "epoch": 0.7612, "loss_ce": 0.04802997410297394, "loss_lvr": 0.5814498066902161, "loss_mode_switch": 0.0, "loss_total": 0.10617496073246002, "step": 1903 }, { "batch_size": 1, "epoch": 0.7612, "step": 1903, "tokens_per_device": 5126 }, { "epoch": 0.7612, "loss_ce": 0.10962092131376266, "loss_lvr": 0.3686305582523346, "loss_mode_switch": 0.0, "loss_total": 0.14648397266864777, "step": 1903 }, { "batch_size": 1, "epoch": 0.7612, "step": 1903, "tokens_per_device": 5124 }, { "epoch": 0.7612, "loss_ce": 0.03449070081114769, "loss_lvr": 0.17757420241832733, "loss_mode_switch": 0.0, "loss_total": 0.05224812030792236, "step": 1903 }, { "batch_size": 4, "epoch": 0.7612, "step": 1903, "tokens_per_device": 4268 }, { "epoch": 0.7612, "loss_ce": 0.5235096216201782, "loss_lvr": 0.7889696359634399, "loss_mode_switch": 0.0, "loss_total": 0.6024065613746643, "step": 1903 }, { "batch_size": 4, "epoch": 0.7612, "step": 1903, "tokens_per_device": 4968 }, { "epoch": 0.7612, "loss_ce": 0.21361832320690155, "loss_lvr": 0.8587864637374878, "loss_mode_switch": 0.0, "loss_total": 0.29949697852134705, "step": 1903 }, { "epoch": 0.7616, "grad_norm": 1.466609239578247, "learning_rate": 1.4178311881359785e-06, "loss": 0.2756, "step": 1904 }, { "batch_size": 4, "epoch": 0.7616, "step": 1904, "tokens_per_device": 14228 }, { "epoch": 0.7616, "loss_ce": 0.0007709485362283885, "loss_lvr": 0.5013094544410706, "loss_mode_switch": 0.0, "loss_total": 0.05090189352631569, "step": 1904 }, { "batch_size": 1, "epoch": 0.7616, "step": 1904, "tokens_per_device": 5071 }, { "epoch": 0.7616, "loss_ce": 0.2968865931034088, "loss_lvr": 0.44142845273017883, "loss_mode_switch": 0.0, "loss_total": 0.34102943539619446, "step": 1904 }, { "batch_size": 4, "epoch": 0.7616, "step": 1904, "tokens_per_device": 2304 }, { "epoch": 0.7616, "loss_ce": 0.3029760718345642, "loss_lvr": 0.7945723533630371, "loss_mode_switch": 0.0, "loss_total": 0.38243329524993896, "step": 1904 }, { "batch_size": 4, "epoch": 0.7616, "step": 1904, "tokens_per_device": 3512 }, { "epoch": 0.7616, "loss_ce": 0.2953321635723114, "loss_lvr": 0.5554715394973755, "loss_mode_switch": 0.0, "loss_total": 0.3508793115615845, "step": 1904 }, { "batch_size": 1, "epoch": 0.7616, "step": 1904, "tokens_per_device": 4908 }, { "epoch": 0.7616, "loss_ce": 0.0076368870213627815, "loss_lvr": 0.6057636141777039, "loss_mode_switch": 0.0, "loss_total": 0.06821324676275253, "step": 1904 }, { "batch_size": 4, "epoch": 0.7616, "step": 1904, "tokens_per_device": 4820 }, { "epoch": 0.7616, "loss_ce": 0.1434047520160675, "loss_lvr": 0.8870298266410828, "loss_mode_switch": 0.0, "loss_total": 0.2321077287197113, "step": 1904 }, { "batch_size": 1, "epoch": 0.7616, "step": 1904, "tokens_per_device": 5107 }, { "epoch": 0.7616, "loss_ce": 0.006721027661114931, "loss_lvr": 0.39461055397987366, "loss_mode_switch": 0.0, "loss_total": 0.04618208482861519, "step": 1904 }, { "batch_size": 4, "epoch": 0.7616, "step": 1904, "tokens_per_device": 4964 }, { "epoch": 0.7616, "loss_ce": 0.2664250135421753, "loss_lvr": 0.6673216819763184, "loss_mode_switch": 0.0, "loss_total": 0.33315718173980713, "step": 1904 }, { "epoch": 0.762, "grad_norm": 1.311262607574463, "learning_rate": 1.4133151269804873e-06, "loss": 0.2754, "step": 1905 }, { "batch_size": 4, "epoch": 0.762, "step": 1905, "tokens_per_device": 3764 }, { "epoch": 0.762, "loss_ce": 0.548677921295166, "loss_lvr": 0.9220200181007385, "loss_mode_switch": 0.0, "loss_total": 0.6408799290657043, "step": 1905 }, { "batch_size": 4, "epoch": 0.762, "step": 1905, "tokens_per_device": 7172 }, { "epoch": 0.762, "loss_ce": 0.009068949148058891, "loss_lvr": 0.7108136415481567, "loss_mode_switch": 0.0, "loss_total": 0.0801503136754036, "step": 1905 }, { "batch_size": 1, "epoch": 0.762, "step": 1905, "tokens_per_device": 5071 }, { "epoch": 0.762, "loss_ce": 0.0006695010815747082, "loss_lvr": 0.3821280598640442, "loss_mode_switch": 0.0, "loss_total": 0.0388823077082634, "step": 1905 }, { "batch_size": 1, "epoch": 0.762, "step": 1905, "tokens_per_device": 5909 }, { "epoch": 0.762, "loss_ce": 0.04984560236334801, "loss_lvr": 0.5540897250175476, "loss_mode_switch": 0.0, "loss_total": 0.10525457561016083, "step": 1905 }, { "batch_size": 4, "epoch": 0.762, "step": 1905, "tokens_per_device": 5160 }, { "epoch": 0.762, "loss_ce": 0.3679993152618408, "loss_lvr": 0.7133846282958984, "loss_mode_switch": 0.0, "loss_total": 0.4393377900123596, "step": 1905 }, { "batch_size": 4, "epoch": 0.762, "step": 1905, "tokens_per_device": 5884 }, { "epoch": 0.762, "loss_ce": 0.44197914004325867, "loss_lvr": 0.8770499229431152, "loss_mode_switch": 0.0, "loss_total": 0.5296841263771057, "step": 1905 }, { "batch_size": 4, "epoch": 0.762, "step": 1905, "tokens_per_device": 4784 }, { "epoch": 0.762, "loss_ce": 0.2935057282447815, "loss_lvr": 0.8574013710021973, "loss_mode_switch": 0.0, "loss_total": 0.3792458772659302, "step": 1905 }, { "batch_size": 4, "epoch": 0.762, "step": 1905, "tokens_per_device": 4240 }, { "epoch": 0.762, "loss_ce": 0.1894916146993637, "loss_lvr": 0.8527050614356995, "loss_mode_switch": 0.0, "loss_total": 0.2747621238231659, "step": 1905 }, { "epoch": 0.7624, "grad_norm": 1.4079523086547852, "learning_rate": 1.4088050854497587e-06, "loss": 0.294, "step": 1906 }, { "batch_size": 1, "epoch": 0.7624, "step": 1906, "tokens_per_device": 4880 }, { "epoch": 0.7624, "loss_ce": 0.008685384877026081, "loss_lvr": 0.7413541674613953, "loss_mode_switch": 0.0, "loss_total": 0.08282080292701721, "step": 1906 }, { "batch_size": 4, "epoch": 0.7624, "step": 1906, "tokens_per_device": 5444 }, { "epoch": 0.7624, "loss_ce": 0.2320045828819275, "loss_lvr": 0.8646129965782166, "loss_mode_switch": 0.0, "loss_total": 0.3184658885002136, "step": 1906 }, { "batch_size": 4, "epoch": 0.7624, "step": 1906, "tokens_per_device": 6596 }, { "epoch": 0.7624, "loss_ce": 0.13636499643325806, "loss_lvr": 0.66367506980896, "loss_mode_switch": 0.0, "loss_total": 0.20273250341415405, "step": 1906 }, { "batch_size": 4, "epoch": 0.7624, "step": 1906, "tokens_per_device": 4556 }, { "epoch": 0.7624, "loss_ce": 0.11944370716810226, "loss_lvr": 0.873077929019928, "loss_mode_switch": 0.0, "loss_total": 0.2067514955997467, "step": 1906 }, { "batch_size": 4, "epoch": 0.7624, "step": 1906, "tokens_per_device": 2712 }, { "epoch": 0.7624, "loss_ce": 0.10906227678060532, "loss_lvr": 0.8264552354812622, "loss_mode_switch": 0.0, "loss_total": 0.1917078047990799, "step": 1906 }, { "batch_size": 4, "epoch": 0.7624, "step": 1906, "tokens_per_device": 2764 }, { "epoch": 0.7624, "loss_ce": 0.08429605513811111, "loss_lvr": 0.9235508441925049, "loss_mode_switch": 0.0, "loss_total": 0.17665114998817444, "step": 1906 }, { "batch_size": 1, "epoch": 0.7624, "step": 1906, "tokens_per_device": 4899 }, { "epoch": 0.7624, "loss_ce": 0.007367810234427452, "loss_lvr": 0.2726908326148987, "loss_mode_switch": 0.0, "loss_total": 0.03463689237833023, "step": 1906 }, { "batch_size": 1, "epoch": 0.7624, "step": 1906, "tokens_per_device": 5127 }, { "epoch": 0.7624, "loss_ce": 0.002312735188752413, "loss_lvr": 0.8239103555679321, "loss_mode_switch": 0.0, "loss_total": 0.08470377326011658, "step": 1906 }, { "epoch": 0.7628, "grad_norm": 1.2253527641296387, "learning_rate": 1.404301071113106e-06, "loss": 0.2573, "step": 1907 }, { "batch_size": 4, "epoch": 0.7628, "step": 1907, "tokens_per_device": 4272 }, { "epoch": 0.7628, "loss_ce": 0.22955337166786194, "loss_lvr": 1.1104917526245117, "loss_mode_switch": 0.0, "loss_total": 0.3406025469303131, "step": 1907 }, { "batch_size": 1, "epoch": 0.7628, "step": 1907, "tokens_per_device": 5268 }, { "epoch": 0.7628, "loss_ce": 0.22856640815734863, "loss_lvr": 0.5682014226913452, "loss_mode_switch": 0.0, "loss_total": 0.2853865623474121, "step": 1907 }, { "batch_size": 4, "epoch": 0.7628, "step": 1907, "tokens_per_device": 3784 }, { "epoch": 0.7628, "loss_ce": 0.1610191911458969, "loss_lvr": 1.0382158756256104, "loss_mode_switch": 0.0, "loss_total": 0.2648407816886902, "step": 1907 }, { "batch_size": 4, "epoch": 0.7628, "step": 1907, "tokens_per_device": 4240 }, { "epoch": 0.7628, "loss_ce": 0.17205005884170532, "loss_lvr": 0.7978925108909607, "loss_mode_switch": 0.0, "loss_total": 0.2518393099308014, "step": 1907 }, { "batch_size": 1, "epoch": 0.7628, "step": 1907, "tokens_per_device": 5164 }, { "epoch": 0.7628, "loss_ce": 0.0148647865280509, "loss_lvr": 0.25844183564186096, "loss_mode_switch": 0.0, "loss_total": 0.04070897027850151, "step": 1907 }, { "batch_size": 4, "epoch": 0.7628, "step": 1907, "tokens_per_device": 1424 }, { "epoch": 0.7628, "loss_ce": 0.11744373291730881, "loss_lvr": 0.8616941571235657, "loss_mode_switch": 0.0, "loss_total": 0.20361314713954926, "step": 1907 }, { "batch_size": 4, "epoch": 0.7628, "step": 1907, "tokens_per_device": 12332 }, { "epoch": 0.7628, "loss_ce": 0.23574768006801605, "loss_lvr": 0.7624519467353821, "loss_mode_switch": 0.0, "loss_total": 0.311992883682251, "step": 1907 }, { "batch_size": 4, "epoch": 0.7628, "step": 1907, "tokens_per_device": 4540 }, { "epoch": 0.7628, "loss_ce": 0.02404925972223282, "loss_lvr": 0.7945684194564819, "loss_mode_switch": 0.0, "loss_total": 0.10350610315799713, "step": 1907 }, { "epoch": 0.7632, "grad_norm": 1.4267765283584595, "learning_rate": 1.3998030915297357e-06, "loss": 0.2498, "step": 1908 }, { "batch_size": 1, "epoch": 0.7632, "step": 1908, "tokens_per_device": 4942 }, { "epoch": 0.7632, "loss_ce": 0.015054905787110329, "loss_lvr": 0.31293806433677673, "loss_mode_switch": 0.0, "loss_total": 0.04634871333837509, "step": 1908 }, { "batch_size": 4, "epoch": 0.7632, "step": 1908, "tokens_per_device": 4492 }, { "epoch": 0.7632, "loss_ce": 0.014462439343333244, "loss_lvr": 0.7271093130111694, "loss_mode_switch": 0.0, "loss_total": 0.08717337250709534, "step": 1908 }, { "batch_size": 4, "epoch": 0.7632, "step": 1908, "tokens_per_device": 1304 }, { "epoch": 0.7632, "loss_ce": 0.23440667986869812, "loss_lvr": 1.0671778917312622, "loss_mode_switch": 0.0, "loss_total": 0.3411244750022888, "step": 1908 }, { "batch_size": 4, "epoch": 0.7632, "step": 1908, "tokens_per_device": 2960 }, { "epoch": 0.7632, "loss_ce": 0.08910652995109558, "loss_lvr": 1.0316643714904785, "loss_mode_switch": 0.0, "loss_total": 0.19227296113967896, "step": 1908 }, { "batch_size": 4, "epoch": 0.7632, "step": 1908, "tokens_per_device": 2028 }, { "epoch": 0.7632, "loss_ce": 0.5331051349639893, "loss_lvr": 0.6774035692214966, "loss_mode_switch": 0.0, "loss_total": 0.6008455157279968, "step": 1908 }, { "batch_size": 4, "epoch": 0.7632, "step": 1908, "tokens_per_device": 4324 }, { "epoch": 0.7632, "loss_ce": 0.18480201065540314, "loss_lvr": 1.4338053464889526, "loss_mode_switch": 0.0, "loss_total": 0.32818254828453064, "step": 1908 }, { "batch_size": 4, "epoch": 0.7632, "step": 1908, "tokens_per_device": 3360 }, { "epoch": 0.7632, "loss_ce": 0.3862035870552063, "loss_lvr": 0.7473487257957458, "loss_mode_switch": 0.0, "loss_total": 0.4609384536743164, "step": 1908 }, { "batch_size": 4, "epoch": 0.7632, "step": 1908, "tokens_per_device": 14156 }, { "epoch": 0.7632, "loss_ce": 0.5469996333122253, "loss_lvr": 0.41855961084365845, "loss_mode_switch": 0.0, "loss_total": 0.5888556241989136, "step": 1908 }, { "epoch": 0.7636, "grad_norm": 1.1661403179168701, "learning_rate": 1.3953111542487202e-06, "loss": 0.2372, "step": 1909 }, { "batch_size": 4, "epoch": 0.7636, "step": 1909, "tokens_per_device": 5088 }, { "epoch": 0.7636, "loss_ce": 0.43901315331459045, "loss_lvr": 0.595658540725708, "loss_mode_switch": 0.0, "loss_total": 0.4985789954662323, "step": 1909 }, { "batch_size": 1, "epoch": 0.7636, "step": 1909, "tokens_per_device": 5007 }, { "epoch": 0.7636, "loss_ce": 0.003787170397117734, "loss_lvr": 0.2644602060317993, "loss_mode_switch": 0.0, "loss_total": 0.030233191326260567, "step": 1909 }, { "batch_size": 4, "epoch": 0.7636, "step": 1909, "tokens_per_device": 2624 }, { "epoch": 0.7636, "loss_ce": 0.38673004508018494, "loss_lvr": 0.9666515588760376, "loss_mode_switch": 0.0, "loss_total": 0.48339521884918213, "step": 1909 }, { "batch_size": 1, "epoch": 0.7636, "step": 1909, "tokens_per_device": 4900 }, { "epoch": 0.7636, "loss_ce": 0.000296877435175702, "loss_lvr": 0.5317091345787048, "loss_mode_switch": 0.0, "loss_total": 0.05346779152750969, "step": 1909 }, { "batch_size": 1, "epoch": 0.7636, "step": 1909, "tokens_per_device": 6035 }, { "epoch": 0.7636, "loss_ce": 0.02294205315411091, "loss_lvr": 0.21665221452713013, "loss_mode_switch": 0.0, "loss_total": 0.04460727423429489, "step": 1909 }, { "batch_size": 4, "epoch": 0.7636, "step": 1909, "tokens_per_device": 4204 }, { "epoch": 0.7636, "loss_ce": 0.40749216079711914, "loss_lvr": 0.9651216268539429, "loss_mode_switch": 0.0, "loss_total": 0.5040042996406555, "step": 1909 }, { "batch_size": 1, "epoch": 0.7636, "step": 1909, "tokens_per_device": 4940 }, { "epoch": 0.7636, "loss_ce": 0.0473981648683548, "loss_lvr": 0.3948809504508972, "loss_mode_switch": 0.0, "loss_total": 0.08688625693321228, "step": 1909 }, { "batch_size": 4, "epoch": 0.7636, "step": 1909, "tokens_per_device": 1480 }, { "epoch": 0.7636, "loss_ce": 0.6651308536529541, "loss_lvr": 0.9538239240646362, "loss_mode_switch": 0.0, "loss_total": 0.7605132460594177, "step": 1909 }, { "epoch": 0.764, "grad_norm": 1.5172619819641113, "learning_rate": 1.39082526680899e-06, "loss": 0.2733, "step": 1910 }, { "batch_size": 4, "epoch": 0.764, "step": 1910, "tokens_per_device": 5660 }, { "epoch": 0.764, "loss_ce": 0.25174611806869507, "loss_lvr": 0.5329521894454956, "loss_mode_switch": 0.0, "loss_total": 0.3050413429737091, "step": 1910 }, { "batch_size": 4, "epoch": 0.764, "step": 1910, "tokens_per_device": 4284 }, { "epoch": 0.764, "loss_ce": 0.1714475452899933, "loss_lvr": 0.8540383577346802, "loss_mode_switch": 0.0, "loss_total": 0.2568513751029968, "step": 1910 }, { "batch_size": 4, "epoch": 0.764, "step": 1910, "tokens_per_device": 3756 }, { "epoch": 0.764, "loss_ce": 0.2104548215866089, "loss_lvr": 0.9274632334709167, "loss_mode_switch": 0.0, "loss_total": 0.3032011389732361, "step": 1910 }, { "batch_size": 4, "epoch": 0.764, "step": 1910, "tokens_per_device": 2592 }, { "epoch": 0.764, "loss_ce": 0.24588584899902344, "loss_lvr": 0.8407405614852905, "loss_mode_switch": 0.0, "loss_total": 0.329959899187088, "step": 1910 }, { "batch_size": 1, "epoch": 0.764, "step": 1910, "tokens_per_device": 5118 }, { "epoch": 0.764, "loss_ce": 0.752896785736084, "loss_lvr": 0.6498854756355286, "loss_mode_switch": 0.0, "loss_total": 0.8178853392601013, "step": 1910 }, { "batch_size": 1, "epoch": 0.764, "step": 1910, "tokens_per_device": 4871 }, { "epoch": 0.764, "loss_ce": 0.023882074281573296, "loss_lvr": 0.3167652189731598, "loss_mode_switch": 0.0, "loss_total": 0.05555859953165054, "step": 1910 }, { "batch_size": 4, "epoch": 0.764, "step": 1910, "tokens_per_device": 1644 }, { "epoch": 0.764, "loss_ce": 0.24062000215053558, "loss_lvr": 0.8569639921188354, "loss_mode_switch": 0.0, "loss_total": 0.3263164162635803, "step": 1910 }, { "batch_size": 4, "epoch": 0.764, "step": 1910, "tokens_per_device": 4372 }, { "epoch": 0.764, "loss_ce": 0.07943655550479889, "loss_lvr": 0.8718513250350952, "loss_mode_switch": 0.0, "loss_total": 0.16662168502807617, "step": 1910 }, { "epoch": 0.7644, "grad_norm": 1.452845811843872, "learning_rate": 1.386345436739328e-06, "loss": 0.3484, "step": 1911 }, { "batch_size": 1, "epoch": 0.7644, "step": 1911, "tokens_per_device": 4854 }, { "epoch": 0.7644, "loss_ce": 0.0003405229654163122, "loss_lvr": 0.2703632116317749, "loss_mode_switch": 0.0, "loss_total": 0.027376845479011536, "step": 1911 }, { "batch_size": 1, "epoch": 0.7644, "step": 1911, "tokens_per_device": 5004 }, { "epoch": 0.7644, "loss_ce": 0.020803401246666908, "loss_lvr": 0.37173911929130554, "loss_mode_switch": 0.0, "loss_total": 0.05797731131315231, "step": 1911 }, { "batch_size": 4, "epoch": 0.7644, "step": 1911, "tokens_per_device": 6112 }, { "epoch": 0.7644, "loss_ce": 0.07770837098360062, "loss_lvr": 0.8105279803276062, "loss_mode_switch": 0.0, "loss_total": 0.1587611734867096, "step": 1911 }, { "batch_size": 1, "epoch": 0.7644, "step": 1911, "tokens_per_device": 4698 }, { "epoch": 0.7644, "loss_ce": 0.058415498584508896, "loss_lvr": 0.33841824531555176, "loss_mode_switch": 0.0, "loss_total": 0.09225732088088989, "step": 1911 }, { "batch_size": 4, "epoch": 0.7644, "step": 1911, "tokens_per_device": 3224 }, { "epoch": 0.7644, "loss_ce": 0.552696943283081, "loss_lvr": 0.6280955076217651, "loss_mode_switch": 0.0, "loss_total": 0.6155064702033997, "step": 1911 }, { "batch_size": 4, "epoch": 0.7644, "step": 1911, "tokens_per_device": 5200 }, { "epoch": 0.7644, "loss_ce": 0.7846020460128784, "loss_lvr": 0.7260892987251282, "loss_mode_switch": 0.0, "loss_total": 0.8572109937667847, "step": 1911 }, { "batch_size": 4, "epoch": 0.7644, "step": 1911, "tokens_per_device": 5812 }, { "epoch": 0.7644, "loss_ce": 0.3897906541824341, "loss_lvr": 0.36707061529159546, "loss_mode_switch": 0.0, "loss_total": 0.4264977276325226, "step": 1911 }, { "batch_size": 4, "epoch": 0.7644, "step": 1911, "tokens_per_device": 1596 }, { "epoch": 0.7644, "loss_ce": 0.08583814650774002, "loss_lvr": 0.9690659642219543, "loss_mode_switch": 0.0, "loss_total": 0.18274474143981934, "step": 1911 }, { "epoch": 0.7648, "grad_norm": 1.1756818294525146, "learning_rate": 1.3818716715583452e-06, "loss": 0.2484, "step": 1912 }, { "batch_size": 1, "epoch": 0.7648, "step": 1912, "tokens_per_device": 4938 }, { "epoch": 0.7648, "loss_ce": 0.18768130242824554, "loss_lvr": 0.9053537249565125, "loss_mode_switch": 0.0, "loss_total": 0.2782166600227356, "step": 1912 }, { "batch_size": 1, "epoch": 0.7648, "step": 1912, "tokens_per_device": 5100 }, { "epoch": 0.7648, "loss_ce": 0.019265037029981613, "loss_lvr": 0.40689200162887573, "loss_mode_switch": 0.0, "loss_total": 0.059954237192869186, "step": 1912 }, { "batch_size": 4, "epoch": 0.7648, "step": 1912, "tokens_per_device": 6140 }, { "epoch": 0.7648, "loss_ce": 0.20599037408828735, "loss_lvr": 0.6665418148040771, "loss_mode_switch": 0.0, "loss_total": 0.2726445496082306, "step": 1912 }, { "batch_size": 4, "epoch": 0.7648, "step": 1912, "tokens_per_device": 4280 }, { "epoch": 0.7648, "loss_ce": 0.7120816707611084, "loss_lvr": 1.133571743965149, "loss_mode_switch": 0.0, "loss_total": 0.8254388570785522, "step": 1912 }, { "batch_size": 4, "epoch": 0.7648, "step": 1912, "tokens_per_device": 4200 }, { "epoch": 0.7648, "loss_ce": 0.18124713003635406, "loss_lvr": 0.7106411457061768, "loss_mode_switch": 0.0, "loss_total": 0.25231122970581055, "step": 1912 }, { "batch_size": 1, "epoch": 0.7648, "step": 1912, "tokens_per_device": 4880 }, { "epoch": 0.7648, "loss_ce": 0.04080667719244957, "loss_lvr": 0.4242129325866699, "loss_mode_switch": 0.0, "loss_total": 0.0832279697060585, "step": 1912 }, { "batch_size": 4, "epoch": 0.7648, "step": 1912, "tokens_per_device": 2644 }, { "epoch": 0.7648, "loss_ce": 0.12446289509534836, "loss_lvr": 0.9119800329208374, "loss_mode_switch": 0.0, "loss_total": 0.21566089987754822, "step": 1912 }, { "batch_size": 4, "epoch": 0.7648, "step": 1912, "tokens_per_device": 3792 }, { "epoch": 0.7648, "loss_ce": 0.16364489495754242, "loss_lvr": 0.8823985457420349, "loss_mode_switch": 0.0, "loss_total": 0.2518847584724426, "step": 1912 }, { "epoch": 0.7652, "grad_norm": 1.3826128244400024, "learning_rate": 1.3774039787744776e-06, "loss": 0.2967, "step": 1913 }, { "batch_size": 4, "epoch": 0.7652, "step": 1913, "tokens_per_device": 16016 }, { "epoch": 0.7652, "loss_ce": 0.2606680989265442, "loss_lvr": 0.33313366770744324, "loss_mode_switch": 0.0, "loss_total": 0.2939814627170563, "step": 1913 }, { "batch_size": 4, "epoch": 0.7652, "step": 1913, "tokens_per_device": 2156 }, { "epoch": 0.7652, "loss_ce": 0.20754319429397583, "loss_lvr": 0.8778783679008484, "loss_mode_switch": 0.0, "loss_total": 0.29533103108406067, "step": 1913 }, { "batch_size": 4, "epoch": 0.7652, "step": 1913, "tokens_per_device": 4200 }, { "epoch": 0.7652, "loss_ce": 0.3965245485305786, "loss_lvr": 0.8119155764579773, "loss_mode_switch": 0.0, "loss_total": 0.4777161180973053, "step": 1913 }, { "batch_size": 4, "epoch": 0.7652, "step": 1913, "tokens_per_device": 2632 }, { "epoch": 0.7652, "loss_ce": 0.39546045660972595, "loss_lvr": 0.6964106559753418, "loss_mode_switch": 0.0, "loss_total": 0.46510154008865356, "step": 1913 }, { "batch_size": 4, "epoch": 0.7652, "step": 1913, "tokens_per_device": 5124 }, { "epoch": 0.7652, "loss_ce": 0.00034154130844399333, "loss_lvr": 0.6596807837486267, "loss_mode_switch": 0.0, "loss_total": 0.0663096234202385, "step": 1913 }, { "batch_size": 4, "epoch": 0.7652, "step": 1913, "tokens_per_device": 7376 }, { "epoch": 0.7652, "loss_ce": 0.027319196611642838, "loss_lvr": 0.6397321820259094, "loss_mode_switch": 0.0, "loss_total": 0.09129241108894348, "step": 1913 }, { "batch_size": 4, "epoch": 0.7652, "step": 1913, "tokens_per_device": 2640 }, { "epoch": 0.7652, "loss_ce": 0.17119836807250977, "loss_lvr": 0.7315442562103271, "loss_mode_switch": 0.0, "loss_total": 0.244352787733078, "step": 1913 }, { "batch_size": 4, "epoch": 0.7652, "step": 1913, "tokens_per_device": 4176 }, { "epoch": 0.7652, "loss_ce": 0.11830350756645203, "loss_lvr": 0.640663206577301, "loss_mode_switch": 0.0, "loss_total": 0.18236982822418213, "step": 1913 }, { "epoch": 0.7656, "grad_norm": 1.3598591089248657, "learning_rate": 1.3729423658859654e-06, "loss": 0.2891, "step": 1914 }, { "batch_size": 4, "epoch": 0.7656, "step": 1914, "tokens_per_device": 3760 }, { "epoch": 0.7656, "loss_ce": 0.04732956364750862, "loss_lvr": 0.694617748260498, "loss_mode_switch": 0.0, "loss_total": 0.11679133772850037, "step": 1914 }, { "batch_size": 4, "epoch": 0.7656, "step": 1914, "tokens_per_device": 4312 }, { "epoch": 0.7656, "loss_ce": 0.4503191411495209, "loss_lvr": 0.8546132445335388, "loss_mode_switch": 0.0, "loss_total": 0.5357804894447327, "step": 1914 }, { "batch_size": 4, "epoch": 0.7656, "step": 1914, "tokens_per_device": 5704 }, { "epoch": 0.7656, "loss_ce": 0.17781344056129456, "loss_lvr": 0.8138169050216675, "loss_mode_switch": 0.0, "loss_total": 0.25919514894485474, "step": 1914 }, { "batch_size": 4, "epoch": 0.7656, "step": 1914, "tokens_per_device": 1548 }, { "epoch": 0.7656, "loss_ce": 0.4964876174926758, "loss_lvr": 0.9463779926300049, "loss_mode_switch": 0.0, "loss_total": 0.5911254286766052, "step": 1914 }, { "batch_size": 1, "epoch": 0.7656, "step": 1914, "tokens_per_device": 4882 }, { "epoch": 0.7656, "loss_ce": 0.0012736017815768719, "loss_lvr": 0.33098819851875305, "loss_mode_switch": 0.0, "loss_total": 0.034372422844171524, "step": 1914 }, { "batch_size": 4, "epoch": 0.7656, "step": 1914, "tokens_per_device": 2648 }, { "epoch": 0.7656, "loss_ce": 0.5410223603248596, "loss_lvr": 1.6152054071426392, "loss_mode_switch": 0.0, "loss_total": 0.7025429010391235, "step": 1914 }, { "batch_size": 4, "epoch": 0.7656, "step": 1914, "tokens_per_device": 4992 }, { "epoch": 0.7656, "loss_ce": 0.437858909368515, "loss_lvr": 0.8053693175315857, "loss_mode_switch": 0.0, "loss_total": 0.5183958411216736, "step": 1914 }, { "batch_size": 4, "epoch": 0.7656, "step": 1914, "tokens_per_device": 5584 }, { "epoch": 0.7656, "loss_ce": 0.2754218578338623, "loss_lvr": 0.7488810420036316, "loss_mode_switch": 0.0, "loss_total": 0.35030996799468994, "step": 1914 }, { "epoch": 0.766, "grad_norm": 1.4049744606018066, "learning_rate": 1.368486840380851e-06, "loss": 0.3093, "step": 1915 }, { "batch_size": 4, "epoch": 0.766, "step": 1915, "tokens_per_device": 4264 }, { "epoch": 0.766, "loss_ce": 0.04803727939724922, "loss_lvr": 0.772761344909668, "loss_mode_switch": 0.0, "loss_total": 0.1253134161233902, "step": 1915 }, { "batch_size": 4, "epoch": 0.766, "step": 1915, "tokens_per_device": 3756 }, { "epoch": 0.766, "loss_ce": 0.21584290266036987, "loss_lvr": 0.9129002690315247, "loss_mode_switch": 0.0, "loss_total": 0.30713292956352234, "step": 1915 }, { "batch_size": 4, "epoch": 0.766, "step": 1915, "tokens_per_device": 1920 }, { "epoch": 0.766, "loss_ce": 0.21583694219589233, "loss_lvr": 0.8903069496154785, "loss_mode_switch": 0.0, "loss_total": 0.30486762523651123, "step": 1915 }, { "batch_size": 1, "epoch": 0.766, "step": 1915, "tokens_per_device": 4743 }, { "epoch": 0.766, "loss_ce": 0.01736314594745636, "loss_lvr": 0.23721055686473846, "loss_mode_switch": 0.0, "loss_total": 0.04108420014381409, "step": 1915 }, { "batch_size": 4, "epoch": 0.766, "step": 1915, "tokens_per_device": 2732 }, { "epoch": 0.766, "loss_ce": 0.6314266324043274, "loss_lvr": 0.8820905089378357, "loss_mode_switch": 0.0, "loss_total": 0.7196356654167175, "step": 1915 }, { "batch_size": 4, "epoch": 0.766, "step": 1915, "tokens_per_device": 4736 }, { "epoch": 0.766, "loss_ce": 0.04373741149902344, "loss_lvr": 0.7818746566772461, "loss_mode_switch": 0.0, "loss_total": 0.12192487716674805, "step": 1915 }, { "batch_size": 4, "epoch": 0.766, "step": 1915, "tokens_per_device": 10736 }, { "epoch": 0.766, "loss_ce": 0.11105134338140488, "loss_lvr": 0.905680239200592, "loss_mode_switch": 0.0, "loss_total": 0.20161937177181244, "step": 1915 }, { "batch_size": 4, "epoch": 0.766, "step": 1915, "tokens_per_device": 3876 }, { "epoch": 0.766, "loss_ce": 0.3830679655075073, "loss_lvr": 0.8468369841575623, "loss_mode_switch": 0.0, "loss_total": 0.467751681804657, "step": 1915 }, { "epoch": 0.7664, "grad_norm": 1.1677145957946777, "learning_rate": 1.364037409736954e-06, "loss": 0.2755, "step": 1916 }, { "batch_size": 1, "epoch": 0.7664, "step": 1916, "tokens_per_device": 5162 }, { "epoch": 0.7664, "loss_ce": 0.19390539824962616, "loss_lvr": 0.22796781361103058, "loss_mode_switch": 0.0, "loss_total": 0.2167021781206131, "step": 1916 }, { "batch_size": 1, "epoch": 0.7664, "step": 1916, "tokens_per_device": 4233 }, { "epoch": 0.7664, "loss_ce": 0.27162113785743713, "loss_lvr": 0.30088239908218384, "loss_mode_switch": 0.0, "loss_total": 0.30170938372612, "step": 1916 }, { "batch_size": 4, "epoch": 0.7664, "step": 1916, "tokens_per_device": 4392 }, { "epoch": 0.7664, "loss_ce": 0.036455024033784866, "loss_lvr": 0.5699371695518494, "loss_mode_switch": 0.0, "loss_total": 0.09344874322414398, "step": 1916 }, { "batch_size": 4, "epoch": 0.7664, "step": 1916, "tokens_per_device": 1224 }, { "epoch": 0.7664, "loss_ce": 0.1851319521665573, "loss_lvr": 1.3956966400146484, "loss_mode_switch": 0.0, "loss_total": 0.32470160722732544, "step": 1916 }, { "batch_size": 1, "epoch": 0.7664, "step": 1916, "tokens_per_device": 4662 }, { "epoch": 0.7664, "loss_ce": 0.0759291797876358, "loss_lvr": 0.37556084990501404, "loss_mode_switch": 0.0, "loss_total": 0.11348526179790497, "step": 1916 }, { "batch_size": 1, "epoch": 0.7664, "step": 1916, "tokens_per_device": 5579 }, { "epoch": 0.7664, "loss_ce": 0.000646740838419646, "loss_lvr": 0.3085586428642273, "loss_mode_switch": 0.0, "loss_total": 0.031502604484558105, "step": 1916 }, { "batch_size": 1, "epoch": 0.7664, "step": 1916, "tokens_per_device": 7689 }, { "epoch": 0.7664, "loss_ce": 0.0502530038356781, "loss_lvr": 0.416422575712204, "loss_mode_switch": 0.0, "loss_total": 0.09189526736736298, "step": 1916 }, { "batch_size": 4, "epoch": 0.7664, "step": 1916, "tokens_per_device": 1424 }, { "epoch": 0.7664, "loss_ce": 0.3736036717891693, "loss_lvr": 1.0959161520004272, "loss_mode_switch": 0.0, "loss_total": 0.48319530487060547, "step": 1916 }, { "epoch": 0.7668, "grad_norm": 1.2607309818267822, "learning_rate": 1.3595940814218668e-06, "loss": 0.265, "step": 1917 }, { "batch_size": 4, "epoch": 0.7668, "step": 1917, "tokens_per_device": 4088 }, { "epoch": 0.7668, "loss_ce": 0.3813095986843109, "loss_lvr": 0.8181844353675842, "loss_mode_switch": 0.0, "loss_total": 0.4631280303001404, "step": 1917 }, { "batch_size": 4, "epoch": 0.7668, "step": 1917, "tokens_per_device": 4800 }, { "epoch": 0.7668, "loss_ce": 0.28646066784858704, "loss_lvr": 0.7437554597854614, "loss_mode_switch": 0.0, "loss_total": 0.3608362078666687, "step": 1917 }, { "batch_size": 4, "epoch": 0.7668, "step": 1917, "tokens_per_device": 3348 }, { "epoch": 0.7668, "loss_ce": 0.008599401451647282, "loss_lvr": 0.6284367442131042, "loss_mode_switch": 0.0, "loss_total": 0.07144307345151901, "step": 1917 }, { "batch_size": 1, "epoch": 0.7668, "step": 1917, "tokens_per_device": 4929 }, { "epoch": 0.7668, "loss_ce": 0.12643568217754364, "loss_lvr": 0.587088942527771, "loss_mode_switch": 0.0, "loss_total": 0.1851445734500885, "step": 1917 }, { "batch_size": 4, "epoch": 0.7668, "step": 1917, "tokens_per_device": 2596 }, { "epoch": 0.7668, "loss_ce": 0.022116851061582565, "loss_lvr": 0.6604880094528198, "loss_mode_switch": 0.0, "loss_total": 0.08816565573215485, "step": 1917 }, { "batch_size": 4, "epoch": 0.7668, "step": 1917, "tokens_per_device": 3784 }, { "epoch": 0.7668, "loss_ce": 0.030712570995092392, "loss_lvr": 0.8598251938819885, "loss_mode_switch": 0.0, "loss_total": 0.1166950911283493, "step": 1917 }, { "batch_size": 1, "epoch": 0.7668, "step": 1917, "tokens_per_device": 4748 }, { "epoch": 0.7668, "loss_ce": 0.0012626597890630364, "loss_lvr": 0.49866217374801636, "loss_mode_switch": 0.0, "loss_total": 0.05112887918949127, "step": 1917 }, { "batch_size": 4, "epoch": 0.7668, "step": 1917, "tokens_per_device": 4452 }, { "epoch": 0.7668, "loss_ce": 0.19572719931602478, "loss_lvr": 0.8080472946166992, "loss_mode_switch": 0.0, "loss_total": 0.2765319347381592, "step": 1917 }, { "epoch": 0.7672, "grad_norm": 1.1613833904266357, "learning_rate": 1.3551568628929434e-06, "loss": 0.2564, "step": 1918 }, { "batch_size": 1, "epoch": 0.7672, "step": 1918, "tokens_per_device": 4904 }, { "epoch": 0.7672, "loss_ce": 0.01883642002940178, "loss_lvr": 0.6272231936454773, "loss_mode_switch": 0.0, "loss_total": 0.08155873417854309, "step": 1918 }, { "batch_size": 1, "epoch": 0.7672, "step": 1918, "tokens_per_device": 5074 }, { "epoch": 0.7672, "loss_ce": 0.0010022656060755253, "loss_lvr": 0.22150221467018127, "loss_mode_switch": 0.0, "loss_total": 0.023152487352490425, "step": 1918 }, { "batch_size": 4, "epoch": 0.7672, "step": 1918, "tokens_per_device": 10412 }, { "epoch": 0.7672, "loss_ce": 0.20529329776763916, "loss_lvr": 0.5995550155639648, "loss_mode_switch": 0.0, "loss_total": 0.2652488052845001, "step": 1918 }, { "batch_size": 4, "epoch": 0.7672, "step": 1918, "tokens_per_device": 12456 }, { "epoch": 0.7672, "loss_ce": 0.0498322993516922, "loss_lvr": 0.508503258228302, "loss_mode_switch": 0.0, "loss_total": 0.10068263113498688, "step": 1918 }, { "batch_size": 1, "epoch": 0.7672, "step": 1918, "tokens_per_device": 5452 }, { "epoch": 0.7672, "loss_ce": 0.02280397340655327, "loss_lvr": 0.288381963968277, "loss_mode_switch": 0.0, "loss_total": 0.051642172038555145, "step": 1918 }, { "batch_size": 4, "epoch": 0.7672, "step": 1918, "tokens_per_device": 4200 }, { "epoch": 0.7672, "loss_ce": 0.34411701560020447, "loss_lvr": 0.7512474060058594, "loss_mode_switch": 0.0, "loss_total": 0.4192417562007904, "step": 1918 }, { "batch_size": 4, "epoch": 0.7672, "step": 1918, "tokens_per_device": 3536 }, { "epoch": 0.7672, "loss_ce": 0.5136906504631042, "loss_lvr": 0.9583967328071594, "loss_mode_switch": 0.0, "loss_total": 0.6095303297042847, "step": 1918 }, { "batch_size": 1, "epoch": 0.7672, "step": 1918, "tokens_per_device": 5242 }, { "epoch": 0.7672, "loss_ce": 0.0031195953488349915, "loss_lvr": 0.27368175983428955, "loss_mode_switch": 0.0, "loss_total": 0.030487772077322006, "step": 1918 }, { "epoch": 0.7676, "grad_norm": 1.0483472347259521, "learning_rate": 1.3507257615972779e-06, "loss": 0.2331, "step": 1919 }, { "batch_size": 4, "epoch": 0.7676, "step": 1919, "tokens_per_device": 4524 }, { "epoch": 0.7676, "loss_ce": 0.22940051555633545, "loss_lvr": 1.7200089693069458, "loss_mode_switch": 0.0, "loss_total": 0.4014014005661011, "step": 1919 }, { "batch_size": 4, "epoch": 0.7676, "step": 1919, "tokens_per_device": 6320 }, { "epoch": 0.7676, "loss_ce": 0.4516310691833496, "loss_lvr": 0.9594431519508362, "loss_mode_switch": 0.0, "loss_total": 0.5475753545761108, "step": 1919 }, { "batch_size": 1, "epoch": 0.7676, "step": 1919, "tokens_per_device": 5181 }, { "epoch": 0.7676, "loss_ce": 0.00021219832706265152, "loss_lvr": 0.4681783616542816, "loss_mode_switch": 0.0, "loss_total": 0.04703003540635109, "step": 1919 }, { "batch_size": 1, "epoch": 0.7676, "step": 1919, "tokens_per_device": 7805 }, { "epoch": 0.7676, "loss_ce": 0.19082488119602203, "loss_lvr": 0.20339050889015198, "loss_mode_switch": 0.0, "loss_total": 0.2111639380455017, "step": 1919 }, { "batch_size": 1, "epoch": 0.7676, "step": 1919, "tokens_per_device": 5124 }, { "epoch": 0.7676, "loss_ce": 0.03908127173781395, "loss_lvr": 0.22936928272247314, "loss_mode_switch": 0.0, "loss_total": 0.062018200755119324, "step": 1919 }, { "batch_size": 4, "epoch": 0.7676, "step": 1919, "tokens_per_device": 2720 }, { "epoch": 0.7676, "loss_ce": 0.02222450263798237, "loss_lvr": 1.4801336526870728, "loss_mode_switch": 0.0, "loss_total": 0.17023786902427673, "step": 1919 }, { "batch_size": 4, "epoch": 0.7676, "step": 1919, "tokens_per_device": 4456 }, { "epoch": 0.7676, "loss_ce": 0.001442444627173245, "loss_lvr": 0.770408570766449, "loss_mode_switch": 0.0, "loss_total": 0.07848330587148666, "step": 1919 }, { "batch_size": 4, "epoch": 0.7676, "step": 1919, "tokens_per_device": 6368 }, { "epoch": 0.7676, "loss_ce": 0.013749521225690842, "loss_lvr": 0.6502582430839539, "loss_mode_switch": 0.0, "loss_total": 0.07877534627914429, "step": 1919 }, { "epoch": 0.768, "grad_norm": 1.4348154067993164, "learning_rate": 1.3463007849717035e-06, "loss": 0.2969, "step": 1920 }, { "batch_size": 4, "epoch": 0.768, "step": 1920, "tokens_per_device": 4380 }, { "epoch": 0.768, "loss_ce": 0.4756549894809723, "loss_lvr": 0.7317301034927368, "loss_mode_switch": 0.0, "loss_total": 0.5488280057907104, "step": 1920 }, { "batch_size": 4, "epoch": 0.768, "step": 1920, "tokens_per_device": 3816 }, { "epoch": 0.768, "loss_ce": 0.2936837673187256, "loss_lvr": 1.0249055624008179, "loss_mode_switch": 0.0, "loss_total": 0.3961743116378784, "step": 1920 }, { "batch_size": 4, "epoch": 0.768, "step": 1920, "tokens_per_device": 5040 }, { "epoch": 0.768, "loss_ce": 0.32177823781967163, "loss_lvr": 0.6666322946548462, "loss_mode_switch": 0.0, "loss_total": 0.3884414732456207, "step": 1920 }, { "batch_size": 4, "epoch": 0.768, "step": 1920, "tokens_per_device": 1560 }, { "epoch": 0.768, "loss_ce": 0.44921624660491943, "loss_lvr": 0.8064191937446594, "loss_mode_switch": 0.0, "loss_total": 0.5298581719398499, "step": 1920 }, { "batch_size": 1, "epoch": 0.768, "step": 1920, "tokens_per_device": 5173 }, { "epoch": 0.768, "loss_ce": 0.0011110143968835473, "loss_lvr": 0.5691779255867004, "loss_mode_switch": 0.0, "loss_total": 0.05802880972623825, "step": 1920 }, { "batch_size": 1, "epoch": 0.768, "step": 1920, "tokens_per_device": 5066 }, { "epoch": 0.768, "loss_ce": 0.008400402031838894, "loss_lvr": 1.0175902843475342, "loss_mode_switch": 0.0, "loss_total": 0.11015943437814713, "step": 1920 }, { "batch_size": 4, "epoch": 0.768, "step": 1920, "tokens_per_device": 5548 }, { "epoch": 0.768, "loss_ce": 0.593947172164917, "loss_lvr": 0.7815334796905518, "loss_mode_switch": 0.0, "loss_total": 0.6721005439758301, "step": 1920 }, { "batch_size": 1, "epoch": 0.768, "step": 1920, "tokens_per_device": 4960 }, { "epoch": 0.768, "loss_ce": 0.0296001136302948, "loss_lvr": 0.4882287085056305, "loss_mode_switch": 0.0, "loss_total": 0.07842298597097397, "step": 1920 }, { "epoch": 0.7684, "grad_norm": 1.249650001525879, "learning_rate": 1.341881940442769e-06, "loss": 0.2755, "step": 1921 }, { "batch_size": 4, "epoch": 0.7684, "step": 1921, "tokens_per_device": 8824 }, { "epoch": 0.7684, "loss_ce": 0.2002754509449005, "loss_lvr": 0.7871032357215881, "loss_mode_switch": 0.0, "loss_total": 0.27898576855659485, "step": 1921 }, { "batch_size": 1, "epoch": 0.7684, "step": 1921, "tokens_per_device": 5086 }, { "epoch": 0.7684, "loss_ce": 0.14006230235099792, "loss_lvr": 0.5261932015419006, "loss_mode_switch": 0.0, "loss_total": 0.19268162548542023, "step": 1921 }, { "batch_size": 4, "epoch": 0.7684, "step": 1921, "tokens_per_device": 4224 }, { "epoch": 0.7684, "loss_ce": 0.33566874265670776, "loss_lvr": 0.7973716259002686, "loss_mode_switch": 0.0, "loss_total": 0.41540589928627014, "step": 1921 }, { "batch_size": 4, "epoch": 0.7684, "step": 1921, "tokens_per_device": 3504 }, { "epoch": 0.7684, "loss_ce": 0.2756364047527313, "loss_lvr": 0.8471122980117798, "loss_mode_switch": 0.0, "loss_total": 0.3603476285934448, "step": 1921 }, { "batch_size": 4, "epoch": 0.7684, "step": 1921, "tokens_per_device": 3808 }, { "epoch": 0.7684, "loss_ce": 0.23728898167610168, "loss_lvr": 0.6438905596733093, "loss_mode_switch": 0.0, "loss_total": 0.30167803168296814, "step": 1921 }, { "batch_size": 4, "epoch": 0.7684, "step": 1921, "tokens_per_device": 5720 }, { "epoch": 0.7684, "loss_ce": 0.08571203052997589, "loss_lvr": 1.0323147773742676, "loss_mode_switch": 0.0, "loss_total": 0.1889435052871704, "step": 1921 }, { "batch_size": 4, "epoch": 0.7684, "step": 1921, "tokens_per_device": 4400 }, { "epoch": 0.7684, "loss_ce": 0.9753428101539612, "loss_lvr": 0.8920590281486511, "loss_mode_switch": 0.0, "loss_total": 1.0645487308502197, "step": 1921 }, { "batch_size": 4, "epoch": 0.7684, "step": 1921, "tokens_per_device": 3784 }, { "epoch": 0.7684, "loss_ce": 0.24280264973640442, "loss_lvr": 0.7622928023338318, "loss_mode_switch": 0.0, "loss_total": 0.3190319240093231, "step": 1921 }, { "epoch": 0.7688, "grad_norm": 1.53793466091156, "learning_rate": 1.337469235426736e-06, "loss": 0.3374, "step": 1922 }, { "batch_size": 4, "epoch": 0.7688, "step": 1922, "tokens_per_device": 1256 }, { "epoch": 0.7688, "loss_ce": 0.3734937012195587, "loss_lvr": 0.9242223501205444, "loss_mode_switch": 0.0, "loss_total": 0.4659159481525421, "step": 1922 }, { "batch_size": 4, "epoch": 0.7688, "step": 1922, "tokens_per_device": 3808 }, { "epoch": 0.7688, "loss_ce": 0.6058167815208435, "loss_lvr": 0.9439346790313721, "loss_mode_switch": 0.0, "loss_total": 0.7002102732658386, "step": 1922 }, { "batch_size": 4, "epoch": 0.7688, "step": 1922, "tokens_per_device": 2004 }, { "epoch": 0.7688, "loss_ce": 0.6144288778305054, "loss_lvr": 0.8418452739715576, "loss_mode_switch": 0.0, "loss_total": 0.6986134052276611, "step": 1922 }, { "batch_size": 1, "epoch": 0.7688, "step": 1922, "tokens_per_device": 5239 }, { "epoch": 0.7688, "loss_ce": 0.2834789752960205, "loss_lvr": 0.3202366828918457, "loss_mode_switch": 0.0, "loss_total": 0.3155026435852051, "step": 1922 }, { "batch_size": 1, "epoch": 0.7688, "step": 1922, "tokens_per_device": 4900 }, { "epoch": 0.7688, "loss_ce": 0.002172657288610935, "loss_lvr": 0.2835875153541565, "loss_mode_switch": 0.0, "loss_total": 0.03053141012787819, "step": 1922 }, { "batch_size": 1, "epoch": 0.7688, "step": 1922, "tokens_per_device": 5157 }, { "epoch": 0.7688, "loss_ce": 2.719006061553955, "loss_lvr": 0.7532530426979065, "loss_mode_switch": 0.0, "loss_total": 2.7943313121795654, "step": 1922 }, { "batch_size": 1, "epoch": 0.7688, "step": 1922, "tokens_per_device": 5165 }, { "epoch": 0.7688, "loss_ce": 0.03068743459880352, "loss_lvr": 0.3092264235019684, "loss_mode_switch": 0.0, "loss_total": 0.06161007657647133, "step": 1922 }, { "batch_size": 4, "epoch": 0.7688, "step": 1922, "tokens_per_device": 2576 }, { "epoch": 0.7688, "loss_ce": 0.02067437767982483, "loss_lvr": 0.9455990195274353, "loss_mode_switch": 0.0, "loss_total": 0.11523427814245224, "step": 1922 }, { "epoch": 0.7692, "grad_norm": 1.4409477710723877, "learning_rate": 1.3330626773295579e-06, "loss": 0.325, "step": 1923 }, { "batch_size": 4, "epoch": 0.7692, "step": 1923, "tokens_per_device": 5540 }, { "epoch": 0.7692, "loss_ce": 0.17012619972229004, "loss_lvr": 0.7049254775047302, "loss_mode_switch": 0.0, "loss_total": 0.2406187504529953, "step": 1923 }, { "batch_size": 1, "epoch": 0.7692, "step": 1923, "tokens_per_device": 4916 }, { "epoch": 0.7692, "loss_ce": 0.0313703678548336, "loss_lvr": 0.3609296381473541, "loss_mode_switch": 0.0, "loss_total": 0.06746333092451096, "step": 1923 }, { "batch_size": 1, "epoch": 0.7692, "step": 1923, "tokens_per_device": 4888 }, { "epoch": 0.7692, "loss_ce": 0.00910909753292799, "loss_lvr": 0.8423711657524109, "loss_mode_switch": 0.0, "loss_total": 0.09334621578454971, "step": 1923 }, { "batch_size": 1, "epoch": 0.7692, "step": 1923, "tokens_per_device": 5114 }, { "epoch": 0.7692, "loss_ce": 0.003649497637525201, "loss_lvr": 0.237887442111969, "loss_mode_switch": 0.0, "loss_total": 0.027438241988420486, "step": 1923 }, { "batch_size": 1, "epoch": 0.7692, "step": 1923, "tokens_per_device": 4856 }, { "epoch": 0.7692, "loss_ce": 0.14045535027980804, "loss_lvr": 0.6766032576560974, "loss_mode_switch": 0.0, "loss_total": 0.20811566710472107, "step": 1923 }, { "batch_size": 4, "epoch": 0.7692, "step": 1923, "tokens_per_device": 2644 }, { "epoch": 0.7692, "loss_ce": 0.590181827545166, "loss_lvr": 0.8185859322547913, "loss_mode_switch": 0.0, "loss_total": 0.6720404028892517, "step": 1923 }, { "batch_size": 4, "epoch": 0.7692, "step": 1923, "tokens_per_device": 4600 }, { "epoch": 0.7692, "loss_ce": 0.06452988088130951, "loss_lvr": 0.7970001697540283, "loss_mode_switch": 0.0, "loss_total": 0.14422988891601562, "step": 1923 }, { "batch_size": 4, "epoch": 0.7692, "step": 1923, "tokens_per_device": 3896 }, { "epoch": 0.7692, "loss_ce": 0.3761322796344757, "loss_lvr": 0.8405730724334717, "loss_mode_switch": 0.0, "loss_total": 0.4601895809173584, "step": 1923 }, { "epoch": 0.7696, "grad_norm": 1.1936520338058472, "learning_rate": 1.3286622735468764e-06, "loss": 0.2569, "step": 1924 }, { "batch_size": 4, "epoch": 0.7696, "step": 1924, "tokens_per_device": 5332 }, { "epoch": 0.7696, "loss_ce": 0.1121334657073021, "loss_lvr": 0.9238520860671997, "loss_mode_switch": 0.0, "loss_total": 0.20451867580413818, "step": 1924 }, { "batch_size": 4, "epoch": 0.7696, "step": 1924, "tokens_per_device": 4484 }, { "epoch": 0.7696, "loss_ce": 0.08992534130811691, "loss_lvr": 1.0054665803909302, "loss_mode_switch": 0.0, "loss_total": 0.19047200679779053, "step": 1924 }, { "batch_size": 4, "epoch": 0.7696, "step": 1924, "tokens_per_device": 6708 }, { "epoch": 0.7696, "loss_ce": 0.19010287523269653, "loss_lvr": 0.7435289025306702, "loss_mode_switch": 0.0, "loss_total": 0.26445576548576355, "step": 1924 }, { "batch_size": 4, "epoch": 0.7696, "step": 1924, "tokens_per_device": 5712 }, { "epoch": 0.7696, "loss_ce": 0.07055965811014175, "loss_lvr": 0.9872011542320251, "loss_mode_switch": 0.0, "loss_total": 0.1692797839641571, "step": 1924 }, { "batch_size": 4, "epoch": 0.7696, "step": 1924, "tokens_per_device": 4056 }, { "epoch": 0.7696, "loss_ce": 0.07290823757648468, "loss_lvr": 0.5404667854309082, "loss_mode_switch": 0.0, "loss_total": 0.12695491313934326, "step": 1924 }, { "batch_size": 1, "epoch": 0.7696, "step": 1924, "tokens_per_device": 4896 }, { "epoch": 0.7696, "loss_ce": 0.0004406571388244629, "loss_lvr": 0.3315337300300598, "loss_mode_switch": 0.0, "loss_total": 0.033594030886888504, "step": 1924 }, { "batch_size": 4, "epoch": 0.7696, "step": 1924, "tokens_per_device": 3424 }, { "epoch": 0.7696, "loss_ce": 0.44898882508277893, "loss_lvr": 0.5296427607536316, "loss_mode_switch": 0.0, "loss_total": 0.501953125, "step": 1924 }, { "batch_size": 4, "epoch": 0.7696, "step": 1924, "tokens_per_device": 4504 }, { "epoch": 0.7696, "loss_ce": 0.38460761308670044, "loss_lvr": 0.7772279381752014, "loss_mode_switch": 0.0, "loss_total": 0.4623304009437561, "step": 1924 }, { "epoch": 0.77, "grad_norm": 1.2360140085220337, "learning_rate": 1.3242680314639995e-06, "loss": 0.2735, "step": 1925 }, { "batch_size": 1, "epoch": 0.77, "step": 1925, "tokens_per_device": 5088 }, { "epoch": 0.77, "loss_ce": 0.5877518057823181, "loss_lvr": 0.2904709577560425, "loss_mode_switch": 0.0, "loss_total": 0.6167988777160645, "step": 1925 }, { "batch_size": 4, "epoch": 0.77, "step": 1925, "tokens_per_device": 7080 }, { "epoch": 0.77, "loss_ce": 0.21357567608356476, "loss_lvr": 0.6851173043251038, "loss_mode_switch": 0.0, "loss_total": 0.28208741545677185, "step": 1925 }, { "batch_size": 4, "epoch": 0.77, "step": 1925, "tokens_per_device": 3952 }, { "epoch": 0.77, "loss_ce": 0.15025310218334198, "loss_lvr": 0.8057312965393066, "loss_mode_switch": 0.0, "loss_total": 0.2308262288570404, "step": 1925 }, { "batch_size": 4, "epoch": 0.77, "step": 1925, "tokens_per_device": 4704 }, { "epoch": 0.77, "loss_ce": 0.3416881263256073, "loss_lvr": 0.8213793635368347, "loss_mode_switch": 0.0, "loss_total": 0.42382606863975525, "step": 1925 }, { "batch_size": 4, "epoch": 0.77, "step": 1925, "tokens_per_device": 4316 }, { "epoch": 0.77, "loss_ce": 0.5393480658531189, "loss_lvr": 0.8769842386245728, "loss_mode_switch": 0.0, "loss_total": 0.6270464658737183, "step": 1925 }, { "batch_size": 4, "epoch": 0.77, "step": 1925, "tokens_per_device": 11304 }, { "epoch": 0.77, "loss_ce": 0.5358069539070129, "loss_lvr": 0.8922140598297119, "loss_mode_switch": 0.0, "loss_total": 0.6250283718109131, "step": 1925 }, { "batch_size": 4, "epoch": 0.77, "step": 1925, "tokens_per_device": 5072 }, { "epoch": 0.77, "loss_ce": 0.615387499332428, "loss_lvr": 0.6005951762199402, "loss_mode_switch": 0.0, "loss_total": 0.6754469871520996, "step": 1925 }, { "batch_size": 1, "epoch": 0.77, "step": 1925, "tokens_per_device": 4770 }, { "epoch": 0.77, "loss_ce": 0.016066299751400948, "loss_lvr": 0.29348909854888916, "loss_mode_switch": 0.0, "loss_total": 0.045415207743644714, "step": 1925 }, { "epoch": 0.7704, "grad_norm": 1.3853514194488525, "learning_rate": 1.319879958455894e-06, "loss": 0.3364, "step": 1926 }, { "batch_size": 4, "epoch": 0.7704, "step": 1926, "tokens_per_device": 5732 }, { "epoch": 0.7704, "loss_ce": 0.23200856149196625, "loss_lvr": 1.0634982585906982, "loss_mode_switch": 0.0, "loss_total": 0.33835840225219727, "step": 1926 }, { "batch_size": 1, "epoch": 0.7704, "step": 1926, "tokens_per_device": 4748 }, { "epoch": 0.7704, "loss_ce": 0.037756018340587616, "loss_lvr": 0.6183546781539917, "loss_mode_switch": 0.0, "loss_total": 0.09959148615598679, "step": 1926 }, { "batch_size": 4, "epoch": 0.7704, "step": 1926, "tokens_per_device": 1740 }, { "epoch": 0.7704, "loss_ce": 0.9117734432220459, "loss_lvr": 1.020438551902771, "loss_mode_switch": 0.0, "loss_total": 1.013817310333252, "step": 1926 }, { "batch_size": 1, "epoch": 0.7704, "step": 1926, "tokens_per_device": 4921 }, { "epoch": 0.7704, "loss_ce": 0.00489294296130538, "loss_lvr": 0.41739699244499207, "loss_mode_switch": 0.0, "loss_total": 0.046632640063762665, "step": 1926 }, { "batch_size": 4, "epoch": 0.7704, "step": 1926, "tokens_per_device": 3816 }, { "epoch": 0.7704, "loss_ce": 0.3132384717464447, "loss_lvr": 0.9877663254737854, "loss_mode_switch": 0.0, "loss_total": 0.4120151102542877, "step": 1926 }, { "batch_size": 1, "epoch": 0.7704, "step": 1926, "tokens_per_device": 4366 }, { "epoch": 0.7704, "loss_ce": 0.13989973068237305, "loss_lvr": 0.37491315603256226, "loss_mode_switch": 0.0, "loss_total": 0.17739105224609375, "step": 1926 }, { "batch_size": 4, "epoch": 0.7704, "step": 1926, "tokens_per_device": 3732 }, { "epoch": 0.7704, "loss_ce": 0.06729580461978912, "loss_lvr": 0.41830500960350037, "loss_mode_switch": 0.0, "loss_total": 0.10912630707025528, "step": 1926 }, { "batch_size": 4, "epoch": 0.7704, "step": 1926, "tokens_per_device": 3772 }, { "epoch": 0.7704, "loss_ce": 0.4360780715942383, "loss_lvr": 1.0331298112869263, "loss_mode_switch": 0.0, "loss_total": 0.539391040802002, "step": 1926 }, { "epoch": 0.7708, "grad_norm": 1.4051133394241333, "learning_rate": 1.3154980618871793e-06, "loss": 0.3088, "step": 1927 }, { "batch_size": 4, "epoch": 0.7708, "step": 1927, "tokens_per_device": 4280 }, { "epoch": 0.7708, "loss_ce": 0.2676730751991272, "loss_lvr": 0.8507163524627686, "loss_mode_switch": 0.0, "loss_total": 0.3527446985244751, "step": 1927 }, { "batch_size": 4, "epoch": 0.7708, "step": 1927, "tokens_per_device": 3852 }, { "epoch": 0.7708, "loss_ce": 0.12271251529455185, "loss_lvr": 0.5448707342147827, "loss_mode_switch": 0.0, "loss_total": 0.177199587225914, "step": 1927 }, { "batch_size": 4, "epoch": 0.7708, "step": 1927, "tokens_per_device": 13824 }, { "epoch": 0.7708, "loss_ce": 0.2413012534379959, "loss_lvr": 0.9562060832977295, "loss_mode_switch": 0.0, "loss_total": 0.3369218707084656, "step": 1927 }, { "batch_size": 4, "epoch": 0.7708, "step": 1927, "tokens_per_device": 2560 }, { "epoch": 0.7708, "loss_ce": 0.42912930250167847, "loss_lvr": 0.9444301128387451, "loss_mode_switch": 0.0, "loss_total": 0.5235723257064819, "step": 1927 }, { "batch_size": 1, "epoch": 0.7708, "step": 1927, "tokens_per_device": 5247 }, { "epoch": 0.7708, "loss_ce": 0.5779797434806824, "loss_lvr": 0.5789973735809326, "loss_mode_switch": 0.0, "loss_total": 0.6358794569969177, "step": 1927 }, { "batch_size": 4, "epoch": 0.7708, "step": 1927, "tokens_per_device": 4288 }, { "epoch": 0.7708, "loss_ce": 0.09918292611837387, "loss_lvr": 1.049615740776062, "loss_mode_switch": 0.0, "loss_total": 0.20414450764656067, "step": 1927 }, { "batch_size": 4, "epoch": 0.7708, "step": 1927, "tokens_per_device": 4228 }, { "epoch": 0.7708, "loss_ce": 0.43258020281791687, "loss_lvr": 1.047774314880371, "loss_mode_switch": 0.0, "loss_total": 0.5373576283454895, "step": 1927 }, { "batch_size": 4, "epoch": 0.7708, "step": 1927, "tokens_per_device": 3996 }, { "epoch": 0.7708, "loss_ce": 0.07629286497831345, "loss_lvr": 0.6366751194000244, "loss_mode_switch": 0.0, "loss_total": 0.139960378408432, "step": 1927 }, { "epoch": 0.7712, "grad_norm": 1.240737795829773, "learning_rate": 1.3111223491121e-06, "loss": 0.3041, "step": 1928 }, { "batch_size": 4, "epoch": 0.7712, "step": 1928, "tokens_per_device": 3776 }, { "epoch": 0.7712, "loss_ce": 0.42294415831565857, "loss_lvr": 0.9251259565353394, "loss_mode_switch": 0.0, "loss_total": 0.5154567360877991, "step": 1928 }, { "batch_size": 4, "epoch": 0.7712, "step": 1928, "tokens_per_device": 5924 }, { "epoch": 0.7712, "loss_ce": 0.15299245715141296, "loss_lvr": 0.7416533827781677, "loss_mode_switch": 0.0, "loss_total": 0.2271578013896942, "step": 1928 }, { "batch_size": 1, "epoch": 0.7712, "step": 1928, "tokens_per_device": 5625 }, { "epoch": 0.7712, "loss_ce": 0.0004524564719758928, "loss_lvr": 0.44409140944480896, "loss_mode_switch": 0.0, "loss_total": 0.04486159607768059, "step": 1928 }, { "batch_size": 4, "epoch": 0.7712, "step": 1928, "tokens_per_device": 14552 }, { "epoch": 0.7712, "loss_ce": 0.4190416634082794, "loss_lvr": 0.3511582612991333, "loss_mode_switch": 0.0, "loss_total": 0.4541575014591217, "step": 1928 }, { "batch_size": 1, "epoch": 0.7712, "step": 1928, "tokens_per_device": 7095 }, { "epoch": 0.7712, "loss_ce": 0.026259412989020348, "loss_lvr": 0.4305598735809326, "loss_mode_switch": 0.0, "loss_total": 0.06931540369987488, "step": 1928 }, { "batch_size": 1, "epoch": 0.7712, "step": 1928, "tokens_per_device": 4910 }, { "epoch": 0.7712, "loss_ce": 0.02562960423529148, "loss_lvr": 0.2676915228366852, "loss_mode_switch": 0.0, "loss_total": 0.05239875614643097, "step": 1928 }, { "batch_size": 4, "epoch": 0.7712, "step": 1928, "tokens_per_device": 5588 }, { "epoch": 0.7712, "loss_ce": 0.18523050844669342, "loss_lvr": 0.5302821397781372, "loss_mode_switch": 0.0, "loss_total": 0.2382587194442749, "step": 1928 }, { "batch_size": 4, "epoch": 0.7712, "step": 1928, "tokens_per_device": 4948 }, { "epoch": 0.7712, "loss_ce": 0.172071173787117, "loss_lvr": 0.6916394829750061, "loss_mode_switch": 0.0, "loss_total": 0.24123512208461761, "step": 1928 }, { "epoch": 0.7716, "grad_norm": 1.372931718826294, "learning_rate": 1.30675282747453e-06, "loss": 0.2477, "step": 1929 }, { "batch_size": 4, "epoch": 0.7716, "step": 1929, "tokens_per_device": 6148 }, { "epoch": 0.7716, "loss_ce": 0.227762833237648, "loss_lvr": 1.9120311737060547, "loss_mode_switch": 0.0, "loss_total": 0.4189659357070923, "step": 1929 }, { "batch_size": 4, "epoch": 0.7716, "step": 1929, "tokens_per_device": 4296 }, { "epoch": 0.7716, "loss_ce": 0.38337644934654236, "loss_lvr": 0.8229767084121704, "loss_mode_switch": 0.0, "loss_total": 0.46567413210868835, "step": 1929 }, { "batch_size": 4, "epoch": 0.7716, "step": 1929, "tokens_per_device": 1568 }, { "epoch": 0.7716, "loss_ce": 0.36130425333976746, "loss_lvr": 0.8388417363166809, "loss_mode_switch": 0.0, "loss_total": 0.4451884329319, "step": 1929 }, { "batch_size": 4, "epoch": 0.7716, "step": 1929, "tokens_per_device": 9084 }, { "epoch": 0.7716, "loss_ce": 0.03869879990816116, "loss_lvr": 0.7239670157432556, "loss_mode_switch": 0.0, "loss_total": 0.11109550297260284, "step": 1929 }, { "batch_size": 4, "epoch": 0.7716, "step": 1929, "tokens_per_device": 2868 }, { "epoch": 0.7716, "loss_ce": 0.5021235942840576, "loss_lvr": 0.7484988570213318, "loss_mode_switch": 0.0, "loss_total": 0.5769734978675842, "step": 1929 }, { "batch_size": 4, "epoch": 0.7716, "step": 1929, "tokens_per_device": 4444 }, { "epoch": 0.7716, "loss_ce": 0.43261438608169556, "loss_lvr": 0.8325609564781189, "loss_mode_switch": 0.0, "loss_total": 0.5158704519271851, "step": 1929 }, { "batch_size": 1, "epoch": 0.7716, "step": 1929, "tokens_per_device": 5074 }, { "epoch": 0.7716, "loss_ce": 0.05254402011632919, "loss_lvr": 0.4189048409461975, "loss_mode_switch": 0.0, "loss_total": 0.09443449974060059, "step": 1929 }, { "batch_size": 4, "epoch": 0.7716, "step": 1929, "tokens_per_device": 5580 }, { "epoch": 0.7716, "loss_ce": 0.37705284357070923, "loss_lvr": 0.8193320035934448, "loss_mode_switch": 0.0, "loss_total": 0.4589860439300537, "step": 1929 }, { "epoch": 0.772, "grad_norm": 1.2329150438308716, "learning_rate": 1.3023895043079476e-06, "loss": 0.286, "step": 1930 }, { "batch_size": 4, "epoch": 0.772, "step": 1930, "tokens_per_device": 10956 }, { "epoch": 0.772, "loss_ce": 0.06238337233662605, "loss_lvr": 0.7906274199485779, "loss_mode_switch": 0.0, "loss_total": 0.14144611358642578, "step": 1930 }, { "batch_size": 4, "epoch": 0.772, "step": 1930, "tokens_per_device": 4008 }, { "epoch": 0.772, "loss_ce": 0.008715017698705196, "loss_lvr": 0.9560971856117249, "loss_mode_switch": 0.0, "loss_total": 0.10432473570108414, "step": 1930 }, { "batch_size": 1, "epoch": 0.772, "step": 1930, "tokens_per_device": 5671 }, { "epoch": 0.772, "loss_ce": 0.0018199027981609106, "loss_lvr": 0.3798825144767761, "loss_mode_switch": 0.0, "loss_total": 0.03980815410614014, "step": 1930 }, { "batch_size": 4, "epoch": 0.772, "step": 1930, "tokens_per_device": 4576 }, { "epoch": 0.772, "loss_ce": 0.12649041414260864, "loss_lvr": 0.7222470045089722, "loss_mode_switch": 0.0, "loss_total": 0.19871512055397034, "step": 1930 }, { "batch_size": 4, "epoch": 0.772, "step": 1930, "tokens_per_device": 4000 }, { "epoch": 0.772, "loss_ce": 0.9735887050628662, "loss_lvr": 0.7704647779464722, "loss_mode_switch": 0.0, "loss_total": 1.0506352186203003, "step": 1930 }, { "batch_size": 4, "epoch": 0.772, "step": 1930, "tokens_per_device": 5080 }, { "epoch": 0.772, "loss_ce": 0.6457169055938721, "loss_lvr": 0.6828608512878418, "loss_mode_switch": 0.0, "loss_total": 0.7140029668807983, "step": 1930 }, { "batch_size": 4, "epoch": 0.772, "step": 1930, "tokens_per_device": 8016 }, { "epoch": 0.772, "loss_ce": 0.13132034242153168, "loss_lvr": 0.830707311630249, "loss_mode_switch": 0.0, "loss_total": 0.2143910825252533, "step": 1930 }, { "batch_size": 4, "epoch": 0.772, "step": 1930, "tokens_per_device": 4668 }, { "epoch": 0.772, "loss_ce": 0.4951924681663513, "loss_lvr": 0.9083871841430664, "loss_mode_switch": 0.0, "loss_total": 0.5860311985015869, "step": 1930 }, { "epoch": 0.7724, "grad_norm": 1.2413579225540161, "learning_rate": 1.2980323869354277e-06, "loss": 0.3158, "step": 1931 }, { "batch_size": 4, "epoch": 0.7724, "step": 1931, "tokens_per_device": 5112 }, { "epoch": 0.7724, "loss_ce": 0.376125693321228, "loss_lvr": 0.9082934260368347, "loss_mode_switch": 0.0, "loss_total": 0.4669550359249115, "step": 1931 }, { "batch_size": 1, "epoch": 0.7724, "step": 1931, "tokens_per_device": 5155 }, { "epoch": 0.7724, "loss_ce": 0.0006053496617823839, "loss_lvr": 0.48457616567611694, "loss_mode_switch": 0.0, "loss_total": 0.04906296730041504, "step": 1931 }, { "batch_size": 4, "epoch": 0.7724, "step": 1931, "tokens_per_device": 1604 }, { "epoch": 0.7724, "loss_ce": 0.4395967125892639, "loss_lvr": 1.049730658531189, "loss_mode_switch": 0.0, "loss_total": 0.5445697903633118, "step": 1931 }, { "batch_size": 1, "epoch": 0.7724, "step": 1931, "tokens_per_device": 5084 }, { "epoch": 0.7724, "loss_ce": 0.1306900829076767, "loss_lvr": 0.2998606860637665, "loss_mode_switch": 0.0, "loss_total": 0.16067615151405334, "step": 1931 }, { "batch_size": 1, "epoch": 0.7724, "step": 1931, "tokens_per_device": 4672 }, { "epoch": 0.7724, "loss_ce": 0.04906461760401726, "loss_lvr": 0.49168190360069275, "loss_mode_switch": 0.0, "loss_total": 0.09823280572891235, "step": 1931 }, { "batch_size": 1, "epoch": 0.7724, "step": 1931, "tokens_per_device": 4964 }, { "epoch": 0.7724, "loss_ce": 0.02777804620563984, "loss_lvr": 0.2720385789871216, "loss_mode_switch": 0.0, "loss_total": 0.05498190224170685, "step": 1931 }, { "batch_size": 4, "epoch": 0.7724, "step": 1931, "tokens_per_device": 4248 }, { "epoch": 0.7724, "loss_ce": 0.05238420516252518, "loss_lvr": 0.6296380162239075, "loss_mode_switch": 0.0, "loss_total": 0.11534801125526428, "step": 1931 }, { "batch_size": 1, "epoch": 0.7724, "step": 1931, "tokens_per_device": 4913 }, { "epoch": 0.7724, "loss_ce": 0.03560199961066246, "loss_lvr": 0.5073955059051514, "loss_mode_switch": 0.0, "loss_total": 0.08634155243635178, "step": 1931 }, { "epoch": 0.7728, "grad_norm": 1.389041781425476, "learning_rate": 1.2936814826696326e-06, "loss": 0.309, "step": 1932 }, { "batch_size": 1, "epoch": 0.7728, "step": 1932, "tokens_per_device": 6411 }, { "epoch": 0.7728, "loss_ce": 0.07352953404188156, "loss_lvr": 0.227105513215065, "loss_mode_switch": 0.0, "loss_total": 0.0962400883436203, "step": 1932 }, { "batch_size": 4, "epoch": 0.7728, "step": 1932, "tokens_per_device": 4280 }, { "epoch": 0.7728, "loss_ce": 0.4626387655735016, "loss_lvr": 0.872117280960083, "loss_mode_switch": 0.0, "loss_total": 0.5498504638671875, "step": 1932 }, { "batch_size": 1, "epoch": 0.7728, "step": 1932, "tokens_per_device": 4867 }, { "epoch": 0.7728, "loss_ce": 0.0006301960092969239, "loss_lvr": 0.2820572257041931, "loss_mode_switch": 0.0, "loss_total": 0.028835918754339218, "step": 1932 }, { "batch_size": 4, "epoch": 0.7728, "step": 1932, "tokens_per_device": 5440 }, { "epoch": 0.7728, "loss_ce": 0.16581138968467712, "loss_lvr": 1.2920516729354858, "loss_mode_switch": 0.0, "loss_total": 0.2950165569782257, "step": 1932 }, { "batch_size": 4, "epoch": 0.7728, "step": 1932, "tokens_per_device": 5676 }, { "epoch": 0.7728, "loss_ce": 0.05999636650085449, "loss_lvr": 1.1409987211227417, "loss_mode_switch": 0.0, "loss_total": 0.1740962415933609, "step": 1932 }, { "batch_size": 4, "epoch": 0.7728, "step": 1932, "tokens_per_device": 4276 }, { "epoch": 0.7728, "loss_ce": 0.02923145703971386, "loss_lvr": 0.9160146713256836, "loss_mode_switch": 0.0, "loss_total": 0.12083292752504349, "step": 1932 }, { "batch_size": 4, "epoch": 0.7728, "step": 1932, "tokens_per_device": 2720 }, { "epoch": 0.7728, "loss_ce": 0.4540058374404907, "loss_lvr": 0.7679653167724609, "loss_mode_switch": 0.0, "loss_total": 0.5308023691177368, "step": 1932 }, { "batch_size": 4, "epoch": 0.7728, "step": 1932, "tokens_per_device": 4388 }, { "epoch": 0.7728, "loss_ce": 0.13409274816513062, "loss_lvr": 0.6289312243461609, "loss_mode_switch": 0.0, "loss_total": 0.1969858705997467, "step": 1932 }, { "epoch": 0.7732, "grad_norm": 1.5405080318450928, "learning_rate": 1.2893367988127986e-06, "loss": 0.3055, "step": 1933 }, { "batch_size": 4, "epoch": 0.7732, "step": 1933, "tokens_per_device": 4600 }, { "epoch": 0.7732, "loss_ce": 0.5741548538208008, "loss_lvr": 0.9697397947311401, "loss_mode_switch": 0.0, "loss_total": 0.6711288094520569, "step": 1933 }, { "batch_size": 1, "epoch": 0.7732, "step": 1933, "tokens_per_device": 4879 }, { "epoch": 0.7732, "loss_ce": 0.02020074427127838, "loss_lvr": 0.7981908917427063, "loss_mode_switch": 0.0, "loss_total": 0.10001983493566513, "step": 1933 }, { "batch_size": 4, "epoch": 0.7732, "step": 1933, "tokens_per_device": 5104 }, { "epoch": 0.7732, "loss_ce": 0.08333885669708252, "loss_lvr": 0.9322632551193237, "loss_mode_switch": 0.0, "loss_total": 0.17656518518924713, "step": 1933 }, { "batch_size": 1, "epoch": 0.7732, "step": 1933, "tokens_per_device": 4876 }, { "epoch": 0.7732, "loss_ce": 0.09046771377325058, "loss_lvr": 0.25164422392845154, "loss_mode_switch": 0.0, "loss_total": 0.11563213914632797, "step": 1933 }, { "batch_size": 1, "epoch": 0.7732, "step": 1933, "tokens_per_device": 5057 }, { "epoch": 0.7732, "loss_ce": 0.0005804731044918299, "loss_lvr": 0.2994304895401001, "loss_mode_switch": 0.0, "loss_total": 0.030523521825671196, "step": 1933 }, { "batch_size": 4, "epoch": 0.7732, "step": 1933, "tokens_per_device": 4232 }, { "epoch": 0.7732, "loss_ce": 0.04448045790195465, "loss_lvr": 0.6214582920074463, "loss_mode_switch": 0.0, "loss_total": 0.10662628710269928, "step": 1933 }, { "batch_size": 4, "epoch": 0.7732, "step": 1933, "tokens_per_device": 13728 }, { "epoch": 0.7732, "loss_ce": 0.0974932461977005, "loss_lvr": 0.9622163772583008, "loss_mode_switch": 0.0, "loss_total": 0.19371488690376282, "step": 1933 }, { "batch_size": 4, "epoch": 0.7732, "step": 1933, "tokens_per_device": 1516 }, { "epoch": 0.7732, "loss_ce": 0.21214815974235535, "loss_lvr": 1.3021166324615479, "loss_mode_switch": 0.0, "loss_total": 0.34235984086990356, "step": 1933 }, { "epoch": 0.7736, "grad_norm": 1.2204029560089111, "learning_rate": 1.284998342656717e-06, "loss": 0.2532, "step": 1934 }, { "batch_size": 1, "epoch": 0.7736, "step": 1934, "tokens_per_device": 5143 }, { "epoch": 0.7736, "loss_ce": 0.0005341513897292316, "loss_lvr": 0.3451853096485138, "loss_mode_switch": 0.0, "loss_total": 0.03505268320441246, "step": 1934 }, { "batch_size": 1, "epoch": 0.7736, "step": 1934, "tokens_per_device": 4890 }, { "epoch": 0.7736, "loss_ce": 0.006362392101436853, "loss_lvr": 0.243849515914917, "loss_mode_switch": 0.0, "loss_total": 0.030747342854738235, "step": 1934 }, { "batch_size": 4, "epoch": 0.7736, "step": 1934, "tokens_per_device": 15628 }, { "epoch": 0.7736, "loss_ce": 0.4009593725204468, "loss_lvr": 0.8035625219345093, "loss_mode_switch": 0.0, "loss_total": 0.48131561279296875, "step": 1934 }, { "batch_size": 1, "epoch": 0.7736, "step": 1934, "tokens_per_device": 5056 }, { "epoch": 0.7736, "loss_ce": 0.12736152112483978, "loss_lvr": 0.6985136270523071, "loss_mode_switch": 0.0, "loss_total": 0.19721287488937378, "step": 1934 }, { "batch_size": 4, "epoch": 0.7736, "step": 1934, "tokens_per_device": 9396 }, { "epoch": 0.7736, "loss_ce": 0.20589031279087067, "loss_lvr": 0.7948556542396545, "loss_mode_switch": 0.0, "loss_total": 0.2853758931159973, "step": 1934 }, { "batch_size": 4, "epoch": 0.7736, "step": 1934, "tokens_per_device": 3804 }, { "epoch": 0.7736, "loss_ce": 0.06576196104288101, "loss_lvr": 0.9534028172492981, "loss_mode_switch": 0.0, "loss_total": 0.16110223531723022, "step": 1934 }, { "batch_size": 1, "epoch": 0.7736, "step": 1934, "tokens_per_device": 5153 }, { "epoch": 0.7736, "loss_ce": 0.00013376440620049834, "loss_lvr": 0.41217753291130066, "loss_mode_switch": 0.0, "loss_total": 0.04135151952505112, "step": 1934 }, { "batch_size": 4, "epoch": 0.7736, "step": 1934, "tokens_per_device": 2136 }, { "epoch": 0.7736, "loss_ce": 0.43855521082878113, "loss_lvr": 0.8730701804161072, "loss_mode_switch": 0.0, "loss_total": 0.5258622169494629, "step": 1934 }, { "epoch": 0.774, "grad_norm": 1.2216129302978516, "learning_rate": 1.2806661214827286e-06, "loss": 0.2265, "step": 1935 }, { "batch_size": 1, "epoch": 0.774, "step": 1935, "tokens_per_device": 4752 }, { "epoch": 0.774, "loss_ce": 0.17052797973155975, "loss_lvr": 0.3328600823879242, "loss_mode_switch": 0.0, "loss_total": 0.20381398499011993, "step": 1935 }, { "batch_size": 4, "epoch": 0.774, "step": 1935, "tokens_per_device": 5444 }, { "epoch": 0.774, "loss_ce": 0.12316446751356125, "loss_lvr": 0.8510774374008179, "loss_mode_switch": 0.0, "loss_total": 0.20827221870422363, "step": 1935 }, { "batch_size": 4, "epoch": 0.774, "step": 1935, "tokens_per_device": 5488 }, { "epoch": 0.774, "loss_ce": 0.4982088804244995, "loss_lvr": 0.5549007654190063, "loss_mode_switch": 0.0, "loss_total": 0.5536989569664001, "step": 1935 }, { "batch_size": 4, "epoch": 0.774, "step": 1935, "tokens_per_device": 4384 }, { "epoch": 0.774, "loss_ce": 0.2714827358722687, "loss_lvr": 0.8370512127876282, "loss_mode_switch": 0.0, "loss_total": 0.35518786311149597, "step": 1935 }, { "batch_size": 4, "epoch": 0.774, "step": 1935, "tokens_per_device": 3728 }, { "epoch": 0.774, "loss_ce": 0.5230427384376526, "loss_lvr": 1.158593773841858, "loss_mode_switch": 0.0, "loss_total": 0.6389021277427673, "step": 1935 }, { "batch_size": 4, "epoch": 0.774, "step": 1935, "tokens_per_device": 2660 }, { "epoch": 0.774, "loss_ce": 0.12278050929307938, "loss_lvr": 0.8371267914772034, "loss_mode_switch": 0.0, "loss_total": 0.20649319887161255, "step": 1935 }, { "batch_size": 4, "epoch": 0.774, "step": 1935, "tokens_per_device": 1444 }, { "epoch": 0.774, "loss_ce": 0.5813374519348145, "loss_lvr": 0.8945844173431396, "loss_mode_switch": 0.0, "loss_total": 0.6707959175109863, "step": 1935 }, { "batch_size": 1, "epoch": 0.774, "step": 1935, "tokens_per_device": 5278 }, { "epoch": 0.774, "loss_ce": 0.015027872286736965, "loss_lvr": 0.20997962355613708, "loss_mode_switch": 0.0, "loss_total": 0.03602583333849907, "step": 1935 }, { "epoch": 0.7744, "grad_norm": 1.5302625894546509, "learning_rate": 1.2763401425617134e-06, "loss": 0.3205, "step": 1936 }, { "batch_size": 4, "epoch": 0.7744, "step": 1936, "tokens_per_device": 4208 }, { "epoch": 0.7744, "loss_ce": 0.3574026823043823, "loss_lvr": 0.9315029382705688, "loss_mode_switch": 0.0, "loss_total": 0.45055297017097473, "step": 1936 }, { "batch_size": 1, "epoch": 0.7744, "step": 1936, "tokens_per_device": 4894 }, { "epoch": 0.7744, "loss_ce": 0.21290850639343262, "loss_lvr": 0.19007350504398346, "loss_mode_switch": 0.0, "loss_total": 0.23191586136817932, "step": 1936 }, { "batch_size": 1, "epoch": 0.7744, "step": 1936, "tokens_per_device": 4880 }, { "epoch": 0.7744, "loss_ce": 0.23075424134731293, "loss_lvr": 0.2981697916984558, "loss_mode_switch": 0.0, "loss_total": 0.2605712115764618, "step": 1936 }, { "batch_size": 4, "epoch": 0.7744, "step": 1936, "tokens_per_device": 4420 }, { "epoch": 0.7744, "loss_ce": 0.5630537867546082, "loss_lvr": 0.8835128545761108, "loss_mode_switch": 0.0, "loss_total": 0.6514050960540771, "step": 1936 }, { "batch_size": 4, "epoch": 0.7744, "step": 1936, "tokens_per_device": 3724 }, { "epoch": 0.7744, "loss_ce": 0.6850718855857849, "loss_lvr": 0.9671128392219543, "loss_mode_switch": 0.0, "loss_total": 0.7817831635475159, "step": 1936 }, { "batch_size": 1, "epoch": 0.7744, "step": 1936, "tokens_per_device": 6310 }, { "epoch": 0.7744, "loss_ce": 0.028714656829833984, "loss_lvr": 0.7650578618049622, "loss_mode_switch": 0.0, "loss_total": 0.10522044450044632, "step": 1936 }, { "batch_size": 4, "epoch": 0.7744, "step": 1936, "tokens_per_device": 4288 }, { "epoch": 0.7744, "loss_ce": 0.17780137062072754, "loss_lvr": 0.6285462975502014, "loss_mode_switch": 0.0, "loss_total": 0.24065600335597992, "step": 1936 }, { "batch_size": 4, "epoch": 0.7744, "step": 1936, "tokens_per_device": 4072 }, { "epoch": 0.7744, "loss_ce": 0.21336188912391663, "loss_lvr": 1.117396354675293, "loss_mode_switch": 0.0, "loss_total": 0.3251015245914459, "step": 1936 }, { "epoch": 0.7748, "grad_norm": 1.474931240081787, "learning_rate": 1.2720204131540693e-06, "loss": 0.3169, "step": 1937 }, { "batch_size": 4, "epoch": 0.7748, "step": 1937, "tokens_per_device": 3992 }, { "epoch": 0.7748, "loss_ce": 0.2792147099971771, "loss_lvr": 1.1291989088058472, "loss_mode_switch": 0.0, "loss_total": 0.3921346068382263, "step": 1937 }, { "batch_size": 4, "epoch": 0.7748, "step": 1937, "tokens_per_device": 13296 }, { "epoch": 0.7748, "loss_ce": 0.0477309413254261, "loss_lvr": 0.7110980153083801, "loss_mode_switch": 0.0, "loss_total": 0.11884073913097382, "step": 1937 }, { "batch_size": 1, "epoch": 0.7748, "step": 1937, "tokens_per_device": 4916 }, { "epoch": 0.7748, "loss_ce": 0.0027673994190990925, "loss_lvr": 0.8126041293144226, "loss_mode_switch": 0.0, "loss_total": 0.08402781188488007, "step": 1937 }, { "batch_size": 1, "epoch": 0.7748, "step": 1937, "tokens_per_device": 4859 }, { "epoch": 0.7748, "loss_ce": 0.09024931490421295, "loss_lvr": 0.4174145758152008, "loss_mode_switch": 0.0, "loss_total": 0.13199077546596527, "step": 1937 }, { "batch_size": 1, "epoch": 0.7748, "step": 1937, "tokens_per_device": 4965 }, { "epoch": 0.7748, "loss_ce": 0.06379058957099915, "loss_lvr": 0.4989974796772003, "loss_mode_switch": 0.0, "loss_total": 0.1136903390288353, "step": 1937 }, { "batch_size": 1, "epoch": 0.7748, "step": 1937, "tokens_per_device": 5105 }, { "epoch": 0.7748, "loss_ce": 0.019414570182561874, "loss_lvr": 0.3484726548194885, "loss_mode_switch": 0.0, "loss_total": 0.054261837154626846, "step": 1937 }, { "batch_size": 4, "epoch": 0.7748, "step": 1937, "tokens_per_device": 5376 }, { "epoch": 0.7748, "loss_ce": 0.5500419735908508, "loss_lvr": 0.9626671671867371, "loss_mode_switch": 0.0, "loss_total": 0.6463086605072021, "step": 1937 }, { "batch_size": 4, "epoch": 0.7748, "step": 1937, "tokens_per_device": 3800 }, { "epoch": 0.7748, "loss_ce": 0.010057678446173668, "loss_lvr": 0.8028467893600464, "loss_mode_switch": 0.0, "loss_total": 0.09034235775470734, "step": 1937 }, { "epoch": 0.7752, "grad_norm": 1.3803277015686035, "learning_rate": 1.2677069405097115e-06, "loss": 0.3033, "step": 1938 }, { "batch_size": 4, "epoch": 0.7752, "step": 1938, "tokens_per_device": 3800 }, { "epoch": 0.7752, "loss_ce": 0.4074476957321167, "loss_lvr": 1.0493314266204834, "loss_mode_switch": 0.0, "loss_total": 0.512380838394165, "step": 1938 }, { "batch_size": 1, "epoch": 0.7752, "step": 1938, "tokens_per_device": 5390 }, { "epoch": 0.7752, "loss_ce": 0.22610116004943848, "loss_lvr": 0.5105319619178772, "loss_mode_switch": 0.0, "loss_total": 0.2771543562412262, "step": 1938 }, { "batch_size": 4, "epoch": 0.7752, "step": 1938, "tokens_per_device": 2616 }, { "epoch": 0.7752, "loss_ce": 0.15879733860492706, "loss_lvr": 0.8586100339889526, "loss_mode_switch": 0.0, "loss_total": 0.24465835094451904, "step": 1938 }, { "batch_size": 4, "epoch": 0.7752, "step": 1938, "tokens_per_device": 4268 }, { "epoch": 0.7752, "loss_ce": 0.6304699778556824, "loss_lvr": 0.7937342524528503, "loss_mode_switch": 0.0, "loss_total": 0.7098433971405029, "step": 1938 }, { "batch_size": 4, "epoch": 0.7752, "step": 1938, "tokens_per_device": 1444 }, { "epoch": 0.7752, "loss_ce": 0.26289933919906616, "loss_lvr": 1.7272909879684448, "loss_mode_switch": 0.0, "loss_total": 0.4356284439563751, "step": 1938 }, { "batch_size": 4, "epoch": 0.7752, "step": 1938, "tokens_per_device": 6204 }, { "epoch": 0.7752, "loss_ce": 0.27912774682044983, "loss_lvr": 0.5959723591804504, "loss_mode_switch": 0.0, "loss_total": 0.3387249708175659, "step": 1938 }, { "batch_size": 4, "epoch": 0.7752, "step": 1938, "tokens_per_device": 11024 }, { "epoch": 0.7752, "loss_ce": 0.18912972509860992, "loss_lvr": 0.6713615655899048, "loss_mode_switch": 0.0, "loss_total": 0.25626587867736816, "step": 1938 }, { "batch_size": 4, "epoch": 0.7752, "step": 1938, "tokens_per_device": 5420 }, { "epoch": 0.7752, "loss_ce": 0.1496545523405075, "loss_lvr": 0.7309490442276001, "loss_mode_switch": 0.0, "loss_total": 0.22274945676326752, "step": 1938 }, { "epoch": 0.7756, "grad_norm": 1.5216482877731323, "learning_rate": 1.2633997318680496e-06, "loss": 0.3039, "step": 1939 }, { "batch_size": 1, "epoch": 0.7756, "step": 1939, "tokens_per_device": 5082 }, { "epoch": 0.7756, "loss_ce": 0.5836422443389893, "loss_lvr": 0.5938937067985535, "loss_mode_switch": 0.0, "loss_total": 0.6430315971374512, "step": 1939 }, { "batch_size": 4, "epoch": 0.7756, "step": 1939, "tokens_per_device": 1908 }, { "epoch": 0.7756, "loss_ce": 0.5657609105110168, "loss_lvr": 0.8534539937973022, "loss_mode_switch": 0.0, "loss_total": 0.6511062979698181, "step": 1939 }, { "batch_size": 1, "epoch": 0.7756, "step": 1939, "tokens_per_device": 5429 }, { "epoch": 0.7756, "loss_ce": 0.018788229674100876, "loss_lvr": 0.40536317229270935, "loss_mode_switch": 0.0, "loss_total": 0.05932454764842987, "step": 1939 }, { "batch_size": 4, "epoch": 0.7756, "step": 1939, "tokens_per_device": 5756 }, { "epoch": 0.7756, "loss_ce": 0.42283767461776733, "loss_lvr": 0.8119592070579529, "loss_mode_switch": 0.0, "loss_total": 0.5040335655212402, "step": 1939 }, { "batch_size": 1, "epoch": 0.7756, "step": 1939, "tokens_per_device": 4909 }, { "epoch": 0.7756, "loss_ce": 0.09170167148113251, "loss_lvr": 0.33764368295669556, "loss_mode_switch": 0.0, "loss_total": 0.12546604871749878, "step": 1939 }, { "batch_size": 4, "epoch": 0.7756, "step": 1939, "tokens_per_device": 9416 }, { "epoch": 0.7756, "loss_ce": 0.5380197167396545, "loss_lvr": 0.49049708247184753, "loss_mode_switch": 0.0, "loss_total": 0.5870694518089294, "step": 1939 }, { "batch_size": 4, "epoch": 0.7756, "step": 1939, "tokens_per_device": 3940 }, { "epoch": 0.7756, "loss_ce": 0.29255637526512146, "loss_lvr": 0.8611245155334473, "loss_mode_switch": 0.0, "loss_total": 0.3786688446998596, "step": 1939 }, { "batch_size": 4, "epoch": 0.7756, "step": 1939, "tokens_per_device": 3996 }, { "epoch": 0.7756, "loss_ce": 0.29797714948654175, "loss_lvr": 0.7942774891853333, "loss_mode_switch": 0.0, "loss_total": 0.3774048984050751, "step": 1939 }, { "epoch": 0.776, "grad_norm": 1.3237254619598389, "learning_rate": 1.2590987944579808e-06, "loss": 0.3105, "step": 1940 }, { "batch_size": 4, "epoch": 0.776, "step": 1940, "tokens_per_device": 5336 }, { "epoch": 0.776, "loss_ce": 0.013547574169933796, "loss_lvr": 0.7047079801559448, "loss_mode_switch": 0.0, "loss_total": 0.08401837944984436, "step": 1940 }, { "batch_size": 4, "epoch": 0.776, "step": 1940, "tokens_per_device": 5048 }, { "epoch": 0.776, "loss_ce": 0.05566482990980148, "loss_lvr": 0.7458456754684448, "loss_mode_switch": 0.0, "loss_total": 0.13024939596652985, "step": 1940 }, { "batch_size": 4, "epoch": 0.776, "step": 1940, "tokens_per_device": 1448 }, { "epoch": 0.776, "loss_ce": 0.35569605231285095, "loss_lvr": 0.9159722924232483, "loss_mode_switch": 0.0, "loss_total": 0.4472932815551758, "step": 1940 }, { "batch_size": 4, "epoch": 0.776, "step": 1940, "tokens_per_device": 4928 }, { "epoch": 0.776, "loss_ce": 0.44341906905174255, "loss_lvr": 0.7425599694252014, "loss_mode_switch": 0.0, "loss_total": 0.5176750421524048, "step": 1940 }, { "batch_size": 4, "epoch": 0.776, "step": 1940, "tokens_per_device": 6352 }, { "epoch": 0.776, "loss_ce": 0.02305671013891697, "loss_lvr": 0.6739043593406677, "loss_mode_switch": 0.0, "loss_total": 0.09044714272022247, "step": 1940 }, { "batch_size": 4, "epoch": 0.776, "step": 1940, "tokens_per_device": 4696 }, { "epoch": 0.776, "loss_ce": 0.27869778871536255, "loss_lvr": 0.7030078172683716, "loss_mode_switch": 0.0, "loss_total": 0.3489985764026642, "step": 1940 }, { "batch_size": 4, "epoch": 0.776, "step": 1940, "tokens_per_device": 6372 }, { "epoch": 0.776, "loss_ce": 0.06413101404905319, "loss_lvr": 0.6323780417442322, "loss_mode_switch": 0.0, "loss_total": 0.12736882269382477, "step": 1940 }, { "batch_size": 4, "epoch": 0.776, "step": 1940, "tokens_per_device": 5328 }, { "epoch": 0.776, "loss_ce": 0.33254262804985046, "loss_lvr": 0.9086167216300964, "loss_mode_switch": 0.0, "loss_total": 0.4234043061733246, "step": 1940 }, { "epoch": 0.7764, "grad_norm": 1.2997530698776245, "learning_rate": 1.254804135497879e-06, "loss": 0.3093, "step": 1941 }, { "batch_size": 4, "epoch": 0.7764, "step": 1941, "tokens_per_device": 3972 }, { "epoch": 0.7764, "loss_ce": 0.10549592971801758, "loss_lvr": 1.0301605463027954, "loss_mode_switch": 0.0, "loss_total": 0.20851197838783264, "step": 1941 }, { "batch_size": 4, "epoch": 0.7764, "step": 1941, "tokens_per_device": 1676 }, { "epoch": 0.7764, "loss_ce": 0.6495932936668396, "loss_lvr": 0.9127170443534851, "loss_mode_switch": 0.0, "loss_total": 0.7408649921417236, "step": 1941 }, { "batch_size": 4, "epoch": 0.7764, "step": 1941, "tokens_per_device": 4128 }, { "epoch": 0.7764, "loss_ce": 0.23748263716697693, "loss_lvr": 0.7579872608184814, "loss_mode_switch": 0.0, "loss_total": 0.3132813572883606, "step": 1941 }, { "batch_size": 4, "epoch": 0.7764, "step": 1941, "tokens_per_device": 4060 }, { "epoch": 0.7764, "loss_ce": 0.04754344001412392, "loss_lvr": 0.6024092435836792, "loss_mode_switch": 0.0, "loss_total": 0.10778436064720154, "step": 1941 }, { "batch_size": 4, "epoch": 0.7764, "step": 1941, "tokens_per_device": 3716 }, { "epoch": 0.7764, "loss_ce": 0.34934163093566895, "loss_lvr": 0.9180805087089539, "loss_mode_switch": 0.0, "loss_total": 0.44114968180656433, "step": 1941 }, { "batch_size": 1, "epoch": 0.7764, "step": 1941, "tokens_per_device": 4871 }, { "epoch": 0.7764, "loss_ce": 0.0003184709057677537, "loss_lvr": 0.24483048915863037, "loss_mode_switch": 0.0, "loss_total": 0.024801520630717278, "step": 1941 }, { "batch_size": 4, "epoch": 0.7764, "step": 1941, "tokens_per_device": 6068 }, { "epoch": 0.7764, "loss_ce": 0.2631295323371887, "loss_lvr": 0.7938447594642639, "loss_mode_switch": 0.0, "loss_total": 0.3425140082836151, "step": 1941 }, { "batch_size": 1, "epoch": 0.7764, "step": 1941, "tokens_per_device": 4937 }, { "epoch": 0.7764, "loss_ce": 0.06051848083734512, "loss_lvr": 0.7145810723304749, "loss_mode_switch": 0.0, "loss_total": 0.13197658956050873, "step": 1941 }, { "epoch": 0.7768, "grad_norm": 1.1487882137298584, "learning_rate": 1.2505157621955815e-06, "loss": 0.2504, "step": 1942 }, { "batch_size": 1, "epoch": 0.7768, "step": 1942, "tokens_per_device": 9986 }, { "epoch": 0.7768, "loss_ce": 0.0009457177366130054, "loss_lvr": 0.37539908289909363, "loss_mode_switch": 0.0, "loss_total": 0.03848562762141228, "step": 1942 }, { "batch_size": 4, "epoch": 0.7768, "step": 1942, "tokens_per_device": 4224 }, { "epoch": 0.7768, "loss_ce": 0.37456029653549194, "loss_lvr": 0.7128133177757263, "loss_mode_switch": 0.0, "loss_total": 0.44584164023399353, "step": 1942 }, { "batch_size": 1, "epoch": 0.7768, "step": 1942, "tokens_per_device": 6808 }, { "epoch": 0.7768, "loss_ce": 0.00023501434770878404, "loss_lvr": 0.31854966282844543, "loss_mode_switch": 0.0, "loss_total": 0.03208998218178749, "step": 1942 }, { "batch_size": 4, "epoch": 0.7768, "step": 1942, "tokens_per_device": 9652 }, { "epoch": 0.7768, "loss_ce": 0.6282406449317932, "loss_lvr": 0.8319717049598694, "loss_mode_switch": 0.0, "loss_total": 0.7114378213882446, "step": 1942 }, { "batch_size": 4, "epoch": 0.7768, "step": 1942, "tokens_per_device": 5720 }, { "epoch": 0.7768, "loss_ce": 0.732643187046051, "loss_lvr": 0.925040602684021, "loss_mode_switch": 0.0, "loss_total": 0.825147271156311, "step": 1942 }, { "batch_size": 4, "epoch": 0.7768, "step": 1942, "tokens_per_device": 5912 }, { "epoch": 0.7768, "loss_ce": 0.22464971244335175, "loss_lvr": 0.8215509653091431, "loss_mode_switch": 0.0, "loss_total": 0.3068048059940338, "step": 1942 }, { "batch_size": 4, "epoch": 0.7768, "step": 1942, "tokens_per_device": 6492 }, { "epoch": 0.7768, "loss_ce": 0.022961562499403954, "loss_lvr": 0.6412093639373779, "loss_mode_switch": 0.0, "loss_total": 0.08708250522613525, "step": 1942 }, { "batch_size": 4, "epoch": 0.7768, "step": 1942, "tokens_per_device": 3960 }, { "epoch": 0.7768, "loss_ce": 0.5419889688491821, "loss_lvr": 0.9508333802223206, "loss_mode_switch": 0.0, "loss_total": 0.6370723247528076, "step": 1942 }, { "epoch": 0.7772, "grad_norm": 1.2756989002227783, "learning_rate": 1.2462336817483734e-06, "loss": 0.3163, "step": 1943 }, { "batch_size": 4, "epoch": 0.7772, "step": 1943, "tokens_per_device": 4236 }, { "epoch": 0.7772, "loss_ce": 0.03197856992483139, "loss_lvr": 0.5905793309211731, "loss_mode_switch": 0.0, "loss_total": 0.09103649854660034, "step": 1943 }, { "batch_size": 1, "epoch": 0.7772, "step": 1943, "tokens_per_device": 4922 }, { "epoch": 0.7772, "loss_ce": 0.21790941059589386, "loss_lvr": 0.3685627579689026, "loss_mode_switch": 0.0, "loss_total": 0.25476568937301636, "step": 1943 }, { "batch_size": 1, "epoch": 0.7772, "step": 1943, "tokens_per_device": 4627 }, { "epoch": 0.7772, "loss_ce": 0.005098253954201937, "loss_lvr": 0.2867130637168884, "loss_mode_switch": 0.0, "loss_total": 0.03376956284046173, "step": 1943 }, { "batch_size": 4, "epoch": 0.7772, "step": 1943, "tokens_per_device": 4328 }, { "epoch": 0.7772, "loss_ce": 0.18778707087039948, "loss_lvr": 0.7309862375259399, "loss_mode_switch": 0.0, "loss_total": 0.26088568568229675, "step": 1943 }, { "batch_size": 1, "epoch": 0.7772, "step": 1943, "tokens_per_device": 4902 }, { "epoch": 0.7772, "loss_ce": 0.01931905187666416, "loss_lvr": 0.7619665861129761, "loss_mode_switch": 0.0, "loss_total": 0.09551571309566498, "step": 1943 }, { "batch_size": 1, "epoch": 0.7772, "step": 1943, "tokens_per_device": 4699 }, { "epoch": 0.7772, "loss_ce": 0.19226916134357452, "loss_lvr": 0.34570351243019104, "loss_mode_switch": 0.0, "loss_total": 0.22683951258659363, "step": 1943 }, { "batch_size": 1, "epoch": 0.7772, "step": 1943, "tokens_per_device": 5223 }, { "epoch": 0.7772, "loss_ce": 0.1303049623966217, "loss_lvr": 0.6022659540176392, "loss_mode_switch": 0.0, "loss_total": 0.19053155183792114, "step": 1943 }, { "batch_size": 4, "epoch": 0.7772, "step": 1943, "tokens_per_device": 5860 }, { "epoch": 0.7772, "loss_ce": 0.2988937199115753, "loss_lvr": 0.7039506435394287, "loss_mode_switch": 0.0, "loss_total": 0.3692888021469116, "step": 1943 }, { "epoch": 0.7776, "grad_norm": 1.4412516355514526, "learning_rate": 1.2419579013429795e-06, "loss": 0.3034, "step": 1944 }, { "batch_size": 1, "epoch": 0.7776, "step": 1944, "tokens_per_device": 4878 }, { "epoch": 0.7776, "loss_ce": 0.018411653116345406, "loss_lvr": 0.5416707992553711, "loss_mode_switch": 0.0, "loss_total": 0.07257873564958572, "step": 1944 }, { "batch_size": 1, "epoch": 0.7776, "step": 1944, "tokens_per_device": 5632 }, { "epoch": 0.7776, "loss_ce": 0.10105592012405396, "loss_lvr": 0.9113149642944336, "loss_mode_switch": 0.0, "loss_total": 0.19218742847442627, "step": 1944 }, { "batch_size": 1, "epoch": 0.7776, "step": 1944, "tokens_per_device": 4863 }, { "epoch": 0.7776, "loss_ce": 0.01570914313197136, "loss_lvr": 0.23600172996520996, "loss_mode_switch": 0.0, "loss_total": 0.039309315383434296, "step": 1944 }, { "batch_size": 1, "epoch": 0.7776, "step": 1944, "tokens_per_device": 4935 }, { "epoch": 0.7776, "loss_ce": 0.0036692852154374123, "loss_lvr": 0.4224996268749237, "loss_mode_switch": 0.0, "loss_total": 0.04591924697160721, "step": 1944 }, { "batch_size": 1, "epoch": 0.7776, "step": 1944, "tokens_per_device": 4679 }, { "epoch": 0.7776, "loss_ce": 0.019046226516366005, "loss_lvr": 0.2706812024116516, "loss_mode_switch": 0.0, "loss_total": 0.046114347875118256, "step": 1944 }, { "batch_size": 4, "epoch": 0.7776, "step": 1944, "tokens_per_device": 3832 }, { "epoch": 0.7776, "loss_ce": 0.6685390472412109, "loss_lvr": 0.9902328252792358, "loss_mode_switch": 0.0, "loss_total": 0.7675623297691345, "step": 1944 }, { "batch_size": 4, "epoch": 0.7776, "step": 1944, "tokens_per_device": 1740 }, { "epoch": 0.7776, "loss_ce": 0.1906059980392456, "loss_lvr": 0.7120695114135742, "loss_mode_switch": 0.0, "loss_total": 0.2618129551410675, "step": 1944 }, { "batch_size": 4, "epoch": 0.7776, "step": 1944, "tokens_per_device": 5500 }, { "epoch": 0.7776, "loss_ce": 0.21387219429016113, "loss_lvr": 0.55106520652771, "loss_mode_switch": 0.0, "loss_total": 0.26897871494293213, "step": 1944 }, { "epoch": 0.778, "grad_norm": 1.2059001922607422, "learning_rate": 1.2376884281555485e-06, "loss": 0.2629, "step": 1945 }, { "batch_size": 4, "epoch": 0.778, "step": 1945, "tokens_per_device": 4732 }, { "epoch": 0.778, "loss_ce": 0.3037348985671997, "loss_lvr": 0.8847774863243103, "loss_mode_switch": 0.0, "loss_total": 0.3922126591205597, "step": 1945 }, { "batch_size": 4, "epoch": 0.778, "step": 1945, "tokens_per_device": 6040 }, { "epoch": 0.778, "loss_ce": 0.11988736689090729, "loss_lvr": 0.7318185567855835, "loss_mode_switch": 0.0, "loss_total": 0.1930692195892334, "step": 1945 }, { "batch_size": 4, "epoch": 0.778, "step": 1945, "tokens_per_device": 4784 }, { "epoch": 0.778, "loss_ce": 0.5724819898605347, "loss_lvr": 0.7788994312286377, "loss_mode_switch": 0.0, "loss_total": 0.6503719091415405, "step": 1945 }, { "batch_size": 4, "epoch": 0.778, "step": 1945, "tokens_per_device": 4192 }, { "epoch": 0.778, "loss_ce": 0.31341949105262756, "loss_lvr": 0.6734575629234314, "loss_mode_switch": 0.0, "loss_total": 0.38076525926589966, "step": 1945 }, { "batch_size": 4, "epoch": 0.778, "step": 1945, "tokens_per_device": 4344 }, { "epoch": 0.778, "loss_ce": 0.20147572457790375, "loss_lvr": 0.8666491508483887, "loss_mode_switch": 0.0, "loss_total": 0.2881406545639038, "step": 1945 }, { "batch_size": 1, "epoch": 0.778, "step": 1945, "tokens_per_device": 4994 }, { "epoch": 0.778, "loss_ce": 0.6688164472579956, "loss_lvr": 1.040700078010559, "loss_mode_switch": 0.0, "loss_total": 0.7728864550590515, "step": 1945 }, { "batch_size": 4, "epoch": 0.778, "step": 1945, "tokens_per_device": 5256 }, { "epoch": 0.778, "loss_ce": 0.513252854347229, "loss_lvr": 0.8819464445114136, "loss_mode_switch": 0.0, "loss_total": 0.6014475226402283, "step": 1945 }, { "batch_size": 4, "epoch": 0.778, "step": 1945, "tokens_per_device": 3988 }, { "epoch": 0.778, "loss_ce": 0.1989128589630127, "loss_lvr": 0.7508674263954163, "loss_mode_switch": 0.0, "loss_total": 0.2739996016025543, "step": 1945 }, { "epoch": 0.7784, "grad_norm": 1.3509700298309326, "learning_rate": 1.2334252693516512e-06, "loss": 0.316, "step": 1946 }, { "batch_size": 4, "epoch": 0.7784, "step": 1946, "tokens_per_device": 4672 }, { "epoch": 0.7784, "loss_ce": 0.11188357323408127, "loss_lvr": 0.7390852570533752, "loss_mode_switch": 0.0, "loss_total": 0.18579210340976715, "step": 1946 }, { "batch_size": 4, "epoch": 0.7784, "step": 1946, "tokens_per_device": 4852 }, { "epoch": 0.7784, "loss_ce": 0.0036152638494968414, "loss_lvr": 0.84022456407547, "loss_mode_switch": 0.0, "loss_total": 0.08763772249221802, "step": 1946 }, { "batch_size": 1, "epoch": 0.7784, "step": 1946, "tokens_per_device": 5035 }, { "epoch": 0.7784, "loss_ce": 0.015392666682600975, "loss_lvr": 0.4624336063861847, "loss_mode_switch": 0.0, "loss_total": 0.06163603067398071, "step": 1946 }, { "batch_size": 4, "epoch": 0.7784, "step": 1946, "tokens_per_device": 3176 }, { "epoch": 0.7784, "loss_ce": 0.178781196475029, "loss_lvr": 0.9563222527503967, "loss_mode_switch": 0.0, "loss_total": 0.27441340684890747, "step": 1946 }, { "batch_size": 4, "epoch": 0.7784, "step": 1946, "tokens_per_device": 4552 }, { "epoch": 0.7784, "loss_ce": 0.20207367837429047, "loss_lvr": 0.7455248236656189, "loss_mode_switch": 0.0, "loss_total": 0.2766261696815491, "step": 1946 }, { "batch_size": 4, "epoch": 0.7784, "step": 1946, "tokens_per_device": 4284 }, { "epoch": 0.7784, "loss_ce": 0.14511747658252716, "loss_lvr": 0.8202459216117859, "loss_mode_switch": 0.0, "loss_total": 0.2271420657634735, "step": 1946 }, { "batch_size": 1, "epoch": 0.7784, "step": 1946, "tokens_per_device": 4821 }, { "epoch": 0.7784, "loss_ce": 0.05465241149067879, "loss_lvr": 0.20592333376407623, "loss_mode_switch": 0.0, "loss_total": 0.07524474710226059, "step": 1946 }, { "batch_size": 4, "epoch": 0.7784, "step": 1946, "tokens_per_device": 4268 }, { "epoch": 0.7784, "loss_ce": 0.42808976769447327, "loss_lvr": 0.9861646890640259, "loss_mode_switch": 0.0, "loss_total": 0.5267062187194824, "step": 1946 }, { "epoch": 0.7788, "grad_norm": 1.2273873090744019, "learning_rate": 1.229168432086254e-06, "loss": 0.2552, "step": 1947 }, { "batch_size": 4, "epoch": 0.7788, "step": 1947, "tokens_per_device": 8336 }, { "epoch": 0.7788, "loss_ce": 0.15410612523555756, "loss_lvr": 0.5289078950881958, "loss_mode_switch": 0.0, "loss_total": 0.20699691772460938, "step": 1947 }, { "batch_size": 4, "epoch": 0.7788, "step": 1947, "tokens_per_device": 3916 }, { "epoch": 0.7788, "loss_ce": 0.0715523362159729, "loss_lvr": 0.901081919670105, "loss_mode_switch": 0.0, "loss_total": 0.16166052222251892, "step": 1947 }, { "batch_size": 4, "epoch": 0.7788, "step": 1947, "tokens_per_device": 4432 }, { "epoch": 0.7788, "loss_ce": 0.5035311579704285, "loss_lvr": 0.9048497676849365, "loss_mode_switch": 0.0, "loss_total": 0.5940161347389221, "step": 1947 }, { "batch_size": 4, "epoch": 0.7788, "step": 1947, "tokens_per_device": 4200 }, { "epoch": 0.7788, "loss_ce": 0.4095706343650818, "loss_lvr": 0.7517039179801941, "loss_mode_switch": 0.0, "loss_total": 0.4847410321235657, "step": 1947 }, { "batch_size": 4, "epoch": 0.7788, "step": 1947, "tokens_per_device": 1272 }, { "epoch": 0.7788, "loss_ce": 0.26997026801109314, "loss_lvr": 0.7744160890579224, "loss_mode_switch": 0.0, "loss_total": 0.3474118709564209, "step": 1947 }, { "batch_size": 4, "epoch": 0.7788, "step": 1947, "tokens_per_device": 5060 }, { "epoch": 0.7788, "loss_ce": 0.5140050053596497, "loss_lvr": 0.871961772441864, "loss_mode_switch": 0.0, "loss_total": 0.6012011766433716, "step": 1947 }, { "batch_size": 1, "epoch": 0.7788, "step": 1947, "tokens_per_device": 4883 }, { "epoch": 0.7788, "loss_ce": 0.05661831423640251, "loss_lvr": 0.37646710872650146, "loss_mode_switch": 0.0, "loss_total": 0.09426502883434296, "step": 1947 }, { "batch_size": 1, "epoch": 0.7788, "step": 1947, "tokens_per_device": 4853 }, { "epoch": 0.7788, "loss_ce": 0.0032418377231806517, "loss_lvr": 0.2654106616973877, "loss_mode_switch": 0.0, "loss_total": 0.02978290431201458, "step": 1947 }, { "epoch": 0.7792, "grad_norm": 1.3151172399520874, "learning_rate": 1.224917923503715e-06, "loss": 0.3022, "step": 1948 }, { "batch_size": 4, "epoch": 0.7792, "step": 1948, "tokens_per_device": 10708 }, { "epoch": 0.7792, "loss_ce": 0.6209936141967773, "loss_lvr": 0.8309627175331116, "loss_mode_switch": 0.0, "loss_total": 0.704089879989624, "step": 1948 }, { "batch_size": 4, "epoch": 0.7792, "step": 1948, "tokens_per_device": 4528 }, { "epoch": 0.7792, "loss_ce": 0.1256933957338333, "loss_lvr": 0.7259821891784668, "loss_mode_switch": 0.0, "loss_total": 0.19829161465168, "step": 1948 }, { "batch_size": 4, "epoch": 0.7792, "step": 1948, "tokens_per_device": 1368 }, { "epoch": 0.7792, "loss_ce": 0.8103640675544739, "loss_lvr": 0.8490647077560425, "loss_mode_switch": 0.0, "loss_total": 0.8952705264091492, "step": 1948 }, { "batch_size": 4, "epoch": 0.7792, "step": 1948, "tokens_per_device": 3788 }, { "epoch": 0.7792, "loss_ce": 0.6515958905220032, "loss_lvr": 0.8522506356239319, "loss_mode_switch": 0.0, "loss_total": 0.7368209362030029, "step": 1948 }, { "batch_size": 4, "epoch": 0.7792, "step": 1948, "tokens_per_device": 6868 }, { "epoch": 0.7792, "loss_ce": 0.34722352027893066, "loss_lvr": 0.8420607447624207, "loss_mode_switch": 0.0, "loss_total": 0.43142959475517273, "step": 1948 }, { "batch_size": 4, "epoch": 0.7792, "step": 1948, "tokens_per_device": 2608 }, { "epoch": 0.7792, "loss_ce": 0.6276342272758484, "loss_lvr": 0.7705209851264954, "loss_mode_switch": 0.0, "loss_total": 0.7046863436698914, "step": 1948 }, { "batch_size": 1, "epoch": 0.7792, "step": 1948, "tokens_per_device": 4913 }, { "epoch": 0.7792, "loss_ce": 0.0003979799512308091, "loss_lvr": 0.25434359908103943, "loss_mode_switch": 0.0, "loss_total": 0.025832340121269226, "step": 1948 }, { "batch_size": 4, "epoch": 0.7792, "step": 1948, "tokens_per_device": 4256 }, { "epoch": 0.7792, "loss_ce": 0.06343989074230194, "loss_lvr": 0.7290246486663818, "loss_mode_switch": 0.0, "loss_total": 0.1363423466682434, "step": 1948 }, { "epoch": 0.7796, "grad_norm": 1.311260461807251, "learning_rate": 1.2206737507377698e-06, "loss": 0.3088, "step": 1949 }, { "batch_size": 4, "epoch": 0.7796, "step": 1949, "tokens_per_device": 5200 }, { "epoch": 0.7796, "loss_ce": 0.07190272957086563, "loss_lvr": 0.7286058068275452, "loss_mode_switch": 0.0, "loss_total": 0.14476332068443298, "step": 1949 }, { "batch_size": 4, "epoch": 0.7796, "step": 1949, "tokens_per_device": 1312 }, { "epoch": 0.7796, "loss_ce": 0.36697641015052795, "loss_lvr": 0.9583607912063599, "loss_mode_switch": 0.0, "loss_total": 0.46281248331069946, "step": 1949 }, { "batch_size": 1, "epoch": 0.7796, "step": 1949, "tokens_per_device": 4877 }, { "epoch": 0.7796, "loss_ce": 0.003021570388227701, "loss_lvr": 0.22894254326820374, "loss_mode_switch": 0.0, "loss_total": 0.025915823876857758, "step": 1949 }, { "batch_size": 4, "epoch": 0.7796, "step": 1949, "tokens_per_device": 5192 }, { "epoch": 0.7796, "loss_ce": 0.04839572310447693, "loss_lvr": 0.7692998647689819, "loss_mode_switch": 0.0, "loss_total": 0.12532570958137512, "step": 1949 }, { "batch_size": 1, "epoch": 0.7796, "step": 1949, "tokens_per_device": 5171 }, { "epoch": 0.7796, "loss_ce": 0.06979845464229584, "loss_lvr": 0.5699938535690308, "loss_mode_switch": 0.0, "loss_total": 0.1267978399991989, "step": 1949 }, { "batch_size": 4, "epoch": 0.7796, "step": 1949, "tokens_per_device": 1304 }, { "epoch": 0.7796, "loss_ce": 0.0565854050219059, "loss_lvr": 1.0460304021835327, "loss_mode_switch": 0.0, "loss_total": 0.16118845343589783, "step": 1949 }, { "batch_size": 1, "epoch": 0.7796, "step": 1949, "tokens_per_device": 4879 }, { "epoch": 0.7796, "loss_ce": 0.26320788264274597, "loss_lvr": 0.22989584505558014, "loss_mode_switch": 0.0, "loss_total": 0.2861974537372589, "step": 1949 }, { "batch_size": 4, "epoch": 0.7796, "step": 1949, "tokens_per_device": 4388 }, { "epoch": 0.7796, "loss_ce": 0.0868711993098259, "loss_lvr": 0.8961372971534729, "loss_mode_switch": 0.0, "loss_total": 0.17648492753505707, "step": 1949 }, { "epoch": 0.78, "grad_norm": 1.2199602127075195, "learning_rate": 1.2164359209115235e-06, "loss": 0.2457, "step": 1950 }, { "batch_size": 1, "epoch": 0.78, "step": 1950, "tokens_per_device": 5058 }, { "epoch": 0.78, "loss_ce": 0.04605937749147415, "loss_lvr": 0.8875617384910583, "loss_mode_switch": 0.0, "loss_total": 0.1348155438899994, "step": 1950 }, { "batch_size": 1, "epoch": 0.78, "step": 1950, "tokens_per_device": 4884 }, { "epoch": 0.78, "loss_ce": 0.0006139362812973559, "loss_lvr": 0.3927363455295563, "loss_mode_switch": 0.0, "loss_total": 0.03988756984472275, "step": 1950 }, { "batch_size": 4, "epoch": 0.78, "step": 1950, "tokens_per_device": 4976 }, { "epoch": 0.78, "loss_ce": 0.36968404054641724, "loss_lvr": 0.6303580403327942, "loss_mode_switch": 0.0, "loss_total": 0.4327198565006256, "step": 1950 }, { "batch_size": 4, "epoch": 0.78, "step": 1950, "tokens_per_device": 8936 }, { "epoch": 0.78, "loss_ce": 0.01574431173503399, "loss_lvr": 0.5213887691497803, "loss_mode_switch": 0.0, "loss_total": 0.06788318604230881, "step": 1950 }, { "batch_size": 4, "epoch": 0.78, "step": 1950, "tokens_per_device": 2012 }, { "epoch": 0.78, "loss_ce": 0.3971033990383148, "loss_lvr": 0.9033859372138977, "loss_mode_switch": 0.0, "loss_total": 0.4874419867992401, "step": 1950 }, { "batch_size": 1, "epoch": 0.78, "step": 1950, "tokens_per_device": 6314 }, { "epoch": 0.78, "loss_ce": 0.00040041658212430775, "loss_lvr": 0.2953665554523468, "loss_mode_switch": 0.0, "loss_total": 0.029937071725726128, "step": 1950 }, { "batch_size": 4, "epoch": 0.78, "step": 1950, "tokens_per_device": 1448 }, { "epoch": 0.78, "loss_ce": 0.26583945751190186, "loss_lvr": 1.0088505744934082, "loss_mode_switch": 0.0, "loss_total": 0.36672452092170715, "step": 1950 }, { "batch_size": 4, "epoch": 0.78, "step": 1950, "tokens_per_device": 4440 }, { "epoch": 0.78, "loss_ce": 0.39407724142074585, "loss_lvr": 0.7874668836593628, "loss_mode_switch": 0.0, "loss_total": 0.4728239178657532, "step": 1950 }, { "epoch": 0.7804, "grad_norm": 1.2390296459197998, "learning_rate": 1.212204441137435e-06, "loss": 0.2595, "step": 1951 }, { "batch_size": 4, "epoch": 0.7804, "step": 1951, "tokens_per_device": 5596 }, { "epoch": 0.7804, "loss_ce": 0.19382621347904205, "loss_lvr": 0.6432532072067261, "loss_mode_switch": 0.0, "loss_total": 0.2581515312194824, "step": 1951 }, { "batch_size": 1, "epoch": 0.7804, "step": 1951, "tokens_per_device": 6789 }, { "epoch": 0.7804, "loss_ce": 0.06068708002567291, "loss_lvr": 0.31440889835357666, "loss_mode_switch": 0.0, "loss_total": 0.0921279713511467, "step": 1951 }, { "batch_size": 4, "epoch": 0.7804, "step": 1951, "tokens_per_device": 2688 }, { "epoch": 0.7804, "loss_ce": 0.21828791499137878, "loss_lvr": 0.6346499919891357, "loss_mode_switch": 0.0, "loss_total": 0.28175291419029236, "step": 1951 }, { "batch_size": 1, "epoch": 0.7804, "step": 1951, "tokens_per_device": 5064 }, { "epoch": 0.7804, "loss_ce": 0.008445395156741142, "loss_lvr": 0.27116942405700684, "loss_mode_switch": 0.0, "loss_total": 0.035562336444854736, "step": 1951 }, { "batch_size": 4, "epoch": 0.7804, "step": 1951, "tokens_per_device": 4252 }, { "epoch": 0.7804, "loss_ce": 0.2659637928009033, "loss_lvr": 0.7020042538642883, "loss_mode_switch": 0.0, "loss_total": 0.3361642360687256, "step": 1951 }, { "batch_size": 4, "epoch": 0.7804, "step": 1951, "tokens_per_device": 2684 }, { "epoch": 0.7804, "loss_ce": 0.16956952214241028, "loss_lvr": 0.8535973429679871, "loss_mode_switch": 0.0, "loss_total": 0.25492924451828003, "step": 1951 }, { "batch_size": 4, "epoch": 0.7804, "step": 1951, "tokens_per_device": 5848 }, { "epoch": 0.7804, "loss_ce": 0.35846298933029175, "loss_lvr": 0.8124971389770508, "loss_mode_switch": 0.0, "loss_total": 0.4397127032279968, "step": 1951 }, { "batch_size": 1, "epoch": 0.7804, "step": 1951, "tokens_per_device": 5208 }, { "epoch": 0.7804, "loss_ce": 0.06402406096458435, "loss_lvr": 0.404904842376709, "loss_mode_switch": 0.0, "loss_total": 0.10451454669237137, "step": 1951 }, { "epoch": 0.7808, "grad_norm": 1.2654809951782227, "learning_rate": 1.2079793185173045e-06, "loss": 0.2482, "step": 1952 }, { "batch_size": 4, "epoch": 0.7808, "step": 1952, "tokens_per_device": 1996 }, { "epoch": 0.7808, "loss_ce": 0.19907723367214203, "loss_lvr": 0.854742705821991, "loss_mode_switch": 0.0, "loss_total": 0.2845515012741089, "step": 1952 }, { "batch_size": 4, "epoch": 0.7808, "step": 1952, "tokens_per_device": 8472 }, { "epoch": 0.7808, "loss_ce": 0.31405431032180786, "loss_lvr": 0.37706229090690613, "loss_mode_switch": 0.0, "loss_total": 0.35176053643226624, "step": 1952 }, { "batch_size": 4, "epoch": 0.7808, "step": 1952, "tokens_per_device": 4328 }, { "epoch": 0.7808, "loss_ce": 0.25450628995895386, "loss_lvr": 0.6238983869552612, "loss_mode_switch": 0.0, "loss_total": 0.31689614057540894, "step": 1952 }, { "batch_size": 4, "epoch": 0.7808, "step": 1952, "tokens_per_device": 3964 }, { "epoch": 0.7808, "loss_ce": 0.13452759385108948, "loss_lvr": 1.0088114738464355, "loss_mode_switch": 0.0, "loss_total": 0.235408753156662, "step": 1952 }, { "batch_size": 4, "epoch": 0.7808, "step": 1952, "tokens_per_device": 8984 }, { "epoch": 0.7808, "loss_ce": 0.13103435933589935, "loss_lvr": 0.7922858595848083, "loss_mode_switch": 0.0, "loss_total": 0.2102629542350769, "step": 1952 }, { "batch_size": 4, "epoch": 0.7808, "step": 1952, "tokens_per_device": 6608 }, { "epoch": 0.7808, "loss_ce": 0.4235628843307495, "loss_lvr": 0.9028434753417969, "loss_mode_switch": 0.0, "loss_total": 0.5138472318649292, "step": 1952 }, { "batch_size": 1, "epoch": 0.7808, "step": 1952, "tokens_per_device": 4975 }, { "epoch": 0.7808, "loss_ce": 1.4824129343032837, "loss_lvr": 0.5132626891136169, "loss_mode_switch": 0.0, "loss_total": 1.5337392091751099, "step": 1952 }, { "batch_size": 1, "epoch": 0.7808, "step": 1952, "tokens_per_device": 6241 }, { "epoch": 0.7808, "loss_ce": 0.07496172934770584, "loss_lvr": 0.2814904451370239, "loss_mode_switch": 0.0, "loss_total": 0.10311077535152435, "step": 1952 }, { "epoch": 0.7812, "grad_norm": 1.597269892692566, "learning_rate": 1.2037605601422614e-06, "loss": 0.3194, "step": 1953 }, { "batch_size": 1, "epoch": 0.7812, "step": 1953, "tokens_per_device": 5180 }, { "epoch": 0.7812, "loss_ce": 0.0163491889834404, "loss_lvr": 0.28099656105041504, "loss_mode_switch": 0.0, "loss_total": 0.0444488450884819, "step": 1953 }, { "batch_size": 4, "epoch": 0.7812, "step": 1953, "tokens_per_device": 1352 }, { "epoch": 0.7812, "loss_ce": 0.40607988834381104, "loss_lvr": 0.8904655575752258, "loss_mode_switch": 0.0, "loss_total": 0.4951264560222626, "step": 1953 }, { "batch_size": 1, "epoch": 0.7812, "step": 1953, "tokens_per_device": 5124 }, { "epoch": 0.7812, "loss_ce": 0.14384499192237854, "loss_lvr": 0.23289792239665985, "loss_mode_switch": 0.0, "loss_total": 0.16713479161262512, "step": 1953 }, { "batch_size": 1, "epoch": 0.7812, "step": 1953, "tokens_per_device": 5025 }, { "epoch": 0.7812, "loss_ce": 0.014362995512783527, "loss_lvr": 0.5739143490791321, "loss_mode_switch": 0.0, "loss_total": 0.07175443321466446, "step": 1953 }, { "batch_size": 1, "epoch": 0.7812, "step": 1953, "tokens_per_device": 5095 }, { "epoch": 0.7812, "loss_ce": 0.004685691092163324, "loss_lvr": 0.5168660283088684, "loss_mode_switch": 0.0, "loss_total": 0.05637229606509209, "step": 1953 }, { "batch_size": 4, "epoch": 0.7812, "step": 1953, "tokens_per_device": 5500 }, { "epoch": 0.7812, "loss_ce": 0.40150901675224304, "loss_lvr": 0.7845783829689026, "loss_mode_switch": 0.0, "loss_total": 0.4799668490886688, "step": 1953 }, { "batch_size": 1, "epoch": 0.7812, "step": 1953, "tokens_per_device": 5145 }, { "epoch": 0.7812, "loss_ce": 0.08206245303153992, "loss_lvr": 0.5341762900352478, "loss_mode_switch": 0.0, "loss_total": 0.13548007607460022, "step": 1953 }, { "batch_size": 4, "epoch": 0.7812, "step": 1953, "tokens_per_device": 4220 }, { "epoch": 0.7812, "loss_ce": 0.17100568115711212, "loss_lvr": 1.0392060279846191, "loss_mode_switch": 0.0, "loss_total": 0.2749262750148773, "step": 1953 }, { "epoch": 0.7816, "grad_norm": 1.3036164045333862, "learning_rate": 1.1995481730927538e-06, "loss": 0.2465, "step": 1954 }, { "batch_size": 4, "epoch": 0.7816, "step": 1954, "tokens_per_device": 4948 }, { "epoch": 0.7816, "loss_ce": 0.7694793343544006, "loss_lvr": 0.9403719902038574, "loss_mode_switch": 0.0, "loss_total": 0.8635165095329285, "step": 1954 }, { "batch_size": 1, "epoch": 0.7816, "step": 1954, "tokens_per_device": 4689 }, { "epoch": 0.7816, "loss_ce": 0.057439062744379044, "loss_lvr": 0.6582807302474976, "loss_mode_switch": 0.0, "loss_total": 0.12326714396476746, "step": 1954 }, { "batch_size": 4, "epoch": 0.7816, "step": 1954, "tokens_per_device": 2544 }, { "epoch": 0.7816, "loss_ce": 0.28686949610710144, "loss_lvr": 0.8879286050796509, "loss_mode_switch": 0.0, "loss_total": 0.37566235661506653, "step": 1954 }, { "batch_size": 4, "epoch": 0.7816, "step": 1954, "tokens_per_device": 6044 }, { "epoch": 0.7816, "loss_ce": 0.7235345840454102, "loss_lvr": 0.7301560044288635, "loss_mode_switch": 0.0, "loss_total": 0.7965501546859741, "step": 1954 }, { "batch_size": 1, "epoch": 0.7816, "step": 1954, "tokens_per_device": 4883 }, { "epoch": 0.7816, "loss_ce": 0.0164295956492424, "loss_lvr": 0.8355754613876343, "loss_mode_switch": 0.0, "loss_total": 0.09998714178800583, "step": 1954 }, { "batch_size": 1, "epoch": 0.7816, "step": 1954, "tokens_per_device": 5109 }, { "epoch": 0.7816, "loss_ce": 0.004865649156272411, "loss_lvr": 0.3083249032497406, "loss_mode_switch": 0.0, "loss_total": 0.03569813817739487, "step": 1954 }, { "batch_size": 4, "epoch": 0.7816, "step": 1954, "tokens_per_device": 4836 }, { "epoch": 0.7816, "loss_ce": 0.08493297547101974, "loss_lvr": 0.760708212852478, "loss_mode_switch": 0.0, "loss_total": 0.16100379824638367, "step": 1954 }, { "batch_size": 4, "epoch": 0.7816, "step": 1954, "tokens_per_device": 4452 }, { "epoch": 0.7816, "loss_ce": 0.28028130531311035, "loss_lvr": 0.6635539531707764, "loss_mode_switch": 0.0, "loss_total": 0.34663671255111694, "step": 1954 }, { "epoch": 0.782, "grad_norm": 1.3990610837936401, "learning_rate": 1.1953421644385444e-06, "loss": 0.2608, "step": 1955 }, { "batch_size": 4, "epoch": 0.782, "step": 1955, "tokens_per_device": 3148 }, { "epoch": 0.782, "loss_ce": 0.523870050907135, "loss_lvr": 1.06443452835083, "loss_mode_switch": 0.0, "loss_total": 0.630313515663147, "step": 1955 }, { "batch_size": 4, "epoch": 0.782, "step": 1955, "tokens_per_device": 4220 }, { "epoch": 0.782, "loss_ce": 0.000983728445135057, "loss_lvr": 0.5118486285209656, "loss_mode_switch": 0.0, "loss_total": 0.0521685928106308, "step": 1955 }, { "batch_size": 1, "epoch": 0.782, "step": 1955, "tokens_per_device": 4922 }, { "epoch": 0.782, "loss_ce": 0.04278649017214775, "loss_lvr": 0.3102754056453705, "loss_mode_switch": 0.0, "loss_total": 0.0738140344619751, "step": 1955 }, { "batch_size": 4, "epoch": 0.782, "step": 1955, "tokens_per_device": 1320 }, { "epoch": 0.782, "loss_ce": 0.43817147612571716, "loss_lvr": 0.9891242980957031, "loss_mode_switch": 0.0, "loss_total": 0.5370839238166809, "step": 1955 }, { "batch_size": 4, "epoch": 0.782, "step": 1955, "tokens_per_device": 5760 }, { "epoch": 0.782, "loss_ce": 0.16821065545082092, "loss_lvr": 0.670143723487854, "loss_mode_switch": 0.0, "loss_total": 0.23522502183914185, "step": 1955 }, { "batch_size": 4, "epoch": 0.782, "step": 1955, "tokens_per_device": 2632 }, { "epoch": 0.782, "loss_ce": 0.5418329238891602, "loss_lvr": 0.7897464632987976, "loss_mode_switch": 0.0, "loss_total": 0.6208075881004333, "step": 1955 }, { "batch_size": 4, "epoch": 0.782, "step": 1955, "tokens_per_device": 1272 }, { "epoch": 0.782, "loss_ce": 0.04057127237319946, "loss_lvr": 0.9109359383583069, "loss_mode_switch": 0.0, "loss_total": 0.13166487216949463, "step": 1955 }, { "batch_size": 4, "epoch": 0.782, "step": 1955, "tokens_per_device": 1348 }, { "epoch": 0.782, "loss_ce": 0.4064130187034607, "loss_lvr": 0.8965439796447754, "loss_mode_switch": 0.0, "loss_total": 0.4960674047470093, "step": 1955 }, { "epoch": 0.7824, "grad_norm": 1.2533786296844482, "learning_rate": 1.1911425412386811e-06, "loss": 0.2788, "step": 1956 }, { "batch_size": 4, "epoch": 0.7824, "step": 1956, "tokens_per_device": 1332 }, { "epoch": 0.7824, "loss_ce": 0.5683609843254089, "loss_lvr": 0.9449411034584045, "loss_mode_switch": 0.0, "loss_total": 0.6628550887107849, "step": 1956 }, { "batch_size": 1, "epoch": 0.7824, "step": 1956, "tokens_per_device": 5260 }, { "epoch": 0.7824, "loss_ce": 0.0017295628786087036, "loss_lvr": 0.5907506942749023, "loss_mode_switch": 0.0, "loss_total": 0.06080463156104088, "step": 1956 }, { "batch_size": 1, "epoch": 0.7824, "step": 1956, "tokens_per_device": 5556 }, { "epoch": 0.7824, "loss_ce": 0.07681900262832642, "loss_lvr": 0.25556913018226624, "loss_mode_switch": 0.0, "loss_total": 0.10237591713666916, "step": 1956 }, { "batch_size": 4, "epoch": 0.7824, "step": 1956, "tokens_per_device": 10904 }, { "epoch": 0.7824, "loss_ce": 0.0288012083619833, "loss_lvr": 0.38946297764778137, "loss_mode_switch": 0.0, "loss_total": 0.06774750351905823, "step": 1956 }, { "batch_size": 4, "epoch": 0.7824, "step": 1956, "tokens_per_device": 4360 }, { "epoch": 0.7824, "loss_ce": 0.718999445438385, "loss_lvr": 0.8850233554840088, "loss_mode_switch": 0.0, "loss_total": 0.8075017929077148, "step": 1956 }, { "batch_size": 1, "epoch": 0.7824, "step": 1956, "tokens_per_device": 4760 }, { "epoch": 0.7824, "loss_ce": 0.04996489733457565, "loss_lvr": 0.21920713782310486, "loss_mode_switch": 0.0, "loss_total": 0.0718856155872345, "step": 1956 }, { "batch_size": 1, "epoch": 0.7824, "step": 1956, "tokens_per_device": 5362 }, { "epoch": 0.7824, "loss_ce": 0.2174834907054901, "loss_lvr": 0.4048755466938019, "loss_mode_switch": 0.0, "loss_total": 0.25797104835510254, "step": 1956 }, { "batch_size": 4, "epoch": 0.7824, "step": 1956, "tokens_per_device": 3792 }, { "epoch": 0.7824, "loss_ce": 0.4516150653362274, "loss_lvr": 1.0086060762405396, "loss_mode_switch": 0.0, "loss_total": 0.5524756908416748, "step": 1956 }, { "epoch": 0.7828, "grad_norm": 1.3665337562561035, "learning_rate": 1.1869493105414999e-06, "loss": 0.3113, "step": 1957 }, { "batch_size": 4, "epoch": 0.7828, "step": 1957, "tokens_per_device": 4284 }, { "epoch": 0.7828, "loss_ce": 0.07249436527490616, "loss_lvr": 0.766160786151886, "loss_mode_switch": 0.0, "loss_total": 0.14911043643951416, "step": 1957 }, { "batch_size": 4, "epoch": 0.7828, "step": 1957, "tokens_per_device": 3804 }, { "epoch": 0.7828, "loss_ce": 0.24342820048332214, "loss_lvr": 0.8196591734886169, "loss_mode_switch": 0.0, "loss_total": 0.3253941237926483, "step": 1957 }, { "batch_size": 4, "epoch": 0.7828, "step": 1957, "tokens_per_device": 2624 }, { "epoch": 0.7828, "loss_ce": 0.4207417368888855, "loss_lvr": 0.7615180611610413, "loss_mode_switch": 0.0, "loss_total": 0.4968935549259186, "step": 1957 }, { "batch_size": 1, "epoch": 0.7828, "step": 1957, "tokens_per_device": 4919 }, { "epoch": 0.7828, "loss_ce": 0.0028139245696365833, "loss_lvr": 0.5438333749771118, "loss_mode_switch": 0.0, "loss_total": 0.05719726160168648, "step": 1957 }, { "batch_size": 4, "epoch": 0.7828, "step": 1957, "tokens_per_device": 2340 }, { "epoch": 0.7828, "loss_ce": 0.1420821100473404, "loss_lvr": 0.8724872469902039, "loss_mode_switch": 0.0, "loss_total": 0.22933083772659302, "step": 1957 }, { "batch_size": 1, "epoch": 0.7828, "step": 1957, "tokens_per_device": 5093 }, { "epoch": 0.7828, "loss_ce": 0.0020009013824164867, "loss_lvr": 0.4094387888908386, "loss_mode_switch": 0.0, "loss_total": 0.042944781482219696, "step": 1957 }, { "batch_size": 4, "epoch": 0.7828, "step": 1957, "tokens_per_device": 7136 }, { "epoch": 0.7828, "loss_ce": 0.1277012825012207, "loss_lvr": 0.7689290642738342, "loss_mode_switch": 0.0, "loss_total": 0.2045941948890686, "step": 1957 }, { "batch_size": 1, "epoch": 0.7828, "step": 1957, "tokens_per_device": 5887 }, { "epoch": 0.7828, "loss_ce": 0.01846468821167946, "loss_lvr": 0.5378804206848145, "loss_mode_switch": 0.0, "loss_total": 0.07225273549556732, "step": 1957 }, { "epoch": 0.7832, "grad_norm": 1.1210027933120728, "learning_rate": 1.1827624793846037e-06, "loss": 0.248, "step": 1958 }, { "batch_size": 4, "epoch": 0.7832, "step": 1958, "tokens_per_device": 3616 }, { "epoch": 0.7832, "loss_ce": 0.13170015811920166, "loss_lvr": 0.8501915335655212, "loss_mode_switch": 0.0, "loss_total": 0.21671931445598602, "step": 1958 }, { "batch_size": 1, "epoch": 0.7832, "step": 1958, "tokens_per_device": 5115 }, { "epoch": 0.7832, "loss_ce": 0.004569514654576778, "loss_lvr": 0.4044424295425415, "loss_mode_switch": 0.0, "loss_total": 0.04501375928521156, "step": 1958 }, { "batch_size": 4, "epoch": 0.7832, "step": 1958, "tokens_per_device": 6840 }, { "epoch": 0.7832, "loss_ce": 0.1859525740146637, "loss_lvr": 0.7253840565681458, "loss_mode_switch": 0.0, "loss_total": 0.25849097967147827, "step": 1958 }, { "batch_size": 4, "epoch": 0.7832, "step": 1958, "tokens_per_device": 2672 }, { "epoch": 0.7832, "loss_ce": 0.13114215433597565, "loss_lvr": 0.5053549408912659, "loss_mode_switch": 0.0, "loss_total": 0.1816776543855667, "step": 1958 }, { "batch_size": 1, "epoch": 0.7832, "step": 1958, "tokens_per_device": 5041 }, { "epoch": 0.7832, "loss_ce": 0.029660603031516075, "loss_lvr": 0.3555788993835449, "loss_mode_switch": 0.0, "loss_total": 0.0652184933423996, "step": 1958 }, { "batch_size": 4, "epoch": 0.7832, "step": 1958, "tokens_per_device": 4112 }, { "epoch": 0.7832, "loss_ce": 0.04934515058994293, "loss_lvr": 0.49767428636550903, "loss_mode_switch": 0.0, "loss_total": 0.09911258518695831, "step": 1958 }, { "batch_size": 4, "epoch": 0.7832, "step": 1958, "tokens_per_device": 1520 }, { "epoch": 0.7832, "loss_ce": 0.6333003640174866, "loss_lvr": 0.78434818983078, "loss_mode_switch": 0.0, "loss_total": 0.711735188961029, "step": 1958 }, { "batch_size": 4, "epoch": 0.7832, "step": 1958, "tokens_per_device": 2712 }, { "epoch": 0.7832, "loss_ce": 0.24334852397441864, "loss_lvr": 0.9605984687805176, "loss_mode_switch": 0.0, "loss_total": 0.33940836787223816, "step": 1958 }, { "epoch": 0.7836, "grad_norm": 1.377490520477295, "learning_rate": 1.1785820547948612e-06, "loss": 0.265, "step": 1959 }, { "batch_size": 4, "epoch": 0.7836, "step": 1959, "tokens_per_device": 4252 }, { "epoch": 0.7836, "loss_ce": 0.3114084005355835, "loss_lvr": 0.9070819616317749, "loss_mode_switch": 0.0, "loss_total": 0.402116596698761, "step": 1959 }, { "batch_size": 4, "epoch": 0.7836, "step": 1959, "tokens_per_device": 4224 }, { "epoch": 0.7836, "loss_ce": 0.16894152760505676, "loss_lvr": 0.8558468818664551, "loss_mode_switch": 0.0, "loss_total": 0.2545262277126312, "step": 1959 }, { "batch_size": 4, "epoch": 0.7836, "step": 1959, "tokens_per_device": 4336 }, { "epoch": 0.7836, "loss_ce": 0.22807283699512482, "loss_lvr": 0.4010581076145172, "loss_mode_switch": 0.0, "loss_total": 0.26817864179611206, "step": 1959 }, { "batch_size": 4, "epoch": 0.7836, "step": 1959, "tokens_per_device": 2584 }, { "epoch": 0.7836, "loss_ce": 0.059197910130023956, "loss_lvr": 0.9593140482902527, "loss_mode_switch": 0.0, "loss_total": 0.1551293134689331, "step": 1959 }, { "batch_size": 4, "epoch": 0.7836, "step": 1959, "tokens_per_device": 6464 }, { "epoch": 0.7836, "loss_ce": 0.0038422485813498497, "loss_lvr": 0.7500522136688232, "loss_mode_switch": 0.0, "loss_total": 0.0788474753499031, "step": 1959 }, { "batch_size": 4, "epoch": 0.7836, "step": 1959, "tokens_per_device": 4280 }, { "epoch": 0.7836, "loss_ce": 0.1350785344839096, "loss_lvr": 0.8254933953285217, "loss_mode_switch": 0.0, "loss_total": 0.2176278829574585, "step": 1959 }, { "batch_size": 4, "epoch": 0.7836, "step": 1959, "tokens_per_device": 4280 }, { "epoch": 0.7836, "loss_ce": 0.1919267624616623, "loss_lvr": 0.5367841124534607, "loss_mode_switch": 0.0, "loss_total": 0.24560517072677612, "step": 1959 }, { "batch_size": 1, "epoch": 0.7836, "step": 1959, "tokens_per_device": 4917 }, { "epoch": 0.7836, "loss_ce": 0.39837846159935, "loss_lvr": 0.963800311088562, "loss_mode_switch": 0.0, "loss_total": 0.4947584867477417, "step": 1959 }, { "epoch": 0.784, "grad_norm": 1.227640151977539, "learning_rate": 1.1744080437883859e-06, "loss": 0.2872, "step": 1960 }, { "batch_size": 4, "epoch": 0.784, "step": 1960, "tokens_per_device": 4244 }, { "epoch": 0.784, "loss_ce": 0.28255775570869446, "loss_lvr": 1.0008047819137573, "loss_mode_switch": 0.0, "loss_total": 0.38263824582099915, "step": 1960 }, { "batch_size": 4, "epoch": 0.784, "step": 1960, "tokens_per_device": 1396 }, { "epoch": 0.784, "loss_ce": 0.5685537457466125, "loss_lvr": 0.9532874822616577, "loss_mode_switch": 0.0, "loss_total": 0.6638824939727783, "step": 1960 }, { "batch_size": 1, "epoch": 0.784, "step": 1960, "tokens_per_device": 4851 }, { "epoch": 0.784, "loss_ce": 0.08306290209293365, "loss_lvr": 0.5608705878257751, "loss_mode_switch": 0.0, "loss_total": 0.1391499638557434, "step": 1960 }, { "batch_size": 1, "epoch": 0.784, "step": 1960, "tokens_per_device": 4598 }, { "epoch": 0.784, "loss_ce": 0.022794349119067192, "loss_lvr": 0.7080634832382202, "loss_mode_switch": 0.0, "loss_total": 0.09360069781541824, "step": 1960 }, { "batch_size": 4, "epoch": 0.784, "step": 1960, "tokens_per_device": 2552 }, { "epoch": 0.784, "loss_ce": 0.3977653980255127, "loss_lvr": 0.9448677897453308, "loss_mode_switch": 0.0, "loss_total": 0.4922521710395813, "step": 1960 }, { "batch_size": 1, "epoch": 0.784, "step": 1960, "tokens_per_device": 4910 }, { "epoch": 0.784, "loss_ce": 0.004224089905619621, "loss_lvr": 0.5723749399185181, "loss_mode_switch": 0.0, "loss_total": 0.06146158277988434, "step": 1960 }, { "batch_size": 1, "epoch": 0.784, "step": 1960, "tokens_per_device": 4924 }, { "epoch": 0.784, "loss_ce": 0.05709034949541092, "loss_lvr": 0.904292643070221, "loss_mode_switch": 0.0, "loss_total": 0.14751961827278137, "step": 1960 }, { "batch_size": 4, "epoch": 0.784, "step": 1960, "tokens_per_device": 3984 }, { "epoch": 0.784, "loss_ce": 0.1924005001783371, "loss_lvr": 0.8684847950935364, "loss_mode_switch": 0.0, "loss_total": 0.279248982667923, "step": 1960 }, { "epoch": 0.7844, "grad_norm": 1.3228809833526611, "learning_rate": 1.1702404533705264e-06, "loss": 0.3018, "step": 1961 }, { "batch_size": 4, "epoch": 0.7844, "step": 1961, "tokens_per_device": 4448 }, { "epoch": 0.7844, "loss_ce": 0.3876497149467468, "loss_lvr": 0.5812675356864929, "loss_mode_switch": 0.0, "loss_total": 0.44577646255493164, "step": 1961 }, { "batch_size": 4, "epoch": 0.7844, "step": 1961, "tokens_per_device": 3880 }, { "epoch": 0.7844, "loss_ce": 0.06643583625555038, "loss_lvr": 0.8955598473548889, "loss_mode_switch": 0.0, "loss_total": 0.1559918224811554, "step": 1961 }, { "batch_size": 1, "epoch": 0.7844, "step": 1961, "tokens_per_device": 5160 }, { "epoch": 0.7844, "loss_ce": 0.16097065806388855, "loss_lvr": 0.26507604122161865, "loss_mode_switch": 0.0, "loss_total": 0.18747825920581818, "step": 1961 }, { "batch_size": 4, "epoch": 0.7844, "step": 1961, "tokens_per_device": 5796 }, { "epoch": 0.7844, "loss_ce": 0.5189279913902283, "loss_lvr": 0.9842965006828308, "loss_mode_switch": 0.0, "loss_total": 0.617357611656189, "step": 1961 }, { "batch_size": 4, "epoch": 0.7844, "step": 1961, "tokens_per_device": 4768 }, { "epoch": 0.7844, "loss_ce": 0.003693073522299528, "loss_lvr": 0.6290561556816101, "loss_mode_switch": 0.0, "loss_total": 0.06659869104623795, "step": 1961 }, { "batch_size": 4, "epoch": 0.7844, "step": 1961, "tokens_per_device": 3336 }, { "epoch": 0.7844, "loss_ce": 0.18168115615844727, "loss_lvr": 0.8800344467163086, "loss_mode_switch": 0.0, "loss_total": 0.2696846127510071, "step": 1961 }, { "batch_size": 4, "epoch": 0.7844, "step": 1961, "tokens_per_device": 14092 }, { "epoch": 0.7844, "loss_ce": 0.004441181663423777, "loss_lvr": 0.695270836353302, "loss_mode_switch": 0.0, "loss_total": 0.07396826148033142, "step": 1961 }, { "batch_size": 4, "epoch": 0.7844, "step": 1961, "tokens_per_device": 5044 }, { "epoch": 0.7844, "loss_ce": 0.020342733711004257, "loss_lvr": 0.7773424983024597, "loss_mode_switch": 0.0, "loss_total": 0.09807698428630829, "step": 1961 }, { "epoch": 0.7848, "grad_norm": 1.271863341331482, "learning_rate": 1.166079290535856e-06, "loss": 0.2981, "step": 1962 }, { "batch_size": 4, "epoch": 0.7848, "step": 1962, "tokens_per_device": 3892 }, { "epoch": 0.7848, "loss_ce": 0.19377188384532928, "loss_lvr": 0.8132395148277283, "loss_mode_switch": 0.0, "loss_total": 0.2750958204269409, "step": 1962 }, { "batch_size": 1, "epoch": 0.7848, "step": 1962, "tokens_per_device": 5398 }, { "epoch": 0.7848, "loss_ce": 0.1020737737417221, "loss_lvr": 0.4178057014942169, "loss_mode_switch": 0.0, "loss_total": 0.14385434985160828, "step": 1962 }, { "batch_size": 4, "epoch": 0.7848, "step": 1962, "tokens_per_device": 4412 }, { "epoch": 0.7848, "loss_ce": 0.07468298822641373, "loss_lvr": 0.9902233481407166, "loss_mode_switch": 0.0, "loss_total": 0.1737053245306015, "step": 1962 }, { "batch_size": 1, "epoch": 0.7848, "step": 1962, "tokens_per_device": 5071 }, { "epoch": 0.7848, "loss_ce": 0.0053571113385260105, "loss_lvr": 0.3215549886226654, "loss_mode_switch": 0.0, "loss_total": 0.03751261159777641, "step": 1962 }, { "batch_size": 1, "epoch": 0.7848, "step": 1962, "tokens_per_device": 4898 }, { "epoch": 0.7848, "loss_ce": 0.01949186436831951, "loss_lvr": 0.2990105450153351, "loss_mode_switch": 0.0, "loss_total": 0.04939291998744011, "step": 1962 }, { "batch_size": 4, "epoch": 0.7848, "step": 1962, "tokens_per_device": 3752 }, { "epoch": 0.7848, "loss_ce": 0.9886078834533691, "loss_lvr": 1.6173336505889893, "loss_mode_switch": 0.0, "loss_total": 1.150341272354126, "step": 1962 }, { "batch_size": 4, "epoch": 0.7848, "step": 1962, "tokens_per_device": 1620 }, { "epoch": 0.7848, "loss_ce": 0.31807073950767517, "loss_lvr": 0.8781417608261108, "loss_mode_switch": 0.0, "loss_total": 0.40588492155075073, "step": 1962 }, { "batch_size": 1, "epoch": 0.7848, "step": 1962, "tokens_per_device": 5030 }, { "epoch": 0.7848, "loss_ce": 0.0005624471814371645, "loss_lvr": 0.43366891145706177, "loss_mode_switch": 0.0, "loss_total": 0.043929338455200195, "step": 1962 }, { "epoch": 0.7852, "grad_norm": 1.227636456489563, "learning_rate": 1.1619245622681575e-06, "loss": 0.2836, "step": 1963 }, { "batch_size": 1, "epoch": 0.7852, "step": 1963, "tokens_per_device": 5093 }, { "epoch": 0.7852, "loss_ce": 0.00017688138177618384, "loss_lvr": 0.4402378797531128, "loss_mode_switch": 0.0, "loss_total": 0.04420066997408867, "step": 1963 }, { "batch_size": 4, "epoch": 0.7852, "step": 1963, "tokens_per_device": 2680 }, { "epoch": 0.7852, "loss_ce": 0.16968035697937012, "loss_lvr": 0.809209942817688, "loss_mode_switch": 0.0, "loss_total": 0.2506013512611389, "step": 1963 }, { "batch_size": 4, "epoch": 0.7852, "step": 1963, "tokens_per_device": 8476 }, { "epoch": 0.7852, "loss_ce": 0.2177862823009491, "loss_lvr": 0.6300433874130249, "loss_mode_switch": 0.0, "loss_total": 0.28079062700271606, "step": 1963 }, { "batch_size": 4, "epoch": 0.7852, "step": 1963, "tokens_per_device": 2868 }, { "epoch": 0.7852, "loss_ce": 0.5263484716415405, "loss_lvr": 0.5925385355949402, "loss_mode_switch": 0.0, "loss_total": 0.585602343082428, "step": 1963 }, { "batch_size": 1, "epoch": 0.7852, "step": 1963, "tokens_per_device": 5160 }, { "epoch": 0.7852, "loss_ce": 0.008135448209941387, "loss_lvr": 0.4215267300605774, "loss_mode_switch": 0.0, "loss_total": 0.0502881221473217, "step": 1963 }, { "batch_size": 4, "epoch": 0.7852, "step": 1963, "tokens_per_device": 5228 }, { "epoch": 0.7852, "loss_ce": 0.10863833129405975, "loss_lvr": 0.8043064475059509, "loss_mode_switch": 0.0, "loss_total": 0.1890689730644226, "step": 1963 }, { "batch_size": 4, "epoch": 0.7852, "step": 1963, "tokens_per_device": 6008 }, { "epoch": 0.7852, "loss_ce": 0.35189828276634216, "loss_lvr": 0.6681526303291321, "loss_mode_switch": 0.0, "loss_total": 0.4187135398387909, "step": 1963 }, { "batch_size": 1, "epoch": 0.7852, "step": 1963, "tokens_per_device": 5155 }, { "epoch": 0.7852, "loss_ce": 0.20142865180969238, "loss_lvr": 0.3640517294406891, "loss_mode_switch": 0.0, "loss_total": 0.23783382773399353, "step": 1963 }, { "epoch": 0.7856, "grad_norm": 1.3599807024002075, "learning_rate": 1.1577762755404227e-06, "loss": 0.2768, "step": 1964 }, { "batch_size": 4, "epoch": 0.7856, "step": 1964, "tokens_per_device": 4256 }, { "epoch": 0.7856, "loss_ce": 0.5980280637741089, "loss_lvr": 1.0548124313354492, "loss_mode_switch": 0.0, "loss_total": 0.7035093307495117, "step": 1964 }, { "batch_size": 1, "epoch": 0.7856, "step": 1964, "tokens_per_device": 4920 }, { "epoch": 0.7856, "loss_ce": 0.01926233060657978, "loss_lvr": 0.7803201675415039, "loss_mode_switch": 0.0, "loss_total": 0.09729434549808502, "step": 1964 }, { "batch_size": 4, "epoch": 0.7856, "step": 1964, "tokens_per_device": 2064 }, { "epoch": 0.7856, "loss_ce": 0.5313446521759033, "loss_lvr": 0.9548050761222839, "loss_mode_switch": 0.0, "loss_total": 0.6268251538276672, "step": 1964 }, { "batch_size": 4, "epoch": 0.7856, "step": 1964, "tokens_per_device": 3820 }, { "epoch": 0.7856, "loss_ce": 0.4010976552963257, "loss_lvr": 1.1146132946014404, "loss_mode_switch": 0.0, "loss_total": 0.5125589966773987, "step": 1964 }, { "batch_size": 4, "epoch": 0.7856, "step": 1964, "tokens_per_device": 3904 }, { "epoch": 0.7856, "loss_ce": 0.11787022650241852, "loss_lvr": 0.7357592582702637, "loss_mode_switch": 0.0, "loss_total": 0.19144615530967712, "step": 1964 }, { "batch_size": 1, "epoch": 0.7856, "step": 1964, "tokens_per_device": 5431 }, { "epoch": 0.7856, "loss_ce": 0.07200776785612106, "loss_lvr": 0.3599652945995331, "loss_mode_switch": 0.0, "loss_total": 0.10800430178642273, "step": 1964 }, { "batch_size": 4, "epoch": 0.7856, "step": 1964, "tokens_per_device": 4996 }, { "epoch": 0.7856, "loss_ce": 0.3850765526294708, "loss_lvr": 0.8161371350288391, "loss_mode_switch": 0.0, "loss_total": 0.4666902720928192, "step": 1964 }, { "batch_size": 4, "epoch": 0.7856, "step": 1964, "tokens_per_device": 2624 }, { "epoch": 0.7856, "loss_ce": 0.8160113096237183, "loss_lvr": 0.8974930644035339, "loss_mode_switch": 0.0, "loss_total": 0.905760645866394, "step": 1964 }, { "epoch": 0.786, "grad_norm": 1.491053819656372, "learning_rate": 1.1536344373148245e-06, "loss": 0.3025, "step": 1965 }, { "batch_size": 4, "epoch": 0.786, "step": 1965, "tokens_per_device": 2528 }, { "epoch": 0.786, "loss_ce": 0.1250695437192917, "loss_lvr": 0.9394989013671875, "loss_mode_switch": 0.0, "loss_total": 0.21901944279670715, "step": 1965 }, { "batch_size": 4, "epoch": 0.786, "step": 1965, "tokens_per_device": 5208 }, { "epoch": 0.786, "loss_ce": 0.26022088527679443, "loss_lvr": 0.9078615307807922, "loss_mode_switch": 0.0, "loss_total": 0.35100704431533813, "step": 1965 }, { "batch_size": 4, "epoch": 0.786, "step": 1965, "tokens_per_device": 5716 }, { "epoch": 0.786, "loss_ce": 0.2663807272911072, "loss_lvr": 0.8391082882881165, "loss_mode_switch": 0.0, "loss_total": 0.35029155015945435, "step": 1965 }, { "batch_size": 4, "epoch": 0.786, "step": 1965, "tokens_per_device": 2720 }, { "epoch": 0.786, "loss_ce": 0.2248898297548294, "loss_lvr": 0.6288319826126099, "loss_mode_switch": 0.0, "loss_total": 0.2877730131149292, "step": 1965 }, { "batch_size": 1, "epoch": 0.786, "step": 1965, "tokens_per_device": 5118 }, { "epoch": 0.786, "loss_ce": 0.010242738761007786, "loss_lvr": 0.28480151295661926, "loss_mode_switch": 0.0, "loss_total": 0.038722891360521317, "step": 1965 }, { "batch_size": 4, "epoch": 0.786, "step": 1965, "tokens_per_device": 4964 }, { "epoch": 0.786, "loss_ce": 0.010904149152338505, "loss_lvr": 0.8566792607307434, "loss_mode_switch": 0.0, "loss_total": 0.09657207876443863, "step": 1965 }, { "batch_size": 1, "epoch": 0.786, "step": 1965, "tokens_per_device": 5249 }, { "epoch": 0.786, "loss_ce": 0.0004293526289984584, "loss_lvr": 0.2739180028438568, "loss_mode_switch": 0.0, "loss_total": 0.02782115340232849, "step": 1965 }, { "batch_size": 4, "epoch": 0.786, "step": 1965, "tokens_per_device": 6068 }, { "epoch": 0.786, "loss_ce": 0.016446851193904877, "loss_lvr": 0.7867407202720642, "loss_mode_switch": 0.0, "loss_total": 0.09512092173099518, "step": 1965 }, { "epoch": 0.7864, "grad_norm": 1.1986757516860962, "learning_rate": 1.1494990545427153e-06, "loss": 0.2845, "step": 1966 }, { "batch_size": 1, "epoch": 0.7864, "step": 1966, "tokens_per_device": 4954 }, { "epoch": 0.7864, "loss_ce": 0.049134984612464905, "loss_lvr": 0.74020916223526, "loss_mode_switch": 0.0, "loss_total": 0.12315589934587479, "step": 1966 }, { "batch_size": 1, "epoch": 0.7864, "step": 1966, "tokens_per_device": 5105 }, { "epoch": 0.7864, "loss_ce": 0.0006900042062625289, "loss_lvr": 0.33033522963523865, "loss_mode_switch": 0.0, "loss_total": 0.03372352942824364, "step": 1966 }, { "batch_size": 4, "epoch": 0.7864, "step": 1966, "tokens_per_device": 3796 }, { "epoch": 0.7864, "loss_ce": 0.13024570047855377, "loss_lvr": 0.9438456892967224, "loss_mode_switch": 0.0, "loss_total": 0.22463026642799377, "step": 1966 }, { "batch_size": 4, "epoch": 0.7864, "step": 1966, "tokens_per_device": 1292 }, { "epoch": 0.7864, "loss_ce": 0.2708456218242645, "loss_lvr": 0.971057116985321, "loss_mode_switch": 0.0, "loss_total": 0.36795133352279663, "step": 1966 }, { "batch_size": 4, "epoch": 0.7864, "step": 1966, "tokens_per_device": 4384 }, { "epoch": 0.7864, "loss_ce": 0.06215676665306091, "loss_lvr": 0.6760423183441162, "loss_mode_switch": 0.0, "loss_total": 0.1297610104084015, "step": 1966 }, { "batch_size": 4, "epoch": 0.7864, "step": 1966, "tokens_per_device": 10624 }, { "epoch": 0.7864, "loss_ce": 0.7567547559738159, "loss_lvr": 0.8329120874404907, "loss_mode_switch": 0.0, "loss_total": 0.8400459885597229, "step": 1966 }, { "batch_size": 4, "epoch": 0.7864, "step": 1966, "tokens_per_device": 4096 }, { "epoch": 0.7864, "loss_ce": 0.28969472646713257, "loss_lvr": 0.9665583372116089, "loss_mode_switch": 0.0, "loss_total": 0.3863505721092224, "step": 1966 }, { "batch_size": 4, "epoch": 0.7864, "step": 1966, "tokens_per_device": 4268 }, { "epoch": 0.7864, "loss_ce": 0.41920122504234314, "loss_lvr": 0.8776735067367554, "loss_mode_switch": 0.0, "loss_total": 0.5069685578346252, "step": 1966 }, { "epoch": 0.7868, "grad_norm": 1.3681817054748535, "learning_rate": 1.1453701341646134e-06, "loss": 0.3153, "step": 1967 }, { "batch_size": 4, "epoch": 0.7868, "step": 1967, "tokens_per_device": 4452 }, { "epoch": 0.7868, "loss_ce": 0.03348063677549362, "loss_lvr": 0.6402387619018555, "loss_mode_switch": 0.0, "loss_total": 0.09750451147556305, "step": 1967 }, { "batch_size": 4, "epoch": 0.7868, "step": 1967, "tokens_per_device": 7272 }, { "epoch": 0.7868, "loss_ce": 0.0004975866177119315, "loss_lvr": 0.7485644221305847, "loss_mode_switch": 0.0, "loss_total": 0.07535403221845627, "step": 1967 }, { "batch_size": 4, "epoch": 0.7868, "step": 1967, "tokens_per_device": 4392 }, { "epoch": 0.7868, "loss_ce": 0.4098997712135315, "loss_lvr": 0.6989920735359192, "loss_mode_switch": 0.0, "loss_total": 0.47979897260665894, "step": 1967 }, { "batch_size": 4, "epoch": 0.7868, "step": 1967, "tokens_per_device": 3768 }, { "epoch": 0.7868, "loss_ce": 0.4560454785823822, "loss_lvr": 1.0506256818771362, "loss_mode_switch": 0.0, "loss_total": 0.5611080527305603, "step": 1967 }, { "batch_size": 4, "epoch": 0.7868, "step": 1967, "tokens_per_device": 8448 }, { "epoch": 0.7868, "loss_ce": 0.03977939859032631, "loss_lvr": 0.6646834015846252, "loss_mode_switch": 0.0, "loss_total": 0.10624773800373077, "step": 1967 }, { "batch_size": 1, "epoch": 0.7868, "step": 1967, "tokens_per_device": 5189 }, { "epoch": 0.7868, "loss_ce": 0.0323929600417614, "loss_lvr": 0.4764080345630646, "loss_mode_switch": 0.0, "loss_total": 0.08003376424312592, "step": 1967 }, { "batch_size": 4, "epoch": 0.7868, "step": 1967, "tokens_per_device": 4180 }, { "epoch": 0.7868, "loss_ce": 0.2567199468612671, "loss_lvr": 0.7845158576965332, "loss_mode_switch": 0.0, "loss_total": 0.33517152070999146, "step": 1967 }, { "batch_size": 1, "epoch": 0.7868, "step": 1967, "tokens_per_device": 4876 }, { "epoch": 0.7868, "loss_ce": 0.14402256906032562, "loss_lvr": 0.5591914653778076, "loss_mode_switch": 0.0, "loss_total": 0.1999417245388031, "step": 1967 }, { "epoch": 0.7872, "grad_norm": 1.366874098777771, "learning_rate": 1.1412476831101916e-06, "loss": 0.2622, "step": 1968 }, { "batch_size": 1, "epoch": 0.7872, "step": 1968, "tokens_per_device": 4919 }, { "epoch": 0.7872, "loss_ce": 0.0014627939090132713, "loss_lvr": 0.21343587338924408, "loss_mode_switch": 0.0, "loss_total": 0.022806379944086075, "step": 1968 }, { "batch_size": 1, "epoch": 0.7872, "step": 1968, "tokens_per_device": 5187 }, { "epoch": 0.7872, "loss_ce": 0.0005179364234209061, "loss_lvr": 0.37226009368896484, "loss_mode_switch": 0.0, "loss_total": 0.0377439484000206, "step": 1968 }, { "batch_size": 4, "epoch": 0.7872, "step": 1968, "tokens_per_device": 4196 }, { "epoch": 0.7872, "loss_ce": 0.2966969907283783, "loss_lvr": 0.9027137160301208, "loss_mode_switch": 0.0, "loss_total": 0.38696837425231934, "step": 1968 }, { "batch_size": 1, "epoch": 0.7872, "step": 1968, "tokens_per_device": 5179 }, { "epoch": 0.7872, "loss_ce": 0.00503488490357995, "loss_lvr": 0.617260217666626, "loss_mode_switch": 0.0, "loss_total": 0.06676090508699417, "step": 1968 }, { "batch_size": 4, "epoch": 0.7872, "step": 1968, "tokens_per_device": 3808 }, { "epoch": 0.7872, "loss_ce": 0.34175917506217957, "loss_lvr": 0.9278053045272827, "loss_mode_switch": 0.0, "loss_total": 0.43453970551490784, "step": 1968 }, { "batch_size": 4, "epoch": 0.7872, "step": 1968, "tokens_per_device": 3792 }, { "epoch": 0.7872, "loss_ce": 0.1706685721874237, "loss_lvr": 0.9822984337806702, "loss_mode_switch": 0.0, "loss_total": 0.2688984274864197, "step": 1968 }, { "batch_size": 4, "epoch": 0.7872, "step": 1968, "tokens_per_device": 4216 }, { "epoch": 0.7872, "loss_ce": 0.29490169882774353, "loss_lvr": 0.7678855061531067, "loss_mode_switch": 0.0, "loss_total": 0.3716902434825897, "step": 1968 }, { "batch_size": 4, "epoch": 0.7872, "step": 1968, "tokens_per_device": 4220 }, { "epoch": 0.7872, "loss_ce": 0.435884565114975, "loss_lvr": 1.0739026069641113, "loss_mode_switch": 0.0, "loss_total": 0.5432748198509216, "step": 1968 }, { "epoch": 0.7876, "grad_norm": 1.4999901056289673, "learning_rate": 1.1371317082982658e-06, "loss": 0.3063, "step": 1969 }, { "batch_size": 4, "epoch": 0.7876, "step": 1969, "tokens_per_device": 6280 }, { "epoch": 0.7876, "loss_ce": 0.04600904509425163, "loss_lvr": 0.7511186599731445, "loss_mode_switch": 0.0, "loss_total": 0.12112091481685638, "step": 1969 }, { "batch_size": 4, "epoch": 0.7876, "step": 1969, "tokens_per_device": 5568 }, { "epoch": 0.7876, "loss_ce": 0.0914793312549591, "loss_lvr": 0.8827577829360962, "loss_mode_switch": 0.0, "loss_total": 0.17975512146949768, "step": 1969 }, { "batch_size": 4, "epoch": 0.7876, "step": 1969, "tokens_per_device": 4344 }, { "epoch": 0.7876, "loss_ce": 0.0014722299529239535, "loss_lvr": 0.838836669921875, "loss_mode_switch": 0.0, "loss_total": 0.08535589277744293, "step": 1969 }, { "batch_size": 1, "epoch": 0.7876, "step": 1969, "tokens_per_device": 5126 }, { "epoch": 0.7876, "loss_ce": 0.014143691398203373, "loss_lvr": 0.2990909814834595, "loss_mode_switch": 0.0, "loss_total": 0.044052790850400925, "step": 1969 }, { "batch_size": 4, "epoch": 0.7876, "step": 1969, "tokens_per_device": 5780 }, { "epoch": 0.7876, "loss_ce": 0.052595753222703934, "loss_lvr": 0.7287119030952454, "loss_mode_switch": 0.0, "loss_total": 0.1254669427871704, "step": 1969 }, { "batch_size": 4, "epoch": 0.7876, "step": 1969, "tokens_per_device": 6264 }, { "epoch": 0.7876, "loss_ce": 0.10920428484678268, "loss_lvr": 0.9598312973976135, "loss_mode_switch": 0.0, "loss_total": 0.20518741011619568, "step": 1969 }, { "batch_size": 4, "epoch": 0.7876, "step": 1969, "tokens_per_device": 1940 }, { "epoch": 0.7876, "loss_ce": 0.1897200644016266, "loss_lvr": 0.8256151080131531, "loss_mode_switch": 0.0, "loss_total": 0.27228158712387085, "step": 1969 }, { "batch_size": 1, "epoch": 0.7876, "step": 1969, "tokens_per_device": 4928 }, { "epoch": 0.7876, "loss_ce": 0.0024869926273822784, "loss_lvr": 0.30653464794158936, "loss_mode_switch": 0.0, "loss_total": 0.033140458166599274, "step": 1969 }, { "epoch": 0.788, "grad_norm": 1.2523658275604248, "learning_rate": 1.133022216636781e-06, "loss": 0.2696, "step": 1970 }, { "batch_size": 1, "epoch": 0.788, "step": 1970, "tokens_per_device": 4684 }, { "epoch": 0.788, "loss_ce": 0.0006270165322348475, "loss_lvr": 0.34127768874168396, "loss_mode_switch": 0.0, "loss_total": 0.034754782915115356, "step": 1970 }, { "batch_size": 1, "epoch": 0.788, "step": 1970, "tokens_per_device": 5148 }, { "epoch": 0.788, "loss_ce": 0.1274905800819397, "loss_lvr": 0.24686609208583832, "loss_mode_switch": 0.0, "loss_total": 0.15217718482017517, "step": 1970 }, { "batch_size": 4, "epoch": 0.788, "step": 1970, "tokens_per_device": 4456 }, { "epoch": 0.788, "loss_ce": 0.10405993461608887, "loss_lvr": 0.9685566425323486, "loss_mode_switch": 0.0, "loss_total": 0.2009156048297882, "step": 1970 }, { "batch_size": 1, "epoch": 0.788, "step": 1970, "tokens_per_device": 4901 }, { "epoch": 0.788, "loss_ce": 0.006004610564559698, "loss_lvr": 0.16179172694683075, "loss_mode_switch": 0.0, "loss_total": 0.02218378335237503, "step": 1970 }, { "batch_size": 4, "epoch": 0.788, "step": 1970, "tokens_per_device": 1288 }, { "epoch": 0.788, "loss_ce": 0.3414437770843506, "loss_lvr": 1.9569058418273926, "loss_mode_switch": 0.0, "loss_total": 0.5371343493461609, "step": 1970 }, { "batch_size": 4, "epoch": 0.788, "step": 1970, "tokens_per_device": 3832 }, { "epoch": 0.788, "loss_ce": 0.5220856666564941, "loss_lvr": 1.2050048112869263, "loss_mode_switch": 0.0, "loss_total": 0.6425861716270447, "step": 1970 }, { "batch_size": 4, "epoch": 0.788, "step": 1970, "tokens_per_device": 3728 }, { "epoch": 0.788, "loss_ce": 0.015518777072429657, "loss_lvr": 0.4404592514038086, "loss_mode_switch": 0.0, "loss_total": 0.059564702212810516, "step": 1970 }, { "batch_size": 4, "epoch": 0.788, "step": 1970, "tokens_per_device": 1428 }, { "epoch": 0.788, "loss_ce": 0.3190302848815918, "loss_lvr": 1.079209804534912, "loss_mode_switch": 0.0, "loss_total": 0.42695125937461853, "step": 1970 }, { "epoch": 0.7884, "grad_norm": 1.3421269655227661, "learning_rate": 1.1289192150228007e-06, "loss": 0.3036, "step": 1971 }, { "batch_size": 4, "epoch": 0.7884, "step": 1971, "tokens_per_device": 3780 }, { "epoch": 0.7884, "loss_ce": 0.19909796118736267, "loss_lvr": 1.0558645725250244, "loss_mode_switch": 0.0, "loss_total": 0.30468443036079407, "step": 1971 }, { "batch_size": 4, "epoch": 0.7884, "step": 1971, "tokens_per_device": 5836 }, { "epoch": 0.7884, "loss_ce": 0.24621371924877167, "loss_lvr": 0.5484816431999207, "loss_mode_switch": 0.0, "loss_total": 0.30106186866760254, "step": 1971 }, { "batch_size": 1, "epoch": 0.7884, "step": 1971, "tokens_per_device": 5997 }, { "epoch": 0.7884, "loss_ce": 0.06639759987592697, "loss_lvr": 0.31984996795654297, "loss_mode_switch": 0.0, "loss_total": 0.09838259220123291, "step": 1971 }, { "batch_size": 4, "epoch": 0.7884, "step": 1971, "tokens_per_device": 4208 }, { "epoch": 0.7884, "loss_ce": 0.5940761566162109, "loss_lvr": 0.7885676622390747, "loss_mode_switch": 0.0, "loss_total": 0.6729329228401184, "step": 1971 }, { "batch_size": 4, "epoch": 0.7884, "step": 1971, "tokens_per_device": 4340 }, { "epoch": 0.7884, "loss_ce": 0.14623382687568665, "loss_lvr": 0.7287343740463257, "loss_mode_switch": 0.0, "loss_total": 0.2191072702407837, "step": 1971 }, { "batch_size": 1, "epoch": 0.7884, "step": 1971, "tokens_per_device": 5453 }, { "epoch": 0.7884, "loss_ce": 0.08611846715211868, "loss_lvr": 0.4602092504501343, "loss_mode_switch": 0.0, "loss_total": 0.1321393847465515, "step": 1971 }, { "batch_size": 4, "epoch": 0.7884, "step": 1971, "tokens_per_device": 4564 }, { "epoch": 0.7884, "loss_ce": 0.014530249871313572, "loss_lvr": 0.8986175060272217, "loss_mode_switch": 0.0, "loss_total": 0.10439199954271317, "step": 1971 }, { "batch_size": 4, "epoch": 0.7884, "step": 1971, "tokens_per_device": 5740 }, { "epoch": 0.7884, "loss_ce": 0.4187379777431488, "loss_lvr": 0.7104455232620239, "loss_mode_switch": 0.0, "loss_total": 0.48978254199028015, "step": 1971 }, { "epoch": 0.7888, "grad_norm": 1.2650015354156494, "learning_rate": 1.124822710342499e-06, "loss": 0.286, "step": 1972 }, { "batch_size": 1, "epoch": 0.7888, "step": 1972, "tokens_per_device": 4852 }, { "epoch": 0.7888, "loss_ce": 0.01589113287627697, "loss_lvr": 0.284934401512146, "loss_mode_switch": 0.0, "loss_total": 0.04438457265496254, "step": 1972 }, { "batch_size": 4, "epoch": 0.7888, "step": 1972, "tokens_per_device": 4896 }, { "epoch": 0.7888, "loss_ce": 0.44083675742149353, "loss_lvr": 0.7629495859146118, "loss_mode_switch": 0.0, "loss_total": 0.5171316862106323, "step": 1972 }, { "batch_size": 4, "epoch": 0.7888, "step": 1972, "tokens_per_device": 2568 }, { "epoch": 0.7888, "loss_ce": 0.2707917094230652, "loss_lvr": 1.1615091562271118, "loss_mode_switch": 0.0, "loss_total": 0.38694262504577637, "step": 1972 }, { "batch_size": 1, "epoch": 0.7888, "step": 1972, "tokens_per_device": 5097 }, { "epoch": 0.7888, "loss_ce": 0.0002806511474773288, "loss_lvr": 0.33522504568099976, "loss_mode_switch": 0.0, "loss_total": 0.03380315750837326, "step": 1972 }, { "batch_size": 1, "epoch": 0.7888, "step": 1972, "tokens_per_device": 5222 }, { "epoch": 0.7888, "loss_ce": 0.5217931866645813, "loss_lvr": 0.2831316292285919, "loss_mode_switch": 0.0, "loss_total": 0.5501063466072083, "step": 1972 }, { "batch_size": 1, "epoch": 0.7888, "step": 1972, "tokens_per_device": 4891 }, { "epoch": 0.7888, "loss_ce": 0.00020024474360980093, "loss_lvr": 0.4846680164337158, "loss_mode_switch": 0.0, "loss_total": 0.048667047172784805, "step": 1972 }, { "batch_size": 4, "epoch": 0.7888, "step": 1972, "tokens_per_device": 3944 }, { "epoch": 0.7888, "loss_ce": 0.4073396623134613, "loss_lvr": 0.882574200630188, "loss_mode_switch": 0.0, "loss_total": 0.49559709429740906, "step": 1972 }, { "batch_size": 4, "epoch": 0.7888, "step": 1972, "tokens_per_device": 5932 }, { "epoch": 0.7888, "loss_ce": 0.37077149748802185, "loss_lvr": 0.952679455280304, "loss_mode_switch": 0.0, "loss_total": 0.4660394489765167, "step": 1972 }, { "epoch": 0.7892, "grad_norm": 1.2357875108718872, "learning_rate": 1.1207327094711423e-06, "loss": 0.269, "step": 1973 }, { "batch_size": 4, "epoch": 0.7892, "step": 1973, "tokens_per_device": 1560 }, { "epoch": 0.7892, "loss_ce": 0.25116708874702454, "loss_lvr": 0.7589012980461121, "loss_mode_switch": 0.0, "loss_total": 0.32705721259117126, "step": 1973 }, { "batch_size": 4, "epoch": 0.7892, "step": 1973, "tokens_per_device": 1240 }, { "epoch": 0.7892, "loss_ce": 0.2637898623943329, "loss_lvr": 1.1016929149627686, "loss_mode_switch": 0.0, "loss_total": 0.37395915389060974, "step": 1973 }, { "batch_size": 1, "epoch": 0.7892, "step": 1973, "tokens_per_device": 5136 }, { "epoch": 0.7892, "loss_ce": 0.03348714858293533, "loss_lvr": 0.3265685439109802, "loss_mode_switch": 0.0, "loss_total": 0.06614400446414948, "step": 1973 }, { "batch_size": 4, "epoch": 0.7892, "step": 1973, "tokens_per_device": 4792 }, { "epoch": 0.7892, "loss_ce": 0.048982635140419006, "loss_lvr": 0.94096440076828, "loss_mode_switch": 0.0, "loss_total": 0.14307907223701477, "step": 1973 }, { "batch_size": 4, "epoch": 0.7892, "step": 1973, "tokens_per_device": 1396 }, { "epoch": 0.7892, "loss_ce": 0.44804033637046814, "loss_lvr": 2.2233707904815674, "loss_mode_switch": 0.0, "loss_total": 0.6703774333000183, "step": 1973 }, { "batch_size": 4, "epoch": 0.7892, "step": 1973, "tokens_per_device": 3992 }, { "epoch": 0.7892, "loss_ce": 0.11451946198940277, "loss_lvr": 0.9178169369697571, "loss_mode_switch": 0.0, "loss_total": 0.20630115270614624, "step": 1973 }, { "batch_size": 4, "epoch": 0.7892, "step": 1973, "tokens_per_device": 4804 }, { "epoch": 0.7892, "loss_ce": 0.21703705191612244, "loss_lvr": 0.888543426990509, "loss_mode_switch": 0.0, "loss_total": 0.30589139461517334, "step": 1973 }, { "batch_size": 4, "epoch": 0.7892, "step": 1973, "tokens_per_device": 3984 }, { "epoch": 0.7892, "loss_ce": 0.27054929733276367, "loss_lvr": 0.8206965923309326, "loss_mode_switch": 0.0, "loss_total": 0.3526189625263214, "step": 1973 }, { "epoch": 0.7896, "grad_norm": 1.214651107788086, "learning_rate": 1.116649219273086e-06, "loss": 0.2558, "step": 1974 }, { "batch_size": 4, "epoch": 0.7896, "step": 1974, "tokens_per_device": 2668 }, { "epoch": 0.7896, "loss_ce": 0.021834932267665863, "loss_lvr": 0.8140431642532349, "loss_mode_switch": 0.0, "loss_total": 0.10323925316333771, "step": 1974 }, { "batch_size": 4, "epoch": 0.7896, "step": 1974, "tokens_per_device": 4272 }, { "epoch": 0.7896, "loss_ce": 0.23213139176368713, "loss_lvr": 0.5331647992134094, "loss_mode_switch": 0.0, "loss_total": 0.2854478657245636, "step": 1974 }, { "batch_size": 4, "epoch": 0.7896, "step": 1974, "tokens_per_device": 6512 }, { "epoch": 0.7896, "loss_ce": 0.08013949543237686, "loss_lvr": 0.7529023885726929, "loss_mode_switch": 0.0, "loss_total": 0.15542973577976227, "step": 1974 }, { "batch_size": 4, "epoch": 0.7896, "step": 1974, "tokens_per_device": 4992 }, { "epoch": 0.7896, "loss_ce": 0.015298685058951378, "loss_lvr": 0.6928688883781433, "loss_mode_switch": 0.0, "loss_total": 0.08458557724952698, "step": 1974 }, { "batch_size": 4, "epoch": 0.7896, "step": 1974, "tokens_per_device": 4044 }, { "epoch": 0.7896, "loss_ce": 0.12935617566108704, "loss_lvr": 1.0209161043167114, "loss_mode_switch": 0.0, "loss_total": 0.23144778609275818, "step": 1974 }, { "batch_size": 4, "epoch": 0.7896, "step": 1974, "tokens_per_device": 1648 }, { "epoch": 0.7896, "loss_ce": 0.7223480939865112, "loss_lvr": 0.9946195483207703, "loss_mode_switch": 0.0, "loss_total": 0.8218100666999817, "step": 1974 }, { "batch_size": 4, "epoch": 0.7896, "step": 1974, "tokens_per_device": 5068 }, { "epoch": 0.7896, "loss_ce": 0.2124210149049759, "loss_lvr": 0.4654931128025055, "loss_mode_switch": 0.0, "loss_total": 0.25897032022476196, "step": 1974 }, { "batch_size": 4, "epoch": 0.7896, "step": 1974, "tokens_per_device": 1368 }, { "epoch": 0.7896, "loss_ce": 0.11669619381427765, "loss_lvr": 0.8407717943191528, "loss_mode_switch": 0.0, "loss_total": 0.20077337324619293, "step": 1974 }, { "epoch": 0.79, "grad_norm": 1.3257044553756714, "learning_rate": 1.1125722466017547e-06, "loss": 0.2949, "step": 1975 }, { "batch_size": 1, "epoch": 0.79, "step": 1975, "tokens_per_device": 4903 }, { "epoch": 0.79, "loss_ce": 0.007112790364772081, "loss_lvr": 0.29619544744491577, "loss_mode_switch": 0.0, "loss_total": 0.03673233464360237, "step": 1975 }, { "batch_size": 4, "epoch": 0.79, "step": 1975, "tokens_per_device": 4420 }, { "epoch": 0.79, "loss_ce": 0.001949015073478222, "loss_lvr": 1.4097412824630737, "loss_mode_switch": 0.0, "loss_total": 0.14292314648628235, "step": 1975 }, { "batch_size": 1, "epoch": 0.79, "step": 1975, "tokens_per_device": 6678 }, { "epoch": 0.79, "loss_ce": 0.00012211743160150945, "loss_lvr": 0.4096798002719879, "loss_mode_switch": 0.0, "loss_total": 0.04109010100364685, "step": 1975 }, { "batch_size": 4, "epoch": 0.79, "step": 1975, "tokens_per_device": 4456 }, { "epoch": 0.79, "loss_ce": 0.45838016271591187, "loss_lvr": 0.7702836394309998, "loss_mode_switch": 0.0, "loss_total": 0.5354084968566895, "step": 1975 }, { "batch_size": 1, "epoch": 0.79, "step": 1975, "tokens_per_device": 4922 }, { "epoch": 0.79, "loss_ce": 0.02027176134288311, "loss_lvr": 0.2781204283237457, "loss_mode_switch": 0.0, "loss_total": 0.04808380454778671, "step": 1975 }, { "batch_size": 1, "epoch": 0.79, "step": 1975, "tokens_per_device": 5941 }, { "epoch": 0.79, "loss_ce": 0.08697652071714401, "loss_lvr": 0.17413771152496338, "loss_mode_switch": 0.0, "loss_total": 0.10439029335975647, "step": 1975 }, { "batch_size": 4, "epoch": 0.79, "step": 1975, "tokens_per_device": 2052 }, { "epoch": 0.79, "loss_ce": 0.01183532364666462, "loss_lvr": 1.1959128379821777, "loss_mode_switch": 0.0, "loss_total": 0.1314266175031662, "step": 1975 }, { "batch_size": 4, "epoch": 0.79, "step": 1975, "tokens_per_device": 4288 }, { "epoch": 0.79, "loss_ce": 0.0213091429322958, "loss_lvr": 0.7405534386634827, "loss_mode_switch": 0.0, "loss_total": 0.09536448866128922, "step": 1975 }, { "epoch": 0.7904, "grad_norm": 1.308485984802246, "learning_rate": 1.1085017982996337e-06, "loss": 0.2897, "step": 1976 }, { "batch_size": 4, "epoch": 0.7904, "step": 1976, "tokens_per_device": 8580 }, { "epoch": 0.7904, "loss_ce": 0.3150603473186493, "loss_lvr": 0.8629881143569946, "loss_mode_switch": 0.0, "loss_total": 0.4013591706752777, "step": 1976 }, { "batch_size": 4, "epoch": 0.7904, "step": 1976, "tokens_per_device": 1596 }, { "epoch": 0.7904, "loss_ce": 0.5176694989204407, "loss_lvr": 1.089242935180664, "loss_mode_switch": 0.0, "loss_total": 0.6265937685966492, "step": 1976 }, { "batch_size": 1, "epoch": 0.7904, "step": 1976, "tokens_per_device": 5107 }, { "epoch": 0.7904, "loss_ce": 0.007565399631857872, "loss_lvr": 0.47437384724617004, "loss_mode_switch": 0.0, "loss_total": 0.055002786219120026, "step": 1976 }, { "batch_size": 4, "epoch": 0.7904, "step": 1976, "tokens_per_device": 1340 }, { "epoch": 0.7904, "loss_ce": 0.08060983568429947, "loss_lvr": 0.9905177354812622, "loss_mode_switch": 0.0, "loss_total": 0.1796616017818451, "step": 1976 }, { "batch_size": 4, "epoch": 0.7904, "step": 1976, "tokens_per_device": 4244 }, { "epoch": 0.7904, "loss_ce": 0.0008136451942846179, "loss_lvr": 0.40297555923461914, "loss_mode_switch": 0.0, "loss_total": 0.04111120104789734, "step": 1976 }, { "batch_size": 4, "epoch": 0.7904, "step": 1976, "tokens_per_device": 11172 }, { "epoch": 0.7904, "loss_ce": 0.09370653331279755, "loss_lvr": 0.6114935278892517, "loss_mode_switch": 0.0, "loss_total": 0.1548558920621872, "step": 1976 }, { "batch_size": 4, "epoch": 0.7904, "step": 1976, "tokens_per_device": 3796 }, { "epoch": 0.7904, "loss_ce": 0.17908932268619537, "loss_lvr": 0.8850715160369873, "loss_mode_switch": 0.0, "loss_total": 0.2675964832305908, "step": 1976 }, { "batch_size": 4, "epoch": 0.7904, "step": 1976, "tokens_per_device": 2736 }, { "epoch": 0.7904, "loss_ce": 0.2079092115163803, "loss_lvr": 0.6400063633918762, "loss_mode_switch": 0.0, "loss_total": 0.27190983295440674, "step": 1976 }, { "epoch": 0.7908, "grad_norm": 1.134390115737915, "learning_rate": 1.1044378811982631e-06, "loss": 0.2163, "step": 1977 }, { "batch_size": 4, "epoch": 0.7908, "step": 1977, "tokens_per_device": 13348 }, { "epoch": 0.7908, "loss_ce": 0.019838830456137657, "loss_lvr": 0.5494945645332336, "loss_mode_switch": 0.0, "loss_total": 0.07478828728199005, "step": 1977 }, { "batch_size": 1, "epoch": 0.7908, "step": 1977, "tokens_per_device": 7212 }, { "epoch": 0.7908, "loss_ce": 0.0023869636934250593, "loss_lvr": 0.42557772994041443, "loss_mode_switch": 0.0, "loss_total": 0.04494473710656166, "step": 1977 }, { "batch_size": 1, "epoch": 0.7908, "step": 1977, "tokens_per_device": 4917 }, { "epoch": 0.7908, "loss_ce": 0.014094041660428047, "loss_lvr": 0.2624419033527374, "loss_mode_switch": 0.0, "loss_total": 0.04033823311328888, "step": 1977 }, { "batch_size": 4, "epoch": 0.7908, "step": 1977, "tokens_per_device": 4248 }, { "epoch": 0.7908, "loss_ce": 0.6568174362182617, "loss_lvr": 0.8255653381347656, "loss_mode_switch": 0.0, "loss_total": 0.7393739819526672, "step": 1977 }, { "batch_size": 1, "epoch": 0.7908, "step": 1977, "tokens_per_device": 4967 }, { "epoch": 0.7908, "loss_ce": 0.03524050861597061, "loss_lvr": 0.34341877698898315, "loss_mode_switch": 0.0, "loss_total": 0.06958238780498505, "step": 1977 }, { "batch_size": 4, "epoch": 0.7908, "step": 1977, "tokens_per_device": 14336 }, { "epoch": 0.7908, "loss_ce": 0.1043519526720047, "loss_lvr": 0.31072700023651123, "loss_mode_switch": 0.0, "loss_total": 0.1354246586561203, "step": 1977 }, { "batch_size": 1, "epoch": 0.7908, "step": 1977, "tokens_per_device": 5027 }, { "epoch": 0.7908, "loss_ce": 0.44832485914230347, "loss_lvr": 0.629321813583374, "loss_mode_switch": 0.0, "loss_total": 0.5112570524215698, "step": 1977 }, { "batch_size": 4, "epoch": 0.7908, "step": 1977, "tokens_per_device": 10552 }, { "epoch": 0.7908, "loss_ce": 0.37064021825790405, "loss_lvr": 0.6946662664413452, "loss_mode_switch": 0.0, "loss_total": 0.4401068389415741, "step": 1977 }, { "epoch": 0.7912, "grad_norm": 1.2413090467453003, "learning_rate": 1.1003805021182169e-06, "loss": 0.2557, "step": 1978 }, { "batch_size": 4, "epoch": 0.7912, "step": 1978, "tokens_per_device": 5704 }, { "epoch": 0.7912, "loss_ce": 0.0474865548312664, "loss_lvr": 0.8243468403816223, "loss_mode_switch": 0.0, "loss_total": 0.12992124259471893, "step": 1978 }, { "batch_size": 4, "epoch": 0.7912, "step": 1978, "tokens_per_device": 3380 }, { "epoch": 0.7912, "loss_ce": 0.21005243062973022, "loss_lvr": 0.7810202836990356, "loss_mode_switch": 0.0, "loss_total": 0.2881544530391693, "step": 1978 }, { "batch_size": 4, "epoch": 0.7912, "step": 1978, "tokens_per_device": 4532 }, { "epoch": 0.7912, "loss_ce": 0.1229097917675972, "loss_lvr": 0.7826313376426697, "loss_mode_switch": 0.0, "loss_total": 0.20117291808128357, "step": 1978 }, { "batch_size": 1, "epoch": 0.7912, "step": 1978, "tokens_per_device": 4366 }, { "epoch": 0.7912, "loss_ce": 0.4782896637916565, "loss_lvr": 0.4140557646751404, "loss_mode_switch": 0.0, "loss_total": 0.5196952223777771, "step": 1978 }, { "batch_size": 4, "epoch": 0.7912, "step": 1978, "tokens_per_device": 5480 }, { "epoch": 0.7912, "loss_ce": 0.32487383484840393, "loss_lvr": 0.9341922402381897, "loss_mode_switch": 0.0, "loss_total": 0.4182930588722229, "step": 1978 }, { "batch_size": 4, "epoch": 0.7912, "step": 1978, "tokens_per_device": 3740 }, { "epoch": 0.7912, "loss_ce": 0.03486720845103264, "loss_lvr": 2.350074529647827, "loss_mode_switch": 0.0, "loss_total": 0.2698746621608734, "step": 1978 }, { "batch_size": 1, "epoch": 0.7912, "step": 1978, "tokens_per_device": 5509 }, { "epoch": 0.7912, "loss_ce": 0.015995018184185028, "loss_lvr": 0.34197455644607544, "loss_mode_switch": 0.0, "loss_total": 0.05019247531890869, "step": 1978 }, { "batch_size": 1, "epoch": 0.7912, "step": 1978, "tokens_per_device": 4885 }, { "epoch": 0.7912, "loss_ce": 0.010197465308010578, "loss_lvr": 1.0910720825195312, "loss_mode_switch": 0.0, "loss_total": 0.11930467933416367, "step": 1978 }, { "epoch": 0.7916, "grad_norm": 1.3832154273986816, "learning_rate": 1.0963296678691e-06, "loss": 0.259, "step": 1979 }, { "batch_size": 4, "epoch": 0.7916, "step": 1979, "tokens_per_device": 1236 }, { "epoch": 0.7916, "loss_ce": 0.21223843097686768, "loss_lvr": 1.0251892805099487, "loss_mode_switch": 0.0, "loss_total": 0.3147573471069336, "step": 1979 }, { "batch_size": 4, "epoch": 0.7916, "step": 1979, "tokens_per_device": 2656 }, { "epoch": 0.7916, "loss_ce": 0.08291284739971161, "loss_lvr": 0.7070654034614563, "loss_mode_switch": 0.0, "loss_total": 0.15361937880516052, "step": 1979 }, { "batch_size": 4, "epoch": 0.7916, "step": 1979, "tokens_per_device": 6208 }, { "epoch": 0.7916, "loss_ce": 0.4658792018890381, "loss_lvr": 0.5161768198013306, "loss_mode_switch": 0.0, "loss_total": 0.5174968838691711, "step": 1979 }, { "batch_size": 1, "epoch": 0.7916, "step": 1979, "tokens_per_device": 4997 }, { "epoch": 0.7916, "loss_ce": 0.4901539385318756, "loss_lvr": 0.6237221956253052, "loss_mode_switch": 0.0, "loss_total": 0.5525261759757996, "step": 1979 }, { "batch_size": 4, "epoch": 0.7916, "step": 1979, "tokens_per_device": 4976 }, { "epoch": 0.7916, "loss_ce": 0.2734193205833435, "loss_lvr": 0.7105685472488403, "loss_mode_switch": 0.0, "loss_total": 0.3444761633872986, "step": 1979 }, { "batch_size": 4, "epoch": 0.7916, "step": 1979, "tokens_per_device": 5996 }, { "epoch": 0.7916, "loss_ce": 0.0038636045064777136, "loss_lvr": 0.668736457824707, "loss_mode_switch": 0.0, "loss_total": 0.07073725014925003, "step": 1979 }, { "batch_size": 4, "epoch": 0.7916, "step": 1979, "tokens_per_device": 1928 }, { "epoch": 0.7916, "loss_ce": 0.48652103543281555, "loss_lvr": 0.7593040466308594, "loss_mode_switch": 0.0, "loss_total": 0.5624514222145081, "step": 1979 }, { "batch_size": 4, "epoch": 0.7916, "step": 1979, "tokens_per_device": 11544 }, { "epoch": 0.7916, "loss_ce": 0.34057196974754333, "loss_lvr": 0.48779502511024475, "loss_mode_switch": 0.0, "loss_total": 0.389351487159729, "step": 1979 }, { "epoch": 0.792, "grad_norm": 1.3849034309387207, "learning_rate": 1.092285385249528e-06, "loss": 0.3291, "step": 1980 }, { "batch_size": 4, "epoch": 0.792, "step": 1980, "tokens_per_device": 4228 }, { "epoch": 0.792, "loss_ce": 0.14883334934711456, "loss_lvr": 0.8634946942329407, "loss_mode_switch": 0.0, "loss_total": 0.23518282175064087, "step": 1980 }, { "batch_size": 4, "epoch": 0.792, "step": 1980, "tokens_per_device": 4436 }, { "epoch": 0.792, "loss_ce": 0.08279919624328613, "loss_lvr": 1.0082756280899048, "loss_mode_switch": 0.0, "loss_total": 0.18362677097320557, "step": 1980 }, { "batch_size": 4, "epoch": 0.792, "step": 1980, "tokens_per_device": 4416 }, { "epoch": 0.792, "loss_ce": 0.3315012753009796, "loss_lvr": 0.7396207451820374, "loss_mode_switch": 0.0, "loss_total": 0.4054633378982544, "step": 1980 }, { "batch_size": 1, "epoch": 0.792, "step": 1980, "tokens_per_device": 6031 }, { "epoch": 0.792, "loss_ce": 0.00021282851230353117, "loss_lvr": 0.31816187500953674, "loss_mode_switch": 0.0, "loss_total": 0.03202901780605316, "step": 1980 }, { "batch_size": 4, "epoch": 0.792, "step": 1980, "tokens_per_device": 3672 }, { "epoch": 0.792, "loss_ce": 0.25929921865463257, "loss_lvr": 0.4106872081756592, "loss_mode_switch": 0.0, "loss_total": 0.30036795139312744, "step": 1980 }, { "batch_size": 1, "epoch": 0.792, "step": 1980, "tokens_per_device": 4682 }, { "epoch": 0.792, "loss_ce": 0.05096814036369324, "loss_lvr": 0.6117704510688782, "loss_mode_switch": 0.0, "loss_total": 0.11214518547058105, "step": 1980 }, { "batch_size": 4, "epoch": 0.792, "step": 1980, "tokens_per_device": 3836 }, { "epoch": 0.792, "loss_ce": 0.05045574903488159, "loss_lvr": 0.7203729748725891, "loss_mode_switch": 0.0, "loss_total": 0.12249305099248886, "step": 1980 }, { "batch_size": 4, "epoch": 0.792, "step": 1980, "tokens_per_device": 1708 }, { "epoch": 0.792, "loss_ce": 0.40593385696411133, "loss_lvr": 1.112294316291809, "loss_mode_switch": 0.0, "loss_total": 0.5171632766723633, "step": 1980 }, { "epoch": 0.7924, "grad_norm": 1.640884518623352, "learning_rate": 1.088247661047127e-06, "loss": 0.2853, "step": 1981 }, { "batch_size": 4, "epoch": 0.7924, "step": 1981, "tokens_per_device": 1216 }, { "epoch": 0.7924, "loss_ce": 0.17529448866844177, "loss_lvr": 0.8456123471260071, "loss_mode_switch": 0.0, "loss_total": 0.259855717420578, "step": 1981 }, { "batch_size": 1, "epoch": 0.7924, "step": 1981, "tokens_per_device": 5149 }, { "epoch": 0.7924, "loss_ce": 0.29860445857048035, "loss_lvr": 0.26779940724372864, "loss_mode_switch": 0.0, "loss_total": 0.3253844082355499, "step": 1981 }, { "batch_size": 1, "epoch": 0.7924, "step": 1981, "tokens_per_device": 4915 }, { "epoch": 0.7924, "loss_ce": 0.030565915629267693, "loss_lvr": 0.5141132473945618, "loss_mode_switch": 0.0, "loss_total": 0.0819772407412529, "step": 1981 }, { "batch_size": 4, "epoch": 0.7924, "step": 1981, "tokens_per_device": 4704 }, { "epoch": 0.7924, "loss_ce": 0.10819298774003983, "loss_lvr": 0.9743215441703796, "loss_mode_switch": 0.0, "loss_total": 0.20562514662742615, "step": 1981 }, { "batch_size": 1, "epoch": 0.7924, "step": 1981, "tokens_per_device": 4857 }, { "epoch": 0.7924, "loss_ce": 0.04697727784514427, "loss_lvr": 0.33941930532455444, "loss_mode_switch": 0.0, "loss_total": 0.08091920614242554, "step": 1981 }, { "batch_size": 4, "epoch": 0.7924, "step": 1981, "tokens_per_device": 2608 }, { "epoch": 0.7924, "loss_ce": 0.3265984356403351, "loss_lvr": 0.872787356376648, "loss_mode_switch": 0.0, "loss_total": 0.4138771891593933, "step": 1981 }, { "batch_size": 1, "epoch": 0.7924, "step": 1981, "tokens_per_device": 4866 }, { "epoch": 0.7924, "loss_ce": 0.15192051231861115, "loss_lvr": 0.40488728880882263, "loss_mode_switch": 0.0, "loss_total": 0.19240924715995789, "step": 1981 }, { "batch_size": 4, "epoch": 0.7924, "step": 1981, "tokens_per_device": 1456 }, { "epoch": 0.7924, "loss_ce": 0.5042036771774292, "loss_lvr": 0.8013088703155518, "loss_mode_switch": 0.0, "loss_total": 0.5843345522880554, "step": 1981 }, { "epoch": 0.7928, "grad_norm": 1.2005997896194458, "learning_rate": 1.0842165020385092e-06, "loss": 0.2442, "step": 1982 }, { "batch_size": 1, "epoch": 0.7928, "step": 1982, "tokens_per_device": 4875 }, { "epoch": 0.7928, "loss_ce": 0.009743522852659225, "loss_lvr": 0.4534327983856201, "loss_mode_switch": 0.0, "loss_total": 0.05508680269122124, "step": 1982 }, { "batch_size": 1, "epoch": 0.7928, "step": 1982, "tokens_per_device": 4850 }, { "epoch": 0.7928, "loss_ce": 0.2779947519302368, "loss_lvr": 0.531589925289154, "loss_mode_switch": 0.0, "loss_total": 0.3311537504196167, "step": 1982 }, { "batch_size": 4, "epoch": 0.7928, "step": 1982, "tokens_per_device": 3824 }, { "epoch": 0.7928, "loss_ce": 0.3865380883216858, "loss_lvr": 0.7608884572982788, "loss_mode_switch": 0.0, "loss_total": 0.46262693405151367, "step": 1982 }, { "batch_size": 1, "epoch": 0.7928, "step": 1982, "tokens_per_device": 4888 }, { "epoch": 0.7928, "loss_ce": 0.04122824966907501, "loss_lvr": 0.6262617707252502, "loss_mode_switch": 0.0, "loss_total": 0.10385442525148392, "step": 1982 }, { "batch_size": 4, "epoch": 0.7928, "step": 1982, "tokens_per_device": 10712 }, { "epoch": 0.7928, "loss_ce": 0.5877295732498169, "loss_lvr": 0.7421436905860901, "loss_mode_switch": 0.0, "loss_total": 0.6619439125061035, "step": 1982 }, { "batch_size": 4, "epoch": 0.7928, "step": 1982, "tokens_per_device": 4632 }, { "epoch": 0.7928, "loss_ce": 0.6657125353813171, "loss_lvr": 0.6996079683303833, "loss_mode_switch": 0.0, "loss_total": 0.7356733083724976, "step": 1982 }, { "batch_size": 4, "epoch": 0.7928, "step": 1982, "tokens_per_device": 3820 }, { "epoch": 0.7928, "loss_ce": 0.14952485263347626, "loss_lvr": 0.7984873056411743, "loss_mode_switch": 0.0, "loss_total": 0.22937357425689697, "step": 1982 }, { "batch_size": 4, "epoch": 0.7928, "step": 1982, "tokens_per_device": 3776 }, { "epoch": 0.7928, "loss_ce": 0.14089706540107727, "loss_lvr": 0.7912139892578125, "loss_mode_switch": 0.0, "loss_total": 0.22001847624778748, "step": 1982 }, { "epoch": 0.7932, "grad_norm": 1.2532927989959717, "learning_rate": 1.0801919149892743e-06, "loss": 0.2516, "step": 1983 }, { "batch_size": 4, "epoch": 0.7932, "step": 1983, "tokens_per_device": 4256 }, { "epoch": 0.7932, "loss_ce": 0.7048922777175903, "loss_lvr": 0.87197345495224, "loss_mode_switch": 0.0, "loss_total": 0.7920896410942078, "step": 1983 }, { "batch_size": 4, "epoch": 0.7932, "step": 1983, "tokens_per_device": 1436 }, { "epoch": 0.7932, "loss_ce": 0.11696256697177887, "loss_lvr": 0.801478922367096, "loss_mode_switch": 0.0, "loss_total": 0.19711045920848846, "step": 1983 }, { "batch_size": 4, "epoch": 0.7932, "step": 1983, "tokens_per_device": 4040 }, { "epoch": 0.7932, "loss_ce": 0.09572917222976685, "loss_lvr": 0.8467113971710205, "loss_mode_switch": 0.0, "loss_total": 0.1804003119468689, "step": 1983 }, { "batch_size": 4, "epoch": 0.7932, "step": 1983, "tokens_per_device": 4304 }, { "epoch": 0.7932, "loss_ce": 0.033845968544483185, "loss_lvr": 0.8345223069190979, "loss_mode_switch": 0.0, "loss_total": 0.1172982007265091, "step": 1983 }, { "batch_size": 4, "epoch": 0.7932, "step": 1983, "tokens_per_device": 4232 }, { "epoch": 0.7932, "loss_ce": 0.09092269092798233, "loss_lvr": 0.9451926350593567, "loss_mode_switch": 0.0, "loss_total": 0.18544195592403412, "step": 1983 }, { "batch_size": 4, "epoch": 0.7932, "step": 1983, "tokens_per_device": 1532 }, { "epoch": 0.7932, "loss_ce": 0.6567505598068237, "loss_lvr": 1.027805209159851, "loss_mode_switch": 0.0, "loss_total": 0.7595310807228088, "step": 1983 }, { "batch_size": 1, "epoch": 0.7932, "step": 1983, "tokens_per_device": 4886 }, { "epoch": 0.7932, "loss_ce": 0.04638047516345978, "loss_lvr": 0.6324045062065125, "loss_mode_switch": 0.0, "loss_total": 0.10962092876434326, "step": 1983 }, { "batch_size": 4, "epoch": 0.7932, "step": 1983, "tokens_per_device": 4264 }, { "epoch": 0.7932, "loss_ce": 0.32268449664115906, "loss_lvr": 0.9223628640174866, "loss_mode_switch": 0.0, "loss_total": 0.41492077708244324, "step": 1983 }, { "epoch": 0.7936, "grad_norm": 1.2163171768188477, "learning_rate": 1.0761739066539888e-06, "loss": 0.3051, "step": 1984 }, { "batch_size": 4, "epoch": 0.7936, "step": 1984, "tokens_per_device": 2572 }, { "epoch": 0.7936, "loss_ce": 0.5084923505783081, "loss_lvr": 0.9649288058280945, "loss_mode_switch": 0.0, "loss_total": 0.604985237121582, "step": 1984 }, { "batch_size": 4, "epoch": 0.7936, "step": 1984, "tokens_per_device": 2712 }, { "epoch": 0.7936, "loss_ce": 0.21353013813495636, "loss_lvr": 0.6274492740631104, "loss_mode_switch": 0.0, "loss_total": 0.27627506852149963, "step": 1984 }, { "batch_size": 4, "epoch": 0.7936, "step": 1984, "tokens_per_device": 2668 }, { "epoch": 0.7936, "loss_ce": 0.18283720314502716, "loss_lvr": 2.5224685668945312, "loss_mode_switch": 0.0, "loss_total": 0.4350840449333191, "step": 1984 }, { "batch_size": 4, "epoch": 0.7936, "step": 1984, "tokens_per_device": 4060 }, { "epoch": 0.7936, "loss_ce": 0.6107714176177979, "loss_lvr": 0.7252599000930786, "loss_mode_switch": 0.0, "loss_total": 0.6832973957061768, "step": 1984 }, { "batch_size": 4, "epoch": 0.7936, "step": 1984, "tokens_per_device": 5656 }, { "epoch": 0.7936, "loss_ce": 0.33494967222213745, "loss_lvr": 0.8390834331512451, "loss_mode_switch": 0.0, "loss_total": 0.41885802149772644, "step": 1984 }, { "batch_size": 4, "epoch": 0.7936, "step": 1984, "tokens_per_device": 2544 }, { "epoch": 0.7936, "loss_ce": 0.2783292829990387, "loss_lvr": 0.7466802000999451, "loss_mode_switch": 0.0, "loss_total": 0.3529973030090332, "step": 1984 }, { "batch_size": 4, "epoch": 0.7936, "step": 1984, "tokens_per_device": 5752 }, { "epoch": 0.7936, "loss_ce": 0.010582069866359234, "loss_lvr": 0.7781765460968018, "loss_mode_switch": 0.0, "loss_total": 0.0883997231721878, "step": 1984 }, { "batch_size": 4, "epoch": 0.7936, "step": 1984, "tokens_per_device": 4140 }, { "epoch": 0.7936, "loss_ce": 0.2302054464817047, "loss_lvr": 0.86060631275177, "loss_mode_switch": 0.0, "loss_total": 0.31626608967781067, "step": 1984 }, { "epoch": 0.794, "grad_norm": 1.606629490852356, "learning_rate": 1.0721624837761768e-06, "loss": 0.34, "step": 1985 }, { "batch_size": 1, "epoch": 0.794, "step": 1985, "tokens_per_device": 5820 }, { "epoch": 0.794, "loss_ce": 0.013184621930122375, "loss_lvr": 0.30029526352882385, "loss_mode_switch": 0.0, "loss_total": 0.04321414977312088, "step": 1985 }, { "batch_size": 1, "epoch": 0.794, "step": 1985, "tokens_per_device": 4869 }, { "epoch": 0.794, "loss_ce": 0.00215107761323452, "loss_lvr": 0.4116339087486267, "loss_mode_switch": 0.0, "loss_total": 0.04331447184085846, "step": 1985 }, { "batch_size": 4, "epoch": 0.794, "step": 1985, "tokens_per_device": 13012 }, { "epoch": 0.794, "loss_ce": 0.08090691268444061, "loss_lvr": 0.7015321850776672, "loss_mode_switch": 0.0, "loss_total": 0.15106013417243958, "step": 1985 }, { "batch_size": 4, "epoch": 0.794, "step": 1985, "tokens_per_device": 5784 }, { "epoch": 0.794, "loss_ce": 0.0914871022105217, "loss_lvr": 0.5926346778869629, "loss_mode_switch": 0.0, "loss_total": 0.15075057744979858, "step": 1985 }, { "batch_size": 4, "epoch": 0.794, "step": 1985, "tokens_per_device": 6064 }, { "epoch": 0.794, "loss_ce": 0.3108527362346649, "loss_lvr": 0.796043872833252, "loss_mode_switch": 0.0, "loss_total": 0.3904571235179901, "step": 1985 }, { "batch_size": 4, "epoch": 0.794, "step": 1985, "tokens_per_device": 3824 }, { "epoch": 0.794, "loss_ce": 0.46896347403526306, "loss_lvr": 0.7782753705978394, "loss_mode_switch": 0.0, "loss_total": 0.5467910170555115, "step": 1985 }, { "batch_size": 4, "epoch": 0.794, "step": 1985, "tokens_per_device": 5636 }, { "epoch": 0.794, "loss_ce": 0.24026891589164734, "loss_lvr": 1.193556547164917, "loss_mode_switch": 0.0, "loss_total": 0.35962456464767456, "step": 1985 }, { "batch_size": 4, "epoch": 0.794, "step": 1985, "tokens_per_device": 1624 }, { "epoch": 0.794, "loss_ce": 0.056081779301166534, "loss_lvr": 1.004963994026184, "loss_mode_switch": 0.0, "loss_total": 0.1565781831741333, "step": 1985 }, { "epoch": 0.7944, "grad_norm": 1.7256627082824707, "learning_rate": 1.0681576530883148e-06, "loss": 0.322, "step": 1986 }, { "batch_size": 1, "epoch": 0.7944, "step": 1986, "tokens_per_device": 5111 }, { "epoch": 0.7944, "loss_ce": 0.01759551279246807, "loss_lvr": 0.8307884931564331, "loss_mode_switch": 0.0, "loss_total": 0.10067436844110489, "step": 1986 }, { "batch_size": 4, "epoch": 0.7944, "step": 1986, "tokens_per_device": 3804 }, { "epoch": 0.7944, "loss_ce": 0.060675255954265594, "loss_lvr": 0.8523951172828674, "loss_mode_switch": 0.0, "loss_total": 0.14591476321220398, "step": 1986 }, { "batch_size": 4, "epoch": 0.7944, "step": 1986, "tokens_per_device": 2672 }, { "epoch": 0.7944, "loss_ce": 0.7404847145080566, "loss_lvr": 0.8811731338500977, "loss_mode_switch": 0.0, "loss_total": 0.8286020159721375, "step": 1986 }, { "batch_size": 1, "epoch": 0.7944, "step": 1986, "tokens_per_device": 4872 }, { "epoch": 0.7944, "loss_ce": 0.0005000640521757305, "loss_lvr": 0.2745474576950073, "loss_mode_switch": 0.0, "loss_total": 0.027954811230301857, "step": 1986 }, { "batch_size": 1, "epoch": 0.7944, "step": 1986, "tokens_per_device": 5035 }, { "epoch": 0.7944, "loss_ce": 0.15611273050308228, "loss_lvr": 1.1171493530273438, "loss_mode_switch": 0.0, "loss_total": 0.2678276598453522, "step": 1986 }, { "batch_size": 1, "epoch": 0.7944, "step": 1986, "tokens_per_device": 5606 }, { "epoch": 0.7944, "loss_ce": 0.0004899497726000845, "loss_lvr": 0.3108198642730713, "loss_mode_switch": 0.0, "loss_total": 0.03157193586230278, "step": 1986 }, { "batch_size": 1, "epoch": 0.7944, "step": 1986, "tokens_per_device": 5172 }, { "epoch": 0.7944, "loss_ce": 0.002270590281113982, "loss_lvr": 0.3498810827732086, "loss_mode_switch": 0.0, "loss_total": 0.03725869953632355, "step": 1986 }, { "batch_size": 1, "epoch": 0.7944, "step": 1986, "tokens_per_device": 4914 }, { "epoch": 0.7944, "loss_ce": 0.16446569561958313, "loss_lvr": 0.7994367480278015, "loss_mode_switch": 0.0, "loss_total": 0.24440938234329224, "step": 1986 }, { "epoch": 0.7948, "grad_norm": 1.274901270866394, "learning_rate": 1.064159421311809e-06, "loss": 0.2705, "step": 1987 }, { "batch_size": 1, "epoch": 0.7948, "step": 1987, "tokens_per_device": 4884 }, { "epoch": 0.7948, "loss_ce": 0.005831899121403694, "loss_lvr": 0.3231186270713806, "loss_mode_switch": 0.0, "loss_total": 0.038143761456012726, "step": 1987 }, { "batch_size": 4, "epoch": 0.7948, "step": 1987, "tokens_per_device": 10964 }, { "epoch": 0.7948, "loss_ce": 0.2856384515762329, "loss_lvr": 0.5848232507705688, "loss_mode_switch": 0.0, "loss_total": 0.3441207706928253, "step": 1987 }, { "batch_size": 1, "epoch": 0.7948, "step": 1987, "tokens_per_device": 5164 }, { "epoch": 0.7948, "loss_ce": 0.0057479520328342915, "loss_lvr": 0.5052693486213684, "loss_mode_switch": 0.0, "loss_total": 0.056274887174367905, "step": 1987 }, { "batch_size": 4, "epoch": 0.7948, "step": 1987, "tokens_per_device": 5408 }, { "epoch": 0.7948, "loss_ce": 0.10910657048225403, "loss_lvr": 0.6084326505661011, "loss_mode_switch": 0.0, "loss_total": 0.16994982957839966, "step": 1987 }, { "batch_size": 1, "epoch": 0.7948, "step": 1987, "tokens_per_device": 6136 }, { "epoch": 0.7948, "loss_ce": 0.006164087913930416, "loss_lvr": 0.4210726320743561, "loss_mode_switch": 0.0, "loss_total": 0.04827135428786278, "step": 1987 }, { "batch_size": 4, "epoch": 0.7948, "step": 1987, "tokens_per_device": 4272 }, { "epoch": 0.7948, "loss_ce": 0.07750467211008072, "loss_lvr": 1.1086318492889404, "loss_mode_switch": 0.0, "loss_total": 0.18836785852909088, "step": 1987 }, { "batch_size": 4, "epoch": 0.7948, "step": 1987, "tokens_per_device": 4272 }, { "epoch": 0.7948, "loss_ce": 0.40946730971336365, "loss_lvr": 0.8818921446800232, "loss_mode_switch": 0.0, "loss_total": 0.49765652418136597, "step": 1987 }, { "batch_size": 4, "epoch": 0.7948, "step": 1987, "tokens_per_device": 4272 }, { "epoch": 0.7948, "loss_ce": 0.4289810359477997, "loss_lvr": 1.051891565322876, "loss_mode_switch": 0.0, "loss_total": 0.5341702103614807, "step": 1987 }, { "epoch": 0.7952, "grad_norm": 1.2848412990570068, "learning_rate": 1.0601677951569967e-06, "loss": 0.299, "step": 1988 }, { "batch_size": 4, "epoch": 0.7952, "step": 1988, "tokens_per_device": 4580 }, { "epoch": 0.7952, "loss_ce": 0.09714435786008835, "loss_lvr": 0.4611622989177704, "loss_mode_switch": 0.0, "loss_total": 0.14326058328151703, "step": 1988 }, { "batch_size": 4, "epoch": 0.7952, "step": 1988, "tokens_per_device": 6168 }, { "epoch": 0.7952, "loss_ce": 0.09180530905723572, "loss_lvr": 0.8358564972877502, "loss_mode_switch": 0.0, "loss_total": 0.17539095878601074, "step": 1988 }, { "batch_size": 4, "epoch": 0.7952, "step": 1988, "tokens_per_device": 3836 }, { "epoch": 0.7952, "loss_ce": 0.27913570404052734, "loss_lvr": 0.7879394888877869, "loss_mode_switch": 0.0, "loss_total": 0.35792964696884155, "step": 1988 }, { "batch_size": 1, "epoch": 0.7952, "step": 1988, "tokens_per_device": 4893 }, { "epoch": 0.7952, "loss_ce": 0.03953372314572334, "loss_lvr": 0.7122339010238647, "loss_mode_switch": 0.0, "loss_total": 0.11075711250305176, "step": 1988 }, { "batch_size": 4, "epoch": 0.7952, "step": 1988, "tokens_per_device": 4076 }, { "epoch": 0.7952, "loss_ce": 0.03277590498328209, "loss_lvr": 0.6309536695480347, "loss_mode_switch": 0.0, "loss_total": 0.09587126970291138, "step": 1988 }, { "batch_size": 4, "epoch": 0.7952, "step": 1988, "tokens_per_device": 4936 }, { "epoch": 0.7952, "loss_ce": 0.16243503987789154, "loss_lvr": 0.6775916218757629, "loss_mode_switch": 0.0, "loss_total": 0.23019421100616455, "step": 1988 }, { "batch_size": 4, "epoch": 0.7952, "step": 1988, "tokens_per_device": 1556 }, { "epoch": 0.7952, "loss_ce": 0.6914461851119995, "loss_lvr": 0.8807679414749146, "loss_mode_switch": 0.0, "loss_total": 0.7795229554176331, "step": 1988 }, { "batch_size": 4, "epoch": 0.7952, "step": 1988, "tokens_per_device": 4484 }, { "epoch": 0.7952, "loss_ce": 0.3264075219631195, "loss_lvr": 0.668756902217865, "loss_mode_switch": 0.0, "loss_total": 0.3932832181453705, "step": 1988 }, { "epoch": 0.7956, "grad_norm": 1.412781834602356, "learning_rate": 1.056182781323124e-06, "loss": 0.2753, "step": 1989 }, { "batch_size": 4, "epoch": 0.7956, "step": 1989, "tokens_per_device": 3800 }, { "epoch": 0.7956, "loss_ce": 0.1587618887424469, "loss_lvr": 1.0704172849655151, "loss_mode_switch": 0.0, "loss_total": 0.26580363512039185, "step": 1989 }, { "batch_size": 4, "epoch": 0.7956, "step": 1989, "tokens_per_device": 3744 }, { "epoch": 0.7956, "loss_ce": 0.00354534899815917, "loss_lvr": 0.7476573586463928, "loss_mode_switch": 0.0, "loss_total": 0.07831108570098877, "step": 1989 }, { "batch_size": 1, "epoch": 0.7956, "step": 1989, "tokens_per_device": 4911 }, { "epoch": 0.7956, "loss_ce": 0.3590705096721649, "loss_lvr": 0.7182407975196838, "loss_mode_switch": 0.0, "loss_total": 0.4308945834636688, "step": 1989 }, { "batch_size": 4, "epoch": 0.7956, "step": 1989, "tokens_per_device": 5368 }, { "epoch": 0.7956, "loss_ce": 0.5101230144500732, "loss_lvr": 0.691710352897644, "loss_mode_switch": 0.0, "loss_total": 0.5792940258979797, "step": 1989 }, { "batch_size": 4, "epoch": 0.7956, "step": 1989, "tokens_per_device": 1436 }, { "epoch": 0.7956, "loss_ce": 0.5686113834381104, "loss_lvr": 0.8656092882156372, "loss_mode_switch": 0.0, "loss_total": 0.6551722884178162, "step": 1989 }, { "batch_size": 4, "epoch": 0.7956, "step": 1989, "tokens_per_device": 4240 }, { "epoch": 0.7956, "loss_ce": 0.0206094142049551, "loss_lvr": 1.1408610343933105, "loss_mode_switch": 0.0, "loss_total": 0.13469551503658295, "step": 1989 }, { "batch_size": 1, "epoch": 0.7956, "step": 1989, "tokens_per_device": 5869 }, { "epoch": 0.7956, "loss_ce": 0.10162407159805298, "loss_lvr": 0.32156652212142944, "loss_mode_switch": 0.0, "loss_total": 0.13378071784973145, "step": 1989 }, { "batch_size": 4, "epoch": 0.7956, "step": 1989, "tokens_per_device": 2656 }, { "epoch": 0.7956, "loss_ce": 0.17369844019412994, "loss_lvr": 0.8995981216430664, "loss_mode_switch": 0.0, "loss_total": 0.2636582553386688, "step": 1989 }, { "epoch": 0.796, "grad_norm": 1.3327621221542358, "learning_rate": 1.0522043864983428e-06, "loss": 0.2639, "step": 1990 }, { "batch_size": 1, "epoch": 0.796, "step": 1990, "tokens_per_device": 5079 }, { "epoch": 0.796, "loss_ce": 0.0005348068661987782, "loss_lvr": 0.27783721685409546, "loss_mode_switch": 0.0, "loss_total": 0.028318528085947037, "step": 1990 }, { "batch_size": 4, "epoch": 0.796, "step": 1990, "tokens_per_device": 4356 }, { "epoch": 0.796, "loss_ce": 0.25079798698425293, "loss_lvr": 1.0800144672393799, "loss_mode_switch": 0.0, "loss_total": 0.35879942774772644, "step": 1990 }, { "batch_size": 4, "epoch": 0.796, "step": 1990, "tokens_per_device": 1680 }, { "epoch": 0.796, "loss_ce": 0.026528475806117058, "loss_lvr": 0.8508498072624207, "loss_mode_switch": 0.0, "loss_total": 0.11161345988512039, "step": 1990 }, { "batch_size": 4, "epoch": 0.796, "step": 1990, "tokens_per_device": 3284 }, { "epoch": 0.796, "loss_ce": 0.11477753520011902, "loss_lvr": 0.690399169921875, "loss_mode_switch": 0.0, "loss_total": 0.18381744623184204, "step": 1990 }, { "batch_size": 4, "epoch": 0.796, "step": 1990, "tokens_per_device": 4364 }, { "epoch": 0.796, "loss_ce": 0.41422075033187866, "loss_lvr": 1.7257862091064453, "loss_mode_switch": 0.0, "loss_total": 0.5867993831634521, "step": 1990 }, { "batch_size": 4, "epoch": 0.796, "step": 1990, "tokens_per_device": 2640 }, { "epoch": 0.796, "loss_ce": 0.4757017493247986, "loss_lvr": 0.8061535954475403, "loss_mode_switch": 0.0, "loss_total": 0.5563170909881592, "step": 1990 }, { "batch_size": 4, "epoch": 0.796, "step": 1990, "tokens_per_device": 4116 }, { "epoch": 0.796, "loss_ce": 0.4548928141593933, "loss_lvr": 0.8547908663749695, "loss_mode_switch": 0.0, "loss_total": 0.5403718948364258, "step": 1990 }, { "batch_size": 4, "epoch": 0.796, "step": 1990, "tokens_per_device": 4212 }, { "epoch": 0.796, "loss_ce": 0.39327529072761536, "loss_lvr": 0.5840758681297302, "loss_mode_switch": 0.0, "loss_total": 0.4516828656196594, "step": 1990 }, { "epoch": 0.7964, "grad_norm": 1.328639268875122, "learning_rate": 1.0482326173596947e-06, "loss": 0.2945, "step": 1991 }, { "batch_size": 4, "epoch": 0.7964, "step": 1991, "tokens_per_device": 3760 }, { "epoch": 0.7964, "loss_ce": 0.3628338873386383, "loss_lvr": 0.7282055020332336, "loss_mode_switch": 0.0, "loss_total": 0.4356544315814972, "step": 1991 }, { "batch_size": 4, "epoch": 0.7964, "step": 1991, "tokens_per_device": 2628 }, { "epoch": 0.7964, "loss_ce": 0.03920276463031769, "loss_lvr": 0.8253709673881531, "loss_mode_switch": 0.0, "loss_total": 0.12173986434936523, "step": 1991 }, { "batch_size": 4, "epoch": 0.7964, "step": 1991, "tokens_per_device": 3356 }, { "epoch": 0.7964, "loss_ce": 0.40255194902420044, "loss_lvr": 0.949065625667572, "loss_mode_switch": 0.0, "loss_total": 0.4974585175514221, "step": 1991 }, { "batch_size": 4, "epoch": 0.7964, "step": 1991, "tokens_per_device": 5876 }, { "epoch": 0.7964, "loss_ce": 0.033687300980091095, "loss_lvr": 0.7541803121566772, "loss_mode_switch": 0.0, "loss_total": 0.10910533368587494, "step": 1991 }, { "batch_size": 4, "epoch": 0.7964, "step": 1991, "tokens_per_device": 4192 }, { "epoch": 0.7964, "loss_ce": 0.16421188414096832, "loss_lvr": 0.7239325642585754, "loss_mode_switch": 0.0, "loss_total": 0.23660513758659363, "step": 1991 }, { "batch_size": 4, "epoch": 0.7964, "step": 1991, "tokens_per_device": 6668 }, { "epoch": 0.7964, "loss_ce": 0.19989877939224243, "loss_lvr": 1.0251743793487549, "loss_mode_switch": 0.0, "loss_total": 0.30241620540618896, "step": 1991 }, { "batch_size": 1, "epoch": 0.7964, "step": 1991, "tokens_per_device": 5192 }, { "epoch": 0.7964, "loss_ce": 0.01635744608938694, "loss_lvr": 0.48204872012138367, "loss_mode_switch": 0.0, "loss_total": 0.06456232070922852, "step": 1991 }, { "batch_size": 4, "epoch": 0.7964, "step": 1991, "tokens_per_device": 7760 }, { "epoch": 0.7964, "loss_ce": 0.2861739993095398, "loss_lvr": 0.8967795968055725, "loss_mode_switch": 0.0, "loss_total": 0.37585195899009705, "step": 1991 }, { "epoch": 0.7968, "grad_norm": 1.4621365070343018, "learning_rate": 1.0442674805730986e-06, "loss": 0.2835, "step": 1992 }, { "batch_size": 1, "epoch": 0.7968, "step": 1992, "tokens_per_device": 5161 }, { "epoch": 0.7968, "loss_ce": 0.07067963480949402, "loss_lvr": 0.5509078502655029, "loss_mode_switch": 0.0, "loss_total": 0.1257704198360443, "step": 1992 }, { "batch_size": 4, "epoch": 0.7968, "step": 1992, "tokens_per_device": 4268 }, { "epoch": 0.7968, "loss_ce": 0.047786395996809006, "loss_lvr": 0.7118130922317505, "loss_mode_switch": 0.0, "loss_total": 0.11896771192550659, "step": 1992 }, { "batch_size": 4, "epoch": 0.7968, "step": 1992, "tokens_per_device": 4860 }, { "epoch": 0.7968, "loss_ce": 0.400446355342865, "loss_lvr": 0.6174837350845337, "loss_mode_switch": 0.0, "loss_total": 0.4621947407722473, "step": 1992 }, { "batch_size": 1, "epoch": 0.7968, "step": 1992, "tokens_per_device": 5093 }, { "epoch": 0.7968, "loss_ce": 0.002922666724771261, "loss_lvr": 0.4515921175479889, "loss_mode_switch": 0.0, "loss_total": 0.04808187857270241, "step": 1992 }, { "batch_size": 4, "epoch": 0.7968, "step": 1992, "tokens_per_device": 1216 }, { "epoch": 0.7968, "loss_ce": 0.2742857038974762, "loss_lvr": 1.0369656085968018, "loss_mode_switch": 0.0, "loss_total": 0.3779822587966919, "step": 1992 }, { "batch_size": 4, "epoch": 0.7968, "step": 1992, "tokens_per_device": 2572 }, { "epoch": 0.7968, "loss_ce": 0.14028708636760712, "loss_lvr": 0.9959450960159302, "loss_mode_switch": 0.0, "loss_total": 0.23988160490989685, "step": 1992 }, { "batch_size": 1, "epoch": 0.7968, "step": 1992, "tokens_per_device": 4939 }, { "epoch": 0.7968, "loss_ce": 0.01720777526497841, "loss_lvr": 0.2956218123435974, "loss_mode_switch": 0.0, "loss_total": 0.04676995426416397, "step": 1992 }, { "batch_size": 4, "epoch": 0.7968, "step": 1992, "tokens_per_device": 5728 }, { "epoch": 0.7968, "loss_ce": 0.07640956342220306, "loss_lvr": 0.8966537714004517, "loss_mode_switch": 0.0, "loss_total": 0.1660749316215515, "step": 1992 }, { "epoch": 0.7972, "grad_norm": 1.1552187204360962, "learning_rate": 1.0403089827933482e-06, "loss": 0.2535, "step": 1993 }, { "batch_size": 4, "epoch": 0.7972, "step": 1993, "tokens_per_device": 4252 }, { "epoch": 0.7972, "loss_ce": 0.15217342972755432, "loss_lvr": 0.7253394722938538, "loss_mode_switch": 0.0, "loss_total": 0.22470737993717194, "step": 1993 }, { "batch_size": 1, "epoch": 0.7972, "step": 1993, "tokens_per_device": 5092 }, { "epoch": 0.7972, "loss_ce": 0.16161096096038818, "loss_lvr": 0.5081401467323303, "loss_mode_switch": 0.0, "loss_total": 0.21242497861385345, "step": 1993 }, { "batch_size": 4, "epoch": 0.7972, "step": 1993, "tokens_per_device": 3584 }, { "epoch": 0.7972, "loss_ce": 0.10236947983503342, "loss_lvr": 0.7234645485877991, "loss_mode_switch": 0.0, "loss_total": 0.17471593618392944, "step": 1993 }, { "batch_size": 4, "epoch": 0.7972, "step": 1993, "tokens_per_device": 5988 }, { "epoch": 0.7972, "loss_ce": 0.4305708110332489, "loss_lvr": 0.7976886034011841, "loss_mode_switch": 0.0, "loss_total": 0.5103396773338318, "step": 1993 }, { "batch_size": 4, "epoch": 0.7972, "step": 1993, "tokens_per_device": 3792 }, { "epoch": 0.7972, "loss_ce": 0.5572208166122437, "loss_lvr": 0.9316681027412415, "loss_mode_switch": 0.0, "loss_total": 0.6503876447677612, "step": 1993 }, { "batch_size": 4, "epoch": 0.7972, "step": 1993, "tokens_per_device": 1544 }, { "epoch": 0.7972, "loss_ce": 0.14464116096496582, "loss_lvr": 0.9455010890960693, "loss_mode_switch": 0.0, "loss_total": 0.23919126391410828, "step": 1993 }, { "batch_size": 1, "epoch": 0.7972, "step": 1993, "tokens_per_device": 4906 }, { "epoch": 0.7972, "loss_ce": 0.11107644438743591, "loss_lvr": 0.1931985318660736, "loss_mode_switch": 0.0, "loss_total": 0.1303962916135788, "step": 1993 }, { "batch_size": 1, "epoch": 0.7972, "step": 1993, "tokens_per_device": 5110 }, { "epoch": 0.7972, "loss_ce": 0.00037882456672377884, "loss_lvr": 0.3328063189983368, "loss_mode_switch": 0.0, "loss_total": 0.03365945816040039, "step": 1993 }, { "epoch": 0.7976, "grad_norm": 1.2463568449020386, "learning_rate": 1.0363571306640885e-06, "loss": 0.2676, "step": 1994 }, { "batch_size": 4, "epoch": 0.7976, "step": 1994, "tokens_per_device": 8156 }, { "epoch": 0.7976, "loss_ce": 0.7321877479553223, "loss_lvr": 0.6705697178840637, "loss_mode_switch": 0.0, "loss_total": 0.7992447018623352, "step": 1994 }, { "batch_size": 1, "epoch": 0.7976, "step": 1994, "tokens_per_device": 4755 }, { "epoch": 0.7976, "loss_ce": 0.013322276063263416, "loss_lvr": 0.3775702714920044, "loss_mode_switch": 0.0, "loss_total": 0.05107930302619934, "step": 1994 }, { "batch_size": 4, "epoch": 0.7976, "step": 1994, "tokens_per_device": 5224 }, { "epoch": 0.7976, "loss_ce": 0.06769862025976181, "loss_lvr": 0.7251738905906677, "loss_mode_switch": 0.0, "loss_total": 0.14021600782871246, "step": 1994 }, { "batch_size": 4, "epoch": 0.7976, "step": 1994, "tokens_per_device": 15244 }, { "epoch": 0.7976, "loss_ce": 1.2308772802352905, "loss_lvr": 0.8095576763153076, "loss_mode_switch": 0.0, "loss_total": 1.3118330240249634, "step": 1994 }, { "batch_size": 4, "epoch": 0.7976, "step": 1994, "tokens_per_device": 3948 }, { "epoch": 0.7976, "loss_ce": 0.07628170400857925, "loss_lvr": 0.8724526166915894, "loss_mode_switch": 0.0, "loss_total": 0.1635269671678543, "step": 1994 }, { "batch_size": 4, "epoch": 0.7976, "step": 1994, "tokens_per_device": 3860 }, { "epoch": 0.7976, "loss_ce": 0.13431613147258759, "loss_lvr": 0.8465794920921326, "loss_mode_switch": 0.0, "loss_total": 0.21897408366203308, "step": 1994 }, { "batch_size": 1, "epoch": 0.7976, "step": 1994, "tokens_per_device": 5126 }, { "epoch": 0.7976, "loss_ce": 0.023998787626624107, "loss_lvr": 0.22213907539844513, "loss_mode_switch": 0.0, "loss_total": 0.04621269553899765, "step": 1994 }, { "batch_size": 4, "epoch": 0.7976, "step": 1994, "tokens_per_device": 4632 }, { "epoch": 0.7976, "loss_ce": 0.2940748333930969, "loss_lvr": 0.9261731505393982, "loss_mode_switch": 0.0, "loss_total": 0.3866921663284302, "step": 1994 }, { "epoch": 0.798, "grad_norm": 1.583600401878357, "learning_rate": 1.0324119308178166e-06, "loss": 0.328, "step": 1995 }, { "batch_size": 1, "epoch": 0.798, "step": 1995, "tokens_per_device": 4884 }, { "epoch": 0.798, "loss_ce": 0.0004138491058256477, "loss_lvr": 0.1896078884601593, "loss_mode_switch": 0.0, "loss_total": 0.01937463879585266, "step": 1995 }, { "batch_size": 4, "epoch": 0.798, "step": 1995, "tokens_per_device": 4376 }, { "epoch": 0.798, "loss_ce": 0.4399188458919525, "loss_lvr": 0.8404939770698547, "loss_mode_switch": 0.0, "loss_total": 0.5239682197570801, "step": 1995 }, { "batch_size": 1, "epoch": 0.798, "step": 1995, "tokens_per_device": 4961 }, { "epoch": 0.798, "loss_ce": 0.11888949573040009, "loss_lvr": 0.2135116457939148, "loss_mode_switch": 0.0, "loss_total": 0.1402406543493271, "step": 1995 }, { "batch_size": 4, "epoch": 0.798, "step": 1995, "tokens_per_device": 4180 }, { "epoch": 0.798, "loss_ce": 0.3470383286476135, "loss_lvr": 0.9784964919090271, "loss_mode_switch": 0.0, "loss_total": 0.44488799571990967, "step": 1995 }, { "batch_size": 1, "epoch": 0.798, "step": 1995, "tokens_per_device": 4518 }, { "epoch": 0.798, "loss_ce": 0.0805061087012291, "loss_lvr": 0.30054885149002075, "loss_mode_switch": 0.0, "loss_total": 0.11056099832057953, "step": 1995 }, { "batch_size": 4, "epoch": 0.798, "step": 1995, "tokens_per_device": 2756 }, { "epoch": 0.798, "loss_ce": 0.3318925201892853, "loss_lvr": 0.7929782867431641, "loss_mode_switch": 0.0, "loss_total": 0.41119036078453064, "step": 1995 }, { "batch_size": 4, "epoch": 0.798, "step": 1995, "tokens_per_device": 3948 }, { "epoch": 0.798, "loss_ce": 0.052431121468544006, "loss_lvr": 0.8457668423652649, "loss_mode_switch": 0.0, "loss_total": 0.13700780272483826, "step": 1995 }, { "batch_size": 4, "epoch": 0.798, "step": 1995, "tokens_per_device": 1320 }, { "epoch": 0.798, "loss_ce": 0.22180134057998657, "loss_lvr": 1.1747517585754395, "loss_mode_switch": 0.0, "loss_total": 0.339276522397995, "step": 1995 }, { "epoch": 0.7984, "grad_norm": 1.1919665336608887, "learning_rate": 1.0284733898758587e-06, "loss": 0.2268, "step": 1996 }, { "batch_size": 4, "epoch": 0.7984, "step": 1996, "tokens_per_device": 2476 }, { "epoch": 0.7984, "loss_ce": 0.29531997442245483, "loss_lvr": 0.8283894062042236, "loss_mode_switch": 0.0, "loss_total": 0.37815892696380615, "step": 1996 }, { "batch_size": 4, "epoch": 0.7984, "step": 1996, "tokens_per_device": 6256 }, { "epoch": 0.7984, "loss_ce": 0.054320696741342545, "loss_lvr": 0.8801538944244385, "loss_mode_switch": 0.0, "loss_total": 0.14233608543872833, "step": 1996 }, { "batch_size": 4, "epoch": 0.7984, "step": 1996, "tokens_per_device": 1660 }, { "epoch": 0.7984, "loss_ce": 0.08822482079267502, "loss_lvr": 0.6926743984222412, "loss_mode_switch": 0.0, "loss_total": 0.1574922651052475, "step": 1996 }, { "batch_size": 1, "epoch": 0.7984, "step": 1996, "tokens_per_device": 4112 }, { "epoch": 0.7984, "loss_ce": 0.01105837244540453, "loss_lvr": 0.5256714820861816, "loss_mode_switch": 0.0, "loss_total": 0.0636255219578743, "step": 1996 }, { "batch_size": 4, "epoch": 0.7984, "step": 1996, "tokens_per_device": 1684 }, { "epoch": 0.7984, "loss_ce": 0.27059468626976013, "loss_lvr": 0.8560559749603271, "loss_mode_switch": 0.0, "loss_total": 0.35620027780532837, "step": 1996 }, { "batch_size": 4, "epoch": 0.7984, "step": 1996, "tokens_per_device": 5632 }, { "epoch": 0.7984, "loss_ce": 0.027245882898569107, "loss_lvr": 0.7623478770256042, "loss_mode_switch": 0.0, "loss_total": 0.10348066687583923, "step": 1996 }, { "batch_size": 4, "epoch": 0.7984, "step": 1996, "tokens_per_device": 4288 }, { "epoch": 0.7984, "loss_ce": 0.1616736501455307, "loss_lvr": 0.8361060619354248, "loss_mode_switch": 0.0, "loss_total": 0.24528425931930542, "step": 1996 }, { "batch_size": 4, "epoch": 0.7984, "step": 1996, "tokens_per_device": 4164 }, { "epoch": 0.7984, "loss_ce": 0.3211134672164917, "loss_lvr": 0.7925881743431091, "loss_mode_switch": 0.0, "loss_total": 0.40037229657173157, "step": 1996 }, { "epoch": 0.7988, "grad_norm": 1.2501442432403564, "learning_rate": 1.0245415144483722e-06, "loss": 0.2795, "step": 1997 }, { "batch_size": 1, "epoch": 0.7988, "step": 1997, "tokens_per_device": 5053 }, { "epoch": 0.7988, "loss_ce": 0.0032184994779527187, "loss_lvr": 0.25672194361686707, "loss_mode_switch": 0.0, "loss_total": 0.028890695422887802, "step": 1997 }, { "batch_size": 4, "epoch": 0.7988, "step": 1997, "tokens_per_device": 4076 }, { "epoch": 0.7988, "loss_ce": 0.3425290882587433, "loss_lvr": 1.0175549983978271, "loss_mode_switch": 0.0, "loss_total": 0.444284588098526, "step": 1997 }, { "batch_size": 4, "epoch": 0.7988, "step": 1997, "tokens_per_device": 3820 }, { "epoch": 0.7988, "loss_ce": 0.23585733771324158, "loss_lvr": 0.9305812120437622, "loss_mode_switch": 0.0, "loss_total": 0.32891547679901123, "step": 1997 }, { "batch_size": 1, "epoch": 0.7988, "step": 1997, "tokens_per_device": 5105 }, { "epoch": 0.7988, "loss_ce": 0.0211458932608366, "loss_lvr": 0.3158184587955475, "loss_mode_switch": 0.0, "loss_total": 0.05272773653268814, "step": 1997 }, { "batch_size": 4, "epoch": 0.7988, "step": 1997, "tokens_per_device": 4908 }, { "epoch": 0.7988, "loss_ce": 0.009678025729954243, "loss_lvr": 0.7777191996574402, "loss_mode_switch": 0.0, "loss_total": 0.08744995296001434, "step": 1997 }, { "batch_size": 1, "epoch": 0.7988, "step": 1997, "tokens_per_device": 5293 }, { "epoch": 0.7988, "loss_ce": 0.24380400776863098, "loss_lvr": 0.3916255533695221, "loss_mode_switch": 0.0, "loss_total": 0.2829665541648865, "step": 1997 }, { "batch_size": 4, "epoch": 0.7988, "step": 1997, "tokens_per_device": 1240 }, { "epoch": 0.7988, "loss_ce": 0.11610767245292664, "loss_lvr": 0.9498728513717651, "loss_mode_switch": 0.0, "loss_total": 0.2110949605703354, "step": 1997 }, { "batch_size": 1, "epoch": 0.7988, "step": 1997, "tokens_per_device": 4939 }, { "epoch": 0.7988, "loss_ce": 0.07223385572433472, "loss_lvr": 0.4700848460197449, "loss_mode_switch": 0.0, "loss_total": 0.1192423403263092, "step": 1997 }, { "epoch": 0.7992, "grad_norm": 1.4220739603042603, "learning_rate": 1.020616311134321e-06, "loss": 0.267, "step": 1998 }, { "batch_size": 4, "epoch": 0.7992, "step": 1998, "tokens_per_device": 11248 }, { "epoch": 0.7992, "loss_ce": 0.3704049289226532, "loss_lvr": 0.8586700558662415, "loss_mode_switch": 0.0, "loss_total": 0.4562719464302063, "step": 1998 }, { "batch_size": 4, "epoch": 0.7992, "step": 1998, "tokens_per_device": 4096 }, { "epoch": 0.7992, "loss_ce": 0.3638797700405121, "loss_lvr": 1.079105257987976, "loss_mode_switch": 0.0, "loss_total": 0.4717903137207031, "step": 1998 }, { "batch_size": 4, "epoch": 0.7992, "step": 1998, "tokens_per_device": 6012 }, { "epoch": 0.7992, "loss_ce": 0.11909376084804535, "loss_lvr": 0.8815429210662842, "loss_mode_switch": 0.0, "loss_total": 0.20724806189537048, "step": 1998 }, { "batch_size": 1, "epoch": 0.7992, "step": 1998, "tokens_per_device": 5092 }, { "epoch": 0.7992, "loss_ce": 0.07180686295032501, "loss_lvr": 0.6514021158218384, "loss_mode_switch": 0.0, "loss_total": 0.13694706559181213, "step": 1998 }, { "batch_size": 4, "epoch": 0.7992, "step": 1998, "tokens_per_device": 9668 }, { "epoch": 0.7992, "loss_ce": 0.24279679358005524, "loss_lvr": 0.34831318259239197, "loss_mode_switch": 0.0, "loss_total": 0.2776281237602234, "step": 1998 }, { "batch_size": 4, "epoch": 0.7992, "step": 1998, "tokens_per_device": 13044 }, { "epoch": 0.7992, "loss_ce": 0.09062391519546509, "loss_lvr": 0.5361993908882141, "loss_mode_switch": 0.0, "loss_total": 0.14424385130405426, "step": 1998 }, { "batch_size": 4, "epoch": 0.7992, "step": 1998, "tokens_per_device": 4276 }, { "epoch": 0.7992, "loss_ce": 0.3411300778388977, "loss_lvr": 0.8733788132667542, "loss_mode_switch": 0.0, "loss_total": 0.4284679591655731, "step": 1998 }, { "batch_size": 4, "epoch": 0.7992, "step": 1998, "tokens_per_device": 1524 }, { "epoch": 0.7992, "loss_ce": 0.16925938427448273, "loss_lvr": 1.666672945022583, "loss_mode_switch": 0.0, "loss_total": 0.33592668175697327, "step": 1998 }, { "epoch": 0.7996, "grad_norm": 1.2659856081008911, "learning_rate": 1.016697786521476e-06, "loss": 0.2723, "step": 1999 }, { "batch_size": 4, "epoch": 0.7996, "step": 1999, "tokens_per_device": 3808 }, { "epoch": 0.7996, "loss_ce": 0.17614881694316864, "loss_lvr": 2.132559061050415, "loss_mode_switch": 0.0, "loss_total": 0.3894047141075134, "step": 1999 }, { "batch_size": 4, "epoch": 0.7996, "step": 1999, "tokens_per_device": 4496 }, { "epoch": 0.7996, "loss_ce": 0.2998771071434021, "loss_lvr": 0.7142946124076843, "loss_mode_switch": 0.0, "loss_total": 0.37130656838417053, "step": 1999 }, { "batch_size": 1, "epoch": 0.7996, "step": 1999, "tokens_per_device": 4893 }, { "epoch": 0.7996, "loss_ce": 0.00019477863679639995, "loss_lvr": 0.18402346968650818, "loss_mode_switch": 0.0, "loss_total": 0.018597126007080078, "step": 1999 }, { "batch_size": 1, "epoch": 0.7996, "step": 1999, "tokens_per_device": 5102 }, { "epoch": 0.7996, "loss_ce": 0.04670742154121399, "loss_lvr": 0.2802172899246216, "loss_mode_switch": 0.0, "loss_total": 0.07472915202379227, "step": 1999 }, { "batch_size": 4, "epoch": 0.7996, "step": 1999, "tokens_per_device": 2664 }, { "epoch": 0.7996, "loss_ce": 0.6265171766281128, "loss_lvr": 0.6801568269729614, "loss_mode_switch": 0.0, "loss_total": 0.6945328712463379, "step": 1999 }, { "batch_size": 1, "epoch": 0.7996, "step": 1999, "tokens_per_device": 5172 }, { "epoch": 0.7996, "loss_ce": 0.001818834338337183, "loss_lvr": 0.401142954826355, "loss_mode_switch": 0.0, "loss_total": 0.04193313047289848, "step": 1999 }, { "batch_size": 4, "epoch": 0.7996, "step": 1999, "tokens_per_device": 1392 }, { "epoch": 0.7996, "loss_ce": 0.47326773405075073, "loss_lvr": 1.032288670539856, "loss_mode_switch": 0.0, "loss_total": 0.5764966011047363, "step": 1999 }, { "batch_size": 4, "epoch": 0.7996, "step": 1999, "tokens_per_device": 4132 }, { "epoch": 0.7996, "loss_ce": 0.1635228991508484, "loss_lvr": 0.8515544533729553, "loss_mode_switch": 0.0, "loss_total": 0.24867835640907288, "step": 1999 }, { "epoch": 0.8, "grad_norm": 1.2808939218521118, "learning_rate": 1.012785947186397e-06, "loss": 0.27, "step": 2000 }, { "batch_size": 4, "epoch": 0.8, "step": 2000, "tokens_per_device": 4248 }, { "epoch": 0.8, "loss_ce": 0.42982301115989685, "loss_lvr": 0.6674610376358032, "loss_mode_switch": 0.0, "loss_total": 0.49656912684440613, "step": 2000 }, { "batch_size": 4, "epoch": 0.8, "step": 2000, "tokens_per_device": 4252 }, { "epoch": 0.8, "loss_ce": 0.026653053238987923, "loss_lvr": 0.8036898970603943, "loss_mode_switch": 0.0, "loss_total": 0.10702203959226608, "step": 2000 }, { "batch_size": 4, "epoch": 0.8, "step": 2000, "tokens_per_device": 5592 }, { "epoch": 0.8, "loss_ce": 0.12413187325000763, "loss_lvr": 0.667390763759613, "loss_mode_switch": 0.0, "loss_total": 0.19087094068527222, "step": 2000 }, { "batch_size": 4, "epoch": 0.8, "step": 2000, "tokens_per_device": 6708 }, { "epoch": 0.8, "loss_ce": 0.19300158321857452, "loss_lvr": 0.8216330409049988, "loss_mode_switch": 0.0, "loss_total": 0.2751649022102356, "step": 2000 }, { "batch_size": 4, "epoch": 0.8, "step": 2000, "tokens_per_device": 6980 }, { "epoch": 0.8, "loss_ce": 0.2213721126317978, "loss_lvr": 0.6326910853385925, "loss_mode_switch": 0.0, "loss_total": 0.28464120626449585, "step": 2000 }, { "batch_size": 4, "epoch": 0.8, "step": 2000, "tokens_per_device": 2636 }, { "epoch": 0.8, "loss_ce": 0.030743995681405067, "loss_lvr": 0.8641656041145325, "loss_mode_switch": 0.0, "loss_total": 0.11716055870056152, "step": 2000 }, { "batch_size": 4, "epoch": 0.8, "step": 2000, "tokens_per_device": 3808 }, { "epoch": 0.8, "loss_ce": 0.011583933606743813, "loss_lvr": 0.785714864730835, "loss_mode_switch": 0.0, "loss_total": 0.09015542268753052, "step": 2000 }, { "batch_size": 1, "epoch": 0.8, "step": 2000, "tokens_per_device": 6209 }, { "epoch": 0.8, "loss_ce": 0.00770181231200695, "loss_lvr": 0.19708718359470367, "loss_mode_switch": 0.0, "loss_total": 0.027410531416535378, "step": 2000 }, { "epoch": 0.8004, "grad_norm": 1.3179388046264648, "learning_rate": 1.008880799694421e-06, "loss": 0.2709, "step": 2001 }, { "batch_size": 4, "epoch": 0.8004, "step": 2001, "tokens_per_device": 4628 }, { "epoch": 0.8004, "loss_ce": 0.28108540177345276, "loss_lvr": 0.9260432720184326, "loss_mode_switch": 0.0, "loss_total": 0.373689740896225, "step": 2001 }, { "batch_size": 4, "epoch": 0.8004, "step": 2001, "tokens_per_device": 4264 }, { "epoch": 0.8004, "loss_ce": 0.05210591480135918, "loss_lvr": 0.9798558950424194, "loss_mode_switch": 0.0, "loss_total": 0.1500914990901947, "step": 2001 }, { "batch_size": 4, "epoch": 0.8004, "step": 2001, "tokens_per_device": 1324 }, { "epoch": 0.8004, "loss_ce": 0.10728263854980469, "loss_lvr": 0.8931018114089966, "loss_mode_switch": 0.0, "loss_total": 0.19659282267093658, "step": 2001 }, { "batch_size": 4, "epoch": 0.8004, "step": 2001, "tokens_per_device": 3800 }, { "epoch": 0.8004, "loss_ce": 0.3525638282299042, "loss_lvr": 0.809653103351593, "loss_mode_switch": 0.0, "loss_total": 0.4335291385650635, "step": 2001 }, { "batch_size": 1, "epoch": 0.8004, "step": 2001, "tokens_per_device": 5031 }, { "epoch": 0.8004, "loss_ce": 0.0010641015833243728, "loss_lvr": 0.3015504777431488, "loss_mode_switch": 0.0, "loss_total": 0.031219149008393288, "step": 2001 }, { "batch_size": 1, "epoch": 0.8004, "step": 2001, "tokens_per_device": 4901 }, { "epoch": 0.8004, "loss_ce": 0.013137441128492355, "loss_lvr": 0.38847556710243225, "loss_mode_switch": 0.0, "loss_total": 0.0519849993288517, "step": 2001 }, { "batch_size": 1, "epoch": 0.8004, "step": 2001, "tokens_per_device": 5026 }, { "epoch": 0.8004, "loss_ce": 1.565317153930664, "loss_lvr": 0.4849080443382263, "loss_mode_switch": 0.0, "loss_total": 1.6138079166412354, "step": 2001 }, { "batch_size": 4, "epoch": 0.8004, "step": 2001, "tokens_per_device": 3760 }, { "epoch": 0.8004, "loss_ce": 0.4292544424533844, "loss_lvr": 1.1229056119918823, "loss_mode_switch": 0.0, "loss_total": 0.541545033454895, "step": 2001 }, { "epoch": 0.8008, "grad_norm": 1.4843966960906982, "learning_rate": 1.0049823505996608e-06, "loss": 0.31, "step": 2002 }, { "batch_size": 4, "epoch": 0.8008, "step": 2002, "tokens_per_device": 9024 }, { "epoch": 0.8008, "loss_ce": 0.4387039244174957, "loss_lvr": 0.7759389877319336, "loss_mode_switch": 0.0, "loss_total": 0.5162978172302246, "step": 2002 }, { "batch_size": 4, "epoch": 0.8008, "step": 2002, "tokens_per_device": 3752 }, { "epoch": 0.8008, "loss_ce": 0.09672217816114426, "loss_lvr": 0.9049472212791443, "loss_mode_switch": 0.0, "loss_total": 0.18721690773963928, "step": 2002 }, { "batch_size": 4, "epoch": 0.8008, "step": 2002, "tokens_per_device": 5668 }, { "epoch": 0.8008, "loss_ce": 0.1303151249885559, "loss_lvr": 0.7013731598854065, "loss_mode_switch": 0.0, "loss_total": 0.20045244693756104, "step": 2002 }, { "batch_size": 4, "epoch": 0.8008, "step": 2002, "tokens_per_device": 5736 }, { "epoch": 0.8008, "loss_ce": 0.7642318606376648, "loss_lvr": 0.7676144242286682, "loss_mode_switch": 0.0, "loss_total": 0.8409932851791382, "step": 2002 }, { "batch_size": 4, "epoch": 0.8008, "step": 2002, "tokens_per_device": 4624 }, { "epoch": 0.8008, "loss_ce": 0.04894125834107399, "loss_lvr": 0.72712641954422, "loss_mode_switch": 0.0, "loss_total": 0.12165389955043793, "step": 2002 }, { "batch_size": 1, "epoch": 0.8008, "step": 2002, "tokens_per_device": 4993 }, { "epoch": 0.8008, "loss_ce": 0.0004968246212229133, "loss_lvr": 0.28838279843330383, "loss_mode_switch": 0.0, "loss_total": 0.029335105791687965, "step": 2002 }, { "batch_size": 4, "epoch": 0.8008, "step": 2002, "tokens_per_device": 3724 }, { "epoch": 0.8008, "loss_ce": 0.324692040681839, "loss_lvr": 0.9139445424079895, "loss_mode_switch": 0.0, "loss_total": 0.41608649492263794, "step": 2002 }, { "batch_size": 4, "epoch": 0.8008, "step": 2002, "tokens_per_device": 14788 }, { "epoch": 0.8008, "loss_ce": 0.18409843742847443, "loss_lvr": 0.6674405336380005, "loss_mode_switch": 0.0, "loss_total": 0.25084248185157776, "step": 2002 }, { "epoch": 0.8012, "grad_norm": 1.143492341041565, "learning_rate": 1.00109060644498e-06, "loss": 0.2679, "step": 2003 }, { "batch_size": 1, "epoch": 0.8012, "step": 2003, "tokens_per_device": 4863 }, { "epoch": 0.8012, "loss_ce": 0.001946700969710946, "loss_lvr": 0.18141891062259674, "loss_mode_switch": 0.0, "loss_total": 0.020088592544198036, "step": 2003 }, { "batch_size": 4, "epoch": 0.8012, "step": 2003, "tokens_per_device": 1252 }, { "epoch": 0.8012, "loss_ce": 0.18231351673603058, "loss_lvr": 0.9859662055969238, "loss_mode_switch": 0.0, "loss_total": 0.2809101343154907, "step": 2003 }, { "batch_size": 4, "epoch": 0.8012, "step": 2003, "tokens_per_device": 4404 }, { "epoch": 0.8012, "loss_ce": 0.6321581602096558, "loss_lvr": 0.7517536878585815, "loss_mode_switch": 0.0, "loss_total": 0.707333505153656, "step": 2003 }, { "batch_size": 4, "epoch": 0.8012, "step": 2003, "tokens_per_device": 4208 }, { "epoch": 0.8012, "loss_ce": 0.20731987059116364, "loss_lvr": 1.064559817314148, "loss_mode_switch": 0.0, "loss_total": 0.31377583742141724, "step": 2003 }, { "batch_size": 1, "epoch": 0.8012, "step": 2003, "tokens_per_device": 4896 }, { "epoch": 0.8012, "loss_ce": 0.005385294556617737, "loss_lvr": 0.6197375655174255, "loss_mode_switch": 0.0, "loss_total": 0.06735905259847641, "step": 2003 }, { "batch_size": 4, "epoch": 0.8012, "step": 2003, "tokens_per_device": 10696 }, { "epoch": 0.8012, "loss_ce": 0.16381914913654327, "loss_lvr": 0.9915074706077576, "loss_mode_switch": 0.0, "loss_total": 0.2629699110984802, "step": 2003 }, { "batch_size": 4, "epoch": 0.8012, "step": 2003, "tokens_per_device": 4204 }, { "epoch": 0.8012, "loss_ce": 0.10328510403633118, "loss_lvr": 1.7207003831863403, "loss_mode_switch": 0.0, "loss_total": 0.27535516023635864, "step": 2003 }, { "batch_size": 4, "epoch": 0.8012, "step": 2003, "tokens_per_device": 7568 }, { "epoch": 0.8012, "loss_ce": 0.1152825579047203, "loss_lvr": 0.8369413018226624, "loss_mode_switch": 0.0, "loss_total": 0.19897669553756714, "step": 2003 }, { "epoch": 0.8016, "grad_norm": 1.295754075050354, "learning_rate": 9.972055737619935e-07, "loss": 0.3098, "step": 2004 }, { "batch_size": 1, "epoch": 0.8016, "step": 2004, "tokens_per_device": 4902 }, { "epoch": 0.8016, "loss_ce": 0.012038666754961014, "loss_lvr": 0.9135596752166748, "loss_mode_switch": 0.0, "loss_total": 0.10339464247226715, "step": 2004 }, { "batch_size": 4, "epoch": 0.8016, "step": 2004, "tokens_per_device": 2012 }, { "epoch": 0.8016, "loss_ce": 0.03558356687426567, "loss_lvr": 0.7951799035072327, "loss_mode_switch": 0.0, "loss_total": 0.11510156095027924, "step": 2004 }, { "batch_size": 1, "epoch": 0.8016, "step": 2004, "tokens_per_device": 5176 }, { "epoch": 0.8016, "loss_ce": 0.012038602493703365, "loss_lvr": 0.4492264986038208, "loss_mode_switch": 0.0, "loss_total": 0.05696125328540802, "step": 2004 }, { "batch_size": 4, "epoch": 0.8016, "step": 2004, "tokens_per_device": 1928 }, { "epoch": 0.8016, "loss_ce": 0.19324953854084015, "loss_lvr": 1.2336019277572632, "loss_mode_switch": 0.0, "loss_total": 0.3166097402572632, "step": 2004 }, { "batch_size": 4, "epoch": 0.8016, "step": 2004, "tokens_per_device": 6452 }, { "epoch": 0.8016, "loss_ce": 0.14846523106098175, "loss_lvr": 0.53965824842453, "loss_mode_switch": 0.0, "loss_total": 0.20243105292320251, "step": 2004 }, { "batch_size": 4, "epoch": 0.8016, "step": 2004, "tokens_per_device": 6592 }, { "epoch": 0.8016, "loss_ce": 0.02261398173868656, "loss_lvr": 0.7582408785820007, "loss_mode_switch": 0.0, "loss_total": 0.0984380692243576, "step": 2004 }, { "batch_size": 4, "epoch": 0.8016, "step": 2004, "tokens_per_device": 4324 }, { "epoch": 0.8016, "loss_ce": 0.09314969927072525, "loss_lvr": 0.6425087451934814, "loss_mode_switch": 0.0, "loss_total": 0.15740057826042175, "step": 2004 }, { "batch_size": 4, "epoch": 0.8016, "step": 2004, "tokens_per_device": 6008 }, { "epoch": 0.8016, "loss_ce": 0.10469616949558258, "loss_lvr": 0.6126720905303955, "loss_mode_switch": 0.0, "loss_total": 0.16596338152885437, "step": 2004 }, { "epoch": 0.802, "grad_norm": 1.2443233728408813, "learning_rate": 9.933272590710508e-07, "loss": 0.2479, "step": 2005 }, { "batch_size": 4, "epoch": 0.802, "step": 2005, "tokens_per_device": 1404 }, { "epoch": 0.802, "loss_ce": 0.20166009664535522, "loss_lvr": 1.0775578022003174, "loss_mode_switch": 0.0, "loss_total": 0.30941587686538696, "step": 2005 }, { "batch_size": 1, "epoch": 0.802, "step": 2005, "tokens_per_device": 5483 }, { "epoch": 0.802, "loss_ce": 0.4336504638195038, "loss_lvr": 0.5446091294288635, "loss_mode_switch": 0.0, "loss_total": 0.48811137676239014, "step": 2005 }, { "batch_size": 4, "epoch": 0.802, "step": 2005, "tokens_per_device": 1176 }, { "epoch": 0.802, "loss_ce": 0.42431819438934326, "loss_lvr": 1.895591139793396, "loss_mode_switch": 0.0, "loss_total": 0.6138772964477539, "step": 2005 }, { "batch_size": 1, "epoch": 0.802, "step": 2005, "tokens_per_device": 5154 }, { "epoch": 0.802, "loss_ce": 0.15137875080108643, "loss_lvr": 0.44339513778686523, "loss_mode_switch": 0.0, "loss_total": 0.19571825861930847, "step": 2005 }, { "batch_size": 4, "epoch": 0.802, "step": 2005, "tokens_per_device": 1544 }, { "epoch": 0.802, "loss_ce": 0.7351984977722168, "loss_lvr": 0.9163063764572144, "loss_mode_switch": 0.0, "loss_total": 0.8268291354179382, "step": 2005 }, { "batch_size": 1, "epoch": 0.802, "step": 2005, "tokens_per_device": 4887 }, { "epoch": 0.802, "loss_ce": 0.08982517570257187, "loss_lvr": 0.3155515491962433, "loss_mode_switch": 0.0, "loss_total": 0.12138032913208008, "step": 2005 }, { "batch_size": 1, "epoch": 0.802, "step": 2005, "tokens_per_device": 5149 }, { "epoch": 0.802, "loss_ce": 0.19796380400657654, "loss_lvr": 0.5157383680343628, "loss_mode_switch": 0.0, "loss_total": 0.2495376467704773, "step": 2005 }, { "batch_size": 1, "epoch": 0.802, "step": 2005, "tokens_per_device": 4917 }, { "epoch": 0.802, "loss_ce": 0.029258212074637413, "loss_lvr": 0.3982153534889221, "loss_mode_switch": 0.0, "loss_total": 0.06907974928617477, "step": 2005 }, { "epoch": 0.8024, "grad_norm": 1.5574610233306885, "learning_rate": 9.89455668881225e-07, "loss": 0.335, "step": 2006 }, { "batch_size": 4, "epoch": 0.8024, "step": 2006, "tokens_per_device": 5780 }, { "epoch": 0.8024, "loss_ce": 0.30016136169433594, "loss_lvr": 0.7097100615501404, "loss_mode_switch": 0.0, "loss_total": 0.37113237380981445, "step": 2006 }, { "batch_size": 1, "epoch": 0.8024, "step": 2006, "tokens_per_device": 5105 }, { "epoch": 0.8024, "loss_ce": 0.7060654759407043, "loss_lvr": 0.6138105988502502, "loss_mode_switch": 0.0, "loss_total": 0.7674465179443359, "step": 2006 }, { "batch_size": 1, "epoch": 0.8024, "step": 2006, "tokens_per_device": 4888 }, { "epoch": 0.8024, "loss_ce": 0.00796505156904459, "loss_lvr": 0.3999446928501129, "loss_mode_switch": 0.0, "loss_total": 0.047959521412849426, "step": 2006 }, { "batch_size": 4, "epoch": 0.8024, "step": 2006, "tokens_per_device": 2572 }, { "epoch": 0.8024, "loss_ce": 0.09009286761283875, "loss_lvr": 0.8177953362464905, "loss_mode_switch": 0.0, "loss_total": 0.17187240719795227, "step": 2006 }, { "batch_size": 4, "epoch": 0.8024, "step": 2006, "tokens_per_device": 5244 }, { "epoch": 0.8024, "loss_ce": 0.0005320991622284055, "loss_lvr": 0.9369680881500244, "loss_mode_switch": 0.0, "loss_total": 0.09422890841960907, "step": 2006 }, { "batch_size": 1, "epoch": 0.8024, "step": 2006, "tokens_per_device": 5140 }, { "epoch": 0.8024, "loss_ce": 0.029147453606128693, "loss_lvr": 0.4866476356983185, "loss_mode_switch": 0.0, "loss_total": 0.07781221717596054, "step": 2006 }, { "batch_size": 1, "epoch": 0.8024, "step": 2006, "tokens_per_device": 5244 }, { "epoch": 0.8024, "loss_ce": 0.00114831340033561, "loss_lvr": 0.27860110998153687, "loss_mode_switch": 0.0, "loss_total": 0.029008423909544945, "step": 2006 }, { "batch_size": 4, "epoch": 0.8024, "step": 2006, "tokens_per_device": 5460 }, { "epoch": 0.8024, "loss_ce": 0.20252472162246704, "loss_lvr": 0.8497390151023865, "loss_mode_switch": 0.0, "loss_total": 0.2874986231327057, "step": 2006 }, { "epoch": 0.8028, "grad_norm": 1.293031096458435, "learning_rate": 9.855908096903055e-07, "loss": 0.2735, "step": 2007 }, { "batch_size": 4, "epoch": 0.8028, "step": 2007, "tokens_per_device": 2544 }, { "epoch": 0.8028, "loss_ce": 0.12481202185153961, "loss_lvr": 0.739536702632904, "loss_mode_switch": 0.0, "loss_total": 0.19876569509506226, "step": 2007 }, { "batch_size": 4, "epoch": 0.8028, "step": 2007, "tokens_per_device": 5924 }, { "epoch": 0.8028, "loss_ce": 0.14716319739818573, "loss_lvr": 0.7895510196685791, "loss_mode_switch": 0.0, "loss_total": 0.2261182963848114, "step": 2007 }, { "batch_size": 4, "epoch": 0.8028, "step": 2007, "tokens_per_device": 4196 }, { "epoch": 0.8028, "loss_ce": 0.10126715898513794, "loss_lvr": 0.9023925065994263, "loss_mode_switch": 0.0, "loss_total": 0.19150641560554504, "step": 2007 }, { "batch_size": 4, "epoch": 0.8028, "step": 2007, "tokens_per_device": 2056 }, { "epoch": 0.8028, "loss_ce": 0.5493577718734741, "loss_lvr": 1.132380485534668, "loss_mode_switch": 0.0, "loss_total": 0.662595808506012, "step": 2007 }, { "batch_size": 4, "epoch": 0.8028, "step": 2007, "tokens_per_device": 2476 }, { "epoch": 0.8028, "loss_ce": 0.44595006108283997, "loss_lvr": 0.773256242275238, "loss_mode_switch": 0.0, "loss_total": 0.5232756733894348, "step": 2007 }, { "batch_size": 4, "epoch": 0.8028, "step": 2007, "tokens_per_device": 4852 }, { "epoch": 0.8028, "loss_ce": 0.3419220745563507, "loss_lvr": 0.7196146249771118, "loss_mode_switch": 0.0, "loss_total": 0.4138835370540619, "step": 2007 }, { "batch_size": 4, "epoch": 0.8028, "step": 2007, "tokens_per_device": 4412 }, { "epoch": 0.8028, "loss_ce": 0.04531930387020111, "loss_lvr": 1.0283551216125488, "loss_mode_switch": 0.0, "loss_total": 0.1481548249721527, "step": 2007 }, { "batch_size": 4, "epoch": 0.8028, "step": 2007, "tokens_per_device": 4416 }, { "epoch": 0.8028, "loss_ce": 0.4258815348148346, "loss_lvr": 1.5217758417129517, "loss_mode_switch": 0.0, "loss_total": 0.5780591368675232, "step": 2007 }, { "epoch": 0.8032, "grad_norm": 1.373408317565918, "learning_rate": 9.81732687984786e-07, "loss": 0.332, "step": 2008 }, { "batch_size": 4, "epoch": 0.8032, "step": 2008, "tokens_per_device": 4328 }, { "epoch": 0.8032, "loss_ce": 0.1313391774892807, "loss_lvr": 0.5117478370666504, "loss_mode_switch": 0.0, "loss_total": 0.18251396715641022, "step": 2008 }, { "batch_size": 4, "epoch": 0.8032, "step": 2008, "tokens_per_device": 3888 }, { "epoch": 0.8032, "loss_ce": 0.5295540690422058, "loss_lvr": 1.0542577505111694, "loss_mode_switch": 0.0, "loss_total": 0.6349798440933228, "step": 2008 }, { "batch_size": 4, "epoch": 0.8032, "step": 2008, "tokens_per_device": 4460 }, { "epoch": 0.8032, "loss_ce": 0.2621525526046753, "loss_lvr": 0.9570909142494202, "loss_mode_switch": 0.0, "loss_total": 0.35786163806915283, "step": 2008 }, { "batch_size": 1, "epoch": 0.8032, "step": 2008, "tokens_per_device": 4883 }, { "epoch": 0.8032, "loss_ce": 0.27437660098075867, "loss_lvr": 0.2678316831588745, "loss_mode_switch": 0.0, "loss_total": 0.3011597692966461, "step": 2008 }, { "batch_size": 1, "epoch": 0.8032, "step": 2008, "tokens_per_device": 4892 }, { "epoch": 0.8032, "loss_ce": 0.12662874162197113, "loss_lvr": 0.9105193614959717, "loss_mode_switch": 0.0, "loss_total": 0.2176806777715683, "step": 2008 }, { "batch_size": 1, "epoch": 0.8032, "step": 2008, "tokens_per_device": 5101 }, { "epoch": 0.8032, "loss_ce": 0.00018543450278230011, "loss_lvr": 0.34066760540008545, "loss_mode_switch": 0.0, "loss_total": 0.03425219655036926, "step": 2008 }, { "batch_size": 4, "epoch": 0.8032, "step": 2008, "tokens_per_device": 1524 }, { "epoch": 0.8032, "loss_ce": 0.4402012526988983, "loss_lvr": 0.7814330458641052, "loss_mode_switch": 0.0, "loss_total": 0.5183445811271667, "step": 2008 }, { "batch_size": 4, "epoch": 0.8032, "step": 2008, "tokens_per_device": 1384 }, { "epoch": 0.8032, "loss_ce": 0.5067851543426514, "loss_lvr": 0.9773566126823425, "loss_mode_switch": 0.0, "loss_total": 0.6045207977294922, "step": 2008 }, { "epoch": 0.8036, "grad_norm": 1.2555001974105835, "learning_rate": 9.778813102398494e-07, "loss": 0.284, "step": 2009 }, { "batch_size": 4, "epoch": 0.8036, "step": 2009, "tokens_per_device": 4816 }, { "epoch": 0.8036, "loss_ce": 0.08160652220249176, "loss_lvr": 0.727517306804657, "loss_mode_switch": 0.0, "loss_total": 0.15435825288295746, "step": 2009 }, { "batch_size": 4, "epoch": 0.8036, "step": 2009, "tokens_per_device": 3820 }, { "epoch": 0.8036, "loss_ce": 0.36236175894737244, "loss_lvr": 0.8173173666000366, "loss_mode_switch": 0.0, "loss_total": 0.4440934956073761, "step": 2009 }, { "batch_size": 4, "epoch": 0.8036, "step": 2009, "tokens_per_device": 4188 }, { "epoch": 0.8036, "loss_ce": 0.4301077127456665, "loss_lvr": 0.8686233162879944, "loss_mode_switch": 0.0, "loss_total": 0.5169700384140015, "step": 2009 }, { "batch_size": 4, "epoch": 0.8036, "step": 2009, "tokens_per_device": 1192 }, { "epoch": 0.8036, "loss_ce": 0.12777501344680786, "loss_lvr": 1.0076184272766113, "loss_mode_switch": 0.0, "loss_total": 0.22853685915470123, "step": 2009 }, { "batch_size": 4, "epoch": 0.8036, "step": 2009, "tokens_per_device": 6412 }, { "epoch": 0.8036, "loss_ce": 0.22214293479919434, "loss_lvr": 0.798470139503479, "loss_mode_switch": 0.0, "loss_total": 0.30198994278907776, "step": 2009 }, { "batch_size": 4, "epoch": 0.8036, "step": 2009, "tokens_per_device": 4612 }, { "epoch": 0.8036, "loss_ce": 0.06454402208328247, "loss_lvr": 0.7575576305389404, "loss_mode_switch": 0.0, "loss_total": 0.14029979705810547, "step": 2009 }, { "batch_size": 4, "epoch": 0.8036, "step": 2009, "tokens_per_device": 4564 }, { "epoch": 0.8036, "loss_ce": 0.5380169749259949, "loss_lvr": 1.0150256156921387, "loss_mode_switch": 0.0, "loss_total": 0.6395195126533508, "step": 2009 }, { "batch_size": 4, "epoch": 0.8036, "step": 2009, "tokens_per_device": 2692 }, { "epoch": 0.8036, "loss_ce": 0.7816072702407837, "loss_lvr": 0.852063000202179, "loss_mode_switch": 0.0, "loss_total": 0.8668135404586792, "step": 2009 }, { "epoch": 0.804, "grad_norm": 1.4785065650939941, "learning_rate": 9.740366829193587e-07, "loss": 0.3128, "step": 2010 }, { "batch_size": 4, "epoch": 0.804, "step": 2010, "tokens_per_device": 2648 }, { "epoch": 0.804, "loss_ce": 0.492000550031662, "loss_lvr": 0.7388022541999817, "loss_mode_switch": 0.0, "loss_total": 0.5658807754516602, "step": 2010 }, { "batch_size": 4, "epoch": 0.804, "step": 2010, "tokens_per_device": 2632 }, { "epoch": 0.804, "loss_ce": 0.16025546193122864, "loss_lvr": 1.581228256225586, "loss_mode_switch": 0.0, "loss_total": 0.3183782696723938, "step": 2010 }, { "batch_size": 4, "epoch": 0.804, "step": 2010, "tokens_per_device": 4260 }, { "epoch": 0.804, "loss_ce": 0.13005034625530243, "loss_lvr": 0.9141320586204529, "loss_mode_switch": 0.0, "loss_total": 0.22146356105804443, "step": 2010 }, { "batch_size": 4, "epoch": 0.804, "step": 2010, "tokens_per_device": 4236 }, { "epoch": 0.804, "loss_ce": 0.22693631052970886, "loss_lvr": 0.8889989852905273, "loss_mode_switch": 0.0, "loss_total": 0.31583622097969055, "step": 2010 }, { "batch_size": 4, "epoch": 0.804, "step": 2010, "tokens_per_device": 12856 }, { "epoch": 0.804, "loss_ce": 0.002607946516945958, "loss_lvr": 2.002224922180176, "loss_mode_switch": 0.0, "loss_total": 0.20283043384552002, "step": 2010 }, { "batch_size": 1, "epoch": 0.804, "step": 2010, "tokens_per_device": 4745 }, { "epoch": 0.804, "loss_ce": 0.1352611929178238, "loss_lvr": 0.40030357241630554, "loss_mode_switch": 0.0, "loss_total": 0.17529155313968658, "step": 2010 }, { "batch_size": 4, "epoch": 0.804, "step": 2010, "tokens_per_device": 3772 }, { "epoch": 0.804, "loss_ce": 0.4670916497707367, "loss_lvr": 1.0161019563674927, "loss_mode_switch": 0.0, "loss_total": 0.5687018632888794, "step": 2010 }, { "batch_size": 1, "epoch": 0.804, "step": 2010, "tokens_per_device": 4460 }, { "epoch": 0.804, "loss_ce": 0.027997571974992752, "loss_lvr": 0.4351711869239807, "loss_mode_switch": 0.0, "loss_total": 0.07151469588279724, "step": 2010 }, { "epoch": 0.8044, "grad_norm": 1.2350449562072754, "learning_rate": 9.701988124758544e-07, "loss": 0.2736, "step": 2011 }, { "batch_size": 4, "epoch": 0.8044, "step": 2011, "tokens_per_device": 5216 }, { "epoch": 0.8044, "loss_ce": 0.599868655204773, "loss_lvr": 0.6460448503494263, "loss_mode_switch": 0.0, "loss_total": 0.6644731163978577, "step": 2011 }, { "batch_size": 4, "epoch": 0.8044, "step": 2011, "tokens_per_device": 3252 }, { "epoch": 0.8044, "loss_ce": 0.13953980803489685, "loss_lvr": 0.719211995601654, "loss_mode_switch": 0.0, "loss_total": 0.21146100759506226, "step": 2011 }, { "batch_size": 1, "epoch": 0.8044, "step": 2011, "tokens_per_device": 6430 }, { "epoch": 0.8044, "loss_ce": 0.07360837608575821, "loss_lvr": 0.29577988386154175, "loss_mode_switch": 0.0, "loss_total": 0.10318636894226074, "step": 2011 }, { "batch_size": 4, "epoch": 0.8044, "step": 2011, "tokens_per_device": 6064 }, { "epoch": 0.8044, "loss_ce": 0.3673878014087677, "loss_lvr": 0.6772300004959106, "loss_mode_switch": 0.0, "loss_total": 0.43511080741882324, "step": 2011 }, { "batch_size": 4, "epoch": 0.8044, "step": 2011, "tokens_per_device": 4348 }, { "epoch": 0.8044, "loss_ce": 0.19667062163352966, "loss_lvr": 0.8133013844490051, "loss_mode_switch": 0.0, "loss_total": 0.27800077199935913, "step": 2011 }, { "batch_size": 4, "epoch": 0.8044, "step": 2011, "tokens_per_device": 4516 }, { "epoch": 0.8044, "loss_ce": 0.3912769854068756, "loss_lvr": 0.8779909610748291, "loss_mode_switch": 0.0, "loss_total": 0.479076087474823, "step": 2011 }, { "batch_size": 4, "epoch": 0.8044, "step": 2011, "tokens_per_device": 6768 }, { "epoch": 0.8044, "loss_ce": 0.6667504906654358, "loss_lvr": 0.609641969203949, "loss_mode_switch": 0.0, "loss_total": 0.7277146577835083, "step": 2011 }, { "batch_size": 1, "epoch": 0.8044, "step": 2011, "tokens_per_device": 5764 }, { "epoch": 0.8044, "loss_ce": 0.003986352123320103, "loss_lvr": 0.3222317099571228, "loss_mode_switch": 0.0, "loss_total": 0.03620952367782593, "step": 2011 }, { "epoch": 0.8048, "grad_norm": 1.3470587730407715, "learning_rate": 9.663677053505283e-07, "loss": 0.3195, "step": 2012 }, { "batch_size": 1, "epoch": 0.8048, "step": 2012, "tokens_per_device": 5181 }, { "epoch": 0.8048, "loss_ce": 0.00047115961206145585, "loss_lvr": 0.3580799102783203, "loss_mode_switch": 0.0, "loss_total": 0.03627915307879448, "step": 2012 }, { "batch_size": 4, "epoch": 0.8048, "step": 2012, "tokens_per_device": 3936 }, { "epoch": 0.8048, "loss_ce": 0.3177848160266876, "loss_lvr": 1.8026188611984253, "loss_mode_switch": 0.0, "loss_total": 0.4980466961860657, "step": 2012 }, { "batch_size": 4, "epoch": 0.8048, "step": 2012, "tokens_per_device": 4340 }, { "epoch": 0.8048, "loss_ce": 0.4440627098083496, "loss_lvr": 0.6859941482543945, "loss_mode_switch": 0.0, "loss_total": 0.5126621127128601, "step": 2012 }, { "batch_size": 1, "epoch": 0.8048, "step": 2012, "tokens_per_device": 4268 }, { "epoch": 0.8048, "loss_ce": 0.018805360421538353, "loss_lvr": 0.7570685744285583, "loss_mode_switch": 0.0, "loss_total": 0.0945122241973877, "step": 2012 }, { "batch_size": 1, "epoch": 0.8048, "step": 2012, "tokens_per_device": 7572 }, { "epoch": 0.8048, "loss_ce": 0.21603256464004517, "loss_lvr": 0.28261205554008484, "loss_mode_switch": 0.0, "loss_total": 0.24429376423358917, "step": 2012 }, { "batch_size": 4, "epoch": 0.8048, "step": 2012, "tokens_per_device": 2548 }, { "epoch": 0.8048, "loss_ce": 0.0586857907474041, "loss_lvr": 0.9222754836082458, "loss_mode_switch": 0.0, "loss_total": 0.15091334283351898, "step": 2012 }, { "batch_size": 4, "epoch": 0.8048, "step": 2012, "tokens_per_device": 4148 }, { "epoch": 0.8048, "loss_ce": 0.16574570536613464, "loss_lvr": 0.6835560202598572, "loss_mode_switch": 0.0, "loss_total": 0.2341013103723526, "step": 2012 }, { "batch_size": 4, "epoch": 0.8048, "step": 2012, "tokens_per_device": 4984 }, { "epoch": 0.8048, "loss_ce": 0.058264438062906265, "loss_lvr": 0.8280443549156189, "loss_mode_switch": 0.0, "loss_total": 0.14106887578964233, "step": 2012 }, { "epoch": 0.8052, "grad_norm": 1.2477604150772095, "learning_rate": 9.625433679732288e-07, "loss": 0.2634, "step": 2013 }, { "batch_size": 4, "epoch": 0.8052, "step": 2013, "tokens_per_device": 4392 }, { "epoch": 0.8052, "loss_ce": 0.028006820008158684, "loss_lvr": 0.7134624719619751, "loss_mode_switch": 0.0, "loss_total": 0.09935306757688522, "step": 2013 }, { "batch_size": 4, "epoch": 0.8052, "step": 2013, "tokens_per_device": 3900 }, { "epoch": 0.8052, "loss_ce": 0.7018848657608032, "loss_lvr": 0.8110172152519226, "loss_mode_switch": 0.0, "loss_total": 0.782986581325531, "step": 2013 }, { "batch_size": 4, "epoch": 0.8052, "step": 2013, "tokens_per_device": 4456 }, { "epoch": 0.8052, "loss_ce": 0.042224183678627014, "loss_lvr": 0.9289625287055969, "loss_mode_switch": 0.0, "loss_total": 0.1351204365491867, "step": 2013 }, { "batch_size": 4, "epoch": 0.8052, "step": 2013, "tokens_per_device": 4392 }, { "epoch": 0.8052, "loss_ce": 0.557948887348175, "loss_lvr": 0.8242096900939941, "loss_mode_switch": 0.0, "loss_total": 0.6403698325157166, "step": 2013 }, { "batch_size": 4, "epoch": 0.8052, "step": 2013, "tokens_per_device": 4476 }, { "epoch": 0.8052, "loss_ce": 0.28592196106910706, "loss_lvr": 0.7843603491783142, "loss_mode_switch": 0.0, "loss_total": 0.36435800790786743, "step": 2013 }, { "batch_size": 4, "epoch": 0.8052, "step": 2013, "tokens_per_device": 3760 }, { "epoch": 0.8052, "loss_ce": 0.06399255990982056, "loss_lvr": 0.6886420249938965, "loss_mode_switch": 0.0, "loss_total": 0.13285675644874573, "step": 2013 }, { "batch_size": 4, "epoch": 0.8052, "step": 2013, "tokens_per_device": 4236 }, { "epoch": 0.8052, "loss_ce": 0.2562301754951477, "loss_lvr": 0.78525710105896, "loss_mode_switch": 0.0, "loss_total": 0.33475589752197266, "step": 2013 }, { "batch_size": 4, "epoch": 0.8052, "step": 2013, "tokens_per_device": 7840 }, { "epoch": 0.8052, "loss_ce": 0.5632442235946655, "loss_lvr": 2.216256618499756, "loss_mode_switch": 0.0, "loss_total": 0.784869909286499, "step": 2013 }, { "epoch": 0.8056, "grad_norm": 1.2497575283050537, "learning_rate": 9.587258067624373e-07, "loss": 0.3211, "step": 2014 }, { "batch_size": 4, "epoch": 0.8056, "step": 2014, "tokens_per_device": 6012 }, { "epoch": 0.8056, "loss_ce": 0.07321431487798691, "loss_lvr": 0.6866039037704468, "loss_mode_switch": 0.0, "loss_total": 0.14187470078468323, "step": 2014 }, { "batch_size": 1, "epoch": 0.8056, "step": 2014, "tokens_per_device": 5956 }, { "epoch": 0.8056, "loss_ce": 0.09346602857112885, "loss_lvr": 0.4578321874141693, "loss_mode_switch": 0.0, "loss_total": 0.13924925029277802, "step": 2014 }, { "batch_size": 4, "epoch": 0.8056, "step": 2014, "tokens_per_device": 5792 }, { "epoch": 0.8056, "loss_ce": 0.24613891541957855, "loss_lvr": 1.0726104974746704, "loss_mode_switch": 0.0, "loss_total": 0.35339996218681335, "step": 2014 }, { "batch_size": 4, "epoch": 0.8056, "step": 2014, "tokens_per_device": 3516 }, { "epoch": 0.8056, "loss_ce": 0.266887366771698, "loss_lvr": 0.9336377382278442, "loss_mode_switch": 0.0, "loss_total": 0.36025112867355347, "step": 2014 }, { "batch_size": 4, "epoch": 0.8056, "step": 2014, "tokens_per_device": 1604 }, { "epoch": 0.8056, "loss_ce": 0.2899702489376068, "loss_lvr": 0.9881383180618286, "loss_mode_switch": 0.0, "loss_total": 0.3887840807437897, "step": 2014 }, { "batch_size": 1, "epoch": 0.8056, "step": 2014, "tokens_per_device": 5186 }, { "epoch": 0.8056, "loss_ce": 0.015165461227297783, "loss_lvr": 0.4707379937171936, "loss_mode_switch": 0.0, "loss_total": 0.062239259481430054, "step": 2014 }, { "batch_size": 4, "epoch": 0.8056, "step": 2014, "tokens_per_device": 5500 }, { "epoch": 0.8056, "loss_ce": 0.2897922098636627, "loss_lvr": 0.9333998560905457, "loss_mode_switch": 0.0, "loss_total": 0.3831321895122528, "step": 2014 }, { "batch_size": 4, "epoch": 0.8056, "step": 2014, "tokens_per_device": 4220 }, { "epoch": 0.8056, "loss_ce": 0.1256018429994583, "loss_lvr": 0.9973007440567017, "loss_mode_switch": 0.0, "loss_total": 0.22533191740512848, "step": 2014 }, { "epoch": 0.806, "grad_norm": 1.3645075559616089, "learning_rate": 9.549150281252633e-07, "loss": 0.2593, "step": 2015 }, { "batch_size": 4, "epoch": 0.806, "step": 2015, "tokens_per_device": 14116 }, { "epoch": 0.806, "loss_ce": 0.09170392155647278, "loss_lvr": 0.7793652415275574, "loss_mode_switch": 0.0, "loss_total": 0.169640451669693, "step": 2015 }, { "batch_size": 4, "epoch": 0.806, "step": 2015, "tokens_per_device": 2704 }, { "epoch": 0.806, "loss_ce": 0.2483748346567154, "loss_lvr": 0.6677838563919067, "loss_mode_switch": 0.0, "loss_total": 0.31515321135520935, "step": 2015 }, { "batch_size": 4, "epoch": 0.806, "step": 2015, "tokens_per_device": 1784 }, { "epoch": 0.806, "loss_ce": 0.4035879075527191, "loss_lvr": 0.9539139866828918, "loss_mode_switch": 0.0, "loss_total": 0.4989793002605438, "step": 2015 }, { "batch_size": 4, "epoch": 0.806, "step": 2015, "tokens_per_device": 4392 }, { "epoch": 0.806, "loss_ce": 0.08248618245124817, "loss_lvr": 0.9985421299934387, "loss_mode_switch": 0.0, "loss_total": 0.18234039843082428, "step": 2015 }, { "batch_size": 4, "epoch": 0.806, "step": 2015, "tokens_per_device": 3832 }, { "epoch": 0.806, "loss_ce": 0.14626893401145935, "loss_lvr": 1.0347371101379395, "loss_mode_switch": 0.0, "loss_total": 0.24974265694618225, "step": 2015 }, { "batch_size": 1, "epoch": 0.806, "step": 2015, "tokens_per_device": 5050 }, { "epoch": 0.806, "loss_ce": 0.04082154855132103, "loss_lvr": 0.37771543860435486, "loss_mode_switch": 0.0, "loss_total": 0.07859309017658234, "step": 2015 }, { "batch_size": 4, "epoch": 0.806, "step": 2015, "tokens_per_device": 5652 }, { "epoch": 0.806, "loss_ce": 0.7461789846420288, "loss_lvr": 1.0853708982467651, "loss_mode_switch": 0.0, "loss_total": 0.8547160625457764, "step": 2015 }, { "batch_size": 4, "epoch": 0.806, "step": 2015, "tokens_per_device": 6116 }, { "epoch": 0.806, "loss_ce": 0.4329911172389984, "loss_lvr": 0.7060641050338745, "loss_mode_switch": 0.0, "loss_total": 0.5035974979400635, "step": 2015 }, { "epoch": 0.8064, "grad_norm": 1.3175013065338135, "learning_rate": 9.511110384574345e-07, "loss": 0.3123, "step": 2016 }, { "batch_size": 4, "epoch": 0.8064, "step": 2016, "tokens_per_device": 3852 }, { "epoch": 0.8064, "loss_ce": 0.08034258335828781, "loss_lvr": 1.0506575107574463, "loss_mode_switch": 0.0, "loss_total": 0.1854083389043808, "step": 2016 }, { "batch_size": 4, "epoch": 0.8064, "step": 2016, "tokens_per_device": 5576 }, { "epoch": 0.8064, "loss_ce": 0.257347047328949, "loss_lvr": 0.7279014587402344, "loss_mode_switch": 0.0, "loss_total": 0.3301371932029724, "step": 2016 }, { "batch_size": 1, "epoch": 0.8064, "step": 2016, "tokens_per_device": 5025 }, { "epoch": 0.8064, "loss_ce": 0.00430330028757453, "loss_lvr": 0.5531899333000183, "loss_mode_switch": 0.0, "loss_total": 0.05962229147553444, "step": 2016 }, { "batch_size": 4, "epoch": 0.8064, "step": 2016, "tokens_per_device": 4540 }, { "epoch": 0.8064, "loss_ce": 0.292081356048584, "loss_lvr": 0.8728130459785461, "loss_mode_switch": 0.0, "loss_total": 0.37936267256736755, "step": 2016 }, { "batch_size": 1, "epoch": 0.8064, "step": 2016, "tokens_per_device": 4962 }, { "epoch": 0.8064, "loss_ce": 0.06788188964128494, "loss_lvr": 0.34572911262512207, "loss_mode_switch": 0.0, "loss_total": 0.10245479643344879, "step": 2016 }, { "batch_size": 4, "epoch": 0.8064, "step": 2016, "tokens_per_device": 1248 }, { "epoch": 0.8064, "loss_ce": 0.2125253677368164, "loss_lvr": 1.0812517404556274, "loss_mode_switch": 0.0, "loss_total": 0.32065054774284363, "step": 2016 }, { "batch_size": 4, "epoch": 0.8064, "step": 2016, "tokens_per_device": 1508 }, { "epoch": 0.8064, "loss_ce": 0.2183360457420349, "loss_lvr": 0.8832959532737732, "loss_mode_switch": 0.0, "loss_total": 0.30666565895080566, "step": 2016 }, { "batch_size": 4, "epoch": 0.8064, "step": 2016, "tokens_per_device": 2676 }, { "epoch": 0.8064, "loss_ce": 0.3885430693626404, "loss_lvr": 0.8144407272338867, "loss_mode_switch": 0.0, "loss_total": 0.469987154006958, "step": 2016 }, { "epoch": 0.8068, "grad_norm": 1.2269303798675537, "learning_rate": 9.473138441432855e-07, "loss": 0.266, "step": 2017 }, { "batch_size": 4, "epoch": 0.8068, "step": 2017, "tokens_per_device": 4316 }, { "epoch": 0.8068, "loss_ce": 0.3191157877445221, "loss_lvr": 1.1457334756851196, "loss_mode_switch": 0.0, "loss_total": 0.433689147233963, "step": 2017 }, { "batch_size": 4, "epoch": 0.8068, "step": 2017, "tokens_per_device": 10396 }, { "epoch": 0.8068, "loss_ce": 0.026340389624238014, "loss_lvr": 0.7671351432800293, "loss_mode_switch": 0.0, "loss_total": 0.10305390506982803, "step": 2017 }, { "batch_size": 4, "epoch": 0.8068, "step": 2017, "tokens_per_device": 1964 }, { "epoch": 0.8068, "loss_ce": 0.7547409534454346, "loss_lvr": 0.7761825919151306, "loss_mode_switch": 0.0, "loss_total": 0.8323591947555542, "step": 2017 }, { "batch_size": 1, "epoch": 0.8068, "step": 2017, "tokens_per_device": 5102 }, { "epoch": 0.8068, "loss_ce": 0.0008484887657687068, "loss_lvr": 0.2656385600566864, "loss_mode_switch": 0.0, "loss_total": 0.02741234563291073, "step": 2017 }, { "batch_size": 4, "epoch": 0.8068, "step": 2017, "tokens_per_device": 3252 }, { "epoch": 0.8068, "loss_ce": 0.6707496047019958, "loss_lvr": 0.46928098797798157, "loss_mode_switch": 0.0, "loss_total": 0.7176777124404907, "step": 2017 }, { "batch_size": 4, "epoch": 0.8068, "step": 2017, "tokens_per_device": 1456 }, { "epoch": 0.8068, "loss_ce": 0.5917843580245972, "loss_lvr": 0.9815289974212646, "loss_mode_switch": 0.0, "loss_total": 0.6899372339248657, "step": 2017 }, { "batch_size": 4, "epoch": 0.8068, "step": 2017, "tokens_per_device": 1844 }, { "epoch": 0.8068, "loss_ce": 0.4868086278438568, "loss_lvr": 0.8678532242774963, "loss_mode_switch": 0.0, "loss_total": 0.5735939741134644, "step": 2017 }, { "batch_size": 1, "epoch": 0.8068, "step": 2017, "tokens_per_device": 6401 }, { "epoch": 0.8068, "loss_ce": 0.12329740822315216, "loss_lvr": 0.3470575511455536, "loss_mode_switch": 0.0, "loss_total": 0.15800316631793976, "step": 2017 }, { "epoch": 0.8072, "grad_norm": 1.9414232969284058, "learning_rate": 9.435234515557434e-07, "loss": 0.2668, "step": 2018 }, { "batch_size": 1, "epoch": 0.8072, "step": 2018, "tokens_per_device": 5121 }, { "epoch": 0.8072, "loss_ce": 0.0020930664613842964, "loss_lvr": 0.710239052772522, "loss_mode_switch": 0.0, "loss_total": 0.0731169730424881, "step": 2018 }, { "batch_size": 4, "epoch": 0.8072, "step": 2018, "tokens_per_device": 1488 }, { "epoch": 0.8072, "loss_ce": 0.4202607274055481, "loss_lvr": 0.878551721572876, "loss_mode_switch": 0.0, "loss_total": 0.5081158876419067, "step": 2018 }, { "batch_size": 4, "epoch": 0.8072, "step": 2018, "tokens_per_device": 3728 }, { "epoch": 0.8072, "loss_ce": 0.27878499031066895, "loss_lvr": 0.9648647904396057, "loss_mode_switch": 0.0, "loss_total": 0.3752714693546295, "step": 2018 }, { "batch_size": 1, "epoch": 0.8072, "step": 2018, "tokens_per_device": 5188 }, { "epoch": 0.8072, "loss_ce": 0.006545285228639841, "loss_lvr": 0.3183728754520416, "loss_mode_switch": 0.0, "loss_total": 0.038382574915885925, "step": 2018 }, { "batch_size": 4, "epoch": 0.8072, "step": 2018, "tokens_per_device": 4356 }, { "epoch": 0.8072, "loss_ce": 0.08065483719110489, "loss_lvr": 0.9527552723884583, "loss_mode_switch": 0.0, "loss_total": 0.17593036592006683, "step": 2018 }, { "batch_size": 4, "epoch": 0.8072, "step": 2018, "tokens_per_device": 5776 }, { "epoch": 0.8072, "loss_ce": 0.11982746422290802, "loss_lvr": 0.8617863655090332, "loss_mode_switch": 0.0, "loss_total": 0.20600610971450806, "step": 2018 }, { "batch_size": 1, "epoch": 0.8072, "step": 2018, "tokens_per_device": 4961 }, { "epoch": 0.8072, "loss_ce": 0.09632924199104309, "loss_lvr": 0.26050081849098206, "loss_mode_switch": 0.0, "loss_total": 0.12237932533025742, "step": 2018 }, { "batch_size": 1, "epoch": 0.8072, "step": 2018, "tokens_per_device": 5121 }, { "epoch": 0.8072, "loss_ce": 0.12626859545707703, "loss_lvr": 0.49840718507766724, "loss_mode_switch": 0.0, "loss_total": 0.17610931396484375, "step": 2018 }, { "epoch": 0.8076, "grad_norm": 1.3507426977157593, "learning_rate": 9.397398670563201e-07, "loss": 0.2901, "step": 2019 }, { "batch_size": 1, "epoch": 0.8076, "step": 2019, "tokens_per_device": 4743 }, { "epoch": 0.8076, "loss_ce": 0.043030038475990295, "loss_lvr": 0.21674151718616486, "loss_mode_switch": 0.0, "loss_total": 0.06470419466495514, "step": 2019 }, { "batch_size": 4, "epoch": 0.8076, "step": 2019, "tokens_per_device": 4276 }, { "epoch": 0.8076, "loss_ce": 0.7378387451171875, "loss_lvr": 0.7517585158348083, "loss_mode_switch": 0.0, "loss_total": 0.8130146265029907, "step": 2019 }, { "batch_size": 1, "epoch": 0.8076, "step": 2019, "tokens_per_device": 5523 }, { "epoch": 0.8076, "loss_ce": 0.4249846637248993, "loss_lvr": 0.47711867094039917, "loss_mode_switch": 0.0, "loss_total": 0.47269654273986816, "step": 2019 }, { "batch_size": 1, "epoch": 0.8076, "step": 2019, "tokens_per_device": 4854 }, { "epoch": 0.8076, "loss_ce": 0.0010259434347972274, "loss_lvr": 0.3662126362323761, "loss_mode_switch": 0.0, "loss_total": 0.03764721006155014, "step": 2019 }, { "batch_size": 4, "epoch": 0.8076, "step": 2019, "tokens_per_device": 4856 }, { "epoch": 0.8076, "loss_ce": 0.3434566557407379, "loss_lvr": 0.5950896143913269, "loss_mode_switch": 0.0, "loss_total": 0.40296560525894165, "step": 2019 }, { "batch_size": 4, "epoch": 0.8076, "step": 2019, "tokens_per_device": 2692 }, { "epoch": 0.8076, "loss_ce": 0.0666838139295578, "loss_lvr": 0.7892879247665405, "loss_mode_switch": 0.0, "loss_total": 0.14561259746551514, "step": 2019 }, { "batch_size": 1, "epoch": 0.8076, "step": 2019, "tokens_per_device": 4959 }, { "epoch": 0.8076, "loss_ce": 0.0008095804951153696, "loss_lvr": 0.37476933002471924, "loss_mode_switch": 0.0, "loss_total": 0.038286514580249786, "step": 2019 }, { "batch_size": 4, "epoch": 0.8076, "step": 2019, "tokens_per_device": 2652 }, { "epoch": 0.8076, "loss_ce": 0.6796820163726807, "loss_lvr": 0.9595627784729004, "loss_mode_switch": 0.0, "loss_total": 0.7756382822990417, "step": 2019 }, { "epoch": 0.808, "grad_norm": 1.410003662109375, "learning_rate": 9.359630969951012e-07, "loss": 0.3081, "step": 2020 }, { "batch_size": 4, "epoch": 0.808, "step": 2020, "tokens_per_device": 4392 }, { "epoch": 0.808, "loss_ce": 0.11807068437337875, "loss_lvr": 1.053859829902649, "loss_mode_switch": 0.0, "loss_total": 0.22345666587352753, "step": 2020 }, { "batch_size": 4, "epoch": 0.808, "step": 2020, "tokens_per_device": 5628 }, { "epoch": 0.808, "loss_ce": 0.14344936609268188, "loss_lvr": 0.7223653197288513, "loss_mode_switch": 0.0, "loss_total": 0.2156859040260315, "step": 2020 }, { "batch_size": 4, "epoch": 0.808, "step": 2020, "tokens_per_device": 3468 }, { "epoch": 0.808, "loss_ce": 0.2663002908229828, "loss_lvr": 0.8019669055938721, "loss_mode_switch": 0.0, "loss_total": 0.3464969992637634, "step": 2020 }, { "batch_size": 1, "epoch": 0.808, "step": 2020, "tokens_per_device": 4859 }, { "epoch": 0.808, "loss_ce": 0.0002593623357824981, "loss_lvr": 0.2019728422164917, "loss_mode_switch": 0.0, "loss_total": 0.020456647500395775, "step": 2020 }, { "batch_size": 4, "epoch": 0.808, "step": 2020, "tokens_per_device": 7224 }, { "epoch": 0.808, "loss_ce": 0.7068700790405273, "loss_lvr": 0.7066619992256165, "loss_mode_switch": 0.0, "loss_total": 0.7775362730026245, "step": 2020 }, { "batch_size": 4, "epoch": 0.808, "step": 2020, "tokens_per_device": 4228 }, { "epoch": 0.808, "loss_ce": 0.30228352546691895, "loss_lvr": 0.9390041828155518, "loss_mode_switch": 0.0, "loss_total": 0.39618393778800964, "step": 2020 }, { "batch_size": 4, "epoch": 0.808, "step": 2020, "tokens_per_device": 4528 }, { "epoch": 0.808, "loss_ce": 0.45421668887138367, "loss_lvr": 0.9889386296272278, "loss_mode_switch": 0.0, "loss_total": 0.5531105399131775, "step": 2020 }, { "batch_size": 4, "epoch": 0.808, "step": 2020, "tokens_per_device": 6780 }, { "epoch": 0.808, "loss_ce": 0.3087567090988159, "loss_lvr": 0.5614936351776123, "loss_mode_switch": 0.0, "loss_total": 0.36490607261657715, "step": 2020 }, { "epoch": 0.8084, "grad_norm": 1.3445483446121216, "learning_rate": 9.321931477107377e-07, "loss": 0.3109, "step": 2021 }, { "batch_size": 4, "epoch": 0.8084, "step": 2021, "tokens_per_device": 6888 }, { "epoch": 0.8084, "loss_ce": 0.4186549782752991, "loss_lvr": 0.6714588403701782, "loss_mode_switch": 0.0, "loss_total": 0.4858008623123169, "step": 2021 }, { "batch_size": 1, "epoch": 0.8084, "step": 2021, "tokens_per_device": 5018 }, { "epoch": 0.8084, "loss_ce": 0.07843970507383347, "loss_lvr": 0.7164273858070374, "loss_mode_switch": 0.0, "loss_total": 0.15008243918418884, "step": 2021 }, { "batch_size": 4, "epoch": 0.8084, "step": 2021, "tokens_per_device": 1352 }, { "epoch": 0.8084, "loss_ce": 0.29886409640312195, "loss_lvr": 0.8635400533676147, "loss_mode_switch": 0.0, "loss_total": 0.3852181136608124, "step": 2021 }, { "batch_size": 4, "epoch": 0.8084, "step": 2021, "tokens_per_device": 5448 }, { "epoch": 0.8084, "loss_ce": 0.09675677865743637, "loss_lvr": 0.7086480259895325, "loss_mode_switch": 0.0, "loss_total": 0.16762158274650574, "step": 2021 }, { "batch_size": 4, "epoch": 0.8084, "step": 2021, "tokens_per_device": 7000 }, { "epoch": 0.8084, "loss_ce": 0.2075694501399994, "loss_lvr": 0.7847550511360168, "loss_mode_switch": 0.0, "loss_total": 0.2860449552536011, "step": 2021 }, { "batch_size": 4, "epoch": 0.8084, "step": 2021, "tokens_per_device": 4372 }, { "epoch": 0.8084, "loss_ce": 0.6953719258308411, "loss_lvr": 0.9494168162345886, "loss_mode_switch": 0.0, "loss_total": 0.7903136014938354, "step": 2021 }, { "batch_size": 4, "epoch": 0.8084, "step": 2021, "tokens_per_device": 3776 }, { "epoch": 0.8084, "loss_ce": 0.013598061166703701, "loss_lvr": 0.9016339778900146, "loss_mode_switch": 0.0, "loss_total": 0.1037614643573761, "step": 2021 }, { "batch_size": 1, "epoch": 0.8084, "step": 2021, "tokens_per_device": 5257 }, { "epoch": 0.8084, "loss_ce": 0.0943072959780693, "loss_lvr": 0.3731870949268341, "loss_mode_switch": 0.0, "loss_total": 0.13162600994110107, "step": 2021 }, { "epoch": 0.8088, "grad_norm": 1.2913051843643188, "learning_rate": 9.284300255304329e-07, "loss": 0.2565, "step": 2022 }, { "batch_size": 4, "epoch": 0.8088, "step": 2022, "tokens_per_device": 4244 }, { "epoch": 0.8088, "loss_ce": 0.09878278523683548, "loss_lvr": 0.8817664980888367, "loss_mode_switch": 0.0, "loss_total": 0.18695944547653198, "step": 2022 }, { "batch_size": 4, "epoch": 0.8088, "step": 2022, "tokens_per_device": 5668 }, { "epoch": 0.8088, "loss_ce": 0.27489715814590454, "loss_lvr": 0.9613452553749084, "loss_mode_switch": 0.0, "loss_total": 0.3710317015647888, "step": 2022 }, { "batch_size": 4, "epoch": 0.8088, "step": 2022, "tokens_per_device": 6240 }, { "epoch": 0.8088, "loss_ce": 0.03458227962255478, "loss_lvr": 0.7149173021316528, "loss_mode_switch": 0.0, "loss_total": 0.1060740128159523, "step": 2022 }, { "batch_size": 4, "epoch": 0.8088, "step": 2022, "tokens_per_device": 4088 }, { "epoch": 0.8088, "loss_ce": 0.3216394782066345, "loss_lvr": 1.0626599788665771, "loss_mode_switch": 0.0, "loss_total": 0.42790547013282776, "step": 2022 }, { "batch_size": 4, "epoch": 0.8088, "step": 2022, "tokens_per_device": 2628 }, { "epoch": 0.8088, "loss_ce": 0.44702669978141785, "loss_lvr": 0.7341631650924683, "loss_mode_switch": 0.0, "loss_total": 0.5204430222511292, "step": 2022 }, { "batch_size": 4, "epoch": 0.8088, "step": 2022, "tokens_per_device": 3784 }, { "epoch": 0.8088, "loss_ce": 0.3538697361946106, "loss_lvr": 0.7634117603302002, "loss_mode_switch": 0.0, "loss_total": 0.4302109181880951, "step": 2022 }, { "batch_size": 1, "epoch": 0.8088, "step": 2022, "tokens_per_device": 5117 }, { "epoch": 0.8088, "loss_ce": 0.0891871452331543, "loss_lvr": 0.347169429063797, "loss_mode_switch": 0.0, "loss_total": 0.12390409409999847, "step": 2022 }, { "batch_size": 4, "epoch": 0.8088, "step": 2022, "tokens_per_device": 7300 }, { "epoch": 0.8088, "loss_ce": 0.0021143590565770864, "loss_lvr": 0.7148910760879517, "loss_mode_switch": 0.0, "loss_total": 0.07360346615314484, "step": 2022 }, { "epoch": 0.8092, "grad_norm": 1.281720519065857, "learning_rate": 9.246737367699287e-07, "loss": 0.3043, "step": 2023 }, { "batch_size": 1, "epoch": 0.8092, "step": 2023, "tokens_per_device": 5133 }, { "epoch": 0.8092, "loss_ce": 0.0012223086087033153, "loss_lvr": 0.3081859052181244, "loss_mode_switch": 0.0, "loss_total": 0.03204089775681496, "step": 2023 }, { "batch_size": 4, "epoch": 0.8092, "step": 2023, "tokens_per_device": 4328 }, { "epoch": 0.8092, "loss_ce": 0.24337585270404816, "loss_lvr": 0.9012673497200012, "loss_mode_switch": 0.0, "loss_total": 0.3335025906562805, "step": 2023 }, { "batch_size": 1, "epoch": 0.8092, "step": 2023, "tokens_per_device": 4788 }, { "epoch": 0.8092, "loss_ce": 0.023617299273610115, "loss_lvr": 0.4307137429714203, "loss_mode_switch": 0.0, "loss_total": 0.066688671708107, "step": 2023 }, { "batch_size": 4, "epoch": 0.8092, "step": 2023, "tokens_per_device": 15936 }, { "epoch": 0.8092, "loss_ce": 0.12059883028268814, "loss_lvr": 0.8343991041183472, "loss_mode_switch": 0.0, "loss_total": 0.20403873920440674, "step": 2023 }, { "batch_size": 4, "epoch": 0.8092, "step": 2023, "tokens_per_device": 4728 }, { "epoch": 0.8092, "loss_ce": 0.16361282765865326, "loss_lvr": 0.8074818253517151, "loss_mode_switch": 0.0, "loss_total": 0.244361013174057, "step": 2023 }, { "batch_size": 1, "epoch": 0.8092, "step": 2023, "tokens_per_device": 4895 }, { "epoch": 0.8092, "loss_ce": 0.5131782293319702, "loss_lvr": 0.20724838972091675, "loss_mode_switch": 0.0, "loss_total": 0.5339030623435974, "step": 2023 }, { "batch_size": 4, "epoch": 0.8092, "step": 2023, "tokens_per_device": 3636 }, { "epoch": 0.8092, "loss_ce": 0.28906670212745667, "loss_lvr": 0.8600906729698181, "loss_mode_switch": 0.0, "loss_total": 0.3750757575035095, "step": 2023 }, { "batch_size": 1, "epoch": 0.8092, "step": 2023, "tokens_per_device": 4904 }, { "epoch": 0.8092, "loss_ce": 0.10631496459245682, "loss_lvr": 0.4686960279941559, "loss_mode_switch": 0.0, "loss_total": 0.15318456292152405, "step": 2023 }, { "epoch": 0.8096, "grad_norm": 1.3421772718429565, "learning_rate": 9.209242877335006e-07, "loss": 0.2682, "step": 2024 }, { "batch_size": 4, "epoch": 0.8096, "step": 2024, "tokens_per_device": 2616 }, { "epoch": 0.8096, "loss_ce": 0.12364500015974045, "loss_lvr": 1.0149929523468018, "loss_mode_switch": 0.0, "loss_total": 0.22514429688453674, "step": 2024 }, { "batch_size": 4, "epoch": 0.8096, "step": 2024, "tokens_per_device": 6068 }, { "epoch": 0.8096, "loss_ce": 0.08737614750862122, "loss_lvr": 0.831058919429779, "loss_mode_switch": 0.0, "loss_total": 0.17048203945159912, "step": 2024 }, { "batch_size": 4, "epoch": 0.8096, "step": 2024, "tokens_per_device": 4196 }, { "epoch": 0.8096, "loss_ce": 0.10868606716394424, "loss_lvr": 1.0065571069717407, "loss_mode_switch": 0.0, "loss_total": 0.20934177935123444, "step": 2024 }, { "batch_size": 4, "epoch": 0.8096, "step": 2024, "tokens_per_device": 8956 }, { "epoch": 0.8096, "loss_ce": 0.28562232851982117, "loss_lvr": 0.7740099430084229, "loss_mode_switch": 0.0, "loss_total": 0.3630233407020569, "step": 2024 }, { "batch_size": 1, "epoch": 0.8096, "step": 2024, "tokens_per_device": 4920 }, { "epoch": 0.8096, "loss_ce": 0.024632515385746956, "loss_lvr": 0.5281878709793091, "loss_mode_switch": 0.0, "loss_total": 0.07745130360126495, "step": 2024 }, { "batch_size": 4, "epoch": 0.8096, "step": 2024, "tokens_per_device": 2648 }, { "epoch": 0.8096, "loss_ce": 0.11947467923164368, "loss_lvr": 0.863257646560669, "loss_mode_switch": 0.0, "loss_total": 0.20580044388771057, "step": 2024 }, { "batch_size": 4, "epoch": 0.8096, "step": 2024, "tokens_per_device": 3748 }, { "epoch": 0.8096, "loss_ce": 0.25861656665802, "loss_lvr": 1.0875744819641113, "loss_mode_switch": 0.0, "loss_total": 0.3673740029335022, "step": 2024 }, { "batch_size": 4, "epoch": 0.8096, "step": 2024, "tokens_per_device": 6240 }, { "epoch": 0.8096, "loss_ce": 0.06967490911483765, "loss_lvr": 0.7693813443183899, "loss_mode_switch": 0.0, "loss_total": 0.14661304652690887, "step": 2024 }, { "epoch": 0.81, "grad_norm": 1.2303767204284668, "learning_rate": 9.171816847139447e-07, "loss": 0.2784, "step": 2025 }, { "batch_size": 4, "epoch": 0.81, "step": 2025, "tokens_per_device": 3748 }, { "epoch": 0.81, "loss_ce": 0.19550108909606934, "loss_lvr": 0.8422877192497253, "loss_mode_switch": 0.0, "loss_total": 0.2797298729419708, "step": 2025 }, { "batch_size": 4, "epoch": 0.81, "step": 2025, "tokens_per_device": 5728 }, { "epoch": 0.81, "loss_ce": 0.5214650630950928, "loss_lvr": 0.6928700804710388, "loss_mode_switch": 0.0, "loss_total": 0.5907520651817322, "step": 2025 }, { "batch_size": 4, "epoch": 0.81, "step": 2025, "tokens_per_device": 4004 }, { "epoch": 0.81, "loss_ce": 0.31091976165771484, "loss_lvr": 1.00608491897583, "loss_mode_switch": 0.0, "loss_total": 0.41152825951576233, "step": 2025 }, { "batch_size": 1, "epoch": 0.81, "step": 2025, "tokens_per_device": 5211 }, { "epoch": 0.81, "loss_ce": 0.0999978706240654, "loss_lvr": 0.42935022711753845, "loss_mode_switch": 0.0, "loss_total": 0.14293289184570312, "step": 2025 }, { "batch_size": 4, "epoch": 0.81, "step": 2025, "tokens_per_device": 6964 }, { "epoch": 0.81, "loss_ce": 0.23796190321445465, "loss_lvr": 0.7794687151908875, "loss_mode_switch": 0.0, "loss_total": 0.3159087896347046, "step": 2025 }, { "batch_size": 4, "epoch": 0.81, "step": 2025, "tokens_per_device": 7012 }, { "epoch": 0.81, "loss_ce": 0.0684383362531662, "loss_lvr": 0.9555764198303223, "loss_mode_switch": 0.0, "loss_total": 0.16399598121643066, "step": 2025 }, { "batch_size": 1, "epoch": 0.81, "step": 2025, "tokens_per_device": 6460 }, { "epoch": 0.81, "loss_ce": 0.0017656967975199223, "loss_lvr": 0.24733711779117584, "loss_mode_switch": 0.0, "loss_total": 0.02649940922856331, "step": 2025 }, { "batch_size": 1, "epoch": 0.81, "step": 2025, "tokens_per_device": 5117 }, { "epoch": 0.81, "loss_ce": 0.011016566306352615, "loss_lvr": 0.24172064661979675, "loss_mode_switch": 0.0, "loss_total": 0.03518863022327423, "step": 2025 }, { "epoch": 0.8104, "grad_norm": 1.2953691482543945, "learning_rate": 9.134459339925694e-07, "loss": 0.2939, "step": 2026 }, { "batch_size": 4, "epoch": 0.8104, "step": 2026, "tokens_per_device": 5872 }, { "epoch": 0.8104, "loss_ce": 0.274093359708786, "loss_lvr": 0.909977376461029, "loss_mode_switch": 0.0, "loss_total": 0.36509108543395996, "step": 2026 }, { "batch_size": 4, "epoch": 0.8104, "step": 2026, "tokens_per_device": 5748 }, { "epoch": 0.8104, "loss_ce": 0.378511905670166, "loss_lvr": 0.8561157584190369, "loss_mode_switch": 0.0, "loss_total": 0.4641234874725342, "step": 2026 }, { "batch_size": 4, "epoch": 0.8104, "step": 2026, "tokens_per_device": 4444 }, { "epoch": 0.8104, "loss_ce": 0.0751563236117363, "loss_lvr": 0.6765801310539246, "loss_mode_switch": 0.0, "loss_total": 0.14281433820724487, "step": 2026 }, { "batch_size": 4, "epoch": 0.8104, "step": 2026, "tokens_per_device": 2620 }, { "epoch": 0.8104, "loss_ce": 0.1923251450061798, "loss_lvr": 0.9526910781860352, "loss_mode_switch": 0.0, "loss_total": 0.2875942587852478, "step": 2026 }, { "batch_size": 4, "epoch": 0.8104, "step": 2026, "tokens_per_device": 5240 }, { "epoch": 0.8104, "loss_ce": 0.3684079349040985, "loss_lvr": 0.7226131558418274, "loss_mode_switch": 0.0, "loss_total": 0.4406692385673523, "step": 2026 }, { "batch_size": 1, "epoch": 0.8104, "step": 2026, "tokens_per_device": 5105 }, { "epoch": 0.8104, "loss_ce": 0.08228648453950882, "loss_lvr": 0.4279099404811859, "loss_mode_switch": 0.0, "loss_total": 0.125077486038208, "step": 2026 }, { "batch_size": 1, "epoch": 0.8104, "step": 2026, "tokens_per_device": 5021 }, { "epoch": 0.8104, "loss_ce": 0.02789853885769844, "loss_lvr": 0.6163612604141235, "loss_mode_switch": 0.0, "loss_total": 0.08953467011451721, "step": 2026 }, { "batch_size": 4, "epoch": 0.8104, "step": 2026, "tokens_per_device": 5492 }, { "epoch": 0.8104, "loss_ce": 0.3855957090854645, "loss_lvr": 0.73291015625, "loss_mode_switch": 0.0, "loss_total": 0.4588867425918579, "step": 2026 }, { "epoch": 0.8108, "grad_norm": 1.434695839881897, "learning_rate": 9.097170418391782e-07, "loss": 0.3194, "step": 2027 }, { "batch_size": 4, "epoch": 0.8108, "step": 2027, "tokens_per_device": 7164 }, { "epoch": 0.8108, "loss_ce": 0.23716382682323456, "loss_lvr": 0.7466444969177246, "loss_mode_switch": 0.0, "loss_total": 0.31182828545570374, "step": 2027 }, { "batch_size": 4, "epoch": 0.8108, "step": 2027, "tokens_per_device": 3836 }, { "epoch": 0.8108, "loss_ce": 0.07981325685977936, "loss_lvr": 0.8961272239685059, "loss_mode_switch": 0.0, "loss_total": 0.16942597925662994, "step": 2027 }, { "batch_size": 4, "epoch": 0.8108, "step": 2027, "tokens_per_device": 1408 }, { "epoch": 0.8108, "loss_ce": 0.1718686819076538, "loss_lvr": 1.7672526836395264, "loss_mode_switch": 0.0, "loss_total": 0.34859395027160645, "step": 2027 }, { "batch_size": 4, "epoch": 0.8108, "step": 2027, "tokens_per_device": 4132 }, { "epoch": 0.8108, "loss_ce": 0.23014013469219208, "loss_lvr": 0.9750578999519348, "loss_mode_switch": 0.0, "loss_total": 0.3276459276676178, "step": 2027 }, { "batch_size": 4, "epoch": 0.8108, "step": 2027, "tokens_per_device": 6416 }, { "epoch": 0.8108, "loss_ce": 0.010477214120328426, "loss_lvr": 0.9029568433761597, "loss_mode_switch": 0.0, "loss_total": 0.1007729023694992, "step": 2027 }, { "batch_size": 4, "epoch": 0.8108, "step": 2027, "tokens_per_device": 3812 }, { "epoch": 0.8108, "loss_ce": 0.326263427734375, "loss_lvr": 0.9279991388320923, "loss_mode_switch": 0.0, "loss_total": 0.4190633296966553, "step": 2027 }, { "batch_size": 1, "epoch": 0.8108, "step": 2027, "tokens_per_device": 5126 }, { "epoch": 0.8108, "loss_ce": 0.00024484016466885805, "loss_lvr": 0.19320710003376007, "loss_mode_switch": 0.0, "loss_total": 0.019565550610423088, "step": 2027 }, { "batch_size": 4, "epoch": 0.8108, "step": 2027, "tokens_per_device": 2648 }, { "epoch": 0.8108, "loss_ce": 0.38467708230018616, "loss_lvr": 0.9191375374794006, "loss_mode_switch": 0.0, "loss_total": 0.4765908420085907, "step": 2027 }, { "epoch": 0.8112, "grad_norm": 1.6521039009094238, "learning_rate": 9.059950145120666e-07, "loss": 0.2832, "step": 2028 }, { "batch_size": 4, "epoch": 0.8112, "step": 2028, "tokens_per_device": 4372 }, { "epoch": 0.8112, "loss_ce": 0.40672266483306885, "loss_lvr": 0.7393050193786621, "loss_mode_switch": 0.0, "loss_total": 0.48065316677093506, "step": 2028 }, { "batch_size": 4, "epoch": 0.8112, "step": 2028, "tokens_per_device": 3756 }, { "epoch": 0.8112, "loss_ce": 0.7826364636421204, "loss_lvr": 0.8751294612884521, "loss_mode_switch": 0.0, "loss_total": 0.8701494336128235, "step": 2028 }, { "batch_size": 1, "epoch": 0.8112, "step": 2028, "tokens_per_device": 4664 }, { "epoch": 0.8112, "loss_ce": 0.18859446048736572, "loss_lvr": 0.3305267095565796, "loss_mode_switch": 0.0, "loss_total": 0.22164712846279144, "step": 2028 }, { "batch_size": 4, "epoch": 0.8112, "step": 2028, "tokens_per_device": 3932 }, { "epoch": 0.8112, "loss_ce": 0.08052955567836761, "loss_lvr": 0.8319303393363953, "loss_mode_switch": 0.0, "loss_total": 0.16372258961200714, "step": 2028 }, { "batch_size": 4, "epoch": 0.8112, "step": 2028, "tokens_per_device": 4236 }, { "epoch": 0.8112, "loss_ce": 0.25034695863723755, "loss_lvr": 0.6887450814247131, "loss_mode_switch": 0.0, "loss_total": 0.31922146677970886, "step": 2028 }, { "batch_size": 1, "epoch": 0.8112, "step": 2028, "tokens_per_device": 6294 }, { "epoch": 0.8112, "loss_ce": 0.00028045097133144736, "loss_lvr": 0.3094770014286041, "loss_mode_switch": 0.0, "loss_total": 0.03122815117239952, "step": 2028 }, { "batch_size": 1, "epoch": 0.8112, "step": 2028, "tokens_per_device": 4911 }, { "epoch": 0.8112, "loss_ce": 0.41786128282546997, "loss_lvr": 0.3546783924102783, "loss_mode_switch": 0.0, "loss_total": 0.4533291161060333, "step": 2028 }, { "batch_size": 1, "epoch": 0.8112, "step": 2028, "tokens_per_device": 5217 }, { "epoch": 0.8112, "loss_ce": 0.0006122508202679455, "loss_lvr": 0.329206258058548, "loss_mode_switch": 0.0, "loss_total": 0.03353287652134895, "step": 2028 }, { "epoch": 0.8116, "grad_norm": 1.2134567499160767, "learning_rate": 9.022798582580067e-07, "loss": 0.2627, "step": 2029 }, { "batch_size": 1, "epoch": 0.8116, "step": 2029, "tokens_per_device": 5088 }, { "epoch": 0.8116, "loss_ce": 0.05291929468512535, "loss_lvr": 0.7443081140518188, "loss_mode_switch": 0.0, "loss_total": 0.1273501068353653, "step": 2029 }, { "batch_size": 4, "epoch": 0.8116, "step": 2029, "tokens_per_device": 5748 }, { "epoch": 0.8116, "loss_ce": 0.004035855643451214, "loss_lvr": 0.9365402460098267, "loss_mode_switch": 0.0, "loss_total": 0.09768988192081451, "step": 2029 }, { "batch_size": 4, "epoch": 0.8116, "step": 2029, "tokens_per_device": 10892 }, { "epoch": 0.8116, "loss_ce": 0.07556677609682083, "loss_lvr": 0.7394675016403198, "loss_mode_switch": 0.0, "loss_total": 0.14951352775096893, "step": 2029 }, { "batch_size": 1, "epoch": 0.8116, "step": 2029, "tokens_per_device": 4882 }, { "epoch": 0.8116, "loss_ce": 0.0015489637153223157, "loss_lvr": 0.3117847144603729, "loss_mode_switch": 0.0, "loss_total": 0.0327274352312088, "step": 2029 }, { "batch_size": 4, "epoch": 0.8116, "step": 2029, "tokens_per_device": 5412 }, { "epoch": 0.8116, "loss_ce": 0.5276750326156616, "loss_lvr": 0.8384473919868469, "loss_mode_switch": 0.0, "loss_total": 0.6115197539329529, "step": 2029 }, { "batch_size": 4, "epoch": 0.8116, "step": 2029, "tokens_per_device": 4240 }, { "epoch": 0.8116, "loss_ce": 0.4727918803691864, "loss_lvr": 0.7730745673179626, "loss_mode_switch": 0.0, "loss_total": 0.5500993132591248, "step": 2029 }, { "batch_size": 1, "epoch": 0.8116, "step": 2029, "tokens_per_device": 4885 }, { "epoch": 0.8116, "loss_ce": 0.02194828726351261, "loss_lvr": 0.39377930760383606, "loss_mode_switch": 0.0, "loss_total": 0.061326220631599426, "step": 2029 }, { "batch_size": 1, "epoch": 0.8116, "step": 2029, "tokens_per_device": 5056 }, { "epoch": 0.8116, "loss_ce": 0.054481711238622665, "loss_lvr": 0.6202593445777893, "loss_mode_switch": 0.0, "loss_total": 0.1165076494216919, "step": 2029 }, { "epoch": 0.812, "grad_norm": 1.586661458015442, "learning_rate": 8.985715793122407e-07, "loss": 0.2868, "step": 2030 }, { "batch_size": 4, "epoch": 0.812, "step": 2030, "tokens_per_device": 1900 }, { "epoch": 0.812, "loss_ce": 0.24046488106250763, "loss_lvr": 0.9692112803459167, "loss_mode_switch": 0.0, "loss_total": 0.33738601207733154, "step": 2030 }, { "batch_size": 4, "epoch": 0.812, "step": 2030, "tokens_per_device": 1584 }, { "epoch": 0.812, "loss_ce": 0.5750983953475952, "loss_lvr": 0.9756452441215515, "loss_mode_switch": 0.0, "loss_total": 0.6726629137992859, "step": 2030 }, { "batch_size": 4, "epoch": 0.812, "step": 2030, "tokens_per_device": 4984 }, { "epoch": 0.812, "loss_ce": 0.003939618822187185, "loss_lvr": 0.6963884234428406, "loss_mode_switch": 0.0, "loss_total": 0.07357846200466156, "step": 2030 }, { "batch_size": 4, "epoch": 0.812, "step": 2030, "tokens_per_device": 4528 }, { "epoch": 0.812, "loss_ce": 0.17425872385501862, "loss_lvr": 0.7818273901939392, "loss_mode_switch": 0.0, "loss_total": 0.2524414658546448, "step": 2030 }, { "batch_size": 4, "epoch": 0.812, "step": 2030, "tokens_per_device": 8108 }, { "epoch": 0.812, "loss_ce": 0.29364824295043945, "loss_lvr": 0.5686254501342773, "loss_mode_switch": 0.0, "loss_total": 0.35051077604293823, "step": 2030 }, { "batch_size": 4, "epoch": 0.812, "step": 2030, "tokens_per_device": 3816 }, { "epoch": 0.812, "loss_ce": 0.41712719202041626, "loss_lvr": 0.8263288140296936, "loss_mode_switch": 0.0, "loss_total": 0.49976009130477905, "step": 2030 }, { "batch_size": 4, "epoch": 0.812, "step": 2030, "tokens_per_device": 5768 }, { "epoch": 0.812, "loss_ce": 0.5145451426506042, "loss_lvr": 0.7625752091407776, "loss_mode_switch": 0.0, "loss_total": 0.5908026695251465, "step": 2030 }, { "batch_size": 4, "epoch": 0.812, "step": 2030, "tokens_per_device": 4204 }, { "epoch": 0.812, "loss_ce": 0.00011190608347533271, "loss_lvr": 0.7522960901260376, "loss_mode_switch": 0.0, "loss_total": 0.07534151524305344, "step": 2030 }, { "epoch": 0.8124, "grad_norm": 1.188639760017395, "learning_rate": 8.948701838984702e-07, "loss": 0.2861, "step": 2031 }, { "batch_size": 4, "epoch": 0.8124, "step": 2031, "tokens_per_device": 4596 }, { "epoch": 0.8124, "loss_ce": 0.38507720828056335, "loss_lvr": 0.8459779620170593, "loss_mode_switch": 0.0, "loss_total": 0.4696750044822693, "step": 2031 }, { "batch_size": 4, "epoch": 0.8124, "step": 2031, "tokens_per_device": 5852 }, { "epoch": 0.8124, "loss_ce": 0.06802289187908173, "loss_lvr": 0.7998411059379578, "loss_mode_switch": 0.0, "loss_total": 0.14800700545310974, "step": 2031 }, { "batch_size": 4, "epoch": 0.8124, "step": 2031, "tokens_per_device": 3792 }, { "epoch": 0.8124, "loss_ce": 0.17467421293258667, "loss_lvr": 0.9940383434295654, "loss_mode_switch": 0.0, "loss_total": 0.27407804131507874, "step": 2031 }, { "batch_size": 4, "epoch": 0.8124, "step": 2031, "tokens_per_device": 1236 }, { "epoch": 0.8124, "loss_ce": 0.2201526165008545, "loss_lvr": 0.9887596368789673, "loss_mode_switch": 0.0, "loss_total": 0.3190285861492157, "step": 2031 }, { "batch_size": 4, "epoch": 0.8124, "step": 2031, "tokens_per_device": 6320 }, { "epoch": 0.8124, "loss_ce": 0.13971582055091858, "loss_lvr": 0.7333011031150818, "loss_mode_switch": 0.0, "loss_total": 0.21304592490196228, "step": 2031 }, { "batch_size": 4, "epoch": 0.8124, "step": 2031, "tokens_per_device": 1776 }, { "epoch": 0.8124, "loss_ce": 0.10451772063970566, "loss_lvr": 1.3262017965316772, "loss_mode_switch": 0.0, "loss_total": 0.23713791370391846, "step": 2031 }, { "batch_size": 4, "epoch": 0.8124, "step": 2031, "tokens_per_device": 3832 }, { "epoch": 0.8124, "loss_ce": 0.5393922328948975, "loss_lvr": 1.3240737915039062, "loss_mode_switch": 0.0, "loss_total": 0.6717996001243591, "step": 2031 }, { "batch_size": 4, "epoch": 0.8124, "step": 2031, "tokens_per_device": 4140 }, { "epoch": 0.8124, "loss_ce": 0.06590811908245087, "loss_lvr": 0.8247575163841248, "loss_mode_switch": 0.0, "loss_total": 0.14838387072086334, "step": 2031 }, { "epoch": 0.8128, "grad_norm": 1.2768011093139648, "learning_rate": 8.911756782288394e-07, "loss": 0.2884, "step": 2032 }, { "batch_size": 1, "epoch": 0.8128, "step": 2032, "tokens_per_device": 4864 }, { "epoch": 0.8128, "loss_ce": 0.006902999710291624, "loss_lvr": 0.26782485842704773, "loss_mode_switch": 0.0, "loss_total": 0.033685486763715744, "step": 2032 }, { "batch_size": 4, "epoch": 0.8128, "step": 2032, "tokens_per_device": 5920 }, { "epoch": 0.8128, "loss_ce": 0.39055556058883667, "loss_lvr": 0.7213587164878845, "loss_mode_switch": 0.0, "loss_total": 0.46269142627716064, "step": 2032 }, { "batch_size": 4, "epoch": 0.8128, "step": 2032, "tokens_per_device": 4480 }, { "epoch": 0.8128, "loss_ce": 0.32417845726013184, "loss_lvr": 0.8060533404350281, "loss_mode_switch": 0.0, "loss_total": 0.40478378534317017, "step": 2032 }, { "batch_size": 1, "epoch": 0.8128, "step": 2032, "tokens_per_device": 4893 }, { "epoch": 0.8128, "loss_ce": 0.21404823660850525, "loss_lvr": 0.2418895810842514, "loss_mode_switch": 0.0, "loss_total": 0.238237202167511, "step": 2032 }, { "batch_size": 1, "epoch": 0.8128, "step": 2032, "tokens_per_device": 4913 }, { "epoch": 0.8128, "loss_ce": 0.006778944283723831, "loss_lvr": 1.0776922702789307, "loss_mode_switch": 0.0, "loss_total": 0.11454817652702332, "step": 2032 }, { "batch_size": 1, "epoch": 0.8128, "step": 2032, "tokens_per_device": 4884 }, { "epoch": 0.8128, "loss_ce": 0.014250500127673149, "loss_lvr": 0.21248874068260193, "loss_mode_switch": 0.0, "loss_total": 0.03549937531352043, "step": 2032 }, { "batch_size": 4, "epoch": 0.8128, "step": 2032, "tokens_per_device": 3896 }, { "epoch": 0.8128, "loss_ce": 0.3635668456554413, "loss_lvr": 0.7734395861625671, "loss_mode_switch": 0.0, "loss_total": 0.44091081619262695, "step": 2032 }, { "batch_size": 4, "epoch": 0.8128, "step": 2032, "tokens_per_device": 10792 }, { "epoch": 0.8128, "loss_ce": 0.08021294325590134, "loss_lvr": 0.8516817092895508, "loss_mode_switch": 0.0, "loss_total": 0.16538111865520477, "step": 2032 }, { "epoch": 0.8132, "grad_norm": 1.3062516450881958, "learning_rate": 8.874880685039305e-07, "loss": 0.2364, "step": 2033 }, { "batch_size": 4, "epoch": 0.8132, "step": 2033, "tokens_per_device": 3780 }, { "epoch": 0.8132, "loss_ce": 0.12532885372638702, "loss_lvr": 0.8740480542182922, "loss_mode_switch": 0.0, "loss_total": 0.212733656167984, "step": 2033 }, { "batch_size": 1, "epoch": 0.8132, "step": 2033, "tokens_per_device": 5210 }, { "epoch": 0.8132, "loss_ce": 0.06860901415348053, "loss_lvr": 0.2888449430465698, "loss_mode_switch": 0.0, "loss_total": 0.09749350696802139, "step": 2033 }, { "batch_size": 4, "epoch": 0.8132, "step": 2033, "tokens_per_device": 4224 }, { "epoch": 0.8132, "loss_ce": 0.19221939146518707, "loss_lvr": 0.7504032254219055, "loss_mode_switch": 0.0, "loss_total": 0.26725971698760986, "step": 2033 }, { "batch_size": 4, "epoch": 0.8132, "step": 2033, "tokens_per_device": 4884 }, { "epoch": 0.8132, "loss_ce": 0.09311845898628235, "loss_lvr": 0.6257984638214111, "loss_mode_switch": 0.0, "loss_total": 0.15569829940795898, "step": 2033 }, { "batch_size": 4, "epoch": 0.8132, "step": 2033, "tokens_per_device": 2652 }, { "epoch": 0.8132, "loss_ce": 0.3411014974117279, "loss_lvr": 1.1137101650238037, "loss_mode_switch": 0.0, "loss_total": 0.4524725079536438, "step": 2033 }, { "batch_size": 4, "epoch": 0.8132, "step": 2033, "tokens_per_device": 5804 }, { "epoch": 0.8132, "loss_ce": 0.09932496398687363, "loss_lvr": 0.6660196185112, "loss_mode_switch": 0.0, "loss_total": 0.16592693328857422, "step": 2033 }, { "batch_size": 4, "epoch": 0.8132, "step": 2033, "tokens_per_device": 5480 }, { "epoch": 0.8132, "loss_ce": 0.1367379128932953, "loss_lvr": 0.7208414673805237, "loss_mode_switch": 0.0, "loss_total": 0.2088220715522766, "step": 2033 }, { "batch_size": 1, "epoch": 0.8132, "step": 2033, "tokens_per_device": 5178 }, { "epoch": 0.8132, "loss_ce": 0.007212108466774225, "loss_lvr": 0.3055211007595062, "loss_mode_switch": 0.0, "loss_total": 0.03776421770453453, "step": 2033 }, { "epoch": 0.8136, "grad_norm": 1.508629322052002, "learning_rate": 8.838073609127546e-07, "loss": 0.3121, "step": 2034 }, { "batch_size": 1, "epoch": 0.8136, "step": 2034, "tokens_per_device": 5382 }, { "epoch": 0.8136, "loss_ce": 0.0012164507061243057, "loss_lvr": 0.3536059856414795, "loss_mode_switch": 0.0, "loss_total": 0.03657705336809158, "step": 2034 }, { "batch_size": 1, "epoch": 0.8136, "step": 2034, "tokens_per_device": 4894 }, { "epoch": 0.8136, "loss_ce": 0.29350611567497253, "loss_lvr": 0.3185724914073944, "loss_mode_switch": 0.0, "loss_total": 0.3253633677959442, "step": 2034 }, { "batch_size": 4, "epoch": 0.8136, "step": 2034, "tokens_per_device": 11372 }, { "epoch": 0.8136, "loss_ce": 0.06345237791538239, "loss_lvr": 0.6082074642181396, "loss_mode_switch": 0.0, "loss_total": 0.12427312135696411, "step": 2034 }, { "batch_size": 4, "epoch": 0.8136, "step": 2034, "tokens_per_device": 5744 }, { "epoch": 0.8136, "loss_ce": 0.0935247465968132, "loss_lvr": 0.9091076850891113, "loss_mode_switch": 0.0, "loss_total": 0.18443551659584045, "step": 2034 }, { "batch_size": 1, "epoch": 0.8136, "step": 2034, "tokens_per_device": 4883 }, { "epoch": 0.8136, "loss_ce": 0.3199746906757355, "loss_lvr": 0.5613381862640381, "loss_mode_switch": 0.0, "loss_total": 0.3761084973812103, "step": 2034 }, { "batch_size": 4, "epoch": 0.8136, "step": 2034, "tokens_per_device": 4004 }, { "epoch": 0.8136, "loss_ce": 0.2889271676540375, "loss_lvr": 0.7339791655540466, "loss_mode_switch": 0.0, "loss_total": 0.3623250722885132, "step": 2034 }, { "batch_size": 4, "epoch": 0.8136, "step": 2034, "tokens_per_device": 6080 }, { "epoch": 0.8136, "loss_ce": 0.22194992005825043, "loss_lvr": 0.6507853269577026, "loss_mode_switch": 0.0, "loss_total": 0.2870284616947174, "step": 2034 }, { "batch_size": 4, "epoch": 0.8136, "step": 2034, "tokens_per_device": 5772 }, { "epoch": 0.8136, "loss_ce": 0.1914079338312149, "loss_lvr": 0.726136326789856, "loss_mode_switch": 0.0, "loss_total": 0.2640215754508972, "step": 2034 }, { "epoch": 0.814, "grad_norm": 1.563332200050354, "learning_rate": 8.801335616327378e-07, "loss": 0.3127, "step": 2035 }, { "batch_size": 4, "epoch": 0.814, "step": 2035, "tokens_per_device": 1200 }, { "epoch": 0.814, "loss_ce": 0.18735089898109436, "loss_lvr": 1.2341814041137695, "loss_mode_switch": 0.0, "loss_total": 0.31076905131340027, "step": 2035 }, { "batch_size": 1, "epoch": 0.814, "step": 2035, "tokens_per_device": 4944 }, { "epoch": 0.814, "loss_ce": 0.20807774364948273, "loss_lvr": 0.29225441813468933, "loss_mode_switch": 0.0, "loss_total": 0.23730318248271942, "step": 2035 }, { "batch_size": 1, "epoch": 0.814, "step": 2035, "tokens_per_device": 4885 }, { "epoch": 0.814, "loss_ce": 0.003731381380930543, "loss_lvr": 0.5271207690238953, "loss_mode_switch": 0.0, "loss_total": 0.056443460285663605, "step": 2035 }, { "batch_size": 1, "epoch": 0.814, "step": 2035, "tokens_per_device": 5154 }, { "epoch": 0.814, "loss_ce": 0.0006306146387942135, "loss_lvr": 0.777660608291626, "loss_mode_switch": 0.0, "loss_total": 0.07839667797088623, "step": 2035 }, { "batch_size": 4, "epoch": 0.814, "step": 2035, "tokens_per_device": 4668 }, { "epoch": 0.814, "loss_ce": 0.08097869157791138, "loss_lvr": 0.8920130133628845, "loss_mode_switch": 0.0, "loss_total": 0.17017999291419983, "step": 2035 }, { "batch_size": 4, "epoch": 0.814, "step": 2035, "tokens_per_device": 1412 }, { "epoch": 0.814, "loss_ce": 0.31424397230148315, "loss_lvr": 0.9413133859634399, "loss_mode_switch": 0.0, "loss_total": 0.4083753228187561, "step": 2035 }, { "batch_size": 4, "epoch": 0.814, "step": 2035, "tokens_per_device": 4288 }, { "epoch": 0.814, "loss_ce": 0.14383237063884735, "loss_lvr": 0.8699749112129211, "loss_mode_switch": 0.0, "loss_total": 0.2308298647403717, "step": 2035 }, { "batch_size": 4, "epoch": 0.814, "step": 2035, "tokens_per_device": 4396 }, { "epoch": 0.814, "loss_ce": 0.231766477227211, "loss_lvr": 0.813946545124054, "loss_mode_switch": 0.0, "loss_total": 0.31316113471984863, "step": 2035 }, { "epoch": 0.8144, "grad_norm": 1.3402162790298462, "learning_rate": 8.764666768297108e-07, "loss": 0.3258, "step": 2036 }, { "batch_size": 4, "epoch": 0.8144, "step": 2036, "tokens_per_device": 4856 }, { "epoch": 0.8144, "loss_ce": 0.010037310421466827, "loss_lvr": 0.7432138323783875, "loss_mode_switch": 0.0, "loss_total": 0.08435869216918945, "step": 2036 }, { "batch_size": 1, "epoch": 0.8144, "step": 2036, "tokens_per_device": 5371 }, { "epoch": 0.8144, "loss_ce": 0.3160076439380646, "loss_lvr": 0.5026975870132446, "loss_mode_switch": 0.0, "loss_total": 0.36627739667892456, "step": 2036 }, { "batch_size": 1, "epoch": 0.8144, "step": 2036, "tokens_per_device": 5124 }, { "epoch": 0.8144, "loss_ce": 0.11207693815231323, "loss_lvr": 0.6737343668937683, "loss_mode_switch": 0.0, "loss_total": 0.1794503778219223, "step": 2036 }, { "batch_size": 4, "epoch": 0.8144, "step": 2036, "tokens_per_device": 4716 }, { "epoch": 0.8144, "loss_ce": 0.6649840474128723, "loss_lvr": 0.8643587231636047, "loss_mode_switch": 0.0, "loss_total": 0.7514199018478394, "step": 2036 }, { "batch_size": 4, "epoch": 0.8144, "step": 2036, "tokens_per_device": 4548 }, { "epoch": 0.8144, "loss_ce": 0.2752171456813812, "loss_lvr": 0.8204783797264099, "loss_mode_switch": 0.0, "loss_total": 0.35726499557495117, "step": 2036 }, { "batch_size": 4, "epoch": 0.8144, "step": 2036, "tokens_per_device": 5836 }, { "epoch": 0.8144, "loss_ce": 0.12358409911394119, "loss_lvr": 0.7703285217285156, "loss_mode_switch": 0.0, "loss_total": 0.2006169557571411, "step": 2036 }, { "batch_size": 4, "epoch": 0.8144, "step": 2036, "tokens_per_device": 1416 }, { "epoch": 0.8144, "loss_ce": 0.32734060287475586, "loss_lvr": 0.9616772532463074, "loss_mode_switch": 0.0, "loss_total": 0.42350834608078003, "step": 2036 }, { "batch_size": 4, "epoch": 0.8144, "step": 2036, "tokens_per_device": 6032 }, { "epoch": 0.8144, "loss_ce": 0.07990965247154236, "loss_lvr": 1.0991524457931519, "loss_mode_switch": 0.0, "loss_total": 0.1898249089717865, "step": 2036 }, { "epoch": 0.8148, "grad_norm": 1.2684797048568726, "learning_rate": 8.728067126578988e-07, "loss": 0.2936, "step": 2037 }, { "batch_size": 4, "epoch": 0.8148, "step": 2037, "tokens_per_device": 2612 }, { "epoch": 0.8148, "loss_ce": 0.09024109691381454, "loss_lvr": 1.660258412361145, "loss_mode_switch": 0.0, "loss_total": 0.2562669515609741, "step": 2037 }, { "batch_size": 4, "epoch": 0.8148, "step": 2037, "tokens_per_device": 5908 }, { "epoch": 0.8148, "loss_ce": 0.26968735456466675, "loss_lvr": 0.8239205479621887, "loss_mode_switch": 0.0, "loss_total": 0.3520794212818146, "step": 2037 }, { "batch_size": 1, "epoch": 0.8148, "step": 2037, "tokens_per_device": 4557 }, { "epoch": 0.8148, "loss_ce": 0.7849708795547485, "loss_lvr": 1.0906428098678589, "loss_mode_switch": 0.0, "loss_total": 0.8940351605415344, "step": 2037 }, { "batch_size": 1, "epoch": 0.8148, "step": 2037, "tokens_per_device": 5214 }, { "epoch": 0.8148, "loss_ce": 0.31754887104034424, "loss_lvr": 0.3803789019584656, "loss_mode_switch": 0.0, "loss_total": 0.3555867671966553, "step": 2037 }, { "batch_size": 1, "epoch": 0.8148, "step": 2037, "tokens_per_device": 5100 }, { "epoch": 0.8148, "loss_ce": 0.0016819114098325372, "loss_lvr": 0.42051634192466736, "loss_mode_switch": 0.0, "loss_total": 0.04373354837298393, "step": 2037 }, { "batch_size": 1, "epoch": 0.8148, "step": 2037, "tokens_per_device": 5100 }, { "epoch": 0.8148, "loss_ce": 0.1347847878932953, "loss_lvr": 0.5078968405723572, "loss_mode_switch": 0.0, "loss_total": 0.185574471950531, "step": 2037 }, { "batch_size": 1, "epoch": 0.8148, "step": 2037, "tokens_per_device": 5226 }, { "epoch": 0.8148, "loss_ce": 0.003723008558154106, "loss_lvr": 0.3820282816886902, "loss_mode_switch": 0.0, "loss_total": 0.04192584007978439, "step": 2037 }, { "batch_size": 4, "epoch": 0.8148, "step": 2037, "tokens_per_device": 4560 }, { "epoch": 0.8148, "loss_ce": 0.3115479648113251, "loss_lvr": 0.6886755228042603, "loss_mode_switch": 0.0, "loss_total": 0.38041552901268005, "step": 2037 }, { "epoch": 0.8152, "grad_norm": 1.5280176401138306, "learning_rate": 8.691536752599128e-07, "loss": 0.2904, "step": 2038 }, { "batch_size": 4, "epoch": 0.8152, "step": 2038, "tokens_per_device": 4052 }, { "epoch": 0.8152, "loss_ce": 0.30518338084220886, "loss_lvr": 0.6689794063568115, "loss_mode_switch": 0.0, "loss_total": 0.37208133935928345, "step": 2038 }, { "batch_size": 4, "epoch": 0.8152, "step": 2038, "tokens_per_device": 4988 }, { "epoch": 0.8152, "loss_ce": 0.30490291118621826, "loss_lvr": 1.0128122568130493, "loss_mode_switch": 0.0, "loss_total": 0.4061841368675232, "step": 2038 }, { "batch_size": 4, "epoch": 0.8152, "step": 2038, "tokens_per_device": 4408 }, { "epoch": 0.8152, "loss_ce": 0.03064807876944542, "loss_lvr": 0.8010377883911133, "loss_mode_switch": 0.0, "loss_total": 0.11075185239315033, "step": 2038 }, { "batch_size": 1, "epoch": 0.8152, "step": 2038, "tokens_per_device": 5015 }, { "epoch": 0.8152, "loss_ce": 0.1800030916929245, "loss_lvr": 1.1755949258804321, "loss_mode_switch": 0.0, "loss_total": 0.2975625991821289, "step": 2038 }, { "batch_size": 4, "epoch": 0.8152, "step": 2038, "tokens_per_device": 4312 }, { "epoch": 0.8152, "loss_ce": 0.23603174090385437, "loss_lvr": 0.8520523309707642, "loss_mode_switch": 0.0, "loss_total": 0.3212369680404663, "step": 2038 }, { "batch_size": 4, "epoch": 0.8152, "step": 2038, "tokens_per_device": 6064 }, { "epoch": 0.8152, "loss_ce": 0.08682399988174438, "loss_lvr": 0.7855868935585022, "loss_mode_switch": 0.0, "loss_total": 0.16538268327713013, "step": 2038 }, { "batch_size": 4, "epoch": 0.8152, "step": 2038, "tokens_per_device": 4908 }, { "epoch": 0.8152, "loss_ce": 0.06956726312637329, "loss_lvr": 0.5655273795127869, "loss_mode_switch": 0.0, "loss_total": 0.12612000107765198, "step": 2038 }, { "batch_size": 4, "epoch": 0.8152, "step": 2038, "tokens_per_device": 6312 }, { "epoch": 0.8152, "loss_ce": 0.21687260270118713, "loss_lvr": 0.6056218147277832, "loss_mode_switch": 0.0, "loss_total": 0.2774347960948944, "step": 2038 }, { "epoch": 0.8156, "grad_norm": 1.2576422691345215, "learning_rate": 8.655075707667399e-07, "loss": 0.2665, "step": 2039 }, { "batch_size": 4, "epoch": 0.8156, "step": 2039, "tokens_per_device": 4160 }, { "epoch": 0.8156, "loss_ce": 0.15884289145469666, "loss_lvr": 0.831087052822113, "loss_mode_switch": 0.0, "loss_total": 0.2419515997171402, "step": 2039 }, { "batch_size": 4, "epoch": 0.8156, "step": 2039, "tokens_per_device": 3752 }, { "epoch": 0.8156, "loss_ce": 0.37443000078201294, "loss_lvr": 0.9883420467376709, "loss_mode_switch": 0.0, "loss_total": 0.473264217376709, "step": 2039 }, { "batch_size": 1, "epoch": 0.8156, "step": 2039, "tokens_per_device": 4906 }, { "epoch": 0.8156, "loss_ce": 0.12245555967092514, "loss_lvr": 0.5721943974494934, "loss_mode_switch": 0.0, "loss_total": 0.17967499792575836, "step": 2039 }, { "batch_size": 4, "epoch": 0.8156, "step": 2039, "tokens_per_device": 4444 }, { "epoch": 0.8156, "loss_ce": 0.4903874397277832, "loss_lvr": 0.8416659832000732, "loss_mode_switch": 0.0, "loss_total": 0.5745540261268616, "step": 2039 }, { "batch_size": 1, "epoch": 0.8156, "step": 2039, "tokens_per_device": 4980 }, { "epoch": 0.8156, "loss_ce": 0.06247394531965256, "loss_lvr": 0.27888023853302, "loss_mode_switch": 0.0, "loss_total": 0.09036196768283844, "step": 2039 }, { "batch_size": 4, "epoch": 0.8156, "step": 2039, "tokens_per_device": 15400 }, { "epoch": 0.8156, "loss_ce": 0.6031619310379028, "loss_lvr": 0.9585161805152893, "loss_mode_switch": 0.0, "loss_total": 0.6990135312080383, "step": 2039 }, { "batch_size": 4, "epoch": 0.8156, "step": 2039, "tokens_per_device": 4064 }, { "epoch": 0.8156, "loss_ce": 0.3199583888053894, "loss_lvr": 0.653188169002533, "loss_mode_switch": 0.0, "loss_total": 0.3852772116661072, "step": 2039 }, { "batch_size": 1, "epoch": 0.8156, "step": 2039, "tokens_per_device": 5106 }, { "epoch": 0.8156, "loss_ce": 0.1476101130247116, "loss_lvr": 0.5271626710891724, "loss_mode_switch": 0.0, "loss_total": 0.20032638311386108, "step": 2039 }, { "epoch": 0.816, "grad_norm": 1.3915462493896484, "learning_rate": 8.618684052977305e-07, "loss": 0.3086, "step": 2040 }, { "batch_size": 1, "epoch": 0.816, "step": 2040, "tokens_per_device": 4884 }, { "epoch": 0.816, "loss_ce": 0.06987756490707397, "loss_lvr": 1.3551729917526245, "loss_mode_switch": 0.0, "loss_total": 0.20539486408233643, "step": 2040 }, { "batch_size": 4, "epoch": 0.816, "step": 2040, "tokens_per_device": 4072 }, { "epoch": 0.816, "loss_ce": 0.45213380455970764, "loss_lvr": 0.9728081822395325, "loss_mode_switch": 0.0, "loss_total": 0.5494146347045898, "step": 2040 }, { "batch_size": 4, "epoch": 0.816, "step": 2040, "tokens_per_device": 1368 }, { "epoch": 0.816, "loss_ce": 0.5448349118232727, "loss_lvr": 0.9034483432769775, "loss_mode_switch": 0.0, "loss_total": 0.6351797580718994, "step": 2040 }, { "batch_size": 4, "epoch": 0.816, "step": 2040, "tokens_per_device": 11360 }, { "epoch": 0.816, "loss_ce": 0.25579237937927246, "loss_lvr": 1.0041307210922241, "loss_mode_switch": 0.0, "loss_total": 0.35620546340942383, "step": 2040 }, { "batch_size": 4, "epoch": 0.816, "step": 2040, "tokens_per_device": 14808 }, { "epoch": 0.816, "loss_ce": 0.30372801423072815, "loss_lvr": 0.5847787857055664, "loss_mode_switch": 0.0, "loss_total": 0.3622058928012848, "step": 2040 }, { "batch_size": 1, "epoch": 0.816, "step": 2040, "tokens_per_device": 4826 }, { "epoch": 0.816, "loss_ce": 0.011265178211033344, "loss_lvr": 0.37634798884391785, "loss_mode_switch": 0.0, "loss_total": 0.04889997839927673, "step": 2040 }, { "batch_size": 4, "epoch": 0.816, "step": 2040, "tokens_per_device": 1500 }, { "epoch": 0.816, "loss_ce": 0.2309543341398239, "loss_lvr": 0.9471506476402283, "loss_mode_switch": 0.0, "loss_total": 0.32566940784454346, "step": 2040 }, { "batch_size": 4, "epoch": 0.816, "step": 2040, "tokens_per_device": 4564 }, { "epoch": 0.816, "loss_ce": 0.17626574635505676, "loss_lvr": 1.027402639389038, "loss_mode_switch": 0.0, "loss_total": 0.2790060043334961, "step": 2040 }, { "epoch": 0.8164, "grad_norm": 1.2829318046569824, "learning_rate": 8.582361849605891e-07, "loss": 0.2878, "step": 2041 }, { "batch_size": 4, "epoch": 0.8164, "step": 2041, "tokens_per_device": 3792 }, { "epoch": 0.8164, "loss_ce": 0.18700829148292542, "loss_lvr": 1.0782806873321533, "loss_mode_switch": 0.0, "loss_total": 0.2948363721370697, "step": 2041 }, { "batch_size": 4, "epoch": 0.8164, "step": 2041, "tokens_per_device": 4376 }, { "epoch": 0.8164, "loss_ce": 0.6768015623092651, "loss_lvr": 0.9317270517349243, "loss_mode_switch": 0.0, "loss_total": 0.7699742913246155, "step": 2041 }, { "batch_size": 4, "epoch": 0.8164, "step": 2041, "tokens_per_device": 5012 }, { "epoch": 0.8164, "loss_ce": 0.13012631237506866, "loss_lvr": 0.8277365565299988, "loss_mode_switch": 0.0, "loss_total": 0.21289996802806854, "step": 2041 }, { "batch_size": 1, "epoch": 0.8164, "step": 2041, "tokens_per_device": 4896 }, { "epoch": 0.8164, "loss_ce": 0.027653653174638748, "loss_lvr": 0.578330397605896, "loss_mode_switch": 0.0, "loss_total": 0.08548669517040253, "step": 2041 }, { "batch_size": 1, "epoch": 0.8164, "step": 2041, "tokens_per_device": 5316 }, { "epoch": 0.8164, "loss_ce": 2.07259464263916, "loss_lvr": 0.4528018534183502, "loss_mode_switch": 0.0, "loss_total": 2.11787486076355, "step": 2041 }, { "batch_size": 4, "epoch": 0.8164, "step": 2041, "tokens_per_device": 4064 }, { "epoch": 0.8164, "loss_ce": 0.10354644060134888, "loss_lvr": 0.7062926292419434, "loss_mode_switch": 0.0, "loss_total": 0.1741757094860077, "step": 2041 }, { "batch_size": 4, "epoch": 0.8164, "step": 2041, "tokens_per_device": 2664 }, { "epoch": 0.8164, "loss_ce": 0.416408509016037, "loss_lvr": 0.7769547700881958, "loss_mode_switch": 0.0, "loss_total": 0.4941039979457855, "step": 2041 }, { "batch_size": 4, "epoch": 0.8164, "step": 2041, "tokens_per_device": 1420 }, { "epoch": 0.8164, "loss_ce": 0.6652210354804993, "loss_lvr": 0.9058197140693665, "loss_mode_switch": 0.0, "loss_total": 0.7558029890060425, "step": 2041 }, { "epoch": 0.8168, "grad_norm": 1.5058766603469849, "learning_rate": 8.546109158513615e-07, "loss": 0.3618, "step": 2042 }, { "batch_size": 4, "epoch": 0.8168, "step": 2042, "tokens_per_device": 3744 }, { "epoch": 0.8168, "loss_ce": 0.3262692987918854, "loss_lvr": 0.7617563009262085, "loss_mode_switch": 0.0, "loss_total": 0.4024449288845062, "step": 2042 }, { "batch_size": 1, "epoch": 0.8168, "step": 2042, "tokens_per_device": 7232 }, { "epoch": 0.8168, "loss_ce": 0.22319045662879944, "loss_lvr": 0.40317869186401367, "loss_mode_switch": 0.0, "loss_total": 0.26350831985473633, "step": 2042 }, { "batch_size": 4, "epoch": 0.8168, "step": 2042, "tokens_per_device": 5032 }, { "epoch": 0.8168, "loss_ce": 0.4965989887714386, "loss_lvr": 0.7658697366714478, "loss_mode_switch": 0.0, "loss_total": 0.5731859803199768, "step": 2042 }, { "batch_size": 4, "epoch": 0.8168, "step": 2042, "tokens_per_device": 4136 }, { "epoch": 0.8168, "loss_ce": 0.0576525442302227, "loss_lvr": 0.5629836916923523, "loss_mode_switch": 0.0, "loss_total": 0.11395091563463211, "step": 2042 }, { "batch_size": 1, "epoch": 0.8168, "step": 2042, "tokens_per_device": 5164 }, { "epoch": 0.8168, "loss_ce": 0.03121114708483219, "loss_lvr": 0.5546242594718933, "loss_mode_switch": 0.0, "loss_total": 0.08667357265949249, "step": 2042 }, { "batch_size": 1, "epoch": 0.8168, "step": 2042, "tokens_per_device": 5114 }, { "epoch": 0.8168, "loss_ce": 0.06832914799451828, "loss_lvr": 0.3902437686920166, "loss_mode_switch": 0.0, "loss_total": 0.10735352337360382, "step": 2042 }, { "batch_size": 4, "epoch": 0.8168, "step": 2042, "tokens_per_device": 4372 }, { "epoch": 0.8168, "loss_ce": 0.12850382924079895, "loss_lvr": 0.8257706761360168, "loss_mode_switch": 0.0, "loss_total": 0.2110809087753296, "step": 2042 }, { "batch_size": 4, "epoch": 0.8168, "step": 2042, "tokens_per_device": 5156 }, { "epoch": 0.8168, "loss_ce": 0.13306541740894318, "loss_lvr": 0.6895596385002136, "loss_mode_switch": 0.0, "loss_total": 0.20202139019966125, "step": 2042 }, { "epoch": 0.8172, "grad_norm": 1.365264892578125, "learning_rate": 8.509926040544308e-07, "loss": 0.2476, "step": 2043 }, { "batch_size": 4, "epoch": 0.8172, "step": 2043, "tokens_per_device": 3972 }, { "epoch": 0.8172, "loss_ce": 0.10239502787590027, "loss_lvr": 1.0570443868637085, "loss_mode_switch": 0.0, "loss_total": 0.20809946954250336, "step": 2043 }, { "batch_size": 4, "epoch": 0.8172, "step": 2043, "tokens_per_device": 8820 }, { "epoch": 0.8172, "loss_ce": 0.003157791681587696, "loss_lvr": 0.9152331352233887, "loss_mode_switch": 0.0, "loss_total": 0.09468110650777817, "step": 2043 }, { "batch_size": 4, "epoch": 0.8172, "step": 2043, "tokens_per_device": 4748 }, { "epoch": 0.8172, "loss_ce": 0.4539512097835541, "loss_lvr": 0.7903485894203186, "loss_mode_switch": 0.0, "loss_total": 0.532986044883728, "step": 2043 }, { "batch_size": 1, "epoch": 0.8172, "step": 2043, "tokens_per_device": 5178 }, { "epoch": 0.8172, "loss_ce": 0.10103090107440948, "loss_lvr": 0.8340727686882019, "loss_mode_switch": 0.0, "loss_total": 0.18443816900253296, "step": 2043 }, { "batch_size": 4, "epoch": 0.8172, "step": 2043, "tokens_per_device": 4260 }, { "epoch": 0.8172, "loss_ce": 0.2923180162906647, "loss_lvr": 0.8614029884338379, "loss_mode_switch": 0.0, "loss_total": 0.37845832109451294, "step": 2043 }, { "batch_size": 4, "epoch": 0.8172, "step": 2043, "tokens_per_device": 4376 }, { "epoch": 0.8172, "loss_ce": 0.35840028524398804, "loss_lvr": 0.6029582023620605, "loss_mode_switch": 0.0, "loss_total": 0.4186961054801941, "step": 2043 }, { "batch_size": 4, "epoch": 0.8172, "step": 2043, "tokens_per_device": 3788 }, { "epoch": 0.8172, "loss_ce": 0.03079739399254322, "loss_lvr": 0.7604717016220093, "loss_mode_switch": 0.0, "loss_total": 0.10684456676244736, "step": 2043 }, { "batch_size": 4, "epoch": 0.8172, "step": 2043, "tokens_per_device": 5668 }, { "epoch": 0.8172, "loss_ce": 0.22698991000652313, "loss_lvr": 0.575015664100647, "loss_mode_switch": 0.0, "loss_total": 0.28449147939682007, "step": 2043 }, { "epoch": 0.8176, "grad_norm": 1.2547427415847778, "learning_rate": 8.473812556425037e-07, "loss": 0.2983, "step": 2044 }, { "batch_size": 4, "epoch": 0.8176, "step": 2044, "tokens_per_device": 4196 }, { "epoch": 0.8176, "loss_ce": 0.33229827880859375, "loss_lvr": 0.8005378246307373, "loss_mode_switch": 0.0, "loss_total": 0.412352055311203, "step": 2044 }, { "batch_size": 4, "epoch": 0.8176, "step": 2044, "tokens_per_device": 1712 }, { "epoch": 0.8176, "loss_ce": 0.07430311292409897, "loss_lvr": 0.904258131980896, "loss_mode_switch": 0.0, "loss_total": 0.16472892463207245, "step": 2044 }, { "batch_size": 1, "epoch": 0.8176, "step": 2044, "tokens_per_device": 5113 }, { "epoch": 0.8176, "loss_ce": 0.3079935312271118, "loss_lvr": 0.5179919600486755, "loss_mode_switch": 0.0, "loss_total": 0.3597927391529083, "step": 2044 }, { "batch_size": 4, "epoch": 0.8176, "step": 2044, "tokens_per_device": 2852 }, { "epoch": 0.8176, "loss_ce": 0.25371724367141724, "loss_lvr": 0.5084176063537598, "loss_mode_switch": 0.0, "loss_total": 0.30455899238586426, "step": 2044 }, { "batch_size": 4, "epoch": 0.8176, "step": 2044, "tokens_per_device": 10252 }, { "epoch": 0.8176, "loss_ce": 0.08857785910367966, "loss_lvr": 0.5968745946884155, "loss_mode_switch": 0.0, "loss_total": 0.1482653170824051, "step": 2044 }, { "batch_size": 4, "epoch": 0.8176, "step": 2044, "tokens_per_device": 5364 }, { "epoch": 0.8176, "loss_ce": 0.3524659276008606, "loss_lvr": 0.7423692941665649, "loss_mode_switch": 0.0, "loss_total": 0.4267028570175171, "step": 2044 }, { "batch_size": 4, "epoch": 0.8176, "step": 2044, "tokens_per_device": 2656 }, { "epoch": 0.8176, "loss_ce": 0.19123020768165588, "loss_lvr": 1.1815460920333862, "loss_mode_switch": 0.0, "loss_total": 0.309384822845459, "step": 2044 }, { "batch_size": 4, "epoch": 0.8176, "step": 2044, "tokens_per_device": 4612 }, { "epoch": 0.8176, "loss_ce": 0.10212766379117966, "loss_lvr": 0.7672197222709656, "loss_mode_switch": 0.0, "loss_total": 0.17884963750839233, "step": 2044 }, { "epoch": 0.818, "grad_norm": 1.4617218971252441, "learning_rate": 8.437768766765975e-07, "loss": 0.2794, "step": 2045 }, { "batch_size": 4, "epoch": 0.818, "step": 2045, "tokens_per_device": 4992 }, { "epoch": 0.818, "loss_ce": 0.003994626458734274, "loss_lvr": 0.809116780757904, "loss_mode_switch": 0.0, "loss_total": 0.08490630984306335, "step": 2045 }, { "batch_size": 4, "epoch": 0.818, "step": 2045, "tokens_per_device": 4188 }, { "epoch": 0.818, "loss_ce": 0.5431289076805115, "loss_lvr": 0.9355999231338501, "loss_mode_switch": 0.0, "loss_total": 0.6366888880729675, "step": 2045 }, { "batch_size": 4, "epoch": 0.818, "step": 2045, "tokens_per_device": 3932 }, { "epoch": 0.818, "loss_ce": 0.2543577551841736, "loss_lvr": 1.1867396831512451, "loss_mode_switch": 0.0, "loss_total": 0.37303173542022705, "step": 2045 }, { "batch_size": 4, "epoch": 0.818, "step": 2045, "tokens_per_device": 9040 }, { "epoch": 0.818, "loss_ce": 0.5393567085266113, "loss_lvr": 0.6142677068710327, "loss_mode_switch": 0.0, "loss_total": 0.6007834672927856, "step": 2045 }, { "batch_size": 4, "epoch": 0.818, "step": 2045, "tokens_per_device": 4780 }, { "epoch": 0.818, "loss_ce": 0.18610748648643494, "loss_lvr": 0.848115861415863, "loss_mode_switch": 0.0, "loss_total": 0.2709190845489502, "step": 2045 }, { "batch_size": 1, "epoch": 0.818, "step": 2045, "tokens_per_device": 4883 }, { "epoch": 0.818, "loss_ce": 0.002771836007013917, "loss_lvr": 0.36691609025001526, "loss_mode_switch": 0.0, "loss_total": 0.03946344554424286, "step": 2045 }, { "batch_size": 1, "epoch": 0.818, "step": 2045, "tokens_per_device": 7148 }, { "epoch": 0.818, "loss_ce": 0.05669309198856354, "loss_lvr": 0.6482569575309753, "loss_mode_switch": 0.0, "loss_total": 0.12151879072189331, "step": 2045 }, { "batch_size": 4, "epoch": 0.818, "step": 2045, "tokens_per_device": 2588 }, { "epoch": 0.818, "loss_ce": 0.5276628136634827, "loss_lvr": 0.8076863288879395, "loss_mode_switch": 0.0, "loss_total": 0.6084314584732056, "step": 2045 }, { "epoch": 0.8184, "grad_norm": 1.2686498165130615, "learning_rate": 8.40179473206032e-07, "loss": 0.2956, "step": 2046 }, { "batch_size": 4, "epoch": 0.8184, "step": 2046, "tokens_per_device": 5204 }, { "epoch": 0.8184, "loss_ce": 0.2711622714996338, "loss_lvr": 0.8098227977752686, "loss_mode_switch": 0.0, "loss_total": 0.3521445393562317, "step": 2046 }, { "batch_size": 1, "epoch": 0.8184, "step": 2046, "tokens_per_device": 4899 }, { "epoch": 0.8184, "loss_ce": 0.005534249823540449, "loss_lvr": 0.3255816698074341, "loss_mode_switch": 0.0, "loss_total": 0.038092419505119324, "step": 2046 }, { "batch_size": 1, "epoch": 0.8184, "step": 2046, "tokens_per_device": 4881 }, { "epoch": 0.8184, "loss_ce": 0.25641748309135437, "loss_lvr": 0.7239745855331421, "loss_mode_switch": 0.0, "loss_total": 0.32881495356559753, "step": 2046 }, { "batch_size": 4, "epoch": 0.8184, "step": 2046, "tokens_per_device": 1256 }, { "epoch": 0.8184, "loss_ce": 0.3721763491630554, "loss_lvr": 0.9887670278549194, "loss_mode_switch": 0.0, "loss_total": 0.4710530638694763, "step": 2046 }, { "batch_size": 1, "epoch": 0.8184, "step": 2046, "tokens_per_device": 5131 }, { "epoch": 0.8184, "loss_ce": 0.00904612522572279, "loss_lvr": 0.41498109698295593, "loss_mode_switch": 0.0, "loss_total": 0.05054423585534096, "step": 2046 }, { "batch_size": 4, "epoch": 0.8184, "step": 2046, "tokens_per_device": 1600 }, { "epoch": 0.8184, "loss_ce": 0.3080754578113556, "loss_lvr": 1.0894620418548584, "loss_mode_switch": 0.0, "loss_total": 0.41702166199684143, "step": 2046 }, { "batch_size": 4, "epoch": 0.8184, "step": 2046, "tokens_per_device": 4248 }, { "epoch": 0.8184, "loss_ce": 0.6387636065483093, "loss_lvr": 0.8520224690437317, "loss_mode_switch": 0.0, "loss_total": 0.7239658832550049, "step": 2046 }, { "batch_size": 1, "epoch": 0.8184, "step": 2046, "tokens_per_device": 5193 }, { "epoch": 0.8184, "loss_ce": 0.06080373749136925, "loss_lvr": 0.8356537818908691, "loss_mode_switch": 0.0, "loss_total": 0.14436911046504974, "step": 2046 }, { "epoch": 0.8188, "grad_norm": 1.3639739751815796, "learning_rate": 8.365890512684211e-07, "loss": 0.2742, "step": 2047 }, { "batch_size": 1, "epoch": 0.8188, "step": 2047, "tokens_per_device": 4922 }, { "epoch": 0.8188, "loss_ce": 0.0008295575389638543, "loss_lvr": 0.25648289918899536, "loss_mode_switch": 0.0, "loss_total": 0.02647784724831581, "step": 2047 }, { "batch_size": 4, "epoch": 0.8188, "step": 2047, "tokens_per_device": 2716 }, { "epoch": 0.8188, "loss_ce": 0.465027391910553, "loss_lvr": 1.290685772895813, "loss_mode_switch": 0.0, "loss_total": 0.5940959453582764, "step": 2047 }, { "batch_size": 4, "epoch": 0.8188, "step": 2047, "tokens_per_device": 4488 }, { "epoch": 0.8188, "loss_ce": 0.28349485993385315, "loss_lvr": 0.49608781933784485, "loss_mode_switch": 0.0, "loss_total": 0.33310365676879883, "step": 2047 }, { "batch_size": 1, "epoch": 0.8188, "step": 2047, "tokens_per_device": 5092 }, { "epoch": 0.8188, "loss_ce": 0.118568055331707, "loss_lvr": 0.34989091753959656, "loss_mode_switch": 0.0, "loss_total": 0.15355715155601501, "step": 2047 }, { "batch_size": 4, "epoch": 0.8188, "step": 2047, "tokens_per_device": 5808 }, { "epoch": 0.8188, "loss_ce": 0.17668262124061584, "loss_lvr": 0.595414400100708, "loss_mode_switch": 0.0, "loss_total": 0.23622405529022217, "step": 2047 }, { "batch_size": 1, "epoch": 0.8188, "step": 2047, "tokens_per_device": 4669 }, { "epoch": 0.8188, "loss_ce": 0.05360998585820198, "loss_lvr": 0.32992830872535706, "loss_mode_switch": 0.0, "loss_total": 0.0866028219461441, "step": 2047 }, { "batch_size": 1, "epoch": 0.8188, "step": 2047, "tokens_per_device": 4872 }, { "epoch": 0.8188, "loss_ce": 0.2054615467786789, "loss_lvr": 0.5059171915054321, "loss_mode_switch": 0.0, "loss_total": 0.25605326890945435, "step": 2047 }, { "batch_size": 1, "epoch": 0.8188, "step": 2047, "tokens_per_device": 5130 }, { "epoch": 0.8188, "loss_ce": 0.11076962202787399, "loss_lvr": 0.5075393319129944, "loss_mode_switch": 0.0, "loss_total": 0.16152355074882507, "step": 2047 }, { "epoch": 0.8192, "grad_norm": 1.309128761291504, "learning_rate": 8.330056168896628e-07, "loss": 0.3066, "step": 2048 }, { "batch_size": 4, "epoch": 0.8192, "step": 2048, "tokens_per_device": 3536 }, { "epoch": 0.8192, "loss_ce": 0.5463240742683411, "loss_lvr": 0.8848569393157959, "loss_mode_switch": 0.0, "loss_total": 0.6348097920417786, "step": 2048 }, { "batch_size": 4, "epoch": 0.8192, "step": 2048, "tokens_per_device": 4388 }, { "epoch": 0.8192, "loss_ce": 0.0795896053314209, "loss_lvr": 0.8471137285232544, "loss_mode_switch": 0.0, "loss_total": 0.16430097818374634, "step": 2048 }, { "batch_size": 4, "epoch": 0.8192, "step": 2048, "tokens_per_device": 4244 }, { "epoch": 0.8192, "loss_ce": 0.5989058613777161, "loss_lvr": 0.9170612096786499, "loss_mode_switch": 0.0, "loss_total": 0.6906119585037231, "step": 2048 }, { "batch_size": 1, "epoch": 0.8192, "step": 2048, "tokens_per_device": 4890 }, { "epoch": 0.8192, "loss_ce": 0.004973583854734898, "loss_lvr": 0.3413165807723999, "loss_mode_switch": 0.0, "loss_total": 0.039105240255594254, "step": 2048 }, { "batch_size": 4, "epoch": 0.8192, "step": 2048, "tokens_per_device": 5168 }, { "epoch": 0.8192, "loss_ce": 0.20398983359336853, "loss_lvr": 0.8121626377105713, "loss_mode_switch": 0.0, "loss_total": 0.2852061092853546, "step": 2048 }, { "batch_size": 1, "epoch": 0.8192, "step": 2048, "tokens_per_device": 4981 }, { "epoch": 0.8192, "loss_ce": 0.30817100405693054, "loss_lvr": 0.39724263548851013, "loss_mode_switch": 0.0, "loss_total": 0.3478952646255493, "step": 2048 }, { "batch_size": 4, "epoch": 0.8192, "step": 2048, "tokens_per_device": 6968 }, { "epoch": 0.8192, "loss_ce": 0.1826876401901245, "loss_lvr": 0.8463135957717896, "loss_mode_switch": 0.0, "loss_total": 0.267318993806839, "step": 2048 }, { "batch_size": 1, "epoch": 0.8192, "step": 2048, "tokens_per_device": 4948 }, { "epoch": 0.8192, "loss_ce": 0.284330815076828, "loss_lvr": 0.45528244972229004, "loss_mode_switch": 0.0, "loss_total": 0.32985904812812805, "step": 2048 }, { "epoch": 0.8196, "grad_norm": 1.458133339881897, "learning_rate": 8.294291760839268e-07, "loss": 0.327, "step": 2049 }, { "batch_size": 1, "epoch": 0.8196, "step": 2049, "tokens_per_device": 4910 }, { "epoch": 0.8196, "loss_ce": 0.13285696506500244, "loss_lvr": 0.14856737852096558, "loss_mode_switch": 0.0, "loss_total": 0.14771370589733124, "step": 2049 }, { "batch_size": 1, "epoch": 0.8196, "step": 2049, "tokens_per_device": 4867 }, { "epoch": 0.8196, "loss_ce": 0.0037010679952800274, "loss_lvr": 0.1798621118068695, "loss_mode_switch": 0.0, "loss_total": 0.021687280386686325, "step": 2049 }, { "batch_size": 4, "epoch": 0.8196, "step": 2049, "tokens_per_device": 8500 }, { "epoch": 0.8196, "loss_ce": 0.308265745639801, "loss_lvr": 0.8782710433006287, "loss_mode_switch": 0.0, "loss_total": 0.39609286189079285, "step": 2049 }, { "batch_size": 4, "epoch": 0.8196, "step": 2049, "tokens_per_device": 1716 }, { "epoch": 0.8196, "loss_ce": 0.1630222499370575, "loss_lvr": 0.8590794205665588, "loss_mode_switch": 0.0, "loss_total": 0.2489301860332489, "step": 2049 }, { "batch_size": 1, "epoch": 0.8196, "step": 2049, "tokens_per_device": 4996 }, { "epoch": 0.8196, "loss_ce": 0.004744442645460367, "loss_lvr": 0.6252811551094055, "loss_mode_switch": 0.0, "loss_total": 0.06727255880832672, "step": 2049 }, { "batch_size": 4, "epoch": 0.8196, "step": 2049, "tokens_per_device": 6868 }, { "epoch": 0.8196, "loss_ce": 0.0043065110221505165, "loss_lvr": 0.6530809998512268, "loss_mode_switch": 0.0, "loss_total": 0.06961461156606674, "step": 2049 }, { "batch_size": 4, "epoch": 0.8196, "step": 2049, "tokens_per_device": 2624 }, { "epoch": 0.8196, "loss_ce": 0.5015085935592651, "loss_lvr": 0.843980610370636, "loss_mode_switch": 0.0, "loss_total": 0.5859066247940063, "step": 2049 }, { "batch_size": 1, "epoch": 0.8196, "step": 2049, "tokens_per_device": 5015 }, { "epoch": 0.8196, "loss_ce": 0.018907852470874786, "loss_lvr": 0.8902962803840637, "loss_mode_switch": 0.0, "loss_total": 0.10793748497962952, "step": 2049 }, { "epoch": 0.82, "grad_norm": 1.3266284465789795, "learning_rate": 8.258597348536452e-07, "loss": 0.2907, "step": 2050 }, { "batch_size": 4, "epoch": 0.82, "step": 2050, "tokens_per_device": 5864 }, { "epoch": 0.82, "loss_ce": 0.02499440498650074, "loss_lvr": 0.6038274765014648, "loss_mode_switch": 0.0, "loss_total": 0.08537715673446655, "step": 2050 }, { "batch_size": 4, "epoch": 0.82, "step": 2050, "tokens_per_device": 2700 }, { "epoch": 0.82, "loss_ce": 0.13714995980262756, "loss_lvr": 0.6437853574752808, "loss_mode_switch": 0.0, "loss_total": 0.20152848958969116, "step": 2050 }, { "batch_size": 4, "epoch": 0.82, "step": 2050, "tokens_per_device": 1272 }, { "epoch": 0.82, "loss_ce": 0.26421213150024414, "loss_lvr": 1.110716462135315, "loss_mode_switch": 0.0, "loss_total": 0.37528377771377563, "step": 2050 }, { "batch_size": 1, "epoch": 0.82, "step": 2050, "tokens_per_device": 4878 }, { "epoch": 0.82, "loss_ce": 0.018785111606121063, "loss_lvr": 0.44996440410614014, "loss_mode_switch": 0.0, "loss_total": 0.06378155201673508, "step": 2050 }, { "batch_size": 1, "epoch": 0.82, "step": 2050, "tokens_per_device": 4920 }, { "epoch": 0.82, "loss_ce": 0.170066699385643, "loss_lvr": 0.3209003508090973, "loss_mode_switch": 0.0, "loss_total": 0.20215673744678497, "step": 2050 }, { "batch_size": 4, "epoch": 0.82, "step": 2050, "tokens_per_device": 3784 }, { "epoch": 0.82, "loss_ce": 0.31401053071022034, "loss_lvr": 0.5658964514732361, "loss_mode_switch": 0.0, "loss_total": 0.370600163936615, "step": 2050 }, { "batch_size": 1, "epoch": 0.82, "step": 2050, "tokens_per_device": 5741 }, { "epoch": 0.82, "loss_ce": 0.039650898426771164, "loss_lvr": 0.7367388010025024, "loss_mode_switch": 0.0, "loss_total": 0.11332477629184723, "step": 2050 }, { "batch_size": 4, "epoch": 0.82, "step": 2050, "tokens_per_device": 3840 }, { "epoch": 0.82, "loss_ce": 0.636881947517395, "loss_lvr": 1.3701833486557007, "loss_mode_switch": 0.0, "loss_total": 0.7739002704620361, "step": 2050 }, { "epoch": 0.8204, "grad_norm": 1.2437494993209839, "learning_rate": 8.222972991894995e-07, "loss": 0.2744, "step": 2051 }, { "batch_size": 1, "epoch": 0.8204, "step": 2051, "tokens_per_device": 5127 }, { "epoch": 0.8204, "loss_ce": 0.022107595577836037, "loss_lvr": 0.35691702365875244, "loss_mode_switch": 0.0, "loss_total": 0.05779930204153061, "step": 2051 }, { "batch_size": 4, "epoch": 0.8204, "step": 2051, "tokens_per_device": 4400 }, { "epoch": 0.8204, "loss_ce": 0.42153510451316833, "loss_lvr": 0.819179892539978, "loss_mode_switch": 0.0, "loss_total": 0.5034530758857727, "step": 2051 }, { "batch_size": 1, "epoch": 0.8204, "step": 2051, "tokens_per_device": 5107 }, { "epoch": 0.8204, "loss_ce": 0.5886245965957642, "loss_lvr": 0.7554974555969238, "loss_mode_switch": 0.0, "loss_total": 0.6641743183135986, "step": 2051 }, { "batch_size": 4, "epoch": 0.8204, "step": 2051, "tokens_per_device": 4072 }, { "epoch": 0.8204, "loss_ce": 0.10131675004959106, "loss_lvr": 0.8094056248664856, "loss_mode_switch": 0.0, "loss_total": 0.18225732445716858, "step": 2051 }, { "batch_size": 1, "epoch": 0.8204, "step": 2051, "tokens_per_device": 4876 }, { "epoch": 0.8204, "loss_ce": 0.0011845130939036608, "loss_lvr": 0.4757458567619324, "loss_mode_switch": 0.0, "loss_total": 0.0487590990960598, "step": 2051 }, { "batch_size": 4, "epoch": 0.8204, "step": 2051, "tokens_per_device": 3744 }, { "epoch": 0.8204, "loss_ce": 0.3267720639705658, "loss_lvr": 0.8756675124168396, "loss_mode_switch": 0.0, "loss_total": 0.4143388271331787, "step": 2051 }, { "batch_size": 4, "epoch": 0.8204, "step": 2051, "tokens_per_device": 5712 }, { "epoch": 0.8204, "loss_ce": 0.003226896282285452, "loss_lvr": 0.7596696019172668, "loss_mode_switch": 0.0, "loss_total": 0.07919386029243469, "step": 2051 }, { "batch_size": 4, "epoch": 0.8204, "step": 2051, "tokens_per_device": 5448 }, { "epoch": 0.8204, "loss_ce": 0.08865485340356827, "loss_lvr": 0.7329510450363159, "loss_mode_switch": 0.0, "loss_total": 0.16194996237754822, "step": 2051 }, { "epoch": 0.8208, "grad_norm": 1.327527403831482, "learning_rate": 8.187418750704202e-07, "loss": 0.2803, "step": 2052 }, { "batch_size": 1, "epoch": 0.8208, "step": 2052, "tokens_per_device": 4863 }, { "epoch": 0.8208, "loss_ce": 0.0004754294059239328, "loss_lvr": 0.5317366719245911, "loss_mode_switch": 0.0, "loss_total": 0.05364909768104553, "step": 2052 }, { "batch_size": 4, "epoch": 0.8208, "step": 2052, "tokens_per_device": 16300 }, { "epoch": 0.8208, "loss_ce": 0.27121224999427795, "loss_lvr": 0.8458694815635681, "loss_mode_switch": 0.0, "loss_total": 0.35579919815063477, "step": 2052 }, { "batch_size": 1, "epoch": 0.8208, "step": 2052, "tokens_per_device": 5100 }, { "epoch": 0.8208, "loss_ce": 0.01567227393388748, "loss_lvr": 0.13796715438365936, "loss_mode_switch": 0.0, "loss_total": 0.029468990862369537, "step": 2052 }, { "batch_size": 4, "epoch": 0.8208, "step": 2052, "tokens_per_device": 3828 }, { "epoch": 0.8208, "loss_ce": 0.3978126347064972, "loss_lvr": 0.5145260691642761, "loss_mode_switch": 0.0, "loss_total": 0.4492652416229248, "step": 2052 }, { "batch_size": 1, "epoch": 0.8208, "step": 2052, "tokens_per_device": 4299 }, { "epoch": 0.8208, "loss_ce": 0.0016788128996267915, "loss_lvr": 0.7289250493049622, "loss_mode_switch": 0.0, "loss_total": 0.07457131892442703, "step": 2052 }, { "batch_size": 4, "epoch": 0.8208, "step": 2052, "tokens_per_device": 1336 }, { "epoch": 0.8208, "loss_ce": 0.48785996437072754, "loss_lvr": 0.8975433707237244, "loss_mode_switch": 0.0, "loss_total": 0.5776143074035645, "step": 2052 }, { "batch_size": 1, "epoch": 0.8208, "step": 2052, "tokens_per_device": 5114 }, { "epoch": 0.8208, "loss_ce": 0.0048522548750042915, "loss_lvr": 0.21385174989700317, "loss_mode_switch": 0.0, "loss_total": 0.026237431913614273, "step": 2052 }, { "batch_size": 4, "epoch": 0.8208, "step": 2052, "tokens_per_device": 2684 }, { "epoch": 0.8208, "loss_ce": 0.13196761906147003, "loss_lvr": 0.6377503871917725, "loss_mode_switch": 0.0, "loss_total": 0.195742666721344, "step": 2052 }, { "epoch": 0.8212, "grad_norm": 1.1323164701461792, "learning_rate": 8.151934684635632e-07, "loss": 0.2274, "step": 2053 }, { "batch_size": 4, "epoch": 0.8212, "step": 2053, "tokens_per_device": 3968 }, { "epoch": 0.8212, "loss_ce": 0.1450374573469162, "loss_lvr": 0.8380333781242371, "loss_mode_switch": 0.0, "loss_total": 0.22884079813957214, "step": 2053 }, { "batch_size": 1, "epoch": 0.8212, "step": 2053, "tokens_per_device": 4899 }, { "epoch": 0.8212, "loss_ce": 0.005176307167857885, "loss_lvr": 0.31176885962486267, "loss_mode_switch": 0.0, "loss_total": 0.03635319322347641, "step": 2053 }, { "batch_size": 4, "epoch": 0.8212, "step": 2053, "tokens_per_device": 2928 }, { "epoch": 0.8212, "loss_ce": 0.16551034152507782, "loss_lvr": 0.8415511250495911, "loss_mode_switch": 0.0, "loss_total": 0.24966545403003693, "step": 2053 }, { "batch_size": 4, "epoch": 0.8212, "step": 2053, "tokens_per_device": 2636 }, { "epoch": 0.8212, "loss_ce": 0.6871161460876465, "loss_lvr": 0.8400132656097412, "loss_mode_switch": 0.0, "loss_total": 0.7711174488067627, "step": 2053 }, { "batch_size": 4, "epoch": 0.8212, "step": 2053, "tokens_per_device": 4308 }, { "epoch": 0.8212, "loss_ce": 0.04286859557032585, "loss_lvr": 0.7960441708564758, "loss_mode_switch": 0.0, "loss_total": 0.12247301638126373, "step": 2053 }, { "batch_size": 1, "epoch": 0.8212, "step": 2053, "tokens_per_device": 5099 }, { "epoch": 0.8212, "loss_ce": 0.007620937656611204, "loss_lvr": 0.19436101615428925, "loss_mode_switch": 0.0, "loss_total": 0.027057040482759476, "step": 2053 }, { "batch_size": 4, "epoch": 0.8212, "step": 2053, "tokens_per_device": 1348 }, { "epoch": 0.8212, "loss_ce": 0.68354332447052, "loss_lvr": 0.9647433161735535, "loss_mode_switch": 0.0, "loss_total": 0.7800176739692688, "step": 2053 }, { "batch_size": 1, "epoch": 0.8212, "step": 2053, "tokens_per_device": 4918 }, { "epoch": 0.8212, "loss_ce": 0.3524295389652252, "loss_lvr": 0.38962239027023315, "loss_mode_switch": 0.0, "loss_total": 0.391391783952713, "step": 2053 }, { "epoch": 0.8216, "grad_norm": 1.368512511253357, "learning_rate": 8.116520853243126e-07, "loss": 0.2986, "step": 2054 }, { "batch_size": 4, "epoch": 0.8216, "step": 2054, "tokens_per_device": 3488 }, { "epoch": 0.8216, "loss_ce": 0.12315843999385834, "loss_lvr": 0.7789149284362793, "loss_mode_switch": 0.0, "loss_total": 0.20104992389678955, "step": 2054 }, { "batch_size": 4, "epoch": 0.8216, "step": 2054, "tokens_per_device": 1780 }, { "epoch": 0.8216, "loss_ce": 0.3486889898777008, "loss_lvr": 0.7851181030273438, "loss_mode_switch": 0.0, "loss_total": 0.4272007942199707, "step": 2054 }, { "batch_size": 4, "epoch": 0.8216, "step": 2054, "tokens_per_device": 14052 }, { "epoch": 0.8216, "loss_ce": 0.36484333872795105, "loss_lvr": 0.9032975435256958, "loss_mode_switch": 0.0, "loss_total": 0.4551731050014496, "step": 2054 }, { "batch_size": 1, "epoch": 0.8216, "step": 2054, "tokens_per_device": 5545 }, { "epoch": 0.8216, "loss_ce": 0.0027651276905089617, "loss_lvr": 0.32516738772392273, "loss_mode_switch": 0.0, "loss_total": 0.035281866788864136, "step": 2054 }, { "batch_size": 1, "epoch": 0.8216, "step": 2054, "tokens_per_device": 4757 }, { "epoch": 0.8216, "loss_ce": 0.0023123640567064285, "loss_lvr": 0.2566124200820923, "loss_mode_switch": 0.0, "loss_total": 0.027973607182502747, "step": 2054 }, { "batch_size": 4, "epoch": 0.8216, "step": 2054, "tokens_per_device": 4888 }, { "epoch": 0.8216, "loss_ce": 0.20004448294639587, "loss_lvr": 0.814942479133606, "loss_mode_switch": 0.0, "loss_total": 0.281538724899292, "step": 2054 }, { "batch_size": 4, "epoch": 0.8216, "step": 2054, "tokens_per_device": 6804 }, { "epoch": 0.8216, "loss_ce": 0.2500152885913849, "loss_lvr": 0.6548993587493896, "loss_mode_switch": 0.0, "loss_total": 0.3155052363872528, "step": 2054 }, { "batch_size": 4, "epoch": 0.8216, "step": 2054, "tokens_per_device": 4588 }, { "epoch": 0.8216, "loss_ce": 0.45816168189048767, "loss_lvr": 0.8617743253707886, "loss_mode_switch": 0.0, "loss_total": 0.544339120388031, "step": 2054 }, { "epoch": 0.822, "grad_norm": 1.2783336639404297, "learning_rate": 8.081177315962601e-07, "loss": 0.2809, "step": 2055 }, { "batch_size": 4, "epoch": 0.822, "step": 2055, "tokens_per_device": 1488 }, { "epoch": 0.822, "loss_ce": 0.5091072916984558, "loss_lvr": 0.8109797835350037, "loss_mode_switch": 0.0, "loss_total": 0.5902052521705627, "step": 2055 }, { "batch_size": 1, "epoch": 0.822, "step": 2055, "tokens_per_device": 5106 }, { "epoch": 0.822, "loss_ce": 0.002287869807332754, "loss_lvr": 0.3546312153339386, "loss_mode_switch": 0.0, "loss_total": 0.03775098919868469, "step": 2055 }, { "batch_size": 4, "epoch": 0.822, "step": 2055, "tokens_per_device": 1396 }, { "epoch": 0.822, "loss_ce": 0.6631330847740173, "loss_lvr": 1.0356947183609009, "loss_mode_switch": 0.0, "loss_total": 0.7667025327682495, "step": 2055 }, { "batch_size": 4, "epoch": 0.822, "step": 2055, "tokens_per_device": 4816 }, { "epoch": 0.822, "loss_ce": 0.2762812376022339, "loss_lvr": 0.6792504191398621, "loss_mode_switch": 0.0, "loss_total": 0.3442062735557556, "step": 2055 }, { "batch_size": 4, "epoch": 0.822, "step": 2055, "tokens_per_device": 2632 }, { "epoch": 0.822, "loss_ce": 0.27359622716903687, "loss_lvr": 0.9289664626121521, "loss_mode_switch": 0.0, "loss_total": 0.3664928674697876, "step": 2055 }, { "batch_size": 4, "epoch": 0.822, "step": 2055, "tokens_per_device": 3500 }, { "epoch": 0.822, "loss_ce": 0.46990442276000977, "loss_lvr": 0.850274384021759, "loss_mode_switch": 0.0, "loss_total": 0.5549318790435791, "step": 2055 }, { "batch_size": 1, "epoch": 0.822, "step": 2055, "tokens_per_device": 5571 }, { "epoch": 0.822, "loss_ce": 0.006802178919315338, "loss_lvr": 0.295306533575058, "loss_mode_switch": 0.0, "loss_total": 0.03633283078670502, "step": 2055 }, { "batch_size": 4, "epoch": 0.822, "step": 2055, "tokens_per_device": 5464 }, { "epoch": 0.822, "loss_ce": 0.11291755735874176, "loss_lvr": 0.7593640685081482, "loss_mode_switch": 0.0, "loss_total": 0.18885396420955658, "step": 2055 }, { "epoch": 0.8224, "grad_norm": 1.285693883895874, "learning_rate": 8.04590413211202e-07, "loss": 0.3015, "step": 2056 }, { "batch_size": 1, "epoch": 0.8224, "step": 2056, "tokens_per_device": 4873 }, { "epoch": 0.8224, "loss_ce": 0.015601112507283688, "loss_lvr": 0.203988716006279, "loss_mode_switch": 0.0, "loss_total": 0.03599998354911804, "step": 2056 }, { "batch_size": 1, "epoch": 0.8224, "step": 2056, "tokens_per_device": 5161 }, { "epoch": 0.8224, "loss_ce": 0.020787060260772705, "loss_lvr": 0.2800532579421997, "loss_mode_switch": 0.0, "loss_total": 0.048792384564876556, "step": 2056 }, { "batch_size": 1, "epoch": 0.8224, "step": 2056, "tokens_per_device": 4751 }, { "epoch": 0.8224, "loss_ce": 0.18594826757907867, "loss_lvr": 0.5357386469841003, "loss_mode_switch": 0.0, "loss_total": 0.23952212929725647, "step": 2056 }, { "batch_size": 4, "epoch": 0.8224, "step": 2056, "tokens_per_device": 2016 }, { "epoch": 0.8224, "loss_ce": 0.13386090099811554, "loss_lvr": 0.8020509481430054, "loss_mode_switch": 0.0, "loss_total": 0.21406599879264832, "step": 2056 }, { "batch_size": 4, "epoch": 0.8224, "step": 2056, "tokens_per_device": 4200 }, { "epoch": 0.8224, "loss_ce": 0.034229911863803864, "loss_lvr": 1.1259256601333618, "loss_mode_switch": 0.0, "loss_total": 0.1468224823474884, "step": 2056 }, { "batch_size": 4, "epoch": 0.8224, "step": 2056, "tokens_per_device": 5768 }, { "epoch": 0.8224, "loss_ce": 0.7581503987312317, "loss_lvr": 0.7913725972175598, "loss_mode_switch": 0.0, "loss_total": 0.8372876644134521, "step": 2056 }, { "batch_size": 4, "epoch": 0.8224, "step": 2056, "tokens_per_device": 1496 }, { "epoch": 0.8224, "loss_ce": 0.5830780267715454, "loss_lvr": 0.9311103820800781, "loss_mode_switch": 0.0, "loss_total": 0.6761890649795532, "step": 2056 }, { "batch_size": 4, "epoch": 0.8224, "step": 2056, "tokens_per_device": 4264 }, { "epoch": 0.8224, "loss_ce": 0.43918702006340027, "loss_lvr": 1.0800243616104126, "loss_mode_switch": 0.0, "loss_total": 0.547189474105835, "step": 2056 }, { "epoch": 0.8228, "grad_norm": 1.3448913097381592, "learning_rate": 8.010701360891265e-07, "loss": 0.2964, "step": 2057 }, { "batch_size": 4, "epoch": 0.8228, "step": 2057, "tokens_per_device": 1304 }, { "epoch": 0.8228, "loss_ce": 0.64317387342453, "loss_lvr": 2.125328302383423, "loss_mode_switch": 0.0, "loss_total": 0.8557066917419434, "step": 2057 }, { "batch_size": 4, "epoch": 0.8228, "step": 2057, "tokens_per_device": 5548 }, { "epoch": 0.8228, "loss_ce": 0.35922110080718994, "loss_lvr": 0.6026977896690369, "loss_mode_switch": 0.0, "loss_total": 0.41949087381362915, "step": 2057 }, { "batch_size": 1, "epoch": 0.8228, "step": 2057, "tokens_per_device": 5319 }, { "epoch": 0.8228, "loss_ce": 0.13252778351306915, "loss_lvr": 0.618284285068512, "loss_mode_switch": 0.0, "loss_total": 0.19435621798038483, "step": 2057 }, { "batch_size": 4, "epoch": 0.8228, "step": 2057, "tokens_per_device": 9540 }, { "epoch": 0.8228, "loss_ce": 0.2846265435218811, "loss_lvr": 0.7325103282928467, "loss_mode_switch": 0.0, "loss_total": 0.35787758231163025, "step": 2057 }, { "batch_size": 4, "epoch": 0.8228, "step": 2057, "tokens_per_device": 3976 }, { "epoch": 0.8228, "loss_ce": 0.267403781414032, "loss_lvr": 0.9550036787986755, "loss_mode_switch": 0.0, "loss_total": 0.3629041612148285, "step": 2057 }, { "batch_size": 4, "epoch": 0.8228, "step": 2057, "tokens_per_device": 5784 }, { "epoch": 0.8228, "loss_ce": 0.1857200413942337, "loss_lvr": 0.809059202671051, "loss_mode_switch": 0.0, "loss_total": 0.2666259706020355, "step": 2057 }, { "batch_size": 4, "epoch": 0.8228, "step": 2057, "tokens_per_device": 2816 }, { "epoch": 0.8228, "loss_ce": 0.10398933291435242, "loss_lvr": 0.5616342425346375, "loss_mode_switch": 0.0, "loss_total": 0.16015276312828064, "step": 2057 }, { "batch_size": 4, "epoch": 0.8228, "step": 2057, "tokens_per_device": 3784 }, { "epoch": 0.8228, "loss_ce": 0.28928935527801514, "loss_lvr": 1.2769596576690674, "loss_mode_switch": 0.0, "loss_total": 0.41698533296585083, "step": 2057 }, { "epoch": 0.8232, "grad_norm": 1.3715466260910034, "learning_rate": 7.975569061382066e-07, "loss": 0.2869, "step": 2058 }, { "batch_size": 1, "epoch": 0.8232, "step": 2058, "tokens_per_device": 4960 }, { "epoch": 0.8232, "loss_ce": 0.02339230850338936, "loss_lvr": 0.29575660824775696, "loss_mode_switch": 0.0, "loss_total": 0.052967969328165054, "step": 2058 }, { "batch_size": 1, "epoch": 0.8232, "step": 2058, "tokens_per_device": 5294 }, { "epoch": 0.8232, "loss_ce": 0.08698424696922302, "loss_lvr": 0.55475252866745, "loss_mode_switch": 0.0, "loss_total": 0.14245949685573578, "step": 2058 }, { "batch_size": 4, "epoch": 0.8232, "step": 2058, "tokens_per_device": 6432 }, { "epoch": 0.8232, "loss_ce": 0.04929399490356445, "loss_lvr": 0.6918288469314575, "loss_mode_switch": 0.0, "loss_total": 0.11847688257694244, "step": 2058 }, { "batch_size": 4, "epoch": 0.8232, "step": 2058, "tokens_per_device": 4612 }, { "epoch": 0.8232, "loss_ce": 0.33135440945625305, "loss_lvr": 1.2275491952896118, "loss_mode_switch": 0.0, "loss_total": 0.4541093409061432, "step": 2058 }, { "batch_size": 4, "epoch": 0.8232, "step": 2058, "tokens_per_device": 2656 }, { "epoch": 0.8232, "loss_ce": 0.3070894777774811, "loss_lvr": 0.9352502226829529, "loss_mode_switch": 0.0, "loss_total": 0.40061450004577637, "step": 2058 }, { "batch_size": 4, "epoch": 0.8232, "step": 2058, "tokens_per_device": 2820 }, { "epoch": 0.8232, "loss_ce": 0.4582729637622833, "loss_lvr": 0.49470654129981995, "loss_mode_switch": 0.0, "loss_total": 0.5077435970306396, "step": 2058 }, { "batch_size": 4, "epoch": 0.8232, "step": 2058, "tokens_per_device": 4260 }, { "epoch": 0.8232, "loss_ce": 0.22218725085258484, "loss_lvr": 1.1842316389083862, "loss_mode_switch": 0.0, "loss_total": 0.34061041474342346, "step": 2058 }, { "batch_size": 4, "epoch": 0.8232, "step": 2058, "tokens_per_device": 6168 }, { "epoch": 0.8232, "loss_ce": 0.10139701515436172, "loss_lvr": 0.6471573710441589, "loss_mode_switch": 0.0, "loss_total": 0.1661127507686615, "step": 2058 }, { "epoch": 0.8236, "grad_norm": 1.330970287322998, "learning_rate": 7.94050729254785e-07, "loss": 0.3071, "step": 2059 }, { "batch_size": 4, "epoch": 0.8236, "step": 2059, "tokens_per_device": 1760 }, { "epoch": 0.8236, "loss_ce": 0.6159422993659973, "loss_lvr": 0.9685966968536377, "loss_mode_switch": 0.0, "loss_total": 0.712801992893219, "step": 2059 }, { "batch_size": 4, "epoch": 0.8236, "step": 2059, "tokens_per_device": 3960 }, { "epoch": 0.8236, "loss_ce": 0.3117307126522064, "loss_lvr": 0.7626904845237732, "loss_mode_switch": 0.0, "loss_total": 0.3879997730255127, "step": 2059 }, { "batch_size": 1, "epoch": 0.8236, "step": 2059, "tokens_per_device": 5056 }, { "epoch": 0.8236, "loss_ce": 0.001313177403062582, "loss_lvr": 0.41145744919776917, "loss_mode_switch": 0.0, "loss_total": 0.042458921670913696, "step": 2059 }, { "batch_size": 1, "epoch": 0.8236, "step": 2059, "tokens_per_device": 4911 }, { "epoch": 0.8236, "loss_ce": 0.039343785494565964, "loss_lvr": 0.6275877952575684, "loss_mode_switch": 0.0, "loss_total": 0.10210256278514862, "step": 2059 }, { "batch_size": 4, "epoch": 0.8236, "step": 2059, "tokens_per_device": 2688 }, { "epoch": 0.8236, "loss_ce": 0.6766834259033203, "loss_lvr": 0.8047053217887878, "loss_mode_switch": 0.0, "loss_total": 0.7571539878845215, "step": 2059 }, { "batch_size": 4, "epoch": 0.8236, "step": 2059, "tokens_per_device": 1248 }, { "epoch": 0.8236, "loss_ce": 0.13622701168060303, "loss_lvr": 0.9488679766654968, "loss_mode_switch": 0.0, "loss_total": 0.23111382126808167, "step": 2059 }, { "batch_size": 4, "epoch": 0.8236, "step": 2059, "tokens_per_device": 8168 }, { "epoch": 0.8236, "loss_ce": 0.2985050082206726, "loss_lvr": 0.8786961436271667, "loss_mode_switch": 0.0, "loss_total": 0.3863746225833893, "step": 2059 }, { "batch_size": 1, "epoch": 0.8236, "step": 2059, "tokens_per_device": 4873 }, { "epoch": 0.8236, "loss_ce": 0.1509220004081726, "loss_lvr": 0.6368682980537415, "loss_mode_switch": 0.0, "loss_total": 0.214608833193779, "step": 2059 }, { "epoch": 0.824, "grad_norm": 1.2976126670837402, "learning_rate": 7.905516113233652e-07, "loss": 0.3172, "step": 2060 }, { "batch_size": 1, "epoch": 0.824, "step": 2060, "tokens_per_device": 4866 }, { "epoch": 0.824, "loss_ce": 0.0036137206479907036, "loss_lvr": 0.3526655435562134, "loss_mode_switch": 0.0, "loss_total": 0.038880277425050735, "step": 2060 }, { "batch_size": 1, "epoch": 0.824, "step": 2060, "tokens_per_device": 4938 }, { "epoch": 0.824, "loss_ce": 0.0417812243103981, "loss_lvr": 0.3199913501739502, "loss_mode_switch": 0.0, "loss_total": 0.073780357837677, "step": 2060 }, { "batch_size": 1, "epoch": 0.824, "step": 2060, "tokens_per_device": 4881 }, { "epoch": 0.824, "loss_ce": 6.923506589373574e-05, "loss_lvr": 0.2814336121082306, "loss_mode_switch": 0.0, "loss_total": 0.028212595731019974, "step": 2060 }, { "batch_size": 4, "epoch": 0.824, "step": 2060, "tokens_per_device": 2724 }, { "epoch": 0.824, "loss_ce": 0.3534122705459595, "loss_lvr": 0.8732853531837463, "loss_mode_switch": 0.0, "loss_total": 0.44074082374572754, "step": 2060 }, { "batch_size": 4, "epoch": 0.824, "step": 2060, "tokens_per_device": 1884 }, { "epoch": 0.824, "loss_ce": 0.7500260472297668, "loss_lvr": 0.8802016377449036, "loss_mode_switch": 0.0, "loss_total": 0.8380461931228638, "step": 2060 }, { "batch_size": 4, "epoch": 0.824, "step": 2060, "tokens_per_device": 6536 }, { "epoch": 0.824, "loss_ce": 0.377070814371109, "loss_lvr": 0.7215926051139832, "loss_mode_switch": 0.0, "loss_total": 0.4492300748825073, "step": 2060 }, { "batch_size": 1, "epoch": 0.824, "step": 2060, "tokens_per_device": 5175 }, { "epoch": 0.824, "loss_ce": 0.03225626423954964, "loss_lvr": 0.5366254448890686, "loss_mode_switch": 0.0, "loss_total": 0.08591881394386292, "step": 2060 }, { "batch_size": 4, "epoch": 0.824, "step": 2060, "tokens_per_device": 4304 }, { "epoch": 0.824, "loss_ce": 0.39898937940597534, "loss_lvr": 1.0324233770370483, "loss_mode_switch": 0.0, "loss_total": 0.5022317171096802, "step": 2060 }, { "epoch": 0.8244, "grad_norm": 1.2149574756622314, "learning_rate": 7.870595582166096e-07, "loss": 0.2819, "step": 2061 }, { "batch_size": 4, "epoch": 0.8244, "step": 2061, "tokens_per_device": 3816 }, { "epoch": 0.8244, "loss_ce": 0.6739569902420044, "loss_lvr": 1.1914284229278564, "loss_mode_switch": 0.0, "loss_total": 0.7930998206138611, "step": 2061 }, { "batch_size": 4, "epoch": 0.8244, "step": 2061, "tokens_per_device": 2568 }, { "epoch": 0.8244, "loss_ce": 0.43762269616127014, "loss_lvr": 1.12044095993042, "loss_mode_switch": 0.0, "loss_total": 0.5496667623519897, "step": 2061 }, { "batch_size": 4, "epoch": 0.8244, "step": 2061, "tokens_per_device": 10764 }, { "epoch": 0.8244, "loss_ce": 0.5561770796775818, "loss_lvr": 0.4809267520904541, "loss_mode_switch": 0.0, "loss_total": 0.6042697429656982, "step": 2061 }, { "batch_size": 4, "epoch": 0.8244, "step": 2061, "tokens_per_device": 12720 }, { "epoch": 0.8244, "loss_ce": 0.3429071605205536, "loss_lvr": 0.9399569034576416, "loss_mode_switch": 0.0, "loss_total": 0.43690285086631775, "step": 2061 }, { "batch_size": 4, "epoch": 0.8244, "step": 2061, "tokens_per_device": 7392 }, { "epoch": 0.8244, "loss_ce": 0.20251727104187012, "loss_lvr": 0.6465174555778503, "loss_mode_switch": 0.0, "loss_total": 0.2671690285205841, "step": 2061 }, { "batch_size": 1, "epoch": 0.8244, "step": 2061, "tokens_per_device": 4890 }, { "epoch": 0.8244, "loss_ce": 0.126020610332489, "loss_lvr": 0.6073825359344482, "loss_mode_switch": 0.0, "loss_total": 0.1867588609457016, "step": 2061 }, { "batch_size": 4, "epoch": 0.8244, "step": 2061, "tokens_per_device": 4224 }, { "epoch": 0.8244, "loss_ce": 0.36357057094573975, "loss_lvr": 1.008804440498352, "loss_mode_switch": 0.0, "loss_total": 0.46445101499557495, "step": 2061 }, { "batch_size": 1, "epoch": 0.8244, "step": 2061, "tokens_per_device": 5234 }, { "epoch": 0.8244, "loss_ce": 0.1649167388677597, "loss_lvr": 0.545595109462738, "loss_mode_switch": 0.0, "loss_total": 0.21947625279426575, "step": 2061 }, { "epoch": 0.8248, "grad_norm": 1.3401002883911133, "learning_rate": 7.835745757953178e-07, "loss": 0.2995, "step": 2062 }, { "batch_size": 1, "epoch": 0.8248, "step": 2062, "tokens_per_device": 4893 }, { "epoch": 0.8248, "loss_ce": 0.0002771221043076366, "loss_lvr": 0.3829192519187927, "loss_mode_switch": 0.0, "loss_total": 0.038569048047065735, "step": 2062 }, { "batch_size": 4, "epoch": 0.8248, "step": 2062, "tokens_per_device": 4636 }, { "epoch": 0.8248, "loss_ce": 0.13498306274414062, "loss_lvr": 0.7890415191650391, "loss_mode_switch": 0.0, "loss_total": 0.21388721466064453, "step": 2062 }, { "batch_size": 4, "epoch": 0.8248, "step": 2062, "tokens_per_device": 5108 }, { "epoch": 0.8248, "loss_ce": 0.4709831476211548, "loss_lvr": 0.9112580418586731, "loss_mode_switch": 0.0, "loss_total": 0.5621089339256287, "step": 2062 }, { "batch_size": 4, "epoch": 0.8248, "step": 2062, "tokens_per_device": 3928 }, { "epoch": 0.8248, "loss_ce": 0.2136746197938919, "loss_lvr": 0.871671199798584, "loss_mode_switch": 0.0, "loss_total": 0.300841748714447, "step": 2062 }, { "batch_size": 1, "epoch": 0.8248, "step": 2062, "tokens_per_device": 5083 }, { "epoch": 0.8248, "loss_ce": 0.12371769547462463, "loss_lvr": 0.29997479915618896, "loss_mode_switch": 0.0, "loss_total": 0.15371517837047577, "step": 2062 }, { "batch_size": 4, "epoch": 0.8248, "step": 2062, "tokens_per_device": 4224 }, { "epoch": 0.8248, "loss_ce": 0.28772974014282227, "loss_lvr": 0.8387455344200134, "loss_mode_switch": 0.0, "loss_total": 0.3716042935848236, "step": 2062 }, { "batch_size": 4, "epoch": 0.8248, "step": 2062, "tokens_per_device": 5948 }, { "epoch": 0.8248, "loss_ce": 0.1368899643421173, "loss_lvr": 0.7273334264755249, "loss_mode_switch": 0.0, "loss_total": 0.2096233069896698, "step": 2062 }, { "batch_size": 4, "epoch": 0.8248, "step": 2062, "tokens_per_device": 4668 }, { "epoch": 0.8248, "loss_ce": 0.07870427519083023, "loss_lvr": 0.8253815174102783, "loss_mode_switch": 0.0, "loss_total": 0.16124242544174194, "step": 2062 }, { "epoch": 0.8252, "grad_norm": 1.328696846961975, "learning_rate": 7.800966699084262e-07, "loss": 0.2901, "step": 2063 }, { "batch_size": 4, "epoch": 0.8252, "step": 2063, "tokens_per_device": 4192 }, { "epoch": 0.8252, "loss_ce": 0.05460912361741066, "loss_lvr": 0.8077712059020996, "loss_mode_switch": 0.0, "loss_total": 0.13538624346256256, "step": 2063 }, { "batch_size": 1, "epoch": 0.8252, "step": 2063, "tokens_per_device": 7069 }, { "epoch": 0.8252, "loss_ce": 0.00021005523740313947, "loss_lvr": 0.2911834120750427, "loss_mode_switch": 0.0, "loss_total": 0.029328398406505585, "step": 2063 }, { "batch_size": 4, "epoch": 0.8252, "step": 2063, "tokens_per_device": 4596 }, { "epoch": 0.8252, "loss_ce": 0.5528987646102905, "loss_lvr": 0.7479121088981628, "loss_mode_switch": 0.0, "loss_total": 0.6276899576187134, "step": 2063 }, { "batch_size": 4, "epoch": 0.8252, "step": 2063, "tokens_per_device": 2580 }, { "epoch": 0.8252, "loss_ce": 0.47957831621170044, "loss_lvr": 0.9063376784324646, "loss_mode_switch": 0.0, "loss_total": 0.5702120661735535, "step": 2063 }, { "batch_size": 4, "epoch": 0.8252, "step": 2063, "tokens_per_device": 5480 }, { "epoch": 0.8252, "loss_ce": 0.21359014511108398, "loss_lvr": 0.8940162658691406, "loss_mode_switch": 0.0, "loss_total": 0.3029917776584625, "step": 2063 }, { "batch_size": 4, "epoch": 0.8252, "step": 2063, "tokens_per_device": 14128 }, { "epoch": 0.8252, "loss_ce": 0.06373432278633118, "loss_lvr": 0.9009420275688171, "loss_mode_switch": 0.0, "loss_total": 0.15382853150367737, "step": 2063 }, { "batch_size": 4, "epoch": 0.8252, "step": 2063, "tokens_per_device": 3844 }, { "epoch": 0.8252, "loss_ce": 0.047738198190927505, "loss_lvr": 1.110395908355713, "loss_mode_switch": 0.0, "loss_total": 0.15877778828144073, "step": 2063 }, { "batch_size": 4, "epoch": 0.8252, "step": 2063, "tokens_per_device": 3792 }, { "epoch": 0.8252, "loss_ce": 0.18812932074069977, "loss_lvr": 0.4753355383872986, "loss_mode_switch": 0.0, "loss_total": 0.23566287755966187, "step": 2063 }, { "epoch": 0.8256, "grad_norm": 1.1019768714904785, "learning_rate": 7.766258463929926e-07, "loss": 0.2305, "step": 2064 }, { "batch_size": 1, "epoch": 0.8256, "step": 2064, "tokens_per_device": 5072 }, { "epoch": 0.8256, "loss_ce": 0.00543533219024539, "loss_lvr": 0.256486713886261, "loss_mode_switch": 0.0, "loss_total": 0.031084004789590836, "step": 2064 }, { "batch_size": 4, "epoch": 0.8256, "step": 2064, "tokens_per_device": 4272 }, { "epoch": 0.8256, "loss_ce": 0.24277842044830322, "loss_lvr": 0.826366662979126, "loss_mode_switch": 0.0, "loss_total": 0.32541507482528687, "step": 2064 }, { "batch_size": 1, "epoch": 0.8256, "step": 2064, "tokens_per_device": 4897 }, { "epoch": 0.8256, "loss_ce": 0.07357624918222427, "loss_lvr": 0.7920189499855042, "loss_mode_switch": 0.0, "loss_total": 0.15277814865112305, "step": 2064 }, { "batch_size": 1, "epoch": 0.8256, "step": 2064, "tokens_per_device": 4980 }, { "epoch": 0.8256, "loss_ce": 0.022789163514971733, "loss_lvr": 0.6091841459274292, "loss_mode_switch": 0.0, "loss_total": 0.08370757848024368, "step": 2064 }, { "batch_size": 4, "epoch": 0.8256, "step": 2064, "tokens_per_device": 4192 }, { "epoch": 0.8256, "loss_ce": 0.03752971813082695, "loss_lvr": 0.8271253705024719, "loss_mode_switch": 0.0, "loss_total": 0.12024225294589996, "step": 2064 }, { "batch_size": 1, "epoch": 0.8256, "step": 2064, "tokens_per_device": 5327 }, { "epoch": 0.8256, "loss_ce": 0.03387841954827309, "loss_lvr": 0.34580132365226746, "loss_mode_switch": 0.0, "loss_total": 0.06845855712890625, "step": 2064 }, { "batch_size": 4, "epoch": 0.8256, "step": 2064, "tokens_per_device": 4280 }, { "epoch": 0.8256, "loss_ce": 0.27450984716415405, "loss_lvr": 0.4388832747936249, "loss_mode_switch": 0.0, "loss_total": 0.3183981776237488, "step": 2064 }, { "batch_size": 1, "epoch": 0.8256, "step": 2064, "tokens_per_device": 6263 }, { "epoch": 0.8256, "loss_ce": 0.0014856340130791068, "loss_lvr": 0.28280603885650635, "loss_mode_switch": 0.0, "loss_total": 0.02976623922586441, "step": 2064 }, { "epoch": 0.826, "grad_norm": 1.235396385192871, "learning_rate": 7.731621110741871e-07, "loss": 0.2541, "step": 2065 }, { "batch_size": 1, "epoch": 0.826, "step": 2065, "tokens_per_device": 4989 }, { "epoch": 0.826, "loss_ce": 0.05978323146700859, "loss_lvr": 0.5224073529243469, "loss_mode_switch": 0.0, "loss_total": 0.1120239645242691, "step": 2065 }, { "batch_size": 4, "epoch": 0.826, "step": 2065, "tokens_per_device": 5724 }, { "epoch": 0.826, "loss_ce": 0.04575985670089722, "loss_lvr": 1.1539692878723145, "loss_mode_switch": 0.0, "loss_total": 0.1611567884683609, "step": 2065 }, { "batch_size": 1, "epoch": 0.826, "step": 2065, "tokens_per_device": 4893 }, { "epoch": 0.826, "loss_ce": 0.0049699535593390465, "loss_lvr": 0.35406601428985596, "loss_mode_switch": 0.0, "loss_total": 0.04037655517458916, "step": 2065 }, { "batch_size": 4, "epoch": 0.826, "step": 2065, "tokens_per_device": 12380 }, { "epoch": 0.826, "loss_ce": 0.2339712679386139, "loss_lvr": 0.7245717644691467, "loss_mode_switch": 0.0, "loss_total": 0.3064284324645996, "step": 2065 }, { "batch_size": 4, "epoch": 0.826, "step": 2065, "tokens_per_device": 5500 }, { "epoch": 0.826, "loss_ce": 0.4105866849422455, "loss_lvr": 0.9881709218025208, "loss_mode_switch": 0.0, "loss_total": 0.5094037652015686, "step": 2065 }, { "batch_size": 4, "epoch": 0.826, "step": 2065, "tokens_per_device": 5412 }, { "epoch": 0.826, "loss_ce": 0.007513006683439016, "loss_lvr": 0.5499697923660278, "loss_mode_switch": 0.0, "loss_total": 0.06250998377799988, "step": 2065 }, { "batch_size": 4, "epoch": 0.826, "step": 2065, "tokens_per_device": 1208 }, { "epoch": 0.826, "loss_ce": 0.5293590426445007, "loss_lvr": 1.2038931846618652, "loss_mode_switch": 0.0, "loss_total": 0.6497483849525452, "step": 2065 }, { "batch_size": 4, "epoch": 0.826, "step": 2065, "tokens_per_device": 3604 }, { "epoch": 0.826, "loss_ce": 0.3637045919895172, "loss_lvr": 0.829735279083252, "loss_mode_switch": 0.0, "loss_total": 0.44667813181877136, "step": 2065 }, { "epoch": 0.8264, "grad_norm": 1.3902312517166138, "learning_rate": 7.697054697652879e-07, "loss": 0.3014, "step": 2066 }, { "batch_size": 4, "epoch": 0.8264, "step": 2066, "tokens_per_device": 5676 }, { "epoch": 0.8264, "loss_ce": 0.27784502506256104, "loss_lvr": 0.8536979556083679, "loss_mode_switch": 0.0, "loss_total": 0.3632148206233978, "step": 2066 }, { "batch_size": 4, "epoch": 0.8264, "step": 2066, "tokens_per_device": 5340 }, { "epoch": 0.8264, "loss_ce": 0.5728330612182617, "loss_lvr": 0.6312451958656311, "loss_mode_switch": 0.0, "loss_total": 0.6359575986862183, "step": 2066 }, { "batch_size": 4, "epoch": 0.8264, "step": 2066, "tokens_per_device": 4292 }, { "epoch": 0.8264, "loss_ce": 0.3425721824169159, "loss_lvr": 0.9890298247337341, "loss_mode_switch": 0.0, "loss_total": 0.44147515296936035, "step": 2066 }, { "batch_size": 4, "epoch": 0.8264, "step": 2066, "tokens_per_device": 1164 }, { "epoch": 0.8264, "loss_ce": 0.07680919766426086, "loss_lvr": 1.0095559358596802, "loss_mode_switch": 0.0, "loss_total": 0.17776480317115784, "step": 2066 }, { "batch_size": 1, "epoch": 0.8264, "step": 2066, "tokens_per_device": 6062 }, { "epoch": 0.8264, "loss_ce": 0.00016661286645103246, "loss_lvr": 0.28660091757774353, "loss_mode_switch": 0.0, "loss_total": 0.028826706111431122, "step": 2066 }, { "batch_size": 4, "epoch": 0.8264, "step": 2066, "tokens_per_device": 1580 }, { "epoch": 0.8264, "loss_ce": 0.1029679924249649, "loss_lvr": 0.894184410572052, "loss_mode_switch": 0.0, "loss_total": 0.1923864334821701, "step": 2066 }, { "batch_size": 1, "epoch": 0.8264, "step": 2066, "tokens_per_device": 4458 }, { "epoch": 0.8264, "loss_ce": 0.01736905239522457, "loss_lvr": 0.9411548376083374, "loss_mode_switch": 0.0, "loss_total": 0.11148454248905182, "step": 2066 }, { "batch_size": 4, "epoch": 0.8264, "step": 2066, "tokens_per_device": 1208 }, { "epoch": 0.8264, "loss_ce": 0.22477024793624878, "loss_lvr": 1.0156605243682861, "loss_mode_switch": 0.0, "loss_total": 0.3263362944126129, "step": 2066 }, { "epoch": 0.8268, "grad_norm": 1.200581431388855, "learning_rate": 7.662559282676619e-07, "loss": 0.2734, "step": 2067 }, { "batch_size": 1, "epoch": 0.8268, "step": 2067, "tokens_per_device": 4889 }, { "epoch": 0.8268, "loss_ce": 0.015967588871717453, "loss_lvr": 0.7538742423057556, "loss_mode_switch": 0.0, "loss_total": 0.09135501086711884, "step": 2067 }, { "batch_size": 4, "epoch": 0.8268, "step": 2067, "tokens_per_device": 4444 }, { "epoch": 0.8268, "loss_ce": 0.3248266577720642, "loss_lvr": 0.9090593457221985, "loss_mode_switch": 0.0, "loss_total": 0.41573259234428406, "step": 2067 }, { "batch_size": 4, "epoch": 0.8268, "step": 2067, "tokens_per_device": 1588 }, { "epoch": 0.8268, "loss_ce": 0.3974708616733551, "loss_lvr": 0.891015350818634, "loss_mode_switch": 0.0, "loss_total": 0.48657238483428955, "step": 2067 }, { "batch_size": 4, "epoch": 0.8268, "step": 2067, "tokens_per_device": 3932 }, { "epoch": 0.8268, "loss_ce": 0.16187435388565063, "loss_lvr": 0.7840412855148315, "loss_mode_switch": 0.0, "loss_total": 0.2402784824371338, "step": 2067 }, { "batch_size": 1, "epoch": 0.8268, "step": 2067, "tokens_per_device": 7810 }, { "epoch": 0.8268, "loss_ce": 0.009402490220963955, "loss_lvr": 0.40146374702453613, "loss_mode_switch": 0.0, "loss_total": 0.04954886808991432, "step": 2067 }, { "batch_size": 1, "epoch": 0.8268, "step": 2067, "tokens_per_device": 5040 }, { "epoch": 0.8268, "loss_ce": 0.5554453730583191, "loss_lvr": 0.15716728568077087, "loss_mode_switch": 0.0, "loss_total": 0.5711621046066284, "step": 2067 }, { "batch_size": 4, "epoch": 0.8268, "step": 2067, "tokens_per_device": 1624 }, { "epoch": 0.8268, "loss_ce": 0.6851418018341064, "loss_lvr": 0.8620918393135071, "loss_mode_switch": 0.0, "loss_total": 0.7713509798049927, "step": 2067 }, { "batch_size": 1, "epoch": 0.8268, "step": 2067, "tokens_per_device": 4958 }, { "epoch": 0.8268, "loss_ce": 0.8171738386154175, "loss_lvr": 0.5375511050224304, "loss_mode_switch": 0.0, "loss_total": 0.870928943157196, "step": 2067 }, { "epoch": 0.8272, "grad_norm": 1.3119945526123047, "learning_rate": 7.628134923707642e-07, "loss": 0.2636, "step": 2068 }, { "batch_size": 1, "epoch": 0.8272, "step": 2068, "tokens_per_device": 5788 }, { "epoch": 0.8272, "loss_ce": 0.0180384311825037, "loss_lvr": 0.46708184480667114, "loss_mode_switch": 0.0, "loss_total": 0.06474661827087402, "step": 2068 }, { "batch_size": 1, "epoch": 0.8272, "step": 2068, "tokens_per_device": 4879 }, { "epoch": 0.8272, "loss_ce": 0.00037790724309161305, "loss_lvr": 0.3406350910663605, "loss_mode_switch": 0.0, "loss_total": 0.03444141894578934, "step": 2068 }, { "batch_size": 1, "epoch": 0.8272, "step": 2068, "tokens_per_device": 5139 }, { "epoch": 0.8272, "loss_ce": 0.38496285676956177, "loss_lvr": 0.23524975776672363, "loss_mode_switch": 0.0, "loss_total": 0.40848782658576965, "step": 2068 }, { "batch_size": 4, "epoch": 0.8272, "step": 2068, "tokens_per_device": 4184 }, { "epoch": 0.8272, "loss_ce": 0.19389738142490387, "loss_lvr": 0.8213416337966919, "loss_mode_switch": 0.0, "loss_total": 0.2760315537452698, "step": 2068 }, { "batch_size": 1, "epoch": 0.8272, "step": 2068, "tokens_per_device": 5016 }, { "epoch": 0.8272, "loss_ce": 0.019549939781427383, "loss_lvr": 0.26445192098617554, "loss_mode_switch": 0.0, "loss_total": 0.04599513113498688, "step": 2068 }, { "batch_size": 4, "epoch": 0.8272, "step": 2068, "tokens_per_device": 2688 }, { "epoch": 0.8272, "loss_ce": 0.04397359490394592, "loss_lvr": 0.6714943647384644, "loss_mode_switch": 0.0, "loss_total": 0.11112303286790848, "step": 2068 }, { "batch_size": 4, "epoch": 0.8272, "step": 2068, "tokens_per_device": 4632 }, { "epoch": 0.8272, "loss_ce": 0.09423157572746277, "loss_lvr": 1.101926565170288, "loss_mode_switch": 0.0, "loss_total": 0.20442423224449158, "step": 2068 }, { "batch_size": 4, "epoch": 0.8272, "step": 2068, "tokens_per_device": 3860 }, { "epoch": 0.8272, "loss_ce": 0.16819696128368378, "loss_lvr": 1.010841727256775, "loss_mode_switch": 0.0, "loss_total": 0.26928114891052246, "step": 2068 }, { "epoch": 0.8276, "grad_norm": 1.2273929119110107, "learning_rate": 7.593781678521212e-07, "loss": 0.2744, "step": 2069 }, { "batch_size": 4, "epoch": 0.8276, "step": 2069, "tokens_per_device": 5148 }, { "epoch": 0.8276, "loss_ce": 0.03547338768839836, "loss_lvr": 0.7325422167778015, "loss_mode_switch": 0.0, "loss_total": 0.1087276041507721, "step": 2069 }, { "batch_size": 1, "epoch": 0.8276, "step": 2069, "tokens_per_device": 5142 }, { "epoch": 0.8276, "loss_ce": 0.2756766974925995, "loss_lvr": 0.6236770153045654, "loss_mode_switch": 0.0, "loss_total": 0.3380444049835205, "step": 2069 }, { "batch_size": 4, "epoch": 0.8276, "step": 2069, "tokens_per_device": 1520 }, { "epoch": 0.8276, "loss_ce": 0.5262179970741272, "loss_lvr": 0.9341844320297241, "loss_mode_switch": 0.0, "loss_total": 0.6196364164352417, "step": 2069 }, { "batch_size": 1, "epoch": 0.8276, "step": 2069, "tokens_per_device": 5128 }, { "epoch": 0.8276, "loss_ce": 0.20450875163078308, "loss_lvr": 0.30334892868995667, "loss_mode_switch": 0.0, "loss_total": 0.2348436415195465, "step": 2069 }, { "batch_size": 4, "epoch": 0.8276, "step": 2069, "tokens_per_device": 1776 }, { "epoch": 0.8276, "loss_ce": 0.24106526374816895, "loss_lvr": 1.7015149593353271, "loss_mode_switch": 0.0, "loss_total": 0.41121676564216614, "step": 2069 }, { "batch_size": 4, "epoch": 0.8276, "step": 2069, "tokens_per_device": 3816 }, { "epoch": 0.8276, "loss_ce": 0.3422403633594513, "loss_lvr": 0.8947372436523438, "loss_mode_switch": 0.0, "loss_total": 0.43171408772468567, "step": 2069 }, { "batch_size": 1, "epoch": 0.8276, "step": 2069, "tokens_per_device": 5117 }, { "epoch": 0.8276, "loss_ce": 0.006530745420604944, "loss_lvr": 0.4208267629146576, "loss_mode_switch": 0.0, "loss_total": 0.048613425344228745, "step": 2069 }, { "batch_size": 1, "epoch": 0.8276, "step": 2069, "tokens_per_device": 5028 }, { "epoch": 0.8276, "loss_ce": 0.007311671040952206, "loss_lvr": 0.24958300590515137, "loss_mode_switch": 0.0, "loss_total": 0.03226997330784798, "step": 2069 }, { "epoch": 0.828, "grad_norm": 1.2610098123550415, "learning_rate": 7.55949960477328e-07, "loss": 0.267, "step": 2070 }, { "batch_size": 4, "epoch": 0.828, "step": 2070, "tokens_per_device": 3840 }, { "epoch": 0.828, "loss_ce": 0.09844828397035599, "loss_lvr": 0.9219898581504822, "loss_mode_switch": 0.0, "loss_total": 0.19064727425575256, "step": 2070 }, { "batch_size": 4, "epoch": 0.828, "step": 2070, "tokens_per_device": 5884 }, { "epoch": 0.828, "loss_ce": 0.20863547921180725, "loss_lvr": 0.60148024559021, "loss_mode_switch": 0.0, "loss_total": 0.2687835097312927, "step": 2070 }, { "batch_size": 4, "epoch": 0.828, "step": 2070, "tokens_per_device": 5720 }, { "epoch": 0.828, "loss_ce": 0.024741195142269135, "loss_lvr": 0.9014053344726562, "loss_mode_switch": 0.0, "loss_total": 0.114881731569767, "step": 2070 }, { "batch_size": 4, "epoch": 0.828, "step": 2070, "tokens_per_device": 15584 }, { "epoch": 0.828, "loss_ce": 0.09850168228149414, "loss_lvr": 0.662304162979126, "loss_mode_switch": 0.0, "loss_total": 0.16473209857940674, "step": 2070 }, { "batch_size": 4, "epoch": 0.828, "step": 2070, "tokens_per_device": 1316 }, { "epoch": 0.828, "loss_ce": 0.7362799644470215, "loss_lvr": 1.032383918762207, "loss_mode_switch": 0.0, "loss_total": 0.8395183682441711, "step": 2070 }, { "batch_size": 4, "epoch": 0.828, "step": 2070, "tokens_per_device": 5288 }, { "epoch": 0.828, "loss_ce": 0.2338283210992813, "loss_lvr": 0.6468538641929626, "loss_mode_switch": 0.0, "loss_total": 0.2985137104988098, "step": 2070 }, { "batch_size": 4, "epoch": 0.828, "step": 2070, "tokens_per_device": 4300 }, { "epoch": 0.828, "loss_ce": 0.15944093465805054, "loss_lvr": 0.8525465726852417, "loss_mode_switch": 0.0, "loss_total": 0.24469560384750366, "step": 2070 }, { "batch_size": 4, "epoch": 0.828, "step": 2070, "tokens_per_device": 4368 }, { "epoch": 0.828, "loss_ce": 0.6011402010917664, "loss_lvr": 0.926993727684021, "loss_mode_switch": 0.0, "loss_total": 0.6938395500183105, "step": 2070 }, { "epoch": 0.8284, "grad_norm": 1.4475640058517456, "learning_rate": 7.525288760000304e-07, "loss": 0.3292, "step": 2071 }, { "batch_size": 1, "epoch": 0.8284, "step": 2071, "tokens_per_device": 4954 }, { "epoch": 0.8284, "loss_ce": 0.0005124112358316779, "loss_lvr": 0.3918272852897644, "loss_mode_switch": 0.0, "loss_total": 0.0396951399743557, "step": 2071 }, { "batch_size": 1, "epoch": 0.8284, "step": 2071, "tokens_per_device": 4897 }, { "epoch": 0.8284, "loss_ce": 0.003105567768216133, "loss_lvr": 0.22630390524864197, "loss_mode_switch": 0.0, "loss_total": 0.02573595941066742, "step": 2071 }, { "batch_size": 4, "epoch": 0.8284, "step": 2071, "tokens_per_device": 5000 }, { "epoch": 0.8284, "loss_ce": 0.19568470120429993, "loss_lvr": 0.9780490398406982, "loss_mode_switch": 0.0, "loss_total": 0.29348960518836975, "step": 2071 }, { "batch_size": 4, "epoch": 0.8284, "step": 2071, "tokens_per_device": 3812 }, { "epoch": 0.8284, "loss_ce": 0.497170090675354, "loss_lvr": 0.9587379693984985, "loss_mode_switch": 0.0, "loss_total": 0.593043863773346, "step": 2071 }, { "batch_size": 4, "epoch": 0.8284, "step": 2071, "tokens_per_device": 6292 }, { "epoch": 0.8284, "loss_ce": 0.05185692012310028, "loss_lvr": 0.7006200551986694, "loss_mode_switch": 0.0, "loss_total": 0.1219189241528511, "step": 2071 }, { "batch_size": 4, "epoch": 0.8284, "step": 2071, "tokens_per_device": 2620 }, { "epoch": 0.8284, "loss_ce": 0.19646483659744263, "loss_lvr": 0.7750842571258545, "loss_mode_switch": 0.0, "loss_total": 0.2739732563495636, "step": 2071 }, { "batch_size": 4, "epoch": 0.8284, "step": 2071, "tokens_per_device": 1812 }, { "epoch": 0.8284, "loss_ce": 0.3990175127983093, "loss_lvr": 0.8507158756256104, "loss_mode_switch": 0.0, "loss_total": 0.48408910632133484, "step": 2071 }, { "batch_size": 1, "epoch": 0.8284, "step": 2071, "tokens_per_device": 4950 }, { "epoch": 0.8284, "loss_ce": 0.008436218835413456, "loss_lvr": 0.38828104734420776, "loss_mode_switch": 0.0, "loss_total": 0.04726432263851166, "step": 2071 }, { "epoch": 0.8288, "grad_norm": 1.1933872699737549, "learning_rate": 7.491149201619236e-07, "loss": 0.273, "step": 2072 }, { "batch_size": 4, "epoch": 0.8288, "step": 2072, "tokens_per_device": 5188 }, { "epoch": 0.8288, "loss_ce": 0.002607666188850999, "loss_lvr": 0.6740309596061707, "loss_mode_switch": 0.0, "loss_total": 0.07001076638698578, "step": 2072 }, { "batch_size": 1, "epoch": 0.8288, "step": 2072, "tokens_per_device": 5022 }, { "epoch": 0.8288, "loss_ce": 0.0011542976135388017, "loss_lvr": 0.4459185004234314, "loss_mode_switch": 0.0, "loss_total": 0.04574614763259888, "step": 2072 }, { "batch_size": 4, "epoch": 0.8288, "step": 2072, "tokens_per_device": 1384 }, { "epoch": 0.8288, "loss_ce": 0.299454927444458, "loss_lvr": 0.9122554659843445, "loss_mode_switch": 0.0, "loss_total": 0.3906804919242859, "step": 2072 }, { "batch_size": 1, "epoch": 0.8288, "step": 2072, "tokens_per_device": 4877 }, { "epoch": 0.8288, "loss_ce": 0.0036300555802881718, "loss_lvr": 0.12801207602024078, "loss_mode_switch": 0.0, "loss_total": 0.016431262716650963, "step": 2072 }, { "batch_size": 1, "epoch": 0.8288, "step": 2072, "tokens_per_device": 4753 }, { "epoch": 0.8288, "loss_ce": 0.001626053941436112, "loss_lvr": 0.3142387568950653, "loss_mode_switch": 0.0, "loss_total": 0.03304993361234665, "step": 2072 }, { "batch_size": 4, "epoch": 0.8288, "step": 2072, "tokens_per_device": 2576 }, { "epoch": 0.8288, "loss_ce": 0.15746384859085083, "loss_lvr": 0.9276843070983887, "loss_mode_switch": 0.0, "loss_total": 0.2502322793006897, "step": 2072 }, { "batch_size": 4, "epoch": 0.8288, "step": 2072, "tokens_per_device": 5996 }, { "epoch": 0.8288, "loss_ce": 0.35258978605270386, "loss_lvr": 0.7082230448722839, "loss_mode_switch": 0.0, "loss_total": 0.4234120845794678, "step": 2072 }, { "batch_size": 4, "epoch": 0.8288, "step": 2072, "tokens_per_device": 4120 }, { "epoch": 0.8288, "loss_ce": 0.40747347474098206, "loss_lvr": 0.7856968641281128, "loss_mode_switch": 0.0, "loss_total": 0.48604315519332886, "step": 2072 }, { "epoch": 0.8292, "grad_norm": 1.5936970710754395, "learning_rate": 7.457080986927357e-07, "loss": 0.3268, "step": 2073 }, { "batch_size": 4, "epoch": 0.8292, "step": 2073, "tokens_per_device": 2644 }, { "epoch": 0.8292, "loss_ce": 0.41366899013519287, "loss_lvr": 0.6849242448806763, "loss_mode_switch": 0.0, "loss_total": 0.48216140270233154, "step": 2073 }, { "batch_size": 4, "epoch": 0.8292, "step": 2073, "tokens_per_device": 11928 }, { "epoch": 0.8292, "loss_ce": 0.0031318964902311563, "loss_lvr": 0.4628691077232361, "loss_mode_switch": 0.0, "loss_total": 0.04941880702972412, "step": 2073 }, { "batch_size": 4, "epoch": 0.8292, "step": 2073, "tokens_per_device": 3852 }, { "epoch": 0.8292, "loss_ce": 0.383138507604599, "loss_lvr": 0.879249095916748, "loss_mode_switch": 0.0, "loss_total": 0.47106343507766724, "step": 2073 }, { "batch_size": 1, "epoch": 0.8292, "step": 2073, "tokens_per_device": 5094 }, { "epoch": 0.8292, "loss_ce": 0.06589914113283157, "loss_lvr": 0.23812235891819, "loss_mode_switch": 0.0, "loss_total": 0.08971137553453445, "step": 2073 }, { "batch_size": 4, "epoch": 0.8292, "step": 2073, "tokens_per_device": 4640 }, { "epoch": 0.8292, "loss_ce": 0.3511653542518616, "loss_lvr": 0.7914783358573914, "loss_mode_switch": 0.0, "loss_total": 0.43031319975852966, "step": 2073 }, { "batch_size": 1, "epoch": 0.8292, "step": 2073, "tokens_per_device": 5172 }, { "epoch": 0.8292, "loss_ce": 0.047543901950120926, "loss_lvr": 0.187369242310524, "loss_mode_switch": 0.0, "loss_total": 0.06628082692623138, "step": 2073 }, { "batch_size": 4, "epoch": 0.8292, "step": 2073, "tokens_per_device": 5864 }, { "epoch": 0.8292, "loss_ce": 0.07166114449501038, "loss_lvr": 0.6976787447929382, "loss_mode_switch": 0.0, "loss_total": 0.14142902195453644, "step": 2073 }, { "batch_size": 1, "epoch": 0.8292, "step": 2073, "tokens_per_device": 4904 }, { "epoch": 0.8292, "loss_ce": 0.02890191785991192, "loss_lvr": 0.23271602392196655, "loss_mode_switch": 0.0, "loss_total": 0.05217352136969566, "step": 2073 }, { "epoch": 0.8296, "grad_norm": 1.2633824348449707, "learning_rate": 7.423084173102213e-07, "loss": 0.2675, "step": 2074 }, { "batch_size": 1, "epoch": 0.8296, "step": 2074, "tokens_per_device": 5082 }, { "epoch": 0.8296, "loss_ce": 0.0022802294697612524, "loss_lvr": 0.3054291307926178, "loss_mode_switch": 0.0, "loss_total": 0.032823141664266586, "step": 2074 }, { "batch_size": 1, "epoch": 0.8296, "step": 2074, "tokens_per_device": 4862 }, { "epoch": 0.8296, "loss_ce": 0.0026532206684350967, "loss_lvr": 0.37955135107040405, "loss_mode_switch": 0.0, "loss_total": 0.04060835391283035, "step": 2074 }, { "batch_size": 4, "epoch": 0.8296, "step": 2074, "tokens_per_device": 5912 }, { "epoch": 0.8296, "loss_ce": 0.008916917257010937, "loss_lvr": 0.7004244923591614, "loss_mode_switch": 0.0, "loss_total": 0.07895936816930771, "step": 2074 }, { "batch_size": 4, "epoch": 0.8296, "step": 2074, "tokens_per_device": 4264 }, { "epoch": 0.8296, "loss_ce": 0.24804051220417023, "loss_lvr": 1.0648611783981323, "loss_mode_switch": 0.0, "loss_total": 0.3545266389846802, "step": 2074 }, { "batch_size": 1, "epoch": 0.8296, "step": 2074, "tokens_per_device": 4935 }, { "epoch": 0.8296, "loss_ce": 0.06862425059080124, "loss_lvr": 0.4245304465293884, "loss_mode_switch": 0.0, "loss_total": 0.11107729375362396, "step": 2074 }, { "batch_size": 1, "epoch": 0.8296, "step": 2074, "tokens_per_device": 4870 }, { "epoch": 0.8296, "loss_ce": 0.3602677881717682, "loss_lvr": 0.4664786159992218, "loss_mode_switch": 0.0, "loss_total": 0.40691566467285156, "step": 2074 }, { "batch_size": 4, "epoch": 0.8296, "step": 2074, "tokens_per_device": 8764 }, { "epoch": 0.8296, "loss_ce": 0.03875172510743141, "loss_lvr": 0.9474462866783142, "loss_mode_switch": 0.0, "loss_total": 0.13349635899066925, "step": 2074 }, { "batch_size": 4, "epoch": 0.8296, "step": 2074, "tokens_per_device": 4196 }, { "epoch": 0.8296, "loss_ce": 0.13164816796779633, "loss_lvr": 0.8899794816970825, "loss_mode_switch": 0.0, "loss_total": 0.22064611315727234, "step": 2074 }, { "epoch": 0.83, "grad_norm": 1.6995055675506592, "learning_rate": 7.389158817201541e-07, "loss": 0.3379, "step": 2075 }, { "batch_size": 4, "epoch": 0.83, "step": 2075, "tokens_per_device": 4628 }, { "epoch": 0.83, "loss_ce": 0.5199505090713501, "loss_lvr": 0.6875041127204895, "loss_mode_switch": 0.0, "loss_total": 0.5887008905410767, "step": 2075 }, { "batch_size": 4, "epoch": 0.83, "step": 2075, "tokens_per_device": 3000 }, { "epoch": 0.83, "loss_ce": 0.6775543093681335, "loss_lvr": 0.7581924796104431, "loss_mode_switch": 0.0, "loss_total": 0.7533735632896423, "step": 2075 }, { "batch_size": 4, "epoch": 0.83, "step": 2075, "tokens_per_device": 1532 }, { "epoch": 0.83, "loss_ce": 0.6063829660415649, "loss_lvr": 0.961564838886261, "loss_mode_switch": 0.0, "loss_total": 0.7025394439697266, "step": 2075 }, { "batch_size": 4, "epoch": 0.83, "step": 2075, "tokens_per_device": 2760 }, { "epoch": 0.83, "loss_ce": 0.09051469713449478, "loss_lvr": 0.7118741869926453, "loss_mode_switch": 0.0, "loss_total": 0.16170212626457214, "step": 2075 }, { "batch_size": 1, "epoch": 0.83, "step": 2075, "tokens_per_device": 5259 }, { "epoch": 0.83, "loss_ce": 0.08186108618974686, "loss_lvr": 0.4078427255153656, "loss_mode_switch": 0.0, "loss_total": 0.12264536321163177, "step": 2075 }, { "batch_size": 4, "epoch": 0.83, "step": 2075, "tokens_per_device": 9208 }, { "epoch": 0.83, "loss_ce": 0.3423655927181244, "loss_lvr": 1.0925135612487793, "loss_mode_switch": 0.0, "loss_total": 0.45161694288253784, "step": 2075 }, { "batch_size": 4, "epoch": 0.83, "step": 2075, "tokens_per_device": 4404 }, { "epoch": 0.83, "loss_ce": 0.18389831483364105, "loss_lvr": 0.8552011847496033, "loss_mode_switch": 0.0, "loss_total": 0.2694184184074402, "step": 2075 }, { "batch_size": 4, "epoch": 0.83, "step": 2075, "tokens_per_device": 6176 }, { "epoch": 0.83, "loss_ce": 0.09759300202131271, "loss_lvr": 0.7137324810028076, "loss_mode_switch": 0.0, "loss_total": 0.16896624863147736, "step": 2075 }, { "epoch": 0.8304, "grad_norm": 1.317393183708191, "learning_rate": 7.355304976163119e-07, "loss": 0.3003, "step": 2076 }, { "batch_size": 1, "epoch": 0.8304, "step": 2076, "tokens_per_device": 5099 }, { "epoch": 0.8304, "loss_ce": 0.04950977861881256, "loss_lvr": 0.31545278429985046, "loss_mode_switch": 0.0, "loss_total": 0.08105506002902985, "step": 2076 }, { "batch_size": 1, "epoch": 0.8304, "step": 2076, "tokens_per_device": 4875 }, { "epoch": 0.8304, "loss_ce": 0.007285856641829014, "loss_lvr": 1.3615227937698364, "loss_mode_switch": 0.0, "loss_total": 0.14343814551830292, "step": 2076 }, { "batch_size": 1, "epoch": 0.8304, "step": 2076, "tokens_per_device": 5024 }, { "epoch": 0.8304, "loss_ce": 0.010015223175287247, "loss_lvr": 0.3493063747882843, "loss_mode_switch": 0.0, "loss_total": 0.044945862144231796, "step": 2076 }, { "batch_size": 4, "epoch": 0.8304, "step": 2076, "tokens_per_device": 4424 }, { "epoch": 0.8304, "loss_ce": 0.08359654992818832, "loss_lvr": 0.584248423576355, "loss_mode_switch": 0.0, "loss_total": 0.14202138781547546, "step": 2076 }, { "batch_size": 1, "epoch": 0.8304, "step": 2076, "tokens_per_device": 5387 }, { "epoch": 0.8304, "loss_ce": 0.7391160130500793, "loss_lvr": 0.2878468632698059, "loss_mode_switch": 0.0, "loss_total": 0.7679007053375244, "step": 2076 }, { "batch_size": 4, "epoch": 0.8304, "step": 2076, "tokens_per_device": 4788 }, { "epoch": 0.8304, "loss_ce": 0.0700196847319603, "loss_lvr": 0.7241715788841248, "loss_mode_switch": 0.0, "loss_total": 0.14243684709072113, "step": 2076 }, { "batch_size": 4, "epoch": 0.8304, "step": 2076, "tokens_per_device": 3948 }, { "epoch": 0.8304, "loss_ce": 0.028313251212239265, "loss_lvr": 0.7340461611747742, "loss_mode_switch": 0.0, "loss_total": 0.10171786695718765, "step": 2076 }, { "batch_size": 1, "epoch": 0.8304, "step": 2076, "tokens_per_device": 4934 }, { "epoch": 0.8304, "loss_ce": 0.7418351173400879, "loss_lvr": 0.6402419209480286, "loss_mode_switch": 0.0, "loss_total": 0.8058593273162842, "step": 2076 }, { "epoch": 0.8308, "grad_norm": 1.3828067779541016, "learning_rate": 7.32152270680473e-07, "loss": 0.2896, "step": 2077 }, { "batch_size": 4, "epoch": 0.8308, "step": 2077, "tokens_per_device": 4276 }, { "epoch": 0.8308, "loss_ce": 0.1907966285943985, "loss_lvr": 0.9625576734542847, "loss_mode_switch": 0.0, "loss_total": 0.2870523929595947, "step": 2077 }, { "batch_size": 1, "epoch": 0.8308, "step": 2077, "tokens_per_device": 4972 }, { "epoch": 0.8308, "loss_ce": 0.01495914626866579, "loss_lvr": 0.6950366497039795, "loss_mode_switch": 0.0, "loss_total": 0.08446281403303146, "step": 2077 }, { "batch_size": 4, "epoch": 0.8308, "step": 2077, "tokens_per_device": 3792 }, { "epoch": 0.8308, "loss_ce": 0.12936373054981232, "loss_lvr": 0.7785765528678894, "loss_mode_switch": 0.0, "loss_total": 0.2072213888168335, "step": 2077 }, { "batch_size": 4, "epoch": 0.8308, "step": 2077, "tokens_per_device": 4276 }, { "epoch": 0.8308, "loss_ce": 0.05890097841620445, "loss_lvr": 0.8188621401786804, "loss_mode_switch": 0.0, "loss_total": 0.14078719913959503, "step": 2077 }, { "batch_size": 4, "epoch": 0.8308, "step": 2077, "tokens_per_device": 4208 }, { "epoch": 0.8308, "loss_ce": 0.32600656151771545, "loss_lvr": 0.8189327120780945, "loss_mode_switch": 0.0, "loss_total": 0.4078998267650604, "step": 2077 }, { "batch_size": 1, "epoch": 0.8308, "step": 2077, "tokens_per_device": 4937 }, { "epoch": 0.8308, "loss_ce": 0.0008369316929019988, "loss_lvr": 0.6646770238876343, "loss_mode_switch": 0.0, "loss_total": 0.06730463355779648, "step": 2077 }, { "batch_size": 4, "epoch": 0.8308, "step": 2077, "tokens_per_device": 3936 }, { "epoch": 0.8308, "loss_ce": 0.11650589108467102, "loss_lvr": 0.6131088733673096, "loss_mode_switch": 0.0, "loss_total": 0.17781677842140198, "step": 2077 }, { "batch_size": 4, "epoch": 0.8308, "step": 2077, "tokens_per_device": 4480 }, { "epoch": 0.8308, "loss_ce": 0.4005427062511444, "loss_lvr": 0.8121761083602905, "loss_mode_switch": 0.0, "loss_total": 0.48176032304763794, "step": 2077 }, { "epoch": 0.8312, "grad_norm": 1.1319541931152344, "learning_rate": 7.287812065823996e-07, "loss": 0.2456, "step": 2078 }, { "batch_size": 4, "epoch": 0.8312, "step": 2078, "tokens_per_device": 5796 }, { "epoch": 0.8312, "loss_ce": 0.25080132484436035, "loss_lvr": 1.134018898010254, "loss_mode_switch": 0.0, "loss_total": 0.36420321464538574, "step": 2078 }, { "batch_size": 1, "epoch": 0.8312, "step": 2078, "tokens_per_device": 5040 }, { "epoch": 0.8312, "loss_ce": 0.09190041571855545, "loss_lvr": 0.3204270303249359, "loss_mode_switch": 0.0, "loss_total": 0.12394312024116516, "step": 2078 }, { "batch_size": 4, "epoch": 0.8312, "step": 2078, "tokens_per_device": 3292 }, { "epoch": 0.8312, "loss_ce": 0.6771292686462402, "loss_lvr": 0.9633685350418091, "loss_mode_switch": 0.0, "loss_total": 0.7734661102294922, "step": 2078 }, { "batch_size": 1, "epoch": 0.8312, "step": 2078, "tokens_per_device": 4589 }, { "epoch": 0.8312, "loss_ce": 0.001928380923345685, "loss_lvr": 0.8666993975639343, "loss_mode_switch": 0.0, "loss_total": 0.0885983258485794, "step": 2078 }, { "batch_size": 4, "epoch": 0.8312, "step": 2078, "tokens_per_device": 6052 }, { "epoch": 0.8312, "loss_ce": 0.010179524309933186, "loss_lvr": 0.5295458436012268, "loss_mode_switch": 0.0, "loss_total": 0.06313411146402359, "step": 2078 }, { "batch_size": 4, "epoch": 0.8312, "step": 2078, "tokens_per_device": 4572 }, { "epoch": 0.8312, "loss_ce": 0.31995853781700134, "loss_lvr": 0.701094925403595, "loss_mode_switch": 0.0, "loss_total": 0.39006802439689636, "step": 2078 }, { "batch_size": 4, "epoch": 0.8312, "step": 2078, "tokens_per_device": 3908 }, { "epoch": 0.8312, "loss_ce": 0.49253007769584656, "loss_lvr": 0.7123098373413086, "loss_mode_switch": 0.0, "loss_total": 0.5637610554695129, "step": 2078 }, { "batch_size": 4, "epoch": 0.8312, "step": 2078, "tokens_per_device": 7720 }, { "epoch": 0.8312, "loss_ce": 0.05366891622543335, "loss_lvr": 0.8267650008201599, "loss_mode_switch": 0.0, "loss_total": 0.13634541630744934, "step": 2078 }, { "epoch": 0.8316, "grad_norm": 1.3421194553375244, "learning_rate": 7.254173109798363e-07, "loss": 0.2748, "step": 2079 }, { "batch_size": 1, "epoch": 0.8316, "step": 2079, "tokens_per_device": 5119 }, { "epoch": 0.8316, "loss_ce": 0.16753779351711273, "loss_lvr": 0.5063303709030151, "loss_mode_switch": 0.0, "loss_total": 0.21817083656787872, "step": 2079 }, { "batch_size": 4, "epoch": 0.8316, "step": 2079, "tokens_per_device": 3772 }, { "epoch": 0.8316, "loss_ce": 0.5766249299049377, "loss_lvr": 0.9629976153373718, "loss_mode_switch": 0.0, "loss_total": 0.6729246973991394, "step": 2079 }, { "batch_size": 1, "epoch": 0.8316, "step": 2079, "tokens_per_device": 5165 }, { "epoch": 0.8316, "loss_ce": 0.009704294614493847, "loss_lvr": 0.47682884335517883, "loss_mode_switch": 0.0, "loss_total": 0.057387180626392365, "step": 2079 }, { "batch_size": 1, "epoch": 0.8316, "step": 2079, "tokens_per_device": 5141 }, { "epoch": 0.8316, "loss_ce": 0.03023197315633297, "loss_lvr": 0.6103600859642029, "loss_mode_switch": 0.0, "loss_total": 0.09126798063516617, "step": 2079 }, { "batch_size": 4, "epoch": 0.8316, "step": 2079, "tokens_per_device": 4564 }, { "epoch": 0.8316, "loss_ce": 0.4939018785953522, "loss_lvr": 1.1652131080627441, "loss_mode_switch": 0.0, "loss_total": 0.61042320728302, "step": 2079 }, { "batch_size": 4, "epoch": 0.8316, "step": 2079, "tokens_per_device": 3076 }, { "epoch": 0.8316, "loss_ce": 0.10506375133991241, "loss_lvr": 0.7576478719711304, "loss_mode_switch": 0.0, "loss_total": 0.1808285415172577, "step": 2079 }, { "batch_size": 1, "epoch": 0.8316, "step": 2079, "tokens_per_device": 5184 }, { "epoch": 0.8316, "loss_ce": 0.018794860690832138, "loss_lvr": 0.4003852605819702, "loss_mode_switch": 0.0, "loss_total": 0.05883338674902916, "step": 2079 }, { "batch_size": 4, "epoch": 0.8316, "step": 2079, "tokens_per_device": 1688 }, { "epoch": 0.8316, "loss_ce": 0.7041007876396179, "loss_lvr": 0.9342117309570312, "loss_mode_switch": 0.0, "loss_total": 0.7975219488143921, "step": 2079 }, { "epoch": 0.832, "grad_norm": 1.2617566585540771, "learning_rate": 7.220605895184946e-07, "loss": 0.3322, "step": 2080 }, { "batch_size": 4, "epoch": 0.832, "step": 2080, "tokens_per_device": 4284 }, { "epoch": 0.832, "loss_ce": 0.06702689826488495, "loss_lvr": 0.9647805094718933, "loss_mode_switch": 0.0, "loss_total": 0.163504958152771, "step": 2080 }, { "batch_size": 1, "epoch": 0.832, "step": 2080, "tokens_per_device": 5013 }, { "epoch": 0.832, "loss_ce": 0.14856140315532684, "loss_lvr": 0.850758969783783, "loss_mode_switch": 0.0, "loss_total": 0.23363730311393738, "step": 2080 }, { "batch_size": 4, "epoch": 0.832, "step": 2080, "tokens_per_device": 4288 }, { "epoch": 0.832, "loss_ce": 0.3340464234352112, "loss_lvr": 0.8602447509765625, "loss_mode_switch": 0.0, "loss_total": 0.4200708866119385, "step": 2080 }, { "batch_size": 4, "epoch": 0.832, "step": 2080, "tokens_per_device": 1440 }, { "epoch": 0.832, "loss_ce": 0.35895803570747375, "loss_lvr": 0.8635835647583008, "loss_mode_switch": 0.0, "loss_total": 0.4453164041042328, "step": 2080 }, { "batch_size": 4, "epoch": 0.832, "step": 2080, "tokens_per_device": 6220 }, { "epoch": 0.832, "loss_ce": 0.13794264197349548, "loss_lvr": 0.5277476906776428, "loss_mode_switch": 0.0, "loss_total": 0.190717414021492, "step": 2080 }, { "batch_size": 1, "epoch": 0.832, "step": 2080, "tokens_per_device": 4916 }, { "epoch": 0.832, "loss_ce": 0.29422876238822937, "loss_lvr": 0.41470810770988464, "loss_mode_switch": 0.0, "loss_total": 0.33569955825805664, "step": 2080 }, { "batch_size": 4, "epoch": 0.832, "step": 2080, "tokens_per_device": 4308 }, { "epoch": 0.832, "loss_ce": 0.21482408046722412, "loss_lvr": 0.8378480672836304, "loss_mode_switch": 0.0, "loss_total": 0.2986088991165161, "step": 2080 }, { "batch_size": 1, "epoch": 0.832, "step": 2080, "tokens_per_device": 4887 }, { "epoch": 0.832, "loss_ce": 0.019553184509277344, "loss_lvr": 0.6540531516075134, "loss_mode_switch": 0.0, "loss_total": 0.0849585011601448, "step": 2080 }, { "epoch": 0.8324, "grad_norm": 1.2887095212936401, "learning_rate": 7.187110478320447e-07, "loss": 0.2894, "step": 2081 }, { "batch_size": 1, "epoch": 0.8324, "step": 2081, "tokens_per_device": 5933 }, { "epoch": 0.8324, "loss_ce": 0.048511847853660583, "loss_lvr": 0.49394574761390686, "loss_mode_switch": 0.0, "loss_total": 0.09790642559528351, "step": 2081 }, { "batch_size": 4, "epoch": 0.8324, "step": 2081, "tokens_per_device": 1580 }, { "epoch": 0.8324, "loss_ce": 0.14402979612350464, "loss_lvr": 0.9497784972190857, "loss_mode_switch": 0.0, "loss_total": 0.23900765180587769, "step": 2081 }, { "batch_size": 4, "epoch": 0.8324, "step": 2081, "tokens_per_device": 5748 }, { "epoch": 0.8324, "loss_ce": 0.05326663702726364, "loss_lvr": 0.692172646522522, "loss_mode_switch": 0.0, "loss_total": 0.12248390167951584, "step": 2081 }, { "batch_size": 1, "epoch": 0.8324, "step": 2081, "tokens_per_device": 5201 }, { "epoch": 0.8324, "loss_ce": 0.0003318748204037547, "loss_lvr": 0.33816105127334595, "loss_mode_switch": 0.0, "loss_total": 0.03414798155426979, "step": 2081 }, { "batch_size": 4, "epoch": 0.8324, "step": 2081, "tokens_per_device": 4464 }, { "epoch": 0.8324, "loss_ce": 0.42215996980667114, "loss_lvr": 0.8174417614936829, "loss_mode_switch": 0.0, "loss_total": 0.5039041638374329, "step": 2081 }, { "batch_size": 4, "epoch": 0.8324, "step": 2081, "tokens_per_device": 4772 }, { "epoch": 0.8324, "loss_ce": 0.5127679109573364, "loss_lvr": 0.87310791015625, "loss_mode_switch": 0.0, "loss_total": 0.6000787019729614, "step": 2081 }, { "batch_size": 4, "epoch": 0.8324, "step": 2081, "tokens_per_device": 1384 }, { "epoch": 0.8324, "loss_ce": 0.23910722136497498, "loss_lvr": 0.98480224609375, "loss_mode_switch": 0.0, "loss_total": 0.33758744597435, "step": 2081 }, { "batch_size": 4, "epoch": 0.8324, "step": 2081, "tokens_per_device": 4188 }, { "epoch": 0.8324, "loss_ce": 0.0007219668477773666, "loss_lvr": 1.1096553802490234, "loss_mode_switch": 0.0, "loss_total": 0.11168751120567322, "step": 2081 }, { "epoch": 0.8328, "grad_norm": 1.387848138809204, "learning_rate": 7.153686915421087e-07, "loss": 0.2721, "step": 2082 }, { "batch_size": 4, "epoch": 0.8328, "step": 2082, "tokens_per_device": 4396 }, { "epoch": 0.8328, "loss_ce": 0.031461216509342194, "loss_lvr": 0.8159797191619873, "loss_mode_switch": 0.0, "loss_total": 0.11305919289588928, "step": 2082 }, { "batch_size": 4, "epoch": 0.8328, "step": 2082, "tokens_per_device": 1796 }, { "epoch": 0.8328, "loss_ce": 0.0844106525182724, "loss_lvr": 0.8743793368339539, "loss_mode_switch": 0.0, "loss_total": 0.1718485951423645, "step": 2082 }, { "batch_size": 1, "epoch": 0.8328, "step": 2082, "tokens_per_device": 5053 }, { "epoch": 0.8328, "loss_ce": 0.07316642999649048, "loss_lvr": 0.5772877931594849, "loss_mode_switch": 0.0, "loss_total": 0.1308952122926712, "step": 2082 }, { "batch_size": 4, "epoch": 0.8328, "step": 2082, "tokens_per_device": 5108 }, { "epoch": 0.8328, "loss_ce": 0.2007150799036026, "loss_lvr": 0.7132313847541809, "loss_mode_switch": 0.0, "loss_total": 0.27203822135925293, "step": 2082 }, { "batch_size": 4, "epoch": 0.8328, "step": 2082, "tokens_per_device": 1324 }, { "epoch": 0.8328, "loss_ce": 0.07076279819011688, "loss_lvr": 0.869588315486908, "loss_mode_switch": 0.0, "loss_total": 0.1577216386795044, "step": 2082 }, { "batch_size": 1, "epoch": 0.8328, "step": 2082, "tokens_per_device": 5169 }, { "epoch": 0.8328, "loss_ce": 0.09730920195579529, "loss_lvr": 0.5997738838195801, "loss_mode_switch": 0.0, "loss_total": 0.15728658437728882, "step": 2082 }, { "batch_size": 4, "epoch": 0.8328, "step": 2082, "tokens_per_device": 13044 }, { "epoch": 0.8328, "loss_ce": 0.14406564831733704, "loss_lvr": 0.797409176826477, "loss_mode_switch": 0.0, "loss_total": 0.22380656003952026, "step": 2082 }, { "batch_size": 4, "epoch": 0.8328, "step": 2082, "tokens_per_device": 11276 }, { "epoch": 0.8328, "loss_ce": 0.1561659425497055, "loss_lvr": 0.7287994623184204, "loss_mode_switch": 0.0, "loss_total": 0.22904589772224426, "step": 2082 }, { "epoch": 0.8332, "grad_norm": 1.4678349494934082, "learning_rate": 7.120335262582495e-07, "loss": 0.2835, "step": 2083 }, { "batch_size": 1, "epoch": 0.8332, "step": 2083, "tokens_per_device": 4867 }, { "epoch": 0.8332, "loss_ce": 0.1326388120651245, "loss_lvr": 0.22277091443538666, "loss_mode_switch": 0.0, "loss_total": 0.15491589903831482, "step": 2083 }, { "batch_size": 4, "epoch": 0.8332, "step": 2083, "tokens_per_device": 2784 }, { "epoch": 0.8332, "loss_ce": 0.33882394433021545, "loss_lvr": 0.5130499601364136, "loss_mode_switch": 0.0, "loss_total": 0.3901289403438568, "step": 2083 }, { "batch_size": 4, "epoch": 0.8332, "step": 2083, "tokens_per_device": 9168 }, { "epoch": 0.8332, "loss_ce": 0.07933925837278366, "loss_lvr": 0.5323402285575867, "loss_mode_switch": 0.0, "loss_total": 0.13257327675819397, "step": 2083 }, { "batch_size": 1, "epoch": 0.8332, "step": 2083, "tokens_per_device": 4970 }, { "epoch": 0.8332, "loss_ce": 0.039009418338537216, "loss_lvr": 0.18144704401493073, "loss_mode_switch": 0.0, "loss_total": 0.05715412274003029, "step": 2083 }, { "batch_size": 4, "epoch": 0.8332, "step": 2083, "tokens_per_device": 5196 }, { "epoch": 0.8332, "loss_ce": 0.30632922053337097, "loss_lvr": 1.1036523580551147, "loss_mode_switch": 0.0, "loss_total": 0.4166944622993469, "step": 2083 }, { "batch_size": 1, "epoch": 0.8332, "step": 2083, "tokens_per_device": 5104 }, { "epoch": 0.8332, "loss_ce": 0.00287462305277586, "loss_lvr": 0.3971644937992096, "loss_mode_switch": 0.0, "loss_total": 0.042591072618961334, "step": 2083 }, { "batch_size": 4, "epoch": 0.8332, "step": 2083, "tokens_per_device": 4376 }, { "epoch": 0.8332, "loss_ce": 0.011341365985572338, "loss_lvr": 0.7771716713905334, "loss_mode_switch": 0.0, "loss_total": 0.0890585333108902, "step": 2083 }, { "batch_size": 4, "epoch": 0.8332, "step": 2083, "tokens_per_device": 4764 }, { "epoch": 0.8332, "loss_ce": 0.02541857212781906, "loss_lvr": 0.7936438322067261, "loss_mode_switch": 0.0, "loss_total": 0.10478295385837555, "step": 2083 }, { "epoch": 0.8336, "grad_norm": 1.3924530744552612, "learning_rate": 7.087055575779594e-07, "loss": 0.3089, "step": 2084 }, { "batch_size": 4, "epoch": 0.8336, "step": 2084, "tokens_per_device": 4400 }, { "epoch": 0.8336, "loss_ce": 0.3685961663722992, "loss_lvr": 0.8602868318557739, "loss_mode_switch": 0.0, "loss_total": 0.45462486147880554, "step": 2084 }, { "batch_size": 4, "epoch": 0.8336, "step": 2084, "tokens_per_device": 4744 }, { "epoch": 0.8336, "loss_ce": 0.01656518131494522, "loss_lvr": 0.7634265422821045, "loss_mode_switch": 0.0, "loss_total": 0.09290783852338791, "step": 2084 }, { "batch_size": 1, "epoch": 0.8336, "step": 2084, "tokens_per_device": 5171 }, { "epoch": 0.8336, "loss_ce": 0.0006903110770508647, "loss_lvr": 0.32974448800086975, "loss_mode_switch": 0.0, "loss_total": 0.0336647592484951, "step": 2084 }, { "batch_size": 4, "epoch": 0.8336, "step": 2084, "tokens_per_device": 3780 }, { "epoch": 0.8336, "loss_ce": 0.0426395907998085, "loss_lvr": 0.6798257827758789, "loss_mode_switch": 0.0, "loss_total": 0.11062216758728027, "step": 2084 }, { "batch_size": 4, "epoch": 0.8336, "step": 2084, "tokens_per_device": 3744 }, { "epoch": 0.8336, "loss_ce": 0.3111078441143036, "loss_lvr": 0.8598730564117432, "loss_mode_switch": 0.0, "loss_total": 0.3970951437950134, "step": 2084 }, { "batch_size": 4, "epoch": 0.8336, "step": 2084, "tokens_per_device": 4552 }, { "epoch": 0.8336, "loss_ce": 0.6759151220321655, "loss_lvr": 0.8069802522659302, "loss_mode_switch": 0.0, "loss_total": 0.7566131353378296, "step": 2084 }, { "batch_size": 4, "epoch": 0.8336, "step": 2084, "tokens_per_device": 10980 }, { "epoch": 0.8336, "loss_ce": 0.040341105312108994, "loss_lvr": 0.6545553803443909, "loss_mode_switch": 0.0, "loss_total": 0.10579665005207062, "step": 2084 }, { "batch_size": 4, "epoch": 0.8336, "step": 2084, "tokens_per_device": 3848 }, { "epoch": 0.8336, "loss_ce": 0.32048919796943665, "loss_lvr": 0.8609637022018433, "loss_mode_switch": 0.0, "loss_total": 0.40658557415008545, "step": 2084 }, { "epoch": 0.834, "grad_norm": 1.3388687372207642, "learning_rate": 7.053847910866513e-07, "loss": 0.2953, "step": 2085 }, { "batch_size": 1, "epoch": 0.834, "step": 2085, "tokens_per_device": 5213 }, { "epoch": 0.834, "loss_ce": 0.0720900297164917, "loss_lvr": 0.48813411593437195, "loss_mode_switch": 0.0, "loss_total": 0.12090344727039337, "step": 2085 }, { "batch_size": 4, "epoch": 0.834, "step": 2085, "tokens_per_device": 4232 }, { "epoch": 0.834, "loss_ce": 0.006637236103415489, "loss_lvr": 0.8594698309898376, "loss_mode_switch": 0.0, "loss_total": 0.09258422255516052, "step": 2085 }, { "batch_size": 1, "epoch": 0.834, "step": 2085, "tokens_per_device": 4951 }, { "epoch": 0.834, "loss_ce": 0.15063010156154633, "loss_lvr": 0.3860069513320923, "loss_mode_switch": 0.0, "loss_total": 0.1892307996749878, "step": 2085 }, { "batch_size": 1, "epoch": 0.834, "step": 2085, "tokens_per_device": 5015 }, { "epoch": 0.834, "loss_ce": 0.1734636276960373, "loss_lvr": 0.2832794785499573, "loss_mode_switch": 0.0, "loss_total": 0.20179156959056854, "step": 2085 }, { "batch_size": 1, "epoch": 0.834, "step": 2085, "tokens_per_device": 5838 }, { "epoch": 0.834, "loss_ce": 0.020625192672014236, "loss_lvr": 0.42637401819229126, "loss_mode_switch": 0.0, "loss_total": 0.06326259672641754, "step": 2085 }, { "batch_size": 1, "epoch": 0.834, "step": 2085, "tokens_per_device": 4924 }, { "epoch": 0.834, "loss_ce": 0.00218910607509315, "loss_lvr": 0.5647162795066833, "loss_mode_switch": 0.0, "loss_total": 0.058660734444856644, "step": 2085 }, { "batch_size": 4, "epoch": 0.834, "step": 2085, "tokens_per_device": 4708 }, { "epoch": 0.834, "loss_ce": 0.10793706774711609, "loss_lvr": 0.8358802199363708, "loss_mode_switch": 0.0, "loss_total": 0.19152510166168213, "step": 2085 }, { "batch_size": 1, "epoch": 0.834, "step": 2085, "tokens_per_device": 6454 }, { "epoch": 0.834, "loss_ce": 0.00012635339226108044, "loss_lvr": 0.42411214113235474, "loss_mode_switch": 0.0, "loss_total": 0.042537569999694824, "step": 2085 }, { "epoch": 0.8344, "grad_norm": 1.242276668548584, "learning_rate": 7.020712323576556e-07, "loss": 0.2538, "step": 2086 }, { "batch_size": 4, "epoch": 0.8344, "step": 2086, "tokens_per_device": 3772 }, { "epoch": 0.8344, "loss_ce": 0.5103540420532227, "loss_lvr": 0.9465345144271851, "loss_mode_switch": 0.0, "loss_total": 0.6050074696540833, "step": 2086 }, { "batch_size": 4, "epoch": 0.8344, "step": 2086, "tokens_per_device": 4252 }, { "epoch": 0.8344, "loss_ce": 0.3298018276691437, "loss_lvr": 1.0781879425048828, "loss_mode_switch": 0.0, "loss_total": 0.4376206398010254, "step": 2086 }, { "batch_size": 4, "epoch": 0.8344, "step": 2086, "tokens_per_device": 3300 }, { "epoch": 0.8344, "loss_ce": 0.2117801308631897, "loss_lvr": 0.8652398586273193, "loss_mode_switch": 0.0, "loss_total": 0.29830411076545715, "step": 2086 }, { "batch_size": 1, "epoch": 0.8344, "step": 2086, "tokens_per_device": 4875 }, { "epoch": 0.8344, "loss_ce": 0.01032737921923399, "loss_lvr": 0.14435730874538422, "loss_mode_switch": 0.0, "loss_total": 0.024763111025094986, "step": 2086 }, { "batch_size": 4, "epoch": 0.8344, "step": 2086, "tokens_per_device": 3200 }, { "epoch": 0.8344, "loss_ce": 0.5479429364204407, "loss_lvr": 1.0410027503967285, "loss_mode_switch": 0.0, "loss_total": 0.6520432233810425, "step": 2086 }, { "batch_size": 4, "epoch": 0.8344, "step": 2086, "tokens_per_device": 12636 }, { "epoch": 0.8344, "loss_ce": 0.1753050684928894, "loss_lvr": 0.9200720191001892, "loss_mode_switch": 0.0, "loss_total": 0.26731228828430176, "step": 2086 }, { "batch_size": 1, "epoch": 0.8344, "step": 2086, "tokens_per_device": 4893 }, { "epoch": 0.8344, "loss_ce": 0.0015945103950798512, "loss_lvr": 0.4008713662624359, "loss_mode_switch": 0.0, "loss_total": 0.041681647300720215, "step": 2086 }, { "batch_size": 1, "epoch": 0.8344, "step": 2086, "tokens_per_device": 4883 }, { "epoch": 0.8344, "loss_ce": 0.011612963862717152, "loss_lvr": 0.6768210530281067, "loss_mode_switch": 0.0, "loss_total": 0.0792950764298439, "step": 2086 }, { "epoch": 0.8348, "grad_norm": 1.4212547540664673, "learning_rate": 6.987648869521996e-07, "loss": 0.3117, "step": 2087 }, { "batch_size": 1, "epoch": 0.8348, "step": 2087, "tokens_per_device": 4871 }, { "epoch": 0.8348, "loss_ce": 0.0002679784665815532, "loss_lvr": 1.2413930892944336, "loss_mode_switch": 0.0, "loss_total": 0.12440728396177292, "step": 2087 }, { "batch_size": 4, "epoch": 0.8348, "step": 2087, "tokens_per_device": 5796 }, { "epoch": 0.8348, "loss_ce": 0.14660893380641937, "loss_lvr": 0.7616913914680481, "loss_mode_switch": 0.0, "loss_total": 0.2227780818939209, "step": 2087 }, { "batch_size": 1, "epoch": 0.8348, "step": 2087, "tokens_per_device": 4941 }, { "epoch": 0.8348, "loss_ce": 0.020981471985578537, "loss_lvr": 0.4921119511127472, "loss_mode_switch": 0.0, "loss_total": 0.07019266486167908, "step": 2087 }, { "batch_size": 4, "epoch": 0.8348, "step": 2087, "tokens_per_device": 7688 }, { "epoch": 0.8348, "loss_ce": 0.18315790593624115, "loss_lvr": 1.0370807647705078, "loss_mode_switch": 0.0, "loss_total": 0.2868659794330597, "step": 2087 }, { "batch_size": 1, "epoch": 0.8348, "step": 2087, "tokens_per_device": 4889 }, { "epoch": 0.8348, "loss_ce": 0.016436917707324028, "loss_lvr": 0.33747512102127075, "loss_mode_switch": 0.0, "loss_total": 0.050184428691864014, "step": 2087 }, { "batch_size": 1, "epoch": 0.8348, "step": 2087, "tokens_per_device": 6104 }, { "epoch": 0.8348, "loss_ce": 0.011688546277582645, "loss_lvr": 0.261544793844223, "loss_mode_switch": 0.0, "loss_total": 0.03784302622079849, "step": 2087 }, { "batch_size": 4, "epoch": 0.8348, "step": 2087, "tokens_per_device": 5248 }, { "epoch": 0.8348, "loss_ce": 0.24316394329071045, "loss_lvr": 0.7271307706832886, "loss_mode_switch": 0.0, "loss_total": 0.3158770203590393, "step": 2087 }, { "batch_size": 1, "epoch": 0.8348, "step": 2087, "tokens_per_device": 5691 }, { "epoch": 0.8348, "loss_ce": 0.004364228341728449, "loss_lvr": 0.3134748637676239, "loss_mode_switch": 0.0, "loss_total": 0.03571171686053276, "step": 2087 }, { "epoch": 0.8352, "grad_norm": 1.4604450464248657, "learning_rate": 6.954657604194093e-07, "loss": 0.2521, "step": 2088 }, { "batch_size": 4, "epoch": 0.8352, "step": 2088, "tokens_per_device": 4188 }, { "epoch": 0.8352, "loss_ce": 0.30227985978126526, "loss_lvr": 0.9415416121482849, "loss_mode_switch": 0.0, "loss_total": 0.3964340090751648, "step": 2088 }, { "batch_size": 4, "epoch": 0.8352, "step": 2088, "tokens_per_device": 2676 }, { "epoch": 0.8352, "loss_ce": 0.0783146396279335, "loss_lvr": 0.7897924184799194, "loss_mode_switch": 0.0, "loss_total": 0.1572938859462738, "step": 2088 }, { "batch_size": 1, "epoch": 0.8352, "step": 2088, "tokens_per_device": 4776 }, { "epoch": 0.8352, "loss_ce": 0.028134535998106003, "loss_lvr": 0.24485503137111664, "loss_mode_switch": 0.0, "loss_total": 0.05262003839015961, "step": 2088 }, { "batch_size": 1, "epoch": 0.8352, "step": 2088, "tokens_per_device": 5038 }, { "epoch": 0.8352, "loss_ce": 0.00799255259335041, "loss_lvr": 0.2551657557487488, "loss_mode_switch": 0.0, "loss_total": 0.03350912779569626, "step": 2088 }, { "batch_size": 1, "epoch": 0.8352, "step": 2088, "tokens_per_device": 5129 }, { "epoch": 0.8352, "loss_ce": 0.11739704012870789, "loss_lvr": 0.33818700909614563, "loss_mode_switch": 0.0, "loss_total": 0.15121574699878693, "step": 2088 }, { "batch_size": 4, "epoch": 0.8352, "step": 2088, "tokens_per_device": 6400 }, { "epoch": 0.8352, "loss_ce": 0.1519295573234558, "loss_lvr": 0.5097655653953552, "loss_mode_switch": 0.0, "loss_total": 0.20290611684322357, "step": 2088 }, { "batch_size": 4, "epoch": 0.8352, "step": 2088, "tokens_per_device": 7332 }, { "epoch": 0.8352, "loss_ce": 0.2145809680223465, "loss_lvr": 0.8998717665672302, "loss_mode_switch": 0.0, "loss_total": 0.3045681416988373, "step": 2088 }, { "batch_size": 4, "epoch": 0.8352, "step": 2088, "tokens_per_device": 4628 }, { "epoch": 0.8352, "loss_ce": 0.04211055114865303, "loss_lvr": 0.6779422163963318, "loss_mode_switch": 0.0, "loss_total": 0.10990478098392487, "step": 2088 }, { "epoch": 0.8356, "grad_norm": 1.5254589319229126, "learning_rate": 6.921738582962923e-07, "loss": 0.2959, "step": 2089 }, { "batch_size": 1, "epoch": 0.8356, "step": 2089, "tokens_per_device": 5043 }, { "epoch": 0.8356, "loss_ce": 0.0169155802577734, "loss_lvr": 0.21390286087989807, "loss_mode_switch": 0.0, "loss_total": 0.038305867463350296, "step": 2089 }, { "batch_size": 4, "epoch": 0.8356, "step": 2089, "tokens_per_device": 4224 }, { "epoch": 0.8356, "loss_ce": 0.158050537109375, "loss_lvr": 1.0124307870864868, "loss_mode_switch": 0.0, "loss_total": 0.2592936158180237, "step": 2089 }, { "batch_size": 4, "epoch": 0.8356, "step": 2089, "tokens_per_device": 10200 }, { "epoch": 0.8356, "loss_ce": 0.41691744327545166, "loss_lvr": 0.6399857997894287, "loss_mode_switch": 0.0, "loss_total": 0.48091602325439453, "step": 2089 }, { "batch_size": 4, "epoch": 0.8356, "step": 2089, "tokens_per_device": 1464 }, { "epoch": 0.8356, "loss_ce": 0.5215368270874023, "loss_lvr": 0.9043769836425781, "loss_mode_switch": 0.0, "loss_total": 0.6119745373725891, "step": 2089 }, { "batch_size": 4, "epoch": 0.8356, "step": 2089, "tokens_per_device": 4512 }, { "epoch": 0.8356, "loss_ce": 0.7048418521881104, "loss_lvr": 0.8557518124580383, "loss_mode_switch": 0.0, "loss_total": 0.7904170155525208, "step": 2089 }, { "batch_size": 4, "epoch": 0.8356, "step": 2089, "tokens_per_device": 1212 }, { "epoch": 0.8356, "loss_ce": 0.30937817692756653, "loss_lvr": 1.1358821392059326, "loss_mode_switch": 0.0, "loss_total": 0.4229663908481598, "step": 2089 }, { "batch_size": 1, "epoch": 0.8356, "step": 2089, "tokens_per_device": 5050 }, { "epoch": 0.8356, "loss_ce": 0.07140613347291946, "loss_lvr": 0.1581798791885376, "loss_mode_switch": 0.0, "loss_total": 0.08722412586212158, "step": 2089 }, { "batch_size": 1, "epoch": 0.8356, "step": 2089, "tokens_per_device": 4741 }, { "epoch": 0.8356, "loss_ce": 0.000490704202093184, "loss_lvr": 0.8002310991287231, "loss_mode_switch": 0.0, "loss_total": 0.08051381260156631, "step": 2089 }, { "epoch": 0.836, "grad_norm": 1.3694926500320435, "learning_rate": 6.888891861077301e-07, "loss": 0.2752, "step": 2090 }, { "batch_size": 4, "epoch": 0.836, "step": 2090, "tokens_per_device": 4288 }, { "epoch": 0.836, "loss_ce": 0.16786549985408783, "loss_lvr": 0.970048189163208, "loss_mode_switch": 0.0, "loss_total": 0.2648703157901764, "step": 2090 }, { "batch_size": 1, "epoch": 0.836, "step": 2090, "tokens_per_device": 4929 }, { "epoch": 0.836, "loss_ce": 0.027937375009059906, "loss_lvr": 0.2567296624183655, "loss_mode_switch": 0.0, "loss_total": 0.053610339760780334, "step": 2090 }, { "batch_size": 4, "epoch": 0.836, "step": 2090, "tokens_per_device": 3876 }, { "epoch": 0.836, "loss_ce": 0.12425354868173599, "loss_lvr": 0.9433028101921082, "loss_mode_switch": 0.0, "loss_total": 0.2185838222503662, "step": 2090 }, { "batch_size": 1, "epoch": 0.836, "step": 2090, "tokens_per_device": 5014 }, { "epoch": 0.836, "loss_ce": 0.2633572816848755, "loss_lvr": 0.390221506357193, "loss_mode_switch": 0.0, "loss_total": 0.30237942934036255, "step": 2090 }, { "batch_size": 1, "epoch": 0.836, "step": 2090, "tokens_per_device": 5190 }, { "epoch": 0.836, "loss_ce": 0.9711923003196716, "loss_lvr": 0.3777037560939789, "loss_mode_switch": 0.0, "loss_total": 1.008962631225586, "step": 2090 }, { "batch_size": 1, "epoch": 0.836, "step": 2090, "tokens_per_device": 5152 }, { "epoch": 0.836, "loss_ce": 0.000822918489575386, "loss_lvr": 0.2491031140089035, "loss_mode_switch": 0.0, "loss_total": 0.025733230635523796, "step": 2090 }, { "batch_size": 4, "epoch": 0.836, "step": 2090, "tokens_per_device": 5912 }, { "epoch": 0.836, "loss_ce": 0.02996368706226349, "loss_lvr": 0.7171869277954102, "loss_mode_switch": 0.0, "loss_total": 0.1016823798418045, "step": 2090 }, { "batch_size": 1, "epoch": 0.836, "step": 2090, "tokens_per_device": 4865 }, { "epoch": 0.836, "loss_ce": 0.10703108459711075, "loss_lvr": 0.42341068387031555, "loss_mode_switch": 0.0, "loss_total": 0.1493721604347229, "step": 2090 }, { "epoch": 0.8364, "grad_norm": 1.491112232208252, "learning_rate": 6.856117493664743e-07, "loss": 0.3062, "step": 2091 }, { "batch_size": 4, "epoch": 0.8364, "step": 2091, "tokens_per_device": 7168 }, { "epoch": 0.8364, "loss_ce": 0.04767696559429169, "loss_lvr": 0.45915859937667847, "loss_mode_switch": 0.0, "loss_total": 0.0935928225517273, "step": 2091 }, { "batch_size": 4, "epoch": 0.8364, "step": 2091, "tokens_per_device": 3676 }, { "epoch": 0.8364, "loss_ce": 0.0215529203414917, "loss_lvr": 0.9312812089920044, "loss_mode_switch": 0.0, "loss_total": 0.11468104273080826, "step": 2091 }, { "batch_size": 4, "epoch": 0.8364, "step": 2091, "tokens_per_device": 11752 }, { "epoch": 0.8364, "loss_ce": 0.030626796185970306, "loss_lvr": 0.42790424823760986, "loss_mode_switch": 0.0, "loss_total": 0.07341721653938293, "step": 2091 }, { "batch_size": 4, "epoch": 0.8364, "step": 2091, "tokens_per_device": 5756 }, { "epoch": 0.8364, "loss_ce": 0.7816654443740845, "loss_lvr": 0.7745158076286316, "loss_mode_switch": 0.0, "loss_total": 0.8591170310974121, "step": 2091 }, { "batch_size": 4, "epoch": 0.8364, "step": 2091, "tokens_per_device": 1556 }, { "epoch": 0.8364, "loss_ce": 0.310810387134552, "loss_lvr": 0.9920799136161804, "loss_mode_switch": 0.0, "loss_total": 0.4100183844566345, "step": 2091 }, { "batch_size": 4, "epoch": 0.8364, "step": 2091, "tokens_per_device": 1404 }, { "epoch": 0.8364, "loss_ce": 0.6392719745635986, "loss_lvr": 0.9419684410095215, "loss_mode_switch": 0.0, "loss_total": 0.7334688305854797, "step": 2091 }, { "batch_size": 4, "epoch": 0.8364, "step": 2091, "tokens_per_device": 4080 }, { "epoch": 0.8364, "loss_ce": 0.38050806522369385, "loss_lvr": 1.047979712486267, "loss_mode_switch": 0.0, "loss_total": 0.4853060245513916, "step": 2091 }, { "batch_size": 4, "epoch": 0.8364, "step": 2091, "tokens_per_device": 4244 }, { "epoch": 0.8364, "loss_ce": 0.09892243891954422, "loss_lvr": 0.5901097655296326, "loss_mode_switch": 0.0, "loss_total": 0.15793341398239136, "step": 2091 }, { "epoch": 0.8368, "grad_norm": 1.3177961111068726, "learning_rate": 6.823415535731303e-07, "loss": 0.2995, "step": 2092 }, { "batch_size": 4, "epoch": 0.8368, "step": 2092, "tokens_per_device": 3928 }, { "epoch": 0.8368, "loss_ce": 0.19408918917179108, "loss_lvr": 0.9715620279312134, "loss_mode_switch": 0.0, "loss_total": 0.29124540090560913, "step": 2092 }, { "batch_size": 4, "epoch": 0.8368, "step": 2092, "tokens_per_device": 3860 }, { "epoch": 0.8368, "loss_ce": 0.26805591583251953, "loss_lvr": 0.8348225951194763, "loss_mode_switch": 0.0, "loss_total": 0.35153818130493164, "step": 2092 }, { "batch_size": 4, "epoch": 0.8368, "step": 2092, "tokens_per_device": 5164 }, { "epoch": 0.8368, "loss_ce": 0.095518559217453, "loss_lvr": 1.1000126600265503, "loss_mode_switch": 0.0, "loss_total": 0.20551982522010803, "step": 2092 }, { "batch_size": 1, "epoch": 0.8368, "step": 2092, "tokens_per_device": 4748 }, { "epoch": 0.8368, "loss_ce": 0.10075890272855759, "loss_lvr": 0.31618475914001465, "loss_mode_switch": 0.0, "loss_total": 0.13237738609313965, "step": 2092 }, { "batch_size": 4, "epoch": 0.8368, "step": 2092, "tokens_per_device": 4856 }, { "epoch": 0.8368, "loss_ce": 0.05272572115063667, "loss_lvr": 0.7785500884056091, "loss_mode_switch": 0.0, "loss_total": 0.13058073818683624, "step": 2092 }, { "batch_size": 4, "epoch": 0.8368, "step": 2092, "tokens_per_device": 4576 }, { "epoch": 0.8368, "loss_ce": 0.2649480104446411, "loss_lvr": 0.7345832586288452, "loss_mode_switch": 0.0, "loss_total": 0.3384063243865967, "step": 2092 }, { "batch_size": 4, "epoch": 0.8368, "step": 2092, "tokens_per_device": 5860 }, { "epoch": 0.8368, "loss_ce": 0.08293146640062332, "loss_lvr": 0.6772445440292358, "loss_mode_switch": 0.0, "loss_total": 0.15065592527389526, "step": 2092 }, { "batch_size": 4, "epoch": 0.8368, "step": 2092, "tokens_per_device": 1604 }, { "epoch": 0.8368, "loss_ce": 0.41306623816490173, "loss_lvr": 0.9727914929389954, "loss_mode_switch": 0.0, "loss_total": 0.5103453993797302, "step": 2092 }, { "epoch": 0.8372, "grad_norm": 1.3119412660598755, "learning_rate": 6.790786042161507e-07, "loss": 0.2635, "step": 2093 }, { "batch_size": 4, "epoch": 0.8372, "step": 2093, "tokens_per_device": 5664 }, { "epoch": 0.8372, "loss_ce": 0.4250631332397461, "loss_lvr": 0.758306086063385, "loss_mode_switch": 0.0, "loss_total": 0.5008937120437622, "step": 2093 }, { "batch_size": 4, "epoch": 0.8372, "step": 2093, "tokens_per_device": 5676 }, { "epoch": 0.8372, "loss_ce": 0.00418426189571619, "loss_lvr": 0.6620550751686096, "loss_mode_switch": 0.0, "loss_total": 0.0703897699713707, "step": 2093 }, { "batch_size": 4, "epoch": 0.8372, "step": 2093, "tokens_per_device": 1668 }, { "epoch": 0.8372, "loss_ce": 0.7552701830863953, "loss_lvr": 1.0064359903335571, "loss_mode_switch": 0.0, "loss_total": 0.8559137582778931, "step": 2093 }, { "batch_size": 4, "epoch": 0.8372, "step": 2093, "tokens_per_device": 1548 }, { "epoch": 0.8372, "loss_ce": 0.30712491273880005, "loss_lvr": 0.960295557975769, "loss_mode_switch": 0.0, "loss_total": 0.4031544625759125, "step": 2093 }, { "batch_size": 1, "epoch": 0.8372, "step": 2093, "tokens_per_device": 5365 }, { "epoch": 0.8372, "loss_ce": 0.015773214399814606, "loss_lvr": 0.26387983560562134, "loss_mode_switch": 0.0, "loss_total": 0.04216119647026062, "step": 2093 }, { "batch_size": 1, "epoch": 0.8372, "step": 2093, "tokens_per_device": 4906 }, { "epoch": 0.8372, "loss_ce": 0.34181737899780273, "loss_lvr": 0.6168408989906311, "loss_mode_switch": 0.0, "loss_total": 0.4035014808177948, "step": 2093 }, { "batch_size": 1, "epoch": 0.8372, "step": 2093, "tokens_per_device": 5240 }, { "epoch": 0.8372, "loss_ce": 0.01168469525873661, "loss_lvr": 0.24168503284454346, "loss_mode_switch": 0.0, "loss_total": 0.035853199660778046, "step": 2093 }, { "batch_size": 1, "epoch": 0.8372, "step": 2093, "tokens_per_device": 5205 }, { "epoch": 0.8372, "loss_ce": 0.11596944183111191, "loss_lvr": 0.64423668384552, "loss_mode_switch": 0.0, "loss_total": 0.18039311468601227, "step": 2093 }, { "epoch": 0.8376, "grad_norm": 1.6554362773895264, "learning_rate": 6.758229067718269e-07, "loss": 0.3, "step": 2094 }, { "batch_size": 4, "epoch": 0.8376, "step": 2094, "tokens_per_device": 6120 }, { "epoch": 0.8376, "loss_ce": 0.22129622101783752, "loss_lvr": 0.7475382685661316, "loss_mode_switch": 0.0, "loss_total": 0.2960500419139862, "step": 2094 }, { "batch_size": 4, "epoch": 0.8376, "step": 2094, "tokens_per_device": 5532 }, { "epoch": 0.8376, "loss_ce": 0.001002027653157711, "loss_lvr": 0.6948198080062866, "loss_mode_switch": 0.0, "loss_total": 0.07048401236534119, "step": 2094 }, { "batch_size": 1, "epoch": 0.8376, "step": 2094, "tokens_per_device": 4881 }, { "epoch": 0.8376, "loss_ce": 0.4386216104030609, "loss_lvr": 0.4614869952201843, "loss_mode_switch": 0.0, "loss_total": 0.4847702980041504, "step": 2094 }, { "batch_size": 4, "epoch": 0.8376, "step": 2094, "tokens_per_device": 4240 }, { "epoch": 0.8376, "loss_ce": 0.47891128063201904, "loss_lvr": 0.57935631275177, "loss_mode_switch": 0.0, "loss_total": 0.536846935749054, "step": 2094 }, { "batch_size": 1, "epoch": 0.8376, "step": 2094, "tokens_per_device": 4893 }, { "epoch": 0.8376, "loss_ce": 0.0644552931189537, "loss_lvr": 0.9613869190216064, "loss_mode_switch": 0.0, "loss_total": 0.16059398651123047, "step": 2094 }, { "batch_size": 4, "epoch": 0.8376, "step": 2094, "tokens_per_device": 4684 }, { "epoch": 0.8376, "loss_ce": 0.11376319080591202, "loss_lvr": 0.8143197894096375, "loss_mode_switch": 0.0, "loss_total": 0.19519516825675964, "step": 2094 }, { "batch_size": 4, "epoch": 0.8376, "step": 2094, "tokens_per_device": 4276 }, { "epoch": 0.8376, "loss_ce": 0.35729989409446716, "loss_lvr": 0.9350714087486267, "loss_mode_switch": 0.0, "loss_total": 0.45080703496932983, "step": 2094 }, { "batch_size": 1, "epoch": 0.8376, "step": 2094, "tokens_per_device": 4883 }, { "epoch": 0.8376, "loss_ce": 0.13320474326610565, "loss_lvr": 0.1580890715122223, "loss_mode_switch": 0.0, "loss_total": 0.14901365339756012, "step": 2094 }, { "epoch": 0.838, "grad_norm": 1.3645297288894653, "learning_rate": 6.725744667042778e-07, "loss": 0.2685, "step": 2095 }, { "batch_size": 4, "epoch": 0.838, "step": 2095, "tokens_per_device": 4380 }, { "epoch": 0.838, "loss_ce": 0.01801062375307083, "loss_lvr": 1.0414332151412964, "loss_mode_switch": 0.0, "loss_total": 0.12215394526720047, "step": 2095 }, { "batch_size": 4, "epoch": 0.838, "step": 2095, "tokens_per_device": 4232 }, { "epoch": 0.838, "loss_ce": 0.06437226384878159, "loss_lvr": 0.7181102633476257, "loss_mode_switch": 0.0, "loss_total": 0.13618329167366028, "step": 2095 }, { "batch_size": 4, "epoch": 0.838, "step": 2095, "tokens_per_device": 2616 }, { "epoch": 0.838, "loss_ce": 0.36577484011650085, "loss_lvr": 0.8115862011909485, "loss_mode_switch": 0.0, "loss_total": 0.44693344831466675, "step": 2095 }, { "batch_size": 4, "epoch": 0.838, "step": 2095, "tokens_per_device": 1428 }, { "epoch": 0.838, "loss_ce": 0.3891453444957733, "loss_lvr": 0.9702142477035522, "loss_mode_switch": 0.0, "loss_total": 0.486166775226593, "step": 2095 }, { "batch_size": 4, "epoch": 0.838, "step": 2095, "tokens_per_device": 4216 }, { "epoch": 0.838, "loss_ce": 0.16713300347328186, "loss_lvr": 0.8798322081565857, "loss_mode_switch": 0.0, "loss_total": 0.25511622428894043, "step": 2095 }, { "batch_size": 1, "epoch": 0.838, "step": 2095, "tokens_per_device": 4888 }, { "epoch": 0.838, "loss_ce": 0.004801961127668619, "loss_lvr": 0.41136863827705383, "loss_mode_switch": 0.0, "loss_total": 0.045938827097415924, "step": 2095 }, { "batch_size": 4, "epoch": 0.838, "step": 2095, "tokens_per_device": 2804 }, { "epoch": 0.838, "loss_ce": 0.12595203518867493, "loss_lvr": 0.6246116757392883, "loss_mode_switch": 0.0, "loss_total": 0.18841320276260376, "step": 2095 }, { "batch_size": 1, "epoch": 0.838, "step": 2095, "tokens_per_device": 5114 }, { "epoch": 0.838, "loss_ce": 0.005989876575767994, "loss_lvr": 0.555961549282074, "loss_mode_switch": 0.0, "loss_total": 0.06158602982759476, "step": 2095 }, { "epoch": 0.8384, "grad_norm": 1.3444794416427612, "learning_rate": 6.693332894654442e-07, "loss": 0.2818, "step": 2096 }, { "batch_size": 4, "epoch": 0.8384, "step": 2096, "tokens_per_device": 4316 }, { "epoch": 0.8384, "loss_ce": 0.050865236669778824, "loss_lvr": 0.8074077367782593, "loss_mode_switch": 0.0, "loss_total": 0.13160601258277893, "step": 2096 }, { "batch_size": 4, "epoch": 0.8384, "step": 2096, "tokens_per_device": 2696 }, { "epoch": 0.8384, "loss_ce": 0.9323420524597168, "loss_lvr": 0.7432469129562378, "loss_mode_switch": 0.0, "loss_total": 1.0066667795181274, "step": 2096 }, { "batch_size": 4, "epoch": 0.8384, "step": 2096, "tokens_per_device": 5804 }, { "epoch": 0.8384, "loss_ce": 0.11271359026432037, "loss_lvr": 0.8921561241149902, "loss_mode_switch": 0.0, "loss_total": 0.2019292116165161, "step": 2096 }, { "batch_size": 1, "epoch": 0.8384, "step": 2096, "tokens_per_device": 4898 }, { "epoch": 0.8384, "loss_ce": 0.026553506031632423, "loss_lvr": 0.5081864595413208, "loss_mode_switch": 0.0, "loss_total": 0.07737215608358383, "step": 2096 }, { "batch_size": 4, "epoch": 0.8384, "step": 2096, "tokens_per_device": 3792 }, { "epoch": 0.8384, "loss_ce": 0.09692961722612381, "loss_lvr": 0.6704480648040771, "loss_mode_switch": 0.0, "loss_total": 0.16397443413734436, "step": 2096 }, { "batch_size": 4, "epoch": 0.8384, "step": 2096, "tokens_per_device": 3740 }, { "epoch": 0.8384, "loss_ce": 0.33276301622390747, "loss_lvr": 0.8530101776123047, "loss_mode_switch": 0.0, "loss_total": 0.41806402802467346, "step": 2096 }, { "batch_size": 1, "epoch": 0.8384, "step": 2096, "tokens_per_device": 4890 }, { "epoch": 0.8384, "loss_ce": 0.41275984048843384, "loss_lvr": 0.4583558142185211, "loss_mode_switch": 0.0, "loss_total": 0.4585954248905182, "step": 2096 }, { "batch_size": 4, "epoch": 0.8384, "step": 2096, "tokens_per_device": 4296 }, { "epoch": 0.8384, "loss_ce": 0.002312140306457877, "loss_lvr": 0.663665771484375, "loss_mode_switch": 0.0, "loss_total": 0.06867871433496475, "step": 2096 }, { "epoch": 0.8388, "grad_norm": 1.3617587089538574, "learning_rate": 6.660993804950777e-07, "loss": 0.2985, "step": 2097 }, { "batch_size": 4, "epoch": 0.8388, "step": 2097, "tokens_per_device": 4804 }, { "epoch": 0.8388, "loss_ce": 0.0740479901432991, "loss_lvr": 0.8547743558883667, "loss_mode_switch": 0.0, "loss_total": 0.15952542424201965, "step": 2097 }, { "batch_size": 4, "epoch": 0.8388, "step": 2097, "tokens_per_device": 4328 }, { "epoch": 0.8388, "loss_ce": 0.07144878804683685, "loss_lvr": 0.7245219945907593, "loss_mode_switch": 0.0, "loss_total": 0.14390099048614502, "step": 2097 }, { "batch_size": 1, "epoch": 0.8388, "step": 2097, "tokens_per_device": 5103 }, { "epoch": 0.8388, "loss_ce": 0.0026622447185218334, "loss_lvr": 0.482247918844223, "loss_mode_switch": 0.0, "loss_total": 0.05088703706860542, "step": 2097 }, { "batch_size": 4, "epoch": 0.8388, "step": 2097, "tokens_per_device": 1304 }, { "epoch": 0.8388, "loss_ce": 0.4863502085208893, "loss_lvr": 0.8963702321052551, "loss_mode_switch": 0.0, "loss_total": 0.5759872198104858, "step": 2097 }, { "batch_size": 4, "epoch": 0.8388, "step": 2097, "tokens_per_device": 4516 }, { "epoch": 0.8388, "loss_ce": 0.22111552953720093, "loss_lvr": 1.0074883699417114, "loss_mode_switch": 0.0, "loss_total": 0.32186436653137207, "step": 2097 }, { "batch_size": 4, "epoch": 0.8388, "step": 2097, "tokens_per_device": 4232 }, { "epoch": 0.8388, "loss_ce": 0.2954719364643097, "loss_lvr": 0.8808315396308899, "loss_mode_switch": 0.0, "loss_total": 0.3835550844669342, "step": 2097 }, { "batch_size": 4, "epoch": 0.8388, "step": 2097, "tokens_per_device": 1644 }, { "epoch": 0.8388, "loss_ce": 0.23913916945457458, "loss_lvr": 0.9089823365211487, "loss_mode_switch": 0.0, "loss_total": 0.3300374150276184, "step": 2097 }, { "batch_size": 4, "epoch": 0.8388, "step": 2097, "tokens_per_device": 4816 }, { "epoch": 0.8388, "loss_ce": 0.16887633502483368, "loss_lvr": 0.7960191369056702, "loss_mode_switch": 0.0, "loss_total": 0.2484782487154007, "step": 2097 }, { "epoch": 0.8392, "grad_norm": 1.2683454751968384, "learning_rate": 6.62872745220729e-07, "loss": 0.3122, "step": 2098 }, { "batch_size": 4, "epoch": 0.8392, "step": 2098, "tokens_per_device": 4192 }, { "epoch": 0.8392, "loss_ce": 0.15284356474876404, "loss_lvr": 0.6339877843856812, "loss_mode_switch": 0.0, "loss_total": 0.21624234318733215, "step": 2098 }, { "batch_size": 4, "epoch": 0.8392, "step": 2098, "tokens_per_device": 5640 }, { "epoch": 0.8392, "loss_ce": 0.36744117736816406, "loss_lvr": 0.7342973351478577, "loss_mode_switch": 0.0, "loss_total": 0.44087091088294983, "step": 2098 }, { "batch_size": 1, "epoch": 0.8392, "step": 2098, "tokens_per_device": 5297 }, { "epoch": 0.8392, "loss_ce": 0.13111388683319092, "loss_lvr": 0.1614457368850708, "loss_mode_switch": 0.0, "loss_total": 0.147258460521698, "step": 2098 }, { "batch_size": 1, "epoch": 0.8392, "step": 2098, "tokens_per_device": 5163 }, { "epoch": 0.8392, "loss_ce": 0.022858740761876106, "loss_lvr": 0.44600754976272583, "loss_mode_switch": 0.0, "loss_total": 0.06745949387550354, "step": 2098 }, { "batch_size": 4, "epoch": 0.8392, "step": 2098, "tokens_per_device": 4180 }, { "epoch": 0.8392, "loss_ce": 0.0024876839015632868, "loss_lvr": 0.817237913608551, "loss_mode_switch": 0.0, "loss_total": 0.08421147614717484, "step": 2098 }, { "batch_size": 4, "epoch": 0.8392, "step": 2098, "tokens_per_device": 3756 }, { "epoch": 0.8392, "loss_ce": 0.14502295851707458, "loss_lvr": 0.5612069368362427, "loss_mode_switch": 0.0, "loss_total": 0.20114365220069885, "step": 2098 }, { "batch_size": 4, "epoch": 0.8392, "step": 2098, "tokens_per_device": 6124 }, { "epoch": 0.8392, "loss_ce": 0.13699942827224731, "loss_lvr": 0.8312578201293945, "loss_mode_switch": 0.0, "loss_total": 0.220125213265419, "step": 2098 }, { "batch_size": 4, "epoch": 0.8392, "step": 2098, "tokens_per_device": 15724 }, { "epoch": 0.8392, "loss_ce": 0.3916685879230499, "loss_lvr": 0.8565194606781006, "loss_mode_switch": 0.0, "loss_total": 0.4773205518722534, "step": 2098 }, { "epoch": 0.8396, "grad_norm": 1.2509772777557373, "learning_rate": 6.596533890577417e-07, "loss": 0.2777, "step": 2099 }, { "batch_size": 4, "epoch": 0.8396, "step": 2099, "tokens_per_device": 5480 }, { "epoch": 0.8396, "loss_ce": 0.3679203391075134, "loss_lvr": 0.7071868777275085, "loss_mode_switch": 0.0, "loss_total": 0.4386390447616577, "step": 2099 }, { "batch_size": 4, "epoch": 0.8396, "step": 2099, "tokens_per_device": 2620 }, { "epoch": 0.8396, "loss_ce": 0.2937415838241577, "loss_lvr": 0.8838455677032471, "loss_mode_switch": 0.0, "loss_total": 0.3821261525154114, "step": 2099 }, { "batch_size": 4, "epoch": 0.8396, "step": 2099, "tokens_per_device": 4316 }, { "epoch": 0.8396, "loss_ce": 0.26447802782058716, "loss_lvr": 0.7738211750984192, "loss_mode_switch": 0.0, "loss_total": 0.3418601453304291, "step": 2099 }, { "batch_size": 1, "epoch": 0.8396, "step": 2099, "tokens_per_device": 4945 }, { "epoch": 0.8396, "loss_ce": 0.00021215331798885018, "loss_lvr": 0.23010848462581635, "loss_mode_switch": 0.0, "loss_total": 0.02322300150990486, "step": 2099 }, { "batch_size": 4, "epoch": 0.8396, "step": 2099, "tokens_per_device": 3864 }, { "epoch": 0.8396, "loss_ce": 0.20826947689056396, "loss_lvr": 0.8631685972213745, "loss_mode_switch": 0.0, "loss_total": 0.29458633065223694, "step": 2099 }, { "batch_size": 4, "epoch": 0.8396, "step": 2099, "tokens_per_device": 4256 }, { "epoch": 0.8396, "loss_ce": 0.5831334590911865, "loss_lvr": 1.1335750818252563, "loss_mode_switch": 0.0, "loss_total": 0.6964909434318542, "step": 2099 }, { "batch_size": 4, "epoch": 0.8396, "step": 2099, "tokens_per_device": 4728 }, { "epoch": 0.8396, "loss_ce": 0.5195842385292053, "loss_lvr": 0.7862986326217651, "loss_mode_switch": 0.0, "loss_total": 0.5982140898704529, "step": 2099 }, { "batch_size": 4, "epoch": 0.8396, "step": 2099, "tokens_per_device": 3988 }, { "epoch": 0.8396, "loss_ce": 0.21682433784008026, "loss_lvr": 0.7789077162742615, "loss_mode_switch": 0.0, "loss_total": 0.29471510648727417, "step": 2099 }, { "epoch": 0.84, "grad_norm": 1.2981246709823608, "learning_rate": 6.564413174092443e-07, "loss": 0.2917, "step": 2100 }, { "batch_size": 4, "epoch": 0.84, "step": 2100, "tokens_per_device": 6392 }, { "epoch": 0.84, "loss_ce": 0.4380192458629608, "loss_lvr": 0.7558311223983765, "loss_mode_switch": 0.0, "loss_total": 0.5136023759841919, "step": 2100 }, { "batch_size": 4, "epoch": 0.84, "step": 2100, "tokens_per_device": 4812 }, { "epoch": 0.84, "loss_ce": 0.2707870602607727, "loss_lvr": 0.638149082660675, "loss_mode_switch": 0.0, "loss_total": 0.3346019685268402, "step": 2100 }, { "batch_size": 4, "epoch": 0.84, "step": 2100, "tokens_per_device": 4268 }, { "epoch": 0.84, "loss_ce": 0.13115763664245605, "loss_lvr": 0.3967914283275604, "loss_mode_switch": 0.0, "loss_total": 0.17083677649497986, "step": 2100 }, { "batch_size": 1, "epoch": 0.84, "step": 2100, "tokens_per_device": 5690 }, { "epoch": 0.84, "loss_ce": 0.001660453388467431, "loss_lvr": 0.24577529728412628, "loss_mode_switch": 0.0, "loss_total": 0.026237983256578445, "step": 2100 }, { "batch_size": 1, "epoch": 0.84, "step": 2100, "tokens_per_device": 4919 }, { "epoch": 0.84, "loss_ce": 0.633659839630127, "loss_lvr": 0.7863599061965942, "loss_mode_switch": 0.0, "loss_total": 0.7122958302497864, "step": 2100 }, { "batch_size": 1, "epoch": 0.84, "step": 2100, "tokens_per_device": 4886 }, { "epoch": 0.84, "loss_ce": 0.019691530615091324, "loss_lvr": 0.5596796870231628, "loss_mode_switch": 0.0, "loss_total": 0.07565949857234955, "step": 2100 }, { "batch_size": 4, "epoch": 0.84, "step": 2100, "tokens_per_device": 12836 }, { "epoch": 0.84, "loss_ce": 0.20743399858474731, "loss_lvr": 0.7400829195976257, "loss_mode_switch": 0.0, "loss_total": 0.2814422845840454, "step": 2100 }, { "batch_size": 1, "epoch": 0.84, "step": 2100, "tokens_per_device": 5130 }, { "epoch": 0.84, "loss_ce": 0.001468006637878716, "loss_lvr": 0.3792756497859955, "loss_mode_switch": 0.0, "loss_total": 0.03939557075500488, "step": 2100 }, { "epoch": 0.8404, "grad_norm": 1.281936526298523, "learning_rate": 6.532365356661397e-07, "loss": 0.2541, "step": 2101 }, { "batch_size": 4, "epoch": 0.8404, "step": 2101, "tokens_per_device": 4412 }, { "epoch": 0.8404, "loss_ce": 0.23145240545272827, "loss_lvr": 0.939706563949585, "loss_mode_switch": 0.0, "loss_total": 0.32542306184768677, "step": 2101 }, { "batch_size": 4, "epoch": 0.8404, "step": 2101, "tokens_per_device": 2640 }, { "epoch": 0.8404, "loss_ce": 0.05626796931028366, "loss_lvr": 0.7965240478515625, "loss_mode_switch": 0.0, "loss_total": 0.13592037558555603, "step": 2101 }, { "batch_size": 1, "epoch": 0.8404, "step": 2101, "tokens_per_device": 4569 }, { "epoch": 0.8404, "loss_ce": 0.2914421260356903, "loss_lvr": 0.7047483325004578, "loss_mode_switch": 0.0, "loss_total": 0.3619169592857361, "step": 2101 }, { "batch_size": 4, "epoch": 0.8404, "step": 2101, "tokens_per_device": 3756 }, { "epoch": 0.8404, "loss_ce": 0.3993239402770996, "loss_lvr": 1.056945562362671, "loss_mode_switch": 0.0, "loss_total": 0.5050184726715088, "step": 2101 }, { "batch_size": 1, "epoch": 0.8404, "step": 2101, "tokens_per_device": 5204 }, { "epoch": 0.8404, "loss_ce": 0.05972280353307724, "loss_lvr": 0.49171745777130127, "loss_mode_switch": 0.0, "loss_total": 0.10889454931020737, "step": 2101 }, { "batch_size": 4, "epoch": 0.8404, "step": 2101, "tokens_per_device": 5028 }, { "epoch": 0.8404, "loss_ce": 0.19333286583423615, "loss_lvr": 0.9134273529052734, "loss_mode_switch": 0.0, "loss_total": 0.28467559814453125, "step": 2101 }, { "batch_size": 1, "epoch": 0.8404, "step": 2101, "tokens_per_device": 4901 }, { "epoch": 0.8404, "loss_ce": 0.1355832815170288, "loss_lvr": 0.3388504087924957, "loss_mode_switch": 0.0, "loss_total": 0.16946832835674286, "step": 2101 }, { "batch_size": 1, "epoch": 0.8404, "step": 2101, "tokens_per_device": 4810 }, { "epoch": 0.8404, "loss_ce": 0.0003979243920184672, "loss_lvr": 0.3893698453903198, "loss_mode_switch": 0.0, "loss_total": 0.03933490812778473, "step": 2101 }, { "epoch": 0.8408, "grad_norm": 1.3723291158676147, "learning_rate": 6.500390492070941e-07, "loss": 0.2696, "step": 2102 }, { "batch_size": 4, "epoch": 0.8408, "step": 2102, "tokens_per_device": 4548 }, { "epoch": 0.8408, "loss_ce": 0.16922082006931305, "loss_lvr": 0.9607473015785217, "loss_mode_switch": 0.0, "loss_total": 0.2652955651283264, "step": 2102 }, { "batch_size": 4, "epoch": 0.8408, "step": 2102, "tokens_per_device": 4712 }, { "epoch": 0.8408, "loss_ce": 0.20861077308654785, "loss_lvr": 0.7545948624610901, "loss_mode_switch": 0.0, "loss_total": 0.2840702533721924, "step": 2102 }, { "batch_size": 4, "epoch": 0.8408, "step": 2102, "tokens_per_device": 4448 }, { "epoch": 0.8408, "loss_ce": 0.23174196481704712, "loss_lvr": 0.8382858037948608, "loss_mode_switch": 0.0, "loss_total": 0.31557053327560425, "step": 2102 }, { "batch_size": 4, "epoch": 0.8408, "step": 2102, "tokens_per_device": 5760 }, { "epoch": 0.8408, "loss_ce": 0.3459492325782776, "loss_lvr": 0.7559651136398315, "loss_mode_switch": 0.0, "loss_total": 0.42154574394226074, "step": 2102 }, { "batch_size": 4, "epoch": 0.8408, "step": 2102, "tokens_per_device": 9564 }, { "epoch": 0.8408, "loss_ce": 0.3364899158477783, "loss_lvr": 0.6405373215675354, "loss_mode_switch": 0.0, "loss_total": 0.4005436599254608, "step": 2102 }, { "batch_size": 1, "epoch": 0.8408, "step": 2102, "tokens_per_device": 4878 }, { "epoch": 0.8408, "loss_ce": 0.010096216574311256, "loss_lvr": 0.29672712087631226, "loss_mode_switch": 0.0, "loss_total": 0.03976892679929733, "step": 2102 }, { "batch_size": 4, "epoch": 0.8408, "step": 2102, "tokens_per_device": 3948 }, { "epoch": 0.8408, "loss_ce": 0.066411592066288, "loss_lvr": 0.9003022909164429, "loss_mode_switch": 0.0, "loss_total": 0.1564418226480484, "step": 2102 }, { "batch_size": 4, "epoch": 0.8408, "step": 2102, "tokens_per_device": 4304 }, { "epoch": 0.8408, "loss_ce": 0.4032289683818817, "loss_lvr": 0.7731809616088867, "loss_mode_switch": 0.0, "loss_total": 0.48054707050323486, "step": 2102 }, { "epoch": 0.8412, "grad_norm": 1.3415554761886597, "learning_rate": 6.468488633985299e-07, "loss": 0.2906, "step": 2103 }, { "batch_size": 4, "epoch": 0.8412, "step": 2103, "tokens_per_device": 5628 }, { "epoch": 0.8412, "loss_ce": 0.43134474754333496, "loss_lvr": 0.7106929421424866, "loss_mode_switch": 0.0, "loss_total": 0.5024140477180481, "step": 2103 }, { "batch_size": 4, "epoch": 0.8412, "step": 2103, "tokens_per_device": 3944 }, { "epoch": 0.8412, "loss_ce": 0.1555081158876419, "loss_lvr": 0.8411197662353516, "loss_mode_switch": 0.0, "loss_total": 0.23962008953094482, "step": 2103 }, { "batch_size": 1, "epoch": 0.8412, "step": 2103, "tokens_per_device": 5185 }, { "epoch": 0.8412, "loss_ce": 0.09791049361228943, "loss_lvr": 0.4640069007873535, "loss_mode_switch": 0.0, "loss_total": 0.14431118965148926, "step": 2103 }, { "batch_size": 4, "epoch": 0.8412, "step": 2103, "tokens_per_device": 1532 }, { "epoch": 0.8412, "loss_ce": 0.8173738718032837, "loss_lvr": 0.8281189799308777, "loss_mode_switch": 0.0, "loss_total": 0.900185763835907, "step": 2103 }, { "batch_size": 4, "epoch": 0.8412, "step": 2103, "tokens_per_device": 4368 }, { "epoch": 0.8412, "loss_ce": 0.2979404032230377, "loss_lvr": 1.091697335243225, "loss_mode_switch": 0.0, "loss_total": 0.40711015462875366, "step": 2103 }, { "batch_size": 1, "epoch": 0.8412, "step": 2103, "tokens_per_device": 5124 }, { "epoch": 0.8412, "loss_ce": 0.08868180215358734, "loss_lvr": 0.5432257056236267, "loss_mode_switch": 0.0, "loss_total": 0.14300437271595, "step": 2103 }, { "batch_size": 1, "epoch": 0.8412, "step": 2103, "tokens_per_device": 4878 }, { "epoch": 0.8412, "loss_ce": 0.000632133218459785, "loss_lvr": 0.3410995602607727, "loss_mode_switch": 0.0, "loss_total": 0.0347420908510685, "step": 2103 }, { "batch_size": 1, "epoch": 0.8412, "step": 2103, "tokens_per_device": 5469 }, { "epoch": 0.8412, "loss_ce": 0.03628183528780937, "loss_lvr": 0.28074416518211365, "loss_mode_switch": 0.0, "loss_total": 0.0643562525510788, "step": 2103 }, { "epoch": 0.8416, "grad_norm": 1.4827497005462646, "learning_rate": 6.436659835946168e-07, "loss": 0.3425, "step": 2104 }, { "batch_size": 4, "epoch": 0.8416, "step": 2104, "tokens_per_device": 2804 }, { "epoch": 0.8416, "loss_ce": 0.4802572429180145, "loss_lvr": 0.9166538119316101, "loss_mode_switch": 0.0, "loss_total": 0.5719226002693176, "step": 2104 }, { "batch_size": 4, "epoch": 0.8416, "step": 2104, "tokens_per_device": 3820 }, { "epoch": 0.8416, "loss_ce": 0.226437509059906, "loss_lvr": 0.9028213024139404, "loss_mode_switch": 0.0, "loss_total": 0.316719651222229, "step": 2104 }, { "batch_size": 4, "epoch": 0.8416, "step": 2104, "tokens_per_device": 6692 }, { "epoch": 0.8416, "loss_ce": 0.019553346559405327, "loss_lvr": 0.8399606347084045, "loss_mode_switch": 0.0, "loss_total": 0.10354941338300705, "step": 2104 }, { "batch_size": 4, "epoch": 0.8416, "step": 2104, "tokens_per_device": 1700 }, { "epoch": 0.8416, "loss_ce": 0.5534665584564209, "loss_lvr": 0.7966457009315491, "loss_mode_switch": 0.0, "loss_total": 0.6331311464309692, "step": 2104 }, { "batch_size": 4, "epoch": 0.8416, "step": 2104, "tokens_per_device": 4368 }, { "epoch": 0.8416, "loss_ce": 0.2837483584880829, "loss_lvr": 0.8397782444953918, "loss_mode_switch": 0.0, "loss_total": 0.3677261769771576, "step": 2104 }, { "batch_size": 4, "epoch": 0.8416, "step": 2104, "tokens_per_device": 2576 }, { "epoch": 0.8416, "loss_ce": 0.18187060952186584, "loss_lvr": 0.8591048121452332, "loss_mode_switch": 0.0, "loss_total": 0.2677810788154602, "step": 2104 }, { "batch_size": 4, "epoch": 0.8416, "step": 2104, "tokens_per_device": 2676 }, { "epoch": 0.8416, "loss_ce": 0.34906432032585144, "loss_lvr": 0.9437193870544434, "loss_mode_switch": 0.0, "loss_total": 0.44343626499176025, "step": 2104 }, { "batch_size": 4, "epoch": 0.8416, "step": 2104, "tokens_per_device": 4204 }, { "epoch": 0.8416, "loss_ce": 0.12852033972740173, "loss_lvr": 0.8488965630531311, "loss_mode_switch": 0.0, "loss_total": 0.21340999007225037, "step": 2104 }, { "epoch": 0.842, "grad_norm": 1.4140596389770508, "learning_rate": 6.404904151372649e-07, "loss": 0.307, "step": 2105 }, { "batch_size": 4, "epoch": 0.842, "step": 2105, "tokens_per_device": 1352 }, { "epoch": 0.842, "loss_ce": 0.4735223352909088, "loss_lvr": 0.8566681146621704, "loss_mode_switch": 0.0, "loss_total": 0.5591891407966614, "step": 2105 }, { "batch_size": 1, "epoch": 0.842, "step": 2105, "tokens_per_device": 5683 }, { "epoch": 0.842, "loss_ce": 0.037900153547525406, "loss_lvr": 0.19700779020786285, "loss_mode_switch": 0.0, "loss_total": 0.05760093033313751, "step": 2105 }, { "batch_size": 1, "epoch": 0.842, "step": 2105, "tokens_per_device": 4775 }, { "epoch": 0.842, "loss_ce": 0.0014871074818074703, "loss_lvr": 0.2428978681564331, "loss_mode_switch": 0.0, "loss_total": 0.025776894763112068, "step": 2105 }, { "batch_size": 1, "epoch": 0.842, "step": 2105, "tokens_per_device": 4863 }, { "epoch": 0.842, "loss_ce": 0.0016372674144804478, "loss_lvr": 0.3278158903121948, "loss_mode_switch": 0.0, "loss_total": 0.03441885858774185, "step": 2105 }, { "batch_size": 4, "epoch": 0.842, "step": 2105, "tokens_per_device": 5792 }, { "epoch": 0.842, "loss_ce": 0.016089942306280136, "loss_lvr": 0.49198347330093384, "loss_mode_switch": 0.0, "loss_total": 0.06528829038143158, "step": 2105 }, { "batch_size": 4, "epoch": 0.842, "step": 2105, "tokens_per_device": 5604 }, { "epoch": 0.842, "loss_ce": 0.04071003198623657, "loss_lvr": 0.8384608626365662, "loss_mode_switch": 0.0, "loss_total": 0.12455611675977707, "step": 2105 }, { "batch_size": 4, "epoch": 0.842, "step": 2105, "tokens_per_device": 4308 }, { "epoch": 0.842, "loss_ce": 0.30889904499053955, "loss_lvr": 0.7945928573608398, "loss_mode_switch": 0.0, "loss_total": 0.38835832476615906, "step": 2105 }, { "batch_size": 4, "epoch": 0.842, "step": 2105, "tokens_per_device": 3980 }, { "epoch": 0.842, "loss_ce": 0.13773605227470398, "loss_lvr": 0.614862859249115, "loss_mode_switch": 0.0, "loss_total": 0.19922234117984772, "step": 2105 }, { "epoch": 0.8424, "grad_norm": 1.1404528617858887, "learning_rate": 6.373221633561117e-07, "loss": 0.2383, "step": 2106 }, { "batch_size": 4, "epoch": 0.8424, "step": 2106, "tokens_per_device": 4776 }, { "epoch": 0.8424, "loss_ce": 0.31077808141708374, "loss_lvr": 0.8275357484817505, "loss_mode_switch": 0.0, "loss_total": 0.3935316503047943, "step": 2106 }, { "batch_size": 1, "epoch": 0.8424, "step": 2106, "tokens_per_device": 6076 }, { "epoch": 0.8424, "loss_ce": 0.13432201743125916, "loss_lvr": 0.47493648529052734, "loss_mode_switch": 0.0, "loss_total": 0.18181566894054413, "step": 2106 }, { "batch_size": 4, "epoch": 0.8424, "step": 2106, "tokens_per_device": 3804 }, { "epoch": 0.8424, "loss_ce": 0.3850812017917633, "loss_lvr": 0.8772326111793518, "loss_mode_switch": 0.0, "loss_total": 0.472804456949234, "step": 2106 }, { "batch_size": 4, "epoch": 0.8424, "step": 2106, "tokens_per_device": 2608 }, { "epoch": 0.8424, "loss_ce": 0.22571521997451782, "loss_lvr": 0.8443983197212219, "loss_mode_switch": 0.0, "loss_total": 0.31015506386756897, "step": 2106 }, { "batch_size": 4, "epoch": 0.8424, "step": 2106, "tokens_per_device": 5636 }, { "epoch": 0.8424, "loss_ce": 0.0766085535287857, "loss_lvr": 0.837230384349823, "loss_mode_switch": 0.0, "loss_total": 0.160331591963768, "step": 2106 }, { "batch_size": 1, "epoch": 0.8424, "step": 2106, "tokens_per_device": 5149 }, { "epoch": 0.8424, "loss_ce": 0.0718478113412857, "loss_lvr": 0.44009700417518616, "loss_mode_switch": 0.0, "loss_total": 0.11585751175880432, "step": 2106 }, { "batch_size": 4, "epoch": 0.8424, "step": 2106, "tokens_per_device": 7552 }, { "epoch": 0.8424, "loss_ce": 0.21412964165210724, "loss_lvr": 0.6329377293586731, "loss_mode_switch": 0.0, "loss_total": 0.2774234116077423, "step": 2106 }, { "batch_size": 4, "epoch": 0.8424, "step": 2106, "tokens_per_device": 3960 }, { "epoch": 0.8424, "loss_ce": 0.0033064913004636765, "loss_lvr": 0.8983324766159058, "loss_mode_switch": 0.0, "loss_total": 0.09313974529504776, "step": 2106 }, { "epoch": 0.8428, "grad_norm": 1.3340787887573242, "learning_rate": 6.341612335685143e-07, "loss": 0.2447, "step": 2107 }, { "batch_size": 4, "epoch": 0.8428, "step": 2107, "tokens_per_device": 1448 }, { "epoch": 0.8428, "loss_ce": 0.3568369150161743, "loss_lvr": 1.6948267221450806, "loss_mode_switch": 0.0, "loss_total": 0.5263196229934692, "step": 2107 }, { "batch_size": 4, "epoch": 0.8428, "step": 2107, "tokens_per_device": 10080 }, { "epoch": 0.8428, "loss_ce": 0.1810847520828247, "loss_lvr": 0.9029277563095093, "loss_mode_switch": 0.0, "loss_total": 0.2713775336742401, "step": 2107 }, { "batch_size": 1, "epoch": 0.8428, "step": 2107, "tokens_per_device": 4247 }, { "epoch": 0.8428, "loss_ce": 0.0018459891434758902, "loss_lvr": 0.5363209247589111, "loss_mode_switch": 0.0, "loss_total": 0.05547808110713959, "step": 2107 }, { "batch_size": 4, "epoch": 0.8428, "step": 2107, "tokens_per_device": 2660 }, { "epoch": 0.8428, "loss_ce": 0.010614965111017227, "loss_lvr": 0.7824926376342773, "loss_mode_switch": 0.0, "loss_total": 0.08886423707008362, "step": 2107 }, { "batch_size": 4, "epoch": 0.8428, "step": 2107, "tokens_per_device": 1328 }, { "epoch": 0.8428, "loss_ce": 0.3500477373600006, "loss_lvr": 1.1969009637832642, "loss_mode_switch": 0.0, "loss_total": 0.46973782777786255, "step": 2107 }, { "batch_size": 4, "epoch": 0.8428, "step": 2107, "tokens_per_device": 13780 }, { "epoch": 0.8428, "loss_ce": 0.051740843802690506, "loss_lvr": 0.5601773262023926, "loss_mode_switch": 0.0, "loss_total": 0.10775858163833618, "step": 2107 }, { "batch_size": 4, "epoch": 0.8428, "step": 2107, "tokens_per_device": 5548 }, { "epoch": 0.8428, "loss_ce": 0.35659918189048767, "loss_lvr": 0.9413864612579346, "loss_mode_switch": 0.0, "loss_total": 0.4507378339767456, "step": 2107 }, { "batch_size": 4, "epoch": 0.8428, "step": 2107, "tokens_per_device": 4300 }, { "epoch": 0.8428, "loss_ce": 0.3219108581542969, "loss_lvr": 0.7433990836143494, "loss_mode_switch": 0.0, "loss_total": 0.39625078439712524, "step": 2107 }, { "epoch": 0.8432, "grad_norm": 1.1444249153137207, "learning_rate": 6.31007631079541e-07, "loss": 0.2638, "step": 2108 }, { "batch_size": 4, "epoch": 0.8432, "step": 2108, "tokens_per_device": 3940 }, { "epoch": 0.8432, "loss_ce": 0.0048194024711847305, "loss_lvr": 0.7555670738220215, "loss_mode_switch": 0.0, "loss_total": 0.08037611097097397, "step": 2108 }, { "batch_size": 4, "epoch": 0.8432, "step": 2108, "tokens_per_device": 4628 }, { "epoch": 0.8432, "loss_ce": 0.3247680068016052, "loss_lvr": 0.7912087440490723, "loss_mode_switch": 0.0, "loss_total": 0.40388888120651245, "step": 2108 }, { "batch_size": 4, "epoch": 0.8432, "step": 2108, "tokens_per_device": 4308 }, { "epoch": 0.8432, "loss_ce": 0.03981569781899452, "loss_lvr": 0.5313666462898254, "loss_mode_switch": 0.0, "loss_total": 0.09295236319303513, "step": 2108 }, { "batch_size": 4, "epoch": 0.8432, "step": 2108, "tokens_per_device": 4048 }, { "epoch": 0.8432, "loss_ce": 0.08407177031040192, "loss_lvr": 0.7078434824943542, "loss_mode_switch": 0.0, "loss_total": 0.1548561155796051, "step": 2108 }, { "batch_size": 4, "epoch": 0.8432, "step": 2108, "tokens_per_device": 5888 }, { "epoch": 0.8432, "loss_ce": 0.07707194983959198, "loss_lvr": 0.7885595560073853, "loss_mode_switch": 0.0, "loss_total": 0.1559278964996338, "step": 2108 }, { "batch_size": 4, "epoch": 0.8432, "step": 2108, "tokens_per_device": 15332 }, { "epoch": 0.8432, "loss_ce": 0.1984032243490219, "loss_lvr": 0.8913168907165527, "loss_mode_switch": 0.0, "loss_total": 0.2875349223613739, "step": 2108 }, { "batch_size": 4, "epoch": 0.8432, "step": 2108, "tokens_per_device": 4988 }, { "epoch": 0.8432, "loss_ce": 0.12252820283174515, "loss_lvr": 0.8254625201225281, "loss_mode_switch": 0.0, "loss_total": 0.2050744593143463, "step": 2108 }, { "batch_size": 1, "epoch": 0.8432, "step": 2108, "tokens_per_device": 5040 }, { "epoch": 0.8432, "loss_ce": 0.012346426956355572, "loss_lvr": 0.24606958031654358, "loss_mode_switch": 0.0, "loss_total": 0.036953385919332504, "step": 2108 }, { "epoch": 0.8436, "grad_norm": 1.2014871835708618, "learning_rate": 6.278613611819645e-07, "loss": 0.2623, "step": 2109 }, { "batch_size": 4, "epoch": 0.8436, "step": 2109, "tokens_per_device": 16304 }, { "epoch": 0.8436, "loss_ce": 0.02581820636987686, "loss_lvr": 0.8235107660293579, "loss_mode_switch": 0.0, "loss_total": 0.10816928744316101, "step": 2109 }, { "batch_size": 4, "epoch": 0.8436, "step": 2109, "tokens_per_device": 2612 }, { "epoch": 0.8436, "loss_ce": 0.42097213864326477, "loss_lvr": 0.8114209175109863, "loss_mode_switch": 0.0, "loss_total": 0.5021142363548279, "step": 2109 }, { "batch_size": 4, "epoch": 0.8436, "step": 2109, "tokens_per_device": 2944 }, { "epoch": 0.8436, "loss_ce": 0.19583864510059357, "loss_lvr": 0.8411877155303955, "loss_mode_switch": 0.0, "loss_total": 0.2799574136734009, "step": 2109 }, { "batch_size": 4, "epoch": 0.8436, "step": 2109, "tokens_per_device": 4204 }, { "epoch": 0.8436, "loss_ce": 0.2343069612979889, "loss_lvr": 0.8073332905769348, "loss_mode_switch": 0.0, "loss_total": 0.3150402903556824, "step": 2109 }, { "batch_size": 4, "epoch": 0.8436, "step": 2109, "tokens_per_device": 7672 }, { "epoch": 0.8436, "loss_ce": 0.024192441254854202, "loss_lvr": 0.7648613452911377, "loss_mode_switch": 0.0, "loss_total": 0.10067857801914215, "step": 2109 }, { "batch_size": 1, "epoch": 0.8436, "step": 2109, "tokens_per_device": 4870 }, { "epoch": 0.8436, "loss_ce": 0.2524908781051636, "loss_lvr": 1.982013463973999, "loss_mode_switch": 0.0, "loss_total": 0.45069223642349243, "step": 2109 }, { "batch_size": 4, "epoch": 0.8436, "step": 2109, "tokens_per_device": 3772 }, { "epoch": 0.8436, "loss_ce": 0.21669434010982513, "loss_lvr": 1.0261592864990234, "loss_mode_switch": 0.0, "loss_total": 0.3193102777004242, "step": 2109 }, { "batch_size": 1, "epoch": 0.8436, "step": 2109, "tokens_per_device": 4685 }, { "epoch": 0.8436, "loss_ce": 0.05483267083764076, "loss_lvr": 0.48894593119621277, "loss_mode_switch": 0.0, "loss_total": 0.10372726619243622, "step": 2109 }, { "epoch": 0.844, "grad_norm": 1.3027833700180054, "learning_rate": 6.24722429156251e-07, "loss": 0.296, "step": 2110 }, { "batch_size": 4, "epoch": 0.844, "step": 2110, "tokens_per_device": 4376 }, { "epoch": 0.844, "loss_ce": 0.26212018728256226, "loss_lvr": 0.642433226108551, "loss_mode_switch": 0.0, "loss_total": 0.3263635039329529, "step": 2110 }, { "batch_size": 1, "epoch": 0.844, "step": 2110, "tokens_per_device": 4864 }, { "epoch": 0.844, "loss_ce": 0.006509773898869753, "loss_lvr": 0.4491581618785858, "loss_mode_switch": 0.0, "loss_total": 0.051425591111183167, "step": 2110 }, { "batch_size": 4, "epoch": 0.844, "step": 2110, "tokens_per_device": 5628 }, { "epoch": 0.844, "loss_ce": 0.02339753322303295, "loss_lvr": 1.0664327144622803, "loss_mode_switch": 0.0, "loss_total": 0.13004080951213837, "step": 2110 }, { "batch_size": 4, "epoch": 0.844, "step": 2110, "tokens_per_device": 4756 }, { "epoch": 0.844, "loss_ce": 0.3292924165725708, "loss_lvr": 0.7499997615814209, "loss_mode_switch": 0.0, "loss_total": 0.40429240465164185, "step": 2110 }, { "batch_size": 1, "epoch": 0.844, "step": 2110, "tokens_per_device": 5075 }, { "epoch": 0.844, "loss_ce": 0.013176146894693375, "loss_lvr": 0.3666450083255768, "loss_mode_switch": 0.0, "loss_total": 0.04984064772725105, "step": 2110 }, { "batch_size": 4, "epoch": 0.844, "step": 2110, "tokens_per_device": 2956 }, { "epoch": 0.844, "loss_ce": 0.4750971794128418, "loss_lvr": 0.6513955593109131, "loss_mode_switch": 0.0, "loss_total": 0.5402367115020752, "step": 2110 }, { "batch_size": 4, "epoch": 0.844, "step": 2110, "tokens_per_device": 5276 }, { "epoch": 0.844, "loss_ce": 0.20737838745117188, "loss_lvr": 1.1558815240859985, "loss_mode_switch": 0.0, "loss_total": 0.3229665458202362, "step": 2110 }, { "batch_size": 4, "epoch": 0.844, "step": 2110, "tokens_per_device": 4332 }, { "epoch": 0.844, "loss_ce": 0.3092033267021179, "loss_lvr": 0.7063824534416199, "loss_mode_switch": 0.0, "loss_total": 0.37984156608581543, "step": 2110 }, { "epoch": 0.8444, "grad_norm": 1.431220531463623, "learning_rate": 6.215908402705484e-07, "loss": 0.332, "step": 2111 }, { "batch_size": 1, "epoch": 0.8444, "step": 2111, "tokens_per_device": 4983 }, { "epoch": 0.8444, "loss_ce": 0.20968495309352875, "loss_lvr": 0.3130033314228058, "loss_mode_switch": 0.0, "loss_total": 0.24098528921604156, "step": 2111 }, { "batch_size": 4, "epoch": 0.8444, "step": 2111, "tokens_per_device": 6012 }, { "epoch": 0.8444, "loss_ce": 0.07056766003370285, "loss_lvr": 0.7439415454864502, "loss_mode_switch": 0.0, "loss_total": 0.14496181905269623, "step": 2111 }, { "batch_size": 1, "epoch": 0.8444, "step": 2111, "tokens_per_device": 5182 }, { "epoch": 0.8444, "loss_ce": 0.000228511868044734, "loss_lvr": 0.32804566621780396, "loss_mode_switch": 0.0, "loss_total": 0.03303308039903641, "step": 2111 }, { "batch_size": 4, "epoch": 0.8444, "step": 2111, "tokens_per_device": 3984 }, { "epoch": 0.8444, "loss_ce": 0.1575358510017395, "loss_lvr": 1.097296953201294, "loss_mode_switch": 0.0, "loss_total": 0.26726555824279785, "step": 2111 }, { "batch_size": 1, "epoch": 0.8444, "step": 2111, "tokens_per_device": 4941 }, { "epoch": 0.8444, "loss_ce": 0.006516608875244856, "loss_lvr": 0.16967707872390747, "loss_mode_switch": 0.0, "loss_total": 0.02348431758582592, "step": 2111 }, { "batch_size": 4, "epoch": 0.8444, "step": 2111, "tokens_per_device": 4864 }, { "epoch": 0.8444, "loss_ce": 0.15472760796546936, "loss_lvr": 1.3103430271148682, "loss_mode_switch": 0.0, "loss_total": 0.28576189279556274, "step": 2111 }, { "batch_size": 1, "epoch": 0.8444, "step": 2111, "tokens_per_device": 5111 }, { "epoch": 0.8444, "loss_ce": 0.04012118652462959, "loss_lvr": 0.43982452154159546, "loss_mode_switch": 0.0, "loss_total": 0.08410364389419556, "step": 2111 }, { "batch_size": 1, "epoch": 0.8444, "step": 2111, "tokens_per_device": 4854 }, { "epoch": 0.8444, "loss_ce": 0.000869160401634872, "loss_lvr": 0.39964064955711365, "loss_mode_switch": 0.0, "loss_total": 0.04083322361111641, "step": 2111 }, { "epoch": 0.8448, "grad_norm": 1.210668683052063, "learning_rate": 6.184665997806832e-07, "loss": 0.28, "step": 2112 }, { "batch_size": 4, "epoch": 0.8448, "step": 2112, "tokens_per_device": 1700 }, { "epoch": 0.8448, "loss_ce": 0.3628472089767456, "loss_lvr": 1.2754130363464355, "loss_mode_switch": 0.0, "loss_total": 0.49038851261138916, "step": 2112 }, { "batch_size": 1, "epoch": 0.8448, "step": 2112, "tokens_per_device": 5912 }, { "epoch": 0.8448, "loss_ce": 0.0005651713581755757, "loss_lvr": 0.2998422682285309, "loss_mode_switch": 0.0, "loss_total": 0.030549397692084312, "step": 2112 }, { "batch_size": 4, "epoch": 0.8448, "step": 2112, "tokens_per_device": 10052 }, { "epoch": 0.8448, "loss_ce": 0.408003032207489, "loss_lvr": 0.7473258376121521, "loss_mode_switch": 0.0, "loss_total": 0.48273563385009766, "step": 2112 }, { "batch_size": 4, "epoch": 0.8448, "step": 2112, "tokens_per_device": 3832 }, { "epoch": 0.8448, "loss_ce": 0.051960911601781845, "loss_lvr": 0.7345218062400818, "loss_mode_switch": 0.0, "loss_total": 0.12541308999061584, "step": 2112 }, { "batch_size": 1, "epoch": 0.8448, "step": 2112, "tokens_per_device": 4614 }, { "epoch": 0.8448, "loss_ce": 0.3856891691684723, "loss_lvr": 0.7938021421432495, "loss_mode_switch": 0.0, "loss_total": 0.46506938338279724, "step": 2112 }, { "batch_size": 4, "epoch": 0.8448, "step": 2112, "tokens_per_device": 1748 }, { "epoch": 0.8448, "loss_ce": 0.04105731472373009, "loss_lvr": 1.5830146074295044, "loss_mode_switch": 0.0, "loss_total": 0.1993587762117386, "step": 2112 }, { "batch_size": 4, "epoch": 0.8448, "step": 2112, "tokens_per_device": 4264 }, { "epoch": 0.8448, "loss_ce": 0.07182210683822632, "loss_lvr": 0.9072297811508179, "loss_mode_switch": 0.0, "loss_total": 0.1625450849533081, "step": 2112 }, { "batch_size": 4, "epoch": 0.8448, "step": 2112, "tokens_per_device": 8112 }, { "epoch": 0.8448, "loss_ce": 0.7145332098007202, "loss_lvr": 0.850725531578064, "loss_mode_switch": 0.0, "loss_total": 0.7996057868003845, "step": 2112 }, { "epoch": 0.8452, "grad_norm": 1.3960907459259033, "learning_rate": 6.153497129301461e-07, "loss": 0.274, "step": 2113 }, { "batch_size": 4, "epoch": 0.8452, "step": 2113, "tokens_per_device": 1676 }, { "epoch": 0.8452, "loss_ce": 0.2919558584690094, "loss_lvr": 0.8084366321563721, "loss_mode_switch": 0.0, "loss_total": 0.37279951572418213, "step": 2113 }, { "batch_size": 4, "epoch": 0.8452, "step": 2113, "tokens_per_device": 10648 }, { "epoch": 0.8452, "loss_ce": 0.2899697721004486, "loss_lvr": 1.0103378295898438, "loss_mode_switch": 0.0, "loss_total": 0.3910035490989685, "step": 2113 }, { "batch_size": 4, "epoch": 0.8452, "step": 2113, "tokens_per_device": 2648 }, { "epoch": 0.8452, "loss_ce": 0.28211647272109985, "loss_lvr": 1.6623775959014893, "loss_mode_switch": 0.0, "loss_total": 0.44835424423217773, "step": 2113 }, { "batch_size": 4, "epoch": 0.8452, "step": 2113, "tokens_per_device": 4448 }, { "epoch": 0.8452, "loss_ce": 0.06680970638990402, "loss_lvr": 0.7944416999816895, "loss_mode_switch": 0.0, "loss_total": 0.14625388383865356, "step": 2113 }, { "batch_size": 1, "epoch": 0.8452, "step": 2113, "tokens_per_device": 4877 }, { "epoch": 0.8452, "loss_ce": 0.018107999116182327, "loss_lvr": 0.4353792667388916, "loss_mode_switch": 0.0, "loss_total": 0.06164592504501343, "step": 2113 }, { "batch_size": 4, "epoch": 0.8452, "step": 2113, "tokens_per_device": 6340 }, { "epoch": 0.8452, "loss_ce": 0.10296571254730225, "loss_lvr": 0.7006139159202576, "loss_mode_switch": 0.0, "loss_total": 0.17302709817886353, "step": 2113 }, { "batch_size": 4, "epoch": 0.8452, "step": 2113, "tokens_per_device": 4284 }, { "epoch": 0.8452, "loss_ce": 0.07587385177612305, "loss_lvr": 0.7157388925552368, "loss_mode_switch": 0.0, "loss_total": 0.14744773507118225, "step": 2113 }, { "batch_size": 4, "epoch": 0.8452, "step": 2113, "tokens_per_device": 12668 }, { "epoch": 0.8452, "loss_ce": 0.03925633057951927, "loss_lvr": 0.8041967749595642, "loss_mode_switch": 0.0, "loss_total": 0.11967600882053375, "step": 2113 }, { "epoch": 0.8456, "grad_norm": 1.187984824180603, "learning_rate": 6.122401849500892e-07, "loss": 0.246, "step": 2114 }, { "batch_size": 4, "epoch": 0.8456, "step": 2114, "tokens_per_device": 5272 }, { "epoch": 0.8456, "loss_ce": 0.3975371718406677, "loss_lvr": 0.722649097442627, "loss_mode_switch": 0.0, "loss_total": 0.4698020815849304, "step": 2114 }, { "batch_size": 1, "epoch": 0.8456, "step": 2114, "tokens_per_device": 5071 }, { "epoch": 0.8456, "loss_ce": 0.00874961819499731, "loss_lvr": 0.9339232444763184, "loss_mode_switch": 0.0, "loss_total": 0.10214194655418396, "step": 2114 }, { "batch_size": 4, "epoch": 0.8456, "step": 2114, "tokens_per_device": 5256 }, { "epoch": 0.8456, "loss_ce": 0.46609023213386536, "loss_lvr": 0.8886784315109253, "loss_mode_switch": 0.0, "loss_total": 0.5549581050872803, "step": 2114 }, { "batch_size": 1, "epoch": 0.8456, "step": 2114, "tokens_per_device": 6533 }, { "epoch": 0.8456, "loss_ce": 0.010341004468500614, "loss_lvr": 0.4951024353504181, "loss_mode_switch": 0.0, "loss_total": 0.05985124781727791, "step": 2114 }, { "batch_size": 4, "epoch": 0.8456, "step": 2114, "tokens_per_device": 5788 }, { "epoch": 0.8456, "loss_ce": 0.009607478976249695, "loss_lvr": 0.5922462344169617, "loss_mode_switch": 0.0, "loss_total": 0.06883209943771362, "step": 2114 }, { "batch_size": 4, "epoch": 0.8456, "step": 2114, "tokens_per_device": 4244 }, { "epoch": 0.8456, "loss_ce": 0.19095630943775177, "loss_lvr": 0.40360262989997864, "loss_mode_switch": 0.0, "loss_total": 0.23131656646728516, "step": 2114 }, { "batch_size": 1, "epoch": 0.8456, "step": 2114, "tokens_per_device": 5171 }, { "epoch": 0.8456, "loss_ce": 0.25448960065841675, "loss_lvr": 0.21860705316066742, "loss_mode_switch": 0.0, "loss_total": 0.27635031938552856, "step": 2114 }, { "batch_size": 4, "epoch": 0.8456, "step": 2114, "tokens_per_device": 4160 }, { "epoch": 0.8456, "loss_ce": 0.49166548252105713, "loss_lvr": 0.8080662488937378, "loss_mode_switch": 0.0, "loss_total": 0.572472095489502, "step": 2114 }, { "epoch": 0.846, "grad_norm": 1.3432679176330566, "learning_rate": 6.091380210593145e-07, "loss": 0.2983, "step": 2115 }, { "batch_size": 4, "epoch": 0.846, "step": 2115, "tokens_per_device": 4236 }, { "epoch": 0.846, "loss_ce": 0.2265714406967163, "loss_lvr": 1.0187568664550781, "loss_mode_switch": 0.0, "loss_total": 0.3284471333026886, "step": 2115 }, { "batch_size": 4, "epoch": 0.846, "step": 2115, "tokens_per_device": 4036 }, { "epoch": 0.846, "loss_ce": 0.35206377506256104, "loss_lvr": 0.8774684071540833, "loss_mode_switch": 0.0, "loss_total": 0.4398106336593628, "step": 2115 }, { "batch_size": 4, "epoch": 0.846, "step": 2115, "tokens_per_device": 7912 }, { "epoch": 0.846, "loss_ce": 0.01690712757408619, "loss_lvr": 0.7389416694641113, "loss_mode_switch": 0.0, "loss_total": 0.09080129116773605, "step": 2115 }, { "batch_size": 1, "epoch": 0.846, "step": 2115, "tokens_per_device": 4612 }, { "epoch": 0.846, "loss_ce": 0.006569476332515478, "loss_lvr": 0.4090310335159302, "loss_mode_switch": 0.0, "loss_total": 0.047472577542066574, "step": 2115 }, { "batch_size": 4, "epoch": 0.846, "step": 2115, "tokens_per_device": 1660 }, { "epoch": 0.846, "loss_ce": 0.05873443931341171, "loss_lvr": 0.8719849586486816, "loss_mode_switch": 0.0, "loss_total": 0.14593294262886047, "step": 2115 }, { "batch_size": 1, "epoch": 0.846, "step": 2115, "tokens_per_device": 5151 }, { "epoch": 0.846, "loss_ce": 0.11366260796785355, "loss_lvr": 0.2526903748512268, "loss_mode_switch": 0.0, "loss_total": 0.13893164694309235, "step": 2115 }, { "batch_size": 4, "epoch": 0.846, "step": 2115, "tokens_per_device": 4956 }, { "epoch": 0.846, "loss_ce": 0.03401634469628334, "loss_lvr": 0.7815146446228027, "loss_mode_switch": 0.0, "loss_total": 0.11216780543327332, "step": 2115 }, { "batch_size": 4, "epoch": 0.846, "step": 2115, "tokens_per_device": 1824 }, { "epoch": 0.846, "loss_ce": 0.3753266930580139, "loss_lvr": 0.8368210792541504, "loss_mode_switch": 0.0, "loss_total": 0.4590088129043579, "step": 2115 }, { "epoch": 0.8464, "grad_norm": 1.2870821952819824, "learning_rate": 6.060432264642601e-07, "loss": 0.2599, "step": 2116 }, { "batch_size": 4, "epoch": 0.8464, "step": 2116, "tokens_per_device": 4220 }, { "epoch": 0.8464, "loss_ce": 0.08286607265472412, "loss_lvr": 0.9634387493133545, "loss_mode_switch": 0.0, "loss_total": 0.17920994758605957, "step": 2116 }, { "batch_size": 4, "epoch": 0.8464, "step": 2116, "tokens_per_device": 4344 }, { "epoch": 0.8464, "loss_ce": 0.19587095081806183, "loss_lvr": 0.8344743847846985, "loss_mode_switch": 0.0, "loss_total": 0.2793183922767639, "step": 2116 }, { "batch_size": 4, "epoch": 0.8464, "step": 2116, "tokens_per_device": 3976 }, { "epoch": 0.8464, "loss_ce": 0.0030046396423131227, "loss_lvr": 0.2803126573562622, "loss_mode_switch": 0.0, "loss_total": 0.031035905703902245, "step": 2116 }, { "batch_size": 4, "epoch": 0.8464, "step": 2116, "tokens_per_device": 4248 }, { "epoch": 0.8464, "loss_ce": 0.11691959947347641, "loss_lvr": 0.8976881504058838, "loss_mode_switch": 0.0, "loss_total": 0.20668841898441315, "step": 2116 }, { "batch_size": 4, "epoch": 0.8464, "step": 2116, "tokens_per_device": 5224 }, { "epoch": 0.8464, "loss_ce": 0.548069417476654, "loss_lvr": 0.8226062059402466, "loss_mode_switch": 0.0, "loss_total": 0.6303300261497498, "step": 2116 }, { "batch_size": 1, "epoch": 0.8464, "step": 2116, "tokens_per_device": 4891 }, { "epoch": 0.8464, "loss_ce": 0.006569918245077133, "loss_lvr": 0.7203264236450195, "loss_mode_switch": 0.0, "loss_total": 0.07860256731510162, "step": 2116 }, { "batch_size": 4, "epoch": 0.8464, "step": 2116, "tokens_per_device": 5968 }, { "epoch": 0.8464, "loss_ce": 0.21676580607891083, "loss_lvr": 0.6810767650604248, "loss_mode_switch": 0.0, "loss_total": 0.28487348556518555, "step": 2116 }, { "batch_size": 4, "epoch": 0.8464, "step": 2116, "tokens_per_device": 2652 }, { "epoch": 0.8464, "loss_ce": 0.26045626401901245, "loss_lvr": 0.8354164361953735, "loss_mode_switch": 0.0, "loss_total": 0.34399789571762085, "step": 2116 }, { "epoch": 0.8468, "grad_norm": 1.310123324394226, "learning_rate": 6.029558063589996e-07, "loss": 0.2607, "step": 2117 }, { "batch_size": 4, "epoch": 0.8468, "step": 2117, "tokens_per_device": 4320 }, { "epoch": 0.8468, "loss_ce": 0.05912444740533829, "loss_lvr": 0.9109405279159546, "loss_mode_switch": 0.0, "loss_total": 0.15021850168704987, "step": 2117 }, { "batch_size": 4, "epoch": 0.8468, "step": 2117, "tokens_per_device": 4192 }, { "epoch": 0.8468, "loss_ce": 0.48801255226135254, "loss_lvr": 1.0710057020187378, "loss_mode_switch": 0.0, "loss_total": 0.5951130986213684, "step": 2117 }, { "batch_size": 4, "epoch": 0.8468, "step": 2117, "tokens_per_device": 4480 }, { "epoch": 0.8468, "loss_ce": 0.12520074844360352, "loss_lvr": 0.870150625705719, "loss_mode_switch": 0.0, "loss_total": 0.21221581101417542, "step": 2117 }, { "batch_size": 4, "epoch": 0.8468, "step": 2117, "tokens_per_device": 1772 }, { "epoch": 0.8468, "loss_ce": 0.5396180152893066, "loss_lvr": 0.9498116374015808, "loss_mode_switch": 0.0, "loss_total": 0.6345992088317871, "step": 2117 }, { "batch_size": 1, "epoch": 0.8468, "step": 2117, "tokens_per_device": 5494 }, { "epoch": 0.8468, "loss_ce": 0.1159915030002594, "loss_lvr": 0.27468565106391907, "loss_mode_switch": 0.0, "loss_total": 0.14346006512641907, "step": 2117 }, { "batch_size": 1, "epoch": 0.8468, "step": 2117, "tokens_per_device": 4873 }, { "epoch": 0.8468, "loss_ce": 0.014016123488545418, "loss_lvr": 0.41381219029426575, "loss_mode_switch": 0.0, "loss_total": 0.05539734661579132, "step": 2117 }, { "batch_size": 4, "epoch": 0.8468, "step": 2117, "tokens_per_device": 3060 }, { "epoch": 0.8468, "loss_ce": 0.6385617256164551, "loss_lvr": 0.6782277226448059, "loss_mode_switch": 0.0, "loss_total": 0.7063844799995422, "step": 2117 }, { "batch_size": 4, "epoch": 0.8468, "step": 2117, "tokens_per_device": 5804 }, { "epoch": 0.8468, "loss_ce": 0.0951865017414093, "loss_lvr": 0.9059208035469055, "loss_mode_switch": 0.0, "loss_total": 0.18577858805656433, "step": 2117 }, { "epoch": 0.8472, "grad_norm": 1.381767749786377, "learning_rate": 5.998757659252285e-07, "loss": 0.3248, "step": 2118 }, { "batch_size": 4, "epoch": 0.8472, "step": 2118, "tokens_per_device": 5540 }, { "epoch": 0.8472, "loss_ce": 0.14712902903556824, "loss_lvr": 0.7896237373352051, "loss_mode_switch": 0.0, "loss_total": 0.2260914146900177, "step": 2118 }, { "batch_size": 1, "epoch": 0.8472, "step": 2118, "tokens_per_device": 5037 }, { "epoch": 0.8472, "loss_ce": 0.2507535517215729, "loss_lvr": 0.868390679359436, "loss_mode_switch": 0.0, "loss_total": 0.33759263157844543, "step": 2118 }, { "batch_size": 4, "epoch": 0.8472, "step": 2118, "tokens_per_device": 7320 }, { "epoch": 0.8472, "loss_ce": 0.14718657732009888, "loss_lvr": 0.35358065366744995, "loss_mode_switch": 0.0, "loss_total": 0.18254464864730835, "step": 2118 }, { "batch_size": 4, "epoch": 0.8472, "step": 2118, "tokens_per_device": 4572 }, { "epoch": 0.8472, "loss_ce": 0.09683199226856232, "loss_lvr": 0.6733118295669556, "loss_mode_switch": 0.0, "loss_total": 0.16416317224502563, "step": 2118 }, { "batch_size": 4, "epoch": 0.8472, "step": 2118, "tokens_per_device": 5900 }, { "epoch": 0.8472, "loss_ce": 0.3523242771625519, "loss_lvr": 1.5647368431091309, "loss_mode_switch": 0.0, "loss_total": 0.5087979435920715, "step": 2118 }, { "batch_size": 4, "epoch": 0.8472, "step": 2118, "tokens_per_device": 4376 }, { "epoch": 0.8472, "loss_ce": 0.2699931859970093, "loss_lvr": 0.5672627687454224, "loss_mode_switch": 0.0, "loss_total": 0.3267194628715515, "step": 2118 }, { "batch_size": 4, "epoch": 0.8472, "step": 2118, "tokens_per_device": 3476 }, { "epoch": 0.8472, "loss_ce": 0.2735122740268707, "loss_lvr": 0.38425570726394653, "loss_mode_switch": 0.0, "loss_total": 0.3119378387928009, "step": 2118 }, { "batch_size": 4, "epoch": 0.8472, "step": 2118, "tokens_per_device": 2652 }, { "epoch": 0.8472, "loss_ce": 0.12110165506601334, "loss_lvr": 0.6425089836120605, "loss_mode_switch": 0.0, "loss_total": 0.18535256385803223, "step": 2118 }, { "epoch": 0.8476, "grad_norm": 1.4960330724716187, "learning_rate": 5.968031103322592e-07, "loss": 0.3202, "step": 2119 }, { "batch_size": 4, "epoch": 0.8476, "step": 2119, "tokens_per_device": 4668 }, { "epoch": 0.8476, "loss_ce": 0.023769142106175423, "loss_lvr": 0.8010514378547668, "loss_mode_switch": 0.0, "loss_total": 0.10387428849935532, "step": 2119 }, { "batch_size": 4, "epoch": 0.8476, "step": 2119, "tokens_per_device": 15324 }, { "epoch": 0.8476, "loss_ce": 0.21049714088439941, "loss_lvr": 0.6657777428627014, "loss_mode_switch": 0.0, "loss_total": 0.277074933052063, "step": 2119 }, { "batch_size": 4, "epoch": 0.8476, "step": 2119, "tokens_per_device": 4464 }, { "epoch": 0.8476, "loss_ce": 0.01947404444217682, "loss_lvr": 0.5148311257362366, "loss_mode_switch": 0.0, "loss_total": 0.07095715403556824, "step": 2119 }, { "batch_size": 4, "epoch": 0.8476, "step": 2119, "tokens_per_device": 1776 }, { "epoch": 0.8476, "loss_ce": 0.47800424695014954, "loss_lvr": 0.9744399785995483, "loss_mode_switch": 0.0, "loss_total": 0.5754482746124268, "step": 2119 }, { "batch_size": 4, "epoch": 0.8476, "step": 2119, "tokens_per_device": 2696 }, { "epoch": 0.8476, "loss_ce": 0.7471754550933838, "loss_lvr": 0.5869897603988647, "loss_mode_switch": 0.0, "loss_total": 0.8058744072914124, "step": 2119 }, { "batch_size": 1, "epoch": 0.8476, "step": 2119, "tokens_per_device": 4834 }, { "epoch": 0.8476, "loss_ce": 0.023120736703276634, "loss_lvr": 0.2966731786727905, "loss_mode_switch": 0.0, "loss_total": 0.052788056433200836, "step": 2119 }, { "batch_size": 4, "epoch": 0.8476, "step": 2119, "tokens_per_device": 4736 }, { "epoch": 0.8476, "loss_ce": 0.047865819185972214, "loss_lvr": 0.7632229328155518, "loss_mode_switch": 0.0, "loss_total": 0.12418811023235321, "step": 2119 }, { "batch_size": 4, "epoch": 0.8476, "step": 2119, "tokens_per_device": 3756 }, { "epoch": 0.8476, "loss_ce": 0.2793957591056824, "loss_lvr": 0.5731480717658997, "loss_mode_switch": 0.0, "loss_total": 0.3367105722427368, "step": 2119 }, { "epoch": 0.848, "grad_norm": 1.217108964920044, "learning_rate": 5.937378447370068e-07, "loss": 0.2941, "step": 2120 }, { "batch_size": 4, "epoch": 0.848, "step": 2120, "tokens_per_device": 4936 }, { "epoch": 0.848, "loss_ce": 0.15179796516895294, "loss_lvr": 0.6419044733047485, "loss_mode_switch": 0.0, "loss_total": 0.2159884124994278, "step": 2120 }, { "batch_size": 4, "epoch": 0.848, "step": 2120, "tokens_per_device": 4680 }, { "epoch": 0.848, "loss_ce": 0.13195425271987915, "loss_lvr": 0.629441499710083, "loss_mode_switch": 0.0, "loss_total": 0.19489839673042297, "step": 2120 }, { "batch_size": 1, "epoch": 0.848, "step": 2120, "tokens_per_device": 4878 }, { "epoch": 0.848, "loss_ce": 0.002177139278501272, "loss_lvr": 0.2522485554218292, "loss_mode_switch": 0.0, "loss_total": 0.027401994913816452, "step": 2120 }, { "batch_size": 4, "epoch": 0.848, "step": 2120, "tokens_per_device": 5960 }, { "epoch": 0.848, "loss_ce": 0.014966854825615883, "loss_lvr": 0.4827077090740204, "loss_mode_switch": 0.0, "loss_total": 0.06323762983083725, "step": 2120 }, { "batch_size": 1, "epoch": 0.848, "step": 2120, "tokens_per_device": 5505 }, { "epoch": 0.848, "loss_ce": 0.15480414032936096, "loss_lvr": 0.3319404423236847, "loss_mode_switch": 0.0, "loss_total": 0.1879981905221939, "step": 2120 }, { "batch_size": 4, "epoch": 0.848, "step": 2120, "tokens_per_device": 1432 }, { "epoch": 0.848, "loss_ce": 0.17405445873737335, "loss_lvr": 0.9755265712738037, "loss_mode_switch": 0.0, "loss_total": 0.27160710096359253, "step": 2120 }, { "batch_size": 1, "epoch": 0.848, "step": 2120, "tokens_per_device": 4895 }, { "epoch": 0.848, "loss_ce": 0.16901171207427979, "loss_lvr": 0.6733060479164124, "loss_mode_switch": 0.0, "loss_total": 0.23634231090545654, "step": 2120 }, { "batch_size": 4, "epoch": 0.848, "step": 2120, "tokens_per_device": 2684 }, { "epoch": 0.848, "loss_ce": 0.10586874186992645, "loss_lvr": 0.9763913154602051, "loss_mode_switch": 0.0, "loss_total": 0.20350787043571472, "step": 2120 }, { "epoch": 0.8484, "grad_norm": 1.2774910926818848, "learning_rate": 5.906799742839842e-07, "loss": 0.2831, "step": 2121 }, { "batch_size": 4, "epoch": 0.8484, "step": 2121, "tokens_per_device": 3796 }, { "epoch": 0.8484, "loss_ce": 0.2726656198501587, "loss_lvr": 1.032429814338684, "loss_mode_switch": 0.0, "loss_total": 0.37590861320495605, "step": 2121 }, { "batch_size": 1, "epoch": 0.8484, "step": 2121, "tokens_per_device": 4777 }, { "epoch": 0.8484, "loss_ce": 0.0931866466999054, "loss_lvr": 0.28571900725364685, "loss_mode_switch": 0.0, "loss_total": 0.12175855040550232, "step": 2121 }, { "batch_size": 4, "epoch": 0.8484, "step": 2121, "tokens_per_device": 1340 }, { "epoch": 0.8484, "loss_ce": 0.08104943484067917, "loss_lvr": 0.9653058648109436, "loss_mode_switch": 0.0, "loss_total": 0.17758002877235413, "step": 2121 }, { "batch_size": 4, "epoch": 0.8484, "step": 2121, "tokens_per_device": 3892 }, { "epoch": 0.8484, "loss_ce": 0.0253174789249897, "loss_lvr": 0.6594824194908142, "loss_mode_switch": 0.0, "loss_total": 0.0912657231092453, "step": 2121 }, { "batch_size": 4, "epoch": 0.8484, "step": 2121, "tokens_per_device": 3924 }, { "epoch": 0.8484, "loss_ce": 0.4216781258583069, "loss_lvr": 0.8892276883125305, "loss_mode_switch": 0.0, "loss_total": 0.5106009244918823, "step": 2121 }, { "batch_size": 1, "epoch": 0.8484, "step": 2121, "tokens_per_device": 7366 }, { "epoch": 0.8484, "loss_ce": 0.0133518036454916, "loss_lvr": 0.5437029004096985, "loss_mode_switch": 0.0, "loss_total": 0.06772209703922272, "step": 2121 }, { "batch_size": 4, "epoch": 0.8484, "step": 2121, "tokens_per_device": 3740 }, { "epoch": 0.8484, "loss_ce": 0.5259932279586792, "loss_lvr": 0.7417268753051758, "loss_mode_switch": 0.0, "loss_total": 0.6001659035682678, "step": 2121 }, { "batch_size": 4, "epoch": 0.8484, "step": 2121, "tokens_per_device": 6336 }, { "epoch": 0.8484, "loss_ce": 0.4287102222442627, "loss_lvr": 0.7585881948471069, "loss_mode_switch": 0.0, "loss_total": 0.5045690536499023, "step": 2121 }, { "epoch": 0.8488, "grad_norm": 1.1666944026947021, "learning_rate": 5.876295041052932e-07, "loss": 0.2291, "step": 2122 }, { "batch_size": 4, "epoch": 0.8488, "step": 2122, "tokens_per_device": 1364 }, { "epoch": 0.8488, "loss_ce": 0.9324835538864136, "loss_lvr": 2.244696617126465, "loss_mode_switch": 0.0, "loss_total": 1.15695321559906, "step": 2122 }, { "batch_size": 4, "epoch": 0.8488, "step": 2122, "tokens_per_device": 4480 }, { "epoch": 0.8488, "loss_ce": 0.02282259799540043, "loss_lvr": 1.1253023147583008, "loss_mode_switch": 0.0, "loss_total": 0.13535283505916595, "step": 2122 }, { "batch_size": 1, "epoch": 0.8488, "step": 2122, "tokens_per_device": 4959 }, { "epoch": 0.8488, "loss_ce": 0.0005961380084045231, "loss_lvr": 0.2796821892261505, "loss_mode_switch": 0.0, "loss_total": 0.02856435626745224, "step": 2122 }, { "batch_size": 4, "epoch": 0.8488, "step": 2122, "tokens_per_device": 4688 }, { "epoch": 0.8488, "loss_ce": 0.4593406915664673, "loss_lvr": 0.6512966752052307, "loss_mode_switch": 0.0, "loss_total": 0.524470329284668, "step": 2122 }, { "batch_size": 1, "epoch": 0.8488, "step": 2122, "tokens_per_device": 5095 }, { "epoch": 0.8488, "loss_ce": 0.09238949418067932, "loss_lvr": 0.3265155553817749, "loss_mode_switch": 0.0, "loss_total": 0.12504105269908905, "step": 2122 }, { "batch_size": 4, "epoch": 0.8488, "step": 2122, "tokens_per_device": 10764 }, { "epoch": 0.8488, "loss_ce": 0.0424213632941246, "loss_lvr": 0.7809597849845886, "loss_mode_switch": 0.0, "loss_total": 0.12051734328269958, "step": 2122 }, { "batch_size": 4, "epoch": 0.8488, "step": 2122, "tokens_per_device": 6816 }, { "epoch": 0.8488, "loss_ce": 0.29166552424430847, "loss_lvr": 0.6642733812332153, "loss_mode_switch": 0.0, "loss_total": 0.35809287428855896, "step": 2122 }, { "batch_size": 4, "epoch": 0.8488, "step": 2122, "tokens_per_device": 4100 }, { "epoch": 0.8488, "loss_ce": 0.5716645121574402, "loss_lvr": 0.814498245716095, "loss_mode_switch": 0.0, "loss_total": 0.6531143188476562, "step": 2122 }, { "epoch": 0.8492, "grad_norm": 1.3013148307800293, "learning_rate": 5.845864393206158e-07, "loss": 0.2615, "step": 2123 }, { "batch_size": 4, "epoch": 0.8492, "step": 2123, "tokens_per_device": 12924 }, { "epoch": 0.8492, "loss_ce": 0.029202038422226906, "loss_lvr": 0.6853815317153931, "loss_mode_switch": 0.0, "loss_total": 0.09774018824100494, "step": 2123 }, { "batch_size": 1, "epoch": 0.8492, "step": 2123, "tokens_per_device": 5133 }, { "epoch": 0.8492, "loss_ce": 0.014770863577723503, "loss_lvr": 0.28167998790740967, "loss_mode_switch": 0.0, "loss_total": 0.04293886199593544, "step": 2123 }, { "batch_size": 4, "epoch": 0.8492, "step": 2123, "tokens_per_device": 3812 }, { "epoch": 0.8492, "loss_ce": 0.5912362337112427, "loss_lvr": 1.0819441080093384, "loss_mode_switch": 0.0, "loss_total": 0.6994306445121765, "step": 2123 }, { "batch_size": 1, "epoch": 0.8492, "step": 2123, "tokens_per_device": 4768 }, { "epoch": 0.8492, "loss_ce": 0.014375736005604267, "loss_lvr": 0.33272218704223633, "loss_mode_switch": 0.0, "loss_total": 0.047647953033447266, "step": 2123 }, { "batch_size": 4, "epoch": 0.8492, "step": 2123, "tokens_per_device": 5700 }, { "epoch": 0.8492, "loss_ce": 0.0023209459614008665, "loss_lvr": 0.41194748878479004, "loss_mode_switch": 0.0, "loss_total": 0.04351569339632988, "step": 2123 }, { "batch_size": 4, "epoch": 0.8492, "step": 2123, "tokens_per_device": 5864 }, { "epoch": 0.8492, "loss_ce": 0.028156207874417305, "loss_lvr": 0.7131639122962952, "loss_mode_switch": 0.0, "loss_total": 0.09947259724140167, "step": 2123 }, { "batch_size": 4, "epoch": 0.8492, "step": 2123, "tokens_per_device": 4780 }, { "epoch": 0.8492, "loss_ce": 0.26452872157096863, "loss_lvr": 0.6760834455490112, "loss_mode_switch": 0.0, "loss_total": 0.3321370780467987, "step": 2123 }, { "batch_size": 1, "epoch": 0.8492, "step": 2123, "tokens_per_device": 4914 }, { "epoch": 0.8492, "loss_ce": 0.1621147096157074, "loss_lvr": 0.5340608954429626, "loss_mode_switch": 0.0, "loss_total": 0.21552079916000366, "step": 2123 }, { "epoch": 0.8496, "grad_norm": 1.3505383729934692, "learning_rate": 5.815507850372077e-07, "loss": 0.2636, "step": 2124 }, { "batch_size": 1, "epoch": 0.8496, "step": 2124, "tokens_per_device": 5106 }, { "epoch": 0.8496, "loss_ce": 0.026177402585744858, "loss_lvr": 0.6964213848114014, "loss_mode_switch": 0.0, "loss_total": 0.09581954777240753, "step": 2124 }, { "batch_size": 4, "epoch": 0.8496, "step": 2124, "tokens_per_device": 3388 }, { "epoch": 0.8496, "loss_ce": 0.4205103814601898, "loss_lvr": 0.8293853402137756, "loss_mode_switch": 0.0, "loss_total": 0.5034489035606384, "step": 2124 }, { "batch_size": 1, "epoch": 0.8496, "step": 2124, "tokens_per_device": 5129 }, { "epoch": 0.8496, "loss_ce": 0.0013783455360680819, "loss_lvr": 0.3252817988395691, "loss_mode_switch": 0.0, "loss_total": 0.03390652686357498, "step": 2124 }, { "batch_size": 4, "epoch": 0.8496, "step": 2124, "tokens_per_device": 4092 }, { "epoch": 0.8496, "loss_ce": 0.31882551312446594, "loss_lvr": 0.7734561562538147, "loss_mode_switch": 0.0, "loss_total": 0.39617112278938293, "step": 2124 }, { "batch_size": 4, "epoch": 0.8496, "step": 2124, "tokens_per_device": 2700 }, { "epoch": 0.8496, "loss_ce": 0.052624281495809555, "loss_lvr": 0.5198431611061096, "loss_mode_switch": 0.0, "loss_total": 0.10460859537124634, "step": 2124 }, { "batch_size": 1, "epoch": 0.8496, "step": 2124, "tokens_per_device": 4402 }, { "epoch": 0.8496, "loss_ce": 0.030482446774840355, "loss_lvr": 0.4339822232723236, "loss_mode_switch": 0.0, "loss_total": 0.07388067245483398, "step": 2124 }, { "batch_size": 1, "epoch": 0.8496, "step": 2124, "tokens_per_device": 5095 }, { "epoch": 0.8496, "loss_ce": 0.002721159253269434, "loss_lvr": 0.37216031551361084, "loss_mode_switch": 0.0, "loss_total": 0.03993719443678856, "step": 2124 }, { "batch_size": 4, "epoch": 0.8496, "step": 2124, "tokens_per_device": 4180 }, { "epoch": 0.8496, "loss_ce": 0.37989550828933716, "loss_lvr": 0.8256464004516602, "loss_mode_switch": 0.0, "loss_total": 0.46246016025543213, "step": 2124 }, { "epoch": 0.85, "grad_norm": 1.2600504159927368, "learning_rate": 5.785225463498828e-07, "loss": 0.2709, "step": 2125 }, { "batch_size": 4, "epoch": 0.85, "step": 2125, "tokens_per_device": 7996 }, { "epoch": 0.85, "loss_ce": 0.12219855189323425, "loss_lvr": 0.7968117594718933, "loss_mode_switch": 0.0, "loss_total": 0.20187973976135254, "step": 2125 }, { "batch_size": 1, "epoch": 0.85, "step": 2125, "tokens_per_device": 5143 }, { "epoch": 0.85, "loss_ce": 0.008469044230878353, "loss_lvr": 0.3020670711994171, "loss_mode_switch": 0.0, "loss_total": 0.03867575153708458, "step": 2125 }, { "batch_size": 4, "epoch": 0.85, "step": 2125, "tokens_per_device": 3844 }, { "epoch": 0.85, "loss_ce": 0.219376802444458, "loss_lvr": 1.0832955837249756, "loss_mode_switch": 0.0, "loss_total": 0.32770636677742004, "step": 2125 }, { "batch_size": 4, "epoch": 0.85, "step": 2125, "tokens_per_device": 4620 }, { "epoch": 0.85, "loss_ce": 0.020654581487178802, "loss_lvr": 0.7138808965682983, "loss_mode_switch": 0.0, "loss_total": 0.09204266965389252, "step": 2125 }, { "batch_size": 1, "epoch": 0.85, "step": 2125, "tokens_per_device": 4906 }, { "epoch": 0.85, "loss_ce": 0.1356094777584076, "loss_lvr": 0.2711455821990967, "loss_mode_switch": 0.0, "loss_total": 0.16272403299808502, "step": 2125 }, { "batch_size": 1, "epoch": 0.85, "step": 2125, "tokens_per_device": 5198 }, { "epoch": 0.85, "loss_ce": 0.0039676763117313385, "loss_lvr": 0.3558424413204193, "loss_mode_switch": 0.0, "loss_total": 0.03955192118883133, "step": 2125 }, { "batch_size": 1, "epoch": 0.85, "step": 2125, "tokens_per_device": 5471 }, { "epoch": 0.85, "loss_ce": 0.13589483499526978, "loss_lvr": 0.22966814041137695, "loss_mode_switch": 0.0, "loss_total": 0.1588616520166397, "step": 2125 }, { "batch_size": 1, "epoch": 0.85, "step": 2125, "tokens_per_device": 4875 }, { "epoch": 0.85, "loss_ce": 0.050088562071323395, "loss_lvr": 0.7199159264564514, "loss_mode_switch": 0.0, "loss_total": 0.12208015471696854, "step": 2125 }, { "epoch": 0.8504, "grad_norm": 1.4248660802841187, "learning_rate": 5.755017283410102e-07, "loss": 0.2607, "step": 2126 }, { "batch_size": 4, "epoch": 0.8504, "step": 2126, "tokens_per_device": 1316 }, { "epoch": 0.8504, "loss_ce": 0.2967737019062042, "loss_lvr": 1.2144840955734253, "loss_mode_switch": 0.0, "loss_total": 0.4182221293449402, "step": 2126 }, { "batch_size": 1, "epoch": 0.8504, "step": 2126, "tokens_per_device": 4818 }, { "epoch": 0.8504, "loss_ce": 0.3430556058883667, "loss_lvr": 0.8640323281288147, "loss_mode_switch": 0.0, "loss_total": 0.4294588565826416, "step": 2126 }, { "batch_size": 1, "epoch": 0.8504, "step": 2126, "tokens_per_device": 5168 }, { "epoch": 0.8504, "loss_ce": 0.2329898625612259, "loss_lvr": 0.36461350321769714, "loss_mode_switch": 0.0, "loss_total": 0.26945120096206665, "step": 2126 }, { "batch_size": 4, "epoch": 0.8504, "step": 2126, "tokens_per_device": 4456 }, { "epoch": 0.8504, "loss_ce": 0.785784900188446, "loss_lvr": 0.7862429022789001, "loss_mode_switch": 0.0, "loss_total": 0.8644092082977295, "step": 2126 }, { "batch_size": 4, "epoch": 0.8504, "step": 2126, "tokens_per_device": 3600 }, { "epoch": 0.8504, "loss_ce": 0.6573420166969299, "loss_lvr": 0.8883086442947388, "loss_mode_switch": 0.0, "loss_total": 0.7461729049682617, "step": 2126 }, { "batch_size": 4, "epoch": 0.8504, "step": 2126, "tokens_per_device": 4256 }, { "epoch": 0.8504, "loss_ce": 0.20493099093437195, "loss_lvr": 1.167286992073059, "loss_mode_switch": 0.0, "loss_total": 0.3216596841812134, "step": 2126 }, { "batch_size": 4, "epoch": 0.8504, "step": 2126, "tokens_per_device": 2692 }, { "epoch": 0.8504, "loss_ce": 0.2901504933834076, "loss_lvr": 0.7478926181793213, "loss_mode_switch": 0.0, "loss_total": 0.36493974924087524, "step": 2126 }, { "batch_size": 4, "epoch": 0.8504, "step": 2126, "tokens_per_device": 3828 }, { "epoch": 0.8504, "loss_ce": 0.08831173926591873, "loss_lvr": 0.8697560429573059, "loss_mode_switch": 0.0, "loss_total": 0.17528733611106873, "step": 2126 }, { "epoch": 0.8508, "grad_norm": 1.2842838764190674, "learning_rate": 5.724883360805095e-07, "loss": 0.3006, "step": 2127 }, { "batch_size": 4, "epoch": 0.8508, "step": 2127, "tokens_per_device": 4928 }, { "epoch": 0.8508, "loss_ce": 0.060334548354148865, "loss_lvr": 0.6826673150062561, "loss_mode_switch": 0.0, "loss_total": 0.1286012828350067, "step": 2127 }, { "batch_size": 1, "epoch": 0.8508, "step": 2127, "tokens_per_device": 4914 }, { "epoch": 0.8508, "loss_ce": 0.1624937504529953, "loss_lvr": 0.6418318152427673, "loss_mode_switch": 0.0, "loss_total": 0.22667694091796875, "step": 2127 }, { "batch_size": 4, "epoch": 0.8508, "step": 2127, "tokens_per_device": 4308 }, { "epoch": 0.8508, "loss_ce": 0.40641146898269653, "loss_lvr": 0.6389943957328796, "loss_mode_switch": 0.0, "loss_total": 0.47031092643737793, "step": 2127 }, { "batch_size": 4, "epoch": 0.8508, "step": 2127, "tokens_per_device": 4400 }, { "epoch": 0.8508, "loss_ce": 0.11332208663225174, "loss_lvr": 0.7564462423324585, "loss_mode_switch": 0.0, "loss_total": 0.18896672129631042, "step": 2127 }, { "batch_size": 1, "epoch": 0.8508, "step": 2127, "tokens_per_device": 4745 }, { "epoch": 0.8508, "loss_ce": 0.005343631841242313, "loss_lvr": 0.5362450480461121, "loss_mode_switch": 0.0, "loss_total": 0.058968137949705124, "step": 2127 }, { "batch_size": 4, "epoch": 0.8508, "step": 2127, "tokens_per_device": 6568 }, { "epoch": 0.8508, "loss_ce": 0.04283977299928665, "loss_lvr": 0.7974972128868103, "loss_mode_switch": 0.0, "loss_total": 0.12258949875831604, "step": 2127 }, { "batch_size": 4, "epoch": 0.8508, "step": 2127, "tokens_per_device": 4460 }, { "epoch": 0.8508, "loss_ce": 0.15664762258529663, "loss_lvr": 1.0000733137130737, "loss_mode_switch": 0.0, "loss_total": 0.2566549479961395, "step": 2127 }, { "batch_size": 4, "epoch": 0.8508, "step": 2127, "tokens_per_device": 4684 }, { "epoch": 0.8508, "loss_ce": 0.0438246987760067, "loss_lvr": 0.7957408428192139, "loss_mode_switch": 0.0, "loss_total": 0.1233987808227539, "step": 2127 }, { "epoch": 0.8512, "grad_norm": 1.2275323867797852, "learning_rate": 5.694823746258299e-07, "loss": 0.2798, "step": 2128 }, { "batch_size": 1, "epoch": 0.8512, "step": 2128, "tokens_per_device": 4882 }, { "epoch": 0.8512, "loss_ce": 0.0015417345566675067, "loss_lvr": 0.8698841333389282, "loss_mode_switch": 0.0, "loss_total": 0.08853014558553696, "step": 2128 }, { "batch_size": 4, "epoch": 0.8512, "step": 2128, "tokens_per_device": 5324 }, { "epoch": 0.8512, "loss_ce": 0.47274476289749146, "loss_lvr": 0.7834345102310181, "loss_mode_switch": 0.0, "loss_total": 0.5510882139205933, "step": 2128 }, { "batch_size": 4, "epoch": 0.8512, "step": 2128, "tokens_per_device": 14128 }, { "epoch": 0.8512, "loss_ce": 0.2993900775909424, "loss_lvr": 0.9133113622665405, "loss_mode_switch": 0.0, "loss_total": 0.3907212018966675, "step": 2128 }, { "batch_size": 4, "epoch": 0.8512, "step": 2128, "tokens_per_device": 8156 }, { "epoch": 0.8512, "loss_ce": 0.0954124853014946, "loss_lvr": 0.5205613970756531, "loss_mode_switch": 0.0, "loss_total": 0.14746862649917603, "step": 2128 }, { "batch_size": 4, "epoch": 0.8512, "step": 2128, "tokens_per_device": 4240 }, { "epoch": 0.8512, "loss_ce": 0.08772366493940353, "loss_lvr": 0.9120530486106873, "loss_mode_switch": 0.0, "loss_total": 0.17892897129058838, "step": 2128 }, { "batch_size": 4, "epoch": 0.8512, "step": 2128, "tokens_per_device": 15384 }, { "epoch": 0.8512, "loss_ce": 0.06029133126139641, "loss_lvr": 0.23194515705108643, "loss_mode_switch": 0.0, "loss_total": 0.08348584920167923, "step": 2128 }, { "batch_size": 4, "epoch": 0.8512, "step": 2128, "tokens_per_device": 2052 }, { "epoch": 0.8512, "loss_ce": 0.4589599072933197, "loss_lvr": 1.0677489042282104, "loss_mode_switch": 0.0, "loss_total": 0.5657348036766052, "step": 2128 }, { "batch_size": 1, "epoch": 0.8512, "step": 2128, "tokens_per_device": 5065 }, { "epoch": 0.8512, "loss_ce": 0.006356238853186369, "loss_lvr": 0.47660455107688904, "loss_mode_switch": 0.0, "loss_total": 0.05401669442653656, "step": 2128 }, { "epoch": 0.8516, "grad_norm": 1.3294597864151, "learning_rate": 5.664838490219565e-07, "loss": 0.2557, "step": 2129 }, { "batch_size": 4, "epoch": 0.8516, "step": 2129, "tokens_per_device": 4892 }, { "epoch": 0.8516, "loss_ce": 0.5866715908050537, "loss_lvr": 0.9456426501274109, "loss_mode_switch": 0.0, "loss_total": 0.6812358498573303, "step": 2129 }, { "batch_size": 1, "epoch": 0.8516, "step": 2129, "tokens_per_device": 5168 }, { "epoch": 0.8516, "loss_ce": 0.0004387454828247428, "loss_lvr": 0.487959623336792, "loss_mode_switch": 0.0, "loss_total": 0.04923471063375473, "step": 2129 }, { "batch_size": 1, "epoch": 0.8516, "step": 2129, "tokens_per_device": 4889 }, { "epoch": 0.8516, "loss_ce": 0.013371312990784645, "loss_lvr": 0.4623977839946747, "loss_mode_switch": 0.0, "loss_total": 0.059611089527606964, "step": 2129 }, { "batch_size": 4, "epoch": 0.8516, "step": 2129, "tokens_per_device": 3776 }, { "epoch": 0.8516, "loss_ce": 0.4741445779800415, "loss_lvr": 0.8264943957328796, "loss_mode_switch": 0.0, "loss_total": 0.5567940473556519, "step": 2129 }, { "batch_size": 4, "epoch": 0.8516, "step": 2129, "tokens_per_device": 5716 }, { "epoch": 0.8516, "loss_ce": 0.09761273115873337, "loss_lvr": 0.8844155669212341, "loss_mode_switch": 0.0, "loss_total": 0.1860542893409729, "step": 2129 }, { "batch_size": 1, "epoch": 0.8516, "step": 2129, "tokens_per_device": 5180 }, { "epoch": 0.8516, "loss_ce": 0.11456482112407684, "loss_lvr": 0.6153647899627686, "loss_mode_switch": 0.0, "loss_total": 0.17610129714012146, "step": 2129 }, { "batch_size": 1, "epoch": 0.8516, "step": 2129, "tokens_per_device": 5098 }, { "epoch": 0.8516, "loss_ce": 0.036200471222400665, "loss_lvr": 0.43013179302215576, "loss_mode_switch": 0.0, "loss_total": 0.07921364903450012, "step": 2129 }, { "batch_size": 4, "epoch": 0.8516, "step": 2129, "tokens_per_device": 2676 }, { "epoch": 0.8516, "loss_ce": 0.46561121940612793, "loss_lvr": 0.9592435359954834, "loss_mode_switch": 0.0, "loss_total": 0.5615355968475342, "step": 2129 }, { "epoch": 0.852, "grad_norm": 1.4211595058441162, "learning_rate": 5.634927643013899e-07, "loss": 0.2975, "step": 2130 }, { "batch_size": 4, "epoch": 0.852, "step": 2130, "tokens_per_device": 1548 }, { "epoch": 0.852, "loss_ce": 0.37457284331321716, "loss_lvr": 0.8507891893386841, "loss_mode_switch": 0.0, "loss_total": 0.45965176820755005, "step": 2130 }, { "batch_size": 1, "epoch": 0.852, "step": 2130, "tokens_per_device": 5130 }, { "epoch": 0.852, "loss_ce": 0.0811924934387207, "loss_lvr": 0.2908810079097748, "loss_mode_switch": 0.0, "loss_total": 0.1102805957198143, "step": 2130 }, { "batch_size": 4, "epoch": 0.852, "step": 2130, "tokens_per_device": 10120 }, { "epoch": 0.852, "loss_ce": 0.028289007022976875, "loss_lvr": 0.4187445044517517, "loss_mode_switch": 0.0, "loss_total": 0.07016345858573914, "step": 2130 }, { "batch_size": 4, "epoch": 0.852, "step": 2130, "tokens_per_device": 4668 }, { "epoch": 0.852, "loss_ce": 0.22610071301460266, "loss_lvr": 0.7567259669303894, "loss_mode_switch": 0.0, "loss_total": 0.3017733097076416, "step": 2130 }, { "batch_size": 1, "epoch": 0.852, "step": 2130, "tokens_per_device": 6117 }, { "epoch": 0.852, "loss_ce": 0.07327865809202194, "loss_lvr": 0.500503659248352, "loss_mode_switch": 0.0, "loss_total": 0.1233290284872055, "step": 2130 }, { "batch_size": 1, "epoch": 0.852, "step": 2130, "tokens_per_device": 6674 }, { "epoch": 0.852, "loss_ce": 0.0016736548859626055, "loss_lvr": 0.32299360632896423, "loss_mode_switch": 0.0, "loss_total": 0.03397301584482193, "step": 2130 }, { "batch_size": 4, "epoch": 0.852, "step": 2130, "tokens_per_device": 5576 }, { "epoch": 0.852, "loss_ce": 0.055106986314058304, "loss_lvr": 0.5573813319206238, "loss_mode_switch": 0.0, "loss_total": 0.11084511876106262, "step": 2130 }, { "batch_size": 1, "epoch": 0.852, "step": 2130, "tokens_per_device": 4881 }, { "epoch": 0.852, "loss_ce": 0.04162994399666786, "loss_lvr": 0.18042531609535217, "loss_mode_switch": 0.0, "loss_total": 0.05967247486114502, "step": 2130 }, { "epoch": 0.8524, "grad_norm": 1.2993216514587402, "learning_rate": 5.605091254841427e-07, "loss": 0.3087, "step": 2131 }, { "batch_size": 4, "epoch": 0.8524, "step": 2131, "tokens_per_device": 15872 }, { "epoch": 0.8524, "loss_ce": 0.29221174120903015, "loss_lvr": 0.3315485119819641, "loss_mode_switch": 0.0, "loss_total": 0.3253665864467621, "step": 2131 }, { "batch_size": 4, "epoch": 0.8524, "step": 2131, "tokens_per_device": 2632 }, { "epoch": 0.8524, "loss_ce": 0.3215259909629822, "loss_lvr": 0.8700441122055054, "loss_mode_switch": 0.0, "loss_total": 0.40853041410446167, "step": 2131 }, { "batch_size": 4, "epoch": 0.8524, "step": 2131, "tokens_per_device": 3928 }, { "epoch": 0.8524, "loss_ce": 0.31343159079551697, "loss_lvr": 0.8372102975845337, "loss_mode_switch": 0.0, "loss_total": 0.3971526324748993, "step": 2131 }, { "batch_size": 4, "epoch": 0.8524, "step": 2131, "tokens_per_device": 4348 }, { "epoch": 0.8524, "loss_ce": 0.013410559855401516, "loss_lvr": 0.762712299823761, "loss_mode_switch": 0.0, "loss_total": 0.08968178927898407, "step": 2131 }, { "batch_size": 4, "epoch": 0.8524, "step": 2131, "tokens_per_device": 2904 }, { "epoch": 0.8524, "loss_ce": 0.15192154049873352, "loss_lvr": 0.7797552943229675, "loss_mode_switch": 0.0, "loss_total": 0.22989708185195923, "step": 2131 }, { "batch_size": 4, "epoch": 0.8524, "step": 2131, "tokens_per_device": 5000 }, { "epoch": 0.8524, "loss_ce": 0.8114880919456482, "loss_lvr": 0.6079437136650085, "loss_mode_switch": 0.0, "loss_total": 0.8722824454307556, "step": 2131 }, { "batch_size": 1, "epoch": 0.8524, "step": 2131, "tokens_per_device": 4945 }, { "epoch": 0.8524, "loss_ce": 0.1024121418595314, "loss_lvr": 0.3989905118942261, "loss_mode_switch": 0.0, "loss_total": 0.1423111855983734, "step": 2131 }, { "batch_size": 1, "epoch": 0.8524, "step": 2131, "tokens_per_device": 5019 }, { "epoch": 0.8524, "loss_ce": 0.021328160539269447, "loss_lvr": 0.3249901235103607, "loss_mode_switch": 0.0, "loss_total": 0.05382717400789261, "step": 2131 }, { "epoch": 0.8528, "grad_norm": 1.204134225845337, "learning_rate": 5.575329375777333e-07, "loss": 0.2271, "step": 2132 }, { "batch_size": 4, "epoch": 0.8528, "step": 2132, "tokens_per_device": 4072 }, { "epoch": 0.8528, "loss_ce": 0.052196864038705826, "loss_lvr": 0.8105189800262451, "loss_mode_switch": 0.0, "loss_total": 0.13324876129627228, "step": 2132 }, { "batch_size": 4, "epoch": 0.8528, "step": 2132, "tokens_per_device": 4192 }, { "epoch": 0.8528, "loss_ce": 0.16948096454143524, "loss_lvr": 0.860065758228302, "loss_mode_switch": 0.0, "loss_total": 0.2554875314235687, "step": 2132 }, { "batch_size": 1, "epoch": 0.8528, "step": 2132, "tokens_per_device": 6053 }, { "epoch": 0.8528, "loss_ce": 0.17064453661441803, "loss_lvr": 0.4651266634464264, "loss_mode_switch": 0.0, "loss_total": 0.21715719997882843, "step": 2132 }, { "batch_size": 4, "epoch": 0.8528, "step": 2132, "tokens_per_device": 7712 }, { "epoch": 0.8528, "loss_ce": 0.03785570338368416, "loss_lvr": 0.748435914516449, "loss_mode_switch": 0.0, "loss_total": 0.11269930005073547, "step": 2132 }, { "batch_size": 4, "epoch": 0.8528, "step": 2132, "tokens_per_device": 4300 }, { "epoch": 0.8528, "loss_ce": 0.11657595634460449, "loss_lvr": 0.7731422781944275, "loss_mode_switch": 0.0, "loss_total": 0.19389018416404724, "step": 2132 }, { "batch_size": 4, "epoch": 0.8528, "step": 2132, "tokens_per_device": 3976 }, { "epoch": 0.8528, "loss_ce": 0.284970223903656, "loss_lvr": 0.9574328064918518, "loss_mode_switch": 0.0, "loss_total": 0.3807135224342346, "step": 2132 }, { "batch_size": 4, "epoch": 0.8528, "step": 2132, "tokens_per_device": 16316 }, { "epoch": 0.8528, "loss_ce": 0.007323202211409807, "loss_lvr": 0.3319047689437866, "loss_mode_switch": 0.0, "loss_total": 0.04051367938518524, "step": 2132 }, { "batch_size": 4, "epoch": 0.8528, "step": 2132, "tokens_per_device": 3880 }, { "epoch": 0.8528, "loss_ce": 0.27571794390678406, "loss_lvr": 0.9335564970970154, "loss_mode_switch": 0.0, "loss_total": 0.3690735995769501, "step": 2132 }, { "epoch": 0.8532, "grad_norm": 1.356711983680725, "learning_rate": 5.545642055771749e-07, "loss": 0.2525, "step": 2133 }, { "batch_size": 4, "epoch": 0.8532, "step": 2133, "tokens_per_device": 4680 }, { "epoch": 0.8532, "loss_ce": 0.10323559492826462, "loss_lvr": 0.5765486359596252, "loss_mode_switch": 0.0, "loss_total": 0.16089046001434326, "step": 2133 }, { "batch_size": 4, "epoch": 0.8532, "step": 2133, "tokens_per_device": 10488 }, { "epoch": 0.8532, "loss_ce": 0.07230503857135773, "loss_lvr": 0.43471860885620117, "loss_mode_switch": 0.0, "loss_total": 0.1157768964767456, "step": 2133 }, { "batch_size": 4, "epoch": 0.8532, "step": 2133, "tokens_per_device": 4516 }, { "epoch": 0.8532, "loss_ce": 0.14575840532779694, "loss_lvr": 0.68373042345047, "loss_mode_switch": 0.0, "loss_total": 0.2141314446926117, "step": 2133 }, { "batch_size": 4, "epoch": 0.8532, "step": 2133, "tokens_per_device": 4292 }, { "epoch": 0.8532, "loss_ce": 0.4327889084815979, "loss_lvr": 0.8032153248786926, "loss_mode_switch": 0.0, "loss_total": 0.5131104588508606, "step": 2133 }, { "batch_size": 1, "epoch": 0.8532, "step": 2133, "tokens_per_device": 4870 }, { "epoch": 0.8532, "loss_ce": 0.017116963863372803, "loss_lvr": 0.49605533480644226, "loss_mode_switch": 0.0, "loss_total": 0.06672249734401703, "step": 2133 }, { "batch_size": 4, "epoch": 0.8532, "step": 2133, "tokens_per_device": 14452 }, { "epoch": 0.8532, "loss_ce": 0.12016666680574417, "loss_lvr": 0.48682674765586853, "loss_mode_switch": 0.0, "loss_total": 0.16884934902191162, "step": 2133 }, { "batch_size": 1, "epoch": 0.8532, "step": 2133, "tokens_per_device": 4883 }, { "epoch": 0.8532, "loss_ce": 0.00046312279300764203, "loss_lvr": 0.2635391354560852, "loss_mode_switch": 0.0, "loss_total": 0.026817036792635918, "step": 2133 }, { "batch_size": 4, "epoch": 0.8532, "step": 2133, "tokens_per_device": 4232 }, { "epoch": 0.8532, "loss_ce": 0.3984523117542267, "loss_lvr": 0.8517550230026245, "loss_mode_switch": 0.0, "loss_total": 0.4836278259754181, "step": 2133 }, { "epoch": 0.8536, "grad_norm": 1.222277045249939, "learning_rate": 5.516029344649649e-07, "loss": 0.2694, "step": 2134 }, { "batch_size": 4, "epoch": 0.8536, "step": 2134, "tokens_per_device": 4468 }, { "epoch": 0.8536, "loss_ce": 0.3013087809085846, "loss_lvr": 0.8648113012313843, "loss_mode_switch": 0.0, "loss_total": 0.38778990507125854, "step": 2134 }, { "batch_size": 4, "epoch": 0.8536, "step": 2134, "tokens_per_device": 4288 }, { "epoch": 0.8536, "loss_ce": 0.028246864676475525, "loss_lvr": 0.5533084273338318, "loss_mode_switch": 0.0, "loss_total": 0.0835777074098587, "step": 2134 }, { "batch_size": 4, "epoch": 0.8536, "step": 2134, "tokens_per_device": 4900 }, { "epoch": 0.8536, "loss_ce": 0.005123144946992397, "loss_lvr": 0.6946849822998047, "loss_mode_switch": 0.0, "loss_total": 0.07459164410829544, "step": 2134 }, { "batch_size": 1, "epoch": 0.8536, "step": 2134, "tokens_per_device": 5174 }, { "epoch": 0.8536, "loss_ce": 0.06393655389547348, "loss_lvr": 0.3774876296520233, "loss_mode_switch": 0.0, "loss_total": 0.10168531537055969, "step": 2134 }, { "batch_size": 4, "epoch": 0.8536, "step": 2134, "tokens_per_device": 3128 }, { "epoch": 0.8536, "loss_ce": 0.7051215171813965, "loss_lvr": 0.7644141912460327, "loss_mode_switch": 0.0, "loss_total": 0.7815629243850708, "step": 2134 }, { "batch_size": 4, "epoch": 0.8536, "step": 2134, "tokens_per_device": 2648 }, { "epoch": 0.8536, "loss_ce": 0.07580271363258362, "loss_lvr": 0.5892848968505859, "loss_mode_switch": 0.0, "loss_total": 0.1347312033176422, "step": 2134 }, { "batch_size": 4, "epoch": 0.8536, "step": 2134, "tokens_per_device": 4360 }, { "epoch": 0.8536, "loss_ce": 0.19031000137329102, "loss_lvr": 0.8141514658927917, "loss_mode_switch": 0.0, "loss_total": 0.2717251479625702, "step": 2134 }, { "batch_size": 4, "epoch": 0.8536, "step": 2134, "tokens_per_device": 2760 }, { "epoch": 0.8536, "loss_ce": 0.39418235421180725, "loss_lvr": 0.8313871026039124, "loss_mode_switch": 0.0, "loss_total": 0.477321058511734, "step": 2134 }, { "epoch": 0.854, "grad_norm": 1.8149118423461914, "learning_rate": 5.486491292110796e-07, "loss": 0.3185, "step": 2135 }, { "batch_size": 4, "epoch": 0.854, "step": 2135, "tokens_per_device": 4008 }, { "epoch": 0.854, "loss_ce": 0.05757804960012436, "loss_lvr": 0.5299896597862244, "loss_mode_switch": 0.0, "loss_total": 0.11057701706886292, "step": 2135 }, { "batch_size": 1, "epoch": 0.854, "step": 2135, "tokens_per_device": 4996 }, { "epoch": 0.854, "loss_ce": 0.9030011892318726, "loss_lvr": 0.324291855096817, "loss_mode_switch": 0.0, "loss_total": 0.9354303479194641, "step": 2135 }, { "batch_size": 1, "epoch": 0.854, "step": 2135, "tokens_per_device": 7005 }, { "epoch": 0.854, "loss_ce": 0.12817776203155518, "loss_lvr": 0.34033629298210144, "loss_mode_switch": 0.0, "loss_total": 0.16221138834953308, "step": 2135 }, { "batch_size": 4, "epoch": 0.854, "step": 2135, "tokens_per_device": 4288 }, { "epoch": 0.854, "loss_ce": 0.08678232878446579, "loss_lvr": 2.731684446334839, "loss_mode_switch": 0.0, "loss_total": 0.3599507808685303, "step": 2135 }, { "batch_size": 1, "epoch": 0.854, "step": 2135, "tokens_per_device": 5272 }, { "epoch": 0.854, "loss_ce": 0.21733517944812775, "loss_lvr": 0.30161720514297485, "loss_mode_switch": 0.0, "loss_total": 0.24749690294265747, "step": 2135 }, { "batch_size": 4, "epoch": 0.854, "step": 2135, "tokens_per_device": 4248 }, { "epoch": 0.854, "loss_ce": 0.12848633527755737, "loss_lvr": 0.8522831201553345, "loss_mode_switch": 0.0, "loss_total": 0.21371465921401978, "step": 2135 }, { "batch_size": 4, "epoch": 0.854, "step": 2135, "tokens_per_device": 4204 }, { "epoch": 0.854, "loss_ce": 0.02506454661488533, "loss_lvr": 0.31220942735671997, "loss_mode_switch": 0.0, "loss_total": 0.05628548935055733, "step": 2135 }, { "batch_size": 4, "epoch": 0.854, "step": 2135, "tokens_per_device": 2640 }, { "epoch": 0.854, "loss_ce": 0.3570706248283386, "loss_lvr": 0.8179764151573181, "loss_mode_switch": 0.0, "loss_total": 0.43886828422546387, "step": 2135 }, { "epoch": 0.8544, "grad_norm": 1.2924028635025024, "learning_rate": 5.457027947729676e-07, "loss": 0.2675, "step": 2136 }, { "batch_size": 4, "epoch": 0.8544, "step": 2136, "tokens_per_device": 3988 }, { "epoch": 0.8544, "loss_ce": 0.49025753140449524, "loss_lvr": 0.800467848777771, "loss_mode_switch": 0.0, "loss_total": 0.5703043341636658, "step": 2136 }, { "batch_size": 4, "epoch": 0.8544, "step": 2136, "tokens_per_device": 8288 }, { "epoch": 0.8544, "loss_ce": 0.08437815308570862, "loss_lvr": 0.4610598683357239, "loss_mode_switch": 0.0, "loss_total": 0.13048413395881653, "step": 2136 }, { "batch_size": 4, "epoch": 0.8544, "step": 2136, "tokens_per_device": 5688 }, { "epoch": 0.8544, "loss_ce": 0.029409123584628105, "loss_lvr": 0.8160496950149536, "loss_mode_switch": 0.0, "loss_total": 0.11101409792900085, "step": 2136 }, { "batch_size": 1, "epoch": 0.8544, "step": 2136, "tokens_per_device": 4823 }, { "epoch": 0.8544, "loss_ce": 0.003665132215246558, "loss_lvr": 0.29973191022872925, "loss_mode_switch": 0.0, "loss_total": 0.033638324588537216, "step": 2136 }, { "batch_size": 4, "epoch": 0.8544, "step": 2136, "tokens_per_device": 12208 }, { "epoch": 0.8544, "loss_ce": 0.03683970123529434, "loss_lvr": 0.803363025188446, "loss_mode_switch": 0.0, "loss_total": 0.11717600375413895, "step": 2136 }, { "batch_size": 4, "epoch": 0.8544, "step": 2136, "tokens_per_device": 1584 }, { "epoch": 0.8544, "loss_ce": 0.24278317391872406, "loss_lvr": 1.4884556531906128, "loss_mode_switch": 0.0, "loss_total": 0.3916287422180176, "step": 2136 }, { "batch_size": 1, "epoch": 0.8544, "step": 2136, "tokens_per_device": 5259 }, { "epoch": 0.8544, "loss_ce": 0.004865861497819424, "loss_lvr": 0.27381110191345215, "loss_mode_switch": 0.0, "loss_total": 0.03224697336554527, "step": 2136 }, { "batch_size": 4, "epoch": 0.8544, "step": 2136, "tokens_per_device": 5988 }, { "epoch": 0.8544, "loss_ce": 0.06278378516435623, "loss_lvr": 0.7324739098548889, "loss_mode_switch": 0.0, "loss_total": 0.13603118062019348, "step": 2136 }, { "epoch": 0.8548, "grad_norm": 1.3080577850341797, "learning_rate": 5.42763936095535e-07, "loss": 0.2696, "step": 2137 }, { "batch_size": 4, "epoch": 0.8548, "step": 2137, "tokens_per_device": 1264 }, { "epoch": 0.8548, "loss_ce": 0.29756590723991394, "loss_lvr": 1.3666913509368896, "loss_mode_switch": 0.0, "loss_total": 0.4342350363731384, "step": 2137 }, { "batch_size": 4, "epoch": 0.8548, "step": 2137, "tokens_per_device": 8440 }, { "epoch": 0.8548, "loss_ce": 0.09209791570901871, "loss_lvr": 0.8726223707199097, "loss_mode_switch": 0.0, "loss_total": 0.17936015129089355, "step": 2137 }, { "batch_size": 4, "epoch": 0.8548, "step": 2137, "tokens_per_device": 9216 }, { "epoch": 0.8548, "loss_ce": 0.3729575574398041, "loss_lvr": 0.8639026284217834, "loss_mode_switch": 0.0, "loss_total": 0.45934781432151794, "step": 2137 }, { "batch_size": 4, "epoch": 0.8548, "step": 2137, "tokens_per_device": 4132 }, { "epoch": 0.8548, "loss_ce": 0.6724767684936523, "loss_lvr": 0.9185981750488281, "loss_mode_switch": 0.0, "loss_total": 0.7643365859985352, "step": 2137 }, { "batch_size": 4, "epoch": 0.8548, "step": 2137, "tokens_per_device": 13336 }, { "epoch": 0.8548, "loss_ce": 0.31186774373054504, "loss_lvr": 0.605907142162323, "loss_mode_switch": 0.0, "loss_total": 0.37245845794677734, "step": 2137 }, { "batch_size": 1, "epoch": 0.8548, "step": 2137, "tokens_per_device": 5066 }, { "epoch": 0.8548, "loss_ce": 0.004993076901882887, "loss_lvr": 0.3105296790599823, "loss_mode_switch": 0.0, "loss_total": 0.036046043038368225, "step": 2137 }, { "batch_size": 4, "epoch": 0.8548, "step": 2137, "tokens_per_device": 4936 }, { "epoch": 0.8548, "loss_ce": 0.03328275308012962, "loss_lvr": 0.851518452167511, "loss_mode_switch": 0.0, "loss_total": 0.1184345930814743, "step": 2137 }, { "batch_size": 1, "epoch": 0.8548, "step": 2137, "tokens_per_device": 5160 }, { "epoch": 0.8548, "loss_ce": 0.017978640273213387, "loss_lvr": 0.34878528118133545, "loss_mode_switch": 0.0, "loss_total": 0.0528571680188179, "step": 2137 }, { "epoch": 0.8552, "grad_norm": 1.284445881843567, "learning_rate": 5.398325581111458e-07, "loss": 0.3103, "step": 2138 }, { "batch_size": 4, "epoch": 0.8552, "step": 2138, "tokens_per_device": 6220 }, { "epoch": 0.8552, "loss_ce": 0.03619284927845001, "loss_lvr": 0.713528037071228, "loss_mode_switch": 0.0, "loss_total": 0.1075456514954567, "step": 2138 }, { "batch_size": 4, "epoch": 0.8552, "step": 2138, "tokens_per_device": 3764 }, { "epoch": 0.8552, "loss_ce": 0.2287500947713852, "loss_lvr": 1.0008649826049805, "loss_mode_switch": 0.0, "loss_total": 0.328836590051651, "step": 2138 }, { "batch_size": 4, "epoch": 0.8552, "step": 2138, "tokens_per_device": 1352 }, { "epoch": 0.8552, "loss_ce": 0.23091213405132294, "loss_lvr": 0.8741394877433777, "loss_mode_switch": 0.0, "loss_total": 0.31832608580589294, "step": 2138 }, { "batch_size": 4, "epoch": 0.8552, "step": 2138, "tokens_per_device": 4496 }, { "epoch": 0.8552, "loss_ce": 0.6720976829528809, "loss_lvr": 0.7764374017715454, "loss_mode_switch": 0.0, "loss_total": 0.7497414350509644, "step": 2138 }, { "batch_size": 4, "epoch": 0.8552, "step": 2138, "tokens_per_device": 3928 }, { "epoch": 0.8552, "loss_ce": 0.4404480457305908, "loss_lvr": 0.8540725708007812, "loss_mode_switch": 0.0, "loss_total": 0.525855302810669, "step": 2138 }, { "batch_size": 4, "epoch": 0.8552, "step": 2138, "tokens_per_device": 9336 }, { "epoch": 0.8552, "loss_ce": 0.01137190219014883, "loss_lvr": 0.6315087676048279, "loss_mode_switch": 0.0, "loss_total": 0.07452277839183807, "step": 2138 }, { "batch_size": 4, "epoch": 0.8552, "step": 2138, "tokens_per_device": 3768 }, { "epoch": 0.8552, "loss_ce": 0.25681108236312866, "loss_lvr": 0.8166369795799255, "loss_mode_switch": 0.0, "loss_total": 0.3384747803211212, "step": 2138 }, { "batch_size": 1, "epoch": 0.8552, "step": 2138, "tokens_per_device": 4866 }, { "epoch": 0.8552, "loss_ce": 0.007584773004055023, "loss_lvr": 0.27339449524879456, "loss_mode_switch": 0.0, "loss_total": 0.0349242240190506, "step": 2138 }, { "epoch": 0.8556, "grad_norm": 1.2558094263076782, "learning_rate": 5.36908665739605e-07, "loss": 0.2719, "step": 2139 }, { "batch_size": 1, "epoch": 0.8556, "step": 2139, "tokens_per_device": 4881 }, { "epoch": 0.8556, "loss_ce": 0.01730448007583618, "loss_lvr": 0.7391427755355835, "loss_mode_switch": 0.0, "loss_total": 0.09121876209974289, "step": 2139 }, { "batch_size": 4, "epoch": 0.8556, "step": 2139, "tokens_per_device": 5964 }, { "epoch": 0.8556, "loss_ce": 0.005606366787105799, "loss_lvr": 0.5022943019866943, "loss_mode_switch": 0.0, "loss_total": 0.055835798382759094, "step": 2139 }, { "batch_size": 1, "epoch": 0.8556, "step": 2139, "tokens_per_device": 5117 }, { "epoch": 0.8556, "loss_ce": 0.05962076410651207, "loss_lvr": 0.5010915398597717, "loss_mode_switch": 0.0, "loss_total": 0.10972991585731506, "step": 2139 }, { "batch_size": 4, "epoch": 0.8556, "step": 2139, "tokens_per_device": 11604 }, { "epoch": 0.8556, "loss_ce": 0.14080093801021576, "loss_lvr": 0.8176437616348267, "loss_mode_switch": 0.0, "loss_total": 0.22256532311439514, "step": 2139 }, { "batch_size": 4, "epoch": 0.8556, "step": 2139, "tokens_per_device": 3784 }, { "epoch": 0.8556, "loss_ce": 0.06893125176429749, "loss_lvr": 0.8527267575263977, "loss_mode_switch": 0.0, "loss_total": 0.15420392155647278, "step": 2139 }, { "batch_size": 4, "epoch": 0.8556, "step": 2139, "tokens_per_device": 2600 }, { "epoch": 0.8556, "loss_ce": 0.3737991452217102, "loss_lvr": 0.8966400623321533, "loss_mode_switch": 0.0, "loss_total": 0.46346315741539, "step": 2139 }, { "batch_size": 1, "epoch": 0.8556, "step": 2139, "tokens_per_device": 5173 }, { "epoch": 0.8556, "loss_ce": 0.053596775978803635, "loss_lvr": 0.520415186882019, "loss_mode_switch": 0.0, "loss_total": 0.1056382954120636, "step": 2139 }, { "batch_size": 1, "epoch": 0.8556, "step": 2139, "tokens_per_device": 5119 }, { "epoch": 0.8556, "loss_ce": 0.005738419946283102, "loss_lvr": 0.395255982875824, "loss_mode_switch": 0.0, "loss_total": 0.04526401683688164, "step": 2139 }, { "epoch": 0.856, "grad_norm": 1.313927412033081, "learning_rate": 5.339922638881545e-07, "loss": 0.287, "step": 2140 }, { "batch_size": 4, "epoch": 0.856, "step": 2140, "tokens_per_device": 1476 }, { "epoch": 0.856, "loss_ce": 0.551846981048584, "loss_lvr": 0.8516159057617188, "loss_mode_switch": 0.0, "loss_total": 0.637008547782898, "step": 2140 }, { "batch_size": 1, "epoch": 0.856, "step": 2140, "tokens_per_device": 5167 }, { "epoch": 0.856, "loss_ce": 0.0031526703387498856, "loss_lvr": 0.335702508687973, "loss_mode_switch": 0.0, "loss_total": 0.03672292083501816, "step": 2140 }, { "batch_size": 4, "epoch": 0.856, "step": 2140, "tokens_per_device": 6480 }, { "epoch": 0.856, "loss_ce": 0.033745016902685165, "loss_lvr": 0.7737495303153992, "loss_mode_switch": 0.0, "loss_total": 0.11111997067928314, "step": 2140 }, { "batch_size": 1, "epoch": 0.856, "step": 2140, "tokens_per_device": 4892 }, { "epoch": 0.856, "loss_ce": 0.009307225234806538, "loss_lvr": 0.6444778442382812, "loss_mode_switch": 0.0, "loss_total": 0.07375501096248627, "step": 2140 }, { "batch_size": 4, "epoch": 0.856, "step": 2140, "tokens_per_device": 3988 }, { "epoch": 0.856, "loss_ce": 0.06407533586025238, "loss_lvr": 0.8851222395896912, "loss_mode_switch": 0.0, "loss_total": 0.15258756279945374, "step": 2140 }, { "batch_size": 4, "epoch": 0.856, "step": 2140, "tokens_per_device": 3828 }, { "epoch": 0.856, "loss_ce": 0.2458614557981491, "loss_lvr": 0.6874132752418518, "loss_mode_switch": 0.0, "loss_total": 0.314602792263031, "step": 2140 }, { "batch_size": 4, "epoch": 0.856, "step": 2140, "tokens_per_device": 3804 }, { "epoch": 0.856, "loss_ce": 0.28207647800445557, "loss_lvr": 0.7087375521659851, "loss_mode_switch": 0.0, "loss_total": 0.35295024514198303, "step": 2140 }, { "batch_size": 1, "epoch": 0.856, "step": 2140, "tokens_per_device": 4861 }, { "epoch": 0.856, "loss_ce": 0.00016154718468897045, "loss_lvr": 0.2945920526981354, "loss_mode_switch": 0.0, "loss_total": 0.029620753601193428, "step": 2140 }, { "epoch": 0.8564, "grad_norm": 1.3775722980499268, "learning_rate": 5.31083357451469e-07, "loss": 0.2967, "step": 2141 }, { "batch_size": 4, "epoch": 0.8564, "step": 2141, "tokens_per_device": 3872 }, { "epoch": 0.8564, "loss_ce": 0.008260966278612614, "loss_lvr": 0.8898821473121643, "loss_mode_switch": 0.0, "loss_total": 0.09724918007850647, "step": 2141 }, { "batch_size": 4, "epoch": 0.8564, "step": 2141, "tokens_per_device": 8352 }, { "epoch": 0.8564, "loss_ce": 0.304583877325058, "loss_lvr": 0.6792250871658325, "loss_mode_switch": 0.0, "loss_total": 0.37250638008117676, "step": 2141 }, { "batch_size": 4, "epoch": 0.8564, "step": 2141, "tokens_per_device": 5088 }, { "epoch": 0.8564, "loss_ce": 0.4799340069293976, "loss_lvr": 0.634227454662323, "loss_mode_switch": 0.0, "loss_total": 0.5433567762374878, "step": 2141 }, { "batch_size": 4, "epoch": 0.8564, "step": 2141, "tokens_per_device": 3032 }, { "epoch": 0.8564, "loss_ce": 0.014411349780857563, "loss_lvr": 0.7943015694618225, "loss_mode_switch": 0.0, "loss_total": 0.09384150803089142, "step": 2141 }, { "batch_size": 1, "epoch": 0.8564, "step": 2141, "tokens_per_device": 4750 }, { "epoch": 0.8564, "loss_ce": 0.008527733385562897, "loss_lvr": 0.29443392157554626, "loss_mode_switch": 0.0, "loss_total": 0.037971124053001404, "step": 2141 }, { "batch_size": 4, "epoch": 0.8564, "step": 2141, "tokens_per_device": 1652 }, { "epoch": 0.8564, "loss_ce": 0.1980033814907074, "loss_lvr": 1.6106421947479248, "loss_mode_switch": 0.0, "loss_total": 0.3590676188468933, "step": 2141 }, { "batch_size": 4, "epoch": 0.8564, "step": 2141, "tokens_per_device": 3756 }, { "epoch": 0.8564, "loss_ce": 0.280015766620636, "loss_lvr": 0.7369122505187988, "loss_mode_switch": 0.0, "loss_total": 0.3537069857120514, "step": 2141 }, { "batch_size": 4, "epoch": 0.8564, "step": 2141, "tokens_per_device": 4128 }, { "epoch": 0.8564, "loss_ce": 0.1875850260257721, "loss_lvr": 0.9338009357452393, "loss_mode_switch": 0.0, "loss_total": 0.280965119600296, "step": 2141 }, { "epoch": 0.8568, "grad_norm": 1.321553349494934, "learning_rate": 5.281819513116371e-07, "loss": 0.2839, "step": 2142 }, { "batch_size": 4, "epoch": 0.8568, "step": 2142, "tokens_per_device": 4040 }, { "epoch": 0.8568, "loss_ce": 0.3167848289012909, "loss_lvr": 0.8929712772369385, "loss_mode_switch": 0.0, "loss_total": 0.4060819745063782, "step": 2142 }, { "batch_size": 4, "epoch": 0.8568, "step": 2142, "tokens_per_device": 8420 }, { "epoch": 0.8568, "loss_ce": 0.2904362976551056, "loss_lvr": 0.6935955882072449, "loss_mode_switch": 0.0, "loss_total": 0.35979586839675903, "step": 2142 }, { "batch_size": 4, "epoch": 0.8568, "step": 2142, "tokens_per_device": 3236 }, { "epoch": 0.8568, "loss_ce": 0.4942857623100281, "loss_lvr": 1.031838297843933, "loss_mode_switch": 0.0, "loss_total": 0.5974695682525635, "step": 2142 }, { "batch_size": 4, "epoch": 0.8568, "step": 2142, "tokens_per_device": 4248 }, { "epoch": 0.8568, "loss_ce": 0.353899747133255, "loss_lvr": 0.7895632386207581, "loss_mode_switch": 0.0, "loss_total": 0.43285608291625977, "step": 2142 }, { "batch_size": 4, "epoch": 0.8568, "step": 2142, "tokens_per_device": 10480 }, { "epoch": 0.8568, "loss_ce": 0.14012916386127472, "loss_lvr": 1.0181080102920532, "loss_mode_switch": 0.0, "loss_total": 0.2419399619102478, "step": 2142 }, { "batch_size": 4, "epoch": 0.8568, "step": 2142, "tokens_per_device": 4668 }, { "epoch": 0.8568, "loss_ce": 0.21094512939453125, "loss_lvr": 0.8818882703781128, "loss_mode_switch": 0.0, "loss_total": 0.29913395643234253, "step": 2142 }, { "batch_size": 4, "epoch": 0.8568, "step": 2142, "tokens_per_device": 3828 }, { "epoch": 0.8568, "loss_ce": 0.01078761462122202, "loss_lvr": 0.5134860873222351, "loss_mode_switch": 0.0, "loss_total": 0.062136221677064896, "step": 2142 }, { "batch_size": 4, "epoch": 0.8568, "step": 2142, "tokens_per_device": 4440 }, { "epoch": 0.8568, "loss_ce": 0.2281343936920166, "loss_lvr": 0.5794925689697266, "loss_mode_switch": 0.0, "loss_total": 0.2860836386680603, "step": 2142 }, { "epoch": 0.8572, "grad_norm": 1.3260068893432617, "learning_rate": 5.252880503381658e-07, "loss": 0.2767, "step": 2143 }, { "batch_size": 4, "epoch": 0.8572, "step": 2143, "tokens_per_device": 4996 }, { "epoch": 0.8572, "loss_ce": 0.30034664273262024, "loss_lvr": 1.0040818452835083, "loss_mode_switch": 0.0, "loss_total": 0.4007548391819, "step": 2143 }, { "batch_size": 4, "epoch": 0.8572, "step": 2143, "tokens_per_device": 4296 }, { "epoch": 0.8572, "loss_ce": 0.11390873044729233, "loss_lvr": 0.7857034802436829, "loss_mode_switch": 0.0, "loss_total": 0.19247907400131226, "step": 2143 }, { "batch_size": 1, "epoch": 0.8572, "step": 2143, "tokens_per_device": 5108 }, { "epoch": 0.8572, "loss_ce": 0.00038992537884041667, "loss_lvr": 0.29010793566703796, "loss_mode_switch": 0.0, "loss_total": 0.029400719329714775, "step": 2143 }, { "batch_size": 1, "epoch": 0.8572, "step": 2143, "tokens_per_device": 5109 }, { "epoch": 0.8572, "loss_ce": 0.0023630766663700342, "loss_lvr": 0.45020994544029236, "loss_mode_switch": 0.0, "loss_total": 0.047384072095155716, "step": 2143 }, { "batch_size": 4, "epoch": 0.8572, "step": 2143, "tokens_per_device": 3760 }, { "epoch": 0.8572, "loss_ce": 0.20330218970775604, "loss_lvr": 0.7643439769744873, "loss_mode_switch": 0.0, "loss_total": 0.27973657846450806, "step": 2143 }, { "batch_size": 4, "epoch": 0.8572, "step": 2143, "tokens_per_device": 4212 }, { "epoch": 0.8572, "loss_ce": 0.2947824001312256, "loss_lvr": 1.8186203241348267, "loss_mode_switch": 0.0, "loss_total": 0.47664445638656616, "step": 2143 }, { "batch_size": 4, "epoch": 0.8572, "step": 2143, "tokens_per_device": 4444 }, { "epoch": 0.8572, "loss_ce": 0.2712390720844269, "loss_lvr": 0.7068097591400146, "loss_mode_switch": 0.0, "loss_total": 0.34192004799842834, "step": 2143 }, { "batch_size": 4, "epoch": 0.8572, "step": 2143, "tokens_per_device": 3400 }, { "epoch": 0.8572, "loss_ce": 0.19192425906658173, "loss_lvr": 1.0256174802780151, "loss_mode_switch": 0.0, "loss_total": 0.29448601603507996, "step": 2143 }, { "epoch": 0.8576, "grad_norm": 1.8467849493026733, "learning_rate": 5.22401659387961e-07, "loss": 0.356, "step": 2144 }, { "batch_size": 4, "epoch": 0.8576, "step": 2144, "tokens_per_device": 5724 }, { "epoch": 0.8576, "loss_ce": 0.08133187890052795, "loss_lvr": 1.0001285076141357, "loss_mode_switch": 0.0, "loss_total": 0.18134473264217377, "step": 2144 }, { "batch_size": 1, "epoch": 0.8576, "step": 2144, "tokens_per_device": 4885 }, { "epoch": 0.8576, "loss_ce": 0.08161718398332596, "loss_lvr": 0.8172764778137207, "loss_mode_switch": 0.0, "loss_total": 0.1633448302745819, "step": 2144 }, { "batch_size": 4, "epoch": 0.8576, "step": 2144, "tokens_per_device": 12436 }, { "epoch": 0.8576, "loss_ce": 0.3953620493412018, "loss_lvr": 0.6299508213996887, "loss_mode_switch": 0.0, "loss_total": 0.4583571255207062, "step": 2144 }, { "batch_size": 4, "epoch": 0.8576, "step": 2144, "tokens_per_device": 1868 }, { "epoch": 0.8576, "loss_ce": 0.3993016183376312, "loss_lvr": 0.7842469215393066, "loss_mode_switch": 0.0, "loss_total": 0.4777263104915619, "step": 2144 }, { "batch_size": 4, "epoch": 0.8576, "step": 2144, "tokens_per_device": 4176 }, { "epoch": 0.8576, "loss_ce": 0.1383434385061264, "loss_lvr": 0.6861850023269653, "loss_mode_switch": 0.0, "loss_total": 0.20696192979812622, "step": 2144 }, { "batch_size": 4, "epoch": 0.8576, "step": 2144, "tokens_per_device": 2516 }, { "epoch": 0.8576, "loss_ce": 0.13440927863121033, "loss_lvr": 0.9442426562309265, "loss_mode_switch": 0.0, "loss_total": 0.22883355617523193, "step": 2144 }, { "batch_size": 1, "epoch": 0.8576, "step": 2144, "tokens_per_device": 4855 }, { "epoch": 0.8576, "loss_ce": 0.007992292754352093, "loss_lvr": 0.5346587300300598, "loss_mode_switch": 0.0, "loss_total": 0.06145816668868065, "step": 2144 }, { "batch_size": 4, "epoch": 0.8576, "step": 2144, "tokens_per_device": 4532 }, { "epoch": 0.8576, "loss_ce": 0.10330704599618912, "loss_lvr": 0.7235948443412781, "loss_mode_switch": 0.0, "loss_total": 0.17566654086112976, "step": 2144 }, { "epoch": 0.858, "grad_norm": 1.4030743837356567, "learning_rate": 5.195227833053273e-07, "loss": 0.2915, "step": 2145 }, { "batch_size": 1, "epoch": 0.858, "step": 2145, "tokens_per_device": 5042 }, { "epoch": 0.858, "loss_ce": 0.08091982454061508, "loss_lvr": 0.38368335366249084, "loss_mode_switch": 0.0, "loss_total": 0.11928816139698029, "step": 2145 }, { "batch_size": 4, "epoch": 0.858, "step": 2145, "tokens_per_device": 1624 }, { "epoch": 0.858, "loss_ce": 0.11861031502485275, "loss_lvr": 0.9227551817893982, "loss_mode_switch": 0.0, "loss_total": 0.21088583767414093, "step": 2145 }, { "batch_size": 4, "epoch": 0.858, "step": 2145, "tokens_per_device": 5352 }, { "epoch": 0.858, "loss_ce": 0.09345469623804092, "loss_lvr": 0.8061423301696777, "loss_mode_switch": 0.0, "loss_total": 0.17406892776489258, "step": 2145 }, { "batch_size": 4, "epoch": 0.858, "step": 2145, "tokens_per_device": 5620 }, { "epoch": 0.858, "loss_ce": 0.41332417726516724, "loss_lvr": 0.9474743008613586, "loss_mode_switch": 0.0, "loss_total": 0.5080716013908386, "step": 2145 }, { "batch_size": 4, "epoch": 0.858, "step": 2145, "tokens_per_device": 2616 }, { "epoch": 0.858, "loss_ce": 0.6753759980201721, "loss_lvr": 0.7696824669837952, "loss_mode_switch": 0.0, "loss_total": 0.7523442506790161, "step": 2145 }, { "batch_size": 4, "epoch": 0.858, "step": 2145, "tokens_per_device": 1668 }, { "epoch": 0.858, "loss_ce": 0.7750062942504883, "loss_lvr": 0.8660153150558472, "loss_mode_switch": 0.0, "loss_total": 0.8616078495979309, "step": 2145 }, { "batch_size": 4, "epoch": 0.858, "step": 2145, "tokens_per_device": 4784 }, { "epoch": 0.858, "loss_ce": 0.7535926699638367, "loss_lvr": 0.7034168839454651, "loss_mode_switch": 0.0, "loss_total": 0.8239343762397766, "step": 2145 }, { "batch_size": 4, "epoch": 0.858, "step": 2145, "tokens_per_device": 3764 }, { "epoch": 0.858, "loss_ce": 0.11478926241397858, "loss_lvr": 0.9273422360420227, "loss_mode_switch": 0.0, "loss_total": 0.20752349495887756, "step": 2145 }, { "epoch": 0.8584, "grad_norm": 1.344889521598816, "learning_rate": 5.166514269219546e-07, "loss": 0.3177, "step": 2146 }, { "batch_size": 4, "epoch": 0.8584, "step": 2146, "tokens_per_device": 3680 }, { "epoch": 0.8584, "loss_ce": 0.4616481363773346, "loss_lvr": 0.792342483997345, "loss_mode_switch": 0.0, "loss_total": 0.540882408618927, "step": 2146 }, { "batch_size": 4, "epoch": 0.8584, "step": 2146, "tokens_per_device": 3828 }, { "epoch": 0.8584, "loss_ce": 0.28154435753822327, "loss_lvr": 0.6961221694946289, "loss_mode_switch": 0.0, "loss_total": 0.3511565923690796, "step": 2146 }, { "batch_size": 1, "epoch": 0.8584, "step": 2146, "tokens_per_device": 5366 }, { "epoch": 0.8584, "loss_ce": 0.004785309545695782, "loss_lvr": 0.31139272451400757, "loss_mode_switch": 0.0, "loss_total": 0.03592458367347717, "step": 2146 }, { "batch_size": 4, "epoch": 0.8584, "step": 2146, "tokens_per_device": 4180 }, { "epoch": 0.8584, "loss_ce": 0.031519901007413864, "loss_lvr": 0.9193556308746338, "loss_mode_switch": 0.0, "loss_total": 0.1234554648399353, "step": 2146 }, { "batch_size": 4, "epoch": 0.8584, "step": 2146, "tokens_per_device": 1316 }, { "epoch": 0.8584, "loss_ce": 0.21716417372226715, "loss_lvr": 1.0032248497009277, "loss_mode_switch": 0.0, "loss_total": 0.31748664379119873, "step": 2146 }, { "batch_size": 4, "epoch": 0.8584, "step": 2146, "tokens_per_device": 4908 }, { "epoch": 0.8584, "loss_ce": 0.046888988465070724, "loss_lvr": 0.5523294806480408, "loss_mode_switch": 0.0, "loss_total": 0.10212193429470062, "step": 2146 }, { "batch_size": 1, "epoch": 0.8584, "step": 2146, "tokens_per_device": 4872 }, { "epoch": 0.8584, "loss_ce": 0.009682350791990757, "loss_lvr": 0.5601177215576172, "loss_mode_switch": 0.0, "loss_total": 0.06569412350654602, "step": 2146 }, { "batch_size": 4, "epoch": 0.8584, "step": 2146, "tokens_per_device": 6632 }, { "epoch": 0.8584, "loss_ce": 0.24593548476696014, "loss_lvr": 0.7239331603050232, "loss_mode_switch": 0.0, "loss_total": 0.3183287978172302, "step": 2146 }, { "epoch": 0.8588, "grad_norm": 1.2048178911209106, "learning_rate": 5.13787595056916e-07, "loss": 0.2802, "step": 2147 }, { "batch_size": 4, "epoch": 0.8588, "step": 2147, "tokens_per_device": 4220 }, { "epoch": 0.8588, "loss_ce": 0.797706663608551, "loss_lvr": 1.0053709745407104, "loss_mode_switch": 0.0, "loss_total": 0.89824378490448, "step": 2147 }, { "batch_size": 1, "epoch": 0.8588, "step": 2147, "tokens_per_device": 5107 }, { "epoch": 0.8588, "loss_ce": 0.008824643678963184, "loss_lvr": 0.17746680974960327, "loss_mode_switch": 0.0, "loss_total": 0.026571325957775116, "step": 2147 }, { "batch_size": 4, "epoch": 0.8588, "step": 2147, "tokens_per_device": 1372 }, { "epoch": 0.8588, "loss_ce": 0.24748007953166962, "loss_lvr": 0.8965762257575989, "loss_mode_switch": 0.0, "loss_total": 0.33713769912719727, "step": 2147 }, { "batch_size": 1, "epoch": 0.8588, "step": 2147, "tokens_per_device": 5046 }, { "epoch": 0.8588, "loss_ce": 0.2780205011367798, "loss_lvr": 0.4348159432411194, "loss_mode_switch": 0.0, "loss_total": 0.32150208950042725, "step": 2147 }, { "batch_size": 4, "epoch": 0.8588, "step": 2147, "tokens_per_device": 3728 }, { "epoch": 0.8588, "loss_ce": 0.37039750814437866, "loss_lvr": 0.7463778257369995, "loss_mode_switch": 0.0, "loss_total": 0.44503527879714966, "step": 2147 }, { "batch_size": 4, "epoch": 0.8588, "step": 2147, "tokens_per_device": 2648 }, { "epoch": 0.8588, "loss_ce": 0.48173364996910095, "loss_lvr": 0.9539458751678467, "loss_mode_switch": 0.0, "loss_total": 0.5771282315254211, "step": 2147 }, { "batch_size": 4, "epoch": 0.8588, "step": 2147, "tokens_per_device": 5808 }, { "epoch": 0.8588, "loss_ce": 0.1386578232049942, "loss_lvr": 0.7625444531440735, "loss_mode_switch": 0.0, "loss_total": 0.2149122655391693, "step": 2147 }, { "batch_size": 1, "epoch": 0.8588, "step": 2147, "tokens_per_device": 4861 }, { "epoch": 0.8588, "loss_ce": 0.0001512783783255145, "loss_lvr": 0.24590973556041718, "loss_mode_switch": 0.0, "loss_total": 0.024742253124713898, "step": 2147 }, { "epoch": 0.8592, "grad_norm": 1.3267650604248047, "learning_rate": 5.10931292516652e-07, "loss": 0.2763, "step": 2148 }, { "batch_size": 1, "epoch": 0.8592, "step": 2148, "tokens_per_device": 4912 }, { "epoch": 0.8592, "loss_ce": 0.027726884931325912, "loss_lvr": 0.2784886956214905, "loss_mode_switch": 0.0, "loss_total": 0.05557575449347496, "step": 2148 }, { "batch_size": 4, "epoch": 0.8592, "step": 2148, "tokens_per_device": 4448 }, { "epoch": 0.8592, "loss_ce": 0.04966471716761589, "loss_lvr": 0.7669152617454529, "loss_mode_switch": 0.0, "loss_total": 0.12635624408721924, "step": 2148 }, { "batch_size": 4, "epoch": 0.8592, "step": 2148, "tokens_per_device": 4672 }, { "epoch": 0.8592, "loss_ce": 0.542523980140686, "loss_lvr": 0.8983926177024841, "loss_mode_switch": 0.0, "loss_total": 0.6323632597923279, "step": 2148 }, { "batch_size": 4, "epoch": 0.8592, "step": 2148, "tokens_per_device": 8668 }, { "epoch": 0.8592, "loss_ce": 0.07184743136167526, "loss_lvr": 0.8874163031578064, "loss_mode_switch": 0.0, "loss_total": 0.1605890691280365, "step": 2148 }, { "batch_size": 1, "epoch": 0.8592, "step": 2148, "tokens_per_device": 4898 }, { "epoch": 0.8592, "loss_ce": 1.2929188013076782, "loss_lvr": 0.47488152980804443, "loss_mode_switch": 0.0, "loss_total": 1.340406894683838, "step": 2148 }, { "batch_size": 1, "epoch": 0.8592, "step": 2148, "tokens_per_device": 6770 }, { "epoch": 0.8592, "loss_ce": 0.009390953928232193, "loss_lvr": 0.3048494756221771, "loss_mode_switch": 0.0, "loss_total": 0.039875902235507965, "step": 2148 }, { "batch_size": 4, "epoch": 0.8592, "step": 2148, "tokens_per_device": 6540 }, { "epoch": 0.8592, "loss_ce": 0.03094593994319439, "loss_lvr": 0.8367443084716797, "loss_mode_switch": 0.0, "loss_total": 0.11462037265300751, "step": 2148 }, { "batch_size": 4, "epoch": 0.8592, "step": 2148, "tokens_per_device": 2648 }, { "epoch": 0.8592, "loss_ce": 0.45991721749305725, "loss_lvr": 0.7031057476997375, "loss_mode_switch": 0.0, "loss_total": 0.530227780342102, "step": 2148 }, { "epoch": 0.8596, "grad_norm": 1.346266269683838, "learning_rate": 5.080825240949672e-07, "loss": 0.3014, "step": 2149 }, { "batch_size": 4, "epoch": 0.8596, "step": 2149, "tokens_per_device": 2656 }, { "epoch": 0.8596, "loss_ce": 0.12169452011585236, "loss_lvr": 0.8979583978652954, "loss_mode_switch": 0.0, "loss_total": 0.21149036288261414, "step": 2149 }, { "batch_size": 1, "epoch": 0.8596, "step": 2149, "tokens_per_device": 5279 }, { "epoch": 0.8596, "loss_ce": 0.11172179132699966, "loss_lvr": 0.28220871090888977, "loss_mode_switch": 0.0, "loss_total": 0.13994266092777252, "step": 2149 }, { "batch_size": 4, "epoch": 0.8596, "step": 2149, "tokens_per_device": 2568 }, { "epoch": 0.8596, "loss_ce": 0.9598645567893982, "loss_lvr": 0.925159215927124, "loss_mode_switch": 0.0, "loss_total": 1.0523804426193237, "step": 2149 }, { "batch_size": 1, "epoch": 0.8596, "step": 2149, "tokens_per_device": 4896 }, { "epoch": 0.8596, "loss_ce": 0.00015887542394921184, "loss_lvr": 0.30380189418792725, "loss_mode_switch": 0.0, "loss_total": 0.03053906559944153, "step": 2149 }, { "batch_size": 4, "epoch": 0.8596, "step": 2149, "tokens_per_device": 3800 }, { "epoch": 0.8596, "loss_ce": 0.4350304901599884, "loss_lvr": 0.9813279509544373, "loss_mode_switch": 0.0, "loss_total": 0.53316330909729, "step": 2149 }, { "batch_size": 4, "epoch": 0.8596, "step": 2149, "tokens_per_device": 1268 }, { "epoch": 0.8596, "loss_ce": 0.37499353289604187, "loss_lvr": 1.2040308713912964, "loss_mode_switch": 0.0, "loss_total": 0.49539661407470703, "step": 2149 }, { "batch_size": 4, "epoch": 0.8596, "step": 2149, "tokens_per_device": 1300 }, { "epoch": 0.8596, "loss_ce": 0.12260779738426208, "loss_lvr": 1.0353854894638062, "loss_mode_switch": 0.0, "loss_total": 0.22614634037017822, "step": 2149 }, { "batch_size": 1, "epoch": 0.8596, "step": 2149, "tokens_per_device": 4926 }, { "epoch": 0.8596, "loss_ce": 0.2471049576997757, "loss_lvr": 0.31324025988578796, "loss_mode_switch": 0.0, "loss_total": 0.27842897176742554, "step": 2149 }, { "epoch": 0.86, "grad_norm": 1.513036847114563, "learning_rate": 5.05241294573024e-07, "loss": 0.3077, "step": 2150 }, { "batch_size": 4, "epoch": 0.86, "step": 2150, "tokens_per_device": 3780 }, { "epoch": 0.86, "loss_ce": 0.2279222011566162, "loss_lvr": 1.154495120048523, "loss_mode_switch": 0.0, "loss_total": 0.343371719121933, "step": 2150 }, { "batch_size": 4, "epoch": 0.86, "step": 2150, "tokens_per_device": 4532 }, { "epoch": 0.86, "loss_ce": 0.02292146347463131, "loss_lvr": 0.8202219009399414, "loss_mode_switch": 0.0, "loss_total": 0.1049436554312706, "step": 2150 }, { "batch_size": 1, "epoch": 0.86, "step": 2150, "tokens_per_device": 5136 }, { "epoch": 0.86, "loss_ce": 0.07917994260787964, "loss_lvr": 0.36164581775665283, "loss_mode_switch": 0.0, "loss_total": 0.11534452438354492, "step": 2150 }, { "batch_size": 4, "epoch": 0.86, "step": 2150, "tokens_per_device": 5196 }, { "epoch": 0.86, "loss_ce": 0.25442758202552795, "loss_lvr": 0.6598992943763733, "loss_mode_switch": 0.0, "loss_total": 0.32041752338409424, "step": 2150 }, { "batch_size": 1, "epoch": 0.86, "step": 2150, "tokens_per_device": 5021 }, { "epoch": 0.86, "loss_ce": 0.09597133100032806, "loss_lvr": 0.340350478887558, "loss_mode_switch": 0.0, "loss_total": 0.13000637292861938, "step": 2150 }, { "batch_size": 4, "epoch": 0.86, "step": 2150, "tokens_per_device": 4364 }, { "epoch": 0.86, "loss_ce": 0.1800602674484253, "loss_lvr": 0.5251645445823669, "loss_mode_switch": 0.0, "loss_total": 0.23257672786712646, "step": 2150 }, { "batch_size": 4, "epoch": 0.86, "step": 2150, "tokens_per_device": 5472 }, { "epoch": 0.86, "loss_ce": 0.4053444266319275, "loss_lvr": 0.5371348857879639, "loss_mode_switch": 0.0, "loss_total": 0.45905792713165283, "step": 2150 }, { "batch_size": 4, "epoch": 0.86, "step": 2150, "tokens_per_device": 3912 }, { "epoch": 0.86, "loss_ce": 0.30869168043136597, "loss_lvr": 0.336671382188797, "loss_mode_switch": 0.0, "loss_total": 0.3423588275909424, "step": 2150 }, { "epoch": 0.8604, "grad_norm": 1.2441498041152954, "learning_rate": 5.024076087193292e-07, "loss": 0.2727, "step": 2151 }, { "batch_size": 4, "epoch": 0.8604, "step": 2151, "tokens_per_device": 4988 }, { "epoch": 0.8604, "loss_ce": 0.330169677734375, "loss_lvr": 0.8901285529136658, "loss_mode_switch": 0.0, "loss_total": 0.41918253898620605, "step": 2151 }, { "batch_size": 4, "epoch": 0.8604, "step": 2151, "tokens_per_device": 1268 }, { "epoch": 0.8604, "loss_ce": 0.24871821701526642, "loss_lvr": 1.176365852355957, "loss_mode_switch": 0.0, "loss_total": 0.3663547933101654, "step": 2151 }, { "batch_size": 4, "epoch": 0.8604, "step": 2151, "tokens_per_device": 3784 }, { "epoch": 0.8604, "loss_ce": 0.4593856930732727, "loss_lvr": 0.9639712572097778, "loss_mode_switch": 0.0, "loss_total": 0.5557827949523926, "step": 2151 }, { "batch_size": 4, "epoch": 0.8604, "step": 2151, "tokens_per_device": 2664 }, { "epoch": 0.8604, "loss_ce": 0.09166142344474792, "loss_lvr": 0.7198121547698975, "loss_mode_switch": 0.0, "loss_total": 0.16364264488220215, "step": 2151 }, { "batch_size": 4, "epoch": 0.8604, "step": 2151, "tokens_per_device": 4320 }, { "epoch": 0.8604, "loss_ce": 0.5228085517883301, "loss_lvr": 0.8671082258224487, "loss_mode_switch": 0.0, "loss_total": 0.609519362449646, "step": 2151 }, { "batch_size": 1, "epoch": 0.8604, "step": 2151, "tokens_per_device": 5125 }, { "epoch": 0.8604, "loss_ce": 0.005971301347017288, "loss_lvr": 0.42932814359664917, "loss_mode_switch": 0.0, "loss_total": 0.048904117196798325, "step": 2151 }, { "batch_size": 1, "epoch": 0.8604, "step": 2151, "tokens_per_device": 7407 }, { "epoch": 0.8604, "loss_ce": 0.0033789228182286024, "loss_lvr": 0.26830729842185974, "loss_mode_switch": 0.0, "loss_total": 0.030209653079509735, "step": 2151 }, { "batch_size": 1, "epoch": 0.8604, "step": 2151, "tokens_per_device": 5130 }, { "epoch": 0.8604, "loss_ce": 0.02769566886126995, "loss_lvr": 0.19303283095359802, "loss_mode_switch": 0.0, "loss_total": 0.04699895158410072, "step": 2151 }, { "epoch": 0.8608, "grad_norm": 1.2119969129562378, "learning_rate": 4.995814712897312e-07, "loss": 0.2703, "step": 2152 }, { "batch_size": 1, "epoch": 0.8608, "step": 2152, "tokens_per_device": 5065 }, { "epoch": 0.8608, "loss_ce": 0.3551580309867859, "loss_lvr": 0.34447237849235535, "loss_mode_switch": 0.0, "loss_total": 0.3896052837371826, "step": 2152 }, { "batch_size": 4, "epoch": 0.8608, "step": 2152, "tokens_per_device": 4148 }, { "epoch": 0.8608, "loss_ce": 0.16812492907047272, "loss_lvr": 0.7503973841667175, "loss_mode_switch": 0.0, "loss_total": 0.24316465854644775, "step": 2152 }, { "batch_size": 4, "epoch": 0.8608, "step": 2152, "tokens_per_device": 1692 }, { "epoch": 0.8608, "loss_ce": 0.41887983679771423, "loss_lvr": 0.7656469345092773, "loss_mode_switch": 0.0, "loss_total": 0.49544453620910645, "step": 2152 }, { "batch_size": 4, "epoch": 0.8608, "step": 2152, "tokens_per_device": 1436 }, { "epoch": 0.8608, "loss_ce": 0.41587644815444946, "loss_lvr": 0.9705349802970886, "loss_mode_switch": 0.0, "loss_total": 0.5129299163818359, "step": 2152 }, { "batch_size": 4, "epoch": 0.8608, "step": 2152, "tokens_per_device": 2676 }, { "epoch": 0.8608, "loss_ce": 0.688293993473053, "loss_lvr": 1.1068083047866821, "loss_mode_switch": 0.0, "loss_total": 0.7989748120307922, "step": 2152 }, { "batch_size": 1, "epoch": 0.8608, "step": 2152, "tokens_per_device": 5121 }, { "epoch": 0.8608, "loss_ce": 0.00024420878617092967, "loss_lvr": 0.21529722213745117, "loss_mode_switch": 0.0, "loss_total": 0.0217739325016737, "step": 2152 }, { "batch_size": 4, "epoch": 0.8608, "step": 2152, "tokens_per_device": 1572 }, { "epoch": 0.8608, "loss_ce": 0.2686426639556885, "loss_lvr": 1.5002317428588867, "loss_mode_switch": 0.0, "loss_total": 0.4186658263206482, "step": 2152 }, { "batch_size": 4, "epoch": 0.8608, "step": 2152, "tokens_per_device": 3792 }, { "epoch": 0.8608, "loss_ce": 0.18388602137565613, "loss_lvr": 0.8494393229484558, "loss_mode_switch": 0.0, "loss_total": 0.26882994174957275, "step": 2152 }, { "epoch": 0.8612, "grad_norm": 1.2986228466033936, "learning_rate": 4.967628870274071e-07, "loss": 0.2879, "step": 2153 }, { "batch_size": 1, "epoch": 0.8612, "step": 2153, "tokens_per_device": 4797 }, { "epoch": 0.8612, "loss_ce": 0.005955668166279793, "loss_lvr": 0.6624383926391602, "loss_mode_switch": 0.0, "loss_total": 0.0721995085477829, "step": 2153 }, { "batch_size": 1, "epoch": 0.8612, "step": 2153, "tokens_per_device": 5678 }, { "epoch": 0.8612, "loss_ce": 0.00224646320566535, "loss_lvr": 0.3147921562194824, "loss_mode_switch": 0.0, "loss_total": 0.03372567892074585, "step": 2153 }, { "batch_size": 4, "epoch": 0.8612, "step": 2153, "tokens_per_device": 10580 }, { "epoch": 0.8612, "loss_ce": 0.0429813489317894, "loss_lvr": 0.6768674850463867, "loss_mode_switch": 0.0, "loss_total": 0.11066810041666031, "step": 2153 }, { "batch_size": 4, "epoch": 0.8612, "step": 2153, "tokens_per_device": 4264 }, { "epoch": 0.8612, "loss_ce": 0.3740403652191162, "loss_lvr": 0.5668092966079712, "loss_mode_switch": 0.0, "loss_total": 0.4307212829589844, "step": 2153 }, { "batch_size": 4, "epoch": 0.8612, "step": 2153, "tokens_per_device": 7400 }, { "epoch": 0.8612, "loss_ce": 0.08628252893686295, "loss_lvr": 0.8488379716873169, "loss_mode_switch": 0.0, "loss_total": 0.171166330575943, "step": 2153 }, { "batch_size": 4, "epoch": 0.8612, "step": 2153, "tokens_per_device": 1280 }, { "epoch": 0.8612, "loss_ce": 0.30308571457862854, "loss_lvr": 0.8911898136138916, "loss_mode_switch": 0.0, "loss_total": 0.3922047019004822, "step": 2153 }, { "batch_size": 1, "epoch": 0.8612, "step": 2153, "tokens_per_device": 4881 }, { "epoch": 0.8612, "loss_ce": 0.019330773502588272, "loss_lvr": 0.6081050038337708, "loss_mode_switch": 0.0, "loss_total": 0.08014127612113953, "step": 2153 }, { "batch_size": 4, "epoch": 0.8612, "step": 2153, "tokens_per_device": 2672 }, { "epoch": 0.8612, "loss_ce": 0.5907813310623169, "loss_lvr": 0.557755708694458, "loss_mode_switch": 0.0, "loss_total": 0.6465569138526917, "step": 2153 }, { "epoch": 0.8616, "grad_norm": 1.3283648490905762, "learning_rate": 4.939518606628607e-07, "loss": 0.3002, "step": 2154 }, { "batch_size": 4, "epoch": 0.8616, "step": 2154, "tokens_per_device": 5752 }, { "epoch": 0.8616, "loss_ce": 0.0011359063209965825, "loss_lvr": 0.6690285801887512, "loss_mode_switch": 0.0, "loss_total": 0.06803876906633377, "step": 2154 }, { "batch_size": 4, "epoch": 0.8616, "step": 2154, "tokens_per_device": 3764 }, { "epoch": 0.8616, "loss_ce": 0.2790074050426483, "loss_lvr": 1.0314067602157593, "loss_mode_switch": 0.0, "loss_total": 0.3821480870246887, "step": 2154 }, { "batch_size": 1, "epoch": 0.8616, "step": 2154, "tokens_per_device": 7338 }, { "epoch": 0.8616, "loss_ce": 0.00016474429867230356, "loss_lvr": 0.23382671177387238, "loss_mode_switch": 0.0, "loss_total": 0.02354741469025612, "step": 2154 }, { "batch_size": 1, "epoch": 0.8616, "step": 2154, "tokens_per_device": 4703 }, { "epoch": 0.8616, "loss_ce": 0.009577970951795578, "loss_lvr": 0.446476012468338, "loss_mode_switch": 0.0, "loss_total": 0.05422557145357132, "step": 2154 }, { "batch_size": 4, "epoch": 0.8616, "step": 2154, "tokens_per_device": 5236 }, { "epoch": 0.8616, "loss_ce": 0.1840580254793167, "loss_lvr": 0.8168515563011169, "loss_mode_switch": 0.0, "loss_total": 0.2657431960105896, "step": 2154 }, { "batch_size": 1, "epoch": 0.8616, "step": 2154, "tokens_per_device": 5015 }, { "epoch": 0.8616, "loss_ce": 0.08709364384412766, "loss_lvr": 0.5153800249099731, "loss_mode_switch": 0.0, "loss_total": 0.1386316418647766, "step": 2154 }, { "batch_size": 4, "epoch": 0.8616, "step": 2154, "tokens_per_device": 5924 }, { "epoch": 0.8616, "loss_ce": 0.3937314450740814, "loss_lvr": 0.6652021408081055, "loss_mode_switch": 0.0, "loss_total": 0.46025165915489197, "step": 2154 }, { "batch_size": 4, "epoch": 0.8616, "step": 2154, "tokens_per_device": 4212 }, { "epoch": 0.8616, "loss_ce": 0.413429856300354, "loss_lvr": 0.7049829959869385, "loss_mode_switch": 0.0, "loss_total": 0.4839281439781189, "step": 2154 }, { "epoch": 0.862, "grad_norm": 1.2275506258010864, "learning_rate": 4.911483969139086e-07, "loss": 0.2672, "step": 2155 }, { "batch_size": 1, "epoch": 0.862, "step": 2155, "tokens_per_device": 5224 }, { "epoch": 0.862, "loss_ce": 0.1594582051038742, "loss_lvr": 0.38267233967781067, "loss_mode_switch": 0.0, "loss_total": 0.19772544503211975, "step": 2155 }, { "batch_size": 1, "epoch": 0.862, "step": 2155, "tokens_per_device": 4867 }, { "epoch": 0.862, "loss_ce": 0.0003346140729263425, "loss_lvr": 0.34100693464279175, "loss_mode_switch": 0.0, "loss_total": 0.03443530574440956, "step": 2155 }, { "batch_size": 4, "epoch": 0.862, "step": 2155, "tokens_per_device": 6292 }, { "epoch": 0.862, "loss_ce": 0.24545596539974213, "loss_lvr": 1.0499849319458008, "loss_mode_switch": 0.0, "loss_total": 0.3504544496536255, "step": 2155 }, { "batch_size": 1, "epoch": 0.862, "step": 2155, "tokens_per_device": 5114 }, { "epoch": 0.862, "loss_ce": 0.00014845861005596817, "loss_lvr": 0.4274712800979614, "loss_mode_switch": 0.0, "loss_total": 0.042895589023828506, "step": 2155 }, { "batch_size": 4, "epoch": 0.862, "step": 2155, "tokens_per_device": 4664 }, { "epoch": 0.862, "loss_ce": 0.5008502006530762, "loss_lvr": 0.9181311130523682, "loss_mode_switch": 0.0, "loss_total": 0.5926632881164551, "step": 2155 }, { "batch_size": 4, "epoch": 0.862, "step": 2155, "tokens_per_device": 4276 }, { "epoch": 0.862, "loss_ce": 0.1978766769170761, "loss_lvr": 0.8637663125991821, "loss_mode_switch": 0.0, "loss_total": 0.2842532992362976, "step": 2155 }, { "batch_size": 4, "epoch": 0.862, "step": 2155, "tokens_per_device": 4060 }, { "epoch": 0.862, "loss_ce": 0.18841668963432312, "loss_lvr": 0.9482525587081909, "loss_mode_switch": 0.0, "loss_total": 0.28324195742607117, "step": 2155 }, { "batch_size": 1, "epoch": 0.862, "step": 2155, "tokens_per_device": 4874 }, { "epoch": 0.862, "loss_ce": 0.006961829494684935, "loss_lvr": 0.3091123700141907, "loss_mode_switch": 0.0, "loss_total": 0.03787306696176529, "step": 2155 }, { "epoch": 0.8624, "grad_norm": 1.3077083826065063, "learning_rate": 4.883525004856738e-07, "loss": 0.3174, "step": 2156 }, { "batch_size": 4, "epoch": 0.8624, "step": 2156, "tokens_per_device": 1760 }, { "epoch": 0.8624, "loss_ce": 0.2624834477901459, "loss_lvr": 2.061999797821045, "loss_mode_switch": 0.0, "loss_total": 0.4686834216117859, "step": 2156 }, { "batch_size": 4, "epoch": 0.8624, "step": 2156, "tokens_per_device": 4272 }, { "epoch": 0.8624, "loss_ce": 0.36447399854660034, "loss_lvr": 0.9223219156265259, "loss_mode_switch": 0.0, "loss_total": 0.4567061960697174, "step": 2156 }, { "batch_size": 4, "epoch": 0.8624, "step": 2156, "tokens_per_device": 2628 }, { "epoch": 0.8624, "loss_ce": 0.19415028393268585, "loss_lvr": 1.0342636108398438, "loss_mode_switch": 0.0, "loss_total": 0.2975766360759735, "step": 2156 }, { "batch_size": 4, "epoch": 0.8624, "step": 2156, "tokens_per_device": 5624 }, { "epoch": 0.8624, "loss_ce": 0.07872477173805237, "loss_lvr": 0.705427885055542, "loss_mode_switch": 0.0, "loss_total": 0.1492675542831421, "step": 2156 }, { "batch_size": 1, "epoch": 0.8624, "step": 2156, "tokens_per_device": 4363 }, { "epoch": 0.8624, "loss_ce": 0.2993839979171753, "loss_lvr": 0.2457396388053894, "loss_mode_switch": 0.0, "loss_total": 0.3239579498767853, "step": 2156 }, { "batch_size": 4, "epoch": 0.8624, "step": 2156, "tokens_per_device": 4332 }, { "epoch": 0.8624, "loss_ce": 0.6797724962234497, "loss_lvr": 0.9757965207099915, "loss_mode_switch": 0.0, "loss_total": 0.7773521542549133, "step": 2156 }, { "batch_size": 4, "epoch": 0.8624, "step": 2156, "tokens_per_device": 3144 }, { "epoch": 0.8624, "loss_ce": 0.68423992395401, "loss_lvr": 0.6127889156341553, "loss_mode_switch": 0.0, "loss_total": 0.7455188035964966, "step": 2156 }, { "batch_size": 1, "epoch": 0.8624, "step": 2156, "tokens_per_device": 7143 }, { "epoch": 0.8624, "loss_ce": 0.0280880406498909, "loss_lvr": 0.20488668978214264, "loss_mode_switch": 0.0, "loss_total": 0.0485767126083374, "step": 2156 }, { "epoch": 0.8628, "grad_norm": 1.280727744102478, "learning_rate": 4.855641760705837e-07, "loss": 0.3013, "step": 2157 }, { "batch_size": 4, "epoch": 0.8628, "step": 2157, "tokens_per_device": 3828 }, { "epoch": 0.8628, "loss_ce": 0.10246813297271729, "loss_lvr": 0.7799121141433716, "loss_mode_switch": 0.0, "loss_total": 0.18045935034751892, "step": 2157 }, { "batch_size": 4, "epoch": 0.8628, "step": 2157, "tokens_per_device": 10580 }, { "epoch": 0.8628, "loss_ce": 0.03949869051575661, "loss_lvr": 0.4955745041370392, "loss_mode_switch": 0.0, "loss_total": 0.08905614167451859, "step": 2157 }, { "batch_size": 4, "epoch": 0.8628, "step": 2157, "tokens_per_device": 4520 }, { "epoch": 0.8628, "loss_ce": 0.25418147444725037, "loss_lvr": 0.8442783355712891, "loss_mode_switch": 0.0, "loss_total": 0.3386093080043793, "step": 2157 }, { "batch_size": 1, "epoch": 0.8628, "step": 2157, "tokens_per_device": 5116 }, { "epoch": 0.8628, "loss_ce": 0.0003315817448310554, "loss_lvr": 0.218453049659729, "loss_mode_switch": 0.0, "loss_total": 0.022176887840032578, "step": 2157 }, { "batch_size": 4, "epoch": 0.8628, "step": 2157, "tokens_per_device": 4424 }, { "epoch": 0.8628, "loss_ce": 0.14944633841514587, "loss_lvr": 0.7422599196434021, "loss_mode_switch": 0.0, "loss_total": 0.22367233037948608, "step": 2157 }, { "batch_size": 4, "epoch": 0.8628, "step": 2157, "tokens_per_device": 5740 }, { "epoch": 0.8628, "loss_ce": 0.10230323672294617, "loss_lvr": 0.4950868487358093, "loss_mode_switch": 0.0, "loss_total": 0.15181192755699158, "step": 2157 }, { "batch_size": 4, "epoch": 0.8628, "step": 2157, "tokens_per_device": 1252 }, { "epoch": 0.8628, "loss_ce": 0.301255464553833, "loss_lvr": 1.0301342010498047, "loss_mode_switch": 0.0, "loss_total": 0.40426889061927795, "step": 2157 }, { "batch_size": 4, "epoch": 0.8628, "step": 2157, "tokens_per_device": 2672 }, { "epoch": 0.8628, "loss_ce": 0.4723281264305115, "loss_lvr": 0.7711285352706909, "loss_mode_switch": 0.0, "loss_total": 0.5494409799575806, "step": 2157 }, { "epoch": 0.8632, "grad_norm": 1.2754106521606445, "learning_rate": 4.827834283483513e-07, "loss": 0.2964, "step": 2158 }, { "batch_size": 1, "epoch": 0.8632, "step": 2158, "tokens_per_device": 4824 }, { "epoch": 0.8632, "loss_ce": 0.009689956903457642, "loss_lvr": 0.21105614304542542, "loss_mode_switch": 0.0, "loss_total": 0.030795572325587273, "step": 2158 }, { "batch_size": 1, "epoch": 0.8632, "step": 2158, "tokens_per_device": 4887 }, { "epoch": 0.8632, "loss_ce": 0.3295382857322693, "loss_lvr": 0.2580668330192566, "loss_mode_switch": 0.0, "loss_total": 0.3553449809551239, "step": 2158 }, { "batch_size": 4, "epoch": 0.8632, "step": 2158, "tokens_per_device": 5564 }, { "epoch": 0.8632, "loss_ce": 0.004713750910013914, "loss_lvr": 0.5826388001441956, "loss_mode_switch": 0.0, "loss_total": 0.0629776269197464, "step": 2158 }, { "batch_size": 4, "epoch": 0.8632, "step": 2158, "tokens_per_device": 2832 }, { "epoch": 0.8632, "loss_ce": 0.04168349504470825, "loss_lvr": 0.7513031363487244, "loss_mode_switch": 0.0, "loss_total": 0.11681380867958069, "step": 2158 }, { "batch_size": 1, "epoch": 0.8632, "step": 2158, "tokens_per_device": 4870 }, { "epoch": 0.8632, "loss_ce": 0.01689569465816021, "loss_lvr": 0.4232812821865082, "loss_mode_switch": 0.0, "loss_total": 0.05922382324934006, "step": 2158 }, { "batch_size": 4, "epoch": 0.8632, "step": 2158, "tokens_per_device": 6588 }, { "epoch": 0.8632, "loss_ce": 0.2829064428806305, "loss_lvr": 0.7869223952293396, "loss_mode_switch": 0.0, "loss_total": 0.3615986704826355, "step": 2158 }, { "batch_size": 4, "epoch": 0.8632, "step": 2158, "tokens_per_device": 4244 }, { "epoch": 0.8632, "loss_ce": 0.06292293220758438, "loss_lvr": 0.8785096406936646, "loss_mode_switch": 0.0, "loss_total": 0.15077389776706696, "step": 2158 }, { "batch_size": 1, "epoch": 0.8632, "step": 2158, "tokens_per_device": 4880 }, { "epoch": 0.8632, "loss_ce": 0.020749766379594803, "loss_lvr": 0.2514929175376892, "loss_mode_switch": 0.0, "loss_total": 0.045899055898189545, "step": 2158 }, { "epoch": 0.8636, "grad_norm": 1.26567804813385, "learning_rate": 4.800102619859792e-07, "loss": 0.2582, "step": 2159 }, { "batch_size": 4, "epoch": 0.8636, "step": 2159, "tokens_per_device": 4520 }, { "epoch": 0.8636, "loss_ce": 0.004822163842618465, "loss_lvr": 0.9766284823417664, "loss_mode_switch": 0.0, "loss_total": 0.10248501598834991, "step": 2159 }, { "batch_size": 1, "epoch": 0.8636, "step": 2159, "tokens_per_device": 5192 }, { "epoch": 0.8636, "loss_ce": 0.07402199506759644, "loss_lvr": 0.3653937578201294, "loss_mode_switch": 0.0, "loss_total": 0.11056137084960938, "step": 2159 }, { "batch_size": 4, "epoch": 0.8636, "step": 2159, "tokens_per_device": 3652 }, { "epoch": 0.8636, "loss_ce": 0.10686024278402328, "loss_lvr": 0.920137345790863, "loss_mode_switch": 0.0, "loss_total": 0.19887398183345795, "step": 2159 }, { "batch_size": 4, "epoch": 0.8636, "step": 2159, "tokens_per_device": 3480 }, { "epoch": 0.8636, "loss_ce": 0.29214805364608765, "loss_lvr": 1.2179347276687622, "loss_mode_switch": 0.0, "loss_total": 0.41394153237342834, "step": 2159 }, { "batch_size": 4, "epoch": 0.8636, "step": 2159, "tokens_per_device": 4428 }, { "epoch": 0.8636, "loss_ce": 0.7014718055725098, "loss_lvr": 0.8068376779556274, "loss_mode_switch": 0.0, "loss_total": 0.7821555733680725, "step": 2159 }, { "batch_size": 4, "epoch": 0.8636, "step": 2159, "tokens_per_device": 7268 }, { "epoch": 0.8636, "loss_ce": 0.1321663111448288, "loss_lvr": 0.8342728614807129, "loss_mode_switch": 0.0, "loss_total": 0.2155936062335968, "step": 2159 }, { "batch_size": 4, "epoch": 0.8636, "step": 2159, "tokens_per_device": 9392 }, { "epoch": 0.8636, "loss_ce": 0.22682510316371918, "loss_lvr": 0.5453619360923767, "loss_mode_switch": 0.0, "loss_total": 0.28136128187179565, "step": 2159 }, { "batch_size": 4, "epoch": 0.8636, "step": 2159, "tokens_per_device": 4148 }, { "epoch": 0.8636, "loss_ce": 0.6433062553405762, "loss_lvr": 0.8681936860084534, "loss_mode_switch": 0.0, "loss_total": 0.7301256060600281, "step": 2159 }, { "epoch": 0.864, "grad_norm": 1.5026401281356812, "learning_rate": 4.772446816377408e-07, "loss": 0.3019, "step": 2160 }, { "batch_size": 4, "epoch": 0.864, "step": 2160, "tokens_per_device": 4316 }, { "epoch": 0.864, "loss_ce": 0.0563913993537426, "loss_lvr": 0.7563982009887695, "loss_mode_switch": 0.0, "loss_total": 0.13203121721744537, "step": 2160 }, { "batch_size": 4, "epoch": 0.864, "step": 2160, "tokens_per_device": 5616 }, { "epoch": 0.864, "loss_ce": 0.35330453515052795, "loss_lvr": 0.5114786028862, "loss_mode_switch": 0.0, "loss_total": 0.404452383518219, "step": 2160 }, { "batch_size": 4, "epoch": 0.864, "step": 2160, "tokens_per_device": 1492 }, { "epoch": 0.864, "loss_ce": 0.43307438492774963, "loss_lvr": 0.9880167245864868, "loss_mode_switch": 0.0, "loss_total": 0.5318760871887207, "step": 2160 }, { "batch_size": 4, "epoch": 0.864, "step": 2160, "tokens_per_device": 7496 }, { "epoch": 0.864, "loss_ce": 0.1617523431777954, "loss_lvr": 0.7071636915206909, "loss_mode_switch": 0.0, "loss_total": 0.23246872425079346, "step": 2160 }, { "batch_size": 4, "epoch": 0.864, "step": 2160, "tokens_per_device": 2572 }, { "epoch": 0.864, "loss_ce": 0.21079885959625244, "loss_lvr": 1.1055502891540527, "loss_mode_switch": 0.0, "loss_total": 0.32135388255119324, "step": 2160 }, { "batch_size": 4, "epoch": 0.864, "step": 2160, "tokens_per_device": 3032 }, { "epoch": 0.864, "loss_ce": 0.3889533579349518, "loss_lvr": 0.8288251161575317, "loss_mode_switch": 0.0, "loss_total": 0.4718358814716339, "step": 2160 }, { "batch_size": 1, "epoch": 0.864, "step": 2160, "tokens_per_device": 5741 }, { "epoch": 0.864, "loss_ce": 0.00019526916730683297, "loss_lvr": 0.2981054186820984, "loss_mode_switch": 0.0, "loss_total": 0.030005810782313347, "step": 2160 }, { "batch_size": 4, "epoch": 0.864, "step": 2160, "tokens_per_device": 1616 }, { "epoch": 0.864, "loss_ce": 0.33765703439712524, "loss_lvr": 0.8670061826705933, "loss_mode_switch": 0.0, "loss_total": 0.42435765266418457, "step": 2160 }, { "epoch": 0.8644, "grad_norm": 1.2311779260635376, "learning_rate": 4.744866919451824e-07, "loss": 0.2657, "step": 2161 }, { "batch_size": 4, "epoch": 0.8644, "step": 2161, "tokens_per_device": 3992 }, { "epoch": 0.8644, "loss_ce": 0.04637235403060913, "loss_lvr": 1.3626517057418823, "loss_mode_switch": 0.0, "loss_total": 0.1826375275850296, "step": 2161 }, { "batch_size": 1, "epoch": 0.8644, "step": 2161, "tokens_per_device": 4896 }, { "epoch": 0.8644, "loss_ce": 0.032198693603277206, "loss_lvr": 0.5569228529930115, "loss_mode_switch": 0.0, "loss_total": 0.08789098262786865, "step": 2161 }, { "batch_size": 1, "epoch": 0.8644, "step": 2161, "tokens_per_device": 4904 }, { "epoch": 0.8644, "loss_ce": 0.007073774468153715, "loss_lvr": 0.4301528036594391, "loss_mode_switch": 0.0, "loss_total": 0.05008905753493309, "step": 2161 }, { "batch_size": 4, "epoch": 0.8644, "step": 2161, "tokens_per_device": 8492 }, { "epoch": 0.8644, "loss_ce": 0.20510520040988922, "loss_lvr": 0.3085796535015106, "loss_mode_switch": 0.0, "loss_total": 0.23596316576004028, "step": 2161 }, { "batch_size": 1, "epoch": 0.8644, "step": 2161, "tokens_per_device": 4873 }, { "epoch": 0.8644, "loss_ce": 0.0007312654634006321, "loss_lvr": 0.28168296813964844, "loss_mode_switch": 0.0, "loss_total": 0.02889956161379814, "step": 2161 }, { "batch_size": 4, "epoch": 0.8644, "step": 2161, "tokens_per_device": 3836 }, { "epoch": 0.8644, "loss_ce": 0.3567226231098175, "loss_lvr": 0.7515419125556946, "loss_mode_switch": 0.0, "loss_total": 0.4318768084049225, "step": 2161 }, { "batch_size": 1, "epoch": 0.8644, "step": 2161, "tokens_per_device": 4835 }, { "epoch": 0.8644, "loss_ce": 0.00014874165935907513, "loss_lvr": 0.2997646629810333, "loss_mode_switch": 0.0, "loss_total": 0.0301252081990242, "step": 2161 }, { "batch_size": 1, "epoch": 0.8644, "step": 2161, "tokens_per_device": 4619 }, { "epoch": 0.8644, "loss_ce": 0.0428595095872879, "loss_lvr": 0.5134513974189758, "loss_mode_switch": 0.0, "loss_total": 0.09420464932918549, "step": 2161 }, { "epoch": 0.8648, "grad_norm": 1.2045867443084717, "learning_rate": 4.7173629753710595e-07, "loss": 0.2602, "step": 2162 }, { "batch_size": 1, "epoch": 0.8648, "step": 2162, "tokens_per_device": 5023 }, { "epoch": 0.8648, "loss_ce": 0.0018828203901648521, "loss_lvr": 0.22208277881145477, "loss_mode_switch": 0.0, "loss_total": 0.024091098457574844, "step": 2162 }, { "batch_size": 4, "epoch": 0.8648, "step": 2162, "tokens_per_device": 4956 }, { "epoch": 0.8648, "loss_ce": 0.28949716687202454, "loss_lvr": 0.6495632529258728, "loss_mode_switch": 0.0, "loss_total": 0.35445350408554077, "step": 2162 }, { "batch_size": 1, "epoch": 0.8648, "step": 2162, "tokens_per_device": 4872 }, { "epoch": 0.8648, "loss_ce": 0.001585072954185307, "loss_lvr": 0.41312533617019653, "loss_mode_switch": 0.0, "loss_total": 0.04289760813117027, "step": 2162 }, { "batch_size": 4, "epoch": 0.8648, "step": 2162, "tokens_per_device": 1440 }, { "epoch": 0.8648, "loss_ce": 0.7156192660331726, "loss_lvr": 1.0799832344055176, "loss_mode_switch": 0.0, "loss_total": 0.8236175775527954, "step": 2162 }, { "batch_size": 4, "epoch": 0.8648, "step": 2162, "tokens_per_device": 1424 }, { "epoch": 0.8648, "loss_ce": 0.5039939880371094, "loss_lvr": 1.6217275857925415, "loss_mode_switch": 0.0, "loss_total": 0.6661667823791504, "step": 2162 }, { "batch_size": 4, "epoch": 0.8648, "step": 2162, "tokens_per_device": 2908 }, { "epoch": 0.8648, "loss_ce": 0.27672216296195984, "loss_lvr": 0.8642235398292542, "loss_mode_switch": 0.0, "loss_total": 0.36314451694488525, "step": 2162 }, { "batch_size": 1, "epoch": 0.8648, "step": 2162, "tokens_per_device": 5085 }, { "epoch": 0.8648, "loss_ce": 0.0013572423486039042, "loss_lvr": 0.5985627770423889, "loss_mode_switch": 0.0, "loss_total": 0.06121351942420006, "step": 2162 }, { "batch_size": 4, "epoch": 0.8648, "step": 2162, "tokens_per_device": 5032 }, { "epoch": 0.8648, "loss_ce": 0.1015377938747406, "loss_lvr": 0.8800550103187561, "loss_mode_switch": 0.0, "loss_total": 0.18954330682754517, "step": 2162 }, { "epoch": 0.8652, "grad_norm": 1.2078325748443604, "learning_rate": 4.689935030295717e-07, "loss": 0.2758, "step": 2163 }, { "batch_size": 4, "epoch": 0.8652, "step": 2163, "tokens_per_device": 4464 }, { "epoch": 0.8652, "loss_ce": 0.3688211441040039, "loss_lvr": 0.8076008558273315, "loss_mode_switch": 0.0, "loss_total": 0.44958123564720154, "step": 2163 }, { "batch_size": 4, "epoch": 0.8652, "step": 2163, "tokens_per_device": 5344 }, { "epoch": 0.8652, "loss_ce": 0.03552506864070892, "loss_lvr": 0.4982154071331024, "loss_mode_switch": 0.0, "loss_total": 0.08534660935401917, "step": 2163 }, { "batch_size": 1, "epoch": 0.8652, "step": 2163, "tokens_per_device": 5023 }, { "epoch": 0.8652, "loss_ce": 0.05885020270943642, "loss_lvr": 0.36272475123405457, "loss_mode_switch": 0.0, "loss_total": 0.09512268006801605, "step": 2163 }, { "batch_size": 1, "epoch": 0.8652, "step": 2163, "tokens_per_device": 4899 }, { "epoch": 0.8652, "loss_ce": 0.001428998657502234, "loss_lvr": 0.4068765342235565, "loss_mode_switch": 0.0, "loss_total": 0.04211665317416191, "step": 2163 }, { "batch_size": 1, "epoch": 0.8652, "step": 2163, "tokens_per_device": 5089 }, { "epoch": 0.8652, "loss_ce": 0.7959839701652527, "loss_lvr": 0.377529501914978, "loss_mode_switch": 0.0, "loss_total": 0.8337368965148926, "step": 2163 }, { "batch_size": 1, "epoch": 0.8652, "step": 2163, "tokens_per_device": 4930 }, { "epoch": 0.8652, "loss_ce": 0.130415141582489, "loss_lvr": 0.6019369959831238, "loss_mode_switch": 0.0, "loss_total": 0.19060884416103363, "step": 2163 }, { "batch_size": 4, "epoch": 0.8652, "step": 2163, "tokens_per_device": 4544 }, { "epoch": 0.8652, "loss_ce": 0.06495097279548645, "loss_lvr": 0.7901871800422668, "loss_mode_switch": 0.0, "loss_total": 0.14396968483924866, "step": 2163 }, { "batch_size": 4, "epoch": 0.8652, "step": 2163, "tokens_per_device": 5196 }, { "epoch": 0.8652, "loss_ce": 0.22338154911994934, "loss_lvr": 0.6638411283493042, "loss_mode_switch": 0.0, "loss_total": 0.2897656559944153, "step": 2163 }, { "epoch": 0.8656, "grad_norm": 1.3572229146957397, "learning_rate": 4.662583130258796e-07, "loss": 0.2836, "step": 2164 }, { "batch_size": 1, "epoch": 0.8656, "step": 2164, "tokens_per_device": 5074 }, { "epoch": 0.8656, "loss_ce": 0.4165053367614746, "loss_lvr": 0.4756010174751282, "loss_mode_switch": 0.0, "loss_total": 0.46406543254852295, "step": 2164 }, { "batch_size": 4, "epoch": 0.8656, "step": 2164, "tokens_per_device": 11088 }, { "epoch": 0.8656, "loss_ce": 0.14941619336605072, "loss_lvr": 0.7104212045669556, "loss_mode_switch": 0.0, "loss_total": 0.22045831382274628, "step": 2164 }, { "batch_size": 1, "epoch": 0.8656, "step": 2164, "tokens_per_device": 4888 }, { "epoch": 0.8656, "loss_ce": 0.02967650070786476, "loss_lvr": 0.6221117377281189, "loss_mode_switch": 0.0, "loss_total": 0.09188767522573471, "step": 2164 }, { "batch_size": 4, "epoch": 0.8656, "step": 2164, "tokens_per_device": 4276 }, { "epoch": 0.8656, "loss_ce": 0.25589630007743835, "loss_lvr": 0.63680100440979, "loss_mode_switch": 0.0, "loss_total": 0.3195764124393463, "step": 2164 }, { "batch_size": 4, "epoch": 0.8656, "step": 2164, "tokens_per_device": 4508 }, { "epoch": 0.8656, "loss_ce": 0.13387377560138702, "loss_lvr": 0.5871856808662415, "loss_mode_switch": 0.0, "loss_total": 0.19259235262870789, "step": 2164 }, { "batch_size": 4, "epoch": 0.8656, "step": 2164, "tokens_per_device": 1344 }, { "epoch": 0.8656, "loss_ce": 0.5929414629936218, "loss_lvr": 1.04102623462677, "loss_mode_switch": 0.0, "loss_total": 0.6970440745353699, "step": 2164 }, { "batch_size": 4, "epoch": 0.8656, "step": 2164, "tokens_per_device": 5640 }, { "epoch": 0.8656, "loss_ce": 0.5090205669403076, "loss_lvr": 0.7266040444374084, "loss_mode_switch": 0.0, "loss_total": 0.581680953502655, "step": 2164 }, { "batch_size": 4, "epoch": 0.8656, "step": 2164, "tokens_per_device": 15996 }, { "epoch": 0.8656, "loss_ce": 0.45988136529922485, "loss_lvr": 0.9456321001052856, "loss_mode_switch": 0.0, "loss_total": 0.5544445514678955, "step": 2164 }, { "epoch": 0.866, "grad_norm": 1.3667206764221191, "learning_rate": 4.6353073211656886e-07, "loss": 0.2989, "step": 2165 }, { "batch_size": 1, "epoch": 0.866, "step": 2165, "tokens_per_device": 4680 }, { "epoch": 0.866, "loss_ce": 0.05854630470275879, "loss_lvr": 0.37692776322364807, "loss_mode_switch": 0.0, "loss_total": 0.09623908251523972, "step": 2165 }, { "batch_size": 4, "epoch": 0.866, "step": 2165, "tokens_per_device": 4652 }, { "epoch": 0.866, "loss_ce": 0.091764897108078, "loss_lvr": 1.1777201890945435, "loss_mode_switch": 0.0, "loss_total": 0.20953691005706787, "step": 2165 }, { "batch_size": 4, "epoch": 0.866, "step": 2165, "tokens_per_device": 4204 }, { "epoch": 0.866, "loss_ce": 0.15869079530239105, "loss_lvr": 0.8469219207763672, "loss_mode_switch": 0.0, "loss_total": 0.24338299036026, "step": 2165 }, { "batch_size": 4, "epoch": 0.866, "step": 2165, "tokens_per_device": 4608 }, { "epoch": 0.866, "loss_ce": 0.34360116720199585, "loss_lvr": 0.686766505241394, "loss_mode_switch": 0.0, "loss_total": 0.41227781772613525, "step": 2165 }, { "batch_size": 1, "epoch": 0.866, "step": 2165, "tokens_per_device": 4926 }, { "epoch": 0.866, "loss_ce": 0.9493526220321655, "loss_lvr": 0.2850302755832672, "loss_mode_switch": 0.0, "loss_total": 0.9778556227684021, "step": 2165 }, { "batch_size": 4, "epoch": 0.866, "step": 2165, "tokens_per_device": 4680 }, { "epoch": 0.866, "loss_ce": 0.452117919921875, "loss_lvr": 0.8304868340492249, "loss_mode_switch": 0.0, "loss_total": 0.5351666212081909, "step": 2165 }, { "batch_size": 4, "epoch": 0.866, "step": 2165, "tokens_per_device": 4728 }, { "epoch": 0.866, "loss_ce": 0.3443095088005066, "loss_lvr": 0.7801008224487305, "loss_mode_switch": 0.0, "loss_total": 0.42231959104537964, "step": 2165 }, { "batch_size": 4, "epoch": 0.866, "step": 2165, "tokens_per_device": 1368 }, { "epoch": 0.866, "loss_ce": 0.3893642723560333, "loss_lvr": 1.0590938329696655, "loss_mode_switch": 0.0, "loss_total": 0.4952736496925354, "step": 2165 }, { "epoch": 0.8664, "grad_norm": 1.3890044689178467, "learning_rate": 4.608107648794091e-07, "loss": 0.3108, "step": 2166 }, { "batch_size": 4, "epoch": 0.8664, "step": 2166, "tokens_per_device": 2628 }, { "epoch": 0.8664, "loss_ce": 0.22460371255874634, "loss_lvr": 0.8368207812309265, "loss_mode_switch": 0.0, "loss_total": 0.30828580260276794, "step": 2166 }, { "batch_size": 1, "epoch": 0.8664, "step": 2166, "tokens_per_device": 4890 }, { "epoch": 0.8664, "loss_ce": 0.02436424419283867, "loss_lvr": 0.4227994680404663, "loss_mode_switch": 0.0, "loss_total": 0.06664419174194336, "step": 2166 }, { "batch_size": 1, "epoch": 0.8664, "step": 2166, "tokens_per_device": 5506 }, { "epoch": 0.8664, "loss_ce": 0.0029877054039388895, "loss_lvr": 0.5526809692382812, "loss_mode_switch": 0.0, "loss_total": 0.05825580283999443, "step": 2166 }, { "batch_size": 1, "epoch": 0.8664, "step": 2166, "tokens_per_device": 5126 }, { "epoch": 0.8664, "loss_ce": 0.09609439224004745, "loss_lvr": 0.41753360629081726, "loss_mode_switch": 0.0, "loss_total": 0.13784775137901306, "step": 2166 }, { "batch_size": 1, "epoch": 0.8664, "step": 2166, "tokens_per_device": 5090 }, { "epoch": 0.8664, "loss_ce": 0.08178389072418213, "loss_lvr": 0.5159395933151245, "loss_mode_switch": 0.0, "loss_total": 0.13337785005569458, "step": 2166 }, { "batch_size": 4, "epoch": 0.8664, "step": 2166, "tokens_per_device": 1832 }, { "epoch": 0.8664, "loss_ce": 0.25056278705596924, "loss_lvr": 0.8758473992347717, "loss_mode_switch": 0.0, "loss_total": 0.33814752101898193, "step": 2166 }, { "batch_size": 4, "epoch": 0.8664, "step": 2166, "tokens_per_device": 7676 }, { "epoch": 0.8664, "loss_ce": 0.27632150053977966, "loss_lvr": 0.44683149456977844, "loss_mode_switch": 0.0, "loss_total": 0.3210046589374542, "step": 2166 }, { "batch_size": 4, "epoch": 0.8664, "step": 2166, "tokens_per_device": 4000 }, { "epoch": 0.8664, "loss_ce": 0.21767376363277435, "loss_lvr": 0.8270043134689331, "loss_mode_switch": 0.0, "loss_total": 0.30037420988082886, "step": 2166 }, { "epoch": 0.8668, "grad_norm": 1.2247651815414429, "learning_rate": 4.580984158793894e-07, "loss": 0.255, "step": 2167 }, { "batch_size": 4, "epoch": 0.8668, "step": 2167, "tokens_per_device": 4020 }, { "epoch": 0.8668, "loss_ce": 0.005646505858749151, "loss_lvr": 1.2334372997283936, "loss_mode_switch": 0.0, "loss_total": 0.12899023294448853, "step": 2167 }, { "batch_size": 4, "epoch": 0.8668, "step": 2167, "tokens_per_device": 1540 }, { "epoch": 0.8668, "loss_ce": 0.1255551427602768, "loss_lvr": 1.0929919481277466, "loss_mode_switch": 0.0, "loss_total": 0.2348543405532837, "step": 2167 }, { "batch_size": 4, "epoch": 0.8668, "step": 2167, "tokens_per_device": 4192 }, { "epoch": 0.8668, "loss_ce": 0.11706022918224335, "loss_lvr": 0.8600692749023438, "loss_mode_switch": 0.0, "loss_total": 0.20306715369224548, "step": 2167 }, { "batch_size": 4, "epoch": 0.8668, "step": 2167, "tokens_per_device": 2628 }, { "epoch": 0.8668, "loss_ce": 0.7222187519073486, "loss_lvr": 0.8084942102432251, "loss_mode_switch": 0.0, "loss_total": 0.8030681610107422, "step": 2167 }, { "batch_size": 4, "epoch": 0.8668, "step": 2167, "tokens_per_device": 4184 }, { "epoch": 0.8668, "loss_ce": 0.1466570645570755, "loss_lvr": 0.6387372016906738, "loss_mode_switch": 0.0, "loss_total": 0.21053078770637512, "step": 2167 }, { "batch_size": 4, "epoch": 0.8668, "step": 2167, "tokens_per_device": 4908 }, { "epoch": 0.8668, "loss_ce": 0.1306232213973999, "loss_lvr": 0.8367546200752258, "loss_mode_switch": 0.0, "loss_total": 0.21429869532585144, "step": 2167 }, { "batch_size": 4, "epoch": 0.8668, "step": 2167, "tokens_per_device": 3784 }, { "epoch": 0.8668, "loss_ce": 0.08029574155807495, "loss_lvr": 0.749113917350769, "loss_mode_switch": 0.0, "loss_total": 0.15520712733268738, "step": 2167 }, { "batch_size": 4, "epoch": 0.8668, "step": 2167, "tokens_per_device": 4000 }, { "epoch": 0.8668, "loss_ce": 0.04604202136397362, "loss_lvr": 0.6124071478843689, "loss_mode_switch": 0.0, "loss_total": 0.10728273540735245, "step": 2167 }, { "epoch": 0.8672, "grad_norm": 1.3106603622436523, "learning_rate": 4.553936896687161e-07, "loss": 0.2889, "step": 2168 }, { "batch_size": 4, "epoch": 0.8672, "step": 2168, "tokens_per_device": 3852 }, { "epoch": 0.8672, "loss_ce": 0.3979080617427826, "loss_lvr": 0.9942672252655029, "loss_mode_switch": 0.0, "loss_total": 0.4973347783088684, "step": 2168 }, { "batch_size": 1, "epoch": 0.8672, "step": 2168, "tokens_per_device": 5953 }, { "epoch": 0.8672, "loss_ce": 0.0033946128096431494, "loss_lvr": 0.4366239309310913, "loss_mode_switch": 0.0, "loss_total": 0.047057006508111954, "step": 2168 }, { "batch_size": 1, "epoch": 0.8672, "step": 2168, "tokens_per_device": 5248 }, { "epoch": 0.8672, "loss_ce": 0.00296511291526258, "loss_lvr": 0.4179501235485077, "loss_mode_switch": 0.0, "loss_total": 0.044760122895240784, "step": 2168 }, { "batch_size": 4, "epoch": 0.8672, "step": 2168, "tokens_per_device": 4316 }, { "epoch": 0.8672, "loss_ce": 0.023158174008131027, "loss_lvr": 0.8842415809631348, "loss_mode_switch": 0.0, "loss_total": 0.11158233880996704, "step": 2168 }, { "batch_size": 4, "epoch": 0.8672, "step": 2168, "tokens_per_device": 3672 }, { "epoch": 0.8672, "loss_ce": 0.16414877772331238, "loss_lvr": 1.2571583986282349, "loss_mode_switch": 0.0, "loss_total": 0.28986459970474243, "step": 2168 }, { "batch_size": 4, "epoch": 0.8672, "step": 2168, "tokens_per_device": 1212 }, { "epoch": 0.8672, "loss_ce": 0.3202076256275177, "loss_lvr": 0.9784512519836426, "loss_mode_switch": 0.0, "loss_total": 0.4180527627468109, "step": 2168 }, { "batch_size": 4, "epoch": 0.8672, "step": 2168, "tokens_per_device": 5748 }, { "epoch": 0.8672, "loss_ce": 0.3045147955417633, "loss_lvr": 0.8673809170722961, "loss_mode_switch": 0.0, "loss_total": 0.39125287532806396, "step": 2168 }, { "batch_size": 1, "epoch": 0.8672, "step": 2168, "tokens_per_device": 5451 }, { "epoch": 0.8672, "loss_ce": 0.0028738968539983034, "loss_lvr": 0.4100615084171295, "loss_mode_switch": 0.0, "loss_total": 0.043880049139261246, "step": 2168 }, { "epoch": 0.8676, "grad_norm": 1.2030247449874878, "learning_rate": 4.5269659078679973e-07, "loss": 0.2768, "step": 2169 }, { "batch_size": 1, "epoch": 0.8676, "step": 2169, "tokens_per_device": 5116 }, { "epoch": 0.8676, "loss_ce": 0.0009151287958957255, "loss_lvr": 0.6377971172332764, "loss_mode_switch": 0.0, "loss_total": 0.064694844186306, "step": 2169 }, { "batch_size": 4, "epoch": 0.8676, "step": 2169, "tokens_per_device": 3900 }, { "epoch": 0.8676, "loss_ce": 0.19477947056293488, "loss_lvr": 0.89228755235672, "loss_mode_switch": 0.0, "loss_total": 0.2840082347393036, "step": 2169 }, { "batch_size": 4, "epoch": 0.8676, "step": 2169, "tokens_per_device": 3008 }, { "epoch": 0.8676, "loss_ce": 0.05618526414036751, "loss_lvr": 0.8080016374588013, "loss_mode_switch": 0.0, "loss_total": 0.1369854360818863, "step": 2169 }, { "batch_size": 1, "epoch": 0.8676, "step": 2169, "tokens_per_device": 5102 }, { "epoch": 0.8676, "loss_ce": 0.0019265145529061556, "loss_lvr": 0.31214505434036255, "loss_mode_switch": 0.0, "loss_total": 0.03314102068543434, "step": 2169 }, { "batch_size": 1, "epoch": 0.8676, "step": 2169, "tokens_per_device": 4878 }, { "epoch": 0.8676, "loss_ce": 0.027649687603116035, "loss_lvr": 0.19895637035369873, "loss_mode_switch": 0.0, "loss_total": 0.04754532501101494, "step": 2169 }, { "batch_size": 1, "epoch": 0.8676, "step": 2169, "tokens_per_device": 4774 }, { "epoch": 0.8676, "loss_ce": 0.0007483621593564749, "loss_lvr": 0.18411758542060852, "loss_mode_switch": 0.0, "loss_total": 0.01916012167930603, "step": 2169 }, { "batch_size": 4, "epoch": 0.8676, "step": 2169, "tokens_per_device": 4256 }, { "epoch": 0.8676, "loss_ce": 0.2970998287200928, "loss_lvr": 0.8370612859725952, "loss_mode_switch": 0.0, "loss_total": 0.38080596923828125, "step": 2169 }, { "batch_size": 4, "epoch": 0.8676, "step": 2169, "tokens_per_device": 5032 }, { "epoch": 0.8676, "loss_ce": 0.14776049554347992, "loss_lvr": 0.6060640811920166, "loss_mode_switch": 0.0, "loss_total": 0.20836690068244934, "step": 2169 }, { "epoch": 0.868, "grad_norm": 1.316341757774353, "learning_rate": 4.5000712376024826e-07, "loss": 0.2635, "step": 2170 }, { "batch_size": 1, "epoch": 0.868, "step": 2170, "tokens_per_device": 4908 }, { "epoch": 0.868, "loss_ce": 0.8665716648101807, "loss_lvr": 0.4571587145328522, "loss_mode_switch": 0.0, "loss_total": 0.9122875332832336, "step": 2170 }, { "batch_size": 4, "epoch": 0.868, "step": 2170, "tokens_per_device": 1368 }, { "epoch": 0.868, "loss_ce": 0.14127454161643982, "loss_lvr": 0.8594370484352112, "loss_mode_switch": 0.0, "loss_total": 0.22721824049949646, "step": 2170 }, { "batch_size": 1, "epoch": 0.868, "step": 2170, "tokens_per_device": 5215 }, { "epoch": 0.868, "loss_ce": 0.01481354795396328, "loss_lvr": 0.29796046018600464, "loss_mode_switch": 0.0, "loss_total": 0.04460959509015083, "step": 2170 }, { "batch_size": 4, "epoch": 0.868, "step": 2170, "tokens_per_device": 2464 }, { "epoch": 0.868, "loss_ce": 0.11394234746694565, "loss_lvr": 1.0691922903060913, "loss_mode_switch": 0.0, "loss_total": 0.22086158394813538, "step": 2170 }, { "batch_size": 4, "epoch": 0.868, "step": 2170, "tokens_per_device": 4556 }, { "epoch": 0.868, "loss_ce": 0.7596070766448975, "loss_lvr": 0.63231360912323, "loss_mode_switch": 0.0, "loss_total": 0.8228384256362915, "step": 2170 }, { "batch_size": 1, "epoch": 0.868, "step": 2170, "tokens_per_device": 5183 }, { "epoch": 0.868, "loss_ce": 0.13759176433086395, "loss_lvr": 0.410573810338974, "loss_mode_switch": 0.0, "loss_total": 0.1786491423845291, "step": 2170 }, { "batch_size": 4, "epoch": 0.868, "step": 2170, "tokens_per_device": 3788 }, { "epoch": 0.868, "loss_ce": 0.3157521188259125, "loss_lvr": 0.9823263883590698, "loss_mode_switch": 0.0, "loss_total": 0.4139847755432129, "step": 2170 }, { "batch_size": 1, "epoch": 0.868, "step": 2170, "tokens_per_device": 5108 }, { "epoch": 0.868, "loss_ce": 0.06587187200784683, "loss_lvr": 0.6088597178459167, "loss_mode_switch": 0.0, "loss_total": 0.12675784528255463, "step": 2170 }, { "epoch": 0.8684, "grad_norm": 1.433426856994629, "learning_rate": 4.473252931028643e-07, "loss": 0.303, "step": 2171 }, { "batch_size": 4, "epoch": 0.8684, "step": 2171, "tokens_per_device": 3768 }, { "epoch": 0.8684, "loss_ce": 0.28649085760116577, "loss_lvr": 0.9984471797943115, "loss_mode_switch": 0.0, "loss_total": 0.3863355815410614, "step": 2171 }, { "batch_size": 4, "epoch": 0.8684, "step": 2171, "tokens_per_device": 5700 }, { "epoch": 0.8684, "loss_ce": 0.006639636587351561, "loss_lvr": 0.8409169912338257, "loss_mode_switch": 0.0, "loss_total": 0.09073133766651154, "step": 2171 }, { "batch_size": 4, "epoch": 0.8684, "step": 2171, "tokens_per_device": 5888 }, { "epoch": 0.8684, "loss_ce": 0.043428968638181686, "loss_lvr": 0.7036656141281128, "loss_mode_switch": 0.0, "loss_total": 0.11379553377628326, "step": 2171 }, { "batch_size": 4, "epoch": 0.8684, "step": 2171, "tokens_per_device": 3864 }, { "epoch": 0.8684, "loss_ce": 0.4240550696849823, "loss_lvr": 0.8838033080101013, "loss_mode_switch": 0.0, "loss_total": 0.5124353766441345, "step": 2171 }, { "batch_size": 4, "epoch": 0.8684, "step": 2171, "tokens_per_device": 5984 }, { "epoch": 0.8684, "loss_ce": 0.6820034384727478, "loss_lvr": 0.7506077289581299, "loss_mode_switch": 0.0, "loss_total": 0.7570642232894897, "step": 2171 }, { "batch_size": 4, "epoch": 0.8684, "step": 2171, "tokens_per_device": 4312 }, { "epoch": 0.8684, "loss_ce": 0.27485761046409607, "loss_lvr": 0.45600205659866333, "loss_mode_switch": 0.0, "loss_total": 0.3204578161239624, "step": 2171 }, { "batch_size": 1, "epoch": 0.8684, "step": 2171, "tokens_per_device": 4878 }, { "epoch": 0.8684, "loss_ce": 0.006611066870391369, "loss_lvr": 0.30460116267204285, "loss_mode_switch": 0.0, "loss_total": 0.03707118332386017, "step": 2171 }, { "batch_size": 1, "epoch": 0.8684, "step": 2171, "tokens_per_device": 4894 }, { "epoch": 0.8684, "loss_ce": 0.0017271219985559583, "loss_lvr": 0.9302432537078857, "loss_mode_switch": 0.0, "loss_total": 0.09475144743919373, "step": 2171 }, { "epoch": 0.8688, "grad_norm": 1.3199957609176636, "learning_rate": 4.446511033156337e-07, "loss": 0.31, "step": 2172 }, { "batch_size": 4, "epoch": 0.8688, "step": 2172, "tokens_per_device": 8052 }, { "epoch": 0.8688, "loss_ce": 0.13186247646808624, "loss_lvr": 0.5965120196342468, "loss_mode_switch": 0.0, "loss_total": 0.19151368737220764, "step": 2172 }, { "batch_size": 4, "epoch": 0.8688, "step": 2172, "tokens_per_device": 3776 }, { "epoch": 0.8688, "loss_ce": 0.5071892142295837, "loss_lvr": 0.6793511509895325, "loss_mode_switch": 0.0, "loss_total": 0.5751243233680725, "step": 2172 }, { "batch_size": 4, "epoch": 0.8688, "step": 2172, "tokens_per_device": 4132 }, { "epoch": 0.8688, "loss_ce": 0.23848995566368103, "loss_lvr": 0.8466440439224243, "loss_mode_switch": 0.0, "loss_total": 0.32315436005592346, "step": 2172 }, { "batch_size": 4, "epoch": 0.8688, "step": 2172, "tokens_per_device": 4804 }, { "epoch": 0.8688, "loss_ce": 0.3281380236148834, "loss_lvr": 0.7678821086883545, "loss_mode_switch": 0.0, "loss_total": 0.40492624044418335, "step": 2172 }, { "batch_size": 4, "epoch": 0.8688, "step": 2172, "tokens_per_device": 3812 }, { "epoch": 0.8688, "loss_ce": 0.17378054559230804, "loss_lvr": 0.8302383422851562, "loss_mode_switch": 0.0, "loss_total": 0.25680437684059143, "step": 2172 }, { "batch_size": 4, "epoch": 0.8688, "step": 2172, "tokens_per_device": 2024 }, { "epoch": 0.8688, "loss_ce": 0.2826850712299347, "loss_lvr": 1.2756785154342651, "loss_mode_switch": 0.0, "loss_total": 0.4102529287338257, "step": 2172 }, { "batch_size": 4, "epoch": 0.8688, "step": 2172, "tokens_per_device": 4264 }, { "epoch": 0.8688, "loss_ce": 0.12800763547420502, "loss_lvr": 0.8591978549957275, "loss_mode_switch": 0.0, "loss_total": 0.21392741799354553, "step": 2172 }, { "batch_size": 4, "epoch": 0.8688, "step": 2172, "tokens_per_device": 1256 }, { "epoch": 0.8688, "loss_ce": 0.195652574300766, "loss_lvr": 1.064037561416626, "loss_mode_switch": 0.0, "loss_total": 0.30205634236335754, "step": 2172 }, { "epoch": 0.8692, "grad_norm": 1.4123810529708862, "learning_rate": 4.419845588867161e-07, "loss": 0.2418, "step": 2173 }, { "batch_size": 4, "epoch": 0.8692, "step": 2173, "tokens_per_device": 4204 }, { "epoch": 0.8692, "loss_ce": 0.2914606034755707, "loss_lvr": 0.6668355464935303, "loss_mode_switch": 0.0, "loss_total": 0.3581441640853882, "step": 2173 }, { "batch_size": 4, "epoch": 0.8692, "step": 2173, "tokens_per_device": 3224 }, { "epoch": 0.8692, "loss_ce": 0.15802104771137238, "loss_lvr": 0.8166390657424927, "loss_mode_switch": 0.0, "loss_total": 0.23968495428562164, "step": 2173 }, { "batch_size": 4, "epoch": 0.8692, "step": 2173, "tokens_per_device": 1316 }, { "epoch": 0.8692, "loss_ce": 0.06450127065181732, "loss_lvr": 0.9758970141410828, "loss_mode_switch": 0.0, "loss_total": 0.1620909720659256, "step": 2173 }, { "batch_size": 4, "epoch": 0.8692, "step": 2173, "tokens_per_device": 14864 }, { "epoch": 0.8692, "loss_ce": 0.1601235270500183, "loss_lvr": 0.6347091794013977, "loss_mode_switch": 0.0, "loss_total": 0.22359445691108704, "step": 2173 }, { "batch_size": 4, "epoch": 0.8692, "step": 2173, "tokens_per_device": 11148 }, { "epoch": 0.8692, "loss_ce": 0.0014589637285098433, "loss_lvr": 0.6617933511734009, "loss_mode_switch": 0.0, "loss_total": 0.06763830035924911, "step": 2173 }, { "batch_size": 1, "epoch": 0.8692, "step": 2173, "tokens_per_device": 5121 }, { "epoch": 0.8692, "loss_ce": 0.5524352788925171, "loss_lvr": 0.545964777469635, "loss_mode_switch": 0.0, "loss_total": 0.6070317625999451, "step": 2173 }, { "batch_size": 4, "epoch": 0.8692, "step": 2173, "tokens_per_device": 3756 }, { "epoch": 0.8692, "loss_ce": 0.09492941200733185, "loss_lvr": 0.6908406019210815, "loss_mode_switch": 0.0, "loss_total": 0.16401347517967224, "step": 2173 }, { "batch_size": 1, "epoch": 0.8692, "step": 2173, "tokens_per_device": 6658 }, { "epoch": 0.8692, "loss_ce": 0.06630142778158188, "loss_lvr": 0.31944504380226135, "loss_mode_switch": 0.0, "loss_total": 0.09824593365192413, "step": 2173 }, { "epoch": 0.8696, "grad_norm": 1.3026645183563232, "learning_rate": 4.3932566429144175e-07, "loss": 0.2386, "step": 2174 }, { "batch_size": 4, "epoch": 0.8696, "step": 2174, "tokens_per_device": 3440 }, { "epoch": 0.8696, "loss_ce": 0.39165011048316956, "loss_lvr": 0.5988928079605103, "loss_mode_switch": 0.0, "loss_total": 0.45153939723968506, "step": 2174 }, { "batch_size": 1, "epoch": 0.8696, "step": 2174, "tokens_per_device": 5130 }, { "epoch": 0.8696, "loss_ce": 0.06707064062356949, "loss_lvr": 0.4017525315284729, "loss_mode_switch": 0.0, "loss_total": 0.10724589228630066, "step": 2174 }, { "batch_size": 4, "epoch": 0.8696, "step": 2174, "tokens_per_device": 3916 }, { "epoch": 0.8696, "loss_ce": 0.3050391674041748, "loss_lvr": 0.8585315942764282, "loss_mode_switch": 0.0, "loss_total": 0.3908923268318176, "step": 2174 }, { "batch_size": 4, "epoch": 0.8696, "step": 2174, "tokens_per_device": 7044 }, { "epoch": 0.8696, "loss_ce": 0.010032855905592442, "loss_lvr": 0.9565370082855225, "loss_mode_switch": 0.0, "loss_total": 0.10568656027317047, "step": 2174 }, { "batch_size": 4, "epoch": 0.8696, "step": 2174, "tokens_per_device": 5196 }, { "epoch": 0.8696, "loss_ce": 0.2152615189552307, "loss_lvr": 0.7091903686523438, "loss_mode_switch": 0.0, "loss_total": 0.2861805558204651, "step": 2174 }, { "batch_size": 1, "epoch": 0.8696, "step": 2174, "tokens_per_device": 5001 }, { "epoch": 0.8696, "loss_ce": 0.03353545442223549, "loss_lvr": 0.7626256942749023, "loss_mode_switch": 0.0, "loss_total": 0.10979802906513214, "step": 2174 }, { "batch_size": 4, "epoch": 0.8696, "step": 2174, "tokens_per_device": 1408 }, { "epoch": 0.8696, "loss_ce": 0.41038256883621216, "loss_lvr": 1.1425189971923828, "loss_mode_switch": 0.0, "loss_total": 0.5246344804763794, "step": 2174 }, { "batch_size": 4, "epoch": 0.8696, "step": 2174, "tokens_per_device": 2460 }, { "epoch": 0.8696, "loss_ce": 0.038565218448638916, "loss_lvr": 0.8213012218475342, "loss_mode_switch": 0.0, "loss_total": 0.12069534510374069, "step": 2174 }, { "epoch": 0.87, "grad_norm": 1.2958632707595825, "learning_rate": 4.3667442399229985e-07, "loss": 0.2945, "step": 2175 }, { "batch_size": 1, "epoch": 0.87, "step": 2175, "tokens_per_device": 4862 }, { "epoch": 0.87, "loss_ce": 0.0031892939005047083, "loss_lvr": 0.5082661509513855, "loss_mode_switch": 0.0, "loss_total": 0.05401591211557388, "step": 2175 }, { "batch_size": 4, "epoch": 0.87, "step": 2175, "tokens_per_device": 10216 }, { "epoch": 0.87, "loss_ce": 0.45754051208496094, "loss_lvr": 0.7099864482879639, "loss_mode_switch": 0.0, "loss_total": 0.5285391807556152, "step": 2175 }, { "batch_size": 4, "epoch": 0.87, "step": 2175, "tokens_per_device": 2868 }, { "epoch": 0.87, "loss_ce": 0.5096787214279175, "loss_lvr": 0.9213400483131409, "loss_mode_switch": 0.0, "loss_total": 0.6018127202987671, "step": 2175 }, { "batch_size": 1, "epoch": 0.87, "step": 2175, "tokens_per_device": 6446 }, { "epoch": 0.87, "loss_ce": 0.006030504126101732, "loss_lvr": 0.3214142322540283, "loss_mode_switch": 0.0, "loss_total": 0.038171928375959396, "step": 2175 }, { "batch_size": 4, "epoch": 0.87, "step": 2175, "tokens_per_device": 4616 }, { "epoch": 0.87, "loss_ce": 0.28411680459976196, "loss_lvr": 0.7595223784446716, "loss_mode_switch": 0.0, "loss_total": 0.36006903648376465, "step": 2175 }, { "batch_size": 4, "epoch": 0.87, "step": 2175, "tokens_per_device": 6108 }, { "epoch": 0.87, "loss_ce": 0.06969576328992844, "loss_lvr": 0.5979243516921997, "loss_mode_switch": 0.0, "loss_total": 0.12948819994926453, "step": 2175 }, { "batch_size": 4, "epoch": 0.87, "step": 2175, "tokens_per_device": 2676 }, { "epoch": 0.87, "loss_ce": 0.6128731369972229, "loss_lvr": 0.8396062850952148, "loss_mode_switch": 0.0, "loss_total": 0.6968337893486023, "step": 2175 }, { "batch_size": 1, "epoch": 0.87, "step": 2175, "tokens_per_device": 4948 }, { "epoch": 0.87, "loss_ce": 0.0030569357331842184, "loss_lvr": 0.14807076752185822, "loss_mode_switch": 0.0, "loss_total": 0.017864013090729713, "step": 2175 }, { "epoch": 0.8704, "grad_norm": 1.2947747707366943, "learning_rate": 4.3403084243893556e-07, "loss": 0.2884, "step": 2176 }, { "batch_size": 1, "epoch": 0.8704, "step": 2176, "tokens_per_device": 4935 }, { "epoch": 0.8704, "loss_ce": 0.16921314597129822, "loss_lvr": 0.39556458592414856, "loss_mode_switch": 0.0, "loss_total": 0.20876960456371307, "step": 2176 }, { "batch_size": 1, "epoch": 0.8704, "step": 2176, "tokens_per_device": 6398 }, { "epoch": 0.8704, "loss_ce": 0.11938836425542831, "loss_lvr": 0.4313465356826782, "loss_mode_switch": 0.0, "loss_total": 0.16252301633358002, "step": 2176 }, { "batch_size": 4, "epoch": 0.8704, "step": 2176, "tokens_per_device": 7160 }, { "epoch": 0.8704, "loss_ce": 0.3591398596763611, "loss_lvr": 0.3336597979068756, "loss_mode_switch": 0.0, "loss_total": 0.39250582456588745, "step": 2176 }, { "batch_size": 1, "epoch": 0.8704, "step": 2176, "tokens_per_device": 4814 }, { "epoch": 0.8704, "loss_ce": 0.007275203242897987, "loss_lvr": 0.15277904272079468, "loss_mode_switch": 0.0, "loss_total": 0.022553108632564545, "step": 2176 }, { "batch_size": 1, "epoch": 0.8704, "step": 2176, "tokens_per_device": 4969 }, { "epoch": 0.8704, "loss_ce": 0.0006459529977291822, "loss_lvr": 0.3347944915294647, "loss_mode_switch": 0.0, "loss_total": 0.03412540256977081, "step": 2176 }, { "batch_size": 1, "epoch": 0.8704, "step": 2176, "tokens_per_device": 5122 }, { "epoch": 0.8704, "loss_ce": 0.006535834167152643, "loss_lvr": 0.5784701704978943, "loss_mode_switch": 0.0, "loss_total": 0.06438285112380981, "step": 2176 }, { "batch_size": 1, "epoch": 0.8704, "step": 2176, "tokens_per_device": 4889 }, { "epoch": 0.8704, "loss_ce": 0.0009993037674576044, "loss_lvr": 0.4070911109447479, "loss_mode_switch": 0.0, "loss_total": 0.04170841723680496, "step": 2176 }, { "batch_size": 4, "epoch": 0.8704, "step": 2176, "tokens_per_device": 3756 }, { "epoch": 0.8704, "loss_ce": 0.24244368076324463, "loss_lvr": 0.8557361960411072, "loss_mode_switch": 0.0, "loss_total": 0.32801729440689087, "step": 2176 }, { "epoch": 0.8708, "grad_norm": 1.2534633874893188, "learning_rate": 4.3139492406814086e-07, "loss": 0.2322, "step": 2177 }, { "batch_size": 4, "epoch": 0.8708, "step": 2177, "tokens_per_device": 4208 }, { "epoch": 0.8708, "loss_ce": 0.14439307153224945, "loss_lvr": 0.9002037048339844, "loss_mode_switch": 0.0, "loss_total": 0.23441344499588013, "step": 2177 }, { "batch_size": 4, "epoch": 0.8708, "step": 2177, "tokens_per_device": 1488 }, { "epoch": 0.8708, "loss_ce": 0.5375204086303711, "loss_lvr": 0.84388667345047, "loss_mode_switch": 0.0, "loss_total": 0.6219090819358826, "step": 2177 }, { "batch_size": 1, "epoch": 0.8708, "step": 2177, "tokens_per_device": 5074 }, { "epoch": 0.8708, "loss_ce": 0.015392931178212166, "loss_lvr": 0.3638925552368164, "loss_mode_switch": 0.0, "loss_total": 0.051782190799713135, "step": 2177 }, { "batch_size": 4, "epoch": 0.8708, "step": 2177, "tokens_per_device": 3036 }, { "epoch": 0.8708, "loss_ce": 0.463608980178833, "loss_lvr": 0.8608109951019287, "loss_mode_switch": 0.0, "loss_total": 0.5496900677680969, "step": 2177 }, { "batch_size": 4, "epoch": 0.8708, "step": 2177, "tokens_per_device": 4416 }, { "epoch": 0.8708, "loss_ce": 0.5283764600753784, "loss_lvr": 0.6781628131866455, "loss_mode_switch": 0.0, "loss_total": 0.5961927175521851, "step": 2177 }, { "batch_size": 1, "epoch": 0.8708, "step": 2177, "tokens_per_device": 5103 }, { "epoch": 0.8708, "loss_ce": 0.15868398547172546, "loss_lvr": 0.3024396300315857, "loss_mode_switch": 0.0, "loss_total": 0.18892794847488403, "step": 2177 }, { "batch_size": 4, "epoch": 0.8708, "step": 2177, "tokens_per_device": 6820 }, { "epoch": 0.8708, "loss_ce": 0.23010395467281342, "loss_lvr": 0.5276261568069458, "loss_mode_switch": 0.0, "loss_total": 0.28286656737327576, "step": 2177 }, { "batch_size": 4, "epoch": 0.8708, "step": 2177, "tokens_per_device": 4192 }, { "epoch": 0.8708, "loss_ce": 0.21467946469783783, "loss_lvr": 0.6516140699386597, "loss_mode_switch": 0.0, "loss_total": 0.279840886592865, "step": 2177 }, { "epoch": 0.8712, "grad_norm": 1.2779651880264282, "learning_rate": 4.2876667330384315e-07, "loss": 0.271, "step": 2178 }, { "batch_size": 1, "epoch": 0.8712, "step": 2178, "tokens_per_device": 4822 }, { "epoch": 0.8712, "loss_ce": 0.0977216511964798, "loss_lvr": 0.556032121181488, "loss_mode_switch": 0.0, "loss_total": 0.15332487225532532, "step": 2178 }, { "batch_size": 4, "epoch": 0.8712, "step": 2178, "tokens_per_device": 4256 }, { "epoch": 0.8712, "loss_ce": 0.04062121361494064, "loss_lvr": 0.7557310461997986, "loss_mode_switch": 0.0, "loss_total": 0.11619432270526886, "step": 2178 }, { "batch_size": 1, "epoch": 0.8712, "step": 2178, "tokens_per_device": 5109 }, { "epoch": 0.8712, "loss_ce": 0.258167564868927, "loss_lvr": 0.40305331349372864, "loss_mode_switch": 0.0, "loss_total": 0.29847288131713867, "step": 2178 }, { "batch_size": 1, "epoch": 0.8712, "step": 2178, "tokens_per_device": 5180 }, { "epoch": 0.8712, "loss_ce": 0.011386599391698837, "loss_lvr": 0.12438686937093735, "loss_mode_switch": 0.0, "loss_total": 0.02382528781890869, "step": 2178 }, { "batch_size": 4, "epoch": 0.8712, "step": 2178, "tokens_per_device": 4424 }, { "epoch": 0.8712, "loss_ce": 0.1682964414358139, "loss_lvr": 0.8118026256561279, "loss_mode_switch": 0.0, "loss_total": 0.24947670102119446, "step": 2178 }, { "batch_size": 4, "epoch": 0.8712, "step": 2178, "tokens_per_device": 5316 }, { "epoch": 0.8712, "loss_ce": 0.36010703444480896, "loss_lvr": 0.8968873620033264, "loss_mode_switch": 0.0, "loss_total": 0.44979578256607056, "step": 2178 }, { "batch_size": 4, "epoch": 0.8712, "step": 2178, "tokens_per_device": 5872 }, { "epoch": 0.8712, "loss_ce": 0.12652021646499634, "loss_lvr": 0.8531861901283264, "loss_mode_switch": 0.0, "loss_total": 0.21183884143829346, "step": 2178 }, { "batch_size": 1, "epoch": 0.8712, "step": 2178, "tokens_per_device": 4931 }, { "epoch": 0.8712, "loss_ce": 0.20988906919956207, "loss_lvr": 0.4581800103187561, "loss_mode_switch": 0.0, "loss_total": 0.2557070851325989, "step": 2178 }, { "epoch": 0.8716, "grad_norm": 1.3870556354522705, "learning_rate": 4.261460945571017e-07, "loss": 0.297, "step": 2179 }, { "batch_size": 1, "epoch": 0.8716, "step": 2179, "tokens_per_device": 4896 }, { "epoch": 0.8716, "loss_ce": 0.04705693945288658, "loss_lvr": 0.16487780213356018, "loss_mode_switch": 0.0, "loss_total": 0.06354472041130066, "step": 2179 }, { "batch_size": 1, "epoch": 0.8716, "step": 2179, "tokens_per_device": 4897 }, { "epoch": 0.8716, "loss_ce": 0.012166434898972511, "loss_lvr": 0.42424511909484863, "loss_mode_switch": 0.0, "loss_total": 0.054590947926044464, "step": 2179 }, { "batch_size": 1, "epoch": 0.8716, "step": 2179, "tokens_per_device": 4866 }, { "epoch": 0.8716, "loss_ce": 0.0004783602198585868, "loss_lvr": 0.2338757961988449, "loss_mode_switch": 0.0, "loss_total": 0.023865940049290657, "step": 2179 }, { "batch_size": 4, "epoch": 0.8716, "step": 2179, "tokens_per_device": 4652 }, { "epoch": 0.8716, "loss_ce": 0.12388928234577179, "loss_lvr": 0.8303079605102539, "loss_mode_switch": 0.0, "loss_total": 0.2069200873374939, "step": 2179 }, { "batch_size": 4, "epoch": 0.8716, "step": 2179, "tokens_per_device": 1328 }, { "epoch": 0.8716, "loss_ce": 0.3459881842136383, "loss_lvr": 1.0437954664230347, "loss_mode_switch": 0.0, "loss_total": 0.4503677487373352, "step": 2179 }, { "batch_size": 4, "epoch": 0.8716, "step": 2179, "tokens_per_device": 6552 }, { "epoch": 0.8716, "loss_ce": 0.05645812675356865, "loss_lvr": 0.8008939027786255, "loss_mode_switch": 0.0, "loss_total": 0.1365475207567215, "step": 2179 }, { "batch_size": 4, "epoch": 0.8716, "step": 2179, "tokens_per_device": 4248 }, { "epoch": 0.8716, "loss_ce": 0.03137790784239769, "loss_lvr": 0.6839825510978699, "loss_mode_switch": 0.0, "loss_total": 0.09977616369724274, "step": 2179 }, { "batch_size": 4, "epoch": 0.8716, "step": 2179, "tokens_per_device": 4212 }, { "epoch": 0.8716, "loss_ce": 0.5318422317504883, "loss_lvr": 0.6063458323478699, "loss_mode_switch": 0.0, "loss_total": 0.5924768447875977, "step": 2179 }, { "epoch": 0.872, "grad_norm": 1.2582767009735107, "learning_rate": 4.2353319222610265e-07, "loss": 0.2834, "step": 2180 }, { "batch_size": 4, "epoch": 0.872, "step": 2180, "tokens_per_device": 1652 }, { "epoch": 0.872, "loss_ce": 0.26145827770233154, "loss_lvr": 0.9340364933013916, "loss_mode_switch": 0.0, "loss_total": 0.35486191511154175, "step": 2180 }, { "batch_size": 4, "epoch": 0.872, "step": 2180, "tokens_per_device": 5188 }, { "epoch": 0.872, "loss_ce": 0.1739884912967682, "loss_lvr": 0.8348714113235474, "loss_mode_switch": 0.0, "loss_total": 0.2574756443500519, "step": 2180 }, { "batch_size": 4, "epoch": 0.872, "step": 2180, "tokens_per_device": 8460 }, { "epoch": 0.872, "loss_ce": 0.2335105538368225, "loss_lvr": 0.6792911887168884, "loss_mode_switch": 0.0, "loss_total": 0.30143967270851135, "step": 2180 }, { "batch_size": 1, "epoch": 0.872, "step": 2180, "tokens_per_device": 4596 }, { "epoch": 0.872, "loss_ce": 0.25068098306655884, "loss_lvr": 0.4546092748641968, "loss_mode_switch": 0.0, "loss_total": 0.29614192247390747, "step": 2180 }, { "batch_size": 4, "epoch": 0.872, "step": 2180, "tokens_per_device": 1488 }, { "epoch": 0.872, "loss_ce": 0.4782809913158417, "loss_lvr": 1.1262059211730957, "loss_mode_switch": 0.0, "loss_total": 0.5909016132354736, "step": 2180 }, { "batch_size": 4, "epoch": 0.872, "step": 2180, "tokens_per_device": 1380 }, { "epoch": 0.872, "loss_ce": 0.20597507059574127, "loss_lvr": 1.0260776281356812, "loss_mode_switch": 0.0, "loss_total": 0.3085828423500061, "step": 2180 }, { "batch_size": 4, "epoch": 0.872, "step": 2180, "tokens_per_device": 1964 }, { "epoch": 0.872, "loss_ce": 0.1736634373664856, "loss_lvr": 0.8354251384735107, "loss_mode_switch": 0.0, "loss_total": 0.2572059631347656, "step": 2180 }, { "batch_size": 4, "epoch": 0.872, "step": 2180, "tokens_per_device": 12056 }, { "epoch": 0.872, "loss_ce": 0.47950372099876404, "loss_lvr": 0.365614652633667, "loss_mode_switch": 0.0, "loss_total": 0.5160651803016663, "step": 2180 }, { "epoch": 0.8724, "grad_norm": 1.3787232637405396, "learning_rate": 4.2092797069614667e-07, "loss": 0.2719, "step": 2181 }, { "batch_size": 4, "epoch": 0.8724, "step": 2181, "tokens_per_device": 4088 }, { "epoch": 0.8724, "loss_ce": 0.0761428102850914, "loss_lvr": 0.6435273289680481, "loss_mode_switch": 0.0, "loss_total": 0.14049553871154785, "step": 2181 }, { "batch_size": 4, "epoch": 0.8724, "step": 2181, "tokens_per_device": 5408 }, { "epoch": 0.8724, "loss_ce": 0.3822578191757202, "loss_lvr": 0.8804546594619751, "loss_mode_switch": 0.0, "loss_total": 0.4703032970428467, "step": 2181 }, { "batch_size": 4, "epoch": 0.8724, "step": 2181, "tokens_per_device": 4348 }, { "epoch": 0.8724, "loss_ce": 0.2194548398256302, "loss_lvr": 0.8348596096038818, "loss_mode_switch": 0.0, "loss_total": 0.3029407858848572, "step": 2181 }, { "batch_size": 4, "epoch": 0.8724, "step": 2181, "tokens_per_device": 3904 }, { "epoch": 0.8724, "loss_ce": 0.5558333396911621, "loss_lvr": 0.892541229724884, "loss_mode_switch": 0.0, "loss_total": 0.645087480545044, "step": 2181 }, { "batch_size": 1, "epoch": 0.8724, "step": 2181, "tokens_per_device": 5095 }, { "epoch": 0.8724, "loss_ce": 0.00028328958433121443, "loss_lvr": 0.7420778274536133, "loss_mode_switch": 0.0, "loss_total": 0.07449106872081757, "step": 2181 }, { "batch_size": 4, "epoch": 0.8724, "step": 2181, "tokens_per_device": 13156 }, { "epoch": 0.8724, "loss_ce": 0.1276112049818039, "loss_lvr": 0.43117740750312805, "loss_mode_switch": 0.0, "loss_total": 0.17072895169258118, "step": 2181 }, { "batch_size": 4, "epoch": 0.8724, "step": 2181, "tokens_per_device": 4236 }, { "epoch": 0.8724, "loss_ce": 0.5552061796188354, "loss_lvr": 0.7231960892677307, "loss_mode_switch": 0.0, "loss_total": 0.627525806427002, "step": 2181 }, { "batch_size": 1, "epoch": 0.8724, "step": 2181, "tokens_per_device": 7398 }, { "epoch": 0.8724, "loss_ce": 0.00016079659690149128, "loss_lvr": 0.30886155366897583, "loss_mode_switch": 0.0, "loss_total": 0.031046953052282333, "step": 2181 }, { "epoch": 0.8728, "grad_norm": 1.4522464275360107, "learning_rate": 4.183304343396427e-07, "loss": 0.2926, "step": 2182 }, { "batch_size": 4, "epoch": 0.8728, "step": 2182, "tokens_per_device": 14156 }, { "epoch": 0.8728, "loss_ce": 0.19715295732021332, "loss_lvr": 0.26651814579963684, "loss_mode_switch": 0.0, "loss_total": 0.223804771900177, "step": 2182 }, { "batch_size": 1, "epoch": 0.8728, "step": 2182, "tokens_per_device": 5141 }, { "epoch": 0.8728, "loss_ce": 0.010929013602435589, "loss_lvr": 0.3257821500301361, "loss_mode_switch": 0.0, "loss_total": 0.043507229536771774, "step": 2182 }, { "batch_size": 1, "epoch": 0.8728, "step": 2182, "tokens_per_device": 4867 }, { "epoch": 0.8728, "loss_ce": 0.008735437877476215, "loss_lvr": 0.23577654361724854, "loss_mode_switch": 0.0, "loss_total": 0.03231309354305267, "step": 2182 }, { "batch_size": 4, "epoch": 0.8728, "step": 2182, "tokens_per_device": 5768 }, { "epoch": 0.8728, "loss_ce": 0.17830799520015717, "loss_lvr": 0.8196890950202942, "loss_mode_switch": 0.0, "loss_total": 0.2602769136428833, "step": 2182 }, { "batch_size": 4, "epoch": 0.8728, "step": 2182, "tokens_per_device": 4180 }, { "epoch": 0.8728, "loss_ce": 0.058028124272823334, "loss_lvr": 0.8548941016197205, "loss_mode_switch": 0.0, "loss_total": 0.14351753890514374, "step": 2182 }, { "batch_size": 4, "epoch": 0.8728, "step": 2182, "tokens_per_device": 1732 }, { "epoch": 0.8728, "loss_ce": 0.345130056142807, "loss_lvr": 0.8952491283416748, "loss_mode_switch": 0.0, "loss_total": 0.43465498089790344, "step": 2182 }, { "batch_size": 1, "epoch": 0.8728, "step": 2182, "tokens_per_device": 5173 }, { "epoch": 0.8728, "loss_ce": 0.008474924601614475, "loss_lvr": 0.4190179705619812, "loss_mode_switch": 0.0, "loss_total": 0.05037672072649002, "step": 2182 }, { "batch_size": 1, "epoch": 0.8728, "step": 2182, "tokens_per_device": 4849 }, { "epoch": 0.8728, "loss_ce": 0.0020310806576162577, "loss_lvr": 0.22784622013568878, "loss_mode_switch": 0.0, "loss_total": 0.024815702810883522, "step": 2182 }, { "epoch": 0.8732, "grad_norm": 1.3276857137680054, "learning_rate": 4.15740587516103e-07, "loss": 0.2912, "step": 2183 }, { "batch_size": 1, "epoch": 0.8732, "step": 2183, "tokens_per_device": 4860 }, { "epoch": 0.8732, "loss_ce": 0.04425898566842079, "loss_lvr": 0.3333539664745331, "loss_mode_switch": 0.0, "loss_total": 0.07759438455104828, "step": 2183 }, { "batch_size": 4, "epoch": 0.8732, "step": 2183, "tokens_per_device": 5204 }, { "epoch": 0.8732, "loss_ce": 0.18103362619876862, "loss_lvr": 0.7119197845458984, "loss_mode_switch": 0.0, "loss_total": 0.2522256076335907, "step": 2183 }, { "batch_size": 4, "epoch": 0.8732, "step": 2183, "tokens_per_device": 4812 }, { "epoch": 0.8732, "loss_ce": 0.7509868144989014, "loss_lvr": 1.040532112121582, "loss_mode_switch": 0.0, "loss_total": 0.8550400137901306, "step": 2183 }, { "batch_size": 4, "epoch": 0.8732, "step": 2183, "tokens_per_device": 2772 }, { "epoch": 0.8732, "loss_ce": 0.3037834167480469, "loss_lvr": 0.7484430074691772, "loss_mode_switch": 0.0, "loss_total": 0.3786277174949646, "step": 2183 }, { "batch_size": 4, "epoch": 0.8732, "step": 2183, "tokens_per_device": 4756 }, { "epoch": 0.8732, "loss_ce": 0.028973456472158432, "loss_lvr": 0.7828317880630493, "loss_mode_switch": 0.0, "loss_total": 0.10725663602352142, "step": 2183 }, { "batch_size": 4, "epoch": 0.8732, "step": 2183, "tokens_per_device": 4716 }, { "epoch": 0.8732, "loss_ce": 0.07504967600107193, "loss_lvr": 0.8329349160194397, "loss_mode_switch": 0.0, "loss_total": 0.15834316611289978, "step": 2183 }, { "batch_size": 4, "epoch": 0.8732, "step": 2183, "tokens_per_device": 1212 }, { "epoch": 0.8732, "loss_ce": 0.2711271345615387, "loss_lvr": 1.9860739707946777, "loss_mode_switch": 0.0, "loss_total": 0.4697345495223999, "step": 2183 }, { "batch_size": 4, "epoch": 0.8732, "step": 2183, "tokens_per_device": 12152 }, { "epoch": 0.8732, "loss_ce": 0.23701688647270203, "loss_lvr": 0.9114974737167358, "loss_mode_switch": 0.0, "loss_total": 0.3281666338443756, "step": 2183 }, { "epoch": 0.8736, "grad_norm": 1.2475048303604126, "learning_rate": 4.131584345721312e-07, "loss": 0.2836, "step": 2184 }, { "batch_size": 4, "epoch": 0.8736, "step": 2184, "tokens_per_device": 3768 }, { "epoch": 0.8736, "loss_ce": 0.0006366766756400466, "loss_lvr": 0.7768782377243042, "loss_mode_switch": 0.0, "loss_total": 0.07832449674606323, "step": 2184 }, { "batch_size": 1, "epoch": 0.8736, "step": 2184, "tokens_per_device": 5057 }, { "epoch": 0.8736, "loss_ce": 0.002039545914158225, "loss_lvr": 0.405683308839798, "loss_mode_switch": 0.0, "loss_total": 0.042607877403497696, "step": 2184 }, { "batch_size": 1, "epoch": 0.8736, "step": 2184, "tokens_per_device": 4894 }, { "epoch": 0.8736, "loss_ce": 0.017179764807224274, "loss_lvr": 0.7567779421806335, "loss_mode_switch": 0.0, "loss_total": 0.09285756200551987, "step": 2184 }, { "batch_size": 4, "epoch": 0.8736, "step": 2184, "tokens_per_device": 5432 }, { "epoch": 0.8736, "loss_ce": 0.06401105225086212, "loss_lvr": 0.8453329801559448, "loss_mode_switch": 0.0, "loss_total": 0.1485443413257599, "step": 2184 }, { "batch_size": 4, "epoch": 0.8736, "step": 2184, "tokens_per_device": 4000 }, { "epoch": 0.8736, "loss_ce": 0.2835542857646942, "loss_lvr": 0.9014196991920471, "loss_mode_switch": 0.0, "loss_total": 0.3736962676048279, "step": 2184 }, { "batch_size": 4, "epoch": 0.8736, "step": 2184, "tokens_per_device": 6176 }, { "epoch": 0.8736, "loss_ce": 0.16878433525562286, "loss_lvr": 0.8290057182312012, "loss_mode_switch": 0.0, "loss_total": 0.25168490409851074, "step": 2184 }, { "batch_size": 4, "epoch": 0.8736, "step": 2184, "tokens_per_device": 2696 }, { "epoch": 0.8736, "loss_ce": 0.31455403566360474, "loss_lvr": 0.9028772115707397, "loss_mode_switch": 0.0, "loss_total": 0.40484175086021423, "step": 2184 }, { "batch_size": 1, "epoch": 0.8736, "step": 2184, "tokens_per_device": 4909 }, { "epoch": 0.8736, "loss_ce": 0.12267082929611206, "loss_lvr": 0.2395104020833969, "loss_mode_switch": 0.0, "loss_total": 0.14662186801433563, "step": 2184 }, { "epoch": 0.874, "grad_norm": 1.2577967643737793, "learning_rate": 4.1058397984142405e-07, "loss": 0.2398, "step": 2185 }, { "batch_size": 4, "epoch": 0.874, "step": 2185, "tokens_per_device": 1632 }, { "epoch": 0.874, "loss_ce": 0.08045744895935059, "loss_lvr": 0.8900305032730103, "loss_mode_switch": 0.0, "loss_total": 0.1694605052471161, "step": 2185 }, { "batch_size": 4, "epoch": 0.874, "step": 2185, "tokens_per_device": 9952 }, { "epoch": 0.874, "loss_ce": 0.1084720715880394, "loss_lvr": 0.5685174465179443, "loss_mode_switch": 0.0, "loss_total": 0.16532382369041443, "step": 2185 }, { "batch_size": 4, "epoch": 0.874, "step": 2185, "tokens_per_device": 4820 }, { "epoch": 0.874, "loss_ce": 0.11420396715402603, "loss_lvr": 0.8473462462425232, "loss_mode_switch": 0.0, "loss_total": 0.19893859326839447, "step": 2185 }, { "batch_size": 4, "epoch": 0.874, "step": 2185, "tokens_per_device": 6728 }, { "epoch": 0.874, "loss_ce": 0.10130473226308823, "loss_lvr": 0.7529763579368591, "loss_mode_switch": 0.0, "loss_total": 0.17660236358642578, "step": 2185 }, { "batch_size": 1, "epoch": 0.874, "step": 2185, "tokens_per_device": 4898 }, { "epoch": 0.874, "loss_ce": 0.11985490471124649, "loss_lvr": 0.6127483248710632, "loss_mode_switch": 0.0, "loss_total": 0.18112973868846893, "step": 2185 }, { "batch_size": 4, "epoch": 0.874, "step": 2185, "tokens_per_device": 4512 }, { "epoch": 0.874, "loss_ce": 0.049804605543613434, "loss_lvr": 0.8783859610557556, "loss_mode_switch": 0.0, "loss_total": 0.13764320313930511, "step": 2185 }, { "batch_size": 4, "epoch": 0.874, "step": 2185, "tokens_per_device": 8412 }, { "epoch": 0.874, "loss_ce": 0.2875176668167114, "loss_lvr": 0.6733527779579163, "loss_mode_switch": 0.0, "loss_total": 0.35485294461250305, "step": 2185 }, { "batch_size": 4, "epoch": 0.874, "step": 2185, "tokens_per_device": 2656 }, { "epoch": 0.874, "loss_ce": 0.03405028581619263, "loss_lvr": 0.8385518193244934, "loss_mode_switch": 0.0, "loss_total": 0.11790546774864197, "step": 2185 }, { "epoch": 0.8744, "grad_norm": 1.1540160179138184, "learning_rate": 4.0801722764475303e-07, "loss": 0.2586, "step": 2186 }, { "batch_size": 4, "epoch": 0.8744, "step": 2186, "tokens_per_device": 5972 }, { "epoch": 0.8744, "loss_ce": 0.026764634996652603, "loss_lvr": 0.8206585645675659, "loss_mode_switch": 0.0, "loss_total": 0.10883049666881561, "step": 2186 }, { "batch_size": 4, "epoch": 0.8744, "step": 2186, "tokens_per_device": 4624 }, { "epoch": 0.8744, "loss_ce": 0.004965939559042454, "loss_lvr": 0.7724564671516418, "loss_mode_switch": 0.0, "loss_total": 0.08221158385276794, "step": 2186 }, { "batch_size": 1, "epoch": 0.8744, "step": 2186, "tokens_per_device": 5185 }, { "epoch": 0.8744, "loss_ce": 0.12342715263366699, "loss_lvr": 0.18973591923713684, "loss_mode_switch": 0.0, "loss_total": 0.14240074157714844, "step": 2186 }, { "batch_size": 4, "epoch": 0.8744, "step": 2186, "tokens_per_device": 4544 }, { "epoch": 0.8744, "loss_ce": 0.07431424409151077, "loss_lvr": 1.0205473899841309, "loss_mode_switch": 0.0, "loss_total": 0.17636898159980774, "step": 2186 }, { "batch_size": 4, "epoch": 0.8744, "step": 2186, "tokens_per_device": 4036 }, { "epoch": 0.8744, "loss_ce": 0.21254563331604004, "loss_lvr": 0.5544129610061646, "loss_mode_switch": 0.0, "loss_total": 0.267986923456192, "step": 2186 }, { "batch_size": 4, "epoch": 0.8744, "step": 2186, "tokens_per_device": 4440 }, { "epoch": 0.8744, "loss_ce": 0.14976926147937775, "loss_lvr": 0.7795405983924866, "loss_mode_switch": 0.0, "loss_total": 0.22772333025932312, "step": 2186 }, { "batch_size": 4, "epoch": 0.8744, "step": 2186, "tokens_per_device": 2620 }, { "epoch": 0.8744, "loss_ce": 0.2807731032371521, "loss_lvr": 0.8342711925506592, "loss_mode_switch": 0.0, "loss_total": 0.364200234413147, "step": 2186 }, { "batch_size": 4, "epoch": 0.8744, "step": 2186, "tokens_per_device": 4496 }, { "epoch": 0.8744, "loss_ce": 0.4030764102935791, "loss_lvr": 0.7950260043144226, "loss_mode_switch": 0.0, "loss_total": 0.4825790226459503, "step": 2186 }, { "epoch": 0.8748, "grad_norm": 1.2976871728897095, "learning_rate": 4.0545818228996336e-07, "loss": 0.3049, "step": 2187 }, { "batch_size": 1, "epoch": 0.8748, "step": 2187, "tokens_per_device": 4879 }, { "epoch": 0.8748, "loss_ce": 0.28463980555534363, "loss_lvr": 0.3840346038341522, "loss_mode_switch": 0.0, "loss_total": 0.32304325699806213, "step": 2187 }, { "batch_size": 4, "epoch": 0.8748, "step": 2187, "tokens_per_device": 1636 }, { "epoch": 0.8748, "loss_ce": 0.7948040962219238, "loss_lvr": 1.068674921989441, "loss_mode_switch": 0.0, "loss_total": 0.9016715884208679, "step": 2187 }, { "batch_size": 4, "epoch": 0.8748, "step": 2187, "tokens_per_device": 5760 }, { "epoch": 0.8748, "loss_ce": 0.05954918637871742, "loss_lvr": 0.7465298175811768, "loss_mode_switch": 0.0, "loss_total": 0.13420216739177704, "step": 2187 }, { "batch_size": 4, "epoch": 0.8748, "step": 2187, "tokens_per_device": 3808 }, { "epoch": 0.8748, "loss_ce": 0.04080929234623909, "loss_lvr": 1.1786847114562988, "loss_mode_switch": 0.0, "loss_total": 0.15867777168750763, "step": 2187 }, { "batch_size": 1, "epoch": 0.8748, "step": 2187, "tokens_per_device": 4904 }, { "epoch": 0.8748, "loss_ce": 0.08320727199316025, "loss_lvr": 0.2985355257987976, "loss_mode_switch": 0.0, "loss_total": 0.11306082457304001, "step": 2187 }, { "batch_size": 4, "epoch": 0.8748, "step": 2187, "tokens_per_device": 4036 }, { "epoch": 0.8748, "loss_ce": 0.18522682785987854, "loss_lvr": 0.8088182210922241, "loss_mode_switch": 0.0, "loss_total": 0.2661086618900299, "step": 2187 }, { "batch_size": 4, "epoch": 0.8748, "step": 2187, "tokens_per_device": 1816 }, { "epoch": 0.8748, "loss_ce": 0.34497684240341187, "loss_lvr": 0.8784312009811401, "loss_mode_switch": 0.0, "loss_total": 0.4328199625015259, "step": 2187 }, { "batch_size": 4, "epoch": 0.8748, "step": 2187, "tokens_per_device": 2684 }, { "epoch": 0.8748, "loss_ce": 0.23383915424346924, "loss_lvr": 1.3742682933807373, "loss_mode_switch": 0.0, "loss_total": 0.3712660074234009, "step": 2187 }, { "epoch": 0.8752, "grad_norm": 1.3504005670547485, "learning_rate": 4.0290684807196667e-07, "loss": 0.3258, "step": 2188 }, { "batch_size": 4, "epoch": 0.8752, "step": 2188, "tokens_per_device": 6504 }, { "epoch": 0.8752, "loss_ce": 0.5158035755157471, "loss_lvr": 0.5384246706962585, "loss_mode_switch": 0.0, "loss_total": 0.5696460604667664, "step": 2188 }, { "batch_size": 4, "epoch": 0.8752, "step": 2188, "tokens_per_device": 4208 }, { "epoch": 0.8752, "loss_ce": 0.24790717661380768, "loss_lvr": 0.4002636671066284, "loss_mode_switch": 0.0, "loss_total": 0.2879335284233093, "step": 2188 }, { "batch_size": 4, "epoch": 0.8752, "step": 2188, "tokens_per_device": 2628 }, { "epoch": 0.8752, "loss_ce": 0.43219059705734253, "loss_lvr": 0.859765350818634, "loss_mode_switch": 0.0, "loss_total": 0.5181671380996704, "step": 2188 }, { "batch_size": 4, "epoch": 0.8752, "step": 2188, "tokens_per_device": 2212 }, { "epoch": 0.8752, "loss_ce": 0.38823428750038147, "loss_lvr": 0.9612642526626587, "loss_mode_switch": 0.0, "loss_total": 0.4843607246875763, "step": 2188 }, { "batch_size": 1, "epoch": 0.8752, "step": 2188, "tokens_per_device": 5077 }, { "epoch": 0.8752, "loss_ce": 0.011523772031068802, "loss_lvr": 0.339086651802063, "loss_mode_switch": 0.0, "loss_total": 0.0454324372112751, "step": 2188 }, { "batch_size": 4, "epoch": 0.8752, "step": 2188, "tokens_per_device": 5124 }, { "epoch": 0.8752, "loss_ce": 0.1209169551730156, "loss_lvr": 0.8302848935127258, "loss_mode_switch": 0.0, "loss_total": 0.20394544303417206, "step": 2188 }, { "batch_size": 1, "epoch": 0.8752, "step": 2188, "tokens_per_device": 5042 }, { "epoch": 0.8752, "loss_ce": 0.00861540250480175, "loss_lvr": 0.16591770946979523, "loss_mode_switch": 0.0, "loss_total": 0.025207173079252243, "step": 2188 }, { "batch_size": 1, "epoch": 0.8752, "step": 2188, "tokens_per_device": 5105 }, { "epoch": 0.8752, "loss_ce": 0.04491688311100006, "loss_lvr": 0.3770943582057953, "loss_mode_switch": 0.0, "loss_total": 0.08262632042169571, "step": 2188 }, { "epoch": 0.8756, "grad_norm": 1.2953153848648071, "learning_rate": 4.003632292727316e-07, "loss": 0.296, "step": 2189 }, { "batch_size": 4, "epoch": 0.8756, "step": 2189, "tokens_per_device": 4464 }, { "epoch": 0.8756, "loss_ce": 0.2807404398918152, "loss_lvr": 0.4731634557247162, "loss_mode_switch": 0.0, "loss_total": 0.32805678248405457, "step": 2189 }, { "batch_size": 4, "epoch": 0.8756, "step": 2189, "tokens_per_device": 1548 }, { "epoch": 0.8756, "loss_ce": 0.33314159512519836, "loss_lvr": 0.8513530492782593, "loss_mode_switch": 0.0, "loss_total": 0.41827690601348877, "step": 2189 }, { "batch_size": 4, "epoch": 0.8756, "step": 2189, "tokens_per_device": 4572 }, { "epoch": 0.8756, "loss_ce": 0.007324553560465574, "loss_lvr": 0.781112015247345, "loss_mode_switch": 0.0, "loss_total": 0.08543575555086136, "step": 2189 }, { "batch_size": 4, "epoch": 0.8756, "step": 2189, "tokens_per_device": 4008 }, { "epoch": 0.8756, "loss_ce": 0.16293947398662567, "loss_lvr": 0.7277782559394836, "loss_mode_switch": 0.0, "loss_total": 0.2357172966003418, "step": 2189 }, { "batch_size": 1, "epoch": 0.8756, "step": 2189, "tokens_per_device": 4910 }, { "epoch": 0.8756, "loss_ce": 0.5513339042663574, "loss_lvr": 0.4613121747970581, "loss_mode_switch": 0.0, "loss_total": 0.5974650979042053, "step": 2189 }, { "batch_size": 4, "epoch": 0.8756, "step": 2189, "tokens_per_device": 4468 }, { "epoch": 0.8756, "loss_ce": 0.1049945205450058, "loss_lvr": 0.8672701716423035, "loss_mode_switch": 0.0, "loss_total": 0.19172152876853943, "step": 2189 }, { "batch_size": 4, "epoch": 0.8756, "step": 2189, "tokens_per_device": 3772 }, { "epoch": 0.8756, "loss_ce": 0.1312166154384613, "loss_lvr": 0.774103045463562, "loss_mode_switch": 0.0, "loss_total": 0.20862692594528198, "step": 2189 }, { "batch_size": 1, "epoch": 0.8756, "step": 2189, "tokens_per_device": 5001 }, { "epoch": 0.8756, "loss_ce": 0.6790713667869568, "loss_lvr": 0.26448729634284973, "loss_mode_switch": 0.0, "loss_total": 0.7055200934410095, "step": 2189 }, { "epoch": 0.876, "grad_norm": 1.2324103116989136, "learning_rate": 3.9782733016128006e-07, "loss": 0.3134, "step": 2190 }, { "batch_size": 1, "epoch": 0.876, "step": 2190, "tokens_per_device": 5004 }, { "epoch": 0.876, "loss_ce": 0.0010905649978667498, "loss_lvr": 0.38265320658683777, "loss_mode_switch": 0.0, "loss_total": 0.03935588523745537, "step": 2190 }, { "batch_size": 4, "epoch": 0.876, "step": 2190, "tokens_per_device": 3412 }, { "epoch": 0.876, "loss_ce": 0.08019299060106277, "loss_lvr": 0.6914301514625549, "loss_mode_switch": 0.0, "loss_total": 0.14933601021766663, "step": 2190 }, { "batch_size": 4, "epoch": 0.876, "step": 2190, "tokens_per_device": 4872 }, { "epoch": 0.876, "loss_ce": 0.0727691724896431, "loss_lvr": 0.6997349262237549, "loss_mode_switch": 0.0, "loss_total": 0.14274266362190247, "step": 2190 }, { "batch_size": 1, "epoch": 0.876, "step": 2190, "tokens_per_device": 5196 }, { "epoch": 0.876, "loss_ce": 0.01521830353885889, "loss_lvr": 0.37797072529792786, "loss_mode_switch": 0.0, "loss_total": 0.05301537737250328, "step": 2190 }, { "batch_size": 4, "epoch": 0.876, "step": 2190, "tokens_per_device": 5992 }, { "epoch": 0.876, "loss_ce": 0.2923336327075958, "loss_lvr": 0.784024178981781, "loss_mode_switch": 0.0, "loss_total": 0.3707360625267029, "step": 2190 }, { "batch_size": 1, "epoch": 0.876, "step": 2190, "tokens_per_device": 4748 }, { "epoch": 0.876, "loss_ce": 0.0001315341069130227, "loss_lvr": 0.3224954307079315, "loss_mode_switch": 0.0, "loss_total": 0.032381076365709305, "step": 2190 }, { "batch_size": 4, "epoch": 0.876, "step": 2190, "tokens_per_device": 4260 }, { "epoch": 0.876, "loss_ce": 0.22901538014411926, "loss_lvr": 0.5265904664993286, "loss_mode_switch": 0.0, "loss_total": 0.28167441487312317, "step": 2190 }, { "batch_size": 4, "epoch": 0.876, "step": 2190, "tokens_per_device": 4280 }, { "epoch": 0.876, "loss_ce": 0.06512722373008728, "loss_lvr": 0.7639175057411194, "loss_mode_switch": 0.0, "loss_total": 0.1415189802646637, "step": 2190 }, { "epoch": 0.8764, "grad_norm": 1.285789132118225, "learning_rate": 3.952991549936752e-07, "loss": 0.247, "step": 2191 }, { "batch_size": 1, "epoch": 0.8764, "step": 2191, "tokens_per_device": 4847 }, { "epoch": 0.8764, "loss_ce": 0.0021271733567118645, "loss_lvr": 0.35189780592918396, "loss_mode_switch": 0.0, "loss_total": 0.037316955626010895, "step": 2191 }, { "batch_size": 1, "epoch": 0.8764, "step": 2191, "tokens_per_device": 5160 }, { "epoch": 0.8764, "loss_ce": 0.0023677910212427378, "loss_lvr": 0.3061232566833496, "loss_mode_switch": 0.0, "loss_total": 0.032980117946863174, "step": 2191 }, { "batch_size": 4, "epoch": 0.8764, "step": 2191, "tokens_per_device": 3896 }, { "epoch": 0.8764, "loss_ce": 0.1246442124247551, "loss_lvr": 0.7545255422592163, "loss_mode_switch": 0.0, "loss_total": 0.20009677112102509, "step": 2191 }, { "batch_size": 4, "epoch": 0.8764, "step": 2191, "tokens_per_device": 3812 }, { "epoch": 0.8764, "loss_ce": 0.12238014489412308, "loss_lvr": 1.1148707866668701, "loss_mode_switch": 0.0, "loss_total": 0.23386722803115845, "step": 2191 }, { "batch_size": 4, "epoch": 0.8764, "step": 2191, "tokens_per_device": 4140 }, { "epoch": 0.8764, "loss_ce": 0.3088889718055725, "loss_lvr": 0.46019500494003296, "loss_mode_switch": 0.0, "loss_total": 0.35490846633911133, "step": 2191 }, { "batch_size": 1, "epoch": 0.8764, "step": 2191, "tokens_per_device": 5008 }, { "epoch": 0.8764, "loss_ce": 0.35479244589805603, "loss_lvr": 0.2180773913860321, "loss_mode_switch": 0.0, "loss_total": 0.3766001760959625, "step": 2191 }, { "batch_size": 4, "epoch": 0.8764, "step": 2191, "tokens_per_device": 3884 }, { "epoch": 0.8764, "loss_ce": 0.8303889632225037, "loss_lvr": 0.9469872713088989, "loss_mode_switch": 0.0, "loss_total": 0.9250876903533936, "step": 2191 }, { "batch_size": 1, "epoch": 0.8764, "step": 2191, "tokens_per_device": 5066 }, { "epoch": 0.8764, "loss_ce": 0.027535106986761093, "loss_lvr": 0.5749621987342834, "loss_mode_switch": 0.0, "loss_total": 0.08503133058547974, "step": 2191 }, { "epoch": 0.8768, "grad_norm": 1.3580317497253418, "learning_rate": 3.9277870801301855e-07, "loss": 0.2943, "step": 2192 }, { "batch_size": 4, "epoch": 0.8768, "step": 2192, "tokens_per_device": 3856 }, { "epoch": 0.8768, "loss_ce": 0.2415829598903656, "loss_lvr": 0.6559018492698669, "loss_mode_switch": 0.0, "loss_total": 0.30717313289642334, "step": 2192 }, { "batch_size": 1, "epoch": 0.8768, "step": 2192, "tokens_per_device": 4915 }, { "epoch": 0.8768, "loss_ce": 0.04539654403924942, "loss_lvr": 0.38380271196365356, "loss_mode_switch": 0.0, "loss_total": 0.0837768167257309, "step": 2192 }, { "batch_size": 1, "epoch": 0.8768, "step": 2192, "tokens_per_device": 5144 }, { "epoch": 0.8768, "loss_ce": 0.004800130613148212, "loss_lvr": 0.43350350856781006, "loss_mode_switch": 0.0, "loss_total": 0.048150479793548584, "step": 2192 }, { "batch_size": 4, "epoch": 0.8768, "step": 2192, "tokens_per_device": 1648 }, { "epoch": 0.8768, "loss_ce": 0.6558210849761963, "loss_lvr": 0.949355959892273, "loss_mode_switch": 0.0, "loss_total": 0.7507566809654236, "step": 2192 }, { "batch_size": 4, "epoch": 0.8768, "step": 2192, "tokens_per_device": 4488 }, { "epoch": 0.8768, "loss_ce": 0.054816920310258865, "loss_lvr": 0.8983420133590698, "loss_mode_switch": 0.0, "loss_total": 0.1446511298418045, "step": 2192 }, { "batch_size": 4, "epoch": 0.8768, "step": 2192, "tokens_per_device": 1240 }, { "epoch": 0.8768, "loss_ce": 0.09946154803037643, "loss_lvr": 1.8347272872924805, "loss_mode_switch": 0.0, "loss_total": 0.2829342782497406, "step": 2192 }, { "batch_size": 4, "epoch": 0.8768, "step": 2192, "tokens_per_device": 3752 }, { "epoch": 0.8768, "loss_ce": 0.25251683592796326, "loss_lvr": 0.8633236289024353, "loss_mode_switch": 0.0, "loss_total": 0.33884918689727783, "step": 2192 }, { "batch_size": 4, "epoch": 0.8768, "step": 2192, "tokens_per_device": 2276 }, { "epoch": 0.8768, "loss_ce": 0.3623122572898865, "loss_lvr": 0.8349591493606567, "loss_mode_switch": 0.0, "loss_total": 0.44580817222595215, "step": 2192 }, { "epoch": 0.8772, "grad_norm": 1.2874577045440674, "learning_rate": 3.9026599344943794e-07, "loss": 0.2641, "step": 2193 }, { "batch_size": 1, "epoch": 0.8772, "step": 2193, "tokens_per_device": 5115 }, { "epoch": 0.8772, "loss_ce": 0.0035130828619003296, "loss_lvr": 0.15904417634010315, "loss_mode_switch": 0.0, "loss_total": 0.019417500123381615, "step": 2193 }, { "batch_size": 4, "epoch": 0.8772, "step": 2193, "tokens_per_device": 10768 }, { "epoch": 0.8772, "loss_ce": 0.1206977367401123, "loss_lvr": 0.7895804643630981, "loss_mode_switch": 0.0, "loss_total": 0.19965578615665436, "step": 2193 }, { "batch_size": 4, "epoch": 0.8772, "step": 2193, "tokens_per_device": 2756 }, { "epoch": 0.8772, "loss_ce": 0.032442063093185425, "loss_lvr": 1.3532217741012573, "loss_mode_switch": 0.0, "loss_total": 0.16776424646377563, "step": 2193 }, { "batch_size": 4, "epoch": 0.8772, "step": 2193, "tokens_per_device": 2724 }, { "epoch": 0.8772, "loss_ce": 0.5151733756065369, "loss_lvr": 0.98129802942276, "loss_mode_switch": 0.0, "loss_total": 0.6133031845092773, "step": 2193 }, { "batch_size": 4, "epoch": 0.8772, "step": 2193, "tokens_per_device": 4296 }, { "epoch": 0.8772, "loss_ce": 0.7679520845413208, "loss_lvr": 0.9666042923927307, "loss_mode_switch": 0.0, "loss_total": 0.8646125197410583, "step": 2193 }, { "batch_size": 1, "epoch": 0.8772, "step": 2193, "tokens_per_device": 4867 }, { "epoch": 0.8772, "loss_ce": 0.009325181134045124, "loss_lvr": 0.30941465497016907, "loss_mode_switch": 0.0, "loss_total": 0.040266647934913635, "step": 2193 }, { "batch_size": 4, "epoch": 0.8772, "step": 2193, "tokens_per_device": 4272 }, { "epoch": 0.8772, "loss_ce": 0.20999933779239655, "loss_lvr": 1.4087125062942505, "loss_mode_switch": 0.0, "loss_total": 0.35087060928344727, "step": 2193 }, { "batch_size": 4, "epoch": 0.8772, "step": 2193, "tokens_per_device": 4176 }, { "epoch": 0.8772, "loss_ce": 0.005486882757395506, "loss_lvr": 0.39021557569503784, "loss_mode_switch": 0.0, "loss_total": 0.0445084422826767, "step": 2193 }, { "epoch": 0.8776, "grad_norm": 1.2867745161056519, "learning_rate": 3.877610155200906e-07, "loss": 0.2957, "step": 2194 }, { "batch_size": 1, "epoch": 0.8776, "step": 2194, "tokens_per_device": 6102 }, { "epoch": 0.8776, "loss_ce": 0.2101978361606598, "loss_lvr": 0.4367019832134247, "loss_mode_switch": 0.0, "loss_total": 0.253868043422699, "step": 2194 }, { "batch_size": 1, "epoch": 0.8776, "step": 2194, "tokens_per_device": 4920 }, { "epoch": 0.8776, "loss_ce": 0.06735121458768845, "loss_lvr": 0.24452871084213257, "loss_mode_switch": 0.0, "loss_total": 0.09180408716201782, "step": 2194 }, { "batch_size": 4, "epoch": 0.8776, "step": 2194, "tokens_per_device": 4300 }, { "epoch": 0.8776, "loss_ce": 0.08876070380210876, "loss_lvr": 0.9608139991760254, "loss_mode_switch": 0.0, "loss_total": 0.18484210968017578, "step": 2194 }, { "batch_size": 4, "epoch": 0.8776, "step": 2194, "tokens_per_device": 4336 }, { "epoch": 0.8776, "loss_ce": 0.28394249081611633, "loss_lvr": 0.8973626494407654, "loss_mode_switch": 0.0, "loss_total": 0.3736787438392639, "step": 2194 }, { "batch_size": 1, "epoch": 0.8776, "step": 2194, "tokens_per_device": 5033 }, { "epoch": 0.8776, "loss_ce": 0.014638049528002739, "loss_lvr": 0.30518588423728943, "loss_mode_switch": 0.0, "loss_total": 0.04515663906931877, "step": 2194 }, { "batch_size": 4, "epoch": 0.8776, "step": 2194, "tokens_per_device": 1504 }, { "epoch": 0.8776, "loss_ce": 0.16959956288337708, "loss_lvr": 0.9328629374504089, "loss_mode_switch": 0.0, "loss_total": 0.2628858685493469, "step": 2194 }, { "batch_size": 4, "epoch": 0.8776, "step": 2194, "tokens_per_device": 3796 }, { "epoch": 0.8776, "loss_ce": 0.03035563975572586, "loss_lvr": 0.5205462574958801, "loss_mode_switch": 0.0, "loss_total": 0.08241026103496552, "step": 2194 }, { "batch_size": 4, "epoch": 0.8776, "step": 2194, "tokens_per_device": 4460 }, { "epoch": 0.8776, "loss_ce": 0.48993611335754395, "loss_lvr": 0.9624645709991455, "loss_mode_switch": 0.0, "loss_total": 0.5861825942993164, "step": 2194 }, { "epoch": 0.878, "grad_norm": 1.3291198015213013, "learning_rate": 3.852637784291424e-07, "loss": 0.2657, "step": 2195 }, { "batch_size": 4, "epoch": 0.878, "step": 2195, "tokens_per_device": 3764 }, { "epoch": 0.878, "loss_ce": 0.35227257013320923, "loss_lvr": 1.059468150138855, "loss_mode_switch": 0.0, "loss_total": 0.45821937918663025, "step": 2195 }, { "batch_size": 4, "epoch": 0.878, "step": 2195, "tokens_per_device": 1460 }, { "epoch": 0.878, "loss_ce": 0.28044363856315613, "loss_lvr": 0.82669597864151, "loss_mode_switch": 0.0, "loss_total": 0.3631132245063782, "step": 2195 }, { "batch_size": 4, "epoch": 0.878, "step": 2195, "tokens_per_device": 4768 }, { "epoch": 0.878, "loss_ce": 0.1312132179737091, "loss_lvr": 1.1842292547225952, "loss_mode_switch": 0.0, "loss_total": 0.24963614344596863, "step": 2195 }, { "batch_size": 4, "epoch": 0.878, "step": 2195, "tokens_per_device": 5252 }, { "epoch": 0.878, "loss_ce": 0.6478682160377502, "loss_lvr": 0.8784757852554321, "loss_mode_switch": 0.0, "loss_total": 0.7357158064842224, "step": 2195 }, { "batch_size": 1, "epoch": 0.878, "step": 2195, "tokens_per_device": 5132 }, { "epoch": 0.878, "loss_ce": 0.20502962172031403, "loss_lvr": 0.41912052035331726, "loss_mode_switch": 0.0, "loss_total": 0.2469416707754135, "step": 2195 }, { "batch_size": 4, "epoch": 0.878, "step": 2195, "tokens_per_device": 4584 }, { "epoch": 0.878, "loss_ce": 0.08065228164196014, "loss_lvr": 0.7947463989257812, "loss_mode_switch": 0.0, "loss_total": 0.1601269245147705, "step": 2195 }, { "batch_size": 1, "epoch": 0.878, "step": 2195, "tokens_per_device": 5076 }, { "epoch": 0.878, "loss_ce": 0.23702408373355865, "loss_lvr": 0.3358843922615051, "loss_mode_switch": 0.0, "loss_total": 0.27061253786087036, "step": 2195 }, { "batch_size": 4, "epoch": 0.878, "step": 2195, "tokens_per_device": 4336 }, { "epoch": 0.878, "loss_ce": 0.03492192551493645, "loss_lvr": 0.8602055907249451, "loss_mode_switch": 0.0, "loss_total": 0.12094248831272125, "step": 2195 }, { "epoch": 0.8784, "grad_norm": 1.416623592376709, "learning_rate": 3.827742863677708e-07, "loss": 0.2586, "step": 2196 }, { "batch_size": 4, "epoch": 0.8784, "step": 2196, "tokens_per_device": 8828 }, { "epoch": 0.8784, "loss_ce": 0.32006725668907166, "loss_lvr": 0.817817747592926, "loss_mode_switch": 0.0, "loss_total": 0.40184903144836426, "step": 2196 }, { "batch_size": 1, "epoch": 0.8784, "step": 2196, "tokens_per_device": 4783 }, { "epoch": 0.8784, "loss_ce": 0.02460513450205326, "loss_lvr": 0.24409350752830505, "loss_mode_switch": 0.0, "loss_total": 0.049014486372470856, "step": 2196 }, { "batch_size": 1, "epoch": 0.8784, "step": 2196, "tokens_per_device": 4879 }, { "epoch": 0.8784, "loss_ce": 0.00856766663491726, "loss_lvr": 0.26278501749038696, "loss_mode_switch": 0.0, "loss_total": 0.034846168011426926, "step": 2196 }, { "batch_size": 1, "epoch": 0.8784, "step": 2196, "tokens_per_device": 5084 }, { "epoch": 0.8784, "loss_ce": 0.004501075018197298, "loss_lvr": 0.29887697100639343, "loss_mode_switch": 0.0, "loss_total": 0.03438877314329147, "step": 2196 }, { "batch_size": 4, "epoch": 0.8784, "step": 2196, "tokens_per_device": 11980 }, { "epoch": 0.8784, "loss_ce": 0.13254910707473755, "loss_lvr": 0.6097483038902283, "loss_mode_switch": 0.0, "loss_total": 0.19352394342422485, "step": 2196 }, { "batch_size": 4, "epoch": 0.8784, "step": 2196, "tokens_per_device": 4020 }, { "epoch": 0.8784, "loss_ce": 0.04401882365345955, "loss_lvr": 0.7295475602149963, "loss_mode_switch": 0.0, "loss_total": 0.11697357892990112, "step": 2196 }, { "batch_size": 4, "epoch": 0.8784, "step": 2196, "tokens_per_device": 2868 }, { "epoch": 0.8784, "loss_ce": 0.28848251700401306, "loss_lvr": 0.48408257961273193, "loss_mode_switch": 0.0, "loss_total": 0.3368907868862152, "step": 2196 }, { "batch_size": 1, "epoch": 0.8784, "step": 2196, "tokens_per_device": 4923 }, { "epoch": 0.8784, "loss_ce": 0.11407158523797989, "loss_lvr": 0.34085601568222046, "loss_mode_switch": 0.0, "loss_total": 0.14815717935562134, "step": 2196 }, { "epoch": 0.8788, "grad_norm": 1.194956660270691, "learning_rate": 3.802925435141525e-07, "loss": 0.2526, "step": 2197 }, { "batch_size": 4, "epoch": 0.8788, "step": 2197, "tokens_per_device": 7304 }, { "epoch": 0.8788, "loss_ce": 0.6741077303886414, "loss_lvr": 0.7333588600158691, "loss_mode_switch": 0.0, "loss_total": 0.7474436163902283, "step": 2197 }, { "batch_size": 4, "epoch": 0.8788, "step": 2197, "tokens_per_device": 2532 }, { "epoch": 0.8788, "loss_ce": 0.32500386238098145, "loss_lvr": 0.951124906539917, "loss_mode_switch": 0.0, "loss_total": 0.4201163649559021, "step": 2197 }, { "batch_size": 4, "epoch": 0.8788, "step": 2197, "tokens_per_device": 5912 }, { "epoch": 0.8788, "loss_ce": 0.21068646013736725, "loss_lvr": 0.7120644450187683, "loss_mode_switch": 0.0, "loss_total": 0.28189289569854736, "step": 2197 }, { "batch_size": 4, "epoch": 0.8788, "step": 2197, "tokens_per_device": 4264 }, { "epoch": 0.8788, "loss_ce": 0.08225265145301819, "loss_lvr": 1.0347939729690552, "loss_mode_switch": 0.0, "loss_total": 0.18573205173015594, "step": 2197 }, { "batch_size": 4, "epoch": 0.8788, "step": 2197, "tokens_per_device": 3872 }, { "epoch": 0.8788, "loss_ce": 0.12841811776161194, "loss_lvr": 0.9751327633857727, "loss_mode_switch": 0.0, "loss_total": 0.22593140602111816, "step": 2197 }, { "batch_size": 4, "epoch": 0.8788, "step": 2197, "tokens_per_device": 4040 }, { "epoch": 0.8788, "loss_ce": 0.05589883029460907, "loss_lvr": 0.8603609800338745, "loss_mode_switch": 0.0, "loss_total": 0.14193493127822876, "step": 2197 }, { "batch_size": 4, "epoch": 0.8788, "step": 2197, "tokens_per_device": 2608 }, { "epoch": 0.8788, "loss_ce": 0.5366798639297485, "loss_lvr": 0.8082799911499023, "loss_mode_switch": 0.0, "loss_total": 0.6175078749656677, "step": 2197 }, { "batch_size": 1, "epoch": 0.8788, "step": 2197, "tokens_per_device": 4832 }, { "epoch": 0.8788, "loss_ce": 0.1284497082233429, "loss_lvr": 0.4126873016357422, "loss_mode_switch": 0.0, "loss_total": 0.1697184443473816, "step": 2197 }, { "epoch": 0.8792, "grad_norm": 1.343867540359497, "learning_rate": 3.778185540334611e-07, "loss": 0.3056, "step": 2198 }, { "batch_size": 4, "epoch": 0.8792, "step": 2198, "tokens_per_device": 7304 }, { "epoch": 0.8792, "loss_ce": 0.4629134237766266, "loss_lvr": 0.6729828119277954, "loss_mode_switch": 0.0, "loss_total": 0.5302116870880127, "step": 2198 }, { "batch_size": 4, "epoch": 0.8792, "step": 2198, "tokens_per_device": 1872 }, { "epoch": 0.8792, "loss_ce": 0.12298549711704254, "loss_lvr": 0.8602304458618164, "loss_mode_switch": 0.0, "loss_total": 0.20900854468345642, "step": 2198 }, { "batch_size": 4, "epoch": 0.8792, "step": 2198, "tokens_per_device": 4308 }, { "epoch": 0.8792, "loss_ce": 0.046553418040275574, "loss_lvr": 1.0944292545318604, "loss_mode_switch": 0.0, "loss_total": 0.15599635243415833, "step": 2198 }, { "batch_size": 4, "epoch": 0.8792, "step": 2198, "tokens_per_device": 4816 }, { "epoch": 0.8792, "loss_ce": 0.044135723263025284, "loss_lvr": 0.6660605072975159, "loss_mode_switch": 0.0, "loss_total": 0.11074177920818329, "step": 2198 }, { "batch_size": 1, "epoch": 0.8792, "step": 2198, "tokens_per_device": 5073 }, { "epoch": 0.8792, "loss_ce": 0.0006128742243163288, "loss_lvr": 0.18565858900547028, "loss_mode_switch": 0.0, "loss_total": 0.019178733229637146, "step": 2198 }, { "batch_size": 1, "epoch": 0.8792, "step": 2198, "tokens_per_device": 5072 }, { "epoch": 0.8792, "loss_ce": 0.2843998074531555, "loss_lvr": 0.4690064489841461, "loss_mode_switch": 0.0, "loss_total": 0.33130043745040894, "step": 2198 }, { "batch_size": 4, "epoch": 0.8792, "step": 2198, "tokens_per_device": 11152 }, { "epoch": 0.8792, "loss_ce": 0.5149338841438293, "loss_lvr": 0.6835253834724426, "loss_mode_switch": 0.0, "loss_total": 0.5832864046096802, "step": 2198 }, { "batch_size": 4, "epoch": 0.8792, "step": 2198, "tokens_per_device": 4444 }, { "epoch": 0.8792, "loss_ce": 0.5706343650817871, "loss_lvr": 0.7253684997558594, "loss_mode_switch": 0.0, "loss_total": 0.6431711912155151, "step": 2198 }, { "epoch": 0.8796, "grad_norm": 1.6457146406173706, "learning_rate": 3.753523220778571e-07, "loss": 0.3314, "step": 2199 }, { "batch_size": 4, "epoch": 0.8796, "step": 2199, "tokens_per_device": 1484 }, { "epoch": 0.8796, "loss_ce": 0.4563771188259125, "loss_lvr": 0.8703762292861938, "loss_mode_switch": 0.0, "loss_total": 0.5434147119522095, "step": 2199 }, { "batch_size": 1, "epoch": 0.8796, "step": 2199, "tokens_per_device": 5731 }, { "epoch": 0.8796, "loss_ce": 0.3810424506664276, "loss_lvr": 0.2556014955043793, "loss_mode_switch": 0.0, "loss_total": 0.4066025912761688, "step": 2199 }, { "batch_size": 4, "epoch": 0.8796, "step": 2199, "tokens_per_device": 6232 }, { "epoch": 0.8796, "loss_ce": 0.004201081581413746, "loss_lvr": 0.7209957838058472, "loss_mode_switch": 0.0, "loss_total": 0.07630066573619843, "step": 2199 }, { "batch_size": 4, "epoch": 0.8796, "step": 2199, "tokens_per_device": 4256 }, { "epoch": 0.8796, "loss_ce": 0.17655466496944427, "loss_lvr": 1.038672924041748, "loss_mode_switch": 0.0, "loss_total": 0.2804219722747803, "step": 2199 }, { "batch_size": 1, "epoch": 0.8796, "step": 2199, "tokens_per_device": 5107 }, { "epoch": 0.8796, "loss_ce": 0.01007685624063015, "loss_lvr": 0.38861462473869324, "loss_mode_switch": 0.0, "loss_total": 0.0489383190870285, "step": 2199 }, { "batch_size": 4, "epoch": 0.8796, "step": 2199, "tokens_per_device": 5748 }, { "epoch": 0.8796, "loss_ce": 0.4887733459472656, "loss_lvr": 0.8491531610488892, "loss_mode_switch": 0.0, "loss_total": 0.5736886858940125, "step": 2199 }, { "batch_size": 1, "epoch": 0.8796, "step": 2199, "tokens_per_device": 5103 }, { "epoch": 0.8796, "loss_ce": 0.0015556461876258254, "loss_lvr": 0.39433106780052185, "loss_mode_switch": 0.0, "loss_total": 0.040988754481077194, "step": 2199 }, { "batch_size": 1, "epoch": 0.8796, "step": 2199, "tokens_per_device": 6824 }, { "epoch": 0.8796, "loss_ce": 0.04279554262757301, "loss_lvr": 0.23837117850780487, "loss_mode_switch": 0.0, "loss_total": 0.06663265824317932, "step": 2199 }, { "epoch": 0.88, "grad_norm": 1.3912806510925293, "learning_rate": 3.728938517864794e-07, "loss": 0.2795, "step": 2200 }, { "batch_size": 4, "epoch": 0.88, "step": 2200, "tokens_per_device": 1204 }, { "epoch": 0.88, "loss_ce": 0.3030548393726349, "loss_lvr": 0.9997045397758484, "loss_mode_switch": 0.0, "loss_total": 0.4030252993106842, "step": 2200 }, { "batch_size": 4, "epoch": 0.88, "step": 2200, "tokens_per_device": 5620 }, { "epoch": 0.88, "loss_ce": 0.1663704216480255, "loss_lvr": 0.6868962049484253, "loss_mode_switch": 0.0, "loss_total": 0.23506003618240356, "step": 2200 }, { "batch_size": 4, "epoch": 0.88, "step": 2200, "tokens_per_device": 4724 }, { "epoch": 0.88, "loss_ce": 0.1852564811706543, "loss_lvr": 0.7014492750167847, "loss_mode_switch": 0.0, "loss_total": 0.2554014027118683, "step": 2200 }, { "batch_size": 1, "epoch": 0.88, "step": 2200, "tokens_per_device": 5035 }, { "epoch": 0.88, "loss_ce": 0.0011642634635791183, "loss_lvr": 0.5240776538848877, "loss_mode_switch": 0.0, "loss_total": 0.05357203260064125, "step": 2200 }, { "batch_size": 4, "epoch": 0.88, "step": 2200, "tokens_per_device": 3560 }, { "epoch": 0.88, "loss_ce": 0.290581077337265, "loss_lvr": 0.8138765692710876, "loss_mode_switch": 0.0, "loss_total": 0.37196874618530273, "step": 2200 }, { "batch_size": 1, "epoch": 0.88, "step": 2200, "tokens_per_device": 4860 }, { "epoch": 0.88, "loss_ce": 0.005897382739931345, "loss_lvr": 0.23041215538978577, "loss_mode_switch": 0.0, "loss_total": 0.028938598930835724, "step": 2200 }, { "batch_size": 4, "epoch": 0.88, "step": 2200, "tokens_per_device": 4296 }, { "epoch": 0.88, "loss_ce": 0.006404312327504158, "loss_lvr": 0.7611472606658936, "loss_mode_switch": 0.0, "loss_total": 0.0825190395116806, "step": 2200 }, { "batch_size": 4, "epoch": 0.88, "step": 2200, "tokens_per_device": 1784 }, { "epoch": 0.88, "loss_ce": 0.24481140077114105, "loss_lvr": 0.8498948812484741, "loss_mode_switch": 0.0, "loss_total": 0.32980090379714966, "step": 2200 }, { "epoch": 0.8804, "grad_norm": 1.1982135772705078, "learning_rate": 3.7044314728544196e-07, "loss": 0.2758, "step": 2201 }, { "batch_size": 1, "epoch": 0.8804, "step": 2201, "tokens_per_device": 4864 }, { "epoch": 0.8804, "loss_ce": 0.00016654063074383885, "loss_lvr": 0.23809537291526794, "loss_mode_switch": 0.0, "loss_total": 0.023976078256964684, "step": 2201 }, { "batch_size": 4, "epoch": 0.8804, "step": 2201, "tokens_per_device": 4384 }, { "epoch": 0.8804, "loss_ce": 0.2601117193698883, "loss_lvr": 0.8545255661010742, "loss_mode_switch": 0.0, "loss_total": 0.3455642759799957, "step": 2201 }, { "batch_size": 4, "epoch": 0.8804, "step": 2201, "tokens_per_device": 4560 }, { "epoch": 0.8804, "loss_ce": 0.34667032957077026, "loss_lvr": 1.538170576095581, "loss_mode_switch": 0.0, "loss_total": 0.5004873871803284, "step": 2201 }, { "batch_size": 1, "epoch": 0.8804, "step": 2201, "tokens_per_device": 5350 }, { "epoch": 0.8804, "loss_ce": 0.007090445142239332, "loss_lvr": 0.34042465686798096, "loss_mode_switch": 0.0, "loss_total": 0.041132912039756775, "step": 2201 }, { "batch_size": 4, "epoch": 0.8804, "step": 2201, "tokens_per_device": 5232 }, { "epoch": 0.8804, "loss_ce": 0.008757466450333595, "loss_lvr": 0.782350480556488, "loss_mode_switch": 0.0, "loss_total": 0.08699251711368561, "step": 2201 }, { "batch_size": 4, "epoch": 0.8804, "step": 2201, "tokens_per_device": 6648 }, { "epoch": 0.8804, "loss_ce": 0.38232114911079407, "loss_lvr": 0.8051496744155884, "loss_mode_switch": 0.0, "loss_total": 0.4628361165523529, "step": 2201 }, { "batch_size": 4, "epoch": 0.8804, "step": 2201, "tokens_per_device": 2028 }, { "epoch": 0.8804, "loss_ce": 0.18885457515716553, "loss_lvr": 1.0562111139297485, "loss_mode_switch": 0.0, "loss_total": 0.2944756746292114, "step": 2201 }, { "batch_size": 4, "epoch": 0.8804, "step": 2201, "tokens_per_device": 3356 }, { "epoch": 0.8804, "loss_ce": 0.27375760674476624, "loss_lvr": 0.6779103875160217, "loss_mode_switch": 0.0, "loss_total": 0.3415486514568329, "step": 2201 }, { "epoch": 0.8808, "grad_norm": 1.256353735923767, "learning_rate": 3.6800021268782293e-07, "loss": 0.2629, "step": 2202 }, { "batch_size": 4, "epoch": 0.8808, "step": 2202, "tokens_per_device": 2776 }, { "epoch": 0.8808, "loss_ce": 0.42893871665000916, "loss_lvr": 0.8200733065605164, "loss_mode_switch": 0.0, "loss_total": 0.5109460353851318, "step": 2202 }, { "batch_size": 1, "epoch": 0.8808, "step": 2202, "tokens_per_device": 5084 }, { "epoch": 0.8808, "loss_ce": 0.00024808308808133006, "loss_lvr": 0.44293415546417236, "loss_mode_switch": 0.0, "loss_total": 0.04454149678349495, "step": 2202 }, { "batch_size": 1, "epoch": 0.8808, "step": 2202, "tokens_per_device": 5181 }, { "epoch": 0.8808, "loss_ce": 0.004154110327363014, "loss_lvr": 0.3576662540435791, "loss_mode_switch": 0.0, "loss_total": 0.03992073982954025, "step": 2202 }, { "batch_size": 4, "epoch": 0.8808, "step": 2202, "tokens_per_device": 3792 }, { "epoch": 0.8808, "loss_ce": 0.504472017288208, "loss_lvr": 0.9367093443870544, "loss_mode_switch": 0.0, "loss_total": 0.5981429815292358, "step": 2202 }, { "batch_size": 4, "epoch": 0.8808, "step": 2202, "tokens_per_device": 9720 }, { "epoch": 0.8808, "loss_ce": 0.14147500693798065, "loss_lvr": 0.4284020662307739, "loss_mode_switch": 0.0, "loss_total": 0.18431521952152252, "step": 2202 }, { "batch_size": 1, "epoch": 0.8808, "step": 2202, "tokens_per_device": 4878 }, { "epoch": 0.8808, "loss_ce": 0.009163248352706432, "loss_lvr": 0.21706891059875488, "loss_mode_switch": 0.0, "loss_total": 0.030870139598846436, "step": 2202 }, { "batch_size": 1, "epoch": 0.8808, "step": 2202, "tokens_per_device": 7394 }, { "epoch": 0.8808, "loss_ce": 0.002588007366284728, "loss_lvr": 0.23054619133472443, "loss_mode_switch": 0.0, "loss_total": 0.025642627850174904, "step": 2202 }, { "batch_size": 4, "epoch": 0.8808, "step": 2202, "tokens_per_device": 4516 }, { "epoch": 0.8808, "loss_ce": 0.35797253251075745, "loss_lvr": 0.8524971604347229, "loss_mode_switch": 0.0, "loss_total": 0.4432222545146942, "step": 2202 }, { "epoch": 0.8812, "grad_norm": 1.190878987312317, "learning_rate": 3.655650520936638e-07, "loss": 0.2401, "step": 2203 }, { "batch_size": 4, "epoch": 0.8812, "step": 2203, "tokens_per_device": 1528 }, { "epoch": 0.8812, "loss_ce": 0.40073153376579285, "loss_lvr": 0.9130445718765259, "loss_mode_switch": 0.0, "loss_total": 0.49203598499298096, "step": 2203 }, { "batch_size": 1, "epoch": 0.8812, "step": 2203, "tokens_per_device": 4873 }, { "epoch": 0.8812, "loss_ce": 0.07118397206068039, "loss_lvr": 0.3451530933380127, "loss_mode_switch": 0.0, "loss_total": 0.10569928586483002, "step": 2203 }, { "batch_size": 4, "epoch": 0.8812, "step": 2203, "tokens_per_device": 5200 }, { "epoch": 0.8812, "loss_ce": 0.10712293535470963, "loss_lvr": 0.7356175184249878, "loss_mode_switch": 0.0, "loss_total": 0.18068468570709229, "step": 2203 }, { "batch_size": 4, "epoch": 0.8812, "step": 2203, "tokens_per_device": 5136 }, { "epoch": 0.8812, "loss_ce": 0.32886895537376404, "loss_lvr": 0.5274025797843933, "loss_mode_switch": 0.0, "loss_total": 0.3816092014312744, "step": 2203 }, { "batch_size": 4, "epoch": 0.8812, "step": 2203, "tokens_per_device": 4244 }, { "epoch": 0.8812, "loss_ce": 0.13005352020263672, "loss_lvr": 1.0944405794143677, "loss_mode_switch": 0.0, "loss_total": 0.239497572183609, "step": 2203 }, { "batch_size": 4, "epoch": 0.8812, "step": 2203, "tokens_per_device": 2680 }, { "epoch": 0.8812, "loss_ce": 0.09588488936424255, "loss_lvr": 0.9073526859283447, "loss_mode_switch": 0.0, "loss_total": 0.18662016093730927, "step": 2203 }, { "batch_size": 4, "epoch": 0.8812, "step": 2203, "tokens_per_device": 4156 }, { "epoch": 0.8812, "loss_ce": 0.4104154706001282, "loss_lvr": 0.8756595849990845, "loss_mode_switch": 0.0, "loss_total": 0.4979814291000366, "step": 2203 }, { "batch_size": 4, "epoch": 0.8812, "step": 2203, "tokens_per_device": 3856 }, { "epoch": 0.8812, "loss_ce": 0.21782709658145905, "loss_lvr": 0.7509392499923706, "loss_mode_switch": 0.0, "loss_total": 0.2929210066795349, "step": 2203 }, { "epoch": 0.8816, "grad_norm": 1.210071086883545, "learning_rate": 3.6313766958995635e-07, "loss": 0.2551, "step": 2204 }, { "batch_size": 4, "epoch": 0.8816, "step": 2204, "tokens_per_device": 2736 }, { "epoch": 0.8816, "loss_ce": 0.20363788306713104, "loss_lvr": 0.6500769257545471, "loss_mode_switch": 0.0, "loss_total": 0.26864558458328247, "step": 2204 }, { "batch_size": 4, "epoch": 0.8816, "step": 2204, "tokens_per_device": 5740 }, { "epoch": 0.8816, "loss_ce": 0.36275365948677063, "loss_lvr": 1.020477056503296, "loss_mode_switch": 0.0, "loss_total": 0.4648013710975647, "step": 2204 }, { "batch_size": 4, "epoch": 0.8816, "step": 2204, "tokens_per_device": 2524 }, { "epoch": 0.8816, "loss_ce": 0.2800833284854889, "loss_lvr": 0.9888253808021545, "loss_mode_switch": 0.0, "loss_total": 0.3789658546447754, "step": 2204 }, { "batch_size": 4, "epoch": 0.8816, "step": 2204, "tokens_per_device": 5888 }, { "epoch": 0.8816, "loss_ce": 0.05348058417439461, "loss_lvr": 1.3149909973144531, "loss_mode_switch": 0.0, "loss_total": 0.18497967720031738, "step": 2204 }, { "batch_size": 4, "epoch": 0.8816, "step": 2204, "tokens_per_device": 4208 }, { "epoch": 0.8816, "loss_ce": 0.06670490652322769, "loss_lvr": 0.830661416053772, "loss_mode_switch": 0.0, "loss_total": 0.149771049618721, "step": 2204 }, { "batch_size": 1, "epoch": 0.8816, "step": 2204, "tokens_per_device": 4880 }, { "epoch": 0.8816, "loss_ce": 0.13653792440891266, "loss_lvr": 0.3654095530509949, "loss_mode_switch": 0.0, "loss_total": 0.17307887971401215, "step": 2204 }, { "batch_size": 4, "epoch": 0.8816, "step": 2204, "tokens_per_device": 5884 }, { "epoch": 0.8816, "loss_ce": 0.32902705669403076, "loss_lvr": 0.8425028324127197, "loss_mode_switch": 0.0, "loss_total": 0.4132773280143738, "step": 2204 }, { "batch_size": 1, "epoch": 0.8816, "step": 2204, "tokens_per_device": 4846 }, { "epoch": 0.8816, "loss_ce": 0.0691651850938797, "loss_lvr": 0.9625586271286011, "loss_mode_switch": 0.0, "loss_total": 0.1654210388660431, "step": 2204 }, { "epoch": 0.882, "grad_norm": 1.34591543674469, "learning_rate": 3.60718069250639e-07, "loss": 0.2912, "step": 2205 }, { "batch_size": 1, "epoch": 0.882, "step": 2205, "tokens_per_device": 5033 }, { "epoch": 0.882, "loss_ce": 0.1917642503976822, "loss_lvr": 0.379719078540802, "loss_mode_switch": 0.0, "loss_total": 0.22973616421222687, "step": 2205 }, { "batch_size": 4, "epoch": 0.882, "step": 2205, "tokens_per_device": 4280 }, { "epoch": 0.882, "loss_ce": 0.3912985324859619, "loss_lvr": 0.8938463926315308, "loss_mode_switch": 0.0, "loss_total": 0.48068317770957947, "step": 2205 }, { "batch_size": 4, "epoch": 0.882, "step": 2205, "tokens_per_device": 3948 }, { "epoch": 0.882, "loss_ce": 0.019006868824362755, "loss_lvr": 0.9684997200965881, "loss_mode_switch": 0.0, "loss_total": 0.1158568412065506, "step": 2205 }, { "batch_size": 4, "epoch": 0.882, "step": 2205, "tokens_per_device": 1600 }, { "epoch": 0.882, "loss_ce": 0.23385387659072876, "loss_lvr": 1.3767573833465576, "loss_mode_switch": 0.0, "loss_total": 0.37152963876724243, "step": 2205 }, { "batch_size": 4, "epoch": 0.882, "step": 2205, "tokens_per_device": 1352 }, { "epoch": 0.882, "loss_ce": 0.3172288239002228, "loss_lvr": 1.000870943069458, "loss_mode_switch": 0.0, "loss_total": 0.41731593012809753, "step": 2205 }, { "batch_size": 4, "epoch": 0.882, "step": 2205, "tokens_per_device": 2708 }, { "epoch": 0.882, "loss_ce": 0.2393961250782013, "loss_lvr": 0.8102550506591797, "loss_mode_switch": 0.0, "loss_total": 0.32042163610458374, "step": 2205 }, { "batch_size": 4, "epoch": 0.882, "step": 2205, "tokens_per_device": 6964 }, { "epoch": 0.882, "loss_ce": 0.018848750740289688, "loss_lvr": 0.47832340002059937, "loss_mode_switch": 0.0, "loss_total": 0.06668108701705933, "step": 2205 }, { "batch_size": 4, "epoch": 0.882, "step": 2205, "tokens_per_device": 1432 }, { "epoch": 0.882, "loss_ce": 0.14890801906585693, "loss_lvr": 1.7130845785140991, "loss_mode_switch": 0.0, "loss_total": 0.32021647691726685, "step": 2205 }, { "epoch": 0.8824, "grad_norm": 1.4228103160858154, "learning_rate": 3.5830625513658677e-07, "loss": 0.274, "step": 2206 }, { "batch_size": 4, "epoch": 0.8824, "step": 2206, "tokens_per_device": 8292 }, { "epoch": 0.8824, "loss_ce": 0.3529469668865204, "loss_lvr": 0.640696108341217, "loss_mode_switch": 0.0, "loss_total": 0.41701656579971313, "step": 2206 }, { "batch_size": 4, "epoch": 0.8824, "step": 2206, "tokens_per_device": 5284 }, { "epoch": 0.8824, "loss_ce": 0.5701221227645874, "loss_lvr": 0.770274817943573, "loss_mode_switch": 0.0, "loss_total": 0.6471496224403381, "step": 2206 }, { "batch_size": 4, "epoch": 0.8824, "step": 2206, "tokens_per_device": 5840 }, { "epoch": 0.8824, "loss_ce": 0.017926378175616264, "loss_lvr": 0.8299660086631775, "loss_mode_switch": 0.0, "loss_total": 0.10092297941446304, "step": 2206 }, { "batch_size": 4, "epoch": 0.8824, "step": 2206, "tokens_per_device": 4464 }, { "epoch": 0.8824, "loss_ce": 0.10934324562549591, "loss_lvr": 0.8571597337722778, "loss_mode_switch": 0.0, "loss_total": 0.19505921006202698, "step": 2206 }, { "batch_size": 1, "epoch": 0.8824, "step": 2206, "tokens_per_device": 5023 }, { "epoch": 0.8824, "loss_ce": 0.05479559302330017, "loss_lvr": 0.18906880915164948, "loss_mode_switch": 0.0, "loss_total": 0.07370247691869736, "step": 2206 }, { "batch_size": 4, "epoch": 0.8824, "step": 2206, "tokens_per_device": 4260 }, { "epoch": 0.8824, "loss_ce": 0.3424874246120453, "loss_lvr": 0.7938520908355713, "loss_mode_switch": 0.0, "loss_total": 0.42187264561653137, "step": 2206 }, { "batch_size": 1, "epoch": 0.8824, "step": 2206, "tokens_per_device": 4760 }, { "epoch": 0.8824, "loss_ce": 0.715160608291626, "loss_lvr": 0.6822258830070496, "loss_mode_switch": 0.0, "loss_total": 0.7833831906318665, "step": 2206 }, { "batch_size": 1, "epoch": 0.8824, "step": 2206, "tokens_per_device": 4978 }, { "epoch": 0.8824, "loss_ce": 0.24914591014385223, "loss_lvr": 0.6422994136810303, "loss_mode_switch": 0.0, "loss_total": 0.313375860452652, "step": 2206 }, { "epoch": 0.8828, "grad_norm": 1.3888461589813232, "learning_rate": 3.5590223129561063e-07, "loss": 0.2818, "step": 2207 }, { "batch_size": 4, "epoch": 0.8828, "step": 2207, "tokens_per_device": 15888 }, { "epoch": 0.8828, "loss_ce": 0.10806449502706528, "loss_lvr": 0.49394381046295166, "loss_mode_switch": 0.0, "loss_total": 0.15745887160301208, "step": 2207 }, { "batch_size": 4, "epoch": 0.8828, "step": 2207, "tokens_per_device": 4948 }, { "epoch": 0.8828, "loss_ce": 0.028177622705698013, "loss_lvr": 0.7662709951400757, "loss_mode_switch": 0.0, "loss_total": 0.10480472445487976, "step": 2207 }, { "batch_size": 1, "epoch": 0.8828, "step": 2207, "tokens_per_device": 4829 }, { "epoch": 0.8828, "loss_ce": 0.014570266008377075, "loss_lvr": 0.31691551208496094, "loss_mode_switch": 0.0, "loss_total": 0.04626181721687317, "step": 2207 }, { "batch_size": 4, "epoch": 0.8828, "step": 2207, "tokens_per_device": 1328 }, { "epoch": 0.8828, "loss_ce": 0.38748255372047424, "loss_lvr": 0.9994208812713623, "loss_mode_switch": 0.0, "loss_total": 0.4874246418476105, "step": 2207 }, { "batch_size": 4, "epoch": 0.8828, "step": 2207, "tokens_per_device": 5320 }, { "epoch": 0.8828, "loss_ce": 0.49168121814727783, "loss_lvr": 0.7734532952308655, "loss_mode_switch": 0.0, "loss_total": 0.569026529788971, "step": 2207 }, { "batch_size": 4, "epoch": 0.8828, "step": 2207, "tokens_per_device": 2704 }, { "epoch": 0.8828, "loss_ce": 0.2911171019077301, "loss_lvr": 0.7223087549209595, "loss_mode_switch": 0.0, "loss_total": 0.36334797739982605, "step": 2207 }, { "batch_size": 1, "epoch": 0.8828, "step": 2207, "tokens_per_device": 4268 }, { "epoch": 0.8828, "loss_ce": 0.35252872109413147, "loss_lvr": 0.8834035396575928, "loss_mode_switch": 0.0, "loss_total": 0.4408690929412842, "step": 2207 }, { "batch_size": 4, "epoch": 0.8828, "step": 2207, "tokens_per_device": 4460 }, { "epoch": 0.8828, "loss_ce": 0.5041566491127014, "loss_lvr": 0.7638320326805115, "loss_mode_switch": 0.0, "loss_total": 0.5805398225784302, "step": 2207 }, { "epoch": 0.8832, "grad_norm": 1.3888614177703857, "learning_rate": 3.535060017624453e-07, "loss": 0.2993, "step": 2208 }, { "batch_size": 1, "epoch": 0.8832, "step": 2208, "tokens_per_device": 5187 }, { "epoch": 0.8832, "loss_ce": 0.13503752648830414, "loss_lvr": 0.3457147777080536, "loss_mode_switch": 0.0, "loss_total": 0.16960901021957397, "step": 2208 }, { "batch_size": 1, "epoch": 0.8832, "step": 2208, "tokens_per_device": 6166 }, { "epoch": 0.8832, "loss_ce": 0.0007023611688055098, "loss_lvr": 0.3743911683559418, "loss_mode_switch": 0.0, "loss_total": 0.03814148157835007, "step": 2208 }, { "batch_size": 4, "epoch": 0.8832, "step": 2208, "tokens_per_device": 8572 }, { "epoch": 0.8832, "loss_ce": 0.14329273998737335, "loss_lvr": 1.0860987901687622, "loss_mode_switch": 0.0, "loss_total": 0.25190261006355286, "step": 2208 }, { "batch_size": 4, "epoch": 0.8832, "step": 2208, "tokens_per_device": 3768 }, { "epoch": 0.8832, "loss_ce": 0.1815931349992752, "loss_lvr": 1.1675729751586914, "loss_mode_switch": 0.0, "loss_total": 0.29835042357444763, "step": 2208 }, { "batch_size": 4, "epoch": 0.8832, "step": 2208, "tokens_per_device": 2612 }, { "epoch": 0.8832, "loss_ce": 0.4612848162651062, "loss_lvr": 0.873244047164917, "loss_mode_switch": 0.0, "loss_total": 0.54860919713974, "step": 2208 }, { "batch_size": 4, "epoch": 0.8832, "step": 2208, "tokens_per_device": 4176 }, { "epoch": 0.8832, "loss_ce": 0.29439717531204224, "loss_lvr": 1.0702372789382935, "loss_mode_switch": 0.0, "loss_total": 0.4014208912849426, "step": 2208 }, { "batch_size": 1, "epoch": 0.8832, "step": 2208, "tokens_per_device": 4989 }, { "epoch": 0.8832, "loss_ce": 0.010440818965435028, "loss_lvr": 0.5743451714515686, "loss_mode_switch": 0.0, "loss_total": 0.06787534058094025, "step": 2208 }, { "batch_size": 4, "epoch": 0.8832, "step": 2208, "tokens_per_device": 2420 }, { "epoch": 0.8832, "loss_ce": 0.036590952426195145, "loss_lvr": 0.953924834728241, "loss_mode_switch": 0.0, "loss_total": 0.1319834440946579, "step": 2208 }, { "epoch": 0.8836, "grad_norm": 1.1979120969772339, "learning_rate": 3.511175705587433e-07, "loss": 0.275, "step": 2209 }, { "batch_size": 4, "epoch": 0.8836, "step": 2209, "tokens_per_device": 5984 }, { "epoch": 0.8836, "loss_ce": 0.22700777649879456, "loss_lvr": 0.5318645238876343, "loss_mode_switch": 0.0, "loss_total": 0.2801942229270935, "step": 2209 }, { "batch_size": 1, "epoch": 0.8836, "step": 2209, "tokens_per_device": 5008 }, { "epoch": 0.8836, "loss_ce": 0.0021001631394028664, "loss_lvr": 0.7230136394500732, "loss_mode_switch": 0.0, "loss_total": 0.07440152764320374, "step": 2209 }, { "batch_size": 4, "epoch": 0.8836, "step": 2209, "tokens_per_device": 4896 }, { "epoch": 0.8836, "loss_ce": 0.266976535320282, "loss_lvr": 1.0002645254135132, "loss_mode_switch": 0.0, "loss_total": 0.3670029938220978, "step": 2209 }, { "batch_size": 4, "epoch": 0.8836, "step": 2209, "tokens_per_device": 4252 }, { "epoch": 0.8836, "loss_ce": 0.06423298269510269, "loss_lvr": 1.7481186389923096, "loss_mode_switch": 0.0, "loss_total": 0.23904484510421753, "step": 2209 }, { "batch_size": 4, "epoch": 0.8836, "step": 2209, "tokens_per_device": 4496 }, { "epoch": 0.8836, "loss_ce": 0.24334338307380676, "loss_lvr": 0.7472488284111023, "loss_mode_switch": 0.0, "loss_total": 0.318068265914917, "step": 2209 }, { "batch_size": 4, "epoch": 0.8836, "step": 2209, "tokens_per_device": 1544 }, { "epoch": 0.8836, "loss_ce": 0.22132264077663422, "loss_lvr": 1.0860027074813843, "loss_mode_switch": 0.0, "loss_total": 0.3299229145050049, "step": 2209 }, { "batch_size": 4, "epoch": 0.8836, "step": 2209, "tokens_per_device": 4212 }, { "epoch": 0.8836, "loss_ce": 0.34963247179985046, "loss_lvr": 0.8399769067764282, "loss_mode_switch": 0.0, "loss_total": 0.43363016843795776, "step": 2209 }, { "batch_size": 4, "epoch": 0.8836, "step": 2209, "tokens_per_device": 1380 }, { "epoch": 0.8836, "loss_ce": 0.42295777797698975, "loss_lvr": 0.8810240626335144, "loss_mode_switch": 0.0, "loss_total": 0.5110601782798767, "step": 2209 }, { "epoch": 0.884, "grad_norm": 1.1822303533554077, "learning_rate": 3.4873694169306915e-07, "loss": 0.2496, "step": 2210 }, { "batch_size": 1, "epoch": 0.884, "step": 2210, "tokens_per_device": 5108 }, { "epoch": 0.884, "loss_ce": 0.00024965862394310534, "loss_lvr": 0.7133530974388123, "loss_mode_switch": 0.0, "loss_total": 0.07158496975898743, "step": 2210 }, { "batch_size": 4, "epoch": 0.884, "step": 2210, "tokens_per_device": 1500 }, { "epoch": 0.884, "loss_ce": 0.7302884459495544, "loss_lvr": 0.9300349950790405, "loss_mode_switch": 0.0, "loss_total": 0.8232919573783875, "step": 2210 }, { "batch_size": 4, "epoch": 0.884, "step": 2210, "tokens_per_device": 2828 }, { "epoch": 0.884, "loss_ce": 0.45553359389305115, "loss_lvr": 0.6811990141868591, "loss_mode_switch": 0.0, "loss_total": 0.523653507232666, "step": 2210 }, { "batch_size": 4, "epoch": 0.884, "step": 2210, "tokens_per_device": 10816 }, { "epoch": 0.884, "loss_ce": 0.004729755222797394, "loss_lvr": 0.6757575869560242, "loss_mode_switch": 0.0, "loss_total": 0.07230551540851593, "step": 2210 }, { "batch_size": 1, "epoch": 0.884, "step": 2210, "tokens_per_device": 5268 }, { "epoch": 0.884, "loss_ce": 0.013103334233164787, "loss_lvr": 0.3598754107952118, "loss_mode_switch": 0.0, "loss_total": 0.049090877175331116, "step": 2210 }, { "batch_size": 1, "epoch": 0.884, "step": 2210, "tokens_per_device": 4865 }, { "epoch": 0.884, "loss_ce": 0.014507800340652466, "loss_lvr": 0.22376975417137146, "loss_mode_switch": 0.0, "loss_total": 0.03688477724790573, "step": 2210 }, { "batch_size": 4, "epoch": 0.884, "step": 2210, "tokens_per_device": 1632 }, { "epoch": 0.884, "loss_ce": 0.6558051109313965, "loss_lvr": 0.9755916595458984, "loss_mode_switch": 0.0, "loss_total": 0.7533642649650574, "step": 2210 }, { "batch_size": 4, "epoch": 0.884, "step": 2210, "tokens_per_device": 4944 }, { "epoch": 0.884, "loss_ce": 0.40080147981643677, "loss_lvr": 0.8178048133850098, "loss_mode_switch": 0.0, "loss_total": 0.4825819730758667, "step": 2210 }, { "epoch": 0.8844, "grad_norm": 1.189168095588684, "learning_rate": 3.4636411916089465e-07, "loss": 0.2664, "step": 2211 }, { "batch_size": 4, "epoch": 0.8844, "step": 2211, "tokens_per_device": 1376 }, { "epoch": 0.8844, "loss_ce": 0.1881694495677948, "loss_lvr": 0.9563844203948975, "loss_mode_switch": 0.0, "loss_total": 0.2838079035282135, "step": 2211 }, { "batch_size": 4, "epoch": 0.8844, "step": 2211, "tokens_per_device": 4720 }, { "epoch": 0.8844, "loss_ce": 0.06149208918213844, "loss_lvr": 0.7864037156105042, "loss_mode_switch": 0.0, "loss_total": 0.14013245701789856, "step": 2211 }, { "batch_size": 4, "epoch": 0.8844, "step": 2211, "tokens_per_device": 3784 }, { "epoch": 0.8844, "loss_ce": 0.2754822373390198, "loss_lvr": 0.8269585967063904, "loss_mode_switch": 0.0, "loss_total": 0.35817810893058777, "step": 2211 }, { "batch_size": 1, "epoch": 0.8844, "step": 2211, "tokens_per_device": 4741 }, { "epoch": 0.8844, "loss_ce": 0.0009712025057524443, "loss_lvr": 0.6217133402824402, "loss_mode_switch": 0.0, "loss_total": 0.06314253807067871, "step": 2211 }, { "batch_size": 4, "epoch": 0.8844, "step": 2211, "tokens_per_device": 10936 }, { "epoch": 0.8844, "loss_ce": 0.04910871759057045, "loss_lvr": 0.6816779971122742, "loss_mode_switch": 0.0, "loss_total": 0.11727651953697205, "step": 2211 }, { "batch_size": 4, "epoch": 0.8844, "step": 2211, "tokens_per_device": 2732 }, { "epoch": 0.8844, "loss_ce": 0.6588559150695801, "loss_lvr": 0.716755211353302, "loss_mode_switch": 0.0, "loss_total": 0.7305314540863037, "step": 2211 }, { "batch_size": 4, "epoch": 0.8844, "step": 2211, "tokens_per_device": 14128 }, { "epoch": 0.8844, "loss_ce": 0.5772200226783752, "loss_lvr": 0.819746732711792, "loss_mode_switch": 0.0, "loss_total": 0.6591947078704834, "step": 2211 }, { "batch_size": 1, "epoch": 0.8844, "step": 2211, "tokens_per_device": 5110 }, { "epoch": 0.8844, "loss_ce": 0.021596934646368027, "loss_lvr": 0.22915051877498627, "loss_mode_switch": 0.0, "loss_total": 0.04451198875904083, "step": 2211 }, { "epoch": 0.8848, "grad_norm": 1.3283355236053467, "learning_rate": 3.4399910694458583e-07, "loss": 0.3012, "step": 2212 }, { "batch_size": 4, "epoch": 0.8848, "step": 2212, "tokens_per_device": 1384 }, { "epoch": 0.8848, "loss_ce": 0.4869835674762726, "loss_lvr": 0.8917767405509949, "loss_mode_switch": 0.0, "loss_total": 0.57616126537323, "step": 2212 }, { "batch_size": 4, "epoch": 0.8848, "step": 2212, "tokens_per_device": 4880 }, { "epoch": 0.8848, "loss_ce": 0.3242635726928711, "loss_lvr": 0.725492000579834, "loss_mode_switch": 0.0, "loss_total": 0.39681276679039, "step": 2212 }, { "batch_size": 4, "epoch": 0.8848, "step": 2212, "tokens_per_device": 3768 }, { "epoch": 0.8848, "loss_ce": 0.6074742078781128, "loss_lvr": 0.6428474187850952, "loss_mode_switch": 0.0, "loss_total": 0.6717589497566223, "step": 2212 }, { "batch_size": 1, "epoch": 0.8848, "step": 2212, "tokens_per_device": 4671 }, { "epoch": 0.8848, "loss_ce": 0.0634738951921463, "loss_lvr": 0.4367387890815735, "loss_mode_switch": 0.0, "loss_total": 0.10714777559041977, "step": 2212 }, { "batch_size": 4, "epoch": 0.8848, "step": 2212, "tokens_per_device": 2584 }, { "epoch": 0.8848, "loss_ce": 0.1476229578256607, "loss_lvr": 0.9286471605300903, "loss_mode_switch": 0.0, "loss_total": 0.24048766493797302, "step": 2212 }, { "batch_size": 1, "epoch": 0.8848, "step": 2212, "tokens_per_device": 4887 }, { "epoch": 0.8848, "loss_ce": 0.022378109395503998, "loss_lvr": 0.4232613146305084, "loss_mode_switch": 0.0, "loss_total": 0.06470423936843872, "step": 2212 }, { "batch_size": 1, "epoch": 0.8848, "step": 2212, "tokens_per_device": 5068 }, { "epoch": 0.8848, "loss_ce": 0.0936511978507042, "loss_lvr": 0.27116265892982483, "loss_mode_switch": 0.0, "loss_total": 0.12076746672391891, "step": 2212 }, { "batch_size": 4, "epoch": 0.8848, "step": 2212, "tokens_per_device": 2668 }, { "epoch": 0.8848, "loss_ce": 0.4510994851589203, "loss_lvr": 0.7377308011054993, "loss_mode_switch": 0.0, "loss_total": 0.5248725414276123, "step": 2212 }, { "epoch": 0.8852, "grad_norm": 1.2831225395202637, "learning_rate": 3.41641909013406e-07, "loss": 0.2905, "step": 2213 }, { "batch_size": 4, "epoch": 0.8852, "step": 2213, "tokens_per_device": 7040 }, { "epoch": 0.8852, "loss_ce": 0.4028494954109192, "loss_lvr": 0.6589316129684448, "loss_mode_switch": 0.0, "loss_total": 0.4687426686286926, "step": 2213 }, { "batch_size": 4, "epoch": 0.8852, "step": 2213, "tokens_per_device": 3800 }, { "epoch": 0.8852, "loss_ce": 0.1417178511619568, "loss_lvr": 1.095747709274292, "loss_mode_switch": 0.0, "loss_total": 0.2512926161289215, "step": 2213 }, { "batch_size": 4, "epoch": 0.8852, "step": 2213, "tokens_per_device": 3956 }, { "epoch": 0.8852, "loss_ce": 0.3467434048652649, "loss_lvr": 0.8544236421585083, "loss_mode_switch": 0.0, "loss_total": 0.4321857690811157, "step": 2213 }, { "batch_size": 4, "epoch": 0.8852, "step": 2213, "tokens_per_device": 3788 }, { "epoch": 0.8852, "loss_ce": 0.1402999609708786, "loss_lvr": 0.8565357327461243, "loss_mode_switch": 0.0, "loss_total": 0.22595353424549103, "step": 2213 }, { "batch_size": 1, "epoch": 0.8852, "step": 2213, "tokens_per_device": 5044 }, { "epoch": 0.8852, "loss_ce": 0.0014131047064438462, "loss_lvr": 0.4295673370361328, "loss_mode_switch": 0.0, "loss_total": 0.04436983913183212, "step": 2213 }, { "batch_size": 1, "epoch": 0.8852, "step": 2213, "tokens_per_device": 4908 }, { "epoch": 0.8852, "loss_ce": 0.04539189487695694, "loss_lvr": 0.259896457195282, "loss_mode_switch": 0.0, "loss_total": 0.07138153910636902, "step": 2213 }, { "batch_size": 4, "epoch": 0.8852, "step": 2213, "tokens_per_device": 4212 }, { "epoch": 0.8852, "loss_ce": 0.148644357919693, "loss_lvr": 1.1758109331130981, "loss_mode_switch": 0.0, "loss_total": 0.2662254571914673, "step": 2213 }, { "batch_size": 1, "epoch": 0.8852, "step": 2213, "tokens_per_device": 5231 }, { "epoch": 0.8852, "loss_ce": 0.21252427995204926, "loss_lvr": 0.23908156156539917, "loss_mode_switch": 0.0, "loss_total": 0.23643243312835693, "step": 2213 }, { "epoch": 0.8856, "grad_norm": 1.491336703300476, "learning_rate": 3.3929252932349944e-07, "loss": 0.323, "step": 2214 }, { "batch_size": 4, "epoch": 0.8856, "step": 2214, "tokens_per_device": 4052 }, { "epoch": 0.8856, "loss_ce": 0.008836221881210804, "loss_lvr": 0.6741847991943359, "loss_mode_switch": 0.0, "loss_total": 0.076254703104496, "step": 2214 }, { "batch_size": 4, "epoch": 0.8856, "step": 2214, "tokens_per_device": 5732 }, { "epoch": 0.8856, "loss_ce": 0.3521358072757721, "loss_lvr": 0.7767508625984192, "loss_mode_switch": 0.0, "loss_total": 0.42981088161468506, "step": 2214 }, { "batch_size": 1, "epoch": 0.8856, "step": 2214, "tokens_per_device": 4859 }, { "epoch": 0.8856, "loss_ce": 0.038624320179224014, "loss_lvr": 0.27911409735679626, "loss_mode_switch": 0.0, "loss_total": 0.06653572618961334, "step": 2214 }, { "batch_size": 4, "epoch": 0.8856, "step": 2214, "tokens_per_device": 1492 }, { "epoch": 0.8856, "loss_ce": 0.30283868312835693, "loss_lvr": 0.9819653630256653, "loss_mode_switch": 0.0, "loss_total": 0.40103521943092346, "step": 2214 }, { "batch_size": 4, "epoch": 0.8856, "step": 2214, "tokens_per_device": 4244 }, { "epoch": 0.8856, "loss_ce": 0.25614774227142334, "loss_lvr": 0.8665202856063843, "loss_mode_switch": 0.0, "loss_total": 0.3427997827529907, "step": 2214 }, { "batch_size": 4, "epoch": 0.8856, "step": 2214, "tokens_per_device": 1608 }, { "epoch": 0.8856, "loss_ce": 0.40579885244369507, "loss_lvr": 0.9512938857078552, "loss_mode_switch": 0.0, "loss_total": 0.5009282231330872, "step": 2214 }, { "batch_size": 4, "epoch": 0.8856, "step": 2214, "tokens_per_device": 1460 }, { "epoch": 0.8856, "loss_ce": 1.075191617012024, "loss_lvr": 0.8500735759735107, "loss_mode_switch": 0.0, "loss_total": 1.1601989269256592, "step": 2214 }, { "batch_size": 4, "epoch": 0.8856, "step": 2214, "tokens_per_device": 7372 }, { "epoch": 0.8856, "loss_ce": 0.5038102269172668, "loss_lvr": 0.8272532224655151, "loss_mode_switch": 0.0, "loss_total": 0.5865355730056763, "step": 2214 }, { "epoch": 0.886, "grad_norm": 1.2439806461334229, "learning_rate": 3.369509718178887e-07, "loss": 0.3208, "step": 2215 }, { "batch_size": 4, "epoch": 0.886, "step": 2215, "tokens_per_device": 4244 }, { "epoch": 0.886, "loss_ce": 0.07290566712617874, "loss_lvr": 0.8018580675125122, "loss_mode_switch": 0.0, "loss_total": 0.15309147536754608, "step": 2215 }, { "batch_size": 1, "epoch": 0.886, "step": 2215, "tokens_per_device": 5166 }, { "epoch": 0.886, "loss_ce": 0.13095735013484955, "loss_lvr": 0.3134172260761261, "loss_mode_switch": 0.0, "loss_total": 0.16229906678199768, "step": 2215 }, { "batch_size": 4, "epoch": 0.886, "step": 2215, "tokens_per_device": 5312 }, { "epoch": 0.886, "loss_ce": 0.8098175525665283, "loss_lvr": 0.7364159226417542, "loss_mode_switch": 0.0, "loss_total": 0.8834591507911682, "step": 2215 }, { "batch_size": 1, "epoch": 0.886, "step": 2215, "tokens_per_device": 5322 }, { "epoch": 0.886, "loss_ce": 0.02781156823039055, "loss_lvr": 0.25758063793182373, "loss_mode_switch": 0.0, "loss_total": 0.05356962978839874, "step": 2215 }, { "batch_size": 4, "epoch": 0.886, "step": 2215, "tokens_per_device": 4440 }, { "epoch": 0.886, "loss_ce": 0.09312981367111206, "loss_lvr": 0.710670530796051, "loss_mode_switch": 0.0, "loss_total": 0.16419687867164612, "step": 2215 }, { "batch_size": 4, "epoch": 0.886, "step": 2215, "tokens_per_device": 1400 }, { "epoch": 0.886, "loss_ce": 0.36911579966545105, "loss_lvr": 0.8354796171188354, "loss_mode_switch": 0.0, "loss_total": 0.452663779258728, "step": 2215 }, { "batch_size": 1, "epoch": 0.886, "step": 2215, "tokens_per_device": 4883 }, { "epoch": 0.886, "loss_ce": 0.21730318665504456, "loss_lvr": 0.2808256149291992, "loss_mode_switch": 0.0, "loss_total": 0.24538575112819672, "step": 2215 }, { "batch_size": 4, "epoch": 0.886, "step": 2215, "tokens_per_device": 4856 }, { "epoch": 0.886, "loss_ce": 0.45453906059265137, "loss_lvr": 0.667937159538269, "loss_mode_switch": 0.0, "loss_total": 0.5213328003883362, "step": 2215 }, { "epoch": 0.8864, "grad_norm": 1.3857226371765137, "learning_rate": 3.3461724042647136e-07, "loss": 0.268, "step": 2216 }, { "batch_size": 4, "epoch": 0.8864, "step": 2216, "tokens_per_device": 2572 }, { "epoch": 0.8864, "loss_ce": 0.0069900271482765675, "loss_lvr": 0.8567813634872437, "loss_mode_switch": 0.0, "loss_total": 0.09266816824674606, "step": 2216 }, { "batch_size": 4, "epoch": 0.8864, "step": 2216, "tokens_per_device": 4684 }, { "epoch": 0.8864, "loss_ce": 0.05908474326133728, "loss_lvr": 0.6588752269744873, "loss_mode_switch": 0.0, "loss_total": 0.12497226893901825, "step": 2216 }, { "batch_size": 4, "epoch": 0.8864, "step": 2216, "tokens_per_device": 12744 }, { "epoch": 0.8864, "loss_ce": 0.11091826111078262, "loss_lvr": 0.7014056444168091, "loss_mode_switch": 0.0, "loss_total": 0.1810588240623474, "step": 2216 }, { "batch_size": 4, "epoch": 0.8864, "step": 2216, "tokens_per_device": 3416 }, { "epoch": 0.8864, "loss_ce": 0.02275659330189228, "loss_lvr": 0.9153329730033875, "loss_mode_switch": 0.0, "loss_total": 0.11428988724946976, "step": 2216 }, { "batch_size": 4, "epoch": 0.8864, "step": 2216, "tokens_per_device": 11036 }, { "epoch": 0.8864, "loss_ce": 0.015804823487997055, "loss_lvr": 0.6666153073310852, "loss_mode_switch": 0.0, "loss_total": 0.08246634900569916, "step": 2216 }, { "batch_size": 1, "epoch": 0.8864, "step": 2216, "tokens_per_device": 5128 }, { "epoch": 0.8864, "loss_ce": 0.0071251592598855495, "loss_lvr": 0.3401387631893158, "loss_mode_switch": 0.0, "loss_total": 0.041139036417007446, "step": 2216 }, { "batch_size": 4, "epoch": 0.8864, "step": 2216, "tokens_per_device": 1304 }, { "epoch": 0.8864, "loss_ce": 0.3348965644836426, "loss_lvr": 1.2924513816833496, "loss_mode_switch": 0.0, "loss_total": 0.46414172649383545, "step": 2216 }, { "batch_size": 4, "epoch": 0.8864, "step": 2216, "tokens_per_device": 2952 }, { "epoch": 0.8864, "loss_ce": 0.16749700903892517, "loss_lvr": 0.4480276107788086, "loss_mode_switch": 0.0, "loss_total": 0.21229976415634155, "step": 2216 }, { "epoch": 0.8868, "grad_norm": 1.2903008460998535, "learning_rate": 3.3229133906600706e-07, "loss": 0.2714, "step": 2217 }, { "batch_size": 4, "epoch": 0.8868, "step": 2217, "tokens_per_device": 9840 }, { "epoch": 0.8868, "loss_ce": 1.1722252368927002, "loss_lvr": 0.3135049045085907, "loss_mode_switch": 0.0, "loss_total": 1.2035757303237915, "step": 2217 }, { "batch_size": 4, "epoch": 0.8868, "step": 2217, "tokens_per_device": 2800 }, { "epoch": 0.8868, "loss_ce": 0.4414639174938202, "loss_lvr": 0.6337162256240845, "loss_mode_switch": 0.0, "loss_total": 0.5048355460166931, "step": 2217 }, { "batch_size": 4, "epoch": 0.8868, "step": 2217, "tokens_per_device": 5248 }, { "epoch": 0.8868, "loss_ce": 0.1882752776145935, "loss_lvr": 0.6646931171417236, "loss_mode_switch": 0.0, "loss_total": 0.25474458932876587, "step": 2217 }, { "batch_size": 1, "epoch": 0.8868, "step": 2217, "tokens_per_device": 5138 }, { "epoch": 0.8868, "loss_ce": 0.009929899126291275, "loss_lvr": 0.46412378549575806, "loss_mode_switch": 0.0, "loss_total": 0.05634227767586708, "step": 2217 }, { "batch_size": 1, "epoch": 0.8868, "step": 2217, "tokens_per_device": 4896 }, { "epoch": 0.8868, "loss_ce": 0.00620607566088438, "loss_lvr": 0.26036426424980164, "loss_mode_switch": 0.0, "loss_total": 0.03224250301718712, "step": 2217 }, { "batch_size": 4, "epoch": 0.8868, "step": 2217, "tokens_per_device": 5248 }, { "epoch": 0.8868, "loss_ce": 0.41425180435180664, "loss_lvr": 0.7289758920669556, "loss_mode_switch": 0.0, "loss_total": 0.4871493875980377, "step": 2217 }, { "batch_size": 4, "epoch": 0.8868, "step": 2217, "tokens_per_device": 4464 }, { "epoch": 0.8868, "loss_ce": 0.11045501381158829, "loss_lvr": 0.6577491164207458, "loss_mode_switch": 0.0, "loss_total": 0.17622992396354675, "step": 2217 }, { "batch_size": 4, "epoch": 0.8868, "step": 2217, "tokens_per_device": 1576 }, { "epoch": 0.8868, "loss_ce": 0.9042865633964539, "loss_lvr": 1.1460436582565308, "loss_mode_switch": 0.0, "loss_total": 1.0188909769058228, "step": 2217 }, { "epoch": 0.8872, "grad_norm": 1.18754243850708, "learning_rate": 3.299732716401166e-07, "loss": 0.2659, "step": 2218 }, { "batch_size": 4, "epoch": 0.8872, "step": 2218, "tokens_per_device": 4352 }, { "epoch": 0.8872, "loss_ce": 0.44319242238998413, "loss_lvr": 1.014640212059021, "loss_mode_switch": 0.0, "loss_total": 0.5446564555168152, "step": 2218 }, { "batch_size": 4, "epoch": 0.8872, "step": 2218, "tokens_per_device": 15736 }, { "epoch": 0.8872, "loss_ce": 0.7019380331039429, "loss_lvr": 0.7236812114715576, "loss_mode_switch": 0.0, "loss_total": 0.7743061780929565, "step": 2218 }, { "batch_size": 4, "epoch": 0.8872, "step": 2218, "tokens_per_device": 1468 }, { "epoch": 0.8872, "loss_ce": 0.493130087852478, "loss_lvr": 0.8926968574523926, "loss_mode_switch": 0.0, "loss_total": 0.5823997855186462, "step": 2218 }, { "batch_size": 4, "epoch": 0.8872, "step": 2218, "tokens_per_device": 12644 }, { "epoch": 0.8872, "loss_ce": 0.041795291006565094, "loss_lvr": 0.5406524538993835, "loss_mode_switch": 0.0, "loss_total": 0.0958605408668518, "step": 2218 }, { "batch_size": 1, "epoch": 0.8872, "step": 2218, "tokens_per_device": 4853 }, { "epoch": 0.8872, "loss_ce": 0.09812469780445099, "loss_lvr": 0.20978356897830963, "loss_mode_switch": 0.0, "loss_total": 0.11910305917263031, "step": 2218 }, { "batch_size": 4, "epoch": 0.8872, "step": 2218, "tokens_per_device": 4392 }, { "epoch": 0.8872, "loss_ce": 0.40072643756866455, "loss_lvr": 0.6248190999031067, "loss_mode_switch": 0.0, "loss_total": 0.4632083475589752, "step": 2218 }, { "batch_size": 1, "epoch": 0.8872, "step": 2218, "tokens_per_device": 4953 }, { "epoch": 0.8872, "loss_ce": 0.20113235712051392, "loss_lvr": 0.7608924508094788, "loss_mode_switch": 0.0, "loss_total": 0.2772216200828552, "step": 2218 }, { "batch_size": 4, "epoch": 0.8872, "step": 2218, "tokens_per_device": 1536 }, { "epoch": 0.8872, "loss_ce": 0.575435996055603, "loss_lvr": 0.8556473851203918, "loss_mode_switch": 0.0, "loss_total": 0.6610007286071777, "step": 2218 }, { "epoch": 0.8876, "grad_norm": 1.4383642673492432, "learning_rate": 3.276630420392707e-07, "loss": 0.3353, "step": 2219 }, { "batch_size": 4, "epoch": 0.8876, "step": 2219, "tokens_per_device": 4716 }, { "epoch": 0.8876, "loss_ce": 0.23937071859836578, "loss_lvr": 0.7195183038711548, "loss_mode_switch": 0.0, "loss_total": 0.31132254004478455, "step": 2219 }, { "batch_size": 1, "epoch": 0.8876, "step": 2219, "tokens_per_device": 4872 }, { "epoch": 0.8876, "loss_ce": 0.006128246430307627, "loss_lvr": 0.1846555471420288, "loss_mode_switch": 0.0, "loss_total": 0.02459380030632019, "step": 2219 }, { "batch_size": 4, "epoch": 0.8876, "step": 2219, "tokens_per_device": 2668 }, { "epoch": 0.8876, "loss_ce": 0.1836187094449997, "loss_lvr": 0.7611366510391235, "loss_mode_switch": 0.0, "loss_total": 0.25973236560821533, "step": 2219 }, { "batch_size": 4, "epoch": 0.8876, "step": 2219, "tokens_per_device": 3800 }, { "epoch": 0.8876, "loss_ce": 0.27727609872817993, "loss_lvr": 1.1367229223251343, "loss_mode_switch": 0.0, "loss_total": 0.3909483850002289, "step": 2219 }, { "batch_size": 4, "epoch": 0.8876, "step": 2219, "tokens_per_device": 4216 }, { "epoch": 0.8876, "loss_ce": 0.10564562678337097, "loss_lvr": 0.6661562323570251, "loss_mode_switch": 0.0, "loss_total": 0.17226125299930573, "step": 2219 }, { "batch_size": 1, "epoch": 0.8876, "step": 2219, "tokens_per_device": 4998 }, { "epoch": 0.8876, "loss_ce": 0.21408186852931976, "loss_lvr": 0.2934315502643585, "loss_mode_switch": 0.0, "loss_total": 0.24342502653598785, "step": 2219 }, { "batch_size": 4, "epoch": 0.8876, "step": 2219, "tokens_per_device": 4524 }, { "epoch": 0.8876, "loss_ce": 0.28536826372146606, "loss_lvr": 0.831912636756897, "loss_mode_switch": 0.0, "loss_total": 0.3685595393180847, "step": 2219 }, { "batch_size": 1, "epoch": 0.8876, "step": 2219, "tokens_per_device": 5032 }, { "epoch": 0.8876, "loss_ce": 0.08469833433628082, "loss_lvr": 0.37488222122192383, "loss_mode_switch": 0.0, "loss_total": 0.1221865564584732, "step": 2219 }, { "epoch": 0.888, "grad_norm": 1.4236209392547607, "learning_rate": 3.2536065414078724e-07, "loss": 0.2625, "step": 2220 }, { "batch_size": 4, "epoch": 0.888, "step": 2220, "tokens_per_device": 1372 }, { "epoch": 0.888, "loss_ce": 0.29543283581733704, "loss_lvr": 1.291809320449829, "loss_mode_switch": 0.0, "loss_total": 0.4246137738227844, "step": 2220 }, { "batch_size": 1, "epoch": 0.888, "step": 2220, "tokens_per_device": 4900 }, { "epoch": 0.888, "loss_ce": 0.004204518161714077, "loss_lvr": 0.37019720673561096, "loss_mode_switch": 0.0, "loss_total": 0.04122424125671387, "step": 2220 }, { "batch_size": 1, "epoch": 0.888, "step": 2220, "tokens_per_device": 4889 }, { "epoch": 0.888, "loss_ce": 0.00905173271894455, "loss_lvr": 0.27807220816612244, "loss_mode_switch": 0.0, "loss_total": 0.03685895353555679, "step": 2220 }, { "batch_size": 4, "epoch": 0.888, "step": 2220, "tokens_per_device": 2696 }, { "epoch": 0.888, "loss_ce": 0.11417246609926224, "loss_lvr": 0.5489609241485596, "loss_mode_switch": 0.0, "loss_total": 0.16906856000423431, "step": 2220 }, { "batch_size": 4, "epoch": 0.888, "step": 2220, "tokens_per_device": 6312 }, { "epoch": 0.888, "loss_ce": 0.31068703532218933, "loss_lvr": 0.7049410343170166, "loss_mode_switch": 0.0, "loss_total": 0.38118115067481995, "step": 2220 }, { "batch_size": 1, "epoch": 0.888, "step": 2220, "tokens_per_device": 5792 }, { "epoch": 0.888, "loss_ce": 0.008620000444352627, "loss_lvr": 0.26141926646232605, "loss_mode_switch": 0.0, "loss_total": 0.034761928021907806, "step": 2220 }, { "batch_size": 4, "epoch": 0.888, "step": 2220, "tokens_per_device": 2296 }, { "epoch": 0.888, "loss_ce": 0.11030871421098709, "loss_lvr": 0.902079701423645, "loss_mode_switch": 0.0, "loss_total": 0.2005166858434677, "step": 2220 }, { "batch_size": 4, "epoch": 0.888, "step": 2220, "tokens_per_device": 4372 }, { "epoch": 0.888, "loss_ce": 0.0678364560008049, "loss_lvr": 0.8373032808303833, "loss_mode_switch": 0.0, "loss_total": 0.1515667885541916, "step": 2220 }, { "epoch": 0.8884, "grad_norm": 1.486922025680542, "learning_rate": 3.230661118088219e-07, "loss": 0.3822, "step": 2221 }, { "batch_size": 1, "epoch": 0.8884, "step": 2221, "tokens_per_device": 4890 }, { "epoch": 0.8884, "loss_ce": 0.05920187756419182, "loss_lvr": 0.2214909791946411, "loss_mode_switch": 0.0, "loss_total": 0.08135097473859787, "step": 2221 }, { "batch_size": 4, "epoch": 0.8884, "step": 2221, "tokens_per_device": 4284 }, { "epoch": 0.8884, "loss_ce": 0.22524699568748474, "loss_lvr": 0.5114620327949524, "loss_mode_switch": 0.0, "loss_total": 0.27639320492744446, "step": 2221 }, { "batch_size": 1, "epoch": 0.8884, "step": 2221, "tokens_per_device": 4853 }, { "epoch": 0.8884, "loss_ce": 0.006729394197463989, "loss_lvr": 0.18321263790130615, "loss_mode_switch": 0.0, "loss_total": 0.025050658732652664, "step": 2221 }, { "batch_size": 1, "epoch": 0.8884, "step": 2221, "tokens_per_device": 5162 }, { "epoch": 0.8884, "loss_ce": 0.1752186119556427, "loss_lvr": 0.3387279510498047, "loss_mode_switch": 0.0, "loss_total": 0.2090914100408554, "step": 2221 }, { "batch_size": 4, "epoch": 0.8884, "step": 2221, "tokens_per_device": 4204 }, { "epoch": 0.8884, "loss_ce": 0.2723519802093506, "loss_lvr": 0.9846445918083191, "loss_mode_switch": 0.0, "loss_total": 0.3708164393901825, "step": 2221 }, { "batch_size": 4, "epoch": 0.8884, "step": 2221, "tokens_per_device": 5912 }, { "epoch": 0.8884, "loss_ce": 0.5736541152000427, "loss_lvr": 0.7419851422309875, "loss_mode_switch": 0.0, "loss_total": 0.6478526592254639, "step": 2221 }, { "batch_size": 4, "epoch": 0.8884, "step": 2221, "tokens_per_device": 2516 }, { "epoch": 0.8884, "loss_ce": 0.15103642642498016, "loss_lvr": 0.8807527422904968, "loss_mode_switch": 0.0, "loss_total": 0.23911169171333313, "step": 2221 }, { "batch_size": 4, "epoch": 0.8884, "step": 2221, "tokens_per_device": 4824 }, { "epoch": 0.8884, "loss_ce": 0.662628710269928, "loss_lvr": 0.6558147668838501, "loss_mode_switch": 0.0, "loss_total": 0.7282102108001709, "step": 2221 }, { "epoch": 0.8888, "grad_norm": 1.366957426071167, "learning_rate": 3.2077941889436525e-07, "loss": 0.278, "step": 2222 }, { "batch_size": 4, "epoch": 0.8888, "step": 2222, "tokens_per_device": 3824 }, { "epoch": 0.8888, "loss_ce": 0.31081318855285645, "loss_lvr": 0.9566770792007446, "loss_mode_switch": 0.0, "loss_total": 0.40648090839385986, "step": 2222 }, { "batch_size": 4, "epoch": 0.8888, "step": 2222, "tokens_per_device": 5356 }, { "epoch": 0.8888, "loss_ce": 0.3341692388057709, "loss_lvr": 0.5236572623252869, "loss_mode_switch": 0.0, "loss_total": 0.3865349590778351, "step": 2222 }, { "batch_size": 4, "epoch": 0.8888, "step": 2222, "tokens_per_device": 5668 }, { "epoch": 0.8888, "loss_ce": 0.14636799693107605, "loss_lvr": 0.6805379986763, "loss_mode_switch": 0.0, "loss_total": 0.214421808719635, "step": 2222 }, { "batch_size": 4, "epoch": 0.8888, "step": 2222, "tokens_per_device": 4260 }, { "epoch": 0.8888, "loss_ce": 0.06881513446569443, "loss_lvr": 0.8497422337532043, "loss_mode_switch": 0.0, "loss_total": 0.15378935635089874, "step": 2222 }, { "batch_size": 4, "epoch": 0.8888, "step": 2222, "tokens_per_device": 1792 }, { "epoch": 0.8888, "loss_ce": 0.4449823200702667, "loss_lvr": 1.1586216688156128, "loss_mode_switch": 0.0, "loss_total": 0.5608444809913635, "step": 2222 }, { "batch_size": 4, "epoch": 0.8888, "step": 2222, "tokens_per_device": 4196 }, { "epoch": 0.8888, "loss_ce": 0.32284703850746155, "loss_lvr": 0.7341179251670837, "loss_mode_switch": 0.0, "loss_total": 0.3962588310241699, "step": 2222 }, { "batch_size": 4, "epoch": 0.8888, "step": 2222, "tokens_per_device": 5116 }, { "epoch": 0.8888, "loss_ce": 0.2236787974834442, "loss_lvr": 0.8591000437736511, "loss_mode_switch": 0.0, "loss_total": 0.30958878993988037, "step": 2222 }, { "batch_size": 4, "epoch": 0.8888, "step": 2222, "tokens_per_device": 4364 }, { "epoch": 0.8888, "loss_ce": 0.04102184623479843, "loss_lvr": 0.7865836024284363, "loss_mode_switch": 0.0, "loss_total": 0.11968021094799042, "step": 2222 }, { "epoch": 0.8892, "grad_norm": 1.3271231651306152, "learning_rate": 3.185005792352308e-07, "loss": 0.2761, "step": 2223 }, { "batch_size": 4, "epoch": 0.8892, "step": 2223, "tokens_per_device": 4408 }, { "epoch": 0.8892, "loss_ce": 0.19993211328983307, "loss_lvr": 0.8121250867843628, "loss_mode_switch": 0.0, "loss_total": 0.2811446189880371, "step": 2223 }, { "batch_size": 1, "epoch": 0.8892, "step": 2223, "tokens_per_device": 5184 }, { "epoch": 0.8892, "loss_ce": 0.07137610763311386, "loss_lvr": 0.5759198069572449, "loss_mode_switch": 0.0, "loss_total": 0.12896808981895447, "step": 2223 }, { "batch_size": 1, "epoch": 0.8892, "step": 2223, "tokens_per_device": 5098 }, { "epoch": 0.8892, "loss_ce": 0.002763236640021205, "loss_lvr": 0.38139981031417847, "loss_mode_switch": 0.0, "loss_total": 0.04090321809053421, "step": 2223 }, { "batch_size": 4, "epoch": 0.8892, "step": 2223, "tokens_per_device": 9544 }, { "epoch": 0.8892, "loss_ce": 0.0107036167755723, "loss_lvr": 0.5534559488296509, "loss_mode_switch": 0.0, "loss_total": 0.06604921072721481, "step": 2223 }, { "batch_size": 4, "epoch": 0.8892, "step": 2223, "tokens_per_device": 9184 }, { "epoch": 0.8892, "loss_ce": 0.09256210178136826, "loss_lvr": 1.0286519527435303, "loss_mode_switch": 0.0, "loss_total": 0.1954272985458374, "step": 2223 }, { "batch_size": 4, "epoch": 0.8892, "step": 2223, "tokens_per_device": 4772 }, { "epoch": 0.8892, "loss_ce": 0.22224585711956024, "loss_lvr": 1.1032421588897705, "loss_mode_switch": 0.0, "loss_total": 0.33257007598876953, "step": 2223 }, { "batch_size": 4, "epoch": 0.8892, "step": 2223, "tokens_per_device": 4500 }, { "epoch": 0.8892, "loss_ce": 0.38977357745170593, "loss_lvr": 0.764102578163147, "loss_mode_switch": 0.0, "loss_total": 0.4661838412284851, "step": 2223 }, { "batch_size": 4, "epoch": 0.8892, "step": 2223, "tokens_per_device": 3700 }, { "epoch": 0.8892, "loss_ce": 0.07828796654939651, "loss_lvr": 0.987675666809082, "loss_mode_switch": 0.0, "loss_total": 0.17705553770065308, "step": 2223 }, { "epoch": 0.8896, "grad_norm": 1.4372142553329468, "learning_rate": 3.162295966560536e-07, "loss": 0.2925, "step": 2224 }, { "batch_size": 4, "epoch": 0.8896, "step": 2224, "tokens_per_device": 1368 }, { "epoch": 0.8896, "loss_ce": 0.5595308542251587, "loss_lvr": 0.923213541507721, "loss_mode_switch": 0.0, "loss_total": 0.6518521904945374, "step": 2224 }, { "batch_size": 1, "epoch": 0.8896, "step": 2224, "tokens_per_device": 5140 }, { "epoch": 0.8896, "loss_ce": 0.3275534510612488, "loss_lvr": 0.3863034248352051, "loss_mode_switch": 0.0, "loss_total": 0.3661837875843048, "step": 2224 }, { "batch_size": 4, "epoch": 0.8896, "step": 2224, "tokens_per_device": 5220 }, { "epoch": 0.8896, "loss_ce": 0.15237361192703247, "loss_lvr": 0.6437183618545532, "loss_mode_switch": 0.0, "loss_total": 0.21674545109272003, "step": 2224 }, { "batch_size": 4, "epoch": 0.8896, "step": 2224, "tokens_per_device": 4336 }, { "epoch": 0.8896, "loss_ce": 0.19027505815029144, "loss_lvr": 0.7445095181465149, "loss_mode_switch": 0.0, "loss_total": 0.26472601294517517, "step": 2224 }, { "batch_size": 1, "epoch": 0.8896, "step": 2224, "tokens_per_device": 5119 }, { "epoch": 0.8896, "loss_ce": 0.1949905902147293, "loss_lvr": 0.45715880393981934, "loss_mode_switch": 0.0, "loss_total": 0.24070647358894348, "step": 2224 }, { "batch_size": 4, "epoch": 0.8896, "step": 2224, "tokens_per_device": 11032 }, { "epoch": 0.8896, "loss_ce": 0.008333130739629269, "loss_lvr": 0.4627218544483185, "loss_mode_switch": 0.0, "loss_total": 0.05460531637072563, "step": 2224 }, { "batch_size": 1, "epoch": 0.8896, "step": 2224, "tokens_per_device": 5115 }, { "epoch": 0.8896, "loss_ce": 0.06362393498420715, "loss_lvr": 0.37752223014831543, "loss_mode_switch": 0.0, "loss_total": 0.10137616097927094, "step": 2224 }, { "batch_size": 4, "epoch": 0.8896, "step": 2224, "tokens_per_device": 5044 }, { "epoch": 0.8896, "loss_ce": 0.3503612279891968, "loss_lvr": 1.0219089984893799, "loss_mode_switch": 0.0, "loss_total": 0.4525521397590637, "step": 2224 }, { "epoch": 0.89, "grad_norm": 1.316572666168213, "learning_rate": 3.1396647496828245e-07, "loss": 0.2572, "step": 2225 }, { "batch_size": 4, "epoch": 0.89, "step": 2225, "tokens_per_device": 5788 }, { "epoch": 0.89, "loss_ce": 0.04627992585301399, "loss_lvr": 0.6788614988327026, "loss_mode_switch": 0.0, "loss_total": 0.11416608095169067, "step": 2225 }, { "batch_size": 4, "epoch": 0.89, "step": 2225, "tokens_per_device": 3812 }, { "epoch": 0.89, "loss_ce": 0.04990515112876892, "loss_lvr": 0.9039450287818909, "loss_mode_switch": 0.0, "loss_total": 0.14029964804649353, "step": 2225 }, { "batch_size": 4, "epoch": 0.89, "step": 2225, "tokens_per_device": 4396 }, { "epoch": 0.89, "loss_ce": 0.41144275665283203, "loss_lvr": 0.7118567824363708, "loss_mode_switch": 0.0, "loss_total": 0.4826284348964691, "step": 2225 }, { "batch_size": 4, "epoch": 0.89, "step": 2225, "tokens_per_device": 2680 }, { "epoch": 0.89, "loss_ce": 0.27072229981422424, "loss_lvr": 0.6619265675544739, "loss_mode_switch": 0.0, "loss_total": 0.33691495656967163, "step": 2225 }, { "batch_size": 4, "epoch": 0.89, "step": 2225, "tokens_per_device": 4296 }, { "epoch": 0.89, "loss_ce": 0.40593039989471436, "loss_lvr": 0.8582597374916077, "loss_mode_switch": 0.0, "loss_total": 0.4917563796043396, "step": 2225 }, { "batch_size": 4, "epoch": 0.89, "step": 2225, "tokens_per_device": 4480 }, { "epoch": 0.89, "loss_ce": 0.1994168609380722, "loss_lvr": 0.9320072531700134, "loss_mode_switch": 0.0, "loss_total": 0.2926175892353058, "step": 2225 }, { "batch_size": 4, "epoch": 0.89, "step": 2225, "tokens_per_device": 7308 }, { "epoch": 0.89, "loss_ce": 0.04082006961107254, "loss_lvr": 0.7012849450111389, "loss_mode_switch": 0.0, "loss_total": 0.11094856262207031, "step": 2225 }, { "batch_size": 4, "epoch": 0.89, "step": 2225, "tokens_per_device": 9348 }, { "epoch": 0.89, "loss_ce": 0.1656980663537979, "loss_lvr": 0.6454651951789856, "loss_mode_switch": 0.0, "loss_total": 0.23024457693099976, "step": 2225 }, { "epoch": 0.8904, "grad_norm": 1.3476577997207642, "learning_rate": 3.1171121797017036e-07, "loss": 0.3001, "step": 2226 }, { "batch_size": 4, "epoch": 0.8904, "step": 2226, "tokens_per_device": 6456 }, { "epoch": 0.8904, "loss_ce": 0.14603173732757568, "loss_lvr": 0.8059248924255371, "loss_mode_switch": 0.0, "loss_total": 0.22662422060966492, "step": 2226 }, { "batch_size": 4, "epoch": 0.8904, "step": 2226, "tokens_per_device": 5116 }, { "epoch": 0.8904, "loss_ce": 0.21135437488555908, "loss_lvr": 0.8197083473205566, "loss_mode_switch": 0.0, "loss_total": 0.2933252155780792, "step": 2226 }, { "batch_size": 1, "epoch": 0.8904, "step": 2226, "tokens_per_device": 5181 }, { "epoch": 0.8904, "loss_ce": 0.01415682677179575, "loss_lvr": 0.5483522415161133, "loss_mode_switch": 0.0, "loss_total": 0.06899205595254898, "step": 2226 }, { "batch_size": 4, "epoch": 0.8904, "step": 2226, "tokens_per_device": 3548 }, { "epoch": 0.8904, "loss_ce": 0.36630117893218994, "loss_lvr": 0.9142158627510071, "loss_mode_switch": 0.0, "loss_total": 0.4577227830886841, "step": 2226 }, { "batch_size": 1, "epoch": 0.8904, "step": 2226, "tokens_per_device": 4859 }, { "epoch": 0.8904, "loss_ce": 0.003276906907558441, "loss_lvr": 0.3207676410675049, "loss_mode_switch": 0.0, "loss_total": 0.03535367175936699, "step": 2226 }, { "batch_size": 1, "epoch": 0.8904, "step": 2226, "tokens_per_device": 4965 }, { "epoch": 0.8904, "loss_ce": 0.02465163543820381, "loss_lvr": 0.23477782309055328, "loss_mode_switch": 0.0, "loss_total": 0.04812941700220108, "step": 2226 }, { "batch_size": 4, "epoch": 0.8904, "step": 2226, "tokens_per_device": 2072 }, { "epoch": 0.8904, "loss_ce": 0.4398050606250763, "loss_lvr": 1.0340402126312256, "loss_mode_switch": 0.0, "loss_total": 0.5432090759277344, "step": 2226 }, { "batch_size": 1, "epoch": 0.8904, "step": 2226, "tokens_per_device": 5178 }, { "epoch": 0.8904, "loss_ce": 0.18659348785877228, "loss_lvr": 0.23282037675380707, "loss_mode_switch": 0.0, "loss_total": 0.20987552404403687, "step": 2226 }, { "epoch": 0.8908, "grad_norm": 1.8887012004852295, "learning_rate": 3.09463829446775e-07, "loss": 0.2553, "step": 2227 }, { "batch_size": 4, "epoch": 0.8908, "step": 2227, "tokens_per_device": 4536 }, { "epoch": 0.8908, "loss_ce": 0.5256268978118896, "loss_lvr": 0.47376108169555664, "loss_mode_switch": 0.0, "loss_total": 0.5730029940605164, "step": 2227 }, { "batch_size": 4, "epoch": 0.8908, "step": 2227, "tokens_per_device": 2320 }, { "epoch": 0.8908, "loss_ce": 0.12937785685062408, "loss_lvr": 0.8274838328361511, "loss_mode_switch": 0.0, "loss_total": 0.2121262401342392, "step": 2227 }, { "batch_size": 4, "epoch": 0.8908, "step": 2227, "tokens_per_device": 5732 }, { "epoch": 0.8908, "loss_ce": 0.005512019619345665, "loss_lvr": 0.9168698191642761, "loss_mode_switch": 0.0, "loss_total": 0.09719900786876678, "step": 2227 }, { "batch_size": 1, "epoch": 0.8908, "step": 2227, "tokens_per_device": 5161 }, { "epoch": 0.8908, "loss_ce": 0.024846281856298447, "loss_lvr": 0.22560469806194305, "loss_mode_switch": 0.0, "loss_total": 0.04740675166249275, "step": 2227 }, { "batch_size": 4, "epoch": 0.8908, "step": 2227, "tokens_per_device": 3940 }, { "epoch": 0.8908, "loss_ce": 0.21498680114746094, "loss_lvr": 0.8970327973365784, "loss_mode_switch": 0.0, "loss_total": 0.30469009280204773, "step": 2227 }, { "batch_size": 4, "epoch": 0.8908, "step": 2227, "tokens_per_device": 5256 }, { "epoch": 0.8908, "loss_ce": 0.056169554591178894, "loss_lvr": 0.7685341835021973, "loss_mode_switch": 0.0, "loss_total": 0.1330229640007019, "step": 2227 }, { "batch_size": 4, "epoch": 0.8908, "step": 2227, "tokens_per_device": 12560 }, { "epoch": 0.8908, "loss_ce": 0.19866496324539185, "loss_lvr": 0.8321434259414673, "loss_mode_switch": 0.0, "loss_total": 0.2818793058395386, "step": 2227 }, { "batch_size": 1, "epoch": 0.8908, "step": 2227, "tokens_per_device": 5105 }, { "epoch": 0.8908, "loss_ce": 0.00031184195540845394, "loss_lvr": 0.6177175641059875, "loss_mode_switch": 0.0, "loss_total": 0.06208359822630882, "step": 2227 }, { "epoch": 0.8912, "grad_norm": 1.3378221988677979, "learning_rate": 3.072243131699443e-07, "loss": 0.2664, "step": 2228 }, { "batch_size": 4, "epoch": 0.8912, "step": 2228, "tokens_per_device": 4848 }, { "epoch": 0.8912, "loss_ce": 0.3772691786289215, "loss_lvr": 0.7831376194953918, "loss_mode_switch": 0.0, "loss_total": 0.45558294653892517, "step": 2228 }, { "batch_size": 4, "epoch": 0.8912, "step": 2228, "tokens_per_device": 4332 }, { "epoch": 0.8912, "loss_ce": 0.7410682439804077, "loss_lvr": 0.7681623697280884, "loss_mode_switch": 0.0, "loss_total": 0.8178845047950745, "step": 2228 }, { "batch_size": 4, "epoch": 0.8912, "step": 2228, "tokens_per_device": 5316 }, { "epoch": 0.8912, "loss_ce": 0.14044320583343506, "loss_lvr": 0.8016508221626282, "loss_mode_switch": 0.0, "loss_total": 0.22060829401016235, "step": 2228 }, { "batch_size": 1, "epoch": 0.8912, "step": 2228, "tokens_per_device": 4865 }, { "epoch": 0.8912, "loss_ce": 0.005043913144618273, "loss_lvr": 0.3067792057991028, "loss_mode_switch": 0.0, "loss_total": 0.03572183474898338, "step": 2228 }, { "batch_size": 1, "epoch": 0.8912, "step": 2228, "tokens_per_device": 5136 }, { "epoch": 0.8912, "loss_ce": 0.0005292452988214791, "loss_lvr": 0.37381511926651, "loss_mode_switch": 0.0, "loss_total": 0.03791075572371483, "step": 2228 }, { "batch_size": 4, "epoch": 0.8912, "step": 2228, "tokens_per_device": 6264 }, { "epoch": 0.8912, "loss_ce": 0.05083627626299858, "loss_lvr": 0.7754501104354858, "loss_mode_switch": 0.0, "loss_total": 0.12838128209114075, "step": 2228 }, { "batch_size": 4, "epoch": 0.8912, "step": 2228, "tokens_per_device": 1484 }, { "epoch": 0.8912, "loss_ce": 0.40935656428337097, "loss_lvr": 0.995959997177124, "loss_mode_switch": 0.0, "loss_total": 0.5089525580406189, "step": 2228 }, { "batch_size": 1, "epoch": 0.8912, "step": 2228, "tokens_per_device": 5133 }, { "epoch": 0.8912, "loss_ce": 0.038285110145807266, "loss_lvr": 0.28590625524520874, "loss_mode_switch": 0.0, "loss_total": 0.06687573343515396, "step": 2228 }, { "epoch": 0.8916, "grad_norm": 1.3484621047973633, "learning_rate": 3.049926728983171e-07, "loss": 0.2772, "step": 2229 }, { "batch_size": 1, "epoch": 0.8916, "step": 2229, "tokens_per_device": 4921 }, { "epoch": 0.8916, "loss_ce": 0.005264509469270706, "loss_lvr": 0.6374448537826538, "loss_mode_switch": 0.0, "loss_total": 0.06900899112224579, "step": 2229 }, { "batch_size": 4, "epoch": 0.8916, "step": 2229, "tokens_per_device": 1360 }, { "epoch": 0.8916, "loss_ce": 0.23145346343517303, "loss_lvr": 0.6943663358688354, "loss_mode_switch": 0.0, "loss_total": 0.30089008808135986, "step": 2229 }, { "batch_size": 4, "epoch": 0.8916, "step": 2229, "tokens_per_device": 3364 }, { "epoch": 0.8916, "loss_ce": 0.04581877216696739, "loss_lvr": 1.286097764968872, "loss_mode_switch": 0.0, "loss_total": 0.1744285523891449, "step": 2229 }, { "batch_size": 4, "epoch": 0.8916, "step": 2229, "tokens_per_device": 5964 }, { "epoch": 0.8916, "loss_ce": 0.4042760729789734, "loss_lvr": 0.7699329257011414, "loss_mode_switch": 0.0, "loss_total": 0.48126935958862305, "step": 2229 }, { "batch_size": 4, "epoch": 0.8916, "step": 2229, "tokens_per_device": 5676 }, { "epoch": 0.8916, "loss_ce": 0.19606146216392517, "loss_lvr": 0.8145152926445007, "loss_mode_switch": 0.0, "loss_total": 0.2775129973888397, "step": 2229 }, { "batch_size": 4, "epoch": 0.8916, "step": 2229, "tokens_per_device": 4228 }, { "epoch": 0.8916, "loss_ce": 0.43564552068710327, "loss_lvr": 0.9214746356010437, "loss_mode_switch": 0.0, "loss_total": 0.5277929902076721, "step": 2229 }, { "batch_size": 1, "epoch": 0.8916, "step": 2229, "tokens_per_device": 5208 }, { "epoch": 0.8916, "loss_ce": 0.07415454089641571, "loss_lvr": 0.518251895904541, "loss_mode_switch": 0.0, "loss_total": 0.1259797364473343, "step": 2229 }, { "batch_size": 4, "epoch": 0.8916, "step": 2229, "tokens_per_device": 8084 }, { "epoch": 0.8916, "loss_ce": 0.024509182199835777, "loss_lvr": 0.6423326730728149, "loss_mode_switch": 0.0, "loss_total": 0.0887424498796463, "step": 2229 }, { "epoch": 0.892, "grad_norm": 1.2588297128677368, "learning_rate": 3.0276891237731085e-07, "loss": 0.2677, "step": 2230 }, { "batch_size": 4, "epoch": 0.892, "step": 2230, "tokens_per_device": 4752 }, { "epoch": 0.892, "loss_ce": 0.15106140077114105, "loss_lvr": 0.8129048347473145, "loss_mode_switch": 0.0, "loss_total": 0.2323518842458725, "step": 2230 }, { "batch_size": 4, "epoch": 0.892, "step": 2230, "tokens_per_device": 9964 }, { "epoch": 0.892, "loss_ce": 0.03526443615555763, "loss_lvr": 0.9143837690353394, "loss_mode_switch": 0.0, "loss_total": 0.12670281529426575, "step": 2230 }, { "batch_size": 4, "epoch": 0.892, "step": 2230, "tokens_per_device": 4376 }, { "epoch": 0.892, "loss_ce": 0.1288042515516281, "loss_lvr": 0.7154298424720764, "loss_mode_switch": 0.0, "loss_total": 0.20034724473953247, "step": 2230 }, { "batch_size": 4, "epoch": 0.892, "step": 2230, "tokens_per_device": 4640 }, { "epoch": 0.892, "loss_ce": 0.37762191891670227, "loss_lvr": 0.4352011978626251, "loss_mode_switch": 0.0, "loss_total": 0.421142041683197, "step": 2230 }, { "batch_size": 1, "epoch": 0.892, "step": 2230, "tokens_per_device": 4770 }, { "epoch": 0.892, "loss_ce": 0.014432573691010475, "loss_lvr": 0.35929247736930847, "loss_mode_switch": 0.0, "loss_total": 0.05036181956529617, "step": 2230 }, { "batch_size": 4, "epoch": 0.892, "step": 2230, "tokens_per_device": 4448 }, { "epoch": 0.892, "loss_ce": 0.017677972093224525, "loss_lvr": 0.6828150749206543, "loss_mode_switch": 0.0, "loss_total": 0.08595947921276093, "step": 2230 }, { "batch_size": 4, "epoch": 0.892, "step": 2230, "tokens_per_device": 1400 }, { "epoch": 0.892, "loss_ce": 0.4509495496749878, "loss_lvr": 0.8793050646781921, "loss_mode_switch": 0.0, "loss_total": 0.5388800501823425, "step": 2230 }, { "batch_size": 1, "epoch": 0.892, "step": 2230, "tokens_per_device": 4366 }, { "epoch": 0.892, "loss_ce": 0.007461595349013805, "loss_lvr": 0.2517460882663727, "loss_mode_switch": 0.0, "loss_total": 0.03263620287179947, "step": 2230 }, { "epoch": 0.8924, "grad_norm": 1.2392542362213135, "learning_rate": 3.005530353391195e-07, "loss": 0.2473, "step": 2231 }, { "batch_size": 1, "epoch": 0.8924, "step": 2231, "tokens_per_device": 5175 }, { "epoch": 0.8924, "loss_ce": 0.17248521745204926, "loss_lvr": 0.3628411889076233, "loss_mode_switch": 0.0, "loss_total": 0.20876933634281158, "step": 2231 }, { "batch_size": 4, "epoch": 0.8924, "step": 2231, "tokens_per_device": 11636 }, { "epoch": 0.8924, "loss_ce": 0.40799248218536377, "loss_lvr": 0.8810415863990784, "loss_mode_switch": 0.0, "loss_total": 0.4960966408252716, "step": 2231 }, { "batch_size": 4, "epoch": 0.8924, "step": 2231, "tokens_per_device": 4356 }, { "epoch": 0.8924, "loss_ce": 0.14853596687316895, "loss_lvr": 0.9834654927253723, "loss_mode_switch": 0.0, "loss_total": 0.24688252806663513, "step": 2231 }, { "batch_size": 4, "epoch": 0.8924, "step": 2231, "tokens_per_device": 4328 }, { "epoch": 0.8924, "loss_ce": 0.04082340747117996, "loss_lvr": 0.48336946964263916, "loss_mode_switch": 0.0, "loss_total": 0.08916035294532776, "step": 2231 }, { "batch_size": 4, "epoch": 0.8924, "step": 2231, "tokens_per_device": 3940 }, { "epoch": 0.8924, "loss_ce": 0.386322021484375, "loss_lvr": 0.7970524430274963, "loss_mode_switch": 0.0, "loss_total": 0.46602725982666016, "step": 2231 }, { "batch_size": 4, "epoch": 0.8924, "step": 2231, "tokens_per_device": 5232 }, { "epoch": 0.8924, "loss_ce": 0.1224055290222168, "loss_lvr": 0.6400779485702515, "loss_mode_switch": 0.0, "loss_total": 0.18641331791877747, "step": 2231 }, { "batch_size": 1, "epoch": 0.8924, "step": 2231, "tokens_per_device": 5010 }, { "epoch": 0.8924, "loss_ce": 0.0009614236187189817, "loss_lvr": 0.281980037689209, "loss_mode_switch": 0.0, "loss_total": 0.02915942668914795, "step": 2231 }, { "batch_size": 4, "epoch": 0.8924, "step": 2231, "tokens_per_device": 1328 }, { "epoch": 0.8924, "loss_ce": 0.34483760595321655, "loss_lvr": 0.9249675869941711, "loss_mode_switch": 0.0, "loss_total": 0.4373343586921692, "step": 2231 }, { "epoch": 0.8928, "grad_norm": 1.3905314207077026, "learning_rate": 2.9834504550270706e-07, "loss": 0.2841, "step": 2232 }, { "batch_size": 4, "epoch": 0.8928, "step": 2232, "tokens_per_device": 2736 }, { "epoch": 0.8928, "loss_ce": 0.23025649785995483, "loss_lvr": 0.5666300654411316, "loss_mode_switch": 0.0, "loss_total": 0.286919504404068, "step": 2232 }, { "batch_size": 1, "epoch": 0.8928, "step": 2232, "tokens_per_device": 4882 }, { "epoch": 0.8928, "loss_ce": 0.0504731610417366, "loss_lvr": 0.4553898274898529, "loss_mode_switch": 0.0, "loss_total": 0.09601214528083801, "step": 2232 }, { "batch_size": 4, "epoch": 0.8928, "step": 2232, "tokens_per_device": 1480 }, { "epoch": 0.8928, "loss_ce": 0.5136352777481079, "loss_lvr": 0.9074234962463379, "loss_mode_switch": 0.0, "loss_total": 0.6043776273727417, "step": 2232 }, { "batch_size": 4, "epoch": 0.8928, "step": 2232, "tokens_per_device": 3756 }, { "epoch": 0.8928, "loss_ce": 0.02773956023156643, "loss_lvr": 1.6069767475128174, "loss_mode_switch": 0.0, "loss_total": 0.18843723833560944, "step": 2232 }, { "batch_size": 4, "epoch": 0.8928, "step": 2232, "tokens_per_device": 4216 }, { "epoch": 0.8928, "loss_ce": 0.34202006459236145, "loss_lvr": 0.8177820444107056, "loss_mode_switch": 0.0, "loss_total": 0.42379826307296753, "step": 2232 }, { "batch_size": 4, "epoch": 0.8928, "step": 2232, "tokens_per_device": 5784 }, { "epoch": 0.8928, "loss_ce": 0.0031751086935400963, "loss_lvr": 0.837494432926178, "loss_mode_switch": 0.0, "loss_total": 0.08692455291748047, "step": 2232 }, { "batch_size": 1, "epoch": 0.8928, "step": 2232, "tokens_per_device": 5047 }, { "epoch": 0.8928, "loss_ce": 0.030497008934617043, "loss_lvr": 0.23372170329093933, "loss_mode_switch": 0.0, "loss_total": 0.053869180381298065, "step": 2232 }, { "batch_size": 4, "epoch": 0.8928, "step": 2232, "tokens_per_device": 7372 }, { "epoch": 0.8928, "loss_ce": 0.25221702456474304, "loss_lvr": 0.8763445019721985, "loss_mode_switch": 0.0, "loss_total": 0.3398514688014984, "step": 2232 }, { "epoch": 0.8932, "grad_norm": 1.3847378492355347, "learning_rate": 2.9614494657379865e-07, "loss": 0.3074, "step": 2233 }, { "batch_size": 1, "epoch": 0.8932, "step": 2233, "tokens_per_device": 5056 }, { "epoch": 0.8932, "loss_ce": 0.03342607244849205, "loss_lvr": 0.40707361698150635, "loss_mode_switch": 0.0, "loss_total": 0.07413343340158463, "step": 2233 }, { "batch_size": 4, "epoch": 0.8932, "step": 2233, "tokens_per_device": 1644 }, { "epoch": 0.8932, "loss_ce": 0.11888856440782547, "loss_lvr": 0.9314897656440735, "loss_mode_switch": 0.0, "loss_total": 0.21203753352165222, "step": 2233 }, { "batch_size": 1, "epoch": 0.8932, "step": 2233, "tokens_per_device": 4968 }, { "epoch": 0.8932, "loss_ce": 0.10640542209148407, "loss_lvr": 0.4251165986061096, "loss_mode_switch": 0.0, "loss_total": 0.1489170789718628, "step": 2233 }, { "batch_size": 1, "epoch": 0.8932, "step": 2233, "tokens_per_device": 4881 }, { "epoch": 0.8932, "loss_ce": 0.11601327359676361, "loss_lvr": 0.23156684637069702, "loss_mode_switch": 0.0, "loss_total": 0.13916996121406555, "step": 2233 }, { "batch_size": 4, "epoch": 0.8932, "step": 2233, "tokens_per_device": 1524 }, { "epoch": 0.8932, "loss_ce": 0.293131560087204, "loss_lvr": 0.8708972930908203, "loss_mode_switch": 0.0, "loss_total": 0.38022130727767944, "step": 2233 }, { "batch_size": 4, "epoch": 0.8932, "step": 2233, "tokens_per_device": 3760 }, { "epoch": 0.8932, "loss_ce": 0.05422235280275345, "loss_lvr": 0.727716863155365, "loss_mode_switch": 0.0, "loss_total": 0.1269940435886383, "step": 2233 }, { "batch_size": 4, "epoch": 0.8932, "step": 2233, "tokens_per_device": 1928 }, { "epoch": 0.8932, "loss_ce": 0.0211896114051342, "loss_lvr": 0.802446722984314, "loss_mode_switch": 0.0, "loss_total": 0.10143429040908813, "step": 2233 }, { "batch_size": 1, "epoch": 0.8932, "step": 2233, "tokens_per_device": 5849 }, { "epoch": 0.8932, "loss_ce": 0.10179941356182098, "loss_lvr": 0.25624558329582214, "loss_mode_switch": 0.0, "loss_total": 0.1274239718914032, "step": 2233 }, { "epoch": 0.8936, "grad_norm": 1.3293765783309937, "learning_rate": 2.939527422448768e-07, "loss": 0.3115, "step": 2234 }, { "batch_size": 4, "epoch": 0.8936, "step": 2234, "tokens_per_device": 3448 }, { "epoch": 0.8936, "loss_ce": 0.20519287884235382, "loss_lvr": 1.024093747138977, "loss_mode_switch": 0.0, "loss_total": 0.30760225653648376, "step": 2234 }, { "batch_size": 4, "epoch": 0.8936, "step": 2234, "tokens_per_device": 1484 }, { "epoch": 0.8936, "loss_ce": 0.5695539712905884, "loss_lvr": 0.7980917096138, "loss_mode_switch": 0.0, "loss_total": 0.6493631601333618, "step": 2234 }, { "batch_size": 4, "epoch": 0.8936, "step": 2234, "tokens_per_device": 4352 }, { "epoch": 0.8936, "loss_ce": 0.08169244974851608, "loss_lvr": 0.7294716835021973, "loss_mode_switch": 0.0, "loss_total": 0.1546396166086197, "step": 2234 }, { "batch_size": 4, "epoch": 0.8936, "step": 2234, "tokens_per_device": 7028 }, { "epoch": 0.8936, "loss_ce": 0.04441523179411888, "loss_lvr": 0.698384165763855, "loss_mode_switch": 0.0, "loss_total": 0.11425365507602692, "step": 2234 }, { "batch_size": 1, "epoch": 0.8936, "step": 2234, "tokens_per_device": 5078 }, { "epoch": 0.8936, "loss_ce": 0.002159260446205735, "loss_lvr": 0.35323378443717957, "loss_mode_switch": 0.0, "loss_total": 0.03748263791203499, "step": 2234 }, { "batch_size": 4, "epoch": 0.8936, "step": 2234, "tokens_per_device": 2764 }, { "epoch": 0.8936, "loss_ce": 0.10295877605676651, "loss_lvr": 0.7442636489868164, "loss_mode_switch": 0.0, "loss_total": 0.177385151386261, "step": 2234 }, { "batch_size": 1, "epoch": 0.8936, "step": 2234, "tokens_per_device": 5179 }, { "epoch": 0.8936, "loss_ce": 0.04734644293785095, "loss_lvr": 0.17523516714572906, "loss_mode_switch": 0.0, "loss_total": 0.0648699626326561, "step": 2234 }, { "batch_size": 4, "epoch": 0.8936, "step": 2234, "tokens_per_device": 5912 }, { "epoch": 0.8936, "loss_ce": 0.1811087429523468, "loss_lvr": 0.7462419867515564, "loss_mode_switch": 0.0, "loss_total": 0.2557329535484314, "step": 2234 }, { "epoch": 0.894, "grad_norm": 1.3998552560806274, "learning_rate": 2.917684361951728e-07, "loss": 0.2974, "step": 2235 }, { "batch_size": 4, "epoch": 0.894, "step": 2235, "tokens_per_device": 5632 }, { "epoch": 0.894, "loss_ce": 0.005572109948843718, "loss_lvr": 0.8207576870918274, "loss_mode_switch": 0.0, "loss_total": 0.08764787763357162, "step": 2235 }, { "batch_size": 4, "epoch": 0.894, "step": 2235, "tokens_per_device": 4312 }, { "epoch": 0.894, "loss_ce": 0.2109023630619049, "loss_lvr": 0.7632262706756592, "loss_mode_switch": 0.0, "loss_total": 0.28722500801086426, "step": 2235 }, { "batch_size": 1, "epoch": 0.894, "step": 2235, "tokens_per_device": 5184 }, { "epoch": 0.894, "loss_ce": 0.01039156224578619, "loss_lvr": 0.2422998547554016, "loss_mode_switch": 0.0, "loss_total": 0.034621547907590866, "step": 2235 }, { "batch_size": 1, "epoch": 0.894, "step": 2235, "tokens_per_device": 7152 }, { "epoch": 0.894, "loss_ce": 0.00025839125737547874, "loss_lvr": 0.29235416650772095, "loss_mode_switch": 0.0, "loss_total": 0.02949380874633789, "step": 2235 }, { "batch_size": 1, "epoch": 0.894, "step": 2235, "tokens_per_device": 4699 }, { "epoch": 0.894, "loss_ce": 0.12409372627735138, "loss_lvr": 0.41235360503196716, "loss_mode_switch": 0.0, "loss_total": 0.16532908380031586, "step": 2235 }, { "batch_size": 1, "epoch": 0.894, "step": 2235, "tokens_per_device": 4918 }, { "epoch": 0.894, "loss_ce": 0.28483152389526367, "loss_lvr": 0.47127765417099, "loss_mode_switch": 0.0, "loss_total": 0.3319592773914337, "step": 2235 }, { "batch_size": 1, "epoch": 0.894, "step": 2235, "tokens_per_device": 4928 }, { "epoch": 0.894, "loss_ce": 0.012649079784750938, "loss_lvr": 0.427255243062973, "loss_mode_switch": 0.0, "loss_total": 0.05537460744380951, "step": 2235 }, { "batch_size": 4, "epoch": 0.894, "step": 2235, "tokens_per_device": 2572 }, { "epoch": 0.894, "loss_ce": 0.3132975697517395, "loss_lvr": 0.7821733355522156, "loss_mode_switch": 0.0, "loss_total": 0.3915148973464966, "step": 2235 }, { "epoch": 0.8944, "grad_norm": 1.2518417835235596, "learning_rate": 2.8959203209066477e-07, "loss": 0.2761, "step": 2236 }, { "batch_size": 4, "epoch": 0.8944, "step": 2236, "tokens_per_device": 5000 }, { "epoch": 0.8944, "loss_ce": 0.02732674963772297, "loss_lvr": 0.8073403835296631, "loss_mode_switch": 0.0, "loss_total": 0.10806078463792801, "step": 2236 }, { "batch_size": 4, "epoch": 0.8944, "step": 2236, "tokens_per_device": 2780 }, { "epoch": 0.8944, "loss_ce": 0.27082696557044983, "loss_lvr": 0.5176427960395813, "loss_mode_switch": 0.0, "loss_total": 0.32259124517440796, "step": 2236 }, { "batch_size": 1, "epoch": 0.8944, "step": 2236, "tokens_per_device": 4586 }, { "epoch": 0.8944, "loss_ce": 0.0055639296770095825, "loss_lvr": 0.3402721583843231, "loss_mode_switch": 0.0, "loss_total": 0.039591144770383835, "step": 2236 }, { "batch_size": 4, "epoch": 0.8944, "step": 2236, "tokens_per_device": 1300 }, { "epoch": 0.8944, "loss_ce": 0.08225050568580627, "loss_lvr": 0.9624014496803284, "loss_mode_switch": 0.0, "loss_total": 0.17849065363407135, "step": 2236 }, { "batch_size": 4, "epoch": 0.8944, "step": 2236, "tokens_per_device": 1424 }, { "epoch": 0.8944, "loss_ce": 0.2492854744195938, "loss_lvr": 0.8351085782051086, "loss_mode_switch": 0.0, "loss_total": 0.3327963352203369, "step": 2236 }, { "batch_size": 4, "epoch": 0.8944, "step": 2236, "tokens_per_device": 3268 }, { "epoch": 0.8944, "loss_ce": 0.5336494445800781, "loss_lvr": 0.7856341600418091, "loss_mode_switch": 0.0, "loss_total": 0.6122128367424011, "step": 2236 }, { "batch_size": 4, "epoch": 0.8944, "step": 2236, "tokens_per_device": 4252 }, { "epoch": 0.8944, "loss_ce": 0.34338992834091187, "loss_lvr": 1.141649603843689, "loss_mode_switch": 0.0, "loss_total": 0.4575548768043518, "step": 2236 }, { "batch_size": 4, "epoch": 0.8944, "step": 2236, "tokens_per_device": 3840 }, { "epoch": 0.8944, "loss_ce": 0.09266955405473709, "loss_lvr": 0.9619330167770386, "loss_mode_switch": 0.0, "loss_total": 0.1888628602027893, "step": 2236 }, { "epoch": 0.8948, "grad_norm": 1.187799096107483, "learning_rate": 2.874235335840664e-07, "loss": 0.2484, "step": 2237 }, { "batch_size": 1, "epoch": 0.8948, "step": 2237, "tokens_per_device": 4887 }, { "epoch": 0.8948, "loss_ce": 0.035062290728092194, "loss_lvr": 0.257759690284729, "loss_mode_switch": 0.0, "loss_total": 0.060838259756565094, "step": 2237 }, { "batch_size": 4, "epoch": 0.8948, "step": 2237, "tokens_per_device": 3744 }, { "epoch": 0.8948, "loss_ce": 0.5039397478103638, "loss_lvr": 0.7740516662597656, "loss_mode_switch": 0.0, "loss_total": 0.5813449025154114, "step": 2237 }, { "batch_size": 4, "epoch": 0.8948, "step": 2237, "tokens_per_device": 8004 }, { "epoch": 0.8948, "loss_ce": 0.6906256675720215, "loss_lvr": 0.7770620584487915, "loss_mode_switch": 0.0, "loss_total": 0.7683318853378296, "step": 2237 }, { "batch_size": 1, "epoch": 0.8948, "step": 2237, "tokens_per_device": 5151 }, { "epoch": 0.8948, "loss_ce": 0.006067262031137943, "loss_lvr": 0.43922343850135803, "loss_mode_switch": 0.0, "loss_total": 0.04998960718512535, "step": 2237 }, { "batch_size": 4, "epoch": 0.8948, "step": 2237, "tokens_per_device": 2616 }, { "epoch": 0.8948, "loss_ce": 0.10877720266580582, "loss_lvr": 0.7876396775245667, "loss_mode_switch": 0.0, "loss_total": 0.1875411719083786, "step": 2237 }, { "batch_size": 1, "epoch": 0.8948, "step": 2237, "tokens_per_device": 5349 }, { "epoch": 0.8948, "loss_ce": 0.30870285630226135, "loss_lvr": 0.2911388874053955, "loss_mode_switch": 0.0, "loss_total": 0.3378167450428009, "step": 2237 }, { "batch_size": 4, "epoch": 0.8948, "step": 2237, "tokens_per_device": 7184 }, { "epoch": 0.8948, "loss_ce": 0.08304058015346527, "loss_lvr": 0.6187183856964111, "loss_mode_switch": 0.0, "loss_total": 0.14491242170333862, "step": 2237 }, { "batch_size": 4, "epoch": 0.8948, "step": 2237, "tokens_per_device": 4904 }, { "epoch": 0.8948, "loss_ce": 0.26228025555610657, "loss_lvr": 0.933047890663147, "loss_mode_switch": 0.0, "loss_total": 0.3555850386619568, "step": 2237 }, { "epoch": 0.8952, "grad_norm": 1.3689757585525513, "learning_rate": 2.852629443148247e-07, "loss": 0.3035, "step": 2238 }, { "batch_size": 4, "epoch": 0.8952, "step": 2238, "tokens_per_device": 4880 }, { "epoch": 0.8952, "loss_ce": 0.19118666648864746, "loss_lvr": 0.784391462802887, "loss_mode_switch": 0.0, "loss_total": 0.26962581276893616, "step": 2238 }, { "batch_size": 1, "epoch": 0.8952, "step": 2238, "tokens_per_device": 5159 }, { "epoch": 0.8952, "loss_ce": 7.848396489862353e-05, "loss_lvr": 0.33799561858177185, "loss_mode_switch": 0.0, "loss_total": 0.03387804701924324, "step": 2238 }, { "batch_size": 4, "epoch": 0.8952, "step": 2238, "tokens_per_device": 8892 }, { "epoch": 0.8952, "loss_ce": 0.2710146903991699, "loss_lvr": 0.6422485709190369, "loss_mode_switch": 0.0, "loss_total": 0.33523955941200256, "step": 2238 }, { "batch_size": 4, "epoch": 0.8952, "step": 2238, "tokens_per_device": 4320 }, { "epoch": 0.8952, "loss_ce": 0.12544608116149902, "loss_lvr": 0.6285117268562317, "loss_mode_switch": 0.0, "loss_total": 0.18829725682735443, "step": 2238 }, { "batch_size": 1, "epoch": 0.8952, "step": 2238, "tokens_per_device": 5026 }, { "epoch": 0.8952, "loss_ce": 0.0005731042474508286, "loss_lvr": 0.5970945358276367, "loss_mode_switch": 0.0, "loss_total": 0.06028255820274353, "step": 2238 }, { "batch_size": 1, "epoch": 0.8952, "step": 2238, "tokens_per_device": 4874 }, { "epoch": 0.8952, "loss_ce": 0.004073755349963903, "loss_lvr": 1.237565517425537, "loss_mode_switch": 0.0, "loss_total": 0.12783031165599823, "step": 2238 }, { "batch_size": 4, "epoch": 0.8952, "step": 2238, "tokens_per_device": 4000 }, { "epoch": 0.8952, "loss_ce": 0.27356499433517456, "loss_lvr": 0.9875867962837219, "loss_mode_switch": 0.0, "loss_total": 0.3723236918449402, "step": 2238 }, { "batch_size": 4, "epoch": 0.8952, "step": 2238, "tokens_per_device": 4212 }, { "epoch": 0.8952, "loss_ce": 0.20686602592468262, "loss_lvr": 0.7678601145744324, "loss_mode_switch": 0.0, "loss_total": 0.28365203738212585, "step": 2238 }, { "epoch": 0.8956, "grad_norm": 1.3253648281097412, "learning_rate": 2.831102679091113e-07, "loss": 0.2923, "step": 2239 }, { "batch_size": 1, "epoch": 0.8956, "step": 2239, "tokens_per_device": 4737 }, { "epoch": 0.8956, "loss_ce": 0.6117055416107178, "loss_lvr": 0.4072667360305786, "loss_mode_switch": 0.0, "loss_total": 0.6524322032928467, "step": 2239 }, { "batch_size": 4, "epoch": 0.8956, "step": 2239, "tokens_per_device": 6996 }, { "epoch": 0.8956, "loss_ce": 0.15453672409057617, "loss_lvr": 0.6069375872612, "loss_mode_switch": 0.0, "loss_total": 0.21523047983646393, "step": 2239 }, { "batch_size": 4, "epoch": 0.8956, "step": 2239, "tokens_per_device": 7552 }, { "epoch": 0.8956, "loss_ce": 0.5438955426216125, "loss_lvr": 0.6870046257972717, "loss_mode_switch": 0.0, "loss_total": 0.6125960350036621, "step": 2239 }, { "batch_size": 4, "epoch": 0.8956, "step": 2239, "tokens_per_device": 5236 }, { "epoch": 0.8956, "loss_ce": 0.19605712592601776, "loss_lvr": 0.8830150961875916, "loss_mode_switch": 0.0, "loss_total": 0.2843586206436157, "step": 2239 }, { "batch_size": 4, "epoch": 0.8956, "step": 2239, "tokens_per_device": 6544 }, { "epoch": 0.8956, "loss_ce": 0.4773077666759491, "loss_lvr": 0.6859827041625977, "loss_mode_switch": 0.0, "loss_total": 0.5459060668945312, "step": 2239 }, { "batch_size": 1, "epoch": 0.8956, "step": 2239, "tokens_per_device": 4975 }, { "epoch": 0.8956, "loss_ce": 0.013636892661452293, "loss_lvr": 0.4347193241119385, "loss_mode_switch": 0.0, "loss_total": 0.05710882693529129, "step": 2239 }, { "batch_size": 4, "epoch": 0.8956, "step": 2239, "tokens_per_device": 2640 }, { "epoch": 0.8956, "loss_ce": 0.21300941705703735, "loss_lvr": 0.7567620873451233, "loss_mode_switch": 0.0, "loss_total": 0.2886856198310852, "step": 2239 }, { "batch_size": 1, "epoch": 0.8956, "step": 2239, "tokens_per_device": 4972 }, { "epoch": 0.8956, "loss_ce": 0.022556057199835777, "loss_lvr": 0.5722944736480713, "loss_mode_switch": 0.0, "loss_total": 0.07978550344705582, "step": 2239 }, { "epoch": 0.896, "grad_norm": 1.2906930446624756, "learning_rate": 2.809655079798179e-07, "loss": 0.3047, "step": 2240 }, { "batch_size": 1, "epoch": 0.896, "step": 2240, "tokens_per_device": 5225 }, { "epoch": 0.896, "loss_ce": 0.006422894075512886, "loss_lvr": 0.29194915294647217, "loss_mode_switch": 0.0, "loss_total": 0.03561780974268913, "step": 2240 }, { "batch_size": 1, "epoch": 0.896, "step": 2240, "tokens_per_device": 5163 }, { "epoch": 0.896, "loss_ce": 0.012583538889884949, "loss_lvr": 0.4302317798137665, "loss_mode_switch": 0.0, "loss_total": 0.055606719106435776, "step": 2240 }, { "batch_size": 4, "epoch": 0.896, "step": 2240, "tokens_per_device": 4240 }, { "epoch": 0.896, "loss_ce": 0.32995152473449707, "loss_lvr": 0.7008594274520874, "loss_mode_switch": 0.0, "loss_total": 0.4000374674797058, "step": 2240 }, { "batch_size": 4, "epoch": 0.896, "step": 2240, "tokens_per_device": 4932 }, { "epoch": 0.896, "loss_ce": 0.0037871715612709522, "loss_lvr": 1.1702102422714233, "loss_mode_switch": 0.0, "loss_total": 0.1208081990480423, "step": 2240 }, { "batch_size": 4, "epoch": 0.896, "step": 2240, "tokens_per_device": 4632 }, { "epoch": 0.896, "loss_ce": 0.541335940361023, "loss_lvr": 0.8153467774391174, "loss_mode_switch": 0.0, "loss_total": 0.6228706240653992, "step": 2240 }, { "batch_size": 4, "epoch": 0.896, "step": 2240, "tokens_per_device": 4524 }, { "epoch": 0.896, "loss_ce": 0.3940127193927765, "loss_lvr": 1.0282479524612427, "loss_mode_switch": 0.0, "loss_total": 0.4968375265598297, "step": 2240 }, { "batch_size": 4, "epoch": 0.896, "step": 2240, "tokens_per_device": 4760 }, { "epoch": 0.896, "loss_ce": 0.1017446517944336, "loss_lvr": 0.7229340076446533, "loss_mode_switch": 0.0, "loss_total": 0.17403805255889893, "step": 2240 }, { "batch_size": 1, "epoch": 0.896, "step": 2240, "tokens_per_device": 5197 }, { "epoch": 0.896, "loss_ce": 0.05647261440753937, "loss_lvr": 0.27087607979774475, "loss_mode_switch": 0.0, "loss_total": 0.08356022089719772, "step": 2240 }, { "epoch": 0.8964, "grad_norm": 1.3714271783828735, "learning_rate": 2.7882866812655006e-07, "loss": 0.3109, "step": 2241 }, { "batch_size": 4, "epoch": 0.8964, "step": 2241, "tokens_per_device": 6000 }, { "epoch": 0.8964, "loss_ce": 0.2568991184234619, "loss_lvr": 0.9202041625976562, "loss_mode_switch": 0.0, "loss_total": 0.348919540643692, "step": 2241 }, { "batch_size": 4, "epoch": 0.8964, "step": 2241, "tokens_per_device": 3852 }, { "epoch": 0.8964, "loss_ce": 0.0710025206208229, "loss_lvr": 1.0429656505584717, "loss_mode_switch": 0.0, "loss_total": 0.17529907822608948, "step": 2241 }, { "batch_size": 1, "epoch": 0.8964, "step": 2241, "tokens_per_device": 4907 }, { "epoch": 0.8964, "loss_ce": 0.16431377828121185, "loss_lvr": 0.6025636196136475, "loss_mode_switch": 0.0, "loss_total": 0.2245701402425766, "step": 2241 }, { "batch_size": 4, "epoch": 0.8964, "step": 2241, "tokens_per_device": 2580 }, { "epoch": 0.8964, "loss_ce": 0.288980096578598, "loss_lvr": 0.9097078442573547, "loss_mode_switch": 0.0, "loss_total": 0.3799508810043335, "step": 2241 }, { "batch_size": 1, "epoch": 0.8964, "step": 2241, "tokens_per_device": 4885 }, { "epoch": 0.8964, "loss_ce": 0.2149573713541031, "loss_lvr": 0.8305319547653198, "loss_mode_switch": 0.0, "loss_total": 0.29801055788993835, "step": 2241 }, { "batch_size": 4, "epoch": 0.8964, "step": 2241, "tokens_per_device": 3860 }, { "epoch": 0.8964, "loss_ce": 0.4965856969356537, "loss_lvr": 0.8219888806343079, "loss_mode_switch": 0.0, "loss_total": 0.5787845849990845, "step": 2241 }, { "batch_size": 4, "epoch": 0.8964, "step": 2241, "tokens_per_device": 1488 }, { "epoch": 0.8964, "loss_ce": 0.5983678698539734, "loss_lvr": 0.8870950937271118, "loss_mode_switch": 0.0, "loss_total": 0.6870774030685425, "step": 2241 }, { "batch_size": 4, "epoch": 0.8964, "step": 2241, "tokens_per_device": 3560 }, { "epoch": 0.8964, "loss_ce": 0.0687834694981575, "loss_lvr": 0.35742127895355225, "loss_mode_switch": 0.0, "loss_total": 0.1045255959033966, "step": 2241 }, { "epoch": 0.8968, "grad_norm": 1.333001971244812, "learning_rate": 2.7669975193562013e-07, "loss": 0.3205, "step": 2242 }, { "batch_size": 4, "epoch": 0.8968, "step": 2242, "tokens_per_device": 3460 }, { "epoch": 0.8968, "loss_ce": 0.03227725625038147, "loss_lvr": 1.1182327270507812, "loss_mode_switch": 0.0, "loss_total": 0.14410053193569183, "step": 2242 }, { "batch_size": 4, "epoch": 0.8968, "step": 2242, "tokens_per_device": 4252 }, { "epoch": 0.8968, "loss_ce": 0.20183506608009338, "loss_lvr": 0.8931135535240173, "loss_mode_switch": 0.0, "loss_total": 0.2911464273929596, "step": 2242 }, { "batch_size": 4, "epoch": 0.8968, "step": 2242, "tokens_per_device": 2632 }, { "epoch": 0.8968, "loss_ce": 0.06679773330688477, "loss_lvr": 0.7566985487937927, "loss_mode_switch": 0.0, "loss_total": 0.14246758818626404, "step": 2242 }, { "batch_size": 4, "epoch": 0.8968, "step": 2242, "tokens_per_device": 4768 }, { "epoch": 0.8968, "loss_ce": 0.11512815207242966, "loss_lvr": 0.7648110389709473, "loss_mode_switch": 0.0, "loss_total": 0.19160926342010498, "step": 2242 }, { "batch_size": 1, "epoch": 0.8968, "step": 2242, "tokens_per_device": 4919 }, { "epoch": 0.8968, "loss_ce": 0.03325880691409111, "loss_lvr": 0.47250276803970337, "loss_mode_switch": 0.0, "loss_total": 0.08050908148288727, "step": 2242 }, { "batch_size": 1, "epoch": 0.8968, "step": 2242, "tokens_per_device": 5964 }, { "epoch": 0.8968, "loss_ce": 0.09580254554748535, "loss_lvr": 0.27385133504867554, "loss_mode_switch": 0.0, "loss_total": 0.12318767607212067, "step": 2242 }, { "batch_size": 4, "epoch": 0.8968, "step": 2242, "tokens_per_device": 4256 }, { "epoch": 0.8968, "loss_ce": 0.2448796182870865, "loss_lvr": 0.6529480218887329, "loss_mode_switch": 0.0, "loss_total": 0.3101744055747986, "step": 2242 }, { "batch_size": 1, "epoch": 0.8968, "step": 2242, "tokens_per_device": 4878 }, { "epoch": 0.8968, "loss_ce": 0.08572445064783096, "loss_lvr": 0.33653321862220764, "loss_mode_switch": 0.0, "loss_total": 0.11937777698040009, "step": 2242 }, { "epoch": 0.8972, "grad_norm": 1.248185157775879, "learning_rate": 2.7457876298004393e-07, "loss": 0.2704, "step": 2243 }, { "batch_size": 4, "epoch": 0.8972, "step": 2243, "tokens_per_device": 4252 }, { "epoch": 0.8972, "loss_ce": 0.16149969398975372, "loss_lvr": 0.958137571811676, "loss_mode_switch": 0.0, "loss_total": 0.25731346011161804, "step": 2243 }, { "batch_size": 1, "epoch": 0.8972, "step": 2243, "tokens_per_device": 4888 }, { "epoch": 0.8972, "loss_ce": 0.047578807920217514, "loss_lvr": 0.6422881484031677, "loss_mode_switch": 0.0, "loss_total": 0.11180762946605682, "step": 2243 }, { "batch_size": 4, "epoch": 0.8972, "step": 2243, "tokens_per_device": 4792 }, { "epoch": 0.8972, "loss_ce": 0.5608074069023132, "loss_lvr": 0.7828223705291748, "loss_mode_switch": 0.0, "loss_total": 0.6390896439552307, "step": 2243 }, { "batch_size": 1, "epoch": 0.8972, "step": 2243, "tokens_per_device": 5153 }, { "epoch": 0.8972, "loss_ce": 0.0338735431432724, "loss_lvr": 0.21598298847675323, "loss_mode_switch": 0.0, "loss_total": 0.05547184497117996, "step": 2243 }, { "batch_size": 4, "epoch": 0.8972, "step": 2243, "tokens_per_device": 6552 }, { "epoch": 0.8972, "loss_ce": 0.05448725447058678, "loss_lvr": 0.908040463924408, "loss_mode_switch": 0.0, "loss_total": 0.1452912986278534, "step": 2243 }, { "batch_size": 1, "epoch": 0.8972, "step": 2243, "tokens_per_device": 5165 }, { "epoch": 0.8972, "loss_ce": 0.013935265131294727, "loss_lvr": 0.4356772303581238, "loss_mode_switch": 0.0, "loss_total": 0.05750298872590065, "step": 2243 }, { "batch_size": 4, "epoch": 0.8972, "step": 2243, "tokens_per_device": 16016 }, { "epoch": 0.8972, "loss_ce": 0.1971789300441742, "loss_lvr": 0.812544047832489, "loss_mode_switch": 0.0, "loss_total": 0.27843332290649414, "step": 2243 }, { "batch_size": 1, "epoch": 0.8972, "step": 2243, "tokens_per_device": 5533 }, { "epoch": 0.8972, "loss_ce": 0.014359463937580585, "loss_lvr": 0.48181188106536865, "loss_mode_switch": 0.0, "loss_total": 0.06254065036773682, "step": 2243 }, { "epoch": 0.8976, "grad_norm": 1.1862480640411377, "learning_rate": 2.7246570481953004e-07, "loss": 0.2807, "step": 2244 }, { "batch_size": 4, "epoch": 0.8976, "step": 2244, "tokens_per_device": 2648 }, { "epoch": 0.8976, "loss_ce": 0.39657363295555115, "loss_lvr": 0.9190716743469238, "loss_mode_switch": 0.0, "loss_total": 0.488480806350708, "step": 2244 }, { "batch_size": 1, "epoch": 0.8976, "step": 2244, "tokens_per_device": 4825 }, { "epoch": 0.8976, "loss_ce": 0.3352585434913635, "loss_lvr": 0.6458570957183838, "loss_mode_switch": 0.0, "loss_total": 0.3998442590236664, "step": 2244 }, { "batch_size": 4, "epoch": 0.8976, "step": 2244, "tokens_per_device": 5016 }, { "epoch": 0.8976, "loss_ce": 0.22111999988555908, "loss_lvr": 0.9982195496559143, "loss_mode_switch": 0.0, "loss_total": 0.3209419548511505, "step": 2244 }, { "batch_size": 4, "epoch": 0.8976, "step": 2244, "tokens_per_device": 1584 }, { "epoch": 0.8976, "loss_ce": 0.2692221403121948, "loss_lvr": 2.094099521636963, "loss_mode_switch": 0.0, "loss_total": 0.4786320924758911, "step": 2244 }, { "batch_size": 1, "epoch": 0.8976, "step": 2244, "tokens_per_device": 5149 }, { "epoch": 0.8976, "loss_ce": 0.0009882585145533085, "loss_lvr": 0.5616862773895264, "loss_mode_switch": 0.0, "loss_total": 0.05715688690543175, "step": 2244 }, { "batch_size": 1, "epoch": 0.8976, "step": 2244, "tokens_per_device": 5108 }, { "epoch": 0.8976, "loss_ce": 0.0018423295114189386, "loss_lvr": 0.31608277559280396, "loss_mode_switch": 0.0, "loss_total": 0.03345061093568802, "step": 2244 }, { "batch_size": 4, "epoch": 0.8976, "step": 2244, "tokens_per_device": 5852 }, { "epoch": 0.8976, "loss_ce": 0.21978750824928284, "loss_lvr": 0.8058212995529175, "loss_mode_switch": 0.0, "loss_total": 0.30036965012550354, "step": 2244 }, { "batch_size": 4, "epoch": 0.8976, "step": 2244, "tokens_per_device": 3128 }, { "epoch": 0.8976, "loss_ce": 0.06798703223466873, "loss_lvr": 0.6506779789924622, "loss_mode_switch": 0.0, "loss_total": 0.13305482268333435, "step": 2244 }, { "epoch": 0.898, "grad_norm": 1.190726637840271, "learning_rate": 2.7036058100047723e-07, "loss": 0.2436, "step": 2245 }, { "batch_size": 4, "epoch": 0.898, "step": 2245, "tokens_per_device": 8492 }, { "epoch": 0.898, "loss_ce": 0.603135883808136, "loss_lvr": 0.6605473756790161, "loss_mode_switch": 0.0, "loss_total": 0.6691906452178955, "step": 2245 }, { "batch_size": 1, "epoch": 0.898, "step": 2245, "tokens_per_device": 5097 }, { "epoch": 0.898, "loss_ce": 0.001075509819202125, "loss_lvr": 0.282472163438797, "loss_mode_switch": 0.0, "loss_total": 0.029322726652026176, "step": 2245 }, { "batch_size": 4, "epoch": 0.898, "step": 2245, "tokens_per_device": 4352 }, { "epoch": 0.898, "loss_ce": 0.6591821908950806, "loss_lvr": 0.7920011878013611, "loss_mode_switch": 0.0, "loss_total": 0.7383823394775391, "step": 2245 }, { "batch_size": 1, "epoch": 0.898, "step": 2245, "tokens_per_device": 5106 }, { "epoch": 0.898, "loss_ce": 0.001034358749166131, "loss_lvr": 0.18579934537410736, "loss_mode_switch": 0.0, "loss_total": 0.019614294171333313, "step": 2245 }, { "batch_size": 4, "epoch": 0.898, "step": 2245, "tokens_per_device": 2632 }, { "epoch": 0.898, "loss_ce": 0.30651044845581055, "loss_lvr": 0.8007957935333252, "loss_mode_switch": 0.0, "loss_total": 0.38659003376960754, "step": 2245 }, { "batch_size": 4, "epoch": 0.898, "step": 2245, "tokens_per_device": 1596 }, { "epoch": 0.898, "loss_ce": 0.08933546394109726, "loss_lvr": 1.0146313905715942, "loss_mode_switch": 0.0, "loss_total": 0.19079861044883728, "step": 2245 }, { "batch_size": 4, "epoch": 0.898, "step": 2245, "tokens_per_device": 1216 }, { "epoch": 0.898, "loss_ce": 0.4348978102207184, "loss_lvr": 0.9209517240524292, "loss_mode_switch": 0.0, "loss_total": 0.5269929766654968, "step": 2245 }, { "batch_size": 4, "epoch": 0.898, "step": 2245, "tokens_per_device": 1312 }, { "epoch": 0.898, "loss_ce": 0.07861097157001495, "loss_lvr": 0.9452416300773621, "loss_mode_switch": 0.0, "loss_total": 0.17313513159751892, "step": 2245 }, { "epoch": 0.8984, "grad_norm": 1.3056514263153076, "learning_rate": 2.682633950559699e-07, "loss": 0.2792, "step": 2246 }, { "batch_size": 1, "epoch": 0.8984, "step": 2246, "tokens_per_device": 5166 }, { "epoch": 0.8984, "loss_ce": 0.008305005729198456, "loss_lvr": 0.2522905170917511, "loss_mode_switch": 0.0, "loss_total": 0.033534057438373566, "step": 2246 }, { "batch_size": 4, "epoch": 0.8984, "step": 2246, "tokens_per_device": 9644 }, { "epoch": 0.8984, "loss_ce": 0.0727454423904419, "loss_lvr": 0.8869473934173584, "loss_mode_switch": 0.0, "loss_total": 0.1614401936531067, "step": 2246 }, { "batch_size": 1, "epoch": 0.8984, "step": 2246, "tokens_per_device": 4954 }, { "epoch": 0.8984, "loss_ce": 0.0003675575426314026, "loss_lvr": 0.14715874195098877, "loss_mode_switch": 0.0, "loss_total": 0.0150834321975708, "step": 2246 }, { "batch_size": 4, "epoch": 0.8984, "step": 2246, "tokens_per_device": 2828 }, { "epoch": 0.8984, "loss_ce": 0.2691822350025177, "loss_lvr": 0.3696399927139282, "loss_mode_switch": 0.0, "loss_total": 0.3061462342739105, "step": 2246 }, { "batch_size": 4, "epoch": 0.8984, "step": 2246, "tokens_per_device": 3804 }, { "epoch": 0.8984, "loss_ce": 0.2688400447368622, "loss_lvr": 0.958601713180542, "loss_mode_switch": 0.0, "loss_total": 0.36470022797584534, "step": 2246 }, { "batch_size": 4, "epoch": 0.8984, "step": 2246, "tokens_per_device": 4256 }, { "epoch": 0.8984, "loss_ce": 0.5272949934005737, "loss_lvr": 0.8598993420600891, "loss_mode_switch": 0.0, "loss_total": 0.6132849454879761, "step": 2246 }, { "batch_size": 4, "epoch": 0.8984, "step": 2246, "tokens_per_device": 5280 }, { "epoch": 0.8984, "loss_ce": 0.29420793056488037, "loss_lvr": 0.779534637928009, "loss_mode_switch": 0.0, "loss_total": 0.3721613883972168, "step": 2246 }, { "batch_size": 1, "epoch": 0.8984, "step": 2246, "tokens_per_device": 4891 }, { "epoch": 0.8984, "loss_ce": 0.0017796106403693557, "loss_lvr": 0.2613086998462677, "loss_mode_switch": 0.0, "loss_total": 0.02791048027575016, "step": 2246 }, { "epoch": 0.8988, "grad_norm": 1.3745135068893433, "learning_rate": 2.661741505057691e-07, "loss": 0.2858, "step": 2247 }, { "batch_size": 4, "epoch": 0.8988, "step": 2247, "tokens_per_device": 4256 }, { "epoch": 0.8988, "loss_ce": 0.00952578242868185, "loss_lvr": 0.9816322922706604, "loss_mode_switch": 0.0, "loss_total": 0.1076890155673027, "step": 2247 }, { "batch_size": 4, "epoch": 0.8988, "step": 2247, "tokens_per_device": 6284 }, { "epoch": 0.8988, "loss_ce": 0.03825180232524872, "loss_lvr": 0.6674704551696777, "loss_mode_switch": 0.0, "loss_total": 0.10499884933233261, "step": 2247 }, { "batch_size": 4, "epoch": 0.8988, "step": 2247, "tokens_per_device": 2704 }, { "epoch": 0.8988, "loss_ce": 0.7248995304107666, "loss_lvr": 0.8371243476867676, "loss_mode_switch": 0.0, "loss_total": 0.8086119890213013, "step": 2247 }, { "batch_size": 4, "epoch": 0.8988, "step": 2247, "tokens_per_device": 1232 }, { "epoch": 0.8988, "loss_ce": 0.2785067856311798, "loss_lvr": 1.0235223770141602, "loss_mode_switch": 0.0, "loss_total": 0.38085901737213135, "step": 2247 }, { "batch_size": 4, "epoch": 0.8988, "step": 2247, "tokens_per_device": 2768 }, { "epoch": 0.8988, "loss_ce": 0.6215076446533203, "loss_lvr": 0.7687287330627441, "loss_mode_switch": 0.0, "loss_total": 0.6983805298805237, "step": 2247 }, { "batch_size": 4, "epoch": 0.8988, "step": 2247, "tokens_per_device": 5824 }, { "epoch": 0.8988, "loss_ce": 0.23619714379310608, "loss_lvr": 0.922692596912384, "loss_mode_switch": 0.0, "loss_total": 0.32846641540527344, "step": 2247 }, { "batch_size": 4, "epoch": 0.8988, "step": 2247, "tokens_per_device": 1512 }, { "epoch": 0.8988, "loss_ce": 0.6577569246292114, "loss_lvr": 0.8943262696266174, "loss_mode_switch": 0.0, "loss_total": 0.7471895217895508, "step": 2247 }, { "batch_size": 1, "epoch": 0.8988, "step": 2247, "tokens_per_device": 5007 }, { "epoch": 0.8988, "loss_ce": 0.00016828089428599924, "loss_lvr": 0.34964805841445923, "loss_mode_switch": 0.0, "loss_total": 0.03513308987021446, "step": 2247 }, { "epoch": 0.8992, "grad_norm": 1.4207271337509155, "learning_rate": 2.640928508563062e-07, "loss": 0.2911, "step": 2248 }, { "batch_size": 1, "epoch": 0.8992, "step": 2248, "tokens_per_device": 4875 }, { "epoch": 0.8992, "loss_ce": 0.030751649290323257, "loss_lvr": 0.15924564003944397, "loss_mode_switch": 0.0, "loss_total": 0.046676211059093475, "step": 2248 }, { "batch_size": 4, "epoch": 0.8992, "step": 2248, "tokens_per_device": 4704 }, { "epoch": 0.8992, "loss_ce": 0.2099931389093399, "loss_lvr": 0.4046298563480377, "loss_mode_switch": 0.0, "loss_total": 0.2504561245441437, "step": 2248 }, { "batch_size": 1, "epoch": 0.8992, "step": 2248, "tokens_per_device": 4896 }, { "epoch": 0.8992, "loss_ce": 0.28282514214515686, "loss_lvr": 0.4795677363872528, "loss_mode_switch": 0.0, "loss_total": 0.3307819068431854, "step": 2248 }, { "batch_size": 4, "epoch": 0.8992, "step": 2248, "tokens_per_device": 4216 }, { "epoch": 0.8992, "loss_ce": 0.28583452105522156, "loss_lvr": 0.8972929120063782, "loss_mode_switch": 0.0, "loss_total": 0.3755638003349304, "step": 2248 }, { "batch_size": 4, "epoch": 0.8992, "step": 2248, "tokens_per_device": 6304 }, { "epoch": 0.8992, "loss_ce": 0.15512926876544952, "loss_lvr": 0.7636427283287048, "loss_mode_switch": 0.0, "loss_total": 0.2314935326576233, "step": 2248 }, { "batch_size": 1, "epoch": 0.8992, "step": 2248, "tokens_per_device": 4849 }, { "epoch": 0.8992, "loss_ce": 0.004897920414805412, "loss_lvr": 0.3292595446109772, "loss_mode_switch": 0.0, "loss_total": 0.0378238782286644, "step": 2248 }, { "batch_size": 4, "epoch": 0.8992, "step": 2248, "tokens_per_device": 5256 }, { "epoch": 0.8992, "loss_ce": 0.24720999598503113, "loss_lvr": 1.0262479782104492, "loss_mode_switch": 0.0, "loss_total": 0.3498347997665405, "step": 2248 }, { "batch_size": 4, "epoch": 0.8992, "step": 2248, "tokens_per_device": 3912 }, { "epoch": 0.8992, "loss_ce": 0.27051377296447754, "loss_lvr": 0.8966102004051208, "loss_mode_switch": 0.0, "loss_total": 0.3601748049259186, "step": 2248 }, { "epoch": 0.8996, "grad_norm": 1.316380262374878, "learning_rate": 2.620194996006803e-07, "loss": 0.3019, "step": 2249 }, { "batch_size": 4, "epoch": 0.8996, "step": 2249, "tokens_per_device": 3764 }, { "epoch": 0.8996, "loss_ce": 0.24799947440624237, "loss_lvr": 0.6960365772247314, "loss_mode_switch": 0.0, "loss_total": 0.31760314106941223, "step": 2249 }, { "batch_size": 1, "epoch": 0.8996, "step": 2249, "tokens_per_device": 5139 }, { "epoch": 0.8996, "loss_ce": 0.0016574313631281257, "loss_lvr": 0.3464711308479309, "loss_mode_switch": 0.0, "loss_total": 0.036304544657468796, "step": 2249 }, { "batch_size": 4, "epoch": 0.8996, "step": 2249, "tokens_per_device": 1336 }, { "epoch": 0.8996, "loss_ce": 0.5355117917060852, "loss_lvr": 0.9889401197433472, "loss_mode_switch": 0.0, "loss_total": 0.634405791759491, "step": 2249 }, { "batch_size": 1, "epoch": 0.8996, "step": 2249, "tokens_per_device": 4817 }, { "epoch": 0.8996, "loss_ce": 0.0003394423401914537, "loss_lvr": 0.2694917321205139, "loss_mode_switch": 0.0, "loss_total": 0.027288615703582764, "step": 2249 }, { "batch_size": 4, "epoch": 0.8996, "step": 2249, "tokens_per_device": 4684 }, { "epoch": 0.8996, "loss_ce": 0.21822009980678558, "loss_lvr": 0.7279889583587646, "loss_mode_switch": 0.0, "loss_total": 0.2910189926624298, "step": 2249 }, { "batch_size": 1, "epoch": 0.8996, "step": 2249, "tokens_per_device": 4348 }, { "epoch": 0.8996, "loss_ce": 0.11237229406833649, "loss_lvr": 0.3062892258167267, "loss_mode_switch": 0.0, "loss_total": 0.14300121366977692, "step": 2249 }, { "batch_size": 4, "epoch": 0.8996, "step": 2249, "tokens_per_device": 4228 }, { "epoch": 0.8996, "loss_ce": 0.02710534818470478, "loss_lvr": 0.7273889183998108, "loss_mode_switch": 0.0, "loss_total": 0.09984423965215683, "step": 2249 }, { "batch_size": 4, "epoch": 0.8996, "step": 2249, "tokens_per_device": 5224 }, { "epoch": 0.8996, "loss_ce": 0.10454221069812775, "loss_lvr": 0.6577479243278503, "loss_mode_switch": 0.0, "loss_total": 0.17031699419021606, "step": 2249 }, { "epoch": 0.9, "grad_norm": 1.1150099039077759, "learning_rate": 2.599541002186479e-07, "loss": 0.2694, "step": 2250 }, { "batch_size": 1, "epoch": 0.9, "step": 2250, "tokens_per_device": 5099 }, { "epoch": 0.9, "loss_ce": 0.044498298317193985, "loss_lvr": 0.1628948450088501, "loss_mode_switch": 0.0, "loss_total": 0.060787782073020935, "step": 2250 }, { "batch_size": 4, "epoch": 0.9, "step": 2250, "tokens_per_device": 1344 }, { "epoch": 0.9, "loss_ce": 0.6454606652259827, "loss_lvr": 0.8604776263237, "loss_mode_switch": 0.0, "loss_total": 0.7315084338188171, "step": 2250 }, { "batch_size": 1, "epoch": 0.9, "step": 2250, "tokens_per_device": 5166 }, { "epoch": 0.9, "loss_ce": 0.007107269484549761, "loss_lvr": 0.75825434923172, "loss_mode_switch": 0.0, "loss_total": 0.08293271064758301, "step": 2250 }, { "batch_size": 4, "epoch": 0.9, "step": 2250, "tokens_per_device": 3824 }, { "epoch": 0.9, "loss_ce": 0.2593557834625244, "loss_lvr": 0.8720059394836426, "loss_mode_switch": 0.0, "loss_total": 0.3465563654899597, "step": 2250 }, { "batch_size": 4, "epoch": 0.9, "step": 2250, "tokens_per_device": 3808 }, { "epoch": 0.9, "loss_ce": 0.23362348973751068, "loss_lvr": 1.2465320825576782, "loss_mode_switch": 0.0, "loss_total": 0.35827669501304626, "step": 2250 }, { "batch_size": 1, "epoch": 0.9, "step": 2250, "tokens_per_device": 5099 }, { "epoch": 0.9, "loss_ce": 0.060834161937236786, "loss_lvr": 0.3958199918270111, "loss_mode_switch": 0.0, "loss_total": 0.1004161611199379, "step": 2250 }, { "batch_size": 4, "epoch": 0.9, "step": 2250, "tokens_per_device": 5840 }, { "epoch": 0.9, "loss_ce": 0.11922194808721542, "loss_lvr": 0.5438611507415771, "loss_mode_switch": 0.0, "loss_total": 0.17360806465148926, "step": 2250 }, { "batch_size": 4, "epoch": 0.9, "step": 2250, "tokens_per_device": 4400 }, { "epoch": 0.9, "loss_ce": 0.039369136095047, "loss_lvr": 0.8920807838439941, "loss_mode_switch": 0.0, "loss_total": 0.12857721745967865, "step": 2250 }, { "epoch": 0.9004, "grad_norm": 1.6205949783325195, "learning_rate": 2.578966561766233e-07, "loss": 0.289, "step": 2251 }, { "batch_size": 4, "epoch": 0.9004, "step": 2251, "tokens_per_device": 3736 }, { "epoch": 0.9004, "loss_ce": 0.06800737231969833, "loss_lvr": 1.2298089265823364, "loss_mode_switch": 0.0, "loss_total": 0.19098827242851257, "step": 2251 }, { "batch_size": 1, "epoch": 0.9004, "step": 2251, "tokens_per_device": 5019 }, { "epoch": 0.9004, "loss_ce": 0.14825017750263214, "loss_lvr": 0.465750128030777, "loss_mode_switch": 0.0, "loss_total": 0.1948251873254776, "step": 2251 }, { "batch_size": 4, "epoch": 0.9004, "step": 2251, "tokens_per_device": 13020 }, { "epoch": 0.9004, "loss_ce": 0.15601758658885956, "loss_lvr": 0.7490947842597961, "loss_mode_switch": 0.0, "loss_total": 0.23092706501483917, "step": 2251 }, { "batch_size": 4, "epoch": 0.9004, "step": 2251, "tokens_per_device": 4436 }, { "epoch": 0.9004, "loss_ce": 0.3827536702156067, "loss_lvr": 0.6413374543190002, "loss_mode_switch": 0.0, "loss_total": 0.44688743352890015, "step": 2251 }, { "batch_size": 1, "epoch": 0.9004, "step": 2251, "tokens_per_device": 4821 }, { "epoch": 0.9004, "loss_ce": 0.0006257136119529605, "loss_lvr": 0.3388870656490326, "loss_mode_switch": 0.0, "loss_total": 0.034514423459768295, "step": 2251 }, { "batch_size": 4, "epoch": 0.9004, "step": 2251, "tokens_per_device": 4256 }, { "epoch": 0.9004, "loss_ce": 0.8167068362236023, "loss_lvr": 0.6016973853111267, "loss_mode_switch": 0.0, "loss_total": 0.8768765926361084, "step": 2251 }, { "batch_size": 4, "epoch": 0.9004, "step": 2251, "tokens_per_device": 4232 }, { "epoch": 0.9004, "loss_ce": 0.18730135262012482, "loss_lvr": 0.9692912697792053, "loss_mode_switch": 0.0, "loss_total": 0.28423047065734863, "step": 2251 }, { "batch_size": 1, "epoch": 0.9004, "step": 2251, "tokens_per_device": 4856 }, { "epoch": 0.9004, "loss_ce": 0.003351702354848385, "loss_lvr": 0.36316293478012085, "loss_mode_switch": 0.0, "loss_total": 0.039667997509241104, "step": 2251 }, { "epoch": 0.9008, "grad_norm": 1.3895959854125977, "learning_rate": 2.5584717092766774e-07, "loss": 0.3058, "step": 2252 }, { "batch_size": 4, "epoch": 0.9008, "step": 2252, "tokens_per_device": 2944 }, { "epoch": 0.9008, "loss_ce": 0.028972795233130455, "loss_lvr": 0.7864685654640198, "loss_mode_switch": 0.0, "loss_total": 0.10761965811252594, "step": 2252 }, { "batch_size": 4, "epoch": 0.9008, "step": 2252, "tokens_per_device": 2616 }, { "epoch": 0.9008, "loss_ce": 0.3872585594654083, "loss_lvr": 1.0783789157867432, "loss_mode_switch": 0.0, "loss_total": 0.49509644508361816, "step": 2252 }, { "batch_size": 1, "epoch": 0.9008, "step": 2252, "tokens_per_device": 4235 }, { "epoch": 0.9008, "loss_ce": 0.05461415275931358, "loss_lvr": 0.39105352759361267, "loss_mode_switch": 0.0, "loss_total": 0.09371950477361679, "step": 2252 }, { "batch_size": 4, "epoch": 0.9008, "step": 2252, "tokens_per_device": 1228 }, { "epoch": 0.9008, "loss_ce": 0.49031883478164673, "loss_lvr": 1.2387737035751343, "loss_mode_switch": 0.0, "loss_total": 0.6141961812973022, "step": 2252 }, { "batch_size": 4, "epoch": 0.9008, "step": 2252, "tokens_per_device": 1332 }, { "epoch": 0.9008, "loss_ce": 0.3939797282218933, "loss_lvr": 0.8680931925773621, "loss_mode_switch": 0.0, "loss_total": 0.48078906536102295, "step": 2252 }, { "batch_size": 4, "epoch": 0.9008, "step": 2252, "tokens_per_device": 4612 }, { "epoch": 0.9008, "loss_ce": 0.24031932651996613, "loss_lvr": 0.9476678967475891, "loss_mode_switch": 0.0, "loss_total": 0.3350861072540283, "step": 2252 }, { "batch_size": 4, "epoch": 0.9008, "step": 2252, "tokens_per_device": 6308 }, { "epoch": 0.9008, "loss_ce": 0.18972338736057281, "loss_lvr": 0.7173230051994324, "loss_mode_switch": 0.0, "loss_total": 0.2614556849002838, "step": 2252 }, { "batch_size": 4, "epoch": 0.9008, "step": 2252, "tokens_per_device": 5744 }, { "epoch": 0.9008, "loss_ce": 0.46015894412994385, "loss_lvr": 0.965789794921875, "loss_mode_switch": 0.0, "loss_total": 0.5567378997802734, "step": 2252 }, { "epoch": 0.9012, "grad_norm": 1.3023035526275635, "learning_rate": 2.5380564791148364e-07, "loss": 0.3126, "step": 2253 }, { "batch_size": 4, "epoch": 0.9012, "step": 2253, "tokens_per_device": 3324 }, { "epoch": 0.9012, "loss_ce": 0.009194986894726753, "loss_lvr": 1.0282307863235474, "loss_mode_switch": 0.0, "loss_total": 0.11201806366443634, "step": 2253 }, { "batch_size": 4, "epoch": 0.9012, "step": 2253, "tokens_per_device": 3804 }, { "epoch": 0.9012, "loss_ce": 0.07693035900592804, "loss_lvr": 2.2058393955230713, "loss_mode_switch": 0.0, "loss_total": 0.29751431941986084, "step": 2253 }, { "batch_size": 4, "epoch": 0.9012, "step": 2253, "tokens_per_device": 5740 }, { "epoch": 0.9012, "loss_ce": 0.6009993553161621, "loss_lvr": 0.7053505182266235, "loss_mode_switch": 0.0, "loss_total": 0.6715344190597534, "step": 2253 }, { "batch_size": 1, "epoch": 0.9012, "step": 2253, "tokens_per_device": 4441 }, { "epoch": 0.9012, "loss_ce": 0.0002461660769768059, "loss_lvr": 0.2876129746437073, "loss_mode_switch": 0.0, "loss_total": 0.02900746278464794, "step": 2253 }, { "batch_size": 1, "epoch": 0.9012, "step": 2253, "tokens_per_device": 4955 }, { "epoch": 0.9012, "loss_ce": 0.2850800156593323, "loss_lvr": 0.2611978054046631, "loss_mode_switch": 0.0, "loss_total": 0.31119978427886963, "step": 2253 }, { "batch_size": 4, "epoch": 0.9012, "step": 2253, "tokens_per_device": 3812 }, { "epoch": 0.9012, "loss_ce": 0.047002553939819336, "loss_lvr": 0.8458282351493835, "loss_mode_switch": 0.0, "loss_total": 0.13158538937568665, "step": 2253 }, { "batch_size": 4, "epoch": 0.9012, "step": 2253, "tokens_per_device": 2624 }, { "epoch": 0.9012, "loss_ce": 0.16454502940177917, "loss_lvr": 0.8218223452568054, "loss_mode_switch": 0.0, "loss_total": 0.24672725796699524, "step": 2253 }, { "batch_size": 4, "epoch": 0.9012, "step": 2253, "tokens_per_device": 1716 }, { "epoch": 0.9012, "loss_ce": 0.5929957032203674, "loss_lvr": 1.115366816520691, "loss_mode_switch": 0.0, "loss_total": 0.7045323848724365, "step": 2253 }, { "epoch": 0.9016, "grad_norm": 1.295538306236267, "learning_rate": 2.517720905544102e-07, "loss": 0.2631, "step": 2254 }, { "batch_size": 1, "epoch": 0.9016, "step": 2254, "tokens_per_device": 4811 }, { "epoch": 0.9016, "loss_ce": 0.047175273299217224, "loss_lvr": 0.4436303973197937, "loss_mode_switch": 0.0, "loss_total": 0.09153831005096436, "step": 2254 }, { "batch_size": 4, "epoch": 0.9016, "step": 2254, "tokens_per_device": 4096 }, { "epoch": 0.9016, "loss_ce": 0.05343857780098915, "loss_lvr": 2.511343240737915, "loss_mode_switch": 0.0, "loss_total": 0.3045729100704193, "step": 2254 }, { "batch_size": 1, "epoch": 0.9016, "step": 2254, "tokens_per_device": 6458 }, { "epoch": 0.9016, "loss_ce": 0.08330176770687103, "loss_lvr": 0.29863622784614563, "loss_mode_switch": 0.0, "loss_total": 0.11316539347171783, "step": 2254 }, { "batch_size": 4, "epoch": 0.9016, "step": 2254, "tokens_per_device": 3224 }, { "epoch": 0.9016, "loss_ce": 0.28918367624282837, "loss_lvr": 0.8746721744537354, "loss_mode_switch": 0.0, "loss_total": 0.3766508996486664, "step": 2254 }, { "batch_size": 4, "epoch": 0.9016, "step": 2254, "tokens_per_device": 4576 }, { "epoch": 0.9016, "loss_ce": 0.025515759363770485, "loss_lvr": 1.1820958852767944, "loss_mode_switch": 0.0, "loss_total": 0.14372535049915314, "step": 2254 }, { "batch_size": 1, "epoch": 0.9016, "step": 2254, "tokens_per_device": 4806 }, { "epoch": 0.9016, "loss_ce": 0.028215747326612473, "loss_lvr": 0.36155229806900024, "loss_mode_switch": 0.0, "loss_total": 0.06437097489833832, "step": 2254 }, { "batch_size": 4, "epoch": 0.9016, "step": 2254, "tokens_per_device": 4296 }, { "epoch": 0.9016, "loss_ce": 0.24320977926254272, "loss_lvr": 1.3379839658737183, "loss_mode_switch": 0.0, "loss_total": 0.37700819969177246, "step": 2254 }, { "batch_size": 4, "epoch": 0.9016, "step": 2254, "tokens_per_device": 4072 }, { "epoch": 0.9016, "loss_ce": 0.005154543090611696, "loss_lvr": 1.0355339050292969, "loss_mode_switch": 0.0, "loss_total": 0.10870793461799622, "step": 2254 }, { "epoch": 0.902, "grad_norm": 1.3867028951644897, "learning_rate": 2.497465022694207e-07, "loss": 0.2892, "step": 2255 }, { "batch_size": 4, "epoch": 0.902, "step": 2255, "tokens_per_device": 4796 }, { "epoch": 0.902, "loss_ce": 0.005266615655273199, "loss_lvr": 0.8038890361785889, "loss_mode_switch": 0.0, "loss_total": 0.08565551787614822, "step": 2255 }, { "batch_size": 1, "epoch": 0.902, "step": 2255, "tokens_per_device": 5129 }, { "epoch": 0.902, "loss_ce": 0.009167923592031002, "loss_lvr": 0.8464305400848389, "loss_mode_switch": 0.0, "loss_total": 0.09381098300218582, "step": 2255 }, { "batch_size": 1, "epoch": 0.902, "step": 2255, "tokens_per_device": 5089 }, { "epoch": 0.902, "loss_ce": 0.0065774437971413136, "loss_lvr": 0.29124128818511963, "loss_mode_switch": 0.0, "loss_total": 0.03570157289505005, "step": 2255 }, { "batch_size": 4, "epoch": 0.902, "step": 2255, "tokens_per_device": 4796 }, { "epoch": 0.902, "loss_ce": 0.014237797819077969, "loss_lvr": 0.9400026202201843, "loss_mode_switch": 0.0, "loss_total": 0.10823806375265121, "step": 2255 }, { "batch_size": 1, "epoch": 0.902, "step": 2255, "tokens_per_device": 5119 }, { "epoch": 0.902, "loss_ce": 0.010970433242619038, "loss_lvr": 0.32684534788131714, "loss_mode_switch": 0.0, "loss_total": 0.04365496709942818, "step": 2255 }, { "batch_size": 1, "epoch": 0.902, "step": 2255, "tokens_per_device": 5035 }, { "epoch": 0.902, "loss_ce": 0.011447269469499588, "loss_lvr": 0.5583464503288269, "loss_mode_switch": 0.0, "loss_total": 0.06728191673755646, "step": 2255 }, { "batch_size": 4, "epoch": 0.902, "step": 2255, "tokens_per_device": 4284 }, { "epoch": 0.902, "loss_ce": 0.3289230167865753, "loss_lvr": 0.9012993574142456, "loss_mode_switch": 0.0, "loss_total": 0.41905295848846436, "step": 2255 }, { "batch_size": 4, "epoch": 0.902, "step": 2255, "tokens_per_device": 4880 }, { "epoch": 0.902, "loss_ce": 0.04331699758768082, "loss_lvr": 1.0878132581710815, "loss_mode_switch": 0.0, "loss_total": 0.15209832787513733, "step": 2255 }, { "epoch": 0.9024, "grad_norm": 1.1543831825256348, "learning_rate": 2.477288864561106e-07, "loss": 0.237, "step": 2256 }, { "batch_size": 1, "epoch": 0.9024, "step": 2256, "tokens_per_device": 5111 }, { "epoch": 0.9024, "loss_ce": 0.01424498949199915, "loss_lvr": 0.5547311305999756, "loss_mode_switch": 0.0, "loss_total": 0.06971810758113861, "step": 2256 }, { "batch_size": 4, "epoch": 0.9024, "step": 2256, "tokens_per_device": 2688 }, { "epoch": 0.9024, "loss_ce": 0.8408423066139221, "loss_lvr": 0.6898627877235413, "loss_mode_switch": 0.0, "loss_total": 0.9098286032676697, "step": 2256 }, { "batch_size": 4, "epoch": 0.9024, "step": 2256, "tokens_per_device": 11192 }, { "epoch": 0.9024, "loss_ce": 0.19700638949871063, "loss_lvr": 0.6270946264266968, "loss_mode_switch": 0.0, "loss_total": 0.25971585512161255, "step": 2256 }, { "batch_size": 4, "epoch": 0.9024, "step": 2256, "tokens_per_device": 4276 }, { "epoch": 0.9024, "loss_ce": 0.4450005292892456, "loss_lvr": 0.9303010702133179, "loss_mode_switch": 0.0, "loss_total": 0.5380306243896484, "step": 2256 }, { "batch_size": 4, "epoch": 0.9024, "step": 2256, "tokens_per_device": 3676 }, { "epoch": 0.9024, "loss_ce": 0.013410467654466629, "loss_lvr": 0.738212525844574, "loss_mode_switch": 0.0, "loss_total": 0.08723172545433044, "step": 2256 }, { "batch_size": 4, "epoch": 0.9024, "step": 2256, "tokens_per_device": 1424 }, { "epoch": 0.9024, "loss_ce": 0.5997775793075562, "loss_lvr": 0.9100171327590942, "loss_mode_switch": 0.0, "loss_total": 0.6907792687416077, "step": 2256 }, { "batch_size": 1, "epoch": 0.9024, "step": 2256, "tokens_per_device": 5013 }, { "epoch": 0.9024, "loss_ce": 0.18145252764225006, "loss_lvr": 0.25736120343208313, "loss_mode_switch": 0.0, "loss_total": 0.2071886509656906, "step": 2256 }, { "batch_size": 1, "epoch": 0.9024, "step": 2256, "tokens_per_device": 4818 }, { "epoch": 0.9024, "loss_ce": 0.00029759612516500056, "loss_lvr": 0.32153239846229553, "loss_mode_switch": 0.0, "loss_total": 0.0324508361518383, "step": 2256 }, { "epoch": 0.9028, "grad_norm": 1.380940556526184, "learning_rate": 2.4571924650069634e-07, "loss": 0.3087, "step": 2257 }, { "batch_size": 4, "epoch": 0.9028, "step": 2257, "tokens_per_device": 1508 }, { "epoch": 0.9028, "loss_ce": 0.3205660879611969, "loss_lvr": 1.8006591796875, "loss_mode_switch": 0.0, "loss_total": 0.5006319880485535, "step": 2257 }, { "batch_size": 4, "epoch": 0.9028, "step": 2257, "tokens_per_device": 4764 }, { "epoch": 0.9028, "loss_ce": 0.12565858662128448, "loss_lvr": 0.6647214889526367, "loss_mode_switch": 0.0, "loss_total": 0.19213074445724487, "step": 2257 }, { "batch_size": 4, "epoch": 0.9028, "step": 2257, "tokens_per_device": 5156 }, { "epoch": 0.9028, "loss_ce": 0.2998703420162201, "loss_lvr": 0.8414605259895325, "loss_mode_switch": 0.0, "loss_total": 0.38401639461517334, "step": 2257 }, { "batch_size": 4, "epoch": 0.9028, "step": 2257, "tokens_per_device": 4420 }, { "epoch": 0.9028, "loss_ce": 0.27709075808525085, "loss_lvr": 0.8979141712188721, "loss_mode_switch": 0.0, "loss_total": 0.36688217520713806, "step": 2257 }, { "batch_size": 4, "epoch": 0.9028, "step": 2257, "tokens_per_device": 8596 }, { "epoch": 0.9028, "loss_ce": 0.012296350672841072, "loss_lvr": 1.0019344091415405, "loss_mode_switch": 0.0, "loss_total": 0.11248978972434998, "step": 2257 }, { "batch_size": 1, "epoch": 0.9028, "step": 2257, "tokens_per_device": 4346 }, { "epoch": 0.9028, "loss_ce": 0.008735359646379948, "loss_lvr": 0.10435935854911804, "loss_mode_switch": 0.0, "loss_total": 0.019171295687556267, "step": 2257 }, { "batch_size": 1, "epoch": 0.9028, "step": 2257, "tokens_per_device": 4899 }, { "epoch": 0.9028, "loss_ce": 0.01603427343070507, "loss_lvr": 0.336648166179657, "loss_mode_switch": 0.0, "loss_total": 0.0496990904211998, "step": 2257 }, { "batch_size": 1, "epoch": 0.9028, "step": 2257, "tokens_per_device": 5179 }, { "epoch": 0.9028, "loss_ce": 0.024487148970365524, "loss_lvr": 0.6164202094078064, "loss_mode_switch": 0.0, "loss_total": 0.08612917363643646, "step": 2257 }, { "epoch": 0.9032, "grad_norm": 1.2819128036499023, "learning_rate": 2.437175857760077e-07, "loss": 0.2531, "step": 2258 }, { "batch_size": 1, "epoch": 0.9032, "step": 2258, "tokens_per_device": 4959 }, { "epoch": 0.9032, "loss_ce": 0.03254071995615959, "loss_lvr": 0.315947026014328, "loss_mode_switch": 0.0, "loss_total": 0.06413542479276657, "step": 2258 }, { "batch_size": 1, "epoch": 0.9032, "step": 2258, "tokens_per_device": 5101 }, { "epoch": 0.9032, "loss_ce": 0.002487556543201208, "loss_lvr": 0.41349494457244873, "loss_mode_switch": 0.0, "loss_total": 0.0438370518386364, "step": 2258 }, { "batch_size": 4, "epoch": 0.9032, "step": 2258, "tokens_per_device": 5024 }, { "epoch": 0.9032, "loss_ce": 0.3955989480018616, "loss_lvr": 0.8254601955413818, "loss_mode_switch": 0.0, "loss_total": 0.47814497351646423, "step": 2258 }, { "batch_size": 4, "epoch": 0.9032, "step": 2258, "tokens_per_device": 5516 }, { "epoch": 0.9032, "loss_ce": 0.17913417518138885, "loss_lvr": 0.6294592618942261, "loss_mode_switch": 0.0, "loss_total": 0.24208009243011475, "step": 2258 }, { "batch_size": 1, "epoch": 0.9032, "step": 2258, "tokens_per_device": 4898 }, { "epoch": 0.9032, "loss_ce": 0.017047127708792686, "loss_lvr": 0.11785457283258438, "loss_mode_switch": 0.0, "loss_total": 0.028832584619522095, "step": 2258 }, { "batch_size": 4, "epoch": 0.9032, "step": 2258, "tokens_per_device": 6972 }, { "epoch": 0.9032, "loss_ce": 0.1733493059873581, "loss_lvr": 0.7213166356086731, "loss_mode_switch": 0.0, "loss_total": 0.2454809695482254, "step": 2258 }, { "batch_size": 4, "epoch": 0.9032, "step": 2258, "tokens_per_device": 4152 }, { "epoch": 0.9032, "loss_ce": 0.4577042758464813, "loss_lvr": 0.7903834581375122, "loss_mode_switch": 0.0, "loss_total": 0.536742627620697, "step": 2258 }, { "batch_size": 4, "epoch": 0.9032, "step": 2258, "tokens_per_device": 2988 }, { "epoch": 0.9032, "loss_ce": 0.1541510671377182, "loss_lvr": 0.6837493777275085, "loss_mode_switch": 0.0, "loss_total": 0.22252601385116577, "step": 2258 }, { "epoch": 0.9036, "grad_norm": 1.4489775896072388, "learning_rate": 2.417239076414829e-07, "loss": 0.2589, "step": 2259 }, { "batch_size": 1, "epoch": 0.9036, "step": 2259, "tokens_per_device": 4951 }, { "epoch": 0.9036, "loss_ce": 0.008511748164892197, "loss_lvr": 1.16060471534729, "loss_mode_switch": 0.0, "loss_total": 0.12457221746444702, "step": 2259 }, { "batch_size": 4, "epoch": 0.9036, "step": 2259, "tokens_per_device": 4856 }, { "epoch": 0.9036, "loss_ce": 0.3981829881668091, "loss_lvr": 0.7746620178222656, "loss_mode_switch": 0.0, "loss_total": 0.4756491780281067, "step": 2259 }, { "batch_size": 1, "epoch": 0.9036, "step": 2259, "tokens_per_device": 4873 }, { "epoch": 0.9036, "loss_ce": 0.00027398468228057027, "loss_lvr": 0.182843878865242, "loss_mode_switch": 0.0, "loss_total": 0.01855837181210518, "step": 2259 }, { "batch_size": 4, "epoch": 0.9036, "step": 2259, "tokens_per_device": 2688 }, { "epoch": 0.9036, "loss_ce": 0.44571980834007263, "loss_lvr": 0.8453030586242676, "loss_mode_switch": 0.0, "loss_total": 0.5302501320838928, "step": 2259 }, { "batch_size": 4, "epoch": 0.9036, "step": 2259, "tokens_per_device": 4172 }, { "epoch": 0.9036, "loss_ce": 0.04536737501621246, "loss_lvr": 0.7301883101463318, "loss_mode_switch": 0.0, "loss_total": 0.11838620901107788, "step": 2259 }, { "batch_size": 4, "epoch": 0.9036, "step": 2259, "tokens_per_device": 4800 }, { "epoch": 0.9036, "loss_ce": 0.24633896350860596, "loss_lvr": 0.9331904053688049, "loss_mode_switch": 0.0, "loss_total": 0.3396580219268799, "step": 2259 }, { "batch_size": 1, "epoch": 0.9036, "step": 2259, "tokens_per_device": 5175 }, { "epoch": 0.9036, "loss_ce": 0.1101943850517273, "loss_lvr": 0.49382704496383667, "loss_mode_switch": 0.0, "loss_total": 0.15957708656787872, "step": 2259 }, { "batch_size": 4, "epoch": 0.9036, "step": 2259, "tokens_per_device": 6232 }, { "epoch": 0.9036, "loss_ce": 0.005124866031110287, "loss_lvr": 0.9462924003601074, "loss_mode_switch": 0.0, "loss_total": 0.09975410997867584, "step": 2259 }, { "epoch": 0.904, "grad_norm": 1.26541268825531, "learning_rate": 2.397382154431621e-07, "loss": 0.2911, "step": 2260 }, { "batch_size": 1, "epoch": 0.904, "step": 2260, "tokens_per_device": 5163 }, { "epoch": 0.904, "loss_ce": 0.052529189735651016, "loss_lvr": 0.3861573040485382, "loss_mode_switch": 0.0, "loss_total": 0.09114491939544678, "step": 2260 }, { "batch_size": 4, "epoch": 0.904, "step": 2260, "tokens_per_device": 4604 }, { "epoch": 0.904, "loss_ce": 0.06003303825855255, "loss_lvr": 1.0840624570846558, "loss_mode_switch": 0.0, "loss_total": 0.16843928396701813, "step": 2260 }, { "batch_size": 1, "epoch": 0.904, "step": 2260, "tokens_per_device": 4904 }, { "epoch": 0.904, "loss_ce": 0.0010885328520089388, "loss_lvr": 0.6313480138778687, "loss_mode_switch": 0.0, "loss_total": 0.06422333419322968, "step": 2260 }, { "batch_size": 4, "epoch": 0.904, "step": 2260, "tokens_per_device": 1348 }, { "epoch": 0.904, "loss_ce": 0.5602046847343445, "loss_lvr": 1.141364574432373, "loss_mode_switch": 0.0, "loss_total": 0.6743411421775818, "step": 2260 }, { "batch_size": 4, "epoch": 0.904, "step": 2260, "tokens_per_device": 1676 }, { "epoch": 0.904, "loss_ce": 0.6316086649894714, "loss_lvr": 1.0293583869934082, "loss_mode_switch": 0.0, "loss_total": 0.7345445156097412, "step": 2260 }, { "batch_size": 4, "epoch": 0.904, "step": 2260, "tokens_per_device": 1656 }, { "epoch": 0.904, "loss_ce": 0.4808160364627838, "loss_lvr": 0.763189435005188, "loss_mode_switch": 0.0, "loss_total": 0.5571349859237671, "step": 2260 }, { "batch_size": 1, "epoch": 0.904, "step": 2260, "tokens_per_device": 4891 }, { "epoch": 0.904, "loss_ce": 0.1722344011068344, "loss_lvr": 0.48034167289733887, "loss_mode_switch": 0.0, "loss_total": 0.22026857733726501, "step": 2260 }, { "batch_size": 1, "epoch": 0.904, "step": 2260, "tokens_per_device": 5968 }, { "epoch": 0.904, "loss_ce": 0.24059540033340454, "loss_lvr": 0.28059008717536926, "loss_mode_switch": 0.0, "loss_total": 0.26865440607070923, "step": 2260 }, { "epoch": 0.9044, "grad_norm": 1.3759108781814575, "learning_rate": 2.3776051251368505e-07, "loss": 0.272, "step": 2261 }, { "batch_size": 1, "epoch": 0.9044, "step": 2261, "tokens_per_device": 4951 }, { "epoch": 0.9044, "loss_ce": 0.01353148277848959, "loss_lvr": 0.34683525562286377, "loss_mode_switch": 0.0, "loss_total": 0.04821500927209854, "step": 2261 }, { "batch_size": 4, "epoch": 0.9044, "step": 2261, "tokens_per_device": 4236 }, { "epoch": 0.9044, "loss_ce": 0.0013365022605285048, "loss_lvr": 1.2718156576156616, "loss_mode_switch": 0.0, "loss_total": 0.12851807475090027, "step": 2261 }, { "batch_size": 4, "epoch": 0.9044, "step": 2261, "tokens_per_device": 4536 }, { "epoch": 0.9044, "loss_ce": 0.08675537258386612, "loss_lvr": 0.8030859231948853, "loss_mode_switch": 0.0, "loss_total": 0.16706396639347076, "step": 2261 }, { "batch_size": 4, "epoch": 0.9044, "step": 2261, "tokens_per_device": 2608 }, { "epoch": 0.9044, "loss_ce": 0.10081706941127777, "loss_lvr": 0.8654406666755676, "loss_mode_switch": 0.0, "loss_total": 0.18736113607883453, "step": 2261 }, { "batch_size": 4, "epoch": 0.9044, "step": 2261, "tokens_per_device": 4260 }, { "epoch": 0.9044, "loss_ce": 0.4536382257938385, "loss_lvr": 0.7178611755371094, "loss_mode_switch": 0.0, "loss_total": 0.5254243612289429, "step": 2261 }, { "batch_size": 1, "epoch": 0.9044, "step": 2261, "tokens_per_device": 4953 }, { "epoch": 0.9044, "loss_ce": 1.1985565423965454, "loss_lvr": 0.7966359853744507, "loss_mode_switch": 0.0, "loss_total": 1.2782201766967773, "step": 2261 }, { "batch_size": 4, "epoch": 0.9044, "step": 2261, "tokens_per_device": 4868 }, { "epoch": 0.9044, "loss_ce": 0.09132704138755798, "loss_lvr": 1.0508016347885132, "loss_mode_switch": 0.0, "loss_total": 0.19640719890594482, "step": 2261 }, { "batch_size": 1, "epoch": 0.9044, "step": 2261, "tokens_per_device": 5104 }, { "epoch": 0.9044, "loss_ce": 0.0635886937379837, "loss_lvr": 0.22655731439590454, "loss_mode_switch": 0.0, "loss_total": 0.08624442666769028, "step": 2261 }, { "epoch": 0.9048, "grad_norm": 1.256285548210144, "learning_rate": 2.3579080217228046e-07, "loss": 0.2703, "step": 2262 }, { "batch_size": 4, "epoch": 0.9048, "step": 2262, "tokens_per_device": 4196 }, { "epoch": 0.9048, "loss_ce": 0.024738505482673645, "loss_lvr": 0.9321679472923279, "loss_mode_switch": 0.0, "loss_total": 0.11795530468225479, "step": 2262 }, { "batch_size": 4, "epoch": 0.9048, "step": 2262, "tokens_per_device": 5904 }, { "epoch": 0.9048, "loss_ce": 0.2181558907032013, "loss_lvr": 0.7780202627182007, "loss_mode_switch": 0.0, "loss_total": 0.29595792293548584, "step": 2262 }, { "batch_size": 4, "epoch": 0.9048, "step": 2262, "tokens_per_device": 2644 }, { "epoch": 0.9048, "loss_ce": 0.3863639235496521, "loss_lvr": 0.8025176525115967, "loss_mode_switch": 0.0, "loss_total": 0.4666156768798828, "step": 2262 }, { "batch_size": 4, "epoch": 0.9048, "step": 2262, "tokens_per_device": 8168 }, { "epoch": 0.9048, "loss_ce": 0.3622424304485321, "loss_lvr": 0.8433207869529724, "loss_mode_switch": 0.0, "loss_total": 0.44657450914382935, "step": 2262 }, { "batch_size": 4, "epoch": 0.9048, "step": 2262, "tokens_per_device": 5448 }, { "epoch": 0.9048, "loss_ce": 0.23506808280944824, "loss_lvr": 0.6912415027618408, "loss_mode_switch": 0.0, "loss_total": 0.3041922450065613, "step": 2262 }, { "batch_size": 4, "epoch": 0.9048, "step": 2262, "tokens_per_device": 6036 }, { "epoch": 0.9048, "loss_ce": 0.21354486048221588, "loss_lvr": 0.7446373701095581, "loss_mode_switch": 0.0, "loss_total": 0.28800860047340393, "step": 2262 }, { "batch_size": 4, "epoch": 0.9048, "step": 2262, "tokens_per_device": 5092 }, { "epoch": 0.9048, "loss_ce": 0.29006943106651306, "loss_lvr": 0.6497409343719482, "loss_mode_switch": 0.0, "loss_total": 0.35504353046417236, "step": 2262 }, { "batch_size": 4, "epoch": 0.9048, "step": 2262, "tokens_per_device": 2208 }, { "epoch": 0.9048, "loss_ce": 0.30355334281921387, "loss_lvr": 1.4196542501449585, "loss_mode_switch": 0.0, "loss_total": 0.4455187916755676, "step": 2262 }, { "epoch": 0.9052, "grad_norm": 1.2452257871627808, "learning_rate": 2.3382908772476175e-07, "loss": 0.2598, "step": 2263 }, { "batch_size": 4, "epoch": 0.9052, "step": 2263, "tokens_per_device": 12972 }, { "epoch": 0.9052, "loss_ce": 0.3223460614681244, "loss_lvr": 0.8486797213554382, "loss_mode_switch": 0.0, "loss_total": 0.40721404552459717, "step": 2263 }, { "batch_size": 4, "epoch": 0.9052, "step": 2263, "tokens_per_device": 3856 }, { "epoch": 0.9052, "loss_ce": 0.03935515135526657, "loss_lvr": 0.5527695417404175, "loss_mode_switch": 0.0, "loss_total": 0.0946321040391922, "step": 2263 }, { "batch_size": 4, "epoch": 0.9052, "step": 2263, "tokens_per_device": 4300 }, { "epoch": 0.9052, "loss_ce": 0.05991343781352043, "loss_lvr": 0.8924232125282288, "loss_mode_switch": 0.0, "loss_total": 0.14915576577186584, "step": 2263 }, { "batch_size": 4, "epoch": 0.9052, "step": 2263, "tokens_per_device": 3752 }, { "epoch": 0.9052, "loss_ce": 0.07184481620788574, "loss_lvr": 1.1308237314224243, "loss_mode_switch": 0.0, "loss_total": 0.18492719531059265, "step": 2263 }, { "batch_size": 1, "epoch": 0.9052, "step": 2263, "tokens_per_device": 4933 }, { "epoch": 0.9052, "loss_ce": 0.015461409464478493, "loss_lvr": 0.3173736333847046, "loss_mode_switch": 0.0, "loss_total": 0.04719877243041992, "step": 2263 }, { "batch_size": 4, "epoch": 0.9052, "step": 2263, "tokens_per_device": 2612 }, { "epoch": 0.9052, "loss_ce": 0.05578501150012016, "loss_lvr": 0.8496581315994263, "loss_mode_switch": 0.0, "loss_total": 0.14075082540512085, "step": 2263 }, { "batch_size": 4, "epoch": 0.9052, "step": 2263, "tokens_per_device": 1620 }, { "epoch": 0.9052, "loss_ce": 0.38415655493736267, "loss_lvr": 0.8940066695213318, "loss_mode_switch": 0.0, "loss_total": 0.4735572338104248, "step": 2263 }, { "batch_size": 1, "epoch": 0.9052, "step": 2263, "tokens_per_device": 4854 }, { "epoch": 0.9052, "loss_ce": 0.1697216033935547, "loss_lvr": 0.2660832703113556, "loss_mode_switch": 0.0, "loss_total": 0.19632993638515472, "step": 2263 }, { "epoch": 0.9056, "grad_norm": 1.3759970664978027, "learning_rate": 2.3187537246352587e-07, "loss": 0.2833, "step": 2264 }, { "batch_size": 4, "epoch": 0.9056, "step": 2264, "tokens_per_device": 4044 }, { "epoch": 0.9056, "loss_ce": 0.47198909521102905, "loss_lvr": 0.867135763168335, "loss_mode_switch": 0.0, "loss_total": 0.5587026476860046, "step": 2264 }, { "batch_size": 4, "epoch": 0.9056, "step": 2264, "tokens_per_device": 3800 }, { "epoch": 0.9056, "loss_ce": 0.330205500125885, "loss_lvr": 1.0080902576446533, "loss_mode_switch": 0.0, "loss_total": 0.4310145378112793, "step": 2264 }, { "batch_size": 1, "epoch": 0.9056, "step": 2264, "tokens_per_device": 5143 }, { "epoch": 0.9056, "loss_ce": 0.165060892701149, "loss_lvr": 0.3588411509990692, "loss_mode_switch": 0.0, "loss_total": 0.20094500482082367, "step": 2264 }, { "batch_size": 1, "epoch": 0.9056, "step": 2264, "tokens_per_device": 4907 }, { "epoch": 0.9056, "loss_ce": 0.01554043311625719, "loss_lvr": 0.16806165874004364, "loss_mode_switch": 0.0, "loss_total": 0.03234659880399704, "step": 2264 }, { "batch_size": 1, "epoch": 0.9056, "step": 2264, "tokens_per_device": 4899 }, { "epoch": 0.9056, "loss_ce": 1.0070061683654785, "loss_lvr": 0.7677038908004761, "loss_mode_switch": 0.0, "loss_total": 1.083776593208313, "step": 2264 }, { "batch_size": 1, "epoch": 0.9056, "step": 2264, "tokens_per_device": 4888 }, { "epoch": 0.9056, "loss_ce": 0.004453458823263645, "loss_lvr": 0.24108028411865234, "loss_mode_switch": 0.0, "loss_total": 0.028561487793922424, "step": 2264 }, { "batch_size": 1, "epoch": 0.9056, "step": 2264, "tokens_per_device": 5128 }, { "epoch": 0.9056, "loss_ce": 0.028613127768039703, "loss_lvr": 0.3515337407588959, "loss_mode_switch": 0.0, "loss_total": 0.06376650184392929, "step": 2264 }, { "batch_size": 4, "epoch": 0.9056, "step": 2264, "tokens_per_device": 7416 }, { "epoch": 0.9056, "loss_ce": 0.2473381906747818, "loss_lvr": 0.7811349034309387, "loss_mode_switch": 0.0, "loss_total": 0.32545167207717896, "step": 2264 }, { "epoch": 0.906, "grad_norm": 1.3373851776123047, "learning_rate": 2.2992965966754378e-07, "loss": 0.2553, "step": 2265 }, { "batch_size": 4, "epoch": 0.906, "step": 2265, "tokens_per_device": 4028 }, { "epoch": 0.906, "loss_ce": 0.057003285735845566, "loss_lvr": 1.0363062620162964, "loss_mode_switch": 0.0, "loss_total": 0.1606339067220688, "step": 2265 }, { "batch_size": 1, "epoch": 0.906, "step": 2265, "tokens_per_device": 5080 }, { "epoch": 0.906, "loss_ce": 0.0017786723328754306, "loss_lvr": 0.49011150002479553, "loss_mode_switch": 0.0, "loss_total": 0.05078982561826706, "step": 2265 }, { "batch_size": 1, "epoch": 0.906, "step": 2265, "tokens_per_device": 4935 }, { "epoch": 0.906, "loss_ce": 0.030224939808249474, "loss_lvr": 0.4281238615512848, "loss_mode_switch": 0.0, "loss_total": 0.07303732633590698, "step": 2265 }, { "batch_size": 4, "epoch": 0.906, "step": 2265, "tokens_per_device": 5948 }, { "epoch": 0.906, "loss_ce": 0.06890692561864853, "loss_lvr": 0.7500656247138977, "loss_mode_switch": 0.0, "loss_total": 0.14391349256038666, "step": 2265 }, { "batch_size": 4, "epoch": 0.906, "step": 2265, "tokens_per_device": 4348 }, { "epoch": 0.906, "loss_ce": 0.4381311535835266, "loss_lvr": 0.6464606523513794, "loss_mode_switch": 0.0, "loss_total": 0.5027772188186646, "step": 2265 }, { "batch_size": 4, "epoch": 0.906, "step": 2265, "tokens_per_device": 4280 }, { "epoch": 0.906, "loss_ce": 0.013120114803314209, "loss_lvr": 0.9665954113006592, "loss_mode_switch": 0.0, "loss_total": 0.10977965593338013, "step": 2265 }, { "batch_size": 4, "epoch": 0.906, "step": 2265, "tokens_per_device": 4196 }, { "epoch": 0.906, "loss_ce": 0.0025427399668842554, "loss_lvr": 0.8086029291152954, "loss_mode_switch": 0.0, "loss_total": 0.08340303599834442, "step": 2265 }, { "batch_size": 4, "epoch": 0.906, "step": 2265, "tokens_per_device": 2752 }, { "epoch": 0.906, "loss_ce": 0.08205915242433548, "loss_lvr": 0.6645581722259521, "loss_mode_switch": 0.0, "loss_total": 0.14851497113704681, "step": 2265 }, { "epoch": 0.9064, "grad_norm": 1.1797289848327637, "learning_rate": 2.279919526023533e-07, "loss": 0.1958, "step": 2266 }, { "batch_size": 1, "epoch": 0.9064, "step": 2266, "tokens_per_device": 5159 }, { "epoch": 0.9064, "loss_ce": 0.002740438561886549, "loss_lvr": 0.30046549439430237, "loss_mode_switch": 0.0, "loss_total": 0.03278698772192001, "step": 2266 }, { "batch_size": 4, "epoch": 0.9064, "step": 2266, "tokens_per_device": 4212 }, { "epoch": 0.9064, "loss_ce": 0.08622115105390549, "loss_lvr": 1.508948802947998, "loss_mode_switch": 0.0, "loss_total": 0.2371160387992859, "step": 2266 }, { "batch_size": 1, "epoch": 0.9064, "step": 2266, "tokens_per_device": 4905 }, { "epoch": 0.9064, "loss_ce": 0.05032322183251381, "loss_lvr": 0.3155302107334137, "loss_mode_switch": 0.0, "loss_total": 0.0818762481212616, "step": 2266 }, { "batch_size": 1, "epoch": 0.9064, "step": 2266, "tokens_per_device": 4419 }, { "epoch": 0.9064, "loss_ce": 0.10722553730010986, "loss_lvr": 0.9803423285484314, "loss_mode_switch": 0.0, "loss_total": 0.205259770154953, "step": 2266 }, { "batch_size": 1, "epoch": 0.9064, "step": 2266, "tokens_per_device": 5106 }, { "epoch": 0.9064, "loss_ce": 0.0028129196725785732, "loss_lvr": 0.41336414217948914, "loss_mode_switch": 0.0, "loss_total": 0.044149331748485565, "step": 2266 }, { "batch_size": 1, "epoch": 0.9064, "step": 2266, "tokens_per_device": 4864 }, { "epoch": 0.9064, "loss_ce": 0.00634801248088479, "loss_lvr": 0.28244152665138245, "loss_mode_switch": 0.0, "loss_total": 0.0345921665430069, "step": 2266 }, { "batch_size": 1, "epoch": 0.9064, "step": 2266, "tokens_per_device": 5018 }, { "epoch": 0.9064, "loss_ce": 0.24405911564826965, "loss_lvr": 0.18309468030929565, "loss_mode_switch": 0.0, "loss_total": 0.2623685896396637, "step": 2266 }, { "batch_size": 4, "epoch": 0.9064, "step": 2266, "tokens_per_device": 4260 }, { "epoch": 0.9064, "loss_ce": 0.14313872158527374, "loss_lvr": 1.4068965911865234, "loss_mode_switch": 0.0, "loss_total": 0.28382837772369385, "step": 2266 }, { "epoch": 0.9068, "grad_norm": 1.316167950630188, "learning_rate": 2.260622545200586e-07, "loss": 0.2918, "step": 2267 }, { "batch_size": 4, "epoch": 0.9068, "step": 2267, "tokens_per_device": 12892 }, { "epoch": 0.9068, "loss_ce": 0.12955620884895325, "loss_lvr": 0.8086265325546265, "loss_mode_switch": 0.0, "loss_total": 0.21041886508464813, "step": 2267 }, { "batch_size": 1, "epoch": 0.9068, "step": 2267, "tokens_per_device": 5094 }, { "epoch": 0.9068, "loss_ce": 0.4080142378807068, "loss_lvr": 0.3060893416404724, "loss_mode_switch": 0.0, "loss_total": 0.4386231601238251, "step": 2267 }, { "batch_size": 4, "epoch": 0.9068, "step": 2267, "tokens_per_device": 4192 }, { "epoch": 0.9068, "loss_ce": 0.21751321852207184, "loss_lvr": 0.7812720537185669, "loss_mode_switch": 0.0, "loss_total": 0.29564040899276733, "step": 2267 }, { "batch_size": 4, "epoch": 0.9068, "step": 2267, "tokens_per_device": 2656 }, { "epoch": 0.9068, "loss_ce": 0.6435993313789368, "loss_lvr": 0.8689656853675842, "loss_mode_switch": 0.0, "loss_total": 0.7304959297180176, "step": 2267 }, { "batch_size": 4, "epoch": 0.9068, "step": 2267, "tokens_per_device": 3900 }, { "epoch": 0.9068, "loss_ce": 0.05671583488583565, "loss_lvr": 0.8062653541564941, "loss_mode_switch": 0.0, "loss_total": 0.13734237849712372, "step": 2267 }, { "batch_size": 4, "epoch": 0.9068, "step": 2267, "tokens_per_device": 4344 }, { "epoch": 0.9068, "loss_ce": 0.01695624180138111, "loss_lvr": 0.6276593804359436, "loss_mode_switch": 0.0, "loss_total": 0.07972218096256256, "step": 2267 }, { "batch_size": 4, "epoch": 0.9068, "step": 2267, "tokens_per_device": 3144 }, { "epoch": 0.9068, "loss_ce": 0.022336389869451523, "loss_lvr": 1.0219578742980957, "loss_mode_switch": 0.0, "loss_total": 0.12453217804431915, "step": 2267 }, { "batch_size": 1, "epoch": 0.9068, "step": 2267, "tokens_per_device": 4868 }, { "epoch": 0.9068, "loss_ce": 0.07254000008106232, "loss_lvr": 0.2541665732860565, "loss_mode_switch": 0.0, "loss_total": 0.09795665740966797, "step": 2267 }, { "epoch": 0.9072, "grad_norm": 1.605843424797058, "learning_rate": 2.241405686593201e-07, "loss": 0.2807, "step": 2268 }, { "batch_size": 1, "epoch": 0.9072, "step": 2268, "tokens_per_device": 4880 }, { "epoch": 0.9072, "loss_ce": 0.010155231691896915, "loss_lvr": 0.3357570767402649, "loss_mode_switch": 0.0, "loss_total": 0.04373094066977501, "step": 2268 }, { "batch_size": 4, "epoch": 0.9072, "step": 2268, "tokens_per_device": 1368 }, { "epoch": 0.9072, "loss_ce": 0.4974406659603119, "loss_lvr": 0.9142275452613831, "loss_mode_switch": 0.0, "loss_total": 0.5888634324073792, "step": 2268 }, { "batch_size": 4, "epoch": 0.9072, "step": 2268, "tokens_per_device": 7740 }, { "epoch": 0.9072, "loss_ce": 0.20425564050674438, "loss_lvr": 0.6827130317687988, "loss_mode_switch": 0.0, "loss_total": 0.27252694964408875, "step": 2268 }, { "batch_size": 1, "epoch": 0.9072, "step": 2268, "tokens_per_device": 5034 }, { "epoch": 0.9072, "loss_ce": 0.2878780663013458, "loss_lvr": 0.25485530495643616, "loss_mode_switch": 0.0, "loss_total": 0.31336361169815063, "step": 2268 }, { "batch_size": 1, "epoch": 0.9072, "step": 2268, "tokens_per_device": 5090 }, { "epoch": 0.9072, "loss_ce": 0.001297804294154048, "loss_lvr": 0.37475237250328064, "loss_mode_switch": 0.0, "loss_total": 0.03877304494380951, "step": 2268 }, { "batch_size": 4, "epoch": 0.9072, "step": 2268, "tokens_per_device": 3792 }, { "epoch": 0.9072, "loss_ce": 0.8775855302810669, "loss_lvr": 0.8811222910881042, "loss_mode_switch": 0.0, "loss_total": 0.9656977653503418, "step": 2268 }, { "batch_size": 4, "epoch": 0.9072, "step": 2268, "tokens_per_device": 1248 }, { "epoch": 0.9072, "loss_ce": 0.34014612436294556, "loss_lvr": 1.394343376159668, "loss_mode_switch": 0.0, "loss_total": 0.47958046197891235, "step": 2268 }, { "batch_size": 1, "epoch": 0.9072, "step": 2268, "tokens_per_device": 4886 }, { "epoch": 0.9072, "loss_ce": 0.008568370714783669, "loss_lvr": 0.11830366402864456, "loss_mode_switch": 0.0, "loss_total": 0.020398736000061035, "step": 2268 }, { "epoch": 0.9076, "grad_norm": 1.3953901529312134, "learning_rate": 2.2222689824535294e-07, "loss": 0.2748, "step": 2269 }, { "batch_size": 4, "epoch": 0.9076, "step": 2269, "tokens_per_device": 4312 }, { "epoch": 0.9076, "loss_ce": 0.1908087283372879, "loss_lvr": 0.7813073992729187, "loss_mode_switch": 0.0, "loss_total": 0.26893946528434753, "step": 2269 }, { "batch_size": 4, "epoch": 0.9076, "step": 2269, "tokens_per_device": 1924 }, { "epoch": 0.9076, "loss_ce": 0.2763223946094513, "loss_lvr": 0.9998409152030945, "loss_mode_switch": 0.0, "loss_total": 0.3763064742088318, "step": 2269 }, { "batch_size": 4, "epoch": 0.9076, "step": 2269, "tokens_per_device": 6596 }, { "epoch": 0.9076, "loss_ce": 0.42207005620002747, "loss_lvr": 0.6319751143455505, "loss_mode_switch": 0.0, "loss_total": 0.4852675795555115, "step": 2269 }, { "batch_size": 1, "epoch": 0.9076, "step": 2269, "tokens_per_device": 4978 }, { "epoch": 0.9076, "loss_ce": 0.0987926572561264, "loss_lvr": 0.22932711243629456, "loss_mode_switch": 0.0, "loss_total": 0.12172536551952362, "step": 2269 }, { "batch_size": 1, "epoch": 0.9076, "step": 2269, "tokens_per_device": 4884 }, { "epoch": 0.9076, "loss_ce": 0.017729662358760834, "loss_lvr": 0.9821537137031555, "loss_mode_switch": 0.0, "loss_total": 0.11594503372907639, "step": 2269 }, { "batch_size": 4, "epoch": 0.9076, "step": 2269, "tokens_per_device": 1572 }, { "epoch": 0.9076, "loss_ce": 0.19428418576717377, "loss_lvr": 1.1021790504455566, "loss_mode_switch": 0.0, "loss_total": 0.30450209975242615, "step": 2269 }, { "batch_size": 4, "epoch": 0.9076, "step": 2269, "tokens_per_device": 4236 }, { "epoch": 0.9076, "loss_ce": 0.23540130257606506, "loss_lvr": 1.8099817037582397, "loss_mode_switch": 0.0, "loss_total": 0.4163994789123535, "step": 2269 }, { "batch_size": 1, "epoch": 0.9076, "step": 2269, "tokens_per_device": 5182 }, { "epoch": 0.9076, "loss_ce": 0.2002933919429779, "loss_lvr": 0.38298213481903076, "loss_mode_switch": 0.0, "loss_total": 0.23859161138534546, "step": 2269 }, { "epoch": 0.908, "grad_norm": 1.3621537685394287, "learning_rate": 2.2032124648992015e-07, "loss": 0.3133, "step": 2270 }, { "batch_size": 4, "epoch": 0.908, "step": 2270, "tokens_per_device": 2600 }, { "epoch": 0.908, "loss_ce": 0.5518038868904114, "loss_lvr": 0.8327975869178772, "loss_mode_switch": 0.0, "loss_total": 0.6350836753845215, "step": 2270 }, { "batch_size": 1, "epoch": 0.908, "step": 2270, "tokens_per_device": 4860 }, { "epoch": 0.908, "loss_ce": 0.4376574456691742, "loss_lvr": 0.37818607687950134, "loss_mode_switch": 0.0, "loss_total": 0.47547605633735657, "step": 2270 }, { "batch_size": 1, "epoch": 0.908, "step": 2270, "tokens_per_device": 4362 }, { "epoch": 0.908, "loss_ce": 0.02160503901541233, "loss_lvr": 0.5858477354049683, "loss_mode_switch": 0.0, "loss_total": 0.08018981665372849, "step": 2270 }, { "batch_size": 1, "epoch": 0.908, "step": 2270, "tokens_per_device": 4463 }, { "epoch": 0.908, "loss_ce": 0.0009798656683415174, "loss_lvr": 0.5018951892852783, "loss_mode_switch": 0.0, "loss_total": 0.05116938799619675, "step": 2270 }, { "batch_size": 4, "epoch": 0.908, "step": 2270, "tokens_per_device": 2976 }, { "epoch": 0.908, "loss_ce": 0.22222474217414856, "loss_lvr": 0.6455485820770264, "loss_mode_switch": 0.0, "loss_total": 0.28677961230278015, "step": 2270 }, { "batch_size": 4, "epoch": 0.908, "step": 2270, "tokens_per_device": 5864 }, { "epoch": 0.908, "loss_ce": 0.1113966554403305, "loss_lvr": 0.8215839862823486, "loss_mode_switch": 0.0, "loss_total": 0.1935550570487976, "step": 2270 }, { "batch_size": 4, "epoch": 0.908, "step": 2270, "tokens_per_device": 2584 }, { "epoch": 0.908, "loss_ce": 0.27275946736335754, "loss_lvr": 1.0576088428497314, "loss_mode_switch": 0.0, "loss_total": 0.3785203695297241, "step": 2270 }, { "batch_size": 4, "epoch": 0.908, "step": 2270, "tokens_per_device": 3832 }, { "epoch": 0.908, "loss_ce": 0.2560841143131256, "loss_lvr": 0.8714982867240906, "loss_mode_switch": 0.0, "loss_total": 0.34323394298553467, "step": 2270 }, { "epoch": 0.9084, "grad_norm": 1.3094251155853271, "learning_rate": 2.1842361659132395e-07, "loss": 0.286, "step": 2271 }, { "batch_size": 1, "epoch": 0.9084, "step": 2271, "tokens_per_device": 5375 }, { "epoch": 0.9084, "loss_ce": 0.12651148438453674, "loss_lvr": 0.31628870964050293, "loss_mode_switch": 0.0, "loss_total": 0.1581403613090515, "step": 2271 }, { "batch_size": 4, "epoch": 0.9084, "step": 2271, "tokens_per_device": 1376 }, { "epoch": 0.9084, "loss_ce": 0.19259370863437653, "loss_lvr": 1.6014455556869507, "loss_mode_switch": 0.0, "loss_total": 0.35273826122283936, "step": 2271 }, { "batch_size": 1, "epoch": 0.9084, "step": 2271, "tokens_per_device": 4880 }, { "epoch": 0.9084, "loss_ce": 0.0033635450527071953, "loss_lvr": 0.8065552115440369, "loss_mode_switch": 0.0, "loss_total": 0.08401906490325928, "step": 2271 }, { "batch_size": 4, "epoch": 0.9084, "step": 2271, "tokens_per_device": 4320 }, { "epoch": 0.9084, "loss_ce": 0.21093448996543884, "loss_lvr": 0.8422940373420715, "loss_mode_switch": 0.0, "loss_total": 0.2951638996601105, "step": 2271 }, { "batch_size": 1, "epoch": 0.9084, "step": 2271, "tokens_per_device": 5038 }, { "epoch": 0.9084, "loss_ce": 0.17083807289600372, "loss_lvr": 0.1976245641708374, "loss_mode_switch": 0.0, "loss_total": 0.19060052931308746, "step": 2271 }, { "batch_size": 4, "epoch": 0.9084, "step": 2271, "tokens_per_device": 2808 }, { "epoch": 0.9084, "loss_ce": 0.2735845148563385, "loss_lvr": 0.7725995779037476, "loss_mode_switch": 0.0, "loss_total": 0.35084447264671326, "step": 2271 }, { "batch_size": 4, "epoch": 0.9084, "step": 2271, "tokens_per_device": 4192 }, { "epoch": 0.9084, "loss_ce": 0.4857107698917389, "loss_lvr": 0.9145182967185974, "loss_mode_switch": 0.0, "loss_total": 0.5771626234054565, "step": 2271 }, { "batch_size": 4, "epoch": 0.9084, "step": 2271, "tokens_per_device": 6884 }, { "epoch": 0.9084, "loss_ce": 0.03401802480220795, "loss_lvr": 0.8049057126045227, "loss_mode_switch": 0.0, "loss_total": 0.11450859904289246, "step": 2271 }, { "epoch": 0.9088, "grad_norm": 1.2903721332550049, "learning_rate": 2.1653401173440558e-07, "loss": 0.2389, "step": 2272 }, { "batch_size": 4, "epoch": 0.9088, "step": 2272, "tokens_per_device": 5816 }, { "epoch": 0.9088, "loss_ce": 0.28840094804763794, "loss_lvr": 0.7760975956916809, "loss_mode_switch": 0.0, "loss_total": 0.36601072549819946, "step": 2272 }, { "batch_size": 1, "epoch": 0.9088, "step": 2272, "tokens_per_device": 5102 }, { "epoch": 0.9088, "loss_ce": 0.0024015421513468027, "loss_lvr": 0.32423415780067444, "loss_mode_switch": 0.0, "loss_total": 0.03482495993375778, "step": 2272 }, { "batch_size": 1, "epoch": 0.9088, "step": 2272, "tokens_per_device": 5129 }, { "epoch": 0.9088, "loss_ce": 0.09362088143825531, "loss_lvr": 0.43337345123291016, "loss_mode_switch": 0.0, "loss_total": 0.13695822656154633, "step": 2272 }, { "batch_size": 1, "epoch": 0.9088, "step": 2272, "tokens_per_device": 4781 }, { "epoch": 0.9088, "loss_ce": 0.13062141835689545, "loss_lvr": 0.2715509533882141, "loss_mode_switch": 0.0, "loss_total": 0.15777651965618134, "step": 2272 }, { "batch_size": 4, "epoch": 0.9088, "step": 2272, "tokens_per_device": 1340 }, { "epoch": 0.9088, "loss_ce": 0.2244548350572586, "loss_lvr": 1.0986943244934082, "loss_mode_switch": 0.0, "loss_total": 0.33432427048683167, "step": 2272 }, { "batch_size": 4, "epoch": 0.9088, "step": 2272, "tokens_per_device": 6040 }, { "epoch": 0.9088, "loss_ce": 0.05477822199463844, "loss_lvr": 0.784989595413208, "loss_mode_switch": 0.0, "loss_total": 0.13327717781066895, "step": 2272 }, { "batch_size": 1, "epoch": 0.9088, "step": 2272, "tokens_per_device": 5142 }, { "epoch": 0.9088, "loss_ce": 0.5528765320777893, "loss_lvr": 0.2900639474391937, "loss_mode_switch": 0.0, "loss_total": 0.5818829536437988, "step": 2272 }, { "batch_size": 4, "epoch": 0.9088, "step": 2272, "tokens_per_device": 4516 }, { "epoch": 0.9088, "loss_ce": 0.43414825201034546, "loss_lvr": 0.8306856155395508, "loss_mode_switch": 0.0, "loss_total": 0.5172168016433716, "step": 2272 }, { "epoch": 0.9092, "grad_norm": 1.5653787851333618, "learning_rate": 2.1465243509053713e-07, "loss": 0.3118, "step": 2273 }, { "batch_size": 4, "epoch": 0.9092, "step": 2273, "tokens_per_device": 5628 }, { "epoch": 0.9092, "loss_ce": 0.13496683537960052, "loss_lvr": 0.8478015661239624, "loss_mode_switch": 0.0, "loss_total": 0.21974699199199677, "step": 2273 }, { "batch_size": 4, "epoch": 0.9092, "step": 2273, "tokens_per_device": 2924 }, { "epoch": 0.9092, "loss_ce": 0.6133297681808472, "loss_lvr": 0.792474627494812, "loss_mode_switch": 0.0, "loss_total": 0.6925772428512573, "step": 2273 }, { "batch_size": 4, "epoch": 0.9092, "step": 2273, "tokens_per_device": 4236 }, { "epoch": 0.9092, "loss_ce": 0.19283507764339447, "loss_lvr": 0.5265369415283203, "loss_mode_switch": 0.0, "loss_total": 0.24548877775669098, "step": 2273 }, { "batch_size": 4, "epoch": 0.9092, "step": 2273, "tokens_per_device": 5624 }, { "epoch": 0.9092, "loss_ce": 0.3085779845714569, "loss_lvr": 0.9469090104103088, "loss_mode_switch": 0.0, "loss_total": 0.40326887369155884, "step": 2273 }, { "batch_size": 4, "epoch": 0.9092, "step": 2273, "tokens_per_device": 2620 }, { "epoch": 0.9092, "loss_ce": 0.37035518884658813, "loss_lvr": 0.7405301928520203, "loss_mode_switch": 0.0, "loss_total": 0.44440820813179016, "step": 2273 }, { "batch_size": 1, "epoch": 0.9092, "step": 2273, "tokens_per_device": 5110 }, { "epoch": 0.9092, "loss_ce": 0.0005186806665733457, "loss_lvr": 0.46189671754837036, "loss_mode_switch": 0.0, "loss_total": 0.046708352863788605, "step": 2273 }, { "batch_size": 1, "epoch": 0.9092, "step": 2273, "tokens_per_device": 5120 }, { "epoch": 0.9092, "loss_ce": 0.009957621805369854, "loss_lvr": 0.3736809492111206, "loss_mode_switch": 0.0, "loss_total": 0.04732571914792061, "step": 2273 }, { "batch_size": 1, "epoch": 0.9092, "step": 2273, "tokens_per_device": 4885 }, { "epoch": 0.9092, "loss_ce": 0.02086458168923855, "loss_lvr": 0.5700113773345947, "loss_mode_switch": 0.0, "loss_total": 0.07786571979522705, "step": 2273 }, { "epoch": 0.9096, "grad_norm": 1.4263421297073364, "learning_rate": 2.1277888981761753e-07, "loss": 0.2966, "step": 2274 }, { "batch_size": 4, "epoch": 0.9096, "step": 2274, "tokens_per_device": 3876 }, { "epoch": 0.9096, "loss_ce": 0.3888465166091919, "loss_lvr": 1.051566481590271, "loss_mode_switch": 0.0, "loss_total": 0.49400317668914795, "step": 2274 }, { "batch_size": 1, "epoch": 0.9096, "step": 2274, "tokens_per_device": 5944 }, { "epoch": 0.9096, "loss_ce": 0.01246338989585638, "loss_lvr": 0.42760300636291504, "loss_mode_switch": 0.0, "loss_total": 0.05522369220852852, "step": 2274 }, { "batch_size": 4, "epoch": 0.9096, "step": 2274, "tokens_per_device": 4224 }, { "epoch": 0.9096, "loss_ce": 0.386768102645874, "loss_lvr": 0.6939541101455688, "loss_mode_switch": 0.0, "loss_total": 0.45616352558135986, "step": 2274 }, { "batch_size": 4, "epoch": 0.9096, "step": 2274, "tokens_per_device": 4264 }, { "epoch": 0.9096, "loss_ce": 0.25604212284088135, "loss_lvr": 0.8781276941299438, "loss_mode_switch": 0.0, "loss_total": 0.3438549041748047, "step": 2274 }, { "batch_size": 1, "epoch": 0.9096, "step": 2274, "tokens_per_device": 4964 }, { "epoch": 0.9096, "loss_ce": 0.3795175850391388, "loss_lvr": 0.900955319404602, "loss_mode_switch": 0.0, "loss_total": 0.46961313486099243, "step": 2274 }, { "batch_size": 1, "epoch": 0.9096, "step": 2274, "tokens_per_device": 5150 }, { "epoch": 0.9096, "loss_ce": 0.06527680903673172, "loss_lvr": 0.3602745831012726, "loss_mode_switch": 0.0, "loss_total": 0.10130426287651062, "step": 2274 }, { "batch_size": 4, "epoch": 0.9096, "step": 2274, "tokens_per_device": 2720 }, { "epoch": 0.9096, "loss_ce": 0.653716504573822, "loss_lvr": 0.790640652179718, "loss_mode_switch": 0.0, "loss_total": 0.7327805757522583, "step": 2274 }, { "batch_size": 4, "epoch": 0.9096, "step": 2274, "tokens_per_device": 4196 }, { "epoch": 0.9096, "loss_ce": 0.187575563788414, "loss_lvr": 0.9834408164024353, "loss_mode_switch": 0.0, "loss_total": 0.2859196364879608, "step": 2274 }, { "epoch": 0.91, "grad_norm": 1.2571611404418945, "learning_rate": 2.109133790600648e-07, "loss": 0.2844, "step": 2275 }, { "batch_size": 1, "epoch": 0.91, "step": 2275, "tokens_per_device": 4760 }, { "epoch": 0.91, "loss_ce": 0.020350219681859016, "loss_lvr": 0.2058110535144806, "loss_mode_switch": 0.0, "loss_total": 0.040931325405836105, "step": 2275 }, { "batch_size": 4, "epoch": 0.91, "step": 2275, "tokens_per_device": 3976 }, { "epoch": 0.91, "loss_ce": 0.22007781267166138, "loss_lvr": 0.6996307969093323, "loss_mode_switch": 0.0, "loss_total": 0.29004091024398804, "step": 2275 }, { "batch_size": 4, "epoch": 0.91, "step": 2275, "tokens_per_device": 1420 }, { "epoch": 0.91, "loss_ce": 0.45660361647605896, "loss_lvr": 0.7633267045021057, "loss_mode_switch": 0.0, "loss_total": 0.5329362750053406, "step": 2275 }, { "batch_size": 4, "epoch": 0.91, "step": 2275, "tokens_per_device": 8024 }, { "epoch": 0.91, "loss_ce": 0.40300634503364563, "loss_lvr": 1.0910441875457764, "loss_mode_switch": 0.0, "loss_total": 0.5121107697486877, "step": 2275 }, { "batch_size": 4, "epoch": 0.91, "step": 2275, "tokens_per_device": 15472 }, { "epoch": 0.91, "loss_ce": 0.015207945369184017, "loss_lvr": 0.6366586685180664, "loss_mode_switch": 0.0, "loss_total": 0.07887381315231323, "step": 2275 }, { "batch_size": 4, "epoch": 0.91, "step": 2275, "tokens_per_device": 12864 }, { "epoch": 0.91, "loss_ce": 0.04834512621164322, "loss_lvr": 0.45457980036735535, "loss_mode_switch": 0.0, "loss_total": 0.09380310773849487, "step": 2275 }, { "batch_size": 1, "epoch": 0.91, "step": 2275, "tokens_per_device": 4896 }, { "epoch": 0.91, "loss_ce": 0.04664412513375282, "loss_lvr": 0.2871386706829071, "loss_mode_switch": 0.0, "loss_total": 0.07535798847675323, "step": 2275 }, { "batch_size": 1, "epoch": 0.91, "step": 2275, "tokens_per_device": 4898 }, { "epoch": 0.91, "loss_ce": 0.040607303380966187, "loss_lvr": 0.21206922829151154, "loss_mode_switch": 0.0, "loss_total": 0.06181422621011734, "step": 2275 }, { "epoch": 0.9104, "grad_norm": 1.3027368783950806, "learning_rate": 2.0905590594881342e-07, "loss": 0.2996, "step": 2276 }, { "batch_size": 4, "epoch": 0.9104, "step": 2276, "tokens_per_device": 4180 }, { "epoch": 0.9104, "loss_ce": 0.23510387539863586, "loss_lvr": 0.8274165987968445, "loss_mode_switch": 0.0, "loss_total": 0.31784552335739136, "step": 2276 }, { "batch_size": 1, "epoch": 0.9104, "step": 2276, "tokens_per_device": 4741 }, { "epoch": 0.9104, "loss_ce": 0.0022630130406469107, "loss_lvr": 0.32734033465385437, "loss_mode_switch": 0.0, "loss_total": 0.03499704599380493, "step": 2276 }, { "batch_size": 4, "epoch": 0.9104, "step": 2276, "tokens_per_device": 5708 }, { "epoch": 0.9104, "loss_ce": 0.23579052090644836, "loss_lvr": 0.6144058108329773, "loss_mode_switch": 0.0, "loss_total": 0.29723110795021057, "step": 2276 }, { "batch_size": 4, "epoch": 0.9104, "step": 2276, "tokens_per_device": 4224 }, { "epoch": 0.9104, "loss_ce": 0.17035897076129913, "loss_lvr": 0.9725189805030823, "loss_mode_switch": 0.0, "loss_total": 0.2676108777523041, "step": 2276 }, { "batch_size": 4, "epoch": 0.9104, "step": 2276, "tokens_per_device": 3508 }, { "epoch": 0.9104, "loss_ce": 0.4522409439086914, "loss_lvr": 0.8555576205253601, "loss_mode_switch": 0.0, "loss_total": 0.5377967357635498, "step": 2276 }, { "batch_size": 4, "epoch": 0.9104, "step": 2276, "tokens_per_device": 6056 }, { "epoch": 0.9104, "loss_ce": 0.17179037630558014, "loss_lvr": 0.8284249901771545, "loss_mode_switch": 0.0, "loss_total": 0.2546328902244568, "step": 2276 }, { "batch_size": 4, "epoch": 0.9104, "step": 2276, "tokens_per_device": 3904 }, { "epoch": 0.9104, "loss_ce": 0.02261389046907425, "loss_lvr": 0.9442196488380432, "loss_mode_switch": 0.0, "loss_total": 0.11703585833311081, "step": 2276 }, { "batch_size": 1, "epoch": 0.9104, "step": 2276, "tokens_per_device": 5178 }, { "epoch": 0.9104, "loss_ce": 0.010338314808905125, "loss_lvr": 0.22547106444835663, "loss_mode_switch": 0.0, "loss_total": 0.03288542106747627, "step": 2276 }, { "epoch": 0.9108, "grad_norm": 1.3750778436660767, "learning_rate": 2.0720647360130687e-07, "loss": 0.2802, "step": 2277 }, { "batch_size": 4, "epoch": 0.9108, "step": 2277, "tokens_per_device": 5724 }, { "epoch": 0.9108, "loss_ce": 0.1711106151342392, "loss_lvr": 0.5585313439369202, "loss_mode_switch": 0.0, "loss_total": 0.22696375846862793, "step": 2277 }, { "batch_size": 1, "epoch": 0.9108, "step": 2277, "tokens_per_device": 5096 }, { "epoch": 0.9108, "loss_ce": 0.033265091478824615, "loss_lvr": 0.5841756463050842, "loss_mode_switch": 0.0, "loss_total": 0.09168265759944916, "step": 2277 }, { "batch_size": 1, "epoch": 0.9108, "step": 2277, "tokens_per_device": 5249 }, { "epoch": 0.9108, "loss_ce": 0.20608678460121155, "loss_lvr": 0.40394285321235657, "loss_mode_switch": 0.0, "loss_total": 0.24648107588291168, "step": 2277 }, { "batch_size": 1, "epoch": 0.9108, "step": 2277, "tokens_per_device": 5112 }, { "epoch": 0.9108, "loss_ce": 0.0004884605295956135, "loss_lvr": 0.4126376807689667, "loss_mode_switch": 0.0, "loss_total": 0.04175223037600517, "step": 2277 }, { "batch_size": 1, "epoch": 0.9108, "step": 2277, "tokens_per_device": 4866 }, { "epoch": 0.9108, "loss_ce": 0.024644380435347557, "loss_lvr": 0.24173375964164734, "loss_mode_switch": 0.0, "loss_total": 0.04881775751709938, "step": 2277 }, { "batch_size": 4, "epoch": 0.9108, "step": 2277, "tokens_per_device": 2660 }, { "epoch": 0.9108, "loss_ce": 0.515872597694397, "loss_lvr": 0.8732786774635315, "loss_mode_switch": 0.0, "loss_total": 0.6032004356384277, "step": 2277 }, { "batch_size": 4, "epoch": 0.9108, "step": 2277, "tokens_per_device": 4156 }, { "epoch": 0.9108, "loss_ce": 0.32436805963516235, "loss_lvr": 0.7277243733406067, "loss_mode_switch": 0.0, "loss_total": 0.3971405029296875, "step": 2277 }, { "batch_size": 4, "epoch": 0.9108, "step": 2277, "tokens_per_device": 2740 }, { "epoch": 0.9108, "loss_ce": 0.3124849498271942, "loss_lvr": 0.46303418278694153, "loss_mode_switch": 0.0, "loss_total": 0.3587883710861206, "step": 2277 }, { "epoch": 0.9112, "grad_norm": 1.2035890817642212, "learning_rate": 2.053650851214961e-07, "loss": 0.2474, "step": 2278 }, { "batch_size": 4, "epoch": 0.9112, "step": 2278, "tokens_per_device": 4208 }, { "epoch": 0.9112, "loss_ce": 0.36734986305236816, "loss_lvr": 0.8300653100013733, "loss_mode_switch": 0.0, "loss_total": 0.4503563940525055, "step": 2278 }, { "batch_size": 1, "epoch": 0.9112, "step": 2278, "tokens_per_device": 4969 }, { "epoch": 0.9112, "loss_ce": 0.030500153079628944, "loss_lvr": 0.8057507872581482, "loss_mode_switch": 0.0, "loss_total": 0.11107522994279861, "step": 2278 }, { "batch_size": 1, "epoch": 0.9112, "step": 2278, "tokens_per_device": 4890 }, { "epoch": 0.9112, "loss_ce": 0.15076231956481934, "loss_lvr": 0.19170013070106506, "loss_mode_switch": 0.0, "loss_total": 0.16993233561515808, "step": 2278 }, { "batch_size": 1, "epoch": 0.9112, "step": 2278, "tokens_per_device": 5597 }, { "epoch": 0.9112, "loss_ce": 0.20450715720653534, "loss_lvr": 0.3353635370731354, "loss_mode_switch": 0.0, "loss_total": 0.23804351687431335, "step": 2278 }, { "batch_size": 1, "epoch": 0.9112, "step": 2278, "tokens_per_device": 4825 }, { "epoch": 0.9112, "loss_ce": 0.11600731313228607, "loss_lvr": 0.3425513207912445, "loss_mode_switch": 0.0, "loss_total": 0.15026244521141052, "step": 2278 }, { "batch_size": 4, "epoch": 0.9112, "step": 2278, "tokens_per_device": 4836 }, { "epoch": 0.9112, "loss_ce": 0.5330229997634888, "loss_lvr": 0.5979449152946472, "loss_mode_switch": 0.0, "loss_total": 0.592817485332489, "step": 2278 }, { "batch_size": 1, "epoch": 0.9112, "step": 2278, "tokens_per_device": 4588 }, { "epoch": 0.9112, "loss_ce": 0.004716263618320227, "loss_lvr": 0.5468336343765259, "loss_mode_switch": 0.0, "loss_total": 0.05939962714910507, "step": 2278 }, { "batch_size": 4, "epoch": 0.9112, "step": 2278, "tokens_per_device": 4604 }, { "epoch": 0.9112, "loss_ce": 0.3837905824184418, "loss_lvr": 0.8093589544296265, "loss_mode_switch": 0.0, "loss_total": 0.4647264778614044, "step": 2278 }, { "epoch": 0.9116, "grad_norm": 1.272792100906372, "learning_rate": 2.0353174359983074e-07, "loss": 0.3001, "step": 2279 }, { "batch_size": 4, "epoch": 0.9116, "step": 2279, "tokens_per_device": 4196 }, { "epoch": 0.9116, "loss_ce": 0.28484776616096497, "loss_lvr": 0.5028451085090637, "loss_mode_switch": 0.0, "loss_total": 0.33513227105140686, "step": 2279 }, { "batch_size": 4, "epoch": 0.9116, "step": 2279, "tokens_per_device": 4076 }, { "epoch": 0.9116, "loss_ce": 0.25911229848861694, "loss_lvr": 0.907685399055481, "loss_mode_switch": 0.0, "loss_total": 0.3498808443546295, "step": 2279 }, { "batch_size": 4, "epoch": 0.9116, "step": 2279, "tokens_per_device": 6616 }, { "epoch": 0.9116, "loss_ce": 0.21532051265239716, "loss_lvr": 0.7764282822608948, "loss_mode_switch": 0.0, "loss_total": 0.29296332597732544, "step": 2279 }, { "batch_size": 4, "epoch": 0.9116, "step": 2279, "tokens_per_device": 4208 }, { "epoch": 0.9116, "loss_ce": 0.38973236083984375, "loss_lvr": 0.9038189649581909, "loss_mode_switch": 0.0, "loss_total": 0.48011425137519836, "step": 2279 }, { "batch_size": 4, "epoch": 0.9116, "step": 2279, "tokens_per_device": 5028 }, { "epoch": 0.9116, "loss_ce": 0.0012617480242624879, "loss_lvr": 0.5891137719154358, "loss_mode_switch": 0.0, "loss_total": 0.06017312780022621, "step": 2279 }, { "batch_size": 1, "epoch": 0.9116, "step": 2279, "tokens_per_device": 4723 }, { "epoch": 0.9116, "loss_ce": 0.0031775133684277534, "loss_lvr": 0.4625180661678314, "loss_mode_switch": 0.0, "loss_total": 0.04942931979894638, "step": 2279 }, { "batch_size": 4, "epoch": 0.9116, "step": 2279, "tokens_per_device": 11168 }, { "epoch": 0.9116, "loss_ce": 0.13393667340278625, "loss_lvr": 0.5854388475418091, "loss_mode_switch": 0.0, "loss_total": 0.19248056411743164, "step": 2279 }, { "batch_size": 4, "epoch": 0.9116, "step": 2279, "tokens_per_device": 6352 }, { "epoch": 0.9116, "loss_ce": 0.2811368703842163, "loss_lvr": 0.6361420750617981, "loss_mode_switch": 0.0, "loss_total": 0.3447510898113251, "step": 2279 }, { "epoch": 0.912, "grad_norm": 1.2215087413787842, "learning_rate": 2.0170645211325335e-07, "loss": 0.2549, "step": 2280 }, { "batch_size": 4, "epoch": 0.912, "step": 2280, "tokens_per_device": 6072 }, { "epoch": 0.912, "loss_ce": 0.12840799987316132, "loss_lvr": 0.5294479727745056, "loss_mode_switch": 0.0, "loss_total": 0.18135279417037964, "step": 2280 }, { "batch_size": 1, "epoch": 0.912, "step": 2280, "tokens_per_device": 4566 }, { "epoch": 0.912, "loss_ce": 0.03129927068948746, "loss_lvr": 0.36540481448173523, "loss_mode_switch": 0.0, "loss_total": 0.06783975660800934, "step": 2280 }, { "batch_size": 1, "epoch": 0.912, "step": 2280, "tokens_per_device": 4922 }, { "epoch": 0.912, "loss_ce": 0.06411053985357285, "loss_lvr": 0.25040796399116516, "loss_mode_switch": 0.0, "loss_total": 0.08915133774280548, "step": 2280 }, { "batch_size": 1, "epoch": 0.912, "step": 2280, "tokens_per_device": 4399 }, { "epoch": 0.912, "loss_ce": 0.10324853658676147, "loss_lvr": 0.4345991909503937, "loss_mode_switch": 0.0, "loss_total": 0.14670845866203308, "step": 2280 }, { "batch_size": 4, "epoch": 0.912, "step": 2280, "tokens_per_device": 6132 }, { "epoch": 0.912, "loss_ce": 0.0502835176885128, "loss_lvr": 0.7799514532089233, "loss_mode_switch": 0.0, "loss_total": 0.12827865779399872, "step": 2280 }, { "batch_size": 4, "epoch": 0.912, "step": 2280, "tokens_per_device": 6320 }, { "epoch": 0.912, "loss_ce": 0.2927878797054291, "loss_lvr": 0.718639075756073, "loss_mode_switch": 0.0, "loss_total": 0.36465179920196533, "step": 2280 }, { "batch_size": 1, "epoch": 0.912, "step": 2280, "tokens_per_device": 4857 }, { "epoch": 0.912, "loss_ce": 0.0010182056576013565, "loss_lvr": 0.31861522793769836, "loss_mode_switch": 0.0, "loss_total": 0.03287973254919052, "step": 2280 }, { "batch_size": 1, "epoch": 0.912, "step": 2280, "tokens_per_device": 5117 }, { "epoch": 0.912, "loss_ce": 0.010757429525256157, "loss_lvr": 0.42368462681770325, "loss_mode_switch": 0.0, "loss_total": 0.05312589555978775, "step": 2280 }, { "epoch": 0.9124, "grad_norm": 1.4086699485778809, "learning_rate": 1.9988921372519732e-07, "loss": 0.2659, "step": 2281 }, { "batch_size": 4, "epoch": 0.9124, "step": 2281, "tokens_per_device": 1640 }, { "epoch": 0.9124, "loss_ce": 0.4746340215206146, "loss_lvr": 1.1400032043457031, "loss_mode_switch": 0.0, "loss_total": 0.5886343717575073, "step": 2281 }, { "batch_size": 1, "epoch": 0.9124, "step": 2281, "tokens_per_device": 5164 }, { "epoch": 0.9124, "loss_ce": 0.017767487093806267, "loss_lvr": 0.5480272173881531, "loss_mode_switch": 0.0, "loss_total": 0.07257021218538284, "step": 2281 }, { "batch_size": 4, "epoch": 0.9124, "step": 2281, "tokens_per_device": 4404 }, { "epoch": 0.9124, "loss_ce": 0.10426981002092361, "loss_lvr": 0.7157184481620789, "loss_mode_switch": 0.0, "loss_total": 0.17584165930747986, "step": 2281 }, { "batch_size": 4, "epoch": 0.9124, "step": 2281, "tokens_per_device": 3740 }, { "epoch": 0.9124, "loss_ce": 0.07846074551343918, "loss_lvr": 0.7349952459335327, "loss_mode_switch": 0.0, "loss_total": 0.15196026861667633, "step": 2281 }, { "batch_size": 4, "epoch": 0.9124, "step": 2281, "tokens_per_device": 2568 }, { "epoch": 0.9124, "loss_ce": 0.3798755407333374, "loss_lvr": 0.8605562448501587, "loss_mode_switch": 0.0, "loss_total": 0.4659311771392822, "step": 2281 }, { "batch_size": 4, "epoch": 0.9124, "step": 2281, "tokens_per_device": 1712 }, { "epoch": 0.9124, "loss_ce": 0.059272170066833496, "loss_lvr": 0.9485235810279846, "loss_mode_switch": 0.0, "loss_total": 0.15412452816963196, "step": 2281 }, { "batch_size": 4, "epoch": 0.9124, "step": 2281, "tokens_per_device": 4332 }, { "epoch": 0.9124, "loss_ce": 0.2543361186981201, "loss_lvr": 0.8491540551185608, "loss_mode_switch": 0.0, "loss_total": 0.3392515182495117, "step": 2281 }, { "batch_size": 4, "epoch": 0.9124, "step": 2281, "tokens_per_device": 6264 }, { "epoch": 0.9124, "loss_ce": 0.1291249394416809, "loss_lvr": 0.6861594915390015, "loss_mode_switch": 0.0, "loss_total": 0.19774088263511658, "step": 2281 }, { "epoch": 0.9128, "grad_norm": 1.2626627683639526, "learning_rate": 1.9808003148558074e-07, "loss": 0.2796, "step": 2282 }, { "batch_size": 1, "epoch": 0.9128, "step": 2282, "tokens_per_device": 5899 }, { "epoch": 0.9128, "loss_ce": 0.0050576296634972095, "loss_lvr": 0.5273855328559875, "loss_mode_switch": 0.0, "loss_total": 0.057796183973550797, "step": 2282 }, { "batch_size": 1, "epoch": 0.9128, "step": 2282, "tokens_per_device": 6576 }, { "epoch": 0.9128, "loss_ce": 0.009642924182116985, "loss_lvr": 0.4064529836177826, "loss_mode_switch": 0.0, "loss_total": 0.05028822273015976, "step": 2282 }, { "batch_size": 1, "epoch": 0.9128, "step": 2282, "tokens_per_device": 5021 }, { "epoch": 0.9128, "loss_ce": 0.014450076967477798, "loss_lvr": 0.3062528967857361, "loss_mode_switch": 0.0, "loss_total": 0.04507536441087723, "step": 2282 }, { "batch_size": 1, "epoch": 0.9128, "step": 2282, "tokens_per_device": 4907 }, { "epoch": 0.9128, "loss_ce": 0.25161099433898926, "loss_lvr": 0.40532898902893066, "loss_mode_switch": 0.0, "loss_total": 0.29214388132095337, "step": 2282 }, { "batch_size": 4, "epoch": 0.9128, "step": 2282, "tokens_per_device": 4996 }, { "epoch": 0.9128, "loss_ce": 0.24386665225028992, "loss_lvr": 0.6924334764480591, "loss_mode_switch": 0.0, "loss_total": 0.31310999393463135, "step": 2282 }, { "batch_size": 4, "epoch": 0.9128, "step": 2282, "tokens_per_device": 3996 }, { "epoch": 0.9128, "loss_ce": 0.42885085940361023, "loss_lvr": 0.8238468170166016, "loss_mode_switch": 0.0, "loss_total": 0.5112355351448059, "step": 2282 }, { "batch_size": 4, "epoch": 0.9128, "step": 2282, "tokens_per_device": 4192 }, { "epoch": 0.9128, "loss_ce": 0.22714965045452118, "loss_lvr": 0.8517638444900513, "loss_mode_switch": 0.0, "loss_total": 0.312326043844223, "step": 2282 }, { "batch_size": 1, "epoch": 0.9128, "step": 2282, "tokens_per_device": 4836 }, { "epoch": 0.9128, "loss_ce": 0.10478127002716064, "loss_lvr": 0.38315340876579285, "loss_mode_switch": 0.0, "loss_total": 0.14309661090373993, "step": 2282 }, { "epoch": 0.9132, "grad_norm": 1.1472517251968384, "learning_rate": 1.9627890843080034e-07, "loss": 0.2566, "step": 2283 }, { "batch_size": 4, "epoch": 0.9132, "step": 2283, "tokens_per_device": 4256 }, { "epoch": 0.9132, "loss_ce": 0.1834939569234848, "loss_lvr": 0.7511298060417175, "loss_mode_switch": 0.0, "loss_total": 0.2586069405078888, "step": 2283 }, { "batch_size": 4, "epoch": 0.9132, "step": 2283, "tokens_per_device": 4596 }, { "epoch": 0.9132, "loss_ce": 0.4748358428478241, "loss_lvr": 0.534286618232727, "loss_mode_switch": 0.0, "loss_total": 0.5282645225524902, "step": 2283 }, { "batch_size": 4, "epoch": 0.9132, "step": 2283, "tokens_per_device": 10676 }, { "epoch": 0.9132, "loss_ce": 0.03200959041714668, "loss_lvr": 0.636151909828186, "loss_mode_switch": 0.0, "loss_total": 0.09562478959560394, "step": 2283 }, { "batch_size": 4, "epoch": 0.9132, "step": 2283, "tokens_per_device": 1604 }, { "epoch": 0.9132, "loss_ce": 0.32141345739364624, "loss_lvr": 0.7881930470466614, "loss_mode_switch": 0.0, "loss_total": 0.4002327620983124, "step": 2283 }, { "batch_size": 1, "epoch": 0.9132, "step": 2283, "tokens_per_device": 4879 }, { "epoch": 0.9132, "loss_ce": 0.1131361648440361, "loss_lvr": 0.16588139533996582, "loss_mode_switch": 0.0, "loss_total": 0.12972430884838104, "step": 2283 }, { "batch_size": 4, "epoch": 0.9132, "step": 2283, "tokens_per_device": 6252 }, { "epoch": 0.9132, "loss_ce": 0.21799807250499725, "loss_lvr": 0.8390465378761292, "loss_mode_switch": 0.0, "loss_total": 0.301902711391449, "step": 2283 }, { "batch_size": 4, "epoch": 0.9132, "step": 2283, "tokens_per_device": 4172 }, { "epoch": 0.9132, "loss_ce": 0.2624744474887848, "loss_lvr": 0.5903558731079102, "loss_mode_switch": 0.0, "loss_total": 0.32151004672050476, "step": 2283 }, { "batch_size": 4, "epoch": 0.9132, "step": 2283, "tokens_per_device": 5860 }, { "epoch": 0.9132, "loss_ce": 0.22955432534217834, "loss_lvr": 0.913861870765686, "loss_mode_switch": 0.0, "loss_total": 0.3209405243396759, "step": 2283 }, { "epoch": 0.9136, "grad_norm": 1.1974730491638184, "learning_rate": 1.9448584758372745e-07, "loss": 0.2971, "step": 2284 }, { "batch_size": 4, "epoch": 0.9136, "step": 2284, "tokens_per_device": 8548 }, { "epoch": 0.9136, "loss_ce": 0.6207478046417236, "loss_lvr": 0.34145623445510864, "loss_mode_switch": 0.0, "loss_total": 0.6548933982849121, "step": 2284 }, { "batch_size": 4, "epoch": 0.9136, "step": 2284, "tokens_per_device": 8828 }, { "epoch": 0.9136, "loss_ce": 0.008992058224976063, "loss_lvr": 0.6179933547973633, "loss_mode_switch": 0.0, "loss_total": 0.07079139351844788, "step": 2284 }, { "batch_size": 4, "epoch": 0.9136, "step": 2284, "tokens_per_device": 7216 }, { "epoch": 0.9136, "loss_ce": 0.16718930006027222, "loss_lvr": 0.8094311952590942, "loss_mode_switch": 0.0, "loss_total": 0.24813242256641388, "step": 2284 }, { "batch_size": 4, "epoch": 0.9136, "step": 2284, "tokens_per_device": 5568 }, { "epoch": 0.9136, "loss_ce": 0.028435934334993362, "loss_lvr": 0.7956058382987976, "loss_mode_switch": 0.0, "loss_total": 0.10799652338027954, "step": 2284 }, { "batch_size": 4, "epoch": 0.9136, "step": 2284, "tokens_per_device": 1224 }, { "epoch": 0.9136, "loss_ce": 0.15651081502437592, "loss_lvr": 1.0016249418258667, "loss_mode_switch": 0.0, "loss_total": 0.25667330622673035, "step": 2284 }, { "batch_size": 4, "epoch": 0.9136, "step": 2284, "tokens_per_device": 4132 }, { "epoch": 0.9136, "loss_ce": 0.21055372059345245, "loss_lvr": 0.8035723567008972, "loss_mode_switch": 0.0, "loss_total": 0.2909109592437744, "step": 2284 }, { "batch_size": 4, "epoch": 0.9136, "step": 2284, "tokens_per_device": 1500 }, { "epoch": 0.9136, "loss_ce": 0.46046823263168335, "loss_lvr": 1.1460322141647339, "loss_mode_switch": 0.0, "loss_total": 0.5750714540481567, "step": 2284 }, { "batch_size": 4, "epoch": 0.9136, "step": 2284, "tokens_per_device": 1444 }, { "epoch": 0.9136, "loss_ce": 0.6591421961784363, "loss_lvr": 1.2366091012954712, "loss_mode_switch": 0.0, "loss_total": 0.7828031182289124, "step": 2284 }, { "epoch": 0.914, "grad_norm": 1.2049627304077148, "learning_rate": 1.9270085195370048e-07, "loss": 0.3073, "step": 2285 }, { "batch_size": 4, "epoch": 0.914, "step": 2285, "tokens_per_device": 4080 }, { "epoch": 0.914, "loss_ce": 0.19909727573394775, "loss_lvr": 0.9376586079597473, "loss_mode_switch": 0.0, "loss_total": 0.292863130569458, "step": 2285 }, { "batch_size": 4, "epoch": 0.914, "step": 2285, "tokens_per_device": 4796 }, { "epoch": 0.914, "loss_ce": 0.6356706619262695, "loss_lvr": 0.801593542098999, "loss_mode_switch": 0.0, "loss_total": 0.7158300280570984, "step": 2285 }, { "batch_size": 4, "epoch": 0.914, "step": 2285, "tokens_per_device": 6164 }, { "epoch": 0.914, "loss_ce": 0.0558277927339077, "loss_lvr": 0.5883361101150513, "loss_mode_switch": 0.0, "loss_total": 0.11466140300035477, "step": 2285 }, { "batch_size": 1, "epoch": 0.914, "step": 2285, "tokens_per_device": 5202 }, { "epoch": 0.914, "loss_ce": 0.05054054409265518, "loss_lvr": 0.449445903301239, "loss_mode_switch": 0.0, "loss_total": 0.0954851359128952, "step": 2285 }, { "batch_size": 4, "epoch": 0.914, "step": 2285, "tokens_per_device": 5080 }, { "epoch": 0.914, "loss_ce": 0.013853335753083229, "loss_lvr": 0.5743593573570251, "loss_mode_switch": 0.0, "loss_total": 0.07128927111625671, "step": 2285 }, { "batch_size": 4, "epoch": 0.914, "step": 2285, "tokens_per_device": 4988 }, { "epoch": 0.914, "loss_ce": 0.19242937862873077, "loss_lvr": 1.2729798555374146, "loss_mode_switch": 0.0, "loss_total": 0.31972736120224, "step": 2285 }, { "batch_size": 1, "epoch": 0.914, "step": 2285, "tokens_per_device": 7073 }, { "epoch": 0.914, "loss_ce": 0.00023899432562757283, "loss_lvr": 0.25406205654144287, "loss_mode_switch": 0.0, "loss_total": 0.025645200163125992, "step": 2285 }, { "batch_size": 4, "epoch": 0.914, "step": 2285, "tokens_per_device": 4404 }, { "epoch": 0.914, "loss_ce": 0.2920630872249603, "loss_lvr": 1.0631651878356934, "loss_mode_switch": 0.0, "loss_total": 0.3983796238899231, "step": 2285 }, { "epoch": 0.9144, "grad_norm": 1.3626439571380615, "learning_rate": 1.9092392453652352e-07, "loss": 0.2674, "step": 2286 }, { "batch_size": 1, "epoch": 0.9144, "step": 2286, "tokens_per_device": 5177 }, { "epoch": 0.9144, "loss_ce": 0.0024208740796893835, "loss_lvr": 0.3691693842411041, "loss_mode_switch": 0.0, "loss_total": 0.03933781012892723, "step": 2286 }, { "batch_size": 1, "epoch": 0.9144, "step": 2286, "tokens_per_device": 4901 }, { "epoch": 0.9144, "loss_ce": 0.0030336612835526466, "loss_lvr": 0.8944694995880127, "loss_mode_switch": 0.0, "loss_total": 0.0924806147813797, "step": 2286 }, { "batch_size": 4, "epoch": 0.9144, "step": 2286, "tokens_per_device": 2596 }, { "epoch": 0.9144, "loss_ce": 0.05109594762325287, "loss_lvr": 0.8215514421463013, "loss_mode_switch": 0.0, "loss_total": 0.1332511007785797, "step": 2286 }, { "batch_size": 4, "epoch": 0.9144, "step": 2286, "tokens_per_device": 8632 }, { "epoch": 0.9144, "loss_ce": 0.03151790797710419, "loss_lvr": 0.6113298535346985, "loss_mode_switch": 0.0, "loss_total": 0.0926508903503418, "step": 2286 }, { "batch_size": 4, "epoch": 0.9144, "step": 2286, "tokens_per_device": 13652 }, { "epoch": 0.9144, "loss_ce": 0.18390904366970062, "loss_lvr": 0.5574102401733398, "loss_mode_switch": 0.0, "loss_total": 0.23965007066726685, "step": 2286 }, { "batch_size": 4, "epoch": 0.9144, "step": 2286, "tokens_per_device": 4296 }, { "epoch": 0.9144, "loss_ce": 0.06579763442277908, "loss_lvr": 0.7866835594177246, "loss_mode_switch": 0.0, "loss_total": 0.14446598291397095, "step": 2286 }, { "batch_size": 4, "epoch": 0.9144, "step": 2286, "tokens_per_device": 3828 }, { "epoch": 0.9144, "loss_ce": 0.41887298226356506, "loss_lvr": 0.8494544625282288, "loss_mode_switch": 0.0, "loss_total": 0.5038184523582458, "step": 2286 }, { "batch_size": 1, "epoch": 0.9144, "step": 2286, "tokens_per_device": 5215 }, { "epoch": 0.9144, "loss_ce": 0.005575342103838921, "loss_lvr": 0.4427156448364258, "loss_mode_switch": 0.0, "loss_total": 0.04984690994024277, "step": 2286 }, { "epoch": 0.9148, "grad_norm": 1.1450083255767822, "learning_rate": 1.8915506831445996e-07, "loss": 0.2156, "step": 2287 }, { "batch_size": 1, "epoch": 0.9148, "step": 2287, "tokens_per_device": 4940 }, { "epoch": 0.9148, "loss_ce": 0.01104225404560566, "loss_lvr": 0.4847915768623352, "loss_mode_switch": 0.0, "loss_total": 0.05952141433954239, "step": 2287 }, { "batch_size": 4, "epoch": 0.9148, "step": 2287, "tokens_per_device": 2688 }, { "epoch": 0.9148, "loss_ce": 0.08699735254049301, "loss_lvr": 0.864291787147522, "loss_mode_switch": 0.0, "loss_total": 0.1734265387058258, "step": 2287 }, { "batch_size": 4, "epoch": 0.9148, "step": 2287, "tokens_per_device": 4756 }, { "epoch": 0.9148, "loss_ce": 0.04902985319495201, "loss_lvr": 0.8845328092575073, "loss_mode_switch": 0.0, "loss_total": 0.1374831348657608, "step": 2287 }, { "batch_size": 4, "epoch": 0.9148, "step": 2287, "tokens_per_device": 3884 }, { "epoch": 0.9148, "loss_ce": 0.3134119212627411, "loss_lvr": 0.7693662643432617, "loss_mode_switch": 0.0, "loss_total": 0.39034855365753174, "step": 2287 }, { "batch_size": 4, "epoch": 0.9148, "step": 2287, "tokens_per_device": 2628 }, { "epoch": 0.9148, "loss_ce": 0.2016560286283493, "loss_lvr": 1.0075509548187256, "loss_mode_switch": 0.0, "loss_total": 0.30241113901138306, "step": 2287 }, { "batch_size": 4, "epoch": 0.9148, "step": 2287, "tokens_per_device": 4440 }, { "epoch": 0.9148, "loss_ce": 0.30844226479530334, "loss_lvr": 1.169301152229309, "loss_mode_switch": 0.0, "loss_total": 0.4253723919391632, "step": 2287 }, { "batch_size": 4, "epoch": 0.9148, "step": 2287, "tokens_per_device": 1320 }, { "epoch": 0.9148, "loss_ce": 0.6858093738555908, "loss_lvr": 1.104764461517334, "loss_mode_switch": 0.0, "loss_total": 0.7962858080863953, "step": 2287 }, { "batch_size": 1, "epoch": 0.9148, "step": 2287, "tokens_per_device": 5257 }, { "epoch": 0.9148, "loss_ce": 0.012460982427001, "loss_lvr": 0.7886662483215332, "loss_mode_switch": 0.0, "loss_total": 0.09132760763168335, "step": 2287 }, { "epoch": 0.9152, "grad_norm": 1.1024192571640015, "learning_rate": 1.8739428625622614e-07, "loss": 0.2446, "step": 2288 }, { "batch_size": 1, "epoch": 0.9152, "step": 2288, "tokens_per_device": 6496 }, { "epoch": 0.9152, "loss_ce": 0.1517053246498108, "loss_lvr": 0.330704003572464, "loss_mode_switch": 0.0, "loss_total": 0.1847757250070572, "step": 2288 }, { "batch_size": 4, "epoch": 0.9152, "step": 2288, "tokens_per_device": 4836 }, { "epoch": 0.9152, "loss_ce": 0.37742871046066284, "loss_lvr": 0.8963094353675842, "loss_mode_switch": 0.0, "loss_total": 0.4670596718788147, "step": 2288 }, { "batch_size": 4, "epoch": 0.9152, "step": 2288, "tokens_per_device": 5576 }, { "epoch": 0.9152, "loss_ce": 0.43971002101898193, "loss_lvr": 0.7147220969200134, "loss_mode_switch": 0.0, "loss_total": 0.5111822485923767, "step": 2288 }, { "batch_size": 4, "epoch": 0.9152, "step": 2288, "tokens_per_device": 3956 }, { "epoch": 0.9152, "loss_ce": 0.10290229320526123, "loss_lvr": 0.7459331154823303, "loss_mode_switch": 0.0, "loss_total": 0.17749559879302979, "step": 2288 }, { "batch_size": 4, "epoch": 0.9152, "step": 2288, "tokens_per_device": 2820 }, { "epoch": 0.9152, "loss_ce": 0.08288872987031937, "loss_lvr": 0.5107141137123108, "loss_mode_switch": 0.0, "loss_total": 0.13396014273166656, "step": 2288 }, { "batch_size": 1, "epoch": 0.9152, "step": 2288, "tokens_per_device": 5190 }, { "epoch": 0.9152, "loss_ce": 0.0036793015897274017, "loss_lvr": 0.518542468547821, "loss_mode_switch": 0.0, "loss_total": 0.055533550679683685, "step": 2288 }, { "batch_size": 4, "epoch": 0.9152, "step": 2288, "tokens_per_device": 4456 }, { "epoch": 0.9152, "loss_ce": 0.2122202068567276, "loss_lvr": 1.396064281463623, "loss_mode_switch": 0.0, "loss_total": 0.35182663798332214, "step": 2288 }, { "batch_size": 4, "epoch": 0.9152, "step": 2288, "tokens_per_device": 1252 }, { "epoch": 0.9152, "loss_ce": 0.6114485859870911, "loss_lvr": 1.0696487426757812, "loss_mode_switch": 0.0, "loss_total": 0.7184134721755981, "step": 2288 }, { "epoch": 0.9156, "grad_norm": 1.3699392080307007, "learning_rate": 1.856415813169876e-07, "loss": 0.2719, "step": 2289 }, { "batch_size": 4, "epoch": 0.9156, "step": 2289, "tokens_per_device": 9556 }, { "epoch": 0.9156, "loss_ce": 0.160833939909935, "loss_lvr": 0.7523522973060608, "loss_mode_switch": 0.0, "loss_total": 0.23606917262077332, "step": 2289 }, { "batch_size": 1, "epoch": 0.9156, "step": 2289, "tokens_per_device": 4962 }, { "epoch": 0.9156, "loss_ce": 0.04311933368444443, "loss_lvr": 0.4845666289329529, "loss_mode_switch": 0.0, "loss_total": 0.0915759950876236, "step": 2289 }, { "batch_size": 1, "epoch": 0.9156, "step": 2289, "tokens_per_device": 6194 }, { "epoch": 0.9156, "loss_ce": 0.005000762641429901, "loss_lvr": 0.35528239607810974, "loss_mode_switch": 0.0, "loss_total": 0.040529001504182816, "step": 2289 }, { "batch_size": 4, "epoch": 0.9156, "step": 2289, "tokens_per_device": 1480 }, { "epoch": 0.9156, "loss_ce": 0.08498445153236389, "loss_lvr": 0.8474302291870117, "loss_mode_switch": 0.0, "loss_total": 0.16972747445106506, "step": 2289 }, { "batch_size": 1, "epoch": 0.9156, "step": 2289, "tokens_per_device": 5015 }, { "epoch": 0.9156, "loss_ce": 0.15559564530849457, "loss_lvr": 0.5245022773742676, "loss_mode_switch": 0.0, "loss_total": 0.2080458700656891, "step": 2289 }, { "batch_size": 4, "epoch": 0.9156, "step": 2289, "tokens_per_device": 4552 }, { "epoch": 0.9156, "loss_ce": 0.5001986026763916, "loss_lvr": 0.659060001373291, "loss_mode_switch": 0.0, "loss_total": 0.5661045908927917, "step": 2289 }, { "batch_size": 1, "epoch": 0.9156, "step": 2289, "tokens_per_device": 4779 }, { "epoch": 0.9156, "loss_ce": 0.005522408522665501, "loss_lvr": 0.42795753479003906, "loss_mode_switch": 0.0, "loss_total": 0.04831816256046295, "step": 2289 }, { "batch_size": 1, "epoch": 0.9156, "step": 2289, "tokens_per_device": 5088 }, { "epoch": 0.9156, "loss_ce": 0.13025341928005219, "loss_lvr": 0.5407813191413879, "loss_mode_switch": 0.0, "loss_total": 0.18433155119419098, "step": 2289 }, { "epoch": 0.916, "grad_norm": 1.231830358505249, "learning_rate": 1.838969564383525e-07, "loss": 0.2593, "step": 2290 }, { "batch_size": 1, "epoch": 0.916, "step": 2290, "tokens_per_device": 5038 }, { "epoch": 0.916, "loss_ce": 0.11603913456201553, "loss_lvr": 0.3446767330169678, "loss_mode_switch": 0.0, "loss_total": 0.15050680935382843, "step": 2290 }, { "batch_size": 1, "epoch": 0.916, "step": 2290, "tokens_per_device": 5069 }, { "epoch": 0.916, "loss_ce": 0.012761064805090427, "loss_lvr": 0.23302224278450012, "loss_mode_switch": 0.0, "loss_total": 0.036063291132450104, "step": 2290 }, { "batch_size": 4, "epoch": 0.916, "step": 2290, "tokens_per_device": 1276 }, { "epoch": 0.916, "loss_ce": 0.7186794281005859, "loss_lvr": 1.0879360437393188, "loss_mode_switch": 0.0, "loss_total": 0.8274730443954468, "step": 2290 }, { "batch_size": 1, "epoch": 0.916, "step": 2290, "tokens_per_device": 7749 }, { "epoch": 0.916, "loss_ce": 0.49059274792671204, "loss_lvr": 0.2569420039653778, "loss_mode_switch": 0.0, "loss_total": 0.5162869691848755, "step": 2290 }, { "batch_size": 4, "epoch": 0.916, "step": 2290, "tokens_per_device": 1588 }, { "epoch": 0.916, "loss_ce": 0.24478541314601898, "loss_lvr": 0.8663012981414795, "loss_mode_switch": 0.0, "loss_total": 0.3314155340194702, "step": 2290 }, { "batch_size": 4, "epoch": 0.916, "step": 2290, "tokens_per_device": 4504 }, { "epoch": 0.916, "loss_ce": 0.3523952066898346, "loss_lvr": 0.7840000987052917, "loss_mode_switch": 0.0, "loss_total": 0.43079522252082825, "step": 2290 }, { "batch_size": 4, "epoch": 0.916, "step": 2290, "tokens_per_device": 5716 }, { "epoch": 0.916, "loss_ce": 0.27679580450057983, "loss_lvr": 0.8015602231025696, "loss_mode_switch": 0.0, "loss_total": 0.35695183277130127, "step": 2290 }, { "batch_size": 1, "epoch": 0.916, "step": 2290, "tokens_per_device": 4884 }, { "epoch": 0.916, "loss_ce": 0.0015604663640260696, "loss_lvr": 0.7313032150268555, "loss_mode_switch": 0.0, "loss_total": 0.0746907889842987, "step": 2290 }, { "epoch": 0.9164, "grad_norm": 1.5722593069076538, "learning_rate": 1.8216041454837075e-07, "loss": 0.3114, "step": 2291 }, { "batch_size": 4, "epoch": 0.9164, "step": 2291, "tokens_per_device": 1556 }, { "epoch": 0.9164, "loss_ce": 0.43635714054107666, "loss_lvr": 1.0025938749313354, "loss_mode_switch": 0.0, "loss_total": 0.5366165041923523, "step": 2291 }, { "batch_size": 1, "epoch": 0.9164, "step": 2291, "tokens_per_device": 5129 }, { "epoch": 0.9164, "loss_ce": 0.14370973408222198, "loss_lvr": 0.3373861610889435, "loss_mode_switch": 0.0, "loss_total": 0.1774483472108841, "step": 2291 }, { "batch_size": 4, "epoch": 0.9164, "step": 2291, "tokens_per_device": 1624 }, { "epoch": 0.9164, "loss_ce": 0.41084977984428406, "loss_lvr": 0.8965936899185181, "loss_mode_switch": 0.0, "loss_total": 0.5005091428756714, "step": 2291 }, { "batch_size": 4, "epoch": 0.9164, "step": 2291, "tokens_per_device": 4316 }, { "epoch": 0.9164, "loss_ce": 0.44323623180389404, "loss_lvr": 0.6745386123657227, "loss_mode_switch": 0.0, "loss_total": 0.5106900930404663, "step": 2291 }, { "batch_size": 4, "epoch": 0.9164, "step": 2291, "tokens_per_device": 5968 }, { "epoch": 0.9164, "loss_ce": 0.17762932181358337, "loss_lvr": 0.6342998147010803, "loss_mode_switch": 0.0, "loss_total": 0.2410593032836914, "step": 2291 }, { "batch_size": 4, "epoch": 0.9164, "step": 2291, "tokens_per_device": 1956 }, { "epoch": 0.9164, "loss_ce": 0.20332388579845428, "loss_lvr": 0.9157576560974121, "loss_mode_switch": 0.0, "loss_total": 0.2948996424674988, "step": 2291 }, { "batch_size": 4, "epoch": 0.9164, "step": 2291, "tokens_per_device": 3912 }, { "epoch": 0.9164, "loss_ce": 0.030178187415003777, "loss_lvr": 0.9437282085418701, "loss_mode_switch": 0.0, "loss_total": 0.12455101311206818, "step": 2291 }, { "batch_size": 1, "epoch": 0.9164, "step": 2291, "tokens_per_device": 5103 }, { "epoch": 0.9164, "loss_ce": 0.13210032880306244, "loss_lvr": 0.44477081298828125, "loss_mode_switch": 0.0, "loss_total": 0.17657741904258728, "step": 2291 }, { "epoch": 0.9168, "grad_norm": 1.3777799606323242, "learning_rate": 1.804319585615244e-07, "loss": 0.2967, "step": 2292 }, { "batch_size": 4, "epoch": 0.9168, "step": 2292, "tokens_per_device": 6496 }, { "epoch": 0.9168, "loss_ce": 0.05109644681215286, "loss_lvr": 0.7040200233459473, "loss_mode_switch": 0.0, "loss_total": 0.12149845063686371, "step": 2292 }, { "batch_size": 4, "epoch": 0.9168, "step": 2292, "tokens_per_device": 2572 }, { "epoch": 0.9168, "loss_ce": 0.39907899498939514, "loss_lvr": 0.7534756064414978, "loss_mode_switch": 0.0, "loss_total": 0.4744265675544739, "step": 2292 }, { "batch_size": 1, "epoch": 0.9168, "step": 2292, "tokens_per_device": 4781 }, { "epoch": 0.9168, "loss_ce": 0.02086157165467739, "loss_lvr": 0.29389089345932007, "loss_mode_switch": 0.0, "loss_total": 0.05025066062808037, "step": 2292 }, { "batch_size": 4, "epoch": 0.9168, "step": 2292, "tokens_per_device": 4260 }, { "epoch": 0.9168, "loss_ce": 0.28185123205184937, "loss_lvr": 0.7229409217834473, "loss_mode_switch": 0.0, "loss_total": 0.3541453182697296, "step": 2292 }, { "batch_size": 4, "epoch": 0.9168, "step": 2292, "tokens_per_device": 1440 }, { "epoch": 0.9168, "loss_ce": 0.2594035565853119, "loss_lvr": 0.9000599980354309, "loss_mode_switch": 0.0, "loss_total": 0.3494095504283905, "step": 2292 }, { "batch_size": 4, "epoch": 0.9168, "step": 2292, "tokens_per_device": 2704 }, { "epoch": 0.9168, "loss_ce": 0.18596453964710236, "loss_lvr": 0.8171100616455078, "loss_mode_switch": 0.0, "loss_total": 0.2676755487918854, "step": 2292 }, { "batch_size": 4, "epoch": 0.9168, "step": 2292, "tokens_per_device": 5792 }, { "epoch": 0.9168, "loss_ce": 0.48725345730781555, "loss_lvr": 0.8870213627815247, "loss_mode_switch": 0.0, "loss_total": 0.5759555697441101, "step": 2292 }, { "batch_size": 4, "epoch": 0.9168, "step": 2292, "tokens_per_device": 5892 }, { "epoch": 0.9168, "loss_ce": 0.625679612159729, "loss_lvr": 0.8166267275810242, "loss_mode_switch": 0.0, "loss_total": 0.707342267036438, "step": 2292 }, { "epoch": 0.9172, "grad_norm": 1.4952590465545654, "learning_rate": 1.7871159137872573e-07, "loss": 0.3036, "step": 2293 }, { "batch_size": 4, "epoch": 0.9172, "step": 2293, "tokens_per_device": 4700 }, { "epoch": 0.9172, "loss_ce": 0.33336764574050903, "loss_lvr": 0.7920980453491211, "loss_mode_switch": 0.0, "loss_total": 0.41257745027542114, "step": 2293 }, { "batch_size": 4, "epoch": 0.9172, "step": 2293, "tokens_per_device": 4188 }, { "epoch": 0.9172, "loss_ce": 0.5590898990631104, "loss_lvr": 1.0018454790115356, "loss_mode_switch": 0.0, "loss_total": 0.6592744588851929, "step": 2293 }, { "batch_size": 1, "epoch": 0.9172, "step": 2293, "tokens_per_device": 4868 }, { "epoch": 0.9172, "loss_ce": 0.20582853257656097, "loss_lvr": 0.481646329164505, "loss_mode_switch": 0.0, "loss_total": 0.2539931535720825, "step": 2293 }, { "batch_size": 4, "epoch": 0.9172, "step": 2293, "tokens_per_device": 4264 }, { "epoch": 0.9172, "loss_ce": 0.13936759531497955, "loss_lvr": 0.9689133763313293, "loss_mode_switch": 0.0, "loss_total": 0.23625892400741577, "step": 2293 }, { "batch_size": 1, "epoch": 0.9172, "step": 2293, "tokens_per_device": 4940 }, { "epoch": 0.9172, "loss_ce": 0.06723155826330185, "loss_lvr": 0.5073845386505127, "loss_mode_switch": 0.0, "loss_total": 0.11797001212835312, "step": 2293 }, { "batch_size": 4, "epoch": 0.9172, "step": 2293, "tokens_per_device": 4156 }, { "epoch": 0.9172, "loss_ce": 0.16187353432178497, "loss_lvr": 0.776946485042572, "loss_mode_switch": 0.0, "loss_total": 0.23956817388534546, "step": 2293 }, { "batch_size": 4, "epoch": 0.9172, "step": 2293, "tokens_per_device": 2092 }, { "epoch": 0.9172, "loss_ce": 0.26185473799705505, "loss_lvr": 0.7622378468513489, "loss_mode_switch": 0.0, "loss_total": 0.3380785286426544, "step": 2293 }, { "batch_size": 1, "epoch": 0.9172, "step": 2293, "tokens_per_device": 4884 }, { "epoch": 0.9172, "loss_ce": 0.05800136923789978, "loss_lvr": 1.245227336883545, "loss_mode_switch": 0.0, "loss_total": 0.18252411484718323, "step": 2293 }, { "epoch": 0.9176, "grad_norm": 1.3415714502334595, "learning_rate": 1.7699931588731012e-07, "loss": 0.257, "step": 2294 }, { "batch_size": 1, "epoch": 0.9176, "step": 2294, "tokens_per_device": 5018 }, { "epoch": 0.9176, "loss_ce": 0.4437606930732727, "loss_lvr": 0.2666570842266083, "loss_mode_switch": 0.0, "loss_total": 0.47042641043663025, "step": 2294 }, { "batch_size": 4, "epoch": 0.9176, "step": 2294, "tokens_per_device": 4236 }, { "epoch": 0.9176, "loss_ce": 0.09400398284196854, "loss_lvr": 0.8966160416603088, "loss_mode_switch": 0.0, "loss_total": 0.18366558849811554, "step": 2294 }, { "batch_size": 4, "epoch": 0.9176, "step": 2294, "tokens_per_device": 1312 }, { "epoch": 0.9176, "loss_ce": 0.22090665996074677, "loss_lvr": 1.0678540468215942, "loss_mode_switch": 0.0, "loss_total": 0.32769206166267395, "step": 2294 }, { "batch_size": 4, "epoch": 0.9176, "step": 2294, "tokens_per_device": 1360 }, { "epoch": 0.9176, "loss_ce": 0.4736640751361847, "loss_lvr": 1.0507597923278809, "loss_mode_switch": 0.0, "loss_total": 0.5787400603294373, "step": 2294 }, { "batch_size": 4, "epoch": 0.9176, "step": 2294, "tokens_per_device": 11044 }, { "epoch": 0.9176, "loss_ce": 0.4279859960079193, "loss_lvr": 0.7145286202430725, "loss_mode_switch": 0.0, "loss_total": 0.4994388520717621, "step": 2294 }, { "batch_size": 4, "epoch": 0.9176, "step": 2294, "tokens_per_device": 4648 }, { "epoch": 0.9176, "loss_ce": 0.2822646200656891, "loss_lvr": 0.774674654006958, "loss_mode_switch": 0.0, "loss_total": 0.35973209142684937, "step": 2294 }, { "batch_size": 1, "epoch": 0.9176, "step": 2294, "tokens_per_device": 4915 }, { "epoch": 0.9176, "loss_ce": 0.0009427094482816756, "loss_lvr": 0.2618538439273834, "loss_mode_switch": 0.0, "loss_total": 0.02712809294462204, "step": 2294 }, { "batch_size": 4, "epoch": 0.9176, "step": 2294, "tokens_per_device": 5748 }, { "epoch": 0.9176, "loss_ce": 0.16492800414562225, "loss_lvr": 0.8396897912025452, "loss_mode_switch": 0.0, "loss_total": 0.248896986246109, "step": 2294 }, { "epoch": 0.918, "grad_norm": 1.2649949789047241, "learning_rate": 1.7529513496103322e-07, "loss": 0.2727, "step": 2295 }, { "batch_size": 4, "epoch": 0.918, "step": 2295, "tokens_per_device": 3448 }, { "epoch": 0.918, "loss_ce": 0.4342440664768219, "loss_lvr": 0.9324719905853271, "loss_mode_switch": 0.0, "loss_total": 0.5274912714958191, "step": 2295 }, { "batch_size": 4, "epoch": 0.918, "step": 2295, "tokens_per_device": 5224 }, { "epoch": 0.918, "loss_ce": 0.055607326328754425, "loss_lvr": 0.8152974247932434, "loss_mode_switch": 0.0, "loss_total": 0.13713707029819489, "step": 2295 }, { "batch_size": 1, "epoch": 0.918, "step": 2295, "tokens_per_device": 5185 }, { "epoch": 0.918, "loss_ce": 0.005850015673786402, "loss_lvr": 0.4203604757785797, "loss_mode_switch": 0.0, "loss_total": 0.047886066138744354, "step": 2295 }, { "batch_size": 4, "epoch": 0.918, "step": 2295, "tokens_per_device": 4236 }, { "epoch": 0.918, "loss_ce": 0.13506776094436646, "loss_lvr": 0.9400542974472046, "loss_mode_switch": 0.0, "loss_total": 0.2290731966495514, "step": 2295 }, { "batch_size": 4, "epoch": 0.918, "step": 2295, "tokens_per_device": 2676 }, { "epoch": 0.918, "loss_ce": 0.21303322911262512, "loss_lvr": 0.8534350991249084, "loss_mode_switch": 0.0, "loss_total": 0.29837673902511597, "step": 2295 }, { "batch_size": 4, "epoch": 0.918, "step": 2295, "tokens_per_device": 3760 }, { "epoch": 0.918, "loss_ce": 0.4068858325481415, "loss_lvr": 1.0354137420654297, "loss_mode_switch": 0.0, "loss_total": 0.5104272365570068, "step": 2295 }, { "batch_size": 4, "epoch": 0.918, "step": 2295, "tokens_per_device": 4244 }, { "epoch": 0.918, "loss_ce": 0.01525155734270811, "loss_lvr": 0.8155136108398438, "loss_mode_switch": 0.0, "loss_total": 0.09680292010307312, "step": 2295 }, { "batch_size": 1, "epoch": 0.918, "step": 2295, "tokens_per_device": 4788 }, { "epoch": 0.918, "loss_ce": 0.00016408247756771743, "loss_lvr": 0.2741677761077881, "loss_mode_switch": 0.0, "loss_total": 0.027580861002206802, "step": 2295 }, { "epoch": 0.9184, "grad_norm": 1.203286051750183, "learning_rate": 1.7359905146006607e-07, "loss": 0.2988, "step": 2296 }, { "batch_size": 4, "epoch": 0.9184, "step": 2296, "tokens_per_device": 1256 }, { "epoch": 0.9184, "loss_ce": 0.38234373927116394, "loss_lvr": 0.983025312423706, "loss_mode_switch": 0.0, "loss_total": 0.4806462824344635, "step": 2296 }, { "batch_size": 1, "epoch": 0.9184, "step": 2296, "tokens_per_device": 4910 }, { "epoch": 0.9184, "loss_ce": 0.0015360990073531866, "loss_lvr": 0.34182995557785034, "loss_mode_switch": 0.0, "loss_total": 0.035719092935323715, "step": 2296 }, { "batch_size": 4, "epoch": 0.9184, "step": 2296, "tokens_per_device": 8980 }, { "epoch": 0.9184, "loss_ce": 0.5611360669136047, "loss_lvr": 0.7513276934623718, "loss_mode_switch": 0.0, "loss_total": 0.6362688541412354, "step": 2296 }, { "batch_size": 4, "epoch": 0.9184, "step": 2296, "tokens_per_device": 3456 }, { "epoch": 0.9184, "loss_ce": 0.1911012828350067, "loss_lvr": 0.7580996751785278, "loss_mode_switch": 0.0, "loss_total": 0.26691126823425293, "step": 2296 }, { "batch_size": 1, "epoch": 0.9184, "step": 2296, "tokens_per_device": 5018 }, { "epoch": 0.9184, "loss_ce": 0.21631774306297302, "loss_lvr": 0.38617876172065735, "loss_mode_switch": 0.0, "loss_total": 0.254935622215271, "step": 2296 }, { "batch_size": 4, "epoch": 0.9184, "step": 2296, "tokens_per_device": 1356 }, { "epoch": 0.9184, "loss_ce": 0.2005556970834732, "loss_lvr": 0.7873660922050476, "loss_mode_switch": 0.0, "loss_total": 0.2792923152446747, "step": 2296 }, { "batch_size": 1, "epoch": 0.9184, "step": 2296, "tokens_per_device": 4982 }, { "epoch": 0.9184, "loss_ce": 0.12343782186508179, "loss_lvr": 0.219330832362175, "loss_mode_switch": 0.0, "loss_total": 0.14537090063095093, "step": 2296 }, { "batch_size": 4, "epoch": 0.9184, "step": 2296, "tokens_per_device": 1388 }, { "epoch": 0.9184, "loss_ce": 0.7492525577545166, "loss_lvr": 0.8819798827171326, "loss_mode_switch": 0.0, "loss_total": 0.8374505639076233, "step": 2296 }, { "epoch": 0.9188, "grad_norm": 1.239790439605713, "learning_rate": 1.719110682309888e-07, "loss": 0.2855, "step": 2297 }, { "batch_size": 4, "epoch": 0.9188, "step": 2297, "tokens_per_device": 2688 }, { "epoch": 0.9188, "loss_ce": 0.3052375912666321, "loss_lvr": 0.8170574903488159, "loss_mode_switch": 0.0, "loss_total": 0.38694334030151367, "step": 2297 }, { "batch_size": 4, "epoch": 0.9188, "step": 2297, "tokens_per_device": 1436 }, { "epoch": 0.9188, "loss_ce": 0.391033798456192, "loss_lvr": 0.7422634363174438, "loss_mode_switch": 0.0, "loss_total": 0.4652601480484009, "step": 2297 }, { "batch_size": 1, "epoch": 0.9188, "step": 2297, "tokens_per_device": 4872 }, { "epoch": 0.9188, "loss_ce": 0.05847768858075142, "loss_lvr": 0.32813695073127747, "loss_mode_switch": 0.0, "loss_total": 0.0912913829088211, "step": 2297 }, { "batch_size": 1, "epoch": 0.9188, "step": 2297, "tokens_per_device": 5014 }, { "epoch": 0.9188, "loss_ce": 0.4746500849723816, "loss_lvr": 0.2558852732181549, "loss_mode_switch": 0.0, "loss_total": 0.5002385973930359, "step": 2297 }, { "batch_size": 4, "epoch": 0.9188, "step": 2297, "tokens_per_device": 2892 }, { "epoch": 0.9188, "loss_ce": 0.4553888142108917, "loss_lvr": 0.6775544881820679, "loss_mode_switch": 0.0, "loss_total": 0.5231442451477051, "step": 2297 }, { "batch_size": 1, "epoch": 0.9188, "step": 2297, "tokens_per_device": 5106 }, { "epoch": 0.9188, "loss_ce": 0.0003187457623425871, "loss_lvr": 0.3743607699871063, "loss_mode_switch": 0.0, "loss_total": 0.03775482624769211, "step": 2297 }, { "batch_size": 4, "epoch": 0.9188, "step": 2297, "tokens_per_device": 1612 }, { "epoch": 0.9188, "loss_ce": 0.18960721790790558, "loss_lvr": 1.1480919122695923, "loss_mode_switch": 0.0, "loss_total": 0.3044164180755615, "step": 2297 }, { "batch_size": 4, "epoch": 0.9188, "step": 2297, "tokens_per_device": 3752 }, { "epoch": 0.9188, "loss_ce": 0.06202571466565132, "loss_lvr": 0.8717586994171143, "loss_mode_switch": 0.0, "loss_total": 0.14920158684253693, "step": 2297 }, { "epoch": 0.9192, "grad_norm": 1.8400059938430786, "learning_rate": 1.702311881067864e-07, "loss": 0.2504, "step": 2298 }, { "batch_size": 4, "epoch": 0.9192, "step": 2298, "tokens_per_device": 4364 }, { "epoch": 0.9192, "loss_ce": 0.2718735933303833, "loss_lvr": 0.7689772248268127, "loss_mode_switch": 0.0, "loss_total": 0.348771333694458, "step": 2298 }, { "batch_size": 1, "epoch": 0.9192, "step": 2298, "tokens_per_device": 5194 }, { "epoch": 0.9192, "loss_ce": 0.013962207362055779, "loss_lvr": 0.38746097683906555, "loss_mode_switch": 0.0, "loss_total": 0.052708305418491364, "step": 2298 }, { "batch_size": 4, "epoch": 0.9192, "step": 2298, "tokens_per_device": 4284 }, { "epoch": 0.9192, "loss_ce": 0.110586978495121, "loss_lvr": 0.6162083745002747, "loss_mode_switch": 0.0, "loss_total": 0.1722078174352646, "step": 2298 }, { "batch_size": 4, "epoch": 0.9192, "step": 2298, "tokens_per_device": 1324 }, { "epoch": 0.9192, "loss_ce": 0.26501619815826416, "loss_lvr": 1.126917839050293, "loss_mode_switch": 0.0, "loss_total": 0.37770798802375793, "step": 2298 }, { "batch_size": 1, "epoch": 0.9192, "step": 2298, "tokens_per_device": 4925 }, { "epoch": 0.9192, "loss_ce": 0.025486264377832413, "loss_lvr": 0.42230841517448425, "loss_mode_switch": 0.0, "loss_total": 0.06771710515022278, "step": 2298 }, { "batch_size": 4, "epoch": 0.9192, "step": 2298, "tokens_per_device": 3856 }, { "epoch": 0.9192, "loss_ce": 0.22858981788158417, "loss_lvr": 1.0612282752990723, "loss_mode_switch": 0.0, "loss_total": 0.3347126543521881, "step": 2298 }, { "batch_size": 4, "epoch": 0.9192, "step": 2298, "tokens_per_device": 4264 }, { "epoch": 0.9192, "loss_ce": 0.4859941303730011, "loss_lvr": 0.8273044228553772, "loss_mode_switch": 0.0, "loss_total": 0.5687245726585388, "step": 2298 }, { "batch_size": 4, "epoch": 0.9192, "step": 2298, "tokens_per_device": 4420 }, { "epoch": 0.9192, "loss_ce": 0.00247665005736053, "loss_lvr": 0.7260298728942871, "loss_mode_switch": 0.0, "loss_total": 0.07507963478565216, "step": 2298 }, { "epoch": 0.9196, "grad_norm": 1.2825098037719727, "learning_rate": 1.6855941390684415e-07, "loss": 0.2932, "step": 2299 }, { "batch_size": 1, "epoch": 0.9196, "step": 2299, "tokens_per_device": 4895 }, { "epoch": 0.9196, "loss_ce": 0.0002994826063513756, "loss_lvr": 0.17992740869522095, "loss_mode_switch": 0.0, "loss_total": 0.018292222172021866, "step": 2299 }, { "batch_size": 4, "epoch": 0.9196, "step": 2299, "tokens_per_device": 7452 }, { "epoch": 0.9196, "loss_ce": 0.13023750483989716, "loss_lvr": 0.7502498030662537, "loss_mode_switch": 0.0, "loss_total": 0.20526248216629028, "step": 2299 }, { "batch_size": 4, "epoch": 0.9196, "step": 2299, "tokens_per_device": 4400 }, { "epoch": 0.9196, "loss_ce": 0.5655531883239746, "loss_lvr": 0.8029912114143372, "loss_mode_switch": 0.0, "loss_total": 0.6458523273468018, "step": 2299 }, { "batch_size": 4, "epoch": 0.9196, "step": 2299, "tokens_per_device": 4212 }, { "epoch": 0.9196, "loss_ce": 0.41947972774505615, "loss_lvr": 0.9017751216888428, "loss_mode_switch": 0.0, "loss_total": 0.5096572637557983, "step": 2299 }, { "batch_size": 1, "epoch": 0.9196, "step": 2299, "tokens_per_device": 5104 }, { "epoch": 0.9196, "loss_ce": 0.004066310357302427, "loss_lvr": 0.34569159150123596, "loss_mode_switch": 0.0, "loss_total": 0.03863546997308731, "step": 2299 }, { "batch_size": 1, "epoch": 0.9196, "step": 2299, "tokens_per_device": 4957 }, { "epoch": 0.9196, "loss_ce": 0.0850880965590477, "loss_lvr": 0.43867889046669006, "loss_mode_switch": 0.0, "loss_total": 0.12895599007606506, "step": 2299 }, { "batch_size": 4, "epoch": 0.9196, "step": 2299, "tokens_per_device": 4272 }, { "epoch": 0.9196, "loss_ce": 0.042404986917972565, "loss_lvr": 0.8172813057899475, "loss_mode_switch": 0.0, "loss_total": 0.12413311749696732, "step": 2299 }, { "batch_size": 4, "epoch": 0.9196, "step": 2299, "tokens_per_device": 7796 }, { "epoch": 0.9196, "loss_ce": 0.4902164340019226, "loss_lvr": 0.7462782859802246, "loss_mode_switch": 0.0, "loss_total": 0.5648442506790161, "step": 2299 }, { "epoch": 0.92, "grad_norm": 1.2474360466003418, "learning_rate": 1.6689574843694433e-07, "loss": 0.2374, "step": 2300 }, { "batch_size": 1, "epoch": 0.92, "step": 2300, "tokens_per_device": 4890 }, { "epoch": 0.92, "loss_ce": 0.07181636244058609, "loss_lvr": 0.9605496525764465, "loss_mode_switch": 0.0, "loss_total": 0.16787132620811462, "step": 2300 }, { "batch_size": 4, "epoch": 0.92, "step": 2300, "tokens_per_device": 4620 }, { "epoch": 0.92, "loss_ce": 0.16210535168647766, "loss_lvr": 0.8535177707672119, "loss_mode_switch": 0.0, "loss_total": 0.2474571317434311, "step": 2300 }, { "batch_size": 4, "epoch": 0.92, "step": 2300, "tokens_per_device": 10808 }, { "epoch": 0.92, "loss_ce": 0.0027743768878281116, "loss_lvr": 0.747020423412323, "loss_mode_switch": 0.0, "loss_total": 0.07747642695903778, "step": 2300 }, { "batch_size": 1, "epoch": 0.92, "step": 2300, "tokens_per_device": 4910 }, { "epoch": 0.92, "loss_ce": 0.03622714430093765, "loss_lvr": 0.6543726921081543, "loss_mode_switch": 0.0, "loss_total": 0.10166441649198532, "step": 2300 }, { "batch_size": 4, "epoch": 0.92, "step": 2300, "tokens_per_device": 4712 }, { "epoch": 0.92, "loss_ce": 0.14301539957523346, "loss_lvr": 0.6856512427330017, "loss_mode_switch": 0.0, "loss_total": 0.21158051490783691, "step": 2300 }, { "batch_size": 4, "epoch": 0.92, "step": 2300, "tokens_per_device": 5420 }, { "epoch": 0.92, "loss_ce": 0.2358810007572174, "loss_lvr": 0.9836419820785522, "loss_mode_switch": 0.0, "loss_total": 0.3342452049255371, "step": 2300 }, { "batch_size": 1, "epoch": 0.92, "step": 2300, "tokens_per_device": 5310 }, { "epoch": 0.92, "loss_ce": 0.01425143051892519, "loss_lvr": 0.32342302799224854, "loss_mode_switch": 0.0, "loss_total": 0.04659373313188553, "step": 2300 }, { "batch_size": 1, "epoch": 0.92, "step": 2300, "tokens_per_device": 4977 }, { "epoch": 0.92, "loss_ce": 0.04040004312992096, "loss_lvr": 0.46001699566841125, "loss_mode_switch": 0.0, "loss_total": 0.08640174567699432, "step": 2300 }, { "epoch": 0.9204, "grad_norm": 1.4037402868270874, "learning_rate": 1.6524019448925788e-07, "loss": 0.2968, "step": 2301 }, { "batch_size": 1, "epoch": 0.9204, "step": 2301, "tokens_per_device": 6599 }, { "epoch": 0.9204, "loss_ce": 0.2541459798812866, "loss_lvr": 0.4405210018157959, "loss_mode_switch": 0.0, "loss_total": 0.29819807410240173, "step": 2301 }, { "batch_size": 4, "epoch": 0.9204, "step": 2301, "tokens_per_device": 2068 }, { "epoch": 0.9204, "loss_ce": 0.02975727804005146, "loss_lvr": 0.8842339515686035, "loss_mode_switch": 0.0, "loss_total": 0.11818066984415054, "step": 2301 }, { "batch_size": 1, "epoch": 0.9204, "step": 2301, "tokens_per_device": 4932 }, { "epoch": 0.9204, "loss_ce": 0.0014369995333254337, "loss_lvr": 0.5453406572341919, "loss_mode_switch": 0.0, "loss_total": 0.055971067398786545, "step": 2301 }, { "batch_size": 4, "epoch": 0.9204, "step": 2301, "tokens_per_device": 4540 }, { "epoch": 0.9204, "loss_ce": 0.1283443570137024, "loss_lvr": 0.7746665477752686, "loss_mode_switch": 0.0, "loss_total": 0.2058110237121582, "step": 2301 }, { "batch_size": 1, "epoch": 0.9204, "step": 2301, "tokens_per_device": 4789 }, { "epoch": 0.9204, "loss_ce": 0.00021799353999085724, "loss_lvr": 0.3232342600822449, "loss_mode_switch": 0.0, "loss_total": 0.0325414203107357, "step": 2301 }, { "batch_size": 4, "epoch": 0.9204, "step": 2301, "tokens_per_device": 2688 }, { "epoch": 0.9204, "loss_ce": 0.10094068944454193, "loss_lvr": 0.8504698872566223, "loss_mode_switch": 0.0, "loss_total": 0.1859876811504364, "step": 2301 }, { "batch_size": 4, "epoch": 0.9204, "step": 2301, "tokens_per_device": 8392 }, { "epoch": 0.9204, "loss_ce": 0.1834680736064911, "loss_lvr": 0.8801355361938477, "loss_mode_switch": 0.0, "loss_total": 0.27148163318634033, "step": 2301 }, { "batch_size": 4, "epoch": 0.9204, "step": 2301, "tokens_per_device": 1588 }, { "epoch": 0.9204, "loss_ce": 0.11860930919647217, "loss_lvr": 0.7656455039978027, "loss_mode_switch": 0.0, "loss_total": 0.19517385959625244, "step": 2301 }, { "epoch": 0.9208, "grad_norm": 1.2973895072937012, "learning_rate": 1.6359275484234495e-07, "loss": 0.2839, "step": 2302 }, { "batch_size": 4, "epoch": 0.9208, "step": 2302, "tokens_per_device": 4672 }, { "epoch": 0.9208, "loss_ce": 0.12056829035282135, "loss_lvr": 0.7183047533035278, "loss_mode_switch": 0.0, "loss_total": 0.19239875674247742, "step": 2302 }, { "batch_size": 4, "epoch": 0.9208, "step": 2302, "tokens_per_device": 3748 }, { "epoch": 0.9208, "loss_ce": 0.5009171962738037, "loss_lvr": 1.0086289644241333, "loss_mode_switch": 0.0, "loss_total": 0.601780116558075, "step": 2302 }, { "batch_size": 4, "epoch": 0.9208, "step": 2302, "tokens_per_device": 1552 }, { "epoch": 0.9208, "loss_ce": 0.4665642976760864, "loss_lvr": 0.8608672618865967, "loss_mode_switch": 0.0, "loss_total": 0.552651047706604, "step": 2302 }, { "batch_size": 1, "epoch": 0.9208, "step": 2302, "tokens_per_device": 4871 }, { "epoch": 0.9208, "loss_ce": 0.0038592626806348562, "loss_lvr": 0.20069566369056702, "loss_mode_switch": 0.0, "loss_total": 0.02392883040010929, "step": 2302 }, { "batch_size": 4, "epoch": 0.9208, "step": 2302, "tokens_per_device": 5088 }, { "epoch": 0.9208, "loss_ce": 0.5343263149261475, "loss_lvr": 0.637319028377533, "loss_mode_switch": 0.0, "loss_total": 0.5980582237243652, "step": 2302 }, { "batch_size": 4, "epoch": 0.9208, "step": 2302, "tokens_per_device": 10116 }, { "epoch": 0.9208, "loss_ce": 0.1581147313117981, "loss_lvr": 0.6950610876083374, "loss_mode_switch": 0.0, "loss_total": 0.22762084007263184, "step": 2302 }, { "batch_size": 1, "epoch": 0.9208, "step": 2302, "tokens_per_device": 5131 }, { "epoch": 0.9208, "loss_ce": 0.028292912989854813, "loss_lvr": 0.2829471826553345, "loss_mode_switch": 0.0, "loss_total": 0.05658762902021408, "step": 2302 }, { "batch_size": 4, "epoch": 0.9208, "step": 2302, "tokens_per_device": 4256 }, { "epoch": 0.9208, "loss_ce": 0.35052061080932617, "loss_lvr": 0.790482223033905, "loss_mode_switch": 0.0, "loss_total": 0.4295688271522522, "step": 2302 }, { "epoch": 0.9212, "grad_norm": 1.304444670677185, "learning_rate": 1.6195343226114492e-07, "loss": 0.2928, "step": 2303 }, { "batch_size": 4, "epoch": 0.9212, "step": 2303, "tokens_per_device": 4312 }, { "epoch": 0.9212, "loss_ce": 0.41388824582099915, "loss_lvr": 0.7998254895210266, "loss_mode_switch": 0.0, "loss_total": 0.4938707947731018, "step": 2303 }, { "batch_size": 4, "epoch": 0.9212, "step": 2303, "tokens_per_device": 4200 }, { "epoch": 0.9212, "loss_ce": 0.12265130132436752, "loss_lvr": 0.8760257363319397, "loss_mode_switch": 0.0, "loss_total": 0.21025387942790985, "step": 2303 }, { "batch_size": 1, "epoch": 0.9212, "step": 2303, "tokens_per_device": 5370 }, { "epoch": 0.9212, "loss_ce": 0.005753345787525177, "loss_lvr": 0.2549041211605072, "loss_mode_switch": 0.0, "loss_total": 0.031243758276104927, "step": 2303 }, { "batch_size": 4, "epoch": 0.9212, "step": 2303, "tokens_per_device": 4152 }, { "epoch": 0.9212, "loss_ce": 0.2788359820842743, "loss_lvr": 0.8501683473587036, "loss_mode_switch": 0.0, "loss_total": 0.3638528287410736, "step": 2303 }, { "batch_size": 4, "epoch": 0.9212, "step": 2303, "tokens_per_device": 3816 }, { "epoch": 0.9212, "loss_ce": 0.10061786323785782, "loss_lvr": 0.8909337520599365, "loss_mode_switch": 0.0, "loss_total": 0.18971124291419983, "step": 2303 }, { "batch_size": 4, "epoch": 0.9212, "step": 2303, "tokens_per_device": 8636 }, { "epoch": 0.9212, "loss_ce": 0.04332514852285385, "loss_lvr": 0.5042287111282349, "loss_mode_switch": 0.0, "loss_total": 0.09374801814556122, "step": 2303 }, { "batch_size": 4, "epoch": 0.9212, "step": 2303, "tokens_per_device": 4552 }, { "epoch": 0.9212, "loss_ce": 0.47974875569343567, "loss_lvr": 0.8969431519508362, "loss_mode_switch": 0.0, "loss_total": 0.5694430470466614, "step": 2303 }, { "batch_size": 1, "epoch": 0.9212, "step": 2303, "tokens_per_device": 4762 }, { "epoch": 0.9212, "loss_ce": 0.00940550584346056, "loss_lvr": 0.44472476840019226, "loss_mode_switch": 0.0, "loss_total": 0.05387798324227333, "step": 2303 }, { "epoch": 0.9216, "grad_norm": 1.161119818687439, "learning_rate": 1.6032222949697361e-07, "loss": 0.2306, "step": 2304 }, { "batch_size": 1, "epoch": 0.9216, "step": 2304, "tokens_per_device": 5118 }, { "epoch": 0.9216, "loss_ce": 0.14293861389160156, "loss_lvr": 0.45504850149154663, "loss_mode_switch": 0.0, "loss_total": 0.18844346702098846, "step": 2304 }, { "batch_size": 4, "epoch": 0.9216, "step": 2304, "tokens_per_device": 1528 }, { "epoch": 0.9216, "loss_ce": 0.8788329362869263, "loss_lvr": 1.0491033792495728, "loss_mode_switch": 0.0, "loss_total": 0.9837432503700256, "step": 2304 }, { "batch_size": 4, "epoch": 0.9216, "step": 2304, "tokens_per_device": 4008 }, { "epoch": 0.9216, "loss_ce": 0.3697468042373657, "loss_lvr": 0.8829972147941589, "loss_mode_switch": 0.0, "loss_total": 0.4580465257167816, "step": 2304 }, { "batch_size": 4, "epoch": 0.9216, "step": 2304, "tokens_per_device": 2672 }, { "epoch": 0.9216, "loss_ce": 0.11116259545087814, "loss_lvr": 0.6235771179199219, "loss_mode_switch": 0.0, "loss_total": 0.1735203117132187, "step": 2304 }, { "batch_size": 1, "epoch": 0.9216, "step": 2304, "tokens_per_device": 5031 }, { "epoch": 0.9216, "loss_ce": 0.3526197671890259, "loss_lvr": 0.38744863867759705, "loss_mode_switch": 0.0, "loss_total": 0.3913646340370178, "step": 2304 }, { "batch_size": 1, "epoch": 0.9216, "step": 2304, "tokens_per_device": 5165 }, { "epoch": 0.9216, "loss_ce": 0.0007202611886896193, "loss_lvr": 0.34185272455215454, "loss_mode_switch": 0.0, "loss_total": 0.034905534237623215, "step": 2304 }, { "batch_size": 4, "epoch": 0.9216, "step": 2304, "tokens_per_device": 3820 }, { "epoch": 0.9216, "loss_ce": 0.341686487197876, "loss_lvr": 0.8949458599090576, "loss_mode_switch": 0.0, "loss_total": 0.43118107318878174, "step": 2304 }, { "batch_size": 4, "epoch": 0.9216, "step": 2304, "tokens_per_device": 7576 }, { "epoch": 0.9216, "loss_ce": 0.051849957555532455, "loss_lvr": 0.6290664672851562, "loss_mode_switch": 0.0, "loss_total": 0.11475659906864166, "step": 2304 }, { "epoch": 0.922, "grad_norm": 1.3318108320236206, "learning_rate": 1.5869914928752117e-07, "loss": 0.2833, "step": 2305 }, { "batch_size": 4, "epoch": 0.922, "step": 2305, "tokens_per_device": 7640 }, { "epoch": 0.922, "loss_ce": 0.5069537162780762, "loss_lvr": 0.5616072416305542, "loss_mode_switch": 0.0, "loss_total": 0.5631144642829895, "step": 2305 }, { "batch_size": 1, "epoch": 0.922, "step": 2305, "tokens_per_device": 5163 }, { "epoch": 0.922, "loss_ce": 0.036165907979011536, "loss_lvr": 0.20792432129383087, "loss_mode_switch": 0.0, "loss_total": 0.05695834010839462, "step": 2305 }, { "batch_size": 4, "epoch": 0.922, "step": 2305, "tokens_per_device": 5832 }, { "epoch": 0.922, "loss_ce": 0.2038859724998474, "loss_lvr": 0.7334311008453369, "loss_mode_switch": 0.0, "loss_total": 0.27722907066345215, "step": 2305 }, { "batch_size": 4, "epoch": 0.922, "step": 2305, "tokens_per_device": 4248 }, { "epoch": 0.922, "loss_ce": 0.1968839466571808, "loss_lvr": 0.5331146717071533, "loss_mode_switch": 0.0, "loss_total": 0.2501954138278961, "step": 2305 }, { "batch_size": 4, "epoch": 0.922, "step": 2305, "tokens_per_device": 5812 }, { "epoch": 0.922, "loss_ce": 0.15702368319034576, "loss_lvr": 0.9959023594856262, "loss_mode_switch": 0.0, "loss_total": 0.25661391019821167, "step": 2305 }, { "batch_size": 4, "epoch": 0.922, "step": 2305, "tokens_per_device": 7016 }, { "epoch": 0.922, "loss_ce": 0.2223910391330719, "loss_lvr": 0.8051072955131531, "loss_mode_switch": 0.0, "loss_total": 0.3029017746448517, "step": 2305 }, { "batch_size": 4, "epoch": 0.922, "step": 2305, "tokens_per_device": 2972 }, { "epoch": 0.922, "loss_ce": 0.19291557371616364, "loss_lvr": 0.7899290919303894, "loss_mode_switch": 0.0, "loss_total": 0.2719084918498993, "step": 2305 }, { "batch_size": 1, "epoch": 0.922, "step": 2305, "tokens_per_device": 4951 }, { "epoch": 0.922, "loss_ce": 0.009414393454790115, "loss_lvr": 0.2809496819972992, "loss_mode_switch": 0.0, "loss_total": 0.037509359419345856, "step": 2305 }, { "epoch": 0.9224, "grad_norm": 1.327061653137207, "learning_rate": 1.5708419435684463e-07, "loss": 0.2756, "step": 2306 }, { "batch_size": 4, "epoch": 0.9224, "step": 2306, "tokens_per_device": 4444 }, { "epoch": 0.9224, "loss_ce": 0.20108065009117126, "loss_lvr": 0.8478119969367981, "loss_mode_switch": 0.0, "loss_total": 0.2858618497848511, "step": 2306 }, { "batch_size": 1, "epoch": 0.9224, "step": 2306, "tokens_per_device": 4978 }, { "epoch": 0.9224, "loss_ce": 0.04529285430908203, "loss_lvr": 0.4339442551136017, "loss_mode_switch": 0.0, "loss_total": 0.08868728578090668, "step": 2306 }, { "batch_size": 4, "epoch": 0.9224, "step": 2306, "tokens_per_device": 10628 }, { "epoch": 0.9224, "loss_ce": 0.1293633133172989, "loss_lvr": 1.0577878952026367, "loss_mode_switch": 0.0, "loss_total": 0.23514211177825928, "step": 2306 }, { "batch_size": 4, "epoch": 0.9224, "step": 2306, "tokens_per_device": 2612 }, { "epoch": 0.9224, "loss_ce": 0.5002831816673279, "loss_lvr": 0.799787700176239, "loss_mode_switch": 0.0, "loss_total": 0.5802619457244873, "step": 2306 }, { "batch_size": 1, "epoch": 0.9224, "step": 2306, "tokens_per_device": 4748 }, { "epoch": 0.9224, "loss_ce": 0.007244592532515526, "loss_lvr": 0.24568641185760498, "loss_mode_switch": 0.0, "loss_total": 0.031813234090805054, "step": 2306 }, { "batch_size": 1, "epoch": 0.9224, "step": 2306, "tokens_per_device": 4874 }, { "epoch": 0.9224, "loss_ce": 0.0002344062813790515, "loss_lvr": 0.41954949498176575, "loss_mode_switch": 0.0, "loss_total": 0.042189355939626694, "step": 2306 }, { "batch_size": 1, "epoch": 0.9224, "step": 2306, "tokens_per_device": 4707 }, { "epoch": 0.9224, "loss_ce": 0.08385653048753738, "loss_lvr": 0.26806774735450745, "loss_mode_switch": 0.0, "loss_total": 0.11066330969333649, "step": 2306 }, { "batch_size": 4, "epoch": 0.9224, "step": 2306, "tokens_per_device": 1572 }, { "epoch": 0.9224, "loss_ce": 0.45262423157691956, "loss_lvr": 0.8797351717948914, "loss_mode_switch": 0.0, "loss_total": 0.5405977368354797, "step": 2306 }, { "epoch": 0.9228, "grad_norm": 1.0722180604934692, "learning_rate": 1.5547736741536367e-07, "loss": 0.2119, "step": 2307 }, { "batch_size": 4, "epoch": 0.9228, "step": 2307, "tokens_per_device": 2776 }, { "epoch": 0.9228, "loss_ce": 0.5061200261116028, "loss_lvr": 0.7009314894676208, "loss_mode_switch": 0.0, "loss_total": 0.5762131810188293, "step": 2307 }, { "batch_size": 4, "epoch": 0.9228, "step": 2307, "tokens_per_device": 2680 }, { "epoch": 0.9228, "loss_ce": 0.02664320543408394, "loss_lvr": 0.47991830110549927, "loss_mode_switch": 0.0, "loss_total": 0.07463503628969193, "step": 2307 }, { "batch_size": 1, "epoch": 0.9228, "step": 2307, "tokens_per_device": 5166 }, { "epoch": 0.9228, "loss_ce": 0.05706215649843216, "loss_lvr": 0.45241779088974, "loss_mode_switch": 0.0, "loss_total": 0.10230393707752228, "step": 2307 }, { "batch_size": 1, "epoch": 0.9228, "step": 2307, "tokens_per_device": 6550 }, { "epoch": 0.9228, "loss_ce": 0.055805400013923645, "loss_lvr": 0.26704901456832886, "loss_mode_switch": 0.0, "loss_total": 0.08251029998064041, "step": 2307 }, { "batch_size": 4, "epoch": 0.9228, "step": 2307, "tokens_per_device": 4276 }, { "epoch": 0.9228, "loss_ce": 0.25005200505256653, "loss_lvr": 1.2896308898925781, "loss_mode_switch": 0.0, "loss_total": 0.37901508808135986, "step": 2307 }, { "batch_size": 1, "epoch": 0.9228, "step": 2307, "tokens_per_device": 5120 }, { "epoch": 0.9228, "loss_ce": 0.03643417730927467, "loss_lvr": 0.619907557964325, "loss_mode_switch": 0.0, "loss_total": 0.09842493385076523, "step": 2307 }, { "batch_size": 4, "epoch": 0.9228, "step": 2307, "tokens_per_device": 4440 }, { "epoch": 0.9228, "loss_ce": 0.2609233856201172, "loss_lvr": 0.8194228410720825, "loss_mode_switch": 0.0, "loss_total": 0.3428656756877899, "step": 2307 }, { "batch_size": 4, "epoch": 0.9228, "step": 2307, "tokens_per_device": 1404 }, { "epoch": 0.9228, "loss_ce": 0.26453492045402527, "loss_lvr": 0.8250412940979004, "loss_mode_switch": 0.0, "loss_total": 0.34703904390335083, "step": 2307 }, { "epoch": 0.9232, "grad_norm": 1.7272599935531616, "learning_rate": 1.5387867115985721e-07, "loss": 0.2989, "step": 2308 }, { "batch_size": 1, "epoch": 0.9232, "step": 2308, "tokens_per_device": 5016 }, { "epoch": 0.9232, "loss_ce": 0.021807940676808357, "loss_lvr": 0.3288916349411011, "loss_mode_switch": 0.0, "loss_total": 0.054697103798389435, "step": 2308 }, { "batch_size": 1, "epoch": 0.9232, "step": 2308, "tokens_per_device": 4881 }, { "epoch": 0.9232, "loss_ce": 0.009019897319376469, "loss_lvr": 1.5904046297073364, "loss_mode_switch": 0.0, "loss_total": 0.16806036233901978, "step": 2308 }, { "batch_size": 4, "epoch": 0.9232, "step": 2308, "tokens_per_device": 4680 }, { "epoch": 0.9232, "loss_ce": 0.14585041999816895, "loss_lvr": 0.7738414406776428, "loss_mode_switch": 0.0, "loss_total": 0.22323456406593323, "step": 2308 }, { "batch_size": 4, "epoch": 0.9232, "step": 2308, "tokens_per_device": 2588 }, { "epoch": 0.9232, "loss_ce": 0.1405077874660492, "loss_lvr": 0.9491721391677856, "loss_mode_switch": 0.0, "loss_total": 0.23542499542236328, "step": 2308 }, { "batch_size": 1, "epoch": 0.9232, "step": 2308, "tokens_per_device": 5209 }, { "epoch": 0.9232, "loss_ce": 0.5858762860298157, "loss_lvr": 0.3007993996143341, "loss_mode_switch": 0.0, "loss_total": 0.6159562468528748, "step": 2308 }, { "batch_size": 4, "epoch": 0.9232, "step": 2308, "tokens_per_device": 3440 }, { "epoch": 0.9232, "loss_ce": 0.11289720982313156, "loss_lvr": 0.952901303768158, "loss_mode_switch": 0.0, "loss_total": 0.20818734169006348, "step": 2308 }, { "batch_size": 1, "epoch": 0.9232, "step": 2308, "tokens_per_device": 5078 }, { "epoch": 0.9232, "loss_ce": 0.027201423421502113, "loss_lvr": 0.9280897378921509, "loss_mode_switch": 0.0, "loss_total": 0.12001039832830429, "step": 2308 }, { "batch_size": 1, "epoch": 0.9232, "step": 2308, "tokens_per_device": 4819 }, { "epoch": 0.9232, "loss_ce": 0.00567134004086256, "loss_lvr": 0.2896063029766083, "loss_mode_switch": 0.0, "loss_total": 0.03463197126984596, "step": 2308 }, { "epoch": 0.9236, "grad_norm": 1.192423701286316, "learning_rate": 1.522881082734584e-07, "loss": 0.2637, "step": 2309 }, { "batch_size": 4, "epoch": 0.9236, "step": 2309, "tokens_per_device": 2600 }, { "epoch": 0.9236, "loss_ce": 0.12125495076179504, "loss_lvr": 0.7385908961296082, "loss_mode_switch": 0.0, "loss_total": 0.19511404633522034, "step": 2309 }, { "batch_size": 4, "epoch": 0.9236, "step": 2309, "tokens_per_device": 3600 }, { "epoch": 0.9236, "loss_ce": 0.4978872239589691, "loss_lvr": 1.150134801864624, "loss_mode_switch": 0.0, "loss_total": 0.6129007339477539, "step": 2309 }, { "batch_size": 4, "epoch": 0.9236, "step": 2309, "tokens_per_device": 4204 }, { "epoch": 0.9236, "loss_ce": 0.07250085473060608, "loss_lvr": 0.7836621403694153, "loss_mode_switch": 0.0, "loss_total": 0.15086707472801208, "step": 2309 }, { "batch_size": 1, "epoch": 0.9236, "step": 2309, "tokens_per_device": 4906 }, { "epoch": 0.9236, "loss_ce": 0.01122790202498436, "loss_lvr": 0.2917400002479553, "loss_mode_switch": 0.0, "loss_total": 0.04040190204977989, "step": 2309 }, { "batch_size": 4, "epoch": 0.9236, "step": 2309, "tokens_per_device": 5112 }, { "epoch": 0.9236, "loss_ce": 0.42150840163230896, "loss_lvr": 1.2304154634475708, "loss_mode_switch": 0.0, "loss_total": 0.5445499420166016, "step": 2309 }, { "batch_size": 1, "epoch": 0.9236, "step": 2309, "tokens_per_device": 5151 }, { "epoch": 0.9236, "loss_ce": 0.10473871231079102, "loss_lvr": 0.30800554156303406, "loss_mode_switch": 0.0, "loss_total": 0.13553926348686218, "step": 2309 }, { "batch_size": 1, "epoch": 0.9236, "step": 2309, "tokens_per_device": 5293 }, { "epoch": 0.9236, "loss_ce": 1.2421283721923828, "loss_lvr": 0.4974762797355652, "loss_mode_switch": 0.0, "loss_total": 1.291875958442688, "step": 2309 }, { "batch_size": 4, "epoch": 0.9236, "step": 2309, "tokens_per_device": 4216 }, { "epoch": 0.9236, "loss_ce": 0.28142738342285156, "loss_lvr": 0.8801566958427429, "loss_mode_switch": 0.0, "loss_total": 0.36944305896759033, "step": 2309 }, { "epoch": 0.924, "grad_norm": 1.3734084367752075, "learning_rate": 1.5070568142564912e-07, "loss": 0.3142, "step": 2310 }, { "batch_size": 1, "epoch": 0.924, "step": 2310, "tokens_per_device": 5030 }, { "epoch": 0.924, "loss_ce": 0.049249205738306046, "loss_lvr": 0.3575284481048584, "loss_mode_switch": 0.0, "loss_total": 0.08500204980373383, "step": 2310 }, { "batch_size": 4, "epoch": 0.924, "step": 2310, "tokens_per_device": 4904 }, { "epoch": 0.924, "loss_ce": 0.08393511921167374, "loss_lvr": 0.7471767663955688, "loss_mode_switch": 0.0, "loss_total": 0.15865279734134674, "step": 2310 }, { "batch_size": 4, "epoch": 0.924, "step": 2310, "tokens_per_device": 4480 }, { "epoch": 0.924, "loss_ce": 0.4974195659160614, "loss_lvr": 0.7559587359428406, "loss_mode_switch": 0.0, "loss_total": 0.5730154514312744, "step": 2310 }, { "batch_size": 4, "epoch": 0.924, "step": 2310, "tokens_per_device": 1716 }, { "epoch": 0.924, "loss_ce": 0.22986383736133575, "loss_lvr": 0.8705633282661438, "loss_mode_switch": 0.0, "loss_total": 0.3169201612472534, "step": 2310 }, { "batch_size": 1, "epoch": 0.924, "step": 2310, "tokens_per_device": 5428 }, { "epoch": 0.924, "loss_ce": 0.0056455861777067184, "loss_lvr": 0.29391729831695557, "loss_mode_switch": 0.0, "loss_total": 0.035037316381931305, "step": 2310 }, { "batch_size": 1, "epoch": 0.924, "step": 2310, "tokens_per_device": 5176 }, { "epoch": 0.924, "loss_ce": 0.001962939975783229, "loss_lvr": 0.2791260778903961, "loss_mode_switch": 0.0, "loss_total": 0.02987554669380188, "step": 2310 }, { "batch_size": 4, "epoch": 0.924, "step": 2310, "tokens_per_device": 4596 }, { "epoch": 0.924, "loss_ce": 0.06634053587913513, "loss_lvr": 0.9346766471862793, "loss_mode_switch": 0.0, "loss_total": 0.1598082035779953, "step": 2310 }, { "batch_size": 4, "epoch": 0.924, "step": 2310, "tokens_per_device": 3848 }, { "epoch": 0.924, "loss_ce": 0.23971417546272278, "loss_lvr": 0.8663204312324524, "loss_mode_switch": 0.0, "loss_total": 0.326346218585968, "step": 2310 }, { "epoch": 0.9244, "grad_norm": 1.2187191247940063, "learning_rate": 1.4913139327225546e-07, "loss": 0.2748, "step": 2311 }, { "batch_size": 4, "epoch": 0.9244, "step": 2311, "tokens_per_device": 4384 }, { "epoch": 0.9244, "loss_ce": 0.3814861476421356, "loss_lvr": 1.2203916311264038, "loss_mode_switch": 0.0, "loss_total": 0.5035253167152405, "step": 2311 }, { "batch_size": 4, "epoch": 0.9244, "step": 2311, "tokens_per_device": 3364 }, { "epoch": 0.9244, "loss_ce": 0.000266539107542485, "loss_lvr": 0.7293931245803833, "loss_mode_switch": 0.0, "loss_total": 0.0732058510184288, "step": 2311 }, { "batch_size": 1, "epoch": 0.9244, "step": 2311, "tokens_per_device": 4906 }, { "epoch": 0.9244, "loss_ce": 0.2260131686925888, "loss_lvr": 0.26051756739616394, "loss_mode_switch": 0.0, "loss_total": 0.25206491351127625, "step": 2311 }, { "batch_size": 1, "epoch": 0.9244, "step": 2311, "tokens_per_device": 5139 }, { "epoch": 0.9244, "loss_ce": 0.014021812006831169, "loss_lvr": 0.39698526263237, "loss_mode_switch": 0.0, "loss_total": 0.05372034013271332, "step": 2311 }, { "batch_size": 4, "epoch": 0.9244, "step": 2311, "tokens_per_device": 4592 }, { "epoch": 0.9244, "loss_ce": 0.4507417678833008, "loss_lvr": 0.7641903758049011, "loss_mode_switch": 0.0, "loss_total": 0.5271608233451843, "step": 2311 }, { "batch_size": 4, "epoch": 0.9244, "step": 2311, "tokens_per_device": 4908 }, { "epoch": 0.9244, "loss_ce": 0.15235261619091034, "loss_lvr": 0.5520187616348267, "loss_mode_switch": 0.0, "loss_total": 0.20755448937416077, "step": 2311 }, { "batch_size": 1, "epoch": 0.9244, "step": 2311, "tokens_per_device": 5165 }, { "epoch": 0.9244, "loss_ce": 0.26346030831336975, "loss_lvr": 0.5582262277603149, "loss_mode_switch": 0.0, "loss_total": 0.3192829191684723, "step": 2311 }, { "batch_size": 4, "epoch": 0.9244, "step": 2311, "tokens_per_device": 1336 }, { "epoch": 0.9244, "loss_ce": 0.32820332050323486, "loss_lvr": 1.0379984378814697, "loss_mode_switch": 0.0, "loss_total": 0.4320031702518463, "step": 2311 }, { "epoch": 0.9248, "grad_norm": 1.247722864151001, "learning_rate": 1.475652464554478e-07, "loss": 0.2732, "step": 2312 }, { "batch_size": 1, "epoch": 0.9248, "step": 2312, "tokens_per_device": 4991 }, { "epoch": 0.9248, "loss_ce": 0.006403876002877951, "loss_lvr": 0.17762209475040436, "loss_mode_switch": 0.0, "loss_total": 0.024166086688637733, "step": 2312 }, { "batch_size": 4, "epoch": 0.9248, "step": 2312, "tokens_per_device": 3828 }, { "epoch": 0.9248, "loss_ce": 0.06051211431622505, "loss_lvr": 1.065542459487915, "loss_mode_switch": 0.0, "loss_total": 0.16706636548042297, "step": 2312 }, { "batch_size": 1, "epoch": 0.9248, "step": 2312, "tokens_per_device": 4874 }, { "epoch": 0.9248, "loss_ce": 0.0037866958882659674, "loss_lvr": 0.2512635290622711, "loss_mode_switch": 0.0, "loss_total": 0.028913049027323723, "step": 2312 }, { "batch_size": 4, "epoch": 0.9248, "step": 2312, "tokens_per_device": 4748 }, { "epoch": 0.9248, "loss_ce": 0.48231959342956543, "loss_lvr": 0.8901765942573547, "loss_mode_switch": 0.0, "loss_total": 0.5713372230529785, "step": 2312 }, { "batch_size": 1, "epoch": 0.9248, "step": 2312, "tokens_per_device": 4870 }, { "epoch": 0.9248, "loss_ce": 0.011816625483334064, "loss_lvr": 0.16178016364574432, "loss_mode_switch": 0.0, "loss_total": 0.02799464389681816, "step": 2312 }, { "batch_size": 4, "epoch": 0.9248, "step": 2312, "tokens_per_device": 4360 }, { "epoch": 0.9248, "loss_ce": 0.09637128561735153, "loss_lvr": 0.859738290309906, "loss_mode_switch": 0.0, "loss_total": 0.18234512209892273, "step": 2312 }, { "batch_size": 4, "epoch": 0.9248, "step": 2312, "tokens_per_device": 4340 }, { "epoch": 0.9248, "loss_ce": 0.024453960359096527, "loss_lvr": 0.9590240716934204, "loss_mode_switch": 0.0, "loss_total": 0.12035636603832245, "step": 2312 }, { "batch_size": 4, "epoch": 0.9248, "step": 2312, "tokens_per_device": 5728 }, { "epoch": 0.9248, "loss_ce": 0.03625204414129257, "loss_lvr": 1.6590704917907715, "loss_mode_switch": 0.0, "loss_total": 0.2021591067314148, "step": 2312 }, { "epoch": 0.9252, "grad_norm": 1.108476161956787, "learning_rate": 1.4600724360372853e-07, "loss": 0.224, "step": 2313 }, { "batch_size": 4, "epoch": 0.9252, "step": 2313, "tokens_per_device": 1316 }, { "epoch": 0.9252, "loss_ce": 0.3394130766391754, "loss_lvr": 0.8644635677337646, "loss_mode_switch": 0.0, "loss_total": 0.4258594512939453, "step": 2313 }, { "batch_size": 1, "epoch": 0.9252, "step": 2313, "tokens_per_device": 4866 }, { "epoch": 0.9252, "loss_ce": 0.09837222099304199, "loss_lvr": 0.4507412016391754, "loss_mode_switch": 0.0, "loss_total": 0.14344634115695953, "step": 2313 }, { "batch_size": 4, "epoch": 0.9252, "step": 2313, "tokens_per_device": 2568 }, { "epoch": 0.9252, "loss_ce": 0.040122032165527344, "loss_lvr": 0.6674848198890686, "loss_mode_switch": 0.0, "loss_total": 0.10687051713466644, "step": 2313 }, { "batch_size": 1, "epoch": 0.9252, "step": 2313, "tokens_per_device": 5023 }, { "epoch": 0.9252, "loss_ce": 0.12082123756408691, "loss_lvr": 0.19128693640232086, "loss_mode_switch": 0.0, "loss_total": 0.13994993269443512, "step": 2313 }, { "batch_size": 4, "epoch": 0.9252, "step": 2313, "tokens_per_device": 5736 }, { "epoch": 0.9252, "loss_ce": 0.05959991738200188, "loss_lvr": 0.5058811902999878, "loss_mode_switch": 0.0, "loss_total": 0.11018803715705872, "step": 2313 }, { "batch_size": 4, "epoch": 0.9252, "step": 2313, "tokens_per_device": 13316 }, { "epoch": 0.9252, "loss_ce": 0.13276147842407227, "loss_lvr": 0.47637397050857544, "loss_mode_switch": 0.0, "loss_total": 0.1803988814353943, "step": 2313 }, { "batch_size": 1, "epoch": 0.9252, "step": 2313, "tokens_per_device": 5028 }, { "epoch": 0.9252, "loss_ce": 0.000565295631531626, "loss_lvr": 0.2297893762588501, "loss_mode_switch": 0.0, "loss_total": 0.023544233292341232, "step": 2313 }, { "batch_size": 1, "epoch": 0.9252, "step": 2313, "tokens_per_device": 4855 }, { "epoch": 0.9252, "loss_ce": 0.13560281693935394, "loss_lvr": 0.2787575125694275, "loss_mode_switch": 0.0, "loss_total": 0.1634785681962967, "step": 2313 }, { "epoch": 0.9256, "grad_norm": 1.405836820602417, "learning_rate": 1.4445738733193494e-07, "loss": 0.3129, "step": 2314 }, { "batch_size": 1, "epoch": 0.9256, "step": 2314, "tokens_per_device": 4884 }, { "epoch": 0.9256, "loss_ce": 0.00019270066695753485, "loss_lvr": 0.21442890167236328, "loss_mode_switch": 0.0, "loss_total": 0.021635590121150017, "step": 2314 }, { "batch_size": 4, "epoch": 0.9256, "step": 2314, "tokens_per_device": 3940 }, { "epoch": 0.9256, "loss_ce": 0.37040919065475464, "loss_lvr": 1.6171103715896606, "loss_mode_switch": 0.0, "loss_total": 0.5321202278137207, "step": 2314 }, { "batch_size": 1, "epoch": 0.9256, "step": 2314, "tokens_per_device": 6632 }, { "epoch": 0.9256, "loss_ce": 0.20811428129673004, "loss_lvr": 0.24561162292957306, "loss_mode_switch": 0.0, "loss_total": 0.2326754480600357, "step": 2314 }, { "batch_size": 1, "epoch": 0.9256, "step": 2314, "tokens_per_device": 5110 }, { "epoch": 0.9256, "loss_ce": 0.024793829768896103, "loss_lvr": 0.4313293695449829, "loss_mode_switch": 0.0, "loss_total": 0.06792676448822021, "step": 2314 }, { "batch_size": 1, "epoch": 0.9256, "step": 2314, "tokens_per_device": 4875 }, { "epoch": 0.9256, "loss_ce": 0.007576724048703909, "loss_lvr": 0.4824933111667633, "loss_mode_switch": 0.0, "loss_total": 0.05582605302333832, "step": 2314 }, { "batch_size": 4, "epoch": 0.9256, "step": 2314, "tokens_per_device": 1324 }, { "epoch": 0.9256, "loss_ce": 0.4394887387752533, "loss_lvr": 1.0826033353805542, "loss_mode_switch": 0.0, "loss_total": 0.5477490425109863, "step": 2314 }, { "batch_size": 4, "epoch": 0.9256, "step": 2314, "tokens_per_device": 4248 }, { "epoch": 0.9256, "loss_ce": 0.43681007623672485, "loss_lvr": 0.7560722231864929, "loss_mode_switch": 0.0, "loss_total": 0.5124173164367676, "step": 2314 }, { "batch_size": 4, "epoch": 0.9256, "step": 2314, "tokens_per_device": 4652 }, { "epoch": 0.9256, "loss_ce": 0.13307644426822662, "loss_lvr": 0.770048975944519, "loss_mode_switch": 0.0, "loss_total": 0.2100813388824463, "step": 2314 }, { "epoch": 0.926, "grad_norm": 1.0386536121368408, "learning_rate": 1.4291568024122848e-07, "loss": 0.2069, "step": 2315 }, { "batch_size": 4, "epoch": 0.926, "step": 2315, "tokens_per_device": 4332 }, { "epoch": 0.926, "loss_ce": 0.01521716546267271, "loss_lvr": 0.6443729996681213, "loss_mode_switch": 0.0, "loss_total": 0.07965446263551712, "step": 2315 }, { "batch_size": 4, "epoch": 0.926, "step": 2315, "tokens_per_device": 4220 }, { "epoch": 0.926, "loss_ce": 0.2625664472579956, "loss_lvr": 1.0475517511367798, "loss_mode_switch": 0.0, "loss_total": 0.36732161045074463, "step": 2315 }, { "batch_size": 4, "epoch": 0.926, "step": 2315, "tokens_per_device": 4368 }, { "epoch": 0.926, "loss_ce": 0.5990091562271118, "loss_lvr": 0.9433504939079285, "loss_mode_switch": 0.0, "loss_total": 0.693344235420227, "step": 2315 }, { "batch_size": 4, "epoch": 0.926, "step": 2315, "tokens_per_device": 6584 }, { "epoch": 0.926, "loss_ce": 0.0875718742609024, "loss_lvr": 0.6919827461242676, "loss_mode_switch": 0.0, "loss_total": 0.15677013993263245, "step": 2315 }, { "batch_size": 1, "epoch": 0.926, "step": 2315, "tokens_per_device": 4878 }, { "epoch": 0.926, "loss_ce": 0.3074730932712555, "loss_lvr": 0.359839528799057, "loss_mode_switch": 0.0, "loss_total": 0.34345704317092896, "step": 2315 }, { "batch_size": 4, "epoch": 0.926, "step": 2315, "tokens_per_device": 2528 }, { "epoch": 0.926, "loss_ce": 0.11217968910932541, "loss_lvr": 0.7897331714630127, "loss_mode_switch": 0.0, "loss_total": 0.19115300476551056, "step": 2315 }, { "batch_size": 4, "epoch": 0.926, "step": 2315, "tokens_per_device": 2672 }, { "epoch": 0.926, "loss_ce": 0.2606807351112366, "loss_lvr": 0.8507741093635559, "loss_mode_switch": 0.0, "loss_total": 0.3457581400871277, "step": 2315 }, { "batch_size": 4, "epoch": 0.926, "step": 2315, "tokens_per_device": 5776 }, { "epoch": 0.926, "loss_ce": 0.08258248120546341, "loss_lvr": 0.5324276089668274, "loss_mode_switch": 0.0, "loss_total": 0.1358252465724945, "step": 2315 }, { "epoch": 0.9264, "grad_norm": 1.305841326713562, "learning_rate": 1.4138212491909776e-07, "loss": 0.2686, "step": 2316 }, { "batch_size": 1, "epoch": 0.9264, "step": 2316, "tokens_per_device": 4869 }, { "epoch": 0.9264, "loss_ce": 0.0021552860271185637, "loss_lvr": 0.22888940572738647, "loss_mode_switch": 0.0, "loss_total": 0.02504422701895237, "step": 2316 }, { "batch_size": 1, "epoch": 0.9264, "step": 2316, "tokens_per_device": 5468 }, { "epoch": 0.9264, "loss_ce": 0.20501352846622467, "loss_lvr": 0.21650291979312897, "loss_mode_switch": 0.0, "loss_total": 0.22666382789611816, "step": 2316 }, { "batch_size": 4, "epoch": 0.9264, "step": 2316, "tokens_per_device": 13036 }, { "epoch": 0.9264, "loss_ce": 0.2109842151403427, "loss_lvr": 0.577156126499176, "loss_mode_switch": 0.0, "loss_total": 0.2686998248100281, "step": 2316 }, { "batch_size": 4, "epoch": 0.9264, "step": 2316, "tokens_per_device": 3796 }, { "epoch": 0.9264, "loss_ce": 0.9123578667640686, "loss_lvr": 1.0167008638381958, "loss_mode_switch": 0.0, "loss_total": 1.0140279531478882, "step": 2316 }, { "batch_size": 4, "epoch": 0.9264, "step": 2316, "tokens_per_device": 4244 }, { "epoch": 0.9264, "loss_ce": 0.32844221591949463, "loss_lvr": 0.6358184218406677, "loss_mode_switch": 0.0, "loss_total": 0.39202407002449036, "step": 2316 }, { "batch_size": 1, "epoch": 0.9264, "step": 2316, "tokens_per_device": 4947 }, { "epoch": 0.9264, "loss_ce": 0.036096930503845215, "loss_lvr": 0.38266679644584656, "loss_mode_switch": 0.0, "loss_total": 0.07436361163854599, "step": 2316 }, { "batch_size": 4, "epoch": 0.9264, "step": 2316, "tokens_per_device": 10328 }, { "epoch": 0.9264, "loss_ce": 0.0465405210852623, "loss_lvr": 0.9338993430137634, "loss_mode_switch": 0.0, "loss_total": 0.13993045687675476, "step": 2316 }, { "batch_size": 1, "epoch": 0.9264, "step": 2316, "tokens_per_device": 5165 }, { "epoch": 0.9264, "loss_ce": 8.920008258428425e-05, "loss_lvr": 0.3352140188217163, "loss_mode_switch": 0.0, "loss_total": 0.03361060097813606, "step": 2316 }, { "epoch": 0.9268, "grad_norm": 1.4149335622787476, "learning_rate": 1.3985672393934557e-07, "loss": 0.3148, "step": 2317 }, { "batch_size": 4, "epoch": 0.9268, "step": 2317, "tokens_per_device": 4020 }, { "epoch": 0.9268, "loss_ce": 0.14529305696487427, "loss_lvr": 0.6156476140022278, "loss_mode_switch": 0.0, "loss_total": 0.2068578153848648, "step": 2317 }, { "batch_size": 1, "epoch": 0.9268, "step": 2317, "tokens_per_device": 5190 }, { "epoch": 0.9268, "loss_ce": 0.1256246566772461, "loss_lvr": 0.2591801881790161, "loss_mode_switch": 0.0, "loss_total": 0.15154267847537994, "step": 2317 }, { "batch_size": 1, "epoch": 0.9268, "step": 2317, "tokens_per_device": 5130 }, { "epoch": 0.9268, "loss_ce": 0.19881145656108856, "loss_lvr": 0.3351539373397827, "loss_mode_switch": 0.0, "loss_total": 0.23232685029506683, "step": 2317 }, { "batch_size": 4, "epoch": 0.9268, "step": 2317, "tokens_per_device": 4468 }, { "epoch": 0.9268, "loss_ce": 0.014769518747925758, "loss_lvr": 0.6006104946136475, "loss_mode_switch": 0.0, "loss_total": 0.0748305693268776, "step": 2317 }, { "batch_size": 1, "epoch": 0.9268, "step": 2317, "tokens_per_device": 4899 }, { "epoch": 0.9268, "loss_ce": 0.06764260679483414, "loss_lvr": 0.3986024260520935, "loss_mode_switch": 0.0, "loss_total": 0.10750284790992737, "step": 2317 }, { "batch_size": 4, "epoch": 0.9268, "step": 2317, "tokens_per_device": 1340 }, { "epoch": 0.9268, "loss_ce": 0.28600209951400757, "loss_lvr": 0.8355413675308228, "loss_mode_switch": 0.0, "loss_total": 0.3695562481880188, "step": 2317 }, { "batch_size": 4, "epoch": 0.9268, "step": 2317, "tokens_per_device": 10976 }, { "epoch": 0.9268, "loss_ce": 0.29871681332588196, "loss_lvr": 0.7115683555603027, "loss_mode_switch": 0.0, "loss_total": 0.36987364292144775, "step": 2317 }, { "batch_size": 4, "epoch": 0.9268, "step": 2317, "tokens_per_device": 2612 }, { "epoch": 0.9268, "loss_ce": 0.04097885265946388, "loss_lvr": 0.6961334347724915, "loss_mode_switch": 0.0, "loss_total": 0.11059220135211945, "step": 2317 }, { "epoch": 0.9272, "grad_norm": 1.3282102346420288, "learning_rate": 1.383394798620935e-07, "loss": 0.2858, "step": 2318 }, { "batch_size": 4, "epoch": 0.9272, "step": 2318, "tokens_per_device": 6052 }, { "epoch": 0.9272, "loss_ce": 0.17966094613075256, "loss_lvr": 0.9803025126457214, "loss_mode_switch": 0.0, "loss_total": 0.27769118547439575, "step": 2318 }, { "batch_size": 4, "epoch": 0.9272, "step": 2318, "tokens_per_device": 4388 }, { "epoch": 0.9272, "loss_ce": 0.0226285420358181, "loss_lvr": 0.9272008538246155, "loss_mode_switch": 0.0, "loss_total": 0.11534862220287323, "step": 2318 }, { "batch_size": 4, "epoch": 0.9272, "step": 2318, "tokens_per_device": 3844 }, { "epoch": 0.9272, "loss_ce": 0.4131431579589844, "loss_lvr": 0.8539919257164001, "loss_mode_switch": 0.0, "loss_total": 0.4985423684120178, "step": 2318 }, { "batch_size": 4, "epoch": 0.9272, "step": 2318, "tokens_per_device": 4704 }, { "epoch": 0.9272, "loss_ce": 0.3406813442707062, "loss_lvr": 0.8620613813400269, "loss_mode_switch": 0.0, "loss_total": 0.42688748240470886, "step": 2318 }, { "batch_size": 1, "epoch": 0.9272, "step": 2318, "tokens_per_device": 5126 }, { "epoch": 0.9272, "loss_ce": 0.22457355260849, "loss_lvr": 0.5072559118270874, "loss_mode_switch": 0.0, "loss_total": 0.2752991318702698, "step": 2318 }, { "batch_size": 4, "epoch": 0.9272, "step": 2318, "tokens_per_device": 8040 }, { "epoch": 0.9272, "loss_ce": 0.10847264528274536, "loss_lvr": 0.4624914526939392, "loss_mode_switch": 0.0, "loss_total": 0.15472179651260376, "step": 2318 }, { "batch_size": 4, "epoch": 0.9272, "step": 2318, "tokens_per_device": 2660 }, { "epoch": 0.9272, "loss_ce": 0.23453541100025177, "loss_lvr": 0.794679582118988, "loss_mode_switch": 0.0, "loss_total": 0.3140033781528473, "step": 2318 }, { "batch_size": 4, "epoch": 0.9272, "step": 2318, "tokens_per_device": 4324 }, { "epoch": 0.9272, "loss_ce": 0.6765151023864746, "loss_lvr": 0.9493862390518188, "loss_mode_switch": 0.0, "loss_total": 0.7714537382125854, "step": 2318 }, { "epoch": 0.9276, "grad_norm": 1.4148938655853271, "learning_rate": 1.3683039523376962e-07, "loss": 0.3089, "step": 2319 }, { "batch_size": 4, "epoch": 0.9276, "step": 2319, "tokens_per_device": 4156 }, { "epoch": 0.9276, "loss_ce": 0.12901759147644043, "loss_lvr": 0.7265351414680481, "loss_mode_switch": 0.0, "loss_total": 0.20167110860347748, "step": 2319 }, { "batch_size": 4, "epoch": 0.9276, "step": 2319, "tokens_per_device": 7304 }, { "epoch": 0.9276, "loss_ce": 0.23069791495800018, "loss_lvr": 0.7176700234413147, "loss_mode_switch": 0.0, "loss_total": 0.30246490240097046, "step": 2319 }, { "batch_size": 4, "epoch": 0.9276, "step": 2319, "tokens_per_device": 5204 }, { "epoch": 0.9276, "loss_ce": 0.10417161136865616, "loss_lvr": 0.7801191806793213, "loss_mode_switch": 0.0, "loss_total": 0.18218353390693665, "step": 2319 }, { "batch_size": 4, "epoch": 0.9276, "step": 2319, "tokens_per_device": 4212 }, { "epoch": 0.9276, "loss_ce": 0.04129907116293907, "loss_lvr": 0.9189448952674866, "loss_mode_switch": 0.0, "loss_total": 0.13319356739521027, "step": 2319 }, { "batch_size": 4, "epoch": 0.9276, "step": 2319, "tokens_per_device": 3772 }, { "epoch": 0.9276, "loss_ce": 0.35646119713783264, "loss_lvr": 0.8713164329528809, "loss_mode_switch": 0.0, "loss_total": 0.4435928463935852, "step": 2319 }, { "batch_size": 1, "epoch": 0.9276, "step": 2319, "tokens_per_device": 4987 }, { "epoch": 0.9276, "loss_ce": 0.011380170471966267, "loss_lvr": 0.5070372819900513, "loss_mode_switch": 0.0, "loss_total": 0.062083899974823, "step": 2319 }, { "batch_size": 4, "epoch": 0.9276, "step": 2319, "tokens_per_device": 15424 }, { "epoch": 0.9276, "loss_ce": 0.3919292390346527, "loss_lvr": 0.7318481802940369, "loss_mode_switch": 0.0, "loss_total": 0.4651140570640564, "step": 2319 }, { "batch_size": 4, "epoch": 0.9276, "step": 2319, "tokens_per_device": 4944 }, { "epoch": 0.9276, "loss_ce": 0.16640134155750275, "loss_lvr": 0.6572378873825073, "loss_mode_switch": 0.0, "loss_total": 0.23212513327598572, "step": 2319 }, { "epoch": 0.928, "grad_norm": 1.3188849687576294, "learning_rate": 1.3532947258710905e-07, "loss": 0.2743, "step": 2320 }, { "batch_size": 4, "epoch": 0.928, "step": 2320, "tokens_per_device": 2292 }, { "epoch": 0.928, "loss_ce": 0.2305658608675003, "loss_lvr": 1.2121306657791138, "loss_mode_switch": 0.0, "loss_total": 0.35177892446517944, "step": 2320 }, { "batch_size": 1, "epoch": 0.928, "step": 2320, "tokens_per_device": 5155 }, { "epoch": 0.928, "loss_ce": 0.10852920264005661, "loss_lvr": 0.3403579294681549, "loss_mode_switch": 0.0, "loss_total": 0.14256499707698822, "step": 2320 }, { "batch_size": 4, "epoch": 0.928, "step": 2320, "tokens_per_device": 9140 }, { "epoch": 0.928, "loss_ce": 0.03881995752453804, "loss_lvr": 0.5749647617340088, "loss_mode_switch": 0.0, "loss_total": 0.09631643444299698, "step": 2320 }, { "batch_size": 1, "epoch": 0.928, "step": 2320, "tokens_per_device": 6283 }, { "epoch": 0.928, "loss_ce": 0.006232696585357189, "loss_lvr": 0.31523221731185913, "loss_mode_switch": 0.0, "loss_total": 0.037755921483039856, "step": 2320 }, { "batch_size": 4, "epoch": 0.928, "step": 2320, "tokens_per_device": 4828 }, { "epoch": 0.928, "loss_ce": 0.06747110188007355, "loss_lvr": 0.5466330051422119, "loss_mode_switch": 0.0, "loss_total": 0.12213440239429474, "step": 2320 }, { "batch_size": 1, "epoch": 0.928, "step": 2320, "tokens_per_device": 4889 }, { "epoch": 0.928, "loss_ce": 0.6355634331703186, "loss_lvr": 0.5725533962249756, "loss_mode_switch": 0.0, "loss_total": 0.6928187608718872, "step": 2320 }, { "batch_size": 4, "epoch": 0.928, "step": 2320, "tokens_per_device": 5832 }, { "epoch": 0.928, "loss_ce": 0.3423769772052765, "loss_lvr": 0.7556601762771606, "loss_mode_switch": 0.0, "loss_total": 0.41794300079345703, "step": 2320 }, { "batch_size": 1, "epoch": 0.928, "step": 2320, "tokens_per_device": 4762 }, { "epoch": 0.928, "loss_ce": 0.009821133688092232, "loss_lvr": 0.2430489957332611, "loss_mode_switch": 0.0, "loss_total": 0.03412603586912155, "step": 2320 }, { "epoch": 0.9284, "grad_norm": 1.276841163635254, "learning_rate": 1.3383671444114953e-07, "loss": 0.2379, "step": 2321 }, { "batch_size": 4, "epoch": 0.9284, "step": 2321, "tokens_per_device": 5032 }, { "epoch": 0.9284, "loss_ce": 0.19638095796108246, "loss_lvr": 0.6530048251152039, "loss_mode_switch": 0.0, "loss_total": 0.2616814374923706, "step": 2321 }, { "batch_size": 1, "epoch": 0.9284, "step": 2321, "tokens_per_device": 6510 }, { "epoch": 0.9284, "loss_ce": 0.05082312598824501, "loss_lvr": 0.24624615907669067, "loss_mode_switch": 0.0, "loss_total": 0.07544773817062378, "step": 2321 }, { "batch_size": 1, "epoch": 0.9284, "step": 2321, "tokens_per_device": 5103 }, { "epoch": 0.9284, "loss_ce": 0.02462751790881157, "loss_lvr": 0.7524533867835999, "loss_mode_switch": 0.0, "loss_total": 0.09987285733222961, "step": 2321 }, { "batch_size": 4, "epoch": 0.9284, "step": 2321, "tokens_per_device": 2692 }, { "epoch": 0.9284, "loss_ce": 0.1726430356502533, "loss_lvr": 1.07987380027771, "loss_mode_switch": 0.0, "loss_total": 0.2806304097175598, "step": 2321 }, { "batch_size": 4, "epoch": 0.9284, "step": 2321, "tokens_per_device": 1496 }, { "epoch": 0.9284, "loss_ce": 0.3390236496925354, "loss_lvr": 0.8081151247024536, "loss_mode_switch": 0.0, "loss_total": 0.4198351502418518, "step": 2321 }, { "batch_size": 4, "epoch": 0.9284, "step": 2321, "tokens_per_device": 2168 }, { "epoch": 0.9284, "loss_ce": 0.41842126846313477, "loss_lvr": 0.7590104341506958, "loss_mode_switch": 0.0, "loss_total": 0.4943222999572754, "step": 2321 }, { "batch_size": 4, "epoch": 0.9284, "step": 2321, "tokens_per_device": 5420 }, { "epoch": 0.9284, "loss_ce": 0.2045218050479889, "loss_lvr": 1.0374565124511719, "loss_mode_switch": 0.0, "loss_total": 0.3082674741744995, "step": 2321 }, { "batch_size": 4, "epoch": 0.9284, "step": 2321, "tokens_per_device": 4940 }, { "epoch": 0.9284, "loss_ce": 0.22345533967018127, "loss_lvr": 0.881427526473999, "loss_mode_switch": 0.0, "loss_total": 0.3115980923175812, "step": 2321 }, { "epoch": 0.9288, "grad_norm": 1.149492621421814, "learning_rate": 1.3235212330122425e-07, "loss": 0.239, "step": 2322 }, { "batch_size": 1, "epoch": 0.9288, "step": 2322, "tokens_per_device": 4829 }, { "epoch": 0.9288, "loss_ce": 0.16281694173812866, "loss_lvr": 0.601349949836731, "loss_mode_switch": 0.0, "loss_total": 0.22295193374156952, "step": 2322 }, { "batch_size": 4, "epoch": 0.9288, "step": 2322, "tokens_per_device": 4320 }, { "epoch": 0.9288, "loss_ce": 0.11633260548114777, "loss_lvr": 0.8416462540626526, "loss_mode_switch": 0.0, "loss_total": 0.20049723982810974, "step": 2322 }, { "batch_size": 1, "epoch": 0.9288, "step": 2322, "tokens_per_device": 4921 }, { "epoch": 0.9288, "loss_ce": 0.11695296317338943, "loss_lvr": 0.5659510493278503, "loss_mode_switch": 0.0, "loss_total": 0.17354807257652283, "step": 2322 }, { "batch_size": 4, "epoch": 0.9288, "step": 2322, "tokens_per_device": 2596 }, { "epoch": 0.9288, "loss_ce": 0.3251393437385559, "loss_lvr": 0.9758848547935486, "loss_mode_switch": 0.0, "loss_total": 0.4227278232574463, "step": 2322 }, { "batch_size": 1, "epoch": 0.9288, "step": 2322, "tokens_per_device": 4878 }, { "epoch": 0.9288, "loss_ce": 0.004611263517290354, "loss_lvr": 0.25924044847488403, "loss_mode_switch": 0.0, "loss_total": 0.03053530864417553, "step": 2322 }, { "batch_size": 4, "epoch": 0.9288, "step": 2322, "tokens_per_device": 3360 }, { "epoch": 0.9288, "loss_ce": 0.18743550777435303, "loss_lvr": 0.8076807260513306, "loss_mode_switch": 0.0, "loss_total": 0.26820358633995056, "step": 2322 }, { "batch_size": 4, "epoch": 0.9288, "step": 2322, "tokens_per_device": 1492 }, { "epoch": 0.9288, "loss_ce": 0.23744867742061615, "loss_lvr": 0.8714979290962219, "loss_mode_switch": 0.0, "loss_total": 0.3245984613895416, "step": 2322 }, { "batch_size": 4, "epoch": 0.9288, "step": 2322, "tokens_per_device": 2652 }, { "epoch": 0.9288, "loss_ce": 0.21974453330039978, "loss_lvr": 0.8462470769882202, "loss_mode_switch": 0.0, "loss_total": 0.3043692409992218, "step": 2322 }, { "epoch": 0.9292, "grad_norm": 1.3639144897460938, "learning_rate": 1.308757016589618e-07, "loss": 0.2565, "step": 2323 }, { "batch_size": 4, "epoch": 0.9292, "step": 2323, "tokens_per_device": 2072 }, { "epoch": 0.9292, "loss_ce": 0.1380854845046997, "loss_lvr": 1.0271923542022705, "loss_mode_switch": 0.0, "loss_total": 0.2408047318458557, "step": 2323 }, { "batch_size": 4, "epoch": 0.9292, "step": 2323, "tokens_per_device": 1288 }, { "epoch": 0.9292, "loss_ce": 0.1639278382062912, "loss_lvr": 1.1127650737762451, "loss_mode_switch": 0.0, "loss_total": 0.2752043604850769, "step": 2323 }, { "batch_size": 4, "epoch": 0.9292, "step": 2323, "tokens_per_device": 4612 }, { "epoch": 0.9292, "loss_ce": 0.0946735069155693, "loss_lvr": 0.9170345664024353, "loss_mode_switch": 0.0, "loss_total": 0.18637695908546448, "step": 2323 }, { "batch_size": 4, "epoch": 0.9292, "step": 2323, "tokens_per_device": 5220 }, { "epoch": 0.9292, "loss_ce": 0.5426257252693176, "loss_lvr": 0.752656877040863, "loss_mode_switch": 0.0, "loss_total": 0.6178914308547974, "step": 2323 }, { "batch_size": 4, "epoch": 0.9292, "step": 2323, "tokens_per_device": 6068 }, { "epoch": 0.9292, "loss_ce": 0.061389658600091934, "loss_lvr": 0.9181976318359375, "loss_mode_switch": 0.0, "loss_total": 0.15320941805839539, "step": 2323 }, { "batch_size": 4, "epoch": 0.9292, "step": 2323, "tokens_per_device": 9824 }, { "epoch": 0.9292, "loss_ce": 0.1529875546693802, "loss_lvr": 0.8331137299537659, "loss_mode_switch": 0.0, "loss_total": 0.23629891872406006, "step": 2323 }, { "batch_size": 1, "epoch": 0.9292, "step": 2323, "tokens_per_device": 4856 }, { "epoch": 0.9292, "loss_ce": 0.03089137375354767, "loss_lvr": 0.6588445901870728, "loss_mode_switch": 0.0, "loss_total": 0.0967758372426033, "step": 2323 }, { "batch_size": 4, "epoch": 0.9292, "step": 2323, "tokens_per_device": 5148 }, { "epoch": 0.9292, "loss_ce": 0.043558213859796524, "loss_lvr": 0.8334736824035645, "loss_mode_switch": 0.0, "loss_total": 0.12690559029579163, "step": 2323 }, { "epoch": 0.9296, "grad_norm": 1.3892710208892822, "learning_rate": 1.2940745199227666e-07, "loss": 0.3084, "step": 2324 }, { "batch_size": 4, "epoch": 0.9296, "step": 2324, "tokens_per_device": 4064 }, { "epoch": 0.9296, "loss_ce": 0.37849706411361694, "loss_lvr": 0.8958969116210938, "loss_mode_switch": 0.0, "loss_total": 0.46808674931526184, "step": 2324 }, { "batch_size": 1, "epoch": 0.9296, "step": 2324, "tokens_per_device": 5790 }, { "epoch": 0.9296, "loss_ce": 0.04778250679373741, "loss_lvr": 0.20834876596927643, "loss_mode_switch": 0.0, "loss_total": 0.06861738115549088, "step": 2324 }, { "batch_size": 1, "epoch": 0.9296, "step": 2324, "tokens_per_device": 5145 }, { "epoch": 0.9296, "loss_ce": 0.012104441411793232, "loss_lvr": 0.2326052337884903, "loss_mode_switch": 0.0, "loss_total": 0.035364966839551926, "step": 2324 }, { "batch_size": 4, "epoch": 0.9296, "step": 2324, "tokens_per_device": 3408 }, { "epoch": 0.9296, "loss_ce": 0.30533552169799805, "loss_lvr": 0.8548789024353027, "loss_mode_switch": 0.0, "loss_total": 0.3908234238624573, "step": 2324 }, { "batch_size": 4, "epoch": 0.9296, "step": 2324, "tokens_per_device": 9504 }, { "epoch": 0.9296, "loss_ce": 0.5676931142807007, "loss_lvr": 0.5672946572303772, "loss_mode_switch": 0.0, "loss_total": 0.624422550201416, "step": 2324 }, { "batch_size": 1, "epoch": 0.9296, "step": 2324, "tokens_per_device": 5733 }, { "epoch": 0.9296, "loss_ce": 0.0955011397600174, "loss_lvr": 0.4700712561607361, "loss_mode_switch": 0.0, "loss_total": 0.14250826835632324, "step": 2324 }, { "batch_size": 4, "epoch": 0.9296, "step": 2324, "tokens_per_device": 4304 }, { "epoch": 0.9296, "loss_ce": 0.3089936077594757, "loss_lvr": 1.8428902626037598, "loss_mode_switch": 0.0, "loss_total": 0.49328261613845825, "step": 2324 }, { "batch_size": 4, "epoch": 0.9296, "step": 2324, "tokens_per_device": 5904 }, { "epoch": 0.9296, "loss_ce": 0.5426626801490784, "loss_lvr": 0.6497911810874939, "loss_mode_switch": 0.0, "loss_total": 0.6076418161392212, "step": 2324 }, { "epoch": 0.93, "grad_norm": 1.2097710371017456, "learning_rate": 1.2794737676536993e-07, "loss": 0.2717, "step": 2325 }, { "batch_size": 4, "epoch": 0.93, "step": 2325, "tokens_per_device": 2952 }, { "epoch": 0.93, "loss_ce": 0.4489741921424866, "loss_lvr": 0.8719238042831421, "loss_mode_switch": 0.0, "loss_total": 0.5361665487289429, "step": 2325 }, { "batch_size": 4, "epoch": 0.93, "step": 2325, "tokens_per_device": 2644 }, { "epoch": 0.93, "loss_ce": 0.41889098286628723, "loss_lvr": 0.9621286988258362, "loss_mode_switch": 0.0, "loss_total": 0.5151038765907288, "step": 2325 }, { "batch_size": 4, "epoch": 0.93, "step": 2325, "tokens_per_device": 4260 }, { "epoch": 0.93, "loss_ce": 0.03619991987943649, "loss_lvr": 1.0502583980560303, "loss_mode_switch": 0.0, "loss_total": 0.14122575521469116, "step": 2325 }, { "batch_size": 4, "epoch": 0.93, "step": 2325, "tokens_per_device": 5788 }, { "epoch": 0.93, "loss_ce": 0.11641868948936462, "loss_lvr": 0.783436119556427, "loss_mode_switch": 0.0, "loss_total": 0.19476230442523956, "step": 2325 }, { "batch_size": 4, "epoch": 0.93, "step": 2325, "tokens_per_device": 2712 }, { "epoch": 0.93, "loss_ce": 0.1232336014509201, "loss_lvr": 0.7608245015144348, "loss_mode_switch": 0.0, "loss_total": 0.19931605458259583, "step": 2325 }, { "batch_size": 4, "epoch": 0.93, "step": 2325, "tokens_per_device": 2580 }, { "epoch": 0.93, "loss_ce": 0.32105153799057007, "loss_lvr": 0.7631418108940125, "loss_mode_switch": 0.0, "loss_total": 0.3973657190799713, "step": 2325 }, { "batch_size": 4, "epoch": 0.93, "step": 2325, "tokens_per_device": 4624 }, { "epoch": 0.93, "loss_ce": 0.12164202332496643, "loss_lvr": 0.7895218729972839, "loss_mode_switch": 0.0, "loss_total": 0.2005942165851593, "step": 2325 }, { "batch_size": 4, "epoch": 0.93, "step": 2325, "tokens_per_device": 4312 }, { "epoch": 0.93, "loss_ce": 0.12775622308254242, "loss_lvr": 0.8880012035369873, "loss_mode_switch": 0.0, "loss_total": 0.2165563404560089, "step": 2325 }, { "epoch": 0.9304, "grad_norm": 1.3217058181762695, "learning_rate": 1.2649547842872367e-07, "loss": 0.3155, "step": 2326 }, { "batch_size": 4, "epoch": 0.9304, "step": 2326, "tokens_per_device": 6508 }, { "epoch": 0.9304, "loss_ce": 0.4024723470211029, "loss_lvr": 0.7972866892814636, "loss_mode_switch": 0.0, "loss_total": 0.4822010099887848, "step": 2326 }, { "batch_size": 4, "epoch": 0.9304, "step": 2326, "tokens_per_device": 4532 }, { "epoch": 0.9304, "loss_ce": 0.11180713027715683, "loss_lvr": 0.6003470420837402, "loss_mode_switch": 0.0, "loss_total": 0.1718418300151825, "step": 2326 }, { "batch_size": 4, "epoch": 0.9304, "step": 2326, "tokens_per_device": 8928 }, { "epoch": 0.9304, "loss_ce": 0.09216982871294022, "loss_lvr": 0.4301571547985077, "loss_mode_switch": 0.0, "loss_total": 0.13518553972244263, "step": 2326 }, { "batch_size": 4, "epoch": 0.9304, "step": 2326, "tokens_per_device": 4208 }, { "epoch": 0.9304, "loss_ce": 0.1362554132938385, "loss_lvr": 1.5068176984786987, "loss_mode_switch": 0.0, "loss_total": 0.2869371771812439, "step": 2326 }, { "batch_size": 1, "epoch": 0.9304, "step": 2326, "tokens_per_device": 5160 }, { "epoch": 0.9304, "loss_ce": 1.0415747165679932, "loss_lvr": 0.5850856304168701, "loss_mode_switch": 0.0, "loss_total": 1.1000832319259644, "step": 2326 }, { "batch_size": 4, "epoch": 0.9304, "step": 2326, "tokens_per_device": 1796 }, { "epoch": 0.9304, "loss_ce": 0.5923219919204712, "loss_lvr": 0.9635371565818787, "loss_mode_switch": 0.0, "loss_total": 0.6886757016181946, "step": 2326 }, { "batch_size": 4, "epoch": 0.9304, "step": 2326, "tokens_per_device": 13592 }, { "epoch": 0.9304, "loss_ce": 0.16531968116760254, "loss_lvr": 0.8659510016441345, "loss_mode_switch": 0.0, "loss_total": 0.2519147992134094, "step": 2326 }, { "batch_size": 4, "epoch": 0.9304, "step": 2326, "tokens_per_device": 5796 }, { "epoch": 0.9304, "loss_ce": 0.3665485680103302, "loss_lvr": 0.9385150074958801, "loss_mode_switch": 0.0, "loss_total": 0.4604000747203827, "step": 2326 }, { "epoch": 0.9308, "grad_norm": 1.3522100448608398, "learning_rate": 1.2505175941909642e-07, "loss": 0.2749, "step": 2327 }, { "batch_size": 4, "epoch": 0.9308, "step": 2327, "tokens_per_device": 6132 }, { "epoch": 0.9308, "loss_ce": 0.10291992127895355, "loss_lvr": 0.7575122714042664, "loss_mode_switch": 0.0, "loss_total": 0.17867115139961243, "step": 2327 }, { "batch_size": 4, "epoch": 0.9308, "step": 2327, "tokens_per_device": 8396 }, { "epoch": 0.9308, "loss_ce": 0.23053660988807678, "loss_lvr": 0.567791223526001, "loss_mode_switch": 0.0, "loss_total": 0.2873157262802124, "step": 2327 }, { "batch_size": 1, "epoch": 0.9308, "step": 2327, "tokens_per_device": 5118 }, { "epoch": 0.9308, "loss_ce": 0.5042608976364136, "loss_lvr": 0.30889931321144104, "loss_mode_switch": 0.0, "loss_total": 0.5351508259773254, "step": 2327 }, { "batch_size": 1, "epoch": 0.9308, "step": 2327, "tokens_per_device": 4925 }, { "epoch": 0.9308, "loss_ce": 0.3531561493873596, "loss_lvr": 0.9393941164016724, "loss_mode_switch": 0.0, "loss_total": 0.4470955729484558, "step": 2327 }, { "batch_size": 4, "epoch": 0.9308, "step": 2327, "tokens_per_device": 5144 }, { "epoch": 0.9308, "loss_ce": 0.020417828112840652, "loss_lvr": 0.6778610944747925, "loss_mode_switch": 0.0, "loss_total": 0.08820393681526184, "step": 2327 }, { "batch_size": 4, "epoch": 0.9308, "step": 2327, "tokens_per_device": 4324 }, { "epoch": 0.9308, "loss_ce": 0.12815150618553162, "loss_lvr": 1.058713436126709, "loss_mode_switch": 0.0, "loss_total": 0.234022855758667, "step": 2327 }, { "batch_size": 1, "epoch": 0.9308, "step": 2327, "tokens_per_device": 5216 }, { "epoch": 0.9308, "loss_ce": 0.01676204428076744, "loss_lvr": 0.4597710072994232, "loss_mode_switch": 0.0, "loss_total": 0.06273914873600006, "step": 2327 }, { "batch_size": 1, "epoch": 0.9308, "step": 2327, "tokens_per_device": 4894 }, { "epoch": 0.9308, "loss_ce": 0.016470102593302727, "loss_lvr": 0.24077168107032776, "loss_mode_switch": 0.0, "loss_total": 0.04054727032780647, "step": 2327 }, { "epoch": 0.9312, "grad_norm": 1.1569416522979736, "learning_rate": 1.2361622215951774e-07, "loss": 0.2586, "step": 2328 }, { "batch_size": 4, "epoch": 0.9312, "step": 2328, "tokens_per_device": 4552 }, { "epoch": 0.9312, "loss_ce": 0.09896209836006165, "loss_lvr": 0.7731303572654724, "loss_mode_switch": 0.0, "loss_total": 0.1762751340866089, "step": 2328 }, { "batch_size": 4, "epoch": 0.9312, "step": 2328, "tokens_per_device": 4548 }, { "epoch": 0.9312, "loss_ce": 0.6601369380950928, "loss_lvr": 1.0171787738800049, "loss_mode_switch": 0.0, "loss_total": 0.7618548274040222, "step": 2328 }, { "batch_size": 1, "epoch": 0.9312, "step": 2328, "tokens_per_device": 5212 }, { "epoch": 0.9312, "loss_ce": 0.1713503748178482, "loss_lvr": 0.4326765835285187, "loss_mode_switch": 0.0, "loss_total": 0.2146180272102356, "step": 2328 }, { "batch_size": 4, "epoch": 0.9312, "step": 2328, "tokens_per_device": 3788 }, { "epoch": 0.9312, "loss_ce": 0.16914723813533783, "loss_lvr": 0.9170613884925842, "loss_mode_switch": 0.0, "loss_total": 0.2608533799648285, "step": 2328 }, { "batch_size": 4, "epoch": 0.9312, "step": 2328, "tokens_per_device": 5916 }, { "epoch": 0.9312, "loss_ce": 0.08521755039691925, "loss_lvr": 0.6728069186210632, "loss_mode_switch": 0.0, "loss_total": 0.1524982452392578, "step": 2328 }, { "batch_size": 4, "epoch": 0.9312, "step": 2328, "tokens_per_device": 4508 }, { "epoch": 0.9312, "loss_ce": 0.35529205203056335, "loss_lvr": 0.9835633635520935, "loss_mode_switch": 0.0, "loss_total": 0.4536483883857727, "step": 2328 }, { "batch_size": 4, "epoch": 0.9312, "step": 2328, "tokens_per_device": 4800 }, { "epoch": 0.9312, "loss_ce": 0.4892464876174927, "loss_lvr": 0.8143174052238464, "loss_mode_switch": 0.0, "loss_total": 0.5706782341003418, "step": 2328 }, { "batch_size": 4, "epoch": 0.9312, "step": 2328, "tokens_per_device": 5440 }, { "epoch": 0.9312, "loss_ce": 0.23166105151176453, "loss_lvr": 0.7849111557006836, "loss_mode_switch": 0.0, "loss_total": 0.31015217304229736, "step": 2328 }, { "epoch": 0.9316, "grad_norm": 1.191164493560791, "learning_rate": 1.2218886905928652e-07, "loss": 0.2494, "step": 2329 }, { "batch_size": 4, "epoch": 0.9316, "step": 2329, "tokens_per_device": 10296 }, { "epoch": 0.9316, "loss_ce": 0.31759127974510193, "loss_lvr": 0.6887431740760803, "loss_mode_switch": 0.0, "loss_total": 0.3864656090736389, "step": 2329 }, { "batch_size": 4, "epoch": 0.9316, "step": 2329, "tokens_per_device": 11084 }, { "epoch": 0.9316, "loss_ce": 0.1351073682308197, "loss_lvr": 0.4793529510498047, "loss_mode_switch": 0.0, "loss_total": 0.18304266035556793, "step": 2329 }, { "batch_size": 4, "epoch": 0.9316, "step": 2329, "tokens_per_device": 1936 }, { "epoch": 0.9316, "loss_ce": 0.3259069621562958, "loss_lvr": 0.9206463098526001, "loss_mode_switch": 0.0, "loss_total": 0.4179716110229492, "step": 2329 }, { "batch_size": 1, "epoch": 0.9316, "step": 2329, "tokens_per_device": 5170 }, { "epoch": 0.9316, "loss_ce": 0.056270964443683624, "loss_lvr": 0.20245739817619324, "loss_mode_switch": 0.0, "loss_total": 0.07651670277118683, "step": 2329 }, { "batch_size": 4, "epoch": 0.9316, "step": 2329, "tokens_per_device": 6400 }, { "epoch": 0.9316, "loss_ce": 0.5100870728492737, "loss_lvr": 0.7324146628379822, "loss_mode_switch": 0.0, "loss_total": 0.5833285450935364, "step": 2329 }, { "batch_size": 1, "epoch": 0.9316, "step": 2329, "tokens_per_device": 6862 }, { "epoch": 0.9316, "loss_ce": 0.0003538088349159807, "loss_lvr": 0.2788863778114319, "loss_mode_switch": 0.0, "loss_total": 0.0282424483448267, "step": 2329 }, { "batch_size": 4, "epoch": 0.9316, "step": 2329, "tokens_per_device": 6756 }, { "epoch": 0.9316, "loss_ce": 0.06769338250160217, "loss_lvr": 0.727715790271759, "loss_mode_switch": 0.0, "loss_total": 0.14046496152877808, "step": 2329 }, { "batch_size": 4, "epoch": 0.9316, "step": 2329, "tokens_per_device": 6104 }, { "epoch": 0.9316, "loss_ce": 0.2293873131275177, "loss_lvr": 0.8670601844787598, "loss_mode_switch": 0.0, "loss_total": 0.3160933256149292, "step": 2329 }, { "epoch": 0.932, "grad_norm": 1.5919318199157715, "learning_rate": 1.2076970251396593e-07, "loss": 0.2899, "step": 2330 }, { "batch_size": 4, "epoch": 0.932, "step": 2330, "tokens_per_device": 3488 }, { "epoch": 0.932, "loss_ce": 0.21005487442016602, "loss_lvr": 0.6338899731636047, "loss_mode_switch": 0.0, "loss_total": 0.27344387769699097, "step": 2330 }, { "batch_size": 1, "epoch": 0.932, "step": 2330, "tokens_per_device": 4732 }, { "epoch": 0.932, "loss_ce": 0.008447203785181046, "loss_lvr": 0.9104660749435425, "loss_mode_switch": 0.0, "loss_total": 0.09949381649494171, "step": 2330 }, { "batch_size": 4, "epoch": 0.932, "step": 2330, "tokens_per_device": 5732 }, { "epoch": 0.932, "loss_ce": 0.16845980286598206, "loss_lvr": 0.7588072419166565, "loss_mode_switch": 0.0, "loss_total": 0.24434053897857666, "step": 2330 }, { "batch_size": 4, "epoch": 0.932, "step": 2330, "tokens_per_device": 6548 }, { "epoch": 0.932, "loss_ce": 0.3868233859539032, "loss_lvr": 0.6232619285583496, "loss_mode_switch": 0.0, "loss_total": 0.44914957880973816, "step": 2330 }, { "batch_size": 1, "epoch": 0.932, "step": 2330, "tokens_per_device": 4720 }, { "epoch": 0.932, "loss_ce": 0.0013953273883089423, "loss_lvr": 0.24581889808177948, "loss_mode_switch": 0.0, "loss_total": 0.02597721852362156, "step": 2330 }, { "batch_size": 4, "epoch": 0.932, "step": 2330, "tokens_per_device": 2060 }, { "epoch": 0.932, "loss_ce": 0.14947038888931274, "loss_lvr": 0.8953133821487427, "loss_mode_switch": 0.0, "loss_total": 0.23900172114372253, "step": 2330 }, { "batch_size": 1, "epoch": 0.932, "step": 2330, "tokens_per_device": 4890 }, { "epoch": 0.932, "loss_ce": 0.003399935318157077, "loss_lvr": 0.46028560400009155, "loss_mode_switch": 0.0, "loss_total": 0.04942849650979042, "step": 2330 }, { "batch_size": 4, "epoch": 0.932, "step": 2330, "tokens_per_device": 4848 }, { "epoch": 0.932, "loss_ce": 0.0260029137134552, "loss_lvr": 0.6260897517204285, "loss_mode_switch": 0.0, "loss_total": 0.0886118933558464, "step": 2330 }, { "epoch": 0.9324, "grad_norm": 1.2188317775726318, "learning_rate": 1.193587249053807e-07, "loss": 0.2799, "step": 2331 }, { "batch_size": 4, "epoch": 0.9324, "step": 2331, "tokens_per_device": 3876 }, { "epoch": 0.9324, "loss_ce": 0.2995941638946533, "loss_lvr": 0.9900386333465576, "loss_mode_switch": 0.0, "loss_total": 0.3985980153083801, "step": 2331 }, { "batch_size": 4, "epoch": 0.9324, "step": 2331, "tokens_per_device": 1412 }, { "epoch": 0.9324, "loss_ce": 0.4851531386375427, "loss_lvr": 1.0383152961730957, "loss_mode_switch": 0.0, "loss_total": 0.5889846682548523, "step": 2331 }, { "batch_size": 4, "epoch": 0.9324, "step": 2331, "tokens_per_device": 5852 }, { "epoch": 0.9324, "loss_ce": 0.018864652141928673, "loss_lvr": 0.6703701019287109, "loss_mode_switch": 0.0, "loss_total": 0.0859016627073288, "step": 2331 }, { "batch_size": 1, "epoch": 0.9324, "step": 2331, "tokens_per_device": 4926 }, { "epoch": 0.9324, "loss_ce": 0.009977088309824467, "loss_lvr": 0.30732017755508423, "loss_mode_switch": 0.0, "loss_total": 0.040709108114242554, "step": 2331 }, { "batch_size": 1, "epoch": 0.9324, "step": 2331, "tokens_per_device": 4902 }, { "epoch": 0.9324, "loss_ce": 0.21152490377426147, "loss_lvr": 0.456323504447937, "loss_mode_switch": 0.0, "loss_total": 0.25715726613998413, "step": 2331 }, { "batch_size": 1, "epoch": 0.9324, "step": 2331, "tokens_per_device": 5073 }, { "epoch": 0.9324, "loss_ce": 0.10994410514831543, "loss_lvr": 0.4179481267929077, "loss_mode_switch": 0.0, "loss_total": 0.15173891186714172, "step": 2331 }, { "batch_size": 1, "epoch": 0.9324, "step": 2331, "tokens_per_device": 5046 }, { "epoch": 0.9324, "loss_ce": 0.0002066554152406752, "loss_lvr": 0.9410980939865112, "loss_mode_switch": 0.0, "loss_total": 0.09431646764278412, "step": 2331 }, { "batch_size": 4, "epoch": 0.9324, "step": 2331, "tokens_per_device": 7136 }, { "epoch": 0.9324, "loss_ce": 0.21004219353199005, "loss_lvr": 0.7478876113891602, "loss_mode_switch": 0.0, "loss_total": 0.2848309576511383, "step": 2331 }, { "epoch": 0.9328, "grad_norm": 1.1337872743606567, "learning_rate": 1.179559386016088e-07, "loss": 0.2108, "step": 2332 }, { "batch_size": 1, "epoch": 0.9328, "step": 2332, "tokens_per_device": 4764 }, { "epoch": 0.9328, "loss_ce": 0.0032618732657283545, "loss_lvr": 0.3710328936576843, "loss_mode_switch": 0.0, "loss_total": 0.04036516323685646, "step": 2332 }, { "batch_size": 1, "epoch": 0.9328, "step": 2332, "tokens_per_device": 4901 }, { "epoch": 0.9328, "loss_ce": 0.022932076826691628, "loss_lvr": 0.3170117437839508, "loss_mode_switch": 0.0, "loss_total": 0.0546332523226738, "step": 2332 }, { "batch_size": 1, "epoch": 0.9328, "step": 2332, "tokens_per_device": 4882 }, { "epoch": 0.9328, "loss_ce": 0.17431074380874634, "loss_lvr": 0.1394052952528, "loss_mode_switch": 0.0, "loss_total": 0.18825127184391022, "step": 2332 }, { "batch_size": 4, "epoch": 0.9328, "step": 2332, "tokens_per_device": 2564 }, { "epoch": 0.9328, "loss_ce": 0.18936380743980408, "loss_lvr": 0.7807365655899048, "loss_mode_switch": 0.0, "loss_total": 0.2674374580383301, "step": 2332 }, { "batch_size": 1, "epoch": 0.9328, "step": 2332, "tokens_per_device": 4900 }, { "epoch": 0.9328, "loss_ce": 0.03115229308605194, "loss_lvr": 0.30394768714904785, "loss_mode_switch": 0.0, "loss_total": 0.061547063291072845, "step": 2332 }, { "batch_size": 4, "epoch": 0.9328, "step": 2332, "tokens_per_device": 1256 }, { "epoch": 0.9328, "loss_ce": 0.3119974136352539, "loss_lvr": 0.9926026463508606, "loss_mode_switch": 0.0, "loss_total": 0.41125768423080444, "step": 2332 }, { "batch_size": 4, "epoch": 0.9328, "step": 2332, "tokens_per_device": 4276 }, { "epoch": 0.9328, "loss_ce": 0.4065217077732086, "loss_lvr": 1.0088754892349243, "loss_mode_switch": 0.0, "loss_total": 0.5074092745780945, "step": 2332 }, { "batch_size": 1, "epoch": 0.9328, "step": 2332, "tokens_per_device": 5147 }, { "epoch": 0.9328, "loss_ce": 0.011686230078339577, "loss_lvr": 0.42087864875793457, "loss_mode_switch": 0.0, "loss_total": 0.05377409607172012, "step": 2332 }, { "epoch": 0.9332, "grad_norm": 1.4530360698699951, "learning_rate": 1.1656134595698309e-07, "loss": 0.2595, "step": 2333 }, { "batch_size": 4, "epoch": 0.9332, "step": 2333, "tokens_per_device": 3924 }, { "epoch": 0.9332, "loss_ce": 0.10819806158542633, "loss_lvr": 0.898779571056366, "loss_mode_switch": 0.0, "loss_total": 0.1980760097503662, "step": 2333 }, { "batch_size": 1, "epoch": 0.9332, "step": 2333, "tokens_per_device": 5159 }, { "epoch": 0.9332, "loss_ce": 0.005644091870635748, "loss_lvr": 0.3133973181247711, "loss_mode_switch": 0.0, "loss_total": 0.03698382154107094, "step": 2333 }, { "batch_size": 1, "epoch": 0.9332, "step": 2333, "tokens_per_device": 5151 }, { "epoch": 0.9332, "loss_ce": 0.0195001233369112, "loss_lvr": 0.5876774787902832, "loss_mode_switch": 0.0, "loss_total": 0.07826787233352661, "step": 2333 }, { "batch_size": 4, "epoch": 0.9332, "step": 2333, "tokens_per_device": 11120 }, { "epoch": 0.9332, "loss_ce": 0.23051615059375763, "loss_lvr": 0.8421134352684021, "loss_mode_switch": 0.0, "loss_total": 0.3147274851799011, "step": 2333 }, { "batch_size": 4, "epoch": 0.9332, "step": 2333, "tokens_per_device": 4364 }, { "epoch": 0.9332, "loss_ce": 0.06964834034442902, "loss_lvr": 0.7893052101135254, "loss_mode_switch": 0.0, "loss_total": 0.14857885241508484, "step": 2333 }, { "batch_size": 1, "epoch": 0.9332, "step": 2333, "tokens_per_device": 4887 }, { "epoch": 0.9332, "loss_ce": 0.05480026453733444, "loss_lvr": 0.49512675404548645, "loss_mode_switch": 0.0, "loss_total": 0.1043129414319992, "step": 2333 }, { "batch_size": 4, "epoch": 0.9332, "step": 2333, "tokens_per_device": 1348 }, { "epoch": 0.9332, "loss_ce": 0.3690587878227234, "loss_lvr": 0.8892401456832886, "loss_mode_switch": 0.0, "loss_total": 0.4579828083515167, "step": 2333 }, { "batch_size": 4, "epoch": 0.9332, "step": 2333, "tokens_per_device": 4320 }, { "epoch": 0.9332, "loss_ce": 0.36587780714035034, "loss_lvr": 0.8935890197753906, "loss_mode_switch": 0.0, "loss_total": 0.4552367031574249, "step": 2333 }, { "epoch": 0.9336, "grad_norm": 1.2252488136291504, "learning_rate": 1.151749493120835e-07, "loss": 0.2395, "step": 2334 }, { "batch_size": 1, "epoch": 0.9336, "step": 2334, "tokens_per_device": 5322 }, { "epoch": 0.9336, "loss_ce": 1.840623140335083, "loss_lvr": 0.49005430936813354, "loss_mode_switch": 0.0, "loss_total": 1.889628529548645, "step": 2334 }, { "batch_size": 4, "epoch": 0.9336, "step": 2334, "tokens_per_device": 1548 }, { "epoch": 0.9336, "loss_ce": 0.13091829419136047, "loss_lvr": 0.9118010997772217, "loss_mode_switch": 0.0, "loss_total": 0.22209841012954712, "step": 2334 }, { "batch_size": 4, "epoch": 0.9336, "step": 2334, "tokens_per_device": 4200 }, { "epoch": 0.9336, "loss_ce": 0.040457431226968765, "loss_lvr": 0.815801739692688, "loss_mode_switch": 0.0, "loss_total": 0.1220376044511795, "step": 2334 }, { "batch_size": 1, "epoch": 0.9336, "step": 2334, "tokens_per_device": 5098 }, { "epoch": 0.9336, "loss_ce": 0.003695129416882992, "loss_lvr": 0.3139916956424713, "loss_mode_switch": 0.0, "loss_total": 0.03509430214762688, "step": 2334 }, { "batch_size": 1, "epoch": 0.9336, "step": 2334, "tokens_per_device": 4885 }, { "epoch": 0.9336, "loss_ce": 0.022824469953775406, "loss_lvr": 0.4008411765098572, "loss_mode_switch": 0.0, "loss_total": 0.0629085898399353, "step": 2334 }, { "batch_size": 4, "epoch": 0.9336, "step": 2334, "tokens_per_device": 10316 }, { "epoch": 0.9336, "loss_ce": 0.22211986780166626, "loss_lvr": 0.8124978542327881, "loss_mode_switch": 0.0, "loss_total": 0.3033696413040161, "step": 2334 }, { "batch_size": 1, "epoch": 0.9336, "step": 2334, "tokens_per_device": 6073 }, { "epoch": 0.9336, "loss_ce": 0.032474204897880554, "loss_lvr": 0.2578275203704834, "loss_mode_switch": 0.0, "loss_total": 0.058256957679986954, "step": 2334 }, { "batch_size": 4, "epoch": 0.9336, "step": 2334, "tokens_per_device": 4252 }, { "epoch": 0.9336, "loss_ce": 0.027326827868819237, "loss_lvr": 1.0901216268539429, "loss_mode_switch": 0.0, "loss_total": 0.1363389939069748, "step": 2334 }, { "epoch": 0.934, "grad_norm": 1.3793652057647705, "learning_rate": 1.1379675099373489e-07, "loss": 0.254, "step": 2335 }, { "batch_size": 4, "epoch": 0.934, "step": 2335, "tokens_per_device": 4224 }, { "epoch": 0.934, "loss_ce": 0.03319282457232475, "loss_lvr": 0.5723572969436646, "loss_mode_switch": 0.0, "loss_total": 0.09042855352163315, "step": 2335 }, { "batch_size": 4, "epoch": 0.934, "step": 2335, "tokens_per_device": 4576 }, { "epoch": 0.934, "loss_ce": 0.23876705765724182, "loss_lvr": 0.6323774456977844, "loss_mode_switch": 0.0, "loss_total": 0.3020048141479492, "step": 2335 }, { "batch_size": 1, "epoch": 0.934, "step": 2335, "tokens_per_device": 4879 }, { "epoch": 0.934, "loss_ce": 0.004868319723755121, "loss_lvr": 1.2183650732040405, "loss_mode_switch": 0.0, "loss_total": 0.12670482695102692, "step": 2335 }, { "batch_size": 4, "epoch": 0.934, "step": 2335, "tokens_per_device": 3760 }, { "epoch": 0.934, "loss_ce": 0.30432671308517456, "loss_lvr": 0.9397820234298706, "loss_mode_switch": 0.0, "loss_total": 0.39830490946769714, "step": 2335 }, { "batch_size": 4, "epoch": 0.934, "step": 2335, "tokens_per_device": 10364 }, { "epoch": 0.934, "loss_ce": 0.15959201753139496, "loss_lvr": 0.6445058584213257, "loss_mode_switch": 0.0, "loss_total": 0.2240425944328308, "step": 2335 }, { "batch_size": 4, "epoch": 0.934, "step": 2335, "tokens_per_device": 1608 }, { "epoch": 0.934, "loss_ce": 0.10408996045589447, "loss_lvr": 0.9153839349746704, "loss_mode_switch": 0.0, "loss_total": 0.1956283450126648, "step": 2335 }, { "batch_size": 4, "epoch": 0.934, "step": 2335, "tokens_per_device": 4228 }, { "epoch": 0.934, "loss_ce": 0.09085165709257126, "loss_lvr": 1.0265352725982666, "loss_mode_switch": 0.0, "loss_total": 0.1935051828622818, "step": 2335 }, { "batch_size": 4, "epoch": 0.934, "step": 2335, "tokens_per_device": 4696 }, { "epoch": 0.934, "loss_ce": 0.3049105107784271, "loss_lvr": 0.8288732767105103, "loss_mode_switch": 0.0, "loss_total": 0.38779783248901367, "step": 2335 }, { "epoch": 0.9344, "grad_norm": 1.3073159456253052, "learning_rate": 1.1242675331500363e-07, "loss": 0.2645, "step": 2336 }, { "batch_size": 1, "epoch": 0.9344, "step": 2336, "tokens_per_device": 5127 }, { "epoch": 0.9344, "loss_ce": 0.1028011366724968, "loss_lvr": 0.392803817987442, "loss_mode_switch": 0.0, "loss_total": 0.14208151400089264, "step": 2336 }, { "batch_size": 4, "epoch": 0.9344, "step": 2336, "tokens_per_device": 5892 }, { "epoch": 0.9344, "loss_ce": 0.04208119586110115, "loss_lvr": 0.754395604133606, "loss_mode_switch": 0.0, "loss_total": 0.1175207644701004, "step": 2336 }, { "batch_size": 4, "epoch": 0.9344, "step": 2336, "tokens_per_device": 1996 }, { "epoch": 0.9344, "loss_ce": 0.11754533648490906, "loss_lvr": 0.7195149064064026, "loss_mode_switch": 0.0, "loss_total": 0.18949683010578156, "step": 2336 }, { "batch_size": 4, "epoch": 0.9344, "step": 2336, "tokens_per_device": 3052 }, { "epoch": 0.9344, "loss_ce": 0.8962348103523254, "loss_lvr": 1.1808252334594727, "loss_mode_switch": 0.0, "loss_total": 1.014317274093628, "step": 2336 }, { "batch_size": 4, "epoch": 0.9344, "step": 2336, "tokens_per_device": 4392 }, { "epoch": 0.9344, "loss_ce": 0.11391706019639969, "loss_lvr": 0.7833105325698853, "loss_mode_switch": 0.0, "loss_total": 0.19224810600280762, "step": 2336 }, { "batch_size": 4, "epoch": 0.9344, "step": 2336, "tokens_per_device": 4832 }, { "epoch": 0.9344, "loss_ce": 0.04695460572838783, "loss_lvr": 1.027997374534607, "loss_mode_switch": 0.0, "loss_total": 0.1497543454170227, "step": 2336 }, { "batch_size": 4, "epoch": 0.9344, "step": 2336, "tokens_per_device": 3036 }, { "epoch": 0.9344, "loss_ce": 0.23169806599617004, "loss_lvr": 1.1498364210128784, "loss_mode_switch": 0.0, "loss_total": 0.34668171405792236, "step": 2336 }, { "batch_size": 4, "epoch": 0.9344, "step": 2336, "tokens_per_device": 2628 }, { "epoch": 0.9344, "loss_ce": 0.07121697813272476, "loss_lvr": 0.6481181383132935, "loss_mode_switch": 0.0, "loss_total": 0.13602879643440247, "step": 2336 }, { "epoch": 0.9348, "grad_norm": 1.3051838874816895, "learning_rate": 1.1106495857519162e-07, "loss": 0.2629, "step": 2337 }, { "batch_size": 4, "epoch": 0.9348, "step": 2337, "tokens_per_device": 1464 }, { "epoch": 0.9348, "loss_ce": 0.11697296053171158, "loss_lvr": 1.0141061544418335, "loss_mode_switch": 0.0, "loss_total": 0.2183835804462433, "step": 2337 }, { "batch_size": 4, "epoch": 0.9348, "step": 2337, "tokens_per_device": 4256 }, { "epoch": 0.9348, "loss_ce": 0.08793189376592636, "loss_lvr": 0.7282611727714539, "loss_mode_switch": 0.0, "loss_total": 0.16075801849365234, "step": 2337 }, { "batch_size": 4, "epoch": 0.9348, "step": 2337, "tokens_per_device": 4564 }, { "epoch": 0.9348, "loss_ce": 0.06633021682500839, "loss_lvr": 0.9631494879722595, "loss_mode_switch": 0.0, "loss_total": 0.162645161151886, "step": 2337 }, { "batch_size": 1, "epoch": 0.9348, "step": 2337, "tokens_per_device": 4836 }, { "epoch": 0.9348, "loss_ce": 0.03902776539325714, "loss_lvr": 0.5152645111083984, "loss_mode_switch": 0.0, "loss_total": 0.09055422246456146, "step": 2337 }, { "batch_size": 1, "epoch": 0.9348, "step": 2337, "tokens_per_device": 4897 }, { "epoch": 0.9348, "loss_ce": 0.16436465084552765, "loss_lvr": 0.4325697124004364, "loss_mode_switch": 0.0, "loss_total": 0.20762161910533905, "step": 2337 }, { "batch_size": 4, "epoch": 0.9348, "step": 2337, "tokens_per_device": 4368 }, { "epoch": 0.9348, "loss_ce": 0.6572903394699097, "loss_lvr": 0.9135462045669556, "loss_mode_switch": 0.0, "loss_total": 0.7486449480056763, "step": 2337 }, { "batch_size": 4, "epoch": 0.9348, "step": 2337, "tokens_per_device": 2612 }, { "epoch": 0.9348, "loss_ce": 0.13100884854793549, "loss_lvr": 0.6737512350082397, "loss_mode_switch": 0.0, "loss_total": 0.19838397204875946, "step": 2337 }, { "batch_size": 1, "epoch": 0.9348, "step": 2337, "tokens_per_device": 5070 }, { "epoch": 0.9348, "loss_ce": 0.03853815793991089, "loss_lvr": 0.38510483503341675, "loss_mode_switch": 0.0, "loss_total": 0.0770486444234848, "step": 2337 }, { "epoch": 0.9352, "grad_norm": 1.354893445968628, "learning_rate": 1.0971136905983282e-07, "loss": 0.2877, "step": 2338 }, { "batch_size": 4, "epoch": 0.9352, "step": 2338, "tokens_per_device": 4408 }, { "epoch": 0.9352, "loss_ce": 0.12356918305158615, "loss_lvr": 0.7769219875335693, "loss_mode_switch": 0.0, "loss_total": 0.20126138627529144, "step": 2338 }, { "batch_size": 1, "epoch": 0.9352, "step": 2338, "tokens_per_device": 4984 }, { "epoch": 0.9352, "loss_ce": 0.39672088623046875, "loss_lvr": 0.6612575054168701, "loss_mode_switch": 0.0, "loss_total": 0.46284663677215576, "step": 2338 }, { "batch_size": 1, "epoch": 0.9352, "step": 2338, "tokens_per_device": 4858 }, { "epoch": 0.9352, "loss_ce": 0.0965585932135582, "loss_lvr": 0.20715190470218658, "loss_mode_switch": 0.0, "loss_total": 0.11727378517389297, "step": 2338 }, { "batch_size": 4, "epoch": 0.9352, "step": 2338, "tokens_per_device": 3880 }, { "epoch": 0.9352, "loss_ce": 0.049809232354164124, "loss_lvr": 0.8771030902862549, "loss_mode_switch": 0.0, "loss_total": 0.13751953840255737, "step": 2338 }, { "batch_size": 4, "epoch": 0.9352, "step": 2338, "tokens_per_device": 5168 }, { "epoch": 0.9352, "loss_ce": 0.26083582639694214, "loss_lvr": 0.6242960095405579, "loss_mode_switch": 0.0, "loss_total": 0.3232654333114624, "step": 2338 }, { "batch_size": 4, "epoch": 0.9352, "step": 2338, "tokens_per_device": 1260 }, { "epoch": 0.9352, "loss_ce": 0.11137054115533829, "loss_lvr": 0.9606200456619263, "loss_mode_switch": 0.0, "loss_total": 0.20743253827095032, "step": 2338 }, { "batch_size": 4, "epoch": 0.9352, "step": 2338, "tokens_per_device": 4048 }, { "epoch": 0.9352, "loss_ce": 0.03581686690449715, "loss_lvr": 1.5245792865753174, "loss_mode_switch": 0.0, "loss_total": 0.1882748007774353, "step": 2338 }, { "batch_size": 4, "epoch": 0.9352, "step": 2338, "tokens_per_device": 8780 }, { "epoch": 0.9352, "loss_ce": 0.048053402453660965, "loss_lvr": 2.892245054244995, "loss_mode_switch": 0.0, "loss_total": 0.33727791905403137, "step": 2338 }, { "epoch": 0.9356, "grad_norm": 1.2648773193359375, "learning_rate": 1.0836598704069057e-07, "loss": 0.2782, "step": 2339 }, { "batch_size": 4, "epoch": 0.9356, "step": 2339, "tokens_per_device": 3756 }, { "epoch": 0.9356, "loss_ce": 0.24821607768535614, "loss_lvr": 1.0770795345306396, "loss_mode_switch": 0.0, "loss_total": 0.3559240400791168, "step": 2339 }, { "batch_size": 1, "epoch": 0.9356, "step": 2339, "tokens_per_device": 6787 }, { "epoch": 0.9356, "loss_ce": 0.0026390624698251486, "loss_lvr": 0.3607693314552307, "loss_mode_switch": 0.0, "loss_total": 0.038715995848178864, "step": 2339 }, { "batch_size": 1, "epoch": 0.9356, "step": 2339, "tokens_per_device": 5226 }, { "epoch": 0.9356, "loss_ce": 0.16930879652500153, "loss_lvr": 0.3934992849826813, "loss_mode_switch": 0.0, "loss_total": 0.20865872502326965, "step": 2339 }, { "batch_size": 1, "epoch": 0.9356, "step": 2339, "tokens_per_device": 4803 }, { "epoch": 0.9356, "loss_ce": 0.10916391015052795, "loss_lvr": 0.17097455263137817, "loss_mode_switch": 0.0, "loss_total": 0.126261368393898, "step": 2339 }, { "batch_size": 4, "epoch": 0.9356, "step": 2339, "tokens_per_device": 4716 }, { "epoch": 0.9356, "loss_ce": 0.25057634711265564, "loss_lvr": 0.7239864468574524, "loss_mode_switch": 0.0, "loss_total": 0.3229749798774719, "step": 2339 }, { "batch_size": 1, "epoch": 0.9356, "step": 2339, "tokens_per_device": 4869 }, { "epoch": 0.9356, "loss_ce": 0.109986811876297, "loss_lvr": 0.397441029548645, "loss_mode_switch": 0.0, "loss_total": 0.14973092079162598, "step": 2339 }, { "batch_size": 4, "epoch": 0.9356, "step": 2339, "tokens_per_device": 5652 }, { "epoch": 0.9356, "loss_ce": 0.2042708545923233, "loss_lvr": 0.7714722752571106, "loss_mode_switch": 0.0, "loss_total": 0.2814180850982666, "step": 2339 }, { "batch_size": 4, "epoch": 0.9356, "step": 2339, "tokens_per_device": 2728 }, { "epoch": 0.9356, "loss_ce": 0.528893232345581, "loss_lvr": 0.8415325880050659, "loss_mode_switch": 0.0, "loss_total": 0.6130464673042297, "step": 2339 }, { "epoch": 0.936, "grad_norm": 1.1494709253311157, "learning_rate": 1.0702881477575589e-07, "loss": 0.2637, "step": 2340 }, { "batch_size": 1, "epoch": 0.936, "step": 2340, "tokens_per_device": 5113 }, { "epoch": 0.936, "loss_ce": 0.1652679145336151, "loss_lvr": 0.4247758984565735, "loss_mode_switch": 0.0, "loss_total": 0.2077455073595047, "step": 2340 }, { "batch_size": 1, "epoch": 0.936, "step": 2340, "tokens_per_device": 5161 }, { "epoch": 0.936, "loss_ce": 0.0685540959239006, "loss_lvr": 0.5714261531829834, "loss_mode_switch": 0.0, "loss_total": 0.12569671869277954, "step": 2340 }, { "batch_size": 4, "epoch": 0.936, "step": 2340, "tokens_per_device": 2760 }, { "epoch": 0.936, "loss_ce": 0.3495337963104248, "loss_lvr": 0.8690731525421143, "loss_mode_switch": 0.0, "loss_total": 0.4364411234855652, "step": 2340 }, { "batch_size": 4, "epoch": 0.936, "step": 2340, "tokens_per_device": 4120 }, { "epoch": 0.936, "loss_ce": 0.5024732947349548, "loss_lvr": 1.162453532218933, "loss_mode_switch": 0.0, "loss_total": 0.6187186241149902, "step": 2340 }, { "batch_size": 4, "epoch": 0.936, "step": 2340, "tokens_per_device": 3760 }, { "epoch": 0.936, "loss_ce": 0.48553466796875, "loss_lvr": 0.937283456325531, "loss_mode_switch": 0.0, "loss_total": 0.5792630314826965, "step": 2340 }, { "batch_size": 4, "epoch": 0.936, "step": 2340, "tokens_per_device": 4280 }, { "epoch": 0.936, "loss_ce": 0.5829944610595703, "loss_lvr": 0.5937144160270691, "loss_mode_switch": 0.0, "loss_total": 0.6423659324645996, "step": 2340 }, { "batch_size": 1, "epoch": 0.936, "step": 2340, "tokens_per_device": 4674 }, { "epoch": 0.936, "loss_ce": 0.003650204511359334, "loss_lvr": 0.2556239664554596, "loss_mode_switch": 0.0, "loss_total": 0.029212601482868195, "step": 2340 }, { "batch_size": 4, "epoch": 0.936, "step": 2340, "tokens_per_device": 5132 }, { "epoch": 0.936, "loss_ce": 0.2988573908805847, "loss_lvr": 0.7912039160728455, "loss_mode_switch": 0.0, "loss_total": 0.37797778844833374, "step": 2340 }, { "epoch": 0.9364, "grad_norm": 1.3760579824447632, "learning_rate": 1.0569985450923803e-07, "loss": 0.3131, "step": 2341 }, { "batch_size": 4, "epoch": 0.9364, "step": 2341, "tokens_per_device": 4224 }, { "epoch": 0.9364, "loss_ce": 0.23305173218250275, "loss_lvr": 1.0491821765899658, "loss_mode_switch": 0.0, "loss_total": 0.33796995878219604, "step": 2341 }, { "batch_size": 4, "epoch": 0.9364, "step": 2341, "tokens_per_device": 3828 }, { "epoch": 0.9364, "loss_ce": 0.1108558401465416, "loss_lvr": 0.5947412848472595, "loss_mode_switch": 0.0, "loss_total": 0.1703299731016159, "step": 2341 }, { "batch_size": 4, "epoch": 0.9364, "step": 2341, "tokens_per_device": 4052 }, { "epoch": 0.9364, "loss_ce": 0.0011665492784231901, "loss_lvr": 0.6030513048171997, "loss_mode_switch": 0.0, "loss_total": 0.06147167831659317, "step": 2341 }, { "batch_size": 1, "epoch": 0.9364, "step": 2341, "tokens_per_device": 4897 }, { "epoch": 0.9364, "loss_ce": 0.008572482503950596, "loss_lvr": 0.40071940422058105, "loss_mode_switch": 0.0, "loss_total": 0.048644423484802246, "step": 2341 }, { "batch_size": 4, "epoch": 0.9364, "step": 2341, "tokens_per_device": 2648 }, { "epoch": 0.9364, "loss_ce": 0.38254621624946594, "loss_lvr": 0.7994714379310608, "loss_mode_switch": 0.0, "loss_total": 0.462493360042572, "step": 2341 }, { "batch_size": 4, "epoch": 0.9364, "step": 2341, "tokens_per_device": 4376 }, { "epoch": 0.9364, "loss_ce": 0.16950668394565582, "loss_lvr": 1.0137664079666138, "loss_mode_switch": 0.0, "loss_total": 0.27088332176208496, "step": 2341 }, { "batch_size": 4, "epoch": 0.9364, "step": 2341, "tokens_per_device": 4244 }, { "epoch": 0.9364, "loss_ce": 0.25835350155830383, "loss_lvr": 1.0407075881958008, "loss_mode_switch": 0.0, "loss_total": 0.36242425441741943, "step": 2341 }, { "batch_size": 4, "epoch": 0.9364, "step": 2341, "tokens_per_device": 2588 }, { "epoch": 0.9364, "loss_ce": 0.021310487762093544, "loss_lvr": 1.007838487625122, "loss_mode_switch": 0.0, "loss_total": 0.12209433317184448, "step": 2341 }, { "epoch": 0.9368, "grad_norm": 1.3104088306427002, "learning_rate": 1.0437910847156507e-07, "loss": 0.2804, "step": 2342 }, { "batch_size": 4, "epoch": 0.9368, "step": 2342, "tokens_per_device": 4624 }, { "epoch": 0.9368, "loss_ce": 0.16073152422904968, "loss_lvr": 0.8508477807044983, "loss_mode_switch": 0.0, "loss_total": 0.24581630527973175, "step": 2342 }, { "batch_size": 4, "epoch": 0.9368, "step": 2342, "tokens_per_device": 2680 }, { "epoch": 0.9368, "loss_ce": 0.3778412640094757, "loss_lvr": 0.7726591229438782, "loss_mode_switch": 0.0, "loss_total": 0.455107182264328, "step": 2342 }, { "batch_size": 4, "epoch": 0.9368, "step": 2342, "tokens_per_device": 1500 }, { "epoch": 0.9368, "loss_ce": 0.6138375401496887, "loss_lvr": 0.9530989527702332, "loss_mode_switch": 0.0, "loss_total": 0.7091474533081055, "step": 2342 }, { "batch_size": 4, "epoch": 0.9368, "step": 2342, "tokens_per_device": 5904 }, { "epoch": 0.9368, "loss_ce": 0.08273476362228394, "loss_lvr": 0.741483747959137, "loss_mode_switch": 0.0, "loss_total": 0.1568831503391266, "step": 2342 }, { "batch_size": 1, "epoch": 0.9368, "step": 2342, "tokens_per_device": 5160 }, { "epoch": 0.9368, "loss_ce": 0.5423431396484375, "loss_lvr": 0.24669796228408813, "loss_mode_switch": 0.0, "loss_total": 0.5670129060745239, "step": 2342 }, { "batch_size": 1, "epoch": 0.9368, "step": 2342, "tokens_per_device": 5177 }, { "epoch": 0.9368, "loss_ce": 0.010084117762744427, "loss_lvr": 0.2353033423423767, "loss_mode_switch": 0.0, "loss_total": 0.03361445292830467, "step": 2342 }, { "batch_size": 1, "epoch": 0.9368, "step": 2342, "tokens_per_device": 5123 }, { "epoch": 0.9368, "loss_ce": 0.00997393298894167, "loss_lvr": 0.440003365278244, "loss_mode_switch": 0.0, "loss_total": 0.053974270820617676, "step": 2342 }, { "batch_size": 4, "epoch": 0.9368, "step": 2342, "tokens_per_device": 2612 }, { "epoch": 0.9368, "loss_ce": 0.43175724148750305, "loss_lvr": 0.8149105906486511, "loss_mode_switch": 0.0, "loss_total": 0.5132483243942261, "step": 2342 }, { "epoch": 0.9372, "grad_norm": 2.9368488788604736, "learning_rate": 1.0306657887937833e-07, "loss": 0.2762, "step": 2343 }, { "batch_size": 4, "epoch": 0.9372, "step": 2343, "tokens_per_device": 6024 }, { "epoch": 0.9372, "loss_ce": 0.08877987414598465, "loss_lvr": 0.6640781760215759, "loss_mode_switch": 0.0, "loss_total": 0.1551876962184906, "step": 2343 }, { "batch_size": 4, "epoch": 0.9372, "step": 2343, "tokens_per_device": 4168 }, { "epoch": 0.9372, "loss_ce": 0.1900356411933899, "loss_lvr": 0.667565107345581, "loss_mode_switch": 0.0, "loss_total": 0.2567921578884125, "step": 2343 }, { "batch_size": 4, "epoch": 0.9372, "step": 2343, "tokens_per_device": 1356 }, { "epoch": 0.9372, "loss_ce": 0.49973785877227783, "loss_lvr": 1.0393322706222534, "loss_mode_switch": 0.0, "loss_total": 0.6036710739135742, "step": 2343 }, { "batch_size": 4, "epoch": 0.9372, "step": 2343, "tokens_per_device": 15980 }, { "epoch": 0.9372, "loss_ce": 0.20394693315029144, "loss_lvr": 0.8745065927505493, "loss_mode_switch": 0.0, "loss_total": 0.2913976013660431, "step": 2343 }, { "batch_size": 1, "epoch": 0.9372, "step": 2343, "tokens_per_device": 4871 }, { "epoch": 0.9372, "loss_ce": 0.008163384161889553, "loss_lvr": 0.3778076767921448, "loss_mode_switch": 0.0, "loss_total": 0.045944154262542725, "step": 2343 }, { "batch_size": 4, "epoch": 0.9372, "step": 2343, "tokens_per_device": 4380 }, { "epoch": 0.9372, "loss_ce": 0.19820381700992584, "loss_lvr": 0.8551680445671082, "loss_mode_switch": 0.0, "loss_total": 0.28372061252593994, "step": 2343 }, { "batch_size": 4, "epoch": 0.9372, "step": 2343, "tokens_per_device": 5372 }, { "epoch": 0.9372, "loss_ce": 0.009887195192277431, "loss_lvr": 0.6142760515213013, "loss_mode_switch": 0.0, "loss_total": 0.07131479680538177, "step": 2343 }, { "batch_size": 4, "epoch": 0.9372, "step": 2343, "tokens_per_device": 2768 }, { "epoch": 0.9372, "loss_ce": 0.31629687547683716, "loss_lvr": 0.9585924744606018, "loss_mode_switch": 0.0, "loss_total": 0.4121561348438263, "step": 2343 }, { "epoch": 0.9376, "grad_norm": 1.304208755493164, "learning_rate": 1.0176226793552957e-07, "loss": 0.3067, "step": 2344 }, { "batch_size": 4, "epoch": 0.9376, "step": 2344, "tokens_per_device": 15612 }, { "epoch": 0.9376, "loss_ce": 0.06255121529102325, "loss_lvr": 0.7075557112693787, "loss_mode_switch": 0.0, "loss_total": 0.13330678641796112, "step": 2344 }, { "batch_size": 1, "epoch": 0.9376, "step": 2344, "tokens_per_device": 5115 }, { "epoch": 0.9376, "loss_ce": 0.04971764609217644, "loss_lvr": 0.4098969101905823, "loss_mode_switch": 0.0, "loss_total": 0.09070733934640884, "step": 2344 }, { "batch_size": 4, "epoch": 0.9376, "step": 2344, "tokens_per_device": 2456 }, { "epoch": 0.9376, "loss_ce": 0.6086665391921997, "loss_lvr": 1.2869446277618408, "loss_mode_switch": 0.0, "loss_total": 0.7373610138893127, "step": 2344 }, { "batch_size": 4, "epoch": 0.9376, "step": 2344, "tokens_per_device": 2724 }, { "epoch": 0.9376, "loss_ce": 0.27458640933036804, "loss_lvr": 0.6561363339424133, "loss_mode_switch": 0.0, "loss_total": 0.3402000367641449, "step": 2344 }, { "batch_size": 4, "epoch": 0.9376, "step": 2344, "tokens_per_device": 2696 }, { "epoch": 0.9376, "loss_ce": 0.0864231064915657, "loss_lvr": 0.8459844589233398, "loss_mode_switch": 0.0, "loss_total": 0.17102155089378357, "step": 2344 }, { "batch_size": 4, "epoch": 0.9376, "step": 2344, "tokens_per_device": 14468 }, { "epoch": 0.9376, "loss_ce": 0.13005420565605164, "loss_lvr": 0.7227846384048462, "loss_mode_switch": 0.0, "loss_total": 0.20233267545700073, "step": 2344 }, { "batch_size": 1, "epoch": 0.9376, "step": 2344, "tokens_per_device": 5114 }, { "epoch": 0.9376, "loss_ce": 0.008094120770692825, "loss_lvr": 0.20128855109214783, "loss_mode_switch": 0.0, "loss_total": 0.028222976252436638, "step": 2344 }, { "batch_size": 1, "epoch": 0.9376, "step": 2344, "tokens_per_device": 5560 }, { "epoch": 0.9376, "loss_ce": 0.009702534414827824, "loss_lvr": 0.3283342123031616, "loss_mode_switch": 0.0, "loss_total": 0.04253595694899559, "step": 2344 }, { "epoch": 0.938, "grad_norm": 1.2499570846557617, "learning_rate": 1.004661778290783e-07, "loss": 0.2515, "step": 2345 }, { "batch_size": 1, "epoch": 0.938, "step": 2345, "tokens_per_device": 5386 }, { "epoch": 0.938, "loss_ce": 0.015147135592997074, "loss_lvr": 0.27788224816322327, "loss_mode_switch": 0.0, "loss_total": 0.042935360223054886, "step": 2345 }, { "batch_size": 4, "epoch": 0.938, "step": 2345, "tokens_per_device": 6052 }, { "epoch": 0.938, "loss_ce": 0.10235375165939331, "loss_lvr": 0.6142319440841675, "loss_mode_switch": 0.0, "loss_total": 0.1637769490480423, "step": 2345 }, { "batch_size": 1, "epoch": 0.938, "step": 2345, "tokens_per_device": 4894 }, { "epoch": 0.938, "loss_ce": 0.0389319509267807, "loss_lvr": 0.1861865371465683, "loss_mode_switch": 0.0, "loss_total": 0.05755060538649559, "step": 2345 }, { "batch_size": 4, "epoch": 0.938, "step": 2345, "tokens_per_device": 3756 }, { "epoch": 0.938, "loss_ce": 0.4045878052711487, "loss_lvr": 1.1045030355453491, "loss_mode_switch": 0.0, "loss_total": 0.5150381326675415, "step": 2345 }, { "batch_size": 4, "epoch": 0.938, "step": 2345, "tokens_per_device": 3728 }, { "epoch": 0.938, "loss_ce": 0.1719907969236374, "loss_lvr": 0.3367077708244324, "loss_mode_switch": 0.0, "loss_total": 0.2056615799665451, "step": 2345 }, { "batch_size": 1, "epoch": 0.938, "step": 2345, "tokens_per_device": 5199 }, { "epoch": 0.938, "loss_ce": 0.0019329618662595749, "loss_lvr": 0.6026889085769653, "loss_mode_switch": 0.0, "loss_total": 0.0622018501162529, "step": 2345 }, { "batch_size": 4, "epoch": 0.938, "step": 2345, "tokens_per_device": 5052 }, { "epoch": 0.938, "loss_ce": 0.3056561052799225, "loss_lvr": 0.9948397874832153, "loss_mode_switch": 0.0, "loss_total": 0.40514010190963745, "step": 2345 }, { "batch_size": 4, "epoch": 0.938, "step": 2345, "tokens_per_device": 3496 }, { "epoch": 0.938, "loss_ce": 0.25270694494247437, "loss_lvr": 0.9275684952735901, "loss_mode_switch": 0.0, "loss_total": 0.3454638123512268, "step": 2345 }, { "epoch": 0.9384, "grad_norm": 1.2649942636489868, "learning_rate": 9.917831073528506e-08, "loss": 0.2622, "step": 2346 }, { "batch_size": 4, "epoch": 0.9384, "step": 2346, "tokens_per_device": 7076 }, { "epoch": 0.9384, "loss_ce": 0.15174326300621033, "loss_lvr": 0.49896615743637085, "loss_mode_switch": 0.0, "loss_total": 0.20163987576961517, "step": 2346 }, { "batch_size": 4, "epoch": 0.9384, "step": 2346, "tokens_per_device": 5264 }, { "epoch": 0.9384, "loss_ce": 0.10023057460784912, "loss_lvr": 0.6123730540275574, "loss_mode_switch": 0.0, "loss_total": 0.16146788001060486, "step": 2346 }, { "batch_size": 4, "epoch": 0.9384, "step": 2346, "tokens_per_device": 4300 }, { "epoch": 0.9384, "loss_ce": 0.5560509562492371, "loss_lvr": 0.9873165488243103, "loss_mode_switch": 0.0, "loss_total": 0.6547825932502747, "step": 2346 }, { "batch_size": 1, "epoch": 0.9384, "step": 2346, "tokens_per_device": 4921 }, { "epoch": 0.9384, "loss_ce": 0.04522987827658653, "loss_lvr": 0.777519702911377, "loss_mode_switch": 0.0, "loss_total": 0.12298184633255005, "step": 2346 }, { "batch_size": 4, "epoch": 0.9384, "step": 2346, "tokens_per_device": 3868 }, { "epoch": 0.9384, "loss_ce": 0.6119258999824524, "loss_lvr": 0.924073338508606, "loss_mode_switch": 0.0, "loss_total": 0.7043332457542419, "step": 2346 }, { "batch_size": 1, "epoch": 0.9384, "step": 2346, "tokens_per_device": 4675 }, { "epoch": 0.9384, "loss_ce": 0.006100226193666458, "loss_lvr": 0.22082415223121643, "loss_mode_switch": 0.0, "loss_total": 0.02818264253437519, "step": 2346 }, { "batch_size": 4, "epoch": 0.9384, "step": 2346, "tokens_per_device": 3896 }, { "epoch": 0.9384, "loss_ce": 0.20705068111419678, "loss_lvr": 1.0045167207717896, "loss_mode_switch": 0.0, "loss_total": 0.3075023591518402, "step": 2346 }, { "batch_size": 1, "epoch": 0.9384, "step": 2346, "tokens_per_device": 5457 }, { "epoch": 0.9384, "loss_ce": 0.12838371098041534, "loss_lvr": 0.4295775592327118, "loss_mode_switch": 0.0, "loss_total": 0.17134146392345428, "step": 2346 }, { "epoch": 0.9388, "grad_norm": 1.3223429918289185, "learning_rate": 9.789866881560971e-08, "loss": 0.259, "step": 2347 }, { "batch_size": 1, "epoch": 0.9388, "step": 2347, "tokens_per_device": 5182 }, { "epoch": 0.9388, "loss_ce": 0.065798319876194, "loss_lvr": 0.5006431341171265, "loss_mode_switch": 0.0, "loss_total": 0.115862637758255, "step": 2347 }, { "batch_size": 1, "epoch": 0.9388, "step": 2347, "tokens_per_device": 4928 }, { "epoch": 0.9388, "loss_ce": 0.013714541681110859, "loss_lvr": 0.2955627739429474, "loss_mode_switch": 0.0, "loss_total": 0.04327081888914108, "step": 2347 }, { "batch_size": 4, "epoch": 0.9388, "step": 2347, "tokens_per_device": 2544 }, { "epoch": 0.9388, "loss_ce": 0.12841041386127472, "loss_lvr": 1.033376932144165, "loss_mode_switch": 0.0, "loss_total": 0.23174810409545898, "step": 2347 }, { "batch_size": 4, "epoch": 0.9388, "step": 2347, "tokens_per_device": 5416 }, { "epoch": 0.9388, "loss_ce": 0.11627663671970367, "loss_lvr": 0.6956402063369751, "loss_mode_switch": 0.0, "loss_total": 0.1858406662940979, "step": 2347 }, { "batch_size": 4, "epoch": 0.9388, "step": 2347, "tokens_per_device": 4188 }, { "epoch": 0.9388, "loss_ce": 0.04149331524968147, "loss_lvr": 0.9061185717582703, "loss_mode_switch": 0.0, "loss_total": 0.13210517168045044, "step": 2347 }, { "batch_size": 4, "epoch": 0.9388, "step": 2347, "tokens_per_device": 4208 }, { "epoch": 0.9388, "loss_ce": 0.20175351202487946, "loss_lvr": 0.7753341794013977, "loss_mode_switch": 0.0, "loss_total": 0.2792869210243225, "step": 2347 }, { "batch_size": 1, "epoch": 0.9388, "step": 2347, "tokens_per_device": 4923 }, { "epoch": 0.9388, "loss_ce": 0.0596621111035347, "loss_lvr": 0.33347928524017334, "loss_mode_switch": 0.0, "loss_total": 0.09301003813743591, "step": 2347 }, { "batch_size": 4, "epoch": 0.9388, "step": 2347, "tokens_per_device": 4936 }, { "epoch": 0.9388, "loss_ce": 0.2144821584224701, "loss_lvr": 0.8735836148262024, "loss_mode_switch": 0.0, "loss_total": 0.30184051394462585, "step": 2347 }, { "epoch": 0.9392, "grad_norm": 1.068050503730774, "learning_rate": 9.66272542177077e-08, "loss": 0.2323, "step": 2348 }, { "batch_size": 1, "epoch": 0.9392, "step": 2348, "tokens_per_device": 5104 }, { "epoch": 0.9392, "loss_ce": 0.02070201002061367, "loss_lvr": 0.5121844410896301, "loss_mode_switch": 0.0, "loss_total": 0.07192045450210571, "step": 2348 }, { "batch_size": 4, "epoch": 0.9392, "step": 2348, "tokens_per_device": 3768 }, { "epoch": 0.9392, "loss_ce": 0.002345914486795664, "loss_lvr": 0.5434530973434448, "loss_mode_switch": 0.0, "loss_total": 0.05669122561812401, "step": 2348 }, { "batch_size": 1, "epoch": 0.9392, "step": 2348, "tokens_per_device": 5075 }, { "epoch": 0.9392, "loss_ce": 0.02600550465285778, "loss_lvr": 0.4520622491836548, "loss_mode_switch": 0.0, "loss_total": 0.07121173292398453, "step": 2348 }, { "batch_size": 4, "epoch": 0.9392, "step": 2348, "tokens_per_device": 3592 }, { "epoch": 0.9392, "loss_ce": 0.3801596760749817, "loss_lvr": 0.829626739025116, "loss_mode_switch": 0.0, "loss_total": 0.4631223678588867, "step": 2348 }, { "batch_size": 1, "epoch": 0.9392, "step": 2348, "tokens_per_device": 4917 }, { "epoch": 0.9392, "loss_ce": 0.07258521020412445, "loss_lvr": 0.7893977761268616, "loss_mode_switch": 0.0, "loss_total": 0.15152499079704285, "step": 2348 }, { "batch_size": 1, "epoch": 0.9392, "step": 2348, "tokens_per_device": 6696 }, { "epoch": 0.9392, "loss_ce": 0.0795358419418335, "loss_lvr": 0.4359157383441925, "loss_mode_switch": 0.0, "loss_total": 0.12312741577625275, "step": 2348 }, { "batch_size": 4, "epoch": 0.9392, "step": 2348, "tokens_per_device": 10772 }, { "epoch": 0.9392, "loss_ce": 0.46548596024513245, "loss_lvr": 0.6087356805801392, "loss_mode_switch": 0.0, "loss_total": 0.5263595581054688, "step": 2348 }, { "batch_size": 4, "epoch": 0.9392, "step": 2348, "tokens_per_device": 4448 }, { "epoch": 0.9392, "loss_ce": 0.5207948684692383, "loss_lvr": 1.003658652305603, "loss_mode_switch": 0.0, "loss_total": 0.6211607456207275, "step": 2348 }, { "epoch": 0.9396, "grad_norm": 1.1651039123535156, "learning_rate": 9.536406907542761e-08, "loss": 0.2438, "step": 2349 }, { "batch_size": 4, "epoch": 0.9396, "step": 2349, "tokens_per_device": 4216 }, { "epoch": 0.9396, "loss_ce": 0.5134518146514893, "loss_lvr": 0.7613261938095093, "loss_mode_switch": 0.0, "loss_total": 0.5895844101905823, "step": 2349 }, { "batch_size": 4, "epoch": 0.9396, "step": 2349, "tokens_per_device": 6496 }, { "epoch": 0.9396, "loss_ce": 0.06808120757341385, "loss_lvr": 0.794990599155426, "loss_mode_switch": 0.0, "loss_total": 0.14758026599884033, "step": 2349 }, { "batch_size": 4, "epoch": 0.9396, "step": 2349, "tokens_per_device": 2616 }, { "epoch": 0.9396, "loss_ce": 0.3974199593067169, "loss_lvr": 0.8842760920524597, "loss_mode_switch": 0.0, "loss_total": 0.4858475625514984, "step": 2349 }, { "batch_size": 4, "epoch": 0.9396, "step": 2349, "tokens_per_device": 4440 }, { "epoch": 0.9396, "loss_ce": 0.05555498227477074, "loss_lvr": 0.8452849388122559, "loss_mode_switch": 0.0, "loss_total": 0.14008347690105438, "step": 2349 }, { "batch_size": 4, "epoch": 0.9396, "step": 2349, "tokens_per_device": 3804 }, { "epoch": 0.9396, "loss_ce": 0.559467613697052, "loss_lvr": 0.4232282340526581, "loss_mode_switch": 0.0, "loss_total": 0.6017904281616211, "step": 2349 }, { "batch_size": 4, "epoch": 0.9396, "step": 2349, "tokens_per_device": 1916 }, { "epoch": 0.9396, "loss_ce": 0.1599711775779724, "loss_lvr": 0.8228914737701416, "loss_mode_switch": 0.0, "loss_total": 0.24226033687591553, "step": 2349 }, { "batch_size": 4, "epoch": 0.9396, "step": 2349, "tokens_per_device": 3776 }, { "epoch": 0.9396, "loss_ce": 0.023168902844190598, "loss_lvr": 0.7359803318977356, "loss_mode_switch": 0.0, "loss_total": 0.09676693379878998, "step": 2349 }, { "batch_size": 1, "epoch": 0.9396, "step": 2349, "tokens_per_device": 4782 }, { "epoch": 0.9396, "loss_ce": 0.03206989914178848, "loss_lvr": 0.4311121702194214, "loss_mode_switch": 0.0, "loss_total": 0.07518111169338226, "step": 2349 }, { "epoch": 0.94, "grad_norm": 1.2362291812896729, "learning_rate": 9.410911550880474e-08, "loss": 0.2738, "step": 2350 }, { "batch_size": 4, "epoch": 0.94, "step": 2350, "tokens_per_device": 4220 }, { "epoch": 0.94, "loss_ce": 0.42588183283805847, "loss_lvr": 0.6756864786148071, "loss_mode_switch": 0.0, "loss_total": 0.49345049262046814, "step": 2350 }, { "batch_size": 4, "epoch": 0.94, "step": 2350, "tokens_per_device": 1420 }, { "epoch": 0.94, "loss_ce": 0.1902029663324356, "loss_lvr": 0.8889685273170471, "loss_mode_switch": 0.0, "loss_total": 0.27909982204437256, "step": 2350 }, { "batch_size": 1, "epoch": 0.94, "step": 2350, "tokens_per_device": 4950 }, { "epoch": 0.94, "loss_ce": 0.0077043697237968445, "loss_lvr": 0.3679713308811188, "loss_mode_switch": 0.0, "loss_total": 0.04450150206685066, "step": 2350 }, { "batch_size": 1, "epoch": 0.94, "step": 2350, "tokens_per_device": 5155 }, { "epoch": 0.94, "loss_ce": 0.7054068446159363, "loss_lvr": 0.3205389678478241, "loss_mode_switch": 0.0, "loss_total": 0.737460732460022, "step": 2350 }, { "batch_size": 4, "epoch": 0.94, "step": 2350, "tokens_per_device": 4536 }, { "epoch": 0.94, "loss_ce": 0.07536014914512634, "loss_lvr": 0.8697114586830139, "loss_mode_switch": 0.0, "loss_total": 0.16233129799365997, "step": 2350 }, { "batch_size": 1, "epoch": 0.94, "step": 2350, "tokens_per_device": 5152 }, { "epoch": 0.94, "loss_ce": 0.00414805393666029, "loss_lvr": 0.5320532917976379, "loss_mode_switch": 0.0, "loss_total": 0.05735338479280472, "step": 2350 }, { "batch_size": 4, "epoch": 0.94, "step": 2350, "tokens_per_device": 2660 }, { "epoch": 0.94, "loss_ce": 0.3539641797542572, "loss_lvr": 0.6134645938873291, "loss_mode_switch": 0.0, "loss_total": 0.41531065106391907, "step": 2350 }, { "batch_size": 4, "epoch": 0.94, "step": 2350, "tokens_per_device": 4488 }, { "epoch": 0.94, "loss_ce": 0.31698179244995117, "loss_lvr": 1.1179214715957642, "loss_mode_switch": 0.0, "loss_total": 0.4287739396095276, "step": 2350 }, { "epoch": 0.9404, "grad_norm": 1.218718409538269, "learning_rate": 9.286239562405985e-08, "loss": 0.2797, "step": 2351 }, { "batch_size": 4, "epoch": 0.9404, "step": 2351, "tokens_per_device": 1260 }, { "epoch": 0.9404, "loss_ce": 0.3294954299926758, "loss_lvr": 0.8510487079620361, "loss_mode_switch": 0.0, "loss_total": 0.41460031270980835, "step": 2351 }, { "batch_size": 1, "epoch": 0.9404, "step": 2351, "tokens_per_device": 4129 }, { "epoch": 0.9404, "loss_ce": 0.016019318252801895, "loss_lvr": 0.7673634886741638, "loss_mode_switch": 0.0, "loss_total": 0.09275567531585693, "step": 2351 }, { "batch_size": 4, "epoch": 0.9404, "step": 2351, "tokens_per_device": 4096 }, { "epoch": 0.9404, "loss_ce": 0.37480953335762024, "loss_lvr": 0.8010737299919128, "loss_mode_switch": 0.0, "loss_total": 0.45491689443588257, "step": 2351 }, { "batch_size": 1, "epoch": 0.9404, "step": 2351, "tokens_per_device": 5019 }, { "epoch": 0.9404, "loss_ce": 0.0029841652140021324, "loss_lvr": 0.4005357623100281, "loss_mode_switch": 0.0, "loss_total": 0.043037742376327515, "step": 2351 }, { "batch_size": 4, "epoch": 0.9404, "step": 2351, "tokens_per_device": 5760 }, { "epoch": 0.9404, "loss_ce": 0.18730373680591583, "loss_lvr": 0.7687092423439026, "loss_mode_switch": 0.0, "loss_total": 0.2641746699810028, "step": 2351 }, { "batch_size": 4, "epoch": 0.9404, "step": 2351, "tokens_per_device": 4340 }, { "epoch": 0.9404, "loss_ce": 0.008970880880951881, "loss_lvr": 0.8229429125785828, "loss_mode_switch": 0.0, "loss_total": 0.09126517176628113, "step": 2351 }, { "batch_size": 4, "epoch": 0.9404, "step": 2351, "tokens_per_device": 3608 }, { "epoch": 0.9404, "loss_ce": 0.6089428067207336, "loss_lvr": 0.7409737706184387, "loss_mode_switch": 0.0, "loss_total": 0.683040201663971, "step": 2351 }, { "batch_size": 4, "epoch": 0.9404, "step": 2351, "tokens_per_device": 4544 }, { "epoch": 0.9404, "loss_ce": 0.30207380652427673, "loss_lvr": 0.8573270440101624, "loss_mode_switch": 0.0, "loss_total": 0.3878065049648285, "step": 2351 }, { "epoch": 0.9408, "grad_norm": 1.251025676727295, "learning_rate": 9.162391151359417e-08, "loss": 0.2733, "step": 2352 }, { "batch_size": 4, "epoch": 0.9408, "step": 2352, "tokens_per_device": 5612 }, { "epoch": 0.9408, "loss_ce": 0.09182675927877426, "loss_lvr": 0.6853169202804565, "loss_mode_switch": 0.0, "loss_total": 0.1603584587574005, "step": 2352 }, { "batch_size": 4, "epoch": 0.9408, "step": 2352, "tokens_per_device": 6172 }, { "epoch": 0.9408, "loss_ce": 0.07412402331829071, "loss_lvr": 0.7350279688835144, "loss_mode_switch": 0.0, "loss_total": 0.1476268172264099, "step": 2352 }, { "batch_size": 1, "epoch": 0.9408, "step": 2352, "tokens_per_device": 7475 }, { "epoch": 0.9408, "loss_ce": 0.0001904341479530558, "loss_lvr": 0.31211766600608826, "loss_mode_switch": 0.0, "loss_total": 0.03140220046043396, "step": 2352 }, { "batch_size": 4, "epoch": 0.9408, "step": 2352, "tokens_per_device": 15100 }, { "epoch": 0.9408, "loss_ce": 0.06324702501296997, "loss_lvr": 0.5406538844108582, "loss_mode_switch": 0.0, "loss_total": 0.11731241643428802, "step": 2352 }, { "batch_size": 1, "epoch": 0.9408, "step": 2352, "tokens_per_device": 5413 }, { "epoch": 0.9408, "loss_ce": 0.007151871919631958, "loss_lvr": 0.2999930679798126, "loss_mode_switch": 0.0, "loss_total": 0.03715118020772934, "step": 2352 }, { "batch_size": 1, "epoch": 0.9408, "step": 2352, "tokens_per_device": 5074 }, { "epoch": 0.9408, "loss_ce": 0.0038400127086788416, "loss_lvr": 0.8603696823120117, "loss_mode_switch": 0.0, "loss_total": 0.08987698704004288, "step": 2352 }, { "batch_size": 4, "epoch": 0.9408, "step": 2352, "tokens_per_device": 1292 }, { "epoch": 0.9408, "loss_ce": 0.28161272406578064, "loss_lvr": 1.002143144607544, "loss_mode_switch": 0.0, "loss_total": 0.38182705640792847, "step": 2352 }, { "batch_size": 4, "epoch": 0.9408, "step": 2352, "tokens_per_device": 1544 }, { "epoch": 0.9408, "loss_ce": 0.4039124846458435, "loss_lvr": 1.033637285232544, "loss_mode_switch": 0.0, "loss_total": 0.5072762370109558, "step": 2352 }, { "epoch": 0.9412, "grad_norm": 1.5596778392791748, "learning_rate": 9.039366525598781e-08, "loss": 0.2071, "step": 2353 }, { "batch_size": 1, "epoch": 0.9412, "step": 2353, "tokens_per_device": 5171 }, { "epoch": 0.9412, "loss_ce": 0.03130444139242172, "loss_lvr": 0.5301988124847412, "loss_mode_switch": 0.0, "loss_total": 0.08432432264089584, "step": 2353 }, { "batch_size": 4, "epoch": 0.9412, "step": 2353, "tokens_per_device": 1416 }, { "epoch": 0.9412, "loss_ce": 0.70661461353302, "loss_lvr": 0.8692768812179565, "loss_mode_switch": 0.0, "loss_total": 0.7935423254966736, "step": 2353 }, { "batch_size": 4, "epoch": 0.9412, "step": 2353, "tokens_per_device": 2620 }, { "epoch": 0.9412, "loss_ce": 0.27679288387298584, "loss_lvr": 0.6774909496307373, "loss_mode_switch": 0.0, "loss_total": 0.3445419669151306, "step": 2353 }, { "batch_size": 4, "epoch": 0.9412, "step": 2353, "tokens_per_device": 2544 }, { "epoch": 0.9412, "loss_ce": 0.31183764338493347, "loss_lvr": 0.9018334746360779, "loss_mode_switch": 0.0, "loss_total": 0.40202099084854126, "step": 2353 }, { "batch_size": 4, "epoch": 0.9412, "step": 2353, "tokens_per_device": 8132 }, { "epoch": 0.9412, "loss_ce": 0.3363391160964966, "loss_lvr": 0.3944455087184906, "loss_mode_switch": 0.0, "loss_total": 0.37578368186950684, "step": 2353 }, { "batch_size": 4, "epoch": 0.9412, "step": 2353, "tokens_per_device": 4264 }, { "epoch": 0.9412, "loss_ce": 0.2500337064266205, "loss_lvr": 0.7879866361618042, "loss_mode_switch": 0.0, "loss_total": 0.32883238792419434, "step": 2353 }, { "batch_size": 4, "epoch": 0.9412, "step": 2353, "tokens_per_device": 3784 }, { "epoch": 0.9412, "loss_ce": 0.7480109930038452, "loss_lvr": 0.8288974761962891, "loss_mode_switch": 0.0, "loss_total": 0.8309007287025452, "step": 2353 }, { "batch_size": 1, "epoch": 0.9412, "step": 2353, "tokens_per_device": 6283 }, { "epoch": 0.9412, "loss_ce": 0.03426762670278549, "loss_lvr": 0.2298174798488617, "loss_mode_switch": 0.0, "loss_total": 0.05724937468767166, "step": 2353 }, { "epoch": 0.9416, "grad_norm": 1.2139123678207397, "learning_rate": 8.917165891599467e-08, "loss": 0.2324, "step": 2354 }, { "batch_size": 4, "epoch": 0.9416, "step": 2354, "tokens_per_device": 3784 }, { "epoch": 0.9416, "loss_ce": 0.42641592025756836, "loss_lvr": 0.6025606989860535, "loss_mode_switch": 0.0, "loss_total": 0.48667198419570923, "step": 2354 }, { "batch_size": 1, "epoch": 0.9416, "step": 2354, "tokens_per_device": 4861 }, { "epoch": 0.9416, "loss_ce": 0.1161474660038948, "loss_lvr": 0.2807440161705017, "loss_mode_switch": 0.0, "loss_total": 0.14422187209129333, "step": 2354 }, { "batch_size": 4, "epoch": 0.9416, "step": 2354, "tokens_per_device": 5664 }, { "epoch": 0.9416, "loss_ce": 0.37967684864997864, "loss_lvr": 0.7554916143417358, "loss_mode_switch": 0.0, "loss_total": 0.45522600412368774, "step": 2354 }, { "batch_size": 4, "epoch": 0.9416, "step": 2354, "tokens_per_device": 9672 }, { "epoch": 0.9416, "loss_ce": 0.17715579271316528, "loss_lvr": 0.6295850276947021, "loss_mode_switch": 0.0, "loss_total": 0.24011430144309998, "step": 2354 }, { "batch_size": 4, "epoch": 0.9416, "step": 2354, "tokens_per_device": 4244 }, { "epoch": 0.9416, "loss_ce": 0.07121022790670395, "loss_lvr": 0.8574729561805725, "loss_mode_switch": 0.0, "loss_total": 0.15695752203464508, "step": 2354 }, { "batch_size": 1, "epoch": 0.9416, "step": 2354, "tokens_per_device": 5115 }, { "epoch": 0.9416, "loss_ce": 0.006446133367717266, "loss_lvr": 0.2794075906276703, "loss_mode_switch": 0.0, "loss_total": 0.03438689187169075, "step": 2354 }, { "batch_size": 4, "epoch": 0.9416, "step": 2354, "tokens_per_device": 4544 }, { "epoch": 0.9416, "loss_ce": 0.40711095929145813, "loss_lvr": 0.790124237537384, "loss_mode_switch": 0.0, "loss_total": 0.48612338304519653, "step": 2354 }, { "batch_size": 4, "epoch": 0.9416, "step": 2354, "tokens_per_device": 1536 }, { "epoch": 0.9416, "loss_ce": 0.42265403270721436, "loss_lvr": 1.1530115604400635, "loss_mode_switch": 0.0, "loss_total": 0.5379551649093628, "step": 2354 }, { "epoch": 0.942, "grad_norm": 1.2054129838943481, "learning_rate": 8.795789454453862e-08, "loss": 0.2645, "step": 2355 }, { "batch_size": 1, "epoch": 0.942, "step": 2355, "tokens_per_device": 5084 }, { "epoch": 0.942, "loss_ce": 0.019522743299603462, "loss_lvr": 1.219032645225525, "loss_mode_switch": 0.0, "loss_total": 0.14142601191997528, "step": 2355 }, { "batch_size": 4, "epoch": 0.942, "step": 2355, "tokens_per_device": 2624 }, { "epoch": 0.942, "loss_ce": 0.19632461667060852, "loss_lvr": 0.8750919103622437, "loss_mode_switch": 0.0, "loss_total": 0.2838338017463684, "step": 2355 }, { "batch_size": 1, "epoch": 0.942, "step": 2355, "tokens_per_device": 6309 }, { "epoch": 0.942, "loss_ce": 0.003584491554647684, "loss_lvr": 0.5100493431091309, "loss_mode_switch": 0.0, "loss_total": 0.05458942800760269, "step": 2355 }, { "batch_size": 4, "epoch": 0.942, "step": 2355, "tokens_per_device": 4000 }, { "epoch": 0.942, "loss_ce": 0.5921578407287598, "loss_lvr": 0.8666991591453552, "loss_mode_switch": 0.0, "loss_total": 0.6788277626037598, "step": 2355 }, { "batch_size": 1, "epoch": 0.942, "step": 2355, "tokens_per_device": 5176 }, { "epoch": 0.942, "loss_ce": 0.0031612375751137733, "loss_lvr": 0.4206167161464691, "loss_mode_switch": 0.0, "loss_total": 0.04522290825843811, "step": 2355 }, { "batch_size": 4, "epoch": 0.942, "step": 2355, "tokens_per_device": 6640 }, { "epoch": 0.942, "loss_ce": 0.029277047142386436, "loss_lvr": 0.8682266473770142, "loss_mode_switch": 0.0, "loss_total": 0.11609971523284912, "step": 2355 }, { "batch_size": 4, "epoch": 0.942, "step": 2355, "tokens_per_device": 1392 }, { "epoch": 0.942, "loss_ce": 0.2597377300262451, "loss_lvr": 1.0200778245925903, "loss_mode_switch": 0.0, "loss_total": 0.3617455065250397, "step": 2355 }, { "batch_size": 4, "epoch": 0.942, "step": 2355, "tokens_per_device": 2572 }, { "epoch": 0.942, "loss_ce": 0.4526694118976593, "loss_lvr": 0.7418660521507263, "loss_mode_switch": 0.0, "loss_total": 0.526856005191803, "step": 2355 }, { "epoch": 0.9424, "grad_norm": 1.2544149160385132, "learning_rate": 8.675237417871075e-08, "loss": 0.2595, "step": 2356 }, { "batch_size": 4, "epoch": 0.9424, "step": 2356, "tokens_per_device": 4176 }, { "epoch": 0.9424, "loss_ce": 0.4160197675228119, "loss_lvr": 0.7509834170341492, "loss_mode_switch": 0.0, "loss_total": 0.49111810326576233, "step": 2356 }, { "batch_size": 4, "epoch": 0.9424, "step": 2356, "tokens_per_device": 1416 }, { "epoch": 0.9424, "loss_ce": 0.17486059665679932, "loss_lvr": 0.8817419409751892, "loss_mode_switch": 0.0, "loss_total": 0.26303479075431824, "step": 2356 }, { "batch_size": 4, "epoch": 0.9424, "step": 2356, "tokens_per_device": 1600 }, { "epoch": 0.9424, "loss_ce": 0.4621540307998657, "loss_lvr": 0.7195282578468323, "loss_mode_switch": 0.0, "loss_total": 0.5341068506240845, "step": 2356 }, { "batch_size": 4, "epoch": 0.9424, "step": 2356, "tokens_per_device": 5888 }, { "epoch": 0.9424, "loss_ce": 0.15801747143268585, "loss_lvr": 0.7595223784446716, "loss_mode_switch": 0.0, "loss_total": 0.23396971821784973, "step": 2356 }, { "batch_size": 4, "epoch": 0.9424, "step": 2356, "tokens_per_device": 6688 }, { "epoch": 0.9424, "loss_ce": 0.22712522745132446, "loss_lvr": 0.6361295580863953, "loss_mode_switch": 0.0, "loss_total": 0.29073819518089294, "step": 2356 }, { "batch_size": 4, "epoch": 0.9424, "step": 2356, "tokens_per_device": 6168 }, { "epoch": 0.9424, "loss_ce": 0.22019371390342712, "loss_lvr": 0.737548291683197, "loss_mode_switch": 0.0, "loss_total": 0.29394853115081787, "step": 2356 }, { "batch_size": 1, "epoch": 0.9424, "step": 2356, "tokens_per_device": 4892 }, { "epoch": 0.9424, "loss_ce": 0.0002558127453085035, "loss_lvr": 0.4850364029407501, "loss_mode_switch": 0.0, "loss_total": 0.04875945299863815, "step": 2356 }, { "batch_size": 4, "epoch": 0.9424, "step": 2356, "tokens_per_device": 5200 }, { "epoch": 0.9424, "loss_ce": 0.3262152075767517, "loss_lvr": 0.7582540512084961, "loss_mode_switch": 0.0, "loss_total": 0.40204060077667236, "step": 2356 }, { "epoch": 0.9428, "grad_norm": 1.3118171691894531, "learning_rate": 8.555509984176812e-08, "loss": 0.262, "step": 2357 }, { "batch_size": 4, "epoch": 0.9428, "step": 2357, "tokens_per_device": 4244 }, { "epoch": 0.9428, "loss_ce": 0.2849362790584564, "loss_lvr": 0.9036613702774048, "loss_mode_switch": 0.0, "loss_total": 0.37530243396759033, "step": 2357 }, { "batch_size": 1, "epoch": 0.9428, "step": 2357, "tokens_per_device": 5688 }, { "epoch": 0.9428, "loss_ce": 0.00042577311978675425, "loss_lvr": 0.422657310962677, "loss_mode_switch": 0.0, "loss_total": 0.04269150644540787, "step": 2357 }, { "batch_size": 4, "epoch": 0.9428, "step": 2357, "tokens_per_device": 1456 }, { "epoch": 0.9428, "loss_ce": 0.299599826335907, "loss_lvr": 0.8602946400642395, "loss_mode_switch": 0.0, "loss_total": 0.3856292963027954, "step": 2357 }, { "batch_size": 1, "epoch": 0.9428, "step": 2357, "tokens_per_device": 4879 }, { "epoch": 0.9428, "loss_ce": 0.009564966894686222, "loss_lvr": 0.762640118598938, "loss_mode_switch": 0.0, "loss_total": 0.0858289822936058, "step": 2357 }, { "batch_size": 4, "epoch": 0.9428, "step": 2357, "tokens_per_device": 6528 }, { "epoch": 0.9428, "loss_ce": 0.4287269711494446, "loss_lvr": 0.7388399243354797, "loss_mode_switch": 0.0, "loss_total": 0.502610981464386, "step": 2357 }, { "batch_size": 4, "epoch": 0.9428, "step": 2357, "tokens_per_device": 10448 }, { "epoch": 0.9428, "loss_ce": 0.09544607996940613, "loss_lvr": 0.7305096983909607, "loss_mode_switch": 0.0, "loss_total": 0.16849705576896667, "step": 2357 }, { "batch_size": 4, "epoch": 0.9428, "step": 2357, "tokens_per_device": 3788 }, { "epoch": 0.9428, "loss_ce": 0.17227700352668762, "loss_lvr": 0.8961846232414246, "loss_mode_switch": 0.0, "loss_total": 0.26189547777175903, "step": 2357 }, { "batch_size": 4, "epoch": 0.9428, "step": 2357, "tokens_per_device": 2596 }, { "epoch": 0.9428, "loss_ce": 0.1514177918434143, "loss_lvr": 0.825011670589447, "loss_mode_switch": 0.0, "loss_total": 0.2339189648628235, "step": 2357 }, { "epoch": 0.9432, "grad_norm": 1.2579741477966309, "learning_rate": 8.43660735431262e-08, "loss": 0.2468, "step": 2358 }, { "batch_size": 4, "epoch": 0.9432, "step": 2358, "tokens_per_device": 6308 }, { "epoch": 0.9432, "loss_ce": 0.37090274691581726, "loss_lvr": 0.8201407790184021, "loss_mode_switch": 0.0, "loss_total": 0.45291683077812195, "step": 2358 }, { "batch_size": 1, "epoch": 0.9432, "step": 2358, "tokens_per_device": 4866 }, { "epoch": 0.9432, "loss_ce": 0.000567814102396369, "loss_lvr": 0.35362741351127625, "loss_mode_switch": 0.0, "loss_total": 0.03593055531382561, "step": 2358 }, { "batch_size": 4, "epoch": 0.9432, "step": 2358, "tokens_per_device": 6328 }, { "epoch": 0.9432, "loss_ce": 0.3368120789527893, "loss_lvr": 0.9037332534790039, "loss_mode_switch": 0.0, "loss_total": 0.42718541622161865, "step": 2358 }, { "batch_size": 4, "epoch": 0.9432, "step": 2358, "tokens_per_device": 11512 }, { "epoch": 0.9432, "loss_ce": 0.29914194345474243, "loss_lvr": 0.7463517785072327, "loss_mode_switch": 0.0, "loss_total": 0.3737771213054657, "step": 2358 }, { "batch_size": 4, "epoch": 0.9432, "step": 2358, "tokens_per_device": 4384 }, { "epoch": 0.9432, "loss_ce": 0.3417627215385437, "loss_lvr": 0.8397738337516785, "loss_mode_switch": 0.0, "loss_total": 0.425740122795105, "step": 2358 }, { "batch_size": 4, "epoch": 0.9432, "step": 2358, "tokens_per_device": 2556 }, { "epoch": 0.9432, "loss_ce": 0.3724629282951355, "loss_lvr": 0.7513776421546936, "loss_mode_switch": 0.0, "loss_total": 0.44760069251060486, "step": 2358 }, { "batch_size": 1, "epoch": 0.9432, "step": 2358, "tokens_per_device": 4868 }, { "epoch": 0.9432, "loss_ce": 0.0004423143982421607, "loss_lvr": 0.3371170163154602, "loss_mode_switch": 0.0, "loss_total": 0.0341540165245533, "step": 2358 }, { "batch_size": 4, "epoch": 0.9432, "step": 2358, "tokens_per_device": 2584 }, { "epoch": 0.9432, "loss_ce": 0.5059128999710083, "loss_lvr": 0.9018495678901672, "loss_mode_switch": 0.0, "loss_total": 0.5960978269577026, "step": 2358 }, { "epoch": 0.9436, "grad_norm": 1.2385226488113403, "learning_rate": 8.318529727835811e-08, "loss": 0.2551, "step": 2359 }, { "batch_size": 4, "epoch": 0.9436, "step": 2359, "tokens_per_device": 4408 }, { "epoch": 0.9436, "loss_ce": 0.19004221260547638, "loss_lvr": 0.8420318365097046, "loss_mode_switch": 0.0, "loss_total": 0.27424538135528564, "step": 2359 }, { "batch_size": 4, "epoch": 0.9436, "step": 2359, "tokens_per_device": 1152 }, { "epoch": 0.9436, "loss_ce": 0.05877072364091873, "loss_lvr": 0.992607831954956, "loss_mode_switch": 0.0, "loss_total": 0.15803150832653046, "step": 2359 }, { "batch_size": 4, "epoch": 0.9436, "step": 2359, "tokens_per_device": 1504 }, { "epoch": 0.9436, "loss_ce": 0.05062943696975708, "loss_lvr": 0.9051194787025452, "loss_mode_switch": 0.0, "loss_total": 0.1411413848400116, "step": 2359 }, { "batch_size": 1, "epoch": 0.9436, "step": 2359, "tokens_per_device": 5160 }, { "epoch": 0.9436, "loss_ce": 0.00826147198677063, "loss_lvr": 0.43088650703430176, "loss_mode_switch": 0.0, "loss_total": 0.051350124180316925, "step": 2359 }, { "batch_size": 4, "epoch": 0.9436, "step": 2359, "tokens_per_device": 4240 }, { "epoch": 0.9436, "loss_ce": 0.4127289950847626, "loss_lvr": 0.9841013550758362, "loss_mode_switch": 0.0, "loss_total": 0.5111391544342041, "step": 2359 }, { "batch_size": 4, "epoch": 0.9436, "step": 2359, "tokens_per_device": 3412 }, { "epoch": 0.9436, "loss_ce": 0.5071260333061218, "loss_lvr": 0.8176491856575012, "loss_mode_switch": 0.0, "loss_total": 0.5888909697532654, "step": 2359 }, { "batch_size": 4, "epoch": 0.9436, "step": 2359, "tokens_per_device": 8648 }, { "epoch": 0.9436, "loss_ce": 0.16820867359638214, "loss_lvr": 0.7268899083137512, "loss_mode_switch": 0.0, "loss_total": 0.24089765548706055, "step": 2359 }, { "batch_size": 4, "epoch": 0.9436, "step": 2359, "tokens_per_device": 11168 }, { "epoch": 0.9436, "loss_ce": 0.6652921438217163, "loss_lvr": 0.6199966669082642, "loss_mode_switch": 0.0, "loss_total": 0.7272918224334717, "step": 2359 }, { "epoch": 0.944, "grad_norm": 1.193099856376648, "learning_rate": 8.201277302919086e-08, "loss": 0.2198, "step": 2360 }, { "batch_size": 4, "epoch": 0.944, "step": 2360, "tokens_per_device": 4608 }, { "epoch": 0.944, "loss_ce": 0.22516527771949768, "loss_lvr": 0.8281852006912231, "loss_mode_switch": 0.0, "loss_total": 0.3079838156700134, "step": 2360 }, { "batch_size": 4, "epoch": 0.944, "step": 2360, "tokens_per_device": 4240 }, { "epoch": 0.944, "loss_ce": 0.17233894765377045, "loss_lvr": 0.8390557765960693, "loss_mode_switch": 0.0, "loss_total": 0.2562445402145386, "step": 2360 }, { "batch_size": 4, "epoch": 0.944, "step": 2360, "tokens_per_device": 12108 }, { "epoch": 0.944, "loss_ce": 0.04076702520251274, "loss_lvr": 0.7780483961105347, "loss_mode_switch": 0.0, "loss_total": 0.11857186257839203, "step": 2360 }, { "batch_size": 4, "epoch": 0.944, "step": 2360, "tokens_per_device": 11068 }, { "epoch": 0.944, "loss_ce": 0.06984297186136246, "loss_lvr": 0.6203197240829468, "loss_mode_switch": 0.0, "loss_total": 0.1318749487400055, "step": 2360 }, { "batch_size": 4, "epoch": 0.944, "step": 2360, "tokens_per_device": 5736 }, { "epoch": 0.944, "loss_ce": 0.047953106462955475, "loss_lvr": 0.8763550519943237, "loss_mode_switch": 0.0, "loss_total": 0.1355886161327362, "step": 2360 }, { "batch_size": 1, "epoch": 0.944, "step": 2360, "tokens_per_device": 5004 }, { "epoch": 0.944, "loss_ce": 0.028989877551794052, "loss_lvr": 0.232789546251297, "loss_mode_switch": 0.0, "loss_total": 0.05226883292198181, "step": 2360 }, { "batch_size": 4, "epoch": 0.944, "step": 2360, "tokens_per_device": 2664 }, { "epoch": 0.944, "loss_ce": 0.21999484300613403, "loss_lvr": 0.9014615416526794, "loss_mode_switch": 0.0, "loss_total": 0.310140997171402, "step": 2360 }, { "batch_size": 4, "epoch": 0.944, "step": 2360, "tokens_per_device": 4728 }, { "epoch": 0.944, "loss_ce": 0.04307929426431656, "loss_lvr": 0.7742000222206116, "loss_mode_switch": 0.0, "loss_total": 0.12049929797649384, "step": 2360 }, { "epoch": 0.9444, "grad_norm": 1.1044167280197144, "learning_rate": 8.084850276350142e-08, "loss": 0.2378, "step": 2361 }, { "batch_size": 4, "epoch": 0.9444, "step": 2361, "tokens_per_device": 4240 }, { "epoch": 0.9444, "loss_ce": 0.08278507739305496, "loss_lvr": 0.7792834639549255, "loss_mode_switch": 0.0, "loss_total": 0.16071343421936035, "step": 2361 }, { "batch_size": 1, "epoch": 0.9444, "step": 2361, "tokens_per_device": 5212 }, { "epoch": 0.9444, "loss_ce": 0.015382486395537853, "loss_lvr": 0.39669692516326904, "loss_mode_switch": 0.0, "loss_total": 0.05505217984318733, "step": 2361 }, { "batch_size": 4, "epoch": 0.9444, "step": 2361, "tokens_per_device": 1348 }, { "epoch": 0.9444, "loss_ce": 0.36315080523490906, "loss_lvr": 0.9111091494560242, "loss_mode_switch": 0.0, "loss_total": 0.4542617201805115, "step": 2361 }, { "batch_size": 4, "epoch": 0.9444, "step": 2361, "tokens_per_device": 1516 }, { "epoch": 0.9444, "loss_ce": 0.3748012185096741, "loss_lvr": 0.9304097294807434, "loss_mode_switch": 0.0, "loss_total": 0.4678421914577484, "step": 2361 }, { "batch_size": 1, "epoch": 0.9444, "step": 2361, "tokens_per_device": 4886 }, { "epoch": 0.9444, "loss_ce": 0.01988183706998825, "loss_lvr": 0.20765185356140137, "loss_mode_switch": 0.0, "loss_total": 0.04064702242612839, "step": 2361 }, { "batch_size": 4, "epoch": 0.9444, "step": 2361, "tokens_per_device": 5384 }, { "epoch": 0.9444, "loss_ce": 0.012344581075012684, "loss_lvr": 0.5934709906578064, "loss_mode_switch": 0.0, "loss_total": 0.07169168442487717, "step": 2361 }, { "batch_size": 4, "epoch": 0.9444, "step": 2361, "tokens_per_device": 4836 }, { "epoch": 0.9444, "loss_ce": 0.14296360313892365, "loss_lvr": 0.776994526386261, "loss_mode_switch": 0.0, "loss_total": 0.22066305577754974, "step": 2361 }, { "batch_size": 4, "epoch": 0.9444, "step": 2361, "tokens_per_device": 3012 }, { "epoch": 0.9444, "loss_ce": 0.18713591992855072, "loss_lvr": 0.474797785282135, "loss_mode_switch": 0.0, "loss_total": 0.23461569845676422, "step": 2361 }, { "epoch": 0.9448, "grad_norm": 1.2097439765930176, "learning_rate": 7.969248843531452e-08, "loss": 0.2698, "step": 2362 }, { "batch_size": 1, "epoch": 0.9448, "step": 2362, "tokens_per_device": 4779 }, { "epoch": 0.9448, "loss_ce": 0.012836652807891369, "loss_lvr": 0.295549213886261, "loss_mode_switch": 0.0, "loss_total": 0.0423915758728981, "step": 2362 }, { "batch_size": 1, "epoch": 0.9448, "step": 2362, "tokens_per_device": 4382 }, { "epoch": 0.9448, "loss_ce": 0.03580226004123688, "loss_lvr": 0.5086142420768738, "loss_mode_switch": 0.0, "loss_total": 0.08666368573904037, "step": 2362 }, { "batch_size": 1, "epoch": 0.9448, "step": 2362, "tokens_per_device": 5095 }, { "epoch": 0.9448, "loss_ce": 0.013492432422935963, "loss_lvr": 0.4970414340496063, "loss_mode_switch": 0.0, "loss_total": 0.0631965771317482, "step": 2362 }, { "batch_size": 1, "epoch": 0.9448, "step": 2362, "tokens_per_device": 5330 }, { "epoch": 0.9448, "loss_ce": 0.05405407398939133, "loss_lvr": 0.49214115738868713, "loss_mode_switch": 0.0, "loss_total": 0.10326819121837616, "step": 2362 }, { "batch_size": 4, "epoch": 0.9448, "step": 2362, "tokens_per_device": 7088 }, { "epoch": 0.9448, "loss_ce": 0.023928457871079445, "loss_lvr": 0.8595607280731201, "loss_mode_switch": 0.0, "loss_total": 0.10988453030586243, "step": 2362 }, { "batch_size": 4, "epoch": 0.9448, "step": 2362, "tokens_per_device": 4712 }, { "epoch": 0.9448, "loss_ce": 0.1867399364709854, "loss_lvr": 0.9951869249343872, "loss_mode_switch": 0.0, "loss_total": 0.28625863790512085, "step": 2362 }, { "batch_size": 4, "epoch": 0.9448, "step": 2362, "tokens_per_device": 4372 }, { "epoch": 0.9448, "loss_ce": 0.011480134911835194, "loss_lvr": 0.8315654993057251, "loss_mode_switch": 0.0, "loss_total": 0.09463668614625931, "step": 2362 }, { "batch_size": 1, "epoch": 0.9448, "step": 2362, "tokens_per_device": 5855 }, { "epoch": 0.9448, "loss_ce": 0.280056893825531, "loss_lvr": 0.48693570494651794, "loss_mode_switch": 0.0, "loss_total": 0.32875046133995056, "step": 2362 }, { "epoch": 0.9452, "grad_norm": 1.177229642868042, "learning_rate": 7.854473198479928e-08, "loss": 0.2178, "step": 2363 }, { "batch_size": 4, "epoch": 0.9452, "step": 2363, "tokens_per_device": 4224 }, { "epoch": 0.9452, "loss_ce": 0.07179170846939087, "loss_lvr": 0.8285222053527832, "loss_mode_switch": 0.0, "loss_total": 0.1546439230442047, "step": 2363 }, { "batch_size": 4, "epoch": 0.9452, "step": 2363, "tokens_per_device": 1260 }, { "epoch": 0.9452, "loss_ce": 0.07796365767717361, "loss_lvr": 1.0690072774887085, "loss_mode_switch": 0.0, "loss_total": 0.18486438691616058, "step": 2363 }, { "batch_size": 4, "epoch": 0.9452, "step": 2363, "tokens_per_device": 5292 }, { "epoch": 0.9452, "loss_ce": 0.3671572506427765, "loss_lvr": 0.600509524345398, "loss_mode_switch": 0.0, "loss_total": 0.42720821499824524, "step": 2363 }, { "batch_size": 1, "epoch": 0.9452, "step": 2363, "tokens_per_device": 5101 }, { "epoch": 0.9452, "loss_ce": 0.12564924359321594, "loss_lvr": 0.2884407937526703, "loss_mode_switch": 0.0, "loss_total": 0.1544933170080185, "step": 2363 }, { "batch_size": 4, "epoch": 0.9452, "step": 2363, "tokens_per_device": 4252 }, { "epoch": 0.9452, "loss_ce": 0.33265817165374756, "loss_lvr": 0.9377397298812866, "loss_mode_switch": 0.0, "loss_total": 0.42643213272094727, "step": 2363 }, { "batch_size": 4, "epoch": 0.9452, "step": 2363, "tokens_per_device": 2560 }, { "epoch": 0.9452, "loss_ce": 0.04608311504125595, "loss_lvr": 1.0081801414489746, "loss_mode_switch": 0.0, "loss_total": 0.14690113067626953, "step": 2363 }, { "batch_size": 1, "epoch": 0.9452, "step": 2363, "tokens_per_device": 4906 }, { "epoch": 0.9452, "loss_ce": 0.024230798706412315, "loss_lvr": 0.4554864764213562, "loss_mode_switch": 0.0, "loss_total": 0.06977944821119308, "step": 2363 }, { "batch_size": 4, "epoch": 0.9452, "step": 2363, "tokens_per_device": 6356 }, { "epoch": 0.9452, "loss_ce": 0.38860583305358887, "loss_lvr": 0.4563857614994049, "loss_mode_switch": 0.0, "loss_total": 0.43424439430236816, "step": 2363 }, { "epoch": 0.9456, "grad_norm": 1.251408338546753, "learning_rate": 7.740523533826372e-08, "loss": 0.2616, "step": 2364 }, { "batch_size": 1, "epoch": 0.9456, "step": 2364, "tokens_per_device": 4875 }, { "epoch": 0.9456, "loss_ce": 0.2028035670518875, "loss_lvr": 0.17432543635368347, "loss_mode_switch": 0.0, "loss_total": 0.22023610770702362, "step": 2364 }, { "batch_size": 4, "epoch": 0.9456, "step": 2364, "tokens_per_device": 4416 }, { "epoch": 0.9456, "loss_ce": 0.5186986923217773, "loss_lvr": 0.9788853526115417, "loss_mode_switch": 0.0, "loss_total": 0.616587221622467, "step": 2364 }, { "batch_size": 4, "epoch": 0.9456, "step": 2364, "tokens_per_device": 5068 }, { "epoch": 0.9456, "loss_ce": 0.24998025596141815, "loss_lvr": 0.8202565312385559, "loss_mode_switch": 0.0, "loss_total": 0.33200591802597046, "step": 2364 }, { "batch_size": 4, "epoch": 0.9456, "step": 2364, "tokens_per_device": 8340 }, { "epoch": 0.9456, "loss_ce": 0.22626540064811707, "loss_lvr": 0.9491422176361084, "loss_mode_switch": 0.0, "loss_total": 0.3211796283721924, "step": 2364 }, { "batch_size": 4, "epoch": 0.9456, "step": 2364, "tokens_per_device": 3732 }, { "epoch": 0.9456, "loss_ce": 0.489926278591156, "loss_lvr": 1.3252586126327515, "loss_mode_switch": 0.0, "loss_total": 0.6224521398544312, "step": 2364 }, { "batch_size": 4, "epoch": 0.9456, "step": 2364, "tokens_per_device": 4200 }, { "epoch": 0.9456, "loss_ce": 0.10203561931848526, "loss_lvr": 0.995642900466919, "loss_mode_switch": 0.0, "loss_total": 0.20159991085529327, "step": 2364 }, { "batch_size": 4, "epoch": 0.9456, "step": 2364, "tokens_per_device": 5360 }, { "epoch": 0.9456, "loss_ce": 0.08576078712940216, "loss_lvr": 0.46802061796188354, "loss_mode_switch": 0.0, "loss_total": 0.13256284594535828, "step": 2364 }, { "batch_size": 4, "epoch": 0.9456, "step": 2364, "tokens_per_device": 4248 }, { "epoch": 0.9456, "loss_ce": 0.18206119537353516, "loss_lvr": 1.0042575597763062, "loss_mode_switch": 0.0, "loss_total": 0.2824869453907013, "step": 2364 }, { "epoch": 0.946, "grad_norm": 1.1690099239349365, "learning_rate": 7.627400040815414e-08, "loss": 0.2435, "step": 2365 }, { "batch_size": 1, "epoch": 0.946, "step": 2365, "tokens_per_device": 4888 }, { "epoch": 0.946, "loss_ce": 0.11139310151338577, "loss_lvr": 0.42195233702659607, "loss_mode_switch": 0.0, "loss_total": 0.15358833968639374, "step": 2365 }, { "batch_size": 4, "epoch": 0.946, "step": 2365, "tokens_per_device": 4664 }, { "epoch": 0.946, "loss_ce": 0.37729611992836, "loss_lvr": 0.798613965511322, "loss_mode_switch": 0.0, "loss_total": 0.45715752243995667, "step": 2365 }, { "batch_size": 4, "epoch": 0.946, "step": 2365, "tokens_per_device": 4508 }, { "epoch": 0.946, "loss_ce": 0.28808146715164185, "loss_lvr": 0.7531960606575012, "loss_mode_switch": 0.0, "loss_total": 0.3634010851383209, "step": 2365 }, { "batch_size": 1, "epoch": 0.946, "step": 2365, "tokens_per_device": 4900 }, { "epoch": 0.946, "loss_ce": 0.016248198226094246, "loss_lvr": 0.8928453326225281, "loss_mode_switch": 0.0, "loss_total": 0.10553272813558578, "step": 2365 }, { "batch_size": 4, "epoch": 0.946, "step": 2365, "tokens_per_device": 1360 }, { "epoch": 0.946, "loss_ce": 0.18738673627376556, "loss_lvr": 0.8261783123016357, "loss_mode_switch": 0.0, "loss_total": 0.2700045704841614, "step": 2365 }, { "batch_size": 4, "epoch": 0.946, "step": 2365, "tokens_per_device": 5012 }, { "epoch": 0.946, "loss_ce": 0.10688909143209457, "loss_lvr": 0.6520606875419617, "loss_mode_switch": 0.0, "loss_total": 0.1720951646566391, "step": 2365 }, { "batch_size": 1, "epoch": 0.946, "step": 2365, "tokens_per_device": 4878 }, { "epoch": 0.946, "loss_ce": 0.0010071444557979703, "loss_lvr": 0.2656826078891754, "loss_mode_switch": 0.0, "loss_total": 0.027575407177209854, "step": 2365 }, { "batch_size": 4, "epoch": 0.946, "step": 2365, "tokens_per_device": 4208 }, { "epoch": 0.946, "loss_ce": 0.1859825998544693, "loss_lvr": 0.8422626852989197, "loss_mode_switch": 0.0, "loss_total": 0.27020886540412903, "step": 2365 }, { "epoch": 0.9464, "grad_norm": 1.1988165378570557, "learning_rate": 7.515102909305128e-08, "loss": 0.2312, "step": 2366 }, { "batch_size": 4, "epoch": 0.9464, "step": 2366, "tokens_per_device": 4252 }, { "epoch": 0.9464, "loss_ce": 0.026845304295420647, "loss_lvr": 0.5722190141677856, "loss_mode_switch": 0.0, "loss_total": 0.084067203104496, "step": 2366 }, { "batch_size": 4, "epoch": 0.9464, "step": 2366, "tokens_per_device": 4240 }, { "epoch": 0.9464, "loss_ce": 0.11680494993925095, "loss_lvr": 1.1216636896133423, "loss_mode_switch": 0.0, "loss_total": 0.22897131741046906, "step": 2366 }, { "batch_size": 4, "epoch": 0.9464, "step": 2366, "tokens_per_device": 10608 }, { "epoch": 0.9464, "loss_ce": 0.017340857535600662, "loss_lvr": 0.6247887015342712, "loss_mode_switch": 0.0, "loss_total": 0.07981972396373749, "step": 2366 }, { "batch_size": 4, "epoch": 0.9464, "step": 2366, "tokens_per_device": 11828 }, { "epoch": 0.9464, "loss_ce": 0.08335113525390625, "loss_lvr": 0.8752008080482483, "loss_mode_switch": 0.0, "loss_total": 0.17087122797966003, "step": 2366 }, { "batch_size": 4, "epoch": 0.9464, "step": 2366, "tokens_per_device": 4248 }, { "epoch": 0.9464, "loss_ce": 0.018018368631601334, "loss_lvr": 0.9399083852767944, "loss_mode_switch": 0.0, "loss_total": 0.1120092123746872, "step": 2366 }, { "batch_size": 1, "epoch": 0.9464, "step": 2366, "tokens_per_device": 4877 }, { "epoch": 0.9464, "loss_ce": 0.0020914289634674788, "loss_lvr": 0.1564760059118271, "loss_mode_switch": 0.0, "loss_total": 0.017739029601216316, "step": 2366 }, { "batch_size": 4, "epoch": 0.9464, "step": 2366, "tokens_per_device": 5808 }, { "epoch": 0.9464, "loss_ce": 0.07384870201349258, "loss_lvr": 0.7901617884635925, "loss_mode_switch": 0.0, "loss_total": 0.15286487340927124, "step": 2366 }, { "batch_size": 4, "epoch": 0.9464, "step": 2366, "tokens_per_device": 4884 }, { "epoch": 0.9464, "loss_ce": 0.6316719651222229, "loss_lvr": 0.8348826169967651, "loss_mode_switch": 0.0, "loss_total": 0.7151602506637573, "step": 2366 }, { "epoch": 0.9468, "grad_norm": 1.2496103048324585, "learning_rate": 7.403632327766641e-08, "loss": 0.2468, "step": 2367 }, { "batch_size": 4, "epoch": 0.9468, "step": 2367, "tokens_per_device": 3820 }, { "epoch": 0.9468, "loss_ce": 0.002406855346634984, "loss_lvr": 1.0562752485275269, "loss_mode_switch": 0.0, "loss_total": 0.10803438723087311, "step": 2367 }, { "batch_size": 4, "epoch": 0.9468, "step": 2367, "tokens_per_device": 5148 }, { "epoch": 0.9468, "loss_ce": 0.05680515989661217, "loss_lvr": 0.8826724290847778, "loss_mode_switch": 0.0, "loss_total": 0.14507240056991577, "step": 2367 }, { "batch_size": 4, "epoch": 0.9468, "step": 2367, "tokens_per_device": 15468 }, { "epoch": 0.9468, "loss_ce": 0.03811299800872803, "loss_lvr": 0.30779018998146057, "loss_mode_switch": 0.0, "loss_total": 0.06889201700687408, "step": 2367 }, { "batch_size": 1, "epoch": 0.9468, "step": 2367, "tokens_per_device": 5543 }, { "epoch": 0.9468, "loss_ce": 0.012750626541674137, "loss_lvr": 0.2865000367164612, "loss_mode_switch": 0.0, "loss_total": 0.04140063002705574, "step": 2367 }, { "batch_size": 4, "epoch": 0.9468, "step": 2367, "tokens_per_device": 1904 }, { "epoch": 0.9468, "loss_ce": 0.08569112420082092, "loss_lvr": 0.8742490410804749, "loss_mode_switch": 0.0, "loss_total": 0.1731160283088684, "step": 2367 }, { "batch_size": 4, "epoch": 0.9468, "step": 2367, "tokens_per_device": 2628 }, { "epoch": 0.9468, "loss_ce": 0.13632099330425262, "loss_lvr": 1.4081807136535645, "loss_mode_switch": 0.0, "loss_total": 0.2771390676498413, "step": 2367 }, { "batch_size": 1, "epoch": 0.9468, "step": 2367, "tokens_per_device": 4935 }, { "epoch": 0.9468, "loss_ce": 0.012020006775856018, "loss_lvr": 0.7408120632171631, "loss_mode_switch": 0.0, "loss_total": 0.08610121160745621, "step": 2367 }, { "batch_size": 4, "epoch": 0.9468, "step": 2367, "tokens_per_device": 2560 }, { "epoch": 0.9468, "loss_ce": 0.29123637080192566, "loss_lvr": 0.9023200869560242, "loss_mode_switch": 0.0, "loss_total": 0.38146838545799255, "step": 2367 }, { "epoch": 0.9472, "grad_norm": 1.2217786312103271, "learning_rate": 7.292988483283913e-08, "loss": 0.2367, "step": 2368 }, { "batch_size": 1, "epoch": 0.9472, "step": 2368, "tokens_per_device": 5307 }, { "epoch": 0.9472, "loss_ce": 0.01064109057188034, "loss_lvr": 0.5198746919631958, "loss_mode_switch": 0.0, "loss_total": 0.06262855976819992, "step": 2368 }, { "batch_size": 1, "epoch": 0.9472, "step": 2368, "tokens_per_device": 4971 }, { "epoch": 0.9472, "loss_ce": 0.029833922162652016, "loss_lvr": 0.3681594133377075, "loss_mode_switch": 0.0, "loss_total": 0.06664986163377762, "step": 2368 }, { "batch_size": 4, "epoch": 0.9472, "step": 2368, "tokens_per_device": 3780 }, { "epoch": 0.9472, "loss_ce": 0.3337373733520508, "loss_lvr": 0.9979286789894104, "loss_mode_switch": 0.0, "loss_total": 0.4335302412509918, "step": 2368 }, { "batch_size": 4, "epoch": 0.9472, "step": 2368, "tokens_per_device": 5696 }, { "epoch": 0.9472, "loss_ce": 0.22527727484703064, "loss_lvr": 1.1052180528640747, "loss_mode_switch": 0.0, "loss_total": 0.33579909801483154, "step": 2368 }, { "batch_size": 4, "epoch": 0.9472, "step": 2368, "tokens_per_device": 2548 }, { "epoch": 0.9472, "loss_ce": 0.2612968683242798, "loss_lvr": 0.8821234703063965, "loss_mode_switch": 0.0, "loss_total": 0.34950920939445496, "step": 2368 }, { "batch_size": 4, "epoch": 0.9472, "step": 2368, "tokens_per_device": 3660 }, { "epoch": 0.9472, "loss_ce": 0.03683945909142494, "loss_lvr": 0.6968629956245422, "loss_mode_switch": 0.0, "loss_total": 0.10652576386928558, "step": 2368 }, { "batch_size": 4, "epoch": 0.9472, "step": 2368, "tokens_per_device": 11156 }, { "epoch": 0.9472, "loss_ce": 0.20386601984500885, "loss_lvr": 0.7418977618217468, "loss_mode_switch": 0.0, "loss_total": 0.2780557870864868, "step": 2368 }, { "batch_size": 1, "epoch": 0.9472, "step": 2368, "tokens_per_device": 7937 }, { "epoch": 0.9472, "loss_ce": 0.005049784202128649, "loss_lvr": 0.360583633184433, "loss_mode_switch": 0.0, "loss_total": 0.0411081463098526, "step": 2368 }, { "epoch": 0.9476, "grad_norm": 1.291398525238037, "learning_rate": 7.183171561553349e-08, "loss": 0.2698, "step": 2369 }, { "batch_size": 4, "epoch": 0.9476, "step": 2369, "tokens_per_device": 2584 }, { "epoch": 0.9476, "loss_ce": 0.3276773691177368, "loss_lvr": 0.947472333908081, "loss_mode_switch": 0.0, "loss_total": 0.4224246144294739, "step": 2369 }, { "batch_size": 4, "epoch": 0.9476, "step": 2369, "tokens_per_device": 12284 }, { "epoch": 0.9476, "loss_ce": 0.04344569146633148, "loss_lvr": 0.4801628589630127, "loss_mode_switch": 0.0, "loss_total": 0.09146197885274887, "step": 2369 }, { "batch_size": 1, "epoch": 0.9476, "step": 2369, "tokens_per_device": 6378 }, { "epoch": 0.9476, "loss_ce": 0.12142646312713623, "loss_lvr": 0.34813013672828674, "loss_mode_switch": 0.0, "loss_total": 0.15623947978019714, "step": 2369 }, { "batch_size": 4, "epoch": 0.9476, "step": 2369, "tokens_per_device": 4336 }, { "epoch": 0.9476, "loss_ce": 0.08924444019794464, "loss_lvr": 1.0476016998291016, "loss_mode_switch": 0.0, "loss_total": 0.1940046101808548, "step": 2369 }, { "batch_size": 4, "epoch": 0.9476, "step": 2369, "tokens_per_device": 5588 }, { "epoch": 0.9476, "loss_ce": 0.17495650053024292, "loss_lvr": 0.8086522221565247, "loss_mode_switch": 0.0, "loss_total": 0.25582173466682434, "step": 2369 }, { "batch_size": 1, "epoch": 0.9476, "step": 2369, "tokens_per_device": 4935 }, { "epoch": 0.9476, "loss_ce": 2.5174825191497803, "loss_lvr": 0.6954322457313538, "loss_mode_switch": 0.0, "loss_total": 2.5870256423950195, "step": 2369 }, { "batch_size": 1, "epoch": 0.9476, "step": 2369, "tokens_per_device": 4903 }, { "epoch": 0.9476, "loss_ce": 0.06982956826686859, "loss_lvr": 0.7442790865898132, "loss_mode_switch": 0.0, "loss_total": 0.14425748586654663, "step": 2369 }, { "batch_size": 4, "epoch": 0.9476, "step": 2369, "tokens_per_device": 4548 }, { "epoch": 0.9476, "loss_ce": 0.14564630389213562, "loss_lvr": 0.6443350911140442, "loss_mode_switch": 0.0, "loss_total": 0.21007981896400452, "step": 2369 }, { "epoch": 0.948, "grad_norm": 1.5457581281661987, "learning_rate": 7.074181746883402e-08, "loss": 0.2776, "step": 2370 }, { "batch_size": 4, "epoch": 0.948, "step": 2370, "tokens_per_device": 2596 }, { "epoch": 0.948, "loss_ce": 0.15655438601970673, "loss_lvr": 0.9651381969451904, "loss_mode_switch": 0.0, "loss_total": 0.253068208694458, "step": 2370 }, { "batch_size": 4, "epoch": 0.948, "step": 2370, "tokens_per_device": 9568 }, { "epoch": 0.948, "loss_ce": 0.07141433656215668, "loss_lvr": 0.4162292778491974, "loss_mode_switch": 0.0, "loss_total": 0.11303726583719254, "step": 2370 }, { "batch_size": 1, "epoch": 0.948, "step": 2370, "tokens_per_device": 5027 }, { "epoch": 0.948, "loss_ce": 0.0008676228462718427, "loss_lvr": 0.6200433373451233, "loss_mode_switch": 0.0, "loss_total": 0.06287195533514023, "step": 2370 }, { "batch_size": 1, "epoch": 0.948, "step": 2370, "tokens_per_device": 5392 }, { "epoch": 0.948, "loss_ce": 0.0935078114271164, "loss_lvr": 0.3493313789367676, "loss_mode_switch": 0.0, "loss_total": 0.1284409463405609, "step": 2370 }, { "batch_size": 4, "epoch": 0.948, "step": 2370, "tokens_per_device": 3392 }, { "epoch": 0.948, "loss_ce": 0.36007416248321533, "loss_lvr": 0.8869795799255371, "loss_mode_switch": 0.0, "loss_total": 0.448772132396698, "step": 2370 }, { "batch_size": 1, "epoch": 0.948, "step": 2370, "tokens_per_device": 4865 }, { "epoch": 0.948, "loss_ce": 0.0030958426650613546, "loss_lvr": 0.2881341576576233, "loss_mode_switch": 0.0, "loss_total": 0.03190925717353821, "step": 2370 }, { "batch_size": 1, "epoch": 0.948, "step": 2370, "tokens_per_device": 4931 }, { "epoch": 0.948, "loss_ce": 0.4431340992450714, "loss_lvr": 0.506181001663208, "loss_mode_switch": 0.0, "loss_total": 0.49375221133232117, "step": 2370 }, { "batch_size": 4, "epoch": 0.948, "step": 2370, "tokens_per_device": 4184 }, { "epoch": 0.948, "loss_ce": 0.2233305126428604, "loss_lvr": 0.7015873789787292, "loss_mode_switch": 0.0, "loss_total": 0.2934892475605011, "step": 2370 }, { "epoch": 0.9484, "grad_norm": 1.1439810991287231, "learning_rate": 6.966019222194531e-08, "loss": 0.236, "step": 2371 }, { "batch_size": 4, "epoch": 0.9484, "step": 2371, "tokens_per_device": 2660 }, { "epoch": 0.9484, "loss_ce": 0.283687561750412, "loss_lvr": 1.002036690711975, "loss_mode_switch": 0.0, "loss_total": 0.383891224861145, "step": 2371 }, { "batch_size": 4, "epoch": 0.9484, "step": 2371, "tokens_per_device": 5892 }, { "epoch": 0.9484, "loss_ce": 0.035008542239665985, "loss_lvr": 0.7905389070510864, "loss_mode_switch": 0.0, "loss_total": 0.11406243592500687, "step": 2371 }, { "batch_size": 4, "epoch": 0.9484, "step": 2371, "tokens_per_device": 10096 }, { "epoch": 0.9484, "loss_ce": 0.22067105770111084, "loss_lvr": 0.4022960066795349, "loss_mode_switch": 0.0, "loss_total": 0.2609006464481354, "step": 2371 }, { "batch_size": 1, "epoch": 0.9484, "step": 2371, "tokens_per_device": 5170 }, { "epoch": 0.9484, "loss_ce": 0.00344491726718843, "loss_lvr": 0.1913854330778122, "loss_mode_switch": 0.0, "loss_total": 0.022583460435271263, "step": 2371 }, { "batch_size": 4, "epoch": 0.9484, "step": 2371, "tokens_per_device": 4132 }, { "epoch": 0.9484, "loss_ce": 0.5634987354278564, "loss_lvr": 0.8574443459510803, "loss_mode_switch": 0.0, "loss_total": 0.649243175983429, "step": 2371 }, { "batch_size": 4, "epoch": 0.9484, "step": 2371, "tokens_per_device": 14540 }, { "epoch": 0.9484, "loss_ce": 0.037347905337810516, "loss_lvr": 0.5935883522033691, "loss_mode_switch": 0.0, "loss_total": 0.09670674055814743, "step": 2371 }, { "batch_size": 4, "epoch": 0.9484, "step": 2371, "tokens_per_device": 4220 }, { "epoch": 0.9484, "loss_ce": 0.06755468994379044, "loss_lvr": 1.2917799949645996, "loss_mode_switch": 0.0, "loss_total": 0.19673269987106323, "step": 2371 }, { "batch_size": 4, "epoch": 0.9484, "step": 2371, "tokens_per_device": 1396 }, { "epoch": 0.9484, "loss_ce": 0.4820597469806671, "loss_lvr": 0.8950659036636353, "loss_mode_switch": 0.0, "loss_total": 0.5715663433074951, "step": 2371 }, { "epoch": 0.9488, "grad_norm": 1.3204306364059448, "learning_rate": 6.858684169018581e-08, "loss": 0.2616, "step": 2372 }, { "batch_size": 4, "epoch": 0.9488, "step": 2372, "tokens_per_device": 4584 }, { "epoch": 0.9488, "loss_ce": 0.03786729648709297, "loss_lvr": 0.7112011909484863, "loss_mode_switch": 0.0, "loss_total": 0.10898742079734802, "step": 2372 }, { "batch_size": 4, "epoch": 0.9488, "step": 2372, "tokens_per_device": 5988 }, { "epoch": 0.9488, "loss_ce": 0.05845198035240173, "loss_lvr": 0.7298859357833862, "loss_mode_switch": 0.0, "loss_total": 0.13144057989120483, "step": 2372 }, { "batch_size": 4, "epoch": 0.9488, "step": 2372, "tokens_per_device": 5988 }, { "epoch": 0.9488, "loss_ce": 0.14048172533512115, "loss_lvr": 0.7856075763702393, "loss_mode_switch": 0.0, "loss_total": 0.21904247999191284, "step": 2372 }, { "batch_size": 4, "epoch": 0.9488, "step": 2372, "tokens_per_device": 5008 }, { "epoch": 0.9488, "loss_ce": 0.059450019150972366, "loss_lvr": 0.8322876691818237, "loss_mode_switch": 0.0, "loss_total": 0.14267878234386444, "step": 2372 }, { "batch_size": 4, "epoch": 0.9488, "step": 2372, "tokens_per_device": 4816 }, { "epoch": 0.9488, "loss_ce": 0.03725427761673927, "loss_lvr": 0.7618016600608826, "loss_mode_switch": 0.0, "loss_total": 0.11343444883823395, "step": 2372 }, { "batch_size": 4, "epoch": 0.9488, "step": 2372, "tokens_per_device": 4980 }, { "epoch": 0.9488, "loss_ce": 0.1279313713312149, "loss_lvr": 1.0417790412902832, "loss_mode_switch": 0.0, "loss_total": 0.23210927844047546, "step": 2372 }, { "batch_size": 4, "epoch": 0.9488, "step": 2372, "tokens_per_device": 4444 }, { "epoch": 0.9488, "loss_ce": 0.2845417857170105, "loss_lvr": 0.7703397274017334, "loss_mode_switch": 0.0, "loss_total": 0.36157575249671936, "step": 2372 }, { "batch_size": 4, "epoch": 0.9488, "step": 2372, "tokens_per_device": 2600 }, { "epoch": 0.9488, "loss_ce": 0.13482189178466797, "loss_lvr": 0.7705904245376587, "loss_mode_switch": 0.0, "loss_total": 0.21188093721866608, "step": 2372 }, { "epoch": 0.9492, "grad_norm": 1.2318487167358398, "learning_rate": 6.752176767498841e-08, "loss": 0.2558, "step": 2373 }, { "batch_size": 4, "epoch": 0.9492, "step": 2373, "tokens_per_device": 2564 }, { "epoch": 0.9492, "loss_ce": 0.019401784986257553, "loss_lvr": 0.9339578151702881, "loss_mode_switch": 0.0, "loss_total": 0.1127975732088089, "step": 2373 }, { "batch_size": 4, "epoch": 0.9492, "step": 2373, "tokens_per_device": 4892 }, { "epoch": 0.9492, "loss_ce": 0.05933261662721634, "loss_lvr": 0.867161214351654, "loss_mode_switch": 0.0, "loss_total": 0.14604873955249786, "step": 2373 }, { "batch_size": 4, "epoch": 0.9492, "step": 2373, "tokens_per_device": 4516 }, { "epoch": 0.9492, "loss_ce": 0.36368414759635925, "loss_lvr": 0.7174373269081116, "loss_mode_switch": 0.0, "loss_total": 0.43542787432670593, "step": 2373 }, { "batch_size": 4, "epoch": 0.9492, "step": 2373, "tokens_per_device": 4412 }, { "epoch": 0.9492, "loss_ce": 0.16923727095127106, "loss_lvr": 0.9430277347564697, "loss_mode_switch": 0.0, "loss_total": 0.26354002952575684, "step": 2373 }, { "batch_size": 4, "epoch": 0.9492, "step": 2373, "tokens_per_device": 4924 }, { "epoch": 0.9492, "loss_ce": 0.07800833880901337, "loss_lvr": 0.8438153266906738, "loss_mode_switch": 0.0, "loss_total": 0.162389874458313, "step": 2373 }, { "batch_size": 1, "epoch": 0.9492, "step": 2373, "tokens_per_device": 4758 }, { "epoch": 0.9492, "loss_ce": 0.0014112769858911633, "loss_lvr": 0.3410685658454895, "loss_mode_switch": 0.0, "loss_total": 0.035518135875463486, "step": 2373 }, { "batch_size": 4, "epoch": 0.9492, "step": 2373, "tokens_per_device": 4672 }, { "epoch": 0.9492, "loss_ce": 0.1849786788225174, "loss_lvr": 0.6030455827713013, "loss_mode_switch": 0.0, "loss_total": 0.24528324604034424, "step": 2373 }, { "batch_size": 4, "epoch": 0.9492, "step": 2373, "tokens_per_device": 3056 }, { "epoch": 0.9492, "loss_ce": 0.025968896225094795, "loss_lvr": 1.1496413946151733, "loss_mode_switch": 0.0, "loss_total": 0.14093303680419922, "step": 2373 }, { "epoch": 0.9496, "grad_norm": 1.0537261962890625, "learning_rate": 6.646497196389268e-08, "loss": 0.2286, "step": 2374 }, { "batch_size": 1, "epoch": 0.9496, "step": 2374, "tokens_per_device": 5024 }, { "epoch": 0.9496, "loss_ce": 0.15861119329929352, "loss_lvr": 0.4635174870491028, "loss_mode_switch": 0.0, "loss_total": 0.20496293902397156, "step": 2374 }, { "batch_size": 4, "epoch": 0.9496, "step": 2374, "tokens_per_device": 6228 }, { "epoch": 0.9496, "loss_ce": 0.10997645556926727, "loss_lvr": 0.7186963558197021, "loss_mode_switch": 0.0, "loss_total": 0.18184608221054077, "step": 2374 }, { "batch_size": 4, "epoch": 0.9496, "step": 2374, "tokens_per_device": 4508 }, { "epoch": 0.9496, "loss_ce": 0.3303024172782898, "loss_lvr": 0.6462827324867249, "loss_mode_switch": 0.0, "loss_total": 0.3949306905269623, "step": 2374 }, { "batch_size": 4, "epoch": 0.9496, "step": 2374, "tokens_per_device": 4776 }, { "epoch": 0.9496, "loss_ce": 0.009016132913529873, "loss_lvr": 0.8661033511161804, "loss_mode_switch": 0.0, "loss_total": 0.09562647342681885, "step": 2374 }, { "batch_size": 4, "epoch": 0.9496, "step": 2374, "tokens_per_device": 5840 }, { "epoch": 0.9496, "loss_ce": 0.29409733414649963, "loss_lvr": 1.0451918840408325, "loss_mode_switch": 0.0, "loss_total": 0.3986165225505829, "step": 2374 }, { "batch_size": 1, "epoch": 0.9496, "step": 2374, "tokens_per_device": 4852 }, { "epoch": 0.9496, "loss_ce": 0.07156723737716675, "loss_lvr": 0.39997437596321106, "loss_mode_switch": 0.0, "loss_total": 0.11156468093395233, "step": 2374 }, { "batch_size": 4, "epoch": 0.9496, "step": 2374, "tokens_per_device": 2588 }, { "epoch": 0.9496, "loss_ce": 0.3256239891052246, "loss_lvr": 0.9007114171981812, "loss_mode_switch": 0.0, "loss_total": 0.4156951308250427, "step": 2374 }, { "batch_size": 4, "epoch": 0.9496, "step": 2374, "tokens_per_device": 2552 }, { "epoch": 0.9496, "loss_ce": 0.33737167716026306, "loss_lvr": 1.0833189487457275, "loss_mode_switch": 0.0, "loss_total": 0.44570356607437134, "step": 2374 }, { "epoch": 0.95, "grad_norm": 1.23792564868927, "learning_rate": 6.54164563305465e-08, "loss": 0.233, "step": 2375 }, { "batch_size": 4, "epoch": 0.95, "step": 2375, "tokens_per_device": 4228 }, { "epoch": 0.95, "loss_ce": 0.0731164738535881, "loss_lvr": 0.6684430837631226, "loss_mode_switch": 0.0, "loss_total": 0.13996078073978424, "step": 2375 }, { "batch_size": 4, "epoch": 0.95, "step": 2375, "tokens_per_device": 2472 }, { "epoch": 0.95, "loss_ce": 0.2685963213443756, "loss_lvr": 0.8847944736480713, "loss_mode_switch": 0.0, "loss_total": 0.3570757806301117, "step": 2375 }, { "batch_size": 4, "epoch": 0.95, "step": 2375, "tokens_per_device": 4424 }, { "epoch": 0.95, "loss_ce": 0.6715201735496521, "loss_lvr": 0.938246488571167, "loss_mode_switch": 0.0, "loss_total": 0.7653447985649109, "step": 2375 }, { "batch_size": 1, "epoch": 0.95, "step": 2375, "tokens_per_device": 5035 }, { "epoch": 0.95, "loss_ce": 0.031163999810814857, "loss_lvr": 0.4122418165206909, "loss_mode_switch": 0.0, "loss_total": 0.0723881796002388, "step": 2375 }, { "batch_size": 4, "epoch": 0.95, "step": 2375, "tokens_per_device": 1328 }, { "epoch": 0.95, "loss_ce": 0.25013813376426697, "loss_lvr": 1.0456682443618774, "loss_mode_switch": 0.0, "loss_total": 0.35470497608184814, "step": 2375 }, { "batch_size": 4, "epoch": 0.95, "step": 2375, "tokens_per_device": 4196 }, { "epoch": 0.95, "loss_ce": 0.38441145420074463, "loss_lvr": 0.8470004200935364, "loss_mode_switch": 0.0, "loss_total": 0.46911150217056274, "step": 2375 }, { "batch_size": 4, "epoch": 0.95, "step": 2375, "tokens_per_device": 1680 }, { "epoch": 0.95, "loss_ce": 0.13666778802871704, "loss_lvr": 1.6627079248428345, "loss_mode_switch": 0.0, "loss_total": 0.3029385805130005, "step": 2375 }, { "batch_size": 4, "epoch": 0.95, "step": 2375, "tokens_per_device": 4992 }, { "epoch": 0.95, "loss_ce": 0.3370285630226135, "loss_lvr": 0.6073160171508789, "loss_mode_switch": 0.0, "loss_total": 0.39776015281677246, "step": 2375 }, { "epoch": 0.9504, "grad_norm": 1.2072381973266602, "learning_rate": 6.437622253470055e-08, "loss": 0.26, "step": 2376 }, { "batch_size": 4, "epoch": 0.9504, "step": 2376, "tokens_per_device": 3580 }, { "epoch": 0.9504, "loss_ce": 0.1975739300251007, "loss_lvr": 1.1304134130477905, "loss_mode_switch": 0.0, "loss_total": 0.31061527132987976, "step": 2376 }, { "batch_size": 4, "epoch": 0.9504, "step": 2376, "tokens_per_device": 4012 }, { "epoch": 0.9504, "loss_ce": 0.0604599229991436, "loss_lvr": 0.9118109345436096, "loss_mode_switch": 0.0, "loss_total": 0.15164101123809814, "step": 2376 }, { "batch_size": 4, "epoch": 0.9504, "step": 2376, "tokens_per_device": 4284 }, { "epoch": 0.9504, "loss_ce": 0.05263224616646767, "loss_lvr": 0.7251499891281128, "loss_mode_switch": 0.0, "loss_total": 0.1251472532749176, "step": 2376 }, { "batch_size": 1, "epoch": 0.9504, "step": 2376, "tokens_per_device": 5123 }, { "epoch": 0.9504, "loss_ce": 0.04744994640350342, "loss_lvr": 0.41888105869293213, "loss_mode_switch": 0.0, "loss_total": 0.08933804929256439, "step": 2376 }, { "batch_size": 4, "epoch": 0.9504, "step": 2376, "tokens_per_device": 5844 }, { "epoch": 0.9504, "loss_ce": 0.10442261397838593, "loss_lvr": 1.332639217376709, "loss_mode_switch": 0.0, "loss_total": 0.23768654465675354, "step": 2376 }, { "batch_size": 4, "epoch": 0.9504, "step": 2376, "tokens_per_device": 5312 }, { "epoch": 0.9504, "loss_ce": 0.0014666825300082564, "loss_lvr": 0.4719405472278595, "loss_mode_switch": 0.0, "loss_total": 0.04866074025630951, "step": 2376 }, { "batch_size": 1, "epoch": 0.9504, "step": 2376, "tokens_per_device": 4877 }, { "epoch": 0.9504, "loss_ce": 0.02738451026380062, "loss_lvr": 0.3789975345134735, "loss_mode_switch": 0.0, "loss_total": 0.06528426706790924, "step": 2376 }, { "batch_size": 1, "epoch": 0.9504, "step": 2376, "tokens_per_device": 5110 }, { "epoch": 0.9504, "loss_ce": 0.08738014101982117, "loss_lvr": 0.32618701457977295, "loss_mode_switch": 0.0, "loss_total": 0.11999884247779846, "step": 2376 }, { "epoch": 0.9508, "grad_norm": 1.275275707244873, "learning_rate": 6.334427232220552e-08, "loss": 0.2822, "step": 2377 }, { "batch_size": 1, "epoch": 0.9508, "step": 2377, "tokens_per_device": 5127 }, { "epoch": 0.9508, "loss_ce": 0.0025633859913796186, "loss_lvr": 0.4501180052757263, "loss_mode_switch": 0.0, "loss_total": 0.04757518693804741, "step": 2377 }, { "batch_size": 4, "epoch": 0.9508, "step": 2377, "tokens_per_device": 3864 }, { "epoch": 0.9508, "loss_ce": 0.1025858223438263, "loss_lvr": 0.5511505007743835, "loss_mode_switch": 0.0, "loss_total": 0.15770086646080017, "step": 2377 }, { "batch_size": 4, "epoch": 0.9508, "step": 2377, "tokens_per_device": 4664 }, { "epoch": 0.9508, "loss_ce": 0.05278615280985832, "loss_lvr": 0.7490670680999756, "loss_mode_switch": 0.0, "loss_total": 0.12769286334514618, "step": 2377 }, { "batch_size": 4, "epoch": 0.9508, "step": 2377, "tokens_per_device": 4252 }, { "epoch": 0.9508, "loss_ce": 0.042217403650283813, "loss_lvr": 1.6124814748764038, "loss_mode_switch": 0.0, "loss_total": 0.2034655511379242, "step": 2377 }, { "batch_size": 4, "epoch": 0.9508, "step": 2377, "tokens_per_device": 2564 }, { "epoch": 0.9508, "loss_ce": 0.1990279108285904, "loss_lvr": 0.7041034698486328, "loss_mode_switch": 0.0, "loss_total": 0.2694382667541504, "step": 2377 }, { "batch_size": 1, "epoch": 0.9508, "step": 2377, "tokens_per_device": 5098 }, { "epoch": 0.9508, "loss_ce": 0.9697749614715576, "loss_lvr": 0.6650965809822083, "loss_mode_switch": 0.0, "loss_total": 1.0362845659255981, "step": 2377 }, { "batch_size": 1, "epoch": 0.9508, "step": 2377, "tokens_per_device": 4879 }, { "epoch": 0.9508, "loss_ce": 0.04697330668568611, "loss_lvr": 0.18869252502918243, "loss_mode_switch": 0.0, "loss_total": 0.06584256142377853, "step": 2377 }, { "batch_size": 4, "epoch": 0.9508, "step": 2377, "tokens_per_device": 1636 }, { "epoch": 0.9508, "loss_ce": 0.8725495934486389, "loss_lvr": 0.8628343939781189, "loss_mode_switch": 0.0, "loss_total": 0.9588330388069153, "step": 2377 }, { "epoch": 0.9512, "grad_norm": 1.1708742380142212, "learning_rate": 6.23206074250099e-08, "loss": 0.2333, "step": 2378 }, { "batch_size": 4, "epoch": 0.9512, "step": 2378, "tokens_per_device": 2560 }, { "epoch": 0.9512, "loss_ce": 0.4070569574832916, "loss_lvr": 2.106412649154663, "loss_mode_switch": 0.0, "loss_total": 0.6176981925964355, "step": 2378 }, { "batch_size": 4, "epoch": 0.9512, "step": 2378, "tokens_per_device": 4888 }, { "epoch": 0.9512, "loss_ce": 0.15835466980934143, "loss_lvr": 0.523827314376831, "loss_mode_switch": 0.0, "loss_total": 0.210737407207489, "step": 2378 }, { "batch_size": 4, "epoch": 0.9512, "step": 2378, "tokens_per_device": 2640 }, { "epoch": 0.9512, "loss_ce": 0.30536335706710815, "loss_lvr": 2.059135675430298, "loss_mode_switch": 0.0, "loss_total": 0.5112769603729248, "step": 2378 }, { "batch_size": 4, "epoch": 0.9512, "step": 2378, "tokens_per_device": 3936 }, { "epoch": 0.9512, "loss_ce": 0.09538118541240692, "loss_lvr": 1.2110532522201538, "loss_mode_switch": 0.0, "loss_total": 0.21648651361465454, "step": 2378 }, { "batch_size": 1, "epoch": 0.9512, "step": 2378, "tokens_per_device": 5156 }, { "epoch": 0.9512, "loss_ce": 0.001825634390115738, "loss_lvr": 0.7283180356025696, "loss_mode_switch": 0.0, "loss_total": 0.07465744018554688, "step": 2378 }, { "batch_size": 4, "epoch": 0.9512, "step": 2378, "tokens_per_device": 4844 }, { "epoch": 0.9512, "loss_ce": 0.5888453722000122, "loss_lvr": 0.8625549674034119, "loss_mode_switch": 0.0, "loss_total": 0.6751008629798889, "step": 2378 }, { "batch_size": 4, "epoch": 0.9512, "step": 2378, "tokens_per_device": 5020 }, { "epoch": 0.9512, "loss_ce": 0.1300712674856186, "loss_lvr": 0.6785043478012085, "loss_mode_switch": 0.0, "loss_total": 0.19792169332504272, "step": 2378 }, { "batch_size": 1, "epoch": 0.9512, "step": 2378, "tokens_per_device": 4899 }, { "epoch": 0.9512, "loss_ce": 0.07520940154790878, "loss_lvr": 0.42945218086242676, "loss_mode_switch": 0.0, "loss_total": 0.1181546151638031, "step": 2378 }, { "epoch": 0.9516, "grad_norm": 1.2373331785202026, "learning_rate": 6.130522956115659e-08, "loss": 0.266, "step": 2379 }, { "batch_size": 1, "epoch": 0.9516, "step": 2379, "tokens_per_device": 4546 }, { "epoch": 0.9516, "loss_ce": 0.0635211318731308, "loss_lvr": 0.2549484074115753, "loss_mode_switch": 0.0, "loss_total": 0.08901597559452057, "step": 2379 }, { "batch_size": 4, "epoch": 0.9516, "step": 2379, "tokens_per_device": 4968 }, { "epoch": 0.9516, "loss_ce": 0.06619857996702194, "loss_lvr": 0.5534432530403137, "loss_mode_switch": 0.0, "loss_total": 0.12154290080070496, "step": 2379 }, { "batch_size": 4, "epoch": 0.9516, "step": 2379, "tokens_per_device": 3600 }, { "epoch": 0.9516, "loss_ce": 0.3754734992980957, "loss_lvr": 0.9562857747077942, "loss_mode_switch": 0.0, "loss_total": 0.4711020886898041, "step": 2379 }, { "batch_size": 4, "epoch": 0.9516, "step": 2379, "tokens_per_device": 1564 }, { "epoch": 0.9516, "loss_ce": 0.08919128775596619, "loss_lvr": 0.867767870426178, "loss_mode_switch": 0.0, "loss_total": 0.17596808075904846, "step": 2379 }, { "batch_size": 1, "epoch": 0.9516, "step": 2379, "tokens_per_device": 4608 }, { "epoch": 0.9516, "loss_ce": 0.0038534754421561956, "loss_lvr": 0.4741082191467285, "loss_mode_switch": 0.0, "loss_total": 0.05126429721713066, "step": 2379 }, { "batch_size": 4, "epoch": 0.9516, "step": 2379, "tokens_per_device": 4220 }, { "epoch": 0.9516, "loss_ce": 0.017669500783085823, "loss_lvr": 0.37607917189598083, "loss_mode_switch": 0.0, "loss_total": 0.055277422070503235, "step": 2379 }, { "batch_size": 4, "epoch": 0.9516, "step": 2379, "tokens_per_device": 1400 }, { "epoch": 0.9516, "loss_ce": 0.25402066111564636, "loss_lvr": 0.9263375401496887, "loss_mode_switch": 0.0, "loss_total": 0.34665441513061523, "step": 2379 }, { "batch_size": 4, "epoch": 0.9516, "step": 2379, "tokens_per_device": 3828 }, { "epoch": 0.9516, "loss_ce": 0.13366055488586426, "loss_lvr": 0.6865696310997009, "loss_mode_switch": 0.0, "loss_total": 0.2023175209760666, "step": 2379 }, { "epoch": 0.952, "grad_norm": 1.063889741897583, "learning_rate": 6.029814043478022e-08, "loss": 0.2175, "step": 2380 }, { "batch_size": 4, "epoch": 0.952, "step": 2380, "tokens_per_device": 4756 }, { "epoch": 0.952, "loss_ce": 0.16714759171009064, "loss_lvr": 0.7202817797660828, "loss_mode_switch": 0.0, "loss_total": 0.23917576670646667, "step": 2380 }, { "batch_size": 4, "epoch": 0.952, "step": 2380, "tokens_per_device": 2884 }, { "epoch": 0.952, "loss_ce": 0.26561760902404785, "loss_lvr": 0.6870639324188232, "loss_mode_switch": 0.0, "loss_total": 0.3343240022659302, "step": 2380 }, { "batch_size": 4, "epoch": 0.952, "step": 2380, "tokens_per_device": 6912 }, { "epoch": 0.952, "loss_ce": 0.2686726450920105, "loss_lvr": 0.5158320665359497, "loss_mode_switch": 0.0, "loss_total": 0.320255845785141, "step": 2380 }, { "batch_size": 1, "epoch": 0.952, "step": 2380, "tokens_per_device": 5380 }, { "epoch": 0.952, "loss_ce": 0.02503182925283909, "loss_lvr": 0.2529349625110626, "loss_mode_switch": 0.0, "loss_total": 0.05032532662153244, "step": 2380 }, { "batch_size": 4, "epoch": 0.952, "step": 2380, "tokens_per_device": 4392 }, { "epoch": 0.952, "loss_ce": 0.2752283215522766, "loss_lvr": 0.8167276978492737, "loss_mode_switch": 0.0, "loss_total": 0.3569011092185974, "step": 2380 }, { "batch_size": 4, "epoch": 0.952, "step": 2380, "tokens_per_device": 2548 }, { "epoch": 0.952, "loss_ce": 0.07876155525445938, "loss_lvr": 0.9412894248962402, "loss_mode_switch": 0.0, "loss_total": 0.17289049923419952, "step": 2380 }, { "batch_size": 4, "epoch": 0.952, "step": 2380, "tokens_per_device": 2560 }, { "epoch": 0.952, "loss_ce": 0.09067089855670929, "loss_lvr": 0.8128905296325684, "loss_mode_switch": 0.0, "loss_total": 0.17195995151996613, "step": 2380 }, { "batch_size": 4, "epoch": 0.952, "step": 2380, "tokens_per_device": 5164 }, { "epoch": 0.952, "loss_ce": 0.5748561024665833, "loss_lvr": 0.8291285037994385, "loss_mode_switch": 0.0, "loss_total": 0.657768964767456, "step": 2380 }, { "epoch": 0.9524, "grad_norm": 1.1663323640823364, "learning_rate": 5.9299341736103746e-08, "loss": 0.2259, "step": 2381 }, { "batch_size": 4, "epoch": 0.9524, "step": 2381, "tokens_per_device": 2116 }, { "epoch": 0.9524, "loss_ce": 0.06545906513929367, "loss_lvr": 0.6991668343544006, "loss_mode_switch": 0.0, "loss_total": 0.1353757530450821, "step": 2381 }, { "batch_size": 1, "epoch": 0.9524, "step": 2381, "tokens_per_device": 5228 }, { "epoch": 0.9524, "loss_ce": 0.048389073461294174, "loss_lvr": 0.3029026985168457, "loss_mode_switch": 0.0, "loss_total": 0.07867934554815292, "step": 2381 }, { "batch_size": 1, "epoch": 0.9524, "step": 2381, "tokens_per_device": 4886 }, { "epoch": 0.9524, "loss_ce": 0.05000106245279312, "loss_lvr": 0.4271027743816376, "loss_mode_switch": 0.0, "loss_total": 0.09271134436130524, "step": 2381 }, { "batch_size": 1, "epoch": 0.9524, "step": 2381, "tokens_per_device": 4856 }, { "epoch": 0.9524, "loss_ce": 0.028611866757273674, "loss_lvr": 0.3690900206565857, "loss_mode_switch": 0.0, "loss_total": 0.06552086770534515, "step": 2381 }, { "batch_size": 4, "epoch": 0.9524, "step": 2381, "tokens_per_device": 3888 }, { "epoch": 0.9524, "loss_ce": 0.4385245144367218, "loss_lvr": 0.788245439529419, "loss_mode_switch": 0.0, "loss_total": 0.5173490643501282, "step": 2381 }, { "batch_size": 4, "epoch": 0.9524, "step": 2381, "tokens_per_device": 2572 }, { "epoch": 0.9524, "loss_ce": 0.3285844326019287, "loss_lvr": 0.8187505602836609, "loss_mode_switch": 0.0, "loss_total": 0.4104594886302948, "step": 2381 }, { "batch_size": 1, "epoch": 0.9524, "step": 2381, "tokens_per_device": 5106 }, { "epoch": 0.9524, "loss_ce": 0.008058284409344196, "loss_lvr": 0.25077128410339355, "loss_mode_switch": 0.0, "loss_total": 0.033135414123535156, "step": 2381 }, { "batch_size": 4, "epoch": 0.9524, "step": 2381, "tokens_per_device": 4196 }, { "epoch": 0.9524, "loss_ce": 0.0783483013510704, "loss_lvr": 0.7375085353851318, "loss_mode_switch": 0.0, "loss_total": 0.15209916234016418, "step": 2381 }, { "epoch": 0.9528, "grad_norm": 1.2087745666503906, "learning_rate": 5.8308835141436814e-08, "loss": 0.2492, "step": 2382 }, { "batch_size": 4, "epoch": 0.9528, "step": 2382, "tokens_per_device": 5568 }, { "epoch": 0.9528, "loss_ce": 0.10975603014230728, "loss_lvr": 0.6182491779327393, "loss_mode_switch": 0.0, "loss_total": 0.1715809404850006, "step": 2382 }, { "batch_size": 4, "epoch": 0.9528, "step": 2382, "tokens_per_device": 1336 }, { "epoch": 0.9528, "loss_ce": 0.4724985361099243, "loss_lvr": 0.9684600830078125, "loss_mode_switch": 0.0, "loss_total": 0.5693445205688477, "step": 2382 }, { "batch_size": 4, "epoch": 0.9528, "step": 2382, "tokens_per_device": 5304 }, { "epoch": 0.9528, "loss_ce": 0.21506789326667786, "loss_lvr": 1.0944797992706299, "loss_mode_switch": 0.0, "loss_total": 0.3245158791542053, "step": 2382 }, { "batch_size": 4, "epoch": 0.9528, "step": 2382, "tokens_per_device": 2456 }, { "epoch": 0.9528, "loss_ce": 0.1684459149837494, "loss_lvr": 0.953353226184845, "loss_mode_switch": 0.0, "loss_total": 0.26378124952316284, "step": 2382 }, { "batch_size": 1, "epoch": 0.9528, "step": 2382, "tokens_per_device": 4905 }, { "epoch": 0.9528, "loss_ce": 0.018570492044091225, "loss_lvr": 0.34155115485191345, "loss_mode_switch": 0.0, "loss_total": 0.05272560566663742, "step": 2382 }, { "batch_size": 1, "epoch": 0.9528, "step": 2382, "tokens_per_device": 5196 }, { "epoch": 0.9528, "loss_ce": 0.03200233355164528, "loss_lvr": 0.44780775904655457, "loss_mode_switch": 0.0, "loss_total": 0.07678310573101044, "step": 2382 }, { "batch_size": 4, "epoch": 0.9528, "step": 2382, "tokens_per_device": 3844 }, { "epoch": 0.9528, "loss_ce": 0.14132705330848694, "loss_lvr": 0.9722815155982971, "loss_mode_switch": 0.0, "loss_total": 0.2385552078485489, "step": 2382 }, { "batch_size": 4, "epoch": 0.9528, "step": 2382, "tokens_per_device": 3960 }, { "epoch": 0.9528, "loss_ce": 0.14847928285598755, "loss_lvr": 1.3817741870880127, "loss_mode_switch": 0.0, "loss_total": 0.2866567075252533, "step": 2382 }, { "epoch": 0.9532, "grad_norm": 1.0203875303268433, "learning_rate": 5.7326622313171877e-08, "loss": 0.2104, "step": 2383 }, { "batch_size": 1, "epoch": 0.9532, "step": 2383, "tokens_per_device": 5036 }, { "epoch": 0.9532, "loss_ce": 0.027128111571073532, "loss_lvr": 0.27961936593055725, "loss_mode_switch": 0.0, "loss_total": 0.0550900474190712, "step": 2383 }, { "batch_size": 4, "epoch": 0.9532, "step": 2383, "tokens_per_device": 2600 }, { "epoch": 0.9532, "loss_ce": 0.029580246657133102, "loss_lvr": 0.8222512006759644, "loss_mode_switch": 0.0, "loss_total": 0.11180536448955536, "step": 2383 }, { "batch_size": 4, "epoch": 0.9532, "step": 2383, "tokens_per_device": 2616 }, { "epoch": 0.9532, "loss_ce": 0.3636489510536194, "loss_lvr": 0.9354172945022583, "loss_mode_switch": 0.0, "loss_total": 0.45719069242477417, "step": 2383 }, { "batch_size": 4, "epoch": 0.9532, "step": 2383, "tokens_per_device": 4192 }, { "epoch": 0.9532, "loss_ce": 0.07609883695840836, "loss_lvr": 0.9041215777397156, "loss_mode_switch": 0.0, "loss_total": 0.16651099920272827, "step": 2383 }, { "batch_size": 1, "epoch": 0.9532, "step": 2383, "tokens_per_device": 6381 }, { "epoch": 0.9532, "loss_ce": 0.005839450750499964, "loss_lvr": 0.2962721288204193, "loss_mode_switch": 0.0, "loss_total": 0.03546666353940964, "step": 2383 }, { "batch_size": 4, "epoch": 0.9532, "step": 2383, "tokens_per_device": 4200 }, { "epoch": 0.9532, "loss_ce": 0.10055911540985107, "loss_lvr": 0.8186089992523193, "loss_mode_switch": 0.0, "loss_total": 0.182420015335083, "step": 2383 }, { "batch_size": 4, "epoch": 0.9532, "step": 2383, "tokens_per_device": 2688 }, { "epoch": 0.9532, "loss_ce": 0.36775171756744385, "loss_lvr": 0.7973492741584778, "loss_mode_switch": 0.0, "loss_total": 0.44748663902282715, "step": 2383 }, { "batch_size": 4, "epoch": 0.9532, "step": 2383, "tokens_per_device": 1352 }, { "epoch": 0.9532, "loss_ce": 0.555924654006958, "loss_lvr": 0.8568927645683289, "loss_mode_switch": 0.0, "loss_total": 0.6416139602661133, "step": 2383 }, { "epoch": 0.9536, "grad_norm": 1.0908712148666382, "learning_rate": 5.6352704899782506e-08, "loss": 0.235, "step": 2384 }, { "batch_size": 4, "epoch": 0.9536, "step": 2384, "tokens_per_device": 2680 }, { "epoch": 0.9536, "loss_ce": 0.38578951358795166, "loss_lvr": 0.700870931148529, "loss_mode_switch": 0.0, "loss_total": 0.4558766186237335, "step": 2384 }, { "batch_size": 4, "epoch": 0.9536, "step": 2384, "tokens_per_device": 3324 }, { "epoch": 0.9536, "loss_ce": 0.13115476071834564, "loss_lvr": 1.0265140533447266, "loss_mode_switch": 0.0, "loss_total": 0.23380616307258606, "step": 2384 }, { "batch_size": 1, "epoch": 0.9536, "step": 2384, "tokens_per_device": 5149 }, { "epoch": 0.9536, "loss_ce": 0.02286136895418167, "loss_lvr": 0.4184357523918152, "loss_mode_switch": 0.0, "loss_total": 0.06470493972301483, "step": 2384 }, { "batch_size": 1, "epoch": 0.9536, "step": 2384, "tokens_per_device": 4901 }, { "epoch": 0.9536, "loss_ce": 0.009781618602573872, "loss_lvr": 0.22553199529647827, "loss_mode_switch": 0.0, "loss_total": 0.0323348194360733, "step": 2384 }, { "batch_size": 1, "epoch": 0.9536, "step": 2384, "tokens_per_device": 6946 }, { "epoch": 0.9536, "loss_ce": 0.0019134493777528405, "loss_lvr": 0.3640982210636139, "loss_mode_switch": 0.0, "loss_total": 0.03832327201962471, "step": 2384 }, { "batch_size": 4, "epoch": 0.9536, "step": 2384, "tokens_per_device": 1664 }, { "epoch": 0.9536, "loss_ce": 0.36475101113319397, "loss_lvr": 0.9220005869865417, "loss_mode_switch": 0.0, "loss_total": 0.4569510817527771, "step": 2384 }, { "batch_size": 1, "epoch": 0.9536, "step": 2384, "tokens_per_device": 5200 }, { "epoch": 0.9536, "loss_ce": 0.08367694169282913, "loss_lvr": 0.26209259033203125, "loss_mode_switch": 0.0, "loss_total": 0.10988619923591614, "step": 2384 }, { "batch_size": 4, "epoch": 0.9536, "step": 2384, "tokens_per_device": 3352 }, { "epoch": 0.9536, "loss_ce": 0.19619068503379822, "loss_lvr": 1.9798957109451294, "loss_mode_switch": 0.0, "loss_total": 0.3941802382469177, "step": 2384 }, { "epoch": 0.954, "grad_norm": 1.0380154848098755, "learning_rate": 5.538708453581787e-08, "loss": 0.227, "step": 2385 }, { "batch_size": 4, "epoch": 0.954, "step": 2385, "tokens_per_device": 4560 }, { "epoch": 0.954, "loss_ce": 0.3186371326446533, "loss_lvr": 0.8651080131530762, "loss_mode_switch": 0.0, "loss_total": 0.4051479399204254, "step": 2385 }, { "batch_size": 1, "epoch": 0.954, "step": 2385, "tokens_per_device": 4894 }, { "epoch": 0.954, "loss_ce": 0.003662120085209608, "loss_lvr": 0.15833136439323425, "loss_mode_switch": 0.0, "loss_total": 0.01949525810778141, "step": 2385 }, { "batch_size": 1, "epoch": 0.954, "step": 2385, "tokens_per_device": 5157 }, { "epoch": 0.954, "loss_ce": 0.0803561583161354, "loss_lvr": 0.4750185012817383, "loss_mode_switch": 0.0, "loss_total": 0.1278580129146576, "step": 2385 }, { "batch_size": 1, "epoch": 0.954, "step": 2385, "tokens_per_device": 5117 }, { "epoch": 0.954, "loss_ce": 0.006161623168736696, "loss_lvr": 0.31888848543167114, "loss_mode_switch": 0.0, "loss_total": 0.03805047273635864, "step": 2385 }, { "batch_size": 4, "epoch": 0.954, "step": 2385, "tokens_per_device": 1612 }, { "epoch": 0.954, "loss_ce": 0.6447330713272095, "loss_lvr": 0.8006579279899597, "loss_mode_switch": 0.0, "loss_total": 0.724798858165741, "step": 2385 }, { "batch_size": 4, "epoch": 0.954, "step": 2385, "tokens_per_device": 4192 }, { "epoch": 0.954, "loss_ce": 0.3133290708065033, "loss_lvr": 0.6007956266403198, "loss_mode_switch": 0.0, "loss_total": 0.37340864539146423, "step": 2385 }, { "batch_size": 4, "epoch": 0.954, "step": 2385, "tokens_per_device": 4432 }, { "epoch": 0.954, "loss_ce": 0.1301274299621582, "loss_lvr": 0.6233615279197693, "loss_mode_switch": 0.0, "loss_total": 0.19246357679367065, "step": 2385 }, { "batch_size": 1, "epoch": 0.954, "step": 2385, "tokens_per_device": 5117 }, { "epoch": 0.954, "loss_ce": 0.11499962955713272, "loss_lvr": 0.14636749029159546, "loss_mode_switch": 0.0, "loss_total": 0.12963637709617615, "step": 2385 }, { "epoch": 0.9544, "grad_norm": 1.1199922561645508, "learning_rate": 5.442976284190382e-08, "loss": 0.2204, "step": 2386 }, { "batch_size": 4, "epoch": 0.9544, "step": 2386, "tokens_per_device": 4200 }, { "epoch": 0.9544, "loss_ce": 0.09011710435152054, "loss_lvr": 1.626203179359436, "loss_mode_switch": 0.0, "loss_total": 0.252737432718277, "step": 2386 }, { "batch_size": 1, "epoch": 0.9544, "step": 2386, "tokens_per_device": 4878 }, { "epoch": 0.9544, "loss_ce": 0.006421093363314867, "loss_lvr": 0.5056666731834412, "loss_mode_switch": 0.0, "loss_total": 0.056987762451171875, "step": 2386 }, { "batch_size": 4, "epoch": 0.9544, "step": 2386, "tokens_per_device": 4248 }, { "epoch": 0.9544, "loss_ce": 0.3033006191253662, "loss_lvr": 0.9829373955726624, "loss_mode_switch": 0.0, "loss_total": 0.4015943706035614, "step": 2386 }, { "batch_size": 4, "epoch": 0.9544, "step": 2386, "tokens_per_device": 3808 }, { "epoch": 0.9544, "loss_ce": 0.27735745906829834, "loss_lvr": 1.0715104341506958, "loss_mode_switch": 0.0, "loss_total": 0.38450849056243896, "step": 2386 }, { "batch_size": 1, "epoch": 0.9544, "step": 2386, "tokens_per_device": 4819 }, { "epoch": 0.9544, "loss_ce": 0.0177475418895483, "loss_lvr": 0.33559146523475647, "loss_mode_switch": 0.0, "loss_total": 0.05130668729543686, "step": 2386 }, { "batch_size": 4, "epoch": 0.9544, "step": 2386, "tokens_per_device": 3808 }, { "epoch": 0.9544, "loss_ce": 0.26143088936805725, "loss_lvr": 0.8066027164459229, "loss_mode_switch": 0.0, "loss_total": 0.3420911729335785, "step": 2386 }, { "batch_size": 4, "epoch": 0.9544, "step": 2386, "tokens_per_device": 1788 }, { "epoch": 0.9544, "loss_ce": 0.13571982085704803, "loss_lvr": 1.0121774673461914, "loss_mode_switch": 0.0, "loss_total": 0.23693756759166718, "step": 2386 }, { "batch_size": 1, "epoch": 0.9544, "step": 2386, "tokens_per_device": 5115 }, { "epoch": 0.9544, "loss_ce": 0.0003371914499439299, "loss_lvr": 0.358038991689682, "loss_mode_switch": 0.0, "loss_total": 0.03614109009504318, "step": 2386 }, { "epoch": 0.9548, "grad_norm": 1.0358282327651978, "learning_rate": 5.348074142473847e-08, "loss": 0.2106, "step": 2387 }, { "batch_size": 1, "epoch": 0.9548, "step": 2387, "tokens_per_device": 4881 }, { "epoch": 0.9548, "loss_ce": 0.10054273158311844, "loss_lvr": 0.4634358286857605, "loss_mode_switch": 0.0, "loss_total": 0.14688631892204285, "step": 2387 }, { "batch_size": 1, "epoch": 0.9548, "step": 2387, "tokens_per_device": 4940 }, { "epoch": 0.9548, "loss_ce": 0.009097792208194733, "loss_lvr": 0.2806430459022522, "loss_mode_switch": 0.0, "loss_total": 0.03716209530830383, "step": 2387 }, { "batch_size": 1, "epoch": 0.9548, "step": 2387, "tokens_per_device": 5617 }, { "epoch": 0.9548, "loss_ce": 0.014441154897212982, "loss_lvr": 0.36254143714904785, "loss_mode_switch": 0.0, "loss_total": 0.05069530010223389, "step": 2387 }, { "batch_size": 1, "epoch": 0.9548, "step": 2387, "tokens_per_device": 4206 }, { "epoch": 0.9548, "loss_ce": 0.00097801408264786, "loss_lvr": 0.44721734523773193, "loss_mode_switch": 0.0, "loss_total": 0.045699749141931534, "step": 2387 }, { "batch_size": 1, "epoch": 0.9548, "step": 2387, "tokens_per_device": 4890 }, { "epoch": 0.9548, "loss_ce": 0.06850855052471161, "loss_lvr": 0.3972567319869995, "loss_mode_switch": 0.0, "loss_total": 0.1082342267036438, "step": 2387 }, { "batch_size": 1, "epoch": 0.9548, "step": 2387, "tokens_per_device": 4743 }, { "epoch": 0.9548, "loss_ce": 0.004457575734704733, "loss_lvr": 0.2099943608045578, "loss_mode_switch": 0.0, "loss_total": 0.02545701339840889, "step": 2387 }, { "batch_size": 4, "epoch": 0.9548, "step": 2387, "tokens_per_device": 6020 }, { "epoch": 0.9548, "loss_ce": 0.10632350295782089, "loss_lvr": 0.6978071928024292, "loss_mode_switch": 0.0, "loss_total": 0.17610421776771545, "step": 2387 }, { "batch_size": 4, "epoch": 0.9548, "step": 2387, "tokens_per_device": 4748 }, { "epoch": 0.9548, "loss_ce": 0.10816018283367157, "loss_lvr": 0.7806961536407471, "loss_mode_switch": 0.0, "loss_total": 0.18622979521751404, "step": 2387 }, { "epoch": 0.9552, "grad_norm": 1.1699743270874023, "learning_rate": 5.254002187708773e-08, "loss": 0.227, "step": 2388 }, { "batch_size": 1, "epoch": 0.9552, "step": 2388, "tokens_per_device": 5106 }, { "epoch": 0.9552, "loss_ce": 0.08947508037090302, "loss_lvr": 0.5535382628440857, "loss_mode_switch": 0.0, "loss_total": 0.1448289155960083, "step": 2388 }, { "batch_size": 4, "epoch": 0.9552, "step": 2388, "tokens_per_device": 5240 }, { "epoch": 0.9552, "loss_ce": 0.016223685815930367, "loss_lvr": 0.6285000443458557, "loss_mode_switch": 0.0, "loss_total": 0.07907368987798691, "step": 2388 }, { "batch_size": 4, "epoch": 0.9552, "step": 2388, "tokens_per_device": 6588 }, { "epoch": 0.9552, "loss_ce": 0.16281956434249878, "loss_lvr": 0.9352465271949768, "loss_mode_switch": 0.0, "loss_total": 0.2563442289829254, "step": 2388 }, { "batch_size": 4, "epoch": 0.9552, "step": 2388, "tokens_per_device": 4316 }, { "epoch": 0.9552, "loss_ce": 0.12267524003982544, "loss_lvr": 0.927105724811554, "loss_mode_switch": 0.0, "loss_total": 0.2153858244419098, "step": 2388 }, { "batch_size": 4, "epoch": 0.9552, "step": 2388, "tokens_per_device": 4228 }, { "epoch": 0.9552, "loss_ce": 0.22805815935134888, "loss_lvr": 0.7988101840019226, "loss_mode_switch": 0.0, "loss_total": 0.30793917179107666, "step": 2388 }, { "batch_size": 4, "epoch": 0.9552, "step": 2388, "tokens_per_device": 2576 }, { "epoch": 0.9552, "loss_ce": 0.22001083195209503, "loss_lvr": 0.6003897786140442, "loss_mode_switch": 0.0, "loss_total": 0.28004980087280273, "step": 2388 }, { "batch_size": 4, "epoch": 0.9552, "step": 2388, "tokens_per_device": 1284 }, { "epoch": 0.9552, "loss_ce": 0.23474927246570587, "loss_lvr": 0.7832542657852173, "loss_mode_switch": 0.0, "loss_total": 0.3130747079849243, "step": 2388 }, { "batch_size": 4, "epoch": 0.9552, "step": 2388, "tokens_per_device": 4060 }, { "epoch": 0.9552, "loss_ce": 0.061465803533792496, "loss_lvr": 0.8402453064918518, "loss_mode_switch": 0.0, "loss_total": 0.14549033343791962, "step": 2388 }, { "epoch": 0.9556, "grad_norm": 1.1431143283843994, "learning_rate": 5.16076057777859e-08, "loss": 0.2154, "step": 2389 }, { "batch_size": 4, "epoch": 0.9556, "step": 2389, "tokens_per_device": 4348 }, { "epoch": 0.9556, "loss_ce": 0.14232800900936127, "loss_lvr": 1.0694478750228882, "loss_mode_switch": 0.0, "loss_total": 0.24927279353141785, "step": 2389 }, { "batch_size": 4, "epoch": 0.9556, "step": 2389, "tokens_per_device": 2736 }, { "epoch": 0.9556, "loss_ce": 0.09492339938879013, "loss_lvr": 0.629802405834198, "loss_mode_switch": 0.0, "loss_total": 0.15790364146232605, "step": 2389 }, { "batch_size": 4, "epoch": 0.9556, "step": 2389, "tokens_per_device": 12316 }, { "epoch": 0.9556, "loss_ce": 0.20093902945518494, "loss_lvr": 0.5448819398880005, "loss_mode_switch": 0.0, "loss_total": 0.25542721152305603, "step": 2389 }, { "batch_size": 1, "epoch": 0.9556, "step": 2389, "tokens_per_device": 5145 }, { "epoch": 0.9556, "loss_ce": 0.0012578836176544428, "loss_lvr": 0.2798818349838257, "loss_mode_switch": 0.0, "loss_total": 0.029246067628264427, "step": 2389 }, { "batch_size": 1, "epoch": 0.9556, "step": 2389, "tokens_per_device": 5921 }, { "epoch": 0.9556, "loss_ce": 0.020097937434911728, "loss_lvr": 0.2709890604019165, "loss_mode_switch": 0.0, "loss_total": 0.04719684273004532, "step": 2389 }, { "batch_size": 1, "epoch": 0.9556, "step": 2389, "tokens_per_device": 5127 }, { "epoch": 0.9556, "loss_ce": 0.014490250498056412, "loss_lvr": 0.3600604832172394, "loss_mode_switch": 0.0, "loss_total": 0.05049629881978035, "step": 2389 }, { "batch_size": 4, "epoch": 0.9556, "step": 2389, "tokens_per_device": 14224 }, { "epoch": 0.9556, "loss_ce": 0.1818753480911255, "loss_lvr": 0.8275668621063232, "loss_mode_switch": 0.0, "loss_total": 0.26463204622268677, "step": 2389 }, { "batch_size": 4, "epoch": 0.9556, "step": 2389, "tokens_per_device": 3796 }, { "epoch": 0.9556, "loss_ce": 0.6831948757171631, "loss_lvr": 0.770046055316925, "loss_mode_switch": 0.0, "loss_total": 0.7601994872093201, "step": 2389 }, { "epoch": 0.956, "grad_norm": 1.1575616598129272, "learning_rate": 5.068349469173006e-08, "loss": 0.2448, "step": 2390 }, { "batch_size": 4, "epoch": 0.956, "step": 2390, "tokens_per_device": 5740 }, { "epoch": 0.956, "loss_ce": 0.14997906982898712, "loss_lvr": 0.7829124331474304, "loss_mode_switch": 0.0, "loss_total": 0.22827032208442688, "step": 2390 }, { "batch_size": 4, "epoch": 0.956, "step": 2390, "tokens_per_device": 4228 }, { "epoch": 0.956, "loss_ce": 0.31459420919418335, "loss_lvr": 0.8560253977775574, "loss_mode_switch": 0.0, "loss_total": 0.40019676089286804, "step": 2390 }, { "batch_size": 4, "epoch": 0.956, "step": 2390, "tokens_per_device": 4200 }, { "epoch": 0.956, "loss_ce": 0.11769197881221771, "loss_lvr": 1.1917935609817505, "loss_mode_switch": 0.0, "loss_total": 0.23687133193016052, "step": 2390 }, { "batch_size": 4, "epoch": 0.956, "step": 2390, "tokens_per_device": 10008 }, { "epoch": 0.956, "loss_ce": 0.08712559938430786, "loss_lvr": 0.3530331254005432, "loss_mode_switch": 0.0, "loss_total": 0.12242890894412994, "step": 2390 }, { "batch_size": 1, "epoch": 0.956, "step": 2390, "tokens_per_device": 4891 }, { "epoch": 0.956, "loss_ce": 0.04634859412908554, "loss_lvr": 0.3624435067176819, "loss_mode_switch": 0.0, "loss_total": 0.08259294927120209, "step": 2390 }, { "batch_size": 1, "epoch": 0.956, "step": 2390, "tokens_per_device": 6284 }, { "epoch": 0.956, "loss_ce": 0.008080791682004929, "loss_lvr": 0.25931471586227417, "loss_mode_switch": 0.0, "loss_total": 0.034012265503406525, "step": 2390 }, { "batch_size": 4, "epoch": 0.956, "step": 2390, "tokens_per_device": 4248 }, { "epoch": 0.956, "loss_ce": 0.06105905771255493, "loss_lvr": 0.9633961915969849, "loss_mode_switch": 0.0, "loss_total": 0.15739867091178894, "step": 2390 }, { "batch_size": 1, "epoch": 0.956, "step": 2390, "tokens_per_device": 4891 }, { "epoch": 0.956, "loss_ce": 0.06432834267616272, "loss_lvr": 0.27601781487464905, "loss_mode_switch": 0.0, "loss_total": 0.09193012118339539, "step": 2390 }, { "epoch": 0.9564, "grad_norm": 1.0333195924758911, "learning_rate": 4.976769016987959e-08, "loss": 0.2269, "step": 2391 }, { "batch_size": 1, "epoch": 0.9564, "step": 2391, "tokens_per_device": 5098 }, { "epoch": 0.9564, "loss_ce": 0.04485108703374863, "loss_lvr": 0.2764958441257477, "loss_mode_switch": 0.0, "loss_total": 0.07250067591667175, "step": 2391 }, { "batch_size": 1, "epoch": 0.9564, "step": 2391, "tokens_per_device": 5149 }, { "epoch": 0.9564, "loss_ce": 0.0038535173516720533, "loss_lvr": 0.5565259456634521, "loss_mode_switch": 0.0, "loss_total": 0.059506114572286606, "step": 2391 }, { "batch_size": 4, "epoch": 0.9564, "step": 2391, "tokens_per_device": 4168 }, { "epoch": 0.9564, "loss_ce": 0.23125648498535156, "loss_lvr": 0.9646938443183899, "loss_mode_switch": 0.0, "loss_total": 0.327725887298584, "step": 2391 }, { "batch_size": 4, "epoch": 0.9564, "step": 2391, "tokens_per_device": 4212 }, { "epoch": 0.9564, "loss_ce": 0.12475119531154633, "loss_lvr": 0.45566004514694214, "loss_mode_switch": 0.0, "loss_total": 0.17031720280647278, "step": 2391 }, { "batch_size": 4, "epoch": 0.9564, "step": 2391, "tokens_per_device": 3676 }, { "epoch": 0.9564, "loss_ce": 0.02473863586783409, "loss_lvr": 0.5261597633361816, "loss_mode_switch": 0.0, "loss_total": 0.07735460996627808, "step": 2391 }, { "batch_size": 1, "epoch": 0.9564, "step": 2391, "tokens_per_device": 5099 }, { "epoch": 0.9564, "loss_ce": 0.006518661510199308, "loss_lvr": 0.3042682409286499, "loss_mode_switch": 0.0, "loss_total": 0.03694548457860947, "step": 2391 }, { "batch_size": 4, "epoch": 0.9564, "step": 2391, "tokens_per_device": 4060 }, { "epoch": 0.9564, "loss_ce": 0.30951952934265137, "loss_lvr": 0.7908178567886353, "loss_mode_switch": 0.0, "loss_total": 0.38860130310058594, "step": 2391 }, { "batch_size": 4, "epoch": 0.9564, "step": 2391, "tokens_per_device": 4252 }, { "epoch": 0.9564, "loss_ce": 0.006067616865038872, "loss_lvr": 0.38575124740600586, "loss_mode_switch": 0.0, "loss_total": 0.04464273899793625, "step": 2391 }, { "epoch": 0.9568, "grad_norm": 0.9838325381278992, "learning_rate": 4.8860193749253324e-08, "loss": 0.192, "step": 2392 }, { "batch_size": 4, "epoch": 0.9568, "step": 2392, "tokens_per_device": 4240 }, { "epoch": 0.9568, "loss_ce": 0.014195769093930721, "loss_lvr": 0.7289003133773804, "loss_mode_switch": 0.0, "loss_total": 0.08708580583333969, "step": 2392 }, { "batch_size": 4, "epoch": 0.9568, "step": 2392, "tokens_per_device": 3880 }, { "epoch": 0.9568, "loss_ce": 0.09340636432170868, "loss_lvr": 0.8501064777374268, "loss_mode_switch": 0.0, "loss_total": 0.17841701209545135, "step": 2392 }, { "batch_size": 4, "epoch": 0.9568, "step": 2392, "tokens_per_device": 5424 }, { "epoch": 0.9568, "loss_ce": 0.409961998462677, "loss_lvr": 0.8212122917175293, "loss_mode_switch": 0.0, "loss_total": 0.49208322167396545, "step": 2392 }, { "batch_size": 4, "epoch": 0.9568, "step": 2392, "tokens_per_device": 5920 }, { "epoch": 0.9568, "loss_ce": 0.24399766325950623, "loss_lvr": 0.7195325493812561, "loss_mode_switch": 0.0, "loss_total": 0.3159509301185608, "step": 2392 }, { "batch_size": 4, "epoch": 0.9568, "step": 2392, "tokens_per_device": 10024 }, { "epoch": 0.9568, "loss_ce": 0.007374455686658621, "loss_lvr": 0.5933045148849487, "loss_mode_switch": 0.0, "loss_total": 0.06670490652322769, "step": 2392 }, { "batch_size": 1, "epoch": 0.9568, "step": 2392, "tokens_per_device": 5043 }, { "epoch": 0.9568, "loss_ce": 0.0033968479838222265, "loss_lvr": 0.31643223762512207, "loss_mode_switch": 0.0, "loss_total": 0.03504006937146187, "step": 2392 }, { "batch_size": 4, "epoch": 0.9568, "step": 2392, "tokens_per_device": 5064 }, { "epoch": 0.9568, "loss_ce": 0.28359171748161316, "loss_lvr": 0.8112779855728149, "loss_mode_switch": 0.0, "loss_total": 0.3647195100784302, "step": 2392 }, { "batch_size": 1, "epoch": 0.9568, "step": 2392, "tokens_per_device": 4893 }, { "epoch": 0.9568, "loss_ce": 0.0006884420872665942, "loss_lvr": 0.21035267412662506, "loss_mode_switch": 0.0, "loss_total": 0.021723710000514984, "step": 2392 }, { "epoch": 0.9572, "grad_norm": 0.9207451343536377, "learning_rate": 4.79610069529246e-08, "loss": 0.169, "step": 2393 }, { "batch_size": 4, "epoch": 0.9572, "step": 2393, "tokens_per_device": 3840 }, { "epoch": 0.9572, "loss_ce": 0.13577280938625336, "loss_lvr": 0.8259507417678833, "loss_mode_switch": 0.0, "loss_total": 0.21836787462234497, "step": 2393 }, { "batch_size": 4, "epoch": 0.9572, "step": 2393, "tokens_per_device": 1292 }, { "epoch": 0.9572, "loss_ce": 0.45045721530914307, "loss_lvr": 0.9817432761192322, "loss_mode_switch": 0.0, "loss_total": 0.5486315488815308, "step": 2393 }, { "batch_size": 4, "epoch": 0.9572, "step": 2393, "tokens_per_device": 1492 }, { "epoch": 0.9572, "loss_ce": 0.1537409871816635, "loss_lvr": 0.840231716632843, "loss_mode_switch": 0.0, "loss_total": 0.2377641499042511, "step": 2393 }, { "batch_size": 4, "epoch": 0.9572, "step": 2393, "tokens_per_device": 1452 }, { "epoch": 0.9572, "loss_ce": 0.5093948245048523, "loss_lvr": 0.8595471382141113, "loss_mode_switch": 0.0, "loss_total": 0.5953495502471924, "step": 2393 }, { "batch_size": 4, "epoch": 0.9572, "step": 2393, "tokens_per_device": 4592 }, { "epoch": 0.9572, "loss_ce": 0.2781248390674591, "loss_lvr": 0.7915262579917908, "loss_mode_switch": 0.0, "loss_total": 0.35727745294570923, "step": 2393 }, { "batch_size": 4, "epoch": 0.9572, "step": 2393, "tokens_per_device": 3204 }, { "epoch": 0.9572, "loss_ce": 0.22958068549633026, "loss_lvr": 0.7428680062294006, "loss_mode_switch": 0.0, "loss_total": 0.30386748909950256, "step": 2393 }, { "batch_size": 4, "epoch": 0.9572, "step": 2393, "tokens_per_device": 11172 }, { "epoch": 0.9572, "loss_ce": 0.24681127071380615, "loss_lvr": 0.6914324760437012, "loss_mode_switch": 0.0, "loss_total": 0.3159545063972473, "step": 2393 }, { "batch_size": 4, "epoch": 0.9572, "step": 2393, "tokens_per_device": 4732 }, { "epoch": 0.9572, "loss_ce": 0.5271463990211487, "loss_lvr": 0.7471203804016113, "loss_mode_switch": 0.0, "loss_total": 0.6018584370613098, "step": 2393 }, { "epoch": 0.9576, "grad_norm": 1.2158790826797485, "learning_rate": 4.707013129002291e-08, "loss": 0.2553, "step": 2394 }, { "batch_size": 1, "epoch": 0.9576, "step": 2394, "tokens_per_device": 6934 }, { "epoch": 0.9576, "loss_ce": 0.016542071476578712, "loss_lvr": 0.40305373072624207, "loss_mode_switch": 0.0, "loss_total": 0.05684744566679001, "step": 2394 }, { "batch_size": 4, "epoch": 0.9576, "step": 2394, "tokens_per_device": 2676 }, { "epoch": 0.9576, "loss_ce": 0.3695468008518219, "loss_lvr": 0.7641295790672302, "loss_mode_switch": 0.0, "loss_total": 0.44595974683761597, "step": 2394 }, { "batch_size": 4, "epoch": 0.9576, "step": 2394, "tokens_per_device": 4140 }, { "epoch": 0.9576, "loss_ce": 0.15651343762874603, "loss_lvr": 0.8961989879608154, "loss_mode_switch": 0.0, "loss_total": 0.24613332748413086, "step": 2394 }, { "batch_size": 1, "epoch": 0.9576, "step": 2394, "tokens_per_device": 5052 }, { "epoch": 0.9576, "loss_ce": 0.003666679607704282, "loss_lvr": 0.3015865385532379, "loss_mode_switch": 0.0, "loss_total": 0.033825334161520004, "step": 2394 }, { "batch_size": 4, "epoch": 0.9576, "step": 2394, "tokens_per_device": 1488 }, { "epoch": 0.9576, "loss_ce": 0.2793055474758148, "loss_lvr": 1.0440260171890259, "loss_mode_switch": 0.0, "loss_total": 0.3837081491947174, "step": 2394 }, { "batch_size": 4, "epoch": 0.9576, "step": 2394, "tokens_per_device": 4712 }, { "epoch": 0.9576, "loss_ce": 0.35090455412864685, "loss_lvr": 0.921233594417572, "loss_mode_switch": 0.0, "loss_total": 0.44302791357040405, "step": 2394 }, { "batch_size": 1, "epoch": 0.9576, "step": 2394, "tokens_per_device": 4883 }, { "epoch": 0.9576, "loss_ce": 0.00031055061845108867, "loss_lvr": 0.33153894543647766, "loss_mode_switch": 0.0, "loss_total": 0.033464446663856506, "step": 2394 }, { "batch_size": 4, "epoch": 0.9576, "step": 2394, "tokens_per_device": 4248 }, { "epoch": 0.9576, "loss_ce": 0.21654750406742096, "loss_lvr": 0.5645296573638916, "loss_mode_switch": 0.0, "loss_total": 0.27300047874450684, "step": 2394 }, { "epoch": 0.958, "grad_norm": 1.106338620185852, "learning_rate": 4.618756825572612e-08, "loss": 0.2441, "step": 2395 }, { "batch_size": 4, "epoch": 0.958, "step": 2395, "tokens_per_device": 8212 }, { "epoch": 0.958, "loss_ce": 0.04595867916941643, "loss_lvr": 0.7999112010002136, "loss_mode_switch": 0.0, "loss_total": 0.12594980001449585, "step": 2395 }, { "batch_size": 4, "epoch": 0.958, "step": 2395, "tokens_per_device": 4276 }, { "epoch": 0.958, "loss_ce": 0.12028785794973373, "loss_lvr": 1.3490794897079468, "loss_mode_switch": 0.0, "loss_total": 0.2551957964897156, "step": 2395 }, { "batch_size": 4, "epoch": 0.958, "step": 2395, "tokens_per_device": 4508 }, { "epoch": 0.958, "loss_ce": 0.11606542766094208, "loss_lvr": 0.8160644173622131, "loss_mode_switch": 0.0, "loss_total": 0.19767186045646667, "step": 2395 }, { "batch_size": 4, "epoch": 0.958, "step": 2395, "tokens_per_device": 3932 }, { "epoch": 0.958, "loss_ce": 0.12377900630235672, "loss_lvr": 2.311339855194092, "loss_mode_switch": 0.0, "loss_total": 0.35491299629211426, "step": 2395 }, { "batch_size": 1, "epoch": 0.958, "step": 2395, "tokens_per_device": 4887 }, { "epoch": 0.958, "loss_ce": 0.02104084938764572, "loss_lvr": 0.2685801386833191, "loss_mode_switch": 0.0, "loss_total": 0.04789886623620987, "step": 2395 }, { "batch_size": 4, "epoch": 0.958, "step": 2395, "tokens_per_device": 4232 }, { "epoch": 0.958, "loss_ce": 0.2883141338825226, "loss_lvr": 0.40328195691108704, "loss_mode_switch": 0.0, "loss_total": 0.328642338514328, "step": 2395 }, { "batch_size": 4, "epoch": 0.958, "step": 2395, "tokens_per_device": 2692 }, { "epoch": 0.958, "loss_ce": 0.41329050064086914, "loss_lvr": 0.892407238483429, "loss_mode_switch": 0.0, "loss_total": 0.5025312304496765, "step": 2395 }, { "batch_size": 4, "epoch": 0.958, "step": 2395, "tokens_per_device": 3500 }, { "epoch": 0.958, "loss_ce": 0.6197314262390137, "loss_lvr": 0.6960511207580566, "loss_mode_switch": 0.0, "loss_total": 0.6893365383148193, "step": 2395 }, { "epoch": 0.9584, "grad_norm": 1.0882169008255005, "learning_rate": 4.5313319331262703e-08, "loss": 0.221, "step": 2396 }, { "batch_size": 4, "epoch": 0.9584, "step": 2396, "tokens_per_device": 5448 }, { "epoch": 0.9584, "loss_ce": 0.11406855285167694, "loss_lvr": 0.6076786518096924, "loss_mode_switch": 0.0, "loss_total": 0.1748364269733429, "step": 2396 }, { "batch_size": 1, "epoch": 0.9584, "step": 2396, "tokens_per_device": 5012 }, { "epoch": 0.9584, "loss_ce": 0.07538704574108124, "loss_lvr": 0.712677538394928, "loss_mode_switch": 0.0, "loss_total": 0.14665479958057404, "step": 2396 }, { "batch_size": 4, "epoch": 0.9584, "step": 2396, "tokens_per_device": 6884 }, { "epoch": 0.9584, "loss_ce": 0.12358184158802032, "loss_lvr": 0.5828263163566589, "loss_mode_switch": 0.0, "loss_total": 0.18186447024345398, "step": 2396 }, { "batch_size": 4, "epoch": 0.9584, "step": 2396, "tokens_per_device": 3800 }, { "epoch": 0.9584, "loss_ce": 0.09942522644996643, "loss_lvr": 1.981474757194519, "loss_mode_switch": 0.0, "loss_total": 0.29757270216941833, "step": 2396 }, { "batch_size": 4, "epoch": 0.9584, "step": 2396, "tokens_per_device": 2824 }, { "epoch": 0.9584, "loss_ce": 0.10189638286828995, "loss_lvr": 0.8426580429077148, "loss_mode_switch": 0.0, "loss_total": 0.18616218864917755, "step": 2396 }, { "batch_size": 1, "epoch": 0.9584, "step": 2396, "tokens_per_device": 4893 }, { "epoch": 0.9584, "loss_ce": 0.019859520718455315, "loss_lvr": 0.3911205232143402, "loss_mode_switch": 0.0, "loss_total": 0.058971576392650604, "step": 2396 }, { "batch_size": 4, "epoch": 0.9584, "step": 2396, "tokens_per_device": 14636 }, { "epoch": 0.9584, "loss_ce": 0.1322667896747589, "loss_lvr": 0.3816797137260437, "loss_mode_switch": 0.0, "loss_total": 0.17043475806713104, "step": 2396 }, { "batch_size": 1, "epoch": 0.9584, "step": 2396, "tokens_per_device": 4939 }, { "epoch": 0.9584, "loss_ce": 0.2372870296239853, "loss_lvr": 0.407821923494339, "loss_mode_switch": 0.0, "loss_total": 0.27806922793388367, "step": 2396 }, { "epoch": 0.9588, "grad_norm": 1.1129103899002075, "learning_rate": 4.4447385983907855e-08, "loss": 0.2399, "step": 2397 }, { "batch_size": 1, "epoch": 0.9588, "step": 2397, "tokens_per_device": 6890 }, { "epoch": 0.9588, "loss_ce": 0.07645832002162933, "loss_lvr": 0.23867595195770264, "loss_mode_switch": 0.0, "loss_total": 0.10032591223716736, "step": 2397 }, { "batch_size": 4, "epoch": 0.9588, "step": 2397, "tokens_per_device": 3972 }, { "epoch": 0.9588, "loss_ce": 0.0479547418653965, "loss_lvr": 2.2736518383026123, "loss_mode_switch": 0.0, "loss_total": 0.2753199338912964, "step": 2397 }, { "batch_size": 4, "epoch": 0.9588, "step": 2397, "tokens_per_device": 4900 }, { "epoch": 0.9588, "loss_ce": 0.2230706512928009, "loss_lvr": 0.96392822265625, "loss_mode_switch": 0.0, "loss_total": 0.31946349143981934, "step": 2397 }, { "batch_size": 4, "epoch": 0.9588, "step": 2397, "tokens_per_device": 5076 }, { "epoch": 0.9588, "loss_ce": 0.016205336898565292, "loss_lvr": 0.6424327492713928, "loss_mode_switch": 0.0, "loss_total": 0.08044861257076263, "step": 2397 }, { "batch_size": 4, "epoch": 0.9588, "step": 2397, "tokens_per_device": 4660 }, { "epoch": 0.9588, "loss_ce": 0.2836332321166992, "loss_lvr": 0.8757198452949524, "loss_mode_switch": 0.0, "loss_total": 0.37120521068573, "step": 2397 }, { "batch_size": 4, "epoch": 0.9588, "step": 2397, "tokens_per_device": 2684 }, { "epoch": 0.9588, "loss_ce": 0.4916432797908783, "loss_lvr": 0.7895026206970215, "loss_mode_switch": 0.0, "loss_total": 0.570593535900116, "step": 2397 }, { "batch_size": 4, "epoch": 0.9588, "step": 2397, "tokens_per_device": 3912 }, { "epoch": 0.9588, "loss_ce": 0.06742224842309952, "loss_lvr": 0.699431300163269, "loss_mode_switch": 0.0, "loss_total": 0.13736537098884583, "step": 2397 }, { "batch_size": 4, "epoch": 0.9588, "step": 2397, "tokens_per_device": 4288 }, { "epoch": 0.9588, "loss_ce": 0.1790841668844223, "loss_lvr": 0.8786169290542603, "loss_mode_switch": 0.0, "loss_total": 0.26694586873054504, "step": 2397 }, { "epoch": 0.9592, "grad_norm": 1.1560713052749634, "learning_rate": 4.3589769666978476e-08, "loss": 0.2137, "step": 2398 }, { "batch_size": 4, "epoch": 0.9592, "step": 2398, "tokens_per_device": 12804 }, { "epoch": 0.9592, "loss_ce": 0.3580783009529114, "loss_lvr": 0.7213569283485413, "loss_mode_switch": 0.0, "loss_total": 0.430213987827301, "step": 2398 }, { "batch_size": 1, "epoch": 0.9592, "step": 2398, "tokens_per_device": 4104 }, { "epoch": 0.9592, "loss_ce": 0.0004498083726502955, "loss_lvr": 0.37901467084884644, "loss_mode_switch": 0.0, "loss_total": 0.03835127875208855, "step": 2398 }, { "batch_size": 4, "epoch": 0.9592, "step": 2398, "tokens_per_device": 1572 }, { "epoch": 0.9592, "loss_ce": 0.5336184501647949, "loss_lvr": 0.8339661359786987, "loss_mode_switch": 0.0, "loss_total": 0.6170150637626648, "step": 2398 }, { "batch_size": 1, "epoch": 0.9592, "step": 2398, "tokens_per_device": 4863 }, { "epoch": 0.9592, "loss_ce": 0.00039115716936066747, "loss_lvr": 0.21699965000152588, "loss_mode_switch": 0.0, "loss_total": 0.022091122344136238, "step": 2398 }, { "batch_size": 4, "epoch": 0.9592, "step": 2398, "tokens_per_device": 2680 }, { "epoch": 0.9592, "loss_ce": 0.05001923069357872, "loss_lvr": 0.6668904423713684, "loss_mode_switch": 0.0, "loss_total": 0.11670827865600586, "step": 2398 }, { "batch_size": 4, "epoch": 0.9592, "step": 2398, "tokens_per_device": 3900 }, { "epoch": 0.9592, "loss_ce": 0.2615611255168915, "loss_lvr": 0.911150336265564, "loss_mode_switch": 0.0, "loss_total": 0.3526761531829834, "step": 2398 }, { "batch_size": 4, "epoch": 0.9592, "step": 2398, "tokens_per_device": 5080 }, { "epoch": 0.9592, "loss_ce": 0.15345411002635956, "loss_lvr": 0.7858421206474304, "loss_mode_switch": 0.0, "loss_total": 0.23203831911087036, "step": 2398 }, { "batch_size": 4, "epoch": 0.9592, "step": 2398, "tokens_per_device": 3724 }, { "epoch": 0.9592, "loss_ce": 0.029964905232191086, "loss_lvr": 1.365842580795288, "loss_mode_switch": 0.0, "loss_total": 0.1665491759777069, "step": 2398 }, { "epoch": 0.9596, "grad_norm": 1.0003465414047241, "learning_rate": 4.274047181983487e-08, "loss": 0.2043, "step": 2399 }, { "batch_size": 4, "epoch": 0.9596, "step": 2399, "tokens_per_device": 4232 }, { "epoch": 0.9596, "loss_ce": 0.0022191512398421764, "loss_lvr": 0.5078747272491455, "loss_mode_switch": 0.0, "loss_total": 0.053006626665592194, "step": 2399 }, { "batch_size": 4, "epoch": 0.9596, "step": 2399, "tokens_per_device": 2100 }, { "epoch": 0.9596, "loss_ce": 0.4573751986026764, "loss_lvr": 0.8636284470558167, "loss_mode_switch": 0.0, "loss_total": 0.543738067150116, "step": 2399 }, { "batch_size": 4, "epoch": 0.9596, "step": 2399, "tokens_per_device": 4252 }, { "epoch": 0.9596, "loss_ce": 0.19279196858406067, "loss_lvr": 0.9938632249832153, "loss_mode_switch": 0.0, "loss_total": 0.29217830300331116, "step": 2399 }, { "batch_size": 1, "epoch": 0.9596, "step": 2399, "tokens_per_device": 5856 }, { "epoch": 0.9596, "loss_ce": 0.0004685414314735681, "loss_lvr": 0.37848448753356934, "loss_mode_switch": 0.0, "loss_total": 0.0383169911801815, "step": 2399 }, { "batch_size": 4, "epoch": 0.9596, "step": 2399, "tokens_per_device": 2744 }, { "epoch": 0.9596, "loss_ce": 0.16446128487586975, "loss_lvr": 0.6305087804794312, "loss_mode_switch": 0.0, "loss_total": 0.2275121659040451, "step": 2399 }, { "batch_size": 4, "epoch": 0.9596, "step": 2399, "tokens_per_device": 4264 }, { "epoch": 0.9596, "loss_ce": 0.01057191751897335, "loss_lvr": 0.9411401152610779, "loss_mode_switch": 0.0, "loss_total": 0.10468593239784241, "step": 2399 }, { "batch_size": 1, "epoch": 0.9596, "step": 2399, "tokens_per_device": 4926 }, { "epoch": 0.9596, "loss_ce": 0.16870692372322083, "loss_lvr": 0.4795435070991516, "loss_mode_switch": 0.0, "loss_total": 0.216661274433136, "step": 2399 }, { "batch_size": 4, "epoch": 0.9596, "step": 2399, "tokens_per_device": 3776 }, { "epoch": 0.9596, "loss_ce": 0.13378159701824188, "loss_lvr": 0.9779433608055115, "loss_mode_switch": 0.0, "loss_total": 0.23157593607902527, "step": 2399 }, { "epoch": 0.96, "grad_norm": 1.0742573738098145, "learning_rate": 4.189949386787462e-08, "loss": 0.2154, "step": 2400 }, { "batch_size": 4, "epoch": 0.96, "step": 2400, "tokens_per_device": 3800 }, { "epoch": 0.96, "loss_ce": 0.16244827210903168, "loss_lvr": 0.8127692937850952, "loss_mode_switch": 0.0, "loss_total": 0.24372521042823792, "step": 2400 }, { "batch_size": 4, "epoch": 0.96, "step": 2400, "tokens_per_device": 5352 }, { "epoch": 0.96, "loss_ce": 0.2502666115760803, "loss_lvr": 0.5884436964988708, "loss_mode_switch": 0.0, "loss_total": 0.30911096930503845, "step": 2400 }, { "batch_size": 4, "epoch": 0.96, "step": 2400, "tokens_per_device": 4056 }, { "epoch": 0.96, "loss_ce": 0.08300449699163437, "loss_lvr": 0.8430498838424683, "loss_mode_switch": 0.0, "loss_total": 0.1673094928264618, "step": 2400 }, { "batch_size": 4, "epoch": 0.96, "step": 2400, "tokens_per_device": 3788 }, { "epoch": 0.96, "loss_ce": 0.02540649101138115, "loss_lvr": 0.9173678755760193, "loss_mode_switch": 0.0, "loss_total": 0.11714327335357666, "step": 2400 }, { "batch_size": 4, "epoch": 0.96, "step": 2400, "tokens_per_device": 2088 }, { "epoch": 0.96, "loss_ce": 0.2847108542919159, "loss_lvr": 0.9533411264419556, "loss_mode_switch": 0.0, "loss_total": 0.38004496693611145, "step": 2400 }, { "batch_size": 4, "epoch": 0.96, "step": 2400, "tokens_per_device": 4104 }, { "epoch": 0.96, "loss_ce": 0.16200125217437744, "loss_lvr": 0.8823104500770569, "loss_mode_switch": 0.0, "loss_total": 0.2502323091030121, "step": 2400 }, { "batch_size": 4, "epoch": 0.96, "step": 2400, "tokens_per_device": 2700 }, { "epoch": 0.96, "loss_ce": 0.5014469027519226, "loss_lvr": 0.7689812779426575, "loss_mode_switch": 0.0, "loss_total": 0.5783450603485107, "step": 2400 }, { "batch_size": 1, "epoch": 0.96, "step": 2400, "tokens_per_device": 4943 }, { "epoch": 0.96, "loss_ce": 0.026550162583589554, "loss_lvr": 0.4741096496582031, "loss_mode_switch": 0.0, "loss_total": 0.07396112382411957, "step": 2400 }, { "epoch": 0.9604, "grad_norm": 1.1549561023712158, "learning_rate": 4.106683722253257e-08, "loss": 0.2407, "step": 2401 }, { "batch_size": 4, "epoch": 0.9604, "step": 2401, "tokens_per_device": 1584 }, { "epoch": 0.9604, "loss_ce": 0.46155598759651184, "loss_lvr": 0.9252755641937256, "loss_mode_switch": 0.0, "loss_total": 0.554083526134491, "step": 2401 }, { "batch_size": 1, "epoch": 0.9604, "step": 2401, "tokens_per_device": 5179 }, { "epoch": 0.9604, "loss_ce": 0.07168404757976532, "loss_lvr": 0.5364109873771667, "loss_mode_switch": 0.0, "loss_total": 0.12532514333724976, "step": 2401 }, { "batch_size": 4, "epoch": 0.9604, "step": 2401, "tokens_per_device": 2544 }, { "epoch": 0.9604, "loss_ce": 0.4440964162349701, "loss_lvr": 0.8958908915519714, "loss_mode_switch": 0.0, "loss_total": 0.5336855053901672, "step": 2401 }, { "batch_size": 4, "epoch": 0.9604, "step": 2401, "tokens_per_device": 8320 }, { "epoch": 0.9604, "loss_ce": 0.06941056251525879, "loss_lvr": 0.6497976183891296, "loss_mode_switch": 0.0, "loss_total": 0.13439032435417175, "step": 2401 }, { "batch_size": 4, "epoch": 0.9604, "step": 2401, "tokens_per_device": 6152 }, { "epoch": 0.9604, "loss_ce": 0.17131438851356506, "loss_lvr": 0.9833753705024719, "loss_mode_switch": 0.0, "loss_total": 0.2696519196033478, "step": 2401 }, { "batch_size": 4, "epoch": 0.9604, "step": 2401, "tokens_per_device": 4296 }, { "epoch": 0.9604, "loss_ce": 0.21690575778484344, "loss_lvr": 0.9730604887008667, "loss_mode_switch": 0.0, "loss_total": 0.31421181559562683, "step": 2401 }, { "batch_size": 4, "epoch": 0.9604, "step": 2401, "tokens_per_device": 3408 }, { "epoch": 0.9604, "loss_ce": 0.1131388396024704, "loss_lvr": 0.5123737454414368, "loss_mode_switch": 0.0, "loss_total": 0.16437621414661407, "step": 2401 }, { "batch_size": 4, "epoch": 0.9604, "step": 2401, "tokens_per_device": 4516 }, { "epoch": 0.9604, "loss_ce": 0.4166739881038666, "loss_lvr": 0.7831838726997375, "loss_mode_switch": 0.0, "loss_total": 0.49499237537384033, "step": 2401 }, { "epoch": 0.9608, "grad_norm": 1.0814878940582275, "learning_rate": 4.024250328127755e-08, "loss": 0.2172, "step": 2402 }, { "batch_size": 4, "epoch": 0.9608, "step": 2402, "tokens_per_device": 4236 }, { "epoch": 0.9608, "loss_ce": 0.10173503309488297, "loss_lvr": 1.0664902925491333, "loss_mode_switch": 0.0, "loss_total": 0.20838406682014465, "step": 2402 }, { "batch_size": 1, "epoch": 0.9608, "step": 2402, "tokens_per_device": 5041 }, { "epoch": 0.9608, "loss_ce": 0.19111335277557373, "loss_lvr": 0.49506720900535583, "loss_mode_switch": 0.0, "loss_total": 0.24062007665634155, "step": 2402 }, { "batch_size": 4, "epoch": 0.9608, "step": 2402, "tokens_per_device": 1468 }, { "epoch": 0.9608, "loss_ce": 0.09037504345178604, "loss_lvr": 1.04621160030365, "loss_mode_switch": 0.0, "loss_total": 0.1949962079524994, "step": 2402 }, { "batch_size": 4, "epoch": 0.9608, "step": 2402, "tokens_per_device": 3828 }, { "epoch": 0.9608, "loss_ce": 0.0016732873627915978, "loss_lvr": 0.9109812378883362, "loss_mode_switch": 0.0, "loss_total": 0.09277141094207764, "step": 2402 }, { "batch_size": 4, "epoch": 0.9608, "step": 2402, "tokens_per_device": 2644 }, { "epoch": 0.9608, "loss_ce": 0.16740001738071442, "loss_lvr": 0.4949139654636383, "loss_mode_switch": 0.0, "loss_total": 0.21689140796661377, "step": 2402 }, { "batch_size": 4, "epoch": 0.9608, "step": 2402, "tokens_per_device": 3768 }, { "epoch": 0.9608, "loss_ce": 0.018879568204283714, "loss_lvr": 0.9224644899368286, "loss_mode_switch": 0.0, "loss_total": 0.11112602055072784, "step": 2402 }, { "batch_size": 4, "epoch": 0.9608, "step": 2402, "tokens_per_device": 3796 }, { "epoch": 0.9608, "loss_ce": 0.24616968631744385, "loss_lvr": 1.2088515758514404, "loss_mode_switch": 0.0, "loss_total": 0.36705484986305237, "step": 2402 }, { "batch_size": 4, "epoch": 0.9608, "step": 2402, "tokens_per_device": 3804 }, { "epoch": 0.9608, "loss_ce": 0.3473358452320099, "loss_lvr": 0.9998234510421753, "loss_mode_switch": 0.0, "loss_total": 0.4473181962966919, "step": 2402 }, { "epoch": 0.9612, "grad_norm": 1.1574316024780273, "learning_rate": 3.9426493427611177e-08, "loss": 0.2402, "step": 2403 }, { "batch_size": 4, "epoch": 0.9612, "step": 2403, "tokens_per_device": 5160 }, { "epoch": 0.9612, "loss_ce": 0.5053316354751587, "loss_lvr": 0.7023547291755676, "loss_mode_switch": 0.0, "loss_total": 0.5755671262741089, "step": 2403 }, { "batch_size": 4, "epoch": 0.9612, "step": 2403, "tokens_per_device": 4216 }, { "epoch": 0.9612, "loss_ce": 0.010858096182346344, "loss_lvr": 0.6935835480690002, "loss_mode_switch": 0.0, "loss_total": 0.08021645247936249, "step": 2403 }, { "batch_size": 4, "epoch": 0.9612, "step": 2403, "tokens_per_device": 6224 }, { "epoch": 0.9612, "loss_ce": 0.004070252645760775, "loss_lvr": 0.8815687298774719, "loss_mode_switch": 0.0, "loss_total": 0.09222712367773056, "step": 2403 }, { "batch_size": 4, "epoch": 0.9612, "step": 2403, "tokens_per_device": 1256 }, { "epoch": 0.9612, "loss_ce": 0.31167781352996826, "loss_lvr": 0.9306318163871765, "loss_mode_switch": 0.0, "loss_total": 0.40474098920822144, "step": 2403 }, { "batch_size": 1, "epoch": 0.9612, "step": 2403, "tokens_per_device": 4797 }, { "epoch": 0.9612, "loss_ce": 0.03115876577794552, "loss_lvr": 0.47077587246894836, "loss_mode_switch": 0.0, "loss_total": 0.07823635637760162, "step": 2403 }, { "batch_size": 1, "epoch": 0.9612, "step": 2403, "tokens_per_device": 4887 }, { "epoch": 0.9612, "loss_ce": 0.0001410945551469922, "loss_lvr": 0.2228686362504959, "loss_mode_switch": 0.0, "loss_total": 0.022427959367632866, "step": 2403 }, { "batch_size": 4, "epoch": 0.9612, "step": 2403, "tokens_per_device": 3844 }, { "epoch": 0.9612, "loss_ce": 0.021198634058237076, "loss_lvr": 1.3388581275939941, "loss_mode_switch": 0.0, "loss_total": 0.15508444607257843, "step": 2403 }, { "batch_size": 4, "epoch": 0.9612, "step": 2403, "tokens_per_device": 4340 }, { "epoch": 0.9612, "loss_ce": 0.056847747415304184, "loss_lvr": 0.7879602313041687, "loss_mode_switch": 0.0, "loss_total": 0.13564376533031464, "step": 2403 }, { "epoch": 0.9616, "grad_norm": 1.067495346069336, "learning_rate": 3.8618809031061855e-08, "loss": 0.2202, "step": 2404 }, { "batch_size": 4, "epoch": 0.9616, "step": 2404, "tokens_per_device": 6300 }, { "epoch": 0.9616, "loss_ce": 0.10702410340309143, "loss_lvr": 0.7831016778945923, "loss_mode_switch": 0.0, "loss_total": 0.18533426523208618, "step": 2404 }, { "batch_size": 4, "epoch": 0.9616, "step": 2404, "tokens_per_device": 4356 }, { "epoch": 0.9616, "loss_ce": 0.029350588098168373, "loss_lvr": 0.8804599046707153, "loss_mode_switch": 0.0, "loss_total": 0.11739657819271088, "step": 2404 }, { "batch_size": 4, "epoch": 0.9616, "step": 2404, "tokens_per_device": 6112 }, { "epoch": 0.9616, "loss_ce": 0.2535460889339447, "loss_lvr": 0.7448130249977112, "loss_mode_switch": 0.0, "loss_total": 0.3280273973941803, "step": 2404 }, { "batch_size": 4, "epoch": 0.9616, "step": 2404, "tokens_per_device": 1632 }, { "epoch": 0.9616, "loss_ce": 0.46343982219696045, "loss_lvr": 1.1353089809417725, "loss_mode_switch": 0.0, "loss_total": 0.5769706964492798, "step": 2404 }, { "batch_size": 4, "epoch": 0.9616, "step": 2404, "tokens_per_device": 1268 }, { "epoch": 0.9616, "loss_ce": 0.06862810999155045, "loss_lvr": 0.7652021050453186, "loss_mode_switch": 0.0, "loss_total": 0.14514832198619843, "step": 2404 }, { "batch_size": 1, "epoch": 0.9616, "step": 2404, "tokens_per_device": 4845 }, { "epoch": 0.9616, "loss_ce": 0.1327640265226364, "loss_lvr": 0.29731717705726624, "loss_mode_switch": 0.0, "loss_total": 0.16249574720859528, "step": 2404 }, { "batch_size": 4, "epoch": 0.9616, "step": 2404, "tokens_per_device": 5068 }, { "epoch": 0.9616, "loss_ce": 0.25239327549934387, "loss_lvr": 0.7960257530212402, "loss_mode_switch": 0.0, "loss_total": 0.3319958448410034, "step": 2404 }, { "batch_size": 4, "epoch": 0.9616, "step": 2404, "tokens_per_device": 4288 }, { "epoch": 0.9616, "loss_ce": 0.3007435202598572, "loss_lvr": 0.6122015714645386, "loss_mode_switch": 0.0, "loss_total": 0.36196368932724, "step": 2404 }, { "epoch": 0.962, "grad_norm": 1.162041425704956, "learning_rate": 3.781945144718912e-08, "loss": 0.2261, "step": 2405 }, { "batch_size": 4, "epoch": 0.962, "step": 2405, "tokens_per_device": 6004 }, { "epoch": 0.962, "loss_ce": 0.2393057942390442, "loss_lvr": 0.7427334189414978, "loss_mode_switch": 0.0, "loss_total": 0.31357914209365845, "step": 2405 }, { "batch_size": 1, "epoch": 0.962, "step": 2405, "tokens_per_device": 5244 }, { "epoch": 0.962, "loss_ce": 0.15805578231811523, "loss_lvr": 0.23064543306827545, "loss_mode_switch": 0.0, "loss_total": 0.18112032115459442, "step": 2405 }, { "batch_size": 4, "epoch": 0.962, "step": 2405, "tokens_per_device": 3220 }, { "epoch": 0.962, "loss_ce": 0.08696010708808899, "loss_lvr": 0.7273783683776855, "loss_mode_switch": 0.0, "loss_total": 0.15969794988632202, "step": 2405 }, { "batch_size": 1, "epoch": 0.962, "step": 2405, "tokens_per_device": 4903 }, { "epoch": 0.962, "loss_ce": 0.016028815880417824, "loss_lvr": 0.254780650138855, "loss_mode_switch": 0.0, "loss_total": 0.04150687903165817, "step": 2405 }, { "batch_size": 4, "epoch": 0.962, "step": 2405, "tokens_per_device": 4404 }, { "epoch": 0.962, "loss_ce": 0.05006547272205353, "loss_lvr": 0.5692107081413269, "loss_mode_switch": 0.0, "loss_total": 0.10698654502630234, "step": 2405 }, { "batch_size": 4, "epoch": 0.962, "step": 2405, "tokens_per_device": 10684 }, { "epoch": 0.962, "loss_ce": 0.002526365453377366, "loss_lvr": 0.7315912246704102, "loss_mode_switch": 0.0, "loss_total": 0.07568548619747162, "step": 2405 }, { "batch_size": 4, "epoch": 0.962, "step": 2405, "tokens_per_device": 1616 }, { "epoch": 0.962, "loss_ce": 0.2916187047958374, "loss_lvr": 0.8692919611930847, "loss_mode_switch": 0.0, "loss_total": 0.37854790687561035, "step": 2405 }, { "batch_size": 4, "epoch": 0.962, "step": 2405, "tokens_per_device": 3872 }, { "epoch": 0.962, "loss_ce": 0.13157734274864197, "loss_lvr": 0.8169142603874207, "loss_mode_switch": 0.0, "loss_total": 0.21326877176761627, "step": 2405 }, { "epoch": 0.9624, "grad_norm": 1.0777870416641235, "learning_rate": 3.7028422017573175e-08, "loss": 0.2081, "step": 2406 }, { "batch_size": 1, "epoch": 0.9624, "step": 2406, "tokens_per_device": 5159 }, { "epoch": 0.9624, "loss_ce": 0.08621780574321747, "loss_lvr": 0.4542853534221649, "loss_mode_switch": 0.0, "loss_total": 0.13164633512496948, "step": 2406 }, { "batch_size": 4, "epoch": 0.9624, "step": 2406, "tokens_per_device": 4400 }, { "epoch": 0.9624, "loss_ce": 0.18706785142421722, "loss_lvr": 0.7033160328865051, "loss_mode_switch": 0.0, "loss_total": 0.25739943981170654, "step": 2406 }, { "batch_size": 1, "epoch": 0.9624, "step": 2406, "tokens_per_device": 6162 }, { "epoch": 0.9624, "loss_ce": 0.0051692756824195385, "loss_lvr": 0.30548718571662903, "loss_mode_switch": 0.0, "loss_total": 0.03571799397468567, "step": 2406 }, { "batch_size": 1, "epoch": 0.9624, "step": 2406, "tokens_per_device": 7398 }, { "epoch": 0.9624, "loss_ce": 0.06959692388772964, "loss_lvr": 0.281786173582077, "loss_mode_switch": 0.0, "loss_total": 0.09777554124593735, "step": 2406 }, { "batch_size": 1, "epoch": 0.9624, "step": 2406, "tokens_per_device": 5151 }, { "epoch": 0.9624, "loss_ce": 0.005858621560037136, "loss_lvr": 0.5345020890235901, "loss_mode_switch": 0.0, "loss_total": 0.05930883064866066, "step": 2406 }, { "batch_size": 4, "epoch": 0.9624, "step": 2406, "tokens_per_device": 3820 }, { "epoch": 0.9624, "loss_ce": 0.018453536555171013, "loss_lvr": 1.006174921989441, "loss_mode_switch": 0.0, "loss_total": 0.11907102912664413, "step": 2406 }, { "batch_size": 4, "epoch": 0.9624, "step": 2406, "tokens_per_device": 6504 }, { "epoch": 0.9624, "loss_ce": 0.28736141324043274, "loss_lvr": 0.6954361796379089, "loss_mode_switch": 0.0, "loss_total": 0.3569050431251526, "step": 2406 }, { "batch_size": 4, "epoch": 0.9624, "step": 2406, "tokens_per_device": 6036 }, { "epoch": 0.9624, "loss_ce": 0.17302128672599792, "loss_lvr": 0.7185209393501282, "loss_mode_switch": 0.0, "loss_total": 0.24487337470054626, "step": 2406 }, { "epoch": 0.9628, "grad_norm": 0.9684987664222717, "learning_rate": 3.62457220698198e-08, "loss": 0.1832, "step": 2407 }, { "batch_size": 4, "epoch": 0.9628, "step": 2407, "tokens_per_device": 1496 }, { "epoch": 0.9628, "loss_ce": 0.2708686590194702, "loss_lvr": 0.9943858981132507, "loss_mode_switch": 0.0, "loss_total": 0.3703072667121887, "step": 2407 }, { "batch_size": 4, "epoch": 0.9628, "step": 2407, "tokens_per_device": 2072 }, { "epoch": 0.9628, "loss_ce": 0.12644422054290771, "loss_lvr": 0.8816720247268677, "loss_mode_switch": 0.0, "loss_total": 0.21461142599582672, "step": 2407 }, { "batch_size": 4, "epoch": 0.9628, "step": 2407, "tokens_per_device": 3836 }, { "epoch": 0.9628, "loss_ce": 0.1406710147857666, "loss_lvr": 0.9496739506721497, "loss_mode_switch": 0.0, "loss_total": 0.23563840985298157, "step": 2407 }, { "batch_size": 1, "epoch": 0.9628, "step": 2407, "tokens_per_device": 5264 }, { "epoch": 0.9628, "loss_ce": 0.5236389636993408, "loss_lvr": 0.43386614322662354, "loss_mode_switch": 0.0, "loss_total": 0.5670256018638611, "step": 2407 }, { "batch_size": 4, "epoch": 0.9628, "step": 2407, "tokens_per_device": 4444 }, { "epoch": 0.9628, "loss_ce": 0.19563975930213928, "loss_lvr": 0.756477415561676, "loss_mode_switch": 0.0, "loss_total": 0.2712875008583069, "step": 2407 }, { "batch_size": 1, "epoch": 0.9628, "step": 2407, "tokens_per_device": 5197 }, { "epoch": 0.9628, "loss_ce": 0.013042232021689415, "loss_lvr": 0.37231728434562683, "loss_mode_switch": 0.0, "loss_total": 0.05027396231889725, "step": 2407 }, { "batch_size": 4, "epoch": 0.9628, "step": 2407, "tokens_per_device": 4700 }, { "epoch": 0.9628, "loss_ce": 0.3378728926181793, "loss_lvr": 0.7123649716377258, "loss_mode_switch": 0.0, "loss_total": 0.4091093838214874, "step": 2407 }, { "batch_size": 4, "epoch": 0.9628, "step": 2407, "tokens_per_device": 4424 }, { "epoch": 0.9628, "loss_ce": 0.07064028084278107, "loss_lvr": 0.7678284049034119, "loss_mode_switch": 0.0, "loss_total": 0.14742311835289001, "step": 2407 }, { "epoch": 0.9632, "grad_norm": 1.1785517930984497, "learning_rate": 3.547135291755488e-08, "loss": 0.2182, "step": 2408 }, { "batch_size": 4, "epoch": 0.9632, "step": 2408, "tokens_per_device": 4284 }, { "epoch": 0.9632, "loss_ce": 0.12405747175216675, "loss_lvr": 0.952242374420166, "loss_mode_switch": 0.0, "loss_total": 0.21928170323371887, "step": 2408 }, { "batch_size": 4, "epoch": 0.9632, "step": 2408, "tokens_per_device": 4424 }, { "epoch": 0.9632, "loss_ce": 0.568178117275238, "loss_lvr": 0.9022343158721924, "loss_mode_switch": 0.0, "loss_total": 0.6584015488624573, "step": 2408 }, { "batch_size": 1, "epoch": 0.9632, "step": 2408, "tokens_per_device": 5248 }, { "epoch": 0.9632, "loss_ce": 0.07951542735099792, "loss_lvr": 0.5419976115226746, "loss_mode_switch": 0.0, "loss_total": 0.1337151825428009, "step": 2408 }, { "batch_size": 4, "epoch": 0.9632, "step": 2408, "tokens_per_device": 2592 }, { "epoch": 0.9632, "loss_ce": 0.2440442591905594, "loss_lvr": 0.7743369340896606, "loss_mode_switch": 0.0, "loss_total": 0.3214779496192932, "step": 2408 }, { "batch_size": 1, "epoch": 0.9632, "step": 2408, "tokens_per_device": 5215 }, { "epoch": 0.9632, "loss_ce": 0.1832200288772583, "loss_lvr": 0.3008933663368225, "loss_mode_switch": 0.0, "loss_total": 0.2133093625307083, "step": 2408 }, { "batch_size": 4, "epoch": 0.9632, "step": 2408, "tokens_per_device": 4628 }, { "epoch": 0.9632, "loss_ce": 0.2536386549472809, "loss_lvr": 0.8051846623420715, "loss_mode_switch": 0.0, "loss_total": 0.3341571092605591, "step": 2408 }, { "batch_size": 4, "epoch": 0.9632, "step": 2408, "tokens_per_device": 3928 }, { "epoch": 0.9632, "loss_ce": 0.41180408000946045, "loss_lvr": 0.9476667642593384, "loss_mode_switch": 0.0, "loss_total": 0.5065707564353943, "step": 2408 }, { "batch_size": 1, "epoch": 0.9632, "step": 2408, "tokens_per_device": 4882 }, { "epoch": 0.9632, "loss_ce": 0.02345774695277214, "loss_lvr": 0.3617992699146271, "loss_mode_switch": 0.0, "loss_total": 0.05963767319917679, "step": 2408 }, { "epoch": 0.9636, "grad_norm": 1.3185229301452637, "learning_rate": 3.470531586042047e-08, "loss": 0.2798, "step": 2409 }, { "batch_size": 4, "epoch": 0.9636, "step": 2409, "tokens_per_device": 4744 }, { "epoch": 0.9636, "loss_ce": 0.011892723850905895, "loss_lvr": 0.6810889840126038, "loss_mode_switch": 0.0, "loss_total": 0.08000162243843079, "step": 2409 }, { "batch_size": 4, "epoch": 0.9636, "step": 2409, "tokens_per_device": 6028 }, { "epoch": 0.9636, "loss_ce": 0.012135997414588928, "loss_lvr": 0.6403487920761108, "loss_mode_switch": 0.0, "loss_total": 0.07617087662220001, "step": 2409 }, { "batch_size": 4, "epoch": 0.9636, "step": 2409, "tokens_per_device": 5728 }, { "epoch": 0.9636, "loss_ce": 0.009338850155472755, "loss_lvr": 0.7890405058860779, "loss_mode_switch": 0.0, "loss_total": 0.08824290335178375, "step": 2409 }, { "batch_size": 4, "epoch": 0.9636, "step": 2409, "tokens_per_device": 1336 }, { "epoch": 0.9636, "loss_ce": 0.596488356590271, "loss_lvr": 1.010516881942749, "loss_mode_switch": 0.0, "loss_total": 0.6975400447845459, "step": 2409 }, { "batch_size": 4, "epoch": 0.9636, "step": 2409, "tokens_per_device": 8272 }, { "epoch": 0.9636, "loss_ce": 0.2802268862724304, "loss_lvr": 0.909986674785614, "loss_mode_switch": 0.0, "loss_total": 0.3712255656719208, "step": 2409 }, { "batch_size": 1, "epoch": 0.9636, "step": 2409, "tokens_per_device": 5196 }, { "epoch": 0.9636, "loss_ce": 0.07574387639760971, "loss_lvr": 0.4628182053565979, "loss_mode_switch": 0.0, "loss_total": 0.12202569842338562, "step": 2409 }, { "batch_size": 1, "epoch": 0.9636, "step": 2409, "tokens_per_device": 4899 }, { "epoch": 0.9636, "loss_ce": 0.01684441976249218, "loss_lvr": 0.3442420959472656, "loss_mode_switch": 0.0, "loss_total": 0.05126862972974777, "step": 2409 }, { "batch_size": 4, "epoch": 0.9636, "step": 2409, "tokens_per_device": 8884 }, { "epoch": 0.9636, "loss_ce": 0.01964999921619892, "loss_lvr": 0.7208166122436523, "loss_mode_switch": 0.0, "loss_total": 0.09173166006803513, "step": 2409 }, { "epoch": 0.964, "grad_norm": 1.169519305229187, "learning_rate": 3.394761218407705e-08, "loss": 0.2347, "step": 2410 }, { "batch_size": 1, "epoch": 0.964, "step": 2410, "tokens_per_device": 4909 }, { "epoch": 0.964, "loss_ce": 0.004529946483671665, "loss_lvr": 0.3412516415119171, "loss_mode_switch": 0.0, "loss_total": 0.0386551097035408, "step": 2410 }, { "batch_size": 4, "epoch": 0.964, "step": 2410, "tokens_per_device": 5156 }, { "epoch": 0.964, "loss_ce": 0.14752811193466187, "loss_lvr": 0.6585443615913391, "loss_mode_switch": 0.0, "loss_total": 0.2133825421333313, "step": 2410 }, { "batch_size": 4, "epoch": 0.964, "step": 2410, "tokens_per_device": 3780 }, { "epoch": 0.964, "loss_ce": 0.16909314692020416, "loss_lvr": 1.303507685661316, "loss_mode_switch": 0.0, "loss_total": 0.29944390058517456, "step": 2410 }, { "batch_size": 1, "epoch": 0.964, "step": 2410, "tokens_per_device": 4961 }, { "epoch": 0.964, "loss_ce": 0.048202481120824814, "loss_lvr": 0.20480290055274963, "loss_mode_switch": 0.0, "loss_total": 0.06868277490139008, "step": 2410 }, { "batch_size": 1, "epoch": 0.964, "step": 2410, "tokens_per_device": 5552 }, { "epoch": 0.964, "loss_ce": 0.0006054213736206293, "loss_lvr": 0.38961222767829895, "loss_mode_switch": 0.0, "loss_total": 0.03956664726138115, "step": 2410 }, { "batch_size": 4, "epoch": 0.964, "step": 2410, "tokens_per_device": 4480 }, { "epoch": 0.964, "loss_ce": 0.18971678614616394, "loss_lvr": 0.8523203134536743, "loss_mode_switch": 0.0, "loss_total": 0.2749488353729248, "step": 2410 }, { "batch_size": 4, "epoch": 0.964, "step": 2410, "tokens_per_device": 3880 }, { "epoch": 0.964, "loss_ce": 0.07210750132799149, "loss_lvr": 0.8288131356239319, "loss_mode_switch": 0.0, "loss_total": 0.1549888253211975, "step": 2410 }, { "batch_size": 1, "epoch": 0.964, "step": 2410, "tokens_per_device": 5036 }, { "epoch": 0.964, "loss_ce": 0.00621543126180768, "loss_lvr": 0.27835577726364136, "loss_mode_switch": 0.0, "loss_total": 0.03405100852251053, "step": 2410 }, { "epoch": 0.9644, "grad_norm": 0.8857560157775879, "learning_rate": 3.3198243160198486e-08, "loss": 0.1672, "step": 2411 }, { "batch_size": 4, "epoch": 0.9644, "step": 2411, "tokens_per_device": 7132 }, { "epoch": 0.9644, "loss_ce": 0.011865117587149143, "loss_lvr": 0.6799923181533813, "loss_mode_switch": 0.0, "loss_total": 0.07986435294151306, "step": 2411 }, { "batch_size": 4, "epoch": 0.9644, "step": 2411, "tokens_per_device": 4256 }, { "epoch": 0.9644, "loss_ce": 0.4349142909049988, "loss_lvr": 0.7591277956962585, "loss_mode_switch": 0.0, "loss_total": 0.5108270645141602, "step": 2411 }, { "batch_size": 4, "epoch": 0.9644, "step": 2411, "tokens_per_device": 13136 }, { "epoch": 0.9644, "loss_ce": 0.439874529838562, "loss_lvr": 0.5994458794593811, "loss_mode_switch": 0.0, "loss_total": 0.4998191297054291, "step": 2411 }, { "batch_size": 4, "epoch": 0.9644, "step": 2411, "tokens_per_device": 5440 }, { "epoch": 0.9644, "loss_ce": 0.3683595061302185, "loss_lvr": 0.8023691177368164, "loss_mode_switch": 0.0, "loss_total": 0.44859641790390015, "step": 2411 }, { "batch_size": 4, "epoch": 0.9644, "step": 2411, "tokens_per_device": 4464 }, { "epoch": 0.9644, "loss_ce": 0.0120367631316185, "loss_lvr": 0.7151090502738953, "loss_mode_switch": 0.0, "loss_total": 0.0835476666688919, "step": 2411 }, { "batch_size": 4, "epoch": 0.9644, "step": 2411, "tokens_per_device": 4216 }, { "epoch": 0.9644, "loss_ce": 0.019050979986786842, "loss_lvr": 0.8704453110694885, "loss_mode_switch": 0.0, "loss_total": 0.10609550774097443, "step": 2411 }, { "batch_size": 4, "epoch": 0.9644, "step": 2411, "tokens_per_device": 5088 }, { "epoch": 0.9644, "loss_ce": 0.28208431601524353, "loss_lvr": 1.013245701789856, "loss_mode_switch": 0.0, "loss_total": 0.38340890407562256, "step": 2411 }, { "batch_size": 1, "epoch": 0.9644, "step": 2411, "tokens_per_device": 5111 }, { "epoch": 0.9644, "loss_ce": 0.02079193852841854, "loss_lvr": 0.19736479222774506, "loss_mode_switch": 0.0, "loss_total": 0.04052841663360596, "step": 2411 }, { "epoch": 0.9648, "grad_norm": 1.0726228952407837, "learning_rate": 3.245721004646929e-08, "loss": 0.239, "step": 2412 }, { "batch_size": 4, "epoch": 0.9648, "step": 2412, "tokens_per_device": 5868 }, { "epoch": 0.9648, "loss_ce": 0.34399735927581787, "loss_lvr": 0.8681544065475464, "loss_mode_switch": 0.0, "loss_total": 0.430812805891037, "step": 2412 }, { "batch_size": 1, "epoch": 0.9648, "step": 2412, "tokens_per_device": 4859 }, { "epoch": 0.9648, "loss_ce": 0.002276803832501173, "loss_lvr": 0.32070598006248474, "loss_mode_switch": 0.0, "loss_total": 0.034347403794527054, "step": 2412 }, { "batch_size": 1, "epoch": 0.9648, "step": 2412, "tokens_per_device": 4935 }, { "epoch": 0.9648, "loss_ce": 0.043430477380752563, "loss_lvr": 0.451016902923584, "loss_mode_switch": 0.0, "loss_total": 0.08853216469287872, "step": 2412 }, { "batch_size": 4, "epoch": 0.9648, "step": 2412, "tokens_per_device": 5764 }, { "epoch": 0.9648, "loss_ce": 0.001067343633621931, "loss_lvr": 0.695294976234436, "loss_mode_switch": 0.0, "loss_total": 0.0705968365073204, "step": 2412 }, { "batch_size": 1, "epoch": 0.9648, "step": 2412, "tokens_per_device": 4959 }, { "epoch": 0.9648, "loss_ce": 0.0044076829217374325, "loss_lvr": 0.2772165536880493, "loss_mode_switch": 0.0, "loss_total": 0.03212933987379074, "step": 2412 }, { "batch_size": 4, "epoch": 0.9648, "step": 2412, "tokens_per_device": 6268 }, { "epoch": 0.9648, "loss_ce": 0.20843182504177094, "loss_lvr": 0.8547691702842712, "loss_mode_switch": 0.0, "loss_total": 0.2939087450504303, "step": 2412 }, { "batch_size": 4, "epoch": 0.9648, "step": 2412, "tokens_per_device": 5096 }, { "epoch": 0.9648, "loss_ce": 0.1341744214296341, "loss_lvr": 0.8494147062301636, "loss_mode_switch": 0.0, "loss_total": 0.21911588311195374, "step": 2412 }, { "batch_size": 4, "epoch": 0.9648, "step": 2412, "tokens_per_device": 1860 }, { "epoch": 0.9648, "loss_ce": 0.2862175703048706, "loss_lvr": 0.9264187216758728, "loss_mode_switch": 0.0, "loss_total": 0.3788594603538513, "step": 2412 }, { "epoch": 0.9652, "grad_norm": 1.0792430639266968, "learning_rate": 3.172451408658406e-08, "loss": 0.2306, "step": 2413 }, { "batch_size": 4, "epoch": 0.9652, "step": 2413, "tokens_per_device": 3760 }, { "epoch": 0.9652, "loss_ce": 0.4324522316455841, "loss_lvr": 0.9542908668518066, "loss_mode_switch": 0.0, "loss_total": 0.5278813242912292, "step": 2413 }, { "batch_size": 4, "epoch": 0.9652, "step": 2413, "tokens_per_device": 3764 }, { "epoch": 0.9652, "loss_ce": 0.14030279219150543, "loss_lvr": 0.9575446248054504, "loss_mode_switch": 0.0, "loss_total": 0.23605725169181824, "step": 2413 }, { "batch_size": 4, "epoch": 0.9652, "step": 2413, "tokens_per_device": 4324 }, { "epoch": 0.9652, "loss_ce": 0.0005184212350286543, "loss_lvr": 0.6183869242668152, "loss_mode_switch": 0.0, "loss_total": 0.06235711649060249, "step": 2413 }, { "batch_size": 1, "epoch": 0.9652, "step": 2413, "tokens_per_device": 4874 }, { "epoch": 0.9652, "loss_ce": 0.04713365063071251, "loss_lvr": 0.32290610671043396, "loss_mode_switch": 0.0, "loss_total": 0.07942426204681396, "step": 2413 }, { "batch_size": 4, "epoch": 0.9652, "step": 2413, "tokens_per_device": 7904 }, { "epoch": 0.9652, "loss_ce": 0.11485951393842697, "loss_lvr": 0.8593064546585083, "loss_mode_switch": 0.0, "loss_total": 0.2007901668548584, "step": 2413 }, { "batch_size": 1, "epoch": 0.9652, "step": 2413, "tokens_per_device": 4753 }, { "epoch": 0.9652, "loss_ce": 0.014435365796089172, "loss_lvr": 0.3492501974105835, "loss_mode_switch": 0.0, "loss_total": 0.04936038702726364, "step": 2413 }, { "batch_size": 1, "epoch": 0.9652, "step": 2413, "tokens_per_device": 5178 }, { "epoch": 0.9652, "loss_ce": 0.001133596058934927, "loss_lvr": 0.5580218434333801, "loss_mode_switch": 0.0, "loss_total": 0.05693577975034714, "step": 2413 }, { "batch_size": 4, "epoch": 0.9652, "step": 2413, "tokens_per_device": 3804 }, { "epoch": 0.9652, "loss_ce": 0.23043140769004822, "loss_lvr": 0.8999209403991699, "loss_mode_switch": 0.0, "loss_total": 0.32042351365089417, "step": 2413 }, { "epoch": 0.9656, "grad_norm": 1.1592905521392822, "learning_rate": 3.100015651024524e-08, "loss": 0.2278, "step": 2414 }, { "batch_size": 4, "epoch": 0.9656, "step": 2414, "tokens_per_device": 1564 }, { "epoch": 0.9656, "loss_ce": 0.12213878333568573, "loss_lvr": 0.8056328296661377, "loss_mode_switch": 0.0, "loss_total": 0.20270207524299622, "step": 2414 }, { "batch_size": 1, "epoch": 0.9656, "step": 2414, "tokens_per_device": 4956 }, { "epoch": 0.9656, "loss_ce": 0.0003129865217488259, "loss_lvr": 0.42523863911628723, "loss_mode_switch": 0.0, "loss_total": 0.04283685237169266, "step": 2414 }, { "batch_size": 1, "epoch": 0.9656, "step": 2414, "tokens_per_device": 4888 }, { "epoch": 0.9656, "loss_ce": 0.004375271499156952, "loss_lvr": 0.766240656375885, "loss_mode_switch": 0.0, "loss_total": 0.08099933713674545, "step": 2414 }, { "batch_size": 4, "epoch": 0.9656, "step": 2414, "tokens_per_device": 3808 }, { "epoch": 0.9656, "loss_ce": 0.2000989019870758, "loss_lvr": 0.7871435284614563, "loss_mode_switch": 0.0, "loss_total": 0.2788132429122925, "step": 2414 }, { "batch_size": 4, "epoch": 0.9656, "step": 2414, "tokens_per_device": 4224 }, { "epoch": 0.9656, "loss_ce": 0.07175992429256439, "loss_lvr": 0.8669499754905701, "loss_mode_switch": 0.0, "loss_total": 0.15845492482185364, "step": 2414 }, { "batch_size": 4, "epoch": 0.9656, "step": 2414, "tokens_per_device": 5844 }, { "epoch": 0.9656, "loss_ce": 0.296346515417099, "loss_lvr": 1.171223521232605, "loss_mode_switch": 0.0, "loss_total": 0.4134688675403595, "step": 2414 }, { "batch_size": 4, "epoch": 0.9656, "step": 2414, "tokens_per_device": 4480 }, { "epoch": 0.9656, "loss_ce": 0.11017054319381714, "loss_lvr": 0.7331718802452087, "loss_mode_switch": 0.0, "loss_total": 0.18348774313926697, "step": 2414 }, { "batch_size": 1, "epoch": 0.9656, "step": 2414, "tokens_per_device": 5100 }, { "epoch": 0.9656, "loss_ce": 0.0033896828535944223, "loss_lvr": 0.3709109425544739, "loss_mode_switch": 0.0, "loss_total": 0.040480777621269226, "step": 2414 }, { "epoch": 0.966, "grad_norm": 1.1665951013565063, "learning_rate": 3.0284138533160924e-08, "loss": 0.2299, "step": 2415 }, { "batch_size": 4, "epoch": 0.966, "step": 2415, "tokens_per_device": 5876 }, { "epoch": 0.966, "loss_ce": 0.040509168058633804, "loss_lvr": 0.6971389651298523, "loss_mode_switch": 0.0, "loss_total": 0.11022306978702545, "step": 2415 }, { "batch_size": 4, "epoch": 0.966, "step": 2415, "tokens_per_device": 3780 }, { "epoch": 0.966, "loss_ce": 0.19786152243614197, "loss_lvr": 0.8028618693351746, "loss_mode_switch": 0.0, "loss_total": 0.27814769744873047, "step": 2415 }, { "batch_size": 4, "epoch": 0.966, "step": 2415, "tokens_per_device": 4828 }, { "epoch": 0.966, "loss_ce": 0.13658636808395386, "loss_lvr": 0.788743257522583, "loss_mode_switch": 0.0, "loss_total": 0.21546068787574768, "step": 2415 }, { "batch_size": 4, "epoch": 0.966, "step": 2415, "tokens_per_device": 1292 }, { "epoch": 0.966, "loss_ce": 0.1942867487668991, "loss_lvr": 1.1646977663040161, "loss_mode_switch": 0.0, "loss_total": 0.31075653433799744, "step": 2415 }, { "batch_size": 1, "epoch": 0.966, "step": 2415, "tokens_per_device": 4890 }, { "epoch": 0.966, "loss_ce": 0.027323706075549126, "loss_lvr": 1.276508092880249, "loss_mode_switch": 0.0, "loss_total": 0.15497452020645142, "step": 2415 }, { "batch_size": 4, "epoch": 0.966, "step": 2415, "tokens_per_device": 5440 }, { "epoch": 0.966, "loss_ce": 0.10143441706895828, "loss_lvr": 0.8964546918869019, "loss_mode_switch": 0.0, "loss_total": 0.19107988476753235, "step": 2415 }, { "batch_size": 4, "epoch": 0.966, "step": 2415, "tokens_per_device": 4244 }, { "epoch": 0.966, "loss_ce": 0.18395878374576569, "loss_lvr": 0.8024905920028687, "loss_mode_switch": 0.0, "loss_total": 0.2642078399658203, "step": 2415 }, { "batch_size": 4, "epoch": 0.966, "step": 2415, "tokens_per_device": 4620 }, { "epoch": 0.966, "loss_ce": 0.10886632651090622, "loss_lvr": 0.728100597858429, "loss_mode_switch": 0.0, "loss_total": 0.18167638778686523, "step": 2415 }, { "epoch": 0.9664, "grad_norm": 1.1727335453033447, "learning_rate": 2.957646135704262e-08, "loss": 0.2419, "step": 2416 }, { "batch_size": 4, "epoch": 0.9664, "step": 2416, "tokens_per_device": 4976 }, { "epoch": 0.9664, "loss_ce": 0.43693095445632935, "loss_lvr": 0.6779173016548157, "loss_mode_switch": 0.0, "loss_total": 0.5047227144241333, "step": 2416 }, { "batch_size": 4, "epoch": 0.9664, "step": 2416, "tokens_per_device": 5320 }, { "epoch": 0.9664, "loss_ce": 0.27356502413749695, "loss_lvr": 0.6559192538261414, "loss_mode_switch": 0.0, "loss_total": 0.33915695548057556, "step": 2416 }, { "batch_size": 4, "epoch": 0.9664, "step": 2416, "tokens_per_device": 4408 }, { "epoch": 0.9664, "loss_ce": 0.08880287408828735, "loss_lvr": 0.46021926403045654, "loss_mode_switch": 0.0, "loss_total": 0.13482479751110077, "step": 2416 }, { "batch_size": 1, "epoch": 0.9664, "step": 2416, "tokens_per_device": 5176 }, { "epoch": 0.9664, "loss_ce": 0.018931211903691292, "loss_lvr": 0.27156996726989746, "loss_mode_switch": 0.0, "loss_total": 0.04608821123838425, "step": 2416 }, { "batch_size": 4, "epoch": 0.9664, "step": 2416, "tokens_per_device": 3756 }, { "epoch": 0.9664, "loss_ce": 0.0273668821901083, "loss_lvr": 0.6160129308700562, "loss_mode_switch": 0.0, "loss_total": 0.0889681726694107, "step": 2416 }, { "batch_size": 1, "epoch": 0.9664, "step": 2416, "tokens_per_device": 7496 }, { "epoch": 0.9664, "loss_ce": 0.0021650928538292646, "loss_lvr": 0.35088077187538147, "loss_mode_switch": 0.0, "loss_total": 0.03725317120552063, "step": 2416 }, { "batch_size": 4, "epoch": 0.9664, "step": 2416, "tokens_per_device": 3884 }, { "epoch": 0.9664, "loss_ce": 0.05554962903261185, "loss_lvr": 0.7975669503211975, "loss_mode_switch": 0.0, "loss_total": 0.13530632853507996, "step": 2416 }, { "batch_size": 4, "epoch": 0.9664, "step": 2416, "tokens_per_device": 2868 }, { "epoch": 0.9664, "loss_ce": 0.029563147574663162, "loss_lvr": 0.5803861618041992, "loss_mode_switch": 0.0, "loss_total": 0.08760176599025726, "step": 2416 }, { "epoch": 0.9668, "grad_norm": 1.0997897386550903, "learning_rate": 2.8877126169602477e-08, "loss": 0.2126, "step": 2417 }, { "batch_size": 4, "epoch": 0.9668, "step": 2417, "tokens_per_device": 4016 }, { "epoch": 0.9668, "loss_ce": 0.37551942467689514, "loss_lvr": 0.7728155851364136, "loss_mode_switch": 0.0, "loss_total": 0.452800989151001, "step": 2417 }, { "batch_size": 4, "epoch": 0.9668, "step": 2417, "tokens_per_device": 4568 }, { "epoch": 0.9668, "loss_ce": 0.3748396933078766, "loss_lvr": 0.7135521769523621, "loss_mode_switch": 0.0, "loss_total": 0.44619491696357727, "step": 2417 }, { "batch_size": 4, "epoch": 0.9668, "step": 2417, "tokens_per_device": 4372 }, { "epoch": 0.9668, "loss_ce": 0.41254663467407227, "loss_lvr": 0.7248440980911255, "loss_mode_switch": 0.0, "loss_total": 0.48503103852272034, "step": 2417 }, { "batch_size": 4, "epoch": 0.9668, "step": 2417, "tokens_per_device": 1400 }, { "epoch": 0.9668, "loss_ce": 0.4258630871772766, "loss_lvr": 2.1285054683685303, "loss_mode_switch": 0.0, "loss_total": 0.6387136578559875, "step": 2417 }, { "batch_size": 4, "epoch": 0.9668, "step": 2417, "tokens_per_device": 5720 }, { "epoch": 0.9668, "loss_ce": 0.08954767137765884, "loss_lvr": 0.740989089012146, "loss_mode_switch": 0.0, "loss_total": 0.16364657878875732, "step": 2417 }, { "batch_size": 4, "epoch": 0.9668, "step": 2417, "tokens_per_device": 4116 }, { "epoch": 0.9668, "loss_ce": 0.18408676981925964, "loss_lvr": 0.6488350629806519, "loss_mode_switch": 0.0, "loss_total": 0.24897027015686035, "step": 2417 }, { "batch_size": 4, "epoch": 0.9668, "step": 2417, "tokens_per_device": 2248 }, { "epoch": 0.9668, "loss_ce": 0.05513736605644226, "loss_lvr": 0.796501100063324, "loss_mode_switch": 0.0, "loss_total": 0.13478747010231018, "step": 2417 }, { "batch_size": 4, "epoch": 0.9668, "step": 2417, "tokens_per_device": 5344 }, { "epoch": 0.9668, "loss_ce": 0.26060670614242554, "loss_lvr": 0.732057511806488, "loss_mode_switch": 0.0, "loss_total": 0.3338124752044678, "step": 2417 }, { "epoch": 0.9672, "grad_norm": 1.2784837484359741, "learning_rate": 2.818613414455218e-08, "loss": 0.1949, "step": 2418 }, { "batch_size": 4, "epoch": 0.9672, "step": 2418, "tokens_per_device": 4300 }, { "epoch": 0.9672, "loss_ce": 0.0625789612531662, "loss_lvr": 0.6525532007217407, "loss_mode_switch": 0.0, "loss_total": 0.127834290266037, "step": 2418 }, { "batch_size": 1, "epoch": 0.9672, "step": 2418, "tokens_per_device": 4859 }, { "epoch": 0.9672, "loss_ce": 0.018221097066998482, "loss_lvr": 0.3518053889274597, "loss_mode_switch": 0.0, "loss_total": 0.053401634097099304, "step": 2418 }, { "batch_size": 4, "epoch": 0.9672, "step": 2418, "tokens_per_device": 3032 }, { "epoch": 0.9672, "loss_ce": 0.1585652381181717, "loss_lvr": 0.9459667205810547, "loss_mode_switch": 0.0, "loss_total": 0.2531619071960449, "step": 2418 }, { "batch_size": 4, "epoch": 0.9672, "step": 2418, "tokens_per_device": 2760 }, { "epoch": 0.9672, "loss_ce": 0.009212140925228596, "loss_lvr": 0.5342804789543152, "loss_mode_switch": 0.0, "loss_total": 0.06264019012451172, "step": 2418 }, { "batch_size": 4, "epoch": 0.9672, "step": 2418, "tokens_per_device": 4572 }, { "epoch": 0.9672, "loss_ce": 0.08736755698919296, "loss_lvr": 0.8465306162834167, "loss_mode_switch": 0.0, "loss_total": 0.17202061414718628, "step": 2418 }, { "batch_size": 4, "epoch": 0.9672, "step": 2418, "tokens_per_device": 5844 }, { "epoch": 0.9672, "loss_ce": 0.029115350916981697, "loss_lvr": 0.7439662218093872, "loss_mode_switch": 0.0, "loss_total": 0.10351197421550751, "step": 2418 }, { "batch_size": 4, "epoch": 0.9672, "step": 2418, "tokens_per_device": 4200 }, { "epoch": 0.9672, "loss_ce": 0.27244967222213745, "loss_lvr": 0.9840676784515381, "loss_mode_switch": 0.0, "loss_total": 0.3708564341068268, "step": 2418 }, { "batch_size": 1, "epoch": 0.9672, "step": 2418, "tokens_per_device": 4891 }, { "epoch": 0.9672, "loss_ce": 0.00024051220680121332, "loss_lvr": 0.2361033409833908, "loss_mode_switch": 0.0, "loss_total": 0.02385084703564644, "step": 2418 }, { "epoch": 0.9676, "grad_norm": 1.161504864692688, "learning_rate": 2.7503486441602388e-08, "loss": 0.2072, "step": 2419 }, { "batch_size": 4, "epoch": 0.9676, "step": 2419, "tokens_per_device": 2752 }, { "epoch": 0.9676, "loss_ce": 0.1347464621067047, "loss_lvr": 0.842253565788269, "loss_mode_switch": 0.0, "loss_total": 0.21897181868553162, "step": 2419 }, { "batch_size": 1, "epoch": 0.9676, "step": 2419, "tokens_per_device": 4864 }, { "epoch": 0.9676, "loss_ce": 0.0512198768556118, "loss_lvr": 0.21592368185520172, "loss_mode_switch": 0.0, "loss_total": 0.07281224429607391, "step": 2419 }, { "batch_size": 4, "epoch": 0.9676, "step": 2419, "tokens_per_device": 6360 }, { "epoch": 0.9676, "loss_ce": 0.6512906551361084, "loss_lvr": 0.8631399869918823, "loss_mode_switch": 0.0, "loss_total": 0.7376046776771545, "step": 2419 }, { "batch_size": 1, "epoch": 0.9676, "step": 2419, "tokens_per_device": 5043 }, { "epoch": 0.9676, "loss_ce": 0.0050325216725468636, "loss_lvr": 0.3155977129936218, "loss_mode_switch": 0.0, "loss_total": 0.03659229353070259, "step": 2419 }, { "batch_size": 4, "epoch": 0.9676, "step": 2419, "tokens_per_device": 3836 }, { "epoch": 0.9676, "loss_ce": 0.27581942081451416, "loss_lvr": 0.7553487420082092, "loss_mode_switch": 0.0, "loss_total": 0.35135430097579956, "step": 2419 }, { "batch_size": 4, "epoch": 0.9676, "step": 2419, "tokens_per_device": 4228 }, { "epoch": 0.9676, "loss_ce": 0.02519439347088337, "loss_lvr": 1.2723530530929565, "loss_mode_switch": 0.0, "loss_total": 0.1524296998977661, "step": 2419 }, { "batch_size": 1, "epoch": 0.9676, "step": 2419, "tokens_per_device": 7513 }, { "epoch": 0.9676, "loss_ce": 0.0003594918816816062, "loss_lvr": 0.31738877296447754, "loss_mode_switch": 0.0, "loss_total": 0.03209836781024933, "step": 2419 }, { "batch_size": 4, "epoch": 0.9676, "step": 2419, "tokens_per_device": 1372 }, { "epoch": 0.9676, "loss_ce": 0.6114901304244995, "loss_lvr": 0.7885805368423462, "loss_mode_switch": 0.0, "loss_total": 0.690348207950592, "step": 2419 }, { "epoch": 0.968, "grad_norm": 1.0561074018478394, "learning_rate": 2.6829184206457194e-08, "loss": 0.2265, "step": 2420 }, { "batch_size": 4, "epoch": 0.968, "step": 2420, "tokens_per_device": 4244 }, { "epoch": 0.968, "loss_ce": 0.0751931294798851, "loss_lvr": 1.0212838649749756, "loss_mode_switch": 0.0, "loss_total": 0.17732152342796326, "step": 2420 }, { "batch_size": 4, "epoch": 0.968, "step": 2420, "tokens_per_device": 1420 }, { "epoch": 0.968, "loss_ce": 0.21865853667259216, "loss_lvr": 0.8948560953140259, "loss_mode_switch": 0.0, "loss_total": 0.30814415216445923, "step": 2420 }, { "batch_size": 4, "epoch": 0.968, "step": 2420, "tokens_per_device": 2056 }, { "epoch": 0.968, "loss_ce": 0.3866880536079407, "loss_lvr": 0.8520300984382629, "loss_mode_switch": 0.0, "loss_total": 0.4718910753726959, "step": 2420 }, { "batch_size": 1, "epoch": 0.968, "step": 2420, "tokens_per_device": 5180 }, { "epoch": 0.968, "loss_ce": 0.005366911645978689, "loss_lvr": 0.6988106966018677, "loss_mode_switch": 0.0, "loss_total": 0.07524798810482025, "step": 2420 }, { "batch_size": 1, "epoch": 0.968, "step": 2420, "tokens_per_device": 5114 }, { "epoch": 0.968, "loss_ce": 0.0004504455719143152, "loss_lvr": 0.3432864546775818, "loss_mode_switch": 0.0, "loss_total": 0.03477909415960312, "step": 2420 }, { "batch_size": 1, "epoch": 0.968, "step": 2420, "tokens_per_device": 5178 }, { "epoch": 0.968, "loss_ce": 0.01910577341914177, "loss_lvr": 0.4416050910949707, "loss_mode_switch": 0.0, "loss_total": 0.06326628476381302, "step": 2420 }, { "batch_size": 4, "epoch": 0.968, "step": 2420, "tokens_per_device": 2628 }, { "epoch": 0.968, "loss_ce": 0.2898057699203491, "loss_lvr": 0.9363280534744263, "loss_mode_switch": 0.0, "loss_total": 0.3834385871887207, "step": 2420 }, { "batch_size": 4, "epoch": 0.968, "step": 2420, "tokens_per_device": 3344 }, { "epoch": 0.968, "loss_ce": 0.03951219841837883, "loss_lvr": 0.8488223552703857, "loss_mode_switch": 0.0, "loss_total": 0.12439443171024323, "step": 2420 }, { "epoch": 0.9684, "grad_norm": 1.694138526916504, "learning_rate": 2.6163228570816324e-08, "loss": 0.2136, "step": 2421 }, { "batch_size": 4, "epoch": 0.9684, "step": 2421, "tokens_per_device": 4068 }, { "epoch": 0.9684, "loss_ce": 0.03522934019565582, "loss_lvr": 0.938166618347168, "loss_mode_switch": 0.0, "loss_total": 0.1290459930896759, "step": 2421 }, { "batch_size": 1, "epoch": 0.9684, "step": 2421, "tokens_per_device": 4887 }, { "epoch": 0.9684, "loss_ce": 0.011202349327504635, "loss_lvr": 0.6151721477508545, "loss_mode_switch": 0.0, "loss_total": 0.07271956652402878, "step": 2421 }, { "batch_size": 1, "epoch": 0.9684, "step": 2421, "tokens_per_device": 5115 }, { "epoch": 0.9684, "loss_ce": 0.011209203861653805, "loss_lvr": 0.34475550055503845, "loss_mode_switch": 0.0, "loss_total": 0.045684754848480225, "step": 2421 }, { "batch_size": 4, "epoch": 0.9684, "step": 2421, "tokens_per_device": 4660 }, { "epoch": 0.9684, "loss_ce": 0.06603405624628067, "loss_lvr": 0.7467820048332214, "loss_mode_switch": 0.0, "loss_total": 0.14071226119995117, "step": 2421 }, { "batch_size": 4, "epoch": 0.9684, "step": 2421, "tokens_per_device": 3800 }, { "epoch": 0.9684, "loss_ce": 0.15582327544689178, "loss_lvr": 0.8909443616867065, "loss_mode_switch": 0.0, "loss_total": 0.24491772055625916, "step": 2421 }, { "batch_size": 4, "epoch": 0.9684, "step": 2421, "tokens_per_device": 1220 }, { "epoch": 0.9684, "loss_ce": 0.10798437148332596, "loss_lvr": 0.9322693347930908, "loss_mode_switch": 0.0, "loss_total": 0.20121130347251892, "step": 2421 }, { "batch_size": 4, "epoch": 0.9684, "step": 2421, "tokens_per_device": 4444 }, { "epoch": 0.9684, "loss_ce": 0.15500758588314056, "loss_lvr": 0.9161103367805481, "loss_mode_switch": 0.0, "loss_total": 0.2466186285018921, "step": 2421 }, { "batch_size": 1, "epoch": 0.9684, "step": 2421, "tokens_per_device": 5117 }, { "epoch": 0.9684, "loss_ce": 0.001446118694730103, "loss_lvr": 0.290172278881073, "loss_mode_switch": 0.0, "loss_total": 0.03046334721148014, "step": 2421 }, { "epoch": 0.9688, "grad_norm": 1.2823587656021118, "learning_rate": 2.5505620652369058e-08, "loss": 0.204, "step": 2422 }, { "batch_size": 4, "epoch": 0.9688, "step": 2422, "tokens_per_device": 3792 }, { "epoch": 0.9688, "loss_ce": 0.0751880332827568, "loss_lvr": 0.6726081967353821, "loss_mode_switch": 0.0, "loss_total": 0.14244885742664337, "step": 2422 }, { "batch_size": 4, "epoch": 0.9688, "step": 2422, "tokens_per_device": 4980 }, { "epoch": 0.9688, "loss_ce": 0.18469683825969696, "loss_lvr": 0.8351776599884033, "loss_mode_switch": 0.0, "loss_total": 0.268214613199234, "step": 2422 }, { "batch_size": 4, "epoch": 0.9688, "step": 2422, "tokens_per_device": 1764 }, { "epoch": 0.9688, "loss_ce": 0.10076840221881866, "loss_lvr": 0.9446423053741455, "loss_mode_switch": 0.0, "loss_total": 0.19523262977600098, "step": 2422 }, { "batch_size": 1, "epoch": 0.9688, "step": 2422, "tokens_per_device": 5094 }, { "epoch": 0.9688, "loss_ce": 0.005079919472336769, "loss_lvr": 0.6512800455093384, "loss_mode_switch": 0.0, "loss_total": 0.07020792365074158, "step": 2422 }, { "batch_size": 1, "epoch": 0.9688, "step": 2422, "tokens_per_device": 4856 }, { "epoch": 0.9688, "loss_ce": 0.0015431659994646907, "loss_lvr": 0.25546330213546753, "loss_mode_switch": 0.0, "loss_total": 0.027089497074484825, "step": 2422 }, { "batch_size": 4, "epoch": 0.9688, "step": 2422, "tokens_per_device": 4708 }, { "epoch": 0.9688, "loss_ce": 0.1539432555437088, "loss_lvr": 0.5779390931129456, "loss_mode_switch": 0.0, "loss_total": 0.21173717081546783, "step": 2422 }, { "batch_size": 4, "epoch": 0.9688, "step": 2422, "tokens_per_device": 4328 }, { "epoch": 0.9688, "loss_ce": 0.15924085676670074, "loss_lvr": 0.9507623910903931, "loss_mode_switch": 0.0, "loss_total": 0.25431710481643677, "step": 2422 }, { "batch_size": 4, "epoch": 0.9688, "step": 2422, "tokens_per_device": 3376 }, { "epoch": 0.9688, "loss_ce": 0.24156728386878967, "loss_lvr": 1.0103141069412231, "loss_mode_switch": 0.0, "loss_total": 0.34259870648384094, "step": 2422 }, { "epoch": 0.9692, "grad_norm": 1.3235406875610352, "learning_rate": 2.4856361554795318e-08, "loss": 0.2265, "step": 2423 }, { "batch_size": 4, "epoch": 0.9692, "step": 2423, "tokens_per_device": 3816 }, { "epoch": 0.9692, "loss_ce": 0.13920080661773682, "loss_lvr": 0.9489666819572449, "loss_mode_switch": 0.0, "loss_total": 0.23409748077392578, "step": 2423 }, { "batch_size": 4, "epoch": 0.9692, "step": 2423, "tokens_per_device": 4356 }, { "epoch": 0.9692, "loss_ce": 0.08193561434745789, "loss_lvr": 0.7004633545875549, "loss_mode_switch": 0.0, "loss_total": 0.15198194980621338, "step": 2423 }, { "batch_size": 1, "epoch": 0.9692, "step": 2423, "tokens_per_device": 5113 }, { "epoch": 0.9692, "loss_ce": 0.011819247156381607, "loss_lvr": 0.2426428645849228, "loss_mode_switch": 0.0, "loss_total": 0.036083534359931946, "step": 2423 }, { "batch_size": 1, "epoch": 0.9692, "step": 2423, "tokens_per_device": 5021 }, { "epoch": 0.9692, "loss_ce": 0.013770855963230133, "loss_lvr": 0.30754703283309937, "loss_mode_switch": 0.0, "loss_total": 0.04452555999159813, "step": 2423 }, { "batch_size": 4, "epoch": 0.9692, "step": 2423, "tokens_per_device": 2620 }, { "epoch": 0.9692, "loss_ce": 0.4723721742630005, "loss_lvr": 0.7988530397415161, "loss_mode_switch": 0.0, "loss_total": 0.5522574782371521, "step": 2423 }, { "batch_size": 4, "epoch": 0.9692, "step": 2423, "tokens_per_device": 11004 }, { "epoch": 0.9692, "loss_ce": 0.07067308574914932, "loss_lvr": 0.3941729664802551, "loss_mode_switch": 0.0, "loss_total": 0.11009038239717484, "step": 2423 }, { "batch_size": 4, "epoch": 0.9692, "step": 2423, "tokens_per_device": 5504 }, { "epoch": 0.9692, "loss_ce": 0.002462196396663785, "loss_lvr": 0.7710077166557312, "loss_mode_switch": 0.0, "loss_total": 0.0795629695057869, "step": 2423 }, { "batch_size": 4, "epoch": 0.9692, "step": 2423, "tokens_per_device": 4480 }, { "epoch": 0.9692, "loss_ce": 0.07195572555065155, "loss_lvr": 0.8446134328842163, "loss_mode_switch": 0.0, "loss_total": 0.15641707181930542, "step": 2423 }, { "epoch": 0.9696, "grad_norm": 0.977063775062561, "learning_rate": 2.421545236776457e-08, "loss": 0.194, "step": 2424 }, { "batch_size": 4, "epoch": 0.9696, "step": 2424, "tokens_per_device": 1316 }, { "epoch": 0.9696, "loss_ce": 0.10690243542194366, "loss_lvr": 1.0724186897277832, "loss_mode_switch": 0.0, "loss_total": 0.21414430439472198, "step": 2424 }, { "batch_size": 4, "epoch": 0.9696, "step": 2424, "tokens_per_device": 4036 }, { "epoch": 0.9696, "loss_ce": 0.1823512762784958, "loss_lvr": 0.43534666299819946, "loss_mode_switch": 0.0, "loss_total": 0.22588594257831573, "step": 2424 }, { "batch_size": 4, "epoch": 0.9696, "step": 2424, "tokens_per_device": 4200 }, { "epoch": 0.9696, "loss_ce": 0.013717968948185444, "loss_lvr": 0.7406589984893799, "loss_mode_switch": 0.0, "loss_total": 0.08778387308120728, "step": 2424 }, { "batch_size": 4, "epoch": 0.9696, "step": 2424, "tokens_per_device": 5420 }, { "epoch": 0.9696, "loss_ce": 0.01608574576675892, "loss_lvr": 0.6381859183311462, "loss_mode_switch": 0.0, "loss_total": 0.07990434020757675, "step": 2424 }, { "batch_size": 4, "epoch": 0.9696, "step": 2424, "tokens_per_device": 4916 }, { "epoch": 0.9696, "loss_ce": 0.487362802028656, "loss_lvr": 0.8930187225341797, "loss_mode_switch": 0.0, "loss_total": 0.5766646862030029, "step": 2424 }, { "batch_size": 1, "epoch": 0.9696, "step": 2424, "tokens_per_device": 5283 }, { "epoch": 0.9696, "loss_ce": 9.616108582122251e-05, "loss_lvr": 0.3269403576850891, "loss_mode_switch": 0.0, "loss_total": 0.03279019892215729, "step": 2424 }, { "batch_size": 4, "epoch": 0.9696, "step": 2424, "tokens_per_device": 6620 }, { "epoch": 0.9696, "loss_ce": 0.06856236606836319, "loss_lvr": 0.8010926842689514, "loss_mode_switch": 0.0, "loss_total": 0.14867162704467773, "step": 2424 }, { "batch_size": 1, "epoch": 0.9696, "step": 2424, "tokens_per_device": 5153 }, { "epoch": 0.9696, "loss_ce": 0.005661404225975275, "loss_lvr": 0.26478928327560425, "loss_mode_switch": 0.0, "loss_total": 0.0321403332054615, "step": 2424 }, { "epoch": 0.97, "grad_norm": 1.120639443397522, "learning_rate": 2.358289416693027e-08, "loss": 0.2208, "step": 2425 }, { "batch_size": 4, "epoch": 0.97, "step": 2425, "tokens_per_device": 14412 }, { "epoch": 0.97, "loss_ce": 0.0505390502512455, "loss_lvr": 0.867929220199585, "loss_mode_switch": 0.0, "loss_total": 0.1373319774866104, "step": 2425 }, { "batch_size": 4, "epoch": 0.97, "step": 2425, "tokens_per_device": 4460 }, { "epoch": 0.97, "loss_ce": 0.4093515872955322, "loss_lvr": 0.8501890301704407, "loss_mode_switch": 0.0, "loss_total": 0.4943704903125763, "step": 2425 }, { "batch_size": 4, "epoch": 0.97, "step": 2425, "tokens_per_device": 4212 }, { "epoch": 0.97, "loss_ce": 0.042039573192596436, "loss_lvr": 0.7926368713378906, "loss_mode_switch": 0.0, "loss_total": 0.1213032603263855, "step": 2425 }, { "batch_size": 1, "epoch": 0.97, "step": 2425, "tokens_per_device": 4892 }, { "epoch": 0.97, "loss_ce": 0.03035782091319561, "loss_lvr": 0.36837223172187805, "loss_mode_switch": 0.0, "loss_total": 0.06719504296779633, "step": 2425 }, { "batch_size": 4, "epoch": 0.97, "step": 2425, "tokens_per_device": 5580 }, { "epoch": 0.97, "loss_ce": 0.16550925374031067, "loss_lvr": 0.761862576007843, "loss_mode_switch": 0.0, "loss_total": 0.24169552326202393, "step": 2425 }, { "batch_size": 4, "epoch": 0.97, "step": 2425, "tokens_per_device": 5424 }, { "epoch": 0.97, "loss_ce": 0.09435582906007767, "loss_lvr": 0.7469949722290039, "loss_mode_switch": 0.0, "loss_total": 0.16905532777309418, "step": 2425 }, { "batch_size": 1, "epoch": 0.97, "step": 2425, "tokens_per_device": 5124 }, { "epoch": 0.97, "loss_ce": 0.04747949168086052, "loss_lvr": 0.4589569866657257, "loss_mode_switch": 0.0, "loss_total": 0.09337519109249115, "step": 2425 }, { "batch_size": 4, "epoch": 0.97, "step": 2425, "tokens_per_device": 1256 }, { "epoch": 0.97, "loss_ce": 0.1154574602842331, "loss_lvr": 1.1007304191589355, "loss_mode_switch": 0.0, "loss_total": 0.2255305051803589, "step": 2425 }, { "epoch": 0.9704, "grad_norm": 1.1724416017532349, "learning_rate": 2.2958688013930973e-08, "loss": 0.2448, "step": 2426 }, { "batch_size": 1, "epoch": 0.9704, "step": 2426, "tokens_per_device": 5229 }, { "epoch": 0.9704, "loss_ce": 0.08236807584762573, "loss_lvr": 0.31483450531959534, "loss_mode_switch": 0.0, "loss_total": 0.11385153234004974, "step": 2426 }, { "batch_size": 4, "epoch": 0.9704, "step": 2426, "tokens_per_device": 2664 }, { "epoch": 0.9704, "loss_ce": 0.3751463294029236, "loss_lvr": 0.8877059817314148, "loss_mode_switch": 0.0, "loss_total": 0.46391692757606506, "step": 2426 }, { "batch_size": 1, "epoch": 0.9704, "step": 2426, "tokens_per_device": 5187 }, { "epoch": 0.9704, "loss_ce": 0.00011185684707015753, "loss_lvr": 0.3208976686000824, "loss_mode_switch": 0.0, "loss_total": 0.0322016216814518, "step": 2426 }, { "batch_size": 4, "epoch": 0.9704, "step": 2426, "tokens_per_device": 5120 }, { "epoch": 0.9704, "loss_ce": 0.30405303835868835, "loss_lvr": 0.7127335071563721, "loss_mode_switch": 0.0, "loss_total": 0.37532639503479004, "step": 2426 }, { "batch_size": 4, "epoch": 0.9704, "step": 2426, "tokens_per_device": 3020 }, { "epoch": 0.9704, "loss_ce": 0.11867912858724594, "loss_lvr": 0.7654435038566589, "loss_mode_switch": 0.0, "loss_total": 0.19522348046302795, "step": 2426 }, { "batch_size": 4, "epoch": 0.9704, "step": 2426, "tokens_per_device": 4216 }, { "epoch": 0.9704, "loss_ce": 0.037930119782686234, "loss_lvr": 0.9099195599555969, "loss_mode_switch": 0.0, "loss_total": 0.12892207503318787, "step": 2426 }, { "batch_size": 1, "epoch": 0.9704, "step": 2426, "tokens_per_device": 4948 }, { "epoch": 0.9704, "loss_ce": 0.04006608948111534, "loss_lvr": 0.22830802202224731, "loss_mode_switch": 0.0, "loss_total": 0.06289689242839813, "step": 2426 }, { "batch_size": 4, "epoch": 0.9704, "step": 2426, "tokens_per_device": 4776 }, { "epoch": 0.9704, "loss_ce": 0.20862510800361633, "loss_lvr": 0.833304762840271, "loss_mode_switch": 0.0, "loss_total": 0.2919555902481079, "step": 2426 }, { "epoch": 0.9708, "grad_norm": 1.1814231872558594, "learning_rate": 2.234283495638756e-08, "loss": 0.2101, "step": 2427 }, { "batch_size": 4, "epoch": 0.9708, "step": 2427, "tokens_per_device": 4812 }, { "epoch": 0.9708, "loss_ce": 0.2834889888763428, "loss_lvr": 0.9672335386276245, "loss_mode_switch": 0.0, "loss_total": 0.38021233677864075, "step": 2427 }, { "batch_size": 4, "epoch": 0.9708, "step": 2427, "tokens_per_device": 13760 }, { "epoch": 0.9708, "loss_ce": 0.0659591555595398, "loss_lvr": 0.6134755611419678, "loss_mode_switch": 0.0, "loss_total": 0.1273067146539688, "step": 2427 }, { "batch_size": 4, "epoch": 0.9708, "step": 2427, "tokens_per_device": 1360 }, { "epoch": 0.9708, "loss_ce": 0.08629682660102844, "loss_lvr": 1.0580936670303345, "loss_mode_switch": 0.0, "loss_total": 0.1921061873435974, "step": 2427 }, { "batch_size": 4, "epoch": 0.9708, "step": 2427, "tokens_per_device": 1632 }, { "epoch": 0.9708, "loss_ce": 0.40301889181137085, "loss_lvr": 1.0218452215194702, "loss_mode_switch": 0.0, "loss_total": 0.5052034258842468, "step": 2427 }, { "batch_size": 4, "epoch": 0.9708, "step": 2427, "tokens_per_device": 5204 }, { "epoch": 0.9708, "loss_ce": 0.05863538384437561, "loss_lvr": 0.7521675229072571, "loss_mode_switch": 0.0, "loss_total": 0.13385213911533356, "step": 2427 }, { "batch_size": 4, "epoch": 0.9708, "step": 2427, "tokens_per_device": 3708 }, { "epoch": 0.9708, "loss_ce": 0.017905743792653084, "loss_lvr": 1.050415277481079, "loss_mode_switch": 0.0, "loss_total": 0.12294726818799973, "step": 2427 }, { "batch_size": 4, "epoch": 0.9708, "step": 2427, "tokens_per_device": 4984 }, { "epoch": 0.9708, "loss_ce": 0.14326049387454987, "loss_lvr": 0.6691383123397827, "loss_mode_switch": 0.0, "loss_total": 0.2101743221282959, "step": 2427 }, { "batch_size": 1, "epoch": 0.9708, "step": 2427, "tokens_per_device": 4864 }, { "epoch": 0.9708, "loss_ce": 0.04486581310629845, "loss_lvr": 0.3915848731994629, "loss_mode_switch": 0.0, "loss_total": 0.08402430266141891, "step": 2427 }, { "epoch": 0.9712, "grad_norm": 0.9222203493118286, "learning_rate": 2.173533602790212e-08, "loss": 0.1721, "step": 2428 }, { "batch_size": 1, "epoch": 0.9712, "step": 2428, "tokens_per_device": 7548 }, { "epoch": 0.9712, "loss_ce": 0.00025308632757514715, "loss_lvr": 0.24623791873455048, "loss_mode_switch": 0.0, "loss_total": 0.024876879528164864, "step": 2428 }, { "batch_size": 4, "epoch": 0.9712, "step": 2428, "tokens_per_device": 9072 }, { "epoch": 0.9712, "loss_ce": 0.025446511805057526, "loss_lvr": 0.7517271041870117, "loss_mode_switch": 0.0, "loss_total": 0.10061922669410706, "step": 2428 }, { "batch_size": 4, "epoch": 0.9712, "step": 2428, "tokens_per_device": 1504 }, { "epoch": 0.9712, "loss_ce": 0.13357694447040558, "loss_lvr": 1.0375486612319946, "loss_mode_switch": 0.0, "loss_total": 0.2373318076133728, "step": 2428 }, { "batch_size": 1, "epoch": 0.9712, "step": 2428, "tokens_per_device": 5055 }, { "epoch": 0.9712, "loss_ce": 0.05448610708117485, "loss_lvr": 0.27754443883895874, "loss_mode_switch": 0.0, "loss_total": 0.08224055171012878, "step": 2428 }, { "batch_size": 1, "epoch": 0.9712, "step": 2428, "tokens_per_device": 4882 }, { "epoch": 0.9712, "loss_ce": 0.027890941128134727, "loss_lvr": 0.5637905597686768, "loss_mode_switch": 0.0, "loss_total": 0.08427000045776367, "step": 2428 }, { "batch_size": 1, "epoch": 0.9712, "step": 2428, "tokens_per_device": 4735 }, { "epoch": 0.9712, "loss_ce": 0.012065472081303596, "loss_lvr": 0.24901413917541504, "loss_mode_switch": 0.0, "loss_total": 0.03696688637137413, "step": 2428 }, { "batch_size": 4, "epoch": 0.9712, "step": 2428, "tokens_per_device": 1712 }, { "epoch": 0.9712, "loss_ce": 0.2921726107597351, "loss_lvr": 0.8876915574073792, "loss_mode_switch": 0.0, "loss_total": 0.380941778421402, "step": 2428 }, { "batch_size": 4, "epoch": 0.9712, "step": 2428, "tokens_per_device": 3768 }, { "epoch": 0.9712, "loss_ce": 0.3137132525444031, "loss_lvr": 0.9056240916252136, "loss_mode_switch": 0.0, "loss_total": 0.40427565574645996, "step": 2428 }, { "epoch": 0.9716, "grad_norm": 1.119066596031189, "learning_rate": 2.1136192248056298e-08, "loss": 0.2179, "step": 2429 }, { "batch_size": 4, "epoch": 0.9716, "step": 2429, "tokens_per_device": 4276 }, { "epoch": 0.9716, "loss_ce": 0.2526727020740509, "loss_lvr": 0.9623770117759705, "loss_mode_switch": 0.0, "loss_total": 0.348910391330719, "step": 2429 }, { "batch_size": 4, "epoch": 0.9716, "step": 2429, "tokens_per_device": 4440 }, { "epoch": 0.9716, "loss_ce": 0.18555736541748047, "loss_lvr": 0.6989564895629883, "loss_mode_switch": 0.0, "loss_total": 0.2554530203342438, "step": 2429 }, { "batch_size": 4, "epoch": 0.9716, "step": 2429, "tokens_per_device": 1880 }, { "epoch": 0.9716, "loss_ce": 0.14219731092453003, "loss_lvr": 0.9765639901161194, "loss_mode_switch": 0.0, "loss_total": 0.23985370993614197, "step": 2429 }, { "batch_size": 1, "epoch": 0.9716, "step": 2429, "tokens_per_device": 5259 }, { "epoch": 0.9716, "loss_ce": 0.03821884095668793, "loss_lvr": 0.44833311438560486, "loss_mode_switch": 0.0, "loss_total": 0.08305215835571289, "step": 2429 }, { "batch_size": 4, "epoch": 0.9716, "step": 2429, "tokens_per_device": 5760 }, { "epoch": 0.9716, "loss_ce": 0.0024247553665190935, "loss_lvr": 1.1187543869018555, "loss_mode_switch": 0.0, "loss_total": 0.1143001914024353, "step": 2429 }, { "batch_size": 1, "epoch": 0.9716, "step": 2429, "tokens_per_device": 5166 }, { "epoch": 0.9716, "loss_ce": 0.0027019851841032505, "loss_lvr": 0.4489618241786957, "loss_mode_switch": 0.0, "loss_total": 0.04759816825389862, "step": 2429 }, { "batch_size": 4, "epoch": 0.9716, "step": 2429, "tokens_per_device": 7256 }, { "epoch": 0.9716, "loss_ce": 0.2510851323604584, "loss_lvr": 0.4086843430995941, "loss_mode_switch": 0.0, "loss_total": 0.29195356369018555, "step": 2429 }, { "batch_size": 4, "epoch": 0.9716, "step": 2429, "tokens_per_device": 4316 }, { "epoch": 0.9716, "loss_ce": 0.2297104001045227, "loss_lvr": 0.8613318800926208, "loss_mode_switch": 0.0, "loss_total": 0.3158435821533203, "step": 2429 }, { "epoch": 0.972, "grad_norm": 1.1723469495773315, "learning_rate": 2.0545404622407396e-08, "loss": 0.219, "step": 2430 }, { "batch_size": 4, "epoch": 0.972, "step": 2430, "tokens_per_device": 1632 }, { "epoch": 0.972, "loss_ce": 0.40557214617729187, "loss_lvr": 0.9907424449920654, "loss_mode_switch": 0.0, "loss_total": 0.5046464204788208, "step": 2430 }, { "batch_size": 1, "epoch": 0.972, "step": 2430, "tokens_per_device": 4863 }, { "epoch": 0.972, "loss_ce": 0.0002918739046435803, "loss_lvr": 0.2742644250392914, "loss_mode_switch": 0.0, "loss_total": 0.02771831676363945, "step": 2430 }, { "batch_size": 1, "epoch": 0.972, "step": 2430, "tokens_per_device": 5143 }, { "epoch": 0.972, "loss_ce": 0.1287393718957901, "loss_lvr": 0.5903549790382385, "loss_mode_switch": 0.0, "loss_total": 0.1877748668193817, "step": 2430 }, { "batch_size": 1, "epoch": 0.972, "step": 2430, "tokens_per_device": 4896 }, { "epoch": 0.972, "loss_ce": 0.05007947236299515, "loss_lvr": 0.3106350600719452, "loss_mode_switch": 0.0, "loss_total": 0.08114297688007355, "step": 2430 }, { "batch_size": 4, "epoch": 0.972, "step": 2430, "tokens_per_device": 8464 }, { "epoch": 0.972, "loss_ce": 0.02918253093957901, "loss_lvr": 0.712796688079834, "loss_mode_switch": 0.0, "loss_total": 0.10046219825744629, "step": 2430 }, { "batch_size": 4, "epoch": 0.972, "step": 2430, "tokens_per_device": 9056 }, { "epoch": 0.972, "loss_ce": 0.017181193456053734, "loss_lvr": 0.5685335397720337, "loss_mode_switch": 0.0, "loss_total": 0.07403454929590225, "step": 2430 }, { "batch_size": 4, "epoch": 0.972, "step": 2430, "tokens_per_device": 2644 }, { "epoch": 0.972, "loss_ce": 0.12211591005325317, "loss_lvr": 0.6776633858680725, "loss_mode_switch": 0.0, "loss_total": 0.18988224864006042, "step": 2430 }, { "batch_size": 1, "epoch": 0.972, "step": 2430, "tokens_per_device": 4877 }, { "epoch": 0.972, "loss_ce": 0.006608428433537483, "loss_lvr": 0.2752246856689453, "loss_mode_switch": 0.0, "loss_total": 0.034130897372961044, "step": 2430 }, { "epoch": 0.9724, "grad_norm": 1.15715754032135, "learning_rate": 1.9962974142490043e-08, "loss": 0.213, "step": 2431 }, { "batch_size": 4, "epoch": 0.9724, "step": 2431, "tokens_per_device": 2864 }, { "epoch": 0.9724, "loss_ce": 0.2952495813369751, "loss_lvr": 1.8909550905227661, "loss_mode_switch": 0.0, "loss_total": 0.48434507846832275, "step": 2431 }, { "batch_size": 4, "epoch": 0.9724, "step": 2431, "tokens_per_device": 4604 }, { "epoch": 0.9724, "loss_ce": 0.21543702483177185, "loss_lvr": 0.8423528075218201, "loss_mode_switch": 0.0, "loss_total": 0.29967230558395386, "step": 2431 }, { "batch_size": 4, "epoch": 0.9724, "step": 2431, "tokens_per_device": 8812 }, { "epoch": 0.9724, "loss_ce": 0.07540515065193176, "loss_lvr": 0.857779324054718, "loss_mode_switch": 0.0, "loss_total": 0.16118308901786804, "step": 2431 }, { "batch_size": 4, "epoch": 0.9724, "step": 2431, "tokens_per_device": 1296 }, { "epoch": 0.9724, "loss_ce": 0.13237130641937256, "loss_lvr": 1.888208270072937, "loss_mode_switch": 0.0, "loss_total": 0.3211921453475952, "step": 2431 }, { "batch_size": 4, "epoch": 0.9724, "step": 2431, "tokens_per_device": 4268 }, { "epoch": 0.9724, "loss_ce": 0.0034672704059630632, "loss_lvr": 0.36239364743232727, "loss_mode_switch": 0.0, "loss_total": 0.03970663622021675, "step": 2431 }, { "batch_size": 4, "epoch": 0.9724, "step": 2431, "tokens_per_device": 5336 }, { "epoch": 0.9724, "loss_ce": 0.11345213651657104, "loss_lvr": 0.7424823641777039, "loss_mode_switch": 0.0, "loss_total": 0.18770037591457367, "step": 2431 }, { "batch_size": 4, "epoch": 0.9724, "step": 2431, "tokens_per_device": 5372 }, { "epoch": 0.9724, "loss_ce": 0.09168107807636261, "loss_lvr": 0.8306246995925903, "loss_mode_switch": 0.0, "loss_total": 0.17474354803562164, "step": 2431 }, { "batch_size": 4, "epoch": 0.9724, "step": 2431, "tokens_per_device": 2516 }, { "epoch": 0.9724, "loss_ce": 0.18538495898246765, "loss_lvr": 1.0522030591964722, "loss_mode_switch": 0.0, "loss_total": 0.2906052768230438, "step": 2431 }, { "epoch": 0.9728, "grad_norm": 1.0555356740951538, "learning_rate": 1.9388901785811766e-08, "loss": 0.2309, "step": 2432 }, { "batch_size": 1, "epoch": 0.9728, "step": 2432, "tokens_per_device": 7579 }, { "epoch": 0.9728, "loss_ce": 0.013651889748871326, "loss_lvr": 0.3005358874797821, "loss_mode_switch": 0.0, "loss_total": 0.04370547831058502, "step": 2432 }, { "batch_size": 1, "epoch": 0.9728, "step": 2432, "tokens_per_device": 5114 }, { "epoch": 0.9728, "loss_ce": 0.027656644582748413, "loss_lvr": 0.37914925813674927, "loss_mode_switch": 0.0, "loss_total": 0.06557157635688782, "step": 2432 }, { "batch_size": 4, "epoch": 0.9728, "step": 2432, "tokens_per_device": 3948 }, { "epoch": 0.9728, "loss_ce": 0.1930292397737503, "loss_lvr": 1.0285940170288086, "loss_mode_switch": 0.0, "loss_total": 0.29588863253593445, "step": 2432 }, { "batch_size": 4, "epoch": 0.9728, "step": 2432, "tokens_per_device": 4440 }, { "epoch": 0.9728, "loss_ce": 0.02805880270898342, "loss_lvr": 0.4578514099121094, "loss_mode_switch": 0.0, "loss_total": 0.07384394109249115, "step": 2432 }, { "batch_size": 4, "epoch": 0.9728, "step": 2432, "tokens_per_device": 4080 }, { "epoch": 0.9728, "loss_ce": 0.008110660128295422, "loss_lvr": 0.8751980662345886, "loss_mode_switch": 0.0, "loss_total": 0.0956304669380188, "step": 2432 }, { "batch_size": 1, "epoch": 0.9728, "step": 2432, "tokens_per_device": 5160 }, { "epoch": 0.9728, "loss_ce": 0.010484888218343258, "loss_lvr": 0.2673909366130829, "loss_mode_switch": 0.0, "loss_total": 0.03722398355603218, "step": 2432 }, { "batch_size": 4, "epoch": 0.9728, "step": 2432, "tokens_per_device": 4164 }, { "epoch": 0.9728, "loss_ce": 0.26967695355415344, "loss_lvr": 0.954128623008728, "loss_mode_switch": 0.0, "loss_total": 0.3650898337364197, "step": 2432 }, { "batch_size": 1, "epoch": 0.9728, "step": 2432, "tokens_per_device": 4884 }, { "epoch": 0.9728, "loss_ce": 0.006558829918503761, "loss_lvr": 0.5670717358589172, "loss_mode_switch": 0.0, "loss_total": 0.06326600164175034, "step": 2432 }, { "epoch": 0.9732, "grad_norm": 1.0827562808990479, "learning_rate": 1.8823188515852964e-08, "loss": 0.2184, "step": 2433 }, { "batch_size": 4, "epoch": 0.9732, "step": 2433, "tokens_per_device": 4408 }, { "epoch": 0.9732, "loss_ce": 0.033653587102890015, "loss_lvr": 0.6267522573471069, "loss_mode_switch": 0.0, "loss_total": 0.09632881730794907, "step": 2433 }, { "batch_size": 4, "epoch": 0.9732, "step": 2433, "tokens_per_device": 2588 }, { "epoch": 0.9732, "loss_ce": 0.1092502772808075, "loss_lvr": 0.8053665161132812, "loss_mode_switch": 0.0, "loss_total": 0.18978694081306458, "step": 2433 }, { "batch_size": 1, "epoch": 0.9732, "step": 2433, "tokens_per_device": 7334 }, { "epoch": 0.9732, "loss_ce": 0.011104960925877094, "loss_lvr": 0.4325297474861145, "loss_mode_switch": 0.0, "loss_total": 0.05435793474316597, "step": 2433 }, { "batch_size": 1, "epoch": 0.9732, "step": 2433, "tokens_per_device": 5205 }, { "epoch": 0.9732, "loss_ce": 0.00022709943004883826, "loss_lvr": 0.36841046810150146, "loss_mode_switch": 0.0, "loss_total": 0.03706814721226692, "step": 2433 }, { "batch_size": 1, "epoch": 0.9732, "step": 2433, "tokens_per_device": 5047 }, { "epoch": 0.9732, "loss_ce": 0.05498031899333, "loss_lvr": 0.32728734612464905, "loss_mode_switch": 0.0, "loss_total": 0.08770905435085297, "step": 2433 }, { "batch_size": 4, "epoch": 0.9732, "step": 2433, "tokens_per_device": 6052 }, { "epoch": 0.9732, "loss_ce": 0.15523798763751984, "loss_lvr": 1.3023566007614136, "loss_mode_switch": 0.0, "loss_total": 0.28547364473342896, "step": 2433 }, { "batch_size": 1, "epoch": 0.9732, "step": 2433, "tokens_per_device": 5109 }, { "epoch": 0.9732, "loss_ce": 0.00989389419555664, "loss_lvr": 0.3101574182510376, "loss_mode_switch": 0.0, "loss_total": 0.04090963676571846, "step": 2433 }, { "batch_size": 4, "epoch": 0.9732, "step": 2433, "tokens_per_device": 4744 }, { "epoch": 0.9732, "loss_ce": 0.01007546205073595, "loss_lvr": 0.647789478302002, "loss_mode_switch": 0.0, "loss_total": 0.07485441118478775, "step": 2433 }, { "epoch": 0.9736, "grad_norm": 1.040648102760315, "learning_rate": 1.826583528206527e-08, "loss": 0.1965, "step": 2434 }, { "batch_size": 4, "epoch": 0.9736, "step": 2434, "tokens_per_device": 4144 }, { "epoch": 0.9736, "loss_ce": 0.11763890087604523, "loss_lvr": 0.7376124858856201, "loss_mode_switch": 0.0, "loss_total": 0.19140014052391052, "step": 2434 }, { "batch_size": 1, "epoch": 0.9736, "step": 2434, "tokens_per_device": 4732 }, { "epoch": 0.9736, "loss_ce": 0.009843975305557251, "loss_lvr": 0.3049890100955963, "loss_mode_switch": 0.0, "loss_total": 0.04034287482500076, "step": 2434 }, { "batch_size": 4, "epoch": 0.9736, "step": 2434, "tokens_per_device": 5764 }, { "epoch": 0.9736, "loss_ce": 0.17662596702575684, "loss_lvr": 0.7583450675010681, "loss_mode_switch": 0.0, "loss_total": 0.2524604797363281, "step": 2434 }, { "batch_size": 4, "epoch": 0.9736, "step": 2434, "tokens_per_device": 4316 }, { "epoch": 0.9736, "loss_ce": 0.19491417706012726, "loss_lvr": 0.9450028538703918, "loss_mode_switch": 0.0, "loss_total": 0.2894144654273987, "step": 2434 }, { "batch_size": 4, "epoch": 0.9736, "step": 2434, "tokens_per_device": 6372 }, { "epoch": 0.9736, "loss_ce": 0.013910133391618729, "loss_lvr": 0.6863148212432861, "loss_mode_switch": 0.0, "loss_total": 0.08254161477088928, "step": 2434 }, { "batch_size": 4, "epoch": 0.9736, "step": 2434, "tokens_per_device": 4100 }, { "epoch": 0.9736, "loss_ce": 0.08812855184078217, "loss_lvr": 0.6869221925735474, "loss_mode_switch": 0.0, "loss_total": 0.15682077407836914, "step": 2434 }, { "batch_size": 4, "epoch": 0.9736, "step": 2434, "tokens_per_device": 3892 }, { "epoch": 0.9736, "loss_ce": 0.16641449928283691, "loss_lvr": 0.7227444648742676, "loss_mode_switch": 0.0, "loss_total": 0.23868894577026367, "step": 2434 }, { "batch_size": 1, "epoch": 0.9736, "step": 2434, "tokens_per_device": 4893 }, { "epoch": 0.9736, "loss_ce": 0.041782259941101074, "loss_lvr": 0.16550803184509277, "loss_mode_switch": 0.0, "loss_total": 0.05833306163549423, "step": 2434 }, { "epoch": 0.974, "grad_norm": 0.963668704032898, "learning_rate": 1.7716843019867646e-08, "loss": 0.1725, "step": 2435 }, { "batch_size": 1, "epoch": 0.974, "step": 2435, "tokens_per_device": 5190 }, { "epoch": 0.974, "loss_ce": 0.04225368797779083, "loss_lvr": 0.6226431727409363, "loss_mode_switch": 0.0, "loss_total": 0.10451801121234894, "step": 2435 }, { "batch_size": 1, "epoch": 0.974, "step": 2435, "tokens_per_device": 4831 }, { "epoch": 0.974, "loss_ce": 0.00024199009931180626, "loss_lvr": 0.14288628101348877, "loss_mode_switch": 0.0, "loss_total": 0.01453061867505312, "step": 2435 }, { "batch_size": 4, "epoch": 0.974, "step": 2435, "tokens_per_device": 2652 }, { "epoch": 0.974, "loss_ce": 0.18617908656597137, "loss_lvr": 0.6886688470840454, "loss_mode_switch": 0.0, "loss_total": 0.25504598021507263, "step": 2435 }, { "batch_size": 4, "epoch": 0.974, "step": 2435, "tokens_per_device": 1576 }, { "epoch": 0.974, "loss_ce": 0.4395062029361725, "loss_lvr": 1.1332963705062866, "loss_mode_switch": 0.0, "loss_total": 0.5528358221054077, "step": 2435 }, { "batch_size": 4, "epoch": 0.974, "step": 2435, "tokens_per_device": 4280 }, { "epoch": 0.974, "loss_ce": 0.33804112672805786, "loss_lvr": 0.9252861142158508, "loss_mode_switch": 0.0, "loss_total": 0.43056973814964294, "step": 2435 }, { "batch_size": 4, "epoch": 0.974, "step": 2435, "tokens_per_device": 6892 }, { "epoch": 0.974, "loss_ce": 0.13875500857830048, "loss_lvr": 0.8426957726478577, "loss_mode_switch": 0.0, "loss_total": 0.22302457690238953, "step": 2435 }, { "batch_size": 1, "epoch": 0.974, "step": 2435, "tokens_per_device": 5183 }, { "epoch": 0.974, "loss_ce": 0.005023312754929066, "loss_lvr": 0.5899847149848938, "loss_mode_switch": 0.0, "loss_total": 0.06402178853750229, "step": 2435 }, { "batch_size": 4, "epoch": 0.974, "step": 2435, "tokens_per_device": 4328 }, { "epoch": 0.974, "loss_ce": 0.0037555645685642958, "loss_lvr": 0.8053485155105591, "loss_mode_switch": 0.0, "loss_total": 0.08429041504859924, "step": 2435 }, { "epoch": 0.9744, "grad_norm": 1.2399024963378906, "learning_rate": 1.7176212650648616e-08, "loss": 0.2304, "step": 2436 }, { "batch_size": 4, "epoch": 0.9744, "step": 2436, "tokens_per_device": 1380 }, { "epoch": 0.9744, "loss_ce": 0.03890986740589142, "loss_lvr": 1.0287820100784302, "loss_mode_switch": 0.0, "loss_total": 0.1417880654335022, "step": 2436 }, { "batch_size": 1, "epoch": 0.9744, "step": 2436, "tokens_per_device": 4602 }, { "epoch": 0.9744, "loss_ce": 0.018832046538591385, "loss_lvr": 0.5318637490272522, "loss_mode_switch": 0.0, "loss_total": 0.07201842218637466, "step": 2436 }, { "batch_size": 1, "epoch": 0.9744, "step": 2436, "tokens_per_device": 5200 }, { "epoch": 0.9744, "loss_ce": 0.05920103192329407, "loss_lvr": 0.6941041350364685, "loss_mode_switch": 0.0, "loss_total": 0.12861144542694092, "step": 2436 }, { "batch_size": 4, "epoch": 0.9744, "step": 2436, "tokens_per_device": 12584 }, { "epoch": 0.9744, "loss_ce": 0.37362080812454224, "loss_lvr": 0.9582938551902771, "loss_mode_switch": 0.0, "loss_total": 0.4694502055644989, "step": 2436 }, { "batch_size": 4, "epoch": 0.9744, "step": 2436, "tokens_per_device": 3788 }, { "epoch": 0.9744, "loss_ce": 0.32509663701057434, "loss_lvr": 1.0138027667999268, "loss_mode_switch": 0.0, "loss_total": 0.42647692561149597, "step": 2436 }, { "batch_size": 4, "epoch": 0.9744, "step": 2436, "tokens_per_device": 4648 }, { "epoch": 0.9744, "loss_ce": 0.24536307156085968, "loss_lvr": 0.7892736196517944, "loss_mode_switch": 0.0, "loss_total": 0.3242904245853424, "step": 2436 }, { "batch_size": 4, "epoch": 0.9744, "step": 2436, "tokens_per_device": 3772 }, { "epoch": 0.9744, "loss_ce": 0.05751575157046318, "loss_lvr": 0.8032333254814148, "loss_mode_switch": 0.0, "loss_total": 0.13783907890319824, "step": 2436 }, { "batch_size": 4, "epoch": 0.9744, "step": 2436, "tokens_per_device": 2572 }, { "epoch": 0.9744, "loss_ce": 0.24382969737052917, "loss_lvr": 0.8302755355834961, "loss_mode_switch": 0.0, "loss_total": 0.3268572688102722, "step": 2436 }, { "epoch": 0.9748, "grad_norm": 1.0508443117141724, "learning_rate": 1.6643945081761813e-08, "loss": 0.2042, "step": 2437 }, { "batch_size": 1, "epoch": 0.9748, "step": 2437, "tokens_per_device": 5165 }, { "epoch": 0.9748, "loss_ce": 0.01392855029553175, "loss_lvr": 0.24007061123847961, "loss_mode_switch": 0.0, "loss_total": 0.037935610860586166, "step": 2437 }, { "batch_size": 4, "epoch": 0.9748, "step": 2437, "tokens_per_device": 3940 }, { "epoch": 0.9748, "loss_ce": 0.0012210934655740857, "loss_lvr": 0.43323981761932373, "loss_mode_switch": 0.0, "loss_total": 0.04454507678747177, "step": 2437 }, { "batch_size": 4, "epoch": 0.9748, "step": 2437, "tokens_per_device": 8792 }, { "epoch": 0.9748, "loss_ce": 0.20312243700027466, "loss_lvr": 0.6215651035308838, "loss_mode_switch": 0.0, "loss_total": 0.2652789354324341, "step": 2437 }, { "batch_size": 1, "epoch": 0.9748, "step": 2437, "tokens_per_device": 4885 }, { "epoch": 0.9748, "loss_ce": 0.011671674437820911, "loss_lvr": 0.7195847630500793, "loss_mode_switch": 0.0, "loss_total": 0.08363015204668045, "step": 2437 }, { "batch_size": 1, "epoch": 0.9748, "step": 2437, "tokens_per_device": 5125 }, { "epoch": 0.9748, "loss_ce": 0.002037031576037407, "loss_lvr": 0.40385669469833374, "loss_mode_switch": 0.0, "loss_total": 0.04242270439863205, "step": 2437 }, { "batch_size": 4, "epoch": 0.9748, "step": 2437, "tokens_per_device": 5736 }, { "epoch": 0.9748, "loss_ce": 0.13587433099746704, "loss_lvr": 0.8583322763442993, "loss_mode_switch": 0.0, "loss_total": 0.2217075526714325, "step": 2437 }, { "batch_size": 4, "epoch": 0.9748, "step": 2437, "tokens_per_device": 9804 }, { "epoch": 0.9748, "loss_ce": 0.258145809173584, "loss_lvr": 0.7666513919830322, "loss_mode_switch": 0.0, "loss_total": 0.33481094241142273, "step": 2437 }, { "batch_size": 1, "epoch": 0.9748, "step": 2437, "tokens_per_device": 4867 }, { "epoch": 0.9748, "loss_ce": 0.00032964302226901054, "loss_lvr": 0.23894178867340088, "loss_mode_switch": 0.0, "loss_total": 0.024223823100328445, "step": 2437 }, { "epoch": 0.9752, "grad_norm": 1.2435410022735596, "learning_rate": 1.6120041206524885e-08, "loss": 0.2101, "step": 2438 }, { "batch_size": 1, "epoch": 0.9752, "step": 2438, "tokens_per_device": 5732 }, { "epoch": 0.9752, "loss_ce": 0.0003210732829757035, "loss_lvr": 0.29524293541908264, "loss_mode_switch": 0.0, "loss_total": 0.02984536811709404, "step": 2438 }, { "batch_size": 4, "epoch": 0.9752, "step": 2438, "tokens_per_device": 5584 }, { "epoch": 0.9752, "loss_ce": 0.11624973267316818, "loss_lvr": 0.663598358631134, "loss_mode_switch": 0.0, "loss_total": 0.18260957300662994, "step": 2438 }, { "batch_size": 1, "epoch": 0.9752, "step": 2438, "tokens_per_device": 5113 }, { "epoch": 0.9752, "loss_ce": 0.0005910098552703857, "loss_lvr": 0.3298318088054657, "loss_mode_switch": 0.0, "loss_total": 0.033574189990758896, "step": 2438 }, { "batch_size": 1, "epoch": 0.9752, "step": 2438, "tokens_per_device": 4942 }, { "epoch": 0.9752, "loss_ce": 0.06382790952920914, "loss_lvr": 0.2259308248758316, "loss_mode_switch": 0.0, "loss_total": 0.08642099052667618, "step": 2438 }, { "batch_size": 4, "epoch": 0.9752, "step": 2438, "tokens_per_device": 5164 }, { "epoch": 0.9752, "loss_ce": 0.0925518348813057, "loss_lvr": 0.7593980431556702, "loss_mode_switch": 0.0, "loss_total": 0.16849163174629211, "step": 2438 }, { "batch_size": 4, "epoch": 0.9752, "step": 2438, "tokens_per_device": 4932 }, { "epoch": 0.9752, "loss_ce": 0.13169439136981964, "loss_lvr": 0.8650640249252319, "loss_mode_switch": 0.0, "loss_total": 0.21820080280303955, "step": 2438 }, { "batch_size": 4, "epoch": 0.9752, "step": 2438, "tokens_per_device": 2692 }, { "epoch": 0.9752, "loss_ce": 0.15114940702915192, "loss_lvr": 0.7199528217315674, "loss_mode_switch": 0.0, "loss_total": 0.22314468026161194, "step": 2438 }, { "batch_size": 1, "epoch": 0.9752, "step": 2438, "tokens_per_device": 5100 }, { "epoch": 0.9752, "loss_ce": 0.11205035448074341, "loss_lvr": 0.3110741078853607, "loss_mode_switch": 0.0, "loss_total": 0.14315776526927948, "step": 2438 }, { "epoch": 0.9756, "grad_norm": 1.207371711730957, "learning_rate": 1.560450190422058e-08, "loss": 0.2367, "step": 2439 }, { "batch_size": 4, "epoch": 0.9756, "step": 2439, "tokens_per_device": 2676 }, { "epoch": 0.9756, "loss_ce": 0.1598109006881714, "loss_lvr": 0.8800919055938721, "loss_mode_switch": 0.0, "loss_total": 0.24782009422779083, "step": 2439 }, { "batch_size": 4, "epoch": 0.9756, "step": 2439, "tokens_per_device": 1556 }, { "epoch": 0.9756, "loss_ce": 0.37824833393096924, "loss_lvr": 0.8788814544677734, "loss_mode_switch": 0.0, "loss_total": 0.46613648533821106, "step": 2439 }, { "batch_size": 4, "epoch": 0.9756, "step": 2439, "tokens_per_device": 5492 }, { "epoch": 0.9756, "loss_ce": 0.0328793041408062, "loss_lvr": 0.7853514552116394, "loss_mode_switch": 0.0, "loss_total": 0.11141444742679596, "step": 2439 }, { "batch_size": 1, "epoch": 0.9756, "step": 2439, "tokens_per_device": 4840 }, { "epoch": 0.9756, "loss_ce": 0.05704081431031227, "loss_lvr": 0.18065176904201508, "loss_mode_switch": 0.0, "loss_total": 0.07510599493980408, "step": 2439 }, { "batch_size": 4, "epoch": 0.9756, "step": 2439, "tokens_per_device": 1516 }, { "epoch": 0.9756, "loss_ce": 0.36222511529922485, "loss_lvr": 0.7764878869056702, "loss_mode_switch": 0.0, "loss_total": 0.43987390398979187, "step": 2439 }, { "batch_size": 4, "epoch": 0.9756, "step": 2439, "tokens_per_device": 5960 }, { "epoch": 0.9756, "loss_ce": 0.22811219096183777, "loss_lvr": 0.6688228845596313, "loss_mode_switch": 0.0, "loss_total": 0.2949944734573364, "step": 2439 }, { "batch_size": 1, "epoch": 0.9756, "step": 2439, "tokens_per_device": 4867 }, { "epoch": 0.9756, "loss_ce": 0.0009015463292598724, "loss_lvr": 0.324398934841156, "loss_mode_switch": 0.0, "loss_total": 0.03334144130349159, "step": 2439 }, { "batch_size": 1, "epoch": 0.9756, "step": 2439, "tokens_per_device": 7156 }, { "epoch": 0.9756, "loss_ce": 0.0036120382137596607, "loss_lvr": 0.3763144612312317, "loss_mode_switch": 0.0, "loss_total": 0.04124348610639572, "step": 2439 }, { "epoch": 0.976, "grad_norm": 1.1430790424346924, "learning_rate": 1.509732804009012e-08, "loss": 0.2534, "step": 2440 }, { "batch_size": 1, "epoch": 0.976, "step": 2440, "tokens_per_device": 4950 }, { "epoch": 0.976, "loss_ce": 0.060453709214925766, "loss_lvr": 0.5368709564208984, "loss_mode_switch": 0.0, "loss_total": 0.11414080858230591, "step": 2440 }, { "batch_size": 4, "epoch": 0.976, "step": 2440, "tokens_per_device": 3820 }, { "epoch": 0.976, "loss_ce": 0.38614246249198914, "loss_lvr": 1.0642322301864624, "loss_mode_switch": 0.0, "loss_total": 0.49256569147109985, "step": 2440 }, { "batch_size": 4, "epoch": 0.976, "step": 2440, "tokens_per_device": 1796 }, { "epoch": 0.976, "loss_ce": 0.17402148246765137, "loss_lvr": 1.1220208406448364, "loss_mode_switch": 0.0, "loss_total": 0.28622356057167053, "step": 2440 }, { "batch_size": 4, "epoch": 0.976, "step": 2440, "tokens_per_device": 3760 }, { "epoch": 0.976, "loss_ce": 0.16581475734710693, "loss_lvr": 1.1188020706176758, "loss_mode_switch": 0.0, "loss_total": 0.277694970369339, "step": 2440 }, { "batch_size": 1, "epoch": 0.976, "step": 2440, "tokens_per_device": 7743 }, { "epoch": 0.976, "loss_ce": 0.07763472944498062, "loss_lvr": 0.2401721477508545, "loss_mode_switch": 0.0, "loss_total": 0.10165194422006607, "step": 2440 }, { "batch_size": 4, "epoch": 0.976, "step": 2440, "tokens_per_device": 1332 }, { "epoch": 0.976, "loss_ce": 0.32162269949913025, "loss_lvr": 1.4774938821792603, "loss_mode_switch": 0.0, "loss_total": 0.46937209367752075, "step": 2440 }, { "batch_size": 4, "epoch": 0.976, "step": 2440, "tokens_per_device": 5800 }, { "epoch": 0.976, "loss_ce": 0.11595714092254639, "loss_lvr": 0.7383752465248108, "loss_mode_switch": 0.0, "loss_total": 0.189794659614563, "step": 2440 }, { "batch_size": 4, "epoch": 0.976, "step": 2440, "tokens_per_device": 4188 }, { "epoch": 0.976, "loss_ce": 0.1606907993555069, "loss_lvr": 0.8064376711845398, "loss_mode_switch": 0.0, "loss_total": 0.24133455753326416, "step": 2440 }, { "epoch": 0.9764, "grad_norm": 1.1198887825012207, "learning_rate": 1.4598520465337051e-08, "loss": 0.2276, "step": 2441 }, { "batch_size": 4, "epoch": 0.9764, "step": 2441, "tokens_per_device": 4760 }, { "epoch": 0.9764, "loss_ce": 0.05215563252568245, "loss_lvr": 0.8626426458358765, "loss_mode_switch": 0.0, "loss_total": 0.13841989636421204, "step": 2441 }, { "batch_size": 4, "epoch": 0.9764, "step": 2441, "tokens_per_device": 1264 }, { "epoch": 0.9764, "loss_ce": 0.20573821663856506, "loss_lvr": 1.0039197206497192, "loss_mode_switch": 0.0, "loss_total": 0.30613020062446594, "step": 2441 }, { "batch_size": 1, "epoch": 0.9764, "step": 2441, "tokens_per_device": 5003 }, { "epoch": 0.9764, "loss_ce": 0.004948571790009737, "loss_lvr": 0.24558193981647491, "loss_mode_switch": 0.0, "loss_total": 0.02950676530599594, "step": 2441 }, { "batch_size": 4, "epoch": 0.9764, "step": 2441, "tokens_per_device": 1340 }, { "epoch": 0.9764, "loss_ce": 0.20617924630641937, "loss_lvr": 0.940611720085144, "loss_mode_switch": 0.0, "loss_total": 0.3002404272556305, "step": 2441 }, { "batch_size": 4, "epoch": 0.9764, "step": 2441, "tokens_per_device": 4200 }, { "epoch": 0.9764, "loss_ce": 0.06634820997714996, "loss_lvr": 0.8666602969169617, "loss_mode_switch": 0.0, "loss_total": 0.15301424264907837, "step": 2441 }, { "batch_size": 4, "epoch": 0.9764, "step": 2441, "tokens_per_device": 4220 }, { "epoch": 0.9764, "loss_ce": 0.04364263638854027, "loss_lvr": 1.2279021739959717, "loss_mode_switch": 0.0, "loss_total": 0.16643285751342773, "step": 2441 }, { "batch_size": 4, "epoch": 0.9764, "step": 2441, "tokens_per_device": 5068 }, { "epoch": 0.9764, "loss_ce": 0.4166933000087738, "loss_lvr": 0.7269028425216675, "loss_mode_switch": 0.0, "loss_total": 0.4893835783004761, "step": 2441 }, { "batch_size": 4, "epoch": 0.9764, "step": 2441, "tokens_per_device": 4900 }, { "epoch": 0.9764, "loss_ce": 0.2910706698894501, "loss_lvr": 0.842274010181427, "loss_mode_switch": 0.0, "loss_total": 0.37529808282852173, "step": 2441 }, { "epoch": 0.9768, "grad_norm": 0.998002827167511, "learning_rate": 1.4108080017122272e-08, "loss": 0.2184, "step": 2442 }, { "batch_size": 4, "epoch": 0.9768, "step": 2442, "tokens_per_device": 15164 }, { "epoch": 0.9768, "loss_ce": 0.3472819924354553, "loss_lvr": 0.9769667387008667, "loss_mode_switch": 0.0, "loss_total": 0.44497865438461304, "step": 2442 }, { "batch_size": 4, "epoch": 0.9768, "step": 2442, "tokens_per_device": 5204 }, { "epoch": 0.9768, "loss_ce": 0.07884105294942856, "loss_lvr": 0.7495784163475037, "loss_mode_switch": 0.0, "loss_total": 0.1537988930940628, "step": 2442 }, { "batch_size": 4, "epoch": 0.9768, "step": 2442, "tokens_per_device": 5972 }, { "epoch": 0.9768, "loss_ce": 0.08706633746623993, "loss_lvr": 0.7502842545509338, "loss_mode_switch": 0.0, "loss_total": 0.16209477186203003, "step": 2442 }, { "batch_size": 1, "epoch": 0.9768, "step": 2442, "tokens_per_device": 7337 }, { "epoch": 0.9768, "loss_ce": 0.011022216640412807, "loss_lvr": 0.2552737593650818, "loss_mode_switch": 0.0, "loss_total": 0.03654959425330162, "step": 2442 }, { "batch_size": 4, "epoch": 0.9768, "step": 2442, "tokens_per_device": 3780 }, { "epoch": 0.9768, "loss_ce": 0.4118858575820923, "loss_lvr": 1.0743553638458252, "loss_mode_switch": 0.0, "loss_total": 0.5193213820457458, "step": 2442 }, { "batch_size": 4, "epoch": 0.9768, "step": 2442, "tokens_per_device": 1368 }, { "epoch": 0.9768, "loss_ce": 0.49427154660224915, "loss_lvr": 0.9354575276374817, "loss_mode_switch": 0.0, "loss_total": 0.5878173112869263, "step": 2442 }, { "batch_size": 4, "epoch": 0.9768, "step": 2442, "tokens_per_device": 5284 }, { "epoch": 0.9768, "loss_ce": 0.03371722996234894, "loss_lvr": 0.7680210471153259, "loss_mode_switch": 0.0, "loss_total": 0.11051933467388153, "step": 2442 }, { "batch_size": 1, "epoch": 0.9768, "step": 2442, "tokens_per_device": 5120 }, { "epoch": 0.9768, "loss_ce": 0.029362689703702927, "loss_lvr": 0.2102852165699005, "loss_mode_switch": 0.0, "loss_total": 0.05039121210575104, "step": 2442 }, { "epoch": 0.9772, "grad_norm": 1.0547597408294678, "learning_rate": 1.3626007518565686e-08, "loss": 0.202, "step": 2443 }, { "batch_size": 4, "epoch": 0.9772, "step": 2443, "tokens_per_device": 1960 }, { "epoch": 0.9772, "loss_ce": 0.028289826586842537, "loss_lvr": 0.9693053364753723, "loss_mode_switch": 0.0, "loss_total": 0.12522035837173462, "step": 2443 }, { "batch_size": 4, "epoch": 0.9772, "step": 2443, "tokens_per_device": 1280 }, { "epoch": 0.9772, "loss_ce": 0.0508853979408741, "loss_lvr": 1.0980581045150757, "loss_mode_switch": 0.0, "loss_total": 0.16069121658802032, "step": 2443 }, { "batch_size": 1, "epoch": 0.9772, "step": 2443, "tokens_per_device": 4820 }, { "epoch": 0.9772, "loss_ce": 0.011459183879196644, "loss_lvr": 0.4093833565711975, "loss_mode_switch": 0.0, "loss_total": 0.05239751935005188, "step": 2443 }, { "batch_size": 4, "epoch": 0.9772, "step": 2443, "tokens_per_device": 4756 }, { "epoch": 0.9772, "loss_ce": 0.391996830701828, "loss_lvr": 0.7841209769248962, "loss_mode_switch": 0.0, "loss_total": 0.47040891647338867, "step": 2443 }, { "batch_size": 1, "epoch": 0.9772, "step": 2443, "tokens_per_device": 5180 }, { "epoch": 0.9772, "loss_ce": 0.028658561408519745, "loss_lvr": 0.3960556387901306, "loss_mode_switch": 0.0, "loss_total": 0.06826412677764893, "step": 2443 }, { "batch_size": 4, "epoch": 0.9772, "step": 2443, "tokens_per_device": 4232 }, { "epoch": 0.9772, "loss_ce": 0.3387001156806946, "loss_lvr": 0.9448006749153137, "loss_mode_switch": 0.0, "loss_total": 0.43318018317222595, "step": 2443 }, { "batch_size": 4, "epoch": 0.9772, "step": 2443, "tokens_per_device": 2536 }, { "epoch": 0.9772, "loss_ce": 0.07571881264448166, "loss_lvr": 0.8393295407295227, "loss_mode_switch": 0.0, "loss_total": 0.1596517711877823, "step": 2443 }, { "batch_size": 4, "epoch": 0.9772, "step": 2443, "tokens_per_device": 4216 }, { "epoch": 0.9772, "loss_ce": 0.07498273998498917, "loss_lvr": 0.8643636107444763, "loss_mode_switch": 0.0, "loss_total": 0.1614190936088562, "step": 2443 }, { "epoch": 0.9776, "grad_norm": 0.9545931220054626, "learning_rate": 1.3152303778740661e-08, "loss": 0.1899, "step": 2444 }, { "batch_size": 1, "epoch": 0.9776, "step": 2444, "tokens_per_device": 5026 }, { "epoch": 0.9776, "loss_ce": 0.0017191278748214245, "loss_lvr": 1.3253130912780762, "loss_mode_switch": 0.0, "loss_total": 0.1342504471540451, "step": 2444 }, { "batch_size": 4, "epoch": 0.9776, "step": 2444, "tokens_per_device": 1832 }, { "epoch": 0.9776, "loss_ce": 0.49191927909851074, "loss_lvr": 0.9764329195022583, "loss_mode_switch": 0.0, "loss_total": 0.5895625948905945, "step": 2444 }, { "batch_size": 1, "epoch": 0.9776, "step": 2444, "tokens_per_device": 4882 }, { "epoch": 0.9776, "loss_ce": 0.0038450430147349834, "loss_lvr": 0.22490271925926208, "loss_mode_switch": 0.0, "loss_total": 0.02633531577885151, "step": 2444 }, { "batch_size": 1, "epoch": 0.9776, "step": 2444, "tokens_per_device": 4874 }, { "epoch": 0.9776, "loss_ce": 0.006278811953961849, "loss_lvr": 0.32797670364379883, "loss_mode_switch": 0.0, "loss_total": 0.039076484739780426, "step": 2444 }, { "batch_size": 4, "epoch": 0.9776, "step": 2444, "tokens_per_device": 3212 }, { "epoch": 0.9776, "loss_ce": 0.09848635643720627, "loss_lvr": 0.5999791622161865, "loss_mode_switch": 0.0, "loss_total": 0.15848428010940552, "step": 2444 }, { "batch_size": 1, "epoch": 0.9776, "step": 2444, "tokens_per_device": 4923 }, { "epoch": 0.9776, "loss_ce": 0.05354725569486618, "loss_lvr": 0.2111622840166092, "loss_mode_switch": 0.0, "loss_total": 0.07466348260641098, "step": 2444 }, { "batch_size": 4, "epoch": 0.9776, "step": 2444, "tokens_per_device": 4252 }, { "epoch": 0.9776, "loss_ce": 0.11099646240472794, "loss_lvr": 0.96601402759552, "loss_mode_switch": 0.0, "loss_total": 0.20759786665439606, "step": 2444 }, { "batch_size": 4, "epoch": 0.9776, "step": 2444, "tokens_per_device": 2584 }, { "epoch": 0.9776, "loss_ce": 0.3598199784755707, "loss_lvr": 0.8525238633155823, "loss_mode_switch": 0.0, "loss_total": 0.44507235288619995, "step": 2444 }, { "epoch": 0.978, "grad_norm": 1.0077439546585083, "learning_rate": 1.268696959267679e-08, "loss": 0.2051, "step": 2445 }, { "batch_size": 4, "epoch": 0.978, "step": 2445, "tokens_per_device": 2712 }, { "epoch": 0.978, "loss_ce": 0.1404978632926941, "loss_lvr": 0.7178878784179688, "loss_mode_switch": 0.0, "loss_total": 0.21228665113449097, "step": 2445 }, { "batch_size": 4, "epoch": 0.978, "step": 2445, "tokens_per_device": 4616 }, { "epoch": 0.978, "loss_ce": 0.11955756694078445, "loss_lvr": 0.8409724831581116, "loss_mode_switch": 0.0, "loss_total": 0.20365482568740845, "step": 2445 }, { "batch_size": 4, "epoch": 0.978, "step": 2445, "tokens_per_device": 3860 }, { "epoch": 0.978, "loss_ce": 0.1409710943698883, "loss_lvr": 0.7282062768936157, "loss_mode_switch": 0.0, "loss_total": 0.21379172801971436, "step": 2445 }, { "batch_size": 1, "epoch": 0.978, "step": 2445, "tokens_per_device": 5112 }, { "epoch": 0.978, "loss_ce": 0.0002840621455106884, "loss_lvr": 0.35866767168045044, "loss_mode_switch": 0.0, "loss_total": 0.03615082800388336, "step": 2445 }, { "batch_size": 4, "epoch": 0.978, "step": 2445, "tokens_per_device": 6160 }, { "epoch": 0.978, "loss_ce": 0.21095865964889526, "loss_lvr": 0.711073637008667, "loss_mode_switch": 0.0, "loss_total": 0.2820660173892975, "step": 2445 }, { "batch_size": 4, "epoch": 0.978, "step": 2445, "tokens_per_device": 3792 }, { "epoch": 0.978, "loss_ce": 0.0024318904615938663, "loss_lvr": 0.9485936760902405, "loss_mode_switch": 0.0, "loss_total": 0.0972912609577179, "step": 2445 }, { "batch_size": 4, "epoch": 0.978, "step": 2445, "tokens_per_device": 1224 }, { "epoch": 0.978, "loss_ce": 0.3078364133834839, "loss_lvr": 1.231052041053772, "loss_mode_switch": 0.0, "loss_total": 0.4309416115283966, "step": 2445 }, { "batch_size": 4, "epoch": 0.978, "step": 2445, "tokens_per_device": 5736 }, { "epoch": 0.978, "loss_ce": 0.08026223629713058, "loss_lvr": 0.6948437094688416, "loss_mode_switch": 0.0, "loss_total": 0.1497466117143631, "step": 2445 }, { "epoch": 0.9784, "grad_norm": 1.1014618873596191, "learning_rate": 1.2230005741356577e-08, "loss": 0.2186, "step": 2446 }, { "batch_size": 1, "epoch": 0.9784, "step": 2446, "tokens_per_device": 5100 }, { "epoch": 0.9784, "loss_ce": 0.0003120990586467087, "loss_lvr": 0.651874840259552, "loss_mode_switch": 0.0, "loss_total": 0.06549958139657974, "step": 2446 }, { "batch_size": 4, "epoch": 0.9784, "step": 2446, "tokens_per_device": 6268 }, { "epoch": 0.9784, "loss_ce": 0.039741192013025284, "loss_lvr": 0.6858621835708618, "loss_mode_switch": 0.0, "loss_total": 0.10832741856575012, "step": 2446 }, { "batch_size": 4, "epoch": 0.9784, "step": 2446, "tokens_per_device": 1436 }, { "epoch": 0.9784, "loss_ce": 0.14996735751628876, "loss_lvr": 1.098331332206726, "loss_mode_switch": 0.0, "loss_total": 0.2598004937171936, "step": 2446 }, { "batch_size": 4, "epoch": 0.9784, "step": 2446, "tokens_per_device": 4104 }, { "epoch": 0.9784, "loss_ce": 0.010634019039571285, "loss_lvr": 1.139021873474121, "loss_mode_switch": 0.0, "loss_total": 0.12453620880842209, "step": 2446 }, { "batch_size": 4, "epoch": 0.9784, "step": 2446, "tokens_per_device": 5128 }, { "epoch": 0.9784, "loss_ce": 0.03977867215871811, "loss_lvr": 0.9884989261627197, "loss_mode_switch": 0.0, "loss_total": 0.13862857222557068, "step": 2446 }, { "batch_size": 4, "epoch": 0.9784, "step": 2446, "tokens_per_device": 4700 }, { "epoch": 0.9784, "loss_ce": 0.13193681836128235, "loss_lvr": 0.738854169845581, "loss_mode_switch": 0.0, "loss_total": 0.20582222938537598, "step": 2446 }, { "batch_size": 4, "epoch": 0.9784, "step": 2446, "tokens_per_device": 4304 }, { "epoch": 0.9784, "loss_ce": 0.18875132501125336, "loss_lvr": 1.1540366411209106, "loss_mode_switch": 0.0, "loss_total": 0.30415499210357666, "step": 2446 }, { "batch_size": 4, "epoch": 0.9784, "step": 2446, "tokens_per_device": 2724 }, { "epoch": 0.9784, "loss_ce": 0.26616451144218445, "loss_lvr": 0.8723438382148743, "loss_mode_switch": 0.0, "loss_total": 0.3533988893032074, "step": 2446 }, { "epoch": 0.9788, "grad_norm": 1.1849337816238403, "learning_rate": 1.1781412991713759e-08, "loss": 0.2018, "step": 2447 }, { "batch_size": 1, "epoch": 0.9788, "step": 2447, "tokens_per_device": 4986 }, { "epoch": 0.9788, "loss_ce": 0.006126503925770521, "loss_lvr": 0.3095296025276184, "loss_mode_switch": 0.0, "loss_total": 0.03707946464419365, "step": 2447 }, { "batch_size": 4, "epoch": 0.9788, "step": 2447, "tokens_per_device": 9388 }, { "epoch": 0.9788, "loss_ce": 0.010771426372230053, "loss_lvr": 0.5944948792457581, "loss_mode_switch": 0.0, "loss_total": 0.07022091746330261, "step": 2447 }, { "batch_size": 4, "epoch": 0.9788, "step": 2447, "tokens_per_device": 6168 }, { "epoch": 0.9788, "loss_ce": 0.01191437803208828, "loss_lvr": 0.7249335646629333, "loss_mode_switch": 0.0, "loss_total": 0.084407739341259, "step": 2447 }, { "batch_size": 4, "epoch": 0.9788, "step": 2447, "tokens_per_device": 4348 }, { "epoch": 0.9788, "loss_ce": 0.11776956915855408, "loss_lvr": 0.7364183664321899, "loss_mode_switch": 0.0, "loss_total": 0.19141140580177307, "step": 2447 }, { "batch_size": 4, "epoch": 0.9788, "step": 2447, "tokens_per_device": 2624 }, { "epoch": 0.9788, "loss_ce": 0.3621431887149811, "loss_lvr": 0.9308631420135498, "loss_mode_switch": 0.0, "loss_total": 0.4552295207977295, "step": 2447 }, { "batch_size": 4, "epoch": 0.9788, "step": 2447, "tokens_per_device": 4240 }, { "epoch": 0.9788, "loss_ce": 0.0628286823630333, "loss_lvr": 0.7464935779571533, "loss_mode_switch": 0.0, "loss_total": 0.1374780386686325, "step": 2447 }, { "batch_size": 4, "epoch": 0.9788, "step": 2447, "tokens_per_device": 4028 }, { "epoch": 0.9788, "loss_ce": 0.19357064366340637, "loss_lvr": 1.0110502243041992, "loss_mode_switch": 0.0, "loss_total": 0.29467567801475525, "step": 2447 }, { "batch_size": 1, "epoch": 0.9788, "step": 2447, "tokens_per_device": 4905 }, { "epoch": 0.9788, "loss_ce": 0.005564040504395962, "loss_lvr": 0.41705024242401123, "loss_mode_switch": 0.0, "loss_total": 0.0472690649330616, "step": 2447 }, { "epoch": 0.9792, "grad_norm": 1.8601548671722412, "learning_rate": 1.1341192096633313e-08, "loss": 0.1981, "step": 2448 }, { "batch_size": 4, "epoch": 0.9792, "step": 2448, "tokens_per_device": 2940 }, { "epoch": 0.9792, "loss_ce": 0.10591468214988708, "loss_lvr": 0.7084720730781555, "loss_mode_switch": 0.0, "loss_total": 0.17676189541816711, "step": 2448 }, { "batch_size": 4, "epoch": 0.9792, "step": 2448, "tokens_per_device": 4400 }, { "epoch": 0.9792, "loss_ce": 0.06967834383249283, "loss_lvr": 0.7333943247795105, "loss_mode_switch": 0.0, "loss_total": 0.14301776885986328, "step": 2448 }, { "batch_size": 4, "epoch": 0.9792, "step": 2448, "tokens_per_device": 3948 }, { "epoch": 0.9792, "loss_ce": 0.24515412747859955, "loss_lvr": 0.7610567212104797, "loss_mode_switch": 0.0, "loss_total": 0.3212597966194153, "step": 2448 }, { "batch_size": 4, "epoch": 0.9792, "step": 2448, "tokens_per_device": 5912 }, { "epoch": 0.9792, "loss_ce": 0.016506804153323174, "loss_lvr": 0.6834084391593933, "loss_mode_switch": 0.0, "loss_total": 0.08484765142202377, "step": 2448 }, { "batch_size": 1, "epoch": 0.9792, "step": 2448, "tokens_per_device": 5205 }, { "epoch": 0.9792, "loss_ce": 0.0006595414597541094, "loss_lvr": 0.6508958339691162, "loss_mode_switch": 0.0, "loss_total": 0.06574912369251251, "step": 2448 }, { "batch_size": 4, "epoch": 0.9792, "step": 2448, "tokens_per_device": 1616 }, { "epoch": 0.9792, "loss_ce": 0.5595118999481201, "loss_lvr": 0.8556904792785645, "loss_mode_switch": 0.0, "loss_total": 0.6450809240341187, "step": 2448 }, { "batch_size": 4, "epoch": 0.9792, "step": 2448, "tokens_per_device": 3784 }, { "epoch": 0.9792, "loss_ce": 0.20156657695770264, "loss_lvr": 0.9079409837722778, "loss_mode_switch": 0.0, "loss_total": 0.29236066341400146, "step": 2448 }, { "batch_size": 4, "epoch": 0.9792, "step": 2448, "tokens_per_device": 1404 }, { "epoch": 0.9792, "loss_ce": 0.30062294006347656, "loss_lvr": 1.047793984413147, "loss_mode_switch": 0.0, "loss_total": 0.4054023325443268, "step": 2448 }, { "epoch": 0.9796, "grad_norm": 1.140021800994873, "learning_rate": 1.0909343794948124e-08, "loss": 0.2377, "step": 2449 }, { "batch_size": 4, "epoch": 0.9796, "step": 2449, "tokens_per_device": 4920 }, { "epoch": 0.9796, "loss_ce": 0.0736083984375, "loss_lvr": 0.6847377419471741, "loss_mode_switch": 0.0, "loss_total": 0.14208218455314636, "step": 2449 }, { "batch_size": 1, "epoch": 0.9796, "step": 2449, "tokens_per_device": 4892 }, { "epoch": 0.9796, "loss_ce": 0.0021028409246355295, "loss_lvr": 0.783200740814209, "loss_mode_switch": 0.0, "loss_total": 0.08042292296886444, "step": 2449 }, { "batch_size": 1, "epoch": 0.9796, "step": 2449, "tokens_per_device": 5191 }, { "epoch": 0.9796, "loss_ce": 0.0722479447722435, "loss_lvr": 0.22343496978282928, "loss_mode_switch": 0.0, "loss_total": 0.09459143877029419, "step": 2449 }, { "batch_size": 4, "epoch": 0.9796, "step": 2449, "tokens_per_device": 7636 }, { "epoch": 0.9796, "loss_ce": 0.048686642199754715, "loss_lvr": 0.6676684021949768, "loss_mode_switch": 0.0, "loss_total": 0.11545348167419434, "step": 2449 }, { "batch_size": 4, "epoch": 0.9796, "step": 2449, "tokens_per_device": 4000 }, { "epoch": 0.9796, "loss_ce": 0.12833428382873535, "loss_lvr": 0.9382044672966003, "loss_mode_switch": 0.0, "loss_total": 0.22215473651885986, "step": 2449 }, { "batch_size": 4, "epoch": 0.9796, "step": 2449, "tokens_per_device": 1572 }, { "epoch": 0.9796, "loss_ce": 0.48895570635795593, "loss_lvr": 0.8183223605155945, "loss_mode_switch": 0.0, "loss_total": 0.5707879662513733, "step": 2449 }, { "batch_size": 4, "epoch": 0.9796, "step": 2449, "tokens_per_device": 3896 }, { "epoch": 0.9796, "loss_ce": 0.20105530321598053, "loss_lvr": 0.7746628522872925, "loss_mode_switch": 0.0, "loss_total": 0.2785215973854065, "step": 2449 }, { "batch_size": 1, "epoch": 0.9796, "step": 2449, "tokens_per_device": 4882 }, { "epoch": 0.9796, "loss_ce": 0.042018141597509384, "loss_lvr": 0.1977398693561554, "loss_mode_switch": 0.0, "loss_total": 0.061792127788066864, "step": 2449 }, { "epoch": 0.98, "grad_norm": 1.0337815284729004, "learning_rate": 1.0485868811441757e-08, "loss": 0.182, "step": 2450 }, { "batch_size": 4, "epoch": 0.98, "step": 2450, "tokens_per_device": 2644 }, { "epoch": 0.98, "loss_ce": 0.33515921235084534, "loss_lvr": 0.8605730533599854, "loss_mode_switch": 0.0, "loss_total": 0.42121651768684387, "step": 2450 }, { "batch_size": 4, "epoch": 0.98, "step": 2450, "tokens_per_device": 4512 }, { "epoch": 0.98, "loss_ce": 0.026516510173678398, "loss_lvr": 0.9147728681564331, "loss_mode_switch": 0.0, "loss_total": 0.1179938018321991, "step": 2450 }, { "batch_size": 4, "epoch": 0.98, "step": 2450, "tokens_per_device": 1912 }, { "epoch": 0.98, "loss_ce": 0.1355605125427246, "loss_lvr": 1.6103492975234985, "loss_mode_switch": 0.0, "loss_total": 0.2965954542160034, "step": 2450 }, { "batch_size": 1, "epoch": 0.98, "step": 2450, "tokens_per_device": 4990 }, { "epoch": 0.98, "loss_ce": 0.0008552675717510283, "loss_lvr": 0.4287867546081543, "loss_mode_switch": 0.0, "loss_total": 0.04373394325375557, "step": 2450 }, { "batch_size": 1, "epoch": 0.98, "step": 2450, "tokens_per_device": 5145 }, { "epoch": 0.98, "loss_ce": 0.001975573366507888, "loss_lvr": 0.3386370539665222, "loss_mode_switch": 0.0, "loss_total": 0.035839278250932693, "step": 2450 }, { "batch_size": 4, "epoch": 0.98, "step": 2450, "tokens_per_device": 4220 }, { "epoch": 0.98, "loss_ce": 0.008560094982385635, "loss_lvr": 1.217136263847351, "loss_mode_switch": 0.0, "loss_total": 0.1302737295627594, "step": 2450 }, { "batch_size": 4, "epoch": 0.98, "step": 2450, "tokens_per_device": 3484 }, { "epoch": 0.98, "loss_ce": 0.1773233562707901, "loss_lvr": 0.5040435194969177, "loss_mode_switch": 0.0, "loss_total": 0.2277277112007141, "step": 2450 }, { "batch_size": 1, "epoch": 0.98, "step": 2450, "tokens_per_device": 6083 }, { "epoch": 0.98, "loss_ce": 0.0003074992273468524, "loss_lvr": 0.24644315242767334, "loss_mode_switch": 0.0, "loss_total": 0.024951813742518425, "step": 2450 }, { "epoch": 0.9804, "grad_norm": 1.1090244054794312, "learning_rate": 1.00707678568418e-08, "loss": 0.2006, "step": 2451 }, { "batch_size": 4, "epoch": 0.9804, "step": 2451, "tokens_per_device": 8368 }, { "epoch": 0.9804, "loss_ce": 0.14175069332122803, "loss_lvr": 0.6284641027450562, "loss_mode_switch": 0.0, "loss_total": 0.2045971155166626, "step": 2451 }, { "batch_size": 4, "epoch": 0.9804, "step": 2451, "tokens_per_device": 5320 }, { "epoch": 0.9804, "loss_ce": 0.40999555587768555, "loss_lvr": 0.7927238941192627, "loss_mode_switch": 0.0, "loss_total": 0.4892679452896118, "step": 2451 }, { "batch_size": 1, "epoch": 0.9804, "step": 2451, "tokens_per_device": 4831 }, { "epoch": 0.9804, "loss_ce": 0.00042774443863891065, "loss_lvr": 0.349247008562088, "loss_mode_switch": 0.0, "loss_total": 0.035352446138858795, "step": 2451 }, { "batch_size": 4, "epoch": 0.9804, "step": 2451, "tokens_per_device": 1564 }, { "epoch": 0.9804, "loss_ce": 0.4448049068450928, "loss_lvr": 0.7800353765487671, "loss_mode_switch": 0.0, "loss_total": 0.5228084325790405, "step": 2451 }, { "batch_size": 1, "epoch": 0.9804, "step": 2451, "tokens_per_device": 5129 }, { "epoch": 0.9804, "loss_ce": 0.02232593484222889, "loss_lvr": 0.4817846119403839, "loss_mode_switch": 0.0, "loss_total": 0.07050439715385437, "step": 2451 }, { "batch_size": 1, "epoch": 0.9804, "step": 2451, "tokens_per_device": 4900 }, { "epoch": 0.9804, "loss_ce": 0.06448202580213547, "loss_lvr": 0.7232875823974609, "loss_mode_switch": 0.0, "loss_total": 0.1368107795715332, "step": 2451 }, { "batch_size": 4, "epoch": 0.9804, "step": 2451, "tokens_per_device": 3196 }, { "epoch": 0.9804, "loss_ce": 0.02138485200703144, "loss_lvr": 0.9313854575157166, "loss_mode_switch": 0.0, "loss_total": 0.11452339589595795, "step": 2451 }, { "batch_size": 1, "epoch": 0.9804, "step": 2451, "tokens_per_device": 4494 }, { "epoch": 0.9804, "loss_ce": 0.37909135222435, "loss_lvr": 0.3179325759410858, "loss_mode_switch": 0.0, "loss_total": 0.4108846187591553, "step": 2451 }, { "epoch": 0.9808, "grad_norm": 0.9837851524353027, "learning_rate": 9.66404162782375e-09, "loss": 0.1981, "step": 2452 }, { "batch_size": 4, "epoch": 0.9808, "step": 2452, "tokens_per_device": 1508 }, { "epoch": 0.9808, "loss_ce": 0.3731902837753296, "loss_lvr": 0.8487698435783386, "loss_mode_switch": 0.0, "loss_total": 0.45806726813316345, "step": 2452 }, { "batch_size": 1, "epoch": 0.9808, "step": 2452, "tokens_per_device": 4874 }, { "epoch": 0.9808, "loss_ce": 0.00017830425349529833, "loss_lvr": 0.2599951922893524, "loss_mode_switch": 0.0, "loss_total": 0.026177823543548584, "step": 2452 }, { "batch_size": 4, "epoch": 0.9808, "step": 2452, "tokens_per_device": 4880 }, { "epoch": 0.9808, "loss_ce": 0.3656280040740967, "loss_lvr": 0.7497385144233704, "loss_mode_switch": 0.0, "loss_total": 0.4406018555164337, "step": 2452 }, { "batch_size": 1, "epoch": 0.9808, "step": 2452, "tokens_per_device": 4887 }, { "epoch": 0.9808, "loss_ce": 0.0024537169374525547, "loss_lvr": 0.21006932854652405, "loss_mode_switch": 0.0, "loss_total": 0.023460648953914642, "step": 2452 }, { "batch_size": 4, "epoch": 0.9808, "step": 2452, "tokens_per_device": 1440 }, { "epoch": 0.9808, "loss_ce": 0.16989527642726898, "loss_lvr": 0.9777064323425293, "loss_mode_switch": 0.0, "loss_total": 0.26766592264175415, "step": 2452 }, { "batch_size": 1, "epoch": 0.9808, "step": 2452, "tokens_per_device": 5201 }, { "epoch": 0.9808, "loss_ce": 0.019760861992836, "loss_lvr": 0.36145833134651184, "loss_mode_switch": 0.0, "loss_total": 0.05590669438242912, "step": 2452 }, { "batch_size": 4, "epoch": 0.9808, "step": 2452, "tokens_per_device": 4324 }, { "epoch": 0.9808, "loss_ce": 0.004387763328850269, "loss_lvr": 0.8923023343086243, "loss_mode_switch": 0.0, "loss_total": 0.0936179980635643, "step": 2452 }, { "batch_size": 4, "epoch": 0.9808, "step": 2452, "tokens_per_device": 11020 }, { "epoch": 0.9808, "loss_ce": 0.18277239799499512, "loss_lvr": 0.6928685903549194, "loss_mode_switch": 0.0, "loss_total": 0.2520592510700226, "step": 2452 }, { "epoch": 0.9812, "grad_norm": 1.0360184907913208, "learning_rate": 9.265690807006566e-09, "loss": 0.1768, "step": 2453 }, { "batch_size": 1, "epoch": 0.9812, "step": 2453, "tokens_per_device": 4893 }, { "epoch": 0.9812, "loss_ce": 0.05207562446594238, "loss_lvr": 0.2786332666873932, "loss_mode_switch": 0.0, "loss_total": 0.07993894815444946, "step": 2453 }, { "batch_size": 4, "epoch": 0.9812, "step": 2453, "tokens_per_device": 6184 }, { "epoch": 0.9812, "loss_ce": 0.1264065057039261, "loss_lvr": 0.8352529406547546, "loss_mode_switch": 0.0, "loss_total": 0.20993179082870483, "step": 2453 }, { "batch_size": 1, "epoch": 0.9812, "step": 2453, "tokens_per_device": 4852 }, { "epoch": 0.9812, "loss_ce": 0.0018768785521388054, "loss_lvr": 0.2220045030117035, "loss_mode_switch": 0.0, "loss_total": 0.02407732978463173, "step": 2453 }, { "batch_size": 1, "epoch": 0.9812, "step": 2453, "tokens_per_device": 5131 }, { "epoch": 0.9812, "loss_ce": 0.03740497678518295, "loss_lvr": 0.34840646386146545, "loss_mode_switch": 0.0, "loss_total": 0.07224562764167786, "step": 2453 }, { "batch_size": 4, "epoch": 0.9812, "step": 2453, "tokens_per_device": 3864 }, { "epoch": 0.9812, "loss_ce": 0.09517266601324081, "loss_lvr": 0.7784592509269714, "loss_mode_switch": 0.0, "loss_total": 0.17301858961582184, "step": 2453 }, { "batch_size": 4, "epoch": 0.9812, "step": 2453, "tokens_per_device": 3892 }, { "epoch": 0.9812, "loss_ce": 0.21213148534297943, "loss_lvr": 0.9046309590339661, "loss_mode_switch": 0.0, "loss_total": 0.3025945723056793, "step": 2453 }, { "batch_size": 1, "epoch": 0.9812, "step": 2453, "tokens_per_device": 5140 }, { "epoch": 0.9812, "loss_ce": 0.03616790473461151, "loss_lvr": 0.368590384721756, "loss_mode_switch": 0.0, "loss_total": 0.07302694022655487, "step": 2453 }, { "batch_size": 1, "epoch": 0.9812, "step": 2453, "tokens_per_device": 4614 }, { "epoch": 0.9812, "loss_ce": 0.0073800766840577126, "loss_lvr": 0.2959926128387451, "loss_mode_switch": 0.0, "loss_total": 0.03697934001684189, "step": 2453 }, { "epoch": 0.9816, "grad_norm": 1.1823770999908447, "learning_rate": 8.875716062951566e-09, "loss": 0.2246, "step": 2454 }, { "batch_size": 1, "epoch": 0.9816, "step": 2454, "tokens_per_device": 5132 }, { "epoch": 0.9816, "loss_ce": 0.005916332360357046, "loss_lvr": 0.5534337759017944, "loss_mode_switch": 0.0, "loss_total": 0.06125970929861069, "step": 2454 }, { "batch_size": 1, "epoch": 0.9816, "step": 2454, "tokens_per_device": 5135 }, { "epoch": 0.9816, "loss_ce": 0.0214103814214468, "loss_lvr": 0.3852296769618988, "loss_mode_switch": 0.0, "loss_total": 0.05993334949016571, "step": 2454 }, { "batch_size": 4, "epoch": 0.9816, "step": 2454, "tokens_per_device": 1328 }, { "epoch": 0.9816, "loss_ce": 0.08510056138038635, "loss_lvr": 0.9618570804595947, "loss_mode_switch": 0.0, "loss_total": 0.1812862753868103, "step": 2454 }, { "batch_size": 4, "epoch": 0.9816, "step": 2454, "tokens_per_device": 6328 }, { "epoch": 0.9816, "loss_ce": 0.28301364183425903, "loss_lvr": 0.7042282819747925, "loss_mode_switch": 0.0, "loss_total": 0.3534364700317383, "step": 2454 }, { "batch_size": 1, "epoch": 0.9816, "step": 2454, "tokens_per_device": 7589 }, { "epoch": 0.9816, "loss_ce": 0.0007388045778498054, "loss_lvr": 0.26654547452926636, "loss_mode_switch": 0.0, "loss_total": 0.02739335224032402, "step": 2454 }, { "batch_size": 1, "epoch": 0.9816, "step": 2454, "tokens_per_device": 5152 }, { "epoch": 0.9816, "loss_ce": 0.0002659612218849361, "loss_lvr": 0.3758491277694702, "loss_mode_switch": 0.0, "loss_total": 0.03785087168216705, "step": 2454 }, { "batch_size": 4, "epoch": 0.9816, "step": 2454, "tokens_per_device": 4068 }, { "epoch": 0.9816, "loss_ce": 0.1784747987985611, "loss_lvr": 0.7315322160720825, "loss_mode_switch": 0.0, "loss_total": 0.25162801146507263, "step": 2454 }, { "batch_size": 1, "epoch": 0.9816, "step": 2454, "tokens_per_device": 4973 }, { "epoch": 0.9816, "loss_ce": 0.013225766830146313, "loss_lvr": 0.6073916554450989, "loss_mode_switch": 0.0, "loss_total": 0.0739649310708046, "step": 2454 }, { "epoch": 0.982, "grad_norm": 1.0128804445266724, "learning_rate": 8.494118050164646e-09, "loss": 0.1906, "step": 2455 }, { "batch_size": 4, "epoch": 0.982, "step": 2455, "tokens_per_device": 4188 }, { "epoch": 0.982, "loss_ce": 0.24202492833137512, "loss_lvr": 1.071921706199646, "loss_mode_switch": 0.0, "loss_total": 0.34921711683273315, "step": 2455 }, { "batch_size": 4, "epoch": 0.982, "step": 2455, "tokens_per_device": 6080 }, { "epoch": 0.982, "loss_ce": 0.3542172908782959, "loss_lvr": 0.6862971782684326, "loss_mode_switch": 0.0, "loss_total": 0.4228470027446747, "step": 2455 }, { "batch_size": 4, "epoch": 0.982, "step": 2455, "tokens_per_device": 2956 }, { "epoch": 0.982, "loss_ce": 0.37513819336891174, "loss_lvr": 0.7997609376907349, "loss_mode_switch": 0.0, "loss_total": 0.45511430501937866, "step": 2455 }, { "batch_size": 4, "epoch": 0.982, "step": 2455, "tokens_per_device": 3812 }, { "epoch": 0.982, "loss_ce": 0.22043485939502716, "loss_lvr": 0.8855620622634888, "loss_mode_switch": 0.0, "loss_total": 0.30899107456207275, "step": 2455 }, { "batch_size": 1, "epoch": 0.982, "step": 2455, "tokens_per_device": 4908 }, { "epoch": 0.982, "loss_ce": 0.009949540719389915, "loss_lvr": 0.1563936471939087, "loss_mode_switch": 0.0, "loss_total": 0.025588905438780785, "step": 2455 }, { "batch_size": 4, "epoch": 0.982, "step": 2455, "tokens_per_device": 3768 }, { "epoch": 0.982, "loss_ce": 0.08618488907814026, "loss_lvr": 0.9059126973152161, "loss_mode_switch": 0.0, "loss_total": 0.17677617073059082, "step": 2455 }, { "batch_size": 4, "epoch": 0.982, "step": 2455, "tokens_per_device": 3844 }, { "epoch": 0.982, "loss_ce": 0.0021132039837539196, "loss_lvr": 0.6412432193756104, "loss_mode_switch": 0.0, "loss_total": 0.06623752415180206, "step": 2455 }, { "batch_size": 1, "epoch": 0.982, "step": 2455, "tokens_per_device": 5119 }, { "epoch": 0.982, "loss_ce": 0.08085589855909348, "loss_lvr": 0.31712573766708374, "loss_mode_switch": 0.0, "loss_total": 0.11256846785545349, "step": 2455 }, { "epoch": 0.9824, "grad_norm": 1.0786298513412476, "learning_rate": 8.120897409090166e-09, "loss": 0.2098, "step": 2456 }, { "batch_size": 4, "epoch": 0.9824, "step": 2456, "tokens_per_device": 4052 }, { "epoch": 0.9824, "loss_ce": 0.24817243218421936, "loss_lvr": 0.6788454651832581, "loss_mode_switch": 0.0, "loss_total": 0.3160569667816162, "step": 2456 }, { "batch_size": 4, "epoch": 0.9824, "step": 2456, "tokens_per_device": 9596 }, { "epoch": 0.9824, "loss_ce": 0.3220703601837158, "loss_lvr": 0.6632493734359741, "loss_mode_switch": 0.0, "loss_total": 0.3883953094482422, "step": 2456 }, { "batch_size": 4, "epoch": 0.9824, "step": 2456, "tokens_per_device": 3768 }, { "epoch": 0.9824, "loss_ce": 0.26937463879585266, "loss_lvr": 0.9143866896629333, "loss_mode_switch": 0.0, "loss_total": 0.36081331968307495, "step": 2456 }, { "batch_size": 4, "epoch": 0.9824, "step": 2456, "tokens_per_device": 1548 }, { "epoch": 0.9824, "loss_ce": 0.4312753677368164, "loss_lvr": 0.7248625159263611, "loss_mode_switch": 0.0, "loss_total": 0.5037616491317749, "step": 2456 }, { "batch_size": 4, "epoch": 0.9824, "step": 2456, "tokens_per_device": 4484 }, { "epoch": 0.9824, "loss_ce": 0.14010807871818542, "loss_lvr": 0.8942880034446716, "loss_mode_switch": 0.0, "loss_total": 0.22953689098358154, "step": 2456 }, { "batch_size": 4, "epoch": 0.9824, "step": 2456, "tokens_per_device": 4428 }, { "epoch": 0.9824, "loss_ce": 0.12362675368785858, "loss_lvr": 0.7212195992469788, "loss_mode_switch": 0.0, "loss_total": 0.1957487165927887, "step": 2456 }, { "batch_size": 1, "epoch": 0.9824, "step": 2456, "tokens_per_device": 4630 }, { "epoch": 0.9824, "loss_ce": 0.007153376471251249, "loss_lvr": 0.4131063222885132, "loss_mode_switch": 0.0, "loss_total": 0.048464011400938034, "step": 2456 }, { "batch_size": 4, "epoch": 0.9824, "step": 2456, "tokens_per_device": 6084 }, { "epoch": 0.9824, "loss_ce": 0.04340271279215813, "loss_lvr": 0.6955089569091797, "loss_mode_switch": 0.0, "loss_total": 0.11295360326766968, "step": 2456 }, { "epoch": 0.9828, "grad_norm": 1.0232949256896973, "learning_rate": 7.756054766114852e-09, "loss": 0.2367, "step": 2457 }, { "batch_size": 1, "epoch": 0.9828, "step": 2457, "tokens_per_device": 5016 }, { "epoch": 0.9828, "loss_ce": 0.004537501838058233, "loss_lvr": 0.2734537720680237, "loss_mode_switch": 0.0, "loss_total": 0.0318828783929348, "step": 2457 }, { "batch_size": 1, "epoch": 0.9828, "step": 2457, "tokens_per_device": 5175 }, { "epoch": 0.9828, "loss_ce": 0.05369983986020088, "loss_lvr": 0.3084932267665863, "loss_mode_switch": 0.0, "loss_total": 0.08454915881156921, "step": 2457 }, { "batch_size": 4, "epoch": 0.9828, "step": 2457, "tokens_per_device": 3436 }, { "epoch": 0.9828, "loss_ce": 0.09407283365726471, "loss_lvr": 0.8174335956573486, "loss_mode_switch": 0.0, "loss_total": 0.17581619322299957, "step": 2457 }, { "batch_size": 4, "epoch": 0.9828, "step": 2457, "tokens_per_device": 10800 }, { "epoch": 0.9828, "loss_ce": 0.22446992993354797, "loss_lvr": 0.6937432289123535, "loss_mode_switch": 0.0, "loss_total": 0.2938442528247833, "step": 2457 }, { "batch_size": 4, "epoch": 0.9828, "step": 2457, "tokens_per_device": 4772 }, { "epoch": 0.9828, "loss_ce": 0.0400492399930954, "loss_lvr": 0.7319654226303101, "loss_mode_switch": 0.0, "loss_total": 0.11324578523635864, "step": 2457 }, { "batch_size": 4, "epoch": 0.9828, "step": 2457, "tokens_per_device": 1336 }, { "epoch": 0.9828, "loss_ce": 0.6725617051124573, "loss_lvr": 0.8688761591911316, "loss_mode_switch": 0.0, "loss_total": 0.759449303150177, "step": 2457 }, { "batch_size": 4, "epoch": 0.9828, "step": 2457, "tokens_per_device": 5116 }, { "epoch": 0.9828, "loss_ce": 0.2538369297981262, "loss_lvr": 0.7372280359268188, "loss_mode_switch": 0.0, "loss_total": 0.3275597393512726, "step": 2457 }, { "batch_size": 1, "epoch": 0.9828, "step": 2457, "tokens_per_device": 4995 }, { "epoch": 0.9828, "loss_ce": 0.13863296806812286, "loss_lvr": 0.30133679509162903, "loss_mode_switch": 0.0, "loss_total": 0.16876664757728577, "step": 2457 }, { "epoch": 0.9832, "grad_norm": 1.196029543876648, "learning_rate": 7.399590733562778e-09, "loss": 0.2164, "step": 2458 }, { "batch_size": 4, "epoch": 0.9832, "step": 2458, "tokens_per_device": 5740 }, { "epoch": 0.9832, "loss_ce": 0.06300213932991028, "loss_lvr": 0.9295978546142578, "loss_mode_switch": 0.0, "loss_total": 0.15596193075180054, "step": 2458 }, { "batch_size": 1, "epoch": 0.9832, "step": 2458, "tokens_per_device": 5244 }, { "epoch": 0.9832, "loss_ce": 0.009792660363018513, "loss_lvr": 0.2625598609447479, "loss_mode_switch": 0.0, "loss_total": 0.03604864701628685, "step": 2458 }, { "batch_size": 1, "epoch": 0.9832, "step": 2458, "tokens_per_device": 6205 }, { "epoch": 0.9832, "loss_ce": 0.014663060195744038, "loss_lvr": 0.28701457381248474, "loss_mode_switch": 0.0, "loss_total": 0.043364517390728, "step": 2458 }, { "batch_size": 4, "epoch": 0.9832, "step": 2458, "tokens_per_device": 11284 }, { "epoch": 0.9832, "loss_ce": 0.060131192207336426, "loss_lvr": 0.693650484085083, "loss_mode_switch": 0.0, "loss_total": 0.1294962465763092, "step": 2458 }, { "batch_size": 4, "epoch": 0.9832, "step": 2458, "tokens_per_device": 4280 }, { "epoch": 0.9832, "loss_ce": 0.035421933978796005, "loss_lvr": 0.4961405098438263, "loss_mode_switch": 0.0, "loss_total": 0.08503598719835281, "step": 2458 }, { "batch_size": 1, "epoch": 0.9832, "step": 2458, "tokens_per_device": 5112 }, { "epoch": 0.9832, "loss_ce": 0.02164481393992901, "loss_lvr": 0.6149946451187134, "loss_mode_switch": 0.0, "loss_total": 0.08314427733421326, "step": 2458 }, { "batch_size": 1, "epoch": 0.9832, "step": 2458, "tokens_per_device": 5114 }, { "epoch": 0.9832, "loss_ce": 0.001677493448369205, "loss_lvr": 0.42846983671188354, "loss_mode_switch": 0.0, "loss_total": 0.044524479657411575, "step": 2458 }, { "batch_size": 4, "epoch": 0.9832, "step": 2458, "tokens_per_device": 1452 }, { "epoch": 0.9832, "loss_ce": 0.24600787460803986, "loss_lvr": 0.9165089130401611, "loss_mode_switch": 0.0, "loss_total": 0.33765876293182373, "step": 2458 }, { "epoch": 0.9836, "grad_norm": 0.9829614162445068, "learning_rate": 7.051505909697609e-09, "loss": 0.1871, "step": 2459 }, { "batch_size": 1, "epoch": 0.9836, "step": 2459, "tokens_per_device": 4694 }, { "epoch": 0.9836, "loss_ce": 0.000249890930717811, "loss_lvr": 0.2737348973751068, "loss_mode_switch": 0.0, "loss_total": 0.027623381465673447, "step": 2459 }, { "batch_size": 4, "epoch": 0.9836, "step": 2459, "tokens_per_device": 3820 }, { "epoch": 0.9836, "loss_ce": 0.1750485897064209, "loss_lvr": 0.6187620759010315, "loss_mode_switch": 0.0, "loss_total": 0.23692479729652405, "step": 2459 }, { "batch_size": 1, "epoch": 0.9836, "step": 2459, "tokens_per_device": 5074 }, { "epoch": 0.9836, "loss_ce": 0.000773800362367183, "loss_lvr": 0.4823695123195648, "loss_mode_switch": 0.0, "loss_total": 0.0490107499063015, "step": 2459 }, { "batch_size": 4, "epoch": 0.9836, "step": 2459, "tokens_per_device": 3484 }, { "epoch": 0.9836, "loss_ce": 0.22337213158607483, "loss_lvr": 0.7925822138786316, "loss_mode_switch": 0.0, "loss_total": 0.30263036489486694, "step": 2459 }, { "batch_size": 4, "epoch": 0.9836, "step": 2459, "tokens_per_device": 4148 }, { "epoch": 0.9836, "loss_ce": 0.281738817691803, "loss_lvr": 0.9346174001693726, "loss_mode_switch": 0.0, "loss_total": 0.3752005696296692, "step": 2459 }, { "batch_size": 1, "epoch": 0.9836, "step": 2459, "tokens_per_device": 4902 }, { "epoch": 0.9836, "loss_ce": 0.011514023877680302, "loss_lvr": 0.33125555515289307, "loss_mode_switch": 0.0, "loss_total": 0.04463957995176315, "step": 2459 }, { "batch_size": 4, "epoch": 0.9836, "step": 2459, "tokens_per_device": 1272 }, { "epoch": 0.9836, "loss_ce": 0.23259957134723663, "loss_lvr": 1.2795995473861694, "loss_mode_switch": 0.0, "loss_total": 0.36055952310562134, "step": 2459 }, { "batch_size": 4, "epoch": 0.9836, "step": 2459, "tokens_per_device": 10304 }, { "epoch": 0.9836, "loss_ce": 0.06025402992963791, "loss_lvr": 0.7868193984031677, "loss_mode_switch": 0.0, "loss_total": 0.13893596827983856, "step": 2459 }, { "epoch": 0.984, "grad_norm": 1.1462538242340088, "learning_rate": 6.711800878718144e-09, "loss": 0.2105, "step": 2460 }, { "batch_size": 4, "epoch": 0.984, "step": 2460, "tokens_per_device": 3992 }, { "epoch": 0.984, "loss_ce": 0.1019819900393486, "loss_lvr": 0.76612788438797, "loss_mode_switch": 0.0, "loss_total": 0.17859478294849396, "step": 2460 }, { "batch_size": 1, "epoch": 0.984, "step": 2460, "tokens_per_device": 4897 }, { "epoch": 0.984, "loss_ce": 0.006820861250162125, "loss_lvr": 0.2764131426811218, "loss_mode_switch": 0.0, "loss_total": 0.03446217626333237, "step": 2460 }, { "batch_size": 1, "epoch": 0.984, "step": 2460, "tokens_per_device": 5158 }, { "epoch": 0.984, "loss_ce": 0.10741491615772247, "loss_lvr": 0.2080676108598709, "loss_mode_switch": 0.0, "loss_total": 0.12822167575359344, "step": 2460 }, { "batch_size": 1, "epoch": 0.984, "step": 2460, "tokens_per_device": 4884 }, { "epoch": 0.984, "loss_ce": 0.005145038943737745, "loss_lvr": 0.32139724493026733, "loss_mode_switch": 0.0, "loss_total": 0.037284765392541885, "step": 2460 }, { "batch_size": 1, "epoch": 0.984, "step": 2460, "tokens_per_device": 5189 }, { "epoch": 0.984, "loss_ce": 0.008142911829054356, "loss_lvr": 0.27830731868743896, "loss_mode_switch": 0.0, "loss_total": 0.035973645746707916, "step": 2460 }, { "batch_size": 4, "epoch": 0.984, "step": 2460, "tokens_per_device": 4488 }, { "epoch": 0.984, "loss_ce": 0.018041737377643585, "loss_lvr": 0.8048631548881531, "loss_mode_switch": 0.0, "loss_total": 0.09852805733680725, "step": 2460 }, { "batch_size": 1, "epoch": 0.984, "step": 2460, "tokens_per_device": 5263 }, { "epoch": 0.984, "loss_ce": 0.06793548911809921, "loss_lvr": 0.38707348704338074, "loss_mode_switch": 0.0, "loss_total": 0.10664284229278564, "step": 2460 }, { "batch_size": 4, "epoch": 0.984, "step": 2460, "tokens_per_device": 4256 }, { "epoch": 0.984, "loss_ce": 0.059518709778785706, "loss_lvr": 0.8658381104469299, "loss_mode_switch": 0.0, "loss_total": 0.14610251784324646, "step": 2460 }, { "epoch": 0.9844, "grad_norm": 1.1335896253585815, "learning_rate": 6.38047621075999e-09, "loss": 0.1758, "step": 2461 }, { "batch_size": 4, "epoch": 0.9844, "step": 2461, "tokens_per_device": 1424 }, { "epoch": 0.9844, "loss_ce": 0.30464035272598267, "loss_lvr": 0.8679752349853516, "loss_mode_switch": 0.0, "loss_total": 0.3914378881454468, "step": 2461 }, { "batch_size": 4, "epoch": 0.9844, "step": 2461, "tokens_per_device": 6136 }, { "epoch": 0.9844, "loss_ce": 0.11677297204732895, "loss_lvr": 0.5569095015525818, "loss_mode_switch": 0.0, "loss_total": 0.17246392369270325, "step": 2461 }, { "batch_size": 4, "epoch": 0.9844, "step": 2461, "tokens_per_device": 2568 }, { "epoch": 0.9844, "loss_ce": 0.2014225572347641, "loss_lvr": 1.0549532175064087, "loss_mode_switch": 0.0, "loss_total": 0.30691787600517273, "step": 2461 }, { "batch_size": 4, "epoch": 0.9844, "step": 2461, "tokens_per_device": 1388 }, { "epoch": 0.9844, "loss_ce": 0.12185141444206238, "loss_lvr": 0.912138044834137, "loss_mode_switch": 0.0, "loss_total": 0.2130652219057083, "step": 2461 }, { "batch_size": 4, "epoch": 0.9844, "step": 2461, "tokens_per_device": 5408 }, { "epoch": 0.9844, "loss_ce": 0.328772634267807, "loss_lvr": 0.6581124067306519, "loss_mode_switch": 0.0, "loss_total": 0.39458388090133667, "step": 2461 }, { "batch_size": 1, "epoch": 0.9844, "step": 2461, "tokens_per_device": 5103 }, { "epoch": 0.9844, "loss_ce": 0.0015617340104654431, "loss_lvr": 0.7601094841957092, "loss_mode_switch": 0.0, "loss_total": 0.07757268100976944, "step": 2461 }, { "batch_size": 4, "epoch": 0.9844, "step": 2461, "tokens_per_device": 4076 }, { "epoch": 0.9844, "loss_ce": 0.13833913207054138, "loss_lvr": 0.8674541115760803, "loss_mode_switch": 0.0, "loss_total": 0.22508454322814941, "step": 2461 }, { "batch_size": 4, "epoch": 0.9844, "step": 2461, "tokens_per_device": 2584 }, { "epoch": 0.9844, "loss_ce": 0.2722955644130707, "loss_lvr": 0.9506585001945496, "loss_mode_switch": 0.0, "loss_total": 0.3673614263534546, "step": 2461 }, { "epoch": 0.9848, "grad_norm": 0.9309244155883789, "learning_rate": 6.057532461893889e-09, "loss": 0.1955, "step": 2462 }, { "batch_size": 1, "epoch": 0.9848, "step": 2462, "tokens_per_device": 5099 }, { "epoch": 0.9848, "loss_ce": 0.020063865929841995, "loss_lvr": 0.20833803713321686, "loss_mode_switch": 0.0, "loss_total": 0.0408976674079895, "step": 2462 }, { "batch_size": 1, "epoch": 0.9848, "step": 2462, "tokens_per_device": 6064 }, { "epoch": 0.9848, "loss_ce": 0.01465226337313652, "loss_lvr": 0.251814067363739, "loss_mode_switch": 0.0, "loss_total": 0.0398336723446846, "step": 2462 }, { "batch_size": 4, "epoch": 0.9848, "step": 2462, "tokens_per_device": 1420 }, { "epoch": 0.9848, "loss_ce": 0.33386582136154175, "loss_lvr": 1.091750979423523, "loss_mode_switch": 0.0, "loss_total": 0.4430409073829651, "step": 2462 }, { "batch_size": 4, "epoch": 0.9848, "step": 2462, "tokens_per_device": 4256 }, { "epoch": 0.9848, "loss_ce": 0.09561421722173691, "loss_lvr": 0.7884567379951477, "loss_mode_switch": 0.0, "loss_total": 0.17445988953113556, "step": 2462 }, { "batch_size": 4, "epoch": 0.9848, "step": 2462, "tokens_per_device": 1856 }, { "epoch": 0.9848, "loss_ce": 0.16105473041534424, "loss_lvr": 0.8721050024032593, "loss_mode_switch": 0.0, "loss_total": 0.24826523661613464, "step": 2462 }, { "batch_size": 4, "epoch": 0.9848, "step": 2462, "tokens_per_device": 1604 }, { "epoch": 0.9848, "loss_ce": 0.12717831134796143, "loss_lvr": 0.9717046618461609, "loss_mode_switch": 0.0, "loss_total": 0.224348783493042, "step": 2462 }, { "batch_size": 4, "epoch": 0.9848, "step": 2462, "tokens_per_device": 4276 }, { "epoch": 0.9848, "loss_ce": 0.08146242797374725, "loss_lvr": 0.6023610234260559, "loss_mode_switch": 0.0, "loss_total": 0.14169853925704956, "step": 2462 }, { "batch_size": 4, "epoch": 0.9848, "step": 2462, "tokens_per_device": 2660 }, { "epoch": 0.9848, "loss_ce": 0.15829700231552124, "loss_lvr": 0.8265520334243774, "loss_mode_switch": 0.0, "loss_total": 0.24095220863819122, "step": 2462 }, { "epoch": 0.9852, "grad_norm": 1.1269510984420776, "learning_rate": 5.742970174124618e-09, "loss": 0.2318, "step": 2463 }, { "batch_size": 4, "epoch": 0.9852, "step": 2463, "tokens_per_device": 16076 }, { "epoch": 0.9852, "loss_ce": 0.1217540055513382, "loss_lvr": 0.6898685097694397, "loss_mode_switch": 0.0, "loss_total": 0.19074085354804993, "step": 2463 }, { "batch_size": 4, "epoch": 0.9852, "step": 2463, "tokens_per_device": 7012 }, { "epoch": 0.9852, "loss_ce": 0.09420983493328094, "loss_lvr": 0.48095938563346863, "loss_mode_switch": 0.0, "loss_total": 0.14230577647686005, "step": 2463 }, { "batch_size": 4, "epoch": 0.9852, "step": 2463, "tokens_per_device": 1552 }, { "epoch": 0.9852, "loss_ce": 0.10810637474060059, "loss_lvr": 1.3283576965332031, "loss_mode_switch": 0.0, "loss_total": 0.24094215035438538, "step": 2463 }, { "batch_size": 4, "epoch": 0.9852, "step": 2463, "tokens_per_device": 5036 }, { "epoch": 0.9852, "loss_ce": 0.13003896176815033, "loss_lvr": 0.8743075132369995, "loss_mode_switch": 0.0, "loss_total": 0.217469722032547, "step": 2463 }, { "batch_size": 4, "epoch": 0.9852, "step": 2463, "tokens_per_device": 2348 }, { "epoch": 0.9852, "loss_ce": 0.3912453055381775, "loss_lvr": 0.9500630497932434, "loss_mode_switch": 0.0, "loss_total": 0.4862516224384308, "step": 2463 }, { "batch_size": 1, "epoch": 0.9852, "step": 2463, "tokens_per_device": 5105 }, { "epoch": 0.9852, "loss_ce": 0.018515976145863533, "loss_lvr": 0.5157486200332642, "loss_mode_switch": 0.0, "loss_total": 0.07009083777666092, "step": 2463 }, { "batch_size": 1, "epoch": 0.9852, "step": 2463, "tokens_per_device": 5241 }, { "epoch": 0.9852, "loss_ce": 0.012438192032277584, "loss_lvr": 0.2628311812877655, "loss_mode_switch": 0.0, "loss_total": 0.03872131183743477, "step": 2463 }, { "batch_size": 1, "epoch": 0.9852, "step": 2463, "tokens_per_device": 6361 }, { "epoch": 0.9852, "loss_ce": 0.07978475093841553, "loss_lvr": 0.321556031703949, "loss_mode_switch": 0.0, "loss_total": 0.11194035410881042, "step": 2463 }, { "epoch": 0.9856, "grad_norm": 1.0490777492523193, "learning_rate": 5.436789875390425e-09, "loss": 0.2092, "step": 2464 }, { "batch_size": 4, "epoch": 0.9856, "step": 2464, "tokens_per_device": 2708 }, { "epoch": 0.9856, "loss_ce": 0.2528974711894989, "loss_lvr": 0.5490683317184448, "loss_mode_switch": 0.0, "loss_total": 0.30780431628227234, "step": 2464 }, { "batch_size": 1, "epoch": 0.9856, "step": 2464, "tokens_per_device": 5179 }, { "epoch": 0.9856, "loss_ce": 0.049649711698293686, "loss_lvr": 0.40771305561065674, "loss_mode_switch": 0.0, "loss_total": 0.09042102098464966, "step": 2464 }, { "batch_size": 4, "epoch": 0.9856, "step": 2464, "tokens_per_device": 7048 }, { "epoch": 0.9856, "loss_ce": 0.05187566950917244, "loss_lvr": 0.7508115172386169, "loss_mode_switch": 0.0, "loss_total": 0.12695682048797607, "step": 2464 }, { "batch_size": 4, "epoch": 0.9856, "step": 2464, "tokens_per_device": 4296 }, { "epoch": 0.9856, "loss_ce": 0.021677954122424126, "loss_lvr": 0.8625494837760925, "loss_mode_switch": 0.0, "loss_total": 0.10793290287256241, "step": 2464 }, { "batch_size": 4, "epoch": 0.9856, "step": 2464, "tokens_per_device": 5316 }, { "epoch": 0.9856, "loss_ce": 0.08597822487354279, "loss_lvr": 0.8758383989334106, "loss_mode_switch": 0.0, "loss_total": 0.17356206476688385, "step": 2464 }, { "batch_size": 4, "epoch": 0.9856, "step": 2464, "tokens_per_device": 8188 }, { "epoch": 0.9856, "loss_ce": 0.16556081175804138, "loss_lvr": 0.6950390934944153, "loss_mode_switch": 0.0, "loss_total": 0.23506471514701843, "step": 2464 }, { "batch_size": 4, "epoch": 0.9856, "step": 2464, "tokens_per_device": 5552 }, { "epoch": 0.9856, "loss_ce": 0.13477711379528046, "loss_lvr": 0.762840747833252, "loss_mode_switch": 0.0, "loss_total": 0.21106117963790894, "step": 2464 }, { "batch_size": 4, "epoch": 0.9856, "step": 2464, "tokens_per_device": 3804 }, { "epoch": 0.9856, "loss_ce": 0.1537155956029892, "loss_lvr": 1.3804551362991333, "loss_mode_switch": 0.0, "loss_total": 0.2917611002922058, "step": 2464 }, { "epoch": 0.986, "grad_norm": 1.1537787914276123, "learning_rate": 5.138992079561367e-09, "loss": 0.2403, "step": 2465 }, { "batch_size": 4, "epoch": 0.986, "step": 2465, "tokens_per_device": 2752 }, { "epoch": 0.986, "loss_ce": 0.30838578939437866, "loss_lvr": 0.6163706183433533, "loss_mode_switch": 0.0, "loss_total": 0.37002286314964294, "step": 2465 }, { "batch_size": 1, "epoch": 0.986, "step": 2465, "tokens_per_device": 4833 }, { "epoch": 0.986, "loss_ce": 0.014285814017057419, "loss_lvr": 0.44160744547843933, "loss_mode_switch": 0.0, "loss_total": 0.05844656005501747, "step": 2465 }, { "batch_size": 1, "epoch": 0.986, "step": 2465, "tokens_per_device": 5095 }, { "epoch": 0.986, "loss_ce": 0.0007844062056392431, "loss_lvr": 0.969578742980957, "loss_mode_switch": 0.0, "loss_total": 0.09774228185415268, "step": 2465 }, { "batch_size": 4, "epoch": 0.986, "step": 2465, "tokens_per_device": 4256 }, { "epoch": 0.986, "loss_ce": 0.1919597089290619, "loss_lvr": 0.8288193345069885, "loss_mode_switch": 0.0, "loss_total": 0.27484163641929626, "step": 2465 }, { "batch_size": 1, "epoch": 0.986, "step": 2465, "tokens_per_device": 4392 }, { "epoch": 0.986, "loss_ce": 0.009806718677282333, "loss_lvr": 0.5811129212379456, "loss_mode_switch": 0.0, "loss_total": 0.06791801005601883, "step": 2465 }, { "batch_size": 4, "epoch": 0.986, "step": 2465, "tokens_per_device": 5612 }, { "epoch": 0.986, "loss_ce": 0.09056540578603745, "loss_lvr": 1.0794517993927002, "loss_mode_switch": 0.0, "loss_total": 0.19851058721542358, "step": 2465 }, { "batch_size": 4, "epoch": 0.986, "step": 2465, "tokens_per_device": 1752 }, { "epoch": 0.986, "loss_ce": 0.030180182307958603, "loss_lvr": 1.5175650119781494, "loss_mode_switch": 0.0, "loss_total": 0.18193669617176056, "step": 2465 }, { "batch_size": 1, "epoch": 0.986, "step": 2465, "tokens_per_device": 4892 }, { "epoch": 0.986, "loss_ce": 0.03668805956840515, "loss_lvr": 0.1997876614332199, "loss_mode_switch": 0.0, "loss_total": 0.05666682869195938, "step": 2465 }, { "epoch": 0.9864, "grad_norm": 1.1329632997512817, "learning_rate": 4.849577286438756e-09, "loss": 0.2313, "step": 2466 }, { "batch_size": 1, "epoch": 0.9864, "step": 2466, "tokens_per_device": 6780 }, { "epoch": 0.9864, "loss_ce": 0.00012848770711570978, "loss_lvr": 0.24366231262683868, "loss_mode_switch": 0.0, "loss_total": 0.024494718760252, "step": 2466 }, { "batch_size": 1, "epoch": 0.9864, "step": 2466, "tokens_per_device": 4861 }, { "epoch": 0.9864, "loss_ce": 0.0008852786268107593, "loss_lvr": 0.43742960691452026, "loss_mode_switch": 0.0, "loss_total": 0.044628240168094635, "step": 2466 }, { "batch_size": 1, "epoch": 0.9864, "step": 2466, "tokens_per_device": 4688 }, { "epoch": 0.9864, "loss_ce": 0.382968544960022, "loss_lvr": 0.2863881587982178, "loss_mode_switch": 0.0, "loss_total": 0.4116073548793793, "step": 2466 }, { "batch_size": 1, "epoch": 0.9864, "step": 2466, "tokens_per_device": 5167 }, { "epoch": 0.9864, "loss_ce": 0.19675880670547485, "loss_lvr": 0.45074063539505005, "loss_mode_switch": 0.0, "loss_total": 0.24183286726474762, "step": 2466 }, { "batch_size": 4, "epoch": 0.9864, "step": 2466, "tokens_per_device": 2752 }, { "epoch": 0.9864, "loss_ce": 0.12365596741437912, "loss_lvr": 0.34947070479393005, "loss_mode_switch": 0.0, "loss_total": 0.15860304236412048, "step": 2466 }, { "batch_size": 1, "epoch": 0.9864, "step": 2466, "tokens_per_device": 4879 }, { "epoch": 0.9864, "loss_ce": 0.03331144526600838, "loss_lvr": 1.1881572008132935, "loss_mode_switch": 0.0, "loss_total": 0.15212716162204742, "step": 2466 }, { "batch_size": 4, "epoch": 0.9864, "step": 2466, "tokens_per_device": 3344 }, { "epoch": 0.9864, "loss_ce": 0.20393963158130646, "loss_lvr": 0.5760793685913086, "loss_mode_switch": 0.0, "loss_total": 0.2615475654602051, "step": 2466 }, { "batch_size": 4, "epoch": 0.9864, "step": 2466, "tokens_per_device": 5500 }, { "epoch": 0.9864, "loss_ce": 0.24620698392391205, "loss_lvr": 0.7528681755065918, "loss_mode_switch": 0.0, "loss_total": 0.32149380445480347, "step": 2466 }, { "epoch": 0.9868, "grad_norm": 1.1109267473220825, "learning_rate": 4.568545981755157e-09, "loss": 0.2204, "step": 2467 }, { "batch_size": 1, "epoch": 0.9868, "step": 2467, "tokens_per_device": 7336 }, { "epoch": 0.9868, "loss_ce": 0.0003579113108571619, "loss_lvr": 0.306874543428421, "loss_mode_switch": 0.0, "loss_total": 0.03104536607861519, "step": 2467 }, { "batch_size": 4, "epoch": 0.9868, "step": 2467, "tokens_per_device": 2664 }, { "epoch": 0.9868, "loss_ce": 0.08391322940587997, "loss_lvr": 0.8616271615028381, "loss_mode_switch": 0.0, "loss_total": 0.17007595300674438, "step": 2467 }, { "batch_size": 1, "epoch": 0.9868, "step": 2467, "tokens_per_device": 5151 }, { "epoch": 0.9868, "loss_ce": 0.021243322640657425, "loss_lvr": 0.2227228283882141, "loss_mode_switch": 0.0, "loss_total": 0.043515607714653015, "step": 2467 }, { "batch_size": 4, "epoch": 0.9868, "step": 2467, "tokens_per_device": 3332 }, { "epoch": 0.9868, "loss_ce": 0.08241897076368332, "loss_lvr": 0.882652223110199, "loss_mode_switch": 0.0, "loss_total": 0.17068418860435486, "step": 2467 }, { "batch_size": 4, "epoch": 0.9868, "step": 2467, "tokens_per_device": 8868 }, { "epoch": 0.9868, "loss_ce": 0.21078893542289734, "loss_lvr": 0.7100721597671509, "loss_mode_switch": 0.0, "loss_total": 0.2817961573600769, "step": 2467 }, { "batch_size": 4, "epoch": 0.9868, "step": 2467, "tokens_per_device": 1528 }, { "epoch": 0.9868, "loss_ce": 0.4660959541797638, "loss_lvr": 0.811835527420044, "loss_mode_switch": 0.0, "loss_total": 0.5472794771194458, "step": 2467 }, { "batch_size": 4, "epoch": 0.9868, "step": 2467, "tokens_per_device": 7300 }, { "epoch": 0.9868, "loss_ce": 0.0016484850784763694, "loss_lvr": 0.6930400729179382, "loss_mode_switch": 0.0, "loss_total": 0.07095249742269516, "step": 2467 }, { "batch_size": 4, "epoch": 0.9868, "step": 2467, "tokens_per_device": 3764 }, { "epoch": 0.9868, "loss_ce": 0.5294065475463867, "loss_lvr": 1.071316123008728, "loss_mode_switch": 0.0, "loss_total": 0.6365381479263306, "step": 2467 }, { "epoch": 0.9872, "grad_norm": 1.0014179944992065, "learning_rate": 4.295898637172169e-09, "loss": 0.2212, "step": 2468 }, { "batch_size": 4, "epoch": 0.9872, "step": 2468, "tokens_per_device": 3120 }, { "epoch": 0.9872, "loss_ce": 0.7152115106582642, "loss_lvr": 0.9213103652000427, "loss_mode_switch": 0.0, "loss_total": 0.807342529296875, "step": 2468 }, { "batch_size": 4, "epoch": 0.9872, "step": 2468, "tokens_per_device": 4412 }, { "epoch": 0.9872, "loss_ce": 0.02253313548862934, "loss_lvr": 0.7001973986625671, "loss_mode_switch": 0.0, "loss_total": 0.09255287796258926, "step": 2468 }, { "batch_size": 4, "epoch": 0.9872, "step": 2468, "tokens_per_device": 1532 }, { "epoch": 0.9872, "loss_ce": 0.14356376230716705, "loss_lvr": 0.8253359794616699, "loss_mode_switch": 0.0, "loss_total": 0.22609736025333405, "step": 2468 }, { "batch_size": 4, "epoch": 0.9872, "step": 2468, "tokens_per_device": 9792 }, { "epoch": 0.9872, "loss_ce": 0.2663724422454834, "loss_lvr": 0.5257272124290466, "loss_mode_switch": 0.0, "loss_total": 0.31894516944885254, "step": 2468 }, { "batch_size": 4, "epoch": 0.9872, "step": 2468, "tokens_per_device": 2992 }, { "epoch": 0.9872, "loss_ce": 0.37996724247932434, "loss_lvr": 0.4910716712474823, "loss_mode_switch": 0.0, "loss_total": 0.42907440662384033, "step": 2468 }, { "batch_size": 1, "epoch": 0.9872, "step": 2468, "tokens_per_device": 5456 }, { "epoch": 0.9872, "loss_ce": 0.09107654541730881, "loss_lvr": 0.3607146739959717, "loss_mode_switch": 0.0, "loss_total": 0.12714801728725433, "step": 2468 }, { "batch_size": 1, "epoch": 0.9872, "step": 2468, "tokens_per_device": 5080 }, { "epoch": 0.9872, "loss_ce": 0.5330281853675842, "loss_lvr": 0.38413843512535095, "loss_mode_switch": 0.0, "loss_total": 0.5714420080184937, "step": 2468 }, { "batch_size": 4, "epoch": 0.9872, "step": 2468, "tokens_per_device": 3808 }, { "epoch": 0.9872, "loss_ce": 0.1369507908821106, "loss_lvr": 0.9617713689804077, "loss_mode_switch": 0.0, "loss_total": 0.2331279218196869, "step": 2468 }, { "epoch": 0.9876, "grad_norm": 1.1571003198623657, "learning_rate": 4.031635710281534e-09, "loss": 0.2326, "step": 2469 }, { "batch_size": 1, "epoch": 0.9876, "step": 2469, "tokens_per_device": 4220 }, { "epoch": 0.9876, "loss_ce": 0.0011338057229295373, "loss_lvr": 0.7327706217765808, "loss_mode_switch": 0.0, "loss_total": 0.07441087067127228, "step": 2469 }, { "batch_size": 4, "epoch": 0.9876, "step": 2469, "tokens_per_device": 5384 }, { "epoch": 0.9876, "loss_ce": 0.07277602702379227, "loss_lvr": 0.6529075503349304, "loss_mode_switch": 0.0, "loss_total": 0.13806678354740143, "step": 2469 }, { "batch_size": 4, "epoch": 0.9876, "step": 2469, "tokens_per_device": 3752 }, { "epoch": 0.9876, "loss_ce": 0.08674010634422302, "loss_lvr": 1.029064655303955, "loss_mode_switch": 0.0, "loss_total": 0.18964657187461853, "step": 2469 }, { "batch_size": 4, "epoch": 0.9876, "step": 2469, "tokens_per_device": 2892 }, { "epoch": 0.9876, "loss_ce": 0.21446898579597473, "loss_lvr": 0.6452015042304993, "loss_mode_switch": 0.0, "loss_total": 0.27898913621902466, "step": 2469 }, { "batch_size": 1, "epoch": 0.9876, "step": 2469, "tokens_per_device": 4935 }, { "epoch": 0.9876, "loss_ce": 0.0030921853613108397, "loss_lvr": 0.8603398203849792, "loss_mode_switch": 0.0, "loss_total": 0.08912616968154907, "step": 2469 }, { "batch_size": 1, "epoch": 0.9876, "step": 2469, "tokens_per_device": 5105 }, { "epoch": 0.9876, "loss_ce": 0.0005424856790341437, "loss_lvr": 0.11772534251213074, "loss_mode_switch": 0.0, "loss_total": 0.012315020896494389, "step": 2469 }, { "batch_size": 4, "epoch": 0.9876, "step": 2469, "tokens_per_device": 4324 }, { "epoch": 0.9876, "loss_ce": 0.10511572659015656, "loss_lvr": 0.9018356204032898, "loss_mode_switch": 0.0, "loss_total": 0.19529929757118225, "step": 2469 }, { "batch_size": 4, "epoch": 0.9876, "step": 2469, "tokens_per_device": 4152 }, { "epoch": 0.9876, "loss_ce": 0.10569585114717484, "loss_lvr": 0.7002537846565247, "loss_mode_switch": 0.0, "loss_total": 0.17572122812271118, "step": 2469 }, { "epoch": 0.988, "grad_norm": 1.1757991313934326, "learning_rate": 3.775757644601808e-09, "loss": 0.2162, "step": 2470 }, { "batch_size": 4, "epoch": 0.988, "step": 2470, "tokens_per_device": 5128 }, { "epoch": 0.988, "loss_ce": 0.6145254373550415, "loss_lvr": 0.8276935815811157, "loss_mode_switch": 0.0, "loss_total": 0.6972947716712952, "step": 2470 }, { "batch_size": 4, "epoch": 0.988, "step": 2470, "tokens_per_device": 4804 }, { "epoch": 0.988, "loss_ce": 0.1598535031080246, "loss_lvr": 0.7620161175727844, "loss_mode_switch": 0.0, "loss_total": 0.23605510592460632, "step": 2470 }, { "batch_size": 1, "epoch": 0.988, "step": 2470, "tokens_per_device": 4767 }, { "epoch": 0.988, "loss_ce": 0.06855758279561996, "loss_lvr": 0.3071843683719635, "loss_mode_switch": 0.0, "loss_total": 0.09927602112293243, "step": 2470 }, { "batch_size": 4, "epoch": 0.988, "step": 2470, "tokens_per_device": 4264 }, { "epoch": 0.988, "loss_ce": 0.05627845972776413, "loss_lvr": 0.9768365025520325, "loss_mode_switch": 0.0, "loss_total": 0.15396210551261902, "step": 2470 }, { "batch_size": 4, "epoch": 0.988, "step": 2470, "tokens_per_device": 4264 }, { "epoch": 0.988, "loss_ce": 0.27254244685173035, "loss_lvr": 0.8134629726409912, "loss_mode_switch": 0.0, "loss_total": 0.35388875007629395, "step": 2470 }, { "batch_size": 1, "epoch": 0.988, "step": 2470, "tokens_per_device": 4742 }, { "epoch": 0.988, "loss_ce": 0.03478887304663658, "loss_lvr": 0.3624041676521301, "loss_mode_switch": 0.0, "loss_total": 0.07102929055690765, "step": 2470 }, { "batch_size": 4, "epoch": 0.988, "step": 2470, "tokens_per_device": 4452 }, { "epoch": 0.988, "loss_ce": 0.24040447175502777, "loss_lvr": 0.9056484699249268, "loss_mode_switch": 0.0, "loss_total": 0.33096933364868164, "step": 2470 }, { "batch_size": 1, "epoch": 0.988, "step": 2470, "tokens_per_device": 5068 }, { "epoch": 0.988, "loss_ce": 0.005257809069007635, "loss_lvr": 0.3644181489944458, "loss_mode_switch": 0.0, "loss_total": 0.041699621826410294, "step": 2470 }, { "epoch": 0.9884, "grad_norm": 1.0357847213745117, "learning_rate": 3.5282648695794675e-09, "loss": 0.1981, "step": 2471 }, { "batch_size": 4, "epoch": 0.9884, "step": 2471, "tokens_per_device": 5468 }, { "epoch": 0.9884, "loss_ce": 0.004292922560125589, "loss_lvr": 0.7500025629997253, "loss_mode_switch": 0.0, "loss_total": 0.07929317653179169, "step": 2471 }, { "batch_size": 4, "epoch": 0.9884, "step": 2471, "tokens_per_device": 11008 }, { "epoch": 0.9884, "loss_ce": 0.10703766345977783, "loss_lvr": 0.7715534567832947, "loss_mode_switch": 0.0, "loss_total": 0.18419301509857178, "step": 2471 }, { "batch_size": 4, "epoch": 0.9884, "step": 2471, "tokens_per_device": 4304 }, { "epoch": 0.9884, "loss_ce": 0.17437143623828888, "loss_lvr": 0.9738209843635559, "loss_mode_switch": 0.0, "loss_total": 0.27175354957580566, "step": 2471 }, { "batch_size": 4, "epoch": 0.9884, "step": 2471, "tokens_per_device": 4656 }, { "epoch": 0.9884, "loss_ce": 0.07701875269412994, "loss_lvr": 0.6244211196899414, "loss_mode_switch": 0.0, "loss_total": 0.13946086168289185, "step": 2471 }, { "batch_size": 1, "epoch": 0.9884, "step": 2471, "tokens_per_device": 4900 }, { "epoch": 0.9884, "loss_ce": 0.012020770460367203, "loss_lvr": 0.830295979976654, "loss_mode_switch": 0.0, "loss_total": 0.09505036473274231, "step": 2471 }, { "batch_size": 1, "epoch": 0.9884, "step": 2471, "tokens_per_device": 4852 }, { "epoch": 0.9884, "loss_ce": 0.07660138607025146, "loss_lvr": 0.17585067451000214, "loss_mode_switch": 0.0, "loss_total": 0.0941864550113678, "step": 2471 }, { "batch_size": 1, "epoch": 0.9884, "step": 2471, "tokens_per_device": 6450 }, { "epoch": 0.9884, "loss_ce": 0.05682837963104248, "loss_lvr": 0.48108017444610596, "loss_mode_switch": 0.0, "loss_total": 0.1049363985657692, "step": 2471 }, { "batch_size": 4, "epoch": 0.9884, "step": 2471, "tokens_per_device": 1428 }, { "epoch": 0.9884, "loss_ce": 0.0247601680457592, "loss_lvr": 0.8519635200500488, "loss_mode_switch": 0.0, "loss_total": 0.1099565178155899, "step": 2471 }, { "epoch": 0.9888, "grad_norm": 1.0863442420959473, "learning_rate": 3.2891578005889158e-09, "loss": 0.2139, "step": 2472 }, { "batch_size": 1, "epoch": 0.9888, "step": 2472, "tokens_per_device": 4624 }, { "epoch": 0.9888, "loss_ce": 0.01084070559591055, "loss_lvr": 0.2752493619918823, "loss_mode_switch": 0.0, "loss_total": 0.038365643471479416, "step": 2472 }, { "batch_size": 1, "epoch": 0.9888, "step": 2472, "tokens_per_device": 4854 }, { "epoch": 0.9888, "loss_ce": 0.0012969825183972716, "loss_lvr": 0.2267325073480606, "loss_mode_switch": 0.0, "loss_total": 0.023970233276486397, "step": 2472 }, { "batch_size": 4, "epoch": 0.9888, "step": 2472, "tokens_per_device": 4216 }, { "epoch": 0.9888, "loss_ce": 0.1573714166879654, "loss_lvr": 1.049897313117981, "loss_mode_switch": 0.0, "loss_total": 0.2623611390590668, "step": 2472 }, { "batch_size": 4, "epoch": 0.9888, "step": 2472, "tokens_per_device": 5788 }, { "epoch": 0.9888, "loss_ce": 0.045138970017433167, "loss_lvr": 0.7624498605728149, "loss_mode_switch": 0.0, "loss_total": 0.12138395756483078, "step": 2472 }, { "batch_size": 1, "epoch": 0.9888, "step": 2472, "tokens_per_device": 4901 }, { "epoch": 0.9888, "loss_ce": 0.038622044026851654, "loss_lvr": 0.8930829167366028, "loss_mode_switch": 0.0, "loss_total": 0.12793034315109253, "step": 2472 }, { "batch_size": 4, "epoch": 0.9888, "step": 2472, "tokens_per_device": 1380 }, { "epoch": 0.9888, "loss_ce": 0.2311021238565445, "loss_lvr": 1.0340536832809448, "loss_mode_switch": 0.0, "loss_total": 0.3345074951648712, "step": 2472 }, { "batch_size": 1, "epoch": 0.9888, "step": 2472, "tokens_per_device": 5108 }, { "epoch": 0.9888, "loss_ce": 0.002752308500930667, "loss_lvr": 0.8873242139816284, "loss_mode_switch": 0.0, "loss_total": 0.09148473292589188, "step": 2472 }, { "batch_size": 4, "epoch": 0.9888, "step": 2472, "tokens_per_device": 11608 }, { "epoch": 0.9888, "loss_ce": 0.026537328958511353, "loss_lvr": 0.7553677558898926, "loss_mode_switch": 0.0, "loss_total": 0.10207410901784897, "step": 2472 }, { "epoch": 0.9892, "grad_norm": 1.000803828239441, "learning_rate": 3.0584368389291465e-09, "loss": 0.1808, "step": 2473 }, { "batch_size": 1, "epoch": 0.9892, "step": 2473, "tokens_per_device": 5022 }, { "epoch": 0.9892, "loss_ce": 0.08256541192531586, "loss_lvr": 1.2509618997573853, "loss_mode_switch": 0.0, "loss_total": 0.20766159892082214, "step": 2473 }, { "batch_size": 4, "epoch": 0.9892, "step": 2473, "tokens_per_device": 4628 }, { "epoch": 0.9892, "loss_ce": 0.2335781455039978, "loss_lvr": 0.9556528925895691, "loss_mode_switch": 0.0, "loss_total": 0.3291434347629547, "step": 2473 }, { "batch_size": 4, "epoch": 0.9892, "step": 2473, "tokens_per_device": 4312 }, { "epoch": 0.9892, "loss_ce": 0.23105251789093018, "loss_lvr": 0.9396913051605225, "loss_mode_switch": 0.0, "loss_total": 0.3250216543674469, "step": 2473 }, { "batch_size": 4, "epoch": 0.9892, "step": 2473, "tokens_per_device": 4412 }, { "epoch": 0.9892, "loss_ce": 0.01139818225055933, "loss_lvr": 0.7558161020278931, "loss_mode_switch": 0.0, "loss_total": 0.08697979152202606, "step": 2473 }, { "batch_size": 4, "epoch": 0.9892, "step": 2473, "tokens_per_device": 1568 }, { "epoch": 0.9892, "loss_ce": 0.15778832137584686, "loss_lvr": 1.0121910572052002, "loss_mode_switch": 0.0, "loss_total": 0.25900742411613464, "step": 2473 }, { "batch_size": 1, "epoch": 0.9892, "step": 2473, "tokens_per_device": 5193 }, { "epoch": 0.9892, "loss_ce": 0.01120242103934288, "loss_lvr": 0.5206143260002136, "loss_mode_switch": 0.0, "loss_total": 0.06326385587453842, "step": 2473 }, { "batch_size": 4, "epoch": 0.9892, "step": 2473, "tokens_per_device": 5080 }, { "epoch": 0.9892, "loss_ce": 0.21106566488742828, "loss_lvr": 0.776757001876831, "loss_mode_switch": 0.0, "loss_total": 0.2887413501739502, "step": 2473 }, { "batch_size": 4, "epoch": 0.9892, "step": 2473, "tokens_per_device": 8804 }, { "epoch": 0.9892, "loss_ce": 0.023090912029147148, "loss_lvr": 0.5332768559455872, "loss_mode_switch": 0.0, "loss_total": 0.07641860097646713, "step": 2473 }, { "epoch": 0.9896, "grad_norm": 1.110074520111084, "learning_rate": 2.8361023718248557e-09, "loss": 0.1945, "step": 2474 }, { "batch_size": 4, "epoch": 0.9896, "step": 2474, "tokens_per_device": 2656 }, { "epoch": 0.9896, "loss_ce": 0.2350868433713913, "loss_lvr": 0.7607319951057434, "loss_mode_switch": 0.0, "loss_total": 0.31116002798080444, "step": 2474 }, { "batch_size": 4, "epoch": 0.9896, "step": 2474, "tokens_per_device": 3760 }, { "epoch": 0.9896, "loss_ce": 0.0878569632768631, "loss_lvr": 0.9303178191184998, "loss_mode_switch": 0.0, "loss_total": 0.18088874220848083, "step": 2474 }, { "batch_size": 4, "epoch": 0.9896, "step": 2474, "tokens_per_device": 6300 }, { "epoch": 0.9896, "loss_ce": 0.15610429644584656, "loss_lvr": 0.7136532664299011, "loss_mode_switch": 0.0, "loss_total": 0.22746962308883667, "step": 2474 }, { "batch_size": 1, "epoch": 0.9896, "step": 2474, "tokens_per_device": 4894 }, { "epoch": 0.9896, "loss_ce": 0.04451649636030197, "loss_lvr": 0.5171785950660706, "loss_mode_switch": 0.0, "loss_total": 0.09623435139656067, "step": 2474 }, { "batch_size": 1, "epoch": 0.9896, "step": 2474, "tokens_per_device": 5135 }, { "epoch": 0.9896, "loss_ce": 0.004853176884353161, "loss_lvr": 0.2538502514362335, "loss_mode_switch": 0.0, "loss_total": 0.030238203704357147, "step": 2474 }, { "batch_size": 4, "epoch": 0.9896, "step": 2474, "tokens_per_device": 1400 }, { "epoch": 0.9896, "loss_ce": 0.4825739860534668, "loss_lvr": 1.0356239080429077, "loss_mode_switch": 0.0, "loss_total": 0.5861364006996155, "step": 2474 }, { "batch_size": 4, "epoch": 0.9896, "step": 2474, "tokens_per_device": 4456 }, { "epoch": 0.9896, "loss_ce": 0.15077416598796844, "loss_lvr": 0.7882542014122009, "loss_mode_switch": 0.0, "loss_total": 0.22959959506988525, "step": 2474 }, { "batch_size": 4, "epoch": 0.9896, "step": 2474, "tokens_per_device": 4328 }, { "epoch": 0.9896, "loss_ce": 0.08769541233778, "loss_lvr": 0.7816131711006165, "loss_mode_switch": 0.0, "loss_total": 0.16585673391819, "step": 2474 }, { "epoch": 0.99, "grad_norm": 1.0256059169769287, "learning_rate": 2.6221547724253337e-09, "loss": 0.211, "step": 2475 }, { "batch_size": 4, "epoch": 0.99, "step": 2475, "tokens_per_device": 3764 }, { "epoch": 0.99, "loss_ce": 0.40783044695854187, "loss_lvr": 0.9117851853370667, "loss_mode_switch": 0.0, "loss_total": 0.4990089535713196, "step": 2475 }, { "batch_size": 4, "epoch": 0.99, "step": 2475, "tokens_per_device": 1508 }, { "epoch": 0.99, "loss_ce": 0.751358687877655, "loss_lvr": 0.9158593416213989, "loss_mode_switch": 0.0, "loss_total": 0.8429446220397949, "step": 2475 }, { "batch_size": 1, "epoch": 0.99, "step": 2475, "tokens_per_device": 5106 }, { "epoch": 0.99, "loss_ce": 0.016170399263501167, "loss_lvr": 0.47257885336875916, "loss_mode_switch": 0.0, "loss_total": 0.06342828273773193, "step": 2475 }, { "batch_size": 4, "epoch": 0.99, "step": 2475, "tokens_per_device": 1676 }, { "epoch": 0.99, "loss_ce": 0.07053530216217041, "loss_lvr": 0.8994381427764893, "loss_mode_switch": 0.0, "loss_total": 0.1604791283607483, "step": 2475 }, { "batch_size": 4, "epoch": 0.99, "step": 2475, "tokens_per_device": 2588 }, { "epoch": 0.99, "loss_ce": 0.05321851745247841, "loss_lvr": 0.7176687121391296, "loss_mode_switch": 0.0, "loss_total": 0.12498539686203003, "step": 2475 }, { "batch_size": 4, "epoch": 0.99, "step": 2475, "tokens_per_device": 11128 }, { "epoch": 0.99, "loss_ce": 0.3606080114841461, "loss_lvr": 0.6601868867874146, "loss_mode_switch": 0.0, "loss_total": 0.42662671208381653, "step": 2475 }, { "batch_size": 4, "epoch": 0.99, "step": 2475, "tokens_per_device": 3812 }, { "epoch": 0.99, "loss_ce": 0.2107006311416626, "loss_lvr": 0.9878655076026917, "loss_mode_switch": 0.0, "loss_total": 0.3094871938228607, "step": 2475 }, { "batch_size": 4, "epoch": 0.99, "step": 2475, "tokens_per_device": 4432 }, { "epoch": 0.99, "loss_ce": 0.3284686505794525, "loss_lvr": 0.9561662673950195, "loss_mode_switch": 0.0, "loss_total": 0.4240852892398834, "step": 2475 }, { "epoch": 0.9904, "grad_norm": 0.9348689317703247, "learning_rate": 2.4165943998050166e-09, "loss": 0.1976, "step": 2476 }, { "batch_size": 4, "epoch": 0.9904, "step": 2476, "tokens_per_device": 4212 }, { "epoch": 0.9904, "loss_ce": 0.06512533873319626, "loss_lvr": 1.1043071746826172, "loss_mode_switch": 0.0, "loss_total": 0.17555606365203857, "step": 2476 }, { "batch_size": 4, "epoch": 0.9904, "step": 2476, "tokens_per_device": 4260 }, { "epoch": 0.9904, "loss_ce": 0.5640403628349304, "loss_lvr": 0.9280552268028259, "loss_mode_switch": 0.0, "loss_total": 0.6568458676338196, "step": 2476 }, { "batch_size": 1, "epoch": 0.9904, "step": 2476, "tokens_per_device": 4856 }, { "epoch": 0.9904, "loss_ce": 0.007030402775853872, "loss_lvr": 0.44116494059562683, "loss_mode_switch": 0.0, "loss_total": 0.051146894693374634, "step": 2476 }, { "batch_size": 1, "epoch": 0.9904, "step": 2476, "tokens_per_device": 5115 }, { "epoch": 0.9904, "loss_ce": 0.01214342936873436, "loss_lvr": 0.25246861577033997, "loss_mode_switch": 0.0, "loss_total": 0.037390291690826416, "step": 2476 }, { "batch_size": 4, "epoch": 0.9904, "step": 2476, "tokens_per_device": 5664 }, { "epoch": 0.9904, "loss_ce": 0.26436033844947815, "loss_lvr": 0.6599134802818298, "loss_mode_switch": 0.0, "loss_total": 0.33035168051719666, "step": 2476 }, { "batch_size": 4, "epoch": 0.9904, "step": 2476, "tokens_per_device": 1280 }, { "epoch": 0.9904, "loss_ce": 0.4289740025997162, "loss_lvr": 1.1107361316680908, "loss_mode_switch": 0.0, "loss_total": 0.5400476455688477, "step": 2476 }, { "batch_size": 1, "epoch": 0.9904, "step": 2476, "tokens_per_device": 5195 }, { "epoch": 0.9904, "loss_ce": 0.0022925366647541523, "loss_lvr": 0.21324820816516876, "loss_mode_switch": 0.0, "loss_total": 0.02361735701560974, "step": 2476 }, { "batch_size": 4, "epoch": 0.9904, "step": 2476, "tokens_per_device": 6320 }, { "epoch": 0.9904, "loss_ce": 0.2668543756008148, "loss_lvr": 0.7194859385490417, "loss_mode_switch": 0.0, "loss_total": 0.3388029634952545, "step": 2476 }, { "epoch": 0.9908, "grad_norm": 0.9557770490646362, "learning_rate": 2.219421598961269e-09, "loss": 0.1923, "step": 2477 }, { "batch_size": 1, "epoch": 0.9908, "step": 2477, "tokens_per_device": 4425 }, { "epoch": 0.9908, "loss_ce": 0.0008928957977332175, "loss_lvr": 0.6386694312095642, "loss_mode_switch": 0.0, "loss_total": 0.06475983560085297, "step": 2477 }, { "batch_size": 1, "epoch": 0.9908, "step": 2477, "tokens_per_device": 4748 }, { "epoch": 0.9908, "loss_ce": 0.0052857412956655025, "loss_lvr": 0.5118897557258606, "loss_mode_switch": 0.0, "loss_total": 0.0564747154712677, "step": 2477 }, { "batch_size": 4, "epoch": 0.9908, "step": 2477, "tokens_per_device": 4228 }, { "epoch": 0.9908, "loss_ce": 0.4467753469944, "loss_lvr": 0.8361237049102783, "loss_mode_switch": 0.0, "loss_total": 0.5303876996040344, "step": 2477 }, { "batch_size": 4, "epoch": 0.9908, "step": 2477, "tokens_per_device": 5500 }, { "epoch": 0.9908, "loss_ce": 0.03202344849705696, "loss_lvr": 0.7241820096969604, "loss_mode_switch": 0.0, "loss_total": 0.10444165766239166, "step": 2477 }, { "batch_size": 4, "epoch": 0.9908, "step": 2477, "tokens_per_device": 6076 }, { "epoch": 0.9908, "loss_ce": 0.09170154482126236, "loss_lvr": 0.6420597434043884, "loss_mode_switch": 0.0, "loss_total": 0.1559075117111206, "step": 2477 }, { "batch_size": 4, "epoch": 0.9908, "step": 2477, "tokens_per_device": 3760 }, { "epoch": 0.9908, "loss_ce": 0.04388456046581268, "loss_lvr": 0.5866431593894958, "loss_mode_switch": 0.0, "loss_total": 0.10254888236522675, "step": 2477 }, { "batch_size": 1, "epoch": 0.9908, "step": 2477, "tokens_per_device": 5631 }, { "epoch": 0.9908, "loss_ce": 0.031599823385477066, "loss_lvr": 0.3431900441646576, "loss_mode_switch": 0.0, "loss_total": 0.06591883301734924, "step": 2477 }, { "batch_size": 4, "epoch": 0.9908, "step": 2477, "tokens_per_device": 4452 }, { "epoch": 0.9908, "loss_ce": 0.03953758627176285, "loss_lvr": 0.6524122357368469, "loss_mode_switch": 0.0, "loss_total": 0.10477881133556366, "step": 2477 }, { "epoch": 0.9912, "grad_norm": 0.9128947257995605, "learning_rate": 2.0306367008138263e-09, "loss": 0.1638, "step": 2478 }, { "batch_size": 4, "epoch": 0.9912, "step": 2478, "tokens_per_device": 4672 }, { "epoch": 0.9912, "loss_ce": 0.0028800980653613806, "loss_lvr": 0.7330981492996216, "loss_mode_switch": 0.0, "loss_total": 0.07618991285562515, "step": 2478 }, { "batch_size": 1, "epoch": 0.9912, "step": 2478, "tokens_per_device": 4973 }, { "epoch": 0.9912, "loss_ce": 0.05127564072608948, "loss_lvr": 0.5245816111564636, "loss_mode_switch": 0.0, "loss_total": 0.10373380780220032, "step": 2478 }, { "batch_size": 1, "epoch": 0.9912, "step": 2478, "tokens_per_device": 4647 }, { "epoch": 0.9912, "loss_ce": 0.002223508432507515, "loss_lvr": 0.24001094698905945, "loss_mode_switch": 0.0, "loss_total": 0.02622460387647152, "step": 2478 }, { "batch_size": 1, "epoch": 0.9912, "step": 2478, "tokens_per_device": 4685 }, { "epoch": 0.9912, "loss_ce": 0.0009523530607111752, "loss_lvr": 0.2870565950870514, "loss_mode_switch": 0.0, "loss_total": 0.029658013954758644, "step": 2478 }, { "batch_size": 4, "epoch": 0.9912, "step": 2478, "tokens_per_device": 4248 }, { "epoch": 0.9912, "loss_ce": 0.1836918294429779, "loss_lvr": 0.9359234571456909, "loss_mode_switch": 0.0, "loss_total": 0.277284175157547, "step": 2478 }, { "batch_size": 4, "epoch": 0.9912, "step": 2478, "tokens_per_device": 2792 }, { "epoch": 0.9912, "loss_ce": 0.24412432312965393, "loss_lvr": 0.5937294960021973, "loss_mode_switch": 0.0, "loss_total": 0.3034972846508026, "step": 2478 }, { "batch_size": 1, "epoch": 0.9912, "step": 2478, "tokens_per_device": 5140 }, { "epoch": 0.9912, "loss_ce": 0.012714152224361897, "loss_lvr": 0.25110238790512085, "loss_mode_switch": 0.0, "loss_total": 0.037824392318725586, "step": 2478 }, { "batch_size": 4, "epoch": 0.9912, "step": 2478, "tokens_per_device": 2656 }, { "epoch": 0.9912, "loss_ce": 0.6246464252471924, "loss_lvr": 0.7821052074432373, "loss_mode_switch": 0.0, "loss_total": 0.7028569579124451, "step": 2478 }, { "epoch": 0.9916, "grad_norm": 1.258758783340454, "learning_rate": 1.8502400222047967e-09, "loss": 0.2665, "step": 2479 }, { "batch_size": 4, "epoch": 0.9916, "step": 2479, "tokens_per_device": 1536 }, { "epoch": 0.9916, "loss_ce": 0.3238877058029175, "loss_lvr": 0.9636191129684448, "loss_mode_switch": 0.0, "loss_total": 0.4202496111392975, "step": 2479 }, { "batch_size": 4, "epoch": 0.9916, "step": 2479, "tokens_per_device": 3296 }, { "epoch": 0.9916, "loss_ce": 0.001037479960359633, "loss_lvr": 0.4869428873062134, "loss_mode_switch": 0.0, "loss_total": 0.049731768667697906, "step": 2479 }, { "batch_size": 4, "epoch": 0.9916, "step": 2479, "tokens_per_device": 1408 }, { "epoch": 0.9916, "loss_ce": 0.2123686969280243, "loss_lvr": 0.9346711039543152, "loss_mode_switch": 0.0, "loss_total": 0.3058358132839203, "step": 2479 }, { "batch_size": 4, "epoch": 0.9916, "step": 2479, "tokens_per_device": 3788 }, { "epoch": 0.9916, "loss_ce": 0.08401301503181458, "loss_lvr": 1.0142327547073364, "loss_mode_switch": 0.0, "loss_total": 0.18543629348278046, "step": 2479 }, { "batch_size": 4, "epoch": 0.9916, "step": 2479, "tokens_per_device": 2696 }, { "epoch": 0.9916, "loss_ce": 0.27078917622566223, "loss_lvr": 0.6961488723754883, "loss_mode_switch": 0.0, "loss_total": 0.34040406346321106, "step": 2479 }, { "batch_size": 4, "epoch": 0.9916, "step": 2479, "tokens_per_device": 7184 }, { "epoch": 0.9916, "loss_ce": 0.3314146399497986, "loss_lvr": 0.8923493027687073, "loss_mode_switch": 0.0, "loss_total": 0.42064958810806274, "step": 2479 }, { "batch_size": 4, "epoch": 0.9916, "step": 2479, "tokens_per_device": 5732 }, { "epoch": 0.9916, "loss_ce": 0.1929035186767578, "loss_lvr": 0.7422131299972534, "loss_mode_switch": 0.0, "loss_total": 0.26712483167648315, "step": 2479 }, { "batch_size": 4, "epoch": 0.9916, "step": 2479, "tokens_per_device": 4404 }, { "epoch": 0.9916, "loss_ce": 0.21467402577400208, "loss_lvr": 1.2187130451202393, "loss_mode_switch": 0.0, "loss_total": 0.33654534816741943, "step": 2479 }, { "epoch": 0.992, "grad_norm": 0.9999026656150818, "learning_rate": 1.6782318658992159e-09, "loss": 0.211, "step": 2480 }, { "batch_size": 4, "epoch": 0.992, "step": 2480, "tokens_per_device": 1408 }, { "epoch": 0.992, "loss_ce": 0.15925748646259308, "loss_lvr": 0.9217253923416138, "loss_mode_switch": 0.0, "loss_total": 0.25143003463745117, "step": 2480 }, { "batch_size": 4, "epoch": 0.992, "step": 2480, "tokens_per_device": 8424 }, { "epoch": 0.992, "loss_ce": 0.05151979252696037, "loss_lvr": 0.7898661494255066, "loss_mode_switch": 0.0, "loss_total": 0.13050641119480133, "step": 2480 }, { "batch_size": 4, "epoch": 0.992, "step": 2480, "tokens_per_device": 8888 }, { "epoch": 0.992, "loss_ce": 0.004053716082125902, "loss_lvr": 0.7572407126426697, "loss_mode_switch": 0.0, "loss_total": 0.079777792096138, "step": 2480 }, { "batch_size": 4, "epoch": 0.992, "step": 2480, "tokens_per_device": 2596 }, { "epoch": 0.992, "loss_ce": 0.2874911427497864, "loss_lvr": 1.0229674577713013, "loss_mode_switch": 0.0, "loss_total": 0.389787882566452, "step": 2480 }, { "batch_size": 4, "epoch": 0.992, "step": 2480, "tokens_per_device": 3744 }, { "epoch": 0.992, "loss_ce": 0.08379096537828445, "loss_lvr": 1.0122392177581787, "loss_mode_switch": 0.0, "loss_total": 0.18501488864421844, "step": 2480 }, { "batch_size": 4, "epoch": 0.992, "step": 2480, "tokens_per_device": 3996 }, { "epoch": 0.992, "loss_ce": 0.16169512271881104, "loss_lvr": 0.8385128378868103, "loss_mode_switch": 0.0, "loss_total": 0.2455464005470276, "step": 2480 }, { "batch_size": 4, "epoch": 0.992, "step": 2480, "tokens_per_device": 3824 }, { "epoch": 0.992, "loss_ce": 0.06405165046453476, "loss_lvr": 0.8688533902168274, "loss_mode_switch": 0.0, "loss_total": 0.15093699097633362, "step": 2480 }, { "batch_size": 4, "epoch": 0.992, "step": 2480, "tokens_per_device": 5788 }, { "epoch": 0.992, "loss_ce": 0.02018648386001587, "loss_lvr": 0.6515012383460999, "loss_mode_switch": 0.0, "loss_total": 0.0853366106748581, "step": 2480 }, { "epoch": 0.9924, "grad_norm": 1.0551902055740356, "learning_rate": 1.5146125205822703e-09, "loss": 0.2055, "step": 2481 }, { "batch_size": 1, "epoch": 0.9924, "step": 2481, "tokens_per_device": 4903 }, { "epoch": 0.9924, "loss_ce": 0.018766365945339203, "loss_lvr": 0.339049756526947, "loss_mode_switch": 0.0, "loss_total": 0.052671343088150024, "step": 2481 }, { "batch_size": 4, "epoch": 0.9924, "step": 2481, "tokens_per_device": 4224 }, { "epoch": 0.9924, "loss_ce": 0.036042794585227966, "loss_lvr": 0.6784822940826416, "loss_mode_switch": 0.0, "loss_total": 0.10389102250337601, "step": 2481 }, { "batch_size": 4, "epoch": 0.9924, "step": 2481, "tokens_per_device": 7028 }, { "epoch": 0.9924, "loss_ce": 0.19814738631248474, "loss_lvr": 0.670708179473877, "loss_mode_switch": 0.0, "loss_total": 0.26521819829940796, "step": 2481 }, { "batch_size": 1, "epoch": 0.9924, "step": 2481, "tokens_per_device": 4747 }, { "epoch": 0.9924, "loss_ce": 0.012822345830500126, "loss_lvr": 0.31918784976005554, "loss_mode_switch": 0.0, "loss_total": 0.044741131365299225, "step": 2481 }, { "batch_size": 4, "epoch": 0.9924, "step": 2481, "tokens_per_device": 4224 }, { "epoch": 0.9924, "loss_ce": 0.23747527599334717, "loss_lvr": 0.7325005531311035, "loss_mode_switch": 0.0, "loss_total": 0.3107253313064575, "step": 2481 }, { "batch_size": 4, "epoch": 0.9924, "step": 2481, "tokens_per_device": 4272 }, { "epoch": 0.9924, "loss_ce": 0.143325537443161, "loss_lvr": 1.0195062160491943, "loss_mode_switch": 0.0, "loss_total": 0.24527615308761597, "step": 2481 }, { "batch_size": 4, "epoch": 0.9924, "step": 2481, "tokens_per_device": 5696 }, { "epoch": 0.9924, "loss_ce": 0.13707488775253296, "loss_lvr": 0.7972983121871948, "loss_mode_switch": 0.0, "loss_total": 0.21680471301078796, "step": 2481 }, { "batch_size": 1, "epoch": 0.9924, "step": 2481, "tokens_per_device": 4862 }, { "epoch": 0.9924, "loss_ce": 0.0077543011866509914, "loss_lvr": 0.4590110778808594, "loss_mode_switch": 0.0, "loss_total": 0.05365540832281113, "step": 2481 }, { "epoch": 0.9928, "grad_norm": 1.0404049158096313, "learning_rate": 1.3593822608609642e-09, "loss": 0.2083, "step": 2482 }, { "batch_size": 4, "epoch": 0.9928, "step": 2482, "tokens_per_device": 2772 }, { "epoch": 0.9928, "loss_ce": 0.03935742750763893, "loss_lvr": 1.6348319053649902, "loss_mode_switch": 0.0, "loss_total": 0.20284061133861542, "step": 2482 }, { "batch_size": 4, "epoch": 0.9928, "step": 2482, "tokens_per_device": 4832 }, { "epoch": 0.9928, "loss_ce": 0.12929774820804596, "loss_lvr": 0.8491864204406738, "loss_mode_switch": 0.0, "loss_total": 0.21421638131141663, "step": 2482 }, { "batch_size": 1, "epoch": 0.9928, "step": 2482, "tokens_per_device": 4676 }, { "epoch": 0.9928, "loss_ce": 0.020902540534734726, "loss_lvr": 0.7314968705177307, "loss_mode_switch": 0.0, "loss_total": 0.09405222535133362, "step": 2482 }, { "batch_size": 4, "epoch": 0.9928, "step": 2482, "tokens_per_device": 3236 }, { "epoch": 0.9928, "loss_ce": 0.11183825880289078, "loss_lvr": 0.7805176377296448, "loss_mode_switch": 0.0, "loss_total": 0.1898900270462036, "step": 2482 }, { "batch_size": 4, "epoch": 0.9928, "step": 2482, "tokens_per_device": 5288 }, { "epoch": 0.9928, "loss_ce": 0.05060697719454765, "loss_lvr": 0.8769868016242981, "loss_mode_switch": 0.0, "loss_total": 0.1383056640625, "step": 2482 }, { "batch_size": 4, "epoch": 0.9928, "step": 2482, "tokens_per_device": 4220 }, { "epoch": 0.9928, "loss_ce": 0.024936823174357414, "loss_lvr": 0.9350548982620239, "loss_mode_switch": 0.0, "loss_total": 0.11844231933355331, "step": 2482 }, { "batch_size": 1, "epoch": 0.9928, "step": 2482, "tokens_per_device": 5122 }, { "epoch": 0.9928, "loss_ce": 0.021932143718004227, "loss_lvr": 0.373380184173584, "loss_mode_switch": 0.0, "loss_total": 0.059270162135362625, "step": 2482 }, { "batch_size": 1, "epoch": 0.9928, "step": 2482, "tokens_per_device": 5161 }, { "epoch": 0.9928, "loss_ce": 0.0007712937076576054, "loss_lvr": 0.5374904274940491, "loss_mode_switch": 0.0, "loss_total": 0.05452033877372742, "step": 2482 }, { "epoch": 0.9932, "grad_norm": 1.109882116317749, "learning_rate": 1.2125413472613424e-09, "loss": 0.2048, "step": 2483 }, { "batch_size": 4, "epoch": 0.9932, "step": 2483, "tokens_per_device": 1256 }, { "epoch": 0.9932, "loss_ce": 0.17071270942687988, "loss_lvr": 0.662990391254425, "loss_mode_switch": 0.0, "loss_total": 0.23701176047325134, "step": 2483 }, { "batch_size": 4, "epoch": 0.9932, "step": 2483, "tokens_per_device": 1976 }, { "epoch": 0.9932, "loss_ce": 0.33673471212387085, "loss_lvr": 0.7921382784843445, "loss_mode_switch": 0.0, "loss_total": 0.4159485399723053, "step": 2483 }, { "batch_size": 1, "epoch": 0.9932, "step": 2483, "tokens_per_device": 5030 }, { "epoch": 0.9932, "loss_ce": 0.0037083174102008343, "loss_lvr": 0.5388385057449341, "loss_mode_switch": 0.0, "loss_total": 0.05759216845035553, "step": 2483 }, { "batch_size": 1, "epoch": 0.9932, "step": 2483, "tokens_per_device": 5022 }, { "epoch": 0.9932, "loss_ce": 0.07089877873659134, "loss_lvr": 0.38410311937332153, "loss_mode_switch": 0.0, "loss_total": 0.10930909216403961, "step": 2483 }, { "batch_size": 4, "epoch": 0.9932, "step": 2483, "tokens_per_device": 5488 }, { "epoch": 0.9932, "loss_ce": 0.2463870644569397, "loss_lvr": 0.8007410168647766, "loss_mode_switch": 0.0, "loss_total": 0.32646116614341736, "step": 2483 }, { "batch_size": 1, "epoch": 0.9932, "step": 2483, "tokens_per_device": 5023 }, { "epoch": 0.9932, "loss_ce": 0.0003734620404429734, "loss_lvr": 0.23911648988723755, "loss_mode_switch": 0.0, "loss_total": 0.02428511157631874, "step": 2483 }, { "batch_size": 4, "epoch": 0.9932, "step": 2483, "tokens_per_device": 5160 }, { "epoch": 0.9932, "loss_ce": 0.4801172614097595, "loss_lvr": 0.6374470591545105, "loss_mode_switch": 0.0, "loss_total": 0.543861985206604, "step": 2483 }, { "batch_size": 1, "epoch": 0.9932, "step": 2483, "tokens_per_device": 5192 }, { "epoch": 0.9932, "loss_ce": 0.008157525211572647, "loss_lvr": 0.32969993352890015, "loss_mode_switch": 0.0, "loss_total": 0.0411275178194046, "step": 2483 }, { "epoch": 0.9936, "grad_norm": 0.9916797876358032, "learning_rate": 1.074090026231267e-09, "loss": 0.2038, "step": 2484 }, { "batch_size": 1, "epoch": 0.9936, "step": 2484, "tokens_per_device": 4966 }, { "epoch": 0.9936, "loss_ce": 0.011702466756105423, "loss_lvr": 0.2116347998380661, "loss_mode_switch": 0.0, "loss_total": 0.03286594897508621, "step": 2484 }, { "batch_size": 4, "epoch": 0.9936, "step": 2484, "tokens_per_device": 4400 }, { "epoch": 0.9936, "loss_ce": 0.083933524787426, "loss_lvr": 1.579479694366455, "loss_mode_switch": 0.0, "loss_total": 0.24188148975372314, "step": 2484 }, { "batch_size": 4, "epoch": 0.9936, "step": 2484, "tokens_per_device": 4500 }, { "epoch": 0.9936, "loss_ce": 0.23506906628608704, "loss_lvr": 0.6046085953712463, "loss_mode_switch": 0.0, "loss_total": 0.29552993178367615, "step": 2484 }, { "batch_size": 4, "epoch": 0.9936, "step": 2484, "tokens_per_device": 4248 }, { "epoch": 0.9936, "loss_ce": 0.03122951090335846, "loss_lvr": 0.8668202757835388, "loss_mode_switch": 0.0, "loss_total": 0.11791153997182846, "step": 2484 }, { "batch_size": 1, "epoch": 0.9936, "step": 2484, "tokens_per_device": 5118 }, { "epoch": 0.9936, "loss_ce": 0.00842319242656231, "loss_lvr": 0.18122448027133942, "loss_mode_switch": 0.0, "loss_total": 0.02654564008116722, "step": 2484 }, { "batch_size": 1, "epoch": 0.9936, "step": 2484, "tokens_per_device": 4940 }, { "epoch": 0.9936, "loss_ce": 0.00875540729612112, "loss_lvr": 0.456840455532074, "loss_mode_switch": 0.0, "loss_total": 0.05443945527076721, "step": 2484 }, { "batch_size": 1, "epoch": 0.9936, "step": 2484, "tokens_per_device": 4563 }, { "epoch": 0.9936, "loss_ce": 0.00046307200682349503, "loss_lvr": 0.3694153428077698, "loss_mode_switch": 0.0, "loss_total": 0.037404607981443405, "step": 2484 }, { "batch_size": 4, "epoch": 0.9936, "step": 2484, "tokens_per_device": 3996 }, { "epoch": 0.9936, "loss_ce": 0.16347810626029968, "loss_lvr": 0.6761102080345154, "loss_mode_switch": 0.0, "loss_total": 0.23108913004398346, "step": 2484 }, { "epoch": 0.994, "grad_norm": 1.07080078125, "learning_rate": 9.440285301370865e-10, "loss": 0.179, "step": 2485 }, { "batch_size": 4, "epoch": 0.994, "step": 2485, "tokens_per_device": 3804 }, { "epoch": 0.994, "loss_ce": 0.25672438740730286, "loss_lvr": 0.9440053105354309, "loss_mode_switch": 0.0, "loss_total": 0.35112491250038147, "step": 2485 }, { "batch_size": 4, "epoch": 0.994, "step": 2485, "tokens_per_device": 12412 }, { "epoch": 0.994, "loss_ce": 0.09201422333717346, "loss_lvr": 0.8524813055992126, "loss_mode_switch": 0.0, "loss_total": 0.17726236581802368, "step": 2485 }, { "batch_size": 4, "epoch": 0.994, "step": 2485, "tokens_per_device": 11176 }, { "epoch": 0.994, "loss_ce": 0.10285452753305435, "loss_lvr": 0.9114397764205933, "loss_mode_switch": 0.0, "loss_total": 0.1939985156059265, "step": 2485 }, { "batch_size": 4, "epoch": 0.994, "step": 2485, "tokens_per_device": 2560 }, { "epoch": 0.994, "loss_ce": 0.3068743348121643, "loss_lvr": 0.8060861229896545, "loss_mode_switch": 0.0, "loss_total": 0.3874829411506653, "step": 2485 }, { "batch_size": 4, "epoch": 0.994, "step": 2485, "tokens_per_device": 5684 }, { "epoch": 0.994, "loss_ce": 0.13980913162231445, "loss_lvr": 0.5826231837272644, "loss_mode_switch": 0.0, "loss_total": 0.1980714499950409, "step": 2485 }, { "batch_size": 4, "epoch": 0.994, "step": 2485, "tokens_per_device": 3796 }, { "epoch": 0.994, "loss_ce": 0.38975802063941956, "loss_lvr": 0.8314123749732971, "loss_mode_switch": 0.0, "loss_total": 0.47289925813674927, "step": 2485 }, { "batch_size": 4, "epoch": 0.994, "step": 2485, "tokens_per_device": 5784 }, { "epoch": 0.994, "loss_ce": 0.003498898819088936, "loss_lvr": 0.6335561275482178, "loss_mode_switch": 0.0, "loss_total": 0.06685451418161392, "step": 2485 }, { "batch_size": 4, "epoch": 0.994, "step": 2485, "tokens_per_device": 3768 }, { "epoch": 0.994, "loss_ce": 0.2804648280143738, "loss_lvr": 0.7662109136581421, "loss_mode_switch": 0.0, "loss_total": 0.3570859134197235, "step": 2485 }, { "epoch": 0.9944, "grad_norm": 1.0743327140808105, "learning_rate": 8.223570772636358e-10, "loss": 0.2233, "step": 2486 }, { "batch_size": 4, "epoch": 0.9944, "step": 2486, "tokens_per_device": 3796 }, { "epoch": 0.9944, "loss_ce": 0.20513392984867096, "loss_lvr": 0.6441428065299988, "loss_mode_switch": 0.0, "loss_total": 0.2695482075214386, "step": 2486 }, { "batch_size": 1, "epoch": 0.9944, "step": 2486, "tokens_per_device": 5164 }, { "epoch": 0.9944, "loss_ce": 0.002002598252147436, "loss_lvr": 0.3524029552936554, "loss_mode_switch": 0.0, "loss_total": 0.03724289312958717, "step": 2486 }, { "batch_size": 1, "epoch": 0.9944, "step": 2486, "tokens_per_device": 5019 }, { "epoch": 0.9944, "loss_ce": 0.003918044734746218, "loss_lvr": 0.4661986529827118, "loss_mode_switch": 0.0, "loss_total": 0.05053791031241417, "step": 2486 }, { "batch_size": 1, "epoch": 0.9944, "step": 2486, "tokens_per_device": 5194 }, { "epoch": 0.9944, "loss_ce": 0.1677827537059784, "loss_lvr": 0.16071172058582306, "loss_mode_switch": 0.0, "loss_total": 0.18385392427444458, "step": 2486 }, { "batch_size": 4, "epoch": 0.9944, "step": 2486, "tokens_per_device": 4212 }, { "epoch": 0.9944, "loss_ce": 0.09471206367015839, "loss_lvr": 0.850111186504364, "loss_mode_switch": 0.0, "loss_total": 0.17972317337989807, "step": 2486 }, { "batch_size": 4, "epoch": 0.9944, "step": 2486, "tokens_per_device": 5584 }, { "epoch": 0.9944, "loss_ce": 0.032821230590343475, "loss_lvr": 0.7002277374267578, "loss_mode_switch": 0.0, "loss_total": 0.1028440073132515, "step": 2486 }, { "batch_size": 1, "epoch": 0.9944, "step": 2486, "tokens_per_device": 4886 }, { "epoch": 0.9944, "loss_ce": 0.006432813126593828, "loss_lvr": 1.466247320175171, "loss_mode_switch": 0.0, "loss_total": 0.1530575454235077, "step": 2486 }, { "batch_size": 4, "epoch": 0.9944, "step": 2486, "tokens_per_device": 4884 }, { "epoch": 0.9944, "loss_ce": 0.31591665744781494, "loss_lvr": 0.7765942811965942, "loss_mode_switch": 0.0, "loss_total": 0.39357608556747437, "step": 2486 }, { "epoch": 0.9948, "grad_norm": 1.1057193279266357, "learning_rate": 7.090758718153457e-10, "loss": 0.2202, "step": 2487 }, { "batch_size": 4, "epoch": 0.9948, "step": 2487, "tokens_per_device": 3844 }, { "epoch": 0.9948, "loss_ce": 0.3494844436645508, "loss_lvr": 0.990458071231842, "loss_mode_switch": 0.0, "loss_total": 0.44853025674819946, "step": 2487 }, { "batch_size": 4, "epoch": 0.9948, "step": 2487, "tokens_per_device": 5828 }, { "epoch": 0.9948, "loss_ce": 0.18200267851352692, "loss_lvr": 0.804426372051239, "loss_mode_switch": 0.0, "loss_total": 0.262445330619812, "step": 2487 }, { "batch_size": 1, "epoch": 0.9948, "step": 2487, "tokens_per_device": 4705 }, { "epoch": 0.9948, "loss_ce": 0.004952756687998772, "loss_lvr": 0.4134751558303833, "loss_mode_switch": 0.0, "loss_total": 0.04630026966333389, "step": 2487 }, { "batch_size": 1, "epoch": 0.9948, "step": 2487, "tokens_per_device": 5152 }, { "epoch": 0.9948, "loss_ce": 6.64793697069399e-05, "loss_lvr": 0.29871150851249695, "loss_mode_switch": 0.0, "loss_total": 0.029937630519270897, "step": 2487 }, { "batch_size": 1, "epoch": 0.9948, "step": 2487, "tokens_per_device": 5118 }, { "epoch": 0.9948, "loss_ce": 0.00048078910913318396, "loss_lvr": 0.5066564679145813, "loss_mode_switch": 0.0, "loss_total": 0.05114643648266792, "step": 2487 }, { "batch_size": 4, "epoch": 0.9948, "step": 2487, "tokens_per_device": 4288 }, { "epoch": 0.9948, "loss_ce": 0.16318339109420776, "loss_lvr": 0.9474213123321533, "loss_mode_switch": 0.0, "loss_total": 0.25792551040649414, "step": 2487 }, { "batch_size": 4, "epoch": 0.9948, "step": 2487, "tokens_per_device": 2676 }, { "epoch": 0.9948, "loss_ce": 0.4440042972564697, "loss_lvr": 0.8661339282989502, "loss_mode_switch": 0.0, "loss_total": 0.5306177139282227, "step": 2487 }, { "batch_size": 4, "epoch": 0.9948, "step": 2487, "tokens_per_device": 1528 }, { "epoch": 0.9948, "loss_ce": 0.22063902020454407, "loss_lvr": 0.8835663199424744, "loss_mode_switch": 0.0, "loss_total": 0.30899566411972046, "step": 2487 }, { "epoch": 0.9952, "grad_norm": 1.1533689498901367, "learning_rate": 6.041851039151337e-10, "loss": 0.2079, "step": 2488 }, { "batch_size": 1, "epoch": 0.9952, "step": 2488, "tokens_per_device": 5098 }, { "epoch": 0.9952, "loss_ce": 0.0012766156578436494, "loss_lvr": 0.1992715299129486, "loss_mode_switch": 0.0, "loss_total": 0.021203769370913506, "step": 2488 }, { "batch_size": 4, "epoch": 0.9952, "step": 2488, "tokens_per_device": 4212 }, { "epoch": 0.9952, "loss_ce": 0.5372499823570251, "loss_lvr": 0.9900516867637634, "loss_mode_switch": 0.0, "loss_total": 0.636255145072937, "step": 2488 }, { "batch_size": 1, "epoch": 0.9952, "step": 2488, "tokens_per_device": 5120 }, { "epoch": 0.9952, "loss_ce": 0.05661411210894585, "loss_lvr": 0.5872557163238525, "loss_mode_switch": 0.0, "loss_total": 0.11533968150615692, "step": 2488 }, { "batch_size": 4, "epoch": 0.9952, "step": 2488, "tokens_per_device": 4652 }, { "epoch": 0.9952, "loss_ce": 0.03503473103046417, "loss_lvr": 0.6285389065742493, "loss_mode_switch": 0.0, "loss_total": 0.09788862615823746, "step": 2488 }, { "batch_size": 1, "epoch": 0.9952, "step": 2488, "tokens_per_device": 4825 }, { "epoch": 0.9952, "loss_ce": 0.13109655678272247, "loss_lvr": 0.21997644007205963, "loss_mode_switch": 0.0, "loss_total": 0.15309420228004456, "step": 2488 }, { "batch_size": 4, "epoch": 0.9952, "step": 2488, "tokens_per_device": 2684 }, { "epoch": 0.9952, "loss_ce": 0.3943134546279907, "loss_lvr": 0.804668664932251, "loss_mode_switch": 0.0, "loss_total": 0.4747803211212158, "step": 2488 }, { "batch_size": 1, "epoch": 0.9952, "step": 2488, "tokens_per_device": 5191 }, { "epoch": 0.9952, "loss_ce": 0.07659449428319931, "loss_lvr": 0.4687829613685608, "loss_mode_switch": 0.0, "loss_total": 0.12347279489040375, "step": 2488 }, { "batch_size": 4, "epoch": 0.9952, "step": 2488, "tokens_per_device": 1408 }, { "epoch": 0.9952, "loss_ce": 0.4876919090747833, "loss_lvr": 0.9466042518615723, "loss_mode_switch": 0.0, "loss_total": 0.582352340221405, "step": 2488 }, { "epoch": 0.9956, "grad_norm": 1.1274446249008179, "learning_rate": 5.076849496044034e-10, "loss": 0.2228, "step": 2489 }, { "batch_size": 4, "epoch": 0.9956, "step": 2489, "tokens_per_device": 3764 }, { "epoch": 0.9956, "loss_ce": 0.013489277102053165, "loss_lvr": 0.8593906164169312, "loss_mode_switch": 0.0, "loss_total": 0.09942834079265594, "step": 2489 }, { "batch_size": 4, "epoch": 0.9956, "step": 2489, "tokens_per_device": 4248 }, { "epoch": 0.9956, "loss_ce": 0.14346349239349365, "loss_lvr": 0.7682384848594666, "loss_mode_switch": 0.0, "loss_total": 0.22028735280036926, "step": 2489 }, { "batch_size": 1, "epoch": 0.9956, "step": 2489, "tokens_per_device": 5171 }, { "epoch": 0.9956, "loss_ce": 0.03255540877580643, "loss_lvr": 0.5299840569496155, "loss_mode_switch": 0.0, "loss_total": 0.08555381000041962, "step": 2489 }, { "batch_size": 4, "epoch": 0.9956, "step": 2489, "tokens_per_device": 4384 }, { "epoch": 0.9956, "loss_ce": 0.4714028537273407, "loss_lvr": 0.6936239004135132, "loss_mode_switch": 0.0, "loss_total": 0.5407652258872986, "step": 2489 }, { "batch_size": 1, "epoch": 0.9956, "step": 2489, "tokens_per_device": 4882 }, { "epoch": 0.9956, "loss_ce": 0.007804756984114647, "loss_lvr": 0.6670375466346741, "loss_mode_switch": 0.0, "loss_total": 0.07450851798057556, "step": 2489 }, { "batch_size": 1, "epoch": 0.9956, "step": 2489, "tokens_per_device": 4761 }, { "epoch": 0.9956, "loss_ce": 0.024609209969639778, "loss_lvr": 0.6688727736473083, "loss_mode_switch": 0.0, "loss_total": 0.09149648994207382, "step": 2489 }, { "batch_size": 4, "epoch": 0.9956, "step": 2489, "tokens_per_device": 3796 }, { "epoch": 0.9956, "loss_ce": 0.042220503091812134, "loss_lvr": 0.9434973001480103, "loss_mode_switch": 0.0, "loss_total": 0.13657024502754211, "step": 2489 }, { "batch_size": 1, "epoch": 0.9956, "step": 2489, "tokens_per_device": 5000 }, { "epoch": 0.9956, "loss_ce": 0.005841060075908899, "loss_lvr": 0.49920451641082764, "loss_mode_switch": 0.0, "loss_total": 0.055761512368917465, "step": 2489 }, { "epoch": 0.996, "grad_norm": 1.0739825963974, "learning_rate": 4.1957557084082447e-10, "loss": 0.203, "step": 2490 }, { "batch_size": 1, "epoch": 0.996, "step": 2490, "tokens_per_device": 5924 }, { "epoch": 0.996, "loss_ce": 0.01123250462114811, "loss_lvr": 0.3995158076286316, "loss_mode_switch": 0.0, "loss_total": 0.05118408799171448, "step": 2490 }, { "batch_size": 4, "epoch": 0.996, "step": 2490, "tokens_per_device": 2520 }, { "epoch": 0.996, "loss_ce": 0.14252863824367523, "loss_lvr": 0.9118850827217102, "loss_mode_switch": 0.0, "loss_total": 0.233717143535614, "step": 2490 }, { "batch_size": 4, "epoch": 0.996, "step": 2490, "tokens_per_device": 1648 }, { "epoch": 0.996, "loss_ce": 0.3437906801700592, "loss_lvr": 0.8096789717674255, "loss_mode_switch": 0.0, "loss_total": 0.42475858330726624, "step": 2490 }, { "batch_size": 4, "epoch": 0.996, "step": 2490, "tokens_per_device": 1732 }, { "epoch": 0.996, "loss_ce": 0.3941829800605774, "loss_lvr": 0.8680858016014099, "loss_mode_switch": 0.0, "loss_total": 0.48099157214164734, "step": 2490 }, { "batch_size": 4, "epoch": 0.996, "step": 2490, "tokens_per_device": 4836 }, { "epoch": 0.996, "loss_ce": 0.31331950426101685, "loss_lvr": 0.6473837494850159, "loss_mode_switch": 0.0, "loss_total": 0.37805789709091187, "step": 2490 }, { "batch_size": 4, "epoch": 0.996, "step": 2490, "tokens_per_device": 2832 }, { "epoch": 0.996, "loss_ce": 0.1521364152431488, "loss_lvr": 0.6586006283760071, "loss_mode_switch": 0.0, "loss_total": 0.2179964780807495, "step": 2490 }, { "batch_size": 4, "epoch": 0.996, "step": 2490, "tokens_per_device": 4164 }, { "epoch": 0.996, "loss_ce": 0.1280357837677002, "loss_lvr": 0.808982789516449, "loss_mode_switch": 0.0, "loss_total": 0.20893406867980957, "step": 2490 }, { "batch_size": 4, "epoch": 0.996, "step": 2490, "tokens_per_device": 4300 }, { "epoch": 0.996, "loss_ce": 0.3828859031200409, "loss_lvr": 0.7641534209251404, "loss_mode_switch": 0.0, "loss_total": 0.459301233291626, "step": 2490 }, { "epoch": 0.9964, "grad_norm": 1.0886967182159424, "learning_rate": 3.398571155011077e-10, "loss": 0.207, "step": 2491 }, { "batch_size": 4, "epoch": 0.9964, "step": 2491, "tokens_per_device": 3176 }, { "epoch": 0.9964, "loss_ce": 0.1702243536710739, "loss_lvr": 0.7760024070739746, "loss_mode_switch": 0.0, "loss_total": 0.24782459437847137, "step": 2491 }, { "batch_size": 4, "epoch": 0.9964, "step": 2491, "tokens_per_device": 6120 }, { "epoch": 0.9964, "loss_ce": 0.015247007831931114, "loss_lvr": 0.6023030877113342, "loss_mode_switch": 0.0, "loss_total": 0.07547731697559357, "step": 2491 }, { "batch_size": 4, "epoch": 0.9964, "step": 2491, "tokens_per_device": 2800 }, { "epoch": 0.9964, "loss_ce": 0.05698929727077484, "loss_lvr": 0.7371618151664734, "loss_mode_switch": 0.0, "loss_total": 0.13070547580718994, "step": 2491 }, { "batch_size": 4, "epoch": 0.9964, "step": 2491, "tokens_per_device": 9616 }, { "epoch": 0.9964, "loss_ce": 0.21219982206821442, "loss_lvr": 0.7410491704940796, "loss_mode_switch": 0.0, "loss_total": 0.2863047420978546, "step": 2491 }, { "batch_size": 1, "epoch": 0.9964, "step": 2491, "tokens_per_device": 5049 }, { "epoch": 0.9964, "loss_ce": 0.011708212085068226, "loss_lvr": 0.29001158475875854, "loss_mode_switch": 0.0, "loss_total": 0.040709372609853745, "step": 2491 }, { "batch_size": 4, "epoch": 0.9964, "step": 2491, "tokens_per_device": 4364 }, { "epoch": 0.9964, "loss_ce": 0.20896799862384796, "loss_lvr": 1.072544813156128, "loss_mode_switch": 0.0, "loss_total": 0.31622248888015747, "step": 2491 }, { "batch_size": 4, "epoch": 0.9964, "step": 2491, "tokens_per_device": 5220 }, { "epoch": 0.9964, "loss_ce": 0.14533354341983795, "loss_lvr": 0.8413485884666443, "loss_mode_switch": 0.0, "loss_total": 0.22946840524673462, "step": 2491 }, { "batch_size": 4, "epoch": 0.9964, "step": 2491, "tokens_per_device": 5392 }, { "epoch": 0.9964, "loss_ce": 0.04474305734038353, "loss_lvr": 0.7503420114517212, "loss_mode_switch": 0.0, "loss_total": 0.11977726221084595, "step": 2491 }, { "epoch": 0.9968, "grad_norm": 1.0449391603469849, "learning_rate": 2.6852971737878483e-10, "loss": 0.1958, "step": 2492 }, { "batch_size": 1, "epoch": 0.9968, "step": 2492, "tokens_per_device": 5185 }, { "epoch": 0.9968, "loss_ce": 0.00826321542263031, "loss_lvr": 0.2093496173620224, "loss_mode_switch": 0.0, "loss_total": 0.02919817715883255, "step": 2492 }, { "batch_size": 4, "epoch": 0.9968, "step": 2492, "tokens_per_device": 4816 }, { "epoch": 0.9968, "loss_ce": 0.014673051424324512, "loss_lvr": 0.8441537618637085, "loss_mode_switch": 0.0, "loss_total": 0.09908843040466309, "step": 2492 }, { "batch_size": 4, "epoch": 0.9968, "step": 2492, "tokens_per_device": 4228 }, { "epoch": 0.9968, "loss_ce": 0.19157904386520386, "loss_lvr": 1.487900972366333, "loss_mode_switch": 0.0, "loss_total": 0.34036916494369507, "step": 2492 }, { "batch_size": 4, "epoch": 0.9968, "step": 2492, "tokens_per_device": 5904 }, { "epoch": 0.9968, "loss_ce": 0.027696244418621063, "loss_lvr": 0.6922003030776978, "loss_mode_switch": 0.0, "loss_total": 0.09691627323627472, "step": 2492 }, { "batch_size": 4, "epoch": 0.9968, "step": 2492, "tokens_per_device": 4436 }, { "epoch": 0.9968, "loss_ce": 0.4133356213569641, "loss_lvr": 0.8943199515342712, "loss_mode_switch": 0.0, "loss_total": 0.5027676224708557, "step": 2492 }, { "batch_size": 4, "epoch": 0.9968, "step": 2492, "tokens_per_device": 9180 }, { "epoch": 0.9968, "loss_ce": 0.3689344525337219, "loss_lvr": 0.597740888595581, "loss_mode_switch": 0.0, "loss_total": 0.428708553314209, "step": 2492 }, { "batch_size": 1, "epoch": 0.9968, "step": 2492, "tokens_per_device": 4953 }, { "epoch": 0.9968, "loss_ce": 0.012762906029820442, "loss_lvr": 0.5324525833129883, "loss_mode_switch": 0.0, "loss_total": 0.06600816547870636, "step": 2492 }, { "batch_size": 4, "epoch": 0.9968, "step": 2492, "tokens_per_device": 2568 }, { "epoch": 0.9968, "loss_ce": 0.22103945910930634, "loss_lvr": 0.9739447236061096, "loss_mode_switch": 0.0, "loss_total": 0.318433940410614, "step": 2492 }, { "epoch": 0.9972, "grad_norm": 1.074914813041687, "learning_rate": 2.0559349618420877e-10, "loss": 0.2266, "step": 2493 }, { "batch_size": 4, "epoch": 0.9972, "step": 2493, "tokens_per_device": 6448 }, { "epoch": 0.9972, "loss_ce": 0.18564440310001373, "loss_lvr": 0.7788164615631104, "loss_mode_switch": 0.0, "loss_total": 0.263526052236557, "step": 2493 }, { "batch_size": 4, "epoch": 0.9972, "step": 2493, "tokens_per_device": 3596 }, { "epoch": 0.9972, "loss_ce": 0.06839519739151001, "loss_lvr": 0.8507934808731079, "loss_mode_switch": 0.0, "loss_total": 0.15347453951835632, "step": 2493 }, { "batch_size": 4, "epoch": 0.9972, "step": 2493, "tokens_per_device": 4008 }, { "epoch": 0.9972, "loss_ce": 0.025192061439156532, "loss_lvr": 1.343485951423645, "loss_mode_switch": 0.0, "loss_total": 0.15954066812992096, "step": 2493 }, { "batch_size": 1, "epoch": 0.9972, "step": 2493, "tokens_per_device": 5158 }, { "epoch": 0.9972, "loss_ce": 0.014951161108911037, "loss_lvr": 0.4712570905685425, "loss_mode_switch": 0.0, "loss_total": 0.0620768703520298, "step": 2493 }, { "batch_size": 1, "epoch": 0.9972, "step": 2493, "tokens_per_device": 4902 }, { "epoch": 0.9972, "loss_ce": 0.01059188786894083, "loss_lvr": 0.5969311594963074, "loss_mode_switch": 0.0, "loss_total": 0.07028500735759735, "step": 2493 }, { "batch_size": 1, "epoch": 0.9972, "step": 2493, "tokens_per_device": 5155 }, { "epoch": 0.9972, "loss_ce": 0.00011696166620822623, "loss_lvr": 0.4190587103366852, "loss_mode_switch": 0.0, "loss_total": 0.042022835463285446, "step": 2493 }, { "batch_size": 4, "epoch": 0.9972, "step": 2493, "tokens_per_device": 9380 }, { "epoch": 0.9972, "loss_ce": 0.08413418382406235, "loss_lvr": 0.5597439408302307, "loss_mode_switch": 0.0, "loss_total": 0.14010858535766602, "step": 2493 }, { "batch_size": 1, "epoch": 0.9972, "step": 2493, "tokens_per_device": 4861 }, { "epoch": 0.9972, "loss_ce": 0.001436265534721315, "loss_lvr": 0.35592204332351685, "loss_mode_switch": 0.0, "loss_total": 0.0370284728705883, "step": 2493 }, { "epoch": 0.9976, "grad_norm": 0.9565489292144775, "learning_rate": 1.5104855754566329e-10, "loss": 0.2117, "step": 2494 }, { "batch_size": 4, "epoch": 0.9976, "step": 2494, "tokens_per_device": 7080 }, { "epoch": 0.9976, "loss_ce": 0.1563449203968048, "loss_lvr": 0.8969902992248535, "loss_mode_switch": 0.0, "loss_total": 0.24604395031929016, "step": 2494 }, { "batch_size": 1, "epoch": 0.9976, "step": 2494, "tokens_per_device": 5068 }, { "epoch": 0.9976, "loss_ce": 0.0016711094649508595, "loss_lvr": 0.4409726560115814, "loss_mode_switch": 0.0, "loss_total": 0.0457683764398098, "step": 2494 }, { "batch_size": 1, "epoch": 0.9976, "step": 2494, "tokens_per_device": 4142 }, { "epoch": 0.9976, "loss_ce": 0.007806335110217333, "loss_lvr": 0.450893759727478, "loss_mode_switch": 0.0, "loss_total": 0.05289570987224579, "step": 2494 }, { "batch_size": 4, "epoch": 0.9976, "step": 2494, "tokens_per_device": 4852 }, { "epoch": 0.9976, "loss_ce": 0.1441798061132431, "loss_lvr": 0.7987033128738403, "loss_mode_switch": 0.0, "loss_total": 0.2240501344203949, "step": 2494 }, { "batch_size": 1, "epoch": 0.9976, "step": 2494, "tokens_per_device": 5027 }, { "epoch": 0.9976, "loss_ce": 0.0717584639787674, "loss_lvr": 0.7041550278663635, "loss_mode_switch": 0.0, "loss_total": 0.14217397570610046, "step": 2494 }, { "batch_size": 4, "epoch": 0.9976, "step": 2494, "tokens_per_device": 4208 }, { "epoch": 0.9976, "loss_ce": 0.06096469983458519, "loss_lvr": 0.7679390907287598, "loss_mode_switch": 0.0, "loss_total": 0.13775861263275146, "step": 2494 }, { "batch_size": 4, "epoch": 0.9976, "step": 2494, "tokens_per_device": 3300 }, { "epoch": 0.9976, "loss_ce": 0.19121746718883514, "loss_lvr": 0.4683781862258911, "loss_mode_switch": 0.0, "loss_total": 0.2380552887916565, "step": 2494 }, { "batch_size": 4, "epoch": 0.9976, "step": 2494, "tokens_per_device": 3768 }, { "epoch": 0.9976, "loss_ce": 0.03874315321445465, "loss_lvr": 0.8530138731002808, "loss_mode_switch": 0.0, "loss_total": 0.12404454499483109, "step": 2494 }, { "epoch": 0.998, "grad_norm": 0.9755903482437134, "learning_rate": 1.0489499300603279e-10, "loss": 0.2104, "step": 2495 }, { "batch_size": 1, "epoch": 0.998, "step": 2495, "tokens_per_device": 5082 }, { "epoch": 0.998, "loss_ce": 0.0024146828800439835, "loss_lvr": 0.577283501625061, "loss_mode_switch": 0.0, "loss_total": 0.06014303117990494, "step": 2495 }, { "batch_size": 4, "epoch": 0.998, "step": 2495, "tokens_per_device": 4500 }, { "epoch": 0.998, "loss_ce": 0.19997794926166534, "loss_lvr": 0.8613188862800598, "loss_mode_switch": 0.0, "loss_total": 0.2861098349094391, "step": 2495 }, { "batch_size": 4, "epoch": 0.998, "step": 2495, "tokens_per_device": 2648 }, { "epoch": 0.998, "loss_ce": 0.30929964780807495, "loss_lvr": 0.8313108086585999, "loss_mode_switch": 0.0, "loss_total": 0.39243072271347046, "step": 2495 }, { "batch_size": 4, "epoch": 0.998, "step": 2495, "tokens_per_device": 1620 }, { "epoch": 0.998, "loss_ce": 0.41193854808807373, "loss_lvr": 0.9505320191383362, "loss_mode_switch": 0.0, "loss_total": 0.5069917440414429, "step": 2495 }, { "batch_size": 4, "epoch": 0.998, "step": 2495, "tokens_per_device": 8852 }, { "epoch": 0.998, "loss_ce": 0.30561718344688416, "loss_lvr": 0.3771789073944092, "loss_mode_switch": 0.0, "loss_total": 0.3433350622653961, "step": 2495 }, { "batch_size": 4, "epoch": 0.998, "step": 2495, "tokens_per_device": 12664 }, { "epoch": 0.998, "loss_ce": 0.03248018026351929, "loss_lvr": 0.7768509387969971, "loss_mode_switch": 0.0, "loss_total": 0.11016527563333511, "step": 2495 }, { "batch_size": 1, "epoch": 0.998, "step": 2495, "tokens_per_device": 4880 }, { "epoch": 0.998, "loss_ce": 0.0072418516501784325, "loss_lvr": 0.3276576101779938, "loss_mode_switch": 0.0, "loss_total": 0.040007613599300385, "step": 2495 }, { "batch_size": 4, "epoch": 0.998, "step": 2495, "tokens_per_device": 4320 }, { "epoch": 0.998, "loss_ce": 0.013624780811369419, "loss_lvr": 1.3267252445220947, "loss_mode_switch": 0.0, "loss_total": 0.14629732072353363, "step": 2495 }, { "epoch": 0.9984, "grad_norm": 1.1915756464004517, "learning_rate": 6.713288002724305e-11, "loss": 0.2335, "step": 2496 }, { "batch_size": 4, "epoch": 0.9984, "step": 2496, "tokens_per_device": 4244 }, { "epoch": 0.9984, "loss_ce": 0.42580264806747437, "loss_lvr": 1.069100260734558, "loss_mode_switch": 0.0, "loss_total": 0.5327126979827881, "step": 2496 }, { "batch_size": 1, "epoch": 0.9984, "step": 2496, "tokens_per_device": 5023 }, { "epoch": 0.9984, "loss_ce": 0.0558958537876606, "loss_lvr": 0.40510526299476624, "loss_mode_switch": 0.0, "loss_total": 0.09640638530254364, "step": 2496 }, { "batch_size": 4, "epoch": 0.9984, "step": 2496, "tokens_per_device": 7912 }, { "epoch": 0.9984, "loss_ce": 0.12230399250984192, "loss_lvr": 0.8917637467384338, "loss_mode_switch": 0.0, "loss_total": 0.21148037910461426, "step": 2496 }, { "batch_size": 4, "epoch": 0.9984, "step": 2496, "tokens_per_device": 4840 }, { "epoch": 0.9984, "loss_ce": 0.30234023928642273, "loss_lvr": 0.9454525113105774, "loss_mode_switch": 0.0, "loss_total": 0.396885484457016, "step": 2496 }, { "batch_size": 1, "epoch": 0.9984, "step": 2496, "tokens_per_device": 4880 }, { "epoch": 0.9984, "loss_ce": 0.004193244036287069, "loss_lvr": 0.3017401099205017, "loss_mode_switch": 0.0, "loss_total": 0.03436725586652756, "step": 2496 }, { "batch_size": 4, "epoch": 0.9984, "step": 2496, "tokens_per_device": 3816 }, { "epoch": 0.9984, "loss_ce": 0.2814820110797882, "loss_lvr": 0.6993744969367981, "loss_mode_switch": 0.0, "loss_total": 0.35141944885253906, "step": 2496 }, { "batch_size": 4, "epoch": 0.9984, "step": 2496, "tokens_per_device": 3040 }, { "epoch": 0.9984, "loss_ce": 0.01768193021416664, "loss_lvr": 0.4233928620815277, "loss_mode_switch": 0.0, "loss_total": 0.06002121791243553, "step": 2496 }, { "batch_size": 4, "epoch": 0.9984, "step": 2496, "tokens_per_device": 1512 }, { "epoch": 0.9984, "loss_ce": 0.2207295447587967, "loss_lvr": 0.778470516204834, "loss_mode_switch": 0.0, "loss_total": 0.29857659339904785, "step": 2496 }, { "epoch": 0.9988, "grad_norm": 1.191342830657959, "learning_rate": 3.776228198526521e-11, "loss": 0.2133, "step": 2497 }, { "batch_size": 4, "epoch": 0.9988, "step": 2497, "tokens_per_device": 9276 }, { "epoch": 0.9988, "loss_ce": 0.015265953727066517, "loss_lvr": 0.34949561953544617, "loss_mode_switch": 0.0, "loss_total": 0.05021551623940468, "step": 2497 }, { "batch_size": 4, "epoch": 0.9988, "step": 2497, "tokens_per_device": 4360 }, { "epoch": 0.9988, "loss_ce": 0.20463678240776062, "loss_lvr": 0.8933588862419128, "loss_mode_switch": 0.0, "loss_total": 0.2939726710319519, "step": 2497 }, { "batch_size": 4, "epoch": 0.9988, "step": 2497, "tokens_per_device": 5736 }, { "epoch": 0.9988, "loss_ce": 0.3040779232978821, "loss_lvr": 0.7300752401351929, "loss_mode_switch": 0.0, "loss_total": 0.37708544731140137, "step": 2497 }, { "batch_size": 4, "epoch": 0.9988, "step": 2497, "tokens_per_device": 4296 }, { "epoch": 0.9988, "loss_ce": 0.27382341027259827, "loss_lvr": 0.8373166918754578, "loss_mode_switch": 0.0, "loss_total": 0.357555091381073, "step": 2497 }, { "batch_size": 1, "epoch": 0.9988, "step": 2497, "tokens_per_device": 4877 }, { "epoch": 0.9988, "loss_ce": 0.1877189576625824, "loss_lvr": 0.5037144422531128, "loss_mode_switch": 0.0, "loss_total": 0.2380903959274292, "step": 2497 }, { "batch_size": 1, "epoch": 0.9988, "step": 2497, "tokens_per_device": 4428 }, { "epoch": 0.9988, "loss_ce": 0.03171434625983238, "loss_lvr": 0.4734419584274292, "loss_mode_switch": 0.0, "loss_total": 0.07905854284763336, "step": 2497 }, { "batch_size": 4, "epoch": 0.9988, "step": 2497, "tokens_per_device": 2572 }, { "epoch": 0.9988, "loss_ce": 0.23507888615131378, "loss_lvr": 0.8391342759132385, "loss_mode_switch": 0.0, "loss_total": 0.3189923167228699, "step": 2497 }, { "batch_size": 4, "epoch": 0.9988, "step": 2497, "tokens_per_device": 6608 }, { "epoch": 0.9988, "loss_ce": 0.07533729076385498, "loss_lvr": 0.5379142165184021, "loss_mode_switch": 0.0, "loss_total": 0.12912870943546295, "step": 2497 }, { "epoch": 0.9992, "grad_norm": 1.1321178674697876, "learning_rate": 1.6783248174556677e-11, "loss": 0.2155, "step": 2498 }, { "batch_size": 4, "epoch": 0.9992, "step": 2498, "tokens_per_device": 3624 }, { "epoch": 0.9992, "loss_ce": 0.1722177118062973, "loss_lvr": 0.9987329840660095, "loss_mode_switch": 0.0, "loss_total": 0.27209100127220154, "step": 2498 }, { "batch_size": 4, "epoch": 0.9992, "step": 2498, "tokens_per_device": 1680 }, { "epoch": 0.9992, "loss_ce": 0.09385605901479721, "loss_lvr": 0.8374839425086975, "loss_mode_switch": 0.0, "loss_total": 0.17760445177555084, "step": 2498 }, { "batch_size": 4, "epoch": 0.9992, "step": 2498, "tokens_per_device": 4544 }, { "epoch": 0.9992, "loss_ce": 0.062111325562000275, "loss_lvr": 0.7195421457290649, "loss_mode_switch": 0.0, "loss_total": 0.13406553864479065, "step": 2498 }, { "batch_size": 4, "epoch": 0.9992, "step": 2498, "tokens_per_device": 5732 }, { "epoch": 0.9992, "loss_ce": 0.020413704216480255, "loss_lvr": 0.61074298620224, "loss_mode_switch": 0.0, "loss_total": 0.0814879983663559, "step": 2498 }, { "batch_size": 4, "epoch": 0.9992, "step": 2498, "tokens_per_device": 2624 }, { "epoch": 0.9992, "loss_ce": 0.10442917793989182, "loss_lvr": 0.8736896514892578, "loss_mode_switch": 0.0, "loss_total": 0.1917981505393982, "step": 2498 }, { "batch_size": 4, "epoch": 0.9992, "step": 2498, "tokens_per_device": 1372 }, { "epoch": 0.9992, "loss_ce": 0.09466497600078583, "loss_lvr": 1.116140604019165, "loss_mode_switch": 0.0, "loss_total": 0.20627903938293457, "step": 2498 }, { "batch_size": 4, "epoch": 0.9992, "step": 2498, "tokens_per_device": 3500 }, { "epoch": 0.9992, "loss_ce": 0.3094528615474701, "loss_lvr": 0.8072261810302734, "loss_mode_switch": 0.0, "loss_total": 0.3901754915714264, "step": 2498 }, { "batch_size": 4, "epoch": 0.9992, "step": 2498, "tokens_per_device": 4280 }, { "epoch": 0.9992, "loss_ce": 0.15200090408325195, "loss_lvr": 0.9722077250480652, "loss_mode_switch": 0.0, "loss_total": 0.24922168254852295, "step": 2498 }, { "epoch": 0.9996, "grad_norm": 0.9528752565383911, "learning_rate": 4.19581380417533e-12, "loss": 0.1823, "step": 2499 }, { "batch_size": 1, "epoch": 0.9996, "step": 2499, "tokens_per_device": 5134 }, { "epoch": 0.9996, "loss_ce": 0.0015493736136704683, "loss_lvr": 0.4049797058105469, "loss_mode_switch": 0.0, "loss_total": 0.04204734414815903, "step": 2499 }, { "batch_size": 4, "epoch": 0.9996, "step": 2499, "tokens_per_device": 6976 }, { "epoch": 0.9996, "loss_ce": 0.008929559029638767, "loss_lvr": 0.6740677952766418, "loss_mode_switch": 0.0, "loss_total": 0.0763363391160965, "step": 2499 }, { "batch_size": 4, "epoch": 0.9996, "step": 2499, "tokens_per_device": 4224 }, { "epoch": 0.9996, "loss_ce": 0.1387452781200409, "loss_lvr": 1.2499860525131226, "loss_mode_switch": 0.0, "loss_total": 0.26374387741088867, "step": 2499 }, { "batch_size": 4, "epoch": 0.9996, "step": 2499, "tokens_per_device": 4428 }, { "epoch": 0.9996, "loss_ce": 0.017160510644316673, "loss_lvr": 0.5988829135894775, "loss_mode_switch": 0.0, "loss_total": 0.07704880088567734, "step": 2499 }, { "batch_size": 4, "epoch": 0.9996, "step": 2499, "tokens_per_device": 5072 }, { "epoch": 0.9996, "loss_ce": 0.2380100041627884, "loss_lvr": 0.5008342862129211, "loss_mode_switch": 0.0, "loss_total": 0.2880934476852417, "step": 2499 }, { "batch_size": 4, "epoch": 0.9996, "step": 2499, "tokens_per_device": 4260 }, { "epoch": 0.9996, "loss_ce": 0.29889583587646484, "loss_lvr": 1.0361931324005127, "loss_mode_switch": 0.0, "loss_total": 0.40251514315605164, "step": 2499 }, { "batch_size": 1, "epoch": 0.9996, "step": 2499, "tokens_per_device": 5107 }, { "epoch": 0.9996, "loss_ce": 0.00013485029921866953, "loss_lvr": 0.17370851337909698, "loss_mode_switch": 0.0, "loss_total": 0.017505701631307602, "step": 2499 }, { "batch_size": 4, "epoch": 0.9996, "step": 2499, "tokens_per_device": 8904 }, { "epoch": 0.9996, "loss_ce": 0.010355699807405472, "loss_lvr": 0.6459061503410339, "loss_mode_switch": 0.0, "loss_total": 0.0749463140964508, "step": 2499 }, { "epoch": 1.0, "grad_norm": 1.02496337890625, "learning_rate": 0.0, "loss": 0.2193, "step": 2500 } ], "logging_steps": 1.0, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3883092291223552.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }