diff --git "a/checkpoint-1000/trainer_state.json" "b/checkpoint-1000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1000/trainer_state.json" @@ -0,0 +1,33434 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.28758357897764036, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "ce_ib": 65.99971008300781, + "ce_orig": 0.8247115612030029, + "epoch": 0, + "kl_loss": 3969.01025390625, + "loss_ib": 39.756099700927734, + "step": 0 + }, + { + "ce_ib": 61.875301361083984, + "ce_orig": 0.3094598948955536, + "epoch": 0, + "kl_loss": 1816.435302734375, + "loss_ib": 18.226226806640625, + "step": 0 + }, + { + "ce_ib": 65.33805084228516, + "ce_orig": 1.0820972919464111, + "epoch": 0, + "kl_loss": 4051.13818359375, + "loss_ib": 40.576717376708984, + "step": 0 + }, + { + "ce_ib": 65.36083221435547, + "ce_orig": 0.8601827025413513, + "epoch": 0, + "kl_loss": 3727.80126953125, + "loss_ib": 37.3433723449707, + "step": 0 + }, + { + "ce_ib": 64.40461730957031, + "ce_orig": 1.3601988554000854, + "epoch": 0.00028758357897764035, + "kl_loss": 3548.660888671875, + "loss_ib": 35.5510139465332, + "step": 1 + }, + { + "ce_ib": 66.136474609375, + "ce_orig": 0.9451982975006104, + "epoch": 0.00028758357897764035, + "kl_loss": 4003.119140625, + "loss_ib": 40.097328186035156, + "step": 1 + }, + { + "ce_ib": 65.30732727050781, + "ce_orig": 1.3611608743667603, + "epoch": 0.00028758357897764035, + "kl_loss": 3076.302490234375, + "loss_ib": 30.828330993652344, + "step": 1 + }, + { + "ce_ib": 63.613216400146484, + "ce_orig": 0.5681392550468445, + "epoch": 0.00028758357897764035, + "kl_loss": 3922.22265625, + "loss_ib": 39.28583908081055, + "step": 1 + }, + { + "ce_ib": 65.20169067382812, + "ce_orig": 0.9869711399078369, + "epoch": 0.0005751671579552807, + "kl_loss": 4010.333251953125, + "loss_ib": 40.16853332519531, + "step": 2 + }, + { + "ce_ib": 64.6613540649414, + "ce_orig": 1.0124142169952393, + "epoch": 0.0005751671579552807, + "kl_loss": 3416.4658203125, + "loss_ib": 34.22931671142578, + "step": 2 + }, + { + "ce_ib": 64.3924560546875, + "ce_orig": 0.825140118598938, + "epoch": 0.0005751671579552807, + "kl_loss": 3954.5244140625, + "loss_ib": 39.60963439941406, + "step": 2 + }, + { + "ce_ib": 66.31563568115234, + "ce_orig": 1.6114795207977295, + "epoch": 0.0005751671579552807, + "kl_loss": 3360.53955078125, + "loss_ib": 33.67171096801758, + "step": 2 + }, + { + "ce_ib": 63.97846603393555, + "ce_orig": 1.0248628854751587, + "epoch": 0.0008627507369329212, + "kl_loss": 3866.74462890625, + "loss_ib": 38.73142623901367, + "step": 3 + }, + { + "ce_ib": 64.94669342041016, + "ce_orig": 0.7158174514770508, + "epoch": 0.0008627507369329212, + "kl_loss": 3586.52783203125, + "loss_ib": 35.93022537231445, + "step": 3 + }, + { + "ce_ib": 66.78568267822266, + "ce_orig": 1.1728931665420532, + "epoch": 0.0008627507369329212, + "kl_loss": 3981.269775390625, + "loss_ib": 39.87948226928711, + "step": 3 + }, + { + "ce_ib": 66.30445861816406, + "ce_orig": 0.9273799657821655, + "epoch": 0.0008627507369329212, + "kl_loss": 3999.728271484375, + "loss_ib": 40.0635871887207, + "step": 3 + }, + { + "ce_ib": 63.22294616699219, + "ce_orig": 0.6721798181533813, + "epoch": 0.0011503343159105614, + "kl_loss": 3434.2626953125, + "loss_ib": 34.40584945678711, + "step": 4 + }, + { + "ce_ib": 65.629150390625, + "ce_orig": 0.851636528968811, + "epoch": 0.0011503343159105614, + "kl_loss": 3777.80029296875, + "loss_ib": 37.843631744384766, + "step": 4 + }, + { + "ce_ib": 65.70416259765625, + "ce_orig": 0.8407150506973267, + "epoch": 0.0011503343159105614, + "kl_loss": 3663.44775390625, + "loss_ib": 36.70018005371094, + "step": 4 + }, + { + "ce_ib": 65.25149536132812, + "ce_orig": 0.8431562781333923, + "epoch": 0.0011503343159105614, + "kl_loss": 4073.102783203125, + "loss_ib": 40.79627990722656, + "step": 4 + }, + { + "epoch": 0.0014379178948882019, + "grad_norm": Infinity, + "learning_rate": 0.0, + "loss": 37.6651, + "step": 5 + }, + { + "ce_ib": 63.31033706665039, + "ce_orig": 0.5193647146224976, + "epoch": 0.0014379178948882019, + "kl_loss": 3829.75732421875, + "loss_ib": 38.36088180541992, + "step": 5 + }, + { + "ce_ib": 64.82113647460938, + "ce_orig": 0.9080048203468323, + "epoch": 0.0014379178948882019, + "kl_loss": 4034.60400390625, + "loss_ib": 40.41086196899414, + "step": 5 + }, + { + "ce_ib": 67.75746154785156, + "ce_orig": 1.7583141326904297, + "epoch": 0.0014379178948882019, + "kl_loss": 3362.895751953125, + "loss_ib": 33.696712493896484, + "step": 5 + }, + { + "ce_ib": 65.55052947998047, + "ce_orig": 1.0019645690917969, + "epoch": 0.0014379178948882019, + "kl_loss": 3561.7119140625, + "loss_ib": 35.68266677856445, + "step": 5 + }, + { + "ce_ib": 65.5093765258789, + "ce_orig": 1.2022827863693237, + "epoch": 0.0017255014738658423, + "kl_loss": 3854.793212890625, + "loss_ib": 38.613441467285156, + "step": 6 + }, + { + "ce_ib": 63.95633316040039, + "ce_orig": 0.5561846494674683, + "epoch": 0.0017255014738658423, + "kl_loss": 3231.163818359375, + "loss_ib": 32.37559509277344, + "step": 6 + }, + { + "ce_ib": 66.91143798828125, + "ce_orig": 1.007911205291748, + "epoch": 0.0017255014738658423, + "kl_loss": 3694.936767578125, + "loss_ib": 37.01627731323242, + "step": 6 + }, + { + "ce_ib": 65.86326599121094, + "ce_orig": 1.1325939893722534, + "epoch": 0.0017255014738658423, + "kl_loss": 3653.87255859375, + "loss_ib": 36.60458755493164, + "step": 6 + }, + { + "ce_ib": 61.932804107666016, + "ce_orig": 0.3588312268257141, + "epoch": 0.0020130850528434826, + "kl_loss": 2617.568359375, + "loss_ib": 26.23761749267578, + "step": 7 + }, + { + "ce_ib": 66.4891586303711, + "ce_orig": 0.9551964402198792, + "epoch": 0.0020130850528434826, + "kl_loss": 4009.619140625, + "loss_ib": 40.162681579589844, + "step": 7 + }, + { + "ce_ib": 64.68766021728516, + "ce_orig": 1.3480956554412842, + "epoch": 0.0020130850528434826, + "kl_loss": 3682.406494140625, + "loss_ib": 36.88875198364258, + "step": 7 + }, + { + "ce_ib": 65.71851348876953, + "ce_orig": 1.4119411706924438, + "epoch": 0.0020130850528434826, + "kl_loss": 3544.10595703125, + "loss_ib": 35.50677490234375, + "step": 7 + }, + { + "ce_ib": 64.80267333984375, + "ce_orig": 1.1264560222625732, + "epoch": 0.002300668631821123, + "kl_loss": 3803.631103515625, + "loss_ib": 38.101112365722656, + "step": 8 + }, + { + "ce_ib": 64.57341766357422, + "ce_orig": 0.8282275199890137, + "epoch": 0.002300668631821123, + "kl_loss": 4064.74267578125, + "loss_ib": 40.711997985839844, + "step": 8 + }, + { + "ce_ib": 64.71014404296875, + "ce_orig": 0.8245378732681274, + "epoch": 0.002300668631821123, + "kl_loss": 3696.43896484375, + "loss_ib": 37.02909851074219, + "step": 8 + }, + { + "ce_ib": 66.23856353759766, + "ce_orig": 0.7464695572853088, + "epoch": 0.002300668631821123, + "kl_loss": 3910.202880859375, + "loss_ib": 39.16826629638672, + "step": 8 + }, + { + "ce_ib": 66.02950286865234, + "ce_orig": 1.2234686613082886, + "epoch": 0.0025882522107987635, + "kl_loss": 3270.403076171875, + "loss_ib": 32.77006149291992, + "step": 9 + }, + { + "ce_ib": 61.797386169433594, + "ce_orig": 0.6015214920043945, + "epoch": 0.0025882522107987635, + "kl_loss": 3816.387939453125, + "loss_ib": 38.22567367553711, + "step": 9 + }, + { + "ce_ib": 61.85765075683594, + "ce_orig": 0.6827896237373352, + "epoch": 0.0025882522107987635, + "kl_loss": 3886.591064453125, + "loss_ib": 38.92776870727539, + "step": 9 + }, + { + "ce_ib": 66.08187866210938, + "ce_orig": 1.3109632730484009, + "epoch": 0.0025882522107987635, + "kl_loss": 3950.779541015625, + "loss_ib": 39.573875427246094, + "step": 9 + }, + { + "epoch": 0.0028758357897764038, + "grad_norm": 519.0091552734375, + "learning_rate": 1.2738853503184715e-07, + "loss": 37.7545, + "step": 10 + }, + { + "ce_ib": 64.04639434814453, + "ce_orig": 0.7621712684631348, + "epoch": 0.0028758357897764038, + "kl_loss": 3556.8876953125, + "loss_ib": 35.6329231262207, + "step": 10 + }, + { + "ce_ib": 68.0383071899414, + "ce_orig": 1.6497186422348022, + "epoch": 0.0028758357897764038, + "kl_loss": 3772.04345703125, + "loss_ib": 37.78847122192383, + "step": 10 + }, + { + "ce_ib": 68.69857025146484, + "ce_orig": 1.7943047285079956, + "epoch": 0.0028758357897764038, + "kl_loss": 3361.59521484375, + "loss_ib": 33.68465042114258, + "step": 10 + }, + { + "ce_ib": 66.5051040649414, + "ce_orig": 0.9888308644294739, + "epoch": 0.0028758357897764038, + "kl_loss": 3659.6396484375, + "loss_ib": 36.662899017333984, + "step": 10 + }, + { + "ce_ib": 69.01343536376953, + "ce_orig": 1.8538011312484741, + "epoch": 0.003163419368754044, + "kl_loss": 3817.212158203125, + "loss_ib": 38.24113464355469, + "step": 11 + }, + { + "ce_ib": 66.35260772705078, + "ce_orig": 1.5063494443893433, + "epoch": 0.003163419368754044, + "kl_loss": 3289.161376953125, + "loss_ib": 32.95796585083008, + "step": 11 + }, + { + "ce_ib": 63.46610641479492, + "ce_orig": 0.9150936007499695, + "epoch": 0.003163419368754044, + "kl_loss": 3871.0595703125, + "loss_ib": 38.77406311035156, + "step": 11 + }, + { + "ce_ib": 65.04010009765625, + "ce_orig": 0.8206998705863953, + "epoch": 0.003163419368754044, + "kl_loss": 1842.705322265625, + "loss_ib": 18.492094039916992, + "step": 11 + }, + { + "ce_ib": 64.97047424316406, + "ce_orig": 1.1343697309494019, + "epoch": 0.0034510029477316847, + "kl_loss": 3761.435546875, + "loss_ib": 37.679325103759766, + "step": 12 + }, + { + "ce_ib": 65.20761108398438, + "ce_orig": 0.8448718190193176, + "epoch": 0.0034510029477316847, + "kl_loss": 3675.133544921875, + "loss_ib": 36.81654357910156, + "step": 12 + }, + { + "ce_ib": 61.798465728759766, + "ce_orig": 0.8912767767906189, + "epoch": 0.0034510029477316847, + "kl_loss": 3862.031982421875, + "loss_ib": 38.6821174621582, + "step": 12 + }, + { + "ce_ib": 62.26899337768555, + "ce_orig": 0.6894794702529907, + "epoch": 0.0034510029477316847, + "kl_loss": 3852.8837890625, + "loss_ib": 38.59110641479492, + "step": 12 + }, + { + "ce_ib": 62.82128143310547, + "ce_orig": 0.8209494948387146, + "epoch": 0.003738586526709325, + "kl_loss": 4055.579345703125, + "loss_ib": 40.618614196777344, + "step": 13 + }, + { + "ce_ib": 63.56863021850586, + "ce_orig": 0.6192977428436279, + "epoch": 0.003738586526709325, + "kl_loss": 3213.953369140625, + "loss_ib": 32.203102111816406, + "step": 13 + }, + { + "ce_ib": 63.4449348449707, + "ce_orig": 0.8305644392967224, + "epoch": 0.003738586526709325, + "kl_loss": 4162.50732421875, + "loss_ib": 41.68851852416992, + "step": 13 + }, + { + "ce_ib": 63.81005859375, + "ce_orig": 0.63532555103302, + "epoch": 0.003738586526709325, + "kl_loss": 3868.43896484375, + "loss_ib": 38.748199462890625, + "step": 13 + }, + { + "ce_ib": 63.38967514038086, + "ce_orig": 1.0747102499008179, + "epoch": 0.004026170105686965, + "kl_loss": 3797.86328125, + "loss_ib": 38.04201889038086, + "step": 14 + }, + { + "ce_ib": 64.95621490478516, + "ce_orig": 0.2551676332950592, + "epoch": 0.004026170105686965, + "kl_loss": 3229.682861328125, + "loss_ib": 32.36178207397461, + "step": 14 + }, + { + "ce_ib": 63.782310485839844, + "ce_orig": 0.9092867374420166, + "epoch": 0.004026170105686965, + "kl_loss": 4164.35009765625, + "loss_ib": 41.70728302001953, + "step": 14 + }, + { + "ce_ib": 62.4797477722168, + "ce_orig": 0.47730547189712524, + "epoch": 0.004026170105686965, + "kl_loss": 3852.087158203125, + "loss_ib": 38.583351135253906, + "step": 14 + }, + { + "epoch": 0.004313753684664605, + "grad_norm": 523.5825805664062, + "learning_rate": 2.8662420382165606e-07, + "loss": 37.7441, + "step": 15 + }, + { + "ce_ib": 66.0411148071289, + "ce_orig": 1.2483270168304443, + "epoch": 0.004313753684664605, + "kl_loss": 3730.42578125, + "loss_ib": 37.370296478271484, + "step": 15 + }, + { + "ce_ib": 62.428306579589844, + "ce_orig": 0.6228333711624146, + "epoch": 0.004313753684664605, + "kl_loss": 3604.6357421875, + "loss_ib": 36.108787536621094, + "step": 15 + }, + { + "ce_ib": 63.51506042480469, + "ce_orig": 1.2859349250793457, + "epoch": 0.004313753684664605, + "kl_loss": 4202.3095703125, + "loss_ib": 42.08660888671875, + "step": 15 + }, + { + "ce_ib": 63.46177291870117, + "ce_orig": 0.7081962823867798, + "epoch": 0.004313753684664605, + "kl_loss": 3794.773193359375, + "loss_ib": 38.011192321777344, + "step": 15 + }, + { + "ce_ib": 63.84330749511719, + "ce_orig": 0.7915632724761963, + "epoch": 0.004601337263642246, + "kl_loss": 3308.423828125, + "loss_ib": 33.14807891845703, + "step": 16 + }, + { + "ce_ib": 64.69558715820312, + "ce_orig": 1.4574185609817505, + "epoch": 0.004601337263642246, + "kl_loss": 3845.239990234375, + "loss_ib": 38.517093658447266, + "step": 16 + }, + { + "ce_ib": 63.32929992675781, + "ce_orig": 0.954424262046814, + "epoch": 0.004601337263642246, + "kl_loss": 3470.448486328125, + "loss_ib": 34.76781463623047, + "step": 16 + }, + { + "ce_ib": 67.40885925292969, + "ce_orig": 0.8406963348388672, + "epoch": 0.004601337263642246, + "kl_loss": 4098.966796875, + "loss_ib": 41.05707550048828, + "step": 16 + }, + { + "ce_ib": 66.77290344238281, + "ce_orig": 1.3419686555862427, + "epoch": 0.004888920842619887, + "kl_loss": 3622.17578125, + "loss_ib": 36.28852844238281, + "step": 17 + }, + { + "ce_ib": 62.80875778198242, + "ce_orig": 0.6389923691749573, + "epoch": 0.004888920842619887, + "kl_loss": 4102.7236328125, + "loss_ib": 41.09004211425781, + "step": 17 + }, + { + "ce_ib": 64.88585662841797, + "ce_orig": 1.1766713857650757, + "epoch": 0.004888920842619887, + "kl_loss": 1908.9375, + "loss_ib": 19.154260635375977, + "step": 17 + }, + { + "ce_ib": 65.38214874267578, + "ce_orig": 1.252450942993164, + "epoch": 0.004888920842619887, + "kl_loss": 3970.094482421875, + "loss_ib": 39.766326904296875, + "step": 17 + }, + { + "ce_ib": 64.7368392944336, + "ce_orig": 1.1009352207183838, + "epoch": 0.005176504421597527, + "kl_loss": 3504.70361328125, + "loss_ib": 35.11177062988281, + "step": 18 + }, + { + "ce_ib": 62.09738540649414, + "ce_orig": 0.480591744184494, + "epoch": 0.005176504421597527, + "kl_loss": 3198.1953125, + "loss_ib": 32.04404830932617, + "step": 18 + }, + { + "ce_ib": 65.45724487304688, + "ce_orig": 0.8052865266799927, + "epoch": 0.005176504421597527, + "kl_loss": 3857.419189453125, + "loss_ib": 38.6396484375, + "step": 18 + }, + { + "ce_ib": 65.82563781738281, + "ce_orig": 1.2099261283874512, + "epoch": 0.005176504421597527, + "kl_loss": 3628.470703125, + "loss_ib": 36.35053253173828, + "step": 18 + }, + { + "ce_ib": 64.87178802490234, + "ce_orig": 1.0739271640777588, + "epoch": 0.005464088000575167, + "kl_loss": 3784.338623046875, + "loss_ib": 37.90825653076172, + "step": 19 + }, + { + "ce_ib": 66.35687255859375, + "ce_orig": 1.3444932699203491, + "epoch": 0.005464088000575167, + "kl_loss": 3339.950927734375, + "loss_ib": 33.46586608886719, + "step": 19 + }, + { + "ce_ib": 67.06342315673828, + "ce_orig": 0.8359652757644653, + "epoch": 0.005464088000575167, + "kl_loss": 4230.32666015625, + "loss_ib": 42.370330810546875, + "step": 19 + }, + { + "ce_ib": 67.96249389648438, + "ce_orig": 1.7302289009094238, + "epoch": 0.005464088000575167, + "kl_loss": 3579.07421875, + "loss_ib": 35.85870361328125, + "step": 19 + }, + { + "epoch": 0.0057516715795528075, + "grad_norm": 525.4837036132812, + "learning_rate": 4.45859872611465e-07, + "loss": 38.2435, + "step": 20 + }, + { + "ce_ib": 64.20391082763672, + "ce_orig": 0.7589595913887024, + "epoch": 0.0057516715795528075, + "kl_loss": 4128.66064453125, + "loss_ib": 41.35081100463867, + "step": 20 + }, + { + "ce_ib": 62.830806732177734, + "ce_orig": 0.6316859126091003, + "epoch": 0.0057516715795528075, + "kl_loss": 3403.72509765625, + "loss_ib": 34.10007858276367, + "step": 20 + }, + { + "ce_ib": 65.20977783203125, + "ce_orig": 0.8842067718505859, + "epoch": 0.0057516715795528075, + "kl_loss": 4221.73095703125, + "loss_ib": 42.28252029418945, + "step": 20 + }, + { + "ce_ib": 63.80289077758789, + "ce_orig": 1.1217824220657349, + "epoch": 0.0057516715795528075, + "kl_loss": 3923.58349609375, + "loss_ib": 39.29963684082031, + "step": 20 + }, + { + "ce_ib": 63.151161193847656, + "ce_orig": 0.725497305393219, + "epoch": 0.006039255158530448, + "kl_loss": 4119.48046875, + "loss_ib": 41.25795364379883, + "step": 21 + }, + { + "ce_ib": 62.518638610839844, + "ce_orig": 0.6902149319648743, + "epoch": 0.006039255158530448, + "kl_loss": 3765.32373046875, + "loss_ib": 37.715755462646484, + "step": 21 + }, + { + "ce_ib": 64.55998229980469, + "ce_orig": 1.0123672485351562, + "epoch": 0.006039255158530448, + "kl_loss": 3793.22509765625, + "loss_ib": 37.99681091308594, + "step": 21 + }, + { + "ce_ib": 61.955604553222656, + "ce_orig": 0.528033971786499, + "epoch": 0.006039255158530448, + "kl_loss": 3723.4833984375, + "loss_ib": 37.29678726196289, + "step": 21 + }, + { + "ce_ib": 62.95112609863281, + "ce_orig": 0.7799142003059387, + "epoch": 0.006326838737508088, + "kl_loss": 3707.60546875, + "loss_ib": 37.13900375366211, + "step": 22 + }, + { + "ce_ib": 63.943023681640625, + "ce_orig": 0.836663544178009, + "epoch": 0.006326838737508088, + "kl_loss": 3695.3837890625, + "loss_ib": 37.01778030395508, + "step": 22 + }, + { + "ce_ib": 61.96399688720703, + "ce_orig": 0.5541026592254639, + "epoch": 0.006326838737508088, + "kl_loss": 3717.060302734375, + "loss_ib": 37.232566833496094, + "step": 22 + }, + { + "ce_ib": 65.02377319335938, + "ce_orig": 1.0211303234100342, + "epoch": 0.006326838737508088, + "kl_loss": 3948.7412109375, + "loss_ib": 39.55243682861328, + "step": 22 + }, + { + "ce_ib": 64.2979736328125, + "ce_orig": 1.0038220882415771, + "epoch": 0.006614422316485728, + "kl_loss": 3556.190185546875, + "loss_ib": 35.626197814941406, + "step": 23 + }, + { + "ce_ib": 64.74838256835938, + "ce_orig": 1.4320262670516968, + "epoch": 0.006614422316485728, + "kl_loss": 3511.77587890625, + "loss_ib": 35.1825065612793, + "step": 23 + }, + { + "ce_ib": 66.12266540527344, + "ce_orig": 1.3454687595367432, + "epoch": 0.006614422316485728, + "kl_loss": 3485.62158203125, + "loss_ib": 34.92233657836914, + "step": 23 + }, + { + "ce_ib": 65.04991912841797, + "ce_orig": 1.1041706800460815, + "epoch": 0.006614422316485728, + "kl_loss": 3322.958984375, + "loss_ib": 33.294639587402344, + "step": 23 + }, + { + "ce_ib": 63.05337905883789, + "ce_orig": 0.8803771734237671, + "epoch": 0.006902005895463369, + "kl_loss": 4095.29345703125, + "loss_ib": 41.01598358154297, + "step": 24 + }, + { + "ce_ib": 62.59025192260742, + "ce_orig": 0.5355072021484375, + "epoch": 0.006902005895463369, + "kl_loss": 2086.240478515625, + "loss_ib": 20.92499542236328, + "step": 24 + }, + { + "ce_ib": 63.184295654296875, + "ce_orig": 0.7724276781082153, + "epoch": 0.006902005895463369, + "kl_loss": 4004.15185546875, + "loss_ib": 40.10470199584961, + "step": 24 + }, + { + "ce_ib": 65.41134643554688, + "ce_orig": 0.9222034811973572, + "epoch": 0.006902005895463369, + "kl_loss": 4123.5751953125, + "loss_ib": 41.30116271972656, + "step": 24 + }, + { + "epoch": 0.00718958947444101, + "grad_norm": 504.01654052734375, + "learning_rate": 6.05095541401274e-07, + "loss": 37.9487, + "step": 25 + }, + { + "ce_ib": 65.42027282714844, + "ce_orig": 1.304446816444397, + "epoch": 0.00718958947444101, + "kl_loss": 3540.853515625, + "loss_ib": 35.47395324707031, + "step": 25 + }, + { + "ce_ib": 61.20964050292969, + "ce_orig": 0.4327137768268585, + "epoch": 0.00718958947444101, + "kl_loss": 3690.677734375, + "loss_ib": 36.96798324584961, + "step": 25 + }, + { + "ce_ib": 64.6102066040039, + "ce_orig": 1.0307230949401855, + "epoch": 0.00718958947444101, + "kl_loss": 4015.37060546875, + "loss_ib": 40.21831512451172, + "step": 25 + }, + { + "ce_ib": 64.76322174072266, + "ce_orig": 0.9839794039726257, + "epoch": 0.00718958947444101, + "kl_loss": 4032.43212890625, + "loss_ib": 40.38908386230469, + "step": 25 + }, + { + "ce_ib": 65.33113098144531, + "ce_orig": 1.4617711305618286, + "epoch": 0.00747717305341865, + "kl_loss": 3820.6474609375, + "loss_ib": 38.27180480957031, + "step": 26 + }, + { + "ce_ib": 65.57540130615234, + "ce_orig": 0.7502631545066833, + "epoch": 0.00747717305341865, + "kl_loss": 3953.942138671875, + "loss_ib": 39.60499572753906, + "step": 26 + }, + { + "ce_ib": 66.47959899902344, + "ce_orig": 1.3454749584197998, + "epoch": 0.00747717305341865, + "kl_loss": 3569.14794921875, + "loss_ib": 35.757957458496094, + "step": 26 + }, + { + "ce_ib": 66.15406799316406, + "ce_orig": 1.0591317415237427, + "epoch": 0.00747717305341865, + "kl_loss": 3647.95068359375, + "loss_ib": 36.54566192626953, + "step": 26 + }, + { + "ce_ib": 61.56562042236328, + "ce_orig": 0.7531498074531555, + "epoch": 0.00776475663239629, + "kl_loss": 3646.81689453125, + "loss_ib": 36.52973175048828, + "step": 27 + }, + { + "ce_ib": 63.738616943359375, + "ce_orig": 0.7576659321784973, + "epoch": 0.00776475663239629, + "kl_loss": 4090.55078125, + "loss_ib": 40.96924591064453, + "step": 27 + }, + { + "ce_ib": 63.68565368652344, + "ce_orig": 0.8695321083068848, + "epoch": 0.00776475663239629, + "kl_loss": 3782.551025390625, + "loss_ib": 37.88919448852539, + "step": 27 + }, + { + "ce_ib": 62.30582046508789, + "ce_orig": 0.5045351982116699, + "epoch": 0.00776475663239629, + "kl_loss": 3620.3427734375, + "loss_ib": 36.26573181152344, + "step": 27 + }, + { + "ce_ib": 63.34856414794922, + "ce_orig": 0.741316556930542, + "epoch": 0.00805234021137393, + "kl_loss": 4024.19580078125, + "loss_ib": 40.30530548095703, + "step": 28 + }, + { + "ce_ib": 64.60435485839844, + "ce_orig": 0.7678440809249878, + "epoch": 0.00805234021137393, + "kl_loss": 3513.181884765625, + "loss_ib": 35.1964225769043, + "step": 28 + }, + { + "ce_ib": 64.85627746582031, + "ce_orig": 1.3411056995391846, + "epoch": 0.00805234021137393, + "kl_loss": 3659.3564453125, + "loss_ib": 36.65842056274414, + "step": 28 + }, + { + "ce_ib": 68.69432830810547, + "ce_orig": 1.4908052682876587, + "epoch": 0.00805234021137393, + "kl_loss": 4179.46630859375, + "loss_ib": 41.86335754394531, + "step": 28 + }, + { + "ce_ib": 64.14762115478516, + "ce_orig": 0.7301002740859985, + "epoch": 0.008339923790351571, + "kl_loss": 4014.65771484375, + "loss_ib": 40.210723876953125, + "step": 29 + }, + { + "ce_ib": 65.77961730957031, + "ce_orig": 1.078151822090149, + "epoch": 0.008339923790351571, + "kl_loss": 3336.4443359375, + "loss_ib": 33.43022155761719, + "step": 29 + }, + { + "ce_ib": 62.39012908935547, + "ce_orig": 1.5332895517349243, + "epoch": 0.008339923790351571, + "kl_loss": 4004.0390625, + "loss_ib": 40.102779388427734, + "step": 29 + }, + { + "ce_ib": 63.807186126708984, + "ce_orig": 0.9249582886695862, + "epoch": 0.008339923790351571, + "kl_loss": 3553.84716796875, + "loss_ib": 35.60227584838867, + "step": 29 + }, + { + "epoch": 0.00862750736932921, + "grad_norm": 537.8850708007812, + "learning_rate": 7.643312101910829e-07, + "loss": 38.6273, + "step": 30 + }, + { + "ce_ib": 64.0005111694336, + "ce_orig": 0.8134416937828064, + "epoch": 0.00862750736932921, + "kl_loss": 3766.99658203125, + "loss_ib": 37.73396682739258, + "step": 30 + }, + { + "ce_ib": 65.6531982421875, + "ce_orig": 1.2596931457519531, + "epoch": 0.00862750736932921, + "kl_loss": 3756.150146484375, + "loss_ib": 37.62715530395508, + "step": 30 + }, + { + "ce_ib": 66.32474517822266, + "ce_orig": 1.5833230018615723, + "epoch": 0.00862750736932921, + "kl_loss": 3616.305419921875, + "loss_ib": 36.22937774658203, + "step": 30 + }, + { + "ce_ib": 68.49303436279297, + "ce_orig": 1.2524874210357666, + "epoch": 0.00862750736932921, + "kl_loss": 3675.2001953125, + "loss_ib": 36.82049560546875, + "step": 30 + }, + { + "ce_ib": 66.44476318359375, + "ce_orig": 1.3207565546035767, + "epoch": 0.008915090948306852, + "kl_loss": 3776.052734375, + "loss_ib": 37.82697296142578, + "step": 31 + }, + { + "ce_ib": 66.1202163696289, + "ce_orig": 1.7769383192062378, + "epoch": 0.008915090948306852, + "kl_loss": 3872.0908203125, + "loss_ib": 38.78702926635742, + "step": 31 + }, + { + "ce_ib": 64.21722412109375, + "ce_orig": 1.2050706148147583, + "epoch": 0.008915090948306852, + "kl_loss": 3775.35009765625, + "loss_ib": 37.81771469116211, + "step": 31 + }, + { + "ce_ib": 64.00657653808594, + "ce_orig": 0.745306670665741, + "epoch": 0.008915090948306852, + "kl_loss": 4056.444580078125, + "loss_ib": 40.62845230102539, + "step": 31 + }, + { + "ce_ib": 64.75992584228516, + "ce_orig": 0.9638186097145081, + "epoch": 0.009202674527284491, + "kl_loss": 3975.2265625, + "loss_ib": 39.81702423095703, + "step": 32 + }, + { + "ce_ib": 63.76476287841797, + "ce_orig": 0.7001180052757263, + "epoch": 0.009202674527284491, + "kl_loss": 3708.013671875, + "loss_ib": 37.14390182495117, + "step": 32 + }, + { + "ce_ib": 62.59078598022461, + "ce_orig": 0.581017255783081, + "epoch": 0.009202674527284491, + "kl_loss": 4050.337890625, + "loss_ib": 40.56596755981445, + "step": 32 + }, + { + "ce_ib": 62.47100830078125, + "ce_orig": 0.4765642583370209, + "epoch": 0.009202674527284491, + "kl_loss": 3852.07666015625, + "loss_ib": 38.58323669433594, + "step": 32 + }, + { + "ce_ib": 66.74118041992188, + "ce_orig": 1.0750036239624023, + "epoch": 0.009490258106262132, + "kl_loss": 3840.857177734375, + "loss_ib": 38.475311279296875, + "step": 33 + }, + { + "ce_ib": 62.256229400634766, + "ce_orig": 1.111011028289795, + "epoch": 0.009490258106262132, + "kl_loss": 3756.697265625, + "loss_ib": 37.62922668457031, + "step": 33 + }, + { + "ce_ib": 60.74306869506836, + "ce_orig": 0.322427362203598, + "epoch": 0.009490258106262132, + "kl_loss": 3481.74169921875, + "loss_ib": 34.87815856933594, + "step": 33 + }, + { + "ce_ib": 63.85698318481445, + "ce_orig": 1.245665192604065, + "epoch": 0.009490258106262132, + "kl_loss": 3702.04638671875, + "loss_ib": 37.084320068359375, + "step": 33 + }, + { + "ce_ib": 64.15027618408203, + "ce_orig": 0.7340657114982605, + "epoch": 0.009777841685239774, + "kl_loss": 2706.0263671875, + "loss_ib": 27.124412536621094, + "step": 34 + }, + { + "ce_ib": 62.855018615722656, + "ce_orig": 1.0289608240127563, + "epoch": 0.009777841685239774, + "kl_loss": 3802.005615234375, + "loss_ib": 38.082908630371094, + "step": 34 + }, + { + "ce_ib": 63.055484771728516, + "ce_orig": 0.7458648681640625, + "epoch": 0.009777841685239774, + "kl_loss": 3990.32861328125, + "loss_ib": 39.96634292602539, + "step": 34 + }, + { + "ce_ib": 61.685733795166016, + "ce_orig": 0.4432576894760132, + "epoch": 0.009777841685239774, + "kl_loss": 2863.019775390625, + "loss_ib": 28.69188117980957, + "step": 34 + }, + { + "epoch": 0.010065425264217413, + "grad_norm": 523.54052734375, + "learning_rate": 9.235668789808917e-07, + "loss": 37.7138, + "step": 35 + }, + { + "ce_ib": 63.075679779052734, + "ce_orig": 0.7427234053611755, + "epoch": 0.010065425264217413, + "kl_loss": 3800.17724609375, + "loss_ib": 38.06484603881836, + "step": 35 + }, + { + "ce_ib": 66.27129364013672, + "ce_orig": 1.4802910089492798, + "epoch": 0.010065425264217413, + "kl_loss": 3349.9755859375, + "loss_ib": 33.5660285949707, + "step": 35 + }, + { + "ce_ib": 62.79461669921875, + "ce_orig": 0.8912234902381897, + "epoch": 0.010065425264217413, + "kl_loss": 4039.09765625, + "loss_ib": 40.45376968383789, + "step": 35 + }, + { + "ce_ib": 61.9300537109375, + "ce_orig": 0.6817716360092163, + "epoch": 0.010065425264217413, + "kl_loss": 3835.2861328125, + "loss_ib": 38.414791107177734, + "step": 35 + }, + { + "ce_ib": 63.25111770629883, + "ce_orig": 0.9492425918579102, + "epoch": 0.010353008843195054, + "kl_loss": 3599.0205078125, + "loss_ib": 36.0534553527832, + "step": 36 + }, + { + "ce_ib": 64.83221435546875, + "ce_orig": 1.1269358396530151, + "epoch": 0.010353008843195054, + "kl_loss": 3825.91796875, + "loss_ib": 38.32400894165039, + "step": 36 + }, + { + "ce_ib": 63.47658920288086, + "ce_orig": 0.7525137662887573, + "epoch": 0.010353008843195054, + "kl_loss": 3816.482421875, + "loss_ib": 38.22829818725586, + "step": 36 + }, + { + "ce_ib": 67.63275909423828, + "ce_orig": 1.4331247806549072, + "epoch": 0.010353008843195054, + "kl_loss": 3869.70654296875, + "loss_ib": 38.76469802856445, + "step": 36 + }, + { + "ce_ib": 62.58089065551758, + "ce_orig": 0.6857898235321045, + "epoch": 0.010640592422172693, + "kl_loss": 3316.5986328125, + "loss_ib": 33.22856521606445, + "step": 37 + }, + { + "ce_ib": 66.71737670898438, + "ce_orig": 1.6872270107269287, + "epoch": 0.010640592422172693, + "kl_loss": 3741.76953125, + "loss_ib": 37.48440933227539, + "step": 37 + }, + { + "ce_ib": 64.51302337646484, + "ce_orig": 1.0037118196487427, + "epoch": 0.010640592422172693, + "kl_loss": 4072.7705078125, + "loss_ib": 40.79221725463867, + "step": 37 + }, + { + "ce_ib": 64.44024658203125, + "ce_orig": 1.0666587352752686, + "epoch": 0.010640592422172693, + "kl_loss": 3476.2373046875, + "loss_ib": 34.826812744140625, + "step": 37 + }, + { + "ce_ib": 63.56709289550781, + "ce_orig": 0.6757309436798096, + "epoch": 0.010928176001150335, + "kl_loss": 3920.34814453125, + "loss_ib": 39.26704788208008, + "step": 38 + }, + { + "ce_ib": 66.18359375, + "ce_orig": 1.389379620552063, + "epoch": 0.010928176001150335, + "kl_loss": 3573.64013671875, + "loss_ib": 35.80258560180664, + "step": 38 + }, + { + "ce_ib": 65.27085876464844, + "ce_orig": 0.9928706884384155, + "epoch": 0.010928176001150335, + "kl_loss": 3934.83349609375, + "loss_ib": 39.413604736328125, + "step": 38 + }, + { + "ce_ib": 62.60868453979492, + "ce_orig": 0.5065615773200989, + "epoch": 0.010928176001150335, + "kl_loss": 3360.3466796875, + "loss_ib": 33.66607666015625, + "step": 38 + }, + { + "ce_ib": 63.16704177856445, + "ce_orig": 0.6447534561157227, + "epoch": 0.011215759580127974, + "kl_loss": 3981.157470703125, + "loss_ib": 39.87474060058594, + "step": 39 + }, + { + "ce_ib": 66.88977813720703, + "ce_orig": 1.1577696800231934, + "epoch": 0.011215759580127974, + "kl_loss": 3895.958984375, + "loss_ib": 39.0264778137207, + "step": 39 + }, + { + "ce_ib": 66.50093841552734, + "ce_orig": 1.4465612173080444, + "epoch": 0.011215759580127974, + "kl_loss": 3656.223388671875, + "loss_ib": 36.62873458862305, + "step": 39 + }, + { + "ce_ib": 63.415382385253906, + "ce_orig": 0.7691327929496765, + "epoch": 0.011215759580127974, + "kl_loss": 4074.531005859375, + "loss_ib": 40.80872344970703, + "step": 39 + }, + { + "epoch": 0.011503343159105615, + "grad_norm": 543.6448364257812, + "learning_rate": 1.0828025477707007e-06, + "loss": 38.3393, + "step": 40 + }, + { + "ce_ib": 68.20819091796875, + "ce_orig": 1.7859582901000977, + "epoch": 0.011503343159105615, + "kl_loss": 3490.180908203125, + "loss_ib": 34.97001647949219, + "step": 40 + }, + { + "ce_ib": 63.85101318359375, + "ce_orig": 0.798017144203186, + "epoch": 0.011503343159105615, + "kl_loss": 3664.87158203125, + "loss_ib": 36.71256637573242, + "step": 40 + }, + { + "ce_ib": 65.26078033447266, + "ce_orig": 1.3482457399368286, + "epoch": 0.011503343159105615, + "kl_loss": 3666.393310546875, + "loss_ib": 36.72919464111328, + "step": 40 + }, + { + "ce_ib": 64.7423324584961, + "ce_orig": 1.047332763671875, + "epoch": 0.011503343159105615, + "kl_loss": 3898.302734375, + "loss_ib": 39.047767639160156, + "step": 40 + }, + { + "ce_ib": 63.11514663696289, + "ce_orig": 0.707227349281311, + "epoch": 0.011790926738083256, + "kl_loss": 3996.748046875, + "loss_ib": 40.03059387207031, + "step": 41 + }, + { + "ce_ib": 62.858245849609375, + "ce_orig": 0.7572628259658813, + "epoch": 0.011790926738083256, + "kl_loss": 3845.11328125, + "loss_ib": 38.51399230957031, + "step": 41 + }, + { + "ce_ib": 64.94149780273438, + "ce_orig": 0.9001584649085999, + "epoch": 0.011790926738083256, + "kl_loss": 3669.0107421875, + "loss_ib": 36.75504684448242, + "step": 41 + }, + { + "ce_ib": 64.22615051269531, + "ce_orig": 0.9319191575050354, + "epoch": 0.011790926738083256, + "kl_loss": 3406.089599609375, + "loss_ib": 34.1251220703125, + "step": 41 + }, + { + "ce_ib": 66.28656005859375, + "ce_orig": 1.7123758792877197, + "epoch": 0.012078510317060896, + "kl_loss": 3858.2216796875, + "loss_ib": 38.648502349853516, + "step": 42 + }, + { + "ce_ib": 63.06233215332031, + "ce_orig": 0.7600352764129639, + "epoch": 0.012078510317060896, + "kl_loss": 3817.84375, + "loss_ib": 38.24149703979492, + "step": 42 + }, + { + "ce_ib": 62.57767868041992, + "ce_orig": 0.9215527772903442, + "epoch": 0.012078510317060896, + "kl_loss": 3511.0380859375, + "loss_ib": 35.17295837402344, + "step": 42 + }, + { + "ce_ib": 64.01197814941406, + "ce_orig": 0.6495408415794373, + "epoch": 0.012078510317060896, + "kl_loss": 3883.279541015625, + "loss_ib": 38.89680480957031, + "step": 42 + }, + { + "ce_ib": 63.950992584228516, + "ce_orig": 0.8470758199691772, + "epoch": 0.012366093896038537, + "kl_loss": 3836.8623046875, + "loss_ib": 38.43257141113281, + "step": 43 + }, + { + "ce_ib": 66.46541595458984, + "ce_orig": 1.070137619972229, + "epoch": 0.012366093896038537, + "kl_loss": 3574.319580078125, + "loss_ib": 35.80965805053711, + "step": 43 + }, + { + "ce_ib": 61.230316162109375, + "ce_orig": 0.6914916634559631, + "epoch": 0.012366093896038537, + "kl_loss": 4103.490234375, + "loss_ib": 41.09613037109375, + "step": 43 + }, + { + "ce_ib": 65.71780395507812, + "ce_orig": 1.2423909902572632, + "epoch": 0.012366093896038537, + "kl_loss": 3505.72607421875, + "loss_ib": 35.12297821044922, + "step": 43 + }, + { + "ce_ib": 63.5611572265625, + "ce_orig": 0.9509873986244202, + "epoch": 0.012653677475016176, + "kl_loss": 3528.08349609375, + "loss_ib": 35.34439468383789, + "step": 44 + }, + { + "ce_ib": 61.90439987182617, + "ce_orig": 1.0406547784805298, + "epoch": 0.012653677475016176, + "kl_loss": 3284.989501953125, + "loss_ib": 32.911800384521484, + "step": 44 + }, + { + "ce_ib": 62.566444396972656, + "ce_orig": 0.9737301468849182, + "epoch": 0.012653677475016176, + "kl_loss": 4007.912841796875, + "loss_ib": 40.141693115234375, + "step": 44 + }, + { + "ce_ib": 62.55556869506836, + "ce_orig": 0.9186174273490906, + "epoch": 0.012653677475016176, + "kl_loss": 3678.3505859375, + "loss_ib": 36.84606170654297, + "step": 44 + }, + { + "epoch": 0.012941261053993817, + "grad_norm": 492.6309509277344, + "learning_rate": 1.2420382165605097e-06, + "loss": 37.2694, + "step": 45 + }, + { + "ce_ib": 63.57815170288086, + "ce_orig": 1.3040772676467896, + "epoch": 0.012941261053993817, + "kl_loss": 3878.02587890625, + "loss_ib": 38.84383773803711, + "step": 45 + }, + { + "ce_ib": 61.98274230957031, + "ce_orig": 0.6795246601104736, + "epoch": 0.012941261053993817, + "kl_loss": 3593.56640625, + "loss_ib": 35.99764633178711, + "step": 45 + }, + { + "ce_ib": 63.4442138671875, + "ce_orig": 0.9516732692718506, + "epoch": 0.012941261053993817, + "kl_loss": 3920.56982421875, + "loss_ib": 39.269142150878906, + "step": 45 + }, + { + "ce_ib": 64.35120391845703, + "ce_orig": 0.7613200545310974, + "epoch": 0.012941261053993817, + "kl_loss": 3579.57763671875, + "loss_ib": 35.86012649536133, + "step": 45 + }, + { + "ce_ib": 62.913352966308594, + "ce_orig": 1.0408005714416504, + "epoch": 0.013228844632971457, + "kl_loss": 3620.77099609375, + "loss_ib": 36.27062225341797, + "step": 46 + }, + { + "ce_ib": 63.58440399169922, + "ce_orig": 0.8055190443992615, + "epoch": 0.013228844632971457, + "kl_loss": 3386.48876953125, + "loss_ib": 33.928470611572266, + "step": 46 + }, + { + "ce_ib": 63.420753479003906, + "ce_orig": 1.1024976968765259, + "epoch": 0.013228844632971457, + "kl_loss": 3643.9453125, + "loss_ib": 36.502872467041016, + "step": 46 + }, + { + "ce_ib": 62.746665954589844, + "ce_orig": 0.7064395546913147, + "epoch": 0.013228844632971457, + "kl_loss": 3931.369873046875, + "loss_ib": 39.37644577026367, + "step": 46 + }, + { + "ce_ib": 63.00592041015625, + "ce_orig": 0.8414040803909302, + "epoch": 0.013516428211949098, + "kl_loss": 4138.1728515625, + "loss_ib": 41.44473648071289, + "step": 47 + }, + { + "ce_ib": 66.07843017578125, + "ce_orig": 0.8475580811500549, + "epoch": 0.013516428211949098, + "kl_loss": 3865.19287109375, + "loss_ib": 38.7180061340332, + "step": 47 + }, + { + "ce_ib": 63.92705535888672, + "ce_orig": 0.9875443577766418, + "epoch": 0.013516428211949098, + "kl_loss": 4158.44189453125, + "loss_ib": 41.64834213256836, + "step": 47 + }, + { + "ce_ib": 68.14057922363281, + "ce_orig": 1.756430745124817, + "epoch": 0.013516428211949098, + "kl_loss": 3738.97314453125, + "loss_ib": 37.45787048339844, + "step": 47 + }, + { + "ce_ib": 63.9988899230957, + "ce_orig": 0.8397009968757629, + "epoch": 0.013804011790926739, + "kl_loss": 3744.52294921875, + "loss_ib": 37.50922775268555, + "step": 48 + }, + { + "ce_ib": 64.73321533203125, + "ce_orig": 1.5420986413955688, + "epoch": 0.013804011790926739, + "kl_loss": 3818.59228515625, + "loss_ib": 38.25065612792969, + "step": 48 + }, + { + "ce_ib": 64.00019073486328, + "ce_orig": 0.5949701070785522, + "epoch": 0.013804011790926739, + "kl_loss": 3568.99609375, + "loss_ib": 35.75395965576172, + "step": 48 + }, + { + "ce_ib": 64.06549072265625, + "ce_orig": 1.3993630409240723, + "epoch": 0.013804011790926739, + "kl_loss": 2974.96728515625, + "loss_ib": 29.813737869262695, + "step": 48 + }, + { + "ce_ib": 64.43647766113281, + "ce_orig": 1.1406134366989136, + "epoch": 0.014091595369904378, + "kl_loss": 3809.33447265625, + "loss_ib": 38.157779693603516, + "step": 49 + }, + { + "ce_ib": 61.58470153808594, + "ce_orig": 0.76979660987854, + "epoch": 0.014091595369904378, + "kl_loss": 4037.28759765625, + "loss_ib": 40.4344596862793, + "step": 49 + }, + { + "ce_ib": 62.92927551269531, + "ce_orig": 1.3724863529205322, + "epoch": 0.014091595369904378, + "kl_loss": 3676.944580078125, + "loss_ib": 36.832374572753906, + "step": 49 + }, + { + "ce_ib": 62.875492095947266, + "ce_orig": 0.6223806142807007, + "epoch": 0.014091595369904378, + "kl_loss": 3579.248046875, + "loss_ib": 35.85535430908203, + "step": 49 + }, + { + "epoch": 0.01437917894888202, + "grad_norm": 533.30029296875, + "learning_rate": 1.4012738853503185e-06, + "loss": 37.7487, + "step": 50 + }, + { + "ce_ib": 61.585453033447266, + "ce_orig": 0.8775674104690552, + "epoch": 0.01437917894888202, + "kl_loss": 3878.78076171875, + "loss_ib": 38.84939193725586, + "step": 50 + }, + { + "ce_ib": 58.95383834838867, + "ce_orig": 0.7002028822898865, + "epoch": 0.01437917894888202, + "kl_loss": 3601.2216796875, + "loss_ib": 36.0711669921875, + "step": 50 + }, + { + "ce_ib": 62.58400344848633, + "ce_orig": 0.7227221131324768, + "epoch": 0.01437917894888202, + "kl_loss": 3643.05615234375, + "loss_ib": 36.49314498901367, + "step": 50 + }, + { + "ce_ib": 66.29362487792969, + "ce_orig": 1.0485941171646118, + "epoch": 0.01437917894888202, + "kl_loss": 3717.5888671875, + "loss_ib": 37.242183685302734, + "step": 50 + }, + { + "ce_ib": 61.591148376464844, + "ce_orig": 0.6134757995605469, + "epoch": 0.014666762527859659, + "kl_loss": 4068.88330078125, + "loss_ib": 40.750423431396484, + "step": 51 + }, + { + "ce_ib": 64.17617797851562, + "ce_orig": 1.2959325313568115, + "epoch": 0.014666762527859659, + "kl_loss": 3517.951171875, + "loss_ib": 35.24368667602539, + "step": 51 + }, + { + "ce_ib": 61.61008834838867, + "ce_orig": 0.6165804862976074, + "epoch": 0.014666762527859659, + "kl_loss": 4074.12744140625, + "loss_ib": 40.80288314819336, + "step": 51 + }, + { + "ce_ib": 63.430118560791016, + "ce_orig": 1.3455349206924438, + "epoch": 0.014666762527859659, + "kl_loss": 3925.92333984375, + "loss_ib": 39.322662353515625, + "step": 51 + }, + { + "ce_ib": 63.63911819458008, + "ce_orig": 0.813752293586731, + "epoch": 0.0149543461068373, + "kl_loss": 3411.76806640625, + "loss_ib": 34.18132019042969, + "step": 52 + }, + { + "ce_ib": 62.01913833618164, + "ce_orig": 0.9041391611099243, + "epoch": 0.0149543461068373, + "kl_loss": 3552.44091796875, + "loss_ib": 35.58642578125, + "step": 52 + }, + { + "ce_ib": 63.11591339111328, + "ce_orig": 0.8099521994590759, + "epoch": 0.0149543461068373, + "kl_loss": 3363.9013671875, + "loss_ib": 33.70212936401367, + "step": 52 + }, + { + "ce_ib": 63.846641540527344, + "ce_orig": 1.0799516439437866, + "epoch": 0.0149543461068373, + "kl_loss": 3810.20947265625, + "loss_ib": 38.16594314575195, + "step": 52 + }, + { + "ce_ib": 63.17069625854492, + "ce_orig": 1.2767831087112427, + "epoch": 0.015241929685814939, + "kl_loss": 3672.79248046875, + "loss_ib": 36.79109573364258, + "step": 53 + }, + { + "ce_ib": 60.14902877807617, + "ce_orig": 0.5526849627494812, + "epoch": 0.015241929685814939, + "kl_loss": 2977.911376953125, + "loss_ib": 29.839262008666992, + "step": 53 + }, + { + "ce_ib": 63.31485366821289, + "ce_orig": 0.7787724137306213, + "epoch": 0.015241929685814939, + "kl_loss": 3514.63232421875, + "loss_ib": 35.20963668823242, + "step": 53 + }, + { + "ce_ib": 64.56353759765625, + "ce_orig": 1.6204540729522705, + "epoch": 0.015241929685814939, + "kl_loss": 3812.165771484375, + "loss_ib": 38.18621826171875, + "step": 53 + }, + { + "ce_ib": 66.47161865234375, + "ce_orig": 1.488782525062561, + "epoch": 0.01552951326479258, + "kl_loss": 3656.353759765625, + "loss_ib": 36.630008697509766, + "step": 54 + }, + { + "ce_ib": 63.27266311645508, + "ce_orig": 1.2602483034133911, + "epoch": 0.01552951326479258, + "kl_loss": 3836.474609375, + "loss_ib": 38.428016662597656, + "step": 54 + }, + { + "ce_ib": 61.19478225708008, + "ce_orig": 0.9387843608856201, + "epoch": 0.01552951326479258, + "kl_loss": 3736.385009765625, + "loss_ib": 37.425045013427734, + "step": 54 + }, + { + "ce_ib": 62.278865814208984, + "ce_orig": 0.6224288940429688, + "epoch": 0.01552951326479258, + "kl_loss": 3810.37646484375, + "loss_ib": 38.16604232788086, + "step": 54 + }, + { + "epoch": 0.01581709684377022, + "grad_norm": 496.5709533691406, + "learning_rate": 1.5605095541401275e-06, + "loss": 37.2354, + "step": 55 + }, + { + "ce_ib": 64.26878356933594, + "ce_orig": 0.8660982847213745, + "epoch": 0.01581709684377022, + "kl_loss": 3508.06201171875, + "loss_ib": 35.1448860168457, + "step": 55 + }, + { + "ce_ib": 58.63752365112305, + "ce_orig": 0.08659573644399643, + "epoch": 0.01581709684377022, + "kl_loss": 513.92724609375, + "loss_ib": 5.197909832000732, + "step": 55 + }, + { + "ce_ib": 61.785953521728516, + "ce_orig": 0.9901527762413025, + "epoch": 0.01581709684377022, + "kl_loss": 3603.44580078125, + "loss_ib": 36.09624481201172, + "step": 55 + }, + { + "ce_ib": 64.46088409423828, + "ce_orig": 0.8389644026756287, + "epoch": 0.01581709684377022, + "kl_loss": 3921.08642578125, + "loss_ib": 39.27532196044922, + "step": 55 + }, + { + "ce_ib": 60.15986251831055, + "ce_orig": 0.8044717311859131, + "epoch": 0.01610468042274786, + "kl_loss": 3653.1875, + "loss_ib": 36.59203338623047, + "step": 56 + }, + { + "ce_ib": 62.024410247802734, + "ce_orig": 0.6691257953643799, + "epoch": 0.01610468042274786, + "kl_loss": 4127.29736328125, + "loss_ib": 41.334999084472656, + "step": 56 + }, + { + "ce_ib": 62.78162384033203, + "ce_orig": 0.9230839014053345, + "epoch": 0.01610468042274786, + "kl_loss": 3697.928955078125, + "loss_ib": 37.04206848144531, + "step": 56 + }, + { + "ce_ib": 64.67729949951172, + "ce_orig": 1.3531347513198853, + "epoch": 0.01610468042274786, + "kl_loss": 3432.21728515625, + "loss_ib": 34.38684844970703, + "step": 56 + }, + { + "ce_ib": 61.56424331665039, + "ce_orig": 0.8097767233848572, + "epoch": 0.016392264001725502, + "kl_loss": 3895.91943359375, + "loss_ib": 39.02075958251953, + "step": 57 + }, + { + "ce_ib": 62.23572540283203, + "ce_orig": 0.5408704876899719, + "epoch": 0.016392264001725502, + "kl_loss": 3602.846923828125, + "loss_ib": 36.09070587158203, + "step": 57 + }, + { + "ce_ib": 63.37240982055664, + "ce_orig": 0.7751593589782715, + "epoch": 0.016392264001725502, + "kl_loss": 3264.66650390625, + "loss_ib": 32.71003723144531, + "step": 57 + }, + { + "ce_ib": 64.54995727539062, + "ce_orig": 1.2101812362670898, + "epoch": 0.016392264001725502, + "kl_loss": 3891.133544921875, + "loss_ib": 38.97588348388672, + "step": 57 + }, + { + "ce_ib": 64.34214782714844, + "ce_orig": 1.1633491516113281, + "epoch": 0.016679847580703143, + "kl_loss": 3507.682861328125, + "loss_ib": 35.141170501708984, + "step": 58 + }, + { + "ce_ib": 64.65531921386719, + "ce_orig": 1.0707935094833374, + "epoch": 0.016679847580703143, + "kl_loss": 3560.9189453125, + "loss_ib": 35.67384338378906, + "step": 58 + }, + { + "ce_ib": 62.398475646972656, + "ce_orig": 1.028975009918213, + "epoch": 0.016679847580703143, + "kl_loss": 3907.4013671875, + "loss_ib": 39.136409759521484, + "step": 58 + }, + { + "ce_ib": 62.34195327758789, + "ce_orig": 1.025146484375, + "epoch": 0.016679847580703143, + "kl_loss": 3447.82470703125, + "loss_ib": 34.54058837890625, + "step": 58 + }, + { + "ce_ib": 64.7926254272461, + "ce_orig": 1.7371116876602173, + "epoch": 0.01696743115968078, + "kl_loss": 3368.210205078125, + "loss_ib": 33.74689483642578, + "step": 59 + }, + { + "ce_ib": 63.827476501464844, + "ce_orig": 1.2639371156692505, + "epoch": 0.01696743115968078, + "kl_loss": 3764.4599609375, + "loss_ib": 37.70842742919922, + "step": 59 + }, + { + "ce_ib": 60.48318099975586, + "ce_orig": 0.5967444181442261, + "epoch": 0.01696743115968078, + "kl_loss": 3661.50927734375, + "loss_ib": 36.675575256347656, + "step": 59 + }, + { + "ce_ib": 62.90448760986328, + "ce_orig": 0.8884239792823792, + "epoch": 0.01696743115968078, + "kl_loss": 3642.8115234375, + "loss_ib": 36.49102020263672, + "step": 59 + }, + { + "epoch": 0.01725501473865842, + "grad_norm": 521.4768676757812, + "learning_rate": 1.7197452229299363e-06, + "loss": 37.203, + "step": 60 + }, + { + "ce_ib": 62.17192077636719, + "ce_orig": 0.7363674640655518, + "epoch": 0.01725501473865842, + "kl_loss": 3631.49609375, + "loss_ib": 36.377132415771484, + "step": 60 + }, + { + "ce_ib": 63.342933654785156, + "ce_orig": 1.0785236358642578, + "epoch": 0.01725501473865842, + "kl_loss": 3575.03369140625, + "loss_ib": 35.81367874145508, + "step": 60 + }, + { + "ce_ib": 61.84444046020508, + "ce_orig": 0.8591632843017578, + "epoch": 0.01725501473865842, + "kl_loss": 3692.08203125, + "loss_ib": 36.982662200927734, + "step": 60 + }, + { + "ce_ib": 63.40077209472656, + "ce_orig": 1.3737884759902954, + "epoch": 0.01725501473865842, + "kl_loss": 3226.7734375, + "loss_ib": 32.33113479614258, + "step": 60 + }, + { + "ce_ib": 60.77010726928711, + "ce_orig": 0.5792077779769897, + "epoch": 0.017542598317636063, + "kl_loss": 3807.59423828125, + "loss_ib": 38.13671112060547, + "step": 61 + }, + { + "ce_ib": 65.03406524658203, + "ce_orig": 1.6079394817352295, + "epoch": 0.017542598317636063, + "kl_loss": 3764.595703125, + "loss_ib": 37.71099090576172, + "step": 61 + }, + { + "ce_ib": 62.88136672973633, + "ce_orig": 0.9871428608894348, + "epoch": 0.017542598317636063, + "kl_loss": 3831.7412109375, + "loss_ib": 38.38029098510742, + "step": 61 + }, + { + "ce_ib": 62.55516815185547, + "ce_orig": 0.7838013172149658, + "epoch": 0.017542598317636063, + "kl_loss": 3860.99755859375, + "loss_ib": 38.67253112792969, + "step": 61 + }, + { + "ce_ib": 66.16559600830078, + "ce_orig": 1.8575478792190552, + "epoch": 0.017830181896613704, + "kl_loss": 3639.782958984375, + "loss_ib": 36.463993072509766, + "step": 62 + }, + { + "ce_ib": 59.800865173339844, + "ce_orig": 0.7499110698699951, + "epoch": 0.017830181896613704, + "kl_loss": 2505.564453125, + "loss_ib": 25.11544418334961, + "step": 62 + }, + { + "ce_ib": 64.14867401123047, + "ce_orig": 1.4294443130493164, + "epoch": 0.017830181896613704, + "kl_loss": 3695.159423828125, + "loss_ib": 37.015743255615234, + "step": 62 + }, + { + "ce_ib": 59.92710494995117, + "ce_orig": 0.7166628241539001, + "epoch": 0.017830181896613704, + "kl_loss": 3911.38427734375, + "loss_ib": 39.17376708984375, + "step": 62 + }, + { + "ce_ib": 64.37249755859375, + "ce_orig": 1.247324824333191, + "epoch": 0.018117765475591345, + "kl_loss": 3376.881103515625, + "loss_ib": 33.83318328857422, + "step": 63 + }, + { + "ce_ib": 62.949920654296875, + "ce_orig": 0.5989828705787659, + "epoch": 0.018117765475591345, + "kl_loss": 3794.5458984375, + "loss_ib": 38.00840759277344, + "step": 63 + }, + { + "ce_ib": 65.01542663574219, + "ce_orig": 1.4691771268844604, + "epoch": 0.018117765475591345, + "kl_loss": 3403.2939453125, + "loss_ib": 34.09795379638672, + "step": 63 + }, + { + "ce_ib": 61.29307556152344, + "ce_orig": 0.7313998937606812, + "epoch": 0.018117765475591345, + "kl_loss": 3450.58984375, + "loss_ib": 34.56719207763672, + "step": 63 + }, + { + "ce_ib": 60.609066009521484, + "ce_orig": 0.7843332886695862, + "epoch": 0.018405349054568983, + "kl_loss": 3731.23095703125, + "loss_ib": 37.37291717529297, + "step": 64 + }, + { + "ce_ib": 62.530723571777344, + "ce_orig": 0.6766409873962402, + "epoch": 0.018405349054568983, + "kl_loss": 3585.4892578125, + "loss_ib": 35.917423248291016, + "step": 64 + }, + { + "ce_ib": 59.86486053466797, + "ce_orig": 0.5166366696357727, + "epoch": 0.018405349054568983, + "kl_loss": 3537.461181640625, + "loss_ib": 35.43447494506836, + "step": 64 + }, + { + "ce_ib": 62.64058303833008, + "ce_orig": 0.9777031540870667, + "epoch": 0.018405349054568983, + "kl_loss": 3730.766845703125, + "loss_ib": 37.37030792236328, + "step": 64 + }, + { + "epoch": 0.018692932633546624, + "grad_norm": 518.4480590820312, + "learning_rate": 1.8789808917197455e-06, + "loss": 36.8274, + "step": 65 + }, + { + "ce_ib": 62.54412078857422, + "ce_orig": 0.7984204888343811, + "epoch": 0.018692932633546624, + "kl_loss": 3530.49755859375, + "loss_ib": 35.36751937866211, + "step": 65 + }, + { + "ce_ib": 60.793087005615234, + "ce_orig": 0.7615864276885986, + "epoch": 0.018692932633546624, + "kl_loss": 3840.51513671875, + "loss_ib": 38.4659423828125, + "step": 65 + }, + { + "ce_ib": 60.25053787231445, + "ce_orig": 0.9209924340248108, + "epoch": 0.018692932633546624, + "kl_loss": 3897.0556640625, + "loss_ib": 39.03080749511719, + "step": 65 + }, + { + "ce_ib": 63.16765594482422, + "ce_orig": 1.3678812980651855, + "epoch": 0.018692932633546624, + "kl_loss": 3938.2587890625, + "loss_ib": 39.44575500488281, + "step": 65 + }, + { + "ce_ib": 65.65672302246094, + "ce_orig": 1.016355037689209, + "epoch": 0.018980516212524265, + "kl_loss": 3797.2421875, + "loss_ib": 38.03807830810547, + "step": 66 + }, + { + "ce_ib": 61.379974365234375, + "ce_orig": 1.0548150539398193, + "epoch": 0.018980516212524265, + "kl_loss": 3564.20751953125, + "loss_ib": 35.703453063964844, + "step": 66 + }, + { + "ce_ib": 62.37001419067383, + "ce_orig": 0.9475827217102051, + "epoch": 0.018980516212524265, + "kl_loss": 3571.18115234375, + "loss_ib": 35.7741813659668, + "step": 66 + }, + { + "ce_ib": 60.9146728515625, + "ce_orig": 0.784809947013855, + "epoch": 0.018980516212524265, + "kl_loss": 3582.17822265625, + "loss_ib": 35.88269805908203, + "step": 66 + }, + { + "ce_ib": 63.815486907958984, + "ce_orig": 1.3292514085769653, + "epoch": 0.019268099791501906, + "kl_loss": 3440.73779296875, + "loss_ib": 34.47119140625, + "step": 67 + }, + { + "ce_ib": 62.98185348510742, + "ce_orig": 0.7911183834075928, + "epoch": 0.019268099791501906, + "kl_loss": 4040.31640625, + "loss_ib": 40.46614456176758, + "step": 67 + }, + { + "ce_ib": 65.0182876586914, + "ce_orig": 0.7907848358154297, + "epoch": 0.019268099791501906, + "kl_loss": 3936.2587890625, + "loss_ib": 39.42760467529297, + "step": 67 + }, + { + "ce_ib": 63.88326644897461, + "ce_orig": 1.092854380607605, + "epoch": 0.019268099791501906, + "kl_loss": 1859.8902587890625, + "loss_ib": 18.662784576416016, + "step": 67 + }, + { + "ce_ib": 61.63788604736328, + "ce_orig": 0.5948306322097778, + "epoch": 0.019555683370479547, + "kl_loss": 3881.165771484375, + "loss_ib": 38.873294830322266, + "step": 68 + }, + { + "ce_ib": 60.62575912475586, + "ce_orig": 0.5677074193954468, + "epoch": 0.019555683370479547, + "kl_loss": 3741.05078125, + "loss_ib": 37.47113037109375, + "step": 68 + }, + { + "ce_ib": 60.886016845703125, + "ce_orig": 0.5185374617576599, + "epoch": 0.019555683370479547, + "kl_loss": 3542.03857421875, + "loss_ib": 35.48126983642578, + "step": 68 + }, + { + "ce_ib": 60.4649543762207, + "ce_orig": 0.4317881166934967, + "epoch": 0.019555683370479547, + "kl_loss": 3231.02587890625, + "loss_ib": 32.370723724365234, + "step": 68 + }, + { + "ce_ib": 63.6849365234375, + "ce_orig": 1.51223886013031, + "epoch": 0.019843266949457185, + "kl_loss": 3498.36083984375, + "loss_ib": 35.04729080200195, + "step": 69 + }, + { + "ce_ib": 63.855262756347656, + "ce_orig": 2.3971757888793945, + "epoch": 0.019843266949457185, + "kl_loss": 3531.71630859375, + "loss_ib": 35.38101577758789, + "step": 69 + }, + { + "ce_ib": 61.03330993652344, + "ce_orig": 1.2093490362167358, + "epoch": 0.019843266949457185, + "kl_loss": 3718.281494140625, + "loss_ib": 37.24384689331055, + "step": 69 + }, + { + "ce_ib": 64.49278259277344, + "ce_orig": 1.474419355392456, + "epoch": 0.019843266949457185, + "kl_loss": 3354.119140625, + "loss_ib": 33.605682373046875, + "step": 69 + }, + { + "epoch": 0.020130850528434826, + "grad_norm": 502.4320068359375, + "learning_rate": 2.0382165605095544e-06, + "loss": 36.6694, + "step": 70 + }, + { + "ce_ib": 62.60323715209961, + "ce_orig": 0.8422316312789917, + "epoch": 0.020130850528434826, + "kl_loss": 3720.64892578125, + "loss_ib": 37.26909255981445, + "step": 70 + }, + { + "ce_ib": 61.0345573425293, + "ce_orig": 0.4435622990131378, + "epoch": 0.020130850528434826, + "kl_loss": 3550.4306640625, + "loss_ib": 35.56534194946289, + "step": 70 + }, + { + "ce_ib": 62.675987243652344, + "ce_orig": 1.0512957572937012, + "epoch": 0.020130850528434826, + "kl_loss": 3372.498046875, + "loss_ib": 33.787654876708984, + "step": 70 + }, + { + "ce_ib": 63.02972412109375, + "ce_orig": 0.8881096839904785, + "epoch": 0.020130850528434826, + "kl_loss": 3634.5400390625, + "loss_ib": 36.40842819213867, + "step": 70 + }, + { + "ce_ib": 61.19450759887695, + "ce_orig": 0.8308007717132568, + "epoch": 0.020418434107412467, + "kl_loss": 3668.546875, + "loss_ib": 36.74666213989258, + "step": 71 + }, + { + "ce_ib": 62.893211364746094, + "ce_orig": 0.6296738982200623, + "epoch": 0.020418434107412467, + "kl_loss": 3531.10107421875, + "loss_ib": 35.3739013671875, + "step": 71 + }, + { + "ce_ib": 61.59178924560547, + "ce_orig": 0.9227543473243713, + "epoch": 0.020418434107412467, + "kl_loss": 3549.591552734375, + "loss_ib": 35.5575065612793, + "step": 71 + }, + { + "ce_ib": 60.22856140136719, + "ce_orig": 0.6385669708251953, + "epoch": 0.020418434107412467, + "kl_loss": 3632.27978515625, + "loss_ib": 36.383026123046875, + "step": 71 + }, + { + "ce_ib": 64.18019104003906, + "ce_orig": 1.6183723211288452, + "epoch": 0.020706017686390108, + "kl_loss": 3154.05224609375, + "loss_ib": 31.60470199584961, + "step": 72 + }, + { + "ce_ib": 60.683284759521484, + "ce_orig": 0.7608581185340881, + "epoch": 0.020706017686390108, + "kl_loss": 3858.5693359375, + "loss_ib": 38.64637756347656, + "step": 72 + }, + { + "ce_ib": 60.50144577026367, + "ce_orig": 0.7999545335769653, + "epoch": 0.020706017686390108, + "kl_loss": 3660.2255859375, + "loss_ib": 36.66275405883789, + "step": 72 + }, + { + "ce_ib": 61.18289566040039, + "ce_orig": 1.1362156867980957, + "epoch": 0.020706017686390108, + "kl_loss": 3251.9755859375, + "loss_ib": 32.58094024658203, + "step": 72 + }, + { + "ce_ib": 61.33281326293945, + "ce_orig": 0.6669225096702576, + "epoch": 0.02099360126536775, + "kl_loss": 3684.797119140625, + "loss_ib": 36.9093017578125, + "step": 73 + }, + { + "ce_ib": 59.9718017578125, + "ce_orig": 0.9883142113685608, + "epoch": 0.02099360126536775, + "kl_loss": 3631.879638671875, + "loss_ib": 36.37876510620117, + "step": 73 + }, + { + "ce_ib": 63.71513748168945, + "ce_orig": 1.3611314296722412, + "epoch": 0.02099360126536775, + "kl_loss": 3757.900146484375, + "loss_ib": 37.64271545410156, + "step": 73 + }, + { + "ce_ib": 61.1561164855957, + "ce_orig": 0.8598636388778687, + "epoch": 0.02099360126536775, + "kl_loss": 3841.884033203125, + "loss_ib": 38.47999572753906, + "step": 73 + }, + { + "ce_ib": 61.85321044921875, + "ce_orig": 0.8115242719650269, + "epoch": 0.021281184844345387, + "kl_loss": 3850.6259765625, + "loss_ib": 38.568111419677734, + "step": 74 + }, + { + "ce_ib": 58.246185302734375, + "ce_orig": 0.5221734046936035, + "epoch": 0.021281184844345387, + "kl_loss": 3487.754150390625, + "loss_ib": 34.935787200927734, + "step": 74 + }, + { + "ce_ib": 61.469078063964844, + "ce_orig": 0.7319106459617615, + "epoch": 0.021281184844345387, + "kl_loss": 4135.6591796875, + "loss_ib": 41.418060302734375, + "step": 74 + }, + { + "ce_ib": 61.441585540771484, + "ce_orig": 0.9860128164291382, + "epoch": 0.021281184844345387, + "kl_loss": 3332.26318359375, + "loss_ib": 33.384071350097656, + "step": 74 + }, + { + "epoch": 0.021568768423323028, + "grad_norm": 524.3435668945312, + "learning_rate": 2.1974522292993634e-06, + "loss": 36.7901, + "step": 75 + }, + { + "ce_ib": 60.86553192138672, + "ce_orig": 0.6997863054275513, + "epoch": 0.021568768423323028, + "kl_loss": 3372.901123046875, + "loss_ib": 33.789878845214844, + "step": 75 + }, + { + "ce_ib": 62.65220260620117, + "ce_orig": 1.4506391286849976, + "epoch": 0.021568768423323028, + "kl_loss": 3574.6826171875, + "loss_ib": 35.80947494506836, + "step": 75 + }, + { + "ce_ib": 61.273311614990234, + "ce_orig": 0.7289857864379883, + "epoch": 0.021568768423323028, + "kl_loss": 3926.74658203125, + "loss_ib": 39.328739166259766, + "step": 75 + }, + { + "ce_ib": 60.932491302490234, + "ce_orig": 0.6170489192008972, + "epoch": 0.021568768423323028, + "kl_loss": 3798.67724609375, + "loss_ib": 38.04770278930664, + "step": 75 + }, + { + "ce_ib": 60.94731140136719, + "ce_orig": 1.1426844596862793, + "epoch": 0.02185635200230067, + "kl_loss": 3991.97900390625, + "loss_ib": 39.980735778808594, + "step": 76 + }, + { + "ce_ib": 59.70903396606445, + "ce_orig": 0.753753125667572, + "epoch": 0.02185635200230067, + "kl_loss": 3774.1162109375, + "loss_ib": 37.80086898803711, + "step": 76 + }, + { + "ce_ib": 60.583736419677734, + "ce_orig": 1.2148487567901611, + "epoch": 0.02185635200230067, + "kl_loss": 3596.19140625, + "loss_ib": 36.02249526977539, + "step": 76 + }, + { + "ce_ib": 61.524559020996094, + "ce_orig": 1.4590885639190674, + "epoch": 0.02185635200230067, + "kl_loss": 4056.229248046875, + "loss_ib": 40.62381362915039, + "step": 76 + }, + { + "ce_ib": 60.858001708984375, + "ce_orig": 1.2687323093414307, + "epoch": 0.02214393558127831, + "kl_loss": 3721.635009765625, + "loss_ib": 37.27720642089844, + "step": 77 + }, + { + "ce_ib": 60.991886138916016, + "ce_orig": 0.7511414289474487, + "epoch": 0.02214393558127831, + "kl_loss": 3765.05615234375, + "loss_ib": 37.711551666259766, + "step": 77 + }, + { + "ce_ib": 63.26270294189453, + "ce_orig": 1.4974795579910278, + "epoch": 0.02214393558127831, + "kl_loss": 3476.8515625, + "loss_ib": 34.8317756652832, + "step": 77 + }, + { + "ce_ib": 60.98178482055664, + "ce_orig": 0.8308293223381042, + "epoch": 0.02214393558127831, + "kl_loss": 3436.35009765625, + "loss_ib": 34.42448043823242, + "step": 77 + }, + { + "ce_ib": 60.006534576416016, + "ce_orig": 0.7794530391693115, + "epoch": 0.022431519160255948, + "kl_loss": 3976.89306640625, + "loss_ib": 39.82893753051758, + "step": 78 + }, + { + "ce_ib": 60.324668884277344, + "ce_orig": 1.0577147006988525, + "epoch": 0.022431519160255948, + "kl_loss": 3699.833984375, + "loss_ib": 37.05866241455078, + "step": 78 + }, + { + "ce_ib": 63.445526123046875, + "ce_orig": 1.2055957317352295, + "epoch": 0.022431519160255948, + "kl_loss": 3332.263671875, + "loss_ib": 33.38608169555664, + "step": 78 + }, + { + "ce_ib": 62.043052673339844, + "ce_orig": 1.2299708127975464, + "epoch": 0.022431519160255948, + "kl_loss": 3498.40966796875, + "loss_ib": 35.046138763427734, + "step": 78 + }, + { + "ce_ib": 62.630794525146484, + "ce_orig": 1.6087855100631714, + "epoch": 0.02271910273923359, + "kl_loss": 3403.677490234375, + "loss_ib": 34.09940719604492, + "step": 79 + }, + { + "ce_ib": 61.70748519897461, + "ce_orig": 1.395445466041565, + "epoch": 0.02271910273923359, + "kl_loss": 3506.6875, + "loss_ib": 35.12858200073242, + "step": 79 + }, + { + "ce_ib": 60.67852020263672, + "ce_orig": 0.8778415322303772, + "epoch": 0.02271910273923359, + "kl_loss": 3626.2841796875, + "loss_ib": 36.32352066040039, + "step": 79 + }, + { + "ce_ib": 62.01763153076172, + "ce_orig": 1.1855616569519043, + "epoch": 0.02271910273923359, + "kl_loss": 3562.23974609375, + "loss_ib": 35.68441390991211, + "step": 79 + }, + { + "epoch": 0.02300668631821123, + "grad_norm": 495.8802795410156, + "learning_rate": 2.356687898089172e-06, + "loss": 36.0833, + "step": 80 + }, + { + "ce_ib": 61.64104080200195, + "ce_orig": 1.086162805557251, + "epoch": 0.02300668631821123, + "kl_loss": 3640.07470703125, + "loss_ib": 36.46238708496094, + "step": 80 + }, + { + "ce_ib": 61.57204055786133, + "ce_orig": 0.9259805083274841, + "epoch": 0.02300668631821123, + "kl_loss": 3893.42578125, + "loss_ib": 38.99583053588867, + "step": 80 + }, + { + "ce_ib": 66.0667953491211, + "ce_orig": 1.7032816410064697, + "epoch": 0.02300668631821123, + "kl_loss": 3534.810546875, + "loss_ib": 35.4141731262207, + "step": 80 + }, + { + "ce_ib": 59.70638656616211, + "ce_orig": 0.9606295228004456, + "epoch": 0.02300668631821123, + "kl_loss": 3836.31494140625, + "loss_ib": 38.422855377197266, + "step": 80 + }, + { + "ce_ib": 62.870216369628906, + "ce_orig": 1.2989033460617065, + "epoch": 0.02329426989718887, + "kl_loss": 3637.41015625, + "loss_ib": 36.436973571777344, + "step": 81 + }, + { + "ce_ib": 62.709312438964844, + "ce_orig": 1.103994369506836, + "epoch": 0.02329426989718887, + "kl_loss": 3570.747802734375, + "loss_ib": 35.77018737792969, + "step": 81 + }, + { + "ce_ib": 61.99235916137695, + "ce_orig": 1.6946630477905273, + "epoch": 0.02329426989718887, + "kl_loss": 3532.9951171875, + "loss_ib": 35.39194107055664, + "step": 81 + }, + { + "ce_ib": 60.915489196777344, + "ce_orig": 0.9102914333343506, + "epoch": 0.02329426989718887, + "kl_loss": 3406.8291015625, + "loss_ib": 34.129207611083984, + "step": 81 + }, + { + "ce_ib": 59.30317306518555, + "ce_orig": 0.6368826031684875, + "epoch": 0.023581853476166512, + "kl_loss": 3712.42236328125, + "loss_ib": 37.183528900146484, + "step": 82 + }, + { + "ce_ib": 60.09833526611328, + "ce_orig": 0.9400615096092224, + "epoch": 0.023581853476166512, + "kl_loss": 3772.23291015625, + "loss_ib": 37.78242492675781, + "step": 82 + }, + { + "ce_ib": 59.77298355102539, + "ce_orig": 1.1513607501983643, + "epoch": 0.023581853476166512, + "kl_loss": 3608.817138671875, + "loss_ib": 36.147945404052734, + "step": 82 + }, + { + "ce_ib": 61.4395637512207, + "ce_orig": 0.8887077569961548, + "epoch": 0.023581853476166512, + "kl_loss": 3723.0693359375, + "loss_ib": 37.29213333129883, + "step": 82 + }, + { + "ce_ib": 59.11687469482422, + "ce_orig": 1.0084015130996704, + "epoch": 0.02386943705514415, + "kl_loss": 3634.8994140625, + "loss_ib": 36.40810775756836, + "step": 83 + }, + { + "ce_ib": 61.65114212036133, + "ce_orig": 1.2165615558624268, + "epoch": 0.02386943705514415, + "kl_loss": 3236.768310546875, + "loss_ib": 32.4293327331543, + "step": 83 + }, + { + "ce_ib": 61.245872497558594, + "ce_orig": 0.9978702068328857, + "epoch": 0.02386943705514415, + "kl_loss": 3855.5927734375, + "loss_ib": 38.61717224121094, + "step": 83 + }, + { + "ce_ib": 64.20453643798828, + "ce_orig": 1.7128099203109741, + "epoch": 0.02386943705514415, + "kl_loss": 3634.4951171875, + "loss_ib": 36.40915298461914, + "step": 83 + }, + { + "ce_ib": 64.26237487792969, + "ce_orig": 1.0946186780929565, + "epoch": 0.02415702063412179, + "kl_loss": 3485.233642578125, + "loss_ib": 34.91659927368164, + "step": 84 + }, + { + "ce_ib": 60.3849983215332, + "ce_orig": 0.9367128610610962, + "epoch": 0.02415702063412179, + "kl_loss": 3541.22021484375, + "loss_ib": 35.47258758544922, + "step": 84 + }, + { + "ce_ib": 58.71778869628906, + "ce_orig": 0.7141556143760681, + "epoch": 0.02415702063412179, + "kl_loss": 3839.89599609375, + "loss_ib": 38.45767593383789, + "step": 84 + }, + { + "ce_ib": 59.888084411621094, + "ce_orig": 0.76758873462677, + "epoch": 0.02415702063412179, + "kl_loss": 3747.78759765625, + "loss_ib": 37.53776168823242, + "step": 84 + }, + { + "epoch": 0.024444604213099432, + "grad_norm": 524.7237548828125, + "learning_rate": 2.515923566878981e-06, + "loss": 36.6037, + "step": 85 + }, + { + "ce_ib": 60.545921325683594, + "ce_orig": 1.2117584943771362, + "epoch": 0.024444604213099432, + "kl_loss": 3843.150634765625, + "loss_ib": 38.4920539855957, + "step": 85 + }, + { + "ce_ib": 58.764076232910156, + "ce_orig": 0.38705337047576904, + "epoch": 0.024444604213099432, + "kl_loss": 3380.5478515625, + "loss_ib": 33.86424255371094, + "step": 85 + }, + { + "ce_ib": 63.67119216918945, + "ce_orig": 1.6082842350006104, + "epoch": 0.024444604213099432, + "kl_loss": 3766.42333984375, + "loss_ib": 37.7279052734375, + "step": 85 + }, + { + "ce_ib": 59.66868591308594, + "ce_orig": 0.8970634341239929, + "epoch": 0.024444604213099432, + "kl_loss": 3629.2216796875, + "loss_ib": 36.35188293457031, + "step": 85 + }, + { + "ce_ib": 64.16704559326172, + "ce_orig": 2.276355743408203, + "epoch": 0.024732187792077073, + "kl_loss": 3044.044921875, + "loss_ib": 30.504615783691406, + "step": 86 + }, + { + "ce_ib": 60.19831848144531, + "ce_orig": 0.9945586919784546, + "epoch": 0.024732187792077073, + "kl_loss": 3570.64990234375, + "loss_ib": 35.76669692993164, + "step": 86 + }, + { + "ce_ib": 60.47395706176758, + "ce_orig": 0.7600452303886414, + "epoch": 0.024732187792077073, + "kl_loss": 3757.107666015625, + "loss_ib": 37.63154983520508, + "step": 86 + }, + { + "ce_ib": 61.33274841308594, + "ce_orig": 1.0650345087051392, + "epoch": 0.024732187792077073, + "kl_loss": 3494.167236328125, + "loss_ib": 35.00300598144531, + "step": 86 + }, + { + "ce_ib": 58.06923294067383, + "ce_orig": 1.0382181406021118, + "epoch": 0.025019771371054714, + "kl_loss": 3489.38623046875, + "loss_ib": 34.95193099975586, + "step": 87 + }, + { + "ce_ib": 59.217193603515625, + "ce_orig": 0.9920439720153809, + "epoch": 0.025019771371054714, + "kl_loss": 3379.11279296875, + "loss_ib": 33.850341796875, + "step": 87 + }, + { + "ce_ib": 59.13267517089844, + "ce_orig": 0.7496766448020935, + "epoch": 0.025019771371054714, + "kl_loss": 3705.8818359375, + "loss_ib": 37.117950439453125, + "step": 87 + }, + { + "ce_ib": 63.39867401123047, + "ce_orig": 1.7902837991714478, + "epoch": 0.025019771371054714, + "kl_loss": 3600.96484375, + "loss_ib": 36.07304763793945, + "step": 87 + }, + { + "ce_ib": 59.85429763793945, + "ce_orig": 0.6192349791526794, + "epoch": 0.025307354950032352, + "kl_loss": 3378.796630859375, + "loss_ib": 33.84782028198242, + "step": 88 + }, + { + "ce_ib": 59.664588928222656, + "ce_orig": 0.28315597772598267, + "epoch": 0.025307354950032352, + "kl_loss": 3035.71630859375, + "loss_ib": 30.416828155517578, + "step": 88 + }, + { + "ce_ib": 59.85287094116211, + "ce_orig": 0.8578697443008423, + "epoch": 0.025307354950032352, + "kl_loss": 3704.26123046875, + "loss_ib": 37.10246658325195, + "step": 88 + }, + { + "ce_ib": 60.02167510986328, + "ce_orig": 0.865718424320221, + "epoch": 0.025307354950032352, + "kl_loss": 3559.58740234375, + "loss_ib": 35.6558952331543, + "step": 88 + }, + { + "ce_ib": 61.92749786376953, + "ce_orig": 1.8417946100234985, + "epoch": 0.025594938529009993, + "kl_loss": 3461.548828125, + "loss_ib": 34.67741775512695, + "step": 89 + }, + { + "ce_ib": 60.532379150390625, + "ce_orig": 1.1578682661056519, + "epoch": 0.025594938529009993, + "kl_loss": 3567.969482421875, + "loss_ib": 35.74022674560547, + "step": 89 + }, + { + "ce_ib": 57.639652252197266, + "ce_orig": 0.9897644519805908, + "epoch": 0.025594938529009993, + "kl_loss": 3668.02587890625, + "loss_ib": 36.73789596557617, + "step": 89 + }, + { + "ce_ib": 60.55142593383789, + "ce_orig": 0.9929890632629395, + "epoch": 0.025594938529009993, + "kl_loss": 3560.878662109375, + "loss_ib": 35.66933822631836, + "step": 89 + }, + { + "epoch": 0.025882522107987634, + "grad_norm": 502.2541198730469, + "learning_rate": 2.67515923566879e-06, + "loss": 36.2964, + "step": 90 + }, + { + "ce_ib": 59.376216888427734, + "ce_orig": 0.7132657766342163, + "epoch": 0.025882522107987634, + "kl_loss": 3782.4814453125, + "loss_ib": 37.88418960571289, + "step": 90 + }, + { + "ce_ib": 61.76494216918945, + "ce_orig": 1.0529389381408691, + "epoch": 0.025882522107987634, + "kl_loss": 3165.3203125, + "loss_ib": 31.714967727661133, + "step": 90 + }, + { + "ce_ib": 63.20905303955078, + "ce_orig": 2.0391719341278076, + "epoch": 0.025882522107987634, + "kl_loss": 3383.056884765625, + "loss_ib": 33.893775939941406, + "step": 90 + }, + { + "ce_ib": 60.66413116455078, + "ce_orig": 1.1053894758224487, + "epoch": 0.025882522107987634, + "kl_loss": 3465.13671875, + "loss_ib": 34.712032318115234, + "step": 90 + }, + { + "ce_ib": 59.18904113769531, + "ce_orig": 0.6099984049797058, + "epoch": 0.026170105686965275, + "kl_loss": 3366.81494140625, + "loss_ib": 33.72733688354492, + "step": 91 + }, + { + "ce_ib": 60.47399139404297, + "ce_orig": 1.2133395671844482, + "epoch": 0.026170105686965275, + "kl_loss": 3215.339111328125, + "loss_ib": 32.213863372802734, + "step": 91 + }, + { + "ce_ib": 59.12387466430664, + "ce_orig": 1.0556620359420776, + "epoch": 0.026170105686965275, + "kl_loss": 3760.199462890625, + "loss_ib": 37.66111755371094, + "step": 91 + }, + { + "ce_ib": 58.41682052612305, + "ce_orig": 0.8695086240768433, + "epoch": 0.026170105686965275, + "kl_loss": 3594.7275390625, + "loss_ib": 36.00569152832031, + "step": 91 + }, + { + "ce_ib": 61.80458450317383, + "ce_orig": 1.1173126697540283, + "epoch": 0.026457689265942913, + "kl_loss": 2937.051513671875, + "loss_ib": 29.43231773376465, + "step": 92 + }, + { + "ce_ib": 58.19034957885742, + "ce_orig": 1.1345878839492798, + "epoch": 0.026457689265942913, + "kl_loss": 3721.2353515625, + "loss_ib": 37.27054214477539, + "step": 92 + }, + { + "ce_ib": 64.21983337402344, + "ce_orig": 2.1032416820526123, + "epoch": 0.026457689265942913, + "kl_loss": 3113.11279296875, + "loss_ib": 31.19534683227539, + "step": 92 + }, + { + "ce_ib": 56.81288146972656, + "ce_orig": 0.7186897993087769, + "epoch": 0.026457689265942913, + "kl_loss": 3551.26708984375, + "loss_ib": 35.56948471069336, + "step": 92 + }, + { + "ce_ib": 58.19551467895508, + "ce_orig": 0.6073794364929199, + "epoch": 0.026745272844920554, + "kl_loss": 3257.37744140625, + "loss_ib": 32.6319694519043, + "step": 93 + }, + { + "ce_ib": 59.8862190246582, + "ce_orig": 1.0717018842697144, + "epoch": 0.026745272844920554, + "kl_loss": 3663.1142578125, + "loss_ib": 36.6910285949707, + "step": 93 + }, + { + "ce_ib": 60.67866134643555, + "ce_orig": 0.8607167601585388, + "epoch": 0.026745272844920554, + "kl_loss": 3073.027099609375, + "loss_ib": 30.79094886779785, + "step": 93 + }, + { + "ce_ib": 57.884647369384766, + "ce_orig": 0.7569469809532166, + "epoch": 0.026745272844920554, + "kl_loss": 3273.467041015625, + "loss_ib": 32.79255676269531, + "step": 93 + }, + { + "ce_ib": 62.44021987915039, + "ce_orig": 1.394387125968933, + "epoch": 0.027032856423898195, + "kl_loss": 3389.59716796875, + "loss_ib": 33.958412170410156, + "step": 94 + }, + { + "ce_ib": 60.312530517578125, + "ce_orig": 1.1131266355514526, + "epoch": 0.027032856423898195, + "kl_loss": 3403.34423828125, + "loss_ib": 34.093753814697266, + "step": 94 + }, + { + "ce_ib": 58.25202941894531, + "ce_orig": 0.8151227831840515, + "epoch": 0.027032856423898195, + "kl_loss": 3855.5947265625, + "loss_ib": 38.61419677734375, + "step": 94 + }, + { + "ce_ib": 61.3564338684082, + "ce_orig": 1.1052205562591553, + "epoch": 0.027032856423898195, + "kl_loss": 3158.87841796875, + "loss_ib": 31.6501407623291, + "step": 94 + }, + { + "epoch": 0.027320440002875836, + "grad_norm": 506.2858581542969, + "learning_rate": 2.834394904458599e-06, + "loss": 35.9074, + "step": 95 + }, + { + "ce_ib": 65.28954315185547, + "ce_orig": 1.8119444847106934, + "epoch": 0.027320440002875836, + "kl_loss": 3606.654296875, + "loss_ib": 36.131832122802734, + "step": 95 + }, + { + "ce_ib": 59.008201599121094, + "ce_orig": 1.0341309309005737, + "epoch": 0.027320440002875836, + "kl_loss": 3603.617431640625, + "loss_ib": 36.09518051147461, + "step": 95 + }, + { + "ce_ib": 59.24610137939453, + "ce_orig": 0.8969424962997437, + "epoch": 0.027320440002875836, + "kl_loss": 3502.19482421875, + "loss_ib": 35.08119201660156, + "step": 95 + }, + { + "ce_ib": 60.19147491455078, + "ce_orig": 1.0157058238983154, + "epoch": 0.027320440002875836, + "kl_loss": 3443.8740234375, + "loss_ib": 34.49892807006836, + "step": 95 + }, + { + "ce_ib": 60.122161865234375, + "ce_orig": 0.915783703327179, + "epoch": 0.027608023581853477, + "kl_loss": 3757.819091796875, + "loss_ib": 37.63831329345703, + "step": 96 + }, + { + "ce_ib": 56.74940490722656, + "ce_orig": 0.5688998103141785, + "epoch": 0.027608023581853477, + "kl_loss": 3403.82177734375, + "loss_ib": 34.094966888427734, + "step": 96 + }, + { + "ce_ib": 59.47806167602539, + "ce_orig": 0.9031659364700317, + "epoch": 0.027608023581853477, + "kl_loss": 3398.11572265625, + "loss_ib": 34.04063415527344, + "step": 96 + }, + { + "ce_ib": 61.29694747924805, + "ce_orig": 1.6301769018173218, + "epoch": 0.027608023581853477, + "kl_loss": 3120.549560546875, + "loss_ib": 31.26679039001465, + "step": 96 + }, + { + "ce_ib": 57.831748962402344, + "ce_orig": 0.7541747689247131, + "epoch": 0.027895607160831115, + "kl_loss": 3017.34765625, + "loss_ib": 30.231307983398438, + "step": 97 + }, + { + "ce_ib": 58.9442253112793, + "ce_orig": 1.0282090902328491, + "epoch": 0.027895607160831115, + "kl_loss": 3543.724853515625, + "loss_ib": 35.496192932128906, + "step": 97 + }, + { + "ce_ib": 62.54707336425781, + "ce_orig": 1.1943050622940063, + "epoch": 0.027895607160831115, + "kl_loss": 3438.58935546875, + "loss_ib": 34.44844055175781, + "step": 97 + }, + { + "ce_ib": 59.76130676269531, + "ce_orig": 0.9242035150527954, + "epoch": 0.027895607160831115, + "kl_loss": 3405.859375, + "loss_ib": 34.11835479736328, + "step": 97 + }, + { + "ce_ib": 59.63943862915039, + "ce_orig": 1.080672025680542, + "epoch": 0.028183190739808756, + "kl_loss": 3514.46875, + "loss_ib": 35.20432662963867, + "step": 98 + }, + { + "ce_ib": 58.865867614746094, + "ce_orig": 0.8346519470214844, + "epoch": 0.028183190739808756, + "kl_loss": 3126.54638671875, + "loss_ib": 31.32432746887207, + "step": 98 + }, + { + "ce_ib": 59.8038330078125, + "ce_orig": 0.5681230425834656, + "epoch": 0.028183190739808756, + "kl_loss": 3689.09228515625, + "loss_ib": 36.95072555541992, + "step": 98 + }, + { + "ce_ib": 57.7287483215332, + "ce_orig": 1.3980305194854736, + "epoch": 0.028183190739808756, + "kl_loss": 3602.91552734375, + "loss_ib": 36.086883544921875, + "step": 98 + }, + { + "ce_ib": 59.70271682739258, + "ce_orig": 0.7515245676040649, + "epoch": 0.028470774318786397, + "kl_loss": 3556.43603515625, + "loss_ib": 35.624061584472656, + "step": 99 + }, + { + "ce_ib": 59.0880126953125, + "ce_orig": 1.349737524986267, + "epoch": 0.028470774318786397, + "kl_loss": 3817.392578125, + "loss_ib": 38.23301315307617, + "step": 99 + }, + { + "ce_ib": 58.57311248779297, + "ce_orig": 0.8737644553184509, + "epoch": 0.028470774318786397, + "kl_loss": 3711.00537109375, + "loss_ib": 37.16862869262695, + "step": 99 + }, + { + "ce_ib": 60.80643081665039, + "ce_orig": 1.3529119491577148, + "epoch": 0.028470774318786397, + "kl_loss": 3494.3671875, + "loss_ib": 35.004478454589844, + "step": 99 + }, + { + "epoch": 0.02875835789776404, + "grad_norm": 506.0978088378906, + "learning_rate": 2.993630573248408e-06, + "loss": 35.5561, + "step": 100 + }, + { + "ce_ib": 58.56661605834961, + "ce_orig": 1.1666901111602783, + "epoch": 0.02875835789776404, + "kl_loss": 3524.689453125, + "loss_ib": 35.30546188354492, + "step": 100 + }, + { + "ce_ib": 62.831783294677734, + "ce_orig": 2.2819557189941406, + "epoch": 0.02875835789776404, + "kl_loss": 3127.18798828125, + "loss_ib": 31.3347110748291, + "step": 100 + }, + { + "ce_ib": 58.19925308227539, + "ce_orig": 0.8314814567565918, + "epoch": 0.02875835789776404, + "kl_loss": 3376.9296875, + "loss_ib": 33.82749557495117, + "step": 100 + }, + { + "ce_ib": 57.46586608886719, + "ce_orig": 0.7712212800979614, + "epoch": 0.02875835789776404, + "kl_loss": 3693.83544921875, + "loss_ib": 36.995819091796875, + "step": 100 + }, + { + "ce_ib": 63.20458221435547, + "ce_orig": 2.0820841789245605, + "epoch": 0.02904594147674168, + "kl_loss": 3015.462890625, + "loss_ib": 30.21783447265625, + "step": 101 + }, + { + "ce_ib": 58.481056213378906, + "ce_orig": 0.621671736240387, + "epoch": 0.02904594147674168, + "kl_loss": 3436.01416015625, + "loss_ib": 34.41862106323242, + "step": 101 + }, + { + "ce_ib": 63.86113357543945, + "ce_orig": 1.306851863861084, + "epoch": 0.02904594147674168, + "kl_loss": 3503.22216796875, + "loss_ib": 35.09608459472656, + "step": 101 + }, + { + "ce_ib": 58.2662239074707, + "ce_orig": 0.7199594974517822, + "epoch": 0.02904594147674168, + "kl_loss": 3752.4775390625, + "loss_ib": 37.58304214477539, + "step": 101 + }, + { + "ce_ib": 57.16976547241211, + "ce_orig": 0.7961263656616211, + "epoch": 0.029333525055719317, + "kl_loss": 3487.98388671875, + "loss_ib": 34.937007904052734, + "step": 102 + }, + { + "ce_ib": 59.150516510009766, + "ce_orig": 1.1825999021530151, + "epoch": 0.029333525055719317, + "kl_loss": 3089.3515625, + "loss_ib": 30.952665328979492, + "step": 102 + }, + { + "ce_ib": 58.3362922668457, + "ce_orig": 1.7745475769042969, + "epoch": 0.029333525055719317, + "kl_loss": 3348.009033203125, + "loss_ib": 33.53842544555664, + "step": 102 + }, + { + "ce_ib": 57.58949661254883, + "ce_orig": 1.5423011779785156, + "epoch": 0.029333525055719317, + "kl_loss": 3305.6611328125, + "loss_ib": 33.114200592041016, + "step": 102 + }, + { + "ce_ib": 61.6888427734375, + "ce_orig": 1.5072656869888306, + "epoch": 0.02962110863469696, + "kl_loss": 3463.50439453125, + "loss_ib": 34.69673156738281, + "step": 103 + }, + { + "ce_ib": 60.02954864501953, + "ce_orig": 1.457014560699463, + "epoch": 0.02962110863469696, + "kl_loss": 3395.760986328125, + "loss_ib": 34.01763916015625, + "step": 103 + }, + { + "ce_ib": 59.91975021362305, + "ce_orig": 1.4898319244384766, + "epoch": 0.02962110863469696, + "kl_loss": 3560.35009765625, + "loss_ib": 35.66341781616211, + "step": 103 + }, + { + "ce_ib": 61.39513397216797, + "ce_orig": 1.2825353145599365, + "epoch": 0.02962110863469696, + "kl_loss": 3457.565673828125, + "loss_ib": 34.63705062866211, + "step": 103 + }, + { + "ce_ib": 58.27676773071289, + "ce_orig": 0.7831246256828308, + "epoch": 0.0299086922136746, + "kl_loss": 3082.635498046875, + "loss_ib": 30.88463020324707, + "step": 104 + }, + { + "ce_ib": 59.33658218383789, + "ce_orig": 1.2523558139801025, + "epoch": 0.0299086922136746, + "kl_loss": 3511.50048828125, + "loss_ib": 35.174339294433594, + "step": 104 + }, + { + "ce_ib": 56.923065185546875, + "ce_orig": 0.8949446678161621, + "epoch": 0.0299086922136746, + "kl_loss": 3366.443603515625, + "loss_ib": 33.72135925292969, + "step": 104 + }, + { + "ce_ib": 56.37105178833008, + "ce_orig": 0.2978222370147705, + "epoch": 0.0299086922136746, + "kl_loss": 2516.5166015625, + "loss_ib": 25.22153663635254, + "step": 104 + }, + { + "epoch": 0.03019627579265224, + "grad_norm": 481.3152770996094, + "learning_rate": 3.1528662420382165e-06, + "loss": 34.5437, + "step": 105 + }, + { + "ce_ib": 58.88717269897461, + "ce_orig": 1.2730207443237305, + "epoch": 0.03019627579265224, + "kl_loss": 3434.847412109375, + "loss_ib": 34.4073600769043, + "step": 105 + }, + { + "ce_ib": 61.64189147949219, + "ce_orig": 1.5729440450668335, + "epoch": 0.03019627579265224, + "kl_loss": 3480.771484375, + "loss_ib": 34.86935806274414, + "step": 105 + }, + { + "ce_ib": 58.20305633544922, + "ce_orig": 0.7971786856651306, + "epoch": 0.03019627579265224, + "kl_loss": 3075.66064453125, + "loss_ib": 30.814809799194336, + "step": 105 + }, + { + "ce_ib": 60.825565338134766, + "ce_orig": 1.2366398572921753, + "epoch": 0.03019627579265224, + "kl_loss": 3418.934326171875, + "loss_ib": 34.25016784667969, + "step": 105 + }, + { + "ce_ib": 58.60723114013672, + "ce_orig": 0.7808988690376282, + "epoch": 0.030483859371629878, + "kl_loss": 3320.3857421875, + "loss_ib": 33.26246643066406, + "step": 106 + }, + { + "ce_ib": 57.563995361328125, + "ce_orig": 0.8758856058120728, + "epoch": 0.030483859371629878, + "kl_loss": 3181.57958984375, + "loss_ib": 31.87335777282715, + "step": 106 + }, + { + "ce_ib": 55.23259353637695, + "ce_orig": 0.22707822918891907, + "epoch": 0.030483859371629878, + "kl_loss": 2120.76416015625, + "loss_ib": 21.26287269592285, + "step": 106 + }, + { + "ce_ib": 54.9997673034668, + "ce_orig": 0.2669578790664673, + "epoch": 0.030483859371629878, + "kl_loss": 2237.90966796875, + "loss_ib": 22.43409538269043, + "step": 106 + }, + { + "ce_ib": 58.98134994506836, + "ce_orig": 1.2273164987564087, + "epoch": 0.03077144295060752, + "kl_loss": 1990.204833984375, + "loss_ib": 19.961029052734375, + "step": 107 + }, + { + "ce_ib": 59.87055587768555, + "ce_orig": 1.3858299255371094, + "epoch": 0.03077144295060752, + "kl_loss": 3156.53173828125, + "loss_ib": 31.625186920166016, + "step": 107 + }, + { + "ce_ib": 55.48119354248047, + "ce_orig": 0.8153777122497559, + "epoch": 0.03077144295060752, + "kl_loss": 3736.759765625, + "loss_ib": 37.42307662963867, + "step": 107 + }, + { + "ce_ib": 58.34563446044922, + "ce_orig": 0.9158485531806946, + "epoch": 0.03077144295060752, + "kl_loss": 3471.65380859375, + "loss_ib": 34.77488327026367, + "step": 107 + }, + { + "ce_ib": 58.289031982421875, + "ce_orig": 1.0781446695327759, + "epoch": 0.03105902652958516, + "kl_loss": 3476.87841796875, + "loss_ib": 34.82707214355469, + "step": 108 + }, + { + "ce_ib": 56.98807144165039, + "ce_orig": 0.7857345938682556, + "epoch": 0.03105902652958516, + "kl_loss": 3440.1845703125, + "loss_ib": 34.458831787109375, + "step": 108 + }, + { + "ce_ib": 58.76127624511719, + "ce_orig": 0.7976669073104858, + "epoch": 0.03105902652958516, + "kl_loss": 3203.509765625, + "loss_ib": 32.0938606262207, + "step": 108 + }, + { + "ce_ib": 60.71581268310547, + "ce_orig": 1.6471302509307861, + "epoch": 0.03105902652958516, + "kl_loss": 3262.56396484375, + "loss_ib": 32.68635559082031, + "step": 108 + }, + { + "ce_ib": 58.29198455810547, + "ce_orig": 0.6282819509506226, + "epoch": 0.0313466101085628, + "kl_loss": 3269.9423828125, + "loss_ib": 32.757713317871094, + "step": 109 + }, + { + "ce_ib": 58.99745559692383, + "ce_orig": 1.346901297569275, + "epoch": 0.0313466101085628, + "kl_loss": 3163.72998046875, + "loss_ib": 31.6962947845459, + "step": 109 + }, + { + "ce_ib": 60.90428924560547, + "ce_orig": 1.9273408651351929, + "epoch": 0.0313466101085628, + "kl_loss": 3333.64697265625, + "loss_ib": 33.39737319946289, + "step": 109 + }, + { + "ce_ib": 56.191925048828125, + "ce_orig": 0.5414481163024902, + "epoch": 0.0313466101085628, + "kl_loss": 2611.040283203125, + "loss_ib": 26.166593551635742, + "step": 109 + }, + { + "epoch": 0.03163419368754044, + "grad_norm": 497.0209655761719, + "learning_rate": 3.3121019108280255e-06, + "loss": 34.4129, + "step": 110 + }, + { + "ce_ib": 59.72053909301758, + "ce_orig": 0.9706757664680481, + "epoch": 0.03163419368754044, + "kl_loss": 3381.095947265625, + "loss_ib": 33.87068176269531, + "step": 110 + }, + { + "ce_ib": 57.03586196899414, + "ce_orig": 0.7681443691253662, + "epoch": 0.03163419368754044, + "kl_loss": 3697.955322265625, + "loss_ib": 37.03658676147461, + "step": 110 + }, + { + "ce_ib": 56.56296157836914, + "ce_orig": 1.0412012338638306, + "epoch": 0.03163419368754044, + "kl_loss": 3397.827880859375, + "loss_ib": 34.03483963012695, + "step": 110 + }, + { + "ce_ib": 58.891090393066406, + "ce_orig": 1.897376537322998, + "epoch": 0.03163419368754044, + "kl_loss": 3184.9423828125, + "loss_ib": 31.908315658569336, + "step": 110 + }, + { + "ce_ib": 56.234859466552734, + "ce_orig": 0.5728248953819275, + "epoch": 0.031921777266518084, + "kl_loss": 3383.22705078125, + "loss_ib": 33.88850402832031, + "step": 111 + }, + { + "ce_ib": 57.63355255126953, + "ce_orig": 1.1094613075256348, + "epoch": 0.031921777266518084, + "kl_loss": 3412.992431640625, + "loss_ib": 34.187557220458984, + "step": 111 + }, + { + "ce_ib": 60.40391540527344, + "ce_orig": 1.840659737586975, + "epoch": 0.031921777266518084, + "kl_loss": 3138.0, + "loss_ib": 31.44040298461914, + "step": 111 + }, + { + "ce_ib": 59.63862609863281, + "ce_orig": 1.0923250913619995, + "epoch": 0.031921777266518084, + "kl_loss": 3231.64892578125, + "loss_ib": 32.37612533569336, + "step": 111 + }, + { + "ce_ib": 57.537532806396484, + "ce_orig": 1.1983774900436401, + "epoch": 0.03220936084549572, + "kl_loss": 3396.81494140625, + "loss_ib": 34.02568435668945, + "step": 112 + }, + { + "ce_ib": 58.707515716552734, + "ce_orig": 0.9369058012962341, + "epoch": 0.03220936084549572, + "kl_loss": 3233.78564453125, + "loss_ib": 32.39656448364258, + "step": 112 + }, + { + "ce_ib": 58.834808349609375, + "ce_orig": 1.592905044555664, + "epoch": 0.03220936084549572, + "kl_loss": 3009.6337890625, + "loss_ib": 30.15517234802246, + "step": 112 + }, + { + "ce_ib": 56.853721618652344, + "ce_orig": 1.277111291885376, + "epoch": 0.03220936084549572, + "kl_loss": 3313.233642578125, + "loss_ib": 33.18918991088867, + "step": 112 + }, + { + "ce_ib": 57.65592575073242, + "ce_orig": 1.234552025794983, + "epoch": 0.032496944424473366, + "kl_loss": 3572.73828125, + "loss_ib": 35.785037994384766, + "step": 113 + }, + { + "ce_ib": 57.79000473022461, + "ce_orig": 0.7371742725372314, + "epoch": 0.032496944424473366, + "kl_loss": 3456.331787109375, + "loss_ib": 34.62110900878906, + "step": 113 + }, + { + "ce_ib": 56.60825729370117, + "ce_orig": 0.4902428388595581, + "epoch": 0.032496944424473366, + "kl_loss": 3120.79296875, + "loss_ib": 31.264537811279297, + "step": 113 + }, + { + "ce_ib": 56.7935905456543, + "ce_orig": 1.1464534997940063, + "epoch": 0.032496944424473366, + "kl_loss": 3540.481201171875, + "loss_ib": 35.461605072021484, + "step": 113 + }, + { + "ce_ib": 57.10232925415039, + "ce_orig": 0.8498454093933105, + "epoch": 0.032784528003451004, + "kl_loss": 3277.51904296875, + "loss_ib": 32.83229064941406, + "step": 114 + }, + { + "ce_ib": 55.13420486450195, + "ce_orig": 0.6758685111999512, + "epoch": 0.032784528003451004, + "kl_loss": 3604.8583984375, + "loss_ib": 36.10371780395508, + "step": 114 + }, + { + "ce_ib": 56.1276741027832, + "ce_orig": 0.5766817331314087, + "epoch": 0.032784528003451004, + "kl_loss": 3085.36865234375, + "loss_ib": 30.909812927246094, + "step": 114 + }, + { + "ce_ib": 57.931053161621094, + "ce_orig": 1.4995805025100708, + "epoch": 0.032784528003451004, + "kl_loss": 3098.906005859375, + "loss_ib": 31.0469913482666, + "step": 114 + }, + { + "epoch": 0.03307211158242864, + "grad_norm": 491.3838806152344, + "learning_rate": 3.4713375796178345e-06, + "loss": 34.0102, + "step": 115 + }, + { + "ce_ib": 59.15627670288086, + "ce_orig": 1.2845157384872437, + "epoch": 0.03307211158242864, + "kl_loss": 3372.32470703125, + "loss_ib": 33.78240203857422, + "step": 115 + }, + { + "ce_ib": 53.61143112182617, + "ce_orig": 0.45459315180778503, + "epoch": 0.03307211158242864, + "kl_loss": 1565.43212890625, + "loss_ib": 15.707931518554688, + "step": 115 + }, + { + "ce_ib": 57.589969635009766, + "ce_orig": 0.9765967130661011, + "epoch": 0.03307211158242864, + "kl_loss": 3676.704345703125, + "loss_ib": 36.82463455200195, + "step": 115 + }, + { + "ce_ib": 60.20054244995117, + "ce_orig": 1.7167049646377563, + "epoch": 0.03307211158242864, + "kl_loss": 3165.83837890625, + "loss_ib": 31.718584060668945, + "step": 115 + }, + { + "ce_ib": 53.816566467285156, + "ce_orig": 0.67730712890625, + "epoch": 0.033359695161406286, + "kl_loss": 3459.393310546875, + "loss_ib": 34.64774703979492, + "step": 116 + }, + { + "ce_ib": 54.58203887939453, + "ce_orig": 0.911034107208252, + "epoch": 0.033359695161406286, + "kl_loss": 3594.5859375, + "loss_ib": 36.00044250488281, + "step": 116 + }, + { + "ce_ib": 57.87051773071289, + "ce_orig": 1.118709921836853, + "epoch": 0.033359695161406286, + "kl_loss": 3187.24072265625, + "loss_ib": 31.93027687072754, + "step": 116 + }, + { + "ce_ib": 56.325897216796875, + "ce_orig": 0.9163821935653687, + "epoch": 0.033359695161406286, + "kl_loss": 3129.631591796875, + "loss_ib": 31.35264015197754, + "step": 116 + }, + { + "ce_ib": 54.92118835449219, + "ce_orig": 0.38672956824302673, + "epoch": 0.033647278740383924, + "kl_loss": 3076.05810546875, + "loss_ib": 30.815502166748047, + "step": 117 + }, + { + "ce_ib": 54.690006256103516, + "ce_orig": 0.8246923685073853, + "epoch": 0.033647278740383924, + "kl_loss": 3273.61962890625, + "loss_ib": 32.79088592529297, + "step": 117 + }, + { + "ce_ib": 54.07289505004883, + "ce_orig": 0.7164103388786316, + "epoch": 0.033647278740383924, + "kl_loss": 3623.53173828125, + "loss_ib": 36.289390563964844, + "step": 117 + }, + { + "ce_ib": 60.675941467285156, + "ce_orig": 1.377237319946289, + "epoch": 0.033647278740383924, + "kl_loss": 3372.191650390625, + "loss_ib": 33.7825927734375, + "step": 117 + }, + { + "ce_ib": 56.7794189453125, + "ce_orig": 0.9724603295326233, + "epoch": 0.03393486231936156, + "kl_loss": 3288.50390625, + "loss_ib": 32.94181823730469, + "step": 118 + }, + { + "ce_ib": 54.519351959228516, + "ce_orig": 0.5410425662994385, + "epoch": 0.03393486231936156, + "kl_loss": 3287.3623046875, + "loss_ib": 32.92814254760742, + "step": 118 + }, + { + "ce_ib": 54.34451675415039, + "ce_orig": 0.8038122653961182, + "epoch": 0.03393486231936156, + "kl_loss": 3050.375, + "loss_ib": 30.558095932006836, + "step": 118 + }, + { + "ce_ib": 55.54384994506836, + "ce_orig": 0.8650107979774475, + "epoch": 0.03393486231936156, + "kl_loss": 3366.908447265625, + "loss_ib": 33.72462844848633, + "step": 118 + }, + { + "ce_ib": 57.188228607177734, + "ce_orig": 0.9910153746604919, + "epoch": 0.034222445898339206, + "kl_loss": 3439.970458984375, + "loss_ib": 34.45689010620117, + "step": 119 + }, + { + "ce_ib": 56.907711029052734, + "ce_orig": 0.7115373611450195, + "epoch": 0.034222445898339206, + "kl_loss": 3301.9189453125, + "loss_ib": 33.07609558105469, + "step": 119 + }, + { + "ce_ib": 57.2123908996582, + "ce_orig": 1.0718692541122437, + "epoch": 0.034222445898339206, + "kl_loss": 3499.33203125, + "loss_ib": 35.050533294677734, + "step": 119 + }, + { + "ce_ib": 53.44661331176758, + "ce_orig": 0.24829663336277008, + "epoch": 0.034222445898339206, + "kl_loss": 2903.864013671875, + "loss_ib": 29.092086791992188, + "step": 119 + }, + { + "epoch": 0.03451002947731684, + "grad_norm": 488.701416015625, + "learning_rate": 3.6305732484076435e-06, + "loss": 33.466, + "step": 120 + }, + { + "ce_ib": 58.145294189453125, + "ce_orig": 1.4251874685287476, + "epoch": 0.03451002947731684, + "kl_loss": 3136.32958984375, + "loss_ib": 31.42144012451172, + "step": 120 + }, + { + "ce_ib": 56.203006744384766, + "ce_orig": 0.8692367076873779, + "epoch": 0.03451002947731684, + "kl_loss": 3459.39892578125, + "loss_ib": 34.65019226074219, + "step": 120 + }, + { + "ce_ib": 53.20535659790039, + "ce_orig": 0.23018254339694977, + "epoch": 0.03451002947731684, + "kl_loss": 1242.847900390625, + "loss_ib": 12.481684684753418, + "step": 120 + }, + { + "ce_ib": 56.525516510009766, + "ce_orig": 1.0064990520477295, + "epoch": 0.03451002947731684, + "kl_loss": 3232.018310546875, + "loss_ib": 32.376708984375, + "step": 120 + }, + { + "ce_ib": 54.402645111083984, + "ce_orig": 0.8779659867286682, + "epoch": 0.03479761305629449, + "kl_loss": 3443.0625, + "loss_ib": 34.48502731323242, + "step": 121 + }, + { + "ce_ib": 57.10426330566406, + "ce_orig": 1.2402876615524292, + "epoch": 0.03479761305629449, + "kl_loss": 3028.55224609375, + "loss_ib": 30.342626571655273, + "step": 121 + }, + { + "ce_ib": 58.463226318359375, + "ce_orig": 1.786939263343811, + "epoch": 0.03479761305629449, + "kl_loss": 3314.563232421875, + "loss_ib": 33.204097747802734, + "step": 121 + }, + { + "ce_ib": 55.72274398803711, + "ce_orig": 0.8808313608169556, + "epoch": 0.03479761305629449, + "kl_loss": 3207.419677734375, + "loss_ib": 32.12991714477539, + "step": 121 + }, + { + "ce_ib": 54.48756408691406, + "ce_orig": 0.7947918772697449, + "epoch": 0.035085196635272126, + "kl_loss": 3365.093505859375, + "loss_ib": 33.705421447753906, + "step": 122 + }, + { + "ce_ib": 55.213951110839844, + "ce_orig": 0.8712708353996277, + "epoch": 0.035085196635272126, + "kl_loss": 3264.61181640625, + "loss_ib": 32.701332092285156, + "step": 122 + }, + { + "ce_ib": 56.54671096801758, + "ce_orig": 0.3289700150489807, + "epoch": 0.035085196635272126, + "kl_loss": 2472.1728515625, + "loss_ib": 24.778276443481445, + "step": 122 + }, + { + "ce_ib": 58.04352951049805, + "ce_orig": 1.5535142421722412, + "epoch": 0.035085196635272126, + "kl_loss": 2923.5888671875, + "loss_ib": 29.29393196105957, + "step": 122 + }, + { + "ce_ib": 55.66524124145508, + "ce_orig": 0.7867345809936523, + "epoch": 0.03537278021424976, + "kl_loss": 3238.93212890625, + "loss_ib": 32.444984436035156, + "step": 123 + }, + { + "ce_ib": 54.70219039916992, + "ce_orig": 0.9664096832275391, + "epoch": 0.03537278021424976, + "kl_loss": 3296.1142578125, + "loss_ib": 33.01584243774414, + "step": 123 + }, + { + "ce_ib": 56.01795196533203, + "ce_orig": 0.7100675702095032, + "epoch": 0.03537278021424976, + "kl_loss": 2146.758544921875, + "loss_ib": 21.523603439331055, + "step": 123 + }, + { + "ce_ib": 56.77366256713867, + "ce_orig": 1.1051965951919556, + "epoch": 0.03537278021424976, + "kl_loss": 3093.79345703125, + "loss_ib": 30.994707107543945, + "step": 123 + }, + { + "ce_ib": 57.009521484375, + "ce_orig": 0.7903485894203186, + "epoch": 0.03566036379322741, + "kl_loss": 3563.074462890625, + "loss_ib": 35.68775177001953, + "step": 124 + }, + { + "ce_ib": 55.95924758911133, + "ce_orig": 0.9127640724182129, + "epoch": 0.03566036379322741, + "kl_loss": 3396.574951171875, + "loss_ib": 34.02170944213867, + "step": 124 + }, + { + "ce_ib": 56.98552703857422, + "ce_orig": 1.0692386627197266, + "epoch": 0.03566036379322741, + "kl_loss": 3032.76025390625, + "loss_ib": 30.384586334228516, + "step": 124 + }, + { + "ce_ib": 54.63241958618164, + "ce_orig": 1.1963261365890503, + "epoch": 0.03566036379322741, + "kl_loss": 3243.490966796875, + "loss_ib": 32.489540100097656, + "step": 124 + }, + { + "epoch": 0.035947947372205045, + "grad_norm": 484.4737854003906, + "learning_rate": 3.789808917197453e-06, + "loss": 32.9421, + "step": 125 + }, + { + "ce_ib": 53.653743743896484, + "ce_orig": 0.7380173802375793, + "epoch": 0.035947947372205045, + "kl_loss": 3123.354736328125, + "loss_ib": 31.287200927734375, + "step": 125 + }, + { + "ce_ib": 59.990928649902344, + "ce_orig": 1.5150277614593506, + "epoch": 0.035947947372205045, + "kl_loss": 3297.04443359375, + "loss_ib": 33.030433654785156, + "step": 125 + }, + { + "ce_ib": 57.65206527709961, + "ce_orig": 0.8391215801239014, + "epoch": 0.035947947372205045, + "kl_loss": 3000.66748046875, + "loss_ib": 30.0643253326416, + "step": 125 + }, + { + "ce_ib": 56.54597854614258, + "ce_orig": 1.0180349349975586, + "epoch": 0.035947947372205045, + "kl_loss": 3431.23974609375, + "loss_ib": 34.36894226074219, + "step": 125 + }, + { + "ce_ib": 54.98883819580078, + "ce_orig": 0.7524531483650208, + "epoch": 0.03623553095118269, + "kl_loss": 3311.05419921875, + "loss_ib": 33.16552734375, + "step": 126 + }, + { + "ce_ib": 54.45235061645508, + "ce_orig": 1.1706984043121338, + "epoch": 0.03623553095118269, + "kl_loss": 2987.413818359375, + "loss_ib": 29.928590774536133, + "step": 126 + }, + { + "ce_ib": 56.98481369018555, + "ce_orig": 1.6676733493804932, + "epoch": 0.03623553095118269, + "kl_loss": 2663.32080078125, + "loss_ib": 26.69019317626953, + "step": 126 + }, + { + "ce_ib": 55.758174896240234, + "ce_orig": 1.054375410079956, + "epoch": 0.03623553095118269, + "kl_loss": 3309.024658203125, + "loss_ib": 33.14600372314453, + "step": 126 + }, + { + "ce_ib": 56.77848815917969, + "ce_orig": 1.3328591585159302, + "epoch": 0.03652311453016033, + "kl_loss": 3304.2041015625, + "loss_ib": 33.09881591796875, + "step": 127 + }, + { + "ce_ib": 57.968807220458984, + "ce_orig": 1.372467041015625, + "epoch": 0.03652311453016033, + "kl_loss": 3255.86376953125, + "loss_ib": 32.61660385131836, + "step": 127 + }, + { + "ce_ib": 53.87700653076172, + "ce_orig": 0.602317214012146, + "epoch": 0.03652311453016033, + "kl_loss": 2769.507080078125, + "loss_ib": 27.748947143554688, + "step": 127 + }, + { + "ce_ib": 57.150821685791016, + "ce_orig": 1.5336121320724487, + "epoch": 0.03652311453016033, + "kl_loss": 3108.03857421875, + "loss_ib": 31.137535095214844, + "step": 127 + }, + { + "ce_ib": 55.34831619262695, + "ce_orig": 1.2944082021713257, + "epoch": 0.036810698109137965, + "kl_loss": 3049.82568359375, + "loss_ib": 30.553606033325195, + "step": 128 + }, + { + "ce_ib": 55.08821105957031, + "ce_orig": 1.2756754159927368, + "epoch": 0.036810698109137965, + "kl_loss": 3294.335205078125, + "loss_ib": 32.99843978881836, + "step": 128 + }, + { + "ce_ib": 56.542110443115234, + "ce_orig": 1.3169299364089966, + "epoch": 0.036810698109137965, + "kl_loss": 3289.144287109375, + "loss_ib": 32.9479866027832, + "step": 128 + }, + { + "ce_ib": 54.76836395263672, + "ce_orig": 0.9867354035377502, + "epoch": 0.036810698109137965, + "kl_loss": 3241.2333984375, + "loss_ib": 32.46710205078125, + "step": 128 + }, + { + "ce_ib": 54.50560760498047, + "ce_orig": 0.9337356090545654, + "epoch": 0.03709828168811561, + "kl_loss": 3071.6708984375, + "loss_ib": 30.77121353149414, + "step": 129 + }, + { + "ce_ib": 56.29538345336914, + "ce_orig": 1.379828691482544, + "epoch": 0.03709828168811561, + "kl_loss": 2586.766845703125, + "loss_ib": 25.92396354675293, + "step": 129 + }, + { + "ce_ib": 54.395912170410156, + "ce_orig": 0.26942014694213867, + "epoch": 0.03709828168811561, + "kl_loss": 3214.80029296875, + "loss_ib": 32.20240020751953, + "step": 129 + }, + { + "ce_ib": 53.4327392578125, + "ce_orig": 1.03145432472229, + "epoch": 0.03709828168811561, + "kl_loss": 3447.09765625, + "loss_ib": 34.524410247802734, + "step": 129 + }, + { + "epoch": 0.03738586526709325, + "grad_norm": 485.5062255859375, + "learning_rate": 3.949044585987262e-06, + "loss": 32.6015, + "step": 130 + }, + { + "ce_ib": 57.03399658203125, + "ce_orig": 1.2322174310684204, + "epoch": 0.03738586526709325, + "kl_loss": 3191.55419921875, + "loss_ib": 31.97257423400879, + "step": 130 + }, + { + "ce_ib": 52.21621322631836, + "ce_orig": 0.743526816368103, + "epoch": 0.03738586526709325, + "kl_loss": 3611.2763671875, + "loss_ib": 36.16497802734375, + "step": 130 + }, + { + "ce_ib": 58.345272064208984, + "ce_orig": 1.358251929283142, + "epoch": 0.03738586526709325, + "kl_loss": 3283.64990234375, + "loss_ib": 32.89484405517578, + "step": 130 + }, + { + "ce_ib": 56.030879974365234, + "ce_orig": 0.6732674241065979, + "epoch": 0.03738586526709325, + "kl_loss": 3234.9384765625, + "loss_ib": 32.40541458129883, + "step": 130 + }, + { + "ce_ib": 57.00092697143555, + "ce_orig": 0.9541739225387573, + "epoch": 0.03767344884607089, + "kl_loss": 3000.126708984375, + "loss_ib": 30.05826759338379, + "step": 131 + }, + { + "ce_ib": 56.356407165527344, + "ce_orig": 1.652900218963623, + "epoch": 0.03767344884607089, + "kl_loss": 2972.48486328125, + "loss_ib": 29.781206130981445, + "step": 131 + }, + { + "ce_ib": 56.224830627441406, + "ce_orig": 1.5765421390533447, + "epoch": 0.03767344884607089, + "kl_loss": 3185.729248046875, + "loss_ib": 31.913516998291016, + "step": 131 + }, + { + "ce_ib": 54.61565017700195, + "ce_orig": 0.7351348996162415, + "epoch": 0.03767344884607089, + "kl_loss": 3193.2255859375, + "loss_ib": 31.98687171936035, + "step": 131 + }, + { + "ce_ib": 54.55472183227539, + "ce_orig": 1.086585283279419, + "epoch": 0.03796103242504853, + "kl_loss": 3093.6298828125, + "loss_ib": 30.990854263305664, + "step": 132 + }, + { + "ce_ib": 52.542877197265625, + "ce_orig": 1.0202161073684692, + "epoch": 0.03796103242504853, + "kl_loss": 3178.2353515625, + "loss_ib": 31.834896087646484, + "step": 132 + }, + { + "ce_ib": 58.19728088378906, + "ce_orig": 1.5428324937820435, + "epoch": 0.03796103242504853, + "kl_loss": 3120.078857421875, + "loss_ib": 31.25898551940918, + "step": 132 + }, + { + "ce_ib": 55.14646911621094, + "ce_orig": 1.1489073038101196, + "epoch": 0.03796103242504853, + "kl_loss": 3333.323974609375, + "loss_ib": 33.38838577270508, + "step": 132 + }, + { + "ce_ib": 58.42383575439453, + "ce_orig": 1.3307501077651978, + "epoch": 0.03824861600402617, + "kl_loss": 3119.617919921875, + "loss_ib": 31.254600524902344, + "step": 133 + }, + { + "ce_ib": 53.10658645629883, + "ce_orig": 0.47057151794433594, + "epoch": 0.03824861600402617, + "kl_loss": 2581.533203125, + "loss_ib": 25.868436813354492, + "step": 133 + }, + { + "ce_ib": 55.19010543823242, + "ce_orig": 1.461709976196289, + "epoch": 0.03824861600402617, + "kl_loss": 3218.223876953125, + "loss_ib": 32.2374267578125, + "step": 133 + }, + { + "ce_ib": 55.269439697265625, + "ce_orig": 0.9877228140830994, + "epoch": 0.03824861600402617, + "kl_loss": 3257.63916015625, + "loss_ib": 32.63166046142578, + "step": 133 + }, + { + "ce_ib": 54.13009262084961, + "ce_orig": 1.1180518865585327, + "epoch": 0.03853619958300381, + "kl_loss": 3161.342529296875, + "loss_ib": 31.66755485534668, + "step": 134 + }, + { + "ce_ib": 51.55915069580078, + "ce_orig": 0.476241797208786, + "epoch": 0.03853619958300381, + "kl_loss": 3272.551025390625, + "loss_ib": 32.777069091796875, + "step": 134 + }, + { + "ce_ib": 51.68458557128906, + "ce_orig": 0.78632652759552, + "epoch": 0.03853619958300381, + "kl_loss": 3286.03662109375, + "loss_ib": 32.91204833984375, + "step": 134 + }, + { + "ce_ib": 52.53611373901367, + "ce_orig": 0.601127564907074, + "epoch": 0.03853619958300381, + "kl_loss": 3292.48193359375, + "loss_ib": 32.97735595703125, + "step": 134 + }, + { + "epoch": 0.03882378316198145, + "grad_norm": 484.1724853515625, + "learning_rate": 4.10828025477707e-06, + "loss": 32.4331, + "step": 135 + }, + { + "ce_ib": 52.33879089355469, + "ce_orig": 0.9939437508583069, + "epoch": 0.03882378316198145, + "kl_loss": 3350.6103515625, + "loss_ib": 33.558441162109375, + "step": 135 + }, + { + "ce_ib": 57.75893783569336, + "ce_orig": 1.900199055671692, + "epoch": 0.03882378316198145, + "kl_loss": 3010.15771484375, + "loss_ib": 30.15933609008789, + "step": 135 + }, + { + "ce_ib": 52.984676361083984, + "ce_orig": 1.0698907375335693, + "epoch": 0.03882378316198145, + "kl_loss": 3073.521484375, + "loss_ib": 30.78820037841797, + "step": 135 + }, + { + "ce_ib": 52.373348236083984, + "ce_orig": 0.7963224649429321, + "epoch": 0.03882378316198145, + "kl_loss": 2985.98486328125, + "loss_ib": 29.912221908569336, + "step": 135 + }, + { + "ce_ib": 52.30683517456055, + "ce_orig": 0.7155612707138062, + "epoch": 0.039111366740959094, + "kl_loss": 3103.802978515625, + "loss_ib": 31.090335845947266, + "step": 136 + }, + { + "ce_ib": 53.33190155029297, + "ce_orig": 0.7446028590202332, + "epoch": 0.039111366740959094, + "kl_loss": 3185.09228515625, + "loss_ib": 31.904254913330078, + "step": 136 + }, + { + "ce_ib": 52.83795928955078, + "ce_orig": 0.77790367603302, + "epoch": 0.039111366740959094, + "kl_loss": 2887.193359375, + "loss_ib": 28.92477035522461, + "step": 136 + }, + { + "ce_ib": 54.76385498046875, + "ce_orig": 0.3635737895965576, + "epoch": 0.039111366740959094, + "kl_loss": 1756.336669921875, + "loss_ib": 17.618131637573242, + "step": 136 + }, + { + "ce_ib": 55.12189865112305, + "ce_orig": 1.2548986673355103, + "epoch": 0.03939895031993673, + "kl_loss": 3142.7470703125, + "loss_ib": 31.48259162902832, + "step": 137 + }, + { + "ce_ib": 52.87950897216797, + "ce_orig": 0.8208221793174744, + "epoch": 0.03939895031993673, + "kl_loss": 3317.441650390625, + "loss_ib": 33.227294921875, + "step": 137 + }, + { + "ce_ib": 53.339599609375, + "ce_orig": 0.7617712020874023, + "epoch": 0.03939895031993673, + "kl_loss": 3318.7333984375, + "loss_ib": 33.24067306518555, + "step": 137 + }, + { + "ce_ib": 52.08175277709961, + "ce_orig": 0.8632348775863647, + "epoch": 0.03939895031993673, + "kl_loss": 2449.124267578125, + "loss_ib": 24.543325424194336, + "step": 137 + }, + { + "ce_ib": 54.37158203125, + "ce_orig": 1.257277250289917, + "epoch": 0.03968653389891437, + "kl_loss": 3025.6142578125, + "loss_ib": 30.310514450073242, + "step": 138 + }, + { + "ce_ib": 54.39937210083008, + "ce_orig": 0.8799593448638916, + "epoch": 0.03968653389891437, + "kl_loss": 3261.203857421875, + "loss_ib": 32.666439056396484, + "step": 138 + }, + { + "ce_ib": 53.31037902832031, + "ce_orig": 0.5872792601585388, + "epoch": 0.03968653389891437, + "kl_loss": 2659.81982421875, + "loss_ib": 26.651508331298828, + "step": 138 + }, + { + "ce_ib": 52.60165786743164, + "ce_orig": 0.9464865922927856, + "epoch": 0.03968653389891437, + "kl_loss": 3371.40185546875, + "loss_ib": 33.76662063598633, + "step": 138 + }, + { + "ce_ib": 52.575260162353516, + "ce_orig": 0.46206337213516235, + "epoch": 0.039974117477892014, + "kl_loss": 2850.687255859375, + "loss_ib": 28.5594482421875, + "step": 139 + }, + { + "ce_ib": 53.40571975708008, + "ce_orig": 0.6606719493865967, + "epoch": 0.039974117477892014, + "kl_loss": 2885.156005859375, + "loss_ib": 28.904964447021484, + "step": 139 + }, + { + "ce_ib": 54.047019958496094, + "ce_orig": 1.0017621517181396, + "epoch": 0.039974117477892014, + "kl_loss": 2731.3134765625, + "loss_ib": 27.3671817779541, + "step": 139 + }, + { + "ce_ib": 53.62920379638672, + "ce_orig": 0.9739691615104675, + "epoch": 0.039974117477892014, + "kl_loss": 3102.27099609375, + "loss_ib": 31.076339721679688, + "step": 139 + }, + { + "epoch": 0.04026170105686965, + "grad_norm": 459.9145202636719, + "learning_rate": 4.26751592356688e-06, + "loss": 31.4281, + "step": 140 + }, + { + "ce_ib": 54.69853973388672, + "ce_orig": 1.1423823833465576, + "epoch": 0.04026170105686965, + "kl_loss": 3194.418701171875, + "loss_ib": 31.998886108398438, + "step": 140 + }, + { + "ce_ib": 50.82587432861328, + "ce_orig": 0.6784489750862122, + "epoch": 0.04026170105686965, + "kl_loss": 3204.444091796875, + "loss_ib": 32.09526443481445, + "step": 140 + }, + { + "ce_ib": 51.895416259765625, + "ce_orig": 0.7560833096504211, + "epoch": 0.04026170105686965, + "kl_loss": 3291.0400390625, + "loss_ib": 32.96229553222656, + "step": 140 + }, + { + "ce_ib": 51.76190948486328, + "ce_orig": 0.8880281448364258, + "epoch": 0.04026170105686965, + "kl_loss": 3192.113037109375, + "loss_ib": 31.97289276123047, + "step": 140 + }, + { + "ce_ib": 55.869300842285156, + "ce_orig": 1.4632692337036133, + "epoch": 0.040549284635847296, + "kl_loss": 3255.39404296875, + "loss_ib": 32.60980987548828, + "step": 141 + }, + { + "ce_ib": 51.57696533203125, + "ce_orig": 1.1582895517349243, + "epoch": 0.040549284635847296, + "kl_loss": 3429.54150390625, + "loss_ib": 34.34699249267578, + "step": 141 + }, + { + "ce_ib": 53.07311248779297, + "ce_orig": 1.187181830406189, + "epoch": 0.040549284635847296, + "kl_loss": 3198.211669921875, + "loss_ib": 32.03519058227539, + "step": 141 + }, + { + "ce_ib": 55.36647415161133, + "ce_orig": 1.443948745727539, + "epoch": 0.040549284635847296, + "kl_loss": 2707.63525390625, + "loss_ib": 27.131717681884766, + "step": 141 + }, + { + "ce_ib": 53.37755584716797, + "ce_orig": 1.1925128698349, + "epoch": 0.040836868214824934, + "kl_loss": 2913.3408203125, + "loss_ib": 29.186784744262695, + "step": 142 + }, + { + "ce_ib": 50.930152893066406, + "ce_orig": 0.631533682346344, + "epoch": 0.040836868214824934, + "kl_loss": 3136.6171875, + "loss_ib": 31.41710090637207, + "step": 142 + }, + { + "ce_ib": 54.71884536743164, + "ce_orig": 0.2883818447589874, + "epoch": 0.040836868214824934, + "kl_loss": 1113.8369140625, + "loss_ib": 11.19308853149414, + "step": 142 + }, + { + "ce_ib": 53.51261901855469, + "ce_orig": 0.9670610427856445, + "epoch": 0.040836868214824934, + "kl_loss": 3197.89208984375, + "loss_ib": 32.032432556152344, + "step": 142 + }, + { + "ce_ib": 53.469810485839844, + "ce_orig": 0.9970093965530396, + "epoch": 0.04112445179380257, + "kl_loss": 3185.8349609375, + "loss_ib": 31.91181755065918, + "step": 143 + }, + { + "ce_ib": 52.19078826904297, + "ce_orig": 0.9311832785606384, + "epoch": 0.04112445179380257, + "kl_loss": 3028.153076171875, + "loss_ib": 30.33371925354004, + "step": 143 + }, + { + "ce_ib": 54.6331901550293, + "ce_orig": 1.481334924697876, + "epoch": 0.04112445179380257, + "kl_loss": 2883.3681640625, + "loss_ib": 28.888315200805664, + "step": 143 + }, + { + "ce_ib": 54.52168273925781, + "ce_orig": 1.2398836612701416, + "epoch": 0.04112445179380257, + "kl_loss": 2991.29931640625, + "loss_ib": 29.967514038085938, + "step": 143 + }, + { + "ce_ib": 55.38058090209961, + "ce_orig": 1.4223779439926147, + "epoch": 0.041412035372780216, + "kl_loss": 2755.46337890625, + "loss_ib": 27.610013961791992, + "step": 144 + }, + { + "ce_ib": 52.242977142333984, + "ce_orig": 0.357547402381897, + "epoch": 0.041412035372780216, + "kl_loss": 2976.813232421875, + "loss_ib": 29.820375442504883, + "step": 144 + }, + { + "ce_ib": 50.533851623535156, + "ce_orig": 0.5890440940856934, + "epoch": 0.041412035372780216, + "kl_loss": 3232.82861328125, + "loss_ib": 32.37881851196289, + "step": 144 + }, + { + "ce_ib": 50.53768539428711, + "ce_orig": 0.7130187153816223, + "epoch": 0.041412035372780216, + "kl_loss": 3210.93310546875, + "loss_ib": 32.15986633300781, + "step": 144 + }, + { + "epoch": 0.041699618951757854, + "grad_norm": 441.0871887207031, + "learning_rate": 4.426751592356688e-06, + "loss": 31.0274, + "step": 145 + }, + { + "ce_ib": 54.21467208862305, + "ce_orig": 1.196142315864563, + "epoch": 0.041699618951757854, + "kl_loss": 3077.82958984375, + "loss_ib": 30.832509994506836, + "step": 145 + }, + { + "ce_ib": 52.88565444946289, + "ce_orig": 0.6000714898109436, + "epoch": 0.041699618951757854, + "kl_loss": 3106.078369140625, + "loss_ib": 31.11366844177246, + "step": 145 + }, + { + "ce_ib": 52.295101165771484, + "ce_orig": 0.775937020778656, + "epoch": 0.041699618951757854, + "kl_loss": 3244.89892578125, + "loss_ib": 32.50128173828125, + "step": 145 + }, + { + "ce_ib": 52.64463424682617, + "ce_orig": 1.2740693092346191, + "epoch": 0.041699618951757854, + "kl_loss": 2585.204833984375, + "loss_ib": 25.904691696166992, + "step": 145 + }, + { + "ce_ib": 49.40985870361328, + "ce_orig": 0.8610404133796692, + "epoch": 0.0419872025307355, + "kl_loss": 3257.540771484375, + "loss_ib": 32.62481689453125, + "step": 146 + }, + { + "ce_ib": 52.093650817871094, + "ce_orig": 0.6095184683799744, + "epoch": 0.0419872025307355, + "kl_loss": 2127.02783203125, + "loss_ib": 21.322372436523438, + "step": 146 + }, + { + "ce_ib": 50.681373596191406, + "ce_orig": 0.5939872860908508, + "epoch": 0.0419872025307355, + "kl_loss": 3274.1162109375, + "loss_ib": 32.79184341430664, + "step": 146 + }, + { + "ce_ib": 52.19261169433594, + "ce_orig": 1.2758476734161377, + "epoch": 0.0419872025307355, + "kl_loss": 3013.127685546875, + "loss_ib": 30.183467864990234, + "step": 146 + }, + { + "ce_ib": 53.93763732910156, + "ce_orig": 1.4467533826828003, + "epoch": 0.042274786109713136, + "kl_loss": 2893.376953125, + "loss_ib": 28.987707138061523, + "step": 147 + }, + { + "ce_ib": 50.615875244140625, + "ce_orig": 0.8429998159408569, + "epoch": 0.042274786109713136, + "kl_loss": 3275.34423828125, + "loss_ib": 32.80405807495117, + "step": 147 + }, + { + "ce_ib": 51.70206832885742, + "ce_orig": 0.6785926222801208, + "epoch": 0.042274786109713136, + "kl_loss": 2999.413818359375, + "loss_ib": 30.045839309692383, + "step": 147 + }, + { + "ce_ib": 52.60684585571289, + "ce_orig": 0.7887744903564453, + "epoch": 0.042274786109713136, + "kl_loss": 2922.228515625, + "loss_ib": 29.274892807006836, + "step": 147 + }, + { + "ce_ib": 53.9986457824707, + "ce_orig": 1.4007996320724487, + "epoch": 0.042562369688690774, + "kl_loss": 2788.4990234375, + "loss_ib": 27.938987731933594, + "step": 148 + }, + { + "ce_ib": 51.32645034790039, + "ce_orig": 0.51473069190979, + "epoch": 0.042562369688690774, + "kl_loss": 3179.380126953125, + "loss_ib": 31.84512710571289, + "step": 148 + }, + { + "ce_ib": 50.38296127319336, + "ce_orig": 0.7060822248458862, + "epoch": 0.042562369688690774, + "kl_loss": 3014.38427734375, + "loss_ib": 30.194225311279297, + "step": 148 + }, + { + "ce_ib": 50.93301773071289, + "ce_orig": 0.6229360699653625, + "epoch": 0.042562369688690774, + "kl_loss": 2787.4912109375, + "loss_ib": 27.925844192504883, + "step": 148 + }, + { + "ce_ib": 50.93965148925781, + "ce_orig": 0.719894528388977, + "epoch": 0.04284995326766842, + "kl_loss": 3031.69287109375, + "loss_ib": 30.367868423461914, + "step": 149 + }, + { + "ce_ib": 54.47814178466797, + "ce_orig": 2.0223007202148438, + "epoch": 0.04284995326766842, + "kl_loss": 2659.58544921875, + "loss_ib": 26.650331497192383, + "step": 149 + }, + { + "ce_ib": 49.89313888549805, + "ce_orig": 0.8729849457740784, + "epoch": 0.04284995326766842, + "kl_loss": 2983.194580078125, + "loss_ib": 29.881837844848633, + "step": 149 + }, + { + "ce_ib": 52.88545608520508, + "ce_orig": 1.3256616592407227, + "epoch": 0.04284995326766842, + "kl_loss": 3132.82666015625, + "loss_ib": 31.38115119934082, + "step": 149 + }, + { + "epoch": 0.043137536846646056, + "grad_norm": 459.27581787109375, + "learning_rate": 4.585987261146497e-06, + "loss": 30.1884, + "step": 150 + }, + { + "ce_ib": 50.605384826660156, + "ce_orig": 1.0216200351715088, + "epoch": 0.043137536846646056, + "kl_loss": 3048.3466796875, + "loss_ib": 30.53407096862793, + "step": 150 + }, + { + "ce_ib": 52.01803207397461, + "ce_orig": 1.3228956460952759, + "epoch": 0.043137536846646056, + "kl_loss": 2962.606201171875, + "loss_ib": 29.67807960510254, + "step": 150 + }, + { + "ce_ib": 53.09737014770508, + "ce_orig": 0.548643946647644, + "epoch": 0.043137536846646056, + "kl_loss": 2902.078125, + "loss_ib": 29.07387924194336, + "step": 150 + }, + { + "ce_ib": 55.06275177001953, + "ce_orig": 1.2563978433609009, + "epoch": 0.043137536846646056, + "kl_loss": 3022.199951171875, + "loss_ib": 30.277061462402344, + "step": 150 + }, + { + "ce_ib": 51.54874801635742, + "ce_orig": 0.9161649942398071, + "epoch": 0.043425120425623694, + "kl_loss": 3167.26806640625, + "loss_ib": 31.724227905273438, + "step": 151 + }, + { + "ce_ib": 55.47027587890625, + "ce_orig": 2.167449712753296, + "epoch": 0.043425120425623694, + "kl_loss": 2913.7783203125, + "loss_ib": 29.193254470825195, + "step": 151 + }, + { + "ce_ib": 52.68526077270508, + "ce_orig": 0.6615663766860962, + "epoch": 0.043425120425623694, + "kl_loss": 2900.98974609375, + "loss_ib": 29.06258201599121, + "step": 151 + }, + { + "ce_ib": 52.60275650024414, + "ce_orig": 1.29401695728302, + "epoch": 0.043425120425623694, + "kl_loss": 2888.08935546875, + "loss_ib": 28.933494567871094, + "step": 151 + }, + { + "ce_ib": 52.2644157409668, + "ce_orig": 0.8575372695922852, + "epoch": 0.04371270400460134, + "kl_loss": 2958.155029296875, + "loss_ib": 29.63381576538086, + "step": 152 + }, + { + "ce_ib": 53.492984771728516, + "ce_orig": 1.9527488946914673, + "epoch": 0.04371270400460134, + "kl_loss": 2661.2138671875, + "loss_ib": 26.665632247924805, + "step": 152 + }, + { + "ce_ib": 51.27360153198242, + "ce_orig": 0.8329764604568481, + "epoch": 0.04371270400460134, + "kl_loss": 3091.322265625, + "loss_ib": 30.964496612548828, + "step": 152 + }, + { + "ce_ib": 50.547359466552734, + "ce_orig": 0.8471105694770813, + "epoch": 0.04371270400460134, + "kl_loss": 3160.12841796875, + "loss_ib": 31.651830673217773, + "step": 152 + }, + { + "ce_ib": 54.09409713745117, + "ce_orig": 1.3073227405548096, + "epoch": 0.044000287583578976, + "kl_loss": 2638.63916015625, + "loss_ib": 26.44048500061035, + "step": 153 + }, + { + "ce_ib": 51.90098571777344, + "ce_orig": 1.1256515979766846, + "epoch": 0.044000287583578976, + "kl_loss": 2839.703857421875, + "loss_ib": 28.44894027709961, + "step": 153 + }, + { + "ce_ib": 52.50508499145508, + "ce_orig": 1.0821564197540283, + "epoch": 0.044000287583578976, + "kl_loss": 2833.94873046875, + "loss_ib": 28.391990661621094, + "step": 153 + }, + { + "ce_ib": 50.86665344238281, + "ce_orig": 0.8515676259994507, + "epoch": 0.044000287583578976, + "kl_loss": 3101.07177734375, + "loss_ib": 31.06158447265625, + "step": 153 + }, + { + "ce_ib": 50.96135711669922, + "ce_orig": 0.7307798862457275, + "epoch": 0.04428787116255662, + "kl_loss": 2950.802490234375, + "loss_ib": 29.55898666381836, + "step": 154 + }, + { + "ce_ib": 48.63287353515625, + "ce_orig": 0.43459776043891907, + "epoch": 0.04428787116255662, + "kl_loss": 2904.9091796875, + "loss_ib": 29.09772300720215, + "step": 154 + }, + { + "ce_ib": 52.380332946777344, + "ce_orig": 1.771660566329956, + "epoch": 0.04428787116255662, + "kl_loss": 2897.521240234375, + "loss_ib": 29.027591705322266, + "step": 154 + }, + { + "ce_ib": 51.10393524169922, + "ce_orig": 0.7512515783309937, + "epoch": 0.04428787116255662, + "kl_loss": 2788.21533203125, + "loss_ib": 27.933256149291992, + "step": 154 + }, + { + "epoch": 0.04457545474153426, + "grad_norm": 448.2861022949219, + "learning_rate": 4.745222929936306e-06, + "loss": 30.0583, + "step": 155 + }, + { + "ce_ib": 49.585208892822266, + "ce_orig": 1.1462219953536987, + "epoch": 0.04457545474153426, + "kl_loss": 3172.34619140625, + "loss_ib": 31.773046493530273, + "step": 155 + }, + { + "ce_ib": 52.14706802368164, + "ce_orig": 1.3412156105041504, + "epoch": 0.04457545474153426, + "kl_loss": 2451.83447265625, + "loss_ib": 24.57048988342285, + "step": 155 + }, + { + "ce_ib": 50.7850341796875, + "ce_orig": 0.6238870620727539, + "epoch": 0.04457545474153426, + "kl_loss": 2603.59716796875, + "loss_ib": 26.08675765991211, + "step": 155 + }, + { + "ce_ib": 53.027767181396484, + "ce_orig": 1.0817673206329346, + "epoch": 0.04457545474153426, + "kl_loss": 2944.25390625, + "loss_ib": 29.49556541442871, + "step": 155 + }, + { + "ce_ib": 52.55374526977539, + "ce_orig": 0.7729134559631348, + "epoch": 0.044863038320511896, + "kl_loss": 2258.596435546875, + "loss_ib": 22.638517379760742, + "step": 156 + }, + { + "ce_ib": 50.43128967285156, + "ce_orig": 0.5304668545722961, + "epoch": 0.044863038320511896, + "kl_loss": 2332.14794921875, + "loss_ib": 23.371912002563477, + "step": 156 + }, + { + "ce_ib": 48.52753448486328, + "ce_orig": 0.8267412185668945, + "epoch": 0.044863038320511896, + "kl_loss": 2993.38134765625, + "loss_ib": 29.98233985900879, + "step": 156 + }, + { + "ce_ib": 47.48508834838867, + "ce_orig": 0.8616426587104797, + "epoch": 0.044863038320511896, + "kl_loss": 2922.7626953125, + "loss_ib": 29.27511215209961, + "step": 156 + }, + { + "ce_ib": 48.53927230834961, + "ce_orig": 0.7964724898338318, + "epoch": 0.04515062189948954, + "kl_loss": 2892.180419921875, + "loss_ib": 28.97034454345703, + "step": 157 + }, + { + "ce_ib": 48.53841018676758, + "ce_orig": 0.7484610080718994, + "epoch": 0.04515062189948954, + "kl_loss": 2939.5947265625, + "loss_ib": 29.44448471069336, + "step": 157 + }, + { + "ce_ib": 47.32265090942383, + "ce_orig": 0.5679759979248047, + "epoch": 0.04515062189948954, + "kl_loss": 3089.84228515625, + "loss_ib": 30.945743560791016, + "step": 157 + }, + { + "ce_ib": 50.060791015625, + "ce_orig": 0.9825291633605957, + "epoch": 0.04515062189948954, + "kl_loss": 2617.75341796875, + "loss_ib": 26.22759437561035, + "step": 157 + }, + { + "ce_ib": 54.85032272338867, + "ce_orig": 1.4325385093688965, + "epoch": 0.04543820547846718, + "kl_loss": 2687.5576171875, + "loss_ib": 26.93042755126953, + "step": 158 + }, + { + "ce_ib": 51.16709899902344, + "ce_orig": 1.104825735092163, + "epoch": 0.04543820547846718, + "kl_loss": 2728.729736328125, + "loss_ib": 27.338462829589844, + "step": 158 + }, + { + "ce_ib": 51.86174774169922, + "ce_orig": 1.2090163230895996, + "epoch": 0.04543820547846718, + "kl_loss": 2632.31298828125, + "loss_ib": 26.37499237060547, + "step": 158 + }, + { + "ce_ib": 50.25190734863281, + "ce_orig": 0.7247406244277954, + "epoch": 0.04543820547846718, + "kl_loss": 2597.2509765625, + "loss_ib": 26.02276039123535, + "step": 158 + }, + { + "ce_ib": 50.78385543823242, + "ce_orig": 0.8480434417724609, + "epoch": 0.04572578905744482, + "kl_loss": 2646.48876953125, + "loss_ib": 26.515670776367188, + "step": 159 + }, + { + "ce_ib": 52.23230743408203, + "ce_orig": 1.2253400087356567, + "epoch": 0.04572578905744482, + "kl_loss": 2775.04931640625, + "loss_ib": 27.802724838256836, + "step": 159 + }, + { + "ce_ib": 54.1110954284668, + "ce_orig": 1.6037498712539673, + "epoch": 0.04572578905744482, + "kl_loss": 2728.93798828125, + "loss_ib": 27.343490600585938, + "step": 159 + }, + { + "ce_ib": 49.18448257446289, + "ce_orig": 0.9601776599884033, + "epoch": 0.04572578905744482, + "kl_loss": 2112.19775390625, + "loss_ib": 21.171161651611328, + "step": 159 + }, + { + "epoch": 0.04601337263642246, + "grad_norm": 445.6031799316406, + "learning_rate": 4.904458598726115e-06, + "loss": 28.9221, + "step": 160 + }, + { + "ce_ib": 50.62841033935547, + "ce_orig": 0.9100584387779236, + "epoch": 0.04601337263642246, + "kl_loss": 2742.574462890625, + "loss_ib": 27.47637176513672, + "step": 160 + }, + { + "ce_ib": 51.763214111328125, + "ce_orig": 1.2738044261932373, + "epoch": 0.04601337263642246, + "kl_loss": 2890.77099609375, + "loss_ib": 28.95947265625, + "step": 160 + }, + { + "ce_ib": 48.35606384277344, + "ce_orig": 0.8663270473480225, + "epoch": 0.04601337263642246, + "kl_loss": 2612.95361328125, + "loss_ib": 26.177892684936523, + "step": 160 + }, + { + "ce_ib": 48.03151321411133, + "ce_orig": 1.0263557434082031, + "epoch": 0.04601337263642246, + "kl_loss": 3139.1376953125, + "loss_ib": 31.439409255981445, + "step": 160 + }, + { + "ce_ib": 48.87652587890625, + "ce_orig": 0.6928001046180725, + "epoch": 0.0463009562154001, + "kl_loss": 2784.03466796875, + "loss_ib": 27.889223098754883, + "step": 161 + }, + { + "ce_ib": 48.439903259277344, + "ce_orig": 1.0835696458816528, + "epoch": 0.0463009562154001, + "kl_loss": 3102.0654296875, + "loss_ib": 31.069093704223633, + "step": 161 + }, + { + "ce_ib": 50.39353561401367, + "ce_orig": 0.7698415517807007, + "epoch": 0.0463009562154001, + "kl_loss": 1483.0360107421875, + "loss_ib": 14.880752563476562, + "step": 161 + }, + { + "ce_ib": 51.16973876953125, + "ce_orig": 1.4212281703948975, + "epoch": 0.0463009562154001, + "kl_loss": 2305.23681640625, + "loss_ib": 23.10353660583496, + "step": 161 + }, + { + "ce_ib": 49.88762283325195, + "ce_orig": 0.6684384346008301, + "epoch": 0.04658853979437774, + "kl_loss": 2829.216796875, + "loss_ib": 28.34205436706543, + "step": 162 + }, + { + "ce_ib": 47.18947219848633, + "ce_orig": 1.0846861600875854, + "epoch": 0.04658853979437774, + "kl_loss": 2960.41796875, + "loss_ib": 29.651369094848633, + "step": 162 + }, + { + "ce_ib": 48.81897735595703, + "ce_orig": 0.5415892004966736, + "epoch": 0.04658853979437774, + "kl_loss": 2178.2265625, + "loss_ib": 21.831083297729492, + "step": 162 + }, + { + "ce_ib": 48.079383850097656, + "ce_orig": 0.9551630020141602, + "epoch": 0.04658853979437774, + "kl_loss": 3026.81884765625, + "loss_ib": 30.316267013549805, + "step": 162 + }, + { + "ce_ib": 48.57440185546875, + "ce_orig": 0.8759099245071411, + "epoch": 0.04687612337335538, + "kl_loss": 2439.7685546875, + "loss_ib": 24.446258544921875, + "step": 163 + }, + { + "ce_ib": 49.86715316772461, + "ce_orig": 1.0347627401351929, + "epoch": 0.04687612337335538, + "kl_loss": 2655.91357421875, + "loss_ib": 26.6090030670166, + "step": 163 + }, + { + "ce_ib": 49.29155731201172, + "ce_orig": 0.8745110034942627, + "epoch": 0.04687612337335538, + "kl_loss": 2898.206298828125, + "loss_ib": 29.031354904174805, + "step": 163 + }, + { + "ce_ib": 50.21908950805664, + "ce_orig": 1.458777904510498, + "epoch": 0.04687612337335538, + "kl_loss": 2556.103515625, + "loss_ib": 25.61125373840332, + "step": 163 + }, + { + "ce_ib": 48.91142654418945, + "ce_orig": 0.6749159693717957, + "epoch": 0.047163706952333025, + "kl_loss": 2708.091796875, + "loss_ib": 27.12982940673828, + "step": 164 + }, + { + "ce_ib": 50.982364654541016, + "ce_orig": 0.6340957283973694, + "epoch": 0.047163706952333025, + "kl_loss": 2460.154296875, + "loss_ib": 24.652523040771484, + "step": 164 + }, + { + "ce_ib": 51.6622428894043, + "ce_orig": 0.8664228320121765, + "epoch": 0.047163706952333025, + "kl_loss": 2626.44091796875, + "loss_ib": 26.316070556640625, + "step": 164 + }, + { + "ce_ib": 48.696929931640625, + "ce_orig": 0.46712541580200195, + "epoch": 0.047163706952333025, + "kl_loss": 2673.508544921875, + "loss_ib": 26.783781051635742, + "step": 164 + }, + { + "epoch": 0.04745129053131066, + "grad_norm": 424.326904296875, + "learning_rate": 5.063694267515924e-06, + "loss": 28.6858, + "step": 165 + }, + { + "ce_ib": 47.32035827636719, + "ce_orig": 0.7065096497535706, + "epoch": 0.04745129053131066, + "kl_loss": 2612.828857421875, + "loss_ib": 26.175607681274414, + "step": 165 + }, + { + "ce_ib": 53.695343017578125, + "ce_orig": 0.9257593154907227, + "epoch": 0.04745129053131066, + "kl_loss": 2765.24658203125, + "loss_ib": 27.706161499023438, + "step": 165 + }, + { + "ce_ib": 46.061485290527344, + "ce_orig": 0.6095507740974426, + "epoch": 0.04745129053131066, + "kl_loss": 2924.33349609375, + "loss_ib": 29.289396286010742, + "step": 165 + }, + { + "ce_ib": 49.95652770996094, + "ce_orig": 0.39805570244789124, + "epoch": 0.04745129053131066, + "kl_loss": 2691.94287109375, + "loss_ib": 26.969383239746094, + "step": 165 + }, + { + "ce_ib": 48.33354187011719, + "ce_orig": 1.066157579421997, + "epoch": 0.0477388741102883, + "kl_loss": 2646.9306640625, + "loss_ib": 26.51763916015625, + "step": 166 + }, + { + "ce_ib": 44.15623474121094, + "ce_orig": 0.12060567736625671, + "epoch": 0.0477388741102883, + "kl_loss": 1492.6593017578125, + "loss_ib": 14.970748901367188, + "step": 166 + }, + { + "ce_ib": 49.3094367980957, + "ce_orig": 1.4275071620941162, + "epoch": 0.0477388741102883, + "kl_loss": 2720.3349609375, + "loss_ib": 27.25265884399414, + "step": 166 + }, + { + "ce_ib": 48.710384368896484, + "ce_orig": 0.6945171356201172, + "epoch": 0.0477388741102883, + "kl_loss": 2963.2177734375, + "loss_ib": 29.68088722229004, + "step": 166 + }, + { + "ce_ib": 50.00297927856445, + "ce_orig": 1.062921404838562, + "epoch": 0.048026457689265944, + "kl_loss": 2493.588134765625, + "loss_ib": 24.985883712768555, + "step": 167 + }, + { + "ce_ib": 48.40403366088867, + "ce_orig": 0.8593358397483826, + "epoch": 0.048026457689265944, + "kl_loss": 2742.294921875, + "loss_ib": 27.471351623535156, + "step": 167 + }, + { + "ce_ib": 46.95838165283203, + "ce_orig": 0.5543254017829895, + "epoch": 0.048026457689265944, + "kl_loss": 2647.65576171875, + "loss_ib": 26.523515701293945, + "step": 167 + }, + { + "ce_ib": 48.49465560913086, + "ce_orig": 0.9735248684883118, + "epoch": 0.048026457689265944, + "kl_loss": 2853.06201171875, + "loss_ib": 28.579113006591797, + "step": 167 + }, + { + "ce_ib": 47.27997589111328, + "ce_orig": 0.986024022102356, + "epoch": 0.04831404126824358, + "kl_loss": 2900.328369140625, + "loss_ib": 29.05056381225586, + "step": 168 + }, + { + "ce_ib": 47.31760025024414, + "ce_orig": 0.7703031897544861, + "epoch": 0.04831404126824358, + "kl_loss": 2637.726318359375, + "loss_ib": 26.424579620361328, + "step": 168 + }, + { + "ce_ib": 48.37574768066406, + "ce_orig": 0.9344309568405151, + "epoch": 0.04831404126824358, + "kl_loss": 2614.62939453125, + "loss_ib": 26.194669723510742, + "step": 168 + }, + { + "ce_ib": 46.49268341064453, + "ce_orig": 0.7439426183700562, + "epoch": 0.04831404126824358, + "kl_loss": 2807.43408203125, + "loss_ib": 28.120832443237305, + "step": 168 + }, + { + "ce_ib": 48.985836029052734, + "ce_orig": 1.2297948598861694, + "epoch": 0.04860162484722123, + "kl_loss": 2628.673828125, + "loss_ib": 26.335723876953125, + "step": 169 + }, + { + "ce_ib": 47.281558990478516, + "ce_orig": 1.0321601629257202, + "epoch": 0.04860162484722123, + "kl_loss": 2619.60400390625, + "loss_ib": 26.243322372436523, + "step": 169 + }, + { + "ce_ib": 47.78618621826172, + "ce_orig": 1.0097578763961792, + "epoch": 0.04860162484722123, + "kl_loss": 2793.48046875, + "loss_ib": 27.982589721679688, + "step": 169 + }, + { + "ce_ib": 47.44657897949219, + "ce_orig": 1.0025876760482788, + "epoch": 0.04860162484722123, + "kl_loss": 2977.064453125, + "loss_ib": 29.818090438842773, + "step": 169 + }, + { + "epoch": 0.048889208426198864, + "grad_norm": 426.09820556640625, + "learning_rate": 5.222929936305733e-06, + "loss": 27.4107, + "step": 170 + }, + { + "ce_ib": 48.28227996826172, + "ce_orig": 1.2277311086654663, + "epoch": 0.048889208426198864, + "kl_loss": 2577.318359375, + "loss_ib": 25.82146453857422, + "step": 170 + }, + { + "ce_ib": 48.456058502197266, + "ce_orig": 1.0684230327606201, + "epoch": 0.048889208426198864, + "kl_loss": 2181.768798828125, + "loss_ib": 21.86614418029785, + "step": 170 + }, + { + "ce_ib": 46.85710906982422, + "ce_orig": 0.6369960308074951, + "epoch": 0.048889208426198864, + "kl_loss": 2463.142578125, + "loss_ib": 24.67828369140625, + "step": 170 + }, + { + "ce_ib": 50.36212921142578, + "ce_orig": 1.4817943572998047, + "epoch": 0.048889208426198864, + "kl_loss": 2810.15185546875, + "loss_ib": 28.151878356933594, + "step": 170 + }, + { + "ce_ib": 52.663734436035156, + "ce_orig": 1.3452659845352173, + "epoch": 0.0491767920051765, + "kl_loss": 2626.7080078125, + "loss_ib": 26.31974220275879, + "step": 171 + }, + { + "ce_ib": 44.656044006347656, + "ce_orig": 0.6703915596008301, + "epoch": 0.0491767920051765, + "kl_loss": 2806.165771484375, + "loss_ib": 28.106313705444336, + "step": 171 + }, + { + "ce_ib": 49.89923858642578, + "ce_orig": 0.8543052673339844, + "epoch": 0.0491767920051765, + "kl_loss": 2713.99462890625, + "loss_ib": 27.18984603881836, + "step": 171 + }, + { + "ce_ib": 43.8620719909668, + "ce_orig": 0.943006157875061, + "epoch": 0.0491767920051765, + "kl_loss": 2850.892578125, + "loss_ib": 28.55278778076172, + "step": 171 + }, + { + "ce_ib": 49.70503234863281, + "ce_orig": 1.0421463251113892, + "epoch": 0.04946437558415415, + "kl_loss": 2539.814453125, + "loss_ib": 25.44784927368164, + "step": 172 + }, + { + "ce_ib": 46.116146087646484, + "ce_orig": 0.8185178637504578, + "epoch": 0.04946437558415415, + "kl_loss": 2844.80224609375, + "loss_ib": 28.494136810302734, + "step": 172 + }, + { + "ce_ib": 48.9494514465332, + "ce_orig": 1.0492463111877441, + "epoch": 0.04946437558415415, + "kl_loss": 2648.58056640625, + "loss_ib": 26.53475570678711, + "step": 172 + }, + { + "ce_ib": 45.931827545166016, + "ce_orig": 0.6465027332305908, + "epoch": 0.04946437558415415, + "kl_loss": 2837.117919921875, + "loss_ib": 28.417110443115234, + "step": 172 + }, + { + "ce_ib": 46.874114990234375, + "ce_orig": 0.8541361093521118, + "epoch": 0.049751959163131784, + "kl_loss": 2705.7646484375, + "loss_ib": 27.104520797729492, + "step": 173 + }, + { + "ce_ib": 49.18909454345703, + "ce_orig": 0.8849540948867798, + "epoch": 0.049751959163131784, + "kl_loss": 2525.669921875, + "loss_ib": 25.30588722229004, + "step": 173 + }, + { + "ce_ib": 49.24184036254883, + "ce_orig": 1.275468111038208, + "epoch": 0.049751959163131784, + "kl_loss": 2448.7724609375, + "loss_ib": 24.536964416503906, + "step": 173 + }, + { + "ce_ib": 48.2338981628418, + "ce_orig": 1.5680046081542969, + "epoch": 0.049751959163131784, + "kl_loss": 2418.929443359375, + "loss_ib": 24.23752784729004, + "step": 173 + }, + { + "ce_ib": 46.14396667480469, + "ce_orig": 0.541758120059967, + "epoch": 0.05003954274210943, + "kl_loss": 1417.190185546875, + "loss_ib": 14.218045234680176, + "step": 174 + }, + { + "ce_ib": 45.08213806152344, + "ce_orig": 1.0284433364868164, + "epoch": 0.05003954274210943, + "kl_loss": 2671.77490234375, + "loss_ib": 26.76283073425293, + "step": 174 + }, + { + "ce_ib": 47.65272903442383, + "ce_orig": 0.7605993151664734, + "epoch": 0.05003954274210943, + "kl_loss": 2763.138916015625, + "loss_ib": 27.67904281616211, + "step": 174 + }, + { + "ce_ib": 45.709381103515625, + "ce_orig": 0.6788672208786011, + "epoch": 0.05003954274210943, + "kl_loss": 2476.36376953125, + "loss_ib": 24.80934715270996, + "step": 174 + }, + { + "epoch": 0.050327126321087066, + "grad_norm": 413.482666015625, + "learning_rate": 5.3821656050955415e-06, + "loss": 27.0741, + "step": 175 + }, + { + "ce_ib": 50.374427795410156, + "ce_orig": 1.4046454429626465, + "epoch": 0.050327126321087066, + "kl_loss": 2682.432373046875, + "loss_ib": 26.874696731567383, + "step": 175 + }, + { + "ce_ib": 45.06935119628906, + "ce_orig": 0.6166799068450928, + "epoch": 0.050327126321087066, + "kl_loss": 2835.7060546875, + "loss_ib": 28.402128219604492, + "step": 175 + }, + { + "ce_ib": 48.812774658203125, + "ce_orig": 0.8136134147644043, + "epoch": 0.050327126321087066, + "kl_loss": 2307.1552734375, + "loss_ib": 23.120365142822266, + "step": 175 + }, + { + "ce_ib": 46.97080612182617, + "ce_orig": 0.8589736819267273, + "epoch": 0.050327126321087066, + "kl_loss": 2857.4111328125, + "loss_ib": 28.62108039855957, + "step": 175 + }, + { + "ce_ib": 47.42732238769531, + "ce_orig": 0.9090732336044312, + "epoch": 0.050614709900064704, + "kl_loss": 2458.442138671875, + "loss_ib": 24.631847381591797, + "step": 176 + }, + { + "ce_ib": 48.879520416259766, + "ce_orig": 1.1093182563781738, + "epoch": 0.050614709900064704, + "kl_loss": 2716.2275390625, + "loss_ib": 27.21115493774414, + "step": 176 + }, + { + "ce_ib": 45.72584915161133, + "ce_orig": 0.7976894378662109, + "epoch": 0.050614709900064704, + "kl_loss": 2734.431640625, + "loss_ib": 27.39004135131836, + "step": 176 + }, + { + "ce_ib": 48.863277435302734, + "ce_orig": 1.1800131797790527, + "epoch": 0.050614709900064704, + "kl_loss": 2549.98583984375, + "loss_ib": 25.54871940612793, + "step": 176 + }, + { + "ce_ib": 49.88660430908203, + "ce_orig": 1.3869460821151733, + "epoch": 0.05090229347904235, + "kl_loss": 2348.32080078125, + "loss_ib": 23.53309440612793, + "step": 177 + }, + { + "ce_ib": 46.74383544921875, + "ce_orig": 0.861750602722168, + "epoch": 0.05090229347904235, + "kl_loss": 2821.16943359375, + "loss_ib": 28.25843620300293, + "step": 177 + }, + { + "ce_ib": 46.5212516784668, + "ce_orig": 0.9110401272773743, + "epoch": 0.05090229347904235, + "kl_loss": 1920.51806640625, + "loss_ib": 19.25170135498047, + "step": 177 + }, + { + "ce_ib": 45.50922775268555, + "ce_orig": 0.762277364730835, + "epoch": 0.05090229347904235, + "kl_loss": 2755.8955078125, + "loss_ib": 27.60446548461914, + "step": 177 + }, + { + "ce_ib": 45.10780715942383, + "ce_orig": 0.9310659170150757, + "epoch": 0.051189877058019986, + "kl_loss": 2471.52685546875, + "loss_ib": 24.7603759765625, + "step": 178 + }, + { + "ce_ib": 46.319915771484375, + "ce_orig": 0.8169469237327576, + "epoch": 0.051189877058019986, + "kl_loss": 2594.20947265625, + "loss_ib": 25.988414764404297, + "step": 178 + }, + { + "ce_ib": 49.259620666503906, + "ce_orig": 1.4122384786605835, + "epoch": 0.051189877058019986, + "kl_loss": 2652.73583984375, + "loss_ib": 26.576618194580078, + "step": 178 + }, + { + "ce_ib": 47.23049545288086, + "ce_orig": 1.2408967018127441, + "epoch": 0.051189877058019986, + "kl_loss": 2727.913818359375, + "loss_ib": 27.32636833190918, + "step": 178 + }, + { + "ce_ib": 48.52485656738281, + "ce_orig": 0.561891496181488, + "epoch": 0.051477460636997624, + "kl_loss": 2130.169921875, + "loss_ib": 21.350223541259766, + "step": 179 + }, + { + "ce_ib": 51.518375396728516, + "ce_orig": 1.9220662117004395, + "epoch": 0.051477460636997624, + "kl_loss": 2634.577392578125, + "loss_ib": 26.39729118347168, + "step": 179 + }, + { + "ce_ib": 45.864845275878906, + "ce_orig": 0.9671883583068848, + "epoch": 0.051477460636997624, + "kl_loss": 2693.470458984375, + "loss_ib": 26.980567932128906, + "step": 179 + }, + { + "ce_ib": 45.78055191040039, + "ce_orig": 0.9218305349349976, + "epoch": 0.051477460636997624, + "kl_loss": 2529.023681640625, + "loss_ib": 25.336017608642578, + "step": 179 + }, + { + "epoch": 0.05176504421597527, + "grad_norm": 409.18316650390625, + "learning_rate": 5.541401273885351e-06, + "loss": 26.2136, + "step": 180 + }, + { + "ce_ib": 47.819671630859375, + "ce_orig": 1.0385197401046753, + "epoch": 0.05176504421597527, + "kl_loss": 2495.19580078125, + "loss_ib": 24.99977684020996, + "step": 180 + }, + { + "ce_ib": 48.391448974609375, + "ce_orig": 1.5398671627044678, + "epoch": 0.05176504421597527, + "kl_loss": 2459.814453125, + "loss_ib": 24.646535873413086, + "step": 180 + }, + { + "ce_ib": 45.70133590698242, + "ce_orig": 1.4346660375595093, + "epoch": 0.05176504421597527, + "kl_loss": 2470.22998046875, + "loss_ib": 24.74799919128418, + "step": 180 + }, + { + "ce_ib": 45.42562484741211, + "ce_orig": 1.027616262435913, + "epoch": 0.05176504421597527, + "kl_loss": 2735.41064453125, + "loss_ib": 27.3995304107666, + "step": 180 + }, + { + "ce_ib": 46.22211456298828, + "ce_orig": 1.2898683547973633, + "epoch": 0.052052627794952906, + "kl_loss": 2344.324462890625, + "loss_ib": 23.48946762084961, + "step": 181 + }, + { + "ce_ib": 45.933162689208984, + "ce_orig": 0.3807089924812317, + "epoch": 0.052052627794952906, + "kl_loss": 2405.65283203125, + "loss_ib": 24.102460861206055, + "step": 181 + }, + { + "ce_ib": 48.467613220214844, + "ce_orig": 1.0768738985061646, + "epoch": 0.052052627794952906, + "kl_loss": 2349.7158203125, + "loss_ib": 23.545623779296875, + "step": 181 + }, + { + "ce_ib": 43.66925811767578, + "ce_orig": 0.540174126625061, + "epoch": 0.052052627794952906, + "kl_loss": 2476.49609375, + "loss_ib": 24.808629989624023, + "step": 181 + }, + { + "ce_ib": 50.432796478271484, + "ce_orig": 1.4778696298599243, + "epoch": 0.05234021137393055, + "kl_loss": 2345.696533203125, + "loss_ib": 23.507396697998047, + "step": 182 + }, + { + "ce_ib": 44.97416687011719, + "ce_orig": 0.8222552537918091, + "epoch": 0.05234021137393055, + "kl_loss": 2430.04833984375, + "loss_ib": 24.345455169677734, + "step": 182 + }, + { + "ce_ib": 48.673431396484375, + "ce_orig": 1.0176633596420288, + "epoch": 0.05234021137393055, + "kl_loss": 2423.87109375, + "loss_ib": 24.287384033203125, + "step": 182 + }, + { + "ce_ib": 44.51708221435547, + "ce_orig": 0.6682273745536804, + "epoch": 0.05234021137393055, + "kl_loss": 2574.65380859375, + "loss_ib": 25.791053771972656, + "step": 182 + }, + { + "ce_ib": 46.51145935058594, + "ce_orig": 1.098725438117981, + "epoch": 0.05262779495290819, + "kl_loss": 2416.0830078125, + "loss_ib": 24.207340240478516, + "step": 183 + }, + { + "ce_ib": 48.851741790771484, + "ce_orig": 1.3513818979263306, + "epoch": 0.05262779495290819, + "kl_loss": 2428.113037109375, + "loss_ib": 24.32998275756836, + "step": 183 + }, + { + "ce_ib": 44.624210357666016, + "ce_orig": 0.6521418690681458, + "epoch": 0.05262779495290819, + "kl_loss": 2469.832275390625, + "loss_ib": 24.74294662475586, + "step": 183 + }, + { + "ce_ib": 48.94157791137695, + "ce_orig": 1.2012220621109009, + "epoch": 0.05262779495290819, + "kl_loss": 2484.021484375, + "loss_ib": 24.8891544342041, + "step": 183 + }, + { + "ce_ib": 47.77019500732422, + "ce_orig": 0.8514222502708435, + "epoch": 0.052915378531885826, + "kl_loss": 2497.58349609375, + "loss_ib": 25.023605346679688, + "step": 184 + }, + { + "ce_ib": 46.87969970703125, + "ce_orig": 1.339136004447937, + "epoch": 0.052915378531885826, + "kl_loss": 2096.02587890625, + "loss_ib": 21.007137298583984, + "step": 184 + }, + { + "ce_ib": 48.20975875854492, + "ce_orig": 1.381858229637146, + "epoch": 0.052915378531885826, + "kl_loss": 2225.2705078125, + "loss_ib": 22.300914764404297, + "step": 184 + }, + { + "ce_ib": 48.024993896484375, + "ce_orig": 0.6926367282867432, + "epoch": 0.052915378531885826, + "kl_loss": 2289.90087890625, + "loss_ib": 22.947032928466797, + "step": 184 + }, + { + "epoch": 0.05320296211086347, + "grad_norm": 394.3677062988281, + "learning_rate": 5.7006369426751594e-06, + "loss": 25.7313, + "step": 185 + }, + { + "ce_ib": 46.50983428955078, + "ce_orig": 1.1019322872161865, + "epoch": 0.05320296211086347, + "kl_loss": 2536.72998046875, + "loss_ib": 25.413808822631836, + "step": 185 + }, + { + "ce_ib": 44.686187744140625, + "ce_orig": 0.7587331533432007, + "epoch": 0.05320296211086347, + "kl_loss": 2358.890380859375, + "loss_ib": 23.633588790893555, + "step": 185 + }, + { + "ce_ib": 44.013580322265625, + "ce_orig": 0.7084860801696777, + "epoch": 0.05320296211086347, + "kl_loss": 2679.59033203125, + "loss_ib": 26.839916229248047, + "step": 185 + }, + { + "ce_ib": 43.398311614990234, + "ce_orig": 0.9784666299819946, + "epoch": 0.05320296211086347, + "kl_loss": 2370.73291015625, + "loss_ib": 23.7507266998291, + "step": 185 + }, + { + "ce_ib": 44.2143669128418, + "ce_orig": 0.786540687084198, + "epoch": 0.05349054568984111, + "kl_loss": 2228.3095703125, + "loss_ib": 22.327308654785156, + "step": 186 + }, + { + "ce_ib": 46.3162727355957, + "ce_orig": 1.3913073539733887, + "epoch": 0.05349054568984111, + "kl_loss": 2373.01953125, + "loss_ib": 23.77651023864746, + "step": 186 + }, + { + "ce_ib": 46.72264862060547, + "ce_orig": 1.0480103492736816, + "epoch": 0.05349054568984111, + "kl_loss": 2314.23193359375, + "loss_ib": 23.189043045043945, + "step": 186 + }, + { + "ce_ib": 48.647151947021484, + "ce_orig": 1.5039794445037842, + "epoch": 0.05349054568984111, + "kl_loss": 1132.71826171875, + "loss_ib": 11.375829696655273, + "step": 186 + }, + { + "ce_ib": 43.383331298828125, + "ce_orig": 0.5294433832168579, + "epoch": 0.05377812926881875, + "kl_loss": 1875.989990234375, + "loss_ib": 18.80328369140625, + "step": 187 + }, + { + "ce_ib": 46.24761962890625, + "ce_orig": 0.8533762693405151, + "epoch": 0.05377812926881875, + "kl_loss": 2202.668701171875, + "loss_ib": 22.072933197021484, + "step": 187 + }, + { + "ce_ib": 43.863502502441406, + "ce_orig": 1.4904100894927979, + "epoch": 0.05377812926881875, + "kl_loss": 2434.986083984375, + "loss_ib": 24.393722534179688, + "step": 187 + }, + { + "ce_ib": 44.60874557495117, + "ce_orig": 0.33239492774009705, + "epoch": 0.05377812926881875, + "kl_loss": 2029.0833740234375, + "loss_ib": 20.33544158935547, + "step": 187 + }, + { + "ce_ib": 43.16314697265625, + "ce_orig": 0.7957232594490051, + "epoch": 0.05406571284779639, + "kl_loss": 2502.121826171875, + "loss_ib": 25.064382553100586, + "step": 188 + }, + { + "ce_ib": 42.3214225769043, + "ce_orig": 0.8192757964134216, + "epoch": 0.05406571284779639, + "kl_loss": 2299.024169921875, + "loss_ib": 23.032562255859375, + "step": 188 + }, + { + "ce_ib": 45.48292541503906, + "ce_orig": 0.9447529911994934, + "epoch": 0.05406571284779639, + "kl_loss": 2239.673828125, + "loss_ib": 22.44222068786621, + "step": 188 + }, + { + "ce_ib": 43.36006164550781, + "ce_orig": 0.7006217837333679, + "epoch": 0.05406571284779639, + "kl_loss": 2508.6435546875, + "loss_ib": 25.12979507446289, + "step": 188 + }, + { + "ce_ib": 42.9240837097168, + "ce_orig": 0.9430150985717773, + "epoch": 0.05435329642677403, + "kl_loss": 2421.86962890625, + "loss_ib": 24.261621475219727, + "step": 189 + }, + { + "ce_ib": 44.628814697265625, + "ce_orig": 1.0881403684616089, + "epoch": 0.05435329642677403, + "kl_loss": 2387.1103515625, + "loss_ib": 23.91573143005371, + "step": 189 + }, + { + "ce_ib": 43.131500244140625, + "ce_orig": 0.6736454963684082, + "epoch": 0.05435329642677403, + "kl_loss": 2487.064453125, + "loss_ib": 24.913776397705078, + "step": 189 + }, + { + "ce_ib": 44.661094665527344, + "ce_orig": 0.9864615797996521, + "epoch": 0.05435329642677403, + "kl_loss": 2235.8447265625, + "loss_ib": 22.403106689453125, + "step": 189 + }, + { + "epoch": 0.05464088000575167, + "grad_norm": 399.3453369140625, + "learning_rate": 5.859872611464969e-06, + "loss": 24.4398, + "step": 190 + }, + { + "ce_ib": 45.041744232177734, + "ce_orig": 0.8773884177207947, + "epoch": 0.05464088000575167, + "kl_loss": 2267.437744140625, + "loss_ib": 22.719417572021484, + "step": 190 + }, + { + "ce_ib": 46.31273651123047, + "ce_orig": 1.0835819244384766, + "epoch": 0.05464088000575167, + "kl_loss": 2374.10107421875, + "loss_ib": 23.787322998046875, + "step": 190 + }, + { + "ce_ib": 42.20440673828125, + "ce_orig": 0.9557557106018066, + "epoch": 0.05464088000575167, + "kl_loss": 2271.1640625, + "loss_ib": 22.75384521484375, + "step": 190 + }, + { + "ce_ib": 45.23324203491211, + "ce_orig": 0.8508480787277222, + "epoch": 0.05464088000575167, + "kl_loss": 2288.09375, + "loss_ib": 22.926172256469727, + "step": 190 + }, + { + "ce_ib": 44.05533218383789, + "ce_orig": 1.154534101486206, + "epoch": 0.05492846358472931, + "kl_loss": 2600.14404296875, + "loss_ib": 26.045494079589844, + "step": 191 + }, + { + "ce_ib": 42.00983810424805, + "ce_orig": 0.7044571042060852, + "epoch": 0.05492846358472931, + "kl_loss": 2365.684326171875, + "loss_ib": 23.6988525390625, + "step": 191 + }, + { + "ce_ib": 45.84080123901367, + "ce_orig": 0.8432292938232422, + "epoch": 0.05492846358472931, + "kl_loss": 2435.255859375, + "loss_ib": 24.39839744567871, + "step": 191 + }, + { + "ce_ib": 41.58427047729492, + "ce_orig": 0.5829588770866394, + "epoch": 0.05492846358472931, + "kl_loss": 2457.0322265625, + "loss_ib": 24.611906051635742, + "step": 191 + }, + { + "ce_ib": 42.24211120605469, + "ce_orig": 0.9801141023635864, + "epoch": 0.055216047163706955, + "kl_loss": 2417.925537109375, + "loss_ib": 24.22149658203125, + "step": 192 + }, + { + "ce_ib": 45.56145095825195, + "ce_orig": 1.132083535194397, + "epoch": 0.055216047163706955, + "kl_loss": 2193.166259765625, + "loss_ib": 21.977224349975586, + "step": 192 + }, + { + "ce_ib": 46.364322662353516, + "ce_orig": 1.4373035430908203, + "epoch": 0.055216047163706955, + "kl_loss": 2148.3759765625, + "loss_ib": 21.53012466430664, + "step": 192 + }, + { + "ce_ib": 43.91224670410156, + "ce_orig": 1.0104138851165771, + "epoch": 0.055216047163706955, + "kl_loss": 2297.23193359375, + "loss_ib": 23.0162296295166, + "step": 192 + }, + { + "ce_ib": 43.70963668823242, + "ce_orig": 1.4233311414718628, + "epoch": 0.05550363074268459, + "kl_loss": 2309.242919921875, + "loss_ib": 23.136137008666992, + "step": 193 + }, + { + "ce_ib": 42.54071807861328, + "ce_orig": 1.2717257738113403, + "epoch": 0.05550363074268459, + "kl_loss": 2467.593994140625, + "loss_ib": 24.718481063842773, + "step": 193 + }, + { + "ce_ib": 44.76433181762695, + "ce_orig": 0.4072558581829071, + "epoch": 0.05550363074268459, + "kl_loss": 2135.32177734375, + "loss_ib": 21.397979736328125, + "step": 193 + }, + { + "ce_ib": 43.77593231201172, + "ce_orig": 0.9473220705986023, + "epoch": 0.05550363074268459, + "kl_loss": 2351.19873046875, + "loss_ib": 23.555763244628906, + "step": 193 + }, + { + "ce_ib": 41.941593170166016, + "ce_orig": 0.8737780451774597, + "epoch": 0.05579121432166223, + "kl_loss": 2363.43896484375, + "loss_ib": 23.67633056640625, + "step": 194 + }, + { + "ce_ib": 45.53238296508789, + "ce_orig": 1.0324821472167969, + "epoch": 0.05579121432166223, + "kl_loss": 1954.951416015625, + "loss_ib": 19.59504508972168, + "step": 194 + }, + { + "ce_ib": 43.686954498291016, + "ce_orig": 1.3375800848007202, + "epoch": 0.05579121432166223, + "kl_loss": 2365.90576171875, + "loss_ib": 23.702743530273438, + "step": 194 + }, + { + "ce_ib": 43.68901443481445, + "ce_orig": 0.5833651423454285, + "epoch": 0.05579121432166223, + "kl_loss": 2179.84130859375, + "loss_ib": 21.84210205078125, + "step": 194 + }, + { + "epoch": 0.056078797900639875, + "grad_norm": 384.2991943359375, + "learning_rate": 6.019108280254777e-06, + "loss": 23.4525, + "step": 195 + }, + { + "ce_ib": 43.28430938720703, + "ce_orig": 0.8238533139228821, + "epoch": 0.056078797900639875, + "kl_loss": 2549.11474609375, + "loss_ib": 25.53443145751953, + "step": 195 + }, + { + "ce_ib": 43.542911529541016, + "ce_orig": 0.9835025072097778, + "epoch": 0.056078797900639875, + "kl_loss": 2104.084228515625, + "loss_ib": 21.08438491821289, + "step": 195 + }, + { + "ce_ib": 44.020999908447266, + "ce_orig": 0.9901052713394165, + "epoch": 0.056078797900639875, + "kl_loss": 1952.4921875, + "loss_ib": 19.56894302368164, + "step": 195 + }, + { + "ce_ib": 45.76945877075195, + "ce_orig": 0.8322806358337402, + "epoch": 0.056078797900639875, + "kl_loss": 2153.4716796875, + "loss_ib": 21.580486297607422, + "step": 195 + }, + { + "ce_ib": 43.12766647338867, + "ce_orig": 0.9949517846107483, + "epoch": 0.05636638147961751, + "kl_loss": 2381.666259765625, + "loss_ib": 23.85978889465332, + "step": 196 + }, + { + "ce_ib": 42.28154754638672, + "ce_orig": 0.9624870419502258, + "epoch": 0.05636638147961751, + "kl_loss": 2345.0966796875, + "loss_ib": 23.493249893188477, + "step": 196 + }, + { + "ce_ib": 44.944583892822266, + "ce_orig": 0.47114697098731995, + "epoch": 0.05636638147961751, + "kl_loss": 1733.882080078125, + "loss_ib": 17.383766174316406, + "step": 196 + }, + { + "ce_ib": 42.78217697143555, + "ce_orig": 0.7587113976478577, + "epoch": 0.05636638147961751, + "kl_loss": 2418.23876953125, + "loss_ib": 24.225170135498047, + "step": 196 + }, + { + "ce_ib": 40.34938049316406, + "ce_orig": 0.5144612789154053, + "epoch": 0.05665396505859516, + "kl_loss": 2105.162109375, + "loss_ib": 21.091970443725586, + "step": 197 + }, + { + "ce_ib": 43.35491943359375, + "ce_orig": 0.6633918285369873, + "epoch": 0.05665396505859516, + "kl_loss": 2375.418212890625, + "loss_ib": 23.797536849975586, + "step": 197 + }, + { + "ce_ib": 45.42133331298828, + "ce_orig": 1.425979495048523, + "epoch": 0.05665396505859516, + "kl_loss": 1852.203857421875, + "loss_ib": 18.567459106445312, + "step": 197 + }, + { + "ce_ib": 42.6270751953125, + "ce_orig": 1.091071367263794, + "epoch": 0.05665396505859516, + "kl_loss": 2149.162353515625, + "loss_ib": 21.534250259399414, + "step": 197 + }, + { + "ce_ib": 48.074180603027344, + "ce_orig": 1.6172330379486084, + "epoch": 0.056941548637572795, + "kl_loss": 2109.87890625, + "loss_ib": 21.146862030029297, + "step": 198 + }, + { + "ce_ib": 42.51495361328125, + "ce_orig": 1.009562611579895, + "epoch": 0.056941548637572795, + "kl_loss": 2003.40283203125, + "loss_ib": 20.076541900634766, + "step": 198 + }, + { + "ce_ib": 44.50498580932617, + "ce_orig": 1.2147884368896484, + "epoch": 0.056941548637572795, + "kl_loss": 2296.0703125, + "loss_ib": 23.005207061767578, + "step": 198 + }, + { + "ce_ib": 45.61008834838867, + "ce_orig": 1.532022476196289, + "epoch": 0.056941548637572795, + "kl_loss": 2186.6220703125, + "loss_ib": 21.91183090209961, + "step": 198 + }, + { + "ce_ib": 42.80625534057617, + "ce_orig": 1.1610299348831177, + "epoch": 0.05722913221655043, + "kl_loss": 2305.758056640625, + "loss_ib": 23.100385665893555, + "step": 199 + }, + { + "ce_ib": 44.845333099365234, + "ce_orig": 1.0554615259170532, + "epoch": 0.05722913221655043, + "kl_loss": 2195.99072265625, + "loss_ib": 22.00475311279297, + "step": 199 + }, + { + "ce_ib": 41.97274398803711, + "ce_orig": 0.9705357551574707, + "epoch": 0.05722913221655043, + "kl_loss": 2215.55859375, + "loss_ib": 22.19755744934082, + "step": 199 + }, + { + "ce_ib": 41.66038131713867, + "ce_orig": 0.8861182928085327, + "epoch": 0.05722913221655043, + "kl_loss": 1905.2489013671875, + "loss_ib": 19.094148635864258, + "step": 199 + }, + { + "epoch": 0.05751671579552808, + "grad_norm": 379.6163024902344, + "learning_rate": 6.178343949044586e-06, + "loss": 23.0704, + "step": 200 + }, + { + "ce_ib": 41.82258987426758, + "ce_orig": 1.130007266998291, + "epoch": 0.05751671579552808, + "kl_loss": 2195.69970703125, + "loss_ib": 21.998821258544922, + "step": 200 + }, + { + "ce_ib": 46.66122817993164, + "ce_orig": 1.4467494487762451, + "epoch": 0.05751671579552808, + "kl_loss": 2178.219970703125, + "loss_ib": 21.828859329223633, + "step": 200 + }, + { + "ce_ib": 39.04912567138672, + "ce_orig": 1.1100558042526245, + "epoch": 0.05751671579552808, + "kl_loss": 2271.130859375, + "loss_ib": 22.75035858154297, + "step": 200 + }, + { + "ce_ib": 40.80558395385742, + "ce_orig": 0.830470860004425, + "epoch": 0.05751671579552808, + "kl_loss": 2165.312255859375, + "loss_ib": 21.693927764892578, + "step": 200 + }, + { + "ce_ib": 40.64300537109375, + "ce_orig": 1.0102934837341309, + "epoch": 0.057804299374505715, + "kl_loss": 2148.4072265625, + "loss_ib": 21.52471351623535, + "step": 201 + }, + { + "ce_ib": 43.90663528442383, + "ce_orig": 1.1066926717758179, + "epoch": 0.057804299374505715, + "kl_loss": 2117.95751953125, + "loss_ib": 21.223480224609375, + "step": 201 + }, + { + "ce_ib": 45.89930725097656, + "ce_orig": 1.7181086540222168, + "epoch": 0.057804299374505715, + "kl_loss": 1860.36865234375, + "loss_ib": 18.649585723876953, + "step": 201 + }, + { + "ce_ib": 39.91169357299805, + "ce_orig": 0.3805517554283142, + "epoch": 0.057804299374505715, + "kl_loss": 1964.464599609375, + "loss_ib": 19.68455696105957, + "step": 201 + }, + { + "ce_ib": 39.736671447753906, + "ce_orig": 0.5894677639007568, + "epoch": 0.05809188295348336, + "kl_loss": 2246.01611328125, + "loss_ib": 22.499897003173828, + "step": 202 + }, + { + "ce_ib": 40.524208068847656, + "ce_orig": 0.9540011882781982, + "epoch": 0.05809188295348336, + "kl_loss": 1354.6728515625, + "loss_ib": 13.587251663208008, + "step": 202 + }, + { + "ce_ib": 43.64582061767578, + "ce_orig": 1.034263253211975, + "epoch": 0.05809188295348336, + "kl_loss": 2062.5126953125, + "loss_ib": 20.668771743774414, + "step": 202 + }, + { + "ce_ib": 41.73001480102539, + "ce_orig": 0.8725116848945618, + "epoch": 0.05809188295348336, + "kl_loss": 2051.177734375, + "loss_ib": 20.553508758544922, + "step": 202 + }, + { + "ce_ib": 44.100730895996094, + "ce_orig": 1.4435735940933228, + "epoch": 0.058379466532461, + "kl_loss": 1786.4874267578125, + "loss_ib": 17.908973693847656, + "step": 203 + }, + { + "ce_ib": 39.613006591796875, + "ce_orig": 0.7148452401161194, + "epoch": 0.058379466532461, + "kl_loss": 2156.13232421875, + "loss_ib": 21.600934982299805, + "step": 203 + }, + { + "ce_ib": 40.6436653137207, + "ce_orig": 1.2090833187103271, + "epoch": 0.058379466532461, + "kl_loss": 2073.139404296875, + "loss_ib": 20.772037506103516, + "step": 203 + }, + { + "ce_ib": 38.75384521484375, + "ce_orig": 0.2740119993686676, + "epoch": 0.058379466532461, + "kl_loss": 1232.078125, + "loss_ib": 12.35953426361084, + "step": 203 + }, + { + "ce_ib": 41.146873474121094, + "ce_orig": 0.719344973564148, + "epoch": 0.058667050111438634, + "kl_loss": 1714.5777587890625, + "loss_ib": 17.18692398071289, + "step": 204 + }, + { + "ce_ib": 44.12678527832031, + "ce_orig": 1.2889289855957031, + "epoch": 0.058667050111438634, + "kl_loss": 1714.3292236328125, + "loss_ib": 17.187419891357422, + "step": 204 + }, + { + "ce_ib": 44.051700592041016, + "ce_orig": 1.077775478363037, + "epoch": 0.058667050111438634, + "kl_loss": 2116.22900390625, + "loss_ib": 21.206342697143555, + "step": 204 + }, + { + "ce_ib": 38.1263427734375, + "ce_orig": 0.6518339514732361, + "epoch": 0.058667050111438634, + "kl_loss": 2124.76513671875, + "loss_ib": 21.285778045654297, + "step": 204 + }, + { + "epoch": 0.05895463369041628, + "grad_norm": 334.65350341796875, + "learning_rate": 6.337579617834395e-06, + "loss": 21.643, + "step": 205 + }, + { + "ce_ib": 40.87744140625, + "ce_orig": 1.0821335315704346, + "epoch": 0.05895463369041628, + "kl_loss": 2118.4775390625, + "loss_ib": 21.225650787353516, + "step": 205 + }, + { + "ce_ib": 39.00108337402344, + "ce_orig": 0.5186193585395813, + "epoch": 0.05895463369041628, + "kl_loss": 2132.87939453125, + "loss_ib": 21.367794036865234, + "step": 205 + }, + { + "ce_ib": 42.660888671875, + "ce_orig": 1.201238989830017, + "epoch": 0.05895463369041628, + "kl_loss": 2008.679443359375, + "loss_ib": 20.12945556640625, + "step": 205 + }, + { + "ce_ib": 42.61842346191406, + "ce_orig": 0.9650039076805115, + "epoch": 0.05895463369041628, + "kl_loss": 2077.22314453125, + "loss_ib": 20.814849853515625, + "step": 205 + }, + { + "ce_ib": 38.07194137573242, + "ce_orig": 0.9797282814979553, + "epoch": 0.05924221726939392, + "kl_loss": 2239.80859375, + "loss_ib": 22.4361572265625, + "step": 206 + }, + { + "ce_ib": 42.637840270996094, + "ce_orig": 1.2511439323425293, + "epoch": 0.05924221726939392, + "kl_loss": 2223.17578125, + "loss_ib": 22.27439308166504, + "step": 206 + }, + { + "ce_ib": 40.19730758666992, + "ce_orig": 1.2803971767425537, + "epoch": 0.05924221726939392, + "kl_loss": 1948.2891845703125, + "loss_ib": 19.523088455200195, + "step": 206 + }, + { + "ce_ib": 40.72916030883789, + "ce_orig": 0.9539033770561218, + "epoch": 0.05924221726939392, + "kl_loss": 1906.385498046875, + "loss_ib": 19.104583740234375, + "step": 206 + }, + { + "ce_ib": 41.54541778564453, + "ce_orig": 0.9414616823196411, + "epoch": 0.05952980084837156, + "kl_loss": 1876.511474609375, + "loss_ib": 18.806659698486328, + "step": 207 + }, + { + "ce_ib": 41.48387908935547, + "ce_orig": 0.8939663171768188, + "epoch": 0.05952980084837156, + "kl_loss": 1365.2000732421875, + "loss_ib": 13.693485260009766, + "step": 207 + }, + { + "ce_ib": 39.200660705566406, + "ce_orig": 0.8465067148208618, + "epoch": 0.05952980084837156, + "kl_loss": 1455.4010009765625, + "loss_ib": 14.593210220336914, + "step": 207 + }, + { + "ce_ib": 44.42674255371094, + "ce_orig": 1.9264086484909058, + "epoch": 0.05952980084837156, + "kl_loss": 2006.107421875, + "loss_ib": 20.105499267578125, + "step": 207 + }, + { + "ce_ib": 41.2900505065918, + "ce_orig": 1.6340312957763672, + "epoch": 0.0598173844273492, + "kl_loss": 1908.3016357421875, + "loss_ib": 19.124305725097656, + "step": 208 + }, + { + "ce_ib": 41.597694396972656, + "ce_orig": 0.9764799475669861, + "epoch": 0.0598173844273492, + "kl_loss": 1880.7303466796875, + "loss_ib": 18.848901748657227, + "step": 208 + }, + { + "ce_ib": 42.240516662597656, + "ce_orig": 0.4290001690387726, + "epoch": 0.0598173844273492, + "kl_loss": 1004.71728515625, + "loss_ib": 10.0894136428833, + "step": 208 + }, + { + "ce_ib": 42.21794128417969, + "ce_orig": 1.0113506317138672, + "epoch": 0.0598173844273492, + "kl_loss": 1826.12890625, + "loss_ib": 18.30350685119629, + "step": 208 + }, + { + "ce_ib": 41.9946403503418, + "ce_orig": 1.175087332725525, + "epoch": 0.06010496800632684, + "kl_loss": 1176.065185546875, + "loss_ib": 11.80264663696289, + "step": 209 + }, + { + "ce_ib": 41.64639663696289, + "ce_orig": 1.211875081062317, + "epoch": 0.06010496800632684, + "kl_loss": 2071.13623046875, + "loss_ib": 20.753007888793945, + "step": 209 + }, + { + "ce_ib": 40.602378845214844, + "ce_orig": 1.2118444442749023, + "epoch": 0.06010496800632684, + "kl_loss": 2032.2001953125, + "loss_ib": 20.36260414123535, + "step": 209 + }, + { + "ce_ib": 39.86399459838867, + "ce_orig": 1.1752040386199951, + "epoch": 0.06010496800632684, + "kl_loss": 2107.810546875, + "loss_ib": 21.11796760559082, + "step": 209 + }, + { + "epoch": 0.06039255158530448, + "grad_norm": 353.71893310546875, + "learning_rate": 6.496815286624204e-06, + "loss": 20.5732, + "step": 210 + }, + { + "ce_ib": 42.62069320678711, + "ce_orig": 1.424117922782898, + "epoch": 0.06039255158530448, + "kl_loss": 1516.96630859375, + "loss_ib": 15.21228313446045, + "step": 210 + }, + { + "ce_ib": 39.955963134765625, + "ce_orig": 0.44305068254470825, + "epoch": 0.06039255158530448, + "kl_loss": 1933.53759765625, + "loss_ib": 19.37533187866211, + "step": 210 + }, + { + "ce_ib": 40.190860748291016, + "ce_orig": 0.597926914691925, + "epoch": 0.06039255158530448, + "kl_loss": 2069.74560546875, + "loss_ib": 20.737646102905273, + "step": 210 + }, + { + "ce_ib": 39.46810531616211, + "ce_orig": 0.713378369808197, + "epoch": 0.06039255158530448, + "kl_loss": 1792.88720703125, + "loss_ib": 17.968339920043945, + "step": 210 + }, + { + "ce_ib": 42.34745788574219, + "ce_orig": 1.0557239055633545, + "epoch": 0.06068013516428212, + "kl_loss": 1849.0069580078125, + "loss_ib": 18.53241729736328, + "step": 211 + }, + { + "ce_ib": 38.35053634643555, + "ce_orig": 0.603425145149231, + "epoch": 0.06068013516428212, + "kl_loss": 1920.453857421875, + "loss_ib": 19.242889404296875, + "step": 211 + }, + { + "ce_ib": 37.083927154541016, + "ce_orig": 1.2688902616500854, + "epoch": 0.06068013516428212, + "kl_loss": 2041.569580078125, + "loss_ib": 20.45277976989746, + "step": 211 + }, + { + "ce_ib": 38.7510871887207, + "ce_orig": 0.6306071877479553, + "epoch": 0.06068013516428212, + "kl_loss": 1976.189453125, + "loss_ib": 19.80064582824707, + "step": 211 + }, + { + "ce_ib": 36.3626594543457, + "ce_orig": 0.4798745810985565, + "epoch": 0.060967718743259756, + "kl_loss": 1731.49951171875, + "loss_ib": 17.351356506347656, + "step": 212 + }, + { + "ce_ib": 39.894752502441406, + "ce_orig": 0.991927444934845, + "epoch": 0.060967718743259756, + "kl_loss": 2037.4395751953125, + "loss_ib": 20.414289474487305, + "step": 212 + }, + { + "ce_ib": 41.493896484375, + "ce_orig": 1.472151756286621, + "epoch": 0.060967718743259756, + "kl_loss": 1865.9774169921875, + "loss_ib": 18.70126724243164, + "step": 212 + }, + { + "ce_ib": 42.53767013549805, + "ce_orig": 1.1901134252548218, + "epoch": 0.060967718743259756, + "kl_loss": 1645.129638671875, + "loss_ib": 16.493833541870117, + "step": 212 + }, + { + "ce_ib": 41.67341232299805, + "ce_orig": 2.049192428588867, + "epoch": 0.0612553023222374, + "kl_loss": 1779.3302001953125, + "loss_ib": 17.83497428894043, + "step": 213 + }, + { + "ce_ib": 42.051273345947266, + "ce_orig": 1.3447684049606323, + "epoch": 0.0612553023222374, + "kl_loss": 1777.956787109375, + "loss_ib": 17.821619033813477, + "step": 213 + }, + { + "ce_ib": 37.9370002746582, + "ce_orig": 0.4780416190624237, + "epoch": 0.0612553023222374, + "kl_loss": 1839.32177734375, + "loss_ib": 18.431154251098633, + "step": 213 + }, + { + "ce_ib": 39.32098388671875, + "ce_orig": 1.0718315839767456, + "epoch": 0.0612553023222374, + "kl_loss": 2034.295166015625, + "loss_ib": 20.382272720336914, + "step": 213 + }, + { + "ce_ib": 36.60554885864258, + "ce_orig": 0.5966328978538513, + "epoch": 0.06154288590121504, + "kl_loss": 1988.932861328125, + "loss_ib": 19.925933837890625, + "step": 214 + }, + { + "ce_ib": 41.9276123046875, + "ce_orig": 0.6651936173439026, + "epoch": 0.06154288590121504, + "kl_loss": 1789.4920654296875, + "loss_ib": 17.936847686767578, + "step": 214 + }, + { + "ce_ib": 40.556461334228516, + "ce_orig": 0.4927489459514618, + "epoch": 0.06154288590121504, + "kl_loss": 1789.6015625, + "loss_ib": 17.93657112121582, + "step": 214 + }, + { + "ce_ib": 39.251651763916016, + "ce_orig": 1.0295277833938599, + "epoch": 0.06154288590121504, + "kl_loss": 1894.323486328125, + "loss_ib": 18.982486724853516, + "step": 214 + }, + { + "epoch": 0.06183046948019268, + "grad_norm": 317.97894287109375, + "learning_rate": 6.6560509554140125e-06, + "loss": 19.6963, + "step": 215 + }, + { + "ce_ib": 38.56227111816406, + "ce_orig": 0.6546093821525574, + "epoch": 0.06183046948019268, + "kl_loss": 1737.800048828125, + "loss_ib": 17.416563034057617, + "step": 215 + }, + { + "ce_ib": 35.2888069152832, + "ce_orig": 0.7404365539550781, + "epoch": 0.06183046948019268, + "kl_loss": 1965.703857421875, + "loss_ib": 19.69232749938965, + "step": 215 + }, + { + "ce_ib": 40.4928092956543, + "ce_orig": 1.043062686920166, + "epoch": 0.06183046948019268, + "kl_loss": 1873.6024169921875, + "loss_ib": 18.77651596069336, + "step": 215 + }, + { + "ce_ib": 39.602508544921875, + "ce_orig": 1.0876483917236328, + "epoch": 0.06183046948019268, + "kl_loss": 1824.4144287109375, + "loss_ib": 18.28374671936035, + "step": 215 + }, + { + "ce_ib": 35.879852294921875, + "ce_orig": 0.8248341679573059, + "epoch": 0.06211805305917032, + "kl_loss": 1824.806640625, + "loss_ib": 18.283946990966797, + "step": 216 + }, + { + "ce_ib": 35.09049606323242, + "ce_orig": 0.5326448082923889, + "epoch": 0.06211805305917032, + "kl_loss": 1892.9739990234375, + "loss_ib": 18.96483039855957, + "step": 216 + }, + { + "ce_ib": 40.311012268066406, + "ce_orig": 1.6227895021438599, + "epoch": 0.06211805305917032, + "kl_loss": 1743.340087890625, + "loss_ib": 17.473711013793945, + "step": 216 + }, + { + "ce_ib": 36.35209274291992, + "ce_orig": 0.7696553468704224, + "epoch": 0.06211805305917032, + "kl_loss": 1989.461181640625, + "loss_ib": 19.93096351623535, + "step": 216 + }, + { + "ce_ib": 36.76679611206055, + "ce_orig": 0.7665999531745911, + "epoch": 0.06240563663814796, + "kl_loss": 1747.884033203125, + "loss_ib": 17.515605926513672, + "step": 217 + }, + { + "ce_ib": 38.807064056396484, + "ce_orig": 0.9963610172271729, + "epoch": 0.06240563663814796, + "kl_loss": 1780.5374755859375, + "loss_ib": 17.844181060791016, + "step": 217 + }, + { + "ce_ib": 39.64936828613281, + "ce_orig": 0.7059118151664734, + "epoch": 0.06240563663814796, + "kl_loss": 1668.8529052734375, + "loss_ib": 16.728178024291992, + "step": 217 + }, + { + "ce_ib": 37.85905838012695, + "ce_orig": 1.064191460609436, + "epoch": 0.06240563663814796, + "kl_loss": 1903.99462890625, + "loss_ib": 19.077804565429688, + "step": 217 + }, + { + "ce_ib": 36.819175720214844, + "ce_orig": 0.5811072587966919, + "epoch": 0.0626932202171256, + "kl_loss": 1824.97802734375, + "loss_ib": 18.286598205566406, + "step": 218 + }, + { + "ce_ib": 37.54027557373047, + "ce_orig": 0.9560117125511169, + "epoch": 0.0626932202171256, + "kl_loss": 1839.77392578125, + "loss_ib": 18.435279846191406, + "step": 218 + }, + { + "ce_ib": 39.93457794189453, + "ce_orig": 1.0178155899047852, + "epoch": 0.0626932202171256, + "kl_loss": 1711.14697265625, + "loss_ib": 17.151403427124023, + "step": 218 + }, + { + "ce_ib": 40.65945816040039, + "ce_orig": 0.742775022983551, + "epoch": 0.0626932202171256, + "kl_loss": 1604.2640380859375, + "loss_ib": 16.08329963684082, + "step": 218 + }, + { + "ce_ib": 41.853858947753906, + "ce_orig": 1.375777244567871, + "epoch": 0.06298080379610324, + "kl_loss": 1801.32958984375, + "loss_ib": 18.05514907836914, + "step": 219 + }, + { + "ce_ib": 36.28791046142578, + "ce_orig": 0.9161471724510193, + "epoch": 0.06298080379610324, + "kl_loss": 1644.6923828125, + "loss_ib": 16.483211517333984, + "step": 219 + }, + { + "ce_ib": 37.08815383911133, + "ce_orig": 0.728233814239502, + "epoch": 0.06298080379610324, + "kl_loss": 1778.4482421875, + "loss_ib": 17.821571350097656, + "step": 219 + }, + { + "ce_ib": 37.880104064941406, + "ce_orig": 1.3340355157852173, + "epoch": 0.06298080379610324, + "kl_loss": 1593.228759765625, + "loss_ib": 15.97016716003418, + "step": 219 + }, + { + "epoch": 0.06326838737508088, + "grad_norm": 318.8504943847656, + "learning_rate": 6.815286624203822e-06, + "loss": 18.6164, + "step": 220 + }, + { + "ce_ib": 39.489933013916016, + "ce_orig": 1.1903218030929565, + "epoch": 0.06326838737508088, + "kl_loss": 1687.38232421875, + "loss_ib": 16.913312911987305, + "step": 220 + }, + { + "ce_ib": 37.481624603271484, + "ce_orig": 0.7840384244918823, + "epoch": 0.06326838737508088, + "kl_loss": 1726.1568603515625, + "loss_ib": 17.299049377441406, + "step": 220 + }, + { + "ce_ib": 35.45810317993164, + "ce_orig": 0.7813513278961182, + "epoch": 0.06326838737508088, + "kl_loss": 1733.7164306640625, + "loss_ib": 17.372621536254883, + "step": 220 + }, + { + "ce_ib": 40.4595832824707, + "ce_orig": 1.3600130081176758, + "epoch": 0.06326838737508088, + "kl_loss": 1568.1533203125, + "loss_ib": 15.721991539001465, + "step": 220 + }, + { + "ce_ib": 35.940345764160156, + "ce_orig": 1.0896117687225342, + "epoch": 0.06355597095405853, + "kl_loss": 1728.0986328125, + "loss_ib": 17.316925048828125, + "step": 221 + }, + { + "ce_ib": 35.44572830200195, + "ce_orig": 0.2852933406829834, + "epoch": 0.06355597095405853, + "kl_loss": 963.3050537109375, + "loss_ib": 9.668496131896973, + "step": 221 + }, + { + "ce_ib": 33.80705261230469, + "ce_orig": 0.791994571685791, + "epoch": 0.06355597095405853, + "kl_loss": 1822.135009765625, + "loss_ib": 18.255157470703125, + "step": 221 + }, + { + "ce_ib": 36.91697692871094, + "ce_orig": 0.6960796117782593, + "epoch": 0.06355597095405853, + "kl_loss": 1718.594970703125, + "loss_ib": 17.22286605834961, + "step": 221 + }, + { + "ce_ib": 39.449485778808594, + "ce_orig": 0.9346453547477722, + "epoch": 0.06384355453303617, + "kl_loss": 1414.82958984375, + "loss_ib": 14.187745094299316, + "step": 222 + }, + { + "ce_ib": 38.30500793457031, + "ce_orig": 0.8375841975212097, + "epoch": 0.06384355453303617, + "kl_loss": 1562.5009765625, + "loss_ib": 15.663313865661621, + "step": 222 + }, + { + "ce_ib": 41.28805923461914, + "ce_orig": 0.7960017919540405, + "epoch": 0.06384355453303617, + "kl_loss": 1512.22802734375, + "loss_ib": 15.163567543029785, + "step": 222 + }, + { + "ce_ib": 32.99767303466797, + "ce_orig": 0.6580086946487427, + "epoch": 0.06384355453303617, + "kl_loss": 1726.3076171875, + "loss_ib": 17.29607391357422, + "step": 222 + }, + { + "ce_ib": 33.17844009399414, + "ce_orig": 0.7080318927764893, + "epoch": 0.0641311381120138, + "kl_loss": 1755.3681640625, + "loss_ib": 17.58686065673828, + "step": 223 + }, + { + "ce_ib": 37.70219421386719, + "ce_orig": 0.6540882587432861, + "epoch": 0.0641311381120138, + "kl_loss": 1503.2210693359375, + "loss_ib": 15.069912910461426, + "step": 223 + }, + { + "ce_ib": 40.99760818481445, + "ce_orig": 0.7546887993812561, + "epoch": 0.0641311381120138, + "kl_loss": 1461.0548095703125, + "loss_ib": 14.651545524597168, + "step": 223 + }, + { + "ce_ib": 36.96660614013672, + "ce_orig": 0.6669592261314392, + "epoch": 0.0641311381120138, + "kl_loss": 1589.726318359375, + "loss_ib": 15.934229850769043, + "step": 223 + }, + { + "ce_ib": 41.123348236083984, + "ce_orig": 1.2109692096710205, + "epoch": 0.06441872169099144, + "kl_loss": 1519.9962158203125, + "loss_ib": 15.241085052490234, + "step": 224 + }, + { + "ce_ib": 36.78544998168945, + "ce_orig": 0.9228610992431641, + "epoch": 0.06441872169099144, + "kl_loss": 1535.970703125, + "loss_ib": 15.396492958068848, + "step": 224 + }, + { + "ce_ib": 36.03901672363281, + "ce_orig": 1.0696367025375366, + "epoch": 0.06441872169099144, + "kl_loss": 1711.971923828125, + "loss_ib": 17.155757904052734, + "step": 224 + }, + { + "ce_ib": 38.314327239990234, + "ce_orig": 1.826366662979126, + "epoch": 0.06441872169099144, + "kl_loss": 1629.136962890625, + "loss_ib": 16.329683303833008, + "step": 224 + }, + { + "epoch": 0.06470630526996908, + "grad_norm": 308.50250244140625, + "learning_rate": 6.9745222929936305e-06, + "loss": 16.8856, + "step": 225 + }, + { + "ce_ib": 36.43165969848633, + "ce_orig": 0.878455638885498, + "epoch": 0.06470630526996908, + "kl_loss": 1289.9693603515625, + "loss_ib": 12.936124801635742, + "step": 225 + }, + { + "ce_ib": 40.33540344238281, + "ce_orig": 1.3886396884918213, + "epoch": 0.06470630526996908, + "kl_loss": 1656.87060546875, + "loss_ib": 16.609041213989258, + "step": 225 + }, + { + "ce_ib": 36.43739700317383, + "ce_orig": 0.4045904874801636, + "epoch": 0.06470630526996908, + "kl_loss": 1671.9920654296875, + "loss_ib": 16.756359100341797, + "step": 225 + }, + { + "ce_ib": 36.36072540283203, + "ce_orig": 1.0076552629470825, + "epoch": 0.06470630526996908, + "kl_loss": 1584.193115234375, + "loss_ib": 15.878292083740234, + "step": 225 + }, + { + "ce_ib": 38.3975830078125, + "ce_orig": 1.267250895500183, + "epoch": 0.06499388884894673, + "kl_loss": 1569.5213623046875, + "loss_ib": 15.733610153198242, + "step": 226 + }, + { + "ce_ib": 38.38508224487305, + "ce_orig": 1.265257477760315, + "epoch": 0.06499388884894673, + "kl_loss": 1453.87646484375, + "loss_ib": 14.577149391174316, + "step": 226 + }, + { + "ce_ib": 39.651161193847656, + "ce_orig": 0.6877183318138123, + "epoch": 0.06499388884894673, + "kl_loss": 1393.7093505859375, + "loss_ib": 13.976743698120117, + "step": 226 + }, + { + "ce_ib": 36.53251647949219, + "ce_orig": 1.5096549987792969, + "epoch": 0.06499388884894673, + "kl_loss": 1307.943115234375, + "loss_ib": 13.115962982177734, + "step": 226 + }, + { + "ce_ib": 37.703006744384766, + "ce_orig": 0.9586830139160156, + "epoch": 0.06528147242792437, + "kl_loss": 1452.981201171875, + "loss_ib": 14.567514419555664, + "step": 227 + }, + { + "ce_ib": 34.66203308105469, + "ce_orig": 0.658699095249176, + "epoch": 0.06528147242792437, + "kl_loss": 1551.1995849609375, + "loss_ib": 15.54665756225586, + "step": 227 + }, + { + "ce_ib": 35.329044342041016, + "ce_orig": 0.6904061436653137, + "epoch": 0.06528147242792437, + "kl_loss": 1509.483154296875, + "loss_ib": 15.130160331726074, + "step": 227 + }, + { + "ce_ib": 35.24424743652344, + "ce_orig": 0.5379785895347595, + "epoch": 0.06528147242792437, + "kl_loss": 1509.2982177734375, + "loss_ib": 15.128226280212402, + "step": 227 + }, + { + "ce_ib": 35.123985290527344, + "ce_orig": 0.7466920614242554, + "epoch": 0.06556905600690201, + "kl_loss": 1561.8984375, + "loss_ib": 15.654109001159668, + "step": 228 + }, + { + "ce_ib": 34.78830337524414, + "ce_orig": 0.7827273607254028, + "epoch": 0.06556905600690201, + "kl_loss": 1582.02099609375, + "loss_ib": 15.854998588562012, + "step": 228 + }, + { + "ce_ib": 31.581981658935547, + "ce_orig": 0.2600187063217163, + "epoch": 0.06556905600690201, + "kl_loss": 1321.60205078125, + "loss_ib": 13.247602462768555, + "step": 228 + }, + { + "ce_ib": 34.20478820800781, + "ce_orig": 1.0527675151824951, + "epoch": 0.06556905600690201, + "kl_loss": 1532.9888916015625, + "loss_ib": 15.364093780517578, + "step": 228 + }, + { + "ce_ib": 39.04445266723633, + "ce_orig": 1.652494192123413, + "epoch": 0.06585663958587964, + "kl_loss": 1284.142578125, + "loss_ib": 12.88046932220459, + "step": 229 + }, + { + "ce_ib": 37.834381103515625, + "ce_orig": 1.3008118867874146, + "epoch": 0.06585663958587964, + "kl_loss": 1602.8289794921875, + "loss_ib": 16.066123962402344, + "step": 229 + }, + { + "ce_ib": 34.8093147277832, + "ce_orig": 0.9119290113449097, + "epoch": 0.06585663958587964, + "kl_loss": 1195.5177001953125, + "loss_ib": 11.989986419677734, + "step": 229 + }, + { + "ce_ib": 37.39421081542969, + "ce_orig": 1.299423336982727, + "epoch": 0.06585663958587964, + "kl_loss": 1378.4208984375, + "loss_ib": 13.821602821350098, + "step": 229 + }, + { + "epoch": 0.06614422316485728, + "grad_norm": 279.78515625, + "learning_rate": 7.13375796178344e-06, + "loss": 15.8318, + "step": 230 + }, + { + "ce_ib": 39.00707244873047, + "ce_orig": 1.9128481149673462, + "epoch": 0.06614422316485728, + "kl_loss": 1414.39697265625, + "loss_ib": 14.182976722717285, + "step": 230 + }, + { + "ce_ib": 36.59072494506836, + "ce_orig": 1.3344916105270386, + "epoch": 0.06614422316485728, + "kl_loss": 1368.765869140625, + "loss_ib": 13.724248886108398, + "step": 230 + }, + { + "ce_ib": 36.91270065307617, + "ce_orig": 1.1911953687667847, + "epoch": 0.06614422316485728, + "kl_loss": 1506.248291015625, + "loss_ib": 15.099395751953125, + "step": 230 + }, + { + "ce_ib": 35.45751953125, + "ce_orig": 0.9839146733283997, + "epoch": 0.06614422316485728, + "kl_loss": 1557.294189453125, + "loss_ib": 15.608399391174316, + "step": 230 + }, + { + "ce_ib": 40.05620193481445, + "ce_orig": 0.5404795408248901, + "epoch": 0.06643180674383492, + "kl_loss": 1331.465576171875, + "loss_ib": 13.354711532592773, + "step": 231 + }, + { + "ce_ib": 35.91750717163086, + "ce_orig": 1.0286931991577148, + "epoch": 0.06643180674383492, + "kl_loss": 1538.906982421875, + "loss_ib": 15.42498779296875, + "step": 231 + }, + { + "ce_ib": 37.91292953491211, + "ce_orig": 0.6502935886383057, + "epoch": 0.06643180674383492, + "kl_loss": 1359.6444091796875, + "loss_ib": 13.634356498718262, + "step": 231 + }, + { + "ce_ib": 36.25615310668945, + "ce_orig": 1.0073096752166748, + "epoch": 0.06643180674383492, + "kl_loss": 1373.6658935546875, + "loss_ib": 13.77291488647461, + "step": 231 + }, + { + "ce_ib": 33.69683837890625, + "ce_orig": 0.46744322776794434, + "epoch": 0.06671939032281257, + "kl_loss": 1326.230712890625, + "loss_ib": 13.296003341674805, + "step": 232 + }, + { + "ce_ib": 33.371883392333984, + "ce_orig": 1.2536524534225464, + "epoch": 0.06671939032281257, + "kl_loss": 1449.79150390625, + "loss_ib": 14.531286239624023, + "step": 232 + }, + { + "ce_ib": 35.073936462402344, + "ce_orig": 1.0432262420654297, + "epoch": 0.06671939032281257, + "kl_loss": 1357.572021484375, + "loss_ib": 13.610794067382812, + "step": 232 + }, + { + "ce_ib": 35.290687561035156, + "ce_orig": 0.8194282650947571, + "epoch": 0.06671939032281257, + "kl_loss": 1452.7779541015625, + "loss_ib": 14.563069343566895, + "step": 232 + }, + { + "ce_ib": 39.44172668457031, + "ce_orig": 1.5786598920822144, + "epoch": 0.06700697390179021, + "kl_loss": 1254.966796875, + "loss_ib": 12.589109420776367, + "step": 233 + }, + { + "ce_ib": 34.740867614746094, + "ce_orig": 0.563507616519928, + "epoch": 0.06700697390179021, + "kl_loss": 1330.1361083984375, + "loss_ib": 13.336101531982422, + "step": 233 + }, + { + "ce_ib": 36.04484176635742, + "ce_orig": 1.1875897645950317, + "epoch": 0.06700697390179021, + "kl_loss": 1417.7935791015625, + "loss_ib": 14.213980674743652, + "step": 233 + }, + { + "ce_ib": 31.511131286621094, + "ce_orig": 0.6714182496070862, + "epoch": 0.06700697390179021, + "kl_loss": 1382.14794921875, + "loss_ib": 13.85299015045166, + "step": 233 + }, + { + "ce_ib": 33.84688186645508, + "ce_orig": 1.096521258354187, + "epoch": 0.06729455748076785, + "kl_loss": 1357.806884765625, + "loss_ib": 13.61191463470459, + "step": 234 + }, + { + "ce_ib": 34.99058532714844, + "ce_orig": 1.0461159944534302, + "epoch": 0.06729455748076785, + "kl_loss": 1225.052734375, + "loss_ib": 12.285517692565918, + "step": 234 + }, + { + "ce_ib": 34.9071044921875, + "ce_orig": 0.9976585507392883, + "epoch": 0.06729455748076785, + "kl_loss": 1408.259521484375, + "loss_ib": 14.117502212524414, + "step": 234 + }, + { + "ce_ib": 37.175872802734375, + "ce_orig": 1.5398781299591064, + "epoch": 0.06729455748076785, + "kl_loss": 1296.612548828125, + "loss_ib": 13.003300666809082, + "step": 234 + }, + { + "epoch": 0.06758214105974548, + "grad_norm": 268.0568542480469, + "learning_rate": 7.2929936305732485e-06, + "loss": 14.6834, + "step": 235 + }, + { + "ce_ib": 35.610557556152344, + "ce_orig": 0.7642791867256165, + "epoch": 0.06758214105974548, + "kl_loss": 1381.1488037109375, + "loss_ib": 13.847098350524902, + "step": 235 + }, + { + "ce_ib": 38.893550872802734, + "ce_orig": 1.8394078016281128, + "epoch": 0.06758214105974548, + "kl_loss": 1152.2271728515625, + "loss_ib": 11.561165809631348, + "step": 235 + }, + { + "ce_ib": 32.011322021484375, + "ce_orig": 0.9249970316886902, + "epoch": 0.06758214105974548, + "kl_loss": 1395.470458984375, + "loss_ib": 13.986716270446777, + "step": 235 + }, + { + "ce_ib": 35.36570739746094, + "ce_orig": 1.026782751083374, + "epoch": 0.06758214105974548, + "kl_loss": 1339.9072265625, + "loss_ib": 13.43443775177002, + "step": 235 + }, + { + "ce_ib": 33.16312789916992, + "ce_orig": 1.0328998565673828, + "epoch": 0.06786972463872312, + "kl_loss": 1300.6361083984375, + "loss_ib": 13.03952407836914, + "step": 236 + }, + { + "ce_ib": 35.08463668823242, + "ce_orig": 1.3721755743026733, + "epoch": 0.06786972463872312, + "kl_loss": 1194.1552734375, + "loss_ib": 11.97663688659668, + "step": 236 + }, + { + "ce_ib": 31.49561882019043, + "ce_orig": 0.3084181249141693, + "epoch": 0.06786972463872312, + "kl_loss": 934.6522216796875, + "loss_ib": 9.37801742553711, + "step": 236 + }, + { + "ce_ib": 39.625789642333984, + "ce_orig": 1.2989716529846191, + "epoch": 0.06786972463872312, + "kl_loss": 1081.615234375, + "loss_ib": 10.855777740478516, + "step": 236 + }, + { + "ce_ib": 33.67836380004883, + "ce_orig": 1.3483405113220215, + "epoch": 0.06815730821770077, + "kl_loss": 1198.904541015625, + "loss_ib": 12.022723197937012, + "step": 237 + }, + { + "ce_ib": 35.59366989135742, + "ce_orig": 0.9075685143470764, + "epoch": 0.06815730821770077, + "kl_loss": 1204.507568359375, + "loss_ib": 12.080668449401855, + "step": 237 + }, + { + "ce_ib": 36.645938873291016, + "ce_orig": 0.6160690188407898, + "epoch": 0.06815730821770077, + "kl_loss": 1352.1148681640625, + "loss_ib": 13.557793617248535, + "step": 237 + }, + { + "ce_ib": 34.851688385009766, + "ce_orig": 0.7488659024238586, + "epoch": 0.06815730821770077, + "kl_loss": 1261.7066650390625, + "loss_ib": 12.651918411254883, + "step": 237 + }, + { + "ce_ib": 31.677663803100586, + "ce_orig": 0.6202912330627441, + "epoch": 0.06844489179667841, + "kl_loss": 1198.86669921875, + "loss_ib": 12.020343780517578, + "step": 238 + }, + { + "ce_ib": 33.36151885986328, + "ce_orig": 0.7369568347930908, + "epoch": 0.06844489179667841, + "kl_loss": 1171.602294921875, + "loss_ib": 11.749384880065918, + "step": 238 + }, + { + "ce_ib": 37.02521896362305, + "ce_orig": 0.6275981664657593, + "epoch": 0.06844489179667841, + "kl_loss": 1234.6282958984375, + "loss_ib": 12.383307456970215, + "step": 238 + }, + { + "ce_ib": 33.56972885131836, + "ce_orig": 0.8399911522865295, + "epoch": 0.06844489179667841, + "kl_loss": 1177.68603515625, + "loss_ib": 11.810429573059082, + "step": 238 + }, + { + "ce_ib": 36.48527526855469, + "ce_orig": 1.2248564958572388, + "epoch": 0.06873247537565605, + "kl_loss": 1168.8984375, + "loss_ib": 11.725469589233398, + "step": 239 + }, + { + "ce_ib": 32.57621765136719, + "ce_orig": 0.8083109259605408, + "epoch": 0.06873247537565605, + "kl_loss": 1238.533935546875, + "loss_ib": 12.417914390563965, + "step": 239 + }, + { + "ce_ib": 36.354488372802734, + "ce_orig": 1.729040503501892, + "epoch": 0.06873247537565605, + "kl_loss": 1160.8841552734375, + "loss_ib": 11.645195960998535, + "step": 239 + }, + { + "ce_ib": 33.25252151489258, + "ce_orig": 0.8631963729858398, + "epoch": 0.06873247537565605, + "kl_loss": 1161.73974609375, + "loss_ib": 11.650649070739746, + "step": 239 + }, + { + "epoch": 0.06902005895463369, + "grad_norm": 242.57241821289062, + "learning_rate": 7.452229299363057e-06, + "loss": 13.2382, + "step": 240 + }, + { + "ce_ib": 32.97649002075195, + "ce_orig": 0.5754613876342773, + "epoch": 0.06902005895463369, + "kl_loss": 1180.76708984375, + "loss_ib": 11.840646743774414, + "step": 240 + }, + { + "ce_ib": 35.5557861328125, + "ce_orig": 1.3153690099716187, + "epoch": 0.06902005895463369, + "kl_loss": 1166.3897705078125, + "loss_ib": 11.69945240020752, + "step": 240 + }, + { + "ce_ib": 30.03131675720215, + "ce_orig": 0.5445340275764465, + "epoch": 0.06902005895463369, + "kl_loss": 1206.5845947265625, + "loss_ib": 12.095877647399902, + "step": 240 + }, + { + "ce_ib": 34.68654251098633, + "ce_orig": 1.0324212312698364, + "epoch": 0.06902005895463369, + "kl_loss": 1035.31201171875, + "loss_ib": 10.387805938720703, + "step": 240 + }, + { + "ce_ib": 34.29194259643555, + "ce_orig": 0.9263237714767456, + "epoch": 0.06930764253361132, + "kl_loss": 1131.122314453125, + "loss_ib": 11.345515251159668, + "step": 241 + }, + { + "ce_ib": 35.83911895751953, + "ce_orig": 0.6829422116279602, + "epoch": 0.06930764253361132, + "kl_loss": 1171.150634765625, + "loss_ib": 11.747344970703125, + "step": 241 + }, + { + "ce_ib": 34.61550521850586, + "ce_orig": 0.7391694188117981, + "epoch": 0.06930764253361132, + "kl_loss": 1050.002685546875, + "loss_ib": 10.534642219543457, + "step": 241 + }, + { + "ce_ib": 31.543256759643555, + "ce_orig": 0.8060687780380249, + "epoch": 0.06930764253361132, + "kl_loss": 1197.1962890625, + "loss_ib": 12.00350570678711, + "step": 241 + }, + { + "ce_ib": 32.99800109863281, + "ce_orig": 0.6455181837081909, + "epoch": 0.06959522611258898, + "kl_loss": 1041.767333984375, + "loss_ib": 10.450671195983887, + "step": 242 + }, + { + "ce_ib": 32.91671371459961, + "ce_orig": 0.7244043350219727, + "epoch": 0.06959522611258898, + "kl_loss": 1103.9139404296875, + "loss_ib": 11.07205581665039, + "step": 242 + }, + { + "ce_ib": 35.45330047607422, + "ce_orig": 0.9272658228874207, + "epoch": 0.06959522611258898, + "kl_loss": 1027.905517578125, + "loss_ib": 10.314507484436035, + "step": 242 + }, + { + "ce_ib": 34.885498046875, + "ce_orig": 0.8863522410392761, + "epoch": 0.06959522611258898, + "kl_loss": 1155.1387939453125, + "loss_ib": 11.586273193359375, + "step": 242 + }, + { + "ce_ib": 34.44084930419922, + "ce_orig": 1.152998924255371, + "epoch": 0.06988280969156661, + "kl_loss": 1133.0599365234375, + "loss_ib": 11.365039825439453, + "step": 243 + }, + { + "ce_ib": 35.273677825927734, + "ce_orig": 1.2428306341171265, + "epoch": 0.06988280969156661, + "kl_loss": 1112.822021484375, + "loss_ib": 11.163493156433105, + "step": 243 + }, + { + "ce_ib": 32.52173614501953, + "ce_orig": 1.016830325126648, + "epoch": 0.06988280969156661, + "kl_loss": 1070.12255859375, + "loss_ib": 10.733747482299805, + "step": 243 + }, + { + "ce_ib": 34.803653717041016, + "ce_orig": 0.615959107875824, + "epoch": 0.06988280969156661, + "kl_loss": 1086.107421875, + "loss_ib": 10.895877838134766, + "step": 243 + }, + { + "ce_ib": 32.82182693481445, + "ce_orig": 0.8602744936943054, + "epoch": 0.07017039327054425, + "kl_loss": 1035.995849609375, + "loss_ib": 10.392780303955078, + "step": 244 + }, + { + "ce_ib": 31.894535064697266, + "ce_orig": 0.6907263398170471, + "epoch": 0.07017039327054425, + "kl_loss": 979.4188232421875, + "loss_ib": 9.826082229614258, + "step": 244 + }, + { + "ce_ib": 35.21843719482422, + "ce_orig": 1.3801195621490479, + "epoch": 0.07017039327054425, + "kl_loss": 1049.561279296875, + "loss_ib": 10.530831336975098, + "step": 244 + }, + { + "ce_ib": 34.30471420288086, + "ce_orig": 0.7986380457878113, + "epoch": 0.07017039327054425, + "kl_loss": 997.249755859375, + "loss_ib": 10.006802558898926, + "step": 244 + }, + { + "epoch": 0.07045797684952189, + "grad_norm": 220.81076049804688, + "learning_rate": 7.611464968152867e-06, + "loss": 11.7147, + "step": 245 + }, + { + "ce_ib": 33.57966613769531, + "ce_orig": 0.7934867143630981, + "epoch": 0.07045797684952189, + "kl_loss": 1044.489501953125, + "loss_ib": 10.478473663330078, + "step": 245 + }, + { + "ce_ib": 30.60529327392578, + "ce_orig": 0.5140112638473511, + "epoch": 0.07045797684952189, + "kl_loss": 979.260498046875, + "loss_ib": 9.823209762573242, + "step": 245 + }, + { + "ce_ib": 36.327213287353516, + "ce_orig": 1.3819940090179443, + "epoch": 0.07045797684952189, + "kl_loss": 994.7232666015625, + "loss_ib": 9.983559608459473, + "step": 245 + }, + { + "ce_ib": 34.17820739746094, + "ce_orig": 0.8814669847488403, + "epoch": 0.07045797684952189, + "kl_loss": 885.5364990234375, + "loss_ib": 8.889543533325195, + "step": 245 + }, + { + "ce_ib": 34.66371154785156, + "ce_orig": 1.3104512691497803, + "epoch": 0.07074556042849953, + "kl_loss": 962.946533203125, + "loss_ib": 9.664129257202148, + "step": 246 + }, + { + "ce_ib": 32.48523712158203, + "ce_orig": 0.19654367864131927, + "epoch": 0.07074556042849953, + "kl_loss": 613.4555053710938, + "loss_ib": 6.1670403480529785, + "step": 246 + }, + { + "ce_ib": 37.2482795715332, + "ce_orig": 1.528881549835205, + "epoch": 0.07074556042849953, + "kl_loss": 901.1966552734375, + "loss_ib": 9.049215316772461, + "step": 246 + }, + { + "ce_ib": 32.179996490478516, + "ce_orig": 0.784110963344574, + "epoch": 0.07074556042849953, + "kl_loss": 928.5962524414062, + "loss_ib": 9.318142890930176, + "step": 246 + }, + { + "ce_ib": 29.451961517333984, + "ce_orig": 0.9916685819625854, + "epoch": 0.07103314400747718, + "kl_loss": 983.1722412109375, + "loss_ib": 9.861173629760742, + "step": 247 + }, + { + "ce_ib": 37.86883544921875, + "ce_orig": 1.3681284189224243, + "epoch": 0.07103314400747718, + "kl_loss": 783.9342041015625, + "loss_ib": 7.87721061706543, + "step": 247 + }, + { + "ce_ib": 35.45719528198242, + "ce_orig": 1.3227193355560303, + "epoch": 0.07103314400747718, + "kl_loss": 848.042724609375, + "loss_ib": 8.515884399414062, + "step": 247 + }, + { + "ce_ib": 32.25755310058594, + "ce_orig": 0.7888100147247314, + "epoch": 0.07103314400747718, + "kl_loss": 908.6535034179688, + "loss_ib": 9.118792533874512, + "step": 247 + }, + { + "ce_ib": 34.29731369018555, + "ce_orig": 1.3129619359970093, + "epoch": 0.07132072758645482, + "kl_loss": 884.159912109375, + "loss_ib": 8.875896453857422, + "step": 248 + }, + { + "ce_ib": 34.386695861816406, + "ce_orig": 0.8120501041412354, + "epoch": 0.07132072758645482, + "kl_loss": 871.235107421875, + "loss_ib": 8.746737480163574, + "step": 248 + }, + { + "ce_ib": 30.895753860473633, + "ce_orig": 0.7307835817337036, + "epoch": 0.07132072758645482, + "kl_loss": 892.11767578125, + "loss_ib": 8.952072143554688, + "step": 248 + }, + { + "ce_ib": 33.708763122558594, + "ce_orig": 0.9609125852584839, + "epoch": 0.07132072758645482, + "kl_loss": 908.0333251953125, + "loss_ib": 9.114041328430176, + "step": 248 + }, + { + "ce_ib": 33.838768005371094, + "ce_orig": 1.0209710597991943, + "epoch": 0.07160831116543245, + "kl_loss": 893.3701171875, + "loss_ib": 8.96753978729248, + "step": 249 + }, + { + "ce_ib": 32.03993606567383, + "ce_orig": 1.0714709758758545, + "epoch": 0.07160831116543245, + "kl_loss": 845.7508544921875, + "loss_ib": 8.489547729492188, + "step": 249 + }, + { + "ce_ib": 29.649864196777344, + "ce_orig": 1.0681530237197876, + "epoch": 0.07160831116543245, + "kl_loss": 1036.672119140625, + "loss_ib": 10.396370887756348, + "step": 249 + }, + { + "ce_ib": 32.387245178222656, + "ce_orig": 1.1253947019577026, + "epoch": 0.07160831116543245, + "kl_loss": 863.2434692382812, + "loss_ib": 8.66482162475586, + "step": 249 + }, + { + "epoch": 0.07189589474441009, + "grad_norm": 205.96604919433594, + "learning_rate": 7.770700636942676e-06, + "loss": 10.0614, + "step": 250 + }, + { + "ce_ib": 34.68352127075195, + "ce_orig": 1.0909476280212402, + "epoch": 0.07189589474441009, + "kl_loss": 867.9714965820312, + "loss_ib": 8.714398384094238, + "step": 250 + }, + { + "ce_ib": 31.70074462890625, + "ce_orig": 0.9184356331825256, + "epoch": 0.07189589474441009, + "kl_loss": 894.34326171875, + "loss_ib": 8.975132942199707, + "step": 250 + }, + { + "ce_ib": 36.209068298339844, + "ce_orig": 1.1706846952438354, + "epoch": 0.07189589474441009, + "kl_loss": 795.684814453125, + "loss_ib": 7.9930572509765625, + "step": 250 + }, + { + "ce_ib": 35.70771789550781, + "ce_orig": 0.712752640247345, + "epoch": 0.07189589474441009, + "kl_loss": 757.6834106445312, + "loss_ib": 7.612542152404785, + "step": 250 + }, + { + "ce_ib": 30.09554100036621, + "ce_orig": 0.7422066330909729, + "epoch": 0.07218347832338773, + "kl_loss": 993.2034301757812, + "loss_ib": 9.962129592895508, + "step": 251 + }, + { + "ce_ib": 30.356294631958008, + "ce_orig": 0.8336584568023682, + "epoch": 0.07218347832338773, + "kl_loss": 753.03955078125, + "loss_ib": 7.560751914978027, + "step": 251 + }, + { + "ce_ib": 34.888736724853516, + "ce_orig": 0.7718689441680908, + "epoch": 0.07218347832338773, + "kl_loss": 869.8005981445312, + "loss_ib": 8.732894897460938, + "step": 251 + }, + { + "ce_ib": 34.146759033203125, + "ce_orig": 1.3637772798538208, + "epoch": 0.07218347832338773, + "kl_loss": 775.9141235351562, + "loss_ib": 7.793287754058838, + "step": 251 + }, + { + "ce_ib": 36.634437561035156, + "ce_orig": 1.066004991531372, + "epoch": 0.07247106190236538, + "kl_loss": 765.8868408203125, + "loss_ib": 7.695502758026123, + "step": 252 + }, + { + "ce_ib": 34.45478439331055, + "ce_orig": 1.2148858308792114, + "epoch": 0.07247106190236538, + "kl_loss": 777.1328735351562, + "loss_ib": 7.805783748626709, + "step": 252 + }, + { + "ce_ib": 32.530982971191406, + "ce_orig": 1.1445003747940063, + "epoch": 0.07247106190236538, + "kl_loss": 722.8099975585938, + "loss_ib": 7.260631084442139, + "step": 252 + }, + { + "ce_ib": 33.494747161865234, + "ce_orig": 1.2305960655212402, + "epoch": 0.07247106190236538, + "kl_loss": 792.6470947265625, + "loss_ib": 7.959965705871582, + "step": 252 + }, + { + "ce_ib": 31.90201187133789, + "ce_orig": 1.0651308298110962, + "epoch": 0.07275864548134302, + "kl_loss": 772.0308227539062, + "loss_ib": 7.7522101402282715, + "step": 253 + }, + { + "ce_ib": 36.638572692871094, + "ce_orig": 1.0030566453933716, + "epoch": 0.07275864548134302, + "kl_loss": 877.4444580078125, + "loss_ib": 8.81108283996582, + "step": 253 + }, + { + "ce_ib": 33.78273010253906, + "ce_orig": 1.0988649129867554, + "epoch": 0.07275864548134302, + "kl_loss": 711.0679931640625, + "loss_ib": 7.144462585449219, + "step": 253 + }, + { + "ce_ib": 33.45583724975586, + "ce_orig": 0.7203549146652222, + "epoch": 0.07275864548134302, + "kl_loss": 781.1646728515625, + "loss_ib": 7.845102310180664, + "step": 253 + }, + { + "ce_ib": 33.97596740722656, + "ce_orig": 1.545408010482788, + "epoch": 0.07304622906032066, + "kl_loss": 652.7290649414062, + "loss_ib": 6.5612664222717285, + "step": 254 + }, + { + "ce_ib": 34.555152893066406, + "ce_orig": 0.9807875752449036, + "epoch": 0.07304622906032066, + "kl_loss": 776.7444458007812, + "loss_ib": 7.801999568939209, + "step": 254 + }, + { + "ce_ib": 31.55140495300293, + "ce_orig": 0.7501665949821472, + "epoch": 0.07304622906032066, + "kl_loss": 769.890869140625, + "loss_ib": 7.730460166931152, + "step": 254 + }, + { + "ce_ib": 35.186798095703125, + "ce_orig": 0.7437403202056885, + "epoch": 0.07304622906032066, + "kl_loss": 796.2337646484375, + "loss_ib": 7.997524261474609, + "step": 254 + }, + { + "epoch": 0.0733338126392983, + "grad_norm": 180.0628204345703, + "learning_rate": 7.929936305732485e-06, + "loss": 8.6844, + "step": 255 + }, + { + "ce_ib": 34.1368408203125, + "ce_orig": 0.9626627564430237, + "epoch": 0.0733338126392983, + "kl_loss": 747.2984008789062, + "loss_ib": 7.507120609283447, + "step": 255 + }, + { + "ce_ib": 31.835412979125977, + "ce_orig": 0.8661757707595825, + "epoch": 0.0733338126392983, + "kl_loss": 723.6868896484375, + "loss_ib": 7.268703937530518, + "step": 255 + }, + { + "ce_ib": 40.257713317871094, + "ce_orig": 2.202180862426758, + "epoch": 0.0733338126392983, + "kl_loss": 661.350341796875, + "loss_ib": 6.65376091003418, + "step": 255 + }, + { + "ce_ib": 32.313350677490234, + "ce_orig": 0.7933053374290466, + "epoch": 0.0733338126392983, + "kl_loss": 702.9135131835938, + "loss_ib": 7.061448097229004, + "step": 255 + }, + { + "ce_ib": 32.77560806274414, + "ce_orig": 0.7840645909309387, + "epoch": 0.07362139621827593, + "kl_loss": 670.623291015625, + "loss_ib": 6.73900842666626, + "step": 256 + }, + { + "ce_ib": 34.22809982299805, + "ce_orig": 0.9132450819015503, + "epoch": 0.07362139621827593, + "kl_loss": 655.1952514648438, + "loss_ib": 6.586180210113525, + "step": 256 + }, + { + "ce_ib": 36.01190948486328, + "ce_orig": 1.1352635622024536, + "epoch": 0.07362139621827593, + "kl_loss": 622.5662841796875, + "loss_ib": 6.261674880981445, + "step": 256 + }, + { + "ce_ib": 31.15489959716797, + "ce_orig": 0.7360689043998718, + "epoch": 0.07362139621827593, + "kl_loss": 706.380126953125, + "loss_ib": 7.094955921173096, + "step": 256 + }, + { + "ce_ib": 37.11478042602539, + "ce_orig": 0.8814412355422974, + "epoch": 0.07390897979725358, + "kl_loss": 654.782958984375, + "loss_ib": 6.584944248199463, + "step": 257 + }, + { + "ce_ib": 36.152000427246094, + "ce_orig": 1.3912250995635986, + "epoch": 0.07390897979725358, + "kl_loss": 541.4805297851562, + "loss_ib": 5.45095682144165, + "step": 257 + }, + { + "ce_ib": 35.5803108215332, + "ce_orig": 0.9307012557983398, + "epoch": 0.07390897979725358, + "kl_loss": 653.8729248046875, + "loss_ib": 6.574309349060059, + "step": 257 + }, + { + "ce_ib": 32.77763366699219, + "ce_orig": 1.1101514101028442, + "epoch": 0.07390897979725358, + "kl_loss": 638.899658203125, + "loss_ib": 6.421773910522461, + "step": 257 + }, + { + "ce_ib": 30.221391677856445, + "ce_orig": 0.6179237365722656, + "epoch": 0.07419656337623122, + "kl_loss": 560.8075561523438, + "loss_ib": 5.638297080993652, + "step": 258 + }, + { + "ce_ib": 33.3847770690918, + "ce_orig": 0.734990656375885, + "epoch": 0.07419656337623122, + "kl_loss": 551.2991333007812, + "loss_ib": 5.5463762283325195, + "step": 258 + }, + { + "ce_ib": 37.109310150146484, + "ce_orig": 1.5755938291549683, + "epoch": 0.07419656337623122, + "kl_loss": 494.017578125, + "loss_ib": 4.977284908294678, + "step": 258 + }, + { + "ce_ib": 35.911502838134766, + "ce_orig": 1.535288691520691, + "epoch": 0.07419656337623122, + "kl_loss": 573.640869140625, + "loss_ib": 5.772319793701172, + "step": 258 + }, + { + "ce_ib": 37.536102294921875, + "ce_orig": 0.8849014043807983, + "epoch": 0.07448414695520886, + "kl_loss": 512.6968994140625, + "loss_ib": 5.1645050048828125, + "step": 259 + }, + { + "ce_ib": 33.12932586669922, + "ce_orig": 0.5551506876945496, + "epoch": 0.07448414695520886, + "kl_loss": 503.11016845703125, + "loss_ib": 5.064230918884277, + "step": 259 + }, + { + "ce_ib": 33.1467170715332, + "ce_orig": 1.3075788021087646, + "epoch": 0.07448414695520886, + "kl_loss": 618.4505615234375, + "loss_ib": 6.217652320861816, + "step": 259 + }, + { + "ce_ib": 34.920448303222656, + "ce_orig": 0.9793207049369812, + "epoch": 0.07448414695520886, + "kl_loss": 603.3938598632812, + "loss_ib": 6.068859100341797, + "step": 259 + }, + { + "epoch": 0.0747717305341865, + "grad_norm": 157.27696228027344, + "learning_rate": 8.089171974522295e-06, + "loss": 7.372, + "step": 260 + }, + { + "ce_ib": 35.68424987792969, + "ce_orig": 1.031170129776001, + "epoch": 0.0747717305341865, + "kl_loss": 524.8594970703125, + "loss_ib": 5.284278869628906, + "step": 260 + }, + { + "ce_ib": 35.50361633300781, + "ce_orig": 0.5432813167572021, + "epoch": 0.0747717305341865, + "kl_loss": 531.8944091796875, + "loss_ib": 5.354447364807129, + "step": 260 + }, + { + "ce_ib": 34.80183792114258, + "ce_orig": 0.8772653937339783, + "epoch": 0.0747717305341865, + "kl_loss": 527.3760986328125, + "loss_ib": 5.308562278747559, + "step": 260 + }, + { + "ce_ib": 34.62561798095703, + "ce_orig": 0.8580355048179626, + "epoch": 0.0747717305341865, + "kl_loss": 524.876708984375, + "loss_ib": 5.283392906188965, + "step": 260 + }, + { + "ce_ib": 33.79844665527344, + "ce_orig": 0.6697705984115601, + "epoch": 0.07505931411316413, + "kl_loss": 518.130126953125, + "loss_ib": 5.215099334716797, + "step": 261 + }, + { + "ce_ib": 35.4140739440918, + "ce_orig": 1.116640567779541, + "epoch": 0.07505931411316413, + "kl_loss": 495.97052001953125, + "loss_ib": 4.995119094848633, + "step": 261 + }, + { + "ce_ib": 40.22637176513672, + "ce_orig": 1.194669485092163, + "epoch": 0.07505931411316413, + "kl_loss": 483.3502197265625, + "loss_ib": 4.8737287521362305, + "step": 261 + }, + { + "ce_ib": 37.521358489990234, + "ce_orig": 1.1145161390304565, + "epoch": 0.07505931411316413, + "kl_loss": 467.95684814453125, + "loss_ib": 4.717089653015137, + "step": 261 + }, + { + "ce_ib": 37.80555725097656, + "ce_orig": 0.8788209557533264, + "epoch": 0.07534689769214178, + "kl_loss": 423.5547180175781, + "loss_ib": 4.27335262298584, + "step": 262 + }, + { + "ce_ib": 36.85504150390625, + "ce_orig": 0.5465120077133179, + "epoch": 0.07534689769214178, + "kl_loss": 476.8722229003906, + "loss_ib": 4.805577278137207, + "step": 262 + }, + { + "ce_ib": 37.499755859375, + "ce_orig": 1.1639437675476074, + "epoch": 0.07534689769214178, + "kl_loss": 467.90582275390625, + "loss_ib": 4.71655797958374, + "step": 262 + }, + { + "ce_ib": 36.78924560546875, + "ce_orig": 1.2826528549194336, + "epoch": 0.07534689769214178, + "kl_loss": 477.6324768066406, + "loss_ib": 4.813113689422607, + "step": 262 + }, + { + "ce_ib": 38.421451568603516, + "ce_orig": 0.8689420819282532, + "epoch": 0.07563448127111942, + "kl_loss": 477.5055847167969, + "loss_ib": 4.813477039337158, + "step": 263 + }, + { + "ce_ib": 35.91413879394531, + "ce_orig": 0.8632240891456604, + "epoch": 0.07563448127111942, + "kl_loss": 485.8994445800781, + "loss_ib": 4.894908428192139, + "step": 263 + }, + { + "ce_ib": 39.28192901611328, + "ce_orig": 0.877941370010376, + "epoch": 0.07563448127111942, + "kl_loss": 447.78070068359375, + "loss_ib": 4.517088890075684, + "step": 263 + }, + { + "ce_ib": 40.37826156616211, + "ce_orig": 0.8957875370979309, + "epoch": 0.07563448127111942, + "kl_loss": 403.5020446777344, + "loss_ib": 4.0753984451293945, + "step": 263 + }, + { + "ce_ib": 42.27157974243164, + "ce_orig": 1.3913816213607788, + "epoch": 0.07592206485009706, + "kl_loss": 423.022216796875, + "loss_ib": 4.272493839263916, + "step": 264 + }, + { + "ce_ib": 36.27720260620117, + "ce_orig": 1.0942326784133911, + "epoch": 0.07592206485009706, + "kl_loss": 400.34161376953125, + "loss_ib": 4.039693355560303, + "step": 264 + }, + { + "ce_ib": 33.6429328918457, + "ce_orig": 0.8256592154502869, + "epoch": 0.07592206485009706, + "kl_loss": 424.09423828125, + "loss_ib": 4.274585247039795, + "step": 264 + }, + { + "ce_ib": 38.3378791809082, + "ce_orig": 1.0243308544158936, + "epoch": 0.07592206485009706, + "kl_loss": 472.14508056640625, + "loss_ib": 4.759788513183594, + "step": 264 + }, + { + "epoch": 0.0762096484290747, + "grad_norm": 123.24594116210938, + "learning_rate": 8.248407643312102e-06, + "loss": 5.7351, + "step": 265 + }, + { + "ce_ib": 38.452537536621094, + "ce_orig": 1.0122849941253662, + "epoch": 0.0762096484290747, + "kl_loss": 435.57763671875, + "loss_ib": 4.394228935241699, + "step": 265 + }, + { + "ce_ib": 39.90632629394531, + "ce_orig": 1.178553819656372, + "epoch": 0.0762096484290747, + "kl_loss": 329.4811096191406, + "loss_ib": 3.334717273712158, + "step": 265 + }, + { + "ce_ib": 34.292686462402344, + "ce_orig": 1.4103134870529175, + "epoch": 0.0762096484290747, + "kl_loss": 385.14447021484375, + "loss_ib": 3.885737180709839, + "step": 265 + }, + { + "ce_ib": 42.36109161376953, + "ce_orig": 1.1843057870864868, + "epoch": 0.0762096484290747, + "kl_loss": 398.69232177734375, + "loss_ib": 4.029284477233887, + "step": 265 + }, + { + "ce_ib": 38.15491485595703, + "ce_orig": 1.1772407293319702, + "epoch": 0.07649723200805233, + "kl_loss": 360.84259033203125, + "loss_ib": 3.646580696105957, + "step": 266 + }, + { + "ce_ib": 43.11347961425781, + "ce_orig": 1.3642669916152954, + "epoch": 0.07649723200805233, + "kl_loss": 326.4147033691406, + "loss_ib": 3.307260274887085, + "step": 266 + }, + { + "ce_ib": 46.28087615966797, + "ce_orig": 1.7264760732650757, + "epoch": 0.07649723200805233, + "kl_loss": 390.9441223144531, + "loss_ib": 3.955721855163574, + "step": 266 + }, + { + "ce_ib": 44.236488342285156, + "ce_orig": 2.0390701293945312, + "epoch": 0.07649723200805233, + "kl_loss": 340.2770690917969, + "loss_ib": 3.447007179260254, + "step": 266 + }, + { + "ce_ib": 48.65879821777344, + "ce_orig": 1.3260499238967896, + "epoch": 0.07678481558702999, + "kl_loss": 225.09542846679688, + "loss_ib": 2.2996129989624023, + "step": 267 + }, + { + "ce_ib": 44.989524841308594, + "ce_orig": 0.7958585619926453, + "epoch": 0.07678481558702999, + "kl_loss": 330.1822509765625, + "loss_ib": 3.3468120098114014, + "step": 267 + }, + { + "ce_ib": 44.695777893066406, + "ce_orig": 1.6154531240463257, + "epoch": 0.07678481558702999, + "kl_loss": 283.551513671875, + "loss_ib": 2.8802108764648438, + "step": 267 + }, + { + "ce_ib": 50.11431884765625, + "ce_orig": 2.1371569633483887, + "epoch": 0.07678481558702999, + "kl_loss": 288.13592529296875, + "loss_ib": 2.931473731994629, + "step": 267 + }, + { + "ce_ib": 49.849369049072266, + "ce_orig": 1.6994304656982422, + "epoch": 0.07707239916600762, + "kl_loss": 307.9149169921875, + "loss_ib": 3.128998279571533, + "step": 268 + }, + { + "ce_ib": 46.516693115234375, + "ce_orig": 2.531648635864258, + "epoch": 0.07707239916600762, + "kl_loss": 257.6400451660156, + "loss_ib": 2.6229171752929688, + "step": 268 + }, + { + "ce_ib": 49.18770980834961, + "ce_orig": 0.8902948498725891, + "epoch": 0.07707239916600762, + "kl_loss": 287.6979064941406, + "loss_ib": 2.926166534423828, + "step": 268 + }, + { + "ce_ib": 43.51984786987305, + "ce_orig": 0.8550523519515991, + "epoch": 0.07707239916600762, + "kl_loss": 319.45574951171875, + "loss_ib": 3.238077402114868, + "step": 268 + }, + { + "ce_ib": 57.45269012451172, + "ce_orig": 1.6990851163864136, + "epoch": 0.07735998274498526, + "kl_loss": 247.26651000976562, + "loss_ib": 2.5301177501678467, + "step": 269 + }, + { + "ce_ib": 53.520240783691406, + "ce_orig": 1.453221082687378, + "epoch": 0.07735998274498526, + "kl_loss": 240.28468322753906, + "loss_ib": 2.456367015838623, + "step": 269 + }, + { + "ce_ib": 45.94785690307617, + "ce_orig": 1.0849944353103638, + "epoch": 0.07735998274498526, + "kl_loss": 244.33505249023438, + "loss_ib": 2.4892983436584473, + "step": 269 + }, + { + "ce_ib": 43.29793930053711, + "ce_orig": 1.103232741355896, + "epoch": 0.07735998274498526, + "kl_loss": 253.640625, + "loss_ib": 2.5797042846679688, + "step": 269 + }, + { + "epoch": 0.0776475663239629, + "grad_norm": 89.99483489990234, + "learning_rate": 8.407643312101912e-06, + "loss": 4.4374, + "step": 270 + }, + { + "ce_ib": 39.90031814575195, + "ce_orig": 0.8725808262825012, + "epoch": 0.0776475663239629, + "kl_loss": 259.3697509765625, + "loss_ib": 2.6335976123809814, + "step": 270 + }, + { + "ce_ib": 42.03105926513672, + "ce_orig": 0.8716458678245544, + "epoch": 0.0776475663239629, + "kl_loss": 230.16326904296875, + "loss_ib": 2.3436636924743652, + "step": 270 + }, + { + "ce_ib": 42.68729782104492, + "ce_orig": 0.665064811706543, + "epoch": 0.0776475663239629, + "kl_loss": 249.3673095703125, + "loss_ib": 2.536360263824463, + "step": 270 + }, + { + "ce_ib": 39.38070297241211, + "ce_orig": 1.2106064558029175, + "epoch": 0.0776475663239629, + "kl_loss": 242.24522399902344, + "loss_ib": 2.4618327617645264, + "step": 270 + }, + { + "ce_ib": 45.427120208740234, + "ce_orig": 0.9336494207382202, + "epoch": 0.07793514990294054, + "kl_loss": 236.5987548828125, + "loss_ib": 2.411414623260498, + "step": 271 + }, + { + "ce_ib": 40.51905059814453, + "ce_orig": 0.82356858253479, + "epoch": 0.07793514990294054, + "kl_loss": 320.950927734375, + "loss_ib": 3.250028371810913, + "step": 271 + }, + { + "ce_ib": 45.87284469604492, + "ce_orig": 2.147392988204956, + "epoch": 0.07793514990294054, + "kl_loss": 183.42807006835938, + "loss_ib": 1.8801534175872803, + "step": 271 + }, + { + "ce_ib": 39.21931838989258, + "ce_orig": 0.6727441549301147, + "epoch": 0.07793514990294054, + "kl_loss": 259.3567199707031, + "loss_ib": 2.632786512374878, + "step": 271 + }, + { + "ce_ib": 47.522220611572266, + "ce_orig": 2.0349419116973877, + "epoch": 0.07822273348191819, + "kl_loss": 171.4339141845703, + "loss_ib": 1.7618613243103027, + "step": 272 + }, + { + "ce_ib": 52.649227142333984, + "ce_orig": 1.1070398092269897, + "epoch": 0.07822273348191819, + "kl_loss": 232.63453674316406, + "loss_ib": 2.3789944648742676, + "step": 272 + }, + { + "ce_ib": 46.19776916503906, + "ce_orig": 1.476123332977295, + "epoch": 0.07822273348191819, + "kl_loss": 210.17898559570312, + "loss_ib": 2.1479876041412354, + "step": 272 + }, + { + "ce_ib": 48.105499267578125, + "ce_orig": 1.1524395942687988, + "epoch": 0.07822273348191819, + "kl_loss": 196.00885009765625, + "loss_ib": 2.0081939697265625, + "step": 272 + }, + { + "ce_ib": 46.68330383300781, + "ce_orig": 0.884026050567627, + "epoch": 0.07851031706089583, + "kl_loss": 164.7951202392578, + "loss_ib": 1.6946345567703247, + "step": 273 + }, + { + "ce_ib": 46.4140739440918, + "ce_orig": 1.2737939357757568, + "epoch": 0.07851031706089583, + "kl_loss": 168.782470703125, + "loss_ib": 1.7342387437820435, + "step": 273 + }, + { + "ce_ib": 44.1322135925293, + "ce_orig": 0.8222432136535645, + "epoch": 0.07851031706089583, + "kl_loss": 183.51260375976562, + "loss_ib": 1.8792582750320435, + "step": 273 + }, + { + "ce_ib": 47.30166244506836, + "ce_orig": 1.5957039594650269, + "epoch": 0.07851031706089583, + "kl_loss": 154.28269958496094, + "loss_ib": 1.5901285409927368, + "step": 273 + }, + { + "ce_ib": 49.634849548339844, + "ce_orig": 1.259581208229065, + "epoch": 0.07879790063987346, + "kl_loss": 156.60406494140625, + "loss_ib": 1.615675449371338, + "step": 274 + }, + { + "ce_ib": 45.53166198730469, + "ce_orig": 1.591690182685852, + "epoch": 0.07879790063987346, + "kl_loss": 165.76138305664062, + "loss_ib": 1.7031453847885132, + "step": 274 + }, + { + "ce_ib": 48.556114196777344, + "ce_orig": 1.4561307430267334, + "epoch": 0.07879790063987346, + "kl_loss": 145.72293090820312, + "loss_ib": 1.505785346031189, + "step": 274 + }, + { + "ce_ib": 51.11494064331055, + "ce_orig": 1.5235646963119507, + "epoch": 0.07879790063987346, + "kl_loss": 142.93948364257812, + "loss_ib": 1.4805097579956055, + "step": 274 + }, + { + "epoch": 0.0790854842188511, + "grad_norm": 66.16566467285156, + "learning_rate": 8.566878980891721e-06, + "loss": 3.4286, + "step": 275 + }, + { + "ce_ib": 39.09020233154297, + "ce_orig": 0.8260847330093384, + "epoch": 0.0790854842188511, + "kl_loss": 139.89077758789062, + "loss_ib": 1.4379980564117432, + "step": 275 + }, + { + "ce_ib": 48.023948669433594, + "ce_orig": 1.396461009979248, + "epoch": 0.0790854842188511, + "kl_loss": 146.98944091796875, + "loss_ib": 1.517918348312378, + "step": 275 + }, + { + "ce_ib": 42.226966857910156, + "ce_orig": 1.0770304203033447, + "epoch": 0.0790854842188511, + "kl_loss": 156.64419555664062, + "loss_ib": 1.6086689233779907, + "step": 275 + }, + { + "ce_ib": 36.58985900878906, + "ce_orig": 1.2845979928970337, + "epoch": 0.0790854842188511, + "kl_loss": 211.11338806152344, + "loss_ib": 2.14772367477417, + "step": 275 + }, + { + "ce_ib": 44.49997329711914, + "ce_orig": 1.6609095335006714, + "epoch": 0.07937306779782874, + "kl_loss": 128.6660614013672, + "loss_ib": 1.331160545349121, + "step": 276 + }, + { + "ce_ib": 40.899295806884766, + "ce_orig": 1.6309945583343506, + "epoch": 0.07937306779782874, + "kl_loss": 139.94818115234375, + "loss_ib": 1.4403811693191528, + "step": 276 + }, + { + "ce_ib": 40.4159049987793, + "ce_orig": 0.8224959373474121, + "epoch": 0.07937306779782874, + "kl_loss": 169.31053161621094, + "loss_ib": 1.7335212230682373, + "step": 276 + }, + { + "ce_ib": 40.17991638183594, + "ce_orig": 1.1940970420837402, + "epoch": 0.07937306779782874, + "kl_loss": 130.6182403564453, + "loss_ib": 1.3463622331619263, + "step": 276 + }, + { + "ce_ib": 48.19374465942383, + "ce_orig": 1.0767353773117065, + "epoch": 0.07966065137680639, + "kl_loss": 152.88658142089844, + "loss_ib": 1.5770596265792847, + "step": 277 + }, + { + "ce_ib": 48.65080261230469, + "ce_orig": 1.0481511354446411, + "epoch": 0.07966065137680639, + "kl_loss": 114.82660675048828, + "loss_ib": 1.1969168186187744, + "step": 277 + }, + { + "ce_ib": 39.97406768798828, + "ce_orig": 1.4612183570861816, + "epoch": 0.07966065137680639, + "kl_loss": 131.66680908203125, + "loss_ib": 1.3566421270370483, + "step": 277 + }, + { + "ce_ib": 42.1298828125, + "ce_orig": 1.1980208158493042, + "epoch": 0.07966065137680639, + "kl_loss": 127.7425308227539, + "loss_ib": 1.3195551633834839, + "step": 277 + }, + { + "ce_ib": 40.26860427856445, + "ce_orig": 1.4410648345947266, + "epoch": 0.07994823495578403, + "kl_loss": 95.930908203125, + "loss_ib": 0.9995777010917664, + "step": 278 + }, + { + "ce_ib": 40.977745056152344, + "ce_orig": 1.2464489936828613, + "epoch": 0.07994823495578403, + "kl_loss": 109.65506744384766, + "loss_ib": 1.1375283002853394, + "step": 278 + }, + { + "ce_ib": 42.28449249267578, + "ce_orig": 0.9634944796562195, + "epoch": 0.07994823495578403, + "kl_loss": 126.33438110351562, + "loss_ib": 1.3056282997131348, + "step": 278 + }, + { + "ce_ib": 35.779815673828125, + "ce_orig": 1.0111249685287476, + "epoch": 0.07994823495578403, + "kl_loss": 108.74060821533203, + "loss_ib": 1.1231858730316162, + "step": 278 + }, + { + "ce_ib": 41.58162307739258, + "ce_orig": 0.858116626739502, + "epoch": 0.08023581853476167, + "kl_loss": 94.94490051269531, + "loss_ib": 0.9910306334495544, + "step": 279 + }, + { + "ce_ib": 40.43954086303711, + "ce_orig": 1.5376956462860107, + "epoch": 0.08023581853476167, + "kl_loss": 114.16512298583984, + "loss_ib": 1.1820907592773438, + "step": 279 + }, + { + "ce_ib": 39.77493667602539, + "ce_orig": 1.4295803308486938, + "epoch": 0.08023581853476167, + "kl_loss": 97.94513702392578, + "loss_ib": 1.019226312637329, + "step": 279 + }, + { + "ce_ib": 37.93669509887695, + "ce_orig": 0.9915726780891418, + "epoch": 0.08023581853476167, + "kl_loss": 100.8277359008789, + "loss_ib": 1.046213984489441, + "step": 279 + }, + { + "epoch": 0.0805234021137393, + "grad_norm": 46.49250793457031, + "learning_rate": 8.726114649681529e-06, + "loss": 2.5267, + "step": 280 + }, + { + "ce_ib": 40.85035705566406, + "ce_orig": 0.9079654216766357, + "epoch": 0.0805234021137393, + "kl_loss": 103.09587860107422, + "loss_ib": 1.07180917263031, + "step": 280 + }, + { + "ce_ib": 36.37861633300781, + "ce_orig": 0.7120267152786255, + "epoch": 0.0805234021137393, + "kl_loss": 100.90724182128906, + "loss_ib": 1.045451045036316, + "step": 280 + }, + { + "ce_ib": 41.00496292114258, + "ce_orig": 0.710496187210083, + "epoch": 0.0805234021137393, + "kl_loss": 125.49755096435547, + "loss_ib": 1.295980453491211, + "step": 280 + }, + { + "ce_ib": 41.43547058105469, + "ce_orig": 1.6307661533355713, + "epoch": 0.0805234021137393, + "kl_loss": 75.49640655517578, + "loss_ib": 0.7963995337486267, + "step": 280 + }, + { + "ce_ib": 35.050662994384766, + "ce_orig": 0.9704921245574951, + "epoch": 0.08081098569271694, + "kl_loss": 99.29730224609375, + "loss_ib": 1.028023600578308, + "step": 281 + }, + { + "ce_ib": 37.16071701049805, + "ce_orig": 0.9510385394096375, + "epoch": 0.08081098569271694, + "kl_loss": 84.18745422363281, + "loss_ib": 0.879035234451294, + "step": 281 + }, + { + "ce_ib": 40.97324752807617, + "ce_orig": 0.6980030536651611, + "epoch": 0.08081098569271694, + "kl_loss": 85.10052490234375, + "loss_ib": 0.8919785022735596, + "step": 281 + }, + { + "ce_ib": 37.02272033691406, + "ce_orig": 1.8392256498336792, + "epoch": 0.08081098569271694, + "kl_loss": 110.20343017578125, + "loss_ib": 1.1390570402145386, + "step": 281 + }, + { + "ce_ib": 37.121299743652344, + "ce_orig": 1.2205618619918823, + "epoch": 0.08109856927169459, + "kl_loss": 70.73722839355469, + "loss_ib": 0.7444935441017151, + "step": 282 + }, + { + "ce_ib": 37.17251968383789, + "ce_orig": 1.2131117582321167, + "epoch": 0.08109856927169459, + "kl_loss": 72.53062438964844, + "loss_ib": 0.762478768825531, + "step": 282 + }, + { + "ce_ib": 38.68415832519531, + "ce_orig": 1.2195942401885986, + "epoch": 0.08109856927169459, + "kl_loss": 97.90969848632812, + "loss_ib": 1.017781138420105, + "step": 282 + }, + { + "ce_ib": 44.60282897949219, + "ce_orig": 2.035943031311035, + "epoch": 0.08109856927169459, + "kl_loss": 74.025634765625, + "loss_ib": 0.7848591208457947, + "step": 282 + }, + { + "ce_ib": 32.7514533996582, + "ce_orig": 1.1092407703399658, + "epoch": 0.08138615285067223, + "kl_loss": 64.70214080810547, + "loss_ib": 0.6797728538513184, + "step": 283 + }, + { + "ce_ib": 34.242916107177734, + "ce_orig": 1.563724160194397, + "epoch": 0.08138615285067223, + "kl_loss": 74.12162780761719, + "loss_ib": 0.7754591703414917, + "step": 283 + }, + { + "ce_ib": 39.811283111572266, + "ce_orig": 1.3022565841674805, + "epoch": 0.08138615285067223, + "kl_loss": 79.62696075439453, + "loss_ib": 0.8360808491706848, + "step": 283 + }, + { + "ce_ib": 38.87461853027344, + "ce_orig": 0.7815361618995667, + "epoch": 0.08138615285067223, + "kl_loss": 73.88887023925781, + "loss_ib": 0.7777632474899292, + "step": 283 + }, + { + "ce_ib": 31.66046905517578, + "ce_orig": 1.1096495389938354, + "epoch": 0.08167373642964987, + "kl_loss": 92.97333526611328, + "loss_ib": 0.9613937735557556, + "step": 284 + }, + { + "ce_ib": 36.92252731323242, + "ce_orig": 0.7650741338729858, + "epoch": 0.08167373642964987, + "kl_loss": 67.56904602050781, + "loss_ib": 0.7126129865646362, + "step": 284 + }, + { + "ce_ib": 40.48139953613281, + "ce_orig": 1.4183719158172607, + "epoch": 0.08167373642964987, + "kl_loss": 63.496063232421875, + "loss_ib": 0.6754420399665833, + "step": 284 + }, + { + "ce_ib": 31.517629623413086, + "ce_orig": 0.8108515739440918, + "epoch": 0.08167373642964987, + "kl_loss": 62.563720703125, + "loss_ib": 0.6571548581123352, + "step": 284 + }, + { + "epoch": 0.0819613200086275, + "grad_norm": 30.389598846435547, + "learning_rate": 8.885350318471338e-06, + "loss": 2.0103, + "step": 285 + }, + { + "ce_ib": 32.3922119140625, + "ce_orig": 0.8194482326507568, + "epoch": 0.0819613200086275, + "kl_loss": 72.32354736328125, + "loss_ib": 0.7556276321411133, + "step": 285 + }, + { + "ce_ib": 38.804710388183594, + "ce_orig": 0.9490994811058044, + "epoch": 0.0819613200086275, + "kl_loss": 65.6147689819336, + "loss_ib": 0.6949523687362671, + "step": 285 + }, + { + "ce_ib": 34.912132263183594, + "ce_orig": 1.0493797063827515, + "epoch": 0.0819613200086275, + "kl_loss": 53.544273376464844, + "loss_ib": 0.5703548789024353, + "step": 285 + }, + { + "ce_ib": 32.628414154052734, + "ce_orig": 0.8485954403877258, + "epoch": 0.0819613200086275, + "kl_loss": 102.72671508789062, + "loss_ib": 1.0598955154418945, + "step": 285 + }, + { + "ce_ib": 37.96284866333008, + "ce_orig": 1.6377449035644531, + "epoch": 0.08224890358760514, + "kl_loss": 51.751041412353516, + "loss_ib": 0.555473268032074, + "step": 286 + }, + { + "ce_ib": 37.74256134033203, + "ce_orig": 1.1003116369247437, + "epoch": 0.08224890358760514, + "kl_loss": 77.45193481445312, + "loss_ib": 0.8122618794441223, + "step": 286 + }, + { + "ce_ib": 36.4458122253418, + "ce_orig": 1.4265295267105103, + "epoch": 0.08224890358760514, + "kl_loss": 75.32568359375, + "loss_ib": 0.7897026538848877, + "step": 286 + }, + { + "ce_ib": 38.405860900878906, + "ce_orig": 1.6222838163375854, + "epoch": 0.08224890358760514, + "kl_loss": 59.407928466796875, + "loss_ib": 0.6324851512908936, + "step": 286 + }, + { + "ce_ib": 32.6006965637207, + "ce_orig": 0.6943832635879517, + "epoch": 0.0825364871665828, + "kl_loss": 55.28700256347656, + "loss_ib": 0.5854707360267639, + "step": 287 + }, + { + "ce_ib": 36.58828353881836, + "ce_orig": 1.536011815071106, + "epoch": 0.0825364871665828, + "kl_loss": 66.80679321289062, + "loss_ib": 0.704656183719635, + "step": 287 + }, + { + "ce_ib": 28.62580680847168, + "ce_orig": 0.7646328210830688, + "epoch": 0.0825364871665828, + "kl_loss": 55.685218811035156, + "loss_ib": 0.5854779481887817, + "step": 287 + }, + { + "ce_ib": 33.84457778930664, + "ce_orig": 0.9994617700576782, + "epoch": 0.0825364871665828, + "kl_loss": 58.716033935546875, + "loss_ib": 0.6210048794746399, + "step": 287 + }, + { + "ce_ib": 30.92181968688965, + "ce_orig": 1.1378878355026245, + "epoch": 0.08282407074556043, + "kl_loss": 56.736080169677734, + "loss_ib": 0.5982826352119446, + "step": 288 + }, + { + "ce_ib": 33.0194091796875, + "ce_orig": 0.7808621525764465, + "epoch": 0.08282407074556043, + "kl_loss": 66.98883056640625, + "loss_ib": 0.7029076814651489, + "step": 288 + }, + { + "ce_ib": 34.579010009765625, + "ce_orig": 0.7823261022567749, + "epoch": 0.08282407074556043, + "kl_loss": 51.778160095214844, + "loss_ib": 0.5523605942726135, + "step": 288 + }, + { + "ce_ib": 31.606508255004883, + "ce_orig": 0.74196857213974, + "epoch": 0.08282407074556043, + "kl_loss": 49.30632019042969, + "loss_ib": 0.5246697068214417, + "step": 288 + }, + { + "ce_ib": 34.344478607177734, + "ce_orig": 0.6873656511306763, + "epoch": 0.08311165432453807, + "kl_loss": 53.74017333984375, + "loss_ib": 0.5717462301254272, + "step": 289 + }, + { + "ce_ib": 33.72829818725586, + "ce_orig": 1.6017377376556396, + "epoch": 0.08311165432453807, + "kl_loss": 46.42070770263672, + "loss_ib": 0.49793535470962524, + "step": 289 + }, + { + "ce_ib": 31.348825454711914, + "ce_orig": 1.2451566457748413, + "epoch": 0.08311165432453807, + "kl_loss": 46.207916259765625, + "loss_ib": 0.49342799186706543, + "step": 289 + }, + { + "ce_ib": 31.967754364013672, + "ce_orig": 0.8155576586723328, + "epoch": 0.08311165432453807, + "kl_loss": 50.1954460144043, + "loss_ib": 0.5339221954345703, + "step": 289 + }, + { + "epoch": 0.08339923790351571, + "grad_norm": 17.25715446472168, + "learning_rate": 9.044585987261148e-06, + "loss": 1.716, + "step": 290 + }, + { + "ce_ib": 29.394428253173828, + "ce_orig": 0.6837283968925476, + "epoch": 0.08339923790351571, + "kl_loss": 50.952415466308594, + "loss_ib": 0.5389185547828674, + "step": 290 + }, + { + "ce_ib": 29.528079986572266, + "ce_orig": 0.9319428205490112, + "epoch": 0.08339923790351571, + "kl_loss": 49.36161804199219, + "loss_ib": 0.5231442451477051, + "step": 290 + }, + { + "ce_ib": 31.811763763427734, + "ce_orig": 1.0387821197509766, + "epoch": 0.08339923790351571, + "kl_loss": 46.64958572387695, + "loss_ib": 0.49830758571624756, + "step": 290 + }, + { + "ce_ib": 26.820646286010742, + "ce_orig": 1.016945719718933, + "epoch": 0.08339923790351571, + "kl_loss": 50.506019592285156, + "loss_ib": 0.531880795955658, + "step": 290 + }, + { + "ce_ib": 31.8367919921875, + "ce_orig": 1.025216817855835, + "epoch": 0.08368682148249335, + "kl_loss": 35.0272216796875, + "loss_ib": 0.38210898637771606, + "step": 291 + }, + { + "ce_ib": 33.226322174072266, + "ce_orig": 1.2911221981048584, + "epoch": 0.08368682148249335, + "kl_loss": 58.08740997314453, + "loss_ib": 0.6141003966331482, + "step": 291 + }, + { + "ce_ib": 32.05693435668945, + "ce_orig": 1.126991629600525, + "epoch": 0.08368682148249335, + "kl_loss": 47.8099479675293, + "loss_ib": 0.5101563930511475, + "step": 291 + }, + { + "ce_ib": 31.257158279418945, + "ce_orig": 0.95924311876297, + "epoch": 0.08368682148249335, + "kl_loss": 45.37004089355469, + "loss_ib": 0.48495757579803467, + "step": 291 + }, + { + "ce_ib": 30.047136306762695, + "ce_orig": 1.108436942100525, + "epoch": 0.083974405061471, + "kl_loss": 44.887725830078125, + "loss_ib": 0.47892439365386963, + "step": 292 + }, + { + "ce_ib": 29.945829391479492, + "ce_orig": 1.0467983484268188, + "epoch": 0.083974405061471, + "kl_loss": 35.674888610839844, + "loss_ib": 0.38669469952583313, + "step": 292 + }, + { + "ce_ib": 31.76511001586914, + "ce_orig": 1.2328693866729736, + "epoch": 0.083974405061471, + "kl_loss": 43.979034423828125, + "loss_ib": 0.4715554416179657, + "step": 292 + }, + { + "ce_ib": 32.23439407348633, + "ce_orig": 1.2534009218215942, + "epoch": 0.083974405061471, + "kl_loss": 38.1639518737793, + "loss_ib": 0.4138738811016083, + "step": 292 + }, + { + "ce_ib": 30.920150756835938, + "ce_orig": 0.8429147005081177, + "epoch": 0.08426198864044863, + "kl_loss": 42.72565460205078, + "loss_ib": 0.4581766724586487, + "step": 293 + }, + { + "ce_ib": 29.249128341674805, + "ce_orig": 1.381933331489563, + "epoch": 0.08426198864044863, + "kl_loss": 38.22590255737305, + "loss_ib": 0.41150814294815063, + "step": 293 + }, + { + "ce_ib": 27.40785026550293, + "ce_orig": 0.7817553877830505, + "epoch": 0.08426198864044863, + "kl_loss": 34.802711486816406, + "loss_ib": 0.375434935092926, + "step": 293 + }, + { + "ce_ib": 31.316415786743164, + "ce_orig": 0.8332533836364746, + "epoch": 0.08426198864044863, + "kl_loss": 44.91577911376953, + "loss_ib": 0.48047420382499695, + "step": 293 + }, + { + "ce_ib": 25.795753479003906, + "ce_orig": 1.2377901077270508, + "epoch": 0.08454957221942627, + "kl_loss": 40.89204406738281, + "loss_ib": 0.4347161650657654, + "step": 294 + }, + { + "ce_ib": 32.04143142700195, + "ce_orig": 0.7857619524002075, + "epoch": 0.08454957221942627, + "kl_loss": 36.315128326416016, + "loss_ib": 0.3951927125453949, + "step": 294 + }, + { + "ce_ib": 29.322782516479492, + "ce_orig": 0.8479982614517212, + "epoch": 0.08454957221942627, + "kl_loss": 43.97806167602539, + "loss_ib": 0.4691033959388733, + "step": 294 + }, + { + "ce_ib": 27.92338752746582, + "ce_orig": 0.9372240900993347, + "epoch": 0.08454957221942627, + "kl_loss": 34.369789123535156, + "loss_ib": 0.3716212809085846, + "step": 294 + }, + { + "epoch": 0.08483715579840391, + "grad_norm": 11.152618408203125, + "learning_rate": 9.203821656050957e-06, + "loss": 1.4786, + "step": 295 + }, + { + "ce_ib": 31.50551986694336, + "ce_orig": 0.7309855818748474, + "epoch": 0.08483715579840391, + "kl_loss": 36.99781799316406, + "loss_ib": 0.4014836847782135, + "step": 295 + }, + { + "ce_ib": 28.562597274780273, + "ce_orig": 0.9782903790473938, + "epoch": 0.08483715579840391, + "kl_loss": 40.10491180419922, + "loss_ib": 0.4296116828918457, + "step": 295 + }, + { + "ce_ib": 24.42827796936035, + "ce_orig": 0.8424835205078125, + "epoch": 0.08483715579840391, + "kl_loss": 33.25676727294922, + "loss_ib": 0.35699597001075745, + "step": 295 + }, + { + "ce_ib": 29.361215591430664, + "ce_orig": 0.6136335730552673, + "epoch": 0.08483715579840391, + "kl_loss": 38.58903121948242, + "loss_ib": 0.4152515232563019, + "step": 295 + }, + { + "ce_ib": 26.14788818359375, + "ce_orig": 0.85167396068573, + "epoch": 0.08512473937738155, + "kl_loss": 38.04214096069336, + "loss_ib": 0.4065692722797394, + "step": 296 + }, + { + "ce_ib": 29.764019012451172, + "ce_orig": 1.7308716773986816, + "epoch": 0.08512473937738155, + "kl_loss": 32.98516845703125, + "loss_ib": 0.3596157133579254, + "step": 296 + }, + { + "ce_ib": 30.012575149536133, + "ce_orig": 1.0343323945999146, + "epoch": 0.08512473937738155, + "kl_loss": 38.74031066894531, + "loss_ib": 0.41741567850112915, + "step": 296 + }, + { + "ce_ib": 30.881479263305664, + "ce_orig": 1.7702387571334839, + "epoch": 0.08512473937738155, + "kl_loss": 21.21739959716797, + "loss_ib": 0.24305547773838043, + "step": 296 + }, + { + "ce_ib": 24.717079162597656, + "ce_orig": 0.9549234509468079, + "epoch": 0.08541232295635919, + "kl_loss": 37.45452117919922, + "loss_ib": 0.39926227927207947, + "step": 297 + }, + { + "ce_ib": 26.27513885498047, + "ce_orig": 0.7507061958312988, + "epoch": 0.08541232295635919, + "kl_loss": 34.658172607421875, + "loss_ib": 0.37285685539245605, + "step": 297 + }, + { + "ce_ib": 26.04485321044922, + "ce_orig": 0.6326652765274048, + "epoch": 0.08541232295635919, + "kl_loss": 41.222965240478516, + "loss_ib": 0.4382745027542114, + "step": 297 + }, + { + "ce_ib": 22.960899353027344, + "ce_orig": 0.855808675289154, + "epoch": 0.08541232295635919, + "kl_loss": 32.55647277832031, + "loss_ib": 0.34852561354637146, + "step": 297 + }, + { + "ce_ib": 27.634048461914062, + "ce_orig": 0.8469107747077942, + "epoch": 0.08569990653533684, + "kl_loss": 52.79608917236328, + "loss_ib": 0.5555949211120605, + "step": 298 + }, + { + "ce_ib": 25.107303619384766, + "ce_orig": 1.0334007740020752, + "epoch": 0.08569990653533684, + "kl_loss": 23.502300262451172, + "loss_ib": 0.2601303160190582, + "step": 298 + }, + { + "ce_ib": 26.35793113708496, + "ce_orig": 0.8023892641067505, + "epoch": 0.08569990653533684, + "kl_loss": 33.53641128540039, + "loss_ib": 0.36172202229499817, + "step": 298 + }, + { + "ce_ib": 28.72397804260254, + "ce_orig": 0.8318001627922058, + "epoch": 0.08569990653533684, + "kl_loss": 28.708393096923828, + "loss_ib": 0.31580790877342224, + "step": 298 + }, + { + "ce_ib": 25.58355140686035, + "ce_orig": 0.6520025134086609, + "epoch": 0.08598749011431447, + "kl_loss": 27.046260833740234, + "loss_ib": 0.2960461676120758, + "step": 299 + }, + { + "ce_ib": 27.802156448364258, + "ce_orig": 0.7854000926017761, + "epoch": 0.08598749011431447, + "kl_loss": 28.843101501464844, + "loss_ib": 0.31623315811157227, + "step": 299 + }, + { + "ce_ib": 23.300045013427734, + "ce_orig": 0.8532059192657471, + "epoch": 0.08598749011431447, + "kl_loss": 27.97170639038086, + "loss_ib": 0.3030170798301697, + "step": 299 + }, + { + "ce_ib": 27.139148712158203, + "ce_orig": 0.8574588894844055, + "epoch": 0.08598749011431447, + "kl_loss": 65.27938842773438, + "loss_ib": 0.6799330115318298, + "step": 299 + }, + { + "epoch": 0.08627507369329211, + "grad_norm": 8.516314506530762, + "learning_rate": 9.363057324840765e-06, + "loss": 1.3038, + "step": 300 + }, + { + "ce_ib": 27.859683990478516, + "ce_orig": 0.7980900406837463, + "epoch": 0.08627507369329211, + "kl_loss": 31.5460262298584, + "loss_ib": 0.34331992268562317, + "step": 300 + }, + { + "ce_ib": 26.832685470581055, + "ce_orig": 1.1817421913146973, + "epoch": 0.08627507369329211, + "kl_loss": 36.501564025878906, + "loss_ib": 0.3918483257293701, + "step": 300 + }, + { + "ce_ib": 24.322065353393555, + "ce_orig": 1.163743495941162, + "epoch": 0.08627507369329211, + "kl_loss": 25.613513946533203, + "loss_ib": 0.28045719861984253, + "step": 300 + }, + { + "ce_ib": 23.191028594970703, + "ce_orig": 0.7588955760002136, + "epoch": 0.08627507369329211, + "kl_loss": 25.832067489624023, + "loss_ib": 0.28151169419288635, + "step": 300 + }, + { + "ce_ib": 28.026681900024414, + "ce_orig": 1.1685611009597778, + "epoch": 0.08656265727226975, + "kl_loss": 27.28811264038086, + "loss_ib": 0.30090779066085815, + "step": 301 + }, + { + "ce_ib": 28.903520584106445, + "ce_orig": 0.8756263852119446, + "epoch": 0.08656265727226975, + "kl_loss": 40.801658630371094, + "loss_ib": 0.43692007660865784, + "step": 301 + }, + { + "ce_ib": 32.01344299316406, + "ce_orig": 1.7789306640625, + "epoch": 0.08656265727226975, + "kl_loss": 28.048770904541016, + "loss_ib": 0.31250113248825073, + "step": 301 + }, + { + "ce_ib": 24.778104782104492, + "ce_orig": 1.0734585523605347, + "epoch": 0.08656265727226975, + "kl_loss": 26.787107467651367, + "loss_ib": 0.29264917969703674, + "step": 301 + }, + { + "ce_ib": 26.043542861938477, + "ce_orig": 0.6056478023529053, + "epoch": 0.08685024085124739, + "kl_loss": 22.7061767578125, + "loss_ib": 0.2531053125858307, + "step": 302 + }, + { + "ce_ib": 28.747360229492188, + "ce_orig": 0.8331350684165955, + "epoch": 0.08685024085124739, + "kl_loss": 35.54327392578125, + "loss_ib": 0.38418009877204895, + "step": 302 + }, + { + "ce_ib": 25.376556396484375, + "ce_orig": 0.9154051542282104, + "epoch": 0.08685024085124739, + "kl_loss": 17.989727020263672, + "loss_ib": 0.2052738070487976, + "step": 302 + }, + { + "ce_ib": 28.94438362121582, + "ce_orig": 1.2490043640136719, + "epoch": 0.08685024085124739, + "kl_loss": 38.229942321777344, + "loss_ib": 0.4112437963485718, + "step": 302 + }, + { + "ce_ib": 24.730915069580078, + "ce_orig": 1.1931933164596558, + "epoch": 0.08713782443022504, + "kl_loss": 29.647769927978516, + "loss_ib": 0.3212085962295532, + "step": 303 + }, + { + "ce_ib": 22.95311737060547, + "ce_orig": 1.0858471393585205, + "epoch": 0.08713782443022504, + "kl_loss": 30.164508819580078, + "loss_ib": 0.32459819316864014, + "step": 303 + }, + { + "ce_ib": 25.692777633666992, + "ce_orig": 1.2659823894500732, + "epoch": 0.08713782443022504, + "kl_loss": 45.62352752685547, + "loss_ib": 0.4819280505180359, + "step": 303 + }, + { + "ce_ib": 23.456066131591797, + "ce_orig": 0.6252316236495972, + "epoch": 0.08713782443022504, + "kl_loss": 23.17284393310547, + "loss_ib": 0.25518450140953064, + "step": 303 + }, + { + "ce_ib": 26.05988883972168, + "ce_orig": 1.3680285215377808, + "epoch": 0.08742540800920268, + "kl_loss": 24.49565315246582, + "loss_ib": 0.2710164189338684, + "step": 304 + }, + { + "ce_ib": 25.749126434326172, + "ce_orig": 1.3819550275802612, + "epoch": 0.08742540800920268, + "kl_loss": 28.92790412902832, + "loss_ib": 0.3150281608104706, + "step": 304 + }, + { + "ce_ib": 22.42483139038086, + "ce_orig": 0.5715821981430054, + "epoch": 0.08742540800920268, + "kl_loss": 22.742015838623047, + "loss_ib": 0.2498449832201004, + "step": 304 + }, + { + "ce_ib": 25.59157371520996, + "ce_orig": 1.4033931493759155, + "epoch": 0.08742540800920268, + "kl_loss": 30.259891510009766, + "loss_ib": 0.32819050550460815, + "step": 304 + }, + { + "epoch": 0.08771299158818031, + "grad_norm": 3.7502119541168213, + "learning_rate": 9.522292993630574e-06, + "loss": 1.2559, + "step": 305 + }, + { + "ce_ib": 29.969907760620117, + "ce_orig": 2.0007097721099854, + "epoch": 0.08771299158818031, + "kl_loss": 22.26645278930664, + "loss_ib": 0.2526344358921051, + "step": 305 + }, + { + "ce_ib": 19.34192657470703, + "ce_orig": 0.8707708716392517, + "epoch": 0.08771299158818031, + "kl_loss": 26.55549430847168, + "loss_ib": 0.2848968505859375, + "step": 305 + }, + { + "ce_ib": 21.973365783691406, + "ce_orig": 1.0064218044281006, + "epoch": 0.08771299158818031, + "kl_loss": 22.693504333496094, + "loss_ib": 0.2489084005355835, + "step": 305 + }, + { + "ce_ib": 26.216506958007812, + "ce_orig": 0.8900886178016663, + "epoch": 0.08771299158818031, + "kl_loss": 19.825634002685547, + "loss_ib": 0.22447283565998077, + "step": 305 + }, + { + "ce_ib": 23.15169906616211, + "ce_orig": 1.2219324111938477, + "epoch": 0.08800057516715795, + "kl_loss": 24.46666717529297, + "loss_ib": 0.2678183615207672, + "step": 306 + }, + { + "ce_ib": 22.739940643310547, + "ce_orig": 1.0926791429519653, + "epoch": 0.08800057516715795, + "kl_loss": 27.521900177001953, + "loss_ib": 0.29795894026756287, + "step": 306 + }, + { + "ce_ib": 25.519643783569336, + "ce_orig": 0.8576159477233887, + "epoch": 0.08800057516715795, + "kl_loss": 20.15566062927246, + "loss_ib": 0.2270762324333191, + "step": 306 + }, + { + "ce_ib": 21.95849609375, + "ce_orig": 0.7875012755393982, + "epoch": 0.08800057516715795, + "kl_loss": 23.06599998474121, + "loss_ib": 0.2526184916496277, + "step": 306 + }, + { + "ce_ib": 29.18073272705078, + "ce_orig": 2.031694173812866, + "epoch": 0.08828815874613559, + "kl_loss": 23.33743667602539, + "loss_ib": 0.2625550925731659, + "step": 307 + }, + { + "ce_ib": 21.970809936523438, + "ce_orig": 0.589923620223999, + "epoch": 0.08828815874613559, + "kl_loss": 26.482616424560547, + "loss_ib": 0.2867969572544098, + "step": 307 + }, + { + "ce_ib": 21.679189682006836, + "ce_orig": 0.5775496363639832, + "epoch": 0.08828815874613559, + "kl_loss": 48.13336181640625, + "loss_ib": 0.5030127763748169, + "step": 307 + }, + { + "ce_ib": 25.202064514160156, + "ce_orig": 0.4014778137207031, + "epoch": 0.08828815874613559, + "kl_loss": 27.67257308959961, + "loss_ib": 0.301927775144577, + "step": 307 + }, + { + "ce_ib": 27.295534133911133, + "ce_orig": 1.3986626863479614, + "epoch": 0.08857574232511324, + "kl_loss": 19.44972801208496, + "loss_ib": 0.2217928022146225, + "step": 308 + }, + { + "ce_ib": 26.677940368652344, + "ce_orig": 1.4669982194900513, + "epoch": 0.08857574232511324, + "kl_loss": 22.110187530517578, + "loss_ib": 0.24777980148792267, + "step": 308 + }, + { + "ce_ib": 25.397268295288086, + "ce_orig": 0.7194269299507141, + "epoch": 0.08857574232511324, + "kl_loss": 21.656837463378906, + "loss_ib": 0.2419656366109848, + "step": 308 + }, + { + "ce_ib": 23.615497589111328, + "ce_orig": 0.7230740189552307, + "epoch": 0.08857574232511324, + "kl_loss": 20.54231071472168, + "loss_ib": 0.22903859615325928, + "step": 308 + }, + { + "ce_ib": 25.946504592895508, + "ce_orig": 0.7649667263031006, + "epoch": 0.08886332590409088, + "kl_loss": 19.743709564208984, + "loss_ib": 0.2233835905790329, + "step": 309 + }, + { + "ce_ib": 25.53705596923828, + "ce_orig": 1.3177523612976074, + "epoch": 0.08886332590409088, + "kl_loss": 17.188051223754883, + "loss_ib": 0.19741755723953247, + "step": 309 + }, + { + "ce_ib": 23.944272994995117, + "ce_orig": 1.095251202583313, + "epoch": 0.08886332590409088, + "kl_loss": 32.30261993408203, + "loss_ib": 0.34697046875953674, + "step": 309 + }, + { + "ce_ib": 24.48792266845703, + "ce_orig": 1.1920284032821655, + "epoch": 0.08886332590409088, + "kl_loss": 22.399471282958984, + "loss_ib": 0.2484826296567917, + "step": 309 + }, + { + "epoch": 0.08915090948306852, + "grad_norm": 4.221822738647461, + "learning_rate": 9.681528662420384e-06, + "loss": 1.2441, + "step": 310 + }, + { + "ce_ib": 21.77839469909668, + "ce_orig": 0.7371479272842407, + "epoch": 0.08915090948306852, + "kl_loss": 20.585721969604492, + "loss_ib": 0.22763560712337494, + "step": 310 + }, + { + "ce_ib": 19.997831344604492, + "ce_orig": 0.6613921523094177, + "epoch": 0.08915090948306852, + "kl_loss": 19.700130462646484, + "loss_ib": 0.2169991284608841, + "step": 310 + }, + { + "ce_ib": 26.353721618652344, + "ce_orig": 1.7854292392730713, + "epoch": 0.08915090948306852, + "kl_loss": 17.48261260986328, + "loss_ib": 0.2011798471212387, + "step": 310 + }, + { + "ce_ib": 19.99268913269043, + "ce_orig": 0.6877007484436035, + "epoch": 0.08915090948306852, + "kl_loss": 21.342994689941406, + "loss_ib": 0.2334226369857788, + "step": 310 + }, + { + "ce_ib": 18.89698028564453, + "ce_orig": 0.30386409163475037, + "epoch": 0.08943849306204615, + "kl_loss": 36.01085662841797, + "loss_ib": 0.3790055513381958, + "step": 311 + }, + { + "ce_ib": 22.5959415435791, + "ce_orig": 0.44536206126213074, + "epoch": 0.08943849306204615, + "kl_loss": 23.401079177856445, + "loss_ib": 0.25660672783851624, + "step": 311 + }, + { + "ce_ib": 26.67799186706543, + "ce_orig": 1.3645676374435425, + "epoch": 0.08943849306204615, + "kl_loss": 21.983440399169922, + "loss_ib": 0.24651238322257996, + "step": 311 + }, + { + "ce_ib": 27.179054260253906, + "ce_orig": 1.9106354713439941, + "epoch": 0.08943849306204615, + "kl_loss": 18.935150146484375, + "loss_ib": 0.21653054654598236, + "step": 311 + }, + { + "ce_ib": 21.86924934387207, + "ce_orig": 0.520060658454895, + "epoch": 0.08972607664102379, + "kl_loss": 16.354835510253906, + "loss_ib": 0.18541759252548218, + "step": 312 + }, + { + "ce_ib": 21.63800621032715, + "ce_orig": 1.0606663227081299, + "epoch": 0.08972607664102379, + "kl_loss": 43.62150573730469, + "loss_ib": 0.4578530490398407, + "step": 312 + }, + { + "ce_ib": 23.62603187561035, + "ce_orig": 1.3403759002685547, + "epoch": 0.08972607664102379, + "kl_loss": 19.3665771484375, + "loss_ib": 0.21729178726673126, + "step": 312 + }, + { + "ce_ib": 19.89155387878418, + "ce_orig": 0.7725546956062317, + "epoch": 0.08972607664102379, + "kl_loss": 18.36844253540039, + "loss_ib": 0.2035759687423706, + "step": 312 + }, + { + "ce_ib": 22.370769500732422, + "ce_orig": 0.6480394601821899, + "epoch": 0.09001366022000144, + "kl_loss": 17.688186645507812, + "loss_ib": 0.1992526352405548, + "step": 313 + }, + { + "ce_ib": 24.45700454711914, + "ce_orig": 0.5826147198677063, + "epoch": 0.09001366022000144, + "kl_loss": 18.841575622558594, + "loss_ib": 0.21287274360656738, + "step": 313 + }, + { + "ce_ib": 25.685205459594727, + "ce_orig": 1.5794010162353516, + "epoch": 0.09001366022000144, + "kl_loss": 16.19678497314453, + "loss_ib": 0.18765303492546082, + "step": 313 + }, + { + "ce_ib": 25.032880783081055, + "ce_orig": 1.691644549369812, + "epoch": 0.09001366022000144, + "kl_loss": 16.912708282470703, + "loss_ib": 0.19415995478630066, + "step": 313 + }, + { + "ce_ib": 19.637710571289062, + "ce_orig": 0.5982837080955505, + "epoch": 0.09030124379897908, + "kl_loss": 16.402633666992188, + "loss_ib": 0.18366405367851257, + "step": 314 + }, + { + "ce_ib": 22.416698455810547, + "ce_orig": 0.7960580587387085, + "epoch": 0.09030124379897908, + "kl_loss": 21.684656143188477, + "loss_ib": 0.23926326632499695, + "step": 314 + }, + { + "ce_ib": 24.144296646118164, + "ce_orig": 1.1491901874542236, + "epoch": 0.09030124379897908, + "kl_loss": 16.15469741821289, + "loss_ib": 0.1856912523508072, + "step": 314 + }, + { + "ce_ib": 22.414451599121094, + "ce_orig": 0.6498157382011414, + "epoch": 0.09030124379897908, + "kl_loss": 15.109755516052246, + "loss_ib": 0.17351199686527252, + "step": 314 + }, + { + "epoch": 0.09058882737795672, + "grad_norm": 1.5902462005615234, + "learning_rate": 9.840764331210191e-06, + "loss": 1.2124, + "step": 315 + }, + { + "ce_ib": 26.398815155029297, + "ce_orig": 1.2464390993118286, + "epoch": 0.09058882737795672, + "kl_loss": 15.754015922546387, + "loss_ib": 0.18393898010253906, + "step": 315 + }, + { + "ce_ib": 19.097566604614258, + "ce_orig": 0.7237119674682617, + "epoch": 0.09058882737795672, + "kl_loss": 21.74990463256836, + "loss_ib": 0.23659659922122955, + "step": 315 + }, + { + "ce_ib": 15.259860038757324, + "ce_orig": 0.2898668348789215, + "epoch": 0.09058882737795672, + "kl_loss": 29.279403686523438, + "loss_ib": 0.3080538809299469, + "step": 315 + }, + { + "ce_ib": 19.407550811767578, + "ce_orig": 0.8700235486030579, + "epoch": 0.09058882737795672, + "kl_loss": 19.4202938079834, + "loss_ib": 0.2136104851961136, + "step": 315 + }, + { + "ce_ib": 18.41707992553711, + "ce_orig": 0.8072125911712646, + "epoch": 0.09087641095693436, + "kl_loss": 20.386289596557617, + "loss_ib": 0.22227996587753296, + "step": 316 + }, + { + "ce_ib": 23.172889709472656, + "ce_orig": 0.4192945659160614, + "epoch": 0.09087641095693436, + "kl_loss": 16.535175323486328, + "loss_ib": 0.18852464854717255, + "step": 316 + }, + { + "ce_ib": 25.58465003967285, + "ce_orig": 0.7333693504333496, + "epoch": 0.09087641095693436, + "kl_loss": 16.984933853149414, + "loss_ib": 0.19543398916721344, + "step": 316 + }, + { + "ce_ib": 23.119829177856445, + "ce_orig": 1.2351598739624023, + "epoch": 0.09087641095693436, + "kl_loss": 17.573955535888672, + "loss_ib": 0.19885937869548798, + "step": 316 + }, + { + "ce_ib": 20.77093505859375, + "ce_orig": 0.638796865940094, + "epoch": 0.091163994535912, + "kl_loss": 13.646297454833984, + "loss_ib": 0.15723390877246857, + "step": 317 + }, + { + "ce_ib": 22.370588302612305, + "ce_orig": 0.7030758857727051, + "epoch": 0.091163994535912, + "kl_loss": 19.98332977294922, + "loss_ib": 0.22220388054847717, + "step": 317 + }, + { + "ce_ib": 23.507246017456055, + "ce_orig": 0.9605042934417725, + "epoch": 0.091163994535912, + "kl_loss": 17.364158630371094, + "loss_ib": 0.19714882969856262, + "step": 317 + }, + { + "ce_ib": 25.993309020996094, + "ce_orig": 1.3534818887710571, + "epoch": 0.091163994535912, + "kl_loss": 15.692447662353516, + "loss_ib": 0.1829177886247635, + "step": 317 + }, + { + "ce_ib": 26.447153091430664, + "ce_orig": 0.6662286520004272, + "epoch": 0.09145157811488965, + "kl_loss": 19.282215118408203, + "loss_ib": 0.2192692905664444, + "step": 318 + }, + { + "ce_ib": 21.66655158996582, + "ce_orig": 0.5830262899398804, + "epoch": 0.09145157811488965, + "kl_loss": 16.306236267089844, + "loss_ib": 0.18472890555858612, + "step": 318 + }, + { + "ce_ib": 25.510692596435547, + "ce_orig": 1.2415066957473755, + "epoch": 0.09145157811488965, + "kl_loss": 15.806875228881836, + "loss_ib": 0.1835794448852539, + "step": 318 + }, + { + "ce_ib": 19.910005569458008, + "ce_orig": 0.7447091341018677, + "epoch": 0.09145157811488965, + "kl_loss": 17.13539695739746, + "loss_ib": 0.19126397371292114, + "step": 318 + }, + { + "ce_ib": 24.191679000854492, + "ce_orig": 1.3691377639770508, + "epoch": 0.09173916169386728, + "kl_loss": 14.619853973388672, + "loss_ib": 0.17039021849632263, + "step": 319 + }, + { + "ce_ib": 24.796480178833008, + "ce_orig": 1.0005704164505005, + "epoch": 0.09173916169386728, + "kl_loss": 23.08795738220215, + "loss_ib": 0.2556760609149933, + "step": 319 + }, + { + "ce_ib": 18.99901580810547, + "ce_orig": 0.8472815155982971, + "epoch": 0.09173916169386728, + "kl_loss": 16.455188751220703, + "loss_ib": 0.1835509091615677, + "step": 319 + }, + { + "ce_ib": 23.486665725708008, + "ce_orig": 0.9926466345787048, + "epoch": 0.09173916169386728, + "kl_loss": 13.656463623046875, + "loss_ib": 0.16005130112171173, + "step": 319 + }, + { + "epoch": 0.09202674527284492, + "grad_norm": 1.6402511596679688, + "learning_rate": 1e-05, + "loss": 1.138, + "step": 320 + }, + { + "ce_ib": 17.978233337402344, + "ce_orig": 0.7672592401504517, + "epoch": 0.09202674527284492, + "kl_loss": 16.61567497253418, + "loss_ib": 0.18413497507572174, + "step": 320 + }, + { + "ce_ib": 22.24078369140625, + "ce_orig": 1.269389271736145, + "epoch": 0.09202674527284492, + "kl_loss": 13.635717391967773, + "loss_ib": 0.1585979461669922, + "step": 320 + }, + { + "ce_ib": 24.34579086303711, + "ce_orig": 0.9150453209877014, + "epoch": 0.09202674527284492, + "kl_loss": 17.23645782470703, + "loss_ib": 0.19671037793159485, + "step": 320 + }, + { + "ce_ib": 20.447195053100586, + "ce_orig": 0.7540422081947327, + "epoch": 0.09202674527284492, + "kl_loss": 16.120819091796875, + "loss_ib": 0.1816553771495819, + "step": 320 + }, + { + "ce_ib": 20.8408145904541, + "ce_orig": 0.6497412919998169, + "epoch": 0.09231432885182256, + "kl_loss": 13.179418563842773, + "loss_ib": 0.15263499319553375, + "step": 321 + }, + { + "ce_ib": 18.48600196838379, + "ce_orig": 0.9051377773284912, + "epoch": 0.09231432885182256, + "kl_loss": 13.9927339553833, + "loss_ib": 0.1584133356809616, + "step": 321 + }, + { + "ce_ib": 22.45148277282715, + "ce_orig": 1.3368862867355347, + "epoch": 0.09231432885182256, + "kl_loss": 13.805723190307617, + "loss_ib": 0.16050870716571808, + "step": 321 + }, + { + "ce_ib": 22.361835479736328, + "ce_orig": 0.4947150647640228, + "epoch": 0.09231432885182256, + "kl_loss": 13.823465347290039, + "loss_ib": 0.16059647500514984, + "step": 321 + }, + { + "ce_ib": 27.675626754760742, + "ce_orig": 1.5604900121688843, + "epoch": 0.0926019124308002, + "kl_loss": 12.6363525390625, + "loss_ib": 0.1540391445159912, + "step": 322 + }, + { + "ce_ib": 21.165691375732422, + "ce_orig": 0.9618255496025085, + "epoch": 0.0926019124308002, + "kl_loss": 18.356464385986328, + "loss_ib": 0.2047303318977356, + "step": 322 + }, + { + "ce_ib": 17.5034122467041, + "ce_orig": 0.6583025455474854, + "epoch": 0.0926019124308002, + "kl_loss": 14.965330123901367, + "loss_ib": 0.16715671122074127, + "step": 322 + }, + { + "ce_ib": 19.85868263244629, + "ce_orig": 0.8282234072685242, + "epoch": 0.0926019124308002, + "kl_loss": 14.633644104003906, + "loss_ib": 0.1661951243877411, + "step": 322 + }, + { + "ce_ib": 18.45901870727539, + "ce_orig": 0.7398732304573059, + "epoch": 0.09288949600977785, + "kl_loss": 15.228702545166016, + "loss_ib": 0.17074604332447052, + "step": 323 + }, + { + "ce_ib": 21.03128433227539, + "ce_orig": 0.8697280883789062, + "epoch": 0.09288949600977785, + "kl_loss": 14.359245300292969, + "loss_ib": 0.16462373733520508, + "step": 323 + }, + { + "ce_ib": 18.499732971191406, + "ce_orig": 0.5559062361717224, + "epoch": 0.09288949600977785, + "kl_loss": 14.474294662475586, + "loss_ib": 0.16324268281459808, + "step": 323 + }, + { + "ce_ib": 17.314205169677734, + "ce_orig": 0.5949759483337402, + "epoch": 0.09288949600977785, + "kl_loss": 12.18869400024414, + "loss_ib": 0.13920114934444427, + "step": 323 + }, + { + "ce_ib": 22.12590789794922, + "ce_orig": 1.2088953256607056, + "epoch": 0.09317707958875548, + "kl_loss": 12.109560012817383, + "loss_ib": 0.14322151243686676, + "step": 324 + }, + { + "ce_ib": 19.701847076416016, + "ce_orig": 0.6696236729621887, + "epoch": 0.09317707958875548, + "kl_loss": 14.687576293945312, + "loss_ib": 0.16657760739326477, + "step": 324 + }, + { + "ce_ib": 22.29142951965332, + "ce_orig": 1.1997344493865967, + "epoch": 0.09317707958875548, + "kl_loss": 12.63182258605957, + "loss_ib": 0.14860965311527252, + "step": 324 + }, + { + "ce_ib": 20.73260498046875, + "ce_orig": 0.9823715090751648, + "epoch": 0.09317707958875548, + "kl_loss": 12.120622634887695, + "loss_ib": 0.14193882048130035, + "step": 324 + }, + { + "epoch": 0.09346466316773312, + "grad_norm": 1.5406866073608398, + "learning_rate": 9.999993976919739e-06, + "loss": 1.1078, + "step": 325 + }, + { + "ce_ib": 17.106014251708984, + "ce_orig": 0.9145632982254028, + "epoch": 0.09346466316773312, + "kl_loss": 14.363018989562988, + "loss_ib": 0.16073618829250336, + "step": 325 + }, + { + "ce_ib": 19.175251007080078, + "ce_orig": 0.7700109481811523, + "epoch": 0.09346466316773312, + "kl_loss": 14.140527725219727, + "loss_ib": 0.16058051586151123, + "step": 325 + }, + { + "ce_ib": 15.837416648864746, + "ce_orig": 0.772528886795044, + "epoch": 0.09346466316773312, + "kl_loss": 12.842889785766602, + "loss_ib": 0.14426632225513458, + "step": 325 + }, + { + "ce_ib": 15.72890853881836, + "ce_orig": 0.7171897888183594, + "epoch": 0.09346466316773312, + "kl_loss": 11.653585433959961, + "loss_ib": 0.13226476311683655, + "step": 325 + }, + { + "ce_ib": 26.823095321655273, + "ce_orig": 1.9181299209594727, + "epoch": 0.09375224674671076, + "kl_loss": 11.220107078552246, + "loss_ib": 0.13902415335178375, + "step": 326 + }, + { + "ce_ib": 19.747053146362305, + "ce_orig": 0.9309285283088684, + "epoch": 0.09375224674671076, + "kl_loss": 10.759939193725586, + "loss_ib": 0.1273464411497116, + "step": 326 + }, + { + "ce_ib": 21.19155502319336, + "ce_orig": 1.635532259941101, + "epoch": 0.09375224674671076, + "kl_loss": 12.692992210388184, + "loss_ib": 0.14812147617340088, + "step": 326 + }, + { + "ce_ib": 21.15056037902832, + "ce_orig": 0.7623363137245178, + "epoch": 0.09375224674671076, + "kl_loss": 13.625322341918945, + "loss_ib": 0.15740378201007843, + "step": 326 + }, + { + "ce_ib": 16.852680206298828, + "ce_orig": 0.7469913363456726, + "epoch": 0.0940398303256884, + "kl_loss": 10.560868263244629, + "loss_ib": 0.12246136367321014, + "step": 327 + }, + { + "ce_ib": 16.11965560913086, + "ce_orig": 0.752680242061615, + "epoch": 0.0940398303256884, + "kl_loss": 12.737372398376465, + "loss_ib": 0.14349336922168732, + "step": 327 + }, + { + "ce_ib": 18.678325653076172, + "ce_orig": 0.8419510722160339, + "epoch": 0.0940398303256884, + "kl_loss": 12.982912063598633, + "loss_ib": 0.14850744605064392, + "step": 327 + }, + { + "ce_ib": 20.507835388183594, + "ce_orig": 0.933698296546936, + "epoch": 0.0940398303256884, + "kl_loss": 11.535351753234863, + "loss_ib": 0.1358613520860672, + "step": 327 + }, + { + "ce_ib": 16.491111755371094, + "ce_orig": 0.4157365560531616, + "epoch": 0.09432741390466605, + "kl_loss": 11.347173690795898, + "loss_ib": 0.12996284663677216, + "step": 328 + }, + { + "ce_ib": 20.290693283081055, + "ce_orig": 0.6378398537635803, + "epoch": 0.09432741390466605, + "kl_loss": 10.955358505249023, + "loss_ib": 0.1298442780971527, + "step": 328 + }, + { + "ce_ib": 20.74325942993164, + "ce_orig": 0.9873928427696228, + "epoch": 0.09432741390466605, + "kl_loss": 11.168992042541504, + "loss_ib": 0.13243317604064941, + "step": 328 + }, + { + "ce_ib": 22.277246475219727, + "ce_orig": 1.0927495956420898, + "epoch": 0.09432741390466605, + "kl_loss": 14.169788360595703, + "loss_ib": 0.16397511959075928, + "step": 328 + }, + { + "ce_ib": 23.106748580932617, + "ce_orig": 1.1934113502502441, + "epoch": 0.09461499748364369, + "kl_loss": 11.351577758789062, + "loss_ib": 0.13662251830101013, + "step": 329 + }, + { + "ce_ib": 19.171072006225586, + "ce_orig": 0.9011801481246948, + "epoch": 0.09461499748364369, + "kl_loss": 11.967233657836914, + "loss_ib": 0.13884340226650238, + "step": 329 + }, + { + "ce_ib": 19.95760726928711, + "ce_orig": 0.6395582556724548, + "epoch": 0.09461499748364369, + "kl_loss": 11.200337409973145, + "loss_ib": 0.13196097314357758, + "step": 329 + }, + { + "ce_ib": 20.071157455444336, + "ce_orig": 0.9473389983177185, + "epoch": 0.09461499748364369, + "kl_loss": 14.839773178100586, + "loss_ib": 0.1684688925743103, + "step": 329 + }, + { + "epoch": 0.09490258106262132, + "grad_norm": 0.9824780821800232, + "learning_rate": 9.999975907693462e-06, + "loss": 1.1087, + "step": 330 + }, + { + "ce_ib": 20.931066513061523, + "ce_orig": 1.4924328327178955, + "epoch": 0.09490258106262132, + "kl_loss": 10.959052085876465, + "loss_ib": 0.13052158057689667, + "step": 330 + }, + { + "ce_ib": 23.309009552001953, + "ce_orig": 0.8217906951904297, + "epoch": 0.09490258106262132, + "kl_loss": 13.162786483764648, + "loss_ib": 0.15493687987327576, + "step": 330 + }, + { + "ce_ib": 19.14462661743164, + "ce_orig": 0.7224079966545105, + "epoch": 0.09490258106262132, + "kl_loss": 15.73659610748291, + "loss_ib": 0.17651057243347168, + "step": 330 + }, + { + "ce_ib": 16.93087387084961, + "ce_orig": 0.3897332549095154, + "epoch": 0.09490258106262132, + "kl_loss": 9.802057266235352, + "loss_ib": 0.11495144665241241, + "step": 330 + }, + { + "ce_ib": 20.574167251586914, + "ce_orig": 0.7844420075416565, + "epoch": 0.09519016464159896, + "kl_loss": 12.472640991210938, + "loss_ib": 0.14530058205127716, + "step": 331 + }, + { + "ce_ib": 21.064104080200195, + "ce_orig": 0.8571724891662598, + "epoch": 0.09519016464159896, + "kl_loss": 12.077247619628906, + "loss_ib": 0.14183658361434937, + "step": 331 + }, + { + "ce_ib": 19.205732345581055, + "ce_orig": 0.591139554977417, + "epoch": 0.09519016464159896, + "kl_loss": 10.037530899047852, + "loss_ib": 0.11958103626966476, + "step": 331 + }, + { + "ce_ib": 19.454252243041992, + "ce_orig": 0.9017695188522339, + "epoch": 0.09519016464159896, + "kl_loss": 10.572440147399902, + "loss_ib": 0.12517865002155304, + "step": 331 + }, + { + "ce_ib": 15.203242301940918, + "ce_orig": 0.5361948013305664, + "epoch": 0.0954777482205766, + "kl_loss": 10.661357879638672, + "loss_ib": 0.12181682139635086, + "step": 332 + }, + { + "ce_ib": 21.122093200683594, + "ce_orig": 1.4699267148971558, + "epoch": 0.0954777482205766, + "kl_loss": 9.819560050964355, + "loss_ib": 0.1193176880478859, + "step": 332 + }, + { + "ce_ib": 19.769392013549805, + "ce_orig": 0.5775290131568909, + "epoch": 0.0954777482205766, + "kl_loss": 9.717262268066406, + "loss_ib": 0.11694201081991196, + "step": 332 + }, + { + "ce_ib": 18.72998046875, + "ce_orig": 0.5287953615188599, + "epoch": 0.0954777482205766, + "kl_loss": 13.474811553955078, + "loss_ib": 0.15347810089588165, + "step": 332 + }, + { + "ce_ib": 14.733593940734863, + "ce_orig": 0.4213125705718994, + "epoch": 0.09576533179955425, + "kl_loss": 11.17019271850586, + "loss_ib": 0.1264355182647705, + "step": 333 + }, + { + "ce_ib": 21.969772338867188, + "ce_orig": 1.0359489917755127, + "epoch": 0.09576533179955425, + "kl_loss": 12.137819290161133, + "loss_ib": 0.14334796369075775, + "step": 333 + }, + { + "ce_ib": 19.76876449584961, + "ce_orig": 0.8385518193244934, + "epoch": 0.09576533179955425, + "kl_loss": 12.675048828125, + "loss_ib": 0.1465192437171936, + "step": 333 + }, + { + "ce_ib": 16.6859188079834, + "ce_orig": 0.7033459544181824, + "epoch": 0.09576533179955425, + "kl_loss": 11.24110221862793, + "loss_ib": 0.12909694015979767, + "step": 333 + }, + { + "ce_ib": 18.942955017089844, + "ce_orig": 0.6511563062667847, + "epoch": 0.09605291537853189, + "kl_loss": 12.197677612304688, + "loss_ib": 0.1409197300672531, + "step": 334 + }, + { + "ce_ib": 12.525162696838379, + "ce_orig": 0.2835647463798523, + "epoch": 0.09605291537853189, + "kl_loss": 8.74501895904541, + "loss_ib": 0.0999753549695015, + "step": 334 + }, + { + "ce_ib": 19.1585693359375, + "ce_orig": 0.5772603750228882, + "epoch": 0.09605291537853189, + "kl_loss": 11.537700653076172, + "loss_ib": 0.13453558087348938, + "step": 334 + }, + { + "ce_ib": 18.65268898010254, + "ce_orig": 0.6586172580718994, + "epoch": 0.09605291537853189, + "kl_loss": 9.829732894897461, + "loss_ib": 0.11695001274347305, + "step": 334 + }, + { + "epoch": 0.09634049895750953, + "grad_norm": 1.5527466535568237, + "learning_rate": 9.999945792364704e-06, + "loss": 1.0047, + "step": 335 + }, + { + "ce_ib": 18.45952796936035, + "ce_orig": 0.9835706949234009, + "epoch": 0.09634049895750953, + "kl_loss": 11.043167114257812, + "loss_ib": 0.1288911998271942, + "step": 335 + }, + { + "ce_ib": 19.220260620117188, + "ce_orig": 1.0070465803146362, + "epoch": 0.09634049895750953, + "kl_loss": 9.694038391113281, + "loss_ib": 0.11616063863039017, + "step": 335 + }, + { + "ce_ib": 20.90534019470215, + "ce_orig": 0.7721905708312988, + "epoch": 0.09634049895750953, + "kl_loss": 10.956405639648438, + "loss_ib": 0.1304693967103958, + "step": 335 + }, + { + "ce_ib": 22.549360275268555, + "ce_orig": 0.7169628143310547, + "epoch": 0.09634049895750953, + "kl_loss": 10.45715618133545, + "loss_ib": 0.12712092697620392, + "step": 335 + }, + { + "ce_ib": 19.706451416015625, + "ce_orig": 1.2544941902160645, + "epoch": 0.09662808253648716, + "kl_loss": 5.737251281738281, + "loss_ib": 0.07707896083593369, + "step": 336 + }, + { + "ce_ib": 24.045684814453125, + "ce_orig": 1.628864049911499, + "epoch": 0.09662808253648716, + "kl_loss": 9.666478157043457, + "loss_ib": 0.12071046233177185, + "step": 336 + }, + { + "ce_ib": 22.39566993713379, + "ce_orig": 1.3797554969787598, + "epoch": 0.09662808253648716, + "kl_loss": 9.557376861572266, + "loss_ib": 0.11796943098306656, + "step": 336 + }, + { + "ce_ib": 16.502885818481445, + "ce_orig": 0.678697407245636, + "epoch": 0.09662808253648716, + "kl_loss": 12.349996566772461, + "loss_ib": 0.14000284671783447, + "step": 336 + }, + { + "ce_ib": 21.189701080322266, + "ce_orig": 1.7581653594970703, + "epoch": 0.0969156661154648, + "kl_loss": 11.109577178955078, + "loss_ib": 0.1322854608297348, + "step": 337 + }, + { + "ce_ib": 20.682483673095703, + "ce_orig": 1.0360445976257324, + "epoch": 0.0969156661154648, + "kl_loss": 9.650278091430664, + "loss_ib": 0.11718526482582092, + "step": 337 + }, + { + "ce_ib": 23.174293518066406, + "ce_orig": 0.9172191619873047, + "epoch": 0.0969156661154648, + "kl_loss": 10.747881889343262, + "loss_ib": 0.13065311312675476, + "step": 337 + }, + { + "ce_ib": 20.76695442199707, + "ce_orig": 0.5460869073867798, + "epoch": 0.0969156661154648, + "kl_loss": 11.630158424377441, + "loss_ib": 0.13706853985786438, + "step": 337 + }, + { + "ce_ib": 23.635868072509766, + "ce_orig": 1.7053964138031006, + "epoch": 0.09720324969444245, + "kl_loss": 9.801689147949219, + "loss_ib": 0.121652752161026, + "step": 338 + }, + { + "ce_ib": 19.619415283203125, + "ce_orig": 0.8643050193786621, + "epoch": 0.09720324969444245, + "kl_loss": 9.933218002319336, + "loss_ib": 0.11895159631967545, + "step": 338 + }, + { + "ce_ib": 21.83019256591797, + "ce_orig": 0.8322968482971191, + "epoch": 0.09720324969444245, + "kl_loss": 10.368824005126953, + "loss_ib": 0.12551842629909515, + "step": 338 + }, + { + "ce_ib": 18.191864013671875, + "ce_orig": 0.4908624589443207, + "epoch": 0.09720324969444245, + "kl_loss": 11.974782943725586, + "loss_ib": 0.1379396915435791, + "step": 338 + }, + { + "ce_ib": 20.153644561767578, + "ce_orig": 0.5820345282554626, + "epoch": 0.09749083327342009, + "kl_loss": 9.271571159362793, + "loss_ib": 0.11286935210227966, + "step": 339 + }, + { + "ce_ib": 16.755735397338867, + "ce_orig": 0.8004245758056641, + "epoch": 0.09749083327342009, + "kl_loss": 9.602378845214844, + "loss_ib": 0.11277952045202255, + "step": 339 + }, + { + "ce_ib": 21.61349868774414, + "ce_orig": 1.3253728151321411, + "epoch": 0.09749083327342009, + "kl_loss": 9.877376556396484, + "loss_ib": 0.12038726359605789, + "step": 339 + }, + { + "ce_ib": 23.785110473632812, + "ce_orig": 1.1449768543243408, + "epoch": 0.09749083327342009, + "kl_loss": 9.455681800842285, + "loss_ib": 0.11834193021059036, + "step": 339 + }, + { + "epoch": 0.09777841685239773, + "grad_norm": 0.7407243251800537, + "learning_rate": 9.999903631006022e-06, + "loss": 1.0521, + "step": 340 + }, + { + "ce_ib": 10.92531967163086, + "ce_orig": 0.2659711241722107, + "epoch": 0.09777841685239773, + "kl_loss": 7.5069732666015625, + "loss_ib": 0.08599505573511124, + "step": 340 + }, + { + "ce_ib": 19.90782356262207, + "ce_orig": 0.5355432033538818, + "epoch": 0.09777841685239773, + "kl_loss": 11.578774452209473, + "loss_ib": 0.13569556176662445, + "step": 340 + }, + { + "ce_ib": 15.142422676086426, + "ce_orig": 0.7282365560531616, + "epoch": 0.09777841685239773, + "kl_loss": 10.324485778808594, + "loss_ib": 0.11838727444410324, + "step": 340 + }, + { + "ce_ib": 20.629169464111328, + "ce_orig": 0.9958592057228088, + "epoch": 0.09777841685239773, + "kl_loss": 9.883302688598633, + "loss_ib": 0.11946219205856323, + "step": 340 + }, + { + "ce_ib": 18.038537979125977, + "ce_orig": 0.692686140537262, + "epoch": 0.09806600043137537, + "kl_loss": 9.38126277923584, + "loss_ib": 0.11185116320848465, + "step": 341 + }, + { + "ce_ib": 20.98015022277832, + "ce_orig": 0.5099575519561768, + "epoch": 0.09806600043137537, + "kl_loss": 9.678869247436523, + "loss_ib": 0.11776883900165558, + "step": 341 + }, + { + "ce_ib": 17.243499755859375, + "ce_orig": 0.692179799079895, + "epoch": 0.09806600043137537, + "kl_loss": 9.476663589477539, + "loss_ib": 0.11201013624668121, + "step": 341 + }, + { + "ce_ib": 25.067062377929688, + "ce_orig": 0.792171835899353, + "epoch": 0.09806600043137537, + "kl_loss": 10.635600090026855, + "loss_ib": 0.13142305612564087, + "step": 341 + }, + { + "ce_ib": 15.628839492797852, + "ce_orig": 0.7891074419021606, + "epoch": 0.098353584010353, + "kl_loss": 10.068859100341797, + "loss_ib": 0.11631742864847183, + "step": 342 + }, + { + "ce_ib": 18.483537673950195, + "ce_orig": 0.9051138758659363, + "epoch": 0.098353584010353, + "kl_loss": 10.133465766906738, + "loss_ib": 0.11981818825006485, + "step": 342 + }, + { + "ce_ib": 16.36567497253418, + "ce_orig": 0.6817749738693237, + "epoch": 0.098353584010353, + "kl_loss": 9.654040336608887, + "loss_ib": 0.1129060685634613, + "step": 342 + }, + { + "ce_ib": 20.89409828186035, + "ce_orig": 1.1035248041152954, + "epoch": 0.098353584010353, + "kl_loss": 10.291540145874023, + "loss_ib": 0.12380949407815933, + "step": 342 + }, + { + "ce_ib": 22.037933349609375, + "ce_orig": 1.694153904914856, + "epoch": 0.09864116758933066, + "kl_loss": 9.139327049255371, + "loss_ib": 0.11343120038509369, + "step": 343 + }, + { + "ce_ib": 21.519075393676758, + "ce_orig": 1.4431602954864502, + "epoch": 0.09864116758933066, + "kl_loss": 9.308493614196777, + "loss_ib": 0.11460401117801666, + "step": 343 + }, + { + "ce_ib": 15.5767240524292, + "ce_orig": 0.7400184273719788, + "epoch": 0.09864116758933066, + "kl_loss": 9.34182357788086, + "loss_ib": 0.10899496078491211, + "step": 343 + }, + { + "ce_ib": 14.478399276733398, + "ce_orig": 0.4603125751018524, + "epoch": 0.09864116758933066, + "kl_loss": 9.755362510681152, + "loss_ib": 0.11203201860189438, + "step": 343 + }, + { + "ce_ib": 17.020166397094727, + "ce_orig": 0.5673547387123108, + "epoch": 0.0989287511683083, + "kl_loss": 11.05903148651123, + "loss_ib": 0.1276104748249054, + "step": 344 + }, + { + "ce_ib": 17.05959701538086, + "ce_orig": 0.7991743683815002, + "epoch": 0.0989287511683083, + "kl_loss": 9.682877540588379, + "loss_ib": 0.11388836801052094, + "step": 344 + }, + { + "ce_ib": 23.936145782470703, + "ce_orig": 1.328629732131958, + "epoch": 0.0989287511683083, + "kl_loss": 10.604333877563477, + "loss_ib": 0.1299794763326645, + "step": 344 + }, + { + "ce_ib": 22.645984649658203, + "ce_orig": 1.3614290952682495, + "epoch": 0.0989287511683083, + "kl_loss": 9.37100601196289, + "loss_ib": 0.11635604500770569, + "step": 344 + }, + { + "epoch": 0.09921633474728593, + "grad_norm": 1.2907381057739258, + "learning_rate": 9.99984942371899e-06, + "loss": 1.0005, + "step": 345 + }, + { + "ce_ib": 19.495277404785156, + "ce_orig": 1.2335152626037598, + "epoch": 0.09921633474728593, + "kl_loss": 10.051846504211426, + "loss_ib": 0.12001373618841171, + "step": 345 + }, + { + "ce_ib": 16.080678939819336, + "ce_orig": 0.7765376567840576, + "epoch": 0.09921633474728593, + "kl_loss": 9.520978927612305, + "loss_ib": 0.11129046231508255, + "step": 345 + }, + { + "ce_ib": 17.06264305114746, + "ce_orig": 0.6252941489219666, + "epoch": 0.09921633474728593, + "kl_loss": 9.93945026397705, + "loss_ib": 0.11645714193582535, + "step": 345 + }, + { + "ce_ib": 18.10491371154785, + "ce_orig": 0.9270275831222534, + "epoch": 0.09921633474728593, + "kl_loss": 9.839916229248047, + "loss_ib": 0.11650407314300537, + "step": 345 + }, + { + "ce_ib": 21.288270950317383, + "ce_orig": 1.8509690761566162, + "epoch": 0.09950391832626357, + "kl_loss": 9.440530776977539, + "loss_ib": 0.1156935766339302, + "step": 346 + }, + { + "ce_ib": 16.040321350097656, + "ce_orig": 0.9570891261100769, + "epoch": 0.09950391832626357, + "kl_loss": 10.13039493560791, + "loss_ib": 0.11734427511692047, + "step": 346 + }, + { + "ce_ib": 20.773448944091797, + "ce_orig": 1.5323985815048218, + "epoch": 0.09950391832626357, + "kl_loss": 10.627010345458984, + "loss_ib": 0.12704354524612427, + "step": 346 + }, + { + "ce_ib": 13.598661422729492, + "ce_orig": 0.7118152379989624, + "epoch": 0.09950391832626357, + "kl_loss": 9.598701477050781, + "loss_ib": 0.10958567261695862, + "step": 346 + }, + { + "ce_ib": 13.722025871276855, + "ce_orig": 0.8525584936141968, + "epoch": 0.0997915019052412, + "kl_loss": 10.663893699645996, + "loss_ib": 0.12036096304655075, + "step": 347 + }, + { + "ce_ib": 17.72942352294922, + "ce_orig": 0.879482626914978, + "epoch": 0.0997915019052412, + "kl_loss": 10.045938491821289, + "loss_ib": 0.11818880587816238, + "step": 347 + }, + { + "ce_ib": 17.762847900390625, + "ce_orig": 1.0413190126419067, + "epoch": 0.0997915019052412, + "kl_loss": 10.000950813293457, + "loss_ib": 0.11777235567569733, + "step": 347 + }, + { + "ce_ib": 22.781930923461914, + "ce_orig": 1.1397019624710083, + "epoch": 0.0997915019052412, + "kl_loss": 9.38424301147461, + "loss_ib": 0.1166243627667427, + "step": 347 + }, + { + "ce_ib": 21.994335174560547, + "ce_orig": 0.43941164016723633, + "epoch": 0.10007908548421886, + "kl_loss": 9.569962501525879, + "loss_ib": 0.1176939532160759, + "step": 348 + }, + { + "ce_ib": 16.95915985107422, + "ce_orig": 0.5146183371543884, + "epoch": 0.10007908548421886, + "kl_loss": 9.597618103027344, + "loss_ib": 0.11293534189462662, + "step": 348 + }, + { + "ce_ib": 13.567898750305176, + "ce_orig": 0.6933156847953796, + "epoch": 0.10007908548421886, + "kl_loss": 9.457947731018066, + "loss_ib": 0.10814736783504486, + "step": 348 + }, + { + "ce_ib": 16.038314819335938, + "ce_orig": 0.9144206643104553, + "epoch": 0.10007908548421886, + "kl_loss": 9.451361656188965, + "loss_ib": 0.11055192351341248, + "step": 348 + }, + { + "ce_ib": 15.3102445602417, + "ce_orig": 0.6907638311386108, + "epoch": 0.1003666690631965, + "kl_loss": 8.99312973022461, + "loss_ib": 0.10524154454469681, + "step": 349 + }, + { + "ce_ib": 16.133337020874023, + "ce_orig": 0.885292649269104, + "epoch": 0.1003666690631965, + "kl_loss": 9.09972095489502, + "loss_ib": 0.10713054239749908, + "step": 349 + }, + { + "ce_ib": 21.13068962097168, + "ce_orig": 1.0056211948394775, + "epoch": 0.1003666690631965, + "kl_loss": 11.5963134765625, + "loss_ib": 0.13709382712841034, + "step": 349 + }, + { + "ce_ib": 20.446731567382812, + "ce_orig": 1.2884644269943237, + "epoch": 0.1003666690631965, + "kl_loss": 9.364080429077148, + "loss_ib": 0.1140875294804573, + "step": 349 + }, + { + "epoch": 0.10065425264217413, + "grad_norm": 0.45241594314575195, + "learning_rate": 9.999783170634207e-06, + "loss": 1.0049, + "step": 350 + }, + { + "ce_ib": 20.207548141479492, + "ce_orig": 0.9007079005241394, + "epoch": 0.10065425264217413, + "kl_loss": 8.826637268066406, + "loss_ib": 0.10847391933202744, + "step": 350 + }, + { + "ce_ib": 14.89907169342041, + "ce_orig": 0.848534882068634, + "epoch": 0.10065425264217413, + "kl_loss": 9.761069297790527, + "loss_ib": 0.11250976473093033, + "step": 350 + }, + { + "ce_ib": 20.37824058532715, + "ce_orig": 1.4016444683074951, + "epoch": 0.10065425264217413, + "kl_loss": 9.232507705688477, + "loss_ib": 0.11270332336425781, + "step": 350 + }, + { + "ce_ib": 16.76759910583496, + "ce_orig": 1.1012144088745117, + "epoch": 0.10065425264217413, + "kl_loss": 10.15268325805664, + "loss_ib": 0.11829442530870438, + "step": 350 + }, + { + "ce_ib": 15.976619720458984, + "ce_orig": 0.8754503726959229, + "epoch": 0.10094183622115177, + "kl_loss": 9.422735214233398, + "loss_ib": 0.11020396649837494, + "step": 351 + }, + { + "ce_ib": 19.49113655090332, + "ce_orig": 1.116249680519104, + "epoch": 0.10094183622115177, + "kl_loss": 9.386714935302734, + "loss_ib": 0.1133582815527916, + "step": 351 + }, + { + "ce_ib": 17.88022232055664, + "ce_orig": 0.9254348278045654, + "epoch": 0.10094183622115177, + "kl_loss": 9.275611877441406, + "loss_ib": 0.11063633859157562, + "step": 351 + }, + { + "ce_ib": 9.351910591125488, + "ce_orig": 0.16225206851959229, + "epoch": 0.10094183622115177, + "kl_loss": 9.214266777038574, + "loss_ib": 0.10149458050727844, + "step": 351 + }, + { + "ce_ib": 16.736501693725586, + "ce_orig": 0.8068060278892517, + "epoch": 0.10122941980012941, + "kl_loss": 9.62176513671875, + "loss_ib": 0.11295414716005325, + "step": 352 + }, + { + "ce_ib": 18.84697723388672, + "ce_orig": 0.6195511221885681, + "epoch": 0.10122941980012941, + "kl_loss": 9.747774124145508, + "loss_ib": 0.11632471531629562, + "step": 352 + }, + { + "ce_ib": 18.591981887817383, + "ce_orig": 0.8959230184555054, + "epoch": 0.10122941980012941, + "kl_loss": 8.57149887084961, + "loss_ib": 0.10430697351694107, + "step": 352 + }, + { + "ce_ib": 16.255172729492188, + "ce_orig": 0.7202159762382507, + "epoch": 0.10122941980012941, + "kl_loss": 12.734394073486328, + "loss_ib": 0.14359910786151886, + "step": 352 + }, + { + "ce_ib": 14.430315971374512, + "ce_orig": 0.9846038818359375, + "epoch": 0.10151700337910706, + "kl_loss": 9.04564094543457, + "loss_ib": 0.10488671809434891, + "step": 353 + }, + { + "ce_ib": 19.057903289794922, + "ce_orig": 0.708838701248169, + "epoch": 0.10151700337910706, + "kl_loss": 8.713069915771484, + "loss_ib": 0.10618860274553299, + "step": 353 + }, + { + "ce_ib": 20.327953338623047, + "ce_orig": 1.7989298105239868, + "epoch": 0.10151700337910706, + "kl_loss": 9.945318222045898, + "loss_ib": 0.11978112906217575, + "step": 353 + }, + { + "ce_ib": 14.250055313110352, + "ce_orig": 0.6734949350357056, + "epoch": 0.10151700337910706, + "kl_loss": 9.17054557800293, + "loss_ib": 0.10595551133155823, + "step": 353 + }, + { + "ce_ib": 15.82697868347168, + "ce_orig": 0.9444003701210022, + "epoch": 0.1018045869580847, + "kl_loss": 9.11765193939209, + "loss_ib": 0.10700349509716034, + "step": 354 + }, + { + "ce_ib": 15.842026710510254, + "ce_orig": 0.6050642132759094, + "epoch": 0.1018045869580847, + "kl_loss": 8.850725173950195, + "loss_ib": 0.1043492779135704, + "step": 354 + }, + { + "ce_ib": 18.677658081054688, + "ce_orig": 0.8625993728637695, + "epoch": 0.1018045869580847, + "kl_loss": 9.405292510986328, + "loss_ib": 0.11273057758808136, + "step": 354 + }, + { + "ce_ib": 13.065309524536133, + "ce_orig": 0.39125949144363403, + "epoch": 0.1018045869580847, + "kl_loss": 6.584395408630371, + "loss_ib": 0.0789092630147934, + "step": 354 + }, + { + "epoch": 0.10209217053706234, + "grad_norm": 0.49203693866729736, + "learning_rate": 9.999704871911289e-06, + "loss": 0.9968, + "step": 355 + }, + { + "ce_ib": 19.234111785888672, + "ce_orig": 0.9246622920036316, + "epoch": 0.10209217053706234, + "kl_loss": 8.509796142578125, + "loss_ib": 0.1043320745229721, + "step": 355 + }, + { + "ce_ib": 17.678333282470703, + "ce_orig": 0.8307465314865112, + "epoch": 0.10209217053706234, + "kl_loss": 8.600641250610352, + "loss_ib": 0.10368474572896957, + "step": 355 + }, + { + "ce_ib": 11.18205738067627, + "ce_orig": 0.1704210788011551, + "epoch": 0.10209217053706234, + "kl_loss": 10.59097957611084, + "loss_ib": 0.11709185689687729, + "step": 355 + }, + { + "ce_ib": 14.95598316192627, + "ce_orig": 0.8469982147216797, + "epoch": 0.10209217053706234, + "kl_loss": 9.722857475280762, + "loss_ib": 0.1121845617890358, + "step": 355 + }, + { + "ce_ib": 16.683542251586914, + "ce_orig": 1.2110188007354736, + "epoch": 0.10237975411603997, + "kl_loss": 8.89454460144043, + "loss_ib": 0.10562898218631744, + "step": 356 + }, + { + "ce_ib": 14.379767417907715, + "ce_orig": 0.6198569536209106, + "epoch": 0.10237975411603997, + "kl_loss": 10.061267852783203, + "loss_ib": 0.11499244719743729, + "step": 356 + }, + { + "ce_ib": 16.58348846435547, + "ce_orig": 1.1141889095306396, + "epoch": 0.10237975411603997, + "kl_loss": 9.346358299255371, + "loss_ib": 0.11004707217216492, + "step": 356 + }, + { + "ce_ib": 16.944780349731445, + "ce_orig": 0.8266158699989319, + "epoch": 0.10237975411603997, + "kl_loss": 9.04732894897461, + "loss_ib": 0.10741806775331497, + "step": 356 + }, + { + "ce_ib": 16.06398582458496, + "ce_orig": 0.904808759689331, + "epoch": 0.10266733769501761, + "kl_loss": 9.288269996643066, + "loss_ib": 0.10894668102264404, + "step": 357 + }, + { + "ce_ib": 18.518054962158203, + "ce_orig": 0.6518286466598511, + "epoch": 0.10266733769501761, + "kl_loss": 9.153616905212402, + "loss_ib": 0.11005422472953796, + "step": 357 + }, + { + "ce_ib": 13.961670875549316, + "ce_orig": 0.7661263942718506, + "epoch": 0.10266733769501761, + "kl_loss": 9.662349700927734, + "loss_ib": 0.11058516055345535, + "step": 357 + }, + { + "ce_ib": 16.693498611450195, + "ce_orig": 0.7039583325386047, + "epoch": 0.10266733769501761, + "kl_loss": 9.203071594238281, + "loss_ib": 0.1087242066860199, + "step": 357 + }, + { + "ce_ib": 14.941229820251465, + "ce_orig": 0.6703804135322571, + "epoch": 0.10295492127399525, + "kl_loss": 7.32505464553833, + "loss_ib": 0.08819177746772766, + "step": 358 + }, + { + "ce_ib": 10.472589492797852, + "ce_orig": 0.28409555554389954, + "epoch": 0.10295492127399525, + "kl_loss": 10.202686309814453, + "loss_ib": 0.11249945312738419, + "step": 358 + }, + { + "ce_ib": 18.69029998779297, + "ce_orig": 0.9642467498779297, + "epoch": 0.10295492127399525, + "kl_loss": 9.411130905151367, + "loss_ib": 0.11280160397291183, + "step": 358 + }, + { + "ce_ib": 17.212318420410156, + "ce_orig": 0.8138781785964966, + "epoch": 0.10295492127399525, + "kl_loss": 8.968740463256836, + "loss_ib": 0.10689971596002579, + "step": 358 + }, + { + "ce_ib": 17.45269203186035, + "ce_orig": 0.377750039100647, + "epoch": 0.1032425048529729, + "kl_loss": 6.718733787536621, + "loss_ib": 0.0846400260925293, + "step": 359 + }, + { + "ce_ib": 20.13898468017578, + "ce_orig": 1.5750316381454468, + "epoch": 0.1032425048529729, + "kl_loss": 9.422719955444336, + "loss_ib": 0.11436618864536285, + "step": 359 + }, + { + "ce_ib": 17.9411678314209, + "ce_orig": 0.8798750638961792, + "epoch": 0.1032425048529729, + "kl_loss": 9.073663711547852, + "loss_ib": 0.10867780447006226, + "step": 359 + }, + { + "ce_ib": 17.482316970825195, + "ce_orig": 1.371580958366394, + "epoch": 0.1032425048529729, + "kl_loss": 9.437213897705078, + "loss_ib": 0.11185445636510849, + "step": 359 + }, + { + "epoch": 0.10353008843195054, + "grad_norm": 0.40732964873313904, + "learning_rate": 9.999614527738882e-06, + "loss": 1.0384, + "step": 360 + }, + { + "ce_ib": 20.317060470581055, + "ce_orig": 0.6699705123901367, + "epoch": 0.10353008843195054, + "kl_loss": 8.502317428588867, + "loss_ib": 0.10534022748470306, + "step": 360 + }, + { + "ce_ib": 14.938705444335938, + "ce_orig": 0.696804404258728, + "epoch": 0.10353008843195054, + "kl_loss": 9.356005668640137, + "loss_ib": 0.10849875956773758, + "step": 360 + }, + { + "ce_ib": 15.835675239562988, + "ce_orig": 0.8596332669258118, + "epoch": 0.10353008843195054, + "kl_loss": 9.73591423034668, + "loss_ib": 0.11319481581449509, + "step": 360 + }, + { + "ce_ib": 15.533881187438965, + "ce_orig": 0.4826924502849579, + "epoch": 0.10353008843195054, + "kl_loss": 10.024049758911133, + "loss_ib": 0.11577437818050385, + "step": 360 + }, + { + "ce_ib": 12.633554458618164, + "ce_orig": 0.5069536566734314, + "epoch": 0.10381767201092817, + "kl_loss": 9.803174018859863, + "loss_ib": 0.11066529154777527, + "step": 361 + }, + { + "ce_ib": 13.828798294067383, + "ce_orig": 0.896979033946991, + "epoch": 0.10381767201092817, + "kl_loss": 8.719182968139648, + "loss_ib": 0.10102062672376633, + "step": 361 + }, + { + "ce_ib": 21.034914016723633, + "ce_orig": 1.8323390483856201, + "epoch": 0.10381767201092817, + "kl_loss": 9.037761688232422, + "loss_ib": 0.11141253262758255, + "step": 361 + }, + { + "ce_ib": 16.179244995117188, + "ce_orig": 0.7684395909309387, + "epoch": 0.10381767201092817, + "kl_loss": 8.469200134277344, + "loss_ib": 0.100871242582798, + "step": 361 + }, + { + "ce_ib": 13.840865135192871, + "ce_orig": 1.0341031551361084, + "epoch": 0.10410525558990581, + "kl_loss": 7.858333110809326, + "loss_ib": 0.0924241915345192, + "step": 362 + }, + { + "ce_ib": 22.43819236755371, + "ce_orig": 1.788472294807434, + "epoch": 0.10410525558990581, + "kl_loss": 9.436954498291016, + "loss_ib": 0.1168077364563942, + "step": 362 + }, + { + "ce_ib": 19.555612564086914, + "ce_orig": 1.5130561590194702, + "epoch": 0.10410525558990581, + "kl_loss": 9.508270263671875, + "loss_ib": 0.1146383136510849, + "step": 362 + }, + { + "ce_ib": 13.82888412475586, + "ce_orig": 0.5086873769760132, + "epoch": 0.10410525558990581, + "kl_loss": 9.382222175598145, + "loss_ib": 0.10765110701322556, + "step": 362 + }, + { + "ce_ib": 16.548250198364258, + "ce_orig": 1.2047643661499023, + "epoch": 0.10439283916888345, + "kl_loss": 8.958712577819824, + "loss_ib": 0.10613537579774857, + "step": 363 + }, + { + "ce_ib": 15.741909980773926, + "ce_orig": 0.8534471392631531, + "epoch": 0.10439283916888345, + "kl_loss": 9.691909790039062, + "loss_ib": 0.11266100406646729, + "step": 363 + }, + { + "ce_ib": 21.730342864990234, + "ce_orig": 1.475590467453003, + "epoch": 0.10439283916888345, + "kl_loss": 9.107532501220703, + "loss_ib": 0.11280567198991776, + "step": 363 + }, + { + "ce_ib": 13.439830780029297, + "ce_orig": 0.6186426877975464, + "epoch": 0.10439283916888345, + "kl_loss": 9.766258239746094, + "loss_ib": 0.11110240966081619, + "step": 363 + }, + { + "ce_ib": 16.56508445739746, + "ce_orig": 0.9517434239387512, + "epoch": 0.1046804227478611, + "kl_loss": 9.201667785644531, + "loss_ib": 0.10858175903558731, + "step": 364 + }, + { + "ce_ib": 15.66641902923584, + "ce_orig": 0.1410691887140274, + "epoch": 0.1046804227478611, + "kl_loss": 12.184783935546875, + "loss_ib": 0.13751424849033356, + "step": 364 + }, + { + "ce_ib": 16.9328670501709, + "ce_orig": 0.7690316438674927, + "epoch": 0.1046804227478611, + "kl_loss": 7.339565753936768, + "loss_ib": 0.09032852202653885, + "step": 364 + }, + { + "ce_ib": 19.301227569580078, + "ce_orig": 1.541357159614563, + "epoch": 0.1046804227478611, + "kl_loss": 8.6048583984375, + "loss_ib": 0.10534980893135071, + "step": 364 + }, + { + "epoch": 0.10496800632683874, + "grad_norm": 0.35407018661499023, + "learning_rate": 9.99951213833464e-06, + "loss": 1.0547, + "step": 365 + }, + { + "ce_ib": 16.928804397583008, + "ce_orig": 0.898991048336029, + "epoch": 0.10496800632683874, + "kl_loss": 8.901844024658203, + "loss_ib": 0.10594724118709564, + "step": 365 + }, + { + "ce_ib": 12.694430351257324, + "ce_orig": 0.6792229413986206, + "epoch": 0.10496800632683874, + "kl_loss": 9.286140441894531, + "loss_ib": 0.10555583983659744, + "step": 365 + }, + { + "ce_ib": 19.038597106933594, + "ce_orig": 1.1029527187347412, + "epoch": 0.10496800632683874, + "kl_loss": 8.959847450256348, + "loss_ib": 0.10863707214593887, + "step": 365 + }, + { + "ce_ib": 14.017401695251465, + "ce_orig": 0.8020762205123901, + "epoch": 0.10496800632683874, + "kl_loss": 10.084455490112305, + "loss_ib": 0.11486195027828217, + "step": 365 + }, + { + "ce_ib": 17.723241806030273, + "ce_orig": 1.3404425382614136, + "epoch": 0.10525558990581638, + "kl_loss": 8.840962409973145, + "loss_ib": 0.1061328649520874, + "step": 366 + }, + { + "ce_ib": 13.039340019226074, + "ce_orig": 0.7993932962417603, + "epoch": 0.10525558990581638, + "kl_loss": 8.735919952392578, + "loss_ib": 0.10039854049682617, + "step": 366 + }, + { + "ce_ib": 15.478903770446777, + "ce_orig": 0.7874028086662292, + "epoch": 0.10525558990581638, + "kl_loss": 8.431257247924805, + "loss_ib": 0.09979147464036942, + "step": 366 + }, + { + "ce_ib": 14.608510971069336, + "ce_orig": 0.8104147911071777, + "epoch": 0.10525558990581638, + "kl_loss": 8.885534286499023, + "loss_ib": 0.10346385091543198, + "step": 366 + }, + { + "ce_ib": 15.228021621704102, + "ce_orig": 0.8766224384307861, + "epoch": 0.10554317348479401, + "kl_loss": 9.194426536560059, + "loss_ib": 0.10717228800058365, + "step": 367 + }, + { + "ce_ib": 15.027702331542969, + "ce_orig": 0.7485743165016174, + "epoch": 0.10554317348479401, + "kl_loss": 8.38540267944336, + "loss_ib": 0.09888172894716263, + "step": 367 + }, + { + "ce_ib": 17.5020694732666, + "ce_orig": 1.4059276580810547, + "epoch": 0.10554317348479401, + "kl_loss": 9.084081649780273, + "loss_ib": 0.10834288597106934, + "step": 367 + }, + { + "ce_ib": 17.624956130981445, + "ce_orig": 1.1633917093276978, + "epoch": 0.10554317348479401, + "kl_loss": 9.179400444030762, + "loss_ib": 0.10941895842552185, + "step": 367 + }, + { + "ce_ib": 17.59357452392578, + "ce_orig": 1.2003300189971924, + "epoch": 0.10583075706377165, + "kl_loss": 8.543558120727539, + "loss_ib": 0.10302915424108505, + "step": 368 + }, + { + "ce_ib": 18.518356323242188, + "ce_orig": 1.2260750532150269, + "epoch": 0.10583075706377165, + "kl_loss": 8.891761779785156, + "loss_ib": 0.10743597149848938, + "step": 368 + }, + { + "ce_ib": 17.378921508789062, + "ce_orig": 1.023598551750183, + "epoch": 0.10583075706377165, + "kl_loss": 8.479592323303223, + "loss_ib": 0.10217484086751938, + "step": 368 + }, + { + "ce_ib": 19.736833572387695, + "ce_orig": 1.5643643140792847, + "epoch": 0.10583075706377165, + "kl_loss": 8.811531066894531, + "loss_ib": 0.10785213857889175, + "step": 368 + }, + { + "ce_ib": 13.935962677001953, + "ce_orig": 0.5430191159248352, + "epoch": 0.1061183406427493, + "kl_loss": 9.201333999633789, + "loss_ib": 0.10594930499792099, + "step": 369 + }, + { + "ce_ib": 17.181163787841797, + "ce_orig": 1.3840625286102295, + "epoch": 0.1061183406427493, + "kl_loss": 8.802513122558594, + "loss_ib": 0.10520629584789276, + "step": 369 + }, + { + "ce_ib": 14.592924118041992, + "ce_orig": 0.8257763385772705, + "epoch": 0.1061183406427493, + "kl_loss": 8.735795974731445, + "loss_ib": 0.10195088386535645, + "step": 369 + }, + { + "ce_ib": 15.771759986877441, + "ce_orig": 0.7309710383415222, + "epoch": 0.1061183406427493, + "kl_loss": 9.097509384155273, + "loss_ib": 0.1067468523979187, + "step": 369 + }, + { + "epoch": 0.10640592422172694, + "grad_norm": 0.4832640290260315, + "learning_rate": 9.999397703945243e-06, + "loss": 1.0498, + "step": 370 + }, + { + "ce_ib": 15.806496620178223, + "ce_orig": 0.879632294178009, + "epoch": 0.10640592422172694, + "kl_loss": 10.071746826171875, + "loss_ib": 0.11652395874261856, + "step": 370 + }, + { + "ce_ib": 15.620360374450684, + "ce_orig": 0.6177505254745483, + "epoch": 0.10640592422172694, + "kl_loss": 8.700934410095215, + "loss_ib": 0.10262969881296158, + "step": 370 + }, + { + "ce_ib": 14.942098617553711, + "ce_orig": 0.6238611936569214, + "epoch": 0.10640592422172694, + "kl_loss": 9.041037559509277, + "loss_ib": 0.1053524762392044, + "step": 370 + }, + { + "ce_ib": 13.575098991394043, + "ce_orig": 0.7554785013198853, + "epoch": 0.10640592422172694, + "kl_loss": 8.968904495239258, + "loss_ib": 0.10326413810253143, + "step": 370 + }, + { + "ce_ib": 10.327584266662598, + "ce_orig": 0.23832768201828003, + "epoch": 0.10669350780070458, + "kl_loss": 7.03424072265625, + "loss_ib": 0.08066999167203903, + "step": 371 + }, + { + "ce_ib": 19.388700485229492, + "ce_orig": 1.3649802207946777, + "epoch": 0.10669350780070458, + "kl_loss": 9.210512161254883, + "loss_ib": 0.11149382591247559, + "step": 371 + }, + { + "ce_ib": 11.52276611328125, + "ce_orig": 0.7070875763893127, + "epoch": 0.10669350780070458, + "kl_loss": 8.641892433166504, + "loss_ib": 0.09794168919324875, + "step": 371 + }, + { + "ce_ib": 19.951656341552734, + "ce_orig": 1.3112847805023193, + "epoch": 0.10669350780070458, + "kl_loss": 9.238996505737305, + "loss_ib": 0.11234162002801895, + "step": 371 + }, + { + "ce_ib": 19.08662986755371, + "ce_orig": 1.517033338546753, + "epoch": 0.10698109137968222, + "kl_loss": 9.000858306884766, + "loss_ib": 0.10909520834684372, + "step": 372 + }, + { + "ce_ib": 13.207756996154785, + "ce_orig": 0.596856951713562, + "epoch": 0.10698109137968222, + "kl_loss": 8.860549926757812, + "loss_ib": 0.10181325674057007, + "step": 372 + }, + { + "ce_ib": 19.195131301879883, + "ce_orig": 0.9714540839195251, + "epoch": 0.10698109137968222, + "kl_loss": 9.710214614868164, + "loss_ib": 0.11629726737737656, + "step": 372 + }, + { + "ce_ib": 17.687284469604492, + "ce_orig": 1.5548173189163208, + "epoch": 0.10698109137968222, + "kl_loss": 8.532265663146973, + "loss_ib": 0.10300993919372559, + "step": 372 + }, + { + "ce_ib": 18.050241470336914, + "ce_orig": 0.8549476265907288, + "epoch": 0.10726867495865985, + "kl_loss": 8.890786170959473, + "loss_ib": 0.10695809870958328, + "step": 373 + }, + { + "ce_ib": 15.486068725585938, + "ce_orig": 0.7641202807426453, + "epoch": 0.10726867495865985, + "kl_loss": 8.97690486907959, + "loss_ib": 0.1052551120519638, + "step": 373 + }, + { + "ce_ib": 20.0921573638916, + "ce_orig": 1.5870574712753296, + "epoch": 0.10726867495865985, + "kl_loss": 8.584416389465332, + "loss_ib": 0.10593631863594055, + "step": 373 + }, + { + "ce_ib": 14.03450870513916, + "ce_orig": 0.8633349537849426, + "epoch": 0.10726867495865985, + "kl_loss": 9.580089569091797, + "loss_ib": 0.10983540117740631, + "step": 373 + }, + { + "ce_ib": 18.23748779296875, + "ce_orig": 0.9740087985992432, + "epoch": 0.1075562585376375, + "kl_loss": 8.163890838623047, + "loss_ib": 0.09987638890743256, + "step": 374 + }, + { + "ce_ib": 16.242849349975586, + "ce_orig": 0.9537095427513123, + "epoch": 0.1075562585376375, + "kl_loss": 8.648405075073242, + "loss_ib": 0.10272689908742905, + "step": 374 + }, + { + "ce_ib": 15.156339645385742, + "ce_orig": 1.0340036153793335, + "epoch": 0.1075562585376375, + "kl_loss": 8.221203804016113, + "loss_ib": 0.09736837446689606, + "step": 374 + }, + { + "ce_ib": 16.337060928344727, + "ce_orig": 1.1412116289138794, + "epoch": 0.1075562585376375, + "kl_loss": 8.982345581054688, + "loss_ib": 0.10616051405668259, + "step": 374 + }, + { + "epoch": 0.10784384211661514, + "grad_norm": 0.609579861164093, + "learning_rate": 9.999271224846397e-06, + "loss": 1.0013, + "step": 375 + }, + { + "ce_ib": 9.566058158874512, + "ce_orig": 0.2985042929649353, + "epoch": 0.10784384211661514, + "kl_loss": 7.494280815124512, + "loss_ib": 0.08450886607170105, + "step": 375 + }, + { + "ce_ib": 12.867706298828125, + "ce_orig": 0.5363652110099792, + "epoch": 0.10784384211661514, + "kl_loss": 8.874717712402344, + "loss_ib": 0.10161488503217697, + "step": 375 + }, + { + "ce_ib": 14.6071195602417, + "ce_orig": 1.2598626613616943, + "epoch": 0.10784384211661514, + "kl_loss": 8.637162208557129, + "loss_ib": 0.10097873210906982, + "step": 375 + }, + { + "ce_ib": 17.33331871032715, + "ce_orig": 0.6892062425613403, + "epoch": 0.10784384211661514, + "kl_loss": 8.592702865600586, + "loss_ib": 0.1032603457570076, + "step": 375 + }, + { + "ce_ib": 15.182628631591797, + "ce_orig": 0.8526699542999268, + "epoch": 0.10813142569559278, + "kl_loss": 8.426027297973633, + "loss_ib": 0.09944289922714233, + "step": 376 + }, + { + "ce_ib": 18.341575622558594, + "ce_orig": 0.621722400188446, + "epoch": 0.10813142569559278, + "kl_loss": 7.857001304626465, + "loss_ib": 0.09691158682107925, + "step": 376 + }, + { + "ce_ib": 14.127799987792969, + "ce_orig": 0.7635291218757629, + "epoch": 0.10813142569559278, + "kl_loss": 8.97607421875, + "loss_ib": 0.10388854146003723, + "step": 376 + }, + { + "ce_ib": 16.845251083374023, + "ce_orig": 1.1074169874191284, + "epoch": 0.10813142569559278, + "kl_loss": 8.656087875366211, + "loss_ib": 0.10340613126754761, + "step": 376 + }, + { + "ce_ib": 16.19705581665039, + "ce_orig": 0.8272833824157715, + "epoch": 0.10841900927457042, + "kl_loss": 8.441411972045898, + "loss_ib": 0.10061117261648178, + "step": 377 + }, + { + "ce_ib": 11.034353256225586, + "ce_orig": 0.6063670516014099, + "epoch": 0.10841900927457042, + "kl_loss": 9.254929542541504, + "loss_ib": 0.10358364880084991, + "step": 377 + }, + { + "ce_ib": 16.136695861816406, + "ce_orig": 0.9638150930404663, + "epoch": 0.10841900927457042, + "kl_loss": 8.382518768310547, + "loss_ib": 0.09996187686920166, + "step": 377 + }, + { + "ce_ib": 14.821840286254883, + "ce_orig": 1.11579430103302, + "epoch": 0.10841900927457042, + "kl_loss": 8.752357482910156, + "loss_ib": 0.10234541445970535, + "step": 377 + }, + { + "ce_ib": 14.535453796386719, + "ce_orig": 0.5030508041381836, + "epoch": 0.10870659285354806, + "kl_loss": 8.088106155395508, + "loss_ib": 0.09541651606559753, + "step": 378 + }, + { + "ce_ib": 13.514139175415039, + "ce_orig": 0.43041279911994934, + "epoch": 0.10870659285354806, + "kl_loss": 8.524757385253906, + "loss_ib": 0.09876170754432678, + "step": 378 + }, + { + "ce_ib": 13.1725435256958, + "ce_orig": 0.5961971282958984, + "epoch": 0.10870659285354806, + "kl_loss": 8.799945831298828, + "loss_ib": 0.10117200016975403, + "step": 378 + }, + { + "ce_ib": 19.455663681030273, + "ce_orig": 1.371256947517395, + "epoch": 0.10870659285354806, + "kl_loss": 7.995181083679199, + "loss_ib": 0.09940747171640396, + "step": 378 + }, + { + "ce_ib": 13.492290496826172, + "ce_orig": 0.9726055264472961, + "epoch": 0.10899417643252571, + "kl_loss": 8.796764373779297, + "loss_ib": 0.10145992785692215, + "step": 379 + }, + { + "ce_ib": 11.911052703857422, + "ce_orig": 0.7123900651931763, + "epoch": 0.10899417643252571, + "kl_loss": 8.622007369995117, + "loss_ib": 0.09813112020492554, + "step": 379 + }, + { + "ce_ib": 15.950429916381836, + "ce_orig": 0.5447075963020325, + "epoch": 0.10899417643252571, + "kl_loss": 8.17884635925293, + "loss_ib": 0.09773889183998108, + "step": 379 + }, + { + "ce_ib": 17.951889038085938, + "ce_orig": 0.9959045052528381, + "epoch": 0.10899417643252571, + "kl_loss": 8.3568696975708, + "loss_ib": 0.1015205830335617, + "step": 379 + }, + { + "epoch": 0.10928176001150335, + "grad_norm": 0.6501461863517761, + "learning_rate": 9.99913270134281e-06, + "loss": 0.9988, + "step": 380 + }, + { + "ce_ib": 12.876981735229492, + "ce_orig": 0.5673431754112244, + "epoch": 0.10928176001150335, + "kl_loss": 7.431344032287598, + "loss_ib": 0.0871904194355011, + "step": 380 + }, + { + "ce_ib": 15.092470169067383, + "ce_orig": 0.2880413830280304, + "epoch": 0.10928176001150335, + "kl_loss": 8.873856544494629, + "loss_ib": 0.10383103042840958, + "step": 380 + }, + { + "ce_ib": 12.99868392944336, + "ce_orig": 0.5703913569450378, + "epoch": 0.10928176001150335, + "kl_loss": 8.380903244018555, + "loss_ib": 0.09680771827697754, + "step": 380 + }, + { + "ce_ib": 13.363186836242676, + "ce_orig": 0.5209031701087952, + "epoch": 0.10928176001150335, + "kl_loss": 8.865699768066406, + "loss_ib": 0.10202018171548843, + "step": 380 + }, + { + "ce_ib": 20.89650535583496, + "ce_orig": 1.6800005435943604, + "epoch": 0.10956934359048098, + "kl_loss": 8.10032844543457, + "loss_ib": 0.10189979523420334, + "step": 381 + }, + { + "ce_ib": 17.024986267089844, + "ce_orig": 0.6650580167770386, + "epoch": 0.10956934359048098, + "kl_loss": 7.477260112762451, + "loss_ib": 0.0917975902557373, + "step": 381 + }, + { + "ce_ib": 21.227222442626953, + "ce_orig": 1.5764412879943848, + "epoch": 0.10956934359048098, + "kl_loss": 8.041464805603027, + "loss_ib": 0.10164187103509903, + "step": 381 + }, + { + "ce_ib": 18.747699737548828, + "ce_orig": 1.2042864561080933, + "epoch": 0.10956934359048098, + "kl_loss": 7.809324264526367, + "loss_ib": 0.09684094041585922, + "step": 381 + }, + { + "ce_ib": 13.682280540466309, + "ce_orig": 0.604314386844635, + "epoch": 0.10985692716945862, + "kl_loss": 8.266580581665039, + "loss_ib": 0.09634808450937271, + "step": 382 + }, + { + "ce_ib": 11.598891258239746, + "ce_orig": 0.7977884411811829, + "epoch": 0.10985692716945862, + "kl_loss": 8.074966430664062, + "loss_ib": 0.09234855324029922, + "step": 382 + }, + { + "ce_ib": 10.722358703613281, + "ce_orig": 0.488436758518219, + "epoch": 0.10985692716945862, + "kl_loss": 8.471232414245605, + "loss_ib": 0.09543468058109283, + "step": 382 + }, + { + "ce_ib": 20.917461395263672, + "ce_orig": 1.5944008827209473, + "epoch": 0.10985692716945862, + "kl_loss": 8.631521224975586, + "loss_ib": 0.10723267495632172, + "step": 382 + }, + { + "ce_ib": 14.213534355163574, + "ce_orig": 0.9382426142692566, + "epoch": 0.11014451074843626, + "kl_loss": 8.12414836883545, + "loss_ib": 0.09545501321554184, + "step": 383 + }, + { + "ce_ib": 10.728821754455566, + "ce_orig": 0.7022181749343872, + "epoch": 0.11014451074843626, + "kl_loss": 8.53184700012207, + "loss_ib": 0.0960472822189331, + "step": 383 + }, + { + "ce_ib": 10.197715759277344, + "ce_orig": 0.512974202632904, + "epoch": 0.11014451074843626, + "kl_loss": 8.536653518676758, + "loss_ib": 0.09556424617767334, + "step": 383 + }, + { + "ce_ib": 12.831421852111816, + "ce_orig": 0.6488183736801147, + "epoch": 0.11014451074843626, + "kl_loss": 8.425491333007812, + "loss_ib": 0.0970863327383995, + "step": 383 + }, + { + "ce_ib": 13.790513038635254, + "ce_orig": 0.48860305547714233, + "epoch": 0.11043209432741391, + "kl_loss": 8.638005256652832, + "loss_ib": 0.1001705601811409, + "step": 384 + }, + { + "ce_ib": 18.993335723876953, + "ce_orig": 1.300746202468872, + "epoch": 0.11043209432741391, + "kl_loss": 7.856682777404785, + "loss_ib": 0.09756016731262207, + "step": 384 + }, + { + "ce_ib": 14.618206024169922, + "ce_orig": 0.6457257270812988, + "epoch": 0.11043209432741391, + "kl_loss": 5.0488386154174805, + "loss_ib": 0.0651065930724144, + "step": 384 + }, + { + "ce_ib": 13.924184799194336, + "ce_orig": 0.7180906534194946, + "epoch": 0.11043209432741391, + "kl_loss": 8.496256828308105, + "loss_ib": 0.09888675808906555, + "step": 384 + }, + { + "epoch": 0.11071967790639155, + "grad_norm": 0.7011797428131104, + "learning_rate": 9.998982133768226e-06, + "loss": 0.9557, + "step": 385 + }, + { + "ce_ib": 15.821268081665039, + "ce_orig": 1.4919018745422363, + "epoch": 0.11071967790639155, + "kl_loss": 8.213622093200684, + "loss_ib": 0.09795748442411423, + "step": 385 + }, + { + "ce_ib": 14.86267375946045, + "ce_orig": 1.0427626371383667, + "epoch": 0.11071967790639155, + "kl_loss": 7.624295234680176, + "loss_ib": 0.091105617582798, + "step": 385 + }, + { + "ce_ib": 12.792627334594727, + "ce_orig": 0.644935131072998, + "epoch": 0.11071967790639155, + "kl_loss": 8.272308349609375, + "loss_ib": 0.09551570564508438, + "step": 385 + }, + { + "ce_ib": 13.713454246520996, + "ce_orig": 0.8965685963630676, + "epoch": 0.11071967790639155, + "kl_loss": 9.92483901977539, + "loss_ib": 0.11296184360980988, + "step": 385 + }, + { + "ce_ib": 16.306076049804688, + "ce_orig": 0.7789519429206848, + "epoch": 0.11100726148536919, + "kl_loss": 7.479434967041016, + "loss_ib": 0.09110042452812195, + "step": 386 + }, + { + "ce_ib": 15.762940406799316, + "ce_orig": 0.8707707524299622, + "epoch": 0.11100726148536919, + "kl_loss": 7.34848165512085, + "loss_ib": 0.08924775570631027, + "step": 386 + }, + { + "ce_ib": 6.3066205978393555, + "ce_orig": 0.19392843544483185, + "epoch": 0.11100726148536919, + "kl_loss": 5.0305914878845215, + "loss_ib": 0.0566125325858593, + "step": 386 + }, + { + "ce_ib": 14.033432006835938, + "ce_orig": 0.7708988189697266, + "epoch": 0.11100726148536919, + "kl_loss": 8.070176124572754, + "loss_ib": 0.09473519027233124, + "step": 386 + }, + { + "ce_ib": 16.794458389282227, + "ce_orig": 0.8729531168937683, + "epoch": 0.11129484506434682, + "kl_loss": 8.056872367858887, + "loss_ib": 0.09736318141222, + "step": 387 + }, + { + "ce_ib": 14.828986167907715, + "ce_orig": 1.121248483657837, + "epoch": 0.11129484506434682, + "kl_loss": 7.610101699829102, + "loss_ib": 0.09092999994754791, + "step": 387 + }, + { + "ce_ib": 13.89840030670166, + "ce_orig": 0.740386426448822, + "epoch": 0.11129484506434682, + "kl_loss": 7.463097095489502, + "loss_ib": 0.08852936327457428, + "step": 387 + }, + { + "ce_ib": 12.567804336547852, + "ce_orig": 0.956537663936615, + "epoch": 0.11129484506434682, + "kl_loss": 7.956699848175049, + "loss_ib": 0.09213479608297348, + "step": 387 + }, + { + "ce_ib": 11.099259376525879, + "ce_orig": 0.6955797076225281, + "epoch": 0.11158242864332446, + "kl_loss": 8.742734909057617, + "loss_ib": 0.09852661192417145, + "step": 388 + }, + { + "ce_ib": 15.241199493408203, + "ce_orig": 0.7828308939933777, + "epoch": 0.11158242864332446, + "kl_loss": 8.080060005187988, + "loss_ib": 0.09604179859161377, + "step": 388 + }, + { + "ce_ib": 12.71835994720459, + "ce_orig": 0.5617780685424805, + "epoch": 0.11158242864332446, + "kl_loss": 7.975733280181885, + "loss_ib": 0.09247568994760513, + "step": 388 + }, + { + "ce_ib": 15.852190971374512, + "ce_orig": 0.9822865128517151, + "epoch": 0.11158242864332446, + "kl_loss": 7.865725040435791, + "loss_ib": 0.09450943768024445, + "step": 388 + }, + { + "ce_ib": 15.686773300170898, + "ce_orig": 0.8103384971618652, + "epoch": 0.11187001222230211, + "kl_loss": 7.693680763244629, + "loss_ib": 0.09262357652187347, + "step": 389 + }, + { + "ce_ib": 15.016551971435547, + "ce_orig": 0.9884656071662903, + "epoch": 0.11187001222230211, + "kl_loss": 7.716882228851318, + "loss_ib": 0.0921853706240654, + "step": 389 + }, + { + "ce_ib": 18.931434631347656, + "ce_orig": 1.5234384536743164, + "epoch": 0.11187001222230211, + "kl_loss": 7.528309345245361, + "loss_ib": 0.09421452134847641, + "step": 389 + }, + { + "ce_ib": 17.58110237121582, + "ce_orig": 1.5781915187835693, + "epoch": 0.11187001222230211, + "kl_loss": 7.889880180358887, + "loss_ib": 0.09647990018129349, + "step": 389 + }, + { + "epoch": 0.11215759580127975, + "grad_norm": 0.6288163661956787, + "learning_rate": 9.998819522485392e-06, + "loss": 1.0119, + "step": 390 + }, + { + "ce_ib": 15.403480529785156, + "ce_orig": 0.7215459942817688, + "epoch": 0.11215759580127975, + "kl_loss": 8.92459774017334, + "loss_ib": 0.10464945435523987, + "step": 390 + }, + { + "ce_ib": 12.422520637512207, + "ce_orig": 0.38442984223365784, + "epoch": 0.11215759580127975, + "kl_loss": 7.306635856628418, + "loss_ib": 0.08548887819051743, + "step": 390 + }, + { + "ce_ib": 16.295055389404297, + "ce_orig": 0.8220521211624146, + "epoch": 0.11215759580127975, + "kl_loss": 7.3963141441345215, + "loss_ib": 0.09025819599628448, + "step": 390 + }, + { + "ce_ib": 13.246119499206543, + "ce_orig": 0.6906241774559021, + "epoch": 0.11215759580127975, + "kl_loss": 7.956465721130371, + "loss_ib": 0.09281077235937119, + "step": 390 + }, + { + "ce_ib": 13.792742729187012, + "ce_orig": 0.844266414642334, + "epoch": 0.11244517938025739, + "kl_loss": 8.29489517211914, + "loss_ib": 0.0967416912317276, + "step": 391 + }, + { + "ce_ib": 16.884506225585938, + "ce_orig": 0.9576941132545471, + "epoch": 0.11244517938025739, + "kl_loss": 7.322904586791992, + "loss_ib": 0.0901135504245758, + "step": 391 + }, + { + "ce_ib": 12.04941463470459, + "ce_orig": 0.6010465025901794, + "epoch": 0.11244517938025739, + "kl_loss": 7.9065093994140625, + "loss_ib": 0.09111450612545013, + "step": 391 + }, + { + "ce_ib": 13.96172046661377, + "ce_orig": 0.6447573900222778, + "epoch": 0.11244517938025739, + "kl_loss": 7.576847076416016, + "loss_ib": 0.08973018079996109, + "step": 391 + }, + { + "ce_ib": 15.197918891906738, + "ce_orig": 0.7180730700492859, + "epoch": 0.11273276295923502, + "kl_loss": 7.893502235412598, + "loss_ib": 0.0941329374909401, + "step": 392 + }, + { + "ce_ib": 13.946671485900879, + "ce_orig": 0.7366077303886414, + "epoch": 0.11273276295923502, + "kl_loss": 7.63916015625, + "loss_ib": 0.09033826738595963, + "step": 392 + }, + { + "ce_ib": 14.26876163482666, + "ce_orig": 0.8613617420196533, + "epoch": 0.11273276295923502, + "kl_loss": 7.369016647338867, + "loss_ib": 0.087958924472332, + "step": 392 + }, + { + "ce_ib": 12.909405708312988, + "ce_orig": 0.7305828332901001, + "epoch": 0.11273276295923502, + "kl_loss": 7.314949035644531, + "loss_ib": 0.08605889230966568, + "step": 392 + }, + { + "ce_ib": 18.242956161499023, + "ce_orig": 1.3529711961746216, + "epoch": 0.11302034653821266, + "kl_loss": 7.389057636260986, + "loss_ib": 0.092133529484272, + "step": 393 + }, + { + "ce_ib": 9.846675872802734, + "ce_orig": 0.7093670964241028, + "epoch": 0.11302034653821266, + "kl_loss": 7.952066898345947, + "loss_ib": 0.0893673375248909, + "step": 393 + }, + { + "ce_ib": 12.509729385375977, + "ce_orig": 1.0103166103363037, + "epoch": 0.11302034653821266, + "kl_loss": 8.187524795532227, + "loss_ib": 0.09438497573137283, + "step": 393 + }, + { + "ce_ib": 6.737217426300049, + "ce_orig": 0.21716581284999847, + "epoch": 0.11302034653821266, + "kl_loss": 5.771838188171387, + "loss_ib": 0.06445559859275818, + "step": 393 + }, + { + "ce_ib": 12.970864295959473, + "ce_orig": 0.689153790473938, + "epoch": 0.11330793011719031, + "kl_loss": 7.806953430175781, + "loss_ib": 0.09104040265083313, + "step": 394 + }, + { + "ce_ib": 14.298376083374023, + "ce_orig": 0.6159449815750122, + "epoch": 0.11330793011719031, + "kl_loss": 7.474273204803467, + "loss_ib": 0.08904110640287399, + "step": 394 + }, + { + "ce_ib": 16.35755729675293, + "ce_orig": 1.128537893295288, + "epoch": 0.11330793011719031, + "kl_loss": 7.542243003845215, + "loss_ib": 0.09177998453378677, + "step": 394 + }, + { + "ce_ib": 16.245141983032227, + "ce_orig": 0.5840805768966675, + "epoch": 0.11330793011719031, + "kl_loss": 7.685763835906982, + "loss_ib": 0.09310277551412582, + "step": 394 + }, + { + "epoch": 0.11359551369616795, + "grad_norm": 0.43641915917396545, + "learning_rate": 9.998644867886077e-06, + "loss": 0.9567, + "step": 395 + }, + { + "ce_ib": 14.849902153015137, + "ce_orig": 0.9369897842407227, + "epoch": 0.11359551369616795, + "kl_loss": 7.421789646148682, + "loss_ib": 0.08906780183315277, + "step": 395 + }, + { + "ce_ib": 12.70493221282959, + "ce_orig": 0.7189629673957825, + "epoch": 0.11359551369616795, + "kl_loss": 7.841180801391602, + "loss_ib": 0.09111674129962921, + "step": 395 + }, + { + "ce_ib": 16.844697952270508, + "ce_orig": 0.6727548241615295, + "epoch": 0.11359551369616795, + "kl_loss": 6.990694522857666, + "loss_ib": 0.08675163984298706, + "step": 395 + }, + { + "ce_ib": 14.839481353759766, + "ce_orig": 1.255350947380066, + "epoch": 0.11359551369616795, + "kl_loss": 7.494076728820801, + "loss_ib": 0.08978024125099182, + "step": 395 + }, + { + "ce_ib": 16.765884399414062, + "ce_orig": 1.331714153289795, + "epoch": 0.11388309727514559, + "kl_loss": 7.407535552978516, + "loss_ib": 0.09084123373031616, + "step": 396 + }, + { + "ce_ib": 10.355143547058105, + "ce_orig": 0.6202336549758911, + "epoch": 0.11388309727514559, + "kl_loss": 7.605844497680664, + "loss_ib": 0.08641359210014343, + "step": 396 + }, + { + "ce_ib": 14.112444877624512, + "ce_orig": 0.8373818397521973, + "epoch": 0.11388309727514559, + "kl_loss": 7.473760604858398, + "loss_ib": 0.08885005116462708, + "step": 396 + }, + { + "ce_ib": 13.782599449157715, + "ce_orig": 0.816245973110199, + "epoch": 0.11388309727514559, + "kl_loss": 7.807076454162598, + "loss_ib": 0.09185335785150528, + "step": 396 + }, + { + "ce_ib": 15.40560245513916, + "ce_orig": 1.211715579032898, + "epoch": 0.11417068085412323, + "kl_loss": 7.456460475921631, + "loss_ib": 0.08997020870447159, + "step": 397 + }, + { + "ce_ib": 9.293402671813965, + "ce_orig": 0.4598255753517151, + "epoch": 0.11417068085412323, + "kl_loss": 8.316404342651367, + "loss_ib": 0.09245744347572327, + "step": 397 + }, + { + "ce_ib": 14.194599151611328, + "ce_orig": 0.8155964612960815, + "epoch": 0.11417068085412323, + "kl_loss": 7.829108715057373, + "loss_ib": 0.09248568117618561, + "step": 397 + }, + { + "ce_ib": 12.846905708312988, + "ce_orig": 0.8388014435768127, + "epoch": 0.11417068085412323, + "kl_loss": 7.558581829071045, + "loss_ib": 0.08843272179365158, + "step": 397 + }, + { + "ce_ib": 10.11628532409668, + "ce_orig": 0.6828235983848572, + "epoch": 0.11445826443310086, + "kl_loss": 7.854046821594238, + "loss_ib": 0.08865674585103989, + "step": 398 + }, + { + "ce_ib": 9.932538032531738, + "ce_orig": 0.49403539299964905, + "epoch": 0.11445826443310086, + "kl_loss": 8.248268127441406, + "loss_ib": 0.0924152135848999, + "step": 398 + }, + { + "ce_ib": 13.556487083435059, + "ce_orig": 0.49110883474349976, + "epoch": 0.11445826443310086, + "kl_loss": 7.1056108474731445, + "loss_ib": 0.08461259305477142, + "step": 398 + }, + { + "ce_ib": 17.799386978149414, + "ce_orig": 1.120202660560608, + "epoch": 0.11445826443310086, + "kl_loss": 7.619751930236816, + "loss_ib": 0.09399690479040146, + "step": 398 + }, + { + "ce_ib": 18.385801315307617, + "ce_orig": 1.4876097440719604, + "epoch": 0.11474584801207852, + "kl_loss": 7.5165114402771, + "loss_ib": 0.093550905585289, + "step": 399 + }, + { + "ce_ib": 14.88972282409668, + "ce_orig": 1.251899003982544, + "epoch": 0.11474584801207852, + "kl_loss": 7.511725902557373, + "loss_ib": 0.09000697731971741, + "step": 399 + }, + { + "ce_ib": 11.505959510803223, + "ce_orig": 0.5990752577781677, + "epoch": 0.11474584801207852, + "kl_loss": 7.720244884490967, + "loss_ib": 0.08870840817689896, + "step": 399 + }, + { + "ce_ib": 14.54392147064209, + "ce_orig": 0.8790196180343628, + "epoch": 0.11474584801207852, + "kl_loss": 7.9987359046936035, + "loss_ib": 0.09453127533197403, + "step": 399 + }, + { + "epoch": 0.11503343159105615, + "grad_norm": 0.5361573100090027, + "learning_rate": 9.998458170391065e-06, + "loss": 0.9792, + "step": 400 + }, + { + "ce_ib": 18.511137008666992, + "ce_orig": 0.8711093664169312, + "epoch": 0.11503343159105615, + "kl_loss": 7.370296478271484, + "loss_ib": 0.09221409261226654, + "step": 400 + }, + { + "ce_ib": 12.954771995544434, + "ce_orig": 0.9472190141677856, + "epoch": 0.11503343159105615, + "kl_loss": 7.668916702270508, + "loss_ib": 0.0896439328789711, + "step": 400 + }, + { + "ce_ib": 9.006753921508789, + "ce_orig": 0.6201379299163818, + "epoch": 0.11503343159105615, + "kl_loss": 7.630331516265869, + "loss_ib": 0.08531006425619125, + "step": 400 + }, + { + "ce_ib": 15.421821594238281, + "ce_orig": 1.425597906112671, + "epoch": 0.11503343159105615, + "kl_loss": 7.540129661560059, + "loss_ib": 0.09082311391830444, + "step": 400 + }, + { + "ce_ib": 15.149130821228027, + "ce_orig": 1.0793287754058838, + "epoch": 0.11532101517003379, + "kl_loss": 7.484195232391357, + "loss_ib": 0.08999107778072357, + "step": 401 + }, + { + "ce_ib": 10.156062126159668, + "ce_orig": 0.5667037963867188, + "epoch": 0.11532101517003379, + "kl_loss": 7.341686248779297, + "loss_ib": 0.08357291668653488, + "step": 401 + }, + { + "ce_ib": 11.375419616699219, + "ce_orig": 1.1060158014297485, + "epoch": 0.11532101517003379, + "kl_loss": 7.683910846710205, + "loss_ib": 0.08821453154087067, + "step": 401 + }, + { + "ce_ib": 17.671218872070312, + "ce_orig": 1.273245096206665, + "epoch": 0.11532101517003379, + "kl_loss": 7.472596645355225, + "loss_ib": 0.09239718317985535, + "step": 401 + }, + { + "ce_ib": 16.562593460083008, + "ce_orig": 1.0524271726608276, + "epoch": 0.11560859874901143, + "kl_loss": 7.498256683349609, + "loss_ib": 0.09154515713453293, + "step": 402 + }, + { + "ce_ib": 13.353780746459961, + "ce_orig": 0.8203778862953186, + "epoch": 0.11560859874901143, + "kl_loss": 7.896320343017578, + "loss_ib": 0.09231697767972946, + "step": 402 + }, + { + "ce_ib": 15.274881362915039, + "ce_orig": 1.007253885269165, + "epoch": 0.11560859874901143, + "kl_loss": 7.653887748718262, + "loss_ib": 0.09181375056505203, + "step": 402 + }, + { + "ce_ib": 16.868385314941406, + "ce_orig": 1.3746187686920166, + "epoch": 0.11560859874901143, + "kl_loss": 6.850212097167969, + "loss_ib": 0.0853705033659935, + "step": 402 + }, + { + "ce_ib": 14.26913833618164, + "ce_orig": 1.1488964557647705, + "epoch": 0.11589618232798907, + "kl_loss": 7.498937606811523, + "loss_ib": 0.08925851434469223, + "step": 403 + }, + { + "ce_ib": 13.987759590148926, + "ce_orig": 0.4179192781448364, + "epoch": 0.11589618232798907, + "kl_loss": 7.400528430938721, + "loss_ib": 0.08799304068088531, + "step": 403 + }, + { + "ce_ib": 13.340829849243164, + "ce_orig": 0.7595700025558472, + "epoch": 0.11589618232798907, + "kl_loss": 7.298229217529297, + "loss_ib": 0.08632311969995499, + "step": 403 + }, + { + "ce_ib": 12.933422088623047, + "ce_orig": 0.6614567041397095, + "epoch": 0.11589618232798907, + "kl_loss": 7.262064456939697, + "loss_ib": 0.08555406332015991, + "step": 403 + }, + { + "ce_ib": 12.648677825927734, + "ce_orig": 1.055274248123169, + "epoch": 0.11618376590696672, + "kl_loss": 7.3485426902771, + "loss_ib": 0.08613410592079163, + "step": 404 + }, + { + "ce_ib": 15.602824211120605, + "ce_orig": 1.1300698518753052, + "epoch": 0.11618376590696672, + "kl_loss": 7.553566932678223, + "loss_ib": 0.09113849699497223, + "step": 404 + }, + { + "ce_ib": 11.030848503112793, + "ce_orig": 0.6406192779541016, + "epoch": 0.11618376590696672, + "kl_loss": 7.551169395446777, + "loss_ib": 0.0865425392985344, + "step": 404 + }, + { + "ce_ib": 18.89850616455078, + "ce_orig": 1.4245729446411133, + "epoch": 0.11618376590696672, + "kl_loss": 7.921879768371582, + "loss_ib": 0.09811729937791824, + "step": 404 + }, + { + "epoch": 0.11647134948594436, + "grad_norm": 0.5651848316192627, + "learning_rate": 9.998259430450155e-06, + "loss": 1.0022, + "step": 405 + }, + { + "ce_ib": 14.010991096496582, + "ce_orig": 0.7675843834877014, + "epoch": 0.11647134948594436, + "kl_loss": 7.355520248413086, + "loss_ib": 0.08756618946790695, + "step": 405 + }, + { + "ce_ib": 16.283018112182617, + "ce_orig": 1.158023715019226, + "epoch": 0.11647134948594436, + "kl_loss": 7.45557975769043, + "loss_ib": 0.09083881229162216, + "step": 405 + }, + { + "ce_ib": 15.902323722839355, + "ce_orig": 1.0440561771392822, + "epoch": 0.11647134948594436, + "kl_loss": 7.513373851776123, + "loss_ib": 0.09103605896234512, + "step": 405 + }, + { + "ce_ib": 17.28444480895996, + "ce_orig": 1.34712815284729, + "epoch": 0.11647134948594436, + "kl_loss": 7.375174045562744, + "loss_ib": 0.09103618562221527, + "step": 405 + }, + { + "ce_ib": 16.60024070739746, + "ce_orig": 1.3468883037567139, + "epoch": 0.116758933064922, + "kl_loss": 7.249956130981445, + "loss_ib": 0.08909979462623596, + "step": 406 + }, + { + "ce_ib": 10.413902282714844, + "ce_orig": 0.792377233505249, + "epoch": 0.116758933064922, + "kl_loss": 7.449808120727539, + "loss_ib": 0.08491198718547821, + "step": 406 + }, + { + "ce_ib": 14.015169143676758, + "ce_orig": 0.9668265581130981, + "epoch": 0.116758933064922, + "kl_loss": 7.551003456115723, + "loss_ib": 0.08952520042657852, + "step": 406 + }, + { + "ce_ib": 10.880705833435059, + "ce_orig": 0.5008480548858643, + "epoch": 0.116758933064922, + "kl_loss": 7.180622100830078, + "loss_ib": 0.08268693089485168, + "step": 406 + }, + { + "ce_ib": 14.242043495178223, + "ce_orig": 0.5770112872123718, + "epoch": 0.11704651664389963, + "kl_loss": 6.760585308074951, + "loss_ib": 0.0818478912115097, + "step": 407 + }, + { + "ce_ib": 17.079055786132812, + "ce_orig": 1.7003521919250488, + "epoch": 0.11704651664389963, + "kl_loss": 7.319855690002441, + "loss_ib": 0.09027761220932007, + "step": 407 + }, + { + "ce_ib": 13.891350746154785, + "ce_orig": 0.849856972694397, + "epoch": 0.11704651664389963, + "kl_loss": 6.7790093421936035, + "loss_ib": 0.08168143779039383, + "step": 407 + }, + { + "ce_ib": 16.732454299926758, + "ce_orig": 1.2152447700500488, + "epoch": 0.11704651664389963, + "kl_loss": 7.023953437805176, + "loss_ib": 0.08697198331356049, + "step": 407 + }, + { + "ce_ib": 11.8071870803833, + "ce_orig": 0.5867161154747009, + "epoch": 0.11733410022287727, + "kl_loss": 7.046442985534668, + "loss_ib": 0.08227161318063736, + "step": 408 + }, + { + "ce_ib": 12.511359214782715, + "ce_orig": 0.7658900022506714, + "epoch": 0.11733410022287727, + "kl_loss": 7.0474042892456055, + "loss_ib": 0.08298540115356445, + "step": 408 + }, + { + "ce_ib": 14.617786407470703, + "ce_orig": 0.7642480731010437, + "epoch": 0.11733410022287727, + "kl_loss": 6.15494966506958, + "loss_ib": 0.0761672779917717, + "step": 408 + }, + { + "ce_ib": 13.998462677001953, + "ce_orig": 0.3132195472717285, + "epoch": 0.11733410022287727, + "kl_loss": 6.85751485824585, + "loss_ib": 0.08257361501455307, + "step": 408 + }, + { + "ce_ib": 11.64556884765625, + "ce_orig": 0.5106444954872131, + "epoch": 0.11762168380185492, + "kl_loss": 7.280755996704102, + "loss_ib": 0.08445312082767487, + "step": 409 + }, + { + "ce_ib": 15.677848815917969, + "ce_orig": 1.337202548980713, + "epoch": 0.11762168380185492, + "kl_loss": 7.441099166870117, + "loss_ib": 0.0900888442993164, + "step": 409 + }, + { + "ce_ib": 16.44597625732422, + "ce_orig": 0.7064342498779297, + "epoch": 0.11762168380185492, + "kl_loss": 7.3761820793151855, + "loss_ib": 0.0902077928185463, + "step": 409 + }, + { + "ce_ib": 15.110173225402832, + "ce_orig": 1.0219260454177856, + "epoch": 0.11762168380185492, + "kl_loss": 7.731924533843994, + "loss_ib": 0.09242941439151764, + "step": 409 + }, + { + "epoch": 0.11790926738083256, + "grad_norm": 0.3323611617088318, + "learning_rate": 9.998048648542153e-06, + "loss": 0.9117, + "step": 410 + }, + { + "ce_ib": 16.602235794067383, + "ce_orig": 1.2994755506515503, + "epoch": 0.11790926738083256, + "kl_loss": 7.015318870544434, + "loss_ib": 0.0867554247379303, + "step": 410 + }, + { + "ce_ib": 15.049646377563477, + "ce_orig": 0.3460133969783783, + "epoch": 0.11790926738083256, + "kl_loss": 6.893805503845215, + "loss_ib": 0.08398769795894623, + "step": 410 + }, + { + "ce_ib": 15.774944305419922, + "ce_orig": 0.9854010343551636, + "epoch": 0.11790926738083256, + "kl_loss": 7.28230094909668, + "loss_ib": 0.08859795331954956, + "step": 410 + }, + { + "ce_ib": 15.553691864013672, + "ce_orig": 1.1951489448547363, + "epoch": 0.11790926738083256, + "kl_loss": 7.345269680023193, + "loss_ib": 0.08900638669729233, + "step": 410 + }, + { + "ce_ib": 13.911700248718262, + "ce_orig": 0.7811279296875, + "epoch": 0.1181968509598102, + "kl_loss": 7.179073333740234, + "loss_ib": 0.08570243418216705, + "step": 411 + }, + { + "ce_ib": 15.475322723388672, + "ce_orig": 1.222588300704956, + "epoch": 0.1181968509598102, + "kl_loss": 6.665042877197266, + "loss_ib": 0.08212574571371078, + "step": 411 + }, + { + "ce_ib": 17.793344497680664, + "ce_orig": 1.2325204610824585, + "epoch": 0.1181968509598102, + "kl_loss": 7.175121307373047, + "loss_ib": 0.08954454958438873, + "step": 411 + }, + { + "ce_ib": 15.057259559631348, + "ce_orig": 1.0142443180084229, + "epoch": 0.1181968509598102, + "kl_loss": 7.372516632080078, + "loss_ib": 0.0887824222445488, + "step": 411 + }, + { + "ce_ib": 12.571479797363281, + "ce_orig": 0.9224892854690552, + "epoch": 0.11848443453878783, + "kl_loss": 7.373008728027344, + "loss_ib": 0.08630156517028809, + "step": 412 + }, + { + "ce_ib": 10.39489459991455, + "ce_orig": 0.5030142068862915, + "epoch": 0.11848443453878783, + "kl_loss": 8.151752471923828, + "loss_ib": 0.0919124186038971, + "step": 412 + }, + { + "ce_ib": 14.446382522583008, + "ce_orig": 0.6720181703567505, + "epoch": 0.11848443453878783, + "kl_loss": 6.821091651916504, + "loss_ib": 0.08265729993581772, + "step": 412 + }, + { + "ce_ib": 13.9717378616333, + "ce_orig": 0.8369989395141602, + "epoch": 0.11848443453878783, + "kl_loss": 7.489789962768555, + "loss_ib": 0.08886963874101639, + "step": 412 + }, + { + "ce_ib": 13.278703689575195, + "ce_orig": 0.5604954361915588, + "epoch": 0.11877201811776547, + "kl_loss": 6.503649711608887, + "loss_ib": 0.07831519842147827, + "step": 413 + }, + { + "ce_ib": 14.608017921447754, + "ce_orig": 1.0261048078536987, + "epoch": 0.11877201811776547, + "kl_loss": 5.455432891845703, + "loss_ib": 0.06916234642267227, + "step": 413 + }, + { + "ce_ib": 13.513481140136719, + "ce_orig": 0.8638569116592407, + "epoch": 0.11877201811776547, + "kl_loss": 7.192094802856445, + "loss_ib": 0.08543442189693451, + "step": 413 + }, + { + "ce_ib": 14.755450248718262, + "ce_orig": 1.0192320346832275, + "epoch": 0.11877201811776547, + "kl_loss": 6.588578224182129, + "loss_ib": 0.0806412324309349, + "step": 413 + }, + { + "ce_ib": 13.155234336853027, + "ce_orig": 1.021310806274414, + "epoch": 0.11905960169674312, + "kl_loss": 6.788153648376465, + "loss_ib": 0.0810367688536644, + "step": 414 + }, + { + "ce_ib": 16.875110626220703, + "ce_orig": 1.1028145551681519, + "epoch": 0.11905960169674312, + "kl_loss": 7.1112470626831055, + "loss_ib": 0.08798757940530777, + "step": 414 + }, + { + "ce_ib": 15.40821647644043, + "ce_orig": 1.1819595098495483, + "epoch": 0.11905960169674312, + "kl_loss": 6.240026473999023, + "loss_ib": 0.07780847698450089, + "step": 414 + }, + { + "ce_ib": 13.668084144592285, + "ce_orig": 0.795190155506134, + "epoch": 0.11905960169674312, + "kl_loss": 7.253881931304932, + "loss_ib": 0.08620689809322357, + "step": 414 + }, + { + "epoch": 0.11934718527572076, + "grad_norm": 0.39023032784461975, + "learning_rate": 9.997825825174889e-06, + "loss": 0.9566, + "step": 415 + }, + { + "ce_ib": 12.504598617553711, + "ce_orig": 1.0579363107681274, + "epoch": 0.11934718527572076, + "kl_loss": 7.502658843994141, + "loss_ib": 0.0875311866402626, + "step": 415 + }, + { + "ce_ib": 17.01618766784668, + "ce_orig": 1.1351312398910522, + "epoch": 0.11934718527572076, + "kl_loss": 7.204804420471191, + "loss_ib": 0.08906423300504684, + "step": 415 + }, + { + "ce_ib": 12.350006103515625, + "ce_orig": 0.7742664217948914, + "epoch": 0.11934718527572076, + "kl_loss": 6.772958278656006, + "loss_ib": 0.0800795927643776, + "step": 415 + }, + { + "ce_ib": 9.901521682739258, + "ce_orig": 0.73922199010849, + "epoch": 0.11934718527572076, + "kl_loss": 6.764502048492432, + "loss_ib": 0.07754654437303543, + "step": 415 + }, + { + "ce_ib": 14.0098237991333, + "ce_orig": 1.1979413032531738, + "epoch": 0.1196347688546984, + "kl_loss": 6.7593770027160645, + "loss_ib": 0.08160359412431717, + "step": 416 + }, + { + "ce_ib": 15.960185050964355, + "ce_orig": 1.1578559875488281, + "epoch": 0.1196347688546984, + "kl_loss": 6.498690128326416, + "loss_ib": 0.08094708621501923, + "step": 416 + }, + { + "ce_ib": 17.35699462890625, + "ce_orig": 1.3880764245986938, + "epoch": 0.1196347688546984, + "kl_loss": 6.918699264526367, + "loss_ib": 0.0865439847111702, + "step": 416 + }, + { + "ce_ib": 10.74183177947998, + "ce_orig": 0.8996087908744812, + "epoch": 0.1196347688546984, + "kl_loss": 6.894655227661133, + "loss_ib": 0.07968838512897491, + "step": 416 + }, + { + "ce_ib": 12.287108421325684, + "ce_orig": 0.556649386882782, + "epoch": 0.11992235243367604, + "kl_loss": 6.701784610748291, + "loss_ib": 0.07930494844913483, + "step": 417 + }, + { + "ce_ib": 16.17057991027832, + "ce_orig": 1.4443365335464478, + "epoch": 0.11992235243367604, + "kl_loss": 7.298962116241455, + "loss_ib": 0.08916020393371582, + "step": 417 + }, + { + "ce_ib": 14.572080612182617, + "ce_orig": 1.5205786228179932, + "epoch": 0.11992235243367604, + "kl_loss": 7.550329208374023, + "loss_ib": 0.09007536619901657, + "step": 417 + }, + { + "ce_ib": 16.126951217651367, + "ce_orig": 1.6121647357940674, + "epoch": 0.11992235243367604, + "kl_loss": 6.497934341430664, + "loss_ib": 0.0811062902212143, + "step": 417 + }, + { + "ce_ib": 11.263349533081055, + "ce_orig": 0.5618718266487122, + "epoch": 0.12020993601265367, + "kl_loss": 7.025567054748535, + "loss_ib": 0.08151902258396149, + "step": 418 + }, + { + "ce_ib": 17.31966781616211, + "ce_orig": 0.9057826399803162, + "epoch": 0.12020993601265367, + "kl_loss": 6.166810989379883, + "loss_ib": 0.07898777723312378, + "step": 418 + }, + { + "ce_ib": 14.19983959197998, + "ce_orig": 0.42357009649276733, + "epoch": 0.12020993601265367, + "kl_loss": 7.017156600952148, + "loss_ib": 0.0843714028596878, + "step": 418 + }, + { + "ce_ib": 12.986461639404297, + "ce_orig": 0.9464898705482483, + "epoch": 0.12020993601265367, + "kl_loss": 7.11696195602417, + "loss_ib": 0.0841560810804367, + "step": 418 + }, + { + "ce_ib": 5.582565784454346, + "ce_orig": 0.16826413571834564, + "epoch": 0.12049751959163132, + "kl_loss": 4.485371112823486, + "loss_ib": 0.05043627694249153, + "step": 419 + }, + { + "ce_ib": 10.89441204071045, + "ce_orig": 0.6798584461212158, + "epoch": 0.12049751959163132, + "kl_loss": 7.0565643310546875, + "loss_ib": 0.08146005123853683, + "step": 419 + }, + { + "ce_ib": 12.65652847290039, + "ce_orig": 0.6649844646453857, + "epoch": 0.12049751959163132, + "kl_loss": 6.781154632568359, + "loss_ib": 0.0804680734872818, + "step": 419 + }, + { + "ce_ib": 14.701452255249023, + "ce_orig": 1.5013635158538818, + "epoch": 0.12049751959163132, + "kl_loss": 6.757889747619629, + "loss_ib": 0.08228034526109695, + "step": 419 + }, + { + "epoch": 0.12078510317060896, + "grad_norm": 0.3697260916233063, + "learning_rate": 9.99759096088519e-06, + "loss": 1.025, + "step": 420 + }, + { + "ce_ib": 15.822488784790039, + "ce_orig": 1.4374024868011475, + "epoch": 0.12078510317060896, + "kl_loss": 6.555868148803711, + "loss_ib": 0.0813811644911766, + "step": 420 + }, + { + "ce_ib": 16.061962127685547, + "ce_orig": 1.07735013961792, + "epoch": 0.12078510317060896, + "kl_loss": 7.0854363441467285, + "loss_ib": 0.08691632002592087, + "step": 420 + }, + { + "ce_ib": 11.24513053894043, + "ce_orig": 0.944068968296051, + "epoch": 0.12078510317060896, + "kl_loss": 7.07620096206665, + "loss_ib": 0.08200713992118835, + "step": 420 + }, + { + "ce_ib": 9.319937705993652, + "ce_orig": 0.4624161422252655, + "epoch": 0.12078510317060896, + "kl_loss": 6.860795974731445, + "loss_ib": 0.07792789489030838, + "step": 420 + }, + { + "ce_ib": 14.560148239135742, + "ce_orig": 0.9780954718589783, + "epoch": 0.1210726867495866, + "kl_loss": 7.158355712890625, + "loss_ib": 0.08614370226860046, + "step": 421 + }, + { + "ce_ib": 11.736210823059082, + "ce_orig": 0.8180287480354309, + "epoch": 0.1210726867495866, + "kl_loss": 6.502464771270752, + "loss_ib": 0.07676085829734802, + "step": 421 + }, + { + "ce_ib": 11.860121726989746, + "ce_orig": 0.7424320578575134, + "epoch": 0.1210726867495866, + "kl_loss": 6.852118968963623, + "loss_ib": 0.08038130402565002, + "step": 421 + }, + { + "ce_ib": 8.209848403930664, + "ce_orig": 0.38280463218688965, + "epoch": 0.1210726867495866, + "kl_loss": 5.902113437652588, + "loss_ib": 0.06723098456859589, + "step": 421 + }, + { + "ce_ib": 11.078797340393066, + "ce_orig": 1.1233412027359009, + "epoch": 0.12136027032856424, + "kl_loss": 7.3105268478393555, + "loss_ib": 0.08418406546115875, + "step": 422 + }, + { + "ce_ib": 11.66649055480957, + "ce_orig": 0.6671274900436401, + "epoch": 0.12136027032856424, + "kl_loss": 7.258861064910889, + "loss_ib": 0.08425509929656982, + "step": 422 + }, + { + "ce_ib": 12.51806354522705, + "ce_orig": 0.8585453033447266, + "epoch": 0.12136027032856424, + "kl_loss": 4.624774932861328, + "loss_ib": 0.05876580998301506, + "step": 422 + }, + { + "ce_ib": 8.197091102600098, + "ce_orig": 0.7487708330154419, + "epoch": 0.12136027032856424, + "kl_loss": 7.244273662567139, + "loss_ib": 0.08063982427120209, + "step": 422 + }, + { + "ce_ib": 13.192282676696777, + "ce_orig": 0.7279910445213318, + "epoch": 0.12164785390754188, + "kl_loss": 6.454935073852539, + "loss_ib": 0.07774163037538528, + "step": 423 + }, + { + "ce_ib": 14.898761749267578, + "ce_orig": 0.8458772301673889, + "epoch": 0.12164785390754188, + "kl_loss": 6.546128273010254, + "loss_ib": 0.0803600400686264, + "step": 423 + }, + { + "ce_ib": 9.523457527160645, + "ce_orig": 0.5388420224189758, + "epoch": 0.12164785390754188, + "kl_loss": 6.858234405517578, + "loss_ib": 0.07810579985380173, + "step": 423 + }, + { + "ce_ib": 15.374958992004395, + "ce_orig": 0.9829406142234802, + "epoch": 0.12164785390754188, + "kl_loss": 7.067386627197266, + "loss_ib": 0.08604881912469864, + "step": 423 + }, + { + "ce_ib": 14.404019355773926, + "ce_orig": 0.7524064779281616, + "epoch": 0.12193543748651951, + "kl_loss": 6.316936492919922, + "loss_ib": 0.07757338136434555, + "step": 424 + }, + { + "ce_ib": 12.131924629211426, + "ce_orig": 0.5756824612617493, + "epoch": 0.12193543748651951, + "kl_loss": 6.499780654907227, + "loss_ib": 0.07712972909212112, + "step": 424 + }, + { + "ce_ib": 14.279351234436035, + "ce_orig": 1.1739262342453003, + "epoch": 0.12193543748651951, + "kl_loss": 6.669313430786133, + "loss_ib": 0.08097247779369354, + "step": 424 + }, + { + "ce_ib": 11.006293296813965, + "ce_orig": 0.9451432228088379, + "epoch": 0.12193543748651951, + "kl_loss": 6.912232875823975, + "loss_ib": 0.08012861758470535, + "step": 424 + }, + { + "epoch": 0.12222302106549716, + "grad_norm": 0.3377140164375305, + "learning_rate": 9.9973440562389e-06, + "loss": 0.9795, + "step": 425 + }, + { + "ce_ib": 9.749797821044922, + "ce_orig": 0.4160507023334503, + "epoch": 0.12222302106549716, + "kl_loss": 6.430499076843262, + "loss_ib": 0.0740547850728035, + "step": 425 + }, + { + "ce_ib": 13.227080345153809, + "ce_orig": 0.7790858745574951, + "epoch": 0.12222302106549716, + "kl_loss": 6.962891578674316, + "loss_ib": 0.08285599201917648, + "step": 425 + }, + { + "ce_ib": 11.88380241394043, + "ce_orig": 0.8634589314460754, + "epoch": 0.12222302106549716, + "kl_loss": 6.578843593597412, + "loss_ib": 0.07767223566770554, + "step": 425 + }, + { + "ce_ib": 11.239786148071289, + "ce_orig": 0.6265932321548462, + "epoch": 0.12222302106549716, + "kl_loss": 6.705416202545166, + "loss_ib": 0.07829394936561584, + "step": 425 + }, + { + "ce_ib": 13.436622619628906, + "ce_orig": 0.9564598202705383, + "epoch": 0.1225106046444748, + "kl_loss": 6.993147850036621, + "loss_ib": 0.08336810022592545, + "step": 426 + }, + { + "ce_ib": 10.594719886779785, + "ce_orig": 0.2862907946109772, + "epoch": 0.1225106046444748, + "kl_loss": 6.47970724105835, + "loss_ib": 0.07539179176092148, + "step": 426 + }, + { + "ce_ib": 11.986897468566895, + "ce_orig": 0.8417707681655884, + "epoch": 0.1225106046444748, + "kl_loss": 6.577683448791504, + "loss_ib": 0.07776372879743576, + "step": 426 + }, + { + "ce_ib": 12.561016082763672, + "ce_orig": 0.631502091884613, + "epoch": 0.1225106046444748, + "kl_loss": 6.709961891174316, + "loss_ib": 0.07966063171625137, + "step": 426 + }, + { + "ce_ib": 11.712442398071289, + "ce_orig": 0.6886617541313171, + "epoch": 0.12279818822345244, + "kl_loss": 5.167166709899902, + "loss_ib": 0.06338410824537277, + "step": 427 + }, + { + "ce_ib": 16.66871452331543, + "ce_orig": 1.3375333547592163, + "epoch": 0.12279818822345244, + "kl_loss": 6.451887130737305, + "loss_ib": 0.08118758350610733, + "step": 427 + }, + { + "ce_ib": 16.5571346282959, + "ce_orig": 0.8830122351646423, + "epoch": 0.12279818822345244, + "kl_loss": 6.616879463195801, + "loss_ib": 0.08272592723369598, + "step": 427 + }, + { + "ce_ib": 14.535783767700195, + "ce_orig": 1.0712122917175293, + "epoch": 0.12279818822345244, + "kl_loss": 7.098201274871826, + "loss_ib": 0.08551779389381409, + "step": 427 + }, + { + "ce_ib": 13.506948471069336, + "ce_orig": 0.8588005304336548, + "epoch": 0.12308577180243008, + "kl_loss": 6.7285261154174805, + "loss_ib": 0.08079220354557037, + "step": 428 + }, + { + "ce_ib": 17.66691780090332, + "ce_orig": 1.5079838037490845, + "epoch": 0.12308577180243008, + "kl_loss": 5.905591011047363, + "loss_ib": 0.07672282308340073, + "step": 428 + }, + { + "ce_ib": 13.11380672454834, + "ce_orig": 0.8807310461997986, + "epoch": 0.12308577180243008, + "kl_loss": 6.711277008056641, + "loss_ib": 0.08022657036781311, + "step": 428 + }, + { + "ce_ib": 16.873750686645508, + "ce_orig": 1.461756944656372, + "epoch": 0.12308577180243008, + "kl_loss": 6.9212870597839355, + "loss_ib": 0.08608661592006683, + "step": 428 + }, + { + "ce_ib": 8.633293151855469, + "ce_orig": 0.3702143728733063, + "epoch": 0.12337335538140771, + "kl_loss": 6.651648998260498, + "loss_ib": 0.0751497820019722, + "step": 429 + }, + { + "ce_ib": 12.873100280761719, + "ce_orig": 0.8258522152900696, + "epoch": 0.12337335538140771, + "kl_loss": 6.824582099914551, + "loss_ib": 0.08111891895532608, + "step": 429 + }, + { + "ce_ib": 13.301980972290039, + "ce_orig": 1.0934252738952637, + "epoch": 0.12337335538140771, + "kl_loss": 6.730596542358398, + "loss_ib": 0.08060794323682785, + "step": 429 + }, + { + "ce_ib": 9.3226318359375, + "ce_orig": 0.567753255367279, + "epoch": 0.12337335538140771, + "kl_loss": 6.865760326385498, + "loss_ib": 0.07798023521900177, + "step": 429 + }, + { + "epoch": 0.12366093896038537, + "grad_norm": 0.2692417800426483, + "learning_rate": 9.99708511183087e-06, + "loss": 1.0201, + "step": 430 + }, + { + "ce_ib": 13.1838960647583, + "ce_orig": 0.8950438499450684, + "epoch": 0.12366093896038537, + "kl_loss": 6.7184343338012695, + "loss_ib": 0.08036824315786362, + "step": 430 + }, + { + "ce_ib": 14.154587745666504, + "ce_orig": 1.36903977394104, + "epoch": 0.12366093896038537, + "kl_loss": 7.069368839263916, + "loss_ib": 0.08484827727079391, + "step": 430 + }, + { + "ce_ib": 19.739904403686523, + "ce_orig": 2.018660068511963, + "epoch": 0.12366093896038537, + "kl_loss": 6.694252967834473, + "loss_ib": 0.08668243139982224, + "step": 430 + }, + { + "ce_ib": 12.907319068908691, + "ce_orig": 1.3039312362670898, + "epoch": 0.12366093896038537, + "kl_loss": 6.895936012268066, + "loss_ib": 0.08186668157577515, + "step": 430 + }, + { + "ce_ib": 12.085116386413574, + "ce_orig": 0.8512650728225708, + "epoch": 0.123948522539363, + "kl_loss": 6.91609001159668, + "loss_ib": 0.08124601095914841, + "step": 431 + }, + { + "ce_ib": 13.673727989196777, + "ce_orig": 1.095442295074463, + "epoch": 0.123948522539363, + "kl_loss": 6.370013236999512, + "loss_ib": 0.07737386226654053, + "step": 431 + }, + { + "ce_ib": 12.386370658874512, + "ce_orig": 0.9683983325958252, + "epoch": 0.123948522539363, + "kl_loss": 6.190813064575195, + "loss_ib": 0.07429450005292892, + "step": 431 + }, + { + "ce_ib": 12.50558090209961, + "ce_orig": 0.49803832173347473, + "epoch": 0.123948522539363, + "kl_loss": 6.456753730773926, + "loss_ib": 0.0770731121301651, + "step": 431 + }, + { + "ce_ib": 17.67017936706543, + "ce_orig": 1.4665132761001587, + "epoch": 0.12423610611834064, + "kl_loss": 6.8333563804626465, + "loss_ib": 0.08600374311208725, + "step": 432 + }, + { + "ce_ib": 15.355440139770508, + "ce_orig": 1.0783402919769287, + "epoch": 0.12423610611834064, + "kl_loss": 6.24898624420166, + "loss_ib": 0.07784529775381088, + "step": 432 + }, + { + "ce_ib": 12.970996856689453, + "ce_orig": 0.908065676689148, + "epoch": 0.12423610611834064, + "kl_loss": 6.597336769104004, + "loss_ib": 0.0789443626999855, + "step": 432 + }, + { + "ce_ib": 10.632279396057129, + "ce_orig": 0.8153582215309143, + "epoch": 0.12423610611834064, + "kl_loss": 6.526000022888184, + "loss_ib": 0.07589227706193924, + "step": 432 + }, + { + "ce_ib": 15.43746566772461, + "ce_orig": 0.9074482917785645, + "epoch": 0.12452368969731828, + "kl_loss": 6.491483688354492, + "loss_ib": 0.0803523063659668, + "step": 433 + }, + { + "ce_ib": 8.455195426940918, + "ce_orig": 0.6100387573242188, + "epoch": 0.12452368969731828, + "kl_loss": 6.833198070526123, + "loss_ib": 0.07678717374801636, + "step": 433 + }, + { + "ce_ib": 12.123833656311035, + "ce_orig": 0.36186474561691284, + "epoch": 0.12452368969731828, + "kl_loss": 6.947832107543945, + "loss_ib": 0.08160214871168137, + "step": 433 + }, + { + "ce_ib": 10.345142364501953, + "ce_orig": 0.6522800326347351, + "epoch": 0.12452368969731828, + "kl_loss": 6.990741729736328, + "loss_ib": 0.08025255799293518, + "step": 433 + }, + { + "ce_ib": 16.296939849853516, + "ce_orig": 0.889448344707489, + "epoch": 0.12481127327629592, + "kl_loss": 6.41514778137207, + "loss_ib": 0.08044841885566711, + "step": 434 + }, + { + "ce_ib": 11.671658515930176, + "ce_orig": 0.5737780332565308, + "epoch": 0.12481127327629592, + "kl_loss": 6.855953693389893, + "loss_ib": 0.0802311971783638, + "step": 434 + }, + { + "ce_ib": 14.455811500549316, + "ce_orig": 0.9031121134757996, + "epoch": 0.12481127327629592, + "kl_loss": 6.662840843200684, + "loss_ib": 0.0810842216014862, + "step": 434 + }, + { + "ce_ib": 11.030969619750977, + "ce_orig": 0.5072352290153503, + "epoch": 0.12481127327629592, + "kl_loss": 6.787086486816406, + "loss_ib": 0.07890183478593826, + "step": 434 + }, + { + "epoch": 0.12509885685527355, + "grad_norm": 0.3564906418323517, + "learning_rate": 9.99681412828496e-06, + "loss": 0.9509, + "step": 435 + }, + { + "ce_ib": 13.964662551879883, + "ce_orig": 0.7778974175453186, + "epoch": 0.12509885685527355, + "kl_loss": 6.725362777709961, + "loss_ib": 0.08121828734874725, + "step": 435 + }, + { + "ce_ib": 11.405627250671387, + "ce_orig": 0.7385088801383972, + "epoch": 0.12509885685527355, + "kl_loss": 6.814925193786621, + "loss_ib": 0.07955487817525864, + "step": 435 + }, + { + "ce_ib": 9.46563720703125, + "ce_orig": 0.48979493975639343, + "epoch": 0.12509885685527355, + "kl_loss": 6.513537883758545, + "loss_ib": 0.07460101693868637, + "step": 435 + }, + { + "ce_ib": 11.945999145507812, + "ce_orig": 1.1584625244140625, + "epoch": 0.12509885685527355, + "kl_loss": 6.719861030578613, + "loss_ib": 0.07914461195468903, + "step": 435 + }, + { + "ce_ib": 10.591259002685547, + "ce_orig": 1.0169495344161987, + "epoch": 0.1253864404342512, + "kl_loss": 6.246793746948242, + "loss_ib": 0.07305919378995895, + "step": 436 + }, + { + "ce_ib": 11.989269256591797, + "ce_orig": 0.7858111262321472, + "epoch": 0.1253864404342512, + "kl_loss": 6.57621431350708, + "loss_ib": 0.07775141298770905, + "step": 436 + }, + { + "ce_ib": 19.393007278442383, + "ce_orig": 1.7850255966186523, + "epoch": 0.1253864404342512, + "kl_loss": 6.306278228759766, + "loss_ib": 0.08245578408241272, + "step": 436 + }, + { + "ce_ib": 12.036297798156738, + "ce_orig": 0.7531123161315918, + "epoch": 0.1253864404342512, + "kl_loss": 6.1202850341796875, + "loss_ib": 0.07323914766311646, + "step": 436 + }, + { + "ce_ib": 10.144059181213379, + "ce_orig": 0.3747836649417877, + "epoch": 0.12567402401322886, + "kl_loss": 4.424054145812988, + "loss_ib": 0.05438460409641266, + "step": 437 + }, + { + "ce_ib": 14.241146087646484, + "ce_orig": 1.154929280281067, + "epoch": 0.12567402401322886, + "kl_loss": 6.789527893066406, + "loss_ib": 0.08213642239570618, + "step": 437 + }, + { + "ce_ib": 10.732177734375, + "ce_orig": 0.37166017293930054, + "epoch": 0.12567402401322886, + "kl_loss": 6.624038219451904, + "loss_ib": 0.07697255909442902, + "step": 437 + }, + { + "ce_ib": 11.875864028930664, + "ce_orig": 0.5969848036766052, + "epoch": 0.12567402401322886, + "kl_loss": 6.518521308898926, + "loss_ib": 0.07706107199192047, + "step": 437 + }, + { + "ce_ib": 10.759546279907227, + "ce_orig": 0.5922636985778809, + "epoch": 0.12596160759220648, + "kl_loss": 6.725497245788574, + "loss_ib": 0.07801451534032822, + "step": 438 + }, + { + "ce_ib": 15.553537368774414, + "ce_orig": 0.734350323677063, + "epoch": 0.12596160759220648, + "kl_loss": 6.394842147827148, + "loss_ib": 0.07950195670127869, + "step": 438 + }, + { + "ce_ib": 6.804005146026611, + "ce_orig": 0.3962157666683197, + "epoch": 0.12596160759220648, + "kl_loss": 5.0119147300720215, + "loss_ib": 0.05692315101623535, + "step": 438 + }, + { + "ce_ib": 12.179630279541016, + "ce_orig": 0.42266571521759033, + "epoch": 0.12596160759220648, + "kl_loss": 6.785035133361816, + "loss_ib": 0.08002997934818268, + "step": 438 + }, + { + "ce_ib": 11.9126558303833, + "ce_orig": 0.6911407709121704, + "epoch": 0.12624919117118413, + "kl_loss": 5.950160026550293, + "loss_ib": 0.07141425460577011, + "step": 439 + }, + { + "ce_ib": 13.094669342041016, + "ce_orig": 0.827157199382782, + "epoch": 0.12624919117118413, + "kl_loss": 6.5986528396606445, + "loss_ib": 0.07908119261264801, + "step": 439 + }, + { + "ce_ib": 9.707082748413086, + "ce_orig": 0.6359635591506958, + "epoch": 0.12624919117118413, + "kl_loss": 5.76313591003418, + "loss_ib": 0.06733844429254532, + "step": 439 + }, + { + "ce_ib": 14.719921112060547, + "ce_orig": 0.9809554815292358, + "epoch": 0.12624919117118413, + "kl_loss": 6.400526523590088, + "loss_ib": 0.0787251815199852, + "step": 439 + }, + { + "epoch": 0.12653677475016176, + "grad_norm": 0.3794838488101959, + "learning_rate": 9.996531106254027e-06, + "loss": 0.9376, + "step": 440 + }, + { + "ce_ib": 12.931950569152832, + "ce_orig": 0.9358404278755188, + "epoch": 0.12653677475016176, + "kl_loss": 6.613272666931152, + "loss_ib": 0.07906467467546463, + "step": 440 + }, + { + "ce_ib": 13.42741584777832, + "ce_orig": 0.7416799068450928, + "epoch": 0.12653677475016176, + "kl_loss": 6.373296737670898, + "loss_ib": 0.07716038078069687, + "step": 440 + }, + { + "ce_ib": 12.632997512817383, + "ce_orig": 1.1645599603652954, + "epoch": 0.12653677475016176, + "kl_loss": 6.401862144470215, + "loss_ib": 0.07665161788463593, + "step": 440 + }, + { + "ce_ib": 16.439931869506836, + "ce_orig": 1.5734604597091675, + "epoch": 0.12653677475016176, + "kl_loss": 6.544001579284668, + "loss_ib": 0.08187995105981827, + "step": 440 + }, + { + "ce_ib": 13.676290512084961, + "ce_orig": 1.0207133293151855, + "epoch": 0.1268243583291394, + "kl_loss": 6.34605073928833, + "loss_ib": 0.0771367996931076, + "step": 441 + }, + { + "ce_ib": 10.394545555114746, + "ce_orig": 0.721448540687561, + "epoch": 0.1268243583291394, + "kl_loss": 6.206020355224609, + "loss_ib": 0.07245474308729172, + "step": 441 + }, + { + "ce_ib": 11.452078819274902, + "ce_orig": 1.0113741159439087, + "epoch": 0.1268243583291394, + "kl_loss": 6.649386405944824, + "loss_ib": 0.07794594019651413, + "step": 441 + }, + { + "ce_ib": 13.160637855529785, + "ce_orig": 0.8914715647697449, + "epoch": 0.1268243583291394, + "kl_loss": 6.297116279602051, + "loss_ib": 0.07613179832696915, + "step": 441 + }, + { + "ce_ib": 13.181857109069824, + "ce_orig": 1.0282145738601685, + "epoch": 0.12711194190811706, + "kl_loss": 6.352758884429932, + "loss_ib": 0.07670944184064865, + "step": 442 + }, + { + "ce_ib": 10.155783653259277, + "ce_orig": 0.7708104252815247, + "epoch": 0.12711194190811706, + "kl_loss": 6.537982940673828, + "loss_ib": 0.0755356103181839, + "step": 442 + }, + { + "ce_ib": 11.089113235473633, + "ce_orig": 0.761037290096283, + "epoch": 0.12711194190811706, + "kl_loss": 6.812819004058838, + "loss_ib": 0.07921729981899261, + "step": 442 + }, + { + "ce_ib": 13.044452667236328, + "ce_orig": 0.8526286482810974, + "epoch": 0.12711194190811706, + "kl_loss": 6.086942672729492, + "loss_ib": 0.07391387969255447, + "step": 442 + }, + { + "ce_ib": 14.216473579406738, + "ce_orig": 1.2142722606658936, + "epoch": 0.12739952548709468, + "kl_loss": 6.530941486358643, + "loss_ib": 0.079525887966156, + "step": 443 + }, + { + "ce_ib": 16.07403564453125, + "ce_orig": 1.820540428161621, + "epoch": 0.12739952548709468, + "kl_loss": 6.171257495880127, + "loss_ib": 0.07778660953044891, + "step": 443 + }, + { + "ce_ib": 15.684346199035645, + "ce_orig": 1.0308736562728882, + "epoch": 0.12739952548709468, + "kl_loss": 6.518500328063965, + "loss_ib": 0.08086934685707092, + "step": 443 + }, + { + "ce_ib": 10.408639907836914, + "ce_orig": 0.4763404130935669, + "epoch": 0.12739952548709468, + "kl_loss": 5.605975151062012, + "loss_ib": 0.06646838784217834, + "step": 443 + }, + { + "ce_ib": 12.710926055908203, + "ce_orig": 0.828603208065033, + "epoch": 0.12768710906607234, + "kl_loss": 6.303215026855469, + "loss_ib": 0.07574307173490524, + "step": 444 + }, + { + "ce_ib": 10.51518440246582, + "ce_orig": 0.46696940064430237, + "epoch": 0.12768710906607234, + "kl_loss": 6.153438568115234, + "loss_ib": 0.0720495656132698, + "step": 444 + }, + { + "ce_ib": 8.491880416870117, + "ce_orig": 0.7530315518379211, + "epoch": 0.12768710906607234, + "kl_loss": 6.541855812072754, + "loss_ib": 0.0739104375243187, + "step": 444 + }, + { + "ce_ib": 16.409770965576172, + "ce_orig": 1.3153358697891235, + "epoch": 0.12768710906607234, + "kl_loss": 6.441512107849121, + "loss_ib": 0.08082488924264908, + "step": 444 + }, + { + "epoch": 0.12797469264504996, + "grad_norm": 0.27752232551574707, + "learning_rate": 9.996236046419941e-06, + "loss": 0.993, + "step": 445 + }, + { + "ce_ib": 13.984725952148438, + "ce_orig": 0.8543322086334229, + "epoch": 0.12797469264504996, + "kl_loss": 5.812187194824219, + "loss_ib": 0.07210659980773926, + "step": 445 + }, + { + "ce_ib": 13.997330665588379, + "ce_orig": 0.5448931455612183, + "epoch": 0.12797469264504996, + "kl_loss": 6.332775592803955, + "loss_ib": 0.07732508331537247, + "step": 445 + }, + { + "ce_ib": 12.42682933807373, + "ce_orig": 0.7178173065185547, + "epoch": 0.12797469264504996, + "kl_loss": 6.424311637878418, + "loss_ib": 0.07666994631290436, + "step": 445 + }, + { + "ce_ib": 9.565461158752441, + "ce_orig": 0.8134915232658386, + "epoch": 0.12797469264504996, + "kl_loss": 6.6445112228393555, + "loss_ib": 0.0760105699300766, + "step": 445 + }, + { + "ce_ib": 8.924689292907715, + "ce_orig": 0.5386297106742859, + "epoch": 0.1282622762240276, + "kl_loss": 6.35334587097168, + "loss_ib": 0.07245814800262451, + "step": 446 + }, + { + "ce_ib": 16.734355926513672, + "ce_orig": 1.0333242416381836, + "epoch": 0.1282622762240276, + "kl_loss": 6.456616401672363, + "loss_ib": 0.0813005119562149, + "step": 446 + }, + { + "ce_ib": 11.987763404846191, + "ce_orig": 1.0803635120391846, + "epoch": 0.1282622762240276, + "kl_loss": 6.508333206176758, + "loss_ib": 0.07707109302282333, + "step": 446 + }, + { + "ce_ib": 10.953421592712402, + "ce_orig": 0.6203237175941467, + "epoch": 0.1282622762240276, + "kl_loss": 6.250512599945068, + "loss_ib": 0.07345854490995407, + "step": 446 + }, + { + "ce_ib": 9.890134811401367, + "ce_orig": 0.7359150648117065, + "epoch": 0.12854985980300526, + "kl_loss": 6.235504627227783, + "loss_ib": 0.07224518060684204, + "step": 447 + }, + { + "ce_ib": 14.127108573913574, + "ce_orig": 0.6828675866127014, + "epoch": 0.12854985980300526, + "kl_loss": 6.255981922149658, + "loss_ib": 0.07668692618608475, + "step": 447 + }, + { + "ce_ib": 7.48207426071167, + "ce_orig": 0.6292188167572021, + "epoch": 0.12854985980300526, + "kl_loss": 6.336426734924316, + "loss_ib": 0.07084634155035019, + "step": 447 + }, + { + "ce_ib": 14.48934268951416, + "ce_orig": 0.9720964431762695, + "epoch": 0.12854985980300526, + "kl_loss": 6.416128158569336, + "loss_ib": 0.07865062355995178, + "step": 447 + }, + { + "ce_ib": 9.954630851745605, + "ce_orig": 0.7827691435813904, + "epoch": 0.12883744338198289, + "kl_loss": 6.488411903381348, + "loss_ib": 0.07483874261379242, + "step": 448 + }, + { + "ce_ib": 9.191707611083984, + "ce_orig": 0.7463720440864563, + "epoch": 0.12883744338198289, + "kl_loss": 6.294938087463379, + "loss_ib": 0.07214108854532242, + "step": 448 + }, + { + "ce_ib": 11.25207805633545, + "ce_orig": 0.8659082651138306, + "epoch": 0.12883744338198289, + "kl_loss": 6.527953147888184, + "loss_ib": 0.07653161138296127, + "step": 448 + }, + { + "ce_ib": 13.492358207702637, + "ce_orig": 0.8569541573524475, + "epoch": 0.12883744338198289, + "kl_loss": 6.035775661468506, + "loss_ib": 0.073850117623806, + "step": 448 + }, + { + "ce_ib": 7.855426788330078, + "ce_orig": 0.2511903643608093, + "epoch": 0.12912502696096054, + "kl_loss": 4.307304859161377, + "loss_ib": 0.050928473472595215, + "step": 449 + }, + { + "ce_ib": 15.168793678283691, + "ce_orig": 0.9671982526779175, + "epoch": 0.12912502696096054, + "kl_loss": 6.6048150062561035, + "loss_ib": 0.08121694624423981, + "step": 449 + }, + { + "ce_ib": 12.226943969726562, + "ce_orig": 0.47483983635902405, + "epoch": 0.12912502696096054, + "kl_loss": 6.506505489349365, + "loss_ib": 0.07729199528694153, + "step": 449 + }, + { + "ce_ib": 10.032635688781738, + "ce_orig": 0.9576376676559448, + "epoch": 0.12912502696096054, + "kl_loss": 6.404877662658691, + "loss_ib": 0.0740814059972763, + "step": 449 + }, + { + "epoch": 0.12941261053993816, + "grad_norm": 0.2948267161846161, + "learning_rate": 9.995928949493568e-06, + "loss": 0.9556, + "step": 450 + }, + { + "ce_ib": 13.594502449035645, + "ce_orig": 0.7634487748146057, + "epoch": 0.12941261053993816, + "kl_loss": 6.214456558227539, + "loss_ib": 0.0757390707731247, + "step": 450 + }, + { + "ce_ib": 11.100218772888184, + "ce_orig": 0.5212127566337585, + "epoch": 0.12941261053993816, + "kl_loss": 6.206829071044922, + "loss_ib": 0.07316850870847702, + "step": 450 + }, + { + "ce_ib": 10.686315536499023, + "ce_orig": 0.7214837670326233, + "epoch": 0.12941261053993816, + "kl_loss": 6.353018760681152, + "loss_ib": 0.07421649992465973, + "step": 450 + }, + { + "ce_ib": 10.190530776977539, + "ce_orig": 0.8089180588722229, + "epoch": 0.12941261053993816, + "kl_loss": 6.573402404785156, + "loss_ib": 0.07592455297708511, + "step": 450 + }, + { + "ce_ib": 9.33697509765625, + "ce_orig": 0.5962152481079102, + "epoch": 0.1297001941189158, + "kl_loss": 6.153956413269043, + "loss_ib": 0.07087653875350952, + "step": 451 + }, + { + "ce_ib": 11.186933517456055, + "ce_orig": 0.6141796708106995, + "epoch": 0.1297001941189158, + "kl_loss": 6.5120697021484375, + "loss_ib": 0.07630763202905655, + "step": 451 + }, + { + "ce_ib": 14.915994644165039, + "ce_orig": 0.9922129511833191, + "epoch": 0.1297001941189158, + "kl_loss": 5.893243312835693, + "loss_ib": 0.0738484263420105, + "step": 451 + }, + { + "ce_ib": 11.000643730163574, + "ce_orig": 0.7535352110862732, + "epoch": 0.1297001941189158, + "kl_loss": 6.5356645584106445, + "loss_ib": 0.07635729014873505, + "step": 451 + }, + { + "ce_ib": 10.349251747131348, + "ce_orig": 0.5634931921958923, + "epoch": 0.12998777769789346, + "kl_loss": 6.1491804122924805, + "loss_ib": 0.0718410536646843, + "step": 452 + }, + { + "ce_ib": 14.159017562866211, + "ce_orig": 1.0384783744812012, + "epoch": 0.12998777769789346, + "kl_loss": 6.039100646972656, + "loss_ib": 0.07455001771450043, + "step": 452 + }, + { + "ce_ib": 9.425822257995605, + "ce_orig": 0.7196267247200012, + "epoch": 0.12998777769789346, + "kl_loss": 6.463262557983398, + "loss_ib": 0.07405844330787659, + "step": 452 + }, + { + "ce_ib": 13.532403945922852, + "ce_orig": 0.748589277267456, + "epoch": 0.12998777769789346, + "kl_loss": 6.038760662078857, + "loss_ib": 0.07392001152038574, + "step": 452 + }, + { + "ce_ib": 10.410351753234863, + "ce_orig": 0.5488285422325134, + "epoch": 0.1302753612768711, + "kl_loss": 6.22846794128418, + "loss_ib": 0.07269503176212311, + "step": 453 + }, + { + "ce_ib": 12.851318359375, + "ce_orig": 0.9907112717628479, + "epoch": 0.1302753612768711, + "kl_loss": 6.350039482116699, + "loss_ib": 0.07635170966386795, + "step": 453 + }, + { + "ce_ib": 13.529624938964844, + "ce_orig": 0.9963613152503967, + "epoch": 0.1302753612768711, + "kl_loss": 6.477352142333984, + "loss_ib": 0.07830314338207245, + "step": 453 + }, + { + "ce_ib": 15.717424392700195, + "ce_orig": 0.9456937313079834, + "epoch": 0.1302753612768711, + "kl_loss": 5.184221267700195, + "loss_ib": 0.0675596371293068, + "step": 453 + }, + { + "ce_ib": 11.294244766235352, + "ce_orig": 0.7741903066635132, + "epoch": 0.13056294485584874, + "kl_loss": 6.089672088623047, + "loss_ib": 0.07219096273183823, + "step": 454 + }, + { + "ce_ib": 18.31461524963379, + "ce_orig": 1.1526209115982056, + "epoch": 0.13056294485584874, + "kl_loss": 5.830255508422852, + "loss_ib": 0.07661716639995575, + "step": 454 + }, + { + "ce_ib": 10.833757400512695, + "ce_orig": 0.6415656805038452, + "epoch": 0.13056294485584874, + "kl_loss": 6.350003242492676, + "loss_ib": 0.0743337869644165, + "step": 454 + }, + { + "ce_ib": 13.974409103393555, + "ce_orig": 1.0648647546768188, + "epoch": 0.13056294485584874, + "kl_loss": 5.880853652954102, + "loss_ib": 0.07278294116258621, + "step": 454 + }, + { + "epoch": 0.13085052843482636, + "grad_norm": 0.3984464406967163, + "learning_rate": 9.995609816214774e-06, + "loss": 0.9742, + "step": 455 + }, + { + "ce_ib": 10.744162559509277, + "ce_orig": 0.9050172567367554, + "epoch": 0.13085052843482636, + "kl_loss": 6.5762224197387695, + "loss_ib": 0.07650638371706009, + "step": 455 + }, + { + "ce_ib": 8.354386329650879, + "ce_orig": 0.7557839155197144, + "epoch": 0.13085052843482636, + "kl_loss": 5.9736809730529785, + "loss_ib": 0.06809119880199432, + "step": 455 + }, + { + "ce_ib": 10.220728874206543, + "ce_orig": 0.536035418510437, + "epoch": 0.13085052843482636, + "kl_loss": 6.033191680908203, + "loss_ib": 0.07055263966321945, + "step": 455 + }, + { + "ce_ib": 18.092269897460938, + "ce_orig": 1.7775410413742065, + "epoch": 0.13085052843482636, + "kl_loss": 6.227479457855225, + "loss_ib": 0.0803670659661293, + "step": 455 + }, + { + "ce_ib": 11.44895076751709, + "ce_orig": 0.6709085702896118, + "epoch": 0.13113811201380401, + "kl_loss": 5.662714004516602, + "loss_ib": 0.06807608902454376, + "step": 456 + }, + { + "ce_ib": 14.361438751220703, + "ce_orig": 1.2514116764068604, + "epoch": 0.13113811201380401, + "kl_loss": 6.082196235656738, + "loss_ib": 0.07518339902162552, + "step": 456 + }, + { + "ce_ib": 15.36643123626709, + "ce_orig": 1.2210443019866943, + "epoch": 0.13113811201380401, + "kl_loss": 6.145846366882324, + "loss_ib": 0.07682488858699799, + "step": 456 + }, + { + "ce_ib": 9.23930835723877, + "ce_orig": 0.4679323732852936, + "epoch": 0.13113811201380401, + "kl_loss": 5.857122421264648, + "loss_ib": 0.0678105279803276, + "step": 456 + }, + { + "ce_ib": 18.075246810913086, + "ce_orig": 1.4876693487167358, + "epoch": 0.13142569559278164, + "kl_loss": 6.473189353942871, + "loss_ib": 0.08280713856220245, + "step": 457 + }, + { + "ce_ib": 10.967455863952637, + "ce_orig": 0.7587569355964661, + "epoch": 0.13142569559278164, + "kl_loss": 6.15573787689209, + "loss_ib": 0.07252483069896698, + "step": 457 + }, + { + "ce_ib": 15.197246551513672, + "ce_orig": 0.9926710724830627, + "epoch": 0.13142569559278164, + "kl_loss": 6.070707321166992, + "loss_ib": 0.07590431720018387, + "step": 457 + }, + { + "ce_ib": 13.815834999084473, + "ce_orig": 0.7014583349227905, + "epoch": 0.13142569559278164, + "kl_loss": 5.886041641235352, + "loss_ib": 0.07267624884843826, + "step": 457 + }, + { + "ce_ib": 11.019710540771484, + "ce_orig": 0.645072340965271, + "epoch": 0.1317132791717593, + "kl_loss": 6.059167861938477, + "loss_ib": 0.07161138951778412, + "step": 458 + }, + { + "ce_ib": 14.778169631958008, + "ce_orig": 1.0757112503051758, + "epoch": 0.1317132791717593, + "kl_loss": 5.94471549987793, + "loss_ib": 0.07422532141208649, + "step": 458 + }, + { + "ce_ib": 14.401138305664062, + "ce_orig": 0.5452439188957214, + "epoch": 0.1317132791717593, + "kl_loss": 6.017438888549805, + "loss_ib": 0.0745755285024643, + "step": 458 + }, + { + "ce_ib": 12.480062484741211, + "ce_orig": 0.863021731376648, + "epoch": 0.1317132791717593, + "kl_loss": 6.301546096801758, + "loss_ib": 0.07549552619457245, + "step": 458 + }, + { + "ce_ib": 6.62205696105957, + "ce_orig": 0.3466249108314514, + "epoch": 0.13200086275073694, + "kl_loss": 5.449361801147461, + "loss_ib": 0.061115674674510956, + "step": 459 + }, + { + "ce_ib": 17.263084411621094, + "ce_orig": 1.387851595878601, + "epoch": 0.13200086275073694, + "kl_loss": 5.80389404296875, + "loss_ib": 0.07530201971530914, + "step": 459 + }, + { + "ce_ib": 12.40281867980957, + "ce_orig": 0.5882412195205688, + "epoch": 0.13200086275073694, + "kl_loss": 5.933984756469727, + "loss_ib": 0.07174266874790192, + "step": 459 + }, + { + "ce_ib": 10.269407272338867, + "ce_orig": 0.6979908347129822, + "epoch": 0.13200086275073694, + "kl_loss": 6.144120216369629, + "loss_ib": 0.07171060889959335, + "step": 459 + }, + { + "epoch": 0.13228844632971457, + "grad_norm": 0.392301082611084, + "learning_rate": 9.995278647352428e-06, + "loss": 0.8929, + "step": 460 + }, + { + "ce_ib": 12.763174057006836, + "ce_orig": 0.5633679032325745, + "epoch": 0.13228844632971457, + "kl_loss": 5.510239601135254, + "loss_ib": 0.06786557286977768, + "step": 460 + }, + { + "ce_ib": 12.732988357543945, + "ce_orig": 0.23703357577323914, + "epoch": 0.13228844632971457, + "kl_loss": 5.470260143280029, + "loss_ib": 0.06743558496236801, + "step": 460 + }, + { + "ce_ib": 10.131989479064941, + "ce_orig": 0.6852651834487915, + "epoch": 0.13228844632971457, + "kl_loss": 6.123454570770264, + "loss_ib": 0.07136653363704681, + "step": 460 + }, + { + "ce_ib": 12.067431449890137, + "ce_orig": 0.7499302625656128, + "epoch": 0.13228844632971457, + "kl_loss": 5.742093086242676, + "loss_ib": 0.06948836147785187, + "step": 460 + }, + { + "ce_ib": 7.949161052703857, + "ce_orig": 0.5661373734474182, + "epoch": 0.13257602990869222, + "kl_loss": 5.77607536315918, + "loss_ib": 0.0657099112868309, + "step": 461 + }, + { + "ce_ib": 11.964921951293945, + "ce_orig": 0.7726590633392334, + "epoch": 0.13257602990869222, + "kl_loss": 5.976222991943359, + "loss_ib": 0.07172714918851852, + "step": 461 + }, + { + "ce_ib": 10.301544189453125, + "ce_orig": 0.6989135146141052, + "epoch": 0.13257602990869222, + "kl_loss": 5.811915397644043, + "loss_ib": 0.06842069327831268, + "step": 461 + }, + { + "ce_ib": 12.018680572509766, + "ce_orig": 0.7360786199569702, + "epoch": 0.13257602990869222, + "kl_loss": 5.752782821655273, + "loss_ib": 0.06954650580883026, + "step": 461 + }, + { + "ce_ib": 9.352438926696777, + "ce_orig": 0.6635019779205322, + "epoch": 0.13286361348766984, + "kl_loss": 5.842447280883789, + "loss_ib": 0.06777691096067429, + "step": 462 + }, + { + "ce_ib": 13.350435256958008, + "ce_orig": 1.5100382566452026, + "epoch": 0.13286361348766984, + "kl_loss": 5.89756965637207, + "loss_ib": 0.07232613116502762, + "step": 462 + }, + { + "ce_ib": 12.962512969970703, + "ce_orig": 0.5430381894111633, + "epoch": 0.13286361348766984, + "kl_loss": 5.671024322509766, + "loss_ib": 0.06967275589704514, + "step": 462 + }, + { + "ce_ib": 15.555392265319824, + "ce_orig": 1.549025297164917, + "epoch": 0.13286361348766984, + "kl_loss": 5.619405269622803, + "loss_ib": 0.07174944132566452, + "step": 462 + }, + { + "ce_ib": 12.196456909179688, + "ce_orig": 0.8404883742332458, + "epoch": 0.1331511970666475, + "kl_loss": 5.807744026184082, + "loss_ib": 0.07027389109134674, + "step": 463 + }, + { + "ce_ib": 10.140958786010742, + "ce_orig": 0.9204779267311096, + "epoch": 0.1331511970666475, + "kl_loss": 5.694162368774414, + "loss_ib": 0.06708257645368576, + "step": 463 + }, + { + "ce_ib": 14.248191833496094, + "ce_orig": 0.7399206161499023, + "epoch": 0.1331511970666475, + "kl_loss": 5.870853900909424, + "loss_ib": 0.07295673340559006, + "step": 463 + }, + { + "ce_ib": 12.88770580291748, + "ce_orig": 0.6557974219322205, + "epoch": 0.1331511970666475, + "kl_loss": 5.735309600830078, + "loss_ib": 0.0702408030629158, + "step": 463 + }, + { + "ce_ib": 11.3536376953125, + "ce_orig": 0.9138345718383789, + "epoch": 0.13343878064562514, + "kl_loss": 5.644110202789307, + "loss_ib": 0.06779474020004272, + "step": 464 + }, + { + "ce_ib": 12.081389427185059, + "ce_orig": 0.947689950466156, + "epoch": 0.13343878064562514, + "kl_loss": 5.870572090148926, + "loss_ib": 0.07078710943460464, + "step": 464 + }, + { + "ce_ib": 8.881162643432617, + "ce_orig": 0.952629029750824, + "epoch": 0.13343878064562514, + "kl_loss": 5.588096618652344, + "loss_ib": 0.06476213037967682, + "step": 464 + }, + { + "ce_ib": 9.299169540405273, + "ce_orig": 0.60807204246521, + "epoch": 0.13343878064562514, + "kl_loss": 5.195706844329834, + "loss_ib": 0.06125623732805252, + "step": 464 + }, + { + "epoch": 0.13372636422460277, + "grad_norm": 0.45935723185539246, + "learning_rate": 9.994935443704391e-06, + "loss": 0.9342, + "step": 465 + }, + { + "ce_ib": 15.007346153259277, + "ce_orig": 1.2609058618545532, + "epoch": 0.13372636422460277, + "kl_loss": 5.37989616394043, + "loss_ib": 0.06880630552768707, + "step": 465 + }, + { + "ce_ib": 7.51100492477417, + "ce_orig": 0.5963767170906067, + "epoch": 0.13372636422460277, + "kl_loss": 5.823338508605957, + "loss_ib": 0.06574439257383347, + "step": 465 + }, + { + "ce_ib": 11.900202751159668, + "ce_orig": 0.4563358724117279, + "epoch": 0.13372636422460277, + "kl_loss": 5.743979454040527, + "loss_ib": 0.06933999806642532, + "step": 465 + }, + { + "ce_ib": 14.910943984985352, + "ce_orig": 0.8666954636573792, + "epoch": 0.13372636422460277, + "kl_loss": 5.86253023147583, + "loss_ib": 0.07353624701499939, + "step": 465 + }, + { + "ce_ib": 11.311989784240723, + "ce_orig": 0.803551197052002, + "epoch": 0.13401394780358042, + "kl_loss": 5.7215681076049805, + "loss_ib": 0.06852766871452332, + "step": 466 + }, + { + "ce_ib": 12.853880882263184, + "ce_orig": 0.5360819697380066, + "epoch": 0.13401394780358042, + "kl_loss": 5.336982727050781, + "loss_ib": 0.06622370332479477, + "step": 466 + }, + { + "ce_ib": 12.081340789794922, + "ce_orig": 0.9268986582756042, + "epoch": 0.13401394780358042, + "kl_loss": 5.40451717376709, + "loss_ib": 0.0661265105009079, + "step": 466 + }, + { + "ce_ib": 10.360613822937012, + "ce_orig": 0.7845146059989929, + "epoch": 0.13401394780358042, + "kl_loss": 5.394956588745117, + "loss_ib": 0.06431017816066742, + "step": 466 + }, + { + "ce_ib": 13.594584465026855, + "ce_orig": 0.7611533999443054, + "epoch": 0.13430153138255804, + "kl_loss": 5.510991096496582, + "loss_ib": 0.06870449334383011, + "step": 467 + }, + { + "ce_ib": 11.39364242553711, + "ce_orig": 0.6709606647491455, + "epoch": 0.13430153138255804, + "kl_loss": 5.329100608825684, + "loss_ib": 0.06468464434146881, + "step": 467 + }, + { + "ce_ib": 15.204527854919434, + "ce_orig": 1.456774115562439, + "epoch": 0.13430153138255804, + "kl_loss": 5.282338619232178, + "loss_ib": 0.06802791357040405, + "step": 467 + }, + { + "ce_ib": 9.643840789794922, + "ce_orig": 0.6612191200256348, + "epoch": 0.13430153138255804, + "kl_loss": 5.333034992218018, + "loss_ib": 0.06297419220209122, + "step": 467 + }, + { + "ce_ib": 11.347843170166016, + "ce_orig": 1.0858099460601807, + "epoch": 0.1345891149615357, + "kl_loss": 5.0229034423828125, + "loss_ib": 0.061576876789331436, + "step": 468 + }, + { + "ce_ib": 12.166611671447754, + "ce_orig": 0.7169655561447144, + "epoch": 0.1345891149615357, + "kl_loss": 5.225884437561035, + "loss_ib": 0.06442546099424362, + "step": 468 + }, + { + "ce_ib": 13.179616928100586, + "ce_orig": 0.9957132339477539, + "epoch": 0.1345891149615357, + "kl_loss": 5.586675643920898, + "loss_ib": 0.06904637068510056, + "step": 468 + }, + { + "ce_ib": 10.90320110321045, + "ce_orig": 0.694044828414917, + "epoch": 0.1345891149615357, + "kl_loss": 4.852416038513184, + "loss_ib": 0.05942736193537712, + "step": 468 + }, + { + "ce_ib": 10.619782447814941, + "ce_orig": 0.4662715196609497, + "epoch": 0.13487669854051335, + "kl_loss": 5.184469223022461, + "loss_ib": 0.06246447563171387, + "step": 469 + }, + { + "ce_ib": 9.323065757751465, + "ce_orig": 0.6369857788085938, + "epoch": 0.13487669854051335, + "kl_loss": 5.136096000671387, + "loss_ib": 0.060684025287628174, + "step": 469 + }, + { + "ce_ib": 10.448844909667969, + "ce_orig": 0.4876580238342285, + "epoch": 0.13487669854051335, + "kl_loss": 5.2007832527160645, + "loss_ib": 0.062456678599119186, + "step": 469 + }, + { + "ce_ib": 11.4501371383667, + "ce_orig": 0.6889066100120544, + "epoch": 0.13487669854051335, + "kl_loss": 5.519144058227539, + "loss_ib": 0.06664157658815384, + "step": 469 + }, + { + "epoch": 0.13516428211949097, + "grad_norm": 0.4563974142074585, + "learning_rate": 9.994580206097524e-06, + "loss": 0.9271, + "step": 470 + }, + { + "ce_ib": 10.97485637664795, + "ce_orig": 0.6626176834106445, + "epoch": 0.13516428211949097, + "kl_loss": 5.277484893798828, + "loss_ib": 0.06374970078468323, + "step": 470 + }, + { + "ce_ib": 8.894420623779297, + "ce_orig": 0.4637753367424011, + "epoch": 0.13516428211949097, + "kl_loss": 5.26820182800293, + "loss_ib": 0.06157643720507622, + "step": 470 + }, + { + "ce_ib": 13.850231170654297, + "ce_orig": 0.8222572803497314, + "epoch": 0.13516428211949097, + "kl_loss": 3.3914904594421387, + "loss_ib": 0.047765135765075684, + "step": 470 + }, + { + "ce_ib": 9.599710464477539, + "ce_orig": 0.8736235499382019, + "epoch": 0.13516428211949097, + "kl_loss": 5.436291694641113, + "loss_ib": 0.06396262347698212, + "step": 470 + }, + { + "ce_ib": 9.535674095153809, + "ce_orig": 0.7246021032333374, + "epoch": 0.13545186569846862, + "kl_loss": 5.152594089508057, + "loss_ib": 0.061061613261699677, + "step": 471 + }, + { + "ce_ib": 11.557367324829102, + "ce_orig": 0.7219054102897644, + "epoch": 0.13545186569846862, + "kl_loss": 4.933101654052734, + "loss_ib": 0.0608883835375309, + "step": 471 + }, + { + "ce_ib": 11.222688674926758, + "ce_orig": 0.7581503987312317, + "epoch": 0.13545186569846862, + "kl_loss": 5.2493391036987305, + "loss_ib": 0.06371607631444931, + "step": 471 + }, + { + "ce_ib": 7.992416858673096, + "ce_orig": 0.7171717286109924, + "epoch": 0.13545186569846862, + "kl_loss": 5.1756591796875, + "loss_ib": 0.05974900722503662, + "step": 471 + }, + { + "ce_ib": 5.774598121643066, + "ce_orig": 0.2617477774620056, + "epoch": 0.13573944927744624, + "kl_loss": 3.714776039123535, + "loss_ib": 0.042922358959913254, + "step": 472 + }, + { + "ce_ib": 15.86950969696045, + "ce_orig": 1.3682712316513062, + "epoch": 0.13573944927744624, + "kl_loss": 5.002852439880371, + "loss_ib": 0.06589803844690323, + "step": 472 + }, + { + "ce_ib": 15.778973579406738, + "ce_orig": 1.778786063194275, + "epoch": 0.13573944927744624, + "kl_loss": 5.145055294036865, + "loss_ib": 0.06722952425479889, + "step": 472 + }, + { + "ce_ib": 9.892607688903809, + "ce_orig": 0.6026872992515564, + "epoch": 0.13573944927744624, + "kl_loss": 4.9220356941223145, + "loss_ib": 0.05911296233534813, + "step": 472 + }, + { + "ce_ib": 14.954146385192871, + "ce_orig": 1.6441103219985962, + "epoch": 0.1360270328564239, + "kl_loss": 5.022004127502441, + "loss_ib": 0.06517418473958969, + "step": 473 + }, + { + "ce_ib": 14.230586051940918, + "ce_orig": 1.1324756145477295, + "epoch": 0.1360270328564239, + "kl_loss": 5.008617401123047, + "loss_ib": 0.0643167570233345, + "step": 473 + }, + { + "ce_ib": 9.870515823364258, + "ce_orig": 0.50955730676651, + "epoch": 0.1360270328564239, + "kl_loss": 5.198309898376465, + "loss_ib": 0.06185361370444298, + "step": 473 + }, + { + "ce_ib": 7.58746862411499, + "ce_orig": 0.576608419418335, + "epoch": 0.1360270328564239, + "kl_loss": 5.101164817810059, + "loss_ib": 0.058599118143320084, + "step": 473 + }, + { + "ce_ib": 11.188053131103516, + "ce_orig": 1.0111771821975708, + "epoch": 0.13631461643540155, + "kl_loss": 4.980704307556152, + "loss_ib": 0.06099509447813034, + "step": 474 + }, + { + "ce_ib": 11.85481071472168, + "ce_orig": 0.8680632710456848, + "epoch": 0.13631461643540155, + "kl_loss": 4.838929653167725, + "loss_ib": 0.060244105756282806, + "step": 474 + }, + { + "ce_ib": 11.664010047912598, + "ce_orig": 0.6513270735740662, + "epoch": 0.13631461643540155, + "kl_loss": 4.969212532043457, + "loss_ib": 0.061356134712696075, + "step": 474 + }, + { + "ce_ib": 12.634693145751953, + "ce_orig": 0.6730305552482605, + "epoch": 0.13631461643540155, + "kl_loss": 4.7532453536987305, + "loss_ib": 0.06016714498400688, + "step": 474 + }, + { + "epoch": 0.13660220001437917, + "grad_norm": 0.41322335600852966, + "learning_rate": 9.99421293538767e-06, + "loss": 0.952, + "step": 475 + }, + { + "ce_ib": 9.952411651611328, + "ce_orig": 0.6906881928443909, + "epoch": 0.13660220001437917, + "kl_loss": 4.742203712463379, + "loss_ib": 0.05737444758415222, + "step": 475 + }, + { + "ce_ib": 11.219844818115234, + "ce_orig": 0.7193230986595154, + "epoch": 0.13660220001437917, + "kl_loss": 4.660732269287109, + "loss_ib": 0.057827167212963104, + "step": 475 + }, + { + "ce_ib": 10.129925727844238, + "ce_orig": 0.7533198595046997, + "epoch": 0.13660220001437917, + "kl_loss": 4.963289260864258, + "loss_ib": 0.05976282060146332, + "step": 475 + }, + { + "ce_ib": 8.743351936340332, + "ce_orig": 0.5432742238044739, + "epoch": 0.13660220001437917, + "kl_loss": 4.890501022338867, + "loss_ib": 0.05764836072921753, + "step": 475 + }, + { + "ce_ib": 16.304983139038086, + "ce_orig": 1.4945815801620483, + "epoch": 0.13688978359335682, + "kl_loss": 4.846185207366943, + "loss_ib": 0.06476683169603348, + "step": 476 + }, + { + "ce_ib": 15.700722694396973, + "ce_orig": 1.2550569772720337, + "epoch": 0.13688978359335682, + "kl_loss": 4.873666763305664, + "loss_ib": 0.0644373893737793, + "step": 476 + }, + { + "ce_ib": 11.274219512939453, + "ce_orig": 0.7325409054756165, + "epoch": 0.13688978359335682, + "kl_loss": 4.502358436584473, + "loss_ib": 0.056297801434993744, + "step": 476 + }, + { + "ce_ib": 12.488386154174805, + "ce_orig": 0.7823653221130371, + "epoch": 0.13688978359335682, + "kl_loss": 4.8517374992370605, + "loss_ib": 0.061005763709545135, + "step": 476 + }, + { + "ce_ib": 9.597527503967285, + "ce_orig": 0.8816280961036682, + "epoch": 0.13717736717233445, + "kl_loss": 4.802122592926025, + "loss_ib": 0.057618748396635056, + "step": 477 + }, + { + "ce_ib": 7.878790855407715, + "ce_orig": 0.669119119644165, + "epoch": 0.13717736717233445, + "kl_loss": 4.979962348937988, + "loss_ib": 0.05767841264605522, + "step": 477 + }, + { + "ce_ib": 10.391016960144043, + "ce_orig": 0.7251664400100708, + "epoch": 0.13717736717233445, + "kl_loss": 4.9956560134887695, + "loss_ib": 0.060347575694322586, + "step": 477 + }, + { + "ce_ib": 11.260332107543945, + "ce_orig": 0.7019518613815308, + "epoch": 0.13717736717233445, + "kl_loss": 4.841489791870117, + "loss_ib": 0.05967522785067558, + "step": 477 + }, + { + "ce_ib": 9.614903450012207, + "ce_orig": 0.8139093518257141, + "epoch": 0.1374649507513121, + "kl_loss": 4.899896621704102, + "loss_ib": 0.058613866567611694, + "step": 478 + }, + { + "ce_ib": 11.576051712036133, + "ce_orig": 0.5408310294151306, + "epoch": 0.1374649507513121, + "kl_loss": 4.424466133117676, + "loss_ib": 0.055820710957050323, + "step": 478 + }, + { + "ce_ib": 12.201531410217285, + "ce_orig": 1.1268466711044312, + "epoch": 0.1374649507513121, + "kl_loss": 4.69233512878418, + "loss_ib": 0.05912488326430321, + "step": 478 + }, + { + "ce_ib": 9.991912841796875, + "ce_orig": 0.9469978213310242, + "epoch": 0.1374649507513121, + "kl_loss": 4.918972969055176, + "loss_ib": 0.05918164178729057, + "step": 478 + }, + { + "ce_ib": 10.741558074951172, + "ce_orig": 0.4736107587814331, + "epoch": 0.13775253433028975, + "kl_loss": 4.647714138031006, + "loss_ib": 0.05721869692206383, + "step": 479 + }, + { + "ce_ib": 13.863398551940918, + "ce_orig": 0.7937755584716797, + "epoch": 0.13775253433028975, + "kl_loss": 4.548620700836182, + "loss_ib": 0.059349603950977325, + "step": 479 + }, + { + "ce_ib": 12.365586280822754, + "ce_orig": 0.7733124494552612, + "epoch": 0.13775253433028975, + "kl_loss": 4.751549243927002, + "loss_ib": 0.05988107621669769, + "step": 479 + }, + { + "ce_ib": 9.24503231048584, + "ce_orig": 0.6917009949684143, + "epoch": 0.13775253433028975, + "kl_loss": 4.599456787109375, + "loss_ib": 0.05523959919810295, + "step": 479 + }, + { + "epoch": 0.13804011790926737, + "grad_norm": 0.28194499015808105, + "learning_rate": 9.993833632459675e-06, + "loss": 0.9569, + "step": 480 + }, + { + "ce_ib": 12.575913429260254, + "ce_orig": 0.8253871202468872, + "epoch": 0.13804011790926737, + "kl_loss": 4.399908542633057, + "loss_ib": 0.056574996560811996, + "step": 480 + }, + { + "ce_ib": 12.079404830932617, + "ce_orig": 0.7573724985122681, + "epoch": 0.13804011790926737, + "kl_loss": 4.628897666931152, + "loss_ib": 0.05836838111281395, + "step": 480 + }, + { + "ce_ib": 14.135128021240234, + "ce_orig": 1.195788860321045, + "epoch": 0.13804011790926737, + "kl_loss": 4.788009166717529, + "loss_ib": 0.062015216797590256, + "step": 480 + }, + { + "ce_ib": 13.080334663391113, + "ce_orig": 1.0536785125732422, + "epoch": 0.13804011790926737, + "kl_loss": 4.861077308654785, + "loss_ib": 0.061691105365753174, + "step": 480 + }, + { + "ce_ib": 15.20880126953125, + "ce_orig": 1.4970366954803467, + "epoch": 0.13832770148824503, + "kl_loss": 4.581524848937988, + "loss_ib": 0.061024051159620285, + "step": 481 + }, + { + "ce_ib": 9.704122543334961, + "ce_orig": 0.7620049118995667, + "epoch": 0.13832770148824503, + "kl_loss": 4.4944915771484375, + "loss_ib": 0.05464903637766838, + "step": 481 + }, + { + "ce_ib": 14.204291343688965, + "ce_orig": 1.2470651865005493, + "epoch": 0.13832770148824503, + "kl_loss": 4.557282447814941, + "loss_ib": 0.059777114540338516, + "step": 481 + }, + { + "ce_ib": 10.805310249328613, + "ce_orig": 0.8926163911819458, + "epoch": 0.13832770148824503, + "kl_loss": 4.54813289642334, + "loss_ib": 0.05628664046525955, + "step": 481 + }, + { + "ce_ib": 8.615751266479492, + "ce_orig": 0.7180139422416687, + "epoch": 0.13861528506722265, + "kl_loss": 4.497965335845947, + "loss_ib": 0.0535954050719738, + "step": 482 + }, + { + "ce_ib": 13.28380012512207, + "ce_orig": 0.8565104007720947, + "epoch": 0.13861528506722265, + "kl_loss": 4.163414001464844, + "loss_ib": 0.05491793900728226, + "step": 482 + }, + { + "ce_ib": 16.117412567138672, + "ce_orig": 1.5288479328155518, + "epoch": 0.13861528506722265, + "kl_loss": 4.652551651000977, + "loss_ib": 0.06264292448759079, + "step": 482 + }, + { + "ce_ib": 15.705317497253418, + "ce_orig": 1.3943670988082886, + "epoch": 0.13861528506722265, + "kl_loss": 4.385931015014648, + "loss_ib": 0.05956462770700455, + "step": 482 + }, + { + "ce_ib": 12.427248001098633, + "ce_orig": 0.8222273588180542, + "epoch": 0.1389028686462003, + "kl_loss": 4.483578681945801, + "loss_ib": 0.05726303532719612, + "step": 483 + }, + { + "ce_ib": 9.109222412109375, + "ce_orig": 0.6663987636566162, + "epoch": 0.1389028686462003, + "kl_loss": 4.623089790344238, + "loss_ib": 0.05534011870622635, + "step": 483 + }, + { + "ce_ib": 16.840740203857422, + "ce_orig": 1.5569089651107788, + "epoch": 0.1389028686462003, + "kl_loss": 4.513226509094238, + "loss_ib": 0.061973001807928085, + "step": 483 + }, + { + "ce_ib": 13.14278507232666, + "ce_orig": 0.7401519417762756, + "epoch": 0.1389028686462003, + "kl_loss": 4.656795978546143, + "loss_ib": 0.05971074476838112, + "step": 483 + }, + { + "ce_ib": 14.228574752807617, + "ce_orig": 0.5264460444450378, + "epoch": 0.13919045222517795, + "kl_loss": 4.304325580596924, + "loss_ib": 0.05727183073759079, + "step": 484 + }, + { + "ce_ib": 16.023216247558594, + "ce_orig": 1.285567283630371, + "epoch": 0.13919045222517795, + "kl_loss": 4.187580585479736, + "loss_ib": 0.05789902061223984, + "step": 484 + }, + { + "ce_ib": 15.015509605407715, + "ce_orig": 1.6406548023223877, + "epoch": 0.13919045222517795, + "kl_loss": 4.492351531982422, + "loss_ib": 0.059939026832580566, + "step": 484 + }, + { + "ce_ib": 13.410754203796387, + "ce_orig": 1.4283090829849243, + "epoch": 0.13919045222517795, + "kl_loss": 4.561341762542725, + "loss_ib": 0.05902417004108429, + "step": 484 + }, + { + "epoch": 0.13947803580415558, + "grad_norm": 0.42398667335510254, + "learning_rate": 9.993442298227365e-06, + "loss": 1.0074, + "step": 485 + }, + { + "ce_ib": 16.616222381591797, + "ce_orig": 1.3357499837875366, + "epoch": 0.13947803580415558, + "kl_loss": 4.212893486022949, + "loss_ib": 0.05874515324831009, + "step": 485 + }, + { + "ce_ib": 11.187970161437988, + "ce_orig": 0.9521239995956421, + "epoch": 0.13947803580415558, + "kl_loss": 4.148205757141113, + "loss_ib": 0.05267002806067467, + "step": 485 + }, + { + "ce_ib": 11.176460266113281, + "ce_orig": 0.48416224122047424, + "epoch": 0.13947803580415558, + "kl_loss": 4.578839302062988, + "loss_ib": 0.05696485564112663, + "step": 485 + }, + { + "ce_ib": 14.064797401428223, + "ce_orig": 1.0060439109802246, + "epoch": 0.13947803580415558, + "kl_loss": 4.404331207275391, + "loss_ib": 0.05810810998082161, + "step": 485 + }, + { + "ce_ib": 15.115540504455566, + "ce_orig": 0.9532531499862671, + "epoch": 0.13976561938313323, + "kl_loss": 4.2361860275268555, + "loss_ib": 0.057477399706840515, + "step": 486 + }, + { + "ce_ib": 9.784770965576172, + "ce_orig": 0.7604672312736511, + "epoch": 0.13976561938313323, + "kl_loss": 3.9188036918640137, + "loss_ib": 0.048972804099321365, + "step": 486 + }, + { + "ce_ib": 10.663135528564453, + "ce_orig": 1.0786744356155396, + "epoch": 0.13976561938313323, + "kl_loss": 4.246434211730957, + "loss_ib": 0.0531274788081646, + "step": 486 + }, + { + "ce_ib": 8.10494613647461, + "ce_orig": 0.5798110961914062, + "epoch": 0.13976561938313323, + "kl_loss": 4.231927871704102, + "loss_ib": 0.05042422190308571, + "step": 486 + }, + { + "ce_ib": 12.337913513183594, + "ce_orig": 0.7753936648368835, + "epoch": 0.14005320296211085, + "kl_loss": 4.142770290374756, + "loss_ib": 0.053765613585710526, + "step": 487 + }, + { + "ce_ib": 15.07596206665039, + "ce_orig": 0.8154249787330627, + "epoch": 0.14005320296211085, + "kl_loss": 4.023566246032715, + "loss_ib": 0.055311620235443115, + "step": 487 + }, + { + "ce_ib": 13.26456069946289, + "ce_orig": 1.3517224788665771, + "epoch": 0.14005320296211085, + "kl_loss": 4.021327018737793, + "loss_ib": 0.053477831184864044, + "step": 487 + }, + { + "ce_ib": 11.321921348571777, + "ce_orig": 0.6585462093353271, + "epoch": 0.14005320296211085, + "kl_loss": 3.9664478302001953, + "loss_ib": 0.050986398011446, + "step": 487 + }, + { + "ce_ib": 11.75551700592041, + "ce_orig": 0.7766084671020508, + "epoch": 0.1403407865410885, + "kl_loss": 3.7924036979675293, + "loss_ib": 0.049679554998874664, + "step": 488 + }, + { + "ce_ib": 15.715422630310059, + "ce_orig": 1.3684426546096802, + "epoch": 0.1403407865410885, + "kl_loss": 3.701831817626953, + "loss_ib": 0.05273373797535896, + "step": 488 + }, + { + "ce_ib": 8.13493824005127, + "ce_orig": 0.45366278290748596, + "epoch": 0.1403407865410885, + "kl_loss": 4.0575456619262695, + "loss_ib": 0.04871039465069771, + "step": 488 + }, + { + "ce_ib": 13.73061752319336, + "ce_orig": 1.1564494371414185, + "epoch": 0.1403407865410885, + "kl_loss": 4.102625846862793, + "loss_ib": 0.05475687235593796, + "step": 488 + }, + { + "ce_ib": 13.535247802734375, + "ce_orig": 0.9498729109764099, + "epoch": 0.14062837012006615, + "kl_loss": 2.995370864868164, + "loss_ib": 0.04348895326256752, + "step": 489 + }, + { + "ce_ib": 7.940598487854004, + "ce_orig": 0.3024381101131439, + "epoch": 0.14062837012006615, + "kl_loss": 3.7741479873657227, + "loss_ib": 0.04568207636475563, + "step": 489 + }, + { + "ce_ib": 11.454663276672363, + "ce_orig": 0.6730047464370728, + "epoch": 0.14062837012006615, + "kl_loss": 3.8181753158569336, + "loss_ib": 0.049636416137218475, + "step": 489 + }, + { + "ce_ib": 8.796786308288574, + "ce_orig": 0.7032504677772522, + "epoch": 0.14062837012006615, + "kl_loss": 3.953084945678711, + "loss_ib": 0.04832763597369194, + "step": 489 + }, + { + "epoch": 0.14091595369904378, + "grad_norm": 0.40750885009765625, + "learning_rate": 9.993038933633556e-06, + "loss": 0.9795, + "step": 490 + }, + { + "ce_ib": 9.949647903442383, + "ce_orig": 0.5322861671447754, + "epoch": 0.14091595369904378, + "kl_loss": 3.8681774139404297, + "loss_ib": 0.048631418496370316, + "step": 490 + }, + { + "ce_ib": 15.904637336730957, + "ce_orig": 1.6171191930770874, + "epoch": 0.14091595369904378, + "kl_loss": 3.6979196071624756, + "loss_ib": 0.05288383364677429, + "step": 490 + }, + { + "ce_ib": 12.01391315460205, + "ce_orig": 1.2611167430877686, + "epoch": 0.14091595369904378, + "kl_loss": 3.5611257553100586, + "loss_ib": 0.04762516915798187, + "step": 490 + }, + { + "ce_ib": 9.39164924621582, + "ce_orig": 0.9851351380348206, + "epoch": 0.14091595369904378, + "kl_loss": 3.2981302738189697, + "loss_ib": 0.04237294942140579, + "step": 490 + }, + { + "ce_ib": 12.360732078552246, + "ce_orig": 1.4508610963821411, + "epoch": 0.14120353727802143, + "kl_loss": 3.5956835746765137, + "loss_ib": 0.0483175665140152, + "step": 491 + }, + { + "ce_ib": 12.937994003295898, + "ce_orig": 1.1618151664733887, + "epoch": 0.14120353727802143, + "kl_loss": 3.563871383666992, + "loss_ib": 0.048576705157756805, + "step": 491 + }, + { + "ce_ib": 17.118064880371094, + "ce_orig": 1.445876121520996, + "epoch": 0.14120353727802143, + "kl_loss": 3.657188892364502, + "loss_ib": 0.05368995666503906, + "step": 491 + }, + { + "ce_ib": 11.58482837677002, + "ce_orig": 0.9607007503509521, + "epoch": 0.14120353727802143, + "kl_loss": 3.736398458480835, + "loss_ib": 0.04894881322979927, + "step": 491 + }, + { + "ce_ib": 8.748994827270508, + "ce_orig": 0.9032168984413147, + "epoch": 0.14149112085699905, + "kl_loss": 3.5443851947784424, + "loss_ib": 0.0441928468644619, + "step": 492 + }, + { + "ce_ib": 10.45909309387207, + "ce_orig": 0.7486007809638977, + "epoch": 0.14149112085699905, + "kl_loss": 3.231372594833374, + "loss_ib": 0.042772818356752396, + "step": 492 + }, + { + "ce_ib": 17.54129409790039, + "ce_orig": 1.9069491624832153, + "epoch": 0.14149112085699905, + "kl_loss": 3.558845043182373, + "loss_ib": 0.05312974378466606, + "step": 492 + }, + { + "ce_ib": 15.910091400146484, + "ce_orig": 1.024116039276123, + "epoch": 0.14149112085699905, + "kl_loss": 3.7413394451141357, + "loss_ib": 0.05332348868250847, + "step": 492 + }, + { + "ce_ib": 8.667006492614746, + "ce_orig": 0.30886101722717285, + "epoch": 0.1417787044359767, + "kl_loss": 3.7348480224609375, + "loss_ib": 0.04601548612117767, + "step": 493 + }, + { + "ce_ib": 11.867700576782227, + "ce_orig": 1.1173264980316162, + "epoch": 0.1417787044359767, + "kl_loss": 3.64851713180542, + "loss_ib": 0.04835287109017372, + "step": 493 + }, + { + "ce_ib": 9.549649238586426, + "ce_orig": 0.6670407056808472, + "epoch": 0.1417787044359767, + "kl_loss": 3.220088481903076, + "loss_ib": 0.04175053536891937, + "step": 493 + }, + { + "ce_ib": 12.184220314025879, + "ce_orig": 0.7223286032676697, + "epoch": 0.1417787044359767, + "kl_loss": 2.9731616973876953, + "loss_ib": 0.041915833950042725, + "step": 493 + }, + { + "ce_ib": 8.040367126464844, + "ce_orig": 0.49942535161972046, + "epoch": 0.14206628801495436, + "kl_loss": 3.2337100505828857, + "loss_ib": 0.04037746787071228, + "step": 494 + }, + { + "ce_ib": 9.84688949584961, + "ce_orig": 0.3545916974544525, + "epoch": 0.14206628801495436, + "kl_loss": 3.2256717681884766, + "loss_ib": 0.0421036034822464, + "step": 494 + }, + { + "ce_ib": 13.763298034667969, + "ce_orig": 1.4035097360610962, + "epoch": 0.14206628801495436, + "kl_loss": 3.3014750480651855, + "loss_ib": 0.04677804931998253, + "step": 494 + }, + { + "ce_ib": 10.143733978271484, + "ce_orig": 0.5595765709877014, + "epoch": 0.14206628801495436, + "kl_loss": 3.2254323959350586, + "loss_ib": 0.042398057878017426, + "step": 494 + }, + { + "epoch": 0.14235387159393198, + "grad_norm": 0.3967653214931488, + "learning_rate": 9.992623539650048e-06, + "loss": 0.947, + "step": 495 + }, + { + "ce_ib": 8.59465217590332, + "ce_orig": 0.6234766840934753, + "epoch": 0.14235387159393198, + "kl_loss": 2.992295026779175, + "loss_ib": 0.038517601788043976, + "step": 495 + }, + { + "ce_ib": 11.884413719177246, + "ce_orig": 1.2884939908981323, + "epoch": 0.14235387159393198, + "kl_loss": 3.2238070964813232, + "loss_ib": 0.04412248358130455, + "step": 495 + }, + { + "ce_ib": 12.295158386230469, + "ce_orig": 0.9079825282096863, + "epoch": 0.14235387159393198, + "kl_loss": 3.261247158050537, + "loss_ib": 0.04490762948989868, + "step": 495 + }, + { + "ce_ib": 11.556446075439453, + "ce_orig": 0.9584062695503235, + "epoch": 0.14235387159393198, + "kl_loss": 2.946765661239624, + "loss_ib": 0.0410241037607193, + "step": 495 + }, + { + "ce_ib": 13.715659141540527, + "ce_orig": 1.301735520362854, + "epoch": 0.14264145517290963, + "kl_loss": 3.156116485595703, + "loss_ib": 0.04527682065963745, + "step": 496 + }, + { + "ce_ib": 12.423558235168457, + "ce_orig": 0.9063977599143982, + "epoch": 0.14264145517290963, + "kl_loss": 3.1704659461975098, + "loss_ib": 0.04412821680307388, + "step": 496 + }, + { + "ce_ib": 15.041820526123047, + "ce_orig": 0.9676111340522766, + "epoch": 0.14264145517290963, + "kl_loss": 3.3395094871520996, + "loss_ib": 0.04843691736459732, + "step": 496 + }, + { + "ce_ib": 17.061294555664062, + "ce_orig": 2.1067917346954346, + "epoch": 0.14264145517290963, + "kl_loss": 3.0767550468444824, + "loss_ib": 0.04782884567975998, + "step": 496 + }, + { + "ce_ib": 11.296546936035156, + "ce_orig": 0.9768355488777161, + "epoch": 0.14292903875188726, + "kl_loss": 2.777761936187744, + "loss_ib": 0.03907416760921478, + "step": 497 + }, + { + "ce_ib": 11.038111686706543, + "ce_orig": 0.8759608268737793, + "epoch": 0.14292903875188726, + "kl_loss": 3.004284381866455, + "loss_ib": 0.04108095541596413, + "step": 497 + }, + { + "ce_ib": 11.033641815185547, + "ce_orig": 0.5562906265258789, + "epoch": 0.14292903875188726, + "kl_loss": 2.9936814308166504, + "loss_ib": 0.04097045958042145, + "step": 497 + }, + { + "ce_ib": 11.028107643127441, + "ce_orig": 1.197322130203247, + "epoch": 0.14292903875188726, + "kl_loss": 2.7564334869384766, + "loss_ib": 0.038592442870140076, + "step": 497 + }, + { + "ce_ib": 14.40247631072998, + "ce_orig": 1.2345876693725586, + "epoch": 0.1432166223308649, + "kl_loss": 2.6974658966064453, + "loss_ib": 0.04137713462114334, + "step": 498 + }, + { + "ce_ib": 12.58112621307373, + "ce_orig": 0.6902965903282166, + "epoch": 0.1432166223308649, + "kl_loss": 2.812513828277588, + "loss_ib": 0.04070626199245453, + "step": 498 + }, + { + "ce_ib": 10.28808879852295, + "ce_orig": 0.5265849232673645, + "epoch": 0.1432166223308649, + "kl_loss": 2.965153217315674, + "loss_ib": 0.039939623326063156, + "step": 498 + }, + { + "ce_ib": 11.784356117248535, + "ce_orig": 0.7255687117576599, + "epoch": 0.1432166223308649, + "kl_loss": 2.6487035751342773, + "loss_ib": 0.03827139362692833, + "step": 498 + }, + { + "ce_ib": 15.454878807067871, + "ce_orig": 1.4307360649108887, + "epoch": 0.14350420590984256, + "kl_loss": 2.638218641281128, + "loss_ib": 0.04183706268668175, + "step": 499 + }, + { + "ce_ib": 11.41547679901123, + "ce_orig": 1.0924228429794312, + "epoch": 0.14350420590984256, + "kl_loss": 2.683103084564209, + "loss_ib": 0.038246504962444305, + "step": 499 + }, + { + "ce_ib": 7.627654552459717, + "ce_orig": 0.6519067883491516, + "epoch": 0.14350420590984256, + "kl_loss": 2.4674363136291504, + "loss_ib": 0.032302018254995346, + "step": 499 + }, + { + "ce_ib": 9.059030532836914, + "ce_orig": 0.9335259199142456, + "epoch": 0.14350420590984256, + "kl_loss": 2.7982892990112305, + "loss_ib": 0.03704192489385605, + "step": 499 + }, + { + "epoch": 0.14379178948882018, + "grad_norm": 0.3535037636756897, + "learning_rate": 9.99219611727762e-06, + "loss": 0.9542, + "step": 500 + }, + { + "ce_ib": 12.202539443969727, + "ce_orig": 1.0921835899353027, + "epoch": 0.14379178948882018, + "kl_loss": 2.6520490646362305, + "loss_ib": 0.03872302919626236, + "step": 500 + }, + { + "ce_ib": 10.411079406738281, + "ce_orig": 1.0469058752059937, + "epoch": 0.14379178948882018, + "kl_loss": 2.550952434539795, + "loss_ib": 0.035920605063438416, + "step": 500 + }, + { + "ce_ib": 10.579100608825684, + "ce_orig": 0.7717140316963196, + "epoch": 0.14379178948882018, + "kl_loss": 2.458261013031006, + "loss_ib": 0.03516170755028725, + "step": 500 + }, + { + "ce_ib": 10.30762004852295, + "ce_orig": 0.47040054202079773, + "epoch": 0.14379178948882018, + "kl_loss": 2.5514392852783203, + "loss_ib": 0.03582201525568962, + "step": 500 + }, + { + "ce_ib": 8.242244720458984, + "ce_orig": 0.7109037041664124, + "epoch": 0.14407937306779783, + "kl_loss": 2.2276763916015625, + "loss_ib": 0.03051900863647461, + "step": 501 + }, + { + "ce_ib": 12.867658615112305, + "ce_orig": 1.3448221683502197, + "epoch": 0.14407937306779783, + "kl_loss": 2.390228271484375, + "loss_ib": 0.036769941449165344, + "step": 501 + }, + { + "ce_ib": 12.047565460205078, + "ce_orig": 1.1863359212875366, + "epoch": 0.14407937306779783, + "kl_loss": 2.2493762969970703, + "loss_ib": 0.034541331231594086, + "step": 501 + }, + { + "ce_ib": 7.998773574829102, + "ce_orig": 0.7724082469940186, + "epoch": 0.14407937306779783, + "kl_loss": 2.218076229095459, + "loss_ib": 0.03017953597009182, + "step": 501 + }, + { + "ce_ib": 8.808771133422852, + "ce_orig": 0.346529096364975, + "epoch": 0.14436695664677546, + "kl_loss": 2.290055751800537, + "loss_ib": 0.031709328293800354, + "step": 502 + }, + { + "ce_ib": 14.042349815368652, + "ce_orig": 0.8455150723457336, + "epoch": 0.14436695664677546, + "kl_loss": 2.330575466156006, + "loss_ib": 0.037348102778196335, + "step": 502 + }, + { + "ce_ib": 9.860107421875, + "ce_orig": 0.6380610466003418, + "epoch": 0.14436695664677546, + "kl_loss": 2.2274303436279297, + "loss_ib": 0.03213440999388695, + "step": 502 + }, + { + "ce_ib": 7.90905237197876, + "ce_orig": 0.7825286388397217, + "epoch": 0.14436695664677546, + "kl_loss": 2.2258763313293457, + "loss_ib": 0.03016781434416771, + "step": 502 + }, + { + "ce_ib": 11.905344009399414, + "ce_orig": 0.5903149247169495, + "epoch": 0.1446545402257531, + "kl_loss": 2.3541271686553955, + "loss_ib": 0.035446614027023315, + "step": 503 + }, + { + "ce_ib": 9.341203689575195, + "ce_orig": 0.5810147523880005, + "epoch": 0.1446545402257531, + "kl_loss": 1.9661822319030762, + "loss_ib": 0.029003025963902473, + "step": 503 + }, + { + "ce_ib": 11.79522705078125, + "ce_orig": 0.9077520966529846, + "epoch": 0.1446545402257531, + "kl_loss": 2.15496826171875, + "loss_ib": 0.03334490954875946, + "step": 503 + }, + { + "ce_ib": 12.743915557861328, + "ce_orig": 0.9383360147476196, + "epoch": 0.1446545402257531, + "kl_loss": 2.3349556922912598, + "loss_ib": 0.03609347343444824, + "step": 503 + }, + { + "ce_ib": 14.691107749938965, + "ce_orig": 1.4200998544692993, + "epoch": 0.14494212380473076, + "kl_loss": 1.9536817073822021, + "loss_ib": 0.034227922558784485, + "step": 504 + }, + { + "ce_ib": 13.637428283691406, + "ce_orig": 1.0158778429031372, + "epoch": 0.14494212380473076, + "kl_loss": 2.0383381843566895, + "loss_ib": 0.0340208075940609, + "step": 504 + }, + { + "ce_ib": 12.56800651550293, + "ce_orig": 0.9646020531654358, + "epoch": 0.14494212380473076, + "kl_loss": 2.094742774963379, + "loss_ib": 0.033515434712171555, + "step": 504 + }, + { + "ce_ib": 12.06141185760498, + "ce_orig": 1.0306986570358276, + "epoch": 0.14494212380473076, + "kl_loss": 2.175787925720215, + "loss_ib": 0.0338192917406559, + "step": 504 + }, + { + "epoch": 0.14522970738370838, + "grad_norm": 0.33350127935409546, + "learning_rate": 9.991756667546032e-06, + "loss": 0.9489, + "step": 505 + }, + { + "ce_ib": 8.817842483520508, + "ce_orig": 0.6972077488899231, + "epoch": 0.14522970738370838, + "kl_loss": 1.7852230072021484, + "loss_ib": 0.026670072227716446, + "step": 505 + }, + { + "ce_ib": 12.437209129333496, + "ce_orig": 0.6099770069122314, + "epoch": 0.14522970738370838, + "kl_loss": 2.4981532096862793, + "loss_ib": 0.03741874173283577, + "step": 505 + }, + { + "ce_ib": 11.220137596130371, + "ce_orig": 0.48771539330482483, + "epoch": 0.14522970738370838, + "kl_loss": 2.1456246376037598, + "loss_ib": 0.03267638385295868, + "step": 505 + }, + { + "ce_ib": 8.924522399902344, + "ce_orig": 0.5051496624946594, + "epoch": 0.14522970738370838, + "kl_loss": 1.9803262948989868, + "loss_ib": 0.028727782890200615, + "step": 505 + }, + { + "ce_ib": 12.348464012145996, + "ce_orig": 0.8960937857627869, + "epoch": 0.14551729096268604, + "kl_loss": 1.814281702041626, + "loss_ib": 0.030491279438138008, + "step": 506 + }, + { + "ce_ib": 7.2094316482543945, + "ce_orig": 0.47113969922065735, + "epoch": 0.14551729096268604, + "kl_loss": 1.9257069826126099, + "loss_ib": 0.026466500014066696, + "step": 506 + }, + { + "ce_ib": 11.242286682128906, + "ce_orig": 0.4751087427139282, + "epoch": 0.14551729096268604, + "kl_loss": 2.0415260791778564, + "loss_ib": 0.03165754675865173, + "step": 506 + }, + { + "ce_ib": 10.97330379486084, + "ce_orig": 0.642387866973877, + "epoch": 0.14551729096268604, + "kl_loss": 2.0187416076660156, + "loss_ib": 0.03116072155535221, + "step": 506 + }, + { + "ce_ib": 16.359783172607422, + "ce_orig": 1.645643949508667, + "epoch": 0.14580487454166366, + "kl_loss": 1.8092081546783447, + "loss_ib": 0.034451864659786224, + "step": 507 + }, + { + "ce_ib": 10.260422706604004, + "ce_orig": 0.9393115639686584, + "epoch": 0.14580487454166366, + "kl_loss": 2.0392160415649414, + "loss_ib": 0.03065258450806141, + "step": 507 + }, + { + "ce_ib": 15.242369651794434, + "ce_orig": 0.9167593717575073, + "epoch": 0.14580487454166366, + "kl_loss": 1.7671477794647217, + "loss_ib": 0.03291384503245354, + "step": 507 + }, + { + "ce_ib": 11.220046043395996, + "ce_orig": 0.7460023164749146, + "epoch": 0.14580487454166366, + "kl_loss": 1.9201838970184326, + "loss_ib": 0.03042188659310341, + "step": 507 + }, + { + "ce_ib": 10.025154113769531, + "ce_orig": 0.6676295399665833, + "epoch": 0.1460924581206413, + "kl_loss": 2.052180767059326, + "loss_ib": 0.03054695948958397, + "step": 508 + }, + { + "ce_ib": 14.332385063171387, + "ce_orig": 0.7957293391227722, + "epoch": 0.1460924581206413, + "kl_loss": 1.7676377296447754, + "loss_ib": 0.032008763402700424, + "step": 508 + }, + { + "ce_ib": 14.902412414550781, + "ce_orig": 1.4487565755844116, + "epoch": 0.1460924581206413, + "kl_loss": 1.6765596866607666, + "loss_ib": 0.031668007373809814, + "step": 508 + }, + { + "ce_ib": 9.683943748474121, + "ce_orig": 1.0197540521621704, + "epoch": 0.1460924581206413, + "kl_loss": 1.610795021057129, + "loss_ib": 0.025791892781853676, + "step": 508 + }, + { + "ce_ib": 15.52167797088623, + "ce_orig": 1.7749841213226318, + "epoch": 0.14638004169961896, + "kl_loss": 1.6767610311508179, + "loss_ib": 0.0322892889380455, + "step": 509 + }, + { + "ce_ib": 13.908830642700195, + "ce_orig": 0.9751031994819641, + "epoch": 0.14638004169961896, + "kl_loss": 1.6931664943695068, + "loss_ib": 0.030840495601296425, + "step": 509 + }, + { + "ce_ib": 9.30358600616455, + "ce_orig": 0.5810970664024353, + "epoch": 0.14638004169961896, + "kl_loss": 1.6350352764129639, + "loss_ib": 0.025653937831521034, + "step": 509 + }, + { + "ce_ib": 12.388398170471191, + "ce_orig": 1.3628201484680176, + "epoch": 0.14638004169961896, + "kl_loss": 1.6909823417663574, + "loss_ib": 0.029298221692442894, + "step": 509 + }, + { + "epoch": 0.1466676252785966, + "grad_norm": 0.18405361473560333, + "learning_rate": 9.991305191514018e-06, + "loss": 0.8849, + "step": 510 + }, + { + "ce_ib": 8.311685562133789, + "ce_orig": 0.7321364283561707, + "epoch": 0.1466676252785966, + "kl_loss": 1.6412349939346313, + "loss_ib": 0.02472403459250927, + "step": 510 + }, + { + "ce_ib": 15.0967378616333, + "ce_orig": 1.2958650588989258, + "epoch": 0.1466676252785966, + "kl_loss": 1.786908745765686, + "loss_ib": 0.032965827733278275, + "step": 510 + }, + { + "ce_ib": 17.36980438232422, + "ce_orig": 1.9791719913482666, + "epoch": 0.1466676252785966, + "kl_loss": 1.8581990003585815, + "loss_ib": 0.03595179319381714, + "step": 510 + }, + { + "ce_ib": 12.289637565612793, + "ce_orig": 0.7127004265785217, + "epoch": 0.1466676252785966, + "kl_loss": 1.685407280921936, + "loss_ib": 0.02914370968937874, + "step": 510 + }, + { + "ce_ib": 15.004842758178711, + "ce_orig": 1.406548023223877, + "epoch": 0.14695520885757424, + "kl_loss": 1.5896825790405273, + "loss_ib": 0.030901670455932617, + "step": 511 + }, + { + "ce_ib": 10.542769432067871, + "ce_orig": 0.7553672790527344, + "epoch": 0.14695520885757424, + "kl_loss": 1.7919988632202148, + "loss_ib": 0.028462758287787437, + "step": 511 + }, + { + "ce_ib": 13.622482299804688, + "ce_orig": 1.207612156867981, + "epoch": 0.14695520885757424, + "kl_loss": 1.765162706375122, + "loss_ib": 0.03127410635352135, + "step": 511 + }, + { + "ce_ib": 8.20363998413086, + "ce_orig": 0.6386350989341736, + "epoch": 0.14695520885757424, + "kl_loss": 1.5854518413543701, + "loss_ib": 0.024058157578110695, + "step": 511 + }, + { + "ce_ib": 13.55444622039795, + "ce_orig": 1.339708924293518, + "epoch": 0.14724279243655186, + "kl_loss": 1.6126033067703247, + "loss_ib": 0.029680481180548668, + "step": 512 + }, + { + "ce_ib": 8.628886222839355, + "ce_orig": 0.5619939565658569, + "epoch": 0.14724279243655186, + "kl_loss": 1.6175487041473389, + "loss_ib": 0.02480437234044075, + "step": 512 + }, + { + "ce_ib": 10.747875213623047, + "ce_orig": 0.8628310561180115, + "epoch": 0.14724279243655186, + "kl_loss": 1.5580902099609375, + "loss_ib": 0.02632877789437771, + "step": 512 + }, + { + "ce_ib": 11.197033882141113, + "ce_orig": 0.588740348815918, + "epoch": 0.14724279243655186, + "kl_loss": 1.625337839126587, + "loss_ib": 0.02745041251182556, + "step": 512 + }, + { + "ce_ib": 17.5482177734375, + "ce_orig": 1.566094160079956, + "epoch": 0.1475303760155295, + "kl_loss": 1.5798025131225586, + "loss_ib": 0.03334624320268631, + "step": 513 + }, + { + "ce_ib": 9.978529930114746, + "ce_orig": 0.9000970125198364, + "epoch": 0.1475303760155295, + "kl_loss": 1.5787134170532227, + "loss_ib": 0.025765664875507355, + "step": 513 + }, + { + "ce_ib": 7.795269012451172, + "ce_orig": 0.5428386926651001, + "epoch": 0.1475303760155295, + "kl_loss": 1.6383824348449707, + "loss_ib": 0.024179093539714813, + "step": 513 + }, + { + "ce_ib": 8.96078109741211, + "ce_orig": 0.45578211545944214, + "epoch": 0.1475303760155295, + "kl_loss": 1.5609797239303589, + "loss_ib": 0.024570578709244728, + "step": 513 + }, + { + "ce_ib": 14.166945457458496, + "ce_orig": 1.237365484237671, + "epoch": 0.14781795959450716, + "kl_loss": 1.5814361572265625, + "loss_ib": 0.029981307685375214, + "step": 514 + }, + { + "ce_ib": 15.119035720825195, + "ce_orig": 1.146514654159546, + "epoch": 0.14781795959450716, + "kl_loss": 1.787841796875, + "loss_ib": 0.032997455447912216, + "step": 514 + }, + { + "ce_ib": 11.368758201599121, + "ce_orig": 1.2245467901229858, + "epoch": 0.14781795959450716, + "kl_loss": 1.763063669204712, + "loss_ib": 0.028999393805861473, + "step": 514 + }, + { + "ce_ib": 10.331769943237305, + "ce_orig": 0.8155557513237, + "epoch": 0.14781795959450716, + "kl_loss": 1.5401368141174316, + "loss_ib": 0.025733135640621185, + "step": 514 + }, + { + "epoch": 0.1481055431734848, + "grad_norm": 0.10760627686977386, + "learning_rate": 9.990841690269293e-06, + "loss": 0.9116, + "step": 515 + }, + { + "ce_ib": 9.445401191711426, + "ce_orig": 0.5457524061203003, + "epoch": 0.1481055431734848, + "kl_loss": 1.5743253231048584, + "loss_ib": 0.02518865466117859, + "step": 515 + }, + { + "ce_ib": 12.43077278137207, + "ce_orig": 1.0758410692214966, + "epoch": 0.1481055431734848, + "kl_loss": 1.5002497434616089, + "loss_ib": 0.02743327058851719, + "step": 515 + }, + { + "ce_ib": 9.92260456085205, + "ce_orig": 0.9019168615341187, + "epoch": 0.1481055431734848, + "kl_loss": 1.5244628190994263, + "loss_ib": 0.025167230516672134, + "step": 515 + }, + { + "ce_ib": 10.745101928710938, + "ce_orig": 0.7567242980003357, + "epoch": 0.1481055431734848, + "kl_loss": 1.851075530052185, + "loss_ib": 0.029255857691168785, + "step": 515 + }, + { + "ce_ib": 11.868610382080078, + "ce_orig": 1.2269304990768433, + "epoch": 0.14839312675246244, + "kl_loss": 1.5262444019317627, + "loss_ib": 0.027131054550409317, + "step": 516 + }, + { + "ce_ib": 12.609333992004395, + "ce_orig": 1.2865411043167114, + "epoch": 0.14839312675246244, + "kl_loss": 1.5142364501953125, + "loss_ib": 0.027751697227358818, + "step": 516 + }, + { + "ce_ib": 15.365309715270996, + "ce_orig": 1.3366944789886475, + "epoch": 0.14839312675246244, + "kl_loss": 1.5448601245880127, + "loss_ib": 0.03081391006708145, + "step": 516 + }, + { + "ce_ib": 7.554969787597656, + "ce_orig": 0.7058290243148804, + "epoch": 0.14839312675246244, + "kl_loss": 1.5178661346435547, + "loss_ib": 0.02273363061249256, + "step": 516 + }, + { + "ce_ib": 8.25823974609375, + "ce_orig": 0.839113175868988, + "epoch": 0.14868071033144006, + "kl_loss": 1.557971477508545, + "loss_ib": 0.023837953805923462, + "step": 517 + }, + { + "ce_ib": 12.297826766967773, + "ce_orig": 1.190186619758606, + "epoch": 0.14868071033144006, + "kl_loss": 1.658672571182251, + "loss_ib": 0.02888455241918564, + "step": 517 + }, + { + "ce_ib": 13.659313201904297, + "ce_orig": 1.3207685947418213, + "epoch": 0.14868071033144006, + "kl_loss": 1.6122772693634033, + "loss_ib": 0.029782084748148918, + "step": 517 + }, + { + "ce_ib": 12.351545333862305, + "ce_orig": 0.750295877456665, + "epoch": 0.14868071033144006, + "kl_loss": 1.6051900386810303, + "loss_ib": 0.028403444215655327, + "step": 517 + }, + { + "ce_ib": 11.500561714172363, + "ce_orig": 0.8358568549156189, + "epoch": 0.14896829391041772, + "kl_loss": 1.5413665771484375, + "loss_ib": 0.02691422961652279, + "step": 518 + }, + { + "ce_ib": 9.108760833740234, + "ce_orig": 0.8471581339836121, + "epoch": 0.14896829391041772, + "kl_loss": 1.4837156534194946, + "loss_ib": 0.023945918306708336, + "step": 518 + }, + { + "ce_ib": 11.258501052856445, + "ce_orig": 0.804084300994873, + "epoch": 0.14896829391041772, + "kl_loss": 1.5666208267211914, + "loss_ib": 0.026924708858132362, + "step": 518 + }, + { + "ce_ib": 13.515926361083984, + "ce_orig": 0.8296307325363159, + "epoch": 0.14896829391041772, + "kl_loss": 1.5592403411865234, + "loss_ib": 0.029108328744769096, + "step": 518 + }, + { + "ce_ib": 13.376294136047363, + "ce_orig": 1.156392216682434, + "epoch": 0.14925587748939537, + "kl_loss": 1.6085054874420166, + "loss_ib": 0.029461350291967392, + "step": 519 + }, + { + "ce_ib": 9.568916320800781, + "ce_orig": 0.4749041795730591, + "epoch": 0.14925587748939537, + "kl_loss": 1.536426305770874, + "loss_ib": 0.02493317984044552, + "step": 519 + }, + { + "ce_ib": 16.30808448791504, + "ce_orig": 1.5859891176223755, + "epoch": 0.14925587748939537, + "kl_loss": 1.6146225929260254, + "loss_ib": 0.03245430812239647, + "step": 519 + }, + { + "ce_ib": 7.4144062995910645, + "ce_orig": 0.6065052151679993, + "epoch": 0.14925587748939537, + "kl_loss": 1.601442575454712, + "loss_ib": 0.02342883124947548, + "step": 519 + }, + { + "epoch": 0.149543461068373, + "grad_norm": 0.1481025665998459, + "learning_rate": 9.990366164928538e-06, + "loss": 0.8984, + "step": 520 + }, + { + "ce_ib": 15.21028995513916, + "ce_orig": 1.826915979385376, + "epoch": 0.149543461068373, + "kl_loss": 1.557509422302246, + "loss_ib": 0.03078538365662098, + "step": 520 + }, + { + "ce_ib": 10.92094612121582, + "ce_orig": 0.6996050477027893, + "epoch": 0.149543461068373, + "kl_loss": 1.522399663925171, + "loss_ib": 0.02614494226872921, + "step": 520 + }, + { + "ce_ib": 11.630789756774902, + "ce_orig": 0.5110880136489868, + "epoch": 0.149543461068373, + "kl_loss": 1.6356468200683594, + "loss_ib": 0.02798725850880146, + "step": 520 + }, + { + "ce_ib": 13.426923751831055, + "ce_orig": 0.7888133525848389, + "epoch": 0.149543461068373, + "kl_loss": 1.5482978820800781, + "loss_ib": 0.028909901157021523, + "step": 520 + }, + { + "ce_ib": 9.767127990722656, + "ce_orig": 0.9814274907112122, + "epoch": 0.14983104464735064, + "kl_loss": 1.706017255783081, + "loss_ib": 0.02682730183005333, + "step": 521 + }, + { + "ce_ib": 8.689159393310547, + "ce_orig": 0.5986825823783875, + "epoch": 0.14983104464735064, + "kl_loss": 1.5086462497711182, + "loss_ib": 0.023775622248649597, + "step": 521 + }, + { + "ce_ib": 7.830185890197754, + "ce_orig": 0.6796808242797852, + "epoch": 0.14983104464735064, + "kl_loss": 1.5338833332061768, + "loss_ib": 0.02316901832818985, + "step": 521 + }, + { + "ce_ib": 8.914192199707031, + "ce_orig": 0.5978474617004395, + "epoch": 0.14983104464735064, + "kl_loss": 1.5090875625610352, + "loss_ib": 0.024005066603422165, + "step": 521 + }, + { + "ce_ib": 11.189813613891602, + "ce_orig": 0.7912343144416809, + "epoch": 0.15011862822632827, + "kl_loss": 1.486729383468628, + "loss_ib": 0.026057107374072075, + "step": 522 + }, + { + "ce_ib": 12.931374549865723, + "ce_orig": 1.4291115999221802, + "epoch": 0.15011862822632827, + "kl_loss": 1.575240135192871, + "loss_ib": 0.028683776035904884, + "step": 522 + }, + { + "ce_ib": 6.15507173538208, + "ce_orig": 0.567773163318634, + "epoch": 0.15011862822632827, + "kl_loss": 1.5185538530349731, + "loss_ib": 0.021340610459446907, + "step": 522 + }, + { + "ce_ib": 13.948975563049316, + "ce_orig": 0.893979012966156, + "epoch": 0.15011862822632827, + "kl_loss": 1.5725568532943726, + "loss_ib": 0.02967454306781292, + "step": 522 + }, + { + "ce_ib": 14.437583923339844, + "ce_orig": 1.6611443758010864, + "epoch": 0.15040621180530592, + "kl_loss": 1.545514702796936, + "loss_ib": 0.029892729595303535, + "step": 523 + }, + { + "ce_ib": 12.710461616516113, + "ce_orig": 0.8755899667739868, + "epoch": 0.15040621180530592, + "kl_loss": 1.5674240589141846, + "loss_ib": 0.028384702280163765, + "step": 523 + }, + { + "ce_ib": 8.738030433654785, + "ce_orig": 0.8012534976005554, + "epoch": 0.15040621180530592, + "kl_loss": 1.5125902891159058, + "loss_ib": 0.023863932117819786, + "step": 523 + }, + { + "ce_ib": 11.891736030578613, + "ce_orig": 1.0337281227111816, + "epoch": 0.15040621180530592, + "kl_loss": 1.5082712173461914, + "loss_ib": 0.026974448934197426, + "step": 523 + }, + { + "ce_ib": 11.432838439941406, + "ce_orig": 0.8564993739128113, + "epoch": 0.15069379538428357, + "kl_loss": 1.4956270456314087, + "loss_ib": 0.02638910710811615, + "step": 524 + }, + { + "ce_ib": 6.101011276245117, + "ce_orig": 0.4317745864391327, + "epoch": 0.15069379538428357, + "kl_loss": 1.626281499862671, + "loss_ib": 0.022363826632499695, + "step": 524 + }, + { + "ce_ib": 11.570913314819336, + "ce_orig": 0.5999628901481628, + "epoch": 0.15069379538428357, + "kl_loss": 1.573062539100647, + "loss_ib": 0.027301540598273277, + "step": 524 + }, + { + "ce_ib": 10.142786979675293, + "ce_orig": 0.8973500728607178, + "epoch": 0.15069379538428357, + "kl_loss": 1.529207468032837, + "loss_ib": 0.025434860959649086, + "step": 524 + }, + { + "epoch": 0.1509813789632612, + "grad_norm": 0.09314551949501038, + "learning_rate": 9.989878616637401e-06, + "loss": 0.9524, + "step": 525 + }, + { + "ce_ib": 17.579530715942383, + "ce_orig": 1.8244338035583496, + "epoch": 0.1509813789632612, + "kl_loss": 1.5392405986785889, + "loss_ib": 0.032971933484077454, + "step": 525 + }, + { + "ce_ib": 11.251588821411133, + "ce_orig": 0.863845705986023, + "epoch": 0.1509813789632612, + "kl_loss": 1.5278338193893433, + "loss_ib": 0.02652992680668831, + "step": 525 + }, + { + "ce_ib": 8.327178955078125, + "ce_orig": 0.787936806678772, + "epoch": 0.1509813789632612, + "kl_loss": 1.545323371887207, + "loss_ib": 0.02378041297197342, + "step": 525 + }, + { + "ce_ib": 10.291125297546387, + "ce_orig": 0.7874522805213928, + "epoch": 0.1509813789632612, + "kl_loss": 1.4875431060791016, + "loss_ib": 0.025166556239128113, + "step": 525 + }, + { + "ce_ib": 12.645198822021484, + "ce_orig": 0.6515507698059082, + "epoch": 0.15126896254223884, + "kl_loss": 1.5123183727264404, + "loss_ib": 0.02776838280260563, + "step": 526 + }, + { + "ce_ib": 10.556811332702637, + "ce_orig": 1.057904601097107, + "epoch": 0.15126896254223884, + "kl_loss": 1.5609617233276367, + "loss_ib": 0.026166429743170738, + "step": 526 + }, + { + "ce_ib": 10.272709846496582, + "ce_orig": 0.8701647520065308, + "epoch": 0.15126896254223884, + "kl_loss": 1.559139370918274, + "loss_ib": 0.02586410380899906, + "step": 526 + }, + { + "ce_ib": 9.878427505493164, + "ce_orig": 0.654448926448822, + "epoch": 0.15126896254223884, + "kl_loss": 1.5793863534927368, + "loss_ib": 0.025672290474176407, + "step": 526 + }, + { + "ce_ib": 10.595402717590332, + "ce_orig": 0.7197730541229248, + "epoch": 0.15155654612121647, + "kl_loss": 1.4838206768035889, + "loss_ib": 0.025433609262108803, + "step": 527 + }, + { + "ce_ib": 8.086220741271973, + "ce_orig": 0.7310401797294617, + "epoch": 0.15155654612121647, + "kl_loss": 1.4655554294586182, + "loss_ib": 0.022741774097085, + "step": 527 + }, + { + "ce_ib": 7.985743522644043, + "ce_orig": 0.873805582523346, + "epoch": 0.15155654612121647, + "kl_loss": 1.5185062885284424, + "loss_ib": 0.02317080646753311, + "step": 527 + }, + { + "ce_ib": 9.111749649047852, + "ce_orig": 0.605055034160614, + "epoch": 0.15155654612121647, + "kl_loss": 1.5961281061172485, + "loss_ib": 0.025073029100894928, + "step": 527 + }, + { + "ce_ib": 13.70055103302002, + "ce_orig": 1.3269081115722656, + "epoch": 0.15184412970019412, + "kl_loss": 1.5185916423797607, + "loss_ib": 0.0288864653557539, + "step": 528 + }, + { + "ce_ib": 12.714378356933594, + "ce_orig": 0.890455424785614, + "epoch": 0.15184412970019412, + "kl_loss": 1.547227382659912, + "loss_ib": 0.028186652809381485, + "step": 528 + }, + { + "ce_ib": 13.768203735351562, + "ce_orig": 0.5043600797653198, + "epoch": 0.15184412970019412, + "kl_loss": 1.6048271656036377, + "loss_ib": 0.02981647476553917, + "step": 528 + }, + { + "ce_ib": 9.12528133392334, + "ce_orig": 0.8668175339698792, + "epoch": 0.15184412970019412, + "kl_loss": 1.5656447410583496, + "loss_ib": 0.02478172816336155, + "step": 528 + }, + { + "ce_ib": 7.842939376831055, + "ce_orig": 0.6462977528572083, + "epoch": 0.15213171327917177, + "kl_loss": 1.4568753242492676, + "loss_ib": 0.022411691024899483, + "step": 529 + }, + { + "ce_ib": 8.972084999084473, + "ce_orig": 0.5574026703834534, + "epoch": 0.15213171327917177, + "kl_loss": 1.5518778562545776, + "loss_ib": 0.02449086308479309, + "step": 529 + }, + { + "ce_ib": 13.132000923156738, + "ce_orig": 1.5121755599975586, + "epoch": 0.15213171327917177, + "kl_loss": 1.761476993560791, + "loss_ib": 0.030746769160032272, + "step": 529 + }, + { + "ce_ib": 10.317779541015625, + "ce_orig": 0.7324342131614685, + "epoch": 0.15213171327917177, + "kl_loss": 1.5073950290679932, + "loss_ib": 0.025391731411218643, + "step": 529 + }, + { + "epoch": 0.1524192968581494, + "grad_norm": 0.09296334534883499, + "learning_rate": 9.989379046570502e-06, + "loss": 0.9041, + "step": 530 + }, + { + "ce_ib": 14.723713874816895, + "ce_orig": 1.1431795358657837, + "epoch": 0.1524192968581494, + "kl_loss": 1.4883091449737549, + "loss_ib": 0.029606804251670837, + "step": 530 + }, + { + "ce_ib": 9.255940437316895, + "ce_orig": 1.317234992980957, + "epoch": 0.1524192968581494, + "kl_loss": 1.508230209350586, + "loss_ib": 0.024338241666555405, + "step": 530 + }, + { + "ce_ib": 10.745719909667969, + "ce_orig": 0.7085793614387512, + "epoch": 0.1524192968581494, + "kl_loss": 1.5349406003952026, + "loss_ib": 0.026095125824213028, + "step": 530 + }, + { + "ce_ib": 11.930793762207031, + "ce_orig": 0.7779906392097473, + "epoch": 0.1524192968581494, + "kl_loss": 1.449808955192566, + "loss_ib": 0.026428882032632828, + "step": 530 + }, + { + "ce_ib": 9.066697120666504, + "ce_orig": 0.8550069332122803, + "epoch": 0.15270688043712705, + "kl_loss": 1.448561191558838, + "loss_ib": 0.02355230785906315, + "step": 531 + }, + { + "ce_ib": 14.419188499450684, + "ce_orig": 0.8470748066902161, + "epoch": 0.15270688043712705, + "kl_loss": 1.476117730140686, + "loss_ib": 0.029180364683270454, + "step": 531 + }, + { + "ce_ib": 4.3977861404418945, + "ce_orig": 0.16003404557704926, + "epoch": 0.15270688043712705, + "kl_loss": 1.4080500602722168, + "loss_ib": 0.018478285521268845, + "step": 531 + }, + { + "ce_ib": 8.737853050231934, + "ce_orig": 0.8578985929489136, + "epoch": 0.15270688043712705, + "kl_loss": 1.5061912536621094, + "loss_ib": 0.02379976399242878, + "step": 531 + }, + { + "ce_ib": 13.226619720458984, + "ce_orig": 1.234938621520996, + "epoch": 0.15299446401610467, + "kl_loss": 1.5700280666351318, + "loss_ib": 0.028926901519298553, + "step": 532 + }, + { + "ce_ib": 13.38469123840332, + "ce_orig": 1.4244154691696167, + "epoch": 0.15299446401610467, + "kl_loss": 1.483577013015747, + "loss_ib": 0.028220461681485176, + "step": 532 + }, + { + "ce_ib": 10.670931816101074, + "ce_orig": 1.0447449684143066, + "epoch": 0.15299446401610467, + "kl_loss": 1.472721815109253, + "loss_ib": 0.025398148223757744, + "step": 532 + }, + { + "ce_ib": 9.630074501037598, + "ce_orig": 0.4802638292312622, + "epoch": 0.15299446401610467, + "kl_loss": 1.4675498008728027, + "loss_ib": 0.024305572733283043, + "step": 532 + }, + { + "ce_ib": 9.232088088989258, + "ce_orig": 0.6876621246337891, + "epoch": 0.15328204759508232, + "kl_loss": 1.5124475955963135, + "loss_ib": 0.02435656450688839, + "step": 533 + }, + { + "ce_ib": 9.646381378173828, + "ce_orig": 0.6899409890174866, + "epoch": 0.15328204759508232, + "kl_loss": 1.4822652339935303, + "loss_ib": 0.024469034746289253, + "step": 533 + }, + { + "ce_ib": 12.952717781066895, + "ce_orig": 1.2678933143615723, + "epoch": 0.15328204759508232, + "kl_loss": 1.5268654823303223, + "loss_ib": 0.02822137251496315, + "step": 533 + }, + { + "ce_ib": 12.745079040527344, + "ce_orig": 0.4462144672870636, + "epoch": 0.15328204759508232, + "kl_loss": 1.4826654195785522, + "loss_ib": 0.02757173217833042, + "step": 533 + }, + { + "ce_ib": 11.242120742797852, + "ce_orig": 0.727728009223938, + "epoch": 0.15356963117405997, + "kl_loss": 1.4587228298187256, + "loss_ib": 0.02582934871315956, + "step": 534 + }, + { + "ce_ib": 13.557268142700195, + "ce_orig": 1.029449701309204, + "epoch": 0.15356963117405997, + "kl_loss": 1.5601624250411987, + "loss_ib": 0.02915889210999012, + "step": 534 + }, + { + "ce_ib": 12.117486000061035, + "ce_orig": 1.2025965452194214, + "epoch": 0.15356963117405997, + "kl_loss": 1.4851754903793335, + "loss_ib": 0.026969240978360176, + "step": 534 + }, + { + "ce_ib": 11.584373474121094, + "ce_orig": 1.0777106285095215, + "epoch": 0.15356963117405997, + "kl_loss": 1.4760735034942627, + "loss_ib": 0.026345109567046165, + "step": 534 + }, + { + "epoch": 0.1538572147530376, + "grad_norm": 0.08552956581115723, + "learning_rate": 9.988867455931422e-06, + "loss": 0.9482, + "step": 535 + }, + { + "ce_ib": 15.782403945922852, + "ce_orig": 1.24473237991333, + "epoch": 0.1538572147530376, + "kl_loss": 1.498823881149292, + "loss_ib": 0.030770642682909966, + "step": 535 + }, + { + "ce_ib": 11.243325233459473, + "ce_orig": 0.5300117135047913, + "epoch": 0.1538572147530376, + "kl_loss": 1.634958028793335, + "loss_ib": 0.02759290672838688, + "step": 535 + }, + { + "ce_ib": 10.156462669372559, + "ce_orig": 0.8086475133895874, + "epoch": 0.1538572147530376, + "kl_loss": 1.5278410911560059, + "loss_ib": 0.02543487399816513, + "step": 535 + }, + { + "ce_ib": 8.719620704650879, + "ce_orig": 0.7736819386482239, + "epoch": 0.1538572147530376, + "kl_loss": 1.4572741985321045, + "loss_ib": 0.023292362689971924, + "step": 535 + }, + { + "ce_ib": 11.448285102844238, + "ce_orig": 1.3073540925979614, + "epoch": 0.15414479833201525, + "kl_loss": 1.468369722366333, + "loss_ib": 0.02613198198378086, + "step": 536 + }, + { + "ce_ib": 10.234390258789062, + "ce_orig": 0.6196459531784058, + "epoch": 0.15414479833201525, + "kl_loss": 1.4478919506072998, + "loss_ib": 0.024713311344385147, + "step": 536 + }, + { + "ce_ib": 8.465949058532715, + "ce_orig": 0.5101594924926758, + "epoch": 0.15414479833201525, + "kl_loss": 1.4375674724578857, + "loss_ib": 0.022841624915599823, + "step": 536 + }, + { + "ce_ib": 14.232527732849121, + "ce_orig": 0.9877519011497498, + "epoch": 0.15414479833201525, + "kl_loss": 1.5590825080871582, + "loss_ib": 0.02982335351407528, + "step": 536 + }, + { + "ce_ib": 13.802165031433105, + "ce_orig": 1.1603584289550781, + "epoch": 0.15443238191099287, + "kl_loss": 1.4649747610092163, + "loss_ib": 0.028451912105083466, + "step": 537 + }, + { + "ce_ib": 9.437994956970215, + "ce_orig": 1.0976390838623047, + "epoch": 0.15443238191099287, + "kl_loss": 1.453848123550415, + "loss_ib": 0.02397647500038147, + "step": 537 + }, + { + "ce_ib": 14.96358585357666, + "ce_orig": 1.2715431451797485, + "epoch": 0.15443238191099287, + "kl_loss": 1.4742491245269775, + "loss_ib": 0.0297060776501894, + "step": 537 + }, + { + "ce_ib": 13.275339126586914, + "ce_orig": 1.3510757684707642, + "epoch": 0.15443238191099287, + "kl_loss": 1.4458937644958496, + "loss_ib": 0.02773427590727806, + "step": 537 + }, + { + "ce_ib": 12.93941879272461, + "ce_orig": 1.3727543354034424, + "epoch": 0.15471996548997052, + "kl_loss": 1.466170310974121, + "loss_ib": 0.027601122856140137, + "step": 538 + }, + { + "ce_ib": 19.104490280151367, + "ce_orig": 2.0155956745147705, + "epoch": 0.15471996548997052, + "kl_loss": 1.4700965881347656, + "loss_ib": 0.03380545601248741, + "step": 538 + }, + { + "ce_ib": 9.300647735595703, + "ce_orig": 1.1129015684127808, + "epoch": 0.15471996548997052, + "kl_loss": 1.4137213230133057, + "loss_ib": 0.02343786135315895, + "step": 538 + }, + { + "ce_ib": 9.72518253326416, + "ce_orig": 1.0089741945266724, + "epoch": 0.15471996548997052, + "kl_loss": 1.412247657775879, + "loss_ib": 0.023847658187150955, + "step": 538 + }, + { + "ce_ib": 11.109746932983398, + "ce_orig": 0.6238597631454468, + "epoch": 0.15500754906894817, + "kl_loss": 1.424318552017212, + "loss_ib": 0.025352930650115013, + "step": 539 + }, + { + "ce_ib": 12.226134300231934, + "ce_orig": 0.7648814916610718, + "epoch": 0.15500754906894817, + "kl_loss": 1.5073215961456299, + "loss_ib": 0.02729935199022293, + "step": 539 + }, + { + "ce_ib": 10.58513069152832, + "ce_orig": 0.5339838862419128, + "epoch": 0.15500754906894817, + "kl_loss": 1.5678870677947998, + "loss_ib": 0.026263998821377754, + "step": 539 + }, + { + "ce_ib": 12.42918586730957, + "ce_orig": 0.8719852566719055, + "epoch": 0.15500754906894817, + "kl_loss": 1.4344046115875244, + "loss_ib": 0.026773232966661453, + "step": 539 + }, + { + "epoch": 0.1552951326479258, + "grad_norm": 0.08646312355995178, + "learning_rate": 9.988343845952697e-06, + "loss": 0.9388, + "step": 540 + }, + { + "ce_ib": 14.183638572692871, + "ce_orig": 0.9510587453842163, + "epoch": 0.1552951326479258, + "kl_loss": 1.4567286968231201, + "loss_ib": 0.02875092439353466, + "step": 540 + }, + { + "ce_ib": 14.285439491271973, + "ce_orig": 1.5962088108062744, + "epoch": 0.1552951326479258, + "kl_loss": 1.4375842809677124, + "loss_ib": 0.028661280870437622, + "step": 540 + }, + { + "ce_ib": 8.6818265914917, + "ce_orig": 0.9919387698173523, + "epoch": 0.1552951326479258, + "kl_loss": 1.4143463373184204, + "loss_ib": 0.02282528765499592, + "step": 540 + }, + { + "ce_ib": 11.165204048156738, + "ce_orig": 0.8994119763374329, + "epoch": 0.1552951326479258, + "kl_loss": 1.5460944175720215, + "loss_ib": 0.02662614732980728, + "step": 540 + }, + { + "ce_ib": 11.784940719604492, + "ce_orig": 0.982570469379425, + "epoch": 0.15558271622690345, + "kl_loss": 1.4393483400344849, + "loss_ib": 0.026178423315286636, + "step": 541 + }, + { + "ce_ib": 11.36942195892334, + "ce_orig": 0.8527225255966187, + "epoch": 0.15558271622690345, + "kl_loss": 1.5223984718322754, + "loss_ib": 0.02659340761601925, + "step": 541 + }, + { + "ce_ib": 12.205092430114746, + "ce_orig": 0.6624218225479126, + "epoch": 0.15558271622690345, + "kl_loss": 1.471164584159851, + "loss_ib": 0.026916736736893654, + "step": 541 + }, + { + "ce_ib": 9.685622215270996, + "ce_orig": 0.652384877204895, + "epoch": 0.15558271622690345, + "kl_loss": 1.4571822881698608, + "loss_ib": 0.024257445707917213, + "step": 541 + }, + { + "ce_ib": 8.85888385772705, + "ce_orig": 0.6606395244598389, + "epoch": 0.15587029980588107, + "kl_loss": 1.4010136127471924, + "loss_ib": 0.022869018837809563, + "step": 542 + }, + { + "ce_ib": 13.208836555480957, + "ce_orig": 1.2833889722824097, + "epoch": 0.15587029980588107, + "kl_loss": 1.411513328552246, + "loss_ib": 0.027323970571160316, + "step": 542 + }, + { + "ce_ib": 10.047677040100098, + "ce_orig": 0.7564672827720642, + "epoch": 0.15587029980588107, + "kl_loss": 1.4649560451507568, + "loss_ib": 0.024697236716747284, + "step": 542 + }, + { + "ce_ib": 8.681495666503906, + "ce_orig": 0.7242369651794434, + "epoch": 0.15587029980588107, + "kl_loss": 1.3844184875488281, + "loss_ib": 0.02252567932009697, + "step": 542 + }, + { + "ce_ib": 9.65349292755127, + "ce_orig": 0.8696082830429077, + "epoch": 0.15615788338485873, + "kl_loss": 1.4093207120895386, + "loss_ib": 0.02374669909477234, + "step": 543 + }, + { + "ce_ib": 13.341421127319336, + "ce_orig": 1.152627944946289, + "epoch": 0.15615788338485873, + "kl_loss": 1.4343910217285156, + "loss_ib": 0.02768533118069172, + "step": 543 + }, + { + "ce_ib": 10.65963363647461, + "ce_orig": 0.5063934326171875, + "epoch": 0.15615788338485873, + "kl_loss": 1.4016281366348267, + "loss_ib": 0.024675915017724037, + "step": 543 + }, + { + "ce_ib": 11.079456329345703, + "ce_orig": 1.1200248003005981, + "epoch": 0.15615788338485873, + "kl_loss": 1.400475263595581, + "loss_ib": 0.025084208697080612, + "step": 543 + }, + { + "ce_ib": 10.452417373657227, + "ce_orig": 0.4226267635822296, + "epoch": 0.15644546696383638, + "kl_loss": 1.4564342498779297, + "loss_ib": 0.02501676045358181, + "step": 544 + }, + { + "ce_ib": 9.227188110351562, + "ce_orig": 0.3644406199455261, + "epoch": 0.15644546696383638, + "kl_loss": 1.4034581184387207, + "loss_ib": 0.023261768743395805, + "step": 544 + }, + { + "ce_ib": 12.85843276977539, + "ce_orig": 1.260372519493103, + "epoch": 0.15644546696383638, + "kl_loss": 1.4097330570220947, + "loss_ib": 0.02695576101541519, + "step": 544 + }, + { + "ce_ib": 15.890623092651367, + "ce_orig": 1.9688998460769653, + "epoch": 0.15644546696383638, + "kl_loss": 1.469724178314209, + "loss_ib": 0.03058786317706108, + "step": 544 + }, + { + "epoch": 0.156733050542814, + "grad_norm": 0.09022902697324753, + "learning_rate": 9.987808217895829e-06, + "loss": 0.9285, + "step": 545 + }, + { + "ce_ib": 8.582206726074219, + "ce_orig": 0.5587666630744934, + "epoch": 0.156733050542814, + "kl_loss": 1.3718297481536865, + "loss_ib": 0.02230050601065159, + "step": 545 + }, + { + "ce_ib": 15.280888557434082, + "ce_orig": 1.1607708930969238, + "epoch": 0.156733050542814, + "kl_loss": 1.4650864601135254, + "loss_ib": 0.029931753873825073, + "step": 545 + }, + { + "ce_ib": 13.70768928527832, + "ce_orig": 1.192724585533142, + "epoch": 0.156733050542814, + "kl_loss": 1.49911367893219, + "loss_ib": 0.02869882434606552, + "step": 545 + }, + { + "ce_ib": 12.85840129852295, + "ce_orig": 1.2321618795394897, + "epoch": 0.156733050542814, + "kl_loss": 1.4133646488189697, + "loss_ib": 0.02699204906821251, + "step": 545 + }, + { + "ce_ib": 10.931328773498535, + "ce_orig": 1.0996520519256592, + "epoch": 0.15702063412179165, + "kl_loss": 1.3952105045318604, + "loss_ib": 0.024883432313799858, + "step": 546 + }, + { + "ce_ib": 11.635273933410645, + "ce_orig": 0.7298911809921265, + "epoch": 0.15702063412179165, + "kl_loss": 1.4759316444396973, + "loss_ib": 0.026394590735435486, + "step": 546 + }, + { + "ce_ib": 9.857783317565918, + "ce_orig": 0.6076138019561768, + "epoch": 0.15702063412179165, + "kl_loss": 1.5050930976867676, + "loss_ib": 0.02490871399641037, + "step": 546 + }, + { + "ce_ib": 11.730413436889648, + "ce_orig": 0.9210866093635559, + "epoch": 0.15702063412179165, + "kl_loss": 1.3834307193756104, + "loss_ib": 0.02556472085416317, + "step": 546 + }, + { + "ce_ib": 9.36272144317627, + "ce_orig": 0.7209946513175964, + "epoch": 0.15730821770076928, + "kl_loss": 1.3859405517578125, + "loss_ib": 0.023222126066684723, + "step": 547 + }, + { + "ce_ib": 10.93961238861084, + "ce_orig": 1.1104698181152344, + "epoch": 0.15730821770076928, + "kl_loss": 1.3689725399017334, + "loss_ib": 0.024629337713122368, + "step": 547 + }, + { + "ce_ib": 10.149394989013672, + "ce_orig": 0.9216436147689819, + "epoch": 0.15730821770076928, + "kl_loss": 1.3667898178100586, + "loss_ib": 0.023817293345928192, + "step": 547 + }, + { + "ce_ib": 9.941133499145508, + "ce_orig": 0.8583170771598816, + "epoch": 0.15730821770076928, + "kl_loss": 1.4201359748840332, + "loss_ib": 0.024142494425177574, + "step": 547 + }, + { + "ce_ib": 13.535919189453125, + "ce_orig": 0.8294936418533325, + "epoch": 0.15759580127974693, + "kl_loss": 1.4394149780273438, + "loss_ib": 0.027930067852139473, + "step": 548 + }, + { + "ce_ib": 6.7044782638549805, + "ce_orig": 0.655543863773346, + "epoch": 0.15759580127974693, + "kl_loss": 1.398592233657837, + "loss_ib": 0.020690400153398514, + "step": 548 + }, + { + "ce_ib": 12.02395248413086, + "ce_orig": 0.5793411731719971, + "epoch": 0.15759580127974693, + "kl_loss": 1.454443335533142, + "loss_ib": 0.02656838670372963, + "step": 548 + }, + { + "ce_ib": 9.43730354309082, + "ce_orig": 0.6028481125831604, + "epoch": 0.15759580127974693, + "kl_loss": 1.3634798526763916, + "loss_ib": 0.023072101175785065, + "step": 548 + }, + { + "ce_ib": 9.443431854248047, + "ce_orig": 0.8150414228439331, + "epoch": 0.15788338485872458, + "kl_loss": 1.3247261047363281, + "loss_ib": 0.02269069105386734, + "step": 549 + }, + { + "ce_ib": 12.465729713439941, + "ce_orig": 0.912677526473999, + "epoch": 0.15788338485872458, + "kl_loss": 1.4468390941619873, + "loss_ib": 0.026934120804071426, + "step": 549 + }, + { + "ce_ib": 11.708540916442871, + "ce_orig": 1.2497539520263672, + "epoch": 0.15788338485872458, + "kl_loss": 1.3749089241027832, + "loss_ib": 0.025457629933953285, + "step": 549 + }, + { + "ce_ib": 5.031269073486328, + "ce_orig": 0.17525199055671692, + "epoch": 0.15788338485872458, + "kl_loss": 1.4064466953277588, + "loss_ib": 0.019095735624432564, + "step": 549 + }, + { + "epoch": 0.1581709684377022, + "grad_norm": 0.09811785817146301, + "learning_rate": 9.987260573051268e-06, + "loss": 0.8876, + "step": 550 + }, + { + "ce_ib": 11.644174575805664, + "ce_orig": 1.3015292882919312, + "epoch": 0.1581709684377022, + "kl_loss": 1.3472862243652344, + "loss_ib": 0.02511703595519066, + "step": 550 + }, + { + "ce_ib": 10.139188766479492, + "ce_orig": 1.1227424144744873, + "epoch": 0.1581709684377022, + "kl_loss": 1.3244848251342773, + "loss_ib": 0.023384036496281624, + "step": 550 + }, + { + "ce_ib": 5.7533979415893555, + "ce_orig": 0.4904988706111908, + "epoch": 0.1581709684377022, + "kl_loss": 1.3460612297058105, + "loss_ib": 0.01921400986611843, + "step": 550 + }, + { + "ce_ib": 14.475028991699219, + "ce_orig": 1.008355736732483, + "epoch": 0.1581709684377022, + "kl_loss": 1.3830070495605469, + "loss_ib": 0.02830510027706623, + "step": 550 + }, + { + "ce_ib": 9.800948143005371, + "ce_orig": 0.5951581001281738, + "epoch": 0.15845855201667985, + "kl_loss": 1.4228395223617554, + "loss_ib": 0.024029342457652092, + "step": 551 + }, + { + "ce_ib": 12.266356468200684, + "ce_orig": 1.480778455734253, + "epoch": 0.15845855201667985, + "kl_loss": 1.3874316215515137, + "loss_ib": 0.026140673086047173, + "step": 551 + }, + { + "ce_ib": 11.24101734161377, + "ce_orig": 0.6378637552261353, + "epoch": 0.15845855201667985, + "kl_loss": 1.3844799995422363, + "loss_ib": 0.0250858161598444, + "step": 551 + }, + { + "ce_ib": 12.70676040649414, + "ce_orig": 1.2595171928405762, + "epoch": 0.15845855201667985, + "kl_loss": 1.3542779684066772, + "loss_ib": 0.026249539107084274, + "step": 551 + }, + { + "ce_ib": 13.323479652404785, + "ce_orig": 1.104166030883789, + "epoch": 0.15874613559565748, + "kl_loss": 1.3231797218322754, + "loss_ib": 0.026555275544524193, + "step": 552 + }, + { + "ce_ib": 8.531795501708984, + "ce_orig": 0.4913962483406067, + "epoch": 0.15874613559565748, + "kl_loss": 1.3631434440612793, + "loss_ib": 0.022163229063153267, + "step": 552 + }, + { + "ce_ib": 12.574892044067383, + "ce_orig": 0.9339185953140259, + "epoch": 0.15874613559565748, + "kl_loss": 1.434931993484497, + "loss_ib": 0.026924211531877518, + "step": 552 + }, + { + "ce_ib": 10.622230529785156, + "ce_orig": 0.9095126390457153, + "epoch": 0.15874613559565748, + "kl_loss": 1.363985538482666, + "loss_ib": 0.0242620836943388, + "step": 552 + }, + { + "ce_ib": 10.206563949584961, + "ce_orig": 0.5735985040664673, + "epoch": 0.15903371917463513, + "kl_loss": 1.4209774732589722, + "loss_ib": 0.02441633865237236, + "step": 553 + }, + { + "ce_ib": 15.614920616149902, + "ce_orig": 1.3737772703170776, + "epoch": 0.15903371917463513, + "kl_loss": 1.4201138019561768, + "loss_ib": 0.02981605939567089, + "step": 553 + }, + { + "ce_ib": 12.950101852416992, + "ce_orig": 0.9557084441184998, + "epoch": 0.15903371917463513, + "kl_loss": 1.4211621284484863, + "loss_ib": 0.027161721140146255, + "step": 553 + }, + { + "ce_ib": 10.123566627502441, + "ce_orig": 0.7820416688919067, + "epoch": 0.15903371917463513, + "kl_loss": 1.3192014694213867, + "loss_ib": 0.023315582424402237, + "step": 553 + }, + { + "ce_ib": 5.311279296875, + "ce_orig": 0.5123794078826904, + "epoch": 0.15932130275361278, + "kl_loss": 1.2463486194610596, + "loss_ib": 0.017774764448404312, + "step": 554 + }, + { + "ce_ib": 10.679170608520508, + "ce_orig": 0.7276657223701477, + "epoch": 0.15932130275361278, + "kl_loss": 1.3026518821716309, + "loss_ib": 0.02370568923652172, + "step": 554 + }, + { + "ce_ib": 13.4666109085083, + "ce_orig": 1.2032169103622437, + "epoch": 0.15932130275361278, + "kl_loss": 1.417797327041626, + "loss_ib": 0.027644583955407143, + "step": 554 + }, + { + "ce_ib": 12.01272201538086, + "ce_orig": 0.9139970541000366, + "epoch": 0.15932130275361278, + "kl_loss": 1.4650115966796875, + "loss_ib": 0.026662837713956833, + "step": 554 + }, + { + "epoch": 0.1596088863325904, + "grad_norm": 0.09588459134101868, + "learning_rate": 9.98670091273842e-06, + "loss": 0.9863, + "step": 555 + }, + { + "ce_ib": 12.853775978088379, + "ce_orig": 0.8478192090988159, + "epoch": 0.1596088863325904, + "kl_loss": 1.337024211883545, + "loss_ib": 0.02622401714324951, + "step": 555 + }, + { + "ce_ib": 9.791227340698242, + "ce_orig": 0.7623945474624634, + "epoch": 0.1596088863325904, + "kl_loss": 1.3745824098587036, + "loss_ib": 0.023537050932645798, + "step": 555 + }, + { + "ce_ib": 11.515276908874512, + "ce_orig": 0.6505551338195801, + "epoch": 0.1596088863325904, + "kl_loss": 1.350890040397644, + "loss_ib": 0.025024177506566048, + "step": 555 + }, + { + "ce_ib": 13.35179328918457, + "ce_orig": 1.0168282985687256, + "epoch": 0.1596088863325904, + "kl_loss": 1.336082100868225, + "loss_ib": 0.026712613180279732, + "step": 555 + }, + { + "ce_ib": 17.056640625, + "ce_orig": 1.6616370677947998, + "epoch": 0.15989646991156806, + "kl_loss": 1.3520833253860474, + "loss_ib": 0.03057747334241867, + "step": 556 + }, + { + "ce_ib": 6.637577056884766, + "ce_orig": 0.5979457497596741, + "epoch": 0.15989646991156806, + "kl_loss": 1.32643723487854, + "loss_ib": 0.019901949912309647, + "step": 556 + }, + { + "ce_ib": 11.517195701599121, + "ce_orig": 1.0731699466705322, + "epoch": 0.15989646991156806, + "kl_loss": 1.3701467514038086, + "loss_ib": 0.0252186618745327, + "step": 556 + }, + { + "ce_ib": 7.839071750640869, + "ce_orig": 0.57491534948349, + "epoch": 0.15989646991156806, + "kl_loss": 1.281550645828247, + "loss_ib": 0.020654577761888504, + "step": 556 + }, + { + "ce_ib": 5.376894950866699, + "ce_orig": 0.27646228671073914, + "epoch": 0.16018405349054568, + "kl_loss": 1.4170054197311401, + "loss_ib": 0.019546950235962868, + "step": 557 + }, + { + "ce_ib": 7.960681915283203, + "ce_orig": 0.8380683064460754, + "epoch": 0.16018405349054568, + "kl_loss": 1.3399286270141602, + "loss_ib": 0.021359967067837715, + "step": 557 + }, + { + "ce_ib": 12.966280937194824, + "ce_orig": 1.0689243078231812, + "epoch": 0.16018405349054568, + "kl_loss": 1.425252914428711, + "loss_ib": 0.027218809351325035, + "step": 557 + }, + { + "ce_ib": 12.531590461730957, + "ce_orig": 1.2268368005752563, + "epoch": 0.16018405349054568, + "kl_loss": 1.3414859771728516, + "loss_ib": 0.025946449488401413, + "step": 557 + }, + { + "ce_ib": 8.23051929473877, + "ce_orig": 0.6497761607170105, + "epoch": 0.16047163706952333, + "kl_loss": 1.3148771524429321, + "loss_ib": 0.021379288285970688, + "step": 558 + }, + { + "ce_ib": 11.831758499145508, + "ce_orig": 1.0878973007202148, + "epoch": 0.16047163706952333, + "kl_loss": 1.344857096672058, + "loss_ib": 0.025280330330133438, + "step": 558 + }, + { + "ce_ib": 10.697997093200684, + "ce_orig": 0.9739592671394348, + "epoch": 0.16047163706952333, + "kl_loss": 1.2617008686065674, + "loss_ib": 0.023315005004405975, + "step": 558 + }, + { + "ce_ib": 10.034689903259277, + "ce_orig": 0.7774488925933838, + "epoch": 0.16047163706952333, + "kl_loss": 1.3792924880981445, + "loss_ib": 0.02382761426270008, + "step": 558 + }, + { + "ce_ib": 8.980086326599121, + "ce_orig": 0.5476792454719543, + "epoch": 0.16075922064850098, + "kl_loss": 1.3168349266052246, + "loss_ib": 0.022148434072732925, + "step": 559 + }, + { + "ce_ib": 7.035679340362549, + "ce_orig": 0.627990186214447, + "epoch": 0.16075922064850098, + "kl_loss": 1.3569344282150269, + "loss_ib": 0.020605022087693214, + "step": 559 + }, + { + "ce_ib": 12.099848747253418, + "ce_orig": 1.201551914215088, + "epoch": 0.16075922064850098, + "kl_loss": 1.2747890949249268, + "loss_ib": 0.024847740307450294, + "step": 559 + }, + { + "ce_ib": 8.865999221801758, + "ce_orig": 0.7412122488021851, + "epoch": 0.16075922064850098, + "kl_loss": 1.4771380424499512, + "loss_ib": 0.023637380450963974, + "step": 559 + }, + { + "epoch": 0.1610468042274786, + "grad_norm": 0.0971461683511734, + "learning_rate": 9.986129238305635e-06, + "loss": 0.8747, + "step": 560 + }, + { + "ce_ib": 7.6997246742248535, + "ce_orig": 0.6025680303573608, + "epoch": 0.1610468042274786, + "kl_loss": 1.276071310043335, + "loss_ib": 0.02046043798327446, + "step": 560 + }, + { + "ce_ib": 7.523832321166992, + "ce_orig": 0.7658670544624329, + "epoch": 0.1610468042274786, + "kl_loss": 1.4311625957489014, + "loss_ib": 0.021835457533597946, + "step": 560 + }, + { + "ce_ib": 11.750297546386719, + "ce_orig": 0.4812588095664978, + "epoch": 0.1610468042274786, + "kl_loss": 1.3406429290771484, + "loss_ib": 0.02515672706067562, + "step": 560 + }, + { + "ce_ib": 10.141862869262695, + "ce_orig": 0.8624674081802368, + "epoch": 0.1610468042274786, + "kl_loss": 1.345240831375122, + "loss_ib": 0.023594269528985023, + "step": 560 + }, + { + "ce_ib": 9.809609413146973, + "ce_orig": 0.9545562863349915, + "epoch": 0.16133438780645626, + "kl_loss": 1.27205228805542, + "loss_ib": 0.02253013104200363, + "step": 561 + }, + { + "ce_ib": 12.615915298461914, + "ce_orig": 1.327091932296753, + "epoch": 0.16133438780645626, + "kl_loss": 1.282986044883728, + "loss_ib": 0.02544577606022358, + "step": 561 + }, + { + "ce_ib": 10.288837432861328, + "ce_orig": 0.5523210763931274, + "epoch": 0.16133438780645626, + "kl_loss": 1.3225059509277344, + "loss_ib": 0.023513898253440857, + "step": 561 + }, + { + "ce_ib": 10.36892032623291, + "ce_orig": 0.6983376741409302, + "epoch": 0.16133438780645626, + "kl_loss": 1.2711155414581299, + "loss_ib": 0.02308007702231407, + "step": 561 + }, + { + "ce_ib": 14.066039085388184, + "ce_orig": 0.9940349459648132, + "epoch": 0.16162197138543388, + "kl_loss": 1.3311264514923096, + "loss_ib": 0.027377303689718246, + "step": 562 + }, + { + "ce_ib": 9.398420333862305, + "ce_orig": 1.150452733039856, + "epoch": 0.16162197138543388, + "kl_loss": 1.3074944019317627, + "loss_ib": 0.02247336320579052, + "step": 562 + }, + { + "ce_ib": 9.80187702178955, + "ce_orig": 0.8328919410705566, + "epoch": 0.16162197138543388, + "kl_loss": 1.331373929977417, + "loss_ib": 0.02311561442911625, + "step": 562 + }, + { + "ce_ib": 12.888148307800293, + "ce_orig": 1.2748291492462158, + "epoch": 0.16162197138543388, + "kl_loss": 1.2818001508712769, + "loss_ib": 0.025706149637699127, + "step": 562 + }, + { + "ce_ib": 8.798264503479004, + "ce_orig": 0.66322261095047, + "epoch": 0.16190955496441153, + "kl_loss": 1.3028491735458374, + "loss_ib": 0.02182675525546074, + "step": 563 + }, + { + "ce_ib": 13.072640419006348, + "ce_orig": 1.0565416812896729, + "epoch": 0.16190955496441153, + "kl_loss": 1.2858983278274536, + "loss_ib": 0.025931624695658684, + "step": 563 + }, + { + "ce_ib": 11.12070083618164, + "ce_orig": 0.8622493743896484, + "epoch": 0.16190955496441153, + "kl_loss": 1.2600901126861572, + "loss_ib": 0.0237216018140316, + "step": 563 + }, + { + "ce_ib": 11.012995719909668, + "ce_orig": 0.7809346914291382, + "epoch": 0.16190955496441153, + "kl_loss": 1.2810771465301514, + "loss_ib": 0.02382376603782177, + "step": 563 + }, + { + "ce_ib": 10.03192138671875, + "ce_orig": 0.5545583367347717, + "epoch": 0.16219713854338919, + "kl_loss": 1.372998833656311, + "loss_ib": 0.023761911317706108, + "step": 564 + }, + { + "ce_ib": 8.590304374694824, + "ce_orig": 0.7225477695465088, + "epoch": 0.16219713854338919, + "kl_loss": 1.3027560710906982, + "loss_ib": 0.021617865189909935, + "step": 564 + }, + { + "ce_ib": 10.352544784545898, + "ce_orig": 0.8774200081825256, + "epoch": 0.16219713854338919, + "kl_loss": 1.2814412117004395, + "loss_ib": 0.02316695638000965, + "step": 564 + }, + { + "ce_ib": 8.562765121459961, + "ce_orig": 0.6415224075317383, + "epoch": 0.16219713854338919, + "kl_loss": 1.2804956436157227, + "loss_ib": 0.021367721259593964, + "step": 564 + }, + { + "epoch": 0.1624847221223668, + "grad_norm": 0.08632536977529526, + "learning_rate": 9.98554555113021e-06, + "loss": 0.8462, + "step": 565 + }, + { + "ce_ib": 12.551013946533203, + "ce_orig": 0.995196521282196, + "epoch": 0.1624847221223668, + "kl_loss": 1.3475830554962158, + "loss_ib": 0.02602684497833252, + "step": 565 + }, + { + "ce_ib": 11.848214149475098, + "ce_orig": 1.4025741815567017, + "epoch": 0.1624847221223668, + "kl_loss": 1.2776107788085938, + "loss_ib": 0.024624323472380638, + "step": 565 + }, + { + "ce_ib": 8.570831298828125, + "ce_orig": 0.6328908205032349, + "epoch": 0.1624847221223668, + "kl_loss": 1.2646872997283936, + "loss_ib": 0.021217703819274902, + "step": 565 + }, + { + "ce_ib": 9.687134742736816, + "ce_orig": 0.7903947234153748, + "epoch": 0.1624847221223668, + "kl_loss": 1.3254048824310303, + "loss_ib": 0.022941183298826218, + "step": 565 + }, + { + "ce_ib": 13.013336181640625, + "ce_orig": 1.2271647453308105, + "epoch": 0.16277230570134446, + "kl_loss": 1.2702226638793945, + "loss_ib": 0.02571556344628334, + "step": 566 + }, + { + "ce_ib": 12.480305671691895, + "ce_orig": 1.1103395223617554, + "epoch": 0.16277230570134446, + "kl_loss": 1.2821930646896362, + "loss_ib": 0.025302235037088394, + "step": 566 + }, + { + "ce_ib": 9.443026542663574, + "ce_orig": 0.5126791596412659, + "epoch": 0.16277230570134446, + "kl_loss": 1.2745044231414795, + "loss_ib": 0.02218807116150856, + "step": 566 + }, + { + "ce_ib": 9.337321281433105, + "ce_orig": 0.7954445481300354, + "epoch": 0.16277230570134446, + "kl_loss": 1.2490813732147217, + "loss_ib": 0.02182813547551632, + "step": 566 + }, + { + "ce_ib": 10.778318405151367, + "ce_orig": 0.7537108659744263, + "epoch": 0.16305988928032208, + "kl_loss": 1.2937819957733154, + "loss_ib": 0.023716138675808907, + "step": 567 + }, + { + "ce_ib": 9.771125793457031, + "ce_orig": 0.40443235635757446, + "epoch": 0.16305988928032208, + "kl_loss": 1.3131227493286133, + "loss_ib": 0.0229023527354002, + "step": 567 + }, + { + "ce_ib": 9.85836124420166, + "ce_orig": 0.5563979744911194, + "epoch": 0.16305988928032208, + "kl_loss": 1.2832622528076172, + "loss_ib": 0.02269098162651062, + "step": 567 + }, + { + "ce_ib": 13.68719482421875, + "ce_orig": 0.9920021295547485, + "epoch": 0.16305988928032208, + "kl_loss": 1.2698428630828857, + "loss_ib": 0.02638562209904194, + "step": 567 + }, + { + "ce_ib": 9.411405563354492, + "ce_orig": 0.9342118501663208, + "epoch": 0.16334747285929974, + "kl_loss": 1.2332212924957275, + "loss_ib": 0.021743619814515114, + "step": 568 + }, + { + "ce_ib": 15.157500267028809, + "ce_orig": 1.5584248304367065, + "epoch": 0.16334747285929974, + "kl_loss": 1.2862458229064941, + "loss_ib": 0.02801995724439621, + "step": 568 + }, + { + "ce_ib": 10.411499977111816, + "ce_orig": 0.8281600475311279, + "epoch": 0.16334747285929974, + "kl_loss": 1.2496166229248047, + "loss_ib": 0.022907666862010956, + "step": 568 + }, + { + "ce_ib": 10.096942901611328, + "ce_orig": 0.6468956470489502, + "epoch": 0.16334747285929974, + "kl_loss": 1.2605764865875244, + "loss_ib": 0.02270270697772503, + "step": 568 + }, + { + "ce_ib": 9.528172492980957, + "ce_orig": 0.8783382773399353, + "epoch": 0.1636350564382774, + "kl_loss": 1.2387837171554565, + "loss_ib": 0.021916009485721588, + "step": 569 + }, + { + "ce_ib": 8.954733848571777, + "ce_orig": 0.8919200897216797, + "epoch": 0.1636350564382774, + "kl_loss": 1.2251062393188477, + "loss_ib": 0.02120579592883587, + "step": 569 + }, + { + "ce_ib": 8.508342742919922, + "ce_orig": 0.6990381479263306, + "epoch": 0.1636350564382774, + "kl_loss": 1.2213623523712158, + "loss_ib": 0.02072196640074253, + "step": 569 + }, + { + "ce_ib": 11.344082832336426, + "ce_orig": 0.9525802731513977, + "epoch": 0.1636350564382774, + "kl_loss": 1.248590111732483, + "loss_ib": 0.023829983547329903, + "step": 569 + }, + { + "epoch": 0.163922640017255, + "grad_norm": 0.09219575673341751, + "learning_rate": 9.984949852618381e-06, + "loss": 0.8852, + "step": 570 + }, + { + "ce_ib": 8.961785316467285, + "ce_orig": 0.7948698997497559, + "epoch": 0.163922640017255, + "kl_loss": 1.2222837209701538, + "loss_ib": 0.02118462324142456, + "step": 570 + }, + { + "ce_ib": 12.20908260345459, + "ce_orig": 0.9208235144615173, + "epoch": 0.163922640017255, + "kl_loss": 1.2455497980117798, + "loss_ib": 0.024664580821990967, + "step": 570 + }, + { + "ce_ib": 9.334521293640137, + "ce_orig": 0.9958298206329346, + "epoch": 0.163922640017255, + "kl_loss": 1.2763538360595703, + "loss_ib": 0.022098058834671974, + "step": 570 + }, + { + "ce_ib": 12.762809753417969, + "ce_orig": 1.1121208667755127, + "epoch": 0.163922640017255, + "kl_loss": 1.2300899028778076, + "loss_ib": 0.025063710287213326, + "step": 570 + }, + { + "ce_ib": 15.606366157531738, + "ce_orig": 1.6491622924804688, + "epoch": 0.16421022359623266, + "kl_loss": 1.2227786779403687, + "loss_ib": 0.027834152802824974, + "step": 571 + }, + { + "ce_ib": 10.236468315124512, + "ce_orig": 0.8589720726013184, + "epoch": 0.16421022359623266, + "kl_loss": 1.256370186805725, + "loss_ib": 0.02280016802251339, + "step": 571 + }, + { + "ce_ib": 9.943655967712402, + "ce_orig": 0.5925063490867615, + "epoch": 0.16421022359623266, + "kl_loss": 1.2559595108032227, + "loss_ib": 0.02250325120985508, + "step": 571 + }, + { + "ce_ib": 11.613914489746094, + "ce_orig": 0.7779126763343811, + "epoch": 0.16421022359623266, + "kl_loss": 1.392745018005371, + "loss_ib": 0.025541365146636963, + "step": 571 + }, + { + "ce_ib": 12.392518043518066, + "ce_orig": 0.4280785620212555, + "epoch": 0.1644978071752103, + "kl_loss": 1.3646981716156006, + "loss_ib": 0.026039499789476395, + "step": 572 + }, + { + "ce_ib": 9.918338775634766, + "ce_orig": 0.5322098731994629, + "epoch": 0.1644978071752103, + "kl_loss": 1.2963712215423584, + "loss_ib": 0.02288205176591873, + "step": 572 + }, + { + "ce_ib": 14.274211883544922, + "ce_orig": 1.3050851821899414, + "epoch": 0.1644978071752103, + "kl_loss": 1.2759932279586792, + "loss_ib": 0.02703414298593998, + "step": 572 + }, + { + "ce_ib": 11.698960304260254, + "ce_orig": 1.155306100845337, + "epoch": 0.1644978071752103, + "kl_loss": 1.2604784965515137, + "loss_ib": 0.024303745478391647, + "step": 572 + }, + { + "ce_ib": 10.412774085998535, + "ce_orig": 1.052548885345459, + "epoch": 0.16478539075418794, + "kl_loss": 1.2234094142913818, + "loss_ib": 0.022646868601441383, + "step": 573 + }, + { + "ce_ib": 10.732352256774902, + "ce_orig": 0.9245730042457581, + "epoch": 0.16478539075418794, + "kl_loss": 1.2317912578582764, + "loss_ib": 0.02305026538670063, + "step": 573 + }, + { + "ce_ib": 9.59011173248291, + "ce_orig": 0.7064767479896545, + "epoch": 0.16478539075418794, + "kl_loss": 1.224424123764038, + "loss_ib": 0.021834352985024452, + "step": 573 + }, + { + "ce_ib": 9.361687660217285, + "ce_orig": 0.8686097264289856, + "epoch": 0.16478539075418794, + "kl_loss": 1.1842972040176392, + "loss_ib": 0.021204659715294838, + "step": 573 + }, + { + "ce_ib": 7.736264228820801, + "ce_orig": 0.6079578995704651, + "epoch": 0.1650729743331656, + "kl_loss": 1.213087558746338, + "loss_ib": 0.01986713893711567, + "step": 574 + }, + { + "ce_ib": 11.044466972351074, + "ce_orig": 0.8884755969047546, + "epoch": 0.1650729743331656, + "kl_loss": 1.2339417934417725, + "loss_ib": 0.023383883759379387, + "step": 574 + }, + { + "ce_ib": 10.914957046508789, + "ce_orig": 0.9813688397407532, + "epoch": 0.1650729743331656, + "kl_loss": 1.2557547092437744, + "loss_ib": 0.023472504690289497, + "step": 574 + }, + { + "ce_ib": 13.932374000549316, + "ce_orig": 1.102638840675354, + "epoch": 0.1650729743331656, + "kl_loss": 1.2306591272354126, + "loss_ib": 0.02623896487057209, + "step": 574 + }, + { + "epoch": 0.1653605579121432, + "grad_norm": 0.09370694309473038, + "learning_rate": 9.984342144205327e-06, + "loss": 0.9041, + "step": 575 + }, + { + "ce_ib": 9.729941368103027, + "ce_orig": 0.8137699961662292, + "epoch": 0.1653605579121432, + "kl_loss": 1.218010663986206, + "loss_ib": 0.0219100471585989, + "step": 575 + }, + { + "ce_ib": 11.345166206359863, + "ce_orig": 0.8795411586761475, + "epoch": 0.1653605579121432, + "kl_loss": 1.238341212272644, + "loss_ib": 0.02372857742011547, + "step": 575 + }, + { + "ce_ib": 12.664711952209473, + "ce_orig": 1.4619799852371216, + "epoch": 0.1653605579121432, + "kl_loss": 1.2479004859924316, + "loss_ib": 0.025143718346953392, + "step": 575 + }, + { + "ce_ib": 7.3197340965271, + "ce_orig": 0.5423354506492615, + "epoch": 0.1653605579121432, + "kl_loss": 1.4118754863739014, + "loss_ib": 0.021438488736748695, + "step": 575 + }, + { + "ce_ib": 11.586548805236816, + "ce_orig": 0.8300837874412537, + "epoch": 0.16564814149112086, + "kl_loss": 1.2648472785949707, + "loss_ib": 0.024235021322965622, + "step": 576 + }, + { + "ce_ib": 9.670539855957031, + "ce_orig": 0.8994592428207397, + "epoch": 0.16564814149112086, + "kl_loss": 1.202571988105774, + "loss_ib": 0.021696260198950768, + "step": 576 + }, + { + "ce_ib": 5.678918838500977, + "ce_orig": 0.28384220600128174, + "epoch": 0.16564814149112086, + "kl_loss": 1.3513402938842773, + "loss_ib": 0.019192321226000786, + "step": 576 + }, + { + "ce_ib": 11.467476844787598, + "ce_orig": 0.5451651215553284, + "epoch": 0.16564814149112086, + "kl_loss": 1.2790919542312622, + "loss_ib": 0.02425839565694332, + "step": 576 + }, + { + "ce_ib": 8.113414764404297, + "ce_orig": 0.7992563843727112, + "epoch": 0.1659357250700985, + "kl_loss": 1.1959567070007324, + "loss_ib": 0.02007298171520233, + "step": 577 + }, + { + "ce_ib": 6.4770002365112305, + "ce_orig": 0.4411783218383789, + "epoch": 0.1659357250700985, + "kl_loss": 1.2061271667480469, + "loss_ib": 0.018538272008299828, + "step": 577 + }, + { + "ce_ib": 9.995020866394043, + "ce_orig": 0.5499460697174072, + "epoch": 0.1659357250700985, + "kl_loss": 1.2197155952453613, + "loss_ib": 0.022192176431417465, + "step": 577 + }, + { + "ce_ib": 5.73984956741333, + "ce_orig": 0.6255266666412354, + "epoch": 0.1659357250700985, + "kl_loss": 1.1914690732955933, + "loss_ib": 0.0176545400172472, + "step": 577 + }, + { + "ce_ib": 12.113245964050293, + "ce_orig": 1.025524377822876, + "epoch": 0.16622330864907614, + "kl_loss": 1.2450977563858032, + "loss_ib": 0.024564223363995552, + "step": 578 + }, + { + "ce_ib": 5.700209617614746, + "ce_orig": 0.28980499505996704, + "epoch": 0.16622330864907614, + "kl_loss": 1.2563034296035767, + "loss_ib": 0.01826324500143528, + "step": 578 + }, + { + "ce_ib": 12.162859916687012, + "ce_orig": 0.8536310195922852, + "epoch": 0.16622330864907614, + "kl_loss": 1.220404028892517, + "loss_ib": 0.024366900324821472, + "step": 578 + }, + { + "ce_ib": 12.775392532348633, + "ce_orig": 1.3573917150497437, + "epoch": 0.16622330864907614, + "kl_loss": 1.1893036365509033, + "loss_ib": 0.024668429046869278, + "step": 578 + }, + { + "ce_ib": 13.641351699829102, + "ce_orig": 1.5364866256713867, + "epoch": 0.1665108922280538, + "kl_loss": 1.261054277420044, + "loss_ib": 0.026251891627907753, + "step": 579 + }, + { + "ce_ib": 11.468977928161621, + "ce_orig": 1.2083629369735718, + "epoch": 0.1665108922280538, + "kl_loss": 1.2402012348175049, + "loss_ib": 0.023870989680290222, + "step": 579 + }, + { + "ce_ib": 14.304160118103027, + "ce_orig": 1.1799192428588867, + "epoch": 0.1665108922280538, + "kl_loss": 1.2703068256378174, + "loss_ib": 0.027007225900888443, + "step": 579 + }, + { + "ce_ib": 13.134546279907227, + "ce_orig": 1.2673437595367432, + "epoch": 0.1665108922280538, + "kl_loss": 1.1939582824707031, + "loss_ib": 0.025074128061532974, + "step": 579 + }, + { + "epoch": 0.16679847580703142, + "grad_norm": 0.08239021897315979, + "learning_rate": 9.983722427355157e-06, + "loss": 0.9056, + "step": 580 + }, + { + "ce_ib": 13.295975685119629, + "ce_orig": 1.0868593454360962, + "epoch": 0.16679847580703142, + "kl_loss": 1.2228111028671265, + "loss_ib": 0.025524087250232697, + "step": 580 + }, + { + "ce_ib": 6.748361110687256, + "ce_orig": 0.49680906534194946, + "epoch": 0.16679847580703142, + "kl_loss": 1.2128095626831055, + "loss_ib": 0.0188764575868845, + "step": 580 + }, + { + "ce_ib": 13.180870056152344, + "ce_orig": 0.8746943473815918, + "epoch": 0.16679847580703142, + "kl_loss": 1.2134785652160645, + "loss_ib": 0.025315655395388603, + "step": 580 + }, + { + "ce_ib": 12.037747383117676, + "ce_orig": 0.9534928202629089, + "epoch": 0.16679847580703142, + "kl_loss": 1.2241135835647583, + "loss_ib": 0.024278882890939713, + "step": 580 + }, + { + "ce_ib": 8.912028312683105, + "ce_orig": 0.7467697858810425, + "epoch": 0.16708605938600907, + "kl_loss": 1.1925266981124878, + "loss_ib": 0.020837293937802315, + "step": 581 + }, + { + "ce_ib": 9.094566345214844, + "ce_orig": 0.6505690813064575, + "epoch": 0.16708605938600907, + "kl_loss": 1.1850394010543823, + "loss_ib": 0.020944960415363312, + "step": 581 + }, + { + "ce_ib": 12.56716251373291, + "ce_orig": 1.2765976190567017, + "epoch": 0.16708605938600907, + "kl_loss": 1.2733515501022339, + "loss_ib": 0.025300677865743637, + "step": 581 + }, + { + "ce_ib": 13.4862699508667, + "ce_orig": 1.0705842971801758, + "epoch": 0.16708605938600907, + "kl_loss": 1.2059423923492432, + "loss_ib": 0.025545692071318626, + "step": 581 + }, + { + "ce_ib": 8.2174711227417, + "ce_orig": 0.7286640405654907, + "epoch": 0.1673736429649867, + "kl_loss": 1.2195181846618652, + "loss_ib": 0.02041265182197094, + "step": 582 + }, + { + "ce_ib": 13.159209251403809, + "ce_orig": 1.2096375226974487, + "epoch": 0.1673736429649867, + "kl_loss": 1.2165024280548096, + "loss_ib": 0.02532423473894596, + "step": 582 + }, + { + "ce_ib": 11.165864944458008, + "ce_orig": 0.7757768630981445, + "epoch": 0.1673736429649867, + "kl_loss": 1.1681039333343506, + "loss_ib": 0.022846903651952744, + "step": 582 + }, + { + "ce_ib": 8.71281623840332, + "ce_orig": 0.7169674634933472, + "epoch": 0.1673736429649867, + "kl_loss": 1.191447377204895, + "loss_ib": 0.02062728814780712, + "step": 582 + }, + { + "ce_ib": 9.235655784606934, + "ce_orig": 0.6690336465835571, + "epoch": 0.16766122654396434, + "kl_loss": 1.2143634557724, + "loss_ib": 0.021379288285970688, + "step": 583 + }, + { + "ce_ib": 8.412071228027344, + "ce_orig": 0.843682587146759, + "epoch": 0.16766122654396434, + "kl_loss": 1.2117249965667725, + "loss_ib": 0.02052932232618332, + "step": 583 + }, + { + "ce_ib": 10.572442054748535, + "ce_orig": 0.7864252328872681, + "epoch": 0.16766122654396434, + "kl_loss": 1.2299147844314575, + "loss_ib": 0.0228715892881155, + "step": 583 + }, + { + "ce_ib": 8.599762916564941, + "ce_orig": 0.7645632028579712, + "epoch": 0.16766122654396434, + "kl_loss": 1.2004691362380981, + "loss_ib": 0.02060445211827755, + "step": 583 + }, + { + "ce_ib": 11.767807960510254, + "ce_orig": 1.0832188129425049, + "epoch": 0.167948810122942, + "kl_loss": 1.1508519649505615, + "loss_ib": 0.023276329040527344, + "step": 584 + }, + { + "ce_ib": 9.03587818145752, + "ce_orig": 0.7372077107429504, + "epoch": 0.167948810122942, + "kl_loss": 1.1607894897460938, + "loss_ib": 0.020643772557377815, + "step": 584 + }, + { + "ce_ib": 11.698123931884766, + "ce_orig": 0.8797475695610046, + "epoch": 0.167948810122942, + "kl_loss": 1.2153360843658447, + "loss_ib": 0.023851484060287476, + "step": 584 + }, + { + "ce_ib": 14.928531646728516, + "ce_orig": 1.3900351524353027, + "epoch": 0.167948810122942, + "kl_loss": 1.276613473892212, + "loss_ib": 0.027694664895534515, + "step": 584 + }, + { + "epoch": 0.16823639370191962, + "grad_norm": 0.11208527535200119, + "learning_rate": 9.983090703560911e-06, + "loss": 0.8947, + "step": 585 + }, + { + "ce_ib": 11.16375732421875, + "ce_orig": 0.9449269771575928, + "epoch": 0.16823639370191962, + "kl_loss": 1.197243094444275, + "loss_ib": 0.023136189207434654, + "step": 585 + }, + { + "ce_ib": 9.852100372314453, + "ce_orig": 1.078002691268921, + "epoch": 0.16823639370191962, + "kl_loss": 1.1607494354248047, + "loss_ib": 0.021459592506289482, + "step": 585 + }, + { + "ce_ib": 11.235018730163574, + "ce_orig": 1.1319299936294556, + "epoch": 0.16823639370191962, + "kl_loss": 1.2304571866989136, + "loss_ib": 0.0235395897179842, + "step": 585 + }, + { + "ce_ib": 7.328647613525391, + "ce_orig": 0.9203292727470398, + "epoch": 0.16823639370191962, + "kl_loss": 1.1338272094726562, + "loss_ib": 0.018666919320821762, + "step": 585 + }, + { + "ce_ib": 7.884848117828369, + "ce_orig": 0.5528172850608826, + "epoch": 0.16852397728089727, + "kl_loss": 1.2366106510162354, + "loss_ib": 0.02025095373392105, + "step": 586 + }, + { + "ce_ib": 13.856200218200684, + "ce_orig": 0.9960865378379822, + "epoch": 0.16852397728089727, + "kl_loss": 1.2090309858322144, + "loss_ib": 0.02594650909304619, + "step": 586 + }, + { + "ce_ib": 9.895005226135254, + "ce_orig": 0.5365419387817383, + "epoch": 0.16852397728089727, + "kl_loss": 1.1710314750671387, + "loss_ib": 0.021605320274829865, + "step": 586 + }, + { + "ce_ib": 11.017683029174805, + "ce_orig": 0.8266361951828003, + "epoch": 0.16852397728089727, + "kl_loss": 1.2141588926315308, + "loss_ib": 0.023159272968769073, + "step": 586 + }, + { + "ce_ib": 11.70738410949707, + "ce_orig": 0.9832698702812195, + "epoch": 0.1688115608598749, + "kl_loss": 1.1586453914642334, + "loss_ib": 0.023293837904930115, + "step": 587 + }, + { + "ce_ib": 8.602079391479492, + "ce_orig": 0.4699603319168091, + "epoch": 0.1688115608598749, + "kl_loss": 1.2227380275726318, + "loss_ib": 0.02082945965230465, + "step": 587 + }, + { + "ce_ib": 14.872537612915039, + "ce_orig": 1.5698349475860596, + "epoch": 0.1688115608598749, + "kl_loss": 1.2066720724105835, + "loss_ib": 0.026939257979393005, + "step": 587 + }, + { + "ce_ib": 9.357597351074219, + "ce_orig": 0.9672819972038269, + "epoch": 0.1688115608598749, + "kl_loss": 1.1683168411254883, + "loss_ib": 0.021040765568614006, + "step": 587 + }, + { + "ce_ib": 12.048833847045898, + "ce_orig": 1.0971179008483887, + "epoch": 0.16909914443885254, + "kl_loss": 1.2246365547180176, + "loss_ib": 0.024295201525092125, + "step": 588 + }, + { + "ce_ib": 7.782527446746826, + "ce_orig": 0.43647029995918274, + "epoch": 0.16909914443885254, + "kl_loss": 1.1583232879638672, + "loss_ib": 0.019365761429071426, + "step": 588 + }, + { + "ce_ib": 7.263584136962891, + "ce_orig": 0.7806444764137268, + "epoch": 0.16909914443885254, + "kl_loss": 1.1670253276824951, + "loss_ib": 0.018933836370706558, + "step": 588 + }, + { + "ce_ib": 5.9214959144592285, + "ce_orig": 0.5120716094970703, + "epoch": 0.16909914443885254, + "kl_loss": 1.1531264781951904, + "loss_ib": 0.01745275966823101, + "step": 588 + }, + { + "ce_ib": 11.046984672546387, + "ce_orig": 1.098747730255127, + "epoch": 0.16938672801783017, + "kl_loss": 1.141385793685913, + "loss_ib": 0.02246084250509739, + "step": 589 + }, + { + "ce_ib": 12.190630912780762, + "ce_orig": 0.7543506622314453, + "epoch": 0.16938672801783017, + "kl_loss": 1.1634085178375244, + "loss_ib": 0.023824715986847878, + "step": 589 + }, + { + "ce_ib": 11.490245819091797, + "ce_orig": 1.0767667293548584, + "epoch": 0.16938672801783017, + "kl_loss": 1.2043827772140503, + "loss_ib": 0.023534072563052177, + "step": 589 + }, + { + "ce_ib": 10.710217475891113, + "ce_orig": 0.803483247756958, + "epoch": 0.16938672801783017, + "kl_loss": 1.1858105659484863, + "loss_ib": 0.022568322718143463, + "step": 589 + }, + { + "epoch": 0.16967431159680782, + "grad_norm": 0.09646889567375183, + "learning_rate": 9.982446974344561e-06, + "loss": 0.893, + "step": 590 + }, + { + "ce_ib": 7.601869583129883, + "ce_orig": 0.4696982204914093, + "epoch": 0.16967431159680782, + "kl_loss": 1.1903643608093262, + "loss_ib": 0.019505511969327927, + "step": 590 + }, + { + "ce_ib": 9.697887420654297, + "ce_orig": 0.4679652452468872, + "epoch": 0.16967431159680782, + "kl_loss": 1.1964094638824463, + "loss_ib": 0.02166198194026947, + "step": 590 + }, + { + "ce_ib": 5.316531658172607, + "ce_orig": 0.6131843328475952, + "epoch": 0.16967431159680782, + "kl_loss": 1.1232414245605469, + "loss_ib": 0.016548944637179375, + "step": 590 + }, + { + "ce_ib": 11.569339752197266, + "ce_orig": 0.532830536365509, + "epoch": 0.16967431159680782, + "kl_loss": 1.2179789543151855, + "loss_ib": 0.023749129846692085, + "step": 590 + }, + { + "ce_ib": 14.859304428100586, + "ce_orig": 1.699377417564392, + "epoch": 0.16996189517578547, + "kl_loss": 1.1641483306884766, + "loss_ib": 0.02650078758597374, + "step": 591 + }, + { + "ce_ib": 12.174185752868652, + "ce_orig": 1.0933891534805298, + "epoch": 0.16996189517578547, + "kl_loss": 1.205374002456665, + "loss_ib": 0.024227924644947052, + "step": 591 + }, + { + "ce_ib": 12.189451217651367, + "ce_orig": 1.1713759899139404, + "epoch": 0.16996189517578547, + "kl_loss": 1.1780178546905518, + "loss_ib": 0.023969629779458046, + "step": 591 + }, + { + "ce_ib": 6.584260940551758, + "ce_orig": 0.531925618648529, + "epoch": 0.16996189517578547, + "kl_loss": 1.1629016399383545, + "loss_ib": 0.01821327582001686, + "step": 591 + }, + { + "ce_ib": 10.2810697555542, + "ce_orig": 1.2692686319351196, + "epoch": 0.1702494787547631, + "kl_loss": 1.1622142791748047, + "loss_ib": 0.02190321311354637, + "step": 592 + }, + { + "ce_ib": 8.695658683776855, + "ce_orig": 1.1439961194992065, + "epoch": 0.1702494787547631, + "kl_loss": 1.2565195560455322, + "loss_ib": 0.021260853856801987, + "step": 592 + }, + { + "ce_ib": 12.925812721252441, + "ce_orig": 1.113416314125061, + "epoch": 0.1702494787547631, + "kl_loss": 1.1489940881729126, + "loss_ib": 0.02441575564444065, + "step": 592 + }, + { + "ce_ib": 8.097929954528809, + "ce_orig": 0.3692007064819336, + "epoch": 0.1702494787547631, + "kl_loss": 1.2639890909194946, + "loss_ib": 0.020737819373607635, + "step": 592 + }, + { + "ce_ib": 12.470227241516113, + "ce_orig": 1.292494535446167, + "epoch": 0.17053706233374075, + "kl_loss": 1.2025644779205322, + "loss_ib": 0.024495873600244522, + "step": 593 + }, + { + "ce_ib": 8.987156867980957, + "ce_orig": 0.7146295309066772, + "epoch": 0.17053706233374075, + "kl_loss": 1.148221492767334, + "loss_ib": 0.02046937122941017, + "step": 593 + }, + { + "ce_ib": 8.649067878723145, + "ce_orig": 0.6290068626403809, + "epoch": 0.17053706233374075, + "kl_loss": 1.1626759767532349, + "loss_ib": 0.020275825634598732, + "step": 593 + }, + { + "ce_ib": 9.381182670593262, + "ce_orig": 0.8150485157966614, + "epoch": 0.17053706233374075, + "kl_loss": 1.1636794805526733, + "loss_ib": 0.021017977967858315, + "step": 593 + }, + { + "ce_ib": 13.075675010681152, + "ce_orig": 1.3097158670425415, + "epoch": 0.17082464591271837, + "kl_loss": 1.1714305877685547, + "loss_ib": 0.024789981544017792, + "step": 594 + }, + { + "ce_ib": 16.90681266784668, + "ce_orig": 1.63013756275177, + "epoch": 0.17082464591271837, + "kl_loss": 1.1639704704284668, + "loss_ib": 0.028546517714858055, + "step": 594 + }, + { + "ce_ib": 9.945943832397461, + "ce_orig": 0.620954692363739, + "epoch": 0.17082464591271837, + "kl_loss": 1.1276791095733643, + "loss_ib": 0.021222734823822975, + "step": 594 + }, + { + "ce_ib": 7.538568496704102, + "ce_orig": 0.7477220296859741, + "epoch": 0.17082464591271837, + "kl_loss": 1.1129953861236572, + "loss_ib": 0.01866852305829525, + "step": 594 + }, + { + "epoch": 0.17111222949169602, + "grad_norm": 0.0950080156326294, + "learning_rate": 9.981791241257001e-06, + "loss": 0.8499, + "step": 595 + }, + { + "ce_ib": 9.64914321899414, + "ce_orig": 0.8221632242202759, + "epoch": 0.17111222949169602, + "kl_loss": 1.2603493928909302, + "loss_ib": 0.022252636030316353, + "step": 595 + }, + { + "ce_ib": 10.428970336914062, + "ce_orig": 0.9476694464683533, + "epoch": 0.17111222949169602, + "kl_loss": 1.2723057270050049, + "loss_ib": 0.023152027279138565, + "step": 595 + }, + { + "ce_ib": 11.565075874328613, + "ce_orig": 0.7016614675521851, + "epoch": 0.17111222949169602, + "kl_loss": 1.214961290359497, + "loss_ib": 0.023714689537882805, + "step": 595 + }, + { + "ce_ib": 7.760899066925049, + "ce_orig": 0.655583918094635, + "epoch": 0.17111222949169602, + "kl_loss": 1.1367559432983398, + "loss_ib": 0.019128458574414253, + "step": 595 + }, + { + "ce_ib": 13.004273414611816, + "ce_orig": 1.171642541885376, + "epoch": 0.17139981307067367, + "kl_loss": 1.1165239810943604, + "loss_ib": 0.02416951209306717, + "step": 596 + }, + { + "ce_ib": 8.871601104736328, + "ce_orig": 0.6587203145027161, + "epoch": 0.17139981307067367, + "kl_loss": 1.1157963275909424, + "loss_ib": 0.020029563456773758, + "step": 596 + }, + { + "ce_ib": 8.543595314025879, + "ce_orig": 0.680071234703064, + "epoch": 0.17139981307067367, + "kl_loss": 1.2704293727874756, + "loss_ib": 0.02124788984656334, + "step": 596 + }, + { + "ce_ib": 12.145650863647461, + "ce_orig": 0.9769390821456909, + "epoch": 0.17139981307067367, + "kl_loss": 1.1289055347442627, + "loss_ib": 0.023434706032276154, + "step": 596 + }, + { + "ce_ib": 11.884757041931152, + "ce_orig": 0.7717639803886414, + "epoch": 0.1716873966496513, + "kl_loss": 1.1151626110076904, + "loss_ib": 0.02303638495504856, + "step": 597 + }, + { + "ce_ib": 13.524641036987305, + "ce_orig": 1.0390403270721436, + "epoch": 0.1716873966496513, + "kl_loss": 1.1430152654647827, + "loss_ib": 0.024954792112112045, + "step": 597 + }, + { + "ce_ib": 10.59398365020752, + "ce_orig": 1.0760669708251953, + "epoch": 0.1716873966496513, + "kl_loss": 1.267032265663147, + "loss_ib": 0.023264307528734207, + "step": 597 + }, + { + "ce_ib": 9.16975212097168, + "ce_orig": 0.7946443557739258, + "epoch": 0.1716873966496513, + "kl_loss": 1.332892894744873, + "loss_ib": 0.022498680278658867, + "step": 597 + }, + { + "ce_ib": 15.392538070678711, + "ce_orig": 1.7411152124404907, + "epoch": 0.17197498022862895, + "kl_loss": 1.1676359176635742, + "loss_ib": 0.02706889621913433, + "step": 598 + }, + { + "ce_ib": 10.269882202148438, + "ce_orig": 0.9134522676467896, + "epoch": 0.17197498022862895, + "kl_loss": 1.14839768409729, + "loss_ib": 0.021753858774900436, + "step": 598 + }, + { + "ce_ib": 8.251395225524902, + "ce_orig": 0.6777662634849548, + "epoch": 0.17197498022862895, + "kl_loss": 1.0949749946594238, + "loss_ib": 0.019201144576072693, + "step": 598 + }, + { + "ce_ib": 7.8162455558776855, + "ce_orig": 0.5873789191246033, + "epoch": 0.17197498022862895, + "kl_loss": 1.1189738512039185, + "loss_ib": 0.01900598406791687, + "step": 598 + }, + { + "ce_ib": 8.662796974182129, + "ce_orig": 0.919073760509491, + "epoch": 0.17226256380760657, + "kl_loss": 1.1068514585494995, + "loss_ib": 0.01973131112754345, + "step": 599 + }, + { + "ce_ib": 11.996581077575684, + "ce_orig": 1.1948639154434204, + "epoch": 0.17226256380760657, + "kl_loss": 1.1755080223083496, + "loss_ib": 0.02375166118144989, + "step": 599 + }, + { + "ce_ib": 12.551753044128418, + "ce_orig": 1.4086652994155884, + "epoch": 0.17226256380760657, + "kl_loss": 1.1488478183746338, + "loss_ib": 0.024040231481194496, + "step": 599 + }, + { + "ce_ib": 12.489664077758789, + "ce_orig": 1.4374010562896729, + "epoch": 0.17226256380760657, + "kl_loss": 1.163723111152649, + "loss_ib": 0.024126896634697914, + "step": 599 + }, + { + "epoch": 0.17255014738658422, + "grad_norm": 0.08578155934810638, + "learning_rate": 9.98112350587804e-06, + "loss": 0.8373, + "step": 600 + }, + { + "ce_ib": 14.228236198425293, + "ce_orig": 1.494513750076294, + "epoch": 0.17255014738658422, + "kl_loss": 1.1346684694290161, + "loss_ib": 0.025574922561645508, + "step": 600 + }, + { + "ce_ib": 10.47590160369873, + "ce_orig": 0.6758707165718079, + "epoch": 0.17255014738658422, + "kl_loss": 1.149749755859375, + "loss_ib": 0.021973399445414543, + "step": 600 + }, + { + "ce_ib": 9.56252670288086, + "ce_orig": 0.6861965656280518, + "epoch": 0.17255014738658422, + "kl_loss": 1.146235704421997, + "loss_ib": 0.021024884656071663, + "step": 600 + }, + { + "ce_ib": 8.769157409667969, + "ce_orig": 0.6426496505737305, + "epoch": 0.17255014738658422, + "kl_loss": 1.1051974296569824, + "loss_ib": 0.019821131601929665, + "step": 600 + }, + { + "ce_ib": 6.642595291137695, + "ce_orig": 0.5588423013687134, + "epoch": 0.17283773096556188, + "kl_loss": 1.1010701656341553, + "loss_ib": 0.017653297632932663, + "step": 601 + }, + { + "ce_ib": 10.748079299926758, + "ce_orig": 0.837460458278656, + "epoch": 0.17283773096556188, + "kl_loss": 1.1604368686676025, + "loss_ib": 0.022352447733283043, + "step": 601 + }, + { + "ce_ib": 8.990370750427246, + "ce_orig": 0.8383088707923889, + "epoch": 0.17283773096556188, + "kl_loss": 1.1006824970245361, + "loss_ib": 0.01999719627201557, + "step": 601 + }, + { + "ce_ib": 9.197964668273926, + "ce_orig": 0.686692476272583, + "epoch": 0.17283773096556188, + "kl_loss": 1.1443568468093872, + "loss_ib": 0.02064153179526329, + "step": 601 + }, + { + "ce_ib": 8.66501522064209, + "ce_orig": 0.525370180606842, + "epoch": 0.1731253145445395, + "kl_loss": 1.1227505207061768, + "loss_ib": 0.01989252120256424, + "step": 602 + }, + { + "ce_ib": 11.858187675476074, + "ce_orig": 0.8876397013664246, + "epoch": 0.1731253145445395, + "kl_loss": 1.1747143268585205, + "loss_ib": 0.023605331778526306, + "step": 602 + }, + { + "ce_ib": 6.1840105056762695, + "ce_orig": 0.6205452680587769, + "epoch": 0.1731253145445395, + "kl_loss": 1.0966415405273438, + "loss_ib": 0.017150426283478737, + "step": 602 + }, + { + "ce_ib": 11.21971607208252, + "ce_orig": 0.9764306545257568, + "epoch": 0.1731253145445395, + "kl_loss": 1.116091012954712, + "loss_ib": 0.02238062582910061, + "step": 602 + }, + { + "ce_ib": 6.916097164154053, + "ce_orig": 0.5436660051345825, + "epoch": 0.17341289812351715, + "kl_loss": 1.1902745962142944, + "loss_ib": 0.018818842247128487, + "step": 603 + }, + { + "ce_ib": 8.293400764465332, + "ce_orig": 0.8351038694381714, + "epoch": 0.17341289812351715, + "kl_loss": 1.1848680973052979, + "loss_ib": 0.0201420821249485, + "step": 603 + }, + { + "ce_ib": 6.613530158996582, + "ce_orig": 0.18875178694725037, + "epoch": 0.17341289812351715, + "kl_loss": 1.1903159618377686, + "loss_ib": 0.01851668953895569, + "step": 603 + }, + { + "ce_ib": 11.624032020568848, + "ce_orig": 1.285302758216858, + "epoch": 0.17341289812351715, + "kl_loss": 1.1025352478027344, + "loss_ib": 0.022649383172392845, + "step": 603 + }, + { + "ce_ib": 15.579438209533691, + "ce_orig": 1.5998573303222656, + "epoch": 0.17370048170249477, + "kl_loss": 1.2801098823547363, + "loss_ib": 0.028380535542964935, + "step": 604 + }, + { + "ce_ib": 12.44752025604248, + "ce_orig": 1.4038703441619873, + "epoch": 0.17370048170249477, + "kl_loss": 1.1902649402618408, + "loss_ib": 0.02435017004609108, + "step": 604 + }, + { + "ce_ib": 10.84005355834961, + "ce_orig": 0.6248370409011841, + "epoch": 0.17370048170249477, + "kl_loss": 1.1507587432861328, + "loss_ib": 0.022347640246152878, + "step": 604 + }, + { + "ce_ib": 8.29090404510498, + "ce_orig": 0.7321332097053528, + "epoch": 0.17370048170249477, + "kl_loss": 1.1337003707885742, + "loss_ib": 0.019627906382083893, + "step": 604 + }, + { + "epoch": 0.17398806528147243, + "grad_norm": 0.08757986128330231, + "learning_rate": 9.980443769816412e-06, + "loss": 0.8879, + "step": 605 + }, + { + "ce_ib": 16.679454803466797, + "ce_orig": 1.7097995281219482, + "epoch": 0.17398806528147243, + "kl_loss": 1.1676082611083984, + "loss_ib": 0.028355535119771957, + "step": 605 + }, + { + "ce_ib": 8.775374412536621, + "ce_orig": 1.0261955261230469, + "epoch": 0.17398806528147243, + "kl_loss": 1.0867377519607544, + "loss_ib": 0.019642751663923264, + "step": 605 + }, + { + "ce_ib": 9.250091552734375, + "ce_orig": 0.7789836525917053, + "epoch": 0.17398806528147243, + "kl_loss": 1.149935007095337, + "loss_ib": 0.020749442279338837, + "step": 605 + }, + { + "ce_ib": 9.42496395111084, + "ce_orig": 0.8384370803833008, + "epoch": 0.17398806528147243, + "kl_loss": 1.1811635494232178, + "loss_ib": 0.02123660035431385, + "step": 605 + }, + { + "ce_ib": 15.161707878112793, + "ce_orig": 2.020648717880249, + "epoch": 0.17427564886045008, + "kl_loss": 1.1757433414459229, + "loss_ib": 0.026919139549136162, + "step": 606 + }, + { + "ce_ib": 4.969954013824463, + "ce_orig": 0.5350307822227478, + "epoch": 0.17427564886045008, + "kl_loss": 1.0870544910430908, + "loss_ib": 0.015840498730540276, + "step": 606 + }, + { + "ce_ib": 10.833635330200195, + "ce_orig": 1.334272861480713, + "epoch": 0.17427564886045008, + "kl_loss": 1.116763949394226, + "loss_ib": 0.022001275792717934, + "step": 606 + }, + { + "ce_ib": 13.298807144165039, + "ce_orig": 1.2746291160583496, + "epoch": 0.17427564886045008, + "kl_loss": 1.1182126998901367, + "loss_ib": 0.02448093332350254, + "step": 606 + }, + { + "ce_ib": 8.028116226196289, + "ce_orig": 0.2780713737010956, + "epoch": 0.1745632324394277, + "kl_loss": 1.3144466876983643, + "loss_ib": 0.02117258310317993, + "step": 607 + }, + { + "ce_ib": 9.471242904663086, + "ce_orig": 0.8981790542602539, + "epoch": 0.1745632324394277, + "kl_loss": 1.1512048244476318, + "loss_ib": 0.020983289927244186, + "step": 607 + }, + { + "ce_ib": 11.815406799316406, + "ce_orig": 0.8096030950546265, + "epoch": 0.1745632324394277, + "kl_loss": 1.0907784700393677, + "loss_ib": 0.02272319234907627, + "step": 607 + }, + { + "ce_ib": 8.835672378540039, + "ce_orig": 0.8250407576560974, + "epoch": 0.1745632324394277, + "kl_loss": 1.0707385540008545, + "loss_ib": 0.019543059170246124, + "step": 607 + }, + { + "ce_ib": 14.99797248840332, + "ce_orig": 1.081514835357666, + "epoch": 0.17485081601840535, + "kl_loss": 1.3153797388076782, + "loss_ib": 0.028151769191026688, + "step": 608 + }, + { + "ce_ib": 13.794142723083496, + "ce_orig": 1.310433268547058, + "epoch": 0.17485081601840535, + "kl_loss": 1.173295497894287, + "loss_ib": 0.025527097284793854, + "step": 608 + }, + { + "ce_ib": 6.75504732131958, + "ce_orig": 0.7089630365371704, + "epoch": 0.17485081601840535, + "kl_loss": 1.1751198768615723, + "loss_ib": 0.01850624568760395, + "step": 608 + }, + { + "ce_ib": 10.636677742004395, + "ce_orig": 0.9004390239715576, + "epoch": 0.17485081601840535, + "kl_loss": 1.1462738513946533, + "loss_ib": 0.022099414840340614, + "step": 608 + }, + { + "ce_ib": 16.850482940673828, + "ce_orig": 1.1504744291305542, + "epoch": 0.17513839959738298, + "kl_loss": 1.2235054969787598, + "loss_ib": 0.029085537418723106, + "step": 609 + }, + { + "ce_ib": 13.915315628051758, + "ce_orig": 0.9233617782592773, + "epoch": 0.17513839959738298, + "kl_loss": 1.2110941410064697, + "loss_ib": 0.026026258245110512, + "step": 609 + }, + { + "ce_ib": 11.495623588562012, + "ce_orig": 0.9102531671524048, + "epoch": 0.17513839959738298, + "kl_loss": 1.1143162250518799, + "loss_ib": 0.02263878472149372, + "step": 609 + }, + { + "ce_ib": 11.01639175415039, + "ce_orig": 1.2989511489868164, + "epoch": 0.17513839959738298, + "kl_loss": 1.1308627128601074, + "loss_ib": 0.022325018420815468, + "step": 609 + }, + { + "epoch": 0.17542598317636063, + "grad_norm": 0.09487691521644592, + "learning_rate": 9.979752034709756e-06, + "loss": 0.943, + "step": 610 + }, + { + "ce_ib": 9.711048126220703, + "ce_orig": 0.9693747162818909, + "epoch": 0.17542598317636063, + "kl_loss": 1.074932336807251, + "loss_ib": 0.020460370928049088, + "step": 610 + }, + { + "ce_ib": 14.960775375366211, + "ce_orig": 1.6066879034042358, + "epoch": 0.17542598317636063, + "kl_loss": 1.1578710079193115, + "loss_ib": 0.026539484038949013, + "step": 610 + }, + { + "ce_ib": 11.009191513061523, + "ce_orig": 0.753725528717041, + "epoch": 0.17542598317636063, + "kl_loss": 1.1147217750549316, + "loss_ib": 0.022156409919261932, + "step": 610 + }, + { + "ce_ib": 7.617627143859863, + "ce_orig": 0.5374597311019897, + "epoch": 0.17542598317636063, + "kl_loss": 1.0917261838912964, + "loss_ib": 0.018534889444708824, + "step": 610 + }, + { + "ce_ib": 9.379159927368164, + "ce_orig": 0.839108943939209, + "epoch": 0.17571356675533828, + "kl_loss": 1.1200788021087646, + "loss_ib": 0.020579947158694267, + "step": 611 + }, + { + "ce_ib": 9.289336204528809, + "ce_orig": 0.5786874890327454, + "epoch": 0.17571356675533828, + "kl_loss": 1.251037836074829, + "loss_ib": 0.021799713373184204, + "step": 611 + }, + { + "ce_ib": 6.256180286407471, + "ce_orig": 0.496167927980423, + "epoch": 0.17571356675533828, + "kl_loss": 1.1585159301757812, + "loss_ib": 0.017841339111328125, + "step": 611 + }, + { + "ce_ib": 15.044721603393555, + "ce_orig": 1.532287836074829, + "epoch": 0.17571356675533828, + "kl_loss": 1.1178569793701172, + "loss_ib": 0.02622329257428646, + "step": 611 + }, + { + "ce_ib": 8.996905326843262, + "ce_orig": 0.9385986328125, + "epoch": 0.1760011503343159, + "kl_loss": 1.0947903394699097, + "loss_ib": 0.0199448075145483, + "step": 612 + }, + { + "ce_ib": 13.072503089904785, + "ce_orig": 0.988423764705658, + "epoch": 0.1760011503343159, + "kl_loss": 1.1813626289367676, + "loss_ib": 0.024886131286621094, + "step": 612 + }, + { + "ce_ib": 8.529322624206543, + "ce_orig": 0.733309805393219, + "epoch": 0.1760011503343159, + "kl_loss": 1.091768741607666, + "loss_ib": 0.01944701001048088, + "step": 612 + }, + { + "ce_ib": 12.893845558166504, + "ce_orig": 1.074021339416504, + "epoch": 0.1760011503343159, + "kl_loss": 1.106937050819397, + "loss_ib": 0.023963216692209244, + "step": 612 + }, + { + "ce_ib": 12.352333068847656, + "ce_orig": 0.9047135710716248, + "epoch": 0.17628873391329355, + "kl_loss": 1.1527860164642334, + "loss_ib": 0.023880193009972572, + "step": 613 + }, + { + "ce_ib": 11.772270202636719, + "ce_orig": 0.8379567265510559, + "epoch": 0.17628873391329355, + "kl_loss": 1.1471657752990723, + "loss_ib": 0.02324392832815647, + "step": 613 + }, + { + "ce_ib": 8.202688217163086, + "ce_orig": 0.5956844091415405, + "epoch": 0.17628873391329355, + "kl_loss": 1.133353352546692, + "loss_ib": 0.019536221399903297, + "step": 613 + }, + { + "ce_ib": 13.991854667663574, + "ce_orig": 1.4801995754241943, + "epoch": 0.17628873391329355, + "kl_loss": 1.1663241386413574, + "loss_ib": 0.025655098259449005, + "step": 613 + }, + { + "ce_ib": 9.825779914855957, + "ce_orig": 0.581002950668335, + "epoch": 0.17657631749227118, + "kl_loss": 1.0918254852294922, + "loss_ib": 0.02074403502047062, + "step": 614 + }, + { + "ce_ib": 9.005266189575195, + "ce_orig": 0.6865787506103516, + "epoch": 0.17657631749227118, + "kl_loss": 1.0758470296859741, + "loss_ib": 0.019763736054301262, + "step": 614 + }, + { + "ce_ib": 8.376243591308594, + "ce_orig": 0.7356083393096924, + "epoch": 0.17657631749227118, + "kl_loss": 1.0604243278503418, + "loss_ib": 0.018980486318469048, + "step": 614 + }, + { + "ce_ib": 10.727423667907715, + "ce_orig": 0.5603511333465576, + "epoch": 0.17657631749227118, + "kl_loss": 1.1472487449645996, + "loss_ib": 0.02219991199672222, + "step": 614 + }, + { + "epoch": 0.17686390107124883, + "grad_norm": 0.09426393359899521, + "learning_rate": 9.979048302224624e-06, + "loss": 0.8892, + "step": 615 + }, + { + "ce_ib": 13.396039962768555, + "ce_orig": 1.0348241329193115, + "epoch": 0.17686390107124883, + "kl_loss": 1.1215533018112183, + "loss_ib": 0.024611571803689003, + "step": 615 + }, + { + "ce_ib": 13.98047924041748, + "ce_orig": 0.8722472190856934, + "epoch": 0.17686390107124883, + "kl_loss": 1.12811279296875, + "loss_ib": 0.02526160702109337, + "step": 615 + }, + { + "ce_ib": 6.756248950958252, + "ce_orig": 0.654313862323761, + "epoch": 0.17686390107124883, + "kl_loss": 1.0764236450195312, + "loss_ib": 0.017520485445857048, + "step": 615 + }, + { + "ce_ib": 7.255735397338867, + "ce_orig": 0.381597638130188, + "epoch": 0.17686390107124883, + "kl_loss": 1.1145398616790771, + "loss_ib": 0.01840113289654255, + "step": 615 + }, + { + "ce_ib": 11.688261032104492, + "ce_orig": 0.495026558637619, + "epoch": 0.17715148465022648, + "kl_loss": 1.1114025115966797, + "loss_ib": 0.022802285850048065, + "step": 616 + }, + { + "ce_ib": 9.351430892944336, + "ce_orig": 0.6223934292793274, + "epoch": 0.17715148465022648, + "kl_loss": 1.0816258192062378, + "loss_ib": 0.02016768977046013, + "step": 616 + }, + { + "ce_ib": 7.122152328491211, + "ce_orig": 0.6948713660240173, + "epoch": 0.17715148465022648, + "kl_loss": 1.10568106174469, + "loss_ib": 0.018178964033722878, + "step": 616 + }, + { + "ce_ib": 6.703441619873047, + "ce_orig": 0.7388578653335571, + "epoch": 0.17715148465022648, + "kl_loss": 1.0877346992492676, + "loss_ib": 0.0175807885825634, + "step": 616 + }, + { + "ce_ib": 7.440734386444092, + "ce_orig": 0.5273018479347229, + "epoch": 0.1774390682292041, + "kl_loss": 1.0771890878677368, + "loss_ib": 0.01821262575685978, + "step": 617 + }, + { + "ce_ib": 7.850268363952637, + "ce_orig": 0.6726818680763245, + "epoch": 0.1774390682292041, + "kl_loss": 1.0832056999206543, + "loss_ib": 0.01868232525885105, + "step": 617 + }, + { + "ce_ib": 11.564708709716797, + "ce_orig": 0.9945627450942993, + "epoch": 0.1774390682292041, + "kl_loss": 1.079153299331665, + "loss_ib": 0.022356241941452026, + "step": 617 + }, + { + "ce_ib": 9.48259449005127, + "ce_orig": 1.3080958127975464, + "epoch": 0.1774390682292041, + "kl_loss": 1.0585891008377075, + "loss_ib": 0.020068485289812088, + "step": 617 + }, + { + "ce_ib": 6.543670654296875, + "ce_orig": 0.6545018553733826, + "epoch": 0.17772665180818176, + "kl_loss": 1.0263237953186035, + "loss_ib": 0.016806907951831818, + "step": 618 + }, + { + "ce_ib": 14.908156394958496, + "ce_orig": 1.3185038566589355, + "epoch": 0.17772665180818176, + "kl_loss": 1.0666757822036743, + "loss_ib": 0.02557491324841976, + "step": 618 + }, + { + "ce_ib": 7.1459832191467285, + "ce_orig": 0.6320990920066833, + "epoch": 0.17772665180818176, + "kl_loss": 1.0741521120071411, + "loss_ib": 0.017887502908706665, + "step": 618 + }, + { + "ce_ib": 13.496872901916504, + "ce_orig": 1.4044506549835205, + "epoch": 0.17772665180818176, + "kl_loss": 1.0919251441955566, + "loss_ib": 0.0244161244481802, + "step": 618 + }, + { + "ce_ib": 11.312828063964844, + "ce_orig": 0.8793609738349915, + "epoch": 0.17801423538715938, + "kl_loss": 1.1443016529083252, + "loss_ib": 0.02275584451854229, + "step": 619 + }, + { + "ce_ib": 6.740575790405273, + "ce_orig": 0.5719377398490906, + "epoch": 0.17801423538715938, + "kl_loss": 1.0880985260009766, + "loss_ib": 0.017621560022234917, + "step": 619 + }, + { + "ce_ib": 5.965404033660889, + "ce_orig": 0.7937454581260681, + "epoch": 0.17801423538715938, + "kl_loss": 1.054682970046997, + "loss_ib": 0.016512233763933182, + "step": 619 + }, + { + "ce_ib": 12.703347206115723, + "ce_orig": 0.7259251475334167, + "epoch": 0.17801423538715938, + "kl_loss": 1.1318557262420654, + "loss_ib": 0.024021903052926064, + "step": 619 + }, + { + "epoch": 0.17830181896613703, + "grad_norm": 0.10790643841028214, + "learning_rate": 9.978332574056468e-06, + "loss": 0.8558, + "step": 620 + }, + { + "ce_ib": 7.719181537628174, + "ce_orig": 0.7105019092559814, + "epoch": 0.17830181896613703, + "kl_loss": 1.0935070514678955, + "loss_ib": 0.018654251471161842, + "step": 620 + }, + { + "ce_ib": 7.3571600914001465, + "ce_orig": 0.46813029050827026, + "epoch": 0.17830181896613703, + "kl_loss": 1.1948972940444946, + "loss_ib": 0.019306132569909096, + "step": 620 + }, + { + "ce_ib": 15.235627174377441, + "ce_orig": 1.8518890142440796, + "epoch": 0.17830181896613703, + "kl_loss": 1.1301660537719727, + "loss_ib": 0.02653728984296322, + "step": 620 + }, + { + "ce_ib": 9.883748054504395, + "ce_orig": 0.7214540839195251, + "epoch": 0.17830181896613703, + "kl_loss": 1.0958552360534668, + "loss_ib": 0.0208422988653183, + "step": 620 + }, + { + "ce_ib": 9.291598320007324, + "ce_orig": 0.8704307079315186, + "epoch": 0.17858940254511468, + "kl_loss": 1.1199560165405273, + "loss_ib": 0.020491158589720726, + "step": 621 + }, + { + "ce_ib": 7.236078262329102, + "ce_orig": 0.6710290312767029, + "epoch": 0.17858940254511468, + "kl_loss": 1.0643848180770874, + "loss_ib": 0.017879927530884743, + "step": 621 + }, + { + "ce_ib": 10.731121063232422, + "ce_orig": 0.8992137908935547, + "epoch": 0.17858940254511468, + "kl_loss": 1.0887260437011719, + "loss_ib": 0.021618379279971123, + "step": 621 + }, + { + "ce_ib": 6.691609859466553, + "ce_orig": 0.23901374638080597, + "epoch": 0.17858940254511468, + "kl_loss": 1.2286667823791504, + "loss_ib": 0.01897827908396721, + "step": 621 + }, + { + "ce_ib": 10.243730545043945, + "ce_orig": 0.6851696968078613, + "epoch": 0.1788769861240923, + "kl_loss": 1.1422102451324463, + "loss_ib": 0.02166583202779293, + "step": 622 + }, + { + "ce_ib": 9.9014892578125, + "ce_orig": 0.861434280872345, + "epoch": 0.1788769861240923, + "kl_loss": 1.1246778964996338, + "loss_ib": 0.02114826813340187, + "step": 622 + }, + { + "ce_ib": 9.937746047973633, + "ce_orig": 0.7701823711395264, + "epoch": 0.1788769861240923, + "kl_loss": 1.0272612571716309, + "loss_ib": 0.02021035924553871, + "step": 622 + }, + { + "ce_ib": 11.04038143157959, + "ce_orig": 1.1329587697982788, + "epoch": 0.1788769861240923, + "kl_loss": 1.0876765251159668, + "loss_ib": 0.021917147561907768, + "step": 622 + }, + { + "ce_ib": 9.774174690246582, + "ce_orig": 0.6889547109603882, + "epoch": 0.17916456970306996, + "kl_loss": 1.1281490325927734, + "loss_ib": 0.021055664867162704, + "step": 623 + }, + { + "ce_ib": 8.301161766052246, + "ce_orig": 0.9744248390197754, + "epoch": 0.17916456970306996, + "kl_loss": 1.1025155782699585, + "loss_ib": 0.019326316192746162, + "step": 623 + }, + { + "ce_ib": 11.626615524291992, + "ce_orig": 1.0212253332138062, + "epoch": 0.17916456970306996, + "kl_loss": 1.1386442184448242, + "loss_ib": 0.023013057187199593, + "step": 623 + }, + { + "ce_ib": 9.228303909301758, + "ce_orig": 0.9091984629631042, + "epoch": 0.17916456970306996, + "kl_loss": 1.0976030826568604, + "loss_ib": 0.02020433358848095, + "step": 623 + }, + { + "ce_ib": 12.572793960571289, + "ce_orig": 1.365990161895752, + "epoch": 0.17945215328204758, + "kl_loss": 1.0628557205200195, + "loss_ib": 0.0232013501226902, + "step": 624 + }, + { + "ce_ib": 6.557443141937256, + "ce_orig": 0.38862401247024536, + "epoch": 0.17945215328204758, + "kl_loss": 1.0641001462936401, + "loss_ib": 0.01719844341278076, + "step": 624 + }, + { + "ce_ib": 11.218859672546387, + "ce_orig": 1.0366816520690918, + "epoch": 0.17945215328204758, + "kl_loss": 1.0765401124954224, + "loss_ib": 0.021984262391924858, + "step": 624 + }, + { + "ce_ib": 13.108721733093262, + "ce_orig": 1.2099130153656006, + "epoch": 0.17945215328204758, + "kl_loss": 1.0831751823425293, + "loss_ib": 0.023940471932291985, + "step": 624 + }, + { + "epoch": 0.17973973686102523, + "grad_norm": 0.0922677144408226, + "learning_rate": 9.977604851929648e-06, + "loss": 0.9102, + "step": 625 + }, + { + "ce_ib": 8.263596534729004, + "ce_orig": 0.6577824354171753, + "epoch": 0.17973973686102523, + "kl_loss": 1.1678799390792847, + "loss_ib": 0.019942395389080048, + "step": 625 + }, + { + "ce_ib": 7.4331560134887695, + "ce_orig": 0.6097143888473511, + "epoch": 0.17973973686102523, + "kl_loss": 1.0634379386901855, + "loss_ib": 0.018067536875605583, + "step": 625 + }, + { + "ce_ib": 9.178628921508789, + "ce_orig": 0.8557302355766296, + "epoch": 0.17973973686102523, + "kl_loss": 1.1199026107788086, + "loss_ib": 0.02037765458226204, + "step": 625 + }, + { + "ce_ib": 8.077595710754395, + "ce_orig": 0.7694026231765747, + "epoch": 0.17973973686102523, + "kl_loss": 0.9992647171020508, + "loss_ib": 0.018070241436362267, + "step": 625 + }, + { + "ce_ib": 14.997899055480957, + "ce_orig": 1.3085542917251587, + "epoch": 0.18002732044000289, + "kl_loss": 1.0909315347671509, + "loss_ib": 0.025907214730978012, + "step": 626 + }, + { + "ce_ib": 14.065059661865234, + "ce_orig": 1.3615686893463135, + "epoch": 0.18002732044000289, + "kl_loss": 1.1272248029708862, + "loss_ib": 0.025337306782603264, + "step": 626 + }, + { + "ce_ib": 9.213851928710938, + "ce_orig": 0.4737341105937958, + "epoch": 0.18002732044000289, + "kl_loss": 1.1103893518447876, + "loss_ib": 0.020317744463682175, + "step": 626 + }, + { + "ce_ib": 10.141777992248535, + "ce_orig": 1.0340423583984375, + "epoch": 0.18002732044000289, + "kl_loss": 1.1753777265548706, + "loss_ib": 0.021895555779337883, + "step": 626 + }, + { + "ce_ib": 10.606011390686035, + "ce_orig": 0.5892968773841858, + "epoch": 0.1803149040189805, + "kl_loss": 1.1311826705932617, + "loss_ib": 0.021917838603258133, + "step": 627 + }, + { + "ce_ib": 8.124170303344727, + "ce_orig": 0.42015740275382996, + "epoch": 0.1803149040189805, + "kl_loss": 1.108632206916809, + "loss_ib": 0.019210491329431534, + "step": 627 + }, + { + "ce_ib": 10.349467277526855, + "ce_orig": 0.7760807275772095, + "epoch": 0.1803149040189805, + "kl_loss": 1.0804953575134277, + "loss_ib": 0.02115442231297493, + "step": 627 + }, + { + "ce_ib": 8.495221138000488, + "ce_orig": 0.5720870494842529, + "epoch": 0.1803149040189805, + "kl_loss": 1.1552854776382446, + "loss_ib": 0.020048074424266815, + "step": 627 + }, + { + "ce_ib": 13.331109046936035, + "ce_orig": 1.5061442852020264, + "epoch": 0.18060248759795816, + "kl_loss": 1.1464059352874756, + "loss_ib": 0.0247951690107584, + "step": 628 + }, + { + "ce_ib": 11.41611099243164, + "ce_orig": 1.1628519296646118, + "epoch": 0.18060248759795816, + "kl_loss": 1.0600998401641846, + "loss_ib": 0.022017108276486397, + "step": 628 + }, + { + "ce_ib": 12.698983192443848, + "ce_orig": 1.187642216682434, + "epoch": 0.18060248759795816, + "kl_loss": 1.1207376718521118, + "loss_ib": 0.02390635944902897, + "step": 628 + }, + { + "ce_ib": 15.049531936645508, + "ce_orig": 1.8772828578948975, + "epoch": 0.18060248759795816, + "kl_loss": 1.087199330329895, + "loss_ib": 0.025921525433659554, + "step": 628 + }, + { + "ce_ib": 8.250436782836914, + "ce_orig": 0.7029029130935669, + "epoch": 0.18089007117693578, + "kl_loss": 1.1021440029144287, + "loss_ib": 0.01927187666296959, + "step": 629 + }, + { + "ce_ib": 13.998225212097168, + "ce_orig": 1.2588441371917725, + "epoch": 0.18089007117693578, + "kl_loss": 1.080070972442627, + "loss_ib": 0.024798937141895294, + "step": 629 + }, + { + "ce_ib": 12.915166854858398, + "ce_orig": 1.296819806098938, + "epoch": 0.18089007117693578, + "kl_loss": 1.1545573472976685, + "loss_ib": 0.024460740387439728, + "step": 629 + }, + { + "ce_ib": 9.678729057312012, + "ce_orig": 0.5961591601371765, + "epoch": 0.18089007117693578, + "kl_loss": 1.220086932182312, + "loss_ib": 0.02187959849834442, + "step": 629 + }, + { + "epoch": 0.18117765475591344, + "grad_norm": 0.0966140478849411, + "learning_rate": 9.97686513759741e-06, + "loss": 0.9272, + "step": 630 + }, + { + "ce_ib": 13.0838623046875, + "ce_orig": 1.260634422302246, + "epoch": 0.18117765475591344, + "kl_loss": 1.1091606616973877, + "loss_ib": 0.02417546696960926, + "step": 630 + }, + { + "ce_ib": 6.934885025024414, + "ce_orig": 0.4972129762172699, + "epoch": 0.18117765475591344, + "kl_loss": 1.0627329349517822, + "loss_ib": 0.01756221428513527, + "step": 630 + }, + { + "ce_ib": 12.18768310546875, + "ce_orig": 1.4976786375045776, + "epoch": 0.18117765475591344, + "kl_loss": 1.0954440832138062, + "loss_ib": 0.023142123594880104, + "step": 630 + }, + { + "ce_ib": 11.109347343444824, + "ce_orig": 1.1459025144577026, + "epoch": 0.18117765475591344, + "kl_loss": 1.054826259613037, + "loss_ib": 0.021657610312104225, + "step": 630 + }, + { + "ce_ib": 10.95541763305664, + "ce_orig": 0.6836705207824707, + "epoch": 0.1814652383348911, + "kl_loss": 1.0429816246032715, + "loss_ib": 0.021385235711932182, + "step": 631 + }, + { + "ce_ib": 10.109951972961426, + "ce_orig": 0.8095191717147827, + "epoch": 0.1814652383348911, + "kl_loss": 1.072096347808838, + "loss_ib": 0.020830916240811348, + "step": 631 + }, + { + "ce_ib": 7.40610408782959, + "ce_orig": 0.6426340341567993, + "epoch": 0.1814652383348911, + "kl_loss": 1.0597965717315674, + "loss_ib": 0.018004069104790688, + "step": 631 + }, + { + "ce_ib": 8.1773681640625, + "ce_orig": 0.9187619090080261, + "epoch": 0.1814652383348911, + "kl_loss": 1.0321749448776245, + "loss_ib": 0.018499117344617844, + "step": 631 + }, + { + "ce_ib": 7.07262659072876, + "ce_orig": 0.8202535510063171, + "epoch": 0.1817528219138687, + "kl_loss": 0.9864460229873657, + "loss_ib": 0.01693708635866642, + "step": 632 + }, + { + "ce_ib": 7.180476188659668, + "ce_orig": 0.4838648736476898, + "epoch": 0.1817528219138687, + "kl_loss": 1.0089023113250732, + "loss_ib": 0.017269499599933624, + "step": 632 + }, + { + "ce_ib": 15.649883270263672, + "ce_orig": 1.6325815916061401, + "epoch": 0.1817528219138687, + "kl_loss": 1.1197898387908936, + "loss_ib": 0.026847781613469124, + "step": 632 + }, + { + "ce_ib": 10.30772590637207, + "ce_orig": 0.8685612082481384, + "epoch": 0.1817528219138687, + "kl_loss": 1.0974705219268799, + "loss_ib": 0.02128242887556553, + "step": 632 + }, + { + "ce_ib": 8.978386878967285, + "ce_orig": 1.0552774667739868, + "epoch": 0.18204040549284636, + "kl_loss": 1.012761116027832, + "loss_ib": 0.019105996936559677, + "step": 633 + }, + { + "ce_ib": 10.887259483337402, + "ce_orig": 1.5764906406402588, + "epoch": 0.18204040549284636, + "kl_loss": 1.0496618747711182, + "loss_ib": 0.021383875980973244, + "step": 633 + }, + { + "ce_ib": 10.534551620483398, + "ce_orig": 1.1252496242523193, + "epoch": 0.18204040549284636, + "kl_loss": 1.0642296075820923, + "loss_ib": 0.02117684856057167, + "step": 633 + }, + { + "ce_ib": 9.471675872802734, + "ce_orig": 0.3606927990913391, + "epoch": 0.18204040549284636, + "kl_loss": 1.1879955530166626, + "loss_ib": 0.021351629868149757, + "step": 633 + }, + { + "ce_ib": 9.976670265197754, + "ce_orig": 1.006685495376587, + "epoch": 0.182327989071824, + "kl_loss": 1.0557842254638672, + "loss_ib": 0.020534511655569077, + "step": 634 + }, + { + "ce_ib": 8.402043342590332, + "ce_orig": 1.0922396183013916, + "epoch": 0.182327989071824, + "kl_loss": 1.023686408996582, + "loss_ib": 0.018638907000422478, + "step": 634 + }, + { + "ce_ib": 10.423868179321289, + "ce_orig": 0.9032129049301147, + "epoch": 0.182327989071824, + "kl_loss": 1.1368285417556763, + "loss_ib": 0.021792152896523476, + "step": 634 + }, + { + "ce_ib": 7.688118934631348, + "ce_orig": 0.46169134974479675, + "epoch": 0.182327989071824, + "kl_loss": 1.095708966255188, + "loss_ib": 0.018645208328962326, + "step": 634 + }, + { + "epoch": 0.18261557265080164, + "grad_norm": 0.09018189460039139, + "learning_rate": 9.976113432841903e-06, + "loss": 0.9332, + "step": 635 + }, + { + "ce_ib": 9.740583419799805, + "ce_orig": 0.7694171667098999, + "epoch": 0.18261557265080164, + "kl_loss": 1.0555506944656372, + "loss_ib": 0.020296089351177216, + "step": 635 + }, + { + "ce_ib": 8.057018280029297, + "ce_orig": 0.48739856481552124, + "epoch": 0.18261557265080164, + "kl_loss": 1.2050068378448486, + "loss_ib": 0.02010708674788475, + "step": 635 + }, + { + "ce_ib": 8.886861801147461, + "ce_orig": 0.4765666127204895, + "epoch": 0.18261557265080164, + "kl_loss": 1.071749210357666, + "loss_ib": 0.019604353234171867, + "step": 635 + }, + { + "ce_ib": 13.106306076049805, + "ce_orig": 0.9705209136009216, + "epoch": 0.18261557265080164, + "kl_loss": 1.1007217168807983, + "loss_ib": 0.024113522842526436, + "step": 635 + }, + { + "ce_ib": 10.954051971435547, + "ce_orig": 1.050213098526001, + "epoch": 0.1829031562297793, + "kl_loss": 1.066004991531372, + "loss_ib": 0.02161410264670849, + "step": 636 + }, + { + "ce_ib": 9.91700553894043, + "ce_orig": 0.680712103843689, + "epoch": 0.1829031562297793, + "kl_loss": 1.132023572921753, + "loss_ib": 0.021237241104245186, + "step": 636 + }, + { + "ce_ib": 8.767101287841797, + "ce_orig": 0.44354522228240967, + "epoch": 0.1829031562297793, + "kl_loss": 1.108346939086914, + "loss_ib": 0.01985057070851326, + "step": 636 + }, + { + "ce_ib": 9.128534317016602, + "ce_orig": 0.3988248407840729, + "epoch": 0.1829031562297793, + "kl_loss": 1.086971402168274, + "loss_ib": 0.019998246803879738, + "step": 636 + }, + { + "ce_ib": 13.582477569580078, + "ce_orig": 1.6579183340072632, + "epoch": 0.1831907398087569, + "kl_loss": 1.0730781555175781, + "loss_ib": 0.02431325800716877, + "step": 637 + }, + { + "ce_ib": 17.059223175048828, + "ce_orig": 1.6747536659240723, + "epoch": 0.1831907398087569, + "kl_loss": 1.1021394729614258, + "loss_ib": 0.028080618008971214, + "step": 637 + }, + { + "ce_ib": 13.071850776672363, + "ce_orig": 1.5738227367401123, + "epoch": 0.1831907398087569, + "kl_loss": 1.1231281757354736, + "loss_ib": 0.0243031308054924, + "step": 637 + }, + { + "ce_ib": 9.79615592956543, + "ce_orig": 1.0085848569869995, + "epoch": 0.1831907398087569, + "kl_loss": 1.0980044603347778, + "loss_ib": 0.02077619917690754, + "step": 637 + }, + { + "ce_ib": 6.036836624145508, + "ce_orig": 0.4214065670967102, + "epoch": 0.18347832338773457, + "kl_loss": 1.0429325103759766, + "loss_ib": 0.016466161236166954, + "step": 638 + }, + { + "ce_ib": 7.146793365478516, + "ce_orig": 0.6582375764846802, + "epoch": 0.18347832338773457, + "kl_loss": 1.0144261121749878, + "loss_ib": 0.017291054129600525, + "step": 638 + }, + { + "ce_ib": 13.084831237792969, + "ce_orig": 1.4547115564346313, + "epoch": 0.18347832338773457, + "kl_loss": 1.0451858043670654, + "loss_ib": 0.023536689579486847, + "step": 638 + }, + { + "ce_ib": 10.764507293701172, + "ce_orig": 1.1796584129333496, + "epoch": 0.18347832338773457, + "kl_loss": 1.077453851699829, + "loss_ib": 0.02153904363512993, + "step": 638 + }, + { + "ce_ib": 4.687036514282227, + "ce_orig": 0.18363694846630096, + "epoch": 0.1837659069667122, + "kl_loss": 1.1889199018478394, + "loss_ib": 0.016576234251260757, + "step": 639 + }, + { + "ce_ib": 10.766698837280273, + "ce_orig": 0.825289785861969, + "epoch": 0.1837659069667122, + "kl_loss": 1.0548924207687378, + "loss_ib": 0.021315621212124825, + "step": 639 + }, + { + "ce_ib": 8.863663673400879, + "ce_orig": 0.8204382061958313, + "epoch": 0.1837659069667122, + "kl_loss": 1.019961953163147, + "loss_ib": 0.019063282757997513, + "step": 639 + }, + { + "ce_ib": 13.531327247619629, + "ce_orig": 1.1665120124816895, + "epoch": 0.1837659069667122, + "kl_loss": 1.0381747484207153, + "loss_ib": 0.023913072422146797, + "step": 639 + }, + { + "epoch": 0.18405349054568984, + "grad_norm": 0.09696701914072037, + "learning_rate": 9.975349739474156e-06, + "loss": 0.8875, + "step": 640 + }, + { + "ce_ib": 7.491870403289795, + "ce_orig": 0.907927930355072, + "epoch": 0.18405349054568984, + "kl_loss": 0.9900725483894348, + "loss_ib": 0.01739259622991085, + "step": 640 + }, + { + "ce_ib": 7.663341999053955, + "ce_orig": 0.5722526907920837, + "epoch": 0.18405349054568984, + "kl_loss": 1.0283775329589844, + "loss_ib": 0.01794711872935295, + "step": 640 + }, + { + "ce_ib": 10.040138244628906, + "ce_orig": 0.7578917145729065, + "epoch": 0.18405349054568984, + "kl_loss": 1.0339435338974, + "loss_ib": 0.020379573106765747, + "step": 640 + }, + { + "ce_ib": 11.124543190002441, + "ce_orig": 1.2751201391220093, + "epoch": 0.18405349054568984, + "kl_loss": 1.0301541090011597, + "loss_ib": 0.021426083520054817, + "step": 640 + }, + { + "ce_ib": 11.990524291992188, + "ce_orig": 1.2356369495391846, + "epoch": 0.1843410741246675, + "kl_loss": 1.1035387516021729, + "loss_ib": 0.023025913164019585, + "step": 641 + }, + { + "ce_ib": 7.862361431121826, + "ce_orig": 0.964101254940033, + "epoch": 0.1843410741246675, + "kl_loss": 1.1734020709991455, + "loss_ib": 0.01959638111293316, + "step": 641 + }, + { + "ce_ib": 9.207853317260742, + "ce_orig": 0.9062885642051697, + "epoch": 0.1843410741246675, + "kl_loss": 1.020951747894287, + "loss_ib": 0.019417371600866318, + "step": 641 + }, + { + "ce_ib": 9.713889122009277, + "ce_orig": 1.302318811416626, + "epoch": 0.1843410741246675, + "kl_loss": 1.1001968383789062, + "loss_ib": 0.020715856924653053, + "step": 641 + }, + { + "ce_ib": 6.821298599243164, + "ce_orig": 0.7004828453063965, + "epoch": 0.18462865770364512, + "kl_loss": 0.9664976596832275, + "loss_ib": 0.01648627407848835, + "step": 642 + }, + { + "ce_ib": 9.508224487304688, + "ce_orig": 0.832171618938446, + "epoch": 0.18462865770364512, + "kl_loss": 1.0534615516662598, + "loss_ib": 0.020042838528752327, + "step": 642 + }, + { + "ce_ib": 7.55943489074707, + "ce_orig": 0.6746954917907715, + "epoch": 0.18462865770364512, + "kl_loss": 1.0909643173217773, + "loss_ib": 0.018469078466296196, + "step": 642 + }, + { + "ce_ib": 7.165235996246338, + "ce_orig": 0.9785995483398438, + "epoch": 0.18462865770364512, + "kl_loss": 1.0123915672302246, + "loss_ib": 0.01728915236890316, + "step": 642 + }, + { + "ce_ib": 10.234342575073242, + "ce_orig": 0.9009053707122803, + "epoch": 0.18491624128262277, + "kl_loss": 1.1004221439361572, + "loss_ib": 0.02123856544494629, + "step": 643 + }, + { + "ce_ib": 8.069269180297852, + "ce_orig": 0.7381336688995361, + "epoch": 0.18491624128262277, + "kl_loss": 1.0298247337341309, + "loss_ib": 0.01836751587688923, + "step": 643 + }, + { + "ce_ib": 10.050087928771973, + "ce_orig": 0.9887573719024658, + "epoch": 0.18491624128262277, + "kl_loss": 1.0247390270233154, + "loss_ib": 0.020297478884458542, + "step": 643 + }, + { + "ce_ib": 14.767660140991211, + "ce_orig": 1.0759152173995972, + "epoch": 0.18491624128262277, + "kl_loss": 1.0658990144729614, + "loss_ib": 0.025426648557186127, + "step": 643 + }, + { + "ce_ib": 9.213264465332031, + "ce_orig": 0.7979077696800232, + "epoch": 0.1852038248616004, + "kl_loss": 1.041956901550293, + "loss_ib": 0.01963283307850361, + "step": 644 + }, + { + "ce_ib": 11.142786979675293, + "ce_orig": 1.003310203552246, + "epoch": 0.1852038248616004, + "kl_loss": 1.024578332901001, + "loss_ib": 0.021388567984104156, + "step": 644 + }, + { + "ce_ib": 9.134848594665527, + "ce_orig": 0.9812929034233093, + "epoch": 0.1852038248616004, + "kl_loss": 1.1117109060287476, + "loss_ib": 0.020251957699656487, + "step": 644 + }, + { + "ce_ib": 12.999911308288574, + "ce_orig": 1.423545479774475, + "epoch": 0.1852038248616004, + "kl_loss": 1.049558401107788, + "loss_ib": 0.023495495319366455, + "step": 644 + }, + { + "epoch": 0.18549140844057804, + "grad_norm": 0.09764409065246582, + "learning_rate": 9.974574059334082e-06, + "loss": 0.9161, + "step": 645 + }, + { + "ce_ib": 14.430760383605957, + "ce_orig": 0.45323994755744934, + "epoch": 0.18549140844057804, + "kl_loss": 1.1187865734100342, + "loss_ib": 0.025618623942136765, + "step": 645 + }, + { + "ce_ib": 6.7074151039123535, + "ce_orig": 0.8436351418495178, + "epoch": 0.18549140844057804, + "kl_loss": 0.9737348556518555, + "loss_ib": 0.01644476316869259, + "step": 645 + }, + { + "ce_ib": 9.11927604675293, + "ce_orig": 1.0630453824996948, + "epoch": 0.18549140844057804, + "kl_loss": 0.966637134552002, + "loss_ib": 0.018785648047924042, + "step": 645 + }, + { + "ce_ib": 9.039983749389648, + "ce_orig": 0.4303601086139679, + "epoch": 0.18549140844057804, + "kl_loss": 1.0904786586761475, + "loss_ib": 0.019944770261645317, + "step": 645 + }, + { + "ce_ib": 9.382161140441895, + "ce_orig": 0.8849090933799744, + "epoch": 0.1857789920195557, + "kl_loss": 1.0450866222381592, + "loss_ib": 0.019833028316497803, + "step": 646 + }, + { + "ce_ib": 7.5546064376831055, + "ce_orig": 0.5726062655448914, + "epoch": 0.1857789920195557, + "kl_loss": 1.0881624221801758, + "loss_ib": 0.018436230719089508, + "step": 646 + }, + { + "ce_ib": 8.01627254486084, + "ce_orig": 0.65036940574646, + "epoch": 0.1857789920195557, + "kl_loss": 1.0229861736297607, + "loss_ib": 0.018246134743094444, + "step": 646 + }, + { + "ce_ib": 7.372588634490967, + "ce_orig": 0.6817443370819092, + "epoch": 0.1857789920195557, + "kl_loss": 0.9919678568840027, + "loss_ib": 0.017292266711592674, + "step": 646 + }, + { + "ce_ib": 10.641061782836914, + "ce_orig": 0.9732003211975098, + "epoch": 0.18606657559853332, + "kl_loss": 1.0023996829986572, + "loss_ib": 0.020665058866143227, + "step": 647 + }, + { + "ce_ib": 10.230724334716797, + "ce_orig": 0.749370276927948, + "epoch": 0.18606657559853332, + "kl_loss": 1.0023596286773682, + "loss_ib": 0.02025432139635086, + "step": 647 + }, + { + "ce_ib": 8.159378051757812, + "ce_orig": 0.7660282850265503, + "epoch": 0.18606657559853332, + "kl_loss": 1.0770816802978516, + "loss_ib": 0.018930193036794662, + "step": 647 + }, + { + "ce_ib": 11.307751655578613, + "ce_orig": 0.7283535003662109, + "epoch": 0.18606657559853332, + "kl_loss": 1.1148804426193237, + "loss_ib": 0.02245655469596386, + "step": 647 + }, + { + "ce_ib": 10.556684494018555, + "ce_orig": 1.098810076713562, + "epoch": 0.18635415917751097, + "kl_loss": 1.0427453517913818, + "loss_ib": 0.020984139293432236, + "step": 648 + }, + { + "ce_ib": 5.236063003540039, + "ce_orig": 0.2835554778575897, + "epoch": 0.18635415917751097, + "kl_loss": 1.169142246246338, + "loss_ib": 0.016927484422922134, + "step": 648 + }, + { + "ce_ib": 9.705862998962402, + "ce_orig": 1.00014328956604, + "epoch": 0.18635415917751097, + "kl_loss": 0.9767618179321289, + "loss_ib": 0.019473480060696602, + "step": 648 + }, + { + "ce_ib": 11.64765453338623, + "ce_orig": 1.3154453039169312, + "epoch": 0.18635415917751097, + "kl_loss": 1.0039262771606445, + "loss_ib": 0.021686915308237076, + "step": 648 + }, + { + "ce_ib": 8.285019874572754, + "ce_orig": 0.970478892326355, + "epoch": 0.1866417427564886, + "kl_loss": 0.9888217449188232, + "loss_ib": 0.018173236399888992, + "step": 649 + }, + { + "ce_ib": 10.269580841064453, + "ce_orig": 0.5915196537971497, + "epoch": 0.1866417427564886, + "kl_loss": 1.0976067781448364, + "loss_ib": 0.021245649084448814, + "step": 649 + }, + { + "ce_ib": 7.284735202789307, + "ce_orig": 0.6597338914871216, + "epoch": 0.1866417427564886, + "kl_loss": 1.0240867137908936, + "loss_ib": 0.017525602132081985, + "step": 649 + }, + { + "ce_ib": 5.271642208099365, + "ce_orig": 0.3803437352180481, + "epoch": 0.1866417427564886, + "kl_loss": 1.114564061164856, + "loss_ib": 0.016417281702160835, + "step": 649 + }, + { + "epoch": 0.18692932633546624, + "grad_norm": 0.1129700243473053, + "learning_rate": 9.973786394290475e-06, + "loss": 0.8796, + "step": 650 + }, + { + "ce_ib": 12.168571472167969, + "ce_orig": 1.0805795192718506, + "epoch": 0.18692932633546624, + "kl_loss": 1.005476713180542, + "loss_ib": 0.0222233384847641, + "step": 650 + }, + { + "ce_ib": 9.973319053649902, + "ce_orig": 1.3568997383117676, + "epoch": 0.18692932633546624, + "kl_loss": 1.1083464622497559, + "loss_ib": 0.021056782454252243, + "step": 650 + }, + { + "ce_ib": 10.53954792022705, + "ce_orig": 0.7421830892562866, + "epoch": 0.18692932633546624, + "kl_loss": 1.061907410621643, + "loss_ib": 0.021158622577786446, + "step": 650 + }, + { + "ce_ib": 4.828139781951904, + "ce_orig": 0.21636667847633362, + "epoch": 0.18692932633546624, + "kl_loss": 1.1125696897506714, + "loss_ib": 0.01595383696258068, + "step": 650 + }, + { + "ce_ib": 9.867918014526367, + "ce_orig": 0.3963659703731537, + "epoch": 0.1872169099144439, + "kl_loss": 1.1094999313354492, + "loss_ib": 0.020962918177247047, + "step": 651 + }, + { + "ce_ib": 9.336775779724121, + "ce_orig": 0.7294745445251465, + "epoch": 0.1872169099144439, + "kl_loss": 0.9877459406852722, + "loss_ib": 0.019214235246181488, + "step": 651 + }, + { + "ce_ib": 7.323286533355713, + "ce_orig": 0.42315956950187683, + "epoch": 0.1872169099144439, + "kl_loss": 0.990585207939148, + "loss_ib": 0.01722913794219494, + "step": 651 + }, + { + "ce_ib": 7.316843032836914, + "ce_orig": 0.44551074504852295, + "epoch": 0.1872169099144439, + "kl_loss": 1.039764165878296, + "loss_ib": 0.01771448366343975, + "step": 651 + }, + { + "ce_ib": 10.291936874389648, + "ce_orig": 0.7233940362930298, + "epoch": 0.18750449349342152, + "kl_loss": 1.0752284526824951, + "loss_ib": 0.02104422077536583, + "step": 652 + }, + { + "ce_ib": 7.927389621734619, + "ce_orig": 0.7603445053100586, + "epoch": 0.18750449349342152, + "kl_loss": 1.0002994537353516, + "loss_ib": 0.01793038286268711, + "step": 652 + }, + { + "ce_ib": 10.401845932006836, + "ce_orig": 1.1115306615829468, + "epoch": 0.18750449349342152, + "kl_loss": 1.0052720308303833, + "loss_ib": 0.020454566925764084, + "step": 652 + }, + { + "ce_ib": 11.643852233886719, + "ce_orig": 1.4086114168167114, + "epoch": 0.18750449349342152, + "kl_loss": 1.0603067874908447, + "loss_ib": 0.022246917709708214, + "step": 652 + }, + { + "ce_ib": 6.45538330078125, + "ce_orig": 0.8548846244812012, + "epoch": 0.18779207707239917, + "kl_loss": 0.945244312286377, + "loss_ib": 0.015907825902104378, + "step": 653 + }, + { + "ce_ib": 11.45804500579834, + "ce_orig": 0.9945077300071716, + "epoch": 0.18779207707239917, + "kl_loss": 0.9493337273597717, + "loss_ib": 0.02095138281583786, + "step": 653 + }, + { + "ce_ib": 12.099946975708008, + "ce_orig": 1.0418729782104492, + "epoch": 0.18779207707239917, + "kl_loss": 1.0050960779190063, + "loss_ib": 0.022150907665491104, + "step": 653 + }, + { + "ce_ib": 8.56289005279541, + "ce_orig": 0.7450115084648132, + "epoch": 0.18779207707239917, + "kl_loss": 0.9960210919380188, + "loss_ib": 0.01852310076355934, + "step": 653 + }, + { + "ce_ib": 12.9265775680542, + "ce_orig": 1.5550066232681274, + "epoch": 0.1880796606513768, + "kl_loss": 0.9958865642547607, + "loss_ib": 0.022885441780090332, + "step": 654 + }, + { + "ce_ib": 5.975699424743652, + "ce_orig": 0.6117834448814392, + "epoch": 0.1880796606513768, + "kl_loss": 0.9439165592193604, + "loss_ib": 0.015414864756166935, + "step": 654 + }, + { + "ce_ib": 9.573440551757812, + "ce_orig": 0.972037672996521, + "epoch": 0.1880796606513768, + "kl_loss": 0.9460088610649109, + "loss_ib": 0.019033528864383698, + "step": 654 + }, + { + "ce_ib": 10.364381790161133, + "ce_orig": 0.6434758305549622, + "epoch": 0.1880796606513768, + "kl_loss": 1.0528744459152222, + "loss_ib": 0.020893124863505363, + "step": 654 + }, + { + "epoch": 0.18836724423035445, + "grad_norm": 0.09933654963970184, + "learning_rate": 9.972986746241005e-06, + "loss": 0.9236, + "step": 655 + }, + { + "ce_ib": 11.423005104064941, + "ce_orig": 0.504085898399353, + "epoch": 0.18836724423035445, + "kl_loss": 1.0513900518417358, + "loss_ib": 0.02193690463900566, + "step": 655 + }, + { + "ce_ib": 7.901275634765625, + "ce_orig": 0.8905818462371826, + "epoch": 0.18836724423035445, + "kl_loss": 1.0549099445343018, + "loss_ib": 0.01845037378370762, + "step": 655 + }, + { + "ce_ib": 12.124505043029785, + "ce_orig": 1.5267455577850342, + "epoch": 0.18836724423035445, + "kl_loss": 1.0243613719940186, + "loss_ib": 0.022368118166923523, + "step": 655 + }, + { + "ce_ib": 8.49409008026123, + "ce_orig": 0.4249543845653534, + "epoch": 0.18836724423035445, + "kl_loss": 0.9719215631484985, + "loss_ib": 0.01821330562233925, + "step": 655 + }, + { + "ce_ib": 11.336442947387695, + "ce_orig": 1.0143924951553345, + "epoch": 0.1886548278093321, + "kl_loss": 1.0201416015625, + "loss_ib": 0.021537858992815018, + "step": 656 + }, + { + "ce_ib": 10.415478706359863, + "ce_orig": 0.9272794127464294, + "epoch": 0.1886548278093321, + "kl_loss": 1.0198733806610107, + "loss_ib": 0.02061421424150467, + "step": 656 + }, + { + "ce_ib": 6.768211841583252, + "ce_orig": 0.8758606910705566, + "epoch": 0.1886548278093321, + "kl_loss": 0.9514140486717224, + "loss_ib": 0.016282351687550545, + "step": 656 + }, + { + "ce_ib": 8.312684059143066, + "ce_orig": 0.8438398838043213, + "epoch": 0.1886548278093321, + "kl_loss": 0.9719037413597107, + "loss_ib": 0.01803172007203102, + "step": 656 + }, + { + "ce_ib": 12.224742889404297, + "ce_orig": 0.791438102722168, + "epoch": 0.18894241138830972, + "kl_loss": 1.07827889919281, + "loss_ib": 0.023007530719041824, + "step": 657 + }, + { + "ce_ib": 10.384964942932129, + "ce_orig": 0.6525826454162598, + "epoch": 0.18894241138830972, + "kl_loss": 1.042457103729248, + "loss_ib": 0.020809534937143326, + "step": 657 + }, + { + "ce_ib": 10.262805938720703, + "ce_orig": 0.6871621608734131, + "epoch": 0.18894241138830972, + "kl_loss": 1.0356314182281494, + "loss_ib": 0.020619120448827744, + "step": 657 + }, + { + "ce_ib": 10.362799644470215, + "ce_orig": 0.842133104801178, + "epoch": 0.18894241138830972, + "kl_loss": 1.0107371807098389, + "loss_ib": 0.02047017030417919, + "step": 657 + }, + { + "ce_ib": 12.086880683898926, + "ce_orig": 1.178883671760559, + "epoch": 0.18922999496728737, + "kl_loss": 0.9930935502052307, + "loss_ib": 0.022017816081643105, + "step": 658 + }, + { + "ce_ib": 9.931184768676758, + "ce_orig": 0.5481682419776917, + "epoch": 0.18922999496728737, + "kl_loss": 0.983871579170227, + "loss_ib": 0.019769899547100067, + "step": 658 + }, + { + "ce_ib": 9.0752534866333, + "ce_orig": 0.6848755478858948, + "epoch": 0.18922999496728737, + "kl_loss": 0.9044622182846069, + "loss_ib": 0.018119875341653824, + "step": 658 + }, + { + "ce_ib": 10.438591003417969, + "ce_orig": 1.0720442533493042, + "epoch": 0.18922999496728737, + "kl_loss": 1.0088304281234741, + "loss_ib": 0.02052689529955387, + "step": 658 + }, + { + "ce_ib": 9.08749008178711, + "ce_orig": 1.1288058757781982, + "epoch": 0.189517578546265, + "kl_loss": 0.9419035315513611, + "loss_ib": 0.018506525084376335, + "step": 659 + }, + { + "ce_ib": 10.958107948303223, + "ce_orig": 0.6390134692192078, + "epoch": 0.189517578546265, + "kl_loss": 1.059513807296753, + "loss_ib": 0.021553244441747665, + "step": 659 + }, + { + "ce_ib": 12.540372848510742, + "ce_orig": 1.3411237001419067, + "epoch": 0.189517578546265, + "kl_loss": 0.990556001663208, + "loss_ib": 0.022445930168032646, + "step": 659 + }, + { + "ce_ib": 14.00953483581543, + "ce_orig": 1.3561463356018066, + "epoch": 0.189517578546265, + "kl_loss": 1.1565253734588623, + "loss_ib": 0.025574788451194763, + "step": 659 + }, + { + "epoch": 0.18980516212524265, + "grad_norm": 0.10293088853359222, + "learning_rate": 9.972175117112208e-06, + "loss": 0.8983, + "step": 660 + }, + { + "ce_ib": 9.332024574279785, + "ce_orig": 0.8229407668113708, + "epoch": 0.18980516212524265, + "kl_loss": 0.9839355945587158, + "loss_ib": 0.01917138136923313, + "step": 660 + }, + { + "ce_ib": 10.171957015991211, + "ce_orig": 0.8634912967681885, + "epoch": 0.18980516212524265, + "kl_loss": 0.9303529858589172, + "loss_ib": 0.019475486129522324, + "step": 660 + }, + { + "ce_ib": 10.142843246459961, + "ce_orig": 0.9541047215461731, + "epoch": 0.18980516212524265, + "kl_loss": 0.9670487642288208, + "loss_ib": 0.019813330844044685, + "step": 660 + }, + { + "ce_ib": 8.594452857971191, + "ce_orig": 0.660327136516571, + "epoch": 0.18980516212524265, + "kl_loss": 1.0246200561523438, + "loss_ib": 0.01884065382182598, + "step": 660 + }, + { + "ce_ib": 9.446117401123047, + "ce_orig": 0.6387197375297546, + "epoch": 0.1900927457042203, + "kl_loss": 0.9925702810287476, + "loss_ib": 0.019371818751096725, + "step": 661 + }, + { + "ce_ib": 12.91454792022705, + "ce_orig": 1.6437798738479614, + "epoch": 0.1900927457042203, + "kl_loss": 1.0110113620758057, + "loss_ib": 0.0230246614664793, + "step": 661 + }, + { + "ce_ib": 9.189199447631836, + "ce_orig": 0.9704218506813049, + "epoch": 0.1900927457042203, + "kl_loss": 0.9272133111953735, + "loss_ib": 0.018461331725120544, + "step": 661 + }, + { + "ce_ib": 10.05646800994873, + "ce_orig": 0.6157249212265015, + "epoch": 0.1900927457042203, + "kl_loss": 0.9937683939933777, + "loss_ib": 0.01999415084719658, + "step": 661 + }, + { + "ce_ib": 4.904020309448242, + "ce_orig": 0.5232660174369812, + "epoch": 0.19038032928319792, + "kl_loss": 0.9572215676307678, + "loss_ib": 0.014476235955953598, + "step": 662 + }, + { + "ce_ib": 11.595885276794434, + "ce_orig": 0.5337156057357788, + "epoch": 0.19038032928319792, + "kl_loss": 1.1112439632415771, + "loss_ib": 0.02270832471549511, + "step": 662 + }, + { + "ce_ib": 9.98287296295166, + "ce_orig": 0.9497804045677185, + "epoch": 0.19038032928319792, + "kl_loss": 1.0508451461791992, + "loss_ib": 0.02049132250249386, + "step": 662 + }, + { + "ce_ib": 8.917495727539062, + "ce_orig": 1.0380656719207764, + "epoch": 0.19038032928319792, + "kl_loss": 0.9809185266494751, + "loss_ib": 0.018726680427789688, + "step": 662 + }, + { + "ce_ib": 8.603903770446777, + "ce_orig": 0.8844617605209351, + "epoch": 0.19066791286217558, + "kl_loss": 0.9898391366004944, + "loss_ib": 0.01850229501724243, + "step": 663 + }, + { + "ce_ib": 8.496954917907715, + "ce_orig": 0.889788031578064, + "epoch": 0.19066791286217558, + "kl_loss": 0.968459963798523, + "loss_ib": 0.018181554973125458, + "step": 663 + }, + { + "ce_ib": 13.463947296142578, + "ce_orig": 1.5212091207504272, + "epoch": 0.19066791286217558, + "kl_loss": 1.3541009426116943, + "loss_ib": 0.02700495719909668, + "step": 663 + }, + { + "ce_ib": 7.513195991516113, + "ce_orig": 0.6632037162780762, + "epoch": 0.19066791286217558, + "kl_loss": 0.9870805144309998, + "loss_ib": 0.01738400012254715, + "step": 663 + }, + { + "ce_ib": 7.072785377502441, + "ce_orig": 0.5436831116676331, + "epoch": 0.1909554964411532, + "kl_loss": 0.9044357538223267, + "loss_ib": 0.016117142513394356, + "step": 664 + }, + { + "ce_ib": 11.165665626525879, + "ce_orig": 1.3044129610061646, + "epoch": 0.1909554964411532, + "kl_loss": 0.9187023043632507, + "loss_ib": 0.020352687686681747, + "step": 664 + }, + { + "ce_ib": 8.00759506225586, + "ce_orig": 0.5903300046920776, + "epoch": 0.1909554964411532, + "kl_loss": 0.9148058891296387, + "loss_ib": 0.017155654728412628, + "step": 664 + }, + { + "ce_ib": 9.988378524780273, + "ce_orig": 0.9803927540779114, + "epoch": 0.1909554964411532, + "kl_loss": 0.883601188659668, + "loss_ib": 0.018824391067028046, + "step": 664 + }, + { + "epoch": 0.19124308002013085, + "grad_norm": 0.1290864795446396, + "learning_rate": 9.971351508859488e-06, + "loss": 0.9177, + "step": 665 + }, + { + "ce_ib": 8.51546859741211, + "ce_orig": 0.7638845443725586, + "epoch": 0.19124308002013085, + "kl_loss": 0.8143119812011719, + "loss_ib": 0.016658587381243706, + "step": 665 + }, + { + "ce_ib": 6.580199718475342, + "ce_orig": 0.5441961884498596, + "epoch": 0.19124308002013085, + "kl_loss": 0.898180365562439, + "loss_ib": 0.015562002547085285, + "step": 665 + }, + { + "ce_ib": 9.424781799316406, + "ce_orig": 0.8517243266105652, + "epoch": 0.19124308002013085, + "kl_loss": 0.9845488667488098, + "loss_ib": 0.019270269200205803, + "step": 665 + }, + { + "ce_ib": 11.227746963500977, + "ce_orig": 0.7697399258613586, + "epoch": 0.19124308002013085, + "kl_loss": 1.012844204902649, + "loss_ib": 0.021356189623475075, + "step": 665 + }, + { + "ce_ib": 13.270841598510742, + "ce_orig": 1.091408133506775, + "epoch": 0.1915306635991085, + "kl_loss": 0.9616622924804688, + "loss_ib": 0.022887462750077248, + "step": 666 + }, + { + "ce_ib": 5.886293411254883, + "ce_orig": 0.31356731057167053, + "epoch": 0.1915306635991085, + "kl_loss": 0.971168041229248, + "loss_ib": 0.015597973950207233, + "step": 666 + }, + { + "ce_ib": 12.855634689331055, + "ce_orig": 1.4516594409942627, + "epoch": 0.1915306635991085, + "kl_loss": 0.9783580303192139, + "loss_ib": 0.022639214992523193, + "step": 666 + }, + { + "ce_ib": 9.463467597961426, + "ce_orig": 0.9096047878265381, + "epoch": 0.1915306635991085, + "kl_loss": 0.9198427200317383, + "loss_ib": 0.018661893904209137, + "step": 666 + }, + { + "ce_ib": 10.283754348754883, + "ce_orig": 1.3314762115478516, + "epoch": 0.19181824717808613, + "kl_loss": 1.057845115661621, + "loss_ib": 0.02086220681667328, + "step": 667 + }, + { + "ce_ib": 7.73128080368042, + "ce_orig": 0.6765826940536499, + "epoch": 0.19181824717808613, + "kl_loss": 0.9598665833473206, + "loss_ib": 0.017329946160316467, + "step": 667 + }, + { + "ce_ib": 13.801294326782227, + "ce_orig": 1.7749202251434326, + "epoch": 0.19181824717808613, + "kl_loss": 0.9999049305915833, + "loss_ib": 0.02380034327507019, + "step": 667 + }, + { + "ce_ib": 14.625811576843262, + "ce_orig": 1.5362874269485474, + "epoch": 0.19181824717808613, + "kl_loss": 0.9581853747367859, + "loss_ib": 0.024207664653658867, + "step": 667 + }, + { + "ce_ib": 9.781160354614258, + "ce_orig": 0.941349983215332, + "epoch": 0.19210583075706378, + "kl_loss": 0.9020639657974243, + "loss_ib": 0.018801799044013023, + "step": 668 + }, + { + "ce_ib": 10.93012523651123, + "ce_orig": 1.1870393753051758, + "epoch": 0.19210583075706378, + "kl_loss": 0.9062073230743408, + "loss_ib": 0.019992198795080185, + "step": 668 + }, + { + "ce_ib": 10.397263526916504, + "ce_orig": 0.7395128011703491, + "epoch": 0.19210583075706378, + "kl_loss": 0.9939507246017456, + "loss_ib": 0.02033677138388157, + "step": 668 + }, + { + "ce_ib": 8.197195053100586, + "ce_orig": 0.802003800868988, + "epoch": 0.19210583075706378, + "kl_loss": 1.0717787742614746, + "loss_ib": 0.018914982676506042, + "step": 668 + }, + { + "ce_ib": 8.156253814697266, + "ce_orig": 0.6003333926200867, + "epoch": 0.1923934143360414, + "kl_loss": 0.9009373188018799, + "loss_ib": 0.01716562733054161, + "step": 669 + }, + { + "ce_ib": 9.363840103149414, + "ce_orig": 0.7796534895896912, + "epoch": 0.1923934143360414, + "kl_loss": 0.9603847861289978, + "loss_ib": 0.01896768808364868, + "step": 669 + }, + { + "ce_ib": 6.624312400817871, + "ce_orig": 0.7180654406547546, + "epoch": 0.1923934143360414, + "kl_loss": 0.8511247038841248, + "loss_ib": 0.015135559253394604, + "step": 669 + }, + { + "ce_ib": 11.258011817932129, + "ce_orig": 0.6326055526733398, + "epoch": 0.1923934143360414, + "kl_loss": 1.004683256149292, + "loss_ib": 0.021304845809936523, + "step": 669 + }, + { + "epoch": 0.19268099791501905, + "grad_norm": 0.07958894968032837, + "learning_rate": 9.970515923467106e-06, + "loss": 0.8465, + "step": 670 + }, + { + "ce_ib": 7.144465923309326, + "ce_orig": 0.5346347093582153, + "epoch": 0.19268099791501905, + "kl_loss": 1.0311341285705566, + "loss_ib": 0.017455806955695152, + "step": 670 + }, + { + "ce_ib": 9.569890975952148, + "ce_orig": 1.0249342918395996, + "epoch": 0.19268099791501905, + "kl_loss": 1.0497715473175049, + "loss_ib": 0.02006760612130165, + "step": 670 + }, + { + "ce_ib": 7.342098236083984, + "ce_orig": 0.5448954105377197, + "epoch": 0.19268099791501905, + "kl_loss": 1.0126290321350098, + "loss_ib": 0.017468387261033058, + "step": 670 + }, + { + "ce_ib": 13.511815071105957, + "ce_orig": 1.6245704889297485, + "epoch": 0.19268099791501905, + "kl_loss": 0.9676916599273682, + "loss_ib": 0.02318873070180416, + "step": 670 + }, + { + "ce_ib": 9.829526901245117, + "ce_orig": 0.7800765633583069, + "epoch": 0.1929685814939967, + "kl_loss": 0.9008135199546814, + "loss_ib": 0.018837660551071167, + "step": 671 + }, + { + "ce_ib": 7.996551990509033, + "ce_orig": 0.7976639270782471, + "epoch": 0.1929685814939967, + "kl_loss": 0.9442075490951538, + "loss_ib": 0.017438627779483795, + "step": 671 + }, + { + "ce_ib": 11.391491889953613, + "ce_orig": 1.0702950954437256, + "epoch": 0.1929685814939967, + "kl_loss": 0.8962193131446838, + "loss_ib": 0.020353684201836586, + "step": 671 + }, + { + "ce_ib": 13.608918190002441, + "ce_orig": 1.1835685968399048, + "epoch": 0.1929685814939967, + "kl_loss": 1.0202008485794067, + "loss_ib": 0.02381092496216297, + "step": 671 + }, + { + "ce_ib": 7.881733417510986, + "ce_orig": 0.9604411125183105, + "epoch": 0.19325616507297433, + "kl_loss": 0.9708698391914368, + "loss_ib": 0.01759043149650097, + "step": 672 + }, + { + "ce_ib": 10.143787384033203, + "ce_orig": 0.805814802646637, + "epoch": 0.19325616507297433, + "kl_loss": 1.020028829574585, + "loss_ib": 0.020344074815511703, + "step": 672 + }, + { + "ce_ib": 8.782960891723633, + "ce_orig": 0.6452949047088623, + "epoch": 0.19325616507297433, + "kl_loss": 0.9659644961357117, + "loss_ib": 0.018442604690790176, + "step": 672 + }, + { + "ce_ib": 10.82643985748291, + "ce_orig": 0.8774537444114685, + "epoch": 0.19325616507297433, + "kl_loss": 0.8915767669677734, + "loss_ib": 0.01974220760166645, + "step": 672 + }, + { + "ce_ib": 8.120016098022461, + "ce_orig": 0.778851330280304, + "epoch": 0.19354374865195198, + "kl_loss": 0.9042708277702332, + "loss_ib": 0.01716272346675396, + "step": 673 + }, + { + "ce_ib": 10.228824615478516, + "ce_orig": 0.9065789580345154, + "epoch": 0.19354374865195198, + "kl_loss": 0.8779407143592834, + "loss_ib": 0.019008230417966843, + "step": 673 + }, + { + "ce_ib": 11.03148078918457, + "ce_orig": 1.0241479873657227, + "epoch": 0.19354374865195198, + "kl_loss": 1.204726219177246, + "loss_ib": 0.023078741505742073, + "step": 673 + }, + { + "ce_ib": 12.928085327148438, + "ce_orig": 1.197397232055664, + "epoch": 0.19354374865195198, + "kl_loss": 0.9511390328407288, + "loss_ib": 0.02243947423994541, + "step": 673 + }, + { + "ce_ib": 9.485182762145996, + "ce_orig": 0.6011760234832764, + "epoch": 0.1938313322309296, + "kl_loss": 1.0419973134994507, + "loss_ib": 0.019905155524611473, + "step": 674 + }, + { + "ce_ib": 6.017853736877441, + "ce_orig": 0.6303385496139526, + "epoch": 0.1938313322309296, + "kl_loss": 0.9145022034645081, + "loss_ib": 0.015162874944508076, + "step": 674 + }, + { + "ce_ib": 8.599544525146484, + "ce_orig": 0.5987470149993896, + "epoch": 0.1938313322309296, + "kl_loss": 0.8968786001205444, + "loss_ib": 0.017568331211805344, + "step": 674 + }, + { + "ce_ib": 5.367640018463135, + "ce_orig": 0.26319989562034607, + "epoch": 0.1938313322309296, + "kl_loss": 0.851106584072113, + "loss_ib": 0.013878704980015755, + "step": 674 + }, + { + "epoch": 0.19411891580990726, + "grad_norm": 0.08299347013235092, + "learning_rate": 9.969668362948186e-06, + "loss": 0.8623, + "step": 675 + }, + { + "ce_ib": 8.513809204101562, + "ce_orig": 0.46651700139045715, + "epoch": 0.19411891580990726, + "kl_loss": 1.1063188314437866, + "loss_ib": 0.019576996564865112, + "step": 675 + }, + { + "ce_ib": 13.300872802734375, + "ce_orig": 1.5554956197738647, + "epoch": 0.19411891580990726, + "kl_loss": 0.911888837814331, + "loss_ib": 0.0224197618663311, + "step": 675 + }, + { + "ce_ib": 10.305272102355957, + "ce_orig": 0.7289824485778809, + "epoch": 0.19411891580990726, + "kl_loss": 0.9203410148620605, + "loss_ib": 0.019508682191371918, + "step": 675 + }, + { + "ce_ib": 8.656975746154785, + "ce_orig": 0.8286035656929016, + "epoch": 0.19411891580990726, + "kl_loss": 0.9431976079940796, + "loss_ib": 0.018088949844241142, + "step": 675 + }, + { + "ce_ib": 7.818869113922119, + "ce_orig": 0.4440051317214966, + "epoch": 0.1944064993888849, + "kl_loss": 1.0188822746276855, + "loss_ib": 0.018007691949605942, + "step": 676 + }, + { + "ce_ib": 7.344069957733154, + "ce_orig": 0.45538222789764404, + "epoch": 0.1944064993888849, + "kl_loss": 1.0300064086914062, + "loss_ib": 0.017644133418798447, + "step": 676 + }, + { + "ce_ib": 7.1561384201049805, + "ce_orig": 0.5423603653907776, + "epoch": 0.1944064993888849, + "kl_loss": 0.8832270503044128, + "loss_ib": 0.015988409519195557, + "step": 676 + }, + { + "ce_ib": 9.0109281539917, + "ce_orig": 0.8102385401725769, + "epoch": 0.1944064993888849, + "kl_loss": 0.915709376335144, + "loss_ib": 0.018168022856116295, + "step": 676 + }, + { + "ce_ib": 7.42899751663208, + "ce_orig": 0.5108758807182312, + "epoch": 0.19469408296786253, + "kl_loss": 0.8757451176643372, + "loss_ib": 0.01618644967675209, + "step": 677 + }, + { + "ce_ib": 12.158177375793457, + "ce_orig": 0.9250555634498596, + "epoch": 0.19469408296786253, + "kl_loss": 0.9929431676864624, + "loss_ib": 0.02208760939538479, + "step": 677 + }, + { + "ce_ib": 9.143503189086914, + "ce_orig": 0.7566794157028198, + "epoch": 0.19469408296786253, + "kl_loss": 1.0857152938842773, + "loss_ib": 0.020000655204057693, + "step": 677 + }, + { + "ce_ib": 11.579789161682129, + "ce_orig": 1.1167807579040527, + "epoch": 0.19469408296786253, + "kl_loss": 0.9444433450698853, + "loss_ib": 0.021024221554398537, + "step": 677 + }, + { + "ce_ib": 10.260880470275879, + "ce_orig": 1.0816290378570557, + "epoch": 0.19498166654684018, + "kl_loss": 0.9158464670181274, + "loss_ib": 0.019419346004724503, + "step": 678 + }, + { + "ce_ib": 12.340543746948242, + "ce_orig": 0.7932845950126648, + "epoch": 0.19498166654684018, + "kl_loss": 0.9657367467880249, + "loss_ib": 0.021997911855578423, + "step": 678 + }, + { + "ce_ib": 9.458576202392578, + "ce_orig": 1.1390806436538696, + "epoch": 0.19498166654684018, + "kl_loss": 0.8594547510147095, + "loss_ib": 0.018053123727440834, + "step": 678 + }, + { + "ce_ib": 7.780326843261719, + "ce_orig": 0.803799569606781, + "epoch": 0.19498166654684018, + "kl_loss": 0.9818826913833618, + "loss_ib": 0.01759915240108967, + "step": 678 + }, + { + "ce_ib": 4.1620354652404785, + "ce_orig": 0.2644416093826294, + "epoch": 0.1952692501258178, + "kl_loss": 1.0103613138198853, + "loss_ib": 0.014265649020671844, + "step": 679 + }, + { + "ce_ib": 10.257436752319336, + "ce_orig": 0.8702826499938965, + "epoch": 0.1952692501258178, + "kl_loss": 0.911620020866394, + "loss_ib": 0.019373636692762375, + "step": 679 + }, + { + "ce_ib": 9.919028282165527, + "ce_orig": 0.7849615216255188, + "epoch": 0.1952692501258178, + "kl_loss": 0.982805609703064, + "loss_ib": 0.019747084006667137, + "step": 679 + }, + { + "ce_ib": 8.560178756713867, + "ce_orig": 0.5670347809791565, + "epoch": 0.1952692501258178, + "kl_loss": 0.8926770687103271, + "loss_ib": 0.017486948519945145, + "step": 679 + }, + { + "epoch": 0.19555683370479546, + "grad_norm": 0.09906060248613358, + "learning_rate": 9.968808829344692e-06, + "loss": 0.8659, + "step": 680 + }, + { + "ce_ib": 5.249537944793701, + "ce_orig": 0.43986520171165466, + "epoch": 0.19555683370479546, + "kl_loss": 1.0424385070800781, + "loss_ib": 0.01567392237484455, + "step": 680 + }, + { + "ce_ib": 14.56251049041748, + "ce_orig": 1.4373424053192139, + "epoch": 0.19555683370479546, + "kl_loss": 0.9467419385910034, + "loss_ib": 0.0240299291908741, + "step": 680 + }, + { + "ce_ib": 7.976964473724365, + "ce_orig": 0.5848947763442993, + "epoch": 0.19555683370479546, + "kl_loss": 0.7993128299713135, + "loss_ib": 0.01597009226679802, + "step": 680 + }, + { + "ce_ib": 8.310443878173828, + "ce_orig": 0.49164944887161255, + "epoch": 0.19555683370479546, + "kl_loss": 0.8992608189582825, + "loss_ib": 0.01730305142700672, + "step": 680 + }, + { + "ce_ib": 13.469533920288086, + "ce_orig": 1.6232653856277466, + "epoch": 0.1958444172837731, + "kl_loss": 0.9035540819168091, + "loss_ib": 0.022505072876811028, + "step": 681 + }, + { + "ce_ib": 7.154322624206543, + "ce_orig": 0.5702813267707825, + "epoch": 0.1958444172837731, + "kl_loss": 0.8137340545654297, + "loss_ib": 0.015291662886738777, + "step": 681 + }, + { + "ce_ib": 7.098222255706787, + "ce_orig": 0.735628068447113, + "epoch": 0.1958444172837731, + "kl_loss": 0.8927323818206787, + "loss_ib": 0.016025545075535774, + "step": 681 + }, + { + "ce_ib": 5.824950695037842, + "ce_orig": 0.766068160533905, + "epoch": 0.1958444172837731, + "kl_loss": 0.8107069730758667, + "loss_ib": 0.013932020403444767, + "step": 681 + }, + { + "ce_ib": 10.2679443359375, + "ce_orig": 0.5897117853164673, + "epoch": 0.19613200086275073, + "kl_loss": 0.8887754678726196, + "loss_ib": 0.019155697897076607, + "step": 682 + }, + { + "ce_ib": 7.825944900512695, + "ce_orig": 1.018816351890564, + "epoch": 0.19613200086275073, + "kl_loss": 0.7450103759765625, + "loss_ib": 0.0152760474011302, + "step": 682 + }, + { + "ce_ib": 10.176102638244629, + "ce_orig": 0.9543701410293579, + "epoch": 0.19613200086275073, + "kl_loss": 0.9644123911857605, + "loss_ib": 0.01982022635638714, + "step": 682 + }, + { + "ce_ib": 11.218408584594727, + "ce_orig": 1.213329553604126, + "epoch": 0.19613200086275073, + "kl_loss": 0.8925070762634277, + "loss_ib": 0.020143479108810425, + "step": 682 + }, + { + "ce_ib": 11.08190631866455, + "ce_orig": 1.1313570737838745, + "epoch": 0.19641958444172838, + "kl_loss": 0.9318565130233765, + "loss_ib": 0.02040047198534012, + "step": 683 + }, + { + "ce_ib": 9.102522850036621, + "ce_orig": 1.1076076030731201, + "epoch": 0.19641958444172838, + "kl_loss": 0.9036017060279846, + "loss_ib": 0.018138539046049118, + "step": 683 + }, + { + "ce_ib": 8.345836639404297, + "ce_orig": 0.9177960753440857, + "epoch": 0.19641958444172838, + "kl_loss": 0.9092356562614441, + "loss_ib": 0.017438193783164024, + "step": 683 + }, + { + "ce_ib": 13.083810806274414, + "ce_orig": 1.4744526147842407, + "epoch": 0.19641958444172838, + "kl_loss": 0.8807121515274048, + "loss_ib": 0.021890930831432343, + "step": 683 + }, + { + "ce_ib": 9.102757453918457, + "ce_orig": 0.7407585978507996, + "epoch": 0.196707168020706, + "kl_loss": 0.9970856308937073, + "loss_ib": 0.019073612987995148, + "step": 684 + }, + { + "ce_ib": 8.29672908782959, + "ce_orig": 0.6851121187210083, + "epoch": 0.196707168020706, + "kl_loss": 0.9605600833892822, + "loss_ib": 0.017902329564094543, + "step": 684 + }, + { + "ce_ib": 8.740421295166016, + "ce_orig": 0.8621047735214233, + "epoch": 0.196707168020706, + "kl_loss": 0.8722232580184937, + "loss_ib": 0.017462654039263725, + "step": 684 + }, + { + "ce_ib": 7.918392658233643, + "ce_orig": 0.7503530979156494, + "epoch": 0.196707168020706, + "kl_loss": 0.7531979084014893, + "loss_ib": 0.015450372360646725, + "step": 684 + }, + { + "epoch": 0.19699475159968366, + "grad_norm": 0.08257844299077988, + "learning_rate": 9.967937324727446e-06, + "loss": 0.8724, + "step": 685 + }, + { + "ce_ib": 11.491483688354492, + "ce_orig": 1.5021545886993408, + "epoch": 0.19699475159968366, + "kl_loss": 0.8115209341049194, + "loss_ib": 0.0196066927164793, + "step": 685 + }, + { + "ce_ib": 13.129999160766602, + "ce_orig": 1.1804494857788086, + "epoch": 0.19699475159968366, + "kl_loss": 0.7936393022537231, + "loss_ib": 0.021066393703222275, + "step": 685 + }, + { + "ce_ib": 10.34216022491455, + "ce_orig": 1.2397124767303467, + "epoch": 0.19699475159968366, + "kl_loss": 0.7960874438285828, + "loss_ib": 0.01830303482711315, + "step": 685 + }, + { + "ce_ib": 11.881531715393066, + "ce_orig": 1.3413195610046387, + "epoch": 0.19699475159968366, + "kl_loss": 0.913062334060669, + "loss_ib": 0.02101215347647667, + "step": 685 + }, + { + "ce_ib": 6.886374473571777, + "ce_orig": 0.8009768128395081, + "epoch": 0.1972823351786613, + "kl_loss": 0.6990371942520142, + "loss_ib": 0.013876745477318764, + "step": 686 + }, + { + "ce_ib": 10.036053657531738, + "ce_orig": 1.1837598085403442, + "epoch": 0.1972823351786613, + "kl_loss": 0.8260252475738525, + "loss_ib": 0.01829630509018898, + "step": 686 + }, + { + "ce_ib": 9.117568016052246, + "ce_orig": 0.6687161922454834, + "epoch": 0.1972823351786613, + "kl_loss": 0.8096814155578613, + "loss_ib": 0.01721438206732273, + "step": 686 + }, + { + "ce_ib": 8.085248947143555, + "ce_orig": 1.340101718902588, + "epoch": 0.1972823351786613, + "kl_loss": 0.7643517851829529, + "loss_ib": 0.015728766098618507, + "step": 686 + }, + { + "ce_ib": 11.919937133789062, + "ce_orig": 1.2236392498016357, + "epoch": 0.19756991875763893, + "kl_loss": 0.7394053339958191, + "loss_ib": 0.0193139910697937, + "step": 687 + }, + { + "ce_ib": 8.260920524597168, + "ce_orig": 0.9186649322509766, + "epoch": 0.19756991875763893, + "kl_loss": 0.7759698629379272, + "loss_ib": 0.016020620241761208, + "step": 687 + }, + { + "ce_ib": 8.8240966796875, + "ce_orig": 1.0625789165496826, + "epoch": 0.19756991875763893, + "kl_loss": 0.9100702404975891, + "loss_ib": 0.017924798652529716, + "step": 687 + }, + { + "ce_ib": 12.304547309875488, + "ce_orig": 0.936794638633728, + "epoch": 0.19756991875763893, + "kl_loss": 0.930564820766449, + "loss_ib": 0.021610194817185402, + "step": 687 + }, + { + "ce_ib": 8.338387489318848, + "ce_orig": 0.7846511602401733, + "epoch": 0.1978575023366166, + "kl_loss": 0.8309119939804077, + "loss_ib": 0.01664750836789608, + "step": 688 + }, + { + "ce_ib": 7.630675315856934, + "ce_orig": 0.6111834645271301, + "epoch": 0.1978575023366166, + "kl_loss": 0.9697389602661133, + "loss_ib": 0.017328064888715744, + "step": 688 + }, + { + "ce_ib": 11.70467472076416, + "ce_orig": 0.7941328287124634, + "epoch": 0.1978575023366166, + "kl_loss": 0.8729178309440613, + "loss_ib": 0.020433852449059486, + "step": 688 + }, + { + "ce_ib": 15.2159423828125, + "ce_orig": 1.038719654083252, + "epoch": 0.1978575023366166, + "kl_loss": 0.9375836253166199, + "loss_ib": 0.024591779336333275, + "step": 688 + }, + { + "ce_ib": 9.57465648651123, + "ce_orig": 0.7560713887214661, + "epoch": 0.1981450859155942, + "kl_loss": 0.8299669027328491, + "loss_ib": 0.017874324694275856, + "step": 689 + }, + { + "ce_ib": 7.711019515991211, + "ce_orig": 0.8740010261535645, + "epoch": 0.1981450859155942, + "kl_loss": 0.9161753058433533, + "loss_ib": 0.016872772946953773, + "step": 689 + }, + { + "ce_ib": 8.346826553344727, + "ce_orig": 0.6488251686096191, + "epoch": 0.1981450859155942, + "kl_loss": 0.9876278042793274, + "loss_ib": 0.018223104998469353, + "step": 689 + }, + { + "ce_ib": 11.599117279052734, + "ce_orig": 1.0851843357086182, + "epoch": 0.1981450859155942, + "kl_loss": 0.8547726273536682, + "loss_ib": 0.020146843045949936, + "step": 689 + }, + { + "epoch": 0.19843266949457186, + "grad_norm": 0.09532356262207031, + "learning_rate": 9.9670538511961e-06, + "loss": 0.8666, + "step": 690 + }, + { + "ce_ib": 9.944104194641113, + "ce_orig": 0.9654097557067871, + "epoch": 0.19843266949457186, + "kl_loss": 0.996139645576477, + "loss_ib": 0.01990550011396408, + "step": 690 + }, + { + "ce_ib": 8.787851333618164, + "ce_orig": 0.6779507994651794, + "epoch": 0.19843266949457186, + "kl_loss": 0.9373599886894226, + "loss_ib": 0.018161451444029808, + "step": 690 + }, + { + "ce_ib": 7.5119452476501465, + "ce_orig": 1.0238131284713745, + "epoch": 0.19843266949457186, + "kl_loss": 0.6883217096328735, + "loss_ib": 0.014395162463188171, + "step": 690 + }, + { + "ce_ib": 12.74599552154541, + "ce_orig": 1.3570128679275513, + "epoch": 0.19843266949457186, + "kl_loss": 0.8598670363426208, + "loss_ib": 0.021344665437936783, + "step": 690 + }, + { + "ce_ib": 9.191486358642578, + "ce_orig": 0.5052329301834106, + "epoch": 0.1987202530735495, + "kl_loss": 0.8542582988739014, + "loss_ib": 0.017734069377183914, + "step": 691 + }, + { + "ce_ib": 10.381093978881836, + "ce_orig": 1.0137697458267212, + "epoch": 0.1987202530735495, + "kl_loss": 0.769364595413208, + "loss_ib": 0.01807473972439766, + "step": 691 + }, + { + "ce_ib": 8.68432331085205, + "ce_orig": 0.7807736992835999, + "epoch": 0.1987202530735495, + "kl_loss": 0.8298584222793579, + "loss_ib": 0.0169829074293375, + "step": 691 + }, + { + "ce_ib": 6.291802883148193, + "ce_orig": 0.3723978102207184, + "epoch": 0.1987202530735495, + "kl_loss": 0.8000516891479492, + "loss_ib": 0.014292319305241108, + "step": 691 + }, + { + "ce_ib": 12.264336585998535, + "ce_orig": 1.4954090118408203, + "epoch": 0.19900783665252714, + "kl_loss": 0.796958327293396, + "loss_ib": 0.020233919844031334, + "step": 692 + }, + { + "ce_ib": 10.001005172729492, + "ce_orig": 1.2661001682281494, + "epoch": 0.19900783665252714, + "kl_loss": 0.8150477409362793, + "loss_ib": 0.018151482567191124, + "step": 692 + }, + { + "ce_ib": 9.576811790466309, + "ce_orig": 1.3062323331832886, + "epoch": 0.19900783665252714, + "kl_loss": 0.7767390012741089, + "loss_ib": 0.017344200983643532, + "step": 692 + }, + { + "ce_ib": 6.84829044342041, + "ce_orig": 0.5050943493843079, + "epoch": 0.19900783665252714, + "kl_loss": 0.8827191591262817, + "loss_ib": 0.015675483271479607, + "step": 692 + }, + { + "ce_ib": 8.917977333068848, + "ce_orig": 1.199506163597107, + "epoch": 0.1992954202315048, + "kl_loss": 0.6274911165237427, + "loss_ib": 0.015192887745797634, + "step": 693 + }, + { + "ce_ib": 7.1223883628845215, + "ce_orig": 0.7409626841545105, + "epoch": 0.1992954202315048, + "kl_loss": 0.7926149368286133, + "loss_ib": 0.015048536472022533, + "step": 693 + }, + { + "ce_ib": 11.097723007202148, + "ce_orig": 1.1110416650772095, + "epoch": 0.1992954202315048, + "kl_loss": 0.8221895694732666, + "loss_ib": 0.019319618120789528, + "step": 693 + }, + { + "ce_ib": 10.933575630187988, + "ce_orig": 0.7388496994972229, + "epoch": 0.1992954202315048, + "kl_loss": 1.0315768718719482, + "loss_ib": 0.02124934457242489, + "step": 693 + }, + { + "ce_ib": 8.662981986999512, + "ce_orig": 0.5169118642807007, + "epoch": 0.1995830038104824, + "kl_loss": 0.9072721004486084, + "loss_ib": 0.01773570291697979, + "step": 694 + }, + { + "ce_ib": 11.462176322937012, + "ce_orig": 1.4820420742034912, + "epoch": 0.1995830038104824, + "kl_loss": 0.8469343185424805, + "loss_ib": 0.019931519404053688, + "step": 694 + }, + { + "ce_ib": 8.445666313171387, + "ce_orig": 0.9313681721687317, + "epoch": 0.1995830038104824, + "kl_loss": 0.6418800354003906, + "loss_ib": 0.014864466153085232, + "step": 694 + }, + { + "ce_ib": 7.5244975090026855, + "ce_orig": 0.591560423374176, + "epoch": 0.1995830038104824, + "kl_loss": 0.8560516834259033, + "loss_ib": 0.01608501374721527, + "step": 694 + }, + { + "epoch": 0.19987058738946006, + "grad_norm": 0.10591301321983337, + "learning_rate": 9.966158410879148e-06, + "loss": 0.9255, + "step": 695 + }, + { + "ce_ib": 9.375509262084961, + "ce_orig": 1.00591242313385, + "epoch": 0.19987058738946006, + "kl_loss": 0.9248544573783875, + "loss_ib": 0.01862405426800251, + "step": 695 + }, + { + "ce_ib": 11.656967163085938, + "ce_orig": 1.2776415348052979, + "epoch": 0.19987058738946006, + "kl_loss": 0.7392134666442871, + "loss_ib": 0.019049102440476418, + "step": 695 + }, + { + "ce_ib": 10.279980659484863, + "ce_orig": 0.7536091804504395, + "epoch": 0.19987058738946006, + "kl_loss": 0.7704986333847046, + "loss_ib": 0.017984967678785324, + "step": 695 + }, + { + "ce_ib": 9.143665313720703, + "ce_orig": 0.7292720079421997, + "epoch": 0.19987058738946006, + "kl_loss": 0.8170077800750732, + "loss_ib": 0.017313743010163307, + "step": 695 + }, + { + "ce_ib": 10.739947319030762, + "ce_orig": 0.5883066058158875, + "epoch": 0.20015817096843772, + "kl_loss": 0.9453576803207397, + "loss_ib": 0.020193524658679962, + "step": 696 + }, + { + "ce_ib": 8.81908130645752, + "ce_orig": 1.073760986328125, + "epoch": 0.20015817096843772, + "kl_loss": 0.6506307125091553, + "loss_ib": 0.015325388871133327, + "step": 696 + }, + { + "ce_ib": 5.927511692047119, + "ce_orig": 0.6327902674674988, + "epoch": 0.20015817096843772, + "kl_loss": 0.8255484104156494, + "loss_ib": 0.01418299600481987, + "step": 696 + }, + { + "ce_ib": 8.285160064697266, + "ce_orig": 0.7552601099014282, + "epoch": 0.20015817096843772, + "kl_loss": 1.0531952381134033, + "loss_ib": 0.01881711184978485, + "step": 696 + }, + { + "ce_ib": 15.350908279418945, + "ce_orig": 1.7327600717544556, + "epoch": 0.20044575454741534, + "kl_loss": 0.9541932940483093, + "loss_ib": 0.024892840534448624, + "step": 697 + }, + { + "ce_ib": 5.672478199005127, + "ce_orig": 0.4321776032447815, + "epoch": 0.20044575454741534, + "kl_loss": 1.0703692436218262, + "loss_ib": 0.01637617126107216, + "step": 697 + }, + { + "ce_ib": 9.48302936553955, + "ce_orig": 1.0044292211532593, + "epoch": 0.20044575454741534, + "kl_loss": 0.827777087688446, + "loss_ib": 0.017760800197720528, + "step": 697 + }, + { + "ce_ib": 12.105079650878906, + "ce_orig": 0.9517088532447815, + "epoch": 0.20044575454741534, + "kl_loss": 0.8866432905197144, + "loss_ib": 0.0209715124219656, + "step": 697 + }, + { + "ce_ib": 11.073482513427734, + "ce_orig": 1.1429903507232666, + "epoch": 0.200733338126393, + "kl_loss": 0.6459304690361023, + "loss_ib": 0.01753278635442257, + "step": 698 + }, + { + "ce_ib": 10.372661590576172, + "ce_orig": 0.9689992070198059, + "epoch": 0.200733338126393, + "kl_loss": 0.8759998679161072, + "loss_ib": 0.01913265883922577, + "step": 698 + }, + { + "ce_ib": 11.510743141174316, + "ce_orig": 0.5491771101951599, + "epoch": 0.200733338126393, + "kl_loss": 0.8505983352661133, + "loss_ib": 0.020016726106405258, + "step": 698 + }, + { + "ce_ib": 6.4225029945373535, + "ce_orig": 0.5128731727600098, + "epoch": 0.200733338126393, + "kl_loss": 0.6717511415481567, + "loss_ib": 0.013140014372766018, + "step": 698 + }, + { + "ce_ib": 13.731460571289062, + "ce_orig": 0.9959490895271301, + "epoch": 0.20102092170537061, + "kl_loss": 0.8257856369018555, + "loss_ib": 0.02198931574821472, + "step": 699 + }, + { + "ce_ib": 5.540718078613281, + "ce_orig": 0.5061084032058716, + "epoch": 0.20102092170537061, + "kl_loss": 0.750801146030426, + "loss_ib": 0.013048729859292507, + "step": 699 + }, + { + "ce_ib": 7.21422004699707, + "ce_orig": 0.7217543721199036, + "epoch": 0.20102092170537061, + "kl_loss": 0.7603013515472412, + "loss_ib": 0.014817233197391033, + "step": 699 + }, + { + "ce_ib": 9.123298645019531, + "ce_orig": 0.6771790981292725, + "epoch": 0.20102092170537061, + "kl_loss": 0.8053810596466064, + "loss_ib": 0.01717710867524147, + "step": 699 + }, + { + "epoch": 0.20130850528434827, + "grad_norm": 0.10600654780864716, + "learning_rate": 9.965251005933915e-06, + "loss": 0.8397, + "step": 700 + }, + { + "ce_ib": 6.327905654907227, + "ce_orig": 0.5939226746559143, + "epoch": 0.20130850528434827, + "kl_loss": 0.6826160550117493, + "loss_ib": 0.013154065236449242, + "step": 700 + }, + { + "ce_ib": 9.836183547973633, + "ce_orig": 0.9476636648178101, + "epoch": 0.20130850528434827, + "kl_loss": 0.7302706837654114, + "loss_ib": 0.01713889092206955, + "step": 700 + }, + { + "ce_ib": 11.0946683883667, + "ce_orig": 1.1720821857452393, + "epoch": 0.20130850528434827, + "kl_loss": 0.8669959306716919, + "loss_ib": 0.019764628261327744, + "step": 700 + }, + { + "ce_ib": 8.813426971435547, + "ce_orig": 1.1299982070922852, + "epoch": 0.20130850528434827, + "kl_loss": 0.6991374492645264, + "loss_ib": 0.015804801136255264, + "step": 700 + }, + { + "ce_ib": 8.754616737365723, + "ce_orig": 1.024243950843811, + "epoch": 0.20159608886332592, + "kl_loss": 0.6542124152183533, + "loss_ib": 0.01529674045741558, + "step": 701 + }, + { + "ce_ib": 7.708653450012207, + "ce_orig": 0.6619385480880737, + "epoch": 0.20159608886332592, + "kl_loss": 0.6982452273368835, + "loss_ib": 0.014691106043756008, + "step": 701 + }, + { + "ce_ib": 12.858272552490234, + "ce_orig": 1.6826236248016357, + "epoch": 0.20159608886332592, + "kl_loss": 0.752007007598877, + "loss_ib": 0.020378341898322105, + "step": 701 + }, + { + "ce_ib": 8.944458961486816, + "ce_orig": 1.0155476331710815, + "epoch": 0.20159608886332592, + "kl_loss": 0.7168185710906982, + "loss_ib": 0.016112644225358963, + "step": 701 + }, + { + "ce_ib": 13.010807991027832, + "ce_orig": 0.88566654920578, + "epoch": 0.20188367244230354, + "kl_loss": 0.6652591228485107, + "loss_ib": 0.01966339908540249, + "step": 702 + }, + { + "ce_ib": 10.586206436157227, + "ce_orig": 1.103887677192688, + "epoch": 0.20188367244230354, + "kl_loss": 0.8720508813858032, + "loss_ib": 0.019306715577840805, + "step": 702 + }, + { + "ce_ib": 10.615009307861328, + "ce_orig": 0.6724156141281128, + "epoch": 0.20188367244230354, + "kl_loss": 0.9320215582847595, + "loss_ib": 0.01993522420525551, + "step": 702 + }, + { + "ce_ib": 8.414154052734375, + "ce_orig": 0.5818290710449219, + "epoch": 0.20188367244230354, + "kl_loss": 0.7827242612838745, + "loss_ib": 0.016241395846009254, + "step": 702 + }, + { + "ce_ib": 12.710061073303223, + "ce_orig": 1.1648833751678467, + "epoch": 0.2021712560212812, + "kl_loss": 0.8374737501144409, + "loss_ib": 0.021084798499941826, + "step": 703 + }, + { + "ce_ib": 7.598687171936035, + "ce_orig": 0.5311350226402283, + "epoch": 0.2021712560212812, + "kl_loss": 0.6079769134521484, + "loss_ib": 0.013678456656634808, + "step": 703 + }, + { + "ce_ib": 9.277694702148438, + "ce_orig": 0.9339279532432556, + "epoch": 0.2021712560212812, + "kl_loss": 0.8527544736862183, + "loss_ib": 0.01780523918569088, + "step": 703 + }, + { + "ce_ib": 10.174999237060547, + "ce_orig": 1.0432989597320557, + "epoch": 0.2021712560212812, + "kl_loss": 0.7690742015838623, + "loss_ib": 0.0178657416254282, + "step": 703 + }, + { + "ce_ib": 14.90958309173584, + "ce_orig": 1.5392428636550903, + "epoch": 0.20245883960025882, + "kl_loss": 0.7166196703910828, + "loss_ib": 0.022075779736042023, + "step": 704 + }, + { + "ce_ib": 10.524937629699707, + "ce_orig": 0.6721472144126892, + "epoch": 0.20245883960025882, + "kl_loss": 0.9726794958114624, + "loss_ib": 0.02025173231959343, + "step": 704 + }, + { + "ce_ib": 5.51300048828125, + "ce_orig": 0.6144857406616211, + "epoch": 0.20245883960025882, + "kl_loss": 0.5522742867469788, + "loss_ib": 0.011035742238163948, + "step": 704 + }, + { + "ce_ib": 10.596753120422363, + "ce_orig": 1.102412223815918, + "epoch": 0.20245883960025882, + "kl_loss": 0.6392167806625366, + "loss_ib": 0.01698892004787922, + "step": 704 + }, + { + "epoch": 0.20274642317923647, + "grad_norm": 0.10856325924396515, + "learning_rate": 9.96433163854655e-06, + "loss": 0.8989, + "step": 705 + }, + { + "ce_ib": 8.712186813354492, + "ce_orig": 0.788031280040741, + "epoch": 0.20274642317923647, + "kl_loss": 1.01998770236969, + "loss_ib": 0.018912063911557198, + "step": 705 + }, + { + "ce_ib": 7.393013954162598, + "ce_orig": 0.6134517788887024, + "epoch": 0.20274642317923647, + "kl_loss": 0.6545846462249756, + "loss_ib": 0.013938860967755318, + "step": 705 + }, + { + "ce_ib": 8.60453987121582, + "ce_orig": 0.6714913249015808, + "epoch": 0.20274642317923647, + "kl_loss": 0.8573524951934814, + "loss_ib": 0.017178066074848175, + "step": 705 + }, + { + "ce_ib": 8.413896560668945, + "ce_orig": 0.7840592265129089, + "epoch": 0.20274642317923647, + "kl_loss": 0.671722412109375, + "loss_ib": 0.015131120570003986, + "step": 705 + }, + { + "ce_ib": 8.781254768371582, + "ce_orig": 1.1526238918304443, + "epoch": 0.20303400675821412, + "kl_loss": 0.579412579536438, + "loss_ib": 0.014575380831956863, + "step": 706 + }, + { + "ce_ib": 9.846917152404785, + "ce_orig": 0.8160407543182373, + "epoch": 0.20303400675821412, + "kl_loss": 0.7465524673461914, + "loss_ib": 0.017312441021203995, + "step": 706 + }, + { + "ce_ib": 7.303924560546875, + "ce_orig": 0.45040014386177063, + "epoch": 0.20303400675821412, + "kl_loss": 0.5369355082511902, + "loss_ib": 0.012673280201852322, + "step": 706 + }, + { + "ce_ib": 9.779562950134277, + "ce_orig": 0.7343361973762512, + "epoch": 0.20303400675821412, + "kl_loss": 0.6700093150138855, + "loss_ib": 0.016479656100273132, + "step": 706 + }, + { + "ce_ib": 13.944967269897461, + "ce_orig": 1.2236454486846924, + "epoch": 0.20332159033719174, + "kl_loss": 0.7157278060913086, + "loss_ib": 0.021102245897054672, + "step": 707 + }, + { + "ce_ib": 11.342824935913086, + "ce_orig": 1.2302511930465698, + "epoch": 0.20332159033719174, + "kl_loss": 0.6444662809371948, + "loss_ib": 0.01778748631477356, + "step": 707 + }, + { + "ce_ib": 12.286307334899902, + "ce_orig": 1.2609407901763916, + "epoch": 0.20332159033719174, + "kl_loss": 0.890746533870697, + "loss_ib": 0.021193772554397583, + "step": 707 + }, + { + "ce_ib": 8.960893630981445, + "ce_orig": 0.8889079093933105, + "epoch": 0.20332159033719174, + "kl_loss": 0.6222097873687744, + "loss_ib": 0.015182990580797195, + "step": 707 + }, + { + "ce_ib": 8.693357467651367, + "ce_orig": 0.9998373985290527, + "epoch": 0.2036091739161694, + "kl_loss": 0.615609884262085, + "loss_ib": 0.014849456027150154, + "step": 708 + }, + { + "ce_ib": 8.566075325012207, + "ce_orig": 0.7132963538169861, + "epoch": 0.2036091739161694, + "kl_loss": 0.6065419316291809, + "loss_ib": 0.014631494879722595, + "step": 708 + }, + { + "ce_ib": 10.03363037109375, + "ce_orig": 0.62996906042099, + "epoch": 0.2036091739161694, + "kl_loss": 0.7953388690948486, + "loss_ib": 0.017987018451094627, + "step": 708 + }, + { + "ce_ib": 4.836100101470947, + "ce_orig": 0.8530847430229187, + "epoch": 0.2036091739161694, + "kl_loss": 0.5656656622886658, + "loss_ib": 0.01049275603145361, + "step": 708 + }, + { + "ce_ib": 18.170930862426758, + "ce_orig": 2.1947522163391113, + "epoch": 0.20389675749514702, + "kl_loss": 0.7079334259033203, + "loss_ib": 0.0252502653747797, + "step": 709 + }, + { + "ce_ib": 7.828283309936523, + "ce_orig": 0.9819263219833374, + "epoch": 0.20389675749514702, + "kl_loss": 0.6801222562789917, + "loss_ib": 0.014629505574703217, + "step": 709 + }, + { + "ce_ib": 10.20206069946289, + "ce_orig": 1.1113237142562866, + "epoch": 0.20389675749514702, + "kl_loss": 0.7220378518104553, + "loss_ib": 0.01742243953049183, + "step": 709 + }, + { + "ce_ib": 10.351544380187988, + "ce_orig": 1.0983593463897705, + "epoch": 0.20389675749514702, + "kl_loss": 0.5909432172775269, + "loss_ib": 0.01626097597181797, + "step": 709 + }, + { + "epoch": 0.20418434107412467, + "grad_norm": 0.08941768109798431, + "learning_rate": 9.96340031093202e-06, + "loss": 0.9056, + "step": 710 + }, + { + "ce_ib": 11.825169563293457, + "ce_orig": 0.9450059533119202, + "epoch": 0.20418434107412467, + "kl_loss": 0.6044723391532898, + "loss_ib": 0.017869891598820686, + "step": 710 + }, + { + "ce_ib": 7.037207126617432, + "ce_orig": 0.7366377711296082, + "epoch": 0.20418434107412467, + "kl_loss": 0.6832977533340454, + "loss_ib": 0.013870184309780598, + "step": 710 + }, + { + "ce_ib": 5.916614055633545, + "ce_orig": 0.4967573285102844, + "epoch": 0.20418434107412467, + "kl_loss": 0.7460091710090637, + "loss_ib": 0.013376705348491669, + "step": 710 + }, + { + "ce_ib": 6.968353271484375, + "ce_orig": 0.7892439961433411, + "epoch": 0.20418434107412467, + "kl_loss": 0.6605713963508606, + "loss_ib": 0.013574067503213882, + "step": 710 + }, + { + "ce_ib": 6.03911018371582, + "ce_orig": 0.5401313304901123, + "epoch": 0.20447192465310232, + "kl_loss": 0.6317548155784607, + "loss_ib": 0.012356657534837723, + "step": 711 + }, + { + "ce_ib": 14.128073692321777, + "ce_orig": 1.5721606016159058, + "epoch": 0.20447192465310232, + "kl_loss": 0.6152846813201904, + "loss_ib": 0.02028091810643673, + "step": 711 + }, + { + "ce_ib": 13.067774772644043, + "ce_orig": 1.452383279800415, + "epoch": 0.20447192465310232, + "kl_loss": 0.6436014175415039, + "loss_ib": 0.019503789022564888, + "step": 711 + }, + { + "ce_ib": 8.091301918029785, + "ce_orig": 0.7273240685462952, + "epoch": 0.20447192465310232, + "kl_loss": 0.6026842594146729, + "loss_ib": 0.014118144288659096, + "step": 711 + }, + { + "ce_ib": 8.550193786621094, + "ce_orig": 0.7258903980255127, + "epoch": 0.20475950823207995, + "kl_loss": 0.6672005653381348, + "loss_ib": 0.015222198329865932, + "step": 712 + }, + { + "ce_ib": 6.334939002990723, + "ce_orig": 0.5934350490570068, + "epoch": 0.20475950823207995, + "kl_loss": 0.5275875329971313, + "loss_ib": 0.011610814370214939, + "step": 712 + }, + { + "ce_ib": 7.126665115356445, + "ce_orig": 0.6666408181190491, + "epoch": 0.20475950823207995, + "kl_loss": 0.5349841117858887, + "loss_ib": 0.01247650571167469, + "step": 712 + }, + { + "ce_ib": 10.216915130615234, + "ce_orig": 0.9188041090965271, + "epoch": 0.20475950823207995, + "kl_loss": 0.7570939064025879, + "loss_ib": 0.017787855118513107, + "step": 712 + }, + { + "ce_ib": 9.504396438598633, + "ce_orig": 0.8975003957748413, + "epoch": 0.2050470918110576, + "kl_loss": 0.6097875833511353, + "loss_ib": 0.01560227107256651, + "step": 713 + }, + { + "ce_ib": 9.296903610229492, + "ce_orig": 1.331569790840149, + "epoch": 0.2050470918110576, + "kl_loss": 0.6761667728424072, + "loss_ib": 0.01605857163667679, + "step": 713 + }, + { + "ce_ib": 7.583889961242676, + "ce_orig": 0.712594211101532, + "epoch": 0.2050470918110576, + "kl_loss": 0.5781969428062439, + "loss_ib": 0.013365860097110271, + "step": 713 + }, + { + "ce_ib": 11.214754104614258, + "ce_orig": 1.008966088294983, + "epoch": 0.2050470918110576, + "kl_loss": 0.5274929404258728, + "loss_ib": 0.016489684581756592, + "step": 713 + }, + { + "ce_ib": 13.248461723327637, + "ce_orig": 1.7106783390045166, + "epoch": 0.20533467539003522, + "kl_loss": 0.4616009593009949, + "loss_ib": 0.017864469438791275, + "step": 714 + }, + { + "ce_ib": 10.139266967773438, + "ce_orig": 0.7416722178459167, + "epoch": 0.20533467539003522, + "kl_loss": 0.49146589636802673, + "loss_ib": 0.015053926035761833, + "step": 714 + }, + { + "ce_ib": 6.59840202331543, + "ce_orig": 0.7029027342796326, + "epoch": 0.20533467539003522, + "kl_loss": 0.5324534177780151, + "loss_ib": 0.011922935955226421, + "step": 714 + }, + { + "ce_ib": 8.762123107910156, + "ce_orig": 0.8172003030776978, + "epoch": 0.20533467539003522, + "kl_loss": 0.5413413047790527, + "loss_ib": 0.0141755361109972, + "step": 714 + }, + { + "epoch": 0.20562225896901287, + "grad_norm": 0.09573056548833847, + "learning_rate": 9.962457025334114e-06, + "loss": 0.8855, + "step": 715 + }, + { + "ce_ib": 8.281875610351562, + "ce_orig": 0.8879901170730591, + "epoch": 0.20562225896901287, + "kl_loss": 0.47865283489227295, + "loss_ib": 0.01306840404868126, + "step": 715 + }, + { + "ce_ib": 7.326986789703369, + "ce_orig": 0.7315509915351868, + "epoch": 0.20562225896901287, + "kl_loss": 0.5715488195419312, + "loss_ib": 0.01304247509688139, + "step": 715 + }, + { + "ce_ib": 9.936646461486816, + "ce_orig": 0.5313572883605957, + "epoch": 0.20562225896901287, + "kl_loss": 0.7098532915115356, + "loss_ib": 0.01703517884016037, + "step": 715 + }, + { + "ce_ib": 8.325387954711914, + "ce_orig": 1.0558624267578125, + "epoch": 0.20562225896901287, + "kl_loss": 0.510680079460144, + "loss_ib": 0.013432187959551811, + "step": 715 + }, + { + "ce_ib": 6.32294225692749, + "ce_orig": 0.755020260810852, + "epoch": 0.2059098425479905, + "kl_loss": 0.4962891638278961, + "loss_ib": 0.011285834014415741, + "step": 716 + }, + { + "ce_ib": 6.709957122802734, + "ce_orig": 0.7449535131454468, + "epoch": 0.2059098425479905, + "kl_loss": 0.5581060647964478, + "loss_ib": 0.012291017919778824, + "step": 716 + }, + { + "ce_ib": 11.16142749786377, + "ce_orig": 1.2088764905929565, + "epoch": 0.2059098425479905, + "kl_loss": 0.5308176875114441, + "loss_ib": 0.016469605267047882, + "step": 716 + }, + { + "ce_ib": 8.455696105957031, + "ce_orig": 0.8597549200057983, + "epoch": 0.2059098425479905, + "kl_loss": 0.5310311317443848, + "loss_ib": 0.013766007497906685, + "step": 716 + }, + { + "ce_ib": 9.843843460083008, + "ce_orig": 0.9751378297805786, + "epoch": 0.20619742612696815, + "kl_loss": 0.7993011474609375, + "loss_ib": 0.017836853861808777, + "step": 717 + }, + { + "ce_ib": 8.898174285888672, + "ce_orig": 0.609527051448822, + "epoch": 0.20619742612696815, + "kl_loss": 0.9439896941184998, + "loss_ib": 0.018338071182370186, + "step": 717 + }, + { + "ce_ib": 12.661697387695312, + "ce_orig": 1.3917475938796997, + "epoch": 0.20619742612696815, + "kl_loss": 0.6134680509567261, + "loss_ib": 0.01879637874662876, + "step": 717 + }, + { + "ce_ib": 4.8259477615356445, + "ce_orig": 0.48925209045410156, + "epoch": 0.20619742612696815, + "kl_loss": 0.5553791522979736, + "loss_ib": 0.010379738174378872, + "step": 717 + }, + { + "ce_ib": 12.544927597045898, + "ce_orig": 0.7721540331840515, + "epoch": 0.2064850097059458, + "kl_loss": 0.4881455898284912, + "loss_ib": 0.01742638275027275, + "step": 718 + }, + { + "ce_ib": 10.139900207519531, + "ce_orig": 0.9012062549591064, + "epoch": 0.2064850097059458, + "kl_loss": 0.637915313243866, + "loss_ib": 0.016519052907824516, + "step": 718 + }, + { + "ce_ib": 10.471162796020508, + "ce_orig": 1.1879448890686035, + "epoch": 0.2064850097059458, + "kl_loss": 0.8380963802337646, + "loss_ib": 0.018852125853300095, + "step": 718 + }, + { + "ce_ib": 9.023296356201172, + "ce_orig": 0.45530980825424194, + "epoch": 0.2064850097059458, + "kl_loss": 0.6838054656982422, + "loss_ib": 0.015861351042985916, + "step": 718 + }, + { + "ce_ib": 13.007994651794434, + "ce_orig": 1.3200677633285522, + "epoch": 0.20677259328492342, + "kl_loss": 0.5242291688919067, + "loss_ib": 0.01825028657913208, + "step": 719 + }, + { + "ce_ib": 12.970458030700684, + "ce_orig": 1.702121376991272, + "epoch": 0.20677259328492342, + "kl_loss": 0.502326250076294, + "loss_ib": 0.01799372024834156, + "step": 719 + }, + { + "ce_ib": 5.795266151428223, + "ce_orig": 0.5755612254142761, + "epoch": 0.20677259328492342, + "kl_loss": 0.5967477560043335, + "loss_ib": 0.011762742884457111, + "step": 719 + }, + { + "ce_ib": 10.548118591308594, + "ce_orig": 0.9648749828338623, + "epoch": 0.20677259328492342, + "kl_loss": 0.5070229768753052, + "loss_ib": 0.015618347562849522, + "step": 719 + }, + { + "epoch": 0.20706017686390107, + "grad_norm": 0.09754003584384918, + "learning_rate": 9.961501784025423e-06, + "loss": 0.8849, + "step": 720 + }, + { + "ce_ib": 6.8866190910339355, + "ce_orig": 0.45583146810531616, + "epoch": 0.20706017686390107, + "kl_loss": 0.4389684200286865, + "loss_ib": 0.011276302859187126, + "step": 720 + }, + { + "ce_ib": 12.2448148727417, + "ce_orig": 1.3936138153076172, + "epoch": 0.20706017686390107, + "kl_loss": 0.42197367548942566, + "loss_ib": 0.01646455191075802, + "step": 720 + }, + { + "ce_ib": 8.876269340515137, + "ce_orig": 0.5300930738449097, + "epoch": 0.20706017686390107, + "kl_loss": 0.4920913577079773, + "loss_ib": 0.013797182589769363, + "step": 720 + }, + { + "ce_ib": 7.045018672943115, + "ce_orig": 0.7694444060325623, + "epoch": 0.20706017686390107, + "kl_loss": 0.502578854560852, + "loss_ib": 0.012070806697010994, + "step": 720 + }, + { + "ce_ib": 8.191524505615234, + "ce_orig": 0.5949411988258362, + "epoch": 0.2073477604428787, + "kl_loss": 0.6976209878921509, + "loss_ib": 0.01516773458570242, + "step": 721 + }, + { + "ce_ib": 8.799589157104492, + "ce_orig": 0.6730172634124756, + "epoch": 0.2073477604428787, + "kl_loss": 0.6129693388938904, + "loss_ib": 0.01492928247898817, + "step": 721 + }, + { + "ce_ib": 9.574918746948242, + "ce_orig": 1.5401928424835205, + "epoch": 0.2073477604428787, + "kl_loss": 0.4680205285549164, + "loss_ib": 0.014255124144256115, + "step": 721 + }, + { + "ce_ib": 11.500478744506836, + "ce_orig": 1.1516295671463013, + "epoch": 0.2073477604428787, + "kl_loss": 0.5712054967880249, + "loss_ib": 0.017212534323334694, + "step": 721 + }, + { + "ce_ib": 6.9377641677856445, + "ce_orig": 0.47552016377449036, + "epoch": 0.20763534402185635, + "kl_loss": 0.5818679332733154, + "loss_ib": 0.01275644265115261, + "step": 722 + }, + { + "ce_ib": 7.533665180206299, + "ce_orig": 0.8571917414665222, + "epoch": 0.20763534402185635, + "kl_loss": 0.4062355160713196, + "loss_ib": 0.011596020311117172, + "step": 722 + }, + { + "ce_ib": 4.523435592651367, + "ce_orig": 0.5144174098968506, + "epoch": 0.20763534402185635, + "kl_loss": 0.46129077672958374, + "loss_ib": 0.009136342443525791, + "step": 722 + }, + { + "ce_ib": 5.738825798034668, + "ce_orig": 0.3778877556324005, + "epoch": 0.20763534402185635, + "kl_loss": 0.4390355944633484, + "loss_ib": 0.010129181668162346, + "step": 722 + }, + { + "ce_ib": 6.621108055114746, + "ce_orig": 0.5974579453468323, + "epoch": 0.207922927600834, + "kl_loss": 0.444467157125473, + "loss_ib": 0.011065779253840446, + "step": 723 + }, + { + "ce_ib": 9.774152755737305, + "ce_orig": 0.6996477246284485, + "epoch": 0.207922927600834, + "kl_loss": 0.605032742023468, + "loss_ib": 0.01582447998225689, + "step": 723 + }, + { + "ce_ib": 8.50704574584961, + "ce_orig": 0.8165543079376221, + "epoch": 0.207922927600834, + "kl_loss": 0.6351085901260376, + "loss_ib": 0.014858131296932697, + "step": 723 + }, + { + "ce_ib": 5.1617231369018555, + "ce_orig": 0.5427976250648499, + "epoch": 0.207922927600834, + "kl_loss": 0.4432010054588318, + "loss_ib": 0.009593733586370945, + "step": 723 + }, + { + "ce_ib": 10.265229225158691, + "ce_orig": 0.3247712552547455, + "epoch": 0.20821051117981162, + "kl_loss": 0.9256395697593689, + "loss_ib": 0.019521623849868774, + "step": 724 + }, + { + "ce_ib": 12.566752433776855, + "ce_orig": 0.9292450547218323, + "epoch": 0.20821051117981162, + "kl_loss": 0.5198833346366882, + "loss_ib": 0.017765585333108902, + "step": 724 + }, + { + "ce_ib": 4.4857707023620605, + "ce_orig": 0.13291123509407043, + "epoch": 0.20821051117981162, + "kl_loss": 0.8171424865722656, + "loss_ib": 0.012657195329666138, + "step": 724 + }, + { + "ce_ib": 11.237386703491211, + "ce_orig": 1.4978471994400024, + "epoch": 0.20821051117981162, + "kl_loss": 0.6373554468154907, + "loss_ib": 0.01761094108223915, + "step": 724 + }, + { + "epoch": 0.20849809475878928, + "grad_norm": 0.08204614371061325, + "learning_rate": 9.960534589307342e-06, + "loss": 0.9127, + "step": 725 + }, + { + "ce_ib": 6.750695705413818, + "ce_orig": 0.7500053644180298, + "epoch": 0.20849809475878928, + "kl_loss": 0.44563400745391846, + "loss_ib": 0.0112070357427001, + "step": 725 + }, + { + "ce_ib": 7.751323223114014, + "ce_orig": 1.0083248615264893, + "epoch": 0.20849809475878928, + "kl_loss": 0.41058549284935, + "loss_ib": 0.011857178062200546, + "step": 725 + }, + { + "ce_ib": 8.529701232910156, + "ce_orig": 0.73545902967453, + "epoch": 0.20849809475878928, + "kl_loss": 0.5081138014793396, + "loss_ib": 0.01361083984375, + "step": 725 + }, + { + "ce_ib": 6.2612080574035645, + "ce_orig": 0.5796510577201843, + "epoch": 0.20849809475878928, + "kl_loss": 0.39641040563583374, + "loss_ib": 0.010225312784314156, + "step": 725 + }, + { + "ce_ib": 6.802225112915039, + "ce_orig": 0.6491565704345703, + "epoch": 0.2087856783377669, + "kl_loss": 0.40076354146003723, + "loss_ib": 0.010809860192239285, + "step": 726 + }, + { + "ce_ib": 10.529885292053223, + "ce_orig": 0.8254870772361755, + "epoch": 0.2087856783377669, + "kl_loss": 0.5448965430259705, + "loss_ib": 0.015978850424289703, + "step": 726 + }, + { + "ce_ib": 12.080382347106934, + "ce_orig": 1.3747988939285278, + "epoch": 0.2087856783377669, + "kl_loss": 0.607007622718811, + "loss_ib": 0.018150458112359047, + "step": 726 + }, + { + "ce_ib": 13.895086288452148, + "ce_orig": 1.569737195968628, + "epoch": 0.2087856783377669, + "kl_loss": 0.41842395067214966, + "loss_ib": 0.018079325556755066, + "step": 726 + }, + { + "ce_ib": 10.388771057128906, + "ce_orig": 0.8246784806251526, + "epoch": 0.20907326191674455, + "kl_loss": 0.6921413540840149, + "loss_ib": 0.017310185357928276, + "step": 727 + }, + { + "ce_ib": 10.750786781311035, + "ce_orig": 0.6747448444366455, + "epoch": 0.20907326191674455, + "kl_loss": 0.46249350905418396, + "loss_ib": 0.015375722199678421, + "step": 727 + }, + { + "ce_ib": 8.939618110656738, + "ce_orig": 0.611909806728363, + "epoch": 0.20907326191674455, + "kl_loss": 0.4692227244377136, + "loss_ib": 0.013631845824420452, + "step": 727 + }, + { + "ce_ib": 5.823955059051514, + "ce_orig": 0.5695940852165222, + "epoch": 0.20907326191674455, + "kl_loss": 0.4087258577346802, + "loss_ib": 0.00991121307015419, + "step": 727 + }, + { + "ce_ib": 7.428273677825928, + "ce_orig": 0.6111478805541992, + "epoch": 0.2093608454957222, + "kl_loss": 0.4407083988189697, + "loss_ib": 0.01183535810559988, + "step": 728 + }, + { + "ce_ib": 8.432186126708984, + "ce_orig": 0.7240220308303833, + "epoch": 0.2093608454957222, + "kl_loss": 0.5889390707015991, + "loss_ib": 0.014321576803922653, + "step": 728 + }, + { + "ce_ib": 9.506328582763672, + "ce_orig": 0.885880708694458, + "epoch": 0.2093608454957222, + "kl_loss": 0.39755725860595703, + "loss_ib": 0.013481900095939636, + "step": 728 + }, + { + "ce_ib": 10.747995376586914, + "ce_orig": 0.9302851557731628, + "epoch": 0.2093608454957222, + "kl_loss": 0.6458637118339539, + "loss_ib": 0.01720663346350193, + "step": 728 + }, + { + "ce_ib": 8.671708106994629, + "ce_orig": 0.6951517462730408, + "epoch": 0.20964842907469983, + "kl_loss": 0.547339916229248, + "loss_ib": 0.014145107008516788, + "step": 729 + }, + { + "ce_ib": 9.09277629852295, + "ce_orig": 0.6532163023948669, + "epoch": 0.20964842907469983, + "kl_loss": 0.4491899609565735, + "loss_ib": 0.013584675267338753, + "step": 729 + }, + { + "ce_ib": 8.521883964538574, + "ce_orig": 0.26346156001091003, + "epoch": 0.20964842907469983, + "kl_loss": 0.7473446130752563, + "loss_ib": 0.0159953311085701, + "step": 729 + }, + { + "ce_ib": 8.46525764465332, + "ce_orig": 0.761343777179718, + "epoch": 0.20964842907469983, + "kl_loss": 0.49551889300346375, + "loss_ib": 0.013420446775853634, + "step": 729 + }, + { + "epoch": 0.20993601265367748, + "grad_norm": 0.10517257452011108, + "learning_rate": 9.959555443510074e-06, + "loss": 0.8883, + "step": 730 + }, + { + "ce_ib": 9.539706230163574, + "ce_orig": 0.8811664581298828, + "epoch": 0.20993601265367748, + "kl_loss": 0.4699278473854065, + "loss_ib": 0.014238984324038029, + "step": 730 + }, + { + "ce_ib": 10.79475212097168, + "ce_orig": 0.9388590455055237, + "epoch": 0.20993601265367748, + "kl_loss": 1.093339443206787, + "loss_ib": 0.0217281486839056, + "step": 730 + }, + { + "ce_ib": 11.920876502990723, + "ce_orig": 1.155131220817566, + "epoch": 0.20993601265367748, + "kl_loss": 0.49514299631118774, + "loss_ib": 0.016872305423021317, + "step": 730 + }, + { + "ce_ib": 12.770062446594238, + "ce_orig": 1.2717667818069458, + "epoch": 0.20993601265367748, + "kl_loss": 0.467684268951416, + "loss_ib": 0.017446905374526978, + "step": 730 + }, + { + "ce_ib": 9.915130615234375, + "ce_orig": 0.7397444248199463, + "epoch": 0.2102235962326551, + "kl_loss": 0.5744942426681519, + "loss_ib": 0.01566007360816002, + "step": 731 + }, + { + "ce_ib": 11.337221145629883, + "ce_orig": 0.6278934478759766, + "epoch": 0.2102235962326551, + "kl_loss": 0.47471189498901367, + "loss_ib": 0.0160843413323164, + "step": 731 + }, + { + "ce_ib": 6.727563858032227, + "ce_orig": 0.5554884076118469, + "epoch": 0.2102235962326551, + "kl_loss": 0.38505086302757263, + "loss_ib": 0.01057807169854641, + "step": 731 + }, + { + "ce_ib": 4.216444492340088, + "ce_orig": 0.15175974369049072, + "epoch": 0.2102235962326551, + "kl_loss": 0.7317667007446289, + "loss_ib": 0.011534111574292183, + "step": 731 + }, + { + "ce_ib": 6.452449798583984, + "ce_orig": 0.48745986819267273, + "epoch": 0.21051117981163275, + "kl_loss": 0.4054802656173706, + "loss_ib": 0.0105072520673275, + "step": 732 + }, + { + "ce_ib": 7.071081161499023, + "ce_orig": 0.9309906363487244, + "epoch": 0.21051117981163275, + "kl_loss": 0.37749940156936646, + "loss_ib": 0.010846075601875782, + "step": 732 + }, + { + "ce_ib": 11.04909896850586, + "ce_orig": 0.874702513217926, + "epoch": 0.21051117981163275, + "kl_loss": 0.4390341639518738, + "loss_ib": 0.015439440496265888, + "step": 732 + }, + { + "ce_ib": 7.799300193786621, + "ce_orig": 0.8155895471572876, + "epoch": 0.21051117981163275, + "kl_loss": 0.4740564823150635, + "loss_ib": 0.01253986544907093, + "step": 732 + }, + { + "ce_ib": 7.702383995056152, + "ce_orig": 0.5887701511383057, + "epoch": 0.2107987633906104, + "kl_loss": 0.5023674964904785, + "loss_ib": 0.01272605825215578, + "step": 733 + }, + { + "ce_ib": 8.737090110778809, + "ce_orig": 0.8836251497268677, + "epoch": 0.2107987633906104, + "kl_loss": 0.48343145847320557, + "loss_ib": 0.013571404851973057, + "step": 733 + }, + { + "ce_ib": 9.702954292297363, + "ce_orig": 0.9348666071891785, + "epoch": 0.2107987633906104, + "kl_loss": 0.7717263698577881, + "loss_ib": 0.017420217394828796, + "step": 733 + }, + { + "ce_ib": 9.321721076965332, + "ce_orig": 0.6164664626121521, + "epoch": 0.2107987633906104, + "kl_loss": 0.5251142382621765, + "loss_ib": 0.014572863467037678, + "step": 733 + }, + { + "ce_ib": 10.808871269226074, + "ce_orig": 0.7415775656700134, + "epoch": 0.21108634696958803, + "kl_loss": 0.5616943836212158, + "loss_ib": 0.01642581634223461, + "step": 734 + }, + { + "ce_ib": 9.013411521911621, + "ce_orig": 0.6960151195526123, + "epoch": 0.21108634696958803, + "kl_loss": 0.45591843128204346, + "loss_ib": 0.013572595082223415, + "step": 734 + }, + { + "ce_ib": 10.70376205444336, + "ce_orig": 0.8505563735961914, + "epoch": 0.21108634696958803, + "kl_loss": 0.4155931770801544, + "loss_ib": 0.014859694056212902, + "step": 734 + }, + { + "ce_ib": 10.576190948486328, + "ce_orig": 1.013388752937317, + "epoch": 0.21108634696958803, + "kl_loss": 0.5600894689559937, + "loss_ib": 0.016177086159586906, + "step": 734 + }, + { + "epoch": 0.21137393054856568, + "grad_norm": 0.10216815024614334, + "learning_rate": 9.958564348992604e-06, + "loss": 0.9112, + "step": 735 + }, + { + "ce_ib": 8.351442337036133, + "ce_orig": 0.6271442174911499, + "epoch": 0.21137393054856568, + "kl_loss": 0.45761042833328247, + "loss_ib": 0.012927546165883541, + "step": 735 + }, + { + "ce_ib": 10.175559997558594, + "ce_orig": 0.9649935960769653, + "epoch": 0.21137393054856568, + "kl_loss": 0.5237610340118408, + "loss_ib": 0.01541317068040371, + "step": 735 + }, + { + "ce_ib": 11.151328086853027, + "ce_orig": 1.160567045211792, + "epoch": 0.21137393054856568, + "kl_loss": 0.44404470920562744, + "loss_ib": 0.015591775067150593, + "step": 735 + }, + { + "ce_ib": 8.23231029510498, + "ce_orig": 0.8223357200622559, + "epoch": 0.21137393054856568, + "kl_loss": 0.4122684597969055, + "loss_ib": 0.012354995124042034, + "step": 735 + }, + { + "ce_ib": 9.226214408874512, + "ce_orig": 1.0122361183166504, + "epoch": 0.2116615141275433, + "kl_loss": 0.3391454219818115, + "loss_ib": 0.012617669068276882, + "step": 736 + }, + { + "ce_ib": 12.772435188293457, + "ce_orig": 1.63818359375, + "epoch": 0.2116615141275433, + "kl_loss": 0.6720362901687622, + "loss_ib": 0.019492797553539276, + "step": 736 + }, + { + "ce_ib": 10.953327178955078, + "ce_orig": 1.301689624786377, + "epoch": 0.2116615141275433, + "kl_loss": 0.4017236828804016, + "loss_ib": 0.014970564283430576, + "step": 736 + }, + { + "ce_ib": 13.185845375061035, + "ce_orig": 1.3509107828140259, + "epoch": 0.2116615141275433, + "kl_loss": 0.3720896244049072, + "loss_ib": 0.0169067420065403, + "step": 736 + }, + { + "ce_ib": 5.992466449737549, + "ce_orig": 0.6994779706001282, + "epoch": 0.21194909770652096, + "kl_loss": 0.37846922874450684, + "loss_ib": 0.009777158498764038, + "step": 737 + }, + { + "ce_ib": 4.946014404296875, + "ce_orig": 0.448905885219574, + "epoch": 0.21194909770652096, + "kl_loss": 0.3910108208656311, + "loss_ib": 0.008856122381985188, + "step": 737 + }, + { + "ce_ib": 8.710723876953125, + "ce_orig": 0.955891489982605, + "epoch": 0.21194909770652096, + "kl_loss": 0.3540381193161011, + "loss_ib": 0.012251105159521103, + "step": 737 + }, + { + "ce_ib": 9.716163635253906, + "ce_orig": 0.777677595615387, + "epoch": 0.21194909770652096, + "kl_loss": 0.840921938419342, + "loss_ib": 0.0181253831833601, + "step": 737 + }, + { + "ce_ib": 9.742884635925293, + "ce_orig": 0.8019170761108398, + "epoch": 0.2122366812854986, + "kl_loss": 0.4170638918876648, + "loss_ib": 0.013913523405790329, + "step": 738 + }, + { + "ce_ib": 9.633951187133789, + "ce_orig": 0.9468701481819153, + "epoch": 0.2122366812854986, + "kl_loss": 0.44602394104003906, + "loss_ib": 0.014094190672039986, + "step": 738 + }, + { + "ce_ib": 9.999152183532715, + "ce_orig": 1.5994606018066406, + "epoch": 0.2122366812854986, + "kl_loss": 0.34741610288619995, + "loss_ib": 0.013473312370479107, + "step": 738 + }, + { + "ce_ib": 7.614631175994873, + "ce_orig": 0.44442856311798096, + "epoch": 0.2122366812854986, + "kl_loss": 0.42184555530548096, + "loss_ib": 0.011833085678517818, + "step": 738 + }, + { + "ce_ib": 3.805742025375366, + "ce_orig": 0.3643724024295807, + "epoch": 0.21252426486447623, + "kl_loss": 0.73167484998703, + "loss_ib": 0.011122490279376507, + "step": 739 + }, + { + "ce_ib": 7.001569747924805, + "ce_orig": 0.6076573133468628, + "epoch": 0.21252426486447623, + "kl_loss": 0.3848869204521179, + "loss_ib": 0.010850438848137856, + "step": 739 + }, + { + "ce_ib": 6.058032035827637, + "ce_orig": 0.9139571785926819, + "epoch": 0.21252426486447623, + "kl_loss": 0.43339502811431885, + "loss_ib": 0.010391981340944767, + "step": 739 + }, + { + "ce_ib": 9.221721649169922, + "ce_orig": 1.1071618795394897, + "epoch": 0.21252426486447623, + "kl_loss": 0.3810133934020996, + "loss_ib": 0.01303185522556305, + "step": 739 + }, + { + "epoch": 0.21281184844345388, + "grad_norm": 0.09019612520933151, + "learning_rate": 9.95756130814271e-06, + "loss": 0.8816, + "step": 740 + }, + { + "ce_ib": 9.799250602722168, + "ce_orig": 0.7617897987365723, + "epoch": 0.21281184844345388, + "kl_loss": 0.46645045280456543, + "loss_ib": 0.014463755302131176, + "step": 740 + }, + { + "ce_ib": 8.431238174438477, + "ce_orig": 0.7609443068504333, + "epoch": 0.21281184844345388, + "kl_loss": 0.353823721408844, + "loss_ib": 0.011969475075602531, + "step": 740 + }, + { + "ce_ib": 6.829717636108398, + "ce_orig": 0.6648247241973877, + "epoch": 0.21281184844345388, + "kl_loss": 0.3253084421157837, + "loss_ib": 0.01008280273526907, + "step": 740 + }, + { + "ce_ib": 9.891703605651855, + "ce_orig": 1.1517934799194336, + "epoch": 0.21281184844345388, + "kl_loss": 0.43470141291618347, + "loss_ib": 0.014238717034459114, + "step": 740 + }, + { + "ce_ib": 6.036440372467041, + "ce_orig": 0.5608431696891785, + "epoch": 0.2130994320224315, + "kl_loss": 0.5712899565696716, + "loss_ib": 0.01174934022128582, + "step": 741 + }, + { + "ce_ib": 7.961015224456787, + "ce_orig": 0.5696125626564026, + "epoch": 0.2130994320224315, + "kl_loss": 0.37901923060417175, + "loss_ib": 0.01175120659172535, + "step": 741 + }, + { + "ce_ib": 9.610107421875, + "ce_orig": 0.6799657344818115, + "epoch": 0.2130994320224315, + "kl_loss": 0.5099486112594604, + "loss_ib": 0.0147095937281847, + "step": 741 + }, + { + "ce_ib": 10.553266525268555, + "ce_orig": 1.0597807168960571, + "epoch": 0.2130994320224315, + "kl_loss": 0.9013509750366211, + "loss_ib": 0.019566776230931282, + "step": 741 + }, + { + "ce_ib": 11.050984382629395, + "ce_orig": 1.2893459796905518, + "epoch": 0.21338701560140916, + "kl_loss": 0.45824098587036133, + "loss_ib": 0.015633394941687584, + "step": 742 + }, + { + "ce_ib": 10.593522071838379, + "ce_orig": 1.4311033487319946, + "epoch": 0.21338701560140916, + "kl_loss": 0.33632180094718933, + "loss_ib": 0.013956740498542786, + "step": 742 + }, + { + "ce_ib": 8.308612823486328, + "ce_orig": 0.9085618257522583, + "epoch": 0.21338701560140916, + "kl_loss": 0.42611122131347656, + "loss_ib": 0.012569725513458252, + "step": 742 + }, + { + "ce_ib": 8.803531646728516, + "ce_orig": 0.5293317437171936, + "epoch": 0.21338701560140916, + "kl_loss": 0.42089781165122986, + "loss_ib": 0.013012508861720562, + "step": 742 + }, + { + "ce_ib": 10.681037902832031, + "ce_orig": 0.6758571863174438, + "epoch": 0.2136745991803868, + "kl_loss": 0.4455721974372864, + "loss_ib": 0.015136758796870708, + "step": 743 + }, + { + "ce_ib": 5.145867347717285, + "ce_orig": 0.4771097004413605, + "epoch": 0.2136745991803868, + "kl_loss": 0.3146681487560272, + "loss_ib": 0.0082925483584404, + "step": 743 + }, + { + "ce_ib": 8.998390197753906, + "ce_orig": 0.4767987132072449, + "epoch": 0.2136745991803868, + "kl_loss": 0.44158506393432617, + "loss_ib": 0.013414240442216396, + "step": 743 + }, + { + "ce_ib": 10.034675598144531, + "ce_orig": 0.8264948129653931, + "epoch": 0.2136745991803868, + "kl_loss": 0.36563044786453247, + "loss_ib": 0.013690979219973087, + "step": 743 + }, + { + "ce_ib": 8.774846076965332, + "ce_orig": 0.8711258172988892, + "epoch": 0.21396218275936443, + "kl_loss": 0.448600172996521, + "loss_ib": 0.013260847888886929, + "step": 744 + }, + { + "ce_ib": 6.589221954345703, + "ce_orig": 0.8571322560310364, + "epoch": 0.21396218275936443, + "kl_loss": 0.32901668548583984, + "loss_ib": 0.009879388846457005, + "step": 744 + }, + { + "ce_ib": 7.771622657775879, + "ce_orig": 0.9770241379737854, + "epoch": 0.21396218275936443, + "kl_loss": 0.5221589803695679, + "loss_ib": 0.01299321185797453, + "step": 744 + }, + { + "ce_ib": 12.875645637512207, + "ce_orig": 1.2932630777359009, + "epoch": 0.21396218275936443, + "kl_loss": 0.5584806799888611, + "loss_ib": 0.018460452556610107, + "step": 744 + }, + { + "epoch": 0.21424976633834208, + "grad_norm": 0.11534402519464493, + "learning_rate": 9.956546323376948e-06, + "loss": 0.8441, + "step": 745 + }, + { + "ce_ib": 12.278318405151367, + "ce_orig": 0.611358642578125, + "epoch": 0.21424976633834208, + "kl_loss": 0.5517995357513428, + "loss_ib": 0.017796313390135765, + "step": 745 + }, + { + "ce_ib": 9.37865161895752, + "ce_orig": 0.9221431612968445, + "epoch": 0.21424976633834208, + "kl_loss": 0.2928432822227478, + "loss_ib": 0.012307084165513515, + "step": 745 + }, + { + "ce_ib": 8.047618865966797, + "ce_orig": 0.9915336966514587, + "epoch": 0.21424976633834208, + "kl_loss": 0.45079469680786133, + "loss_ib": 0.012555565685033798, + "step": 745 + }, + { + "ce_ib": 9.573211669921875, + "ce_orig": 0.9948348999023438, + "epoch": 0.21424976633834208, + "kl_loss": 0.4275258481502533, + "loss_ib": 0.013848470523953438, + "step": 745 + }, + { + "ce_ib": 8.365690231323242, + "ce_orig": 0.8193045258522034, + "epoch": 0.2145373499173197, + "kl_loss": 0.44444945454597473, + "loss_ib": 0.012810184620320797, + "step": 746 + }, + { + "ce_ib": 6.967347621917725, + "ce_orig": 0.4872395098209381, + "epoch": 0.2145373499173197, + "kl_loss": 0.37023940682411194, + "loss_ib": 0.010669741779565811, + "step": 746 + }, + { + "ce_ib": 7.190746784210205, + "ce_orig": 0.6708618402481079, + "epoch": 0.2145373499173197, + "kl_loss": 0.3780478239059448, + "loss_ib": 0.010971223935484886, + "step": 746 + }, + { + "ce_ib": 11.051678657531738, + "ce_orig": 1.0682791471481323, + "epoch": 0.2145373499173197, + "kl_loss": 0.33363956212997437, + "loss_ib": 0.014388074167072773, + "step": 746 + }, + { + "ce_ib": 8.238478660583496, + "ce_orig": 0.7487781047821045, + "epoch": 0.21482493349629736, + "kl_loss": 0.37909066677093506, + "loss_ib": 0.012029385194182396, + "step": 747 + }, + { + "ce_ib": 12.03074836730957, + "ce_orig": 1.157182216644287, + "epoch": 0.21482493349629736, + "kl_loss": 0.37361055612564087, + "loss_ib": 0.015766853466629982, + "step": 747 + }, + { + "ce_ib": 8.529624938964844, + "ce_orig": 1.0179344415664673, + "epoch": 0.21482493349629736, + "kl_loss": 0.30930227041244507, + "loss_ib": 0.011622647754848003, + "step": 747 + }, + { + "ce_ib": 7.966403007507324, + "ce_orig": 0.6700258255004883, + "epoch": 0.21482493349629736, + "kl_loss": 0.37224870920181274, + "loss_ib": 0.011688889935612679, + "step": 747 + }, + { + "ce_ib": 9.755182266235352, + "ce_orig": 1.0040427446365356, + "epoch": 0.215112517075275, + "kl_loss": 0.3066710829734802, + "loss_ib": 0.012821893207728863, + "step": 748 + }, + { + "ce_ib": 10.194023132324219, + "ce_orig": 0.5760530233383179, + "epoch": 0.215112517075275, + "kl_loss": 0.5293586850166321, + "loss_ib": 0.015487611293792725, + "step": 748 + }, + { + "ce_ib": 11.507509231567383, + "ce_orig": 1.2553309202194214, + "epoch": 0.215112517075275, + "kl_loss": 0.3195509910583496, + "loss_ib": 0.01470301952213049, + "step": 748 + }, + { + "ce_ib": 10.408110618591309, + "ce_orig": 0.725406289100647, + "epoch": 0.215112517075275, + "kl_loss": 0.3801497220993042, + "loss_ib": 0.014209607616066933, + "step": 748 + }, + { + "ce_ib": 8.620140075683594, + "ce_orig": 0.7735955715179443, + "epoch": 0.21540010065425264, + "kl_loss": 0.5387185215950012, + "loss_ib": 0.014007325284183025, + "step": 749 + }, + { + "ce_ib": 5.869650363922119, + "ce_orig": 0.7176041603088379, + "epoch": 0.21540010065425264, + "kl_loss": 0.37166261672973633, + "loss_ib": 0.009586276486515999, + "step": 749 + }, + { + "ce_ib": 9.476244926452637, + "ce_orig": 1.0505683422088623, + "epoch": 0.21540010065425264, + "kl_loss": 0.3689851760864258, + "loss_ib": 0.013166096061468124, + "step": 749 + }, + { + "ce_ib": 10.051746368408203, + "ce_orig": 0.613065779209137, + "epoch": 0.21540010065425264, + "kl_loss": 0.3970192074775696, + "loss_ib": 0.014021937735378742, + "step": 749 + }, + { + "epoch": 0.2156876842332303, + "grad_norm": 0.09746310114860535, + "learning_rate": 9.955519397140656e-06, + "loss": 0.9247, + "step": 750 + }, + { + "ce_ib": 8.546123504638672, + "ce_orig": 0.8507575392723083, + "epoch": 0.2156876842332303, + "kl_loss": 0.3480690121650696, + "loss_ib": 0.012026812881231308, + "step": 750 + }, + { + "ce_ib": 12.74342155456543, + "ce_orig": 0.9122403860092163, + "epoch": 0.2156876842332303, + "kl_loss": 0.39195001125335693, + "loss_ib": 0.016662921756505966, + "step": 750 + }, + { + "ce_ib": 12.673941612243652, + "ce_orig": 1.130251407623291, + "epoch": 0.2156876842332303, + "kl_loss": 0.38955867290496826, + "loss_ib": 0.016569528728723526, + "step": 750 + }, + { + "ce_ib": 11.575626373291016, + "ce_orig": 0.889440655708313, + "epoch": 0.2156876842332303, + "kl_loss": 0.3685222864151001, + "loss_ib": 0.015260848216712475, + "step": 750 + }, + { + "ce_ib": 9.500494003295898, + "ce_orig": 0.5060651302337646, + "epoch": 0.2159752678122079, + "kl_loss": 0.2954930067062378, + "loss_ib": 0.01245542336255312, + "step": 751 + }, + { + "ce_ib": 11.144340515136719, + "ce_orig": 0.7600336670875549, + "epoch": 0.2159752678122079, + "kl_loss": 0.6074585914611816, + "loss_ib": 0.017218926921486855, + "step": 751 + }, + { + "ce_ib": 7.084355354309082, + "ce_orig": 0.8810204267501831, + "epoch": 0.2159752678122079, + "kl_loss": 0.4073539078235626, + "loss_ib": 0.01115789357572794, + "step": 751 + }, + { + "ce_ib": 8.677960395812988, + "ce_orig": 0.8457415103912354, + "epoch": 0.2159752678122079, + "kl_loss": 0.6076983213424683, + "loss_ib": 0.014754943549633026, + "step": 751 + }, + { + "ce_ib": 10.253267288208008, + "ce_orig": 1.123890995979309, + "epoch": 0.21626285139118556, + "kl_loss": 0.30214011669158936, + "loss_ib": 0.013274667784571648, + "step": 752 + }, + { + "ce_ib": 11.143954277038574, + "ce_orig": 0.9621107578277588, + "epoch": 0.21626285139118556, + "kl_loss": 0.40844857692718506, + "loss_ib": 0.015228440053761005, + "step": 752 + }, + { + "ce_ib": 7.633870601654053, + "ce_orig": 0.7040098309516907, + "epoch": 0.21626285139118556, + "kl_loss": 0.4934859871864319, + "loss_ib": 0.012568730860948563, + "step": 752 + }, + { + "ce_ib": 15.08940601348877, + "ce_orig": 1.8990564346313477, + "epoch": 0.21626285139118556, + "kl_loss": 0.3932008445262909, + "loss_ib": 0.0190214142203331, + "step": 752 + }, + { + "ce_ib": 6.168686389923096, + "ce_orig": 0.654904305934906, + "epoch": 0.2165504349701632, + "kl_loss": 0.34978413581848145, + "loss_ib": 0.00966652762144804, + "step": 753 + }, + { + "ce_ib": 14.297883987426758, + "ce_orig": 1.6642775535583496, + "epoch": 0.2165504349701632, + "kl_loss": 0.41902047395706177, + "loss_ib": 0.018488090485334396, + "step": 753 + }, + { + "ce_ib": 8.20667839050293, + "ce_orig": 1.1301182508468628, + "epoch": 0.2165504349701632, + "kl_loss": 0.36091458797454834, + "loss_ib": 0.011815824545919895, + "step": 753 + }, + { + "ce_ib": 10.069759368896484, + "ce_orig": 0.8426340818405151, + "epoch": 0.2165504349701632, + "kl_loss": 0.4236080050468445, + "loss_ib": 0.0143058393150568, + "step": 753 + }, + { + "ce_ib": 9.213713645935059, + "ce_orig": 1.3873950242996216, + "epoch": 0.21683801854914084, + "kl_loss": 0.41740885376930237, + "loss_ib": 0.013387802988290787, + "step": 754 + }, + { + "ce_ib": 8.983365058898926, + "ce_orig": 0.7299124598503113, + "epoch": 0.21683801854914084, + "kl_loss": 0.347156286239624, + "loss_ib": 0.012454927898943424, + "step": 754 + }, + { + "ce_ib": 8.433507919311523, + "ce_orig": 1.1746602058410645, + "epoch": 0.21683801854914084, + "kl_loss": 0.3870384693145752, + "loss_ib": 0.012303893454372883, + "step": 754 + }, + { + "ce_ib": 8.158003807067871, + "ce_orig": 0.9122118353843689, + "epoch": 0.21683801854914084, + "kl_loss": 0.32447659969329834, + "loss_ib": 0.011402769014239311, + "step": 754 + }, + { + "epoch": 0.2171256021281185, + "grad_norm": 0.10924255847930908, + "learning_rate": 9.954480531907935e-06, + "loss": 0.9322, + "step": 755 + }, + { + "ce_ib": 7.25479793548584, + "ce_orig": 0.7606655955314636, + "epoch": 0.2171256021281185, + "kl_loss": 0.38670188188552856, + "loss_ib": 0.011121816001832485, + "step": 755 + }, + { + "ce_ib": 6.150333881378174, + "ce_orig": 0.5132085680961609, + "epoch": 0.2171256021281185, + "kl_loss": 0.5129187107086182, + "loss_ib": 0.011279520578682423, + "step": 755 + }, + { + "ce_ib": 8.504961967468262, + "ce_orig": 0.43100351095199585, + "epoch": 0.2171256021281185, + "kl_loss": 0.36954519152641296, + "loss_ib": 0.012200413271784782, + "step": 755 + }, + { + "ce_ib": 7.746326923370361, + "ce_orig": 0.9925227165222168, + "epoch": 0.2171256021281185, + "kl_loss": 0.35602182149887085, + "loss_ib": 0.011306545697152615, + "step": 755 + }, + { + "ce_ib": 11.200096130371094, + "ce_orig": 0.693459689617157, + "epoch": 0.2174131857070961, + "kl_loss": 0.43727046251296997, + "loss_ib": 0.015572800301015377, + "step": 756 + }, + { + "ce_ib": 9.643890380859375, + "ce_orig": 0.9979680180549622, + "epoch": 0.2174131857070961, + "kl_loss": 0.4014120399951935, + "loss_ib": 0.0136580104008317, + "step": 756 + }, + { + "ce_ib": 9.413188934326172, + "ce_orig": 0.6425915360450745, + "epoch": 0.2174131857070961, + "kl_loss": 0.41752490401268005, + "loss_ib": 0.0135884378105402, + "step": 756 + }, + { + "ce_ib": 8.591798782348633, + "ce_orig": 0.6160233020782471, + "epoch": 0.2174131857070961, + "kl_loss": 0.39359089732170105, + "loss_ib": 0.0125277079641819, + "step": 756 + }, + { + "ce_ib": 9.97320556640625, + "ce_orig": 0.9202979803085327, + "epoch": 0.21770076928607376, + "kl_loss": 0.38434839248657227, + "loss_ib": 0.01381669007241726, + "step": 757 + }, + { + "ce_ib": 11.336276054382324, + "ce_orig": 1.1318460702896118, + "epoch": 0.21770076928607376, + "kl_loss": 0.426124632358551, + "loss_ib": 0.015597522258758545, + "step": 757 + }, + { + "ce_ib": 7.240839958190918, + "ce_orig": 0.6424944996833801, + "epoch": 0.21770076928607376, + "kl_loss": 0.47551506757736206, + "loss_ib": 0.011995989829301834, + "step": 757 + }, + { + "ce_ib": 8.176185607910156, + "ce_orig": 0.8402231931686401, + "epoch": 0.21770076928607376, + "kl_loss": 0.3214970827102661, + "loss_ib": 0.011391155421733856, + "step": 757 + }, + { + "ce_ib": 7.582910060882568, + "ce_orig": 0.6925665736198425, + "epoch": 0.21798835286505142, + "kl_loss": 0.2832297086715698, + "loss_ib": 0.010415206663310528, + "step": 758 + }, + { + "ce_ib": 6.832791805267334, + "ce_orig": 0.514137327671051, + "epoch": 0.21798835286505142, + "kl_loss": 0.3456891179084778, + "loss_ib": 0.010289683006703854, + "step": 758 + }, + { + "ce_ib": 10.459983825683594, + "ce_orig": 1.1698083877563477, + "epoch": 0.21798835286505142, + "kl_loss": 0.33782780170440674, + "loss_ib": 0.013838262297213078, + "step": 758 + }, + { + "ce_ib": 8.3121919631958, + "ce_orig": 0.6954447627067566, + "epoch": 0.21798835286505142, + "kl_loss": 0.38117825984954834, + "loss_ib": 0.012123974040150642, + "step": 758 + }, + { + "ce_ib": 6.0204010009765625, + "ce_orig": 0.7387468814849854, + "epoch": 0.21827593644402904, + "kl_loss": 0.42067545652389526, + "loss_ib": 0.01022715587168932, + "step": 759 + }, + { + "ce_ib": 8.076128005981445, + "ce_orig": 0.6554126143455505, + "epoch": 0.21827593644402904, + "kl_loss": 0.3814757466316223, + "loss_ib": 0.011890885420143604, + "step": 759 + }, + { + "ce_ib": 7.948196887969971, + "ce_orig": 0.7137631177902222, + "epoch": 0.21827593644402904, + "kl_loss": 0.4568563997745514, + "loss_ib": 0.012516760267317295, + "step": 759 + }, + { + "ce_ib": 7.82420015335083, + "ce_orig": 0.8517636060714722, + "epoch": 0.21827593644402904, + "kl_loss": 0.38165003061294556, + "loss_ib": 0.011640701442956924, + "step": 759 + }, + { + "epoch": 0.2185635200230067, + "grad_norm": 0.10263609886169434, + "learning_rate": 9.953429730181653e-06, + "loss": 0.897, + "step": 760 + }, + { + "ce_ib": 9.436492919921875, + "ce_orig": 0.9970396757125854, + "epoch": 0.2185635200230067, + "kl_loss": 0.350554496049881, + "loss_ib": 0.012942037545144558, + "step": 760 + }, + { + "ce_ib": 8.581624984741211, + "ce_orig": 0.445902556180954, + "epoch": 0.2185635200230067, + "kl_loss": 0.44985431432724, + "loss_ib": 0.013080167584121227, + "step": 760 + }, + { + "ce_ib": 5.427255153656006, + "ce_orig": 0.6384531855583191, + "epoch": 0.2185635200230067, + "kl_loss": 0.3149919807910919, + "loss_ib": 0.008577174507081509, + "step": 760 + }, + { + "ce_ib": 12.8211030960083, + "ce_orig": 1.4226137399673462, + "epoch": 0.2185635200230067, + "kl_loss": 0.297141969203949, + "loss_ib": 0.015792522579431534, + "step": 760 + }, + { + "ce_ib": 12.39339542388916, + "ce_orig": 1.4045170545578003, + "epoch": 0.21885110360198431, + "kl_loss": 0.3612511157989502, + "loss_ib": 0.016005907207727432, + "step": 761 + }, + { + "ce_ib": 7.98706579208374, + "ce_orig": 0.4089260399341583, + "epoch": 0.21885110360198431, + "kl_loss": 0.5325812101364136, + "loss_ib": 0.013312878087162971, + "step": 761 + }, + { + "ce_ib": 9.43854808807373, + "ce_orig": 0.47952842712402344, + "epoch": 0.21885110360198431, + "kl_loss": 0.39229825139045715, + "loss_ib": 0.013361530378460884, + "step": 761 + }, + { + "ce_ib": 7.874027729034424, + "ce_orig": 0.8605637550354004, + "epoch": 0.21885110360198431, + "kl_loss": 0.3315168619155884, + "loss_ib": 0.01118919625878334, + "step": 761 + }, + { + "ce_ib": 14.053482055664062, + "ce_orig": 1.8178443908691406, + "epoch": 0.21913868718096197, + "kl_loss": 0.4107596278190613, + "loss_ib": 0.018161077052354813, + "step": 762 + }, + { + "ce_ib": 12.965909957885742, + "ce_orig": 1.3146981000900269, + "epoch": 0.21913868718096197, + "kl_loss": 0.35652273893356323, + "loss_ib": 0.016531137749552727, + "step": 762 + }, + { + "ce_ib": 8.16763973236084, + "ce_orig": 1.06657874584198, + "epoch": 0.21913868718096197, + "kl_loss": 0.3259735703468323, + "loss_ib": 0.011427376419305801, + "step": 762 + }, + { + "ce_ib": 9.887922286987305, + "ce_orig": 0.7984805107116699, + "epoch": 0.21913868718096197, + "kl_loss": 0.4151677191257477, + "loss_ib": 0.01403959933668375, + "step": 762 + }, + { + "ce_ib": 12.173575401306152, + "ce_orig": 0.7809048891067505, + "epoch": 0.21942627075993962, + "kl_loss": 0.8220731019973755, + "loss_ib": 0.020394306629896164, + "step": 763 + }, + { + "ce_ib": 11.080092430114746, + "ce_orig": 1.3501423597335815, + "epoch": 0.21942627075993962, + "kl_loss": 0.3217220902442932, + "loss_ib": 0.014297313056886196, + "step": 763 + }, + { + "ce_ib": 5.698494911193848, + "ce_orig": 0.587028443813324, + "epoch": 0.21942627075993962, + "kl_loss": 0.4448961913585663, + "loss_ib": 0.010147457011044025, + "step": 763 + }, + { + "ce_ib": 7.876105785369873, + "ce_orig": 0.8657087087631226, + "epoch": 0.21942627075993962, + "kl_loss": 0.35243791341781616, + "loss_ib": 0.01140048447996378, + "step": 763 + }, + { + "ce_ib": 7.448467254638672, + "ce_orig": 0.6551787257194519, + "epoch": 0.21971385433891724, + "kl_loss": 0.31218671798706055, + "loss_ib": 0.010570335201919079, + "step": 764 + }, + { + "ce_ib": 6.403468608856201, + "ce_orig": 0.5633848905563354, + "epoch": 0.21971385433891724, + "kl_loss": 0.6558365821838379, + "loss_ib": 0.01296183466911316, + "step": 764 + }, + { + "ce_ib": 5.391688346862793, + "ce_orig": 0.3037964403629303, + "epoch": 0.21971385433891724, + "kl_loss": 0.3110879361629486, + "loss_ib": 0.008502568118274212, + "step": 764 + }, + { + "ce_ib": 11.914216995239258, + "ce_orig": 0.712795078754425, + "epoch": 0.21971385433891724, + "kl_loss": 0.8264556527137756, + "loss_ib": 0.020178772509098053, + "step": 764 + }, + { + "epoch": 0.2200014379178949, + "grad_norm": 0.08830351382493973, + "learning_rate": 9.952366994493438e-06, + "loss": 0.8629, + "step": 765 + }, + { + "ce_ib": 10.298672676086426, + "ce_orig": 1.1052519083023071, + "epoch": 0.2200014379178949, + "kl_loss": 0.29135391116142273, + "loss_ib": 0.013212212361395359, + "step": 765 + }, + { + "ce_ib": 5.10940408706665, + "ce_orig": 0.6949278116226196, + "epoch": 0.2200014379178949, + "kl_loss": 0.2762865126132965, + "loss_ib": 0.007872268557548523, + "step": 765 + }, + { + "ce_ib": 10.096806526184082, + "ce_orig": 1.0484293699264526, + "epoch": 0.2200014379178949, + "kl_loss": 0.4013941287994385, + "loss_ib": 0.0141107477247715, + "step": 765 + }, + { + "ce_ib": 6.776886463165283, + "ce_orig": 0.8039520382881165, + "epoch": 0.2200014379178949, + "kl_loss": 0.2949376702308655, + "loss_ib": 0.009726262651383877, + "step": 765 + }, + { + "ce_ib": 5.609190940856934, + "ce_orig": 0.5181265473365784, + "epoch": 0.22028902149687252, + "kl_loss": 0.31120482087135315, + "loss_ib": 0.008721238933503628, + "step": 766 + }, + { + "ce_ib": 7.555119037628174, + "ce_orig": 0.7217943668365479, + "epoch": 0.22028902149687252, + "kl_loss": 0.34005963802337646, + "loss_ib": 0.010955714620649815, + "step": 766 + }, + { + "ce_ib": 9.171826362609863, + "ce_orig": 1.1021441221237183, + "epoch": 0.22028902149687252, + "kl_loss": 0.39605045318603516, + "loss_ib": 0.01313233096152544, + "step": 766 + }, + { + "ce_ib": 9.409163475036621, + "ce_orig": 0.983971893787384, + "epoch": 0.22028902149687252, + "kl_loss": 0.3282513916492462, + "loss_ib": 0.012691677547991276, + "step": 766 + }, + { + "ce_ib": 10.134336471557617, + "ce_orig": 0.8422884345054626, + "epoch": 0.22057660507585017, + "kl_loss": 0.6862824559211731, + "loss_ib": 0.016997160390019417, + "step": 767 + }, + { + "ce_ib": 8.78597640991211, + "ce_orig": 0.6801238656044006, + "epoch": 0.22057660507585017, + "kl_loss": 0.383102685213089, + "loss_ib": 0.012617003172636032, + "step": 767 + }, + { + "ce_ib": 10.374561309814453, + "ce_orig": 1.0352544784545898, + "epoch": 0.22057660507585017, + "kl_loss": 0.32708740234375, + "loss_ib": 0.013645435683429241, + "step": 767 + }, + { + "ce_ib": 4.927638530731201, + "ce_orig": 0.5614314079284668, + "epoch": 0.22057660507585017, + "kl_loss": 0.37318044900894165, + "loss_ib": 0.008659442886710167, + "step": 767 + }, + { + "ce_ib": 11.366865158081055, + "ce_orig": 0.8449366688728333, + "epoch": 0.22086418865482782, + "kl_loss": 0.36128175258636475, + "loss_ib": 0.014979682862758636, + "step": 768 + }, + { + "ce_ib": 9.95175838470459, + "ce_orig": 1.076265811920166, + "epoch": 0.22086418865482782, + "kl_loss": 0.3022347688674927, + "loss_ib": 0.012974105775356293, + "step": 768 + }, + { + "ce_ib": 10.05750846862793, + "ce_orig": 0.8622808456420898, + "epoch": 0.22086418865482782, + "kl_loss": 0.3522017002105713, + "loss_ib": 0.013579525984823704, + "step": 768 + }, + { + "ce_ib": 8.465921401977539, + "ce_orig": 0.7081587910652161, + "epoch": 0.22086418865482782, + "kl_loss": 0.4725481867790222, + "loss_ib": 0.01319140288978815, + "step": 768 + }, + { + "ce_ib": 12.121952056884766, + "ce_orig": 0.9619759917259216, + "epoch": 0.22115177223380544, + "kl_loss": 0.5650833249092102, + "loss_ib": 0.01777278631925583, + "step": 769 + }, + { + "ce_ib": 7.01571798324585, + "ce_orig": 0.5827529430389404, + "epoch": 0.22115177223380544, + "kl_loss": 0.36164534091949463, + "loss_ib": 0.010632171295583248, + "step": 769 + }, + { + "ce_ib": 9.584782600402832, + "ce_orig": 0.9394201636314392, + "epoch": 0.22115177223380544, + "kl_loss": 0.3103780746459961, + "loss_ib": 0.012688562273979187, + "step": 769 + }, + { + "ce_ib": 13.475021362304688, + "ce_orig": 1.4583699703216553, + "epoch": 0.22115177223380544, + "kl_loss": 0.4726155996322632, + "loss_ib": 0.018201176077127457, + "step": 769 + }, + { + "epoch": 0.2214393558127831, + "grad_norm": 0.09313464909791946, + "learning_rate": 9.951292327403663e-06, + "loss": 0.9476, + "step": 770 + }, + { + "ce_ib": 10.434471130371094, + "ce_orig": 1.0447543859481812, + "epoch": 0.2214393558127831, + "kl_loss": 0.3816416561603546, + "loss_ib": 0.01425088755786419, + "step": 770 + }, + { + "ce_ib": 6.2150187492370605, + "ce_orig": 0.7222704291343689, + "epoch": 0.2214393558127831, + "kl_loss": 0.3173387050628662, + "loss_ib": 0.009388405829668045, + "step": 770 + }, + { + "ce_ib": 15.137702941894531, + "ce_orig": 2.1303963661193848, + "epoch": 0.2214393558127831, + "kl_loss": 0.3662612736225128, + "loss_ib": 0.018800314515829086, + "step": 770 + }, + { + "ce_ib": 9.431180953979492, + "ce_orig": 0.7595182061195374, + "epoch": 0.2214393558127831, + "kl_loss": 0.35950183868408203, + "loss_ib": 0.01302619930356741, + "step": 770 + }, + { + "ce_ib": 7.864904880523682, + "ce_orig": 0.48237812519073486, + "epoch": 0.22172693939176072, + "kl_loss": 0.35039085149765015, + "loss_ib": 0.011368812993168831, + "step": 771 + }, + { + "ce_ib": 10.381065368652344, + "ce_orig": 0.9781961441040039, + "epoch": 0.22172693939176072, + "kl_loss": 0.45010584592819214, + "loss_ib": 0.014882123097777367, + "step": 771 + }, + { + "ce_ib": 8.742616653442383, + "ce_orig": 0.7862986326217651, + "epoch": 0.22172693939176072, + "kl_loss": 0.439785897731781, + "loss_ib": 0.013140475377440453, + "step": 771 + }, + { + "ce_ib": 5.651560306549072, + "ce_orig": 0.42288464307785034, + "epoch": 0.22172693939176072, + "kl_loss": 0.660328209400177, + "loss_ib": 0.012254842557013035, + "step": 771 + }, + { + "ce_ib": 6.206586837768555, + "ce_orig": 0.48183295130729675, + "epoch": 0.22201452297073837, + "kl_loss": 0.30323436856269836, + "loss_ib": 0.00923893041908741, + "step": 772 + }, + { + "ce_ib": 9.07832145690918, + "ce_orig": 0.8749274611473083, + "epoch": 0.22201452297073837, + "kl_loss": 0.41655945777893066, + "loss_ib": 0.013243915513157845, + "step": 772 + }, + { + "ce_ib": 6.524220943450928, + "ce_orig": 0.6328911185264587, + "epoch": 0.22201452297073837, + "kl_loss": 0.4318011701107025, + "loss_ib": 0.010842232033610344, + "step": 772 + }, + { + "ce_ib": 5.257693290710449, + "ce_orig": 0.3358021080493927, + "epoch": 0.22201452297073837, + "kl_loss": 0.25356054306030273, + "loss_ib": 0.007793298922479153, + "step": 772 + }, + { + "ce_ib": 8.05919075012207, + "ce_orig": 0.5993462800979614, + "epoch": 0.22230210654971602, + "kl_loss": 0.37483856081962585, + "loss_ib": 0.011807575821876526, + "step": 773 + }, + { + "ce_ib": 7.331540584564209, + "ce_orig": 0.6803370118141174, + "epoch": 0.22230210654971602, + "kl_loss": 0.38549911975860596, + "loss_ib": 0.011186531744897366, + "step": 773 + }, + { + "ce_ib": 6.345643997192383, + "ce_orig": 0.6718232035636902, + "epoch": 0.22230210654971602, + "kl_loss": 0.38235121965408325, + "loss_ib": 0.010169154964387417, + "step": 773 + }, + { + "ce_ib": 12.004826545715332, + "ce_orig": 1.3992005586624146, + "epoch": 0.22230210654971602, + "kl_loss": 0.4480227530002594, + "loss_ib": 0.016485054045915604, + "step": 773 + }, + { + "ce_ib": 5.82366418838501, + "ce_orig": 0.5827267169952393, + "epoch": 0.22258969012869365, + "kl_loss": 0.38700664043426514, + "loss_ib": 0.009693730622529984, + "step": 774 + }, + { + "ce_ib": 12.45301342010498, + "ce_orig": 1.3923219442367554, + "epoch": 0.22258969012869365, + "kl_loss": 0.32873886823654175, + "loss_ib": 0.015740402042865753, + "step": 774 + }, + { + "ce_ib": 11.65266227722168, + "ce_orig": 1.4098445177078247, + "epoch": 0.22258969012869365, + "kl_loss": 0.34009307622909546, + "loss_ib": 0.015053593553602695, + "step": 774 + }, + { + "ce_ib": 7.026700019836426, + "ce_orig": 0.7507833242416382, + "epoch": 0.22258969012869365, + "kl_loss": 0.30786123871803284, + "loss_ib": 0.010105312801897526, + "step": 774 + }, + { + "epoch": 0.2228772737076713, + "grad_norm": 0.09665674716234207, + "learning_rate": 9.95020573150145e-06, + "loss": 0.8829, + "step": 775 + }, + { + "ce_ib": 9.703756332397461, + "ce_orig": 0.9238471388816833, + "epoch": 0.2228772737076713, + "kl_loss": 0.328873872756958, + "loss_ib": 0.012992494739592075, + "step": 775 + }, + { + "ce_ib": 13.514963150024414, + "ce_orig": 1.3463358879089355, + "epoch": 0.2228772737076713, + "kl_loss": 0.3467975854873657, + "loss_ib": 0.01698293909430504, + "step": 775 + }, + { + "ce_ib": 8.213980674743652, + "ce_orig": 0.8611528873443604, + "epoch": 0.2228772737076713, + "kl_loss": 0.7585752010345459, + "loss_ib": 0.015799731016159058, + "step": 775 + }, + { + "ce_ib": 8.563956260681152, + "ce_orig": 0.8483219146728516, + "epoch": 0.2228772737076713, + "kl_loss": 0.34036189317703247, + "loss_ib": 0.011967575177550316, + "step": 775 + }, + { + "ce_ib": 9.094470024108887, + "ce_orig": 1.0628784894943237, + "epoch": 0.22316485728664892, + "kl_loss": 0.46681421995162964, + "loss_ib": 0.013762611895799637, + "step": 776 + }, + { + "ce_ib": 7.872087478637695, + "ce_orig": 0.8701239228248596, + "epoch": 0.22316485728664892, + "kl_loss": 0.3346797227859497, + "loss_ib": 0.011218884028494358, + "step": 776 + }, + { + "ce_ib": 10.01878833770752, + "ce_orig": 0.8464388847351074, + "epoch": 0.22316485728664892, + "kl_loss": 0.47228649258613586, + "loss_ib": 0.014741652645170689, + "step": 776 + }, + { + "ce_ib": 7.2308549880981445, + "ce_orig": 0.8908596038818359, + "epoch": 0.22316485728664892, + "kl_loss": 0.3172210454940796, + "loss_ib": 0.010403065010905266, + "step": 776 + }, + { + "ce_ib": 8.690382957458496, + "ce_orig": 0.7194284200668335, + "epoch": 0.22345244086562657, + "kl_loss": 0.3316614031791687, + "loss_ib": 0.01200699619948864, + "step": 777 + }, + { + "ce_ib": 7.480838775634766, + "ce_orig": 0.7475316524505615, + "epoch": 0.22345244086562657, + "kl_loss": 0.3133096694946289, + "loss_ib": 0.010613935068249702, + "step": 777 + }, + { + "ce_ib": 16.189916610717773, + "ce_orig": 2.0802390575408936, + "epoch": 0.22345244086562657, + "kl_loss": 0.484038770198822, + "loss_ib": 0.021030303090810776, + "step": 777 + }, + { + "ce_ib": 7.3860626220703125, + "ce_orig": 0.5432776212692261, + "epoch": 0.22345244086562657, + "kl_loss": 0.39021003246307373, + "loss_ib": 0.011288163252174854, + "step": 777 + }, + { + "ce_ib": 5.361101150512695, + "ce_orig": 0.6422094106674194, + "epoch": 0.22374002444460422, + "kl_loss": 0.3227311372756958, + "loss_ib": 0.008588411845266819, + "step": 778 + }, + { + "ce_ib": 11.809074401855469, + "ce_orig": 1.1206345558166504, + "epoch": 0.22374002444460422, + "kl_loss": 0.34884482622146606, + "loss_ib": 0.015297521837055683, + "step": 778 + }, + { + "ce_ib": 8.916701316833496, + "ce_orig": 0.8444573283195496, + "epoch": 0.22374002444460422, + "kl_loss": 0.3931344151496887, + "loss_ib": 0.01284804567694664, + "step": 778 + }, + { + "ce_ib": 7.642374038696289, + "ce_orig": 1.1561375856399536, + "epoch": 0.22374002444460422, + "kl_loss": 0.35411643981933594, + "loss_ib": 0.011183538474142551, + "step": 778 + }, + { + "ce_ib": 6.7063140869140625, + "ce_orig": 0.6542351245880127, + "epoch": 0.22402760802358185, + "kl_loss": 0.34912005066871643, + "loss_ib": 0.010197513736784458, + "step": 779 + }, + { + "ce_ib": 7.014062404632568, + "ce_orig": 0.5142630934715271, + "epoch": 0.22402760802358185, + "kl_loss": 0.34287211298942566, + "loss_ib": 0.010442784056067467, + "step": 779 + }, + { + "ce_ib": 9.529996871948242, + "ce_orig": 0.7422083616256714, + "epoch": 0.22402760802358185, + "kl_loss": 0.3641512989997864, + "loss_ib": 0.013171510770916939, + "step": 779 + }, + { + "ce_ib": 6.52680778503418, + "ce_orig": 0.8021174669265747, + "epoch": 0.22402760802358185, + "kl_loss": 0.32752007246017456, + "loss_ib": 0.009802008979022503, + "step": 779 + }, + { + "epoch": 0.2243151916025595, + "grad_norm": 0.10656443983316422, + "learning_rate": 9.949107209404664e-06, + "loss": 0.8954, + "step": 780 + }, + { + "ce_ib": 9.61880874633789, + "ce_orig": 0.8786146640777588, + "epoch": 0.2243151916025595, + "kl_loss": 0.3575580418109894, + "loss_ib": 0.013194388709962368, + "step": 780 + }, + { + "ce_ib": 10.178520202636719, + "ce_orig": 0.807783305644989, + "epoch": 0.2243151916025595, + "kl_loss": 0.342892587184906, + "loss_ib": 0.013607447035610676, + "step": 780 + }, + { + "ce_ib": 9.118548393249512, + "ce_orig": 1.1404297351837158, + "epoch": 0.2243151916025595, + "kl_loss": 0.30617213249206543, + "loss_ib": 0.012180269695818424, + "step": 780 + }, + { + "ce_ib": 10.034414291381836, + "ce_orig": 0.7776552438735962, + "epoch": 0.2243151916025595, + "kl_loss": 0.4022853970527649, + "loss_ib": 0.014057268388569355, + "step": 780 + }, + { + "ce_ib": 9.740767478942871, + "ce_orig": 1.099388599395752, + "epoch": 0.22460277518153712, + "kl_loss": 0.6486046314239502, + "loss_ib": 0.016226813197135925, + "step": 781 + }, + { + "ce_ib": 6.6487016677856445, + "ce_orig": 0.6519371271133423, + "epoch": 0.22460277518153712, + "kl_loss": 0.3416748344898224, + "loss_ib": 0.010065450333058834, + "step": 781 + }, + { + "ce_ib": 4.7245869636535645, + "ce_orig": 0.6195375323295593, + "epoch": 0.22460277518153712, + "kl_loss": 0.2902180552482605, + "loss_ib": 0.007626766804605722, + "step": 781 + }, + { + "ce_ib": 12.154743194580078, + "ce_orig": 1.4246693849563599, + "epoch": 0.22460277518153712, + "kl_loss": 0.25261539220809937, + "loss_ib": 0.014680897817015648, + "step": 781 + }, + { + "ce_ib": 5.439098834991455, + "ce_orig": 0.8269286155700684, + "epoch": 0.22489035876051477, + "kl_loss": 0.34107422828674316, + "loss_ib": 0.00884984154254198, + "step": 782 + }, + { + "ce_ib": 9.174774169921875, + "ce_orig": 1.1341043710708618, + "epoch": 0.22489035876051477, + "kl_loss": 0.3221195936203003, + "loss_ib": 0.012395970523357391, + "step": 782 + }, + { + "ce_ib": 9.228188514709473, + "ce_orig": 1.234445333480835, + "epoch": 0.22489035876051477, + "kl_loss": 0.30531132221221924, + "loss_ib": 0.012281300500035286, + "step": 782 + }, + { + "ce_ib": 5.356451034545898, + "ce_orig": 0.8137478828430176, + "epoch": 0.22489035876051477, + "kl_loss": 0.37726348638534546, + "loss_ib": 0.009129085578024387, + "step": 782 + }, + { + "ce_ib": 9.502201080322266, + "ce_orig": 0.8526181578636169, + "epoch": 0.22517794233949243, + "kl_loss": 0.6065495014190674, + "loss_ib": 0.01556769572198391, + "step": 783 + }, + { + "ce_ib": 7.823635578155518, + "ce_orig": 0.6923622488975525, + "epoch": 0.22517794233949243, + "kl_loss": 0.394509494304657, + "loss_ib": 0.011768730357289314, + "step": 783 + }, + { + "ce_ib": 6.387758255004883, + "ce_orig": 0.6680426001548767, + "epoch": 0.22517794233949243, + "kl_loss": 0.30215880274772644, + "loss_ib": 0.009409346617758274, + "step": 783 + }, + { + "ce_ib": 7.9621663093566895, + "ce_orig": 0.2912781834602356, + "epoch": 0.22517794233949243, + "kl_loss": 0.6065285205841064, + "loss_ib": 0.01402745209634304, + "step": 783 + }, + { + "ce_ib": 8.740853309631348, + "ce_orig": 1.1666960716247559, + "epoch": 0.22546552591847005, + "kl_loss": 0.26575469970703125, + "loss_ib": 0.011398401111364365, + "step": 784 + }, + { + "ce_ib": 11.868843078613281, + "ce_orig": 1.1964963674545288, + "epoch": 0.22546552591847005, + "kl_loss": 0.34643134474754333, + "loss_ib": 0.01533315610140562, + "step": 784 + }, + { + "ce_ib": 11.849778175354004, + "ce_orig": 0.6975755095481873, + "epoch": 0.22546552591847005, + "kl_loss": 0.48860257863998413, + "loss_ib": 0.016735803335905075, + "step": 784 + }, + { + "ce_ib": 7.8105998039245605, + "ce_orig": 1.0470623970031738, + "epoch": 0.22546552591847005, + "kl_loss": 0.31021207571029663, + "loss_ib": 0.010912721045315266, + "step": 784 + }, + { + "epoch": 0.2257531094974477, + "grad_norm": 0.11681295186281204, + "learning_rate": 9.9479967637599e-06, + "loss": 0.9212, + "step": 785 + }, + { + "ce_ib": 11.292010307312012, + "ce_orig": 1.259683609008789, + "epoch": 0.2257531094974477, + "kl_loss": 0.2796482443809509, + "loss_ib": 0.014088491909205914, + "step": 785 + }, + { + "ce_ib": 10.634122848510742, + "ce_orig": 0.9388463497161865, + "epoch": 0.2257531094974477, + "kl_loss": 0.33059000968933105, + "loss_ib": 0.013940023258328438, + "step": 785 + }, + { + "ce_ib": 12.245208740234375, + "ce_orig": 1.6006643772125244, + "epoch": 0.2257531094974477, + "kl_loss": 0.43430644273757935, + "loss_ib": 0.016588272526860237, + "step": 785 + }, + { + "ce_ib": 7.943255424499512, + "ce_orig": 0.701815128326416, + "epoch": 0.2257531094974477, + "kl_loss": 0.42579883337020874, + "loss_ib": 0.012201243080198765, + "step": 785 + }, + { + "ce_ib": 5.053991794586182, + "ce_orig": 0.5053215622901917, + "epoch": 0.22604069307642533, + "kl_loss": 0.31988024711608887, + "loss_ib": 0.008252793923020363, + "step": 786 + }, + { + "ce_ib": 10.378332138061523, + "ce_orig": 1.0649505853652954, + "epoch": 0.22604069307642533, + "kl_loss": 0.3701839745044708, + "loss_ib": 0.014080171473324299, + "step": 786 + }, + { + "ce_ib": 6.406869888305664, + "ce_orig": 0.7461100220680237, + "epoch": 0.22604069307642533, + "kl_loss": 0.3331264853477478, + "loss_ib": 0.0097381342202425, + "step": 786 + }, + { + "ce_ib": 9.13780689239502, + "ce_orig": 0.6390551924705505, + "epoch": 0.22604069307642533, + "kl_loss": 0.5007272958755493, + "loss_ib": 0.014145080000162125, + "step": 786 + }, + { + "ce_ib": 8.147912979125977, + "ce_orig": 0.5080342292785645, + "epoch": 0.22632827665540298, + "kl_loss": 0.4450484812259674, + "loss_ib": 0.012598397210240364, + "step": 787 + }, + { + "ce_ib": 7.028100490570068, + "ce_orig": 0.593861997127533, + "epoch": 0.22632827665540298, + "kl_loss": 0.4049058258533478, + "loss_ib": 0.011077158153057098, + "step": 787 + }, + { + "ce_ib": 8.043399810791016, + "ce_orig": 0.4268825054168701, + "epoch": 0.22632827665540298, + "kl_loss": 0.35771211981773376, + "loss_ib": 0.011620521545410156, + "step": 787 + }, + { + "ce_ib": 6.9142866134643555, + "ce_orig": 0.4698316156864166, + "epoch": 0.22632827665540298, + "kl_loss": 0.8020865321159363, + "loss_ib": 0.014935152605175972, + "step": 787 + }, + { + "ce_ib": 6.95719575881958, + "ce_orig": 0.8023804426193237, + "epoch": 0.22661586023438063, + "kl_loss": 0.3693576455116272, + "loss_ib": 0.010650772601366043, + "step": 788 + }, + { + "ce_ib": 12.449686050415039, + "ce_orig": 1.3771573305130005, + "epoch": 0.22661586023438063, + "kl_loss": 0.4264632761478424, + "loss_ib": 0.016714317724108696, + "step": 788 + }, + { + "ce_ib": 11.89621353149414, + "ce_orig": 1.398105502128601, + "epoch": 0.22661586023438063, + "kl_loss": 0.30330199003219604, + "loss_ib": 0.014929232187569141, + "step": 788 + }, + { + "ce_ib": 5.163127422332764, + "ce_orig": 0.5273948907852173, + "epoch": 0.22661586023438063, + "kl_loss": 0.360205739736557, + "loss_ib": 0.008765184320509434, + "step": 788 + }, + { + "ce_ib": 10.611333847045898, + "ce_orig": 0.8973667025566101, + "epoch": 0.22690344381335825, + "kl_loss": 0.42807987332344055, + "loss_ib": 0.01489213202148676, + "step": 789 + }, + { + "ce_ib": 4.420098304748535, + "ce_orig": 0.5175418853759766, + "epoch": 0.22690344381335825, + "kl_loss": 0.30578553676605225, + "loss_ib": 0.007477953098714352, + "step": 789 + }, + { + "ce_ib": 7.4076008796691895, + "ce_orig": 0.5371741056442261, + "epoch": 0.22690344381335825, + "kl_loss": 0.3347129821777344, + "loss_ib": 0.010754730552434921, + "step": 789 + }, + { + "ce_ib": 9.960221290588379, + "ce_orig": 0.9401814937591553, + "epoch": 0.22690344381335825, + "kl_loss": 0.3342566192150116, + "loss_ib": 0.013302787207067013, + "step": 789 + }, + { + "epoch": 0.2271910273923359, + "grad_norm": 0.0974324494600296, + "learning_rate": 9.946874397242474e-06, + "loss": 0.9038, + "step": 790 + }, + { + "ce_ib": 10.756206512451172, + "ce_orig": 0.7469778060913086, + "epoch": 0.2271910273923359, + "kl_loss": 0.3871549963951111, + "loss_ib": 0.014627756550908089, + "step": 790 + }, + { + "ce_ib": 11.253997802734375, + "ce_orig": 1.2753102779388428, + "epoch": 0.2271910273923359, + "kl_loss": 0.37940990924835205, + "loss_ib": 0.015048096887767315, + "step": 790 + }, + { + "ce_ib": 7.96759557723999, + "ce_orig": 0.5226830840110779, + "epoch": 0.2271910273923359, + "kl_loss": 0.4906744360923767, + "loss_ib": 0.012874339707195759, + "step": 790 + }, + { + "ce_ib": 6.907197952270508, + "ce_orig": 0.8273372054100037, + "epoch": 0.2271910273923359, + "kl_loss": 0.29984915256500244, + "loss_ib": 0.009905689395964146, + "step": 790 + }, + { + "ce_ib": 11.577720642089844, + "ce_orig": 0.4754190146923065, + "epoch": 0.22747861097131353, + "kl_loss": 0.5800215601921082, + "loss_ib": 0.017377937212586403, + "step": 791 + }, + { + "ce_ib": 7.266067028045654, + "ce_orig": 0.7913497090339661, + "epoch": 0.22747861097131353, + "kl_loss": 0.32481786608695984, + "loss_ib": 0.010514246299862862, + "step": 791 + }, + { + "ce_ib": 10.663007736206055, + "ce_orig": 0.8187626004219055, + "epoch": 0.22747861097131353, + "kl_loss": 0.36268165707588196, + "loss_ib": 0.014289823360741138, + "step": 791 + }, + { + "ce_ib": 10.45195484161377, + "ce_orig": 1.220002293586731, + "epoch": 0.22747861097131353, + "kl_loss": 0.37439438700675964, + "loss_ib": 0.014195898547768593, + "step": 791 + }, + { + "ce_ib": 10.41888427734375, + "ce_orig": 0.6695852279663086, + "epoch": 0.22776619455029118, + "kl_loss": 0.5112478733062744, + "loss_ib": 0.01553136296570301, + "step": 792 + }, + { + "ce_ib": 10.26807689666748, + "ce_orig": 1.4850273132324219, + "epoch": 0.22776619455029118, + "kl_loss": 0.45192694664001465, + "loss_ib": 0.014787346124649048, + "step": 792 + }, + { + "ce_ib": 13.323302268981934, + "ce_orig": 1.7503565549850464, + "epoch": 0.22776619455029118, + "kl_loss": 0.3711824417114258, + "loss_ib": 0.01703512668609619, + "step": 792 + }, + { + "ce_ib": 5.721099853515625, + "ce_orig": 0.3669746220111847, + "epoch": 0.22776619455029118, + "kl_loss": 0.31216946244239807, + "loss_ib": 0.00884279515594244, + "step": 792 + }, + { + "ce_ib": 11.519103050231934, + "ce_orig": 0.9824369549751282, + "epoch": 0.22805377812926883, + "kl_loss": 0.3825758397579193, + "loss_ib": 0.015344860963523388, + "step": 793 + }, + { + "ce_ib": 6.631891250610352, + "ce_orig": 0.6841241121292114, + "epoch": 0.22805377812926883, + "kl_loss": 0.3822288513183594, + "loss_ib": 0.010454179719090462, + "step": 793 + }, + { + "ce_ib": 7.766302585601807, + "ce_orig": 0.9525435566902161, + "epoch": 0.22805377812926883, + "kl_loss": 0.3503475785255432, + "loss_ib": 0.011269778944551945, + "step": 793 + }, + { + "ce_ib": 10.979907035827637, + "ce_orig": 0.8087160587310791, + "epoch": 0.22805377812926883, + "kl_loss": 0.4700503349304199, + "loss_ib": 0.015680409967899323, + "step": 793 + }, + { + "ce_ib": 4.012631416320801, + "ce_orig": 0.49861544370651245, + "epoch": 0.22834136170824645, + "kl_loss": 0.8556938171386719, + "loss_ib": 0.012569569051265717, + "step": 794 + }, + { + "ce_ib": 8.279598236083984, + "ce_orig": 0.13107673823833466, + "epoch": 0.22834136170824645, + "kl_loss": 0.8502056002616882, + "loss_ib": 0.016781654208898544, + "step": 794 + }, + { + "ce_ib": 7.012378215789795, + "ce_orig": 0.4306791424751282, + "epoch": 0.22834136170824645, + "kl_loss": 0.317436158657074, + "loss_ib": 0.010186740197241306, + "step": 794 + }, + { + "ce_ib": 8.948698043823242, + "ce_orig": 0.9477734565734863, + "epoch": 0.22834136170824645, + "kl_loss": 0.6815387606620789, + "loss_ib": 0.015764085575938225, + "step": 794 + }, + { + "epoch": 0.2286289452872241, + "grad_norm": 0.11332917958498001, + "learning_rate": 9.945740112556433e-06, + "loss": 0.8909, + "step": 795 + }, + { + "ce_ib": 9.626729965209961, + "ce_orig": 0.807258665561676, + "epoch": 0.2286289452872241, + "kl_loss": 0.30290764570236206, + "loss_ib": 0.012655805796384811, + "step": 795 + }, + { + "ce_ib": 8.031062126159668, + "ce_orig": 0.6332983374595642, + "epoch": 0.2286289452872241, + "kl_loss": 0.4431205987930298, + "loss_ib": 0.012462267652153969, + "step": 795 + }, + { + "ce_ib": 6.183389663696289, + "ce_orig": 0.3490990400314331, + "epoch": 0.2286289452872241, + "kl_loss": 0.2828608751296997, + "loss_ib": 0.009011998772621155, + "step": 795 + }, + { + "ce_ib": 10.629033088684082, + "ce_orig": 1.2572027444839478, + "epoch": 0.2286289452872241, + "kl_loss": 0.2958335280418396, + "loss_ib": 0.013587366789579391, + "step": 795 + }, + { + "ce_ib": 12.632726669311523, + "ce_orig": 1.813147783279419, + "epoch": 0.22891652886620173, + "kl_loss": 0.34517595171928406, + "loss_ib": 0.01608448661863804, + "step": 796 + }, + { + "ce_ib": 9.120535850524902, + "ce_orig": 1.1200886964797974, + "epoch": 0.22891652886620173, + "kl_loss": 0.36460253596305847, + "loss_ib": 0.012766561470925808, + "step": 796 + }, + { + "ce_ib": 8.088251113891602, + "ce_orig": 0.8303477168083191, + "epoch": 0.22891652886620173, + "kl_loss": 0.3226301074028015, + "loss_ib": 0.01131455134600401, + "step": 796 + }, + { + "ce_ib": 10.398566246032715, + "ce_orig": 0.7473430633544922, + "epoch": 0.22891652886620173, + "kl_loss": 0.40119439363479614, + "loss_ib": 0.01441050972789526, + "step": 796 + }, + { + "ce_ib": 7.080221176147461, + "ce_orig": 0.7931245565414429, + "epoch": 0.22920411244517938, + "kl_loss": 0.29053109884262085, + "loss_ib": 0.00998553168028593, + "step": 797 + }, + { + "ce_ib": 7.256488800048828, + "ce_orig": 0.8034987449645996, + "epoch": 0.22920411244517938, + "kl_loss": 0.3614344596862793, + "loss_ib": 0.010870832949876785, + "step": 797 + }, + { + "ce_ib": 10.867740631103516, + "ce_orig": 0.8990861177444458, + "epoch": 0.22920411244517938, + "kl_loss": 0.46362748742103577, + "loss_ib": 0.015504015609622002, + "step": 797 + }, + { + "ce_ib": 9.055109977722168, + "ce_orig": 0.9181981086730957, + "epoch": 0.22920411244517938, + "kl_loss": 0.4794706106185913, + "loss_ib": 0.013849816285073757, + "step": 797 + }, + { + "ce_ib": 7.544148921966553, + "ce_orig": 0.6566915512084961, + "epoch": 0.22949169602415703, + "kl_loss": 0.30486202239990234, + "loss_ib": 0.010592768900096416, + "step": 798 + }, + { + "ce_ib": 5.6905059814453125, + "ce_orig": 0.8550714254379272, + "epoch": 0.22949169602415703, + "kl_loss": 0.45504266023635864, + "loss_ib": 0.010240932926535606, + "step": 798 + }, + { + "ce_ib": 6.176849842071533, + "ce_orig": 0.6315608024597168, + "epoch": 0.22949169602415703, + "kl_loss": 0.3964434266090393, + "loss_ib": 0.010141284205019474, + "step": 798 + }, + { + "ce_ib": 4.630974769592285, + "ce_orig": 0.44450247287750244, + "epoch": 0.22949169602415703, + "kl_loss": 0.36748573184013367, + "loss_ib": 0.00830583181232214, + "step": 798 + }, + { + "ce_ib": 11.174190521240234, + "ce_orig": 1.3261359930038452, + "epoch": 0.22977927960313466, + "kl_loss": 0.2634417414665222, + "loss_ib": 0.013808608055114746, + "step": 799 + }, + { + "ce_ib": 8.188764572143555, + "ce_orig": 0.5794906616210938, + "epoch": 0.22977927960313466, + "kl_loss": 0.40460944175720215, + "loss_ib": 0.01223485916852951, + "step": 799 + }, + { + "ce_ib": 8.544333457946777, + "ce_orig": 0.705375075340271, + "epoch": 0.22977927960313466, + "kl_loss": 0.3461841940879822, + "loss_ib": 0.012006175704300404, + "step": 799 + }, + { + "ce_ib": 10.321372985839844, + "ce_orig": 0.9484913945198059, + "epoch": 0.22977927960313466, + "kl_loss": 0.343988835811615, + "loss_ib": 0.01376126054674387, + "step": 799 + }, + { + "epoch": 0.2300668631821123, + "grad_norm": 0.08445550501346588, + "learning_rate": 9.94459391243453e-06, + "loss": 0.8778, + "step": 800 + }, + { + "ce_ib": 7.361359119415283, + "ce_orig": 0.678317129611969, + "epoch": 0.2300668631821123, + "kl_loss": 0.6545436382293701, + "loss_ib": 0.013906795531511307, + "step": 800 + }, + { + "ce_ib": 9.871505737304688, + "ce_orig": 1.0527350902557373, + "epoch": 0.2300668631821123, + "kl_loss": 0.6774921417236328, + "loss_ib": 0.016646428033709526, + "step": 800 + }, + { + "ce_ib": 10.61447525024414, + "ce_orig": 1.2886981964111328, + "epoch": 0.2300668631821123, + "kl_loss": 0.32457613945007324, + "loss_ib": 0.013860235922038555, + "step": 800 + }, + { + "ce_ib": 12.351025581359863, + "ce_orig": 1.449278712272644, + "epoch": 0.2300668631821123, + "kl_loss": 0.3870459496974945, + "loss_ib": 0.016221484169363976, + "step": 800 + }, + { + "ce_ib": 6.113245964050293, + "ce_orig": 0.49481338262557983, + "epoch": 0.23035444676108993, + "kl_loss": 0.31978094577789307, + "loss_ib": 0.009311055764555931, + "step": 801 + }, + { + "ce_ib": 9.732067108154297, + "ce_orig": 1.0109401941299438, + "epoch": 0.23035444676108993, + "kl_loss": 0.37637659907341003, + "loss_ib": 0.013495832681655884, + "step": 801 + }, + { + "ce_ib": 7.419297695159912, + "ce_orig": 0.680454671382904, + "epoch": 0.23035444676108993, + "kl_loss": 0.3423839509487152, + "loss_ib": 0.01084313727915287, + "step": 801 + }, + { + "ce_ib": 7.179284572601318, + "ce_orig": 0.8883647322654724, + "epoch": 0.23035444676108993, + "kl_loss": 0.3522745966911316, + "loss_ib": 0.01070203073322773, + "step": 801 + }, + { + "ce_ib": 12.911877632141113, + "ce_orig": 0.9866081476211548, + "epoch": 0.23064203034006758, + "kl_loss": 0.293899267911911, + "loss_ib": 0.015850869938731194, + "step": 802 + }, + { + "ce_ib": 10.869671821594238, + "ce_orig": 1.2515939474105835, + "epoch": 0.23064203034006758, + "kl_loss": 0.6851050853729248, + "loss_ib": 0.017720723524689674, + "step": 802 + }, + { + "ce_ib": 7.759640216827393, + "ce_orig": 0.6238282918930054, + "epoch": 0.23064203034006758, + "kl_loss": 0.36549922823905945, + "loss_ib": 0.011414632201194763, + "step": 802 + }, + { + "ce_ib": 3.205626964569092, + "ce_orig": 0.13584905862808228, + "epoch": 0.23064203034006758, + "kl_loss": 0.7906656265258789, + "loss_ib": 0.011112282983958721, + "step": 802 + }, + { + "ce_ib": 12.389301300048828, + "ce_orig": 1.3402420282363892, + "epoch": 0.23092961391904523, + "kl_loss": 0.35545194149017334, + "loss_ib": 0.015943819656968117, + "step": 803 + }, + { + "ce_ib": 10.47632122039795, + "ce_orig": 0.7908617854118347, + "epoch": 0.23092961391904523, + "kl_loss": 0.31981128454208374, + "loss_ib": 0.013674433343112469, + "step": 803 + }, + { + "ce_ib": 9.429079055786133, + "ce_orig": 1.371010661125183, + "epoch": 0.23092961391904523, + "kl_loss": 0.6973379850387573, + "loss_ib": 0.016402458772063255, + "step": 803 + }, + { + "ce_ib": 7.480784893035889, + "ce_orig": 0.8337413668632507, + "epoch": 0.23092961391904523, + "kl_loss": 0.3528236150741577, + "loss_ib": 0.01100902073085308, + "step": 803 + }, + { + "ce_ib": 5.842719078063965, + "ce_orig": 0.5573224425315857, + "epoch": 0.23121719749802286, + "kl_loss": 0.2670140862464905, + "loss_ib": 0.008512860164046288, + "step": 804 + }, + { + "ce_ib": 7.656364917755127, + "ce_orig": 0.7828370928764343, + "epoch": 0.23121719749802286, + "kl_loss": 0.2876054346561432, + "loss_ib": 0.010532419197261333, + "step": 804 + }, + { + "ce_ib": 4.491579532623291, + "ce_orig": 0.33045491576194763, + "epoch": 0.23121719749802286, + "kl_loss": 0.6809794902801514, + "loss_ib": 0.011301374062895775, + "step": 804 + }, + { + "ce_ib": 9.232762336730957, + "ce_orig": 0.7977848649024963, + "epoch": 0.23121719749802286, + "kl_loss": 0.2893211245536804, + "loss_ib": 0.012125973589718342, + "step": 804 + }, + { + "epoch": 0.2315047810770005, + "grad_norm": 0.09778746962547302, + "learning_rate": 9.943435799638226e-06, + "loss": 0.9126, + "step": 805 + }, + { + "ce_ib": 10.316106796264648, + "ce_orig": 1.1037230491638184, + "epoch": 0.2315047810770005, + "kl_loss": 0.38173002004623413, + "loss_ib": 0.014133407734334469, + "step": 805 + }, + { + "ce_ib": 10.278280258178711, + "ce_orig": 1.2289481163024902, + "epoch": 0.2315047810770005, + "kl_loss": 0.4344818592071533, + "loss_ib": 0.014623099006712437, + "step": 805 + }, + { + "ce_ib": 10.008515357971191, + "ce_orig": 1.114085078239441, + "epoch": 0.2315047810770005, + "kl_loss": 0.4228159785270691, + "loss_ib": 0.014236673712730408, + "step": 805 + }, + { + "ce_ib": 10.21235179901123, + "ce_orig": 1.434356451034546, + "epoch": 0.2315047810770005, + "kl_loss": 0.26860710978507996, + "loss_ib": 0.012898423708975315, + "step": 805 + }, + { + "ce_ib": 9.20308780670166, + "ce_orig": 0.6685881018638611, + "epoch": 0.23179236465597813, + "kl_loss": 0.35367828607559204, + "loss_ib": 0.012739870697259903, + "step": 806 + }, + { + "ce_ib": 12.784605026245117, + "ce_orig": 1.4579914808273315, + "epoch": 0.23179236465597813, + "kl_loss": 0.34686392545700073, + "loss_ib": 0.016253244131803513, + "step": 806 + }, + { + "ce_ib": 11.806829452514648, + "ce_orig": 1.5088831186294556, + "epoch": 0.23179236465597813, + "kl_loss": 0.28930962085723877, + "loss_ib": 0.014699925668537617, + "step": 806 + }, + { + "ce_ib": 9.593900680541992, + "ce_orig": 1.328742265701294, + "epoch": 0.23179236465597813, + "kl_loss": 0.33179858326911926, + "loss_ib": 0.012911886908113956, + "step": 806 + }, + { + "ce_ib": 9.164833068847656, + "ce_orig": 1.269349455833435, + "epoch": 0.23207994823495579, + "kl_loss": 0.3544562757015228, + "loss_ib": 0.012709395959973335, + "step": 807 + }, + { + "ce_ib": 7.470652103424072, + "ce_orig": 0.6890314817428589, + "epoch": 0.23207994823495579, + "kl_loss": 0.3289049565792084, + "loss_ib": 0.010759701952338219, + "step": 807 + }, + { + "ce_ib": 9.631521224975586, + "ce_orig": 1.2519800662994385, + "epoch": 0.23207994823495579, + "kl_loss": 0.2588420510292053, + "loss_ib": 0.01221994124352932, + "step": 807 + }, + { + "ce_ib": 8.615235328674316, + "ce_orig": 0.5310425162315369, + "epoch": 0.23207994823495579, + "kl_loss": 0.46589159965515137, + "loss_ib": 0.01327415183186531, + "step": 807 + }, + { + "ce_ib": 7.7025861740112305, + "ce_orig": 0.8470107913017273, + "epoch": 0.23236753181393344, + "kl_loss": 0.3353464603424072, + "loss_ib": 0.011056050658226013, + "step": 808 + }, + { + "ce_ib": 10.234197616577148, + "ce_orig": 1.0393426418304443, + "epoch": 0.23236753181393344, + "kl_loss": 0.49282306432724, + "loss_ib": 0.015162426978349686, + "step": 808 + }, + { + "ce_ib": 8.97604751586914, + "ce_orig": 0.8064647316932678, + "epoch": 0.23236753181393344, + "kl_loss": 0.4205772876739502, + "loss_ib": 0.013181819580495358, + "step": 808 + }, + { + "ce_ib": 10.455026626586914, + "ce_orig": 1.1835849285125732, + "epoch": 0.23236753181393344, + "kl_loss": 0.3391731381416321, + "loss_ib": 0.013846756890416145, + "step": 808 + }, + { + "ce_ib": 6.682322025299072, + "ce_orig": 0.7806753516197205, + "epoch": 0.23265511539291106, + "kl_loss": 0.2750164270401001, + "loss_ib": 0.00943248625844717, + "step": 809 + }, + { + "ce_ib": 7.129208087921143, + "ce_orig": 0.5480561256408691, + "epoch": 0.23265511539291106, + "kl_loss": 0.4469287395477295, + "loss_ib": 0.0115984957665205, + "step": 809 + }, + { + "ce_ib": 13.105687141418457, + "ce_orig": 0.685859739780426, + "epoch": 0.23265511539291106, + "kl_loss": 0.27275753021240234, + "loss_ib": 0.015833262354135513, + "step": 809 + }, + { + "ce_ib": 9.026814460754395, + "ce_orig": 1.057628870010376, + "epoch": 0.23265511539291106, + "kl_loss": 0.3957173228263855, + "loss_ib": 0.012983987107872963, + "step": 809 + }, + { + "epoch": 0.2329426989718887, + "grad_norm": 0.10287559032440186, + "learning_rate": 9.942265776957687e-06, + "loss": 0.9129, + "step": 810 + }, + { + "ce_ib": 9.951510429382324, + "ce_orig": 1.1167075634002686, + "epoch": 0.2329426989718887, + "kl_loss": 0.3010333776473999, + "loss_ib": 0.012961843982338905, + "step": 810 + }, + { + "ce_ib": 10.840296745300293, + "ce_orig": 1.0002007484436035, + "epoch": 0.2329426989718887, + "kl_loss": 0.38143882155418396, + "loss_ib": 0.01465468481183052, + "step": 810 + }, + { + "ce_ib": 7.044186592102051, + "ce_orig": 0.799129068851471, + "epoch": 0.2329426989718887, + "kl_loss": 0.3452165424823761, + "loss_ib": 0.0104963518679142, + "step": 810 + }, + { + "ce_ib": 13.050943374633789, + "ce_orig": 1.4166967868804932, + "epoch": 0.2329426989718887, + "kl_loss": 0.3720596432685852, + "loss_ib": 0.01677154004573822, + "step": 810 + }, + { + "ce_ib": 6.821801662445068, + "ce_orig": 0.7074012756347656, + "epoch": 0.23323028255086634, + "kl_loss": 0.3542909622192383, + "loss_ib": 0.010364711284637451, + "step": 811 + }, + { + "ce_ib": 8.195531845092773, + "ce_orig": 1.0204046964645386, + "epoch": 0.23323028255086634, + "kl_loss": 0.28537851572036743, + "loss_ib": 0.011049317196011543, + "step": 811 + }, + { + "ce_ib": 6.7708001136779785, + "ce_orig": 0.5408430695533752, + "epoch": 0.23323028255086634, + "kl_loss": 0.4277627468109131, + "loss_ib": 0.011048427782952785, + "step": 811 + }, + { + "ce_ib": 10.415853500366211, + "ce_orig": 0.7485983371734619, + "epoch": 0.23323028255086634, + "kl_loss": 0.27399230003356934, + "loss_ib": 0.01315577607601881, + "step": 811 + }, + { + "ce_ib": 11.759166717529297, + "ce_orig": 1.3388240337371826, + "epoch": 0.233517866129844, + "kl_loss": 0.3040698766708374, + "loss_ib": 0.014799864962697029, + "step": 812 + }, + { + "ce_ib": 7.798691272735596, + "ce_orig": 1.0542670488357544, + "epoch": 0.233517866129844, + "kl_loss": 0.3233657777309418, + "loss_ib": 0.01103234849870205, + "step": 812 + }, + { + "ce_ib": 14.092377662658691, + "ce_orig": 1.9352741241455078, + "epoch": 0.233517866129844, + "kl_loss": 0.4481472373008728, + "loss_ib": 0.01857384853065014, + "step": 812 + }, + { + "ce_ib": 6.820461750030518, + "ce_orig": 0.4188855290412903, + "epoch": 0.233517866129844, + "kl_loss": 0.2712195813655853, + "loss_ib": 0.009532657451927662, + "step": 812 + }, + { + "ce_ib": 10.21103572845459, + "ce_orig": 1.5039421319961548, + "epoch": 0.23380544970882164, + "kl_loss": 0.3258100152015686, + "loss_ib": 0.013469135388731956, + "step": 813 + }, + { + "ce_ib": 8.036877632141113, + "ce_orig": 0.608624279499054, + "epoch": 0.23380544970882164, + "kl_loss": 0.2845733165740967, + "loss_ib": 0.010882611386477947, + "step": 813 + }, + { + "ce_ib": 10.804905891418457, + "ce_orig": 1.0994369983673096, + "epoch": 0.23380544970882164, + "kl_loss": 0.2775050699710846, + "loss_ib": 0.013579956255853176, + "step": 813 + }, + { + "ce_ib": 6.5864362716674805, + "ce_orig": 0.5708433985710144, + "epoch": 0.23380544970882164, + "kl_loss": 0.2514118552207947, + "loss_ib": 0.009100555442273617, + "step": 813 + }, + { + "ce_ib": 7.656818866729736, + "ce_orig": 0.9114794731140137, + "epoch": 0.23409303328779926, + "kl_loss": 0.28084778785705566, + "loss_ib": 0.010465297847986221, + "step": 814 + }, + { + "ce_ib": 7.377896308898926, + "ce_orig": 0.9978185296058655, + "epoch": 0.23409303328779926, + "kl_loss": 0.2847989797592163, + "loss_ib": 0.010225885547697544, + "step": 814 + }, + { + "ce_ib": 6.432290077209473, + "ce_orig": 0.6785269379615784, + "epoch": 0.23409303328779926, + "kl_loss": 0.30970633029937744, + "loss_ib": 0.009529353119432926, + "step": 814 + }, + { + "ce_ib": 8.577414512634277, + "ce_orig": 0.6792881488800049, + "epoch": 0.23409303328779926, + "kl_loss": 0.3012913465499878, + "loss_ib": 0.011590328067541122, + "step": 814 + }, + { + "epoch": 0.23438061686677691, + "grad_norm": 0.10755941271781921, + "learning_rate": 9.941083847211765e-06, + "loss": 0.9294, + "step": 815 + }, + { + "ce_ib": 7.941585540771484, + "ce_orig": 0.5222452878952026, + "epoch": 0.23438061686677691, + "kl_loss": 0.3275793194770813, + "loss_ib": 0.01121737901121378, + "step": 815 + }, + { + "ce_ib": 11.759594917297363, + "ce_orig": 0.684937596321106, + "epoch": 0.23438061686677691, + "kl_loss": 0.3714814782142639, + "loss_ib": 0.01547440979629755, + "step": 815 + }, + { + "ce_ib": 8.328024864196777, + "ce_orig": 0.5521450042724609, + "epoch": 0.23438061686677691, + "kl_loss": 0.3888009488582611, + "loss_ib": 0.012216033414006233, + "step": 815 + }, + { + "ce_ib": 6.953820705413818, + "ce_orig": 0.537897527217865, + "epoch": 0.23438061686677691, + "kl_loss": 0.2841954827308655, + "loss_ib": 0.009795775637030602, + "step": 815 + }, + { + "ce_ib": 6.938723564147949, + "ce_orig": 1.0933799743652344, + "epoch": 0.23466820044575454, + "kl_loss": 0.2745826840400696, + "loss_ib": 0.009684550575911999, + "step": 816 + }, + { + "ce_ib": 7.015166759490967, + "ce_orig": 1.0554184913635254, + "epoch": 0.23466820044575454, + "kl_loss": 0.24745316803455353, + "loss_ib": 0.009489698335528374, + "step": 816 + }, + { + "ce_ib": 7.070734977722168, + "ce_orig": 0.6463883519172668, + "epoch": 0.23466820044575454, + "kl_loss": 0.3349419832229614, + "loss_ib": 0.010420155711472034, + "step": 816 + }, + { + "ce_ib": 11.36481761932373, + "ce_orig": 1.2426626682281494, + "epoch": 0.23466820044575454, + "kl_loss": 0.35446441173553467, + "loss_ib": 0.01490946114063263, + "step": 816 + }, + { + "ce_ib": 7.427221298217773, + "ce_orig": 0.8329726457595825, + "epoch": 0.2349557840247322, + "kl_loss": 0.34363722801208496, + "loss_ib": 0.0108635937795043, + "step": 817 + }, + { + "ce_ib": 5.901673316955566, + "ce_orig": 0.7341867685317993, + "epoch": 0.2349557840247322, + "kl_loss": 0.27652859687805176, + "loss_ib": 0.008666959591209888, + "step": 817 + }, + { + "ce_ib": 7.916401386260986, + "ce_orig": 0.6233932971954346, + "epoch": 0.2349557840247322, + "kl_loss": 0.38233524560928345, + "loss_ib": 0.011739754118025303, + "step": 817 + }, + { + "ce_ib": 6.952549934387207, + "ce_orig": 0.4051516354084015, + "epoch": 0.2349557840247322, + "kl_loss": 0.7161735892295837, + "loss_ib": 0.014114285819232464, + "step": 817 + }, + { + "ce_ib": 8.112340927124023, + "ce_orig": 0.41056379675865173, + "epoch": 0.23524336760370984, + "kl_loss": 0.4498043656349182, + "loss_ib": 0.012610385194420815, + "step": 818 + }, + { + "ce_ib": 7.351808547973633, + "ce_orig": 0.6276510953903198, + "epoch": 0.23524336760370984, + "kl_loss": 0.3949786424636841, + "loss_ib": 0.011301594786345959, + "step": 818 + }, + { + "ce_ib": 8.169877052307129, + "ce_orig": 0.9647888541221619, + "epoch": 0.23524336760370984, + "kl_loss": 0.32170844078063965, + "loss_ib": 0.011386961676180363, + "step": 818 + }, + { + "ce_ib": 10.368202209472656, + "ce_orig": 1.1499764919281006, + "epoch": 0.23524336760370984, + "kl_loss": 0.47000110149383545, + "loss_ib": 0.01506821345537901, + "step": 818 + }, + { + "ce_ib": 10.158099174499512, + "ce_orig": 1.1036230325698853, + "epoch": 0.23553095118268746, + "kl_loss": 0.42805948853492737, + "loss_ib": 0.014438693411648273, + "step": 819 + }, + { + "ce_ib": 9.477944374084473, + "ce_orig": 1.0577062368392944, + "epoch": 0.23553095118268746, + "kl_loss": 0.26351502537727356, + "loss_ib": 0.012113094329833984, + "step": 819 + }, + { + "ce_ib": 3.1393284797668457, + "ce_orig": 0.1595279723405838, + "epoch": 0.23553095118268746, + "kl_loss": 0.6530660390853882, + "loss_ib": 0.009669989347457886, + "step": 819 + }, + { + "ce_ib": 10.7774658203125, + "ce_orig": 1.2825448513031006, + "epoch": 0.23553095118268746, + "kl_loss": 0.6769564151763916, + "loss_ib": 0.01754703000187874, + "step": 819 + }, + { + "epoch": 0.23581853476166512, + "grad_norm": 0.10050812363624573, + "learning_rate": 9.939890013248006e-06, + "loss": 0.8356, + "step": 820 + }, + { + "ce_ib": 6.623199939727783, + "ce_orig": 0.5048424601554871, + "epoch": 0.23581853476166512, + "kl_loss": 0.32067549228668213, + "loss_ib": 0.00982995517551899, + "step": 820 + }, + { + "ce_ib": 9.443016052246094, + "ce_orig": 1.0931298732757568, + "epoch": 0.23581853476166512, + "kl_loss": 0.25568336248397827, + "loss_ib": 0.011999850161373615, + "step": 820 + }, + { + "ce_ib": 7.976568222045898, + "ce_orig": 0.669291079044342, + "epoch": 0.23581853476166512, + "kl_loss": 0.42081308364868164, + "loss_ib": 0.01218469813466072, + "step": 820 + }, + { + "ce_ib": 4.948537349700928, + "ce_orig": 0.4827899932861328, + "epoch": 0.23581853476166512, + "kl_loss": 0.25315725803375244, + "loss_ib": 0.007480109576135874, + "step": 820 + }, + { + "ce_ib": 14.804322242736816, + "ce_orig": 1.9942870140075684, + "epoch": 0.23610611834064274, + "kl_loss": 0.43627458810806274, + "loss_ib": 0.019167067483067513, + "step": 821 + }, + { + "ce_ib": 4.713957786560059, + "ce_orig": 0.42061755061149597, + "epoch": 0.23610611834064274, + "kl_loss": 0.3404502272605896, + "loss_ib": 0.008118459954857826, + "step": 821 + }, + { + "ce_ib": 9.87876033782959, + "ce_orig": 0.6623562574386597, + "epoch": 0.23610611834064274, + "kl_loss": 0.37182098627090454, + "loss_ib": 0.013596970587968826, + "step": 821 + }, + { + "ce_ib": 7.073148727416992, + "ce_orig": 0.8113523125648499, + "epoch": 0.23610611834064274, + "kl_loss": 0.3398459553718567, + "loss_ib": 0.010471608489751816, + "step": 821 + }, + { + "ce_ib": 7.666951656341553, + "ce_orig": 0.7712521553039551, + "epoch": 0.2363937019196204, + "kl_loss": 0.34804296493530273, + "loss_ib": 0.011147381737828255, + "step": 822 + }, + { + "ce_ib": 6.349161624908447, + "ce_orig": 0.7387241125106812, + "epoch": 0.2363937019196204, + "kl_loss": 0.28054261207580566, + "loss_ib": 0.009154587984085083, + "step": 822 + }, + { + "ce_ib": 5.58187198638916, + "ce_orig": 0.5994656085968018, + "epoch": 0.2363937019196204, + "kl_loss": 0.31460386514663696, + "loss_ib": 0.008727909997105598, + "step": 822 + }, + { + "ce_ib": 7.449942111968994, + "ce_orig": 0.5919069647789001, + "epoch": 0.2363937019196204, + "kl_loss": 0.2681111693382263, + "loss_ib": 0.010131053626537323, + "step": 822 + }, + { + "ce_ib": 13.98037052154541, + "ce_orig": 0.6386498808860779, + "epoch": 0.23668128549859804, + "kl_loss": 0.44783568382263184, + "loss_ib": 0.01845872774720192, + "step": 823 + }, + { + "ce_ib": 5.139134883880615, + "ce_orig": 0.7114477157592773, + "epoch": 0.23668128549859804, + "kl_loss": 0.27849745750427246, + "loss_ib": 0.007924109697341919, + "step": 823 + }, + { + "ce_ib": 6.715068340301514, + "ce_orig": 0.5276003479957581, + "epoch": 0.23668128549859804, + "kl_loss": 0.384267121553421, + "loss_ib": 0.010557739064097404, + "step": 823 + }, + { + "ce_ib": 7.8920793533325195, + "ce_orig": 0.8194569945335388, + "epoch": 0.23668128549859804, + "kl_loss": 0.2780001163482666, + "loss_ib": 0.01067208033055067, + "step": 823 + }, + { + "ce_ib": 8.298215866088867, + "ce_orig": 0.4189370572566986, + "epoch": 0.23696886907757567, + "kl_loss": 0.4898415207862854, + "loss_ib": 0.013196630403399467, + "step": 824 + }, + { + "ce_ib": 7.782186508178711, + "ce_orig": 0.4692075252532959, + "epoch": 0.23696886907757567, + "kl_loss": 0.4824924170970917, + "loss_ib": 0.012607110664248466, + "step": 824 + }, + { + "ce_ib": 9.8721923828125, + "ce_orig": 1.1827633380889893, + "epoch": 0.23696886907757567, + "kl_loss": 0.2802169919013977, + "loss_ib": 0.012674362398684025, + "step": 824 + }, + { + "ce_ib": 9.907919883728027, + "ce_orig": 1.1879762411117554, + "epoch": 0.23696886907757567, + "kl_loss": 0.41324368119239807, + "loss_ib": 0.014040356501936913, + "step": 824 + }, + { + "epoch": 0.23725645265655332, + "grad_norm": 0.10062714666128159, + "learning_rate": 9.938684277942631e-06, + "loss": 0.8766, + "step": 825 + }, + { + "ce_ib": 8.782609939575195, + "ce_orig": 1.2377029657363892, + "epoch": 0.23725645265655332, + "kl_loss": 0.3123038113117218, + "loss_ib": 0.011905648745596409, + "step": 825 + }, + { + "ce_ib": 6.110267639160156, + "ce_orig": 0.6393804550170898, + "epoch": 0.23725645265655332, + "kl_loss": 0.33634820580482483, + "loss_ib": 0.009473749436438084, + "step": 825 + }, + { + "ce_ib": 9.359816551208496, + "ce_orig": 1.2528795003890991, + "epoch": 0.23725645265655332, + "kl_loss": 0.28830617666244507, + "loss_ib": 0.012242878787219524, + "step": 825 + }, + { + "ce_ib": 9.707563400268555, + "ce_orig": 0.8692981004714966, + "epoch": 0.23725645265655332, + "kl_loss": 0.5571379661560059, + "loss_ib": 0.015278941951692104, + "step": 825 + }, + { + "ce_ib": 9.498275756835938, + "ce_orig": 1.071899652481079, + "epoch": 0.23754403623553094, + "kl_loss": 0.2672438621520996, + "loss_ib": 0.012170715257525444, + "step": 826 + }, + { + "ce_ib": 9.843942642211914, + "ce_orig": 0.9619124531745911, + "epoch": 0.23754403623553094, + "kl_loss": 0.2982301712036133, + "loss_ib": 0.012826244346797466, + "step": 826 + }, + { + "ce_ib": 10.634578704833984, + "ce_orig": 1.325036644935608, + "epoch": 0.23754403623553094, + "kl_loss": 0.32248958945274353, + "loss_ib": 0.01385947410017252, + "step": 826 + }, + { + "ce_ib": 12.585394859313965, + "ce_orig": 1.4135090112686157, + "epoch": 0.23754403623553094, + "kl_loss": 0.46502685546875, + "loss_ib": 0.017235664650797844, + "step": 826 + }, + { + "ce_ib": 10.126785278320312, + "ce_orig": 1.5693209171295166, + "epoch": 0.2378316198145086, + "kl_loss": 0.2683885097503662, + "loss_ib": 0.012810669839382172, + "step": 827 + }, + { + "ce_ib": 6.727275848388672, + "ce_orig": 0.9106936454772949, + "epoch": 0.2378316198145086, + "kl_loss": 0.2777571678161621, + "loss_ib": 0.009504847228527069, + "step": 827 + }, + { + "ce_ib": 8.126458168029785, + "ce_orig": 0.8495746850967407, + "epoch": 0.2378316198145086, + "kl_loss": 0.26818782091140747, + "loss_ib": 0.010808336548507214, + "step": 827 + }, + { + "ce_ib": 10.300277709960938, + "ce_orig": 1.3873240947723389, + "epoch": 0.2378316198145086, + "kl_loss": 0.41793563961982727, + "loss_ib": 0.014479633420705795, + "step": 827 + }, + { + "ce_ib": 12.66947078704834, + "ce_orig": 1.5648332834243774, + "epoch": 0.23811920339348625, + "kl_loss": 0.3042876422405243, + "loss_ib": 0.015712348744273186, + "step": 828 + }, + { + "ce_ib": 8.798270225524902, + "ce_orig": 0.920973539352417, + "epoch": 0.23811920339348625, + "kl_loss": 0.34468623995780945, + "loss_ib": 0.012245132587850094, + "step": 828 + }, + { + "ce_ib": 8.365591049194336, + "ce_orig": 0.5817134976387024, + "epoch": 0.23811920339348625, + "kl_loss": 0.39118778705596924, + "loss_ib": 0.012277469038963318, + "step": 828 + }, + { + "ce_ib": 11.80969524383545, + "ce_orig": 1.7373600006103516, + "epoch": 0.23811920339348625, + "kl_loss": 0.3660210371017456, + "loss_ib": 0.01546990592032671, + "step": 828 + }, + { + "ce_ib": 7.223691463470459, + "ce_orig": 0.9051380157470703, + "epoch": 0.23840678697246387, + "kl_loss": 0.2608543038368225, + "loss_ib": 0.009832234121859074, + "step": 829 + }, + { + "ce_ib": 6.39984130859375, + "ce_orig": 0.6907777190208435, + "epoch": 0.23840678697246387, + "kl_loss": 0.3057895600795746, + "loss_ib": 0.00945773720741272, + "step": 829 + }, + { + "ce_ib": 5.409855365753174, + "ce_orig": 0.7622098326683044, + "epoch": 0.23840678697246387, + "kl_loss": 0.24784672260284424, + "loss_ib": 0.007888322696089745, + "step": 829 + }, + { + "ce_ib": 8.38547134399414, + "ce_orig": 0.7260690331459045, + "epoch": 0.23840678697246387, + "kl_loss": 0.29803162813186646, + "loss_ib": 0.01136578805744648, + "step": 829 + }, + { + "epoch": 0.23869437055144152, + "grad_norm": 0.11168427765369415, + "learning_rate": 9.93746664420054e-06, + "loss": 0.9104, + "step": 830 + }, + { + "ce_ib": 5.065134048461914, + "ce_orig": 0.8026469349861145, + "epoch": 0.23869437055144152, + "kl_loss": 0.278666615486145, + "loss_ib": 0.007851799950003624, + "step": 830 + }, + { + "ce_ib": 13.80343246459961, + "ce_orig": 0.9906110167503357, + "epoch": 0.23869437055144152, + "kl_loss": 0.3099890649318695, + "loss_ib": 0.01690332405269146, + "step": 830 + }, + { + "ce_ib": 8.821106910705566, + "ce_orig": 1.093506932258606, + "epoch": 0.23869437055144152, + "kl_loss": 0.3800942599773407, + "loss_ib": 0.012622050009667873, + "step": 830 + }, + { + "ce_ib": 11.06147575378418, + "ce_orig": 1.1915507316589355, + "epoch": 0.23869437055144152, + "kl_loss": 0.3803118169307709, + "loss_ib": 0.014864594675600529, + "step": 830 + }, + { + "ce_ib": 6.217090129852295, + "ce_orig": 0.4522053599357605, + "epoch": 0.23898195413041914, + "kl_loss": 0.3387228846549988, + "loss_ib": 0.009604318998754025, + "step": 831 + }, + { + "ce_ib": 8.78792953491211, + "ce_orig": 0.818161129951477, + "epoch": 0.23898195413041914, + "kl_loss": 0.3240455389022827, + "loss_ib": 0.012028384022414684, + "step": 831 + }, + { + "ce_ib": 9.777087211608887, + "ce_orig": 0.9559274315834045, + "epoch": 0.23898195413041914, + "kl_loss": 0.305178701877594, + "loss_ib": 0.01282887440174818, + "step": 831 + }, + { + "ce_ib": 8.793158531188965, + "ce_orig": 1.7135603427886963, + "epoch": 0.23898195413041914, + "kl_loss": 0.35992008447647095, + "loss_ib": 0.012392358854413033, + "step": 831 + }, + { + "ce_ib": 14.298954963684082, + "ce_orig": 1.9766751527786255, + "epoch": 0.2392695377093968, + "kl_loss": 0.37456846237182617, + "loss_ib": 0.018044639378786087, + "step": 832 + }, + { + "ce_ib": 5.9418816566467285, + "ce_orig": 0.7225015163421631, + "epoch": 0.2392695377093968, + "kl_loss": 0.35724693536758423, + "loss_ib": 0.009514350444078445, + "step": 832 + }, + { + "ce_ib": 8.032739639282227, + "ce_orig": 0.6944239139556885, + "epoch": 0.2392695377093968, + "kl_loss": 0.35457563400268555, + "loss_ib": 0.011578495614230633, + "step": 832 + }, + { + "ce_ib": 9.528278350830078, + "ce_orig": 1.3547769784927368, + "epoch": 0.2392695377093968, + "kl_loss": 0.4733262062072754, + "loss_ib": 0.014261540956795216, + "step": 832 + }, + { + "ce_ib": 10.221992492675781, + "ce_orig": 0.9856470227241516, + "epoch": 0.23955712128837445, + "kl_loss": 0.33555009961128235, + "loss_ib": 0.013577492907643318, + "step": 833 + }, + { + "ce_ib": 8.804466247558594, + "ce_orig": 0.928854763507843, + "epoch": 0.23955712128837445, + "kl_loss": 0.36634519696235657, + "loss_ib": 0.012467917986214161, + "step": 833 + }, + { + "ce_ib": 6.364208221435547, + "ce_orig": 0.8164104223251343, + "epoch": 0.23955712128837445, + "kl_loss": 0.22879984974861145, + "loss_ib": 0.008652206510305405, + "step": 833 + }, + { + "ce_ib": 8.315366744995117, + "ce_orig": 0.9906771779060364, + "epoch": 0.23955712128837445, + "kl_loss": 0.4181078374385834, + "loss_ib": 0.012496445327997208, + "step": 833 + }, + { + "ce_ib": 8.126928329467773, + "ce_orig": 0.9921467304229736, + "epoch": 0.23984470486735207, + "kl_loss": 0.2800453305244446, + "loss_ib": 0.010927380993962288, + "step": 834 + }, + { + "ce_ib": 5.869344234466553, + "ce_orig": 0.8272134065628052, + "epoch": 0.23984470486735207, + "kl_loss": 0.23419909179210663, + "loss_ib": 0.00821133516728878, + "step": 834 + }, + { + "ce_ib": 10.878475189208984, + "ce_orig": 1.308764934539795, + "epoch": 0.23984470486735207, + "kl_loss": 0.3325369656085968, + "loss_ib": 0.014203844591975212, + "step": 834 + }, + { + "ce_ib": 5.4352545738220215, + "ce_orig": 0.45368334650993347, + "epoch": 0.23984470486735207, + "kl_loss": 0.6772407293319702, + "loss_ib": 0.012207661755383015, + "step": 834 + }, + { + "epoch": 0.24013228844632972, + "grad_norm": 0.09686867892742157, + "learning_rate": 9.93623711495529e-06, + "loss": 0.9213, + "step": 835 + }, + { + "ce_ib": 8.014131546020508, + "ce_orig": 1.0289490222930908, + "epoch": 0.24013228844632972, + "kl_loss": 0.3445562720298767, + "loss_ib": 0.011459693312644958, + "step": 835 + }, + { + "ce_ib": 6.29163122177124, + "ce_orig": 0.48442956805229187, + "epoch": 0.24013228844632972, + "kl_loss": 0.3220330774784088, + "loss_ib": 0.009511961601674557, + "step": 835 + }, + { + "ce_ib": 7.94074821472168, + "ce_orig": 0.7545216083526611, + "epoch": 0.24013228844632972, + "kl_loss": 0.45153874158859253, + "loss_ib": 0.0124561358243227, + "step": 835 + }, + { + "ce_ib": 5.553848743438721, + "ce_orig": 0.3156552314758301, + "epoch": 0.24013228844632972, + "kl_loss": 0.35743337869644165, + "loss_ib": 0.00912818219512701, + "step": 835 + }, + { + "ce_ib": 7.592702865600586, + "ce_orig": 0.8854877352714539, + "epoch": 0.24041987202530735, + "kl_loss": 0.28391388058662415, + "loss_ib": 0.010431841015815735, + "step": 836 + }, + { + "ce_ib": 8.959979057312012, + "ce_orig": 0.5111126899719238, + "epoch": 0.24041987202530735, + "kl_loss": 0.426180899143219, + "loss_ib": 0.013221788220107555, + "step": 836 + }, + { + "ce_ib": 12.740047454833984, + "ce_orig": 1.6768262386322021, + "epoch": 0.24041987202530735, + "kl_loss": 0.3991634249687195, + "loss_ib": 0.016731681302189827, + "step": 836 + }, + { + "ce_ib": 12.013802528381348, + "ce_orig": 1.272831916809082, + "epoch": 0.24041987202530735, + "kl_loss": 0.3409850001335144, + "loss_ib": 0.015423652715981007, + "step": 836 + }, + { + "ce_ib": 2.0301332473754883, + "ce_orig": 0.16432513296604156, + "epoch": 0.240707455604285, + "kl_loss": 0.6776133179664612, + "loss_ib": 0.008806266821920872, + "step": 837 + }, + { + "ce_ib": 10.865999221801758, + "ce_orig": 1.410832405090332, + "epoch": 0.240707455604285, + "kl_loss": 0.312138170003891, + "loss_ib": 0.013987381011247635, + "step": 837 + }, + { + "ce_ib": 6.166257858276367, + "ce_orig": 0.6295948624610901, + "epoch": 0.240707455604285, + "kl_loss": 0.36339548230171204, + "loss_ib": 0.00980021245777607, + "step": 837 + }, + { + "ce_ib": 5.043333053588867, + "ce_orig": 0.46830177307128906, + "epoch": 0.240707455604285, + "kl_loss": 0.3290286064147949, + "loss_ib": 0.008333618752658367, + "step": 837 + }, + { + "ce_ib": 9.060256004333496, + "ce_orig": 1.1709790229797363, + "epoch": 0.24099503918326265, + "kl_loss": 0.27760225534439087, + "loss_ib": 0.0118362782523036, + "step": 838 + }, + { + "ce_ib": 6.811290264129639, + "ce_orig": 0.793929934501648, + "epoch": 0.24099503918326265, + "kl_loss": 0.31319600343704224, + "loss_ib": 0.009943250566720963, + "step": 838 + }, + { + "ce_ib": 4.931931018829346, + "ce_orig": 0.36589503288269043, + "epoch": 0.24099503918326265, + "kl_loss": 0.40839433670043945, + "loss_ib": 0.009015874937176704, + "step": 838 + }, + { + "ce_ib": 6.180576324462891, + "ce_orig": 0.4309951066970825, + "epoch": 0.24099503918326265, + "kl_loss": 0.2373921424150467, + "loss_ib": 0.008554497733712196, + "step": 838 + }, + { + "ce_ib": 5.762244701385498, + "ce_orig": 0.544806957244873, + "epoch": 0.24128262276224027, + "kl_loss": 0.31401538848876953, + "loss_ib": 0.008902398869395256, + "step": 839 + }, + { + "ce_ib": 7.717389106750488, + "ce_orig": 1.2485359907150269, + "epoch": 0.24128262276224027, + "kl_loss": 0.30778759717941284, + "loss_ib": 0.010795265436172485, + "step": 839 + }, + { + "ce_ib": 7.063183784484863, + "ce_orig": 0.768622100353241, + "epoch": 0.24128262276224027, + "kl_loss": 0.3657524585723877, + "loss_ib": 0.010720708407461643, + "step": 839 + }, + { + "ce_ib": 6.140594482421875, + "ce_orig": 0.7728462815284729, + "epoch": 0.24128262276224027, + "kl_loss": 0.2510579824447632, + "loss_ib": 0.008651173673570156, + "step": 839 + }, + { + "epoch": 0.24157020634121792, + "grad_norm": 0.10076677054166794, + "learning_rate": 9.934995693169104e-06, + "loss": 0.8875, + "step": 840 + }, + { + "ce_ib": 9.007379531860352, + "ce_orig": 0.8373998403549194, + "epoch": 0.24157020634121792, + "kl_loss": 0.3071047067642212, + "loss_ib": 0.012078425846993923, + "step": 840 + }, + { + "ce_ib": 6.403738975524902, + "ce_orig": 0.7129970192909241, + "epoch": 0.24157020634121792, + "kl_loss": 0.28114748001098633, + "loss_ib": 0.009215213358402252, + "step": 840 + }, + { + "ce_ib": 7.572381973266602, + "ce_orig": 0.8134757280349731, + "epoch": 0.24157020634121792, + "kl_loss": 0.3291366696357727, + "loss_ib": 0.010863748379051685, + "step": 840 + }, + { + "ce_ib": 7.471776962280273, + "ce_orig": 0.45619362592697144, + "epoch": 0.24157020634121792, + "kl_loss": 0.3013712763786316, + "loss_ib": 0.010485488921403885, + "step": 840 + }, + { + "ce_ib": 8.4400053024292, + "ce_orig": 0.8053632378578186, + "epoch": 0.24185778992019555, + "kl_loss": 0.30274561047554016, + "loss_ib": 0.011467461474239826, + "step": 841 + }, + { + "ce_ib": 13.409200668334961, + "ce_orig": 1.5305240154266357, + "epoch": 0.24185778992019555, + "kl_loss": 0.27949976921081543, + "loss_ib": 0.016204198822379112, + "step": 841 + }, + { + "ce_ib": 5.851305961608887, + "ce_orig": 0.7181586027145386, + "epoch": 0.24185778992019555, + "kl_loss": 0.28577935695648193, + "loss_ib": 0.00870910007506609, + "step": 841 + }, + { + "ce_ib": 8.071784973144531, + "ce_orig": 0.9581683278083801, + "epoch": 0.24185778992019555, + "kl_loss": 0.371356725692749, + "loss_ib": 0.011785351671278477, + "step": 841 + }, + { + "ce_ib": 11.643394470214844, + "ce_orig": 1.0913020372390747, + "epoch": 0.2421453734991732, + "kl_loss": 0.3137480914592743, + "loss_ib": 0.01478087529540062, + "step": 842 + }, + { + "ce_ib": 7.9136576652526855, + "ce_orig": 0.8460515141487122, + "epoch": 0.2421453734991732, + "kl_loss": 0.28541165590286255, + "loss_ib": 0.010767774656414986, + "step": 842 + }, + { + "ce_ib": 12.15166187286377, + "ce_orig": 0.9375542402267456, + "epoch": 0.2421453734991732, + "kl_loss": 0.466509073972702, + "loss_ib": 0.016816752031445503, + "step": 842 + }, + { + "ce_ib": 6.482849597930908, + "ce_orig": 0.652571439743042, + "epoch": 0.2421453734991732, + "kl_loss": 0.3070219159126282, + "loss_ib": 0.009553068317472935, + "step": 842 + }, + { + "ce_ib": 9.26317024230957, + "ce_orig": 0.6297864317893982, + "epoch": 0.24243295707815085, + "kl_loss": 0.8582457304000854, + "loss_ib": 0.017845628783106804, + "step": 843 + }, + { + "ce_ib": 9.843435287475586, + "ce_orig": 0.9486488103866577, + "epoch": 0.24243295707815085, + "kl_loss": 0.35706013441085815, + "loss_ib": 0.013414036482572556, + "step": 843 + }, + { + "ce_ib": 8.942062377929688, + "ce_orig": 0.6295011639595032, + "epoch": 0.24243295707815085, + "kl_loss": 0.3587941527366638, + "loss_ib": 0.012530003674328327, + "step": 843 + }, + { + "ce_ib": 8.235858917236328, + "ce_orig": 1.0911704301834106, + "epoch": 0.24243295707815085, + "kl_loss": 0.4925958812236786, + "loss_ib": 0.013161817565560341, + "step": 843 + }, + { + "ce_ib": 2.106238842010498, + "ce_orig": 0.10293695330619812, + "epoch": 0.24272054065712848, + "kl_loss": 0.6076182126998901, + "loss_ib": 0.008182420395314693, + "step": 844 + }, + { + "ce_ib": 10.04066276550293, + "ce_orig": 0.7675477862358093, + "epoch": 0.24272054065712848, + "kl_loss": 0.3272428512573242, + "loss_ib": 0.013313091360032558, + "step": 844 + }, + { + "ce_ib": 7.2537922859191895, + "ce_orig": 0.6966544985771179, + "epoch": 0.24272054065712848, + "kl_loss": 0.30012214183807373, + "loss_ib": 0.010255013592541218, + "step": 844 + }, + { + "ce_ib": 9.952733039855957, + "ce_orig": 0.8222768902778625, + "epoch": 0.24272054065712848, + "kl_loss": 0.29932597279548645, + "loss_ib": 0.01294599287211895, + "step": 844 + }, + { + "epoch": 0.24300812423610613, + "grad_norm": 0.09415728598833084, + "learning_rate": 9.93374238183286e-06, + "loss": 0.8609, + "step": 845 + }, + { + "ce_ib": 9.53044605255127, + "ce_orig": 0.9650492668151855, + "epoch": 0.24300812423610613, + "kl_loss": 0.23566867411136627, + "loss_ib": 0.011887133121490479, + "step": 845 + }, + { + "ce_ib": 9.0289945602417, + "ce_orig": 1.0505539178848267, + "epoch": 0.24300812423610613, + "kl_loss": 0.3517257273197174, + "loss_ib": 0.012546251527965069, + "step": 845 + }, + { + "ce_ib": 6.667138576507568, + "ce_orig": 0.6767503023147583, + "epoch": 0.24300812423610613, + "kl_loss": 0.25494682788848877, + "loss_ib": 0.009216606616973877, + "step": 845 + }, + { + "ce_ib": 10.858115196228027, + "ce_orig": 1.128201961517334, + "epoch": 0.24300812423610613, + "kl_loss": 0.3126045763492584, + "loss_ib": 0.013984160497784615, + "step": 845 + }, + { + "ce_ib": 8.37086296081543, + "ce_orig": 1.0704699754714966, + "epoch": 0.24329570781508375, + "kl_loss": 0.3986669182777405, + "loss_ib": 0.012357532978057861, + "step": 846 + }, + { + "ce_ib": 12.114412307739258, + "ce_orig": 0.8631466031074524, + "epoch": 0.24329570781508375, + "kl_loss": 0.5870100259780884, + "loss_ib": 0.017984513193368912, + "step": 846 + }, + { + "ce_ib": 10.035650253295898, + "ce_orig": 0.7930597066879272, + "epoch": 0.24329570781508375, + "kl_loss": 0.34875866770744324, + "loss_ib": 0.013523237779736519, + "step": 846 + }, + { + "ce_ib": 9.243821144104004, + "ce_orig": 0.9381915330886841, + "epoch": 0.24329570781508375, + "kl_loss": 0.6351751089096069, + "loss_ib": 0.0155955720692873, + "step": 846 + }, + { + "ce_ib": 7.255211353302002, + "ce_orig": 0.7044399380683899, + "epoch": 0.2435832913940614, + "kl_loss": 0.31700530648231506, + "loss_ib": 0.010425264947116375, + "step": 847 + }, + { + "ce_ib": 8.711723327636719, + "ce_orig": 0.8803223967552185, + "epoch": 0.2435832913940614, + "kl_loss": 0.3968978822231293, + "loss_ib": 0.012680701911449432, + "step": 847 + }, + { + "ce_ib": 6.419612407684326, + "ce_orig": 0.5552703142166138, + "epoch": 0.2435832913940614, + "kl_loss": 0.24681052565574646, + "loss_ib": 0.008887717500329018, + "step": 847 + }, + { + "ce_ib": 7.220922470092773, + "ce_orig": 0.8049042224884033, + "epoch": 0.2435832913940614, + "kl_loss": 0.25132423639297485, + "loss_ib": 0.00973416492342949, + "step": 847 + }, + { + "ce_ib": 10.610690116882324, + "ce_orig": 1.24649178981781, + "epoch": 0.24387087497303903, + "kl_loss": 0.3552427291870117, + "loss_ib": 0.014163116924464703, + "step": 848 + }, + { + "ce_ib": 8.324702262878418, + "ce_orig": 0.7784779667854309, + "epoch": 0.24387087497303903, + "kl_loss": 0.26029253005981445, + "loss_ib": 0.010927626863121986, + "step": 848 + }, + { + "ce_ib": 5.074281692504883, + "ce_orig": 0.5773417353630066, + "epoch": 0.24387087497303903, + "kl_loss": 0.34564918279647827, + "loss_ib": 0.008530773222446442, + "step": 848 + }, + { + "ce_ib": 6.707390308380127, + "ce_orig": 0.6460347175598145, + "epoch": 0.24387087497303903, + "kl_loss": 0.33726412057876587, + "loss_ib": 0.010080032050609589, + "step": 848 + }, + { + "ce_ib": 5.850057601928711, + "ce_orig": 0.5735775232315063, + "epoch": 0.24415845855201668, + "kl_loss": 0.30734121799468994, + "loss_ib": 0.008923470042645931, + "step": 849 + }, + { + "ce_ib": 9.06562614440918, + "ce_orig": 1.0471097230911255, + "epoch": 0.24415845855201668, + "kl_loss": 0.43707603216171265, + "loss_ib": 0.013436386361718178, + "step": 849 + }, + { + "ce_ib": 6.853366374969482, + "ce_orig": 0.9200989007949829, + "epoch": 0.24415845855201668, + "kl_loss": 0.3171440362930298, + "loss_ib": 0.010024807415902615, + "step": 849 + }, + { + "ce_ib": 5.92440128326416, + "ce_orig": 0.5834106802940369, + "epoch": 0.24415845855201668, + "kl_loss": 0.2906469702720642, + "loss_ib": 0.008830870501697063, + "step": 849 + }, + { + "epoch": 0.24444604213099433, + "grad_norm": 0.11549082398414612, + "learning_rate": 9.93247718396607e-06, + "loss": 0.8256, + "step": 850 + }, + { + "ce_ib": 6.185736179351807, + "ce_orig": 0.5565721392631531, + "epoch": 0.24444604213099433, + "kl_loss": 0.3115679621696472, + "loss_ib": 0.009301415644586086, + "step": 850 + }, + { + "ce_ib": 10.200575828552246, + "ce_orig": 1.0745152235031128, + "epoch": 0.24444604213099433, + "kl_loss": 0.3611696660518646, + "loss_ib": 0.013812271878123283, + "step": 850 + }, + { + "ce_ib": 11.002309799194336, + "ce_orig": 1.2843042612075806, + "epoch": 0.24444604213099433, + "kl_loss": 0.3904564380645752, + "loss_ib": 0.014906874857842922, + "step": 850 + }, + { + "ce_ib": 8.565811157226562, + "ce_orig": 0.9313501715660095, + "epoch": 0.24444604213099433, + "kl_loss": 0.3498835265636444, + "loss_ib": 0.012064645998179913, + "step": 850 + }, + { + "ce_ib": 6.367196559906006, + "ce_orig": 0.3282487392425537, + "epoch": 0.24473362570997195, + "kl_loss": 0.3030562996864319, + "loss_ib": 0.009397759102284908, + "step": 851 + }, + { + "ce_ib": 7.566930294036865, + "ce_orig": 0.4791885316371918, + "epoch": 0.24473362570997195, + "kl_loss": 0.3446214199066162, + "loss_ib": 0.011013145558536053, + "step": 851 + }, + { + "ce_ib": 6.3060078620910645, + "ce_orig": 0.7763472199440002, + "epoch": 0.24473362570997195, + "kl_loss": 0.2825550436973572, + "loss_ib": 0.009131558239459991, + "step": 851 + }, + { + "ce_ib": 6.97025728225708, + "ce_orig": 0.5963578820228577, + "epoch": 0.24473362570997195, + "kl_loss": 0.26160991191864014, + "loss_ib": 0.009586355648934841, + "step": 851 + }, + { + "ce_ib": 12.1688814163208, + "ce_orig": 1.5043821334838867, + "epoch": 0.2450212092889496, + "kl_loss": 0.4432004690170288, + "loss_ib": 0.01660088635981083, + "step": 852 + }, + { + "ce_ib": 10.238726615905762, + "ce_orig": 0.6479013562202454, + "epoch": 0.2450212092889496, + "kl_loss": 0.3322160243988037, + "loss_ib": 0.013560887426137924, + "step": 852 + }, + { + "ce_ib": 4.64656400680542, + "ce_orig": 0.37472283840179443, + "epoch": 0.2450212092889496, + "kl_loss": 0.388122022151947, + "loss_ib": 0.0085277846083045, + "step": 852 + }, + { + "ce_ib": 8.35583209991455, + "ce_orig": 0.9440561532974243, + "epoch": 0.2450212092889496, + "kl_loss": 0.34462809562683105, + "loss_ib": 0.011802112683653831, + "step": 852 + }, + { + "ce_ib": 7.765969753265381, + "ce_orig": 1.0505164861679077, + "epoch": 0.24530879286792723, + "kl_loss": 0.31646573543548584, + "loss_ib": 0.010930625721812248, + "step": 853 + }, + { + "ce_ib": 6.262195587158203, + "ce_orig": 0.6370275616645813, + "epoch": 0.24530879286792723, + "kl_loss": 0.3554043769836426, + "loss_ib": 0.009816239587962627, + "step": 853 + }, + { + "ce_ib": 3.9668338298797607, + "ce_orig": 0.3455740511417389, + "epoch": 0.24530879286792723, + "kl_loss": 0.33059853315353394, + "loss_ib": 0.007272819057106972, + "step": 853 + }, + { + "ce_ib": 5.173341751098633, + "ce_orig": 0.5830708742141724, + "epoch": 0.24530879286792723, + "kl_loss": 0.24811115860939026, + "loss_ib": 0.007654453162103891, + "step": 853 + }, + { + "ce_ib": 7.131439685821533, + "ce_orig": 0.9161396622657776, + "epoch": 0.24559637644690488, + "kl_loss": 0.6239281892776489, + "loss_ib": 0.01337072066962719, + "step": 854 + }, + { + "ce_ib": 9.913086891174316, + "ce_orig": 0.5309944748878479, + "epoch": 0.24559637644690488, + "kl_loss": 0.2742801010608673, + "loss_ib": 0.012655887752771378, + "step": 854 + }, + { + "ce_ib": 6.634543418884277, + "ce_orig": 0.8277848362922668, + "epoch": 0.24559637644690488, + "kl_loss": 0.27079910039901733, + "loss_ib": 0.009342534467577934, + "step": 854 + }, + { + "ce_ib": 6.51190185546875, + "ce_orig": 0.7204493880271912, + "epoch": 0.24559637644690488, + "kl_loss": 0.26875340938568115, + "loss_ib": 0.009199435822665691, + "step": 854 + }, + { + "epoch": 0.24588396002588253, + "grad_norm": 0.10104996711015701, + "learning_rate": 9.931200102616892e-06, + "loss": 0.8524, + "step": 855 + }, + { + "ce_ib": 9.483591079711914, + "ce_orig": 0.5329586267471313, + "epoch": 0.24588396002588253, + "kl_loss": 0.3656144142150879, + "loss_ib": 0.013139734975993633, + "step": 855 + }, + { + "ce_ib": 7.316298961639404, + "ce_orig": 0.8602546453475952, + "epoch": 0.24588396002588253, + "kl_loss": 0.2442716807126999, + "loss_ib": 0.009759015403687954, + "step": 855 + }, + { + "ce_ib": 6.996337890625, + "ce_orig": 0.7689603567123413, + "epoch": 0.24588396002588253, + "kl_loss": 0.28071606159210205, + "loss_ib": 0.009803498163819313, + "step": 855 + }, + { + "ce_ib": 9.025272369384766, + "ce_orig": 1.0526149272918701, + "epoch": 0.24588396002588253, + "kl_loss": 0.3657127618789673, + "loss_ib": 0.012682399712502956, + "step": 855 + }, + { + "ce_ib": 5.605438709259033, + "ce_orig": 0.4452979564666748, + "epoch": 0.24617154360486015, + "kl_loss": 0.36353376507759094, + "loss_ib": 0.009240776300430298, + "step": 856 + }, + { + "ce_ib": 8.554245948791504, + "ce_orig": 0.9303341507911682, + "epoch": 0.24617154360486015, + "kl_loss": 0.40140336751937866, + "loss_ib": 0.012568279169499874, + "step": 856 + }, + { + "ce_ib": 8.914340019226074, + "ce_orig": 0.9002864360809326, + "epoch": 0.24617154360486015, + "kl_loss": 0.3354775905609131, + "loss_ib": 0.012269115075469017, + "step": 856 + }, + { + "ce_ib": 8.865965843200684, + "ce_orig": 0.9647238254547119, + "epoch": 0.24617154360486015, + "kl_loss": 0.3433181643486023, + "loss_ib": 0.012299147434532642, + "step": 856 + }, + { + "ce_ib": 10.691956520080566, + "ce_orig": 1.4019811153411865, + "epoch": 0.2464591271838378, + "kl_loss": 0.33202850818634033, + "loss_ib": 0.014012240804731846, + "step": 857 + }, + { + "ce_ib": 6.769617557525635, + "ce_orig": 0.60886549949646, + "epoch": 0.2464591271838378, + "kl_loss": 0.34499895572662354, + "loss_ib": 0.010219607502222061, + "step": 857 + }, + { + "ce_ib": 5.660098075866699, + "ce_orig": 0.6113988757133484, + "epoch": 0.2464591271838378, + "kl_loss": 0.2966403067111969, + "loss_ib": 0.008626501075923443, + "step": 857 + }, + { + "ce_ib": 13.32357406616211, + "ce_orig": 1.3362207412719727, + "epoch": 0.2464591271838378, + "kl_loss": 0.29936474561691284, + "loss_ib": 0.016317222267389297, + "step": 857 + }, + { + "ce_ib": 2.6769988536834717, + "ce_orig": 0.18080325424671173, + "epoch": 0.24674671076281543, + "kl_loss": 0.6789752840995789, + "loss_ib": 0.009466751478612423, + "step": 858 + }, + { + "ce_ib": 8.577744483947754, + "ce_orig": 0.7904664278030396, + "epoch": 0.24674671076281543, + "kl_loss": 0.35377517342567444, + "loss_ib": 0.012115496210753918, + "step": 858 + }, + { + "ce_ib": 9.277144432067871, + "ce_orig": 0.858630359172821, + "epoch": 0.24674671076281543, + "kl_loss": 0.4751337170600891, + "loss_ib": 0.014028482139110565, + "step": 858 + }, + { + "ce_ib": 4.141605377197266, + "ce_orig": 0.31388720870018005, + "epoch": 0.24674671076281543, + "kl_loss": 0.6098456382751465, + "loss_ib": 0.010240061208605766, + "step": 858 + }, + { + "ce_ib": 9.002893447875977, + "ce_orig": 0.7537875175476074, + "epoch": 0.24703429434179308, + "kl_loss": 0.300067663192749, + "loss_ib": 0.012003568932414055, + "step": 859 + }, + { + "ce_ib": 4.289847373962402, + "ce_orig": 0.40423697233200073, + "epoch": 0.24703429434179308, + "kl_loss": 0.37460029125213623, + "loss_ib": 0.008035850711166859, + "step": 859 + }, + { + "ce_ib": 6.030440330505371, + "ce_orig": 0.7502020001411438, + "epoch": 0.24703429434179308, + "kl_loss": 0.301896333694458, + "loss_ib": 0.009049403481185436, + "step": 859 + }, + { + "ce_ib": 5.722672462463379, + "ce_orig": 0.8073954582214355, + "epoch": 0.24703429434179308, + "kl_loss": 0.2869844436645508, + "loss_ib": 0.00859251618385315, + "step": 859 + }, + { + "epoch": 0.24732187792077073, + "grad_norm": 0.09929006546735764, + "learning_rate": 9.929911140862109e-06, + "loss": 0.8739, + "step": 860 + }, + { + "ce_ib": 9.267024040222168, + "ce_orig": 1.2398027181625366, + "epoch": 0.24732187792077073, + "kl_loss": 0.28411030769348145, + "loss_ib": 0.01210812758654356, + "step": 860 + }, + { + "ce_ib": 8.08333683013916, + "ce_orig": 0.8170286417007446, + "epoch": 0.24732187792077073, + "kl_loss": 0.25793078541755676, + "loss_ib": 0.010662645101547241, + "step": 860 + }, + { + "ce_ib": 10.10383129119873, + "ce_orig": 1.1762322187423706, + "epoch": 0.24732187792077073, + "kl_loss": 0.3745589256286621, + "loss_ib": 0.013849420472979546, + "step": 860 + }, + { + "ce_ib": 10.957147598266602, + "ce_orig": 1.4055129289627075, + "epoch": 0.24732187792077073, + "kl_loss": 0.47874391078948975, + "loss_ib": 0.015744587406516075, + "step": 860 + }, + { + "ce_ib": 5.491908073425293, + "ce_orig": 0.8730387091636658, + "epoch": 0.24760946149974836, + "kl_loss": 0.2792191803455353, + "loss_ib": 0.008284100331366062, + "step": 861 + }, + { + "ce_ib": 9.5418701171875, + "ce_orig": 0.7111859321594238, + "epoch": 0.24760946149974836, + "kl_loss": 0.28549331426620483, + "loss_ib": 0.012396802194416523, + "step": 861 + }, + { + "ce_ib": 9.359732627868652, + "ce_orig": 0.5781843066215515, + "epoch": 0.24760946149974836, + "kl_loss": 0.3297385573387146, + "loss_ib": 0.01265711709856987, + "step": 861 + }, + { + "ce_ib": 4.646590232849121, + "ce_orig": 0.6796752214431763, + "epoch": 0.24760946149974836, + "kl_loss": 0.5749114751815796, + "loss_ib": 0.010395705699920654, + "step": 861 + }, + { + "ce_ib": 8.25394344329834, + "ce_orig": 0.9002748131752014, + "epoch": 0.247897045078726, + "kl_loss": 0.3159523904323578, + "loss_ib": 0.01141346711665392, + "step": 862 + }, + { + "ce_ib": 8.197535514831543, + "ce_orig": 0.6030675172805786, + "epoch": 0.247897045078726, + "kl_loss": 0.46316292881965637, + "loss_ib": 0.01282916497439146, + "step": 862 + }, + { + "ce_ib": 8.135879516601562, + "ce_orig": 0.5498018264770508, + "epoch": 0.247897045078726, + "kl_loss": 0.37288355827331543, + "loss_ib": 0.011864714324474335, + "step": 862 + }, + { + "ce_ib": 8.026688575744629, + "ce_orig": 0.6925224661827087, + "epoch": 0.247897045078726, + "kl_loss": 0.3975781798362732, + "loss_ib": 0.012002469971776009, + "step": 862 + }, + { + "ce_ib": 6.015689849853516, + "ce_orig": 0.795344889163971, + "epoch": 0.24818462865770363, + "kl_loss": 0.2662222981452942, + "loss_ib": 0.008677912876009941, + "step": 863 + }, + { + "ce_ib": 6.312599182128906, + "ce_orig": 0.4928840100765228, + "epoch": 0.24818462865770363, + "kl_loss": 0.28389108180999756, + "loss_ib": 0.009151509962975979, + "step": 863 + }, + { + "ce_ib": 12.868780136108398, + "ce_orig": 1.7627007961273193, + "epoch": 0.24818462865770363, + "kl_loss": 0.2926875352859497, + "loss_ib": 0.01579565554857254, + "step": 863 + }, + { + "ce_ib": 13.420784950256348, + "ce_orig": 1.5475761890411377, + "epoch": 0.24818462865770363, + "kl_loss": 0.9178951382637024, + "loss_ib": 0.022599736228585243, + "step": 863 + }, + { + "ce_ib": 9.601221084594727, + "ce_orig": 0.6273306012153625, + "epoch": 0.24847221223668128, + "kl_loss": 0.44295597076416016, + "loss_ib": 0.014030780643224716, + "step": 864 + }, + { + "ce_ib": 10.278837203979492, + "ce_orig": 1.2835111618041992, + "epoch": 0.24847221223668128, + "kl_loss": 0.33239656686782837, + "loss_ib": 0.013602802529931068, + "step": 864 + }, + { + "ce_ib": 10.454754829406738, + "ce_orig": 0.7932427525520325, + "epoch": 0.24847221223668128, + "kl_loss": 0.39738035202026367, + "loss_ib": 0.014428557828068733, + "step": 864 + }, + { + "ce_ib": 8.318263053894043, + "ce_orig": 1.0965704917907715, + "epoch": 0.24847221223668128, + "kl_loss": 0.27918365597724915, + "loss_ib": 0.011110099032521248, + "step": 864 + }, + { + "epoch": 0.24875979581565894, + "grad_norm": 0.09105879068374634, + "learning_rate": 9.928610301807134e-06, + "loss": 0.9249, + "step": 865 + }, + { + "ce_ib": 11.053398132324219, + "ce_orig": 0.98641037940979, + "epoch": 0.24875979581565894, + "kl_loss": 0.34464025497436523, + "loss_ib": 0.014499801211059093, + "step": 865 + }, + { + "ce_ib": 10.561725616455078, + "ce_orig": 0.9344740509986877, + "epoch": 0.24875979581565894, + "kl_loss": 0.40244221687316895, + "loss_ib": 0.014586147852241993, + "step": 865 + }, + { + "ce_ib": 11.87633991241455, + "ce_orig": 1.324188470840454, + "epoch": 0.24875979581565894, + "kl_loss": 0.3682016134262085, + "loss_ib": 0.015558355487883091, + "step": 865 + }, + { + "ce_ib": 6.436470985412598, + "ce_orig": 0.5687084794044495, + "epoch": 0.24875979581565894, + "kl_loss": 0.3997005224227905, + "loss_ib": 0.010433475486934185, + "step": 865 + }, + { + "ce_ib": 7.637430191040039, + "ce_orig": 0.796751856803894, + "epoch": 0.24904737939463656, + "kl_loss": 0.40374982357025146, + "loss_ib": 0.011674928478896618, + "step": 866 + }, + { + "ce_ib": 6.677864074707031, + "ce_orig": 0.6250858902931213, + "epoch": 0.24904737939463656, + "kl_loss": 0.38111498951911926, + "loss_ib": 0.010489013977348804, + "step": 866 + }, + { + "ce_ib": 12.016997337341309, + "ce_orig": 1.5606534481048584, + "epoch": 0.24904737939463656, + "kl_loss": 0.3536309599876404, + "loss_ib": 0.0155533067882061, + "step": 866 + }, + { + "ce_ib": 8.68997573852539, + "ce_orig": 1.1279748678207397, + "epoch": 0.24904737939463656, + "kl_loss": 0.4000932574272156, + "loss_ib": 0.012690908275544643, + "step": 866 + }, + { + "ce_ib": 6.430920124053955, + "ce_orig": 0.6035857200622559, + "epoch": 0.2493349629736142, + "kl_loss": 0.3381291925907135, + "loss_ib": 0.009812211617827415, + "step": 867 + }, + { + "ce_ib": 6.76858377456665, + "ce_orig": 0.8521968126296997, + "epoch": 0.2493349629736142, + "kl_loss": 0.2716369330883026, + "loss_ib": 0.009484952315688133, + "step": 867 + }, + { + "ce_ib": 15.660572052001953, + "ce_orig": 2.287351608276367, + "epoch": 0.2493349629736142, + "kl_loss": 0.33299481868743896, + "loss_ib": 0.018990520387887955, + "step": 867 + }, + { + "ce_ib": 7.993679523468018, + "ce_orig": 0.547291100025177, + "epoch": 0.2493349629736142, + "kl_loss": 0.7522663474082947, + "loss_ib": 0.015516342595219612, + "step": 867 + }, + { + "ce_ib": 9.38365364074707, + "ce_orig": 0.9876450300216675, + "epoch": 0.24962254655259183, + "kl_loss": 0.31372547149658203, + "loss_ib": 0.012520909309387207, + "step": 868 + }, + { + "ce_ib": 13.470992088317871, + "ce_orig": 1.528100609779358, + "epoch": 0.24962254655259183, + "kl_loss": 0.36726510524749756, + "loss_ib": 0.017143642529845238, + "step": 868 + }, + { + "ce_ib": 6.063528060913086, + "ce_orig": 0.8686865568161011, + "epoch": 0.24962254655259183, + "kl_loss": 0.291568398475647, + "loss_ib": 0.008979211561381817, + "step": 868 + }, + { + "ce_ib": 9.594892501831055, + "ce_orig": 0.7062548398971558, + "epoch": 0.24962254655259183, + "kl_loss": 0.7296528816223145, + "loss_ib": 0.016891421750187874, + "step": 868 + }, + { + "ce_ib": 9.050810813903809, + "ce_orig": 0.8255693912506104, + "epoch": 0.24991013013156949, + "kl_loss": 0.4382583498954773, + "loss_ib": 0.013433394022285938, + "step": 869 + }, + { + "ce_ib": 6.191425800323486, + "ce_orig": 0.7848706841468811, + "epoch": 0.24991013013156949, + "kl_loss": 0.22416508197784424, + "loss_ib": 0.008433076553046703, + "step": 869 + }, + { + "ce_ib": 6.417364120483398, + "ce_orig": 0.7727878093719482, + "epoch": 0.24991013013156949, + "kl_loss": 0.2643115520477295, + "loss_ib": 0.009060479700565338, + "step": 869 + }, + { + "ce_ib": 7.678783893585205, + "ce_orig": 0.7737617492675781, + "epoch": 0.24991013013156949, + "kl_loss": 0.42957326769828796, + "loss_ib": 0.0119745172560215, + "step": 869 + }, + { + "epoch": 0.2501977137105471, + "grad_norm": 0.10078983008861542, + "learning_rate": 9.927297588585984e-06, + "loss": 0.8886, + "step": 870 + }, + { + "ce_ib": 10.537775039672852, + "ce_orig": 1.0423405170440674, + "epoch": 0.2501977137105471, + "kl_loss": 0.3949153423309326, + "loss_ib": 0.014486928470432758, + "step": 870 + }, + { + "ce_ib": 5.604185581207275, + "ce_orig": 0.5952542424201965, + "epoch": 0.2501977137105471, + "kl_loss": 0.32443949580192566, + "loss_ib": 0.008848579600453377, + "step": 870 + }, + { + "ce_ib": 11.089546203613281, + "ce_orig": 0.9591896533966064, + "epoch": 0.2501977137105471, + "kl_loss": 0.250461608171463, + "loss_ib": 0.013594161719083786, + "step": 870 + }, + { + "ce_ib": 6.729354381561279, + "ce_orig": 1.008842945098877, + "epoch": 0.2501977137105471, + "kl_loss": 0.25118446350097656, + "loss_ib": 0.009241199120879173, + "step": 870 + }, + { + "ce_ib": 9.652894973754883, + "ce_orig": 1.2227216958999634, + "epoch": 0.2504852972895248, + "kl_loss": 0.4000895023345947, + "loss_ib": 0.013653790578246117, + "step": 871 + }, + { + "ce_ib": 9.622594833374023, + "ce_orig": 0.9795891642570496, + "epoch": 0.2504852972895248, + "kl_loss": 0.25612586736679077, + "loss_ib": 0.012183853425085545, + "step": 871 + }, + { + "ce_ib": 7.215780735015869, + "ce_orig": 0.4193066656589508, + "epoch": 0.2504852972895248, + "kl_loss": 0.4899890422821045, + "loss_ib": 0.012115671299397945, + "step": 871 + }, + { + "ce_ib": 6.65898323059082, + "ce_orig": 0.8104040622711182, + "epoch": 0.2504852972895248, + "kl_loss": 0.26327258348464966, + "loss_ib": 0.00929170846939087, + "step": 871 + }, + { + "ce_ib": 4.725259780883789, + "ce_orig": 0.3476851284503937, + "epoch": 0.2507728808685024, + "kl_loss": 0.6398271322250366, + "loss_ib": 0.011123530566692352, + "step": 872 + }, + { + "ce_ib": 5.815470218658447, + "ce_orig": 0.7363677620887756, + "epoch": 0.2507728808685024, + "kl_loss": 0.5072571039199829, + "loss_ib": 0.010888040997087955, + "step": 872 + }, + { + "ce_ib": 7.01890230178833, + "ce_orig": 0.5536361336708069, + "epoch": 0.2507728808685024, + "kl_loss": 0.3183189928531647, + "loss_ib": 0.010202092118561268, + "step": 872 + }, + { + "ce_ib": 4.89237642288208, + "ce_orig": 0.5722980499267578, + "epoch": 0.2507728808685024, + "kl_loss": 0.30087870359420776, + "loss_ib": 0.007901162840425968, + "step": 872 + }, + { + "ce_ib": 7.8064866065979, + "ce_orig": 1.0494457483291626, + "epoch": 0.25106046444748004, + "kl_loss": 0.27215179800987244, + "loss_ib": 0.010528003796935081, + "step": 873 + }, + { + "ce_ib": 6.676516056060791, + "ce_orig": 0.6909357905387878, + "epoch": 0.25106046444748004, + "kl_loss": 0.36614060401916504, + "loss_ib": 0.010337922722101212, + "step": 873 + }, + { + "ce_ib": 8.948216438293457, + "ce_orig": 0.7401660084724426, + "epoch": 0.25106046444748004, + "kl_loss": 0.3224928677082062, + "loss_ib": 0.012173144146800041, + "step": 873 + }, + { + "ce_ib": 5.891620635986328, + "ce_orig": 0.8088377714157104, + "epoch": 0.25106046444748004, + "kl_loss": 0.3477107882499695, + "loss_ib": 0.009368727914988995, + "step": 873 + }, + { + "ce_ib": 8.354748725891113, + "ce_orig": 0.9080251455307007, + "epoch": 0.2513480480264577, + "kl_loss": 0.4312264621257782, + "loss_ib": 0.012667013332247734, + "step": 874 + }, + { + "ce_ib": 9.125079154968262, + "ce_orig": 1.2377448081970215, + "epoch": 0.2513480480264577, + "kl_loss": 0.28963541984558105, + "loss_ib": 0.012021434493362904, + "step": 874 + }, + { + "ce_ib": 6.146969795227051, + "ce_orig": 0.7261344194412231, + "epoch": 0.2513480480264577, + "kl_loss": 0.2670226991176605, + "loss_ib": 0.008817196823656559, + "step": 874 + }, + { + "ce_ib": 8.836943626403809, + "ce_orig": 0.9964169859886169, + "epoch": 0.2513480480264577, + "kl_loss": 0.29464757442474365, + "loss_ib": 0.011783418245613575, + "step": 874 + }, + { + "epoch": 0.25163563160543534, + "grad_norm": 0.09101825952529907, + "learning_rate": 9.925973004361295e-06, + "loss": 0.9106, + "step": 875 + }, + { + "ce_ib": 10.854291915893555, + "ce_orig": 1.7030887603759766, + "epoch": 0.25163563160543534, + "kl_loss": 0.2948107421398163, + "loss_ib": 0.013802398927509785, + "step": 875 + }, + { + "ce_ib": 5.0278215408325195, + "ce_orig": 0.6354923844337463, + "epoch": 0.25163563160543534, + "kl_loss": 0.27879562973976135, + "loss_ib": 0.007815778255462646, + "step": 875 + }, + { + "ce_ib": 3.9477179050445557, + "ce_orig": 0.27782636880874634, + "epoch": 0.25163563160543534, + "kl_loss": 0.5922541618347168, + "loss_ib": 0.009870259091258049, + "step": 875 + }, + { + "ce_ib": 9.96784782409668, + "ce_orig": 0.8732298016548157, + "epoch": 0.25163563160543534, + "kl_loss": 0.3482765555381775, + "loss_ib": 0.01345061231404543, + "step": 875 + }, + { + "ce_ib": 10.751762390136719, + "ce_orig": 0.7869917750358582, + "epoch": 0.25192321518441296, + "kl_loss": 0.35357779264450073, + "loss_ib": 0.014287540689110756, + "step": 876 + }, + { + "ce_ib": 11.038793563842773, + "ce_orig": 1.3073241710662842, + "epoch": 0.25192321518441296, + "kl_loss": 0.3102778494358063, + "loss_ib": 0.014141570776700974, + "step": 876 + }, + { + "ce_ib": 10.809063911437988, + "ce_orig": 1.3352001905441284, + "epoch": 0.25192321518441296, + "kl_loss": 0.266183078289032, + "loss_ib": 0.013470894657075405, + "step": 876 + }, + { + "ce_ib": 9.622008323669434, + "ce_orig": 1.2020785808563232, + "epoch": 0.25192321518441296, + "kl_loss": 0.29643064737319946, + "loss_ib": 0.012586314231157303, + "step": 876 + }, + { + "ce_ib": 4.619173526763916, + "ce_orig": 0.5329123735427856, + "epoch": 0.2522107987633906, + "kl_loss": 0.25063222646713257, + "loss_ib": 0.007125495467334986, + "step": 877 + }, + { + "ce_ib": 14.786099433898926, + "ce_orig": 1.6358728408813477, + "epoch": 0.2522107987633906, + "kl_loss": 0.3289327919483185, + "loss_ib": 0.018075427040457726, + "step": 877 + }, + { + "ce_ib": 6.436891555786133, + "ce_orig": 0.6348329782485962, + "epoch": 0.2522107987633906, + "kl_loss": 0.24020250141620636, + "loss_ib": 0.008838916197419167, + "step": 877 + }, + { + "ce_ib": 8.219223976135254, + "ce_orig": 0.4184137284755707, + "epoch": 0.2522107987633906, + "kl_loss": 0.2933961749076843, + "loss_ib": 0.011153184808790684, + "step": 877 + }, + { + "ce_ib": 11.890969276428223, + "ce_orig": 1.2066895961761475, + "epoch": 0.25249838234236827, + "kl_loss": 0.3087109327316284, + "loss_ib": 0.014978078193962574, + "step": 878 + }, + { + "ce_ib": 8.250067710876465, + "ce_orig": 0.8217212557792664, + "epoch": 0.25249838234236827, + "kl_loss": 0.30817484855651855, + "loss_ib": 0.011331815272569656, + "step": 878 + }, + { + "ce_ib": 5.804820537567139, + "ce_orig": 0.7277146577835083, + "epoch": 0.25249838234236827, + "kl_loss": 0.2948615252971649, + "loss_ib": 0.00875343568623066, + "step": 878 + }, + { + "ce_ib": 7.200780868530273, + "ce_orig": 0.7634241580963135, + "epoch": 0.25249838234236827, + "kl_loss": 0.434295117855072, + "loss_ib": 0.011543731205165386, + "step": 878 + }, + { + "ce_ib": 8.42392635345459, + "ce_orig": 0.452232301235199, + "epoch": 0.2527859659213459, + "kl_loss": 0.40239661931991577, + "loss_ib": 0.012447891756892204, + "step": 879 + }, + { + "ce_ib": 9.636970520019531, + "ce_orig": 0.909797728061676, + "epoch": 0.2527859659213459, + "kl_loss": 0.2469838708639145, + "loss_ib": 0.012106809765100479, + "step": 879 + }, + { + "ce_ib": 10.866209983825684, + "ce_orig": 0.9333135485649109, + "epoch": 0.2527859659213459, + "kl_loss": 0.2821698486804962, + "loss_ib": 0.01368790864944458, + "step": 879 + }, + { + "ce_ib": 4.037529468536377, + "ce_orig": 0.6108816266059875, + "epoch": 0.2527859659213459, + "kl_loss": 0.2413032352924347, + "loss_ib": 0.006450561806559563, + "step": 879 + }, + { + "epoch": 0.2530735495003235, + "grad_norm": 0.09534526616334915, + "learning_rate": 9.924636552324296e-06, + "loss": 0.8423, + "step": 880 + }, + { + "ce_ib": 7.282698631286621, + "ce_orig": 1.1506547927856445, + "epoch": 0.2530735495003235, + "kl_loss": 0.34004634618759155, + "loss_ib": 0.01068316213786602, + "step": 880 + }, + { + "ce_ib": 11.481069564819336, + "ce_orig": 1.6637837886810303, + "epoch": 0.2530735495003235, + "kl_loss": 0.6563593149185181, + "loss_ib": 0.018044661730527878, + "step": 880 + }, + { + "ce_ib": 10.626919746398926, + "ce_orig": 1.1782991886138916, + "epoch": 0.2530735495003235, + "kl_loss": 0.328736275434494, + "loss_ib": 0.013914283365011215, + "step": 880 + }, + { + "ce_ib": 11.945712089538574, + "ce_orig": 1.6569890975952148, + "epoch": 0.2530735495003235, + "kl_loss": 0.39438027143478394, + "loss_ib": 0.015889516100287437, + "step": 880 + }, + { + "ce_ib": 6.359279632568359, + "ce_orig": 0.8192113637924194, + "epoch": 0.2533611330793012, + "kl_loss": 0.3645118176937103, + "loss_ib": 0.010004397481679916, + "step": 881 + }, + { + "ce_ib": 10.964028358459473, + "ce_orig": 0.7998406887054443, + "epoch": 0.2533611330793012, + "kl_loss": 0.49998754262924194, + "loss_ib": 0.015963904559612274, + "step": 881 + }, + { + "ce_ib": 11.182770729064941, + "ce_orig": 1.3286393880844116, + "epoch": 0.2533611330793012, + "kl_loss": 0.4621961712837219, + "loss_ib": 0.015804732218384743, + "step": 881 + }, + { + "ce_ib": 8.53327751159668, + "ce_orig": 0.8316857218742371, + "epoch": 0.2533611330793012, + "kl_loss": 0.3155197203159332, + "loss_ib": 0.0116884745657444, + "step": 881 + }, + { + "ce_ib": 6.411721229553223, + "ce_orig": 0.7425188422203064, + "epoch": 0.2536487166582788, + "kl_loss": 0.2903468906879425, + "loss_ib": 0.00931518990546465, + "step": 882 + }, + { + "ce_ib": 6.149788856506348, + "ce_orig": 0.8823583722114563, + "epoch": 0.2536487166582788, + "kl_loss": 0.2212170958518982, + "loss_ib": 0.00836195982992649, + "step": 882 + }, + { + "ce_ib": 9.317445755004883, + "ce_orig": 0.932159423828125, + "epoch": 0.2536487166582788, + "kl_loss": 0.2839629650115967, + "loss_ib": 0.012157075107097626, + "step": 882 + }, + { + "ce_ib": 7.935296535491943, + "ce_orig": 0.9251559376716614, + "epoch": 0.2536487166582788, + "kl_loss": 0.3732267916202545, + "loss_ib": 0.011667564511299133, + "step": 882 + }, + { + "ce_ib": 5.324270725250244, + "ce_orig": 0.5246868133544922, + "epoch": 0.25393630023725644, + "kl_loss": 0.27826499938964844, + "loss_ib": 0.00810692086815834, + "step": 883 + }, + { + "ce_ib": 7.959061145782471, + "ce_orig": 1.0556241273880005, + "epoch": 0.25393630023725644, + "kl_loss": 0.2916038930416107, + "loss_ib": 0.010875099338591099, + "step": 883 + }, + { + "ce_ib": 6.867218971252441, + "ce_orig": 0.6588130593299866, + "epoch": 0.25393630023725644, + "kl_loss": 0.29453879594802856, + "loss_ib": 0.009812606498599052, + "step": 883 + }, + { + "ce_ib": 4.161896228790283, + "ce_orig": 0.4560477137565613, + "epoch": 0.25393630023725644, + "kl_loss": 0.2710115611553192, + "loss_ib": 0.006872011814266443, + "step": 883 + }, + { + "ce_ib": 6.286719799041748, + "ce_orig": 0.7286011576652527, + "epoch": 0.2542238838162341, + "kl_loss": 0.3385438621044159, + "loss_ib": 0.009672158397734165, + "step": 884 + }, + { + "ce_ib": 10.294978141784668, + "ce_orig": 0.775021493434906, + "epoch": 0.2542238838162341, + "kl_loss": 0.3102239966392517, + "loss_ib": 0.01339721865952015, + "step": 884 + }, + { + "ce_ib": 5.616055488586426, + "ce_orig": 0.5666757822036743, + "epoch": 0.2542238838162341, + "kl_loss": 0.3340657651424408, + "loss_ib": 0.00895671360194683, + "step": 884 + }, + { + "ce_ib": 6.7213826179504395, + "ce_orig": 0.6700468063354492, + "epoch": 0.2542238838162341, + "kl_loss": 0.259299099445343, + "loss_ib": 0.009314373135566711, + "step": 884 + }, + { + "epoch": 0.25451146739521174, + "grad_norm": 0.09304392337799072, + "learning_rate": 9.92328823569481e-06, + "loss": 0.9081, + "step": 885 + }, + { + "ce_ib": 10.979228973388672, + "ce_orig": 1.314060091972351, + "epoch": 0.25451146739521174, + "kl_loss": 0.4535999298095703, + "loss_ib": 0.015515227802097797, + "step": 885 + }, + { + "ce_ib": 9.044007301330566, + "ce_orig": 1.0800416469573975, + "epoch": 0.25451146739521174, + "kl_loss": 0.25342878699302673, + "loss_ib": 0.01157829537987709, + "step": 885 + }, + { + "ce_ib": 7.7309441566467285, + "ce_orig": 0.737576425075531, + "epoch": 0.25451146739521174, + "kl_loss": 0.25567033886909485, + "loss_ib": 0.010287647135555744, + "step": 885 + }, + { + "ce_ib": 5.4991774559021, + "ce_orig": 0.67442387342453, + "epoch": 0.25451146739521174, + "kl_loss": 0.22683289647102356, + "loss_ib": 0.007767506875097752, + "step": 885 + }, + { + "ce_ib": 9.953997611999512, + "ce_orig": 1.5094314813613892, + "epoch": 0.25479905097418937, + "kl_loss": 0.23526480793952942, + "loss_ib": 0.012306645512580872, + "step": 886 + }, + { + "ce_ib": 5.9002685546875, + "ce_orig": 0.8582682013511658, + "epoch": 0.25479905097418937, + "kl_loss": 0.2747255265712738, + "loss_ib": 0.008647523820400238, + "step": 886 + }, + { + "ce_ib": 3.582958698272705, + "ce_orig": 0.4530331492424011, + "epoch": 0.25479905097418937, + "kl_loss": 0.22470691800117493, + "loss_ib": 0.005830028094351292, + "step": 886 + }, + { + "ce_ib": 8.590431213378906, + "ce_orig": 0.7905410528182983, + "epoch": 0.25479905097418937, + "kl_loss": 0.32975587248802185, + "loss_ib": 0.011887989938259125, + "step": 886 + }, + { + "ce_ib": 12.112462997436523, + "ce_orig": 1.4443359375, + "epoch": 0.255086634553167, + "kl_loss": 0.39053958654403687, + "loss_ib": 0.016017857939004898, + "step": 887 + }, + { + "ce_ib": 7.377198219299316, + "ce_orig": 0.8195330500602722, + "epoch": 0.255086634553167, + "kl_loss": 0.3151588439941406, + "loss_ib": 0.010528786107897758, + "step": 887 + }, + { + "ce_ib": 5.7976861000061035, + "ce_orig": 0.6533734798431396, + "epoch": 0.255086634553167, + "kl_loss": 0.5457720756530762, + "loss_ib": 0.011255406774580479, + "step": 887 + }, + { + "ce_ib": 3.642547369003296, + "ce_orig": 0.3404500484466553, + "epoch": 0.255086634553167, + "kl_loss": 0.31931373476982117, + "loss_ib": 0.006835684645920992, + "step": 887 + }, + { + "ce_ib": 9.251564025878906, + "ce_orig": 1.1658694744110107, + "epoch": 0.25537421813214467, + "kl_loss": 0.276309609413147, + "loss_ib": 0.012014660984277725, + "step": 888 + }, + { + "ce_ib": 9.521452903747559, + "ce_orig": 1.0355658531188965, + "epoch": 0.25537421813214467, + "kl_loss": 0.3186280131340027, + "loss_ib": 0.012707732617855072, + "step": 888 + }, + { + "ce_ib": 8.611248970031738, + "ce_orig": 1.067291498184204, + "epoch": 0.25537421813214467, + "kl_loss": 0.3613908290863037, + "loss_ib": 0.012225157581269741, + "step": 888 + }, + { + "ce_ib": 6.128733158111572, + "ce_orig": 0.7295544147491455, + "epoch": 0.25537421813214467, + "kl_loss": 0.2833183705806732, + "loss_ib": 0.008961916901171207, + "step": 888 + }, + { + "ce_ib": 7.8607683181762695, + "ce_orig": 0.9411081075668335, + "epoch": 0.2556618017111223, + "kl_loss": 0.3833426833152771, + "loss_ib": 0.011694194748997688, + "step": 889 + }, + { + "ce_ib": 7.081918716430664, + "ce_orig": 0.7683687806129456, + "epoch": 0.2556618017111223, + "kl_loss": 0.3560516834259033, + "loss_ib": 0.010642435401678085, + "step": 889 + }, + { + "ce_ib": 6.406893253326416, + "ce_orig": 0.6722978353500366, + "epoch": 0.2556618017111223, + "kl_loss": 0.2877756357192993, + "loss_ib": 0.009284649044275284, + "step": 889 + }, + { + "ce_ib": 7.919890403747559, + "ce_orig": 1.0300544500350952, + "epoch": 0.2556618017111223, + "kl_loss": 0.3256221413612366, + "loss_ib": 0.011176111176609993, + "step": 889 + }, + { + "epoch": 0.2559493852900999, + "grad_norm": 0.08482884615659714, + "learning_rate": 9.921928057721242e-06, + "loss": 0.8751, + "step": 890 + }, + { + "ce_ib": 8.637929916381836, + "ce_orig": 0.9734044075012207, + "epoch": 0.2559493852900999, + "kl_loss": 0.32373255491256714, + "loss_ib": 0.011875255033373833, + "step": 890 + }, + { + "ce_ib": 8.563017845153809, + "ce_orig": 0.4407178461551666, + "epoch": 0.2559493852900999, + "kl_loss": 0.37603944540023804, + "loss_ib": 0.012323413044214249, + "step": 890 + }, + { + "ce_ib": 4.576637268066406, + "ce_orig": 0.5486153960227966, + "epoch": 0.2559493852900999, + "kl_loss": 0.2956191301345825, + "loss_ib": 0.007532828953117132, + "step": 890 + }, + { + "ce_ib": 6.462611675262451, + "ce_orig": 0.5243940353393555, + "epoch": 0.2559493852900999, + "kl_loss": 0.2781410813331604, + "loss_ib": 0.009244021959602833, + "step": 890 + }, + { + "ce_ib": 8.214609146118164, + "ce_orig": 0.5406211614608765, + "epoch": 0.2562369688690776, + "kl_loss": 0.40383273363113403, + "loss_ib": 0.012252936139702797, + "step": 891 + }, + { + "ce_ib": 9.679584503173828, + "ce_orig": 0.9921509027481079, + "epoch": 0.2562369688690776, + "kl_loss": 0.3201594352722168, + "loss_ib": 0.01288117840886116, + "step": 891 + }, + { + "ce_ib": 8.80469036102295, + "ce_orig": 1.0968488454818726, + "epoch": 0.2562369688690776, + "kl_loss": 0.683159589767456, + "loss_ib": 0.01563628576695919, + "step": 891 + }, + { + "ce_ib": 6.155488014221191, + "ce_orig": 0.5672451853752136, + "epoch": 0.2562369688690776, + "kl_loss": 0.39383214712142944, + "loss_ib": 0.010093809105455875, + "step": 891 + }, + { + "ce_ib": 6.969648838043213, + "ce_orig": 0.7994107604026794, + "epoch": 0.2565245524480552, + "kl_loss": 0.3125140070915222, + "loss_ib": 0.010094788856804371, + "step": 892 + }, + { + "ce_ib": 7.141666412353516, + "ce_orig": 0.4777945876121521, + "epoch": 0.2565245524480552, + "kl_loss": 0.3340398967266083, + "loss_ib": 0.010482065379619598, + "step": 892 + }, + { + "ce_ib": 8.195847511291504, + "ce_orig": 1.0143331289291382, + "epoch": 0.2565245524480552, + "kl_loss": 0.4175947308540344, + "loss_ib": 0.012371795251965523, + "step": 892 + }, + { + "ce_ib": 10.73903751373291, + "ce_orig": 0.8994997143745422, + "epoch": 0.2565245524480552, + "kl_loss": 0.2581188380718231, + "loss_ib": 0.013320226222276688, + "step": 892 + }, + { + "ce_ib": 3.9347095489501953, + "ce_orig": 0.3383885324001312, + "epoch": 0.25681213602703284, + "kl_loss": 0.3705168068408966, + "loss_ib": 0.007639877498149872, + "step": 893 + }, + { + "ce_ib": 9.592818260192871, + "ce_orig": 0.9891217947006226, + "epoch": 0.25681213602703284, + "kl_loss": 0.3685084581375122, + "loss_ib": 0.013277902267873287, + "step": 893 + }, + { + "ce_ib": 8.22640609741211, + "ce_orig": 0.8431357145309448, + "epoch": 0.25681213602703284, + "kl_loss": 0.28724151849746704, + "loss_ib": 0.011098820716142654, + "step": 893 + }, + { + "ce_ib": 6.6608357429504395, + "ce_orig": 0.7445570230484009, + "epoch": 0.25681213602703284, + "kl_loss": 0.23527176678180695, + "loss_ib": 0.009013553149998188, + "step": 893 + }, + { + "ce_ib": 4.6245598793029785, + "ce_orig": 0.5491394400596619, + "epoch": 0.2570997196060105, + "kl_loss": 0.2531249523162842, + "loss_ib": 0.007155809085816145, + "step": 894 + }, + { + "ce_ib": 7.3063063621521, + "ce_orig": 0.6298426985740662, + "epoch": 0.2570997196060105, + "kl_loss": 0.25430071353912354, + "loss_ib": 0.009849313646554947, + "step": 894 + }, + { + "ce_ib": 9.165714263916016, + "ce_orig": 1.1000019311904907, + "epoch": 0.2570997196060105, + "kl_loss": 0.5990852117538452, + "loss_ib": 0.015156567096710205, + "step": 894 + }, + { + "ce_ib": 8.9033842086792, + "ce_orig": 0.9108843207359314, + "epoch": 0.2570997196060105, + "kl_loss": 0.30792462825775146, + "loss_ib": 0.011982630006968975, + "step": 894 + }, + { + "epoch": 0.25738730318498815, + "grad_norm": 0.09863123297691345, + "learning_rate": 9.92055602168058e-06, + "loss": 0.8698, + "step": 895 + }, + { + "ce_ib": 7.7511515617370605, + "ce_orig": 0.7273695468902588, + "epoch": 0.25738730318498815, + "kl_loss": 0.2521267533302307, + "loss_ib": 0.010272419080138206, + "step": 895 + }, + { + "ce_ib": 8.19736385345459, + "ce_orig": 0.666002094745636, + "epoch": 0.25738730318498815, + "kl_loss": 0.45970863103866577, + "loss_ib": 0.012794449925422668, + "step": 895 + }, + { + "ce_ib": 8.727402687072754, + "ce_orig": 0.5750917196273804, + "epoch": 0.25738730318498815, + "kl_loss": 0.33787843585014343, + "loss_ib": 0.012106186710298061, + "step": 895 + }, + { + "ce_ib": 7.301302433013916, + "ce_orig": 0.49371325969696045, + "epoch": 0.25738730318498815, + "kl_loss": 0.39213109016418457, + "loss_ib": 0.011222613044083118, + "step": 895 + }, + { + "ce_ib": 8.087718963623047, + "ce_orig": 1.0477973222732544, + "epoch": 0.25767488676396577, + "kl_loss": 0.26531845331192017, + "loss_ib": 0.010740903206169605, + "step": 896 + }, + { + "ce_ib": 14.437392234802246, + "ce_orig": 1.9533960819244385, + "epoch": 0.25767488676396577, + "kl_loss": 0.3840804100036621, + "loss_ib": 0.018278196454048157, + "step": 896 + }, + { + "ce_ib": 12.922438621520996, + "ce_orig": 1.4912583827972412, + "epoch": 0.25767488676396577, + "kl_loss": 0.33412793278694153, + "loss_ib": 0.016263717785477638, + "step": 896 + }, + { + "ce_ib": 9.707921028137207, + "ce_orig": 1.1894207000732422, + "epoch": 0.25767488676396577, + "kl_loss": 0.2657119035720825, + "loss_ib": 0.012365040369331837, + "step": 896 + }, + { + "ce_ib": 9.557899475097656, + "ce_orig": 0.8531953692436218, + "epoch": 0.2579624703429434, + "kl_loss": 0.2381764054298401, + "loss_ib": 0.01193966343998909, + "step": 897 + }, + { + "ce_ib": 7.224669456481934, + "ce_orig": 1.0568636655807495, + "epoch": 0.2579624703429434, + "kl_loss": 0.2253345549106598, + "loss_ib": 0.009478014893829823, + "step": 897 + }, + { + "ce_ib": 9.596117973327637, + "ce_orig": 0.8501659035682678, + "epoch": 0.2579624703429434, + "kl_loss": 0.3259985148906708, + "loss_ib": 0.012856102548539639, + "step": 897 + }, + { + "ce_ib": 8.441751480102539, + "ce_orig": 0.8498370051383972, + "epoch": 0.2579624703429434, + "kl_loss": 0.34127742052078247, + "loss_ib": 0.011854525655508041, + "step": 897 + }, + { + "ce_ib": 9.776628494262695, + "ce_orig": 1.1606560945510864, + "epoch": 0.2582500539219211, + "kl_loss": 0.39107829332351685, + "loss_ib": 0.013687411323189735, + "step": 898 + }, + { + "ce_ib": 8.28480052947998, + "ce_orig": 0.8396362066268921, + "epoch": 0.2582500539219211, + "kl_loss": 0.5774872899055481, + "loss_ib": 0.014059673063457012, + "step": 898 + }, + { + "ce_ib": 5.942250728607178, + "ce_orig": 0.570982038974762, + "epoch": 0.2582500539219211, + "kl_loss": 0.35418200492858887, + "loss_ib": 0.009484071284532547, + "step": 898 + }, + { + "ce_ib": 13.332066535949707, + "ce_orig": 1.999030590057373, + "epoch": 0.2582500539219211, + "kl_loss": 0.32090240716934204, + "loss_ib": 0.016541089862585068, + "step": 898 + }, + { + "ce_ib": 6.9958176612854, + "ce_orig": 0.950183629989624, + "epoch": 0.2585376375008987, + "kl_loss": 0.2381882220506668, + "loss_ib": 0.00937770027667284, + "step": 899 + }, + { + "ce_ib": 10.04285717010498, + "ce_orig": 0.8781871199607849, + "epoch": 0.2585376375008987, + "kl_loss": 0.3415898382663727, + "loss_ib": 0.013458754867315292, + "step": 899 + }, + { + "ce_ib": 13.117063522338867, + "ce_orig": 1.5412489175796509, + "epoch": 0.2585376375008987, + "kl_loss": 0.28792649507522583, + "loss_ib": 0.015996329486370087, + "step": 899 + }, + { + "ce_ib": 5.770230293273926, + "ce_orig": 0.4059688448905945, + "epoch": 0.2585376375008987, + "kl_loss": 0.3441586196422577, + "loss_ib": 0.00921181682497263, + "step": 899 + }, + { + "epoch": 0.2588252210798763, + "grad_norm": 0.11080905795097351, + "learning_rate": 9.919172130878378e-06, + "loss": 0.8609, + "step": 900 + }, + { + "ce_ib": 7.81713342666626, + "ce_orig": 0.7552506923675537, + "epoch": 0.2588252210798763, + "kl_loss": 0.3844016194343567, + "loss_ib": 0.011661150492727757, + "step": 900 + }, + { + "ce_ib": 7.249913215637207, + "ce_orig": 0.968014657497406, + "epoch": 0.2588252210798763, + "kl_loss": 0.25877052545547485, + "loss_ib": 0.009837618097662926, + "step": 900 + }, + { + "ce_ib": 7.9647650718688965, + "ce_orig": 0.5272249579429626, + "epoch": 0.2588252210798763, + "kl_loss": 0.284597784280777, + "loss_ib": 0.01081074308604002, + "step": 900 + }, + { + "ce_ib": 8.811546325683594, + "ce_orig": 0.7540147304534912, + "epoch": 0.2588252210798763, + "kl_loss": 0.39285385608673096, + "loss_ib": 0.012740084901452065, + "step": 900 + }, + { + "ce_ib": 4.880631446838379, + "ce_orig": 0.5076504945755005, + "epoch": 0.259112804658854, + "kl_loss": 0.26206904649734497, + "loss_ib": 0.007501321844756603, + "step": 901 + }, + { + "ce_ib": 7.368719100952148, + "ce_orig": 0.6915988326072693, + "epoch": 0.259112804658854, + "kl_loss": 0.4998074769973755, + "loss_ib": 0.012366793118417263, + "step": 901 + }, + { + "ce_ib": 4.789905071258545, + "ce_orig": 0.756260335445404, + "epoch": 0.259112804658854, + "kl_loss": 0.2596268951892853, + "loss_ib": 0.00738617405295372, + "step": 901 + }, + { + "ce_ib": 6.979801177978516, + "ce_orig": 0.49811649322509766, + "epoch": 0.259112804658854, + "kl_loss": 0.3590124845504761, + "loss_ib": 0.010569925419986248, + "step": 901 + }, + { + "ce_ib": 5.86633825302124, + "ce_orig": 0.7878961563110352, + "epoch": 0.2594003882378316, + "kl_loss": 0.2769380211830139, + "loss_ib": 0.008635718375444412, + "step": 902 + }, + { + "ce_ib": 7.545517921447754, + "ce_orig": 0.9500758051872253, + "epoch": 0.2594003882378316, + "kl_loss": 0.21764595806598663, + "loss_ib": 0.009721977636218071, + "step": 902 + }, + { + "ce_ib": 6.535619258880615, + "ce_orig": 0.8012053966522217, + "epoch": 0.2594003882378316, + "kl_loss": 0.29037898778915405, + "loss_ib": 0.009439408779144287, + "step": 902 + }, + { + "ce_ib": 9.716675758361816, + "ce_orig": 0.9397892951965332, + "epoch": 0.2594003882378316, + "kl_loss": 0.3330523669719696, + "loss_ib": 0.01304719876497984, + "step": 902 + }, + { + "ce_ib": 4.242072105407715, + "ce_orig": 0.5148694515228271, + "epoch": 0.25968797181680925, + "kl_loss": 0.2478223443031311, + "loss_ib": 0.006720295175909996, + "step": 903 + }, + { + "ce_ib": 6.857370376586914, + "ce_orig": 0.48739153146743774, + "epoch": 0.25968797181680925, + "kl_loss": 0.3677447736263275, + "loss_ib": 0.010534818284213543, + "step": 903 + }, + { + "ce_ib": 5.933156967163086, + "ce_orig": 0.40238049626350403, + "epoch": 0.25968797181680925, + "kl_loss": 0.25679805874824524, + "loss_ib": 0.008501137606799603, + "step": 903 + }, + { + "ce_ib": 8.204896926879883, + "ce_orig": 0.7569003105163574, + "epoch": 0.25968797181680925, + "kl_loss": 0.2540472745895386, + "loss_ib": 0.010745369829237461, + "step": 903 + }, + { + "ce_ib": 6.275259494781494, + "ce_orig": 0.70445716381073, + "epoch": 0.25997555539578693, + "kl_loss": 0.3568292260169983, + "loss_ib": 0.009843551553785801, + "step": 904 + }, + { + "ce_ib": 9.74506664276123, + "ce_orig": 1.258253812789917, + "epoch": 0.25997555539578693, + "kl_loss": 0.2770775854587555, + "loss_ib": 0.012515842914581299, + "step": 904 + }, + { + "ce_ib": 10.810585021972656, + "ce_orig": 0.8011317849159241, + "epoch": 0.25997555539578693, + "kl_loss": 0.3040934205055237, + "loss_ib": 0.013851518742740154, + "step": 904 + }, + { + "ce_ib": 7.582912445068359, + "ce_orig": 0.9397713541984558, + "epoch": 0.25997555539578693, + "kl_loss": 0.22432610392570496, + "loss_ib": 0.00982617400586605, + "step": 904 + }, + { + "epoch": 0.26026313897476455, + "grad_norm": 0.09053654223680496, + "learning_rate": 9.917776388648748e-06, + "loss": 0.82, + "step": 905 + }, + { + "ce_ib": 10.179444313049316, + "ce_orig": 0.9755759835243225, + "epoch": 0.26026313897476455, + "kl_loss": 0.33106786012649536, + "loss_ib": 0.013490123674273491, + "step": 905 + }, + { + "ce_ib": 3.325618267059326, + "ce_orig": 0.16170339286327362, + "epoch": 0.26026313897476455, + "kl_loss": 0.5996187925338745, + "loss_ib": 0.009321806021034718, + "step": 905 + }, + { + "ce_ib": 7.778548240661621, + "ce_orig": 0.8278656005859375, + "epoch": 0.26026313897476455, + "kl_loss": 0.2725660502910614, + "loss_ib": 0.010504208505153656, + "step": 905 + }, + { + "ce_ib": 8.379356384277344, + "ce_orig": 0.9261234402656555, + "epoch": 0.26026313897476455, + "kl_loss": 0.378944993019104, + "loss_ib": 0.012168805114924908, + "step": 905 + }, + { + "ce_ib": 9.240448951721191, + "ce_orig": 0.6733576655387878, + "epoch": 0.2605507225537422, + "kl_loss": 0.4032820165157318, + "loss_ib": 0.013273268938064575, + "step": 906 + }, + { + "ce_ib": 12.491889953613281, + "ce_orig": 1.8018544912338257, + "epoch": 0.2605507225537422, + "kl_loss": 0.34695449471473694, + "loss_ib": 0.015961434692144394, + "step": 906 + }, + { + "ce_ib": 8.959487915039062, + "ce_orig": 1.4174104928970337, + "epoch": 0.2605507225537422, + "kl_loss": 0.30827200412750244, + "loss_ib": 0.012042207643389702, + "step": 906 + }, + { + "ce_ib": 7.630092620849609, + "ce_orig": 0.2582243084907532, + "epoch": 0.2605507225537422, + "kl_loss": 0.4620741903781891, + "loss_ib": 0.01225083414465189, + "step": 906 + }, + { + "ce_ib": 9.350717544555664, + "ce_orig": 0.8410260081291199, + "epoch": 0.2608383061327198, + "kl_loss": 0.27230745553970337, + "loss_ib": 0.012073791585862637, + "step": 907 + }, + { + "ce_ib": 5.942009925842285, + "ce_orig": 0.631314754486084, + "epoch": 0.2608383061327198, + "kl_loss": 0.23767834901809692, + "loss_ib": 0.008318793959915638, + "step": 907 + }, + { + "ce_ib": 7.381026744842529, + "ce_orig": 0.8023630380630493, + "epoch": 0.2608383061327198, + "kl_loss": 0.2556014060974121, + "loss_ib": 0.009937040507793427, + "step": 907 + }, + { + "ce_ib": 10.775535583496094, + "ce_orig": 1.5145833492279053, + "epoch": 0.2608383061327198, + "kl_loss": 0.3266568183898926, + "loss_ib": 0.014042104594409466, + "step": 907 + }, + { + "ce_ib": 6.754226207733154, + "ce_orig": 0.5556401014328003, + "epoch": 0.2611258897116975, + "kl_loss": 0.37610459327697754, + "loss_ib": 0.010515272617340088, + "step": 908 + }, + { + "ce_ib": 4.648565292358398, + "ce_orig": 0.42741912603378296, + "epoch": 0.2611258897116975, + "kl_loss": 0.2677002549171448, + "loss_ib": 0.007325568236410618, + "step": 908 + }, + { + "ce_ib": 9.743642807006836, + "ce_orig": 1.2022920846939087, + "epoch": 0.2611258897116975, + "kl_loss": 0.31233084201812744, + "loss_ib": 0.01286695059388876, + "step": 908 + }, + { + "ce_ib": 8.952857971191406, + "ce_orig": 1.0449740886688232, + "epoch": 0.2611258897116975, + "kl_loss": 0.290395051240921, + "loss_ib": 0.011856808327138424, + "step": 908 + }, + { + "ce_ib": 7.314600467681885, + "ce_orig": 0.487211138010025, + "epoch": 0.2614134732906751, + "kl_loss": 0.25061601400375366, + "loss_ib": 0.009820760227739811, + "step": 909 + }, + { + "ce_ib": 8.830986976623535, + "ce_orig": 0.7534437775611877, + "epoch": 0.2614134732906751, + "kl_loss": 0.276096910238266, + "loss_ib": 0.01159195601940155, + "step": 909 + }, + { + "ce_ib": 5.072354793548584, + "ce_orig": 0.7483262419700623, + "epoch": 0.2614134732906751, + "kl_loss": 0.2831575870513916, + "loss_ib": 0.007903930731117725, + "step": 909 + }, + { + "ce_ib": 6.34334659576416, + "ce_orig": 0.792799711227417, + "epoch": 0.2614134732906751, + "kl_loss": 0.27525418996810913, + "loss_ib": 0.009095888584852219, + "step": 909 + }, + { + "epoch": 0.2617010568696527, + "grad_norm": 0.09709301590919495, + "learning_rate": 9.916368798354356e-06, + "loss": 0.8731, + "step": 910 + }, + { + "ce_ib": 4.5776896476745605, + "ce_orig": 0.5359620451927185, + "epoch": 0.2617010568696527, + "kl_loss": 0.2551354765892029, + "loss_ib": 0.007129044272005558, + "step": 910 + }, + { + "ce_ib": 5.752841949462891, + "ce_orig": 0.4892864227294922, + "epoch": 0.2617010568696527, + "kl_loss": 0.28199148178100586, + "loss_ib": 0.008572756312787533, + "step": 910 + }, + { + "ce_ib": 7.317269325256348, + "ce_orig": 0.8823491930961609, + "epoch": 0.2617010568696527, + "kl_loss": 0.2838546335697174, + "loss_ib": 0.0101558156311512, + "step": 910 + }, + { + "ce_ib": 4.611359119415283, + "ce_orig": 0.6804866790771484, + "epoch": 0.2617010568696527, + "kl_loss": 0.24438399076461792, + "loss_ib": 0.007055198773741722, + "step": 910 + }, + { + "ce_ib": 6.782260894775391, + "ce_orig": 0.7364330887794495, + "epoch": 0.2619886404486304, + "kl_loss": 0.31710243225097656, + "loss_ib": 0.009953285567462444, + "step": 911 + }, + { + "ce_ib": 8.65565013885498, + "ce_orig": 1.165532112121582, + "epoch": 0.2619886404486304, + "kl_loss": 0.27839866280555725, + "loss_ib": 0.01143963634967804, + "step": 911 + }, + { + "ce_ib": 10.186466217041016, + "ce_orig": 0.9923035502433777, + "epoch": 0.2619886404486304, + "kl_loss": 0.34808266162872314, + "loss_ib": 0.013667291961610317, + "step": 911 + }, + { + "ce_ib": 7.47867488861084, + "ce_orig": 0.6916998028755188, + "epoch": 0.2619886404486304, + "kl_loss": 0.3453470766544342, + "loss_ib": 0.01093214564025402, + "step": 911 + }, + { + "ce_ib": 13.006325721740723, + "ce_orig": 1.4633382558822632, + "epoch": 0.26227622402760803, + "kl_loss": 0.23132070899009705, + "loss_ib": 0.015319532714784145, + "step": 912 + }, + { + "ce_ib": 6.6978654861450195, + "ce_orig": 0.7193461060523987, + "epoch": 0.26227622402760803, + "kl_loss": 0.24380119144916534, + "loss_ib": 0.009135877713561058, + "step": 912 + }, + { + "ce_ib": 7.76169490814209, + "ce_orig": 0.75163733959198, + "epoch": 0.26227622402760803, + "kl_loss": 0.27823013067245483, + "loss_ib": 0.010543995536863804, + "step": 912 + }, + { + "ce_ib": 10.430732727050781, + "ce_orig": 0.5616188049316406, + "epoch": 0.26227622402760803, + "kl_loss": 0.2974989116191864, + "loss_ib": 0.013405721634626389, + "step": 912 + }, + { + "ce_ib": 7.609768390655518, + "ce_orig": 0.4176271855831146, + "epoch": 0.26256380760658565, + "kl_loss": 0.38976022601127625, + "loss_ib": 0.011507370509207249, + "step": 913 + }, + { + "ce_ib": 11.438220977783203, + "ce_orig": 1.6095136404037476, + "epoch": 0.26256380760658565, + "kl_loss": 0.24979974329471588, + "loss_ib": 0.013936217874288559, + "step": 913 + }, + { + "ce_ib": 11.603028297424316, + "ce_orig": 1.417939305305481, + "epoch": 0.26256380760658565, + "kl_loss": 0.2228064239025116, + "loss_ib": 0.013831092976033688, + "step": 913 + }, + { + "ce_ib": 8.409954071044922, + "ce_orig": 0.827298104763031, + "epoch": 0.26256380760658565, + "kl_loss": 0.3605518341064453, + "loss_ib": 0.012015472166240215, + "step": 913 + }, + { + "ce_ib": 4.817587852478027, + "ce_orig": 0.4817865192890167, + "epoch": 0.2628513911855633, + "kl_loss": 0.25332438945770264, + "loss_ib": 0.007350832223892212, + "step": 914 + }, + { + "ce_ib": 8.813254356384277, + "ce_orig": 0.6531316041946411, + "epoch": 0.2628513911855633, + "kl_loss": 0.42108702659606934, + "loss_ib": 0.01302412524819374, + "step": 914 + }, + { + "ce_ib": 11.274821281433105, + "ce_orig": 1.3683724403381348, + "epoch": 0.2628513911855633, + "kl_loss": 0.42044275999069214, + "loss_ib": 0.015479249879717827, + "step": 914 + }, + { + "ce_ib": 4.2823686599731445, + "ce_orig": 0.432784765958786, + "epoch": 0.2628513911855633, + "kl_loss": 0.5916892290115356, + "loss_ib": 0.010199260897934437, + "step": 914 + }, + { + "epoch": 0.26313897476454096, + "grad_norm": 0.104263536632061, + "learning_rate": 9.914949363386417e-06, + "loss": 0.9239, + "step": 915 + }, + { + "ce_ib": 10.224993705749512, + "ce_orig": 0.7435204982757568, + "epoch": 0.26313897476454096, + "kl_loss": 0.48760131001472473, + "loss_ib": 0.01510100718587637, + "step": 915 + }, + { + "ce_ib": 5.030897617340088, + "ce_orig": 0.5878293514251709, + "epoch": 0.26313897476454096, + "kl_loss": 0.2752516567707062, + "loss_ib": 0.007783413864672184, + "step": 915 + }, + { + "ce_ib": 9.2077054977417, + "ce_orig": 1.0866307020187378, + "epoch": 0.26313897476454096, + "kl_loss": 0.27611714601516724, + "loss_ib": 0.011968877166509628, + "step": 915 + }, + { + "ce_ib": 7.913527965545654, + "ce_orig": 0.6080297827720642, + "epoch": 0.26313897476454096, + "kl_loss": 0.36963915824890137, + "loss_ib": 0.011609918437898159, + "step": 915 + }, + { + "ce_ib": 4.733314037322998, + "ce_orig": 0.3992489278316498, + "epoch": 0.2634265583435186, + "kl_loss": 0.2293146848678589, + "loss_ib": 0.0070264614187181, + "step": 916 + }, + { + "ce_ib": 9.097807884216309, + "ce_orig": 0.9395797848701477, + "epoch": 0.2634265583435186, + "kl_loss": 0.2749103009700775, + "loss_ib": 0.01184691023081541, + "step": 916 + }, + { + "ce_ib": 5.224862098693848, + "ce_orig": 0.49672380089759827, + "epoch": 0.2634265583435186, + "kl_loss": 0.3457143306732178, + "loss_ib": 0.008682005107402802, + "step": 916 + }, + { + "ce_ib": 3.887174367904663, + "ce_orig": 0.37023693323135376, + "epoch": 0.2634265583435186, + "kl_loss": 0.24267533421516418, + "loss_ib": 0.006313927471637726, + "step": 916 + }, + { + "ce_ib": 6.222542762756348, + "ce_orig": 0.5573530197143555, + "epoch": 0.2637141419224962, + "kl_loss": 0.30178603529930115, + "loss_ib": 0.009240402840077877, + "step": 917 + }, + { + "ce_ib": 9.022146224975586, + "ce_orig": 1.252068042755127, + "epoch": 0.2637141419224962, + "kl_loss": 0.26920855045318604, + "loss_ib": 0.01171423215419054, + "step": 917 + }, + { + "ce_ib": 7.715620994567871, + "ce_orig": 0.8185163140296936, + "epoch": 0.2637141419224962, + "kl_loss": 0.4483085572719574, + "loss_ib": 0.012198706157505512, + "step": 917 + }, + { + "ce_ib": 6.617023944854736, + "ce_orig": 1.0761107206344604, + "epoch": 0.2637141419224962, + "kl_loss": 0.3058355748653412, + "loss_ib": 0.00967537984251976, + "step": 917 + }, + { + "ce_ib": 5.220884799957275, + "ce_orig": 0.7539888620376587, + "epoch": 0.2640017255014739, + "kl_loss": 0.26427191495895386, + "loss_ib": 0.0078636035323143, + "step": 918 + }, + { + "ce_ib": 7.620891571044922, + "ce_orig": 0.7659056782722473, + "epoch": 0.2640017255014739, + "kl_loss": 0.30664142966270447, + "loss_ib": 0.010687305592000484, + "step": 918 + }, + { + "ce_ib": 8.80104923248291, + "ce_orig": 1.0766273736953735, + "epoch": 0.2640017255014739, + "kl_loss": 0.370442271232605, + "loss_ib": 0.01250547170639038, + "step": 918 + }, + { + "ce_ib": 12.601346969604492, + "ce_orig": 0.6492716073989868, + "epoch": 0.2640017255014739, + "kl_loss": 0.2531167268753052, + "loss_ib": 0.015132513828575611, + "step": 918 + }, + { + "ce_ib": 7.953497409820557, + "ce_orig": 0.7863147854804993, + "epoch": 0.2642893090804515, + "kl_loss": 0.25480780005455017, + "loss_ib": 0.010501575656235218, + "step": 919 + }, + { + "ce_ib": 9.819870948791504, + "ce_orig": 0.8340607285499573, + "epoch": 0.2642893090804515, + "kl_loss": 0.3362913727760315, + "loss_ib": 0.013182785362005234, + "step": 919 + }, + { + "ce_ib": 7.872880935668945, + "ce_orig": 1.1914820671081543, + "epoch": 0.2642893090804515, + "kl_loss": 0.2527698874473572, + "loss_ib": 0.010400580242276192, + "step": 919 + }, + { + "ce_ib": 10.222329139709473, + "ce_orig": 1.3306633234024048, + "epoch": 0.2642893090804515, + "kl_loss": 0.2606678009033203, + "loss_ib": 0.012829006649553776, + "step": 919 + }, + { + "epoch": 0.26457689265942913, + "grad_norm": 0.11017937958240509, + "learning_rate": 9.913518087164678e-06, + "loss": 0.8505, + "step": 920 + }, + { + "ce_ib": 7.44199800491333, + "ce_orig": 0.5371276140213013, + "epoch": 0.26457689265942913, + "kl_loss": 0.42192068696022034, + "loss_ib": 0.011661205440759659, + "step": 920 + }, + { + "ce_ib": 9.151060104370117, + "ce_orig": 1.068630337715149, + "epoch": 0.26457689265942913, + "kl_loss": 0.31581708788871765, + "loss_ib": 0.01230922993272543, + "step": 920 + }, + { + "ce_ib": 9.204903602600098, + "ce_orig": 0.909243643283844, + "epoch": 0.26457689265942913, + "kl_loss": 0.31501439213752747, + "loss_ib": 0.012355047278106213, + "step": 920 + }, + { + "ce_ib": 9.801901817321777, + "ce_orig": 0.9215630888938904, + "epoch": 0.26457689265942913, + "kl_loss": 0.3821442127227783, + "loss_ib": 0.013623344711959362, + "step": 920 + }, + { + "ce_ib": 9.883593559265137, + "ce_orig": 1.3681849241256714, + "epoch": 0.2648644762384068, + "kl_loss": 0.4136194586753845, + "loss_ib": 0.01401978824287653, + "step": 921 + }, + { + "ce_ib": 2.2384650707244873, + "ce_orig": 0.09530481696128845, + "epoch": 0.2648644762384068, + "kl_loss": 0.5867444276809692, + "loss_ib": 0.008105909451842308, + "step": 921 + }, + { + "ce_ib": 10.180109977722168, + "ce_orig": 0.8450271487236023, + "epoch": 0.2648644762384068, + "kl_loss": 0.2972285747528076, + "loss_ib": 0.013152395375072956, + "step": 921 + }, + { + "ce_ib": 7.981590270996094, + "ce_orig": 0.49811968207359314, + "epoch": 0.2648644762384068, + "kl_loss": 0.41247886419296265, + "loss_ib": 0.012106378562748432, + "step": 921 + }, + { + "ce_ib": 11.12768840789795, + "ce_orig": 1.3055758476257324, + "epoch": 0.26515205981738443, + "kl_loss": 0.4190482497215271, + "loss_ib": 0.015318172052502632, + "step": 922 + }, + { + "ce_ib": 6.9174723625183105, + "ce_orig": 0.5914834141731262, + "epoch": 0.26515205981738443, + "kl_loss": 0.275257408618927, + "loss_ib": 0.009670046158134937, + "step": 922 + }, + { + "ce_ib": 5.110889911651611, + "ce_orig": 0.4252597987651825, + "epoch": 0.26515205981738443, + "kl_loss": 0.25920987129211426, + "loss_ib": 0.007702989038079977, + "step": 922 + }, + { + "ce_ib": 9.542389869689941, + "ce_orig": 1.1555229425430298, + "epoch": 0.26515205981738443, + "kl_loss": 0.27457690238952637, + "loss_ib": 0.012288158759474754, + "step": 922 + }, + { + "ce_ib": 5.449411392211914, + "ce_orig": 0.7608138918876648, + "epoch": 0.26543964339636206, + "kl_loss": 0.2544456422328949, + "loss_ib": 0.007993867620825768, + "step": 923 + }, + { + "ce_ib": 5.343168258666992, + "ce_orig": 0.675761342048645, + "epoch": 0.26543964339636206, + "kl_loss": 0.2828470468521118, + "loss_ib": 0.00817163847386837, + "step": 923 + }, + { + "ce_ib": 5.600818157196045, + "ce_orig": 0.23253723978996277, + "epoch": 0.26543964339636206, + "kl_loss": 0.532296359539032, + "loss_ib": 0.010923781432211399, + "step": 923 + }, + { + "ce_ib": 7.541820526123047, + "ce_orig": 0.9155359268188477, + "epoch": 0.26543964339636206, + "kl_loss": 0.27482742071151733, + "loss_ib": 0.010290094651281834, + "step": 923 + }, + { + "ce_ib": 7.052559852600098, + "ce_orig": 0.744045078754425, + "epoch": 0.2657272269753397, + "kl_loss": 0.2872994840145111, + "loss_ib": 0.009925554506480694, + "step": 924 + }, + { + "ce_ib": 13.817998886108398, + "ce_orig": 1.8272193670272827, + "epoch": 0.2657272269753397, + "kl_loss": 0.3623213469982147, + "loss_ib": 0.017441213130950928, + "step": 924 + }, + { + "ce_ib": 10.127370834350586, + "ce_orig": 1.1621817350387573, + "epoch": 0.2657272269753397, + "kl_loss": 0.30838003754615784, + "loss_ib": 0.013211171142756939, + "step": 924 + }, + { + "ce_ib": 7.752861499786377, + "ce_orig": 0.7536821961402893, + "epoch": 0.2657272269753397, + "kl_loss": 0.3602597415447235, + "loss_ib": 0.011355457827448845, + "step": 924 + }, + { + "epoch": 0.26601481055431736, + "grad_norm": 0.11023896187543869, + "learning_rate": 9.912074973137413e-06, + "loss": 0.9011, + "step": 925 + }, + { + "ce_ib": 6.1034836769104, + "ce_orig": 0.6622889041900635, + "epoch": 0.26601481055431736, + "kl_loss": 0.2770423889160156, + "loss_ib": 0.00887390784919262, + "step": 925 + }, + { + "ce_ib": 8.66541576385498, + "ce_orig": 0.6636568903923035, + "epoch": 0.26601481055431736, + "kl_loss": 0.35214751958847046, + "loss_ib": 0.012186890468001366, + "step": 925 + }, + { + "ce_ib": 8.501686096191406, + "ce_orig": 1.0294500589370728, + "epoch": 0.26601481055431736, + "kl_loss": 0.32191595435142517, + "loss_ib": 0.011720845475792885, + "step": 925 + }, + { + "ce_ib": 8.163397789001465, + "ce_orig": 0.8285762071609497, + "epoch": 0.26601481055431736, + "kl_loss": 0.29768481850624084, + "loss_ib": 0.011140245944261551, + "step": 925 + }, + { + "ce_ib": 6.980233669281006, + "ce_orig": 0.7328478693962097, + "epoch": 0.266302394133295, + "kl_loss": 0.24676677584648132, + "loss_ib": 0.009447900578379631, + "step": 926 + }, + { + "ce_ib": 7.14872407913208, + "ce_orig": 0.8223768472671509, + "epoch": 0.266302394133295, + "kl_loss": 0.30274513363838196, + "loss_ib": 0.010176175273954868, + "step": 926 + }, + { + "ce_ib": 7.805938720703125, + "ce_orig": 0.9373571276664734, + "epoch": 0.266302394133295, + "kl_loss": 0.25375306606292725, + "loss_ib": 0.010343468748033047, + "step": 926 + }, + { + "ce_ib": 4.937993049621582, + "ce_orig": 0.6246491074562073, + "epoch": 0.266302394133295, + "kl_loss": 0.25294995307922363, + "loss_ib": 0.007467492483556271, + "step": 926 + }, + { + "ce_ib": 5.480818271636963, + "ce_orig": 0.5673008561134338, + "epoch": 0.2665899777122726, + "kl_loss": 0.2302064150571823, + "loss_ib": 0.0077828820794820786, + "step": 927 + }, + { + "ce_ib": 6.405802249908447, + "ce_orig": 0.5960977077484131, + "epoch": 0.2665899777122726, + "kl_loss": 0.2936103045940399, + "loss_ib": 0.009341904893517494, + "step": 927 + }, + { + "ce_ib": 11.765841484069824, + "ce_orig": 1.8276900053024292, + "epoch": 0.2665899777122726, + "kl_loss": 0.22832275927066803, + "loss_ib": 0.014049068093299866, + "step": 927 + }, + { + "ce_ib": 10.557626724243164, + "ce_orig": 1.3219131231307983, + "epoch": 0.2665899777122726, + "kl_loss": 0.31494155526161194, + "loss_ib": 0.01370704174041748, + "step": 927 + }, + { + "ce_ib": 7.791426181793213, + "ce_orig": 1.044633388519287, + "epoch": 0.2668775612912503, + "kl_loss": 0.22664925456047058, + "loss_ib": 0.010057918727397919, + "step": 928 + }, + { + "ce_ib": 5.639753341674805, + "ce_orig": 0.7303126454353333, + "epoch": 0.2668775612912503, + "kl_loss": 0.265280544757843, + "loss_ib": 0.00829255860298872, + "step": 928 + }, + { + "ce_ib": 9.726358413696289, + "ce_orig": 1.371625304222107, + "epoch": 0.2668775612912503, + "kl_loss": 0.2165038287639618, + "loss_ib": 0.011891396716237068, + "step": 928 + }, + { + "ce_ib": 6.140399932861328, + "ce_orig": 0.6564132571220398, + "epoch": 0.2668775612912503, + "kl_loss": 0.2733922004699707, + "loss_ib": 0.008874322287738323, + "step": 928 + }, + { + "ce_ib": 6.607884883880615, + "ce_orig": 0.5726844072341919, + "epoch": 0.2671651448702279, + "kl_loss": 0.36948060989379883, + "loss_ib": 0.010302690789103508, + "step": 929 + }, + { + "ce_ib": 8.565917015075684, + "ce_orig": 0.6998363733291626, + "epoch": 0.2671651448702279, + "kl_loss": 0.281715989112854, + "loss_ib": 0.011383076198399067, + "step": 929 + }, + { + "ce_ib": 5.31778621673584, + "ce_orig": 0.3831661343574524, + "epoch": 0.2671651448702279, + "kl_loss": 0.47300854325294495, + "loss_ib": 0.010047871619462967, + "step": 929 + }, + { + "ce_ib": 11.917003631591797, + "ce_orig": 1.680348515510559, + "epoch": 0.2671651448702279, + "kl_loss": 0.4127323031425476, + "loss_ib": 0.01604432612657547, + "step": 929 + }, + { + "epoch": 0.26745272844920553, + "grad_norm": 0.10447093099355698, + "learning_rate": 9.910620024781422e-06, + "loss": 0.9509, + "step": 930 + }, + { + "ce_ib": 11.539644241333008, + "ce_orig": 1.3957158327102661, + "epoch": 0.26745272844920553, + "kl_loss": 0.28435567021369934, + "loss_ib": 0.014383199624717236, + "step": 930 + }, + { + "ce_ib": 7.828366756439209, + "ce_orig": 1.2391057014465332, + "epoch": 0.26745272844920553, + "kl_loss": 0.2012348175048828, + "loss_ib": 0.009840714745223522, + "step": 930 + }, + { + "ce_ib": 8.055999755859375, + "ce_orig": 0.834807276725769, + "epoch": 0.26745272844920553, + "kl_loss": 0.3234785199165344, + "loss_ib": 0.011290784925222397, + "step": 930 + }, + { + "ce_ib": 2.334864854812622, + "ce_orig": 0.1836487501859665, + "epoch": 0.26745272844920553, + "kl_loss": 0.6265314817428589, + "loss_ib": 0.008600179105997086, + "step": 930 + }, + { + "ce_ib": 6.897594928741455, + "ce_orig": 0.6649829745292664, + "epoch": 0.2677403120281832, + "kl_loss": 0.2482946813106537, + "loss_ib": 0.009380541741847992, + "step": 931 + }, + { + "ce_ib": 2.828857660293579, + "ce_orig": 0.37604042887687683, + "epoch": 0.2677403120281832, + "kl_loss": 0.566237211227417, + "loss_ib": 0.008491230197250843, + "step": 931 + }, + { + "ce_ib": 10.757418632507324, + "ce_orig": 1.2388101816177368, + "epoch": 0.2677403120281832, + "kl_loss": 0.3185473084449768, + "loss_ib": 0.013942892663180828, + "step": 931 + }, + { + "ce_ib": 5.737816333770752, + "ce_orig": 0.8773601651191711, + "epoch": 0.2677403120281832, + "kl_loss": 0.28802287578582764, + "loss_ib": 0.008618045598268509, + "step": 931 + }, + { + "ce_ib": 5.9482855796813965, + "ce_orig": 0.7482618689537048, + "epoch": 0.26802789560716084, + "kl_loss": 0.36070847511291504, + "loss_ib": 0.00955536961555481, + "step": 932 + }, + { + "ce_ib": 15.711875915527344, + "ce_orig": 2.0872745513916016, + "epoch": 0.26802789560716084, + "kl_loss": 0.3053112328052521, + "loss_ib": 0.018764987587928772, + "step": 932 + }, + { + "ce_ib": 7.429306507110596, + "ce_orig": 0.7280347347259521, + "epoch": 0.26802789560716084, + "kl_loss": 0.2755368649959564, + "loss_ib": 0.010184675455093384, + "step": 932 + }, + { + "ce_ib": 6.810687065124512, + "ce_orig": 0.5476087927818298, + "epoch": 0.26802789560716084, + "kl_loss": 0.254897803068161, + "loss_ib": 0.009359664283692837, + "step": 932 + }, + { + "ce_ib": 4.790681838989258, + "ce_orig": 0.6239033341407776, + "epoch": 0.26831547918613846, + "kl_loss": 0.2333354651927948, + "loss_ib": 0.007124036550521851, + "step": 933 + }, + { + "ce_ib": 10.430938720703125, + "ce_orig": 1.3715506792068481, + "epoch": 0.26831547918613846, + "kl_loss": 0.3643788695335388, + "loss_ib": 0.014074727892875671, + "step": 933 + }, + { + "ce_ib": 8.228853225708008, + "ce_orig": 0.7698665857315063, + "epoch": 0.26831547918613846, + "kl_loss": 0.25796443223953247, + "loss_ib": 0.010808497667312622, + "step": 933 + }, + { + "ce_ib": 10.43002986907959, + "ce_orig": 0.6817733645439148, + "epoch": 0.26831547918613846, + "kl_loss": 0.40837064385414124, + "loss_ib": 0.01451373565942049, + "step": 933 + }, + { + "ce_ib": 6.217407703399658, + "ce_orig": 0.5401474237442017, + "epoch": 0.2686030627651161, + "kl_loss": 0.244707390666008, + "loss_ib": 0.008664481341838837, + "step": 934 + }, + { + "ce_ib": 6.634978294372559, + "ce_orig": 0.7534041404724121, + "epoch": 0.2686030627651161, + "kl_loss": 0.2863770127296448, + "loss_ib": 0.009498748928308487, + "step": 934 + }, + { + "ce_ib": 6.577731132507324, + "ce_orig": 0.7866048812866211, + "epoch": 0.2686030627651161, + "kl_loss": 0.3120877146720886, + "loss_ib": 0.009698607958853245, + "step": 934 + }, + { + "ce_ib": 6.873836994171143, + "ce_orig": 0.6249902248382568, + "epoch": 0.2686030627651161, + "kl_loss": 0.2403496652841568, + "loss_ib": 0.00927733350545168, + "step": 934 + }, + { + "epoch": 0.26889064634409376, + "grad_norm": 0.09980176389217377, + "learning_rate": 9.909153245602012e-06, + "loss": 0.8424, + "step": 935 + }, + { + "ce_ib": 8.975276947021484, + "ce_orig": 1.0671329498291016, + "epoch": 0.26889064634409376, + "kl_loss": 0.31780779361724854, + "loss_ib": 0.012153354473412037, + "step": 935 + }, + { + "ce_ib": 7.080759048461914, + "ce_orig": 0.6667758226394653, + "epoch": 0.26889064634409376, + "kl_loss": 0.4114856719970703, + "loss_ib": 0.01119561679661274, + "step": 935 + }, + { + "ce_ib": 8.725547790527344, + "ce_orig": 0.8014039993286133, + "epoch": 0.26889064634409376, + "kl_loss": 0.23206469416618347, + "loss_ib": 0.011046194471418858, + "step": 935 + }, + { + "ce_ib": 6.310739040374756, + "ce_orig": 0.6323633193969727, + "epoch": 0.26889064634409376, + "kl_loss": 0.3221661448478699, + "loss_ib": 0.009532400406897068, + "step": 935 + }, + { + "ce_ib": 6.315284729003906, + "ce_orig": 0.5775780081748962, + "epoch": 0.2691782299230714, + "kl_loss": 0.29390376806259155, + "loss_ib": 0.00925432238727808, + "step": 936 + }, + { + "ce_ib": 4.208686828613281, + "ce_orig": 0.4354095160961151, + "epoch": 0.2691782299230714, + "kl_loss": 0.297516405582428, + "loss_ib": 0.00718385074287653, + "step": 936 + }, + { + "ce_ib": 8.687440872192383, + "ce_orig": 0.6239301562309265, + "epoch": 0.2691782299230714, + "kl_loss": 0.30756255984306335, + "loss_ib": 0.011763066053390503, + "step": 936 + }, + { + "ce_ib": 8.584075927734375, + "ce_orig": 1.136924386024475, + "epoch": 0.2691782299230714, + "kl_loss": 0.2599954903125763, + "loss_ib": 0.011184030212461948, + "step": 936 + }, + { + "ce_ib": 5.511157035827637, + "ce_orig": 0.6932129859924316, + "epoch": 0.269465813502049, + "kl_loss": 0.27716803550720215, + "loss_ib": 0.008282837457954884, + "step": 937 + }, + { + "ce_ib": 9.595725059509277, + "ce_orig": 0.9539211988449097, + "epoch": 0.269465813502049, + "kl_loss": 0.24416208267211914, + "loss_ib": 0.012037346139550209, + "step": 937 + }, + { + "ce_ib": 6.6203083992004395, + "ce_orig": 0.9232450127601624, + "epoch": 0.269465813502049, + "kl_loss": 0.20647379755973816, + "loss_ib": 0.008685045875608921, + "step": 937 + }, + { + "ce_ib": 7.428555488586426, + "ce_orig": 0.48220083117485046, + "epoch": 0.269465813502049, + "kl_loss": 0.35726824402809143, + "loss_ib": 0.01100123766809702, + "step": 937 + }, + { + "ce_ib": 8.365070343017578, + "ce_orig": 0.43749791383743286, + "epoch": 0.2697533970810267, + "kl_loss": 0.30819666385650635, + "loss_ib": 0.01144703570753336, + "step": 938 + }, + { + "ce_ib": 8.154250144958496, + "ce_orig": 0.9003611207008362, + "epoch": 0.2697533970810267, + "kl_loss": 0.4072403311729431, + "loss_ib": 0.012226653285324574, + "step": 938 + }, + { + "ce_ib": 6.375957012176514, + "ce_orig": 0.6323530673980713, + "epoch": 0.2697533970810267, + "kl_loss": 0.32399243116378784, + "loss_ib": 0.009615881368517876, + "step": 938 + }, + { + "ce_ib": 7.025842666625977, + "ce_orig": 0.43968331813812256, + "epoch": 0.2697533970810267, + "kl_loss": 0.312505841255188, + "loss_ib": 0.010150901041924953, + "step": 938 + }, + { + "ce_ib": 7.08709192276001, + "ce_orig": 0.5549724698066711, + "epoch": 0.2700409806600043, + "kl_loss": 0.4219147861003876, + "loss_ib": 0.011306239292025566, + "step": 939 + }, + { + "ce_ib": 7.558558464050293, + "ce_orig": 1.0065475702285767, + "epoch": 0.2700409806600043, + "kl_loss": 0.2504430413246155, + "loss_ib": 0.010062988847494125, + "step": 939 + }, + { + "ce_ib": 6.018697738647461, + "ce_orig": 0.8349258899688721, + "epoch": 0.2700409806600043, + "kl_loss": 0.20029550790786743, + "loss_ib": 0.008021652698516846, + "step": 939 + }, + { + "ce_ib": 10.575093269348145, + "ce_orig": 1.4976614713668823, + "epoch": 0.2700409806600043, + "kl_loss": 0.3008938133716583, + "loss_ib": 0.013584030792117119, + "step": 939 + }, + { + "epoch": 0.27032856423898194, + "grad_norm": 0.10055164247751236, + "learning_rate": 9.907674639132995e-06, + "loss": 0.8408, + "step": 940 + }, + { + "ce_ib": 4.028119087219238, + "ce_orig": 0.4562585949897766, + "epoch": 0.27032856423898194, + "kl_loss": 0.2093207985162735, + "loss_ib": 0.006121327169239521, + "step": 940 + }, + { + "ce_ib": 6.0146613121032715, + "ce_orig": 0.6793175935745239, + "epoch": 0.27032856423898194, + "kl_loss": 0.21066246926784515, + "loss_ib": 0.00812128558754921, + "step": 940 + }, + { + "ce_ib": 8.57176399230957, + "ce_orig": 1.1877782344818115, + "epoch": 0.27032856423898194, + "kl_loss": 0.32324960827827454, + "loss_ib": 0.01180425938218832, + "step": 940 + }, + { + "ce_ib": 10.997118949890137, + "ce_orig": 1.800614595413208, + "epoch": 0.27032856423898194, + "kl_loss": 0.2794951796531677, + "loss_ib": 0.013792071491479874, + "step": 940 + }, + { + "ce_ib": 7.437707424163818, + "ce_orig": 1.0083261728286743, + "epoch": 0.2706161478179596, + "kl_loss": 0.21823345124721527, + "loss_ib": 0.009620042517781258, + "step": 941 + }, + { + "ce_ib": 6.081821441650391, + "ce_orig": 0.7146956324577332, + "epoch": 0.2706161478179596, + "kl_loss": 0.23236779868602753, + "loss_ib": 0.008405499160289764, + "step": 941 + }, + { + "ce_ib": 10.360185623168945, + "ce_orig": 1.2722724676132202, + "epoch": 0.2706161478179596, + "kl_loss": 0.4324212670326233, + "loss_ib": 0.014684397727251053, + "step": 941 + }, + { + "ce_ib": 8.173184394836426, + "ce_orig": 0.3789297640323639, + "epoch": 0.2706161478179596, + "kl_loss": 0.318384051322937, + "loss_ib": 0.011357024312019348, + "step": 941 + }, + { + "ce_ib": 7.6668782234191895, + "ce_orig": 0.6972788572311401, + "epoch": 0.27090373139693724, + "kl_loss": 0.40589210391044617, + "loss_ib": 0.01172579824924469, + "step": 942 + }, + { + "ce_ib": 8.189241409301758, + "ce_orig": 0.7313915491104126, + "epoch": 0.27090373139693724, + "kl_loss": 0.28188782930374146, + "loss_ib": 0.011008119210600853, + "step": 942 + }, + { + "ce_ib": 7.981906414031982, + "ce_orig": 0.9324852824211121, + "epoch": 0.27090373139693724, + "kl_loss": 0.3256570100784302, + "loss_ib": 0.011238477192819118, + "step": 942 + }, + { + "ce_ib": 8.018670082092285, + "ce_orig": 0.8148074150085449, + "epoch": 0.27090373139693724, + "kl_loss": 0.3132972717285156, + "loss_ib": 0.01115164253860712, + "step": 942 + }, + { + "ce_ib": 8.356046676635742, + "ce_orig": 0.5797178745269775, + "epoch": 0.27119131497591487, + "kl_loss": 0.4871473014354706, + "loss_ib": 0.013227519579231739, + "step": 943 + }, + { + "ce_ib": 10.591626167297363, + "ce_orig": 1.4526135921478271, + "epoch": 0.27119131497591487, + "kl_loss": 0.374215304851532, + "loss_ib": 0.01433377992361784, + "step": 943 + }, + { + "ce_ib": 6.816173553466797, + "ce_orig": 0.7827908396720886, + "epoch": 0.27119131497591487, + "kl_loss": 0.31245943903923035, + "loss_ib": 0.009940768592059612, + "step": 943 + }, + { + "ce_ib": 8.920738220214844, + "ce_orig": 1.1158427000045776, + "epoch": 0.27119131497591487, + "kl_loss": 0.3286566436290741, + "loss_ib": 0.012207304127514362, + "step": 943 + }, + { + "ce_ib": 9.12059211730957, + "ce_orig": 1.1127867698669434, + "epoch": 0.2714788985548925, + "kl_loss": 0.5925597548484802, + "loss_ib": 0.015046189539134502, + "step": 944 + }, + { + "ce_ib": 7.13171911239624, + "ce_orig": 0.7354855537414551, + "epoch": 0.2714788985548925, + "kl_loss": 0.3313751220703125, + "loss_ib": 0.010445470921695232, + "step": 944 + }, + { + "ce_ib": 5.612328052520752, + "ce_orig": 0.6953266859054565, + "epoch": 0.2714788985548925, + "kl_loss": 0.22593432664871216, + "loss_ib": 0.007871671579778194, + "step": 944 + }, + { + "ce_ib": 7.506075859069824, + "ce_orig": 0.7433436512947083, + "epoch": 0.2714788985548925, + "kl_loss": 0.3743892014026642, + "loss_ib": 0.011249967850744724, + "step": 944 + }, + { + "epoch": 0.27176648213387017, + "grad_norm": 0.10328007489442825, + "learning_rate": 9.906184208936675e-06, + "loss": 0.8559, + "step": 945 + }, + { + "ce_ib": 6.606997966766357, + "ce_orig": 0.9594669342041016, + "epoch": 0.27176648213387017, + "kl_loss": 0.3110538423061371, + "loss_ib": 0.00971753615885973, + "step": 945 + }, + { + "ce_ib": 7.5082621574401855, + "ce_orig": 0.7433197498321533, + "epoch": 0.27176648213387017, + "kl_loss": 0.3739091157913208, + "loss_ib": 0.011247353628277779, + "step": 945 + }, + { + "ce_ib": 6.4652605056762695, + "ce_orig": 0.44874706864356995, + "epoch": 0.27176648213387017, + "kl_loss": 0.3493693470954895, + "loss_ib": 0.009958953596651554, + "step": 945 + }, + { + "ce_ib": 11.719304084777832, + "ce_orig": 1.1819493770599365, + "epoch": 0.27176648213387017, + "kl_loss": 0.2937600314617157, + "loss_ib": 0.01465690415352583, + "step": 945 + }, + { + "ce_ib": 12.488480567932129, + "ce_orig": 1.4912511110305786, + "epoch": 0.2720540657128478, + "kl_loss": 0.3444375991821289, + "loss_ib": 0.015932856127619743, + "step": 946 + }, + { + "ce_ib": 6.640789985656738, + "ce_orig": 0.7050909399986267, + "epoch": 0.2720540657128478, + "kl_loss": 0.2555736005306244, + "loss_ib": 0.009196525439620018, + "step": 946 + }, + { + "ce_ib": 4.94317626953125, + "ce_orig": 0.5460999608039856, + "epoch": 0.2720540657128478, + "kl_loss": 0.28116828203201294, + "loss_ib": 0.007754858583211899, + "step": 946 + }, + { + "ce_ib": 12.355217933654785, + "ce_orig": 1.9291751384735107, + "epoch": 0.2720540657128478, + "kl_loss": 0.3321114778518677, + "loss_ib": 0.015676332637667656, + "step": 946 + }, + { + "ce_ib": 9.124369621276855, + "ce_orig": 1.180700659751892, + "epoch": 0.2723416492918254, + "kl_loss": 0.2571544945240021, + "loss_ib": 0.011695913970470428, + "step": 947 + }, + { + "ce_ib": 4.049078941345215, + "ce_orig": 0.46376824378967285, + "epoch": 0.2723416492918254, + "kl_loss": 0.5594756007194519, + "loss_ib": 0.00964383501559496, + "step": 947 + }, + { + "ce_ib": 6.723659515380859, + "ce_orig": 0.8255409002304077, + "epoch": 0.2723416492918254, + "kl_loss": 0.5856155157089233, + "loss_ib": 0.012579815462231636, + "step": 947 + }, + { + "ce_ib": 6.566412925720215, + "ce_orig": 0.3921998143196106, + "epoch": 0.2723416492918254, + "kl_loss": 0.8435590863227844, + "loss_ib": 0.015002003870904446, + "step": 947 + }, + { + "ce_ib": 7.8375725746154785, + "ce_orig": 1.2053213119506836, + "epoch": 0.2726292328708031, + "kl_loss": 0.3942751884460449, + "loss_ib": 0.011780323460698128, + "step": 948 + }, + { + "ce_ib": 6.784183025360107, + "ce_orig": 0.8397347331047058, + "epoch": 0.2726292328708031, + "kl_loss": 0.3014325499534607, + "loss_ib": 0.009798509068787098, + "step": 948 + }, + { + "ce_ib": 9.067359924316406, + "ce_orig": 0.6075240969657898, + "epoch": 0.2726292328708031, + "kl_loss": 0.3293374180793762, + "loss_ib": 0.01236073486506939, + "step": 948 + }, + { + "ce_ib": 8.427669525146484, + "ce_orig": 0.67606520652771, + "epoch": 0.2726292328708031, + "kl_loss": 0.411496102809906, + "loss_ib": 0.012542630545794964, + "step": 948 + }, + { + "ce_ib": 9.243931770324707, + "ce_orig": 0.8393007516860962, + "epoch": 0.2729168164497807, + "kl_loss": 0.31467002630233765, + "loss_ib": 0.012390632182359695, + "step": 949 + }, + { + "ce_ib": 10.456159591674805, + "ce_orig": 1.1939802169799805, + "epoch": 0.2729168164497807, + "kl_loss": 0.30369293689727783, + "loss_ib": 0.013493089005351067, + "step": 949 + }, + { + "ce_ib": 3.625389337539673, + "ce_orig": 0.4330550730228424, + "epoch": 0.2729168164497807, + "kl_loss": 0.21511869132518768, + "loss_ib": 0.005776576232165098, + "step": 949 + }, + { + "ce_ib": 9.729208946228027, + "ce_orig": 1.1287404298782349, + "epoch": 0.2729168164497807, + "kl_loss": 0.29766565561294556, + "loss_ib": 0.012705864384770393, + "step": 949 + }, + { + "epoch": 0.27320440002875834, + "grad_norm": 0.11360763758420944, + "learning_rate": 9.904681958603847e-06, + "loss": 0.8716, + "step": 950 + }, + { + "ce_ib": 5.15933084487915, + "ce_orig": 0.5913940668106079, + "epoch": 0.27320440002875834, + "kl_loss": 0.2843480110168457, + "loss_ib": 0.0080028111115098, + "step": 950 + }, + { + "ce_ib": 6.33256721496582, + "ce_orig": 0.7312302589416504, + "epoch": 0.27320440002875834, + "kl_loss": 0.3365132212638855, + "loss_ib": 0.00969769898802042, + "step": 950 + }, + { + "ce_ib": 7.236344814300537, + "ce_orig": 0.9631039500236511, + "epoch": 0.27320440002875834, + "kl_loss": 0.3992907404899597, + "loss_ib": 0.011229252442717552, + "step": 950 + }, + { + "ce_ib": 8.233247756958008, + "ce_orig": 0.9781149625778198, + "epoch": 0.27320440002875834, + "kl_loss": 0.32408010959625244, + "loss_ib": 0.011474048718810081, + "step": 950 + }, + { + "ce_ib": 9.425809860229492, + "ce_orig": 1.0185048580169678, + "epoch": 0.273491983607736, + "kl_loss": 0.3078628480434418, + "loss_ib": 0.012504437938332558, + "step": 951 + }, + { + "ce_ib": 5.389331340789795, + "ce_orig": 0.7841058969497681, + "epoch": 0.273491983607736, + "kl_loss": 0.28654882311820984, + "loss_ib": 0.008254819549620152, + "step": 951 + }, + { + "ce_ib": 9.956135749816895, + "ce_orig": 1.0741745233535767, + "epoch": 0.273491983607736, + "kl_loss": 0.2052108645439148, + "loss_ib": 0.0120082451030612, + "step": 951 + }, + { + "ce_ib": 10.591538429260254, + "ce_orig": 1.5130939483642578, + "epoch": 0.273491983607736, + "kl_loss": 0.3333120346069336, + "loss_ib": 0.013924659229815006, + "step": 951 + }, + { + "ce_ib": 11.254968643188477, + "ce_orig": 1.5675315856933594, + "epoch": 0.27377956718671365, + "kl_loss": 0.2911054790019989, + "loss_ib": 0.014166023582220078, + "step": 952 + }, + { + "ce_ib": 5.313507556915283, + "ce_orig": 0.3765413761138916, + "epoch": 0.27377956718671365, + "kl_loss": 0.23264440894126892, + "loss_ib": 0.007639951538294554, + "step": 952 + }, + { + "ce_ib": 8.539169311523438, + "ce_orig": 1.1101938486099243, + "epoch": 0.27377956718671365, + "kl_loss": 0.30267441272735596, + "loss_ib": 0.011565913446247578, + "step": 952 + }, + { + "ce_ib": 8.370697975158691, + "ce_orig": 1.101776123046875, + "epoch": 0.27377956718671365, + "kl_loss": 0.2343529760837555, + "loss_ib": 0.010714228264987469, + "step": 952 + }, + { + "ce_ib": 7.463455677032471, + "ce_orig": 0.7007122039794922, + "epoch": 0.27406715076569127, + "kl_loss": 0.2482740581035614, + "loss_ib": 0.009946195408701897, + "step": 953 + }, + { + "ce_ib": 9.414673805236816, + "ce_orig": 0.7169649004936218, + "epoch": 0.27406715076569127, + "kl_loss": 0.386259526014328, + "loss_ib": 0.013277268968522549, + "step": 953 + }, + { + "ce_ib": 6.160613059997559, + "ce_orig": 0.6735572814941406, + "epoch": 0.27406715076569127, + "kl_loss": 0.2884378135204315, + "loss_ib": 0.009044991806149483, + "step": 953 + }, + { + "ce_ib": 4.295875549316406, + "ce_orig": 0.3026116192340851, + "epoch": 0.27406715076569127, + "kl_loss": 0.3349815607070923, + "loss_ib": 0.007645691279321909, + "step": 953 + }, + { + "ce_ib": 11.889391899108887, + "ce_orig": 1.7062128782272339, + "epoch": 0.2743547343446689, + "kl_loss": 0.3287060856819153, + "loss_ib": 0.01517645362764597, + "step": 954 + }, + { + "ce_ib": 9.97256851196289, + "ce_orig": 0.8832455277442932, + "epoch": 0.2743547343446689, + "kl_loss": 0.3570169508457184, + "loss_ib": 0.013542737811803818, + "step": 954 + }, + { + "ce_ib": 11.863001823425293, + "ce_orig": 1.066734790802002, + "epoch": 0.2743547343446689, + "kl_loss": 0.3354340195655823, + "loss_ib": 0.015217342413961887, + "step": 954 + }, + { + "ce_ib": 6.305656909942627, + "ce_orig": 0.609102725982666, + "epoch": 0.2743547343446689, + "kl_loss": 0.33587515354156494, + "loss_ib": 0.009664407931268215, + "step": 954 + }, + { + "epoch": 0.2746423179236466, + "grad_norm": 0.09016523510217667, + "learning_rate": 9.903167891753781e-06, + "loss": 0.863, + "step": 955 + }, + { + "ce_ib": 7.7995100021362305, + "ce_orig": 0.48420944809913635, + "epoch": 0.2746423179236466, + "kl_loss": 0.4207814335823059, + "loss_ib": 0.012007324025034904, + "step": 955 + }, + { + "ce_ib": 9.223494529724121, + "ce_orig": 0.8398165702819824, + "epoch": 0.2746423179236466, + "kl_loss": 0.31191039085388184, + "loss_ib": 0.012342598289251328, + "step": 955 + }, + { + "ce_ib": 9.061304092407227, + "ce_orig": 0.5709256529808044, + "epoch": 0.2746423179236466, + "kl_loss": 0.35397371649742126, + "loss_ib": 0.012601041235029697, + "step": 955 + }, + { + "ce_ib": 6.83884859085083, + "ce_orig": 0.8219555020332336, + "epoch": 0.2746423179236466, + "kl_loss": 0.4126385450363159, + "loss_ib": 0.01096523366868496, + "step": 955 + }, + { + "ce_ib": 4.699288368225098, + "ce_orig": 0.44520479440689087, + "epoch": 0.2749299015026242, + "kl_loss": 0.325499027967453, + "loss_ib": 0.007954278029501438, + "step": 956 + }, + { + "ce_ib": 8.23465633392334, + "ce_orig": 0.8416075110435486, + "epoch": 0.2749299015026242, + "kl_loss": 0.21434545516967773, + "loss_ib": 0.010378110222518444, + "step": 956 + }, + { + "ce_ib": 6.925322532653809, + "ce_orig": 0.47291597723960876, + "epoch": 0.2749299015026242, + "kl_loss": 0.8208890557289124, + "loss_ib": 0.01513421256095171, + "step": 956 + }, + { + "ce_ib": 4.961340427398682, + "ce_orig": 0.7618390917778015, + "epoch": 0.2749299015026242, + "kl_loss": 0.2526277005672455, + "loss_ib": 0.0074876174330711365, + "step": 956 + }, + { + "ce_ib": 7.343484401702881, + "ce_orig": 0.8298138380050659, + "epoch": 0.2752174850816018, + "kl_loss": 0.36207157373428345, + "loss_ib": 0.010964199900627136, + "step": 957 + }, + { + "ce_ib": 7.905651569366455, + "ce_orig": 1.0571790933609009, + "epoch": 0.2752174850816018, + "kl_loss": 0.27009880542755127, + "loss_ib": 0.010606639087200165, + "step": 957 + }, + { + "ce_ib": 6.430732727050781, + "ce_orig": 0.7019039392471313, + "epoch": 0.2752174850816018, + "kl_loss": 0.3244302272796631, + "loss_ib": 0.009675034321844578, + "step": 957 + }, + { + "ce_ib": 9.795534133911133, + "ce_orig": 1.5741654634475708, + "epoch": 0.2752174850816018, + "kl_loss": 0.27315622568130493, + "loss_ib": 0.012527096085250378, + "step": 957 + }, + { + "ce_ib": 7.98039436340332, + "ce_orig": 0.5512452721595764, + "epoch": 0.2755050686605795, + "kl_loss": 0.3615424633026123, + "loss_ib": 0.011595819145441055, + "step": 958 + }, + { + "ce_ib": 6.763614654541016, + "ce_orig": 0.6786366105079651, + "epoch": 0.2755050686605795, + "kl_loss": 0.28236159682273865, + "loss_ib": 0.009587230160832405, + "step": 958 + }, + { + "ce_ib": 4.218715667724609, + "ce_orig": 0.6746366620063782, + "epoch": 0.2755050686605795, + "kl_loss": 0.3110579550266266, + "loss_ib": 0.007329294923692942, + "step": 958 + }, + { + "ce_ib": 11.259781837463379, + "ce_orig": 1.4071563482284546, + "epoch": 0.2755050686605795, + "kl_loss": 0.25216126441955566, + "loss_ib": 0.013781394809484482, + "step": 958 + }, + { + "ce_ib": 5.040888786315918, + "ce_orig": 0.8139171600341797, + "epoch": 0.2757926522395571, + "kl_loss": 0.1931689977645874, + "loss_ib": 0.0069725788198411465, + "step": 959 + }, + { + "ce_ib": 8.201353073120117, + "ce_orig": 0.8536245822906494, + "epoch": 0.2757926522395571, + "kl_loss": 0.2896878719329834, + "loss_ib": 0.011098232120275497, + "step": 959 + }, + { + "ce_ib": 5.141984939575195, + "ce_orig": 0.36424100399017334, + "epoch": 0.2757926522395571, + "kl_loss": 0.2921089828014374, + "loss_ib": 0.008063074201345444, + "step": 959 + }, + { + "ce_ib": 10.956453323364258, + "ce_orig": 1.2245234251022339, + "epoch": 0.2757926522395571, + "kl_loss": 0.4386565685272217, + "loss_ib": 0.015343018807470798, + "step": 959 + }, + { + "epoch": 0.27608023581853475, + "grad_norm": 0.1057107001543045, + "learning_rate": 9.901642012034214e-06, + "loss": 0.7911, + "step": 960 + }, + { + "ce_ib": 13.470467567443848, + "ce_orig": 1.7405394315719604, + "epoch": 0.27608023581853475, + "kl_loss": 0.328184574842453, + "loss_ib": 0.01675231382250786, + "step": 960 + }, + { + "ce_ib": 6.385258197784424, + "ce_orig": 0.726634681224823, + "epoch": 0.27608023581853475, + "kl_loss": 0.2683427333831787, + "loss_ib": 0.009068685583770275, + "step": 960 + }, + { + "ce_ib": 12.370391845703125, + "ce_orig": 1.2445263862609863, + "epoch": 0.27608023581853475, + "kl_loss": 0.29927101731300354, + "loss_ib": 0.015363101847469807, + "step": 960 + }, + { + "ce_ib": 8.821762084960938, + "ce_orig": 1.0471879243850708, + "epoch": 0.27608023581853475, + "kl_loss": 0.19080427289009094, + "loss_ib": 0.010729804635047913, + "step": 960 + }, + { + "ce_ib": 8.348326683044434, + "ce_orig": 1.2821494340896606, + "epoch": 0.2763678193975124, + "kl_loss": 0.4306579828262329, + "loss_ib": 0.012654906138777733, + "step": 961 + }, + { + "ce_ib": 9.511332511901855, + "ce_orig": 1.11271333694458, + "epoch": 0.2763678193975124, + "kl_loss": 0.39218825101852417, + "loss_ib": 0.013433215208351612, + "step": 961 + }, + { + "ce_ib": 10.88101577758789, + "ce_orig": 1.3116422891616821, + "epoch": 0.2763678193975124, + "kl_loss": 0.28412100672721863, + "loss_ib": 0.013722226023674011, + "step": 961 + }, + { + "ce_ib": 4.692732810974121, + "ce_orig": 0.5053314566612244, + "epoch": 0.2763678193975124, + "kl_loss": 0.20232996344566345, + "loss_ib": 0.006716032512485981, + "step": 961 + }, + { + "ce_ib": 4.9355149269104, + "ce_orig": 0.46557968854904175, + "epoch": 0.27665540297649005, + "kl_loss": 0.5611593723297119, + "loss_ib": 0.010547108016908169, + "step": 962 + }, + { + "ce_ib": 9.1857271194458, + "ce_orig": 0.8455274105072021, + "epoch": 0.27665540297649005, + "kl_loss": 0.2985427975654602, + "loss_ib": 0.012171154841780663, + "step": 962 + }, + { + "ce_ib": 10.592644691467285, + "ce_orig": 1.263620376586914, + "epoch": 0.27665540297649005, + "kl_loss": 0.4153948724269867, + "loss_ib": 0.014746594242751598, + "step": 962 + }, + { + "ce_ib": 6.028276443481445, + "ce_orig": 0.6360313892364502, + "epoch": 0.27665540297649005, + "kl_loss": 0.256117582321167, + "loss_ib": 0.008589452132582664, + "step": 962 + }, + { + "ce_ib": 7.142398357391357, + "ce_orig": 0.8674178123474121, + "epoch": 0.2769429865554677, + "kl_loss": 0.26119542121887207, + "loss_ib": 0.009754352271556854, + "step": 963 + }, + { + "ce_ib": 4.550671577453613, + "ce_orig": 0.4504320025444031, + "epoch": 0.2769429865554677, + "kl_loss": 0.27785179018974304, + "loss_ib": 0.00732918968424201, + "step": 963 + }, + { + "ce_ib": 10.917935371398926, + "ce_orig": 1.3695002794265747, + "epoch": 0.2769429865554677, + "kl_loss": 0.310863733291626, + "loss_ib": 0.014026571996510029, + "step": 963 + }, + { + "ce_ib": 5.352876663208008, + "ce_orig": 0.9681556224822998, + "epoch": 0.2769429865554677, + "kl_loss": 0.24146953225135803, + "loss_ib": 0.007767572067677975, + "step": 963 + }, + { + "ce_ib": 6.516645908355713, + "ce_orig": 0.6153663396835327, + "epoch": 0.2772305701344453, + "kl_loss": 0.2555677592754364, + "loss_ib": 0.009072324261069298, + "step": 964 + }, + { + "ce_ib": 5.9202094078063965, + "ce_orig": 0.562997043132782, + "epoch": 0.2772305701344453, + "kl_loss": 0.32697319984436035, + "loss_ib": 0.009189940989017487, + "step": 964 + }, + { + "ce_ib": 11.461648941040039, + "ce_orig": 1.4160990715026855, + "epoch": 0.2772305701344453, + "kl_loss": 0.20565414428710938, + "loss_ib": 0.013518190011382103, + "step": 964 + }, + { + "ce_ib": 4.950023174285889, + "ce_orig": 0.5089555978775024, + "epoch": 0.2772305701344453, + "kl_loss": 0.2313445806503296, + "loss_ib": 0.00726346904411912, + "step": 964 + }, + { + "epoch": 0.277518153713423, + "grad_norm": 0.10406413674354553, + "learning_rate": 9.900104323121344e-06, + "loss": 0.8932, + "step": 965 + }, + { + "ce_ib": 3.8523833751678467, + "ce_orig": 0.6625568866729736, + "epoch": 0.277518153713423, + "kl_loss": 0.206961527466774, + "loss_ib": 0.005921998526901007, + "step": 965 + }, + { + "ce_ib": 5.566319942474365, + "ce_orig": 0.4371108114719391, + "epoch": 0.277518153713423, + "kl_loss": 0.3670559823513031, + "loss_ib": 0.009236878715455532, + "step": 965 + }, + { + "ce_ib": 6.957183837890625, + "ce_orig": 0.5599235892295837, + "epoch": 0.277518153713423, + "kl_loss": 0.23749658465385437, + "loss_ib": 0.009332150220870972, + "step": 965 + }, + { + "ce_ib": 10.045352935791016, + "ce_orig": 1.2828036546707153, + "epoch": 0.277518153713423, + "kl_loss": 0.2769809663295746, + "loss_ib": 0.012815162539482117, + "step": 965 + }, + { + "ce_ib": 6.997009754180908, + "ce_orig": 0.8958306312561035, + "epoch": 0.2778057372924006, + "kl_loss": 0.26039648056030273, + "loss_ib": 0.00960097461938858, + "step": 966 + }, + { + "ce_ib": 9.824999809265137, + "ce_orig": 1.1781514883041382, + "epoch": 0.2778057372924006, + "kl_loss": 0.32364603877067566, + "loss_ib": 0.013061460107564926, + "step": 966 + }, + { + "ce_ib": 10.24802017211914, + "ce_orig": 1.2914663553237915, + "epoch": 0.2778057372924006, + "kl_loss": 0.3219491243362427, + "loss_ib": 0.013467512093484402, + "step": 966 + }, + { + "ce_ib": 5.80756950378418, + "ce_orig": 0.7262393236160278, + "epoch": 0.2778057372924006, + "kl_loss": 0.25441619753837585, + "loss_ib": 0.008351731114089489, + "step": 966 + }, + { + "ce_ib": 10.359816551208496, + "ce_orig": 1.53457510471344, + "epoch": 0.2780933208713782, + "kl_loss": 0.28124716877937317, + "loss_ib": 0.013172287493944168, + "step": 967 + }, + { + "ce_ib": 3.750755548477173, + "ce_orig": 0.25399720668792725, + "epoch": 0.2780933208713782, + "kl_loss": 0.5752956867218018, + "loss_ib": 0.009503712877631187, + "step": 967 + }, + { + "ce_ib": 8.690093994140625, + "ce_orig": 0.9079038500785828, + "epoch": 0.2780933208713782, + "kl_loss": 0.4378716051578522, + "loss_ib": 0.013068810105323792, + "step": 967 + }, + { + "ce_ib": 5.155570030212402, + "ce_orig": 0.4399418234825134, + "epoch": 0.2780933208713782, + "kl_loss": 0.19257600605487823, + "loss_ib": 0.007081329822540283, + "step": 967 + }, + { + "ce_ib": 7.317140102386475, + "ce_orig": 0.9123186469078064, + "epoch": 0.2783809044503559, + "kl_loss": 0.2773372530937195, + "loss_ib": 0.010090513154864311, + "step": 968 + }, + { + "ce_ib": 9.807036399841309, + "ce_orig": 1.104854702949524, + "epoch": 0.2783809044503559, + "kl_loss": 0.31141215562820435, + "loss_ib": 0.012921158224344254, + "step": 968 + }, + { + "ce_ib": 2.5050039291381836, + "ce_orig": 0.18219490349292755, + "epoch": 0.2783809044503559, + "kl_loss": 0.5909286141395569, + "loss_ib": 0.00841428991407156, + "step": 968 + }, + { + "ce_ib": 5.669373512268066, + "ce_orig": 0.5021364092826843, + "epoch": 0.2783809044503559, + "kl_loss": 0.3175276219844818, + "loss_ib": 0.0088446494191885, + "step": 968 + }, + { + "ce_ib": 6.658907890319824, + "ce_orig": 0.8224705457687378, + "epoch": 0.2786684880293335, + "kl_loss": 0.3462896943092346, + "loss_ib": 0.010121805593371391, + "step": 969 + }, + { + "ce_ib": 7.009324073791504, + "ce_orig": 1.261172890663147, + "epoch": 0.2786684880293335, + "kl_loss": 0.2837193012237549, + "loss_ib": 0.009846516884863377, + "step": 969 + }, + { + "ce_ib": 5.1103196144104, + "ce_orig": 0.6225570440292358, + "epoch": 0.2786684880293335, + "kl_loss": 0.25508421659469604, + "loss_ib": 0.007661161944270134, + "step": 969 + }, + { + "ce_ib": 7.586276531219482, + "ce_orig": 0.6994422078132629, + "epoch": 0.2786684880293335, + "kl_loss": 0.3505334258079529, + "loss_ib": 0.011091611348092556, + "step": 969 + }, + { + "epoch": 0.27895607160831115, + "grad_norm": 0.10747111588716507, + "learning_rate": 9.89855482871982e-06, + "loss": 0.8618, + "step": 970 + }, + { + "ce_ib": 8.283812522888184, + "ce_orig": 1.250517725944519, + "epoch": 0.27895607160831115, + "kl_loss": 0.2938983738422394, + "loss_ib": 0.011222796514630318, + "step": 970 + }, + { + "ce_ib": 7.375507354736328, + "ce_orig": 1.0513449907302856, + "epoch": 0.27895607160831115, + "kl_loss": 0.260643869638443, + "loss_ib": 0.009981945157051086, + "step": 970 + }, + { + "ce_ib": 8.069171905517578, + "ce_orig": 0.7485910654067993, + "epoch": 0.27895607160831115, + "kl_loss": 0.39277946949005127, + "loss_ib": 0.011996966786682606, + "step": 970 + }, + { + "ce_ib": 9.535097122192383, + "ce_orig": 0.617765486240387, + "epoch": 0.27895607160831115, + "kl_loss": 0.24510113894939423, + "loss_ib": 0.011986108496785164, + "step": 970 + }, + { + "ce_ib": 7.473255634307861, + "ce_orig": 0.7423644661903381, + "epoch": 0.27924365518728883, + "kl_loss": 0.27745312452316284, + "loss_ib": 0.010247787460684776, + "step": 971 + }, + { + "ce_ib": 8.25799560546875, + "ce_orig": 0.686055600643158, + "epoch": 0.27924365518728883, + "kl_loss": 0.21401247382164001, + "loss_ib": 0.010398120619356632, + "step": 971 + }, + { + "ce_ib": 7.072482109069824, + "ce_orig": 0.48852288722991943, + "epoch": 0.27924365518728883, + "kl_loss": 0.25071245431900024, + "loss_ib": 0.009579606354236603, + "step": 971 + }, + { + "ce_ib": 6.167069911956787, + "ce_orig": 0.6813110113143921, + "epoch": 0.27924365518728883, + "kl_loss": 0.25662925839424133, + "loss_ib": 0.008733362890779972, + "step": 971 + }, + { + "ce_ib": 10.110807418823242, + "ce_orig": 1.0463491678237915, + "epoch": 0.27953123876626645, + "kl_loss": 0.3023725748062134, + "loss_ib": 0.013134532608091831, + "step": 972 + }, + { + "ce_ib": 8.601117134094238, + "ce_orig": 1.0095446109771729, + "epoch": 0.27953123876626645, + "kl_loss": 0.2603193521499634, + "loss_ib": 0.011204310692846775, + "step": 972 + }, + { + "ce_ib": 8.582576751708984, + "ce_orig": 1.1173807382583618, + "epoch": 0.27953123876626645, + "kl_loss": 0.45898622274398804, + "loss_ib": 0.01317243929952383, + "step": 972 + }, + { + "ce_ib": 8.407683372497559, + "ce_orig": 0.682980477809906, + "epoch": 0.27953123876626645, + "kl_loss": 0.23422002792358398, + "loss_ib": 0.010749883018434048, + "step": 972 + }, + { + "ce_ib": 7.096373081207275, + "ce_orig": 0.8979065418243408, + "epoch": 0.2798188223452441, + "kl_loss": 0.36893928050994873, + "loss_ib": 0.010785765014588833, + "step": 973 + }, + { + "ce_ib": 6.754623889923096, + "ce_orig": 0.6018571853637695, + "epoch": 0.2798188223452441, + "kl_loss": 0.20240068435668945, + "loss_ib": 0.008778630755841732, + "step": 973 + }, + { + "ce_ib": 6.34320592880249, + "ce_orig": 0.5469645857810974, + "epoch": 0.2798188223452441, + "kl_loss": 0.3521893322467804, + "loss_ib": 0.009865098632872105, + "step": 973 + }, + { + "ce_ib": 8.217493057250977, + "ce_orig": 1.0419894456863403, + "epoch": 0.2798188223452441, + "kl_loss": 0.36468231678009033, + "loss_ib": 0.011864316649734974, + "step": 973 + }, + { + "ce_ib": 6.634602069854736, + "ce_orig": 0.6549820303916931, + "epoch": 0.2801064059242217, + "kl_loss": 0.3038894534111023, + "loss_ib": 0.009673496708273888, + "step": 974 + }, + { + "ce_ib": 9.341080665588379, + "ce_orig": 0.607479989528656, + "epoch": 0.2801064059242217, + "kl_loss": 0.26178374886512756, + "loss_ib": 0.011958918534219265, + "step": 974 + }, + { + "ce_ib": 9.761402130126953, + "ce_orig": 0.7530984878540039, + "epoch": 0.2801064059242217, + "kl_loss": 0.3960914611816406, + "loss_ib": 0.013722317293286324, + "step": 974 + }, + { + "ce_ib": 7.921651840209961, + "ce_orig": 0.7006543874740601, + "epoch": 0.2801064059242217, + "kl_loss": 0.2808942198753357, + "loss_ib": 0.010730594396591187, + "step": 974 + }, + { + "epoch": 0.2803939895031994, + "grad_norm": 0.09944093227386475, + "learning_rate": 9.896993532562736e-06, + "loss": 0.8261, + "step": 975 + }, + { + "ce_ib": 7.843717575073242, + "ce_orig": 0.8895473480224609, + "epoch": 0.2803939895031994, + "kl_loss": 0.35273388028144836, + "loss_ib": 0.01137105654925108, + "step": 975 + }, + { + "ce_ib": 9.197118759155273, + "ce_orig": 0.8774324059486389, + "epoch": 0.2803939895031994, + "kl_loss": 0.3209964632987976, + "loss_ib": 0.012407083064317703, + "step": 975 + }, + { + "ce_ib": 4.643885612487793, + "ce_orig": 0.6738633513450623, + "epoch": 0.2803939895031994, + "kl_loss": 0.23056040704250336, + "loss_ib": 0.0069494894705712795, + "step": 975 + }, + { + "ce_ib": 7.597918510437012, + "ce_orig": 0.7519400119781494, + "epoch": 0.2803939895031994, + "kl_loss": 0.3231240212917328, + "loss_ib": 0.010829159058630466, + "step": 975 + }, + { + "ce_ib": 5.716797351837158, + "ce_orig": 0.4619486927986145, + "epoch": 0.280681573082177, + "kl_loss": 0.2213851362466812, + "loss_ib": 0.007930648513138294, + "step": 976 + }, + { + "ce_ib": 4.659027099609375, + "ce_orig": 0.46190229058265686, + "epoch": 0.280681573082177, + "kl_loss": 0.21290841698646545, + "loss_ib": 0.006788111291825771, + "step": 976 + }, + { + "ce_ib": 8.67602252960205, + "ce_orig": 0.7991732358932495, + "epoch": 0.280681573082177, + "kl_loss": 0.4040037989616394, + "loss_ib": 0.012716060504317284, + "step": 976 + }, + { + "ce_ib": 11.049056053161621, + "ce_orig": 1.187608242034912, + "epoch": 0.280681573082177, + "kl_loss": 0.22732923924922943, + "loss_ib": 0.013322348706424236, + "step": 976 + }, + { + "ce_ib": 6.996013641357422, + "ce_orig": 0.3616046607494354, + "epoch": 0.28096915666115463, + "kl_loss": 0.5100580453872681, + "loss_ib": 0.012096593156456947, + "step": 977 + }, + { + "ce_ib": 5.4247727394104, + "ce_orig": 0.7208405137062073, + "epoch": 0.28096915666115463, + "kl_loss": 0.24420976638793945, + "loss_ib": 0.007866870611906052, + "step": 977 + }, + { + "ce_ib": 5.2020392417907715, + "ce_orig": 0.6198222041130066, + "epoch": 0.28096915666115463, + "kl_loss": 0.3132549524307251, + "loss_ib": 0.008334589190781116, + "step": 977 + }, + { + "ce_ib": 8.632140159606934, + "ce_orig": 1.0381910800933838, + "epoch": 0.28096915666115463, + "kl_loss": 0.25286003947257996, + "loss_ib": 0.011160740628838539, + "step": 977 + }, + { + "ce_ib": 6.451074600219727, + "ce_orig": 0.6350893378257751, + "epoch": 0.2812567402401323, + "kl_loss": 0.2584133744239807, + "loss_ib": 0.009035208262503147, + "step": 978 + }, + { + "ce_ib": 9.116836547851562, + "ce_orig": 1.3513867855072021, + "epoch": 0.2812567402401323, + "kl_loss": 0.24926617741584778, + "loss_ib": 0.011609498411417007, + "step": 978 + }, + { + "ce_ib": 9.134062767028809, + "ce_orig": 1.004563570022583, + "epoch": 0.2812567402401323, + "kl_loss": 0.49993661046028137, + "loss_ib": 0.014133429154753685, + "step": 978 + }, + { + "ce_ib": 12.412116050720215, + "ce_orig": 1.79780113697052, + "epoch": 0.2812567402401323, + "kl_loss": 0.410219669342041, + "loss_ib": 0.016514312475919724, + "step": 978 + }, + { + "ce_ib": 7.063502311706543, + "ce_orig": 0.6859074234962463, + "epoch": 0.28154432381910993, + "kl_loss": 0.3865056037902832, + "loss_ib": 0.010928559117019176, + "step": 979 + }, + { + "ce_ib": 6.0811543464660645, + "ce_orig": 0.7244781255722046, + "epoch": 0.28154432381910993, + "kl_loss": 0.23856481909751892, + "loss_ib": 0.008466802537441254, + "step": 979 + }, + { + "ce_ib": 10.540010452270508, + "ce_orig": 1.1688413619995117, + "epoch": 0.28154432381910993, + "kl_loss": 0.35274291038513184, + "loss_ib": 0.01406743936240673, + "step": 979 + }, + { + "ce_ib": 5.423574447631836, + "ce_orig": 0.8129374980926514, + "epoch": 0.28154432381910993, + "kl_loss": 0.31833183765411377, + "loss_ib": 0.008606893010437489, + "step": 979 + }, + { + "epoch": 0.28183190739808756, + "grad_norm": 0.09410839527845383, + "learning_rate": 9.895420438411616e-06, + "loss": 0.8391, + "step": 980 + }, + { + "ce_ib": 4.933524131774902, + "ce_orig": 0.5870782732963562, + "epoch": 0.28183190739808756, + "kl_loss": 0.24300794303417206, + "loss_ib": 0.007363603450357914, + "step": 980 + }, + { + "ce_ib": 8.602434158325195, + "ce_orig": 0.7935175895690918, + "epoch": 0.28183190739808756, + "kl_loss": 0.28050893545150757, + "loss_ib": 0.011407522484660149, + "step": 980 + }, + { + "ce_ib": 5.863503456115723, + "ce_orig": 0.7190868258476257, + "epoch": 0.28183190739808756, + "kl_loss": 0.2712858319282532, + "loss_ib": 0.008576362393796444, + "step": 980 + }, + { + "ce_ib": 6.571584224700928, + "ce_orig": 0.8708552718162537, + "epoch": 0.28183190739808756, + "kl_loss": 0.20640678703784943, + "loss_ib": 0.008635652251541615, + "step": 980 + }, + { + "ce_ib": 7.416860580444336, + "ce_orig": 0.6324207186698914, + "epoch": 0.28211949097706523, + "kl_loss": 0.23176950216293335, + "loss_ib": 0.009734555147588253, + "step": 981 + }, + { + "ce_ib": 5.944385051727295, + "ce_orig": 0.7745913863182068, + "epoch": 0.28211949097706523, + "kl_loss": 0.25892847776412964, + "loss_ib": 0.008533669635653496, + "step": 981 + }, + { + "ce_ib": 5.516068458557129, + "ce_orig": 0.46634528040885925, + "epoch": 0.28211949097706523, + "kl_loss": 0.33703067898750305, + "loss_ib": 0.008886375464498997, + "step": 981 + }, + { + "ce_ib": 9.37726879119873, + "ce_orig": 0.8699996471405029, + "epoch": 0.28211949097706523, + "kl_loss": 0.333581805229187, + "loss_ib": 0.012713085860013962, + "step": 981 + }, + { + "ce_ib": 7.538808822631836, + "ce_orig": 0.9253389835357666, + "epoch": 0.28240707455604286, + "kl_loss": 0.3331226408481598, + "loss_ib": 0.010870035737752914, + "step": 982 + }, + { + "ce_ib": 8.707592010498047, + "ce_orig": 1.1699987649917603, + "epoch": 0.28240707455604286, + "kl_loss": 0.3539894223213196, + "loss_ib": 0.012247486039996147, + "step": 982 + }, + { + "ce_ib": 4.077365875244141, + "ce_orig": 0.5715107917785645, + "epoch": 0.28240707455604286, + "kl_loss": 0.33435067534446716, + "loss_ib": 0.007420872338116169, + "step": 982 + }, + { + "ce_ib": 10.470292091369629, + "ce_orig": 1.234960675239563, + "epoch": 0.28240707455604286, + "kl_loss": 0.3328559994697571, + "loss_ib": 0.013798851519823074, + "step": 982 + }, + { + "ce_ib": 12.779997825622559, + "ce_orig": 1.8746068477630615, + "epoch": 0.2826946581350205, + "kl_loss": 0.3426949381828308, + "loss_ib": 0.016206946223974228, + "step": 983 + }, + { + "ce_ib": 6.349981307983398, + "ce_orig": 0.5666931867599487, + "epoch": 0.2826946581350205, + "kl_loss": 0.3899381756782532, + "loss_ib": 0.010249363258481026, + "step": 983 + }, + { + "ce_ib": 5.248485088348389, + "ce_orig": 0.6375545859336853, + "epoch": 0.2826946581350205, + "kl_loss": 0.2220623940229416, + "loss_ib": 0.007469109259545803, + "step": 983 + }, + { + "ce_ib": 8.392010688781738, + "ce_orig": 1.1137949228286743, + "epoch": 0.2826946581350205, + "kl_loss": 0.24818137288093567, + "loss_ib": 0.01087382435798645, + "step": 983 + }, + { + "ce_ib": 7.158164024353027, + "ce_orig": 0.5367669463157654, + "epoch": 0.2829822417139981, + "kl_loss": 0.27173370122909546, + "loss_ib": 0.00987550150603056, + "step": 984 + }, + { + "ce_ib": 9.511956214904785, + "ce_orig": 1.0864530801773071, + "epoch": 0.2829822417139981, + "kl_loss": 0.2743532657623291, + "loss_ib": 0.012255489826202393, + "step": 984 + }, + { + "ce_ib": 4.440701484680176, + "ce_orig": 0.66915363073349, + "epoch": 0.2829822417139981, + "kl_loss": 0.2580929696559906, + "loss_ib": 0.007021630648523569, + "step": 984 + }, + { + "ce_ib": 7.797125339508057, + "ce_orig": 0.9739370346069336, + "epoch": 0.2829822417139981, + "kl_loss": 0.24688200652599335, + "loss_ib": 0.01026594452559948, + "step": 984 + }, + { + "epoch": 0.2832698252929758, + "grad_norm": 0.09766387939453125, + "learning_rate": 9.893835550056407e-06, + "loss": 0.8618, + "step": 985 + }, + { + "ce_ib": 7.013763904571533, + "ce_orig": 0.8495591282844543, + "epoch": 0.2832698252929758, + "kl_loss": 0.23701989650726318, + "loss_ib": 0.009383962489664555, + "step": 985 + }, + { + "ce_ib": 5.373435020446777, + "ce_orig": 0.6739266514778137, + "epoch": 0.2832698252929758, + "kl_loss": 0.2958618402481079, + "loss_ib": 0.008332053199410439, + "step": 985 + }, + { + "ce_ib": 8.126791000366211, + "ce_orig": 0.8309746384620667, + "epoch": 0.2832698252929758, + "kl_loss": 0.38277897238731384, + "loss_ib": 0.011954580433666706, + "step": 985 + }, + { + "ce_ib": 8.946346282958984, + "ce_orig": 1.2075648307800293, + "epoch": 0.2832698252929758, + "kl_loss": 0.3251242935657501, + "loss_ib": 0.012197589501738548, + "step": 985 + }, + { + "ce_ib": 9.519036293029785, + "ce_orig": 0.8708526492118835, + "epoch": 0.2835574088719534, + "kl_loss": 0.31902384757995605, + "loss_ib": 0.01270927395671606, + "step": 986 + }, + { + "ce_ib": 11.005833625793457, + "ce_orig": 1.3966689109802246, + "epoch": 0.2835574088719534, + "kl_loss": 0.39275312423706055, + "loss_ib": 0.014933365397155285, + "step": 986 + }, + { + "ce_ib": 8.268836975097656, + "ce_orig": 1.026828408241272, + "epoch": 0.2835574088719534, + "kl_loss": 0.25195300579071045, + "loss_ib": 0.010788366198539734, + "step": 986 + }, + { + "ce_ib": 10.45020580291748, + "ce_orig": 1.0152733325958252, + "epoch": 0.2835574088719534, + "kl_loss": 0.2754089832305908, + "loss_ib": 0.013204295188188553, + "step": 986 + }, + { + "ce_ib": 8.130716323852539, + "ce_orig": 0.5325186848640442, + "epoch": 0.28384499245093103, + "kl_loss": 0.4100872576236725, + "loss_ib": 0.012231589294970036, + "step": 987 + }, + { + "ce_ib": 6.4655256271362305, + "ce_orig": 0.6601911783218384, + "epoch": 0.28384499245093103, + "kl_loss": 0.2558635473251343, + "loss_ib": 0.009024160914123058, + "step": 987 + }, + { + "ce_ib": 6.571667194366455, + "ce_orig": 0.621316134929657, + "epoch": 0.28384499245093103, + "kl_loss": 0.23040370643138885, + "loss_ib": 0.008875704370439053, + "step": 987 + }, + { + "ce_ib": 6.913454532623291, + "ce_orig": 0.48008617758750916, + "epoch": 0.28384499245093103, + "kl_loss": 0.4269219636917114, + "loss_ib": 0.011182674206793308, + "step": 987 + }, + { + "ce_ib": 6.073386192321777, + "ce_orig": 0.7689921855926514, + "epoch": 0.2841325760299087, + "kl_loss": 0.33685237169265747, + "loss_ib": 0.00944190938025713, + "step": 988 + }, + { + "ce_ib": 5.702093601226807, + "ce_orig": 0.7048614025115967, + "epoch": 0.2841325760299087, + "kl_loss": 0.2855239510536194, + "loss_ib": 0.0085573336109519, + "step": 988 + }, + { + "ce_ib": 6.3464884757995605, + "ce_orig": 0.6249024868011475, + "epoch": 0.2841325760299087, + "kl_loss": 0.37955427169799805, + "loss_ib": 0.010142030194401741, + "step": 988 + }, + { + "ce_ib": 7.4441657066345215, + "ce_orig": 0.9428173303604126, + "epoch": 0.2841325760299087, + "kl_loss": 0.41461101174354553, + "loss_ib": 0.011590275913476944, + "step": 988 + }, + { + "ce_ib": 8.65250015258789, + "ce_orig": 1.1751656532287598, + "epoch": 0.28442015960888634, + "kl_loss": 0.34314173460006714, + "loss_ib": 0.01208391785621643, + "step": 989 + }, + { + "ce_ib": 10.522750854492188, + "ce_orig": 1.5159534215927124, + "epoch": 0.28442015960888634, + "kl_loss": 0.23152892291545868, + "loss_ib": 0.012838039547204971, + "step": 989 + }, + { + "ce_ib": 9.248775482177734, + "ce_orig": 1.2440454959869385, + "epoch": 0.28442015960888634, + "kl_loss": 0.3038747012615204, + "loss_ib": 0.012287522666156292, + "step": 989 + }, + { + "ce_ib": 9.64876651763916, + "ce_orig": 1.3754632472991943, + "epoch": 0.28442015960888634, + "kl_loss": 0.2952038645744324, + "loss_ib": 0.01260080561041832, + "step": 989 + }, + { + "epoch": 0.28470774318786396, + "grad_norm": 0.11360838264226913, + "learning_rate": 9.892238871315477e-06, + "loss": 0.9178, + "step": 990 + }, + { + "ce_ib": 6.687885761260986, + "ce_orig": 0.7147387862205505, + "epoch": 0.28470774318786396, + "kl_loss": 0.2835395932197571, + "loss_ib": 0.009523281827569008, + "step": 990 + }, + { + "ce_ib": 7.797086715698242, + "ce_orig": 0.8477333784103394, + "epoch": 0.28470774318786396, + "kl_loss": 0.2771601676940918, + "loss_ib": 0.010568687692284584, + "step": 990 + }, + { + "ce_ib": 7.964920520782471, + "ce_orig": 1.1867727041244507, + "epoch": 0.28470774318786396, + "kl_loss": 0.21424566209316254, + "loss_ib": 0.010107376612722874, + "step": 990 + }, + { + "ce_ib": 7.639345169067383, + "ce_orig": 0.9419063329696655, + "epoch": 0.28470774318786396, + "kl_loss": 0.3332253694534302, + "loss_ib": 0.010971598327159882, + "step": 990 + }, + { + "ce_ib": 9.816707611083984, + "ce_orig": 0.7999016046524048, + "epoch": 0.28499532676684164, + "kl_loss": 0.33083677291870117, + "loss_ib": 0.013125075958669186, + "step": 991 + }, + { + "ce_ib": 10.84311580657959, + "ce_orig": 1.1912862062454224, + "epoch": 0.28499532676684164, + "kl_loss": 0.28931328654289246, + "loss_ib": 0.013736248947679996, + "step": 991 + }, + { + "ce_ib": 7.672483921051025, + "ce_orig": 0.9036149978637695, + "epoch": 0.28499532676684164, + "kl_loss": 0.3275108337402344, + "loss_ib": 0.010947591625154018, + "step": 991 + }, + { + "ce_ib": 7.526676177978516, + "ce_orig": 0.8651123046875, + "epoch": 0.28499532676684164, + "kl_loss": 0.2458728551864624, + "loss_ib": 0.009985404089093208, + "step": 991 + }, + { + "ce_ib": 8.313544273376465, + "ce_orig": 1.2926881313323975, + "epoch": 0.28528291034581926, + "kl_loss": 0.32590439915657043, + "loss_ib": 0.011572588235139847, + "step": 992 + }, + { + "ce_ib": 9.292975425720215, + "ce_orig": 1.3791533708572388, + "epoch": 0.28528291034581926, + "kl_loss": 0.2774926722049713, + "loss_ib": 0.012067901901900768, + "step": 992 + }, + { + "ce_ib": 6.573873996734619, + "ce_orig": 0.4999292194843292, + "epoch": 0.28528291034581926, + "kl_loss": 0.25200730562210083, + "loss_ib": 0.009093946777284145, + "step": 992 + }, + { + "ce_ib": 6.635220050811768, + "ce_orig": 0.6851158142089844, + "epoch": 0.28528291034581926, + "kl_loss": 0.2963348627090454, + "loss_ib": 0.009598568081855774, + "step": 992 + }, + { + "ce_ib": 7.602895736694336, + "ce_orig": 0.9763492941856384, + "epoch": 0.2855704939247969, + "kl_loss": 0.24165207147598267, + "loss_ib": 0.01001941692084074, + "step": 993 + }, + { + "ce_ib": 4.053561210632324, + "ce_orig": 0.7590509057044983, + "epoch": 0.2855704939247969, + "kl_loss": 0.22461174428462982, + "loss_ib": 0.006299678701907396, + "step": 993 + }, + { + "ce_ib": 11.96536922454834, + "ce_orig": 1.5654411315917969, + "epoch": 0.2855704939247969, + "kl_loss": 0.34373822808265686, + "loss_ib": 0.015402751043438911, + "step": 993 + }, + { + "ce_ib": 5.362922668457031, + "ce_orig": 0.8094271421432495, + "epoch": 0.2855704939247969, + "kl_loss": 0.280230849981308, + "loss_ib": 0.008165230974555016, + "step": 993 + }, + { + "ce_ib": 5.077024936676025, + "ce_orig": 0.6099755167961121, + "epoch": 0.2858580775037745, + "kl_loss": 0.2580031752586365, + "loss_ib": 0.007657056674361229, + "step": 994 + }, + { + "ce_ib": 7.4664812088012695, + "ce_orig": 0.9805220365524292, + "epoch": 0.2858580775037745, + "kl_loss": 0.2251621037721634, + "loss_ib": 0.009718102402985096, + "step": 994 + }, + { + "ce_ib": 7.463787078857422, + "ce_orig": 0.7944656014442444, + "epoch": 0.2858580775037745, + "kl_loss": 0.20221929252147675, + "loss_ib": 0.009485980495810509, + "step": 994 + }, + { + "ce_ib": 9.706088066101074, + "ce_orig": 1.0263926982879639, + "epoch": 0.2858580775037745, + "kl_loss": 0.32880985736846924, + "loss_ib": 0.012994186952710152, + "step": 994 + }, + { + "epoch": 0.2861456610827522, + "grad_norm": 0.10201858729124069, + "learning_rate": 9.89063040603559e-06, + "loss": 0.9227, + "step": 995 + }, + { + "ce_ib": 9.292113304138184, + "ce_orig": 1.1094127893447876, + "epoch": 0.2861456610827522, + "kl_loss": 0.3149503469467163, + "loss_ib": 0.01244161557406187, + "step": 995 + }, + { + "ce_ib": 11.187474250793457, + "ce_orig": 0.847707986831665, + "epoch": 0.2861456610827522, + "kl_loss": 0.28196096420288086, + "loss_ib": 0.0140070840716362, + "step": 995 + }, + { + "ce_ib": 5.9683427810668945, + "ce_orig": 0.7678855657577515, + "epoch": 0.2861456610827522, + "kl_loss": 0.24694621562957764, + "loss_ib": 0.008437804877758026, + "step": 995 + }, + { + "ce_ib": 4.204360008239746, + "ce_orig": 0.5550284385681152, + "epoch": 0.2861456610827522, + "kl_loss": 0.33598631620407104, + "loss_ib": 0.007564222440123558, + "step": 995 + }, + { + "ce_ib": 4.783699035644531, + "ce_orig": 0.5354413390159607, + "epoch": 0.2864332446617298, + "kl_loss": 0.3188742995262146, + "loss_ib": 0.007972441613674164, + "step": 996 + }, + { + "ce_ib": 7.208334445953369, + "ce_orig": 0.8125985860824585, + "epoch": 0.2864332446617298, + "kl_loss": 0.2528786063194275, + "loss_ib": 0.009737120941281319, + "step": 996 + }, + { + "ce_ib": 8.211366653442383, + "ce_orig": 0.8897217512130737, + "epoch": 0.2864332446617298, + "kl_loss": 0.251012921333313, + "loss_ib": 0.010721495375037193, + "step": 996 + }, + { + "ce_ib": 7.538112163543701, + "ce_orig": 0.8684660196304321, + "epoch": 0.2864332446617298, + "kl_loss": 0.24459782242774963, + "loss_ib": 0.009984089992940426, + "step": 996 + }, + { + "ce_ib": 5.27274751663208, + "ce_orig": 0.6079943776130676, + "epoch": 0.28672082824070744, + "kl_loss": 0.36212387681007385, + "loss_ib": 0.008893987163901329, + "step": 997 + }, + { + "ce_ib": 3.044394016265869, + "ce_orig": 0.46399974822998047, + "epoch": 0.28672082824070744, + "kl_loss": 0.26355600357055664, + "loss_ib": 0.005679954309016466, + "step": 997 + }, + { + "ce_ib": 6.704494476318359, + "ce_orig": 0.8144213557243347, + "epoch": 0.28672082824070744, + "kl_loss": 0.23444503545761108, + "loss_ib": 0.00904894433915615, + "step": 997 + }, + { + "ce_ib": 8.70203971862793, + "ce_orig": 1.2462023496627808, + "epoch": 0.28672082824070744, + "kl_loss": 0.21362106502056122, + "loss_ib": 0.010838249698281288, + "step": 997 + }, + { + "ce_ib": 6.0318922996521, + "ce_orig": 0.5330185294151306, + "epoch": 0.2870084118196851, + "kl_loss": 0.3407291769981384, + "loss_ib": 0.00943918339908123, + "step": 998 + }, + { + "ce_ib": 8.035571098327637, + "ce_orig": 1.056720495223999, + "epoch": 0.2870084118196851, + "kl_loss": 0.23358367383480072, + "loss_ib": 0.010371407493948936, + "step": 998 + }, + { + "ce_ib": 8.166302680969238, + "ce_orig": 0.6143516898155212, + "epoch": 0.2870084118196851, + "kl_loss": 0.29871490597724915, + "loss_ib": 0.011153452098369598, + "step": 998 + }, + { + "ce_ib": 8.71125602722168, + "ce_orig": 1.1870672702789307, + "epoch": 0.2870084118196851, + "kl_loss": 0.2737518548965454, + "loss_ib": 0.011448774486780167, + "step": 998 + }, + { + "ce_ib": 5.139594078063965, + "ce_orig": 0.5000967383384705, + "epoch": 0.28729599539866274, + "kl_loss": 0.5653914213180542, + "loss_ib": 0.010793508030474186, + "step": 999 + }, + { + "ce_ib": 6.412952899932861, + "ce_orig": 0.9104073643684387, + "epoch": 0.28729599539866274, + "kl_loss": 0.28137922286987305, + "loss_ib": 0.009226744994521141, + "step": 999 + }, + { + "ce_ib": 9.592132568359375, + "ce_orig": 0.7200940251350403, + "epoch": 0.28729599539866274, + "kl_loss": 0.3655579090118408, + "loss_ib": 0.013247711583971977, + "step": 999 + }, + { + "ce_ib": 8.952437400817871, + "ce_orig": 1.0639101266860962, + "epoch": 0.28729599539866274, + "kl_loss": 0.3098136782646179, + "loss_ib": 0.012050573714077473, + "step": 999 + }, + { + "epoch": 0.28758357897764036, + "grad_norm": 0.10075593739748001, + "learning_rate": 9.889010158091917e-06, + "loss": 0.92, + "step": 1000 + } + ], + "logging_steps": 5, + "max_steps": 10434, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}