diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16049 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "ce_loss_13": 11.519832849502563, + "ce_loss_26": 11.473536491394043, + "ce_loss_39": 11.263565063476562, + "ce_loss_52": 1.3852829337120056, + "ce_loss_7": 11.56409740447998, + "epoch": 0.0001, + "grad_norm": 22.293954988093517, + "kl_loss_13": 20864.0, + "kl_loss_26": 20736.0, + "kl_loss_39": 20320.0, + "kl_loss_7": 20992.0, + "learning_rate": 1e-05, + "loss": 41440.0, + "step": 1 + }, + { + "ce_loss_13": 11.513921552234226, + "ce_loss_26": 11.469077931510078, + "ce_loss_39": 11.246019893222385, + "ce_loss_52": 1.4558950497044458, + "ce_loss_7": 11.55848307079739, + "epoch": 0.001, + "grad_norm": 23.161174410395507, + "kl_loss_13": 20800.0, + "kl_loss_26": 20696.88888888889, + "kl_loss_39": 20227.555555555555, + "kl_loss_7": 20881.777777777777, + "learning_rate": 0.0001, + "loss": 41384.0, + "step": 10 + }, + { + "ce_loss_13": 11.426759386062622, + "ce_loss_26": 11.41293363571167, + "ce_loss_39": 11.229010510444642, + "ce_loss_52": 1.4324860751628876, + "ce_loss_7": 11.462353825569153, + "epoch": 0.002, + "grad_norm": 38.50857397395853, + "kl_loss_13": 20668.8, + "kl_loss_26": 20640.0, + "kl_loss_39": 20256.0, + "kl_loss_7": 20745.6, + "learning_rate": 0.0002, + "loss": 41179.2, + "step": 20 + }, + { + "ce_loss_13": 10.954976797103882, + "ce_loss_26": 11.087984251976014, + "ce_loss_39": 11.10265805721283, + "ce_loss_52": 1.4276391446590424, + "ce_loss_7": 10.940680837631225, + "epoch": 0.003, + "grad_norm": 58.08626567131467, + "kl_loss_13": 19702.4, + "kl_loss_26": 19977.6, + "kl_loss_39": 20028.8, + "kl_loss_7": 19680.0, + "learning_rate": 0.0003, + "loss": 39668.0, + "step": 30 + }, + { + "ce_loss_13": 10.308717799186706, + "ce_loss_26": 10.375125074386597, + "ce_loss_39": 10.562542247772218, + "ce_loss_52": 1.455844309926033, + "ce_loss_7": 10.312761902809143, + "epoch": 0.004, + "grad_norm": 30.451988937114738, + "kl_loss_13": 18307.2, + "kl_loss_26": 18438.4, + "kl_loss_39": 18832.0, + "kl_loss_7": 18313.6, + "learning_rate": 0.0004, + "loss": 36999.2, + "step": 40 + }, + { + "ce_loss_13": 10.173566651344299, + "ce_loss_26": 10.188505339622498, + "ce_loss_39": 10.173172044754029, + "ce_loss_52": 1.4577810317277908, + "ce_loss_7": 10.182775902748109, + "epoch": 0.005, + "grad_norm": 37.851028798241174, + "kl_loss_13": 18006.4, + "kl_loss_26": 18028.8, + "kl_loss_39": 18012.8, + "kl_loss_7": 18022.4, + "learning_rate": 0.0005, + "loss": 36191.2, + "step": 50 + }, + { + "ce_loss_13": 10.071360087394714, + "ce_loss_26": 10.092596936225892, + "ce_loss_39": 10.072972583770753, + "ce_loss_52": 1.428243064880371, + "ce_loss_7": 10.105072927474975, + "epoch": 0.006, + "grad_norm": 45.98715921867029, + "kl_loss_13": 17872.0, + "kl_loss_26": 17907.2, + "kl_loss_39": 17878.4, + "kl_loss_7": 17936.0, + "learning_rate": 0.0006, + "loss": 35728.0, + "step": 60 + }, + { + "ce_loss_13": 9.994266033172607, + "ce_loss_26": 10.000447010993957, + "ce_loss_39": 9.958984637260437, + "ce_loss_52": 1.392430166900158, + "ce_loss_7": 10.041595196723938, + "epoch": 0.007, + "grad_norm": 53.56384816487026, + "kl_loss_13": 17750.4, + "kl_loss_26": 17763.2, + "kl_loss_39": 17667.2, + "kl_loss_7": 17846.4, + "learning_rate": 0.0007, + "loss": 35411.2, + "step": 70 + }, + { + "ce_loss_13": 9.870160865783692, + "ce_loss_26": 9.870406699180602, + "ce_loss_39": 9.815650677680969, + "ce_loss_52": 1.4188331544399262, + "ce_loss_7": 9.92598659992218, + "epoch": 0.008, + "grad_norm": 58.363906192597035, + "kl_loss_13": 17475.2, + "kl_loss_26": 17484.8, + "kl_loss_39": 17366.4, + "kl_loss_7": 17587.2, + "learning_rate": 0.0008, + "loss": 35010.4, + "step": 80 + }, + { + "ce_loss_13": 9.786613607406617, + "ce_loss_26": 9.77514407634735, + "ce_loss_39": 9.697125172615051, + "ce_loss_52": 1.4261163920164108, + "ce_loss_7": 9.84011538028717, + "epoch": 0.009, + "grad_norm": 57.597510936184484, + "kl_loss_13": 17267.2, + "kl_loss_26": 17232.0, + "kl_loss_39": 17065.6, + "kl_loss_7": 17376.0, + "learning_rate": 0.0009000000000000001, + "loss": 34545.6, + "step": 90 + }, + { + "ce_loss_13": 9.70496118068695, + "ce_loss_26": 9.680623888969421, + "ce_loss_39": 9.57672963142395, + "ce_loss_52": 1.4332450866699218, + "ce_loss_7": 9.75693221092224, + "epoch": 0.01, + "grad_norm": 56.92102830978135, + "kl_loss_13": 17075.2, + "kl_loss_26": 17030.4, + "kl_loss_39": 16814.4, + "kl_loss_7": 17187.2, + "learning_rate": 0.001, + "loss": 34141.6, + "step": 100 + }, + { + "ce_loss_13": 9.62608094215393, + "ce_loss_26": 9.584086346626282, + "ce_loss_39": 9.469242024421693, + "ce_loss_52": 1.4119121626019477, + "ce_loss_7": 9.681734418869018, + "epoch": 0.011, + "grad_norm": 55.54751251170693, + "kl_loss_13": 16956.8, + "kl_loss_26": 16856.0, + "kl_loss_39": 16632.0, + "kl_loss_7": 17075.2, + "learning_rate": 0.0009999974825027757, + "loss": 33673.2, + "step": 110 + }, + { + "ce_loss_13": 9.557737636566163, + "ce_loss_26": 9.502408647537232, + "ce_loss_39": 9.373179388046264, + "ce_loss_52": 1.420964427292347, + "ce_loss_7": 9.613711476325989, + "epoch": 0.012, + "grad_norm": 55.463113229450904, + "kl_loss_13": 16777.6, + "kl_loss_26": 16667.2, + "kl_loss_39": 16393.6, + "kl_loss_7": 16905.6, + "learning_rate": 0.0009999899300364532, + "loss": 33335.2, + "step": 120 + }, + { + "ce_loss_13": 9.474759387969971, + "ce_loss_26": 9.408789944648742, + "ce_loss_39": 9.266374969482422, + "ce_loss_52": 1.4124270409345627, + "ce_loss_7": 9.538655042648315, + "epoch": 0.013, + "grad_norm": 54.241543470395335, + "kl_loss_13": 16628.8, + "kl_loss_26": 16492.8, + "kl_loss_39": 16193.6, + "kl_loss_7": 16766.4, + "learning_rate": 0.0009999773426770863, + "loss": 32999.6, + "step": 130 + }, + { + "ce_loss_13": 9.420424246788025, + "ce_loss_26": 9.348571801185608, + "ce_loss_39": 9.1972074508667, + "ce_loss_52": 1.4392782002687454, + "ce_loss_7": 9.492741465568542, + "epoch": 0.014, + "grad_norm": 54.10933362933205, + "kl_loss_13": 16476.8, + "kl_loss_26": 16324.8, + "kl_loss_39": 16014.4, + "kl_loss_7": 16638.4, + "learning_rate": 0.0009999597205514296, + "loss": 32751.6, + "step": 140 + }, + { + "ce_loss_13": 9.388373732566833, + "ce_loss_26": 9.308008575439453, + "ce_loss_39": 9.153336524963379, + "ce_loss_52": 1.4420859813690186, + "ce_loss_7": 9.46445541381836, + "epoch": 0.015, + "grad_norm": 55.15236350542743, + "kl_loss_13": 16382.4, + "kl_loss_26": 16219.2, + "kl_loss_39": 15888.0, + "kl_loss_7": 16542.4, + "learning_rate": 0.0009999370638369377, + "loss": 32525.2, + "step": 150 + }, + { + "ce_loss_13": 9.301919007301331, + "ce_loss_26": 9.212661600112915, + "ce_loss_39": 9.050238633155823, + "ce_loss_52": 1.4233157366514206, + "ce_loss_7": 9.383031058311463, + "epoch": 0.016, + "grad_norm": 55.03653973388566, + "kl_loss_13": 16278.4, + "kl_loss_26": 16092.8, + "kl_loss_39": 15755.2, + "kl_loss_7": 16440.0, + "learning_rate": 0.000999909372761763, + "loss": 32209.6, + "step": 160 + }, + { + "ce_loss_13": 9.24642186164856, + "ce_loss_26": 9.14689018726349, + "ce_loss_39": 8.978599190711975, + "ce_loss_52": 1.429112258553505, + "ce_loss_7": 9.331455826759338, + "epoch": 0.017, + "grad_norm": 54.90625528920335, + "kl_loss_13": 16142.4, + "kl_loss_26": 15931.2, + "kl_loss_39": 15580.8, + "kl_loss_7": 16315.2, + "learning_rate": 0.0009998766476047546, + "loss": 31964.8, + "step": 170 + }, + { + "ce_loss_13": 9.187117385864259, + "ce_loss_26": 9.076116013526917, + "ce_loss_39": 8.902227759361267, + "ce_loss_52": 1.3885775536298752, + "ce_loss_7": 9.275272035598755, + "epoch": 0.018, + "grad_norm": 54.60426962776646, + "kl_loss_13": 16072.0, + "kl_loss_26": 15844.8, + "kl_loss_39": 15480.0, + "kl_loss_7": 16262.4, + "learning_rate": 0.0009998388886954545, + "loss": 31645.2, + "step": 180 + }, + { + "ce_loss_13": 9.131648278236389, + "ce_loss_26": 9.008217167854308, + "ce_loss_39": 8.831042790412903, + "ce_loss_52": 1.4482133895158769, + "ce_loss_7": 9.224077129364014, + "epoch": 0.019, + "grad_norm": 53.93299711922953, + "kl_loss_13": 15870.4, + "kl_loss_26": 15609.6, + "kl_loss_39": 15232.0, + "kl_loss_7": 16067.2, + "learning_rate": 0.0009997960964140947, + "loss": 31408.4, + "step": 190 + }, + { + "ce_loss_13": 9.050732731819153, + "ce_loss_26": 8.918284726142883, + "ce_loss_39": 8.738019919395446, + "ce_loss_52": 1.4300477087497712, + "ce_loss_7": 9.145042276382446, + "epoch": 0.02, + "grad_norm": 53.732589384741736, + "kl_loss_13": 15728.0, + "kl_loss_26": 15449.6, + "kl_loss_39": 15064.0, + "kl_loss_7": 15928.0, + "learning_rate": 0.0009997482711915926, + "loss": 31145.6, + "step": 200 + }, + { + "ce_loss_13": 8.988229060173035, + "ce_loss_26": 8.844111323356628, + "ce_loss_39": 8.654908394813537, + "ce_loss_52": 1.4580651924014092, + "ce_loss_7": 9.090379095077514, + "epoch": 0.021, + "grad_norm": 53.04072542826613, + "kl_loss_13": 15550.4, + "kl_loss_26": 15251.2, + "kl_loss_39": 14854.4, + "kl_loss_7": 15771.2, + "learning_rate": 0.0009996954135095479, + "loss": 30853.2, + "step": 210 + }, + { + "ce_loss_13": 8.945707607269288, + "ce_loss_26": 8.79644329547882, + "ce_loss_39": 8.601721858978271, + "ce_loss_52": 1.4154451981186866, + "ce_loss_7": 9.052277135849, + "epoch": 0.022, + "grad_norm": 53.309057416596275, + "kl_loss_13": 15544.0, + "kl_loss_26": 15219.2, + "kl_loss_39": 14809.6, + "kl_loss_7": 15761.6, + "learning_rate": 0.0009996375239002368, + "loss": 30606.8, + "step": 220 + }, + { + "ce_loss_13": 8.933341026306152, + "ce_loss_26": 8.773744869232178, + "ce_loss_39": 8.57140781879425, + "ce_loss_52": 1.4167777329683304, + "ce_loss_7": 9.04187982082367, + "epoch": 0.023, + "grad_norm": 53.383646235412414, + "kl_loss_13": 15486.4, + "kl_loss_26": 15156.8, + "kl_loss_39": 14726.4, + "kl_loss_7": 15721.6, + "learning_rate": 0.0009995746029466072, + "loss": 30406.4, + "step": 230 + }, + { + "ce_loss_13": 8.869291019439697, + "ce_loss_26": 8.693119740486145, + "ce_loss_39": 8.476832008361816, + "ce_loss_52": 1.4153499186038971, + "ce_loss_7": 8.980035424232483, + "epoch": 0.024, + "grad_norm": 52.917730504652305, + "kl_loss_13": 15353.6, + "kl_loss_26": 14985.6, + "kl_loss_39": 14528.0, + "kl_loss_7": 15588.8, + "learning_rate": 0.0009995066512822719, + "loss": 30148.4, + "step": 240 + }, + { + "ce_loss_13": 8.81713101863861, + "ce_loss_26": 8.636789417266845, + "ce_loss_39": 8.412476801872254, + "ce_loss_52": 1.4529948115348816, + "ce_loss_7": 8.929914593696594, + "epoch": 0.025, + "grad_norm": 53.98592158305785, + "kl_loss_13": 15196.8, + "kl_loss_26": 14820.8, + "kl_loss_39": 14340.8, + "kl_loss_7": 15438.4, + "learning_rate": 0.000999433669591504, + "loss": 29860.8, + "step": 250 + }, + { + "ce_loss_13": 8.748036527633667, + "ce_loss_26": 8.558162140846253, + "ce_loss_39": 8.331628108024598, + "ce_loss_52": 1.4311662405729293, + "ce_loss_7": 8.864733743667603, + "epoch": 0.026, + "grad_norm": 52.415587650337294, + "kl_loss_13": 15088.0, + "kl_loss_26": 14683.2, + "kl_loss_39": 14200.0, + "kl_loss_7": 15329.6, + "learning_rate": 0.000999355658609228, + "loss": 29636.0, + "step": 260 + }, + { + "ce_loss_13": 8.692949771881104, + "ce_loss_26": 8.494869589805603, + "ce_loss_39": 8.259693837165832, + "ce_loss_52": 1.4384984374046326, + "ce_loss_7": 8.814060854911805, + "epoch": 0.027, + "grad_norm": 53.356303831580306, + "kl_loss_13": 14976.0, + "kl_loss_26": 14555.2, + "kl_loss_39": 14054.4, + "kl_loss_7": 15230.4, + "learning_rate": 0.0009992726191210138, + "loss": 29438.0, + "step": 270 + }, + { + "ce_loss_13": 8.67082085609436, + "ce_loss_26": 8.463814663887025, + "ce_loss_39": 8.215038228034974, + "ce_loss_52": 1.4267250567674636, + "ce_loss_7": 8.794116616249084, + "epoch": 0.028, + "grad_norm": 52.94359037736481, + "kl_loss_13": 14902.4, + "kl_loss_26": 14476.8, + "kl_loss_39": 13944.0, + "kl_loss_7": 15174.4, + "learning_rate": 0.0009991845519630679, + "loss": 29276.4, + "step": 280 + }, + { + "ce_loss_13": 8.61563618183136, + "ce_loss_26": 8.402152299880981, + "ce_loss_39": 8.144541609287263, + "ce_loss_52": 1.4250373497605324, + "ce_loss_7": 8.742781138420105, + "epoch": 0.029, + "grad_norm": 51.566336997103136, + "kl_loss_13": 14817.6, + "kl_loss_26": 14369.6, + "kl_loss_39": 13816.0, + "kl_loss_7": 15089.6, + "learning_rate": 0.0009990914580222257, + "loss": 29010.0, + "step": 290 + }, + { + "ce_loss_13": 8.572561240196228, + "ce_loss_26": 8.35531551837921, + "ce_loss_39": 8.092759764194488, + "ce_loss_52": 1.4600662559270858, + "ce_loss_7": 8.698393726348877, + "epoch": 0.03, + "grad_norm": 53.1076582900563, + "kl_loss_13": 14704.0, + "kl_loss_26": 14251.2, + "kl_loss_39": 13689.6, + "kl_loss_7": 14974.4, + "learning_rate": 0.0009989933382359422, + "loss": 28776.8, + "step": 300 + }, + { + "ce_loss_13": 8.491887974739075, + "ce_loss_26": 8.263946199417115, + "ce_loss_39": 7.987871313095093, + "ce_loss_52": 1.4451197743415833, + "ce_loss_7": 8.625923323631287, + "epoch": 0.031, + "grad_norm": 52.58460107915946, + "kl_loss_13": 14537.6, + "kl_loss_26": 14054.4, + "kl_loss_39": 13464.0, + "kl_loss_7": 14825.6, + "learning_rate": 0.0009988901935922825, + "loss": 28548.4, + "step": 310 + }, + { + "ce_loss_13": 8.474869418144227, + "ce_loss_26": 8.245952117443085, + "ce_loss_39": 7.974157309532165, + "ce_loss_52": 1.4602727562189102, + "ce_loss_7": 8.610798478126526, + "epoch": 0.032, + "grad_norm": 52.331617741046635, + "kl_loss_13": 14480.0, + "kl_loss_26": 14003.2, + "kl_loss_39": 13427.2, + "kl_loss_7": 14760.0, + "learning_rate": 0.0009987820251299122, + "loss": 28364.4, + "step": 320 + }, + { + "ce_loss_13": 8.44498426914215, + "ce_loss_26": 8.20645843744278, + "ce_loss_39": 7.912872779369354, + "ce_loss_52": 1.4554857224225999, + "ce_loss_7": 8.583872628211974, + "epoch": 0.033, + "grad_norm": 50.87832937020315, + "kl_loss_13": 14411.2, + "kl_loss_26": 13900.8, + "kl_loss_39": 13281.6, + "kl_loss_7": 14700.8, + "learning_rate": 0.0009986688339380862, + "loss": 28109.2, + "step": 330 + }, + { + "ce_loss_13": 8.38349392414093, + "ce_loss_26": 8.133478546142578, + "ce_loss_39": 7.828318297863007, + "ce_loss_52": 1.425998830795288, + "ce_loss_7": 8.525882768630982, + "epoch": 0.034, + "grad_norm": 51.153122440976325, + "kl_loss_13": 14328.0, + "kl_loss_26": 13809.6, + "kl_loss_39": 13163.2, + "kl_loss_7": 14633.6, + "learning_rate": 0.0009985506211566387, + "loss": 27878.0, + "step": 340 + }, + { + "ce_loss_13": 8.349605464935303, + "ce_loss_26": 8.097514569759369, + "ce_loss_39": 7.785721278190612, + "ce_loss_52": 1.4315812528133391, + "ce_loss_7": 8.496586155891418, + "epoch": 0.035, + "grad_norm": 51.20256734387486, + "kl_loss_13": 14254.4, + "kl_loss_26": 13721.6, + "kl_loss_39": 13062.4, + "kl_loss_7": 14561.6, + "learning_rate": 0.0009984273879759713, + "loss": 27693.2, + "step": 350 + }, + { + "ce_loss_13": 8.273482608795167, + "ce_loss_26": 8.020016944408416, + "ce_loss_39": 7.711479115486145, + "ce_loss_52": 1.4499147981405258, + "ce_loss_7": 8.422512984275818, + "epoch": 0.036, + "grad_norm": 52.12411775413645, + "kl_loss_13": 14088.0, + "kl_loss_26": 13556.8, + "kl_loss_39": 12892.8, + "kl_loss_7": 14403.2, + "learning_rate": 0.0009982991356370402, + "loss": 27442.0, + "step": 360 + }, + { + "ce_loss_13": 8.215481567382813, + "ce_loss_26": 7.953275382518768, + "ce_loss_39": 7.628855121135712, + "ce_loss_52": 1.411750042438507, + "ce_loss_7": 8.368350863456726, + "epoch": 0.037, + "grad_norm": 51.0215332330724, + "kl_loss_13": 14024.0, + "kl_loss_26": 13470.4, + "kl_loss_39": 12779.2, + "kl_loss_7": 14348.8, + "learning_rate": 0.0009981658654313456, + "loss": 27348.0, + "step": 370 + }, + { + "ce_loss_13": 8.217882227897643, + "ce_loss_26": 7.9462348341941835, + "ce_loss_39": 7.613846385478974, + "ce_loss_52": 1.4831970453262329, + "ce_loss_7": 8.373853397369384, + "epoch": 0.038, + "grad_norm": 50.4571010346743, + "kl_loss_13": 13913.6, + "kl_loss_26": 13345.6, + "kl_loss_39": 12646.4, + "kl_loss_7": 14243.2, + "learning_rate": 0.000998027578700917, + "loss": 27082.8, + "step": 380 + }, + { + "ce_loss_13": 8.11286985874176, + "ce_loss_26": 7.831897294521331, + "ce_loss_39": 7.493152487277984, + "ce_loss_52": 1.4128454998135567, + "ce_loss_7": 8.275482225418092, + "epoch": 0.039, + "grad_norm": 51.75236464910614, + "kl_loss_13": 13798.4, + "kl_loss_26": 13212.8, + "kl_loss_39": 12491.2, + "kl_loss_7": 14139.2, + "learning_rate": 0.0009978842768382998, + "loss": 26835.6, + "step": 390 + }, + { + "ce_loss_13": 8.092873919010163, + "ce_loss_26": 7.80686913728714, + "ce_loss_39": 7.462458717823028, + "ce_loss_52": 1.449526023864746, + "ce_loss_7": 8.253998827934264, + "epoch": 0.04, + "grad_norm": 50.588198000151046, + "kl_loss_13": 13680.0, + "kl_loss_26": 13083.2, + "kl_loss_39": 12355.2, + "kl_loss_7": 14019.2, + "learning_rate": 0.0009977359612865424, + "loss": 26670.0, + "step": 400 + }, + { + "ce_loss_13": 8.073025333881379, + "ce_loss_26": 7.792924261093139, + "ce_loss_39": 7.448240423202515, + "ce_loss_52": 1.4590945556759833, + "ce_loss_7": 8.234287071228028, + "epoch": 0.041, + "grad_norm": 50.388842117598266, + "kl_loss_13": 13630.4, + "kl_loss_26": 13032.0, + "kl_loss_39": 12302.4, + "kl_loss_7": 13969.6, + "learning_rate": 0.0009975826335391806, + "loss": 26457.2, + "step": 410 + }, + { + "ce_loss_13": 7.959017169475556, + "ce_loss_26": 7.667092227935791, + "ce_loss_39": 7.308180010318756, + "ce_loss_52": 1.3903952419757843, + "ce_loss_7": 8.129511964321136, + "epoch": 0.042, + "grad_norm": 50.86911767207133, + "kl_loss_13": 13540.8, + "kl_loss_26": 12916.8, + "kl_loss_39": 12163.2, + "kl_loss_7": 13888.0, + "learning_rate": 0.0009974242951402235, + "loss": 26197.6, + "step": 420 + }, + { + "ce_loss_13": 7.940323996543884, + "ce_loss_26": 7.629641830921173, + "ce_loss_39": 7.2696495175361635, + "ce_loss_52": 1.4528310179710389, + "ce_loss_7": 8.112548959255218, + "epoch": 0.043, + "grad_norm": 49.24163454624527, + "kl_loss_13": 13374.4, + "kl_loss_26": 12728.0, + "kl_loss_39": 11958.4, + "kl_loss_7": 13744.0, + "learning_rate": 0.0009972609476841367, + "loss": 25992.4, + "step": 430 + }, + { + "ce_loss_13": 7.8985715508461, + "ce_loss_26": 7.592843997478485, + "ce_loss_39": 7.205817592144013, + "ce_loss_52": 1.4196556687355042, + "ce_loss_7": 8.078336155414581, + "epoch": 0.044, + "grad_norm": 49.811998634305894, + "kl_loss_13": 13345.6, + "kl_loss_26": 12699.2, + "kl_loss_39": 11881.6, + "kl_loss_7": 13718.4, + "learning_rate": 0.0009970925928158272, + "loss": 25854.4, + "step": 440 + }, + { + "ce_loss_13": 7.867163848876953, + "ce_loss_26": 7.55372383594513, + "ce_loss_39": 7.173475623130798, + "ce_loss_52": 1.4358820408582686, + "ce_loss_7": 8.0431494474411, + "epoch": 0.045, + "grad_norm": 48.311473749243106, + "kl_loss_13": 13248.0, + "kl_loss_26": 12595.2, + "kl_loss_39": 11792.0, + "kl_loss_7": 13614.4, + "learning_rate": 0.000996919232230627, + "loss": 25620.0, + "step": 450 + }, + { + "ce_loss_13": 7.794478893280029, + "ce_loss_26": 7.470960378646851, + "ce_loss_39": 7.075640022754669, + "ce_loss_52": 1.4313764542341232, + "ce_loss_7": 7.97986272573471, + "epoch": 0.046, + "grad_norm": 49.75600981611632, + "kl_loss_13": 13113.6, + "kl_loss_26": 12433.6, + "kl_loss_39": 11588.8, + "kl_loss_7": 13496.0, + "learning_rate": 0.0009967408676742752, + "loss": 25367.2, + "step": 460 + }, + { + "ce_loss_13": 7.780554842948914, + "ce_loss_26": 7.450528597831726, + "ce_loss_39": 7.06024489402771, + "ce_loss_52": 1.4271526962518692, + "ce_loss_7": 7.96198604106903, + "epoch": 0.047, + "grad_norm": 49.73198686766462, + "kl_loss_13": 13089.6, + "kl_loss_26": 12406.4, + "kl_loss_39": 11576.0, + "kl_loss_7": 13475.2, + "learning_rate": 0.0009965575009429006, + "loss": 25186.4, + "step": 470 + }, + { + "ce_loss_13": 7.771422934532166, + "ce_loss_26": 7.443638646602631, + "ce_loss_39": 7.045053339004516, + "ce_loss_52": 1.4691366642713546, + "ce_loss_7": 7.956920957565307, + "epoch": 0.048, + "grad_norm": 48.6898889709016, + "kl_loss_13": 12971.2, + "kl_loss_26": 12286.4, + "kl_loss_39": 11440.0, + "kl_loss_7": 13366.4, + "learning_rate": 0.0009963691338830043, + "loss": 25028.4, + "step": 480 + }, + { + "ce_loss_13": 7.7170240640640255, + "ce_loss_26": 7.3867839813232425, + "ce_loss_39": 6.986623299121857, + "ce_loss_52": 1.4700770109891892, + "ce_loss_7": 7.900947248935699, + "epoch": 0.049, + "grad_norm": 47.968754476102596, + "kl_loss_13": 12884.8, + "kl_loss_26": 12195.2, + "kl_loss_39": 11332.8, + "kl_loss_7": 13273.6, + "learning_rate": 0.0009961757683914405, + "loss": 24808.8, + "step": 490 + }, + { + "ce_loss_13": 7.612575709819794, + "ce_loss_26": 7.270983147621155, + "ce_loss_39": 6.851053369045258, + "ce_loss_52": 1.4072588831186295, + "ce_loss_7": 7.807804656028748, + "epoch": 0.05, + "grad_norm": 49.18975121944083, + "kl_loss_13": 12780.8, + "kl_loss_26": 12060.8, + "kl_loss_39": 11161.6, + "kl_loss_7": 13190.4, + "learning_rate": 0.0009959774064153978, + "loss": 24615.6, + "step": 500 + }, + { + "ce_loss_13": 7.6113405585289, + "ce_loss_26": 7.257396864891052, + "ce_loss_39": 6.8364926934242245, + "ce_loss_52": 1.405586513876915, + "ce_loss_7": 7.807830440998077, + "epoch": 0.051, + "grad_norm": 48.36038036613293, + "kl_loss_13": 12753.6, + "kl_loss_26": 12017.6, + "kl_loss_39": 11121.6, + "kl_loss_7": 13161.6, + "learning_rate": 0.0009957740499523787, + "loss": 24452.0, + "step": 510 + }, + { + "ce_loss_13": 7.562924301624298, + "ce_loss_26": 7.205751180648804, + "ce_loss_39": 6.7729793906211855, + "ce_loss_52": 1.441327565908432, + "ce_loss_7": 7.762964737415314, + "epoch": 0.052, + "grad_norm": 48.52091531249349, + "kl_loss_13": 12577.6, + "kl_loss_26": 11833.6, + "kl_loss_39": 10912.0, + "kl_loss_7": 12990.4, + "learning_rate": 0.0009955657010501807, + "loss": 24214.4, + "step": 520 + }, + { + "ce_loss_13": 7.5027553796768185, + "ce_loss_26": 7.149892139434814, + "ce_loss_39": 6.725725698471069, + "ce_loss_52": 1.4616976886987687, + "ce_loss_7": 7.700168478488922, + "epoch": 0.053, + "grad_norm": 47.609892122251686, + "kl_loss_13": 12451.2, + "kl_loss_26": 11710.4, + "kl_loss_39": 10804.8, + "kl_loss_7": 12872.0, + "learning_rate": 0.000995352361806875, + "loss": 24037.2, + "step": 530 + }, + { + "ce_loss_13": 7.525733006000519, + "ce_loss_26": 7.1605717778205875, + "ce_loss_39": 6.703774988651276, + "ce_loss_52": 1.42885320186615, + "ce_loss_7": 7.727836930751801, + "epoch": 0.054, + "grad_norm": 47.0111007198644, + "kl_loss_13": 12556.8, + "kl_loss_26": 11790.4, + "kl_loss_39": 10828.8, + "kl_loss_7": 12976.0, + "learning_rate": 0.0009951340343707852, + "loss": 23845.2, + "step": 540 + }, + { + "ce_loss_13": 7.423776483535766, + "ce_loss_26": 7.0557411193847654, + "ce_loss_39": 6.604493200778961, + "ce_loss_52": 1.4447590827941894, + "ce_loss_7": 7.626477897167206, + "epoch": 0.055, + "grad_norm": 49.45361296474082, + "kl_loss_13": 12328.0, + "kl_loss_26": 11540.8, + "kl_loss_39": 10584.0, + "kl_loss_7": 12747.2, + "learning_rate": 0.0009949107209404665, + "loss": 23664.0, + "step": 550 + }, + { + "ce_loss_13": 7.434036374092102, + "ce_loss_26": 7.059368348121643, + "ce_loss_39": 6.6053709268569945, + "ce_loss_52": 1.4645269870758058, + "ce_loss_7": 7.6432753801345825, + "epoch": 0.056, + "grad_norm": 47.673368470799254, + "kl_loss_13": 12291.2, + "kl_loss_26": 11512.0, + "kl_loss_39": 10547.2, + "kl_loss_7": 12726.4, + "learning_rate": 0.0009946824237646824, + "loss": 23469.6, + "step": 560 + }, + { + "ce_loss_13": 7.307004892826081, + "ce_loss_26": 6.927345609664917, + "ce_loss_39": 6.4629304051399235, + "ce_loss_52": 1.437305434048176, + "ce_loss_7": 7.522386133670807, + "epoch": 0.057, + "grad_norm": 46.80952508481597, + "kl_loss_13": 12094.4, + "kl_loss_26": 11300.8, + "kl_loss_39": 10308.8, + "kl_loss_7": 12539.2, + "learning_rate": 0.0009944491451423828, + "loss": 23249.6, + "step": 570 + }, + { + "ce_loss_13": 7.349401378631592, + "ce_loss_26": 6.964329659938812, + "ce_loss_39": 6.480949449539184, + "ce_loss_52": 1.4452718168497085, + "ce_loss_7": 7.564259791374207, + "epoch": 0.058, + "grad_norm": 46.22867294627436, + "kl_loss_13": 12145.6, + "kl_loss_26": 11340.8, + "kl_loss_39": 10315.2, + "kl_loss_7": 12592.0, + "learning_rate": 0.0009942108874226813, + "loss": 23091.2, + "step": 580 + }, + { + "ce_loss_13": 7.254886651039124, + "ce_loss_26": 6.858836472034454, + "ce_loss_39": 6.3856946468353275, + "ce_loss_52": 1.4449717432260514, + "ce_loss_7": 7.473185133934021, + "epoch": 0.059, + "grad_norm": 45.84422579202554, + "kl_loss_13": 11969.6, + "kl_loss_26": 11147.2, + "kl_loss_39": 10136.0, + "kl_loss_7": 12424.0, + "learning_rate": 0.00099396765300483, + "loss": 22886.4, + "step": 590 + }, + { + "ce_loss_13": 7.248957896232605, + "ce_loss_26": 6.855677163600921, + "ce_loss_39": 6.3774519801139835, + "ce_loss_52": 1.477000206708908, + "ce_loss_7": 7.465912497043609, + "epoch": 0.06, + "grad_norm": 46.37348014710593, + "kl_loss_13": 11888.0, + "kl_loss_26": 11064.0, + "kl_loss_39": 10044.8, + "kl_loss_7": 12347.2, + "learning_rate": 0.0009937194443381972, + "loss": 22708.0, + "step": 600 + }, + { + "ce_loss_13": 7.210493552684784, + "ce_loss_26": 6.8088652968406675, + "ce_loss_39": 6.325126445293426, + "ce_loss_52": 1.444644930958748, + "ce_loss_7": 7.429195690155029, + "epoch": 0.061, + "grad_norm": 44.92499922138711, + "kl_loss_13": 11859.2, + "kl_loss_26": 11019.2, + "kl_loss_39": 9995.2, + "kl_loss_7": 12320.0, + "learning_rate": 0.0009934662639222412, + "loss": 22544.8, + "step": 610 + }, + { + "ce_loss_13": 7.1185362339019775, + "ce_loss_26": 6.714106225967408, + "ce_loss_39": 6.223515486717224, + "ce_loss_52": 1.4858893424272537, + "ce_loss_7": 7.341545379161834, + "epoch": 0.062, + "grad_norm": 46.45143938897793, + "kl_loss_13": 11601.6, + "kl_loss_26": 10750.4, + "kl_loss_39": 9708.8, + "kl_loss_7": 12072.0, + "learning_rate": 0.000993208114306486, + "loss": 22270.0, + "step": 620 + }, + { + "ce_loss_13": 7.0913821935653685, + "ce_loss_26": 6.689675974845886, + "ce_loss_39": 6.203632855415345, + "ce_loss_52": 1.4506051570177079, + "ce_loss_7": 7.311544299125671, + "epoch": 0.063, + "grad_norm": 45.34630221197193, + "kl_loss_13": 11592.0, + "kl_loss_26": 10752.0, + "kl_loss_39": 9720.0, + "kl_loss_7": 12067.2, + "learning_rate": 0.0009929449980904952, + "loss": 22153.2, + "step": 630 + }, + { + "ce_loss_13": 7.083522534370422, + "ce_loss_26": 6.665987038612366, + "ce_loss_39": 6.162657225131989, + "ce_loss_52": 1.4658448547124863, + "ce_loss_7": 7.312107050418854, + "epoch": 0.064, + "grad_norm": 45.471744941742365, + "kl_loss_13": 11552.0, + "kl_loss_26": 10675.2, + "kl_loss_39": 9596.8, + "kl_loss_7": 12032.0, + "learning_rate": 0.0009926769179238466, + "loss": 21949.2, + "step": 640 + }, + { + "ce_loss_13": 6.994167017936706, + "ce_loss_26": 6.563658082485199, + "ce_loss_39": 6.042373907566071, + "ce_loss_52": 1.4207285180687905, + "ce_loss_7": 7.2311041235923765, + "epoch": 0.065, + "grad_norm": 43.84127734363621, + "kl_loss_13": 11489.6, + "kl_loss_26": 10593.6, + "kl_loss_39": 9488.0, + "kl_loss_7": 11980.8, + "learning_rate": 0.000992403876506104, + "loss": 21796.8, + "step": 650 + }, + { + "ce_loss_13": 6.9931820154190065, + "ce_loss_26": 6.566097593307495, + "ce_loss_39": 6.045178306102753, + "ce_loss_52": 1.4772068083286285, + "ce_loss_7": 7.2253869533538815, + "epoch": 0.066, + "grad_norm": 43.29636197313948, + "kl_loss_13": 11363.2, + "kl_loss_26": 10462.4, + "kl_loss_39": 9350.4, + "kl_loss_7": 11856.0, + "learning_rate": 0.0009921258765867918, + "loss": 21581.2, + "step": 660 + }, + { + "ce_loss_13": 6.907565414905548, + "ce_loss_26": 6.471543419361114, + "ce_loss_39": 5.933564639091491, + "ce_loss_52": 1.4364299774169922, + "ce_loss_7": 7.147727394104004, + "epoch": 0.067, + "grad_norm": 45.37835002704289, + "kl_loss_13": 11259.2, + "kl_loss_26": 10348.8, + "kl_loss_39": 9200.0, + "kl_loss_7": 11766.4, + "learning_rate": 0.0009918429209653662, + "loss": 21394.0, + "step": 670 + }, + { + "ce_loss_13": 6.9164858102798465, + "ce_loss_26": 6.482825660705567, + "ce_loss_39": 5.9588632702827455, + "ce_loss_52": 1.4493420034646989, + "ce_loss_7": 7.152165937423706, + "epoch": 0.068, + "grad_norm": 44.49853682897619, + "kl_loss_13": 11238.4, + "kl_loss_26": 10336.0, + "kl_loss_39": 9201.6, + "kl_loss_7": 11729.6, + "learning_rate": 0.0009915550124911866, + "loss": 21260.8, + "step": 680 + }, + { + "ce_loss_13": 6.871931791305542, + "ce_loss_26": 6.440780913829803, + "ce_loss_39": 5.910830950736999, + "ce_loss_52": 1.4238717705011368, + "ce_loss_7": 7.117027842998505, + "epoch": 0.069, + "grad_norm": 44.632228248662486, + "kl_loss_13": 11209.6, + "kl_loss_26": 10307.2, + "kl_loss_39": 9184.0, + "kl_loss_7": 11716.8, + "learning_rate": 0.0009912621540634887, + "loss": 21100.4, + "step": 690 + }, + { + "ce_loss_13": 6.761080467700959, + "ce_loss_26": 6.306544578075409, + "ce_loss_39": 5.754528117179871, + "ce_loss_52": 1.378117674589157, + "ce_loss_7": 7.018208122253418, + "epoch": 0.07, + "grad_norm": 45.73542626422545, + "kl_loss_13": 11040.0, + "kl_loss_26": 10088.0, + "kl_loss_39": 8913.6, + "kl_loss_7": 11569.6, + "learning_rate": 0.0009909643486313534, + "loss": 20851.6, + "step": 700 + }, + { + "ce_loss_13": 6.78661539554596, + "ce_loss_26": 6.327802836894989, + "ce_loss_39": 5.761592519283295, + "ce_loss_52": 1.411187869310379, + "ce_loss_7": 7.041204571723938, + "epoch": 0.071, + "grad_norm": 42.217908581540186, + "kl_loss_13": 11076.8, + "kl_loss_26": 10112.0, + "kl_loss_39": 8905.6, + "kl_loss_7": 11606.4, + "learning_rate": 0.000990661599193678, + "loss": 20727.2, + "step": 710 + }, + { + "ce_loss_13": 6.725618660449982, + "ce_loss_26": 6.256143915653229, + "ce_loss_39": 5.684507942199707, + "ce_loss_52": 1.4021562442183495, + "ce_loss_7": 6.98309029340744, + "epoch": 0.072, + "grad_norm": 41.87540936995742, + "kl_loss_13": 10950.4, + "kl_loss_26": 9966.4, + "kl_loss_39": 8756.0, + "kl_loss_7": 11489.6, + "learning_rate": 0.0009903539087991462, + "loss": 20494.0, + "step": 720 + }, + { + "ce_loss_13": 6.704308211803436, + "ce_loss_26": 6.2481373190879825, + "ce_loss_39": 5.691672837734222, + "ce_loss_52": 1.4353317350149155, + "ce_loss_7": 6.9648723125457765, + "epoch": 0.073, + "grad_norm": 41.50232307107669, + "kl_loss_13": 10825.6, + "kl_loss_26": 9868.8, + "kl_loss_39": 8681.6, + "kl_loss_7": 11364.8, + "learning_rate": 0.0009900412805461966, + "loss": 20435.6, + "step": 730 + }, + { + "ce_loss_13": 6.661628067493439, + "ce_loss_26": 6.196605837345123, + "ce_loss_39": 5.6325979948043825, + "ce_loss_52": 1.4341969668865204, + "ce_loss_7": 6.919101679325104, + "epoch": 0.074, + "grad_norm": 40.95865426481825, + "kl_loss_13": 10744.0, + "kl_loss_26": 9771.2, + "kl_loss_39": 8564.0, + "kl_loss_7": 11276.8, + "learning_rate": 0.0009897237175829927, + "loss": 20203.6, + "step": 740 + }, + { + "ce_loss_13": 6.600095963478088, + "ce_loss_26": 6.1362119793891905, + "ce_loss_39": 5.5736886858940125, + "ce_loss_52": 1.408122679591179, + "ce_loss_7": 6.857535183429718, + "epoch": 0.075, + "grad_norm": 39.711086876656914, + "kl_loss_13": 10672.0, + "kl_loss_26": 9696.0, + "kl_loss_39": 8488.0, + "kl_loss_7": 11211.2, + "learning_rate": 0.0009894012231073895, + "loss": 20039.6, + "step": 750 + }, + { + "ce_loss_13": 6.570180189609528, + "ce_loss_26": 6.098634576797485, + "ce_loss_39": 5.526832151412964, + "ce_loss_52": 1.4715266615152358, + "ce_loss_7": 6.833914375305175, + "epoch": 0.076, + "grad_norm": 43.2480305118225, + "kl_loss_13": 10513.6, + "kl_loss_26": 9529.6, + "kl_loss_39": 8309.6, + "kl_loss_7": 11064.0, + "learning_rate": 0.0009890738003669028, + "loss": 19880.0, + "step": 760 + }, + { + "ce_loss_13": 6.567346775531769, + "ce_loss_26": 6.0874533414840695, + "ce_loss_39": 5.5076407313346865, + "ce_loss_52": 1.4363629996776581, + "ce_loss_7": 6.83351217508316, + "epoch": 0.077, + "grad_norm": 40.54611020558553, + "kl_loss_13": 10534.4, + "kl_loss_26": 9550.4, + "kl_loss_39": 8325.6, + "kl_loss_7": 11091.2, + "learning_rate": 0.0009887414526586764, + "loss": 19717.2, + "step": 770 + }, + { + "ce_loss_13": 6.519154870510102, + "ce_loss_26": 6.030243515968323, + "ce_loss_39": 5.4369661688804625, + "ce_loss_52": 1.4315312415361405, + "ce_loss_7": 6.7882112741470335, + "epoch": 0.078, + "grad_norm": 40.32433812744511, + "kl_loss_13": 10425.6, + "kl_loss_26": 9401.6, + "kl_loss_39": 8148.0, + "kl_loss_7": 10985.6, + "learning_rate": 0.0009884041833294476, + "loss": 19528.4, + "step": 780 + }, + { + "ce_loss_13": 6.458490109443664, + "ce_loss_26": 5.970031499862671, + "ce_loss_39": 5.392513406276703, + "ce_loss_52": 1.4178608924150466, + "ce_loss_7": 6.729024958610535, + "epoch": 0.079, + "grad_norm": 41.89461078246612, + "kl_loss_13": 10355.2, + "kl_loss_26": 9337.6, + "kl_loss_39": 8099.2, + "kl_loss_7": 10920.0, + "learning_rate": 0.000988061995775515, + "loss": 19441.6, + "step": 790 + }, + { + "ce_loss_13": 6.432489657402039, + "ce_loss_26": 5.9446264743804935, + "ce_loss_39": 5.3700491189956665, + "ce_loss_52": 1.448669496178627, + "ce_loss_7": 6.70353993177414, + "epoch": 0.08, + "grad_norm": 41.284533949329585, + "kl_loss_13": 10246.4, + "kl_loss_26": 9230.4, + "kl_loss_39": 8004.0, + "kl_loss_7": 10811.2, + "learning_rate": 0.0009877148934427035, + "loss": 19206.8, + "step": 800 + }, + { + "ce_loss_13": 6.43521283864975, + "ce_loss_26": 5.939107716083527, + "ce_loss_39": 5.3414135575294495, + "ce_loss_52": 1.4238088309764863, + "ce_loss_7": 6.7135733485221865, + "epoch": 0.081, + "grad_norm": 39.49115349094681, + "kl_loss_13": 10304.0, + "kl_loss_26": 9264.0, + "kl_loss_39": 8010.4, + "kl_loss_7": 10881.6, + "learning_rate": 0.0009873628798263297, + "loss": 19058.0, + "step": 810 + }, + { + "ce_loss_13": 6.375123608112335, + "ce_loss_26": 5.867140221595764, + "ce_loss_39": 5.2596719622612, + "ce_loss_52": 1.4445373743772507, + "ce_loss_7": 6.642891383171081, + "epoch": 0.082, + "grad_norm": 39.14925963953466, + "kl_loss_13": 10124.8, + "kl_loss_26": 9064.0, + "kl_loss_39": 7779.2, + "kl_loss_7": 10689.6, + "learning_rate": 0.0009870059584711668, + "loss": 18891.2, + "step": 820 + }, + { + "ce_loss_13": 6.2935021877288815, + "ce_loss_26": 5.791378605365753, + "ce_loss_39": 5.201059639453888, + "ce_loss_52": 1.424940624833107, + "ce_loss_7": 6.564577507972717, + "epoch": 0.083, + "grad_norm": 39.604616846798116, + "kl_loss_13": 9998.4, + "kl_loss_26": 8953.6, + "kl_loss_39": 7687.2, + "kl_loss_7": 10563.2, + "learning_rate": 0.000986644132971409, + "loss": 18704.0, + "step": 830 + }, + { + "ce_loss_13": 6.274489688873291, + "ce_loss_26": 5.771354067325592, + "ce_loss_39": 5.155186474323273, + "ce_loss_52": 1.429549178481102, + "ce_loss_7": 6.554718315601349, + "epoch": 0.084, + "grad_norm": 38.382382142673, + "kl_loss_13": 9971.2, + "kl_loss_26": 8913.6, + "kl_loss_39": 7593.6, + "kl_loss_7": 10548.8, + "learning_rate": 0.0009862774069706345, + "loss": 18644.4, + "step": 840 + }, + { + "ce_loss_13": 6.2195475697517395, + "ce_loss_26": 5.717883968353272, + "ce_loss_39": 5.13158141374588, + "ce_loss_52": 1.4267238914966582, + "ce_loss_7": 6.497097527980804, + "epoch": 0.085, + "grad_norm": 39.67456709505246, + "kl_loss_13": 9820.8, + "kl_loss_26": 8772.8, + "kl_loss_39": 7518.4, + "kl_loss_7": 10401.6, + "learning_rate": 0.000985905784161771, + "loss": 18443.2, + "step": 850 + }, + { + "ce_loss_13": 6.253428983688354, + "ce_loss_26": 5.746143198013305, + "ce_loss_39": 5.127990126609802, + "ce_loss_52": 1.4239614009857178, + "ce_loss_7": 6.538207769393921, + "epoch": 0.086, + "grad_norm": 39.46700749652253, + "kl_loss_13": 9912.0, + "kl_loss_26": 8857.6, + "kl_loss_39": 7540.0, + "kl_loss_7": 10500.8, + "learning_rate": 0.000985529268287055, + "loss": 18336.4, + "step": 860 + }, + { + "ce_loss_13": 6.181728804111481, + "ce_loss_26": 5.661606287956237, + "ce_loss_39": 5.042867851257324, + "ce_loss_52": 1.427235585451126, + "ce_loss_7": 6.472940897941589, + "epoch": 0.087, + "grad_norm": 38.42645346767181, + "kl_loss_13": 9798.4, + "kl_loss_26": 8700.0, + "kl_loss_39": 7378.4, + "kl_loss_7": 10409.6, + "learning_rate": 0.0009851478631379982, + "loss": 18167.2, + "step": 870 + }, + { + "ce_loss_13": 6.113723492622375, + "ce_loss_26": 5.586072051525116, + "ce_loss_39": 4.9395282626152035, + "ce_loss_52": 1.3542900115251542, + "ce_loss_7": 6.410606110095978, + "epoch": 0.088, + "grad_norm": 37.03254074803745, + "kl_loss_13": 9768.0, + "kl_loss_26": 8668.8, + "kl_loss_39": 7310.4, + "kl_loss_7": 10384.0, + "learning_rate": 0.0009847615725553456, + "loss": 18092.4, + "step": 880 + }, + { + "ce_loss_13": 6.182212936878204, + "ce_loss_26": 5.651630616188049, + "ce_loss_39": 5.018951749801635, + "ce_loss_52": 1.4187958374619485, + "ce_loss_7": 6.474197280406952, + "epoch": 0.089, + "grad_norm": 36.66962934304384, + "kl_loss_13": 9784.0, + "kl_loss_26": 8682.4, + "kl_loss_39": 7329.6, + "kl_loss_7": 10400.0, + "learning_rate": 0.0009843704004290394, + "loss": 18005.6, + "step": 890 + }, + { + "ce_loss_13": 6.1027270436286924, + "ce_loss_26": 5.578388214111328, + "ce_loss_39": 4.954142212867737, + "ce_loss_52": 1.433653001487255, + "ce_loss_7": 6.3896349430084225, + "epoch": 0.09, + "grad_norm": 37.895271797404135, + "kl_loss_13": 9592.0, + "kl_loss_26": 8488.8, + "kl_loss_39": 7161.6, + "kl_loss_7": 10198.4, + "learning_rate": 0.0009839743506981783, + "loss": 17767.6, + "step": 900 + }, + { + "ce_loss_13": 6.134694254398346, + "ce_loss_26": 5.600531077384948, + "ce_loss_39": 4.956934368610382, + "ce_loss_52": 1.4616568014025688, + "ce_loss_7": 6.426614594459534, + "epoch": 0.091, + "grad_norm": 35.5095046076979, + "kl_loss_13": 9608.0, + "kl_loss_26": 8489.6, + "kl_loss_39": 7093.6, + "kl_loss_7": 10217.6, + "learning_rate": 0.0009835734273509786, + "loss": 17664.2, + "step": 910 + }, + { + "ce_loss_13": 6.059712076187134, + "ce_loss_26": 5.526252567768097, + "ce_loss_39": 4.893229007720947, + "ce_loss_52": 1.4395650416612624, + "ce_loss_7": 6.356565976142884, + "epoch": 0.092, + "grad_norm": 34.796923941159505, + "kl_loss_13": 9540.8, + "kl_loss_26": 8412.8, + "kl_loss_39": 7049.6, + "kl_loss_7": 10158.4, + "learning_rate": 0.0009831676344247342, + "loss": 17511.4, + "step": 920 + }, + { + "ce_loss_13": 6.012007105350494, + "ce_loss_26": 5.470919144153595, + "ce_loss_39": 4.816142636537552, + "ce_loss_52": 1.3806764528155326, + "ce_loss_7": 6.3036043524742125, + "epoch": 0.093, + "grad_norm": 34.520449300223, + "kl_loss_13": 9520.0, + "kl_loss_26": 8383.2, + "kl_loss_39": 6990.4, + "kl_loss_7": 10132.8, + "learning_rate": 0.0009827569760057755, + "loss": 17476.2, + "step": 930 + }, + { + "ce_loss_13": 6.020643877983093, + "ce_loss_26": 5.4863135576248165, + "ce_loss_39": 4.843132376670837, + "ce_loss_52": 1.4196506530046462, + "ce_loss_7": 6.318761503696441, + "epoch": 0.094, + "grad_norm": 33.89098596439575, + "kl_loss_13": 9425.6, + "kl_loss_26": 8303.2, + "kl_loss_39": 6938.4, + "kl_loss_7": 10048.0, + "learning_rate": 0.000982341456229428, + "loss": 17230.8, + "step": 940 + }, + { + "ce_loss_13": 5.974271166324615, + "ce_loss_26": 5.436639845371246, + "ce_loss_39": 4.804401755332947, + "ce_loss_52": 1.4617832124233245, + "ce_loss_7": 6.261663150787354, + "epoch": 0.095, + "grad_norm": 33.71164019490533, + "kl_loss_13": 9278.4, + "kl_loss_26": 8148.0, + "kl_loss_39": 6788.0, + "kl_loss_7": 9880.0, + "learning_rate": 0.000981921079279971, + "loss": 17111.2, + "step": 950 + }, + { + "ce_loss_13": 5.966051030158996, + "ce_loss_26": 5.422689366340637, + "ce_loss_39": 4.77484347820282, + "ce_loss_52": 1.4197688490152358, + "ce_loss_7": 6.259041047096252, + "epoch": 0.096, + "grad_norm": 33.446193055983784, + "kl_loss_13": 9321.6, + "kl_loss_26": 8191.2, + "kl_loss_39": 6796.8, + "kl_loss_7": 9932.8, + "learning_rate": 0.0009814958493905962, + "loss": 17055.2, + "step": 960 + }, + { + "ce_loss_13": 5.8990898609161375, + "ce_loss_26": 5.356628429889679, + "ce_loss_39": 4.708657902479172, + "ce_loss_52": 1.4259506687521935, + "ce_loss_7": 6.204090213775634, + "epoch": 0.097, + "grad_norm": 32.73849681683031, + "kl_loss_13": 9192.0, + "kl_loss_26": 8055.2, + "kl_loss_39": 6684.0, + "kl_loss_7": 9836.8, + "learning_rate": 0.0009810657708433637, + "loss": 16837.6, + "step": 970 + }, + { + "ce_loss_13": 5.879060399532318, + "ce_loss_26": 5.346331930160522, + "ce_loss_39": 4.690547597408295, + "ce_loss_52": 1.4302369862794877, + "ce_loss_7": 6.184376800060273, + "epoch": 0.098, + "grad_norm": 32.85977379524165, + "kl_loss_13": 9155.2, + "kl_loss_26": 8032.0, + "kl_loss_39": 6616.8, + "kl_loss_7": 9792.0, + "learning_rate": 0.0009806308479691594, + "loss": 16832.6, + "step": 980 + }, + { + "ce_loss_13": 5.816424036026001, + "ce_loss_26": 5.264147555828094, + "ce_loss_39": 4.6131664395332335, + "ce_loss_52": 1.4319524437189102, + "ce_loss_7": 6.118786966800689, + "epoch": 0.099, + "grad_norm": 33.426081767419625, + "kl_loss_13": 8992.0, + "kl_loss_26": 7842.4, + "kl_loss_39": 6442.4, + "kl_loss_7": 9635.2, + "learning_rate": 0.0009801910851476522, + "loss": 16728.4, + "step": 990 + }, + { + "ce_loss_13": 5.814940357208252, + "ce_loss_26": 5.2667844772338865, + "ce_loss_39": 4.6347626686096195, + "ce_loss_52": 1.4415421515703202, + "ce_loss_7": 6.12279201745987, + "epoch": 0.1, + "grad_norm": 33.016085914188245, + "kl_loss_13": 8960.0, + "kl_loss_26": 7812.8, + "kl_loss_39": 6444.0, + "kl_loss_7": 9588.8, + "learning_rate": 0.0009797464868072487, + "loss": 16535.6, + "step": 1000 + }, + { + "ce_loss_13": 5.8194945573806764, + "ce_loss_26": 5.259810090065002, + "ce_loss_39": 4.592129653692245, + "ce_loss_52": 1.4136200681328774, + "ce_loss_7": 6.127060306072235, + "epoch": 0.101, + "grad_norm": 31.039827427941336, + "kl_loss_13": 9033.6, + "kl_loss_26": 7855.2, + "kl_loss_39": 6432.0, + "kl_loss_7": 9683.2, + "learning_rate": 0.0009792970574250492, + "loss": 16416.6, + "step": 1010 + }, + { + "ce_loss_13": 5.752316701412201, + "ce_loss_26": 5.182761800289154, + "ce_loss_39": 4.4902693152427675, + "ce_loss_52": 1.37675661444664, + "ce_loss_7": 6.068772268295288, + "epoch": 0.102, + "grad_norm": 30.262078958434195, + "kl_loss_13": 8969.6, + "kl_loss_26": 7772.0, + "kl_loss_39": 6300.8, + "kl_loss_7": 9622.4, + "learning_rate": 0.0009788428015268028, + "loss": 16337.4, + "step": 1020 + }, + { + "ce_loss_13": 5.781488347053528, + "ce_loss_26": 5.2409987449646, + "ce_loss_39": 4.596257948875428, + "ce_loss_52": 1.4595280766487122, + "ce_loss_7": 6.078718197345734, + "epoch": 0.103, + "grad_norm": 31.79275929639243, + "kl_loss_13": 8864.0, + "kl_loss_26": 7715.2, + "kl_loss_39": 6324.0, + "kl_loss_7": 9491.2, + "learning_rate": 0.0009783837236868609, + "loss": 16174.0, + "step": 1030 + }, + { + "ce_loss_13": 5.71779420375824, + "ce_loss_26": 5.146243929862976, + "ce_loss_39": 4.4791832447052, + "ce_loss_52": 1.4291063606739045, + "ce_loss_7": 6.026824104785919, + "epoch": 0.104, + "grad_norm": 30.963349252235982, + "kl_loss_13": 8771.2, + "kl_loss_26": 7573.6, + "kl_loss_39": 6160.8, + "kl_loss_7": 9416.0, + "learning_rate": 0.0009779198285281327, + "loss": 16072.0, + "step": 1040 + }, + { + "ce_loss_13": 5.742816948890686, + "ce_loss_26": 5.189597749710083, + "ce_loss_39": 4.527956926822663, + "ce_loss_52": 1.4477199196815491, + "ce_loss_7": 6.058120143413544, + "epoch": 0.105, + "grad_norm": 32.72479335973728, + "kl_loss_13": 8772.8, + "kl_loss_26": 7620.0, + "kl_loss_39": 6205.6, + "kl_loss_7": 9438.4, + "learning_rate": 0.0009774511207220368, + "loss": 15932.8, + "step": 1050 + }, + { + "ce_loss_13": 5.728980660438538, + "ce_loss_26": 5.167429828643799, + "ce_loss_39": 4.511177510023117, + "ce_loss_52": 1.4759073287248612, + "ce_loss_7": 6.031959307193756, + "epoch": 0.106, + "grad_norm": 31.226954775299472, + "kl_loss_13": 8736.8, + "kl_loss_26": 7550.4, + "kl_loss_39": 6153.6, + "kl_loss_7": 9363.2, + "learning_rate": 0.0009769776049884564, + "loss": 15802.0, + "step": 1060 + }, + { + "ce_loss_13": 5.7382616877555845, + "ce_loss_26": 5.175736773014068, + "ce_loss_39": 4.510921847820282, + "ce_loss_52": 1.4512410640716553, + "ce_loss_7": 6.050414621829987, + "epoch": 0.107, + "grad_norm": 30.630877517383688, + "kl_loss_13": 8788.0, + "kl_loss_26": 7622.4, + "kl_loss_39": 6202.4, + "kl_loss_7": 9452.8, + "learning_rate": 0.0009764992860956889, + "loss": 15822.0, + "step": 1070 + }, + { + "ce_loss_13": 5.645794451236725, + "ce_loss_26": 5.088676834106446, + "ce_loss_39": 4.417802548408508, + "ce_loss_52": 1.4182049363851548, + "ce_loss_7": 5.959036731719971, + "epoch": 0.108, + "grad_norm": 30.457487960812127, + "kl_loss_13": 8681.6, + "kl_loss_26": 7509.6, + "kl_loss_39": 6084.8, + "kl_loss_7": 9332.8, + "learning_rate": 0.0009760161688604008, + "loss": 15627.0, + "step": 1080 + }, + { + "ce_loss_13": 5.568986439704895, + "ce_loss_26": 5.009959697723389, + "ce_loss_39": 4.367926681041718, + "ce_loss_52": 1.4609902381896973, + "ce_loss_7": 5.883221137523651, + "epoch": 0.109, + "grad_norm": 29.989545158997807, + "kl_loss_13": 8458.4, + "kl_loss_26": 7279.2, + "kl_loss_39": 5895.2, + "kl_loss_7": 9112.0, + "learning_rate": 0.0009755282581475768, + "loss": 15555.0, + "step": 1090 + }, + { + "ce_loss_13": 5.6291534304618835, + "ce_loss_26": 5.068492126464844, + "ce_loss_39": 4.403285652399063, + "ce_loss_52": 1.445840133726597, + "ce_loss_7": 5.945274484157562, + "epoch": 0.11, + "grad_norm": 30.66383917654591, + "kl_loss_13": 8576.8, + "kl_loss_26": 7396.8, + "kl_loss_39": 5972.8, + "kl_loss_7": 9244.8, + "learning_rate": 0.0009750355588704727, + "loss": 15472.0, + "step": 1100 + }, + { + "ce_loss_13": 5.522909152507782, + "ce_loss_26": 4.949995934963226, + "ce_loss_39": 4.270336884260177, + "ce_loss_52": 1.407256692647934, + "ce_loss_7": 5.840477633476257, + "epoch": 0.111, + "grad_norm": 29.782295400473814, + "kl_loss_13": 8460.8, + "kl_loss_26": 7250.4, + "kl_loss_39": 5788.8, + "kl_loss_7": 9118.4, + "learning_rate": 0.0009745380759905647, + "loss": 15294.0, + "step": 1110 + }, + { + "ce_loss_13": 5.517545366287232, + "ce_loss_26": 4.932116758823395, + "ce_loss_39": 4.250882798433304, + "ce_loss_52": 1.3833064809441566, + "ce_loss_7": 5.846422612667084, + "epoch": 0.112, + "grad_norm": 28.73414895119145, + "kl_loss_13": 8486.4, + "kl_loss_26": 7264.0, + "kl_loss_39": 5804.0, + "kl_loss_7": 9168.0, + "learning_rate": 0.0009740358145174998, + "loss": 15318.0, + "step": 1120 + }, + { + "ce_loss_13": 5.50923570394516, + "ce_loss_26": 4.938242793083191, + "ce_loss_39": 4.259438300132752, + "ce_loss_52": 1.430792199075222, + "ce_loss_7": 5.8306269407272335, + "epoch": 0.113, + "grad_norm": 28.895019773736312, + "kl_loss_13": 8355.2, + "kl_loss_26": 7152.8, + "kl_loss_39": 5700.8, + "kl_loss_7": 9024.0, + "learning_rate": 0.0009735287795090455, + "loss": 15192.0, + "step": 1130 + }, + { + "ce_loss_13": 5.401988506317139, + "ce_loss_26": 4.814380037784576, + "ce_loss_39": 4.133068162202835, + "ce_loss_52": 1.3917075648903847, + "ce_loss_7": 5.732739126682281, + "epoch": 0.114, + "grad_norm": 28.473930821013894, + "kl_loss_13": 8231.2, + "kl_loss_26": 7010.4, + "kl_loss_39": 5549.6, + "kl_loss_7": 8921.6, + "learning_rate": 0.0009730169760710386, + "loss": 15030.2, + "step": 1140 + }, + { + "ce_loss_13": 5.538087117671966, + "ce_loss_26": 4.94068056344986, + "ce_loss_39": 4.253137022256851, + "ce_loss_52": 1.4375049352645874, + "ce_loss_7": 5.861488628387451, + "epoch": 0.115, + "grad_norm": 30.345832823062537, + "kl_loss_13": 8408.0, + "kl_loss_26": 7160.0, + "kl_loss_39": 5690.4, + "kl_loss_7": 9096.0, + "learning_rate": 0.0009725004093573342, + "loss": 14951.8, + "step": 1150 + }, + { + "ce_loss_13": 5.385800528526306, + "ce_loss_26": 4.801717817783356, + "ce_loss_39": 4.136020374298096, + "ce_loss_52": 1.4078958943486213, + "ce_loss_7": 5.722076547145844, + "epoch": 0.116, + "grad_norm": 30.297186235009132, + "kl_loss_13": 8177.6, + "kl_loss_26": 6956.0, + "kl_loss_39": 5514.4, + "kl_loss_7": 8880.0, + "learning_rate": 0.0009719790845697534, + "loss": 14867.6, + "step": 1160 + }, + { + "ce_loss_13": 5.426836037635804, + "ce_loss_26": 4.832395279407502, + "ce_loss_39": 4.153381270170212, + "ce_loss_52": 1.426569977402687, + "ce_loss_7": 5.755613851547241, + "epoch": 0.117, + "grad_norm": 31.87589996223516, + "kl_loss_13": 8220.8, + "kl_loss_26": 6976.0, + "kl_loss_39": 5544.0, + "kl_loss_7": 8909.6, + "learning_rate": 0.0009714530069580309, + "loss": 14745.8, + "step": 1170 + }, + { + "ce_loss_13": 5.359652185440064, + "ce_loss_26": 4.760751461982727, + "ce_loss_39": 4.048358517885208, + "ce_loss_52": 1.3907365471124649, + "ce_loss_7": 5.693908452987671, + "epoch": 0.118, + "grad_norm": 27.539122306915463, + "kl_loss_13": 8126.4, + "kl_loss_26": 6876.0, + "kl_loss_39": 5376.0, + "kl_loss_7": 8822.4, + "learning_rate": 0.0009709221818197624, + "loss": 14704.2, + "step": 1180 + }, + { + "ce_loss_13": 5.350854313373565, + "ce_loss_26": 4.768227469921112, + "ce_loss_39": 4.099184954166413, + "ce_loss_52": 1.4248775228857995, + "ce_loss_7": 5.680926930904389, + "epoch": 0.119, + "grad_norm": 28.933475617899816, + "kl_loss_13": 8027.2, + "kl_loss_26": 6818.4, + "kl_loss_39": 5372.0, + "kl_loss_7": 8727.2, + "learning_rate": 0.0009703866145003512, + "loss": 14583.0, + "step": 1190 + }, + { + "ce_loss_13": 5.372988939285278, + "ce_loss_26": 4.778137028217316, + "ce_loss_39": 4.089074891805649, + "ce_loss_52": 1.4195260405540466, + "ce_loss_7": 5.714505088329315, + "epoch": 0.12, + "grad_norm": 26.60053169419214, + "kl_loss_13": 8131.2, + "kl_loss_26": 6885.6, + "kl_loss_39": 5404.8, + "kl_loss_7": 8840.8, + "learning_rate": 0.0009698463103929542, + "loss": 14513.0, + "step": 1200 + }, + { + "ce_loss_13": 5.392605185508728, + "ce_loss_26": 4.798673605918884, + "ce_loss_39": 4.126504504680634, + "ce_loss_52": 1.4746148020029068, + "ce_loss_7": 5.7245006442070006, + "epoch": 0.121, + "grad_norm": 26.989592045291893, + "kl_loss_13": 8018.4, + "kl_loss_26": 6775.2, + "kl_loss_39": 5339.2, + "kl_loss_7": 8718.4, + "learning_rate": 0.0009693012749384279, + "loss": 14383.2, + "step": 1210 + }, + { + "ce_loss_13": 5.319035434722901, + "ce_loss_26": 4.736347317695618, + "ce_loss_39": 4.0659150838851925, + "ce_loss_52": 1.4397204488515853, + "ce_loss_7": 5.642805421352387, + "epoch": 0.122, + "grad_norm": 29.38130003686817, + "kl_loss_13": 7946.4, + "kl_loss_26": 6723.2, + "kl_loss_39": 5294.4, + "kl_loss_7": 8631.2, + "learning_rate": 0.0009687515136252732, + "loss": 14375.6, + "step": 1220 + }, + { + "ce_loss_13": 5.3430128455162045, + "ce_loss_26": 4.744695138931275, + "ce_loss_39": 4.071437209844589, + "ce_loss_52": 1.4354561120271683, + "ce_loss_7": 5.681857228279114, + "epoch": 0.123, + "grad_norm": 25.479885446063793, + "kl_loss_13": 8008.0, + "kl_loss_26": 6754.4, + "kl_loss_39": 5308.8, + "kl_loss_7": 8709.6, + "learning_rate": 0.0009681970319895803, + "loss": 14273.4, + "step": 1230 + }, + { + "ce_loss_13": 5.331636953353882, + "ce_loss_26": 4.735000967979431, + "ce_loss_39": 4.073598688840866, + "ce_loss_52": 1.470110397040844, + "ce_loss_7": 5.655930757522583, + "epoch": 0.124, + "grad_norm": 28.379117971457468, + "kl_loss_13": 7929.6, + "kl_loss_26": 6684.0, + "kl_loss_39": 5260.8, + "kl_loss_7": 8612.8, + "learning_rate": 0.0009676378356149733, + "loss": 14150.8, + "step": 1240 + }, + { + "ce_loss_13": 5.1874682068824765, + "ce_loss_26": 4.582801806926727, + "ce_loss_39": 3.900476610660553, + "ce_loss_52": 1.4180996417999268, + "ce_loss_7": 5.519565558433532, + "epoch": 0.125, + "grad_norm": 27.465496459767188, + "kl_loss_13": 7764.8, + "kl_loss_26": 6504.0, + "kl_loss_39": 5041.6, + "kl_loss_7": 8455.2, + "learning_rate": 0.0009670739301325534, + "loss": 13985.0, + "step": 1250 + }, + { + "ce_loss_13": 5.221911752223969, + "ce_loss_26": 4.625354039669037, + "ce_loss_39": 3.9350062906742096, + "ce_loss_52": 1.3881619155406952, + "ce_loss_7": 5.562334418296814, + "epoch": 0.126, + "grad_norm": 26.021158683557974, + "kl_loss_13": 7847.2, + "kl_loss_26": 6599.2, + "kl_loss_39": 5120.0, + "kl_loss_7": 8558.4, + "learning_rate": 0.0009665053212208426, + "loss": 13978.8, + "step": 1260 + }, + { + "ce_loss_13": 5.201095879077911, + "ce_loss_26": 4.594125282764435, + "ce_loss_39": 3.8948924005031587, + "ce_loss_52": 1.4181114554405212, + "ce_loss_7": 5.552536249160767, + "epoch": 0.127, + "grad_norm": 26.300188898530276, + "kl_loss_13": 7783.2, + "kl_loss_26": 6515.2, + "kl_loss_39": 5012.0, + "kl_loss_7": 8515.2, + "learning_rate": 0.0009659320146057262, + "loss": 13927.6, + "step": 1270 + }, + { + "ce_loss_13": 5.186312806606293, + "ce_loss_26": 4.5913723587989805, + "ce_loss_39": 3.912171256542206, + "ce_loss_52": 1.4045341789722443, + "ce_loss_7": 5.535152721405029, + "epoch": 0.128, + "grad_norm": 25.74310395170922, + "kl_loss_13": 7757.6, + "kl_loss_26": 6509.6, + "kl_loss_39": 5065.6, + "kl_loss_7": 8486.4, + "learning_rate": 0.0009653540160603955, + "loss": 13929.0, + "step": 1280 + }, + { + "ce_loss_13": 5.17168892621994, + "ce_loss_26": 4.572988575696945, + "ce_loss_39": 3.902406334877014, + "ce_loss_52": 1.4593060314655304, + "ce_loss_7": 5.513335394859314, + "epoch": 0.129, + "grad_norm": 26.464934956114348, + "kl_loss_13": 7624.0, + "kl_loss_26": 6372.8, + "kl_loss_39": 4939.2, + "kl_loss_7": 8331.2, + "learning_rate": 0.0009647713314052896, + "loss": 13720.2, + "step": 1290 + }, + { + "ce_loss_13": 5.167734289169312, + "ce_loss_26": 4.5827870786190035, + "ce_loss_39": 3.9150634109973907, + "ce_loss_52": 1.4295778691768646, + "ce_loss_7": 5.517308306694031, + "epoch": 0.13, + "grad_norm": 26.41573865043221, + "kl_loss_13": 7627.2, + "kl_loss_26": 6407.2, + "kl_loss_39": 4972.0, + "kl_loss_7": 8360.0, + "learning_rate": 0.0009641839665080363, + "loss": 13644.6, + "step": 1300 + }, + { + "ce_loss_13": 5.155666828155518, + "ce_loss_26": 4.557369256019593, + "ce_loss_39": 3.8985717594623566, + "ce_loss_52": 1.4532029300928115, + "ce_loss_7": 5.49907066822052, + "epoch": 0.131, + "grad_norm": 28.09061259972242, + "kl_loss_13": 7600.0, + "kl_loss_26": 6340.8, + "kl_loss_39": 4909.6, + "kl_loss_7": 8327.2, + "learning_rate": 0.0009635919272833937, + "loss": 13575.0, + "step": 1310 + }, + { + "ce_loss_13": 5.079627573490143, + "ce_loss_26": 4.46733387708664, + "ce_loss_39": 3.7981239676475527, + "ce_loss_52": 1.4149045318365097, + "ce_loss_7": 5.424402499198914, + "epoch": 0.132, + "grad_norm": 29.96516682014439, + "kl_loss_13": 7483.2, + "kl_loss_26": 6202.4, + "kl_loss_39": 4773.6, + "kl_loss_7": 8210.4, + "learning_rate": 0.0009629952196931902, + "loss": 13547.6, + "step": 1320 + }, + { + "ce_loss_13": 5.090298974514008, + "ce_loss_26": 4.495900344848633, + "ce_loss_39": 3.821765500307083, + "ce_loss_52": 1.4328191310167313, + "ce_loss_7": 5.431738471984863, + "epoch": 0.133, + "grad_norm": 26.14827995707597, + "kl_loss_13": 7504.0, + "kl_loss_26": 6259.2, + "kl_loss_39": 4809.6, + "kl_loss_7": 8225.6, + "learning_rate": 0.0009623938497462645, + "loss": 13496.2, + "step": 1330 + }, + { + "ce_loss_13": 5.099011301994324, + "ce_loss_26": 4.4848466455936435, + "ce_loss_39": 3.794223016500473, + "ce_loss_52": 1.416146419942379, + "ce_loss_7": 5.462341606616974, + "epoch": 0.134, + "grad_norm": 24.84289392950202, + "kl_loss_13": 7542.4, + "kl_loss_26": 6259.2, + "kl_loss_39": 4785.2, + "kl_loss_7": 8304.8, + "learning_rate": 0.0009617878234984055, + "loss": 13395.2, + "step": 1340 + }, + { + "ce_loss_13": 5.097129952907562, + "ce_loss_26": 4.498237466812133, + "ce_loss_39": 3.8250812292099, + "ce_loss_52": 1.4416721731424331, + "ce_loss_7": 5.444403338432312, + "epoch": 0.135, + "grad_norm": 26.607564330476333, + "kl_loss_13": 7480.8, + "kl_loss_26": 6224.8, + "kl_loss_39": 4782.4, + "kl_loss_7": 8202.4, + "learning_rate": 0.0009611771470522907, + "loss": 13240.2, + "step": 1350 + }, + { + "ce_loss_13": 5.043468415737152, + "ce_loss_26": 4.443963885307312, + "ce_loss_39": 3.7804897725582123, + "ce_loss_52": 1.4098651513457299, + "ce_loss_7": 5.407905113697052, + "epoch": 0.136, + "grad_norm": 27.568927473511277, + "kl_loss_13": 7464.8, + "kl_loss_26": 6209.6, + "kl_loss_39": 4773.6, + "kl_loss_7": 8221.6, + "learning_rate": 0.0009605618265574251, + "loss": 13312.2, + "step": 1360 + }, + { + "ce_loss_13": 5.1134570121765135, + "ce_loss_26": 4.510142356157303, + "ce_loss_39": 3.844588041305542, + "ce_loss_52": 1.4824964210391045, + "ce_loss_7": 5.455760145187378, + "epoch": 0.137, + "grad_norm": 26.586450808382164, + "kl_loss_13": 7431.2, + "kl_loss_26": 6176.0, + "kl_loss_39": 4748.0, + "kl_loss_7": 8151.2, + "learning_rate": 0.0009599418682100792, + "loss": 13171.6, + "step": 1370 + }, + { + "ce_loss_13": 4.993066036701203, + "ce_loss_26": 4.390464246273041, + "ce_loss_39": 3.7108724772930146, + "ce_loss_52": 1.3996128499507905, + "ce_loss_7": 5.354221343994141, + "epoch": 0.138, + "grad_norm": 24.51436324855179, + "kl_loss_13": 7388.0, + "kl_loss_26": 6120.0, + "kl_loss_39": 4660.8, + "kl_loss_7": 8145.6, + "learning_rate": 0.0009593172782532268, + "loss": 13135.2, + "step": 1380 + }, + { + "ce_loss_13": 4.9749324202537535, + "ce_loss_26": 4.36228443980217, + "ce_loss_39": 3.71304127573967, + "ce_loss_52": 1.4259393498301507, + "ce_loss_7": 5.328954219818115, + "epoch": 0.139, + "grad_norm": 25.448579155888293, + "kl_loss_13": 7284.8, + "kl_loss_26": 6005.6, + "kl_loss_39": 4599.2, + "kl_loss_7": 8029.6, + "learning_rate": 0.0009586880629764817, + "loss": 13023.4, + "step": 1390 + }, + { + "ce_loss_13": 5.021213936805725, + "ce_loss_26": 4.392004972696304, + "ce_loss_39": 3.695616716146469, + "ce_loss_52": 1.3939141556620598, + "ce_loss_7": 5.386792302131653, + "epoch": 0.14, + "grad_norm": 27.169552009752685, + "kl_loss_13": 7436.0, + "kl_loss_26": 6132.8, + "kl_loss_39": 4655.2, + "kl_loss_7": 8205.6, + "learning_rate": 0.0009580542287160348, + "loss": 13043.6, + "step": 1400 + }, + { + "ce_loss_13": 5.006197059154511, + "ce_loss_26": 4.410893344879151, + "ce_loss_39": 3.74367755651474, + "ce_loss_52": 1.4519873589277268, + "ce_loss_7": 5.36526129245758, + "epoch": 0.141, + "grad_norm": 24.865151038825246, + "kl_loss_13": 7283.2, + "kl_loss_26": 6027.2, + "kl_loss_39": 4615.2, + "kl_loss_7": 8029.6, + "learning_rate": 0.0009574157818545901, + "loss": 12913.8, + "step": 1410 + }, + { + "ce_loss_13": 4.958354568481445, + "ce_loss_26": 4.367397904396057, + "ce_loss_39": 3.7062928318977355, + "ce_loss_52": 1.4099174112081527, + "ce_loss_7": 5.317552924156189, + "epoch": 0.142, + "grad_norm": 24.898155460709848, + "kl_loss_13": 7277.6, + "kl_loss_26": 6040.8, + "kl_loss_39": 4623.6, + "kl_loss_7": 8032.8, + "learning_rate": 0.0009567727288213005, + "loss": 12929.6, + "step": 1420 + }, + { + "ce_loss_13": 4.984454607963562, + "ce_loss_26": 4.392505377531052, + "ce_loss_39": 3.7512011885643006, + "ce_loss_52": 1.473440769314766, + "ce_loss_7": 5.340136766433716, + "epoch": 0.143, + "grad_norm": 24.34585690638739, + "kl_loss_13": 7221.6, + "kl_loss_26": 5972.8, + "kl_loss_39": 4602.0, + "kl_loss_7": 7973.6, + "learning_rate": 0.0009561250760917027, + "loss": 12830.2, + "step": 1430 + }, + { + "ce_loss_13": 4.917237496376037, + "ce_loss_26": 4.313831263780594, + "ce_loss_39": 3.6587266325950623, + "ce_loss_52": 1.4092496067285538, + "ce_loss_7": 5.2786689639091495, + "epoch": 0.144, + "grad_norm": 25.288024521189875, + "kl_loss_13": 7198.4, + "kl_loss_26": 5931.2, + "kl_loss_39": 4527.2, + "kl_loss_7": 7960.8, + "learning_rate": 0.0009554728301876525, + "loss": 12688.6, + "step": 1440 + }, + { + "ce_loss_13": 4.95776047706604, + "ce_loss_26": 4.340852671861649, + "ce_loss_39": 3.657728981971741, + "ce_loss_52": 1.4168317198753357, + "ce_loss_7": 5.322667574882507, + "epoch": 0.145, + "grad_norm": 26.641005752286592, + "kl_loss_13": 7228.8, + "kl_loss_26": 5940.0, + "kl_loss_39": 4485.2, + "kl_loss_7": 8003.2, + "learning_rate": 0.0009548159976772592, + "loss": 12683.8, + "step": 1450 + }, + { + "ce_loss_13": 4.831417870521546, + "ce_loss_26": 4.231567287445069, + "ce_loss_39": 3.581255227327347, + "ce_loss_52": 1.4485478460788728, + "ce_loss_7": 5.20115841627121, + "epoch": 0.146, + "grad_norm": 24.920691081516484, + "kl_loss_13": 6952.0, + "kl_loss_26": 5699.2, + "kl_loss_39": 4308.4, + "kl_loss_7": 7733.6, + "learning_rate": 0.0009541545851748186, + "loss": 12574.8, + "step": 1460 + }, + { + "ce_loss_13": 4.8803037166595455, + "ce_loss_26": 4.2741272211074826, + "ce_loss_39": 3.598990321159363, + "ce_loss_52": 1.4145199984312058, + "ce_loss_7": 5.244284570217133, + "epoch": 0.147, + "grad_norm": 25.90739775261194, + "kl_loss_13": 7076.8, + "kl_loss_26": 5806.4, + "kl_loss_39": 4372.0, + "kl_loss_7": 7841.6, + "learning_rate": 0.0009534885993407473, + "loss": 12558.0, + "step": 1470 + }, + { + "ce_loss_13": 4.854452967643738, + "ce_loss_26": 4.251825517416, + "ce_loss_39": 3.5948518395423887, + "ce_loss_52": 1.428754985332489, + "ce_loss_7": 5.219927191734314, + "epoch": 0.148, + "grad_norm": 24.48718194678669, + "kl_loss_13": 7008.8, + "kl_loss_26": 5743.2, + "kl_loss_39": 4354.4, + "kl_loss_7": 7777.6, + "learning_rate": 0.0009528180468815154, + "loss": 12484.4, + "step": 1480 + }, + { + "ce_loss_13": 4.884927380084991, + "ce_loss_26": 4.2919243454933165, + "ce_loss_39": 3.642289215326309, + "ce_loss_52": 1.465419703722, + "ce_loss_7": 5.241849565505982, + "epoch": 0.149, + "grad_norm": 24.903440253335923, + "kl_loss_13": 7001.6, + "kl_loss_26": 5763.2, + "kl_loss_39": 4366.0, + "kl_loss_7": 7752.0, + "learning_rate": 0.0009521429345495787, + "loss": 12486.6, + "step": 1490 + }, + { + "ce_loss_13": 4.82739794254303, + "ce_loss_26": 4.228940737247467, + "ce_loss_39": 3.5767277657985685, + "ce_loss_52": 1.4382753789424896, + "ce_loss_7": 5.209846138954163, + "epoch": 0.15, + "grad_norm": 25.291080092187237, + "kl_loss_13": 6960.8, + "kl_loss_26": 5698.4, + "kl_loss_39": 4297.6, + "kl_loss_7": 7764.0, + "learning_rate": 0.0009514632691433108, + "loss": 12420.2, + "step": 1500 + }, + { + "ce_loss_13": 4.828064477443695, + "ce_loss_26": 4.213042998313904, + "ce_loss_39": 3.5323724269866945, + "ce_loss_52": 1.3961022228002549, + "ce_loss_7": 5.196406292915344, + "epoch": 0.151, + "grad_norm": 25.466425081780237, + "kl_loss_13": 7046.4, + "kl_loss_26": 5772.8, + "kl_loss_39": 4315.2, + "kl_loss_7": 7825.6, + "learning_rate": 0.0009507790575069346, + "loss": 12387.6, + "step": 1510 + }, + { + "ce_loss_13": 4.786497128009796, + "ce_loss_26": 4.188397663831711, + "ce_loss_39": 3.536989223957062, + "ce_loss_52": 1.4404057756066322, + "ce_loss_7": 5.156642246246338, + "epoch": 0.152, + "grad_norm": 22.488994996506335, + "kl_loss_13": 6872.8, + "kl_loss_26": 5620.8, + "kl_loss_39": 4225.6, + "kl_loss_7": 7650.4, + "learning_rate": 0.0009500903065304539, + "loss": 12265.4, + "step": 1520 + }, + { + "ce_loss_13": 4.79404227733612, + "ce_loss_26": 4.1956378519535065, + "ce_loss_39": 3.539849889278412, + "ce_loss_52": 1.447507870197296, + "ce_loss_7": 5.170107614994049, + "epoch": 0.153, + "grad_norm": 24.979481722705945, + "kl_loss_13": 6864.0, + "kl_loss_26": 5609.6, + "kl_loss_39": 4213.6, + "kl_loss_7": 7656.8, + "learning_rate": 0.0009493970231495835, + "loss": 12182.2, + "step": 1530 + }, + { + "ce_loss_13": 4.754118239879608, + "ce_loss_26": 4.16424406170845, + "ce_loss_39": 3.5151414275169373, + "ce_loss_52": 1.423200336098671, + "ce_loss_7": 5.132595348358154, + "epoch": 0.154, + "grad_norm": 24.139218625352445, + "kl_loss_13": 6807.2, + "kl_loss_26": 5573.6, + "kl_loss_39": 4190.4, + "kl_loss_7": 7594.4, + "learning_rate": 0.0009486992143456792, + "loss": 12152.0, + "step": 1540 + }, + { + "ce_loss_13": 4.745328724384308, + "ce_loss_26": 4.135520172119141, + "ce_loss_39": 3.4818262457847595, + "ce_loss_52": 1.4286953419446946, + "ce_loss_7": 5.114579677581787, + "epoch": 0.155, + "grad_norm": 24.426109316342576, + "kl_loss_13": 6791.2, + "kl_loss_26": 5516.8, + "kl_loss_39": 4120.0, + "kl_loss_7": 7567.2, + "learning_rate": 0.0009479968871456679, + "loss": 12128.4, + "step": 1550 + }, + { + "ce_loss_13": 4.7574557065963745, + "ce_loss_26": 4.145674997568131, + "ce_loss_39": 3.476013499498367, + "ce_loss_52": 1.4235228240489959, + "ce_loss_7": 5.133380055427551, + "epoch": 0.156, + "grad_norm": 25.100926583342837, + "kl_loss_13": 6843.2, + "kl_loss_26": 5556.0, + "kl_loss_39": 4126.0, + "kl_loss_7": 7627.2, + "learning_rate": 0.0009472900486219768, + "loss": 12082.2, + "step": 1560 + }, + { + "ce_loss_13": 4.735032224655152, + "ce_loss_26": 4.128789341449737, + "ce_loss_39": 3.4694815576076508, + "ce_loss_52": 1.4237273722887038, + "ce_loss_7": 5.11258887052536, + "epoch": 0.157, + "grad_norm": 25.10370372986473, + "kl_loss_13": 6792.0, + "kl_loss_26": 5520.0, + "kl_loss_39": 4095.6, + "kl_loss_7": 7585.6, + "learning_rate": 0.000946578705892462, + "loss": 11936.2, + "step": 1570 + }, + { + "ce_loss_13": 4.741922962665558, + "ce_loss_26": 4.132791459560394, + "ce_loss_39": 3.482679557800293, + "ce_loss_52": 1.4294559836387635, + "ce_loss_7": 5.117163801193238, + "epoch": 0.158, + "grad_norm": 21.844394510796377, + "kl_loss_13": 6799.2, + "kl_loss_26": 5517.6, + "kl_loss_39": 4118.0, + "kl_loss_7": 7581.6, + "learning_rate": 0.0009458628661203367, + "loss": 11944.8, + "step": 1580 + }, + { + "ce_loss_13": 4.741668605804444, + "ce_loss_26": 4.1376284003257755, + "ce_loss_39": 3.478295695781708, + "ce_loss_52": 1.415444830060005, + "ce_loss_7": 5.117357003688812, + "epoch": 0.159, + "grad_norm": 25.4671883290825, + "kl_loss_13": 6812.0, + "kl_loss_26": 5548.0, + "kl_loss_39": 4136.8, + "kl_loss_7": 7601.6, + "learning_rate": 0.0009451425365140996, + "loss": 11952.4, + "step": 1590 + }, + { + "ce_loss_13": 4.723819291591644, + "ce_loss_26": 4.128834217786789, + "ce_loss_39": 3.47242848277092, + "ce_loss_52": 1.429117676615715, + "ce_loss_7": 5.096058523654937, + "epoch": 0.16, + "grad_norm": 25.14078013617688, + "kl_loss_13": 6768.0, + "kl_loss_26": 5519.2, + "kl_loss_39": 4101.6, + "kl_loss_7": 7547.2, + "learning_rate": 0.0009444177243274617, + "loss": 11862.0, + "step": 1600 + }, + { + "ce_loss_13": 4.648782467842102, + "ce_loss_26": 4.0394273698329926, + "ce_loss_39": 3.377221292257309, + "ce_loss_52": 1.4151206001639367, + "ce_loss_7": 5.0250336050987245, + "epoch": 0.161, + "grad_norm": 24.128253336718885, + "kl_loss_13": 6640.0, + "kl_loss_26": 5364.0, + "kl_loss_39": 3953.2, + "kl_loss_7": 7436.8, + "learning_rate": 0.0009436884368592739, + "loss": 11833.0, + "step": 1610 + }, + { + "ce_loss_13": 4.695314359664917, + "ce_loss_26": 4.099924111366272, + "ce_loss_39": 3.466512751579285, + "ce_loss_52": 1.4766929775476456, + "ce_loss_7": 5.064470827579498, + "epoch": 0.162, + "grad_norm": 23.68843577414951, + "kl_loss_13": 6614.4, + "kl_loss_26": 5368.8, + "kl_loss_39": 3996.8, + "kl_loss_7": 7387.2, + "learning_rate": 0.0009429546814534529, + "loss": 11713.8, + "step": 1620 + }, + { + "ce_loss_13": 4.7040504813194275, + "ce_loss_26": 4.104205197095871, + "ce_loss_39": 3.4451481282711027, + "ce_loss_52": 1.4428326219320298, + "ce_loss_7": 5.08098030090332, + "epoch": 0.163, + "grad_norm": 23.332187460756046, + "kl_loss_13": 6673.6, + "kl_loss_26": 5408.0, + "kl_loss_39": 4000.0, + "kl_loss_7": 7468.8, + "learning_rate": 0.0009422164654989072, + "loss": 11730.0, + "step": 1630 + }, + { + "ce_loss_13": 4.6945901870727536, + "ce_loss_26": 4.092492777109146, + "ce_loss_39": 3.4360527455806733, + "ce_loss_52": 1.4436773255467414, + "ce_loss_7": 5.079924070835114, + "epoch": 0.164, + "grad_norm": 25.877563512298988, + "kl_loss_13": 6666.4, + "kl_loss_26": 5404.8, + "kl_loss_39": 4011.6, + "kl_loss_7": 7479.2, + "learning_rate": 0.0009414737964294635, + "loss": 11645.0, + "step": 1640 + }, + { + "ce_loss_13": 4.614939618110657, + "ce_loss_26": 4.018665736913681, + "ce_loss_39": 3.3586190402507783, + "ce_loss_52": 1.4472161442041398, + "ce_loss_7": 4.990367615222931, + "epoch": 0.165, + "grad_norm": 24.534381720947415, + "kl_loss_13": 6511.2, + "kl_loss_26": 5264.0, + "kl_loss_39": 3849.6, + "kl_loss_7": 7293.6, + "learning_rate": 0.000940726681723791, + "loss": 11568.6, + "step": 1650 + }, + { + "ce_loss_13": 4.539776319265366, + "ce_loss_26": 3.943451428413391, + "ce_loss_39": 3.281195378303528, + "ce_loss_52": 1.4070941284298897, + "ce_loss_7": 4.923744630813599, + "epoch": 0.166, + "grad_norm": 23.51720209782485, + "kl_loss_13": 6449.6, + "kl_loss_26": 5196.0, + "kl_loss_39": 3786.0, + "kl_loss_7": 7249.6, + "learning_rate": 0.0009399751289053266, + "loss": 11569.4, + "step": 1660 + }, + { + "ce_loss_13": 4.590777164697647, + "ce_loss_26": 3.9918887853622436, + "ce_loss_39": 3.328452670574188, + "ce_loss_52": 1.4019996047019958, + "ce_loss_7": 4.978202056884766, + "epoch": 0.167, + "grad_norm": 22.82794096581106, + "kl_loss_13": 6550.4, + "kl_loss_26": 5291.2, + "kl_loss_39": 3877.2, + "kl_loss_7": 7354.4, + "learning_rate": 0.0009392191455421988, + "loss": 11557.4, + "step": 1670 + }, + { + "ce_loss_13": 4.534084904193878, + "ce_loss_26": 3.9383736848831177, + "ce_loss_39": 3.290838527679443, + "ce_loss_52": 1.3803422033786774, + "ce_loss_7": 4.9193053364753725, + "epoch": 0.168, + "grad_norm": 22.01316358613574, + "kl_loss_13": 6469.6, + "kl_loss_26": 5224.0, + "kl_loss_39": 3837.2, + "kl_loss_7": 7273.6, + "learning_rate": 0.0009384587392471515, + "loss": 11454.2, + "step": 1680 + }, + { + "ce_loss_13": 4.5477269172668455, + "ce_loss_26": 3.9558385491371153, + "ce_loss_39": 3.3075734674930573, + "ce_loss_52": 1.410713329911232, + "ce_loss_7": 4.9310637474060055, + "epoch": 0.169, + "grad_norm": 24.025001534080104, + "kl_loss_13": 6453.6, + "kl_loss_26": 5223.2, + "kl_loss_39": 3830.0, + "kl_loss_7": 7244.0, + "learning_rate": 0.0009376939176774678, + "loss": 11355.2, + "step": 1690 + }, + { + "ce_loss_13": 4.580456328392029, + "ce_loss_26": 3.996914601325989, + "ce_loss_39": 3.3568237483501435, + "ce_loss_52": 1.4514233976602555, + "ce_loss_7": 4.956906342506409, + "epoch": 0.17, + "grad_norm": 24.061048820242437, + "kl_loss_13": 6424.8, + "kl_loss_26": 5199.2, + "kl_loss_39": 3822.0, + "kl_loss_7": 7210.4, + "learning_rate": 0.0009369246885348925, + "loss": 11365.4, + "step": 1700 + }, + { + "ce_loss_13": 4.5829225301742555, + "ce_loss_26": 3.973430114984512, + "ce_loss_39": 3.3136274456977843, + "ce_loss_52": 1.4179346442222596, + "ce_loss_7": 4.9602553129196165, + "epoch": 0.171, + "grad_norm": 21.925882863353518, + "kl_loss_13": 6508.0, + "kl_loss_26": 5225.6, + "kl_loss_39": 3821.2, + "kl_loss_7": 7300.0, + "learning_rate": 0.0009361510595655545, + "loss": 11427.8, + "step": 1710 + }, + { + "ce_loss_13": 4.597618329524994, + "ce_loss_26": 4.014382421970367, + "ce_loss_39": 3.3793311297893522, + "ce_loss_52": 1.4502436846494675, + "ce_loss_7": 4.970238649845124, + "epoch": 0.172, + "grad_norm": 21.861723684559113, + "kl_loss_13": 6463.2, + "kl_loss_26": 5241.6, + "kl_loss_39": 3880.8, + "kl_loss_7": 7242.4, + "learning_rate": 0.0009353730385598887, + "loss": 11300.4, + "step": 1720 + }, + { + "ce_loss_13": 4.474293851852417, + "ce_loss_26": 3.8755543529987335, + "ce_loss_39": 3.212596780061722, + "ce_loss_52": 1.4004584550857544, + "ce_loss_7": 4.856262743473053, + "epoch": 0.173, + "grad_norm": 23.168666460490822, + "kl_loss_13": 6318.4, + "kl_loss_26": 5065.6, + "kl_loss_39": 3658.4, + "kl_loss_7": 7116.8, + "learning_rate": 0.0009345906333525581, + "loss": 11205.0, + "step": 1730 + }, + { + "ce_loss_13": 4.5212029337883, + "ce_loss_26": 3.9314939856529234, + "ce_loss_39": 3.301447206735611, + "ce_loss_52": 1.422508242726326, + "ce_loss_7": 4.894874656200409, + "epoch": 0.174, + "grad_norm": 25.870791070867757, + "kl_loss_13": 6358.4, + "kl_loss_26": 5133.6, + "kl_loss_39": 3775.2, + "kl_loss_7": 7142.4, + "learning_rate": 0.0009338038518223745, + "loss": 11159.2, + "step": 1740 + }, + { + "ce_loss_13": 4.551153075695038, + "ce_loss_26": 3.96026993393898, + "ce_loss_39": 3.326349085569382, + "ce_loss_52": 1.4542756617069243, + "ce_loss_7": 4.919975602626801, + "epoch": 0.175, + "grad_norm": 23.828468964880035, + "kl_loss_13": 6352.8, + "kl_loss_26": 5109.6, + "kl_loss_39": 3757.6, + "kl_loss_7": 7129.6, + "learning_rate": 0.0009330127018922195, + "loss": 11089.0, + "step": 1750 + }, + { + "ce_loss_13": 4.469128930568695, + "ce_loss_26": 3.8787964940071107, + "ce_loss_39": 3.2338991940021513, + "ce_loss_52": 1.4316335827112199, + "ce_loss_7": 4.848294925689697, + "epoch": 0.176, + "grad_norm": 24.772424094235244, + "kl_loss_13": 6252.8, + "kl_loss_26": 5015.2, + "kl_loss_39": 3643.2, + "kl_loss_7": 7041.6, + "learning_rate": 0.0009322171915289634, + "loss": 11050.6, + "step": 1760 + }, + { + "ce_loss_13": 4.515468680858612, + "ce_loss_26": 3.9264565110206604, + "ce_loss_39": 3.2920862257480623, + "ce_loss_52": 1.46503643989563, + "ce_loss_7": 4.88274484872818, + "epoch": 0.177, + "grad_norm": 24.580027558725412, + "kl_loss_13": 6243.2, + "kl_loss_26": 5015.2, + "kl_loss_39": 3668.0, + "kl_loss_7": 7024.0, + "learning_rate": 0.0009314173287433873, + "loss": 11083.0, + "step": 1770 + }, + { + "ce_loss_13": 4.563484919071198, + "ce_loss_26": 3.988471633195877, + "ce_loss_39": 3.3576016187667848, + "ce_loss_52": 1.4738382428884507, + "ce_loss_7": 4.929107296466827, + "epoch": 0.178, + "grad_norm": 23.727065102019264, + "kl_loss_13": 6340.0, + "kl_loss_26": 5134.4, + "kl_loss_39": 3774.0, + "kl_loss_7": 7108.0, + "learning_rate": 0.0009306131215901003, + "loss": 11053.2, + "step": 1780 + }, + { + "ce_loss_13": 4.485390210151673, + "ce_loss_26": 3.9024369359016418, + "ce_loss_39": 3.277720022201538, + "ce_loss_52": 1.4684919208288192, + "ce_loss_7": 4.849484694004059, + "epoch": 0.179, + "grad_norm": 24.140381804707665, + "kl_loss_13": 6222.4, + "kl_loss_26": 4996.0, + "kl_loss_39": 3639.6, + "kl_loss_7": 6991.2, + "learning_rate": 0.0009298045781674596, + "loss": 10948.8, + "step": 1790 + }, + { + "ce_loss_13": 4.485648030042649, + "ce_loss_26": 3.8959447860717775, + "ce_loss_39": 3.255040627717972, + "ce_loss_52": 1.41890487074852, + "ce_loss_7": 4.864380013942719, + "epoch": 0.18, + "grad_norm": 25.753548379396687, + "kl_loss_13": 6269.6, + "kl_loss_26": 5029.6, + "kl_loss_39": 3653.2, + "kl_loss_7": 7068.8, + "learning_rate": 0.0009289917066174886, + "loss": 10940.4, + "step": 1800 + }, + { + "ce_loss_13": 4.4491588294506075, + "ce_loss_26": 3.862889313697815, + "ce_loss_39": 3.203300213813782, + "ce_loss_52": 1.4129745751619338, + "ce_loss_7": 4.8373774766921995, + "epoch": 0.181, + "grad_norm": 23.580007870242206, + "kl_loss_13": 6251.2, + "kl_loss_26": 5015.2, + "kl_loss_39": 3609.2, + "kl_loss_7": 7063.2, + "learning_rate": 0.0009281745151257945, + "loss": 10831.6, + "step": 1810 + }, + { + "ce_loss_13": 4.4796471238136295, + "ce_loss_26": 3.9034676015377046, + "ce_loss_39": 3.27801650762558, + "ce_loss_52": 1.470898449420929, + "ce_loss_7": 4.846702206134796, + "epoch": 0.182, + "grad_norm": 21.825066910706077, + "kl_loss_13": 6129.6, + "kl_loss_26": 4921.6, + "kl_loss_39": 3590.4, + "kl_loss_7": 6907.2, + "learning_rate": 0.0009273530119214868, + "loss": 10852.6, + "step": 1820 + }, + { + "ce_loss_13": 4.397759801149368, + "ce_loss_26": 3.809650295972824, + "ce_loss_39": 3.163052296638489, + "ce_loss_52": 1.4123397037386893, + "ce_loss_7": 4.76624493598938, + "epoch": 0.183, + "grad_norm": 23.028395579089935, + "kl_loss_13": 6109.6, + "kl_loss_26": 4884.8, + "kl_loss_39": 3520.0, + "kl_loss_7": 6886.4, + "learning_rate": 0.0009265272052770935, + "loss": 10776.6, + "step": 1830 + }, + { + "ce_loss_13": 4.409473043680191, + "ce_loss_26": 3.825248968601227, + "ce_loss_39": 3.174910306930542, + "ce_loss_52": 1.4039017781615257, + "ce_loss_7": 4.799298018217087, + "epoch": 0.184, + "grad_norm": 22.60594476207274, + "kl_loss_13": 6165.6, + "kl_loss_26": 4934.4, + "kl_loss_39": 3543.2, + "kl_loss_7": 6977.6, + "learning_rate": 0.0009256971035084784, + "loss": 10733.4, + "step": 1840 + }, + { + "ce_loss_13": 4.3755183041095735, + "ce_loss_26": 3.797974693775177, + "ce_loss_39": 3.1725789427757265, + "ce_loss_52": 1.4232216864824294, + "ce_loss_7": 4.739054465293885, + "epoch": 0.185, + "grad_norm": 23.627865972104136, + "kl_loss_13": 6060.0, + "kl_loss_26": 4843.2, + "kl_loss_39": 3517.2, + "kl_loss_7": 6827.2, + "learning_rate": 0.0009248627149747573, + "loss": 10698.4, + "step": 1850 + }, + { + "ce_loss_13": 4.422569459676742, + "ce_loss_26": 3.822605752944946, + "ce_loss_39": 3.1763491451740267, + "ce_loss_52": 1.427757203578949, + "ce_loss_7": 4.793670791387558, + "epoch": 0.186, + "grad_norm": 22.345780165109367, + "kl_loss_13": 6140.0, + "kl_loss_26": 4902.0, + "kl_loss_39": 3525.6, + "kl_loss_7": 6920.0, + "learning_rate": 0.0009240240480782129, + "loss": 10688.6, + "step": 1860 + }, + { + "ce_loss_13": 4.390002739429474, + "ce_loss_26": 3.8117696583271026, + "ce_loss_39": 3.193506735563278, + "ce_loss_52": 1.4390262439846992, + "ce_loss_7": 4.754710161685944, + "epoch": 0.187, + "grad_norm": 24.270272909834983, + "kl_loss_13": 6056.0, + "kl_loss_26": 4841.6, + "kl_loss_39": 3523.2, + "kl_loss_7": 6828.0, + "learning_rate": 0.0009231811112642122, + "loss": 10605.8, + "step": 1870 + }, + { + "ce_loss_13": 4.347514522075653, + "ce_loss_26": 3.774775582551956, + "ce_loss_39": 3.1524779438972472, + "ce_loss_52": 1.4184574037790298, + "ce_loss_7": 4.711814332008362, + "epoch": 0.188, + "grad_norm": 23.060486415907942, + "kl_loss_13": 6006.4, + "kl_loss_26": 4801.6, + "kl_loss_39": 3474.8, + "kl_loss_7": 6776.0, + "learning_rate": 0.0009223339130211192, + "loss": 10599.8, + "step": 1880 + }, + { + "ce_loss_13": 4.280169582366943, + "ce_loss_26": 3.6960571646690368, + "ce_loss_39": 3.0768611639738084, + "ce_loss_52": 1.4011510267853737, + "ce_loss_7": 4.650495028495788, + "epoch": 0.189, + "grad_norm": 23.308893883500843, + "kl_loss_13": 5916.0, + "kl_loss_26": 4693.6, + "kl_loss_39": 3365.6, + "kl_loss_7": 6688.8, + "learning_rate": 0.0009214824618802108, + "loss": 10510.0, + "step": 1890 + }, + { + "ce_loss_13": 4.426742446422577, + "ce_loss_26": 3.835584044456482, + "ce_loss_39": 3.1762202858924864, + "ce_loss_52": 1.435165250301361, + "ce_loss_7": 4.8001045942306515, + "epoch": 0.19, + "grad_norm": 24.259267724718942, + "kl_loss_13": 6154.4, + "kl_loss_26": 4914.4, + "kl_loss_39": 3499.2, + "kl_loss_7": 6933.6, + "learning_rate": 0.0009206267664155906, + "loss": 10574.0, + "step": 1900 + }, + { + "ce_loss_13": 4.317660903930664, + "ce_loss_26": 3.736785036325455, + "ce_loss_39": 3.102087676525116, + "ce_loss_52": 1.4297346964478492, + "ce_loss_7": 4.69397531747818, + "epoch": 0.191, + "grad_norm": 23.3562329011761, + "kl_loss_13": 5937.6, + "kl_loss_26": 4723.2, + "kl_loss_39": 3373.2, + "kl_loss_7": 6731.2, + "learning_rate": 0.0009197668352441024, + "loss": 10503.4, + "step": 1910 + }, + { + "ce_loss_13": 4.334453409910202, + "ce_loss_26": 3.7587957322597503, + "ce_loss_39": 3.123855656385422, + "ce_loss_52": 1.4094826728105545, + "ce_loss_7": 4.709676373004913, + "epoch": 0.192, + "grad_norm": 24.22470876078119, + "kl_loss_13": 5996.8, + "kl_loss_26": 4786.4, + "kl_loss_39": 3430.0, + "kl_loss_7": 6772.8, + "learning_rate": 0.0009189026770252437, + "loss": 10471.0, + "step": 1920 + }, + { + "ce_loss_13": 4.351706159114838, + "ce_loss_26": 3.7773966193199158, + "ce_loss_39": 3.1497732281684874, + "ce_loss_52": 1.4320787012577056, + "ce_loss_7": 4.7191231608390805, + "epoch": 0.193, + "grad_norm": 23.447904782586527, + "kl_loss_13": 5997.6, + "kl_loss_26": 4794.4, + "kl_loss_39": 3448.4, + "kl_loss_7": 6763.2, + "learning_rate": 0.000918034300461078, + "loss": 10433.4, + "step": 1930 + }, + { + "ce_loss_13": 4.307104933261871, + "ce_loss_26": 3.7206166088581085, + "ce_loss_39": 3.091316682100296, + "ce_loss_52": 1.4110687702894211, + "ce_loss_7": 4.676908355951309, + "epoch": 0.194, + "grad_norm": 23.93372642527522, + "kl_loss_13": 5951.2, + "kl_loss_26": 4727.6, + "kl_loss_39": 3376.0, + "kl_loss_7": 6721.6, + "learning_rate": 0.0009171617142961477, + "loss": 10442.2, + "step": 1940 + }, + { + "ce_loss_13": 4.3363093614578245, + "ce_loss_26": 3.750982850790024, + "ce_loss_39": 3.111935979127884, + "ce_loss_52": 1.431942057609558, + "ce_loss_7": 4.707539451122284, + "epoch": 0.195, + "grad_norm": 23.910939036749266, + "kl_loss_13": 5967.2, + "kl_loss_26": 4743.2, + "kl_loss_39": 3382.8, + "kl_loss_7": 6752.0, + "learning_rate": 0.0009162849273173857, + "loss": 10366.8, + "step": 1950 + }, + { + "ce_loss_13": 4.271794074773789, + "ce_loss_26": 3.7012794077396394, + "ce_loss_39": 3.0882445216178893, + "ce_loss_52": 1.4407746940851212, + "ce_loss_7": 4.636665797233581, + "epoch": 0.196, + "grad_norm": 23.30649566244444, + "kl_loss_13": 5844.8, + "kl_loss_26": 4652.8, + "kl_loss_39": 3337.2, + "kl_loss_7": 6608.8, + "learning_rate": 0.0009154039483540273, + "loss": 10313.0, + "step": 1960 + }, + { + "ce_loss_13": 4.3892871856689455, + "ce_loss_26": 3.8117631673812866, + "ce_loss_39": 3.1672019243240355, + "ce_loss_52": 1.4633917301893233, + "ce_loss_7": 4.75022611618042, + "epoch": 0.197, + "grad_norm": 22.823988656575857, + "kl_loss_13": 5992.0, + "kl_loss_26": 4784.0, + "kl_loss_39": 3421.6, + "kl_loss_7": 6749.6, + "learning_rate": 0.0009145187862775209, + "loss": 10294.2, + "step": 1970 + }, + { + "ce_loss_13": 4.251708203554154, + "ce_loss_26": 3.68521209359169, + "ce_loss_39": 3.0638678431510926, + "ce_loss_52": 1.4189983233809471, + "ce_loss_7": 4.615437304973602, + "epoch": 0.198, + "grad_norm": 22.115400900183356, + "kl_loss_13": 5814.4, + "kl_loss_26": 4623.2, + "kl_loss_39": 3291.2, + "kl_loss_7": 6581.6, + "learning_rate": 0.0009136294500014386, + "loss": 10194.8, + "step": 1980 + }, + { + "ce_loss_13": 4.36306391954422, + "ce_loss_26": 3.7900101482868194, + "ce_loss_39": 3.1458350718021393, + "ce_loss_52": 1.431062677502632, + "ce_loss_7": 4.733654403686524, + "epoch": 0.199, + "grad_norm": 21.64648055888152, + "kl_loss_13": 6001.6, + "kl_loss_26": 4801.6, + "kl_loss_39": 3444.4, + "kl_loss_7": 6778.4, + "learning_rate": 0.000912735948481387, + "loss": 10217.4, + "step": 1990 + }, + { + "ce_loss_13": 4.2783638596534725, + "ce_loss_26": 3.7015498995780947, + "ce_loss_39": 3.079295587539673, + "ce_loss_52": 1.4367393761873246, + "ce_loss_7": 4.643443429470063, + "epoch": 0.2, + "grad_norm": 22.667053535414237, + "kl_loss_13": 5844.8, + "kl_loss_26": 4641.2, + "kl_loss_39": 3314.8, + "kl_loss_7": 6607.2, + "learning_rate": 0.0009118382907149164, + "loss": 10108.9, + "step": 2000 + }, + { + "ce_loss_13": 4.298079961538315, + "ce_loss_26": 3.7112639427185057, + "ce_loss_39": 3.0900191485881807, + "ce_loss_52": 1.4447624236345291, + "ce_loss_7": 4.659080803394318, + "epoch": 0.201, + "grad_norm": 21.421967222285037, + "kl_loss_13": 5860.8, + "kl_loss_26": 4647.6, + "kl_loss_39": 3308.4, + "kl_loss_7": 6620.8, + "learning_rate": 0.0009109364857414306, + "loss": 10210.1, + "step": 2010 + }, + { + "ce_loss_13": 4.298530715703964, + "ce_loss_26": 3.7281334936618804, + "ce_loss_39": 3.103990191221237, + "ce_loss_52": 1.445554968714714, + "ce_loss_7": 4.667031800746917, + "epoch": 0.202, + "grad_norm": 22.186513808055555, + "kl_loss_13": 5863.2, + "kl_loss_26": 4661.6, + "kl_loss_39": 3329.2, + "kl_loss_7": 6627.2, + "learning_rate": 0.0009100305426420956, + "loss": 10090.6, + "step": 2020 + }, + { + "ce_loss_13": 4.2117482125759125, + "ce_loss_26": 3.6511631190776823, + "ce_loss_39": 3.0435081899166105, + "ce_loss_52": 1.4112246841192246, + "ce_loss_7": 4.573570990562439, + "epoch": 0.203, + "grad_norm": 23.22208055275699, + "kl_loss_13": 5758.4, + "kl_loss_26": 4581.6, + "kl_loss_39": 3281.6, + "kl_loss_7": 6522.4, + "learning_rate": 0.0009091204705397484, + "loss": 10094.4, + "step": 2030 + }, + { + "ce_loss_13": 4.2797119140625, + "ce_loss_26": 3.7096898019313813, + "ce_loss_39": 3.0815310001373293, + "ce_loss_52": 1.4514381274580956, + "ce_loss_7": 4.634703290462494, + "epoch": 0.204, + "grad_norm": 22.77691157290275, + "kl_loss_13": 5768.8, + "kl_loss_26": 4568.8, + "kl_loss_39": 3229.6, + "kl_loss_7": 6520.8, + "learning_rate": 0.0009082062785988049, + "loss": 10052.8, + "step": 2040 + }, + { + "ce_loss_13": 4.228492313623429, + "ce_loss_26": 3.649516838788986, + "ce_loss_39": 3.017675918340683, + "ce_loss_52": 1.4015851855278014, + "ce_loss_7": 4.5945727050304415, + "epoch": 0.205, + "grad_norm": 24.043992652900016, + "kl_loss_13": 5788.0, + "kl_loss_26": 4580.8, + "kl_loss_39": 3234.0, + "kl_loss_7": 6555.2, + "learning_rate": 0.0009072879760251679, + "loss": 10047.6, + "step": 2050 + }, + { + "ce_loss_13": 4.153064209222793, + "ce_loss_26": 3.5778062403202058, + "ce_loss_39": 2.964171904325485, + "ce_loss_52": 1.4066181004047393, + "ce_loss_7": 4.5237502455711365, + "epoch": 0.206, + "grad_norm": 23.14612831170837, + "kl_loss_13": 5666.4, + "kl_loss_26": 4462.4, + "kl_loss_39": 3156.0, + "kl_loss_7": 6428.0, + "learning_rate": 0.0009063655720661341, + "loss": 10022.0, + "step": 2060 + }, + { + "ce_loss_13": 4.162076050043106, + "ce_loss_26": 3.5850139617919923, + "ce_loss_39": 2.9763170003890993, + "ce_loss_52": 1.4114871382713319, + "ce_loss_7": 4.525000536441803, + "epoch": 0.207, + "grad_norm": 23.335931334507656, + "kl_loss_13": 5684.8, + "kl_loss_26": 4486.4, + "kl_loss_39": 3172.4, + "kl_loss_7": 6440.8, + "learning_rate": 0.000905439076010301, + "loss": 9910.8, + "step": 2070 + }, + { + "ce_loss_13": 4.203662091493607, + "ce_loss_26": 3.6453104853630065, + "ce_loss_39": 3.031943756341934, + "ce_loss_52": 1.4498305425047875, + "ce_loss_7": 4.567969477176666, + "epoch": 0.208, + "grad_norm": 22.17297694250979, + "kl_loss_13": 5653.6, + "kl_loss_26": 4474.4, + "kl_loss_39": 3178.4, + "kl_loss_7": 6404.0, + "learning_rate": 0.0009045084971874737, + "loss": 9890.1, + "step": 2080 + }, + { + "ce_loss_13": 4.1253215074539185, + "ce_loss_26": 3.5510447442531587, + "ce_loss_39": 2.9255994498729705, + "ce_loss_52": 1.383307683467865, + "ce_loss_7": 4.503264659643174, + "epoch": 0.209, + "grad_norm": 21.83866221628796, + "kl_loss_13": 5675.2, + "kl_loss_26": 4462.8, + "kl_loss_39": 3118.0, + "kl_loss_7": 6452.0, + "learning_rate": 0.0009035738449685707, + "loss": 9916.2, + "step": 2090 + }, + { + "ce_loss_13": 4.232741326093674, + "ce_loss_26": 3.6592664182186128, + "ce_loss_39": 3.0402495503425597, + "ce_loss_52": 1.4619457066059112, + "ce_loss_7": 4.595292699337006, + "epoch": 0.21, + "grad_norm": 23.1172808852491, + "kl_loss_13": 5709.6, + "kl_loss_26": 4522.8, + "kl_loss_39": 3192.0, + "kl_loss_7": 6460.8, + "learning_rate": 0.0009026351287655293, + "loss": 9882.4, + "step": 2100 + }, + { + "ce_loss_13": 4.196552646160126, + "ce_loss_26": 3.6394916236400605, + "ce_loss_39": 3.029403269290924, + "ce_loss_52": 1.431785149872303, + "ce_loss_7": 4.555036389827729, + "epoch": 0.211, + "grad_norm": 22.472192746858727, + "kl_loss_13": 5678.4, + "kl_loss_26": 4513.2, + "kl_loss_39": 3201.6, + "kl_loss_7": 6423.2, + "learning_rate": 0.0009016923580312113, + "loss": 9778.0, + "step": 2110 + }, + { + "ce_loss_13": 4.267941182851791, + "ce_loss_26": 3.6975386798381806, + "ce_loss_39": 3.0839960873126984, + "ce_loss_52": 1.4794423222541808, + "ce_loss_7": 4.60888249874115, + "epoch": 0.212, + "grad_norm": 23.91558691594894, + "kl_loss_13": 5697.6, + "kl_loss_26": 4502.8, + "kl_loss_39": 3191.2, + "kl_loss_7": 6425.6, + "learning_rate": 0.0009007455422593077, + "loss": 9764.0, + "step": 2120 + }, + { + "ce_loss_13": 4.170402336120605, + "ce_loss_26": 3.5973034620285036, + "ce_loss_39": 2.9883872270584106, + "ce_loss_52": 1.4295871376991272, + "ce_loss_7": 4.519614219665527, + "epoch": 0.213, + "grad_norm": 22.812352077428177, + "kl_loss_13": 5644.0, + "kl_loss_26": 4456.4, + "kl_loss_39": 3137.2, + "kl_loss_7": 6387.2, + "learning_rate": 0.0008997946909842425, + "loss": 9755.6, + "step": 2130 + }, + { + "ce_loss_13": 4.10284715294838, + "ce_loss_26": 3.5395869314670563, + "ce_loss_39": 2.9268704533576964, + "ce_loss_52": 1.4086133271455765, + "ce_loss_7": 4.462130695581436, + "epoch": 0.214, + "grad_norm": 22.23702862817867, + "kl_loss_13": 5524.8, + "kl_loss_26": 4340.4, + "kl_loss_39": 3036.0, + "kl_loss_7": 6288.8, + "learning_rate": 0.0008988398137810777, + "loss": 9645.0, + "step": 2140 + }, + { + "ce_loss_13": 4.079320967197418, + "ce_loss_26": 3.496495473384857, + "ce_loss_39": 2.8816928565502167, + "ce_loss_52": 1.3819068521261215, + "ce_loss_7": 4.434896755218506, + "epoch": 0.215, + "grad_norm": 22.79915028059786, + "kl_loss_13": 5541.6, + "kl_loss_26": 4350.0, + "kl_loss_39": 3045.2, + "kl_loss_7": 6288.8, + "learning_rate": 0.0008978809202654162, + "loss": 9686.6, + "step": 2150 + }, + { + "ce_loss_13": 4.069333535432816, + "ce_loss_26": 3.5059276044368746, + "ce_loss_39": 2.8949272632598877, + "ce_loss_52": 1.409129326045513, + "ce_loss_7": 4.434026664495468, + "epoch": 0.216, + "grad_norm": 22.908702660837623, + "kl_loss_13": 5464.0, + "kl_loss_26": 4286.8, + "kl_loss_39": 2986.8, + "kl_loss_7": 6228.0, + "learning_rate": 0.0008969180200933046, + "loss": 9665.2, + "step": 2160 + }, + { + "ce_loss_13": 4.164896643161773, + "ce_loss_26": 3.594840294122696, + "ce_loss_39": 2.9824715733528135, + "ce_loss_52": 1.4398185968399049, + "ce_loss_7": 4.5187140583992, + "epoch": 0.217, + "grad_norm": 22.2992725858673, + "kl_loss_13": 5602.4, + "kl_loss_26": 4409.6, + "kl_loss_39": 3101.6, + "kl_loss_7": 6338.4, + "learning_rate": 0.0008959511229611376, + "loss": 9611.1, + "step": 2170 + }, + { + "ce_loss_13": 4.1424953758716585, + "ce_loss_26": 3.5846896708011626, + "ce_loss_39": 2.9776509165763856, + "ce_loss_52": 1.4638631641864777, + "ce_loss_7": 4.499098914861679, + "epoch": 0.218, + "grad_norm": 22.569206560566755, + "kl_loss_13": 5520.8, + "kl_loss_26": 4360.4, + "kl_loss_39": 3039.2, + "kl_loss_7": 6262.4, + "learning_rate": 0.0008949802386055581, + "loss": 9598.7, + "step": 2180 + }, + { + "ce_loss_13": 4.124321860074997, + "ce_loss_26": 3.559172648191452, + "ce_loss_39": 2.942674660682678, + "ce_loss_52": 1.4182981908321382, + "ce_loss_7": 4.483241724967956, + "epoch": 0.219, + "grad_norm": 22.517460780417444, + "kl_loss_13": 5559.2, + "kl_loss_26": 4374.8, + "kl_loss_39": 3066.4, + "kl_loss_7": 6304.0, + "learning_rate": 0.0008940053768033609, + "loss": 9610.7, + "step": 2190 + }, + { + "ce_loss_13": 4.136555308103562, + "ce_loss_26": 3.5674175798892973, + "ce_loss_39": 2.9441113233566285, + "ce_loss_52": 1.4381250411272049, + "ce_loss_7": 4.481418180465698, + "epoch": 0.22, + "grad_norm": 23.1169100672147, + "kl_loss_13": 5545.6, + "kl_loss_26": 4354.4, + "kl_loss_39": 3035.2, + "kl_loss_7": 6270.4, + "learning_rate": 0.0008930265473713938, + "loss": 9621.3, + "step": 2200 + }, + { + "ce_loss_13": 4.116154849529266, + "ce_loss_26": 3.5370861172676085, + "ce_loss_39": 2.9127448469400408, + "ce_loss_52": 1.3838467657566071, + "ce_loss_7": 4.479850220680237, + "epoch": 0.221, + "grad_norm": 23.327101471626264, + "kl_loss_13": 5613.6, + "kl_loss_26": 4407.2, + "kl_loss_39": 3084.0, + "kl_loss_7": 6376.8, + "learning_rate": 0.0008920437601664579, + "loss": 9580.3, + "step": 2210 + }, + { + "ce_loss_13": 4.081603097915649, + "ce_loss_26": 3.533859223127365, + "ce_loss_39": 2.9178696632385255, + "ce_loss_52": 1.4558427572250365, + "ce_loss_7": 4.428021937608719, + "epoch": 0.222, + "grad_norm": 24.571271492626643, + "kl_loss_13": 5416.8, + "kl_loss_26": 4265.2, + "kl_loss_39": 2949.2, + "kl_loss_7": 6150.4, + "learning_rate": 0.0008910570250852097, + "loss": 9535.0, + "step": 2220 + }, + { + "ce_loss_13": 4.016762095689773, + "ce_loss_26": 3.453756958246231, + "ce_loss_39": 2.8424510210752487, + "ce_loss_52": 1.382763533294201, + "ce_loss_7": 4.369659447669983, + "epoch": 0.223, + "grad_norm": 22.39109803193224, + "kl_loss_13": 5417.6, + "kl_loss_26": 4255.6, + "kl_loss_39": 2951.6, + "kl_loss_7": 6149.6, + "learning_rate": 0.0008900663520640604, + "loss": 9449.7, + "step": 2230 + }, + { + "ce_loss_13": 4.07697583436966, + "ce_loss_26": 3.5205394327640533, + "ce_loss_39": 2.9201291859149934, + "ce_loss_52": 1.4419079095125198, + "ce_loss_7": 4.432818019390107, + "epoch": 0.224, + "grad_norm": 29.33071925320992, + "kl_loss_13": 5431.2, + "kl_loss_26": 4260.4, + "kl_loss_39": 2980.0, + "kl_loss_7": 6176.8, + "learning_rate": 0.0008890717510790764, + "loss": 9471.4, + "step": 2240 + }, + { + "ce_loss_13": 4.099857300519943, + "ce_loss_26": 3.5491108179092405, + "ce_loss_39": 2.946030503511429, + "ce_loss_52": 1.4482155337929725, + "ce_loss_7": 4.446840679645538, + "epoch": 0.225, + "grad_norm": 24.393145108562546, + "kl_loss_13": 5448.0, + "kl_loss_26": 4290.4, + "kl_loss_39": 3004.4, + "kl_loss_7": 6176.0, + "learning_rate": 0.0008880732321458784, + "loss": 9429.4, + "step": 2250 + }, + { + "ce_loss_13": 4.008820396661759, + "ce_loss_26": 3.466299217939377, + "ce_loss_39": 2.8751066744327547, + "ce_loss_52": 1.4349601715803146, + "ce_loss_7": 4.357817393541336, + "epoch": 0.226, + "grad_norm": 23.790321486003762, + "kl_loss_13": 5306.4, + "kl_loss_26": 4167.6, + "kl_loss_39": 2894.0, + "kl_loss_7": 6034.4, + "learning_rate": 0.0008870708053195413, + "loss": 9349.3, + "step": 2260 + }, + { + "ce_loss_13": 4.0613229155540465, + "ce_loss_26": 3.495078670978546, + "ce_loss_39": 2.889867639541626, + "ce_loss_52": 1.417461496591568, + "ce_loss_7": 4.40853306055069, + "epoch": 0.227, + "grad_norm": 24.394028059861938, + "kl_loss_13": 5412.8, + "kl_loss_26": 4243.2, + "kl_loss_39": 2960.8, + "kl_loss_7": 6130.4, + "learning_rate": 0.0008860644806944918, + "loss": 9352.6, + "step": 2270 + }, + { + "ce_loss_13": 4.178533679246902, + "ce_loss_26": 3.622497373819351, + "ce_loss_39": 3.006651484966278, + "ce_loss_52": 1.4494876891374588, + "ce_loss_7": 4.527337849140167, + "epoch": 0.228, + "grad_norm": 22.806373163177923, + "kl_loss_13": 5580.0, + "kl_loss_26": 4405.2, + "kl_loss_39": 3104.0, + "kl_loss_7": 6317.6, + "learning_rate": 0.0008850542684044079, + "loss": 9441.9, + "step": 2280 + }, + { + "ce_loss_13": 4.018020331859589, + "ce_loss_26": 3.458746635913849, + "ce_loss_39": 2.8657322227954865, + "ce_loss_52": 1.4288572728633882, + "ce_loss_7": 4.375551146268845, + "epoch": 0.229, + "grad_norm": 22.45015355344987, + "kl_loss_13": 5303.2, + "kl_loss_26": 4136.0, + "kl_loss_39": 2856.8, + "kl_loss_7": 6051.2, + "learning_rate": 0.0008840401786221159, + "loss": 9343.7, + "step": 2290 + }, + { + "ce_loss_13": 4.0382424116134645, + "ce_loss_26": 3.4842948436737062, + "ce_loss_39": 2.894715803861618, + "ce_loss_52": 1.4393651276826858, + "ce_loss_7": 4.377675461769104, + "epoch": 0.23, + "grad_norm": 23.31839583792026, + "kl_loss_13": 5351.2, + "kl_loss_26": 4188.4, + "kl_loss_39": 2912.0, + "kl_loss_7": 6064.8, + "learning_rate": 0.000883022221559489, + "loss": 9246.3, + "step": 2300 + }, + { + "ce_loss_13": 4.038966596126556, + "ce_loss_26": 3.488220602273941, + "ce_loss_39": 2.8812575459480287, + "ce_loss_52": 1.441010195016861, + "ce_loss_7": 4.381568449735641, + "epoch": 0.231, + "grad_norm": 22.254622557882463, + "kl_loss_13": 5335.2, + "kl_loss_26": 4171.2, + "kl_loss_39": 2887.2, + "kl_loss_7": 6052.8, + "learning_rate": 0.0008820004074673434, + "loss": 9220.3, + "step": 2310 + }, + { + "ce_loss_13": 3.9854084312915803, + "ce_loss_26": 3.4409989297389982, + "ce_loss_39": 2.84497589468956, + "ce_loss_52": 1.4122451767325401, + "ce_loss_7": 4.340046459436417, + "epoch": 0.232, + "grad_norm": 21.074813671439337, + "kl_loss_13": 5296.8, + "kl_loss_26": 4147.6, + "kl_loss_39": 2864.8, + "kl_loss_7": 6036.8, + "learning_rate": 0.0008809747466353355, + "loss": 9279.8, + "step": 2320 + }, + { + "ce_loss_13": 4.115998637676239, + "ce_loss_26": 3.5584963142871855, + "ce_loss_39": 2.9395908057689666, + "ce_loss_52": 1.465473085641861, + "ce_loss_7": 4.470573830604553, + "epoch": 0.233, + "grad_norm": 22.26955688088229, + "kl_loss_13": 5459.2, + "kl_loss_26": 4288.0, + "kl_loss_39": 2960.4, + "kl_loss_7": 6197.6, + "learning_rate": 0.0008799452493918585, + "loss": 9213.2, + "step": 2330 + }, + { + "ce_loss_13": 3.9350290656089784, + "ce_loss_26": 3.3855203211307527, + "ce_loss_39": 2.799959135055542, + "ce_loss_52": 1.4289379581809043, + "ce_loss_7": 4.274723726511001, + "epoch": 0.234, + "grad_norm": 22.04159638849698, + "kl_loss_13": 5186.8, + "kl_loss_26": 4026.0, + "kl_loss_39": 2752.2, + "kl_loss_7": 5904.4, + "learning_rate": 0.0008789119261039385, + "loss": 9222.9, + "step": 2340 + }, + { + "ce_loss_13": 3.977653867006302, + "ce_loss_26": 3.4353197515010834, + "ce_loss_39": 2.8274969339370726, + "ce_loss_52": 1.400892499089241, + "ce_loss_7": 4.322875905036926, + "epoch": 0.235, + "grad_norm": 25.32335755706349, + "kl_loss_13": 5282.4, + "kl_loss_26": 4139.6, + "kl_loss_39": 2849.2, + "kl_loss_7": 6001.6, + "learning_rate": 0.0008778747871771292, + "loss": 9101.4, + "step": 2350 + }, + { + "ce_loss_13": 3.9699031889438627, + "ce_loss_26": 3.4155133664608, + "ce_loss_39": 2.8127492308616637, + "ce_loss_52": 1.4119284138083459, + "ce_loss_7": 4.322866821289063, + "epoch": 0.236, + "grad_norm": 24.250283920991954, + "kl_loss_13": 5259.2, + "kl_loss_26": 4092.8, + "kl_loss_39": 2816.4, + "kl_loss_7": 6000.8, + "learning_rate": 0.0008768338430554083, + "loss": 9104.0, + "step": 2360 + }, + { + "ce_loss_13": 3.928439366817474, + "ce_loss_26": 3.382099211215973, + "ce_loss_39": 2.789314305782318, + "ce_loss_52": 1.3916988223791122, + "ce_loss_7": 4.277575564384461, + "epoch": 0.237, + "grad_norm": 23.978839586298704, + "kl_loss_13": 5212.0, + "kl_loss_26": 4068.4, + "kl_loss_39": 2797.2, + "kl_loss_7": 5938.4, + "learning_rate": 0.0008757891042210713, + "loss": 9141.7, + "step": 2370 + }, + { + "ce_loss_13": 3.9462322175502775, + "ce_loss_26": 3.397873044013977, + "ce_loss_39": 2.8039229214191437, + "ce_loss_52": 1.4037385553121566, + "ce_loss_7": 4.290230017900467, + "epoch": 0.238, + "grad_norm": 23.01247346605362, + "kl_loss_13": 5215.2, + "kl_loss_26": 4079.6, + "kl_loss_39": 2813.2, + "kl_loss_7": 5942.4, + "learning_rate": 0.0008747405811946271, + "loss": 9055.8, + "step": 2380 + }, + { + "ce_loss_13": 3.98149796128273, + "ce_loss_26": 3.442757821083069, + "ce_loss_39": 2.84624342918396, + "ce_loss_52": 1.445407471060753, + "ce_loss_7": 4.318572920560837, + "epoch": 0.239, + "grad_norm": 22.74122664649998, + "kl_loss_13": 5226.4, + "kl_loss_26": 4089.6, + "kl_loss_39": 2827.6, + "kl_loss_7": 5934.4, + "learning_rate": 0.0008736882845346905, + "loss": 9110.6, + "step": 2390 + }, + { + "ce_loss_13": 3.9661067545413973, + "ce_loss_26": 3.4294336676597594, + "ce_loss_39": 2.836567336320877, + "ce_loss_52": 1.442602628469467, + "ce_loss_7": 4.3087667465209964, + "epoch": 0.24, + "grad_norm": 23.333126009298994, + "kl_loss_13": 5196.0, + "kl_loss_26": 4051.6, + "kl_loss_39": 2790.0, + "kl_loss_7": 5911.2, + "learning_rate": 0.0008726322248378774, + "loss": 9064.8, + "step": 2400 + }, + { + "ce_loss_13": 3.988937532901764, + "ce_loss_26": 3.4361318945884705, + "ce_loss_39": 2.833483111858368, + "ce_loss_52": 1.4274606987833978, + "ce_loss_7": 4.3361672222614285, + "epoch": 0.241, + "grad_norm": 21.55988300492865, + "kl_loss_13": 5244.8, + "kl_loss_26": 4091.2, + "kl_loss_39": 2809.2, + "kl_loss_7": 5974.4, + "learning_rate": 0.0008715724127386971, + "loss": 9048.5, + "step": 2410 + }, + { + "ce_loss_13": 3.93166036605835, + "ce_loss_26": 3.3924847066402437, + "ce_loss_39": 2.8114346325397492, + "ce_loss_52": 1.433423739671707, + "ce_loss_7": 4.277747517824173, + "epoch": 0.242, + "grad_norm": 21.76629675806892, + "kl_loss_13": 5152.8, + "kl_loss_26": 4024.4, + "kl_loss_39": 2775.6, + "kl_loss_7": 5876.0, + "learning_rate": 0.0008705088589094458, + "loss": 8950.6, + "step": 2420 + }, + { + "ce_loss_13": 4.0298320889472965, + "ce_loss_26": 3.4784019589424133, + "ce_loss_39": 2.8909366130828857, + "ce_loss_52": 1.4593130856752397, + "ce_loss_7": 4.372854852676392, + "epoch": 0.243, + "grad_norm": 22.782714549711034, + "kl_loss_13": 5275.2, + "kl_loss_26": 4132.0, + "kl_loss_39": 2868.8, + "kl_loss_7": 6000.0, + "learning_rate": 0.0008694415740600988, + "loss": 8979.7, + "step": 2430 + }, + { + "ce_loss_13": 3.957322496175766, + "ce_loss_26": 3.391196775436401, + "ce_loss_39": 2.7942449331283568, + "ce_loss_52": 1.429965654015541, + "ce_loss_7": 4.3002465009689335, + "epoch": 0.244, + "grad_norm": 22.108343623695664, + "kl_loss_13": 5175.2, + "kl_loss_26": 4000.0, + "kl_loss_39": 2735.2, + "kl_loss_7": 5894.4, + "learning_rate": 0.0008683705689382025, + "loss": 8983.5, + "step": 2440 + }, + { + "ce_loss_13": 3.914830905199051, + "ce_loss_26": 3.371804046630859, + "ce_loss_39": 2.789295125007629, + "ce_loss_52": 1.4458730816841125, + "ce_loss_7": 4.244754731655121, + "epoch": 0.245, + "grad_norm": 22.68476073719735, + "kl_loss_13": 5094.4, + "kl_loss_26": 3954.8, + "kl_loss_39": 2705.0, + "kl_loss_7": 5792.8, + "learning_rate": 0.0008672958543287666, + "loss": 8971.0, + "step": 2450 + }, + { + "ce_loss_13": 3.910190373659134, + "ce_loss_26": 3.3697587728500364, + "ce_loss_39": 2.7743864953517914, + "ce_loss_52": 1.4169176414608955, + "ce_loss_7": 4.245863050222397, + "epoch": 0.246, + "grad_norm": 23.78530061144511, + "kl_loss_13": 5117.6, + "kl_loss_26": 3979.2, + "kl_loss_39": 2714.8, + "kl_loss_7": 5818.4, + "learning_rate": 0.0008662174410541554, + "loss": 8871.3, + "step": 2460 + }, + { + "ce_loss_13": 3.905332827568054, + "ce_loss_26": 3.3677509129047394, + "ce_loss_39": 2.785578554868698, + "ce_loss_52": 1.4284577563405036, + "ce_loss_7": 4.243521982431412, + "epoch": 0.247, + "grad_norm": 21.62010382710404, + "kl_loss_13": 5076.0, + "kl_loss_26": 3951.2, + "kl_loss_39": 2706.0, + "kl_loss_7": 5774.4, + "learning_rate": 0.0008651353399739787, + "loss": 8827.8, + "step": 2470 + }, + { + "ce_loss_13": 3.9418592929840086, + "ce_loss_26": 3.399972987174988, + "ce_loss_39": 2.7943135529756544, + "ce_loss_52": 1.4213671818375588, + "ce_loss_7": 4.285894882678986, + "epoch": 0.248, + "grad_norm": 21.67451956689309, + "kl_loss_13": 5164.0, + "kl_loss_26": 4026.4, + "kl_loss_39": 2746.4, + "kl_loss_7": 5886.4, + "learning_rate": 0.0008640495619849821, + "loss": 8908.6, + "step": 2480 + }, + { + "ce_loss_13": 3.9583646595478057, + "ce_loss_26": 3.418367612361908, + "ce_loss_39": 2.8201009154319765, + "ce_loss_52": 1.4697474852204322, + "ce_loss_7": 4.3033524513244625, + "epoch": 0.249, + "grad_norm": 23.94241052015279, + "kl_loss_13": 5140.0, + "kl_loss_26": 4010.0, + "kl_loss_39": 2731.6, + "kl_loss_7": 5860.0, + "learning_rate": 0.0008629601180209381, + "loss": 8796.4, + "step": 2490 + }, + { + "ce_loss_13": 3.9134137570858, + "ce_loss_26": 3.3699698984622954, + "ce_loss_39": 2.781053990125656, + "ce_loss_52": 1.4334075331687928, + "ce_loss_7": 4.242314898967743, + "epoch": 0.25, + "grad_norm": 22.621772280297588, + "kl_loss_13": 5100.0, + "kl_loss_26": 3960.0, + "kl_loss_39": 2699.6, + "kl_loss_7": 5796.0, + "learning_rate": 0.000861867019052535, + "loss": 8802.5, + "step": 2500 + }, + { + "ce_loss_13": 3.975496470928192, + "ce_loss_26": 3.4352354168891908, + "ce_loss_39": 2.8324910700321198, + "ce_loss_52": 1.469119620323181, + "ce_loss_7": 4.319353139400482, + "epoch": 0.251, + "grad_norm": 24.031546669852546, + "kl_loss_13": 5152.4, + "kl_loss_26": 4019.6, + "kl_loss_39": 2733.2, + "kl_loss_7": 5876.8, + "learning_rate": 0.0008607702760872678, + "loss": 8791.0, + "step": 2510 + }, + { + "ce_loss_13": 3.970981556177139, + "ce_loss_26": 3.4194670915603638, + "ce_loss_39": 2.826087462902069, + "ce_loss_52": 1.457669761776924, + "ce_loss_7": 4.311935073137283, + "epoch": 0.252, + "grad_norm": 22.68902245721029, + "kl_loss_13": 5173.6, + "kl_loss_26": 4025.2, + "kl_loss_39": 2753.2, + "kl_loss_7": 5881.6, + "learning_rate": 0.0008596699001693256, + "loss": 8797.8, + "step": 2520 + }, + { + "ce_loss_13": 3.9163833260536194, + "ce_loss_26": 3.378202974796295, + "ce_loss_39": 2.789392131567001, + "ce_loss_52": 1.4278187423944473, + "ce_loss_7": 4.251722925901413, + "epoch": 0.253, + "grad_norm": 23.732161584585768, + "kl_loss_13": 5073.6, + "kl_loss_26": 3944.0, + "kl_loss_39": 2692.8, + "kl_loss_7": 5776.8, + "learning_rate": 0.0008585659023794818, + "loss": 8730.9, + "step": 2530 + }, + { + "ce_loss_13": 3.8952399492263794, + "ce_loss_26": 3.3577997207641603, + "ce_loss_39": 2.761990362405777, + "ce_loss_52": 1.421005728840828, + "ce_loss_7": 4.233315163850785, + "epoch": 0.254, + "grad_norm": 23.217787871644095, + "kl_loss_13": 5080.0, + "kl_loss_26": 3941.2, + "kl_loss_39": 2674.0, + "kl_loss_7": 5788.8, + "learning_rate": 0.0008574582938349817, + "loss": 8689.0, + "step": 2540 + }, + { + "ce_loss_13": 3.9610107481479644, + "ce_loss_26": 3.423109310865402, + "ce_loss_39": 2.8520358502864838, + "ce_loss_52": 1.4856882840394974, + "ce_loss_7": 4.297859001159668, + "epoch": 0.255, + "grad_norm": 24.36417114927956, + "kl_loss_13": 5090.4, + "kl_loss_26": 3959.2, + "kl_loss_39": 2721.2, + "kl_loss_7": 5797.6, + "learning_rate": 0.0008563470856894315, + "loss": 8682.7, + "step": 2550 + }, + { + "ce_loss_13": 3.9392871856689453, + "ce_loss_26": 3.4012055695056915, + "ce_loss_39": 2.808671069145203, + "ce_loss_52": 1.472413820028305, + "ce_loss_7": 4.270819437503815, + "epoch": 0.256, + "grad_norm": 22.260198047396518, + "kl_loss_13": 5046.4, + "kl_loss_26": 3928.4, + "kl_loss_39": 2676.0, + "kl_loss_7": 5743.2, + "learning_rate": 0.0008552322891326845, + "loss": 8696.9, + "step": 2560 + }, + { + "ce_loss_13": 3.920336198806763, + "ce_loss_26": 3.379600703716278, + "ce_loss_39": 2.7945084452629088, + "ce_loss_52": 1.4492182582616806, + "ce_loss_7": 4.2536624610424045, + "epoch": 0.257, + "grad_norm": 21.726891625639418, + "kl_loss_13": 5079.2, + "kl_loss_26": 3942.4, + "kl_loss_39": 2686.4, + "kl_loss_7": 5777.6, + "learning_rate": 0.0008541139153907296, + "loss": 8637.5, + "step": 2570 + }, + { + "ce_loss_13": 3.8855518221855165, + "ce_loss_26": 3.3411356985569, + "ce_loss_39": 2.7515164047479628, + "ce_loss_52": 1.444500783085823, + "ce_loss_7": 4.214444124698639, + "epoch": 0.258, + "grad_norm": 21.147965943731403, + "kl_loss_13": 5012.8, + "kl_loss_26": 3868.4, + "kl_loss_39": 2618.4, + "kl_loss_7": 5708.0, + "learning_rate": 0.0008529919757255782, + "loss": 8639.8, + "step": 2580 + }, + { + "ce_loss_13": 3.906326872110367, + "ce_loss_26": 3.3759279191493987, + "ce_loss_39": 2.794377237558365, + "ce_loss_52": 1.4845586121082306, + "ce_loss_7": 4.233397454023361, + "epoch": 0.259, + "grad_norm": 23.06645203237802, + "kl_loss_13": 4970.4, + "kl_loss_26": 3854.4, + "kl_loss_39": 2612.0, + "kl_loss_7": 5654.4, + "learning_rate": 0.0008518664814351503, + "loss": 8576.9, + "step": 2590 + }, + { + "ce_loss_13": 3.7816513538360597, + "ce_loss_26": 3.2462404370307922, + "ce_loss_39": 2.6499598264694213, + "ce_loss_52": 1.391408371925354, + "ce_loss_7": 4.128740018606186, + "epoch": 0.26, + "grad_norm": 23.574382647594796, + "kl_loss_13": 4927.2, + "kl_loss_26": 3814.8, + "kl_loss_39": 2554.8, + "kl_loss_7": 5640.8, + "learning_rate": 0.0008507374438531607, + "loss": 8563.9, + "step": 2600 + }, + { + "ce_loss_13": 3.9538560569286347, + "ce_loss_26": 3.4095658123493195, + "ce_loss_39": 2.8113979279994963, + "ce_loss_52": 1.4748313665390014, + "ce_loss_7": 4.2874442756176, + "epoch": 0.261, + "grad_norm": 22.486126104346287, + "kl_loss_13": 5098.4, + "kl_loss_26": 3961.2, + "kl_loss_39": 2686.8, + "kl_loss_7": 5806.4, + "learning_rate": 0.0008496048743490053, + "loss": 8565.1, + "step": 2610 + }, + { + "ce_loss_13": 3.806992840766907, + "ce_loss_26": 3.282588803768158, + "ce_loss_39": 2.7138577342033385, + "ce_loss_52": 1.4272374346852303, + "ce_loss_7": 4.137687039375305, + "epoch": 0.262, + "grad_norm": 23.33658890766694, + "kl_loss_13": 4889.6, + "kl_loss_26": 3786.4, + "kl_loss_39": 2568.2, + "kl_loss_7": 5575.2, + "learning_rate": 0.0008484687843276469, + "loss": 8535.4, + "step": 2620 + }, + { + "ce_loss_13": 3.8657312452793122, + "ce_loss_26": 3.3368504345417023, + "ce_loss_39": 2.7600394666194914, + "ce_loss_52": 1.4648621320724486, + "ce_loss_7": 4.185077089071274, + "epoch": 0.263, + "grad_norm": 21.52255395290807, + "kl_loss_13": 4951.2, + "kl_loss_26": 3845.2, + "kl_loss_39": 2613.2, + "kl_loss_7": 5622.4, + "learning_rate": 0.0008473291852294987, + "loss": 8580.4, + "step": 2630 + }, + { + "ce_loss_13": 3.8671354949474335, + "ce_loss_26": 3.334774547815323, + "ce_loss_39": 2.757487526535988, + "ce_loss_52": 1.4462923228740692, + "ce_loss_7": 4.198447376489639, + "epoch": 0.264, + "grad_norm": 22.57622271607983, + "kl_loss_13": 4962.4, + "kl_loss_26": 3854.4, + "kl_loss_39": 2616.4, + "kl_loss_7": 5648.8, + "learning_rate": 0.0008461860885303114, + "loss": 8492.7, + "step": 2640 + }, + { + "ce_loss_13": 3.875018262863159, + "ce_loss_26": 3.3343500018119814, + "ce_loss_39": 2.731833589076996, + "ce_loss_52": 1.4120988547801971, + "ce_loss_7": 4.215745764970779, + "epoch": 0.265, + "grad_norm": 21.329908759369555, + "kl_loss_13": 5048.8, + "kl_loss_26": 3930.4, + "kl_loss_39": 2648.0, + "kl_loss_7": 5764.8, + "learning_rate": 0.000845039505741056, + "loss": 8545.0, + "step": 2650 + }, + { + "ce_loss_13": 3.8392697393894197, + "ce_loss_26": 3.313974368572235, + "ce_loss_39": 2.743110102415085, + "ce_loss_52": 1.4838833779096603, + "ce_loss_7": 4.162828749418258, + "epoch": 0.266, + "grad_norm": 22.328558189428094, + "kl_loss_13": 4850.4, + "kl_loss_26": 3737.2, + "kl_loss_39": 2504.0, + "kl_loss_7": 5524.8, + "learning_rate": 0.0008438894484078086, + "loss": 8456.0, + "step": 2660 + }, + { + "ce_loss_13": 3.7446135103702547, + "ce_loss_26": 3.2187088668346404, + "ce_loss_39": 2.6340013802051545, + "ce_loss_52": 1.389008679986, + "ce_loss_7": 4.068695777654648, + "epoch": 0.267, + "grad_norm": 22.22147816002266, + "kl_loss_13": 4894.4, + "kl_loss_26": 3784.0, + "kl_loss_39": 2542.0, + "kl_loss_7": 5576.0, + "learning_rate": 0.0008427359281116334, + "loss": 8425.6, + "step": 2670 + }, + { + "ce_loss_13": 3.8235996186733248, + "ce_loss_26": 3.292762166261673, + "ce_loss_39": 2.7183104872703554, + "ce_loss_52": 1.4328299894928933, + "ce_loss_7": 4.153199070692063, + "epoch": 0.268, + "grad_norm": 22.48935879447384, + "kl_loss_13": 4909.2, + "kl_loss_26": 3800.0, + "kl_loss_39": 2568.2, + "kl_loss_7": 5595.2, + "learning_rate": 0.0008415789564684673, + "loss": 8422.0, + "step": 2680 + }, + { + "ce_loss_13": 3.776876950263977, + "ce_loss_26": 3.2493546307086945, + "ce_loss_39": 2.682066896557808, + "ce_loss_52": 1.426028846204281, + "ce_loss_7": 4.102496325969696, + "epoch": 0.269, + "grad_norm": 23.679133499819734, + "kl_loss_13": 4848.8, + "kl_loss_26": 3742.4, + "kl_loss_39": 2509.6, + "kl_loss_7": 5526.4, + "learning_rate": 0.0008404185451290017, + "loss": 8501.8, + "step": 2690 + }, + { + "ce_loss_13": 3.8134057581424714, + "ce_loss_26": 3.272122323513031, + "ce_loss_39": 2.686330908536911, + "ce_loss_52": 1.4213334619998932, + "ce_loss_7": 4.143577241897583, + "epoch": 0.27, + "grad_norm": 22.090911465136045, + "kl_loss_13": 4938.4, + "kl_loss_26": 3810.8, + "kl_loss_39": 2554.8, + "kl_loss_7": 5636.8, + "learning_rate": 0.0008392547057785661, + "loss": 8351.5, + "step": 2700 + }, + { + "ce_loss_13": 3.786053466796875, + "ce_loss_26": 3.247877132892609, + "ce_loss_39": 2.6689940333366393, + "ce_loss_52": 1.4167698860168456, + "ce_loss_7": 4.1162903845310215, + "epoch": 0.271, + "grad_norm": 20.5206064618152, + "kl_loss_13": 4872.8, + "kl_loss_26": 3742.0, + "kl_loss_39": 2504.4, + "kl_loss_7": 5563.2, + "learning_rate": 0.0008380874501370098, + "loss": 8427.4, + "step": 2710 + }, + { + "ce_loss_13": 3.704389762878418, + "ce_loss_26": 3.187553709745407, + "ce_loss_39": 2.624448519945145, + "ce_loss_52": 1.42959221303463, + "ce_loss_7": 4.022905468940735, + "epoch": 0.272, + "grad_norm": 24.53128855683424, + "kl_loss_13": 4716.8, + "kl_loss_26": 3630.0, + "kl_loss_39": 2409.2, + "kl_loss_7": 5390.4, + "learning_rate": 0.0008369167899585841, + "loss": 8346.0, + "step": 2720 + }, + { + "ce_loss_13": 3.7978334367275237, + "ce_loss_26": 3.257440310716629, + "ce_loss_39": 2.673193109035492, + "ce_loss_52": 1.4194035559892655, + "ce_loss_7": 4.132071840763092, + "epoch": 0.273, + "grad_norm": 22.71970164256676, + "kl_loss_13": 4896.8, + "kl_loss_26": 3759.2, + "kl_loss_39": 2513.4, + "kl_loss_7": 5595.2, + "learning_rate": 0.0008357427370318238, + "loss": 8347.6, + "step": 2730 + }, + { + "ce_loss_13": 3.7844323456287383, + "ce_loss_26": 3.263492447137833, + "ce_loss_39": 2.6871775329113006, + "ce_loss_52": 1.452097550034523, + "ce_loss_7": 4.105139708518982, + "epoch": 0.274, + "grad_norm": 22.48264422716788, + "kl_loss_13": 4807.2, + "kl_loss_26": 3710.0, + "kl_loss_39": 2469.2, + "kl_loss_7": 5490.4, + "learning_rate": 0.0008345653031794292, + "loss": 8382.9, + "step": 2740 + }, + { + "ce_loss_13": 3.8110480844974517, + "ce_loss_26": 3.2820364594459535, + "ce_loss_39": 2.7046351432800293, + "ce_loss_52": 1.4519646763801575, + "ce_loss_7": 4.134507310390473, + "epoch": 0.275, + "grad_norm": 21.943498417005777, + "kl_loss_13": 4812.0, + "kl_loss_26": 3713.6, + "kl_loss_39": 2499.8, + "kl_loss_7": 5488.0, + "learning_rate": 0.0008333845002581458, + "loss": 8287.2, + "step": 2750 + }, + { + "ce_loss_13": 3.822121250629425, + "ce_loss_26": 3.3007264256477358, + "ce_loss_39": 2.73195458650589, + "ce_loss_52": 1.4655994832515717, + "ce_loss_7": 4.141650629043579, + "epoch": 0.276, + "grad_norm": 22.59740270522763, + "kl_loss_13": 4832.8, + "kl_loss_26": 3745.6, + "kl_loss_39": 2538.4, + "kl_loss_7": 5500.0, + "learning_rate": 0.0008322003401586462, + "loss": 8283.1, + "step": 2760 + }, + { + "ce_loss_13": 3.726576977968216, + "ce_loss_26": 3.214203953742981, + "ce_loss_39": 2.669701686501503, + "ce_loss_52": 1.4409982591867447, + "ce_loss_7": 4.043469870090485, + "epoch": 0.277, + "grad_norm": 21.384417928568727, + "kl_loss_13": 4712.0, + "kl_loss_26": 3643.6, + "kl_loss_39": 2461.0, + "kl_loss_7": 5379.2, + "learning_rate": 0.0008310128348054094, + "loss": 8251.4, + "step": 2770 + }, + { + "ce_loss_13": 3.768916404247284, + "ce_loss_26": 3.2343318104743957, + "ce_loss_39": 2.6558803230524064, + "ce_loss_52": 1.4221897169947624, + "ce_loss_7": 4.097134619951248, + "epoch": 0.278, + "grad_norm": 21.8508758307847, + "kl_loss_13": 4846.4, + "kl_loss_26": 3737.6, + "kl_loss_39": 2492.4, + "kl_loss_7": 5534.4, + "learning_rate": 0.0008298219961566008, + "loss": 8264.2, + "step": 2780 + }, + { + "ce_loss_13": 3.73385471701622, + "ce_loss_26": 3.216676640510559, + "ce_loss_39": 2.634915125370026, + "ce_loss_52": 1.4022609382867812, + "ce_loss_7": 4.067033034563065, + "epoch": 0.279, + "grad_norm": 22.23449381616188, + "kl_loss_13": 4806.0, + "kl_loss_26": 3708.8, + "kl_loss_39": 2479.2, + "kl_loss_7": 5499.2, + "learning_rate": 0.0008286278362039527, + "loss": 8184.2, + "step": 2790 + }, + { + "ce_loss_13": 3.756936568021774, + "ce_loss_26": 3.2383838176727293, + "ce_loss_39": 2.672528338432312, + "ce_loss_52": 1.452534568309784, + "ce_loss_7": 4.076909917593002, + "epoch": 0.28, + "grad_norm": 21.54056853237845, + "kl_loss_13": 4743.2, + "kl_loss_26": 3661.6, + "kl_loss_39": 2456.0, + "kl_loss_7": 5414.4, + "learning_rate": 0.0008274303669726426, + "loss": 8160.7, + "step": 2800 + }, + { + "ce_loss_13": 3.8688619792461396, + "ce_loss_26": 3.3306061148643495, + "ce_loss_39": 2.7420520305633547, + "ce_loss_52": 1.4561516880989074, + "ce_loss_7": 4.191312706470489, + "epoch": 0.281, + "grad_norm": 23.01011471220724, + "kl_loss_13": 4962.4, + "kl_loss_26": 3836.4, + "kl_loss_39": 2581.6, + "kl_loss_7": 5640.8, + "learning_rate": 0.0008262296005211721, + "loss": 8239.5, + "step": 2810 + }, + { + "ce_loss_13": 3.7579640209674836, + "ce_loss_26": 3.2256029903888703, + "ce_loss_39": 2.650630474090576, + "ce_loss_52": 1.4400919079780579, + "ce_loss_7": 4.077768385410309, + "epoch": 0.282, + "grad_norm": 21.557554897738267, + "kl_loss_13": 4784.8, + "kl_loss_26": 3661.6, + "kl_loss_39": 2435.2, + "kl_loss_7": 5454.4, + "learning_rate": 0.0008250255489412463, + "loss": 8218.5, + "step": 2820 + }, + { + "ce_loss_13": 3.7878367722034456, + "ce_loss_26": 3.255573272705078, + "ce_loss_39": 2.667675232887268, + "ce_loss_52": 1.4289155021309852, + "ce_loss_7": 4.111210036277771, + "epoch": 0.283, + "grad_norm": 22.099755132556425, + "kl_loss_13": 4851.2, + "kl_loss_26": 3733.6, + "kl_loss_39": 2480.6, + "kl_loss_7": 5527.2, + "learning_rate": 0.0008238182243576511, + "loss": 8152.9, + "step": 2830 + }, + { + "ce_loss_13": 3.7699286341667175, + "ce_loss_26": 3.2401221811771395, + "ce_loss_39": 2.6614575743675233, + "ce_loss_52": 1.4347774118185044, + "ce_loss_7": 4.08802090883255, + "epoch": 0.284, + "grad_norm": 21.441617328301042, + "kl_loss_13": 4791.6, + "kl_loss_26": 3695.6, + "kl_loss_39": 2469.8, + "kl_loss_7": 5453.6, + "learning_rate": 0.0008226076389281315, + "loss": 8141.7, + "step": 2840 + }, + { + "ce_loss_13": 3.692534440755844, + "ce_loss_26": 3.174397534132004, + "ce_loss_39": 2.6222778260707855, + "ce_loss_52": 1.4332606226205826, + "ce_loss_7": 4.00234357714653, + "epoch": 0.285, + "grad_norm": 23.306650885126444, + "kl_loss_13": 4633.6, + "kl_loss_26": 3560.0, + "kl_loss_39": 2376.6, + "kl_loss_7": 5279.2, + "learning_rate": 0.0008213938048432696, + "loss": 8068.6, + "step": 2850 + }, + { + "ce_loss_13": 3.6946506440639495, + "ce_loss_26": 3.1700410664081575, + "ce_loss_39": 2.597234898805618, + "ce_loss_52": 1.4046493530273438, + "ce_loss_7": 4.0274644792079926, + "epoch": 0.286, + "grad_norm": 21.879949646782595, + "kl_loss_13": 4721.6, + "kl_loss_26": 3628.8, + "kl_loss_39": 2400.4, + "kl_loss_7": 5412.8, + "learning_rate": 0.0008201767343263612, + "loss": 8086.6, + "step": 2860 + }, + { + "ce_loss_13": 3.7227329850196837, + "ce_loss_26": 3.200405848026276, + "ce_loss_39": 2.637904042005539, + "ce_loss_52": 1.422918725013733, + "ce_loss_7": 4.043233323097229, + "epoch": 0.287, + "grad_norm": 24.428095636864317, + "kl_loss_13": 4732.0, + "kl_loss_26": 3643.6, + "kl_loss_39": 2433.8, + "kl_loss_7": 5399.2, + "learning_rate": 0.0008189564396332927, + "loss": 8066.0, + "step": 2870 + }, + { + "ce_loss_13": 3.721643441915512, + "ce_loss_26": 3.185835379362106, + "ce_loss_39": 2.6226376593112946, + "ce_loss_52": 1.4448419839143753, + "ce_loss_7": 4.039817118644715, + "epoch": 0.288, + "grad_norm": 22.93644669160459, + "kl_loss_13": 4683.2, + "kl_loss_26": 3561.2, + "kl_loss_39": 2345.2, + "kl_loss_7": 5352.8, + "learning_rate": 0.0008177329330524181, + "loss": 8090.5, + "step": 2880 + }, + { + "ce_loss_13": 3.732273721694946, + "ce_loss_26": 3.217743480205536, + "ce_loss_39": 2.656236010789871, + "ce_loss_52": 1.4405113011598587, + "ce_loss_7": 4.046578335762024, + "epoch": 0.289, + "grad_norm": 22.27500685105708, + "kl_loss_13": 4702.0, + "kl_loss_26": 3629.6, + "kl_loss_39": 2416.0, + "kl_loss_7": 5360.8, + "learning_rate": 0.0008165062269044352, + "loss": 8083.7, + "step": 2890 + }, + { + "ce_loss_13": 3.7308314204216004, + "ce_loss_26": 3.2038592040538787, + "ce_loss_39": 2.6387904793024064, + "ce_loss_52": 1.44214668571949, + "ce_loss_7": 4.04458264708519, + "epoch": 0.29, + "grad_norm": 22.45358727511497, + "kl_loss_13": 4700.4, + "kl_loss_26": 3605.2, + "kl_loss_39": 2398.4, + "kl_loss_7": 5358.4, + "learning_rate": 0.0008152763335422613, + "loss": 8063.0, + "step": 2900 + }, + { + "ce_loss_13": 3.6699211478233336, + "ce_loss_26": 3.153660440444946, + "ce_loss_39": 2.5951134085655214, + "ce_loss_52": 1.4221089735627175, + "ce_loss_7": 3.978937405347824, + "epoch": 0.291, + "grad_norm": 23.44237828375525, + "kl_loss_13": 4614.0, + "kl_loss_26": 3534.4, + "kl_loss_39": 2341.4, + "kl_loss_7": 5261.6, + "learning_rate": 0.0008140432653509088, + "loss": 8001.3, + "step": 2910 + }, + { + "ce_loss_13": 3.6406539916992187, + "ce_loss_26": 3.1216741025447847, + "ce_loss_39": 2.5570163398981096, + "ce_loss_52": 1.39638482183218, + "ce_loss_7": 3.9557625532150267, + "epoch": 0.292, + "grad_norm": 21.187460458215387, + "kl_loss_13": 4592.8, + "kl_loss_26": 3516.8, + "kl_loss_39": 2326.8, + "kl_loss_7": 5254.0, + "learning_rate": 0.0008128070347473608, + "loss": 7966.5, + "step": 2920 + }, + { + "ce_loss_13": 3.665645903348923, + "ce_loss_26": 3.149553042650223, + "ce_loss_39": 2.58870205283165, + "ce_loss_52": 1.4208435118198395, + "ce_loss_7": 3.9827320516109466, + "epoch": 0.293, + "grad_norm": 21.300592787802476, + "kl_loss_13": 4619.6, + "kl_loss_26": 3530.8, + "kl_loss_39": 2336.0, + "kl_loss_7": 5284.0, + "learning_rate": 0.0008115676541804455, + "loss": 7990.7, + "step": 2930 + }, + { + "ce_loss_13": 3.6261947989463805, + "ce_loss_26": 3.1126498699188234, + "ce_loss_39": 2.5498824626207353, + "ce_loss_52": 1.3916691318154335, + "ce_loss_7": 3.9482949018478393, + "epoch": 0.294, + "grad_norm": 21.8788417242541, + "kl_loss_13": 4598.0, + "kl_loss_26": 3513.2, + "kl_loss_39": 2317.4, + "kl_loss_7": 5269.6, + "learning_rate": 0.0008103251361307119, + "loss": 7972.2, + "step": 2940 + }, + { + "ce_loss_13": 3.6617009818553923, + "ce_loss_26": 3.134212648868561, + "ce_loss_39": 2.569432234764099, + "ce_loss_52": 1.4306001305580138, + "ce_loss_7": 3.978475254774094, + "epoch": 0.295, + "grad_norm": 21.383257731886584, + "kl_loss_13": 4584.8, + "kl_loss_26": 3486.8, + "kl_loss_39": 2289.6, + "kl_loss_7": 5252.0, + "learning_rate": 0.0008090794931103026, + "loss": 7903.9, + "step": 2950 + }, + { + "ce_loss_13": 3.674825745820999, + "ce_loss_26": 3.1651513874530792, + "ce_loss_39": 2.605163484811783, + "ce_loss_52": 1.434949815273285, + "ce_loss_7": 3.988205587863922, + "epoch": 0.296, + "grad_norm": 21.87171120261939, + "kl_loss_13": 4585.6, + "kl_loss_26": 3519.2, + "kl_loss_39": 2327.0, + "kl_loss_7": 5246.4, + "learning_rate": 0.0008078307376628291, + "loss": 7903.2, + "step": 2960 + }, + { + "ce_loss_13": 3.6504483819007874, + "ce_loss_26": 3.1346111416816713, + "ce_loss_39": 2.5827776730060577, + "ce_loss_52": 1.4189698368310928, + "ce_loss_7": 3.9625262200832365, + "epoch": 0.297, + "grad_norm": 23.048467847326563, + "kl_loss_13": 4571.2, + "kl_loss_26": 3498.4, + "kl_loss_39": 2319.0, + "kl_loss_7": 5224.0, + "learning_rate": 0.000806578882363245, + "loss": 7901.6, + "step": 2970 + }, + { + "ce_loss_13": 3.655070722103119, + "ce_loss_26": 3.136745995283127, + "ce_loss_39": 2.5604557782411574, + "ce_loss_52": 1.401164847612381, + "ce_loss_7": 3.973217171430588, + "epoch": 0.298, + "grad_norm": 21.078263907370157, + "kl_loss_13": 4614.4, + "kl_loss_26": 3538.4, + "kl_loss_39": 2311.4, + "kl_loss_7": 5287.2, + "learning_rate": 0.0008053239398177191, + "loss": 7911.8, + "step": 2980 + }, + { + "ce_loss_13": 3.6555157959461213, + "ce_loss_26": 3.13962464928627, + "ce_loss_39": 2.5709628492593763, + "ce_loss_52": 1.4242349237203598, + "ce_loss_7": 3.9756637513637543, + "epoch": 0.299, + "grad_norm": 22.608350182138345, + "kl_loss_13": 4603.2, + "kl_loss_26": 3520.4, + "kl_loss_39": 2304.0, + "kl_loss_7": 5274.4, + "learning_rate": 0.0008040659226635089, + "loss": 7892.4, + "step": 2990 + }, + { + "ce_loss_13": 3.6532657563686373, + "ce_loss_26": 3.124306696653366, + "ce_loss_39": 2.557386627793312, + "ce_loss_52": 1.402693158388138, + "ce_loss_7": 3.9695385217666628, + "epoch": 0.3, + "grad_norm": 22.376822604204033, + "kl_loss_13": 4616.4, + "kl_loss_26": 3523.6, + "kl_loss_39": 2321.8, + "kl_loss_7": 5287.2, + "learning_rate": 0.0008028048435688333, + "loss": 7820.7, + "step": 3000 + }, + { + "ce_loss_13": 3.6811940252780913, + "ce_loss_26": 3.1677908301353455, + "ce_loss_39": 2.607375094294548, + "ce_loss_52": 1.4560914367437363, + "ce_loss_7": 3.9915607273578644, + "epoch": 0.301, + "grad_norm": 21.84188714128543, + "kl_loss_13": 4604.8, + "kl_loss_26": 3538.4, + "kl_loss_39": 2343.2, + "kl_loss_7": 5256.0, + "learning_rate": 0.0008015407152327448, + "loss": 7933.0, + "step": 3010 + }, + { + "ce_loss_13": 3.737543153762817, + "ce_loss_26": 3.2195757627487183, + "ce_loss_39": 2.6466131448745727, + "ce_loss_52": 1.4449012607336045, + "ce_loss_7": 4.052252840995789, + "epoch": 0.302, + "grad_norm": 22.34664686545947, + "kl_loss_13": 4700.8, + "kl_loss_26": 3618.8, + "kl_loss_39": 2394.4, + "kl_loss_7": 5358.4, + "learning_rate": 0.0008002735503850016, + "loss": 7844.2, + "step": 3020 + }, + { + "ce_loss_13": 3.6698498368263244, + "ce_loss_26": 3.155323106050491, + "ce_loss_39": 2.579972979426384, + "ce_loss_52": 1.4486516952514648, + "ce_loss_7": 3.9814261555671693, + "epoch": 0.303, + "grad_norm": 22.316774640404955, + "kl_loss_13": 4563.2, + "kl_loss_26": 3486.8, + "kl_loss_39": 2285.2, + "kl_loss_7": 5213.6, + "learning_rate": 0.0007990033617859396, + "loss": 7844.3, + "step": 3030 + }, + { + "ce_loss_13": 3.661813771724701, + "ce_loss_26": 3.1450955271720886, + "ce_loss_39": 2.5806987404823305, + "ce_loss_52": 1.4304928302764892, + "ce_loss_7": 3.9737633407115935, + "epoch": 0.304, + "grad_norm": 22.094854051528255, + "kl_loss_13": 4595.2, + "kl_loss_26": 3519.2, + "kl_loss_39": 2321.2, + "kl_loss_7": 5248.0, + "learning_rate": 0.000797730162226344, + "loss": 7813.7, + "step": 3040 + }, + { + "ce_loss_13": 3.6036822319030763, + "ce_loss_26": 3.0875354915857316, + "ce_loss_39": 2.5262934505939483, + "ce_loss_52": 1.3893155947327613, + "ce_loss_7": 3.921951335668564, + "epoch": 0.305, + "grad_norm": 22.896126437016644, + "kl_loss_13": 4538.0, + "kl_loss_26": 3453.6, + "kl_loss_39": 2252.6, + "kl_loss_7": 5210.4, + "learning_rate": 0.0007964539645273203, + "loss": 7783.3, + "step": 3050 + }, + { + "ce_loss_13": 3.690790832042694, + "ce_loss_26": 3.1806884586811064, + "ce_loss_39": 2.6345931828022002, + "ce_loss_52": 1.4827970415353775, + "ce_loss_7": 3.996461832523346, + "epoch": 0.306, + "grad_norm": 22.12157409866164, + "kl_loss_13": 4558.4, + "kl_loss_26": 3495.6, + "kl_loss_39": 2322.2, + "kl_loss_7": 5202.4, + "learning_rate": 0.000795174781540165, + "loss": 7798.9, + "step": 3060 + }, + { + "ce_loss_13": 3.6345800876617433, + "ce_loss_26": 3.126866352558136, + "ce_loss_39": 2.5723444908857345, + "ce_loss_52": 1.4505648389458656, + "ce_loss_7": 3.938809943199158, + "epoch": 0.307, + "grad_norm": 21.67276371006888, + "kl_loss_13": 4502.8, + "kl_loss_26": 3435.6, + "kl_loss_39": 2259.2, + "kl_loss_7": 5140.8, + "learning_rate": 0.0007938926261462366, + "loss": 7786.2, + "step": 3070 + }, + { + "ce_loss_13": 3.6539651334285734, + "ce_loss_26": 3.136826354265213, + "ce_loss_39": 2.5686775982379912, + "ce_loss_52": 1.4312876760959625, + "ce_loss_7": 3.9670185923576353, + "epoch": 0.308, + "grad_norm": 23.264906723037097, + "kl_loss_13": 4571.6, + "kl_loss_26": 3495.6, + "kl_loss_39": 2289.8, + "kl_loss_7": 5231.2, + "learning_rate": 0.0007926075112568258, + "loss": 7773.0, + "step": 3080 + }, + { + "ce_loss_13": 3.6424070239067077, + "ce_loss_26": 3.13111692070961, + "ce_loss_39": 2.5661837816238404, + "ce_loss_52": 1.4395585834980011, + "ce_loss_7": 3.9482239544391633, + "epoch": 0.309, + "grad_norm": 22.051447045868983, + "kl_loss_13": 4540.0, + "kl_loss_26": 3467.2, + "kl_loss_39": 2265.6, + "kl_loss_7": 5186.4, + "learning_rate": 0.0007913194498130252, + "loss": 7730.0, + "step": 3090 + }, + { + "ce_loss_13": 3.6187573671340942, + "ce_loss_26": 3.110442912578583, + "ce_loss_39": 2.553048479557037, + "ce_loss_52": 1.4316339492797852, + "ce_loss_7": 3.92513769865036, + "epoch": 0.31, + "grad_norm": 22.041241368180156, + "kl_loss_13": 4504.8, + "kl_loss_26": 3436.0, + "kl_loss_39": 2242.0, + "kl_loss_7": 5140.0, + "learning_rate": 0.0007900284547855992, + "loss": 7742.0, + "step": 3100 + }, + { + "ce_loss_13": 3.6639523029327394, + "ce_loss_26": 3.1580884575843813, + "ce_loss_39": 2.5835469484329225, + "ce_loss_52": 1.4442215472459794, + "ce_loss_7": 3.976783311367035, + "epoch": 0.311, + "grad_norm": 20.880619592237565, + "kl_loss_13": 4592.0, + "kl_loss_26": 3526.4, + "kl_loss_39": 2312.4, + "kl_loss_7": 5244.8, + "learning_rate": 0.0007887345391748532, + "loss": 7735.3, + "step": 3110 + }, + { + "ce_loss_13": 3.6283124804496767, + "ce_loss_26": 3.113315612077713, + "ce_loss_39": 2.547743684053421, + "ce_loss_52": 1.42053325176239, + "ce_loss_7": 3.9331269919872285, + "epoch": 0.312, + "grad_norm": 22.15121587945531, + "kl_loss_13": 4543.2, + "kl_loss_26": 3463.2, + "kl_loss_39": 2267.8, + "kl_loss_7": 5184.8, + "learning_rate": 0.0007874377160105036, + "loss": 7729.4, + "step": 3120 + }, + { + "ce_loss_13": 3.6399169504642486, + "ce_loss_26": 3.135877913236618, + "ce_loss_39": 2.5725889205932617, + "ce_loss_52": 1.4427233994007111, + "ce_loss_7": 3.9606189668178557, + "epoch": 0.313, + "grad_norm": 21.87500401487531, + "kl_loss_13": 4563.2, + "kl_loss_26": 3490.0, + "kl_loss_39": 2276.4, + "kl_loss_7": 5228.0, + "learning_rate": 0.0007861379983515449, + "loss": 7710.9, + "step": 3130 + }, + { + "ce_loss_13": 3.634021121263504, + "ce_loss_26": 3.111995500326157, + "ce_loss_39": 2.5583092838525774, + "ce_loss_52": 1.4399698421359062, + "ce_loss_7": 3.94167400598526, + "epoch": 0.314, + "grad_norm": 22.854565496538875, + "kl_loss_13": 4504.4, + "kl_loss_26": 3415.2, + "kl_loss_39": 2230.6, + "kl_loss_7": 5153.6, + "learning_rate": 0.0007848353992861195, + "loss": 7710.3, + "step": 3140 + }, + { + "ce_loss_13": 3.6272457361221315, + "ce_loss_26": 3.116968184709549, + "ce_loss_39": 2.551611191034317, + "ce_loss_52": 1.437747061252594, + "ce_loss_7": 3.9388325929641725, + "epoch": 0.315, + "grad_norm": 21.84748688614269, + "kl_loss_13": 4498.8, + "kl_loss_26": 3427.6, + "kl_loss_39": 2231.6, + "kl_loss_7": 5142.4, + "learning_rate": 0.0007835299319313853, + "loss": 7607.0, + "step": 3150 + }, + { + "ce_loss_13": 3.613277268409729, + "ce_loss_26": 3.0916620969772337, + "ce_loss_39": 2.5186130821704866, + "ce_loss_52": 1.3888636380434036, + "ce_loss_7": 3.935783725976944, + "epoch": 0.316, + "grad_norm": 21.933561198317395, + "kl_loss_13": 4519.2, + "kl_loss_26": 3438.8, + "kl_loss_39": 2232.0, + "kl_loss_7": 5189.6, + "learning_rate": 0.0007822216094333848, + "loss": 7650.0, + "step": 3160 + }, + { + "ce_loss_13": 3.658072179555893, + "ce_loss_26": 3.1417903542518615, + "ce_loss_39": 2.577219474315643, + "ce_loss_52": 1.437073315680027, + "ce_loss_7": 3.970378410816193, + "epoch": 0.317, + "grad_norm": 22.034139537965903, + "kl_loss_13": 4566.4, + "kl_loss_26": 3493.2, + "kl_loss_39": 2301.8, + "kl_loss_7": 5224.0, + "learning_rate": 0.0007809104449669101, + "loss": 7644.7, + "step": 3170 + }, + { + "ce_loss_13": 3.593963289260864, + "ce_loss_26": 3.080548882484436, + "ce_loss_39": 2.5262903541326525, + "ce_loss_52": 1.4362893968820571, + "ce_loss_7": 3.8962223708629606, + "epoch": 0.318, + "grad_norm": 22.12833658126749, + "kl_loss_13": 4417.6, + "kl_loss_26": 3353.2, + "kl_loss_39": 2169.6, + "kl_loss_7": 5054.4, + "learning_rate": 0.0007795964517353734, + "loss": 7580.1, + "step": 3180 + }, + { + "ce_loss_13": 3.639219433069229, + "ce_loss_26": 3.126335847377777, + "ce_loss_39": 2.5598012149333953, + "ce_loss_52": 1.4479554057121278, + "ce_loss_7": 3.955880182981491, + "epoch": 0.319, + "grad_norm": 21.421584248628356, + "kl_loss_13": 4524.8, + "kl_loss_26": 3445.2, + "kl_loss_39": 2238.4, + "kl_loss_7": 5180.8, + "learning_rate": 0.000778279642970672, + "loss": 7577.4, + "step": 3190 + }, + { + "ce_loss_13": 3.593672776222229, + "ce_loss_26": 3.076059252023697, + "ce_loss_39": 2.5206238448619844, + "ce_loss_52": 1.4138888984918594, + "ce_loss_7": 3.898124760389328, + "epoch": 0.32, + "grad_norm": 23.27138036145762, + "kl_loss_13": 4477.6, + "kl_loss_26": 3400.0, + "kl_loss_39": 2214.8, + "kl_loss_7": 5123.2, + "learning_rate": 0.0007769600319330552, + "loss": 7595.6, + "step": 3200 + }, + { + "ce_loss_13": 3.6573951125144957, + "ce_loss_26": 3.166206729412079, + "ce_loss_39": 2.6105258047580717, + "ce_loss_52": 1.47857309281826, + "ce_loss_7": 3.9568731427192687, + "epoch": 0.321, + "grad_norm": 21.35600054948774, + "kl_loss_13": 4470.4, + "kl_loss_26": 3434.4, + "kl_loss_39": 2255.6, + "kl_loss_7": 5100.8, + "learning_rate": 0.0007756376319109917, + "loss": 7610.9, + "step": 3210 + }, + { + "ce_loss_13": 3.619207721948624, + "ce_loss_26": 3.112484961748123, + "ce_loss_39": 2.5595098197460175, + "ce_loss_52": 1.442267394065857, + "ce_loss_7": 3.9296476364135744, + "epoch": 0.322, + "grad_norm": 21.117056892906756, + "kl_loss_13": 4453.6, + "kl_loss_26": 3396.0, + "kl_loss_39": 2218.0, + "kl_loss_7": 5104.8, + "learning_rate": 0.0007743124562210351, + "loss": 7569.7, + "step": 3220 + }, + { + "ce_loss_13": 3.612431305646896, + "ce_loss_26": 3.104649418592453, + "ce_loss_39": 2.5444509416818617, + "ce_loss_52": 1.4610149055719375, + "ce_loss_7": 3.9223886907100676, + "epoch": 0.323, + "grad_norm": 22.510814919939268, + "kl_loss_13": 4408.0, + "kl_loss_26": 3338.4, + "kl_loss_39": 2155.4, + "kl_loss_7": 5054.4, + "learning_rate": 0.0007729845182076895, + "loss": 7565.6, + "step": 3230 + }, + { + "ce_loss_13": 3.5650066912174223, + "ce_loss_26": 3.060505121946335, + "ce_loss_39": 2.5127211630344393, + "ce_loss_52": 1.445562407374382, + "ce_loss_7": 3.8779995679855346, + "epoch": 0.324, + "grad_norm": 24.007681143469355, + "kl_loss_13": 4388.4, + "kl_loss_26": 3323.6, + "kl_loss_39": 2150.2, + "kl_loss_7": 5044.0, + "learning_rate": 0.0007716538312432765, + "loss": 7556.0, + "step": 3240 + }, + { + "ce_loss_13": 3.5737381398677828, + "ce_loss_26": 3.0725920855998994, + "ce_loss_39": 2.5134449005126953, + "ce_loss_52": 1.4138619631528855, + "ce_loss_7": 3.8855117499828338, + "epoch": 0.325, + "grad_norm": 22.203629206824775, + "kl_loss_13": 4430.8, + "kl_loss_26": 3379.6, + "kl_loss_39": 2197.4, + "kl_loss_7": 5081.6, + "learning_rate": 0.0007703204087277988, + "loss": 7530.7, + "step": 3250 + }, + { + "ce_loss_13": 3.5474561214447022, + "ce_loss_26": 3.037938302755356, + "ce_loss_39": 2.477022570371628, + "ce_loss_52": 1.3884405881166457, + "ce_loss_7": 3.85917187333107, + "epoch": 0.326, + "grad_norm": 21.98291246151193, + "kl_loss_13": 4437.6, + "kl_loss_26": 3371.6, + "kl_loss_39": 2176.2, + "kl_loss_7": 5089.6, + "learning_rate": 0.0007689842640888063, + "loss": 7519.3, + "step": 3260 + }, + { + "ce_loss_13": 3.6053310513496397, + "ce_loss_26": 3.0936122059822084, + "ce_loss_39": 2.5414693653583527, + "ce_loss_52": 1.4531731829047203, + "ce_loss_7": 3.913253253698349, + "epoch": 0.327, + "grad_norm": 22.418707773974628, + "kl_loss_13": 4430.8, + "kl_loss_26": 3360.0, + "kl_loss_39": 2184.0, + "kl_loss_7": 5068.4, + "learning_rate": 0.0007676454107812607, + "loss": 7473.1, + "step": 3270 + }, + { + "ce_loss_13": 3.545606768131256, + "ce_loss_26": 3.0480951845645903, + "ce_loss_39": 2.499777999520302, + "ce_loss_52": 1.4314876705408097, + "ce_loss_7": 3.849948841333389, + "epoch": 0.328, + "grad_norm": 22.389500426390892, + "kl_loss_13": 4402.4, + "kl_loss_26": 3351.6, + "kl_loss_39": 2160.2, + "kl_loss_7": 5035.2, + "learning_rate": 0.0007663038622873999, + "loss": 7510.3, + "step": 3280 + }, + { + "ce_loss_13": 3.6383297204971314, + "ce_loss_26": 3.127288430929184, + "ce_loss_39": 2.561781680583954, + "ce_loss_52": 1.4617935866117477, + "ce_loss_7": 3.9572836577892305, + "epoch": 0.329, + "grad_norm": 23.105081611165705, + "kl_loss_13": 4472.4, + "kl_loss_26": 3396.8, + "kl_loss_39": 2202.8, + "kl_loss_7": 5139.2, + "learning_rate": 0.0007649596321166025, + "loss": 7473.8, + "step": 3290 + }, + { + "ce_loss_13": 3.5131199419498444, + "ce_loss_26": 3.0129006803035736, + "ce_loss_39": 2.4699264496564863, + "ce_loss_52": 1.4362694859504699, + "ce_loss_7": 3.8114282488822937, + "epoch": 0.33, + "grad_norm": 22.77133190654901, + "kl_loss_13": 4268.0, + "kl_loss_26": 3223.6, + "kl_loss_39": 2069.4, + "kl_loss_7": 4882.4, + "learning_rate": 0.0007636127338052513, + "loss": 7443.1, + "step": 3300 + }, + { + "ce_loss_13": 3.5485077798366547, + "ce_loss_26": 3.0334243774414062, + "ce_loss_39": 2.471779704093933, + "ce_loss_52": 1.4008762776851653, + "ce_loss_7": 3.860434752702713, + "epoch": 0.331, + "grad_norm": 22.94544302407564, + "kl_loss_13": 4425.6, + "kl_loss_26": 3344.4, + "kl_loss_39": 2154.8, + "kl_loss_7": 5076.8, + "learning_rate": 0.0007622631809165971, + "loss": 7403.2, + "step": 3310 + }, + { + "ce_loss_13": 3.611973536014557, + "ce_loss_26": 3.1062149882316588, + "ce_loss_39": 2.5540910184383394, + "ce_loss_52": 1.4812486261129378, + "ce_loss_7": 3.913082367181778, + "epoch": 0.332, + "grad_norm": 21.844212510164496, + "kl_loss_13": 4407.2, + "kl_loss_26": 3357.6, + "kl_loss_39": 2168.2, + "kl_loss_7": 5035.2, + "learning_rate": 0.000760910987040623, + "loss": 7436.1, + "step": 3320 + }, + { + "ce_loss_13": 3.500666618347168, + "ce_loss_26": 2.992197906970978, + "ce_loss_39": 2.443836176395416, + "ce_loss_52": 1.4172912210226059, + "ce_loss_7": 3.809192955493927, + "epoch": 0.333, + "grad_norm": 22.341971135877618, + "kl_loss_13": 4287.2, + "kl_loss_26": 3229.2, + "kl_loss_39": 2050.8, + "kl_loss_7": 4934.4, + "learning_rate": 0.000759556165793906, + "loss": 7354.8, + "step": 3330 + }, + { + "ce_loss_13": 3.572561663389206, + "ce_loss_26": 3.0666925728321077, + "ce_loss_39": 2.5258089125156404, + "ce_loss_52": 1.4658059388399125, + "ce_loss_7": 3.8703009307384493, + "epoch": 0.334, + "grad_norm": 20.585398734523825, + "kl_loss_13": 4346.0, + "kl_loss_26": 3292.8, + "kl_loss_39": 2121.6, + "kl_loss_7": 4966.4, + "learning_rate": 0.000758198730819481, + "loss": 7376.9, + "step": 3340 + }, + { + "ce_loss_13": 3.5921706318855287, + "ce_loss_26": 3.0865486025810243, + "ce_loss_39": 2.5250521272420885, + "ce_loss_52": 1.4276385620236396, + "ce_loss_7": 3.9076746106147766, + "epoch": 0.335, + "grad_norm": 22.48300338159267, + "kl_loss_13": 4447.2, + "kl_loss_26": 3387.2, + "kl_loss_39": 2194.8, + "kl_loss_7": 5095.2, + "learning_rate": 0.0007568386957867032, + "loss": 7407.2, + "step": 3350 + }, + { + "ce_loss_13": 3.5395087361335755, + "ce_loss_26": 3.0476285994052885, + "ce_loss_39": 2.5039754688739775, + "ce_loss_52": 1.4519936561584472, + "ce_loss_7": 3.8360753774642946, + "epoch": 0.336, + "grad_norm": 22.282680621594952, + "kl_loss_13": 4315.6, + "kl_loss_26": 3284.8, + "kl_loss_39": 2115.8, + "kl_loss_7": 4935.2, + "learning_rate": 0.0007554760743911103, + "loss": 7349.9, + "step": 3360 + }, + { + "ce_loss_13": 3.5341054499149323, + "ce_loss_26": 3.0255552768707275, + "ce_loss_39": 2.4795517563819884, + "ce_loss_52": 1.43424501568079, + "ce_loss_7": 3.8406670331954955, + "epoch": 0.337, + "grad_norm": 21.82655281544531, + "kl_loss_13": 4317.2, + "kl_loss_26": 3248.8, + "kl_loss_39": 2081.8, + "kl_loss_7": 4959.2, + "learning_rate": 0.0007541108803542846, + "loss": 7352.9, + "step": 3370 + }, + { + "ce_loss_13": 3.571430027484894, + "ce_loss_26": 3.0674545526504517, + "ce_loss_39": 2.5179436981678007, + "ce_loss_52": 1.4572079569101333, + "ce_loss_7": 3.8665721654891967, + "epoch": 0.338, + "grad_norm": 20.283540782731166, + "kl_loss_13": 4343.6, + "kl_loss_26": 3291.2, + "kl_loss_39": 2129.4, + "kl_loss_7": 4966.4, + "learning_rate": 0.0007527431274237149, + "loss": 7371.7, + "step": 3380 + }, + { + "ce_loss_13": 3.53710196018219, + "ce_loss_26": 3.0354479968547823, + "ce_loss_39": 2.4885441571474076, + "ce_loss_52": 1.4400692582130432, + "ce_loss_7": 3.8393473029136658, + "epoch": 0.339, + "grad_norm": 21.387970982998613, + "kl_loss_13": 4311.6, + "kl_loss_26": 3256.4, + "kl_loss_39": 2099.8, + "kl_loss_7": 4936.0, + "learning_rate": 0.0007513728293726579, + "loss": 7294.4, + "step": 3390 + }, + { + "ce_loss_13": 3.523770880699158, + "ce_loss_26": 3.0170785784721375, + "ce_loss_39": 2.4583947211503983, + "ce_loss_52": 1.43256463855505, + "ce_loss_7": 3.822116255760193, + "epoch": 0.34, + "grad_norm": 21.22690089148789, + "kl_loss_13": 4311.6, + "kl_loss_26": 3260.0, + "kl_loss_39": 2066.6, + "kl_loss_7": 4940.0, + "learning_rate": 0.00075, + "loss": 7289.9, + "step": 3400 + }, + { + "ce_loss_13": 3.495673859119415, + "ce_loss_26": 2.9982276618480683, + "ce_loss_39": 2.444023036956787, + "ce_loss_52": 1.4111162751913071, + "ce_loss_7": 3.805855029821396, + "epoch": 0.341, + "grad_norm": 20.68401947266934, + "kl_loss_13": 4286.0, + "kl_loss_26": 3244.8, + "kl_loss_39": 2061.6, + "kl_loss_7": 4924.4, + "learning_rate": 0.0007486246531301177, + "loss": 7295.1, + "step": 3410 + }, + { + "ce_loss_13": 3.532993698120117, + "ce_loss_26": 3.0362183272838594, + "ce_loss_39": 2.485447385907173, + "ce_loss_52": 1.4526691198349, + "ce_loss_7": 3.830386519432068, + "epoch": 0.342, + "grad_norm": 22.222401081185772, + "kl_loss_13": 4299.6, + "kl_loss_26": 3249.2, + "kl_loss_39": 2080.4, + "kl_loss_7": 4928.0, + "learning_rate": 0.0007472468026127384, + "loss": 7341.7, + "step": 3420 + }, + { + "ce_loss_13": 3.463904342055321, + "ce_loss_26": 2.9640887469053268, + "ce_loss_39": 2.421750417351723, + "ce_loss_52": 1.4137367144227029, + "ce_loss_7": 3.7643026977777483, + "epoch": 0.343, + "grad_norm": 21.63494224797145, + "kl_loss_13": 4250.8, + "kl_loss_26": 3196.2, + "kl_loss_39": 2039.9, + "kl_loss_7": 4881.6, + "learning_rate": 0.000745866462322802, + "loss": 7230.95, + "step": 3430 + }, + { + "ce_loss_13": 3.5996195137500764, + "ce_loss_26": 3.093309980630875, + "ce_loss_39": 2.5379314005374907, + "ce_loss_52": 1.4943716078996658, + "ce_loss_7": 3.899878454208374, + "epoch": 0.344, + "grad_norm": 23.12022506478991, + "kl_loss_13": 4332.0, + "kl_loss_26": 3268.0, + "kl_loss_39": 2085.8, + "kl_loss_7": 4968.0, + "learning_rate": 0.0007444836461603195, + "loss": 7294.5, + "step": 3440 + }, + { + "ce_loss_13": 3.460267198085785, + "ce_loss_26": 2.979328769445419, + "ce_loss_39": 2.435830682516098, + "ce_loss_52": 1.409597858786583, + "ce_loss_7": 3.7637628614902496, + "epoch": 0.345, + "grad_norm": 22.11179077444158, + "kl_loss_13": 4240.0, + "kl_loss_26": 3220.0, + "kl_loss_39": 2052.4, + "kl_loss_7": 4872.8, + "learning_rate": 0.0007430983680502344, + "loss": 7260.3, + "step": 3450 + }, + { + "ce_loss_13": 3.484216260910034, + "ce_loss_26": 2.986140418052673, + "ce_loss_39": 2.4419540107250213, + "ce_loss_52": 1.423793789744377, + "ce_loss_7": 3.783113992214203, + "epoch": 0.346, + "grad_norm": 21.48292293909848, + "kl_loss_13": 4242.0, + "kl_loss_26": 3200.0, + "kl_loss_39": 2037.4, + "kl_loss_7": 4868.0, + "learning_rate": 0.0007417106419422819, + "loss": 7210.1, + "step": 3460 + }, + { + "ce_loss_13": 3.4797898173332213, + "ce_loss_26": 2.9807622492313386, + "ce_loss_39": 2.43132506608963, + "ce_loss_52": 1.4044816851615907, + "ce_loss_7": 3.782477653026581, + "epoch": 0.347, + "grad_norm": 21.84069780374938, + "kl_loss_13": 4286.0, + "kl_loss_26": 3248.0, + "kl_loss_39": 2073.6, + "kl_loss_7": 4917.6, + "learning_rate": 0.0007403204818108486, + "loss": 7232.3, + "step": 3470 + }, + { + "ce_loss_13": 3.461978626251221, + "ce_loss_26": 2.971747863292694, + "ce_loss_39": 2.4197792381048204, + "ce_loss_52": 1.411722904443741, + "ce_loss_7": 3.7553564965724946, + "epoch": 0.348, + "grad_norm": 20.773261789734953, + "kl_loss_13": 4206.0, + "kl_loss_26": 3177.2, + "kl_loss_39": 2006.4, + "kl_loss_7": 4823.2, + "learning_rate": 0.0007389279016548316, + "loss": 7200.0, + "step": 3480 + }, + { + "ce_loss_13": 3.412698417901993, + "ce_loss_26": 2.910679543018341, + "ce_loss_39": 2.3608890056610106, + "ce_loss_52": 1.3876498267054558, + "ce_loss_7": 3.7148698210716247, + "epoch": 0.349, + "grad_norm": 21.05607269401212, + "kl_loss_13": 4174.0, + "kl_loss_26": 3125.6, + "kl_loss_39": 1966.0, + "kl_loss_7": 4804.0, + "learning_rate": 0.0007375329154974975, + "loss": 7216.6, + "step": 3490 + }, + { + "ce_loss_13": 3.474409651756287, + "ce_loss_26": 2.9683692157268524, + "ce_loss_39": 2.418835300207138, + "ce_loss_52": 1.4043258875608444, + "ce_loss_7": 3.774983435869217, + "epoch": 0.35, + "grad_norm": 20.352131735407625, + "kl_loss_13": 4246.8, + "kl_loss_26": 3202.8, + "kl_loss_39": 2042.2, + "kl_loss_7": 4875.2, + "learning_rate": 0.0007361355373863414, + "loss": 7202.7, + "step": 3500 + }, + { + "ce_loss_13": 3.4584279537200926, + "ce_loss_26": 2.9674349963665008, + "ce_loss_39": 2.422105145454407, + "ce_loss_52": 1.4255526602268218, + "ce_loss_7": 3.767817974090576, + "epoch": 0.351, + "grad_norm": 20.416274052366226, + "kl_loss_13": 4216.8, + "kl_loss_26": 3188.8, + "kl_loss_39": 2018.4, + "kl_loss_7": 4854.4, + "learning_rate": 0.0007347357813929454, + "loss": 7180.1, + "step": 3510 + }, + { + "ce_loss_13": 3.4778851926326753, + "ce_loss_26": 2.979369193315506, + "ce_loss_39": 2.4347113519906998, + "ce_loss_52": 1.4163423389196397, + "ce_loss_7": 3.7732009410858156, + "epoch": 0.352, + "grad_norm": 24.260880475347793, + "kl_loss_13": 4219.6, + "kl_loss_26": 3191.6, + "kl_loss_39": 2033.6, + "kl_loss_7": 4844.0, + "learning_rate": 0.0007333336616128369, + "loss": 7181.8, + "step": 3520 + }, + { + "ce_loss_13": 3.479642480611801, + "ce_loss_26": 2.9858607232570646, + "ce_loss_39": 2.429011595249176, + "ce_loss_52": 1.4224095463752746, + "ce_loss_7": 3.779256856441498, + "epoch": 0.353, + "grad_norm": 20.532035835107088, + "kl_loss_13": 4211.6, + "kl_loss_26": 3185.2, + "kl_loss_39": 2008.4, + "kl_loss_7": 4841.6, + "learning_rate": 0.0007319291921653463, + "loss": 7183.4, + "step": 3530 + }, + { + "ce_loss_13": 3.4610216915607452, + "ce_loss_26": 2.962309718132019, + "ce_loss_39": 2.410178878903389, + "ce_loss_52": 1.4115030318498611, + "ce_loss_7": 3.757075273990631, + "epoch": 0.354, + "grad_norm": 23.640729954280236, + "kl_loss_13": 4236.8, + "kl_loss_26": 3190.0, + "kl_loss_39": 2013.0, + "kl_loss_7": 4864.8, + "learning_rate": 0.0007305223871934656, + "loss": 7181.4, + "step": 3540 + }, + { + "ce_loss_13": 3.5113906443119047, + "ce_loss_26": 3.016271597146988, + "ce_loss_39": 2.4817953169345857, + "ce_loss_52": 1.4761293560266495, + "ce_loss_7": 3.79822900891304, + "epoch": 0.355, + "grad_norm": 22.47350392427222, + "kl_loss_13": 4203.2, + "kl_loss_26": 3159.6, + "kl_loss_39": 2013.8, + "kl_loss_7": 4804.8, + "learning_rate": 0.0007291132608637052, + "loss": 7117.3, + "step": 3550 + }, + { + "ce_loss_13": 3.51672882437706, + "ce_loss_26": 3.017621088027954, + "ce_loss_39": 2.471283310651779, + "ce_loss_52": 1.4747960895299912, + "ce_loss_7": 3.8214517176151275, + "epoch": 0.356, + "grad_norm": 22.39492370466417, + "kl_loss_13": 4220.0, + "kl_loss_26": 3165.2, + "kl_loss_39": 2000.0, + "kl_loss_7": 4852.8, + "learning_rate": 0.0007277018273659516, + "loss": 7133.8, + "step": 3560 + }, + { + "ce_loss_13": 3.5502862453460695, + "ce_loss_26": 3.05435825586319, + "ce_loss_39": 2.5157745271921157, + "ce_loss_52": 1.4953802406787873, + "ce_loss_7": 3.8426124274730684, + "epoch": 0.357, + "grad_norm": 22.306092146539875, + "kl_loss_13": 4247.6, + "kl_loss_26": 3216.4, + "kl_loss_39": 2067.2, + "kl_loss_7": 4859.2, + "learning_rate": 0.0007262881009133242, + "loss": 7135.8, + "step": 3570 + }, + { + "ce_loss_13": 3.454521042108536, + "ce_loss_26": 2.9510986149311065, + "ce_loss_39": 2.4084193408489227, + "ce_loss_52": 1.4200605943799018, + "ce_loss_7": 3.7586602210998534, + "epoch": 0.358, + "grad_norm": 21.121962853812185, + "kl_loss_13": 4202.8, + "kl_loss_26": 3144.4, + "kl_loss_39": 1986.2, + "kl_loss_7": 4836.0, + "learning_rate": 0.0007248720957420329, + "loss": 7135.9, + "step": 3580 + }, + { + "ce_loss_13": 3.4299929022789, + "ce_loss_26": 2.9262389481067657, + "ce_loss_39": 2.3823306292295454, + "ce_loss_52": 1.4015884697437286, + "ce_loss_7": 3.7345054388046264, + "epoch": 0.359, + "grad_norm": 21.87103253394757, + "kl_loss_13": 4194.4, + "kl_loss_26": 3138.0, + "kl_loss_39": 1982.0, + "kl_loss_7": 4829.6, + "learning_rate": 0.0007234538261112341, + "loss": 7056.9, + "step": 3590 + }, + { + "ce_loss_13": 3.47941969037056, + "ce_loss_26": 2.9830207943916323, + "ce_loss_39": 2.429542663693428, + "ce_loss_52": 1.4434623152017594, + "ce_loss_7": 3.7679139375686646, + "epoch": 0.36, + "grad_norm": 21.216900982885303, + "kl_loss_13": 4199.2, + "kl_loss_26": 3151.6, + "kl_loss_39": 1989.4, + "kl_loss_7": 4814.0, + "learning_rate": 0.0007220333063028871, + "loss": 7096.6, + "step": 3600 + }, + { + "ce_loss_13": 3.3754841923713683, + "ce_loss_26": 2.8831639885902405, + "ce_loss_39": 2.3438637793064117, + "ce_loss_52": 1.3872641950845719, + "ce_loss_7": 3.681108373403549, + "epoch": 0.361, + "grad_norm": 21.80171673212867, + "kl_loss_13": 4118.8, + "kl_loss_26": 3083.2, + "kl_loss_39": 1941.8, + "kl_loss_7": 4758.0, + "learning_rate": 0.0007206105506216106, + "loss": 7029.4, + "step": 3610 + }, + { + "ce_loss_13": 3.548617047071457, + "ce_loss_26": 3.054288852214813, + "ce_loss_39": 2.504137873649597, + "ce_loss_52": 1.4843981340527534, + "ce_loss_7": 3.850842350721359, + "epoch": 0.362, + "grad_norm": 21.588123109222046, + "kl_loss_13": 4246.4, + "kl_loss_26": 3203.6, + "kl_loss_39": 2032.4, + "kl_loss_7": 4870.4, + "learning_rate": 0.0007191855733945387, + "loss": 7126.1, + "step": 3620 + }, + { + "ce_loss_13": 3.469277936220169, + "ce_loss_26": 2.985336202383041, + "ce_loss_39": 2.4588325411081313, + "ce_loss_52": 1.4790852904319762, + "ce_loss_7": 3.764431744813919, + "epoch": 0.363, + "grad_norm": 22.204733809635066, + "kl_loss_13": 4129.2, + "kl_loss_26": 3111.6, + "kl_loss_39": 1976.2, + "kl_loss_7": 4744.4, + "learning_rate": 0.0007177583889711762, + "loss": 7054.3, + "step": 3630 + }, + { + "ce_loss_13": 3.442378747463226, + "ce_loss_26": 2.9400178849697114, + "ce_loss_39": 2.394621509313583, + "ce_loss_52": 1.417646163702011, + "ce_loss_7": 3.743503212928772, + "epoch": 0.364, + "grad_norm": 21.957965313010384, + "kl_loss_13": 4163.6, + "kl_loss_26": 3123.2, + "kl_loss_39": 1959.6, + "kl_loss_7": 4796.0, + "learning_rate": 0.0007163290117232541, + "loss": 7054.5, + "step": 3640 + }, + { + "ce_loss_13": 3.4392197132110596, + "ce_loss_26": 2.951560914516449, + "ce_loss_39": 2.4180801689624785, + "ce_loss_52": 1.4459613859653473, + "ce_loss_7": 3.7318799614906313, + "epoch": 0.365, + "grad_norm": 21.451785041659093, + "kl_loss_13": 4123.6, + "kl_loss_26": 3098.8, + "kl_loss_39": 1968.6, + "kl_loss_7": 4730.8, + "learning_rate": 0.0007148974560445859, + "loss": 7029.7, + "step": 3650 + }, + { + "ce_loss_13": 3.4576940476894378, + "ce_loss_26": 2.964629900455475, + "ce_loss_39": 2.4162339717149734, + "ce_loss_52": 1.4277015537023545, + "ce_loss_7": 3.7549045085906982, + "epoch": 0.366, + "grad_norm": 22.817794972487214, + "kl_loss_13": 4165.6, + "kl_loss_26": 3135.6, + "kl_loss_39": 1970.0, + "kl_loss_7": 4792.8, + "learning_rate": 0.0007134637363509209, + "loss": 7013.0, + "step": 3660 + }, + { + "ce_loss_13": 3.5099750757217407, + "ce_loss_26": 3.0163159906864165, + "ce_loss_39": 2.4729519367218016, + "ce_loss_52": 1.4603912830352783, + "ce_loss_7": 3.8057311475276947, + "epoch": 0.367, + "grad_norm": 21.693779714382707, + "kl_loss_13": 4227.6, + "kl_loss_26": 3203.2, + "kl_loss_39": 2043.8, + "kl_loss_7": 4848.0, + "learning_rate": 0.0007120278670798009, + "loss": 7024.2, + "step": 3670 + }, + { + "ce_loss_13": 3.4791980743408204, + "ce_loss_26": 2.992640608549118, + "ce_loss_39": 2.4573631793260575, + "ce_loss_52": 1.461830335855484, + "ce_loss_7": 3.7739274382591246, + "epoch": 0.368, + "grad_norm": 22.105670609070703, + "kl_loss_13": 4126.8, + "kl_loss_26": 3112.4, + "kl_loss_39": 1978.0, + "kl_loss_7": 4751.2, + "learning_rate": 0.0007105898626904133, + "loss": 6924.7, + "step": 3680 + }, + { + "ce_loss_13": 3.4017152190208435, + "ce_loss_26": 2.910390090942383, + "ce_loss_39": 2.374891012907028, + "ce_loss_52": 1.41865316927433, + "ce_loss_7": 3.6938551664352417, + "epoch": 0.369, + "grad_norm": 20.08460323542704, + "kl_loss_13": 4101.6, + "kl_loss_26": 3071.6, + "kl_loss_39": 1938.0, + "kl_loss_7": 4706.8, + "learning_rate": 0.0007091497376634463, + "loss": 6952.1, + "step": 3690 + }, + { + "ce_loss_13": 3.4051457762718202, + "ce_loss_26": 2.9142957627773285, + "ce_loss_39": 2.3769975334405897, + "ce_loss_52": 1.4461660206317901, + "ce_loss_7": 3.7018753468990324, + "epoch": 0.37, + "grad_norm": 21.75095718081699, + "kl_loss_13": 4034.8, + "kl_loss_26": 3006.4, + "kl_loss_39": 1877.2, + "kl_loss_7": 4647.6, + "learning_rate": 0.0007077075065009433, + "loss": 6973.3, + "step": 3700 + }, + { + "ce_loss_13": 3.407693642377853, + "ce_loss_26": 2.9163559854030607, + "ce_loss_39": 2.3682307243347167, + "ce_loss_52": 1.3965442717075347, + "ce_loss_7": 3.7011303901672363, + "epoch": 0.371, + "grad_norm": 21.90346982831121, + "kl_loss_13": 4126.8, + "kl_loss_26": 3098.4, + "kl_loss_39": 1951.8, + "kl_loss_7": 4740.4, + "learning_rate": 0.0007062631837261557, + "loss": 6968.9, + "step": 3710 + }, + { + "ce_loss_13": 3.4375191271305083, + "ce_loss_26": 2.9387122094631195, + "ce_loss_39": 2.4049195408821107, + "ce_loss_52": 1.4491820633411407, + "ce_loss_7": 3.7319699347019197, + "epoch": 0.372, + "grad_norm": 22.15813884607567, + "kl_loss_13": 4096.8, + "kl_loss_26": 3062.8, + "kl_loss_39": 1923.8, + "kl_loss_7": 4716.0, + "learning_rate": 0.0007048167838833977, + "loss": 6892.9, + "step": 3720 + }, + { + "ce_loss_13": 3.443445736169815, + "ce_loss_26": 2.9415276020765306, + "ce_loss_39": 2.3946647971868513, + "ce_loss_52": 1.4324263527989387, + "ce_loss_7": 3.7482150912284853, + "epoch": 0.373, + "grad_norm": 20.533639539523726, + "kl_loss_13": 4160.0, + "kl_loss_26": 3107.6, + "kl_loss_39": 1932.2, + "kl_loss_7": 4794.8, + "learning_rate": 0.0007033683215379002, + "loss": 6994.9, + "step": 3730 + }, + { + "ce_loss_13": 3.440624713897705, + "ce_loss_26": 2.932874071598053, + "ce_loss_39": 2.384758135676384, + "ce_loss_52": 1.4341223761439323, + "ce_loss_7": 3.7404758751392366, + "epoch": 0.374, + "grad_norm": 22.169142032653717, + "kl_loss_13": 4178.4, + "kl_loss_26": 3120.4, + "kl_loss_39": 1940.6, + "kl_loss_7": 4809.6, + "learning_rate": 0.0007019178112756625, + "loss": 6960.1, + "step": 3740 + }, + { + "ce_loss_13": 3.479549217224121, + "ce_loss_26": 2.9821768522262575, + "ce_loss_39": 2.4379994481801988, + "ce_loss_52": 1.4455731570720673, + "ce_loss_7": 3.779719626903534, + "epoch": 0.375, + "grad_norm": 22.88722443184811, + "kl_loss_13": 4190.8, + "kl_loss_26": 3160.8, + "kl_loss_39": 2000.2, + "kl_loss_7": 4822.4, + "learning_rate": 0.0007004652677033068, + "loss": 6922.4, + "step": 3750 + }, + { + "ce_loss_13": 3.5048573672771455, + "ce_loss_26": 2.996897077560425, + "ce_loss_39": 2.4514291107654573, + "ce_loss_52": 1.4677145808935166, + "ce_loss_7": 3.8070162892341615, + "epoch": 0.376, + "grad_norm": 20.379791798469622, + "kl_loss_13": 4200.4, + "kl_loss_26": 3149.2, + "kl_loss_39": 1988.0, + "kl_loss_7": 4828.8, + "learning_rate": 0.0006990107054479312, + "loss": 6948.5, + "step": 3760 + }, + { + "ce_loss_13": 3.3857189416885376, + "ce_loss_26": 2.8971070766448976, + "ce_loss_39": 2.3661463767290116, + "ce_loss_52": 1.4227028042078018, + "ce_loss_7": 3.679090714454651, + "epoch": 0.377, + "grad_norm": 21.179119667844127, + "kl_loss_13": 4050.8, + "kl_loss_26": 3030.4, + "kl_loss_39": 1892.4, + "kl_loss_7": 4666.8, + "learning_rate": 0.000697554139156961, + "loss": 6941.0, + "step": 3770 + }, + { + "ce_loss_13": 3.512388813495636, + "ce_loss_26": 3.0138610899448395, + "ce_loss_39": 2.4606133818626406, + "ce_loss_52": 1.4959001630544662, + "ce_loss_7": 3.807480573654175, + "epoch": 0.378, + "grad_norm": 22.362162135534977, + "kl_loss_13": 4145.2, + "kl_loss_26": 3107.2, + "kl_loss_39": 1951.0, + "kl_loss_7": 4758.0, + "learning_rate": 0.0006960955834980027, + "loss": 6874.7, + "step": 3780 + }, + { + "ce_loss_13": 3.411291944980621, + "ce_loss_26": 2.907497102022171, + "ce_loss_39": 2.355439043045044, + "ce_loss_52": 1.4057017982006073, + "ce_loss_7": 3.7079379081726076, + "epoch": 0.379, + "grad_norm": 20.519862733845507, + "kl_loss_13": 4121.2, + "kl_loss_26": 3082.0, + "kl_loss_39": 1917.0, + "kl_loss_7": 4746.0, + "learning_rate": 0.0006946350531586958, + "loss": 6891.8, + "step": 3790 + }, + { + "ce_loss_13": 3.365473288297653, + "ce_loss_26": 2.8626536786556245, + "ce_loss_39": 2.3194735169410707, + "ce_loss_52": 1.3925662517547608, + "ce_loss_7": 3.661379265785217, + "epoch": 0.38, + "grad_norm": 21.211701089479526, + "kl_loss_13": 4084.0, + "kl_loss_26": 3048.4, + "kl_loss_39": 1891.2, + "kl_loss_7": 4702.0, + "learning_rate": 0.0006931725628465643, + "loss": 6889.0, + "step": 3800 + }, + { + "ce_loss_13": 3.375334745645523, + "ce_loss_26": 2.891470319032669, + "ce_loss_39": 2.3472714513540267, + "ce_loss_52": 1.4055952280759811, + "ce_loss_7": 3.667448806762695, + "epoch": 0.381, + "grad_norm": 22.083234786813115, + "kl_loss_13": 4056.4, + "kl_loss_26": 3038.0, + "kl_loss_39": 1887.0, + "kl_loss_7": 4669.2, + "learning_rate": 0.0006917081272888696, + "loss": 6821.1, + "step": 3810 + }, + { + "ce_loss_13": 3.413332349061966, + "ce_loss_26": 2.918791648745537, + "ce_loss_39": 2.3821532160043715, + "ce_loss_52": 1.426029135286808, + "ce_loss_7": 3.70468533039093, + "epoch": 0.382, + "grad_norm": 21.70379003852508, + "kl_loss_13": 4066.4, + "kl_loss_26": 3040.8, + "kl_loss_39": 1897.2, + "kl_loss_7": 4676.0, + "learning_rate": 0.0006902417612324615, + "loss": 6817.3, + "step": 3820 + }, + { + "ce_loss_13": 3.448424202203751, + "ce_loss_26": 2.9532420337200165, + "ce_loss_39": 2.389399054646492, + "ce_loss_52": 1.4127800971269608, + "ce_loss_7": 3.7562515437602997, + "epoch": 0.383, + "grad_norm": 22.611090782774035, + "kl_loss_13": 4219.6, + "kl_loss_26": 3187.2, + "kl_loss_39": 2000.4, + "kl_loss_7": 4858.4, + "learning_rate": 0.00068877347944363, + "loss": 6892.2, + "step": 3830 + }, + { + "ce_loss_13": 3.42295760512352, + "ce_loss_26": 2.926942157745361, + "ce_loss_39": 2.3873476177453994, + "ce_loss_52": 1.439600521326065, + "ce_loss_7": 3.719656354188919, + "epoch": 0.384, + "grad_norm": 20.756696465620674, + "kl_loss_13": 4109.6, + "kl_loss_26": 3061.2, + "kl_loss_39": 1912.2, + "kl_loss_7": 4727.6, + "learning_rate": 0.0006873032967079561, + "loss": 6876.7, + "step": 3840 + }, + { + "ce_loss_13": 3.4390079021453857, + "ce_loss_26": 2.9532729268074034, + "ce_loss_39": 2.4051371097564695, + "ce_loss_52": 1.446793320775032, + "ce_loss_7": 3.740493839979172, + "epoch": 0.385, + "grad_norm": 20.683464166323773, + "kl_loss_13": 4102.8, + "kl_loss_26": 3080.8, + "kl_loss_39": 1919.2, + "kl_loss_7": 4729.6, + "learning_rate": 0.0006858312278301637, + "loss": 6878.2, + "step": 3850 + }, + { + "ce_loss_13": 3.368044465780258, + "ce_loss_26": 2.8783551871776583, + "ce_loss_39": 2.35613272190094, + "ce_loss_52": 1.4377225756645202, + "ce_loss_7": 3.657758867740631, + "epoch": 0.386, + "grad_norm": 22.01788101919845, + "kl_loss_13": 3983.6, + "kl_loss_26": 2964.0, + "kl_loss_39": 1838.2, + "kl_loss_7": 4592.4, + "learning_rate": 0.0006843572876339704, + "loss": 6809.2, + "step": 3860 + }, + { + "ce_loss_13": 3.3198332667350767, + "ce_loss_26": 2.8321444630622863, + "ce_loss_39": 2.2942576706409454, + "ce_loss_52": 1.394131037592888, + "ce_loss_7": 3.6043868601322173, + "epoch": 0.387, + "grad_norm": 23.448962125354107, + "kl_loss_13": 3965.6, + "kl_loss_26": 2956.0, + "kl_loss_39": 1822.8, + "kl_loss_7": 4560.0, + "learning_rate": 0.0006828814909619373, + "loss": 6798.0, + "step": 3870 + }, + { + "ce_loss_13": 3.3714381575584413, + "ce_loss_26": 2.88772599697113, + "ce_loss_39": 2.3584464609622957, + "ce_loss_52": 1.4422439962625504, + "ce_loss_7": 3.662185198068619, + "epoch": 0.388, + "grad_norm": 22.031425075321017, + "kl_loss_13": 3995.6, + "kl_loss_26": 2987.6, + "kl_loss_39": 1858.6, + "kl_loss_7": 4603.2, + "learning_rate": 0.0006814038526753205, + "loss": 6790.2, + "step": 3880 + }, + { + "ce_loss_13": 3.4292624831199645, + "ce_loss_26": 2.9387787103652956, + "ce_loss_39": 2.392472392320633, + "ce_loss_52": 1.458945381641388, + "ce_loss_7": 3.722714525461197, + "epoch": 0.389, + "grad_norm": 21.623492145702706, + "kl_loss_13": 4048.0, + "kl_loss_26": 3026.8, + "kl_loss_39": 1877.2, + "kl_loss_7": 4660.8, + "learning_rate": 0.0006799243876539213, + "loss": 6774.2, + "step": 3890 + }, + { + "ce_loss_13": 3.398139035701752, + "ce_loss_26": 2.903027367591858, + "ce_loss_39": 2.351736932992935, + "ce_loss_52": 1.4211576133966446, + "ce_loss_7": 3.6922240018844605, + "epoch": 0.39, + "grad_norm": 20.862431117162615, + "kl_loss_13": 4048.8, + "kl_loss_26": 3010.0, + "kl_loss_39": 1853.6, + "kl_loss_7": 4666.4, + "learning_rate": 0.0006784431107959359, + "loss": 6774.2, + "step": 3900 + }, + { + "ce_loss_13": 3.442742919921875, + "ce_loss_26": 2.950921058654785, + "ce_loss_39": 2.407829362154007, + "ce_loss_52": 1.4671964168548584, + "ce_loss_7": 3.7354746580123903, + "epoch": 0.391, + "grad_norm": 22.203660063445458, + "kl_loss_13": 4065.6, + "kl_loss_26": 3044.4, + "kl_loss_39": 1893.8, + "kl_loss_7": 4682.8, + "learning_rate": 0.0006769600370178059, + "loss": 6751.0, + "step": 3910 + }, + { + "ce_loss_13": 3.3321305394172667, + "ce_loss_26": 2.8416285693645476, + "ce_loss_39": 2.3153179585933685, + "ce_loss_52": 1.3940225571393967, + "ce_loss_7": 3.628729373216629, + "epoch": 0.392, + "grad_norm": 20.321578181458975, + "kl_loss_13": 3993.6, + "kl_loss_26": 2972.0, + "kl_loss_39": 1845.6, + "kl_loss_7": 4610.8, + "learning_rate": 0.0006754751812540679, + "loss": 6716.4, + "step": 3920 + }, + { + "ce_loss_13": 3.382316732406616, + "ce_loss_26": 2.8903696179389953, + "ce_loss_39": 2.353957489132881, + "ce_loss_52": 1.433479717373848, + "ce_loss_7": 3.671325671672821, + "epoch": 0.393, + "grad_norm": 21.271101889195098, + "kl_loss_13": 4022.4, + "kl_loss_26": 2998.4, + "kl_loss_39": 1861.0, + "kl_loss_7": 4629.6, + "learning_rate": 0.0006739885584572025, + "loss": 6776.3, + "step": 3930 + }, + { + "ce_loss_13": 3.300927424430847, + "ce_loss_26": 2.820311403274536, + "ce_loss_39": 2.2964976727962494, + "ce_loss_52": 1.4115092635154725, + "ce_loss_7": 3.5904260516166686, + "epoch": 0.394, + "grad_norm": 20.82909920071906, + "kl_loss_13": 3946.8, + "kl_loss_26": 2937.6, + "kl_loss_39": 1809.0, + "kl_loss_7": 4550.0, + "learning_rate": 0.0006725001835974853, + "loss": 6768.3, + "step": 3940 + }, + { + "ce_loss_13": 3.3879170179367066, + "ce_loss_26": 2.897449654340744, + "ce_loss_39": 2.3513225704431533, + "ce_loss_52": 1.423241639137268, + "ce_loss_7": 3.6786913871765137, + "epoch": 0.395, + "grad_norm": 21.851496416724263, + "kl_loss_13": 4040.0, + "kl_loss_26": 3011.6, + "kl_loss_39": 1852.2, + "kl_loss_7": 4646.4, + "learning_rate": 0.0006710100716628344, + "loss": 6704.8, + "step": 3950 + }, + { + "ce_loss_13": 3.365324836969376, + "ce_loss_26": 2.8627363234758376, + "ce_loss_39": 2.30602003633976, + "ce_loss_52": 1.3922662898898124, + "ce_loss_7": 3.670176440477371, + "epoch": 0.396, + "grad_norm": 19.95099507420605, + "kl_loss_13": 4070.0, + "kl_loss_26": 3011.6, + "kl_loss_39": 1834.6, + "kl_loss_7": 4694.8, + "learning_rate": 0.0006695182376586602, + "loss": 6737.9, + "step": 3960 + }, + { + "ce_loss_13": 3.3035158634185793, + "ce_loss_26": 2.820575511455536, + "ce_loss_39": 2.2792143374681473, + "ce_loss_52": 1.3631332144141197, + "ce_loss_7": 3.5984981656074524, + "epoch": 0.397, + "grad_norm": 21.419651305345628, + "kl_loss_13": 4016.4, + "kl_loss_26": 2992.4, + "kl_loss_39": 1851.6, + "kl_loss_7": 4639.2, + "learning_rate": 0.000668024696607715, + "loss": 6659.1, + "step": 3970 + }, + { + "ce_loss_13": 3.2616395235061644, + "ce_loss_26": 2.784494936466217, + "ce_loss_39": 2.2700316429138185, + "ce_loss_52": 1.3974194526672363, + "ce_loss_7": 3.548482429981232, + "epoch": 0.398, + "grad_norm": 20.984610674219567, + "kl_loss_13": 3844.8, + "kl_loss_26": 2848.4, + "kl_loss_39": 1755.2, + "kl_loss_7": 4441.6, + "learning_rate": 0.0006665294635499404, + "loss": 6600.0, + "step": 3980 + }, + { + "ce_loss_13": 3.3228425204753878, + "ce_loss_26": 2.836167597770691, + "ce_loss_39": 2.3086318761110305, + "ce_loss_52": 1.431598064303398, + "ce_loss_7": 3.617774724960327, + "epoch": 0.399, + "grad_norm": 20.48469634280121, + "kl_loss_13": 3879.6, + "kl_loss_26": 2876.0, + "kl_loss_39": 1769.2, + "kl_loss_7": 4495.2, + "learning_rate": 0.0006650325535423167, + "loss": 6653.5, + "step": 3990 + }, + { + "ce_loss_13": 3.3134547114372253, + "ce_loss_26": 2.8307457506656646, + "ce_loss_39": 2.29368577003479, + "ce_loss_52": 1.3969372153282165, + "ce_loss_7": 3.6135133028030397, + "epoch": 0.4, + "grad_norm": 21.23260680511818, + "kl_loss_13": 3962.8, + "kl_loss_26": 2951.2, + "kl_loss_39": 1800.2, + "kl_loss_7": 4588.0, + "learning_rate": 0.0006635339816587109, + "loss": 6715.2, + "step": 4000 + }, + { + "ce_loss_13": 3.478778451681137, + "ce_loss_26": 2.984225571155548, + "ce_loss_39": 2.432309350371361, + "ce_loss_52": 1.4644457131624222, + "ce_loss_7": 3.782983124256134, + "epoch": 0.401, + "grad_norm": 21.3701964180473, + "kl_loss_13": 4117.6, + "kl_loss_26": 3090.4, + "kl_loss_39": 1932.4, + "kl_loss_7": 4754.8, + "learning_rate": 0.0006620337629897252, + "loss": 6698.2, + "step": 4010 + }, + { + "ce_loss_13": 3.3284165620803834, + "ce_loss_26": 2.8434600114822386, + "ce_loss_39": 2.311116448044777, + "ce_loss_52": 1.4240004986524581, + "ce_loss_7": 3.625540155172348, + "epoch": 0.402, + "grad_norm": 20.004792328254855, + "kl_loss_13": 3939.2, + "kl_loss_26": 2918.4, + "kl_loss_39": 1780.6, + "kl_loss_7": 4553.2, + "learning_rate": 0.0006605319126425454, + "loss": 6664.7, + "step": 4020 + }, + { + "ce_loss_13": 3.366453301906586, + "ce_loss_26": 2.86657951772213, + "ce_loss_39": 2.3278372526168822, + "ce_loss_52": 1.4299451738595963, + "ce_loss_7": 3.6654918253421784, + "epoch": 0.403, + "grad_norm": 20.588190398863947, + "kl_loss_13": 4014.8, + "kl_loss_26": 2970.0, + "kl_loss_39": 1823.4, + "kl_loss_7": 4628.8, + "learning_rate": 0.0006590284457407876, + "loss": 6644.4, + "step": 4030 + }, + { + "ce_loss_13": 3.3478448331356048, + "ce_loss_26": 2.870484399795532, + "ce_loss_39": 2.341677349805832, + "ce_loss_52": 1.4598551213741302, + "ce_loss_7": 3.644811862707138, + "epoch": 0.404, + "grad_norm": 20.8221732353937, + "kl_loss_13": 3908.8, + "kl_loss_26": 2901.6, + "kl_loss_39": 1782.0, + "kl_loss_7": 4526.4, + "learning_rate": 0.0006575233774243465, + "loss": 6645.55, + "step": 4040 + }, + { + "ce_loss_13": 3.28169704079628, + "ce_loss_26": 2.7861692667007447, + "ce_loss_39": 2.2428950667381287, + "ce_loss_52": 1.374313686788082, + "ce_loss_7": 3.5797726988792418, + "epoch": 0.405, + "grad_norm": 21.110763703238916, + "kl_loss_13": 3936.4, + "kl_loss_26": 2902.0, + "kl_loss_39": 1754.0, + "kl_loss_7": 4561.2, + "learning_rate": 0.0006560167228492435, + "loss": 6646.3, + "step": 4050 + }, + { + "ce_loss_13": 3.44179083108902, + "ce_loss_26": 2.9405432820320128, + "ce_loss_39": 2.3968080401420595, + "ce_loss_52": 1.4659699857234956, + "ce_loss_7": 3.736131912469864, + "epoch": 0.406, + "grad_norm": 20.50041516447212, + "kl_loss_13": 4061.2, + "kl_loss_26": 3015.6, + "kl_loss_39": 1863.6, + "kl_loss_7": 4672.4, + "learning_rate": 0.0006545084971874737, + "loss": 6655.9, + "step": 4060 + }, + { + "ce_loss_13": 3.360958731174469, + "ce_loss_26": 2.870532661676407, + "ce_loss_39": 2.321683007478714, + "ce_loss_52": 1.4091651737689972, + "ce_loss_7": 3.6588110864162444, + "epoch": 0.407, + "grad_norm": 20.463583112470857, + "kl_loss_13": 3997.2, + "kl_loss_26": 2979.2, + "kl_loss_39": 1826.4, + "kl_loss_7": 4618.8, + "learning_rate": 0.0006529987156268526, + "loss": 6617.7, + "step": 4070 + }, + { + "ce_loss_13": 3.2686664044857023, + "ce_loss_26": 2.7765056490898132, + "ce_loss_39": 2.238715943694115, + "ce_loss_52": 1.362110722064972, + "ce_loss_7": 3.557782357931137, + "epoch": 0.408, + "grad_norm": 21.232766427379584, + "kl_loss_13": 3924.8, + "kl_loss_26": 2908.8, + "kl_loss_39": 1777.2, + "kl_loss_7": 4538.0, + "learning_rate": 0.0006514873933708637, + "loss": 6653.7, + "step": 4080 + }, + { + "ce_loss_13": 3.275262689590454, + "ce_loss_26": 2.7900135934352877, + "ce_loss_39": 2.2641283214092254, + "ce_loss_52": 1.3853250756859778, + "ce_loss_7": 3.584109377861023, + "epoch": 0.409, + "grad_norm": 21.583109836521306, + "kl_loss_13": 3887.6, + "kl_loss_26": 2868.8, + "kl_loss_39": 1755.0, + "kl_loss_7": 4526.4, + "learning_rate": 0.0006499745456385053, + "loss": 6553.8, + "step": 4090 + }, + { + "ce_loss_13": 3.3286924988031386, + "ce_loss_26": 2.8467950344085695, + "ce_loss_39": 2.312630409002304, + "ce_loss_52": 1.424817180633545, + "ce_loss_7": 3.6197818219661713, + "epoch": 0.41, + "grad_norm": 20.90973793223202, + "kl_loss_13": 3933.2, + "kl_loss_26": 2921.2, + "kl_loss_39": 1789.5, + "kl_loss_7": 4539.2, + "learning_rate": 0.0006484601876641375, + "loss": 6620.35, + "step": 4100 + }, + { + "ce_loss_13": 3.414019340276718, + "ce_loss_26": 2.922485715150833, + "ce_loss_39": 2.3825585186481475, + "ce_loss_52": 1.4558052003383637, + "ce_loss_7": 3.709526652097702, + "epoch": 0.411, + "grad_norm": 21.026331487000416, + "kl_loss_13": 4013.6, + "kl_loss_26": 2986.4, + "kl_loss_39": 1846.6, + "kl_loss_7": 4632.0, + "learning_rate": 0.000646944334697328, + "loss": 6576.6, + "step": 4110 + }, + { + "ce_loss_13": 3.3258216440677644, + "ce_loss_26": 2.848787486553192, + "ce_loss_39": 2.3223425179719923, + "ce_loss_52": 1.4663115084171294, + "ce_loss_7": 3.604213911294937, + "epoch": 0.412, + "grad_norm": 20.97048408186008, + "kl_loss_13": 3820.4, + "kl_loss_26": 2832.4, + "kl_loss_39": 1718.8, + "kl_loss_7": 4410.0, + "learning_rate": 0.0006454270020026995, + "loss": 6611.1, + "step": 4120 + }, + { + "ce_loss_13": 3.3510149538517, + "ce_loss_26": 2.853396385908127, + "ce_loss_39": 2.3119456827640534, + "ce_loss_52": 1.430704912543297, + "ce_loss_7": 3.642722541093826, + "epoch": 0.413, + "grad_norm": 21.962054178499802, + "kl_loss_13": 3953.6, + "kl_loss_26": 2916.4, + "kl_loss_39": 1780.4, + "kl_loss_7": 4556.4, + "learning_rate": 0.0006439082048597755, + "loss": 6584.4, + "step": 4130 + }, + { + "ce_loss_13": 3.3082414746284483, + "ce_loss_26": 2.8318909227848055, + "ce_loss_39": 2.309774273633957, + "ce_loss_52": 1.4488343179225922, + "ce_loss_7": 3.6024456560611724, + "epoch": 0.414, + "grad_norm": 21.685181722314038, + "kl_loss_13": 3830.8, + "kl_loss_26": 2848.8, + "kl_loss_39": 1746.2, + "kl_loss_7": 4442.8, + "learning_rate": 0.0006423879585628261, + "loss": 6547.1, + "step": 4140 + }, + { + "ce_loss_13": 3.3523667633533476, + "ce_loss_26": 2.855451303720474, + "ce_loss_39": 2.3214808642864226, + "ce_loss_52": 1.434419831633568, + "ce_loss_7": 3.63877694606781, + "epoch": 0.415, + "grad_norm": 20.289217196894946, + "kl_loss_13": 3971.6, + "kl_loss_26": 2938.8, + "kl_loss_39": 1799.8, + "kl_loss_7": 4575.6, + "learning_rate": 0.0006408662784207149, + "loss": 6535.7, + "step": 4150 + }, + { + "ce_loss_13": 3.351851773262024, + "ce_loss_26": 2.864642024040222, + "ce_loss_39": 2.3303563445806503, + "ce_loss_52": 1.417774812877178, + "ce_loss_7": 3.6486425340175628, + "epoch": 0.416, + "grad_norm": 20.99970502270892, + "kl_loss_13": 4014.4, + "kl_loss_26": 2990.4, + "kl_loss_39": 1838.4, + "kl_loss_7": 4632.8, + "learning_rate": 0.0006393431797567439, + "loss": 6546.0, + "step": 4160 + }, + { + "ce_loss_13": 3.3471143901348115, + "ce_loss_26": 2.866150665283203, + "ce_loss_39": 2.3319087445735933, + "ce_loss_52": 1.4416520655155183, + "ce_loss_7": 3.6357293486595155, + "epoch": 0.417, + "grad_norm": 21.192503416792082, + "kl_loss_13": 3909.6, + "kl_loss_26": 2915.8, + "kl_loss_39": 1798.8, + "kl_loss_7": 4510.8, + "learning_rate": 0.0006378186779084996, + "loss": 6527.0, + "step": 4170 + }, + { + "ce_loss_13": 3.316433811187744, + "ce_loss_26": 2.8411558747291563, + "ce_loss_39": 2.3257210671901705, + "ce_loss_52": 1.4415073692798615, + "ce_loss_7": 3.604754400253296, + "epoch": 0.418, + "grad_norm": 20.807157262193087, + "kl_loss_13": 3869.2, + "kl_loss_26": 2884.4, + "kl_loss_39": 1787.2, + "kl_loss_7": 4470.0, + "learning_rate": 0.0006362927882276989, + "loss": 6561.5, + "step": 4180 + }, + { + "ce_loss_13": 3.339698684215546, + "ce_loss_26": 2.85506985783577, + "ce_loss_39": 2.311668387055397, + "ce_loss_52": 1.4213855370879174, + "ce_loss_7": 3.6274226009845734, + "epoch": 0.419, + "grad_norm": 22.02061084804507, + "kl_loss_13": 3937.6, + "kl_loss_26": 2928.0, + "kl_loss_39": 1790.0, + "kl_loss_7": 4535.2, + "learning_rate": 0.000634765526080034, + "loss": 6534.9, + "step": 4190 + }, + { + "ce_loss_13": 3.304267328977585, + "ce_loss_26": 2.8152280390262603, + "ce_loss_39": 2.277574297785759, + "ce_loss_52": 1.3985714688897133, + "ce_loss_7": 3.598735523223877, + "epoch": 0.42, + "grad_norm": 19.98512507622475, + "kl_loss_13": 3911.6, + "kl_loss_26": 2888.0, + "kl_loss_39": 1761.0, + "kl_loss_7": 4520.8, + "learning_rate": 0.0006332369068450174, + "loss": 6522.4, + "step": 4200 + }, + { + "ce_loss_13": 3.2766413748264314, + "ce_loss_26": 2.793798440694809, + "ce_loss_39": 2.2647649705410005, + "ce_loss_52": 1.404912966489792, + "ce_loss_7": 3.5614658653736115, + "epoch": 0.421, + "grad_norm": 21.98548722652707, + "kl_loss_13": 3862.8, + "kl_loss_26": 2858.0, + "kl_loss_39": 1744.0, + "kl_loss_7": 4457.6, + "learning_rate": 0.0006317069459158283, + "loss": 6461.5, + "step": 4210 + }, + { + "ce_loss_13": 3.303953742980957, + "ce_loss_26": 2.817542538046837, + "ce_loss_39": 2.2884632468223574, + "ce_loss_52": 1.414811021089554, + "ce_loss_7": 3.5892197132110595, + "epoch": 0.422, + "grad_norm": 21.22519434260025, + "kl_loss_13": 3878.0, + "kl_loss_26": 2873.4, + "kl_loss_39": 1750.6, + "kl_loss_7": 4478.4, + "learning_rate": 0.0006301756586991561, + "loss": 6510.1, + "step": 4220 + }, + { + "ce_loss_13": 3.3445753276348116, + "ce_loss_26": 2.8686348736286162, + "ce_loss_39": 2.3429405450820924, + "ce_loss_52": 1.4785997077822686, + "ce_loss_7": 3.6385815382003783, + "epoch": 0.423, + "grad_norm": 19.644468709446457, + "kl_loss_13": 3856.4, + "kl_loss_26": 2870.4, + "kl_loss_39": 1760.0, + "kl_loss_7": 4462.4, + "learning_rate": 0.0006286430606150459, + "loss": 6493.9, + "step": 4230 + }, + { + "ce_loss_13": 3.3064311265945436, + "ce_loss_26": 2.8393130600452423, + "ce_loss_39": 2.321084627509117, + "ce_loss_52": 1.4542193472385407, + "ce_loss_7": 3.591093236207962, + "epoch": 0.424, + "grad_norm": 19.629096721329073, + "kl_loss_13": 3821.6, + "kl_loss_26": 2834.0, + "kl_loss_39": 1732.2, + "kl_loss_7": 4414.0, + "learning_rate": 0.0006271091670967436, + "loss": 6458.7, + "step": 4240 + }, + { + "ce_loss_13": 3.335456043481827, + "ce_loss_26": 2.8595637679100037, + "ce_loss_39": 2.319040137529373, + "ce_loss_52": 1.4537455767393113, + "ce_loss_7": 3.6277152955532075, + "epoch": 0.425, + "grad_norm": 22.23231840847551, + "kl_loss_13": 3861.2, + "kl_loss_26": 2865.2, + "kl_loss_39": 1725.4, + "kl_loss_7": 4470.8, + "learning_rate": 0.0006255739935905395, + "loss": 6438.9, + "step": 4250 + }, + { + "ce_loss_13": 3.312756323814392, + "ce_loss_26": 2.828430265188217, + "ce_loss_39": 2.290999186038971, + "ce_loss_52": 1.4147280350327491, + "ce_loss_7": 3.606942754983902, + "epoch": 0.426, + "grad_norm": 22.096816976355754, + "kl_loss_13": 3909.6, + "kl_loss_26": 2901.6, + "kl_loss_39": 1772.8, + "kl_loss_7": 4524.4, + "learning_rate": 0.0006240375555556145, + "loss": 6443.7, + "step": 4260 + }, + { + "ce_loss_13": 3.2455935895442964, + "ce_loss_26": 2.764835333824158, + "ce_loss_39": 2.2400916039943697, + "ce_loss_52": 1.3983588561415672, + "ce_loss_7": 3.5336900293827056, + "epoch": 0.427, + "grad_norm": 21.016189257560278, + "kl_loss_13": 3811.6, + "kl_loss_26": 2807.4, + "kl_loss_39": 1694.2, + "kl_loss_7": 4414.0, + "learning_rate": 0.000622499868463882, + "loss": 6395.1, + "step": 4270 + }, + { + "ce_loss_13": 3.3086226165294645, + "ce_loss_26": 2.823494350910187, + "ce_loss_39": 2.301421931385994, + "ce_loss_52": 1.4462398916482926, + "ce_loss_7": 3.595276767015457, + "epoch": 0.428, + "grad_norm": 21.54081801528653, + "kl_loss_13": 3848.8, + "kl_loss_26": 2835.6, + "kl_loss_39": 1725.6, + "kl_loss_7": 4449.6, + "learning_rate": 0.0006209609477998338, + "loss": 6429.2, + "step": 4280 + }, + { + "ce_loss_13": 3.34853395819664, + "ce_loss_26": 2.8662350177764893, + "ce_loss_39": 2.340099334716797, + "ce_loss_52": 1.455578488111496, + "ce_loss_7": 3.6379907071590423, + "epoch": 0.429, + "grad_norm": 22.431607304032426, + "kl_loss_13": 3907.2, + "kl_loss_26": 2905.6, + "kl_loss_39": 1784.0, + "kl_loss_7": 4512.8, + "learning_rate": 0.0006194208090603844, + "loss": 6469.9, + "step": 4290 + }, + { + "ce_loss_13": 3.2378364205360413, + "ce_loss_26": 2.771626591682434, + "ce_loss_39": 2.2533502638339997, + "ce_loss_52": 1.4288378104567527, + "ce_loss_7": 3.523706406354904, + "epoch": 0.43, + "grad_norm": 19.478272699999504, + "kl_loss_13": 3771.6, + "kl_loss_26": 2794.0, + "kl_loss_39": 1687.8, + "kl_loss_7": 4368.8, + "learning_rate": 0.0006178794677547138, + "loss": 6399.1, + "step": 4300 + }, + { + "ce_loss_13": 3.3320172011852263, + "ce_loss_26": 2.8396951615810395, + "ce_loss_39": 2.305786609649658, + "ce_loss_52": 1.434322476387024, + "ce_loss_7": 3.622057580947876, + "epoch": 0.431, + "grad_norm": 21.288095261764262, + "kl_loss_13": 3913.6, + "kl_loss_26": 2898.0, + "kl_loss_39": 1761.0, + "kl_loss_7": 4522.8, + "learning_rate": 0.0006163369394041111, + "loss": 6430.5, + "step": 4310 + }, + { + "ce_loss_13": 3.2775667309761047, + "ce_loss_26": 2.79736185669899, + "ce_loss_39": 2.2730204701423644, + "ce_loss_52": 1.4288703322410583, + "ce_loss_7": 3.562648755311966, + "epoch": 0.432, + "grad_norm": 22.037654673864616, + "kl_loss_13": 3816.0, + "kl_loss_26": 2814.0, + "kl_loss_39": 1690.8, + "kl_loss_7": 4412.8, + "learning_rate": 0.0006147932395418205, + "loss": 6392.0, + "step": 4320 + }, + { + "ce_loss_13": 3.2999020636081697, + "ce_loss_26": 2.8127501010894775, + "ce_loss_39": 2.2844816505908967, + "ce_loss_52": 1.4203194737434388, + "ce_loss_7": 3.5926522493362425, + "epoch": 0.433, + "grad_norm": 23.105450049389578, + "kl_loss_13": 3851.2, + "kl_loss_26": 2850.0, + "kl_loss_39": 1733.2, + "kl_loss_7": 4468.8, + "learning_rate": 0.0006132483837128823, + "loss": 6416.4, + "step": 4330 + }, + { + "ce_loss_13": 3.314070051908493, + "ce_loss_26": 2.8264743953943254, + "ce_loss_39": 2.295166790485382, + "ce_loss_52": 1.4510357692837714, + "ce_loss_7": 3.5982041239738463, + "epoch": 0.434, + "grad_norm": 21.907818649309228, + "kl_loss_13": 3842.0, + "kl_loss_26": 2836.0, + "kl_loss_39": 1709.4, + "kl_loss_7": 4428.8, + "learning_rate": 0.0006117023874739772, + "loss": 6437.0, + "step": 4340 + }, + { + "ce_loss_13": 3.296399414539337, + "ce_loss_26": 2.8059658110141754, + "ce_loss_39": 2.271949994564056, + "ce_loss_52": 1.4154132261872292, + "ce_loss_7": 3.588996112346649, + "epoch": 0.435, + "grad_norm": 21.889212253545118, + "kl_loss_13": 3898.8, + "kl_loss_26": 2887.2, + "kl_loss_39": 1746.6, + "kl_loss_7": 4508.4, + "learning_rate": 0.0006101552663932703, + "loss": 6431.3, + "step": 4350 + }, + { + "ce_loss_13": 3.306167459487915, + "ce_loss_26": 2.8240922570228575, + "ce_loss_39": 2.2931392163038256, + "ce_loss_52": 1.4310514152050018, + "ce_loss_7": 3.596520256996155, + "epoch": 0.436, + "grad_norm": 21.05075754740656, + "kl_loss_13": 3860.0, + "kl_loss_26": 2856.4, + "kl_loss_39": 1737.6, + "kl_loss_7": 4459.2, + "learning_rate": 0.0006086070360502539, + "loss": 6370.7, + "step": 4360 + }, + { + "ce_loss_13": 3.2818971514701842, + "ce_loss_26": 2.8096925973892213, + "ce_loss_39": 2.2902305334806443, + "ce_loss_52": 1.457671320438385, + "ce_loss_7": 3.567191207408905, + "epoch": 0.437, + "grad_norm": 19.98373918443626, + "kl_loss_13": 3770.8, + "kl_loss_26": 2788.4, + "kl_loss_39": 1692.0, + "kl_loss_7": 4367.2, + "learning_rate": 0.0006070577120355903, + "loss": 6341.1, + "step": 4370 + }, + { + "ce_loss_13": 3.317246896028519, + "ce_loss_26": 2.84905891418457, + "ce_loss_39": 2.322802722454071, + "ce_loss_52": 1.4916655078530312, + "ce_loss_7": 3.593036550283432, + "epoch": 0.438, + "grad_norm": 20.010722762004242, + "kl_loss_13": 3781.2, + "kl_loss_26": 2799.2, + "kl_loss_39": 1686.2, + "kl_loss_7": 4360.0, + "learning_rate": 0.0006055073099509549, + "loss": 6355.5, + "step": 4380 + }, + { + "ce_loss_13": 3.2864172756671906, + "ce_loss_26": 2.8085566580295565, + "ce_loss_39": 2.2875818789005278, + "ce_loss_52": 1.4407859086990356, + "ce_loss_7": 3.5741762936115267, + "epoch": 0.439, + "grad_norm": 21.10222653748024, + "kl_loss_13": 3824.4, + "kl_loss_26": 2822.4, + "kl_loss_39": 1713.2, + "kl_loss_7": 4426.0, + "learning_rate": 0.0006039558454088796, + "loss": 6354.5, + "step": 4390 + }, + { + "ce_loss_13": 3.2985159277915956, + "ce_loss_26": 2.817130261659622, + "ce_loss_39": 2.2846481442451476, + "ce_loss_52": 1.4308176964521409, + "ce_loss_7": 3.5788950502872465, + "epoch": 0.44, + "grad_norm": 22.10167857860873, + "kl_loss_13": 3852.4, + "kl_loss_26": 2856.8, + "kl_loss_39": 1727.6, + "kl_loss_7": 4434.8, + "learning_rate": 0.0006024033340325954, + "loss": 6381.3, + "step": 4400 + }, + { + "ce_loss_13": 3.2772581815719604, + "ce_loss_26": 2.7952946066856383, + "ce_loss_39": 2.259748488664627, + "ce_loss_52": 1.4122898250818252, + "ce_loss_7": 3.564402920007706, + "epoch": 0.441, + "grad_norm": 21.766013784294948, + "kl_loss_13": 3854.4, + "kl_loss_26": 2847.6, + "kl_loss_39": 1727.0, + "kl_loss_7": 4457.2, + "learning_rate": 0.0006008497914558743, + "loss": 6338.3, + "step": 4410 + }, + { + "ce_loss_13": 3.310656875371933, + "ce_loss_26": 2.825407701730728, + "ce_loss_39": 2.3054873913526537, + "ce_loss_52": 1.4574851334095, + "ce_loss_7": 3.595499175786972, + "epoch": 0.442, + "grad_norm": 23.195817032303225, + "kl_loss_13": 3826.4, + "kl_loss_26": 2816.8, + "kl_loss_39": 1707.6, + "kl_loss_7": 4415.6, + "learning_rate": 0.0005992952333228728, + "loss": 6415.4, + "step": 4420 + }, + { + "ce_loss_13": 3.147319358587265, + "ce_loss_26": 2.6695513784885407, + "ce_loss_39": 2.153283026814461, + "ce_loss_52": 1.362196257710457, + "ce_loss_7": 3.4255874812602998, + "epoch": 0.443, + "grad_norm": 21.365484698145746, + "kl_loss_13": 3677.2, + "kl_loss_26": 2685.6, + "kl_loss_39": 1588.2, + "kl_loss_7": 4260.0, + "learning_rate": 0.0005977396752879741, + "loss": 6284.8, + "step": 4430 + }, + { + "ce_loss_13": 3.2730862379074095, + "ce_loss_26": 2.790391606092453, + "ce_loss_39": 2.257637658715248, + "ce_loss_52": 1.4242565602064132, + "ce_loss_7": 3.5511350512504576, + "epoch": 0.444, + "grad_norm": 20.84050157821156, + "kl_loss_13": 3810.8, + "kl_loss_26": 2811.6, + "kl_loss_39": 1686.6, + "kl_loss_7": 4399.2, + "learning_rate": 0.0005961831330156305, + "loss": 6282.4, + "step": 4440 + }, + { + "ce_loss_13": 3.303426647186279, + "ce_loss_26": 2.8194876074790955, + "ce_loss_39": 2.286717027425766, + "ce_loss_52": 1.4367665380239487, + "ce_loss_7": 3.589170789718628, + "epoch": 0.445, + "grad_norm": 22.09664086851361, + "kl_loss_13": 3832.4, + "kl_loss_26": 2828.0, + "kl_loss_39": 1702.2, + "kl_loss_7": 4435.6, + "learning_rate": 0.0005946256221802051, + "loss": 6310.7, + "step": 4450 + }, + { + "ce_loss_13": 3.2220256984233857, + "ce_loss_26": 2.747016179561615, + "ce_loss_39": 2.2203843981027602, + "ce_loss_52": 1.4144678741693497, + "ce_loss_7": 3.5019364655017853, + "epoch": 0.446, + "grad_norm": 20.80212123158213, + "kl_loss_13": 3738.0, + "kl_loss_26": 2737.6, + "kl_loss_39": 1635.2, + "kl_loss_7": 4324.0, + "learning_rate": 0.0005930671584658151, + "loss": 6275.9, + "step": 4460 + }, + { + "ce_loss_13": 3.262040966749191, + "ce_loss_26": 2.7848617672920226, + "ce_loss_39": 2.2554671108722686, + "ce_loss_52": 1.4137398272752761, + "ce_loss_7": 3.5524380266666413, + "epoch": 0.447, + "grad_norm": 21.070671360315192, + "kl_loss_13": 3794.4, + "kl_loss_26": 2805.6, + "kl_loss_39": 1697.0, + "kl_loss_7": 4396.4, + "learning_rate": 0.0005915077575661722, + "loss": 6360.7, + "step": 4470 + }, + { + "ce_loss_13": 3.2109140872955324, + "ce_loss_26": 2.7298059910535812, + "ce_loss_39": 2.2085951179265977, + "ce_loss_52": 1.3873766094446183, + "ce_loss_7": 3.4926227211952208, + "epoch": 0.448, + "grad_norm": 21.38920961497166, + "kl_loss_13": 3765.2, + "kl_loss_26": 2759.4, + "kl_loss_39": 1656.0, + "kl_loss_7": 4359.6, + "learning_rate": 0.000589947435184427, + "loss": 6255.15, + "step": 4480 + }, + { + "ce_loss_13": 3.2468604743480682, + "ce_loss_26": 2.7669826805591584, + "ce_loss_39": 2.2381924211978914, + "ce_loss_52": 1.4454800367355347, + "ce_loss_7": 3.5310903012752535, + "epoch": 0.449, + "grad_norm": 23.74435148547982, + "kl_loss_13": 3708.0, + "kl_loss_26": 2716.8, + "kl_loss_39": 1595.4, + "kl_loss_7": 4307.2, + "learning_rate": 0.0005883862070330078, + "loss": 6262.9, + "step": 4490 + }, + { + "ce_loss_13": 3.2490183234214784, + "ce_loss_26": 2.775378829240799, + "ce_loss_39": 2.259204548597336, + "ce_loss_52": 1.4259025424718856, + "ce_loss_7": 3.532491201162338, + "epoch": 0.45, + "grad_norm": 19.921742072679248, + "kl_loss_13": 3736.0, + "kl_loss_26": 2748.4, + "kl_loss_39": 1665.2, + "kl_loss_7": 4324.4, + "learning_rate": 0.0005868240888334653, + "loss": 6279.4, + "step": 4500 + }, + { + "ce_loss_13": 3.2022728264331817, + "ce_loss_26": 2.7251765221357345, + "ce_loss_39": 2.2201029896736144, + "ce_loss_52": 1.4243690267205238, + "ce_loss_7": 3.4856902956962585, + "epoch": 0.451, + "grad_norm": 22.336812688943994, + "kl_loss_13": 3701.6, + "kl_loss_26": 2716.6, + "kl_loss_39": 1627.0, + "kl_loss_7": 4291.6, + "learning_rate": 0.0005852610963163119, + "loss": 6274.9, + "step": 4510 + }, + { + "ce_loss_13": 3.2056246638298034, + "ce_loss_26": 2.735306566953659, + "ce_loss_39": 2.2278982251882553, + "ce_loss_52": 1.4315001338720321, + "ce_loss_7": 3.4921528518199922, + "epoch": 0.452, + "grad_norm": 21.324834799180188, + "kl_loss_13": 3671.2, + "kl_loss_26": 2692.4, + "kl_loss_39": 1610.6, + "kl_loss_7": 4263.2, + "learning_rate": 0.0005836972452208654, + "loss": 6241.8, + "step": 4520 + }, + { + "ce_loss_13": 3.2711530566215514, + "ce_loss_26": 2.794381695985794, + "ce_loss_39": 2.2675902634859084, + "ce_loss_52": 1.4365027844905853, + "ce_loss_7": 3.5576207876205443, + "epoch": 0.453, + "grad_norm": 21.88775011761487, + "kl_loss_13": 3792.8, + "kl_loss_26": 2804.8, + "kl_loss_39": 1700.8, + "kl_loss_7": 4388.0, + "learning_rate": 0.0005821325512950885, + "loss": 6283.8, + "step": 4530 + }, + { + "ce_loss_13": 3.2904800713062285, + "ce_loss_26": 2.8134379625320434, + "ce_loss_39": 2.2945436596870423, + "ce_loss_52": 1.476692470908165, + "ce_loss_7": 3.5703530073165894, + "epoch": 0.454, + "grad_norm": 20.942055908262446, + "kl_loss_13": 3751.6, + "kl_loss_26": 2758.0, + "kl_loss_39": 1655.0, + "kl_loss_7": 4336.0, + "learning_rate": 0.0005805670302954321, + "loss": 6268.6, + "step": 4540 + }, + { + "ce_loss_13": 3.2201909184455872, + "ce_loss_26": 2.737530159950256, + "ce_loss_39": 2.2144886016845704, + "ce_loss_52": 1.4106894597411155, + "ce_loss_7": 3.5021199345588685, + "epoch": 0.455, + "grad_norm": 21.81892564093558, + "kl_loss_13": 3758.8, + "kl_loss_26": 2750.8, + "kl_loss_39": 1624.2, + "kl_loss_7": 4358.4, + "learning_rate": 0.000579000697986675, + "loss": 6232.8, + "step": 4550 + }, + { + "ce_loss_13": 3.2489894032478333, + "ce_loss_26": 2.780492717027664, + "ce_loss_39": 2.2484638780355453, + "ce_loss_52": 1.4397764205932617, + "ce_loss_7": 3.5364687144756317, + "epoch": 0.456, + "grad_norm": 21.02937623207538, + "kl_loss_13": 3754.8, + "kl_loss_26": 2776.8, + "kl_loss_39": 1651.6, + "kl_loss_7": 4345.6, + "learning_rate": 0.0005774335701417662, + "loss": 6241.6, + "step": 4560 + }, + { + "ce_loss_13": 3.2101799607276917, + "ce_loss_26": 2.7431035935878754, + "ce_loss_39": 2.2130339086055755, + "ce_loss_52": 1.4163681983947753, + "ce_loss_7": 3.4954857528209686, + "epoch": 0.457, + "grad_norm": 19.83299768002262, + "kl_loss_13": 3705.2, + "kl_loss_26": 2723.2, + "kl_loss_39": 1618.4, + "kl_loss_7": 4301.2, + "learning_rate": 0.0005758656625416658, + "loss": 6247.2, + "step": 4570 + }, + { + "ce_loss_13": 3.2813266932964327, + "ce_loss_26": 2.793111354112625, + "ce_loss_39": 2.2625322908163072, + "ce_loss_52": 1.4499622374773025, + "ce_loss_7": 3.5644542396068575, + "epoch": 0.458, + "grad_norm": 21.53754747250698, + "kl_loss_13": 3797.6, + "kl_loss_26": 2782.8, + "kl_loss_39": 1647.4, + "kl_loss_7": 4391.2, + "learning_rate": 0.0005742969909751859, + "loss": 6266.1, + "step": 4580 + }, + { + "ce_loss_13": 3.3478006780147553, + "ce_loss_26": 2.8727428793907164, + "ce_loss_39": 2.3303479075431826, + "ce_loss_52": 1.4829013347625732, + "ce_loss_7": 3.6364180862903597, + "epoch": 0.459, + "grad_norm": 21.34326820051386, + "kl_loss_13": 3844.0, + "kl_loss_26": 2840.4, + "kl_loss_39": 1703.4, + "kl_loss_7": 4441.6, + "learning_rate": 0.0005727275712388318, + "loss": 6209.7, + "step": 4590 + }, + { + "ce_loss_13": 3.285200160741806, + "ce_loss_26": 2.807385641336441, + "ce_loss_39": 2.275821554660797, + "ce_loss_52": 1.4415770262479781, + "ce_loss_7": 3.563661777973175, + "epoch": 0.46, + "grad_norm": 20.952340509651894, + "kl_loss_13": 3807.6, + "kl_loss_26": 2814.0, + "kl_loss_39": 1683.4, + "kl_loss_7": 4390.0, + "learning_rate": 0.0005711574191366427, + "loss": 6174.2, + "step": 4600 + }, + { + "ce_loss_13": 3.2419202089309693, + "ce_loss_26": 2.7672870814800263, + "ce_loss_39": 2.2521429657936096, + "ce_loss_52": 1.4457757875323296, + "ce_loss_7": 3.5175400972366333, + "epoch": 0.461, + "grad_norm": 20.667083707625775, + "kl_loss_13": 3685.6, + "kl_loss_26": 2706.8, + "kl_loss_39": 1618.2, + "kl_loss_7": 4262.4, + "learning_rate": 0.0005695865504800327, + "loss": 6154.6, + "step": 4610 + }, + { + "ce_loss_13": 3.2078490257263184, + "ce_loss_26": 2.7373934209346773, + "ce_loss_39": 2.2357589691877364, + "ce_loss_52": 1.4443277925252915, + "ce_loss_7": 3.4876007556915285, + "epoch": 0.462, + "grad_norm": 21.07091453525893, + "kl_loss_13": 3645.6, + "kl_loss_26": 2666.8, + "kl_loss_39": 1598.2, + "kl_loss_7": 4226.0, + "learning_rate": 0.0005680149810876322, + "loss": 6178.4, + "step": 4620 + }, + { + "ce_loss_13": 3.2486107409000398, + "ce_loss_26": 2.7601176381111143, + "ce_loss_39": 2.227588337659836, + "ce_loss_52": 1.4004375696182252, + "ce_loss_7": 3.532775843143463, + "epoch": 0.463, + "grad_norm": 21.569532668695917, + "kl_loss_13": 3786.8, + "kl_loss_26": 2780.4, + "kl_loss_39": 1664.6, + "kl_loss_7": 4386.0, + "learning_rate": 0.0005664427267851271, + "loss": 6215.6, + "step": 4630 + }, + { + "ce_loss_13": 3.2331403851509095, + "ce_loss_26": 2.7579665184020996, + "ce_loss_39": 2.233389773964882, + "ce_loss_52": 1.4353074416518212, + "ce_loss_7": 3.5171454668045046, + "epoch": 0.464, + "grad_norm": 21.55234160622014, + "kl_loss_13": 3697.6, + "kl_loss_26": 2708.8, + "kl_loss_39": 1599.2, + "kl_loss_7": 4292.8, + "learning_rate": 0.0005648698034051009, + "loss": 6233.0, + "step": 4640 + }, + { + "ce_loss_13": 3.264119005203247, + "ce_loss_26": 2.784970927238464, + "ce_loss_39": 2.2625187635421753, + "ce_loss_52": 1.4551048219203948, + "ce_loss_7": 3.536862540245056, + "epoch": 0.465, + "grad_norm": 21.943732618524454, + "kl_loss_13": 3715.6, + "kl_loss_26": 2727.6, + "kl_loss_39": 1626.8, + "kl_loss_7": 4294.8, + "learning_rate": 0.0005632962267868747, + "loss": 6180.9, + "step": 4650 + }, + { + "ce_loss_13": 3.124424380064011, + "ce_loss_26": 2.670001748204231, + "ce_loss_39": 2.161810302734375, + "ce_loss_52": 1.3920277938246728, + "ce_loss_7": 3.4072051107883454, + "epoch": 0.466, + "grad_norm": 19.992372851535457, + "kl_loss_13": 3608.0, + "kl_loss_26": 2641.4, + "kl_loss_39": 1573.8, + "kl_loss_7": 4190.0, + "learning_rate": 0.0005617220127763474, + "loss": 6158.8, + "step": 4660 + }, + { + "ce_loss_13": 3.2301677465438843, + "ce_loss_26": 2.7488952726125717, + "ce_loss_39": 2.2431567162275314, + "ce_loss_52": 1.4423212110996246, + "ce_loss_7": 3.5069880545139314, + "epoch": 0.467, + "grad_norm": 20.78261479761198, + "kl_loss_13": 3680.4, + "kl_loss_26": 2686.0, + "kl_loss_39": 1603.8, + "kl_loss_7": 4262.0, + "learning_rate": 0.0005601471772258368, + "loss": 6129.5, + "step": 4670 + }, + { + "ce_loss_13": 3.2057377636432647, + "ce_loss_26": 2.7412378191947937, + "ce_loss_39": 2.2341296702623366, + "ce_loss_52": 1.4304189920425414, + "ce_loss_7": 3.4817180752754213, + "epoch": 0.468, + "grad_norm": 20.848135049030358, + "kl_loss_13": 3672.8, + "kl_loss_26": 2706.4, + "kl_loss_39": 1624.6, + "kl_loss_7": 4245.6, + "learning_rate": 0.0005585717359939192, + "loss": 6123.3, + "step": 4680 + }, + { + "ce_loss_13": 3.233567637205124, + "ce_loss_26": 2.768052551150322, + "ce_loss_39": 2.25777924656868, + "ce_loss_52": 1.4504828751087189, + "ce_loss_7": 3.5125366508960725, + "epoch": 0.469, + "grad_norm": 20.709517983153315, + "kl_loss_13": 3660.4, + "kl_loss_26": 2696.0, + "kl_loss_39": 1619.6, + "kl_loss_7": 4241.2, + "learning_rate": 0.0005569957049452703, + "loss": 6101.6, + "step": 4690 + }, + { + "ce_loss_13": 3.2725342512130737, + "ce_loss_26": 2.7866754591464997, + "ce_loss_39": 2.2480324536561964, + "ce_loss_52": 1.404937854409218, + "ce_loss_7": 3.5641084611415863, + "epoch": 0.47, + "grad_norm": 20.633433200619724, + "kl_loss_13": 3863.2, + "kl_loss_26": 2851.6, + "kl_loss_39": 1720.6, + "kl_loss_7": 4468.0, + "learning_rate": 0.0005554190999505056, + "loss": 6211.0, + "step": 4700 + }, + { + "ce_loss_13": 3.2052918612957, + "ce_loss_26": 2.732904624938965, + "ce_loss_39": 2.2116665810346605, + "ce_loss_52": 1.4257652133703231, + "ce_loss_7": 3.4880705952644346, + "epoch": 0.471, + "grad_norm": 20.706622626666558, + "kl_loss_13": 3660.0, + "kl_loss_26": 2670.8, + "kl_loss_39": 1571.8, + "kl_loss_7": 4252.4, + "learning_rate": 0.0005538419368860196, + "loss": 6097.3, + "step": 4710 + }, + { + "ce_loss_13": 3.201172482967377, + "ce_loss_26": 2.726384937763214, + "ce_loss_39": 2.198946151137352, + "ce_loss_52": 1.4113327443599701, + "ce_loss_7": 3.487533462047577, + "epoch": 0.472, + "grad_norm": 21.400203482886983, + "kl_loss_13": 3675.6, + "kl_loss_26": 2693.2, + "kl_loss_39": 1591.2, + "kl_loss_7": 4272.0, + "learning_rate": 0.0005522642316338268, + "loss": 6121.3, + "step": 4720 + }, + { + "ce_loss_13": 3.224886018037796, + "ce_loss_26": 2.7551407277584077, + "ce_loss_39": 2.2328554034233092, + "ce_loss_52": 1.4563202857971191, + "ce_loss_7": 3.5022718131542208, + "epoch": 0.473, + "grad_norm": 21.66295321363831, + "kl_loss_13": 3648.0, + "kl_loss_26": 2662.8, + "kl_loss_39": 1575.8, + "kl_loss_7": 4219.6, + "learning_rate": 0.0005506860000814017, + "loss": 6051.3, + "step": 4730 + }, + { + "ce_loss_13": 3.2025643050670625, + "ce_loss_26": 2.7286852061748506, + "ce_loss_39": 2.21729561984539, + "ce_loss_52": 1.453881350159645, + "ce_loss_7": 3.4832063794136046, + "epoch": 0.474, + "grad_norm": 20.603608171505254, + "kl_loss_13": 3623.6, + "kl_loss_26": 2642.4, + "kl_loss_39": 1549.2, + "kl_loss_7": 4212.8, + "learning_rate": 0.0005491072581215186, + "loss": 6098.5, + "step": 4740 + }, + { + "ce_loss_13": 3.212699604034424, + "ce_loss_26": 2.732209050655365, + "ce_loss_39": 2.2089115262031553, + "ce_loss_52": 1.4185278177261353, + "ce_loss_7": 3.4970239818096163, + "epoch": 0.475, + "grad_norm": 20.455172908061705, + "kl_loss_13": 3701.2, + "kl_loss_26": 2708.4, + "kl_loss_39": 1600.0, + "kl_loss_7": 4294.0, + "learning_rate": 0.0005475280216520913, + "loss": 6092.7, + "step": 4750 + }, + { + "ce_loss_13": 3.161313956975937, + "ce_loss_26": 2.6922851324081423, + "ce_loss_39": 2.185682702064514, + "ce_loss_52": 1.4132045745849608, + "ce_loss_7": 3.4361780524253844, + "epoch": 0.476, + "grad_norm": 21.048159476746886, + "kl_loss_13": 3632.8, + "kl_loss_26": 2655.2, + "kl_loss_39": 1574.6, + "kl_loss_7": 4208.4, + "learning_rate": 0.0005459483065760138, + "loss": 6159.3, + "step": 4760 + }, + { + "ce_loss_13": 3.2172334492206573, + "ce_loss_26": 2.739536887407303, + "ce_loss_39": 2.2192301630973814, + "ce_loss_52": 1.4282803654670715, + "ce_loss_7": 3.5064621806144713, + "epoch": 0.477, + "grad_norm": 20.62269454966768, + "kl_loss_13": 3708.4, + "kl_loss_26": 2710.0, + "kl_loss_39": 1610.2, + "kl_loss_7": 4301.6, + "learning_rate": 0.0005443681288009991, + "loss": 6104.1, + "step": 4770 + }, + { + "ce_loss_13": 3.217255789041519, + "ce_loss_26": 2.7369415044784544, + "ce_loss_39": 2.2001087069511414, + "ce_loss_52": 1.401385571062565, + "ce_loss_7": 3.5032376050949097, + "epoch": 0.478, + "grad_norm": 20.40853719962087, + "kl_loss_13": 3760.0, + "kl_loss_26": 2763.2, + "kl_loss_39": 1625.0, + "kl_loss_7": 4353.2, + "learning_rate": 0.0005427875042394199, + "loss": 6064.0, + "step": 4780 + }, + { + "ce_loss_13": 3.199622023105621, + "ce_loss_26": 2.734530872106552, + "ce_loss_39": 2.2225404649972917, + "ce_loss_52": 1.4552370458841324, + "ce_loss_7": 3.475612831115723, + "epoch": 0.479, + "grad_norm": 21.111949592982874, + "kl_loss_13": 3598.8, + "kl_loss_26": 2623.2, + "kl_loss_39": 1545.4, + "kl_loss_7": 4177.6, + "learning_rate": 0.0005412064488081482, + "loss": 6074.2, + "step": 4790 + }, + { + "ce_loss_13": 3.1485446810722353, + "ce_loss_26": 2.6845290422439576, + "ce_loss_39": 2.1669752955436707, + "ce_loss_52": 1.4116393029689789, + "ce_loss_7": 3.423712509870529, + "epoch": 0.48, + "grad_norm": 20.355440951189763, + "kl_loss_13": 3606.4, + "kl_loss_26": 2638.4, + "kl_loss_39": 1543.2, + "kl_loss_7": 4182.4, + "learning_rate": 0.0005396249784283942, + "loss": 6051.0, + "step": 4800 + }, + { + "ce_loss_13": 3.1923361301422117, + "ce_loss_26": 2.71637277007103, + "ce_loss_39": 2.1978514790534973, + "ce_loss_52": 1.4327621147036553, + "ce_loss_7": 3.47632372379303, + "epoch": 0.481, + "grad_norm": 22.229636039543518, + "kl_loss_13": 3644.0, + "kl_loss_26": 2653.2, + "kl_loss_39": 1555.8, + "kl_loss_7": 4232.8, + "learning_rate": 0.0005380431090255476, + "loss": 6143.3, + "step": 4810 + }, + { + "ce_loss_13": 3.232038801908493, + "ce_loss_26": 2.7648274183273314, + "ce_loss_39": 2.2573492497205736, + "ce_loss_52": 1.4371329843997955, + "ce_loss_7": 3.5092617154121397, + "epoch": 0.482, + "grad_norm": 21.36587062891148, + "kl_loss_13": 3704.8, + "kl_loss_26": 2740.8, + "kl_loss_39": 1651.2, + "kl_loss_7": 4281.6, + "learning_rate": 0.0005364608565290155, + "loss": 6031.2, + "step": 4820 + }, + { + "ce_loss_13": 3.250445681810379, + "ce_loss_26": 2.773394727706909, + "ce_loss_39": 2.243200385570526, + "ce_loss_52": 1.4556994497776032, + "ce_loss_7": 3.536140114068985, + "epoch": 0.483, + "grad_norm": 20.760727607225178, + "kl_loss_13": 3706.4, + "kl_loss_26": 2711.2, + "kl_loss_39": 1595.8, + "kl_loss_7": 4300.0, + "learning_rate": 0.0005348782368720626, + "loss": 6094.3, + "step": 4830 + }, + { + "ce_loss_13": 3.2234844088554384, + "ce_loss_26": 2.7573211640119553, + "ce_loss_39": 2.244653856754303, + "ce_loss_52": 1.4427421689033508, + "ce_loss_7": 3.497196841239929, + "epoch": 0.484, + "grad_norm": 20.726340018616938, + "kl_loss_13": 3685.6, + "kl_loss_26": 2708.0, + "kl_loss_39": 1610.0, + "kl_loss_7": 4262.4, + "learning_rate": 0.000533295265991652, + "loss": 6062.9, + "step": 4840 + }, + { + "ce_loss_13": 3.152006584405899, + "ce_loss_26": 2.678810328245163, + "ce_loss_39": 2.166279435157776, + "ce_loss_52": 1.3957384467124938, + "ce_loss_7": 3.4304138660430907, + "epoch": 0.485, + "grad_norm": 21.522248137711077, + "kl_loss_13": 3630.8, + "kl_loss_26": 2639.0, + "kl_loss_39": 1547.4, + "kl_loss_7": 4218.0, + "learning_rate": 0.0005317119598282822, + "loss": 6033.9, + "step": 4850 + }, + { + "ce_loss_13": 3.2258131086826323, + "ce_loss_26": 2.7538253903388976, + "ce_loss_39": 2.237151172757149, + "ce_loss_52": 1.4640001267194749, + "ce_loss_7": 3.493991768360138, + "epoch": 0.486, + "grad_norm": 19.71152116189248, + "kl_loss_13": 3656.8, + "kl_loss_26": 2681.2, + "kl_loss_39": 1581.0, + "kl_loss_7": 4221.6, + "learning_rate": 0.0005301283343258293, + "loss": 6062.4, + "step": 4860 + }, + { + "ce_loss_13": 3.188614493608475, + "ce_loss_26": 2.7119751185178758, + "ce_loss_39": 2.189143994450569, + "ce_loss_52": 1.422459150850773, + "ce_loss_7": 3.468545514345169, + "epoch": 0.487, + "grad_norm": 20.57479406007355, + "kl_loss_13": 3638.4, + "kl_loss_26": 2642.2, + "kl_loss_39": 1539.5, + "kl_loss_7": 4222.0, + "learning_rate": 0.000528544405431384, + "loss": 6047.1, + "step": 4870 + }, + { + "ce_loss_13": 3.1630406379699707, + "ce_loss_26": 2.6917948126792908, + "ce_loss_39": 2.187080183625221, + "ce_loss_52": 1.430996198952198, + "ce_loss_7": 3.437908464670181, + "epoch": 0.488, + "grad_norm": 20.266256252328688, + "kl_loss_13": 3589.2, + "kl_loss_26": 2606.0, + "kl_loss_39": 1535.4, + "kl_loss_7": 4160.0, + "learning_rate": 0.000526960189095093, + "loss": 6056.9, + "step": 4880 + }, + { + "ce_loss_13": 3.1363641381263734, + "ce_loss_26": 2.6843234658241273, + "ce_loss_39": 2.1881404638290407, + "ce_loss_52": 1.4287778049707414, + "ce_loss_7": 3.403191590309143, + "epoch": 0.489, + "grad_norm": 20.737227217707265, + "kl_loss_13": 3534.8, + "kl_loss_26": 2597.2, + "kl_loss_39": 1535.4, + "kl_loss_7": 4094.0, + "learning_rate": 0.0005253757012699972, + "loss": 6013.8, + "step": 4890 + }, + { + "ce_loss_13": 3.2066462457180025, + "ce_loss_26": 2.7318670630455015, + "ce_loss_39": 2.2165933042764663, + "ce_loss_52": 1.4400058209896087, + "ce_loss_7": 3.4839820206165313, + "epoch": 0.49, + "grad_norm": 20.942033187869956, + "kl_loss_13": 3651.6, + "kl_loss_26": 2660.8, + "kl_loss_39": 1565.0, + "kl_loss_7": 4233.6, + "learning_rate": 0.0005237909579118712, + "loss": 5973.0, + "step": 4900 + }, + { + "ce_loss_13": 3.2107683062553405, + "ce_loss_26": 2.723215198516846, + "ce_loss_39": 2.2134319245815277, + "ce_loss_52": 1.4413373351097107, + "ce_loss_7": 3.498019593954086, + "epoch": 0.491, + "grad_norm": 19.852386563944762, + "kl_loss_13": 3647.2, + "kl_loss_26": 2634.4, + "kl_loss_39": 1545.0, + "kl_loss_7": 4246.4, + "learning_rate": 0.0005222059749790631, + "loss": 5997.8, + "step": 4910 + }, + { + "ce_loss_13": 3.2151939988136293, + "ce_loss_26": 2.7481437802314757, + "ce_loss_39": 2.2312227368354796, + "ce_loss_52": 1.4501317411661148, + "ce_loss_7": 3.4992719650268556, + "epoch": 0.492, + "grad_norm": 21.940050434667352, + "kl_loss_13": 3627.6, + "kl_loss_26": 2655.2, + "kl_loss_39": 1571.4, + "kl_loss_7": 4213.6, + "learning_rate": 0.0005206207684323337, + "loss": 5989.6, + "step": 4920 + }, + { + "ce_loss_13": 3.14618239402771, + "ce_loss_26": 2.6644466161727904, + "ce_loss_39": 2.1522305369377137, + "ce_loss_52": 1.4040746569633484, + "ce_loss_7": 3.430801051855087, + "epoch": 0.493, + "grad_norm": 21.866819048126402, + "kl_loss_13": 3609.6, + "kl_loss_26": 2614.8, + "kl_loss_39": 1523.4, + "kl_loss_7": 4205.6, + "learning_rate": 0.000519035354234695, + "loss": 5971.3, + "step": 4930 + }, + { + "ce_loss_13": 3.2634301006793978, + "ce_loss_26": 2.7843244314193725, + "ce_loss_39": 2.264421299099922, + "ce_loss_52": 1.4652716666460037, + "ce_loss_7": 3.5409990727901457, + "epoch": 0.494, + "grad_norm": 22.02758833423459, + "kl_loss_13": 3711.6, + "kl_loss_26": 2710.6, + "kl_loss_39": 1607.3, + "kl_loss_7": 4290.4, + "learning_rate": 0.0005174497483512506, + "loss": 6017.3, + "step": 4940 + }, + { + "ce_loss_13": 3.2103551268577575, + "ce_loss_26": 2.7440166890621187, + "ce_loss_39": 2.2352791130542755, + "ce_loss_52": 1.4523784220218658, + "ce_loss_7": 3.486135560274124, + "epoch": 0.495, + "grad_norm": 23.749732875931574, + "kl_loss_13": 3618.0, + "kl_loss_26": 2656.6, + "kl_loss_39": 1573.0, + "kl_loss_7": 4201.2, + "learning_rate": 0.0005158639667490339, + "loss": 5989.9, + "step": 4950 + }, + { + "ce_loss_13": 3.118060350418091, + "ce_loss_26": 2.6498723566532134, + "ce_loss_39": 2.1354818284511565, + "ce_loss_52": 1.3788613289594651, + "ce_loss_7": 3.403479200601578, + "epoch": 0.496, + "grad_norm": 20.524190680845354, + "kl_loss_13": 3599.2, + "kl_loss_26": 2620.4, + "kl_loss_39": 1541.0, + "kl_loss_7": 4189.6, + "learning_rate": 0.0005142780253968481, + "loss": 5973.2, + "step": 4960 + }, + { + "ce_loss_13": 3.1573639094829558, + "ce_loss_26": 2.6952777743339538, + "ce_loss_39": 2.184722366929054, + "ce_loss_52": 1.4360924899578094, + "ce_loss_7": 3.4324650526046754, + "epoch": 0.497, + "grad_norm": 21.848899371522094, + "kl_loss_13": 3594.0, + "kl_loss_26": 2620.8, + "kl_loss_39": 1532.4, + "kl_loss_7": 4163.6, + "learning_rate": 0.0005126919402651053, + "loss": 5950.9, + "step": 4970 + }, + { + "ce_loss_13": 3.148969703912735, + "ce_loss_26": 2.6792631447315216, + "ce_loss_39": 2.168180876970291, + "ce_loss_52": 1.4135416984558105, + "ce_loss_7": 3.4292350709438324, + "epoch": 0.498, + "grad_norm": 21.082558392676134, + "kl_loss_13": 3576.4, + "kl_loss_26": 2608.8, + "kl_loss_39": 1527.0, + "kl_loss_7": 4166.4, + "learning_rate": 0.0005111057273256647, + "loss": 5924.7, + "step": 4980 + }, + { + "ce_loss_13": 3.1738288044929504, + "ce_loss_26": 2.7059105813503264, + "ce_loss_39": 2.194283801317215, + "ce_loss_52": 1.4388620942831039, + "ce_loss_7": 3.461875486373901, + "epoch": 0.499, + "grad_norm": 21.062049454907296, + "kl_loss_13": 3574.8, + "kl_loss_26": 2594.8, + "kl_loss_39": 1521.0, + "kl_loss_7": 4165.2, + "learning_rate": 0.0005095194025516733, + "loss": 5935.8, + "step": 4990 + }, + { + "ce_loss_13": 3.2131927073001862, + "ce_loss_26": 2.7427126079797746, + "ce_loss_39": 2.2358015894889833, + "ce_loss_52": 1.4697326198220253, + "ce_loss_7": 3.48975727558136, + "epoch": 0.5, + "grad_norm": 19.951336775236356, + "kl_loss_13": 3620.8, + "kl_loss_26": 2633.2, + "kl_loss_39": 1549.0, + "kl_loss_7": 4195.6, + "learning_rate": 0.000507932981917404, + "loss": 5955.8, + "step": 5000 + }, + { + "ce_loss_13": 3.064238077402115, + "ce_loss_26": 2.605297487974167, + "ce_loss_39": 2.106147512793541, + "ce_loss_52": 1.3699454009532928, + "ce_loss_7": 3.3354556441307066, + "epoch": 0.501, + "grad_norm": 22.434294806872032, + "kl_loss_13": 3513.6, + "kl_loss_26": 2556.0, + "kl_loss_39": 1493.0, + "kl_loss_7": 4079.2, + "learning_rate": 0.0005063464813980949, + "loss": 5921.7, + "step": 5010 + }, + { + "ce_loss_13": 3.120131802558899, + "ce_loss_26": 2.6457916140556335, + "ce_loss_39": 2.136381095647812, + "ce_loss_52": 1.3973354250192642, + "ce_loss_7": 3.3927165508270263, + "epoch": 0.502, + "grad_norm": 20.534145152408744, + "kl_loss_13": 3558.4, + "kl_loss_26": 2578.4, + "kl_loss_39": 1497.8, + "kl_loss_7": 4132.8, + "learning_rate": 0.0005047599169697884, + "loss": 5945.8, + "step": 5020 + }, + { + "ce_loss_13": 3.155858016014099, + "ce_loss_26": 2.683425110578537, + "ce_loss_39": 2.1650480359792708, + "ce_loss_52": 1.426735344529152, + "ce_loss_7": 3.4369399666786196, + "epoch": 0.503, + "grad_norm": 20.583275474881205, + "kl_loss_13": 3593.2, + "kl_loss_26": 2606.4, + "kl_loss_39": 1509.8, + "kl_loss_7": 4176.0, + "learning_rate": 0.000503173304609171, + "loss": 5949.4, + "step": 5030 + }, + { + "ce_loss_13": 3.224624240398407, + "ce_loss_26": 2.742176574468613, + "ce_loss_39": 2.2156084358692167, + "ce_loss_52": 1.4474807173013686, + "ce_loss_7": 3.508391612768173, + "epoch": 0.504, + "grad_norm": 20.860727938912092, + "kl_loss_13": 3678.4, + "kl_loss_26": 2680.8, + "kl_loss_39": 1568.2, + "kl_loss_7": 4265.2, + "learning_rate": 0.0005015866602934111, + "loss": 5957.4, + "step": 5040 + }, + { + "ce_loss_13": 3.1227844834327696, + "ce_loss_26": 2.661714029312134, + "ce_loss_39": 2.1653817743062973, + "ce_loss_52": 1.4367017298936844, + "ce_loss_7": 3.3902225315570833, + "epoch": 0.505, + "grad_norm": 19.797756106985307, + "kl_loss_13": 3490.8, + "kl_loss_26": 2530.0, + "kl_loss_39": 1472.6, + "kl_loss_7": 4055.6, + "learning_rate": 0.0005, + "loss": 5927.3, + "step": 5050 + }, + { + "ce_loss_13": 3.177449029684067, + "ce_loss_26": 2.716005155444145, + "ce_loss_39": 2.205168914794922, + "ce_loss_52": 1.442250807583332, + "ce_loss_7": 3.453594130277634, + "epoch": 0.506, + "grad_norm": 20.433104016560257, + "kl_loss_13": 3588.0, + "kl_loss_26": 2622.2, + "kl_loss_39": 1546.2, + "kl_loss_7": 4161.6, + "learning_rate": 0.0004984133397065889, + "loss": 5913.9, + "step": 5060 + }, + { + "ce_loss_13": 3.1423951983451843, + "ce_loss_26": 2.674048882722855, + "ce_loss_39": 2.1621575862169267, + "ce_loss_52": 1.4322402387857438, + "ce_loss_7": 3.416734743118286, + "epoch": 0.507, + "grad_norm": 20.444836977919294, + "kl_loss_13": 3545.2, + "kl_loss_26": 2564.8, + "kl_loss_39": 1486.0, + "kl_loss_7": 4119.6, + "learning_rate": 0.0004968266953908291, + "loss": 5880.6, + "step": 5070 + }, + { + "ce_loss_13": 3.070253336429596, + "ce_loss_26": 2.6069840848445893, + "ce_loss_39": 2.1048853427171705, + "ce_loss_52": 1.3886964708566665, + "ce_loss_7": 3.350748908519745, + "epoch": 0.508, + "grad_norm": 21.18309883625556, + "kl_loss_13": 3487.2, + "kl_loss_26": 2520.8, + "kl_loss_39": 1460.8, + "kl_loss_7": 4069.6, + "learning_rate": 0.0004952400830302117, + "loss": 5885.3, + "step": 5080 + }, + { + "ce_loss_13": 3.077636110782623, + "ce_loss_26": 2.6177482545375823, + "ce_loss_39": 2.1197337061166763, + "ce_loss_52": 1.3917517423629762, + "ce_loss_7": 3.3575133979320526, + "epoch": 0.509, + "grad_norm": 19.77626859818484, + "kl_loss_13": 3496.0, + "kl_loss_26": 2536.8, + "kl_loss_39": 1477.0, + "kl_loss_7": 4073.6, + "learning_rate": 0.0004936535186019053, + "loss": 5872.1, + "step": 5090 + }, + { + "ce_loss_13": 3.178175300359726, + "ce_loss_26": 2.701016789674759, + "ce_loss_39": 2.1893070548772813, + "ce_loss_52": 1.4174780696630478, + "ce_loss_7": 3.4626995623111725, + "epoch": 0.51, + "grad_norm": 19.62760678285289, + "kl_loss_13": 3640.4, + "kl_loss_26": 2642.0, + "kl_loss_39": 1551.2, + "kl_loss_7": 4232.4, + "learning_rate": 0.000492067018082596, + "loss": 5937.7, + "step": 5100 + }, + { + "ce_loss_13": 3.169191563129425, + "ce_loss_26": 2.708436530828476, + "ce_loss_39": 2.1886946499347686, + "ce_loss_52": 1.4288851469755173, + "ce_loss_7": 3.4514395534992217, + "epoch": 0.511, + "grad_norm": 20.552371425986603, + "kl_loss_13": 3588.0, + "kl_loss_26": 2634.8, + "kl_loss_39": 1546.0, + "kl_loss_7": 4178.4, + "learning_rate": 0.0004904805974483267, + "loss": 5867.4, + "step": 5110 + }, + { + "ce_loss_13": 3.152217388153076, + "ce_loss_26": 2.6922530949115755, + "ce_loss_39": 2.181437623500824, + "ce_loss_52": 1.4460914835333825, + "ce_loss_7": 3.429844158887863, + "epoch": 0.512, + "grad_norm": 20.36189036420726, + "kl_loss_13": 3504.0, + "kl_loss_26": 2543.2, + "kl_loss_39": 1481.0, + "kl_loss_7": 4075.2, + "learning_rate": 0.0004888942726743353, + "loss": 5848.7, + "step": 5120 + }, + { + "ce_loss_13": 3.1243839859962463, + "ce_loss_26": 2.6554999887943267, + "ce_loss_39": 2.1515370845794677, + "ce_loss_52": 1.4089675784111022, + "ce_loss_7": 3.4115704774856566, + "epoch": 0.513, + "grad_norm": 20.218379503515084, + "kl_loss_13": 3534.0, + "kl_loss_26": 2557.4, + "kl_loss_39": 1487.0, + "kl_loss_7": 4121.6, + "learning_rate": 0.0004873080597348947, + "loss": 5856.5, + "step": 5130 + }, + { + "ce_loss_13": 3.223481798171997, + "ce_loss_26": 2.754591333866119, + "ce_loss_39": 2.236158034205437, + "ce_loss_52": 1.4544740557670592, + "ce_loss_7": 3.5026029109954835, + "epoch": 0.514, + "grad_norm": 20.810099829480396, + "kl_loss_13": 3656.8, + "kl_loss_26": 2670.8, + "kl_loss_39": 1575.8, + "kl_loss_7": 4237.2, + "learning_rate": 0.0004857219746031519, + "loss": 5882.8, + "step": 5140 + }, + { + "ce_loss_13": 3.146134835481644, + "ce_loss_26": 2.680527698993683, + "ce_loss_39": 2.168422257900238, + "ce_loss_52": 1.4367385059595108, + "ce_loss_7": 3.423633599281311, + "epoch": 0.515, + "grad_norm": 20.424421047481275, + "kl_loss_13": 3516.8, + "kl_loss_26": 2539.6, + "kl_loss_39": 1472.0, + "kl_loss_7": 4094.8, + "learning_rate": 0.0004841360332509663, + "loss": 5895.45, + "step": 5150 + }, + { + "ce_loss_13": 3.1724873065948485, + "ce_loss_26": 2.7010417520999908, + "ce_loss_39": 2.188371130824089, + "ce_loss_52": 1.4464277178049088, + "ce_loss_7": 3.4471147775650026, + "epoch": 0.516, + "grad_norm": 20.221560032680266, + "kl_loss_13": 3555.2, + "kl_loss_26": 2575.2, + "kl_loss_39": 1495.4, + "kl_loss_7": 4124.8, + "learning_rate": 0.0004825502516487497, + "loss": 5883.8, + "step": 5160 + }, + { + "ce_loss_13": 3.1568028390407563, + "ce_loss_26": 2.695826065540314, + "ce_loss_39": 2.1907377928495406, + "ce_loss_52": 1.467595374584198, + "ce_loss_7": 3.4282791554927825, + "epoch": 0.517, + "grad_norm": 21.375776905692355, + "kl_loss_13": 3490.8, + "kl_loss_26": 2528.8, + "kl_loss_39": 1463.6, + "kl_loss_7": 4049.2, + "learning_rate": 0.00048096464576530507, + "loss": 5813.7, + "step": 5170 + }, + { + "ce_loss_13": 3.0959118604660034, + "ce_loss_26": 2.6389028072357177, + "ce_loss_39": 2.1422775775194167, + "ce_loss_52": 1.4158973768353462, + "ce_loss_7": 3.3716647744178774, + "epoch": 0.518, + "grad_norm": 20.796367284367612, + "kl_loss_13": 3490.0, + "kl_loss_26": 2527.0, + "kl_loss_39": 1467.8, + "kl_loss_7": 4068.4, + "learning_rate": 0.00047937923156766646, + "loss": 5832.2, + "step": 5180 + }, + { + "ce_loss_13": 3.181716579198837, + "ce_loss_26": 2.7091287076473236, + "ce_loss_39": 2.205903950333595, + "ce_loss_52": 1.4591009467840195, + "ce_loss_7": 3.4553593516349794, + "epoch": 0.519, + "grad_norm": 21.655670358447043, + "kl_loss_13": 3570.4, + "kl_loss_26": 2594.8, + "kl_loss_39": 1522.8, + "kl_loss_7": 4142.4, + "learning_rate": 0.00047779402502093696, + "loss": 5844.1, + "step": 5190 + }, + { + "ce_loss_13": 3.114813321828842, + "ce_loss_26": 2.6511843532323836, + "ce_loss_39": 2.1350393682718276, + "ce_loss_52": 1.4030846193432809, + "ce_loss_7": 3.3881891489028932, + "epoch": 0.52, + "grad_norm": 21.261317204954832, + "kl_loss_13": 3537.2, + "kl_loss_26": 2569.4, + "kl_loss_39": 1490.2, + "kl_loss_7": 4112.8, + "learning_rate": 0.0004762090420881289, + "loss": 5904.5, + "step": 5200 + }, + { + "ce_loss_13": 3.178787976503372, + "ce_loss_26": 2.716679725050926, + "ce_loss_39": 2.214344197511673, + "ce_loss_52": 1.4766788110136986, + "ce_loss_7": 3.445727747678757, + "epoch": 0.521, + "grad_norm": 21.5237930218517, + "kl_loss_13": 3533.2, + "kl_loss_26": 2579.6, + "kl_loss_39": 1511.4, + "kl_loss_7": 4104.0, + "learning_rate": 0.00047462429873000296, + "loss": 5807.7, + "step": 5210 + }, + { + "ce_loss_13": 3.1838007628917695, + "ce_loss_26": 2.7048393905162813, + "ce_loss_39": 2.19879055917263, + "ce_loss_52": 1.4492772698402405, + "ce_loss_7": 3.460488295555115, + "epoch": 0.522, + "grad_norm": 23.509628530732027, + "kl_loss_13": 3558.8, + "kl_loss_26": 2571.2, + "kl_loss_39": 1503.6, + "kl_loss_7": 4141.6, + "learning_rate": 0.0004730398109049071, + "loss": 5850.6, + "step": 5220 + }, + { + "ce_loss_13": 3.1778542578220366, + "ce_loss_26": 2.723250871896744, + "ce_loss_39": 2.2311396062374116, + "ce_loss_52": 1.4870843648910523, + "ce_loss_7": 3.4500561714172364, + "epoch": 0.523, + "grad_norm": 20.540010606623795, + "kl_loss_13": 3511.6, + "kl_loss_26": 2561.4, + "kl_loss_39": 1495.8, + "kl_loss_7": 4082.4, + "learning_rate": 0.000471455594568616, + "loss": 5864.9, + "step": 5230 + }, + { + "ce_loss_13": 3.184338331222534, + "ce_loss_26": 2.719081574678421, + "ce_loss_39": 2.1989813148975372, + "ce_loss_52": 1.4533298462629318, + "ce_loss_7": 3.465100187063217, + "epoch": 0.524, + "grad_norm": 19.66913851254436, + "kl_loss_13": 3588.4, + "kl_loss_26": 2616.0, + "kl_loss_39": 1518.0, + "kl_loss_7": 4170.0, + "learning_rate": 0.00046987166567417086, + "loss": 5881.2, + "step": 5240 + }, + { + "ce_loss_13": 3.097227877378464, + "ce_loss_26": 2.6349131643772123, + "ce_loss_39": 2.1339926183223725, + "ce_loss_52": 1.3901747956871986, + "ce_loss_7": 3.3714997708797454, + "epoch": 0.525, + "grad_norm": 21.094189724694242, + "kl_loss_13": 3503.6, + "kl_loss_26": 2546.6, + "kl_loss_39": 1491.4, + "kl_loss_7": 4079.2, + "learning_rate": 0.00046828804017171776, + "loss": 5869.8, + "step": 5250 + }, + { + "ce_loss_13": 3.1320539236068727, + "ce_loss_26": 2.667567166686058, + "ce_loss_39": 2.17643720805645, + "ce_loss_52": 1.450567391514778, + "ce_loss_7": 3.4080025017261506, + "epoch": 0.526, + "grad_norm": 20.61619717149242, + "kl_loss_13": 3507.6, + "kl_loss_26": 2536.4, + "kl_loss_39": 1483.6, + "kl_loss_7": 4082.4, + "learning_rate": 0.00046670473400834805, + "loss": 5811.0, + "step": 5260 + }, + { + "ce_loss_13": 3.123244607448578, + "ce_loss_26": 2.6622200667858125, + "ce_loss_39": 2.1446547359228134, + "ce_loss_52": 1.4058131739497184, + "ce_loss_7": 3.393282580375671, + "epoch": 0.527, + "grad_norm": 20.13067058218258, + "kl_loss_13": 3536.4, + "kl_loss_26": 2574.0, + "kl_loss_39": 1499.8, + "kl_loss_7": 4103.6, + "learning_rate": 0.00046512176312793734, + "loss": 5799.9, + "step": 5270 + }, + { + "ce_loss_13": 3.0799066185951234, + "ce_loss_26": 2.620718148350716, + "ce_loss_39": 2.1284131199121474, + "ce_loss_52": 1.4004437893629074, + "ce_loss_7": 3.357691395282745, + "epoch": 0.528, + "grad_norm": 20.729785702601426, + "kl_loss_13": 3458.8, + "kl_loss_26": 2500.6, + "kl_loss_39": 1456.6, + "kl_loss_7": 4039.2, + "learning_rate": 0.00046353914347098467, + "loss": 5784.9, + "step": 5280 + }, + { + "ce_loss_13": 3.100508135557175, + "ce_loss_26": 2.645443448424339, + "ce_loss_39": 2.140786075592041, + "ce_loss_52": 1.4068747192621232, + "ce_loss_7": 3.3814845025539397, + "epoch": 0.529, + "grad_norm": 20.87154357218591, + "kl_loss_13": 3506.0, + "kl_loss_26": 2553.6, + "kl_loss_39": 1489.5, + "kl_loss_7": 4091.2, + "learning_rate": 0.0004619568909744524, + "loss": 5772.2, + "step": 5290 + }, + { + "ce_loss_13": 3.127873086929321, + "ce_loss_26": 2.6634323090314864, + "ce_loss_39": 2.157263731956482, + "ce_loss_52": 1.4283392548561096, + "ce_loss_7": 3.3975966036319734, + "epoch": 0.53, + "grad_norm": 20.20322543048614, + "kl_loss_13": 3490.8, + "kl_loss_26": 2524.4, + "kl_loss_39": 1474.6, + "kl_loss_7": 4055.2, + "learning_rate": 0.00046037502157160573, + "loss": 5795.9, + "step": 5300 + }, + { + "ce_loss_13": 3.076627087593079, + "ce_loss_26": 2.616869166493416, + "ce_loss_39": 2.117774197459221, + "ce_loss_52": 1.4120985105633737, + "ce_loss_7": 3.3510021567344666, + "epoch": 0.531, + "grad_norm": 20.811499078870163, + "kl_loss_13": 3444.0, + "kl_loss_26": 2482.4, + "kl_loss_39": 1422.8, + "kl_loss_7": 4016.0, + "learning_rate": 0.00045879355119185207, + "loss": 5749.7, + "step": 5310 + }, + { + "ce_loss_13": 3.1226239800453186, + "ce_loss_26": 2.662443572282791, + "ce_loss_39": 2.156531369686127, + "ce_loss_52": 1.4430431425571442, + "ce_loss_7": 3.396123135089874, + "epoch": 0.532, + "grad_norm": 18.59996382548897, + "kl_loss_13": 3496.0, + "kl_loss_26": 2528.8, + "kl_loss_39": 1445.2, + "kl_loss_7": 4069.2, + "learning_rate": 0.0004572124957605803, + "loss": 5776.5, + "step": 5320 + }, + { + "ce_loss_13": 3.1100788176059724, + "ce_loss_26": 2.647669917345047, + "ce_loss_39": 2.140716627240181, + "ce_loss_52": 1.4189704924821853, + "ce_loss_7": 3.387048715353012, + "epoch": 0.533, + "grad_norm": 20.905976368739214, + "kl_loss_13": 3497.6, + "kl_loss_26": 2533.0, + "kl_loss_39": 1466.2, + "kl_loss_7": 4079.6, + "learning_rate": 0.00045563187119900103, + "loss": 5752.6, + "step": 5330 + }, + { + "ce_loss_13": 3.06461244225502, + "ce_loss_26": 2.6009975552558897, + "ce_loss_39": 2.1015468716621397, + "ce_loss_52": 1.3909726276993752, + "ce_loss_7": 3.3485201001167297, + "epoch": 0.534, + "grad_norm": 19.831333266576646, + "kl_loss_13": 3462.4, + "kl_loss_26": 2487.0, + "kl_loss_39": 1427.1, + "kl_loss_7": 4053.2, + "learning_rate": 0.00045405169342398633, + "loss": 5804.75, + "step": 5340 + }, + { + "ce_loss_13": 3.1465537667274477, + "ce_loss_26": 2.6862030625343323, + "ce_loss_39": 2.1878136694431305, + "ce_loss_52": 1.4572405338287353, + "ce_loss_7": 3.4159956216812133, + "epoch": 0.535, + "grad_norm": 20.130648070413336, + "kl_loss_13": 3486.8, + "kl_loss_26": 2529.2, + "kl_loss_39": 1463.2, + "kl_loss_7": 4050.4, + "learning_rate": 0.0004524719783479088, + "loss": 5785.8, + "step": 5350 + }, + { + "ce_loss_13": 3.1608322679996492, + "ce_loss_26": 2.6899396955966948, + "ce_loss_39": 2.192571198940277, + "ce_loss_52": 1.466444182395935, + "ce_loss_7": 3.4340961396694185, + "epoch": 0.536, + "grad_norm": 21.175578440541553, + "kl_loss_13": 3511.2, + "kl_loss_26": 2537.2, + "kl_loss_39": 1482.0, + "kl_loss_7": 4079.6, + "learning_rate": 0.00045089274187848144, + "loss": 5831.5, + "step": 5360 + }, + { + "ce_loss_13": 3.122362142801285, + "ce_loss_26": 2.6539461642503737, + "ce_loss_39": 2.1548154592514037, + "ce_loss_52": 1.4296748742461205, + "ce_loss_7": 3.403215527534485, + "epoch": 0.537, + "grad_norm": 21.426385415702033, + "kl_loss_13": 3491.6, + "kl_loss_26": 2524.6, + "kl_loss_39": 1464.2, + "kl_loss_7": 4068.8, + "learning_rate": 0.00044931399991859835, + "loss": 5775.7, + "step": 5370 + }, + { + "ce_loss_13": 3.0924407064914705, + "ce_loss_26": 2.6285818815231323, + "ce_loss_39": 2.12394041121006, + "ce_loss_52": 1.4268731981515885, + "ce_loss_7": 3.365303188562393, + "epoch": 0.538, + "grad_norm": 21.06287805296543, + "kl_loss_13": 3469.6, + "kl_loss_26": 2495.8, + "kl_loss_39": 1421.6, + "kl_loss_7": 4039.2, + "learning_rate": 0.00044773576836617336, + "loss": 5748.1, + "step": 5380 + }, + { + "ce_loss_13": 3.0936496675014498, + "ce_loss_26": 2.6234502464532854, + "ce_loss_39": 2.112909361720085, + "ce_loss_52": 1.3892342567443847, + "ce_loss_7": 3.3770604133605957, + "epoch": 0.539, + "grad_norm": 20.257845085428166, + "kl_loss_13": 3512.0, + "kl_loss_26": 2533.6, + "kl_loss_39": 1454.2, + "kl_loss_7": 4101.2, + "learning_rate": 0.00044615806311398056, + "loss": 5750.7, + "step": 5390 + }, + { + "ce_loss_13": 3.1293481528759, + "ce_loss_26": 2.661091110110283, + "ce_loss_39": 2.1563747018575667, + "ce_loss_52": 1.4326880395412445, + "ce_loss_7": 3.4015913248062133, + "epoch": 0.54, + "grad_norm": 19.884995716311128, + "kl_loss_13": 3510.0, + "kl_loss_26": 2528.6, + "kl_loss_39": 1461.8, + "kl_loss_7": 4087.2, + "learning_rate": 0.00044458090004949454, + "loss": 5775.2, + "step": 5400 + }, + { + "ce_loss_13": 3.1498380303382874, + "ce_loss_26": 2.6860203623771666, + "ce_loss_39": 2.184649482369423, + "ce_loss_52": 1.4619063645601273, + "ce_loss_7": 3.425897455215454, + "epoch": 0.541, + "grad_norm": 21.787073133904556, + "kl_loss_13": 3476.0, + "kl_loss_26": 2516.4, + "kl_loss_39": 1456.6, + "kl_loss_7": 4048.0, + "learning_rate": 0.0004430042950547297, + "loss": 5755.9, + "step": 5410 + }, + { + "ce_loss_13": 3.1863462030887604, + "ce_loss_26": 2.726431465148926, + "ce_loss_39": 2.2207835763692856, + "ce_loss_52": 1.5021536648273468, + "ce_loss_7": 3.454101949930191, + "epoch": 0.542, + "grad_norm": 20.7568922984159, + "kl_loss_13": 3510.0, + "kl_loss_26": 2532.8, + "kl_loss_39": 1455.8, + "kl_loss_7": 4068.8, + "learning_rate": 0.0004414282640060809, + "loss": 5749.1, + "step": 5420 + }, + { + "ce_loss_13": 3.1338580727577208, + "ce_loss_26": 2.6763171195983886, + "ce_loss_39": 2.181287834048271, + "ce_loss_52": 1.4664452508091927, + "ce_loss_7": 3.4055596351623536, + "epoch": 0.543, + "grad_norm": 19.620114685157933, + "kl_loss_13": 3421.6, + "kl_loss_26": 2476.8, + "kl_loss_39": 1437.8, + "kl_loss_7": 3988.8, + "learning_rate": 0.0004398528227741633, + "loss": 5704.8, + "step": 5430 + }, + { + "ce_loss_13": 3.104156017303467, + "ce_loss_26": 2.639963132143021, + "ce_loss_39": 2.127584692835808, + "ce_loss_52": 1.4150341883301736, + "ce_loss_7": 3.390649217367172, + "epoch": 0.544, + "grad_norm": 20.487750099372004, + "kl_loss_13": 3479.6, + "kl_loss_26": 2511.2, + "kl_loss_39": 1437.2, + "kl_loss_7": 4085.6, + "learning_rate": 0.00043827798722365264, + "loss": 5688.3, + "step": 5440 + }, + { + "ce_loss_13": 3.0290561497211455, + "ce_loss_26": 2.5664966076612474, + "ce_loss_39": 2.065689593553543, + "ce_loss_52": 1.3703163504600524, + "ce_loss_7": 3.3105548918247223, + "epoch": 0.545, + "grad_norm": 20.36704365885761, + "kl_loss_13": 3438.0, + "kl_loss_26": 2471.0, + "kl_loss_39": 1405.0, + "kl_loss_7": 4010.4, + "learning_rate": 0.00043670377321312535, + "loss": 5715.9, + "step": 5450 + }, + { + "ce_loss_13": 3.1363044023513793, + "ce_loss_26": 2.674453055858612, + "ce_loss_39": 2.169647827744484, + "ce_loss_52": 1.440093258023262, + "ce_loss_7": 3.409278839826584, + "epoch": 0.546, + "grad_norm": 20.32480480357714, + "kl_loss_13": 3496.8, + "kl_loss_26": 2524.8, + "kl_loss_39": 1466.2, + "kl_loss_7": 4078.0, + "learning_rate": 0.0004351301965948991, + "loss": 5722.2, + "step": 5460 + }, + { + "ce_loss_13": 3.152593141794205, + "ce_loss_26": 2.6999820828437806, + "ce_loss_39": 2.1983327239751818, + "ce_loss_52": 1.4628954619169234, + "ce_loss_7": 3.4299716293811797, + "epoch": 0.547, + "grad_norm": 19.856114793460193, + "kl_loss_13": 3499.2, + "kl_loss_26": 2548.2, + "kl_loss_39": 1495.2, + "kl_loss_7": 4066.8, + "learning_rate": 0.000433557273214873, + "loss": 5741.7, + "step": 5470 + }, + { + "ce_loss_13": 3.064018839597702, + "ce_loss_26": 2.6090955317020414, + "ce_loss_39": 2.1058148056268693, + "ce_loss_52": 1.4195792496204376, + "ce_loss_7": 3.344401216506958, + "epoch": 0.548, + "grad_norm": 20.616048950091404, + "kl_loss_13": 3411.2, + "kl_loss_26": 2459.2, + "kl_loss_39": 1389.6, + "kl_loss_7": 3992.4, + "learning_rate": 0.000431985018912368, + "loss": 5718.6, + "step": 5480 + }, + { + "ce_loss_13": 3.0607047379016876, + "ce_loss_26": 2.597879120707512, + "ce_loss_39": 2.108409595489502, + "ce_loss_52": 1.4123182266950607, + "ce_loss_7": 3.336356836557388, + "epoch": 0.549, + "grad_norm": 20.036021472042393, + "kl_loss_13": 3404.4, + "kl_loss_26": 2447.2, + "kl_loss_39": 1413.8, + "kl_loss_7": 3983.2, + "learning_rate": 0.0004304134495199674, + "loss": 5692.5, + "step": 5490 + }, + { + "ce_loss_13": 3.052510768175125, + "ce_loss_26": 2.6108580827713013, + "ce_loss_39": 2.1228879362344744, + "ce_loss_52": 1.4244433492422104, + "ce_loss_7": 3.3247893154621124, + "epoch": 0.55, + "grad_norm": 20.158536855725483, + "kl_loss_13": 3391.6, + "kl_loss_26": 2469.0, + "kl_loss_39": 1430.9, + "kl_loss_7": 3959.6, + "learning_rate": 0.0004288425808633575, + "loss": 5660.8, + "step": 5500 + }, + { + "ce_loss_13": 3.164420074224472, + "ce_loss_26": 2.6986677646636963, + "ce_loss_39": 2.192535865306854, + "ce_loss_52": 1.4702347338199615, + "ce_loss_7": 3.4305537164211275, + "epoch": 0.551, + "grad_norm": 21.34549178191646, + "kl_loss_13": 3508.0, + "kl_loss_26": 2541.6, + "kl_loss_39": 1463.2, + "kl_loss_7": 4075.6, + "learning_rate": 0.0004272724287611684, + "loss": 5697.3, + "step": 5510 + }, + { + "ce_loss_13": 3.1066312968730925, + "ce_loss_26": 2.6526737749576568, + "ce_loss_39": 2.1508010149002077, + "ce_loss_52": 1.4371357694268228, + "ce_loss_7": 3.381526863574982, + "epoch": 0.552, + "grad_norm": 20.017257048745616, + "kl_loss_13": 3449.6, + "kl_loss_26": 2497.2, + "kl_loss_39": 1440.4, + "kl_loss_7": 4027.2, + "learning_rate": 0.00042570300902481425, + "loss": 5661.1, + "step": 5520 + }, + { + "ce_loss_13": 3.0768387794494627, + "ce_loss_26": 2.6105665415525436, + "ce_loss_39": 2.1041111290454864, + "ce_loss_52": 1.3769602328538895, + "ce_loss_7": 3.3504232287406923, + "epoch": 0.553, + "grad_norm": 20.71287480099847, + "kl_loss_13": 3497.6, + "kl_loss_26": 2529.0, + "kl_loss_39": 1464.6, + "kl_loss_7": 4067.6, + "learning_rate": 0.00042413433745833423, + "loss": 5675.2, + "step": 5530 + }, + { + "ce_loss_13": 3.078362447023392, + "ce_loss_26": 2.6186123132705688, + "ce_loss_39": 2.1204461604356766, + "ce_loss_52": 1.42312493622303, + "ce_loss_7": 3.3565984547138212, + "epoch": 0.554, + "grad_norm": 20.243506516231662, + "kl_loss_13": 3424.0, + "kl_loss_26": 2463.4, + "kl_loss_39": 1411.4, + "kl_loss_7": 4003.6, + "learning_rate": 0.0004225664298582339, + "loss": 5650.4, + "step": 5540 + }, + { + "ce_loss_13": 3.104430967569351, + "ce_loss_26": 2.6466626435518266, + "ce_loss_39": 2.149568349123001, + "ce_loss_52": 1.4317003712058067, + "ce_loss_7": 3.380819743871689, + "epoch": 0.555, + "grad_norm": 20.086456724534386, + "kl_loss_13": 3478.8, + "kl_loss_26": 2528.0, + "kl_loss_39": 1462.6, + "kl_loss_7": 4051.6, + "learning_rate": 0.000420999302013325, + "loss": 5686.3, + "step": 5550 + }, + { + "ce_loss_13": 3.09715017080307, + "ce_loss_26": 2.6249477684497835, + "ce_loss_39": 2.112127733230591, + "ce_loss_52": 1.3998657062649726, + "ce_loss_7": 3.3848826706409456, + "epoch": 0.556, + "grad_norm": 19.84836283457165, + "kl_loss_13": 3500.0, + "kl_loss_26": 2520.0, + "kl_loss_39": 1432.4, + "kl_loss_7": 4098.4, + "learning_rate": 0.000419432969704568, + "loss": 5721.05, + "step": 5560 + }, + { + "ce_loss_13": 3.1611645042896273, + "ce_loss_26": 2.7037321001291277, + "ce_loss_39": 2.2008607923984527, + "ce_loss_52": 1.4731642618775367, + "ce_loss_7": 3.4352382242679598, + "epoch": 0.557, + "grad_norm": 19.486736620492533, + "kl_loss_13": 3494.8, + "kl_loss_26": 2542.2, + "kl_loss_39": 1471.9, + "kl_loss_7": 4071.6, + "learning_rate": 0.00041786744870491154, + "loss": 5664.5, + "step": 5570 + }, + { + "ce_loss_13": 3.1723786175251005, + "ce_loss_26": 2.712140661478043, + "ce_loss_39": 2.2021081149578094, + "ce_loss_52": 1.4713574051856995, + "ce_loss_7": 3.4434271275997164, + "epoch": 0.558, + "grad_norm": 21.812924852409434, + "kl_loss_13": 3518.4, + "kl_loss_26": 2552.0, + "kl_loss_39": 1476.8, + "kl_loss_7": 4082.0, + "learning_rate": 0.0004163027547791347, + "loss": 5667.5, + "step": 5580 + }, + { + "ce_loss_13": 3.0999899983406065, + "ce_loss_26": 2.6524887919425963, + "ce_loss_39": 2.159899726510048, + "ce_loss_52": 1.457381361722946, + "ce_loss_7": 3.376901388168335, + "epoch": 0.559, + "grad_norm": 20.855137229386383, + "kl_loss_13": 3402.8, + "kl_loss_26": 2469.2, + "kl_loss_39": 1417.0, + "kl_loss_7": 3983.6, + "learning_rate": 0.0004147389036836881, + "loss": 5623.2, + "step": 5590 + }, + { + "ce_loss_13": 3.048000919818878, + "ce_loss_26": 2.6028879463672636, + "ce_loss_39": 2.110449159145355, + "ce_loss_52": 1.4273749262094497, + "ce_loss_7": 3.3209989249706267, + "epoch": 0.56, + "grad_norm": 21.293221784840117, + "kl_loss_13": 3370.8, + "kl_loss_26": 2436.6, + "kl_loss_39": 1393.6, + "kl_loss_7": 3935.6, + "learning_rate": 0.00041317591116653486, + "loss": 5661.4, + "step": 5600 + }, + { + "ce_loss_13": 3.117449927330017, + "ce_loss_26": 2.6561968684196473, + "ce_loss_39": 2.1393496483564376, + "ce_loss_52": 1.4263568341732025, + "ce_loss_7": 3.3912305176258086, + "epoch": 0.561, + "grad_norm": 19.806130661521266, + "kl_loss_13": 3510.8, + "kl_loss_26": 2543.6, + "kl_loss_39": 1446.0, + "kl_loss_7": 4074.8, + "learning_rate": 0.0004116137929669921, + "loss": 5646.7, + "step": 5610 + }, + { + "ce_loss_13": 3.032761037349701, + "ce_loss_26": 2.5814107984304426, + "ce_loss_39": 2.092069110274315, + "ce_loss_52": 1.4175067842006683, + "ce_loss_7": 3.3051693975925445, + "epoch": 0.562, + "grad_norm": 21.265424868671836, + "kl_loss_13": 3342.0, + "kl_loss_26": 2397.0, + "kl_loss_39": 1360.0, + "kl_loss_7": 3902.8, + "learning_rate": 0.00041005256481557305, + "loss": 5649.3, + "step": 5620 + }, + { + "ce_loss_13": 3.1011641681194306, + "ce_loss_26": 2.6472670078277587, + "ce_loss_39": 2.1640954107046126, + "ce_loss_52": 1.4572645276784897, + "ce_loss_7": 3.3653019249439238, + "epoch": 0.563, + "grad_norm": 19.60469852329931, + "kl_loss_13": 3397.2, + "kl_loss_26": 2452.6, + "kl_loss_39": 1426.2, + "kl_loss_7": 3957.6, + "learning_rate": 0.00040849224243382767, + "loss": 5635.6, + "step": 5630 + }, + { + "ce_loss_13": 3.0702085912227632, + "ce_loss_26": 2.6029874324798583, + "ce_loss_39": 2.1070942997932436, + "ce_loss_52": 1.4102862730622292, + "ce_loss_7": 3.345056527853012, + "epoch": 0.564, + "grad_norm": 19.66351329562965, + "kl_loss_13": 3444.8, + "kl_loss_26": 2478.8, + "kl_loss_39": 1416.0, + "kl_loss_7": 4010.8, + "learning_rate": 0.000406932841534185, + "loss": 5678.4, + "step": 5640 + }, + { + "ce_loss_13": 3.1035412073135378, + "ce_loss_26": 2.6404273927211763, + "ce_loss_39": 2.148519089818001, + "ce_loss_52": 1.4659966766834258, + "ce_loss_7": 3.3797259271144866, + "epoch": 0.565, + "grad_norm": 20.61367124543832, + "kl_loss_13": 3418.4, + "kl_loss_26": 2452.6, + "kl_loss_39": 1399.6, + "kl_loss_7": 3989.2, + "learning_rate": 0.0004053743778197951, + "loss": 5619.4, + "step": 5650 + }, + { + "ce_loss_13": 3.074153536558151, + "ce_loss_26": 2.6227691769599915, + "ce_loss_39": 2.12532425224781, + "ce_loss_52": 1.4269753962755203, + "ce_loss_7": 3.3447964787483215, + "epoch": 0.566, + "grad_norm": 20.08816441246758, + "kl_loss_13": 3398.8, + "kl_loss_26": 2455.2, + "kl_loss_39": 1406.8, + "kl_loss_7": 3968.8, + "learning_rate": 0.0004038168669843697, + "loss": 5607.4, + "step": 5660 + }, + { + "ce_loss_13": 3.117138236761093, + "ce_loss_26": 2.6556679248809814, + "ce_loss_39": 2.1434233844280244, + "ce_loss_52": 1.4418231889605522, + "ce_loss_7": 3.3935614347457888, + "epoch": 0.567, + "grad_norm": 19.629663422829108, + "kl_loss_13": 3460.4, + "kl_loss_26": 2499.6, + "kl_loss_39": 1423.0, + "kl_loss_7": 4038.4, + "learning_rate": 0.000402260324712026, + "loss": 5653.85, + "step": 5670 + }, + { + "ce_loss_13": 3.0241773188114167, + "ce_loss_26": 2.5764558911323547, + "ce_loss_39": 2.0826069891452788, + "ce_loss_52": 1.4262234181165696, + "ce_loss_7": 3.292912298440933, + "epoch": 0.568, + "grad_norm": 19.863333175376273, + "kl_loss_13": 3317.2, + "kl_loss_26": 2380.6, + "kl_loss_39": 1341.9, + "kl_loss_7": 3879.6, + "learning_rate": 0.00040070476667712743, + "loss": 5602.0, + "step": 5680 + }, + { + "ce_loss_13": 3.1207240760326385, + "ce_loss_26": 2.667350098490715, + "ce_loss_39": 2.1637227922677993, + "ce_loss_52": 1.4507419973611833, + "ce_loss_7": 3.3947570443153383, + "epoch": 0.569, + "grad_norm": 20.18457161613193, + "kl_loss_13": 3470.8, + "kl_loss_26": 2511.8, + "kl_loss_39": 1453.2, + "kl_loss_7": 4038.0, + "learning_rate": 0.0003991502085441259, + "loss": 5637.8, + "step": 5690 + }, + { + "ce_loss_13": 3.052716261148453, + "ce_loss_26": 2.5999310851097106, + "ce_loss_39": 2.103394716978073, + "ce_loss_52": 1.4248915880918502, + "ce_loss_7": 3.3296144127845766, + "epoch": 0.57, + "grad_norm": 21.32933636042312, + "kl_loss_13": 3384.4, + "kl_loss_26": 2434.4, + "kl_loss_39": 1385.3, + "kl_loss_7": 3961.6, + "learning_rate": 0.0003975966659674047, + "loss": 5572.65, + "step": 5700 + }, + { + "ce_loss_13": 3.0565085709095, + "ce_loss_26": 2.612689271569252, + "ce_loss_39": 2.133056679368019, + "ce_loss_52": 1.456464058160782, + "ce_loss_7": 3.3283673584461213, + "epoch": 0.571, + "grad_norm": 20.795332274261725, + "kl_loss_13": 3328.4, + "kl_loss_26": 2402.4, + "kl_loss_39": 1384.6, + "kl_loss_7": 3893.6, + "learning_rate": 0.0003960441545911204, + "loss": 5637.0, + "step": 5710 + }, + { + "ce_loss_13": 3.100340133905411, + "ce_loss_26": 2.6507264733314515, + "ce_loss_39": 2.1515814483165743, + "ce_loss_52": 1.4689666152000427, + "ce_loss_7": 3.36507533788681, + "epoch": 0.572, + "grad_norm": 19.479562193670343, + "kl_loss_13": 3367.6, + "kl_loss_26": 2430.4, + "kl_loss_39": 1381.6, + "kl_loss_7": 3924.4, + "learning_rate": 0.0003944926900490452, + "loss": 5586.7, + "step": 5720 + }, + { + "ce_loss_13": 3.035090607404709, + "ce_loss_26": 2.5765923827886583, + "ce_loss_39": 2.0693789660930633, + "ce_loss_52": 1.394070391356945, + "ce_loss_7": 3.3126678228378297, + "epoch": 0.573, + "grad_norm": 20.769836690311337, + "kl_loss_13": 3418.0, + "kl_loss_26": 2455.8, + "kl_loss_39": 1385.6, + "kl_loss_7": 3996.8, + "learning_rate": 0.0003929422879644099, + "loss": 5607.2, + "step": 5730 + }, + { + "ce_loss_13": 3.0840226650238036, + "ce_loss_26": 2.6376162350177763, + "ce_loss_39": 2.1402535855770113, + "ce_loss_52": 1.4594999521970748, + "ce_loss_7": 3.3528446674346926, + "epoch": 0.574, + "grad_norm": 19.55271310482914, + "kl_loss_13": 3396.0, + "kl_loss_26": 2453.0, + "kl_loss_39": 1389.4, + "kl_loss_7": 3971.6, + "learning_rate": 0.0003913929639497462, + "loss": 5561.5, + "step": 5740 + }, + { + "ce_loss_13": 3.0994504928588866, + "ce_loss_26": 2.645833945274353, + "ce_loss_39": 2.151085004210472, + "ce_loss_52": 1.453307920694351, + "ce_loss_7": 3.375282955169678, + "epoch": 0.575, + "grad_norm": 20.394651449213793, + "kl_loss_13": 3410.8, + "kl_loss_26": 2459.0, + "kl_loss_39": 1418.8, + "kl_loss_7": 3977.2, + "learning_rate": 0.00038984473360672965, + "loss": 5587.6, + "step": 5750 + }, + { + "ce_loss_13": 3.063554251194, + "ce_loss_26": 2.602804532647133, + "ce_loss_39": 2.112149253487587, + "ce_loss_52": 1.4378221184015274, + "ce_loss_7": 3.3263466000556945, + "epoch": 0.576, + "grad_norm": 20.732107583485522, + "kl_loss_13": 3335.2, + "kl_loss_26": 2397.0, + "kl_loss_39": 1362.5, + "kl_loss_7": 3885.6, + "learning_rate": 0.0003882976125260229, + "loss": 5618.5, + "step": 5760 + }, + { + "ce_loss_13": 3.011310315132141, + "ce_loss_26": 2.5538589358329773, + "ce_loss_39": 2.0608473628759385, + "ce_loss_52": 1.3871852427721023, + "ce_loss_7": 3.2823293566703797, + "epoch": 0.577, + "grad_norm": 20.222489437839133, + "kl_loss_13": 3356.0, + "kl_loss_26": 2403.4, + "kl_loss_39": 1363.6, + "kl_loss_7": 3921.2, + "learning_rate": 0.00038675161628711776, + "loss": 5583.6, + "step": 5770 + }, + { + "ce_loss_13": 3.060704934597015, + "ce_loss_26": 2.6082344591617583, + "ce_loss_39": 2.1105374455451966, + "ce_loss_52": 1.4133148401975633, + "ce_loss_7": 3.337174046039581, + "epoch": 0.578, + "grad_norm": 20.5258419136392, + "kl_loss_13": 3407.2, + "kl_loss_26": 2468.6, + "kl_loss_39": 1416.8, + "kl_loss_7": 3978.0, + "learning_rate": 0.0003852067604581794, + "loss": 5550.9, + "step": 5780 + }, + { + "ce_loss_13": 3.0313555896282196, + "ce_loss_26": 2.5763757705688475, + "ce_loss_39": 2.090648338198662, + "ce_loss_52": 1.4237483531236648, + "ce_loss_7": 3.303388088941574, + "epoch": 0.579, + "grad_norm": 19.375379659731948, + "kl_loss_13": 3338.4, + "kl_loss_26": 2400.6, + "kl_loss_39": 1371.4, + "kl_loss_7": 3902.8, + "learning_rate": 0.0003836630605958888, + "loss": 5548.1, + "step": 5790 + }, + { + "ce_loss_13": 3.0871520400047303, + "ce_loss_26": 2.628482538461685, + "ce_loss_39": 2.1345098853111266, + "ce_loss_52": 1.439907690882683, + "ce_loss_7": 3.360053616762161, + "epoch": 0.58, + "grad_norm": 20.217419963891068, + "kl_loss_13": 3384.8, + "kl_loss_26": 2429.6, + "kl_loss_39": 1387.0, + "kl_loss_7": 3952.0, + "learning_rate": 0.0003821205322452863, + "loss": 5608.2, + "step": 5800 + }, + { + "ce_loss_13": 3.098143881559372, + "ce_loss_26": 2.648523300886154, + "ce_loss_39": 2.1544575184583663, + "ce_loss_52": 1.4684545397758484, + "ce_loss_7": 3.369327354431152, + "epoch": 0.581, + "grad_norm": 20.306772003501575, + "kl_loss_13": 3376.0, + "kl_loss_26": 2423.6, + "kl_loss_39": 1374.4, + "kl_loss_7": 3943.2, + "learning_rate": 0.0003805791909396155, + "loss": 5609.7, + "step": 5810 + }, + { + "ce_loss_13": 3.03250247836113, + "ce_loss_26": 2.5763048112392424, + "ce_loss_39": 2.0811164885759355, + "ce_loss_52": 1.4048690304160119, + "ce_loss_7": 3.2992306888103484, + "epoch": 0.582, + "grad_norm": 19.674023919479502, + "kl_loss_13": 3376.8, + "kl_loss_26": 2428.6, + "kl_loss_39": 1384.2, + "kl_loss_7": 3936.8, + "learning_rate": 0.0003790390522001662, + "loss": 5494.5, + "step": 5820 + }, + { + "ce_loss_13": 3.0025113344192507, + "ce_loss_26": 2.55022137761116, + "ce_loss_39": 2.0565065026283262, + "ce_loss_52": 1.377510306239128, + "ce_loss_7": 3.2718186736106873, + "epoch": 0.583, + "grad_norm": 20.728894667471106, + "kl_loss_13": 3354.8, + "kl_loss_26": 2406.2, + "kl_loss_39": 1367.9, + "kl_loss_7": 3917.6, + "learning_rate": 0.0003775001315361183, + "loss": 5559.7, + "step": 5830 + }, + { + "ce_loss_13": 3.0971179008483887, + "ce_loss_26": 2.6451155722141264, + "ce_loss_39": 2.1508567333221436, + "ce_loss_52": 1.4824247717857362, + "ce_loss_7": 3.37031666636467, + "epoch": 0.584, + "grad_norm": 20.600074401568932, + "kl_loss_13": 3337.2, + "kl_loss_26": 2403.8, + "kl_loss_39": 1359.2, + "kl_loss_7": 3907.2, + "learning_rate": 0.0003759624444443858, + "loss": 5519.4, + "step": 5840 + }, + { + "ce_loss_13": 3.0961884200572967, + "ce_loss_26": 2.626956915855408, + "ce_loss_39": 2.1310097485780717, + "ce_loss_52": 1.4370131075382233, + "ce_loss_7": 3.371414542198181, + "epoch": 0.585, + "grad_norm": 21.119848921717185, + "kl_loss_13": 3437.6, + "kl_loss_26": 2463.6, + "kl_loss_39": 1412.8, + "kl_loss_7": 4010.4, + "learning_rate": 0.00037442600640946044, + "loss": 5564.2, + "step": 5850 + }, + { + "ce_loss_13": 3.022693085670471, + "ce_loss_26": 2.556842041015625, + "ce_loss_39": 2.054610991477966, + "ce_loss_52": 1.3797120869159698, + "ce_loss_7": 3.2991441190242767, + "epoch": 0.586, + "grad_norm": 20.829952423189983, + "kl_loss_13": 3400.0, + "kl_loss_26": 2436.4, + "kl_loss_39": 1378.6, + "kl_loss_7": 3976.4, + "learning_rate": 0.00037289083290325663, + "loss": 5555.3, + "step": 5860 + }, + { + "ce_loss_13": 3.048868530988693, + "ce_loss_26": 2.597111147642136, + "ce_loss_39": 2.1075122743844985, + "ce_loss_52": 1.4477093726396562, + "ce_loss_7": 3.3170620620250704, + "epoch": 0.587, + "grad_norm": 19.743999290482414, + "kl_loss_13": 3295.2, + "kl_loss_26": 2349.8, + "kl_loss_39": 1314.6, + "kl_loss_7": 3862.0, + "learning_rate": 0.0003713569393849543, + "loss": 5582.2, + "step": 5870 + }, + { + "ce_loss_13": 3.0275582671165466, + "ce_loss_26": 2.5641341865062715, + "ce_loss_39": 2.075681546330452, + "ce_loss_52": 1.413079009950161, + "ce_loss_7": 3.30134374499321, + "epoch": 0.588, + "grad_norm": 19.60417578779171, + "kl_loss_13": 3342.8, + "kl_loss_26": 2385.2, + "kl_loss_39": 1348.4, + "kl_loss_7": 3911.6, + "learning_rate": 0.00036982434130084397, + "loss": 5547.2, + "step": 5880 + }, + { + "ce_loss_13": 3.0114518344402312, + "ce_loss_26": 2.5489202946424485, + "ce_loss_39": 2.051037350296974, + "ce_loss_52": 1.3947512209415436, + "ce_loss_7": 3.2770422041416167, + "epoch": 0.589, + "grad_norm": 19.943871445513693, + "kl_loss_13": 3324.8, + "kl_loss_26": 2358.6, + "kl_loss_39": 1318.4, + "kl_loss_7": 3886.4, + "learning_rate": 0.00036829305408417166, + "loss": 5519.0, + "step": 5890 + }, + { + "ce_loss_13": 2.9831090033054353, + "ce_loss_26": 2.5218098521232606, + "ce_loss_39": 2.0276191979646683, + "ce_loss_52": 1.3660520613193512, + "ce_loss_7": 3.262023079395294, + "epoch": 0.59, + "grad_norm": 20.136107811495098, + "kl_loss_13": 3337.2, + "kl_loss_26": 2377.0, + "kl_loss_39": 1333.6, + "kl_loss_7": 3918.4, + "learning_rate": 0.0003667630931549826, + "loss": 5547.1, + "step": 5900 + }, + { + "ce_loss_13": 3.1185491621494292, + "ce_loss_26": 2.6609552204608917, + "ce_loss_39": 2.154519048333168, + "ce_loss_52": 1.4638203248381614, + "ce_loss_7": 3.3888748466968535, + "epoch": 0.591, + "grad_norm": 21.04398596009474, + "kl_loss_13": 3422.0, + "kl_loss_26": 2461.2, + "kl_loss_39": 1398.6, + "kl_loss_7": 3996.8, + "learning_rate": 0.00036523447391996613, + "loss": 5529.25, + "step": 5910 + }, + { + "ce_loss_13": 3.01438627243042, + "ce_loss_26": 2.5740538477897643, + "ce_loss_39": 2.0915403455495833, + "ce_loss_52": 1.431156474351883, + "ce_loss_7": 3.2843648850917817, + "epoch": 0.592, + "grad_norm": 21.11836249464285, + "kl_loss_13": 3292.4, + "kl_loss_26": 2358.6, + "kl_loss_39": 1331.3, + "kl_loss_7": 3853.2, + "learning_rate": 0.00036370721177230114, + "loss": 5565.45, + "step": 5920 + }, + { + "ce_loss_13": 3.069197976589203, + "ce_loss_26": 2.619757717847824, + "ce_loss_39": 2.1231694549322127, + "ce_loss_52": 1.4499645620584487, + "ce_loss_7": 3.339762735366821, + "epoch": 0.593, + "grad_norm": 19.81588610039522, + "kl_loss_13": 3318.4, + "kl_loss_26": 2373.0, + "kl_loss_39": 1346.8, + "kl_loss_7": 3881.2, + "learning_rate": 0.00036218132209150044, + "loss": 5483.7, + "step": 5930 + }, + { + "ce_loss_13": 3.0346123337745667, + "ce_loss_26": 2.5827606499195097, + "ce_loss_39": 2.090969371795654, + "ce_loss_52": 1.4306629955768586, + "ce_loss_7": 3.3081043541431425, + "epoch": 0.594, + "grad_norm": 20.579777117155064, + "kl_loss_13": 3338.4, + "kl_loss_26": 2404.8, + "kl_loss_39": 1361.4, + "kl_loss_7": 3897.6, + "learning_rate": 0.0003606568202432562, + "loss": 5508.2, + "step": 5940 + }, + { + "ce_loss_13": 2.9628920376300814, + "ce_loss_26": 2.5103672593832016, + "ce_loss_39": 2.028717193007469, + "ce_loss_52": 1.3798889175057412, + "ce_loss_7": 3.238497519493103, + "epoch": 0.595, + "grad_norm": 19.336805098702218, + "kl_loss_13": 3271.2, + "kl_loss_26": 2333.8, + "kl_loss_39": 1324.3, + "kl_loss_7": 3845.6, + "learning_rate": 0.0003591337215792851, + "loss": 5512.3, + "step": 5950 + }, + { + "ce_loss_13": 3.0788431644439695, + "ce_loss_26": 2.608420321345329, + "ce_loss_39": 2.1091116696596144, + "ce_loss_52": 1.4199562400579453, + "ce_loss_7": 3.359624499082565, + "epoch": 0.596, + "grad_norm": 20.533566806186286, + "kl_loss_13": 3426.4, + "kl_loss_26": 2447.8, + "kl_loss_39": 1399.7, + "kl_loss_7": 4004.4, + "learning_rate": 0.00035761204143717383, + "loss": 5551.7, + "step": 5960 + }, + { + "ce_loss_13": 3.03274342417717, + "ce_loss_26": 2.585944026708603, + "ce_loss_39": 2.096950164437294, + "ce_loss_52": 1.416708105802536, + "ce_loss_7": 3.3088025748729706, + "epoch": 0.597, + "grad_norm": 19.695670321869642, + "kl_loss_13": 3339.2, + "kl_loss_26": 2400.2, + "kl_loss_39": 1368.0, + "kl_loss_7": 3908.0, + "learning_rate": 0.0003560917951402245, + "loss": 5483.9, + "step": 5970 + }, + { + "ce_loss_13": 3.01766916513443, + "ce_loss_26": 2.561279395222664, + "ce_loss_39": 2.06943539083004, + "ce_loss_52": 1.418130737543106, + "ce_loss_7": 3.2899491429328918, + "epoch": 0.598, + "grad_norm": 21.072429989583483, + "kl_loss_13": 3344.4, + "kl_loss_26": 2382.6, + "kl_loss_39": 1331.0, + "kl_loss_7": 3913.6, + "learning_rate": 0.00035457299799730046, + "loss": 5551.3, + "step": 5980 + }, + { + "ce_loss_13": 3.0659261345863342, + "ce_loss_26": 2.600133925676346, + "ce_loss_39": 2.110896447300911, + "ce_loss_52": 1.437103909254074, + "ce_loss_7": 3.3336906135082245, + "epoch": 0.599, + "grad_norm": 24.325101698795564, + "kl_loss_13": 3380.0, + "kl_loss_26": 2416.4, + "kl_loss_39": 1376.0, + "kl_loss_7": 3938.8, + "learning_rate": 0.0003530556653026721, + "loss": 5495.3, + "step": 5990 + }, + { + "ce_loss_13": 3.0716091096401215, + "ce_loss_26": 2.614444798231125, + "ce_loss_39": 2.1203446626663207, + "ce_loss_52": 1.4388466402888298, + "ce_loss_7": 3.3488605439662935, + "epoch": 0.6, + "grad_norm": 20.521363612152747, + "kl_loss_13": 3384.4, + "kl_loss_26": 2432.4, + "kl_loss_39": 1382.6, + "kl_loss_7": 3959.2, + "learning_rate": 0.00035153981233586274, + "loss": 5545.9, + "step": 6000 + }, + { + "ce_loss_13": 3.106086379289627, + "ce_loss_26": 2.6430875420570374, + "ce_loss_39": 2.1337583631277086, + "ce_loss_52": 1.4439469754695893, + "ce_loss_7": 3.3797987580299376, + "epoch": 0.601, + "grad_norm": 20.80497849503253, + "kl_loss_13": 3453.6, + "kl_loss_26": 2484.8, + "kl_loss_39": 1398.0, + "kl_loss_7": 4028.4, + "learning_rate": 0.00035002545436149473, + "loss": 5491.4, + "step": 6010 + }, + { + "ce_loss_13": 3.000385183095932, + "ce_loss_26": 2.5445862293243406, + "ce_loss_39": 2.045133265852928, + "ce_loss_52": 1.387436880171299, + "ce_loss_7": 3.271744018793106, + "epoch": 0.602, + "grad_norm": 32.49284937945306, + "kl_loss_13": 3335.6, + "kl_loss_26": 2380.8, + "kl_loss_39": 1330.6, + "kl_loss_7": 3907.2, + "learning_rate": 0.0003485126066291364, + "loss": 5483.7, + "step": 6020 + }, + { + "ce_loss_13": 3.0256562530994415, + "ce_loss_26": 2.577129301428795, + "ce_loss_39": 2.0889712274074554, + "ce_loss_52": 1.4449194520711899, + "ce_loss_7": 3.2831216752529144, + "epoch": 0.603, + "grad_norm": 20.23621815480592, + "kl_loss_13": 3272.0, + "kl_loss_26": 2339.4, + "kl_loss_39": 1312.8, + "kl_loss_7": 3817.2, + "learning_rate": 0.0003470012843731476, + "loss": 5461.5, + "step": 6030 + }, + { + "ce_loss_13": 3.0063592195510864, + "ce_loss_26": 2.5461477816104887, + "ce_loss_39": 2.047996437549591, + "ce_loss_52": 1.4004468455910684, + "ce_loss_7": 3.2747744262218474, + "epoch": 0.604, + "grad_norm": 20.161854907584466, + "kl_loss_13": 3331.2, + "kl_loss_26": 2365.2, + "kl_loss_39": 1318.3, + "kl_loss_7": 3894.4, + "learning_rate": 0.00034549150281252633, + "loss": 5446.25, + "step": 6040 + }, + { + "ce_loss_13": 3.0043693661689757, + "ce_loss_26": 2.5464318215847017, + "ce_loss_39": 2.054910770058632, + "ce_loss_52": 1.402955588698387, + "ce_loss_7": 3.277425891160965, + "epoch": 0.605, + "grad_norm": 19.39479695015959, + "kl_loss_13": 3303.6, + "kl_loss_26": 2362.4, + "kl_loss_39": 1315.5, + "kl_loss_7": 3867.2, + "learning_rate": 0.0003439832771507565, + "loss": 5513.7, + "step": 6050 + }, + { + "ce_loss_13": 3.0073243379592896, + "ce_loss_26": 2.552249348163605, + "ce_loss_39": 2.0556343287229537, + "ce_loss_52": 1.4056400418281556, + "ce_loss_7": 3.276465517282486, + "epoch": 0.606, + "grad_norm": 20.632328630604196, + "kl_loss_13": 3315.2, + "kl_loss_26": 2367.2, + "kl_loss_39": 1320.7, + "kl_loss_7": 3890.0, + "learning_rate": 0.0003424766225756537, + "loss": 5437.2, + "step": 6060 + }, + { + "ce_loss_13": 3.0342160046100615, + "ce_loss_26": 2.5702687412500382, + "ce_loss_39": 2.0748631983995436, + "ce_loss_52": 1.4053118824958801, + "ce_loss_7": 3.30224769115448, + "epoch": 0.607, + "grad_norm": 20.381462378536757, + "kl_loss_13": 3375.6, + "kl_loss_26": 2412.6, + "kl_loss_39": 1368.5, + "kl_loss_7": 3938.0, + "learning_rate": 0.00034097155425921255, + "loss": 5453.95, + "step": 6070 + }, + { + "ce_loss_13": 2.979208827018738, + "ce_loss_26": 2.5391657948493958, + "ce_loss_39": 2.0675252109766005, + "ce_loss_52": 1.4293861359357833, + "ce_loss_7": 3.243548810482025, + "epoch": 0.608, + "grad_norm": 20.059894251099692, + "kl_loss_13": 3201.6, + "kl_loss_26": 2277.2, + "kl_loss_39": 1276.3, + "kl_loss_7": 3759.2, + "learning_rate": 0.0003394680873574546, + "loss": 5463.6, + "step": 6080 + }, + { + "ce_loss_13": 3.0087065517902376, + "ce_loss_26": 2.5543720543384554, + "ce_loss_39": 2.0706711769104005, + "ce_loss_52": 1.4312650561332703, + "ce_loss_7": 3.274740469455719, + "epoch": 0.609, + "grad_norm": 19.682020137215275, + "kl_loss_13": 3278.0, + "kl_loss_26": 2334.8, + "kl_loss_39": 1311.6, + "kl_loss_7": 3829.2, + "learning_rate": 0.0003379662370102747, + "loss": 5485.1, + "step": 6090 + }, + { + "ce_loss_13": 2.9708913624286652, + "ce_loss_26": 2.526490569114685, + "ce_loss_39": 2.045279270410538, + "ce_loss_52": 1.406798492372036, + "ce_loss_7": 3.2348900735378265, + "epoch": 0.61, + "grad_norm": 20.816822332244506, + "kl_loss_13": 3220.0, + "kl_loss_26": 2302.2, + "kl_loss_39": 1290.7, + "kl_loss_7": 3776.8, + "learning_rate": 0.0003364660183412892, + "loss": 5434.0, + "step": 6100 + }, + { + "ce_loss_13": 3.034948408603668, + "ce_loss_26": 2.5749203741550444, + "ce_loss_39": 2.083053132891655, + "ce_loss_52": 1.4239173114299775, + "ce_loss_7": 3.308141976594925, + "epoch": 0.611, + "grad_norm": 19.1619951590758, + "kl_loss_13": 3322.8, + "kl_loss_26": 2375.8, + "kl_loss_39": 1335.6, + "kl_loss_7": 3891.2, + "learning_rate": 0.0003349674464576834, + "loss": 5422.4, + "step": 6110 + }, + { + "ce_loss_13": 3.004230409860611, + "ce_loss_26": 2.559123533964157, + "ce_loss_39": 2.0680998235940935, + "ce_loss_52": 1.416846913099289, + "ce_loss_7": 3.268228167295456, + "epoch": 0.612, + "grad_norm": 19.814048572070753, + "kl_loss_13": 3277.2, + "kl_loss_26": 2352.2, + "kl_loss_39": 1329.4, + "kl_loss_7": 3827.6, + "learning_rate": 0.00033347053645005966, + "loss": 5408.25, + "step": 6120 + }, + { + "ce_loss_13": 3.043248528242111, + "ce_loss_26": 2.5893827825784683, + "ce_loss_39": 2.106617513298988, + "ce_loss_52": 1.4531341344118118, + "ce_loss_7": 3.3129510939121247, + "epoch": 0.613, + "grad_norm": 20.76807127463998, + "kl_loss_13": 3286.4, + "kl_loss_26": 2356.0, + "kl_loss_39": 1332.4, + "kl_loss_7": 3850.8, + "learning_rate": 0.00033197530339228485, + "loss": 5370.55, + "step": 6130 + }, + { + "ce_loss_13": 3.00436030626297, + "ce_loss_26": 2.551770430803299, + "ce_loss_39": 2.0617963910102843, + "ce_loss_52": 1.4093145355582237, + "ce_loss_7": 3.287700629234314, + "epoch": 0.614, + "grad_norm": 20.13277338184701, + "kl_loss_13": 3319.2, + "kl_loss_26": 2374.0, + "kl_loss_39": 1331.1, + "kl_loss_7": 3899.6, + "learning_rate": 0.00033048176234133967, + "loss": 5468.4, + "step": 6140 + }, + { + "ce_loss_13": 3.0555618822574617, + "ce_loss_26": 2.6027767241001127, + "ce_loss_39": 2.1256050765514374, + "ce_loss_52": 1.4570606127381325, + "ce_loss_7": 3.3252673983573913, + "epoch": 0.615, + "grad_norm": 20.812161363431123, + "kl_loss_13": 3324.4, + "kl_loss_26": 2376.0, + "kl_loss_39": 1350.9, + "kl_loss_7": 3884.4, + "learning_rate": 0.0003289899283371657, + "loss": 5469.45, + "step": 6150 + }, + { + "ce_loss_13": 2.9699956268072127, + "ce_loss_26": 2.5214530378580093, + "ce_loss_39": 2.047743359208107, + "ce_loss_52": 1.404917973279953, + "ce_loss_7": 3.242190235853195, + "epoch": 0.616, + "grad_norm": 21.707368727671295, + "kl_loss_13": 3253.6, + "kl_loss_26": 2330.0, + "kl_loss_39": 1321.7, + "kl_loss_7": 3830.4, + "learning_rate": 0.0003274998164025148, + "loss": 5448.5, + "step": 6160 + }, + { + "ce_loss_13": 3.1136968553066255, + "ce_loss_26": 2.6612686932086946, + "ce_loss_39": 2.1661961168050765, + "ce_loss_52": 1.4886637568473815, + "ce_loss_7": 3.390312284231186, + "epoch": 0.617, + "grad_norm": 20.098473719799063, + "kl_loss_13": 3356.0, + "kl_loss_26": 2410.6, + "kl_loss_39": 1369.8, + "kl_loss_7": 3926.4, + "learning_rate": 0.0003260114415427975, + "loss": 5420.0, + "step": 6170 + }, + { + "ce_loss_13": 3.0537400960922243, + "ce_loss_26": 2.6002777397632597, + "ce_loss_39": 2.0997320264577866, + "ce_loss_52": 1.425080481171608, + "ce_loss_7": 3.3301878392696382, + "epoch": 0.618, + "grad_norm": 20.959268913691595, + "kl_loss_13": 3376.8, + "kl_loss_26": 2431.0, + "kl_loss_39": 1376.8, + "kl_loss_7": 3956.0, + "learning_rate": 0.0003245248187459323, + "loss": 5467.5, + "step": 6180 + }, + { + "ce_loss_13": 3.0195475459098815, + "ce_loss_26": 2.5703806400299074, + "ce_loss_39": 2.0828246504068373, + "ce_loss_52": 1.4389306217432023, + "ce_loss_7": 3.2870283126831055, + "epoch": 0.619, + "grad_norm": 19.77508773242922, + "kl_loss_13": 3278.4, + "kl_loss_26": 2325.6, + "kl_loss_39": 1301.5, + "kl_loss_7": 3832.0, + "learning_rate": 0.00032303996298219416, + "loss": 5436.5, + "step": 6190 + }, + { + "ce_loss_13": 3.080602079629898, + "ce_loss_26": 2.6279105126857756, + "ce_loss_39": 2.1348745226860046, + "ce_loss_52": 1.4668798118829727, + "ce_loss_7": 3.349226105213165, + "epoch": 0.62, + "grad_norm": 20.105835424543073, + "kl_loss_13": 3342.4, + "kl_loss_26": 2402.4, + "kl_loss_39": 1358.2, + "kl_loss_7": 3903.6, + "learning_rate": 0.00032155688920406414, + "loss": 5427.1, + "step": 6200 + }, + { + "ce_loss_13": 2.980887794494629, + "ce_loss_26": 2.5403966814279557, + "ce_loss_39": 2.06268994808197, + "ce_loss_52": 1.4154802724719047, + "ce_loss_7": 3.252926254272461, + "epoch": 0.621, + "grad_norm": 19.282334771133197, + "kl_loss_13": 3252.8, + "kl_loss_26": 2327.6, + "kl_loss_39": 1309.6, + "kl_loss_7": 3818.4, + "learning_rate": 0.0003200756123460788, + "loss": 5482.55, + "step": 6210 + }, + { + "ce_loss_13": 2.9776609361171724, + "ce_loss_26": 2.5325854122638702, + "ce_loss_39": 2.0461337983608248, + "ce_loss_52": 1.410691213607788, + "ce_loss_7": 3.24890678524971, + "epoch": 0.622, + "grad_norm": 20.106994551943693, + "kl_loss_13": 3252.0, + "kl_loss_26": 2326.2, + "kl_loss_39": 1296.8, + "kl_loss_7": 3812.4, + "learning_rate": 0.00031859614732467957, + "loss": 5416.4, + "step": 6220 + }, + { + "ce_loss_13": 3.032334786653519, + "ce_loss_26": 2.579844218492508, + "ce_loss_39": 2.1053399711847307, + "ce_loss_52": 1.4573373839259147, + "ce_loss_7": 3.2939690172672274, + "epoch": 0.623, + "grad_norm": 19.37466323417316, + "kl_loss_13": 3254.0, + "kl_loss_26": 2315.6, + "kl_loss_39": 1307.4, + "kl_loss_7": 3800.0, + "learning_rate": 0.00031711850903806275, + "loss": 5384.0, + "step": 6230 + }, + { + "ce_loss_13": 3.0033844828605654, + "ce_loss_26": 2.554124391078949, + "ce_loss_39": 2.072583147883415, + "ce_loss_52": 1.405614359676838, + "ce_loss_7": 3.272351396083832, + "epoch": 0.624, + "grad_norm": 19.68798944815722, + "kl_loss_13": 3318.8, + "kl_loss_26": 2383.4, + "kl_loss_39": 1353.1, + "kl_loss_7": 3878.4, + "learning_rate": 0.0003156427123660297, + "loss": 5409.1, + "step": 6240 + }, + { + "ce_loss_13": 3.044550156593323, + "ce_loss_26": 2.5933004200458525, + "ce_loss_39": 2.0951038181781767, + "ce_loss_52": 1.432724517583847, + "ce_loss_7": 3.31173922419548, + "epoch": 0.625, + "grad_norm": 19.588532833188335, + "kl_loss_13": 3324.0, + "kl_loss_26": 2390.2, + "kl_loss_39": 1351.9, + "kl_loss_7": 3884.8, + "learning_rate": 0.0003141687721698363, + "loss": 5410.2, + "step": 6250 + }, + { + "ce_loss_13": 3.0133075952529906, + "ce_loss_26": 2.562904554605484, + "ce_loss_39": 2.084453445672989, + "ce_loss_52": 1.4354033678770066, + "ce_loss_7": 3.2846658766269683, + "epoch": 0.626, + "grad_norm": 20.175418509715428, + "kl_loss_13": 3256.4, + "kl_loss_26": 2322.2, + "kl_loss_39": 1302.4, + "kl_loss_7": 3820.0, + "learning_rate": 0.00031269670329204396, + "loss": 5413.4, + "step": 6260 + }, + { + "ce_loss_13": 3.020740455389023, + "ce_loss_26": 2.576273998618126, + "ce_loss_39": 2.094499522447586, + "ce_loss_52": 1.46863095164299, + "ce_loss_7": 3.2838824689388275, + "epoch": 0.627, + "grad_norm": 19.159006125141875, + "kl_loss_13": 3217.6, + "kl_loss_26": 2295.2, + "kl_loss_39": 1276.9, + "kl_loss_7": 3769.2, + "learning_rate": 0.00031122652055637015, + "loss": 5419.65, + "step": 6270 + }, + { + "ce_loss_13": 2.9717007994651796, + "ce_loss_26": 2.5309597969055178, + "ce_loss_39": 2.045916485786438, + "ce_loss_52": 1.4202698469161987, + "ce_loss_7": 3.244756191968918, + "epoch": 0.628, + "grad_norm": 20.34275756584657, + "kl_loss_13": 3237.2, + "kl_loss_26": 2312.2, + "kl_loss_39": 1284.9, + "kl_loss_7": 3804.0, + "learning_rate": 0.0003097582387675385, + "loss": 5361.6, + "step": 6280 + }, + { + "ce_loss_13": 2.959231287240982, + "ce_loss_26": 2.5172866880893707, + "ce_loss_39": 2.043312183022499, + "ce_loss_52": 1.4159017190337182, + "ce_loss_7": 3.220345306396484, + "epoch": 0.629, + "grad_norm": 20.150529720020295, + "kl_loss_13": 3215.2, + "kl_loss_26": 2296.4, + "kl_loss_39": 1276.2, + "kl_loss_7": 3763.2, + "learning_rate": 0.00030829187271113034, + "loss": 5363.8, + "step": 6290 + }, + { + "ce_loss_13": 2.9990767776966094, + "ce_loss_26": 2.5505200415849685, + "ce_loss_39": 2.060671201348305, + "ce_loss_52": 1.395907147228718, + "ce_loss_7": 3.2725743770599367, + "epoch": 0.63, + "grad_norm": 19.518127150050482, + "kl_loss_13": 3309.2, + "kl_loss_26": 2380.2, + "kl_loss_39": 1347.4, + "kl_loss_7": 3878.4, + "learning_rate": 0.00030682743715343565, + "loss": 5435.15, + "step": 6300 + }, + { + "ce_loss_13": 3.080632323026657, + "ce_loss_26": 2.6302684545516968, + "ce_loss_39": 2.135207986831665, + "ce_loss_52": 1.4774031162261962, + "ce_loss_7": 3.3441965878009796, + "epoch": 0.631, + "grad_norm": 21.06860944259249, + "kl_loss_13": 3302.8, + "kl_loss_26": 2366.6, + "kl_loss_39": 1332.6, + "kl_loss_7": 3862.8, + "learning_rate": 0.0003053649468413043, + "loss": 5425.25, + "step": 6310 + }, + { + "ce_loss_13": 3.0259739339351652, + "ce_loss_26": 2.570155072212219, + "ce_loss_39": 2.078566926717758, + "ce_loss_52": 1.4384133130311967, + "ce_loss_7": 3.288107806444168, + "epoch": 0.632, + "grad_norm": 20.78357258068192, + "kl_loss_13": 3282.0, + "kl_loss_26": 2335.8, + "kl_loss_39": 1293.6, + "kl_loss_7": 3833.2, + "learning_rate": 0.00030390441650199725, + "loss": 5412.2, + "step": 6320 + }, + { + "ce_loss_13": 2.9379296779632567, + "ce_loss_26": 2.49056881070137, + "ce_loss_39": 2.0068995296955108, + "ce_loss_52": 1.3864078581333161, + "ce_loss_7": 3.203891623020172, + "epoch": 0.633, + "grad_norm": 20.174137035703254, + "kl_loss_13": 3206.8, + "kl_loss_26": 2268.4, + "kl_loss_39": 1251.1, + "kl_loss_7": 3762.4, + "learning_rate": 0.00030244586084303903, + "loss": 5352.9, + "step": 6330 + }, + { + "ce_loss_13": 2.9555823683738707, + "ce_loss_26": 2.501230263710022, + "ce_loss_39": 2.0173334002494814, + "ce_loss_52": 1.3891687452793122, + "ce_loss_7": 3.2268748760223387, + "epoch": 0.634, + "grad_norm": 20.047186620123167, + "kl_loss_13": 3264.8, + "kl_loss_26": 2316.6, + "kl_loss_39": 1274.1, + "kl_loss_7": 3843.2, + "learning_rate": 0.00030098929455206903, + "loss": 5365.4, + "step": 6340 + }, + { + "ce_loss_13": 2.9806570291519163, + "ce_loss_26": 2.533248084783554, + "ce_loss_39": 2.0557660490274428, + "ce_loss_52": 1.4183998316526414, + "ce_loss_7": 3.248631852865219, + "epoch": 0.635, + "grad_norm": 19.42792815463783, + "kl_loss_13": 3226.0, + "kl_loss_26": 2301.0, + "kl_loss_39": 1289.2, + "kl_loss_7": 3784.4, + "learning_rate": 0.00029953473229669324, + "loss": 5429.0, + "step": 6350 + }, + { + "ce_loss_13": 3.008418655395508, + "ce_loss_26": 2.5531763255596163, + "ce_loss_39": 2.069964846968651, + "ce_loss_52": 1.4419535219669342, + "ce_loss_7": 3.274876070022583, + "epoch": 0.636, + "grad_norm": 20.040461482265158, + "kl_loss_13": 3248.4, + "kl_loss_26": 2315.4, + "kl_loss_39": 1287.5, + "kl_loss_7": 3802.8, + "learning_rate": 0.00029808218872433767, + "loss": 5390.5, + "step": 6360 + }, + { + "ce_loss_13": 2.9624147057533263, + "ce_loss_26": 2.509669789671898, + "ce_loss_39": 2.017057329416275, + "ce_loss_52": 1.392769531905651, + "ce_loss_7": 3.2405923306941986, + "epoch": 0.637, + "grad_norm": 19.969518630784094, + "kl_loss_13": 3281.6, + "kl_loss_26": 2321.8, + "kl_loss_39": 1284.8, + "kl_loss_7": 3857.6, + "learning_rate": 0.0002966316784621, + "loss": 5344.4, + "step": 6370 + }, + { + "ce_loss_13": 2.9690242230892183, + "ce_loss_26": 2.5119642555713653, + "ce_loss_39": 2.022391200065613, + "ce_loss_52": 1.3881384432315826, + "ce_loss_7": 3.2436072409152983, + "epoch": 0.638, + "grad_norm": 19.443805471212187, + "kl_loss_13": 3260.4, + "kl_loss_26": 2314.8, + "kl_loss_39": 1281.4, + "kl_loss_7": 3835.6, + "learning_rate": 0.0002951832161166024, + "loss": 5333.0, + "step": 6380 + }, + { + "ce_loss_13": 3.019889771938324, + "ce_loss_26": 2.574093183875084, + "ce_loss_39": 2.085198149085045, + "ce_loss_52": 1.4531068801879883, + "ce_loss_7": 3.286811703443527, + "epoch": 0.639, + "grad_norm": 19.50308932074232, + "kl_loss_13": 3246.0, + "kl_loss_26": 2313.6, + "kl_loss_39": 1281.1, + "kl_loss_7": 3800.8, + "learning_rate": 0.0002937368162738445, + "loss": 5358.7, + "step": 6390 + }, + { + "ce_loss_13": 2.979416298866272, + "ce_loss_26": 2.5201680839061735, + "ce_loss_39": 2.0343465119600297, + "ce_loss_52": 1.4125050336122513, + "ce_loss_7": 3.2494026124477386, + "epoch": 0.64, + "grad_norm": 19.887826117374196, + "kl_loss_13": 3261.2, + "kl_loss_26": 2302.8, + "kl_loss_39": 1277.2, + "kl_loss_7": 3827.6, + "learning_rate": 0.0002922924934990568, + "loss": 5361.0, + "step": 6400 + }, + { + "ce_loss_13": 2.966394138336182, + "ce_loss_26": 2.5159747898578644, + "ce_loss_39": 2.0343314677476885, + "ce_loss_52": 1.3983295410871506, + "ce_loss_7": 3.2374337732791902, + "epoch": 0.641, + "grad_norm": 21.205292313379594, + "kl_loss_13": 3264.4, + "kl_loss_26": 2317.8, + "kl_loss_39": 1301.9, + "kl_loss_7": 3828.8, + "learning_rate": 0.0002908502623365536, + "loss": 5348.95, + "step": 6410 + }, + { + "ce_loss_13": 3.0186184704303742, + "ce_loss_26": 2.5620053589344023, + "ce_loss_39": 2.079856187105179, + "ce_loss_52": 1.437650018930435, + "ce_loss_7": 3.2890258550643923, + "epoch": 0.642, + "grad_norm": 20.335253629932936, + "kl_loss_13": 3264.4, + "kl_loss_26": 2325.4, + "kl_loss_39": 1302.1, + "kl_loss_7": 3830.0, + "learning_rate": 0.0002894101373095867, + "loss": 5303.0, + "step": 6420 + }, + { + "ce_loss_13": 3.0677368700504304, + "ce_loss_26": 2.6209602475166323, + "ce_loss_39": 2.1306197196245193, + "ce_loss_52": 1.4929826736450196, + "ce_loss_7": 3.3314808785915373, + "epoch": 0.643, + "grad_norm": 20.265575824381624, + "kl_loss_13": 3289.6, + "kl_loss_26": 2355.4, + "kl_loss_39": 1318.2, + "kl_loss_7": 3835.2, + "learning_rate": 0.00028797213292019926, + "loss": 5380.85, + "step": 6430 + }, + { + "ce_loss_13": 2.966922277212143, + "ce_loss_26": 2.51933411359787, + "ce_loss_39": 2.040864047408104, + "ce_loss_52": 1.4254619121551513, + "ce_loss_7": 3.232788211107254, + "epoch": 0.644, + "grad_norm": 19.660532004484757, + "kl_loss_13": 3209.2, + "kl_loss_26": 2273.0, + "kl_loss_39": 1260.8, + "kl_loss_7": 3770.0, + "learning_rate": 0.0002865362636490791, + "loss": 5309.3, + "step": 6440 + }, + { + "ce_loss_13": 3.0011440992355345, + "ce_loss_26": 2.5491324365139008, + "ce_loss_39": 2.057476672530174, + "ce_loss_52": 1.4156641319394112, + "ce_loss_7": 3.2698193073272703, + "epoch": 0.645, + "grad_norm": 20.528114278014176, + "kl_loss_13": 3284.8, + "kl_loss_26": 2344.4, + "kl_loss_39": 1307.4, + "kl_loss_7": 3853.6, + "learning_rate": 0.0002851025439554142, + "loss": 5329.3, + "step": 6450 + }, + { + "ce_loss_13": 3.0585977435112, + "ce_loss_26": 2.593171867728233, + "ce_loss_39": 2.1089387238025665, + "ce_loss_52": 1.4590320155024528, + "ce_loss_7": 3.3243721425533295, + "epoch": 0.646, + "grad_norm": 19.177895864651298, + "kl_loss_13": 3308.0, + "kl_loss_26": 2342.6, + "kl_loss_39": 1306.8, + "kl_loss_7": 3858.4, + "learning_rate": 0.00028367098827674573, + "loss": 5399.4, + "step": 6460 + }, + { + "ce_loss_13": 3.0068358182907104, + "ce_loss_26": 2.559490966796875, + "ce_loss_39": 2.0818791508674623, + "ce_loss_52": 1.4510639190673829, + "ce_loss_7": 3.2726912021636965, + "epoch": 0.647, + "grad_norm": 20.001351203231668, + "kl_loss_13": 3231.2, + "kl_loss_26": 2299.0, + "kl_loss_39": 1281.6, + "kl_loss_7": 3785.2, + "learning_rate": 0.00028224161102882397, + "loss": 5353.5, + "step": 6470 + }, + { + "ce_loss_13": 3.0042510509490965, + "ce_loss_26": 2.545697581768036, + "ce_loss_39": 2.0517130315303804, + "ce_loss_52": 1.4082761898636817, + "ce_loss_7": 3.2736884713172913, + "epoch": 0.648, + "grad_norm": 19.176592480543317, + "kl_loss_13": 3304.8, + "kl_loss_26": 2344.6, + "kl_loss_39": 1308.6, + "kl_loss_7": 3870.8, + "learning_rate": 0.00028081442660546124, + "loss": 5357.45, + "step": 6480 + }, + { + "ce_loss_13": 2.9604024648666383, + "ce_loss_26": 2.5067259430885316, + "ce_loss_39": 2.0217309921979902, + "ce_loss_52": 1.396033638715744, + "ce_loss_7": 3.229094612598419, + "epoch": 0.649, + "grad_norm": 20.132685826085943, + "kl_loss_13": 3230.8, + "kl_loss_26": 2292.8, + "kl_loss_39": 1272.4, + "kl_loss_7": 3788.8, + "learning_rate": 0.0002793894493783892, + "loss": 5337.2, + "step": 6490 + }, + { + "ce_loss_13": 3.038835954666138, + "ce_loss_26": 2.5828086912631987, + "ce_loss_39": 2.094580352306366, + "ce_loss_52": 1.4504825562238692, + "ce_loss_7": 3.3101982474327087, + "epoch": 0.65, + "grad_norm": 20.26204607317675, + "kl_loss_13": 3292.0, + "kl_loss_26": 2347.2, + "kl_loss_39": 1312.5, + "kl_loss_7": 3857.2, + "learning_rate": 0.0002779666936971129, + "loss": 5341.9, + "step": 6500 + }, + { + "ce_loss_13": 2.981122875213623, + "ce_loss_26": 2.5436301648616793, + "ce_loss_39": 2.06348480284214, + "ce_loss_52": 1.4388723462820052, + "ce_loss_7": 3.2536712110042574, + "epoch": 0.651, + "grad_norm": 19.527282619134503, + "kl_loss_13": 3203.2, + "kl_loss_26": 2279.0, + "kl_loss_39": 1260.5, + "kl_loss_7": 3763.6, + "learning_rate": 0.00027654617388876614, + "loss": 5303.55, + "step": 6510 + }, + { + "ce_loss_13": 2.985329604148865, + "ce_loss_26": 2.5429716140031813, + "ce_loss_39": 2.0698666363954543, + "ce_loss_52": 1.43270433396101, + "ce_loss_7": 3.2484419345855713, + "epoch": 0.652, + "grad_norm": 19.361430489337188, + "kl_loss_13": 3187.2, + "kl_loss_26": 2277.2, + "kl_loss_39": 1278.4, + "kl_loss_7": 3732.4, + "learning_rate": 0.0002751279042579672, + "loss": 5316.3, + "step": 6520 + }, + { + "ce_loss_13": 2.9774204194545746, + "ce_loss_26": 2.5208486020565033, + "ce_loss_39": 2.0273024052381516, + "ce_loss_52": 1.402597150206566, + "ce_loss_7": 3.249239844083786, + "epoch": 0.653, + "grad_norm": 19.527542591573294, + "kl_loss_13": 3262.4, + "kl_loss_26": 2313.8, + "kl_loss_39": 1281.3, + "kl_loss_7": 3834.0, + "learning_rate": 0.00027371189908667604, + "loss": 5336.1, + "step": 6530 + }, + { + "ce_loss_13": 3.003592276573181, + "ce_loss_26": 2.5556287467479706, + "ce_loss_39": 2.075370451807976, + "ce_loss_52": 1.4352567225694657, + "ce_loss_7": 3.271352219581604, + "epoch": 0.654, + "grad_norm": 19.924679150402795, + "kl_loss_13": 3252.0, + "kl_loss_26": 2323.8, + "kl_loss_39": 1299.3, + "kl_loss_7": 3811.2, + "learning_rate": 0.00027229817263404863, + "loss": 5288.8, + "step": 6540 + }, + { + "ce_loss_13": 2.995857471227646, + "ce_loss_26": 2.5282726138830185, + "ce_loss_39": 2.0456599622964857, + "ce_loss_52": 1.4150889962911606, + "ce_loss_7": 3.259833812713623, + "epoch": 0.655, + "grad_norm": 19.56917393810092, + "kl_loss_13": 3242.4, + "kl_loss_26": 2291.2, + "kl_loss_39": 1266.0, + "kl_loss_7": 3806.4, + "learning_rate": 0.0002708867391362948, + "loss": 5328.1, + "step": 6550 + }, + { + "ce_loss_13": 3.004334282875061, + "ce_loss_26": 2.553117799758911, + "ce_loss_39": 2.0656849920749663, + "ce_loss_52": 1.4380108654499053, + "ce_loss_7": 3.275963246822357, + "epoch": 0.656, + "grad_norm": 19.95397617147254, + "kl_loss_13": 3232.8, + "kl_loss_26": 2292.0, + "kl_loss_39": 1262.4, + "kl_loss_7": 3798.0, + "learning_rate": 0.0002694776128065345, + "loss": 5289.15, + "step": 6560 + }, + { + "ce_loss_13": 3.0339253902435304, + "ce_loss_26": 2.5763088524341584, + "ce_loss_39": 2.0788813173770904, + "ce_loss_52": 1.4449120432138443, + "ce_loss_7": 3.3040917217731476, + "epoch": 0.657, + "grad_norm": 20.212973328584127, + "kl_loss_13": 3268.4, + "kl_loss_26": 2314.6, + "kl_loss_39": 1279.8, + "kl_loss_7": 3840.8, + "learning_rate": 0.00026807080783465374, + "loss": 5293.2, + "step": 6570 + }, + { + "ce_loss_13": 3.032737511396408, + "ce_loss_26": 2.5693029284477236, + "ce_loss_39": 2.081709760427475, + "ce_loss_52": 1.4346143543720244, + "ce_loss_7": 3.308923304080963, + "epoch": 0.658, + "grad_norm": 19.880750606313658, + "kl_loss_13": 3330.4, + "kl_loss_26": 2365.8, + "kl_loss_39": 1316.5, + "kl_loss_7": 3904.0, + "learning_rate": 0.00026666633838716316, + "loss": 5330.1, + "step": 6580 + }, + { + "ce_loss_13": 3.0193063259124755, + "ce_loss_26": 2.576409709453583, + "ce_loss_39": 2.0992994725704195, + "ce_loss_52": 1.4646779403090477, + "ce_loss_7": 3.285114985704422, + "epoch": 0.659, + "grad_norm": 20.363457370030773, + "kl_loss_13": 3237.2, + "kl_loss_26": 2313.0, + "kl_loss_39": 1303.8, + "kl_loss_7": 3788.4, + "learning_rate": 0.00026526421860705474, + "loss": 5307.4, + "step": 6590 + }, + { + "ce_loss_13": 2.9948873639106752, + "ce_loss_26": 2.563684010505676, + "ce_loss_39": 2.0847960352897643, + "ce_loss_52": 1.4657811507582665, + "ce_loss_7": 3.2635042905807494, + "epoch": 0.66, + "grad_norm": 20.73724151705583, + "kl_loss_13": 3176.8, + "kl_loss_26": 2267.2, + "kl_loss_39": 1259.0, + "kl_loss_7": 3732.8, + "learning_rate": 0.0002638644626136587, + "loss": 5326.5, + "step": 6600 + }, + { + "ce_loss_13": 3.007009822130203, + "ce_loss_26": 2.5652425408363344, + "ce_loss_39": 2.085008403658867, + "ce_loss_52": 1.4418020695447922, + "ce_loss_7": 3.2768814861774445, + "epoch": 0.661, + "grad_norm": 19.59865222649701, + "kl_loss_13": 3192.4, + "kl_loss_26": 2272.6, + "kl_loss_39": 1265.8, + "kl_loss_7": 3758.8, + "learning_rate": 0.00026246708450250255, + "loss": 5252.1, + "step": 6610 + }, + { + "ce_loss_13": 3.0007415533065798, + "ce_loss_26": 2.57885719537735, + "ce_loss_39": 2.1149094998836517, + "ce_loss_52": 1.4889407217502595, + "ce_loss_7": 3.26429398059845, + "epoch": 0.662, + "grad_norm": 19.791378915341742, + "kl_loss_13": 3154.4, + "kl_loss_26": 2261.8, + "kl_loss_39": 1265.9, + "kl_loss_7": 3702.8, + "learning_rate": 0.00026107209834516854, + "loss": 5253.75, + "step": 6620 + }, + { + "ce_loss_13": 3.023489362001419, + "ce_loss_26": 2.565022760629654, + "ce_loss_39": 2.0684966832399367, + "ce_loss_52": 1.4181891351938247, + "ce_loss_7": 3.295238083600998, + "epoch": 0.663, + "grad_norm": 18.83951798457028, + "kl_loss_13": 3308.8, + "kl_loss_26": 2356.8, + "kl_loss_39": 1317.4, + "kl_loss_7": 3874.4, + "learning_rate": 0.0002596795181891514, + "loss": 5303.2, + "step": 6630 + }, + { + "ce_loss_13": 2.948693299293518, + "ce_loss_26": 2.511568069458008, + "ce_loss_39": 2.0302646070718766, + "ce_loss_52": 1.40511611700058, + "ce_loss_7": 3.215245670080185, + "epoch": 0.664, + "grad_norm": 19.938510106571005, + "kl_loss_13": 3190.8, + "kl_loss_26": 2276.8, + "kl_loss_39": 1269.2, + "kl_loss_7": 3747.6, + "learning_rate": 0.000258289358057718, + "loss": 5355.55, + "step": 6640 + }, + { + "ce_loss_13": 2.964204251766205, + "ce_loss_26": 2.5196115612983703, + "ce_loss_39": 2.032116264104843, + "ce_loss_52": 1.397587490081787, + "ce_loss_7": 3.234144788980484, + "epoch": 0.665, + "grad_norm": 19.688819745922057, + "kl_loss_13": 3241.6, + "kl_loss_26": 2310.8, + "kl_loss_39": 1285.8, + "kl_loss_7": 3812.0, + "learning_rate": 0.0002569016319497657, + "loss": 5275.7, + "step": 6650 + }, + { + "ce_loss_13": 3.0219891548156737, + "ce_loss_26": 2.568303269147873, + "ce_loss_39": 2.0779166162014007, + "ce_loss_52": 1.4420379608869554, + "ce_loss_7": 3.2859797060489653, + "epoch": 0.666, + "grad_norm": 19.909960186263373, + "kl_loss_13": 3259.2, + "kl_loss_26": 2308.2, + "kl_loss_39": 1277.3, + "kl_loss_7": 3822.8, + "learning_rate": 0.00025551635383968066, + "loss": 5336.5, + "step": 6660 + }, + { + "ce_loss_13": 2.994647592306137, + "ce_loss_26": 2.5398232668638228, + "ce_loss_39": 2.0492498099803926, + "ce_loss_52": 1.434773786365986, + "ce_loss_7": 3.264130574464798, + "epoch": 0.667, + "grad_norm": 20.004167920061033, + "kl_loss_13": 3214.0, + "kl_loss_26": 2272.2, + "kl_loss_39": 1256.6, + "kl_loss_7": 3778.0, + "learning_rate": 0.00025413353767719804, + "loss": 5257.5, + "step": 6670 + }, + { + "ce_loss_13": 2.96993693113327, + "ce_loss_26": 2.533370888233185, + "ce_loss_39": 2.055029663443565, + "ce_loss_52": 1.4544675678014756, + "ce_loss_7": 3.2342797338962557, + "epoch": 0.668, + "grad_norm": 20.025849444564383, + "kl_loss_13": 3142.4, + "kl_loss_26": 2230.8, + "kl_loss_39": 1222.0, + "kl_loss_7": 3698.4, + "learning_rate": 0.0002527531973872617, + "loss": 5248.5, + "step": 6680 + }, + { + "ce_loss_13": 2.9402814984321592, + "ce_loss_26": 2.4974717676639555, + "ce_loss_39": 2.0207353264093397, + "ce_loss_52": 1.4053010821342469, + "ce_loss_7": 3.211796945333481, + "epoch": 0.669, + "grad_norm": 20.57900906104847, + "kl_loss_13": 3184.0, + "kl_loss_26": 2262.6, + "kl_loss_39": 1252.7, + "kl_loss_7": 3753.6, + "learning_rate": 0.0002513753468698826, + "loss": 5296.7, + "step": 6690 + }, + { + "ce_loss_13": 3.049540191888809, + "ce_loss_26": 2.5854183793067933, + "ce_loss_39": 2.092196524143219, + "ce_loss_52": 1.4520713061094284, + "ce_loss_7": 3.3153574585914614, + "epoch": 0.67, + "grad_norm": 20.064255813843516, + "kl_loss_13": 3277.6, + "kl_loss_26": 2320.0, + "kl_loss_39": 1288.6, + "kl_loss_7": 3837.2, + "learning_rate": 0.0002500000000000001, + "loss": 5320.3, + "step": 6700 + }, + { + "ce_loss_13": 2.944129317998886, + "ce_loss_26": 2.503596860170364, + "ce_loss_39": 2.0286095440387726, + "ce_loss_52": 1.4282706409692765, + "ce_loss_7": 3.2095106482505797, + "epoch": 0.671, + "grad_norm": 20.12603370941262, + "kl_loss_13": 3178.8, + "kl_loss_26": 2257.6, + "kl_loss_39": 1240.0, + "kl_loss_7": 3734.8, + "learning_rate": 0.0002486271706273421, + "loss": 5232.2, + "step": 6710 + }, + { + "ce_loss_13": 2.9713922500610352, + "ce_loss_26": 2.5247735172510146, + "ce_loss_39": 2.050862190127373, + "ce_loss_52": 1.449659252166748, + "ce_loss_7": 3.2344084203243257, + "epoch": 0.672, + "grad_norm": 20.72667693004533, + "kl_loss_13": 3149.6, + "kl_loss_26": 2225.2, + "kl_loss_39": 1217.0, + "kl_loss_7": 3703.2, + "learning_rate": 0.0002472568725762853, + "loss": 5273.45, + "step": 6720 + }, + { + "ce_loss_13": 2.9712363362312315, + "ce_loss_26": 2.5258888751268387, + "ce_loss_39": 2.038512706756592, + "ce_loss_52": 1.4098822742700576, + "ce_loss_7": 3.238377648591995, + "epoch": 0.673, + "grad_norm": 19.364942978516346, + "kl_loss_13": 3241.6, + "kl_loss_26": 2313.0, + "kl_loss_39": 1283.6, + "kl_loss_7": 3804.0, + "learning_rate": 0.00024588911964571554, + "loss": 5264.25, + "step": 6730 + }, + { + "ce_loss_13": 3.0029661655426025, + "ce_loss_26": 2.5618400514125823, + "ce_loss_39": 2.0779304295778274, + "ce_loss_52": 1.4597969472408294, + "ce_loss_7": 3.2699401795864107, + "epoch": 0.674, + "grad_norm": 19.386861595432553, + "kl_loss_13": 3201.2, + "kl_loss_26": 2283.2, + "kl_loss_39": 1260.7, + "kl_loss_7": 3760.0, + "learning_rate": 0.00024452392560888974, + "loss": 5256.1, + "step": 6740 + }, + { + "ce_loss_13": 2.9799251735210417, + "ce_loss_26": 2.5340505450963975, + "ce_loss_39": 2.049144822359085, + "ce_loss_52": 1.4137116000056267, + "ce_loss_7": 3.257823657989502, + "epoch": 0.675, + "grad_norm": 19.909504320065746, + "kl_loss_13": 3246.4, + "kl_loss_26": 2320.6, + "kl_loss_39": 1295.2, + "kl_loss_7": 3816.4, + "learning_rate": 0.00024316130421329695, + "loss": 5221.1, + "step": 6750 + }, + { + "ce_loss_13": 2.9629843533039093, + "ce_loss_26": 2.524860253930092, + "ce_loss_39": 2.042747235298157, + "ce_loss_52": 1.4334406018257142, + "ce_loss_7": 3.2345532715320586, + "epoch": 0.676, + "grad_norm": 20.369497806638545, + "kl_loss_13": 3186.8, + "kl_loss_26": 2271.6, + "kl_loss_39": 1245.2, + "kl_loss_7": 3754.4, + "learning_rate": 0.00024180126918051909, + "loss": 5236.3, + "step": 6760 + }, + { + "ce_loss_13": 2.9732554376125337, + "ce_loss_26": 2.5324235647916793, + "ce_loss_39": 2.0435447841882706, + "ce_loss_52": 1.4181665301322937, + "ce_loss_7": 3.2470718741416933, + "epoch": 0.677, + "grad_norm": 20.49515152965971, + "kl_loss_13": 3219.2, + "kl_loss_26": 2299.4, + "kl_loss_39": 1278.7, + "kl_loss_7": 3786.8, + "learning_rate": 0.00024044383420609406, + "loss": 5319.65, + "step": 6770 + }, + { + "ce_loss_13": 2.9884051978588104, + "ce_loss_26": 2.552768051624298, + "ce_loss_39": 2.0788974314928055, + "ce_loss_52": 1.460537651181221, + "ce_loss_7": 3.251654601097107, + "epoch": 0.678, + "grad_norm": 19.11355169384201, + "kl_loss_13": 3167.6, + "kl_loss_26": 2248.8, + "kl_loss_39": 1249.3, + "kl_loss_7": 3718.4, + "learning_rate": 0.00023908901295937712, + "loss": 5270.2, + "step": 6780 + }, + { + "ce_loss_13": 2.974563705921173, + "ce_loss_26": 2.536205679178238, + "ce_loss_39": 2.059639421105385, + "ce_loss_52": 1.4519853800535203, + "ce_loss_7": 3.2307840466499327, + "epoch": 0.679, + "grad_norm": 19.49943649381752, + "kl_loss_13": 3131.2, + "kl_loss_26": 2224.6, + "kl_loss_39": 1227.3, + "kl_loss_7": 3672.8, + "learning_rate": 0.00023773681908340283, + "loss": 5293.35, + "step": 6790 + }, + { + "ce_loss_13": 2.961294001340866, + "ce_loss_26": 2.5129422783851623, + "ce_loss_39": 2.0338284403085707, + "ce_loss_52": 1.409618005156517, + "ce_loss_7": 3.226576966047287, + "epoch": 0.68, + "grad_norm": 19.714496760455777, + "kl_loss_13": 3218.4, + "kl_loss_26": 2291.6, + "kl_loss_39": 1267.1, + "kl_loss_7": 3768.8, + "learning_rate": 0.00023638726619474876, + "loss": 5250.5, + "step": 6800 + }, + { + "ce_loss_13": 3.0715033173561097, + "ce_loss_26": 2.626942425966263, + "ce_loss_39": 2.1481954157352448, + "ce_loss_52": 1.5187551528215408, + "ce_loss_7": 3.336813968420029, + "epoch": 0.681, + "grad_norm": 19.805927066338636, + "kl_loss_13": 3238.0, + "kl_loss_26": 2300.0, + "kl_loss_39": 1278.2, + "kl_loss_7": 3790.0, + "learning_rate": 0.0002350403678833976, + "loss": 5234.9, + "step": 6810 + }, + { + "ce_loss_13": 2.957927519083023, + "ce_loss_26": 2.5146523237228395, + "ce_loss_39": 2.0373351722955704, + "ce_loss_52": 1.42108353972435, + "ce_loss_7": 3.223799991607666, + "epoch": 0.682, + "grad_norm": 20.479900710283268, + "kl_loss_13": 3202.4, + "kl_loss_26": 2281.6, + "kl_loss_39": 1265.3, + "kl_loss_7": 3751.2, + "learning_rate": 0.00023369613771260007, + "loss": 5258.8, + "step": 6820 + }, + { + "ce_loss_13": 2.9837976515293123, + "ce_loss_26": 2.5472346246242523, + "ce_loss_39": 2.0789157301187515, + "ce_loss_52": 1.4713156789541244, + "ce_loss_7": 3.2464165806770326, + "epoch": 0.683, + "grad_norm": 19.479700971519588, + "kl_loss_13": 3160.0, + "kl_loss_26": 2251.4, + "kl_loss_39": 1251.2, + "kl_loss_7": 3708.8, + "learning_rate": 0.00023235458921873925, + "loss": 5205.3, + "step": 6830 + }, + { + "ce_loss_13": 2.9887463808059693, + "ce_loss_26": 2.54727523624897, + "ce_loss_39": 2.0671548724174498, + "ce_loss_52": 1.429240283370018, + "ce_loss_7": 3.2527148902416227, + "epoch": 0.684, + "grad_norm": 19.517242754730322, + "kl_loss_13": 3196.4, + "kl_loss_26": 2285.2, + "kl_loss_39": 1272.4, + "kl_loss_7": 3748.4, + "learning_rate": 0.0002310157359111938, + "loss": 5234.8, + "step": 6840 + }, + { + "ce_loss_13": 2.916054058074951, + "ce_loss_26": 2.4669763922691343, + "ce_loss_39": 1.992121958732605, + "ce_loss_52": 1.390305233001709, + "ce_loss_7": 3.179300290346146, + "epoch": 0.685, + "grad_norm": 20.2160173629293, + "kl_loss_13": 3168.4, + "kl_loss_26": 2234.6, + "kl_loss_39": 1230.6, + "kl_loss_7": 3719.2, + "learning_rate": 0.0002296795912722014, + "loss": 5214.55, + "step": 6850 + }, + { + "ce_loss_13": 2.9123338878154756, + "ce_loss_26": 2.4650843650102616, + "ce_loss_39": 1.9903331339359283, + "ce_loss_52": 1.3830609425902367, + "ce_loss_7": 3.174854850769043, + "epoch": 0.686, + "grad_norm": 19.3573154940235, + "kl_loss_13": 3154.0, + "kl_loss_26": 2228.4, + "kl_loss_39": 1226.6, + "kl_loss_7": 3712.0, + "learning_rate": 0.0002283461687567236, + "loss": 5186.2, + "step": 6860 + }, + { + "ce_loss_13": 2.9503078758716583, + "ce_loss_26": 2.5050206154584886, + "ce_loss_39": 2.0353992134332657, + "ce_loss_52": 1.4260726869106293, + "ce_loss_7": 3.2181301593780516, + "epoch": 0.687, + "grad_norm": 19.436405773017547, + "kl_loss_13": 3173.6, + "kl_loss_26": 2249.4, + "kl_loss_39": 1239.4, + "kl_loss_7": 3725.2, + "learning_rate": 0.00022701548179231045, + "loss": 5180.9, + "step": 6870 + }, + { + "ce_loss_13": 2.989210718870163, + "ce_loss_26": 2.5456956744194033, + "ce_loss_39": 2.07299542427063, + "ce_loss_52": 1.45269995033741, + "ce_loss_7": 3.25973704457283, + "epoch": 0.688, + "grad_norm": 19.21415326438658, + "kl_loss_13": 3172.0, + "kl_loss_26": 2245.6, + "kl_loss_39": 1252.7, + "kl_loss_7": 3735.6, + "learning_rate": 0.00022568754377896516, + "loss": 5258.6, + "step": 6880 + }, + { + "ce_loss_13": 2.9914295256137846, + "ce_loss_26": 2.5424347430467606, + "ce_loss_39": 2.0552540928125382, + "ce_loss_52": 1.4221117675304413, + "ce_loss_7": 3.2662317156791687, + "epoch": 0.689, + "grad_norm": 19.29554445155232, + "kl_loss_13": 3243.2, + "kl_loss_26": 2310.8, + "kl_loss_39": 1282.1, + "kl_loss_7": 3819.2, + "learning_rate": 0.00022436236808900844, + "loss": 5241.3, + "step": 6890 + }, + { + "ce_loss_13": 2.9910283386707306, + "ce_loss_26": 2.550189185142517, + "ce_loss_39": 2.07351476252079, + "ce_loss_52": 1.4624590903520585, + "ce_loss_7": 3.2594147861003875, + "epoch": 0.69, + "grad_norm": 19.896412241424787, + "kl_loss_13": 3197.2, + "kl_loss_26": 2269.0, + "kl_loss_39": 1261.5, + "kl_loss_7": 3754.8, + "learning_rate": 0.00022303996806694487, + "loss": 5245.0, + "step": 6900 + }, + { + "ce_loss_13": 2.9984730899333956, + "ce_loss_26": 2.563878893852234, + "ce_loss_39": 2.073988217115402, + "ce_loss_52": 1.4545040100812912, + "ce_loss_7": 3.266710376739502, + "epoch": 0.691, + "grad_norm": 18.309749884904267, + "kl_loss_13": 3220.8, + "kl_loss_26": 2301.4, + "kl_loss_39": 1269.6, + "kl_loss_7": 3776.4, + "learning_rate": 0.00022172035702932823, + "loss": 5221.5, + "step": 6910 + }, + { + "ce_loss_13": 2.9589293122291567, + "ce_loss_26": 2.5164781630039217, + "ce_loss_39": 2.045464962720871, + "ce_loss_52": 1.4442868947982788, + "ce_loss_7": 3.2192385673522947, + "epoch": 0.692, + "grad_norm": 19.310059013922874, + "kl_loss_13": 3138.4, + "kl_loss_26": 2227.4, + "kl_loss_39": 1231.9, + "kl_loss_7": 3680.8, + "learning_rate": 0.00022040354826462666, + "loss": 5190.7, + "step": 6920 + }, + { + "ce_loss_13": 2.947145390510559, + "ce_loss_26": 2.5085155785083773, + "ce_loss_39": 2.0424805164337156, + "ce_loss_52": 1.443164300918579, + "ce_loss_7": 3.206177592277527, + "epoch": 0.693, + "grad_norm": 20.439949582745886, + "kl_loss_13": 3135.6, + "kl_loss_26": 2219.4, + "kl_loss_39": 1222.2, + "kl_loss_7": 3681.2, + "learning_rate": 0.0002190895550330899, + "loss": 5252.8, + "step": 6930 + }, + { + "ce_loss_13": 2.951523560285568, + "ce_loss_26": 2.4968682497739794, + "ce_loss_39": 2.021780180931091, + "ce_loss_52": 1.4140155717730523, + "ce_loss_7": 3.2223109781742094, + "epoch": 0.694, + "grad_norm": 19.632700567273133, + "kl_loss_13": 3171.6, + "kl_loss_26": 2224.6, + "kl_loss_39": 1221.6, + "kl_loss_7": 3730.8, + "learning_rate": 0.00021777839056661552, + "loss": 5204.95, + "step": 6940 + }, + { + "ce_loss_13": 2.9996495246887207, + "ce_loss_26": 2.545914036035538, + "ce_loss_39": 2.0629312634468078, + "ce_loss_52": 1.4569539099931716, + "ce_loss_7": 3.260616344213486, + "epoch": 0.695, + "grad_norm": 19.802798519542876, + "kl_loss_13": 3188.0, + "kl_loss_26": 2252.8, + "kl_loss_39": 1238.7, + "kl_loss_7": 3738.8, + "learning_rate": 0.0002164700680686147, + "loss": 5219.0, + "step": 6950 + }, + { + "ce_loss_13": 2.965209072828293, + "ce_loss_26": 2.523294594883919, + "ce_loss_39": 2.0499251425266265, + "ce_loss_52": 1.450638398528099, + "ce_loss_7": 3.224820476770401, + "epoch": 0.696, + "grad_norm": 19.91434345309615, + "kl_loss_13": 3134.0, + "kl_loss_26": 2218.0, + "kl_loss_39": 1215.6, + "kl_loss_7": 3669.6, + "learning_rate": 0.0002151646007138806, + "loss": 5247.2, + "step": 6960 + }, + { + "ce_loss_13": 2.989179176092148, + "ce_loss_26": 2.5318516552448274, + "ce_loss_39": 2.049258217215538, + "ce_loss_52": 1.4338771492242812, + "ce_loss_7": 3.2527658343315125, + "epoch": 0.697, + "grad_norm": 19.22493467335688, + "kl_loss_13": 3224.0, + "kl_loss_26": 2281.6, + "kl_loss_39": 1260.3, + "kl_loss_7": 3768.8, + "learning_rate": 0.00021386200164845526, + "loss": 5208.2, + "step": 6970 + }, + { + "ce_loss_13": 2.9675691723823547, + "ce_loss_26": 2.524250292778015, + "ce_loss_39": 2.0449122846126557, + "ce_loss_52": 1.4266346216201782, + "ce_loss_7": 3.238176566362381, + "epoch": 0.698, + "grad_norm": 19.28582875590594, + "kl_loss_13": 3208.4, + "kl_loss_26": 2279.6, + "kl_loss_39": 1258.2, + "kl_loss_7": 3762.8, + "learning_rate": 0.0002125622839894964, + "loss": 5196.95, + "step": 6980 + }, + { + "ce_loss_13": 3.08748916387558, + "ce_loss_26": 2.6341689109802244, + "ce_loss_39": 2.131711891293526, + "ce_loss_52": 1.4756682693958283, + "ce_loss_7": 3.3573212742805483, + "epoch": 0.699, + "grad_norm": 19.570816095455605, + "kl_loss_13": 3319.2, + "kl_loss_26": 2375.0, + "kl_loss_39": 1332.9, + "kl_loss_7": 3878.0, + "learning_rate": 0.00021126546082514663, + "loss": 5264.6, + "step": 6990 + }, + { + "ce_loss_13": 2.9623800575733186, + "ce_loss_26": 2.528759664297104, + "ce_loss_39": 2.0456511676311493, + "ce_loss_52": 1.4381566911935806, + "ce_loss_7": 3.2283441185951234, + "epoch": 0.7, + "grad_norm": 20.0544191043279, + "kl_loss_13": 3144.4, + "kl_loss_26": 2245.4, + "kl_loss_39": 1237.7, + "kl_loss_7": 3701.6, + "learning_rate": 0.00020997154521440098, + "loss": 5184.75, + "step": 7000 + }, + { + "ce_loss_13": 2.9258078813552855, + "ce_loss_26": 2.5000339925289152, + "ce_loss_39": 2.030132883787155, + "ce_loss_52": 1.432218487560749, + "ce_loss_7": 3.1859234631061555, + "epoch": 0.701, + "grad_norm": 20.032367176949514, + "kl_loss_13": 3112.4, + "kl_loss_26": 2211.2, + "kl_loss_39": 1212.0, + "kl_loss_7": 3653.6, + "learning_rate": 0.0002086805501869749, + "loss": 5163.7, + "step": 7010 + }, + { + "ce_loss_13": 2.9877611219882967, + "ce_loss_26": 2.5467711210250856, + "ce_loss_39": 2.0729553580284117, + "ce_loss_52": 1.4694935828447342, + "ce_loss_7": 3.2482242822647094, + "epoch": 0.702, + "grad_norm": 19.48021495423088, + "kl_loss_13": 3139.2, + "kl_loss_26": 2221.2, + "kl_loss_39": 1226.6, + "kl_loss_7": 3688.4, + "learning_rate": 0.0002073924887431744, + "loss": 5172.1, + "step": 7020 + }, + { + "ce_loss_13": 2.908235615491867, + "ce_loss_26": 2.477229207754135, + "ce_loss_39": 2.0130053520202638, + "ce_loss_52": 1.4191944628953934, + "ce_loss_7": 3.1711674451828005, + "epoch": 0.703, + "grad_norm": 19.67889818109448, + "kl_loss_13": 3069.6, + "kl_loss_26": 2175.0, + "kl_loss_39": 1200.3, + "kl_loss_7": 3607.2, + "learning_rate": 0.00020610737385376348, + "loss": 5178.0, + "step": 7030 + }, + { + "ce_loss_13": 2.925868648290634, + "ce_loss_26": 2.4821185052394865, + "ce_loss_39": 2.013263535499573, + "ce_loss_52": 1.407904815673828, + "ce_loss_7": 3.184937173128128, + "epoch": 0.704, + "grad_norm": 19.315710978547724, + "kl_loss_13": 3152.4, + "kl_loss_26": 2238.2, + "kl_loss_39": 1233.7, + "kl_loss_7": 3694.8, + "learning_rate": 0.00020482521845983521, + "loss": 5182.5, + "step": 7040 + }, + { + "ce_loss_13": 2.978561645746231, + "ce_loss_26": 2.537975686788559, + "ce_loss_39": 2.068689134716988, + "ce_loss_52": 1.449235063791275, + "ce_loss_7": 3.239302319288254, + "epoch": 0.705, + "grad_norm": 20.022921411442997, + "kl_loss_13": 3163.2, + "kl_loss_26": 2247.8, + "kl_loss_39": 1249.9, + "kl_loss_7": 3709.6, + "learning_rate": 0.00020354603547267987, + "loss": 5191.65, + "step": 7050 + }, + { + "ce_loss_13": 2.926995551586151, + "ce_loss_26": 2.4728329688310624, + "ce_loss_39": 2.000428321957588, + "ce_loss_52": 1.4053510591387748, + "ce_loss_7": 3.1873682618141173, + "epoch": 0.706, + "grad_norm": 20.203929178538456, + "kl_loss_13": 3170.4, + "kl_loss_26": 2222.2, + "kl_loss_39": 1210.8, + "kl_loss_7": 3714.0, + "learning_rate": 0.00020226983777365604, + "loss": 5154.3, + "step": 7060 + }, + { + "ce_loss_13": 2.9693270325660706, + "ce_loss_26": 2.520361030101776, + "ce_loss_39": 2.040296342968941, + "ce_loss_52": 1.4353806316852569, + "ce_loss_7": 3.2412941575050356, + "epoch": 0.707, + "grad_norm": 19.721352799273095, + "kl_loss_13": 3196.4, + "kl_loss_26": 2259.8, + "kl_loss_39": 1250.8, + "kl_loss_7": 3758.8, + "learning_rate": 0.00020099663821406056, + "loss": 5217.7, + "step": 7070 + }, + { + "ce_loss_13": 2.9820376515388487, + "ce_loss_26": 2.5476091861724854, + "ce_loss_39": 2.072761395573616, + "ce_loss_52": 1.4543047964572906, + "ce_loss_7": 3.2506080687046053, + "epoch": 0.708, + "grad_norm": 20.324232804485565, + "kl_loss_13": 3159.6, + "kl_loss_26": 2249.8, + "kl_loss_39": 1247.5, + "kl_loss_7": 3714.4, + "learning_rate": 0.00019972644961499853, + "loss": 5197.1, + "step": 7080 + }, + { + "ce_loss_13": 2.9332118809223173, + "ce_loss_26": 2.488256406784058, + "ce_loss_39": 2.0110603511333465, + "ce_loss_52": 1.4073437690734862, + "ce_loss_7": 3.202910542488098, + "epoch": 0.709, + "grad_norm": 19.810367107777207, + "kl_loss_13": 3178.8, + "kl_loss_26": 2255.4, + "kl_loss_39": 1244.9, + "kl_loss_7": 3745.2, + "learning_rate": 0.00019845928476725522, + "loss": 5159.15, + "step": 7090 + }, + { + "ce_loss_13": 2.963654935359955, + "ce_loss_26": 2.530976951122284, + "ce_loss_39": 2.0635117918252943, + "ce_loss_52": 1.466596108675003, + "ce_loss_7": 3.219101697206497, + "epoch": 0.71, + "grad_norm": 20.1039471333501, + "kl_loss_13": 3130.4, + "kl_loss_26": 2221.8, + "kl_loss_39": 1222.3, + "kl_loss_7": 3668.4, + "learning_rate": 0.00019719515643116677, + "loss": 5138.7, + "step": 7100 + }, + { + "ce_loss_13": 2.9432359755039217, + "ce_loss_26": 2.497118225693703, + "ce_loss_39": 2.0123680919408797, + "ce_loss_52": 1.3938605546951295, + "ce_loss_7": 3.214134621620178, + "epoch": 0.711, + "grad_norm": 20.875475364068677, + "kl_loss_13": 3169.6, + "kl_loss_26": 2236.6, + "kl_loss_39": 1232.8, + "kl_loss_7": 3728.8, + "learning_rate": 0.0001959340773364911, + "loss": 5177.25, + "step": 7110 + }, + { + "ce_loss_13": 2.937902510166168, + "ce_loss_26": 2.5005611896514894, + "ce_loss_39": 2.025138959288597, + "ce_loss_52": 1.4121045261621474, + "ce_loss_7": 3.21354022026062, + "epoch": 0.712, + "grad_norm": 19.300987998871754, + "kl_loss_13": 3168.4, + "kl_loss_26": 2254.8, + "kl_loss_39": 1247.8, + "kl_loss_7": 3737.2, + "learning_rate": 0.0001946760601822809, + "loss": 5183.35, + "step": 7120 + }, + { + "ce_loss_13": 2.9532769322395325, + "ce_loss_26": 2.5094838380813598, + "ce_loss_39": 2.042011481523514, + "ce_loss_52": 1.4359831362962723, + "ce_loss_7": 3.2172477781772613, + "epoch": 0.713, + "grad_norm": 19.806306622567096, + "kl_loss_13": 3130.8, + "kl_loss_26": 2213.4, + "kl_loss_39": 1210.7, + "kl_loss_7": 3690.4, + "learning_rate": 0.00019342111763675512, + "loss": 5121.55, + "step": 7130 + }, + { + "ce_loss_13": 2.9316843450069427, + "ce_loss_26": 2.4771865159273148, + "ce_loss_39": 1.9945536375045776, + "ce_loss_52": 1.3953458324074746, + "ce_loss_7": 3.1933025121688843, + "epoch": 0.714, + "grad_norm": 19.89107625803987, + "kl_loss_13": 3143.2, + "kl_loss_26": 2214.4, + "kl_loss_39": 1207.5, + "kl_loss_7": 3687.6, + "learning_rate": 0.00019216926233717085, + "loss": 5175.1, + "step": 7140 + }, + { + "ce_loss_13": 2.9416719019412993, + "ce_loss_26": 2.4982976377010346, + "ce_loss_39": 2.018396332859993, + "ce_loss_52": 1.4147447228431702, + "ce_loss_7": 3.206580412387848, + "epoch": 0.715, + "grad_norm": 19.473553749134638, + "kl_loss_13": 3170.0, + "kl_loss_26": 2250.0, + "kl_loss_39": 1236.5, + "kl_loss_7": 3716.0, + "learning_rate": 0.00019092050688969737, + "loss": 5168.85, + "step": 7150 + }, + { + "ce_loss_13": 2.93907487988472, + "ce_loss_26": 2.5087509632110594, + "ce_loss_39": 2.036549669504166, + "ce_loss_52": 1.4339062184095384, + "ce_loss_7": 3.2007872402668, + "epoch": 0.716, + "grad_norm": 18.582597661219776, + "kl_loss_13": 3097.2, + "kl_loss_26": 2211.2, + "kl_loss_39": 1215.8, + "kl_loss_7": 3638.4, + "learning_rate": 0.00018967486386928817, + "loss": 5158.35, + "step": 7160 + }, + { + "ce_loss_13": 2.945317584276199, + "ce_loss_26": 2.4993871986865996, + "ce_loss_39": 2.0176479905843734, + "ce_loss_52": 1.4336241394281388, + "ce_loss_7": 3.20940922498703, + "epoch": 0.717, + "grad_norm": 20.943645323187056, + "kl_loss_13": 3133.6, + "kl_loss_26": 2211.4, + "kl_loss_39": 1198.8, + "kl_loss_7": 3684.0, + "learning_rate": 0.00018843234581955443, + "loss": 5165.1, + "step": 7170 + }, + { + "ce_loss_13": 2.9533539593219755, + "ce_loss_26": 2.5154259234666823, + "ce_loss_39": 2.0371447414159776, + "ce_loss_52": 1.4300806164741515, + "ce_loss_7": 3.2220456659793855, + "epoch": 0.718, + "grad_norm": 20.352991453837664, + "kl_loss_13": 3138.0, + "kl_loss_26": 2233.8, + "kl_loss_39": 1229.9, + "kl_loss_7": 3696.4, + "learning_rate": 0.00018719296525263924, + "loss": 5165.1, + "step": 7180 + }, + { + "ce_loss_13": 2.8928813517093657, + "ce_loss_26": 2.4584825813770292, + "ce_loss_39": 1.9816097348928452, + "ce_loss_52": 1.4100207000970841, + "ce_loss_7": 3.1512379109859467, + "epoch": 0.719, + "grad_norm": 19.73288927838942, + "kl_loss_13": 3091.2, + "kl_loss_26": 2168.2, + "kl_loss_39": 1162.9, + "kl_loss_7": 3634.8, + "learning_rate": 0.0001859567346490913, + "loss": 5125.45, + "step": 7190 + }, + { + "ce_loss_13": 3.013565558195114, + "ce_loss_26": 2.5717740774154665, + "ce_loss_39": 2.0986361503601074, + "ce_loss_52": 1.4729616045951843, + "ce_loss_7": 3.270446163415909, + "epoch": 0.72, + "grad_norm": 19.301343533343292, + "kl_loss_13": 3201.2, + "kl_loss_26": 2278.0, + "kl_loss_39": 1269.9, + "kl_loss_7": 3744.0, + "learning_rate": 0.0001847236664577389, + "loss": 5151.05, + "step": 7200 + }, + { + "ce_loss_13": 2.8901243984699247, + "ce_loss_26": 2.447618916630745, + "ce_loss_39": 1.9805465787649155, + "ce_loss_52": 1.393562839925289, + "ce_loss_7": 3.152049034833908, + "epoch": 0.721, + "grad_norm": 19.80481816325832, + "kl_loss_13": 3108.8, + "kl_loss_26": 2184.6, + "kl_loss_39": 1197.1, + "kl_loss_7": 3651.2, + "learning_rate": 0.00018349377309556487, + "loss": 5147.6, + "step": 7210 + }, + { + "ce_loss_13": 2.935315173864365, + "ce_loss_26": 2.4844827204942703, + "ce_loss_39": 2.014389392733574, + "ce_loss_52": 1.4218608409166336, + "ce_loss_7": 3.202117031812668, + "epoch": 0.722, + "grad_norm": 21.29811195159757, + "kl_loss_13": 3145.6, + "kl_loss_26": 2210.6, + "kl_loss_39": 1204.5, + "kl_loss_7": 3703.6, + "learning_rate": 0.00018226706694758193, + "loss": 5128.1, + "step": 7220 + }, + { + "ce_loss_13": 2.9786236941814423, + "ce_loss_26": 2.536505568027496, + "ce_loss_39": 2.0584143906831742, + "ce_loss_52": 1.4622384160757065, + "ce_loss_7": 3.243917632102966, + "epoch": 0.723, + "grad_norm": 19.6163049246679, + "kl_loss_13": 3157.6, + "kl_loss_26": 2230.8, + "kl_loss_39": 1218.2, + "kl_loss_7": 3710.4, + "learning_rate": 0.0001810435603667075, + "loss": 5135.45, + "step": 7230 + }, + { + "ce_loss_13": 2.9429832458496095, + "ce_loss_26": 2.4910512387752535, + "ce_loss_39": 2.0214726239442826, + "ce_loss_52": 1.4324376732110977, + "ce_loss_7": 3.201977092027664, + "epoch": 0.724, + "grad_norm": 19.933959617154258, + "kl_loss_13": 3130.8, + "kl_loss_26": 2197.0, + "kl_loss_39": 1197.6, + "kl_loss_7": 3676.0, + "learning_rate": 0.0001798232656736389, + "loss": 5101.6, + "step": 7240 + }, + { + "ce_loss_13": 3.017507255077362, + "ce_loss_26": 2.5613476634025574, + "ce_loss_39": 2.065951904654503, + "ce_loss_52": 1.4529381558299064, + "ce_loss_7": 3.284585565328598, + "epoch": 0.725, + "grad_norm": 19.69163026795737, + "kl_loss_13": 3244.0, + "kl_loss_26": 2294.8, + "kl_loss_39": 1252.0, + "kl_loss_7": 3804.4, + "learning_rate": 0.0001786061951567303, + "loss": 5145.8, + "step": 7250 + }, + { + "ce_loss_13": 2.8927790343761446, + "ce_loss_26": 2.4552118331193924, + "ce_loss_39": 1.9874976933002473, + "ce_loss_52": 1.3983306601643561, + "ce_loss_7": 3.160378706455231, + "epoch": 0.726, + "grad_norm": 19.959972483591027, + "kl_loss_13": 3097.2, + "kl_loss_26": 2189.6, + "kl_loss_39": 1199.8, + "kl_loss_7": 3653.2, + "learning_rate": 0.00017739236107186857, + "loss": 5152.65, + "step": 7260 + }, + { + "ce_loss_13": 2.938109403848648, + "ce_loss_26": 2.4923312455415725, + "ce_loss_39": 2.0182687640190125, + "ce_loss_52": 1.4262833833694457, + "ce_loss_7": 3.2080613017082213, + "epoch": 0.727, + "grad_norm": 19.42416009314478, + "kl_loss_13": 3165.6, + "kl_loss_26": 2229.8, + "kl_loss_39": 1226.4, + "kl_loss_7": 3724.0, + "learning_rate": 0.00017618177564234904, + "loss": 5132.0, + "step": 7270 + }, + { + "ce_loss_13": 2.932406869530678, + "ce_loss_26": 2.481098806858063, + "ce_loss_39": 1.9998224407434464, + "ce_loss_52": 1.4021466106176377, + "ce_loss_7": 3.192963147163391, + "epoch": 0.728, + "grad_norm": 19.542374311814292, + "kl_loss_13": 3155.2, + "kl_loss_26": 2223.4, + "kl_loss_39": 1216.9, + "kl_loss_7": 3704.0, + "learning_rate": 0.00017497445105875377, + "loss": 5186.8, + "step": 7280 + }, + { + "ce_loss_13": 2.9192141771316527, + "ce_loss_26": 2.490318274497986, + "ce_loss_39": 2.027893853187561, + "ce_loss_52": 1.43733262270689, + "ce_loss_7": 3.177316850423813, + "epoch": 0.729, + "grad_norm": 20.20715160517786, + "kl_loss_13": 3084.0, + "kl_loss_26": 2177.6, + "kl_loss_39": 1196.3, + "kl_loss_7": 3620.0, + "learning_rate": 0.000173770399478828, + "loss": 5076.85, + "step": 7290 + }, + { + "ce_loss_13": 2.9109710931777952, + "ce_loss_26": 2.4745964229106905, + "ce_loss_39": 2.007509797811508, + "ce_loss_52": 1.4228856399655343, + "ce_loss_7": 3.179339534044266, + "epoch": 0.73, + "grad_norm": 19.31391716230683, + "kl_loss_13": 3123.2, + "kl_loss_26": 2212.0, + "kl_loss_39": 1198.3, + "kl_loss_7": 3681.6, + "learning_rate": 0.0001725696330273575, + "loss": 5123.9, + "step": 7300 + }, + { + "ce_loss_13": 2.9543818056583406, + "ce_loss_26": 2.5187111288309096, + "ce_loss_39": 2.046798062324524, + "ce_loss_52": 1.4316737815737723, + "ce_loss_7": 3.2196085810661317, + "epoch": 0.731, + "grad_norm": 19.333490210710867, + "kl_loss_13": 3136.4, + "kl_loss_26": 2223.6, + "kl_loss_39": 1228.2, + "kl_loss_7": 3683.6, + "learning_rate": 0.00017137216379604724, + "loss": 5093.05, + "step": 7310 + }, + { + "ce_loss_13": 2.991909348964691, + "ce_loss_26": 2.533888804912567, + "ce_loss_39": 2.048043805360794, + "ce_loss_52": 1.426993179321289, + "ce_loss_7": 3.2596666753292083, + "epoch": 0.732, + "grad_norm": 18.669638559883268, + "kl_loss_13": 3225.6, + "kl_loss_26": 2287.6, + "kl_loss_39": 1264.8, + "kl_loss_7": 3784.0, + "learning_rate": 0.00017017800384339925, + "loss": 5127.2, + "step": 7320 + }, + { + "ce_loss_13": 2.919608438014984, + "ce_loss_26": 2.475746387243271, + "ce_loss_39": 2.0119441866874697, + "ce_loss_52": 1.416624790430069, + "ce_loss_7": 3.1857059836387633, + "epoch": 0.733, + "grad_norm": 19.375624695020313, + "kl_loss_13": 3109.2, + "kl_loss_26": 2190.6, + "kl_loss_39": 1207.2, + "kl_loss_7": 3667.2, + "learning_rate": 0.00016898716519459073, + "loss": 5204.9, + "step": 7330 + }, + { + "ce_loss_13": 2.978672456741333, + "ce_loss_26": 2.5233275532722472, + "ce_loss_39": 2.0352649986743927, + "ce_loss_52": 1.396337878704071, + "ce_loss_7": 3.2537453293800356, + "epoch": 0.734, + "grad_norm": 19.487833479563225, + "kl_loss_13": 3269.6, + "kl_loss_26": 2319.4, + "kl_loss_39": 1294.6, + "kl_loss_7": 3847.2, + "learning_rate": 0.00016779965984135375, + "loss": 5141.65, + "step": 7340 + }, + { + "ce_loss_13": 2.946110498905182, + "ce_loss_26": 2.507070618867874, + "ce_loss_39": 2.0425552487373353, + "ce_loss_52": 1.4492309480905532, + "ce_loss_7": 3.2022292137146, + "epoch": 0.735, + "grad_norm": 19.428920750438415, + "kl_loss_13": 3113.2, + "kl_loss_26": 2205.0, + "kl_loss_39": 1210.6, + "kl_loss_7": 3652.4, + "learning_rate": 0.00016661549974185424, + "loss": 5094.6, + "step": 7350 + }, + { + "ce_loss_13": 2.9823583602905273, + "ce_loss_26": 2.539780905842781, + "ce_loss_39": 2.0619786471128463, + "ce_loss_52": 1.45176909416914, + "ce_loss_7": 3.249054718017578, + "epoch": 0.736, + "grad_norm": 19.839368194621894, + "kl_loss_13": 3204.0, + "kl_loss_26": 2280.6, + "kl_loss_39": 1256.4, + "kl_loss_7": 3753.6, + "learning_rate": 0.00016543469682057105, + "loss": 5196.95, + "step": 7360 + }, + { + "ce_loss_13": 2.958816784620285, + "ce_loss_26": 2.526939642429352, + "ce_loss_39": 2.0634298622608185, + "ce_loss_52": 1.4754424065351486, + "ce_loss_7": 3.223200261592865, + "epoch": 0.737, + "grad_norm": 19.922895259671222, + "kl_loss_13": 3096.0, + "kl_loss_26": 2185.8, + "kl_loss_39": 1192.1, + "kl_loss_7": 3641.6, + "learning_rate": 0.00016425726296817632, + "loss": 5155.3, + "step": 7370 + }, + { + "ce_loss_13": 2.9623453855514525, + "ce_loss_26": 2.5124567419290544, + "ce_loss_39": 2.032202622294426, + "ce_loss_52": 1.4377738699316978, + "ce_loss_7": 3.22493896484375, + "epoch": 0.738, + "grad_norm": 19.913040297451975, + "kl_loss_13": 3150.8, + "kl_loss_26": 2213.6, + "kl_loss_39": 1202.5, + "kl_loss_7": 3703.2, + "learning_rate": 0.00016308321004141607, + "loss": 5152.6, + "step": 7380 + }, + { + "ce_loss_13": 2.9184991478919984, + "ce_loss_26": 2.4769508123397825, + "ce_loss_39": 2.0019295692443846, + "ce_loss_52": 1.423092892765999, + "ce_loss_7": 3.185350716114044, + "epoch": 0.739, + "grad_norm": 19.322834578437774, + "kl_loss_13": 3095.6, + "kl_loss_26": 2174.4, + "kl_loss_39": 1176.6, + "kl_loss_7": 3652.0, + "learning_rate": 0.00016191254986299043, + "loss": 5134.25, + "step": 7390 + }, + { + "ce_loss_13": 2.8498477935791016, + "ce_loss_26": 2.4134464621543885, + "ce_loss_39": 1.952101919054985, + "ce_loss_52": 1.3852853626012802, + "ce_loss_7": 3.117431342601776, + "epoch": 0.74, + "grad_norm": 20.243774871674358, + "kl_loss_13": 3055.2, + "kl_loss_26": 2147.0, + "kl_loss_39": 1163.3, + "kl_loss_7": 3606.8, + "learning_rate": 0.00016074529422143398, + "loss": 5086.95, + "step": 7400 + }, + { + "ce_loss_13": 2.999220699071884, + "ce_loss_26": 2.550427186489105, + "ce_loss_39": 2.0656515032052996, + "ce_loss_52": 1.4574634283781052, + "ce_loss_7": 3.2632993936538695, + "epoch": 0.741, + "grad_norm": 20.693157593916027, + "kl_loss_13": 3181.6, + "kl_loss_26": 2251.2, + "kl_loss_39": 1241.3, + "kl_loss_7": 3731.6, + "learning_rate": 0.0001595814548709983, + "loss": 5127.4, + "step": 7410 + }, + { + "ce_loss_13": 2.9262797057628633, + "ce_loss_26": 2.4955521285533906, + "ce_loss_39": 2.043664366006851, + "ce_loss_52": 1.4607123613357544, + "ce_loss_7": 3.178546887636185, + "epoch": 0.742, + "grad_norm": 19.251560643481486, + "kl_loss_13": 3060.8, + "kl_loss_26": 2159.0, + "kl_loss_39": 1190.2, + "kl_loss_7": 3588.8, + "learning_rate": 0.00015842104353153285, + "loss": 5092.2, + "step": 7420 + }, + { + "ce_loss_13": 3.0160707533359528, + "ce_loss_26": 2.5663387060165403, + "ce_loss_39": 2.0802172899246214, + "ce_loss_52": 1.4715656280517577, + "ce_loss_7": 3.2800197422504427, + "epoch": 0.743, + "grad_norm": 19.587217025482605, + "kl_loss_13": 3190.4, + "kl_loss_26": 2255.8, + "kl_loss_39": 1233.7, + "kl_loss_7": 3735.6, + "learning_rate": 0.0001572640718883667, + "loss": 5115.1, + "step": 7430 + }, + { + "ce_loss_13": 2.9469042241573336, + "ce_loss_26": 2.504935991764069, + "ce_loss_39": 2.0331138372421265, + "ce_loss_52": 1.4291576787829399, + "ce_loss_7": 3.2098668992519377, + "epoch": 0.744, + "grad_norm": 18.92686745034597, + "kl_loss_13": 3124.8, + "kl_loss_26": 2209.8, + "kl_loss_39": 1212.4, + "kl_loss_7": 3675.2, + "learning_rate": 0.0001561105515921915, + "loss": 5076.55, + "step": 7440 + }, + { + "ce_loss_13": 2.924536573886871, + "ce_loss_26": 2.487199380993843, + "ce_loss_39": 2.0280190229415895, + "ce_loss_52": 1.4308366000652313, + "ce_loss_7": 3.184017467498779, + "epoch": 0.745, + "grad_norm": 20.464433868383605, + "kl_loss_13": 3079.6, + "kl_loss_26": 2173.0, + "kl_loss_39": 1203.3, + "kl_loss_7": 3634.0, + "learning_rate": 0.0001549604942589441, + "loss": 5072.9, + "step": 7450 + }, + { + "ce_loss_13": 2.929040068387985, + "ce_loss_26": 2.4782379269599915, + "ce_loss_39": 1.997541171312332, + "ce_loss_52": 1.4025927037000656, + "ce_loss_7": 3.1986856281757357, + "epoch": 0.746, + "grad_norm": 19.749937853484084, + "kl_loss_13": 3136.4, + "kl_loss_26": 2195.8, + "kl_loss_39": 1197.5, + "kl_loss_7": 3701.6, + "learning_rate": 0.00015381391146968864, + "loss": 5119.05, + "step": 7460 + }, + { + "ce_loss_13": 2.9494260370731356, + "ce_loss_26": 2.5060392141342165, + "ce_loss_39": 2.0337665289640428, + "ce_loss_52": 1.437817743420601, + "ce_loss_7": 3.2058426082134246, + "epoch": 0.747, + "grad_norm": 20.32794285148468, + "kl_loss_13": 3134.4, + "kl_loss_26": 2216.0, + "kl_loss_39": 1213.4, + "kl_loss_7": 3678.0, + "learning_rate": 0.00015267081477050133, + "loss": 5102.65, + "step": 7470 + }, + { + "ce_loss_13": 2.921141803264618, + "ce_loss_26": 2.4805154383182524, + "ce_loss_39": 2.0182774633169176, + "ce_loss_52": 1.4364572942256928, + "ce_loss_7": 3.1770897448062896, + "epoch": 0.748, + "grad_norm": 19.06438842103908, + "kl_loss_13": 3083.4, + "kl_loss_26": 2171.6, + "kl_loss_39": 1187.2, + "kl_loss_7": 3614.4, + "learning_rate": 0.00015153121567235335, + "loss": 5127.55, + "step": 7480 + }, + { + "ce_loss_13": 2.913339024782181, + "ce_loss_26": 2.4667980909347533, + "ce_loss_39": 1.9971046984195708, + "ce_loss_52": 1.4167528375983238, + "ce_loss_7": 3.1835066616535186, + "epoch": 0.749, + "grad_norm": 19.868010576940023, + "kl_loss_13": 3104.0, + "kl_loss_26": 2178.6, + "kl_loss_39": 1188.7, + "kl_loss_7": 3662.4, + "learning_rate": 0.00015039512565099468, + "loss": 5094.65, + "step": 7490 + }, + { + "ce_loss_13": 2.914568355679512, + "ce_loss_26": 2.469626322388649, + "ce_loss_39": 2.0020422458648683, + "ce_loss_52": 1.4171572998166084, + "ce_loss_7": 3.1783276200294495, + "epoch": 0.75, + "grad_norm": 19.266335144116216, + "kl_loss_13": 3099.6, + "kl_loss_26": 2178.4, + "kl_loss_39": 1189.1, + "kl_loss_7": 3658.8, + "learning_rate": 0.00014926255614683932, + "loss": 5132.95, + "step": 7500 + }, + { + "ce_loss_13": 2.9255091905593873, + "ce_loss_26": 2.4897490620613096, + "ce_loss_39": 2.020438665151596, + "ce_loss_52": 1.43346728682518, + "ce_loss_7": 3.188784825801849, + "epoch": 0.751, + "grad_norm": 18.946748872518604, + "kl_loss_13": 3108.8, + "kl_loss_26": 2190.0, + "kl_loss_39": 1197.3, + "kl_loss_7": 3652.8, + "learning_rate": 0.0001481335185648498, + "loss": 5140.95, + "step": 7510 + }, + { + "ce_loss_13": 2.9898211777210237, + "ce_loss_26": 2.541801372170448, + "ce_loss_39": 2.068241673707962, + "ce_loss_52": 1.474491646885872, + "ce_loss_7": 3.2494523525238037, + "epoch": 0.752, + "grad_norm": 19.297834732191596, + "kl_loss_13": 3108.0, + "kl_loss_26": 2187.0, + "kl_loss_39": 1193.8, + "kl_loss_7": 3655.2, + "learning_rate": 0.0001470080242744218, + "loss": 5080.45, + "step": 7520 + }, + { + "ce_loss_13": 2.989736980199814, + "ce_loss_26": 2.5422766327857973, + "ce_loss_39": 2.0724924355745316, + "ce_loss_52": 1.4764755725860597, + "ce_loss_7": 3.2583333015441895, + "epoch": 0.753, + "grad_norm": 19.759302349149518, + "kl_loss_13": 3127.2, + "kl_loss_26": 2207.8, + "kl_loss_39": 1205.1, + "kl_loss_7": 3686.4, + "learning_rate": 0.0001458860846092705, + "loss": 5089.25, + "step": 7530 + }, + { + "ce_loss_13": 2.9518058955669404, + "ce_loss_26": 2.5123462677001953, + "ce_loss_39": 2.033254536986351, + "ce_loss_52": 1.430069674551487, + "ce_loss_7": 3.216610902547836, + "epoch": 0.754, + "grad_norm": 19.101743494796594, + "kl_loss_13": 3130.4, + "kl_loss_26": 2214.4, + "kl_loss_39": 1206.1, + "kl_loss_7": 3688.8, + "learning_rate": 0.00014476771086731566, + "loss": 5132.95, + "step": 7540 + }, + { + "ce_loss_13": 2.9499751746654512, + "ce_loss_26": 2.5148087441921234, + "ce_loss_39": 2.0469634413719175, + "ce_loss_52": 1.4645337551832198, + "ce_loss_7": 3.2114050924777984, + "epoch": 0.755, + "grad_norm": 18.95244057854832, + "kl_loss_13": 3084.4, + "kl_loss_26": 2166.2, + "kl_loss_39": 1178.4, + "kl_loss_7": 3625.2, + "learning_rate": 0.00014365291431056872, + "loss": 5111.9, + "step": 7550 + }, + { + "ce_loss_13": 2.915192812681198, + "ce_loss_26": 2.470223453640938, + "ce_loss_39": 1.9942311495542526, + "ce_loss_52": 1.4130516573786736, + "ce_loss_7": 3.186279386281967, + "epoch": 0.756, + "grad_norm": 19.87294750540145, + "kl_loss_13": 3138.4, + "kl_loss_26": 2214.2, + "kl_loss_39": 1201.4, + "kl_loss_7": 3700.0, + "learning_rate": 0.00014254170616501827, + "loss": 5096.4, + "step": 7560 + }, + { + "ce_loss_13": 2.954612511396408, + "ce_loss_26": 2.5071854114532472, + "ce_loss_39": 2.035946971178055, + "ce_loss_52": 1.439925280213356, + "ce_loss_7": 3.2222863495349885, + "epoch": 0.757, + "grad_norm": 20.90881340934633, + "kl_loss_13": 3122.0, + "kl_loss_26": 2194.2, + "kl_loss_39": 1192.5, + "kl_loss_7": 3677.6, + "learning_rate": 0.0001414340976205183, + "loss": 5060.45, + "step": 7570 + }, + { + "ce_loss_13": 2.9101392149925234, + "ce_loss_26": 2.471102824807167, + "ce_loss_39": 2.0064873933792113, + "ce_loss_52": 1.4280226349830627, + "ce_loss_7": 3.1714185059070585, + "epoch": 0.758, + "grad_norm": 19.49522818126467, + "kl_loss_13": 3076.0, + "kl_loss_26": 2159.8, + "kl_loss_39": 1176.2, + "kl_loss_7": 3620.4, + "learning_rate": 0.00014033009983067452, + "loss": 5108.35, + "step": 7580 + }, + { + "ce_loss_13": 2.966978985071182, + "ce_loss_26": 2.5087331235408783, + "ce_loss_39": 2.0336913764476776, + "ce_loss_52": 1.4274784743785858, + "ce_loss_7": 3.2379914104938505, + "epoch": 0.759, + "grad_norm": 18.693022063405696, + "kl_loss_13": 3203.2, + "kl_loss_26": 2255.0, + "kl_loss_39": 1243.9, + "kl_loss_7": 3762.0, + "learning_rate": 0.00013922972391273224, + "loss": 5094.65, + "step": 7590 + }, + { + "ce_loss_13": 2.9872674524784086, + "ce_loss_26": 2.5407335460186005, + "ce_loss_39": 2.061782196164131, + "ce_loss_52": 1.4462621062994003, + "ce_loss_7": 3.253601038455963, + "epoch": 0.76, + "grad_norm": 19.707863398571995, + "kl_loss_13": 3172.0, + "kl_loss_26": 2252.8, + "kl_loss_39": 1251.4, + "kl_loss_7": 3726.4, + "learning_rate": 0.0001381329809474649, + "loss": 5098.7, + "step": 7600 + }, + { + "ce_loss_13": 2.8771551668643953, + "ce_loss_26": 2.4450180411338804, + "ce_loss_39": 1.9818484753370285, + "ce_loss_52": 1.4145073384046554, + "ce_loss_7": 3.1438543021678926, + "epoch": 0.761, + "grad_norm": 18.46540807586579, + "kl_loss_13": 3029.2, + "kl_loss_26": 2126.8, + "kl_loss_39": 1154.6, + "kl_loss_7": 3583.2, + "learning_rate": 0.0001370398819790621, + "loss": 5084.25, + "step": 7610 + }, + { + "ce_loss_13": 2.960028713941574, + "ce_loss_26": 2.525826930999756, + "ce_loss_39": 2.0549875289201736, + "ce_loss_52": 1.4615961879491806, + "ce_loss_7": 3.2287797749042513, + "epoch": 0.762, + "grad_norm": 19.79273649753162, + "kl_loss_13": 3108.0, + "kl_loss_26": 2202.8, + "kl_loss_39": 1209.5, + "kl_loss_7": 3652.8, + "learning_rate": 0.00013595043801501794, + "loss": 5052.75, + "step": 7620 + }, + { + "ce_loss_13": 2.9331182718276976, + "ce_loss_26": 2.4871296346187592, + "ce_loss_39": 2.019395884871483, + "ce_loss_52": 1.4329772531986236, + "ce_loss_7": 3.19427090883255, + "epoch": 0.763, + "grad_norm": 20.265530214115834, + "kl_loss_13": 3124.0, + "kl_loss_26": 2195.4, + "kl_loss_39": 1199.7, + "kl_loss_7": 3669.6, + "learning_rate": 0.00013486466002602133, + "loss": 5092.15, + "step": 7630 + }, + { + "ce_loss_13": 2.86139075756073, + "ce_loss_26": 2.4176136374473574, + "ce_loss_39": 1.941940224170685, + "ce_loss_52": 1.3821519583463668, + "ce_loss_7": 3.1196699738502502, + "epoch": 0.764, + "grad_norm": 19.727213627271716, + "kl_loss_13": 3072.8, + "kl_loss_26": 2145.6, + "kl_loss_39": 1144.7, + "kl_loss_7": 3617.2, + "learning_rate": 0.00013378255894584462, + "loss": 5002.6, + "step": 7640 + }, + { + "ce_loss_13": 2.9353716015815734, + "ce_loss_26": 2.490555015206337, + "ce_loss_39": 2.019370597600937, + "ce_loss_52": 1.4347517609596252, + "ce_loss_7": 3.196541225910187, + "epoch": 0.765, + "grad_norm": 20.424012523901062, + "kl_loss_13": 3127.6, + "kl_loss_26": 2205.8, + "kl_loss_39": 1202.7, + "kl_loss_7": 3670.0, + "learning_rate": 0.0001327041456712334, + "loss": 5085.1, + "step": 7650 + }, + { + "ce_loss_13": 2.994156318902969, + "ce_loss_26": 2.56048826277256, + "ce_loss_39": 2.084024053812027, + "ce_loss_52": 1.487925711274147, + "ce_loss_7": 3.2512714982032778, + "epoch": 0.766, + "grad_norm": 19.650498118539073, + "kl_loss_13": 3125.6, + "kl_loss_26": 2221.6, + "kl_loss_39": 1220.7, + "kl_loss_7": 3665.2, + "learning_rate": 0.00013162943106179747, + "loss": 5105.9, + "step": 7660 + }, + { + "ce_loss_13": 2.9769886791706086, + "ce_loss_26": 2.5290128916501997, + "ce_loss_39": 2.0444375783205033, + "ce_loss_52": 1.4427102521061896, + "ce_loss_7": 3.2454589188098906, + "epoch": 0.767, + "grad_norm": 19.69057867231776, + "kl_loss_13": 3203.2, + "kl_loss_26": 2269.2, + "kl_loss_39": 1243.2, + "kl_loss_7": 3758.4, + "learning_rate": 0.00013055842593990132, + "loss": 5067.35, + "step": 7670 + }, + { + "ce_loss_13": 2.9738565921783446, + "ce_loss_26": 2.5335902631282807, + "ce_loss_39": 2.06500606238842, + "ce_loss_52": 1.4531759321689606, + "ce_loss_7": 3.2406944632530212, + "epoch": 0.768, + "grad_norm": 19.854778454902203, + "kl_loss_13": 3164.0, + "kl_loss_26": 2244.2, + "kl_loss_39": 1253.0, + "kl_loss_7": 3714.8, + "learning_rate": 0.00012949114109055414, + "loss": 5080.45, + "step": 7680 + }, + { + "ce_loss_13": 2.8718224823474885, + "ce_loss_26": 2.434892734885216, + "ce_loss_39": 1.9763070404529572, + "ce_loss_52": 1.4119910702109337, + "ce_loss_7": 3.1341715812683106, + "epoch": 0.769, + "grad_norm": 19.065166102671203, + "kl_loss_13": 3037.6, + "kl_loss_26": 2134.2, + "kl_loss_39": 1156.1, + "kl_loss_7": 3588.0, + "learning_rate": 0.00012842758726130281, + "loss": 5110.75, + "step": 7690 + }, + { + "ce_loss_13": 2.9209058582782745, + "ce_loss_26": 2.4878934979438783, + "ce_loss_39": 2.0205056190490724, + "ce_loss_52": 1.4421792283654213, + "ce_loss_7": 3.182013803720474, + "epoch": 0.77, + "grad_norm": 19.547331822021178, + "kl_loss_13": 3063.6, + "kl_loss_26": 2150.4, + "kl_loss_39": 1168.8, + "kl_loss_7": 3608.0, + "learning_rate": 0.00012736777516212267, + "loss": 5073.5, + "step": 7700 + }, + { + "ce_loss_13": 2.9334555983543398, + "ce_loss_26": 2.493560019135475, + "ce_loss_39": 2.0158845692873, + "ce_loss_52": 1.4151214450597762, + "ce_loss_7": 3.2045696437358857, + "epoch": 0.771, + "grad_norm": 18.662664559432073, + "kl_loss_13": 3145.2, + "kl_loss_26": 2230.4, + "kl_loss_39": 1228.4, + "kl_loss_7": 3706.0, + "learning_rate": 0.00012631171546530968, + "loss": 5058.75, + "step": 7710 + }, + { + "ce_loss_13": 2.944778233766556, + "ce_loss_26": 2.5031532883644103, + "ce_loss_39": 2.0205067574977873, + "ce_loss_52": 1.414848119020462, + "ce_loss_7": 3.2128884732723235, + "epoch": 0.772, + "grad_norm": 19.409023945233233, + "kl_loss_13": 3144.0, + "kl_loss_26": 2229.4, + "kl_loss_39": 1221.0, + "kl_loss_7": 3700.4, + "learning_rate": 0.00012525941880537307, + "loss": 5071.55, + "step": 7720 + }, + { + "ce_loss_13": 2.937892961502075, + "ce_loss_26": 2.492938667535782, + "ce_loss_39": 2.020972582697868, + "ce_loss_52": 1.4354220196604728, + "ce_loss_7": 3.2008834302425386, + "epoch": 0.773, + "grad_norm": 19.46905923212589, + "kl_loss_13": 3121.2, + "kl_loss_26": 2201.4, + "kl_loss_39": 1204.2, + "kl_loss_7": 3662.4, + "learning_rate": 0.00012421089577892869, + "loss": 5040.15, + "step": 7730 + }, + { + "ce_loss_13": 2.949704957008362, + "ce_loss_26": 2.5045041382312774, + "ce_loss_39": 2.016797697544098, + "ce_loss_52": 1.4014781221747399, + "ce_loss_7": 3.212642914056778, + "epoch": 0.774, + "grad_norm": 19.613908494270376, + "kl_loss_13": 3187.2, + "kl_loss_26": 2266.0, + "kl_loss_39": 1252.6, + "kl_loss_7": 3732.0, + "learning_rate": 0.0001231661569445919, + "loss": 5076.45, + "step": 7740 + }, + { + "ce_loss_13": 2.955250400304794, + "ce_loss_26": 2.5274777173995973, + "ce_loss_39": 2.067678835988045, + "ce_loss_52": 1.4838283985853196, + "ce_loss_7": 3.2114856481552123, + "epoch": 0.775, + "grad_norm": 19.844798674533834, + "kl_loss_13": 3084.8, + "kl_loss_26": 2190.6, + "kl_loss_39": 1199.8, + "kl_loss_7": 3616.4, + "learning_rate": 0.00012212521282287093, + "loss": 5060.4, + "step": 7750 + }, + { + "ce_loss_13": 2.9763452112674713, + "ce_loss_26": 2.5314755111932756, + "ce_loss_39": 2.0540303111076357, + "ce_loss_52": 1.450203076004982, + "ce_loss_7": 3.2422658264636994, + "epoch": 0.776, + "grad_norm": 20.432260851236787, + "kl_loss_13": 3181.2, + "kl_loss_26": 2258.4, + "kl_loss_39": 1248.7, + "kl_loss_7": 3726.0, + "learning_rate": 0.00012108807389606158, + "loss": 5084.95, + "step": 7760 + }, + { + "ce_loss_13": 2.920662760734558, + "ce_loss_26": 2.475513318181038, + "ce_loss_39": 2.0011812478303908, + "ce_loss_52": 1.4144959792494773, + "ce_loss_7": 3.1814299404621122, + "epoch": 0.777, + "grad_norm": 19.88207983374875, + "kl_loss_13": 3106.4, + "kl_loss_26": 2178.8, + "kl_loss_39": 1189.6, + "kl_loss_7": 3652.0, + "learning_rate": 0.00012005475060814159, + "loss": 5075.25, + "step": 7770 + }, + { + "ce_loss_13": 2.982200914621353, + "ce_loss_26": 2.5349232286214827, + "ce_loss_39": 2.069617584347725, + "ce_loss_52": 1.4744601517915725, + "ce_loss_7": 3.2433891892433167, + "epoch": 0.778, + "grad_norm": 19.290497638146196, + "kl_loss_13": 3128.4, + "kl_loss_26": 2198.0, + "kl_loss_39": 1199.9, + "kl_loss_7": 3677.2, + "learning_rate": 0.00011902525336466464, + "loss": 5053.05, + "step": 7780 + }, + { + "ce_loss_13": 2.9154593467712404, + "ce_loss_26": 2.477250945568085, + "ce_loss_39": 2.0098533272743224, + "ce_loss_52": 1.4167337000370026, + "ce_loss_7": 3.1773332476615908, + "epoch": 0.779, + "grad_norm": 19.252580297991088, + "kl_loss_13": 3096.4, + "kl_loss_26": 2190.8, + "kl_loss_39": 1202.1, + "kl_loss_7": 3643.6, + "learning_rate": 0.00011799959253265668, + "loss": 5067.65, + "step": 7790 + }, + { + "ce_loss_13": 2.9013588786125184, + "ce_loss_26": 2.468735784292221, + "ce_loss_39": 2.0137620836496355, + "ce_loss_52": 1.4429293110966683, + "ce_loss_7": 3.1603996396064757, + "epoch": 0.78, + "grad_norm": 18.773016641793355, + "kl_loss_13": 3040.0, + "kl_loss_26": 2138.8, + "kl_loss_39": 1157.9, + "kl_loss_7": 3579.6, + "learning_rate": 0.00011697777844051105, + "loss": 5056.85, + "step": 7800 + }, + { + "ce_loss_13": 2.99123472571373, + "ce_loss_26": 2.5481519401073456, + "ce_loss_39": 2.0644855082035063, + "ce_loss_52": 1.4605911195278167, + "ce_loss_7": 3.2550659775733948, + "epoch": 0.781, + "grad_norm": 19.108089078575876, + "kl_loss_13": 3156.0, + "kl_loss_26": 2232.8, + "kl_loss_39": 1225.5, + "kl_loss_7": 3705.6, + "learning_rate": 0.00011595982137788402, + "loss": 5045.05, + "step": 7810 + }, + { + "ce_loss_13": 2.979940289258957, + "ce_loss_26": 2.5384896367788317, + "ce_loss_39": 2.066192331910133, + "ce_loss_52": 1.4709408730268478, + "ce_loss_7": 3.2419922232627867, + "epoch": 0.782, + "grad_norm": 19.283597029818793, + "kl_loss_13": 3151.6, + "kl_loss_26": 2222.2, + "kl_loss_39": 1217.6, + "kl_loss_7": 3697.2, + "learning_rate": 0.00011494573159559212, + "loss": 5088.85, + "step": 7820 + }, + { + "ce_loss_13": 2.9142948031425475, + "ce_loss_26": 2.4835946947336196, + "ce_loss_39": 2.015666288137436, + "ce_loss_52": 1.433475723862648, + "ce_loss_7": 3.1793313324451447, + "epoch": 0.783, + "grad_norm": 18.805294673266925, + "kl_loss_13": 3065.2, + "kl_loss_26": 2169.8, + "kl_loss_39": 1195.1, + "kl_loss_7": 3620.0, + "learning_rate": 0.00011393551930550828, + "loss": 5021.8, + "step": 7830 + }, + { + "ce_loss_13": 2.942464643716812, + "ce_loss_26": 2.4966187834739686, + "ce_loss_39": 2.018853786587715, + "ce_loss_52": 1.4241285115480422, + "ce_loss_7": 3.209713137149811, + "epoch": 0.784, + "grad_norm": 18.802417227998035, + "kl_loss_13": 3153.6, + "kl_loss_26": 2218.8, + "kl_loss_39": 1214.7, + "kl_loss_7": 3709.6, + "learning_rate": 0.00011292919468045875, + "loss": 5056.2, + "step": 7840 + }, + { + "ce_loss_13": 2.9324662506580355, + "ce_loss_26": 2.490780544281006, + "ce_loss_39": 2.014774057269096, + "ce_loss_52": 1.4378075569868087, + "ce_loss_7": 3.2002897918224336, + "epoch": 0.785, + "grad_norm": 18.31675592106317, + "kl_loss_13": 3120.4, + "kl_loss_26": 2191.6, + "kl_loss_39": 1189.3, + "kl_loss_7": 3674.4, + "learning_rate": 0.00011192676785412154, + "loss": 5025.65, + "step": 7850 + }, + { + "ce_loss_13": 2.9004018545150756, + "ce_loss_26": 2.4679500609636307, + "ce_loss_39": 2.0106911092996596, + "ce_loss_52": 1.454608330130577, + "ce_loss_7": 3.1605159759521486, + "epoch": 0.786, + "grad_norm": 21.238240444429067, + "kl_loss_13": 2987.2, + "kl_loss_26": 2093.8, + "kl_loss_39": 1136.1, + "kl_loss_7": 3522.4, + "learning_rate": 0.00011092824892092374, + "loss": 4990.8, + "step": 7860 + }, + { + "ce_loss_13": 3.0144290030002594, + "ce_loss_26": 2.5742201924324037, + "ce_loss_39": 2.095407247543335, + "ce_loss_52": 1.4918664544820786, + "ce_loss_7": 3.2818655967712402, + "epoch": 0.787, + "grad_norm": 20.48506344270259, + "kl_loss_13": 3166.0, + "kl_loss_26": 2239.6, + "kl_loss_39": 1229.4, + "kl_loss_7": 3722.0, + "learning_rate": 0.0001099336479359398, + "loss": 5084.85, + "step": 7870 + }, + { + "ce_loss_13": 2.9180380165576936, + "ce_loss_26": 2.4719971120357513, + "ce_loss_39": 1.9984097123146056, + "ce_loss_52": 1.4270031958818437, + "ce_loss_7": 3.1842149913311006, + "epoch": 0.788, + "grad_norm": 19.617074455019072, + "kl_loss_13": 3109.6, + "kl_loss_26": 2182.8, + "kl_loss_39": 1178.3, + "kl_loss_7": 3662.0, + "learning_rate": 0.00010894297491479043, + "loss": 5092.2, + "step": 7880 + }, + { + "ce_loss_13": 2.8819404006004334, + "ce_loss_26": 2.449340745806694, + "ce_loss_39": 1.9862482339143752, + "ce_loss_52": 1.413575354218483, + "ce_loss_7": 3.143266361951828, + "epoch": 0.789, + "grad_norm": 19.73420487885875, + "kl_loss_13": 3037.2, + "kl_loss_26": 2140.0, + "kl_loss_39": 1164.1, + "kl_loss_7": 3572.0, + "learning_rate": 0.00010795623983354214, + "loss": 5012.15, + "step": 7890 + }, + { + "ce_loss_13": 2.936946928501129, + "ce_loss_26": 2.50938241481781, + "ce_loss_39": 2.0399589776992797, + "ce_loss_52": 1.4433409079909325, + "ce_loss_7": 3.1964890122413636, + "epoch": 0.79, + "grad_norm": 20.456776413214342, + "kl_loss_13": 3094.8, + "kl_loss_26": 2194.4, + "kl_loss_39": 1203.3, + "kl_loss_7": 3650.0, + "learning_rate": 0.00010697345262860636, + "loss": 5033.95, + "step": 7900 + }, + { + "ce_loss_13": 2.9234113335609435, + "ce_loss_26": 2.486256945133209, + "ce_loss_39": 2.014375075697899, + "ce_loss_52": 1.449743601679802, + "ce_loss_7": 3.18265563249588, + "epoch": 0.791, + "grad_norm": 19.900735615836812, + "kl_loss_13": 3074.8, + "kl_loss_26": 2163.0, + "kl_loss_39": 1161.1, + "kl_loss_7": 3612.8, + "learning_rate": 0.00010599462319663906, + "loss": 5038.65, + "step": 7910 + }, + { + "ce_loss_13": 2.9553000926971436, + "ce_loss_26": 2.5179340064525606, + "ce_loss_39": 2.0430867671966553, + "ce_loss_52": 1.444689854979515, + "ce_loss_7": 3.21486856341362, + "epoch": 0.792, + "grad_norm": 19.250533773319045, + "kl_loss_13": 3122.0, + "kl_loss_26": 2206.2, + "kl_loss_39": 1213.4, + "kl_loss_7": 3668.8, + "learning_rate": 0.00010501976139444191, + "loss": 5048.55, + "step": 7920 + }, + { + "ce_loss_13": 2.946609389781952, + "ce_loss_26": 2.5028085887432097, + "ce_loss_39": 2.0339547246694565, + "ce_loss_52": 1.4416055083274841, + "ce_loss_7": 3.2140405058860777, + "epoch": 0.793, + "grad_norm": 20.61899934712358, + "kl_loss_13": 3133.6, + "kl_loss_26": 2209.4, + "kl_loss_39": 1208.3, + "kl_loss_7": 3694.0, + "learning_rate": 0.0001040488770388625, + "loss": 5057.15, + "step": 7930 + }, + { + "ce_loss_13": 2.866725343465805, + "ce_loss_26": 2.424889090657234, + "ce_loss_39": 1.9532368332147598, + "ce_loss_52": 1.37542584836483, + "ce_loss_7": 3.1246279418468474, + "epoch": 0.794, + "grad_norm": 19.267832695141472, + "kl_loss_13": 3083.6, + "kl_loss_26": 2164.8, + "kl_loss_39": 1176.3, + "kl_loss_7": 3624.4, + "learning_rate": 0.00010308197990669538, + "loss": 5026.95, + "step": 7940 + }, + { + "ce_loss_13": 2.901643234491348, + "ce_loss_26": 2.4638597697019575, + "ce_loss_39": 1.9964057832956315, + "ce_loss_52": 1.4079250425100327, + "ce_loss_7": 3.1619919717311857, + "epoch": 0.795, + "grad_norm": 19.24818590589668, + "kl_loss_13": 3123.2, + "kl_loss_26": 2199.2, + "kl_loss_39": 1202.4, + "kl_loss_7": 3666.4, + "learning_rate": 0.0001021190797345839, + "loss": 5013.5, + "step": 7950 + }, + { + "ce_loss_13": 2.967918246984482, + "ce_loss_26": 2.533867511153221, + "ce_loss_39": 2.069682791829109, + "ce_loss_52": 1.4818589851260184, + "ce_loss_7": 3.222521889209747, + "epoch": 0.796, + "grad_norm": 19.75622423793478, + "kl_loss_13": 3058.8, + "kl_loss_26": 2155.2, + "kl_loss_39": 1175.7, + "kl_loss_7": 3592.0, + "learning_rate": 0.00010116018621892236, + "loss": 5009.6, + "step": 7960 + }, + { + "ce_loss_13": 2.8812991797924044, + "ce_loss_26": 2.4400356858968735, + "ce_loss_39": 1.9724171191453934, + "ce_loss_52": 1.4085147365927697, + "ce_loss_7": 3.1415765941143037, + "epoch": 0.797, + "grad_norm": 19.13497167869677, + "kl_loss_13": 3057.6, + "kl_loss_26": 2142.8, + "kl_loss_39": 1159.2, + "kl_loss_7": 3602.8, + "learning_rate": 0.00010020530901575753, + "loss": 5020.4, + "step": 7970 + }, + { + "ce_loss_13": 2.904066652059555, + "ce_loss_26": 2.4755689650774, + "ce_loss_39": 2.0152323603630067, + "ce_loss_52": 1.4373595044016838, + "ce_loss_7": 3.1696866393089294, + "epoch": 0.798, + "grad_norm": 20.01532447536976, + "kl_loss_13": 3044.0, + "kl_loss_26": 2150.0, + "kl_loss_39": 1170.5, + "kl_loss_7": 3601.6, + "learning_rate": 9.925445774069231e-05, + "loss": 5018.45, + "step": 7980 + }, + { + "ce_loss_13": 2.9288057029247283, + "ce_loss_26": 2.4822009325027468, + "ce_loss_39": 2.011521789431572, + "ce_loss_52": 1.4153838574886322, + "ce_loss_7": 3.1911131501197816, + "epoch": 0.799, + "grad_norm": 19.205851361122782, + "kl_loss_13": 3126.0, + "kl_loss_26": 2203.4, + "kl_loss_39": 1210.7, + "kl_loss_7": 3672.4, + "learning_rate": 9.830764196878872e-05, + "loss": 5069.4, + "step": 7990 + }, + { + "ce_loss_13": 2.9970316886901855, + "ce_loss_26": 2.5632322430610657, + "ce_loss_39": 2.089646649360657, + "ce_loss_52": 1.4764455169439317, + "ce_loss_7": 3.2621945440769196, + "epoch": 0.8, + "grad_norm": 18.817128068716947, + "kl_loss_13": 3164.4, + "kl_loss_26": 2259.8, + "kl_loss_39": 1248.2, + "kl_loss_7": 3716.4, + "learning_rate": 9.736487123447069e-05, + "loss": 5026.75, + "step": 8000 + }, + { + "ce_loss_13": 2.955092731118202, + "ce_loss_26": 2.5211679935455322, + "ce_loss_39": 2.061183473467827, + "ce_loss_52": 1.4790756076574325, + "ce_loss_7": 3.211963188648224, + "epoch": 0.801, + "grad_norm": 19.13731613570068, + "kl_loss_13": 3058.4, + "kl_loss_26": 2157.0, + "kl_loss_39": 1183.6, + "kl_loss_7": 3589.2, + "learning_rate": 9.642615503142926e-05, + "loss": 5013.8, + "step": 8010 + }, + { + "ce_loss_13": 2.8814174115657805, + "ce_loss_26": 2.451471582055092, + "ce_loss_39": 1.9925315648317337, + "ce_loss_52": 1.4264169082045555, + "ce_loss_7": 3.150370055437088, + "epoch": 0.802, + "grad_norm": 19.95644855086655, + "kl_loss_13": 3015.2, + "kl_loss_26": 2114.4, + "kl_loss_39": 1144.7, + "kl_loss_7": 3572.8, + "learning_rate": 9.549150281252633e-05, + "loss": 5066.0, + "step": 8020 + }, + { + "ce_loss_13": 2.9213546574115754, + "ce_loss_26": 2.488295114040375, + "ce_loss_39": 2.0217175424098968, + "ce_loss_52": 1.448304545879364, + "ce_loss_7": 3.177370023727417, + "epoch": 0.803, + "grad_norm": 19.167189329215354, + "kl_loss_13": 3046.0, + "kl_loss_26": 2140.4, + "kl_loss_39": 1161.6, + "kl_loss_7": 3582.8, + "learning_rate": 9.4560923989699e-05, + "loss": 5040.7, + "step": 8030 + }, + { + "ce_loss_13": 2.8780395865440367, + "ce_loss_26": 2.43271960914135, + "ce_loss_39": 1.966169360280037, + "ce_loss_52": 1.3888942331075669, + "ce_loss_7": 3.1494656085968016, + "epoch": 0.804, + "grad_norm": 18.853764898775175, + "kl_loss_13": 3103.6, + "kl_loss_26": 2173.0, + "kl_loss_39": 1171.5, + "kl_loss_7": 3672.8, + "learning_rate": 9.363442793386607e-05, + "loss": 5021.25, + "step": 8040 + }, + { + "ce_loss_13": 2.9130735754966737, + "ce_loss_26": 2.480276498198509, + "ce_loss_39": 2.0196522653102873, + "ce_loss_52": 1.4448804795742034, + "ce_loss_7": 3.1761886417865752, + "epoch": 0.805, + "grad_norm": 18.76619687601556, + "kl_loss_13": 3048.8, + "kl_loss_26": 2147.8, + "kl_loss_39": 1165.5, + "kl_loss_7": 3590.8, + "learning_rate": 9.271202397483213e-05, + "loss": 4983.8, + "step": 8050 + }, + { + "ce_loss_13": 2.960085618495941, + "ce_loss_26": 2.516904118657112, + "ce_loss_39": 2.0490771383047104, + "ce_loss_52": 1.4702347993850708, + "ce_loss_7": 3.2195077538490295, + "epoch": 0.806, + "grad_norm": 20.341613718883853, + "kl_loss_13": 3080.0, + "kl_loss_26": 2163.0, + "kl_loss_39": 1179.4, + "kl_loss_7": 3617.6, + "learning_rate": 9.179372140119524e-05, + "loss": 5044.25, + "step": 8060 + }, + { + "ce_loss_13": 2.895679956674576, + "ce_loss_26": 2.447220724821091, + "ce_loss_39": 1.978733304142952, + "ce_loss_52": 1.4091844826936721, + "ce_loss_7": 3.1559928357601166, + "epoch": 0.807, + "grad_norm": 19.648457937725187, + "kl_loss_13": 3088.0, + "kl_loss_26": 2160.6, + "kl_loss_39": 1172.1, + "kl_loss_7": 3631.6, + "learning_rate": 9.087952946025175e-05, + "loss": 5019.35, + "step": 8070 + }, + { + "ce_loss_13": 2.8867613554000853, + "ce_loss_26": 2.4542810022830963, + "ce_loss_39": 1.997492003440857, + "ce_loss_52": 1.4237906068563462, + "ce_loss_7": 3.1490099489688874, + "epoch": 0.808, + "grad_norm": 19.108440716050485, + "kl_loss_13": 3031.2, + "kl_loss_26": 2132.0, + "kl_loss_39": 1161.2, + "kl_loss_7": 3573.2, + "learning_rate": 8.996945735790446e-05, + "loss": 5081.55, + "step": 8080 + }, + { + "ce_loss_13": 2.903727024793625, + "ce_loss_26": 2.4667694687843325, + "ce_loss_39": 2.0024666130542754, + "ce_loss_52": 1.4252464413642882, + "ce_loss_7": 3.1627338111400602, + "epoch": 0.809, + "grad_norm": 19.45516980399187, + "kl_loss_13": 3065.2, + "kl_loss_26": 2156.8, + "kl_loss_39": 1173.6, + "kl_loss_7": 3607.6, + "learning_rate": 8.906351425856951e-05, + "loss": 5032.55, + "step": 8090 + }, + { + "ce_loss_13": 2.9971861839294434, + "ce_loss_26": 2.558201992511749, + "ce_loss_39": 2.0943466246128084, + "ce_loss_52": 1.5071399331092834, + "ce_loss_7": 3.2508726358413695, + "epoch": 0.81, + "grad_norm": 18.578032258449234, + "kl_loss_13": 3080.0, + "kl_loss_26": 2167.8, + "kl_loss_39": 1188.5, + "kl_loss_7": 3612.8, + "learning_rate": 8.816170928508365e-05, + "loss": 5060.5, + "step": 8100 + }, + { + "ce_loss_13": 2.9514600038528442, + "ce_loss_26": 2.511053240299225, + "ce_loss_39": 2.036428925395012, + "ce_loss_52": 1.4632433116436006, + "ce_loss_7": 3.2164508640766143, + "epoch": 0.811, + "grad_norm": 19.46007345669389, + "kl_loss_13": 3077.6, + "kl_loss_26": 2160.6, + "kl_loss_39": 1164.9, + "kl_loss_7": 3629.6, + "learning_rate": 8.7264051518613e-05, + "loss": 5036.75, + "step": 8110 + }, + { + "ce_loss_13": 2.8350981414318084, + "ce_loss_26": 2.3989670783281327, + "ce_loss_39": 1.9394948929548264, + "ce_loss_52": 1.385464173555374, + "ce_loss_7": 3.0943558514118195, + "epoch": 0.812, + "grad_norm": 20.429118003347835, + "kl_loss_13": 3019.6, + "kl_loss_26": 2118.2, + "kl_loss_39": 1139.6, + "kl_loss_7": 3559.6, + "learning_rate": 8.637054999856148e-05, + "loss": 5033.2, + "step": 8120 + }, + { + "ce_loss_13": 2.956312870979309, + "ce_loss_26": 2.510516768693924, + "ce_loss_39": 2.0456418454647065, + "ce_loss_52": 1.454368954896927, + "ce_loss_7": 3.2136961221694946, + "epoch": 0.813, + "grad_norm": 19.290479115918554, + "kl_loss_13": 3110.4, + "kl_loss_26": 2193.4, + "kl_loss_39": 1206.0, + "kl_loss_7": 3651.6, + "learning_rate": 8.548121372247918e-05, + "loss": 5042.35, + "step": 8130 + }, + { + "ce_loss_13": 2.8997408151626587, + "ce_loss_26": 2.4578535914421082, + "ce_loss_39": 1.9877970427274705, + "ce_loss_52": 1.4171391934156419, + "ce_loss_7": 3.1653897404670714, + "epoch": 0.814, + "grad_norm": 19.231104242907662, + "kl_loss_13": 3062.4, + "kl_loss_26": 2154.2, + "kl_loss_39": 1161.8, + "kl_loss_7": 3620.0, + "learning_rate": 8.459605164597267e-05, + "loss": 4990.3, + "step": 8140 + }, + { + "ce_loss_13": 2.8967735528945924, + "ce_loss_26": 2.462614360451698, + "ce_loss_39": 1.9984548151493073, + "ce_loss_52": 1.438458850979805, + "ce_loss_7": 3.1581345558166505, + "epoch": 0.815, + "grad_norm": 19.45006377816174, + "kl_loss_13": 3056.8, + "kl_loss_26": 2153.0, + "kl_loss_39": 1160.6, + "kl_loss_7": 3602.0, + "learning_rate": 8.371507268261436e-05, + "loss": 4980.2, + "step": 8150 + }, + { + "ce_loss_13": 2.936253345012665, + "ce_loss_26": 2.5057778120040894, + "ce_loss_39": 2.038064029812813, + "ce_loss_52": 1.4622955560684203, + "ce_loss_7": 3.1979693949222563, + "epoch": 0.816, + "grad_norm": 18.91111939228637, + "kl_loss_13": 3060.8, + "kl_loss_26": 2153.2, + "kl_loss_39": 1175.6, + "kl_loss_7": 3612.0, + "learning_rate": 8.283828570385238e-05, + "loss": 5006.35, + "step": 8160 + }, + { + "ce_loss_13": 2.9529894649982453, + "ce_loss_26": 2.5044034361839294, + "ce_loss_39": 2.042719992995262, + "ce_loss_52": 1.4732781440019607, + "ce_loss_7": 3.2085989713668823, + "epoch": 0.817, + "grad_norm": 18.881507142327827, + "kl_loss_13": 3068.4, + "kl_loss_26": 2134.6, + "kl_loss_39": 1154.6, + "kl_loss_7": 3608.0, + "learning_rate": 8.196569953892202e-05, + "loss": 5023.2, + "step": 8170 + }, + { + "ce_loss_13": 2.901773339509964, + "ce_loss_26": 2.46816024184227, + "ce_loss_39": 2.0061379730701447, + "ce_loss_52": 1.4472751855850219, + "ce_loss_7": 3.162723332643509, + "epoch": 0.818, + "grad_norm": 19.58512082927694, + "kl_loss_13": 3021.6, + "kl_loss_26": 2116.2, + "kl_loss_39": 1138.3, + "kl_loss_7": 3567.6, + "learning_rate": 8.109732297475635e-05, + "loss": 5011.1, + "step": 8180 + }, + { + "ce_loss_13": 2.9283832788467405, + "ce_loss_26": 2.495754861831665, + "ce_loss_39": 2.0315854638814925, + "ce_loss_52": 1.4544063314795495, + "ce_loss_7": 3.184633868932724, + "epoch": 0.819, + "grad_norm": 20.338540288781616, + "kl_loss_13": 3056.0, + "kl_loss_26": 2152.0, + "kl_loss_39": 1164.7, + "kl_loss_7": 3604.8, + "learning_rate": 8.023316475589754e-05, + "loss": 4985.65, + "step": 8190 + }, + { + "ce_loss_13": 2.8736020922660828, + "ce_loss_26": 2.4326390773057938, + "ce_loss_39": 1.9687805682420731, + "ce_loss_52": 1.4114506781101226, + "ce_loss_7": 3.1362832963466643, + "epoch": 0.82, + "grad_norm": 19.348124098647066, + "kl_loss_13": 3039.6, + "kl_loss_26": 2118.4, + "kl_loss_39": 1131.5, + "kl_loss_7": 3592.4, + "learning_rate": 7.937323358440934e-05, + "loss": 4999.05, + "step": 8200 + }, + { + "ce_loss_13": 2.9405028223991394, + "ce_loss_26": 2.5053980708122254, + "ce_loss_39": 2.031425711512566, + "ce_loss_52": 1.4602935075759889, + "ce_loss_7": 3.1905817687511444, + "epoch": 0.821, + "grad_norm": 19.308929525960306, + "kl_loss_13": 3053.6, + "kl_loss_26": 2142.8, + "kl_loss_39": 1164.5, + "kl_loss_7": 3580.4, + "learning_rate": 7.851753811978923e-05, + "loss": 5013.6, + "step": 8210 + }, + { + "ce_loss_13": 2.8261436820030212, + "ce_loss_26": 2.396569001674652, + "ce_loss_39": 1.9376911997795105, + "ce_loss_52": 1.3843101486563683, + "ce_loss_7": 3.08916922211647, + "epoch": 0.822, + "grad_norm": 18.760581120890944, + "kl_loss_13": 2979.6, + "kl_loss_26": 2084.0, + "kl_loss_39": 1118.1, + "kl_loss_7": 3526.4, + "learning_rate": 7.766608697888095e-05, + "loss": 4996.05, + "step": 8220 + }, + { + "ce_loss_13": 2.895614618062973, + "ce_loss_26": 2.4538383156061174, + "ce_loss_39": 1.9913074195384979, + "ce_loss_52": 1.4129166051745414, + "ce_loss_7": 3.156418579816818, + "epoch": 0.823, + "grad_norm": 19.448137234461008, + "kl_loss_13": 3094.8, + "kl_loss_26": 2171.8, + "kl_loss_39": 1179.4, + "kl_loss_7": 3644.0, + "learning_rate": 7.681888873578785e-05, + "loss": 5010.15, + "step": 8230 + }, + { + "ce_loss_13": 2.8753804206848144, + "ce_loss_26": 2.448589825630188, + "ce_loss_39": 1.9881916165351867, + "ce_loss_52": 1.433535772562027, + "ce_loss_7": 3.133152514696121, + "epoch": 0.824, + "grad_norm": 19.780732761004185, + "kl_loss_13": 2998.8, + "kl_loss_26": 2105.2, + "kl_loss_39": 1137.2, + "kl_loss_7": 3529.6, + "learning_rate": 7.597595192178702e-05, + "loss": 4951.6, + "step": 8240 + }, + { + "ce_loss_13": 2.874552935361862, + "ce_loss_26": 2.426975393295288, + "ce_loss_39": 1.969171154499054, + "ce_loss_52": 1.4067743465304374, + "ce_loss_7": 3.1344926774501802, + "epoch": 0.825, + "grad_norm": 19.081018400834456, + "kl_loss_13": 3050.0, + "kl_loss_26": 2129.2, + "kl_loss_39": 1153.6, + "kl_loss_7": 3590.8, + "learning_rate": 7.513728502524286e-05, + "loss": 4924.6, + "step": 8250 + }, + { + "ce_loss_13": 2.8894054651260377, + "ce_loss_26": 2.4550040304660796, + "ce_loss_39": 1.9873575389385223, + "ce_loss_52": 1.417596697807312, + "ce_loss_7": 3.152091747522354, + "epoch": 0.826, + "grad_norm": 19.8137268983455, + "kl_loss_13": 3043.6, + "kl_loss_26": 2138.6, + "kl_loss_39": 1152.9, + "kl_loss_7": 3588.0, + "learning_rate": 7.430289649152156e-05, + "loss": 5032.7, + "step": 8260 + }, + { + "ce_loss_13": 2.9286361813545225, + "ce_loss_26": 2.500110092759132, + "ce_loss_39": 2.0343170315027237, + "ce_loss_52": 1.4783238634467124, + "ce_loss_7": 3.186429667472839, + "epoch": 0.827, + "grad_norm": 19.162288533319998, + "kl_loss_13": 3004.0, + "kl_loss_26": 2105.8, + "kl_loss_39": 1135.2, + "kl_loss_7": 3537.2, + "learning_rate": 7.347279472290646e-05, + "loss": 4997.95, + "step": 8270 + }, + { + "ce_loss_13": 2.8661180198192597, + "ce_loss_26": 2.4267468631267546, + "ce_loss_39": 1.9584647029638291, + "ce_loss_52": 1.3965127140283584, + "ce_loss_7": 3.1264285147190094, + "epoch": 0.828, + "grad_norm": 18.88865775629702, + "kl_loss_13": 3042.4, + "kl_loss_26": 2132.2, + "kl_loss_39": 1143.9, + "kl_loss_7": 3592.0, + "learning_rate": 7.264698807851328e-05, + "loss": 4945.25, + "step": 8280 + }, + { + "ce_loss_13": 2.963280272483826, + "ce_loss_26": 2.5102931052446364, + "ce_loss_39": 2.034750634431839, + "ce_loss_52": 1.4600775420665741, + "ce_loss_7": 3.2234760582447053, + "epoch": 0.829, + "grad_norm": 18.537963317549014, + "kl_loss_13": 3144.4, + "kl_loss_26": 2206.8, + "kl_loss_39": 1191.9, + "kl_loss_7": 3687.6, + "learning_rate": 7.182548487420554e-05, + "loss": 5033.4, + "step": 8290 + }, + { + "ce_loss_13": 2.9992467045783995, + "ce_loss_26": 2.55620219707489, + "ce_loss_39": 2.0874054759740828, + "ce_loss_52": 1.4909266605973244, + "ce_loss_7": 3.263571548461914, + "epoch": 0.83, + "grad_norm": 18.891053458181357, + "kl_loss_13": 3132.0, + "kl_loss_26": 2207.0, + "kl_loss_39": 1207.0, + "kl_loss_7": 3681.2, + "learning_rate": 7.100829338251146e-05, + "loss": 5053.05, + "step": 8300 + }, + { + "ce_loss_13": 2.9434437096118926, + "ce_loss_26": 2.5084387719631196, + "ce_loss_39": 2.0438412368297576, + "ce_loss_52": 1.455627703666687, + "ce_loss_7": 3.203293949365616, + "epoch": 0.831, + "grad_norm": 18.63506341583337, + "kl_loss_13": 3080.0, + "kl_loss_26": 2173.6, + "kl_loss_39": 1196.0, + "kl_loss_7": 3632.8, + "learning_rate": 7.019542183254046e-05, + "loss": 5020.25, + "step": 8310 + }, + { + "ce_loss_13": 2.9150700867176056, + "ce_loss_26": 2.4821571350097655, + "ce_loss_39": 2.011009243130684, + "ce_loss_52": 1.4295171827077866, + "ce_loss_7": 3.1743992388248445, + "epoch": 0.832, + "grad_norm": 19.224604457055225, + "kl_loss_13": 3061.6, + "kl_loss_26": 2156.4, + "kl_loss_39": 1178.0, + "kl_loss_7": 3603.6, + "learning_rate": 6.938687840989971e-05, + "loss": 4998.8, + "step": 8320 + }, + { + "ce_loss_13": 2.9248284220695497, + "ce_loss_26": 2.4842200338840486, + "ce_loss_39": 2.016203221678734, + "ce_loss_52": 1.4306745767593383, + "ce_loss_7": 3.1828024327754973, + "epoch": 0.833, + "grad_norm": 21.39912114954264, + "kl_loss_13": 3084.0, + "kl_loss_26": 2177.0, + "kl_loss_39": 1194.5, + "kl_loss_7": 3630.0, + "learning_rate": 6.858267125661271e-05, + "loss": 5022.85, + "step": 8330 + }, + { + "ce_loss_13": 2.8756704360246657, + "ce_loss_26": 2.447014120221138, + "ce_loss_39": 1.9950514793395997, + "ce_loss_52": 1.4190126568078996, + "ce_loss_7": 3.135387209057808, + "epoch": 0.834, + "grad_norm": 19.16865484866817, + "kl_loss_13": 3030.8, + "kl_loss_26": 2137.8, + "kl_loss_39": 1174.0, + "kl_loss_7": 3584.4, + "learning_rate": 6.778280847103668e-05, + "loss": 5009.55, + "step": 8340 + }, + { + "ce_loss_13": 2.8521511554718018, + "ce_loss_26": 2.415205565094948, + "ce_loss_39": 1.9484322667121887, + "ce_loss_52": 1.3919154837727548, + "ce_loss_7": 3.11352881193161, + "epoch": 0.835, + "grad_norm": 19.565513612829772, + "kl_loss_13": 3035.6, + "kl_loss_26": 2128.8, + "kl_loss_39": 1129.1, + "kl_loss_7": 3584.4, + "learning_rate": 6.698729810778065e-05, + "loss": 4997.8, + "step": 8350 + }, + { + "ce_loss_13": 2.9550336360931397, + "ce_loss_26": 2.516478735208511, + "ce_loss_39": 2.0486765056848526, + "ce_loss_52": 1.4607697233557702, + "ce_loss_7": 3.2147393763065337, + "epoch": 0.836, + "grad_norm": 19.2117514895061, + "kl_loss_13": 3091.2, + "kl_loss_26": 2181.2, + "kl_loss_39": 1192.7, + "kl_loss_7": 3642.8, + "learning_rate": 6.619614817762538e-05, + "loss": 4980.45, + "step": 8360 + }, + { + "ce_loss_13": 2.8862642347812653, + "ce_loss_26": 2.453189605474472, + "ce_loss_39": 1.9996996372938156, + "ce_loss_52": 1.42118998169899, + "ce_loss_7": 3.1397067666053773, + "epoch": 0.837, + "grad_norm": 19.74025196121118, + "kl_loss_13": 3027.2, + "kl_loss_26": 2130.4, + "kl_loss_39": 1163.7, + "kl_loss_7": 3555.6, + "learning_rate": 6.540936664744196e-05, + "loss": 5003.55, + "step": 8370 + }, + { + "ce_loss_13": 2.8884230494499206, + "ce_loss_26": 2.461115485429764, + "ce_loss_39": 1.996484610438347, + "ce_loss_52": 1.431185284256935, + "ce_loss_7": 3.1595689237117766, + "epoch": 0.838, + "grad_norm": 18.91687230970295, + "kl_loss_13": 3028.0, + "kl_loss_26": 2134.2, + "kl_loss_39": 1162.1, + "kl_loss_7": 3585.2, + "learning_rate": 6.462696144011149e-05, + "loss": 4978.95, + "step": 8380 + }, + { + "ce_loss_13": 2.9092856884002685, + "ce_loss_26": 2.477367341518402, + "ce_loss_39": 2.0064304888248445, + "ce_loss_52": 1.4392234981060028, + "ce_loss_7": 3.166864866018295, + "epoch": 0.839, + "grad_norm": 18.987849334165656, + "kl_loss_13": 3037.2, + "kl_loss_26": 2130.0, + "kl_loss_39": 1148.3, + "kl_loss_7": 3579.6, + "learning_rate": 6.384894043444567e-05, + "loss": 4978.1, + "step": 8390 + }, + { + "ce_loss_13": 2.926213449239731, + "ce_loss_26": 2.4814498484134675, + "ce_loss_39": 2.011722648143768, + "ce_loss_52": 1.4253545701503754, + "ce_loss_7": 3.1928284585475923, + "epoch": 0.84, + "grad_norm": 18.67190019337169, + "kl_loss_13": 3118.4, + "kl_loss_26": 2194.4, + "kl_loss_39": 1203.2, + "kl_loss_7": 3673.2, + "learning_rate": 6.307531146510753e-05, + "loss": 4975.3, + "step": 8400 + }, + { + "ce_loss_13": 2.9474796772003176, + "ce_loss_26": 2.513693606853485, + "ce_loss_39": 2.039980337023735, + "ce_loss_52": 1.4588691473007203, + "ce_loss_7": 3.208155167102814, + "epoch": 0.841, + "grad_norm": 19.23898472020216, + "kl_loss_13": 3091.2, + "kl_loss_26": 2176.0, + "kl_loss_39": 1186.3, + "kl_loss_7": 3638.8, + "learning_rate": 6.230608232253226e-05, + "loss": 4972.8, + "step": 8410 + }, + { + "ce_loss_13": 2.98299777507782, + "ce_loss_26": 2.546931451559067, + "ce_loss_39": 2.066228356957436, + "ce_loss_52": 1.4528164565563202, + "ce_loss_7": 3.2408275246620177, + "epoch": 0.842, + "grad_norm": 19.590505544042042, + "kl_loss_13": 3177.6, + "kl_loss_26": 2260.2, + "kl_loss_39": 1246.2, + "kl_loss_7": 3722.8, + "learning_rate": 6.154126075284855e-05, + "loss": 5025.9, + "step": 8420 + }, + { + "ce_loss_13": 2.8128190338611603, + "ce_loss_26": 2.3779567658901213, + "ce_loss_39": 1.922488284111023, + "ce_loss_52": 1.3714163228869438, + "ce_loss_7": 3.0708466947078703, + "epoch": 0.843, + "grad_norm": 19.244313422130332, + "kl_loss_13": 3014.4, + "kl_loss_26": 2105.4, + "kl_loss_39": 1130.8, + "kl_loss_7": 3552.8, + "learning_rate": 6.078085445780129e-05, + "loss": 5004.75, + "step": 8430 + }, + { + "ce_loss_13": 2.938475805521011, + "ce_loss_26": 2.497309777140617, + "ce_loss_39": 2.027024504542351, + "ce_loss_52": 1.4460827559232712, + "ce_loss_7": 3.194866645336151, + "epoch": 0.844, + "grad_norm": 18.715084502177618, + "kl_loss_13": 3086.4, + "kl_loss_26": 2168.0, + "kl_loss_39": 1180.4, + "kl_loss_7": 3630.0, + "learning_rate": 6.002487109467347e-05, + "loss": 5005.05, + "step": 8440 + }, + { + "ce_loss_13": 2.940459841489792, + "ce_loss_26": 2.5169106662273406, + "ce_loss_39": 2.0435358375310897, + "ce_loss_52": 1.4713785827159882, + "ce_loss_7": 3.199807566404343, + "epoch": 0.845, + "grad_norm": 20.662636581778713, + "kl_loss_13": 3050.8, + "kl_loss_26": 2152.2, + "kl_loss_39": 1166.1, + "kl_loss_7": 3588.8, + "learning_rate": 5.927331827620902e-05, + "loss": 5015.45, + "step": 8450 + }, + { + "ce_loss_13": 2.8635286152362824, + "ce_loss_26": 2.426770511269569, + "ce_loss_39": 1.9639566600322724, + "ce_loss_52": 1.4056006461381911, + "ce_loss_7": 3.126782363653183, + "epoch": 0.846, + "grad_norm": 19.43955116705752, + "kl_loss_13": 3026.8, + "kl_loss_26": 2123.4, + "kl_loss_39": 1139.5, + "kl_loss_7": 3581.2, + "learning_rate": 5.852620357053651e-05, + "loss": 4930.5, + "step": 8460 + }, + { + "ce_loss_13": 2.967438644170761, + "ce_loss_26": 2.5302784025669096, + "ce_loss_39": 2.0548608988523482, + "ce_loss_52": 1.456875516474247, + "ce_loss_7": 3.232648569345474, + "epoch": 0.847, + "grad_norm": 18.917933547815785, + "kl_loss_13": 3141.6, + "kl_loss_26": 2229.0, + "kl_loss_39": 1221.3, + "kl_loss_7": 3698.0, + "learning_rate": 5.778353450109286e-05, + "loss": 5049.2, + "step": 8470 + }, + { + "ce_loss_13": 2.8490840077400206, + "ce_loss_26": 2.4299704492092133, + "ce_loss_39": 1.9720161318778993, + "ce_loss_52": 1.428696632385254, + "ce_loss_7": 3.1050261557102203, + "epoch": 0.848, + "grad_norm": 19.028513845565772, + "kl_loss_13": 2946.4, + "kl_loss_26": 2077.0, + "kl_loss_39": 1109.1, + "kl_loss_7": 3480.0, + "learning_rate": 5.7045318546547206e-05, + "loss": 4964.25, + "step": 8480 + }, + { + "ce_loss_13": 2.91025772690773, + "ce_loss_26": 2.470404103398323, + "ce_loss_39": 1.9989687472581863, + "ce_loss_52": 1.4330752968788147, + "ce_loss_7": 3.1751048266887665, + "epoch": 0.849, + "grad_norm": 18.63333546431516, + "kl_loss_13": 3053.6, + "kl_loss_26": 2137.8, + "kl_loss_39": 1151.7, + "kl_loss_7": 3603.6, + "learning_rate": 5.631156314072605e-05, + "loss": 4997.6, + "step": 8490 + }, + { + "ce_loss_13": 2.969731491804123, + "ce_loss_26": 2.5224834442138673, + "ce_loss_39": 2.043315088748932, + "ce_loss_52": 1.4606564939022064, + "ce_loss_7": 3.235535615682602, + "epoch": 0.85, + "grad_norm": 19.13874159874534, + "kl_loss_13": 3116.0, + "kl_loss_26": 2192.8, + "kl_loss_39": 1187.7, + "kl_loss_7": 3671.6, + "learning_rate": 5.5582275672538315e-05, + "loss": 4973.7, + "step": 8500 + }, + { + "ce_loss_13": 2.9457097470760347, + "ce_loss_26": 2.5094266653060915, + "ce_loss_39": 2.037208506464958, + "ce_loss_52": 1.4461605846881866, + "ce_loss_7": 3.2044992685317992, + "epoch": 0.851, + "grad_norm": 19.084085332754032, + "kl_loss_13": 3088.0, + "kl_loss_26": 2174.4, + "kl_loss_39": 1186.9, + "kl_loss_7": 3629.2, + "learning_rate": 5.4857463485900484e-05, + "loss": 4979.7, + "step": 8510 + }, + { + "ce_loss_13": 2.932442033290863, + "ce_loss_26": 2.4945150196552275, + "ce_loss_39": 2.024565789103508, + "ce_loss_52": 1.4535668522119523, + "ce_loss_7": 3.1925411999225615, + "epoch": 0.852, + "grad_norm": 18.598717511449827, + "kl_loss_13": 3049.2, + "kl_loss_26": 2138.2, + "kl_loss_39": 1163.5, + "kl_loss_7": 3596.0, + "learning_rate": 5.413713387966329e-05, + "loss": 4976.35, + "step": 8520 + }, + { + "ce_loss_13": 2.850284093618393, + "ce_loss_26": 2.4304546415805817, + "ce_loss_39": 1.9769367069005965, + "ce_loss_52": 1.4221994251012802, + "ce_loss_7": 3.106715601682663, + "epoch": 0.853, + "grad_norm": 19.800943553162387, + "kl_loss_13": 2971.2, + "kl_loss_26": 2092.8, + "kl_loss_39": 1135.0, + "kl_loss_7": 3509.2, + "learning_rate": 5.34212941075381e-05, + "loss": 4969.3, + "step": 8530 + }, + { + "ce_loss_13": 2.8934908270835877, + "ce_loss_26": 2.4596606254577638, + "ce_loss_39": 2.00773600935936, + "ce_loss_52": 1.4613569289445878, + "ce_loss_7": 3.150895756483078, + "epoch": 0.854, + "grad_norm": 18.83825612066199, + "kl_loss_13": 3003.6, + "kl_loss_26": 2097.4, + "kl_loss_39": 1124.7, + "kl_loss_7": 3544.0, + "learning_rate": 5.270995137802315e-05, + "loss": 4942.95, + "step": 8540 + }, + { + "ce_loss_13": 2.9037817120552063, + "ce_loss_26": 2.462430712580681, + "ce_loss_39": 1.9962077885866165, + "ce_loss_52": 1.425583516061306, + "ce_loss_7": 3.173860079050064, + "epoch": 0.855, + "grad_norm": 18.951683103960118, + "kl_loss_13": 3081.6, + "kl_loss_26": 2158.2, + "kl_loss_39": 1162.8, + "kl_loss_7": 3640.8, + "learning_rate": 5.2003112854332125e-05, + "loss": 4931.9, + "step": 8550 + }, + { + "ce_loss_13": 2.945174980163574, + "ce_loss_26": 2.504639369249344, + "ce_loss_39": 2.038501372933388, + "ce_loss_52": 1.4634816706180573, + "ce_loss_7": 3.206812459230423, + "epoch": 0.856, + "grad_norm": 19.71064196024924, + "kl_loss_13": 3095.2, + "kl_loss_26": 2169.8, + "kl_loss_39": 1182.5, + "kl_loss_7": 3640.4, + "learning_rate": 5.130078565432089e-05, + "loss": 5022.2, + "step": 8560 + }, + { + "ce_loss_13": 2.916484522819519, + "ce_loss_26": 2.4740715622901917, + "ce_loss_39": 2.0083792597055434, + "ce_loss_52": 1.4411062002182007, + "ce_loss_7": 3.1799224853515624, + "epoch": 0.857, + "grad_norm": 18.476735822226924, + "kl_loss_13": 3088.4, + "kl_loss_26": 2169.8, + "kl_loss_39": 1171.0, + "kl_loss_7": 3638.0, + "learning_rate": 5.060297685041659e-05, + "loss": 4959.95, + "step": 8570 + }, + { + "ce_loss_13": 2.9657407224178316, + "ce_loss_26": 2.5159328460693358, + "ce_loss_39": 2.0297839671373366, + "ce_loss_52": 1.4374482572078704, + "ce_loss_7": 3.2368306040763857, + "epoch": 0.858, + "grad_norm": 18.666614846165707, + "kl_loss_13": 3174.0, + "kl_loss_26": 2243.8, + "kl_loss_39": 1214.4, + "kl_loss_7": 3739.2, + "learning_rate": 4.99096934695461e-05, + "loss": 4973.6, + "step": 8580 + }, + { + "ce_loss_13": 2.9122063517570496, + "ce_loss_26": 2.4818194091320036, + "ce_loss_39": 2.0184349328279496, + "ce_loss_52": 1.4382916703820228, + "ce_loss_7": 3.17412588596344, + "epoch": 0.859, + "grad_norm": 19.2367820447764, + "kl_loss_13": 3051.6, + "kl_loss_26": 2150.2, + "kl_loss_39": 1170.3, + "kl_loss_7": 3600.8, + "learning_rate": 4.922094249306558e-05, + "loss": 4986.8, + "step": 8590 + }, + { + "ce_loss_13": 2.856007623672485, + "ce_loss_26": 2.4249674677848816, + "ce_loss_39": 1.9603979021310807, + "ce_loss_52": 1.3998382538557053, + "ce_loss_7": 3.120637094974518, + "epoch": 0.86, + "grad_norm": 19.45927110525327, + "kl_loss_13": 3034.0, + "kl_loss_26": 2133.8, + "kl_loss_39": 1154.5, + "kl_loss_7": 3588.0, + "learning_rate": 4.853673085668947e-05, + "loss": 5020.0, + "step": 8600 + }, + { + "ce_loss_13": 2.8736523926258086, + "ce_loss_26": 2.4339311927556992, + "ce_loss_39": 1.9728148251771926, + "ce_loss_52": 1.4134869635105134, + "ce_loss_7": 3.132260227203369, + "epoch": 0.861, + "grad_norm": 18.566757154178084, + "kl_loss_13": 3035.6, + "kl_loss_26": 2121.8, + "kl_loss_39": 1143.1, + "kl_loss_7": 3595.6, + "learning_rate": 4.78570654504214e-05, + "loss": 5002.4, + "step": 8610 + }, + { + "ce_loss_13": 2.918911075592041, + "ce_loss_26": 2.4760118186473847, + "ce_loss_39": 2.0035496681928633, + "ce_loss_52": 1.4298115074634552, + "ce_loss_7": 3.1867982387542724, + "epoch": 0.862, + "grad_norm": 19.07407083640675, + "kl_loss_13": 3104.8, + "kl_loss_26": 2178.0, + "kl_loss_39": 1171.7, + "kl_loss_7": 3661.6, + "learning_rate": 4.7181953118484556e-05, + "loss": 4962.1, + "step": 8620 + }, + { + "ce_loss_13": 2.90929571390152, + "ce_loss_26": 2.4703837007284166, + "ce_loss_39": 2.004773771762848, + "ce_loss_52": 1.4389754503965377, + "ce_loss_7": 3.1703392446041105, + "epoch": 0.863, + "grad_norm": 19.469181472497027, + "kl_loss_13": 3027.6, + "kl_loss_26": 2134.8, + "kl_loss_39": 1156.0, + "kl_loss_7": 3579.2, + "learning_rate": 4.651140065925269e-05, + "loss": 4937.2, + "step": 8630 + }, + { + "ce_loss_13": 2.9975267946720123, + "ce_loss_26": 2.5536694526672363, + "ce_loss_39": 2.07455490231514, + "ce_loss_52": 1.4786568373441695, + "ce_loss_7": 3.265315741300583, + "epoch": 0.864, + "grad_norm": 19.328625062361652, + "kl_loss_13": 3149.6, + "kl_loss_26": 2221.4, + "kl_loss_39": 1210.4, + "kl_loss_7": 3701.6, + "learning_rate": 4.58454148251814e-05, + "loss": 4974.2, + "step": 8640 + }, + { + "ce_loss_13": 2.914253044128418, + "ce_loss_26": 2.4742087960243224, + "ce_loss_39": 2.0090451925992965, + "ce_loss_52": 1.4428718268871308, + "ce_loss_7": 3.174784082174301, + "epoch": 0.865, + "grad_norm": 18.948427807011946, + "kl_loss_13": 3048.4, + "kl_loss_26": 2129.2, + "kl_loss_39": 1142.4, + "kl_loss_7": 3588.0, + "learning_rate": 4.518400232274078e-05, + "loss": 4950.8, + "step": 8650 + }, + { + "ce_loss_13": 2.895064812898636, + "ce_loss_26": 2.452637565135956, + "ce_loss_39": 1.9877680152654649, + "ce_loss_52": 1.419823595881462, + "ce_loss_7": 3.151270192861557, + "epoch": 0.866, + "grad_norm": 18.81553972523, + "kl_loss_13": 3058.0, + "kl_loss_26": 2145.0, + "kl_loss_39": 1169.4, + "kl_loss_7": 3595.6, + "learning_rate": 4.452716981234745e-05, + "loss": 5007.2, + "step": 8660 + }, + { + "ce_loss_13": 2.929095983505249, + "ce_loss_26": 2.480497121810913, + "ce_loss_39": 2.006428611278534, + "ce_loss_52": 1.4315154731273652, + "ce_loss_7": 3.2007214546203615, + "epoch": 0.867, + "grad_norm": 18.95890107682501, + "kl_loss_13": 3086.0, + "kl_loss_26": 2159.8, + "kl_loss_39": 1163.2, + "kl_loss_7": 3645.6, + "learning_rate": 4.3874923908297335e-05, + "loss": 4988.0, + "step": 8670 + }, + { + "ce_loss_13": 2.917868083715439, + "ce_loss_26": 2.466423386335373, + "ce_loss_39": 1.9825115293264388, + "ce_loss_52": 1.4014427214860916, + "ce_loss_7": 3.187401866912842, + "epoch": 0.868, + "grad_norm": 18.57272436866935, + "kl_loss_13": 3128.0, + "kl_loss_26": 2194.4, + "kl_loss_39": 1181.8, + "kl_loss_7": 3698.4, + "learning_rate": 4.322727117869951e-05, + "loss": 4966.1, + "step": 8680 + }, + { + "ce_loss_13": 2.864998000860214, + "ce_loss_26": 2.4295243114233016, + "ce_loss_39": 1.9672368943691254, + "ce_loss_52": 1.4193324148654938, + "ce_loss_7": 3.128977674245834, + "epoch": 0.869, + "grad_norm": 18.701108919653294, + "kl_loss_13": 3002.4, + "kl_loss_26": 2088.8, + "kl_loss_39": 1117.8, + "kl_loss_7": 3558.4, + "learning_rate": 4.2584218145409916e-05, + "loss": 4955.6, + "step": 8690 + }, + { + "ce_loss_13": 2.866414725780487, + "ce_loss_26": 2.4338418275117872, + "ce_loss_39": 1.961911031603813, + "ce_loss_52": 1.3979329317808151, + "ce_loss_7": 3.1249643862247467, + "epoch": 0.87, + "grad_norm": 19.695244279115233, + "kl_loss_13": 3026.4, + "kl_loss_26": 2123.4, + "kl_loss_39": 1140.7, + "kl_loss_7": 3568.4, + "learning_rate": 4.194577128396521e-05, + "loss": 4954.85, + "step": 8700 + }, + { + "ce_loss_13": 2.9441056907176972, + "ce_loss_26": 2.5146015286445618, + "ce_loss_39": 2.0488742887973785, + "ce_loss_52": 1.4835967749357224, + "ce_loss_7": 3.202489811182022, + "epoch": 0.871, + "grad_norm": 18.533833127334756, + "kl_loss_13": 3031.2, + "kl_loss_26": 2129.4, + "kl_loss_39": 1154.3, + "kl_loss_7": 3572.4, + "learning_rate": 4.1311937023518264e-05, + "loss": 4983.55, + "step": 8710 + }, + { + "ce_loss_13": 2.927008146047592, + "ce_loss_26": 2.4801330626010896, + "ce_loss_39": 2.0113667100667953, + "ce_loss_52": 1.4392758041620255, + "ce_loss_7": 3.1931580364704133, + "epoch": 0.872, + "grad_norm": 19.604731242944656, + "kl_loss_13": 3073.2, + "kl_loss_26": 2148.4, + "kl_loss_39": 1161.3, + "kl_loss_7": 3631.6, + "learning_rate": 4.0682721746773344e-05, + "loss": 4966.3, + "step": 8720 + }, + { + "ce_loss_13": 2.875244301557541, + "ce_loss_26": 2.4395941644906998, + "ce_loss_39": 1.9802403211593629, + "ce_loss_52": 1.425346952676773, + "ce_loss_7": 3.1323861300945284, + "epoch": 0.873, + "grad_norm": 19.084321030186736, + "kl_loss_13": 3020.0, + "kl_loss_26": 2114.6, + "kl_loss_39": 1135.6, + "kl_loss_7": 3558.4, + "learning_rate": 4.0058131789920904e-05, + "loss": 4966.9, + "step": 8730 + }, + { + "ce_loss_13": 2.91582133769989, + "ce_loss_26": 2.469817638397217, + "ce_loss_39": 1.9961349010467528, + "ce_loss_52": 1.4161934450268745, + "ce_loss_7": 3.1843641221523287, + "epoch": 0.874, + "grad_norm": 19.701093771188038, + "kl_loss_13": 3130.0, + "kl_loss_26": 2205.0, + "kl_loss_39": 1187.7, + "kl_loss_7": 3684.8, + "learning_rate": 3.9438173442575e-05, + "loss": 4920.3, + "step": 8740 + }, + { + "ce_loss_13": 2.931247502565384, + "ce_loss_26": 2.499329847097397, + "ce_loss_39": 2.038593566417694, + "ce_loss_52": 1.4660022050142287, + "ce_loss_7": 3.1989371538162232, + "epoch": 0.875, + "grad_norm": 19.499459693858824, + "kl_loss_13": 3039.6, + "kl_loss_26": 2133.8, + "kl_loss_39": 1152.2, + "kl_loss_7": 3588.0, + "learning_rate": 3.882285294770937e-05, + "loss": 4984.7, + "step": 8750 + }, + { + "ce_loss_13": 2.903587061166763, + "ce_loss_26": 2.4606124222278596, + "ce_loss_39": 1.9871950060129167, + "ce_loss_52": 1.3980468481779098, + "ce_loss_7": 3.17354930639267, + "epoch": 0.876, + "grad_norm": 19.253090654259744, + "kl_loss_13": 3078.4, + "kl_loss_26": 2163.8, + "kl_loss_39": 1181.0, + "kl_loss_7": 3644.8, + "learning_rate": 3.821217650159453e-05, + "loss": 4982.75, + "step": 8760 + }, + { + "ce_loss_13": 2.820340207219124, + "ce_loss_26": 2.3951667070388796, + "ce_loss_39": 1.9451639890670775, + "ce_loss_52": 1.4168721199035645, + "ce_loss_7": 3.074656307697296, + "epoch": 0.877, + "grad_norm": 19.190959579311908, + "kl_loss_13": 2910.8, + "kl_loss_26": 2031.2, + "kl_loss_39": 1086.2, + "kl_loss_7": 3448.8, + "learning_rate": 3.760615025373543e-05, + "loss": 4941.25, + "step": 8770 + }, + { + "ce_loss_13": 2.9391641199588774, + "ce_loss_26": 2.5044194877147676, + "ce_loss_39": 2.0340330809354783, + "ce_loss_52": 1.4550057530403138, + "ce_loss_7": 3.200497591495514, + "epoch": 0.878, + "grad_norm": 19.218011314135488, + "kl_loss_13": 3063.6, + "kl_loss_26": 2155.8, + "kl_loss_39": 1164.4, + "kl_loss_7": 3604.8, + "learning_rate": 3.700478030680987e-05, + "loss": 4989.0, + "step": 8780 + }, + { + "ce_loss_13": 2.9181353628635405, + "ce_loss_26": 2.484974616765976, + "ce_loss_39": 2.0213694095611574, + "ce_loss_52": 1.4473600834608078, + "ce_loss_7": 3.1764037668704987, + "epoch": 0.879, + "grad_norm": 18.881217884773644, + "kl_loss_13": 3048.4, + "kl_loss_26": 2145.6, + "kl_loss_39": 1162.1, + "kl_loss_7": 3594.8, + "learning_rate": 3.6408072716606344e-05, + "loss": 4996.95, + "step": 8790 + }, + { + "ce_loss_13": 2.878078305721283, + "ce_loss_26": 2.445318901538849, + "ce_loss_39": 1.9787369549274445, + "ce_loss_52": 1.4154168665409088, + "ce_loss_7": 3.13910374045372, + "epoch": 0.88, + "grad_norm": 19.232153237296814, + "kl_loss_13": 3023.6, + "kl_loss_26": 2120.2, + "kl_loss_39": 1138.8, + "kl_loss_7": 3570.8, + "learning_rate": 3.5816033491963716e-05, + "loss": 4957.2, + "step": 8800 + }, + { + "ce_loss_13": 2.8982558727264403, + "ce_loss_26": 2.4716543793678283, + "ce_loss_39": 2.0096321552991867, + "ce_loss_52": 1.4367865800857544, + "ce_loss_7": 3.162083399295807, + "epoch": 0.881, + "grad_norm": 19.92275213516334, + "kl_loss_13": 3003.6, + "kl_loss_26": 2113.0, + "kl_loss_39": 1151.0, + "kl_loss_7": 3555.2, + "learning_rate": 3.522866859471047e-05, + "loss": 4925.7, + "step": 8810 + }, + { + "ce_loss_13": 2.9355869591236115, + "ce_loss_26": 2.495421326160431, + "ce_loss_39": 2.0324677735567094, + "ce_loss_52": 1.4521033734083175, + "ce_loss_7": 3.20084969997406, + "epoch": 0.882, + "grad_norm": 18.63804720981334, + "kl_loss_13": 3093.2, + "kl_loss_26": 2164.2, + "kl_loss_39": 1179.6, + "kl_loss_7": 3645.6, + "learning_rate": 3.46459839396045e-05, + "loss": 5007.2, + "step": 8820 + }, + { + "ce_loss_13": 2.9325197875499724, + "ce_loss_26": 2.4879075407981874, + "ce_loss_39": 2.0197067618370057, + "ce_loss_52": 1.4270877152681352, + "ce_loss_7": 3.198145306110382, + "epoch": 0.883, + "grad_norm": 18.241829484742485, + "kl_loss_13": 3112.0, + "kl_loss_26": 2188.6, + "kl_loss_39": 1188.1, + "kl_loss_7": 3676.4, + "learning_rate": 3.406798539427386e-05, + "loss": 4970.75, + "step": 8830 + }, + { + "ce_loss_13": 2.9144038438796995, + "ce_loss_26": 2.490026795864105, + "ce_loss_39": 2.0250850170850754, + "ce_loss_52": 1.4695867449045181, + "ce_loss_7": 3.171018958091736, + "epoch": 0.884, + "grad_norm": 19.295732101275807, + "kl_loss_13": 3016.4, + "kl_loss_26": 2109.6, + "kl_loss_39": 1133.6, + "kl_loss_7": 3556.0, + "learning_rate": 3.349467877915746e-05, + "loss": 4929.15, + "step": 8840 + }, + { + "ce_loss_13": 2.9367240130901338, + "ce_loss_26": 2.5029721915721894, + "ce_loss_39": 2.038401874899864, + "ce_loss_52": 1.4602129399776458, + "ce_loss_7": 3.1973277926445007, + "epoch": 0.885, + "grad_norm": 18.483553110198095, + "kl_loss_13": 3071.2, + "kl_loss_26": 2165.0, + "kl_loss_39": 1179.3, + "kl_loss_7": 3608.0, + "learning_rate": 3.292606986744667e-05, + "loss": 4997.15, + "step": 8850 + }, + { + "ce_loss_13": 2.962205785512924, + "ce_loss_26": 2.5182169795036318, + "ce_loss_39": 2.0426361471414567, + "ce_loss_52": 1.4736472845077515, + "ce_loss_7": 3.2304830133914946, + "epoch": 0.886, + "grad_norm": 19.501438521011444, + "kl_loss_13": 3089.6, + "kl_loss_26": 2164.0, + "kl_loss_39": 1168.6, + "kl_loss_7": 3642.4, + "learning_rate": 3.23621643850267e-05, + "loss": 4957.35, + "step": 8860 + }, + { + "ce_loss_13": 2.8543384969234467, + "ce_loss_26": 2.4200983941555023, + "ce_loss_39": 1.9575607985258103, + "ce_loss_52": 1.3949070930480958, + "ce_loss_7": 3.1160971879959107, + "epoch": 0.887, + "grad_norm": 19.320586944244614, + "kl_loss_13": 3009.6, + "kl_loss_26": 2108.8, + "kl_loss_39": 1139.1, + "kl_loss_7": 3548.0, + "learning_rate": 3.180296801041971e-05, + "loss": 4940.1, + "step": 8870 + }, + { + "ce_loss_13": 2.8845052778720857, + "ce_loss_26": 2.456773716211319, + "ce_loss_39": 1.9930354177951812, + "ce_loss_52": 1.4307963967323303, + "ce_loss_7": 3.144515538215637, + "epoch": 0.888, + "grad_norm": 19.659033917356428, + "kl_loss_13": 2998.8, + "kl_loss_26": 2105.2, + "kl_loss_39": 1137.6, + "kl_loss_7": 3543.2, + "learning_rate": 3.124848637472688e-05, + "loss": 4952.5, + "step": 8880 + }, + { + "ce_loss_13": 2.8944738626480104, + "ce_loss_26": 2.4629203975200653, + "ce_loss_39": 1.9913650721311569, + "ce_loss_52": 1.4339269563555717, + "ce_loss_7": 3.1572672605514525, + "epoch": 0.889, + "grad_norm": 18.88843635803768, + "kl_loss_13": 3032.4, + "kl_loss_26": 2127.8, + "kl_loss_39": 1135.0, + "kl_loss_7": 3579.2, + "learning_rate": 3.069872506157212e-05, + "loss": 4974.35, + "step": 8890 + }, + { + "ce_loss_13": 2.8339054346084596, + "ce_loss_26": 2.394126781821251, + "ce_loss_39": 1.9339392215013504, + "ce_loss_52": 1.3971292108297348, + "ce_loss_7": 3.0929621160030365, + "epoch": 0.89, + "grad_norm": 18.978687158228045, + "kl_loss_13": 2982.0, + "kl_loss_26": 2066.8, + "kl_loss_39": 1100.8, + "kl_loss_7": 3526.4, + "learning_rate": 3.0153689607045842e-05, + "loss": 4941.5, + "step": 8900 + }, + { + "ce_loss_13": 2.8811903417110445, + "ce_loss_26": 2.440036287903786, + "ce_loss_39": 1.975409933924675, + "ce_loss_52": 1.416017021238804, + "ce_loss_7": 3.1414348661899565, + "epoch": 0.891, + "grad_norm": 19.560639422793763, + "kl_loss_13": 3040.4, + "kl_loss_26": 2123.2, + "kl_loss_39": 1133.6, + "kl_loss_7": 3586.4, + "learning_rate": 2.9613385499648926e-05, + "loss": 4965.6, + "step": 8910 + }, + { + "ce_loss_13": 2.8596278965473174, + "ce_loss_26": 2.4295035183429716, + "ce_loss_39": 1.9728092432022095, + "ce_loss_52": 1.4283979684114456, + "ce_loss_7": 3.1134236633777617, + "epoch": 0.892, + "grad_norm": 18.908746612036932, + "kl_loss_13": 2974.8, + "kl_loss_26": 2082.0, + "kl_loss_39": 1115.7, + "kl_loss_7": 3505.2, + "learning_rate": 2.9077818180237692e-05, + "loss": 5007.95, + "step": 8920 + }, + { + "ce_loss_13": 2.8744696974754333, + "ce_loss_26": 2.4505746215581894, + "ce_loss_39": 1.9992403596639634, + "ce_loss_52": 1.4506706580519677, + "ce_loss_7": 3.13488364815712, + "epoch": 0.893, + "grad_norm": 18.957478200982788, + "kl_loss_13": 2985.6, + "kl_loss_26": 2094.6, + "kl_loss_39": 1129.4, + "kl_loss_7": 3523.6, + "learning_rate": 2.8546993041969172e-05, + "loss": 4940.15, + "step": 8930 + }, + { + "ce_loss_13": 2.8975345969200133, + "ce_loss_26": 2.4653249740600587, + "ce_loss_39": 2.0007053166627884, + "ce_loss_52": 1.4314887911081313, + "ce_loss_7": 3.155570811033249, + "epoch": 0.894, + "grad_norm": 18.739265695771255, + "kl_loss_13": 3012.4, + "kl_loss_26": 2117.2, + "kl_loss_39": 1151.3, + "kl_loss_7": 3551.6, + "learning_rate": 2.802091543024671e-05, + "loss": 4940.05, + "step": 8940 + }, + { + "ce_loss_13": 2.9072438359260557, + "ce_loss_26": 2.471187961101532, + "ce_loss_39": 2.0144962787628176, + "ce_loss_52": 1.42624132335186, + "ce_loss_7": 3.1694943487644194, + "epoch": 0.895, + "grad_norm": 19.21731826372691, + "kl_loss_13": 3062.0, + "kl_loss_26": 2168.4, + "kl_loss_39": 1196.7, + "kl_loss_7": 3618.8, + "learning_rate": 2.7499590642665774e-05, + "loss": 4979.0, + "step": 8950 + }, + { + "ce_loss_13": 2.893015044927597, + "ce_loss_26": 2.4596860975027086, + "ce_loss_39": 2.0018325716257097, + "ce_loss_52": 1.4285719782114028, + "ce_loss_7": 3.154858148097992, + "epoch": 0.896, + "grad_norm": 18.801842424478984, + "kl_loss_13": 3027.6, + "kl_loss_26": 2117.0, + "kl_loss_39": 1153.1, + "kl_loss_7": 3566.0, + "learning_rate": 2.6983023928961405e-05, + "loss": 4959.6, + "step": 8960 + }, + { + "ce_loss_13": 2.8546026587486266, + "ce_loss_26": 2.4183767944574357, + "ce_loss_39": 1.956614688038826, + "ce_loss_52": 1.3939528629183768, + "ce_loss_7": 3.1231652200222015, + "epoch": 0.897, + "grad_norm": 19.808391264092982, + "kl_loss_13": 3029.2, + "kl_loss_26": 2118.8, + "kl_loss_39": 1139.0, + "kl_loss_7": 3584.4, + "learning_rate": 2.6471220490954628e-05, + "loss": 4973.5, + "step": 8970 + }, + { + "ce_loss_13": 2.9009016394615172, + "ce_loss_26": 2.467138040065765, + "ce_loss_39": 2.0069568186998366, + "ce_loss_52": 1.4654083251953125, + "ce_loss_7": 3.155901938676834, + "epoch": 0.898, + "grad_norm": 19.041913132666583, + "kl_loss_13": 2967.2, + "kl_loss_26": 2069.0, + "kl_loss_39": 1105.9, + "kl_loss_7": 3506.0, + "learning_rate": 2.596418548250029e-05, + "loss": 4886.7, + "step": 8980 + }, + { + "ce_loss_13": 2.8516535699367522, + "ce_loss_26": 2.426672577857971, + "ce_loss_39": 1.9635347902774811, + "ce_loss_52": 1.419031423330307, + "ce_loss_7": 3.112126684188843, + "epoch": 0.899, + "grad_norm": 19.026364280537617, + "kl_loss_13": 2990.8, + "kl_loss_26": 2103.8, + "kl_loss_39": 1123.3, + "kl_loss_7": 3530.0, + "learning_rate": 2.5461924009435368e-05, + "loss": 4885.15, + "step": 8990 + }, + { + "ce_loss_13": 2.870776003599167, + "ce_loss_26": 2.4326407968997956, + "ce_loss_39": 1.9669345051050187, + "ce_loss_52": 1.420183390378952, + "ce_loss_7": 3.127993369102478, + "epoch": 0.9, + "grad_norm": 18.943171008960356, + "kl_loss_13": 3002.4, + "kl_loss_26": 2090.0, + "kl_loss_39": 1108.9, + "kl_loss_7": 3537.6, + "learning_rate": 2.4964441129527336e-05, + "loss": 4949.55, + "step": 9000 + }, + { + "ce_loss_13": 2.91824157834053, + "ce_loss_26": 2.4817179054021836, + "ce_loss_39": 2.015528929233551, + "ce_loss_52": 1.434774386882782, + "ce_loss_7": 3.181524306535721, + "epoch": 0.901, + "grad_norm": 19.717113587964203, + "kl_loss_13": 3071.6, + "kl_loss_26": 2176.4, + "kl_loss_39": 1186.6, + "kl_loss_7": 3618.4, + "learning_rate": 2.4471741852423235e-05, + "loss": 4970.45, + "step": 9010 + }, + { + "ce_loss_13": 2.8203544914722443, + "ce_loss_26": 2.394831323623657, + "ce_loss_39": 1.9447649121284485, + "ce_loss_52": 1.3895054385066032, + "ce_loss_7": 3.0815266370773315, + "epoch": 0.902, + "grad_norm": 19.201709482987514, + "kl_loss_13": 2963.6, + "kl_loss_26": 2077.6, + "kl_loss_39": 1127.3, + "kl_loss_7": 3508.4, + "learning_rate": 2.3983831139599287e-05, + "loss": 4939.65, + "step": 9020 + }, + { + "ce_loss_13": 2.8914650082588196, + "ce_loss_26": 2.46552118062973, + "ce_loss_39": 1.9927147597074508, + "ce_loss_52": 1.4259023681282996, + "ce_loss_7": 3.1594585537910462, + "epoch": 0.903, + "grad_norm": 18.327777842458563, + "kl_loss_13": 3054.4, + "kl_loss_26": 2152.4, + "kl_loss_39": 1157.6, + "kl_loss_7": 3607.6, + "learning_rate": 2.3500713904311022e-05, + "loss": 4963.8, + "step": 9030 + }, + { + "ce_loss_13": 2.8807626605033874, + "ce_loss_26": 2.432952329516411, + "ce_loss_39": 1.9567799299955368, + "ce_loss_52": 1.4073475629091263, + "ce_loss_7": 3.1469713926315306, + "epoch": 0.904, + "grad_norm": 20.180916839551323, + "kl_loss_13": 3056.0, + "kl_loss_26": 2131.4, + "kl_loss_39": 1132.8, + "kl_loss_7": 3607.6, + "learning_rate": 2.3022395011543685e-05, + "loss": 4930.8, + "step": 9040 + }, + { + "ce_loss_13": 2.895359253883362, + "ce_loss_26": 2.4620601534843445, + "ce_loss_39": 1.9888764083385468, + "ce_loss_52": 1.4224872916936875, + "ce_loss_7": 3.1553323328495027, + "epoch": 0.905, + "grad_norm": 19.515330390455173, + "kl_loss_13": 3061.2, + "kl_loss_26": 2157.8, + "kl_loss_39": 1165.6, + "kl_loss_7": 3605.6, + "learning_rate": 2.2548879277963063e-05, + "loss": 4965.65, + "step": 9050 + }, + { + "ce_loss_13": 2.913999766111374, + "ce_loss_26": 2.4826299071311952, + "ce_loss_39": 2.025396314263344, + "ce_loss_52": 1.4647169053554534, + "ce_loss_7": 3.1696211397647858, + "epoch": 0.906, + "grad_norm": 19.958942164442686, + "kl_loss_13": 2998.4, + "kl_loss_26": 2104.6, + "kl_loss_39": 1143.0, + "kl_loss_7": 3532.8, + "learning_rate": 2.208017147186736e-05, + "loss": 4953.35, + "step": 9060 + }, + { + "ce_loss_13": 2.931579887866974, + "ce_loss_26": 2.497947371006012, + "ce_loss_39": 2.0416529774665833, + "ce_loss_52": 1.4668794304132462, + "ce_loss_7": 3.1927560210227965, + "epoch": 0.907, + "grad_norm": 18.915099363205265, + "kl_loss_13": 3050.8, + "kl_loss_26": 2150.2, + "kl_loss_39": 1170.4, + "kl_loss_7": 3596.4, + "learning_rate": 2.1616276313139227e-05, + "loss": 4967.35, + "step": 9070 + }, + { + "ce_loss_13": 2.8379551649093626, + "ce_loss_26": 2.4058648884296416, + "ce_loss_39": 1.9522909700870514, + "ce_loss_52": 1.401316450536251, + "ce_loss_7": 3.096262776851654, + "epoch": 0.908, + "grad_norm": 17.939476679877323, + "kl_loss_13": 2996.4, + "kl_loss_26": 2103.4, + "kl_loss_39": 1130.6, + "kl_loss_7": 3545.6, + "learning_rate": 2.1157198473197415e-05, + "loss": 4983.65, + "step": 9080 + }, + { + "ce_loss_13": 2.9229146242141724, + "ce_loss_26": 2.4857192397117616, + "ce_loss_39": 2.014718788862228, + "ce_loss_52": 1.4371613681316375, + "ce_loss_7": 3.192232495546341, + "epoch": 0.909, + "grad_norm": 19.195813577565207, + "kl_loss_13": 3088.8, + "kl_loss_26": 2172.6, + "kl_loss_39": 1177.9, + "kl_loss_7": 3649.2, + "learning_rate": 2.0702942574950812e-05, + "loss": 4961.5, + "step": 9090 + }, + { + "ce_loss_13": 2.9034866452217103, + "ce_loss_26": 2.4769316017627716, + "ce_loss_39": 2.006492680311203, + "ce_loss_52": 1.4311101764440537, + "ce_loss_7": 3.164641487598419, + "epoch": 0.91, + "grad_norm": 18.881356963654913, + "kl_loss_13": 3059.6, + "kl_loss_26": 2172.6, + "kl_loss_39": 1174.0, + "kl_loss_7": 3598.4, + "learning_rate": 2.025351319275137e-05, + "loss": 4952.9, + "step": 9100 + }, + { + "ce_loss_13": 2.915982037782669, + "ce_loss_26": 2.4739193379879, + "ce_loss_39": 2.002902591228485, + "ce_loss_52": 1.4319583177566528, + "ce_loss_7": 3.17461501955986, + "epoch": 0.911, + "grad_norm": 18.844750958897702, + "kl_loss_13": 3071.6, + "kl_loss_26": 2158.2, + "kl_loss_39": 1169.6, + "kl_loss_7": 3613.6, + "learning_rate": 1.9808914852347816e-05, + "loss": 4969.1, + "step": 9110 + }, + { + "ce_loss_13": 2.9571076393127442, + "ce_loss_26": 2.5076956033706663, + "ce_loss_39": 2.0347861379384993, + "ce_loss_52": 1.4524286478757857, + "ce_loss_7": 3.2245921969413756, + "epoch": 0.912, + "grad_norm": 18.516316183492112, + "kl_loss_13": 3096.4, + "kl_loss_26": 2170.0, + "kl_loss_39": 1183.2, + "kl_loss_7": 3656.0, + "learning_rate": 1.9369152030840554e-05, + "loss": 4969.7, + "step": 9120 + }, + { + "ce_loss_13": 2.8493692874908447, + "ce_loss_26": 2.4184423595666886, + "ce_loss_39": 1.9636689513921737, + "ce_loss_52": 1.4161925345659256, + "ce_loss_7": 3.1067621290683745, + "epoch": 0.913, + "grad_norm": 19.557541417790066, + "kl_loss_13": 2969.6, + "kl_loss_26": 2073.0, + "kl_loss_39": 1110.7, + "kl_loss_7": 3508.8, + "learning_rate": 1.893422915663645e-05, + "loss": 4967.05, + "step": 9130 + }, + { + "ce_loss_13": 2.950014758110046, + "ce_loss_26": 2.5281428694725037, + "ce_loss_39": 2.0640300661325455, + "ce_loss_52": 1.4911428451538087, + "ce_loss_7": 3.20717169046402, + "epoch": 0.914, + "grad_norm": 19.126297962103095, + "kl_loss_13": 3062.4, + "kl_loss_26": 2166.6, + "kl_loss_39": 1184.2, + "kl_loss_7": 3597.2, + "learning_rate": 1.850415060940386e-05, + "loss": 4910.65, + "step": 9140 + }, + { + "ce_loss_13": 2.9050391018390656, + "ce_loss_26": 2.4736091554164887, + "ce_loss_39": 2.017327818274498, + "ce_loss_52": 1.4512428998947144, + "ce_loss_7": 3.162110447883606, + "epoch": 0.915, + "grad_norm": 18.914613896159423, + "kl_loss_13": 3018.0, + "kl_loss_26": 2122.6, + "kl_loss_39": 1153.5, + "kl_loss_7": 3547.6, + "learning_rate": 1.8078920720028978e-05, + "loss": 4898.6, + "step": 9150 + }, + { + "ce_loss_13": 2.8679368257522584, + "ce_loss_26": 2.4424335032701494, + "ce_loss_39": 1.9891292452812195, + "ce_loss_52": 1.4451990023255348, + "ce_loss_7": 3.127324694395065, + "epoch": 0.916, + "grad_norm": 20.019541856401887, + "kl_loss_13": 2959.2, + "kl_loss_26": 2064.4, + "kl_loss_39": 1098.8, + "kl_loss_7": 3496.8, + "learning_rate": 1.765854377057219e-05, + "loss": 4940.15, + "step": 9160 + }, + { + "ce_loss_13": 2.873015010356903, + "ce_loss_26": 2.433028203248978, + "ce_loss_39": 1.9746823519468308, + "ce_loss_52": 1.407148177921772, + "ce_loss_7": 3.1320842862129212, + "epoch": 0.917, + "grad_norm": 18.323389911529343, + "kl_loss_13": 3046.8, + "kl_loss_26": 2143.6, + "kl_loss_39": 1161.3, + "kl_loss_7": 3586.0, + "learning_rate": 1.724302399422456e-05, + "loss": 4937.75, + "step": 9170 + }, + { + "ce_loss_13": 2.864003378152847, + "ce_loss_26": 2.4368703365325928, + "ce_loss_39": 1.978201287984848, + "ce_loss_52": 1.4284173011779786, + "ce_loss_7": 3.117653822898865, + "epoch": 0.918, + "grad_norm": 19.851903614612837, + "kl_loss_13": 2960.4, + "kl_loss_26": 2069.6, + "kl_loss_39": 1110.5, + "kl_loss_7": 3494.8, + "learning_rate": 1.683236557526574e-05, + "loss": 4948.85, + "step": 9180 + }, + { + "ce_loss_13": 2.841529107093811, + "ce_loss_26": 2.4092674642801284, + "ce_loss_39": 1.9591031044721603, + "ce_loss_52": 1.4043558463454247, + "ce_loss_7": 3.099020904302597, + "epoch": 0.919, + "grad_norm": 18.98858399937095, + "kl_loss_13": 2971.2, + "kl_loss_26": 2076.4, + "kl_loss_39": 1122.0, + "kl_loss_7": 3508.0, + "learning_rate": 1.6426572649021475e-05, + "loss": 4944.1, + "step": 9190 + }, + { + "ce_loss_13": 2.902883565425873, + "ce_loss_26": 2.4624376207590104, + "ce_loss_39": 2.0033867925405504, + "ce_loss_52": 1.4482421904802323, + "ce_loss_7": 3.1540717780590057, + "epoch": 0.92, + "grad_norm": 19.524164167367193, + "kl_loss_13": 3016.4, + "kl_loss_26": 2114.8, + "kl_loss_39": 1136.7, + "kl_loss_7": 3542.4, + "learning_rate": 1.6025649301821876e-05, + "loss": 4936.95, + "step": 9200 + }, + { + "ce_loss_13": 2.962231194972992, + "ce_loss_26": 2.5141273856163027, + "ce_loss_39": 2.038061347603798, + "ce_loss_52": 1.4573458433151245, + "ce_loss_7": 3.22137930393219, + "epoch": 0.921, + "grad_norm": 19.07781770056793, + "kl_loss_13": 3091.6, + "kl_loss_26": 2174.2, + "kl_loss_39": 1179.6, + "kl_loss_7": 3638.0, + "learning_rate": 1.5629599570960716e-05, + "loss": 4931.05, + "step": 9210 + }, + { + "ce_loss_13": 2.828860414028168, + "ce_loss_26": 2.394576147198677, + "ce_loss_39": 1.940834417939186, + "ce_loss_52": 1.396960550546646, + "ce_loss_7": 3.0943815410137177, + "epoch": 0.922, + "grad_norm": 18.68562598066032, + "kl_loss_13": 2986.4, + "kl_loss_26": 2085.0, + "kl_loss_39": 1113.6, + "kl_loss_7": 3535.2, + "learning_rate": 1.5238427444654367e-05, + "loss": 4919.35, + "step": 9220 + }, + { + "ce_loss_13": 2.854993385076523, + "ce_loss_26": 2.4067456245422365, + "ce_loss_39": 1.9481880724430085, + "ce_loss_52": 1.392129084467888, + "ce_loss_7": 3.119863528013229, + "epoch": 0.923, + "grad_norm": 19.56628173058375, + "kl_loss_13": 3048.8, + "kl_loss_26": 2126.2, + "kl_loss_39": 1141.3, + "kl_loss_7": 3609.6, + "learning_rate": 1.4852136862001764e-05, + "loss": 4956.25, + "step": 9230 + }, + { + "ce_loss_13": 2.8672266066074372, + "ce_loss_26": 2.428171756863594, + "ce_loss_39": 1.967655423283577, + "ce_loss_52": 1.4228445023298264, + "ce_loss_7": 3.12925271987915, + "epoch": 0.924, + "grad_norm": 18.655136558750065, + "kl_loss_13": 3020.8, + "kl_loss_26": 2114.0, + "kl_loss_39": 1134.6, + "kl_loss_7": 3562.4, + "learning_rate": 1.4470731712944884e-05, + "loss": 4914.5, + "step": 9240 + }, + { + "ce_loss_13": 2.967056131362915, + "ce_loss_26": 2.527100908756256, + "ce_loss_39": 2.0592952966690063, + "ce_loss_52": 1.466832235455513, + "ce_loss_7": 3.2317879140377044, + "epoch": 0.925, + "grad_norm": 18.755830587214348, + "kl_loss_13": 3074.0, + "kl_loss_26": 2172.6, + "kl_loss_39": 1194.2, + "kl_loss_7": 3633.2, + "learning_rate": 1.4094215838229174e-05, + "loss": 4941.0, + "step": 9250 + }, + { + "ce_loss_13": 2.8956347942352294, + "ce_loss_26": 2.4609649628400803, + "ce_loss_39": 1.998116421699524, + "ce_loss_52": 1.4327284812927246, + "ce_loss_7": 3.1544252693653108, + "epoch": 0.926, + "grad_norm": 19.440875104184542, + "kl_loss_13": 3037.6, + "kl_loss_26": 2133.0, + "kl_loss_39": 1149.6, + "kl_loss_7": 3582.4, + "learning_rate": 1.372259302936546e-05, + "loss": 4929.25, + "step": 9260 + }, + { + "ce_loss_13": 2.818482467532158, + "ce_loss_26": 2.3888671875, + "ce_loss_39": 1.9417572438716888, + "ce_loss_52": 1.3873827829957008, + "ce_loss_7": 3.0732292413711546, + "epoch": 0.927, + "grad_norm": 19.09848340283336, + "kl_loss_13": 2988.4, + "kl_loss_26": 2096.8, + "kl_loss_39": 1136.6, + "kl_loss_7": 3519.2, + "learning_rate": 1.3355867028591206e-05, + "loss": 4917.85, + "step": 9270 + }, + { + "ce_loss_13": 2.8812867999076843, + "ce_loss_26": 2.445907565951347, + "ce_loss_39": 1.9824917227029801, + "ce_loss_52": 1.4204061418771743, + "ce_loss_7": 3.1463906168937683, + "epoch": 0.928, + "grad_norm": 19.73371377973639, + "kl_loss_13": 3015.6, + "kl_loss_26": 2109.8, + "kl_loss_39": 1132.3, + "kl_loss_7": 3565.2, + "learning_rate": 1.2994041528833267e-05, + "loss": 4914.15, + "step": 9280 + }, + { + "ce_loss_13": 2.989528793096542, + "ce_loss_26": 2.545223152637482, + "ce_loss_39": 2.0668440997600555, + "ce_loss_52": 1.4640702456235886, + "ce_loss_7": 3.2542518198490145, + "epoch": 0.929, + "grad_norm": 18.497071159749588, + "kl_loss_13": 3146.0, + "kl_loss_26": 2234.4, + "kl_loss_39": 1221.2, + "kl_loss_7": 3704.8, + "learning_rate": 1.2637120173670358e-05, + "loss": 4971.25, + "step": 9290 + }, + { + "ce_loss_13": 2.9433493435382845, + "ce_loss_26": 2.503718575835228, + "ce_loss_39": 2.029115191102028, + "ce_loss_52": 1.4293665170669556, + "ce_loss_7": 3.21596360206604, + "epoch": 0.93, + "grad_norm": 19.233646690177977, + "kl_loss_13": 3119.2, + "kl_loss_26": 2210.0, + "kl_loss_39": 1209.9, + "kl_loss_7": 3688.8, + "learning_rate": 1.2285106557296478e-05, + "loss": 4970.8, + "step": 9300 + }, + { + "ce_loss_13": 2.8525869846343994, + "ce_loss_26": 2.4185830265283585, + "ce_loss_39": 1.9529170453548432, + "ce_loss_52": 1.4022331610321999, + "ce_loss_7": 3.112831687927246, + "epoch": 0.931, + "grad_norm": 19.01919083076588, + "kl_loss_13": 3012.8, + "kl_loss_26": 2100.6, + "kl_loss_39": 1120.5, + "kl_loss_7": 3542.0, + "learning_rate": 1.1938004224484989e-05, + "loss": 4934.7, + "step": 9310 + }, + { + "ce_loss_13": 2.9074361979961396, + "ce_loss_26": 2.477645492553711, + "ce_loss_39": 2.010756382346153, + "ce_loss_52": 1.4430534109473228, + "ce_loss_7": 3.1718304812908173, + "epoch": 0.932, + "grad_norm": 18.572431056907458, + "kl_loss_13": 3020.0, + "kl_loss_26": 2116.0, + "kl_loss_39": 1144.2, + "kl_loss_7": 3575.6, + "learning_rate": 1.1595816670552429e-05, + "loss": 4913.95, + "step": 9320 + }, + { + "ce_loss_13": 2.8636857986450197, + "ce_loss_26": 2.424144572019577, + "ce_loss_39": 1.9600117355585098, + "ce_loss_52": 1.402983972430229, + "ce_loss_7": 3.1251452922821046, + "epoch": 0.933, + "grad_norm": 18.288942605726792, + "kl_loss_13": 3044.0, + "kl_loss_26": 2129.6, + "kl_loss_39": 1139.2, + "kl_loss_7": 3581.6, + "learning_rate": 1.1258547341323699e-05, + "loss": 4937.25, + "step": 9330 + }, + { + "ce_loss_13": 2.893140608072281, + "ce_loss_26": 2.457077306509018, + "ce_loss_39": 2.003238731622696, + "ce_loss_52": 1.4437968581914902, + "ce_loss_7": 3.1531366109848022, + "epoch": 0.934, + "grad_norm": 18.739319955640763, + "kl_loss_13": 3019.6, + "kl_loss_26": 2115.0, + "kl_loss_39": 1131.3, + "kl_loss_7": 3558.4, + "learning_rate": 1.0926199633097156e-05, + "loss": 4899.9, + "step": 9340 + }, + { + "ce_loss_13": 2.9001421511173247, + "ce_loss_26": 2.4687224984169007, + "ce_loss_39": 2.0006180971860887, + "ce_loss_52": 1.4220335900783538, + "ce_loss_7": 3.1691121637821196, + "epoch": 0.935, + "grad_norm": 19.392869535691936, + "kl_loss_13": 3054.0, + "kl_loss_26": 2149.8, + "kl_loss_39": 1176.1, + "kl_loss_7": 3619.2, + "learning_rate": 1.0598776892610684e-05, + "loss": 4922.25, + "step": 9350 + }, + { + "ce_loss_13": 2.953709363937378, + "ce_loss_26": 2.5250791788101195, + "ce_loss_39": 2.0616777926683425, + "ce_loss_52": 1.5004188895225525, + "ce_loss_7": 3.2059156119823458, + "epoch": 0.936, + "grad_norm": 18.98607482508187, + "kl_loss_13": 3007.2, + "kl_loss_26": 2107.0, + "kl_loss_39": 1138.6, + "kl_loss_7": 3536.0, + "learning_rate": 1.0276282417007399e-05, + "loss": 4935.75, + "step": 9360 + }, + { + "ce_loss_13": 2.902603155374527, + "ce_loss_26": 2.464896833896637, + "ce_loss_39": 2.0031634330749513, + "ce_loss_52": 1.4478828191757203, + "ce_loss_7": 3.166324245929718, + "epoch": 0.937, + "grad_norm": 18.72231921789515, + "kl_loss_13": 3025.6, + "kl_loss_26": 2120.4, + "kl_loss_39": 1137.7, + "kl_loss_7": 3574.4, + "learning_rate": 9.958719453803277e-06, + "loss": 4933.2, + "step": 9370 + }, + { + "ce_loss_13": 2.878055286407471, + "ce_loss_26": 2.4367445170879365, + "ce_loss_39": 1.9698922991752625, + "ce_loss_52": 1.40206458568573, + "ce_loss_7": 3.1407361745834352, + "epoch": 0.938, + "grad_norm": 19.520797823561637, + "kl_loss_13": 3045.6, + "kl_loss_26": 2130.6, + "kl_loss_39": 1145.7, + "kl_loss_7": 3591.6, + "learning_rate": 9.646091200853802e-06, + "loss": 4932.45, + "step": 9380 + }, + { + "ce_loss_13": 2.8573631644248962, + "ce_loss_26": 2.429997554421425, + "ce_loss_39": 1.9779304087162017, + "ce_loss_52": 1.4321624323725701, + "ce_loss_7": 3.119151920080185, + "epoch": 0.939, + "grad_norm": 18.61104788500602, + "kl_loss_13": 2968.4, + "kl_loss_26": 2075.6, + "kl_loss_39": 1113.7, + "kl_loss_7": 3509.6, + "learning_rate": 9.338400806321978e-06, + "loss": 4899.9, + "step": 9390 + }, + { + "ce_loss_13": 2.8828431129455567, + "ce_loss_26": 2.4453956365585325, + "ce_loss_39": 1.986677783727646, + "ce_loss_52": 1.4324709355831147, + "ce_loss_7": 3.1462887287139893, + "epoch": 0.94, + "grad_norm": 18.660409146960177, + "kl_loss_13": 3006.4, + "kl_loss_26": 2102.2, + "kl_loss_39": 1130.9, + "kl_loss_7": 3551.2, + "learning_rate": 9.035651368646646e-06, + "loss": 4963.1, + "step": 9400 + }, + { + "ce_loss_13": 2.856483778357506, + "ce_loss_26": 2.426860272884369, + "ce_loss_39": 1.9708759590983391, + "ce_loss_52": 1.4115710154175758, + "ce_loss_7": 3.114782178401947, + "epoch": 0.941, + "grad_norm": 19.55117077640538, + "kl_loss_13": 2986.0, + "kl_loss_26": 2096.2, + "kl_loss_39": 1131.2, + "kl_loss_7": 3526.4, + "learning_rate": 8.737845936511335e-06, + "loss": 4960.75, + "step": 9410 + }, + { + "ce_loss_13": 2.894274836778641, + "ce_loss_26": 2.454681032896042, + "ce_loss_39": 1.9826824754476546, + "ce_loss_52": 1.4298861980438233, + "ce_loss_7": 3.15040722489357, + "epoch": 0.942, + "grad_norm": 19.039583654377346, + "kl_loss_13": 3067.6, + "kl_loss_26": 2152.8, + "kl_loss_39": 1152.4, + "kl_loss_7": 3608.4, + "learning_rate": 8.444987508813451e-06, + "loss": 4899.6, + "step": 9420 + }, + { + "ce_loss_13": 2.9001412212848665, + "ce_loss_26": 2.4617854237556456, + "ce_loss_39": 1.999165838956833, + "ce_loss_52": 1.4294554442167282, + "ce_loss_7": 3.165439170598984, + "epoch": 0.943, + "grad_norm": 18.564983933864266, + "kl_loss_13": 3046.0, + "kl_loss_26": 2136.6, + "kl_loss_39": 1159.1, + "kl_loss_7": 3592.8, + "learning_rate": 8.157079034633974e-06, + "loss": 4920.3, + "step": 9430 + }, + { + "ce_loss_13": 2.863955610990524, + "ce_loss_26": 2.435519364476204, + "ce_loss_39": 1.9886516004800796, + "ce_loss_52": 1.4344154298305511, + "ce_loss_7": 3.1265052914619447, + "epoch": 0.944, + "grad_norm": 17.82647647549486, + "kl_loss_13": 2962.4, + "kl_loss_26": 2073.6, + "kl_loss_39": 1123.1, + "kl_loss_7": 3508.4, + "learning_rate": 7.874123413208145e-06, + "loss": 4921.7, + "step": 9440 + }, + { + "ce_loss_13": 2.8527204990386963, + "ce_loss_26": 2.418528434634209, + "ce_loss_39": 1.960913023352623, + "ce_loss_52": 1.4082367643713951, + "ce_loss_7": 3.118280106782913, + "epoch": 0.945, + "grad_norm": 17.642678200140654, + "kl_loss_13": 3000.8, + "kl_loss_26": 2093.6, + "kl_loss_39": 1127.9, + "kl_loss_7": 3547.2, + "learning_rate": 7.59612349389599e-06, + "loss": 4941.9, + "step": 9450 + }, + { + "ce_loss_13": 2.8983235955238342, + "ce_loss_26": 2.4708085656166077, + "ce_loss_39": 2.01363542675972, + "ce_loss_52": 1.4459212511777877, + "ce_loss_7": 3.1573162257671354, + "epoch": 0.946, + "grad_norm": 18.21137845155402, + "kl_loss_13": 3012.8, + "kl_loss_26": 2129.0, + "kl_loss_39": 1159.6, + "kl_loss_7": 3550.0, + "learning_rate": 7.323082076153509e-06, + "loss": 4932.45, + "step": 9460 + }, + { + "ce_loss_13": 2.8793884813785553, + "ce_loss_26": 2.444310560822487, + "ce_loss_39": 1.9878242909908295, + "ce_loss_52": 1.4219153225421906, + "ce_loss_7": 3.1359946370124816, + "epoch": 0.947, + "grad_norm": 19.11147526952516, + "kl_loss_13": 3000.4, + "kl_loss_26": 2106.2, + "kl_loss_39": 1141.4, + "kl_loss_7": 3539.6, + "learning_rate": 7.055001909504755e-06, + "loss": 4932.95, + "step": 9470 + }, + { + "ce_loss_13": 2.8483738005161285, + "ce_loss_26": 2.4169380724430085, + "ce_loss_39": 1.9552814781665802, + "ce_loss_52": 1.4036450207233429, + "ce_loss_7": 3.1039236187934875, + "epoch": 0.948, + "grad_norm": 19.227610169601164, + "kl_loss_13": 3000.4, + "kl_loss_26": 2102.0, + "kl_loss_39": 1125.6, + "kl_loss_7": 3530.0, + "learning_rate": 6.791885693514133e-06, + "loss": 4941.55, + "step": 9480 + }, + { + "ce_loss_13": 2.8693545699119567, + "ce_loss_26": 2.4362709283828736, + "ce_loss_39": 1.9619301795959472, + "ce_loss_52": 1.400461108982563, + "ce_loss_7": 3.133023035526276, + "epoch": 0.949, + "grad_norm": 19.323995399615697, + "kl_loss_13": 3058.8, + "kl_loss_26": 2146.6, + "kl_loss_39": 1149.1, + "kl_loss_7": 3608.8, + "learning_rate": 6.533736077758867e-06, + "loss": 4986.35, + "step": 9490 + }, + { + "ce_loss_13": 2.8667274117469788, + "ce_loss_26": 2.4240807622671126, + "ce_loss_39": 1.9586560875177383, + "ce_loss_52": 1.3980020493268968, + "ce_loss_7": 3.127706527709961, + "epoch": 0.95, + "grad_norm": 18.253118734633716, + "kl_loss_13": 3033.6, + "kl_loss_26": 2126.0, + "kl_loss_39": 1140.7, + "kl_loss_7": 3581.6, + "learning_rate": 6.2805556618028556e-06, + "loss": 4971.65, + "step": 9500 + }, + { + "ce_loss_13": 2.9265355467796326, + "ce_loss_26": 2.4994624704122543, + "ce_loss_39": 2.03882916867733, + "ce_loss_52": 1.4773303151130677, + "ce_loss_7": 3.1838342785835265, + "epoch": 0.951, + "grad_norm": 19.482478782354722, + "kl_loss_13": 2998.4, + "kl_loss_26": 2103.6, + "kl_loss_39": 1144.8, + "kl_loss_7": 3540.4, + "learning_rate": 6.032346995169968e-06, + "loss": 4951.7, + "step": 9510 + }, + { + "ce_loss_13": 2.9545272469520567, + "ce_loss_26": 2.5311076641082764, + "ce_loss_39": 2.068070963025093, + "ce_loss_52": 1.4843237161636353, + "ce_loss_7": 3.2110206544399262, + "epoch": 0.952, + "grad_norm": 19.225083219290383, + "kl_loss_13": 3055.2, + "kl_loss_26": 2164.8, + "kl_loss_39": 1188.4, + "kl_loss_7": 3590.4, + "learning_rate": 5.789112577318789e-06, + "loss": 4961.65, + "step": 9520 + }, + { + "ce_loss_13": 2.852985817193985, + "ce_loss_26": 2.4146564304828644, + "ce_loss_39": 1.961009207367897, + "ce_loss_52": 1.3947103202342988, + "ce_loss_7": 3.1259153723716735, + "epoch": 0.953, + "grad_norm": 18.155555980380427, + "kl_loss_13": 3021.6, + "kl_loss_26": 2118.2, + "kl_loss_39": 1157.4, + "kl_loss_7": 3575.6, + "learning_rate": 5.550854857617194e-06, + "loss": 4909.2, + "step": 9530 + }, + { + "ce_loss_13": 2.8418005287647246, + "ce_loss_26": 2.4128061681985855, + "ce_loss_39": 1.9482584029436112, + "ce_loss_52": 1.3925445035099984, + "ce_loss_7": 3.1012724101543427, + "epoch": 0.954, + "grad_norm": 18.797933923537936, + "kl_loss_13": 3018.8, + "kl_loss_26": 2116.8, + "kl_loss_39": 1135.1, + "kl_loss_7": 3566.4, + "learning_rate": 5.317576235317756e-06, + "loss": 4951.35, + "step": 9540 + }, + { + "ce_loss_13": 2.9137533485889433, + "ce_loss_26": 2.479102221131325, + "ce_loss_39": 2.011527943611145, + "ce_loss_52": 1.4640387833118438, + "ce_loss_7": 3.16554337143898, + "epoch": 0.955, + "grad_norm": 18.308431062302134, + "kl_loss_13": 3004.0, + "kl_loss_26": 2104.0, + "kl_loss_39": 1124.5, + "kl_loss_7": 3534.4, + "learning_rate": 5.089279059533658e-06, + "loss": 4893.9, + "step": 9550 + }, + { + "ce_loss_13": 2.9561933636665345, + "ce_loss_26": 2.520014223456383, + "ce_loss_39": 2.044199249148369, + "ce_loss_52": 1.4634439080953598, + "ce_loss_7": 3.2146646201610567, + "epoch": 0.956, + "grad_norm": 19.011558845630333, + "kl_loss_13": 3107.6, + "kl_loss_26": 2205.4, + "kl_loss_39": 1192.1, + "kl_loss_7": 3652.4, + "learning_rate": 4.865965629214819e-06, + "loss": 4928.6, + "step": 9560 + }, + { + "ce_loss_13": 2.9264722049236296, + "ce_loss_26": 2.500628116726875, + "ce_loss_39": 2.0424467980861665, + "ce_loss_52": 1.4656882539391518, + "ce_loss_7": 3.1882854044437408, + "epoch": 0.957, + "grad_norm": 19.491005801371585, + "kl_loss_13": 3028.0, + "kl_loss_26": 2138.6, + "kl_loss_39": 1174.2, + "kl_loss_7": 3566.0, + "learning_rate": 4.6476381931251366e-06, + "loss": 4947.75, + "step": 9570 + }, + { + "ce_loss_13": 2.8677931249141695, + "ce_loss_26": 2.432820278406143, + "ce_loss_39": 1.970087245106697, + "ce_loss_52": 1.4129542678594589, + "ce_loss_7": 3.129103422164917, + "epoch": 0.958, + "grad_norm": 18.903925554756427, + "kl_loss_13": 2994.0, + "kl_loss_26": 2096.0, + "kl_loss_39": 1120.2, + "kl_loss_7": 3546.4, + "learning_rate": 4.434298949819449e-06, + "loss": 4918.2, + "step": 9580 + }, + { + "ce_loss_13": 2.894206315279007, + "ce_loss_26": 2.4608440458774568, + "ce_loss_39": 2.00639765560627, + "ce_loss_52": 1.4453970074653626, + "ce_loss_7": 3.151961898803711, + "epoch": 0.959, + "grad_norm": 17.742534881377313, + "kl_loss_13": 2993.6, + "kl_loss_26": 2091.6, + "kl_loss_39": 1135.9, + "kl_loss_7": 3542.0, + "learning_rate": 4.2259500476214406e-06, + "loss": 4904.1, + "step": 9590 + }, + { + "ce_loss_13": 2.907763344049454, + "ce_loss_26": 2.4721481442451476, + "ce_loss_39": 2.013452297449112, + "ce_loss_52": 1.4463645279407502, + "ce_loss_7": 3.1700760960578918, + "epoch": 0.96, + "grad_norm": 18.762064601939382, + "kl_loss_13": 3033.2, + "kl_loss_26": 2127.6, + "kl_loss_39": 1154.1, + "kl_loss_7": 3576.8, + "learning_rate": 4.02259358460233e-06, + "loss": 4944.15, + "step": 9600 + }, + { + "ce_loss_13": 2.9404530614614486, + "ce_loss_26": 2.5118053376674654, + "ce_loss_39": 2.045960560441017, + "ce_loss_52": 1.4736278399825096, + "ce_loss_7": 3.199742293357849, + "epoch": 0.961, + "grad_norm": 19.091693827270714, + "kl_loss_13": 3049.6, + "kl_loss_26": 2150.4, + "kl_loss_39": 1165.5, + "kl_loss_7": 3590.8, + "learning_rate": 3.8242316085594916e-06, + "loss": 4931.75, + "step": 9610 + }, + { + "ce_loss_13": 2.8952401757240294, + "ce_loss_26": 2.4520116090774535, + "ce_loss_39": 1.9771205306053161, + "ce_loss_52": 1.3961644172668457, + "ce_loss_7": 3.1528802454471587, + "epoch": 0.962, + "grad_norm": 18.822596918413492, + "kl_loss_13": 3097.6, + "kl_loss_26": 2171.4, + "kl_loss_39": 1176.6, + "kl_loss_7": 3631.6, + "learning_rate": 3.630866116995757e-06, + "loss": 4991.65, + "step": 9620 + }, + { + "ce_loss_13": 2.848733913898468, + "ce_loss_26": 2.4173508852720262, + "ce_loss_39": 1.9629988223314285, + "ce_loss_52": 1.4227147445082664, + "ce_loss_7": 3.105393874645233, + "epoch": 0.963, + "grad_norm": 18.772124078001035, + "kl_loss_13": 2951.2, + "kl_loss_26": 2061.0, + "kl_loss_39": 1101.5, + "kl_loss_7": 3491.2, + "learning_rate": 3.4424990570994797e-06, + "loss": 4903.15, + "step": 9630 + }, + { + "ce_loss_13": 2.9066348552703856, + "ce_loss_26": 2.469021773338318, + "ce_loss_39": 2.010333400964737, + "ce_loss_52": 1.4492767244577407, + "ce_loss_7": 3.1746467888355254, + "epoch": 0.964, + "grad_norm": 19.32775364197132, + "kl_loss_13": 3013.6, + "kl_loss_26": 2110.8, + "kl_loss_39": 1135.7, + "kl_loss_7": 3570.0, + "learning_rate": 3.2591323257248896e-06, + "loss": 4939.25, + "step": 9640 + }, + { + "ce_loss_13": 2.9041188657283783, + "ce_loss_26": 2.471466612815857, + "ce_loss_39": 2.0076118439435957, + "ce_loss_52": 1.4575997084379195, + "ce_loss_7": 3.156245505809784, + "epoch": 0.965, + "grad_norm": 18.772007331370325, + "kl_loss_13": 3016.0, + "kl_loss_26": 2111.8, + "kl_loss_39": 1138.9, + "kl_loss_7": 3551.6, + "learning_rate": 3.0807677693729385e-06, + "loss": 4953.0, + "step": 9650 + }, + { + "ce_loss_13": 2.9185379564762117, + "ce_loss_26": 2.4852662444114686, + "ce_loss_39": 2.0213693618774413, + "ce_loss_52": 1.4567248612642287, + "ce_loss_7": 3.1788457691669465, + "epoch": 0.966, + "grad_norm": 19.301754151350856, + "kl_loss_13": 3049.2, + "kl_loss_26": 2142.4, + "kl_loss_39": 1165.5, + "kl_loss_7": 3589.2, + "learning_rate": 2.9074071841727055e-06, + "loss": 4966.3, + "step": 9660 + }, + { + "ce_loss_13": 2.856596076488495, + "ce_loss_26": 2.4228154510259627, + "ce_loss_39": 1.966923463344574, + "ce_loss_52": 1.3987573131918907, + "ce_loss_7": 3.112848150730133, + "epoch": 0.967, + "grad_norm": 18.636169987835014, + "kl_loss_13": 3015.2, + "kl_loss_26": 2124.8, + "kl_loss_39": 1150.1, + "kl_loss_7": 3558.0, + "learning_rate": 2.739052315863355e-06, + "loss": 4944.85, + "step": 9670 + }, + { + "ce_loss_13": 2.946157419681549, + "ce_loss_26": 2.506874307990074, + "ce_loss_39": 2.0385408878326414, + "ce_loss_52": 1.4472137212753295, + "ce_loss_7": 3.2151435017585754, + "epoch": 0.968, + "grad_norm": 19.361220180181423, + "kl_loss_13": 3105.6, + "kl_loss_26": 2193.0, + "kl_loss_39": 1200.9, + "kl_loss_7": 3669.2, + "learning_rate": 2.5757048597765396e-06, + "loss": 4938.1, + "step": 9680 + }, + { + "ce_loss_13": 2.838688534498215, + "ce_loss_26": 2.408904367685318, + "ce_loss_39": 1.9514323592185974, + "ce_loss_52": 1.4050966590642928, + "ce_loss_7": 3.102074921131134, + "epoch": 0.969, + "grad_norm": 18.982791691406838, + "kl_loss_13": 2984.8, + "kl_loss_26": 2086.6, + "kl_loss_39": 1106.6, + "kl_loss_7": 3533.2, + "learning_rate": 2.417366460819359e-06, + "loss": 4918.15, + "step": 9690 + }, + { + "ce_loss_13": 2.880851173400879, + "ce_loss_26": 2.447329577803612, + "ce_loss_39": 1.989661106467247, + "ce_loss_52": 1.4333824023604393, + "ce_loss_7": 3.137517309188843, + "epoch": 0.97, + "grad_norm": 19.196819142959395, + "kl_loss_13": 2988.0, + "kl_loss_26": 2092.4, + "kl_loss_39": 1128.9, + "kl_loss_7": 3524.4, + "learning_rate": 2.2640387134577057e-06, + "loss": 4938.15, + "step": 9700 + }, + { + "ce_loss_13": 2.8625703275203707, + "ce_loss_26": 2.4340526342391966, + "ce_loss_39": 1.968525806069374, + "ce_loss_52": 1.4236899584531784, + "ce_loss_7": 3.112631046772003, + "epoch": 0.971, + "grad_norm": 19.493522870524444, + "kl_loss_13": 2981.2, + "kl_loss_26": 2091.6, + "kl_loss_39": 1109.1, + "kl_loss_7": 3508.4, + "learning_rate": 2.115723161700278e-06, + "loss": 4978.3, + "step": 9710 + }, + { + "ce_loss_13": 2.930490869283676, + "ce_loss_26": 2.495130881667137, + "ce_loss_39": 2.030032703280449, + "ce_loss_52": 1.4442616790533065, + "ce_loss_7": 3.190455746650696, + "epoch": 0.972, + "grad_norm": 18.231386237261873, + "kl_loss_13": 3083.2, + "kl_loss_26": 2182.4, + "kl_loss_39": 1200.2, + "kl_loss_7": 3630.4, + "learning_rate": 1.9724212990830937e-06, + "loss": 4917.25, + "step": 9720 + }, + { + "ce_loss_13": 2.8745281517505648, + "ce_loss_26": 2.450575265288353, + "ce_loss_39": 1.9936909019947051, + "ce_loss_52": 1.4346210777759552, + "ce_loss_7": 3.1411093890666963, + "epoch": 0.973, + "grad_norm": 17.9155258115958, + "kl_loss_13": 2990.4, + "kl_loss_26": 2104.0, + "kl_loss_39": 1135.2, + "kl_loss_7": 3546.0, + "learning_rate": 1.8341345686543331e-06, + "loss": 4907.2, + "step": 9730 + }, + { + "ce_loss_13": 2.950432300567627, + "ce_loss_26": 2.5173233568668367, + "ce_loss_39": 2.05553839802742, + "ce_loss_52": 1.5059631228446961, + "ce_loss_7": 3.203335565328598, + "epoch": 0.974, + "grad_norm": 18.692267522295538, + "kl_loss_13": 2994.0, + "kl_loss_26": 2099.8, + "kl_loss_39": 1118.3, + "kl_loss_7": 3526.0, + "learning_rate": 1.7008643629596864e-06, + "loss": 4975.3, + "step": 9740 + }, + { + "ce_loss_13": 2.938932454586029, + "ce_loss_26": 2.4943090945482256, + "ce_loss_39": 2.030773627758026, + "ce_loss_52": 1.4614870190620421, + "ce_loss_7": 3.203068423271179, + "epoch": 0.975, + "grad_norm": 19.21025690602488, + "kl_loss_13": 3068.4, + "kl_loss_26": 2141.2, + "kl_loss_39": 1151.8, + "kl_loss_7": 3617.6, + "learning_rate": 1.5726120240288633e-06, + "loss": 4916.8, + "step": 9750 + }, + { + "ce_loss_13": 2.9820500314235687, + "ce_loss_26": 2.5365146696567535, + "ce_loss_39": 2.0661711603403092, + "ce_loss_52": 1.471569898724556, + "ce_loss_7": 3.2498775362968444, + "epoch": 0.976, + "grad_norm": 18.6163174066225, + "kl_loss_13": 3112.0, + "kl_loss_26": 2196.6, + "kl_loss_39": 1202.4, + "kl_loss_7": 3666.4, + "learning_rate": 1.4493788433612708e-06, + "loss": 4925.3, + "step": 9760 + }, + { + "ce_loss_13": 2.8684714436531067, + "ce_loss_26": 2.433712217211723, + "ce_loss_39": 1.979764473438263, + "ce_loss_52": 1.4259307652711868, + "ce_loss_7": 3.1243775844573975, + "epoch": 0.977, + "grad_norm": 18.645711029415455, + "kl_loss_13": 2991.6, + "kl_loss_26": 2090.8, + "kl_loss_39": 1122.5, + "kl_loss_7": 3526.8, + "learning_rate": 1.3311660619138578e-06, + "loss": 4899.9, + "step": 9770 + }, + { + "ce_loss_13": 2.875075614452362, + "ce_loss_26": 2.4274426341056823, + "ce_loss_39": 1.9597632795572282, + "ce_loss_52": 1.3981771111488341, + "ce_loss_7": 3.132591074705124, + "epoch": 0.978, + "grad_norm": 19.101397556379275, + "kl_loss_13": 3053.6, + "kl_loss_26": 2133.6, + "kl_loss_39": 1148.0, + "kl_loss_7": 3594.4, + "learning_rate": 1.2179748700879012e-06, + "loss": 4922.55, + "step": 9780 + }, + { + "ce_loss_13": 2.8309387296438215, + "ce_loss_26": 2.4053177654743196, + "ce_loss_39": 1.9509627014398574, + "ce_loss_52": 1.397429385781288, + "ce_loss_7": 3.098381590843201, + "epoch": 0.979, + "grad_norm": 18.730022448398557, + "kl_loss_13": 2994.8, + "kl_loss_26": 2100.6, + "kl_loss_39": 1129.3, + "kl_loss_7": 3549.6, + "learning_rate": 1.1098064077174619e-06, + "loss": 4943.05, + "step": 9790 + }, + { + "ce_loss_13": 2.939675289392471, + "ce_loss_26": 2.5042629301548005, + "ce_loss_39": 2.036014449596405, + "ce_loss_52": 1.4478511959314346, + "ce_loss_7": 3.2001714766025544, + "epoch": 0.98, + "grad_norm": 18.76259776094823, + "kl_loss_13": 3078.4, + "kl_loss_26": 2174.8, + "kl_loss_39": 1199.8, + "kl_loss_7": 3615.6, + "learning_rate": 1.006661764057837e-06, + "loss": 4908.35, + "step": 9800 + }, + { + "ce_loss_13": 2.871473455429077, + "ce_loss_26": 2.4314837962388993, + "ce_loss_39": 1.95380699634552, + "ce_loss_52": 1.3844006016850472, + "ce_loss_7": 3.1342472076416015, + "epoch": 0.981, + "grad_norm": 19.274903724206773, + "kl_loss_13": 3093.6, + "kl_loss_26": 2172.4, + "kl_loss_39": 1165.3, + "kl_loss_7": 3637.6, + "learning_rate": 9.085419777743465e-07, + "loss": 4984.5, + "step": 9810 + }, + { + "ce_loss_13": 2.895174187421799, + "ce_loss_26": 2.465178096294403, + "ce_loss_39": 2.006316193938255, + "ce_loss_52": 1.441744513809681, + "ce_loss_7": 3.1597203612327576, + "epoch": 0.982, + "grad_norm": 18.123510539706626, + "kl_loss_13": 3039.2, + "kl_loss_26": 2136.8, + "kl_loss_39": 1165.1, + "kl_loss_7": 3592.0, + "learning_rate": 8.15448036932176e-07, + "loss": 4978.7, + "step": 9820 + }, + { + "ce_loss_13": 2.9061976075172424, + "ce_loss_26": 2.477484393119812, + "ce_loss_39": 2.0169315338134766, + "ce_loss_52": 1.4464313685894012, + "ce_loss_7": 3.1681883454322817, + "epoch": 0.983, + "grad_norm": 18.434840579065046, + "kl_loss_13": 3067.2, + "kl_loss_26": 2157.8, + "kl_loss_39": 1175.9, + "kl_loss_7": 3616.8, + "learning_rate": 7.273808789862724e-07, + "loss": 4921.0, + "step": 9830 + }, + { + "ce_loss_13": 2.91153547167778, + "ce_loss_26": 2.473715308308601, + "ce_loss_39": 2.007182112336159, + "ce_loss_52": 1.4433553382754325, + "ce_loss_7": 3.170134776830673, + "epoch": 0.984, + "grad_norm": 19.402855155704938, + "kl_loss_13": 3056.0, + "kl_loss_26": 2142.6, + "kl_loss_39": 1154.2, + "kl_loss_7": 3589.6, + "learning_rate": 6.443413907720186e-07, + "loss": 4900.3, + "step": 9840 + }, + { + "ce_loss_13": 2.812727469205856, + "ce_loss_26": 2.3864874839782715, + "ce_loss_39": 1.9436532348394393, + "ce_loss_52": 1.3948013991117478, + "ce_loss_7": 3.0747777581214906, + "epoch": 0.985, + "grad_norm": 18.7509272372939, + "kl_loss_13": 2956.4, + "kl_loss_26": 2066.6, + "kl_loss_39": 1115.4, + "kl_loss_7": 3495.2, + "learning_rate": 5.663304084960185e-07, + "loss": 4941.5, + "step": 9850 + }, + { + "ce_loss_13": 2.8474230617284775, + "ce_loss_26": 2.4196896702051163, + "ce_loss_39": 1.958586323261261, + "ce_loss_52": 1.4132703453302384, + "ce_loss_7": 3.112215679883957, + "epoch": 0.986, + "grad_norm": 19.13033176723296, + "kl_loss_13": 2947.6, + "kl_loss_26": 2054.6, + "kl_loss_39": 1096.6, + "kl_loss_7": 3496.0, + "learning_rate": 4.933487177280482e-07, + "loss": 4900.7, + "step": 9860 + }, + { + "ce_loss_13": 2.914989507198334, + "ce_loss_26": 2.4912798583507536, + "ce_loss_39": 2.020476207137108, + "ce_loss_52": 1.45462586581707, + "ce_loss_7": 3.169612795114517, + "epoch": 0.987, + "grad_norm": 18.808140949859265, + "kl_loss_13": 3018.8, + "kl_loss_26": 2134.0, + "kl_loss_39": 1159.4, + "kl_loss_7": 3551.6, + "learning_rate": 4.2539705339295075e-07, + "loss": 4908.55, + "step": 9870 + }, + { + "ce_loss_13": 2.8734777927398683, + "ce_loss_26": 2.437858074903488, + "ce_loss_39": 1.9720120638608933, + "ce_loss_52": 1.4267651215195656, + "ce_loss_7": 3.13208429813385, + "epoch": 0.988, + "grad_norm": 18.962161399510684, + "kl_loss_13": 2984.8, + "kl_loss_26": 2080.8, + "kl_loss_39": 1110.4, + "kl_loss_7": 3520.0, + "learning_rate": 3.6247609976319816e-07, + "loss": 4944.0, + "step": 9880 + }, + { + "ce_loss_13": 2.941332721710205, + "ce_loss_26": 2.5040529906749724, + "ce_loss_39": 2.0378583818674088, + "ce_loss_52": 1.468481183052063, + "ce_loss_7": 3.198000502586365, + "epoch": 0.989, + "grad_norm": 18.392580217010416, + "kl_loss_13": 3027.6, + "kl_loss_26": 2127.6, + "kl_loss_39": 1152.6, + "kl_loss_7": 3568.8, + "learning_rate": 3.0458649045211895e-07, + "loss": 4940.25, + "step": 9890 + }, + { + "ce_loss_13": 2.7879110276699066, + "ce_loss_26": 2.361061328649521, + "ce_loss_39": 1.9103228181600571, + "ce_loss_52": 1.3702009424567223, + "ce_loss_7": 3.0456897139549257, + "epoch": 0.99, + "grad_norm": 18.728359427979246, + "kl_loss_13": 2962.0, + "kl_loss_26": 2068.0, + "kl_loss_39": 1101.6, + "kl_loss_7": 3497.6, + "learning_rate": 2.517288084074587e-07, + "loss": 4930.1, + "step": 9900 + }, + { + "ce_loss_13": 2.8971258997917175, + "ce_loss_26": 2.5054025918245317, + "ce_loss_39": 2.0191519230604174, + "ce_loss_52": 1.4615912348031999, + "ce_loss_7": 3.1491506710648536, + "epoch": 0.991, + "grad_norm": 18.31923607379438, + "kl_loss_13": 3021.4, + "kl_loss_26": 2147.8, + "kl_loss_39": 1158.2, + "kl_loss_7": 3569.6, + "learning_rate": 2.0390358590538505e-07, + "loss": 4961.35, + "step": 9910 + }, + { + "ce_loss_13": 2.8903492599725724, + "ce_loss_26": 2.463494861125946, + "ce_loss_39": 1.9990645915269851, + "ce_loss_52": 1.4176891192793846, + "ce_loss_7": 3.156185895204544, + "epoch": 0.992, + "grad_norm": 18.63207057019639, + "kl_loss_13": 3059.2, + "kl_loss_26": 2160.8, + "kl_loss_39": 1181.7, + "kl_loss_7": 3612.8, + "learning_rate": 1.61111304545436e-07, + "loss": 4924.65, + "step": 9920 + }, + { + "ce_loss_13": 2.9144785940647124, + "ce_loss_26": 2.4774809032678604, + "ce_loss_39": 2.0119458585977554, + "ce_loss_52": 1.4417672097682952, + "ce_loss_7": 3.1794375479221344, + "epoch": 0.993, + "grad_norm": 19.670611142271607, + "kl_loss_13": 3065.6, + "kl_loss_26": 2158.2, + "kl_loss_39": 1171.1, + "kl_loss_7": 3620.0, + "learning_rate": 1.2335239524541298e-07, + "loss": 4934.7, + "step": 9930 + }, + { + "ce_loss_13": 2.8820480942726134, + "ce_loss_26": 2.4476457953453066, + "ce_loss_39": 1.9870627135038377, + "ce_loss_52": 1.418778820335865, + "ce_loss_7": 3.140765738487244, + "epoch": 0.994, + "grad_norm": 18.855641981755237, + "kl_loss_13": 3036.0, + "kl_loss_26": 2135.8, + "kl_loss_39": 1164.2, + "kl_loss_7": 3583.2, + "learning_rate": 9.06272382371065e-08, + "loss": 4938.8, + "step": 9940 + }, + { + "ce_loss_13": 2.8076956808567046, + "ce_loss_26": 2.3849500566720963, + "ce_loss_39": 1.944134348630905, + "ce_loss_52": 1.4005259275436401, + "ce_loss_7": 3.060946136713028, + "epoch": 0.995, + "grad_norm": 18.321398390501436, + "kl_loss_13": 2907.2, + "kl_loss_26": 2027.0, + "kl_loss_39": 1087.6, + "kl_loss_7": 3442.4, + "learning_rate": 6.293616306246586e-08, + "loss": 4950.05, + "step": 9950 + }, + { + "ce_loss_13": 2.879979431629181, + "ce_loss_26": 2.4471361935138702, + "ce_loss_39": 1.9933580070734025, + "ce_loss_52": 1.4384188532829285, + "ce_loss_7": 3.137728548049927, + "epoch": 0.996, + "grad_norm": 18.715841313903894, + "kl_loss_13": 3004.0, + "kl_loss_26": 2101.4, + "kl_loss_39": 1139.8, + "kl_loss_7": 3537.6, + "learning_rate": 4.027944857032395e-08, + "loss": 4943.4, + "step": 9960 + }, + { + "ce_loss_13": 2.873315241932869, + "ce_loss_26": 2.4401324480772018, + "ce_loss_39": 1.9843392819166183, + "ce_loss_52": 1.4195681273937226, + "ce_loss_7": 3.131341791152954, + "epoch": 0.997, + "grad_norm": 18.684401038028668, + "kl_loss_13": 3010.0, + "kl_loss_26": 2109.2, + "kl_loss_39": 1143.6, + "kl_loss_7": 3546.8, + "learning_rate": 2.265732291356626e-08, + "loss": 4916.9, + "step": 9970 + }, + { + "ce_loss_13": 2.806668055057526, + "ce_loss_26": 2.379783111810684, + "ce_loss_39": 1.9266512155532838, + "ce_loss_52": 1.397612212598324, + "ce_loss_7": 3.066510772705078, + "epoch": 0.998, + "grad_norm": 18.616508974623724, + "kl_loss_13": 2953.6, + "kl_loss_26": 2053.8, + "kl_loss_39": 1093.4, + "kl_loss_7": 3497.6, + "learning_rate": 1.0069963546743833e-08, + "loss": 4905.25, + "step": 9980 + }, + { + "ce_loss_13": 2.8469128251075744, + "ce_loss_26": 2.4174416065216064, + "ce_loss_39": 1.9600837975740433, + "ce_loss_52": 1.4190092101693152, + "ce_loss_7": 3.1044517934322355, + "epoch": 0.999, + "grad_norm": 18.871308722121288, + "kl_loss_13": 2960.0, + "kl_loss_26": 2064.8, + "kl_loss_39": 1104.4, + "kl_loss_7": 3488.8, + "learning_rate": 2.517497224463483e-09, + "loss": 4901.2, + "step": 9990 + }, + { + "ce_loss_13": 2.895157891511917, + "ce_loss_26": 2.452625501155853, + "ce_loss_39": 1.9889036536216735, + "ce_loss_52": 1.4127988710999488, + "ce_loss_7": 3.167273908853531, + "epoch": 1.0, + "grad_norm": 19.02740690165538, + "kl_loss_13": 3067.6, + "kl_loss_26": 2157.2, + "kl_loss_39": 1167.3, + "kl_loss_7": 3628.8, + "learning_rate": 0.0, + "loss": 4933.3, + "step": 10000 + } + ], + "logging_steps": 10, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0167830278176768e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}